From: Vladimir Sokolovsky <vlad@mellanox.com>
Date: Mon, 8 Dec 2014 15:07:59 +0000 (+0200)
Subject: Initial commit. Based on v3.18
X-Git-Url: https://openfabrics.org/gitweb/?a=commitdiff_plain;h=HEAD;p=~emulex%2Ffor-vlad%2Flinux-3.18.git

Initial commit. Based on v3.18

Signed-off-by: Vladimir Sokolovsky <vlad@mellanox.com>
---

4b9e97b1c859c8fe54a596b554215f39d22bc761
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..e213b27
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,98 @@
+#
+# NOTE! Don't add files that are generated in specific
+# subdirectories here. Add them in the ".gitignore" file
+# in that subdirectory instead.
+#
+# NOTE! Please use 'git ls-files -i --exclude-standard'
+# command after changing this file, to see if there are
+# any tracked files which get ignored after the change.
+#
+# Normal rules
+#
+.*
+*.o
+*.o.*
+*.a
+*.s
+*.ko
+*.so
+*.so.dbg
+*.mod.c
+*.i
+*.lst
+*.symtypes
+*.order
+*.elf
+*.bin
+*.gz
+*.bz2
+*.lzma
+*.xz
+*.lz4
+*.lzo
+*.patch
+*.gcno
+modules.builtin
+Module.symvers
+*.dwo
+
+#
+# Top-level generic files
+#
+/tags
+/TAGS
+/linux
+/vmlinux
+/vmlinuz
+/System.map
+/Module.markers
+
+#
+# Debian directory (make deb-pkg)
+#
+/debian/
+
+#
+# git files that we don't want to ignore even it they are dot-files
+#
+!.gitignore
+!.mailmap
+
+#
+# Generated include files
+#
+include/config
+include/generated
+arch/*/include/generated
+
+# stgit generated dirs
+patches-*
+
+# quilt's files
+patches
+series
+
+# cscope files
+cscope.*
+ncscope.*
+
+# gnu global files
+GPATH
+GRTAGS
+GSYMS
+GTAGS
+
+*.orig
+*~
+\#*#
+
+#
+# Leavings from module signing
+#
+extra_certificates
+signing_key.priv
+signing_key.x509
+x509.genkey
+
+# Kconfig presets
+all.config
diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig
new file mode 100644
index 0000000..7708939
--- /dev/null
+++ b/drivers/infiniband/Kconfig
@@ -0,0 +1,67 @@
+menuconfig INFINIBAND
+	tristate "InfiniBand support"
+	depends on PCI || BROKEN
+	depends on HAS_IOMEM
+	depends on NET
+	depends on INET
+	depends on m || IPV6 != m
+	---help---
+	  Core support for InfiniBand (IB).  Make sure to also select
+	  any protocols you wish to use as well as drivers for your
+	  InfiniBand hardware.
+
+if INFINIBAND
+
+config INFINIBAND_USER_MAD
+	tristate "InfiniBand userspace MAD support"
+	depends on INFINIBAND
+	---help---
+	  Userspace InfiniBand Management Datagram (MAD) support.  This
+	  is the kernel side of the userspace MAD support, which allows
+	  userspace processes to send and receive MADs. You will also
+	  need libibumad from <http://www.openfabrics.org/downloads/management/>.
+
+config INFINIBAND_USER_ACCESS
+	tristate "InfiniBand userspace access (verbs and CM)"
+	select ANON_INODES
+	---help---
+	  Userspace InfiniBand access support.  This enables the
+	  kernel side of userspace verbs and the userspace
+	  communication manager (CM).  This allows userspace processes
+	  to set up connections and directly access InfiniBand
+	  hardware for fast-path operations.  You will also need
+	  libibverbs, libibcm and a hardware driver library from
+	  <http://www.openfabrics.org/git/>.
+
+config INFINIBAND_USER_MEM
+	bool
+	depends on INFINIBAND_USER_ACCESS != n
+	default y
+
+config INFINIBAND_ADDR_TRANS
+	bool
+	depends on INFINIBAND
+	default y
+
+source "drivers/infiniband/hw/mthca/Kconfig"
+source "drivers/infiniband/hw/ipath/Kconfig"
+source "drivers/infiniband/hw/qib/Kconfig"
+source "drivers/infiniband/hw/ehca/Kconfig"
+source "drivers/infiniband/hw/amso1100/Kconfig"
+source "drivers/infiniband/hw/cxgb3/Kconfig"
+source "drivers/infiniband/hw/cxgb4/Kconfig"
+source "drivers/infiniband/hw/mlx4/Kconfig"
+source "drivers/infiniband/hw/mlx5/Kconfig"
+source "drivers/infiniband/hw/nes/Kconfig"
+source "drivers/infiniband/hw/ocrdma/Kconfig"
+source "drivers/infiniband/hw/usnic/Kconfig"
+
+source "drivers/infiniband/ulp/ipoib/Kconfig"
+
+source "drivers/infiniband/ulp/srp/Kconfig"
+source "drivers/infiniband/ulp/srpt/Kconfig"
+
+source "drivers/infiniband/ulp/iser/Kconfig"
+source "drivers/infiniband/ulp/isert/Kconfig"
+
+endif # INFINIBAND
diff --git a/drivers/infiniband/Makefile b/drivers/infiniband/Makefile
new file mode 100644
index 0000000..dc21836
--- /dev/null
+++ b/drivers/infiniband/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_INFINIBAND)		+= core/
+obj-$(CONFIG_INFINIBAND)		+= hw/
+obj-$(CONFIG_INFINIBAND)		+= ulp/
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
new file mode 100644
index 0000000..ffd0af6
--- /dev/null
+++ b/drivers/infiniband/core/Makefile
@@ -0,0 +1,33 @@
+infiniband-$(CONFIG_INFINIBAND_ADDR_TRANS)	:= rdma_cm.o
+user_access-$(CONFIG_INFINIBAND_ADDR_TRANS)	:= rdma_ucm.o
+
+obj-$(CONFIG_INFINIBAND) +=		ib_core.o ib_mad.o ib_sa.o \
+					ib_cm.o iw_cm.o ib_addr.o \
+					$(infiniband-y)
+obj-$(CONFIG_INFINIBAND_USER_MAD) +=	ib_umad.o
+obj-$(CONFIG_INFINIBAND_USER_ACCESS) +=	ib_uverbs.o ib_ucm.o \
+					$(user_access-y)
+
+ib_core-y :=			packer.o ud_header.o verbs.o sysfs.o \
+				device.o fmr_pool.o cache.o netlink.o
+ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
+
+ib_mad-y :=			mad.o smi.o agent.o mad_rmpp.o
+
+ib_sa-y :=			sa_query.o multicast.o
+
+ib_cm-y :=			cm.o
+
+iw_cm-y :=			iwcm.o iwpm_util.o iwpm_msg.o
+
+rdma_cm-y :=			cma.o
+
+rdma_ucm-y :=			ucma.o
+
+ib_addr-y :=			addr.o
+
+ib_umad-y :=			user_mad.o
+
+ib_ucm-y :=			ucm.o
+
+ib_uverbs-y :=			uverbs_main.o uverbs_cmd.o uverbs_marshall.o
diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c
new file mode 100644
index 0000000..8172d37
--- /dev/null
+++ b/drivers/infiniband/core/addr.c
@@ -0,0 +1,565 @@
+/*
+ * Copyright (c) 2005 Voltaire Inc.  All rights reserved.
+ * Copyright (c) 2002-2005, Network Appliance, Inc. All rights reserved.
+ * Copyright (c) 1999-2005, Mellanox Technologies, Inc. All rights reserved.
+ * Copyright (c) 2005 Intel Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mutex.h>
+#include <linux/inetdevice.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+#include <linux/module.h>
+#include <net/arp.h>
+#include <net/neighbour.h>
+#include <net/route.h>
+#include <net/netevent.h>
+#include <net/addrconf.h>
+#include <net/ip6_route.h>
+#include <rdma/ib_addr.h>
+#include <rdma/ib.h>
+
+MODULE_AUTHOR("Sean Hefty");
+MODULE_DESCRIPTION("IB Address Translation");
+MODULE_LICENSE("Dual BSD/GPL");
+
+struct addr_req {
+	struct list_head list;
+	struct sockaddr_storage src_addr;
+	struct sockaddr_storage dst_addr;
+	struct rdma_dev_addr *addr;
+	struct rdma_addr_client *client;
+	void *context;
+	void (*callback)(int status, struct sockaddr *src_addr,
+			 struct rdma_dev_addr *addr, void *context);
+	unsigned long timeout;
+	int status;
+};
+
+static void process_req(struct work_struct *work);
+
+static DEFINE_MUTEX(lock);
+static LIST_HEAD(req_list);
+static DECLARE_DELAYED_WORK(work, process_req);
+static struct workqueue_struct *addr_wq;
+
+int rdma_addr_size(struct sockaddr *addr)
+{
+	switch (addr->sa_family) {
+	case AF_INET:
+		return sizeof(struct sockaddr_in);
+	case AF_INET6:
+		return sizeof(struct sockaddr_in6);
+	case AF_IB:
+		return sizeof(struct sockaddr_ib);
+	default:
+		return 0;
+	}
+}
+EXPORT_SYMBOL(rdma_addr_size);
+
+static struct rdma_addr_client self;
+
+void rdma_addr_register_client(struct rdma_addr_client *client)
+{
+	atomic_set(&client->refcount, 1);
+	init_completion(&client->comp);
+}
+EXPORT_SYMBOL(rdma_addr_register_client);
+
+static inline void put_client(struct rdma_addr_client *client)
+{
+	if (atomic_dec_and_test(&client->refcount))
+		complete(&client->comp);
+}
+
+void rdma_addr_unregister_client(struct rdma_addr_client *client)
+{
+	put_client(client);
+	wait_for_completion(&client->comp);
+}
+EXPORT_SYMBOL(rdma_addr_unregister_client);
+
+int rdma_copy_addr(struct rdma_dev_addr *dev_addr, struct net_device *dev,
+		     const unsigned char *dst_dev_addr)
+{
+	dev_addr->dev_type = dev->type;
+	memcpy(dev_addr->src_dev_addr, dev->dev_addr, MAX_ADDR_LEN);
+	memcpy(dev_addr->broadcast, dev->broadcast, MAX_ADDR_LEN);
+	if (dst_dev_addr)
+		memcpy(dev_addr->dst_dev_addr, dst_dev_addr, MAX_ADDR_LEN);
+	dev_addr->bound_dev_if = dev->ifindex;
+	return 0;
+}
+EXPORT_SYMBOL(rdma_copy_addr);
+
+int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr,
+		      u16 *vlan_id)
+{
+	struct net_device *dev;
+	int ret = -EADDRNOTAVAIL;
+
+	if (dev_addr->bound_dev_if) {
+		dev = dev_get_by_index(&init_net, dev_addr->bound_dev_if);
+		if (!dev)
+			return -ENODEV;
+		ret = rdma_copy_addr(dev_addr, dev, NULL);
+		dev_put(dev);
+		return ret;
+	}
+
+	switch (addr->sa_family) {
+	case AF_INET:
+		dev = ip_dev_find(&init_net,
+			((struct sockaddr_in *) addr)->sin_addr.s_addr);
+
+		if (!dev)
+			return ret;
+
+		ret = rdma_copy_addr(dev_addr, dev, NULL);
+		if (vlan_id)
+			*vlan_id = rdma_vlan_dev_vlan_id(dev);
+		dev_put(dev);
+		break;
+
+#if IS_ENABLED(CONFIG_IPV6)
+	case AF_INET6:
+		rcu_read_lock();
+		for_each_netdev_rcu(&init_net, dev) {
+			if (ipv6_chk_addr(&init_net,
+					  &((struct sockaddr_in6 *) addr)->sin6_addr,
+					  dev, 1)) {
+				ret = rdma_copy_addr(dev_addr, dev, NULL);
+				if (vlan_id)
+					*vlan_id = rdma_vlan_dev_vlan_id(dev);
+				break;
+			}
+		}
+		rcu_read_unlock();
+		break;
+#endif
+	}
+	return ret;
+}
+EXPORT_SYMBOL(rdma_translate_ip);
+
+static void set_timeout(unsigned long time)
+{
+	unsigned long delay;
+
+	delay = time - jiffies;
+	if ((long)delay <= 0)
+		delay = 1;
+
+	mod_delayed_work(addr_wq, &work, delay);
+}
+
+static void queue_req(struct addr_req *req)
+{
+	struct addr_req *temp_req;
+
+	mutex_lock(&lock);
+	list_for_each_entry_reverse(temp_req, &req_list, list) {
+		if (time_after_eq(req->timeout, temp_req->timeout))
+			break;
+	}
+
+	list_add(&req->list, &temp_req->list);
+
+	if (req_list.next == &req->list)
+		set_timeout(req->timeout);
+	mutex_unlock(&lock);
+}
+
+static int dst_fetch_ha(struct dst_entry *dst, struct rdma_dev_addr *dev_addr, void *daddr)
+{
+	struct neighbour *n;
+	int ret;
+
+	n = dst_neigh_lookup(dst, daddr);
+
+	rcu_read_lock();
+	if (!n || !(n->nud_state & NUD_VALID)) {
+		if (n)
+			neigh_event_send(n, NULL);
+		ret = -ENODATA;
+	} else {
+		ret = rdma_copy_addr(dev_addr, dst->dev, n->ha);
+	}
+	rcu_read_unlock();
+
+	if (n)
+		neigh_release(n);
+
+	return ret;
+}
+
+static int addr4_resolve(struct sockaddr_in *src_in,
+			 struct sockaddr_in *dst_in,
+			 struct rdma_dev_addr *addr)
+{
+	__be32 src_ip = src_in->sin_addr.s_addr;
+	__be32 dst_ip = dst_in->sin_addr.s_addr;
+	struct rtable *rt;
+	struct flowi4 fl4;
+	int ret;
+
+	memset(&fl4, 0, sizeof(fl4));
+	fl4.daddr = dst_ip;
+	fl4.saddr = src_ip;
+	fl4.flowi4_oif = addr->bound_dev_if;
+	rt = ip_route_output_key(&init_net, &fl4);
+	if (IS_ERR(rt)) {
+		ret = PTR_ERR(rt);
+		goto out;
+	}
+	src_in->sin_family = AF_INET;
+	src_in->sin_addr.s_addr = fl4.saddr;
+
+	if (rt->dst.dev->flags & IFF_LOOPBACK) {
+		ret = rdma_translate_ip((struct sockaddr *)dst_in, addr, NULL);
+		if (!ret)
+			memcpy(addr->dst_dev_addr, addr->src_dev_addr, MAX_ADDR_LEN);
+		goto put;
+	}
+
+	/* If the device does ARP internally, return 'done' */
+	if (rt->dst.dev->flags & IFF_NOARP) {
+		ret = rdma_copy_addr(addr, rt->dst.dev, NULL);
+		goto put;
+	}
+
+	ret = dst_fetch_ha(&rt->dst, addr, &fl4.daddr);
+put:
+	ip_rt_put(rt);
+out:
+	return ret;
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static int addr6_resolve(struct sockaddr_in6 *src_in,
+			 struct sockaddr_in6 *dst_in,
+			 struct rdma_dev_addr *addr)
+{
+	struct flowi6 fl6;
+	struct dst_entry *dst;
+	int ret;
+
+	memset(&fl6, 0, sizeof fl6);
+	fl6.daddr = dst_in->sin6_addr;
+	fl6.saddr = src_in->sin6_addr;
+	fl6.flowi6_oif = addr->bound_dev_if;
+
+	dst = ip6_route_output(&init_net, NULL, &fl6);
+	if ((ret = dst->error))
+		goto put;
+
+	if (ipv6_addr_any(&fl6.saddr)) {
+		ret = ipv6_dev_get_saddr(&init_net, ip6_dst_idev(dst)->dev,
+					 &fl6.daddr, 0, &fl6.saddr);
+		if (ret)
+			goto put;
+
+		src_in->sin6_family = AF_INET6;
+		src_in->sin6_addr = fl6.saddr;
+	}
+
+	if (dst->dev->flags & IFF_LOOPBACK) {
+		ret = rdma_translate_ip((struct sockaddr *)dst_in, addr, NULL);
+		if (!ret)
+			memcpy(addr->dst_dev_addr, addr->src_dev_addr, MAX_ADDR_LEN);
+		goto put;
+	}
+
+	/* If the device does ARP internally, return 'done' */
+	if (dst->dev->flags & IFF_NOARP) {
+		ret = rdma_copy_addr(addr, dst->dev, NULL);
+		goto put;
+	}
+
+	ret = dst_fetch_ha(dst, addr, &fl6.daddr);
+put:
+	dst_release(dst);
+	return ret;
+}
+#else
+static int addr6_resolve(struct sockaddr_in6 *src_in,
+			 struct sockaddr_in6 *dst_in,
+			 struct rdma_dev_addr *addr)
+{
+	return -EADDRNOTAVAIL;
+}
+#endif
+
+static int addr_resolve(struct sockaddr *src_in,
+			struct sockaddr *dst_in,
+			struct rdma_dev_addr *addr)
+{
+	if (src_in->sa_family == AF_INET) {
+		return addr4_resolve((struct sockaddr_in *) src_in,
+			(struct sockaddr_in *) dst_in, addr);
+	} else
+		return addr6_resolve((struct sockaddr_in6 *) src_in,
+			(struct sockaddr_in6 *) dst_in, addr);
+}
+
+static void process_req(struct work_struct *work)
+{
+	struct addr_req *req, *temp_req;
+	struct sockaddr *src_in, *dst_in;
+	struct list_head done_list;
+
+	INIT_LIST_HEAD(&done_list);
+
+	mutex_lock(&lock);
+	list_for_each_entry_safe(req, temp_req, &req_list, list) {
+		if (req->status == -ENODATA) {
+			src_in = (struct sockaddr *) &req->src_addr;
+			dst_in = (struct sockaddr *) &req->dst_addr;
+			req->status = addr_resolve(src_in, dst_in, req->addr);
+			if (req->status && time_after_eq(jiffies, req->timeout))
+				req->status = -ETIMEDOUT;
+			else if (req->status == -ENODATA)
+				continue;
+		}
+		list_move_tail(&req->list, &done_list);
+	}
+
+	if (!list_empty(&req_list)) {
+		req = list_entry(req_list.next, struct addr_req, list);
+		set_timeout(req->timeout);
+	}
+	mutex_unlock(&lock);
+
+	list_for_each_entry_safe(req, temp_req, &done_list, list) {
+		list_del(&req->list);
+		req->callback(req->status, (struct sockaddr *) &req->src_addr,
+			req->addr, req->context);
+		put_client(req->client);
+		kfree(req);
+	}
+}
+
+int rdma_resolve_ip(struct rdma_addr_client *client,
+		    struct sockaddr *src_addr, struct sockaddr *dst_addr,
+		    struct rdma_dev_addr *addr, int timeout_ms,
+		    void (*callback)(int status, struct sockaddr *src_addr,
+				     struct rdma_dev_addr *addr, void *context),
+		    void *context)
+{
+	struct sockaddr *src_in, *dst_in;
+	struct addr_req *req;
+	int ret = 0;
+
+	req = kzalloc(sizeof *req, GFP_KERNEL);
+	if (!req)
+		return -ENOMEM;
+
+	src_in = (struct sockaddr *) &req->src_addr;
+	dst_in = (struct sockaddr *) &req->dst_addr;
+
+	if (src_addr) {
+		if (src_addr->sa_family != dst_addr->sa_family) {
+			ret = -EINVAL;
+			goto err;
+		}
+
+		memcpy(src_in, src_addr, rdma_addr_size(src_addr));
+	} else {
+		src_in->sa_family = dst_addr->sa_family;
+	}
+
+	memcpy(dst_in, dst_addr, rdma_addr_size(dst_addr));
+	req->addr = addr;
+	req->callback = callback;
+	req->context = context;
+	req->client = client;
+	atomic_inc(&client->refcount);
+
+	req->status = addr_resolve(src_in, dst_in, addr);
+	switch (req->status) {
+	case 0:
+		req->timeout = jiffies;
+		queue_req(req);
+		break;
+	case -ENODATA:
+		req->timeout = msecs_to_jiffies(timeout_ms) + jiffies;
+		queue_req(req);
+		break;
+	default:
+		ret = req->status;
+		atomic_dec(&client->refcount);
+		goto err;
+	}
+	return ret;
+err:
+	kfree(req);
+	return ret;
+}
+EXPORT_SYMBOL(rdma_resolve_ip);
+
+void rdma_addr_cancel(struct rdma_dev_addr *addr)
+{
+	struct addr_req *req, *temp_req;
+
+	mutex_lock(&lock);
+	list_for_each_entry_safe(req, temp_req, &req_list, list) {
+		if (req->addr == addr) {
+			req->status = -ECANCELED;
+			req->timeout = jiffies;
+			list_move(&req->list, &req_list);
+			set_timeout(req->timeout);
+			break;
+		}
+	}
+	mutex_unlock(&lock);
+}
+EXPORT_SYMBOL(rdma_addr_cancel);
+
+struct resolve_cb_context {
+	struct rdma_dev_addr *addr;
+	struct completion comp;
+};
+
+static void resolve_cb(int status, struct sockaddr *src_addr,
+	     struct rdma_dev_addr *addr, void *context)
+{
+	memcpy(((struct resolve_cb_context *)context)->addr, addr, sizeof(struct
+				rdma_dev_addr));
+	complete(&((struct resolve_cb_context *)context)->comp);
+}
+
+int rdma_addr_find_dmac_by_grh(union ib_gid *sgid, union ib_gid *dgid, u8 *dmac,
+			       u16 *vlan_id)
+{
+	int ret = 0;
+	struct rdma_dev_addr dev_addr;
+	struct resolve_cb_context ctx;
+	struct net_device *dev;
+
+	union {
+		struct sockaddr     _sockaddr;
+		struct sockaddr_in  _sockaddr_in;
+		struct sockaddr_in6 _sockaddr_in6;
+	} sgid_addr, dgid_addr;
+
+
+	ret = rdma_gid2ip(&sgid_addr._sockaddr, sgid);
+	if (ret)
+		return ret;
+
+	ret = rdma_gid2ip(&dgid_addr._sockaddr, dgid);
+	if (ret)
+		return ret;
+
+	memset(&dev_addr, 0, sizeof(dev_addr));
+
+	ctx.addr = &dev_addr;
+	init_completion(&ctx.comp);
+	ret = rdma_resolve_ip(&self, &sgid_addr._sockaddr, &dgid_addr._sockaddr,
+			&dev_addr, 1000, resolve_cb, &ctx);
+	if (ret)
+		return ret;
+
+	wait_for_completion(&ctx.comp);
+
+	memcpy(dmac, dev_addr.dst_dev_addr, ETH_ALEN);
+	dev = dev_get_by_index(&init_net, dev_addr.bound_dev_if);
+	if (!dev)
+		return -ENODEV;
+	if (vlan_id)
+		*vlan_id = rdma_vlan_dev_vlan_id(dev);
+	dev_put(dev);
+	return ret;
+}
+EXPORT_SYMBOL(rdma_addr_find_dmac_by_grh);
+
+int rdma_addr_find_smac_by_sgid(union ib_gid *sgid, u8 *smac, u16 *vlan_id)
+{
+	int ret = 0;
+	struct rdma_dev_addr dev_addr;
+	union {
+		struct sockaddr     _sockaddr;
+		struct sockaddr_in  _sockaddr_in;
+		struct sockaddr_in6 _sockaddr_in6;
+	} gid_addr;
+
+	ret = rdma_gid2ip(&gid_addr._sockaddr, sgid);
+
+	if (ret)
+		return ret;
+	memset(&dev_addr, 0, sizeof(dev_addr));
+	ret = rdma_translate_ip(&gid_addr._sockaddr, &dev_addr, vlan_id);
+	if (ret)
+		return ret;
+
+	memcpy(smac, dev_addr.src_dev_addr, ETH_ALEN);
+	return ret;
+}
+EXPORT_SYMBOL(rdma_addr_find_smac_by_sgid);
+
+static int netevent_callback(struct notifier_block *self, unsigned long event,
+	void *ctx)
+{
+	if (event == NETEVENT_NEIGH_UPDATE) {
+		struct neighbour *neigh = ctx;
+
+		if (neigh->nud_state & NUD_VALID) {
+			set_timeout(jiffies);
+		}
+	}
+	return 0;
+}
+
+static struct notifier_block nb = {
+	.notifier_call = netevent_callback
+};
+
+static int __init addr_init(void)
+{
+	addr_wq = create_singlethread_workqueue("ib_addr");
+	if (!addr_wq)
+		return -ENOMEM;
+
+	register_netevent_notifier(&nb);
+	rdma_addr_register_client(&self);
+	return 0;
+}
+
+static void __exit addr_cleanup(void)
+{
+	rdma_addr_unregister_client(&self);
+	unregister_netevent_notifier(&nb);
+	destroy_workqueue(addr_wq);
+}
+
+module_init(addr_init);
+module_exit(addr_cleanup);
diff --git a/drivers/infiniband/core/agent.c b/drivers/infiniband/core/agent.c
new file mode 100644
index 0000000..f6d2961
--- /dev/null
+++ b/drivers/infiniband/core/agent.c
@@ -0,0 +1,217 @@
+/*
+ * Copyright (c) 2004, 2005 Mellanox Technologies Ltd.  All rights reserved.
+ * Copyright (c) 2004, 2005 Infinicon Corporation.  All rights reserved.
+ * Copyright (c) 2004, 2005 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2004, 2005 Topspin Corporation.  All rights reserved.
+ * Copyright (c) 2004-2007 Voltaire Corporation.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/slab.h>
+#include <linux/string.h>
+
+#include "agent.h"
+#include "smi.h"
+#include "mad_priv.h"
+
+#define SPFX "ib_agent: "
+
+struct ib_agent_port_private {
+	struct list_head port_list;
+	struct ib_mad_agent *agent[2];
+};
+
+static DEFINE_SPINLOCK(ib_agent_port_list_lock);
+static LIST_HEAD(ib_agent_port_list);
+
+static struct ib_agent_port_private *
+__ib_get_agent_port(struct ib_device *device, int port_num)
+{
+	struct ib_agent_port_private *entry;
+
+	list_for_each_entry(entry, &ib_agent_port_list, port_list) {
+		if (entry->agent[1]->device == device &&
+		    entry->agent[1]->port_num == port_num)
+			return entry;
+	}
+	return NULL;
+}
+
+static struct ib_agent_port_private *
+ib_get_agent_port(struct ib_device *device, int port_num)
+{
+	struct ib_agent_port_private *entry;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ib_agent_port_list_lock, flags);
+	entry = __ib_get_agent_port(device, port_num);
+	spin_unlock_irqrestore(&ib_agent_port_list_lock, flags);
+	return entry;
+}
+
+void agent_send_response(struct ib_mad *mad, struct ib_grh *grh,
+			 struct ib_wc *wc, struct ib_device *device,
+			 int port_num, int qpn)
+{
+	struct ib_agent_port_private *port_priv;
+	struct ib_mad_agent *agent;
+	struct ib_mad_send_buf *send_buf;
+	struct ib_ah *ah;
+	struct ib_mad_send_wr_private *mad_send_wr;
+
+	if (device->node_type == RDMA_NODE_IB_SWITCH)
+		port_priv = ib_get_agent_port(device, 0);
+	else
+		port_priv = ib_get_agent_port(device, port_num);
+
+	if (!port_priv) {
+		dev_err(&device->dev, "Unable to find port agent\n");
+		return;
+	}
+
+	agent = port_priv->agent[qpn];
+	ah = ib_create_ah_from_wc(agent->qp->pd, wc, grh, port_num);
+	if (IS_ERR(ah)) {
+		dev_err(&device->dev, "ib_create_ah_from_wc error %ld\n",
+			PTR_ERR(ah));
+		return;
+	}
+
+	send_buf = ib_create_send_mad(agent, wc->src_qp, wc->pkey_index, 0,
+				      IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA,
+				      GFP_KERNEL);
+	if (IS_ERR(send_buf)) {
+		dev_err(&device->dev, "ib_create_send_mad error\n");
+		goto err1;
+	}
+
+	memcpy(send_buf->mad, mad, sizeof *mad);
+	send_buf->ah = ah;
+
+	if (device->node_type == RDMA_NODE_IB_SWITCH) {
+		mad_send_wr = container_of(send_buf,
+					   struct ib_mad_send_wr_private,
+					   send_buf);
+		mad_send_wr->send_wr.wr.ud.port_num = port_num;
+	}
+
+	if (ib_post_send_mad(send_buf, NULL)) {
+		dev_err(&device->dev, "ib_post_send_mad error\n");
+		goto err2;
+	}
+	return;
+err2:
+	ib_free_send_mad(send_buf);
+err1:
+	ib_destroy_ah(ah);
+}
+
+static void agent_send_handler(struct ib_mad_agent *mad_agent,
+			       struct ib_mad_send_wc *mad_send_wc)
+{
+	ib_destroy_ah(mad_send_wc->send_buf->ah);
+	ib_free_send_mad(mad_send_wc->send_buf);
+}
+
+int ib_agent_port_open(struct ib_device *device, int port_num)
+{
+	struct ib_agent_port_private *port_priv;
+	unsigned long flags;
+	int ret;
+
+	/* Create new device info */
+	port_priv = kzalloc(sizeof *port_priv, GFP_KERNEL);
+	if (!port_priv) {
+		dev_err(&device->dev, "No memory for ib_agent_port_private\n");
+		ret = -ENOMEM;
+		goto error1;
+	}
+
+	if (rdma_port_get_link_layer(device, port_num) == IB_LINK_LAYER_INFINIBAND) {
+		/* Obtain send only MAD agent for SMI QP */
+		port_priv->agent[0] = ib_register_mad_agent(device, port_num,
+							    IB_QPT_SMI, NULL, 0,
+							    &agent_send_handler,
+							    NULL, NULL, 0);
+		if (IS_ERR(port_priv->agent[0])) {
+			ret = PTR_ERR(port_priv->agent[0]);
+			goto error2;
+		}
+	}
+
+	/* Obtain send only MAD agent for GSI QP */
+	port_priv->agent[1] = ib_register_mad_agent(device, port_num,
+						    IB_QPT_GSI, NULL, 0,
+						    &agent_send_handler,
+						    NULL, NULL, 0);
+	if (IS_ERR(port_priv->agent[1])) {
+		ret = PTR_ERR(port_priv->agent[1]);
+		goto error3;
+	}
+
+	spin_lock_irqsave(&ib_agent_port_list_lock, flags);
+	list_add_tail(&port_priv->port_list, &ib_agent_port_list);
+	spin_unlock_irqrestore(&ib_agent_port_list_lock, flags);
+
+	return 0;
+
+error3:
+	if (port_priv->agent[0])
+		ib_unregister_mad_agent(port_priv->agent[0]);
+error2:
+	kfree(port_priv);
+error1:
+	return ret;
+}
+
+int ib_agent_port_close(struct ib_device *device, int port_num)
+{
+	struct ib_agent_port_private *port_priv;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ib_agent_port_list_lock, flags);
+	port_priv = __ib_get_agent_port(device, port_num);
+	if (port_priv == NULL) {
+		spin_unlock_irqrestore(&ib_agent_port_list_lock, flags);
+		dev_err(&device->dev, "Port %d not found\n", port_num);
+		return -ENODEV;
+	}
+	list_del(&port_priv->port_list);
+	spin_unlock_irqrestore(&ib_agent_port_list_lock, flags);
+
+	ib_unregister_mad_agent(port_priv->agent[1]);
+	if (port_priv->agent[0])
+		ib_unregister_mad_agent(port_priv->agent[0]);
+
+	kfree(port_priv);
+	return 0;
+}
diff --git a/drivers/infiniband/core/agent.h b/drivers/infiniband/core/agent.h
new file mode 100644
index 0000000..6669287
--- /dev/null
+++ b/drivers/infiniband/core/agent.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2004 Mellanox Technologies Ltd.  All rights reserved.
+ * Copyright (c) 2004 Infinicon Corporation.  All rights reserved.
+ * Copyright (c) 2004 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation.  All rights reserved.
+ * Copyright (c) 2004 Voltaire Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __AGENT_H_
+#define __AGENT_H_
+
+#include <linux/err.h>
+#include <rdma/ib_mad.h>
+
+extern int ib_agent_port_open(struct ib_device *device, int port_num);
+
+extern int ib_agent_port_close(struct ib_device *device, int port_num);
+
+extern void agent_send_response(struct ib_mad *mad, struct ib_grh *grh,
+				struct ib_wc *wc, struct ib_device *device,
+				int port_num, int qpn);
+
+#endif	/* __AGENT_H_ */
diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c
new file mode 100644
index 0000000..80f6cf2
--- /dev/null
+++ b/drivers/infiniband/core/cache.c
@@ -0,0 +1,439 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Intel Corporation. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+
+#include <rdma/ib_cache.h>
+
+#include "core_priv.h"
+
+struct ib_pkey_cache {
+	int             table_len;
+	u16             table[0];
+};
+
+struct ib_gid_cache {
+	int             table_len;
+	union ib_gid    table[0];
+};
+
+struct ib_update_work {
+	struct work_struct work;
+	struct ib_device  *device;
+	u8                 port_num;
+};
+
+static inline int start_port(struct ib_device *device)
+{
+	return (device->node_type == RDMA_NODE_IB_SWITCH) ? 0 : 1;
+}
+
+static inline int end_port(struct ib_device *device)
+{
+	return (device->node_type == RDMA_NODE_IB_SWITCH) ?
+		0 : device->phys_port_cnt;
+}
+
+int ib_get_cached_gid(struct ib_device *device,
+		      u8                port_num,
+		      int               index,
+		      union ib_gid     *gid)
+{
+	struct ib_gid_cache *cache;
+	unsigned long flags;
+	int ret = 0;
+
+	if (port_num < start_port(device) || port_num > end_port(device))
+		return -EINVAL;
+
+	read_lock_irqsave(&device->cache.lock, flags);
+
+	cache = device->cache.gid_cache[port_num - start_port(device)];
+
+	if (index < 0 || index >= cache->table_len)
+		ret = -EINVAL;
+	else
+		*gid = cache->table[index];
+
+	read_unlock_irqrestore(&device->cache.lock, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL(ib_get_cached_gid);
+
+int ib_find_cached_gid(struct ib_device *device,
+		       union ib_gid	*gid,
+		       u8               *port_num,
+		       u16              *index)
+{
+	struct ib_gid_cache *cache;
+	unsigned long flags;
+	int p, i;
+	int ret = -ENOENT;
+
+	*port_num = -1;
+	if (index)
+		*index = -1;
+
+	read_lock_irqsave(&device->cache.lock, flags);
+
+	for (p = 0; p <= end_port(device) - start_port(device); ++p) {
+		cache = device->cache.gid_cache[p];
+		for (i = 0; i < cache->table_len; ++i) {
+			if (!memcmp(gid, &cache->table[i], sizeof *gid)) {
+				*port_num = p + start_port(device);
+				if (index)
+					*index = i;
+				ret = 0;
+				goto found;
+			}
+		}
+	}
+found:
+	read_unlock_irqrestore(&device->cache.lock, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL(ib_find_cached_gid);
+
+int ib_get_cached_pkey(struct ib_device *device,
+		       u8                port_num,
+		       int               index,
+		       u16              *pkey)
+{
+	struct ib_pkey_cache *cache;
+	unsigned long flags;
+	int ret = 0;
+
+	if (port_num < start_port(device) || port_num > end_port(device))
+		return -EINVAL;
+
+	read_lock_irqsave(&device->cache.lock, flags);
+
+	cache = device->cache.pkey_cache[port_num - start_port(device)];
+
+	if (index < 0 || index >= cache->table_len)
+		ret = -EINVAL;
+	else
+		*pkey = cache->table[index];
+
+	read_unlock_irqrestore(&device->cache.lock, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL(ib_get_cached_pkey);
+
+int ib_find_cached_pkey(struct ib_device *device,
+			u8                port_num,
+			u16               pkey,
+			u16              *index)
+{
+	struct ib_pkey_cache *cache;
+	unsigned long flags;
+	int i;
+	int ret = -ENOENT;
+	int partial_ix = -1;
+
+	if (port_num < start_port(device) || port_num > end_port(device))
+		return -EINVAL;
+
+	read_lock_irqsave(&device->cache.lock, flags);
+
+	cache = device->cache.pkey_cache[port_num - start_port(device)];
+
+	*index = -1;
+
+	for (i = 0; i < cache->table_len; ++i)
+		if ((cache->table[i] & 0x7fff) == (pkey & 0x7fff)) {
+			if (cache->table[i] & 0x8000) {
+				*index = i;
+				ret = 0;
+				break;
+			} else
+				partial_ix = i;
+		}
+
+	if (ret && partial_ix >= 0) {
+		*index = partial_ix;
+		ret = 0;
+	}
+
+	read_unlock_irqrestore(&device->cache.lock, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL(ib_find_cached_pkey);
+
+int ib_find_exact_cached_pkey(struct ib_device *device,
+			      u8                port_num,
+			      u16               pkey,
+			      u16              *index)
+{
+	struct ib_pkey_cache *cache;
+	unsigned long flags;
+	int i;
+	int ret = -ENOENT;
+
+	if (port_num < start_port(device) || port_num > end_port(device))
+		return -EINVAL;
+
+	read_lock_irqsave(&device->cache.lock, flags);
+
+	cache = device->cache.pkey_cache[port_num - start_port(device)];
+
+	*index = -1;
+
+	for (i = 0; i < cache->table_len; ++i)
+		if (cache->table[i] == pkey) {
+			*index = i;
+			ret = 0;
+			break;
+		}
+
+	read_unlock_irqrestore(&device->cache.lock, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL(ib_find_exact_cached_pkey);
+
+int ib_get_cached_lmc(struct ib_device *device,
+		      u8                port_num,
+		      u8                *lmc)
+{
+	unsigned long flags;
+	int ret = 0;
+
+	if (port_num < start_port(device) || port_num > end_port(device))
+		return -EINVAL;
+
+	read_lock_irqsave(&device->cache.lock, flags);
+	*lmc = device->cache.lmc_cache[port_num - start_port(device)];
+	read_unlock_irqrestore(&device->cache.lock, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL(ib_get_cached_lmc);
+
+static void ib_cache_update(struct ib_device *device,
+			    u8                port)
+{
+	struct ib_port_attr       *tprops = NULL;
+	struct ib_pkey_cache      *pkey_cache = NULL, *old_pkey_cache;
+	struct ib_gid_cache       *gid_cache = NULL, *old_gid_cache;
+	int                        i;
+	int                        ret;
+
+	tprops = kmalloc(sizeof *tprops, GFP_KERNEL);
+	if (!tprops)
+		return;
+
+	ret = ib_query_port(device, port, tprops);
+	if (ret) {
+		printk(KERN_WARNING "ib_query_port failed (%d) for %s\n",
+		       ret, device->name);
+		goto err;
+	}
+
+	pkey_cache = kmalloc(sizeof *pkey_cache + tprops->pkey_tbl_len *
+			     sizeof *pkey_cache->table, GFP_KERNEL);
+	if (!pkey_cache)
+		goto err;
+
+	pkey_cache->table_len = tprops->pkey_tbl_len;
+
+	gid_cache = kmalloc(sizeof *gid_cache + tprops->gid_tbl_len *
+			    sizeof *gid_cache->table, GFP_KERNEL);
+	if (!gid_cache)
+		goto err;
+
+	gid_cache->table_len = tprops->gid_tbl_len;
+
+	for (i = 0; i < pkey_cache->table_len; ++i) {
+		ret = ib_query_pkey(device, port, i, pkey_cache->table + i);
+		if (ret) {
+			printk(KERN_WARNING "ib_query_pkey failed (%d) for %s (index %d)\n",
+			       ret, device->name, i);
+			goto err;
+		}
+	}
+
+	for (i = 0; i < gid_cache->table_len; ++i) {
+		ret = ib_query_gid(device, port, i, gid_cache->table + i);
+		if (ret) {
+			printk(KERN_WARNING "ib_query_gid failed (%d) for %s (index %d)\n",
+			       ret, device->name, i);
+			goto err;
+		}
+	}
+
+	write_lock_irq(&device->cache.lock);
+
+	old_pkey_cache = device->cache.pkey_cache[port - start_port(device)];
+	old_gid_cache  = device->cache.gid_cache [port - start_port(device)];
+
+	device->cache.pkey_cache[port - start_port(device)] = pkey_cache;
+	device->cache.gid_cache [port - start_port(device)] = gid_cache;
+
+	device->cache.lmc_cache[port - start_port(device)] = tprops->lmc;
+
+	write_unlock_irq(&device->cache.lock);
+
+	kfree(old_pkey_cache);
+	kfree(old_gid_cache);
+	kfree(tprops);
+	return;
+
+err:
+	kfree(pkey_cache);
+	kfree(gid_cache);
+	kfree(tprops);
+}
+
+static void ib_cache_task(struct work_struct *_work)
+{
+	struct ib_update_work *work =
+		container_of(_work, struct ib_update_work, work);
+
+	ib_cache_update(work->device, work->port_num);
+	kfree(work);
+}
+
+static void ib_cache_event(struct ib_event_handler *handler,
+			   struct ib_event *event)
+{
+	struct ib_update_work *work;
+
+	if (event->event == IB_EVENT_PORT_ERR    ||
+	    event->event == IB_EVENT_PORT_ACTIVE ||
+	    event->event == IB_EVENT_LID_CHANGE  ||
+	    event->event == IB_EVENT_PKEY_CHANGE ||
+	    event->event == IB_EVENT_SM_CHANGE   ||
+	    event->event == IB_EVENT_CLIENT_REREGISTER ||
+	    event->event == IB_EVENT_GID_CHANGE) {
+		work = kmalloc(sizeof *work, GFP_ATOMIC);
+		if (work) {
+			INIT_WORK(&work->work, ib_cache_task);
+			work->device   = event->device;
+			work->port_num = event->element.port_num;
+			queue_work(ib_wq, &work->work);
+		}
+	}
+}
+
+static void ib_cache_setup_one(struct ib_device *device)
+{
+	int p;
+
+	rwlock_init(&device->cache.lock);
+
+	device->cache.pkey_cache =
+		kmalloc(sizeof *device->cache.pkey_cache *
+			(end_port(device) - start_port(device) + 1), GFP_KERNEL);
+	device->cache.gid_cache =
+		kmalloc(sizeof *device->cache.gid_cache *
+			(end_port(device) - start_port(device) + 1), GFP_KERNEL);
+
+	device->cache.lmc_cache = kmalloc(sizeof *device->cache.lmc_cache *
+					  (end_port(device) -
+					   start_port(device) + 1),
+					  GFP_KERNEL);
+
+	if (!device->cache.pkey_cache || !device->cache.gid_cache ||
+	    !device->cache.lmc_cache) {
+		printk(KERN_WARNING "Couldn't allocate cache "
+		       "for %s\n", device->name);
+		goto err;
+	}
+
+	for (p = 0; p <= end_port(device) - start_port(device); ++p) {
+		device->cache.pkey_cache[p] = NULL;
+		device->cache.gid_cache [p] = NULL;
+		ib_cache_update(device, p + start_port(device));
+	}
+
+	INIT_IB_EVENT_HANDLER(&device->cache.event_handler,
+			      device, ib_cache_event);
+	if (ib_register_event_handler(&device->cache.event_handler))
+		goto err_cache;
+
+	return;
+
+err_cache:
+	for (p = 0; p <= end_port(device) - start_port(device); ++p) {
+		kfree(device->cache.pkey_cache[p]);
+		kfree(device->cache.gid_cache[p]);
+	}
+
+err:
+	kfree(device->cache.pkey_cache);
+	kfree(device->cache.gid_cache);
+	kfree(device->cache.lmc_cache);
+}
+
+static void ib_cache_cleanup_one(struct ib_device *device)
+{
+	int p;
+
+	ib_unregister_event_handler(&device->cache.event_handler);
+	flush_workqueue(ib_wq);
+
+	for (p = 0; p <= end_port(device) - start_port(device); ++p) {
+		kfree(device->cache.pkey_cache[p]);
+		kfree(device->cache.gid_cache[p]);
+	}
+
+	kfree(device->cache.pkey_cache);
+	kfree(device->cache.gid_cache);
+	kfree(device->cache.lmc_cache);
+}
+
+static struct ib_client cache_client = {
+	.name   = "cache",
+	.add    = ib_cache_setup_one,
+	.remove = ib_cache_cleanup_one
+};
+
+int __init ib_cache_setup(void)
+{
+	return ib_register_client(&cache_client);
+}
+
+void __exit ib_cache_cleanup(void)
+{
+	ib_unregister_client(&cache_client);
+}
diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
new file mode 100644
index 0000000..e28a494
--- /dev/null
+++ b/drivers/infiniband/core/cm.c
@@ -0,0 +1,3932 @@
+/*
+ * Copyright (c) 2004-2007 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation.  All rights reserved.
+ * Copyright (c) 2004, 2005 Voltaire Corporation.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/completion.h>
+#include <linux/dma-mapping.h>
+#include <linux/device.h>
+#include <linux/module.h>
+#include <linux/err.h>
+#include <linux/idr.h>
+#include <linux/interrupt.h>
+#include <linux/random.h>
+#include <linux/rbtree.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/sysfs.h>
+#include <linux/workqueue.h>
+#include <linux/kdev_t.h>
+#include <linux/etherdevice.h>
+
+#include <rdma/ib_cache.h>
+#include <rdma/ib_cm.h>
+#include "cm_msgs.h"
+
+MODULE_AUTHOR("Sean Hefty");
+MODULE_DESCRIPTION("InfiniBand CM");
+MODULE_LICENSE("Dual BSD/GPL");
+
+static void cm_add_one(struct ib_device *device);
+static void cm_remove_one(struct ib_device *device);
+
+static struct ib_client cm_client = {
+	.name   = "cm",
+	.add    = cm_add_one,
+	.remove = cm_remove_one
+};
+
+static struct ib_cm {
+	spinlock_t lock;
+	struct list_head device_list;
+	rwlock_t device_lock;
+	struct rb_root listen_service_table;
+	u64 listen_service_id;
+	/* struct rb_root peer_service_table; todo: fix peer to peer */
+	struct rb_root remote_qp_table;
+	struct rb_root remote_id_table;
+	struct rb_root remote_sidr_table;
+	struct idr local_id_table;
+	__be32 random_id_operand;
+	struct list_head timewait_list;
+	struct workqueue_struct *wq;
+} cm;
+
+/* Counter indexes ordered by attribute ID */
+enum {
+	CM_REQ_COUNTER,
+	CM_MRA_COUNTER,
+	CM_REJ_COUNTER,
+	CM_REP_COUNTER,
+	CM_RTU_COUNTER,
+	CM_DREQ_COUNTER,
+	CM_DREP_COUNTER,
+	CM_SIDR_REQ_COUNTER,
+	CM_SIDR_REP_COUNTER,
+	CM_LAP_COUNTER,
+	CM_APR_COUNTER,
+	CM_ATTR_COUNT,
+	CM_ATTR_ID_OFFSET = 0x0010,
+};
+
+enum {
+	CM_XMIT,
+	CM_XMIT_RETRIES,
+	CM_RECV,
+	CM_RECV_DUPLICATES,
+	CM_COUNTER_GROUPS
+};
+
+static char const counter_group_names[CM_COUNTER_GROUPS]
+				     [sizeof("cm_rx_duplicates")] = {
+	"cm_tx_msgs", "cm_tx_retries",
+	"cm_rx_msgs", "cm_rx_duplicates"
+};
+
+struct cm_counter_group {
+	struct kobject obj;
+	atomic_long_t counter[CM_ATTR_COUNT];
+};
+
+struct cm_counter_attribute {
+	struct attribute attr;
+	int index;
+};
+
+#define CM_COUNTER_ATTR(_name, _index) \
+struct cm_counter_attribute cm_##_name##_counter_attr = { \
+	.attr = { .name = __stringify(_name), .mode = 0444 }, \
+	.index = _index \
+}
+
+static CM_COUNTER_ATTR(req, CM_REQ_COUNTER);
+static CM_COUNTER_ATTR(mra, CM_MRA_COUNTER);
+static CM_COUNTER_ATTR(rej, CM_REJ_COUNTER);
+static CM_COUNTER_ATTR(rep, CM_REP_COUNTER);
+static CM_COUNTER_ATTR(rtu, CM_RTU_COUNTER);
+static CM_COUNTER_ATTR(dreq, CM_DREQ_COUNTER);
+static CM_COUNTER_ATTR(drep, CM_DREP_COUNTER);
+static CM_COUNTER_ATTR(sidr_req, CM_SIDR_REQ_COUNTER);
+static CM_COUNTER_ATTR(sidr_rep, CM_SIDR_REP_COUNTER);
+static CM_COUNTER_ATTR(lap, CM_LAP_COUNTER);
+static CM_COUNTER_ATTR(apr, CM_APR_COUNTER);
+
+static struct attribute *cm_counter_default_attrs[] = {
+	&cm_req_counter_attr.attr,
+	&cm_mra_counter_attr.attr,
+	&cm_rej_counter_attr.attr,
+	&cm_rep_counter_attr.attr,
+	&cm_rtu_counter_attr.attr,
+	&cm_dreq_counter_attr.attr,
+	&cm_drep_counter_attr.attr,
+	&cm_sidr_req_counter_attr.attr,
+	&cm_sidr_rep_counter_attr.attr,
+	&cm_lap_counter_attr.attr,
+	&cm_apr_counter_attr.attr,
+	NULL
+};
+
+struct cm_port {
+	struct cm_device *cm_dev;
+	struct ib_mad_agent *mad_agent;
+	struct kobject port_obj;
+	u8 port_num;
+	struct cm_counter_group counter_group[CM_COUNTER_GROUPS];
+};
+
+struct cm_device {
+	struct list_head list;
+	struct ib_device *ib_device;
+	struct device *device;
+	u8 ack_delay;
+	struct cm_port *port[0];
+};
+
+struct cm_av {
+	struct cm_port *port;
+	union ib_gid dgid;
+	struct ib_ah_attr ah_attr;
+	u16 pkey_index;
+	u8 timeout;
+	u8  valid;
+	u8  smac[ETH_ALEN];
+};
+
+struct cm_work {
+	struct delayed_work work;
+	struct list_head list;
+	struct cm_port *port;
+	struct ib_mad_recv_wc *mad_recv_wc;	/* Received MADs */
+	__be32 local_id;			/* Established / timewait */
+	__be32 remote_id;
+	struct ib_cm_event cm_event;
+	struct ib_sa_path_rec path[0];
+};
+
+struct cm_timewait_info {
+	struct cm_work work;			/* Must be first. */
+	struct list_head list;
+	struct rb_node remote_qp_node;
+	struct rb_node remote_id_node;
+	__be64 remote_ca_guid;
+	__be32 remote_qpn;
+	u8 inserted_remote_qp;
+	u8 inserted_remote_id;
+};
+
+struct cm_id_private {
+	struct ib_cm_id	id;
+
+	struct rb_node service_node;
+	struct rb_node sidr_id_node;
+	spinlock_t lock;	/* Do not acquire inside cm.lock */
+	struct completion comp;
+	atomic_t refcount;
+
+	struct ib_mad_send_buf *msg;
+	struct cm_timewait_info *timewait_info;
+	/* todo: use alternate port on send failure */
+	struct cm_av av;
+	struct cm_av alt_av;
+	struct ib_cm_compare_data *compare_data;
+
+	void *private_data;
+	__be64 tid;
+	__be32 local_qpn;
+	__be32 remote_qpn;
+	enum ib_qp_type qp_type;
+	__be32 sq_psn;
+	__be32 rq_psn;
+	int timeout_ms;
+	enum ib_mtu path_mtu;
+	__be16 pkey;
+	u8 private_data_len;
+	u8 max_cm_retries;
+	u8 peer_to_peer;
+	u8 responder_resources;
+	u8 initiator_depth;
+	u8 retry_count;
+	u8 rnr_retry_count;
+	u8 service_timeout;
+	u8 target_ack_delay;
+
+	struct list_head work_list;
+	atomic_t work_count;
+};
+
+static void cm_work_handler(struct work_struct *work);
+
+static inline void cm_deref_id(struct cm_id_private *cm_id_priv)
+{
+	if (atomic_dec_and_test(&cm_id_priv->refcount))
+		complete(&cm_id_priv->comp);
+}
+
+static int cm_alloc_msg(struct cm_id_private *cm_id_priv,
+			struct ib_mad_send_buf **msg)
+{
+	struct ib_mad_agent *mad_agent;
+	struct ib_mad_send_buf *m;
+	struct ib_ah *ah;
+
+	mad_agent = cm_id_priv->av.port->mad_agent;
+	ah = ib_create_ah(mad_agent->qp->pd, &cm_id_priv->av.ah_attr);
+	if (IS_ERR(ah))
+		return PTR_ERR(ah);
+
+	m = ib_create_send_mad(mad_agent, cm_id_priv->id.remote_cm_qpn,
+			       cm_id_priv->av.pkey_index,
+			       0, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA,
+			       GFP_ATOMIC);
+	if (IS_ERR(m)) {
+		ib_destroy_ah(ah);
+		return PTR_ERR(m);
+	}
+
+	/* Timeout set by caller if response is expected. */
+	m->ah = ah;
+	m->retries = cm_id_priv->max_cm_retries;
+
+	atomic_inc(&cm_id_priv->refcount);
+	m->context[0] = cm_id_priv;
+	*msg = m;
+	return 0;
+}
+
+static int cm_alloc_response_msg(struct cm_port *port,
+				 struct ib_mad_recv_wc *mad_recv_wc,
+				 struct ib_mad_send_buf **msg)
+{
+	struct ib_mad_send_buf *m;
+	struct ib_ah *ah;
+
+	ah = ib_create_ah_from_wc(port->mad_agent->qp->pd, mad_recv_wc->wc,
+				  mad_recv_wc->recv_buf.grh, port->port_num);
+	if (IS_ERR(ah))
+		return PTR_ERR(ah);
+
+	m = ib_create_send_mad(port->mad_agent, 1, mad_recv_wc->wc->pkey_index,
+			       0, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA,
+			       GFP_ATOMIC);
+	if (IS_ERR(m)) {
+		ib_destroy_ah(ah);
+		return PTR_ERR(m);
+	}
+	m->ah = ah;
+	*msg = m;
+	return 0;
+}
+
+static void cm_free_msg(struct ib_mad_send_buf *msg)
+{
+	ib_destroy_ah(msg->ah);
+	if (msg->context[0])
+		cm_deref_id(msg->context[0]);
+	ib_free_send_mad(msg);
+}
+
+static void * cm_copy_private_data(const void *private_data,
+				   u8 private_data_len)
+{
+	void *data;
+
+	if (!private_data || !private_data_len)
+		return NULL;
+
+	data = kmemdup(private_data, private_data_len, GFP_KERNEL);
+	if (!data)
+		return ERR_PTR(-ENOMEM);
+
+	return data;
+}
+
+static void cm_set_private_data(struct cm_id_private *cm_id_priv,
+				 void *private_data, u8 private_data_len)
+{
+	if (cm_id_priv->private_data && cm_id_priv->private_data_len)
+		kfree(cm_id_priv->private_data);
+
+	cm_id_priv->private_data = private_data;
+	cm_id_priv->private_data_len = private_data_len;
+}
+
+static void cm_init_av_for_response(struct cm_port *port, struct ib_wc *wc,
+				    struct ib_grh *grh, struct cm_av *av)
+{
+	av->port = port;
+	av->pkey_index = wc->pkey_index;
+	ib_init_ah_from_wc(port->cm_dev->ib_device, port->port_num, wc,
+			   grh, &av->ah_attr);
+}
+
+static int cm_init_av_by_path(struct ib_sa_path_rec *path, struct cm_av *av)
+{
+	struct cm_device *cm_dev;
+	struct cm_port *port = NULL;
+	unsigned long flags;
+	int ret;
+	u8 p;
+
+	read_lock_irqsave(&cm.device_lock, flags);
+	list_for_each_entry(cm_dev, &cm.device_list, list) {
+		if (!ib_find_cached_gid(cm_dev->ib_device, &path->sgid,
+					&p, NULL)) {
+			port = cm_dev->port[p-1];
+			break;
+		}
+	}
+	read_unlock_irqrestore(&cm.device_lock, flags);
+
+	if (!port)
+		return -EINVAL;
+
+	ret = ib_find_cached_pkey(cm_dev->ib_device, port->port_num,
+				  be16_to_cpu(path->pkey), &av->pkey_index);
+	if (ret)
+		return ret;
+
+	av->port = port;
+	ib_init_ah_from_path(cm_dev->ib_device, port->port_num, path,
+			     &av->ah_attr);
+	av->timeout = path->packet_life_time + 1;
+	memcpy(av->smac, path->smac, sizeof(av->smac));
+
+	av->valid = 1;
+	return 0;
+}
+
+static int cm_alloc_id(struct cm_id_private *cm_id_priv)
+{
+	unsigned long flags;
+	int id;
+
+	idr_preload(GFP_KERNEL);
+	spin_lock_irqsave(&cm.lock, flags);
+
+	id = idr_alloc_cyclic(&cm.local_id_table, cm_id_priv, 0, 0, GFP_NOWAIT);
+
+	spin_unlock_irqrestore(&cm.lock, flags);
+	idr_preload_end();
+
+	cm_id_priv->id.local_id = (__force __be32)id ^ cm.random_id_operand;
+	return id < 0 ? id : 0;
+}
+
+static void cm_free_id(__be32 local_id)
+{
+	spin_lock_irq(&cm.lock);
+	idr_remove(&cm.local_id_table,
+		   (__force int) (local_id ^ cm.random_id_operand));
+	spin_unlock_irq(&cm.lock);
+}
+
+static struct cm_id_private * cm_get_id(__be32 local_id, __be32 remote_id)
+{
+	struct cm_id_private *cm_id_priv;
+
+	cm_id_priv = idr_find(&cm.local_id_table,
+			      (__force int) (local_id ^ cm.random_id_operand));
+	if (cm_id_priv) {
+		if (cm_id_priv->id.remote_id == remote_id)
+			atomic_inc(&cm_id_priv->refcount);
+		else
+			cm_id_priv = NULL;
+	}
+
+	return cm_id_priv;
+}
+
+static struct cm_id_private * cm_acquire_id(__be32 local_id, __be32 remote_id)
+{
+	struct cm_id_private *cm_id_priv;
+
+	spin_lock_irq(&cm.lock);
+	cm_id_priv = cm_get_id(local_id, remote_id);
+	spin_unlock_irq(&cm.lock);
+
+	return cm_id_priv;
+}
+
+static void cm_mask_copy(u8 *dst, u8 *src, u8 *mask)
+{
+	int i;
+
+	for (i = 0; i < IB_CM_COMPARE_SIZE / sizeof(unsigned long); i++)
+		((unsigned long *) dst)[i] = ((unsigned long *) src)[i] &
+					     ((unsigned long *) mask)[i];
+}
+
+static int cm_compare_data(struct ib_cm_compare_data *src_data,
+			   struct ib_cm_compare_data *dst_data)
+{
+	u8 src[IB_CM_COMPARE_SIZE];
+	u8 dst[IB_CM_COMPARE_SIZE];
+
+	if (!src_data || !dst_data)
+		return 0;
+
+	cm_mask_copy(src, src_data->data, dst_data->mask);
+	cm_mask_copy(dst, dst_data->data, src_data->mask);
+	return memcmp(src, dst, IB_CM_COMPARE_SIZE);
+}
+
+static int cm_compare_private_data(u8 *private_data,
+				   struct ib_cm_compare_data *dst_data)
+{
+	u8 src[IB_CM_COMPARE_SIZE];
+
+	if (!dst_data)
+		return 0;
+
+	cm_mask_copy(src, private_data, dst_data->mask);
+	return memcmp(src, dst_data->data, IB_CM_COMPARE_SIZE);
+}
+
+/*
+ * Trivial helpers to strip endian annotation and compare; the
+ * endianness doesn't actually matter since we just need a stable
+ * order for the RB tree.
+ */
+static int be32_lt(__be32 a, __be32 b)
+{
+	return (__force u32) a < (__force u32) b;
+}
+
+static int be32_gt(__be32 a, __be32 b)
+{
+	return (__force u32) a > (__force u32) b;
+}
+
+static int be64_lt(__be64 a, __be64 b)
+{
+	return (__force u64) a < (__force u64) b;
+}
+
+static int be64_gt(__be64 a, __be64 b)
+{
+	return (__force u64) a > (__force u64) b;
+}
+
+static struct cm_id_private * cm_insert_listen(struct cm_id_private *cm_id_priv)
+{
+	struct rb_node **link = &cm.listen_service_table.rb_node;
+	struct rb_node *parent = NULL;
+	struct cm_id_private *cur_cm_id_priv;
+	__be64 service_id = cm_id_priv->id.service_id;
+	__be64 service_mask = cm_id_priv->id.service_mask;
+	int data_cmp;
+
+	while (*link) {
+		parent = *link;
+		cur_cm_id_priv = rb_entry(parent, struct cm_id_private,
+					  service_node);
+		data_cmp = cm_compare_data(cm_id_priv->compare_data,
+					   cur_cm_id_priv->compare_data);
+		if ((cur_cm_id_priv->id.service_mask & service_id) ==
+		    (service_mask & cur_cm_id_priv->id.service_id) &&
+		    (cm_id_priv->id.device == cur_cm_id_priv->id.device) &&
+		    !data_cmp)
+			return cur_cm_id_priv;
+
+		if (cm_id_priv->id.device < cur_cm_id_priv->id.device)
+			link = &(*link)->rb_left;
+		else if (cm_id_priv->id.device > cur_cm_id_priv->id.device)
+			link = &(*link)->rb_right;
+		else if (be64_lt(service_id, cur_cm_id_priv->id.service_id))
+			link = &(*link)->rb_left;
+		else if (be64_gt(service_id, cur_cm_id_priv->id.service_id))
+			link = &(*link)->rb_right;
+		else if (data_cmp < 0)
+			link = &(*link)->rb_left;
+		else
+			link = &(*link)->rb_right;
+	}
+	rb_link_node(&cm_id_priv->service_node, parent, link);
+	rb_insert_color(&cm_id_priv->service_node, &cm.listen_service_table);
+	return NULL;
+}
+
+static struct cm_id_private * cm_find_listen(struct ib_device *device,
+					     __be64 service_id,
+					     u8 *private_data)
+{
+	struct rb_node *node = cm.listen_service_table.rb_node;
+	struct cm_id_private *cm_id_priv;
+	int data_cmp;
+
+	while (node) {
+		cm_id_priv = rb_entry(node, struct cm_id_private, service_node);
+		data_cmp = cm_compare_private_data(private_data,
+						   cm_id_priv->compare_data);
+		if ((cm_id_priv->id.service_mask & service_id) ==
+		     cm_id_priv->id.service_id &&
+		    (cm_id_priv->id.device == device) && !data_cmp)
+			return cm_id_priv;
+
+		if (device < cm_id_priv->id.device)
+			node = node->rb_left;
+		else if (device > cm_id_priv->id.device)
+			node = node->rb_right;
+		else if (be64_lt(service_id, cm_id_priv->id.service_id))
+			node = node->rb_left;
+		else if (be64_gt(service_id, cm_id_priv->id.service_id))
+			node = node->rb_right;
+		else if (data_cmp < 0)
+			node = node->rb_left;
+		else
+			node = node->rb_right;
+	}
+	return NULL;
+}
+
+static struct cm_timewait_info * cm_insert_remote_id(struct cm_timewait_info
+						     *timewait_info)
+{
+	struct rb_node **link = &cm.remote_id_table.rb_node;
+	struct rb_node *parent = NULL;
+	struct cm_timewait_info *cur_timewait_info;
+	__be64 remote_ca_guid = timewait_info->remote_ca_guid;
+	__be32 remote_id = timewait_info->work.remote_id;
+
+	while (*link) {
+		parent = *link;
+		cur_timewait_info = rb_entry(parent, struct cm_timewait_info,
+					     remote_id_node);
+		if (be32_lt(remote_id, cur_timewait_info->work.remote_id))
+			link = &(*link)->rb_left;
+		else if (be32_gt(remote_id, cur_timewait_info->work.remote_id))
+			link = &(*link)->rb_right;
+		else if (be64_lt(remote_ca_guid, cur_timewait_info->remote_ca_guid))
+			link = &(*link)->rb_left;
+		else if (be64_gt(remote_ca_guid, cur_timewait_info->remote_ca_guid))
+			link = &(*link)->rb_right;
+		else
+			return cur_timewait_info;
+	}
+	timewait_info->inserted_remote_id = 1;
+	rb_link_node(&timewait_info->remote_id_node, parent, link);
+	rb_insert_color(&timewait_info->remote_id_node, &cm.remote_id_table);
+	return NULL;
+}
+
+static struct cm_timewait_info * cm_find_remote_id(__be64 remote_ca_guid,
+						   __be32 remote_id)
+{
+	struct rb_node *node = cm.remote_id_table.rb_node;
+	struct cm_timewait_info *timewait_info;
+
+	while (node) {
+		timewait_info = rb_entry(node, struct cm_timewait_info,
+					 remote_id_node);
+		if (be32_lt(remote_id, timewait_info->work.remote_id))
+			node = node->rb_left;
+		else if (be32_gt(remote_id, timewait_info->work.remote_id))
+			node = node->rb_right;
+		else if (be64_lt(remote_ca_guid, timewait_info->remote_ca_guid))
+			node = node->rb_left;
+		else if (be64_gt(remote_ca_guid, timewait_info->remote_ca_guid))
+			node = node->rb_right;
+		else
+			return timewait_info;
+	}
+	return NULL;
+}
+
+static struct cm_timewait_info * cm_insert_remote_qpn(struct cm_timewait_info
+						      *timewait_info)
+{
+	struct rb_node **link = &cm.remote_qp_table.rb_node;
+	struct rb_node *parent = NULL;
+	struct cm_timewait_info *cur_timewait_info;
+	__be64 remote_ca_guid = timewait_info->remote_ca_guid;
+	__be32 remote_qpn = timewait_info->remote_qpn;
+
+	while (*link) {
+		parent = *link;
+		cur_timewait_info = rb_entry(parent, struct cm_timewait_info,
+					     remote_qp_node);
+		if (be32_lt(remote_qpn, cur_timewait_info->remote_qpn))
+			link = &(*link)->rb_left;
+		else if (be32_gt(remote_qpn, cur_timewait_info->remote_qpn))
+			link = &(*link)->rb_right;
+		else if (be64_lt(remote_ca_guid, cur_timewait_info->remote_ca_guid))
+			link = &(*link)->rb_left;
+		else if (be64_gt(remote_ca_guid, cur_timewait_info->remote_ca_guid))
+			link = &(*link)->rb_right;
+		else
+			return cur_timewait_info;
+	}
+	timewait_info->inserted_remote_qp = 1;
+	rb_link_node(&timewait_info->remote_qp_node, parent, link);
+	rb_insert_color(&timewait_info->remote_qp_node, &cm.remote_qp_table);
+	return NULL;
+}
+
+static struct cm_id_private * cm_insert_remote_sidr(struct cm_id_private
+						    *cm_id_priv)
+{
+	struct rb_node **link = &cm.remote_sidr_table.rb_node;
+	struct rb_node *parent = NULL;
+	struct cm_id_private *cur_cm_id_priv;
+	union ib_gid *port_gid = &cm_id_priv->av.dgid;
+	__be32 remote_id = cm_id_priv->id.remote_id;
+
+	while (*link) {
+		parent = *link;
+		cur_cm_id_priv = rb_entry(parent, struct cm_id_private,
+					  sidr_id_node);
+		if (be32_lt(remote_id, cur_cm_id_priv->id.remote_id))
+			link = &(*link)->rb_left;
+		else if (be32_gt(remote_id, cur_cm_id_priv->id.remote_id))
+			link = &(*link)->rb_right;
+		else {
+			int cmp;
+			cmp = memcmp(port_gid, &cur_cm_id_priv->av.dgid,
+				     sizeof *port_gid);
+			if (cmp < 0)
+				link = &(*link)->rb_left;
+			else if (cmp > 0)
+				link = &(*link)->rb_right;
+			else
+				return cur_cm_id_priv;
+		}
+	}
+	rb_link_node(&cm_id_priv->sidr_id_node, parent, link);
+	rb_insert_color(&cm_id_priv->sidr_id_node, &cm.remote_sidr_table);
+	return NULL;
+}
+
+static void cm_reject_sidr_req(struct cm_id_private *cm_id_priv,
+			       enum ib_cm_sidr_status status)
+{
+	struct ib_cm_sidr_rep_param param;
+
+	memset(&param, 0, sizeof param);
+	param.status = status;
+	ib_send_cm_sidr_rep(&cm_id_priv->id, &param);
+}
+
+struct ib_cm_id *ib_create_cm_id(struct ib_device *device,
+				 ib_cm_handler cm_handler,
+				 void *context)
+{
+	struct cm_id_private *cm_id_priv;
+	int ret;
+
+	cm_id_priv = kzalloc(sizeof *cm_id_priv, GFP_KERNEL);
+	if (!cm_id_priv)
+		return ERR_PTR(-ENOMEM);
+
+	cm_id_priv->id.state = IB_CM_IDLE;
+	cm_id_priv->id.device = device;
+	cm_id_priv->id.cm_handler = cm_handler;
+	cm_id_priv->id.context = context;
+	cm_id_priv->id.remote_cm_qpn = 1;
+	ret = cm_alloc_id(cm_id_priv);
+	if (ret)
+		goto error;
+
+	spin_lock_init(&cm_id_priv->lock);
+	init_completion(&cm_id_priv->comp);
+	INIT_LIST_HEAD(&cm_id_priv->work_list);
+	atomic_set(&cm_id_priv->work_count, -1);
+	atomic_set(&cm_id_priv->refcount, 1);
+	return &cm_id_priv->id;
+
+error:
+	kfree(cm_id_priv);
+	return ERR_PTR(-ENOMEM);
+}
+EXPORT_SYMBOL(ib_create_cm_id);
+
+static struct cm_work * cm_dequeue_work(struct cm_id_private *cm_id_priv)
+{
+	struct cm_work *work;
+
+	if (list_empty(&cm_id_priv->work_list))
+		return NULL;
+
+	work = list_entry(cm_id_priv->work_list.next, struct cm_work, list);
+	list_del(&work->list);
+	return work;
+}
+
+static void cm_free_work(struct cm_work *work)
+{
+	if (work->mad_recv_wc)
+		ib_free_recv_mad(work->mad_recv_wc);
+	kfree(work);
+}
+
+static inline int cm_convert_to_ms(int iba_time)
+{
+	/* approximate conversion to ms from 4.096us x 2^iba_time */
+	return 1 << max(iba_time - 8, 0);
+}
+
+/*
+ * calculate: 4.096x2^ack_timeout = 4.096x2^ack_delay + 2x4.096x2^life_time
+ * Because of how ack_timeout is stored, adding one doubles the timeout.
+ * To avoid large timeouts, select the max(ack_delay, life_time + 1), and
+ * increment it (round up) only if the other is within 50%.
+ */
+static u8 cm_ack_timeout(u8 ca_ack_delay, u8 packet_life_time)
+{
+	int ack_timeout = packet_life_time + 1;
+
+	if (ack_timeout >= ca_ack_delay)
+		ack_timeout += (ca_ack_delay >= (ack_timeout - 1));
+	else
+		ack_timeout = ca_ack_delay +
+			      (ack_timeout >= (ca_ack_delay - 1));
+
+	return min(31, ack_timeout);
+}
+
+static void cm_cleanup_timewait(struct cm_timewait_info *timewait_info)
+{
+	if (timewait_info->inserted_remote_id) {
+		rb_erase(&timewait_info->remote_id_node, &cm.remote_id_table);
+		timewait_info->inserted_remote_id = 0;
+	}
+
+	if (timewait_info->inserted_remote_qp) {
+		rb_erase(&timewait_info->remote_qp_node, &cm.remote_qp_table);
+		timewait_info->inserted_remote_qp = 0;
+	}
+}
+
+static struct cm_timewait_info * cm_create_timewait_info(__be32 local_id)
+{
+	struct cm_timewait_info *timewait_info;
+
+	timewait_info = kzalloc(sizeof *timewait_info, GFP_KERNEL);
+	if (!timewait_info)
+		return ERR_PTR(-ENOMEM);
+
+	timewait_info->work.local_id = local_id;
+	INIT_DELAYED_WORK(&timewait_info->work.work, cm_work_handler);
+	timewait_info->work.cm_event.event = IB_CM_TIMEWAIT_EXIT;
+	return timewait_info;
+}
+
+static void cm_enter_timewait(struct cm_id_private *cm_id_priv)
+{
+	int wait_time;
+	unsigned long flags;
+
+	spin_lock_irqsave(&cm.lock, flags);
+	cm_cleanup_timewait(cm_id_priv->timewait_info);
+	list_add_tail(&cm_id_priv->timewait_info->list, &cm.timewait_list);
+	spin_unlock_irqrestore(&cm.lock, flags);
+
+	/*
+	 * The cm_id could be destroyed by the user before we exit timewait.
+	 * To protect against this, we search for the cm_id after exiting
+	 * timewait before notifying the user that we've exited timewait.
+	 */
+	cm_id_priv->id.state = IB_CM_TIMEWAIT;
+	wait_time = cm_convert_to_ms(cm_id_priv->av.timeout);
+	queue_delayed_work(cm.wq, &cm_id_priv->timewait_info->work.work,
+			   msecs_to_jiffies(wait_time));
+	cm_id_priv->timewait_info = NULL;
+}
+
+static void cm_reset_to_idle(struct cm_id_private *cm_id_priv)
+{
+	unsigned long flags;
+
+	cm_id_priv->id.state = IB_CM_IDLE;
+	if (cm_id_priv->timewait_info) {
+		spin_lock_irqsave(&cm.lock, flags);
+		cm_cleanup_timewait(cm_id_priv->timewait_info);
+		spin_unlock_irqrestore(&cm.lock, flags);
+		kfree(cm_id_priv->timewait_info);
+		cm_id_priv->timewait_info = NULL;
+	}
+}
+
+static void cm_destroy_id(struct ib_cm_id *cm_id, int err)
+{
+	struct cm_id_private *cm_id_priv;
+	struct cm_work *work;
+
+	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+retest:
+	spin_lock_irq(&cm_id_priv->lock);
+	switch (cm_id->state) {
+	case IB_CM_LISTEN:
+		cm_id->state = IB_CM_IDLE;
+		spin_unlock_irq(&cm_id_priv->lock);
+		spin_lock_irq(&cm.lock);
+		rb_erase(&cm_id_priv->service_node, &cm.listen_service_table);
+		spin_unlock_irq(&cm.lock);
+		break;
+	case IB_CM_SIDR_REQ_SENT:
+		cm_id->state = IB_CM_IDLE;
+		ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+		spin_unlock_irq(&cm_id_priv->lock);
+		break;
+	case IB_CM_SIDR_REQ_RCVD:
+		spin_unlock_irq(&cm_id_priv->lock);
+		cm_reject_sidr_req(cm_id_priv, IB_SIDR_REJECT);
+		break;
+	case IB_CM_REQ_SENT:
+		ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+		spin_unlock_irq(&cm_id_priv->lock);
+		ib_send_cm_rej(cm_id, IB_CM_REJ_TIMEOUT,
+			       &cm_id_priv->id.device->node_guid,
+			       sizeof cm_id_priv->id.device->node_guid,
+			       NULL, 0);
+		break;
+	case IB_CM_REQ_RCVD:
+		if (err == -ENOMEM) {
+			/* Do not reject to allow future retries. */
+			cm_reset_to_idle(cm_id_priv);
+			spin_unlock_irq(&cm_id_priv->lock);
+		} else {
+			spin_unlock_irq(&cm_id_priv->lock);
+			ib_send_cm_rej(cm_id, IB_CM_REJ_CONSUMER_DEFINED,
+				       NULL, 0, NULL, 0);
+		}
+		break;
+	case IB_CM_MRA_REQ_RCVD:
+	case IB_CM_REP_SENT:
+	case IB_CM_MRA_REP_RCVD:
+		ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+		/* Fall through */
+	case IB_CM_MRA_REQ_SENT:
+	case IB_CM_REP_RCVD:
+	case IB_CM_MRA_REP_SENT:
+		spin_unlock_irq(&cm_id_priv->lock);
+		ib_send_cm_rej(cm_id, IB_CM_REJ_CONSUMER_DEFINED,
+			       NULL, 0, NULL, 0);
+		break;
+	case IB_CM_ESTABLISHED:
+		spin_unlock_irq(&cm_id_priv->lock);
+		if (cm_id_priv->qp_type == IB_QPT_XRC_TGT)
+			break;
+		ib_send_cm_dreq(cm_id, NULL, 0);
+		goto retest;
+	case IB_CM_DREQ_SENT:
+		ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+		cm_enter_timewait(cm_id_priv);
+		spin_unlock_irq(&cm_id_priv->lock);
+		break;
+	case IB_CM_DREQ_RCVD:
+		spin_unlock_irq(&cm_id_priv->lock);
+		ib_send_cm_drep(cm_id, NULL, 0);
+		break;
+	default:
+		spin_unlock_irq(&cm_id_priv->lock);
+		break;
+	}
+
+	cm_free_id(cm_id->local_id);
+	cm_deref_id(cm_id_priv);
+	wait_for_completion(&cm_id_priv->comp);
+	while ((work = cm_dequeue_work(cm_id_priv)) != NULL)
+		cm_free_work(work);
+	kfree(cm_id_priv->compare_data);
+	kfree(cm_id_priv->private_data);
+	kfree(cm_id_priv);
+}
+
+void ib_destroy_cm_id(struct ib_cm_id *cm_id)
+{
+	cm_destroy_id(cm_id, 0);
+}
+EXPORT_SYMBOL(ib_destroy_cm_id);
+
+int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask,
+		 struct ib_cm_compare_data *compare_data)
+{
+	struct cm_id_private *cm_id_priv, *cur_cm_id_priv;
+	unsigned long flags;
+	int ret = 0;
+
+	service_mask = service_mask ? service_mask : ~cpu_to_be64(0);
+	service_id &= service_mask;
+	if ((service_id & IB_SERVICE_ID_AGN_MASK) == IB_CM_ASSIGN_SERVICE_ID &&
+	    (service_id != IB_CM_ASSIGN_SERVICE_ID))
+		return -EINVAL;
+
+	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+	if (cm_id->state != IB_CM_IDLE)
+		return -EINVAL;
+
+	if (compare_data) {
+		cm_id_priv->compare_data = kzalloc(sizeof *compare_data,
+						   GFP_KERNEL);
+		if (!cm_id_priv->compare_data)
+			return -ENOMEM;
+		cm_mask_copy(cm_id_priv->compare_data->data,
+			     compare_data->data, compare_data->mask);
+		memcpy(cm_id_priv->compare_data->mask, compare_data->mask,
+		       IB_CM_COMPARE_SIZE);
+	}
+
+	cm_id->state = IB_CM_LISTEN;
+
+	spin_lock_irqsave(&cm.lock, flags);
+	if (service_id == IB_CM_ASSIGN_SERVICE_ID) {
+		cm_id->service_id = cpu_to_be64(cm.listen_service_id++);
+		cm_id->service_mask = ~cpu_to_be64(0);
+	} else {
+		cm_id->service_id = service_id;
+		cm_id->service_mask = service_mask;
+	}
+	cur_cm_id_priv = cm_insert_listen(cm_id_priv);
+	spin_unlock_irqrestore(&cm.lock, flags);
+
+	if (cur_cm_id_priv) {
+		cm_id->state = IB_CM_IDLE;
+		kfree(cm_id_priv->compare_data);
+		cm_id_priv->compare_data = NULL;
+		ret = -EBUSY;
+	}
+	return ret;
+}
+EXPORT_SYMBOL(ib_cm_listen);
+
+static __be64 cm_form_tid(struct cm_id_private *cm_id_priv,
+			  enum cm_msg_sequence msg_seq)
+{
+	u64 hi_tid, low_tid;
+
+	hi_tid   = ((u64) cm_id_priv->av.port->mad_agent->hi_tid) << 32;
+	low_tid  = (u64) ((__force u32)cm_id_priv->id.local_id |
+			  (msg_seq << 30));
+	return cpu_to_be64(hi_tid | low_tid);
+}
+
+static void cm_format_mad_hdr(struct ib_mad_hdr *hdr,
+			      __be16 attr_id, __be64 tid)
+{
+	hdr->base_version  = IB_MGMT_BASE_VERSION;
+	hdr->mgmt_class	   = IB_MGMT_CLASS_CM;
+	hdr->class_version = IB_CM_CLASS_VERSION;
+	hdr->method	   = IB_MGMT_METHOD_SEND;
+	hdr->attr_id	   = attr_id;
+	hdr->tid	   = tid;
+}
+
+static void cm_format_req(struct cm_req_msg *req_msg,
+			  struct cm_id_private *cm_id_priv,
+			  struct ib_cm_req_param *param)
+{
+	struct ib_sa_path_rec *pri_path = param->primary_path;
+	struct ib_sa_path_rec *alt_path = param->alternate_path;
+
+	cm_format_mad_hdr(&req_msg->hdr, CM_REQ_ATTR_ID,
+			  cm_form_tid(cm_id_priv, CM_MSG_SEQUENCE_REQ));
+
+	req_msg->local_comm_id = cm_id_priv->id.local_id;
+	req_msg->service_id = param->service_id;
+	req_msg->local_ca_guid = cm_id_priv->id.device->node_guid;
+	cm_req_set_local_qpn(req_msg, cpu_to_be32(param->qp_num));
+	cm_req_set_init_depth(req_msg, param->initiator_depth);
+	cm_req_set_remote_resp_timeout(req_msg,
+				       param->remote_cm_response_timeout);
+	cm_req_set_qp_type(req_msg, param->qp_type);
+	cm_req_set_flow_ctrl(req_msg, param->flow_control);
+	cm_req_set_starting_psn(req_msg, cpu_to_be32(param->starting_psn));
+	cm_req_set_local_resp_timeout(req_msg,
+				      param->local_cm_response_timeout);
+	req_msg->pkey = param->primary_path->pkey;
+	cm_req_set_path_mtu(req_msg, param->primary_path->mtu);
+	cm_req_set_max_cm_retries(req_msg, param->max_cm_retries);
+
+	if (param->qp_type != IB_QPT_XRC_INI) {
+		cm_req_set_resp_res(req_msg, param->responder_resources);
+		cm_req_set_retry_count(req_msg, param->retry_count);
+		cm_req_set_rnr_retry_count(req_msg, param->rnr_retry_count);
+		cm_req_set_srq(req_msg, param->srq);
+	}
+
+	if (pri_path->hop_limit <= 1) {
+		req_msg->primary_local_lid = pri_path->slid;
+		req_msg->primary_remote_lid = pri_path->dlid;
+	} else {
+		/* Work-around until there's a way to obtain remote LID info */
+		req_msg->primary_local_lid = IB_LID_PERMISSIVE;
+		req_msg->primary_remote_lid = IB_LID_PERMISSIVE;
+	}
+	req_msg->primary_local_gid = pri_path->sgid;
+	req_msg->primary_remote_gid = pri_path->dgid;
+	cm_req_set_primary_flow_label(req_msg, pri_path->flow_label);
+	cm_req_set_primary_packet_rate(req_msg, pri_path->rate);
+	req_msg->primary_traffic_class = pri_path->traffic_class;
+	req_msg->primary_hop_limit = pri_path->hop_limit;
+	cm_req_set_primary_sl(req_msg, pri_path->sl);
+	cm_req_set_primary_subnet_local(req_msg, (pri_path->hop_limit <= 1));
+	cm_req_set_primary_local_ack_timeout(req_msg,
+		cm_ack_timeout(cm_id_priv->av.port->cm_dev->ack_delay,
+			       pri_path->packet_life_time));
+
+	if (alt_path) {
+		if (alt_path->hop_limit <= 1) {
+			req_msg->alt_local_lid = alt_path->slid;
+			req_msg->alt_remote_lid = alt_path->dlid;
+		} else {
+			req_msg->alt_local_lid = IB_LID_PERMISSIVE;
+			req_msg->alt_remote_lid = IB_LID_PERMISSIVE;
+		}
+		req_msg->alt_local_gid = alt_path->sgid;
+		req_msg->alt_remote_gid = alt_path->dgid;
+		cm_req_set_alt_flow_label(req_msg,
+					  alt_path->flow_label);
+		cm_req_set_alt_packet_rate(req_msg, alt_path->rate);
+		req_msg->alt_traffic_class = alt_path->traffic_class;
+		req_msg->alt_hop_limit = alt_path->hop_limit;
+		cm_req_set_alt_sl(req_msg, alt_path->sl);
+		cm_req_set_alt_subnet_local(req_msg, (alt_path->hop_limit <= 1));
+		cm_req_set_alt_local_ack_timeout(req_msg,
+			cm_ack_timeout(cm_id_priv->av.port->cm_dev->ack_delay,
+				       alt_path->packet_life_time));
+	}
+
+	if (param->private_data && param->private_data_len)
+		memcpy(req_msg->private_data, param->private_data,
+		       param->private_data_len);
+}
+
+static int cm_validate_req_param(struct ib_cm_req_param *param)
+{
+	/* peer-to-peer not supported */
+	if (param->peer_to_peer)
+		return -EINVAL;
+
+	if (!param->primary_path)
+		return -EINVAL;
+
+	if (param->qp_type != IB_QPT_RC && param->qp_type != IB_QPT_UC &&
+	    param->qp_type != IB_QPT_XRC_INI)
+		return -EINVAL;
+
+	if (param->private_data &&
+	    param->private_data_len > IB_CM_REQ_PRIVATE_DATA_SIZE)
+		return -EINVAL;
+
+	if (param->alternate_path &&
+	    (param->alternate_path->pkey != param->primary_path->pkey ||
+	     param->alternate_path->mtu != param->primary_path->mtu))
+		return -EINVAL;
+
+	return 0;
+}
+
+int ib_send_cm_req(struct ib_cm_id *cm_id,
+		   struct ib_cm_req_param *param)
+{
+	struct cm_id_private *cm_id_priv;
+	struct cm_req_msg *req_msg;
+	unsigned long flags;
+	int ret;
+
+	ret = cm_validate_req_param(param);
+	if (ret)
+		return ret;
+
+	/* Verify that we're not in timewait. */
+	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	if (cm_id->state != IB_CM_IDLE) {
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+		ret = -EINVAL;
+		goto out;
+	}
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+	cm_id_priv->timewait_info = cm_create_timewait_info(cm_id_priv->
+							    id.local_id);
+	if (IS_ERR(cm_id_priv->timewait_info)) {
+		ret = PTR_ERR(cm_id_priv->timewait_info);
+		goto out;
+	}
+
+	ret = cm_init_av_by_path(param->primary_path, &cm_id_priv->av);
+	if (ret)
+		goto error1;
+	if (param->alternate_path) {
+		ret = cm_init_av_by_path(param->alternate_path,
+					 &cm_id_priv->alt_av);
+		if (ret)
+			goto error1;
+	}
+	cm_id->service_id = param->service_id;
+	cm_id->service_mask = ~cpu_to_be64(0);
+	cm_id_priv->timeout_ms = cm_convert_to_ms(
+				    param->primary_path->packet_life_time) * 2 +
+				 cm_convert_to_ms(
+				    param->remote_cm_response_timeout);
+	cm_id_priv->max_cm_retries = param->max_cm_retries;
+	cm_id_priv->initiator_depth = param->initiator_depth;
+	cm_id_priv->responder_resources = param->responder_resources;
+	cm_id_priv->retry_count = param->retry_count;
+	cm_id_priv->path_mtu = param->primary_path->mtu;
+	cm_id_priv->pkey = param->primary_path->pkey;
+	cm_id_priv->qp_type = param->qp_type;
+
+	ret = cm_alloc_msg(cm_id_priv, &cm_id_priv->msg);
+	if (ret)
+		goto error1;
+
+	req_msg = (struct cm_req_msg *) cm_id_priv->msg->mad;
+	cm_format_req(req_msg, cm_id_priv, param);
+	cm_id_priv->tid = req_msg->hdr.tid;
+	cm_id_priv->msg->timeout_ms = cm_id_priv->timeout_ms;
+	cm_id_priv->msg->context[1] = (void *) (unsigned long) IB_CM_REQ_SENT;
+
+	cm_id_priv->local_qpn = cm_req_get_local_qpn(req_msg);
+	cm_id_priv->rq_psn = cm_req_get_starting_psn(req_msg);
+
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	ret = ib_post_send_mad(cm_id_priv->msg, NULL);
+	if (ret) {
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+		goto error2;
+	}
+	BUG_ON(cm_id->state != IB_CM_IDLE);
+	cm_id->state = IB_CM_REQ_SENT;
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	return 0;
+
+error2:	cm_free_msg(cm_id_priv->msg);
+error1:	kfree(cm_id_priv->timewait_info);
+out:	return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_req);
+
+static int cm_issue_rej(struct cm_port *port,
+			struct ib_mad_recv_wc *mad_recv_wc,
+			enum ib_cm_rej_reason reason,
+			enum cm_msg_response msg_rejected,
+			void *ari, u8 ari_length)
+{
+	struct ib_mad_send_buf *msg = NULL;
+	struct cm_rej_msg *rej_msg, *rcv_msg;
+	int ret;
+
+	ret = cm_alloc_response_msg(port, mad_recv_wc, &msg);
+	if (ret)
+		return ret;
+
+	/* We just need common CM header information.  Cast to any message. */
+	rcv_msg = (struct cm_rej_msg *) mad_recv_wc->recv_buf.mad;
+	rej_msg = (struct cm_rej_msg *) msg->mad;
+
+	cm_format_mad_hdr(&rej_msg->hdr, CM_REJ_ATTR_ID, rcv_msg->hdr.tid);
+	rej_msg->remote_comm_id = rcv_msg->local_comm_id;
+	rej_msg->local_comm_id = rcv_msg->remote_comm_id;
+	cm_rej_set_msg_rejected(rej_msg, msg_rejected);
+	rej_msg->reason = cpu_to_be16(reason);
+
+	if (ari && ari_length) {
+		cm_rej_set_reject_info_len(rej_msg, ari_length);
+		memcpy(rej_msg->ari, ari, ari_length);
+	}
+
+	ret = ib_post_send_mad(msg, NULL);
+	if (ret)
+		cm_free_msg(msg);
+
+	return ret;
+}
+
+static inline int cm_is_active_peer(__be64 local_ca_guid, __be64 remote_ca_guid,
+				    __be32 local_qpn, __be32 remote_qpn)
+{
+	return (be64_to_cpu(local_ca_guid) > be64_to_cpu(remote_ca_guid) ||
+		((local_ca_guid == remote_ca_guid) &&
+		 (be32_to_cpu(local_qpn) > be32_to_cpu(remote_qpn))));
+}
+
+static void cm_format_paths_from_req(struct cm_req_msg *req_msg,
+					    struct ib_sa_path_rec *primary_path,
+					    struct ib_sa_path_rec *alt_path)
+{
+	memset(primary_path, 0, sizeof *primary_path);
+	primary_path->dgid = req_msg->primary_local_gid;
+	primary_path->sgid = req_msg->primary_remote_gid;
+	primary_path->dlid = req_msg->primary_local_lid;
+	primary_path->slid = req_msg->primary_remote_lid;
+	primary_path->flow_label = cm_req_get_primary_flow_label(req_msg);
+	primary_path->hop_limit = req_msg->primary_hop_limit;
+	primary_path->traffic_class = req_msg->primary_traffic_class;
+	primary_path->reversible = 1;
+	primary_path->pkey = req_msg->pkey;
+	primary_path->sl = cm_req_get_primary_sl(req_msg);
+	primary_path->mtu_selector = IB_SA_EQ;
+	primary_path->mtu = cm_req_get_path_mtu(req_msg);
+	primary_path->rate_selector = IB_SA_EQ;
+	primary_path->rate = cm_req_get_primary_packet_rate(req_msg);
+	primary_path->packet_life_time_selector = IB_SA_EQ;
+	primary_path->packet_life_time =
+		cm_req_get_primary_local_ack_timeout(req_msg);
+	primary_path->packet_life_time -= (primary_path->packet_life_time > 0);
+
+	if (req_msg->alt_local_lid) {
+		memset(alt_path, 0, sizeof *alt_path);
+		alt_path->dgid = req_msg->alt_local_gid;
+		alt_path->sgid = req_msg->alt_remote_gid;
+		alt_path->dlid = req_msg->alt_local_lid;
+		alt_path->slid = req_msg->alt_remote_lid;
+		alt_path->flow_label = cm_req_get_alt_flow_label(req_msg);
+		alt_path->hop_limit = req_msg->alt_hop_limit;
+		alt_path->traffic_class = req_msg->alt_traffic_class;
+		alt_path->reversible = 1;
+		alt_path->pkey = req_msg->pkey;
+		alt_path->sl = cm_req_get_alt_sl(req_msg);
+		alt_path->mtu_selector = IB_SA_EQ;
+		alt_path->mtu = cm_req_get_path_mtu(req_msg);
+		alt_path->rate_selector = IB_SA_EQ;
+		alt_path->rate = cm_req_get_alt_packet_rate(req_msg);
+		alt_path->packet_life_time_selector = IB_SA_EQ;
+		alt_path->packet_life_time =
+			cm_req_get_alt_local_ack_timeout(req_msg);
+		alt_path->packet_life_time -= (alt_path->packet_life_time > 0);
+	}
+}
+
+static void cm_format_req_event(struct cm_work *work,
+				struct cm_id_private *cm_id_priv,
+				struct ib_cm_id *listen_id)
+{
+	struct cm_req_msg *req_msg;
+	struct ib_cm_req_event_param *param;
+
+	req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad;
+	param = &work->cm_event.param.req_rcvd;
+	param->listen_id = listen_id;
+	param->port = cm_id_priv->av.port->port_num;
+	param->primary_path = &work->path[0];
+	if (req_msg->alt_local_lid)
+		param->alternate_path = &work->path[1];
+	else
+		param->alternate_path = NULL;
+	param->remote_ca_guid = req_msg->local_ca_guid;
+	param->remote_qkey = be32_to_cpu(req_msg->local_qkey);
+	param->remote_qpn = be32_to_cpu(cm_req_get_local_qpn(req_msg));
+	param->qp_type = cm_req_get_qp_type(req_msg);
+	param->starting_psn = be32_to_cpu(cm_req_get_starting_psn(req_msg));
+	param->responder_resources = cm_req_get_init_depth(req_msg);
+	param->initiator_depth = cm_req_get_resp_res(req_msg);
+	param->local_cm_response_timeout =
+					cm_req_get_remote_resp_timeout(req_msg);
+	param->flow_control = cm_req_get_flow_ctrl(req_msg);
+	param->remote_cm_response_timeout =
+					cm_req_get_local_resp_timeout(req_msg);
+	param->retry_count = cm_req_get_retry_count(req_msg);
+	param->rnr_retry_count = cm_req_get_rnr_retry_count(req_msg);
+	param->srq = cm_req_get_srq(req_msg);
+	work->cm_event.private_data = &req_msg->private_data;
+}
+
+static void cm_process_work(struct cm_id_private *cm_id_priv,
+			    struct cm_work *work)
+{
+	int ret;
+
+	/* We will typically only have the current event to report. */
+	ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, &work->cm_event);
+	cm_free_work(work);
+
+	while (!ret && !atomic_add_negative(-1, &cm_id_priv->work_count)) {
+		spin_lock_irq(&cm_id_priv->lock);
+		work = cm_dequeue_work(cm_id_priv);
+		spin_unlock_irq(&cm_id_priv->lock);
+		BUG_ON(!work);
+		ret = cm_id_priv->id.cm_handler(&cm_id_priv->id,
+						&work->cm_event);
+		cm_free_work(work);
+	}
+	cm_deref_id(cm_id_priv);
+	if (ret)
+		cm_destroy_id(&cm_id_priv->id, ret);
+}
+
+static void cm_format_mra(struct cm_mra_msg *mra_msg,
+			  struct cm_id_private *cm_id_priv,
+			  enum cm_msg_response msg_mraed, u8 service_timeout,
+			  const void *private_data, u8 private_data_len)
+{
+	cm_format_mad_hdr(&mra_msg->hdr, CM_MRA_ATTR_ID, cm_id_priv->tid);
+	cm_mra_set_msg_mraed(mra_msg, msg_mraed);
+	mra_msg->local_comm_id = cm_id_priv->id.local_id;
+	mra_msg->remote_comm_id = cm_id_priv->id.remote_id;
+	cm_mra_set_service_timeout(mra_msg, service_timeout);
+
+	if (private_data && private_data_len)
+		memcpy(mra_msg->private_data, private_data, private_data_len);
+}
+
+static void cm_format_rej(struct cm_rej_msg *rej_msg,
+			  struct cm_id_private *cm_id_priv,
+			  enum ib_cm_rej_reason reason,
+			  void *ari,
+			  u8 ari_length,
+			  const void *private_data,
+			  u8 private_data_len)
+{
+	cm_format_mad_hdr(&rej_msg->hdr, CM_REJ_ATTR_ID, cm_id_priv->tid);
+	rej_msg->remote_comm_id = cm_id_priv->id.remote_id;
+
+	switch(cm_id_priv->id.state) {
+	case IB_CM_REQ_RCVD:
+		rej_msg->local_comm_id = 0;
+		cm_rej_set_msg_rejected(rej_msg, CM_MSG_RESPONSE_REQ);
+		break;
+	case IB_CM_MRA_REQ_SENT:
+		rej_msg->local_comm_id = cm_id_priv->id.local_id;
+		cm_rej_set_msg_rejected(rej_msg, CM_MSG_RESPONSE_REQ);
+		break;
+	case IB_CM_REP_RCVD:
+	case IB_CM_MRA_REP_SENT:
+		rej_msg->local_comm_id = cm_id_priv->id.local_id;
+		cm_rej_set_msg_rejected(rej_msg, CM_MSG_RESPONSE_REP);
+		break;
+	default:
+		rej_msg->local_comm_id = cm_id_priv->id.local_id;
+		cm_rej_set_msg_rejected(rej_msg, CM_MSG_RESPONSE_OTHER);
+		break;
+	}
+
+	rej_msg->reason = cpu_to_be16(reason);
+	if (ari && ari_length) {
+		cm_rej_set_reject_info_len(rej_msg, ari_length);
+		memcpy(rej_msg->ari, ari, ari_length);
+	}
+
+	if (private_data && private_data_len)
+		memcpy(rej_msg->private_data, private_data, private_data_len);
+}
+
+static void cm_dup_req_handler(struct cm_work *work,
+			       struct cm_id_private *cm_id_priv)
+{
+	struct ib_mad_send_buf *msg = NULL;
+	int ret;
+
+	atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+			counter[CM_REQ_COUNTER]);
+
+	/* Quick state check to discard duplicate REQs. */
+	if (cm_id_priv->id.state == IB_CM_REQ_RCVD)
+		return;
+
+	ret = cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg);
+	if (ret)
+		return;
+
+	spin_lock_irq(&cm_id_priv->lock);
+	switch (cm_id_priv->id.state) {
+	case IB_CM_MRA_REQ_SENT:
+		cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv,
+			      CM_MSG_RESPONSE_REQ, cm_id_priv->service_timeout,
+			      cm_id_priv->private_data,
+			      cm_id_priv->private_data_len);
+		break;
+	case IB_CM_TIMEWAIT:
+		cm_format_rej((struct cm_rej_msg *) msg->mad, cm_id_priv,
+			      IB_CM_REJ_STALE_CONN, NULL, 0, NULL, 0);
+		break;
+	default:
+		goto unlock;
+	}
+	spin_unlock_irq(&cm_id_priv->lock);
+
+	ret = ib_post_send_mad(msg, NULL);
+	if (ret)
+		goto free;
+	return;
+
+unlock:	spin_unlock_irq(&cm_id_priv->lock);
+free:	cm_free_msg(msg);
+}
+
+static struct cm_id_private * cm_match_req(struct cm_work *work,
+					   struct cm_id_private *cm_id_priv)
+{
+	struct cm_id_private *listen_cm_id_priv, *cur_cm_id_priv;
+	struct cm_timewait_info *timewait_info;
+	struct cm_req_msg *req_msg;
+
+	req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad;
+
+	/* Check for possible duplicate REQ. */
+	spin_lock_irq(&cm.lock);
+	timewait_info = cm_insert_remote_id(cm_id_priv->timewait_info);
+	if (timewait_info) {
+		cur_cm_id_priv = cm_get_id(timewait_info->work.local_id,
+					   timewait_info->work.remote_id);
+		spin_unlock_irq(&cm.lock);
+		if (cur_cm_id_priv) {
+			cm_dup_req_handler(work, cur_cm_id_priv);
+			cm_deref_id(cur_cm_id_priv);
+		}
+		return NULL;
+	}
+
+	/* Check for stale connections. */
+	timewait_info = cm_insert_remote_qpn(cm_id_priv->timewait_info);
+	if (timewait_info) {
+		cm_cleanup_timewait(cm_id_priv->timewait_info);
+		spin_unlock_irq(&cm.lock);
+		cm_issue_rej(work->port, work->mad_recv_wc,
+			     IB_CM_REJ_STALE_CONN, CM_MSG_RESPONSE_REQ,
+			     NULL, 0);
+		return NULL;
+	}
+
+	/* Find matching listen request. */
+	listen_cm_id_priv = cm_find_listen(cm_id_priv->id.device,
+					   req_msg->service_id,
+					   req_msg->private_data);
+	if (!listen_cm_id_priv) {
+		cm_cleanup_timewait(cm_id_priv->timewait_info);
+		spin_unlock_irq(&cm.lock);
+		cm_issue_rej(work->port, work->mad_recv_wc,
+			     IB_CM_REJ_INVALID_SERVICE_ID, CM_MSG_RESPONSE_REQ,
+			     NULL, 0);
+		goto out;
+	}
+	atomic_inc(&listen_cm_id_priv->refcount);
+	atomic_inc(&cm_id_priv->refcount);
+	cm_id_priv->id.state = IB_CM_REQ_RCVD;
+	atomic_inc(&cm_id_priv->work_count);
+	spin_unlock_irq(&cm.lock);
+out:
+	return listen_cm_id_priv;
+}
+
+/*
+ * Work-around for inter-subnet connections.  If the LIDs are permissive,
+ * we need to override the LID/SL data in the REQ with the LID information
+ * in the work completion.
+ */
+static void cm_process_routed_req(struct cm_req_msg *req_msg, struct ib_wc *wc)
+{
+	if (!cm_req_get_primary_subnet_local(req_msg)) {
+		if (req_msg->primary_local_lid == IB_LID_PERMISSIVE) {
+			req_msg->primary_local_lid = cpu_to_be16(wc->slid);
+			cm_req_set_primary_sl(req_msg, wc->sl);
+		}
+
+		if (req_msg->primary_remote_lid == IB_LID_PERMISSIVE)
+			req_msg->primary_remote_lid = cpu_to_be16(wc->dlid_path_bits);
+	}
+
+	if (!cm_req_get_alt_subnet_local(req_msg)) {
+		if (req_msg->alt_local_lid == IB_LID_PERMISSIVE) {
+			req_msg->alt_local_lid = cpu_to_be16(wc->slid);
+			cm_req_set_alt_sl(req_msg, wc->sl);
+		}
+
+		if (req_msg->alt_remote_lid == IB_LID_PERMISSIVE)
+			req_msg->alt_remote_lid = cpu_to_be16(wc->dlid_path_bits);
+	}
+}
+
+static int cm_req_handler(struct cm_work *work)
+{
+	struct ib_cm_id *cm_id;
+	struct cm_id_private *cm_id_priv, *listen_cm_id_priv;
+	struct cm_req_msg *req_msg;
+	int ret;
+
+	req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad;
+
+	cm_id = ib_create_cm_id(work->port->cm_dev->ib_device, NULL, NULL);
+	if (IS_ERR(cm_id))
+		return PTR_ERR(cm_id);
+
+	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+	cm_id_priv->id.remote_id = req_msg->local_comm_id;
+	cm_init_av_for_response(work->port, work->mad_recv_wc->wc,
+				work->mad_recv_wc->recv_buf.grh,
+				&cm_id_priv->av);
+	cm_id_priv->timewait_info = cm_create_timewait_info(cm_id_priv->
+							    id.local_id);
+	if (IS_ERR(cm_id_priv->timewait_info)) {
+		ret = PTR_ERR(cm_id_priv->timewait_info);
+		goto destroy;
+	}
+	cm_id_priv->timewait_info->work.remote_id = req_msg->local_comm_id;
+	cm_id_priv->timewait_info->remote_ca_guid = req_msg->local_ca_guid;
+	cm_id_priv->timewait_info->remote_qpn = cm_req_get_local_qpn(req_msg);
+
+	listen_cm_id_priv = cm_match_req(work, cm_id_priv);
+	if (!listen_cm_id_priv) {
+		ret = -EINVAL;
+		kfree(cm_id_priv->timewait_info);
+		goto destroy;
+	}
+
+	cm_id_priv->id.cm_handler = listen_cm_id_priv->id.cm_handler;
+	cm_id_priv->id.context = listen_cm_id_priv->id.context;
+	cm_id_priv->id.service_id = req_msg->service_id;
+	cm_id_priv->id.service_mask = ~cpu_to_be64(0);
+
+	cm_process_routed_req(req_msg, work->mad_recv_wc->wc);
+	cm_format_paths_from_req(req_msg, &work->path[0], &work->path[1]);
+
+	memcpy(work->path[0].dmac, cm_id_priv->av.ah_attr.dmac, ETH_ALEN);
+	work->path[0].vlan_id = cm_id_priv->av.ah_attr.vlan_id;
+	ret = cm_init_av_by_path(&work->path[0], &cm_id_priv->av);
+	if (ret) {
+		ib_get_cached_gid(work->port->cm_dev->ib_device,
+				  work->port->port_num, 0, &work->path[0].sgid);
+		ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_GID,
+			       &work->path[0].sgid, sizeof work->path[0].sgid,
+			       NULL, 0);
+		goto rejected;
+	}
+	if (req_msg->alt_local_lid) {
+		ret = cm_init_av_by_path(&work->path[1], &cm_id_priv->alt_av);
+		if (ret) {
+			ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_ALT_GID,
+				       &work->path[0].sgid,
+				       sizeof work->path[0].sgid, NULL, 0);
+			goto rejected;
+		}
+	}
+	cm_id_priv->tid = req_msg->hdr.tid;
+	cm_id_priv->timeout_ms = cm_convert_to_ms(
+					cm_req_get_local_resp_timeout(req_msg));
+	cm_id_priv->max_cm_retries = cm_req_get_max_cm_retries(req_msg);
+	cm_id_priv->remote_qpn = cm_req_get_local_qpn(req_msg);
+	cm_id_priv->initiator_depth = cm_req_get_resp_res(req_msg);
+	cm_id_priv->responder_resources = cm_req_get_init_depth(req_msg);
+	cm_id_priv->path_mtu = cm_req_get_path_mtu(req_msg);
+	cm_id_priv->pkey = req_msg->pkey;
+	cm_id_priv->sq_psn = cm_req_get_starting_psn(req_msg);
+	cm_id_priv->retry_count = cm_req_get_retry_count(req_msg);
+	cm_id_priv->rnr_retry_count = cm_req_get_rnr_retry_count(req_msg);
+	cm_id_priv->qp_type = cm_req_get_qp_type(req_msg);
+
+	cm_format_req_event(work, cm_id_priv, &listen_cm_id_priv->id);
+	cm_process_work(cm_id_priv, work);
+	cm_deref_id(listen_cm_id_priv);
+	return 0;
+
+rejected:
+	atomic_dec(&cm_id_priv->refcount);
+	cm_deref_id(listen_cm_id_priv);
+destroy:
+	ib_destroy_cm_id(cm_id);
+	return ret;
+}
+
+static void cm_format_rep(struct cm_rep_msg *rep_msg,
+			  struct cm_id_private *cm_id_priv,
+			  struct ib_cm_rep_param *param)
+{
+	cm_format_mad_hdr(&rep_msg->hdr, CM_REP_ATTR_ID, cm_id_priv->tid);
+	rep_msg->local_comm_id = cm_id_priv->id.local_id;
+	rep_msg->remote_comm_id = cm_id_priv->id.remote_id;
+	cm_rep_set_starting_psn(rep_msg, cpu_to_be32(param->starting_psn));
+	rep_msg->resp_resources = param->responder_resources;
+	cm_rep_set_target_ack_delay(rep_msg,
+				    cm_id_priv->av.port->cm_dev->ack_delay);
+	cm_rep_set_failover(rep_msg, param->failover_accepted);
+	cm_rep_set_rnr_retry_count(rep_msg, param->rnr_retry_count);
+	rep_msg->local_ca_guid = cm_id_priv->id.device->node_guid;
+
+	if (cm_id_priv->qp_type != IB_QPT_XRC_TGT) {
+		rep_msg->initiator_depth = param->initiator_depth;
+		cm_rep_set_flow_ctrl(rep_msg, param->flow_control);
+		cm_rep_set_srq(rep_msg, param->srq);
+		cm_rep_set_local_qpn(rep_msg, cpu_to_be32(param->qp_num));
+	} else {
+		cm_rep_set_srq(rep_msg, 1);
+		cm_rep_set_local_eecn(rep_msg, cpu_to_be32(param->qp_num));
+	}
+
+	if (param->private_data && param->private_data_len)
+		memcpy(rep_msg->private_data, param->private_data,
+		       param->private_data_len);
+}
+
+int ib_send_cm_rep(struct ib_cm_id *cm_id,
+		   struct ib_cm_rep_param *param)
+{
+	struct cm_id_private *cm_id_priv;
+	struct ib_mad_send_buf *msg;
+	struct cm_rep_msg *rep_msg;
+	unsigned long flags;
+	int ret;
+
+	if (param->private_data &&
+	    param->private_data_len > IB_CM_REP_PRIVATE_DATA_SIZE)
+		return -EINVAL;
+
+	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	if (cm_id->state != IB_CM_REQ_RCVD &&
+	    cm_id->state != IB_CM_MRA_REQ_SENT) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = cm_alloc_msg(cm_id_priv, &msg);
+	if (ret)
+		goto out;
+
+	rep_msg = (struct cm_rep_msg *) msg->mad;
+	cm_format_rep(rep_msg, cm_id_priv, param);
+	msg->timeout_ms = cm_id_priv->timeout_ms;
+	msg->context[1] = (void *) (unsigned long) IB_CM_REP_SENT;
+
+	ret = ib_post_send_mad(msg, NULL);
+	if (ret) {
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+		cm_free_msg(msg);
+		return ret;
+	}
+
+	cm_id->state = IB_CM_REP_SENT;
+	cm_id_priv->msg = msg;
+	cm_id_priv->initiator_depth = param->initiator_depth;
+	cm_id_priv->responder_resources = param->responder_resources;
+	cm_id_priv->rq_psn = cm_rep_get_starting_psn(rep_msg);
+	cm_id_priv->local_qpn = cpu_to_be32(param->qp_num & 0xFFFFFF);
+
+out:	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_rep);
+
+static void cm_format_rtu(struct cm_rtu_msg *rtu_msg,
+			  struct cm_id_private *cm_id_priv,
+			  const void *private_data,
+			  u8 private_data_len)
+{
+	cm_format_mad_hdr(&rtu_msg->hdr, CM_RTU_ATTR_ID, cm_id_priv->tid);
+	rtu_msg->local_comm_id = cm_id_priv->id.local_id;
+	rtu_msg->remote_comm_id = cm_id_priv->id.remote_id;
+
+	if (private_data && private_data_len)
+		memcpy(rtu_msg->private_data, private_data, private_data_len);
+}
+
+int ib_send_cm_rtu(struct ib_cm_id *cm_id,
+		   const void *private_data,
+		   u8 private_data_len)
+{
+	struct cm_id_private *cm_id_priv;
+	struct ib_mad_send_buf *msg;
+	unsigned long flags;
+	void *data;
+	int ret;
+
+	if (private_data && private_data_len > IB_CM_RTU_PRIVATE_DATA_SIZE)
+		return -EINVAL;
+
+	data = cm_copy_private_data(private_data, private_data_len);
+	if (IS_ERR(data))
+		return PTR_ERR(data);
+
+	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	if (cm_id->state != IB_CM_REP_RCVD &&
+	    cm_id->state != IB_CM_MRA_REP_SENT) {
+		ret = -EINVAL;
+		goto error;
+	}
+
+	ret = cm_alloc_msg(cm_id_priv, &msg);
+	if (ret)
+		goto error;
+
+	cm_format_rtu((struct cm_rtu_msg *) msg->mad, cm_id_priv,
+		      private_data, private_data_len);
+
+	ret = ib_post_send_mad(msg, NULL);
+	if (ret) {
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+		cm_free_msg(msg);
+		kfree(data);
+		return ret;
+	}
+
+	cm_id->state = IB_CM_ESTABLISHED;
+	cm_set_private_data(cm_id_priv, data, private_data_len);
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	return 0;
+
+error:	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	kfree(data);
+	return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_rtu);
+
+static void cm_format_rep_event(struct cm_work *work, enum ib_qp_type qp_type)
+{
+	struct cm_rep_msg *rep_msg;
+	struct ib_cm_rep_event_param *param;
+
+	rep_msg = (struct cm_rep_msg *)work->mad_recv_wc->recv_buf.mad;
+	param = &work->cm_event.param.rep_rcvd;
+	param->remote_ca_guid = rep_msg->local_ca_guid;
+	param->remote_qkey = be32_to_cpu(rep_msg->local_qkey);
+	param->remote_qpn = be32_to_cpu(cm_rep_get_qpn(rep_msg, qp_type));
+	param->starting_psn = be32_to_cpu(cm_rep_get_starting_psn(rep_msg));
+	param->responder_resources = rep_msg->initiator_depth;
+	param->initiator_depth = rep_msg->resp_resources;
+	param->target_ack_delay = cm_rep_get_target_ack_delay(rep_msg);
+	param->failover_accepted = cm_rep_get_failover(rep_msg);
+	param->flow_control = cm_rep_get_flow_ctrl(rep_msg);
+	param->rnr_retry_count = cm_rep_get_rnr_retry_count(rep_msg);
+	param->srq = cm_rep_get_srq(rep_msg);
+	work->cm_event.private_data = &rep_msg->private_data;
+}
+
+static void cm_dup_rep_handler(struct cm_work *work)
+{
+	struct cm_id_private *cm_id_priv;
+	struct cm_rep_msg *rep_msg;
+	struct ib_mad_send_buf *msg = NULL;
+	int ret;
+
+	rep_msg = (struct cm_rep_msg *) work->mad_recv_wc->recv_buf.mad;
+	cm_id_priv = cm_acquire_id(rep_msg->remote_comm_id,
+				   rep_msg->local_comm_id);
+	if (!cm_id_priv)
+		return;
+
+	atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+			counter[CM_REP_COUNTER]);
+	ret = cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg);
+	if (ret)
+		goto deref;
+
+	spin_lock_irq(&cm_id_priv->lock);
+	if (cm_id_priv->id.state == IB_CM_ESTABLISHED)
+		cm_format_rtu((struct cm_rtu_msg *) msg->mad, cm_id_priv,
+			      cm_id_priv->private_data,
+			      cm_id_priv->private_data_len);
+	else if (cm_id_priv->id.state == IB_CM_MRA_REP_SENT)
+		cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv,
+			      CM_MSG_RESPONSE_REP, cm_id_priv->service_timeout,
+			      cm_id_priv->private_data,
+			      cm_id_priv->private_data_len);
+	else
+		goto unlock;
+	spin_unlock_irq(&cm_id_priv->lock);
+
+	ret = ib_post_send_mad(msg, NULL);
+	if (ret)
+		goto free;
+	goto deref;
+
+unlock:	spin_unlock_irq(&cm_id_priv->lock);
+free:	cm_free_msg(msg);
+deref:	cm_deref_id(cm_id_priv);
+}
+
+static int cm_rep_handler(struct cm_work *work)
+{
+	struct cm_id_private *cm_id_priv;
+	struct cm_rep_msg *rep_msg;
+	int ret;
+
+	rep_msg = (struct cm_rep_msg *)work->mad_recv_wc->recv_buf.mad;
+	cm_id_priv = cm_acquire_id(rep_msg->remote_comm_id, 0);
+	if (!cm_id_priv) {
+		cm_dup_rep_handler(work);
+		return -EINVAL;
+	}
+
+	cm_format_rep_event(work, cm_id_priv->qp_type);
+
+	spin_lock_irq(&cm_id_priv->lock);
+	switch (cm_id_priv->id.state) {
+	case IB_CM_REQ_SENT:
+	case IB_CM_MRA_REQ_RCVD:
+		break;
+	default:
+		spin_unlock_irq(&cm_id_priv->lock);
+		ret = -EINVAL;
+		goto error;
+	}
+
+	cm_id_priv->timewait_info->work.remote_id = rep_msg->local_comm_id;
+	cm_id_priv->timewait_info->remote_ca_guid = rep_msg->local_ca_guid;
+	cm_id_priv->timewait_info->remote_qpn = cm_rep_get_qpn(rep_msg, cm_id_priv->qp_type);
+
+	spin_lock(&cm.lock);
+	/* Check for duplicate REP. */
+	if (cm_insert_remote_id(cm_id_priv->timewait_info)) {
+		spin_unlock(&cm.lock);
+		spin_unlock_irq(&cm_id_priv->lock);
+		ret = -EINVAL;
+		goto error;
+	}
+	/* Check for a stale connection. */
+	if (cm_insert_remote_qpn(cm_id_priv->timewait_info)) {
+		rb_erase(&cm_id_priv->timewait_info->remote_id_node,
+			 &cm.remote_id_table);
+		cm_id_priv->timewait_info->inserted_remote_id = 0;
+		spin_unlock(&cm.lock);
+		spin_unlock_irq(&cm_id_priv->lock);
+		cm_issue_rej(work->port, work->mad_recv_wc,
+			     IB_CM_REJ_STALE_CONN, CM_MSG_RESPONSE_REP,
+			     NULL, 0);
+		ret = -EINVAL;
+		goto error;
+	}
+	spin_unlock(&cm.lock);
+
+	cm_id_priv->id.state = IB_CM_REP_RCVD;
+	cm_id_priv->id.remote_id = rep_msg->local_comm_id;
+	cm_id_priv->remote_qpn = cm_rep_get_qpn(rep_msg, cm_id_priv->qp_type);
+	cm_id_priv->initiator_depth = rep_msg->resp_resources;
+	cm_id_priv->responder_resources = rep_msg->initiator_depth;
+	cm_id_priv->sq_psn = cm_rep_get_starting_psn(rep_msg);
+	cm_id_priv->rnr_retry_count = cm_rep_get_rnr_retry_count(rep_msg);
+	cm_id_priv->target_ack_delay = cm_rep_get_target_ack_delay(rep_msg);
+	cm_id_priv->av.timeout =
+			cm_ack_timeout(cm_id_priv->target_ack_delay,
+				       cm_id_priv->av.timeout - 1);
+	cm_id_priv->alt_av.timeout =
+			cm_ack_timeout(cm_id_priv->target_ack_delay,
+				       cm_id_priv->alt_av.timeout - 1);
+
+	/* todo: handle peer_to_peer */
+
+	ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+	ret = atomic_inc_and_test(&cm_id_priv->work_count);
+	if (!ret)
+		list_add_tail(&work->list, &cm_id_priv->work_list);
+	spin_unlock_irq(&cm_id_priv->lock);
+
+	if (ret)
+		cm_process_work(cm_id_priv, work);
+	else
+		cm_deref_id(cm_id_priv);
+	return 0;
+
+error:
+	cm_deref_id(cm_id_priv);
+	return ret;
+}
+
+static int cm_establish_handler(struct cm_work *work)
+{
+	struct cm_id_private *cm_id_priv;
+	int ret;
+
+	/* See comment in cm_establish about lookup. */
+	cm_id_priv = cm_acquire_id(work->local_id, work->remote_id);
+	if (!cm_id_priv)
+		return -EINVAL;
+
+	spin_lock_irq(&cm_id_priv->lock);
+	if (cm_id_priv->id.state != IB_CM_ESTABLISHED) {
+		spin_unlock_irq(&cm_id_priv->lock);
+		goto out;
+	}
+
+	ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+	ret = atomic_inc_and_test(&cm_id_priv->work_count);
+	if (!ret)
+		list_add_tail(&work->list, &cm_id_priv->work_list);
+	spin_unlock_irq(&cm_id_priv->lock);
+
+	if (ret)
+		cm_process_work(cm_id_priv, work);
+	else
+		cm_deref_id(cm_id_priv);
+	return 0;
+out:
+	cm_deref_id(cm_id_priv);
+	return -EINVAL;
+}
+
+static int cm_rtu_handler(struct cm_work *work)
+{
+	struct cm_id_private *cm_id_priv;
+	struct cm_rtu_msg *rtu_msg;
+	int ret;
+
+	rtu_msg = (struct cm_rtu_msg *)work->mad_recv_wc->recv_buf.mad;
+	cm_id_priv = cm_acquire_id(rtu_msg->remote_comm_id,
+				   rtu_msg->local_comm_id);
+	if (!cm_id_priv)
+		return -EINVAL;
+
+	work->cm_event.private_data = &rtu_msg->private_data;
+
+	spin_lock_irq(&cm_id_priv->lock);
+	if (cm_id_priv->id.state != IB_CM_REP_SENT &&
+	    cm_id_priv->id.state != IB_CM_MRA_REP_RCVD) {
+		spin_unlock_irq(&cm_id_priv->lock);
+		atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+				counter[CM_RTU_COUNTER]);
+		goto out;
+	}
+	cm_id_priv->id.state = IB_CM_ESTABLISHED;
+
+	ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+	ret = atomic_inc_and_test(&cm_id_priv->work_count);
+	if (!ret)
+		list_add_tail(&work->list, &cm_id_priv->work_list);
+	spin_unlock_irq(&cm_id_priv->lock);
+
+	if (ret)
+		cm_process_work(cm_id_priv, work);
+	else
+		cm_deref_id(cm_id_priv);
+	return 0;
+out:
+	cm_deref_id(cm_id_priv);
+	return -EINVAL;
+}
+
+static void cm_format_dreq(struct cm_dreq_msg *dreq_msg,
+			  struct cm_id_private *cm_id_priv,
+			  const void *private_data,
+			  u8 private_data_len)
+{
+	cm_format_mad_hdr(&dreq_msg->hdr, CM_DREQ_ATTR_ID,
+			  cm_form_tid(cm_id_priv, CM_MSG_SEQUENCE_DREQ));
+	dreq_msg->local_comm_id = cm_id_priv->id.local_id;
+	dreq_msg->remote_comm_id = cm_id_priv->id.remote_id;
+	cm_dreq_set_remote_qpn(dreq_msg, cm_id_priv->remote_qpn);
+
+	if (private_data && private_data_len)
+		memcpy(dreq_msg->private_data, private_data, private_data_len);
+}
+
+int ib_send_cm_dreq(struct ib_cm_id *cm_id,
+		    const void *private_data,
+		    u8 private_data_len)
+{
+	struct cm_id_private *cm_id_priv;
+	struct ib_mad_send_buf *msg;
+	unsigned long flags;
+	int ret;
+
+	if (private_data && private_data_len > IB_CM_DREQ_PRIVATE_DATA_SIZE)
+		return -EINVAL;
+
+	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	if (cm_id->state != IB_CM_ESTABLISHED) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (cm_id->lap_state == IB_CM_LAP_SENT ||
+	    cm_id->lap_state == IB_CM_MRA_LAP_RCVD)
+		ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+
+	ret = cm_alloc_msg(cm_id_priv, &msg);
+	if (ret) {
+		cm_enter_timewait(cm_id_priv);
+		goto out;
+	}
+
+	cm_format_dreq((struct cm_dreq_msg *) msg->mad, cm_id_priv,
+		       private_data, private_data_len);
+	msg->timeout_ms = cm_id_priv->timeout_ms;
+	msg->context[1] = (void *) (unsigned long) IB_CM_DREQ_SENT;
+
+	ret = ib_post_send_mad(msg, NULL);
+	if (ret) {
+		cm_enter_timewait(cm_id_priv);
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+		cm_free_msg(msg);
+		return ret;
+	}
+
+	cm_id->state = IB_CM_DREQ_SENT;
+	cm_id_priv->msg = msg;
+out:	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_dreq);
+
+static void cm_format_drep(struct cm_drep_msg *drep_msg,
+			  struct cm_id_private *cm_id_priv,
+			  const void *private_data,
+			  u8 private_data_len)
+{
+	cm_format_mad_hdr(&drep_msg->hdr, CM_DREP_ATTR_ID, cm_id_priv->tid);
+	drep_msg->local_comm_id = cm_id_priv->id.local_id;
+	drep_msg->remote_comm_id = cm_id_priv->id.remote_id;
+
+	if (private_data && private_data_len)
+		memcpy(drep_msg->private_data, private_data, private_data_len);
+}
+
+int ib_send_cm_drep(struct ib_cm_id *cm_id,
+		    const void *private_data,
+		    u8 private_data_len)
+{
+	struct cm_id_private *cm_id_priv;
+	struct ib_mad_send_buf *msg;
+	unsigned long flags;
+	void *data;
+	int ret;
+
+	if (private_data && private_data_len > IB_CM_DREP_PRIVATE_DATA_SIZE)
+		return -EINVAL;
+
+	data = cm_copy_private_data(private_data, private_data_len);
+	if (IS_ERR(data))
+		return PTR_ERR(data);
+
+	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	if (cm_id->state != IB_CM_DREQ_RCVD) {
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+		kfree(data);
+		return -EINVAL;
+	}
+
+	cm_set_private_data(cm_id_priv, data, private_data_len);
+	cm_enter_timewait(cm_id_priv);
+
+	ret = cm_alloc_msg(cm_id_priv, &msg);
+	if (ret)
+		goto out;
+
+	cm_format_drep((struct cm_drep_msg *) msg->mad, cm_id_priv,
+		       private_data, private_data_len);
+
+	ret = ib_post_send_mad(msg, NULL);
+	if (ret) {
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+		cm_free_msg(msg);
+		return ret;
+	}
+
+out:	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_drep);
+
+static int cm_issue_drep(struct cm_port *port,
+			 struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct ib_mad_send_buf *msg = NULL;
+	struct cm_dreq_msg *dreq_msg;
+	struct cm_drep_msg *drep_msg;
+	int ret;
+
+	ret = cm_alloc_response_msg(port, mad_recv_wc, &msg);
+	if (ret)
+		return ret;
+
+	dreq_msg = (struct cm_dreq_msg *) mad_recv_wc->recv_buf.mad;
+	drep_msg = (struct cm_drep_msg *) msg->mad;
+
+	cm_format_mad_hdr(&drep_msg->hdr, CM_DREP_ATTR_ID, dreq_msg->hdr.tid);
+	drep_msg->remote_comm_id = dreq_msg->local_comm_id;
+	drep_msg->local_comm_id = dreq_msg->remote_comm_id;
+
+	ret = ib_post_send_mad(msg, NULL);
+	if (ret)
+		cm_free_msg(msg);
+
+	return ret;
+}
+
+static int cm_dreq_handler(struct cm_work *work)
+{
+	struct cm_id_private *cm_id_priv;
+	struct cm_dreq_msg *dreq_msg;
+	struct ib_mad_send_buf *msg = NULL;
+	int ret;
+
+	dreq_msg = (struct cm_dreq_msg *)work->mad_recv_wc->recv_buf.mad;
+	cm_id_priv = cm_acquire_id(dreq_msg->remote_comm_id,
+				   dreq_msg->local_comm_id);
+	if (!cm_id_priv) {
+		atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+				counter[CM_DREQ_COUNTER]);
+		cm_issue_drep(work->port, work->mad_recv_wc);
+		return -EINVAL;
+	}
+
+	work->cm_event.private_data = &dreq_msg->private_data;
+
+	spin_lock_irq(&cm_id_priv->lock);
+	if (cm_id_priv->local_qpn != cm_dreq_get_remote_qpn(dreq_msg))
+		goto unlock;
+
+	switch (cm_id_priv->id.state) {
+	case IB_CM_REP_SENT:
+	case IB_CM_DREQ_SENT:
+		ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+		break;
+	case IB_CM_ESTABLISHED:
+		if (cm_id_priv->id.lap_state == IB_CM_LAP_SENT ||
+		    cm_id_priv->id.lap_state == IB_CM_MRA_LAP_RCVD)
+			ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+		break;
+	case IB_CM_MRA_REP_RCVD:
+		break;
+	case IB_CM_TIMEWAIT:
+		atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+				counter[CM_DREQ_COUNTER]);
+		if (cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg))
+			goto unlock;
+
+		cm_format_drep((struct cm_drep_msg *) msg->mad, cm_id_priv,
+			       cm_id_priv->private_data,
+			       cm_id_priv->private_data_len);
+		spin_unlock_irq(&cm_id_priv->lock);
+
+		if (ib_post_send_mad(msg, NULL))
+			cm_free_msg(msg);
+		goto deref;
+	case IB_CM_DREQ_RCVD:
+		atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+				counter[CM_DREQ_COUNTER]);
+		goto unlock;
+	default:
+		goto unlock;
+	}
+	cm_id_priv->id.state = IB_CM_DREQ_RCVD;
+	cm_id_priv->tid = dreq_msg->hdr.tid;
+	ret = atomic_inc_and_test(&cm_id_priv->work_count);
+	if (!ret)
+		list_add_tail(&work->list, &cm_id_priv->work_list);
+	spin_unlock_irq(&cm_id_priv->lock);
+
+	if (ret)
+		cm_process_work(cm_id_priv, work);
+	else
+		cm_deref_id(cm_id_priv);
+	return 0;
+
+unlock:	spin_unlock_irq(&cm_id_priv->lock);
+deref:	cm_deref_id(cm_id_priv);
+	return -EINVAL;
+}
+
+static int cm_drep_handler(struct cm_work *work)
+{
+	struct cm_id_private *cm_id_priv;
+	struct cm_drep_msg *drep_msg;
+	int ret;
+
+	drep_msg = (struct cm_drep_msg *)work->mad_recv_wc->recv_buf.mad;
+	cm_id_priv = cm_acquire_id(drep_msg->remote_comm_id,
+				   drep_msg->local_comm_id);
+	if (!cm_id_priv)
+		return -EINVAL;
+
+	work->cm_event.private_data = &drep_msg->private_data;
+
+	spin_lock_irq(&cm_id_priv->lock);
+	if (cm_id_priv->id.state != IB_CM_DREQ_SENT &&
+	    cm_id_priv->id.state != IB_CM_DREQ_RCVD) {
+		spin_unlock_irq(&cm_id_priv->lock);
+		goto out;
+	}
+	cm_enter_timewait(cm_id_priv);
+
+	ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+	ret = atomic_inc_and_test(&cm_id_priv->work_count);
+	if (!ret)
+		list_add_tail(&work->list, &cm_id_priv->work_list);
+	spin_unlock_irq(&cm_id_priv->lock);
+
+	if (ret)
+		cm_process_work(cm_id_priv, work);
+	else
+		cm_deref_id(cm_id_priv);
+	return 0;
+out:
+	cm_deref_id(cm_id_priv);
+	return -EINVAL;
+}
+
+int ib_send_cm_rej(struct ib_cm_id *cm_id,
+		   enum ib_cm_rej_reason reason,
+		   void *ari,
+		   u8 ari_length,
+		   const void *private_data,
+		   u8 private_data_len)
+{
+	struct cm_id_private *cm_id_priv;
+	struct ib_mad_send_buf *msg;
+	unsigned long flags;
+	int ret;
+
+	if ((private_data && private_data_len > IB_CM_REJ_PRIVATE_DATA_SIZE) ||
+	    (ari && ari_length > IB_CM_REJ_ARI_LENGTH))
+		return -EINVAL;
+
+	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	switch (cm_id->state) {
+	case IB_CM_REQ_SENT:
+	case IB_CM_MRA_REQ_RCVD:
+	case IB_CM_REQ_RCVD:
+	case IB_CM_MRA_REQ_SENT:
+	case IB_CM_REP_RCVD:
+	case IB_CM_MRA_REP_SENT:
+		ret = cm_alloc_msg(cm_id_priv, &msg);
+		if (!ret)
+			cm_format_rej((struct cm_rej_msg *) msg->mad,
+				      cm_id_priv, reason, ari, ari_length,
+				      private_data, private_data_len);
+
+		cm_reset_to_idle(cm_id_priv);
+		break;
+	case IB_CM_REP_SENT:
+	case IB_CM_MRA_REP_RCVD:
+		ret = cm_alloc_msg(cm_id_priv, &msg);
+		if (!ret)
+			cm_format_rej((struct cm_rej_msg *) msg->mad,
+				      cm_id_priv, reason, ari, ari_length,
+				      private_data, private_data_len);
+
+		cm_enter_timewait(cm_id_priv);
+		break;
+	default:
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (ret)
+		goto out;
+
+	ret = ib_post_send_mad(msg, NULL);
+	if (ret)
+		cm_free_msg(msg);
+
+out:	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_rej);
+
+static void cm_format_rej_event(struct cm_work *work)
+{
+	struct cm_rej_msg *rej_msg;
+	struct ib_cm_rej_event_param *param;
+
+	rej_msg = (struct cm_rej_msg *)work->mad_recv_wc->recv_buf.mad;
+	param = &work->cm_event.param.rej_rcvd;
+	param->ari = rej_msg->ari;
+	param->ari_length = cm_rej_get_reject_info_len(rej_msg);
+	param->reason = __be16_to_cpu(rej_msg->reason);
+	work->cm_event.private_data = &rej_msg->private_data;
+}
+
+static struct cm_id_private * cm_acquire_rejected_id(struct cm_rej_msg *rej_msg)
+{
+	struct cm_timewait_info *timewait_info;
+	struct cm_id_private *cm_id_priv;
+	__be32 remote_id;
+
+	remote_id = rej_msg->local_comm_id;
+
+	if (__be16_to_cpu(rej_msg->reason) == IB_CM_REJ_TIMEOUT) {
+		spin_lock_irq(&cm.lock);
+		timewait_info = cm_find_remote_id( *((__be64 *) rej_msg->ari),
+						  remote_id);
+		if (!timewait_info) {
+			spin_unlock_irq(&cm.lock);
+			return NULL;
+		}
+		cm_id_priv = idr_find(&cm.local_id_table, (__force int)
+				      (timewait_info->work.local_id ^
+				       cm.random_id_operand));
+		if (cm_id_priv) {
+			if (cm_id_priv->id.remote_id == remote_id)
+				atomic_inc(&cm_id_priv->refcount);
+			else
+				cm_id_priv = NULL;
+		}
+		spin_unlock_irq(&cm.lock);
+	} else if (cm_rej_get_msg_rejected(rej_msg) == CM_MSG_RESPONSE_REQ)
+		cm_id_priv = cm_acquire_id(rej_msg->remote_comm_id, 0);
+	else
+		cm_id_priv = cm_acquire_id(rej_msg->remote_comm_id, remote_id);
+
+	return cm_id_priv;
+}
+
+static int cm_rej_handler(struct cm_work *work)
+{
+	struct cm_id_private *cm_id_priv;
+	struct cm_rej_msg *rej_msg;
+	int ret;
+
+	rej_msg = (struct cm_rej_msg *)work->mad_recv_wc->recv_buf.mad;
+	cm_id_priv = cm_acquire_rejected_id(rej_msg);
+	if (!cm_id_priv)
+		return -EINVAL;
+
+	cm_format_rej_event(work);
+
+	spin_lock_irq(&cm_id_priv->lock);
+	switch (cm_id_priv->id.state) {
+	case IB_CM_REQ_SENT:
+	case IB_CM_MRA_REQ_RCVD:
+	case IB_CM_REP_SENT:
+	case IB_CM_MRA_REP_RCVD:
+		ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+		/* fall through */
+	case IB_CM_REQ_RCVD:
+	case IB_CM_MRA_REQ_SENT:
+		if (__be16_to_cpu(rej_msg->reason) == IB_CM_REJ_STALE_CONN)
+			cm_enter_timewait(cm_id_priv);
+		else
+			cm_reset_to_idle(cm_id_priv);
+		break;
+	case IB_CM_DREQ_SENT:
+		ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+		/* fall through */
+	case IB_CM_REP_RCVD:
+	case IB_CM_MRA_REP_SENT:
+		cm_enter_timewait(cm_id_priv);
+		break;
+	case IB_CM_ESTABLISHED:
+		if (cm_id_priv->id.lap_state == IB_CM_LAP_UNINIT ||
+		    cm_id_priv->id.lap_state == IB_CM_LAP_SENT) {
+			if (cm_id_priv->id.lap_state == IB_CM_LAP_SENT)
+				ib_cancel_mad(cm_id_priv->av.port->mad_agent,
+					      cm_id_priv->msg);
+			cm_enter_timewait(cm_id_priv);
+			break;
+		}
+		/* fall through */
+	default:
+		spin_unlock_irq(&cm_id_priv->lock);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = atomic_inc_and_test(&cm_id_priv->work_count);
+	if (!ret)
+		list_add_tail(&work->list, &cm_id_priv->work_list);
+	spin_unlock_irq(&cm_id_priv->lock);
+
+	if (ret)
+		cm_process_work(cm_id_priv, work);
+	else
+		cm_deref_id(cm_id_priv);
+	return 0;
+out:
+	cm_deref_id(cm_id_priv);
+	return -EINVAL;
+}
+
+int ib_send_cm_mra(struct ib_cm_id *cm_id,
+		   u8 service_timeout,
+		   const void *private_data,
+		   u8 private_data_len)
+{
+	struct cm_id_private *cm_id_priv;
+	struct ib_mad_send_buf *msg;
+	enum ib_cm_state cm_state;
+	enum ib_cm_lap_state lap_state;
+	enum cm_msg_response msg_response;
+	void *data;
+	unsigned long flags;
+	int ret;
+
+	if (private_data && private_data_len > IB_CM_MRA_PRIVATE_DATA_SIZE)
+		return -EINVAL;
+
+	data = cm_copy_private_data(private_data, private_data_len);
+	if (IS_ERR(data))
+		return PTR_ERR(data);
+
+	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	switch(cm_id_priv->id.state) {
+	case IB_CM_REQ_RCVD:
+		cm_state = IB_CM_MRA_REQ_SENT;
+		lap_state = cm_id->lap_state;
+		msg_response = CM_MSG_RESPONSE_REQ;
+		break;
+	case IB_CM_REP_RCVD:
+		cm_state = IB_CM_MRA_REP_SENT;
+		lap_state = cm_id->lap_state;
+		msg_response = CM_MSG_RESPONSE_REP;
+		break;
+	case IB_CM_ESTABLISHED:
+		if (cm_id->lap_state == IB_CM_LAP_RCVD) {
+			cm_state = cm_id->state;
+			lap_state = IB_CM_MRA_LAP_SENT;
+			msg_response = CM_MSG_RESPONSE_OTHER;
+			break;
+		}
+	default:
+		ret = -EINVAL;
+		goto error1;
+	}
+
+	if (!(service_timeout & IB_CM_MRA_FLAG_DELAY)) {
+		ret = cm_alloc_msg(cm_id_priv, &msg);
+		if (ret)
+			goto error1;
+
+		cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv,
+			      msg_response, service_timeout,
+			      private_data, private_data_len);
+		ret = ib_post_send_mad(msg, NULL);
+		if (ret)
+			goto error2;
+	}
+
+	cm_id->state = cm_state;
+	cm_id->lap_state = lap_state;
+	cm_id_priv->service_timeout = service_timeout;
+	cm_set_private_data(cm_id_priv, data, private_data_len);
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	return 0;
+
+error1:	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	kfree(data);
+	return ret;
+
+error2:	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	kfree(data);
+	cm_free_msg(msg);
+	return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_mra);
+
+static struct cm_id_private * cm_acquire_mraed_id(struct cm_mra_msg *mra_msg)
+{
+	switch (cm_mra_get_msg_mraed(mra_msg)) {
+	case CM_MSG_RESPONSE_REQ:
+		return cm_acquire_id(mra_msg->remote_comm_id, 0);
+	case CM_MSG_RESPONSE_REP:
+	case CM_MSG_RESPONSE_OTHER:
+		return cm_acquire_id(mra_msg->remote_comm_id,
+				     mra_msg->local_comm_id);
+	default:
+		return NULL;
+	}
+}
+
+static int cm_mra_handler(struct cm_work *work)
+{
+	struct cm_id_private *cm_id_priv;
+	struct cm_mra_msg *mra_msg;
+	int timeout, ret;
+
+	mra_msg = (struct cm_mra_msg *)work->mad_recv_wc->recv_buf.mad;
+	cm_id_priv = cm_acquire_mraed_id(mra_msg);
+	if (!cm_id_priv)
+		return -EINVAL;
+
+	work->cm_event.private_data = &mra_msg->private_data;
+	work->cm_event.param.mra_rcvd.service_timeout =
+					cm_mra_get_service_timeout(mra_msg);
+	timeout = cm_convert_to_ms(cm_mra_get_service_timeout(mra_msg)) +
+		  cm_convert_to_ms(cm_id_priv->av.timeout);
+
+	spin_lock_irq(&cm_id_priv->lock);
+	switch (cm_id_priv->id.state) {
+	case IB_CM_REQ_SENT:
+		if (cm_mra_get_msg_mraed(mra_msg) != CM_MSG_RESPONSE_REQ ||
+		    ib_modify_mad(cm_id_priv->av.port->mad_agent,
+				  cm_id_priv->msg, timeout))
+			goto out;
+		cm_id_priv->id.state = IB_CM_MRA_REQ_RCVD;
+		break;
+	case IB_CM_REP_SENT:
+		if (cm_mra_get_msg_mraed(mra_msg) != CM_MSG_RESPONSE_REP ||
+		    ib_modify_mad(cm_id_priv->av.port->mad_agent,
+				  cm_id_priv->msg, timeout))
+			goto out;
+		cm_id_priv->id.state = IB_CM_MRA_REP_RCVD;
+		break;
+	case IB_CM_ESTABLISHED:
+		if (cm_mra_get_msg_mraed(mra_msg) != CM_MSG_RESPONSE_OTHER ||
+		    cm_id_priv->id.lap_state != IB_CM_LAP_SENT ||
+		    ib_modify_mad(cm_id_priv->av.port->mad_agent,
+				  cm_id_priv->msg, timeout)) {
+			if (cm_id_priv->id.lap_state == IB_CM_MRA_LAP_RCVD)
+				atomic_long_inc(&work->port->
+						counter_group[CM_RECV_DUPLICATES].
+						counter[CM_MRA_COUNTER]);
+			goto out;
+		}
+		cm_id_priv->id.lap_state = IB_CM_MRA_LAP_RCVD;
+		break;
+	case IB_CM_MRA_REQ_RCVD:
+	case IB_CM_MRA_REP_RCVD:
+		atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+				counter[CM_MRA_COUNTER]);
+		/* fall through */
+	default:
+		goto out;
+	}
+
+	cm_id_priv->msg->context[1] = (void *) (unsigned long)
+				      cm_id_priv->id.state;
+	ret = atomic_inc_and_test(&cm_id_priv->work_count);
+	if (!ret)
+		list_add_tail(&work->list, &cm_id_priv->work_list);
+	spin_unlock_irq(&cm_id_priv->lock);
+
+	if (ret)
+		cm_process_work(cm_id_priv, work);
+	else
+		cm_deref_id(cm_id_priv);
+	return 0;
+out:
+	spin_unlock_irq(&cm_id_priv->lock);
+	cm_deref_id(cm_id_priv);
+	return -EINVAL;
+}
+
+static void cm_format_lap(struct cm_lap_msg *lap_msg,
+			  struct cm_id_private *cm_id_priv,
+			  struct ib_sa_path_rec *alternate_path,
+			  const void *private_data,
+			  u8 private_data_len)
+{
+	cm_format_mad_hdr(&lap_msg->hdr, CM_LAP_ATTR_ID,
+			  cm_form_tid(cm_id_priv, CM_MSG_SEQUENCE_LAP));
+	lap_msg->local_comm_id = cm_id_priv->id.local_id;
+	lap_msg->remote_comm_id = cm_id_priv->id.remote_id;
+	cm_lap_set_remote_qpn(lap_msg, cm_id_priv->remote_qpn);
+	/* todo: need remote CM response timeout */
+	cm_lap_set_remote_resp_timeout(lap_msg, 0x1F);
+	lap_msg->alt_local_lid = alternate_path->slid;
+	lap_msg->alt_remote_lid = alternate_path->dlid;
+	lap_msg->alt_local_gid = alternate_path->sgid;
+	lap_msg->alt_remote_gid = alternate_path->dgid;
+	cm_lap_set_flow_label(lap_msg, alternate_path->flow_label);
+	cm_lap_set_traffic_class(lap_msg, alternate_path->traffic_class);
+	lap_msg->alt_hop_limit = alternate_path->hop_limit;
+	cm_lap_set_packet_rate(lap_msg, alternate_path->rate);
+	cm_lap_set_sl(lap_msg, alternate_path->sl);
+	cm_lap_set_subnet_local(lap_msg, 1); /* local only... */
+	cm_lap_set_local_ack_timeout(lap_msg,
+		cm_ack_timeout(cm_id_priv->av.port->cm_dev->ack_delay,
+			       alternate_path->packet_life_time));
+
+	if (private_data && private_data_len)
+		memcpy(lap_msg->private_data, private_data, private_data_len);
+}
+
+int ib_send_cm_lap(struct ib_cm_id *cm_id,
+		   struct ib_sa_path_rec *alternate_path,
+		   const void *private_data,
+		   u8 private_data_len)
+{
+	struct cm_id_private *cm_id_priv;
+	struct ib_mad_send_buf *msg;
+	unsigned long flags;
+	int ret;
+
+	if (private_data && private_data_len > IB_CM_LAP_PRIVATE_DATA_SIZE)
+		return -EINVAL;
+
+	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	if (cm_id->state != IB_CM_ESTABLISHED ||
+	    (cm_id->lap_state != IB_CM_LAP_UNINIT &&
+	     cm_id->lap_state != IB_CM_LAP_IDLE)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = cm_init_av_by_path(alternate_path, &cm_id_priv->alt_av);
+	if (ret)
+		goto out;
+	cm_id_priv->alt_av.timeout =
+			cm_ack_timeout(cm_id_priv->target_ack_delay,
+				       cm_id_priv->alt_av.timeout - 1);
+
+	ret = cm_alloc_msg(cm_id_priv, &msg);
+	if (ret)
+		goto out;
+
+	cm_format_lap((struct cm_lap_msg *) msg->mad, cm_id_priv,
+		      alternate_path, private_data, private_data_len);
+	msg->timeout_ms = cm_id_priv->timeout_ms;
+	msg->context[1] = (void *) (unsigned long) IB_CM_ESTABLISHED;
+
+	ret = ib_post_send_mad(msg, NULL);
+	if (ret) {
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+		cm_free_msg(msg);
+		return ret;
+	}
+
+	cm_id->lap_state = IB_CM_LAP_SENT;
+	cm_id_priv->msg = msg;
+
+out:	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_lap);
+
+static void cm_format_path_from_lap(struct cm_id_private *cm_id_priv,
+				    struct ib_sa_path_rec *path,
+				    struct cm_lap_msg *lap_msg)
+{
+	memset(path, 0, sizeof *path);
+	path->dgid = lap_msg->alt_local_gid;
+	path->sgid = lap_msg->alt_remote_gid;
+	path->dlid = lap_msg->alt_local_lid;
+	path->slid = lap_msg->alt_remote_lid;
+	path->flow_label = cm_lap_get_flow_label(lap_msg);
+	path->hop_limit = lap_msg->alt_hop_limit;
+	path->traffic_class = cm_lap_get_traffic_class(lap_msg);
+	path->reversible = 1;
+	path->pkey = cm_id_priv->pkey;
+	path->sl = cm_lap_get_sl(lap_msg);
+	path->mtu_selector = IB_SA_EQ;
+	path->mtu = cm_id_priv->path_mtu;
+	path->rate_selector = IB_SA_EQ;
+	path->rate = cm_lap_get_packet_rate(lap_msg);
+	path->packet_life_time_selector = IB_SA_EQ;
+	path->packet_life_time = cm_lap_get_local_ack_timeout(lap_msg);
+	path->packet_life_time -= (path->packet_life_time > 0);
+}
+
+static int cm_lap_handler(struct cm_work *work)
+{
+	struct cm_id_private *cm_id_priv;
+	struct cm_lap_msg *lap_msg;
+	struct ib_cm_lap_event_param *param;
+	struct ib_mad_send_buf *msg = NULL;
+	int ret;
+
+	/* todo: verify LAP request and send reject APR if invalid. */
+	lap_msg = (struct cm_lap_msg *)work->mad_recv_wc->recv_buf.mad;
+	cm_id_priv = cm_acquire_id(lap_msg->remote_comm_id,
+				   lap_msg->local_comm_id);
+	if (!cm_id_priv)
+		return -EINVAL;
+
+	param = &work->cm_event.param.lap_rcvd;
+	param->alternate_path = &work->path[0];
+	cm_format_path_from_lap(cm_id_priv, param->alternate_path, lap_msg);
+	work->cm_event.private_data = &lap_msg->private_data;
+
+	spin_lock_irq(&cm_id_priv->lock);
+	if (cm_id_priv->id.state != IB_CM_ESTABLISHED)
+		goto unlock;
+
+	switch (cm_id_priv->id.lap_state) {
+	case IB_CM_LAP_UNINIT:
+	case IB_CM_LAP_IDLE:
+		break;
+	case IB_CM_MRA_LAP_SENT:
+		atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+				counter[CM_LAP_COUNTER]);
+		if (cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg))
+			goto unlock;
+
+		cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv,
+			      CM_MSG_RESPONSE_OTHER,
+			      cm_id_priv->service_timeout,
+			      cm_id_priv->private_data,
+			      cm_id_priv->private_data_len);
+		spin_unlock_irq(&cm_id_priv->lock);
+
+		if (ib_post_send_mad(msg, NULL))
+			cm_free_msg(msg);
+		goto deref;
+	case IB_CM_LAP_RCVD:
+		atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+				counter[CM_LAP_COUNTER]);
+		goto unlock;
+	default:
+		goto unlock;
+	}
+
+	cm_id_priv->id.lap_state = IB_CM_LAP_RCVD;
+	cm_id_priv->tid = lap_msg->hdr.tid;
+	cm_init_av_for_response(work->port, work->mad_recv_wc->wc,
+				work->mad_recv_wc->recv_buf.grh,
+				&cm_id_priv->av);
+	cm_init_av_by_path(param->alternate_path, &cm_id_priv->alt_av);
+	ret = atomic_inc_and_test(&cm_id_priv->work_count);
+	if (!ret)
+		list_add_tail(&work->list, &cm_id_priv->work_list);
+	spin_unlock_irq(&cm_id_priv->lock);
+
+	if (ret)
+		cm_process_work(cm_id_priv, work);
+	else
+		cm_deref_id(cm_id_priv);
+	return 0;
+
+unlock:	spin_unlock_irq(&cm_id_priv->lock);
+deref:	cm_deref_id(cm_id_priv);
+	return -EINVAL;
+}
+
+static void cm_format_apr(struct cm_apr_msg *apr_msg,
+			  struct cm_id_private *cm_id_priv,
+			  enum ib_cm_apr_status status,
+			  void *info,
+			  u8 info_length,
+			  const void *private_data,
+			  u8 private_data_len)
+{
+	cm_format_mad_hdr(&apr_msg->hdr, CM_APR_ATTR_ID, cm_id_priv->tid);
+	apr_msg->local_comm_id = cm_id_priv->id.local_id;
+	apr_msg->remote_comm_id = cm_id_priv->id.remote_id;
+	apr_msg->ap_status = (u8) status;
+
+	if (info && info_length) {
+		apr_msg->info_length = info_length;
+		memcpy(apr_msg->info, info, info_length);
+	}
+
+	if (private_data && private_data_len)
+		memcpy(apr_msg->private_data, private_data, private_data_len);
+}
+
+int ib_send_cm_apr(struct ib_cm_id *cm_id,
+		   enum ib_cm_apr_status status,
+		   void *info,
+		   u8 info_length,
+		   const void *private_data,
+		   u8 private_data_len)
+{
+	struct cm_id_private *cm_id_priv;
+	struct ib_mad_send_buf *msg;
+	unsigned long flags;
+	int ret;
+
+	if ((private_data && private_data_len > IB_CM_APR_PRIVATE_DATA_SIZE) ||
+	    (info && info_length > IB_CM_APR_INFO_LENGTH))
+		return -EINVAL;
+
+	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	if (cm_id->state != IB_CM_ESTABLISHED ||
+	    (cm_id->lap_state != IB_CM_LAP_RCVD &&
+	     cm_id->lap_state != IB_CM_MRA_LAP_SENT)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = cm_alloc_msg(cm_id_priv, &msg);
+	if (ret)
+		goto out;
+
+	cm_format_apr((struct cm_apr_msg *) msg->mad, cm_id_priv, status,
+		      info, info_length, private_data, private_data_len);
+	ret = ib_post_send_mad(msg, NULL);
+	if (ret) {
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+		cm_free_msg(msg);
+		return ret;
+	}
+
+	cm_id->lap_state = IB_CM_LAP_IDLE;
+out:	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_apr);
+
+static int cm_apr_handler(struct cm_work *work)
+{
+	struct cm_id_private *cm_id_priv;
+	struct cm_apr_msg *apr_msg;
+	int ret;
+
+	apr_msg = (struct cm_apr_msg *)work->mad_recv_wc->recv_buf.mad;
+	cm_id_priv = cm_acquire_id(apr_msg->remote_comm_id,
+				   apr_msg->local_comm_id);
+	if (!cm_id_priv)
+		return -EINVAL; /* Unmatched reply. */
+
+	work->cm_event.param.apr_rcvd.ap_status = apr_msg->ap_status;
+	work->cm_event.param.apr_rcvd.apr_info = &apr_msg->info;
+	work->cm_event.param.apr_rcvd.info_len = apr_msg->info_length;
+	work->cm_event.private_data = &apr_msg->private_data;
+
+	spin_lock_irq(&cm_id_priv->lock);
+	if (cm_id_priv->id.state != IB_CM_ESTABLISHED ||
+	    (cm_id_priv->id.lap_state != IB_CM_LAP_SENT &&
+	     cm_id_priv->id.lap_state != IB_CM_MRA_LAP_RCVD)) {
+		spin_unlock_irq(&cm_id_priv->lock);
+		goto out;
+	}
+	cm_id_priv->id.lap_state = IB_CM_LAP_IDLE;
+	ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+	cm_id_priv->msg = NULL;
+
+	ret = atomic_inc_and_test(&cm_id_priv->work_count);
+	if (!ret)
+		list_add_tail(&work->list, &cm_id_priv->work_list);
+	spin_unlock_irq(&cm_id_priv->lock);
+
+	if (ret)
+		cm_process_work(cm_id_priv, work);
+	else
+		cm_deref_id(cm_id_priv);
+	return 0;
+out:
+	cm_deref_id(cm_id_priv);
+	return -EINVAL;
+}
+
+static int cm_timewait_handler(struct cm_work *work)
+{
+	struct cm_timewait_info *timewait_info;
+	struct cm_id_private *cm_id_priv;
+	int ret;
+
+	timewait_info = (struct cm_timewait_info *)work;
+	spin_lock_irq(&cm.lock);
+	list_del(&timewait_info->list);
+	spin_unlock_irq(&cm.lock);
+
+	cm_id_priv = cm_acquire_id(timewait_info->work.local_id,
+				   timewait_info->work.remote_id);
+	if (!cm_id_priv)
+		return -EINVAL;
+
+	spin_lock_irq(&cm_id_priv->lock);
+	if (cm_id_priv->id.state != IB_CM_TIMEWAIT ||
+	    cm_id_priv->remote_qpn != timewait_info->remote_qpn) {
+		spin_unlock_irq(&cm_id_priv->lock);
+		goto out;
+	}
+	cm_id_priv->id.state = IB_CM_IDLE;
+	ret = atomic_inc_and_test(&cm_id_priv->work_count);
+	if (!ret)
+		list_add_tail(&work->list, &cm_id_priv->work_list);
+	spin_unlock_irq(&cm_id_priv->lock);
+
+	if (ret)
+		cm_process_work(cm_id_priv, work);
+	else
+		cm_deref_id(cm_id_priv);
+	return 0;
+out:
+	cm_deref_id(cm_id_priv);
+	return -EINVAL;
+}
+
+static void cm_format_sidr_req(struct cm_sidr_req_msg *sidr_req_msg,
+			       struct cm_id_private *cm_id_priv,
+			       struct ib_cm_sidr_req_param *param)
+{
+	cm_format_mad_hdr(&sidr_req_msg->hdr, CM_SIDR_REQ_ATTR_ID,
+			  cm_form_tid(cm_id_priv, CM_MSG_SEQUENCE_SIDR));
+	sidr_req_msg->request_id = cm_id_priv->id.local_id;
+	sidr_req_msg->pkey = param->path->pkey;
+	sidr_req_msg->service_id = param->service_id;
+
+	if (param->private_data && param->private_data_len)
+		memcpy(sidr_req_msg->private_data, param->private_data,
+		       param->private_data_len);
+}
+
+int ib_send_cm_sidr_req(struct ib_cm_id *cm_id,
+			struct ib_cm_sidr_req_param *param)
+{
+	struct cm_id_private *cm_id_priv;
+	struct ib_mad_send_buf *msg;
+	unsigned long flags;
+	int ret;
+
+	if (!param->path || (param->private_data &&
+	     param->private_data_len > IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE))
+		return -EINVAL;
+
+	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+	ret = cm_init_av_by_path(param->path, &cm_id_priv->av);
+	if (ret)
+		goto out;
+
+	cm_id->service_id = param->service_id;
+	cm_id->service_mask = ~cpu_to_be64(0);
+	cm_id_priv->timeout_ms = param->timeout_ms;
+	cm_id_priv->max_cm_retries = param->max_cm_retries;
+	ret = cm_alloc_msg(cm_id_priv, &msg);
+	if (ret)
+		goto out;
+
+	cm_format_sidr_req((struct cm_sidr_req_msg *) msg->mad, cm_id_priv,
+			   param);
+	msg->timeout_ms = cm_id_priv->timeout_ms;
+	msg->context[1] = (void *) (unsigned long) IB_CM_SIDR_REQ_SENT;
+
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	if (cm_id->state == IB_CM_IDLE)
+		ret = ib_post_send_mad(msg, NULL);
+	else
+		ret = -EINVAL;
+
+	if (ret) {
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+		cm_free_msg(msg);
+		goto out;
+	}
+	cm_id->state = IB_CM_SIDR_REQ_SENT;
+	cm_id_priv->msg = msg;
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+out:
+	return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_sidr_req);
+
+static void cm_format_sidr_req_event(struct cm_work *work,
+				     struct ib_cm_id *listen_id)
+{
+	struct cm_sidr_req_msg *sidr_req_msg;
+	struct ib_cm_sidr_req_event_param *param;
+
+	sidr_req_msg = (struct cm_sidr_req_msg *)
+				work->mad_recv_wc->recv_buf.mad;
+	param = &work->cm_event.param.sidr_req_rcvd;
+	param->pkey = __be16_to_cpu(sidr_req_msg->pkey);
+	param->listen_id = listen_id;
+	param->port = work->port->port_num;
+	work->cm_event.private_data = &sidr_req_msg->private_data;
+}
+
+static int cm_sidr_req_handler(struct cm_work *work)
+{
+	struct ib_cm_id *cm_id;
+	struct cm_id_private *cm_id_priv, *cur_cm_id_priv;
+	struct cm_sidr_req_msg *sidr_req_msg;
+	struct ib_wc *wc;
+
+	cm_id = ib_create_cm_id(work->port->cm_dev->ib_device, NULL, NULL);
+	if (IS_ERR(cm_id))
+		return PTR_ERR(cm_id);
+	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+
+	/* Record SGID/SLID and request ID for lookup. */
+	sidr_req_msg = (struct cm_sidr_req_msg *)
+				work->mad_recv_wc->recv_buf.mad;
+	wc = work->mad_recv_wc->wc;
+	cm_id_priv->av.dgid.global.subnet_prefix = cpu_to_be64(wc->slid);
+	cm_id_priv->av.dgid.global.interface_id = 0;
+	cm_init_av_for_response(work->port, work->mad_recv_wc->wc,
+				work->mad_recv_wc->recv_buf.grh,
+				&cm_id_priv->av);
+	cm_id_priv->id.remote_id = sidr_req_msg->request_id;
+	cm_id_priv->tid = sidr_req_msg->hdr.tid;
+	atomic_inc(&cm_id_priv->work_count);
+
+	spin_lock_irq(&cm.lock);
+	cur_cm_id_priv = cm_insert_remote_sidr(cm_id_priv);
+	if (cur_cm_id_priv) {
+		spin_unlock_irq(&cm.lock);
+		atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+				counter[CM_SIDR_REQ_COUNTER]);
+		goto out; /* Duplicate message. */
+	}
+	cm_id_priv->id.state = IB_CM_SIDR_REQ_RCVD;
+	cur_cm_id_priv = cm_find_listen(cm_id->device,
+					sidr_req_msg->service_id,
+					sidr_req_msg->private_data);
+	if (!cur_cm_id_priv) {
+		spin_unlock_irq(&cm.lock);
+		cm_reject_sidr_req(cm_id_priv, IB_SIDR_UNSUPPORTED);
+		goto out; /* No match. */
+	}
+	atomic_inc(&cur_cm_id_priv->refcount);
+	atomic_inc(&cm_id_priv->refcount);
+	spin_unlock_irq(&cm.lock);
+
+	cm_id_priv->id.cm_handler = cur_cm_id_priv->id.cm_handler;
+	cm_id_priv->id.context = cur_cm_id_priv->id.context;
+	cm_id_priv->id.service_id = sidr_req_msg->service_id;
+	cm_id_priv->id.service_mask = ~cpu_to_be64(0);
+
+	cm_format_sidr_req_event(work, &cur_cm_id_priv->id);
+	cm_process_work(cm_id_priv, work);
+	cm_deref_id(cur_cm_id_priv);
+	return 0;
+out:
+	ib_destroy_cm_id(&cm_id_priv->id);
+	return -EINVAL;
+}
+
+static void cm_format_sidr_rep(struct cm_sidr_rep_msg *sidr_rep_msg,
+			       struct cm_id_private *cm_id_priv,
+			       struct ib_cm_sidr_rep_param *param)
+{
+	cm_format_mad_hdr(&sidr_rep_msg->hdr, CM_SIDR_REP_ATTR_ID,
+			  cm_id_priv->tid);
+	sidr_rep_msg->request_id = cm_id_priv->id.remote_id;
+	sidr_rep_msg->status = param->status;
+	cm_sidr_rep_set_qpn(sidr_rep_msg, cpu_to_be32(param->qp_num));
+	sidr_rep_msg->service_id = cm_id_priv->id.service_id;
+	sidr_rep_msg->qkey = cpu_to_be32(param->qkey);
+
+	if (param->info && param->info_length)
+		memcpy(sidr_rep_msg->info, param->info, param->info_length);
+
+	if (param->private_data && param->private_data_len)
+		memcpy(sidr_rep_msg->private_data, param->private_data,
+		       param->private_data_len);
+}
+
+int ib_send_cm_sidr_rep(struct ib_cm_id *cm_id,
+			struct ib_cm_sidr_rep_param *param)
+{
+	struct cm_id_private *cm_id_priv;
+	struct ib_mad_send_buf *msg;
+	unsigned long flags;
+	int ret;
+
+	if ((param->info && param->info_length > IB_CM_SIDR_REP_INFO_LENGTH) ||
+	    (param->private_data &&
+	     param->private_data_len > IB_CM_SIDR_REP_PRIVATE_DATA_SIZE))
+		return -EINVAL;
+
+	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	if (cm_id->state != IB_CM_SIDR_REQ_RCVD) {
+		ret = -EINVAL;
+		goto error;
+	}
+
+	ret = cm_alloc_msg(cm_id_priv, &msg);
+	if (ret)
+		goto error;
+
+	cm_format_sidr_rep((struct cm_sidr_rep_msg *) msg->mad, cm_id_priv,
+			   param);
+	ret = ib_post_send_mad(msg, NULL);
+	if (ret) {
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+		cm_free_msg(msg);
+		return ret;
+	}
+	cm_id->state = IB_CM_IDLE;
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+	spin_lock_irqsave(&cm.lock, flags);
+	rb_erase(&cm_id_priv->sidr_id_node, &cm.remote_sidr_table);
+	spin_unlock_irqrestore(&cm.lock, flags);
+	return 0;
+
+error:	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_sidr_rep);
+
+static void cm_format_sidr_rep_event(struct cm_work *work)
+{
+	struct cm_sidr_rep_msg *sidr_rep_msg;
+	struct ib_cm_sidr_rep_event_param *param;
+
+	sidr_rep_msg = (struct cm_sidr_rep_msg *)
+				work->mad_recv_wc->recv_buf.mad;
+	param = &work->cm_event.param.sidr_rep_rcvd;
+	param->status = sidr_rep_msg->status;
+	param->qkey = be32_to_cpu(sidr_rep_msg->qkey);
+	param->qpn = be32_to_cpu(cm_sidr_rep_get_qpn(sidr_rep_msg));
+	param->info = &sidr_rep_msg->info;
+	param->info_len = sidr_rep_msg->info_length;
+	work->cm_event.private_data = &sidr_rep_msg->private_data;
+}
+
+static int cm_sidr_rep_handler(struct cm_work *work)
+{
+	struct cm_sidr_rep_msg *sidr_rep_msg;
+	struct cm_id_private *cm_id_priv;
+
+	sidr_rep_msg = (struct cm_sidr_rep_msg *)
+				work->mad_recv_wc->recv_buf.mad;
+	cm_id_priv = cm_acquire_id(sidr_rep_msg->request_id, 0);
+	if (!cm_id_priv)
+		return -EINVAL; /* Unmatched reply. */
+
+	spin_lock_irq(&cm_id_priv->lock);
+	if (cm_id_priv->id.state != IB_CM_SIDR_REQ_SENT) {
+		spin_unlock_irq(&cm_id_priv->lock);
+		goto out;
+	}
+	cm_id_priv->id.state = IB_CM_IDLE;
+	ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+	spin_unlock_irq(&cm_id_priv->lock);
+
+	cm_format_sidr_rep_event(work);
+	cm_process_work(cm_id_priv, work);
+	return 0;
+out:
+	cm_deref_id(cm_id_priv);
+	return -EINVAL;
+}
+
+static void cm_process_send_error(struct ib_mad_send_buf *msg,
+				  enum ib_wc_status wc_status)
+{
+	struct cm_id_private *cm_id_priv;
+	struct ib_cm_event cm_event;
+	enum ib_cm_state state;
+	int ret;
+
+	memset(&cm_event, 0, sizeof cm_event);
+	cm_id_priv = msg->context[0];
+
+	/* Discard old sends or ones without a response. */
+	spin_lock_irq(&cm_id_priv->lock);
+	state = (enum ib_cm_state) (unsigned long) msg->context[1];
+	if (msg != cm_id_priv->msg || state != cm_id_priv->id.state)
+		goto discard;
+
+	switch (state) {
+	case IB_CM_REQ_SENT:
+	case IB_CM_MRA_REQ_RCVD:
+		cm_reset_to_idle(cm_id_priv);
+		cm_event.event = IB_CM_REQ_ERROR;
+		break;
+	case IB_CM_REP_SENT:
+	case IB_CM_MRA_REP_RCVD:
+		cm_reset_to_idle(cm_id_priv);
+		cm_event.event = IB_CM_REP_ERROR;
+		break;
+	case IB_CM_DREQ_SENT:
+		cm_enter_timewait(cm_id_priv);
+		cm_event.event = IB_CM_DREQ_ERROR;
+		break;
+	case IB_CM_SIDR_REQ_SENT:
+		cm_id_priv->id.state = IB_CM_IDLE;
+		cm_event.event = IB_CM_SIDR_REQ_ERROR;
+		break;
+	default:
+		goto discard;
+	}
+	spin_unlock_irq(&cm_id_priv->lock);
+	cm_event.param.send_status = wc_status;
+
+	/* No other events can occur on the cm_id at this point. */
+	ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, &cm_event);
+	cm_free_msg(msg);
+	if (ret)
+		ib_destroy_cm_id(&cm_id_priv->id);
+	return;
+discard:
+	spin_unlock_irq(&cm_id_priv->lock);
+	cm_free_msg(msg);
+}
+
+static void cm_send_handler(struct ib_mad_agent *mad_agent,
+			    struct ib_mad_send_wc *mad_send_wc)
+{
+	struct ib_mad_send_buf *msg = mad_send_wc->send_buf;
+	struct cm_port *port;
+	u16 attr_index;
+
+	port = mad_agent->context;
+	attr_index = be16_to_cpu(((struct ib_mad_hdr *)
+				  msg->mad)->attr_id) - CM_ATTR_ID_OFFSET;
+
+	/*
+	 * If the send was in response to a received message (context[0] is not
+	 * set to a cm_id), and is not a REJ, then it is a send that was
+	 * manually retried.
+	 */
+	if (!msg->context[0] && (attr_index != CM_REJ_COUNTER))
+		msg->retries = 1;
+
+	atomic_long_add(1 + msg->retries,
+			&port->counter_group[CM_XMIT].counter[attr_index]);
+	if (msg->retries)
+		atomic_long_add(msg->retries,
+				&port->counter_group[CM_XMIT_RETRIES].
+				counter[attr_index]);
+
+	switch (mad_send_wc->status) {
+	case IB_WC_SUCCESS:
+	case IB_WC_WR_FLUSH_ERR:
+		cm_free_msg(msg);
+		break;
+	default:
+		if (msg->context[0] && msg->context[1])
+			cm_process_send_error(msg, mad_send_wc->status);
+		else
+			cm_free_msg(msg);
+		break;
+	}
+}
+
+static void cm_work_handler(struct work_struct *_work)
+{
+	struct cm_work *work = container_of(_work, struct cm_work, work.work);
+	int ret;
+
+	switch (work->cm_event.event) {
+	case IB_CM_REQ_RECEIVED:
+		ret = cm_req_handler(work);
+		break;
+	case IB_CM_MRA_RECEIVED:
+		ret = cm_mra_handler(work);
+		break;
+	case IB_CM_REJ_RECEIVED:
+		ret = cm_rej_handler(work);
+		break;
+	case IB_CM_REP_RECEIVED:
+		ret = cm_rep_handler(work);
+		break;
+	case IB_CM_RTU_RECEIVED:
+		ret = cm_rtu_handler(work);
+		break;
+	case IB_CM_USER_ESTABLISHED:
+		ret = cm_establish_handler(work);
+		break;
+	case IB_CM_DREQ_RECEIVED:
+		ret = cm_dreq_handler(work);
+		break;
+	case IB_CM_DREP_RECEIVED:
+		ret = cm_drep_handler(work);
+		break;
+	case IB_CM_SIDR_REQ_RECEIVED:
+		ret = cm_sidr_req_handler(work);
+		break;
+	case IB_CM_SIDR_REP_RECEIVED:
+		ret = cm_sidr_rep_handler(work);
+		break;
+	case IB_CM_LAP_RECEIVED:
+		ret = cm_lap_handler(work);
+		break;
+	case IB_CM_APR_RECEIVED:
+		ret = cm_apr_handler(work);
+		break;
+	case IB_CM_TIMEWAIT_EXIT:
+		ret = cm_timewait_handler(work);
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+	if (ret)
+		cm_free_work(work);
+}
+
+static int cm_establish(struct ib_cm_id *cm_id)
+{
+	struct cm_id_private *cm_id_priv;
+	struct cm_work *work;
+	unsigned long flags;
+	int ret = 0;
+
+	work = kmalloc(sizeof *work, GFP_ATOMIC);
+	if (!work)
+		return -ENOMEM;
+
+	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	switch (cm_id->state)
+	{
+	case IB_CM_REP_SENT:
+	case IB_CM_MRA_REP_RCVD:
+		cm_id->state = IB_CM_ESTABLISHED;
+		break;
+	case IB_CM_ESTABLISHED:
+		ret = -EISCONN;
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+	if (ret) {
+		kfree(work);
+		goto out;
+	}
+
+	/*
+	 * The CM worker thread may try to destroy the cm_id before it
+	 * can execute this work item.  To prevent potential deadlock,
+	 * we need to find the cm_id once we're in the context of the
+	 * worker thread, rather than holding a reference on it.
+	 */
+	INIT_DELAYED_WORK(&work->work, cm_work_handler);
+	work->local_id = cm_id->local_id;
+	work->remote_id = cm_id->remote_id;
+	work->mad_recv_wc = NULL;
+	work->cm_event.event = IB_CM_USER_ESTABLISHED;
+	queue_delayed_work(cm.wq, &work->work, 0);
+out:
+	return ret;
+}
+
+static int cm_migrate(struct ib_cm_id *cm_id)
+{
+	struct cm_id_private *cm_id_priv;
+	unsigned long flags;
+	int ret = 0;
+
+	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	if (cm_id->state == IB_CM_ESTABLISHED &&
+	    (cm_id->lap_state == IB_CM_LAP_UNINIT ||
+	     cm_id->lap_state == IB_CM_LAP_IDLE)) {
+		cm_id->lap_state = IB_CM_LAP_IDLE;
+		cm_id_priv->av = cm_id_priv->alt_av;
+	} else
+		ret = -EINVAL;
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+	return ret;
+}
+
+int ib_cm_notify(struct ib_cm_id *cm_id, enum ib_event_type event)
+{
+	int ret;
+
+	switch (event) {
+	case IB_EVENT_COMM_EST:
+		ret = cm_establish(cm_id);
+		break;
+	case IB_EVENT_PATH_MIG:
+		ret = cm_migrate(cm_id);
+		break;
+	default:
+		ret = -EINVAL;
+	}
+	return ret;
+}
+EXPORT_SYMBOL(ib_cm_notify);
+
+static void cm_recv_handler(struct ib_mad_agent *mad_agent,
+			    struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct cm_port *port = mad_agent->context;
+	struct cm_work *work;
+	enum ib_cm_event_type event;
+	u16 attr_id;
+	int paths = 0;
+
+	switch (mad_recv_wc->recv_buf.mad->mad_hdr.attr_id) {
+	case CM_REQ_ATTR_ID:
+		paths = 1 + (((struct cm_req_msg *) mad_recv_wc->recv_buf.mad)->
+						    alt_local_lid != 0);
+		event = IB_CM_REQ_RECEIVED;
+		break;
+	case CM_MRA_ATTR_ID:
+		event = IB_CM_MRA_RECEIVED;
+		break;
+	case CM_REJ_ATTR_ID:
+		event = IB_CM_REJ_RECEIVED;
+		break;
+	case CM_REP_ATTR_ID:
+		event = IB_CM_REP_RECEIVED;
+		break;
+	case CM_RTU_ATTR_ID:
+		event = IB_CM_RTU_RECEIVED;
+		break;
+	case CM_DREQ_ATTR_ID:
+		event = IB_CM_DREQ_RECEIVED;
+		break;
+	case CM_DREP_ATTR_ID:
+		event = IB_CM_DREP_RECEIVED;
+		break;
+	case CM_SIDR_REQ_ATTR_ID:
+		event = IB_CM_SIDR_REQ_RECEIVED;
+		break;
+	case CM_SIDR_REP_ATTR_ID:
+		event = IB_CM_SIDR_REP_RECEIVED;
+		break;
+	case CM_LAP_ATTR_ID:
+		paths = 1;
+		event = IB_CM_LAP_RECEIVED;
+		break;
+	case CM_APR_ATTR_ID:
+		event = IB_CM_APR_RECEIVED;
+		break;
+	default:
+		ib_free_recv_mad(mad_recv_wc);
+		return;
+	}
+
+	attr_id = be16_to_cpu(mad_recv_wc->recv_buf.mad->mad_hdr.attr_id);
+	atomic_long_inc(&port->counter_group[CM_RECV].
+			counter[attr_id - CM_ATTR_ID_OFFSET]);
+
+	work = kmalloc(sizeof *work + sizeof(struct ib_sa_path_rec) * paths,
+		       GFP_KERNEL);
+	if (!work) {
+		ib_free_recv_mad(mad_recv_wc);
+		return;
+	}
+
+	INIT_DELAYED_WORK(&work->work, cm_work_handler);
+	work->cm_event.event = event;
+	work->mad_recv_wc = mad_recv_wc;
+	work->port = port;
+	queue_delayed_work(cm.wq, &work->work, 0);
+}
+
+static int cm_init_qp_init_attr(struct cm_id_private *cm_id_priv,
+				struct ib_qp_attr *qp_attr,
+				int *qp_attr_mask)
+{
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	switch (cm_id_priv->id.state) {
+	case IB_CM_REQ_SENT:
+	case IB_CM_MRA_REQ_RCVD:
+	case IB_CM_REQ_RCVD:
+	case IB_CM_MRA_REQ_SENT:
+	case IB_CM_REP_RCVD:
+	case IB_CM_MRA_REP_SENT:
+	case IB_CM_REP_SENT:
+	case IB_CM_MRA_REP_RCVD:
+	case IB_CM_ESTABLISHED:
+		*qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS |
+				IB_QP_PKEY_INDEX | IB_QP_PORT;
+		qp_attr->qp_access_flags = IB_ACCESS_REMOTE_WRITE;
+		if (cm_id_priv->responder_resources)
+			qp_attr->qp_access_flags |= IB_ACCESS_REMOTE_READ |
+						    IB_ACCESS_REMOTE_ATOMIC;
+		qp_attr->pkey_index = cm_id_priv->av.pkey_index;
+		qp_attr->port_num = cm_id_priv->av.port->port_num;
+		ret = 0;
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	return ret;
+}
+
+static int cm_init_qp_rtr_attr(struct cm_id_private *cm_id_priv,
+			       struct ib_qp_attr *qp_attr,
+			       int *qp_attr_mask)
+{
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	switch (cm_id_priv->id.state) {
+	case IB_CM_REQ_RCVD:
+	case IB_CM_MRA_REQ_SENT:
+	case IB_CM_REP_RCVD:
+	case IB_CM_MRA_REP_SENT:
+	case IB_CM_REP_SENT:
+	case IB_CM_MRA_REP_RCVD:
+	case IB_CM_ESTABLISHED:
+		*qp_attr_mask = IB_QP_STATE | IB_QP_AV | IB_QP_PATH_MTU |
+				IB_QP_DEST_QPN | IB_QP_RQ_PSN;
+		qp_attr->ah_attr = cm_id_priv->av.ah_attr;
+		if (!cm_id_priv->av.valid) {
+			spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+			return -EINVAL;
+		}
+		if (cm_id_priv->av.ah_attr.vlan_id != 0xffff) {
+			qp_attr->vlan_id = cm_id_priv->av.ah_attr.vlan_id;
+			*qp_attr_mask |= IB_QP_VID;
+		}
+		if (!is_zero_ether_addr(cm_id_priv->av.smac)) {
+			memcpy(qp_attr->smac, cm_id_priv->av.smac,
+			       sizeof(qp_attr->smac));
+			*qp_attr_mask |= IB_QP_SMAC;
+		}
+		if (cm_id_priv->alt_av.valid) {
+			if (cm_id_priv->alt_av.ah_attr.vlan_id != 0xffff) {
+				qp_attr->alt_vlan_id =
+					cm_id_priv->alt_av.ah_attr.vlan_id;
+				*qp_attr_mask |= IB_QP_ALT_VID;
+			}
+			if (!is_zero_ether_addr(cm_id_priv->alt_av.smac)) {
+				memcpy(qp_attr->alt_smac,
+				       cm_id_priv->alt_av.smac,
+				       sizeof(qp_attr->alt_smac));
+				*qp_attr_mask |= IB_QP_ALT_SMAC;
+			}
+		}
+		qp_attr->path_mtu = cm_id_priv->path_mtu;
+		qp_attr->dest_qp_num = be32_to_cpu(cm_id_priv->remote_qpn);
+		qp_attr->rq_psn = be32_to_cpu(cm_id_priv->rq_psn);
+		if (cm_id_priv->qp_type == IB_QPT_RC ||
+		    cm_id_priv->qp_type == IB_QPT_XRC_TGT) {
+			*qp_attr_mask |= IB_QP_MAX_DEST_RD_ATOMIC |
+					 IB_QP_MIN_RNR_TIMER;
+			qp_attr->max_dest_rd_atomic =
+					cm_id_priv->responder_resources;
+			qp_attr->min_rnr_timer = 0;
+		}
+		if (cm_id_priv->alt_av.ah_attr.dlid) {
+			*qp_attr_mask |= IB_QP_ALT_PATH;
+			qp_attr->alt_port_num = cm_id_priv->alt_av.port->port_num;
+			qp_attr->alt_pkey_index = cm_id_priv->alt_av.pkey_index;
+			qp_attr->alt_timeout = cm_id_priv->alt_av.timeout;
+			qp_attr->alt_ah_attr = cm_id_priv->alt_av.ah_attr;
+		}
+		ret = 0;
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	return ret;
+}
+
+static int cm_init_qp_rts_attr(struct cm_id_private *cm_id_priv,
+			       struct ib_qp_attr *qp_attr,
+			       int *qp_attr_mask)
+{
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	switch (cm_id_priv->id.state) {
+	/* Allow transition to RTS before sending REP */
+	case IB_CM_REQ_RCVD:
+	case IB_CM_MRA_REQ_SENT:
+
+	case IB_CM_REP_RCVD:
+	case IB_CM_MRA_REP_SENT:
+	case IB_CM_REP_SENT:
+	case IB_CM_MRA_REP_RCVD:
+	case IB_CM_ESTABLISHED:
+		if (cm_id_priv->id.lap_state == IB_CM_LAP_UNINIT) {
+			*qp_attr_mask = IB_QP_STATE | IB_QP_SQ_PSN;
+			qp_attr->sq_psn = be32_to_cpu(cm_id_priv->sq_psn);
+			switch (cm_id_priv->qp_type) {
+			case IB_QPT_RC:
+			case IB_QPT_XRC_INI:
+				*qp_attr_mask |= IB_QP_RETRY_CNT | IB_QP_RNR_RETRY |
+						 IB_QP_MAX_QP_RD_ATOMIC;
+				qp_attr->retry_cnt = cm_id_priv->retry_count;
+				qp_attr->rnr_retry = cm_id_priv->rnr_retry_count;
+				qp_attr->max_rd_atomic = cm_id_priv->initiator_depth;
+				/* fall through */
+			case IB_QPT_XRC_TGT:
+				*qp_attr_mask |= IB_QP_TIMEOUT;
+				qp_attr->timeout = cm_id_priv->av.timeout;
+				break;
+			default:
+				break;
+			}
+			if (cm_id_priv->alt_av.ah_attr.dlid) {
+				*qp_attr_mask |= IB_QP_PATH_MIG_STATE;
+				qp_attr->path_mig_state = IB_MIG_REARM;
+			}
+		} else {
+			*qp_attr_mask = IB_QP_ALT_PATH | IB_QP_PATH_MIG_STATE;
+			qp_attr->alt_port_num = cm_id_priv->alt_av.port->port_num;
+			qp_attr->alt_pkey_index = cm_id_priv->alt_av.pkey_index;
+			qp_attr->alt_timeout = cm_id_priv->alt_av.timeout;
+			qp_attr->alt_ah_attr = cm_id_priv->alt_av.ah_attr;
+			qp_attr->path_mig_state = IB_MIG_REARM;
+		}
+		ret = 0;
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	return ret;
+}
+
+int ib_cm_init_qp_attr(struct ib_cm_id *cm_id,
+		       struct ib_qp_attr *qp_attr,
+		       int *qp_attr_mask)
+{
+	struct cm_id_private *cm_id_priv;
+	int ret;
+
+	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+	switch (qp_attr->qp_state) {
+	case IB_QPS_INIT:
+		ret = cm_init_qp_init_attr(cm_id_priv, qp_attr, qp_attr_mask);
+		break;
+	case IB_QPS_RTR:
+		ret = cm_init_qp_rtr_attr(cm_id_priv, qp_attr, qp_attr_mask);
+		break;
+	case IB_QPS_RTS:
+		ret = cm_init_qp_rts_attr(cm_id_priv, qp_attr, qp_attr_mask);
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+	return ret;
+}
+EXPORT_SYMBOL(ib_cm_init_qp_attr);
+
+static void cm_get_ack_delay(struct cm_device *cm_dev)
+{
+	struct ib_device_attr attr;
+
+	if (ib_query_device(cm_dev->ib_device, &attr))
+		cm_dev->ack_delay = 0; /* acks will rely on packet life time */
+	else
+		cm_dev->ack_delay = attr.local_ca_ack_delay;
+}
+
+static ssize_t cm_show_counter(struct kobject *obj, struct attribute *attr,
+			       char *buf)
+{
+	struct cm_counter_group *group;
+	struct cm_counter_attribute *cm_attr;
+
+	group = container_of(obj, struct cm_counter_group, obj);
+	cm_attr = container_of(attr, struct cm_counter_attribute, attr);
+
+	return sprintf(buf, "%ld\n",
+		       atomic_long_read(&group->counter[cm_attr->index]));
+}
+
+static const struct sysfs_ops cm_counter_ops = {
+	.show = cm_show_counter
+};
+
+static struct kobj_type cm_counter_obj_type = {
+	.sysfs_ops = &cm_counter_ops,
+	.default_attrs = cm_counter_default_attrs
+};
+
+static void cm_release_port_obj(struct kobject *obj)
+{
+	struct cm_port *cm_port;
+
+	cm_port = container_of(obj, struct cm_port, port_obj);
+	kfree(cm_port);
+}
+
+static struct kobj_type cm_port_obj_type = {
+	.release = cm_release_port_obj
+};
+
+static char *cm_devnode(struct device *dev, umode_t *mode)
+{
+	if (mode)
+		*mode = 0666;
+	return kasprintf(GFP_KERNEL, "infiniband/%s", dev_name(dev));
+}
+
+struct class cm_class = {
+	.owner   = THIS_MODULE,
+	.name    = "infiniband_cm",
+	.devnode = cm_devnode,
+};
+EXPORT_SYMBOL(cm_class);
+
+static int cm_create_port_fs(struct cm_port *port)
+{
+	int i, ret;
+
+	ret = kobject_init_and_add(&port->port_obj, &cm_port_obj_type,
+				   &port->cm_dev->device->kobj,
+				   "%d", port->port_num);
+	if (ret) {
+		kfree(port);
+		return ret;
+	}
+
+	for (i = 0; i < CM_COUNTER_GROUPS; i++) {
+		ret = kobject_init_and_add(&port->counter_group[i].obj,
+					   &cm_counter_obj_type,
+					   &port->port_obj,
+					   "%s", counter_group_names[i]);
+		if (ret)
+			goto error;
+	}
+
+	return 0;
+
+error:
+	while (i--)
+		kobject_put(&port->counter_group[i].obj);
+	kobject_put(&port->port_obj);
+	return ret;
+
+}
+
+static void cm_remove_port_fs(struct cm_port *port)
+{
+	int i;
+
+	for (i = 0; i < CM_COUNTER_GROUPS; i++)
+		kobject_put(&port->counter_group[i].obj);
+
+	kobject_put(&port->port_obj);
+}
+
+static void cm_add_one(struct ib_device *ib_device)
+{
+	struct cm_device *cm_dev;
+	struct cm_port *port;
+	struct ib_mad_reg_req reg_req = {
+		.mgmt_class = IB_MGMT_CLASS_CM,
+		.mgmt_class_version = IB_CM_CLASS_VERSION,
+	};
+	struct ib_port_modify port_modify = {
+		.set_port_cap_mask = IB_PORT_CM_SUP
+	};
+	unsigned long flags;
+	int ret;
+	u8 i;
+
+	if (rdma_node_get_transport(ib_device->node_type) != RDMA_TRANSPORT_IB)
+		return;
+
+	cm_dev = kzalloc(sizeof(*cm_dev) + sizeof(*port) *
+			 ib_device->phys_port_cnt, GFP_KERNEL);
+	if (!cm_dev)
+		return;
+
+	cm_dev->ib_device = ib_device;
+	cm_get_ack_delay(cm_dev);
+
+	cm_dev->device = device_create(&cm_class, &ib_device->dev,
+				       MKDEV(0, 0), NULL,
+				       "%s", ib_device->name);
+	if (IS_ERR(cm_dev->device)) {
+		kfree(cm_dev);
+		return;
+	}
+
+	set_bit(IB_MGMT_METHOD_SEND, reg_req.method_mask);
+	for (i = 1; i <= ib_device->phys_port_cnt; i++) {
+		port = kzalloc(sizeof *port, GFP_KERNEL);
+		if (!port)
+			goto error1;
+
+		cm_dev->port[i-1] = port;
+		port->cm_dev = cm_dev;
+		port->port_num = i;
+
+		ret = cm_create_port_fs(port);
+		if (ret)
+			goto error1;
+
+		port->mad_agent = ib_register_mad_agent(ib_device, i,
+							IB_QPT_GSI,
+							&reg_req,
+							0,
+							cm_send_handler,
+							cm_recv_handler,
+							port,
+							0);
+		if (IS_ERR(port->mad_agent))
+			goto error2;
+
+		ret = ib_modify_port(ib_device, i, 0, &port_modify);
+		if (ret)
+			goto error3;
+	}
+	ib_set_client_data(ib_device, &cm_client, cm_dev);
+
+	write_lock_irqsave(&cm.device_lock, flags);
+	list_add_tail(&cm_dev->list, &cm.device_list);
+	write_unlock_irqrestore(&cm.device_lock, flags);
+	return;
+
+error3:
+	ib_unregister_mad_agent(port->mad_agent);
+error2:
+	cm_remove_port_fs(port);
+error1:
+	port_modify.set_port_cap_mask = 0;
+	port_modify.clr_port_cap_mask = IB_PORT_CM_SUP;
+	while (--i) {
+		port = cm_dev->port[i-1];
+		ib_modify_port(ib_device, port->port_num, 0, &port_modify);
+		ib_unregister_mad_agent(port->mad_agent);
+		cm_remove_port_fs(port);
+	}
+	device_unregister(cm_dev->device);
+	kfree(cm_dev);
+}
+
+static void cm_remove_one(struct ib_device *ib_device)
+{
+	struct cm_device *cm_dev;
+	struct cm_port *port;
+	struct ib_port_modify port_modify = {
+		.clr_port_cap_mask = IB_PORT_CM_SUP
+	};
+	unsigned long flags;
+	int i;
+
+	cm_dev = ib_get_client_data(ib_device, &cm_client);
+	if (!cm_dev)
+		return;
+
+	write_lock_irqsave(&cm.device_lock, flags);
+	list_del(&cm_dev->list);
+	write_unlock_irqrestore(&cm.device_lock, flags);
+
+	for (i = 1; i <= ib_device->phys_port_cnt; i++) {
+		port = cm_dev->port[i-1];
+		ib_modify_port(ib_device, port->port_num, 0, &port_modify);
+		ib_unregister_mad_agent(port->mad_agent);
+		flush_workqueue(cm.wq);
+		cm_remove_port_fs(port);
+	}
+	device_unregister(cm_dev->device);
+	kfree(cm_dev);
+}
+
+static int __init ib_cm_init(void)
+{
+	int ret;
+
+	memset(&cm, 0, sizeof cm);
+	INIT_LIST_HEAD(&cm.device_list);
+	rwlock_init(&cm.device_lock);
+	spin_lock_init(&cm.lock);
+	cm.listen_service_table = RB_ROOT;
+	cm.listen_service_id = be64_to_cpu(IB_CM_ASSIGN_SERVICE_ID);
+	cm.remote_id_table = RB_ROOT;
+	cm.remote_qp_table = RB_ROOT;
+	cm.remote_sidr_table = RB_ROOT;
+	idr_init(&cm.local_id_table);
+	get_random_bytes(&cm.random_id_operand, sizeof cm.random_id_operand);
+	INIT_LIST_HEAD(&cm.timewait_list);
+
+	ret = class_register(&cm_class);
+	if (ret) {
+		ret = -ENOMEM;
+		goto error1;
+	}
+
+	cm.wq = create_workqueue("ib_cm");
+	if (!cm.wq) {
+		ret = -ENOMEM;
+		goto error2;
+	}
+
+	ret = ib_register_client(&cm_client);
+	if (ret)
+		goto error3;
+
+	return 0;
+error3:
+	destroy_workqueue(cm.wq);
+error2:
+	class_unregister(&cm_class);
+error1:
+	idr_destroy(&cm.local_id_table);
+	return ret;
+}
+
+static void __exit ib_cm_cleanup(void)
+{
+	struct cm_timewait_info *timewait_info, *tmp;
+
+	spin_lock_irq(&cm.lock);
+	list_for_each_entry(timewait_info, &cm.timewait_list, list)
+		cancel_delayed_work(&timewait_info->work.work);
+	spin_unlock_irq(&cm.lock);
+
+	ib_unregister_client(&cm_client);
+	destroy_workqueue(cm.wq);
+
+	list_for_each_entry_safe(timewait_info, tmp, &cm.timewait_list, list) {
+		list_del(&timewait_info->list);
+		kfree(timewait_info);
+	}
+
+	class_unregister(&cm_class);
+	idr_destroy(&cm.local_id_table);
+}
+
+module_init(ib_cm_init);
+module_exit(ib_cm_cleanup);
+
diff --git a/drivers/infiniband/core/cm_msgs.h b/drivers/infiniband/core/cm_msgs.h
new file mode 100644
index 0000000..be068f4
--- /dev/null
+++ b/drivers/infiniband/core/cm_msgs.h
@@ -0,0 +1,836 @@
+/*
+ * Copyright (c) 2004, 2011 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation.  All rights reserved.
+ * Copyright (c) 2004 Voltaire Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING the madirectory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use source and binary forms, with or
+ *     withmodification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retathe above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHWARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS THE
+ * SOFTWARE.
+ */
+#if !defined(CM_MSGS_H)
+#define CM_MSGS_H
+
+#include <rdma/ib_mad.h>
+#include <rdma/ib_cm.h>
+
+/*
+ * Parameters to routines below should be in network-byte order, and values
+ * are returned in network-byte order.
+ */
+
+#define IB_CM_CLASS_VERSION	2 /* IB specification 1.2 */
+
+enum cm_msg_sequence {
+	CM_MSG_SEQUENCE_REQ,
+	CM_MSG_SEQUENCE_LAP,
+	CM_MSG_SEQUENCE_DREQ,
+	CM_MSG_SEQUENCE_SIDR
+};
+
+struct cm_req_msg {
+	struct ib_mad_hdr hdr;
+
+	__be32 local_comm_id;
+	__be32 rsvd4;
+	__be64 service_id;
+	__be64 local_ca_guid;
+	__be32 rsvd24;
+	__be32 local_qkey;
+	/* local QPN:24, responder resources:8 */
+	__be32 offset32;
+	/* local EECN:24, initiator depth:8 */
+	__be32 offset36;
+	/*
+	 * remote EECN:24, remote CM response timeout:5,
+	 * transport service type:2, end-to-end flow control:1
+	 */
+	__be32 offset40;
+	/* starting PSN:24, local CM response timeout:5, retry count:3 */
+	__be32 offset44;
+	__be16 pkey;
+	/* path MTU:4, RDC exists:1, RNR retry count:3. */
+	u8 offset50;
+	/* max CM Retries:4, SRQ:1, extended transport type:3 */
+	u8 offset51;
+
+	__be16 primary_local_lid;
+	__be16 primary_remote_lid;
+	union ib_gid primary_local_gid;
+	union ib_gid primary_remote_gid;
+	/* flow label:20, rsvd:6, packet rate:6 */
+	__be32 primary_offset88;
+	u8 primary_traffic_class;
+	u8 primary_hop_limit;
+	/* SL:4, subnet local:1, rsvd:3 */
+	u8 primary_offset94;
+	/* local ACK timeout:5, rsvd:3 */
+	u8 primary_offset95;
+
+	__be16 alt_local_lid;
+	__be16 alt_remote_lid;
+	union ib_gid alt_local_gid;
+	union ib_gid alt_remote_gid;
+	/* flow label:20, rsvd:6, packet rate:6 */
+	__be32 alt_offset132;
+	u8 alt_traffic_class;
+	u8 alt_hop_limit;
+	/* SL:4, subnet local:1, rsvd:3 */
+	u8 alt_offset138;
+	/* local ACK timeout:5, rsvd:3 */
+	u8 alt_offset139;
+
+	u8 private_data[IB_CM_REQ_PRIVATE_DATA_SIZE];
+
+} __attribute__ ((packed));
+
+static inline __be32 cm_req_get_local_qpn(struct cm_req_msg *req_msg)
+{
+	return cpu_to_be32(be32_to_cpu(req_msg->offset32) >> 8);
+}
+
+static inline void cm_req_set_local_qpn(struct cm_req_msg *req_msg, __be32 qpn)
+{
+	req_msg->offset32 = cpu_to_be32((be32_to_cpu(qpn) << 8) |
+					 (be32_to_cpu(req_msg->offset32) &
+					  0x000000FF));
+}
+
+static inline u8 cm_req_get_resp_res(struct cm_req_msg *req_msg)
+{
+	return (u8) be32_to_cpu(req_msg->offset32);
+}
+
+static inline void cm_req_set_resp_res(struct cm_req_msg *req_msg, u8 resp_res)
+{
+	req_msg->offset32 = cpu_to_be32(resp_res |
+					(be32_to_cpu(req_msg->offset32) &
+					 0xFFFFFF00));
+}
+
+static inline u8 cm_req_get_init_depth(struct cm_req_msg *req_msg)
+{
+	return (u8) be32_to_cpu(req_msg->offset36);
+}
+
+static inline void cm_req_set_init_depth(struct cm_req_msg *req_msg,
+					 u8 init_depth)
+{
+	req_msg->offset36 = cpu_to_be32(init_depth |
+					(be32_to_cpu(req_msg->offset36) &
+					 0xFFFFFF00));
+}
+
+static inline u8 cm_req_get_remote_resp_timeout(struct cm_req_msg *req_msg)
+{
+	return (u8) ((be32_to_cpu(req_msg->offset40) & 0xF8) >> 3);
+}
+
+static inline void cm_req_set_remote_resp_timeout(struct cm_req_msg *req_msg,
+						  u8 resp_timeout)
+{
+	req_msg->offset40 = cpu_to_be32((resp_timeout << 3) |
+					 (be32_to_cpu(req_msg->offset40) &
+					  0xFFFFFF07));
+}
+
+static inline enum ib_qp_type cm_req_get_qp_type(struct cm_req_msg *req_msg)
+{
+	u8 transport_type = (u8) (be32_to_cpu(req_msg->offset40) & 0x06) >> 1;
+	switch(transport_type) {
+	case 0: return IB_QPT_RC;
+	case 1: return IB_QPT_UC;
+	case 3:
+		switch (req_msg->offset51 & 0x7) {
+		case 1: return IB_QPT_XRC_TGT;
+		default: return 0;
+		}
+	default: return 0;
+	}
+}
+
+static inline void cm_req_set_qp_type(struct cm_req_msg *req_msg,
+				      enum ib_qp_type qp_type)
+{
+	switch(qp_type) {
+	case IB_QPT_UC:
+		req_msg->offset40 = cpu_to_be32((be32_to_cpu(
+						  req_msg->offset40) &
+						   0xFFFFFFF9) | 0x2);
+		break;
+	case IB_QPT_XRC_INI:
+		req_msg->offset40 = cpu_to_be32((be32_to_cpu(
+						 req_msg->offset40) &
+						   0xFFFFFFF9) | 0x6);
+		req_msg->offset51 = (req_msg->offset51 & 0xF8) | 1;
+		break;
+	default:
+		req_msg->offset40 = cpu_to_be32(be32_to_cpu(
+						 req_msg->offset40) &
+						  0xFFFFFFF9);
+	}
+}
+
+static inline u8 cm_req_get_flow_ctrl(struct cm_req_msg *req_msg)
+{
+	return be32_to_cpu(req_msg->offset40) & 0x1;
+}
+
+static inline void cm_req_set_flow_ctrl(struct cm_req_msg *req_msg,
+					u8 flow_ctrl)
+{
+	req_msg->offset40 = cpu_to_be32((flow_ctrl & 0x1) |
+					 (be32_to_cpu(req_msg->offset40) &
+					  0xFFFFFFFE));
+}
+
+static inline __be32 cm_req_get_starting_psn(struct cm_req_msg *req_msg)
+{
+	return cpu_to_be32(be32_to_cpu(req_msg->offset44) >> 8);
+}
+
+static inline void cm_req_set_starting_psn(struct cm_req_msg *req_msg,
+					   __be32 starting_psn)
+{
+	req_msg->offset44 = cpu_to_be32((be32_to_cpu(starting_psn) << 8) |
+			    (be32_to_cpu(req_msg->offset44) & 0x000000FF));
+}
+
+static inline u8 cm_req_get_local_resp_timeout(struct cm_req_msg *req_msg)
+{
+	return (u8) ((be32_to_cpu(req_msg->offset44) & 0xF8) >> 3);
+}
+
+static inline void cm_req_set_local_resp_timeout(struct cm_req_msg *req_msg,
+						 u8 resp_timeout)
+{
+	req_msg->offset44 = cpu_to_be32((resp_timeout << 3) |
+			    (be32_to_cpu(req_msg->offset44) & 0xFFFFFF07));
+}
+
+static inline u8 cm_req_get_retry_count(struct cm_req_msg *req_msg)
+{
+	return (u8) (be32_to_cpu(req_msg->offset44) & 0x7);
+}
+
+static inline void cm_req_set_retry_count(struct cm_req_msg *req_msg,
+					  u8 retry_count)
+{
+	req_msg->offset44 = cpu_to_be32((retry_count & 0x7) |
+			    (be32_to_cpu(req_msg->offset44) & 0xFFFFFFF8));
+}
+
+static inline u8 cm_req_get_path_mtu(struct cm_req_msg *req_msg)
+{
+	return req_msg->offset50 >> 4;
+}
+
+static inline void cm_req_set_path_mtu(struct cm_req_msg *req_msg, u8 path_mtu)
+{
+	req_msg->offset50 = (u8) ((req_msg->offset50 & 0xF) | (path_mtu << 4));
+}
+
+static inline u8 cm_req_get_rnr_retry_count(struct cm_req_msg *req_msg)
+{
+	return req_msg->offset50 & 0x7;
+}
+
+static inline void cm_req_set_rnr_retry_count(struct cm_req_msg *req_msg,
+					      u8 rnr_retry_count)
+{
+	req_msg->offset50 = (u8) ((req_msg->offset50 & 0xF8) |
+				  (rnr_retry_count & 0x7));
+}
+
+static inline u8 cm_req_get_max_cm_retries(struct cm_req_msg *req_msg)
+{
+	return req_msg->offset51 >> 4;
+}
+
+static inline void cm_req_set_max_cm_retries(struct cm_req_msg *req_msg,
+					     u8 retries)
+{
+	req_msg->offset51 = (u8) ((req_msg->offset51 & 0xF) | (retries << 4));
+}
+
+static inline u8 cm_req_get_srq(struct cm_req_msg *req_msg)
+{
+	return (req_msg->offset51 & 0x8) >> 3;
+}
+
+static inline void cm_req_set_srq(struct cm_req_msg *req_msg, u8 srq)
+{
+	req_msg->offset51 = (u8) ((req_msg->offset51 & 0xF7) |
+				  ((srq & 0x1) << 3));
+}
+
+static inline __be32 cm_req_get_primary_flow_label(struct cm_req_msg *req_msg)
+{
+	return cpu_to_be32(be32_to_cpu(req_msg->primary_offset88) >> 12);
+}
+
+static inline void cm_req_set_primary_flow_label(struct cm_req_msg *req_msg,
+						 __be32 flow_label)
+{
+	req_msg->primary_offset88 = cpu_to_be32(
+				    (be32_to_cpu(req_msg->primary_offset88) &
+				     0x00000FFF) |
+				     (be32_to_cpu(flow_label) << 12));
+}
+
+static inline u8 cm_req_get_primary_packet_rate(struct cm_req_msg *req_msg)
+{
+	return (u8) (be32_to_cpu(req_msg->primary_offset88) & 0x3F);
+}
+
+static inline void cm_req_set_primary_packet_rate(struct cm_req_msg *req_msg,
+						  u8 rate)
+{
+	req_msg->primary_offset88 = cpu_to_be32(
+				    (be32_to_cpu(req_msg->primary_offset88) &
+				     0xFFFFFFC0) | (rate & 0x3F));
+}
+
+static inline u8 cm_req_get_primary_sl(struct cm_req_msg *req_msg)
+{
+	return (u8) (req_msg->primary_offset94 >> 4);
+}
+
+static inline void cm_req_set_primary_sl(struct cm_req_msg *req_msg, u8 sl)
+{
+	req_msg->primary_offset94 = (u8) ((req_msg->primary_offset94 & 0x0F) |
+					  (sl << 4));
+}
+
+static inline u8 cm_req_get_primary_subnet_local(struct cm_req_msg *req_msg)
+{
+	return (u8) ((req_msg->primary_offset94 & 0x08) >> 3);
+}
+
+static inline void cm_req_set_primary_subnet_local(struct cm_req_msg *req_msg,
+						   u8 subnet_local)
+{
+	req_msg->primary_offset94 = (u8) ((req_msg->primary_offset94 & 0xF7) |
+					  ((subnet_local & 0x1) << 3));
+}
+
+static inline u8 cm_req_get_primary_local_ack_timeout(struct cm_req_msg *req_msg)
+{
+	return (u8) (req_msg->primary_offset95 >> 3);
+}
+
+static inline void cm_req_set_primary_local_ack_timeout(struct cm_req_msg *req_msg,
+							u8 local_ack_timeout)
+{
+	req_msg->primary_offset95 = (u8) ((req_msg->primary_offset95 & 0x07) |
+					  (local_ack_timeout << 3));
+}
+
+static inline __be32 cm_req_get_alt_flow_label(struct cm_req_msg *req_msg)
+{
+	return cpu_to_be32(be32_to_cpu(req_msg->alt_offset132) >> 12);
+}
+
+static inline void cm_req_set_alt_flow_label(struct cm_req_msg *req_msg,
+					     __be32 flow_label)
+{
+	req_msg->alt_offset132 = cpu_to_be32(
+				 (be32_to_cpu(req_msg->alt_offset132) &
+				  0x00000FFF) |
+				  (be32_to_cpu(flow_label) << 12));
+}
+
+static inline u8 cm_req_get_alt_packet_rate(struct cm_req_msg *req_msg)
+{
+	return (u8) (be32_to_cpu(req_msg->alt_offset132) & 0x3F);
+}
+
+static inline void cm_req_set_alt_packet_rate(struct cm_req_msg *req_msg,
+					      u8 rate)
+{
+	req_msg->alt_offset132 = cpu_to_be32(
+				 (be32_to_cpu(req_msg->alt_offset132) &
+				  0xFFFFFFC0) | (rate & 0x3F));
+}
+
+static inline u8 cm_req_get_alt_sl(struct cm_req_msg *req_msg)
+{
+	return (u8) (req_msg->alt_offset138 >> 4);
+}
+
+static inline void cm_req_set_alt_sl(struct cm_req_msg *req_msg, u8 sl)
+{
+	req_msg->alt_offset138 = (u8) ((req_msg->alt_offset138 & 0x0F) |
+				       (sl << 4));
+}
+
+static inline u8 cm_req_get_alt_subnet_local(struct cm_req_msg *req_msg)
+{
+	return (u8) ((req_msg->alt_offset138 & 0x08) >> 3);
+}
+
+static inline void cm_req_set_alt_subnet_local(struct cm_req_msg *req_msg,
+					       u8 subnet_local)
+{
+	req_msg->alt_offset138 = (u8) ((req_msg->alt_offset138 & 0xF7) |
+				       ((subnet_local & 0x1) << 3));
+}
+
+static inline u8 cm_req_get_alt_local_ack_timeout(struct cm_req_msg *req_msg)
+{
+	return (u8) (req_msg->alt_offset139 >> 3);
+}
+
+static inline void cm_req_set_alt_local_ack_timeout(struct cm_req_msg *req_msg,
+						    u8 local_ack_timeout)
+{
+	req_msg->alt_offset139 = (u8) ((req_msg->alt_offset139 & 0x07) |
+				       (local_ack_timeout << 3));
+}
+
+/* Message REJected or MRAed */
+enum cm_msg_response {
+	CM_MSG_RESPONSE_REQ = 0x0,
+	CM_MSG_RESPONSE_REP = 0x1,
+	CM_MSG_RESPONSE_OTHER = 0x2
+};
+
+ struct cm_mra_msg {
+	struct ib_mad_hdr hdr;
+
+	__be32 local_comm_id;
+	__be32 remote_comm_id;
+	/* message MRAed:2, rsvd:6 */
+	u8 offset8;
+	/* service timeout:5, rsvd:3 */
+	u8 offset9;
+
+	u8 private_data[IB_CM_MRA_PRIVATE_DATA_SIZE];
+
+} __attribute__ ((packed));
+
+static inline u8 cm_mra_get_msg_mraed(struct cm_mra_msg *mra_msg)
+{
+	return (u8) (mra_msg->offset8 >> 6);
+}
+
+static inline void cm_mra_set_msg_mraed(struct cm_mra_msg *mra_msg, u8 msg)
+{
+	mra_msg->offset8 = (u8) ((mra_msg->offset8 & 0x3F) | (msg << 6));
+}
+
+static inline u8 cm_mra_get_service_timeout(struct cm_mra_msg *mra_msg)
+{
+	return (u8) (mra_msg->offset9 >> 3);
+}
+
+static inline void cm_mra_set_service_timeout(struct cm_mra_msg *mra_msg,
+					      u8 service_timeout)
+{
+	mra_msg->offset9 = (u8) ((mra_msg->offset9 & 0x07) |
+				 (service_timeout << 3));
+}
+
+struct cm_rej_msg {
+	struct ib_mad_hdr hdr;
+
+	__be32 local_comm_id;
+	__be32 remote_comm_id;
+	/* message REJected:2, rsvd:6 */
+	u8 offset8;
+	/* reject info length:7, rsvd:1. */
+	u8 offset9;
+	__be16 reason;
+	u8 ari[IB_CM_REJ_ARI_LENGTH];
+
+	u8 private_data[IB_CM_REJ_PRIVATE_DATA_SIZE];
+
+} __attribute__ ((packed));
+
+static inline u8 cm_rej_get_msg_rejected(struct cm_rej_msg *rej_msg)
+{
+	return (u8) (rej_msg->offset8 >> 6);
+}
+
+static inline void cm_rej_set_msg_rejected(struct cm_rej_msg *rej_msg, u8 msg)
+{
+	rej_msg->offset8 = (u8) ((rej_msg->offset8 & 0x3F) | (msg << 6));
+}
+
+static inline u8 cm_rej_get_reject_info_len(struct cm_rej_msg *rej_msg)
+{
+	return (u8) (rej_msg->offset9 >> 1);
+}
+
+static inline void cm_rej_set_reject_info_len(struct cm_rej_msg *rej_msg,
+					      u8 len)
+{
+	rej_msg->offset9 = (u8) ((rej_msg->offset9 & 0x1) | (len << 1));
+}
+
+struct cm_rep_msg {
+	struct ib_mad_hdr hdr;
+
+	__be32 local_comm_id;
+	__be32 remote_comm_id;
+	__be32 local_qkey;
+	/* local QPN:24, rsvd:8 */
+	__be32 offset12;
+	/* local EECN:24, rsvd:8 */
+	__be32 offset16;
+	/* starting PSN:24 rsvd:8 */
+	__be32 offset20;
+	u8 resp_resources;
+	u8 initiator_depth;
+	/* target ACK delay:5, failover accepted:2, end-to-end flow control:1 */
+	u8 offset26;
+	/* RNR retry count:3, SRQ:1, rsvd:5 */
+	u8 offset27;
+	__be64 local_ca_guid;
+
+	u8 private_data[IB_CM_REP_PRIVATE_DATA_SIZE];
+
+} __attribute__ ((packed));
+
+static inline __be32 cm_rep_get_local_qpn(struct cm_rep_msg *rep_msg)
+{
+	return cpu_to_be32(be32_to_cpu(rep_msg->offset12) >> 8);
+}
+
+static inline void cm_rep_set_local_qpn(struct cm_rep_msg *rep_msg, __be32 qpn)
+{
+	rep_msg->offset12 = cpu_to_be32((be32_to_cpu(qpn) << 8) |
+			    (be32_to_cpu(rep_msg->offset12) & 0x000000FF));
+}
+
+static inline __be32 cm_rep_get_local_eecn(struct cm_rep_msg *rep_msg)
+{
+	return cpu_to_be32(be32_to_cpu(rep_msg->offset16) >> 8);
+}
+
+static inline void cm_rep_set_local_eecn(struct cm_rep_msg *rep_msg, __be32 eecn)
+{
+	rep_msg->offset16 = cpu_to_be32((be32_to_cpu(eecn) << 8) |
+			    (be32_to_cpu(rep_msg->offset16) & 0x000000FF));
+}
+
+static inline __be32 cm_rep_get_qpn(struct cm_rep_msg *rep_msg, enum ib_qp_type qp_type)
+{
+	return (qp_type == IB_QPT_XRC_INI) ?
+		cm_rep_get_local_eecn(rep_msg) : cm_rep_get_local_qpn(rep_msg);
+}
+
+static inline __be32 cm_rep_get_starting_psn(struct cm_rep_msg *rep_msg)
+{
+	return cpu_to_be32(be32_to_cpu(rep_msg->offset20) >> 8);
+}
+
+static inline void cm_rep_set_starting_psn(struct cm_rep_msg *rep_msg,
+					   __be32 starting_psn)
+{
+	rep_msg->offset20 = cpu_to_be32((be32_to_cpu(starting_psn) << 8) |
+			    (be32_to_cpu(rep_msg->offset20) & 0x000000FF));
+}
+
+static inline u8 cm_rep_get_target_ack_delay(struct cm_rep_msg *rep_msg)
+{
+	return (u8) (rep_msg->offset26 >> 3);
+}
+
+static inline void cm_rep_set_target_ack_delay(struct cm_rep_msg *rep_msg,
+					       u8 target_ack_delay)
+{
+	rep_msg->offset26 = (u8) ((rep_msg->offset26 & 0x07) |
+				  (target_ack_delay << 3));
+}
+
+static inline u8 cm_rep_get_failover(struct cm_rep_msg *rep_msg)
+{
+	return (u8) ((rep_msg->offset26 & 0x06) >> 1);
+}
+
+static inline void cm_rep_set_failover(struct cm_rep_msg *rep_msg, u8 failover)
+{
+	rep_msg->offset26 = (u8) ((rep_msg->offset26 & 0xF9) |
+				  ((failover & 0x3) << 1));
+}
+
+static inline u8 cm_rep_get_flow_ctrl(struct cm_rep_msg *rep_msg)
+{
+	return (u8) (rep_msg->offset26 & 0x01);
+}
+
+static inline void cm_rep_set_flow_ctrl(struct cm_rep_msg *rep_msg,
+					    u8 flow_ctrl)
+{
+	rep_msg->offset26 = (u8) ((rep_msg->offset26 & 0xFE) |
+				  (flow_ctrl & 0x1));
+}
+
+static inline u8 cm_rep_get_rnr_retry_count(struct cm_rep_msg *rep_msg)
+{
+	return (u8) (rep_msg->offset27 >> 5);
+}
+
+static inline void cm_rep_set_rnr_retry_count(struct cm_rep_msg *rep_msg,
+					      u8 rnr_retry_count)
+{
+	rep_msg->offset27 = (u8) ((rep_msg->offset27 & 0x1F) |
+				  (rnr_retry_count << 5));
+}
+
+static inline u8 cm_rep_get_srq(struct cm_rep_msg *rep_msg)
+{
+	return (u8) ((rep_msg->offset27 >> 4) & 0x1);
+}
+
+static inline void cm_rep_set_srq(struct cm_rep_msg *rep_msg, u8 srq)
+{
+	rep_msg->offset27 = (u8) ((rep_msg->offset27 & 0xEF) |
+				  ((srq & 0x1) << 4));
+}
+
+struct cm_rtu_msg {
+	struct ib_mad_hdr hdr;
+
+	__be32 local_comm_id;
+	__be32 remote_comm_id;
+
+	u8 private_data[IB_CM_RTU_PRIVATE_DATA_SIZE];
+
+} __attribute__ ((packed));
+
+struct cm_dreq_msg {
+	struct ib_mad_hdr hdr;
+
+	__be32 local_comm_id;
+	__be32 remote_comm_id;
+	/* remote QPN/EECN:24, rsvd:8 */
+	__be32 offset8;
+
+	u8 private_data[IB_CM_DREQ_PRIVATE_DATA_SIZE];
+
+} __attribute__ ((packed));
+
+static inline __be32 cm_dreq_get_remote_qpn(struct cm_dreq_msg *dreq_msg)
+{
+	return cpu_to_be32(be32_to_cpu(dreq_msg->offset8) >> 8);
+}
+
+static inline void cm_dreq_set_remote_qpn(struct cm_dreq_msg *dreq_msg, __be32 qpn)
+{
+	dreq_msg->offset8 = cpu_to_be32((be32_to_cpu(qpn) << 8) |
+			    (be32_to_cpu(dreq_msg->offset8) & 0x000000FF));
+}
+
+struct cm_drep_msg {
+	struct ib_mad_hdr hdr;
+
+	__be32 local_comm_id;
+	__be32 remote_comm_id;
+
+	u8 private_data[IB_CM_DREP_PRIVATE_DATA_SIZE];
+
+} __attribute__ ((packed));
+
+struct cm_lap_msg {
+	struct ib_mad_hdr hdr;
+
+	__be32 local_comm_id;
+	__be32 remote_comm_id;
+
+	__be32 rsvd8;
+	/* remote QPN/EECN:24, remote CM response timeout:5, rsvd:3 */
+	__be32 offset12;
+	__be32 rsvd16;
+
+	__be16 alt_local_lid;
+	__be16 alt_remote_lid;
+	union ib_gid alt_local_gid;
+	union ib_gid alt_remote_gid;
+	/* flow label:20, rsvd:4, traffic class:8 */
+	__be32 offset56;
+	u8 alt_hop_limit;
+	/* rsvd:2, packet rate:6 */
+	u8 offset61;
+	/* SL:4, subnet local:1, rsvd:3 */
+	u8 offset62;
+	/* local ACK timeout:5, rsvd:3 */
+	u8 offset63;
+
+	u8 private_data[IB_CM_LAP_PRIVATE_DATA_SIZE];
+} __attribute__  ((packed));
+
+static inline __be32 cm_lap_get_remote_qpn(struct cm_lap_msg *lap_msg)
+{
+	return cpu_to_be32(be32_to_cpu(lap_msg->offset12) >> 8);
+}
+
+static inline void cm_lap_set_remote_qpn(struct cm_lap_msg *lap_msg, __be32 qpn)
+{
+	lap_msg->offset12 = cpu_to_be32((be32_to_cpu(qpn) << 8) |
+					 (be32_to_cpu(lap_msg->offset12) &
+					  0x000000FF));
+}
+
+static inline u8 cm_lap_get_remote_resp_timeout(struct cm_lap_msg *lap_msg)
+{
+	return (u8) ((be32_to_cpu(lap_msg->offset12) & 0xF8) >> 3);
+}
+
+static inline void cm_lap_set_remote_resp_timeout(struct cm_lap_msg *lap_msg,
+						  u8 resp_timeout)
+{
+	lap_msg->offset12 = cpu_to_be32((resp_timeout << 3) |
+					 (be32_to_cpu(lap_msg->offset12) &
+					  0xFFFFFF07));
+}
+
+static inline __be32 cm_lap_get_flow_label(struct cm_lap_msg *lap_msg)
+{
+	return cpu_to_be32(be32_to_cpu(lap_msg->offset56) >> 12);
+}
+
+static inline void cm_lap_set_flow_label(struct cm_lap_msg *lap_msg,
+					 __be32 flow_label)
+{
+	lap_msg->offset56 = cpu_to_be32(
+				 (be32_to_cpu(lap_msg->offset56) & 0x00000FFF) |
+				 (be32_to_cpu(flow_label) << 12));
+}
+
+static inline u8 cm_lap_get_traffic_class(struct cm_lap_msg *lap_msg)
+{
+	return (u8) be32_to_cpu(lap_msg->offset56);
+}
+
+static inline void cm_lap_set_traffic_class(struct cm_lap_msg *lap_msg,
+					    u8 traffic_class)
+{
+	lap_msg->offset56 = cpu_to_be32(traffic_class |
+					 (be32_to_cpu(lap_msg->offset56) &
+					  0xFFFFFF00));
+}
+
+static inline u8 cm_lap_get_packet_rate(struct cm_lap_msg *lap_msg)
+{
+	return lap_msg->offset61 & 0x3F;
+}
+
+static inline void cm_lap_set_packet_rate(struct cm_lap_msg *lap_msg,
+					  u8 packet_rate)
+{
+	lap_msg->offset61 = (packet_rate & 0x3F) | (lap_msg->offset61 & 0xC0);
+}
+
+static inline u8 cm_lap_get_sl(struct cm_lap_msg *lap_msg)
+{
+	return lap_msg->offset62 >> 4;
+}
+
+static inline void cm_lap_set_sl(struct cm_lap_msg *lap_msg, u8 sl)
+{
+	lap_msg->offset62 = (sl << 4) | (lap_msg->offset62 & 0x0F);
+}
+
+static inline u8 cm_lap_get_subnet_local(struct cm_lap_msg *lap_msg)
+{
+	return (lap_msg->offset62 >> 3) & 0x1;
+}
+
+static inline void cm_lap_set_subnet_local(struct cm_lap_msg *lap_msg,
+					   u8 subnet_local)
+{
+	lap_msg->offset62 = ((subnet_local & 0x1) << 3) |
+			     (lap_msg->offset61 & 0xF7);
+}
+static inline u8 cm_lap_get_local_ack_timeout(struct cm_lap_msg *lap_msg)
+{
+	return lap_msg->offset63 >> 3;
+}
+
+static inline void cm_lap_set_local_ack_timeout(struct cm_lap_msg *lap_msg,
+						u8 local_ack_timeout)
+{
+	lap_msg->offset63 = (local_ack_timeout << 3) |
+			    (lap_msg->offset63 & 0x07);
+}
+
+struct cm_apr_msg {
+	struct ib_mad_hdr hdr;
+
+	__be32 local_comm_id;
+	__be32 remote_comm_id;
+
+	u8 info_length;
+	u8 ap_status;
+	__be16 rsvd;
+	u8 info[IB_CM_APR_INFO_LENGTH];
+
+	u8 private_data[IB_CM_APR_PRIVATE_DATA_SIZE];
+} __attribute__ ((packed));
+
+struct cm_sidr_req_msg {
+	struct ib_mad_hdr hdr;
+
+	__be32 request_id;
+	__be16 pkey;
+	__be16 rsvd;
+	__be64 service_id;
+
+	u8 private_data[IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE];
+} __attribute__ ((packed));
+
+struct cm_sidr_rep_msg {
+	struct ib_mad_hdr hdr;
+
+	__be32 request_id;
+	u8 status;
+	u8 info_length;
+	__be16 rsvd;
+	/* QPN:24, rsvd:8 */
+	__be32 offset8;
+	__be64 service_id;
+	__be32 qkey;
+	u8 info[IB_CM_SIDR_REP_INFO_LENGTH];
+
+	u8 private_data[IB_CM_SIDR_REP_PRIVATE_DATA_SIZE];
+} __attribute__ ((packed));
+
+static inline __be32 cm_sidr_rep_get_qpn(struct cm_sidr_rep_msg *sidr_rep_msg)
+{
+	return cpu_to_be32(be32_to_cpu(sidr_rep_msg->offset8) >> 8);
+}
+
+static inline void cm_sidr_rep_set_qpn(struct cm_sidr_rep_msg *sidr_rep_msg,
+				       __be32 qpn)
+{
+	sidr_rep_msg->offset8 = cpu_to_be32((be32_to_cpu(qpn) << 8) |
+					(be32_to_cpu(sidr_rep_msg->offset8) &
+					 0x000000FF));
+}
+
+#endif /* CM_MSGS_H */
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
new file mode 100644
index 0000000..d570030
--- /dev/null
+++ b/drivers/infiniband/core/cma.c
@@ -0,0 +1,3703 @@
+/*
+ * Copyright (c) 2005 Voltaire Inc.  All rights reserved.
+ * Copyright (c) 2002-2005, Network Appliance, Inc. All rights reserved.
+ * Copyright (c) 1999-2005, Mellanox Technologies, Inc. All rights reserved.
+ * Copyright (c) 2005-2006 Intel Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/completion.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/mutex.h>
+#include <linux/random.h>
+#include <linux/idr.h>
+#include <linux/inetdevice.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <net/route.h>
+
+#include <net/tcp.h>
+#include <net/ipv6.h>
+
+#include <rdma/rdma_cm.h>
+#include <rdma/rdma_cm_ib.h>
+#include <rdma/rdma_netlink.h>
+#include <rdma/ib.h>
+#include <rdma/ib_cache.h>
+#include <rdma/ib_cm.h>
+#include <rdma/ib_sa.h>
+#include <rdma/iw_cm.h>
+
+MODULE_AUTHOR("Sean Hefty");
+MODULE_DESCRIPTION("Generic RDMA CM Agent");
+MODULE_LICENSE("Dual BSD/GPL");
+
+#define CMA_CM_RESPONSE_TIMEOUT 20
+#define CMA_MAX_CM_RETRIES 15
+#define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24)
+#define CMA_IBOE_PACKET_LIFETIME 18
+
+static void cma_add_one(struct ib_device *device);
+static void cma_remove_one(struct ib_device *device);
+
+static struct ib_client cma_client = {
+	.name   = "cma",
+	.add    = cma_add_one,
+	.remove = cma_remove_one
+};
+
+static struct ib_sa_client sa_client;
+static struct rdma_addr_client addr_client;
+static LIST_HEAD(dev_list);
+static LIST_HEAD(listen_any_list);
+static DEFINE_MUTEX(lock);
+static struct workqueue_struct *cma_wq;
+static DEFINE_IDR(tcp_ps);
+static DEFINE_IDR(udp_ps);
+static DEFINE_IDR(ipoib_ps);
+static DEFINE_IDR(ib_ps);
+
+struct cma_device {
+	struct list_head	list;
+	struct ib_device	*device;
+	struct completion	comp;
+	atomic_t		refcount;
+	struct list_head	id_list;
+};
+
+struct rdma_bind_list {
+	struct idr		*ps;
+	struct hlist_head	owners;
+	unsigned short		port;
+};
+
+enum {
+	CMA_OPTION_AFONLY,
+};
+
+/*
+ * Device removal can occur at anytime, so we need extra handling to
+ * serialize notifying the user of device removal with other callbacks.
+ * We do this by disabling removal notification while a callback is in process,
+ * and reporting it after the callback completes.
+ */
+struct rdma_id_private {
+	struct rdma_cm_id	id;
+
+	struct rdma_bind_list	*bind_list;
+	struct hlist_node	node;
+	struct list_head	list; /* listen_any_list or cma_device.list */
+	struct list_head	listen_list; /* per device listens */
+	struct cma_device	*cma_dev;
+	struct list_head	mc_list;
+
+	int			internal_id;
+	enum rdma_cm_state	state;
+	spinlock_t		lock;
+	struct mutex		qp_mutex;
+
+	struct completion	comp;
+	atomic_t		refcount;
+	struct mutex		handler_mutex;
+
+	int			backlog;
+	int			timeout_ms;
+	struct ib_sa_query	*query;
+	int			query_id;
+	union {
+		struct ib_cm_id	*ib;
+		struct iw_cm_id	*iw;
+	} cm_id;
+
+	u32			seq_num;
+	u32			qkey;
+	u32			qp_num;
+	pid_t			owner;
+	u32			options;
+	u8			srq;
+	u8			tos;
+	u8			reuseaddr;
+	u8			afonly;
+};
+
+struct cma_multicast {
+	struct rdma_id_private *id_priv;
+	union {
+		struct ib_sa_multicast *ib;
+	} multicast;
+	struct list_head	list;
+	void			*context;
+	struct sockaddr_storage	addr;
+	struct kref		mcref;
+};
+
+struct cma_work {
+	struct work_struct	work;
+	struct rdma_id_private	*id;
+	enum rdma_cm_state	old_state;
+	enum rdma_cm_state	new_state;
+	struct rdma_cm_event	event;
+};
+
+struct cma_ndev_work {
+	struct work_struct	work;
+	struct rdma_id_private	*id;
+	struct rdma_cm_event	event;
+};
+
+struct iboe_mcast_work {
+	struct work_struct	 work;
+	struct rdma_id_private	*id;
+	struct cma_multicast	*mc;
+};
+
+union cma_ip_addr {
+	struct in6_addr ip6;
+	struct {
+		__be32 pad[3];
+		__be32 addr;
+	} ip4;
+};
+
+struct cma_hdr {
+	u8 cma_version;
+	u8 ip_version;	/* IP version: 7:4 */
+	__be16 port;
+	union cma_ip_addr src_addr;
+	union cma_ip_addr dst_addr;
+};
+
+#define CMA_VERSION 0x00
+
+static int cma_comp(struct rdma_id_private *id_priv, enum rdma_cm_state comp)
+{
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&id_priv->lock, flags);
+	ret = (id_priv->state == comp);
+	spin_unlock_irqrestore(&id_priv->lock, flags);
+	return ret;
+}
+
+static int cma_comp_exch(struct rdma_id_private *id_priv,
+			 enum rdma_cm_state comp, enum rdma_cm_state exch)
+{
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&id_priv->lock, flags);
+	if ((ret = (id_priv->state == comp)))
+		id_priv->state = exch;
+	spin_unlock_irqrestore(&id_priv->lock, flags);
+	return ret;
+}
+
+static enum rdma_cm_state cma_exch(struct rdma_id_private *id_priv,
+				   enum rdma_cm_state exch)
+{
+	unsigned long flags;
+	enum rdma_cm_state old;
+
+	spin_lock_irqsave(&id_priv->lock, flags);
+	old = id_priv->state;
+	id_priv->state = exch;
+	spin_unlock_irqrestore(&id_priv->lock, flags);
+	return old;
+}
+
+static inline u8 cma_get_ip_ver(struct cma_hdr *hdr)
+{
+	return hdr->ip_version >> 4;
+}
+
+static inline void cma_set_ip_ver(struct cma_hdr *hdr, u8 ip_ver)
+{
+	hdr->ip_version = (ip_ver << 4) | (hdr->ip_version & 0xF);
+}
+
+static void cma_attach_to_dev(struct rdma_id_private *id_priv,
+			      struct cma_device *cma_dev)
+{
+	atomic_inc(&cma_dev->refcount);
+	id_priv->cma_dev = cma_dev;
+	id_priv->id.device = cma_dev->device;
+	id_priv->id.route.addr.dev_addr.transport =
+		rdma_node_get_transport(cma_dev->device->node_type);
+	list_add_tail(&id_priv->list, &cma_dev->id_list);
+}
+
+static inline void cma_deref_dev(struct cma_device *cma_dev)
+{
+	if (atomic_dec_and_test(&cma_dev->refcount))
+		complete(&cma_dev->comp);
+}
+
+static inline void release_mc(struct kref *kref)
+{
+	struct cma_multicast *mc = container_of(kref, struct cma_multicast, mcref);
+
+	kfree(mc->multicast.ib);
+	kfree(mc);
+}
+
+static void cma_release_dev(struct rdma_id_private *id_priv)
+{
+	mutex_lock(&lock);
+	list_del(&id_priv->list);
+	cma_deref_dev(id_priv->cma_dev);
+	id_priv->cma_dev = NULL;
+	mutex_unlock(&lock);
+}
+
+static inline struct sockaddr *cma_src_addr(struct rdma_id_private *id_priv)
+{
+	return (struct sockaddr *) &id_priv->id.route.addr.src_addr;
+}
+
+static inline struct sockaddr *cma_dst_addr(struct rdma_id_private *id_priv)
+{
+	return (struct sockaddr *) &id_priv->id.route.addr.dst_addr;
+}
+
+static inline unsigned short cma_family(struct rdma_id_private *id_priv)
+{
+	return id_priv->id.route.addr.src_addr.ss_family;
+}
+
+static int cma_set_qkey(struct rdma_id_private *id_priv, u32 qkey)
+{
+	struct ib_sa_mcmember_rec rec;
+	int ret = 0;
+
+	if (id_priv->qkey) {
+		if (qkey && id_priv->qkey != qkey)
+			return -EINVAL;
+		return 0;
+	}
+
+	if (qkey) {
+		id_priv->qkey = qkey;
+		return 0;
+	}
+
+	switch (id_priv->id.ps) {
+	case RDMA_PS_UDP:
+	case RDMA_PS_IB:
+		id_priv->qkey = RDMA_UDP_QKEY;
+		break;
+	case RDMA_PS_IPOIB:
+		ib_addr_get_mgid(&id_priv->id.route.addr.dev_addr, &rec.mgid);
+		ret = ib_sa_get_mcmember_rec(id_priv->id.device,
+					     id_priv->id.port_num, &rec.mgid,
+					     &rec);
+		if (!ret)
+			id_priv->qkey = be32_to_cpu(rec.qkey);
+		break;
+	default:
+		break;
+	}
+	return ret;
+}
+
+static void cma_translate_ib(struct sockaddr_ib *sib, struct rdma_dev_addr *dev_addr)
+{
+	dev_addr->dev_type = ARPHRD_INFINIBAND;
+	rdma_addr_set_sgid(dev_addr, (union ib_gid *) &sib->sib_addr);
+	ib_addr_set_pkey(dev_addr, ntohs(sib->sib_pkey));
+}
+
+static int cma_translate_addr(struct sockaddr *addr, struct rdma_dev_addr *dev_addr)
+{
+	int ret;
+
+	if (addr->sa_family != AF_IB) {
+		ret = rdma_translate_ip(addr, dev_addr, NULL);
+	} else {
+		cma_translate_ib((struct sockaddr_ib *) addr, dev_addr);
+		ret = 0;
+	}
+
+	return ret;
+}
+
+static int cma_acquire_dev(struct rdma_id_private *id_priv,
+			   struct rdma_id_private *listen_id_priv)
+{
+	struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
+	struct cma_device *cma_dev;
+	union ib_gid gid, iboe_gid;
+	int ret = -ENODEV;
+	u8 port, found_port;
+	enum rdma_link_layer dev_ll = dev_addr->dev_type == ARPHRD_INFINIBAND ?
+		IB_LINK_LAYER_INFINIBAND : IB_LINK_LAYER_ETHERNET;
+
+	if (dev_ll != IB_LINK_LAYER_INFINIBAND &&
+	    id_priv->id.ps == RDMA_PS_IPOIB)
+		return -EINVAL;
+
+	mutex_lock(&lock);
+	rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr,
+		    &iboe_gid);
+
+	memcpy(&gid, dev_addr->src_dev_addr +
+	       rdma_addr_gid_offset(dev_addr), sizeof gid);
+	if (listen_id_priv &&
+	    rdma_port_get_link_layer(listen_id_priv->id.device,
+				     listen_id_priv->id.port_num) == dev_ll) {
+		cma_dev = listen_id_priv->cma_dev;
+		port = listen_id_priv->id.port_num;
+		if (rdma_node_get_transport(cma_dev->device->node_type) == RDMA_TRANSPORT_IB &&
+		    rdma_port_get_link_layer(cma_dev->device, port) == IB_LINK_LAYER_ETHERNET)
+			ret = ib_find_cached_gid(cma_dev->device, &iboe_gid,
+						 &found_port, NULL);
+		else
+			ret = ib_find_cached_gid(cma_dev->device, &gid,
+						 &found_port, NULL);
+
+		if (!ret && (port  == found_port)) {
+			id_priv->id.port_num = found_port;
+			goto out;
+		}
+	}
+	list_for_each_entry(cma_dev, &dev_list, list) {
+		for (port = 1; port <= cma_dev->device->phys_port_cnt; ++port) {
+			if (listen_id_priv &&
+			    listen_id_priv->cma_dev == cma_dev &&
+			    listen_id_priv->id.port_num == port)
+				continue;
+			if (rdma_port_get_link_layer(cma_dev->device, port) == dev_ll) {
+				if (rdma_node_get_transport(cma_dev->device->node_type) == RDMA_TRANSPORT_IB &&
+				    rdma_port_get_link_layer(cma_dev->device, port) == IB_LINK_LAYER_ETHERNET)
+					ret = ib_find_cached_gid(cma_dev->device, &iboe_gid, &found_port, NULL);
+				else
+					ret = ib_find_cached_gid(cma_dev->device, &gid, &found_port, NULL);
+
+				if (!ret && (port == found_port)) {
+					id_priv->id.port_num = found_port;
+					goto out;
+				}
+			}
+		}
+	}
+
+out:
+	if (!ret)
+		cma_attach_to_dev(id_priv, cma_dev);
+
+	mutex_unlock(&lock);
+	return ret;
+}
+
+/*
+ * Select the source IB device and address to reach the destination IB address.
+ */
+static int cma_resolve_ib_dev(struct rdma_id_private *id_priv)
+{
+	struct cma_device *cma_dev, *cur_dev;
+	struct sockaddr_ib *addr;
+	union ib_gid gid, sgid, *dgid;
+	u16 pkey, index;
+	u8 p;
+	int i;
+
+	cma_dev = NULL;
+	addr = (struct sockaddr_ib *) cma_dst_addr(id_priv);
+	dgid = (union ib_gid *) &addr->sib_addr;
+	pkey = ntohs(addr->sib_pkey);
+
+	list_for_each_entry(cur_dev, &dev_list, list) {
+		if (rdma_node_get_transport(cur_dev->device->node_type) != RDMA_TRANSPORT_IB)
+			continue;
+
+		for (p = 1; p <= cur_dev->device->phys_port_cnt; ++p) {
+			if (ib_find_cached_pkey(cur_dev->device, p, pkey, &index))
+				continue;
+
+			for (i = 0; !ib_get_cached_gid(cur_dev->device, p, i, &gid); i++) {
+				if (!memcmp(&gid, dgid, sizeof(gid))) {
+					cma_dev = cur_dev;
+					sgid = gid;
+					id_priv->id.port_num = p;
+					goto found;
+				}
+
+				if (!cma_dev && (gid.global.subnet_prefix ==
+						 dgid->global.subnet_prefix)) {
+					cma_dev = cur_dev;
+					sgid = gid;
+					id_priv->id.port_num = p;
+				}
+			}
+		}
+	}
+
+	if (!cma_dev)
+		return -ENODEV;
+
+found:
+	cma_attach_to_dev(id_priv, cma_dev);
+	addr = (struct sockaddr_ib *) cma_src_addr(id_priv);
+	memcpy(&addr->sib_addr, &sgid, sizeof sgid);
+	cma_translate_ib(addr, &id_priv->id.route.addr.dev_addr);
+	return 0;
+}
+
+static void cma_deref_id(struct rdma_id_private *id_priv)
+{
+	if (atomic_dec_and_test(&id_priv->refcount))
+		complete(&id_priv->comp);
+}
+
+static int cma_disable_callback(struct rdma_id_private *id_priv,
+				enum rdma_cm_state state)
+{
+	mutex_lock(&id_priv->handler_mutex);
+	if (id_priv->state != state) {
+		mutex_unlock(&id_priv->handler_mutex);
+		return -EINVAL;
+	}
+	return 0;
+}
+
+struct rdma_cm_id *rdma_create_id(rdma_cm_event_handler event_handler,
+				  void *context, enum rdma_port_space ps,
+				  enum ib_qp_type qp_type)
+{
+	struct rdma_id_private *id_priv;
+
+	id_priv = kzalloc(sizeof *id_priv, GFP_KERNEL);
+	if (!id_priv)
+		return ERR_PTR(-ENOMEM);
+
+	id_priv->owner = task_pid_nr(current);
+	id_priv->state = RDMA_CM_IDLE;
+	id_priv->id.context = context;
+	id_priv->id.event_handler = event_handler;
+	id_priv->id.ps = ps;
+	id_priv->id.qp_type = qp_type;
+	spin_lock_init(&id_priv->lock);
+	mutex_init(&id_priv->qp_mutex);
+	init_completion(&id_priv->comp);
+	atomic_set(&id_priv->refcount, 1);
+	mutex_init(&id_priv->handler_mutex);
+	INIT_LIST_HEAD(&id_priv->listen_list);
+	INIT_LIST_HEAD(&id_priv->mc_list);
+	get_random_bytes(&id_priv->seq_num, sizeof id_priv->seq_num);
+
+	return &id_priv->id;
+}
+EXPORT_SYMBOL(rdma_create_id);
+
+static int cma_init_ud_qp(struct rdma_id_private *id_priv, struct ib_qp *qp)
+{
+	struct ib_qp_attr qp_attr;
+	int qp_attr_mask, ret;
+
+	qp_attr.qp_state = IB_QPS_INIT;
+	ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask);
+	if (ret)
+		return ret;
+
+	ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
+	if (ret)
+		return ret;
+
+	qp_attr.qp_state = IB_QPS_RTR;
+	ret = ib_modify_qp(qp, &qp_attr, IB_QP_STATE);
+	if (ret)
+		return ret;
+
+	qp_attr.qp_state = IB_QPS_RTS;
+	qp_attr.sq_psn = 0;
+	ret = ib_modify_qp(qp, &qp_attr, IB_QP_STATE | IB_QP_SQ_PSN);
+
+	return ret;
+}
+
+static int cma_init_conn_qp(struct rdma_id_private *id_priv, struct ib_qp *qp)
+{
+	struct ib_qp_attr qp_attr;
+	int qp_attr_mask, ret;
+
+	qp_attr.qp_state = IB_QPS_INIT;
+	ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask);
+	if (ret)
+		return ret;
+
+	return ib_modify_qp(qp, &qp_attr, qp_attr_mask);
+}
+
+int rdma_create_qp(struct rdma_cm_id *id, struct ib_pd *pd,
+		   struct ib_qp_init_attr *qp_init_attr)
+{
+	struct rdma_id_private *id_priv;
+	struct ib_qp *qp;
+	int ret;
+
+	id_priv = container_of(id, struct rdma_id_private, id);
+	if (id->device != pd->device)
+		return -EINVAL;
+
+	qp = ib_create_qp(pd, qp_init_attr);
+	if (IS_ERR(qp))
+		return PTR_ERR(qp);
+
+	if (id->qp_type == IB_QPT_UD)
+		ret = cma_init_ud_qp(id_priv, qp);
+	else
+		ret = cma_init_conn_qp(id_priv, qp);
+	if (ret)
+		goto err;
+
+	id->qp = qp;
+	id_priv->qp_num = qp->qp_num;
+	id_priv->srq = (qp->srq != NULL);
+	return 0;
+err:
+	ib_destroy_qp(qp);
+	return ret;
+}
+EXPORT_SYMBOL(rdma_create_qp);
+
+void rdma_destroy_qp(struct rdma_cm_id *id)
+{
+	struct rdma_id_private *id_priv;
+
+	id_priv = container_of(id, struct rdma_id_private, id);
+	mutex_lock(&id_priv->qp_mutex);
+	ib_destroy_qp(id_priv->id.qp);
+	id_priv->id.qp = NULL;
+	mutex_unlock(&id_priv->qp_mutex);
+}
+EXPORT_SYMBOL(rdma_destroy_qp);
+
+static int cma_modify_qp_rtr(struct rdma_id_private *id_priv,
+			     struct rdma_conn_param *conn_param)
+{
+	struct ib_qp_attr qp_attr;
+	int qp_attr_mask, ret;
+	union ib_gid sgid;
+
+	mutex_lock(&id_priv->qp_mutex);
+	if (!id_priv->id.qp) {
+		ret = 0;
+		goto out;
+	}
+
+	/* Need to update QP attributes from default values. */
+	qp_attr.qp_state = IB_QPS_INIT;
+	ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask);
+	if (ret)
+		goto out;
+
+	ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask);
+	if (ret)
+		goto out;
+
+	qp_attr.qp_state = IB_QPS_RTR;
+	ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask);
+	if (ret)
+		goto out;
+
+	ret = ib_query_gid(id_priv->id.device, id_priv->id.port_num,
+			   qp_attr.ah_attr.grh.sgid_index, &sgid);
+	if (ret)
+		goto out;
+
+	if (rdma_node_get_transport(id_priv->cma_dev->device->node_type)
+	    == RDMA_TRANSPORT_IB &&
+	    rdma_port_get_link_layer(id_priv->id.device, id_priv->id.port_num)
+	    == IB_LINK_LAYER_ETHERNET) {
+		ret = rdma_addr_find_smac_by_sgid(&sgid, qp_attr.smac, NULL);
+
+		if (ret)
+			goto out;
+	}
+	if (conn_param)
+		qp_attr.max_dest_rd_atomic = conn_param->responder_resources;
+	ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask);
+out:
+	mutex_unlock(&id_priv->qp_mutex);
+	return ret;
+}
+
+static int cma_modify_qp_rts(struct rdma_id_private *id_priv,
+			     struct rdma_conn_param *conn_param)
+{
+	struct ib_qp_attr qp_attr;
+	int qp_attr_mask, ret;
+
+	mutex_lock(&id_priv->qp_mutex);
+	if (!id_priv->id.qp) {
+		ret = 0;
+		goto out;
+	}
+
+	qp_attr.qp_state = IB_QPS_RTS;
+	ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask);
+	if (ret)
+		goto out;
+
+	if (conn_param)
+		qp_attr.max_rd_atomic = conn_param->initiator_depth;
+	ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask);
+out:
+	mutex_unlock(&id_priv->qp_mutex);
+	return ret;
+}
+
+static int cma_modify_qp_err(struct rdma_id_private *id_priv)
+{
+	struct ib_qp_attr qp_attr;
+	int ret;
+
+	mutex_lock(&id_priv->qp_mutex);
+	if (!id_priv->id.qp) {
+		ret = 0;
+		goto out;
+	}
+
+	qp_attr.qp_state = IB_QPS_ERR;
+	ret = ib_modify_qp(id_priv->id.qp, &qp_attr, IB_QP_STATE);
+out:
+	mutex_unlock(&id_priv->qp_mutex);
+	return ret;
+}
+
+static int cma_ib_init_qp_attr(struct rdma_id_private *id_priv,
+			       struct ib_qp_attr *qp_attr, int *qp_attr_mask)
+{
+	struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
+	int ret;
+	u16 pkey;
+
+	if (rdma_port_get_link_layer(id_priv->id.device, id_priv->id.port_num) ==
+	    IB_LINK_LAYER_INFINIBAND)
+		pkey = ib_addr_get_pkey(dev_addr);
+	else
+		pkey = 0xffff;
+
+	ret = ib_find_cached_pkey(id_priv->id.device, id_priv->id.port_num,
+				  pkey, &qp_attr->pkey_index);
+	if (ret)
+		return ret;
+
+	qp_attr->port_num = id_priv->id.port_num;
+	*qp_attr_mask = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT;
+
+	if (id_priv->id.qp_type == IB_QPT_UD) {
+		ret = cma_set_qkey(id_priv, 0);
+		if (ret)
+			return ret;
+
+		qp_attr->qkey = id_priv->qkey;
+		*qp_attr_mask |= IB_QP_QKEY;
+	} else {
+		qp_attr->qp_access_flags = 0;
+		*qp_attr_mask |= IB_QP_ACCESS_FLAGS;
+	}
+	return 0;
+}
+
+int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr,
+		       int *qp_attr_mask)
+{
+	struct rdma_id_private *id_priv;
+	int ret = 0;
+
+	id_priv = container_of(id, struct rdma_id_private, id);
+	switch (rdma_node_get_transport(id_priv->id.device->node_type)) {
+	case RDMA_TRANSPORT_IB:
+		if (!id_priv->cm_id.ib || (id_priv->id.qp_type == IB_QPT_UD))
+			ret = cma_ib_init_qp_attr(id_priv, qp_attr, qp_attr_mask);
+		else
+			ret = ib_cm_init_qp_attr(id_priv->cm_id.ib, qp_attr,
+						 qp_attr_mask);
+
+		if (qp_attr->qp_state == IB_QPS_RTR)
+			qp_attr->rq_psn = id_priv->seq_num;
+		break;
+	case RDMA_TRANSPORT_IWARP:
+		if (!id_priv->cm_id.iw) {
+			qp_attr->qp_access_flags = 0;
+			*qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS;
+		} else
+			ret = iw_cm_init_qp_attr(id_priv->cm_id.iw, qp_attr,
+						 qp_attr_mask);
+		break;
+	default:
+		ret = -ENOSYS;
+		break;
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(rdma_init_qp_attr);
+
+static inline int cma_zero_addr(struct sockaddr *addr)
+{
+	switch (addr->sa_family) {
+	case AF_INET:
+		return ipv4_is_zeronet(((struct sockaddr_in *)addr)->sin_addr.s_addr);
+	case AF_INET6:
+		return ipv6_addr_any(&((struct sockaddr_in6 *) addr)->sin6_addr);
+	case AF_IB:
+		return ib_addr_any(&((struct sockaddr_ib *) addr)->sib_addr);
+	default:
+		return 0;
+	}
+}
+
+static inline int cma_loopback_addr(struct sockaddr *addr)
+{
+	switch (addr->sa_family) {
+	case AF_INET:
+		return ipv4_is_loopback(((struct sockaddr_in *) addr)->sin_addr.s_addr);
+	case AF_INET6:
+		return ipv6_addr_loopback(&((struct sockaddr_in6 *) addr)->sin6_addr);
+	case AF_IB:
+		return ib_addr_loopback(&((struct sockaddr_ib *) addr)->sib_addr);
+	default:
+		return 0;
+	}
+}
+
+static inline int cma_any_addr(struct sockaddr *addr)
+{
+	return cma_zero_addr(addr) || cma_loopback_addr(addr);
+}
+
+static int cma_addr_cmp(struct sockaddr *src, struct sockaddr *dst)
+{
+	if (src->sa_family != dst->sa_family)
+		return -1;
+
+	switch (src->sa_family) {
+	case AF_INET:
+		return ((struct sockaddr_in *) src)->sin_addr.s_addr !=
+		       ((struct sockaddr_in *) dst)->sin_addr.s_addr;
+	case AF_INET6:
+		return ipv6_addr_cmp(&((struct sockaddr_in6 *) src)->sin6_addr,
+				     &((struct sockaddr_in6 *) dst)->sin6_addr);
+	default:
+		return ib_addr_cmp(&((struct sockaddr_ib *) src)->sib_addr,
+				   &((struct sockaddr_ib *) dst)->sib_addr);
+	}
+}
+
+static __be16 cma_port(struct sockaddr *addr)
+{
+	struct sockaddr_ib *sib;
+
+	switch (addr->sa_family) {
+	case AF_INET:
+		return ((struct sockaddr_in *) addr)->sin_port;
+	case AF_INET6:
+		return ((struct sockaddr_in6 *) addr)->sin6_port;
+	case AF_IB:
+		sib = (struct sockaddr_ib *) addr;
+		return htons((u16) (be64_to_cpu(sib->sib_sid) &
+				    be64_to_cpu(sib->sib_sid_mask)));
+	default:
+		return 0;
+	}
+}
+
+static inline int cma_any_port(struct sockaddr *addr)
+{
+	return !cma_port(addr);
+}
+
+static void cma_save_ib_info(struct rdma_cm_id *id, struct rdma_cm_id *listen_id,
+			     struct ib_sa_path_rec *path)
+{
+	struct sockaddr_ib *listen_ib, *ib;
+
+	listen_ib = (struct sockaddr_ib *) &listen_id->route.addr.src_addr;
+	ib = (struct sockaddr_ib *) &id->route.addr.src_addr;
+	ib->sib_family = listen_ib->sib_family;
+	ib->sib_pkey = path->pkey;
+	ib->sib_flowinfo = path->flow_label;
+	memcpy(&ib->sib_addr, &path->sgid, 16);
+	ib->sib_sid = listen_ib->sib_sid;
+	ib->sib_sid_mask = cpu_to_be64(0xffffffffffffffffULL);
+	ib->sib_scope_id = listen_ib->sib_scope_id;
+
+	ib = (struct sockaddr_ib *) &id->route.addr.dst_addr;
+	ib->sib_family = listen_ib->sib_family;
+	ib->sib_pkey = path->pkey;
+	ib->sib_flowinfo = path->flow_label;
+	memcpy(&ib->sib_addr, &path->dgid, 16);
+}
+
+static void cma_save_ip4_info(struct rdma_cm_id *id, struct rdma_cm_id *listen_id,
+			      struct cma_hdr *hdr)
+{
+	struct sockaddr_in *listen4, *ip4;
+
+	listen4 = (struct sockaddr_in *) &listen_id->route.addr.src_addr;
+	ip4 = (struct sockaddr_in *) &id->route.addr.src_addr;
+	ip4->sin_family = listen4->sin_family;
+	ip4->sin_addr.s_addr = hdr->dst_addr.ip4.addr;
+	ip4->sin_port = listen4->sin_port;
+
+	ip4 = (struct sockaddr_in *) &id->route.addr.dst_addr;
+	ip4->sin_family = listen4->sin_family;
+	ip4->sin_addr.s_addr = hdr->src_addr.ip4.addr;
+	ip4->sin_port = hdr->port;
+}
+
+static void cma_save_ip6_info(struct rdma_cm_id *id, struct rdma_cm_id *listen_id,
+			      struct cma_hdr *hdr)
+{
+	struct sockaddr_in6 *listen6, *ip6;
+
+	listen6 = (struct sockaddr_in6 *) &listen_id->route.addr.src_addr;
+	ip6 = (struct sockaddr_in6 *) &id->route.addr.src_addr;
+	ip6->sin6_family = listen6->sin6_family;
+	ip6->sin6_addr = hdr->dst_addr.ip6;
+	ip6->sin6_port = listen6->sin6_port;
+
+	ip6 = (struct sockaddr_in6 *) &id->route.addr.dst_addr;
+	ip6->sin6_family = listen6->sin6_family;
+	ip6->sin6_addr = hdr->src_addr.ip6;
+	ip6->sin6_port = hdr->port;
+}
+
+static int cma_save_net_info(struct rdma_cm_id *id, struct rdma_cm_id *listen_id,
+			     struct ib_cm_event *ib_event)
+{
+	struct cma_hdr *hdr;
+
+	if ((listen_id->route.addr.src_addr.ss_family == AF_IB) &&
+	    (ib_event->event == IB_CM_REQ_RECEIVED)) {
+		cma_save_ib_info(id, listen_id, ib_event->param.req_rcvd.primary_path);
+		return 0;
+	}
+
+	hdr = ib_event->private_data;
+	if (hdr->cma_version != CMA_VERSION)
+		return -EINVAL;
+
+	switch (cma_get_ip_ver(hdr)) {
+	case 4:
+		cma_save_ip4_info(id, listen_id, hdr);
+		break;
+	case 6:
+		cma_save_ip6_info(id, listen_id, hdr);
+		break;
+	default:
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static inline int cma_user_data_offset(struct rdma_id_private *id_priv)
+{
+	return cma_family(id_priv) == AF_IB ? 0 : sizeof(struct cma_hdr);
+}
+
+static void cma_cancel_route(struct rdma_id_private *id_priv)
+{
+	switch (rdma_port_get_link_layer(id_priv->id.device, id_priv->id.port_num)) {
+	case IB_LINK_LAYER_INFINIBAND:
+		if (id_priv->query)
+			ib_sa_cancel_query(id_priv->query_id, id_priv->query);
+		break;
+	default:
+		break;
+	}
+}
+
+static void cma_cancel_listens(struct rdma_id_private *id_priv)
+{
+	struct rdma_id_private *dev_id_priv;
+
+	/*
+	 * Remove from listen_any_list to prevent added devices from spawning
+	 * additional listen requests.
+	 */
+	mutex_lock(&lock);
+	list_del(&id_priv->list);
+
+	while (!list_empty(&id_priv->listen_list)) {
+		dev_id_priv = list_entry(id_priv->listen_list.next,
+					 struct rdma_id_private, listen_list);
+		/* sync with device removal to avoid duplicate destruction */
+		list_del_init(&dev_id_priv->list);
+		list_del(&dev_id_priv->listen_list);
+		mutex_unlock(&lock);
+
+		rdma_destroy_id(&dev_id_priv->id);
+		mutex_lock(&lock);
+	}
+	mutex_unlock(&lock);
+}
+
+static void cma_cancel_operation(struct rdma_id_private *id_priv,
+				 enum rdma_cm_state state)
+{
+	switch (state) {
+	case RDMA_CM_ADDR_QUERY:
+		rdma_addr_cancel(&id_priv->id.route.addr.dev_addr);
+		break;
+	case RDMA_CM_ROUTE_QUERY:
+		cma_cancel_route(id_priv);
+		break;
+	case RDMA_CM_LISTEN:
+		if (cma_any_addr(cma_src_addr(id_priv)) && !id_priv->cma_dev)
+			cma_cancel_listens(id_priv);
+		break;
+	default:
+		break;
+	}
+}
+
+static void cma_release_port(struct rdma_id_private *id_priv)
+{
+	struct rdma_bind_list *bind_list = id_priv->bind_list;
+
+	if (!bind_list)
+		return;
+
+	mutex_lock(&lock);
+	hlist_del(&id_priv->node);
+	if (hlist_empty(&bind_list->owners)) {
+		idr_remove(bind_list->ps, bind_list->port);
+		kfree(bind_list);
+	}
+	mutex_unlock(&lock);
+}
+
+static void cma_leave_mc_groups(struct rdma_id_private *id_priv)
+{
+	struct cma_multicast *mc;
+
+	while (!list_empty(&id_priv->mc_list)) {
+		mc = container_of(id_priv->mc_list.next,
+				  struct cma_multicast, list);
+		list_del(&mc->list);
+		switch (rdma_port_get_link_layer(id_priv->cma_dev->device, id_priv->id.port_num)) {
+		case IB_LINK_LAYER_INFINIBAND:
+			ib_sa_free_multicast(mc->multicast.ib);
+			kfree(mc);
+			break;
+		case IB_LINK_LAYER_ETHERNET:
+			kref_put(&mc->mcref, release_mc);
+			break;
+		default:
+			break;
+		}
+	}
+}
+
+void rdma_destroy_id(struct rdma_cm_id *id)
+{
+	struct rdma_id_private *id_priv;
+	enum rdma_cm_state state;
+
+	id_priv = container_of(id, struct rdma_id_private, id);
+	state = cma_exch(id_priv, RDMA_CM_DESTROYING);
+	cma_cancel_operation(id_priv, state);
+
+	/*
+	 * Wait for any active callback to finish.  New callbacks will find
+	 * the id_priv state set to destroying and abort.
+	 */
+	mutex_lock(&id_priv->handler_mutex);
+	mutex_unlock(&id_priv->handler_mutex);
+
+	if (id_priv->cma_dev) {
+		switch (rdma_node_get_transport(id_priv->id.device->node_type)) {
+		case RDMA_TRANSPORT_IB:
+			if (id_priv->cm_id.ib)
+				ib_destroy_cm_id(id_priv->cm_id.ib);
+			break;
+		case RDMA_TRANSPORT_IWARP:
+			if (id_priv->cm_id.iw)
+				iw_destroy_cm_id(id_priv->cm_id.iw);
+			break;
+		default:
+			break;
+		}
+		cma_leave_mc_groups(id_priv);
+		cma_release_dev(id_priv);
+	}
+
+	cma_release_port(id_priv);
+	cma_deref_id(id_priv);
+	wait_for_completion(&id_priv->comp);
+
+	if (id_priv->internal_id)
+		cma_deref_id(id_priv->id.context);
+
+	kfree(id_priv->id.route.path_rec);
+	kfree(id_priv);
+}
+EXPORT_SYMBOL(rdma_destroy_id);
+
+static int cma_rep_recv(struct rdma_id_private *id_priv)
+{
+	int ret;
+
+	ret = cma_modify_qp_rtr(id_priv, NULL);
+	if (ret)
+		goto reject;
+
+	ret = cma_modify_qp_rts(id_priv, NULL);
+	if (ret)
+		goto reject;
+
+	ret = ib_send_cm_rtu(id_priv->cm_id.ib, NULL, 0);
+	if (ret)
+		goto reject;
+
+	return 0;
+reject:
+	cma_modify_qp_err(id_priv);
+	ib_send_cm_rej(id_priv->cm_id.ib, IB_CM_REJ_CONSUMER_DEFINED,
+		       NULL, 0, NULL, 0);
+	return ret;
+}
+
+static void cma_set_rep_event_data(struct rdma_cm_event *event,
+				   struct ib_cm_rep_event_param *rep_data,
+				   void *private_data)
+{
+	event->param.conn.private_data = private_data;
+	event->param.conn.private_data_len = IB_CM_REP_PRIVATE_DATA_SIZE;
+	event->param.conn.responder_resources = rep_data->responder_resources;
+	event->param.conn.initiator_depth = rep_data->initiator_depth;
+	event->param.conn.flow_control = rep_data->flow_control;
+	event->param.conn.rnr_retry_count = rep_data->rnr_retry_count;
+	event->param.conn.srq = rep_data->srq;
+	event->param.conn.qp_num = rep_data->remote_qpn;
+}
+
+static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
+{
+	struct rdma_id_private *id_priv = cm_id->context;
+	struct rdma_cm_event event;
+	int ret = 0;
+
+	if ((ib_event->event != IB_CM_TIMEWAIT_EXIT &&
+		cma_disable_callback(id_priv, RDMA_CM_CONNECT)) ||
+	    (ib_event->event == IB_CM_TIMEWAIT_EXIT &&
+		cma_disable_callback(id_priv, RDMA_CM_DISCONNECT)))
+		return 0;
+
+	memset(&event, 0, sizeof event);
+	switch (ib_event->event) {
+	case IB_CM_REQ_ERROR:
+	case IB_CM_REP_ERROR:
+		event.event = RDMA_CM_EVENT_UNREACHABLE;
+		event.status = -ETIMEDOUT;
+		break;
+	case IB_CM_REP_RECEIVED:
+		if (id_priv->id.qp) {
+			event.status = cma_rep_recv(id_priv);
+			event.event = event.status ? RDMA_CM_EVENT_CONNECT_ERROR :
+						     RDMA_CM_EVENT_ESTABLISHED;
+		} else {
+			event.event = RDMA_CM_EVENT_CONNECT_RESPONSE;
+		}
+		cma_set_rep_event_data(&event, &ib_event->param.rep_rcvd,
+				       ib_event->private_data);
+		break;
+	case IB_CM_RTU_RECEIVED:
+	case IB_CM_USER_ESTABLISHED:
+		event.event = RDMA_CM_EVENT_ESTABLISHED;
+		break;
+	case IB_CM_DREQ_ERROR:
+		event.status = -ETIMEDOUT; /* fall through */
+	case IB_CM_DREQ_RECEIVED:
+	case IB_CM_DREP_RECEIVED:
+		if (!cma_comp_exch(id_priv, RDMA_CM_CONNECT,
+				   RDMA_CM_DISCONNECT))
+			goto out;
+		event.event = RDMA_CM_EVENT_DISCONNECTED;
+		break;
+	case IB_CM_TIMEWAIT_EXIT:
+		event.event = RDMA_CM_EVENT_TIMEWAIT_EXIT;
+		break;
+	case IB_CM_MRA_RECEIVED:
+		/* ignore event */
+		goto out;
+	case IB_CM_REJ_RECEIVED:
+		cma_modify_qp_err(id_priv);
+		event.status = ib_event->param.rej_rcvd.reason;
+		event.event = RDMA_CM_EVENT_REJECTED;
+		event.param.conn.private_data = ib_event->private_data;
+		event.param.conn.private_data_len = IB_CM_REJ_PRIVATE_DATA_SIZE;
+		break;
+	default:
+		printk(KERN_ERR "RDMA CMA: unexpected IB CM event: %d\n",
+		       ib_event->event);
+		goto out;
+	}
+
+	ret = id_priv->id.event_handler(&id_priv->id, &event);
+	if (ret) {
+		/* Destroy the CM ID by returning a non-zero value. */
+		id_priv->cm_id.ib = NULL;
+		cma_exch(id_priv, RDMA_CM_DESTROYING);
+		mutex_unlock(&id_priv->handler_mutex);
+		rdma_destroy_id(&id_priv->id);
+		return ret;
+	}
+out:
+	mutex_unlock(&id_priv->handler_mutex);
+	return ret;
+}
+
+static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id,
+					       struct ib_cm_event *ib_event)
+{
+	struct rdma_id_private *id_priv;
+	struct rdma_cm_id *id;
+	struct rdma_route *rt;
+	int ret;
+
+	id = rdma_create_id(listen_id->event_handler, listen_id->context,
+			    listen_id->ps, ib_event->param.req_rcvd.qp_type);
+	if (IS_ERR(id))
+		return NULL;
+
+	id_priv = container_of(id, struct rdma_id_private, id);
+	if (cma_save_net_info(id, listen_id, ib_event))
+		goto err;
+
+	rt = &id->route;
+	rt->num_paths = ib_event->param.req_rcvd.alternate_path ? 2 : 1;
+	rt->path_rec = kmalloc(sizeof *rt->path_rec * rt->num_paths,
+			       GFP_KERNEL);
+	if (!rt->path_rec)
+		goto err;
+
+	rt->path_rec[0] = *ib_event->param.req_rcvd.primary_path;
+	if (rt->num_paths == 2)
+		rt->path_rec[1] = *ib_event->param.req_rcvd.alternate_path;
+
+	if (cma_any_addr(cma_src_addr(id_priv))) {
+		rt->addr.dev_addr.dev_type = ARPHRD_INFINIBAND;
+		rdma_addr_set_sgid(&rt->addr.dev_addr, &rt->path_rec[0].sgid);
+		ib_addr_set_pkey(&rt->addr.dev_addr, be16_to_cpu(rt->path_rec[0].pkey));
+	} else {
+		ret = cma_translate_addr(cma_src_addr(id_priv), &rt->addr.dev_addr);
+		if (ret)
+			goto err;
+	}
+	rdma_addr_set_dgid(&rt->addr.dev_addr, &rt->path_rec[0].dgid);
+
+	id_priv->state = RDMA_CM_CONNECT;
+	return id_priv;
+
+err:
+	rdma_destroy_id(id);
+	return NULL;
+}
+
+static struct rdma_id_private *cma_new_udp_id(struct rdma_cm_id *listen_id,
+					      struct ib_cm_event *ib_event)
+{
+	struct rdma_id_private *id_priv;
+	struct rdma_cm_id *id;
+	int ret;
+
+	id = rdma_create_id(listen_id->event_handler, listen_id->context,
+			    listen_id->ps, IB_QPT_UD);
+	if (IS_ERR(id))
+		return NULL;
+
+	id_priv = container_of(id, struct rdma_id_private, id);
+	if (cma_save_net_info(id, listen_id, ib_event))
+		goto err;
+
+	if (!cma_any_addr((struct sockaddr *) &id->route.addr.src_addr)) {
+		ret = cma_translate_addr(cma_src_addr(id_priv), &id->route.addr.dev_addr);
+		if (ret)
+			goto err;
+	}
+
+	id_priv->state = RDMA_CM_CONNECT;
+	return id_priv;
+err:
+	rdma_destroy_id(id);
+	return NULL;
+}
+
+static void cma_set_req_event_data(struct rdma_cm_event *event,
+				   struct ib_cm_req_event_param *req_data,
+				   void *private_data, int offset)
+{
+	event->param.conn.private_data = private_data + offset;
+	event->param.conn.private_data_len = IB_CM_REQ_PRIVATE_DATA_SIZE - offset;
+	event->param.conn.responder_resources = req_data->responder_resources;
+	event->param.conn.initiator_depth = req_data->initiator_depth;
+	event->param.conn.flow_control = req_data->flow_control;
+	event->param.conn.retry_count = req_data->retry_count;
+	event->param.conn.rnr_retry_count = req_data->rnr_retry_count;
+	event->param.conn.srq = req_data->srq;
+	event->param.conn.qp_num = req_data->remote_qpn;
+}
+
+static int cma_check_req_qp_type(struct rdma_cm_id *id, struct ib_cm_event *ib_event)
+{
+	return (((ib_event->event == IB_CM_REQ_RECEIVED) &&
+		 (ib_event->param.req_rcvd.qp_type == id->qp_type)) ||
+		((ib_event->event == IB_CM_SIDR_REQ_RECEIVED) &&
+		 (id->qp_type == IB_QPT_UD)) ||
+		(!id->qp_type));
+}
+
+static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
+{
+	struct rdma_id_private *listen_id, *conn_id;
+	struct rdma_cm_event event;
+	int offset, ret;
+
+	listen_id = cm_id->context;
+	if (!cma_check_req_qp_type(&listen_id->id, ib_event))
+		return -EINVAL;
+
+	if (cma_disable_callback(listen_id, RDMA_CM_LISTEN))
+		return -ECONNABORTED;
+
+	memset(&event, 0, sizeof event);
+	offset = cma_user_data_offset(listen_id);
+	event.event = RDMA_CM_EVENT_CONNECT_REQUEST;
+	if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED) {
+		conn_id = cma_new_udp_id(&listen_id->id, ib_event);
+		event.param.ud.private_data = ib_event->private_data + offset;
+		event.param.ud.private_data_len =
+				IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE - offset;
+	} else {
+		conn_id = cma_new_conn_id(&listen_id->id, ib_event);
+		cma_set_req_event_data(&event, &ib_event->param.req_rcvd,
+				       ib_event->private_data, offset);
+	}
+	if (!conn_id) {
+		ret = -ENOMEM;
+		goto err1;
+	}
+
+	mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING);
+	ret = cma_acquire_dev(conn_id, listen_id);
+	if (ret)
+		goto err2;
+
+	conn_id->cm_id.ib = cm_id;
+	cm_id->context = conn_id;
+	cm_id->cm_handler = cma_ib_handler;
+
+	/*
+	 * Protect against the user destroying conn_id from another thread
+	 * until we're done accessing it.
+	 */
+	atomic_inc(&conn_id->refcount);
+	ret = conn_id->id.event_handler(&conn_id->id, &event);
+	if (ret)
+		goto err3;
+	/*
+	 * Acquire mutex to prevent user executing rdma_destroy_id()
+	 * while we're accessing the cm_id.
+	 */
+	mutex_lock(&lock);
+	if (cma_comp(conn_id, RDMA_CM_CONNECT) &&
+	    (conn_id->id.qp_type != IB_QPT_UD))
+		ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0);
+	mutex_unlock(&lock);
+	mutex_unlock(&conn_id->handler_mutex);
+	mutex_unlock(&listen_id->handler_mutex);
+	cma_deref_id(conn_id);
+	return 0;
+
+err3:
+	cma_deref_id(conn_id);
+	/* Destroy the CM ID by returning a non-zero value. */
+	conn_id->cm_id.ib = NULL;
+err2:
+	cma_exch(conn_id, RDMA_CM_DESTROYING);
+	mutex_unlock(&conn_id->handler_mutex);
+err1:
+	mutex_unlock(&listen_id->handler_mutex);
+	if (conn_id)
+		rdma_destroy_id(&conn_id->id);
+	return ret;
+}
+
+__be64 rdma_get_service_id(struct rdma_cm_id *id, struct sockaddr *addr)
+{
+	if (addr->sa_family == AF_IB)
+		return ((struct sockaddr_ib *) addr)->sib_sid;
+
+	return cpu_to_be64(((u64)id->ps << 16) + be16_to_cpu(cma_port(addr)));
+}
+EXPORT_SYMBOL(rdma_get_service_id);
+
+static void cma_set_compare_data(enum rdma_port_space ps, struct sockaddr *addr,
+				 struct ib_cm_compare_data *compare)
+{
+	struct cma_hdr *cma_data, *cma_mask;
+	__be32 ip4_addr;
+	struct in6_addr ip6_addr;
+
+	memset(compare, 0, sizeof *compare);
+	cma_data = (void *) compare->data;
+	cma_mask = (void *) compare->mask;
+
+	switch (addr->sa_family) {
+	case AF_INET:
+		ip4_addr = ((struct sockaddr_in *) addr)->sin_addr.s_addr;
+		cma_set_ip_ver(cma_data, 4);
+		cma_set_ip_ver(cma_mask, 0xF);
+		if (!cma_any_addr(addr)) {
+			cma_data->dst_addr.ip4.addr = ip4_addr;
+			cma_mask->dst_addr.ip4.addr = htonl(~0);
+		}
+		break;
+	case AF_INET6:
+		ip6_addr = ((struct sockaddr_in6 *) addr)->sin6_addr;
+		cma_set_ip_ver(cma_data, 6);
+		cma_set_ip_ver(cma_mask, 0xF);
+		if (!cma_any_addr(addr)) {
+			cma_data->dst_addr.ip6 = ip6_addr;
+			memset(&cma_mask->dst_addr.ip6, 0xFF,
+			       sizeof cma_mask->dst_addr.ip6);
+		}
+		break;
+	default:
+		break;
+	}
+}
+
+static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event)
+{
+	struct rdma_id_private *id_priv = iw_id->context;
+	struct rdma_cm_event event;
+	int ret = 0;
+	struct sockaddr *laddr = (struct sockaddr *)&iw_event->local_addr;
+	struct sockaddr *raddr = (struct sockaddr *)&iw_event->remote_addr;
+
+	if (cma_disable_callback(id_priv, RDMA_CM_CONNECT))
+		return 0;
+
+	memset(&event, 0, sizeof event);
+	switch (iw_event->event) {
+	case IW_CM_EVENT_CLOSE:
+		event.event = RDMA_CM_EVENT_DISCONNECTED;
+		break;
+	case IW_CM_EVENT_CONNECT_REPLY:
+		memcpy(cma_src_addr(id_priv), laddr,
+		       rdma_addr_size(laddr));
+		memcpy(cma_dst_addr(id_priv), raddr,
+		       rdma_addr_size(raddr));
+		switch (iw_event->status) {
+		case 0:
+			event.event = RDMA_CM_EVENT_ESTABLISHED;
+			event.param.conn.initiator_depth = iw_event->ird;
+			event.param.conn.responder_resources = iw_event->ord;
+			break;
+		case -ECONNRESET:
+		case -ECONNREFUSED:
+			event.event = RDMA_CM_EVENT_REJECTED;
+			break;
+		case -ETIMEDOUT:
+			event.event = RDMA_CM_EVENT_UNREACHABLE;
+			break;
+		default:
+			event.event = RDMA_CM_EVENT_CONNECT_ERROR;
+			break;
+		}
+		break;
+	case IW_CM_EVENT_ESTABLISHED:
+		event.event = RDMA_CM_EVENT_ESTABLISHED;
+		event.param.conn.initiator_depth = iw_event->ird;
+		event.param.conn.responder_resources = iw_event->ord;
+		break;
+	default:
+		BUG_ON(1);
+	}
+
+	event.status = iw_event->status;
+	event.param.conn.private_data = iw_event->private_data;
+	event.param.conn.private_data_len = iw_event->private_data_len;
+	ret = id_priv->id.event_handler(&id_priv->id, &event);
+	if (ret) {
+		/* Destroy the CM ID by returning a non-zero value. */
+		id_priv->cm_id.iw = NULL;
+		cma_exch(id_priv, RDMA_CM_DESTROYING);
+		mutex_unlock(&id_priv->handler_mutex);
+		rdma_destroy_id(&id_priv->id);
+		return ret;
+	}
+
+	mutex_unlock(&id_priv->handler_mutex);
+	return ret;
+}
+
+static int iw_conn_req_handler(struct iw_cm_id *cm_id,
+			       struct iw_cm_event *iw_event)
+{
+	struct rdma_cm_id *new_cm_id;
+	struct rdma_id_private *listen_id, *conn_id;
+	struct rdma_cm_event event;
+	int ret;
+	struct ib_device_attr attr;
+	struct sockaddr *laddr = (struct sockaddr *)&iw_event->local_addr;
+	struct sockaddr *raddr = (struct sockaddr *)&iw_event->remote_addr;
+
+	listen_id = cm_id->context;
+	if (cma_disable_callback(listen_id, RDMA_CM_LISTEN))
+		return -ECONNABORTED;
+
+	/* Create a new RDMA id for the new IW CM ID */
+	new_cm_id = rdma_create_id(listen_id->id.event_handler,
+				   listen_id->id.context,
+				   RDMA_PS_TCP, IB_QPT_RC);
+	if (IS_ERR(new_cm_id)) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	conn_id = container_of(new_cm_id, struct rdma_id_private, id);
+	mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING);
+	conn_id->state = RDMA_CM_CONNECT;
+
+	ret = rdma_translate_ip(laddr, &conn_id->id.route.addr.dev_addr, NULL);
+	if (ret) {
+		mutex_unlock(&conn_id->handler_mutex);
+		rdma_destroy_id(new_cm_id);
+		goto out;
+	}
+
+	ret = cma_acquire_dev(conn_id, listen_id);
+	if (ret) {
+		mutex_unlock(&conn_id->handler_mutex);
+		rdma_destroy_id(new_cm_id);
+		goto out;
+	}
+
+	conn_id->cm_id.iw = cm_id;
+	cm_id->context = conn_id;
+	cm_id->cm_handler = cma_iw_handler;
+
+	memcpy(cma_src_addr(conn_id), laddr, rdma_addr_size(laddr));
+	memcpy(cma_dst_addr(conn_id), raddr, rdma_addr_size(raddr));
+
+	ret = ib_query_device(conn_id->id.device, &attr);
+	if (ret) {
+		mutex_unlock(&conn_id->handler_mutex);
+		rdma_destroy_id(new_cm_id);
+		goto out;
+	}
+
+	memset(&event, 0, sizeof event);
+	event.event = RDMA_CM_EVENT_CONNECT_REQUEST;
+	event.param.conn.private_data = iw_event->private_data;
+	event.param.conn.private_data_len = iw_event->private_data_len;
+	event.param.conn.initiator_depth = iw_event->ird;
+	event.param.conn.responder_resources = iw_event->ord;
+
+	/*
+	 * Protect against the user destroying conn_id from another thread
+	 * until we're done accessing it.
+	 */
+	atomic_inc(&conn_id->refcount);
+	ret = conn_id->id.event_handler(&conn_id->id, &event);
+	if (ret) {
+		/* User wants to destroy the CM ID */
+		conn_id->cm_id.iw = NULL;
+		cma_exch(conn_id, RDMA_CM_DESTROYING);
+		mutex_unlock(&conn_id->handler_mutex);
+		cma_deref_id(conn_id);
+		rdma_destroy_id(&conn_id->id);
+		goto out;
+	}
+
+	mutex_unlock(&conn_id->handler_mutex);
+	cma_deref_id(conn_id);
+
+out:
+	mutex_unlock(&listen_id->handler_mutex);
+	return ret;
+}
+
+static int cma_ib_listen(struct rdma_id_private *id_priv)
+{
+	struct ib_cm_compare_data compare_data;
+	struct sockaddr *addr;
+	struct ib_cm_id	*id;
+	__be64 svc_id;
+	int ret;
+
+	id = ib_create_cm_id(id_priv->id.device, cma_req_handler, id_priv);
+	if (IS_ERR(id))
+		return PTR_ERR(id);
+
+	id_priv->cm_id.ib = id;
+
+	addr = cma_src_addr(id_priv);
+	svc_id = rdma_get_service_id(&id_priv->id, addr);
+	if (cma_any_addr(addr) && !id_priv->afonly)
+		ret = ib_cm_listen(id_priv->cm_id.ib, svc_id, 0, NULL);
+	else {
+		cma_set_compare_data(id_priv->id.ps, addr, &compare_data);
+		ret = ib_cm_listen(id_priv->cm_id.ib, svc_id, 0, &compare_data);
+	}
+
+	if (ret) {
+		ib_destroy_cm_id(id_priv->cm_id.ib);
+		id_priv->cm_id.ib = NULL;
+	}
+
+	return ret;
+}
+
+static int cma_iw_listen(struct rdma_id_private *id_priv, int backlog)
+{
+	int ret;
+	struct iw_cm_id	*id;
+
+	id = iw_create_cm_id(id_priv->id.device,
+			     iw_conn_req_handler,
+			     id_priv);
+	if (IS_ERR(id))
+		return PTR_ERR(id);
+
+	id_priv->cm_id.iw = id;
+
+	memcpy(&id_priv->cm_id.iw->local_addr, cma_src_addr(id_priv),
+	       rdma_addr_size(cma_src_addr(id_priv)));
+
+	ret = iw_cm_listen(id_priv->cm_id.iw, backlog);
+
+	if (ret) {
+		iw_destroy_cm_id(id_priv->cm_id.iw);
+		id_priv->cm_id.iw = NULL;
+	}
+
+	return ret;
+}
+
+static int cma_listen_handler(struct rdma_cm_id *id,
+			      struct rdma_cm_event *event)
+{
+	struct rdma_id_private *id_priv = id->context;
+
+	id->context = id_priv->id.context;
+	id->event_handler = id_priv->id.event_handler;
+	return id_priv->id.event_handler(id, event);
+}
+
+static void cma_listen_on_dev(struct rdma_id_private *id_priv,
+			      struct cma_device *cma_dev)
+{
+	struct rdma_id_private *dev_id_priv;
+	struct rdma_cm_id *id;
+	int ret;
+
+	if (cma_family(id_priv) == AF_IB &&
+	    rdma_node_get_transport(cma_dev->device->node_type) != RDMA_TRANSPORT_IB)
+		return;
+
+	id = rdma_create_id(cma_listen_handler, id_priv, id_priv->id.ps,
+			    id_priv->id.qp_type);
+	if (IS_ERR(id))
+		return;
+
+	dev_id_priv = container_of(id, struct rdma_id_private, id);
+
+	dev_id_priv->state = RDMA_CM_ADDR_BOUND;
+	memcpy(cma_src_addr(dev_id_priv), cma_src_addr(id_priv),
+	       rdma_addr_size(cma_src_addr(id_priv)));
+
+	cma_attach_to_dev(dev_id_priv, cma_dev);
+	list_add_tail(&dev_id_priv->listen_list, &id_priv->listen_list);
+	atomic_inc(&id_priv->refcount);
+	dev_id_priv->internal_id = 1;
+	dev_id_priv->afonly = id_priv->afonly;
+
+	ret = rdma_listen(id, id_priv->backlog);
+	if (ret)
+		printk(KERN_WARNING "RDMA CMA: cma_listen_on_dev, error %d, "
+		       "listening on device %s\n", ret, cma_dev->device->name);
+}
+
+static void cma_listen_on_all(struct rdma_id_private *id_priv)
+{
+	struct cma_device *cma_dev;
+
+	mutex_lock(&lock);
+	list_add_tail(&id_priv->list, &listen_any_list);
+	list_for_each_entry(cma_dev, &dev_list, list)
+		cma_listen_on_dev(id_priv, cma_dev);
+	mutex_unlock(&lock);
+}
+
+void rdma_set_service_type(struct rdma_cm_id *id, int tos)
+{
+	struct rdma_id_private *id_priv;
+
+	id_priv = container_of(id, struct rdma_id_private, id);
+	id_priv->tos = (u8) tos;
+}
+EXPORT_SYMBOL(rdma_set_service_type);
+
+static void cma_query_handler(int status, struct ib_sa_path_rec *path_rec,
+			      void *context)
+{
+	struct cma_work *work = context;
+	struct rdma_route *route;
+
+	route = &work->id->id.route;
+
+	if (!status) {
+		route->num_paths = 1;
+		*route->path_rec = *path_rec;
+	} else {
+		work->old_state = RDMA_CM_ROUTE_QUERY;
+		work->new_state = RDMA_CM_ADDR_RESOLVED;
+		work->event.event = RDMA_CM_EVENT_ROUTE_ERROR;
+		work->event.status = status;
+	}
+
+	queue_work(cma_wq, &work->work);
+}
+
+static int cma_query_ib_route(struct rdma_id_private *id_priv, int timeout_ms,
+			      struct cma_work *work)
+{
+	struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
+	struct ib_sa_path_rec path_rec;
+	ib_sa_comp_mask comp_mask;
+	struct sockaddr_in6 *sin6;
+	struct sockaddr_ib *sib;
+
+	memset(&path_rec, 0, sizeof path_rec);
+	rdma_addr_get_sgid(dev_addr, &path_rec.sgid);
+	rdma_addr_get_dgid(dev_addr, &path_rec.dgid);
+	path_rec.pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr));
+	path_rec.numb_path = 1;
+	path_rec.reversible = 1;
+	path_rec.service_id = rdma_get_service_id(&id_priv->id, cma_dst_addr(id_priv));
+
+	comp_mask = IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID |
+		    IB_SA_PATH_REC_PKEY | IB_SA_PATH_REC_NUMB_PATH |
+		    IB_SA_PATH_REC_REVERSIBLE | IB_SA_PATH_REC_SERVICE_ID;
+
+	switch (cma_family(id_priv)) {
+	case AF_INET:
+		path_rec.qos_class = cpu_to_be16((u16) id_priv->tos);
+		comp_mask |= IB_SA_PATH_REC_QOS_CLASS;
+		break;
+	case AF_INET6:
+		sin6 = (struct sockaddr_in6 *) cma_src_addr(id_priv);
+		path_rec.traffic_class = (u8) (be32_to_cpu(sin6->sin6_flowinfo) >> 20);
+		comp_mask |= IB_SA_PATH_REC_TRAFFIC_CLASS;
+		break;
+	case AF_IB:
+		sib = (struct sockaddr_ib *) cma_src_addr(id_priv);
+		path_rec.traffic_class = (u8) (be32_to_cpu(sib->sib_flowinfo) >> 20);
+		comp_mask |= IB_SA_PATH_REC_TRAFFIC_CLASS;
+		break;
+	}
+
+	id_priv->query_id = ib_sa_path_rec_get(&sa_client, id_priv->id.device,
+					       id_priv->id.port_num, &path_rec,
+					       comp_mask, timeout_ms,
+					       GFP_KERNEL, cma_query_handler,
+					       work, &id_priv->query);
+
+	return (id_priv->query_id < 0) ? id_priv->query_id : 0;
+}
+
+static void cma_work_handler(struct work_struct *_work)
+{
+	struct cma_work *work = container_of(_work, struct cma_work, work);
+	struct rdma_id_private *id_priv = work->id;
+	int destroy = 0;
+
+	mutex_lock(&id_priv->handler_mutex);
+	if (!cma_comp_exch(id_priv, work->old_state, work->new_state))
+		goto out;
+
+	if (id_priv->id.event_handler(&id_priv->id, &work->event)) {
+		cma_exch(id_priv, RDMA_CM_DESTROYING);
+		destroy = 1;
+	}
+out:
+	mutex_unlock(&id_priv->handler_mutex);
+	cma_deref_id(id_priv);
+	if (destroy)
+		rdma_destroy_id(&id_priv->id);
+	kfree(work);
+}
+
+static void cma_ndev_work_handler(struct work_struct *_work)
+{
+	struct cma_ndev_work *work = container_of(_work, struct cma_ndev_work, work);
+	struct rdma_id_private *id_priv = work->id;
+	int destroy = 0;
+
+	mutex_lock(&id_priv->handler_mutex);
+	if (id_priv->state == RDMA_CM_DESTROYING ||
+	    id_priv->state == RDMA_CM_DEVICE_REMOVAL)
+		goto out;
+
+	if (id_priv->id.event_handler(&id_priv->id, &work->event)) {
+		cma_exch(id_priv, RDMA_CM_DESTROYING);
+		destroy = 1;
+	}
+
+out:
+	mutex_unlock(&id_priv->handler_mutex);
+	cma_deref_id(id_priv);
+	if (destroy)
+		rdma_destroy_id(&id_priv->id);
+	kfree(work);
+}
+
+static int cma_resolve_ib_route(struct rdma_id_private *id_priv, int timeout_ms)
+{
+	struct rdma_route *route = &id_priv->id.route;
+	struct cma_work *work;
+	int ret;
+
+	work = kzalloc(sizeof *work, GFP_KERNEL);
+	if (!work)
+		return -ENOMEM;
+
+	work->id = id_priv;
+	INIT_WORK(&work->work, cma_work_handler);
+	work->old_state = RDMA_CM_ROUTE_QUERY;
+	work->new_state = RDMA_CM_ROUTE_RESOLVED;
+	work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED;
+
+	route->path_rec = kmalloc(sizeof *route->path_rec, GFP_KERNEL);
+	if (!route->path_rec) {
+		ret = -ENOMEM;
+		goto err1;
+	}
+
+	ret = cma_query_ib_route(id_priv, timeout_ms, work);
+	if (ret)
+		goto err2;
+
+	return 0;
+err2:
+	kfree(route->path_rec);
+	route->path_rec = NULL;
+err1:
+	kfree(work);
+	return ret;
+}
+
+int rdma_set_ib_paths(struct rdma_cm_id *id,
+		      struct ib_sa_path_rec *path_rec, int num_paths)
+{
+	struct rdma_id_private *id_priv;
+	int ret;
+
+	id_priv = container_of(id, struct rdma_id_private, id);
+	if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED,
+			   RDMA_CM_ROUTE_RESOLVED))
+		return -EINVAL;
+
+	id->route.path_rec = kmemdup(path_rec, sizeof *path_rec * num_paths,
+				     GFP_KERNEL);
+	if (!id->route.path_rec) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	id->route.num_paths = num_paths;
+	return 0;
+err:
+	cma_comp_exch(id_priv, RDMA_CM_ROUTE_RESOLVED, RDMA_CM_ADDR_RESOLVED);
+	return ret;
+}
+EXPORT_SYMBOL(rdma_set_ib_paths);
+
+static int cma_resolve_iw_route(struct rdma_id_private *id_priv, int timeout_ms)
+{
+	struct cma_work *work;
+
+	work = kzalloc(sizeof *work, GFP_KERNEL);
+	if (!work)
+		return -ENOMEM;
+
+	work->id = id_priv;
+	INIT_WORK(&work->work, cma_work_handler);
+	work->old_state = RDMA_CM_ROUTE_QUERY;
+	work->new_state = RDMA_CM_ROUTE_RESOLVED;
+	work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED;
+	queue_work(cma_wq, &work->work);
+	return 0;
+}
+
+static int iboe_tos_to_sl(struct net_device *ndev, int tos)
+{
+	int prio;
+	struct net_device *dev;
+
+	prio = rt_tos2priority(tos);
+	dev = ndev->priv_flags & IFF_802_1Q_VLAN ?
+		vlan_dev_real_dev(ndev) : ndev;
+
+	if (dev->num_tc)
+		return netdev_get_prio_tc_map(dev, prio);
+
+#if IS_ENABLED(CONFIG_VLAN_8021Q)
+	if (ndev->priv_flags & IFF_802_1Q_VLAN)
+		return (vlan_dev_get_egress_qos_mask(ndev, prio) &
+			VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT;
+#endif
+	return 0;
+}
+
+static int cma_resolve_iboe_route(struct rdma_id_private *id_priv)
+{
+	struct rdma_route *route = &id_priv->id.route;
+	struct rdma_addr *addr = &route->addr;
+	struct cma_work *work;
+	int ret;
+	struct net_device *ndev = NULL;
+
+
+	work = kzalloc(sizeof *work, GFP_KERNEL);
+	if (!work)
+		return -ENOMEM;
+
+	work->id = id_priv;
+	INIT_WORK(&work->work, cma_work_handler);
+
+	route->path_rec = kzalloc(sizeof *route->path_rec, GFP_KERNEL);
+	if (!route->path_rec) {
+		ret = -ENOMEM;
+		goto err1;
+	}
+
+	route->num_paths = 1;
+
+	if (addr->dev_addr.bound_dev_if)
+		ndev = dev_get_by_index(&init_net, addr->dev_addr.bound_dev_if);
+	if (!ndev) {
+		ret = -ENODEV;
+		goto err2;
+	}
+
+	route->path_rec->vlan_id = rdma_vlan_dev_vlan_id(ndev);
+	memcpy(route->path_rec->dmac, addr->dev_addr.dst_dev_addr, ETH_ALEN);
+	memcpy(route->path_rec->smac, ndev->dev_addr, ndev->addr_len);
+
+	rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr,
+		    &route->path_rec->sgid);
+	rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.dst_addr,
+		    &route->path_rec->dgid);
+
+	route->path_rec->hop_limit = 1;
+	route->path_rec->reversible = 1;
+	route->path_rec->pkey = cpu_to_be16(0xffff);
+	route->path_rec->mtu_selector = IB_SA_EQ;
+	route->path_rec->sl = iboe_tos_to_sl(ndev, id_priv->tos);
+	route->path_rec->mtu = iboe_get_mtu(ndev->mtu);
+	route->path_rec->rate_selector = IB_SA_EQ;
+	route->path_rec->rate = iboe_get_rate(ndev);
+	dev_put(ndev);
+	route->path_rec->packet_life_time_selector = IB_SA_EQ;
+	route->path_rec->packet_life_time = CMA_IBOE_PACKET_LIFETIME;
+	if (!route->path_rec->mtu) {
+		ret = -EINVAL;
+		goto err2;
+	}
+
+	work->old_state = RDMA_CM_ROUTE_QUERY;
+	work->new_state = RDMA_CM_ROUTE_RESOLVED;
+	work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED;
+	work->event.status = 0;
+
+	queue_work(cma_wq, &work->work);
+
+	return 0;
+
+err2:
+	kfree(route->path_rec);
+	route->path_rec = NULL;
+err1:
+	kfree(work);
+	return ret;
+}
+
+int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms)
+{
+	struct rdma_id_private *id_priv;
+	int ret;
+
+	id_priv = container_of(id, struct rdma_id_private, id);
+	if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, RDMA_CM_ROUTE_QUERY))
+		return -EINVAL;
+
+	atomic_inc(&id_priv->refcount);
+	switch (rdma_node_get_transport(id->device->node_type)) {
+	case RDMA_TRANSPORT_IB:
+		switch (rdma_port_get_link_layer(id->device, id->port_num)) {
+		case IB_LINK_LAYER_INFINIBAND:
+			ret = cma_resolve_ib_route(id_priv, timeout_ms);
+			break;
+		case IB_LINK_LAYER_ETHERNET:
+			ret = cma_resolve_iboe_route(id_priv);
+			break;
+		default:
+			ret = -ENOSYS;
+		}
+		break;
+	case RDMA_TRANSPORT_IWARP:
+		ret = cma_resolve_iw_route(id_priv, timeout_ms);
+		break;
+	default:
+		ret = -ENOSYS;
+		break;
+	}
+	if (ret)
+		goto err;
+
+	return 0;
+err:
+	cma_comp_exch(id_priv, RDMA_CM_ROUTE_QUERY, RDMA_CM_ADDR_RESOLVED);
+	cma_deref_id(id_priv);
+	return ret;
+}
+EXPORT_SYMBOL(rdma_resolve_route);
+
+static void cma_set_loopback(struct sockaddr *addr)
+{
+	switch (addr->sa_family) {
+	case AF_INET:
+		((struct sockaddr_in *) addr)->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+		break;
+	case AF_INET6:
+		ipv6_addr_set(&((struct sockaddr_in6 *) addr)->sin6_addr,
+			      0, 0, 0, htonl(1));
+		break;
+	default:
+		ib_addr_set(&((struct sockaddr_ib *) addr)->sib_addr,
+			    0, 0, 0, htonl(1));
+		break;
+	}
+}
+
+static int cma_bind_loopback(struct rdma_id_private *id_priv)
+{
+	struct cma_device *cma_dev, *cur_dev;
+	struct ib_port_attr port_attr;
+	union ib_gid gid;
+	u16 pkey;
+	int ret;
+	u8 p;
+
+	cma_dev = NULL;
+	mutex_lock(&lock);
+	list_for_each_entry(cur_dev, &dev_list, list) {
+		if (cma_family(id_priv) == AF_IB &&
+		    rdma_node_get_transport(cur_dev->device->node_type) != RDMA_TRANSPORT_IB)
+			continue;
+
+		if (!cma_dev)
+			cma_dev = cur_dev;
+
+		for (p = 1; p <= cur_dev->device->phys_port_cnt; ++p) {
+			if (!ib_query_port(cur_dev->device, p, &port_attr) &&
+			    port_attr.state == IB_PORT_ACTIVE) {
+				cma_dev = cur_dev;
+				goto port_found;
+			}
+		}
+	}
+
+	if (!cma_dev) {
+		ret = -ENODEV;
+		goto out;
+	}
+
+	p = 1;
+
+port_found:
+	ret = ib_get_cached_gid(cma_dev->device, p, 0, &gid);
+	if (ret)
+		goto out;
+
+	ret = ib_get_cached_pkey(cma_dev->device, p, 0, &pkey);
+	if (ret)
+		goto out;
+
+	id_priv->id.route.addr.dev_addr.dev_type =
+		(rdma_port_get_link_layer(cma_dev->device, p) == IB_LINK_LAYER_INFINIBAND) ?
+		ARPHRD_INFINIBAND : ARPHRD_ETHER;
+
+	rdma_addr_set_sgid(&id_priv->id.route.addr.dev_addr, &gid);
+	ib_addr_set_pkey(&id_priv->id.route.addr.dev_addr, pkey);
+	id_priv->id.port_num = p;
+	cma_attach_to_dev(id_priv, cma_dev);
+	cma_set_loopback(cma_src_addr(id_priv));
+out:
+	mutex_unlock(&lock);
+	return ret;
+}
+
+static void addr_handler(int status, struct sockaddr *src_addr,
+			 struct rdma_dev_addr *dev_addr, void *context)
+{
+	struct rdma_id_private *id_priv = context;
+	struct rdma_cm_event event;
+
+	memset(&event, 0, sizeof event);
+	mutex_lock(&id_priv->handler_mutex);
+	if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY,
+			   RDMA_CM_ADDR_RESOLVED))
+		goto out;
+
+	memcpy(cma_src_addr(id_priv), src_addr, rdma_addr_size(src_addr));
+	if (!status && !id_priv->cma_dev)
+		status = cma_acquire_dev(id_priv, NULL);
+
+	if (status) {
+		if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED,
+				   RDMA_CM_ADDR_BOUND))
+			goto out;
+		event.event = RDMA_CM_EVENT_ADDR_ERROR;
+		event.status = status;
+	} else
+		event.event = RDMA_CM_EVENT_ADDR_RESOLVED;
+
+	if (id_priv->id.event_handler(&id_priv->id, &event)) {
+		cma_exch(id_priv, RDMA_CM_DESTROYING);
+		mutex_unlock(&id_priv->handler_mutex);
+		cma_deref_id(id_priv);
+		rdma_destroy_id(&id_priv->id);
+		return;
+	}
+out:
+	mutex_unlock(&id_priv->handler_mutex);
+	cma_deref_id(id_priv);
+}
+
+static int cma_resolve_loopback(struct rdma_id_private *id_priv)
+{
+	struct cma_work *work;
+	union ib_gid gid;
+	int ret;
+
+	work = kzalloc(sizeof *work, GFP_KERNEL);
+	if (!work)
+		return -ENOMEM;
+
+	if (!id_priv->cma_dev) {
+		ret = cma_bind_loopback(id_priv);
+		if (ret)
+			goto err;
+	}
+
+	rdma_addr_get_sgid(&id_priv->id.route.addr.dev_addr, &gid);
+	rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr, &gid);
+
+	work->id = id_priv;
+	INIT_WORK(&work->work, cma_work_handler);
+	work->old_state = RDMA_CM_ADDR_QUERY;
+	work->new_state = RDMA_CM_ADDR_RESOLVED;
+	work->event.event = RDMA_CM_EVENT_ADDR_RESOLVED;
+	queue_work(cma_wq, &work->work);
+	return 0;
+err:
+	kfree(work);
+	return ret;
+}
+
+static int cma_resolve_ib_addr(struct rdma_id_private *id_priv)
+{
+	struct cma_work *work;
+	int ret;
+
+	work = kzalloc(sizeof *work, GFP_KERNEL);
+	if (!work)
+		return -ENOMEM;
+
+	if (!id_priv->cma_dev) {
+		ret = cma_resolve_ib_dev(id_priv);
+		if (ret)
+			goto err;
+	}
+
+	rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr, (union ib_gid *)
+		&(((struct sockaddr_ib *) &id_priv->id.route.addr.dst_addr)->sib_addr));
+
+	work->id = id_priv;
+	INIT_WORK(&work->work, cma_work_handler);
+	work->old_state = RDMA_CM_ADDR_QUERY;
+	work->new_state = RDMA_CM_ADDR_RESOLVED;
+	work->event.event = RDMA_CM_EVENT_ADDR_RESOLVED;
+	queue_work(cma_wq, &work->work);
+	return 0;
+err:
+	kfree(work);
+	return ret;
+}
+
+static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,
+			 struct sockaddr *dst_addr)
+{
+	if (!src_addr || !src_addr->sa_family) {
+		src_addr = (struct sockaddr *) &id->route.addr.src_addr;
+		src_addr->sa_family = dst_addr->sa_family;
+		if (dst_addr->sa_family == AF_INET6) {
+			((struct sockaddr_in6 *) src_addr)->sin6_scope_id =
+				((struct sockaddr_in6 *) dst_addr)->sin6_scope_id;
+		} else if (dst_addr->sa_family == AF_IB) {
+			((struct sockaddr_ib *) src_addr)->sib_pkey =
+				((struct sockaddr_ib *) dst_addr)->sib_pkey;
+		}
+	}
+	return rdma_bind_addr(id, src_addr);
+}
+
+int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,
+		      struct sockaddr *dst_addr, int timeout_ms)
+{
+	struct rdma_id_private *id_priv;
+	int ret;
+
+	id_priv = container_of(id, struct rdma_id_private, id);
+	if (id_priv->state == RDMA_CM_IDLE) {
+		ret = cma_bind_addr(id, src_addr, dst_addr);
+		if (ret)
+			return ret;
+	}
+
+	if (cma_family(id_priv) != dst_addr->sa_family)
+		return -EINVAL;
+
+	if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_ADDR_QUERY))
+		return -EINVAL;
+
+	atomic_inc(&id_priv->refcount);
+	memcpy(cma_dst_addr(id_priv), dst_addr, rdma_addr_size(dst_addr));
+	if (cma_any_addr(dst_addr)) {
+		ret = cma_resolve_loopback(id_priv);
+	} else {
+		if (dst_addr->sa_family == AF_IB) {
+			ret = cma_resolve_ib_addr(id_priv);
+		} else {
+			ret = rdma_resolve_ip(&addr_client, cma_src_addr(id_priv),
+					      dst_addr, &id->route.addr.dev_addr,
+					      timeout_ms, addr_handler, id_priv);
+		}
+	}
+	if (ret)
+		goto err;
+
+	return 0;
+err:
+	cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_BOUND);
+	cma_deref_id(id_priv);
+	return ret;
+}
+EXPORT_SYMBOL(rdma_resolve_addr);
+
+int rdma_set_reuseaddr(struct rdma_cm_id *id, int reuse)
+{
+	struct rdma_id_private *id_priv;
+	unsigned long flags;
+	int ret;
+
+	id_priv = container_of(id, struct rdma_id_private, id);
+	spin_lock_irqsave(&id_priv->lock, flags);
+	if (reuse || id_priv->state == RDMA_CM_IDLE) {
+		id_priv->reuseaddr = reuse;
+		ret = 0;
+	} else {
+		ret = -EINVAL;
+	}
+	spin_unlock_irqrestore(&id_priv->lock, flags);
+	return ret;
+}
+EXPORT_SYMBOL(rdma_set_reuseaddr);
+
+int rdma_set_afonly(struct rdma_cm_id *id, int afonly)
+{
+	struct rdma_id_private *id_priv;
+	unsigned long flags;
+	int ret;
+
+	id_priv = container_of(id, struct rdma_id_private, id);
+	spin_lock_irqsave(&id_priv->lock, flags);
+	if (id_priv->state == RDMA_CM_IDLE || id_priv->state == RDMA_CM_ADDR_BOUND) {
+		id_priv->options |= (1 << CMA_OPTION_AFONLY);
+		id_priv->afonly = afonly;
+		ret = 0;
+	} else {
+		ret = -EINVAL;
+	}
+	spin_unlock_irqrestore(&id_priv->lock, flags);
+	return ret;
+}
+EXPORT_SYMBOL(rdma_set_afonly);
+
+static void cma_bind_port(struct rdma_bind_list *bind_list,
+			  struct rdma_id_private *id_priv)
+{
+	struct sockaddr *addr;
+	struct sockaddr_ib *sib;
+	u64 sid, mask;
+	__be16 port;
+
+	addr = cma_src_addr(id_priv);
+	port = htons(bind_list->port);
+
+	switch (addr->sa_family) {
+	case AF_INET:
+		((struct sockaddr_in *) addr)->sin_port = port;
+		break;
+	case AF_INET6:
+		((struct sockaddr_in6 *) addr)->sin6_port = port;
+		break;
+	case AF_IB:
+		sib = (struct sockaddr_ib *) addr;
+		sid = be64_to_cpu(sib->sib_sid);
+		mask = be64_to_cpu(sib->sib_sid_mask);
+		sib->sib_sid = cpu_to_be64((sid & mask) | (u64) ntohs(port));
+		sib->sib_sid_mask = cpu_to_be64(~0ULL);
+		break;
+	}
+	id_priv->bind_list = bind_list;
+	hlist_add_head(&id_priv->node, &bind_list->owners);
+}
+
+static int cma_alloc_port(struct idr *ps, struct rdma_id_private *id_priv,
+			  unsigned short snum)
+{
+	struct rdma_bind_list *bind_list;
+	int ret;
+
+	bind_list = kzalloc(sizeof *bind_list, GFP_KERNEL);
+	if (!bind_list)
+		return -ENOMEM;
+
+	ret = idr_alloc(ps, bind_list, snum, snum + 1, GFP_KERNEL);
+	if (ret < 0)
+		goto err;
+
+	bind_list->ps = ps;
+	bind_list->port = (unsigned short)ret;
+	cma_bind_port(bind_list, id_priv);
+	return 0;
+err:
+	kfree(bind_list);
+	return ret == -ENOSPC ? -EADDRNOTAVAIL : ret;
+}
+
+static int cma_alloc_any_port(struct idr *ps, struct rdma_id_private *id_priv)
+{
+	static unsigned int last_used_port;
+	int low, high, remaining;
+	unsigned int rover;
+
+	inet_get_local_port_range(&init_net, &low, &high);
+	remaining = (high - low) + 1;
+	rover = prandom_u32() % remaining + low;
+retry:
+	if (last_used_port != rover &&
+	    !idr_find(ps, (unsigned short) rover)) {
+		int ret = cma_alloc_port(ps, id_priv, rover);
+		/*
+		 * Remember previously used port number in order to avoid
+		 * re-using same port immediately after it is closed.
+		 */
+		if (!ret)
+			last_used_port = rover;
+		if (ret != -EADDRNOTAVAIL)
+			return ret;
+	}
+	if (--remaining) {
+		rover++;
+		if ((rover < low) || (rover > high))
+			rover = low;
+		goto retry;
+	}
+	return -EADDRNOTAVAIL;
+}
+
+/*
+ * Check that the requested port is available.  This is called when trying to
+ * bind to a specific port, or when trying to listen on a bound port.  In
+ * the latter case, the provided id_priv may already be on the bind_list, but
+ * we still need to check that it's okay to start listening.
+ */
+static int cma_check_port(struct rdma_bind_list *bind_list,
+			  struct rdma_id_private *id_priv, uint8_t reuseaddr)
+{
+	struct rdma_id_private *cur_id;
+	struct sockaddr *addr, *cur_addr;
+
+	addr = cma_src_addr(id_priv);
+	hlist_for_each_entry(cur_id, &bind_list->owners, node) {
+		if (id_priv == cur_id)
+			continue;
+
+		if ((cur_id->state != RDMA_CM_LISTEN) && reuseaddr &&
+		    cur_id->reuseaddr)
+			continue;
+
+		cur_addr = cma_src_addr(cur_id);
+		if (id_priv->afonly && cur_id->afonly &&
+		    (addr->sa_family != cur_addr->sa_family))
+			continue;
+
+		if (cma_any_addr(addr) || cma_any_addr(cur_addr))
+			return -EADDRNOTAVAIL;
+
+		if (!cma_addr_cmp(addr, cur_addr))
+			return -EADDRINUSE;
+	}
+	return 0;
+}
+
+static int cma_use_port(struct idr *ps, struct rdma_id_private *id_priv)
+{
+	struct rdma_bind_list *bind_list;
+	unsigned short snum;
+	int ret;
+
+	snum = ntohs(cma_port(cma_src_addr(id_priv)));
+	if (snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE))
+		return -EACCES;
+
+	bind_list = idr_find(ps, snum);
+	if (!bind_list) {
+		ret = cma_alloc_port(ps, id_priv, snum);
+	} else {
+		ret = cma_check_port(bind_list, id_priv, id_priv->reuseaddr);
+		if (!ret)
+			cma_bind_port(bind_list, id_priv);
+	}
+	return ret;
+}
+
+static int cma_bind_listen(struct rdma_id_private *id_priv)
+{
+	struct rdma_bind_list *bind_list = id_priv->bind_list;
+	int ret = 0;
+
+	mutex_lock(&lock);
+	if (bind_list->owners.first->next)
+		ret = cma_check_port(bind_list, id_priv, 0);
+	mutex_unlock(&lock);
+	return ret;
+}
+
+static struct idr *cma_select_inet_ps(struct rdma_id_private *id_priv)
+{
+	switch (id_priv->id.ps) {
+	case RDMA_PS_TCP:
+		return &tcp_ps;
+	case RDMA_PS_UDP:
+		return &udp_ps;
+	case RDMA_PS_IPOIB:
+		return &ipoib_ps;
+	case RDMA_PS_IB:
+		return &ib_ps;
+	default:
+		return NULL;
+	}
+}
+
+static struct idr *cma_select_ib_ps(struct rdma_id_private *id_priv)
+{
+	struct idr *ps = NULL;
+	struct sockaddr_ib *sib;
+	u64 sid_ps, mask, sid;
+
+	sib = (struct sockaddr_ib *) cma_src_addr(id_priv);
+	mask = be64_to_cpu(sib->sib_sid_mask) & RDMA_IB_IP_PS_MASK;
+	sid = be64_to_cpu(sib->sib_sid) & mask;
+
+	if ((id_priv->id.ps == RDMA_PS_IB) && (sid == (RDMA_IB_IP_PS_IB & mask))) {
+		sid_ps = RDMA_IB_IP_PS_IB;
+		ps = &ib_ps;
+	} else if (((id_priv->id.ps == RDMA_PS_IB) || (id_priv->id.ps == RDMA_PS_TCP)) &&
+		   (sid == (RDMA_IB_IP_PS_TCP & mask))) {
+		sid_ps = RDMA_IB_IP_PS_TCP;
+		ps = &tcp_ps;
+	} else if (((id_priv->id.ps == RDMA_PS_IB) || (id_priv->id.ps == RDMA_PS_UDP)) &&
+		   (sid == (RDMA_IB_IP_PS_UDP & mask))) {
+		sid_ps = RDMA_IB_IP_PS_UDP;
+		ps = &udp_ps;
+	}
+
+	if (ps) {
+		sib->sib_sid = cpu_to_be64(sid_ps | ntohs(cma_port((struct sockaddr *) sib)));
+		sib->sib_sid_mask = cpu_to_be64(RDMA_IB_IP_PS_MASK |
+						be64_to_cpu(sib->sib_sid_mask));
+	}
+	return ps;
+}
+
+static int cma_get_port(struct rdma_id_private *id_priv)
+{
+	struct idr *ps;
+	int ret;
+
+	if (cma_family(id_priv) != AF_IB)
+		ps = cma_select_inet_ps(id_priv);
+	else
+		ps = cma_select_ib_ps(id_priv);
+	if (!ps)
+		return -EPROTONOSUPPORT;
+
+	mutex_lock(&lock);
+	if (cma_any_port(cma_src_addr(id_priv)))
+		ret = cma_alloc_any_port(ps, id_priv);
+	else
+		ret = cma_use_port(ps, id_priv);
+	mutex_unlock(&lock);
+
+	return ret;
+}
+
+static int cma_check_linklocal(struct rdma_dev_addr *dev_addr,
+			       struct sockaddr *addr)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+	struct sockaddr_in6 *sin6;
+
+	if (addr->sa_family != AF_INET6)
+		return 0;
+
+	sin6 = (struct sockaddr_in6 *) addr;
+
+	if (!(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL))
+		return 0;
+
+	if (!sin6->sin6_scope_id)
+			return -EINVAL;
+
+	dev_addr->bound_dev_if = sin6->sin6_scope_id;
+#endif
+	return 0;
+}
+
+int rdma_listen(struct rdma_cm_id *id, int backlog)
+{
+	struct rdma_id_private *id_priv;
+	int ret;
+
+	id_priv = container_of(id, struct rdma_id_private, id);
+	if (id_priv->state == RDMA_CM_IDLE) {
+		id->route.addr.src_addr.ss_family = AF_INET;
+		ret = rdma_bind_addr(id, cma_src_addr(id_priv));
+		if (ret)
+			return ret;
+	}
+
+	if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_LISTEN))
+		return -EINVAL;
+
+	if (id_priv->reuseaddr) {
+		ret = cma_bind_listen(id_priv);
+		if (ret)
+			goto err;
+	}
+
+	id_priv->backlog = backlog;
+	if (id->device) {
+		switch (rdma_node_get_transport(id->device->node_type)) {
+		case RDMA_TRANSPORT_IB:
+			ret = cma_ib_listen(id_priv);
+			if (ret)
+				goto err;
+			break;
+		case RDMA_TRANSPORT_IWARP:
+			ret = cma_iw_listen(id_priv, backlog);
+			if (ret)
+				goto err;
+			break;
+		default:
+			ret = -ENOSYS;
+			goto err;
+		}
+	} else
+		cma_listen_on_all(id_priv);
+
+	return 0;
+err:
+	id_priv->backlog = 0;
+	cma_comp_exch(id_priv, RDMA_CM_LISTEN, RDMA_CM_ADDR_BOUND);
+	return ret;
+}
+EXPORT_SYMBOL(rdma_listen);
+
+int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr)
+{
+	struct rdma_id_private *id_priv;
+	int ret;
+
+	if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6 &&
+	    addr->sa_family != AF_IB)
+		return -EAFNOSUPPORT;
+
+	id_priv = container_of(id, struct rdma_id_private, id);
+	if (!cma_comp_exch(id_priv, RDMA_CM_IDLE, RDMA_CM_ADDR_BOUND))
+		return -EINVAL;
+
+	ret = cma_check_linklocal(&id->route.addr.dev_addr, addr);
+	if (ret)
+		goto err1;
+
+	memcpy(cma_src_addr(id_priv), addr, rdma_addr_size(addr));
+	if (!cma_any_addr(addr)) {
+		ret = cma_translate_addr(addr, &id->route.addr.dev_addr);
+		if (ret)
+			goto err1;
+
+		ret = cma_acquire_dev(id_priv, NULL);
+		if (ret)
+			goto err1;
+	}
+
+	if (!(id_priv->options & (1 << CMA_OPTION_AFONLY))) {
+		if (addr->sa_family == AF_INET)
+			id_priv->afonly = 1;
+#if IS_ENABLED(CONFIG_IPV6)
+		else if (addr->sa_family == AF_INET6)
+			id_priv->afonly = init_net.ipv6.sysctl.bindv6only;
+#endif
+	}
+	ret = cma_get_port(id_priv);
+	if (ret)
+		goto err2;
+
+	return 0;
+err2:
+	if (id_priv->cma_dev)
+		cma_release_dev(id_priv);
+err1:
+	cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_IDLE);
+	return ret;
+}
+EXPORT_SYMBOL(rdma_bind_addr);
+
+static int cma_format_hdr(void *hdr, struct rdma_id_private *id_priv)
+{
+	struct cma_hdr *cma_hdr;
+
+	cma_hdr = hdr;
+	cma_hdr->cma_version = CMA_VERSION;
+	if (cma_family(id_priv) == AF_INET) {
+		struct sockaddr_in *src4, *dst4;
+
+		src4 = (struct sockaddr_in *) cma_src_addr(id_priv);
+		dst4 = (struct sockaddr_in *) cma_dst_addr(id_priv);
+
+		cma_set_ip_ver(cma_hdr, 4);
+		cma_hdr->src_addr.ip4.addr = src4->sin_addr.s_addr;
+		cma_hdr->dst_addr.ip4.addr = dst4->sin_addr.s_addr;
+		cma_hdr->port = src4->sin_port;
+	} else if (cma_family(id_priv) == AF_INET6) {
+		struct sockaddr_in6 *src6, *dst6;
+
+		src6 = (struct sockaddr_in6 *) cma_src_addr(id_priv);
+		dst6 = (struct sockaddr_in6 *) cma_dst_addr(id_priv);
+
+		cma_set_ip_ver(cma_hdr, 6);
+		cma_hdr->src_addr.ip6 = src6->sin6_addr;
+		cma_hdr->dst_addr.ip6 = dst6->sin6_addr;
+		cma_hdr->port = src6->sin6_port;
+	}
+	return 0;
+}
+
+static int cma_sidr_rep_handler(struct ib_cm_id *cm_id,
+				struct ib_cm_event *ib_event)
+{
+	struct rdma_id_private *id_priv = cm_id->context;
+	struct rdma_cm_event event;
+	struct ib_cm_sidr_rep_event_param *rep = &ib_event->param.sidr_rep_rcvd;
+	int ret = 0;
+
+	if (cma_disable_callback(id_priv, RDMA_CM_CONNECT))
+		return 0;
+
+	memset(&event, 0, sizeof event);
+	switch (ib_event->event) {
+	case IB_CM_SIDR_REQ_ERROR:
+		event.event = RDMA_CM_EVENT_UNREACHABLE;
+		event.status = -ETIMEDOUT;
+		break;
+	case IB_CM_SIDR_REP_RECEIVED:
+		event.param.ud.private_data = ib_event->private_data;
+		event.param.ud.private_data_len = IB_CM_SIDR_REP_PRIVATE_DATA_SIZE;
+		if (rep->status != IB_SIDR_SUCCESS) {
+			event.event = RDMA_CM_EVENT_UNREACHABLE;
+			event.status = ib_event->param.sidr_rep_rcvd.status;
+			break;
+		}
+		ret = cma_set_qkey(id_priv, rep->qkey);
+		if (ret) {
+			event.event = RDMA_CM_EVENT_ADDR_ERROR;
+			event.status = ret;
+			break;
+		}
+		ib_init_ah_from_path(id_priv->id.device, id_priv->id.port_num,
+				     id_priv->id.route.path_rec,
+				     &event.param.ud.ah_attr);
+		event.param.ud.qp_num = rep->qpn;
+		event.param.ud.qkey = rep->qkey;
+		event.event = RDMA_CM_EVENT_ESTABLISHED;
+		event.status = 0;
+		break;
+	default:
+		printk(KERN_ERR "RDMA CMA: unexpected IB CM event: %d\n",
+		       ib_event->event);
+		goto out;
+	}
+
+	ret = id_priv->id.event_handler(&id_priv->id, &event);
+	if (ret) {
+		/* Destroy the CM ID by returning a non-zero value. */
+		id_priv->cm_id.ib = NULL;
+		cma_exch(id_priv, RDMA_CM_DESTROYING);
+		mutex_unlock(&id_priv->handler_mutex);
+		rdma_destroy_id(&id_priv->id);
+		return ret;
+	}
+out:
+	mutex_unlock(&id_priv->handler_mutex);
+	return ret;
+}
+
+static int cma_resolve_ib_udp(struct rdma_id_private *id_priv,
+			      struct rdma_conn_param *conn_param)
+{
+	struct ib_cm_sidr_req_param req;
+	struct ib_cm_id	*id;
+	void *private_data;
+	int offset, ret;
+
+	memset(&req, 0, sizeof req);
+	offset = cma_user_data_offset(id_priv);
+	req.private_data_len = offset + conn_param->private_data_len;
+	if (req.private_data_len < conn_param->private_data_len)
+		return -EINVAL;
+
+	if (req.private_data_len) {
+		private_data = kzalloc(req.private_data_len, GFP_ATOMIC);
+		if (!private_data)
+			return -ENOMEM;
+	} else {
+		private_data = NULL;
+	}
+
+	if (conn_param->private_data && conn_param->private_data_len)
+		memcpy(private_data + offset, conn_param->private_data,
+		       conn_param->private_data_len);
+
+	if (private_data) {
+		ret = cma_format_hdr(private_data, id_priv);
+		if (ret)
+			goto out;
+		req.private_data = private_data;
+	}
+
+	id = ib_create_cm_id(id_priv->id.device, cma_sidr_rep_handler,
+			     id_priv);
+	if (IS_ERR(id)) {
+		ret = PTR_ERR(id);
+		goto out;
+	}
+	id_priv->cm_id.ib = id;
+
+	req.path = id_priv->id.route.path_rec;
+	req.service_id = rdma_get_service_id(&id_priv->id, cma_dst_addr(id_priv));
+	req.timeout_ms = 1 << (CMA_CM_RESPONSE_TIMEOUT - 8);
+	req.max_cm_retries = CMA_MAX_CM_RETRIES;
+
+	ret = ib_send_cm_sidr_req(id_priv->cm_id.ib, &req);
+	if (ret) {
+		ib_destroy_cm_id(id_priv->cm_id.ib);
+		id_priv->cm_id.ib = NULL;
+	}
+out:
+	kfree(private_data);
+	return ret;
+}
+
+static int cma_connect_ib(struct rdma_id_private *id_priv,
+			  struct rdma_conn_param *conn_param)
+{
+	struct ib_cm_req_param req;
+	struct rdma_route *route;
+	void *private_data;
+	struct ib_cm_id	*id;
+	int offset, ret;
+
+	memset(&req, 0, sizeof req);
+	offset = cma_user_data_offset(id_priv);
+	req.private_data_len = offset + conn_param->private_data_len;
+	if (req.private_data_len < conn_param->private_data_len)
+		return -EINVAL;
+
+	if (req.private_data_len) {
+		private_data = kzalloc(req.private_data_len, GFP_ATOMIC);
+		if (!private_data)
+			return -ENOMEM;
+	} else {
+		private_data = NULL;
+	}
+
+	if (conn_param->private_data && conn_param->private_data_len)
+		memcpy(private_data + offset, conn_param->private_data,
+		       conn_param->private_data_len);
+
+	id = ib_create_cm_id(id_priv->id.device, cma_ib_handler, id_priv);
+	if (IS_ERR(id)) {
+		ret = PTR_ERR(id);
+		goto out;
+	}
+	id_priv->cm_id.ib = id;
+
+	route = &id_priv->id.route;
+	if (private_data) {
+		ret = cma_format_hdr(private_data, id_priv);
+		if (ret)
+			goto out;
+		req.private_data = private_data;
+	}
+
+	req.primary_path = &route->path_rec[0];
+	if (route->num_paths == 2)
+		req.alternate_path = &route->path_rec[1];
+
+	req.service_id = rdma_get_service_id(&id_priv->id, cma_dst_addr(id_priv));
+	req.qp_num = id_priv->qp_num;
+	req.qp_type = id_priv->id.qp_type;
+	req.starting_psn = id_priv->seq_num;
+	req.responder_resources = conn_param->responder_resources;
+	req.initiator_depth = conn_param->initiator_depth;
+	req.flow_control = conn_param->flow_control;
+	req.retry_count = min_t(u8, 7, conn_param->retry_count);
+	req.rnr_retry_count = min_t(u8, 7, conn_param->rnr_retry_count);
+	req.remote_cm_response_timeout = CMA_CM_RESPONSE_TIMEOUT;
+	req.local_cm_response_timeout = CMA_CM_RESPONSE_TIMEOUT;
+	req.max_cm_retries = CMA_MAX_CM_RETRIES;
+	req.srq = id_priv->srq ? 1 : 0;
+
+	ret = ib_send_cm_req(id_priv->cm_id.ib, &req);
+out:
+	if (ret && !IS_ERR(id)) {
+		ib_destroy_cm_id(id);
+		id_priv->cm_id.ib = NULL;
+	}
+
+	kfree(private_data);
+	return ret;
+}
+
+static int cma_connect_iw(struct rdma_id_private *id_priv,
+			  struct rdma_conn_param *conn_param)
+{
+	struct iw_cm_id *cm_id;
+	int ret;
+	struct iw_cm_conn_param iw_param;
+
+	cm_id = iw_create_cm_id(id_priv->id.device, cma_iw_handler, id_priv);
+	if (IS_ERR(cm_id))
+		return PTR_ERR(cm_id);
+
+	id_priv->cm_id.iw = cm_id;
+
+	memcpy(&cm_id->local_addr, cma_src_addr(id_priv),
+	       rdma_addr_size(cma_src_addr(id_priv)));
+	memcpy(&cm_id->remote_addr, cma_dst_addr(id_priv),
+	       rdma_addr_size(cma_dst_addr(id_priv)));
+
+	ret = cma_modify_qp_rtr(id_priv, conn_param);
+	if (ret)
+		goto out;
+
+	if (conn_param) {
+		iw_param.ord = conn_param->initiator_depth;
+		iw_param.ird = conn_param->responder_resources;
+		iw_param.private_data = conn_param->private_data;
+		iw_param.private_data_len = conn_param->private_data_len;
+		iw_param.qpn = id_priv->id.qp ? id_priv->qp_num : conn_param->qp_num;
+	} else {
+		memset(&iw_param, 0, sizeof iw_param);
+		iw_param.qpn = id_priv->qp_num;
+	}
+	ret = iw_cm_connect(cm_id, &iw_param);
+out:
+	if (ret) {
+		iw_destroy_cm_id(cm_id);
+		id_priv->cm_id.iw = NULL;
+	}
+	return ret;
+}
+
+int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param)
+{
+	struct rdma_id_private *id_priv;
+	int ret;
+
+	id_priv = container_of(id, struct rdma_id_private, id);
+	if (!cma_comp_exch(id_priv, RDMA_CM_ROUTE_RESOLVED, RDMA_CM_CONNECT))
+		return -EINVAL;
+
+	if (!id->qp) {
+		id_priv->qp_num = conn_param->qp_num;
+		id_priv->srq = conn_param->srq;
+	}
+
+	switch (rdma_node_get_transport(id->device->node_type)) {
+	case RDMA_TRANSPORT_IB:
+		if (id->qp_type == IB_QPT_UD)
+			ret = cma_resolve_ib_udp(id_priv, conn_param);
+		else
+			ret = cma_connect_ib(id_priv, conn_param);
+		break;
+	case RDMA_TRANSPORT_IWARP:
+		ret = cma_connect_iw(id_priv, conn_param);
+		break;
+	default:
+		ret = -ENOSYS;
+		break;
+	}
+	if (ret)
+		goto err;
+
+	return 0;
+err:
+	cma_comp_exch(id_priv, RDMA_CM_CONNECT, RDMA_CM_ROUTE_RESOLVED);
+	return ret;
+}
+EXPORT_SYMBOL(rdma_connect);
+
+static int cma_accept_ib(struct rdma_id_private *id_priv,
+			 struct rdma_conn_param *conn_param)
+{
+	struct ib_cm_rep_param rep;
+	int ret;
+
+	ret = cma_modify_qp_rtr(id_priv, conn_param);
+	if (ret)
+		goto out;
+
+	ret = cma_modify_qp_rts(id_priv, conn_param);
+	if (ret)
+		goto out;
+
+	memset(&rep, 0, sizeof rep);
+	rep.qp_num = id_priv->qp_num;
+	rep.starting_psn = id_priv->seq_num;
+	rep.private_data = conn_param->private_data;
+	rep.private_data_len = conn_param->private_data_len;
+	rep.responder_resources = conn_param->responder_resources;
+	rep.initiator_depth = conn_param->initiator_depth;
+	rep.failover_accepted = 0;
+	rep.flow_control = conn_param->flow_control;
+	rep.rnr_retry_count = min_t(u8, 7, conn_param->rnr_retry_count);
+	rep.srq = id_priv->srq ? 1 : 0;
+
+	ret = ib_send_cm_rep(id_priv->cm_id.ib, &rep);
+out:
+	return ret;
+}
+
+static int cma_accept_iw(struct rdma_id_private *id_priv,
+		  struct rdma_conn_param *conn_param)
+{
+	struct iw_cm_conn_param iw_param;
+	int ret;
+
+	ret = cma_modify_qp_rtr(id_priv, conn_param);
+	if (ret)
+		return ret;
+
+	iw_param.ord = conn_param->initiator_depth;
+	iw_param.ird = conn_param->responder_resources;
+	iw_param.private_data = conn_param->private_data;
+	iw_param.private_data_len = conn_param->private_data_len;
+	if (id_priv->id.qp) {
+		iw_param.qpn = id_priv->qp_num;
+	} else
+		iw_param.qpn = conn_param->qp_num;
+
+	return iw_cm_accept(id_priv->cm_id.iw, &iw_param);
+}
+
+static int cma_send_sidr_rep(struct rdma_id_private *id_priv,
+			     enum ib_cm_sidr_status status, u32 qkey,
+			     const void *private_data, int private_data_len)
+{
+	struct ib_cm_sidr_rep_param rep;
+	int ret;
+
+	memset(&rep, 0, sizeof rep);
+	rep.status = status;
+	if (status == IB_SIDR_SUCCESS) {
+		ret = cma_set_qkey(id_priv, qkey);
+		if (ret)
+			return ret;
+		rep.qp_num = id_priv->qp_num;
+		rep.qkey = id_priv->qkey;
+	}
+	rep.private_data = private_data;
+	rep.private_data_len = private_data_len;
+
+	return ib_send_cm_sidr_rep(id_priv->cm_id.ib, &rep);
+}
+
+int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param)
+{
+	struct rdma_id_private *id_priv;
+	int ret;
+
+	id_priv = container_of(id, struct rdma_id_private, id);
+
+	id_priv->owner = task_pid_nr(current);
+
+	if (!cma_comp(id_priv, RDMA_CM_CONNECT))
+		return -EINVAL;
+
+	if (!id->qp && conn_param) {
+		id_priv->qp_num = conn_param->qp_num;
+		id_priv->srq = conn_param->srq;
+	}
+
+	switch (rdma_node_get_transport(id->device->node_type)) {
+	case RDMA_TRANSPORT_IB:
+		if (id->qp_type == IB_QPT_UD) {
+			if (conn_param)
+				ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS,
+							conn_param->qkey,
+							conn_param->private_data,
+							conn_param->private_data_len);
+			else
+				ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS,
+							0, NULL, 0);
+		} else {
+			if (conn_param)
+				ret = cma_accept_ib(id_priv, conn_param);
+			else
+				ret = cma_rep_recv(id_priv);
+		}
+		break;
+	case RDMA_TRANSPORT_IWARP:
+		ret = cma_accept_iw(id_priv, conn_param);
+		break;
+	default:
+		ret = -ENOSYS;
+		break;
+	}
+
+	if (ret)
+		goto reject;
+
+	return 0;
+reject:
+	cma_modify_qp_err(id_priv);
+	rdma_reject(id, NULL, 0);
+	return ret;
+}
+EXPORT_SYMBOL(rdma_accept);
+
+int rdma_notify(struct rdma_cm_id *id, enum ib_event_type event)
+{
+	struct rdma_id_private *id_priv;
+	int ret;
+
+	id_priv = container_of(id, struct rdma_id_private, id);
+	if (!id_priv->cm_id.ib)
+		return -EINVAL;
+
+	switch (id->device->node_type) {
+	case RDMA_NODE_IB_CA:
+		ret = ib_cm_notify(id_priv->cm_id.ib, event);
+		break;
+	default:
+		ret = 0;
+		break;
+	}
+	return ret;
+}
+EXPORT_SYMBOL(rdma_notify);
+
+int rdma_reject(struct rdma_cm_id *id, const void *private_data,
+		u8 private_data_len)
+{
+	struct rdma_id_private *id_priv;
+	int ret;
+
+	id_priv = container_of(id, struct rdma_id_private, id);
+	if (!id_priv->cm_id.ib)
+		return -EINVAL;
+
+	switch (rdma_node_get_transport(id->device->node_type)) {
+	case RDMA_TRANSPORT_IB:
+		if (id->qp_type == IB_QPT_UD)
+			ret = cma_send_sidr_rep(id_priv, IB_SIDR_REJECT, 0,
+						private_data, private_data_len);
+		else
+			ret = ib_send_cm_rej(id_priv->cm_id.ib,
+					     IB_CM_REJ_CONSUMER_DEFINED, NULL,
+					     0, private_data, private_data_len);
+		break;
+	case RDMA_TRANSPORT_IWARP:
+		ret = iw_cm_reject(id_priv->cm_id.iw,
+				   private_data, private_data_len);
+		break;
+	default:
+		ret = -ENOSYS;
+		break;
+	}
+	return ret;
+}
+EXPORT_SYMBOL(rdma_reject);
+
+int rdma_disconnect(struct rdma_cm_id *id)
+{
+	struct rdma_id_private *id_priv;
+	int ret;
+
+	id_priv = container_of(id, struct rdma_id_private, id);
+	if (!id_priv->cm_id.ib)
+		return -EINVAL;
+
+	switch (rdma_node_get_transport(id->device->node_type)) {
+	case RDMA_TRANSPORT_IB:
+		ret = cma_modify_qp_err(id_priv);
+		if (ret)
+			goto out;
+		/* Initiate or respond to a disconnect. */
+		if (ib_send_cm_dreq(id_priv->cm_id.ib, NULL, 0))
+			ib_send_cm_drep(id_priv->cm_id.ib, NULL, 0);
+		break;
+	case RDMA_TRANSPORT_IWARP:
+		ret = iw_cm_disconnect(id_priv->cm_id.iw, 0);
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+out:
+	return ret;
+}
+EXPORT_SYMBOL(rdma_disconnect);
+
+static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast)
+{
+	struct rdma_id_private *id_priv;
+	struct cma_multicast *mc = multicast->context;
+	struct rdma_cm_event event;
+	int ret;
+
+	id_priv = mc->id_priv;
+	if (cma_disable_callback(id_priv, RDMA_CM_ADDR_BOUND) &&
+	    cma_disable_callback(id_priv, RDMA_CM_ADDR_RESOLVED))
+		return 0;
+
+	if (!status)
+		status = cma_set_qkey(id_priv, be32_to_cpu(multicast->rec.qkey));
+	mutex_lock(&id_priv->qp_mutex);
+	if (!status && id_priv->id.qp)
+		status = ib_attach_mcast(id_priv->id.qp, &multicast->rec.mgid,
+					 be16_to_cpu(multicast->rec.mlid));
+	mutex_unlock(&id_priv->qp_mutex);
+
+	memset(&event, 0, sizeof event);
+	event.status = status;
+	event.param.ud.private_data = mc->context;
+	if (!status) {
+		event.event = RDMA_CM_EVENT_MULTICAST_JOIN;
+		ib_init_ah_from_mcmember(id_priv->id.device,
+					 id_priv->id.port_num, &multicast->rec,
+					 &event.param.ud.ah_attr);
+		event.param.ud.qp_num = 0xFFFFFF;
+		event.param.ud.qkey = be32_to_cpu(multicast->rec.qkey);
+	} else
+		event.event = RDMA_CM_EVENT_MULTICAST_ERROR;
+
+	ret = id_priv->id.event_handler(&id_priv->id, &event);
+	if (ret) {
+		cma_exch(id_priv, RDMA_CM_DESTROYING);
+		mutex_unlock(&id_priv->handler_mutex);
+		rdma_destroy_id(&id_priv->id);
+		return 0;
+	}
+
+	mutex_unlock(&id_priv->handler_mutex);
+	return 0;
+}
+
+static void cma_set_mgid(struct rdma_id_private *id_priv,
+			 struct sockaddr *addr, union ib_gid *mgid)
+{
+	unsigned char mc_map[MAX_ADDR_LEN];
+	struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
+	struct sockaddr_in *sin = (struct sockaddr_in *) addr;
+	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) addr;
+
+	if (cma_any_addr(addr)) {
+		memset(mgid, 0, sizeof *mgid);
+	} else if ((addr->sa_family == AF_INET6) &&
+		   ((be32_to_cpu(sin6->sin6_addr.s6_addr32[0]) & 0xFFF0FFFF) ==
+								 0xFF10A01B)) {
+		/* IPv6 address is an SA assigned MGID. */
+		memcpy(mgid, &sin6->sin6_addr, sizeof *mgid);
+	} else if (addr->sa_family == AF_IB) {
+		memcpy(mgid, &((struct sockaddr_ib *) addr)->sib_addr, sizeof *mgid);
+	} else if ((addr->sa_family == AF_INET6)) {
+		ipv6_ib_mc_map(&sin6->sin6_addr, dev_addr->broadcast, mc_map);
+		if (id_priv->id.ps == RDMA_PS_UDP)
+			mc_map[7] = 0x01;	/* Use RDMA CM signature */
+		*mgid = *(union ib_gid *) (mc_map + 4);
+	} else {
+		ip_ib_mc_map(sin->sin_addr.s_addr, dev_addr->broadcast, mc_map);
+		if (id_priv->id.ps == RDMA_PS_UDP)
+			mc_map[7] = 0x01;	/* Use RDMA CM signature */
+		*mgid = *(union ib_gid *) (mc_map + 4);
+	}
+}
+
+static int cma_join_ib_multicast(struct rdma_id_private *id_priv,
+				 struct cma_multicast *mc)
+{
+	struct ib_sa_mcmember_rec rec;
+	struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
+	ib_sa_comp_mask comp_mask;
+	int ret;
+
+	ib_addr_get_mgid(dev_addr, &rec.mgid);
+	ret = ib_sa_get_mcmember_rec(id_priv->id.device, id_priv->id.port_num,
+				     &rec.mgid, &rec);
+	if (ret)
+		return ret;
+
+	ret = cma_set_qkey(id_priv, 0);
+	if (ret)
+		return ret;
+
+	cma_set_mgid(id_priv, (struct sockaddr *) &mc->addr, &rec.mgid);
+	rec.qkey = cpu_to_be32(id_priv->qkey);
+	rdma_addr_get_sgid(dev_addr, &rec.port_gid);
+	rec.pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr));
+	rec.join_state = 1;
+
+	comp_mask = IB_SA_MCMEMBER_REC_MGID | IB_SA_MCMEMBER_REC_PORT_GID |
+		    IB_SA_MCMEMBER_REC_PKEY | IB_SA_MCMEMBER_REC_JOIN_STATE |
+		    IB_SA_MCMEMBER_REC_QKEY | IB_SA_MCMEMBER_REC_SL |
+		    IB_SA_MCMEMBER_REC_FLOW_LABEL |
+		    IB_SA_MCMEMBER_REC_TRAFFIC_CLASS;
+
+	if (id_priv->id.ps == RDMA_PS_IPOIB)
+		comp_mask |= IB_SA_MCMEMBER_REC_RATE |
+			     IB_SA_MCMEMBER_REC_RATE_SELECTOR |
+			     IB_SA_MCMEMBER_REC_MTU_SELECTOR |
+			     IB_SA_MCMEMBER_REC_MTU |
+			     IB_SA_MCMEMBER_REC_HOP_LIMIT;
+
+	mc->multicast.ib = ib_sa_join_multicast(&sa_client, id_priv->id.device,
+						id_priv->id.port_num, &rec,
+						comp_mask, GFP_KERNEL,
+						cma_ib_mc_handler, mc);
+	return PTR_ERR_OR_ZERO(mc->multicast.ib);
+}
+
+static void iboe_mcast_work_handler(struct work_struct *work)
+{
+	struct iboe_mcast_work *mw = container_of(work, struct iboe_mcast_work, work);
+	struct cma_multicast *mc = mw->mc;
+	struct ib_sa_multicast *m = mc->multicast.ib;
+
+	mc->multicast.ib->context = mc;
+	cma_ib_mc_handler(0, m);
+	kref_put(&mc->mcref, release_mc);
+	kfree(mw);
+}
+
+static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid)
+{
+	struct sockaddr_in *sin = (struct sockaddr_in *)addr;
+	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)addr;
+
+	if (cma_any_addr(addr)) {
+		memset(mgid, 0, sizeof *mgid);
+	} else if (addr->sa_family == AF_INET6) {
+		memcpy(mgid, &sin6->sin6_addr, sizeof *mgid);
+	} else {
+		mgid->raw[0] = 0xff;
+		mgid->raw[1] = 0x0e;
+		mgid->raw[2] = 0;
+		mgid->raw[3] = 0;
+		mgid->raw[4] = 0;
+		mgid->raw[5] = 0;
+		mgid->raw[6] = 0;
+		mgid->raw[7] = 0;
+		mgid->raw[8] = 0;
+		mgid->raw[9] = 0;
+		mgid->raw[10] = 0xff;
+		mgid->raw[11] = 0xff;
+		*(__be32 *)(&mgid->raw[12]) = sin->sin_addr.s_addr;
+	}
+}
+
+static int cma_iboe_join_multicast(struct rdma_id_private *id_priv,
+				   struct cma_multicast *mc)
+{
+	struct iboe_mcast_work *work;
+	struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
+	int err;
+	struct sockaddr *addr = (struct sockaddr *)&mc->addr;
+	struct net_device *ndev = NULL;
+
+	if (cma_zero_addr((struct sockaddr *)&mc->addr))
+		return -EINVAL;
+
+	work = kzalloc(sizeof *work, GFP_KERNEL);
+	if (!work)
+		return -ENOMEM;
+
+	mc->multicast.ib = kzalloc(sizeof(struct ib_sa_multicast), GFP_KERNEL);
+	if (!mc->multicast.ib) {
+		err = -ENOMEM;
+		goto out1;
+	}
+
+	cma_iboe_set_mgid(addr, &mc->multicast.ib->rec.mgid);
+
+	mc->multicast.ib->rec.pkey = cpu_to_be16(0xffff);
+	if (id_priv->id.ps == RDMA_PS_UDP)
+		mc->multicast.ib->rec.qkey = cpu_to_be32(RDMA_UDP_QKEY);
+
+	if (dev_addr->bound_dev_if)
+		ndev = dev_get_by_index(&init_net, dev_addr->bound_dev_if);
+	if (!ndev) {
+		err = -ENODEV;
+		goto out2;
+	}
+	mc->multicast.ib->rec.rate = iboe_get_rate(ndev);
+	mc->multicast.ib->rec.hop_limit = 1;
+	mc->multicast.ib->rec.mtu = iboe_get_mtu(ndev->mtu);
+	dev_put(ndev);
+	if (!mc->multicast.ib->rec.mtu) {
+		err = -EINVAL;
+		goto out2;
+	}
+	rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr,
+		    &mc->multicast.ib->rec.port_gid);
+	work->id = id_priv;
+	work->mc = mc;
+	INIT_WORK(&work->work, iboe_mcast_work_handler);
+	kref_get(&mc->mcref);
+	queue_work(cma_wq, &work->work);
+
+	return 0;
+
+out2:
+	kfree(mc->multicast.ib);
+out1:
+	kfree(work);
+	return err;
+}
+
+int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr,
+			void *context)
+{
+	struct rdma_id_private *id_priv;
+	struct cma_multicast *mc;
+	int ret;
+
+	id_priv = container_of(id, struct rdma_id_private, id);
+	if (!cma_comp(id_priv, RDMA_CM_ADDR_BOUND) &&
+	    !cma_comp(id_priv, RDMA_CM_ADDR_RESOLVED))
+		return -EINVAL;
+
+	mc = kmalloc(sizeof *mc, GFP_KERNEL);
+	if (!mc)
+		return -ENOMEM;
+
+	memcpy(&mc->addr, addr, rdma_addr_size(addr));
+	mc->context = context;
+	mc->id_priv = id_priv;
+
+	spin_lock(&id_priv->lock);
+	list_add(&mc->list, &id_priv->mc_list);
+	spin_unlock(&id_priv->lock);
+
+	switch (rdma_node_get_transport(id->device->node_type)) {
+	case RDMA_TRANSPORT_IB:
+		switch (rdma_port_get_link_layer(id->device, id->port_num)) {
+		case IB_LINK_LAYER_INFINIBAND:
+			ret = cma_join_ib_multicast(id_priv, mc);
+			break;
+		case IB_LINK_LAYER_ETHERNET:
+			kref_init(&mc->mcref);
+			ret = cma_iboe_join_multicast(id_priv, mc);
+			break;
+		default:
+			ret = -EINVAL;
+		}
+		break;
+	default:
+		ret = -ENOSYS;
+		break;
+	}
+
+	if (ret) {
+		spin_lock_irq(&id_priv->lock);
+		list_del(&mc->list);
+		spin_unlock_irq(&id_priv->lock);
+		kfree(mc);
+	}
+	return ret;
+}
+EXPORT_SYMBOL(rdma_join_multicast);
+
+void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr)
+{
+	struct rdma_id_private *id_priv;
+	struct cma_multicast *mc;
+
+	id_priv = container_of(id, struct rdma_id_private, id);
+	spin_lock_irq(&id_priv->lock);
+	list_for_each_entry(mc, &id_priv->mc_list, list) {
+		if (!memcmp(&mc->addr, addr, rdma_addr_size(addr))) {
+			list_del(&mc->list);
+			spin_unlock_irq(&id_priv->lock);
+
+			if (id->qp)
+				ib_detach_mcast(id->qp,
+						&mc->multicast.ib->rec.mgid,
+						be16_to_cpu(mc->multicast.ib->rec.mlid));
+			if (rdma_node_get_transport(id_priv->cma_dev->device->node_type) == RDMA_TRANSPORT_IB) {
+				switch (rdma_port_get_link_layer(id->device, id->port_num)) {
+				case IB_LINK_LAYER_INFINIBAND:
+					ib_sa_free_multicast(mc->multicast.ib);
+					kfree(mc);
+					break;
+				case IB_LINK_LAYER_ETHERNET:
+					kref_put(&mc->mcref, release_mc);
+					break;
+				default:
+					break;
+				}
+			}
+			return;
+		}
+	}
+	spin_unlock_irq(&id_priv->lock);
+}
+EXPORT_SYMBOL(rdma_leave_multicast);
+
+static int cma_netdev_change(struct net_device *ndev, struct rdma_id_private *id_priv)
+{
+	struct rdma_dev_addr *dev_addr;
+	struct cma_ndev_work *work;
+
+	dev_addr = &id_priv->id.route.addr.dev_addr;
+
+	if ((dev_addr->bound_dev_if == ndev->ifindex) &&
+	    memcmp(dev_addr->src_dev_addr, ndev->dev_addr, ndev->addr_len)) {
+		printk(KERN_INFO "RDMA CM addr change for ndev %s used by id %p\n",
+		       ndev->name, &id_priv->id);
+		work = kzalloc(sizeof *work, GFP_KERNEL);
+		if (!work)
+			return -ENOMEM;
+
+		INIT_WORK(&work->work, cma_ndev_work_handler);
+		work->id = id_priv;
+		work->event.event = RDMA_CM_EVENT_ADDR_CHANGE;
+		atomic_inc(&id_priv->refcount);
+		queue_work(cma_wq, &work->work);
+	}
+
+	return 0;
+}
+
+static int cma_netdev_callback(struct notifier_block *self, unsigned long event,
+			       void *ptr)
+{
+	struct net_device *ndev = netdev_notifier_info_to_dev(ptr);
+	struct cma_device *cma_dev;
+	struct rdma_id_private *id_priv;
+	int ret = NOTIFY_DONE;
+
+	if (dev_net(ndev) != &init_net)
+		return NOTIFY_DONE;
+
+	if (event != NETDEV_BONDING_FAILOVER)
+		return NOTIFY_DONE;
+
+	if (!(ndev->flags & IFF_MASTER) || !(ndev->priv_flags & IFF_BONDING))
+		return NOTIFY_DONE;
+
+	mutex_lock(&lock);
+	list_for_each_entry(cma_dev, &dev_list, list)
+		list_for_each_entry(id_priv, &cma_dev->id_list, list) {
+			ret = cma_netdev_change(ndev, id_priv);
+			if (ret)
+				goto out;
+		}
+
+out:
+	mutex_unlock(&lock);
+	return ret;
+}
+
+static struct notifier_block cma_nb = {
+	.notifier_call = cma_netdev_callback
+};
+
+static void cma_add_one(struct ib_device *device)
+{
+	struct cma_device *cma_dev;
+	struct rdma_id_private *id_priv;
+
+	cma_dev = kmalloc(sizeof *cma_dev, GFP_KERNEL);
+	if (!cma_dev)
+		return;
+
+	cma_dev->device = device;
+
+	init_completion(&cma_dev->comp);
+	atomic_set(&cma_dev->refcount, 1);
+	INIT_LIST_HEAD(&cma_dev->id_list);
+	ib_set_client_data(device, &cma_client, cma_dev);
+
+	mutex_lock(&lock);
+	list_add_tail(&cma_dev->list, &dev_list);
+	list_for_each_entry(id_priv, &listen_any_list, list)
+		cma_listen_on_dev(id_priv, cma_dev);
+	mutex_unlock(&lock);
+}
+
+static int cma_remove_id_dev(struct rdma_id_private *id_priv)
+{
+	struct rdma_cm_event event;
+	enum rdma_cm_state state;
+	int ret = 0;
+
+	/* Record that we want to remove the device */
+	state = cma_exch(id_priv, RDMA_CM_DEVICE_REMOVAL);
+	if (state == RDMA_CM_DESTROYING)
+		return 0;
+
+	cma_cancel_operation(id_priv, state);
+	mutex_lock(&id_priv->handler_mutex);
+
+	/* Check for destruction from another callback. */
+	if (!cma_comp(id_priv, RDMA_CM_DEVICE_REMOVAL))
+		goto out;
+
+	memset(&event, 0, sizeof event);
+	event.event = RDMA_CM_EVENT_DEVICE_REMOVAL;
+	ret = id_priv->id.event_handler(&id_priv->id, &event);
+out:
+	mutex_unlock(&id_priv->handler_mutex);
+	return ret;
+}
+
+static void cma_process_remove(struct cma_device *cma_dev)
+{
+	struct rdma_id_private *id_priv;
+	int ret;
+
+	mutex_lock(&lock);
+	while (!list_empty(&cma_dev->id_list)) {
+		id_priv = list_entry(cma_dev->id_list.next,
+				     struct rdma_id_private, list);
+
+		list_del(&id_priv->listen_list);
+		list_del_init(&id_priv->list);
+		atomic_inc(&id_priv->refcount);
+		mutex_unlock(&lock);
+
+		ret = id_priv->internal_id ? 1 : cma_remove_id_dev(id_priv);
+		cma_deref_id(id_priv);
+		if (ret)
+			rdma_destroy_id(&id_priv->id);
+
+		mutex_lock(&lock);
+	}
+	mutex_unlock(&lock);
+
+	cma_deref_dev(cma_dev);
+	wait_for_completion(&cma_dev->comp);
+}
+
+static void cma_remove_one(struct ib_device *device)
+{
+	struct cma_device *cma_dev;
+
+	cma_dev = ib_get_client_data(device, &cma_client);
+	if (!cma_dev)
+		return;
+
+	mutex_lock(&lock);
+	list_del(&cma_dev->list);
+	mutex_unlock(&lock);
+
+	cma_process_remove(cma_dev);
+	kfree(cma_dev);
+}
+
+static int cma_get_id_stats(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct nlmsghdr *nlh;
+	struct rdma_cm_id_stats *id_stats;
+	struct rdma_id_private *id_priv;
+	struct rdma_cm_id *id = NULL;
+	struct cma_device *cma_dev;
+	int i_dev = 0, i_id = 0;
+
+	/*
+	 * We export all of the IDs as a sequence of messages.  Each
+	 * ID gets its own netlink message.
+	 */
+	mutex_lock(&lock);
+
+	list_for_each_entry(cma_dev, &dev_list, list) {
+		if (i_dev < cb->args[0]) {
+			i_dev++;
+			continue;
+		}
+
+		i_id = 0;
+		list_for_each_entry(id_priv, &cma_dev->id_list, list) {
+			if (i_id < cb->args[1]) {
+				i_id++;
+				continue;
+			}
+
+			id_stats = ibnl_put_msg(skb, &nlh, cb->nlh->nlmsg_seq,
+						sizeof *id_stats, RDMA_NL_RDMA_CM,
+						RDMA_NL_RDMA_CM_ID_STATS,
+						NLM_F_MULTI);
+			if (!id_stats)
+				goto out;
+
+			memset(id_stats, 0, sizeof *id_stats);
+			id = &id_priv->id;
+			id_stats->node_type = id->route.addr.dev_addr.dev_type;
+			id_stats->port_num = id->port_num;
+			id_stats->bound_dev_if =
+				id->route.addr.dev_addr.bound_dev_if;
+
+			if (ibnl_put_attr(skb, nlh,
+					  rdma_addr_size(cma_src_addr(id_priv)),
+					  cma_src_addr(id_priv),
+					  RDMA_NL_RDMA_CM_ATTR_SRC_ADDR))
+				goto out;
+			if (ibnl_put_attr(skb, nlh,
+					  rdma_addr_size(cma_src_addr(id_priv)),
+					  cma_dst_addr(id_priv),
+					  RDMA_NL_RDMA_CM_ATTR_DST_ADDR))
+				goto out;
+
+			id_stats->pid		= id_priv->owner;
+			id_stats->port_space	= id->ps;
+			id_stats->cm_state	= id_priv->state;
+			id_stats->qp_num	= id_priv->qp_num;
+			id_stats->qp_type	= id->qp_type;
+
+			i_id++;
+		}
+
+		cb->args[1] = 0;
+		i_dev++;
+	}
+
+out:
+	mutex_unlock(&lock);
+	cb->args[0] = i_dev;
+	cb->args[1] = i_id;
+
+	return skb->len;
+}
+
+static const struct ibnl_client_cbs cma_cb_table[] = {
+	[RDMA_NL_RDMA_CM_ID_STATS] = { .dump = cma_get_id_stats,
+				       .module = THIS_MODULE },
+};
+
+static int __init cma_init(void)
+{
+	int ret;
+
+	cma_wq = create_singlethread_workqueue("rdma_cm");
+	if (!cma_wq)
+		return -ENOMEM;
+
+	ib_sa_register_client(&sa_client);
+	rdma_addr_register_client(&addr_client);
+	register_netdevice_notifier(&cma_nb);
+
+	ret = ib_register_client(&cma_client);
+	if (ret)
+		goto err;
+
+	if (ibnl_add_client(RDMA_NL_RDMA_CM, RDMA_NL_RDMA_CM_NUM_OPS, cma_cb_table))
+		printk(KERN_WARNING "RDMA CMA: failed to add netlink callback\n");
+
+	return 0;
+
+err:
+	unregister_netdevice_notifier(&cma_nb);
+	rdma_addr_unregister_client(&addr_client);
+	ib_sa_unregister_client(&sa_client);
+	destroy_workqueue(cma_wq);
+	return ret;
+}
+
+static void __exit cma_cleanup(void)
+{
+	ibnl_remove_client(RDMA_NL_RDMA_CM);
+	ib_unregister_client(&cma_client);
+	unregister_netdevice_notifier(&cma_nb);
+	rdma_addr_unregister_client(&addr_client);
+	ib_sa_unregister_client(&sa_client);
+	destroy_workqueue(cma_wq);
+	idr_destroy(&tcp_ps);
+	idr_destroy(&udp_ps);
+	idr_destroy(&ipoib_ps);
+	idr_destroy(&ib_ps);
+}
+
+module_init(cma_init);
+module_exit(cma_cleanup);
diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h
new file mode 100644
index 0000000..87d1936
--- /dev/null
+++ b/drivers/infiniband/core/core_priv.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _CORE_PRIV_H
+#define _CORE_PRIV_H
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+
+#include <rdma/ib_verbs.h>
+
+int  ib_device_register_sysfs(struct ib_device *device,
+			      int (*port_callback)(struct ib_device *,
+						   u8, struct kobject *));
+void ib_device_unregister_sysfs(struct ib_device *device);
+
+int  ib_sysfs_setup(void);
+void ib_sysfs_cleanup(void);
+
+int  ib_cache_setup(void);
+void ib_cache_cleanup(void);
+
+int ib_resolve_eth_l2_attrs(struct ib_qp *qp,
+			    struct ib_qp_attr *qp_attr, int *qp_attr_mask);
+#endif /* _CORE_PRIV_H */
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
new file mode 100644
index 0000000..18c1ece
--- /dev/null
+++ b/drivers/infiniband/core/device.c
@@ -0,0 +1,785 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/mutex.h>
+#include <rdma/rdma_netlink.h>
+
+#include "core_priv.h"
+
+MODULE_AUTHOR("Roland Dreier");
+MODULE_DESCRIPTION("core kernel InfiniBand API");
+MODULE_LICENSE("Dual BSD/GPL");
+
+struct ib_client_data {
+	struct list_head  list;
+	struct ib_client *client;
+	void *            data;
+};
+
+struct workqueue_struct *ib_wq;
+EXPORT_SYMBOL_GPL(ib_wq);
+
+static LIST_HEAD(device_list);
+static LIST_HEAD(client_list);
+
+/*
+ * device_mutex protects access to both device_list and client_list.
+ * There's no real point to using multiple locks or something fancier
+ * like an rwsem: we always access both lists, and we're always
+ * modifying one list or the other list.  In any case this is not a
+ * hot path so there's no point in trying to optimize.
+ */
+static DEFINE_MUTEX(device_mutex);
+
+static int ib_device_check_mandatory(struct ib_device *device)
+{
+#define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device, x), #x }
+	static const struct {
+		size_t offset;
+		char  *name;
+	} mandatory_table[] = {
+		IB_MANDATORY_FUNC(query_device),
+		IB_MANDATORY_FUNC(query_port),
+		IB_MANDATORY_FUNC(query_pkey),
+		IB_MANDATORY_FUNC(query_gid),
+		IB_MANDATORY_FUNC(alloc_pd),
+		IB_MANDATORY_FUNC(dealloc_pd),
+		IB_MANDATORY_FUNC(create_ah),
+		IB_MANDATORY_FUNC(destroy_ah),
+		IB_MANDATORY_FUNC(create_qp),
+		IB_MANDATORY_FUNC(modify_qp),
+		IB_MANDATORY_FUNC(destroy_qp),
+		IB_MANDATORY_FUNC(post_send),
+		IB_MANDATORY_FUNC(post_recv),
+		IB_MANDATORY_FUNC(create_cq),
+		IB_MANDATORY_FUNC(destroy_cq),
+		IB_MANDATORY_FUNC(poll_cq),
+		IB_MANDATORY_FUNC(req_notify_cq),
+		IB_MANDATORY_FUNC(get_dma_mr),
+		IB_MANDATORY_FUNC(dereg_mr)
+	};
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) {
+		if (!*(void **) ((void *) device + mandatory_table[i].offset)) {
+			printk(KERN_WARNING "Device %s is missing mandatory function %s\n",
+			       device->name, mandatory_table[i].name);
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
+static struct ib_device *__ib_device_get_by_name(const char *name)
+{
+	struct ib_device *device;
+
+	list_for_each_entry(device, &device_list, core_list)
+		if (!strncmp(name, device->name, IB_DEVICE_NAME_MAX))
+			return device;
+
+	return NULL;
+}
+
+
+static int alloc_name(char *name)
+{
+	unsigned long *inuse;
+	char buf[IB_DEVICE_NAME_MAX];
+	struct ib_device *device;
+	int i;
+
+	inuse = (unsigned long *) get_zeroed_page(GFP_KERNEL);
+	if (!inuse)
+		return -ENOMEM;
+
+	list_for_each_entry(device, &device_list, core_list) {
+		if (!sscanf(device->name, name, &i))
+			continue;
+		if (i < 0 || i >= PAGE_SIZE * 8)
+			continue;
+		snprintf(buf, sizeof buf, name, i);
+		if (!strncmp(buf, device->name, IB_DEVICE_NAME_MAX))
+			set_bit(i, inuse);
+	}
+
+	i = find_first_zero_bit(inuse, PAGE_SIZE * 8);
+	free_page((unsigned long) inuse);
+	snprintf(buf, sizeof buf, name, i);
+
+	if (__ib_device_get_by_name(buf))
+		return -ENFILE;
+
+	strlcpy(name, buf, IB_DEVICE_NAME_MAX);
+	return 0;
+}
+
+static int start_port(struct ib_device *device)
+{
+	return (device->node_type == RDMA_NODE_IB_SWITCH) ? 0 : 1;
+}
+
+
+static int end_port(struct ib_device *device)
+{
+	return (device->node_type == RDMA_NODE_IB_SWITCH) ?
+		0 : device->phys_port_cnt;
+}
+
+/**
+ * ib_alloc_device - allocate an IB device struct
+ * @size:size of structure to allocate
+ *
+ * Low-level drivers should use ib_alloc_device() to allocate &struct
+ * ib_device.  @size is the size of the structure to be allocated,
+ * including any private data used by the low-level driver.
+ * ib_dealloc_device() must be used to free structures allocated with
+ * ib_alloc_device().
+ */
+struct ib_device *ib_alloc_device(size_t size)
+{
+	BUG_ON(size < sizeof (struct ib_device));
+
+	return kzalloc(size, GFP_KERNEL);
+}
+EXPORT_SYMBOL(ib_alloc_device);
+
+/**
+ * ib_dealloc_device - free an IB device struct
+ * @device:structure to free
+ *
+ * Free a structure allocated with ib_alloc_device().
+ */
+void ib_dealloc_device(struct ib_device *device)
+{
+	if (device->reg_state == IB_DEV_UNINITIALIZED) {
+		kfree(device);
+		return;
+	}
+
+	BUG_ON(device->reg_state != IB_DEV_UNREGISTERED);
+
+	kobject_put(&device->dev.kobj);
+}
+EXPORT_SYMBOL(ib_dealloc_device);
+
+static int add_client_context(struct ib_device *device, struct ib_client *client)
+{
+	struct ib_client_data *context;
+	unsigned long flags;
+
+	context = kmalloc(sizeof *context, GFP_KERNEL);
+	if (!context) {
+		printk(KERN_WARNING "Couldn't allocate client context for %s/%s\n",
+		       device->name, client->name);
+		return -ENOMEM;
+	}
+
+	context->client = client;
+	context->data   = NULL;
+
+	spin_lock_irqsave(&device->client_data_lock, flags);
+	list_add(&context->list, &device->client_data_list);
+	spin_unlock_irqrestore(&device->client_data_lock, flags);
+
+	return 0;
+}
+
+static int read_port_table_lengths(struct ib_device *device)
+{
+	struct ib_port_attr *tprops = NULL;
+	int num_ports, ret = -ENOMEM;
+	u8 port_index;
+
+	tprops = kmalloc(sizeof *tprops, GFP_KERNEL);
+	if (!tprops)
+		goto out;
+
+	num_ports = end_port(device) - start_port(device) + 1;
+
+	device->pkey_tbl_len = kmalloc(sizeof *device->pkey_tbl_len * num_ports,
+				       GFP_KERNEL);
+	device->gid_tbl_len = kmalloc(sizeof *device->gid_tbl_len * num_ports,
+				      GFP_KERNEL);
+	if (!device->pkey_tbl_len || !device->gid_tbl_len)
+		goto err;
+
+	for (port_index = 0; port_index < num_ports; ++port_index) {
+		ret = ib_query_port(device, port_index + start_port(device),
+					tprops);
+		if (ret)
+			goto err;
+		device->pkey_tbl_len[port_index] = tprops->pkey_tbl_len;
+		device->gid_tbl_len[port_index]  = tprops->gid_tbl_len;
+	}
+
+	ret = 0;
+	goto out;
+
+err:
+	kfree(device->gid_tbl_len);
+	kfree(device->pkey_tbl_len);
+out:
+	kfree(tprops);
+	return ret;
+}
+
+/**
+ * ib_register_device - Register an IB device with IB core
+ * @device:Device to register
+ *
+ * Low-level drivers use ib_register_device() to register their
+ * devices with the IB core.  All registered clients will receive a
+ * callback for each device that is added. @device must be allocated
+ * with ib_alloc_device().
+ */
+int ib_register_device(struct ib_device *device,
+		       int (*port_callback)(struct ib_device *,
+					    u8, struct kobject *))
+{
+	int ret;
+
+	mutex_lock(&device_mutex);
+
+	if (strchr(device->name, '%')) {
+		ret = alloc_name(device->name);
+		if (ret)
+			goto out;
+	}
+
+	if (ib_device_check_mandatory(device)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	INIT_LIST_HEAD(&device->event_handler_list);
+	INIT_LIST_HEAD(&device->client_data_list);
+	spin_lock_init(&device->event_handler_lock);
+	spin_lock_init(&device->client_data_lock);
+
+	ret = read_port_table_lengths(device);
+	if (ret) {
+		printk(KERN_WARNING "Couldn't create table lengths cache for device %s\n",
+		       device->name);
+		goto out;
+	}
+
+	ret = ib_device_register_sysfs(device, port_callback);
+	if (ret) {
+		printk(KERN_WARNING "Couldn't register device %s with driver model\n",
+		       device->name);
+		kfree(device->gid_tbl_len);
+		kfree(device->pkey_tbl_len);
+		goto out;
+	}
+
+	list_add_tail(&device->core_list, &device_list);
+
+	device->reg_state = IB_DEV_REGISTERED;
+
+	{
+		struct ib_client *client;
+
+		list_for_each_entry(client, &client_list, list)
+			if (client->add && !add_client_context(device, client))
+				client->add(device);
+	}
+
+ out:
+	mutex_unlock(&device_mutex);
+	return ret;
+}
+EXPORT_SYMBOL(ib_register_device);
+
+/**
+ * ib_unregister_device - Unregister an IB device
+ * @device:Device to unregister
+ *
+ * Unregister an IB device.  All clients will receive a remove callback.
+ */
+void ib_unregister_device(struct ib_device *device)
+{
+	struct ib_client *client;
+	struct ib_client_data *context, *tmp;
+	unsigned long flags;
+
+	mutex_lock(&device_mutex);
+
+	list_for_each_entry_reverse(client, &client_list, list)
+		if (client->remove)
+			client->remove(device);
+
+	list_del(&device->core_list);
+
+	kfree(device->gid_tbl_len);
+	kfree(device->pkey_tbl_len);
+
+	mutex_unlock(&device_mutex);
+
+	ib_device_unregister_sysfs(device);
+
+	spin_lock_irqsave(&device->client_data_lock, flags);
+	list_for_each_entry_safe(context, tmp, &device->client_data_list, list)
+		kfree(context);
+	spin_unlock_irqrestore(&device->client_data_lock, flags);
+
+	device->reg_state = IB_DEV_UNREGISTERED;
+}
+EXPORT_SYMBOL(ib_unregister_device);
+
+/**
+ * ib_register_client - Register an IB client
+ * @client:Client to register
+ *
+ * Upper level users of the IB drivers can use ib_register_client() to
+ * register callbacks for IB device addition and removal.  When an IB
+ * device is added, each registered client's add method will be called
+ * (in the order the clients were registered), and when a device is
+ * removed, each client's remove method will be called (in the reverse
+ * order that clients were registered).  In addition, when
+ * ib_register_client() is called, the client will receive an add
+ * callback for all devices already registered.
+ */
+int ib_register_client(struct ib_client *client)
+{
+	struct ib_device *device;
+
+	mutex_lock(&device_mutex);
+
+	list_add_tail(&client->list, &client_list);
+	list_for_each_entry(device, &device_list, core_list)
+		if (client->add && !add_client_context(device, client))
+			client->add(device);
+
+	mutex_unlock(&device_mutex);
+
+	return 0;
+}
+EXPORT_SYMBOL(ib_register_client);
+
+/**
+ * ib_unregister_client - Unregister an IB client
+ * @client:Client to unregister
+ *
+ * Upper level users use ib_unregister_client() to remove their client
+ * registration.  When ib_unregister_client() is called, the client
+ * will receive a remove callback for each IB device still registered.
+ */
+void ib_unregister_client(struct ib_client *client)
+{
+	struct ib_client_data *context, *tmp;
+	struct ib_device *device;
+	unsigned long flags;
+
+	mutex_lock(&device_mutex);
+
+	list_for_each_entry(device, &device_list, core_list) {
+		if (client->remove)
+			client->remove(device);
+
+		spin_lock_irqsave(&device->client_data_lock, flags);
+		list_for_each_entry_safe(context, tmp, &device->client_data_list, list)
+			if (context->client == client) {
+				list_del(&context->list);
+				kfree(context);
+			}
+		spin_unlock_irqrestore(&device->client_data_lock, flags);
+	}
+	list_del(&client->list);
+
+	mutex_unlock(&device_mutex);
+}
+EXPORT_SYMBOL(ib_unregister_client);
+
+/**
+ * ib_get_client_data - Get IB client context
+ * @device:Device to get context for
+ * @client:Client to get context for
+ *
+ * ib_get_client_data() returns client context set with
+ * ib_set_client_data().
+ */
+void *ib_get_client_data(struct ib_device *device, struct ib_client *client)
+{
+	struct ib_client_data *context;
+	void *ret = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&device->client_data_lock, flags);
+	list_for_each_entry(context, &device->client_data_list, list)
+		if (context->client == client) {
+			ret = context->data;
+			break;
+		}
+	spin_unlock_irqrestore(&device->client_data_lock, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL(ib_get_client_data);
+
+/**
+ * ib_set_client_data - Set IB client context
+ * @device:Device to set context for
+ * @client:Client to set context for
+ * @data:Context to set
+ *
+ * ib_set_client_data() sets client context that can be retrieved with
+ * ib_get_client_data().
+ */
+void ib_set_client_data(struct ib_device *device, struct ib_client *client,
+			void *data)
+{
+	struct ib_client_data *context;
+	unsigned long flags;
+
+	spin_lock_irqsave(&device->client_data_lock, flags);
+	list_for_each_entry(context, &device->client_data_list, list)
+		if (context->client == client) {
+			context->data = data;
+			goto out;
+		}
+
+	printk(KERN_WARNING "No client context found for %s/%s\n",
+	       device->name, client->name);
+
+out:
+	spin_unlock_irqrestore(&device->client_data_lock, flags);
+}
+EXPORT_SYMBOL(ib_set_client_data);
+
+/**
+ * ib_register_event_handler - Register an IB event handler
+ * @event_handler:Handler to register
+ *
+ * ib_register_event_handler() registers an event handler that will be
+ * called back when asynchronous IB events occur (as defined in
+ * chapter 11 of the InfiniBand Architecture Specification).  This
+ * callback may occur in interrupt context.
+ */
+int ib_register_event_handler  (struct ib_event_handler *event_handler)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&event_handler->device->event_handler_lock, flags);
+	list_add_tail(&event_handler->list,
+		      &event_handler->device->event_handler_list);
+	spin_unlock_irqrestore(&event_handler->device->event_handler_lock, flags);
+
+	return 0;
+}
+EXPORT_SYMBOL(ib_register_event_handler);
+
+/**
+ * ib_unregister_event_handler - Unregister an event handler
+ * @event_handler:Handler to unregister
+ *
+ * Unregister an event handler registered with
+ * ib_register_event_handler().
+ */
+int ib_unregister_event_handler(struct ib_event_handler *event_handler)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&event_handler->device->event_handler_lock, flags);
+	list_del(&event_handler->list);
+	spin_unlock_irqrestore(&event_handler->device->event_handler_lock, flags);
+
+	return 0;
+}
+EXPORT_SYMBOL(ib_unregister_event_handler);
+
+/**
+ * ib_dispatch_event - Dispatch an asynchronous event
+ * @event:Event to dispatch
+ *
+ * Low-level drivers must call ib_dispatch_event() to dispatch the
+ * event to all registered event handlers when an asynchronous event
+ * occurs.
+ */
+void ib_dispatch_event(struct ib_event *event)
+{
+	unsigned long flags;
+	struct ib_event_handler *handler;
+
+	spin_lock_irqsave(&event->device->event_handler_lock, flags);
+
+	list_for_each_entry(handler, &event->device->event_handler_list, list)
+		handler->handler(handler, event);
+
+	spin_unlock_irqrestore(&event->device->event_handler_lock, flags);
+}
+EXPORT_SYMBOL(ib_dispatch_event);
+
+/**
+ * ib_query_device - Query IB device attributes
+ * @device:Device to query
+ * @device_attr:Device attributes
+ *
+ * ib_query_device() returns the attributes of a device through the
+ * @device_attr pointer.
+ */
+int ib_query_device(struct ib_device *device,
+		    struct ib_device_attr *device_attr)
+{
+	return device->query_device(device, device_attr);
+}
+EXPORT_SYMBOL(ib_query_device);
+
+/**
+ * ib_query_port - Query IB port attributes
+ * @device:Device to query
+ * @port_num:Port number to query
+ * @port_attr:Port attributes
+ *
+ * ib_query_port() returns the attributes of a port through the
+ * @port_attr pointer.
+ */
+int ib_query_port(struct ib_device *device,
+		  u8 port_num,
+		  struct ib_port_attr *port_attr)
+{
+	if (port_num < start_port(device) || port_num > end_port(device))
+		return -EINVAL;
+
+	return device->query_port(device, port_num, port_attr);
+}
+EXPORT_SYMBOL(ib_query_port);
+
+/**
+ * ib_query_gid - Get GID table entry
+ * @device:Device to query
+ * @port_num:Port number to query
+ * @index:GID table index to query
+ * @gid:Returned GID
+ *
+ * ib_query_gid() fetches the specified GID table entry.
+ */
+int ib_query_gid(struct ib_device *device,
+		 u8 port_num, int index, union ib_gid *gid)
+{
+	return device->query_gid(device, port_num, index, gid);
+}
+EXPORT_SYMBOL(ib_query_gid);
+
+/**
+ * ib_query_pkey - Get P_Key table entry
+ * @device:Device to query
+ * @port_num:Port number to query
+ * @index:P_Key table index to query
+ * @pkey:Returned P_Key
+ *
+ * ib_query_pkey() fetches the specified P_Key table entry.
+ */
+int ib_query_pkey(struct ib_device *device,
+		  u8 port_num, u16 index, u16 *pkey)
+{
+	return device->query_pkey(device, port_num, index, pkey);
+}
+EXPORT_SYMBOL(ib_query_pkey);
+
+/**
+ * ib_modify_device - Change IB device attributes
+ * @device:Device to modify
+ * @device_modify_mask:Mask of attributes to change
+ * @device_modify:New attribute values
+ *
+ * ib_modify_device() changes a device's attributes as specified by
+ * the @device_modify_mask and @device_modify structure.
+ */
+int ib_modify_device(struct ib_device *device,
+		     int device_modify_mask,
+		     struct ib_device_modify *device_modify)
+{
+	if (!device->modify_device)
+		return -ENOSYS;
+
+	return device->modify_device(device, device_modify_mask,
+				     device_modify);
+}
+EXPORT_SYMBOL(ib_modify_device);
+
+/**
+ * ib_modify_port - Modifies the attributes for the specified port.
+ * @device: The device to modify.
+ * @port_num: The number of the port to modify.
+ * @port_modify_mask: Mask used to specify which attributes of the port
+ *   to change.
+ * @port_modify: New attribute values for the port.
+ *
+ * ib_modify_port() changes a port's attributes as specified by the
+ * @port_modify_mask and @port_modify structure.
+ */
+int ib_modify_port(struct ib_device *device,
+		   u8 port_num, int port_modify_mask,
+		   struct ib_port_modify *port_modify)
+{
+	if (!device->modify_port)
+		return -ENOSYS;
+
+	if (port_num < start_port(device) || port_num > end_port(device))
+		return -EINVAL;
+
+	return device->modify_port(device, port_num, port_modify_mask,
+				   port_modify);
+}
+EXPORT_SYMBOL(ib_modify_port);
+
+/**
+ * ib_find_gid - Returns the port number and GID table index where
+ *   a specified GID value occurs.
+ * @device: The device to query.
+ * @gid: The GID value to search for.
+ * @port_num: The port number of the device where the GID value was found.
+ * @index: The index into the GID table where the GID was found.  This
+ *   parameter may be NULL.
+ */
+int ib_find_gid(struct ib_device *device, union ib_gid *gid,
+		u8 *port_num, u16 *index)
+{
+	union ib_gid tmp_gid;
+	int ret, port, i;
+
+	for (port = start_port(device); port <= end_port(device); ++port) {
+		for (i = 0; i < device->gid_tbl_len[port - start_port(device)]; ++i) {
+			ret = ib_query_gid(device, port, i, &tmp_gid);
+			if (ret)
+				return ret;
+			if (!memcmp(&tmp_gid, gid, sizeof *gid)) {
+				*port_num = port;
+				if (index)
+					*index = i;
+				return 0;
+			}
+		}
+	}
+
+	return -ENOENT;
+}
+EXPORT_SYMBOL(ib_find_gid);
+
+/**
+ * ib_find_pkey - Returns the PKey table index where a specified
+ *   PKey value occurs.
+ * @device: The device to query.
+ * @port_num: The port number of the device to search for the PKey.
+ * @pkey: The PKey value to search for.
+ * @index: The index into the PKey table where the PKey was found.
+ */
+int ib_find_pkey(struct ib_device *device,
+		 u8 port_num, u16 pkey, u16 *index)
+{
+	int ret, i;
+	u16 tmp_pkey;
+	int partial_ix = -1;
+
+	for (i = 0; i < device->pkey_tbl_len[port_num - start_port(device)]; ++i) {
+		ret = ib_query_pkey(device, port_num, i, &tmp_pkey);
+		if (ret)
+			return ret;
+		if ((pkey & 0x7fff) == (tmp_pkey & 0x7fff)) {
+			/* if there is full-member pkey take it.*/
+			if (tmp_pkey & 0x8000) {
+				*index = i;
+				return 0;
+			}
+			if (partial_ix < 0)
+				partial_ix = i;
+		}
+	}
+
+	/*no full-member, if exists take the limited*/
+	if (partial_ix >= 0) {
+		*index = partial_ix;
+		return 0;
+	}
+	return -ENOENT;
+}
+EXPORT_SYMBOL(ib_find_pkey);
+
+static int __init ib_core_init(void)
+{
+	int ret;
+
+	ib_wq = alloc_workqueue("infiniband", 0, 0);
+	if (!ib_wq)
+		return -ENOMEM;
+
+	ret = ib_sysfs_setup();
+	if (ret) {
+		printk(KERN_WARNING "Couldn't create InfiniBand device class\n");
+		goto err;
+	}
+
+	ret = ibnl_init();
+	if (ret) {
+		printk(KERN_WARNING "Couldn't init IB netlink interface\n");
+		goto err_sysfs;
+	}
+
+	ret = ib_cache_setup();
+	if (ret) {
+		printk(KERN_WARNING "Couldn't set up InfiniBand P_Key/GID cache\n");
+		goto err_nl;
+	}
+
+	return 0;
+
+err_nl:
+	ibnl_cleanup();
+
+err_sysfs:
+	ib_sysfs_cleanup();
+
+err:
+	destroy_workqueue(ib_wq);
+	return ret;
+}
+
+static void __exit ib_core_cleanup(void)
+{
+	ib_cache_cleanup();
+	ibnl_cleanup();
+	ib_sysfs_cleanup();
+	/* Make sure that any pending umem accounting work is done. */
+	destroy_workqueue(ib_wq);
+}
+
+module_init(ib_core_init);
+module_exit(ib_core_cleanup);
diff --git a/drivers/infiniband/core/fmr_pool.c b/drivers/infiniband/core/fmr_pool.c
new file mode 100644
index 0000000..9f5ad7c
--- /dev/null
+++ b/drivers/infiniband/core/fmr_pool.c
@@ -0,0 +1,544 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/spinlock.h>
+#include <linux/export.h>
+#include <linux/slab.h>
+#include <linux/jhash.h>
+#include <linux/kthread.h>
+
+#include <rdma/ib_fmr_pool.h>
+
+#include "core_priv.h"
+
+#define PFX "fmr_pool: "
+
+enum {
+	IB_FMR_MAX_REMAPS = 32,
+
+	IB_FMR_HASH_BITS  = 8,
+	IB_FMR_HASH_SIZE  = 1 << IB_FMR_HASH_BITS,
+	IB_FMR_HASH_MASK  = IB_FMR_HASH_SIZE - 1
+};
+
+/*
+ * If an FMR is not in use, then the list member will point to either
+ * its pool's free_list (if the FMR can be mapped again; that is,
+ * remap_count < pool->max_remaps) or its pool's dirty_list (if the
+ * FMR needs to be unmapped before being remapped).  In either of
+ * these cases it is a bug if the ref_count is not 0.  In other words,
+ * if ref_count is > 0, then the list member must not be linked into
+ * either free_list or dirty_list.
+ *
+ * The cache_node member is used to link the FMR into a cache bucket
+ * (if caching is enabled).  This is independent of the reference
+ * count of the FMR.  When a valid FMR is released, its ref_count is
+ * decremented, and if ref_count reaches 0, the FMR is placed in
+ * either free_list or dirty_list as appropriate.  However, it is not
+ * removed from the cache and may be "revived" if a call to
+ * ib_fmr_register_physical() occurs before the FMR is remapped.  In
+ * this case we just increment the ref_count and remove the FMR from
+ * free_list/dirty_list.
+ *
+ * Before we remap an FMR from free_list, we remove it from the cache
+ * (to prevent another user from obtaining a stale FMR).  When an FMR
+ * is released, we add it to the tail of the free list, so that our
+ * cache eviction policy is "least recently used."
+ *
+ * All manipulation of ref_count, list and cache_node is protected by
+ * pool_lock to maintain consistency.
+ */
+
+struct ib_fmr_pool {
+	spinlock_t                pool_lock;
+
+	int                       pool_size;
+	int                       max_pages;
+	int			  max_remaps;
+	int                       dirty_watermark;
+	int                       dirty_len;
+	struct list_head          free_list;
+	struct list_head          dirty_list;
+	struct hlist_head        *cache_bucket;
+
+	void                     (*flush_function)(struct ib_fmr_pool *pool,
+						   void *              arg);
+	void                     *flush_arg;
+
+	struct task_struct       *thread;
+
+	atomic_t                  req_ser;
+	atomic_t                  flush_ser;
+
+	wait_queue_head_t         force_wait;
+};
+
+static inline u32 ib_fmr_hash(u64 first_page)
+{
+	return jhash_2words((u32) first_page, (u32) (first_page >> 32), 0) &
+		(IB_FMR_HASH_SIZE - 1);
+}
+
+/* Caller must hold pool_lock */
+static inline struct ib_pool_fmr *ib_fmr_cache_lookup(struct ib_fmr_pool *pool,
+						      u64 *page_list,
+						      int  page_list_len,
+						      u64  io_virtual_address)
+{
+	struct hlist_head *bucket;
+	struct ib_pool_fmr *fmr;
+
+	if (!pool->cache_bucket)
+		return NULL;
+
+	bucket = pool->cache_bucket + ib_fmr_hash(*page_list);
+
+	hlist_for_each_entry(fmr, bucket, cache_node)
+		if (io_virtual_address == fmr->io_virtual_address &&
+		    page_list_len      == fmr->page_list_len      &&
+		    !memcmp(page_list, fmr->page_list,
+			    page_list_len * sizeof *page_list))
+			return fmr;
+
+	return NULL;
+}
+
+static void ib_fmr_batch_release(struct ib_fmr_pool *pool)
+{
+	int                 ret;
+	struct ib_pool_fmr *fmr;
+	LIST_HEAD(unmap_list);
+	LIST_HEAD(fmr_list);
+
+	spin_lock_irq(&pool->pool_lock);
+
+	list_for_each_entry(fmr, &pool->dirty_list, list) {
+		hlist_del_init(&fmr->cache_node);
+		fmr->remap_count = 0;
+		list_add_tail(&fmr->fmr->list, &fmr_list);
+
+#ifdef DEBUG
+		if (fmr->ref_count !=0) {
+			printk(KERN_WARNING PFX "Unmapping FMR 0x%08x with ref count %d\n",
+			       fmr, fmr->ref_count);
+		}
+#endif
+	}
+
+	list_splice_init(&pool->dirty_list, &unmap_list);
+	pool->dirty_len = 0;
+
+	spin_unlock_irq(&pool->pool_lock);
+
+	if (list_empty(&unmap_list)) {
+		return;
+	}
+
+	ret = ib_unmap_fmr(&fmr_list);
+	if (ret)
+		printk(KERN_WARNING PFX "ib_unmap_fmr returned %d\n", ret);
+
+	spin_lock_irq(&pool->pool_lock);
+	list_splice(&unmap_list, &pool->free_list);
+	spin_unlock_irq(&pool->pool_lock);
+}
+
+static int ib_fmr_cleanup_thread(void *pool_ptr)
+{
+	struct ib_fmr_pool *pool = pool_ptr;
+
+	do {
+		if (atomic_read(&pool->flush_ser) - atomic_read(&pool->req_ser) < 0) {
+			ib_fmr_batch_release(pool);
+
+			atomic_inc(&pool->flush_ser);
+			wake_up_interruptible(&pool->force_wait);
+
+			if (pool->flush_function)
+				pool->flush_function(pool, pool->flush_arg);
+		}
+
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (atomic_read(&pool->flush_ser) - atomic_read(&pool->req_ser) >= 0 &&
+		    !kthread_should_stop())
+			schedule();
+		__set_current_state(TASK_RUNNING);
+	} while (!kthread_should_stop());
+
+	return 0;
+}
+
+/**
+ * ib_create_fmr_pool - Create an FMR pool
+ * @pd:Protection domain for FMRs
+ * @params:FMR pool parameters
+ *
+ * Create a pool of FMRs.  Return value is pointer to new pool or
+ * error code if creation failed.
+ */
+struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd             *pd,
+				       struct ib_fmr_pool_param *params)
+{
+	struct ib_device   *device;
+	struct ib_fmr_pool *pool;
+	struct ib_device_attr *attr;
+	int i;
+	int ret;
+	int max_remaps;
+
+	if (!params)
+		return ERR_PTR(-EINVAL);
+
+	device = pd->device;
+	if (!device->alloc_fmr    || !device->dealloc_fmr  ||
+	    !device->map_phys_fmr || !device->unmap_fmr) {
+		printk(KERN_INFO PFX "Device %s does not support FMRs\n",
+		       device->name);
+		return ERR_PTR(-ENOSYS);
+	}
+
+	attr = kmalloc(sizeof *attr, GFP_KERNEL);
+	if (!attr) {
+		printk(KERN_WARNING PFX "couldn't allocate device attr struct\n");
+		return ERR_PTR(-ENOMEM);
+	}
+
+	ret = ib_query_device(device, attr);
+	if (ret) {
+		printk(KERN_WARNING PFX "couldn't query device: %d\n", ret);
+		kfree(attr);
+		return ERR_PTR(ret);
+	}
+
+	if (!attr->max_map_per_fmr)
+		max_remaps = IB_FMR_MAX_REMAPS;
+	else
+		max_remaps = attr->max_map_per_fmr;
+
+	kfree(attr);
+
+	pool = kmalloc(sizeof *pool, GFP_KERNEL);
+	if (!pool) {
+		printk(KERN_WARNING PFX "couldn't allocate pool struct\n");
+		return ERR_PTR(-ENOMEM);
+	}
+
+	pool->cache_bucket   = NULL;
+
+	pool->flush_function = params->flush_function;
+	pool->flush_arg      = params->flush_arg;
+
+	INIT_LIST_HEAD(&pool->free_list);
+	INIT_LIST_HEAD(&pool->dirty_list);
+
+	if (params->cache) {
+		pool->cache_bucket =
+			kmalloc(IB_FMR_HASH_SIZE * sizeof *pool->cache_bucket,
+				GFP_KERNEL);
+		if (!pool->cache_bucket) {
+			printk(KERN_WARNING PFX "Failed to allocate cache in pool\n");
+			ret = -ENOMEM;
+			goto out_free_pool;
+		}
+
+		for (i = 0; i < IB_FMR_HASH_SIZE; ++i)
+			INIT_HLIST_HEAD(pool->cache_bucket + i);
+	}
+
+	pool->pool_size       = 0;
+	pool->max_pages       = params->max_pages_per_fmr;
+	pool->max_remaps      = max_remaps;
+	pool->dirty_watermark = params->dirty_watermark;
+	pool->dirty_len       = 0;
+	spin_lock_init(&pool->pool_lock);
+	atomic_set(&pool->req_ser,   0);
+	atomic_set(&pool->flush_ser, 0);
+	init_waitqueue_head(&pool->force_wait);
+
+	pool->thread = kthread_run(ib_fmr_cleanup_thread,
+				   pool,
+				   "ib_fmr(%s)",
+				   device->name);
+	if (IS_ERR(pool->thread)) {
+		printk(KERN_WARNING PFX "couldn't start cleanup thread\n");
+		ret = PTR_ERR(pool->thread);
+		goto out_free_pool;
+	}
+
+	{
+		struct ib_pool_fmr *fmr;
+		struct ib_fmr_attr fmr_attr = {
+			.max_pages  = params->max_pages_per_fmr,
+			.max_maps   = pool->max_remaps,
+			.page_shift = params->page_shift
+		};
+		int bytes_per_fmr = sizeof *fmr;
+
+		if (pool->cache_bucket)
+			bytes_per_fmr += params->max_pages_per_fmr * sizeof (u64);
+
+		for (i = 0; i < params->pool_size; ++i) {
+			fmr = kmalloc(bytes_per_fmr, GFP_KERNEL);
+			if (!fmr) {
+				printk(KERN_WARNING PFX "failed to allocate fmr "
+				       "struct for FMR %d\n", i);
+				goto out_fail;
+			}
+
+			fmr->pool             = pool;
+			fmr->remap_count      = 0;
+			fmr->ref_count        = 0;
+			INIT_HLIST_NODE(&fmr->cache_node);
+
+			fmr->fmr = ib_alloc_fmr(pd, params->access, &fmr_attr);
+			if (IS_ERR(fmr->fmr)) {
+				printk(KERN_WARNING PFX "fmr_create failed "
+				       "for FMR %d\n", i);
+				kfree(fmr);
+				goto out_fail;
+			}
+
+			list_add_tail(&fmr->list, &pool->free_list);
+			++pool->pool_size;
+		}
+	}
+
+	return pool;
+
+ out_free_pool:
+	kfree(pool->cache_bucket);
+	kfree(pool);
+
+	return ERR_PTR(ret);
+
+ out_fail:
+	ib_destroy_fmr_pool(pool);
+
+	return ERR_PTR(-ENOMEM);
+}
+EXPORT_SYMBOL(ib_create_fmr_pool);
+
+/**
+ * ib_destroy_fmr_pool - Free FMR pool
+ * @pool:FMR pool to free
+ *
+ * Destroy an FMR pool and free all associated resources.
+ */
+void ib_destroy_fmr_pool(struct ib_fmr_pool *pool)
+{
+	struct ib_pool_fmr *fmr;
+	struct ib_pool_fmr *tmp;
+	LIST_HEAD(fmr_list);
+	int                 i;
+
+	kthread_stop(pool->thread);
+	ib_fmr_batch_release(pool);
+
+	i = 0;
+	list_for_each_entry_safe(fmr, tmp, &pool->free_list, list) {
+		if (fmr->remap_count) {
+			INIT_LIST_HEAD(&fmr_list);
+			list_add_tail(&fmr->fmr->list, &fmr_list);
+			ib_unmap_fmr(&fmr_list);
+		}
+		ib_dealloc_fmr(fmr->fmr);
+		list_del(&fmr->list);
+		kfree(fmr);
+		++i;
+	}
+
+	if (i < pool->pool_size)
+		printk(KERN_WARNING PFX "pool still has %d regions registered\n",
+		       pool->pool_size - i);
+
+	kfree(pool->cache_bucket);
+	kfree(pool);
+}
+EXPORT_SYMBOL(ib_destroy_fmr_pool);
+
+/**
+ * ib_flush_fmr_pool - Invalidate all unmapped FMRs
+ * @pool:FMR pool to flush
+ *
+ * Ensure that all unmapped FMRs are fully invalidated.
+ */
+int ib_flush_fmr_pool(struct ib_fmr_pool *pool)
+{
+	int serial;
+	struct ib_pool_fmr *fmr, *next;
+
+	/*
+	 * The free_list holds FMRs that may have been used
+	 * but have not been remapped enough times to be dirty.
+	 * Put them on the dirty list now so that the cleanup
+	 * thread will reap them too.
+	 */
+	spin_lock_irq(&pool->pool_lock);
+	list_for_each_entry_safe(fmr, next, &pool->free_list, list) {
+		if (fmr->remap_count > 0)
+			list_move(&fmr->list, &pool->dirty_list);
+	}
+	spin_unlock_irq(&pool->pool_lock);
+
+	serial = atomic_inc_return(&pool->req_ser);
+	wake_up_process(pool->thread);
+
+	if (wait_event_interruptible(pool->force_wait,
+				     atomic_read(&pool->flush_ser) - serial >= 0))
+		return -EINTR;
+
+	return 0;
+}
+EXPORT_SYMBOL(ib_flush_fmr_pool);
+
+/**
+ * ib_fmr_pool_map_phys -
+ * @pool:FMR pool to allocate FMR from
+ * @page_list:List of pages to map
+ * @list_len:Number of pages in @page_list
+ * @io_virtual_address:I/O virtual address for new FMR
+ *
+ * Map an FMR from an FMR pool.
+ */
+struct ib_pool_fmr *ib_fmr_pool_map_phys(struct ib_fmr_pool *pool_handle,
+					 u64                *page_list,
+					 int                 list_len,
+					 u64                 io_virtual_address)
+{
+	struct ib_fmr_pool *pool = pool_handle;
+	struct ib_pool_fmr *fmr;
+	unsigned long       flags;
+	int                 result;
+
+	if (list_len < 1 || list_len > pool->max_pages)
+		return ERR_PTR(-EINVAL);
+
+	spin_lock_irqsave(&pool->pool_lock, flags);
+	fmr = ib_fmr_cache_lookup(pool,
+				  page_list,
+				  list_len,
+				  io_virtual_address);
+	if (fmr) {
+		/* found in cache */
+		++fmr->ref_count;
+		if (fmr->ref_count == 1) {
+			list_del(&fmr->list);
+		}
+
+		spin_unlock_irqrestore(&pool->pool_lock, flags);
+
+		return fmr;
+	}
+
+	if (list_empty(&pool->free_list)) {
+		spin_unlock_irqrestore(&pool->pool_lock, flags);
+		return ERR_PTR(-EAGAIN);
+	}
+
+	fmr = list_entry(pool->free_list.next, struct ib_pool_fmr, list);
+	list_del(&fmr->list);
+	hlist_del_init(&fmr->cache_node);
+	spin_unlock_irqrestore(&pool->pool_lock, flags);
+
+	result = ib_map_phys_fmr(fmr->fmr, page_list, list_len,
+				 io_virtual_address);
+
+	if (result) {
+		spin_lock_irqsave(&pool->pool_lock, flags);
+		list_add(&fmr->list, &pool->free_list);
+		spin_unlock_irqrestore(&pool->pool_lock, flags);
+
+		printk(KERN_WARNING PFX "fmr_map returns %d\n", result);
+
+		return ERR_PTR(result);
+	}
+
+	++fmr->remap_count;
+	fmr->ref_count = 1;
+
+	if (pool->cache_bucket) {
+		fmr->io_virtual_address = io_virtual_address;
+		fmr->page_list_len      = list_len;
+		memcpy(fmr->page_list, page_list, list_len * sizeof(*page_list));
+
+		spin_lock_irqsave(&pool->pool_lock, flags);
+		hlist_add_head(&fmr->cache_node,
+			       pool->cache_bucket + ib_fmr_hash(fmr->page_list[0]));
+		spin_unlock_irqrestore(&pool->pool_lock, flags);
+	}
+
+	return fmr;
+}
+EXPORT_SYMBOL(ib_fmr_pool_map_phys);
+
+/**
+ * ib_fmr_pool_unmap - Unmap FMR
+ * @fmr:FMR to unmap
+ *
+ * Unmap an FMR.  The FMR mapping may remain valid until the FMR is
+ * reused (or until ib_flush_fmr_pool() is called).
+ */
+int ib_fmr_pool_unmap(struct ib_pool_fmr *fmr)
+{
+	struct ib_fmr_pool *pool;
+	unsigned long flags;
+
+	pool = fmr->pool;
+
+	spin_lock_irqsave(&pool->pool_lock, flags);
+
+	--fmr->ref_count;
+	if (!fmr->ref_count) {
+		if (fmr->remap_count < pool->max_remaps) {
+			list_add_tail(&fmr->list, &pool->free_list);
+		} else {
+			list_add_tail(&fmr->list, &pool->dirty_list);
+			if (++pool->dirty_len >= pool->dirty_watermark) {
+				atomic_inc(&pool->req_ser);
+				wake_up_process(pool->thread);
+			}
+		}
+	}
+
+#ifdef DEBUG
+	if (fmr->ref_count < 0)
+		printk(KERN_WARNING PFX "FMR %p has ref count %d < 0\n",
+		       fmr, fmr->ref_count);
+#endif
+
+	spin_unlock_irqrestore(&pool->pool_lock, flags);
+
+	return 0;
+}
+EXPORT_SYMBOL(ib_fmr_pool_unmap);
diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c
new file mode 100644
index 0000000..ff9163d
--- /dev/null
+++ b/drivers/infiniband/core/iwcm.c
@@ -0,0 +1,1069 @@
+/*
+ * Copyright (c) 2004, 2005 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation.  All rights reserved.
+ * Copyright (c) 2004, 2005 Voltaire Corporation.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ * Copyright (c) 2005 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/dma-mapping.h>
+#include <linux/err.h>
+#include <linux/idr.h>
+#include <linux/interrupt.h>
+#include <linux/rbtree.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/workqueue.h>
+#include <linux/completion.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/sysctl.h>
+
+#include <rdma/iw_cm.h>
+#include <rdma/ib_addr.h>
+
+#include "iwcm.h"
+
+MODULE_AUTHOR("Tom Tucker");
+MODULE_DESCRIPTION("iWARP CM");
+MODULE_LICENSE("Dual BSD/GPL");
+
+static struct workqueue_struct *iwcm_wq;
+struct iwcm_work {
+	struct work_struct work;
+	struct iwcm_id_private *cm_id;
+	struct list_head list;
+	struct iw_cm_event event;
+	struct list_head free_list;
+};
+
+static unsigned int default_backlog = 256;
+
+static struct ctl_table_header *iwcm_ctl_table_hdr;
+static struct ctl_table iwcm_ctl_table[] = {
+	{
+		.procname	= "default_backlog",
+		.data		= &default_backlog,
+		.maxlen		= sizeof(default_backlog),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{ }
+};
+
+/*
+ * The following services provide a mechanism for pre-allocating iwcm_work
+ * elements.  The design pre-allocates them  based on the cm_id type:
+ *	LISTENING IDS: 	Get enough elements preallocated to handle the
+ *			listen backlog.
+ *	ACTIVE IDS:	4: CONNECT_REPLY, ESTABLISHED, DISCONNECT, CLOSE
+ *	PASSIVE IDS:	3: ESTABLISHED, DISCONNECT, CLOSE
+ *
+ * Allocating them in connect and listen avoids having to deal
+ * with allocation failures on the event upcall from the provider (which
+ * is called in the interrupt context).
+ *
+ * One exception is when creating the cm_id for incoming connection requests.
+ * There are two cases:
+ * 1) in the event upcall, cm_event_handler(), for a listening cm_id.  If
+ *    the backlog is exceeded, then no more connection request events will
+ *    be processed.  cm_event_handler() returns -ENOMEM in this case.  Its up
+ *    to the provider to reject the connection request.
+ * 2) in the connection request workqueue handler, cm_conn_req_handler().
+ *    If work elements cannot be allocated for the new connect request cm_id,
+ *    then IWCM will call the provider reject method.  This is ok since
+ *    cm_conn_req_handler() runs in the workqueue thread context.
+ */
+
+static struct iwcm_work *get_work(struct iwcm_id_private *cm_id_priv)
+{
+	struct iwcm_work *work;
+
+	if (list_empty(&cm_id_priv->work_free_list))
+		return NULL;
+	work = list_entry(cm_id_priv->work_free_list.next, struct iwcm_work,
+			  free_list);
+	list_del_init(&work->free_list);
+	return work;
+}
+
+static void put_work(struct iwcm_work *work)
+{
+	list_add(&work->free_list, &work->cm_id->work_free_list);
+}
+
+static void dealloc_work_entries(struct iwcm_id_private *cm_id_priv)
+{
+	struct list_head *e, *tmp;
+
+	list_for_each_safe(e, tmp, &cm_id_priv->work_free_list)
+		kfree(list_entry(e, struct iwcm_work, free_list));
+}
+
+static int alloc_work_entries(struct iwcm_id_private *cm_id_priv, int count)
+{
+	struct iwcm_work *work;
+
+	BUG_ON(!list_empty(&cm_id_priv->work_free_list));
+	while (count--) {
+		work = kmalloc(sizeof(struct iwcm_work), GFP_KERNEL);
+		if (!work) {
+			dealloc_work_entries(cm_id_priv);
+			return -ENOMEM;
+		}
+		work->cm_id = cm_id_priv;
+		INIT_LIST_HEAD(&work->list);
+		put_work(work);
+	}
+	return 0;
+}
+
+/*
+ * Save private data from incoming connection requests to
+ * iw_cm_event, so the low level driver doesn't have to. Adjust
+ * the event ptr to point to the local copy.
+ */
+static int copy_private_data(struct iw_cm_event *event)
+{
+	void *p;
+
+	p = kmemdup(event->private_data, event->private_data_len, GFP_ATOMIC);
+	if (!p)
+		return -ENOMEM;
+	event->private_data = p;
+	return 0;
+}
+
+static void free_cm_id(struct iwcm_id_private *cm_id_priv)
+{
+	dealloc_work_entries(cm_id_priv);
+	kfree(cm_id_priv);
+}
+
+/*
+ * Release a reference on cm_id. If the last reference is being
+ * released, enable the waiting thread (in iw_destroy_cm_id) to
+ * get woken up, and return 1 if a thread is already waiting.
+ */
+static int iwcm_deref_id(struct iwcm_id_private *cm_id_priv)
+{
+	BUG_ON(atomic_read(&cm_id_priv->refcount)==0);
+	if (atomic_dec_and_test(&cm_id_priv->refcount)) {
+		BUG_ON(!list_empty(&cm_id_priv->work_list));
+		complete(&cm_id_priv->destroy_comp);
+		return 1;
+	}
+
+	return 0;
+}
+
+static void add_ref(struct iw_cm_id *cm_id)
+{
+	struct iwcm_id_private *cm_id_priv;
+	cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+	atomic_inc(&cm_id_priv->refcount);
+}
+
+static void rem_ref(struct iw_cm_id *cm_id)
+{
+	struct iwcm_id_private *cm_id_priv;
+	int cb_destroy;
+
+	cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+
+	/*
+	 * Test bit before deref in case the cm_id gets freed on another
+	 * thread.
+	 */
+	cb_destroy = test_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags);
+	if (iwcm_deref_id(cm_id_priv) && cb_destroy) {
+		BUG_ON(!list_empty(&cm_id_priv->work_list));
+		free_cm_id(cm_id_priv);
+	}
+}
+
+static int cm_event_handler(struct iw_cm_id *cm_id, struct iw_cm_event *event);
+
+struct iw_cm_id *iw_create_cm_id(struct ib_device *device,
+				 iw_cm_handler cm_handler,
+				 void *context)
+{
+	struct iwcm_id_private *cm_id_priv;
+
+	cm_id_priv = kzalloc(sizeof(*cm_id_priv), GFP_KERNEL);
+	if (!cm_id_priv)
+		return ERR_PTR(-ENOMEM);
+
+	cm_id_priv->state = IW_CM_STATE_IDLE;
+	cm_id_priv->id.device = device;
+	cm_id_priv->id.cm_handler = cm_handler;
+	cm_id_priv->id.context = context;
+	cm_id_priv->id.event_handler = cm_event_handler;
+	cm_id_priv->id.add_ref = add_ref;
+	cm_id_priv->id.rem_ref = rem_ref;
+	spin_lock_init(&cm_id_priv->lock);
+	atomic_set(&cm_id_priv->refcount, 1);
+	init_waitqueue_head(&cm_id_priv->connect_wait);
+	init_completion(&cm_id_priv->destroy_comp);
+	INIT_LIST_HEAD(&cm_id_priv->work_list);
+	INIT_LIST_HEAD(&cm_id_priv->work_free_list);
+
+	return &cm_id_priv->id;
+}
+EXPORT_SYMBOL(iw_create_cm_id);
+
+
+static int iwcm_modify_qp_err(struct ib_qp *qp)
+{
+	struct ib_qp_attr qp_attr;
+
+	if (!qp)
+		return -EINVAL;
+
+	qp_attr.qp_state = IB_QPS_ERR;
+	return ib_modify_qp(qp, &qp_attr, IB_QP_STATE);
+}
+
+/*
+ * This is really the RDMAC CLOSING state. It is most similar to the
+ * IB SQD QP state.
+ */
+static int iwcm_modify_qp_sqd(struct ib_qp *qp)
+{
+	struct ib_qp_attr qp_attr;
+
+	BUG_ON(qp == NULL);
+	qp_attr.qp_state = IB_QPS_SQD;
+	return ib_modify_qp(qp, &qp_attr, IB_QP_STATE);
+}
+
+/*
+ * CM_ID <-- CLOSING
+ *
+ * Block if a passive or active connection is currently being processed. Then
+ * process the event as follows:
+ * - If we are ESTABLISHED, move to CLOSING and modify the QP state
+ *   based on the abrupt flag
+ * - If the connection is already in the CLOSING or IDLE state, the peer is
+ *   disconnecting concurrently with us and we've already seen the
+ *   DISCONNECT event -- ignore the request and return 0
+ * - Disconnect on a listening endpoint returns -EINVAL
+ */
+int iw_cm_disconnect(struct iw_cm_id *cm_id, int abrupt)
+{
+	struct iwcm_id_private *cm_id_priv;
+	unsigned long flags;
+	int ret = 0;
+	struct ib_qp *qp = NULL;
+
+	cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+	/* Wait if we're currently in a connect or accept downcall */
+	wait_event(cm_id_priv->connect_wait,
+		   !test_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags));
+
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	switch (cm_id_priv->state) {
+	case IW_CM_STATE_ESTABLISHED:
+		cm_id_priv->state = IW_CM_STATE_CLOSING;
+
+		/* QP could be <nul> for user-mode client */
+		if (cm_id_priv->qp)
+			qp = cm_id_priv->qp;
+		else
+			ret = -EINVAL;
+		break;
+	case IW_CM_STATE_LISTEN:
+		ret = -EINVAL;
+		break;
+	case IW_CM_STATE_CLOSING:
+		/* remote peer closed first */
+	case IW_CM_STATE_IDLE:
+		/* accept or connect returned !0 */
+		break;
+	case IW_CM_STATE_CONN_RECV:
+		/*
+		 * App called disconnect before/without calling accept after
+		 * connect_request event delivered.
+		 */
+		break;
+	case IW_CM_STATE_CONN_SENT:
+		/* Can only get here if wait above fails */
+	default:
+		BUG();
+	}
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+	if (qp) {
+		if (abrupt)
+			ret = iwcm_modify_qp_err(qp);
+		else
+			ret = iwcm_modify_qp_sqd(qp);
+
+		/*
+		 * If both sides are disconnecting the QP could
+		 * already be in ERR or SQD states
+		 */
+		ret = 0;
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(iw_cm_disconnect);
+
+/*
+ * CM_ID <-- DESTROYING
+ *
+ * Clean up all resources associated with the connection and release
+ * the initial reference taken by iw_create_cm_id.
+ */
+static void destroy_cm_id(struct iw_cm_id *cm_id)
+{
+	struct iwcm_id_private *cm_id_priv;
+	unsigned long flags;
+
+	cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+	/*
+	 * Wait if we're currently in a connect or accept downcall. A
+	 * listening endpoint should never block here.
+	 */
+	wait_event(cm_id_priv->connect_wait,
+		   !test_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags));
+
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	switch (cm_id_priv->state) {
+	case IW_CM_STATE_LISTEN:
+		cm_id_priv->state = IW_CM_STATE_DESTROYING;
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+		/* destroy the listening endpoint */
+		cm_id->device->iwcm->destroy_listen(cm_id);
+		spin_lock_irqsave(&cm_id_priv->lock, flags);
+		break;
+	case IW_CM_STATE_ESTABLISHED:
+		cm_id_priv->state = IW_CM_STATE_DESTROYING;
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+		/* Abrupt close of the connection */
+		(void)iwcm_modify_qp_err(cm_id_priv->qp);
+		spin_lock_irqsave(&cm_id_priv->lock, flags);
+		break;
+	case IW_CM_STATE_IDLE:
+	case IW_CM_STATE_CLOSING:
+		cm_id_priv->state = IW_CM_STATE_DESTROYING;
+		break;
+	case IW_CM_STATE_CONN_RECV:
+		/*
+		 * App called destroy before/without calling accept after
+		 * receiving connection request event notification or
+		 * returned non zero from the event callback function.
+		 * In either case, must tell the provider to reject.
+		 */
+		cm_id_priv->state = IW_CM_STATE_DESTROYING;
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+		cm_id->device->iwcm->reject(cm_id, NULL, 0);
+		spin_lock_irqsave(&cm_id_priv->lock, flags);
+		break;
+	case IW_CM_STATE_CONN_SENT:
+	case IW_CM_STATE_DESTROYING:
+	default:
+		BUG();
+		break;
+	}
+	if (cm_id_priv->qp) {
+		cm_id_priv->id.device->iwcm->rem_ref(cm_id_priv->qp);
+		cm_id_priv->qp = NULL;
+	}
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+	(void)iwcm_deref_id(cm_id_priv);
+}
+
+/*
+ * This function is only called by the application thread and cannot
+ * be called by the event thread. The function will wait for all
+ * references to be released on the cm_id and then kfree the cm_id
+ * object.
+ */
+void iw_destroy_cm_id(struct iw_cm_id *cm_id)
+{
+	struct iwcm_id_private *cm_id_priv;
+
+	cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+	BUG_ON(test_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags));
+
+	destroy_cm_id(cm_id);
+
+	wait_for_completion(&cm_id_priv->destroy_comp);
+
+	free_cm_id(cm_id_priv);
+}
+EXPORT_SYMBOL(iw_destroy_cm_id);
+
+/*
+ * CM_ID <-- LISTEN
+ *
+ * Start listening for connect requests. Generates one CONNECT_REQUEST
+ * event for each inbound connect request.
+ */
+int iw_cm_listen(struct iw_cm_id *cm_id, int backlog)
+{
+	struct iwcm_id_private *cm_id_priv;
+	unsigned long flags;
+	int ret;
+
+	cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+
+	if (!backlog)
+		backlog = default_backlog;
+
+	ret = alloc_work_entries(cm_id_priv, backlog);
+	if (ret)
+		return ret;
+
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	switch (cm_id_priv->state) {
+	case IW_CM_STATE_IDLE:
+		cm_id_priv->state = IW_CM_STATE_LISTEN;
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+		ret = cm_id->device->iwcm->create_listen(cm_id, backlog);
+		if (ret)
+			cm_id_priv->state = IW_CM_STATE_IDLE;
+		spin_lock_irqsave(&cm_id_priv->lock, flags);
+		break;
+	default:
+		ret = -EINVAL;
+	}
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL(iw_cm_listen);
+
+/*
+ * CM_ID <-- IDLE
+ *
+ * Rejects an inbound connection request. No events are generated.
+ */
+int iw_cm_reject(struct iw_cm_id *cm_id,
+		 const void *private_data,
+		 u8 private_data_len)
+{
+	struct iwcm_id_private *cm_id_priv;
+	unsigned long flags;
+	int ret;
+
+	cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+	set_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	if (cm_id_priv->state != IW_CM_STATE_CONN_RECV) {
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+		clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+		wake_up_all(&cm_id_priv->connect_wait);
+		return -EINVAL;
+	}
+	cm_id_priv->state = IW_CM_STATE_IDLE;
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+	ret = cm_id->device->iwcm->reject(cm_id, private_data,
+					  private_data_len);
+
+	clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+	wake_up_all(&cm_id_priv->connect_wait);
+
+	return ret;
+}
+EXPORT_SYMBOL(iw_cm_reject);
+
+/*
+ * CM_ID <-- ESTABLISHED
+ *
+ * Accepts an inbound connection request and generates an ESTABLISHED
+ * event. Callers of iw_cm_disconnect and iw_destroy_cm_id will block
+ * until the ESTABLISHED event is received from the provider.
+ */
+int iw_cm_accept(struct iw_cm_id *cm_id,
+		 struct iw_cm_conn_param *iw_param)
+{
+	struct iwcm_id_private *cm_id_priv;
+	struct ib_qp *qp;
+	unsigned long flags;
+	int ret;
+
+	cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+	set_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	if (cm_id_priv->state != IW_CM_STATE_CONN_RECV) {
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+		clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+		wake_up_all(&cm_id_priv->connect_wait);
+		return -EINVAL;
+	}
+	/* Get the ib_qp given the QPN */
+	qp = cm_id->device->iwcm->get_qp(cm_id->device, iw_param->qpn);
+	if (!qp) {
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+		clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+		wake_up_all(&cm_id_priv->connect_wait);
+		return -EINVAL;
+	}
+	cm_id->device->iwcm->add_ref(qp);
+	cm_id_priv->qp = qp;
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+	ret = cm_id->device->iwcm->accept(cm_id, iw_param);
+	if (ret) {
+		/* An error on accept precludes provider events */
+		BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_RECV);
+		cm_id_priv->state = IW_CM_STATE_IDLE;
+		spin_lock_irqsave(&cm_id_priv->lock, flags);
+		if (cm_id_priv->qp) {
+			cm_id->device->iwcm->rem_ref(qp);
+			cm_id_priv->qp = NULL;
+		}
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+		clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+		wake_up_all(&cm_id_priv->connect_wait);
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(iw_cm_accept);
+
+/*
+ * Active Side: CM_ID <-- CONN_SENT
+ *
+ * If successful, results in the generation of a CONNECT_REPLY
+ * event. iw_cm_disconnect and iw_cm_destroy will block until the
+ * CONNECT_REPLY event is received from the provider.
+ */
+int iw_cm_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param)
+{
+	struct iwcm_id_private *cm_id_priv;
+	int ret;
+	unsigned long flags;
+	struct ib_qp *qp;
+
+	cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+
+	ret = alloc_work_entries(cm_id_priv, 4);
+	if (ret)
+		return ret;
+
+	set_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+
+	if (cm_id_priv->state != IW_CM_STATE_IDLE) {
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+		clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+		wake_up_all(&cm_id_priv->connect_wait);
+		return -EINVAL;
+	}
+
+	/* Get the ib_qp given the QPN */
+	qp = cm_id->device->iwcm->get_qp(cm_id->device, iw_param->qpn);
+	if (!qp) {
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+		clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+		wake_up_all(&cm_id_priv->connect_wait);
+		return -EINVAL;
+	}
+	cm_id->device->iwcm->add_ref(qp);
+	cm_id_priv->qp = qp;
+	cm_id_priv->state = IW_CM_STATE_CONN_SENT;
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+	ret = cm_id->device->iwcm->connect(cm_id, iw_param);
+	if (ret) {
+		spin_lock_irqsave(&cm_id_priv->lock, flags);
+		if (cm_id_priv->qp) {
+			cm_id->device->iwcm->rem_ref(qp);
+			cm_id_priv->qp = NULL;
+		}
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+		BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_SENT);
+		cm_id_priv->state = IW_CM_STATE_IDLE;
+		clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+		wake_up_all(&cm_id_priv->connect_wait);
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(iw_cm_connect);
+
+/*
+ * Passive Side: new CM_ID <-- CONN_RECV
+ *
+ * Handles an inbound connect request. The function creates a new
+ * iw_cm_id to represent the new connection and inherits the client
+ * callback function and other attributes from the listening parent.
+ *
+ * The work item contains a pointer to the listen_cm_id and the event. The
+ * listen_cm_id contains the client cm_handler, context and
+ * device. These are copied when the device is cloned. The event
+ * contains the new four tuple.
+ *
+ * An error on the child should not affect the parent, so this
+ * function does not return a value.
+ */
+static void cm_conn_req_handler(struct iwcm_id_private *listen_id_priv,
+				struct iw_cm_event *iw_event)
+{
+	unsigned long flags;
+	struct iw_cm_id *cm_id;
+	struct iwcm_id_private *cm_id_priv;
+	int ret;
+
+	/*
+	 * The provider should never generate a connection request
+	 * event with a bad status.
+	 */
+	BUG_ON(iw_event->status);
+
+	cm_id = iw_create_cm_id(listen_id_priv->id.device,
+				listen_id_priv->id.cm_handler,
+				listen_id_priv->id.context);
+	/* If the cm_id could not be created, ignore the request */
+	if (IS_ERR(cm_id))
+		goto out;
+
+	cm_id->provider_data = iw_event->provider_data;
+	cm_id->local_addr = iw_event->local_addr;
+	cm_id->remote_addr = iw_event->remote_addr;
+
+	cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+	cm_id_priv->state = IW_CM_STATE_CONN_RECV;
+
+	/*
+	 * We could be destroying the listening id. If so, ignore this
+	 * upcall.
+	 */
+	spin_lock_irqsave(&listen_id_priv->lock, flags);
+	if (listen_id_priv->state != IW_CM_STATE_LISTEN) {
+		spin_unlock_irqrestore(&listen_id_priv->lock, flags);
+		iw_cm_reject(cm_id, NULL, 0);
+		iw_destroy_cm_id(cm_id);
+		goto out;
+	}
+	spin_unlock_irqrestore(&listen_id_priv->lock, flags);
+
+	ret = alloc_work_entries(cm_id_priv, 3);
+	if (ret) {
+		iw_cm_reject(cm_id, NULL, 0);
+		iw_destroy_cm_id(cm_id);
+		goto out;
+	}
+
+	/* Call the client CM handler */
+	ret = cm_id->cm_handler(cm_id, iw_event);
+	if (ret) {
+		iw_cm_reject(cm_id, NULL, 0);
+		set_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags);
+		destroy_cm_id(cm_id);
+		if (atomic_read(&cm_id_priv->refcount)==0)
+			free_cm_id(cm_id_priv);
+	}
+
+out:
+	if (iw_event->private_data_len)
+		kfree(iw_event->private_data);
+}
+
+/*
+ * Passive Side: CM_ID <-- ESTABLISHED
+ *
+ * The provider generated an ESTABLISHED event which means that
+ * the MPA negotion has completed successfully and we are now in MPA
+ * FPDU mode.
+ *
+ * This event can only be received in the CONN_RECV state. If the
+ * remote peer closed, the ESTABLISHED event would be received followed
+ * by the CLOSE event. If the app closes, it will block until we wake
+ * it up after processing this event.
+ */
+static int cm_conn_est_handler(struct iwcm_id_private *cm_id_priv,
+			       struct iw_cm_event *iw_event)
+{
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+
+	/*
+	 * We clear the CONNECT_WAIT bit here to allow the callback
+	 * function to call iw_cm_disconnect. Calling iw_destroy_cm_id
+	 * from a callback handler is not allowed.
+	 */
+	clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+	BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_RECV);
+	cm_id_priv->state = IW_CM_STATE_ESTABLISHED;
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, iw_event);
+	wake_up_all(&cm_id_priv->connect_wait);
+
+	return ret;
+}
+
+/*
+ * Active Side: CM_ID <-- ESTABLISHED
+ *
+ * The app has called connect and is waiting for the established event to
+ * post it's requests to the server. This event will wake up anyone
+ * blocked in iw_cm_disconnect or iw_destroy_id.
+ */
+static int cm_conn_rep_handler(struct iwcm_id_private *cm_id_priv,
+			       struct iw_cm_event *iw_event)
+{
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	/*
+	 * Clear the connect wait bit so a callback function calling
+	 * iw_cm_disconnect will not wait and deadlock this thread
+	 */
+	clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+	BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_SENT);
+	if (iw_event->status == 0) {
+		cm_id_priv->id.local_addr = iw_event->local_addr;
+		cm_id_priv->id.remote_addr = iw_event->remote_addr;
+		cm_id_priv->state = IW_CM_STATE_ESTABLISHED;
+	} else {
+		/* REJECTED or RESET */
+		cm_id_priv->id.device->iwcm->rem_ref(cm_id_priv->qp);
+		cm_id_priv->qp = NULL;
+		cm_id_priv->state = IW_CM_STATE_IDLE;
+	}
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, iw_event);
+
+	if (iw_event->private_data_len)
+		kfree(iw_event->private_data);
+
+	/* Wake up waiters on connect complete */
+	wake_up_all(&cm_id_priv->connect_wait);
+
+	return ret;
+}
+
+/*
+ * CM_ID <-- CLOSING
+ *
+ * If in the ESTABLISHED state, move to CLOSING.
+ */
+static void cm_disconnect_handler(struct iwcm_id_private *cm_id_priv,
+				  struct iw_cm_event *iw_event)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	if (cm_id_priv->state == IW_CM_STATE_ESTABLISHED)
+		cm_id_priv->state = IW_CM_STATE_CLOSING;
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+}
+
+/*
+ * CM_ID <-- IDLE
+ *
+ * If in the ESTBLISHED or CLOSING states, the QP will have have been
+ * moved by the provider to the ERR state. Disassociate the CM_ID from
+ * the QP,  move to IDLE, and remove the 'connected' reference.
+ *
+ * If in some other state, the cm_id was destroyed asynchronously.
+ * This is the last reference that will result in waking up
+ * the app thread blocked in iw_destroy_cm_id.
+ */
+static int cm_close_handler(struct iwcm_id_private *cm_id_priv,
+				  struct iw_cm_event *iw_event)
+{
+	unsigned long flags;
+	int ret = 0;
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+
+	if (cm_id_priv->qp) {
+		cm_id_priv->id.device->iwcm->rem_ref(cm_id_priv->qp);
+		cm_id_priv->qp = NULL;
+	}
+	switch (cm_id_priv->state) {
+	case IW_CM_STATE_ESTABLISHED:
+	case IW_CM_STATE_CLOSING:
+		cm_id_priv->state = IW_CM_STATE_IDLE;
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+		ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, iw_event);
+		spin_lock_irqsave(&cm_id_priv->lock, flags);
+		break;
+	case IW_CM_STATE_DESTROYING:
+		break;
+	default:
+		BUG();
+	}
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+	return ret;
+}
+
+static int process_event(struct iwcm_id_private *cm_id_priv,
+			 struct iw_cm_event *iw_event)
+{
+	int ret = 0;
+
+	switch (iw_event->event) {
+	case IW_CM_EVENT_CONNECT_REQUEST:
+		cm_conn_req_handler(cm_id_priv, iw_event);
+		break;
+	case IW_CM_EVENT_CONNECT_REPLY:
+		ret = cm_conn_rep_handler(cm_id_priv, iw_event);
+		break;
+	case IW_CM_EVENT_ESTABLISHED:
+		ret = cm_conn_est_handler(cm_id_priv, iw_event);
+		break;
+	case IW_CM_EVENT_DISCONNECT:
+		cm_disconnect_handler(cm_id_priv, iw_event);
+		break;
+	case IW_CM_EVENT_CLOSE:
+		ret = cm_close_handler(cm_id_priv, iw_event);
+		break;
+	default:
+		BUG();
+	}
+
+	return ret;
+}
+
+/*
+ * Process events on the work_list for the cm_id. If the callback
+ * function requests that the cm_id be deleted, a flag is set in the
+ * cm_id flags to indicate that when the last reference is
+ * removed, the cm_id is to be destroyed. This is necessary to
+ * distinguish between an object that will be destroyed by the app
+ * thread asleep on the destroy_comp list vs. an object destroyed
+ * here synchronously when the last reference is removed.
+ */
+static void cm_work_handler(struct work_struct *_work)
+{
+	struct iwcm_work *work = container_of(_work, struct iwcm_work, work);
+	struct iw_cm_event levent;
+	struct iwcm_id_private *cm_id_priv = work->cm_id;
+	unsigned long flags;
+	int empty;
+	int ret = 0;
+	int destroy_id;
+
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	empty = list_empty(&cm_id_priv->work_list);
+	while (!empty) {
+		work = list_entry(cm_id_priv->work_list.next,
+				  struct iwcm_work, list);
+		list_del_init(&work->list);
+		empty = list_empty(&cm_id_priv->work_list);
+		levent = work->event;
+		put_work(work);
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+		ret = process_event(cm_id_priv, &levent);
+		if (ret) {
+			set_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags);
+			destroy_cm_id(&cm_id_priv->id);
+		}
+		BUG_ON(atomic_read(&cm_id_priv->refcount)==0);
+		destroy_id = test_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags);
+		if (iwcm_deref_id(cm_id_priv)) {
+			if (destroy_id) {
+				BUG_ON(!list_empty(&cm_id_priv->work_list));
+				free_cm_id(cm_id_priv);
+			}
+			return;
+		}
+		if (empty)
+			return;
+		spin_lock_irqsave(&cm_id_priv->lock, flags);
+	}
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+}
+
+/*
+ * This function is called on interrupt context. Schedule events on
+ * the iwcm_wq thread to allow callback functions to downcall into
+ * the CM and/or block.  Events are queued to a per-CM_ID
+ * work_list. If this is the first event on the work_list, the work
+ * element is also queued on the iwcm_wq thread.
+ *
+ * Each event holds a reference on the cm_id. Until the last posted
+ * event has been delivered and processed, the cm_id cannot be
+ * deleted.
+ *
+ * Returns:
+ * 	      0	- the event was handled.
+ *	-ENOMEM	- the event was not handled due to lack of resources.
+ */
+static int cm_event_handler(struct iw_cm_id *cm_id,
+			     struct iw_cm_event *iw_event)
+{
+	struct iwcm_work *work;
+	struct iwcm_id_private *cm_id_priv;
+	unsigned long flags;
+	int ret = 0;
+
+	cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	work = get_work(cm_id_priv);
+	if (!work) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	INIT_WORK(&work->work, cm_work_handler);
+	work->cm_id = cm_id_priv;
+	work->event = *iw_event;
+
+	if ((work->event.event == IW_CM_EVENT_CONNECT_REQUEST ||
+	     work->event.event == IW_CM_EVENT_CONNECT_REPLY) &&
+	    work->event.private_data_len) {
+		ret = copy_private_data(&work->event);
+		if (ret) {
+			put_work(work);
+			goto out;
+		}
+	}
+
+	atomic_inc(&cm_id_priv->refcount);
+	if (list_empty(&cm_id_priv->work_list)) {
+		list_add_tail(&work->list, &cm_id_priv->work_list);
+		queue_work(iwcm_wq, &work->work);
+	} else
+		list_add_tail(&work->list, &cm_id_priv->work_list);
+out:
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	return ret;
+}
+
+static int iwcm_init_qp_init_attr(struct iwcm_id_private *cm_id_priv,
+				  struct ib_qp_attr *qp_attr,
+				  int *qp_attr_mask)
+{
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	switch (cm_id_priv->state) {
+	case IW_CM_STATE_IDLE:
+	case IW_CM_STATE_CONN_SENT:
+	case IW_CM_STATE_CONN_RECV:
+	case IW_CM_STATE_ESTABLISHED:
+		*qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS;
+		qp_attr->qp_access_flags = IB_ACCESS_REMOTE_WRITE|
+					   IB_ACCESS_REMOTE_READ;
+		ret = 0;
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	return ret;
+}
+
+static int iwcm_init_qp_rts_attr(struct iwcm_id_private *cm_id_priv,
+				  struct ib_qp_attr *qp_attr,
+				  int *qp_attr_mask)
+{
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	switch (cm_id_priv->state) {
+	case IW_CM_STATE_IDLE:
+	case IW_CM_STATE_CONN_SENT:
+	case IW_CM_STATE_CONN_RECV:
+	case IW_CM_STATE_ESTABLISHED:
+		*qp_attr_mask = 0;
+		ret = 0;
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	return ret;
+}
+
+int iw_cm_init_qp_attr(struct iw_cm_id *cm_id,
+		       struct ib_qp_attr *qp_attr,
+		       int *qp_attr_mask)
+{
+	struct iwcm_id_private *cm_id_priv;
+	int ret;
+
+	cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+	switch (qp_attr->qp_state) {
+	case IB_QPS_INIT:
+	case IB_QPS_RTR:
+		ret = iwcm_init_qp_init_attr(cm_id_priv,
+					     qp_attr, qp_attr_mask);
+		break;
+	case IB_QPS_RTS:
+		ret = iwcm_init_qp_rts_attr(cm_id_priv,
+					    qp_attr, qp_attr_mask);
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+	return ret;
+}
+EXPORT_SYMBOL(iw_cm_init_qp_attr);
+
+static int __init iw_cm_init(void)
+{
+	iwcm_wq = create_singlethread_workqueue("iw_cm_wq");
+	if (!iwcm_wq)
+		return -ENOMEM;
+
+	iwcm_ctl_table_hdr = register_net_sysctl(&init_net, "net/iw_cm",
+						 iwcm_ctl_table);
+	if (!iwcm_ctl_table_hdr) {
+		pr_err("iw_cm: couldn't register sysctl paths\n");
+		destroy_workqueue(iwcm_wq);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static void __exit iw_cm_cleanup(void)
+{
+	unregister_net_sysctl_table(iwcm_ctl_table_hdr);
+	destroy_workqueue(iwcm_wq);
+}
+
+module_init(iw_cm_init);
+module_exit(iw_cm_cleanup);
diff --git a/drivers/infiniband/core/iwcm.h b/drivers/infiniband/core/iwcm.h
new file mode 100644
index 0000000..3f6cc82
--- /dev/null
+++ b/drivers/infiniband/core/iwcm.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2005 Network Appliance, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef IWCM_H
+#define IWCM_H
+
+enum iw_cm_state {
+	IW_CM_STATE_IDLE,             /* unbound, inactive */
+	IW_CM_STATE_LISTEN,           /* listen waiting for connect */
+	IW_CM_STATE_CONN_RECV,        /* inbound waiting for user accept */
+	IW_CM_STATE_CONN_SENT,        /* outbound waiting for peer accept */
+	IW_CM_STATE_ESTABLISHED,      /* established */
+	IW_CM_STATE_CLOSING,	      /* disconnect */
+	IW_CM_STATE_DESTROYING        /* object being deleted */
+};
+
+struct iwcm_id_private {
+	struct iw_cm_id	id;
+	enum iw_cm_state state;
+	unsigned long flags;
+	struct ib_qp *qp;
+	struct completion destroy_comp;
+	wait_queue_head_t connect_wait;
+	struct list_head work_list;
+	spinlock_t lock;
+	atomic_t refcount;
+	struct list_head work_free_list;
+};
+
+#define IWCM_F_CALLBACK_DESTROY   1
+#define IWCM_F_CONNECT_WAIT       2
+
+#endif /* IWCM_H */
diff --git a/drivers/infiniband/core/iwpm_msg.c b/drivers/infiniband/core/iwpm_msg.c
new file mode 100644
index 0000000..b85ddbc
--- /dev/null
+++ b/drivers/infiniband/core/iwpm_msg.c
@@ -0,0 +1,685 @@
+/*
+ * Copyright (c) 2014 Intel Corporation. All rights reserved.
+ * Copyright (c) 2014 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "iwpm_util.h"
+
+static const char iwpm_ulib_name[] = "iWarpPortMapperUser";
+static int iwpm_ulib_version = 3;
+static int iwpm_user_pid = IWPM_PID_UNDEFINED;
+static atomic_t echo_nlmsg_seq;
+
+int iwpm_valid_pid(void)
+{
+	return iwpm_user_pid > 0;
+}
+EXPORT_SYMBOL(iwpm_valid_pid);
+
+/*
+ * iwpm_register_pid - Send a netlink query to user space
+ *                     for the iwarp port mapper pid
+ *
+ * nlmsg attributes:
+ *	[IWPM_NLA_REG_PID_SEQ]
+ *	[IWPM_NLA_REG_IF_NAME]
+ *	[IWPM_NLA_REG_IBDEV_NAME]
+ *	[IWPM_NLA_REG_ULIB_NAME]
+ */
+int iwpm_register_pid(struct iwpm_dev_data *pm_msg, u8 nl_client)
+{
+	struct sk_buff *skb = NULL;
+	struct iwpm_nlmsg_request *nlmsg_request = NULL;
+	struct nlmsghdr *nlh;
+	u32 msg_seq;
+	const char *err_str = "";
+	int ret = -EINVAL;
+
+	if (!iwpm_valid_client(nl_client)) {
+		err_str = "Invalid port mapper client";
+		goto pid_query_error;
+	}
+	if (iwpm_registered_client(nl_client))
+		return 0;
+	skb = iwpm_create_nlmsg(RDMA_NL_IWPM_REG_PID, &nlh, nl_client);
+	if (!skb) {
+		err_str = "Unable to create a nlmsg";
+		goto pid_query_error;
+	}
+	nlh->nlmsg_seq = iwpm_get_nlmsg_seq();
+	nlmsg_request = iwpm_get_nlmsg_request(nlh->nlmsg_seq, nl_client, GFP_KERNEL);
+	if (!nlmsg_request) {
+		err_str = "Unable to allocate netlink request";
+		goto pid_query_error;
+	}
+	msg_seq = atomic_read(&echo_nlmsg_seq);
+
+	/* fill in the pid request message */
+	err_str = "Unable to put attribute of the nlmsg";
+	ret = ibnl_put_attr(skb, nlh, sizeof(u32), &msg_seq, IWPM_NLA_REG_PID_SEQ);
+	if (ret)
+		goto pid_query_error;
+	ret = ibnl_put_attr(skb, nlh, IWPM_IFNAME_SIZE,
+				pm_msg->if_name, IWPM_NLA_REG_IF_NAME);
+	if (ret)
+		goto pid_query_error;
+	ret = ibnl_put_attr(skb, nlh, IWPM_DEVNAME_SIZE,
+				pm_msg->dev_name, IWPM_NLA_REG_IBDEV_NAME);
+	if (ret)
+		goto pid_query_error;
+	ret = ibnl_put_attr(skb, nlh, IWPM_ULIBNAME_SIZE,
+				(char *)iwpm_ulib_name, IWPM_NLA_REG_ULIB_NAME);
+	if (ret)
+		goto pid_query_error;
+
+	pr_debug("%s: Multicasting a nlmsg (dev = %s ifname = %s iwpm = %s)\n",
+		__func__, pm_msg->dev_name, pm_msg->if_name, iwpm_ulib_name);
+
+	ret = ibnl_multicast(skb, nlh, RDMA_NL_GROUP_IWPM, GFP_KERNEL);
+	if (ret) {
+		skb = NULL; /* skb is freed in the netlink send-op handling */
+		iwpm_set_registered(nl_client, 1);
+		iwpm_user_pid = IWPM_PID_UNAVAILABLE;
+		err_str = "Unable to send a nlmsg";
+		goto pid_query_error;
+	}
+	nlmsg_request->req_buffer = pm_msg;
+	ret = iwpm_wait_complete_req(nlmsg_request);
+	return ret;
+pid_query_error:
+	pr_info("%s: %s (client = %d)\n", __func__, err_str, nl_client);
+	if (skb)
+		dev_kfree_skb(skb);
+	if (nlmsg_request)
+		iwpm_free_nlmsg_request(&nlmsg_request->kref);
+	return ret;
+}
+EXPORT_SYMBOL(iwpm_register_pid);
+
+/*
+ * iwpm_add_mapping - Send a netlink add mapping message
+ *                    to the port mapper
+ * nlmsg attributes:
+ *	[IWPM_NLA_MANAGE_MAPPING_SEQ]
+ *	[IWPM_NLA_MANAGE_ADDR]
+ */
+int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client)
+{
+	struct sk_buff *skb = NULL;
+	struct iwpm_nlmsg_request *nlmsg_request = NULL;
+	struct nlmsghdr *nlh;
+	u32 msg_seq;
+	const char *err_str = "";
+	int ret = -EINVAL;
+
+	if (!iwpm_valid_client(nl_client)) {
+		err_str = "Invalid port mapper client";
+		goto add_mapping_error;
+	}
+	if (!iwpm_registered_client(nl_client)) {
+		err_str = "Unregistered port mapper client";
+		goto add_mapping_error;
+	}
+	if (!iwpm_valid_pid())
+		return 0;
+	skb = iwpm_create_nlmsg(RDMA_NL_IWPM_ADD_MAPPING, &nlh, nl_client);
+	if (!skb) {
+		err_str = "Unable to create a nlmsg";
+		goto add_mapping_error;
+	}
+	nlh->nlmsg_seq = iwpm_get_nlmsg_seq();
+	nlmsg_request = iwpm_get_nlmsg_request(nlh->nlmsg_seq, nl_client, GFP_KERNEL);
+	if (!nlmsg_request) {
+		err_str = "Unable to allocate netlink request";
+		goto add_mapping_error;
+	}
+	msg_seq = atomic_read(&echo_nlmsg_seq);
+	/* fill in the add mapping message */
+	err_str = "Unable to put attribute of the nlmsg";
+	ret = ibnl_put_attr(skb, nlh, sizeof(u32), &msg_seq,
+				IWPM_NLA_MANAGE_MAPPING_SEQ);
+	if (ret)
+		goto add_mapping_error;
+	ret = ibnl_put_attr(skb, nlh, sizeof(struct sockaddr_storage),
+				&pm_msg->loc_addr, IWPM_NLA_MANAGE_ADDR);
+	if (ret)
+		goto add_mapping_error;
+	nlmsg_request->req_buffer = pm_msg;
+
+	ret = ibnl_unicast(skb, nlh, iwpm_user_pid);
+	if (ret) {
+		skb = NULL; /* skb is freed in the netlink send-op handling */
+		iwpm_user_pid = IWPM_PID_UNDEFINED;
+		err_str = "Unable to send a nlmsg";
+		goto add_mapping_error;
+	}
+	ret = iwpm_wait_complete_req(nlmsg_request);
+	return ret;
+add_mapping_error:
+	pr_info("%s: %s (client = %d)\n", __func__, err_str, nl_client);
+	if (skb)
+		dev_kfree_skb(skb);
+	if (nlmsg_request)
+		iwpm_free_nlmsg_request(&nlmsg_request->kref);
+	return ret;
+}
+EXPORT_SYMBOL(iwpm_add_mapping);
+
+/*
+ * iwpm_add_and_query_mapping - Send a netlink add and query
+ *                              mapping message to the port mapper
+ * nlmsg attributes:
+ *	[IWPM_NLA_QUERY_MAPPING_SEQ]
+ *	[IWPM_NLA_QUERY_LOCAL_ADDR]
+ *	[IWPM_NLA_QUERY_REMOTE_ADDR]
+ */
+int iwpm_add_and_query_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client)
+{
+	struct sk_buff *skb = NULL;
+	struct iwpm_nlmsg_request *nlmsg_request = NULL;
+	struct nlmsghdr *nlh;
+	u32 msg_seq;
+	const char *err_str = "";
+	int ret = -EINVAL;
+
+	if (!iwpm_valid_client(nl_client)) {
+		err_str = "Invalid port mapper client";
+		goto query_mapping_error;
+	}
+	if (!iwpm_registered_client(nl_client)) {
+		err_str = "Unregistered port mapper client";
+		goto query_mapping_error;
+	}
+	if (!iwpm_valid_pid())
+		return 0;
+	ret = -ENOMEM;
+	skb = iwpm_create_nlmsg(RDMA_NL_IWPM_QUERY_MAPPING, &nlh, nl_client);
+	if (!skb) {
+		err_str = "Unable to create a nlmsg";
+		goto query_mapping_error;
+	}
+	nlh->nlmsg_seq = iwpm_get_nlmsg_seq();
+	nlmsg_request = iwpm_get_nlmsg_request(nlh->nlmsg_seq,
+				nl_client, GFP_KERNEL);
+	if (!nlmsg_request) {
+		err_str = "Unable to allocate netlink request";
+		goto query_mapping_error;
+	}
+	msg_seq = atomic_read(&echo_nlmsg_seq);
+
+	/* fill in the query message */
+	err_str = "Unable to put attribute of the nlmsg";
+	ret = ibnl_put_attr(skb, nlh, sizeof(u32), &msg_seq,
+				IWPM_NLA_QUERY_MAPPING_SEQ);
+	if (ret)
+		goto query_mapping_error;
+	ret = ibnl_put_attr(skb, nlh, sizeof(struct sockaddr_storage),
+				&pm_msg->loc_addr, IWPM_NLA_QUERY_LOCAL_ADDR);
+	if (ret)
+		goto query_mapping_error;
+	ret = ibnl_put_attr(skb, nlh, sizeof(struct sockaddr_storage),
+				&pm_msg->rem_addr, IWPM_NLA_QUERY_REMOTE_ADDR);
+	if (ret)
+		goto query_mapping_error;
+	nlmsg_request->req_buffer = pm_msg;
+
+	ret = ibnl_unicast(skb, nlh, iwpm_user_pid);
+	if (ret) {
+		skb = NULL; /* skb is freed in the netlink send-op handling */
+		err_str = "Unable to send a nlmsg";
+		goto query_mapping_error;
+	}
+	ret = iwpm_wait_complete_req(nlmsg_request);
+	return ret;
+query_mapping_error:
+	pr_info("%s: %s (client = %d)\n", __func__, err_str, nl_client);
+	if (skb)
+		dev_kfree_skb(skb);
+	if (nlmsg_request)
+		iwpm_free_nlmsg_request(&nlmsg_request->kref);
+	return ret;
+}
+EXPORT_SYMBOL(iwpm_add_and_query_mapping);
+
+/*
+ * iwpm_remove_mapping - Send a netlink remove mapping message
+ *                       to the port mapper
+ * nlmsg attributes:
+ *	[IWPM_NLA_MANAGE_MAPPING_SEQ]
+ *	[IWPM_NLA_MANAGE_ADDR]
+ */
+int iwpm_remove_mapping(struct sockaddr_storage *local_addr, u8 nl_client)
+{
+	struct sk_buff *skb = NULL;
+	struct nlmsghdr *nlh;
+	u32 msg_seq;
+	const char *err_str = "";
+	int ret = -EINVAL;
+
+	if (!iwpm_valid_client(nl_client)) {
+		err_str = "Invalid port mapper client";
+		goto remove_mapping_error;
+	}
+	if (!iwpm_registered_client(nl_client)) {
+		err_str = "Unregistered port mapper client";
+		goto remove_mapping_error;
+	}
+	if (!iwpm_valid_pid())
+		return 0;
+	skb = iwpm_create_nlmsg(RDMA_NL_IWPM_REMOVE_MAPPING, &nlh, nl_client);
+	if (!skb) {
+		ret = -ENOMEM;
+		err_str = "Unable to create a nlmsg";
+		goto remove_mapping_error;
+	}
+	msg_seq = atomic_read(&echo_nlmsg_seq);
+	nlh->nlmsg_seq = iwpm_get_nlmsg_seq();
+	err_str = "Unable to put attribute of the nlmsg";
+	ret = ibnl_put_attr(skb, nlh, sizeof(u32), &msg_seq,
+				IWPM_NLA_MANAGE_MAPPING_SEQ);
+	if (ret)
+		goto remove_mapping_error;
+	ret = ibnl_put_attr(skb, nlh, sizeof(struct sockaddr_storage),
+				local_addr, IWPM_NLA_MANAGE_ADDR);
+	if (ret)
+		goto remove_mapping_error;
+
+	ret = ibnl_unicast(skb, nlh, iwpm_user_pid);
+	if (ret) {
+		skb = NULL; /* skb is freed in the netlink send-op handling */
+		iwpm_user_pid = IWPM_PID_UNDEFINED;
+		err_str = "Unable to send a nlmsg";
+		goto remove_mapping_error;
+	}
+	iwpm_print_sockaddr(local_addr,
+			"remove_mapping: Local sockaddr:");
+	return 0;
+remove_mapping_error:
+	pr_info("%s: %s (client = %d)\n", __func__, err_str, nl_client);
+	if (skb)
+		dev_kfree_skb_any(skb);
+	return ret;
+}
+EXPORT_SYMBOL(iwpm_remove_mapping);
+
+/* netlink attribute policy for the received response to register pid request */
+static const struct nla_policy resp_reg_policy[IWPM_NLA_RREG_PID_MAX] = {
+	[IWPM_NLA_RREG_PID_SEQ]     = { .type = NLA_U32 },
+	[IWPM_NLA_RREG_IBDEV_NAME]  = { .type = NLA_STRING,
+					.len = IWPM_DEVNAME_SIZE - 1 },
+	[IWPM_NLA_RREG_ULIB_NAME]   = { .type = NLA_STRING,
+					.len = IWPM_ULIBNAME_SIZE - 1 },
+	[IWPM_NLA_RREG_ULIB_VER]    = { .type = NLA_U16 },
+	[IWPM_NLA_RREG_PID_ERR]     = { .type = NLA_U16 }
+};
+
+/*
+ * iwpm_register_pid_cb - Process a port mapper response to
+ *                        iwpm_register_pid()
+ */
+int iwpm_register_pid_cb(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct iwpm_nlmsg_request *nlmsg_request = NULL;
+	struct nlattr *nltb[IWPM_NLA_RREG_PID_MAX];
+	struct iwpm_dev_data *pm_msg;
+	char *dev_name, *iwpm_name;
+	u32 msg_seq;
+	u8 nl_client;
+	u16 iwpm_version;
+	const char *msg_type = "Register Pid response";
+
+	if (iwpm_parse_nlmsg(cb, IWPM_NLA_RREG_PID_MAX,
+				resp_reg_policy, nltb, msg_type))
+		return -EINVAL;
+
+	msg_seq = nla_get_u32(nltb[IWPM_NLA_RREG_PID_SEQ]);
+	nlmsg_request = iwpm_find_nlmsg_request(msg_seq);
+	if (!nlmsg_request) {
+		pr_info("%s: Could not find a matching request (seq = %u)\n",
+				 __func__, msg_seq);
+		return -EINVAL;
+	}
+	pm_msg = nlmsg_request->req_buffer;
+	nl_client = nlmsg_request->nl_client;
+	dev_name = (char *)nla_data(nltb[IWPM_NLA_RREG_IBDEV_NAME]);
+	iwpm_name = (char *)nla_data(nltb[IWPM_NLA_RREG_ULIB_NAME]);
+	iwpm_version = nla_get_u16(nltb[IWPM_NLA_RREG_ULIB_VER]);
+
+	/* check device name, ulib name and version */
+	if (strcmp(pm_msg->dev_name, dev_name) ||
+			strcmp(iwpm_ulib_name, iwpm_name) ||
+			iwpm_version != iwpm_ulib_version) {
+
+		pr_info("%s: Incorrect info (dev = %s name = %s version = %d)\n",
+				__func__, dev_name, iwpm_name, iwpm_version);
+		nlmsg_request->err_code = IWPM_USER_LIB_INFO_ERR;
+		goto register_pid_response_exit;
+	}
+	iwpm_user_pid = cb->nlh->nlmsg_pid;
+	atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq);
+	pr_debug("%s: iWarp Port Mapper (pid = %d) is available!\n",
+			__func__, iwpm_user_pid);
+	if (iwpm_valid_client(nl_client))
+		iwpm_set_registered(nl_client, 1);
+register_pid_response_exit:
+	nlmsg_request->request_done = 1;
+	/* always for found nlmsg_request */
+	kref_put(&nlmsg_request->kref, iwpm_free_nlmsg_request);
+	barrier();
+	wake_up(&nlmsg_request->waitq);
+	return 0;
+}
+EXPORT_SYMBOL(iwpm_register_pid_cb);
+
+/* netlink attribute policy for the received response to add mapping request */
+static const struct nla_policy resp_add_policy[IWPM_NLA_RMANAGE_MAPPING_MAX] = {
+	[IWPM_NLA_MANAGE_MAPPING_SEQ]     = { .type = NLA_U32 },
+	[IWPM_NLA_MANAGE_ADDR]            = { .len = sizeof(struct sockaddr_storage) },
+	[IWPM_NLA_MANAGE_MAPPED_LOC_ADDR] = { .len = sizeof(struct sockaddr_storage) },
+	[IWPM_NLA_RMANAGE_MAPPING_ERR]	  = { .type = NLA_U16 }
+};
+
+/*
+ * iwpm_add_mapping_cb - Process a port mapper response to
+ *                       iwpm_add_mapping()
+ */
+int iwpm_add_mapping_cb(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct iwpm_sa_data *pm_msg;
+	struct iwpm_nlmsg_request *nlmsg_request = NULL;
+	struct nlattr *nltb[IWPM_NLA_RMANAGE_MAPPING_MAX];
+	struct sockaddr_storage *local_sockaddr;
+	struct sockaddr_storage *mapped_sockaddr;
+	const char *msg_type;
+	u32 msg_seq;
+
+	msg_type = "Add Mapping response";
+	if (iwpm_parse_nlmsg(cb, IWPM_NLA_RMANAGE_MAPPING_MAX,
+				resp_add_policy, nltb, msg_type))
+		return -EINVAL;
+
+	atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq);
+
+	msg_seq = nla_get_u32(nltb[IWPM_NLA_MANAGE_MAPPING_SEQ]);
+	nlmsg_request = iwpm_find_nlmsg_request(msg_seq);
+	if (!nlmsg_request) {
+		pr_info("%s: Could not find a matching request (seq = %u)\n",
+				 __func__, msg_seq);
+		return -EINVAL;
+	}
+	pm_msg = nlmsg_request->req_buffer;
+	local_sockaddr = (struct sockaddr_storage *)
+			nla_data(nltb[IWPM_NLA_MANAGE_ADDR]);
+	mapped_sockaddr = (struct sockaddr_storage *)
+			nla_data(nltb[IWPM_NLA_MANAGE_MAPPED_LOC_ADDR]);
+
+	if (iwpm_compare_sockaddr(local_sockaddr, &pm_msg->loc_addr)) {
+		nlmsg_request->err_code = IWPM_USER_LIB_INFO_ERR;
+		goto add_mapping_response_exit;
+	}
+	if (mapped_sockaddr->ss_family != local_sockaddr->ss_family) {
+		pr_info("%s: Sockaddr family doesn't match the requested one\n",
+				__func__);
+		nlmsg_request->err_code = IWPM_USER_LIB_INFO_ERR;
+		goto add_mapping_response_exit;
+	}
+	memcpy(&pm_msg->mapped_loc_addr, mapped_sockaddr,
+			sizeof(*mapped_sockaddr));
+	iwpm_print_sockaddr(&pm_msg->loc_addr,
+			"add_mapping: Local sockaddr:");
+	iwpm_print_sockaddr(&pm_msg->mapped_loc_addr,
+			"add_mapping: Mapped local sockaddr:");
+
+add_mapping_response_exit:
+	nlmsg_request->request_done = 1;
+	/* always for found request */
+	kref_put(&nlmsg_request->kref, iwpm_free_nlmsg_request);
+	barrier();
+	wake_up(&nlmsg_request->waitq);
+	return 0;
+}
+EXPORT_SYMBOL(iwpm_add_mapping_cb);
+
+/* netlink attribute policy for the response to add and query mapping request */
+static const struct nla_policy resp_query_policy[IWPM_NLA_RQUERY_MAPPING_MAX] = {
+	[IWPM_NLA_QUERY_MAPPING_SEQ]      = { .type = NLA_U32 },
+	[IWPM_NLA_QUERY_LOCAL_ADDR]       = { .len = sizeof(struct sockaddr_storage) },
+	[IWPM_NLA_QUERY_REMOTE_ADDR]      = { .len = sizeof(struct sockaddr_storage) },
+	[IWPM_NLA_RQUERY_MAPPED_LOC_ADDR] = { .len = sizeof(struct sockaddr_storage) },
+	[IWPM_NLA_RQUERY_MAPPED_REM_ADDR] = { .len = sizeof(struct sockaddr_storage) },
+	[IWPM_NLA_RQUERY_MAPPING_ERR]	  = { .type = NLA_U16 }
+};
+
+/*
+ * iwpm_add_and_query_mapping_cb - Process a port mapper response to
+ *                                 iwpm_add_and_query_mapping()
+ */
+int iwpm_add_and_query_mapping_cb(struct sk_buff *skb,
+				struct netlink_callback *cb)
+{
+	struct iwpm_sa_data *pm_msg;
+	struct iwpm_nlmsg_request *nlmsg_request = NULL;
+	struct nlattr *nltb[IWPM_NLA_RQUERY_MAPPING_MAX];
+	struct sockaddr_storage *local_sockaddr, *remote_sockaddr;
+	struct sockaddr_storage *mapped_loc_sockaddr, *mapped_rem_sockaddr;
+	const char *msg_type;
+	u32 msg_seq;
+	u16 err_code;
+
+	msg_type = "Query Mapping response";
+	if (iwpm_parse_nlmsg(cb, IWPM_NLA_RQUERY_MAPPING_MAX,
+				resp_query_policy, nltb, msg_type))
+		return -EINVAL;
+	atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq);
+
+	msg_seq = nla_get_u32(nltb[IWPM_NLA_QUERY_MAPPING_SEQ]);
+	nlmsg_request = iwpm_find_nlmsg_request(msg_seq);
+	if (!nlmsg_request) {
+		pr_info("%s: Could not find a matching request (seq = %u)\n",
+				 __func__, msg_seq);
+			return -EINVAL;
+	}
+	pm_msg = nlmsg_request->req_buffer;
+	local_sockaddr = (struct sockaddr_storage *)
+			nla_data(nltb[IWPM_NLA_QUERY_LOCAL_ADDR]);
+	remote_sockaddr = (struct sockaddr_storage *)
+			nla_data(nltb[IWPM_NLA_QUERY_REMOTE_ADDR]);
+	mapped_loc_sockaddr = (struct sockaddr_storage *)
+			nla_data(nltb[IWPM_NLA_RQUERY_MAPPED_LOC_ADDR]);
+	mapped_rem_sockaddr = (struct sockaddr_storage *)
+			nla_data(nltb[IWPM_NLA_RQUERY_MAPPED_REM_ADDR]);
+
+	err_code = nla_get_u16(nltb[IWPM_NLA_RQUERY_MAPPING_ERR]);
+	if (err_code == IWPM_REMOTE_QUERY_REJECT) {
+		pr_info("%s: Received a Reject (pid = %u, echo seq = %u)\n",
+			__func__, cb->nlh->nlmsg_pid, msg_seq);
+		nlmsg_request->err_code = IWPM_REMOTE_QUERY_REJECT;
+	}
+	if (iwpm_compare_sockaddr(local_sockaddr, &pm_msg->loc_addr) ||
+		iwpm_compare_sockaddr(remote_sockaddr, &pm_msg->rem_addr)) {
+		pr_info("%s: Incorrect local sockaddr\n", __func__);
+		nlmsg_request->err_code = IWPM_USER_LIB_INFO_ERR;
+		goto query_mapping_response_exit;
+	}
+	if (mapped_loc_sockaddr->ss_family != local_sockaddr->ss_family ||
+		mapped_rem_sockaddr->ss_family != remote_sockaddr->ss_family) {
+		pr_info("%s: Sockaddr family doesn't match the requested one\n",
+				__func__);
+		nlmsg_request->err_code = IWPM_USER_LIB_INFO_ERR;
+		goto query_mapping_response_exit;
+	}
+	memcpy(&pm_msg->mapped_loc_addr, mapped_loc_sockaddr,
+			sizeof(*mapped_loc_sockaddr));
+	memcpy(&pm_msg->mapped_rem_addr, mapped_rem_sockaddr,
+			sizeof(*mapped_rem_sockaddr));
+
+	iwpm_print_sockaddr(&pm_msg->loc_addr,
+			"query_mapping: Local sockaddr:");
+	iwpm_print_sockaddr(&pm_msg->mapped_loc_addr,
+			"query_mapping: Mapped local sockaddr:");
+	iwpm_print_sockaddr(&pm_msg->rem_addr,
+			"query_mapping: Remote sockaddr:");
+	iwpm_print_sockaddr(&pm_msg->mapped_rem_addr,
+			"query_mapping: Mapped remote sockaddr:");
+query_mapping_response_exit:
+	nlmsg_request->request_done = 1;
+	/* always for found request */
+	kref_put(&nlmsg_request->kref, iwpm_free_nlmsg_request);
+	barrier();
+	wake_up(&nlmsg_request->waitq);
+	return 0;
+}
+EXPORT_SYMBOL(iwpm_add_and_query_mapping_cb);
+
+/* netlink attribute policy for the received request for mapping info */
+static const struct nla_policy resp_mapinfo_policy[IWPM_NLA_MAPINFO_REQ_MAX] = {
+	[IWPM_NLA_MAPINFO_ULIB_NAME] = { .type = NLA_STRING,
+					.len = IWPM_ULIBNAME_SIZE - 1 },
+	[IWPM_NLA_MAPINFO_ULIB_VER]  = { .type = NLA_U16 }
+};
+
+/*
+ * iwpm_mapping_info_cb - Process a port mapper request for mapping info
+ */
+int iwpm_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct nlattr *nltb[IWPM_NLA_MAPINFO_REQ_MAX];
+	const char *msg_type = "Mapping Info response";
+	int iwpm_pid;
+	u8 nl_client;
+	char *iwpm_name;
+	u16 iwpm_version;
+	int ret = -EINVAL;
+
+	if (iwpm_parse_nlmsg(cb, IWPM_NLA_MAPINFO_REQ_MAX,
+				resp_mapinfo_policy, nltb, msg_type)) {
+		pr_info("%s: Unable to parse nlmsg\n", __func__);
+		return ret;
+	}
+	iwpm_name = (char *)nla_data(nltb[IWPM_NLA_MAPINFO_ULIB_NAME]);
+	iwpm_version = nla_get_u16(nltb[IWPM_NLA_MAPINFO_ULIB_VER]);
+	if (strcmp(iwpm_ulib_name, iwpm_name) ||
+			iwpm_version != iwpm_ulib_version) {
+		pr_info("%s: Invalid port mapper name = %s version = %d\n",
+				__func__, iwpm_name, iwpm_version);
+		return ret;
+	}
+	nl_client = RDMA_NL_GET_CLIENT(cb->nlh->nlmsg_type);
+	if (!iwpm_valid_client(nl_client)) {
+		pr_info("%s: Invalid port mapper client = %d\n",
+				__func__, nl_client);
+		return ret;
+	}
+	iwpm_set_registered(nl_client, 0);
+	atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq);
+	if (!iwpm_mapinfo_available())
+		return 0;
+	iwpm_pid = cb->nlh->nlmsg_pid;
+	pr_debug("%s: iWarp Port Mapper (pid = %d) is available!\n",
+		 __func__, iwpm_pid);
+	ret = iwpm_send_mapinfo(nl_client, iwpm_pid);
+	return ret;
+}
+EXPORT_SYMBOL(iwpm_mapping_info_cb);
+
+/* netlink attribute policy for the received mapping info ack */
+static const struct nla_policy ack_mapinfo_policy[IWPM_NLA_MAPINFO_NUM_MAX] = {
+	[IWPM_NLA_MAPINFO_SEQ]    =   { .type = NLA_U32 },
+	[IWPM_NLA_MAPINFO_SEND_NUM] = { .type = NLA_U32 },
+	[IWPM_NLA_MAPINFO_ACK_NUM] =  { .type = NLA_U32 }
+};
+
+/*
+ * iwpm_ack_mapping_info_cb - Process a port mapper ack for
+ *                            the provided mapping info records
+ */
+int iwpm_ack_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct nlattr *nltb[IWPM_NLA_MAPINFO_NUM_MAX];
+	u32 mapinfo_send, mapinfo_ack;
+	const char *msg_type = "Mapping Info Ack";
+
+	if (iwpm_parse_nlmsg(cb, IWPM_NLA_MAPINFO_NUM_MAX,
+				ack_mapinfo_policy, nltb, msg_type))
+		return -EINVAL;
+	mapinfo_send = nla_get_u32(nltb[IWPM_NLA_MAPINFO_SEND_NUM]);
+	mapinfo_ack = nla_get_u32(nltb[IWPM_NLA_MAPINFO_ACK_NUM]);
+	if (mapinfo_ack != mapinfo_send)
+		pr_info("%s: Invalid mapinfo number (sent = %u ack-ed = %u)\n",
+			__func__, mapinfo_send, mapinfo_ack);
+	atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq);
+	return 0;
+}
+EXPORT_SYMBOL(iwpm_ack_mapping_info_cb);
+
+/* netlink attribute policy for the received port mapper error message */
+static const struct nla_policy map_error_policy[IWPM_NLA_ERR_MAX] = {
+	[IWPM_NLA_ERR_SEQ]        = { .type = NLA_U32 },
+	[IWPM_NLA_ERR_CODE]       = { .type = NLA_U16 },
+};
+
+/*
+ * iwpm_mapping_error_cb - Process a port mapper error message
+ */
+int iwpm_mapping_error_cb(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct iwpm_nlmsg_request *nlmsg_request = NULL;
+	int nl_client = RDMA_NL_GET_CLIENT(cb->nlh->nlmsg_type);
+	struct nlattr *nltb[IWPM_NLA_ERR_MAX];
+	u32 msg_seq;
+	u16 err_code;
+	const char *msg_type = "Mapping Error Msg";
+
+	if (iwpm_parse_nlmsg(cb, IWPM_NLA_ERR_MAX,
+				map_error_policy, nltb, msg_type))
+		return -EINVAL;
+
+	msg_seq = nla_get_u32(nltb[IWPM_NLA_ERR_SEQ]);
+	err_code = nla_get_u16(nltb[IWPM_NLA_ERR_CODE]);
+	pr_info("%s: Received msg seq = %u err code = %u client = %d\n",
+				__func__, msg_seq, err_code, nl_client);
+	/* look for nlmsg_request */
+	nlmsg_request = iwpm_find_nlmsg_request(msg_seq);
+	if (!nlmsg_request) {
+		/* not all errors have associated requests */
+		pr_debug("Could not find matching req (seq = %u)\n", msg_seq);
+		return 0;
+	}
+	atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq);
+	nlmsg_request->err_code = err_code;
+	nlmsg_request->request_done = 1;
+	/* always for found request */
+	kref_put(&nlmsg_request->kref, iwpm_free_nlmsg_request);
+	barrier();
+	wake_up(&nlmsg_request->waitq);
+	return 0;
+}
+EXPORT_SYMBOL(iwpm_mapping_error_cb);
diff --git a/drivers/infiniband/core/iwpm_util.c b/drivers/infiniband/core/iwpm_util.c
new file mode 100644
index 0000000..69e9f84
--- /dev/null
+++ b/drivers/infiniband/core/iwpm_util.c
@@ -0,0 +1,607 @@
+/*
+ * Copyright (c) 2014 Chelsio, Inc. All rights reserved.
+ * Copyright (c) 2014 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer in the documentation and/or other materials
+ *	  provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "iwpm_util.h"
+
+#define IWPM_HASH_BUCKET_SIZE	512
+#define IWPM_HASH_BUCKET_MASK	(IWPM_HASH_BUCKET_SIZE - 1)
+
+static LIST_HEAD(iwpm_nlmsg_req_list);
+static DEFINE_SPINLOCK(iwpm_nlmsg_req_lock);
+
+static struct hlist_head *iwpm_hash_bucket;
+static DEFINE_SPINLOCK(iwpm_mapinfo_lock);
+
+static DEFINE_MUTEX(iwpm_admin_lock);
+static struct iwpm_admin_data iwpm_admin;
+
+int iwpm_init(u8 nl_client)
+{
+	if (iwpm_valid_client(nl_client))
+		return -EINVAL;
+	mutex_lock(&iwpm_admin_lock);
+	if (atomic_read(&iwpm_admin.refcount) == 0) {
+		iwpm_hash_bucket = kzalloc(IWPM_HASH_BUCKET_SIZE *
+					sizeof(struct hlist_head), GFP_KERNEL);
+		if (!iwpm_hash_bucket) {
+			mutex_unlock(&iwpm_admin_lock);
+			pr_err("%s Unable to create mapinfo hash table\n", __func__);
+			return -ENOMEM;
+		}
+	}
+	atomic_inc(&iwpm_admin.refcount);
+	mutex_unlock(&iwpm_admin_lock);
+	iwpm_set_valid(nl_client, 1);
+	return 0;
+}
+EXPORT_SYMBOL(iwpm_init);
+
+static void free_hash_bucket(void);
+
+int iwpm_exit(u8 nl_client)
+{
+
+	if (!iwpm_valid_client(nl_client))
+		return -EINVAL;
+	mutex_lock(&iwpm_admin_lock);
+	if (atomic_read(&iwpm_admin.refcount) == 0) {
+		mutex_unlock(&iwpm_admin_lock);
+		pr_err("%s Incorrect usage - negative refcount\n", __func__);
+		return -EINVAL;
+	}
+	if (atomic_dec_and_test(&iwpm_admin.refcount)) {
+		free_hash_bucket();
+		pr_debug("%s: Mapinfo hash table is destroyed\n", __func__);
+	}
+	mutex_unlock(&iwpm_admin_lock);
+	iwpm_set_valid(nl_client, 0);
+	return 0;
+}
+EXPORT_SYMBOL(iwpm_exit);
+
+static struct hlist_head *get_hash_bucket_head(struct sockaddr_storage *,
+					       struct sockaddr_storage *);
+
+int iwpm_create_mapinfo(struct sockaddr_storage *local_sockaddr,
+			struct sockaddr_storage *mapped_sockaddr,
+			u8 nl_client)
+{
+	struct hlist_head *hash_bucket_head;
+	struct iwpm_mapping_info *map_info;
+	unsigned long flags;
+
+	if (!iwpm_valid_client(nl_client))
+		return -EINVAL;
+	map_info = kzalloc(sizeof(struct iwpm_mapping_info), GFP_KERNEL);
+	if (!map_info) {
+		pr_err("%s: Unable to allocate a mapping info\n", __func__);
+		return -ENOMEM;
+	}
+	memcpy(&map_info->local_sockaddr, local_sockaddr,
+	       sizeof(struct sockaddr_storage));
+	memcpy(&map_info->mapped_sockaddr, mapped_sockaddr,
+	       sizeof(struct sockaddr_storage));
+	map_info->nl_client = nl_client;
+
+	spin_lock_irqsave(&iwpm_mapinfo_lock, flags);
+	if (iwpm_hash_bucket) {
+		hash_bucket_head = get_hash_bucket_head(
+					&map_info->local_sockaddr,
+					&map_info->mapped_sockaddr);
+		hlist_add_head(&map_info->hlist_node, hash_bucket_head);
+	}
+	spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags);
+	return 0;
+}
+EXPORT_SYMBOL(iwpm_create_mapinfo);
+
+int iwpm_remove_mapinfo(struct sockaddr_storage *local_sockaddr,
+			struct sockaddr_storage *mapped_local_addr)
+{
+	struct hlist_node *tmp_hlist_node;
+	struct hlist_head *hash_bucket_head;
+	struct iwpm_mapping_info *map_info = NULL;
+	unsigned long flags;
+	int ret = -EINVAL;
+
+	spin_lock_irqsave(&iwpm_mapinfo_lock, flags);
+	if (iwpm_hash_bucket) {
+		hash_bucket_head = get_hash_bucket_head(
+					local_sockaddr,
+					mapped_local_addr);
+		hlist_for_each_entry_safe(map_info, tmp_hlist_node,
+					hash_bucket_head, hlist_node) {
+
+			if (!iwpm_compare_sockaddr(&map_info->mapped_sockaddr,
+						mapped_local_addr)) {
+
+				hlist_del_init(&map_info->hlist_node);
+				kfree(map_info);
+				ret = 0;
+				break;
+			}
+		}
+	}
+	spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags);
+	return ret;
+}
+EXPORT_SYMBOL(iwpm_remove_mapinfo);
+
+static void free_hash_bucket(void)
+{
+	struct hlist_node *tmp_hlist_node;
+	struct iwpm_mapping_info *map_info;
+	unsigned long flags;
+	int i;
+
+	/* remove all the mapinfo data from the list */
+	spin_lock_irqsave(&iwpm_mapinfo_lock, flags);
+	for (i = 0; i < IWPM_HASH_BUCKET_SIZE; i++) {
+		hlist_for_each_entry_safe(map_info, tmp_hlist_node,
+			&iwpm_hash_bucket[i], hlist_node) {
+
+				hlist_del_init(&map_info->hlist_node);
+				kfree(map_info);
+			}
+	}
+	/* free the hash list */
+	kfree(iwpm_hash_bucket);
+	iwpm_hash_bucket = NULL;
+	spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags);
+}
+
+struct iwpm_nlmsg_request *iwpm_get_nlmsg_request(__u32 nlmsg_seq,
+					u8 nl_client, gfp_t gfp)
+{
+	struct iwpm_nlmsg_request *nlmsg_request = NULL;
+	unsigned long flags;
+
+	nlmsg_request = kzalloc(sizeof(struct iwpm_nlmsg_request), gfp);
+	if (!nlmsg_request) {
+		pr_err("%s Unable to allocate a nlmsg_request\n", __func__);
+		return NULL;
+	}
+	spin_lock_irqsave(&iwpm_nlmsg_req_lock, flags);
+	list_add_tail(&nlmsg_request->inprocess_list, &iwpm_nlmsg_req_list);
+	spin_unlock_irqrestore(&iwpm_nlmsg_req_lock, flags);
+
+	kref_init(&nlmsg_request->kref);
+	kref_get(&nlmsg_request->kref);
+	nlmsg_request->nlmsg_seq = nlmsg_seq;
+	nlmsg_request->nl_client = nl_client;
+	nlmsg_request->request_done = 0;
+	nlmsg_request->err_code = 0;
+	return nlmsg_request;
+}
+
+void iwpm_free_nlmsg_request(struct kref *kref)
+{
+	struct iwpm_nlmsg_request *nlmsg_request;
+	unsigned long flags;
+
+	nlmsg_request = container_of(kref, struct iwpm_nlmsg_request, kref);
+
+	spin_lock_irqsave(&iwpm_nlmsg_req_lock, flags);
+	list_del_init(&nlmsg_request->inprocess_list);
+	spin_unlock_irqrestore(&iwpm_nlmsg_req_lock, flags);
+
+	if (!nlmsg_request->request_done)
+		pr_debug("%s Freeing incomplete nlmsg request (seq = %u).\n",
+			__func__, nlmsg_request->nlmsg_seq);
+	kfree(nlmsg_request);
+}
+
+struct iwpm_nlmsg_request *iwpm_find_nlmsg_request(__u32 echo_seq)
+{
+	struct iwpm_nlmsg_request *nlmsg_request;
+	struct iwpm_nlmsg_request *found_request = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&iwpm_nlmsg_req_lock, flags);
+	list_for_each_entry(nlmsg_request, &iwpm_nlmsg_req_list,
+			    inprocess_list) {
+		if (nlmsg_request->nlmsg_seq == echo_seq) {
+			found_request = nlmsg_request;
+			kref_get(&nlmsg_request->kref);
+			break;
+		}
+	}
+	spin_unlock_irqrestore(&iwpm_nlmsg_req_lock, flags);
+	return found_request;
+}
+
+int iwpm_wait_complete_req(struct iwpm_nlmsg_request *nlmsg_request)
+{
+	int ret;
+	init_waitqueue_head(&nlmsg_request->waitq);
+
+	ret = wait_event_timeout(nlmsg_request->waitq,
+			(nlmsg_request->request_done != 0), IWPM_NL_TIMEOUT);
+	if (!ret) {
+		ret = -EINVAL;
+		pr_info("%s: Timeout %d sec for netlink request (seq = %u)\n",
+			__func__, (IWPM_NL_TIMEOUT/HZ), nlmsg_request->nlmsg_seq);
+	} else {
+		ret = nlmsg_request->err_code;
+	}
+	kref_put(&nlmsg_request->kref, iwpm_free_nlmsg_request);
+	return ret;
+}
+
+int iwpm_get_nlmsg_seq(void)
+{
+	return atomic_inc_return(&iwpm_admin.nlmsg_seq);
+}
+
+int iwpm_valid_client(u8 nl_client)
+{
+	if (nl_client >= RDMA_NL_NUM_CLIENTS)
+		return 0;
+	return iwpm_admin.client_list[nl_client];
+}
+
+void iwpm_set_valid(u8 nl_client, int valid)
+{
+	if (nl_client >= RDMA_NL_NUM_CLIENTS)
+		return;
+	iwpm_admin.client_list[nl_client] = valid;
+}
+
+/* valid client */
+int iwpm_registered_client(u8 nl_client)
+{
+	return iwpm_admin.reg_list[nl_client];
+}
+
+/* valid client */
+void iwpm_set_registered(u8 nl_client, int reg)
+{
+	iwpm_admin.reg_list[nl_client] = reg;
+}
+
+int iwpm_compare_sockaddr(struct sockaddr_storage *a_sockaddr,
+				struct sockaddr_storage *b_sockaddr)
+{
+	if (a_sockaddr->ss_family != b_sockaddr->ss_family)
+		return 1;
+	if (a_sockaddr->ss_family == AF_INET) {
+		struct sockaddr_in *a4_sockaddr =
+			(struct sockaddr_in *)a_sockaddr;
+		struct sockaddr_in *b4_sockaddr =
+			(struct sockaddr_in *)b_sockaddr;
+		if (!memcmp(&a4_sockaddr->sin_addr,
+			&b4_sockaddr->sin_addr, sizeof(struct in_addr))
+			&& a4_sockaddr->sin_port == b4_sockaddr->sin_port)
+				return 0;
+
+	} else if (a_sockaddr->ss_family == AF_INET6) {
+		struct sockaddr_in6 *a6_sockaddr =
+			(struct sockaddr_in6 *)a_sockaddr;
+		struct sockaddr_in6 *b6_sockaddr =
+			(struct sockaddr_in6 *)b_sockaddr;
+		if (!memcmp(&a6_sockaddr->sin6_addr,
+			&b6_sockaddr->sin6_addr, sizeof(struct in6_addr))
+			&& a6_sockaddr->sin6_port == b6_sockaddr->sin6_port)
+				return 0;
+
+	} else {
+		pr_err("%s: Invalid sockaddr family\n", __func__);
+	}
+	return 1;
+}
+
+struct sk_buff *iwpm_create_nlmsg(u32 nl_op, struct nlmsghdr **nlh,
+						int nl_client)
+{
+	struct sk_buff *skb = NULL;
+
+	skb = dev_alloc_skb(NLMSG_GOODSIZE);
+	if (!skb) {
+		pr_err("%s Unable to allocate skb\n", __func__);
+		goto create_nlmsg_exit;
+	}
+	if (!(ibnl_put_msg(skb, nlh, 0, 0, nl_client, nl_op,
+			   NLM_F_REQUEST))) {
+		pr_warn("%s: Unable to put the nlmsg header\n", __func__);
+		dev_kfree_skb(skb);
+		skb = NULL;
+	}
+create_nlmsg_exit:
+	return skb;
+}
+
+int iwpm_parse_nlmsg(struct netlink_callback *cb, int policy_max,
+				   const struct nla_policy *nlmsg_policy,
+				   struct nlattr *nltb[], const char *msg_type)
+{
+	int nlh_len = 0;
+	int ret;
+	const char *err_str = "";
+
+	ret = nlmsg_validate(cb->nlh, nlh_len, policy_max-1, nlmsg_policy);
+	if (ret) {
+		err_str = "Invalid attribute";
+		goto parse_nlmsg_error;
+	}
+	ret = nlmsg_parse(cb->nlh, nlh_len, nltb, policy_max-1, nlmsg_policy);
+	if (ret) {
+		err_str = "Unable to parse the nlmsg";
+		goto parse_nlmsg_error;
+	}
+	ret = iwpm_validate_nlmsg_attr(nltb, policy_max);
+	if (ret) {
+		err_str = "Invalid NULL attribute";
+		goto parse_nlmsg_error;
+	}
+	return 0;
+parse_nlmsg_error:
+	pr_warn("%s: %s (msg type %s ret = %d)\n",
+			__func__, err_str, msg_type, ret);
+	return ret;
+}
+
+void iwpm_print_sockaddr(struct sockaddr_storage *sockaddr, char *msg)
+{
+	struct sockaddr_in6 *sockaddr_v6;
+	struct sockaddr_in *sockaddr_v4;
+
+	switch (sockaddr->ss_family) {
+	case AF_INET:
+		sockaddr_v4 = (struct sockaddr_in *)sockaddr;
+		pr_debug("%s IPV4 %pI4: %u(0x%04X)\n",
+			msg, &sockaddr_v4->sin_addr,
+			ntohs(sockaddr_v4->sin_port),
+			ntohs(sockaddr_v4->sin_port));
+		break;
+	case AF_INET6:
+		sockaddr_v6 = (struct sockaddr_in6 *)sockaddr;
+		pr_debug("%s IPV6 %pI6: %u(0x%04X)\n",
+			msg, &sockaddr_v6->sin6_addr,
+			ntohs(sockaddr_v6->sin6_port),
+			ntohs(sockaddr_v6->sin6_port));
+		break;
+	default:
+		break;
+	}
+}
+
+static u32 iwpm_ipv6_jhash(struct sockaddr_in6 *ipv6_sockaddr)
+{
+	u32 ipv6_hash = jhash(&ipv6_sockaddr->sin6_addr, sizeof(struct in6_addr), 0);
+	u32 hash = jhash_2words(ipv6_hash, (__force u32) ipv6_sockaddr->sin6_port, 0);
+	return hash;
+}
+
+static u32 iwpm_ipv4_jhash(struct sockaddr_in *ipv4_sockaddr)
+{
+	u32 ipv4_hash = jhash(&ipv4_sockaddr->sin_addr, sizeof(struct in_addr), 0);
+	u32 hash = jhash_2words(ipv4_hash, (__force u32) ipv4_sockaddr->sin_port, 0);
+	return hash;
+}
+
+static struct hlist_head *get_hash_bucket_head(struct sockaddr_storage
+					       *local_sockaddr,
+					       struct sockaddr_storage
+					       *mapped_sockaddr)
+{
+	u32 local_hash, mapped_hash, hash;
+
+	if (local_sockaddr->ss_family == AF_INET) {
+		local_hash = iwpm_ipv4_jhash((struct sockaddr_in *) local_sockaddr);
+		mapped_hash = iwpm_ipv4_jhash((struct sockaddr_in *) mapped_sockaddr);
+
+	} else if (local_sockaddr->ss_family == AF_INET6) {
+		local_hash = iwpm_ipv6_jhash((struct sockaddr_in6 *) local_sockaddr);
+		mapped_hash = iwpm_ipv6_jhash((struct sockaddr_in6 *) mapped_sockaddr);
+	} else {
+		pr_err("%s: Invalid sockaddr family\n", __func__);
+		return NULL;
+	}
+
+	if (local_hash == mapped_hash) /* if port mapper isn't available */
+		hash = local_hash;
+	else
+		hash = jhash_2words(local_hash, mapped_hash, 0);
+
+	return &iwpm_hash_bucket[hash & IWPM_HASH_BUCKET_MASK];
+}
+
+static int send_mapinfo_num(u32 mapping_num, u8 nl_client, int iwpm_pid)
+{
+	struct sk_buff *skb = NULL;
+	struct nlmsghdr *nlh;
+	u32 msg_seq;
+	const char *err_str = "";
+	int ret = -EINVAL;
+
+	skb = iwpm_create_nlmsg(RDMA_NL_IWPM_MAPINFO_NUM, &nlh, nl_client);
+	if (!skb) {
+		err_str = "Unable to create a nlmsg";
+		goto mapinfo_num_error;
+	}
+	nlh->nlmsg_seq = iwpm_get_nlmsg_seq();
+	msg_seq = 0;
+	err_str = "Unable to put attribute of mapinfo number nlmsg";
+	ret = ibnl_put_attr(skb, nlh, sizeof(u32), &msg_seq, IWPM_NLA_MAPINFO_SEQ);
+	if (ret)
+		goto mapinfo_num_error;
+	ret = ibnl_put_attr(skb, nlh, sizeof(u32),
+				&mapping_num, IWPM_NLA_MAPINFO_SEND_NUM);
+	if (ret)
+		goto mapinfo_num_error;
+	ret = ibnl_unicast(skb, nlh, iwpm_pid);
+	if (ret) {
+		skb = NULL;
+		err_str = "Unable to send a nlmsg";
+		goto mapinfo_num_error;
+	}
+	pr_debug("%s: Sent mapping number = %d\n", __func__, mapping_num);
+	return 0;
+mapinfo_num_error:
+	pr_info("%s: %s\n", __func__, err_str);
+	if (skb)
+		dev_kfree_skb(skb);
+	return ret;
+}
+
+static int send_nlmsg_done(struct sk_buff *skb, u8 nl_client, int iwpm_pid)
+{
+	struct nlmsghdr *nlh = NULL;
+	int ret = 0;
+
+	if (!skb)
+		return ret;
+	if (!(ibnl_put_msg(skb, &nlh, 0, 0, nl_client,
+			   RDMA_NL_IWPM_MAPINFO, NLM_F_MULTI))) {
+		pr_warn("%s Unable to put NLMSG_DONE\n", __func__);
+		return -ENOMEM;
+	}
+	nlh->nlmsg_type = NLMSG_DONE;
+	ret = ibnl_unicast(skb, (struct nlmsghdr *)skb->data, iwpm_pid);
+	if (ret)
+		pr_warn("%s Unable to send a nlmsg\n", __func__);
+	return ret;
+}
+
+int iwpm_send_mapinfo(u8 nl_client, int iwpm_pid)
+{
+	struct iwpm_mapping_info *map_info;
+	struct sk_buff *skb = NULL;
+	struct nlmsghdr *nlh;
+	int skb_num = 0, mapping_num = 0;
+	int i = 0, nlmsg_bytes = 0;
+	unsigned long flags;
+	const char *err_str = "";
+	int ret;
+
+	skb = dev_alloc_skb(NLMSG_GOODSIZE);
+	if (!skb) {
+		ret = -ENOMEM;
+		err_str = "Unable to allocate skb";
+		goto send_mapping_info_exit;
+	}
+	skb_num++;
+	spin_lock_irqsave(&iwpm_mapinfo_lock, flags);
+	for (i = 0; i < IWPM_HASH_BUCKET_SIZE; i++) {
+		hlist_for_each_entry(map_info, &iwpm_hash_bucket[i],
+				     hlist_node) {
+			if (map_info->nl_client != nl_client)
+				continue;
+			nlh = NULL;
+			if (!(ibnl_put_msg(skb, &nlh, 0, 0, nl_client,
+					RDMA_NL_IWPM_MAPINFO, NLM_F_MULTI))) {
+				ret = -ENOMEM;
+				err_str = "Unable to put the nlmsg header";
+				goto send_mapping_info_unlock;
+			}
+			err_str = "Unable to put attribute of the nlmsg";
+			ret = ibnl_put_attr(skb, nlh,
+					sizeof(struct sockaddr_storage),
+					&map_info->local_sockaddr,
+					IWPM_NLA_MAPINFO_LOCAL_ADDR);
+			if (ret)
+				goto send_mapping_info_unlock;
+
+			ret = ibnl_put_attr(skb, nlh,
+					sizeof(struct sockaddr_storage),
+					&map_info->mapped_sockaddr,
+					IWPM_NLA_MAPINFO_MAPPED_ADDR);
+			if (ret)
+				goto send_mapping_info_unlock;
+
+			iwpm_print_sockaddr(&map_info->local_sockaddr,
+				"send_mapping_info: Local sockaddr:");
+			iwpm_print_sockaddr(&map_info->mapped_sockaddr,
+				"send_mapping_info: Mapped local sockaddr:");
+			mapping_num++;
+			nlmsg_bytes += nlh->nlmsg_len;
+
+			/* check if all mappings can fit in one skb */
+			if (NLMSG_GOODSIZE - nlmsg_bytes < nlh->nlmsg_len * 2) {
+				/* and leave room for NLMSG_DONE */
+				nlmsg_bytes = 0;
+				skb_num++;
+				spin_unlock_irqrestore(&iwpm_mapinfo_lock,
+						       flags);
+				/* send the skb */
+				ret = send_nlmsg_done(skb, nl_client, iwpm_pid);
+				skb = NULL;
+				if (ret) {
+					err_str = "Unable to send map info";
+					goto send_mapping_info_exit;
+				}
+				if (skb_num == IWPM_MAPINFO_SKB_COUNT) {
+					ret = -ENOMEM;
+					err_str = "Insufficient skbs for map info";
+					goto send_mapping_info_exit;
+				}
+				skb = dev_alloc_skb(NLMSG_GOODSIZE);
+				if (!skb) {
+					ret = -ENOMEM;
+					err_str = "Unable to allocate skb";
+					goto send_mapping_info_exit;
+				}
+				spin_lock_irqsave(&iwpm_mapinfo_lock, flags);
+			}
+		}
+	}
+send_mapping_info_unlock:
+	spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags);
+send_mapping_info_exit:
+	if (ret) {
+		pr_warn("%s: %s (ret = %d)\n", __func__, err_str, ret);
+		if (skb)
+			dev_kfree_skb(skb);
+		return ret;
+	}
+	send_nlmsg_done(skb, nl_client, iwpm_pid);
+	return send_mapinfo_num(mapping_num, nl_client, iwpm_pid);
+}
+
+int iwpm_mapinfo_available(void)
+{
+	unsigned long flags;
+	int full_bucket = 0, i = 0;
+
+	spin_lock_irqsave(&iwpm_mapinfo_lock, flags);
+	if (iwpm_hash_bucket) {
+		for (i = 0; i < IWPM_HASH_BUCKET_SIZE; i++) {
+			if (!hlist_empty(&iwpm_hash_bucket[i])) {
+				full_bucket = 1;
+				break;
+			}
+		}
+	}
+	spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags);
+	return full_bucket;
+}
diff --git a/drivers/infiniband/core/iwpm_util.h b/drivers/infiniband/core/iwpm_util.h
new file mode 100644
index 0000000..9777c86
--- /dev/null
+++ b/drivers/infiniband/core/iwpm_util.h
@@ -0,0 +1,238 @@
+/*
+ * Copyright (c) 2014 Intel Corporation. All rights reserved.
+ * Copyright (c) 2014 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer in the documentation and/or other materials
+ *	  provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef _IWPM_UTIL_H
+#define _IWPM_UTIL_H
+
+#include <linux/module.h>
+#include <linux/io.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/spinlock.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/delay.h>
+#include <linux/workqueue.h>
+#include <linux/mutex.h>
+#include <linux/jhash.h>
+#include <linux/kref.h>
+#include <net/netlink.h>
+#include <linux/errno.h>
+#include <rdma/iw_portmap.h>
+#include <rdma/rdma_netlink.h>
+
+
+#define IWPM_NL_RETRANS		3
+#define IWPM_NL_TIMEOUT		(10*HZ)
+#define IWPM_MAPINFO_SKB_COUNT	20
+
+#define IWPM_PID_UNDEFINED     -1
+#define IWPM_PID_UNAVAILABLE   -2
+
+struct iwpm_nlmsg_request {
+	struct list_head    inprocess_list;
+	__u32               nlmsg_seq;
+	void                *req_buffer;
+	u8	            nl_client;
+	u8                  request_done;
+	u16                 err_code;
+	wait_queue_head_t   waitq;
+	struct kref         kref;
+};
+
+struct iwpm_mapping_info {
+	struct hlist_node hlist_node;
+	struct sockaddr_storage local_sockaddr;
+	struct sockaddr_storage mapped_sockaddr;
+	u8     nl_client;
+};
+
+struct iwpm_admin_data {
+	atomic_t refcount;
+	atomic_t nlmsg_seq;
+	int      client_list[RDMA_NL_NUM_CLIENTS];
+	int      reg_list[RDMA_NL_NUM_CLIENTS];
+};
+
+/**
+ * iwpm_get_nlmsg_request - Allocate and initialize netlink message request
+ * @nlmsg_seq: Sequence number of the netlink message
+ * @nl_client: The index of the netlink client
+ * @gfp: Indicates how the memory for the request should be allocated
+ *
+ * Returns the newly allocated netlink request object if successful,
+ * otherwise returns NULL
+ */
+struct iwpm_nlmsg_request *iwpm_get_nlmsg_request(__u32 nlmsg_seq,
+						u8 nl_client, gfp_t gfp);
+
+/**
+ * iwpm_free_nlmsg_request - Deallocate netlink message request
+ * @kref: Holds reference of netlink message request
+ */
+void iwpm_free_nlmsg_request(struct kref *kref);
+
+/**
+ * iwpm_find_nlmsg_request - Find netlink message request in the request list
+ * @echo_seq: Sequence number of the netlink request to find
+ *
+ * Returns the found netlink message request,
+ * if not found, returns NULL
+ */
+struct iwpm_nlmsg_request *iwpm_find_nlmsg_request(__u32 echo_seq);
+
+/**
+ * iwpm_wait_complete_req - Block while servicing the netlink request
+ * @nlmsg_request: Netlink message request to service
+ *
+ * Wakes up, after the request is completed or expired
+ * Returns 0 if the request is complete without error
+ */
+int iwpm_wait_complete_req(struct iwpm_nlmsg_request *nlmsg_request);
+
+/**
+ * iwpm_get_nlmsg_seq - Get the sequence number for a netlink
+ *			message to send to the port mapper
+ *
+ * Returns the sequence number for the netlink message.
+ */
+int iwpm_get_nlmsg_seq(void);
+
+/**
+ * iwpm_valid_client - Check if the port mapper client is valid
+ * @nl_client: The index of the netlink client
+ *
+ * Valid clients need to call iwpm_init() before using
+ * the port mapper
+ */
+int iwpm_valid_client(u8 nl_client);
+
+/**
+ * iwpm_set_valid - Set the port mapper client to valid or not
+ * @nl_client: The index of the netlink client
+ * @valid: 1 if valid or 0 if invalid
+ */
+void iwpm_set_valid(u8 nl_client, int valid);
+
+/**
+ * iwpm_registered_client - Check if the port mapper client is registered
+ * @nl_client: The index of the netlink client
+ *
+ * Call iwpm_register_pid() to register a client
+ */
+int iwpm_registered_client(u8 nl_client);
+
+/**
+ * iwpm_set_registered - Set the port mapper client to registered or not
+ * @nl_client: The index of the netlink client
+ * @reg: 1 if registered or 0 if not
+ */
+void iwpm_set_registered(u8 nl_client, int reg);
+
+/**
+ * iwpm_send_mapinfo - Send local and mapped IPv4/IPv6 address info of
+ *                     a client to the user space port mapper
+ * @nl_client: The index of the netlink client
+ * @iwpm_pid: The pid of the user space port mapper
+ *
+ * If successful, returns the number of sent mapping info records
+ */
+int iwpm_send_mapinfo(u8 nl_client, int iwpm_pid);
+
+/**
+ * iwpm_mapinfo_available - Check if any mapping info records is available
+ *		            in the hash table
+ *
+ * Returns 1 if mapping information is available, otherwise returns 0
+ */
+int iwpm_mapinfo_available(void);
+
+/**
+ * iwpm_compare_sockaddr - Compare two sockaddr storage structs
+ *
+ * Returns 0 if they are holding the same ip/tcp address info,
+ * otherwise returns 1
+ */
+int iwpm_compare_sockaddr(struct sockaddr_storage *a_sockaddr,
+			struct sockaddr_storage *b_sockaddr);
+
+/**
+ * iwpm_validate_nlmsg_attr - Check for NULL netlink attributes
+ * @nltb: Holds address of each netlink message attributes
+ * @nla_count: Number of netlink message attributes
+ *
+ * Returns error if any of the nla_count attributes is NULL
+ */
+static inline int iwpm_validate_nlmsg_attr(struct nlattr *nltb[],
+					   int nla_count)
+{
+	int i;
+	for (i = 1; i < nla_count; i++) {
+		if (!nltb[i])
+			return -EINVAL;
+	}
+	return 0;
+}
+
+/**
+ * iwpm_create_nlmsg - Allocate skb and form a netlink message
+ * @nl_op: Netlink message opcode
+ * @nlh: Holds address of the netlink message header in skb
+ * @nl_client: The index of the netlink client
+ *
+ * Returns the newly allcated skb, or NULL if the tailroom of the skb
+ * is insufficient to store the message header and payload
+ */
+struct sk_buff *iwpm_create_nlmsg(u32 nl_op, struct nlmsghdr **nlh,
+					int nl_client);
+
+/**
+ * iwpm_parse_nlmsg - Validate and parse the received netlink message
+ * @cb: Netlink callback structure
+ * @policy_max: Maximum attribute type to be expected
+ * @nlmsg_policy: Validation policy
+ * @nltb: Array to store policy_max parsed elements
+ * @msg_type: Type of netlink message
+ *
+ * Returns 0 on success or a negative error code
+ */
+int iwpm_parse_nlmsg(struct netlink_callback *cb, int policy_max,
+				const struct nla_policy *nlmsg_policy,
+				struct nlattr *nltb[], const char *msg_type);
+
+/**
+ * iwpm_print_sockaddr - Print IPv4/IPv6 address and TCP port
+ * @sockaddr: Socket address to print
+ * @msg: Message to print
+ */
+void iwpm_print_sockaddr(struct sockaddr_storage *sockaddr, char *msg);
+#endif
diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
new file mode 100644
index 0000000..74c30f4
--- /dev/null
+++ b/drivers/infiniband/core/mad.c
@@ -0,0 +1,3176 @@
+/*
+ * Copyright (c) 2004-2007 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2005 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies Ltd.  All rights reserved.
+ * Copyright (c) 2009 HNR Consulting. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/dma-mapping.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <rdma/ib_cache.h>
+
+#include "mad_priv.h"
+#include "mad_rmpp.h"
+#include "smi.h"
+#include "agent.h"
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_DESCRIPTION("kernel IB MAD API");
+MODULE_AUTHOR("Hal Rosenstock");
+MODULE_AUTHOR("Sean Hefty");
+
+static int mad_sendq_size = IB_MAD_QP_SEND_SIZE;
+static int mad_recvq_size = IB_MAD_QP_RECV_SIZE;
+
+module_param_named(send_queue_size, mad_sendq_size, int, 0444);
+MODULE_PARM_DESC(send_queue_size, "Size of send queue in number of work requests");
+module_param_named(recv_queue_size, mad_recvq_size, int, 0444);
+MODULE_PARM_DESC(recv_queue_size, "Size of receive queue in number of work requests");
+
+static struct kmem_cache *ib_mad_cache;
+
+static struct list_head ib_mad_port_list;
+static u32 ib_mad_client_id = 0;
+
+/* Port list lock */
+static DEFINE_SPINLOCK(ib_mad_port_list_lock);
+
+/* Forward declarations */
+static int method_in_use(struct ib_mad_mgmt_method_table **method,
+			 struct ib_mad_reg_req *mad_reg_req);
+static void remove_mad_reg_req(struct ib_mad_agent_private *priv);
+static struct ib_mad_agent_private *find_mad_agent(
+					struct ib_mad_port_private *port_priv,
+					struct ib_mad *mad);
+static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info,
+				    struct ib_mad_private *mad);
+static void cancel_mads(struct ib_mad_agent_private *mad_agent_priv);
+static void timeout_sends(struct work_struct *work);
+static void local_completions(struct work_struct *work);
+static int add_nonoui_reg_req(struct ib_mad_reg_req *mad_reg_req,
+			      struct ib_mad_agent_private *agent_priv,
+			      u8 mgmt_class);
+static int add_oui_reg_req(struct ib_mad_reg_req *mad_reg_req,
+			   struct ib_mad_agent_private *agent_priv);
+
+/*
+ * Returns a ib_mad_port_private structure or NULL for a device/port
+ * Assumes ib_mad_port_list_lock is being held
+ */
+static inline struct ib_mad_port_private *
+__ib_get_mad_port(struct ib_device *device, int port_num)
+{
+	struct ib_mad_port_private *entry;
+
+	list_for_each_entry(entry, &ib_mad_port_list, port_list) {
+		if (entry->device == device && entry->port_num == port_num)
+			return entry;
+	}
+	return NULL;
+}
+
+/*
+ * Wrapper function to return a ib_mad_port_private structure or NULL
+ * for a device/port
+ */
+static inline struct ib_mad_port_private *
+ib_get_mad_port(struct ib_device *device, int port_num)
+{
+	struct ib_mad_port_private *entry;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ib_mad_port_list_lock, flags);
+	entry = __ib_get_mad_port(device, port_num);
+	spin_unlock_irqrestore(&ib_mad_port_list_lock, flags);
+
+	return entry;
+}
+
+static inline u8 convert_mgmt_class(u8 mgmt_class)
+{
+	/* Alias IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE to 0 */
+	return mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE ?
+		0 : mgmt_class;
+}
+
+static int get_spl_qp_index(enum ib_qp_type qp_type)
+{
+	switch (qp_type)
+	{
+	case IB_QPT_SMI:
+		return 0;
+	case IB_QPT_GSI:
+		return 1;
+	default:
+		return -1;
+	}
+}
+
+static int vendor_class_index(u8 mgmt_class)
+{
+	return mgmt_class - IB_MGMT_CLASS_VENDOR_RANGE2_START;
+}
+
+static int is_vendor_class(u8 mgmt_class)
+{
+	if ((mgmt_class < IB_MGMT_CLASS_VENDOR_RANGE2_START) ||
+	    (mgmt_class > IB_MGMT_CLASS_VENDOR_RANGE2_END))
+		return 0;
+	return 1;
+}
+
+static int is_vendor_oui(char *oui)
+{
+	if (oui[0] || oui[1] || oui[2])
+		return 1;
+	return 0;
+}
+
+static int is_vendor_method_in_use(
+		struct ib_mad_mgmt_vendor_class *vendor_class,
+		struct ib_mad_reg_req *mad_reg_req)
+{
+	struct ib_mad_mgmt_method_table *method;
+	int i;
+
+	for (i = 0; i < MAX_MGMT_OUI; i++) {
+		if (!memcmp(vendor_class->oui[i], mad_reg_req->oui, 3)) {
+			method = vendor_class->method_table[i];
+			if (method) {
+				if (method_in_use(&method, mad_reg_req))
+					return 1;
+				else
+					break;
+			}
+		}
+	}
+	return 0;
+}
+
+int ib_response_mad(struct ib_mad *mad)
+{
+	return ((mad->mad_hdr.method & IB_MGMT_METHOD_RESP) ||
+		(mad->mad_hdr.method == IB_MGMT_METHOD_TRAP_REPRESS) ||
+		((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_BM) &&
+		 (mad->mad_hdr.attr_mod & IB_BM_ATTR_MOD_RESP)));
+}
+EXPORT_SYMBOL(ib_response_mad);
+
+/*
+ * ib_register_mad_agent - Register to send/receive MADs
+ */
+struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
+					   u8 port_num,
+					   enum ib_qp_type qp_type,
+					   struct ib_mad_reg_req *mad_reg_req,
+					   u8 rmpp_version,
+					   ib_mad_send_handler send_handler,
+					   ib_mad_recv_handler recv_handler,
+					   void *context,
+					   u32 registration_flags)
+{
+	struct ib_mad_port_private *port_priv;
+	struct ib_mad_agent *ret = ERR_PTR(-EINVAL);
+	struct ib_mad_agent_private *mad_agent_priv;
+	struct ib_mad_reg_req *reg_req = NULL;
+	struct ib_mad_mgmt_class_table *class;
+	struct ib_mad_mgmt_vendor_class_table *vendor;
+	struct ib_mad_mgmt_vendor_class *vendor_class;
+	struct ib_mad_mgmt_method_table *method;
+	int ret2, qpn;
+	unsigned long flags;
+	u8 mgmt_class, vclass;
+
+	/* Validate parameters */
+	qpn = get_spl_qp_index(qp_type);
+	if (qpn == -1) {
+		dev_notice(&device->dev,
+			   "ib_register_mad_agent: invalid QP Type %d\n",
+			   qp_type);
+		goto error1;
+	}
+
+	if (rmpp_version && rmpp_version != IB_MGMT_RMPP_VERSION) {
+		dev_notice(&device->dev,
+			   "ib_register_mad_agent: invalid RMPP Version %u\n",
+			   rmpp_version);
+		goto error1;
+	}
+
+	/* Validate MAD registration request if supplied */
+	if (mad_reg_req) {
+		if (mad_reg_req->mgmt_class_version >= MAX_MGMT_VERSION) {
+			dev_notice(&device->dev,
+				   "ib_register_mad_agent: invalid Class Version %u\n",
+				   mad_reg_req->mgmt_class_version);
+			goto error1;
+		}
+		if (!recv_handler) {
+			dev_notice(&device->dev,
+				   "ib_register_mad_agent: no recv_handler\n");
+			goto error1;
+		}
+		if (mad_reg_req->mgmt_class >= MAX_MGMT_CLASS) {
+			/*
+			 * IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE is the only
+			 * one in this range currently allowed
+			 */
+			if (mad_reg_req->mgmt_class !=
+			    IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
+				dev_notice(&device->dev,
+					   "ib_register_mad_agent: Invalid Mgmt Class 0x%x\n",
+					   mad_reg_req->mgmt_class);
+				goto error1;
+			}
+		} else if (mad_reg_req->mgmt_class == 0) {
+			/*
+			 * Class 0 is reserved in IBA and is used for
+			 * aliasing of IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE
+			 */
+			dev_notice(&device->dev,
+				   "ib_register_mad_agent: Invalid Mgmt Class 0\n");
+			goto error1;
+		} else if (is_vendor_class(mad_reg_req->mgmt_class)) {
+			/*
+			 * If class is in "new" vendor range,
+			 * ensure supplied OUI is not zero
+			 */
+			if (!is_vendor_oui(mad_reg_req->oui)) {
+				dev_notice(&device->dev,
+					   "ib_register_mad_agent: No OUI specified for class 0x%x\n",
+					   mad_reg_req->mgmt_class);
+				goto error1;
+			}
+		}
+		/* Make sure class supplied is consistent with RMPP */
+		if (!ib_is_mad_class_rmpp(mad_reg_req->mgmt_class)) {
+			if (rmpp_version) {
+				dev_notice(&device->dev,
+					   "ib_register_mad_agent: RMPP version for non-RMPP class 0x%x\n",
+					   mad_reg_req->mgmt_class);
+				goto error1;
+			}
+		}
+
+		/* Make sure class supplied is consistent with QP type */
+		if (qp_type == IB_QPT_SMI) {
+			if ((mad_reg_req->mgmt_class !=
+					IB_MGMT_CLASS_SUBN_LID_ROUTED) &&
+			    (mad_reg_req->mgmt_class !=
+					IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) {
+				dev_notice(&device->dev,
+					   "ib_register_mad_agent: Invalid SM QP type: class 0x%x\n",
+					   mad_reg_req->mgmt_class);
+				goto error1;
+			}
+		} else {
+			if ((mad_reg_req->mgmt_class ==
+					IB_MGMT_CLASS_SUBN_LID_ROUTED) ||
+			    (mad_reg_req->mgmt_class ==
+					IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) {
+				dev_notice(&device->dev,
+					   "ib_register_mad_agent: Invalid GS QP type: class 0x%x\n",
+					   mad_reg_req->mgmt_class);
+				goto error1;
+			}
+		}
+	} else {
+		/* No registration request supplied */
+		if (!send_handler)
+			goto error1;
+		if (registration_flags & IB_MAD_USER_RMPP)
+			goto error1;
+	}
+
+	/* Validate device and port */
+	port_priv = ib_get_mad_port(device, port_num);
+	if (!port_priv) {
+		dev_notice(&device->dev, "ib_register_mad_agent: Invalid port\n");
+		ret = ERR_PTR(-ENODEV);
+		goto error1;
+	}
+
+	/* Verify the QP requested is supported.  For example, Ethernet devices
+	 * will not have QP0 */
+	if (!port_priv->qp_info[qpn].qp) {
+		dev_notice(&device->dev,
+			   "ib_register_mad_agent: QP %d not supported\n", qpn);
+		ret = ERR_PTR(-EPROTONOSUPPORT);
+		goto error1;
+	}
+
+	/* Allocate structures */
+	mad_agent_priv = kzalloc(sizeof *mad_agent_priv, GFP_KERNEL);
+	if (!mad_agent_priv) {
+		ret = ERR_PTR(-ENOMEM);
+		goto error1;
+	}
+
+	mad_agent_priv->agent.mr = ib_get_dma_mr(port_priv->qp_info[qpn].qp->pd,
+						 IB_ACCESS_LOCAL_WRITE);
+	if (IS_ERR(mad_agent_priv->agent.mr)) {
+		ret = ERR_PTR(-ENOMEM);
+		goto error2;
+	}
+
+	if (mad_reg_req) {
+		reg_req = kmemdup(mad_reg_req, sizeof *reg_req, GFP_KERNEL);
+		if (!reg_req) {
+			ret = ERR_PTR(-ENOMEM);
+			goto error3;
+		}
+	}
+
+	/* Now, fill in the various structures */
+	mad_agent_priv->qp_info = &port_priv->qp_info[qpn];
+	mad_agent_priv->reg_req = reg_req;
+	mad_agent_priv->agent.rmpp_version = rmpp_version;
+	mad_agent_priv->agent.device = device;
+	mad_agent_priv->agent.recv_handler = recv_handler;
+	mad_agent_priv->agent.send_handler = send_handler;
+	mad_agent_priv->agent.context = context;
+	mad_agent_priv->agent.qp = port_priv->qp_info[qpn].qp;
+	mad_agent_priv->agent.port_num = port_num;
+	mad_agent_priv->agent.flags = registration_flags;
+	spin_lock_init(&mad_agent_priv->lock);
+	INIT_LIST_HEAD(&mad_agent_priv->send_list);
+	INIT_LIST_HEAD(&mad_agent_priv->wait_list);
+	INIT_LIST_HEAD(&mad_agent_priv->done_list);
+	INIT_LIST_HEAD(&mad_agent_priv->rmpp_list);
+	INIT_DELAYED_WORK(&mad_agent_priv->timed_work, timeout_sends);
+	INIT_LIST_HEAD(&mad_agent_priv->local_list);
+	INIT_WORK(&mad_agent_priv->local_work, local_completions);
+	atomic_set(&mad_agent_priv->refcount, 1);
+	init_completion(&mad_agent_priv->comp);
+
+	spin_lock_irqsave(&port_priv->reg_lock, flags);
+	mad_agent_priv->agent.hi_tid = ++ib_mad_client_id;
+
+	/*
+	 * Make sure MAD registration (if supplied)
+	 * is non overlapping with any existing ones
+	 */
+	if (mad_reg_req) {
+		mgmt_class = convert_mgmt_class(mad_reg_req->mgmt_class);
+		if (!is_vendor_class(mgmt_class)) {
+			class = port_priv->version[mad_reg_req->
+						   mgmt_class_version].class;
+			if (class) {
+				method = class->method_table[mgmt_class];
+				if (method) {
+					if (method_in_use(&method,
+							   mad_reg_req))
+						goto error4;
+				}
+			}
+			ret2 = add_nonoui_reg_req(mad_reg_req, mad_agent_priv,
+						  mgmt_class);
+		} else {
+			/* "New" vendor class range */
+			vendor = port_priv->version[mad_reg_req->
+						    mgmt_class_version].vendor;
+			if (vendor) {
+				vclass = vendor_class_index(mgmt_class);
+				vendor_class = vendor->vendor_class[vclass];
+				if (vendor_class) {
+					if (is_vendor_method_in_use(
+							vendor_class,
+							mad_reg_req))
+						goto error4;
+				}
+			}
+			ret2 = add_oui_reg_req(mad_reg_req, mad_agent_priv);
+		}
+		if (ret2) {
+			ret = ERR_PTR(ret2);
+			goto error4;
+		}
+	}
+
+	/* Add mad agent into port's agent list */
+	list_add_tail(&mad_agent_priv->agent_list, &port_priv->agent_list);
+	spin_unlock_irqrestore(&port_priv->reg_lock, flags);
+
+	return &mad_agent_priv->agent;
+
+error4:
+	spin_unlock_irqrestore(&port_priv->reg_lock, flags);
+	kfree(reg_req);
+error3:
+	ib_dereg_mr(mad_agent_priv->agent.mr);
+error2:
+	kfree(mad_agent_priv);
+error1:
+	return ret;
+}
+EXPORT_SYMBOL(ib_register_mad_agent);
+
+static inline int is_snooping_sends(int mad_snoop_flags)
+{
+	return (mad_snoop_flags &
+		(/*IB_MAD_SNOOP_POSTED_SENDS |
+		 IB_MAD_SNOOP_RMPP_SENDS |*/
+		 IB_MAD_SNOOP_SEND_COMPLETIONS /*|
+		 IB_MAD_SNOOP_RMPP_SEND_COMPLETIONS*/));
+}
+
+static inline int is_snooping_recvs(int mad_snoop_flags)
+{
+	return (mad_snoop_flags &
+		(IB_MAD_SNOOP_RECVS /*|
+		 IB_MAD_SNOOP_RMPP_RECVS*/));
+}
+
+static int register_snoop_agent(struct ib_mad_qp_info *qp_info,
+				struct ib_mad_snoop_private *mad_snoop_priv)
+{
+	struct ib_mad_snoop_private **new_snoop_table;
+	unsigned long flags;
+	int i;
+
+	spin_lock_irqsave(&qp_info->snoop_lock, flags);
+	/* Check for empty slot in array. */
+	for (i = 0; i < qp_info->snoop_table_size; i++)
+		if (!qp_info->snoop_table[i])
+			break;
+
+	if (i == qp_info->snoop_table_size) {
+		/* Grow table. */
+		new_snoop_table = krealloc(qp_info->snoop_table,
+					   sizeof mad_snoop_priv *
+					   (qp_info->snoop_table_size + 1),
+					   GFP_ATOMIC);
+		if (!new_snoop_table) {
+			i = -ENOMEM;
+			goto out;
+		}
+
+		qp_info->snoop_table = new_snoop_table;
+		qp_info->snoop_table_size++;
+	}
+	qp_info->snoop_table[i] = mad_snoop_priv;
+	atomic_inc(&qp_info->snoop_count);
+out:
+	spin_unlock_irqrestore(&qp_info->snoop_lock, flags);
+	return i;
+}
+
+struct ib_mad_agent *ib_register_mad_snoop(struct ib_device *device,
+					   u8 port_num,
+					   enum ib_qp_type qp_type,
+					   int mad_snoop_flags,
+					   ib_mad_snoop_handler snoop_handler,
+					   ib_mad_recv_handler recv_handler,
+					   void *context)
+{
+	struct ib_mad_port_private *port_priv;
+	struct ib_mad_agent *ret;
+	struct ib_mad_snoop_private *mad_snoop_priv;
+	int qpn;
+
+	/* Validate parameters */
+	if ((is_snooping_sends(mad_snoop_flags) && !snoop_handler) ||
+	    (is_snooping_recvs(mad_snoop_flags) && !recv_handler)) {
+		ret = ERR_PTR(-EINVAL);
+		goto error1;
+	}
+	qpn = get_spl_qp_index(qp_type);
+	if (qpn == -1) {
+		ret = ERR_PTR(-EINVAL);
+		goto error1;
+	}
+	port_priv = ib_get_mad_port(device, port_num);
+	if (!port_priv) {
+		ret = ERR_PTR(-ENODEV);
+		goto error1;
+	}
+	/* Allocate structures */
+	mad_snoop_priv = kzalloc(sizeof *mad_snoop_priv, GFP_KERNEL);
+	if (!mad_snoop_priv) {
+		ret = ERR_PTR(-ENOMEM);
+		goto error1;
+	}
+
+	/* Now, fill in the various structures */
+	mad_snoop_priv->qp_info = &port_priv->qp_info[qpn];
+	mad_snoop_priv->agent.device = device;
+	mad_snoop_priv->agent.recv_handler = recv_handler;
+	mad_snoop_priv->agent.snoop_handler = snoop_handler;
+	mad_snoop_priv->agent.context = context;
+	mad_snoop_priv->agent.qp = port_priv->qp_info[qpn].qp;
+	mad_snoop_priv->agent.port_num = port_num;
+	mad_snoop_priv->mad_snoop_flags = mad_snoop_flags;
+	init_completion(&mad_snoop_priv->comp);
+	mad_snoop_priv->snoop_index = register_snoop_agent(
+						&port_priv->qp_info[qpn],
+						mad_snoop_priv);
+	if (mad_snoop_priv->snoop_index < 0) {
+		ret = ERR_PTR(mad_snoop_priv->snoop_index);
+		goto error2;
+	}
+
+	atomic_set(&mad_snoop_priv->refcount, 1);
+	return &mad_snoop_priv->agent;
+
+error2:
+	kfree(mad_snoop_priv);
+error1:
+	return ret;
+}
+EXPORT_SYMBOL(ib_register_mad_snoop);
+
+static inline void deref_mad_agent(struct ib_mad_agent_private *mad_agent_priv)
+{
+	if (atomic_dec_and_test(&mad_agent_priv->refcount))
+		complete(&mad_agent_priv->comp);
+}
+
+static inline void deref_snoop_agent(struct ib_mad_snoop_private *mad_snoop_priv)
+{
+	if (atomic_dec_and_test(&mad_snoop_priv->refcount))
+		complete(&mad_snoop_priv->comp);
+}
+
+static void unregister_mad_agent(struct ib_mad_agent_private *mad_agent_priv)
+{
+	struct ib_mad_port_private *port_priv;
+	unsigned long flags;
+
+	/* Note that we could still be handling received MADs */
+
+	/*
+	 * Canceling all sends results in dropping received response
+	 * MADs, preventing us from queuing additional work
+	 */
+	cancel_mads(mad_agent_priv);
+	port_priv = mad_agent_priv->qp_info->port_priv;
+	cancel_delayed_work(&mad_agent_priv->timed_work);
+
+	spin_lock_irqsave(&port_priv->reg_lock, flags);
+	remove_mad_reg_req(mad_agent_priv);
+	list_del(&mad_agent_priv->agent_list);
+	spin_unlock_irqrestore(&port_priv->reg_lock, flags);
+
+	flush_workqueue(port_priv->wq);
+	ib_cancel_rmpp_recvs(mad_agent_priv);
+
+	deref_mad_agent(mad_agent_priv);
+	wait_for_completion(&mad_agent_priv->comp);
+
+	kfree(mad_agent_priv->reg_req);
+	ib_dereg_mr(mad_agent_priv->agent.mr);
+	kfree(mad_agent_priv);
+}
+
+static void unregister_mad_snoop(struct ib_mad_snoop_private *mad_snoop_priv)
+{
+	struct ib_mad_qp_info *qp_info;
+	unsigned long flags;
+
+	qp_info = mad_snoop_priv->qp_info;
+	spin_lock_irqsave(&qp_info->snoop_lock, flags);
+	qp_info->snoop_table[mad_snoop_priv->snoop_index] = NULL;
+	atomic_dec(&qp_info->snoop_count);
+	spin_unlock_irqrestore(&qp_info->snoop_lock, flags);
+
+	deref_snoop_agent(mad_snoop_priv);
+	wait_for_completion(&mad_snoop_priv->comp);
+
+	kfree(mad_snoop_priv);
+}
+
+/*
+ * ib_unregister_mad_agent - Unregisters a client from using MAD services
+ */
+int ib_unregister_mad_agent(struct ib_mad_agent *mad_agent)
+{
+	struct ib_mad_agent_private *mad_agent_priv;
+	struct ib_mad_snoop_private *mad_snoop_priv;
+
+	/* If the TID is zero, the agent can only snoop. */
+	if (mad_agent->hi_tid) {
+		mad_agent_priv = container_of(mad_agent,
+					      struct ib_mad_agent_private,
+					      agent);
+		unregister_mad_agent(mad_agent_priv);
+	} else {
+		mad_snoop_priv = container_of(mad_agent,
+					      struct ib_mad_snoop_private,
+					      agent);
+		unregister_mad_snoop(mad_snoop_priv);
+	}
+	return 0;
+}
+EXPORT_SYMBOL(ib_unregister_mad_agent);
+
+static void dequeue_mad(struct ib_mad_list_head *mad_list)
+{
+	struct ib_mad_queue *mad_queue;
+	unsigned long flags;
+
+	BUG_ON(!mad_list->mad_queue);
+	mad_queue = mad_list->mad_queue;
+	spin_lock_irqsave(&mad_queue->lock, flags);
+	list_del(&mad_list->list);
+	mad_queue->count--;
+	spin_unlock_irqrestore(&mad_queue->lock, flags);
+}
+
+static void snoop_send(struct ib_mad_qp_info *qp_info,
+		       struct ib_mad_send_buf *send_buf,
+		       struct ib_mad_send_wc *mad_send_wc,
+		       int mad_snoop_flags)
+{
+	struct ib_mad_snoop_private *mad_snoop_priv;
+	unsigned long flags;
+	int i;
+
+	spin_lock_irqsave(&qp_info->snoop_lock, flags);
+	for (i = 0; i < qp_info->snoop_table_size; i++) {
+		mad_snoop_priv = qp_info->snoop_table[i];
+		if (!mad_snoop_priv ||
+		    !(mad_snoop_priv->mad_snoop_flags & mad_snoop_flags))
+			continue;
+
+		atomic_inc(&mad_snoop_priv->refcount);
+		spin_unlock_irqrestore(&qp_info->snoop_lock, flags);
+		mad_snoop_priv->agent.snoop_handler(&mad_snoop_priv->agent,
+						    send_buf, mad_send_wc);
+		deref_snoop_agent(mad_snoop_priv);
+		spin_lock_irqsave(&qp_info->snoop_lock, flags);
+	}
+	spin_unlock_irqrestore(&qp_info->snoop_lock, flags);
+}
+
+static void snoop_recv(struct ib_mad_qp_info *qp_info,
+		       struct ib_mad_recv_wc *mad_recv_wc,
+		       int mad_snoop_flags)
+{
+	struct ib_mad_snoop_private *mad_snoop_priv;
+	unsigned long flags;
+	int i;
+
+	spin_lock_irqsave(&qp_info->snoop_lock, flags);
+	for (i = 0; i < qp_info->snoop_table_size; i++) {
+		mad_snoop_priv = qp_info->snoop_table[i];
+		if (!mad_snoop_priv ||
+		    !(mad_snoop_priv->mad_snoop_flags & mad_snoop_flags))
+			continue;
+
+		atomic_inc(&mad_snoop_priv->refcount);
+		spin_unlock_irqrestore(&qp_info->snoop_lock, flags);
+		mad_snoop_priv->agent.recv_handler(&mad_snoop_priv->agent,
+						   mad_recv_wc);
+		deref_snoop_agent(mad_snoop_priv);
+		spin_lock_irqsave(&qp_info->snoop_lock, flags);
+	}
+	spin_unlock_irqrestore(&qp_info->snoop_lock, flags);
+}
+
+static void build_smp_wc(struct ib_qp *qp,
+			 u64 wr_id, u16 slid, u16 pkey_index, u8 port_num,
+			 struct ib_wc *wc)
+{
+	memset(wc, 0, sizeof *wc);
+	wc->wr_id = wr_id;
+	wc->status = IB_WC_SUCCESS;
+	wc->opcode = IB_WC_RECV;
+	wc->pkey_index = pkey_index;
+	wc->byte_len = sizeof(struct ib_mad) + sizeof(struct ib_grh);
+	wc->src_qp = IB_QP0;
+	wc->qp = qp;
+	wc->slid = slid;
+	wc->sl = 0;
+	wc->dlid_path_bits = 0;
+	wc->port_num = port_num;
+}
+
+/*
+ * Return 0 if SMP is to be sent
+ * Return 1 if SMP was consumed locally (whether or not solicited)
+ * Return < 0 if error
+ */
+static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv,
+				  struct ib_mad_send_wr_private *mad_send_wr)
+{
+	int ret = 0;
+	struct ib_smp *smp = mad_send_wr->send_buf.mad;
+	unsigned long flags;
+	struct ib_mad_local_private *local;
+	struct ib_mad_private *mad_priv;
+	struct ib_mad_port_private *port_priv;
+	struct ib_mad_agent_private *recv_mad_agent = NULL;
+	struct ib_device *device = mad_agent_priv->agent.device;
+	u8 port_num;
+	struct ib_wc mad_wc;
+	struct ib_send_wr *send_wr = &mad_send_wr->send_wr;
+
+	if (device->node_type == RDMA_NODE_IB_SWITCH &&
+	    smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+		port_num = send_wr->wr.ud.port_num;
+	else
+		port_num = mad_agent_priv->agent.port_num;
+
+	/*
+	 * Directed route handling starts if the initial LID routed part of
+	 * a request or the ending LID routed part of a response is empty.
+	 * If we are at the start of the LID routed part, don't update the
+	 * hop_ptr or hop_cnt.  See section 14.2.2, Vol 1 IB spec.
+	 */
+	if ((ib_get_smp_direction(smp) ? smp->dr_dlid : smp->dr_slid) ==
+	     IB_LID_PERMISSIVE &&
+	     smi_handle_dr_smp_send(smp, device->node_type, port_num) ==
+	     IB_SMI_DISCARD) {
+		ret = -EINVAL;
+		dev_err(&device->dev, "Invalid directed route\n");
+		goto out;
+	}
+
+	/* Check to post send on QP or process locally */
+	if (smi_check_local_smp(smp, device) == IB_SMI_DISCARD &&
+	    smi_check_local_returning_smp(smp, device) == IB_SMI_DISCARD)
+		goto out;
+
+	local = kmalloc(sizeof *local, GFP_ATOMIC);
+	if (!local) {
+		ret = -ENOMEM;
+		dev_err(&device->dev, "No memory for ib_mad_local_private\n");
+		goto out;
+	}
+	local->mad_priv = NULL;
+	local->recv_mad_agent = NULL;
+	mad_priv = kmem_cache_alloc(ib_mad_cache, GFP_ATOMIC);
+	if (!mad_priv) {
+		ret = -ENOMEM;
+		dev_err(&device->dev, "No memory for local response MAD\n");
+		kfree(local);
+		goto out;
+	}
+
+	build_smp_wc(mad_agent_priv->agent.qp,
+		     send_wr->wr_id, be16_to_cpu(smp->dr_slid),
+		     send_wr->wr.ud.pkey_index,
+		     send_wr->wr.ud.port_num, &mad_wc);
+
+	/* No GRH for DR SMP */
+	ret = device->process_mad(device, 0, port_num, &mad_wc, NULL,
+				  (struct ib_mad *)smp,
+				  (struct ib_mad *)&mad_priv->mad);
+	switch (ret)
+	{
+	case IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY:
+		if (ib_response_mad(&mad_priv->mad.mad) &&
+		    mad_agent_priv->agent.recv_handler) {
+			local->mad_priv = mad_priv;
+			local->recv_mad_agent = mad_agent_priv;
+			/*
+			 * Reference MAD agent until receive
+			 * side of local completion handled
+			 */
+			atomic_inc(&mad_agent_priv->refcount);
+		} else
+			kmem_cache_free(ib_mad_cache, mad_priv);
+		break;
+	case IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED:
+		kmem_cache_free(ib_mad_cache, mad_priv);
+		break;
+	case IB_MAD_RESULT_SUCCESS:
+		/* Treat like an incoming receive MAD */
+		port_priv = ib_get_mad_port(mad_agent_priv->agent.device,
+					    mad_agent_priv->agent.port_num);
+		if (port_priv) {
+			memcpy(&mad_priv->mad.mad, smp, sizeof(struct ib_mad));
+			recv_mad_agent = find_mad_agent(port_priv,
+						        &mad_priv->mad.mad);
+		}
+		if (!port_priv || !recv_mad_agent) {
+			/*
+			 * No receiving agent so drop packet and
+			 * generate send completion.
+			 */
+			kmem_cache_free(ib_mad_cache, mad_priv);
+			break;
+		}
+		local->mad_priv = mad_priv;
+		local->recv_mad_agent = recv_mad_agent;
+		break;
+	default:
+		kmem_cache_free(ib_mad_cache, mad_priv);
+		kfree(local);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	local->mad_send_wr = mad_send_wr;
+	/* Reference MAD agent until send side of local completion handled */
+	atomic_inc(&mad_agent_priv->refcount);
+	/* Queue local completion to local list */
+	spin_lock_irqsave(&mad_agent_priv->lock, flags);
+	list_add_tail(&local->completion_list, &mad_agent_priv->local_list);
+	spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+	queue_work(mad_agent_priv->qp_info->port_priv->wq,
+		   &mad_agent_priv->local_work);
+	ret = 1;
+out:
+	return ret;
+}
+
+static int get_pad_size(int hdr_len, int data_len)
+{
+	int seg_size, pad;
+
+	seg_size = sizeof(struct ib_mad) - hdr_len;
+	if (data_len && seg_size) {
+		pad = seg_size - data_len % seg_size;
+		return pad == seg_size ? 0 : pad;
+	} else
+		return seg_size;
+}
+
+static void free_send_rmpp_list(struct ib_mad_send_wr_private *mad_send_wr)
+{
+	struct ib_rmpp_segment *s, *t;
+
+	list_for_each_entry_safe(s, t, &mad_send_wr->rmpp_list, list) {
+		list_del(&s->list);
+		kfree(s);
+	}
+}
+
+static int alloc_send_rmpp_list(struct ib_mad_send_wr_private *send_wr,
+				gfp_t gfp_mask)
+{
+	struct ib_mad_send_buf *send_buf = &send_wr->send_buf;
+	struct ib_rmpp_mad *rmpp_mad = send_buf->mad;
+	struct ib_rmpp_segment *seg = NULL;
+	int left, seg_size, pad;
+
+	send_buf->seg_size = sizeof (struct ib_mad) - send_buf->hdr_len;
+	seg_size = send_buf->seg_size;
+	pad = send_wr->pad;
+
+	/* Allocate data segments. */
+	for (left = send_buf->data_len + pad; left > 0; left -= seg_size) {
+		seg = kmalloc(sizeof (*seg) + seg_size, gfp_mask);
+		if (!seg) {
+			dev_err(&send_buf->mad_agent->device->dev,
+				"alloc_send_rmpp_segs: RMPP mem alloc failed for len %zd, gfp %#x\n",
+				sizeof (*seg) + seg_size, gfp_mask);
+			free_send_rmpp_list(send_wr);
+			return -ENOMEM;
+		}
+		seg->num = ++send_buf->seg_count;
+		list_add_tail(&seg->list, &send_wr->rmpp_list);
+	}
+
+	/* Zero any padding */
+	if (pad)
+		memset(seg->data + seg_size - pad, 0, pad);
+
+	rmpp_mad->rmpp_hdr.rmpp_version = send_wr->mad_agent_priv->
+					  agent.rmpp_version;
+	rmpp_mad->rmpp_hdr.rmpp_type = IB_MGMT_RMPP_TYPE_DATA;
+	ib_set_rmpp_flags(&rmpp_mad->rmpp_hdr, IB_MGMT_RMPP_FLAG_ACTIVE);
+
+	send_wr->cur_seg = container_of(send_wr->rmpp_list.next,
+					struct ib_rmpp_segment, list);
+	send_wr->last_ack_seg = send_wr->cur_seg;
+	return 0;
+}
+
+int ib_mad_kernel_rmpp_agent(struct ib_mad_agent *agent)
+{
+	return agent->rmpp_version && !(agent->flags & IB_MAD_USER_RMPP);
+}
+EXPORT_SYMBOL(ib_mad_kernel_rmpp_agent);
+
+struct ib_mad_send_buf * ib_create_send_mad(struct ib_mad_agent *mad_agent,
+					    u32 remote_qpn, u16 pkey_index,
+					    int rmpp_active,
+					    int hdr_len, int data_len,
+					    gfp_t gfp_mask)
+{
+	struct ib_mad_agent_private *mad_agent_priv;
+	struct ib_mad_send_wr_private *mad_send_wr;
+	int pad, message_size, ret, size;
+	void *buf;
+
+	mad_agent_priv = container_of(mad_agent, struct ib_mad_agent_private,
+				      agent);
+	pad = get_pad_size(hdr_len, data_len);
+	message_size = hdr_len + data_len + pad;
+
+	if (ib_mad_kernel_rmpp_agent(mad_agent)) {
+		if (!rmpp_active && message_size > sizeof(struct ib_mad))
+			return ERR_PTR(-EINVAL);
+	} else
+		if (rmpp_active || message_size > sizeof(struct ib_mad))
+			return ERR_PTR(-EINVAL);
+
+	size = rmpp_active ? hdr_len : sizeof(struct ib_mad);
+	buf = kzalloc(sizeof *mad_send_wr + size, gfp_mask);
+	if (!buf)
+		return ERR_PTR(-ENOMEM);
+
+	mad_send_wr = buf + size;
+	INIT_LIST_HEAD(&mad_send_wr->rmpp_list);
+	mad_send_wr->send_buf.mad = buf;
+	mad_send_wr->send_buf.hdr_len = hdr_len;
+	mad_send_wr->send_buf.data_len = data_len;
+	mad_send_wr->pad = pad;
+
+	mad_send_wr->mad_agent_priv = mad_agent_priv;
+	mad_send_wr->sg_list[0].length = hdr_len;
+	mad_send_wr->sg_list[0].lkey = mad_agent->mr->lkey;
+	mad_send_wr->sg_list[1].length = sizeof(struct ib_mad) - hdr_len;
+	mad_send_wr->sg_list[1].lkey = mad_agent->mr->lkey;
+
+	mad_send_wr->send_wr.wr_id = (unsigned long) mad_send_wr;
+	mad_send_wr->send_wr.sg_list = mad_send_wr->sg_list;
+	mad_send_wr->send_wr.num_sge = 2;
+	mad_send_wr->send_wr.opcode = IB_WR_SEND;
+	mad_send_wr->send_wr.send_flags = IB_SEND_SIGNALED;
+	mad_send_wr->send_wr.wr.ud.remote_qpn = remote_qpn;
+	mad_send_wr->send_wr.wr.ud.remote_qkey = IB_QP_SET_QKEY;
+	mad_send_wr->send_wr.wr.ud.pkey_index = pkey_index;
+
+	if (rmpp_active) {
+		ret = alloc_send_rmpp_list(mad_send_wr, gfp_mask);
+		if (ret) {
+			kfree(buf);
+			return ERR_PTR(ret);
+		}
+	}
+
+	mad_send_wr->send_buf.mad_agent = mad_agent;
+	atomic_inc(&mad_agent_priv->refcount);
+	return &mad_send_wr->send_buf;
+}
+EXPORT_SYMBOL(ib_create_send_mad);
+
+int ib_get_mad_data_offset(u8 mgmt_class)
+{
+	if (mgmt_class == IB_MGMT_CLASS_SUBN_ADM)
+		return IB_MGMT_SA_HDR;
+	else if ((mgmt_class == IB_MGMT_CLASS_DEVICE_MGMT) ||
+		 (mgmt_class == IB_MGMT_CLASS_DEVICE_ADM) ||
+		 (mgmt_class == IB_MGMT_CLASS_BIS))
+		return IB_MGMT_DEVICE_HDR;
+	else if ((mgmt_class >= IB_MGMT_CLASS_VENDOR_RANGE2_START) &&
+		 (mgmt_class <= IB_MGMT_CLASS_VENDOR_RANGE2_END))
+		return IB_MGMT_VENDOR_HDR;
+	else
+		return IB_MGMT_MAD_HDR;
+}
+EXPORT_SYMBOL(ib_get_mad_data_offset);
+
+int ib_is_mad_class_rmpp(u8 mgmt_class)
+{
+	if ((mgmt_class == IB_MGMT_CLASS_SUBN_ADM) ||
+	    (mgmt_class == IB_MGMT_CLASS_DEVICE_MGMT) ||
+	    (mgmt_class == IB_MGMT_CLASS_DEVICE_ADM) ||
+	    (mgmt_class == IB_MGMT_CLASS_BIS) ||
+	    ((mgmt_class >= IB_MGMT_CLASS_VENDOR_RANGE2_START) &&
+	     (mgmt_class <= IB_MGMT_CLASS_VENDOR_RANGE2_END)))
+		return 1;
+	return 0;
+}
+EXPORT_SYMBOL(ib_is_mad_class_rmpp);
+
+void *ib_get_rmpp_segment(struct ib_mad_send_buf *send_buf, int seg_num)
+{
+	struct ib_mad_send_wr_private *mad_send_wr;
+	struct list_head *list;
+
+	mad_send_wr = container_of(send_buf, struct ib_mad_send_wr_private,
+				   send_buf);
+	list = &mad_send_wr->cur_seg->list;
+
+	if (mad_send_wr->cur_seg->num < seg_num) {
+		list_for_each_entry(mad_send_wr->cur_seg, list, list)
+			if (mad_send_wr->cur_seg->num == seg_num)
+				break;
+	} else if (mad_send_wr->cur_seg->num > seg_num) {
+		list_for_each_entry_reverse(mad_send_wr->cur_seg, list, list)
+			if (mad_send_wr->cur_seg->num == seg_num)
+				break;
+	}
+	return mad_send_wr->cur_seg->data;
+}
+EXPORT_SYMBOL(ib_get_rmpp_segment);
+
+static inline void *ib_get_payload(struct ib_mad_send_wr_private *mad_send_wr)
+{
+	if (mad_send_wr->send_buf.seg_count)
+		return ib_get_rmpp_segment(&mad_send_wr->send_buf,
+					   mad_send_wr->seg_num);
+	else
+		return mad_send_wr->send_buf.mad +
+		       mad_send_wr->send_buf.hdr_len;
+}
+
+void ib_free_send_mad(struct ib_mad_send_buf *send_buf)
+{
+	struct ib_mad_agent_private *mad_agent_priv;
+	struct ib_mad_send_wr_private *mad_send_wr;
+
+	mad_agent_priv = container_of(send_buf->mad_agent,
+				      struct ib_mad_agent_private, agent);
+	mad_send_wr = container_of(send_buf, struct ib_mad_send_wr_private,
+				   send_buf);
+
+	free_send_rmpp_list(mad_send_wr);
+	kfree(send_buf->mad);
+	deref_mad_agent(mad_agent_priv);
+}
+EXPORT_SYMBOL(ib_free_send_mad);
+
+int ib_send_mad(struct ib_mad_send_wr_private *mad_send_wr)
+{
+	struct ib_mad_qp_info *qp_info;
+	struct list_head *list;
+	struct ib_send_wr *bad_send_wr;
+	struct ib_mad_agent *mad_agent;
+	struct ib_sge *sge;
+	unsigned long flags;
+	int ret;
+
+	/* Set WR ID to find mad_send_wr upon completion */
+	qp_info = mad_send_wr->mad_agent_priv->qp_info;
+	mad_send_wr->send_wr.wr_id = (unsigned long)&mad_send_wr->mad_list;
+	mad_send_wr->mad_list.mad_queue = &qp_info->send_queue;
+
+	mad_agent = mad_send_wr->send_buf.mad_agent;
+	sge = mad_send_wr->sg_list;
+	sge[0].addr = ib_dma_map_single(mad_agent->device,
+					mad_send_wr->send_buf.mad,
+					sge[0].length,
+					DMA_TO_DEVICE);
+	if (unlikely(ib_dma_mapping_error(mad_agent->device, sge[0].addr)))
+		return -ENOMEM;
+
+	mad_send_wr->header_mapping = sge[0].addr;
+
+	sge[1].addr = ib_dma_map_single(mad_agent->device,
+					ib_get_payload(mad_send_wr),
+					sge[1].length,
+					DMA_TO_DEVICE);
+	if (unlikely(ib_dma_mapping_error(mad_agent->device, sge[1].addr))) {
+		ib_dma_unmap_single(mad_agent->device,
+				    mad_send_wr->header_mapping,
+				    sge[0].length, DMA_TO_DEVICE);
+		return -ENOMEM;
+	}
+	mad_send_wr->payload_mapping = sge[1].addr;
+
+	spin_lock_irqsave(&qp_info->send_queue.lock, flags);
+	if (qp_info->send_queue.count < qp_info->send_queue.max_active) {
+		ret = ib_post_send(mad_agent->qp, &mad_send_wr->send_wr,
+				   &bad_send_wr);
+		list = &qp_info->send_queue.list;
+	} else {
+		ret = 0;
+		list = &qp_info->overflow_list;
+	}
+
+	if (!ret) {
+		qp_info->send_queue.count++;
+		list_add_tail(&mad_send_wr->mad_list.list, list);
+	}
+	spin_unlock_irqrestore(&qp_info->send_queue.lock, flags);
+	if (ret) {
+		ib_dma_unmap_single(mad_agent->device,
+				    mad_send_wr->header_mapping,
+				    sge[0].length, DMA_TO_DEVICE);
+		ib_dma_unmap_single(mad_agent->device,
+				    mad_send_wr->payload_mapping,
+				    sge[1].length, DMA_TO_DEVICE);
+	}
+	return ret;
+}
+
+/*
+ * ib_post_send_mad - Posts MAD(s) to the send queue of the QP associated
+ *  with the registered client
+ */
+int ib_post_send_mad(struct ib_mad_send_buf *send_buf,
+		     struct ib_mad_send_buf **bad_send_buf)
+{
+	struct ib_mad_agent_private *mad_agent_priv;
+	struct ib_mad_send_buf *next_send_buf;
+	struct ib_mad_send_wr_private *mad_send_wr;
+	unsigned long flags;
+	int ret = -EINVAL;
+
+	/* Walk list of send WRs and post each on send list */
+	for (; send_buf; send_buf = next_send_buf) {
+
+		mad_send_wr = container_of(send_buf,
+					   struct ib_mad_send_wr_private,
+					   send_buf);
+		mad_agent_priv = mad_send_wr->mad_agent_priv;
+
+		if (!send_buf->mad_agent->send_handler ||
+		    (send_buf->timeout_ms &&
+		     !send_buf->mad_agent->recv_handler)) {
+			ret = -EINVAL;
+			goto error;
+		}
+
+		if (!ib_is_mad_class_rmpp(((struct ib_mad_hdr *) send_buf->mad)->mgmt_class)) {
+			if (mad_agent_priv->agent.rmpp_version) {
+				ret = -EINVAL;
+				goto error;
+			}
+		}
+
+		/*
+		 * Save pointer to next work request to post in case the
+		 * current one completes, and the user modifies the work
+		 * request associated with the completion
+		 */
+		next_send_buf = send_buf->next;
+		mad_send_wr->send_wr.wr.ud.ah = send_buf->ah;
+
+		if (((struct ib_mad_hdr *) send_buf->mad)->mgmt_class ==
+		    IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
+			ret = handle_outgoing_dr_smp(mad_agent_priv,
+						     mad_send_wr);
+			if (ret < 0)		/* error */
+				goto error;
+			else if (ret == 1)	/* locally consumed */
+				continue;
+		}
+
+		mad_send_wr->tid = ((struct ib_mad_hdr *) send_buf->mad)->tid;
+		/* Timeout will be updated after send completes */
+		mad_send_wr->timeout = msecs_to_jiffies(send_buf->timeout_ms);
+		mad_send_wr->max_retries = send_buf->retries;
+		mad_send_wr->retries_left = send_buf->retries;
+		send_buf->retries = 0;
+		/* Reference for work request to QP + response */
+		mad_send_wr->refcount = 1 + (mad_send_wr->timeout > 0);
+		mad_send_wr->status = IB_WC_SUCCESS;
+
+		/* Reference MAD agent until send completes */
+		atomic_inc(&mad_agent_priv->refcount);
+		spin_lock_irqsave(&mad_agent_priv->lock, flags);
+		list_add_tail(&mad_send_wr->agent_list,
+			      &mad_agent_priv->send_list);
+		spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+
+		if (ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent)) {
+			ret = ib_send_rmpp_mad(mad_send_wr);
+			if (ret >= 0 && ret != IB_RMPP_RESULT_CONSUMED)
+				ret = ib_send_mad(mad_send_wr);
+		} else
+			ret = ib_send_mad(mad_send_wr);
+		if (ret < 0) {
+			/* Fail send request */
+			spin_lock_irqsave(&mad_agent_priv->lock, flags);
+			list_del(&mad_send_wr->agent_list);
+			spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+			atomic_dec(&mad_agent_priv->refcount);
+			goto error;
+		}
+	}
+	return 0;
+error:
+	if (bad_send_buf)
+		*bad_send_buf = send_buf;
+	return ret;
+}
+EXPORT_SYMBOL(ib_post_send_mad);
+
+/*
+ * ib_free_recv_mad - Returns data buffers used to receive
+ *  a MAD to the access layer
+ */
+void ib_free_recv_mad(struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct ib_mad_recv_buf *mad_recv_buf, *temp_recv_buf;
+	struct ib_mad_private_header *mad_priv_hdr;
+	struct ib_mad_private *priv;
+	struct list_head free_list;
+
+	INIT_LIST_HEAD(&free_list);
+	list_splice_init(&mad_recv_wc->rmpp_list, &free_list);
+
+	list_for_each_entry_safe(mad_recv_buf, temp_recv_buf,
+					&free_list, list) {
+		mad_recv_wc = container_of(mad_recv_buf, struct ib_mad_recv_wc,
+					   recv_buf);
+		mad_priv_hdr = container_of(mad_recv_wc,
+					    struct ib_mad_private_header,
+					    recv_wc);
+		priv = container_of(mad_priv_hdr, struct ib_mad_private,
+				    header);
+		kmem_cache_free(ib_mad_cache, priv);
+	}
+}
+EXPORT_SYMBOL(ib_free_recv_mad);
+
+struct ib_mad_agent *ib_redirect_mad_qp(struct ib_qp *qp,
+					u8 rmpp_version,
+					ib_mad_send_handler send_handler,
+					ib_mad_recv_handler recv_handler,
+					void *context)
+{
+	return ERR_PTR(-EINVAL);	/* XXX: for now */
+}
+EXPORT_SYMBOL(ib_redirect_mad_qp);
+
+int ib_process_mad_wc(struct ib_mad_agent *mad_agent,
+		      struct ib_wc *wc)
+{
+	dev_err(&mad_agent->device->dev,
+		"ib_process_mad_wc() not implemented yet\n");
+	return 0;
+}
+EXPORT_SYMBOL(ib_process_mad_wc);
+
+static int method_in_use(struct ib_mad_mgmt_method_table **method,
+			 struct ib_mad_reg_req *mad_reg_req)
+{
+	int i;
+
+	for_each_set_bit(i, mad_reg_req->method_mask, IB_MGMT_MAX_METHODS) {
+		if ((*method)->agent[i]) {
+			pr_err("Method %d already in use\n", i);
+			return -EINVAL;
+		}
+	}
+	return 0;
+}
+
+static int allocate_method_table(struct ib_mad_mgmt_method_table **method)
+{
+	/* Allocate management method table */
+	*method = kzalloc(sizeof **method, GFP_ATOMIC);
+	if (!*method) {
+		pr_err("No memory for ib_mad_mgmt_method_table\n");
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/*
+ * Check to see if there are any methods still in use
+ */
+static int check_method_table(struct ib_mad_mgmt_method_table *method)
+{
+	int i;
+
+	for (i = 0; i < IB_MGMT_MAX_METHODS; i++)
+		if (method->agent[i])
+			return 1;
+	return 0;
+}
+
+/*
+ * Check to see if there are any method tables for this class still in use
+ */
+static int check_class_table(struct ib_mad_mgmt_class_table *class)
+{
+	int i;
+
+	for (i = 0; i < MAX_MGMT_CLASS; i++)
+		if (class->method_table[i])
+			return 1;
+	return 0;
+}
+
+static int check_vendor_class(struct ib_mad_mgmt_vendor_class *vendor_class)
+{
+	int i;
+
+	for (i = 0; i < MAX_MGMT_OUI; i++)
+		if (vendor_class->method_table[i])
+			return 1;
+	return 0;
+}
+
+static int find_vendor_oui(struct ib_mad_mgmt_vendor_class *vendor_class,
+			   char *oui)
+{
+	int i;
+
+	for (i = 0; i < MAX_MGMT_OUI; i++)
+		/* Is there matching OUI for this vendor class ? */
+		if (!memcmp(vendor_class->oui[i], oui, 3))
+			return i;
+
+	return -1;
+}
+
+static int check_vendor_table(struct ib_mad_mgmt_vendor_class_table *vendor)
+{
+	int i;
+
+	for (i = 0; i < MAX_MGMT_VENDOR_RANGE2; i++)
+		if (vendor->vendor_class[i])
+			return 1;
+
+	return 0;
+}
+
+static void remove_methods_mad_agent(struct ib_mad_mgmt_method_table *method,
+				     struct ib_mad_agent_private *agent)
+{
+	int i;
+
+	/* Remove any methods for this mad agent */
+	for (i = 0; i < IB_MGMT_MAX_METHODS; i++) {
+		if (method->agent[i] == agent) {
+			method->agent[i] = NULL;
+		}
+	}
+}
+
+static int add_nonoui_reg_req(struct ib_mad_reg_req *mad_reg_req,
+			      struct ib_mad_agent_private *agent_priv,
+			      u8 mgmt_class)
+{
+	struct ib_mad_port_private *port_priv;
+	struct ib_mad_mgmt_class_table **class;
+	struct ib_mad_mgmt_method_table **method;
+	int i, ret;
+
+	port_priv = agent_priv->qp_info->port_priv;
+	class = &port_priv->version[mad_reg_req->mgmt_class_version].class;
+	if (!*class) {
+		/* Allocate management class table for "new" class version */
+		*class = kzalloc(sizeof **class, GFP_ATOMIC);
+		if (!*class) {
+			dev_err(&agent_priv->agent.device->dev,
+				"No memory for ib_mad_mgmt_class_table\n");
+			ret = -ENOMEM;
+			goto error1;
+		}
+
+		/* Allocate method table for this management class */
+		method = &(*class)->method_table[mgmt_class];
+		if ((ret = allocate_method_table(method)))
+			goto error2;
+	} else {
+		method = &(*class)->method_table[mgmt_class];
+		if (!*method) {
+			/* Allocate method table for this management class */
+			if ((ret = allocate_method_table(method)))
+				goto error1;
+		}
+	}
+
+	/* Now, make sure methods are not already in use */
+	if (method_in_use(method, mad_reg_req))
+		goto error3;
+
+	/* Finally, add in methods being registered */
+	for_each_set_bit(i, mad_reg_req->method_mask, IB_MGMT_MAX_METHODS)
+		(*method)->agent[i] = agent_priv;
+
+	return 0;
+
+error3:
+	/* Remove any methods for this mad agent */
+	remove_methods_mad_agent(*method, agent_priv);
+	/* Now, check to see if there are any methods in use */
+	if (!check_method_table(*method)) {
+		/* If not, release management method table */
+		kfree(*method);
+		*method = NULL;
+	}
+	ret = -EINVAL;
+	goto error1;
+error2:
+	kfree(*class);
+	*class = NULL;
+error1:
+	return ret;
+}
+
+static int add_oui_reg_req(struct ib_mad_reg_req *mad_reg_req,
+			   struct ib_mad_agent_private *agent_priv)
+{
+	struct ib_mad_port_private *port_priv;
+	struct ib_mad_mgmt_vendor_class_table **vendor_table;
+	struct ib_mad_mgmt_vendor_class_table *vendor = NULL;
+	struct ib_mad_mgmt_vendor_class *vendor_class = NULL;
+	struct ib_mad_mgmt_method_table **method;
+	int i, ret = -ENOMEM;
+	u8 vclass;
+
+	/* "New" vendor (with OUI) class */
+	vclass = vendor_class_index(mad_reg_req->mgmt_class);
+	port_priv = agent_priv->qp_info->port_priv;
+	vendor_table = &port_priv->version[
+				mad_reg_req->mgmt_class_version].vendor;
+	if (!*vendor_table) {
+		/* Allocate mgmt vendor class table for "new" class version */
+		vendor = kzalloc(sizeof *vendor, GFP_ATOMIC);
+		if (!vendor) {
+			dev_err(&agent_priv->agent.device->dev,
+				"No memory for ib_mad_mgmt_vendor_class_table\n");
+			goto error1;
+		}
+
+		*vendor_table = vendor;
+	}
+	if (!(*vendor_table)->vendor_class[vclass]) {
+		/* Allocate table for this management vendor class */
+		vendor_class = kzalloc(sizeof *vendor_class, GFP_ATOMIC);
+		if (!vendor_class) {
+			dev_err(&agent_priv->agent.device->dev,
+				"No memory for ib_mad_mgmt_vendor_class\n");
+			goto error2;
+		}
+
+		(*vendor_table)->vendor_class[vclass] = vendor_class;
+	}
+	for (i = 0; i < MAX_MGMT_OUI; i++) {
+		/* Is there matching OUI for this vendor class ? */
+		if (!memcmp((*vendor_table)->vendor_class[vclass]->oui[i],
+			    mad_reg_req->oui, 3)) {
+			method = &(*vendor_table)->vendor_class[
+						vclass]->method_table[i];
+			BUG_ON(!*method);
+			goto check_in_use;
+		}
+	}
+	for (i = 0; i < MAX_MGMT_OUI; i++) {
+		/* OUI slot available ? */
+		if (!is_vendor_oui((*vendor_table)->vendor_class[
+				vclass]->oui[i])) {
+			method = &(*vendor_table)->vendor_class[
+				vclass]->method_table[i];
+			BUG_ON(*method);
+			/* Allocate method table for this OUI */
+			if ((ret = allocate_method_table(method)))
+				goto error3;
+			memcpy((*vendor_table)->vendor_class[vclass]->oui[i],
+			       mad_reg_req->oui, 3);
+			goto check_in_use;
+		}
+	}
+	dev_err(&agent_priv->agent.device->dev, "All OUI slots in use\n");
+	goto error3;
+
+check_in_use:
+	/* Now, make sure methods are not already in use */
+	if (method_in_use(method, mad_reg_req))
+		goto error4;
+
+	/* Finally, add in methods being registered */
+	for_each_set_bit(i, mad_reg_req->method_mask, IB_MGMT_MAX_METHODS)
+		(*method)->agent[i] = agent_priv;
+
+	return 0;
+
+error4:
+	/* Remove any methods for this mad agent */
+	remove_methods_mad_agent(*method, agent_priv);
+	/* Now, check to see if there are any methods in use */
+	if (!check_method_table(*method)) {
+		/* If not, release management method table */
+		kfree(*method);
+		*method = NULL;
+	}
+	ret = -EINVAL;
+error3:
+	if (vendor_class) {
+		(*vendor_table)->vendor_class[vclass] = NULL;
+		kfree(vendor_class);
+	}
+error2:
+	if (vendor) {
+		*vendor_table = NULL;
+		kfree(vendor);
+	}
+error1:
+	return ret;
+}
+
+static void remove_mad_reg_req(struct ib_mad_agent_private *agent_priv)
+{
+	struct ib_mad_port_private *port_priv;
+	struct ib_mad_mgmt_class_table *class;
+	struct ib_mad_mgmt_method_table *method;
+	struct ib_mad_mgmt_vendor_class_table *vendor;
+	struct ib_mad_mgmt_vendor_class *vendor_class;
+	int index;
+	u8 mgmt_class;
+
+	/*
+	 * Was MAD registration request supplied
+	 * with original registration ?
+	 */
+	if (!agent_priv->reg_req) {
+		goto out;
+	}
+
+	port_priv = agent_priv->qp_info->port_priv;
+	mgmt_class = convert_mgmt_class(agent_priv->reg_req->mgmt_class);
+	class = port_priv->version[
+			agent_priv->reg_req->mgmt_class_version].class;
+	if (!class)
+		goto vendor_check;
+
+	method = class->method_table[mgmt_class];
+	if (method) {
+		/* Remove any methods for this mad agent */
+		remove_methods_mad_agent(method, agent_priv);
+		/* Now, check to see if there are any methods still in use */
+		if (!check_method_table(method)) {
+			/* If not, release management method table */
+			 kfree(method);
+			 class->method_table[mgmt_class] = NULL;
+			 /* Any management classes left ? */
+			if (!check_class_table(class)) {
+				/* If not, release management class table */
+				kfree(class);
+				port_priv->version[
+					agent_priv->reg_req->
+					mgmt_class_version].class = NULL;
+			}
+		}
+	}
+
+vendor_check:
+	if (!is_vendor_class(mgmt_class))
+		goto out;
+
+	/* normalize mgmt_class to vendor range 2 */
+	mgmt_class = vendor_class_index(agent_priv->reg_req->mgmt_class);
+	vendor = port_priv->version[
+			agent_priv->reg_req->mgmt_class_version].vendor;
+
+	if (!vendor)
+		goto out;
+
+	vendor_class = vendor->vendor_class[mgmt_class];
+	if (vendor_class) {
+		index = find_vendor_oui(vendor_class, agent_priv->reg_req->oui);
+		if (index < 0)
+			goto out;
+		method = vendor_class->method_table[index];
+		if (method) {
+			/* Remove any methods for this mad agent */
+			remove_methods_mad_agent(method, agent_priv);
+			/*
+			 * Now, check to see if there are
+			 * any methods still in use
+			 */
+			if (!check_method_table(method)) {
+				/* If not, release management method table */
+				kfree(method);
+				vendor_class->method_table[index] = NULL;
+				memset(vendor_class->oui[index], 0, 3);
+				/* Any OUIs left ? */
+				if (!check_vendor_class(vendor_class)) {
+					/* If not, release vendor class table */
+					kfree(vendor_class);
+					vendor->vendor_class[mgmt_class] = NULL;
+					/* Any other vendor classes left ? */
+					if (!check_vendor_table(vendor)) {
+						kfree(vendor);
+						port_priv->version[
+							agent_priv->reg_req->
+							mgmt_class_version].
+							vendor = NULL;
+					}
+				}
+			}
+		}
+	}
+
+out:
+	return;
+}
+
+static struct ib_mad_agent_private *
+find_mad_agent(struct ib_mad_port_private *port_priv,
+	       struct ib_mad *mad)
+{
+	struct ib_mad_agent_private *mad_agent = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&port_priv->reg_lock, flags);
+	if (ib_response_mad(mad)) {
+		u32 hi_tid;
+		struct ib_mad_agent_private *entry;
+
+		/*
+		 * Routing is based on high 32 bits of transaction ID
+		 * of MAD.
+		 */
+		hi_tid = be64_to_cpu(mad->mad_hdr.tid) >> 32;
+		list_for_each_entry(entry, &port_priv->agent_list, agent_list) {
+			if (entry->agent.hi_tid == hi_tid) {
+				mad_agent = entry;
+				break;
+			}
+		}
+	} else {
+		struct ib_mad_mgmt_class_table *class;
+		struct ib_mad_mgmt_method_table *method;
+		struct ib_mad_mgmt_vendor_class_table *vendor;
+		struct ib_mad_mgmt_vendor_class *vendor_class;
+		struct ib_vendor_mad *vendor_mad;
+		int index;
+
+		/*
+		 * Routing is based on version, class, and method
+		 * For "newer" vendor MADs, also based on OUI
+		 */
+		if (mad->mad_hdr.class_version >= MAX_MGMT_VERSION)
+			goto out;
+		if (!is_vendor_class(mad->mad_hdr.mgmt_class)) {
+			class = port_priv->version[
+					mad->mad_hdr.class_version].class;
+			if (!class)
+				goto out;
+			if (convert_mgmt_class(mad->mad_hdr.mgmt_class) >=
+			    IB_MGMT_MAX_METHODS)
+				goto out;
+			method = class->method_table[convert_mgmt_class(
+							mad->mad_hdr.mgmt_class)];
+			if (method)
+				mad_agent = method->agent[mad->mad_hdr.method &
+							  ~IB_MGMT_METHOD_RESP];
+		} else {
+			vendor = port_priv->version[
+					mad->mad_hdr.class_version].vendor;
+			if (!vendor)
+				goto out;
+			vendor_class = vendor->vendor_class[vendor_class_index(
+						mad->mad_hdr.mgmt_class)];
+			if (!vendor_class)
+				goto out;
+			/* Find matching OUI */
+			vendor_mad = (struct ib_vendor_mad *)mad;
+			index = find_vendor_oui(vendor_class, vendor_mad->oui);
+			if (index == -1)
+				goto out;
+			method = vendor_class->method_table[index];
+			if (method) {
+				mad_agent = method->agent[mad->mad_hdr.method &
+							  ~IB_MGMT_METHOD_RESP];
+			}
+		}
+	}
+
+	if (mad_agent) {
+		if (mad_agent->agent.recv_handler)
+			atomic_inc(&mad_agent->refcount);
+		else {
+			dev_notice(&port_priv->device->dev,
+				   "No receive handler for client %p on port %d\n",
+				   &mad_agent->agent, port_priv->port_num);
+			mad_agent = NULL;
+		}
+	}
+out:
+	spin_unlock_irqrestore(&port_priv->reg_lock, flags);
+
+	return mad_agent;
+}
+
+static int validate_mad(struct ib_mad *mad, u32 qp_num)
+{
+	int valid = 0;
+
+	/* Make sure MAD base version is understood */
+	if (mad->mad_hdr.base_version != IB_MGMT_BASE_VERSION) {
+		pr_err("MAD received with unsupported base version %d\n",
+			mad->mad_hdr.base_version);
+		goto out;
+	}
+
+	/* Filter SMI packets sent to other than QP0 */
+	if ((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED) ||
+	    (mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) {
+		if (qp_num == 0)
+			valid = 1;
+	} else {
+		/* Filter GSI packets sent to QP0 */
+		if (qp_num != 0)
+			valid = 1;
+	}
+
+out:
+	return valid;
+}
+
+static int is_data_mad(struct ib_mad_agent_private *mad_agent_priv,
+		       struct ib_mad_hdr *mad_hdr)
+{
+	struct ib_rmpp_mad *rmpp_mad;
+
+	rmpp_mad = (struct ib_rmpp_mad *)mad_hdr;
+	return !mad_agent_priv->agent.rmpp_version ||
+		!ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent) ||
+		!(ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) &
+				    IB_MGMT_RMPP_FLAG_ACTIVE) ||
+		(rmpp_mad->rmpp_hdr.rmpp_type == IB_MGMT_RMPP_TYPE_DATA);
+}
+
+static inline int rcv_has_same_class(struct ib_mad_send_wr_private *wr,
+				     struct ib_mad_recv_wc *rwc)
+{
+	return ((struct ib_mad *)(wr->send_buf.mad))->mad_hdr.mgmt_class ==
+		rwc->recv_buf.mad->mad_hdr.mgmt_class;
+}
+
+static inline int rcv_has_same_gid(struct ib_mad_agent_private *mad_agent_priv,
+				   struct ib_mad_send_wr_private *wr,
+				   struct ib_mad_recv_wc *rwc )
+{
+	struct ib_ah_attr attr;
+	u8 send_resp, rcv_resp;
+	union ib_gid sgid;
+	struct ib_device *device = mad_agent_priv->agent.device;
+	u8 port_num = mad_agent_priv->agent.port_num;
+	u8 lmc;
+
+	send_resp = ib_response_mad((struct ib_mad *)wr->send_buf.mad);
+	rcv_resp = ib_response_mad(rwc->recv_buf.mad);
+
+	if (send_resp == rcv_resp)
+		/* both requests, or both responses. GIDs different */
+		return 0;
+
+	if (ib_query_ah(wr->send_buf.ah, &attr))
+		/* Assume not equal, to avoid false positives. */
+		return 0;
+
+	if (!!(attr.ah_flags & IB_AH_GRH) !=
+	    !!(rwc->wc->wc_flags & IB_WC_GRH))
+		/* one has GID, other does not.  Assume different */
+		return 0;
+
+	if (!send_resp && rcv_resp) {
+		/* is request/response. */
+		if (!(attr.ah_flags & IB_AH_GRH)) {
+			if (ib_get_cached_lmc(device, port_num, &lmc))
+				return 0;
+			return (!lmc || !((attr.src_path_bits ^
+					   rwc->wc->dlid_path_bits) &
+					  ((1 << lmc) - 1)));
+		} else {
+			if (ib_get_cached_gid(device, port_num,
+					      attr.grh.sgid_index, &sgid))
+				return 0;
+			return !memcmp(sgid.raw, rwc->recv_buf.grh->dgid.raw,
+				       16);
+		}
+	}
+
+	if (!(attr.ah_flags & IB_AH_GRH))
+		return attr.dlid == rwc->wc->slid;
+	else
+		return !memcmp(attr.grh.dgid.raw, rwc->recv_buf.grh->sgid.raw,
+			       16);
+}
+
+static inline int is_direct(u8 class)
+{
+	return (class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE);
+}
+
+struct ib_mad_send_wr_private*
+ib_find_send_mad(struct ib_mad_agent_private *mad_agent_priv,
+		 struct ib_mad_recv_wc *wc)
+{
+	struct ib_mad_send_wr_private *wr;
+	struct ib_mad *mad;
+
+	mad = (struct ib_mad *)wc->recv_buf.mad;
+
+	list_for_each_entry(wr, &mad_agent_priv->wait_list, agent_list) {
+		if ((wr->tid == mad->mad_hdr.tid) &&
+		    rcv_has_same_class(wr, wc) &&
+		    /*
+		     * Don't check GID for direct routed MADs.
+		     * These might have permissive LIDs.
+		     */
+		    (is_direct(wc->recv_buf.mad->mad_hdr.mgmt_class) ||
+		     rcv_has_same_gid(mad_agent_priv, wr, wc)))
+			return (wr->status == IB_WC_SUCCESS) ? wr : NULL;
+	}
+
+	/*
+	 * It's possible to receive the response before we've
+	 * been notified that the send has completed
+	 */
+	list_for_each_entry(wr, &mad_agent_priv->send_list, agent_list) {
+		if (is_data_mad(mad_agent_priv, wr->send_buf.mad) &&
+		    wr->tid == mad->mad_hdr.tid &&
+		    wr->timeout &&
+		    rcv_has_same_class(wr, wc) &&
+		    /*
+		     * Don't check GID for direct routed MADs.
+		     * These might have permissive LIDs.
+		     */
+		    (is_direct(wc->recv_buf.mad->mad_hdr.mgmt_class) ||
+		     rcv_has_same_gid(mad_agent_priv, wr, wc)))
+			/* Verify request has not been canceled */
+			return (wr->status == IB_WC_SUCCESS) ? wr : NULL;
+	}
+	return NULL;
+}
+
+void ib_mark_mad_done(struct ib_mad_send_wr_private *mad_send_wr)
+{
+	mad_send_wr->timeout = 0;
+	if (mad_send_wr->refcount == 1)
+		list_move_tail(&mad_send_wr->agent_list,
+			      &mad_send_wr->mad_agent_priv->done_list);
+}
+
+static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv,
+				 struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct ib_mad_send_wr_private *mad_send_wr;
+	struct ib_mad_send_wc mad_send_wc;
+	unsigned long flags;
+
+	INIT_LIST_HEAD(&mad_recv_wc->rmpp_list);
+	list_add(&mad_recv_wc->recv_buf.list, &mad_recv_wc->rmpp_list);
+	if (ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent)) {
+		mad_recv_wc = ib_process_rmpp_recv_wc(mad_agent_priv,
+						      mad_recv_wc);
+		if (!mad_recv_wc) {
+			deref_mad_agent(mad_agent_priv);
+			return;
+		}
+	}
+
+	/* Complete corresponding request */
+	if (ib_response_mad(mad_recv_wc->recv_buf.mad)) {
+		spin_lock_irqsave(&mad_agent_priv->lock, flags);
+		mad_send_wr = ib_find_send_mad(mad_agent_priv, mad_recv_wc);
+		if (!mad_send_wr) {
+			spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+			if (!ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent)
+			   && ib_is_mad_class_rmpp(mad_recv_wc->recv_buf.mad->mad_hdr.mgmt_class)
+			   && (ib_get_rmpp_flags(&((struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad)->rmpp_hdr)
+					& IB_MGMT_RMPP_FLAG_ACTIVE)) {
+				/* user rmpp is in effect
+				 * and this is an active RMPP MAD
+				 */
+				mad_recv_wc->wc->wr_id = 0;
+				mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent,
+								   mad_recv_wc);
+				atomic_dec(&mad_agent_priv->refcount);
+			} else {
+				/* not user rmpp, revert to normal behavior and
+				 * drop the mad */
+				ib_free_recv_mad(mad_recv_wc);
+				deref_mad_agent(mad_agent_priv);
+				return;
+			}
+		} else {
+			ib_mark_mad_done(mad_send_wr);
+			spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+
+			/* Defined behavior is to complete response before request */
+			mad_recv_wc->wc->wr_id = (unsigned long) &mad_send_wr->send_buf;
+			mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent,
+							   mad_recv_wc);
+			atomic_dec(&mad_agent_priv->refcount);
+
+			mad_send_wc.status = IB_WC_SUCCESS;
+			mad_send_wc.vendor_err = 0;
+			mad_send_wc.send_buf = &mad_send_wr->send_buf;
+			ib_mad_complete_send_wr(mad_send_wr, &mad_send_wc);
+		}
+	} else {
+		mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent,
+						   mad_recv_wc);
+		deref_mad_agent(mad_agent_priv);
+	}
+}
+
+static bool generate_unmatched_resp(struct ib_mad_private *recv,
+				    struct ib_mad_private *response)
+{
+	if (recv->mad.mad.mad_hdr.method == IB_MGMT_METHOD_GET ||
+	    recv->mad.mad.mad_hdr.method == IB_MGMT_METHOD_SET) {
+		memcpy(response, recv, sizeof *response);
+		response->header.recv_wc.wc = &response->header.wc;
+		response->header.recv_wc.recv_buf.mad = &response->mad.mad;
+		response->header.recv_wc.recv_buf.grh = &response->grh;
+		response->mad.mad.mad_hdr.method = IB_MGMT_METHOD_GET_RESP;
+		response->mad.mad.mad_hdr.status =
+			cpu_to_be16(IB_MGMT_MAD_STATUS_UNSUPPORTED_METHOD_ATTRIB);
+		if (recv->mad.mad.mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+			response->mad.mad.mad_hdr.status |= IB_SMP_DIRECTION;
+
+		return true;
+	} else {
+		return false;
+	}
+}
+static void ib_mad_recv_done_handler(struct ib_mad_port_private *port_priv,
+				     struct ib_wc *wc)
+{
+	struct ib_mad_qp_info *qp_info;
+	struct ib_mad_private_header *mad_priv_hdr;
+	struct ib_mad_private *recv, *response = NULL;
+	struct ib_mad_list_head *mad_list;
+	struct ib_mad_agent_private *mad_agent;
+	int port_num;
+	int ret = IB_MAD_RESULT_SUCCESS;
+
+	mad_list = (struct ib_mad_list_head *)(unsigned long)wc->wr_id;
+	qp_info = mad_list->mad_queue->qp_info;
+	dequeue_mad(mad_list);
+
+	mad_priv_hdr = container_of(mad_list, struct ib_mad_private_header,
+				    mad_list);
+	recv = container_of(mad_priv_hdr, struct ib_mad_private, header);
+	ib_dma_unmap_single(port_priv->device,
+			    recv->header.mapping,
+			    sizeof(struct ib_mad_private) -
+			      sizeof(struct ib_mad_private_header),
+			    DMA_FROM_DEVICE);
+
+	/* Setup MAD receive work completion from "normal" work completion */
+	recv->header.wc = *wc;
+	recv->header.recv_wc.wc = &recv->header.wc;
+	recv->header.recv_wc.mad_len = sizeof(struct ib_mad);
+	recv->header.recv_wc.recv_buf.mad = &recv->mad.mad;
+	recv->header.recv_wc.recv_buf.grh = &recv->grh;
+
+	if (atomic_read(&qp_info->snoop_count))
+		snoop_recv(qp_info, &recv->header.recv_wc, IB_MAD_SNOOP_RECVS);
+
+	/* Validate MAD */
+	if (!validate_mad(&recv->mad.mad, qp_info->qp->qp_num))
+		goto out;
+
+	response = kmem_cache_alloc(ib_mad_cache, GFP_KERNEL);
+	if (!response) {
+		dev_err(&port_priv->device->dev,
+			"ib_mad_recv_done_handler no memory for response buffer\n");
+		goto out;
+	}
+
+	if (port_priv->device->node_type == RDMA_NODE_IB_SWITCH)
+		port_num = wc->port_num;
+	else
+		port_num = port_priv->port_num;
+
+	if (recv->mad.mad.mad_hdr.mgmt_class ==
+	    IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
+		enum smi_forward_action retsmi;
+
+		if (smi_handle_dr_smp_recv(&recv->mad.smp,
+					   port_priv->device->node_type,
+					   port_num,
+					   port_priv->device->phys_port_cnt) ==
+					   IB_SMI_DISCARD)
+			goto out;
+
+		retsmi = smi_check_forward_dr_smp(&recv->mad.smp);
+		if (retsmi == IB_SMI_LOCAL)
+			goto local;
+
+		if (retsmi == IB_SMI_SEND) { /* don't forward */
+			if (smi_handle_dr_smp_send(&recv->mad.smp,
+						   port_priv->device->node_type,
+						   port_num) == IB_SMI_DISCARD)
+				goto out;
+
+			if (smi_check_local_smp(&recv->mad.smp, port_priv->device) == IB_SMI_DISCARD)
+				goto out;
+		} else if (port_priv->device->node_type == RDMA_NODE_IB_SWITCH) {
+			/* forward case for switches */
+			memcpy(response, recv, sizeof(*response));
+			response->header.recv_wc.wc = &response->header.wc;
+			response->header.recv_wc.recv_buf.mad = &response->mad.mad;
+			response->header.recv_wc.recv_buf.grh = &response->grh;
+
+			agent_send_response(&response->mad.mad,
+					    &response->grh, wc,
+					    port_priv->device,
+					    smi_get_fwd_port(&recv->mad.smp),
+					    qp_info->qp->qp_num);
+
+			goto out;
+		}
+	}
+
+local:
+	/* Give driver "right of first refusal" on incoming MAD */
+	if (port_priv->device->process_mad) {
+		ret = port_priv->device->process_mad(port_priv->device, 0,
+						     port_priv->port_num,
+						     wc, &recv->grh,
+						     &recv->mad.mad,
+						     &response->mad.mad);
+		if (ret & IB_MAD_RESULT_SUCCESS) {
+			if (ret & IB_MAD_RESULT_CONSUMED)
+				goto out;
+			if (ret & IB_MAD_RESULT_REPLY) {
+				agent_send_response(&response->mad.mad,
+						    &recv->grh, wc,
+						    port_priv->device,
+						    port_num,
+						    qp_info->qp->qp_num);
+				goto out;
+			}
+		}
+	}
+
+	mad_agent = find_mad_agent(port_priv, &recv->mad.mad);
+	if (mad_agent) {
+		ib_mad_complete_recv(mad_agent, &recv->header.recv_wc);
+		/*
+		 * recv is freed up in error cases in ib_mad_complete_recv
+		 * or via recv_handler in ib_mad_complete_recv()
+		 */
+		recv = NULL;
+	} else if ((ret & IB_MAD_RESULT_SUCCESS) &&
+		   generate_unmatched_resp(recv, response)) {
+		agent_send_response(&response->mad.mad, &recv->grh, wc,
+				    port_priv->device, port_num, qp_info->qp->qp_num);
+	}
+
+out:
+	/* Post another receive request for this QP */
+	if (response) {
+		ib_mad_post_receive_mads(qp_info, response);
+		if (recv)
+			kmem_cache_free(ib_mad_cache, recv);
+	} else
+		ib_mad_post_receive_mads(qp_info, recv);
+}
+
+static void adjust_timeout(struct ib_mad_agent_private *mad_agent_priv)
+{
+	struct ib_mad_send_wr_private *mad_send_wr;
+	unsigned long delay;
+
+	if (list_empty(&mad_agent_priv->wait_list)) {
+		cancel_delayed_work(&mad_agent_priv->timed_work);
+	} else {
+		mad_send_wr = list_entry(mad_agent_priv->wait_list.next,
+					 struct ib_mad_send_wr_private,
+					 agent_list);
+
+		if (time_after(mad_agent_priv->timeout,
+			       mad_send_wr->timeout)) {
+			mad_agent_priv->timeout = mad_send_wr->timeout;
+			delay = mad_send_wr->timeout - jiffies;
+			if ((long)delay <= 0)
+				delay = 1;
+			mod_delayed_work(mad_agent_priv->qp_info->port_priv->wq,
+					 &mad_agent_priv->timed_work, delay);
+		}
+	}
+}
+
+static void wait_for_response(struct ib_mad_send_wr_private *mad_send_wr)
+{
+	struct ib_mad_agent_private *mad_agent_priv;
+	struct ib_mad_send_wr_private *temp_mad_send_wr;
+	struct list_head *list_item;
+	unsigned long delay;
+
+	mad_agent_priv = mad_send_wr->mad_agent_priv;
+	list_del(&mad_send_wr->agent_list);
+
+	delay = mad_send_wr->timeout;
+	mad_send_wr->timeout += jiffies;
+
+	if (delay) {
+		list_for_each_prev(list_item, &mad_agent_priv->wait_list) {
+			temp_mad_send_wr = list_entry(list_item,
+						struct ib_mad_send_wr_private,
+						agent_list);
+			if (time_after(mad_send_wr->timeout,
+				       temp_mad_send_wr->timeout))
+				break;
+		}
+	}
+	else
+		list_item = &mad_agent_priv->wait_list;
+	list_add(&mad_send_wr->agent_list, list_item);
+
+	/* Reschedule a work item if we have a shorter timeout */
+	if (mad_agent_priv->wait_list.next == &mad_send_wr->agent_list)
+		mod_delayed_work(mad_agent_priv->qp_info->port_priv->wq,
+				 &mad_agent_priv->timed_work, delay);
+}
+
+void ib_reset_mad_timeout(struct ib_mad_send_wr_private *mad_send_wr,
+			  int timeout_ms)
+{
+	mad_send_wr->timeout = msecs_to_jiffies(timeout_ms);
+	wait_for_response(mad_send_wr);
+}
+
+/*
+ * Process a send work completion
+ */
+void ib_mad_complete_send_wr(struct ib_mad_send_wr_private *mad_send_wr,
+			     struct ib_mad_send_wc *mad_send_wc)
+{
+	struct ib_mad_agent_private	*mad_agent_priv;
+	unsigned long			flags;
+	int				ret;
+
+	mad_agent_priv = mad_send_wr->mad_agent_priv;
+	spin_lock_irqsave(&mad_agent_priv->lock, flags);
+	if (ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent)) {
+		ret = ib_process_rmpp_send_wc(mad_send_wr, mad_send_wc);
+		if (ret == IB_RMPP_RESULT_CONSUMED)
+			goto done;
+	} else
+		ret = IB_RMPP_RESULT_UNHANDLED;
+
+	if (mad_send_wc->status != IB_WC_SUCCESS &&
+	    mad_send_wr->status == IB_WC_SUCCESS) {
+		mad_send_wr->status = mad_send_wc->status;
+		mad_send_wr->refcount -= (mad_send_wr->timeout > 0);
+	}
+
+	if (--mad_send_wr->refcount > 0) {
+		if (mad_send_wr->refcount == 1 && mad_send_wr->timeout &&
+		    mad_send_wr->status == IB_WC_SUCCESS) {
+			wait_for_response(mad_send_wr);
+		}
+		goto done;
+	}
+
+	/* Remove send from MAD agent and notify client of completion */
+	list_del(&mad_send_wr->agent_list);
+	adjust_timeout(mad_agent_priv);
+	spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+
+	if (mad_send_wr->status != IB_WC_SUCCESS )
+		mad_send_wc->status = mad_send_wr->status;
+	if (ret == IB_RMPP_RESULT_INTERNAL)
+		ib_rmpp_send_handler(mad_send_wc);
+	else
+		mad_agent_priv->agent.send_handler(&mad_agent_priv->agent,
+						   mad_send_wc);
+
+	/* Release reference on agent taken when sending */
+	deref_mad_agent(mad_agent_priv);
+	return;
+done:
+	spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+}
+
+static void ib_mad_send_done_handler(struct ib_mad_port_private *port_priv,
+				     struct ib_wc *wc)
+{
+	struct ib_mad_send_wr_private	*mad_send_wr, *queued_send_wr;
+	struct ib_mad_list_head		*mad_list;
+	struct ib_mad_qp_info		*qp_info;
+	struct ib_mad_queue		*send_queue;
+	struct ib_send_wr		*bad_send_wr;
+	struct ib_mad_send_wc		mad_send_wc;
+	unsigned long flags;
+	int ret;
+
+	mad_list = (struct ib_mad_list_head *)(unsigned long)wc->wr_id;
+	mad_send_wr = container_of(mad_list, struct ib_mad_send_wr_private,
+				   mad_list);
+	send_queue = mad_list->mad_queue;
+	qp_info = send_queue->qp_info;
+
+retry:
+	ib_dma_unmap_single(mad_send_wr->send_buf.mad_agent->device,
+			    mad_send_wr->header_mapping,
+			    mad_send_wr->sg_list[0].length, DMA_TO_DEVICE);
+	ib_dma_unmap_single(mad_send_wr->send_buf.mad_agent->device,
+			    mad_send_wr->payload_mapping,
+			    mad_send_wr->sg_list[1].length, DMA_TO_DEVICE);
+	queued_send_wr = NULL;
+	spin_lock_irqsave(&send_queue->lock, flags);
+	list_del(&mad_list->list);
+
+	/* Move queued send to the send queue */
+	if (send_queue->count-- > send_queue->max_active) {
+		mad_list = container_of(qp_info->overflow_list.next,
+					struct ib_mad_list_head, list);
+		queued_send_wr = container_of(mad_list,
+					struct ib_mad_send_wr_private,
+					mad_list);
+		list_move_tail(&mad_list->list, &send_queue->list);
+	}
+	spin_unlock_irqrestore(&send_queue->lock, flags);
+
+	mad_send_wc.send_buf = &mad_send_wr->send_buf;
+	mad_send_wc.status = wc->status;
+	mad_send_wc.vendor_err = wc->vendor_err;
+	if (atomic_read(&qp_info->snoop_count))
+		snoop_send(qp_info, &mad_send_wr->send_buf, &mad_send_wc,
+			   IB_MAD_SNOOP_SEND_COMPLETIONS);
+	ib_mad_complete_send_wr(mad_send_wr, &mad_send_wc);
+
+	if (queued_send_wr) {
+		ret = ib_post_send(qp_info->qp, &queued_send_wr->send_wr,
+				   &bad_send_wr);
+		if (ret) {
+			dev_err(&port_priv->device->dev,
+				"ib_post_send failed: %d\n", ret);
+			mad_send_wr = queued_send_wr;
+			wc->status = IB_WC_LOC_QP_OP_ERR;
+			goto retry;
+		}
+	}
+}
+
+static void mark_sends_for_retry(struct ib_mad_qp_info *qp_info)
+{
+	struct ib_mad_send_wr_private *mad_send_wr;
+	struct ib_mad_list_head *mad_list;
+	unsigned long flags;
+
+	spin_lock_irqsave(&qp_info->send_queue.lock, flags);
+	list_for_each_entry(mad_list, &qp_info->send_queue.list, list) {
+		mad_send_wr = container_of(mad_list,
+					   struct ib_mad_send_wr_private,
+					   mad_list);
+		mad_send_wr->retry = 1;
+	}
+	spin_unlock_irqrestore(&qp_info->send_queue.lock, flags);
+}
+
+static void mad_error_handler(struct ib_mad_port_private *port_priv,
+			      struct ib_wc *wc)
+{
+	struct ib_mad_list_head *mad_list;
+	struct ib_mad_qp_info *qp_info;
+	struct ib_mad_send_wr_private *mad_send_wr;
+	int ret;
+
+	/* Determine if failure was a send or receive */
+	mad_list = (struct ib_mad_list_head *)(unsigned long)wc->wr_id;
+	qp_info = mad_list->mad_queue->qp_info;
+	if (mad_list->mad_queue == &qp_info->recv_queue)
+		/*
+		 * Receive errors indicate that the QP has entered the error
+		 * state - error handling/shutdown code will cleanup
+		 */
+		return;
+
+	/*
+	 * Send errors will transition the QP to SQE - move
+	 * QP to RTS and repost flushed work requests
+	 */
+	mad_send_wr = container_of(mad_list, struct ib_mad_send_wr_private,
+				   mad_list);
+	if (wc->status == IB_WC_WR_FLUSH_ERR) {
+		if (mad_send_wr->retry) {
+			/* Repost send */
+			struct ib_send_wr *bad_send_wr;
+
+			mad_send_wr->retry = 0;
+			ret = ib_post_send(qp_info->qp, &mad_send_wr->send_wr,
+					&bad_send_wr);
+			if (ret)
+				ib_mad_send_done_handler(port_priv, wc);
+		} else
+			ib_mad_send_done_handler(port_priv, wc);
+	} else {
+		struct ib_qp_attr *attr;
+
+		/* Transition QP to RTS and fail offending send */
+		attr = kmalloc(sizeof *attr, GFP_KERNEL);
+		if (attr) {
+			attr->qp_state = IB_QPS_RTS;
+			attr->cur_qp_state = IB_QPS_SQE;
+			ret = ib_modify_qp(qp_info->qp, attr,
+					   IB_QP_STATE | IB_QP_CUR_STATE);
+			kfree(attr);
+			if (ret)
+				dev_err(&port_priv->device->dev,
+					"mad_error_handler - ib_modify_qp to RTS : %d\n",
+					ret);
+			else
+				mark_sends_for_retry(qp_info);
+		}
+		ib_mad_send_done_handler(port_priv, wc);
+	}
+}
+
+/*
+ * IB MAD completion callback
+ */
+static void ib_mad_completion_handler(struct work_struct *work)
+{
+	struct ib_mad_port_private *port_priv;
+	struct ib_wc wc;
+
+	port_priv = container_of(work, struct ib_mad_port_private, work);
+	ib_req_notify_cq(port_priv->cq, IB_CQ_NEXT_COMP);
+
+	while (ib_poll_cq(port_priv->cq, 1, &wc) == 1) {
+		if (wc.status == IB_WC_SUCCESS) {
+			switch (wc.opcode) {
+			case IB_WC_SEND:
+				ib_mad_send_done_handler(port_priv, &wc);
+				break;
+			case IB_WC_RECV:
+				ib_mad_recv_done_handler(port_priv, &wc);
+				break;
+			default:
+				BUG_ON(1);
+				break;
+			}
+		} else
+			mad_error_handler(port_priv, &wc);
+	}
+}
+
+static void cancel_mads(struct ib_mad_agent_private *mad_agent_priv)
+{
+	unsigned long flags;
+	struct ib_mad_send_wr_private *mad_send_wr, *temp_mad_send_wr;
+	struct ib_mad_send_wc mad_send_wc;
+	struct list_head cancel_list;
+
+	INIT_LIST_HEAD(&cancel_list);
+
+	spin_lock_irqsave(&mad_agent_priv->lock, flags);
+	list_for_each_entry_safe(mad_send_wr, temp_mad_send_wr,
+				 &mad_agent_priv->send_list, agent_list) {
+		if (mad_send_wr->status == IB_WC_SUCCESS) {
+			mad_send_wr->status = IB_WC_WR_FLUSH_ERR;
+			mad_send_wr->refcount -= (mad_send_wr->timeout > 0);
+		}
+	}
+
+	/* Empty wait list to prevent receives from finding a request */
+	list_splice_init(&mad_agent_priv->wait_list, &cancel_list);
+	spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+
+	/* Report all cancelled requests */
+	mad_send_wc.status = IB_WC_WR_FLUSH_ERR;
+	mad_send_wc.vendor_err = 0;
+
+	list_for_each_entry_safe(mad_send_wr, temp_mad_send_wr,
+				 &cancel_list, agent_list) {
+		mad_send_wc.send_buf = &mad_send_wr->send_buf;
+		list_del(&mad_send_wr->agent_list);
+		mad_agent_priv->agent.send_handler(&mad_agent_priv->agent,
+						   &mad_send_wc);
+		atomic_dec(&mad_agent_priv->refcount);
+	}
+}
+
+static struct ib_mad_send_wr_private*
+find_send_wr(struct ib_mad_agent_private *mad_agent_priv,
+	     struct ib_mad_send_buf *send_buf)
+{
+	struct ib_mad_send_wr_private *mad_send_wr;
+
+	list_for_each_entry(mad_send_wr, &mad_agent_priv->wait_list,
+			    agent_list) {
+		if (&mad_send_wr->send_buf == send_buf)
+			return mad_send_wr;
+	}
+
+	list_for_each_entry(mad_send_wr, &mad_agent_priv->send_list,
+			    agent_list) {
+		if (is_data_mad(mad_agent_priv, mad_send_wr->send_buf.mad) &&
+		    &mad_send_wr->send_buf == send_buf)
+			return mad_send_wr;
+	}
+	return NULL;
+}
+
+int ib_modify_mad(struct ib_mad_agent *mad_agent,
+		  struct ib_mad_send_buf *send_buf, u32 timeout_ms)
+{
+	struct ib_mad_agent_private *mad_agent_priv;
+	struct ib_mad_send_wr_private *mad_send_wr;
+	unsigned long flags;
+	int active;
+
+	mad_agent_priv = container_of(mad_agent, struct ib_mad_agent_private,
+				      agent);
+	spin_lock_irqsave(&mad_agent_priv->lock, flags);
+	mad_send_wr = find_send_wr(mad_agent_priv, send_buf);
+	if (!mad_send_wr || mad_send_wr->status != IB_WC_SUCCESS) {
+		spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+		return -EINVAL;
+	}
+
+	active = (!mad_send_wr->timeout || mad_send_wr->refcount > 1);
+	if (!timeout_ms) {
+		mad_send_wr->status = IB_WC_WR_FLUSH_ERR;
+		mad_send_wr->refcount -= (mad_send_wr->timeout > 0);
+	}
+
+	mad_send_wr->send_buf.timeout_ms = timeout_ms;
+	if (active)
+		mad_send_wr->timeout = msecs_to_jiffies(timeout_ms);
+	else
+		ib_reset_mad_timeout(mad_send_wr, timeout_ms);
+
+	spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+	return 0;
+}
+EXPORT_SYMBOL(ib_modify_mad);
+
+void ib_cancel_mad(struct ib_mad_agent *mad_agent,
+		   struct ib_mad_send_buf *send_buf)
+{
+	ib_modify_mad(mad_agent, send_buf, 0);
+}
+EXPORT_SYMBOL(ib_cancel_mad);
+
+static void local_completions(struct work_struct *work)
+{
+	struct ib_mad_agent_private *mad_agent_priv;
+	struct ib_mad_local_private *local;
+	struct ib_mad_agent_private *recv_mad_agent;
+	unsigned long flags;
+	int free_mad;
+	struct ib_wc wc;
+	struct ib_mad_send_wc mad_send_wc;
+
+	mad_agent_priv =
+		container_of(work, struct ib_mad_agent_private, local_work);
+
+	spin_lock_irqsave(&mad_agent_priv->lock, flags);
+	while (!list_empty(&mad_agent_priv->local_list)) {
+		local = list_entry(mad_agent_priv->local_list.next,
+				   struct ib_mad_local_private,
+				   completion_list);
+		list_del(&local->completion_list);
+		spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+		free_mad = 0;
+		if (local->mad_priv) {
+			recv_mad_agent = local->recv_mad_agent;
+			if (!recv_mad_agent) {
+				dev_err(&mad_agent_priv->agent.device->dev,
+					"No receive MAD agent for local completion\n");
+				free_mad = 1;
+				goto local_send_completion;
+			}
+
+			/*
+			 * Defined behavior is to complete response
+			 * before request
+			 */
+			build_smp_wc(recv_mad_agent->agent.qp,
+				     (unsigned long) local->mad_send_wr,
+				     be16_to_cpu(IB_LID_PERMISSIVE),
+				     0, recv_mad_agent->agent.port_num, &wc);
+
+			local->mad_priv->header.recv_wc.wc = &wc;
+			local->mad_priv->header.recv_wc.mad_len =
+						sizeof(struct ib_mad);
+			INIT_LIST_HEAD(&local->mad_priv->header.recv_wc.rmpp_list);
+			list_add(&local->mad_priv->header.recv_wc.recv_buf.list,
+				 &local->mad_priv->header.recv_wc.rmpp_list);
+			local->mad_priv->header.recv_wc.recv_buf.grh = NULL;
+			local->mad_priv->header.recv_wc.recv_buf.mad =
+						&local->mad_priv->mad.mad;
+			if (atomic_read(&recv_mad_agent->qp_info->snoop_count))
+				snoop_recv(recv_mad_agent->qp_info,
+					  &local->mad_priv->header.recv_wc,
+					   IB_MAD_SNOOP_RECVS);
+			recv_mad_agent->agent.recv_handler(
+						&recv_mad_agent->agent,
+						&local->mad_priv->header.recv_wc);
+			spin_lock_irqsave(&recv_mad_agent->lock, flags);
+			atomic_dec(&recv_mad_agent->refcount);
+			spin_unlock_irqrestore(&recv_mad_agent->lock, flags);
+		}
+
+local_send_completion:
+		/* Complete send */
+		mad_send_wc.status = IB_WC_SUCCESS;
+		mad_send_wc.vendor_err = 0;
+		mad_send_wc.send_buf = &local->mad_send_wr->send_buf;
+		if (atomic_read(&mad_agent_priv->qp_info->snoop_count))
+			snoop_send(mad_agent_priv->qp_info,
+				   &local->mad_send_wr->send_buf,
+				   &mad_send_wc, IB_MAD_SNOOP_SEND_COMPLETIONS);
+		mad_agent_priv->agent.send_handler(&mad_agent_priv->agent,
+						   &mad_send_wc);
+
+		spin_lock_irqsave(&mad_agent_priv->lock, flags);
+		atomic_dec(&mad_agent_priv->refcount);
+		if (free_mad)
+			kmem_cache_free(ib_mad_cache, local->mad_priv);
+		kfree(local);
+	}
+	spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+}
+
+static int retry_send(struct ib_mad_send_wr_private *mad_send_wr)
+{
+	int ret;
+
+	if (!mad_send_wr->retries_left)
+		return -ETIMEDOUT;
+
+	mad_send_wr->retries_left--;
+	mad_send_wr->send_buf.retries++;
+
+	mad_send_wr->timeout = msecs_to_jiffies(mad_send_wr->send_buf.timeout_ms);
+
+	if (ib_mad_kernel_rmpp_agent(&mad_send_wr->mad_agent_priv->agent)) {
+		ret = ib_retry_rmpp(mad_send_wr);
+		switch (ret) {
+		case IB_RMPP_RESULT_UNHANDLED:
+			ret = ib_send_mad(mad_send_wr);
+			break;
+		case IB_RMPP_RESULT_CONSUMED:
+			ret = 0;
+			break;
+		default:
+			ret = -ECOMM;
+			break;
+		}
+	} else
+		ret = ib_send_mad(mad_send_wr);
+
+	if (!ret) {
+		mad_send_wr->refcount++;
+		list_add_tail(&mad_send_wr->agent_list,
+			      &mad_send_wr->mad_agent_priv->send_list);
+	}
+	return ret;
+}
+
+static void timeout_sends(struct work_struct *work)
+{
+	struct ib_mad_agent_private *mad_agent_priv;
+	struct ib_mad_send_wr_private *mad_send_wr;
+	struct ib_mad_send_wc mad_send_wc;
+	unsigned long flags, delay;
+
+	mad_agent_priv = container_of(work, struct ib_mad_agent_private,
+				      timed_work.work);
+	mad_send_wc.vendor_err = 0;
+
+	spin_lock_irqsave(&mad_agent_priv->lock, flags);
+	while (!list_empty(&mad_agent_priv->wait_list)) {
+		mad_send_wr = list_entry(mad_agent_priv->wait_list.next,
+					 struct ib_mad_send_wr_private,
+					 agent_list);
+
+		if (time_after(mad_send_wr->timeout, jiffies)) {
+			delay = mad_send_wr->timeout - jiffies;
+			if ((long)delay <= 0)
+				delay = 1;
+			queue_delayed_work(mad_agent_priv->qp_info->
+					   port_priv->wq,
+					   &mad_agent_priv->timed_work, delay);
+			break;
+		}
+
+		list_del(&mad_send_wr->agent_list);
+		if (mad_send_wr->status == IB_WC_SUCCESS &&
+		    !retry_send(mad_send_wr))
+			continue;
+
+		spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+
+		if (mad_send_wr->status == IB_WC_SUCCESS)
+			mad_send_wc.status = IB_WC_RESP_TIMEOUT_ERR;
+		else
+			mad_send_wc.status = mad_send_wr->status;
+		mad_send_wc.send_buf = &mad_send_wr->send_buf;
+		mad_agent_priv->agent.send_handler(&mad_agent_priv->agent,
+						   &mad_send_wc);
+
+		atomic_dec(&mad_agent_priv->refcount);
+		spin_lock_irqsave(&mad_agent_priv->lock, flags);
+	}
+	spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+}
+
+static void ib_mad_thread_completion_handler(struct ib_cq *cq, void *arg)
+{
+	struct ib_mad_port_private *port_priv = cq->cq_context;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ib_mad_port_list_lock, flags);
+	if (!list_empty(&port_priv->port_list))
+		queue_work(port_priv->wq, &port_priv->work);
+	spin_unlock_irqrestore(&ib_mad_port_list_lock, flags);
+}
+
+/*
+ * Allocate receive MADs and post receive WRs for them
+ */
+static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info,
+				    struct ib_mad_private *mad)
+{
+	unsigned long flags;
+	int post, ret;
+	struct ib_mad_private *mad_priv;
+	struct ib_sge sg_list;
+	struct ib_recv_wr recv_wr, *bad_recv_wr;
+	struct ib_mad_queue *recv_queue = &qp_info->recv_queue;
+
+	/* Initialize common scatter list fields */
+	sg_list.length = sizeof *mad_priv - sizeof mad_priv->header;
+	sg_list.lkey = (*qp_info->port_priv->mr).lkey;
+
+	/* Initialize common receive WR fields */
+	recv_wr.next = NULL;
+	recv_wr.sg_list = &sg_list;
+	recv_wr.num_sge = 1;
+
+	do {
+		/* Allocate and map receive buffer */
+		if (mad) {
+			mad_priv = mad;
+			mad = NULL;
+		} else {
+			mad_priv = kmem_cache_alloc(ib_mad_cache, GFP_KERNEL);
+			if (!mad_priv) {
+				dev_err(&qp_info->port_priv->device->dev,
+					"No memory for receive buffer\n");
+				ret = -ENOMEM;
+				break;
+			}
+		}
+		sg_list.addr = ib_dma_map_single(qp_info->port_priv->device,
+						 &mad_priv->grh,
+						 sizeof *mad_priv -
+						   sizeof mad_priv->header,
+						 DMA_FROM_DEVICE);
+		if (unlikely(ib_dma_mapping_error(qp_info->port_priv->device,
+						  sg_list.addr))) {
+			ret = -ENOMEM;
+			break;
+		}
+		mad_priv->header.mapping = sg_list.addr;
+		recv_wr.wr_id = (unsigned long)&mad_priv->header.mad_list;
+		mad_priv->header.mad_list.mad_queue = recv_queue;
+
+		/* Post receive WR */
+		spin_lock_irqsave(&recv_queue->lock, flags);
+		post = (++recv_queue->count < recv_queue->max_active);
+		list_add_tail(&mad_priv->header.mad_list.list, &recv_queue->list);
+		spin_unlock_irqrestore(&recv_queue->lock, flags);
+		ret = ib_post_recv(qp_info->qp, &recv_wr, &bad_recv_wr);
+		if (ret) {
+			spin_lock_irqsave(&recv_queue->lock, flags);
+			list_del(&mad_priv->header.mad_list.list);
+			recv_queue->count--;
+			spin_unlock_irqrestore(&recv_queue->lock, flags);
+			ib_dma_unmap_single(qp_info->port_priv->device,
+					    mad_priv->header.mapping,
+					    sizeof *mad_priv -
+					      sizeof mad_priv->header,
+					    DMA_FROM_DEVICE);
+			kmem_cache_free(ib_mad_cache, mad_priv);
+			dev_err(&qp_info->port_priv->device->dev,
+				"ib_post_recv failed: %d\n", ret);
+			break;
+		}
+	} while (post);
+
+	return ret;
+}
+
+/*
+ * Return all the posted receive MADs
+ */
+static void cleanup_recv_queue(struct ib_mad_qp_info *qp_info)
+{
+	struct ib_mad_private_header *mad_priv_hdr;
+	struct ib_mad_private *recv;
+	struct ib_mad_list_head *mad_list;
+
+	if (!qp_info->qp)
+		return;
+
+	while (!list_empty(&qp_info->recv_queue.list)) {
+
+		mad_list = list_entry(qp_info->recv_queue.list.next,
+				      struct ib_mad_list_head, list);
+		mad_priv_hdr = container_of(mad_list,
+					    struct ib_mad_private_header,
+					    mad_list);
+		recv = container_of(mad_priv_hdr, struct ib_mad_private,
+				    header);
+
+		/* Remove from posted receive MAD list */
+		list_del(&mad_list->list);
+
+		ib_dma_unmap_single(qp_info->port_priv->device,
+				    recv->header.mapping,
+				    sizeof(struct ib_mad_private) -
+				      sizeof(struct ib_mad_private_header),
+				    DMA_FROM_DEVICE);
+		kmem_cache_free(ib_mad_cache, recv);
+	}
+
+	qp_info->recv_queue.count = 0;
+}
+
+/*
+ * Start the port
+ */
+static int ib_mad_port_start(struct ib_mad_port_private *port_priv)
+{
+	int ret, i;
+	struct ib_qp_attr *attr;
+	struct ib_qp *qp;
+	u16 pkey_index;
+
+	attr = kmalloc(sizeof *attr, GFP_KERNEL);
+	if (!attr) {
+		dev_err(&port_priv->device->dev,
+			"Couldn't kmalloc ib_qp_attr\n");
+		return -ENOMEM;
+	}
+
+	ret = ib_find_pkey(port_priv->device, port_priv->port_num,
+			   IB_DEFAULT_PKEY_FULL, &pkey_index);
+	if (ret)
+		pkey_index = 0;
+
+	for (i = 0; i < IB_MAD_QPS_CORE; i++) {
+		qp = port_priv->qp_info[i].qp;
+		if (!qp)
+			continue;
+
+		/*
+		 * PKey index for QP1 is irrelevant but
+		 * one is needed for the Reset to Init transition
+		 */
+		attr->qp_state = IB_QPS_INIT;
+		attr->pkey_index = pkey_index;
+		attr->qkey = (qp->qp_num == 0) ? 0 : IB_QP1_QKEY;
+		ret = ib_modify_qp(qp, attr, IB_QP_STATE |
+					     IB_QP_PKEY_INDEX | IB_QP_QKEY);
+		if (ret) {
+			dev_err(&port_priv->device->dev,
+				"Couldn't change QP%d state to INIT: %d\n",
+				i, ret);
+			goto out;
+		}
+
+		attr->qp_state = IB_QPS_RTR;
+		ret = ib_modify_qp(qp, attr, IB_QP_STATE);
+		if (ret) {
+			dev_err(&port_priv->device->dev,
+				"Couldn't change QP%d state to RTR: %d\n",
+				i, ret);
+			goto out;
+		}
+
+		attr->qp_state = IB_QPS_RTS;
+		attr->sq_psn = IB_MAD_SEND_Q_PSN;
+		ret = ib_modify_qp(qp, attr, IB_QP_STATE | IB_QP_SQ_PSN);
+		if (ret) {
+			dev_err(&port_priv->device->dev,
+				"Couldn't change QP%d state to RTS: %d\n",
+				i, ret);
+			goto out;
+		}
+	}
+
+	ret = ib_req_notify_cq(port_priv->cq, IB_CQ_NEXT_COMP);
+	if (ret) {
+		dev_err(&port_priv->device->dev,
+			"Failed to request completion notification: %d\n",
+			ret);
+		goto out;
+	}
+
+	for (i = 0; i < IB_MAD_QPS_CORE; i++) {
+		if (!port_priv->qp_info[i].qp)
+			continue;
+
+		ret = ib_mad_post_receive_mads(&port_priv->qp_info[i], NULL);
+		if (ret) {
+			dev_err(&port_priv->device->dev,
+				"Couldn't post receive WRs\n");
+			goto out;
+		}
+	}
+out:
+	kfree(attr);
+	return ret;
+}
+
+static void qp_event_handler(struct ib_event *event, void *qp_context)
+{
+	struct ib_mad_qp_info	*qp_info = qp_context;
+
+	/* It's worse than that! He's dead, Jim! */
+	dev_err(&qp_info->port_priv->device->dev,
+		"Fatal error (%d) on MAD QP (%d)\n",
+		event->event, qp_info->qp->qp_num);
+}
+
+static void init_mad_queue(struct ib_mad_qp_info *qp_info,
+			   struct ib_mad_queue *mad_queue)
+{
+	mad_queue->qp_info = qp_info;
+	mad_queue->count = 0;
+	spin_lock_init(&mad_queue->lock);
+	INIT_LIST_HEAD(&mad_queue->list);
+}
+
+static void init_mad_qp(struct ib_mad_port_private *port_priv,
+			struct ib_mad_qp_info *qp_info)
+{
+	qp_info->port_priv = port_priv;
+	init_mad_queue(qp_info, &qp_info->send_queue);
+	init_mad_queue(qp_info, &qp_info->recv_queue);
+	INIT_LIST_HEAD(&qp_info->overflow_list);
+	spin_lock_init(&qp_info->snoop_lock);
+	qp_info->snoop_table = NULL;
+	qp_info->snoop_table_size = 0;
+	atomic_set(&qp_info->snoop_count, 0);
+}
+
+static int create_mad_qp(struct ib_mad_qp_info *qp_info,
+			 enum ib_qp_type qp_type)
+{
+	struct ib_qp_init_attr	qp_init_attr;
+	int ret;
+
+	memset(&qp_init_attr, 0, sizeof qp_init_attr);
+	qp_init_attr.send_cq = qp_info->port_priv->cq;
+	qp_init_attr.recv_cq = qp_info->port_priv->cq;
+	qp_init_attr.sq_sig_type = IB_SIGNAL_ALL_WR;
+	qp_init_attr.cap.max_send_wr = mad_sendq_size;
+	qp_init_attr.cap.max_recv_wr = mad_recvq_size;
+	qp_init_attr.cap.max_send_sge = IB_MAD_SEND_REQ_MAX_SG;
+	qp_init_attr.cap.max_recv_sge = IB_MAD_RECV_REQ_MAX_SG;
+	qp_init_attr.qp_type = qp_type;
+	qp_init_attr.port_num = qp_info->port_priv->port_num;
+	qp_init_attr.qp_context = qp_info;
+	qp_init_attr.event_handler = qp_event_handler;
+	qp_info->qp = ib_create_qp(qp_info->port_priv->pd, &qp_init_attr);
+	if (IS_ERR(qp_info->qp)) {
+		dev_err(&qp_info->port_priv->device->dev,
+			"Couldn't create ib_mad QP%d\n",
+			get_spl_qp_index(qp_type));
+		ret = PTR_ERR(qp_info->qp);
+		goto error;
+	}
+	/* Use minimum queue sizes unless the CQ is resized */
+	qp_info->send_queue.max_active = mad_sendq_size;
+	qp_info->recv_queue.max_active = mad_recvq_size;
+	return 0;
+
+error:
+	return ret;
+}
+
+static void destroy_mad_qp(struct ib_mad_qp_info *qp_info)
+{
+	if (!qp_info->qp)
+		return;
+
+	ib_destroy_qp(qp_info->qp);
+	kfree(qp_info->snoop_table);
+}
+
+/*
+ * Open the port
+ * Create the QP, PD, MR, and CQ if needed
+ */
+static int ib_mad_port_open(struct ib_device *device,
+			    int port_num)
+{
+	int ret, cq_size;
+	struct ib_mad_port_private *port_priv;
+	unsigned long flags;
+	char name[sizeof "ib_mad123"];
+	int has_smi;
+
+	/* Create new device info */
+	port_priv = kzalloc(sizeof *port_priv, GFP_KERNEL);
+	if (!port_priv) {
+		dev_err(&device->dev, "No memory for ib_mad_port_private\n");
+		return -ENOMEM;
+	}
+
+	port_priv->device = device;
+	port_priv->port_num = port_num;
+	spin_lock_init(&port_priv->reg_lock);
+	INIT_LIST_HEAD(&port_priv->agent_list);
+	init_mad_qp(port_priv, &port_priv->qp_info[0]);
+	init_mad_qp(port_priv, &port_priv->qp_info[1]);
+
+	cq_size = mad_sendq_size + mad_recvq_size;
+	has_smi = rdma_port_get_link_layer(device, port_num) == IB_LINK_LAYER_INFINIBAND;
+	if (has_smi)
+		cq_size *= 2;
+
+	port_priv->cq = ib_create_cq(port_priv->device,
+				     ib_mad_thread_completion_handler,
+				     NULL, port_priv, cq_size, 0);
+	if (IS_ERR(port_priv->cq)) {
+		dev_err(&device->dev, "Couldn't create ib_mad CQ\n");
+		ret = PTR_ERR(port_priv->cq);
+		goto error3;
+	}
+
+	port_priv->pd = ib_alloc_pd(device);
+	if (IS_ERR(port_priv->pd)) {
+		dev_err(&device->dev, "Couldn't create ib_mad PD\n");
+		ret = PTR_ERR(port_priv->pd);
+		goto error4;
+	}
+
+	port_priv->mr = ib_get_dma_mr(port_priv->pd, IB_ACCESS_LOCAL_WRITE);
+	if (IS_ERR(port_priv->mr)) {
+		dev_err(&device->dev, "Couldn't get ib_mad DMA MR\n");
+		ret = PTR_ERR(port_priv->mr);
+		goto error5;
+	}
+
+	if (has_smi) {
+		ret = create_mad_qp(&port_priv->qp_info[0], IB_QPT_SMI);
+		if (ret)
+			goto error6;
+	}
+	ret = create_mad_qp(&port_priv->qp_info[1], IB_QPT_GSI);
+	if (ret)
+		goto error7;
+
+	snprintf(name, sizeof name, "ib_mad%d", port_num);
+	port_priv->wq = create_singlethread_workqueue(name);
+	if (!port_priv->wq) {
+		ret = -ENOMEM;
+		goto error8;
+	}
+	INIT_WORK(&port_priv->work, ib_mad_completion_handler);
+
+	spin_lock_irqsave(&ib_mad_port_list_lock, flags);
+	list_add_tail(&port_priv->port_list, &ib_mad_port_list);
+	spin_unlock_irqrestore(&ib_mad_port_list_lock, flags);
+
+	ret = ib_mad_port_start(port_priv);
+	if (ret) {
+		dev_err(&device->dev, "Couldn't start port\n");
+		goto error9;
+	}
+
+	return 0;
+
+error9:
+	spin_lock_irqsave(&ib_mad_port_list_lock, flags);
+	list_del_init(&port_priv->port_list);
+	spin_unlock_irqrestore(&ib_mad_port_list_lock, flags);
+
+	destroy_workqueue(port_priv->wq);
+error8:
+	destroy_mad_qp(&port_priv->qp_info[1]);
+error7:
+	destroy_mad_qp(&port_priv->qp_info[0]);
+error6:
+	ib_dereg_mr(port_priv->mr);
+error5:
+	ib_dealloc_pd(port_priv->pd);
+error4:
+	ib_destroy_cq(port_priv->cq);
+	cleanup_recv_queue(&port_priv->qp_info[1]);
+	cleanup_recv_queue(&port_priv->qp_info[0]);
+error3:
+	kfree(port_priv);
+
+	return ret;
+}
+
+/*
+ * Close the port
+ * If there are no classes using the port, free the port
+ * resources (CQ, MR, PD, QP) and remove the port's info structure
+ */
+static int ib_mad_port_close(struct ib_device *device, int port_num)
+{
+	struct ib_mad_port_private *port_priv;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ib_mad_port_list_lock, flags);
+	port_priv = __ib_get_mad_port(device, port_num);
+	if (port_priv == NULL) {
+		spin_unlock_irqrestore(&ib_mad_port_list_lock, flags);
+		dev_err(&device->dev, "Port %d not found\n", port_num);
+		return -ENODEV;
+	}
+	list_del_init(&port_priv->port_list);
+	spin_unlock_irqrestore(&ib_mad_port_list_lock, flags);
+
+	destroy_workqueue(port_priv->wq);
+	destroy_mad_qp(&port_priv->qp_info[1]);
+	destroy_mad_qp(&port_priv->qp_info[0]);
+	ib_dereg_mr(port_priv->mr);
+	ib_dealloc_pd(port_priv->pd);
+	ib_destroy_cq(port_priv->cq);
+	cleanup_recv_queue(&port_priv->qp_info[1]);
+	cleanup_recv_queue(&port_priv->qp_info[0]);
+	/* XXX: Handle deallocation of MAD registration tables */
+
+	kfree(port_priv);
+
+	return 0;
+}
+
+static void ib_mad_init_device(struct ib_device *device)
+{
+	int start, end, i;
+
+	if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
+		return;
+
+	if (device->node_type == RDMA_NODE_IB_SWITCH) {
+		start = 0;
+		end   = 0;
+	} else {
+		start = 1;
+		end   = device->phys_port_cnt;
+	}
+
+	for (i = start; i <= end; i++) {
+		if (ib_mad_port_open(device, i)) {
+			dev_err(&device->dev, "Couldn't open port %d\n", i);
+			goto error;
+		}
+		if (ib_agent_port_open(device, i)) {
+			dev_err(&device->dev,
+				"Couldn't open port %d for agents\n", i);
+			goto error_agent;
+		}
+	}
+	return;
+
+error_agent:
+	if (ib_mad_port_close(device, i))
+		dev_err(&device->dev, "Couldn't close port %d\n", i);
+
+error:
+	i--;
+
+	while (i >= start) {
+		if (ib_agent_port_close(device, i))
+			dev_err(&device->dev,
+				"Couldn't close port %d for agents\n", i);
+		if (ib_mad_port_close(device, i))
+			dev_err(&device->dev, "Couldn't close port %d\n", i);
+		i--;
+	}
+}
+
+static void ib_mad_remove_device(struct ib_device *device)
+{
+	int i, num_ports, cur_port;
+
+	if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
+		return;
+
+	if (device->node_type == RDMA_NODE_IB_SWITCH) {
+		num_ports = 1;
+		cur_port = 0;
+	} else {
+		num_ports = device->phys_port_cnt;
+		cur_port = 1;
+	}
+	for (i = 0; i < num_ports; i++, cur_port++) {
+		if (ib_agent_port_close(device, cur_port))
+			dev_err(&device->dev,
+				"Couldn't close port %d for agents\n",
+				cur_port);
+		if (ib_mad_port_close(device, cur_port))
+			dev_err(&device->dev, "Couldn't close port %d\n",
+				cur_port);
+	}
+}
+
+static struct ib_client mad_client = {
+	.name   = "mad",
+	.add = ib_mad_init_device,
+	.remove = ib_mad_remove_device
+};
+
+static int __init ib_mad_init_module(void)
+{
+	int ret;
+
+	mad_recvq_size = min(mad_recvq_size, IB_MAD_QP_MAX_SIZE);
+	mad_recvq_size = max(mad_recvq_size, IB_MAD_QP_MIN_SIZE);
+
+	mad_sendq_size = min(mad_sendq_size, IB_MAD_QP_MAX_SIZE);
+	mad_sendq_size = max(mad_sendq_size, IB_MAD_QP_MIN_SIZE);
+
+	ib_mad_cache = kmem_cache_create("ib_mad",
+					 sizeof(struct ib_mad_private),
+					 0,
+					 SLAB_HWCACHE_ALIGN,
+					 NULL);
+	if (!ib_mad_cache) {
+		pr_err("Couldn't create ib_mad cache\n");
+		ret = -ENOMEM;
+		goto error1;
+	}
+
+	INIT_LIST_HEAD(&ib_mad_port_list);
+
+	if (ib_register_client(&mad_client)) {
+		pr_err("Couldn't register ib_mad client\n");
+		ret = -EINVAL;
+		goto error2;
+	}
+
+	return 0;
+
+error2:
+	kmem_cache_destroy(ib_mad_cache);
+error1:
+	return ret;
+}
+
+static void __exit ib_mad_cleanup_module(void)
+{
+	ib_unregister_client(&mad_client);
+	kmem_cache_destroy(ib_mad_cache);
+}
+
+module_init(ib_mad_init_module);
+module_exit(ib_mad_cleanup_module);
diff --git a/drivers/infiniband/core/mad_priv.h b/drivers/infiniband/core/mad_priv.h
new file mode 100644
index 0000000..d1a0b0e
--- /dev/null
+++ b/drivers/infiniband/core/mad_priv.h
@@ -0,0 +1,227 @@
+/*
+ * Copyright (c) 2004, 2005, Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2005 Intel Corporation. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2009 HNR Consulting. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __IB_MAD_PRIV_H__
+#define __IB_MAD_PRIV_H__
+
+#include <linux/completion.h>
+#include <linux/err.h>
+#include <linux/workqueue.h>
+#include <rdma/ib_mad.h>
+#include <rdma/ib_smi.h>
+
+#define IB_MAD_QPS_CORE		2 /* Always QP0 and QP1 as a minimum */
+
+/* QP and CQ parameters */
+#define IB_MAD_QP_SEND_SIZE	128
+#define IB_MAD_QP_RECV_SIZE	512
+#define IB_MAD_QP_MIN_SIZE	64
+#define IB_MAD_QP_MAX_SIZE	8192
+#define IB_MAD_SEND_REQ_MAX_SG	2
+#define IB_MAD_RECV_REQ_MAX_SG	1
+
+#define IB_MAD_SEND_Q_PSN	0
+
+/* Registration table sizes */
+#define MAX_MGMT_CLASS		80
+#define MAX_MGMT_VERSION	8
+#define MAX_MGMT_OUI		8
+#define MAX_MGMT_VENDOR_RANGE2	(IB_MGMT_CLASS_VENDOR_RANGE2_END - \
+				IB_MGMT_CLASS_VENDOR_RANGE2_START + 1)
+
+struct ib_mad_list_head {
+	struct list_head list;
+	struct ib_mad_queue *mad_queue;
+};
+
+struct ib_mad_private_header {
+	struct ib_mad_list_head mad_list;
+	struct ib_mad_recv_wc recv_wc;
+	struct ib_wc wc;
+	u64 mapping;
+} __attribute__ ((packed));
+
+struct ib_mad_private {
+	struct ib_mad_private_header header;
+	struct ib_grh grh;
+	union {
+		struct ib_mad mad;
+		struct ib_rmpp_mad rmpp_mad;
+		struct ib_smp smp;
+	} mad;
+} __attribute__ ((packed));
+
+struct ib_rmpp_segment {
+	struct list_head list;
+	u32 num;
+	u8 data[0];
+};
+
+struct ib_mad_agent_private {
+	struct list_head agent_list;
+	struct ib_mad_agent agent;
+	struct ib_mad_reg_req *reg_req;
+	struct ib_mad_qp_info *qp_info;
+
+	spinlock_t lock;
+	struct list_head send_list;
+	struct list_head wait_list;
+	struct list_head done_list;
+	struct delayed_work timed_work;
+	unsigned long timeout;
+	struct list_head local_list;
+	struct work_struct local_work;
+	struct list_head rmpp_list;
+
+	atomic_t refcount;
+	struct completion comp;
+};
+
+struct ib_mad_snoop_private {
+	struct ib_mad_agent agent;
+	struct ib_mad_qp_info *qp_info;
+	int snoop_index;
+	int mad_snoop_flags;
+	atomic_t refcount;
+	struct completion comp;
+};
+
+struct ib_mad_send_wr_private {
+	struct ib_mad_list_head mad_list;
+	struct list_head agent_list;
+	struct ib_mad_agent_private *mad_agent_priv;
+	struct ib_mad_send_buf send_buf;
+	u64 header_mapping;
+	u64 payload_mapping;
+	struct ib_send_wr send_wr;
+	struct ib_sge sg_list[IB_MAD_SEND_REQ_MAX_SG];
+	__be64 tid;
+	unsigned long timeout;
+	int max_retries;
+	int retries_left;
+	int retry;
+	int refcount;
+	enum ib_wc_status status;
+
+	/* RMPP control */
+	struct list_head rmpp_list;
+	struct ib_rmpp_segment *last_ack_seg;
+	struct ib_rmpp_segment *cur_seg;
+	int last_ack;
+	int seg_num;
+	int newwin;
+	int pad;
+};
+
+struct ib_mad_local_private {
+	struct list_head completion_list;
+	struct ib_mad_private *mad_priv;
+	struct ib_mad_agent_private *recv_mad_agent;
+	struct ib_mad_send_wr_private *mad_send_wr;
+};
+
+struct ib_mad_mgmt_method_table {
+	struct ib_mad_agent_private *agent[IB_MGMT_MAX_METHODS];
+};
+
+struct ib_mad_mgmt_class_table {
+	struct ib_mad_mgmt_method_table *method_table[MAX_MGMT_CLASS];
+};
+
+struct ib_mad_mgmt_vendor_class {
+	u8	oui[MAX_MGMT_OUI][3];
+	struct ib_mad_mgmt_method_table *method_table[MAX_MGMT_OUI];
+};
+
+struct ib_mad_mgmt_vendor_class_table {
+	struct ib_mad_mgmt_vendor_class *vendor_class[MAX_MGMT_VENDOR_RANGE2];
+};
+
+struct ib_mad_mgmt_version_table {
+	struct ib_mad_mgmt_class_table *class;
+	struct ib_mad_mgmt_vendor_class_table *vendor;
+};
+
+struct ib_mad_queue {
+	spinlock_t lock;
+	struct list_head list;
+	int count;
+	int max_active;
+	struct ib_mad_qp_info *qp_info;
+};
+
+struct ib_mad_qp_info {
+	struct ib_mad_port_private *port_priv;
+	struct ib_qp *qp;
+	struct ib_mad_queue send_queue;
+	struct ib_mad_queue recv_queue;
+	struct list_head overflow_list;
+	spinlock_t snoop_lock;
+	struct ib_mad_snoop_private **snoop_table;
+	int snoop_table_size;
+	atomic_t snoop_count;
+};
+
+struct ib_mad_port_private {
+	struct list_head port_list;
+	struct ib_device *device;
+	int port_num;
+	struct ib_cq *cq;
+	struct ib_pd *pd;
+	struct ib_mr *mr;
+
+	spinlock_t reg_lock;
+	struct ib_mad_mgmt_version_table version[MAX_MGMT_VERSION];
+	struct list_head agent_list;
+	struct workqueue_struct *wq;
+	struct work_struct work;
+	struct ib_mad_qp_info qp_info[IB_MAD_QPS_CORE];
+};
+
+int ib_send_mad(struct ib_mad_send_wr_private *mad_send_wr);
+
+struct ib_mad_send_wr_private *
+ib_find_send_mad(struct ib_mad_agent_private *mad_agent_priv,
+		 struct ib_mad_recv_wc *mad_recv_wc);
+
+void ib_mad_complete_send_wr(struct ib_mad_send_wr_private *mad_send_wr,
+			     struct ib_mad_send_wc *mad_send_wc);
+
+void ib_mark_mad_done(struct ib_mad_send_wr_private *mad_send_wr);
+
+void ib_reset_mad_timeout(struct ib_mad_send_wr_private *mad_send_wr,
+			  int timeout_ms);
+
+#endif	/* __IB_MAD_PRIV_H__ */
diff --git a/drivers/infiniband/core/mad_rmpp.c b/drivers/infiniband/core/mad_rmpp.c
new file mode 100644
index 0000000..f37878c
--- /dev/null
+++ b/drivers/infiniband/core/mad_rmpp.c
@@ -0,0 +1,953 @@
+/*
+ * Copyright (c) 2005 Intel Inc. All rights reserved.
+ * Copyright (c) 2005-2006 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/slab.h>
+
+#include "mad_priv.h"
+#include "mad_rmpp.h"
+
+enum rmpp_state {
+	RMPP_STATE_ACTIVE,
+	RMPP_STATE_TIMEOUT,
+	RMPP_STATE_COMPLETE,
+	RMPP_STATE_CANCELING
+};
+
+struct mad_rmpp_recv {
+	struct ib_mad_agent_private *agent;
+	struct list_head list;
+	struct delayed_work timeout_work;
+	struct delayed_work cleanup_work;
+	struct completion comp;
+	enum rmpp_state state;
+	spinlock_t lock;
+	atomic_t refcount;
+
+	struct ib_ah *ah;
+	struct ib_mad_recv_wc *rmpp_wc;
+	struct ib_mad_recv_buf *cur_seg_buf;
+	int last_ack;
+	int seg_num;
+	int newwin;
+	int repwin;
+
+	__be64 tid;
+	u32 src_qp;
+	u16 slid;
+	u8 mgmt_class;
+	u8 class_version;
+	u8 method;
+};
+
+static inline void deref_rmpp_recv(struct mad_rmpp_recv *rmpp_recv)
+{
+	if (atomic_dec_and_test(&rmpp_recv->refcount))
+		complete(&rmpp_recv->comp);
+}
+
+static void destroy_rmpp_recv(struct mad_rmpp_recv *rmpp_recv)
+{
+	deref_rmpp_recv(rmpp_recv);
+	wait_for_completion(&rmpp_recv->comp);
+	ib_destroy_ah(rmpp_recv->ah);
+	kfree(rmpp_recv);
+}
+
+void ib_cancel_rmpp_recvs(struct ib_mad_agent_private *agent)
+{
+	struct mad_rmpp_recv *rmpp_recv, *temp_rmpp_recv;
+	unsigned long flags;
+
+	spin_lock_irqsave(&agent->lock, flags);
+	list_for_each_entry(rmpp_recv, &agent->rmpp_list, list) {
+		if (rmpp_recv->state != RMPP_STATE_COMPLETE)
+			ib_free_recv_mad(rmpp_recv->rmpp_wc);
+		rmpp_recv->state = RMPP_STATE_CANCELING;
+	}
+	spin_unlock_irqrestore(&agent->lock, flags);
+
+	list_for_each_entry(rmpp_recv, &agent->rmpp_list, list) {
+		cancel_delayed_work(&rmpp_recv->timeout_work);
+		cancel_delayed_work(&rmpp_recv->cleanup_work);
+	}
+
+	flush_workqueue(agent->qp_info->port_priv->wq);
+
+	list_for_each_entry_safe(rmpp_recv, temp_rmpp_recv,
+				 &agent->rmpp_list, list) {
+		list_del(&rmpp_recv->list);
+		destroy_rmpp_recv(rmpp_recv);
+	}
+}
+
+static void format_ack(struct ib_mad_send_buf *msg,
+		       struct ib_rmpp_mad *data,
+		       struct mad_rmpp_recv *rmpp_recv)
+{
+	struct ib_rmpp_mad *ack = msg->mad;
+	unsigned long flags;
+
+	memcpy(ack, &data->mad_hdr, msg->hdr_len);
+
+	ack->mad_hdr.method ^= IB_MGMT_METHOD_RESP;
+	ack->rmpp_hdr.rmpp_type = IB_MGMT_RMPP_TYPE_ACK;
+	ib_set_rmpp_flags(&ack->rmpp_hdr, IB_MGMT_RMPP_FLAG_ACTIVE);
+
+	spin_lock_irqsave(&rmpp_recv->lock, flags);
+	rmpp_recv->last_ack = rmpp_recv->seg_num;
+	ack->rmpp_hdr.seg_num = cpu_to_be32(rmpp_recv->seg_num);
+	ack->rmpp_hdr.paylen_newwin = cpu_to_be32(rmpp_recv->newwin);
+	spin_unlock_irqrestore(&rmpp_recv->lock, flags);
+}
+
+static void ack_recv(struct mad_rmpp_recv *rmpp_recv,
+		     struct ib_mad_recv_wc *recv_wc)
+{
+	struct ib_mad_send_buf *msg;
+	int ret, hdr_len;
+
+	hdr_len = ib_get_mad_data_offset(recv_wc->recv_buf.mad->mad_hdr.mgmt_class);
+	msg = ib_create_send_mad(&rmpp_recv->agent->agent, recv_wc->wc->src_qp,
+				 recv_wc->wc->pkey_index, 1, hdr_len,
+				 0, GFP_KERNEL);
+	if (IS_ERR(msg))
+		return;
+
+	format_ack(msg, (struct ib_rmpp_mad *) recv_wc->recv_buf.mad, rmpp_recv);
+	msg->ah = rmpp_recv->ah;
+	ret = ib_post_send_mad(msg, NULL);
+	if (ret)
+		ib_free_send_mad(msg);
+}
+
+static struct ib_mad_send_buf *alloc_response_msg(struct ib_mad_agent *agent,
+						  struct ib_mad_recv_wc *recv_wc)
+{
+	struct ib_mad_send_buf *msg;
+	struct ib_ah *ah;
+	int hdr_len;
+
+	ah = ib_create_ah_from_wc(agent->qp->pd, recv_wc->wc,
+				  recv_wc->recv_buf.grh, agent->port_num);
+	if (IS_ERR(ah))
+		return (void *) ah;
+
+	hdr_len = ib_get_mad_data_offset(recv_wc->recv_buf.mad->mad_hdr.mgmt_class);
+	msg = ib_create_send_mad(agent, recv_wc->wc->src_qp,
+				 recv_wc->wc->pkey_index, 1,
+				 hdr_len, 0, GFP_KERNEL);
+	if (IS_ERR(msg))
+		ib_destroy_ah(ah);
+	else {
+		msg->ah = ah;
+		msg->context[0] = ah;
+	}
+
+	return msg;
+}
+
+static void ack_ds_ack(struct ib_mad_agent_private *agent,
+		       struct ib_mad_recv_wc *recv_wc)
+{
+	struct ib_mad_send_buf *msg;
+	struct ib_rmpp_mad *rmpp_mad;
+	int ret;
+
+	msg = alloc_response_msg(&agent->agent, recv_wc);
+	if (IS_ERR(msg))
+		return;
+
+	rmpp_mad = msg->mad;
+	memcpy(rmpp_mad, recv_wc->recv_buf.mad, msg->hdr_len);
+
+	rmpp_mad->mad_hdr.method ^= IB_MGMT_METHOD_RESP;
+	ib_set_rmpp_flags(&rmpp_mad->rmpp_hdr, IB_MGMT_RMPP_FLAG_ACTIVE);
+	rmpp_mad->rmpp_hdr.seg_num = 0;
+	rmpp_mad->rmpp_hdr.paylen_newwin = cpu_to_be32(1);
+
+	ret = ib_post_send_mad(msg, NULL);
+	if (ret) {
+		ib_destroy_ah(msg->ah);
+		ib_free_send_mad(msg);
+	}
+}
+
+void ib_rmpp_send_handler(struct ib_mad_send_wc *mad_send_wc)
+{
+	if (mad_send_wc->send_buf->context[0] == mad_send_wc->send_buf->ah)
+		ib_destroy_ah(mad_send_wc->send_buf->ah);
+	ib_free_send_mad(mad_send_wc->send_buf);
+}
+
+static void nack_recv(struct ib_mad_agent_private *agent,
+		      struct ib_mad_recv_wc *recv_wc, u8 rmpp_status)
+{
+	struct ib_mad_send_buf *msg;
+	struct ib_rmpp_mad *rmpp_mad;
+	int ret;
+
+	msg = alloc_response_msg(&agent->agent, recv_wc);
+	if (IS_ERR(msg))
+		return;
+
+	rmpp_mad = msg->mad;
+	memcpy(rmpp_mad, recv_wc->recv_buf.mad, msg->hdr_len);
+
+	rmpp_mad->mad_hdr.method ^= IB_MGMT_METHOD_RESP;
+	rmpp_mad->rmpp_hdr.rmpp_version = IB_MGMT_RMPP_VERSION;
+	rmpp_mad->rmpp_hdr.rmpp_type = IB_MGMT_RMPP_TYPE_ABORT;
+	ib_set_rmpp_flags(&rmpp_mad->rmpp_hdr, IB_MGMT_RMPP_FLAG_ACTIVE);
+	rmpp_mad->rmpp_hdr.rmpp_status = rmpp_status;
+	rmpp_mad->rmpp_hdr.seg_num = 0;
+	rmpp_mad->rmpp_hdr.paylen_newwin = 0;
+
+	ret = ib_post_send_mad(msg, NULL);
+	if (ret) {
+		ib_destroy_ah(msg->ah);
+		ib_free_send_mad(msg);
+	}
+}
+
+static void recv_timeout_handler(struct work_struct *work)
+{
+	struct mad_rmpp_recv *rmpp_recv =
+		container_of(work, struct mad_rmpp_recv, timeout_work.work);
+	struct ib_mad_recv_wc *rmpp_wc;
+	unsigned long flags;
+
+	spin_lock_irqsave(&rmpp_recv->agent->lock, flags);
+	if (rmpp_recv->state != RMPP_STATE_ACTIVE) {
+		spin_unlock_irqrestore(&rmpp_recv->agent->lock, flags);
+		return;
+	}
+	rmpp_recv->state = RMPP_STATE_TIMEOUT;
+	list_del(&rmpp_recv->list);
+	spin_unlock_irqrestore(&rmpp_recv->agent->lock, flags);
+
+	rmpp_wc = rmpp_recv->rmpp_wc;
+	nack_recv(rmpp_recv->agent, rmpp_wc, IB_MGMT_RMPP_STATUS_T2L);
+	destroy_rmpp_recv(rmpp_recv);
+	ib_free_recv_mad(rmpp_wc);
+}
+
+static void recv_cleanup_handler(struct work_struct *work)
+{
+	struct mad_rmpp_recv *rmpp_recv =
+		container_of(work, struct mad_rmpp_recv, cleanup_work.work);
+	unsigned long flags;
+
+	spin_lock_irqsave(&rmpp_recv->agent->lock, flags);
+	if (rmpp_recv->state == RMPP_STATE_CANCELING) {
+		spin_unlock_irqrestore(&rmpp_recv->agent->lock, flags);
+		return;
+	}
+	list_del(&rmpp_recv->list);
+	spin_unlock_irqrestore(&rmpp_recv->agent->lock, flags);
+	destroy_rmpp_recv(rmpp_recv);
+}
+
+static struct mad_rmpp_recv *
+create_rmpp_recv(struct ib_mad_agent_private *agent,
+		 struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct mad_rmpp_recv *rmpp_recv;
+	struct ib_mad_hdr *mad_hdr;
+
+	rmpp_recv = kmalloc(sizeof *rmpp_recv, GFP_KERNEL);
+	if (!rmpp_recv)
+		return NULL;
+
+	rmpp_recv->ah = ib_create_ah_from_wc(agent->agent.qp->pd,
+					     mad_recv_wc->wc,
+					     mad_recv_wc->recv_buf.grh,
+					     agent->agent.port_num);
+	if (IS_ERR(rmpp_recv->ah))
+		goto error;
+
+	rmpp_recv->agent = agent;
+	init_completion(&rmpp_recv->comp);
+	INIT_DELAYED_WORK(&rmpp_recv->timeout_work, recv_timeout_handler);
+	INIT_DELAYED_WORK(&rmpp_recv->cleanup_work, recv_cleanup_handler);
+	spin_lock_init(&rmpp_recv->lock);
+	rmpp_recv->state = RMPP_STATE_ACTIVE;
+	atomic_set(&rmpp_recv->refcount, 1);
+
+	rmpp_recv->rmpp_wc = mad_recv_wc;
+	rmpp_recv->cur_seg_buf = &mad_recv_wc->recv_buf;
+	rmpp_recv->newwin = 1;
+	rmpp_recv->seg_num = 1;
+	rmpp_recv->last_ack = 0;
+	rmpp_recv->repwin = 1;
+
+	mad_hdr = &mad_recv_wc->recv_buf.mad->mad_hdr;
+	rmpp_recv->tid = mad_hdr->tid;
+	rmpp_recv->src_qp = mad_recv_wc->wc->src_qp;
+	rmpp_recv->slid = mad_recv_wc->wc->slid;
+	rmpp_recv->mgmt_class = mad_hdr->mgmt_class;
+	rmpp_recv->class_version = mad_hdr->class_version;
+	rmpp_recv->method  = mad_hdr->method;
+	return rmpp_recv;
+
+error:	kfree(rmpp_recv);
+	return NULL;
+}
+
+static struct mad_rmpp_recv *
+find_rmpp_recv(struct ib_mad_agent_private *agent,
+	       struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct mad_rmpp_recv *rmpp_recv;
+	struct ib_mad_hdr *mad_hdr = &mad_recv_wc->recv_buf.mad->mad_hdr;
+
+	list_for_each_entry(rmpp_recv, &agent->rmpp_list, list) {
+		if (rmpp_recv->tid == mad_hdr->tid &&
+		    rmpp_recv->src_qp == mad_recv_wc->wc->src_qp &&
+		    rmpp_recv->slid == mad_recv_wc->wc->slid &&
+		    rmpp_recv->mgmt_class == mad_hdr->mgmt_class &&
+		    rmpp_recv->class_version == mad_hdr->class_version &&
+		    rmpp_recv->method == mad_hdr->method)
+			return rmpp_recv;
+	}
+	return NULL;
+}
+
+static struct mad_rmpp_recv *
+acquire_rmpp_recv(struct ib_mad_agent_private *agent,
+		  struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct mad_rmpp_recv *rmpp_recv;
+	unsigned long flags;
+
+	spin_lock_irqsave(&agent->lock, flags);
+	rmpp_recv = find_rmpp_recv(agent, mad_recv_wc);
+	if (rmpp_recv)
+		atomic_inc(&rmpp_recv->refcount);
+	spin_unlock_irqrestore(&agent->lock, flags);
+	return rmpp_recv;
+}
+
+static struct mad_rmpp_recv *
+insert_rmpp_recv(struct ib_mad_agent_private *agent,
+		 struct mad_rmpp_recv *rmpp_recv)
+{
+	struct mad_rmpp_recv *cur_rmpp_recv;
+
+	cur_rmpp_recv = find_rmpp_recv(agent, rmpp_recv->rmpp_wc);
+	if (!cur_rmpp_recv)
+		list_add_tail(&rmpp_recv->list, &agent->rmpp_list);
+
+	return cur_rmpp_recv;
+}
+
+static inline int get_last_flag(struct ib_mad_recv_buf *seg)
+{
+	struct ib_rmpp_mad *rmpp_mad;
+
+	rmpp_mad = (struct ib_rmpp_mad *) seg->mad;
+	return ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & IB_MGMT_RMPP_FLAG_LAST;
+}
+
+static inline int get_seg_num(struct ib_mad_recv_buf *seg)
+{
+	struct ib_rmpp_mad *rmpp_mad;
+
+	rmpp_mad = (struct ib_rmpp_mad *) seg->mad;
+	return be32_to_cpu(rmpp_mad->rmpp_hdr.seg_num);
+}
+
+static inline struct ib_mad_recv_buf * get_next_seg(struct list_head *rmpp_list,
+						    struct ib_mad_recv_buf *seg)
+{
+	if (seg->list.next == rmpp_list)
+		return NULL;
+
+	return container_of(seg->list.next, struct ib_mad_recv_buf, list);
+}
+
+static inline int window_size(struct ib_mad_agent_private *agent)
+{
+	return max(agent->qp_info->recv_queue.max_active >> 3, 1);
+}
+
+static struct ib_mad_recv_buf * find_seg_location(struct list_head *rmpp_list,
+						  int seg_num)
+{
+	struct ib_mad_recv_buf *seg_buf;
+	int cur_seg_num;
+
+	list_for_each_entry_reverse(seg_buf, rmpp_list, list) {
+		cur_seg_num = get_seg_num(seg_buf);
+		if (seg_num > cur_seg_num)
+			return seg_buf;
+		if (seg_num == cur_seg_num)
+			break;
+	}
+	return NULL;
+}
+
+static void update_seg_num(struct mad_rmpp_recv *rmpp_recv,
+			   struct ib_mad_recv_buf *new_buf)
+{
+	struct list_head *rmpp_list = &rmpp_recv->rmpp_wc->rmpp_list;
+
+	while (new_buf && (get_seg_num(new_buf) == rmpp_recv->seg_num + 1)) {
+		rmpp_recv->cur_seg_buf = new_buf;
+		rmpp_recv->seg_num++;
+		new_buf = get_next_seg(rmpp_list, new_buf);
+	}
+}
+
+static inline int get_mad_len(struct mad_rmpp_recv *rmpp_recv)
+{
+	struct ib_rmpp_mad *rmpp_mad;
+	int hdr_size, data_size, pad;
+
+	rmpp_mad = (struct ib_rmpp_mad *)rmpp_recv->cur_seg_buf->mad;
+
+	hdr_size = ib_get_mad_data_offset(rmpp_mad->mad_hdr.mgmt_class);
+	data_size = sizeof(struct ib_rmpp_mad) - hdr_size;
+	pad = IB_MGMT_RMPP_DATA - be32_to_cpu(rmpp_mad->rmpp_hdr.paylen_newwin);
+	if (pad > IB_MGMT_RMPP_DATA || pad < 0)
+		pad = 0;
+
+	return hdr_size + rmpp_recv->seg_num * data_size - pad;
+}
+
+static struct ib_mad_recv_wc * complete_rmpp(struct mad_rmpp_recv *rmpp_recv)
+{
+	struct ib_mad_recv_wc *rmpp_wc;
+
+	ack_recv(rmpp_recv, rmpp_recv->rmpp_wc);
+	if (rmpp_recv->seg_num > 1)
+		cancel_delayed_work(&rmpp_recv->timeout_work);
+
+	rmpp_wc = rmpp_recv->rmpp_wc;
+	rmpp_wc->mad_len = get_mad_len(rmpp_recv);
+	/* 10 seconds until we can find the packet lifetime */
+	queue_delayed_work(rmpp_recv->agent->qp_info->port_priv->wq,
+			   &rmpp_recv->cleanup_work, msecs_to_jiffies(10000));
+	return rmpp_wc;
+}
+
+static struct ib_mad_recv_wc *
+continue_rmpp(struct ib_mad_agent_private *agent,
+	      struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct mad_rmpp_recv *rmpp_recv;
+	struct ib_mad_recv_buf *prev_buf;
+	struct ib_mad_recv_wc *done_wc;
+	int seg_num;
+	unsigned long flags;
+
+	rmpp_recv = acquire_rmpp_recv(agent, mad_recv_wc);
+	if (!rmpp_recv)
+		goto drop1;
+
+	seg_num = get_seg_num(&mad_recv_wc->recv_buf);
+
+	spin_lock_irqsave(&rmpp_recv->lock, flags);
+	if ((rmpp_recv->state == RMPP_STATE_TIMEOUT) ||
+	    (seg_num > rmpp_recv->newwin))
+		goto drop3;
+
+	if ((seg_num <= rmpp_recv->last_ack) ||
+	    (rmpp_recv->state == RMPP_STATE_COMPLETE)) {
+		spin_unlock_irqrestore(&rmpp_recv->lock, flags);
+		ack_recv(rmpp_recv, mad_recv_wc);
+		goto drop2;
+	}
+
+	prev_buf = find_seg_location(&rmpp_recv->rmpp_wc->rmpp_list, seg_num);
+	if (!prev_buf)
+		goto drop3;
+
+	done_wc = NULL;
+	list_add(&mad_recv_wc->recv_buf.list, &prev_buf->list);
+	if (rmpp_recv->cur_seg_buf == prev_buf) {
+		update_seg_num(rmpp_recv, &mad_recv_wc->recv_buf);
+		if (get_last_flag(rmpp_recv->cur_seg_buf)) {
+			rmpp_recv->state = RMPP_STATE_COMPLETE;
+			spin_unlock_irqrestore(&rmpp_recv->lock, flags);
+			done_wc = complete_rmpp(rmpp_recv);
+			goto out;
+		} else if (rmpp_recv->seg_num == rmpp_recv->newwin) {
+			rmpp_recv->newwin += window_size(agent);
+			spin_unlock_irqrestore(&rmpp_recv->lock, flags);
+			ack_recv(rmpp_recv, mad_recv_wc);
+			goto out;
+		}
+	}
+	spin_unlock_irqrestore(&rmpp_recv->lock, flags);
+out:
+	deref_rmpp_recv(rmpp_recv);
+	return done_wc;
+
+drop3:	spin_unlock_irqrestore(&rmpp_recv->lock, flags);
+drop2:	deref_rmpp_recv(rmpp_recv);
+drop1:	ib_free_recv_mad(mad_recv_wc);
+	return NULL;
+}
+
+static struct ib_mad_recv_wc *
+start_rmpp(struct ib_mad_agent_private *agent,
+	   struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct mad_rmpp_recv *rmpp_recv;
+	unsigned long flags;
+
+	rmpp_recv = create_rmpp_recv(agent, mad_recv_wc);
+	if (!rmpp_recv) {
+		ib_free_recv_mad(mad_recv_wc);
+		return NULL;
+	}
+
+	spin_lock_irqsave(&agent->lock, flags);
+	if (insert_rmpp_recv(agent, rmpp_recv)) {
+		spin_unlock_irqrestore(&agent->lock, flags);
+		/* duplicate first MAD */
+		destroy_rmpp_recv(rmpp_recv);
+		return continue_rmpp(agent, mad_recv_wc);
+	}
+	atomic_inc(&rmpp_recv->refcount);
+
+	if (get_last_flag(&mad_recv_wc->recv_buf)) {
+		rmpp_recv->state = RMPP_STATE_COMPLETE;
+		spin_unlock_irqrestore(&agent->lock, flags);
+		complete_rmpp(rmpp_recv);
+	} else {
+		spin_unlock_irqrestore(&agent->lock, flags);
+		/* 40 seconds until we can find the packet lifetimes */
+		queue_delayed_work(agent->qp_info->port_priv->wq,
+				   &rmpp_recv->timeout_work,
+				   msecs_to_jiffies(40000));
+		rmpp_recv->newwin += window_size(agent);
+		ack_recv(rmpp_recv, mad_recv_wc);
+		mad_recv_wc = NULL;
+	}
+	deref_rmpp_recv(rmpp_recv);
+	return mad_recv_wc;
+}
+
+static int send_next_seg(struct ib_mad_send_wr_private *mad_send_wr)
+{
+	struct ib_rmpp_mad *rmpp_mad;
+	int timeout;
+	u32 paylen = 0;
+
+	rmpp_mad = mad_send_wr->send_buf.mad;
+	ib_set_rmpp_flags(&rmpp_mad->rmpp_hdr, IB_MGMT_RMPP_FLAG_ACTIVE);
+	rmpp_mad->rmpp_hdr.seg_num = cpu_to_be32(++mad_send_wr->seg_num);
+
+	if (mad_send_wr->seg_num == 1) {
+		rmpp_mad->rmpp_hdr.rmpp_rtime_flags |= IB_MGMT_RMPP_FLAG_FIRST;
+		paylen = mad_send_wr->send_buf.seg_count * IB_MGMT_RMPP_DATA -
+			 mad_send_wr->pad;
+	}
+
+	if (mad_send_wr->seg_num == mad_send_wr->send_buf.seg_count) {
+		rmpp_mad->rmpp_hdr.rmpp_rtime_flags |= IB_MGMT_RMPP_FLAG_LAST;
+		paylen = IB_MGMT_RMPP_DATA - mad_send_wr->pad;
+	}
+	rmpp_mad->rmpp_hdr.paylen_newwin = cpu_to_be32(paylen);
+
+	/* 2 seconds for an ACK until we can find the packet lifetime */
+	timeout = mad_send_wr->send_buf.timeout_ms;
+	if (!timeout || timeout > 2000)
+		mad_send_wr->timeout = msecs_to_jiffies(2000);
+
+	return ib_send_mad(mad_send_wr);
+}
+
+static void abort_send(struct ib_mad_agent_private *agent,
+		       struct ib_mad_recv_wc *mad_recv_wc, u8 rmpp_status)
+{
+	struct ib_mad_send_wr_private *mad_send_wr;
+	struct ib_mad_send_wc wc;
+	unsigned long flags;
+
+	spin_lock_irqsave(&agent->lock, flags);
+	mad_send_wr = ib_find_send_mad(agent, mad_recv_wc);
+	if (!mad_send_wr)
+		goto out;	/* Unmatched send */
+
+	if ((mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) ||
+	    (!mad_send_wr->timeout) || (mad_send_wr->status != IB_WC_SUCCESS))
+		goto out;	/* Send is already done */
+
+	ib_mark_mad_done(mad_send_wr);
+	spin_unlock_irqrestore(&agent->lock, flags);
+
+	wc.status = IB_WC_REM_ABORT_ERR;
+	wc.vendor_err = rmpp_status;
+	wc.send_buf = &mad_send_wr->send_buf;
+	ib_mad_complete_send_wr(mad_send_wr, &wc);
+	return;
+out:
+	spin_unlock_irqrestore(&agent->lock, flags);
+}
+
+static inline void adjust_last_ack(struct ib_mad_send_wr_private *wr,
+				   int seg_num)
+{
+	struct list_head *list;
+
+	wr->last_ack = seg_num;
+	list = &wr->last_ack_seg->list;
+	list_for_each_entry(wr->last_ack_seg, list, list)
+		if (wr->last_ack_seg->num == seg_num)
+			break;
+}
+
+static void process_ds_ack(struct ib_mad_agent_private *agent,
+			   struct ib_mad_recv_wc *mad_recv_wc, int newwin)
+{
+	struct mad_rmpp_recv *rmpp_recv;
+
+	rmpp_recv = find_rmpp_recv(agent, mad_recv_wc);
+	if (rmpp_recv && rmpp_recv->state == RMPP_STATE_COMPLETE)
+		rmpp_recv->repwin = newwin;
+}
+
+static void process_rmpp_ack(struct ib_mad_agent_private *agent,
+			     struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct ib_mad_send_wr_private *mad_send_wr;
+	struct ib_rmpp_mad *rmpp_mad;
+	unsigned long flags;
+	int seg_num, newwin, ret;
+
+	rmpp_mad = (struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad;
+	if (rmpp_mad->rmpp_hdr.rmpp_status) {
+		abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS);
+		nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS);
+		return;
+	}
+
+	seg_num = be32_to_cpu(rmpp_mad->rmpp_hdr.seg_num);
+	newwin = be32_to_cpu(rmpp_mad->rmpp_hdr.paylen_newwin);
+	if (newwin < seg_num) {
+		abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_W2S);
+		nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_W2S);
+		return;
+	}
+
+	spin_lock_irqsave(&agent->lock, flags);
+	mad_send_wr = ib_find_send_mad(agent, mad_recv_wc);
+	if (!mad_send_wr) {
+		if (!seg_num)
+			process_ds_ack(agent, mad_recv_wc, newwin);
+		goto out;	/* Unmatched or DS RMPP ACK */
+	}
+
+	if ((mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) &&
+	    (mad_send_wr->timeout)) {
+		spin_unlock_irqrestore(&agent->lock, flags);
+		ack_ds_ack(agent, mad_recv_wc);
+		return;		/* Repeated ACK for DS RMPP transaction */
+	}
+
+	if ((mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) ||
+	    (!mad_send_wr->timeout) || (mad_send_wr->status != IB_WC_SUCCESS))
+		goto out;	/* Send is already done */
+
+	if (seg_num > mad_send_wr->send_buf.seg_count ||
+	    seg_num > mad_send_wr->newwin) {
+		spin_unlock_irqrestore(&agent->lock, flags);
+		abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_S2B);
+		nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_S2B);
+		return;
+	}
+
+	if (newwin < mad_send_wr->newwin || seg_num < mad_send_wr->last_ack)
+		goto out;	/* Old ACK */
+
+	if (seg_num > mad_send_wr->last_ack) {
+		adjust_last_ack(mad_send_wr, seg_num);
+		mad_send_wr->retries_left = mad_send_wr->max_retries;
+	}
+	mad_send_wr->newwin = newwin;
+	if (mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) {
+		/* If no response is expected, the ACK completes the send */
+		if (!mad_send_wr->send_buf.timeout_ms) {
+			struct ib_mad_send_wc wc;
+
+			ib_mark_mad_done(mad_send_wr);
+			spin_unlock_irqrestore(&agent->lock, flags);
+
+			wc.status = IB_WC_SUCCESS;
+			wc.vendor_err = 0;
+			wc.send_buf = &mad_send_wr->send_buf;
+			ib_mad_complete_send_wr(mad_send_wr, &wc);
+			return;
+		}
+		if (mad_send_wr->refcount == 1)
+			ib_reset_mad_timeout(mad_send_wr,
+					     mad_send_wr->send_buf.timeout_ms);
+		spin_unlock_irqrestore(&agent->lock, flags);
+		ack_ds_ack(agent, mad_recv_wc);
+		return;
+	} else if (mad_send_wr->refcount == 1 &&
+		   mad_send_wr->seg_num < mad_send_wr->newwin &&
+		   mad_send_wr->seg_num < mad_send_wr->send_buf.seg_count) {
+		/* Send failure will just result in a timeout/retry */
+		ret = send_next_seg(mad_send_wr);
+		if (ret)
+			goto out;
+
+		mad_send_wr->refcount++;
+		list_move_tail(&mad_send_wr->agent_list,
+			      &mad_send_wr->mad_agent_priv->send_list);
+	}
+out:
+	spin_unlock_irqrestore(&agent->lock, flags);
+}
+
+static struct ib_mad_recv_wc *
+process_rmpp_data(struct ib_mad_agent_private *agent,
+		  struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct ib_rmpp_hdr *rmpp_hdr;
+	u8 rmpp_status;
+
+	rmpp_hdr = &((struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad)->rmpp_hdr;
+
+	if (rmpp_hdr->rmpp_status) {
+		rmpp_status = IB_MGMT_RMPP_STATUS_BAD_STATUS;
+		goto bad;
+	}
+
+	if (rmpp_hdr->seg_num == cpu_to_be32(1)) {
+		if (!(ib_get_rmpp_flags(rmpp_hdr) & IB_MGMT_RMPP_FLAG_FIRST)) {
+			rmpp_status = IB_MGMT_RMPP_STATUS_BAD_SEG;
+			goto bad;
+		}
+		return start_rmpp(agent, mad_recv_wc);
+	} else {
+		if (ib_get_rmpp_flags(rmpp_hdr) & IB_MGMT_RMPP_FLAG_FIRST) {
+			rmpp_status = IB_MGMT_RMPP_STATUS_BAD_SEG;
+			goto bad;
+		}
+		return continue_rmpp(agent, mad_recv_wc);
+	}
+bad:
+	nack_recv(agent, mad_recv_wc, rmpp_status);
+	ib_free_recv_mad(mad_recv_wc);
+	return NULL;
+}
+
+static void process_rmpp_stop(struct ib_mad_agent_private *agent,
+			      struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct ib_rmpp_mad *rmpp_mad;
+
+	rmpp_mad = (struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad;
+
+	if (rmpp_mad->rmpp_hdr.rmpp_status != IB_MGMT_RMPP_STATUS_RESX) {
+		abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS);
+		nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS);
+	} else
+		abort_send(agent, mad_recv_wc, rmpp_mad->rmpp_hdr.rmpp_status);
+}
+
+static void process_rmpp_abort(struct ib_mad_agent_private *agent,
+			       struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct ib_rmpp_mad *rmpp_mad;
+
+	rmpp_mad = (struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad;
+
+	if (rmpp_mad->rmpp_hdr.rmpp_status < IB_MGMT_RMPP_STATUS_ABORT_MIN ||
+	    rmpp_mad->rmpp_hdr.rmpp_status > IB_MGMT_RMPP_STATUS_ABORT_MAX) {
+		abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS);
+		nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS);
+	} else
+		abort_send(agent, mad_recv_wc, rmpp_mad->rmpp_hdr.rmpp_status);
+}
+
+struct ib_mad_recv_wc *
+ib_process_rmpp_recv_wc(struct ib_mad_agent_private *agent,
+			struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct ib_rmpp_mad *rmpp_mad;
+
+	rmpp_mad = (struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad;
+	if (!(rmpp_mad->rmpp_hdr.rmpp_rtime_flags & IB_MGMT_RMPP_FLAG_ACTIVE))
+		return mad_recv_wc;
+
+	if (rmpp_mad->rmpp_hdr.rmpp_version != IB_MGMT_RMPP_VERSION) {
+		abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_UNV);
+		nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_UNV);
+		goto out;
+	}
+
+	switch (rmpp_mad->rmpp_hdr.rmpp_type) {
+	case IB_MGMT_RMPP_TYPE_DATA:
+		return process_rmpp_data(agent, mad_recv_wc);
+	case IB_MGMT_RMPP_TYPE_ACK:
+		process_rmpp_ack(agent, mad_recv_wc);
+		break;
+	case IB_MGMT_RMPP_TYPE_STOP:
+		process_rmpp_stop(agent, mad_recv_wc);
+		break;
+	case IB_MGMT_RMPP_TYPE_ABORT:
+		process_rmpp_abort(agent, mad_recv_wc);
+		break;
+	default:
+		abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BADT);
+		nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BADT);
+		break;
+	}
+out:
+	ib_free_recv_mad(mad_recv_wc);
+	return NULL;
+}
+
+static int init_newwin(struct ib_mad_send_wr_private *mad_send_wr)
+{
+	struct ib_mad_agent_private *agent = mad_send_wr->mad_agent_priv;
+	struct ib_mad_hdr *mad_hdr = mad_send_wr->send_buf.mad;
+	struct mad_rmpp_recv *rmpp_recv;
+	struct ib_ah_attr ah_attr;
+	unsigned long flags;
+	int newwin = 1;
+
+	if (!(mad_hdr->method & IB_MGMT_METHOD_RESP))
+		goto out;
+
+	spin_lock_irqsave(&agent->lock, flags);
+	list_for_each_entry(rmpp_recv, &agent->rmpp_list, list) {
+		if (rmpp_recv->tid != mad_hdr->tid ||
+		    rmpp_recv->mgmt_class != mad_hdr->mgmt_class ||
+		    rmpp_recv->class_version != mad_hdr->class_version ||
+		    (rmpp_recv->method & IB_MGMT_METHOD_RESP))
+			continue;
+
+		if (ib_query_ah(mad_send_wr->send_buf.ah, &ah_attr))
+			continue;
+
+		if (rmpp_recv->slid == ah_attr.dlid) {
+			newwin = rmpp_recv->repwin;
+			break;
+		}
+	}
+	spin_unlock_irqrestore(&agent->lock, flags);
+out:
+	return newwin;
+}
+
+int ib_send_rmpp_mad(struct ib_mad_send_wr_private *mad_send_wr)
+{
+	struct ib_rmpp_mad *rmpp_mad;
+	int ret;
+
+	rmpp_mad = mad_send_wr->send_buf.mad;
+	if (!(ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) &
+	      IB_MGMT_RMPP_FLAG_ACTIVE))
+		return IB_RMPP_RESULT_UNHANDLED;
+
+	if (rmpp_mad->rmpp_hdr.rmpp_type != IB_MGMT_RMPP_TYPE_DATA) {
+		mad_send_wr->seg_num = 1;
+		return IB_RMPP_RESULT_INTERNAL;
+	}
+
+	mad_send_wr->newwin = init_newwin(mad_send_wr);
+
+	/* We need to wait for the final ACK even if there isn't a response */
+	mad_send_wr->refcount += (mad_send_wr->timeout == 0);
+	ret = send_next_seg(mad_send_wr);
+	if (!ret)
+		return IB_RMPP_RESULT_CONSUMED;
+	return ret;
+}
+
+int ib_process_rmpp_send_wc(struct ib_mad_send_wr_private *mad_send_wr,
+			    struct ib_mad_send_wc *mad_send_wc)
+{
+	struct ib_rmpp_mad *rmpp_mad;
+	int ret;
+
+	rmpp_mad = mad_send_wr->send_buf.mad;
+	if (!(ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) &
+	      IB_MGMT_RMPP_FLAG_ACTIVE))
+		return IB_RMPP_RESULT_UNHANDLED; /* RMPP not active */
+
+	if (rmpp_mad->rmpp_hdr.rmpp_type != IB_MGMT_RMPP_TYPE_DATA)
+		return IB_RMPP_RESULT_INTERNAL;	 /* ACK, STOP, or ABORT */
+
+	if (mad_send_wc->status != IB_WC_SUCCESS ||
+	    mad_send_wr->status != IB_WC_SUCCESS)
+		return IB_RMPP_RESULT_PROCESSED; /* Canceled or send error */
+
+	if (!mad_send_wr->timeout)
+		return IB_RMPP_RESULT_PROCESSED; /* Response received */
+
+	if (mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) {
+		mad_send_wr->timeout =
+			msecs_to_jiffies(mad_send_wr->send_buf.timeout_ms);
+		return IB_RMPP_RESULT_PROCESSED; /* Send done */
+	}
+
+	if (mad_send_wr->seg_num == mad_send_wr->newwin ||
+	    mad_send_wr->seg_num == mad_send_wr->send_buf.seg_count)
+		return IB_RMPP_RESULT_PROCESSED; /* Wait for ACK */
+
+	ret = send_next_seg(mad_send_wr);
+	if (ret) {
+		mad_send_wc->status = IB_WC_GENERAL_ERR;
+		return IB_RMPP_RESULT_PROCESSED;
+	}
+	return IB_RMPP_RESULT_CONSUMED;
+}
+
+int ib_retry_rmpp(struct ib_mad_send_wr_private *mad_send_wr)
+{
+	struct ib_rmpp_mad *rmpp_mad;
+	int ret;
+
+	rmpp_mad = mad_send_wr->send_buf.mad;
+	if (!(ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) &
+	      IB_MGMT_RMPP_FLAG_ACTIVE))
+		return IB_RMPP_RESULT_UNHANDLED; /* RMPP not active */
+
+	if (mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count)
+		return IB_RMPP_RESULT_PROCESSED;
+
+	mad_send_wr->seg_num = mad_send_wr->last_ack;
+	mad_send_wr->cur_seg = mad_send_wr->last_ack_seg;
+
+	ret = send_next_seg(mad_send_wr);
+	if (ret)
+		return IB_RMPP_RESULT_PROCESSED;
+
+	return IB_RMPP_RESULT_CONSUMED;
+}
diff --git a/drivers/infiniband/core/mad_rmpp.h b/drivers/infiniband/core/mad_rmpp.h
new file mode 100644
index 0000000..3d336bf
--- /dev/null
+++ b/drivers/infiniband/core/mad_rmpp.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2005 Intel Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __MAD_RMPP_H__
+#define __MAD_RMPP_H__
+
+enum {
+	IB_RMPP_RESULT_PROCESSED,
+	IB_RMPP_RESULT_CONSUMED,
+	IB_RMPP_RESULT_INTERNAL,
+	IB_RMPP_RESULT_UNHANDLED
+};
+
+int ib_send_rmpp_mad(struct ib_mad_send_wr_private *mad_send_wr);
+
+struct ib_mad_recv_wc *
+ib_process_rmpp_recv_wc(struct ib_mad_agent_private *agent,
+			struct ib_mad_recv_wc *mad_recv_wc);
+
+int ib_process_rmpp_send_wc(struct ib_mad_send_wr_private *mad_send_wr,
+			    struct ib_mad_send_wc *mad_send_wc);
+
+void ib_rmpp_send_handler(struct ib_mad_send_wc *mad_send_wc);
+
+void ib_cancel_rmpp_recvs(struct ib_mad_agent_private *agent);
+
+int ib_retry_rmpp(struct ib_mad_send_wr_private *mad_send_wr);
+
+#endif	/* __MAD_RMPP_H__ */
diff --git a/drivers/infiniband/core/multicast.c b/drivers/infiniband/core/multicast.c
new file mode 100644
index 0000000..d2360a8
--- /dev/null
+++ b/drivers/infiniband/core/multicast.c
@@ -0,0 +1,898 @@
+/*
+ * Copyright (c) 2006 Intel Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/completion.h>
+#include <linux/dma-mapping.h>
+#include <linux/err.h>
+#include <linux/interrupt.h>
+#include <linux/export.h>
+#include <linux/slab.h>
+#include <linux/bitops.h>
+#include <linux/random.h>
+
+#include <rdma/ib_cache.h>
+#include "sa.h"
+
+static void mcast_add_one(struct ib_device *device);
+static void mcast_remove_one(struct ib_device *device);
+
+static struct ib_client mcast_client = {
+	.name   = "ib_multicast",
+	.add    = mcast_add_one,
+	.remove = mcast_remove_one
+};
+
+static struct ib_sa_client	sa_client;
+static struct workqueue_struct	*mcast_wq;
+static union ib_gid mgid0;
+
+struct mcast_device;
+
+struct mcast_port {
+	struct mcast_device	*dev;
+	spinlock_t		lock;
+	struct rb_root		table;
+	atomic_t		refcount;
+	struct completion	comp;
+	u8			port_num;
+};
+
+struct mcast_device {
+	struct ib_device	*device;
+	struct ib_event_handler	event_handler;
+	int			start_port;
+	int			end_port;
+	struct mcast_port	port[0];
+};
+
+enum mcast_state {
+	MCAST_JOINING,
+	MCAST_MEMBER,
+	MCAST_ERROR,
+};
+
+enum mcast_group_state {
+	MCAST_IDLE,
+	MCAST_BUSY,
+	MCAST_GROUP_ERROR,
+	MCAST_PKEY_EVENT
+};
+
+enum {
+	MCAST_INVALID_PKEY_INDEX = 0xFFFF
+};
+
+struct mcast_member;
+
+struct mcast_group {
+	struct ib_sa_mcmember_rec rec;
+	struct rb_node		node;
+	struct mcast_port	*port;
+	spinlock_t		lock;
+	struct work_struct	work;
+	struct list_head	pending_list;
+	struct list_head	active_list;
+	struct mcast_member	*last_join;
+	int			members[3];
+	atomic_t		refcount;
+	enum mcast_group_state	state;
+	struct ib_sa_query	*query;
+	int			query_id;
+	u16			pkey_index;
+	u8			leave_state;
+	int			retries;
+};
+
+struct mcast_member {
+	struct ib_sa_multicast	multicast;
+	struct ib_sa_client	*client;
+	struct mcast_group	*group;
+	struct list_head	list;
+	enum mcast_state	state;
+	atomic_t		refcount;
+	struct completion	comp;
+};
+
+static void join_handler(int status, struct ib_sa_mcmember_rec *rec,
+			 void *context);
+static void leave_handler(int status, struct ib_sa_mcmember_rec *rec,
+			  void *context);
+
+static struct mcast_group *mcast_find(struct mcast_port *port,
+				      union ib_gid *mgid)
+{
+	struct rb_node *node = port->table.rb_node;
+	struct mcast_group *group;
+	int ret;
+
+	while (node) {
+		group = rb_entry(node, struct mcast_group, node);
+		ret = memcmp(mgid->raw, group->rec.mgid.raw, sizeof *mgid);
+		if (!ret)
+			return group;
+
+		if (ret < 0)
+			node = node->rb_left;
+		else
+			node = node->rb_right;
+	}
+	return NULL;
+}
+
+static struct mcast_group *mcast_insert(struct mcast_port *port,
+					struct mcast_group *group,
+					int allow_duplicates)
+{
+	struct rb_node **link = &port->table.rb_node;
+	struct rb_node *parent = NULL;
+	struct mcast_group *cur_group;
+	int ret;
+
+	while (*link) {
+		parent = *link;
+		cur_group = rb_entry(parent, struct mcast_group, node);
+
+		ret = memcmp(group->rec.mgid.raw, cur_group->rec.mgid.raw,
+			     sizeof group->rec.mgid);
+		if (ret < 0)
+			link = &(*link)->rb_left;
+		else if (ret > 0)
+			link = &(*link)->rb_right;
+		else if (allow_duplicates)
+			link = &(*link)->rb_left;
+		else
+			return cur_group;
+	}
+	rb_link_node(&group->node, parent, link);
+	rb_insert_color(&group->node, &port->table);
+	return NULL;
+}
+
+static void deref_port(struct mcast_port *port)
+{
+	if (atomic_dec_and_test(&port->refcount))
+		complete(&port->comp);
+}
+
+static void release_group(struct mcast_group *group)
+{
+	struct mcast_port *port = group->port;
+	unsigned long flags;
+
+	spin_lock_irqsave(&port->lock, flags);
+	if (atomic_dec_and_test(&group->refcount)) {
+		rb_erase(&group->node, &port->table);
+		spin_unlock_irqrestore(&port->lock, flags);
+		kfree(group);
+		deref_port(port);
+	} else
+		spin_unlock_irqrestore(&port->lock, flags);
+}
+
+static void deref_member(struct mcast_member *member)
+{
+	if (atomic_dec_and_test(&member->refcount))
+		complete(&member->comp);
+}
+
+static void queue_join(struct mcast_member *member)
+{
+	struct mcast_group *group = member->group;
+	unsigned long flags;
+
+	spin_lock_irqsave(&group->lock, flags);
+	list_add_tail(&member->list, &group->pending_list);
+	if (group->state == MCAST_IDLE) {
+		group->state = MCAST_BUSY;
+		atomic_inc(&group->refcount);
+		queue_work(mcast_wq, &group->work);
+	}
+	spin_unlock_irqrestore(&group->lock, flags);
+}
+
+/*
+ * A multicast group has three types of members: full member, non member, and
+ * send only member.  We need to keep track of the number of members of each
+ * type based on their join state.  Adjust the number of members the belong to
+ * the specified join states.
+ */
+static void adjust_membership(struct mcast_group *group, u8 join_state, int inc)
+{
+	int i;
+
+	for (i = 0; i < 3; i++, join_state >>= 1)
+		if (join_state & 0x1)
+			group->members[i] += inc;
+}
+
+/*
+ * If a multicast group has zero members left for a particular join state, but
+ * the group is still a member with the SA, we need to leave that join state.
+ * Determine which join states we still belong to, but that do not have any
+ * active members.
+ */
+static u8 get_leave_state(struct mcast_group *group)
+{
+	u8 leave_state = 0;
+	int i;
+
+	for (i = 0; i < 3; i++)
+		if (!group->members[i])
+			leave_state |= (0x1 << i);
+
+	return leave_state & group->rec.join_state;
+}
+
+static int check_selector(ib_sa_comp_mask comp_mask,
+			  ib_sa_comp_mask selector_mask,
+			  ib_sa_comp_mask value_mask,
+			  u8 selector, u8 src_value, u8 dst_value)
+{
+	int err;
+
+	if (!(comp_mask & selector_mask) || !(comp_mask & value_mask))
+		return 0;
+
+	switch (selector) {
+	case IB_SA_GT:
+		err = (src_value <= dst_value);
+		break;
+	case IB_SA_LT:
+		err = (src_value >= dst_value);
+		break;
+	case IB_SA_EQ:
+		err = (src_value != dst_value);
+		break;
+	default:
+		err = 0;
+		break;
+	}
+
+	return err;
+}
+
+static int cmp_rec(struct ib_sa_mcmember_rec *src,
+		   struct ib_sa_mcmember_rec *dst, ib_sa_comp_mask comp_mask)
+{
+	/* MGID must already match */
+
+	if (comp_mask & IB_SA_MCMEMBER_REC_PORT_GID &&
+	    memcmp(&src->port_gid, &dst->port_gid, sizeof src->port_gid))
+		return -EINVAL;
+	if (comp_mask & IB_SA_MCMEMBER_REC_QKEY && src->qkey != dst->qkey)
+		return -EINVAL;
+	if (comp_mask & IB_SA_MCMEMBER_REC_MLID && src->mlid != dst->mlid)
+		return -EINVAL;
+	if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_MTU_SELECTOR,
+			   IB_SA_MCMEMBER_REC_MTU, dst->mtu_selector,
+			   src->mtu, dst->mtu))
+		return -EINVAL;
+	if (comp_mask & IB_SA_MCMEMBER_REC_TRAFFIC_CLASS &&
+	    src->traffic_class != dst->traffic_class)
+		return -EINVAL;
+	if (comp_mask & IB_SA_MCMEMBER_REC_PKEY && src->pkey != dst->pkey)
+		return -EINVAL;
+	if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_RATE_SELECTOR,
+			   IB_SA_MCMEMBER_REC_RATE, dst->rate_selector,
+			   src->rate, dst->rate))
+		return -EINVAL;
+	if (check_selector(comp_mask,
+			   IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME_SELECTOR,
+			   IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME,
+			   dst->packet_life_time_selector,
+			   src->packet_life_time, dst->packet_life_time))
+		return -EINVAL;
+	if (comp_mask & IB_SA_MCMEMBER_REC_SL && src->sl != dst->sl)
+		return -EINVAL;
+	if (comp_mask & IB_SA_MCMEMBER_REC_FLOW_LABEL &&
+	    src->flow_label != dst->flow_label)
+		return -EINVAL;
+	if (comp_mask & IB_SA_MCMEMBER_REC_HOP_LIMIT &&
+	    src->hop_limit != dst->hop_limit)
+		return -EINVAL;
+	if (comp_mask & IB_SA_MCMEMBER_REC_SCOPE && src->scope != dst->scope)
+		return -EINVAL;
+
+	/* join_state checked separately, proxy_join ignored */
+
+	return 0;
+}
+
+static int send_join(struct mcast_group *group, struct mcast_member *member)
+{
+	struct mcast_port *port = group->port;
+	int ret;
+
+	group->last_join = member;
+	ret = ib_sa_mcmember_rec_query(&sa_client, port->dev->device,
+				       port->port_num, IB_MGMT_METHOD_SET,
+				       &member->multicast.rec,
+				       member->multicast.comp_mask,
+				       3000, GFP_KERNEL, join_handler, group,
+				       &group->query);
+	if (ret >= 0) {
+		group->query_id = ret;
+		ret = 0;
+	}
+	return ret;
+}
+
+static int send_leave(struct mcast_group *group, u8 leave_state)
+{
+	struct mcast_port *port = group->port;
+	struct ib_sa_mcmember_rec rec;
+	int ret;
+
+	rec = group->rec;
+	rec.join_state = leave_state;
+	group->leave_state = leave_state;
+
+	ret = ib_sa_mcmember_rec_query(&sa_client, port->dev->device,
+				       port->port_num, IB_SA_METHOD_DELETE, &rec,
+				       IB_SA_MCMEMBER_REC_MGID     |
+				       IB_SA_MCMEMBER_REC_PORT_GID |
+				       IB_SA_MCMEMBER_REC_JOIN_STATE,
+				       3000, GFP_KERNEL, leave_handler,
+				       group, &group->query);
+	if (ret >= 0) {
+		group->query_id = ret;
+		ret = 0;
+	}
+	return ret;
+}
+
+static void join_group(struct mcast_group *group, struct mcast_member *member,
+		       u8 join_state)
+{
+	member->state = MCAST_MEMBER;
+	adjust_membership(group, join_state, 1);
+	group->rec.join_state |= join_state;
+	member->multicast.rec = group->rec;
+	member->multicast.rec.join_state = join_state;
+	list_move(&member->list, &group->active_list);
+}
+
+static int fail_join(struct mcast_group *group, struct mcast_member *member,
+		     int status)
+{
+	spin_lock_irq(&group->lock);
+	list_del_init(&member->list);
+	spin_unlock_irq(&group->lock);
+	return member->multicast.callback(status, &member->multicast);
+}
+
+static void process_group_error(struct mcast_group *group)
+{
+	struct mcast_member *member;
+	int ret = 0;
+	u16 pkey_index;
+
+	if (group->state == MCAST_PKEY_EVENT)
+		ret = ib_find_pkey(group->port->dev->device,
+				   group->port->port_num,
+				   be16_to_cpu(group->rec.pkey), &pkey_index);
+
+	spin_lock_irq(&group->lock);
+	if (group->state == MCAST_PKEY_EVENT && !ret &&
+	    group->pkey_index == pkey_index)
+		goto out;
+
+	while (!list_empty(&group->active_list)) {
+		member = list_entry(group->active_list.next,
+				    struct mcast_member, list);
+		atomic_inc(&member->refcount);
+		list_del_init(&member->list);
+		adjust_membership(group, member->multicast.rec.join_state, -1);
+		member->state = MCAST_ERROR;
+		spin_unlock_irq(&group->lock);
+
+		ret = member->multicast.callback(-ENETRESET,
+						 &member->multicast);
+		deref_member(member);
+		if (ret)
+			ib_sa_free_multicast(&member->multicast);
+		spin_lock_irq(&group->lock);
+	}
+
+	group->rec.join_state = 0;
+out:
+	group->state = MCAST_BUSY;
+	spin_unlock_irq(&group->lock);
+}
+
+static void mcast_work_handler(struct work_struct *work)
+{
+	struct mcast_group *group;
+	struct mcast_member *member;
+	struct ib_sa_multicast *multicast;
+	int status, ret;
+	u8 join_state;
+
+	group = container_of(work, typeof(*group), work);
+retest:
+	spin_lock_irq(&group->lock);
+	while (!list_empty(&group->pending_list) ||
+	       (group->state != MCAST_BUSY)) {
+
+		if (group->state != MCAST_BUSY) {
+			spin_unlock_irq(&group->lock);
+			process_group_error(group);
+			goto retest;
+		}
+
+		member = list_entry(group->pending_list.next,
+				    struct mcast_member, list);
+		multicast = &member->multicast;
+		join_state = multicast->rec.join_state;
+		atomic_inc(&member->refcount);
+
+		if (join_state == (group->rec.join_state & join_state)) {
+			status = cmp_rec(&group->rec, &multicast->rec,
+					 multicast->comp_mask);
+			if (!status)
+				join_group(group, member, join_state);
+			else
+				list_del_init(&member->list);
+			spin_unlock_irq(&group->lock);
+			ret = multicast->callback(status, multicast);
+		} else {
+			spin_unlock_irq(&group->lock);
+			status = send_join(group, member);
+			if (!status) {
+				deref_member(member);
+				return;
+			}
+			ret = fail_join(group, member, status);
+		}
+
+		deref_member(member);
+		if (ret)
+			ib_sa_free_multicast(&member->multicast);
+		spin_lock_irq(&group->lock);
+	}
+
+	join_state = get_leave_state(group);
+	if (join_state) {
+		group->rec.join_state &= ~join_state;
+		spin_unlock_irq(&group->lock);
+		if (send_leave(group, join_state))
+			goto retest;
+	} else {
+		group->state = MCAST_IDLE;
+		spin_unlock_irq(&group->lock);
+		release_group(group);
+	}
+}
+
+/*
+ * Fail a join request if it is still active - at the head of the pending queue.
+ */
+static void process_join_error(struct mcast_group *group, int status)
+{
+	struct mcast_member *member;
+	int ret;
+
+	spin_lock_irq(&group->lock);
+	member = list_entry(group->pending_list.next,
+			    struct mcast_member, list);
+	if (group->last_join == member) {
+		atomic_inc(&member->refcount);
+		list_del_init(&member->list);
+		spin_unlock_irq(&group->lock);
+		ret = member->multicast.callback(status, &member->multicast);
+		deref_member(member);
+		if (ret)
+			ib_sa_free_multicast(&member->multicast);
+	} else
+		spin_unlock_irq(&group->lock);
+}
+
+static void join_handler(int status, struct ib_sa_mcmember_rec *rec,
+			 void *context)
+{
+	struct mcast_group *group = context;
+	u16 pkey_index = MCAST_INVALID_PKEY_INDEX;
+
+	if (status)
+		process_join_error(group, status);
+	else {
+		ib_find_pkey(group->port->dev->device, group->port->port_num,
+			     be16_to_cpu(rec->pkey), &pkey_index);
+
+		spin_lock_irq(&group->port->lock);
+		group->rec = *rec;
+		if (group->state == MCAST_BUSY &&
+		    group->pkey_index == MCAST_INVALID_PKEY_INDEX)
+			group->pkey_index = pkey_index;
+		if (!memcmp(&mgid0, &group->rec.mgid, sizeof mgid0)) {
+			rb_erase(&group->node, &group->port->table);
+			mcast_insert(group->port, group, 1);
+		}
+		spin_unlock_irq(&group->port->lock);
+	}
+	mcast_work_handler(&group->work);
+}
+
+static void leave_handler(int status, struct ib_sa_mcmember_rec *rec,
+			  void *context)
+{
+	struct mcast_group *group = context;
+
+	if (status && group->retries > 0 &&
+	    !send_leave(group, group->leave_state))
+		group->retries--;
+	else
+		mcast_work_handler(&group->work);
+}
+
+static struct mcast_group *acquire_group(struct mcast_port *port,
+					 union ib_gid *mgid, gfp_t gfp_mask)
+{
+	struct mcast_group *group, *cur_group;
+	unsigned long flags;
+	int is_mgid0;
+
+	is_mgid0 = !memcmp(&mgid0, mgid, sizeof mgid0);
+	if (!is_mgid0) {
+		spin_lock_irqsave(&port->lock, flags);
+		group = mcast_find(port, mgid);
+		if (group)
+			goto found;
+		spin_unlock_irqrestore(&port->lock, flags);
+	}
+
+	group = kzalloc(sizeof *group, gfp_mask);
+	if (!group)
+		return NULL;
+
+	group->retries = 3;
+	group->port = port;
+	group->rec.mgid = *mgid;
+	group->pkey_index = MCAST_INVALID_PKEY_INDEX;
+	INIT_LIST_HEAD(&group->pending_list);
+	INIT_LIST_HEAD(&group->active_list);
+	INIT_WORK(&group->work, mcast_work_handler);
+	spin_lock_init(&group->lock);
+
+	spin_lock_irqsave(&port->lock, flags);
+	cur_group = mcast_insert(port, group, is_mgid0);
+	if (cur_group) {
+		kfree(group);
+		group = cur_group;
+	} else
+		atomic_inc(&port->refcount);
+found:
+	atomic_inc(&group->refcount);
+	spin_unlock_irqrestore(&port->lock, flags);
+	return group;
+}
+
+/*
+ * We serialize all join requests to a single group to make our lives much
+ * easier.  Otherwise, two users could try to join the same group
+ * simultaneously, with different configurations, one could leave while the
+ * join is in progress, etc., which makes locking around error recovery
+ * difficult.
+ */
+struct ib_sa_multicast *
+ib_sa_join_multicast(struct ib_sa_client *client,
+		     struct ib_device *device, u8 port_num,
+		     struct ib_sa_mcmember_rec *rec,
+		     ib_sa_comp_mask comp_mask, gfp_t gfp_mask,
+		     int (*callback)(int status,
+				     struct ib_sa_multicast *multicast),
+		     void *context)
+{
+	struct mcast_device *dev;
+	struct mcast_member *member;
+	struct ib_sa_multicast *multicast;
+	int ret;
+
+	dev = ib_get_client_data(device, &mcast_client);
+	if (!dev)
+		return ERR_PTR(-ENODEV);
+
+	member = kmalloc(sizeof *member, gfp_mask);
+	if (!member)
+		return ERR_PTR(-ENOMEM);
+
+	ib_sa_client_get(client);
+	member->client = client;
+	member->multicast.rec = *rec;
+	member->multicast.comp_mask = comp_mask;
+	member->multicast.callback = callback;
+	member->multicast.context = context;
+	init_completion(&member->comp);
+	atomic_set(&member->refcount, 1);
+	member->state = MCAST_JOINING;
+
+	member->group = acquire_group(&dev->port[port_num - dev->start_port],
+				      &rec->mgid, gfp_mask);
+	if (!member->group) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	/*
+	 * The user will get the multicast structure in their callback.  They
+	 * could then free the multicast structure before we can return from
+	 * this routine.  So we save the pointer to return before queuing
+	 * any callback.
+	 */
+	multicast = &member->multicast;
+	queue_join(member);
+	return multicast;
+
+err:
+	ib_sa_client_put(client);
+	kfree(member);
+	return ERR_PTR(ret);
+}
+EXPORT_SYMBOL(ib_sa_join_multicast);
+
+void ib_sa_free_multicast(struct ib_sa_multicast *multicast)
+{
+	struct mcast_member *member;
+	struct mcast_group *group;
+
+	member = container_of(multicast, struct mcast_member, multicast);
+	group = member->group;
+
+	spin_lock_irq(&group->lock);
+	if (member->state == MCAST_MEMBER)
+		adjust_membership(group, multicast->rec.join_state, -1);
+
+	list_del_init(&member->list);
+
+	if (group->state == MCAST_IDLE) {
+		group->state = MCAST_BUSY;
+		spin_unlock_irq(&group->lock);
+		/* Continue to hold reference on group until callback */
+		queue_work(mcast_wq, &group->work);
+	} else {
+		spin_unlock_irq(&group->lock);
+		release_group(group);
+	}
+
+	deref_member(member);
+	wait_for_completion(&member->comp);
+	ib_sa_client_put(member->client);
+	kfree(member);
+}
+EXPORT_SYMBOL(ib_sa_free_multicast);
+
+int ib_sa_get_mcmember_rec(struct ib_device *device, u8 port_num,
+			   union ib_gid *mgid, struct ib_sa_mcmember_rec *rec)
+{
+	struct mcast_device *dev;
+	struct mcast_port *port;
+	struct mcast_group *group;
+	unsigned long flags;
+	int ret = 0;
+
+	dev = ib_get_client_data(device, &mcast_client);
+	if (!dev)
+		return -ENODEV;
+
+	port = &dev->port[port_num - dev->start_port];
+	spin_lock_irqsave(&port->lock, flags);
+	group = mcast_find(port, mgid);
+	if (group)
+		*rec = group->rec;
+	else
+		ret = -EADDRNOTAVAIL;
+	spin_unlock_irqrestore(&port->lock, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL(ib_sa_get_mcmember_rec);
+
+int ib_init_ah_from_mcmember(struct ib_device *device, u8 port_num,
+			     struct ib_sa_mcmember_rec *rec,
+			     struct ib_ah_attr *ah_attr)
+{
+	int ret;
+	u16 gid_index;
+	u8 p;
+
+	ret = ib_find_cached_gid(device, &rec->port_gid, &p, &gid_index);
+	if (ret)
+		return ret;
+
+	memset(ah_attr, 0, sizeof *ah_attr);
+	ah_attr->dlid = be16_to_cpu(rec->mlid);
+	ah_attr->sl = rec->sl;
+	ah_attr->port_num = port_num;
+	ah_attr->static_rate = rec->rate;
+
+	ah_attr->ah_flags = IB_AH_GRH;
+	ah_attr->grh.dgid = rec->mgid;
+
+	ah_attr->grh.sgid_index = (u8) gid_index;
+	ah_attr->grh.flow_label = be32_to_cpu(rec->flow_label);
+	ah_attr->grh.hop_limit = rec->hop_limit;
+	ah_attr->grh.traffic_class = rec->traffic_class;
+
+	return 0;
+}
+EXPORT_SYMBOL(ib_init_ah_from_mcmember);
+
+static void mcast_groups_event(struct mcast_port *port,
+			       enum mcast_group_state state)
+{
+	struct mcast_group *group;
+	struct rb_node *node;
+	unsigned long flags;
+
+	spin_lock_irqsave(&port->lock, flags);
+	for (node = rb_first(&port->table); node; node = rb_next(node)) {
+		group = rb_entry(node, struct mcast_group, node);
+		spin_lock(&group->lock);
+		if (group->state == MCAST_IDLE) {
+			atomic_inc(&group->refcount);
+			queue_work(mcast_wq, &group->work);
+		}
+		if (group->state != MCAST_GROUP_ERROR)
+			group->state = state;
+		spin_unlock(&group->lock);
+	}
+	spin_unlock_irqrestore(&port->lock, flags);
+}
+
+static void mcast_event_handler(struct ib_event_handler *handler,
+				struct ib_event *event)
+{
+	struct mcast_device *dev;
+	int index;
+
+	dev = container_of(handler, struct mcast_device, event_handler);
+	if (rdma_port_get_link_layer(dev->device, event->element.port_num) !=
+	    IB_LINK_LAYER_INFINIBAND)
+		return;
+
+	index = event->element.port_num - dev->start_port;
+
+	switch (event->event) {
+	case IB_EVENT_PORT_ERR:
+	case IB_EVENT_LID_CHANGE:
+	case IB_EVENT_SM_CHANGE:
+	case IB_EVENT_CLIENT_REREGISTER:
+		mcast_groups_event(&dev->port[index], MCAST_GROUP_ERROR);
+		break;
+	case IB_EVENT_PKEY_CHANGE:
+		mcast_groups_event(&dev->port[index], MCAST_PKEY_EVENT);
+		break;
+	default:
+		break;
+	}
+}
+
+static void mcast_add_one(struct ib_device *device)
+{
+	struct mcast_device *dev;
+	struct mcast_port *port;
+	int i;
+	int count = 0;
+
+	if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
+		return;
+
+	dev = kmalloc(sizeof *dev + device->phys_port_cnt * sizeof *port,
+		      GFP_KERNEL);
+	if (!dev)
+		return;
+
+	if (device->node_type == RDMA_NODE_IB_SWITCH)
+		dev->start_port = dev->end_port = 0;
+	else {
+		dev->start_port = 1;
+		dev->end_port = device->phys_port_cnt;
+	}
+
+	for (i = 0; i <= dev->end_port - dev->start_port; i++) {
+		if (rdma_port_get_link_layer(device, dev->start_port + i) !=
+		    IB_LINK_LAYER_INFINIBAND)
+			continue;
+		port = &dev->port[i];
+		port->dev = dev;
+		port->port_num = dev->start_port + i;
+		spin_lock_init(&port->lock);
+		port->table = RB_ROOT;
+		init_completion(&port->comp);
+		atomic_set(&port->refcount, 1);
+		++count;
+	}
+
+	if (!count) {
+		kfree(dev);
+		return;
+	}
+
+	dev->device = device;
+	ib_set_client_data(device, &mcast_client, dev);
+
+	INIT_IB_EVENT_HANDLER(&dev->event_handler, device, mcast_event_handler);
+	ib_register_event_handler(&dev->event_handler);
+}
+
+static void mcast_remove_one(struct ib_device *device)
+{
+	struct mcast_device *dev;
+	struct mcast_port *port;
+	int i;
+
+	dev = ib_get_client_data(device, &mcast_client);
+	if (!dev)
+		return;
+
+	ib_unregister_event_handler(&dev->event_handler);
+	flush_workqueue(mcast_wq);
+
+	for (i = 0; i <= dev->end_port - dev->start_port; i++) {
+		if (rdma_port_get_link_layer(device, dev->start_port + i) ==
+		    IB_LINK_LAYER_INFINIBAND) {
+			port = &dev->port[i];
+			deref_port(port);
+			wait_for_completion(&port->comp);
+		}
+	}
+
+	kfree(dev);
+}
+
+int mcast_init(void)
+{
+	int ret;
+
+	mcast_wq = create_singlethread_workqueue("ib_mcast");
+	if (!mcast_wq)
+		return -ENOMEM;
+
+	ib_sa_register_client(&sa_client);
+
+	ret = ib_register_client(&mcast_client);
+	if (ret)
+		goto err;
+	return 0;
+
+err:
+	ib_sa_unregister_client(&sa_client);
+	destroy_workqueue(mcast_wq);
+	return ret;
+}
+
+void mcast_cleanup(void)
+{
+	ib_unregister_client(&mcast_client);
+	ib_sa_unregister_client(&sa_client);
+	destroy_workqueue(mcast_wq);
+}
diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c
new file mode 100644
index 0000000..23dd5a5
--- /dev/null
+++ b/drivers/infiniband/core/netlink.c
@@ -0,0 +1,216 @@
+/*
+ * Copyright (c) 2010 Voltaire Inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#define pr_fmt(fmt) "%s:%s: " fmt, KBUILD_MODNAME, __func__
+
+#include <linux/export.h>
+#include <net/netlink.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <rdma/rdma_netlink.h>
+
+struct ibnl_client {
+	struct list_head		list;
+	int				index;
+	int				nops;
+	const struct ibnl_client_cbs   *cb_table;
+};
+
+static DEFINE_MUTEX(ibnl_mutex);
+static struct sock *nls;
+static LIST_HEAD(client_list);
+
+int ibnl_add_client(int index, int nops,
+		    const struct ibnl_client_cbs cb_table[])
+{
+	struct ibnl_client *cur;
+	struct ibnl_client *nl_client;
+
+	nl_client = kmalloc(sizeof *nl_client, GFP_KERNEL);
+	if (!nl_client)
+		return -ENOMEM;
+
+	nl_client->index	= index;
+	nl_client->nops		= nops;
+	nl_client->cb_table	= cb_table;
+
+	mutex_lock(&ibnl_mutex);
+
+	list_for_each_entry(cur, &client_list, list) {
+		if (cur->index == index) {
+			pr_warn("Client for %d already exists\n", index);
+			mutex_unlock(&ibnl_mutex);
+			kfree(nl_client);
+			return -EINVAL;
+		}
+	}
+
+	list_add_tail(&nl_client->list, &client_list);
+
+	mutex_unlock(&ibnl_mutex);
+
+	return 0;
+}
+EXPORT_SYMBOL(ibnl_add_client);
+
+int ibnl_remove_client(int index)
+{
+	struct ibnl_client *cur, *next;
+
+	mutex_lock(&ibnl_mutex);
+	list_for_each_entry_safe(cur, next, &client_list, list) {
+		if (cur->index == index) {
+			list_del(&(cur->list));
+			mutex_unlock(&ibnl_mutex);
+			kfree(cur);
+			return 0;
+		}
+	}
+	pr_warn("Can't remove callback for client idx %d. Not found\n", index);
+	mutex_unlock(&ibnl_mutex);
+
+	return -EINVAL;
+}
+EXPORT_SYMBOL(ibnl_remove_client);
+
+void *ibnl_put_msg(struct sk_buff *skb, struct nlmsghdr **nlh, int seq,
+		   int len, int client, int op, int flags)
+{
+	unsigned char *prev_tail;
+
+	prev_tail = skb_tail_pointer(skb);
+	*nlh = nlmsg_put(skb, 0, seq, RDMA_NL_GET_TYPE(client, op),
+			 len, flags);
+	if (!*nlh)
+		goto out_nlmsg_trim;
+	(*nlh)->nlmsg_len = skb_tail_pointer(skb) - prev_tail;
+	return nlmsg_data(*nlh);
+
+out_nlmsg_trim:
+	nlmsg_trim(skb, prev_tail);
+	return NULL;
+}
+EXPORT_SYMBOL(ibnl_put_msg);
+
+int ibnl_put_attr(struct sk_buff *skb, struct nlmsghdr *nlh,
+		  int len, void *data, int type)
+{
+	unsigned char *prev_tail;
+
+	prev_tail = skb_tail_pointer(skb);
+	if (nla_put(skb, type, len, data))
+		goto nla_put_failure;
+	nlh->nlmsg_len += skb_tail_pointer(skb) - prev_tail;
+	return 0;
+
+nla_put_failure:
+	nlmsg_trim(skb, prev_tail - nlh->nlmsg_len);
+	return -EMSGSIZE;
+}
+EXPORT_SYMBOL(ibnl_put_attr);
+
+static int ibnl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+	struct ibnl_client *client;
+	int type = nlh->nlmsg_type;
+	int index = RDMA_NL_GET_CLIENT(type);
+	int op = RDMA_NL_GET_OP(type);
+
+	list_for_each_entry(client, &client_list, list) {
+		if (client->index == index) {
+			if (op < 0 || op >= client->nops ||
+			    !client->cb_table[op].dump)
+				return -EINVAL;
+
+			{
+				struct netlink_dump_control c = {
+					.dump = client->cb_table[op].dump,
+					.module = client->cb_table[op].module,
+				};
+				return netlink_dump_start(nls, skb, nlh, &c);
+			}
+		}
+	}
+
+	pr_info("Index %d wasn't found in client list\n", index);
+	return -EINVAL;
+}
+
+static void ibnl_rcv(struct sk_buff *skb)
+{
+	mutex_lock(&ibnl_mutex);
+	netlink_rcv_skb(skb, &ibnl_rcv_msg);
+	mutex_unlock(&ibnl_mutex);
+}
+
+int ibnl_unicast(struct sk_buff *skb, struct nlmsghdr *nlh,
+			__u32 pid)
+{
+	return nlmsg_unicast(nls, skb, pid);
+}
+EXPORT_SYMBOL(ibnl_unicast);
+
+int ibnl_multicast(struct sk_buff *skb, struct nlmsghdr *nlh,
+			unsigned int group, gfp_t flags)
+{
+	return nlmsg_multicast(nls, skb, 0, group, flags);
+}
+EXPORT_SYMBOL(ibnl_multicast);
+
+int __init ibnl_init(void)
+{
+	struct netlink_kernel_cfg cfg = {
+		.input	= ibnl_rcv,
+	};
+
+	nls = netlink_kernel_create(&init_net, NETLINK_RDMA, &cfg);
+	if (!nls) {
+		pr_warn("Failed to create netlink socket\n");
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+void ibnl_cleanup(void)
+{
+	struct ibnl_client *cur, *next;
+
+	mutex_lock(&ibnl_mutex);
+	list_for_each_entry_safe(cur, next, &client_list, list) {
+		list_del(&(cur->list));
+		kfree(cur);
+	}
+	mutex_unlock(&ibnl_mutex);
+
+	netlink_kernel_release(nls);
+}
diff --git a/drivers/infiniband/core/packer.c b/drivers/infiniband/core/packer.c
new file mode 100644
index 0000000..1b65986
--- /dev/null
+++ b/drivers/infiniband/core/packer.c
@@ -0,0 +1,203 @@
+/*
+ * Copyright (c) 2004 Topspin Corporation.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/export.h>
+#include <linux/string.h>
+
+#include <rdma/ib_pack.h>
+
+static u64 value_read(int offset, int size, void *structure)
+{
+	switch (size) {
+	case 1: return                *(u8  *) (structure + offset);
+	case 2: return be16_to_cpup((__be16 *) (structure + offset));
+	case 4: return be32_to_cpup((__be32 *) (structure + offset));
+	case 8: return be64_to_cpup((__be64 *) (structure + offset));
+	default:
+		printk(KERN_WARNING "Field size %d bits not handled\n", size * 8);
+		return 0;
+	}
+}
+
+/**
+ * ib_pack - Pack a structure into a buffer
+ * @desc:Array of structure field descriptions
+ * @desc_len:Number of entries in @desc
+ * @structure:Structure to pack from
+ * @buf:Buffer to pack into
+ *
+ * ib_pack() packs a list of structure fields into a buffer,
+ * controlled by the array of fields in @desc.
+ */
+void ib_pack(const struct ib_field        *desc,
+	     int                           desc_len,
+	     void                         *structure,
+	     void                         *buf)
+{
+	int i;
+
+	for (i = 0; i < desc_len; ++i) {
+		if (desc[i].size_bits <= 32) {
+			int shift;
+			u32 val;
+			__be32 mask;
+			__be32 *addr;
+
+			shift = 32 - desc[i].offset_bits - desc[i].size_bits;
+			if (desc[i].struct_size_bytes)
+				val = value_read(desc[i].struct_offset_bytes,
+						 desc[i].struct_size_bytes,
+						 structure) << shift;
+			else
+				val = 0;
+
+			mask = cpu_to_be32(((1ull << desc[i].size_bits) - 1) << shift);
+			addr = (__be32 *) buf + desc[i].offset_words;
+			*addr = (*addr & ~mask) | (cpu_to_be32(val) & mask);
+		} else if (desc[i].size_bits <= 64) {
+			int shift;
+			u64 val;
+			__be64 mask;
+			__be64 *addr;
+
+			shift = 64 - desc[i].offset_bits - desc[i].size_bits;
+			if (desc[i].struct_size_bytes)
+				val = value_read(desc[i].struct_offset_bytes,
+						 desc[i].struct_size_bytes,
+						 structure) << shift;
+			else
+				val = 0;
+
+			mask = cpu_to_be64((~0ull >> (64 - desc[i].size_bits)) << shift);
+			addr = (__be64 *) ((__be32 *) buf + desc[i].offset_words);
+			*addr = (*addr & ~mask) | (cpu_to_be64(val) & mask);
+		} else {
+			if (desc[i].offset_bits % 8 ||
+			    desc[i].size_bits   % 8) {
+				printk(KERN_WARNING "Structure field %s of size %d "
+				       "bits is not byte-aligned\n",
+				       desc[i].field_name, desc[i].size_bits);
+			}
+
+			if (desc[i].struct_size_bytes)
+				memcpy(buf + desc[i].offset_words * 4 +
+				       desc[i].offset_bits / 8,
+				       structure + desc[i].struct_offset_bytes,
+				       desc[i].size_bits / 8);
+			else
+				memset(buf + desc[i].offset_words * 4 +
+				       desc[i].offset_bits / 8,
+				       0,
+				       desc[i].size_bits / 8);
+		}
+	}
+}
+EXPORT_SYMBOL(ib_pack);
+
+static void value_write(int offset, int size, u64 val, void *structure)
+{
+	switch (size * 8) {
+	case 8:  *(    u8 *) (structure + offset) = val; break;
+	case 16: *(__be16 *) (structure + offset) = cpu_to_be16(val); break;
+	case 32: *(__be32 *) (structure + offset) = cpu_to_be32(val); break;
+	case 64: *(__be64 *) (structure + offset) = cpu_to_be64(val); break;
+	default:
+		printk(KERN_WARNING "Field size %d bits not handled\n", size * 8);
+	}
+}
+
+/**
+ * ib_unpack - Unpack a buffer into a structure
+ * @desc:Array of structure field descriptions
+ * @desc_len:Number of entries in @desc
+ * @buf:Buffer to unpack from
+ * @structure:Structure to unpack into
+ *
+ * ib_pack() unpacks a list of structure fields from a buffer,
+ * controlled by the array of fields in @desc.
+ */
+void ib_unpack(const struct ib_field        *desc,
+	       int                           desc_len,
+	       void                         *buf,
+	       void                         *structure)
+{
+	int i;
+
+	for (i = 0; i < desc_len; ++i) {
+		if (!desc[i].struct_size_bytes)
+			continue;
+
+		if (desc[i].size_bits <= 32) {
+			int shift;
+			u32  val;
+			u32  mask;
+			__be32 *addr;
+
+			shift = 32 - desc[i].offset_bits - desc[i].size_bits;
+			mask = ((1ull << desc[i].size_bits) - 1) << shift;
+			addr = (__be32 *) buf + desc[i].offset_words;
+			val = (be32_to_cpup(addr) & mask) >> shift;
+			value_write(desc[i].struct_offset_bytes,
+				    desc[i].struct_size_bytes,
+				    val,
+				    structure);
+		} else if (desc[i].size_bits <= 64) {
+			int shift;
+			u64  val;
+			u64  mask;
+			__be64 *addr;
+
+			shift = 64 - desc[i].offset_bits - desc[i].size_bits;
+			mask = (~0ull >> (64 - desc[i].size_bits)) << shift;
+			addr = (__be64 *) buf + desc[i].offset_words;
+			val = (be64_to_cpup(addr) & mask) >> shift;
+			value_write(desc[i].struct_offset_bytes,
+				    desc[i].struct_size_bytes,
+				    val,
+				    structure);
+		} else {
+			if (desc[i].offset_bits % 8 ||
+			    desc[i].size_bits   % 8) {
+				printk(KERN_WARNING "Structure field %s of size %d "
+				       "bits is not byte-aligned\n",
+				       desc[i].field_name, desc[i].size_bits);
+			}
+
+			memcpy(structure + desc[i].struct_offset_bytes,
+			       buf + desc[i].offset_words * 4 +
+			       desc[i].offset_bits / 8,
+			       desc[i].size_bits / 8);
+		}
+	}
+}
+EXPORT_SYMBOL(ib_unpack);
diff --git a/drivers/infiniband/core/sa.h b/drivers/infiniband/core/sa.h
new file mode 100644
index 0000000..b1d4bbf
--- /dev/null
+++ b/drivers/infiniband/core/sa.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Voltaire, Inc.  All rights reserved.
+ * Copyright (c) 2006 Intel Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef SA_H
+#define SA_H
+
+#include <rdma/ib_sa.h>
+
+static inline void ib_sa_client_get(struct ib_sa_client *client)
+{
+	atomic_inc(&client->users);
+}
+
+static inline void ib_sa_client_put(struct ib_sa_client *client)
+{
+	if (atomic_dec_and_test(&client->users))
+		complete(&client->comp);
+}
+
+int ib_sa_mcmember_rec_query(struct ib_sa_client *client,
+			     struct ib_device *device, u8 port_num,
+			     u8 method,
+			     struct ib_sa_mcmember_rec *rec,
+			     ib_sa_comp_mask comp_mask,
+			     int timeout_ms, gfp_t gfp_mask,
+			     void (*callback)(int status,
+					      struct ib_sa_mcmember_rec *resp,
+					      void *context),
+			     void *context,
+			     struct ib_sa_query **sa_query);
+
+int mcast_init(void);
+void mcast_cleanup(void);
+
+#endif /* SA_H */
diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c
new file mode 100644
index 0000000..c38f030
--- /dev/null
+++ b/drivers/infiniband/core/sa_query.c
@@ -0,0 +1,1280 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Voltaire, Inc.  All rights reserved.
+ * Copyright (c) 2006 Intel Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/err.h>
+#include <linux/random.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/dma-mapping.h>
+#include <linux/kref.h>
+#include <linux/idr.h>
+#include <linux/workqueue.h>
+#include <uapi/linux/if_ether.h>
+#include <rdma/ib_pack.h>
+#include <rdma/ib_cache.h>
+#include "sa.h"
+
+MODULE_AUTHOR("Roland Dreier");
+MODULE_DESCRIPTION("InfiniBand subnet administration query support");
+MODULE_LICENSE("Dual BSD/GPL");
+
+struct ib_sa_sm_ah {
+	struct ib_ah        *ah;
+	struct kref          ref;
+	u16		     pkey_index;
+	u8		     src_path_mask;
+};
+
+struct ib_sa_port {
+	struct ib_mad_agent *agent;
+	struct ib_sa_sm_ah  *sm_ah;
+	struct work_struct   update_task;
+	spinlock_t           ah_lock;
+	u8                   port_num;
+};
+
+struct ib_sa_device {
+	int                     start_port, end_port;
+	struct ib_event_handler event_handler;
+	struct ib_sa_port port[0];
+};
+
+struct ib_sa_query {
+	void (*callback)(struct ib_sa_query *, int, struct ib_sa_mad *);
+	void (*release)(struct ib_sa_query *);
+	struct ib_sa_client    *client;
+	struct ib_sa_port      *port;
+	struct ib_mad_send_buf *mad_buf;
+	struct ib_sa_sm_ah     *sm_ah;
+	int			id;
+};
+
+struct ib_sa_service_query {
+	void (*callback)(int, struct ib_sa_service_rec *, void *);
+	void *context;
+	struct ib_sa_query sa_query;
+};
+
+struct ib_sa_path_query {
+	void (*callback)(int, struct ib_sa_path_rec *, void *);
+	void *context;
+	struct ib_sa_query sa_query;
+};
+
+struct ib_sa_guidinfo_query {
+	void (*callback)(int, struct ib_sa_guidinfo_rec *, void *);
+	void *context;
+	struct ib_sa_query sa_query;
+};
+
+struct ib_sa_mcmember_query {
+	void (*callback)(int, struct ib_sa_mcmember_rec *, void *);
+	void *context;
+	struct ib_sa_query sa_query;
+};
+
+static void ib_sa_add_one(struct ib_device *device);
+static void ib_sa_remove_one(struct ib_device *device);
+
+static struct ib_client sa_client = {
+	.name   = "sa",
+	.add    = ib_sa_add_one,
+	.remove = ib_sa_remove_one
+};
+
+static DEFINE_SPINLOCK(idr_lock);
+static DEFINE_IDR(query_idr);
+
+static DEFINE_SPINLOCK(tid_lock);
+static u32 tid;
+
+#define PATH_REC_FIELD(field) \
+	.struct_offset_bytes = offsetof(struct ib_sa_path_rec, field),		\
+	.struct_size_bytes   = sizeof ((struct ib_sa_path_rec *) 0)->field,	\
+	.field_name          = "sa_path_rec:" #field
+
+static const struct ib_field path_rec_table[] = {
+	{ PATH_REC_FIELD(service_id),
+	  .offset_words = 0,
+	  .offset_bits  = 0,
+	  .size_bits    = 64 },
+	{ PATH_REC_FIELD(dgid),
+	  .offset_words = 2,
+	  .offset_bits  = 0,
+	  .size_bits    = 128 },
+	{ PATH_REC_FIELD(sgid),
+	  .offset_words = 6,
+	  .offset_bits  = 0,
+	  .size_bits    = 128 },
+	{ PATH_REC_FIELD(dlid),
+	  .offset_words = 10,
+	  .offset_bits  = 0,
+	  .size_bits    = 16 },
+	{ PATH_REC_FIELD(slid),
+	  .offset_words = 10,
+	  .offset_bits  = 16,
+	  .size_bits    = 16 },
+	{ PATH_REC_FIELD(raw_traffic),
+	  .offset_words = 11,
+	  .offset_bits  = 0,
+	  .size_bits    = 1 },
+	{ RESERVED,
+	  .offset_words = 11,
+	  .offset_bits  = 1,
+	  .size_bits    = 3 },
+	{ PATH_REC_FIELD(flow_label),
+	  .offset_words = 11,
+	  .offset_bits  = 4,
+	  .size_bits    = 20 },
+	{ PATH_REC_FIELD(hop_limit),
+	  .offset_words = 11,
+	  .offset_bits  = 24,
+	  .size_bits    = 8 },
+	{ PATH_REC_FIELD(traffic_class),
+	  .offset_words = 12,
+	  .offset_bits  = 0,
+	  .size_bits    = 8 },
+	{ PATH_REC_FIELD(reversible),
+	  .offset_words = 12,
+	  .offset_bits  = 8,
+	  .size_bits    = 1 },
+	{ PATH_REC_FIELD(numb_path),
+	  .offset_words = 12,
+	  .offset_bits  = 9,
+	  .size_bits    = 7 },
+	{ PATH_REC_FIELD(pkey),
+	  .offset_words = 12,
+	  .offset_bits  = 16,
+	  .size_bits    = 16 },
+	{ PATH_REC_FIELD(qos_class),
+	  .offset_words = 13,
+	  .offset_bits  = 0,
+	  .size_bits    = 12 },
+	{ PATH_REC_FIELD(sl),
+	  .offset_words = 13,
+	  .offset_bits  = 12,
+	  .size_bits    = 4 },
+	{ PATH_REC_FIELD(mtu_selector),
+	  .offset_words = 13,
+	  .offset_bits  = 16,
+	  .size_bits    = 2 },
+	{ PATH_REC_FIELD(mtu),
+	  .offset_words = 13,
+	  .offset_bits  = 18,
+	  .size_bits    = 6 },
+	{ PATH_REC_FIELD(rate_selector),
+	  .offset_words = 13,
+	  .offset_bits  = 24,
+	  .size_bits    = 2 },
+	{ PATH_REC_FIELD(rate),
+	  .offset_words = 13,
+	  .offset_bits  = 26,
+	  .size_bits    = 6 },
+	{ PATH_REC_FIELD(packet_life_time_selector),
+	  .offset_words = 14,
+	  .offset_bits  = 0,
+	  .size_bits    = 2 },
+	{ PATH_REC_FIELD(packet_life_time),
+	  .offset_words = 14,
+	  .offset_bits  = 2,
+	  .size_bits    = 6 },
+	{ PATH_REC_FIELD(preference),
+	  .offset_words = 14,
+	  .offset_bits  = 8,
+	  .size_bits    = 8 },
+	{ RESERVED,
+	  .offset_words = 14,
+	  .offset_bits  = 16,
+	  .size_bits    = 48 },
+};
+
+#define MCMEMBER_REC_FIELD(field) \
+	.struct_offset_bytes = offsetof(struct ib_sa_mcmember_rec, field),	\
+	.struct_size_bytes   = sizeof ((struct ib_sa_mcmember_rec *) 0)->field,	\
+	.field_name          = "sa_mcmember_rec:" #field
+
+static const struct ib_field mcmember_rec_table[] = {
+	{ MCMEMBER_REC_FIELD(mgid),
+	  .offset_words = 0,
+	  .offset_bits  = 0,
+	  .size_bits    = 128 },
+	{ MCMEMBER_REC_FIELD(port_gid),
+	  .offset_words = 4,
+	  .offset_bits  = 0,
+	  .size_bits    = 128 },
+	{ MCMEMBER_REC_FIELD(qkey),
+	  .offset_words = 8,
+	  .offset_bits  = 0,
+	  .size_bits    = 32 },
+	{ MCMEMBER_REC_FIELD(mlid),
+	  .offset_words = 9,
+	  .offset_bits  = 0,
+	  .size_bits    = 16 },
+	{ MCMEMBER_REC_FIELD(mtu_selector),
+	  .offset_words = 9,
+	  .offset_bits  = 16,
+	  .size_bits    = 2 },
+	{ MCMEMBER_REC_FIELD(mtu),
+	  .offset_words = 9,
+	  .offset_bits  = 18,
+	  .size_bits    = 6 },
+	{ MCMEMBER_REC_FIELD(traffic_class),
+	  .offset_words = 9,
+	  .offset_bits  = 24,
+	  .size_bits    = 8 },
+	{ MCMEMBER_REC_FIELD(pkey),
+	  .offset_words = 10,
+	  .offset_bits  = 0,
+	  .size_bits    = 16 },
+	{ MCMEMBER_REC_FIELD(rate_selector),
+	  .offset_words = 10,
+	  .offset_bits  = 16,
+	  .size_bits    = 2 },
+	{ MCMEMBER_REC_FIELD(rate),
+	  .offset_words = 10,
+	  .offset_bits  = 18,
+	  .size_bits    = 6 },
+	{ MCMEMBER_REC_FIELD(packet_life_time_selector),
+	  .offset_words = 10,
+	  .offset_bits  = 24,
+	  .size_bits    = 2 },
+	{ MCMEMBER_REC_FIELD(packet_life_time),
+	  .offset_words = 10,
+	  .offset_bits  = 26,
+	  .size_bits    = 6 },
+	{ MCMEMBER_REC_FIELD(sl),
+	  .offset_words = 11,
+	  .offset_bits  = 0,
+	  .size_bits    = 4 },
+	{ MCMEMBER_REC_FIELD(flow_label),
+	  .offset_words = 11,
+	  .offset_bits  = 4,
+	  .size_bits    = 20 },
+	{ MCMEMBER_REC_FIELD(hop_limit),
+	  .offset_words = 11,
+	  .offset_bits  = 24,
+	  .size_bits    = 8 },
+	{ MCMEMBER_REC_FIELD(scope),
+	  .offset_words = 12,
+	  .offset_bits  = 0,
+	  .size_bits    = 4 },
+	{ MCMEMBER_REC_FIELD(join_state),
+	  .offset_words = 12,
+	  .offset_bits  = 4,
+	  .size_bits    = 4 },
+	{ MCMEMBER_REC_FIELD(proxy_join),
+	  .offset_words = 12,
+	  .offset_bits  = 8,
+	  .size_bits    = 1 },
+	{ RESERVED,
+	  .offset_words = 12,
+	  .offset_bits  = 9,
+	  .size_bits    = 23 },
+};
+
+#define SERVICE_REC_FIELD(field) \
+	.struct_offset_bytes = offsetof(struct ib_sa_service_rec, field),	\
+	.struct_size_bytes   = sizeof ((struct ib_sa_service_rec *) 0)->field,	\
+	.field_name          = "sa_service_rec:" #field
+
+static const struct ib_field service_rec_table[] = {
+	{ SERVICE_REC_FIELD(id),
+	  .offset_words = 0,
+	  .offset_bits  = 0,
+	  .size_bits    = 64 },
+	{ SERVICE_REC_FIELD(gid),
+	  .offset_words = 2,
+	  .offset_bits  = 0,
+	  .size_bits    = 128 },
+	{ SERVICE_REC_FIELD(pkey),
+	  .offset_words = 6,
+	  .offset_bits  = 0,
+	  .size_bits    = 16 },
+	{ SERVICE_REC_FIELD(lease),
+	  .offset_words = 7,
+	  .offset_bits  = 0,
+	  .size_bits    = 32 },
+	{ SERVICE_REC_FIELD(key),
+	  .offset_words = 8,
+	  .offset_bits  = 0,
+	  .size_bits    = 128 },
+	{ SERVICE_REC_FIELD(name),
+	  .offset_words = 12,
+	  .offset_bits  = 0,
+	  .size_bits    = 64*8 },
+	{ SERVICE_REC_FIELD(data8),
+	  .offset_words = 28,
+	  .offset_bits  = 0,
+	  .size_bits    = 16*8 },
+	{ SERVICE_REC_FIELD(data16),
+	  .offset_words = 32,
+	  .offset_bits  = 0,
+	  .size_bits    = 8*16 },
+	{ SERVICE_REC_FIELD(data32),
+	  .offset_words = 36,
+	  .offset_bits  = 0,
+	  .size_bits    = 4*32 },
+	{ SERVICE_REC_FIELD(data64),
+	  .offset_words = 40,
+	  .offset_bits  = 0,
+	  .size_bits    = 2*64 },
+};
+
+#define GUIDINFO_REC_FIELD(field) \
+	.struct_offset_bytes = offsetof(struct ib_sa_guidinfo_rec, field),	\
+	.struct_size_bytes   = sizeof((struct ib_sa_guidinfo_rec *) 0)->field,	\
+	.field_name          = "sa_guidinfo_rec:" #field
+
+static const struct ib_field guidinfo_rec_table[] = {
+	{ GUIDINFO_REC_FIELD(lid),
+	  .offset_words = 0,
+	  .offset_bits  = 0,
+	  .size_bits    = 16 },
+	{ GUIDINFO_REC_FIELD(block_num),
+	  .offset_words = 0,
+	  .offset_bits  = 16,
+	  .size_bits    = 8 },
+	{ GUIDINFO_REC_FIELD(res1),
+	  .offset_words = 0,
+	  .offset_bits  = 24,
+	  .size_bits    = 8 },
+	{ GUIDINFO_REC_FIELD(res2),
+	  .offset_words = 1,
+	  .offset_bits  = 0,
+	  .size_bits    = 32 },
+	{ GUIDINFO_REC_FIELD(guid_info_list),
+	  .offset_words = 2,
+	  .offset_bits  = 0,
+	  .size_bits    = 512 },
+};
+
+static void free_sm_ah(struct kref *kref)
+{
+	struct ib_sa_sm_ah *sm_ah = container_of(kref, struct ib_sa_sm_ah, ref);
+
+	ib_destroy_ah(sm_ah->ah);
+	kfree(sm_ah);
+}
+
+static void update_sm_ah(struct work_struct *work)
+{
+	struct ib_sa_port *port =
+		container_of(work, struct ib_sa_port, update_task);
+	struct ib_sa_sm_ah *new_ah;
+	struct ib_port_attr port_attr;
+	struct ib_ah_attr   ah_attr;
+
+	if (ib_query_port(port->agent->device, port->port_num, &port_attr)) {
+		printk(KERN_WARNING "Couldn't query port\n");
+		return;
+	}
+
+	new_ah = kmalloc(sizeof *new_ah, GFP_KERNEL);
+	if (!new_ah) {
+		printk(KERN_WARNING "Couldn't allocate new SM AH\n");
+		return;
+	}
+
+	kref_init(&new_ah->ref);
+	new_ah->src_path_mask = (1 << port_attr.lmc) - 1;
+
+	new_ah->pkey_index = 0;
+	if (ib_find_pkey(port->agent->device, port->port_num,
+			 IB_DEFAULT_PKEY_FULL, &new_ah->pkey_index))
+		printk(KERN_ERR "Couldn't find index for default PKey\n");
+
+	memset(&ah_attr, 0, sizeof ah_attr);
+	ah_attr.dlid     = port_attr.sm_lid;
+	ah_attr.sl       = port_attr.sm_sl;
+	ah_attr.port_num = port->port_num;
+
+	new_ah->ah = ib_create_ah(port->agent->qp->pd, &ah_attr);
+	if (IS_ERR(new_ah->ah)) {
+		printk(KERN_WARNING "Couldn't create new SM AH\n");
+		kfree(new_ah);
+		return;
+	}
+
+	spin_lock_irq(&port->ah_lock);
+	if (port->sm_ah)
+		kref_put(&port->sm_ah->ref, free_sm_ah);
+	port->sm_ah = new_ah;
+	spin_unlock_irq(&port->ah_lock);
+
+}
+
+static void ib_sa_event(struct ib_event_handler *handler, struct ib_event *event)
+{
+	if (event->event == IB_EVENT_PORT_ERR    ||
+	    event->event == IB_EVENT_PORT_ACTIVE ||
+	    event->event == IB_EVENT_LID_CHANGE  ||
+	    event->event == IB_EVENT_PKEY_CHANGE ||
+	    event->event == IB_EVENT_SM_CHANGE   ||
+	    event->event == IB_EVENT_CLIENT_REREGISTER) {
+		unsigned long flags;
+		struct ib_sa_device *sa_dev =
+			container_of(handler, typeof(*sa_dev), event_handler);
+		struct ib_sa_port *port =
+			&sa_dev->port[event->element.port_num - sa_dev->start_port];
+
+		if (rdma_port_get_link_layer(handler->device, port->port_num) != IB_LINK_LAYER_INFINIBAND)
+			return;
+
+		spin_lock_irqsave(&port->ah_lock, flags);
+		if (port->sm_ah)
+			kref_put(&port->sm_ah->ref, free_sm_ah);
+		port->sm_ah = NULL;
+		spin_unlock_irqrestore(&port->ah_lock, flags);
+
+		queue_work(ib_wq, &sa_dev->port[event->element.port_num -
+					    sa_dev->start_port].update_task);
+	}
+}
+
+void ib_sa_register_client(struct ib_sa_client *client)
+{
+	atomic_set(&client->users, 1);
+	init_completion(&client->comp);
+}
+EXPORT_SYMBOL(ib_sa_register_client);
+
+void ib_sa_unregister_client(struct ib_sa_client *client)
+{
+	ib_sa_client_put(client);
+	wait_for_completion(&client->comp);
+}
+EXPORT_SYMBOL(ib_sa_unregister_client);
+
+/**
+ * ib_sa_cancel_query - try to cancel an SA query
+ * @id:ID of query to cancel
+ * @query:query pointer to cancel
+ *
+ * Try to cancel an SA query.  If the id and query don't match up or
+ * the query has already completed, nothing is done.  Otherwise the
+ * query is canceled and will complete with a status of -EINTR.
+ */
+void ib_sa_cancel_query(int id, struct ib_sa_query *query)
+{
+	unsigned long flags;
+	struct ib_mad_agent *agent;
+	struct ib_mad_send_buf *mad_buf;
+
+	spin_lock_irqsave(&idr_lock, flags);
+	if (idr_find(&query_idr, id) != query) {
+		spin_unlock_irqrestore(&idr_lock, flags);
+		return;
+	}
+	agent = query->port->agent;
+	mad_buf = query->mad_buf;
+	spin_unlock_irqrestore(&idr_lock, flags);
+
+	ib_cancel_mad(agent, mad_buf);
+}
+EXPORT_SYMBOL(ib_sa_cancel_query);
+
+static u8 get_src_path_mask(struct ib_device *device, u8 port_num)
+{
+	struct ib_sa_device *sa_dev;
+	struct ib_sa_port   *port;
+	unsigned long flags;
+	u8 src_path_mask;
+
+	sa_dev = ib_get_client_data(device, &sa_client);
+	if (!sa_dev)
+		return 0x7f;
+
+	port  = &sa_dev->port[port_num - sa_dev->start_port];
+	spin_lock_irqsave(&port->ah_lock, flags);
+	src_path_mask = port->sm_ah ? port->sm_ah->src_path_mask : 0x7f;
+	spin_unlock_irqrestore(&port->ah_lock, flags);
+
+	return src_path_mask;
+}
+
+int ib_init_ah_from_path(struct ib_device *device, u8 port_num,
+			 struct ib_sa_path_rec *rec, struct ib_ah_attr *ah_attr)
+{
+	int ret;
+	u16 gid_index;
+	int force_grh;
+
+	memset(ah_attr, 0, sizeof *ah_attr);
+	ah_attr->dlid = be16_to_cpu(rec->dlid);
+	ah_attr->sl = rec->sl;
+	ah_attr->src_path_bits = be16_to_cpu(rec->slid) &
+				 get_src_path_mask(device, port_num);
+	ah_attr->port_num = port_num;
+	ah_attr->static_rate = rec->rate;
+
+	force_grh = rdma_port_get_link_layer(device, port_num) == IB_LINK_LAYER_ETHERNET;
+
+	if (rec->hop_limit > 1 || force_grh) {
+		ah_attr->ah_flags = IB_AH_GRH;
+		ah_attr->grh.dgid = rec->dgid;
+
+		ret = ib_find_cached_gid(device, &rec->sgid, &port_num,
+					 &gid_index);
+		if (ret)
+			return ret;
+
+		ah_attr->grh.sgid_index    = gid_index;
+		ah_attr->grh.flow_label    = be32_to_cpu(rec->flow_label);
+		ah_attr->grh.hop_limit     = rec->hop_limit;
+		ah_attr->grh.traffic_class = rec->traffic_class;
+	}
+	if (force_grh) {
+		memcpy(ah_attr->dmac, rec->dmac, ETH_ALEN);
+		ah_attr->vlan_id = rec->vlan_id;
+	} else {
+		ah_attr->vlan_id = 0xffff;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(ib_init_ah_from_path);
+
+static int alloc_mad(struct ib_sa_query *query, gfp_t gfp_mask)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&query->port->ah_lock, flags);
+	if (!query->port->sm_ah) {
+		spin_unlock_irqrestore(&query->port->ah_lock, flags);
+		return -EAGAIN;
+	}
+	kref_get(&query->port->sm_ah->ref);
+	query->sm_ah = query->port->sm_ah;
+	spin_unlock_irqrestore(&query->port->ah_lock, flags);
+
+	query->mad_buf = ib_create_send_mad(query->port->agent, 1,
+					    query->sm_ah->pkey_index,
+					    0, IB_MGMT_SA_HDR, IB_MGMT_SA_DATA,
+					    gfp_mask);
+	if (IS_ERR(query->mad_buf)) {
+		kref_put(&query->sm_ah->ref, free_sm_ah);
+		return -ENOMEM;
+	}
+
+	query->mad_buf->ah = query->sm_ah->ah;
+
+	return 0;
+}
+
+static void free_mad(struct ib_sa_query *query)
+{
+	ib_free_send_mad(query->mad_buf);
+	kref_put(&query->sm_ah->ref, free_sm_ah);
+}
+
+static void init_mad(struct ib_sa_mad *mad, struct ib_mad_agent *agent)
+{
+	unsigned long flags;
+
+	memset(mad, 0, sizeof *mad);
+
+	mad->mad_hdr.base_version  = IB_MGMT_BASE_VERSION;
+	mad->mad_hdr.mgmt_class    = IB_MGMT_CLASS_SUBN_ADM;
+	mad->mad_hdr.class_version = IB_SA_CLASS_VERSION;
+
+	spin_lock_irqsave(&tid_lock, flags);
+	mad->mad_hdr.tid           =
+		cpu_to_be64(((u64) agent->hi_tid) << 32 | tid++);
+	spin_unlock_irqrestore(&tid_lock, flags);
+}
+
+static int send_mad(struct ib_sa_query *query, int timeout_ms, gfp_t gfp_mask)
+{
+	bool preload = !!(gfp_mask & __GFP_WAIT);
+	unsigned long flags;
+	int ret, id;
+
+	if (preload)
+		idr_preload(gfp_mask);
+	spin_lock_irqsave(&idr_lock, flags);
+
+	id = idr_alloc(&query_idr, query, 0, 0, GFP_NOWAIT);
+
+	spin_unlock_irqrestore(&idr_lock, flags);
+	if (preload)
+		idr_preload_end();
+	if (id < 0)
+		return id;
+
+	query->mad_buf->timeout_ms  = timeout_ms;
+	query->mad_buf->context[0] = query;
+	query->id = id;
+
+	ret = ib_post_send_mad(query->mad_buf, NULL);
+	if (ret) {
+		spin_lock_irqsave(&idr_lock, flags);
+		idr_remove(&query_idr, id);
+		spin_unlock_irqrestore(&idr_lock, flags);
+	}
+
+	/*
+	 * It's not safe to dereference query any more, because the
+	 * send may already have completed and freed the query in
+	 * another context.
+	 */
+	return ret ? ret : id;
+}
+
+void ib_sa_unpack_path(void *attribute, struct ib_sa_path_rec *rec)
+{
+	ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table), attribute, rec);
+}
+EXPORT_SYMBOL(ib_sa_unpack_path);
+
+void ib_sa_pack_path(struct ib_sa_path_rec *rec, void *attribute)
+{
+	ib_pack(path_rec_table, ARRAY_SIZE(path_rec_table), rec, attribute);
+}
+EXPORT_SYMBOL(ib_sa_pack_path);
+
+static void ib_sa_path_rec_callback(struct ib_sa_query *sa_query,
+				    int status,
+				    struct ib_sa_mad *mad)
+{
+	struct ib_sa_path_query *query =
+		container_of(sa_query, struct ib_sa_path_query, sa_query);
+
+	if (mad) {
+		struct ib_sa_path_rec rec;
+
+		ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table),
+			  mad->data, &rec);
+		rec.vlan_id = 0xffff;
+		memset(rec.dmac, 0, ETH_ALEN);
+		memset(rec.smac, 0, ETH_ALEN);
+		query->callback(status, &rec, query->context);
+	} else
+		query->callback(status, NULL, query->context);
+}
+
+static void ib_sa_path_rec_release(struct ib_sa_query *sa_query)
+{
+	kfree(container_of(sa_query, struct ib_sa_path_query, sa_query));
+}
+
+/**
+ * ib_sa_path_rec_get - Start a Path get query
+ * @client:SA client
+ * @device:device to send query on
+ * @port_num: port number to send query on
+ * @rec:Path Record to send in query
+ * @comp_mask:component mask to send in query
+ * @timeout_ms:time to wait for response
+ * @gfp_mask:GFP mask to use for internal allocations
+ * @callback:function called when query completes, times out or is
+ * canceled
+ * @context:opaque user context passed to callback
+ * @sa_query:query context, used to cancel query
+ *
+ * Send a Path Record Get query to the SA to look up a path.  The
+ * callback function will be called when the query completes (or
+ * fails); status is 0 for a successful response, -EINTR if the query
+ * is canceled, -ETIMEDOUT is the query timed out, or -EIO if an error
+ * occurred sending the query.  The resp parameter of the callback is
+ * only valid if status is 0.
+ *
+ * If the return value of ib_sa_path_rec_get() is negative, it is an
+ * error code.  Otherwise it is a query ID that can be used to cancel
+ * the query.
+ */
+int ib_sa_path_rec_get(struct ib_sa_client *client,
+		       struct ib_device *device, u8 port_num,
+		       struct ib_sa_path_rec *rec,
+		       ib_sa_comp_mask comp_mask,
+		       int timeout_ms, gfp_t gfp_mask,
+		       void (*callback)(int status,
+					struct ib_sa_path_rec *resp,
+					void *context),
+		       void *context,
+		       struct ib_sa_query **sa_query)
+{
+	struct ib_sa_path_query *query;
+	struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client);
+	struct ib_sa_port   *port;
+	struct ib_mad_agent *agent;
+	struct ib_sa_mad *mad;
+	int ret;
+
+	if (!sa_dev)
+		return -ENODEV;
+
+	port  = &sa_dev->port[port_num - sa_dev->start_port];
+	agent = port->agent;
+
+	query = kmalloc(sizeof *query, gfp_mask);
+	if (!query)
+		return -ENOMEM;
+
+	query->sa_query.port     = port;
+	ret = alloc_mad(&query->sa_query, gfp_mask);
+	if (ret)
+		goto err1;
+
+	ib_sa_client_get(client);
+	query->sa_query.client = client;
+	query->callback        = callback;
+	query->context         = context;
+
+	mad = query->sa_query.mad_buf->mad;
+	init_mad(mad, agent);
+
+	query->sa_query.callback = callback ? ib_sa_path_rec_callback : NULL;
+	query->sa_query.release  = ib_sa_path_rec_release;
+	mad->mad_hdr.method	 = IB_MGMT_METHOD_GET;
+	mad->mad_hdr.attr_id	 = cpu_to_be16(IB_SA_ATTR_PATH_REC);
+	mad->sa_hdr.comp_mask	 = comp_mask;
+
+	ib_pack(path_rec_table, ARRAY_SIZE(path_rec_table), rec, mad->data);
+
+	*sa_query = &query->sa_query;
+
+	ret = send_mad(&query->sa_query, timeout_ms, gfp_mask);
+	if (ret < 0)
+		goto err2;
+
+	return ret;
+
+err2:
+	*sa_query = NULL;
+	ib_sa_client_put(query->sa_query.client);
+	free_mad(&query->sa_query);
+
+err1:
+	kfree(query);
+	return ret;
+}
+EXPORT_SYMBOL(ib_sa_path_rec_get);
+
+static void ib_sa_service_rec_callback(struct ib_sa_query *sa_query,
+				    int status,
+				    struct ib_sa_mad *mad)
+{
+	struct ib_sa_service_query *query =
+		container_of(sa_query, struct ib_sa_service_query, sa_query);
+
+	if (mad) {
+		struct ib_sa_service_rec rec;
+
+		ib_unpack(service_rec_table, ARRAY_SIZE(service_rec_table),
+			  mad->data, &rec);
+		query->callback(status, &rec, query->context);
+	} else
+		query->callback(status, NULL, query->context);
+}
+
+static void ib_sa_service_rec_release(struct ib_sa_query *sa_query)
+{
+	kfree(container_of(sa_query, struct ib_sa_service_query, sa_query));
+}
+
+/**
+ * ib_sa_service_rec_query - Start Service Record operation
+ * @client:SA client
+ * @device:device to send request on
+ * @port_num: port number to send request on
+ * @method:SA method - should be get, set, or delete
+ * @rec:Service Record to send in request
+ * @comp_mask:component mask to send in request
+ * @timeout_ms:time to wait for response
+ * @gfp_mask:GFP mask to use for internal allocations
+ * @callback:function called when request completes, times out or is
+ * canceled
+ * @context:opaque user context passed to callback
+ * @sa_query:request context, used to cancel request
+ *
+ * Send a Service Record set/get/delete to the SA to register,
+ * unregister or query a service record.
+ * The callback function will be called when the request completes (or
+ * fails); status is 0 for a successful response, -EINTR if the query
+ * is canceled, -ETIMEDOUT is the query timed out, or -EIO if an error
+ * occurred sending the query.  The resp parameter of the callback is
+ * only valid if status is 0.
+ *
+ * If the return value of ib_sa_service_rec_query() is negative, it is an
+ * error code.  Otherwise it is a request ID that can be used to cancel
+ * the query.
+ */
+int ib_sa_service_rec_query(struct ib_sa_client *client,
+			    struct ib_device *device, u8 port_num, u8 method,
+			    struct ib_sa_service_rec *rec,
+			    ib_sa_comp_mask comp_mask,
+			    int timeout_ms, gfp_t gfp_mask,
+			    void (*callback)(int status,
+					     struct ib_sa_service_rec *resp,
+					     void *context),
+			    void *context,
+			    struct ib_sa_query **sa_query)
+{
+	struct ib_sa_service_query *query;
+	struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client);
+	struct ib_sa_port   *port;
+	struct ib_mad_agent *agent;
+	struct ib_sa_mad *mad;
+	int ret;
+
+	if (!sa_dev)
+		return -ENODEV;
+
+	port  = &sa_dev->port[port_num - sa_dev->start_port];
+	agent = port->agent;
+
+	if (method != IB_MGMT_METHOD_GET &&
+	    method != IB_MGMT_METHOD_SET &&
+	    method != IB_SA_METHOD_DELETE)
+		return -EINVAL;
+
+	query = kmalloc(sizeof *query, gfp_mask);
+	if (!query)
+		return -ENOMEM;
+
+	query->sa_query.port     = port;
+	ret = alloc_mad(&query->sa_query, gfp_mask);
+	if (ret)
+		goto err1;
+
+	ib_sa_client_get(client);
+	query->sa_query.client = client;
+	query->callback        = callback;
+	query->context         = context;
+
+	mad = query->sa_query.mad_buf->mad;
+	init_mad(mad, agent);
+
+	query->sa_query.callback = callback ? ib_sa_service_rec_callback : NULL;
+	query->sa_query.release  = ib_sa_service_rec_release;
+	mad->mad_hdr.method	 = method;
+	mad->mad_hdr.attr_id	 = cpu_to_be16(IB_SA_ATTR_SERVICE_REC);
+	mad->sa_hdr.comp_mask	 = comp_mask;
+
+	ib_pack(service_rec_table, ARRAY_SIZE(service_rec_table),
+		rec, mad->data);
+
+	*sa_query = &query->sa_query;
+
+	ret = send_mad(&query->sa_query, timeout_ms, gfp_mask);
+	if (ret < 0)
+		goto err2;
+
+	return ret;
+
+err2:
+	*sa_query = NULL;
+	ib_sa_client_put(query->sa_query.client);
+	free_mad(&query->sa_query);
+
+err1:
+	kfree(query);
+	return ret;
+}
+EXPORT_SYMBOL(ib_sa_service_rec_query);
+
+static void ib_sa_mcmember_rec_callback(struct ib_sa_query *sa_query,
+					int status,
+					struct ib_sa_mad *mad)
+{
+	struct ib_sa_mcmember_query *query =
+		container_of(sa_query, struct ib_sa_mcmember_query, sa_query);
+
+	if (mad) {
+		struct ib_sa_mcmember_rec rec;
+
+		ib_unpack(mcmember_rec_table, ARRAY_SIZE(mcmember_rec_table),
+			  mad->data, &rec);
+		query->callback(status, &rec, query->context);
+	} else
+		query->callback(status, NULL, query->context);
+}
+
+static void ib_sa_mcmember_rec_release(struct ib_sa_query *sa_query)
+{
+	kfree(container_of(sa_query, struct ib_sa_mcmember_query, sa_query));
+}
+
+int ib_sa_mcmember_rec_query(struct ib_sa_client *client,
+			     struct ib_device *device, u8 port_num,
+			     u8 method,
+			     struct ib_sa_mcmember_rec *rec,
+			     ib_sa_comp_mask comp_mask,
+			     int timeout_ms, gfp_t gfp_mask,
+			     void (*callback)(int status,
+					      struct ib_sa_mcmember_rec *resp,
+					      void *context),
+			     void *context,
+			     struct ib_sa_query **sa_query)
+{
+	struct ib_sa_mcmember_query *query;
+	struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client);
+	struct ib_sa_port   *port;
+	struct ib_mad_agent *agent;
+	struct ib_sa_mad *mad;
+	int ret;
+
+	if (!sa_dev)
+		return -ENODEV;
+
+	port  = &sa_dev->port[port_num - sa_dev->start_port];
+	agent = port->agent;
+
+	query = kmalloc(sizeof *query, gfp_mask);
+	if (!query)
+		return -ENOMEM;
+
+	query->sa_query.port     = port;
+	ret = alloc_mad(&query->sa_query, gfp_mask);
+	if (ret)
+		goto err1;
+
+	ib_sa_client_get(client);
+	query->sa_query.client = client;
+	query->callback        = callback;
+	query->context         = context;
+
+	mad = query->sa_query.mad_buf->mad;
+	init_mad(mad, agent);
+
+	query->sa_query.callback = callback ? ib_sa_mcmember_rec_callback : NULL;
+	query->sa_query.release  = ib_sa_mcmember_rec_release;
+	mad->mad_hdr.method	 = method;
+	mad->mad_hdr.attr_id	 = cpu_to_be16(IB_SA_ATTR_MC_MEMBER_REC);
+	mad->sa_hdr.comp_mask	 = comp_mask;
+
+	ib_pack(mcmember_rec_table, ARRAY_SIZE(mcmember_rec_table),
+		rec, mad->data);
+
+	*sa_query = &query->sa_query;
+
+	ret = send_mad(&query->sa_query, timeout_ms, gfp_mask);
+	if (ret < 0)
+		goto err2;
+
+	return ret;
+
+err2:
+	*sa_query = NULL;
+	ib_sa_client_put(query->sa_query.client);
+	free_mad(&query->sa_query);
+
+err1:
+	kfree(query);
+	return ret;
+}
+
+/* Support GuidInfoRecord */
+static void ib_sa_guidinfo_rec_callback(struct ib_sa_query *sa_query,
+					int status,
+					struct ib_sa_mad *mad)
+{
+	struct ib_sa_guidinfo_query *query =
+		container_of(sa_query, struct ib_sa_guidinfo_query, sa_query);
+
+	if (mad) {
+		struct ib_sa_guidinfo_rec rec;
+
+		ib_unpack(guidinfo_rec_table, ARRAY_SIZE(guidinfo_rec_table),
+			  mad->data, &rec);
+		query->callback(status, &rec, query->context);
+	} else
+		query->callback(status, NULL, query->context);
+}
+
+static void ib_sa_guidinfo_rec_release(struct ib_sa_query *sa_query)
+{
+	kfree(container_of(sa_query, struct ib_sa_guidinfo_query, sa_query));
+}
+
+int ib_sa_guid_info_rec_query(struct ib_sa_client *client,
+			      struct ib_device *device, u8 port_num,
+			      struct ib_sa_guidinfo_rec *rec,
+			      ib_sa_comp_mask comp_mask, u8 method,
+			      int timeout_ms, gfp_t gfp_mask,
+			      void (*callback)(int status,
+					       struct ib_sa_guidinfo_rec *resp,
+					       void *context),
+			      void *context,
+			      struct ib_sa_query **sa_query)
+{
+	struct ib_sa_guidinfo_query *query;
+	struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client);
+	struct ib_sa_port *port;
+	struct ib_mad_agent *agent;
+	struct ib_sa_mad *mad;
+	int ret;
+
+	if (!sa_dev)
+		return -ENODEV;
+
+	if (method != IB_MGMT_METHOD_GET &&
+	    method != IB_MGMT_METHOD_SET &&
+	    method != IB_SA_METHOD_DELETE) {
+		return -EINVAL;
+	}
+
+	port  = &sa_dev->port[port_num - sa_dev->start_port];
+	agent = port->agent;
+
+	query = kmalloc(sizeof *query, gfp_mask);
+	if (!query)
+		return -ENOMEM;
+
+	query->sa_query.port = port;
+	ret = alloc_mad(&query->sa_query, gfp_mask);
+	if (ret)
+		goto err1;
+
+	ib_sa_client_get(client);
+	query->sa_query.client = client;
+	query->callback        = callback;
+	query->context         = context;
+
+	mad = query->sa_query.mad_buf->mad;
+	init_mad(mad, agent);
+
+	query->sa_query.callback = callback ? ib_sa_guidinfo_rec_callback : NULL;
+	query->sa_query.release  = ib_sa_guidinfo_rec_release;
+
+	mad->mad_hdr.method	 = method;
+	mad->mad_hdr.attr_id	 = cpu_to_be16(IB_SA_ATTR_GUID_INFO_REC);
+	mad->sa_hdr.comp_mask	 = comp_mask;
+
+	ib_pack(guidinfo_rec_table, ARRAY_SIZE(guidinfo_rec_table), rec,
+		mad->data);
+
+	*sa_query = &query->sa_query;
+
+	ret = send_mad(&query->sa_query, timeout_ms, gfp_mask);
+	if (ret < 0)
+		goto err2;
+
+	return ret;
+
+err2:
+	*sa_query = NULL;
+	ib_sa_client_put(query->sa_query.client);
+	free_mad(&query->sa_query);
+
+err1:
+	kfree(query);
+	return ret;
+}
+EXPORT_SYMBOL(ib_sa_guid_info_rec_query);
+
+static void send_handler(struct ib_mad_agent *agent,
+			 struct ib_mad_send_wc *mad_send_wc)
+{
+	struct ib_sa_query *query = mad_send_wc->send_buf->context[0];
+	unsigned long flags;
+
+	if (query->callback)
+		switch (mad_send_wc->status) {
+		case IB_WC_SUCCESS:
+			/* No callback -- already got recv */
+			break;
+		case IB_WC_RESP_TIMEOUT_ERR:
+			query->callback(query, -ETIMEDOUT, NULL);
+			break;
+		case IB_WC_WR_FLUSH_ERR:
+			query->callback(query, -EINTR, NULL);
+			break;
+		default:
+			query->callback(query, -EIO, NULL);
+			break;
+		}
+
+	spin_lock_irqsave(&idr_lock, flags);
+	idr_remove(&query_idr, query->id);
+	spin_unlock_irqrestore(&idr_lock, flags);
+
+	free_mad(query);
+	ib_sa_client_put(query->client);
+	query->release(query);
+}
+
+static void recv_handler(struct ib_mad_agent *mad_agent,
+			 struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct ib_sa_query *query;
+	struct ib_mad_send_buf *mad_buf;
+
+	mad_buf = (void *) (unsigned long) mad_recv_wc->wc->wr_id;
+	query = mad_buf->context[0];
+
+	if (query->callback) {
+		if (mad_recv_wc->wc->status == IB_WC_SUCCESS)
+			query->callback(query,
+					mad_recv_wc->recv_buf.mad->mad_hdr.status ?
+					-EINVAL : 0,
+					(struct ib_sa_mad *) mad_recv_wc->recv_buf.mad);
+		else
+			query->callback(query, -EIO, NULL);
+	}
+
+	ib_free_recv_mad(mad_recv_wc);
+}
+
+static void ib_sa_add_one(struct ib_device *device)
+{
+	struct ib_sa_device *sa_dev;
+	int s, e, i;
+
+	if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
+		return;
+
+	if (device->node_type == RDMA_NODE_IB_SWITCH)
+		s = e = 0;
+	else {
+		s = 1;
+		e = device->phys_port_cnt;
+	}
+
+	sa_dev = kzalloc(sizeof *sa_dev +
+			 (e - s + 1) * sizeof (struct ib_sa_port),
+			 GFP_KERNEL);
+	if (!sa_dev)
+		return;
+
+	sa_dev->start_port = s;
+	sa_dev->end_port   = e;
+
+	for (i = 0; i <= e - s; ++i) {
+		spin_lock_init(&sa_dev->port[i].ah_lock);
+		if (rdma_port_get_link_layer(device, i + 1) != IB_LINK_LAYER_INFINIBAND)
+			continue;
+
+		sa_dev->port[i].sm_ah    = NULL;
+		sa_dev->port[i].port_num = i + s;
+
+		sa_dev->port[i].agent =
+			ib_register_mad_agent(device, i + s, IB_QPT_GSI,
+					      NULL, 0, send_handler,
+					      recv_handler, sa_dev, 0);
+		if (IS_ERR(sa_dev->port[i].agent))
+			goto err;
+
+		INIT_WORK(&sa_dev->port[i].update_task, update_sm_ah);
+	}
+
+	ib_set_client_data(device, &sa_client, sa_dev);
+
+	/*
+	 * We register our event handler after everything is set up,
+	 * and then update our cached info after the event handler is
+	 * registered to avoid any problems if a port changes state
+	 * during our initialization.
+	 */
+
+	INIT_IB_EVENT_HANDLER(&sa_dev->event_handler, device, ib_sa_event);
+	if (ib_register_event_handler(&sa_dev->event_handler))
+		goto err;
+
+	for (i = 0; i <= e - s; ++i)
+		if (rdma_port_get_link_layer(device, i + 1) == IB_LINK_LAYER_INFINIBAND)
+			update_sm_ah(&sa_dev->port[i].update_task);
+
+	return;
+
+err:
+	while (--i >= 0)
+		if (rdma_port_get_link_layer(device, i + 1) == IB_LINK_LAYER_INFINIBAND)
+			ib_unregister_mad_agent(sa_dev->port[i].agent);
+
+	kfree(sa_dev);
+
+	return;
+}
+
+static void ib_sa_remove_one(struct ib_device *device)
+{
+	struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client);
+	int i;
+
+	if (!sa_dev)
+		return;
+
+	ib_unregister_event_handler(&sa_dev->event_handler);
+
+	flush_workqueue(ib_wq);
+
+	for (i = 0; i <= sa_dev->end_port - sa_dev->start_port; ++i) {
+		if (rdma_port_get_link_layer(device, i + 1) == IB_LINK_LAYER_INFINIBAND) {
+			ib_unregister_mad_agent(sa_dev->port[i].agent);
+			if (sa_dev->port[i].sm_ah)
+				kref_put(&sa_dev->port[i].sm_ah->ref, free_sm_ah);
+		}
+
+	}
+
+	kfree(sa_dev);
+}
+
+static int __init ib_sa_init(void)
+{
+	int ret;
+
+	get_random_bytes(&tid, sizeof tid);
+
+	ret = ib_register_client(&sa_client);
+	if (ret) {
+		printk(KERN_ERR "Couldn't register ib_sa client\n");
+		goto err1;
+	}
+
+	ret = mcast_init();
+	if (ret) {
+		printk(KERN_ERR "Couldn't initialize multicast handling\n");
+		goto err2;
+	}
+
+	return 0;
+err2:
+	ib_unregister_client(&sa_client);
+err1:
+	return ret;
+}
+
+static void __exit ib_sa_cleanup(void)
+{
+	mcast_cleanup();
+	ib_unregister_client(&sa_client);
+	idr_destroy(&query_idr);
+}
+
+module_init(ib_sa_init);
+module_exit(ib_sa_cleanup);
diff --git a/drivers/infiniband/core/smi.c b/drivers/infiniband/core/smi.c
new file mode 100644
index 0000000..5855e44
--- /dev/null
+++ b/drivers/infiniband/core/smi.c
@@ -0,0 +1,253 @@
+/*
+ * Copyright (c) 2004, 2005 Mellanox Technologies Ltd.  All rights reserved.
+ * Copyright (c) 2004, 2005 Infinicon Corporation.  All rights reserved.
+ * Copyright (c) 2004, 2005 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2004, 2005 Topspin Corporation.  All rights reserved.
+ * Copyright (c) 2004-2007 Voltaire Corporation.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <rdma/ib_smi.h>
+#include "smi.h"
+
+/*
+ * Fixup a directed route SMP for sending
+ * Return 0 if the SMP should be discarded
+ */
+enum smi_action smi_handle_dr_smp_send(struct ib_smp *smp,
+				       u8 node_type, int port_num)
+{
+	u8 hop_ptr, hop_cnt;
+
+	hop_ptr = smp->hop_ptr;
+	hop_cnt = smp->hop_cnt;
+
+	/* See section 14.2.2.2, Vol 1 IB spec */
+	/* C14-6 -- valid hop_cnt values are from 0 to 63 */
+	if (hop_cnt >= IB_SMP_MAX_PATH_HOPS)
+		return IB_SMI_DISCARD;
+
+	if (!ib_get_smp_direction(smp)) {
+		/* C14-9:1 */
+		if (hop_cnt && hop_ptr == 0) {
+			smp->hop_ptr++;
+			return (smp->initial_path[smp->hop_ptr] ==
+				port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+		}
+
+		/* C14-9:2 */
+		if (hop_ptr && hop_ptr < hop_cnt) {
+			if (node_type != RDMA_NODE_IB_SWITCH)
+				return IB_SMI_DISCARD;
+
+			/* smp->return_path set when received */
+			smp->hop_ptr++;
+			return (smp->initial_path[smp->hop_ptr] ==
+				port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+		}
+
+		/* C14-9:3 -- We're at the end of the DR segment of path */
+		if (hop_ptr == hop_cnt) {
+			/* smp->return_path set when received */
+			smp->hop_ptr++;
+			return (node_type == RDMA_NODE_IB_SWITCH ||
+				smp->dr_dlid == IB_LID_PERMISSIVE ?
+				IB_SMI_HANDLE : IB_SMI_DISCARD);
+		}
+
+		/* C14-9:4 -- hop_ptr = hop_cnt + 1 -> give to SMA/SM */
+		/* C14-9:5 -- Fail unreasonable hop pointer */
+		return (hop_ptr == hop_cnt + 1 ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+
+	} else {
+		/* C14-13:1 */
+		if (hop_cnt && hop_ptr == hop_cnt + 1) {
+			smp->hop_ptr--;
+			return (smp->return_path[smp->hop_ptr] ==
+				port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+		}
+
+		/* C14-13:2 */
+		if (2 <= hop_ptr && hop_ptr <= hop_cnt) {
+			if (node_type != RDMA_NODE_IB_SWITCH)
+				return IB_SMI_DISCARD;
+
+			smp->hop_ptr--;
+			return (smp->return_path[smp->hop_ptr] ==
+				port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+		}
+
+		/* C14-13:3 -- at the end of the DR segment of path */
+		if (hop_ptr == 1) {
+			smp->hop_ptr--;
+			/* C14-13:3 -- SMPs destined for SM shouldn't be here */
+			return (node_type == RDMA_NODE_IB_SWITCH ||
+				smp->dr_slid == IB_LID_PERMISSIVE ?
+				IB_SMI_HANDLE : IB_SMI_DISCARD);
+		}
+
+		/* C14-13:4 -- hop_ptr = 0 -> should have gone to SM */
+		if (hop_ptr == 0)
+			return IB_SMI_HANDLE;
+
+		/* C14-13:5 -- Check for unreasonable hop pointer */
+		return IB_SMI_DISCARD;
+	}
+}
+
+/*
+ * Adjust information for a received SMP
+ * Return 0 if the SMP should be dropped
+ */
+enum smi_action smi_handle_dr_smp_recv(struct ib_smp *smp, u8 node_type,
+				       int port_num, int phys_port_cnt)
+{
+	u8 hop_ptr, hop_cnt;
+
+	hop_ptr = smp->hop_ptr;
+	hop_cnt = smp->hop_cnt;
+
+	/* See section 14.2.2.2, Vol 1 IB spec */
+	/* C14-6 -- valid hop_cnt values are from 0 to 63 */
+	if (hop_cnt >= IB_SMP_MAX_PATH_HOPS)
+		return IB_SMI_DISCARD;
+
+	if (!ib_get_smp_direction(smp)) {
+		/* C14-9:1 -- sender should have incremented hop_ptr */
+		if (hop_cnt && hop_ptr == 0)
+			return IB_SMI_DISCARD;
+
+		/* C14-9:2 -- intermediate hop */
+		if (hop_ptr && hop_ptr < hop_cnt) {
+			if (node_type != RDMA_NODE_IB_SWITCH)
+				return IB_SMI_DISCARD;
+
+			smp->return_path[hop_ptr] = port_num;
+			/* smp->hop_ptr updated when sending */
+			return (smp->initial_path[hop_ptr+1] <= phys_port_cnt ?
+				IB_SMI_HANDLE : IB_SMI_DISCARD);
+		}
+
+		/* C14-9:3 -- We're at the end of the DR segment of path */
+		if (hop_ptr == hop_cnt) {
+			if (hop_cnt)
+				smp->return_path[hop_ptr] = port_num;
+			/* smp->hop_ptr updated when sending */
+
+			return (node_type == RDMA_NODE_IB_SWITCH ||
+				smp->dr_dlid == IB_LID_PERMISSIVE ?
+				IB_SMI_HANDLE : IB_SMI_DISCARD);
+		}
+
+		/* C14-9:4 -- hop_ptr = hop_cnt + 1 -> give to SMA/SM */
+		/* C14-9:5 -- fail unreasonable hop pointer */
+		return (hop_ptr == hop_cnt + 1 ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+
+	} else {
+
+		/* C14-13:1 */
+		if (hop_cnt && hop_ptr == hop_cnt + 1) {
+			smp->hop_ptr--;
+			return (smp->return_path[smp->hop_ptr] ==
+				port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+		}
+
+		/* C14-13:2 */
+		if (2 <= hop_ptr && hop_ptr <= hop_cnt) {
+			if (node_type != RDMA_NODE_IB_SWITCH)
+				return IB_SMI_DISCARD;
+
+			/* smp->hop_ptr updated when sending */
+			return (smp->return_path[hop_ptr-1] <= phys_port_cnt ?
+				IB_SMI_HANDLE : IB_SMI_DISCARD);
+		}
+
+		/* C14-13:3 -- We're at the end of the DR segment of path */
+		if (hop_ptr == 1) {
+			if (smp->dr_slid == IB_LID_PERMISSIVE) {
+				/* giving SMP to SM - update hop_ptr */
+				smp->hop_ptr--;
+				return IB_SMI_HANDLE;
+			}
+			/* smp->hop_ptr updated when sending */
+			return (node_type == RDMA_NODE_IB_SWITCH ?
+				IB_SMI_HANDLE : IB_SMI_DISCARD);
+		}
+
+		/* C14-13:4 -- hop_ptr = 0 -> give to SM */
+		/* C14-13:5 -- Check for unreasonable hop pointer */
+		return (hop_ptr == 0 ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+	}
+}
+
+enum smi_forward_action smi_check_forward_dr_smp(struct ib_smp *smp)
+{
+	u8 hop_ptr, hop_cnt;
+
+	hop_ptr = smp->hop_ptr;
+	hop_cnt = smp->hop_cnt;
+
+	if (!ib_get_smp_direction(smp)) {
+		/* C14-9:2 -- intermediate hop */
+		if (hop_ptr && hop_ptr < hop_cnt)
+			return IB_SMI_FORWARD;
+
+		/* C14-9:3 -- at the end of the DR segment of path */
+		if (hop_ptr == hop_cnt)
+			return (smp->dr_dlid == IB_LID_PERMISSIVE ?
+				IB_SMI_SEND : IB_SMI_LOCAL);
+
+		/* C14-9:4 -- hop_ptr = hop_cnt + 1 -> give to SMA/SM */
+		if (hop_ptr == hop_cnt + 1)
+			return IB_SMI_SEND;
+	} else {
+		/* C14-13:2  -- intermediate hop */
+		if (2 <= hop_ptr && hop_ptr <= hop_cnt)
+			return IB_SMI_FORWARD;
+
+		/* C14-13:3 -- at the end of the DR segment of path */
+		if (hop_ptr == 1)
+			return (smp->dr_slid != IB_LID_PERMISSIVE ?
+				IB_SMI_SEND : IB_SMI_LOCAL);
+	}
+	return IB_SMI_LOCAL;
+}
+
+/*
+ * Return the forwarding port number from initial_path for outgoing SMP and
+ * from return_path for returning SMP
+ */
+int smi_get_fwd_port(struct ib_smp *smp)
+{
+	return (!ib_get_smp_direction(smp) ? smp->initial_path[smp->hop_ptr+1] :
+		smp->return_path[smp->hop_ptr-1]);
+}
diff --git a/drivers/infiniband/core/smi.h b/drivers/infiniband/core/smi.h
new file mode 100644
index 0000000..aff96ba
--- /dev/null
+++ b/drivers/infiniband/core/smi.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2004 Mellanox Technologies Ltd.  All rights reserved.
+ * Copyright (c) 2004 Infinicon Corporation.  All rights reserved.
+ * Copyright (c) 2004 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation.  All rights reserved.
+ * Copyright (c) 2004-2007 Voltaire Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef __SMI_H_
+#define __SMI_H_
+
+#include <rdma/ib_smi.h>
+
+enum smi_action {
+	IB_SMI_DISCARD,
+	IB_SMI_HANDLE
+};
+
+enum smi_forward_action {
+	IB_SMI_LOCAL,	/* SMP should be completed up the stack */
+	IB_SMI_SEND,	/* received DR SMP should be forwarded to the send queue */
+	IB_SMI_FORWARD	/* SMP should be forwarded (for switches only) */
+};
+
+enum smi_action smi_handle_dr_smp_recv(struct ib_smp *smp, u8 node_type,
+				       int port_num, int phys_port_cnt);
+int smi_get_fwd_port(struct ib_smp *smp);
+extern enum smi_forward_action smi_check_forward_dr_smp(struct ib_smp *smp);
+extern enum smi_action smi_handle_dr_smp_send(struct ib_smp *smp,
+					      u8 node_type, int port_num);
+
+/*
+ * Return IB_SMI_HANDLE if the SMP should be handled by the local SMA/SM
+ * via process_mad
+ */
+static inline enum smi_action smi_check_local_smp(struct ib_smp *smp,
+						  struct ib_device *device)
+{
+	/* C14-9:3 -- We're at the end of the DR segment of path */
+	/* C14-9:4 -- Hop Pointer = Hop Count + 1 -> give to SMA/SM */
+	return ((device->process_mad &&
+		!ib_get_smp_direction(smp) &&
+		(smp->hop_ptr == smp->hop_cnt + 1)) ?
+		IB_SMI_HANDLE : IB_SMI_DISCARD);
+}
+
+/*
+ * Return IB_SMI_HANDLE if the SMP should be handled by the local SMA/SM
+ * via process_mad
+ */
+static inline enum smi_action smi_check_local_returning_smp(struct ib_smp *smp,
+						   struct ib_device *device)
+{
+	/* C14-13:3 -- We're at the end of the DR segment of path */
+	/* C14-13:4 -- Hop Pointer == 0 -> give to SM */
+	return ((device->process_mad &&
+		ib_get_smp_direction(smp) &&
+		!smp->hop_ptr) ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+}
+
+#endif	/* __SMI_H_ */
diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c
new file mode 100644
index 0000000..cbd0383
--- /dev/null
+++ b/drivers/infiniband/core/sysfs.c
@@ -0,0 +1,922 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies Ltd.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "core_priv.h"
+
+#include <linux/slab.h>
+#include <linux/stat.h>
+#include <linux/string.h>
+
+#include <rdma/ib_mad.h>
+
+struct ib_port {
+	struct kobject         kobj;
+	struct ib_device      *ibdev;
+	struct attribute_group gid_group;
+	struct attribute_group pkey_group;
+	u8                     port_num;
+};
+
+struct port_attribute {
+	struct attribute attr;
+	ssize_t (*show)(struct ib_port *, struct port_attribute *, char *buf);
+	ssize_t (*store)(struct ib_port *, struct port_attribute *,
+			 const char *buf, size_t count);
+};
+
+#define PORT_ATTR(_name, _mode, _show, _store) \
+struct port_attribute port_attr_##_name = __ATTR(_name, _mode, _show, _store)
+
+#define PORT_ATTR_RO(_name) \
+struct port_attribute port_attr_##_name = __ATTR_RO(_name)
+
+struct port_table_attribute {
+	struct port_attribute	attr;
+	char			name[8];
+	int			index;
+};
+
+static ssize_t port_attr_show(struct kobject *kobj,
+			      struct attribute *attr, char *buf)
+{
+	struct port_attribute *port_attr =
+		container_of(attr, struct port_attribute, attr);
+	struct ib_port *p = container_of(kobj, struct ib_port, kobj);
+
+	if (!port_attr->show)
+		return -EIO;
+
+	return port_attr->show(p, port_attr, buf);
+}
+
+static const struct sysfs_ops port_sysfs_ops = {
+	.show = port_attr_show
+};
+
+static ssize_t state_show(struct ib_port *p, struct port_attribute *unused,
+			  char *buf)
+{
+	struct ib_port_attr attr;
+	ssize_t ret;
+
+	static const char *state_name[] = {
+		[IB_PORT_NOP]		= "NOP",
+		[IB_PORT_DOWN]		= "DOWN",
+		[IB_PORT_INIT]		= "INIT",
+		[IB_PORT_ARMED]		= "ARMED",
+		[IB_PORT_ACTIVE]	= "ACTIVE",
+		[IB_PORT_ACTIVE_DEFER]	= "ACTIVE_DEFER"
+	};
+
+	ret = ib_query_port(p->ibdev, p->port_num, &attr);
+	if (ret)
+		return ret;
+
+	return sprintf(buf, "%d: %s\n", attr.state,
+		       attr.state >= 0 && attr.state < ARRAY_SIZE(state_name) ?
+		       state_name[attr.state] : "UNKNOWN");
+}
+
+static ssize_t lid_show(struct ib_port *p, struct port_attribute *unused,
+			char *buf)
+{
+	struct ib_port_attr attr;
+	ssize_t ret;
+
+	ret = ib_query_port(p->ibdev, p->port_num, &attr);
+	if (ret)
+		return ret;
+
+	return sprintf(buf, "0x%x\n", attr.lid);
+}
+
+static ssize_t lid_mask_count_show(struct ib_port *p,
+				   struct port_attribute *unused,
+				   char *buf)
+{
+	struct ib_port_attr attr;
+	ssize_t ret;
+
+	ret = ib_query_port(p->ibdev, p->port_num, &attr);
+	if (ret)
+		return ret;
+
+	return sprintf(buf, "%d\n", attr.lmc);
+}
+
+static ssize_t sm_lid_show(struct ib_port *p, struct port_attribute *unused,
+			   char *buf)
+{
+	struct ib_port_attr attr;
+	ssize_t ret;
+
+	ret = ib_query_port(p->ibdev, p->port_num, &attr);
+	if (ret)
+		return ret;
+
+	return sprintf(buf, "0x%x\n", attr.sm_lid);
+}
+
+static ssize_t sm_sl_show(struct ib_port *p, struct port_attribute *unused,
+			  char *buf)
+{
+	struct ib_port_attr attr;
+	ssize_t ret;
+
+	ret = ib_query_port(p->ibdev, p->port_num, &attr);
+	if (ret)
+		return ret;
+
+	return sprintf(buf, "%d\n", attr.sm_sl);
+}
+
+static ssize_t cap_mask_show(struct ib_port *p, struct port_attribute *unused,
+			     char *buf)
+{
+	struct ib_port_attr attr;
+	ssize_t ret;
+
+	ret = ib_query_port(p->ibdev, p->port_num, &attr);
+	if (ret)
+		return ret;
+
+	return sprintf(buf, "0x%08x\n", attr.port_cap_flags);
+}
+
+static ssize_t rate_show(struct ib_port *p, struct port_attribute *unused,
+			 char *buf)
+{
+	struct ib_port_attr attr;
+	char *speed = "";
+	int rate;		/* in deci-Gb/sec */
+	ssize_t ret;
+
+	ret = ib_query_port(p->ibdev, p->port_num, &attr);
+	if (ret)
+		return ret;
+
+	switch (attr.active_speed) {
+	case IB_SPEED_DDR:
+		speed = " DDR";
+		rate = 50;
+		break;
+	case IB_SPEED_QDR:
+		speed = " QDR";
+		rate = 100;
+		break;
+	case IB_SPEED_FDR10:
+		speed = " FDR10";
+		rate = 100;
+		break;
+	case IB_SPEED_FDR:
+		speed = " FDR";
+		rate = 140;
+		break;
+	case IB_SPEED_EDR:
+		speed = " EDR";
+		rate = 250;
+		break;
+	case IB_SPEED_SDR:
+	default:		/* default to SDR for invalid rates */
+		rate = 25;
+		break;
+	}
+
+	rate *= ib_width_enum_to_int(attr.active_width);
+	if (rate < 0)
+		return -EINVAL;
+
+	return sprintf(buf, "%d%s Gb/sec (%dX%s)\n",
+		       rate / 10, rate % 10 ? ".5" : "",
+		       ib_width_enum_to_int(attr.active_width), speed);
+}
+
+static ssize_t phys_state_show(struct ib_port *p, struct port_attribute *unused,
+			       char *buf)
+{
+	struct ib_port_attr attr;
+
+	ssize_t ret;
+
+	ret = ib_query_port(p->ibdev, p->port_num, &attr);
+	if (ret)
+		return ret;
+
+	switch (attr.phys_state) {
+	case 1:  return sprintf(buf, "1: Sleep\n");
+	case 2:  return sprintf(buf, "2: Polling\n");
+	case 3:  return sprintf(buf, "3: Disabled\n");
+	case 4:  return sprintf(buf, "4: PortConfigurationTraining\n");
+	case 5:  return sprintf(buf, "5: LinkUp\n");
+	case 6:  return sprintf(buf, "6: LinkErrorRecovery\n");
+	case 7:  return sprintf(buf, "7: Phy Test\n");
+	default: return sprintf(buf, "%d: <unknown>\n", attr.phys_state);
+	}
+}
+
+static ssize_t link_layer_show(struct ib_port *p, struct port_attribute *unused,
+			       char *buf)
+{
+	switch (rdma_port_get_link_layer(p->ibdev, p->port_num)) {
+	case IB_LINK_LAYER_INFINIBAND:
+		return sprintf(buf, "%s\n", "InfiniBand");
+	case IB_LINK_LAYER_ETHERNET:
+		return sprintf(buf, "%s\n", "Ethernet");
+	default:
+		return sprintf(buf, "%s\n", "Unknown");
+	}
+}
+
+static PORT_ATTR_RO(state);
+static PORT_ATTR_RO(lid);
+static PORT_ATTR_RO(lid_mask_count);
+static PORT_ATTR_RO(sm_lid);
+static PORT_ATTR_RO(sm_sl);
+static PORT_ATTR_RO(cap_mask);
+static PORT_ATTR_RO(rate);
+static PORT_ATTR_RO(phys_state);
+static PORT_ATTR_RO(link_layer);
+
+static struct attribute *port_default_attrs[] = {
+	&port_attr_state.attr,
+	&port_attr_lid.attr,
+	&port_attr_lid_mask_count.attr,
+	&port_attr_sm_lid.attr,
+	&port_attr_sm_sl.attr,
+	&port_attr_cap_mask.attr,
+	&port_attr_rate.attr,
+	&port_attr_phys_state.attr,
+	&port_attr_link_layer.attr,
+	NULL
+};
+
+static ssize_t show_port_gid(struct ib_port *p, struct port_attribute *attr,
+			     char *buf)
+{
+	struct port_table_attribute *tab_attr =
+		container_of(attr, struct port_table_attribute, attr);
+	union ib_gid gid;
+	ssize_t ret;
+
+	ret = ib_query_gid(p->ibdev, p->port_num, tab_attr->index, &gid);
+	if (ret)
+		return ret;
+
+	return sprintf(buf, "%pI6\n", gid.raw);
+}
+
+static ssize_t show_port_pkey(struct ib_port *p, struct port_attribute *attr,
+			      char *buf)
+{
+	struct port_table_attribute *tab_attr =
+		container_of(attr, struct port_table_attribute, attr);
+	u16 pkey;
+	ssize_t ret;
+
+	ret = ib_query_pkey(p->ibdev, p->port_num, tab_attr->index, &pkey);
+	if (ret)
+		return ret;
+
+	return sprintf(buf, "0x%04x\n", pkey);
+}
+
+#define PORT_PMA_ATTR(_name, _counter, _width, _offset)			\
+struct port_table_attribute port_pma_attr_##_name = {			\
+	.attr  = __ATTR(_name, S_IRUGO, show_pma_counter, NULL),	\
+	.index = (_offset) | ((_width) << 16) | ((_counter) << 24)	\
+}
+
+static ssize_t show_pma_counter(struct ib_port *p, struct port_attribute *attr,
+				char *buf)
+{
+	struct port_table_attribute *tab_attr =
+		container_of(attr, struct port_table_attribute, attr);
+	int offset = tab_attr->index & 0xffff;
+	int width  = (tab_attr->index >> 16) & 0xff;
+	struct ib_mad *in_mad  = NULL;
+	struct ib_mad *out_mad = NULL;
+	ssize_t ret;
+
+	if (!p->ibdev->process_mad)
+		return sprintf(buf, "N/A (no PMA)\n");
+
+	in_mad  = kzalloc(sizeof *in_mad, GFP_KERNEL);
+	out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+	if (!in_mad || !out_mad) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	in_mad->mad_hdr.base_version  = 1;
+	in_mad->mad_hdr.mgmt_class    = IB_MGMT_CLASS_PERF_MGMT;
+	in_mad->mad_hdr.class_version = 1;
+	in_mad->mad_hdr.method        = IB_MGMT_METHOD_GET;
+	in_mad->mad_hdr.attr_id       = cpu_to_be16(0x12); /* PortCounters */
+
+	in_mad->data[41] = p->port_num;	/* PortSelect field */
+
+	if ((p->ibdev->process_mad(p->ibdev, IB_MAD_IGNORE_MKEY,
+		 p->port_num, NULL, NULL, in_mad, out_mad) &
+	     (IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY)) !=
+	    (IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	switch (width) {
+	case 4:
+		ret = sprintf(buf, "%u\n", (out_mad->data[40 + offset / 8] >>
+					    (4 - (offset % 8))) & 0xf);
+		break;
+	case 8:
+		ret = sprintf(buf, "%u\n", out_mad->data[40 + offset / 8]);
+		break;
+	case 16:
+		ret = sprintf(buf, "%u\n",
+			      be16_to_cpup((__be16 *)(out_mad->data + 40 + offset / 8)));
+		break;
+	case 32:
+		ret = sprintf(buf, "%u\n",
+			      be32_to_cpup((__be32 *)(out_mad->data + 40 + offset / 8)));
+		break;
+	default:
+		ret = 0;
+	}
+
+out:
+	kfree(in_mad);
+	kfree(out_mad);
+
+	return ret;
+}
+
+static PORT_PMA_ATTR(symbol_error		    ,  0, 16,  32);
+static PORT_PMA_ATTR(link_error_recovery	    ,  1,  8,  48);
+static PORT_PMA_ATTR(link_downed		    ,  2,  8,  56);
+static PORT_PMA_ATTR(port_rcv_errors		    ,  3, 16,  64);
+static PORT_PMA_ATTR(port_rcv_remote_physical_errors,  4, 16,  80);
+static PORT_PMA_ATTR(port_rcv_switch_relay_errors   ,  5, 16,  96);
+static PORT_PMA_ATTR(port_xmit_discards		    ,  6, 16, 112);
+static PORT_PMA_ATTR(port_xmit_constraint_errors    ,  7,  8, 128);
+static PORT_PMA_ATTR(port_rcv_constraint_errors	    ,  8,  8, 136);
+static PORT_PMA_ATTR(local_link_integrity_errors    ,  9,  4, 152);
+static PORT_PMA_ATTR(excessive_buffer_overrun_errors, 10,  4, 156);
+static PORT_PMA_ATTR(VL15_dropped		    , 11, 16, 176);
+static PORT_PMA_ATTR(port_xmit_data		    , 12, 32, 192);
+static PORT_PMA_ATTR(port_rcv_data		    , 13, 32, 224);
+static PORT_PMA_ATTR(port_xmit_packets		    , 14, 32, 256);
+static PORT_PMA_ATTR(port_rcv_packets		    , 15, 32, 288);
+
+static struct attribute *pma_attrs[] = {
+	&port_pma_attr_symbol_error.attr.attr,
+	&port_pma_attr_link_error_recovery.attr.attr,
+	&port_pma_attr_link_downed.attr.attr,
+	&port_pma_attr_port_rcv_errors.attr.attr,
+	&port_pma_attr_port_rcv_remote_physical_errors.attr.attr,
+	&port_pma_attr_port_rcv_switch_relay_errors.attr.attr,
+	&port_pma_attr_port_xmit_discards.attr.attr,
+	&port_pma_attr_port_xmit_constraint_errors.attr.attr,
+	&port_pma_attr_port_rcv_constraint_errors.attr.attr,
+	&port_pma_attr_local_link_integrity_errors.attr.attr,
+	&port_pma_attr_excessive_buffer_overrun_errors.attr.attr,
+	&port_pma_attr_VL15_dropped.attr.attr,
+	&port_pma_attr_port_xmit_data.attr.attr,
+	&port_pma_attr_port_rcv_data.attr.attr,
+	&port_pma_attr_port_xmit_packets.attr.attr,
+	&port_pma_attr_port_rcv_packets.attr.attr,
+	NULL
+};
+
+static struct attribute_group pma_group = {
+	.name  = "counters",
+	.attrs  = pma_attrs
+};
+
+static void ib_port_release(struct kobject *kobj)
+{
+	struct ib_port *p = container_of(kobj, struct ib_port, kobj);
+	struct attribute *a;
+	int i;
+
+	if (p->gid_group.attrs) {
+		for (i = 0; (a = p->gid_group.attrs[i]); ++i)
+			kfree(a);
+
+		kfree(p->gid_group.attrs);
+	}
+
+	if (p->pkey_group.attrs) {
+		for (i = 0; (a = p->pkey_group.attrs[i]); ++i)
+			kfree(a);
+
+		kfree(p->pkey_group.attrs);
+	}
+
+	kfree(p);
+}
+
+static struct kobj_type port_type = {
+	.release       = ib_port_release,
+	.sysfs_ops     = &port_sysfs_ops,
+	.default_attrs = port_default_attrs
+};
+
+static void ib_device_release(struct device *device)
+{
+	struct ib_device *dev = container_of(device, struct ib_device, dev);
+
+	kfree(dev);
+}
+
+static int ib_device_uevent(struct device *device,
+			    struct kobj_uevent_env *env)
+{
+	struct ib_device *dev = container_of(device, struct ib_device, dev);
+
+	if (add_uevent_var(env, "NAME=%s", dev->name))
+		return -ENOMEM;
+
+	/*
+	 * It would be nice to pass the node GUID with the event...
+	 */
+
+	return 0;
+}
+
+static struct attribute **
+alloc_group_attrs(ssize_t (*show)(struct ib_port *,
+				  struct port_attribute *, char *buf),
+		  int len)
+{
+	struct attribute **tab_attr;
+	struct port_table_attribute *element;
+	int i;
+
+	tab_attr = kcalloc(1 + len, sizeof(struct attribute *), GFP_KERNEL);
+	if (!tab_attr)
+		return NULL;
+
+	for (i = 0; i < len; i++) {
+		element = kzalloc(sizeof(struct port_table_attribute),
+				  GFP_KERNEL);
+		if (!element)
+			goto err;
+
+		if (snprintf(element->name, sizeof(element->name),
+			     "%d", i) >= sizeof(element->name)) {
+			kfree(element);
+			goto err;
+		}
+
+		element->attr.attr.name  = element->name;
+		element->attr.attr.mode  = S_IRUGO;
+		element->attr.show       = show;
+		element->index		 = i;
+		sysfs_attr_init(&element->attr.attr);
+
+		tab_attr[i] = &element->attr.attr;
+	}
+
+	return tab_attr;
+
+err:
+	while (--i >= 0)
+		kfree(tab_attr[i]);
+	kfree(tab_attr);
+	return NULL;
+}
+
+static int add_port(struct ib_device *device, int port_num,
+		    int (*port_callback)(struct ib_device *,
+					 u8, struct kobject *))
+{
+	struct ib_port *p;
+	struct ib_port_attr attr;
+	int i;
+	int ret;
+
+	ret = ib_query_port(device, port_num, &attr);
+	if (ret)
+		return ret;
+
+	p = kzalloc(sizeof *p, GFP_KERNEL);
+	if (!p)
+		return -ENOMEM;
+
+	p->ibdev      = device;
+	p->port_num   = port_num;
+
+	ret = kobject_init_and_add(&p->kobj, &port_type,
+				   device->ports_parent,
+				   "%d", port_num);
+	if (ret) {
+		kfree(p);
+		return ret;
+	}
+
+	ret = sysfs_create_group(&p->kobj, &pma_group);
+	if (ret)
+		goto err_put;
+
+	p->gid_group.name  = "gids";
+	p->gid_group.attrs = alloc_group_attrs(show_port_gid, attr.gid_tbl_len);
+	if (!p->gid_group.attrs) {
+		ret = -ENOMEM;
+		goto err_remove_pma;
+	}
+
+	ret = sysfs_create_group(&p->kobj, &p->gid_group);
+	if (ret)
+		goto err_free_gid;
+
+	p->pkey_group.name  = "pkeys";
+	p->pkey_group.attrs = alloc_group_attrs(show_port_pkey,
+						attr.pkey_tbl_len);
+	if (!p->pkey_group.attrs) {
+		ret = -ENOMEM;
+		goto err_remove_gid;
+	}
+
+	ret = sysfs_create_group(&p->kobj, &p->pkey_group);
+	if (ret)
+		goto err_free_pkey;
+
+	if (port_callback) {
+		ret = port_callback(device, port_num, &p->kobj);
+		if (ret)
+			goto err_remove_pkey;
+	}
+
+	list_add_tail(&p->kobj.entry, &device->port_list);
+
+	kobject_uevent(&p->kobj, KOBJ_ADD);
+	return 0;
+
+err_remove_pkey:
+	sysfs_remove_group(&p->kobj, &p->pkey_group);
+
+err_free_pkey:
+	for (i = 0; i < attr.pkey_tbl_len; ++i)
+		kfree(p->pkey_group.attrs[i]);
+
+	kfree(p->pkey_group.attrs);
+	p->pkey_group.attrs = NULL;
+
+err_remove_gid:
+	sysfs_remove_group(&p->kobj, &p->gid_group);
+
+err_free_gid:
+	for (i = 0; i < attr.gid_tbl_len; ++i)
+		kfree(p->gid_group.attrs[i]);
+
+	kfree(p->gid_group.attrs);
+	p->gid_group.attrs = NULL;
+
+err_remove_pma:
+	sysfs_remove_group(&p->kobj, &pma_group);
+
+err_put:
+	kobject_put(&p->kobj);
+	return ret;
+}
+
+static ssize_t show_node_type(struct device *device,
+			      struct device_attribute *attr, char *buf)
+{
+	struct ib_device *dev = container_of(device, struct ib_device, dev);
+
+	switch (dev->node_type) {
+	case RDMA_NODE_IB_CA:	  return sprintf(buf, "%d: CA\n", dev->node_type);
+	case RDMA_NODE_RNIC:	  return sprintf(buf, "%d: RNIC\n", dev->node_type);
+	case RDMA_NODE_USNIC:	  return sprintf(buf, "%d: usNIC\n", dev->node_type);
+	case RDMA_NODE_USNIC_UDP: return sprintf(buf, "%d: usNIC UDP\n", dev->node_type);
+	case RDMA_NODE_IB_SWITCH: return sprintf(buf, "%d: switch\n", dev->node_type);
+	case RDMA_NODE_IB_ROUTER: return sprintf(buf, "%d: router\n", dev->node_type);
+	default:		  return sprintf(buf, "%d: <unknown>\n", dev->node_type);
+	}
+}
+
+static ssize_t show_sys_image_guid(struct device *device,
+				   struct device_attribute *dev_attr, char *buf)
+{
+	struct ib_device *dev = container_of(device, struct ib_device, dev);
+	struct ib_device_attr attr;
+	ssize_t ret;
+
+	ret = ib_query_device(dev, &attr);
+	if (ret)
+		return ret;
+
+	return sprintf(buf, "%04x:%04x:%04x:%04x\n",
+		       be16_to_cpu(((__be16 *) &attr.sys_image_guid)[0]),
+		       be16_to_cpu(((__be16 *) &attr.sys_image_guid)[1]),
+		       be16_to_cpu(((__be16 *) &attr.sys_image_guid)[2]),
+		       be16_to_cpu(((__be16 *) &attr.sys_image_guid)[3]));
+}
+
+static ssize_t show_node_guid(struct device *device,
+			      struct device_attribute *attr, char *buf)
+{
+	struct ib_device *dev = container_of(device, struct ib_device, dev);
+
+	return sprintf(buf, "%04x:%04x:%04x:%04x\n",
+		       be16_to_cpu(((__be16 *) &dev->node_guid)[0]),
+		       be16_to_cpu(((__be16 *) &dev->node_guid)[1]),
+		       be16_to_cpu(((__be16 *) &dev->node_guid)[2]),
+		       be16_to_cpu(((__be16 *) &dev->node_guid)[3]));
+}
+
+static ssize_t show_node_desc(struct device *device,
+			      struct device_attribute *attr, char *buf)
+{
+	struct ib_device *dev = container_of(device, struct ib_device, dev);
+
+	return sprintf(buf, "%.64s\n", dev->node_desc);
+}
+
+static ssize_t set_node_desc(struct device *device,
+			     struct device_attribute *attr,
+			     const char *buf, size_t count)
+{
+	struct ib_device *dev = container_of(device, struct ib_device, dev);
+	struct ib_device_modify desc = {};
+	int ret;
+
+	if (!dev->modify_device)
+		return -EIO;
+
+	memcpy(desc.node_desc, buf, min_t(int, count, 64));
+	ret = ib_modify_device(dev, IB_DEVICE_MODIFY_NODE_DESC, &desc);
+	if (ret)
+		return ret;
+
+	return count;
+}
+
+static DEVICE_ATTR(node_type, S_IRUGO, show_node_type, NULL);
+static DEVICE_ATTR(sys_image_guid, S_IRUGO, show_sys_image_guid, NULL);
+static DEVICE_ATTR(node_guid, S_IRUGO, show_node_guid, NULL);
+static DEVICE_ATTR(node_desc, S_IRUGO | S_IWUSR, show_node_desc, set_node_desc);
+
+static struct device_attribute *ib_class_attributes[] = {
+	&dev_attr_node_type,
+	&dev_attr_sys_image_guid,
+	&dev_attr_node_guid,
+	&dev_attr_node_desc
+};
+
+static struct class ib_class = {
+	.name    = "infiniband",
+	.dev_release = ib_device_release,
+	.dev_uevent = ib_device_uevent,
+};
+
+/* Show a given an attribute in the statistics group */
+static ssize_t show_protocol_stat(const struct device *device,
+			    struct device_attribute *attr, char *buf,
+			    unsigned offset)
+{
+	struct ib_device *dev = container_of(device, struct ib_device, dev);
+	union rdma_protocol_stats stats;
+	ssize_t ret;
+
+	ret = dev->get_protocol_stats(dev, &stats);
+	if (ret)
+		return ret;
+
+	return sprintf(buf, "%llu\n",
+		       (unsigned long long) ((u64 *) &stats)[offset]);
+}
+
+/* generate a read-only iwarp statistics attribute */
+#define IW_STATS_ENTRY(name)						\
+static ssize_t show_##name(struct device *device,			\
+			   struct device_attribute *attr, char *buf)	\
+{									\
+	return show_protocol_stat(device, attr, buf,			\
+				  offsetof(struct iw_protocol_stats, name) / \
+				  sizeof (u64));			\
+}									\
+static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
+
+IW_STATS_ENTRY(ipInReceives);
+IW_STATS_ENTRY(ipInHdrErrors);
+IW_STATS_ENTRY(ipInTooBigErrors);
+IW_STATS_ENTRY(ipInNoRoutes);
+IW_STATS_ENTRY(ipInAddrErrors);
+IW_STATS_ENTRY(ipInUnknownProtos);
+IW_STATS_ENTRY(ipInTruncatedPkts);
+IW_STATS_ENTRY(ipInDiscards);
+IW_STATS_ENTRY(ipInDelivers);
+IW_STATS_ENTRY(ipOutForwDatagrams);
+IW_STATS_ENTRY(ipOutRequests);
+IW_STATS_ENTRY(ipOutDiscards);
+IW_STATS_ENTRY(ipOutNoRoutes);
+IW_STATS_ENTRY(ipReasmTimeout);
+IW_STATS_ENTRY(ipReasmReqds);
+IW_STATS_ENTRY(ipReasmOKs);
+IW_STATS_ENTRY(ipReasmFails);
+IW_STATS_ENTRY(ipFragOKs);
+IW_STATS_ENTRY(ipFragFails);
+IW_STATS_ENTRY(ipFragCreates);
+IW_STATS_ENTRY(ipInMcastPkts);
+IW_STATS_ENTRY(ipOutMcastPkts);
+IW_STATS_ENTRY(ipInBcastPkts);
+IW_STATS_ENTRY(ipOutBcastPkts);
+IW_STATS_ENTRY(tcpRtoAlgorithm);
+IW_STATS_ENTRY(tcpRtoMin);
+IW_STATS_ENTRY(tcpRtoMax);
+IW_STATS_ENTRY(tcpMaxConn);
+IW_STATS_ENTRY(tcpActiveOpens);
+IW_STATS_ENTRY(tcpPassiveOpens);
+IW_STATS_ENTRY(tcpAttemptFails);
+IW_STATS_ENTRY(tcpEstabResets);
+IW_STATS_ENTRY(tcpCurrEstab);
+IW_STATS_ENTRY(tcpInSegs);
+IW_STATS_ENTRY(tcpOutSegs);
+IW_STATS_ENTRY(tcpRetransSegs);
+IW_STATS_ENTRY(tcpInErrs);
+IW_STATS_ENTRY(tcpOutRsts);
+
+static struct attribute *iw_proto_stats_attrs[] = {
+	&dev_attr_ipInReceives.attr,
+	&dev_attr_ipInHdrErrors.attr,
+	&dev_attr_ipInTooBigErrors.attr,
+	&dev_attr_ipInNoRoutes.attr,
+	&dev_attr_ipInAddrErrors.attr,
+	&dev_attr_ipInUnknownProtos.attr,
+	&dev_attr_ipInTruncatedPkts.attr,
+	&dev_attr_ipInDiscards.attr,
+	&dev_attr_ipInDelivers.attr,
+	&dev_attr_ipOutForwDatagrams.attr,
+	&dev_attr_ipOutRequests.attr,
+	&dev_attr_ipOutDiscards.attr,
+	&dev_attr_ipOutNoRoutes.attr,
+	&dev_attr_ipReasmTimeout.attr,
+	&dev_attr_ipReasmReqds.attr,
+	&dev_attr_ipReasmOKs.attr,
+	&dev_attr_ipReasmFails.attr,
+	&dev_attr_ipFragOKs.attr,
+	&dev_attr_ipFragFails.attr,
+	&dev_attr_ipFragCreates.attr,
+	&dev_attr_ipInMcastPkts.attr,
+	&dev_attr_ipOutMcastPkts.attr,
+	&dev_attr_ipInBcastPkts.attr,
+	&dev_attr_ipOutBcastPkts.attr,
+	&dev_attr_tcpRtoAlgorithm.attr,
+	&dev_attr_tcpRtoMin.attr,
+	&dev_attr_tcpRtoMax.attr,
+	&dev_attr_tcpMaxConn.attr,
+	&dev_attr_tcpActiveOpens.attr,
+	&dev_attr_tcpPassiveOpens.attr,
+	&dev_attr_tcpAttemptFails.attr,
+	&dev_attr_tcpEstabResets.attr,
+	&dev_attr_tcpCurrEstab.attr,
+	&dev_attr_tcpInSegs.attr,
+	&dev_attr_tcpOutSegs.attr,
+	&dev_attr_tcpRetransSegs.attr,
+	&dev_attr_tcpInErrs.attr,
+	&dev_attr_tcpOutRsts.attr,
+	NULL
+};
+
+static struct attribute_group iw_stats_group = {
+	.name	= "proto_stats",
+	.attrs	= iw_proto_stats_attrs,
+};
+
+static void free_port_list_attributes(struct ib_device *device)
+{
+	struct kobject *p, *t;
+
+	list_for_each_entry_safe(p, t, &device->port_list, entry) {
+		struct ib_port *port = container_of(p, struct ib_port, kobj);
+		list_del(&p->entry);
+		sysfs_remove_group(p, &pma_group);
+		sysfs_remove_group(p, &port->pkey_group);
+		sysfs_remove_group(p, &port->gid_group);
+		kobject_put(p);
+	}
+
+	kobject_put(device->ports_parent);
+}
+
+int ib_device_register_sysfs(struct ib_device *device,
+			     int (*port_callback)(struct ib_device *,
+						  u8, struct kobject *))
+{
+	struct device *class_dev = &device->dev;
+	int ret;
+	int i;
+
+	class_dev->class      = &ib_class;
+	class_dev->parent     = device->dma_device;
+	dev_set_name(class_dev, "%s", device->name);
+	dev_set_drvdata(class_dev, device);
+
+	INIT_LIST_HEAD(&device->port_list);
+
+	ret = device_register(class_dev);
+	if (ret)
+		goto err;
+
+	for (i = 0; i < ARRAY_SIZE(ib_class_attributes); ++i) {
+		ret = device_create_file(class_dev, ib_class_attributes[i]);
+		if (ret)
+			goto err_unregister;
+	}
+
+	device->ports_parent = kobject_create_and_add("ports",
+						      &class_dev->kobj);
+	if (!device->ports_parent) {
+		ret = -ENOMEM;
+		goto err_put;
+	}
+
+	if (device->node_type == RDMA_NODE_IB_SWITCH) {
+		ret = add_port(device, 0, port_callback);
+		if (ret)
+			goto err_put;
+	} else {
+		for (i = 1; i <= device->phys_port_cnt; ++i) {
+			ret = add_port(device, i, port_callback);
+			if (ret)
+				goto err_put;
+		}
+	}
+
+	if (device->node_type == RDMA_NODE_RNIC && device->get_protocol_stats) {
+		ret = sysfs_create_group(&class_dev->kobj, &iw_stats_group);
+		if (ret)
+			goto err_put;
+	}
+
+	return 0;
+
+err_put:
+	free_port_list_attributes(device);
+
+err_unregister:
+	device_unregister(class_dev);
+
+err:
+	return ret;
+}
+
+void ib_device_unregister_sysfs(struct ib_device *device)
+{
+	/* Hold kobject until ib_dealloc_device() */
+	struct kobject *kobj_dev = kobject_get(&device->dev.kobj);
+	int i;
+
+	if (device->node_type == RDMA_NODE_RNIC && device->get_protocol_stats)
+		sysfs_remove_group(kobj_dev, &iw_stats_group);
+
+	free_port_list_attributes(device);
+
+	for (i = 0; i < ARRAY_SIZE(ib_class_attributes); ++i)
+		device_remove_file(&device->dev, ib_class_attributes[i]);
+
+	device_unregister(&device->dev);
+}
+
+int ib_sysfs_setup(void)
+{
+	return class_register(&ib_class);
+}
+
+void ib_sysfs_cleanup(void)
+{
+	class_unregister(&ib_class);
+}
diff --git a/drivers/infiniband/core/ucm.c b/drivers/infiniband/core/ucm.c
new file mode 100644
index 0000000..f2f6393
--- /dev/null
+++ b/drivers/infiniband/core/ucm.c
@@ -0,0 +1,1370 @@
+/*
+ * Copyright (c) 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Intel Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *	copyright notice, this list of conditions and the following
+ *	disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *	copyright notice, this list of conditions and the following
+ *	disclaimer in the documentation and/or other materials
+ *	provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/completion.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/poll.h>
+#include <linux/sched.h>
+#include <linux/file.h>
+#include <linux/mount.h>
+#include <linux/cdev.h>
+#include <linux/idr.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+
+#include <asm/uaccess.h>
+
+#include <rdma/ib_cm.h>
+#include <rdma/ib_user_cm.h>
+#include <rdma/ib_marshall.h>
+
+MODULE_AUTHOR("Libor Michalek");
+MODULE_DESCRIPTION("InfiniBand userspace Connection Manager access");
+MODULE_LICENSE("Dual BSD/GPL");
+
+struct ib_ucm_device {
+	int			devnum;
+	struct cdev		cdev;
+	struct device		dev;
+	struct ib_device	*ib_dev;
+};
+
+struct ib_ucm_file {
+	struct mutex file_mutex;
+	struct file *filp;
+	struct ib_ucm_device *device;
+
+	struct list_head  ctxs;
+	struct list_head  events;
+	wait_queue_head_t poll_wait;
+};
+
+struct ib_ucm_context {
+	int                 id;
+	struct completion   comp;
+	atomic_t            ref;
+	int		    events_reported;
+
+	struct ib_ucm_file *file;
+	struct ib_cm_id    *cm_id;
+	__u64		   uid;
+
+	struct list_head    events;    /* list of pending events. */
+	struct list_head    file_list; /* member in file ctx list */
+};
+
+struct ib_ucm_event {
+	struct ib_ucm_context *ctx;
+	struct list_head file_list; /* member in file event list */
+	struct list_head ctx_list;  /* member in ctx event list */
+
+	struct ib_cm_id *cm_id;
+	struct ib_ucm_event_resp resp;
+	void *data;
+	void *info;
+	int data_len;
+	int info_len;
+};
+
+enum {
+	IB_UCM_MAJOR = 231,
+	IB_UCM_BASE_MINOR = 224,
+	IB_UCM_MAX_DEVICES = 32
+};
+
+#define IB_UCM_BASE_DEV MKDEV(IB_UCM_MAJOR, IB_UCM_BASE_MINOR)
+
+static void ib_ucm_add_one(struct ib_device *device);
+static void ib_ucm_remove_one(struct ib_device *device);
+
+static struct ib_client ucm_client = {
+	.name   = "ucm",
+	.add    = ib_ucm_add_one,
+	.remove = ib_ucm_remove_one
+};
+
+static DEFINE_MUTEX(ctx_id_mutex);
+static DEFINE_IDR(ctx_id_table);
+static DECLARE_BITMAP(dev_map, IB_UCM_MAX_DEVICES);
+
+static struct ib_ucm_context *ib_ucm_ctx_get(struct ib_ucm_file *file, int id)
+{
+	struct ib_ucm_context *ctx;
+
+	mutex_lock(&ctx_id_mutex);
+	ctx = idr_find(&ctx_id_table, id);
+	if (!ctx)
+		ctx = ERR_PTR(-ENOENT);
+	else if (ctx->file != file)
+		ctx = ERR_PTR(-EINVAL);
+	else
+		atomic_inc(&ctx->ref);
+	mutex_unlock(&ctx_id_mutex);
+
+	return ctx;
+}
+
+static void ib_ucm_ctx_put(struct ib_ucm_context *ctx)
+{
+	if (atomic_dec_and_test(&ctx->ref))
+		complete(&ctx->comp);
+}
+
+static inline int ib_ucm_new_cm_id(int event)
+{
+	return event == IB_CM_REQ_RECEIVED || event == IB_CM_SIDR_REQ_RECEIVED;
+}
+
+static void ib_ucm_cleanup_events(struct ib_ucm_context *ctx)
+{
+	struct ib_ucm_event *uevent;
+
+	mutex_lock(&ctx->file->file_mutex);
+	list_del(&ctx->file_list);
+	while (!list_empty(&ctx->events)) {
+
+		uevent = list_entry(ctx->events.next,
+				    struct ib_ucm_event, ctx_list);
+		list_del(&uevent->file_list);
+		list_del(&uevent->ctx_list);
+		mutex_unlock(&ctx->file->file_mutex);
+
+		/* clear incoming connections. */
+		if (ib_ucm_new_cm_id(uevent->resp.event))
+			ib_destroy_cm_id(uevent->cm_id);
+
+		kfree(uevent);
+		mutex_lock(&ctx->file->file_mutex);
+	}
+	mutex_unlock(&ctx->file->file_mutex);
+}
+
+static struct ib_ucm_context *ib_ucm_ctx_alloc(struct ib_ucm_file *file)
+{
+	struct ib_ucm_context *ctx;
+
+	ctx = kzalloc(sizeof *ctx, GFP_KERNEL);
+	if (!ctx)
+		return NULL;
+
+	atomic_set(&ctx->ref, 1);
+	init_completion(&ctx->comp);
+	ctx->file = file;
+	INIT_LIST_HEAD(&ctx->events);
+
+	mutex_lock(&ctx_id_mutex);
+	ctx->id = idr_alloc(&ctx_id_table, ctx, 0, 0, GFP_KERNEL);
+	mutex_unlock(&ctx_id_mutex);
+	if (ctx->id < 0)
+		goto error;
+
+	list_add_tail(&ctx->file_list, &file->ctxs);
+	return ctx;
+
+error:
+	kfree(ctx);
+	return NULL;
+}
+
+static void ib_ucm_event_req_get(struct ib_ucm_req_event_resp *ureq,
+				 struct ib_cm_req_event_param *kreq)
+{
+	ureq->remote_ca_guid             = kreq->remote_ca_guid;
+	ureq->remote_qkey                = kreq->remote_qkey;
+	ureq->remote_qpn                 = kreq->remote_qpn;
+	ureq->qp_type                    = kreq->qp_type;
+	ureq->starting_psn               = kreq->starting_psn;
+	ureq->responder_resources        = kreq->responder_resources;
+	ureq->initiator_depth            = kreq->initiator_depth;
+	ureq->local_cm_response_timeout  = kreq->local_cm_response_timeout;
+	ureq->flow_control               = kreq->flow_control;
+	ureq->remote_cm_response_timeout = kreq->remote_cm_response_timeout;
+	ureq->retry_count                = kreq->retry_count;
+	ureq->rnr_retry_count            = kreq->rnr_retry_count;
+	ureq->srq                        = kreq->srq;
+	ureq->port			 = kreq->port;
+
+	ib_copy_path_rec_to_user(&ureq->primary_path, kreq->primary_path);
+	if (kreq->alternate_path)
+		ib_copy_path_rec_to_user(&ureq->alternate_path,
+					 kreq->alternate_path);
+}
+
+static void ib_ucm_event_rep_get(struct ib_ucm_rep_event_resp *urep,
+				 struct ib_cm_rep_event_param *krep)
+{
+	urep->remote_ca_guid      = krep->remote_ca_guid;
+	urep->remote_qkey         = krep->remote_qkey;
+	urep->remote_qpn          = krep->remote_qpn;
+	urep->starting_psn        = krep->starting_psn;
+	urep->responder_resources = krep->responder_resources;
+	urep->initiator_depth     = krep->initiator_depth;
+	urep->target_ack_delay    = krep->target_ack_delay;
+	urep->failover_accepted   = krep->failover_accepted;
+	urep->flow_control        = krep->flow_control;
+	urep->rnr_retry_count     = krep->rnr_retry_count;
+	urep->srq                 = krep->srq;
+}
+
+static void ib_ucm_event_sidr_rep_get(struct ib_ucm_sidr_rep_event_resp *urep,
+				      struct ib_cm_sidr_rep_event_param *krep)
+{
+	urep->status = krep->status;
+	urep->qkey   = krep->qkey;
+	urep->qpn    = krep->qpn;
+};
+
+static int ib_ucm_event_process(struct ib_cm_event *evt,
+				struct ib_ucm_event *uvt)
+{
+	void *info = NULL;
+
+	switch (evt->event) {
+	case IB_CM_REQ_RECEIVED:
+		ib_ucm_event_req_get(&uvt->resp.u.req_resp,
+				     &evt->param.req_rcvd);
+		uvt->data_len      = IB_CM_REQ_PRIVATE_DATA_SIZE;
+		uvt->resp.present  = IB_UCM_PRES_PRIMARY;
+		uvt->resp.present |= (evt->param.req_rcvd.alternate_path ?
+				      IB_UCM_PRES_ALTERNATE : 0);
+		break;
+	case IB_CM_REP_RECEIVED:
+		ib_ucm_event_rep_get(&uvt->resp.u.rep_resp,
+				     &evt->param.rep_rcvd);
+		uvt->data_len = IB_CM_REP_PRIVATE_DATA_SIZE;
+		break;
+	case IB_CM_RTU_RECEIVED:
+		uvt->data_len = IB_CM_RTU_PRIVATE_DATA_SIZE;
+		uvt->resp.u.send_status = evt->param.send_status;
+		break;
+	case IB_CM_DREQ_RECEIVED:
+		uvt->data_len = IB_CM_DREQ_PRIVATE_DATA_SIZE;
+		uvt->resp.u.send_status = evt->param.send_status;
+		break;
+	case IB_CM_DREP_RECEIVED:
+		uvt->data_len = IB_CM_DREP_PRIVATE_DATA_SIZE;
+		uvt->resp.u.send_status = evt->param.send_status;
+		break;
+	case IB_CM_MRA_RECEIVED:
+		uvt->resp.u.mra_resp.timeout =
+					evt->param.mra_rcvd.service_timeout;
+		uvt->data_len = IB_CM_MRA_PRIVATE_DATA_SIZE;
+		break;
+	case IB_CM_REJ_RECEIVED:
+		uvt->resp.u.rej_resp.reason = evt->param.rej_rcvd.reason;
+		uvt->data_len = IB_CM_REJ_PRIVATE_DATA_SIZE;
+		uvt->info_len = evt->param.rej_rcvd.ari_length;
+		info	      = evt->param.rej_rcvd.ari;
+		break;
+	case IB_CM_LAP_RECEIVED:
+		ib_copy_path_rec_to_user(&uvt->resp.u.lap_resp.path,
+					 evt->param.lap_rcvd.alternate_path);
+		uvt->data_len = IB_CM_LAP_PRIVATE_DATA_SIZE;
+		uvt->resp.present = IB_UCM_PRES_ALTERNATE;
+		break;
+	case IB_CM_APR_RECEIVED:
+		uvt->resp.u.apr_resp.status = evt->param.apr_rcvd.ap_status;
+		uvt->data_len = IB_CM_APR_PRIVATE_DATA_SIZE;
+		uvt->info_len = evt->param.apr_rcvd.info_len;
+		info	      = evt->param.apr_rcvd.apr_info;
+		break;
+	case IB_CM_SIDR_REQ_RECEIVED:
+		uvt->resp.u.sidr_req_resp.pkey =
+					evt->param.sidr_req_rcvd.pkey;
+		uvt->resp.u.sidr_req_resp.port =
+					evt->param.sidr_req_rcvd.port;
+		uvt->data_len = IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE;
+		break;
+	case IB_CM_SIDR_REP_RECEIVED:
+		ib_ucm_event_sidr_rep_get(&uvt->resp.u.sidr_rep_resp,
+					  &evt->param.sidr_rep_rcvd);
+		uvt->data_len = IB_CM_SIDR_REP_PRIVATE_DATA_SIZE;
+		uvt->info_len = evt->param.sidr_rep_rcvd.info_len;
+		info	      = evt->param.sidr_rep_rcvd.info;
+		break;
+	default:
+		uvt->resp.u.send_status = evt->param.send_status;
+		break;
+	}
+
+	if (uvt->data_len) {
+		uvt->data = kmemdup(evt->private_data, uvt->data_len, GFP_KERNEL);
+		if (!uvt->data)
+			goto err1;
+
+		uvt->resp.present |= IB_UCM_PRES_DATA;
+	}
+
+	if (uvt->info_len) {
+		uvt->info = kmemdup(info, uvt->info_len, GFP_KERNEL);
+		if (!uvt->info)
+			goto err2;
+
+		uvt->resp.present |= IB_UCM_PRES_INFO;
+	}
+	return 0;
+
+err2:
+	kfree(uvt->data);
+err1:
+	return -ENOMEM;
+}
+
+static int ib_ucm_event_handler(struct ib_cm_id *cm_id,
+				struct ib_cm_event *event)
+{
+	struct ib_ucm_event *uevent;
+	struct ib_ucm_context *ctx;
+	int result = 0;
+
+	ctx = cm_id->context;
+
+	uevent = kzalloc(sizeof *uevent, GFP_KERNEL);
+	if (!uevent)
+		goto err1;
+
+	uevent->ctx = ctx;
+	uevent->cm_id = cm_id;
+	uevent->resp.uid = ctx->uid;
+	uevent->resp.id = ctx->id;
+	uevent->resp.event = event->event;
+
+	result = ib_ucm_event_process(event, uevent);
+	if (result)
+		goto err2;
+
+	mutex_lock(&ctx->file->file_mutex);
+	list_add_tail(&uevent->file_list, &ctx->file->events);
+	list_add_tail(&uevent->ctx_list, &ctx->events);
+	wake_up_interruptible(&ctx->file->poll_wait);
+	mutex_unlock(&ctx->file->file_mutex);
+	return 0;
+
+err2:
+	kfree(uevent);
+err1:
+	/* Destroy new cm_id's */
+	return ib_ucm_new_cm_id(event->event);
+}
+
+static ssize_t ib_ucm_event(struct ib_ucm_file *file,
+			    const char __user *inbuf,
+			    int in_len, int out_len)
+{
+	struct ib_ucm_context *ctx;
+	struct ib_ucm_event_get cmd;
+	struct ib_ucm_event *uevent;
+	int result = 0;
+
+	if (out_len < sizeof(struct ib_ucm_event_resp))
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	mutex_lock(&file->file_mutex);
+	while (list_empty(&file->events)) {
+		mutex_unlock(&file->file_mutex);
+
+		if (file->filp->f_flags & O_NONBLOCK)
+			return -EAGAIN;
+
+		if (wait_event_interruptible(file->poll_wait,
+					     !list_empty(&file->events)))
+			return -ERESTARTSYS;
+
+		mutex_lock(&file->file_mutex);
+	}
+
+	uevent = list_entry(file->events.next, struct ib_ucm_event, file_list);
+
+	if (ib_ucm_new_cm_id(uevent->resp.event)) {
+		ctx = ib_ucm_ctx_alloc(file);
+		if (!ctx) {
+			result = -ENOMEM;
+			goto done;
+		}
+
+		ctx->cm_id = uevent->cm_id;
+		ctx->cm_id->context = ctx;
+		uevent->resp.id = ctx->id;
+	}
+
+	if (copy_to_user((void __user *)(unsigned long)cmd.response,
+			 &uevent->resp, sizeof(uevent->resp))) {
+		result = -EFAULT;
+		goto done;
+	}
+
+	if (uevent->data) {
+		if (cmd.data_len < uevent->data_len) {
+			result = -ENOMEM;
+			goto done;
+		}
+		if (copy_to_user((void __user *)(unsigned long)cmd.data,
+				 uevent->data, uevent->data_len)) {
+			result = -EFAULT;
+			goto done;
+		}
+	}
+
+	if (uevent->info) {
+		if (cmd.info_len < uevent->info_len) {
+			result = -ENOMEM;
+			goto done;
+		}
+		if (copy_to_user((void __user *)(unsigned long)cmd.info,
+				 uevent->info, uevent->info_len)) {
+			result = -EFAULT;
+			goto done;
+		}
+	}
+
+	list_del(&uevent->file_list);
+	list_del(&uevent->ctx_list);
+	uevent->ctx->events_reported++;
+
+	kfree(uevent->data);
+	kfree(uevent->info);
+	kfree(uevent);
+done:
+	mutex_unlock(&file->file_mutex);
+	return result;
+}
+
+static ssize_t ib_ucm_create_id(struct ib_ucm_file *file,
+				const char __user *inbuf,
+				int in_len, int out_len)
+{
+	struct ib_ucm_create_id cmd;
+	struct ib_ucm_create_id_resp resp;
+	struct ib_ucm_context *ctx;
+	int result;
+
+	if (out_len < sizeof(resp))
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	mutex_lock(&file->file_mutex);
+	ctx = ib_ucm_ctx_alloc(file);
+	mutex_unlock(&file->file_mutex);
+	if (!ctx)
+		return -ENOMEM;
+
+	ctx->uid = cmd.uid;
+	ctx->cm_id = ib_create_cm_id(file->device->ib_dev,
+				     ib_ucm_event_handler, ctx);
+	if (IS_ERR(ctx->cm_id)) {
+		result = PTR_ERR(ctx->cm_id);
+		goto err1;
+	}
+
+	resp.id = ctx->id;
+	if (copy_to_user((void __user *)(unsigned long)cmd.response,
+			 &resp, sizeof(resp))) {
+		result = -EFAULT;
+		goto err2;
+	}
+	return 0;
+
+err2:
+	ib_destroy_cm_id(ctx->cm_id);
+err1:
+	mutex_lock(&ctx_id_mutex);
+	idr_remove(&ctx_id_table, ctx->id);
+	mutex_unlock(&ctx_id_mutex);
+	kfree(ctx);
+	return result;
+}
+
+static ssize_t ib_ucm_destroy_id(struct ib_ucm_file *file,
+				 const char __user *inbuf,
+				 int in_len, int out_len)
+{
+	struct ib_ucm_destroy_id cmd;
+	struct ib_ucm_destroy_id_resp resp;
+	struct ib_ucm_context *ctx;
+	int result = 0;
+
+	if (out_len < sizeof(resp))
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	mutex_lock(&ctx_id_mutex);
+	ctx = idr_find(&ctx_id_table, cmd.id);
+	if (!ctx)
+		ctx = ERR_PTR(-ENOENT);
+	else if (ctx->file != file)
+		ctx = ERR_PTR(-EINVAL);
+	else
+		idr_remove(&ctx_id_table, ctx->id);
+	mutex_unlock(&ctx_id_mutex);
+
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	ib_ucm_ctx_put(ctx);
+	wait_for_completion(&ctx->comp);
+
+	/* No new events will be generated after destroying the cm_id. */
+	ib_destroy_cm_id(ctx->cm_id);
+	/* Cleanup events not yet reported to the user. */
+	ib_ucm_cleanup_events(ctx);
+
+	resp.events_reported = ctx->events_reported;
+	if (copy_to_user((void __user *)(unsigned long)cmd.response,
+			 &resp, sizeof(resp)))
+		result = -EFAULT;
+
+	kfree(ctx);
+	return result;
+}
+
+static ssize_t ib_ucm_attr_id(struct ib_ucm_file *file,
+			      const char __user *inbuf,
+			      int in_len, int out_len)
+{
+	struct ib_ucm_attr_id_resp resp;
+	struct ib_ucm_attr_id cmd;
+	struct ib_ucm_context *ctx;
+	int result = 0;
+
+	if (out_len < sizeof(resp))
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	ctx = ib_ucm_ctx_get(file, cmd.id);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	resp.service_id   = ctx->cm_id->service_id;
+	resp.service_mask = ctx->cm_id->service_mask;
+	resp.local_id     = ctx->cm_id->local_id;
+	resp.remote_id    = ctx->cm_id->remote_id;
+
+	if (copy_to_user((void __user *)(unsigned long)cmd.response,
+			 &resp, sizeof(resp)))
+		result = -EFAULT;
+
+	ib_ucm_ctx_put(ctx);
+	return result;
+}
+
+static ssize_t ib_ucm_init_qp_attr(struct ib_ucm_file *file,
+				   const char __user *inbuf,
+				   int in_len, int out_len)
+{
+	struct ib_uverbs_qp_attr resp;
+	struct ib_ucm_init_qp_attr cmd;
+	struct ib_ucm_context *ctx;
+	struct ib_qp_attr qp_attr;
+	int result = 0;
+
+	if (out_len < sizeof(resp))
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	ctx = ib_ucm_ctx_get(file, cmd.id);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	resp.qp_attr_mask = 0;
+	memset(&qp_attr, 0, sizeof qp_attr);
+	qp_attr.qp_state = cmd.qp_state;
+	result = ib_cm_init_qp_attr(ctx->cm_id, &qp_attr, &resp.qp_attr_mask);
+	if (result)
+		goto out;
+
+	ib_copy_qp_attr_to_user(&resp, &qp_attr);
+
+	if (copy_to_user((void __user *)(unsigned long)cmd.response,
+			 &resp, sizeof(resp)))
+		result = -EFAULT;
+
+out:
+	ib_ucm_ctx_put(ctx);
+	return result;
+}
+
+static int ucm_validate_listen(__be64 service_id, __be64 service_mask)
+{
+	service_id &= service_mask;
+
+	if (((service_id & IB_CMA_SERVICE_ID_MASK) == IB_CMA_SERVICE_ID) ||
+	    ((service_id & IB_SDP_SERVICE_ID_MASK) == IB_SDP_SERVICE_ID))
+		return -EINVAL;
+
+	return 0;
+}
+
+static ssize_t ib_ucm_listen(struct ib_ucm_file *file,
+			     const char __user *inbuf,
+			     int in_len, int out_len)
+{
+	struct ib_ucm_listen cmd;
+	struct ib_ucm_context *ctx;
+	int result;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	ctx = ib_ucm_ctx_get(file, cmd.id);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	result = ucm_validate_listen(cmd.service_id, cmd.service_mask);
+	if (result)
+		goto out;
+
+	result = ib_cm_listen(ctx->cm_id, cmd.service_id, cmd.service_mask,
+			      NULL);
+out:
+	ib_ucm_ctx_put(ctx);
+	return result;
+}
+
+static ssize_t ib_ucm_notify(struct ib_ucm_file *file,
+			     const char __user *inbuf,
+			     int in_len, int out_len)
+{
+	struct ib_ucm_notify cmd;
+	struct ib_ucm_context *ctx;
+	int result;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	ctx = ib_ucm_ctx_get(file, cmd.id);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	result = ib_cm_notify(ctx->cm_id, (enum ib_event_type) cmd.event);
+	ib_ucm_ctx_put(ctx);
+	return result;
+}
+
+static int ib_ucm_alloc_data(const void **dest, u64 src, u32 len)
+{
+	void *data;
+
+	*dest = NULL;
+
+	if (!len)
+		return 0;
+
+	data = memdup_user((void __user *)(unsigned long)src, len);
+	if (IS_ERR(data))
+		return PTR_ERR(data);
+
+	*dest = data;
+	return 0;
+}
+
+static int ib_ucm_path_get(struct ib_sa_path_rec **path, u64 src)
+{
+	struct ib_user_path_rec upath;
+	struct ib_sa_path_rec  *sa_path;
+
+	*path = NULL;
+
+	if (!src)
+		return 0;
+
+	sa_path = kmalloc(sizeof(*sa_path), GFP_KERNEL);
+	if (!sa_path)
+		return -ENOMEM;
+
+	if (copy_from_user(&upath, (void __user *)(unsigned long)src,
+			   sizeof(upath))) {
+
+		kfree(sa_path);
+		return -EFAULT;
+	}
+
+	ib_copy_path_rec_from_user(sa_path, &upath);
+	*path = sa_path;
+	return 0;
+}
+
+static ssize_t ib_ucm_send_req(struct ib_ucm_file *file,
+			       const char __user *inbuf,
+			       int in_len, int out_len)
+{
+	struct ib_cm_req_param param;
+	struct ib_ucm_context *ctx;
+	struct ib_ucm_req cmd;
+	int result;
+
+	param.private_data   = NULL;
+	param.primary_path   = NULL;
+	param.alternate_path = NULL;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	result = ib_ucm_alloc_data(&param.private_data, cmd.data, cmd.len);
+	if (result)
+		goto done;
+
+	result = ib_ucm_path_get(&param.primary_path, cmd.primary_path);
+	if (result)
+		goto done;
+
+	result = ib_ucm_path_get(&param.alternate_path, cmd.alternate_path);
+	if (result)
+		goto done;
+
+	param.private_data_len           = cmd.len;
+	param.service_id                 = cmd.sid;
+	param.qp_num                     = cmd.qpn;
+	param.qp_type                    = cmd.qp_type;
+	param.starting_psn               = cmd.psn;
+	param.peer_to_peer               = cmd.peer_to_peer;
+	param.responder_resources        = cmd.responder_resources;
+	param.initiator_depth            = cmd.initiator_depth;
+	param.remote_cm_response_timeout = cmd.remote_cm_response_timeout;
+	param.flow_control               = cmd.flow_control;
+	param.local_cm_response_timeout  = cmd.local_cm_response_timeout;
+	param.retry_count                = cmd.retry_count;
+	param.rnr_retry_count            = cmd.rnr_retry_count;
+	param.max_cm_retries             = cmd.max_cm_retries;
+	param.srq                        = cmd.srq;
+
+	ctx = ib_ucm_ctx_get(file, cmd.id);
+	if (!IS_ERR(ctx)) {
+		result = ib_send_cm_req(ctx->cm_id, &param);
+		ib_ucm_ctx_put(ctx);
+	} else
+		result = PTR_ERR(ctx);
+
+done:
+	kfree(param.private_data);
+	kfree(param.primary_path);
+	kfree(param.alternate_path);
+	return result;
+}
+
+static ssize_t ib_ucm_send_rep(struct ib_ucm_file *file,
+			       const char __user *inbuf,
+			       int in_len, int out_len)
+{
+	struct ib_cm_rep_param param;
+	struct ib_ucm_context *ctx;
+	struct ib_ucm_rep cmd;
+	int result;
+
+	param.private_data = NULL;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	result = ib_ucm_alloc_data(&param.private_data, cmd.data, cmd.len);
+	if (result)
+		return result;
+
+	param.qp_num              = cmd.qpn;
+	param.starting_psn        = cmd.psn;
+	param.private_data_len    = cmd.len;
+	param.responder_resources = cmd.responder_resources;
+	param.initiator_depth     = cmd.initiator_depth;
+	param.failover_accepted   = cmd.failover_accepted;
+	param.flow_control        = cmd.flow_control;
+	param.rnr_retry_count     = cmd.rnr_retry_count;
+	param.srq                 = cmd.srq;
+
+	ctx = ib_ucm_ctx_get(file, cmd.id);
+	if (!IS_ERR(ctx)) {
+		ctx->uid = cmd.uid;
+		result = ib_send_cm_rep(ctx->cm_id, &param);
+		ib_ucm_ctx_put(ctx);
+	} else
+		result = PTR_ERR(ctx);
+
+	kfree(param.private_data);
+	return result;
+}
+
+static ssize_t ib_ucm_send_private_data(struct ib_ucm_file *file,
+					const char __user *inbuf, int in_len,
+					int (*func)(struct ib_cm_id *cm_id,
+						    const void *private_data,
+						    u8 private_data_len))
+{
+	struct ib_ucm_private_data cmd;
+	struct ib_ucm_context *ctx;
+	const void *private_data = NULL;
+	int result;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	result = ib_ucm_alloc_data(&private_data, cmd.data, cmd.len);
+	if (result)
+		return result;
+
+	ctx = ib_ucm_ctx_get(file, cmd.id);
+	if (!IS_ERR(ctx)) {
+		result = func(ctx->cm_id, private_data, cmd.len);
+		ib_ucm_ctx_put(ctx);
+	} else
+		result = PTR_ERR(ctx);
+
+	kfree(private_data);
+	return result;
+}
+
+static ssize_t ib_ucm_send_rtu(struct ib_ucm_file *file,
+			       const char __user *inbuf,
+			       int in_len, int out_len)
+{
+	return ib_ucm_send_private_data(file, inbuf, in_len, ib_send_cm_rtu);
+}
+
+static ssize_t ib_ucm_send_dreq(struct ib_ucm_file *file,
+				const char __user *inbuf,
+				int in_len, int out_len)
+{
+	return ib_ucm_send_private_data(file, inbuf, in_len, ib_send_cm_dreq);
+}
+
+static ssize_t ib_ucm_send_drep(struct ib_ucm_file *file,
+				const char __user *inbuf,
+				int in_len, int out_len)
+{
+	return ib_ucm_send_private_data(file, inbuf, in_len, ib_send_cm_drep);
+}
+
+static ssize_t ib_ucm_send_info(struct ib_ucm_file *file,
+				const char __user *inbuf, int in_len,
+				int (*func)(struct ib_cm_id *cm_id,
+					    int status,
+					    const void *info,
+					    u8 info_len,
+					    const void *data,
+					    u8 data_len))
+{
+	struct ib_ucm_context *ctx;
+	struct ib_ucm_info cmd;
+	const void *data = NULL;
+	const void *info = NULL;
+	int result;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	result = ib_ucm_alloc_data(&data, cmd.data, cmd.data_len);
+	if (result)
+		goto done;
+
+	result = ib_ucm_alloc_data(&info, cmd.info, cmd.info_len);
+	if (result)
+		goto done;
+
+	ctx = ib_ucm_ctx_get(file, cmd.id);
+	if (!IS_ERR(ctx)) {
+		result = func(ctx->cm_id, cmd.status, info, cmd.info_len,
+			      data, cmd.data_len);
+		ib_ucm_ctx_put(ctx);
+	} else
+		result = PTR_ERR(ctx);
+
+done:
+	kfree(data);
+	kfree(info);
+	return result;
+}
+
+static ssize_t ib_ucm_send_rej(struct ib_ucm_file *file,
+			       const char __user *inbuf,
+			       int in_len, int out_len)
+{
+	return ib_ucm_send_info(file, inbuf, in_len, (void *)ib_send_cm_rej);
+}
+
+static ssize_t ib_ucm_send_apr(struct ib_ucm_file *file,
+			       const char __user *inbuf,
+			       int in_len, int out_len)
+{
+	return ib_ucm_send_info(file, inbuf, in_len, (void *)ib_send_cm_apr);
+}
+
+static ssize_t ib_ucm_send_mra(struct ib_ucm_file *file,
+			       const char __user *inbuf,
+			       int in_len, int out_len)
+{
+	struct ib_ucm_context *ctx;
+	struct ib_ucm_mra cmd;
+	const void *data = NULL;
+	int result;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	result = ib_ucm_alloc_data(&data, cmd.data, cmd.len);
+	if (result)
+		return result;
+
+	ctx = ib_ucm_ctx_get(file, cmd.id);
+	if (!IS_ERR(ctx)) {
+		result = ib_send_cm_mra(ctx->cm_id, cmd.timeout, data, cmd.len);
+		ib_ucm_ctx_put(ctx);
+	} else
+		result = PTR_ERR(ctx);
+
+	kfree(data);
+	return result;
+}
+
+static ssize_t ib_ucm_send_lap(struct ib_ucm_file *file,
+			       const char __user *inbuf,
+			       int in_len, int out_len)
+{
+	struct ib_ucm_context *ctx;
+	struct ib_sa_path_rec *path = NULL;
+	struct ib_ucm_lap cmd;
+	const void *data = NULL;
+	int result;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	result = ib_ucm_alloc_data(&data, cmd.data, cmd.len);
+	if (result)
+		goto done;
+
+	result = ib_ucm_path_get(&path, cmd.path);
+	if (result)
+		goto done;
+
+	ctx = ib_ucm_ctx_get(file, cmd.id);
+	if (!IS_ERR(ctx)) {
+		result = ib_send_cm_lap(ctx->cm_id, path, data, cmd.len);
+		ib_ucm_ctx_put(ctx);
+	} else
+		result = PTR_ERR(ctx);
+
+done:
+	kfree(data);
+	kfree(path);
+	return result;
+}
+
+static ssize_t ib_ucm_send_sidr_req(struct ib_ucm_file *file,
+				    const char __user *inbuf,
+				    int in_len, int out_len)
+{
+	struct ib_cm_sidr_req_param param;
+	struct ib_ucm_context *ctx;
+	struct ib_ucm_sidr_req cmd;
+	int result;
+
+	param.private_data = NULL;
+	param.path = NULL;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	result = ib_ucm_alloc_data(&param.private_data, cmd.data, cmd.len);
+	if (result)
+		goto done;
+
+	result = ib_ucm_path_get(&param.path, cmd.path);
+	if (result)
+		goto done;
+
+	param.private_data_len = cmd.len;
+	param.service_id       = cmd.sid;
+	param.timeout_ms       = cmd.timeout;
+	param.max_cm_retries   = cmd.max_cm_retries;
+
+	ctx = ib_ucm_ctx_get(file, cmd.id);
+	if (!IS_ERR(ctx)) {
+		result = ib_send_cm_sidr_req(ctx->cm_id, &param);
+		ib_ucm_ctx_put(ctx);
+	} else
+		result = PTR_ERR(ctx);
+
+done:
+	kfree(param.private_data);
+	kfree(param.path);
+	return result;
+}
+
+static ssize_t ib_ucm_send_sidr_rep(struct ib_ucm_file *file,
+				    const char __user *inbuf,
+				    int in_len, int out_len)
+{
+	struct ib_cm_sidr_rep_param param;
+	struct ib_ucm_sidr_rep cmd;
+	struct ib_ucm_context *ctx;
+	int result;
+
+	param.info = NULL;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	result = ib_ucm_alloc_data(&param.private_data,
+				   cmd.data, cmd.data_len);
+	if (result)
+		goto done;
+
+	result = ib_ucm_alloc_data(&param.info, cmd.info, cmd.info_len);
+	if (result)
+		goto done;
+
+	param.qp_num		= cmd.qpn;
+	param.qkey		= cmd.qkey;
+	param.status		= cmd.status;
+	param.info_length	= cmd.info_len;
+	param.private_data_len	= cmd.data_len;
+
+	ctx = ib_ucm_ctx_get(file, cmd.id);
+	if (!IS_ERR(ctx)) {
+		result = ib_send_cm_sidr_rep(ctx->cm_id, &param);
+		ib_ucm_ctx_put(ctx);
+	} else
+		result = PTR_ERR(ctx);
+
+done:
+	kfree(param.private_data);
+	kfree(param.info);
+	return result;
+}
+
+static ssize_t (*ucm_cmd_table[])(struct ib_ucm_file *file,
+				  const char __user *inbuf,
+				  int in_len, int out_len) = {
+	[IB_USER_CM_CMD_CREATE_ID]     = ib_ucm_create_id,
+	[IB_USER_CM_CMD_DESTROY_ID]    = ib_ucm_destroy_id,
+	[IB_USER_CM_CMD_ATTR_ID]       = ib_ucm_attr_id,
+	[IB_USER_CM_CMD_LISTEN]        = ib_ucm_listen,
+	[IB_USER_CM_CMD_NOTIFY]        = ib_ucm_notify,
+	[IB_USER_CM_CMD_SEND_REQ]      = ib_ucm_send_req,
+	[IB_USER_CM_CMD_SEND_REP]      = ib_ucm_send_rep,
+	[IB_USER_CM_CMD_SEND_RTU]      = ib_ucm_send_rtu,
+	[IB_USER_CM_CMD_SEND_DREQ]     = ib_ucm_send_dreq,
+	[IB_USER_CM_CMD_SEND_DREP]     = ib_ucm_send_drep,
+	[IB_USER_CM_CMD_SEND_REJ]      = ib_ucm_send_rej,
+	[IB_USER_CM_CMD_SEND_MRA]      = ib_ucm_send_mra,
+	[IB_USER_CM_CMD_SEND_LAP]      = ib_ucm_send_lap,
+	[IB_USER_CM_CMD_SEND_APR]      = ib_ucm_send_apr,
+	[IB_USER_CM_CMD_SEND_SIDR_REQ] = ib_ucm_send_sidr_req,
+	[IB_USER_CM_CMD_SEND_SIDR_REP] = ib_ucm_send_sidr_rep,
+	[IB_USER_CM_CMD_EVENT]	       = ib_ucm_event,
+	[IB_USER_CM_CMD_INIT_QP_ATTR]  = ib_ucm_init_qp_attr,
+};
+
+static ssize_t ib_ucm_write(struct file *filp, const char __user *buf,
+			    size_t len, loff_t *pos)
+{
+	struct ib_ucm_file *file = filp->private_data;
+	struct ib_ucm_cmd_hdr hdr;
+	ssize_t result;
+
+	if (len < sizeof(hdr))
+		return -EINVAL;
+
+	if (copy_from_user(&hdr, buf, sizeof(hdr)))
+		return -EFAULT;
+
+	if (hdr.cmd >= ARRAY_SIZE(ucm_cmd_table))
+		return -EINVAL;
+
+	if (hdr.in + sizeof(hdr) > len)
+		return -EINVAL;
+
+	result = ucm_cmd_table[hdr.cmd](file, buf + sizeof(hdr),
+					hdr.in, hdr.out);
+	if (!result)
+		result = len;
+
+	return result;
+}
+
+static unsigned int ib_ucm_poll(struct file *filp,
+				struct poll_table_struct *wait)
+{
+	struct ib_ucm_file *file = filp->private_data;
+	unsigned int mask = 0;
+
+	poll_wait(filp, &file->poll_wait, wait);
+
+	if (!list_empty(&file->events))
+		mask = POLLIN | POLLRDNORM;
+
+	return mask;
+}
+
+/*
+ * ib_ucm_open() does not need the BKL:
+ *
+ *  - no global state is referred to;
+ *  - there is no ioctl method to race against;
+ *  - no further module initialization is required for open to work
+ *    after the device is registered.
+ */
+static int ib_ucm_open(struct inode *inode, struct file *filp)
+{
+	struct ib_ucm_file *file;
+
+	file = kmalloc(sizeof(*file), GFP_KERNEL);
+	if (!file)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&file->events);
+	INIT_LIST_HEAD(&file->ctxs);
+	init_waitqueue_head(&file->poll_wait);
+
+	mutex_init(&file->file_mutex);
+
+	filp->private_data = file;
+	file->filp = filp;
+	file->device = container_of(inode->i_cdev, struct ib_ucm_device, cdev);
+
+	return nonseekable_open(inode, filp);
+}
+
+static int ib_ucm_close(struct inode *inode, struct file *filp)
+{
+	struct ib_ucm_file *file = filp->private_data;
+	struct ib_ucm_context *ctx;
+
+	mutex_lock(&file->file_mutex);
+	while (!list_empty(&file->ctxs)) {
+		ctx = list_entry(file->ctxs.next,
+				 struct ib_ucm_context, file_list);
+		mutex_unlock(&file->file_mutex);
+
+		mutex_lock(&ctx_id_mutex);
+		idr_remove(&ctx_id_table, ctx->id);
+		mutex_unlock(&ctx_id_mutex);
+
+		ib_destroy_cm_id(ctx->cm_id);
+		ib_ucm_cleanup_events(ctx);
+		kfree(ctx);
+
+		mutex_lock(&file->file_mutex);
+	}
+	mutex_unlock(&file->file_mutex);
+	kfree(file);
+	return 0;
+}
+
+static void ib_ucm_release_dev(struct device *dev)
+{
+	struct ib_ucm_device *ucm_dev;
+
+	ucm_dev = container_of(dev, struct ib_ucm_device, dev);
+	cdev_del(&ucm_dev->cdev);
+	if (ucm_dev->devnum < IB_UCM_MAX_DEVICES)
+		clear_bit(ucm_dev->devnum, dev_map);
+	else
+		clear_bit(ucm_dev->devnum - IB_UCM_MAX_DEVICES, dev_map);
+	kfree(ucm_dev);
+}
+
+static const struct file_operations ucm_fops = {
+	.owner	 = THIS_MODULE,
+	.open	 = ib_ucm_open,
+	.release = ib_ucm_close,
+	.write	 = ib_ucm_write,
+	.poll    = ib_ucm_poll,
+	.llseek	 = no_llseek,
+};
+
+static ssize_t show_ibdev(struct device *dev, struct device_attribute *attr,
+			  char *buf)
+{
+	struct ib_ucm_device *ucm_dev;
+
+	ucm_dev = container_of(dev, struct ib_ucm_device, dev);
+	return sprintf(buf, "%s\n", ucm_dev->ib_dev->name);
+}
+static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL);
+
+static dev_t overflow_maj;
+static DECLARE_BITMAP(overflow_map, IB_UCM_MAX_DEVICES);
+static int find_overflow_devnum(void)
+{
+	int ret;
+
+	if (!overflow_maj) {
+		ret = alloc_chrdev_region(&overflow_maj, 0, IB_UCM_MAX_DEVICES,
+					  "infiniband_cm");
+		if (ret) {
+			printk(KERN_ERR "ucm: couldn't register dynamic device number\n");
+			return ret;
+		}
+	}
+
+	ret = find_first_zero_bit(overflow_map, IB_UCM_MAX_DEVICES);
+	if (ret >= IB_UCM_MAX_DEVICES)
+		return -1;
+
+	return ret;
+}
+
+static void ib_ucm_add_one(struct ib_device *device)
+{
+	int devnum;
+	dev_t base;
+	struct ib_ucm_device *ucm_dev;
+
+	if (!device->alloc_ucontext ||
+	    rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
+		return;
+
+	ucm_dev = kzalloc(sizeof *ucm_dev, GFP_KERNEL);
+	if (!ucm_dev)
+		return;
+
+	ucm_dev->ib_dev = device;
+
+	devnum = find_first_zero_bit(dev_map, IB_UCM_MAX_DEVICES);
+	if (devnum >= IB_UCM_MAX_DEVICES) {
+		devnum = find_overflow_devnum();
+		if (devnum < 0)
+			goto err;
+
+		ucm_dev->devnum = devnum + IB_UCM_MAX_DEVICES;
+		base = devnum + overflow_maj;
+		set_bit(devnum, overflow_map);
+	} else {
+		ucm_dev->devnum = devnum;
+		base = devnum + IB_UCM_BASE_DEV;
+		set_bit(devnum, dev_map);
+	}
+
+	cdev_init(&ucm_dev->cdev, &ucm_fops);
+	ucm_dev->cdev.owner = THIS_MODULE;
+	kobject_set_name(&ucm_dev->cdev.kobj, "ucm%d", ucm_dev->devnum);
+	if (cdev_add(&ucm_dev->cdev, base, 1))
+		goto err;
+
+	ucm_dev->dev.class = &cm_class;
+	ucm_dev->dev.parent = device->dma_device;
+	ucm_dev->dev.devt = ucm_dev->cdev.dev;
+	ucm_dev->dev.release = ib_ucm_release_dev;
+	dev_set_name(&ucm_dev->dev, "ucm%d", ucm_dev->devnum);
+	if (device_register(&ucm_dev->dev))
+		goto err_cdev;
+
+	if (device_create_file(&ucm_dev->dev, &dev_attr_ibdev))
+		goto err_dev;
+
+	ib_set_client_data(device, &ucm_client, ucm_dev);
+	return;
+
+err_dev:
+	device_unregister(&ucm_dev->dev);
+err_cdev:
+	cdev_del(&ucm_dev->cdev);
+	if (ucm_dev->devnum < IB_UCM_MAX_DEVICES)
+		clear_bit(devnum, dev_map);
+	else
+		clear_bit(devnum, overflow_map);
+err:
+	kfree(ucm_dev);
+	return;
+}
+
+static void ib_ucm_remove_one(struct ib_device *device)
+{
+	struct ib_ucm_device *ucm_dev = ib_get_client_data(device, &ucm_client);
+
+	if (!ucm_dev)
+		return;
+
+	device_unregister(&ucm_dev->dev);
+}
+
+static CLASS_ATTR_STRING(abi_version, S_IRUGO,
+			 __stringify(IB_USER_CM_ABI_VERSION));
+
+static int __init ib_ucm_init(void)
+{
+	int ret;
+
+	ret = register_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_MAX_DEVICES,
+				     "infiniband_cm");
+	if (ret) {
+		printk(KERN_ERR "ucm: couldn't register device number\n");
+		goto error1;
+	}
+
+	ret = class_create_file(&cm_class, &class_attr_abi_version.attr);
+	if (ret) {
+		printk(KERN_ERR "ucm: couldn't create abi_version attribute\n");
+		goto error2;
+	}
+
+	ret = ib_register_client(&ucm_client);
+	if (ret) {
+		printk(KERN_ERR "ucm: couldn't register client\n");
+		goto error3;
+	}
+	return 0;
+
+error3:
+	class_remove_file(&cm_class, &class_attr_abi_version.attr);
+error2:
+	unregister_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_MAX_DEVICES);
+error1:
+	return ret;
+}
+
+static void __exit ib_ucm_cleanup(void)
+{
+	ib_unregister_client(&ucm_client);
+	class_remove_file(&cm_class, &class_attr_abi_version.attr);
+	unregister_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_MAX_DEVICES);
+	if (overflow_maj)
+		unregister_chrdev_region(overflow_maj, IB_UCM_MAX_DEVICES);
+	idr_destroy(&ctx_id_table);
+}
+
+module_init(ib_ucm_init);
+module_exit(ib_ucm_cleanup);
diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c
new file mode 100644
index 0000000..56a4b7c
--- /dev/null
+++ b/drivers/infiniband/core/ucma.c
@@ -0,0 +1,1632 @@
+/*
+ * Copyright (c) 2005-2006 Intel Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *	copyright notice, this list of conditions and the following
+ *	disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *	copyright notice, this list of conditions and the following
+ *	disclaimer in the documentation and/or other materials
+ *	provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/completion.h>
+#include <linux/file.h>
+#include <linux/mutex.h>
+#include <linux/poll.h>
+#include <linux/sched.h>
+#include <linux/idr.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/miscdevice.h>
+#include <linux/slab.h>
+#include <linux/sysctl.h>
+#include <linux/module.h>
+
+#include <rdma/rdma_user_cm.h>
+#include <rdma/ib_marshall.h>
+#include <rdma/rdma_cm.h>
+#include <rdma/rdma_cm_ib.h>
+#include <rdma/ib_addr.h>
+#include <rdma/ib.h>
+
+MODULE_AUTHOR("Sean Hefty");
+MODULE_DESCRIPTION("RDMA Userspace Connection Manager Access");
+MODULE_LICENSE("Dual BSD/GPL");
+
+static unsigned int max_backlog = 1024;
+
+static struct ctl_table_header *ucma_ctl_table_hdr;
+static struct ctl_table ucma_ctl_table[] = {
+	{
+		.procname	= "max_backlog",
+		.data		= &max_backlog,
+		.maxlen		= sizeof max_backlog,
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{ }
+};
+
+struct ucma_file {
+	struct mutex		mut;
+	struct file		*filp;
+	struct list_head	ctx_list;
+	struct list_head	event_list;
+	wait_queue_head_t	poll_wait;
+};
+
+struct ucma_context {
+	int			id;
+	struct completion	comp;
+	atomic_t		ref;
+	int			events_reported;
+	int			backlog;
+
+	struct ucma_file	*file;
+	struct rdma_cm_id	*cm_id;
+	u64			uid;
+
+	struct list_head	list;
+	struct list_head	mc_list;
+};
+
+struct ucma_multicast {
+	struct ucma_context	*ctx;
+	int			id;
+	int			events_reported;
+
+	u64			uid;
+	struct list_head	list;
+	struct sockaddr_storage	addr;
+};
+
+struct ucma_event {
+	struct ucma_context	*ctx;
+	struct ucma_multicast	*mc;
+	struct list_head	list;
+	struct rdma_cm_id	*cm_id;
+	struct rdma_ucm_event_resp resp;
+};
+
+static DEFINE_MUTEX(mut);
+static DEFINE_IDR(ctx_idr);
+static DEFINE_IDR(multicast_idr);
+
+static inline struct ucma_context *_ucma_find_context(int id,
+						      struct ucma_file *file)
+{
+	struct ucma_context *ctx;
+
+	ctx = idr_find(&ctx_idr, id);
+	if (!ctx)
+		ctx = ERR_PTR(-ENOENT);
+	else if (ctx->file != file)
+		ctx = ERR_PTR(-EINVAL);
+	return ctx;
+}
+
+static struct ucma_context *ucma_get_ctx(struct ucma_file *file, int id)
+{
+	struct ucma_context *ctx;
+
+	mutex_lock(&mut);
+	ctx = _ucma_find_context(id, file);
+	if (!IS_ERR(ctx))
+		atomic_inc(&ctx->ref);
+	mutex_unlock(&mut);
+	return ctx;
+}
+
+static void ucma_put_ctx(struct ucma_context *ctx)
+{
+	if (atomic_dec_and_test(&ctx->ref))
+		complete(&ctx->comp);
+}
+
+static struct ucma_context *ucma_alloc_ctx(struct ucma_file *file)
+{
+	struct ucma_context *ctx;
+
+	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+	if (!ctx)
+		return NULL;
+
+	atomic_set(&ctx->ref, 1);
+	init_completion(&ctx->comp);
+	INIT_LIST_HEAD(&ctx->mc_list);
+	ctx->file = file;
+
+	mutex_lock(&mut);
+	ctx->id = idr_alloc(&ctx_idr, ctx, 0, 0, GFP_KERNEL);
+	mutex_unlock(&mut);
+	if (ctx->id < 0)
+		goto error;
+
+	list_add_tail(&ctx->list, &file->ctx_list);
+	return ctx;
+
+error:
+	kfree(ctx);
+	return NULL;
+}
+
+static struct ucma_multicast* ucma_alloc_multicast(struct ucma_context *ctx)
+{
+	struct ucma_multicast *mc;
+
+	mc = kzalloc(sizeof(*mc), GFP_KERNEL);
+	if (!mc)
+		return NULL;
+
+	mutex_lock(&mut);
+	mc->id = idr_alloc(&multicast_idr, mc, 0, 0, GFP_KERNEL);
+	mutex_unlock(&mut);
+	if (mc->id < 0)
+		goto error;
+
+	mc->ctx = ctx;
+	list_add_tail(&mc->list, &ctx->mc_list);
+	return mc;
+
+error:
+	kfree(mc);
+	return NULL;
+}
+
+static void ucma_copy_conn_event(struct rdma_ucm_conn_param *dst,
+				 struct rdma_conn_param *src)
+{
+	if (src->private_data_len)
+		memcpy(dst->private_data, src->private_data,
+		       src->private_data_len);
+	dst->private_data_len = src->private_data_len;
+	dst->responder_resources =src->responder_resources;
+	dst->initiator_depth = src->initiator_depth;
+	dst->flow_control = src->flow_control;
+	dst->retry_count = src->retry_count;
+	dst->rnr_retry_count = src->rnr_retry_count;
+	dst->srq = src->srq;
+	dst->qp_num = src->qp_num;
+}
+
+static void ucma_copy_ud_event(struct rdma_ucm_ud_param *dst,
+			       struct rdma_ud_param *src)
+{
+	if (src->private_data_len)
+		memcpy(dst->private_data, src->private_data,
+		       src->private_data_len);
+	dst->private_data_len = src->private_data_len;
+	ib_copy_ah_attr_to_user(&dst->ah_attr, &src->ah_attr);
+	dst->qp_num = src->qp_num;
+	dst->qkey = src->qkey;
+}
+
+static void ucma_set_event_context(struct ucma_context *ctx,
+				   struct rdma_cm_event *event,
+				   struct ucma_event *uevent)
+{
+	uevent->ctx = ctx;
+	switch (event->event) {
+	case RDMA_CM_EVENT_MULTICAST_JOIN:
+	case RDMA_CM_EVENT_MULTICAST_ERROR:
+		uevent->mc = (struct ucma_multicast *)
+			     event->param.ud.private_data;
+		uevent->resp.uid = uevent->mc->uid;
+		uevent->resp.id = uevent->mc->id;
+		break;
+	default:
+		uevent->resp.uid = ctx->uid;
+		uevent->resp.id = ctx->id;
+		break;
+	}
+}
+
+static int ucma_event_handler(struct rdma_cm_id *cm_id,
+			      struct rdma_cm_event *event)
+{
+	struct ucma_event *uevent;
+	struct ucma_context *ctx = cm_id->context;
+	int ret = 0;
+
+	uevent = kzalloc(sizeof(*uevent), GFP_KERNEL);
+	if (!uevent)
+		return event->event == RDMA_CM_EVENT_CONNECT_REQUEST;
+
+	mutex_lock(&ctx->file->mut);
+	uevent->cm_id = cm_id;
+	ucma_set_event_context(ctx, event, uevent);
+	uevent->resp.event = event->event;
+	uevent->resp.status = event->status;
+	if (cm_id->qp_type == IB_QPT_UD)
+		ucma_copy_ud_event(&uevent->resp.param.ud, &event->param.ud);
+	else
+		ucma_copy_conn_event(&uevent->resp.param.conn,
+				     &event->param.conn);
+
+	if (event->event == RDMA_CM_EVENT_CONNECT_REQUEST) {
+		if (!ctx->backlog) {
+			ret = -ENOMEM;
+			kfree(uevent);
+			goto out;
+		}
+		ctx->backlog--;
+	} else if (!ctx->uid || ctx->cm_id != cm_id) {
+		/*
+		 * We ignore events for new connections until userspace has set
+		 * their context.  This can only happen if an error occurs on a
+		 * new connection before the user accepts it.  This is okay,
+		 * since the accept will just fail later.
+		 */
+		kfree(uevent);
+		goto out;
+	}
+
+	list_add_tail(&uevent->list, &ctx->file->event_list);
+	wake_up_interruptible(&ctx->file->poll_wait);
+out:
+	mutex_unlock(&ctx->file->mut);
+	return ret;
+}
+
+static ssize_t ucma_get_event(struct ucma_file *file, const char __user *inbuf,
+			      int in_len, int out_len)
+{
+	struct ucma_context *ctx;
+	struct rdma_ucm_get_event cmd;
+	struct ucma_event *uevent;
+	int ret = 0;
+
+	if (out_len < sizeof uevent->resp)
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	mutex_lock(&file->mut);
+	while (list_empty(&file->event_list)) {
+		mutex_unlock(&file->mut);
+
+		if (file->filp->f_flags & O_NONBLOCK)
+			return -EAGAIN;
+
+		if (wait_event_interruptible(file->poll_wait,
+					     !list_empty(&file->event_list)))
+			return -ERESTARTSYS;
+
+		mutex_lock(&file->mut);
+	}
+
+	uevent = list_entry(file->event_list.next, struct ucma_event, list);
+
+	if (uevent->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST) {
+		ctx = ucma_alloc_ctx(file);
+		if (!ctx) {
+			ret = -ENOMEM;
+			goto done;
+		}
+		uevent->ctx->backlog++;
+		ctx->cm_id = uevent->cm_id;
+		ctx->cm_id->context = ctx;
+		uevent->resp.id = ctx->id;
+	}
+
+	if (copy_to_user((void __user *)(unsigned long)cmd.response,
+			 &uevent->resp, sizeof uevent->resp)) {
+		ret = -EFAULT;
+		goto done;
+	}
+
+	list_del(&uevent->list);
+	uevent->ctx->events_reported++;
+	if (uevent->mc)
+		uevent->mc->events_reported++;
+	kfree(uevent);
+done:
+	mutex_unlock(&file->mut);
+	return ret;
+}
+
+static int ucma_get_qp_type(struct rdma_ucm_create_id *cmd, enum ib_qp_type *qp_type)
+{
+	switch (cmd->ps) {
+	case RDMA_PS_TCP:
+		*qp_type = IB_QPT_RC;
+		return 0;
+	case RDMA_PS_UDP:
+	case RDMA_PS_IPOIB:
+		*qp_type = IB_QPT_UD;
+		return 0;
+	case RDMA_PS_IB:
+		*qp_type = cmd->qp_type;
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
+
+static ssize_t ucma_create_id(struct ucma_file *file, const char __user *inbuf,
+			      int in_len, int out_len)
+{
+	struct rdma_ucm_create_id cmd;
+	struct rdma_ucm_create_id_resp resp;
+	struct ucma_context *ctx;
+	enum ib_qp_type qp_type;
+	int ret;
+
+	if (out_len < sizeof(resp))
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	ret = ucma_get_qp_type(&cmd, &qp_type);
+	if (ret)
+		return ret;
+
+	mutex_lock(&file->mut);
+	ctx = ucma_alloc_ctx(file);
+	mutex_unlock(&file->mut);
+	if (!ctx)
+		return -ENOMEM;
+
+	ctx->uid = cmd.uid;
+	ctx->cm_id = rdma_create_id(ucma_event_handler, ctx, cmd.ps, qp_type);
+	if (IS_ERR(ctx->cm_id)) {
+		ret = PTR_ERR(ctx->cm_id);
+		goto err1;
+	}
+
+	resp.id = ctx->id;
+	if (copy_to_user((void __user *)(unsigned long)cmd.response,
+			 &resp, sizeof(resp))) {
+		ret = -EFAULT;
+		goto err2;
+	}
+	return 0;
+
+err2:
+	rdma_destroy_id(ctx->cm_id);
+err1:
+	mutex_lock(&mut);
+	idr_remove(&ctx_idr, ctx->id);
+	mutex_unlock(&mut);
+	kfree(ctx);
+	return ret;
+}
+
+static void ucma_cleanup_multicast(struct ucma_context *ctx)
+{
+	struct ucma_multicast *mc, *tmp;
+
+	mutex_lock(&mut);
+	list_for_each_entry_safe(mc, tmp, &ctx->mc_list, list) {
+		list_del(&mc->list);
+		idr_remove(&multicast_idr, mc->id);
+		kfree(mc);
+	}
+	mutex_unlock(&mut);
+}
+
+static void ucma_cleanup_mc_events(struct ucma_multicast *mc)
+{
+	struct ucma_event *uevent, *tmp;
+
+	list_for_each_entry_safe(uevent, tmp, &mc->ctx->file->event_list, list) {
+		if (uevent->mc != mc)
+			continue;
+
+		list_del(&uevent->list);
+		kfree(uevent);
+	}
+}
+
+/*
+ * We cannot hold file->mut when calling rdma_destroy_id() or we can
+ * deadlock.  We also acquire file->mut in ucma_event_handler(), and
+ * rdma_destroy_id() will wait until all callbacks have completed.
+ */
+static int ucma_free_ctx(struct ucma_context *ctx)
+{
+	int events_reported;
+	struct ucma_event *uevent, *tmp;
+	LIST_HEAD(list);
+
+	/* No new events will be generated after destroying the id. */
+	rdma_destroy_id(ctx->cm_id);
+
+	ucma_cleanup_multicast(ctx);
+
+	/* Cleanup events not yet reported to the user. */
+	mutex_lock(&ctx->file->mut);
+	list_for_each_entry_safe(uevent, tmp, &ctx->file->event_list, list) {
+		if (uevent->ctx == ctx)
+			list_move_tail(&uevent->list, &list);
+	}
+	list_del(&ctx->list);
+	mutex_unlock(&ctx->file->mut);
+
+	list_for_each_entry_safe(uevent, tmp, &list, list) {
+		list_del(&uevent->list);
+		if (uevent->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST)
+			rdma_destroy_id(uevent->cm_id);
+		kfree(uevent);
+	}
+
+	events_reported = ctx->events_reported;
+	kfree(ctx);
+	return events_reported;
+}
+
+static ssize_t ucma_destroy_id(struct ucma_file *file, const char __user *inbuf,
+			       int in_len, int out_len)
+{
+	struct rdma_ucm_destroy_id cmd;
+	struct rdma_ucm_destroy_id_resp resp;
+	struct ucma_context *ctx;
+	int ret = 0;
+
+	if (out_len < sizeof(resp))
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	mutex_lock(&mut);
+	ctx = _ucma_find_context(cmd.id, file);
+	if (!IS_ERR(ctx))
+		idr_remove(&ctx_idr, ctx->id);
+	mutex_unlock(&mut);
+
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	ucma_put_ctx(ctx);
+	wait_for_completion(&ctx->comp);
+	resp.events_reported = ucma_free_ctx(ctx);
+
+	if (copy_to_user((void __user *)(unsigned long)cmd.response,
+			 &resp, sizeof(resp)))
+		ret = -EFAULT;
+
+	return ret;
+}
+
+static ssize_t ucma_bind_ip(struct ucma_file *file, const char __user *inbuf,
+			      int in_len, int out_len)
+{
+	struct rdma_ucm_bind_ip cmd;
+	struct ucma_context *ctx;
+	int ret;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	ctx = ucma_get_ctx(file, cmd.id);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	ret = rdma_bind_addr(ctx->cm_id, (struct sockaddr *) &cmd.addr);
+	ucma_put_ctx(ctx);
+	return ret;
+}
+
+static ssize_t ucma_bind(struct ucma_file *file, const char __user *inbuf,
+			 int in_len, int out_len)
+{
+	struct rdma_ucm_bind cmd;
+	struct sockaddr *addr;
+	struct ucma_context *ctx;
+	int ret;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	addr = (struct sockaddr *) &cmd.addr;
+	if (cmd.reserved || !cmd.addr_size || (cmd.addr_size != rdma_addr_size(addr)))
+		return -EINVAL;
+
+	ctx = ucma_get_ctx(file, cmd.id);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	ret = rdma_bind_addr(ctx->cm_id, addr);
+	ucma_put_ctx(ctx);
+	return ret;
+}
+
+static ssize_t ucma_resolve_ip(struct ucma_file *file,
+			       const char __user *inbuf,
+			       int in_len, int out_len)
+{
+	struct rdma_ucm_resolve_ip cmd;
+	struct ucma_context *ctx;
+	int ret;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	ctx = ucma_get_ctx(file, cmd.id);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	ret = rdma_resolve_addr(ctx->cm_id, (struct sockaddr *) &cmd.src_addr,
+				(struct sockaddr *) &cmd.dst_addr,
+				cmd.timeout_ms);
+	ucma_put_ctx(ctx);
+	return ret;
+}
+
+static ssize_t ucma_resolve_addr(struct ucma_file *file,
+				 const char __user *inbuf,
+				 int in_len, int out_len)
+{
+	struct rdma_ucm_resolve_addr cmd;
+	struct sockaddr *src, *dst;
+	struct ucma_context *ctx;
+	int ret;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	src = (struct sockaddr *) &cmd.src_addr;
+	dst = (struct sockaddr *) &cmd.dst_addr;
+	if (cmd.reserved || (cmd.src_size && (cmd.src_size != rdma_addr_size(src))) ||
+	    !cmd.dst_size || (cmd.dst_size != rdma_addr_size(dst)))
+		return -EINVAL;
+
+	ctx = ucma_get_ctx(file, cmd.id);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	ret = rdma_resolve_addr(ctx->cm_id, src, dst, cmd.timeout_ms);
+	ucma_put_ctx(ctx);
+	return ret;
+}
+
+static ssize_t ucma_resolve_route(struct ucma_file *file,
+				  const char __user *inbuf,
+				  int in_len, int out_len)
+{
+	struct rdma_ucm_resolve_route cmd;
+	struct ucma_context *ctx;
+	int ret;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	ctx = ucma_get_ctx(file, cmd.id);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	ret = rdma_resolve_route(ctx->cm_id, cmd.timeout_ms);
+	ucma_put_ctx(ctx);
+	return ret;
+}
+
+static void ucma_copy_ib_route(struct rdma_ucm_query_route_resp *resp,
+			       struct rdma_route *route)
+{
+	struct rdma_dev_addr *dev_addr;
+
+	resp->num_paths = route->num_paths;
+	switch (route->num_paths) {
+	case 0:
+		dev_addr = &route->addr.dev_addr;
+		rdma_addr_get_dgid(dev_addr,
+				   (union ib_gid *) &resp->ib_route[0].dgid);
+		rdma_addr_get_sgid(dev_addr,
+				   (union ib_gid *) &resp->ib_route[0].sgid);
+		resp->ib_route[0].pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr));
+		break;
+	case 2:
+		ib_copy_path_rec_to_user(&resp->ib_route[1],
+					 &route->path_rec[1]);
+		/* fall through */
+	case 1:
+		ib_copy_path_rec_to_user(&resp->ib_route[0],
+					 &route->path_rec[0]);
+		break;
+	default:
+		break;
+	}
+}
+
+static void ucma_copy_iboe_route(struct rdma_ucm_query_route_resp *resp,
+				 struct rdma_route *route)
+{
+
+	resp->num_paths = route->num_paths;
+	switch (route->num_paths) {
+	case 0:
+		rdma_ip2gid((struct sockaddr *)&route->addr.dst_addr,
+			    (union ib_gid *)&resp->ib_route[0].dgid);
+		rdma_ip2gid((struct sockaddr *)&route->addr.src_addr,
+			    (union ib_gid *)&resp->ib_route[0].sgid);
+		resp->ib_route[0].pkey = cpu_to_be16(0xffff);
+		break;
+	case 2:
+		ib_copy_path_rec_to_user(&resp->ib_route[1],
+					 &route->path_rec[1]);
+		/* fall through */
+	case 1:
+		ib_copy_path_rec_to_user(&resp->ib_route[0],
+					 &route->path_rec[0]);
+		break;
+	default:
+		break;
+	}
+}
+
+static void ucma_copy_iw_route(struct rdma_ucm_query_route_resp *resp,
+			       struct rdma_route *route)
+{
+	struct rdma_dev_addr *dev_addr;
+
+	dev_addr = &route->addr.dev_addr;
+	rdma_addr_get_dgid(dev_addr, (union ib_gid *) &resp->ib_route[0].dgid);
+	rdma_addr_get_sgid(dev_addr, (union ib_gid *) &resp->ib_route[0].sgid);
+}
+
+static ssize_t ucma_query_route(struct ucma_file *file,
+				const char __user *inbuf,
+				int in_len, int out_len)
+{
+	struct rdma_ucm_query cmd;
+	struct rdma_ucm_query_route_resp resp;
+	struct ucma_context *ctx;
+	struct sockaddr *addr;
+	int ret = 0;
+
+	if (out_len < sizeof(resp))
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	ctx = ucma_get_ctx(file, cmd.id);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	memset(&resp, 0, sizeof resp);
+	addr = (struct sockaddr *) &ctx->cm_id->route.addr.src_addr;
+	memcpy(&resp.src_addr, addr, addr->sa_family == AF_INET ?
+				     sizeof(struct sockaddr_in) :
+				     sizeof(struct sockaddr_in6));
+	addr = (struct sockaddr *) &ctx->cm_id->route.addr.dst_addr;
+	memcpy(&resp.dst_addr, addr, addr->sa_family == AF_INET ?
+				     sizeof(struct sockaddr_in) :
+				     sizeof(struct sockaddr_in6));
+	if (!ctx->cm_id->device)
+		goto out;
+
+	resp.node_guid = (__force __u64) ctx->cm_id->device->node_guid;
+	resp.port_num = ctx->cm_id->port_num;
+	switch (rdma_node_get_transport(ctx->cm_id->device->node_type)) {
+	case RDMA_TRANSPORT_IB:
+		switch (rdma_port_get_link_layer(ctx->cm_id->device,
+			ctx->cm_id->port_num)) {
+		case IB_LINK_LAYER_INFINIBAND:
+			ucma_copy_ib_route(&resp, &ctx->cm_id->route);
+			break;
+		case IB_LINK_LAYER_ETHERNET:
+			ucma_copy_iboe_route(&resp, &ctx->cm_id->route);
+			break;
+		default:
+			break;
+		}
+		break;
+	case RDMA_TRANSPORT_IWARP:
+		ucma_copy_iw_route(&resp, &ctx->cm_id->route);
+		break;
+	default:
+		break;
+	}
+
+out:
+	if (copy_to_user((void __user *)(unsigned long)cmd.response,
+			 &resp, sizeof(resp)))
+		ret = -EFAULT;
+
+	ucma_put_ctx(ctx);
+	return ret;
+}
+
+static void ucma_query_device_addr(struct rdma_cm_id *cm_id,
+				   struct rdma_ucm_query_addr_resp *resp)
+{
+	if (!cm_id->device)
+		return;
+
+	resp->node_guid = (__force __u64) cm_id->device->node_guid;
+	resp->port_num = cm_id->port_num;
+	resp->pkey = (__force __u16) cpu_to_be16(
+		     ib_addr_get_pkey(&cm_id->route.addr.dev_addr));
+}
+
+static ssize_t ucma_query_addr(struct ucma_context *ctx,
+			       void __user *response, int out_len)
+{
+	struct rdma_ucm_query_addr_resp resp;
+	struct sockaddr *addr;
+	int ret = 0;
+
+	if (out_len < sizeof(resp))
+		return -ENOSPC;
+
+	memset(&resp, 0, sizeof resp);
+
+	addr = (struct sockaddr *) &ctx->cm_id->route.addr.src_addr;
+	resp.src_size = rdma_addr_size(addr);
+	memcpy(&resp.src_addr, addr, resp.src_size);
+
+	addr = (struct sockaddr *) &ctx->cm_id->route.addr.dst_addr;
+	resp.dst_size = rdma_addr_size(addr);
+	memcpy(&resp.dst_addr, addr, resp.dst_size);
+
+	ucma_query_device_addr(ctx->cm_id, &resp);
+
+	if (copy_to_user(response, &resp, sizeof(resp)))
+		ret = -EFAULT;
+
+	return ret;
+}
+
+static ssize_t ucma_query_path(struct ucma_context *ctx,
+			       void __user *response, int out_len)
+{
+	struct rdma_ucm_query_path_resp *resp;
+	int i, ret = 0;
+
+	if (out_len < sizeof(*resp))
+		return -ENOSPC;
+
+	resp = kzalloc(out_len, GFP_KERNEL);
+	if (!resp)
+		return -ENOMEM;
+
+	resp->num_paths = ctx->cm_id->route.num_paths;
+	for (i = 0, out_len -= sizeof(*resp);
+	     i < resp->num_paths && out_len > sizeof(struct ib_path_rec_data);
+	     i++, out_len -= sizeof(struct ib_path_rec_data)) {
+
+		resp->path_data[i].flags = IB_PATH_GMP | IB_PATH_PRIMARY |
+					   IB_PATH_BIDIRECTIONAL;
+		ib_sa_pack_path(&ctx->cm_id->route.path_rec[i],
+				&resp->path_data[i].path_rec);
+	}
+
+	if (copy_to_user(response, resp,
+			 sizeof(*resp) + (i * sizeof(struct ib_path_rec_data))))
+		ret = -EFAULT;
+
+	kfree(resp);
+	return ret;
+}
+
+static ssize_t ucma_query_gid(struct ucma_context *ctx,
+			      void __user *response, int out_len)
+{
+	struct rdma_ucm_query_addr_resp resp;
+	struct sockaddr_ib *addr;
+	int ret = 0;
+
+	if (out_len < sizeof(resp))
+		return -ENOSPC;
+
+	memset(&resp, 0, sizeof resp);
+
+	ucma_query_device_addr(ctx->cm_id, &resp);
+
+	addr = (struct sockaddr_ib *) &resp.src_addr;
+	resp.src_size = sizeof(*addr);
+	if (ctx->cm_id->route.addr.src_addr.ss_family == AF_IB) {
+		memcpy(addr, &ctx->cm_id->route.addr.src_addr, resp.src_size);
+	} else {
+		addr->sib_family = AF_IB;
+		addr->sib_pkey = (__force __be16) resp.pkey;
+		rdma_addr_get_sgid(&ctx->cm_id->route.addr.dev_addr,
+				   (union ib_gid *) &addr->sib_addr);
+		addr->sib_sid = rdma_get_service_id(ctx->cm_id, (struct sockaddr *)
+						    &ctx->cm_id->route.addr.src_addr);
+	}
+
+	addr = (struct sockaddr_ib *) &resp.dst_addr;
+	resp.dst_size = sizeof(*addr);
+	if (ctx->cm_id->route.addr.dst_addr.ss_family == AF_IB) {
+		memcpy(addr, &ctx->cm_id->route.addr.dst_addr, resp.dst_size);
+	} else {
+		addr->sib_family = AF_IB;
+		addr->sib_pkey = (__force __be16) resp.pkey;
+		rdma_addr_get_dgid(&ctx->cm_id->route.addr.dev_addr,
+				   (union ib_gid *) &addr->sib_addr);
+		addr->sib_sid = rdma_get_service_id(ctx->cm_id, (struct sockaddr *)
+						    &ctx->cm_id->route.addr.dst_addr);
+	}
+
+	if (copy_to_user(response, &resp, sizeof(resp)))
+		ret = -EFAULT;
+
+	return ret;
+}
+
+static ssize_t ucma_query(struct ucma_file *file,
+			  const char __user *inbuf,
+			  int in_len, int out_len)
+{
+	struct rdma_ucm_query cmd;
+	struct ucma_context *ctx;
+	void __user *response;
+	int ret;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	response = (void __user *)(unsigned long) cmd.response;
+	ctx = ucma_get_ctx(file, cmd.id);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	switch (cmd.option) {
+	case RDMA_USER_CM_QUERY_ADDR:
+		ret = ucma_query_addr(ctx, response, out_len);
+		break;
+	case RDMA_USER_CM_QUERY_PATH:
+		ret = ucma_query_path(ctx, response, out_len);
+		break;
+	case RDMA_USER_CM_QUERY_GID:
+		ret = ucma_query_gid(ctx, response, out_len);
+		break;
+	default:
+		ret = -ENOSYS;
+		break;
+	}
+
+	ucma_put_ctx(ctx);
+	return ret;
+}
+
+static void ucma_copy_conn_param(struct rdma_cm_id *id,
+				 struct rdma_conn_param *dst,
+				 struct rdma_ucm_conn_param *src)
+{
+	dst->private_data = src->private_data;
+	dst->private_data_len = src->private_data_len;
+	dst->responder_resources =src->responder_resources;
+	dst->initiator_depth = src->initiator_depth;
+	dst->flow_control = src->flow_control;
+	dst->retry_count = src->retry_count;
+	dst->rnr_retry_count = src->rnr_retry_count;
+	dst->srq = src->srq;
+	dst->qp_num = src->qp_num;
+	dst->qkey = (id->route.addr.src_addr.ss_family == AF_IB) ? src->qkey : 0;
+}
+
+static ssize_t ucma_connect(struct ucma_file *file, const char __user *inbuf,
+			    int in_len, int out_len)
+{
+	struct rdma_ucm_connect cmd;
+	struct rdma_conn_param conn_param;
+	struct ucma_context *ctx;
+	int ret;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	if (!cmd.conn_param.valid)
+		return -EINVAL;
+
+	ctx = ucma_get_ctx(file, cmd.id);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	ucma_copy_conn_param(ctx->cm_id, &conn_param, &cmd.conn_param);
+	ret = rdma_connect(ctx->cm_id, &conn_param);
+	ucma_put_ctx(ctx);
+	return ret;
+}
+
+static ssize_t ucma_listen(struct ucma_file *file, const char __user *inbuf,
+			   int in_len, int out_len)
+{
+	struct rdma_ucm_listen cmd;
+	struct ucma_context *ctx;
+	int ret;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	ctx = ucma_get_ctx(file, cmd.id);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	ctx->backlog = cmd.backlog > 0 && cmd.backlog < max_backlog ?
+		       cmd.backlog : max_backlog;
+	ret = rdma_listen(ctx->cm_id, ctx->backlog);
+	ucma_put_ctx(ctx);
+	return ret;
+}
+
+static ssize_t ucma_accept(struct ucma_file *file, const char __user *inbuf,
+			   int in_len, int out_len)
+{
+	struct rdma_ucm_accept cmd;
+	struct rdma_conn_param conn_param;
+	struct ucma_context *ctx;
+	int ret;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	ctx = ucma_get_ctx(file, cmd.id);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	if (cmd.conn_param.valid) {
+		ucma_copy_conn_param(ctx->cm_id, &conn_param, &cmd.conn_param);
+		mutex_lock(&file->mut);
+		ret = rdma_accept(ctx->cm_id, &conn_param);
+		if (!ret)
+			ctx->uid = cmd.uid;
+		mutex_unlock(&file->mut);
+	} else
+		ret = rdma_accept(ctx->cm_id, NULL);
+
+	ucma_put_ctx(ctx);
+	return ret;
+}
+
+static ssize_t ucma_reject(struct ucma_file *file, const char __user *inbuf,
+			   int in_len, int out_len)
+{
+	struct rdma_ucm_reject cmd;
+	struct ucma_context *ctx;
+	int ret;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	ctx = ucma_get_ctx(file, cmd.id);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	ret = rdma_reject(ctx->cm_id, cmd.private_data, cmd.private_data_len);
+	ucma_put_ctx(ctx);
+	return ret;
+}
+
+static ssize_t ucma_disconnect(struct ucma_file *file, const char __user *inbuf,
+			       int in_len, int out_len)
+{
+	struct rdma_ucm_disconnect cmd;
+	struct ucma_context *ctx;
+	int ret;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	ctx = ucma_get_ctx(file, cmd.id);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	ret = rdma_disconnect(ctx->cm_id);
+	ucma_put_ctx(ctx);
+	return ret;
+}
+
+static ssize_t ucma_init_qp_attr(struct ucma_file *file,
+				 const char __user *inbuf,
+				 int in_len, int out_len)
+{
+	struct rdma_ucm_init_qp_attr cmd;
+	struct ib_uverbs_qp_attr resp;
+	struct ucma_context *ctx;
+	struct ib_qp_attr qp_attr;
+	int ret;
+
+	if (out_len < sizeof(resp))
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	ctx = ucma_get_ctx(file, cmd.id);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	resp.qp_attr_mask = 0;
+	memset(&qp_attr, 0, sizeof qp_attr);
+	qp_attr.qp_state = cmd.qp_state;
+	ret = rdma_init_qp_attr(ctx->cm_id, &qp_attr, &resp.qp_attr_mask);
+	if (ret)
+		goto out;
+
+	ib_copy_qp_attr_to_user(&resp, &qp_attr);
+	if (copy_to_user((void __user *)(unsigned long)cmd.response,
+			 &resp, sizeof(resp)))
+		ret = -EFAULT;
+
+out:
+	ucma_put_ctx(ctx);
+	return ret;
+}
+
+static int ucma_set_option_id(struct ucma_context *ctx, int optname,
+			      void *optval, size_t optlen)
+{
+	int ret = 0;
+
+	switch (optname) {
+	case RDMA_OPTION_ID_TOS:
+		if (optlen != sizeof(u8)) {
+			ret = -EINVAL;
+			break;
+		}
+		rdma_set_service_type(ctx->cm_id, *((u8 *) optval));
+		break;
+	case RDMA_OPTION_ID_REUSEADDR:
+		if (optlen != sizeof(int)) {
+			ret = -EINVAL;
+			break;
+		}
+		ret = rdma_set_reuseaddr(ctx->cm_id, *((int *) optval) ? 1 : 0);
+		break;
+	case RDMA_OPTION_ID_AFONLY:
+		if (optlen != sizeof(int)) {
+			ret = -EINVAL;
+			break;
+		}
+		ret = rdma_set_afonly(ctx->cm_id, *((int *) optval) ? 1 : 0);
+		break;
+	default:
+		ret = -ENOSYS;
+	}
+
+	return ret;
+}
+
+static int ucma_set_ib_path(struct ucma_context *ctx,
+			    struct ib_path_rec_data *path_data, size_t optlen)
+{
+	struct ib_sa_path_rec sa_path;
+	struct rdma_cm_event event;
+	int ret;
+
+	if (optlen % sizeof(*path_data))
+		return -EINVAL;
+
+	for (; optlen; optlen -= sizeof(*path_data), path_data++) {
+		if (path_data->flags == (IB_PATH_GMP | IB_PATH_PRIMARY |
+					 IB_PATH_BIDIRECTIONAL))
+			break;
+	}
+
+	if (!optlen)
+		return -EINVAL;
+
+	ib_sa_unpack_path(path_data->path_rec, &sa_path);
+	ret = rdma_set_ib_paths(ctx->cm_id, &sa_path, 1);
+	if (ret)
+		return ret;
+
+	memset(&event, 0, sizeof event);
+	event.event = RDMA_CM_EVENT_ROUTE_RESOLVED;
+	return ucma_event_handler(ctx->cm_id, &event);
+}
+
+static int ucma_set_option_ib(struct ucma_context *ctx, int optname,
+			      void *optval, size_t optlen)
+{
+	int ret;
+
+	switch (optname) {
+	case RDMA_OPTION_IB_PATH:
+		ret = ucma_set_ib_path(ctx, optval, optlen);
+		break;
+	default:
+		ret = -ENOSYS;
+	}
+
+	return ret;
+}
+
+static int ucma_set_option_level(struct ucma_context *ctx, int level,
+				 int optname, void *optval, size_t optlen)
+{
+	int ret;
+
+	switch (level) {
+	case RDMA_OPTION_ID:
+		ret = ucma_set_option_id(ctx, optname, optval, optlen);
+		break;
+	case RDMA_OPTION_IB:
+		ret = ucma_set_option_ib(ctx, optname, optval, optlen);
+		break;
+	default:
+		ret = -ENOSYS;
+	}
+
+	return ret;
+}
+
+static ssize_t ucma_set_option(struct ucma_file *file, const char __user *inbuf,
+			       int in_len, int out_len)
+{
+	struct rdma_ucm_set_option cmd;
+	struct ucma_context *ctx;
+	void *optval;
+	int ret;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	ctx = ucma_get_ctx(file, cmd.id);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	optval = memdup_user((void __user *) (unsigned long) cmd.optval,
+			     cmd.optlen);
+	if (IS_ERR(optval)) {
+		ret = PTR_ERR(optval);
+		goto out;
+	}
+
+	ret = ucma_set_option_level(ctx, cmd.level, cmd.optname, optval,
+				    cmd.optlen);
+	kfree(optval);
+
+out:
+	ucma_put_ctx(ctx);
+	return ret;
+}
+
+static ssize_t ucma_notify(struct ucma_file *file, const char __user *inbuf,
+			   int in_len, int out_len)
+{
+	struct rdma_ucm_notify cmd;
+	struct ucma_context *ctx;
+	int ret;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	ctx = ucma_get_ctx(file, cmd.id);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	ret = rdma_notify(ctx->cm_id, (enum ib_event_type) cmd.event);
+	ucma_put_ctx(ctx);
+	return ret;
+}
+
+static ssize_t ucma_process_join(struct ucma_file *file,
+				 struct rdma_ucm_join_mcast *cmd,  int out_len)
+{
+	struct rdma_ucm_create_id_resp resp;
+	struct ucma_context *ctx;
+	struct ucma_multicast *mc;
+	struct sockaddr *addr;
+	int ret;
+
+	if (out_len < sizeof(resp))
+		return -ENOSPC;
+
+	addr = (struct sockaddr *) &cmd->addr;
+	if (cmd->reserved || !cmd->addr_size || (cmd->addr_size != rdma_addr_size(addr)))
+		return -EINVAL;
+
+	ctx = ucma_get_ctx(file, cmd->id);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	mutex_lock(&file->mut);
+	mc = ucma_alloc_multicast(ctx);
+	if (!mc) {
+		ret = -ENOMEM;
+		goto err1;
+	}
+
+	mc->uid = cmd->uid;
+	memcpy(&mc->addr, addr, cmd->addr_size);
+	ret = rdma_join_multicast(ctx->cm_id, (struct sockaddr *) &mc->addr, mc);
+	if (ret)
+		goto err2;
+
+	resp.id = mc->id;
+	if (copy_to_user((void __user *)(unsigned long) cmd->response,
+			 &resp, sizeof(resp))) {
+		ret = -EFAULT;
+		goto err3;
+	}
+
+	mutex_unlock(&file->mut);
+	ucma_put_ctx(ctx);
+	return 0;
+
+err3:
+	rdma_leave_multicast(ctx->cm_id, (struct sockaddr *) &mc->addr);
+	ucma_cleanup_mc_events(mc);
+err2:
+	mutex_lock(&mut);
+	idr_remove(&multicast_idr, mc->id);
+	mutex_unlock(&mut);
+	list_del(&mc->list);
+	kfree(mc);
+err1:
+	mutex_unlock(&file->mut);
+	ucma_put_ctx(ctx);
+	return ret;
+}
+
+static ssize_t ucma_join_ip_multicast(struct ucma_file *file,
+				      const char __user *inbuf,
+				      int in_len, int out_len)
+{
+	struct rdma_ucm_join_ip_mcast cmd;
+	struct rdma_ucm_join_mcast join_cmd;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	join_cmd.response = cmd.response;
+	join_cmd.uid = cmd.uid;
+	join_cmd.id = cmd.id;
+	join_cmd.addr_size = rdma_addr_size((struct sockaddr *) &cmd.addr);
+	join_cmd.reserved = 0;
+	memcpy(&join_cmd.addr, &cmd.addr, join_cmd.addr_size);
+
+	return ucma_process_join(file, &join_cmd, out_len);
+}
+
+static ssize_t ucma_join_multicast(struct ucma_file *file,
+				   const char __user *inbuf,
+				   int in_len, int out_len)
+{
+	struct rdma_ucm_join_mcast cmd;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	return ucma_process_join(file, &cmd, out_len);
+}
+
+static ssize_t ucma_leave_multicast(struct ucma_file *file,
+				    const char __user *inbuf,
+				    int in_len, int out_len)
+{
+	struct rdma_ucm_destroy_id cmd;
+	struct rdma_ucm_destroy_id_resp resp;
+	struct ucma_multicast *mc;
+	int ret = 0;
+
+	if (out_len < sizeof(resp))
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	mutex_lock(&mut);
+	mc = idr_find(&multicast_idr, cmd.id);
+	if (!mc)
+		mc = ERR_PTR(-ENOENT);
+	else if (mc->ctx->file != file)
+		mc = ERR_PTR(-EINVAL);
+	else {
+		idr_remove(&multicast_idr, mc->id);
+		atomic_inc(&mc->ctx->ref);
+	}
+	mutex_unlock(&mut);
+
+	if (IS_ERR(mc)) {
+		ret = PTR_ERR(mc);
+		goto out;
+	}
+
+	rdma_leave_multicast(mc->ctx->cm_id, (struct sockaddr *) &mc->addr);
+	mutex_lock(&mc->ctx->file->mut);
+	ucma_cleanup_mc_events(mc);
+	list_del(&mc->list);
+	mutex_unlock(&mc->ctx->file->mut);
+
+	ucma_put_ctx(mc->ctx);
+	resp.events_reported = mc->events_reported;
+	kfree(mc);
+
+	if (copy_to_user((void __user *)(unsigned long)cmd.response,
+			 &resp, sizeof(resp)))
+		ret = -EFAULT;
+out:
+	return ret;
+}
+
+static void ucma_lock_files(struct ucma_file *file1, struct ucma_file *file2)
+{
+	/* Acquire mutex's based on pointer comparison to prevent deadlock. */
+	if (file1 < file2) {
+		mutex_lock(&file1->mut);
+		mutex_lock(&file2->mut);
+	} else {
+		mutex_lock(&file2->mut);
+		mutex_lock(&file1->mut);
+	}
+}
+
+static void ucma_unlock_files(struct ucma_file *file1, struct ucma_file *file2)
+{
+	if (file1 < file2) {
+		mutex_unlock(&file2->mut);
+		mutex_unlock(&file1->mut);
+	} else {
+		mutex_unlock(&file1->mut);
+		mutex_unlock(&file2->mut);
+	}
+}
+
+static void ucma_move_events(struct ucma_context *ctx, struct ucma_file *file)
+{
+	struct ucma_event *uevent, *tmp;
+
+	list_for_each_entry_safe(uevent, tmp, &ctx->file->event_list, list)
+		if (uevent->ctx == ctx)
+			list_move_tail(&uevent->list, &file->event_list);
+}
+
+static ssize_t ucma_migrate_id(struct ucma_file *new_file,
+			       const char __user *inbuf,
+			       int in_len, int out_len)
+{
+	struct rdma_ucm_migrate_id cmd;
+	struct rdma_ucm_migrate_resp resp;
+	struct ucma_context *ctx;
+	struct fd f;
+	struct ucma_file *cur_file;
+	int ret = 0;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	/* Get current fd to protect against it being closed */
+	f = fdget(cmd.fd);
+	if (!f.file)
+		return -ENOENT;
+
+	/* Validate current fd and prevent destruction of id. */
+	ctx = ucma_get_ctx(f.file->private_data, cmd.id);
+	if (IS_ERR(ctx)) {
+		ret = PTR_ERR(ctx);
+		goto file_put;
+	}
+
+	cur_file = ctx->file;
+	if (cur_file == new_file) {
+		resp.events_reported = ctx->events_reported;
+		goto response;
+	}
+
+	/*
+	 * Migrate events between fd's, maintaining order, and avoiding new
+	 * events being added before existing events.
+	 */
+	ucma_lock_files(cur_file, new_file);
+	mutex_lock(&mut);
+
+	list_move_tail(&ctx->list, &new_file->ctx_list);
+	ucma_move_events(ctx, new_file);
+	ctx->file = new_file;
+	resp.events_reported = ctx->events_reported;
+
+	mutex_unlock(&mut);
+	ucma_unlock_files(cur_file, new_file);
+
+response:
+	if (copy_to_user((void __user *)(unsigned long)cmd.response,
+			 &resp, sizeof(resp)))
+		ret = -EFAULT;
+
+	ucma_put_ctx(ctx);
+file_put:
+	fdput(f);
+	return ret;
+}
+
+static ssize_t (*ucma_cmd_table[])(struct ucma_file *file,
+				   const char __user *inbuf,
+				   int in_len, int out_len) = {
+	[RDMA_USER_CM_CMD_CREATE_ID] 	 = ucma_create_id,
+	[RDMA_USER_CM_CMD_DESTROY_ID]	 = ucma_destroy_id,
+	[RDMA_USER_CM_CMD_BIND_IP]	 = ucma_bind_ip,
+	[RDMA_USER_CM_CMD_RESOLVE_IP]	 = ucma_resolve_ip,
+	[RDMA_USER_CM_CMD_RESOLVE_ROUTE] = ucma_resolve_route,
+	[RDMA_USER_CM_CMD_QUERY_ROUTE]	 = ucma_query_route,
+	[RDMA_USER_CM_CMD_CONNECT]	 = ucma_connect,
+	[RDMA_USER_CM_CMD_LISTEN]	 = ucma_listen,
+	[RDMA_USER_CM_CMD_ACCEPT]	 = ucma_accept,
+	[RDMA_USER_CM_CMD_REJECT]	 = ucma_reject,
+	[RDMA_USER_CM_CMD_DISCONNECT]	 = ucma_disconnect,
+	[RDMA_USER_CM_CMD_INIT_QP_ATTR]	 = ucma_init_qp_attr,
+	[RDMA_USER_CM_CMD_GET_EVENT]	 = ucma_get_event,
+	[RDMA_USER_CM_CMD_GET_OPTION]	 = NULL,
+	[RDMA_USER_CM_CMD_SET_OPTION]	 = ucma_set_option,
+	[RDMA_USER_CM_CMD_NOTIFY]	 = ucma_notify,
+	[RDMA_USER_CM_CMD_JOIN_IP_MCAST] = ucma_join_ip_multicast,
+	[RDMA_USER_CM_CMD_LEAVE_MCAST]	 = ucma_leave_multicast,
+	[RDMA_USER_CM_CMD_MIGRATE_ID]	 = ucma_migrate_id,
+	[RDMA_USER_CM_CMD_QUERY]	 = ucma_query,
+	[RDMA_USER_CM_CMD_BIND]		 = ucma_bind,
+	[RDMA_USER_CM_CMD_RESOLVE_ADDR]	 = ucma_resolve_addr,
+	[RDMA_USER_CM_CMD_JOIN_MCAST]	 = ucma_join_multicast
+};
+
+static ssize_t ucma_write(struct file *filp, const char __user *buf,
+			  size_t len, loff_t *pos)
+{
+	struct ucma_file *file = filp->private_data;
+	struct rdma_ucm_cmd_hdr hdr;
+	ssize_t ret;
+
+	if (len < sizeof(hdr))
+		return -EINVAL;
+
+	if (copy_from_user(&hdr, buf, sizeof(hdr)))
+		return -EFAULT;
+
+	if (hdr.cmd >= ARRAY_SIZE(ucma_cmd_table))
+		return -EINVAL;
+
+	if (hdr.in + sizeof(hdr) > len)
+		return -EINVAL;
+
+	if (!ucma_cmd_table[hdr.cmd])
+		return -ENOSYS;
+
+	ret = ucma_cmd_table[hdr.cmd](file, buf + sizeof(hdr), hdr.in, hdr.out);
+	if (!ret)
+		ret = len;
+
+	return ret;
+}
+
+static unsigned int ucma_poll(struct file *filp, struct poll_table_struct *wait)
+{
+	struct ucma_file *file = filp->private_data;
+	unsigned int mask = 0;
+
+	poll_wait(filp, &file->poll_wait, wait);
+
+	if (!list_empty(&file->event_list))
+		mask = POLLIN | POLLRDNORM;
+
+	return mask;
+}
+
+/*
+ * ucma_open() does not need the BKL:
+ *
+ *  - no global state is referred to;
+ *  - there is no ioctl method to race against;
+ *  - no further module initialization is required for open to work
+ *    after the device is registered.
+ */
+static int ucma_open(struct inode *inode, struct file *filp)
+{
+	struct ucma_file *file;
+
+	file = kmalloc(sizeof *file, GFP_KERNEL);
+	if (!file)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&file->event_list);
+	INIT_LIST_HEAD(&file->ctx_list);
+	init_waitqueue_head(&file->poll_wait);
+	mutex_init(&file->mut);
+
+	filp->private_data = file;
+	file->filp = filp;
+
+	return nonseekable_open(inode, filp);
+}
+
+static int ucma_close(struct inode *inode, struct file *filp)
+{
+	struct ucma_file *file = filp->private_data;
+	struct ucma_context *ctx, *tmp;
+
+	mutex_lock(&file->mut);
+	list_for_each_entry_safe(ctx, tmp, &file->ctx_list, list) {
+		mutex_unlock(&file->mut);
+
+		mutex_lock(&mut);
+		idr_remove(&ctx_idr, ctx->id);
+		mutex_unlock(&mut);
+
+		ucma_free_ctx(ctx);
+		mutex_lock(&file->mut);
+	}
+	mutex_unlock(&file->mut);
+	kfree(file);
+	return 0;
+}
+
+static const struct file_operations ucma_fops = {
+	.owner 	 = THIS_MODULE,
+	.open 	 = ucma_open,
+	.release = ucma_close,
+	.write	 = ucma_write,
+	.poll    = ucma_poll,
+	.llseek	 = no_llseek,
+};
+
+static struct miscdevice ucma_misc = {
+	.minor		= MISC_DYNAMIC_MINOR,
+	.name		= "rdma_cm",
+	.nodename	= "infiniband/rdma_cm",
+	.mode		= 0666,
+	.fops		= &ucma_fops,
+};
+
+static ssize_t show_abi_version(struct device *dev,
+				struct device_attribute *attr,
+				char *buf)
+{
+	return sprintf(buf, "%d\n", RDMA_USER_CM_ABI_VERSION);
+}
+static DEVICE_ATTR(abi_version, S_IRUGO, show_abi_version, NULL);
+
+static int __init ucma_init(void)
+{
+	int ret;
+
+	ret = misc_register(&ucma_misc);
+	if (ret)
+		return ret;
+
+	ret = device_create_file(ucma_misc.this_device, &dev_attr_abi_version);
+	if (ret) {
+		printk(KERN_ERR "rdma_ucm: couldn't create abi_version attr\n");
+		goto err1;
+	}
+
+	ucma_ctl_table_hdr = register_net_sysctl(&init_net, "net/rdma_ucm", ucma_ctl_table);
+	if (!ucma_ctl_table_hdr) {
+		printk(KERN_ERR "rdma_ucm: couldn't register sysctl paths\n");
+		ret = -ENOMEM;
+		goto err2;
+	}
+	return 0;
+err2:
+	device_remove_file(ucma_misc.this_device, &dev_attr_abi_version);
+err1:
+	misc_deregister(&ucma_misc);
+	return ret;
+}
+
+static void __exit ucma_cleanup(void)
+{
+	unregister_net_sysctl_table(ucma_ctl_table_hdr);
+	device_remove_file(ucma_misc.this_device, &dev_attr_abi_version);
+	misc_deregister(&ucma_misc);
+	idr_destroy(&ctx_idr);
+}
+
+module_init(ucma_init);
+module_exit(ucma_cleanup);
diff --git a/drivers/infiniband/core/ud_header.c b/drivers/infiniband/core/ud_header.c
new file mode 100644
index 0000000..72feee6
--- /dev/null
+++ b/drivers/infiniband/core/ud_header.c
@@ -0,0 +1,414 @@
+/*
+ * Copyright (c) 2004 Topspin Corporation.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/export.h>
+#include <linux/if_ether.h>
+
+#include <rdma/ib_pack.h>
+
+#define STRUCT_FIELD(header, field) \
+	.struct_offset_bytes = offsetof(struct ib_unpacked_ ## header, field),      \
+	.struct_size_bytes   = sizeof ((struct ib_unpacked_ ## header *) 0)->field, \
+	.field_name          = #header ":" #field
+
+static const struct ib_field lrh_table[]  = {
+	{ STRUCT_FIELD(lrh, virtual_lane),
+	  .offset_words = 0,
+	  .offset_bits  = 0,
+	  .size_bits    = 4 },
+	{ STRUCT_FIELD(lrh, link_version),
+	  .offset_words = 0,
+	  .offset_bits  = 4,
+	  .size_bits    = 4 },
+	{ STRUCT_FIELD(lrh, service_level),
+	  .offset_words = 0,
+	  .offset_bits  = 8,
+	  .size_bits    = 4 },
+	{ RESERVED,
+	  .offset_words = 0,
+	  .offset_bits  = 12,
+	  .size_bits    = 2 },
+	{ STRUCT_FIELD(lrh, link_next_header),
+	  .offset_words = 0,
+	  .offset_bits  = 14,
+	  .size_bits    = 2 },
+	{ STRUCT_FIELD(lrh, destination_lid),
+	  .offset_words = 0,
+	  .offset_bits  = 16,
+	  .size_bits    = 16 },
+	{ RESERVED,
+	  .offset_words = 1,
+	  .offset_bits  = 0,
+	  .size_bits    = 5 },
+	{ STRUCT_FIELD(lrh, packet_length),
+	  .offset_words = 1,
+	  .offset_bits  = 5,
+	  .size_bits    = 11 },
+	{ STRUCT_FIELD(lrh, source_lid),
+	  .offset_words = 1,
+	  .offset_bits  = 16,
+	  .size_bits    = 16 }
+};
+
+static const struct ib_field eth_table[]  = {
+	{ STRUCT_FIELD(eth, dmac_h),
+	  .offset_words = 0,
+	  .offset_bits  = 0,
+	  .size_bits    = 32 },
+	{ STRUCT_FIELD(eth, dmac_l),
+	  .offset_words = 1,
+	  .offset_bits  = 0,
+	  .size_bits    = 16 },
+	{ STRUCT_FIELD(eth, smac_h),
+	  .offset_words = 1,
+	  .offset_bits  = 16,
+	  .size_bits    = 16 },
+	{ STRUCT_FIELD(eth, smac_l),
+	  .offset_words = 2,
+	  .offset_bits  = 0,
+	  .size_bits    = 32 },
+	{ STRUCT_FIELD(eth, type),
+	  .offset_words = 3,
+	  .offset_bits  = 0,
+	  .size_bits    = 16 }
+};
+
+static const struct ib_field vlan_table[]  = {
+	{ STRUCT_FIELD(vlan, tag),
+	  .offset_words = 0,
+	  .offset_bits  = 0,
+	  .size_bits    = 16 },
+	{ STRUCT_FIELD(vlan, type),
+	  .offset_words = 0,
+	  .offset_bits  = 16,
+	  .size_bits    = 16 }
+};
+
+static const struct ib_field grh_table[]  = {
+	{ STRUCT_FIELD(grh, ip_version),
+	  .offset_words = 0,
+	  .offset_bits  = 0,
+	  .size_bits    = 4 },
+	{ STRUCT_FIELD(grh, traffic_class),
+	  .offset_words = 0,
+	  .offset_bits  = 4,
+	  .size_bits    = 8 },
+	{ STRUCT_FIELD(grh, flow_label),
+	  .offset_words = 0,
+	  .offset_bits  = 12,
+	  .size_bits    = 20 },
+	{ STRUCT_FIELD(grh, payload_length),
+	  .offset_words = 1,
+	  .offset_bits  = 0,
+	  .size_bits    = 16 },
+	{ STRUCT_FIELD(grh, next_header),
+	  .offset_words = 1,
+	  .offset_bits  = 16,
+	  .size_bits    = 8 },
+	{ STRUCT_FIELD(grh, hop_limit),
+	  .offset_words = 1,
+	  .offset_bits  = 24,
+	  .size_bits    = 8 },
+	{ STRUCT_FIELD(grh, source_gid),
+	  .offset_words = 2,
+	  .offset_bits  = 0,
+	  .size_bits    = 128 },
+	{ STRUCT_FIELD(grh, destination_gid),
+	  .offset_words = 6,
+	  .offset_bits  = 0,
+	  .size_bits    = 128 }
+};
+
+static const struct ib_field bth_table[]  = {
+	{ STRUCT_FIELD(bth, opcode),
+	  .offset_words = 0,
+	  .offset_bits  = 0,
+	  .size_bits    = 8 },
+	{ STRUCT_FIELD(bth, solicited_event),
+	  .offset_words = 0,
+	  .offset_bits  = 8,
+	  .size_bits    = 1 },
+	{ STRUCT_FIELD(bth, mig_req),
+	  .offset_words = 0,
+	  .offset_bits  = 9,
+	  .size_bits    = 1 },
+	{ STRUCT_FIELD(bth, pad_count),
+	  .offset_words = 0,
+	  .offset_bits  = 10,
+	  .size_bits    = 2 },
+	{ STRUCT_FIELD(bth, transport_header_version),
+	  .offset_words = 0,
+	  .offset_bits  = 12,
+	  .size_bits    = 4 },
+	{ STRUCT_FIELD(bth, pkey),
+	  .offset_words = 0,
+	  .offset_bits  = 16,
+	  .size_bits    = 16 },
+	{ RESERVED,
+	  .offset_words = 1,
+	  .offset_bits  = 0,
+	  .size_bits    = 8 },
+	{ STRUCT_FIELD(bth, destination_qpn),
+	  .offset_words = 1,
+	  .offset_bits  = 8,
+	  .size_bits    = 24 },
+	{ STRUCT_FIELD(bth, ack_req),
+	  .offset_words = 2,
+	  .offset_bits  = 0,
+	  .size_bits    = 1 },
+	{ RESERVED,
+	  .offset_words = 2,
+	  .offset_bits  = 1,
+	  .size_bits    = 7 },
+	{ STRUCT_FIELD(bth, psn),
+	  .offset_words = 2,
+	  .offset_bits  = 8,
+	  .size_bits    = 24 }
+};
+
+static const struct ib_field deth_table[] = {
+	{ STRUCT_FIELD(deth, qkey),
+	  .offset_words = 0,
+	  .offset_bits  = 0,
+	  .size_bits    = 32 },
+	{ RESERVED,
+	  .offset_words = 1,
+	  .offset_bits  = 0,
+	  .size_bits    = 8 },
+	{ STRUCT_FIELD(deth, source_qpn),
+	  .offset_words = 1,
+	  .offset_bits  = 8,
+	  .size_bits    = 24 }
+};
+
+/**
+ * ib_ud_header_init - Initialize UD header structure
+ * @payload_bytes:Length of packet payload
+ * @lrh_present: specify if LRH is present
+ * @eth_present: specify if Eth header is present
+ * @vlan_present: packet is tagged vlan
+ * @grh_present:GRH flag (if non-zero, GRH will be included)
+ * @immediate_present: specify if immediate data is present
+ * @header:Structure to initialize
+ */
+void ib_ud_header_init(int     		    payload_bytes,
+		       int		    lrh_present,
+		       int		    eth_present,
+		       int		    vlan_present,
+		       int    		    grh_present,
+		       int		    immediate_present,
+		       struct ib_ud_header *header)
+{
+	memset(header, 0, sizeof *header);
+
+	if (lrh_present) {
+		u16 packet_length;
+
+		header->lrh.link_version     = 0;
+		header->lrh.link_next_header =
+			grh_present ? IB_LNH_IBA_GLOBAL : IB_LNH_IBA_LOCAL;
+		packet_length = (IB_LRH_BYTES	+
+				 IB_BTH_BYTES	+
+				 IB_DETH_BYTES	+
+				 (grh_present ? IB_GRH_BYTES : 0) +
+				 payload_bytes	+
+				 4		+ /* ICRC     */
+				 3) / 4;	  /* round up */
+		header->lrh.packet_length = cpu_to_be16(packet_length);
+	}
+
+	if (vlan_present)
+		header->eth.type = cpu_to_be16(ETH_P_8021Q);
+
+	if (grh_present) {
+		header->grh.ip_version      = 6;
+		header->grh.payload_length  =
+			cpu_to_be16((IB_BTH_BYTES     +
+				     IB_DETH_BYTES    +
+				     payload_bytes    +
+				     4                + /* ICRC     */
+				     3) & ~3);          /* round up */
+		header->grh.next_header     = 0x1b;
+	}
+
+	if (immediate_present)
+		header->bth.opcode           = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE;
+	else
+		header->bth.opcode           = IB_OPCODE_UD_SEND_ONLY;
+	header->bth.pad_count                = (4 - payload_bytes) & 3;
+	header->bth.transport_header_version = 0;
+
+	header->lrh_present = lrh_present;
+	header->eth_present = eth_present;
+	header->vlan_present = vlan_present;
+	header->grh_present = grh_present;
+	header->immediate_present = immediate_present;
+}
+EXPORT_SYMBOL(ib_ud_header_init);
+
+/**
+ * ib_ud_header_pack - Pack UD header struct into wire format
+ * @header:UD header struct
+ * @buf:Buffer to pack into
+ *
+ * ib_ud_header_pack() packs the UD header structure @header into wire
+ * format in the buffer @buf.
+ */
+int ib_ud_header_pack(struct ib_ud_header *header,
+		      void                *buf)
+{
+	int len = 0;
+
+	if (header->lrh_present) {
+		ib_pack(lrh_table, ARRAY_SIZE(lrh_table),
+			&header->lrh, buf + len);
+		len += IB_LRH_BYTES;
+	}
+	if (header->eth_present) {
+		ib_pack(eth_table, ARRAY_SIZE(eth_table),
+			&header->eth, buf + len);
+		len += IB_ETH_BYTES;
+	}
+	if (header->vlan_present) {
+		ib_pack(vlan_table, ARRAY_SIZE(vlan_table),
+			&header->vlan, buf + len);
+		len += IB_VLAN_BYTES;
+	}
+	if (header->grh_present) {
+		ib_pack(grh_table, ARRAY_SIZE(grh_table),
+			&header->grh, buf + len);
+		len += IB_GRH_BYTES;
+	}
+
+	ib_pack(bth_table, ARRAY_SIZE(bth_table),
+		&header->bth, buf + len);
+	len += IB_BTH_BYTES;
+
+	ib_pack(deth_table, ARRAY_SIZE(deth_table),
+		&header->deth, buf + len);
+	len += IB_DETH_BYTES;
+
+	if (header->immediate_present) {
+		memcpy(buf + len, &header->immediate_data, sizeof header->immediate_data);
+		len += sizeof header->immediate_data;
+	}
+
+	return len;
+}
+EXPORT_SYMBOL(ib_ud_header_pack);
+
+/**
+ * ib_ud_header_unpack - Unpack UD header struct from wire format
+ * @header:UD header struct
+ * @buf:Buffer to pack into
+ *
+ * ib_ud_header_pack() unpacks the UD header structure @header from wire
+ * format in the buffer @buf.
+ */
+int ib_ud_header_unpack(void                *buf,
+			struct ib_ud_header *header)
+{
+	ib_unpack(lrh_table, ARRAY_SIZE(lrh_table),
+		  buf, &header->lrh);
+	buf += IB_LRH_BYTES;
+
+	if (header->lrh.link_version != 0) {
+		printk(KERN_WARNING "Invalid LRH.link_version %d\n",
+		       header->lrh.link_version);
+		return -EINVAL;
+	}
+
+	switch (header->lrh.link_next_header) {
+	case IB_LNH_IBA_LOCAL:
+		header->grh_present = 0;
+		break;
+
+	case IB_LNH_IBA_GLOBAL:
+		header->grh_present = 1;
+		ib_unpack(grh_table, ARRAY_SIZE(grh_table),
+			  buf, &header->grh);
+		buf += IB_GRH_BYTES;
+
+		if (header->grh.ip_version != 6) {
+			printk(KERN_WARNING "Invalid GRH.ip_version %d\n",
+			       header->grh.ip_version);
+			return -EINVAL;
+		}
+		if (header->grh.next_header != 0x1b) {
+			printk(KERN_WARNING "Invalid GRH.next_header 0x%02x\n",
+			       header->grh.next_header);
+			return -EINVAL;
+		}
+		break;
+
+	default:
+		printk(KERN_WARNING "Invalid LRH.link_next_header %d\n",
+		       header->lrh.link_next_header);
+		return -EINVAL;
+	}
+
+	ib_unpack(bth_table, ARRAY_SIZE(bth_table),
+		  buf, &header->bth);
+	buf += IB_BTH_BYTES;
+
+	switch (header->bth.opcode) {
+	case IB_OPCODE_UD_SEND_ONLY:
+		header->immediate_present = 0;
+		break;
+	case IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE:
+		header->immediate_present = 1;
+		break;
+	default:
+		printk(KERN_WARNING "Invalid BTH.opcode 0x%02x\n",
+		       header->bth.opcode);
+		return -EINVAL;
+	}
+
+	if (header->bth.transport_header_version != 0) {
+		printk(KERN_WARNING "Invalid BTH.transport_header_version %d\n",
+		       header->bth.transport_header_version);
+		return -EINVAL;
+	}
+
+	ib_unpack(deth_table, ARRAY_SIZE(deth_table),
+		  buf, &header->deth);
+	buf += IB_DETH_BYTES;
+
+	if (header->immediate_present)
+		memcpy(&header->immediate_data, buf, sizeof header->immediate_data);
+
+	return 0;
+}
+EXPORT_SYMBOL(ib_ud_header_unpack);
diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
new file mode 100644
index 0000000..df0c4f6
--- /dev/null
+++ b/drivers/infiniband/core/umem.c
@@ -0,0 +1,294 @@
+/*
+ * Copyright (c) 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Cisco Systems.  All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mm.h>
+#include <linux/dma-mapping.h>
+#include <linux/sched.h>
+#include <linux/export.h>
+#include <linux/hugetlb.h>
+#include <linux/dma-attrs.h>
+#include <linux/slab.h>
+
+#include "uverbs.h"
+
+
+static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty)
+{
+	struct scatterlist *sg;
+	struct page *page;
+	int i;
+
+	if (umem->nmap > 0)
+		ib_dma_unmap_sg(dev, umem->sg_head.sgl,
+				umem->nmap,
+				DMA_BIDIRECTIONAL);
+
+	for_each_sg(umem->sg_head.sgl, sg, umem->npages, i) {
+
+		page = sg_page(sg);
+		if (umem->writable && dirty)
+			set_page_dirty_lock(page);
+		put_page(page);
+	}
+
+	sg_free_table(&umem->sg_head);
+	return;
+
+}
+
+/**
+ * ib_umem_get - Pin and DMA map userspace memory.
+ * @context: userspace context to pin memory for
+ * @addr: userspace virtual address to start at
+ * @size: length of region to pin
+ * @access: IB_ACCESS_xxx flags for memory being pinned
+ * @dmasync: flush in-flight DMA when the memory region is written
+ */
+struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
+			    size_t size, int access, int dmasync)
+{
+	struct ib_umem *umem;
+	struct page **page_list;
+	struct vm_area_struct **vma_list;
+	unsigned long locked;
+	unsigned long lock_limit;
+	unsigned long cur_base;
+	unsigned long npages;
+	int ret;
+	int i;
+	DEFINE_DMA_ATTRS(attrs);
+	struct scatterlist *sg, *sg_list_start;
+	int need_release = 0;
+
+	if (dmasync)
+		dma_set_attr(DMA_ATTR_WRITE_BARRIER, &attrs);
+
+	if (!can_do_mlock())
+		return ERR_PTR(-EPERM);
+
+	umem = kzalloc(sizeof *umem, GFP_KERNEL);
+	if (!umem)
+		return ERR_PTR(-ENOMEM);
+
+	umem->context   = context;
+	umem->length    = size;
+	umem->offset    = addr & ~PAGE_MASK;
+	umem->page_size = PAGE_SIZE;
+	umem->pid       = get_task_pid(current, PIDTYPE_PID);
+	/*
+	 * We ask for writable memory if any access flags other than
+	 * "remote read" are set.  "Local write" and "remote write"
+	 * obviously require write access.  "Remote atomic" can do
+	 * things like fetch and add, which will modify memory, and
+	 * "MW bind" can change permissions by binding a window.
+	 */
+	umem->writable  = !!(access & ~IB_ACCESS_REMOTE_READ);
+
+	/* We assume the memory is from hugetlb until proved otherwise */
+	umem->hugetlb   = 1;
+
+	page_list = (struct page **) __get_free_page(GFP_KERNEL);
+	if (!page_list) {
+		kfree(umem);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	/*
+	 * if we can't alloc the vma_list, it's not so bad;
+	 * just assume the memory is not hugetlb memory
+	 */
+	vma_list = (struct vm_area_struct **) __get_free_page(GFP_KERNEL);
+	if (!vma_list)
+		umem->hugetlb = 0;
+
+	npages = PAGE_ALIGN(size + umem->offset) >> PAGE_SHIFT;
+
+	down_write(&current->mm->mmap_sem);
+
+	locked     = npages + current->mm->pinned_vm;
+	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
+	if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	cur_base = addr & PAGE_MASK;
+
+	if (npages == 0) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = sg_alloc_table(&umem->sg_head, npages, GFP_KERNEL);
+	if (ret)
+		goto out;
+
+	need_release = 1;
+	sg_list_start = umem->sg_head.sgl;
+
+	while (npages) {
+		ret = get_user_pages(current, current->mm, cur_base,
+				     min_t(unsigned long, npages,
+					   PAGE_SIZE / sizeof (struct page *)),
+				     1, !umem->writable, page_list, vma_list);
+
+		if (ret < 0)
+			goto out;
+
+		umem->npages += ret;
+		cur_base += ret * PAGE_SIZE;
+		npages   -= ret;
+
+		for_each_sg(sg_list_start, sg, ret, i) {
+			if (vma_list && !is_vm_hugetlb_page(vma_list[i]))
+				umem->hugetlb = 0;
+
+			sg_set_page(sg, page_list[i], PAGE_SIZE, 0);
+		}
+
+		/* preparing for next loop */
+		sg_list_start = sg;
+	}
+
+	umem->nmap = ib_dma_map_sg_attrs(context->device,
+				  umem->sg_head.sgl,
+				  umem->npages,
+				  DMA_BIDIRECTIONAL,
+				  &attrs);
+
+	if (umem->nmap <= 0) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = 0;
+
+out:
+	if (ret < 0) {
+		if (need_release)
+			__ib_umem_release(context->device, umem, 0);
+		put_pid(umem->pid);
+		kfree(umem);
+	} else
+		current->mm->pinned_vm = locked;
+
+	up_write(&current->mm->mmap_sem);
+	if (vma_list)
+		free_page((unsigned long) vma_list);
+	free_page((unsigned long) page_list);
+
+	return ret < 0 ? ERR_PTR(ret) : umem;
+}
+EXPORT_SYMBOL(ib_umem_get);
+
+static void ib_umem_account(struct work_struct *work)
+{
+	struct ib_umem *umem = container_of(work, struct ib_umem, work);
+
+	down_write(&umem->mm->mmap_sem);
+	umem->mm->pinned_vm -= umem->diff;
+	up_write(&umem->mm->mmap_sem);
+	mmput(umem->mm);
+	kfree(umem);
+}
+
+/**
+ * ib_umem_release - release memory pinned with ib_umem_get
+ * @umem: umem struct to release
+ */
+void ib_umem_release(struct ib_umem *umem)
+{
+	struct ib_ucontext *context = umem->context;
+	struct mm_struct *mm;
+	struct task_struct *task;
+	unsigned long diff;
+
+	__ib_umem_release(umem->context->device, umem, 1);
+
+	task = get_pid_task(umem->pid, PIDTYPE_PID);
+	put_pid(umem->pid);
+	if (!task)
+		goto out;
+	mm = get_task_mm(task);
+	put_task_struct(task);
+	if (!mm)
+		goto out;
+
+	diff = PAGE_ALIGN(umem->length + umem->offset) >> PAGE_SHIFT;
+
+	/*
+	 * We may be called with the mm's mmap_sem already held.  This
+	 * can happen when a userspace munmap() is the call that drops
+	 * the last reference to our file and calls our release
+	 * method.  If there are memory regions to destroy, we'll end
+	 * up here and not be able to take the mmap_sem.  In that case
+	 * we defer the vm_locked accounting to the system workqueue.
+	 */
+	if (context->closing) {
+		if (!down_write_trylock(&mm->mmap_sem)) {
+			INIT_WORK(&umem->work, ib_umem_account);
+			umem->mm   = mm;
+			umem->diff = diff;
+
+			queue_work(ib_wq, &umem->work);
+			return;
+		}
+	} else
+		down_write(&mm->mmap_sem);
+
+	mm->pinned_vm -= diff;
+	up_write(&mm->mmap_sem);
+	mmput(mm);
+out:
+	kfree(umem);
+}
+EXPORT_SYMBOL(ib_umem_release);
+
+int ib_umem_page_count(struct ib_umem *umem)
+{
+	int shift;
+	int i;
+	int n;
+	struct scatterlist *sg;
+
+	shift = ilog2(umem->page_size);
+
+	n = 0;
+	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i)
+		n += sg_dma_len(sg) >> shift;
+
+	return n;
+}
+EXPORT_SYMBOL(ib_umem_page_count);
diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c
new file mode 100644
index 0000000..928cdd2
--- /dev/null
+++ b/drivers/infiniband/core/user_mad.c
@@ -0,0 +1,1390 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2008 Cisco. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#define pr_fmt(fmt) "user_mad: " fmt
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/fs.h>
+#include <linux/cdev.h>
+#include <linux/dma-mapping.h>
+#include <linux/poll.h>
+#include <linux/mutex.h>
+#include <linux/kref.h>
+#include <linux/compat.h>
+#include <linux/sched.h>
+#include <linux/semaphore.h>
+#include <linux/slab.h>
+
+#include <asm/uaccess.h>
+
+#include <rdma/ib_mad.h>
+#include <rdma/ib_user_mad.h>
+
+MODULE_AUTHOR("Roland Dreier");
+MODULE_DESCRIPTION("InfiniBand userspace MAD packet access");
+MODULE_LICENSE("Dual BSD/GPL");
+
+enum {
+	IB_UMAD_MAX_PORTS  = 64,
+	IB_UMAD_MAX_AGENTS = 32,
+
+	IB_UMAD_MAJOR      = 231,
+	IB_UMAD_MINOR_BASE = 0
+};
+
+/*
+ * Our lifetime rules for these structs are the following:
+ * device special file is opened, we take a reference on the
+ * ib_umad_port's struct ib_umad_device. We drop these
+ * references in the corresponding close().
+ *
+ * In addition to references coming from open character devices, there
+ * is one more reference to each ib_umad_device representing the
+ * module's reference taken when allocating the ib_umad_device in
+ * ib_umad_add_one().
+ *
+ * When destroying an ib_umad_device, we drop the module's reference.
+ */
+
+struct ib_umad_port {
+	struct cdev           cdev;
+	struct device	      *dev;
+
+	struct cdev           sm_cdev;
+	struct device	      *sm_dev;
+	struct semaphore       sm_sem;
+
+	struct mutex	       file_mutex;
+	struct list_head       file_list;
+
+	struct ib_device      *ib_dev;
+	struct ib_umad_device *umad_dev;
+	int                    dev_num;
+	u8                     port_num;
+};
+
+struct ib_umad_device {
+	int                  start_port, end_port;
+	struct kobject       kobj;
+	struct ib_umad_port  port[0];
+};
+
+struct ib_umad_file {
+	struct mutex		mutex;
+	struct ib_umad_port    *port;
+	struct list_head	recv_list;
+	struct list_head	send_list;
+	struct list_head	port_list;
+	spinlock_t		send_lock;
+	wait_queue_head_t	recv_wait;
+	struct ib_mad_agent    *agent[IB_UMAD_MAX_AGENTS];
+	int			agents_dead;
+	u8			use_pkey_index;
+	u8			already_used;
+};
+
+struct ib_umad_packet {
+	struct ib_mad_send_buf *msg;
+	struct ib_mad_recv_wc  *recv_wc;
+	struct list_head   list;
+	int		   length;
+	struct ib_user_mad mad;
+};
+
+static struct class *umad_class;
+
+static const dev_t base_dev = MKDEV(IB_UMAD_MAJOR, IB_UMAD_MINOR_BASE);
+
+static DEFINE_SPINLOCK(port_lock);
+static DECLARE_BITMAP(dev_map, IB_UMAD_MAX_PORTS);
+
+static void ib_umad_add_one(struct ib_device *device);
+static void ib_umad_remove_one(struct ib_device *device);
+
+static void ib_umad_release_dev(struct kobject *kobj)
+{
+	struct ib_umad_device *dev =
+		container_of(kobj, struct ib_umad_device, kobj);
+
+	kfree(dev);
+}
+
+static struct kobj_type ib_umad_dev_ktype = {
+	.release = ib_umad_release_dev,
+};
+
+static int hdr_size(struct ib_umad_file *file)
+{
+	return file->use_pkey_index ? sizeof (struct ib_user_mad_hdr) :
+		sizeof (struct ib_user_mad_hdr_old);
+}
+
+/* caller must hold file->mutex */
+static struct ib_mad_agent *__get_agent(struct ib_umad_file *file, int id)
+{
+	return file->agents_dead ? NULL : file->agent[id];
+}
+
+static int queue_packet(struct ib_umad_file *file,
+			struct ib_mad_agent *agent,
+			struct ib_umad_packet *packet)
+{
+	int ret = 1;
+
+	mutex_lock(&file->mutex);
+
+	for (packet->mad.hdr.id = 0;
+	     packet->mad.hdr.id < IB_UMAD_MAX_AGENTS;
+	     packet->mad.hdr.id++)
+		if (agent == __get_agent(file, packet->mad.hdr.id)) {
+			list_add_tail(&packet->list, &file->recv_list);
+			wake_up_interruptible(&file->recv_wait);
+			ret = 0;
+			break;
+		}
+
+	mutex_unlock(&file->mutex);
+
+	return ret;
+}
+
+static void dequeue_send(struct ib_umad_file *file,
+			 struct ib_umad_packet *packet)
+{
+	spin_lock_irq(&file->send_lock);
+	list_del(&packet->list);
+	spin_unlock_irq(&file->send_lock);
+}
+
+static void send_handler(struct ib_mad_agent *agent,
+			 struct ib_mad_send_wc *send_wc)
+{
+	struct ib_umad_file *file = agent->context;
+	struct ib_umad_packet *packet = send_wc->send_buf->context[0];
+
+	dequeue_send(file, packet);
+	ib_destroy_ah(packet->msg->ah);
+	ib_free_send_mad(packet->msg);
+
+	if (send_wc->status == IB_WC_RESP_TIMEOUT_ERR) {
+		packet->length = IB_MGMT_MAD_HDR;
+		packet->mad.hdr.status = ETIMEDOUT;
+		if (!queue_packet(file, agent, packet))
+			return;
+	}
+	kfree(packet);
+}
+
+static void recv_handler(struct ib_mad_agent *agent,
+			 struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct ib_umad_file *file = agent->context;
+	struct ib_umad_packet *packet;
+
+	if (mad_recv_wc->wc->status != IB_WC_SUCCESS)
+		goto err1;
+
+	packet = kzalloc(sizeof *packet, GFP_KERNEL);
+	if (!packet)
+		goto err1;
+
+	packet->length = mad_recv_wc->mad_len;
+	packet->recv_wc = mad_recv_wc;
+
+	packet->mad.hdr.status	   = 0;
+	packet->mad.hdr.length	   = hdr_size(file) + mad_recv_wc->mad_len;
+	packet->mad.hdr.qpn	   = cpu_to_be32(mad_recv_wc->wc->src_qp);
+	packet->mad.hdr.lid	   = cpu_to_be16(mad_recv_wc->wc->slid);
+	packet->mad.hdr.sl	   = mad_recv_wc->wc->sl;
+	packet->mad.hdr.path_bits  = mad_recv_wc->wc->dlid_path_bits;
+	packet->mad.hdr.pkey_index = mad_recv_wc->wc->pkey_index;
+	packet->mad.hdr.grh_present = !!(mad_recv_wc->wc->wc_flags & IB_WC_GRH);
+	if (packet->mad.hdr.grh_present) {
+		struct ib_ah_attr ah_attr;
+
+		ib_init_ah_from_wc(agent->device, agent->port_num,
+				   mad_recv_wc->wc, mad_recv_wc->recv_buf.grh,
+				   &ah_attr);
+
+		packet->mad.hdr.gid_index = ah_attr.grh.sgid_index;
+		packet->mad.hdr.hop_limit = ah_attr.grh.hop_limit;
+		packet->mad.hdr.traffic_class = ah_attr.grh.traffic_class;
+		memcpy(packet->mad.hdr.gid, &ah_attr.grh.dgid, 16);
+		packet->mad.hdr.flow_label = cpu_to_be32(ah_attr.grh.flow_label);
+	}
+
+	if (queue_packet(file, agent, packet))
+		goto err2;
+	return;
+
+err2:
+	kfree(packet);
+err1:
+	ib_free_recv_mad(mad_recv_wc);
+}
+
+static ssize_t copy_recv_mad(struct ib_umad_file *file, char __user *buf,
+			     struct ib_umad_packet *packet, size_t count)
+{
+	struct ib_mad_recv_buf *recv_buf;
+	int left, seg_payload, offset, max_seg_payload;
+
+	/* We need enough room to copy the first (or only) MAD segment. */
+	recv_buf = &packet->recv_wc->recv_buf;
+	if ((packet->length <= sizeof (*recv_buf->mad) &&
+	     count < hdr_size(file) + packet->length) ||
+	    (packet->length > sizeof (*recv_buf->mad) &&
+	     count < hdr_size(file) + sizeof (*recv_buf->mad)))
+		return -EINVAL;
+
+	if (copy_to_user(buf, &packet->mad, hdr_size(file)))
+		return -EFAULT;
+
+	buf += hdr_size(file);
+	seg_payload = min_t(int, packet->length, sizeof (*recv_buf->mad));
+	if (copy_to_user(buf, recv_buf->mad, seg_payload))
+		return -EFAULT;
+
+	if (seg_payload < packet->length) {
+		/*
+		 * Multipacket RMPP MAD message. Copy remainder of message.
+		 * Note that last segment may have a shorter payload.
+		 */
+		if (count < hdr_size(file) + packet->length) {
+			/*
+			 * The buffer is too small, return the first RMPP segment,
+			 * which includes the RMPP message length.
+			 */
+			return -ENOSPC;
+		}
+		offset = ib_get_mad_data_offset(recv_buf->mad->mad_hdr.mgmt_class);
+		max_seg_payload = sizeof (struct ib_mad) - offset;
+
+		for (left = packet->length - seg_payload, buf += seg_payload;
+		     left; left -= seg_payload, buf += seg_payload) {
+			recv_buf = container_of(recv_buf->list.next,
+						struct ib_mad_recv_buf, list);
+			seg_payload = min(left, max_seg_payload);
+			if (copy_to_user(buf, ((void *) recv_buf->mad) + offset,
+					 seg_payload))
+				return -EFAULT;
+		}
+	}
+	return hdr_size(file) + packet->length;
+}
+
+static ssize_t copy_send_mad(struct ib_umad_file *file, char __user *buf,
+			     struct ib_umad_packet *packet, size_t count)
+{
+	ssize_t size = hdr_size(file) + packet->length;
+
+	if (count < size)
+		return -EINVAL;
+
+	if (copy_to_user(buf, &packet->mad, hdr_size(file)))
+		return -EFAULT;
+
+	buf += hdr_size(file);
+
+	if (copy_to_user(buf, packet->mad.data, packet->length))
+		return -EFAULT;
+
+	return size;
+}
+
+static ssize_t ib_umad_read(struct file *filp, char __user *buf,
+			    size_t count, loff_t *pos)
+{
+	struct ib_umad_file *file = filp->private_data;
+	struct ib_umad_packet *packet;
+	ssize_t ret;
+
+	if (count < hdr_size(file))
+		return -EINVAL;
+
+	mutex_lock(&file->mutex);
+
+	while (list_empty(&file->recv_list)) {
+		mutex_unlock(&file->mutex);
+
+		if (filp->f_flags & O_NONBLOCK)
+			return -EAGAIN;
+
+		if (wait_event_interruptible(file->recv_wait,
+					     !list_empty(&file->recv_list)))
+			return -ERESTARTSYS;
+
+		mutex_lock(&file->mutex);
+	}
+
+	packet = list_entry(file->recv_list.next, struct ib_umad_packet, list);
+	list_del(&packet->list);
+
+	mutex_unlock(&file->mutex);
+
+	if (packet->recv_wc)
+		ret = copy_recv_mad(file, buf, packet, count);
+	else
+		ret = copy_send_mad(file, buf, packet, count);
+
+	if (ret < 0) {
+		/* Requeue packet */
+		mutex_lock(&file->mutex);
+		list_add(&packet->list, &file->recv_list);
+		mutex_unlock(&file->mutex);
+	} else {
+		if (packet->recv_wc)
+			ib_free_recv_mad(packet->recv_wc);
+		kfree(packet);
+	}
+	return ret;
+}
+
+static int copy_rmpp_mad(struct ib_mad_send_buf *msg, const char __user *buf)
+{
+	int left, seg;
+
+	/* Copy class specific header */
+	if ((msg->hdr_len > IB_MGMT_RMPP_HDR) &&
+	    copy_from_user(msg->mad + IB_MGMT_RMPP_HDR, buf + IB_MGMT_RMPP_HDR,
+			   msg->hdr_len - IB_MGMT_RMPP_HDR))
+		return -EFAULT;
+
+	/* All headers are in place.  Copy data segments. */
+	for (seg = 1, left = msg->data_len, buf += msg->hdr_len; left > 0;
+	     seg++, left -= msg->seg_size, buf += msg->seg_size) {
+		if (copy_from_user(ib_get_rmpp_segment(msg, seg), buf,
+				   min(left, msg->seg_size)))
+			return -EFAULT;
+	}
+	return 0;
+}
+
+static int same_destination(struct ib_user_mad_hdr *hdr1,
+			    struct ib_user_mad_hdr *hdr2)
+{
+	if (!hdr1->grh_present && !hdr2->grh_present)
+	   return (hdr1->lid == hdr2->lid);
+
+	if (hdr1->grh_present && hdr2->grh_present)
+	   return !memcmp(hdr1->gid, hdr2->gid, 16);
+
+	return 0;
+}
+
+static int is_duplicate(struct ib_umad_file *file,
+			struct ib_umad_packet *packet)
+{
+	struct ib_umad_packet *sent_packet;
+	struct ib_mad_hdr *sent_hdr, *hdr;
+
+	hdr = (struct ib_mad_hdr *) packet->mad.data;
+	list_for_each_entry(sent_packet, &file->send_list, list) {
+		sent_hdr = (struct ib_mad_hdr *) sent_packet->mad.data;
+
+		if ((hdr->tid != sent_hdr->tid) ||
+		    (hdr->mgmt_class != sent_hdr->mgmt_class))
+			continue;
+
+		/*
+		 * No need to be overly clever here.  If two new operations have
+		 * the same TID, reject the second as a duplicate.  This is more
+		 * restrictive than required by the spec.
+		 */
+		if (!ib_response_mad((struct ib_mad *) hdr)) {
+			if (!ib_response_mad((struct ib_mad *) sent_hdr))
+				return 1;
+			continue;
+		} else if (!ib_response_mad((struct ib_mad *) sent_hdr))
+			continue;
+
+		if (same_destination(&packet->mad.hdr, &sent_packet->mad.hdr))
+			return 1;
+	}
+
+	return 0;
+}
+
+static ssize_t ib_umad_write(struct file *filp, const char __user *buf,
+			     size_t count, loff_t *pos)
+{
+	struct ib_umad_file *file = filp->private_data;
+	struct ib_umad_packet *packet;
+	struct ib_mad_agent *agent;
+	struct ib_ah_attr ah_attr;
+	struct ib_ah *ah;
+	struct ib_rmpp_mad *rmpp_mad;
+	__be64 *tid;
+	int ret, data_len, hdr_len, copy_offset, rmpp_active;
+
+	if (count < hdr_size(file) + IB_MGMT_RMPP_HDR)
+		return -EINVAL;
+
+	packet = kzalloc(sizeof *packet + IB_MGMT_RMPP_HDR, GFP_KERNEL);
+	if (!packet)
+		return -ENOMEM;
+
+	if (copy_from_user(&packet->mad, buf, hdr_size(file))) {
+		ret = -EFAULT;
+		goto err;
+	}
+
+	if (packet->mad.hdr.id >= IB_UMAD_MAX_AGENTS) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	buf += hdr_size(file);
+
+	if (copy_from_user(packet->mad.data, buf, IB_MGMT_RMPP_HDR)) {
+		ret = -EFAULT;
+		goto err;
+	}
+
+	mutex_lock(&file->mutex);
+
+	agent = __get_agent(file, packet->mad.hdr.id);
+	if (!agent) {
+		ret = -EINVAL;
+		goto err_up;
+	}
+
+	memset(&ah_attr, 0, sizeof ah_attr);
+	ah_attr.dlid          = be16_to_cpu(packet->mad.hdr.lid);
+	ah_attr.sl            = packet->mad.hdr.sl;
+	ah_attr.src_path_bits = packet->mad.hdr.path_bits;
+	ah_attr.port_num      = file->port->port_num;
+	if (packet->mad.hdr.grh_present) {
+		ah_attr.ah_flags = IB_AH_GRH;
+		memcpy(ah_attr.grh.dgid.raw, packet->mad.hdr.gid, 16);
+		ah_attr.grh.sgid_index	   = packet->mad.hdr.gid_index;
+		ah_attr.grh.flow_label	   = be32_to_cpu(packet->mad.hdr.flow_label);
+		ah_attr.grh.hop_limit	   = packet->mad.hdr.hop_limit;
+		ah_attr.grh.traffic_class  = packet->mad.hdr.traffic_class;
+	}
+
+	ah = ib_create_ah(agent->qp->pd, &ah_attr);
+	if (IS_ERR(ah)) {
+		ret = PTR_ERR(ah);
+		goto err_up;
+	}
+
+	rmpp_mad = (struct ib_rmpp_mad *) packet->mad.data;
+	hdr_len = ib_get_mad_data_offset(rmpp_mad->mad_hdr.mgmt_class);
+
+	if (ib_is_mad_class_rmpp(rmpp_mad->mad_hdr.mgmt_class)
+	    && ib_mad_kernel_rmpp_agent(agent)) {
+		copy_offset = IB_MGMT_RMPP_HDR;
+		rmpp_active = ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) &
+						IB_MGMT_RMPP_FLAG_ACTIVE;
+	} else {
+		copy_offset = IB_MGMT_MAD_HDR;
+		rmpp_active = 0;
+	}
+
+	data_len = count - hdr_size(file) - hdr_len;
+	packet->msg = ib_create_send_mad(agent,
+					 be32_to_cpu(packet->mad.hdr.qpn),
+					 packet->mad.hdr.pkey_index, rmpp_active,
+					 hdr_len, data_len, GFP_KERNEL);
+	if (IS_ERR(packet->msg)) {
+		ret = PTR_ERR(packet->msg);
+		goto err_ah;
+	}
+
+	packet->msg->ah		= ah;
+	packet->msg->timeout_ms = packet->mad.hdr.timeout_ms;
+	packet->msg->retries	= packet->mad.hdr.retries;
+	packet->msg->context[0] = packet;
+
+	/* Copy MAD header.  Any RMPP header is already in place. */
+	memcpy(packet->msg->mad, packet->mad.data, IB_MGMT_MAD_HDR);
+
+	if (!rmpp_active) {
+		if (copy_from_user(packet->msg->mad + copy_offset,
+				   buf + copy_offset,
+				   hdr_len + data_len - copy_offset)) {
+			ret = -EFAULT;
+			goto err_msg;
+		}
+	} else {
+		ret = copy_rmpp_mad(packet->msg, buf);
+		if (ret)
+			goto err_msg;
+	}
+
+	/*
+	 * Set the high-order part of the transaction ID to make MADs from
+	 * different agents unique, and allow routing responses back to the
+	 * original requestor.
+	 */
+	if (!ib_response_mad(packet->msg->mad)) {
+		tid = &((struct ib_mad_hdr *) packet->msg->mad)->tid;
+		*tid = cpu_to_be64(((u64) agent->hi_tid) << 32 |
+				   (be64_to_cpup(tid) & 0xffffffff));
+		rmpp_mad->mad_hdr.tid = *tid;
+	}
+
+	if (!ib_mad_kernel_rmpp_agent(agent)
+	   && ib_is_mad_class_rmpp(rmpp_mad->mad_hdr.mgmt_class)
+	   && (ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & IB_MGMT_RMPP_FLAG_ACTIVE)) {
+		spin_lock_irq(&file->send_lock);
+		list_add_tail(&packet->list, &file->send_list);
+		spin_unlock_irq(&file->send_lock);
+	} else {
+		spin_lock_irq(&file->send_lock);
+		ret = is_duplicate(file, packet);
+		if (!ret)
+			list_add_tail(&packet->list, &file->send_list);
+		spin_unlock_irq(&file->send_lock);
+		if (ret) {
+			ret = -EINVAL;
+			goto err_msg;
+		}
+	}
+
+	ret = ib_post_send_mad(packet->msg, NULL);
+	if (ret)
+		goto err_send;
+
+	mutex_unlock(&file->mutex);
+	return count;
+
+err_send:
+	dequeue_send(file, packet);
+err_msg:
+	ib_free_send_mad(packet->msg);
+err_ah:
+	ib_destroy_ah(ah);
+err_up:
+	mutex_unlock(&file->mutex);
+err:
+	kfree(packet);
+	return ret;
+}
+
+static unsigned int ib_umad_poll(struct file *filp, struct poll_table_struct *wait)
+{
+	struct ib_umad_file *file = filp->private_data;
+
+	/* we will always be able to post a MAD send */
+	unsigned int mask = POLLOUT | POLLWRNORM;
+
+	poll_wait(filp, &file->recv_wait, wait);
+
+	if (!list_empty(&file->recv_list))
+		mask |= POLLIN | POLLRDNORM;
+
+	return mask;
+}
+
+static int ib_umad_reg_agent(struct ib_umad_file *file, void __user *arg,
+			     int compat_method_mask)
+{
+	struct ib_user_mad_reg_req ureq;
+	struct ib_mad_reg_req req;
+	struct ib_mad_agent *agent = NULL;
+	int agent_id;
+	int ret;
+
+	mutex_lock(&file->port->file_mutex);
+	mutex_lock(&file->mutex);
+
+	if (!file->port->ib_dev) {
+		dev_notice(file->port->dev,
+			   "ib_umad_reg_agent: invalid device\n");
+		ret = -EPIPE;
+		goto out;
+	}
+
+	if (copy_from_user(&ureq, arg, sizeof ureq)) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	if (ureq.qpn != 0 && ureq.qpn != 1) {
+		dev_notice(file->port->dev,
+			   "ib_umad_reg_agent: invalid QPN %d specified\n",
+			   ureq.qpn);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	for (agent_id = 0; agent_id < IB_UMAD_MAX_AGENTS; ++agent_id)
+		if (!__get_agent(file, agent_id))
+			goto found;
+
+	dev_notice(file->port->dev,
+		   "ib_umad_reg_agent: Max Agents (%u) reached\n",
+		   IB_UMAD_MAX_AGENTS);
+	ret = -ENOMEM;
+	goto out;
+
+found:
+	if (ureq.mgmt_class) {
+		memset(&req, 0, sizeof(req));
+		req.mgmt_class         = ureq.mgmt_class;
+		req.mgmt_class_version = ureq.mgmt_class_version;
+		memcpy(req.oui, ureq.oui, sizeof req.oui);
+
+		if (compat_method_mask) {
+			u32 *umm = (u32 *) ureq.method_mask;
+			int i;
+
+			for (i = 0; i < BITS_TO_LONGS(IB_MGMT_MAX_METHODS); ++i)
+				req.method_mask[i] =
+					umm[i * 2] | ((u64) umm[i * 2 + 1] << 32);
+		} else
+			memcpy(req.method_mask, ureq.method_mask,
+			       sizeof req.method_mask);
+	}
+
+	agent = ib_register_mad_agent(file->port->ib_dev, file->port->port_num,
+				      ureq.qpn ? IB_QPT_GSI : IB_QPT_SMI,
+				      ureq.mgmt_class ? &req : NULL,
+				      ureq.rmpp_version,
+				      send_handler, recv_handler, file, 0);
+	if (IS_ERR(agent)) {
+		ret = PTR_ERR(agent);
+		agent = NULL;
+		goto out;
+	}
+
+	if (put_user(agent_id,
+		     (u32 __user *) (arg + offsetof(struct ib_user_mad_reg_req, id)))) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	if (!file->already_used) {
+		file->already_used = 1;
+		if (!file->use_pkey_index) {
+			dev_warn(file->port->dev,
+				"process %s did not enable P_Key index support.\n",
+				current->comm);
+			dev_warn(file->port->dev,
+				"   Documentation/infiniband/user_mad.txt has info on the new ABI.\n");
+		}
+	}
+
+	file->agent[agent_id] = agent;
+	ret = 0;
+
+out:
+	mutex_unlock(&file->mutex);
+
+	if (ret && agent)
+		ib_unregister_mad_agent(agent);
+
+	mutex_unlock(&file->port->file_mutex);
+
+	return ret;
+}
+
+static int ib_umad_reg_agent2(struct ib_umad_file *file, void __user *arg)
+{
+	struct ib_user_mad_reg_req2 ureq;
+	struct ib_mad_reg_req req;
+	struct ib_mad_agent *agent = NULL;
+	int agent_id;
+	int ret;
+
+	mutex_lock(&file->port->file_mutex);
+	mutex_lock(&file->mutex);
+
+	if (!file->port->ib_dev) {
+		dev_notice(file->port->dev,
+			   "ib_umad_reg_agent2: invalid device\n");
+		ret = -EPIPE;
+		goto out;
+	}
+
+	if (copy_from_user(&ureq, arg, sizeof(ureq))) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	if (ureq.qpn != 0 && ureq.qpn != 1) {
+		dev_notice(file->port->dev,
+			   "ib_umad_reg_agent2: invalid QPN %d specified\n",
+			   ureq.qpn);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (ureq.flags & ~IB_USER_MAD_REG_FLAGS_CAP) {
+		dev_notice(file->port->dev,
+			   "ib_umad_reg_agent2 failed: invalid registration flags specified 0x%x; supported 0x%x\n",
+			   ureq.flags, IB_USER_MAD_REG_FLAGS_CAP);
+		ret = -EINVAL;
+
+		if (put_user((u32)IB_USER_MAD_REG_FLAGS_CAP,
+				(u32 __user *) (arg + offsetof(struct
+				ib_user_mad_reg_req2, flags))))
+			ret = -EFAULT;
+
+		goto out;
+	}
+
+	for (agent_id = 0; agent_id < IB_UMAD_MAX_AGENTS; ++agent_id)
+		if (!__get_agent(file, agent_id))
+			goto found;
+
+	dev_notice(file->port->dev,
+		   "ib_umad_reg_agent2: Max Agents (%u) reached\n",
+		   IB_UMAD_MAX_AGENTS);
+	ret = -ENOMEM;
+	goto out;
+
+found:
+	if (ureq.mgmt_class) {
+		memset(&req, 0, sizeof(req));
+		req.mgmt_class         = ureq.mgmt_class;
+		req.mgmt_class_version = ureq.mgmt_class_version;
+		if (ureq.oui & 0xff000000) {
+			dev_notice(file->port->dev,
+				   "ib_umad_reg_agent2 failed: oui invalid 0x%08x\n",
+				   ureq.oui);
+			ret = -EINVAL;
+			goto out;
+		}
+		req.oui[2] =  ureq.oui & 0x0000ff;
+		req.oui[1] = (ureq.oui & 0x00ff00) >> 8;
+		req.oui[0] = (ureq.oui & 0xff0000) >> 16;
+		memcpy(req.method_mask, ureq.method_mask,
+			sizeof(req.method_mask));
+	}
+
+	agent = ib_register_mad_agent(file->port->ib_dev, file->port->port_num,
+				      ureq.qpn ? IB_QPT_GSI : IB_QPT_SMI,
+				      ureq.mgmt_class ? &req : NULL,
+				      ureq.rmpp_version,
+				      send_handler, recv_handler, file,
+				      ureq.flags);
+	if (IS_ERR(agent)) {
+		ret = PTR_ERR(agent);
+		agent = NULL;
+		goto out;
+	}
+
+	if (put_user(agent_id,
+		     (u32 __user *)(arg +
+				offsetof(struct ib_user_mad_reg_req2, id)))) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	if (!file->already_used) {
+		file->already_used = 1;
+		file->use_pkey_index = 1;
+	}
+
+	file->agent[agent_id] = agent;
+	ret = 0;
+
+out:
+	mutex_unlock(&file->mutex);
+
+	if (ret && agent)
+		ib_unregister_mad_agent(agent);
+
+	mutex_unlock(&file->port->file_mutex);
+
+	return ret;
+}
+
+
+static int ib_umad_unreg_agent(struct ib_umad_file *file, u32 __user *arg)
+{
+	struct ib_mad_agent *agent = NULL;
+	u32 id;
+	int ret = 0;
+
+	if (get_user(id, arg))
+		return -EFAULT;
+
+	mutex_lock(&file->port->file_mutex);
+	mutex_lock(&file->mutex);
+
+	if (id >= IB_UMAD_MAX_AGENTS || !__get_agent(file, id)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	agent = file->agent[id];
+	file->agent[id] = NULL;
+
+out:
+	mutex_unlock(&file->mutex);
+
+	if (agent)
+		ib_unregister_mad_agent(agent);
+
+	mutex_unlock(&file->port->file_mutex);
+
+	return ret;
+}
+
+static long ib_umad_enable_pkey(struct ib_umad_file *file)
+{
+	int ret = 0;
+
+	mutex_lock(&file->mutex);
+	if (file->already_used)
+		ret = -EINVAL;
+	else
+		file->use_pkey_index = 1;
+	mutex_unlock(&file->mutex);
+
+	return ret;
+}
+
+static long ib_umad_ioctl(struct file *filp, unsigned int cmd,
+			  unsigned long arg)
+{
+	switch (cmd) {
+	case IB_USER_MAD_REGISTER_AGENT:
+		return ib_umad_reg_agent(filp->private_data, (void __user *) arg, 0);
+	case IB_USER_MAD_UNREGISTER_AGENT:
+		return ib_umad_unreg_agent(filp->private_data, (__u32 __user *) arg);
+	case IB_USER_MAD_ENABLE_PKEY:
+		return ib_umad_enable_pkey(filp->private_data);
+	case IB_USER_MAD_REGISTER_AGENT2:
+		return ib_umad_reg_agent2(filp->private_data, (void __user *) arg);
+	default:
+		return -ENOIOCTLCMD;
+	}
+}
+
+#ifdef CONFIG_COMPAT
+static long ib_umad_compat_ioctl(struct file *filp, unsigned int cmd,
+				 unsigned long arg)
+{
+	switch (cmd) {
+	case IB_USER_MAD_REGISTER_AGENT:
+		return ib_umad_reg_agent(filp->private_data, compat_ptr(arg), 1);
+	case IB_USER_MAD_UNREGISTER_AGENT:
+		return ib_umad_unreg_agent(filp->private_data, compat_ptr(arg));
+	case IB_USER_MAD_ENABLE_PKEY:
+		return ib_umad_enable_pkey(filp->private_data);
+	case IB_USER_MAD_REGISTER_AGENT2:
+		return ib_umad_reg_agent2(filp->private_data, compat_ptr(arg));
+	default:
+		return -ENOIOCTLCMD;
+	}
+}
+#endif
+
+/*
+ * ib_umad_open() does not need the BKL:
+ *
+ *  - the ib_umad_port structures are properly reference counted, and
+ *    everything else is purely local to the file being created, so
+ *    races against other open calls are not a problem;
+ *  - the ioctl method does not affect any global state outside of the
+ *    file structure being operated on;
+ */
+static int ib_umad_open(struct inode *inode, struct file *filp)
+{
+	struct ib_umad_port *port;
+	struct ib_umad_file *file;
+	int ret = -ENXIO;
+
+	port = container_of(inode->i_cdev, struct ib_umad_port, cdev);
+
+	mutex_lock(&port->file_mutex);
+
+	if (!port->ib_dev)
+		goto out;
+
+	ret = -ENOMEM;
+	file = kzalloc(sizeof *file, GFP_KERNEL);
+	if (!file)
+		goto out;
+
+	mutex_init(&file->mutex);
+	spin_lock_init(&file->send_lock);
+	INIT_LIST_HEAD(&file->recv_list);
+	INIT_LIST_HEAD(&file->send_list);
+	init_waitqueue_head(&file->recv_wait);
+
+	file->port = port;
+	filp->private_data = file;
+
+	list_add_tail(&file->port_list, &port->file_list);
+
+	ret = nonseekable_open(inode, filp);
+	if (ret) {
+		list_del(&file->port_list);
+		kfree(file);
+		goto out;
+	}
+
+	kobject_get(&port->umad_dev->kobj);
+
+out:
+	mutex_unlock(&port->file_mutex);
+	return ret;
+}
+
+static int ib_umad_close(struct inode *inode, struct file *filp)
+{
+	struct ib_umad_file *file = filp->private_data;
+	struct ib_umad_device *dev = file->port->umad_dev;
+	struct ib_umad_packet *packet, *tmp;
+	int already_dead;
+	int i;
+
+	mutex_lock(&file->port->file_mutex);
+	mutex_lock(&file->mutex);
+
+	already_dead = file->agents_dead;
+	file->agents_dead = 1;
+
+	list_for_each_entry_safe(packet, tmp, &file->recv_list, list) {
+		if (packet->recv_wc)
+			ib_free_recv_mad(packet->recv_wc);
+		kfree(packet);
+	}
+
+	list_del(&file->port_list);
+
+	mutex_unlock(&file->mutex);
+
+	if (!already_dead)
+		for (i = 0; i < IB_UMAD_MAX_AGENTS; ++i)
+			if (file->agent[i])
+				ib_unregister_mad_agent(file->agent[i]);
+
+	mutex_unlock(&file->port->file_mutex);
+
+	kfree(file);
+	kobject_put(&dev->kobj);
+
+	return 0;
+}
+
+static const struct file_operations umad_fops = {
+	.owner		= THIS_MODULE,
+	.read		= ib_umad_read,
+	.write		= ib_umad_write,
+	.poll		= ib_umad_poll,
+	.unlocked_ioctl = ib_umad_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= ib_umad_compat_ioctl,
+#endif
+	.open		= ib_umad_open,
+	.release	= ib_umad_close,
+	.llseek		= no_llseek,
+};
+
+static int ib_umad_sm_open(struct inode *inode, struct file *filp)
+{
+	struct ib_umad_port *port;
+	struct ib_port_modify props = {
+		.set_port_cap_mask = IB_PORT_SM
+	};
+	int ret;
+
+	port = container_of(inode->i_cdev, struct ib_umad_port, sm_cdev);
+
+	if (filp->f_flags & O_NONBLOCK) {
+		if (down_trylock(&port->sm_sem)) {
+			ret = -EAGAIN;
+			goto fail;
+		}
+	} else {
+		if (down_interruptible(&port->sm_sem)) {
+			ret = -ERESTARTSYS;
+			goto fail;
+		}
+	}
+
+	ret = ib_modify_port(port->ib_dev, port->port_num, 0, &props);
+	if (ret)
+		goto err_up_sem;
+
+	filp->private_data = port;
+
+	ret = nonseekable_open(inode, filp);
+	if (ret)
+		goto err_clr_sm_cap;
+
+	kobject_get(&port->umad_dev->kobj);
+
+	return 0;
+
+err_clr_sm_cap:
+	swap(props.set_port_cap_mask, props.clr_port_cap_mask);
+	ib_modify_port(port->ib_dev, port->port_num, 0, &props);
+
+err_up_sem:
+	up(&port->sm_sem);
+
+fail:
+	return ret;
+}
+
+static int ib_umad_sm_close(struct inode *inode, struct file *filp)
+{
+	struct ib_umad_port *port = filp->private_data;
+	struct ib_port_modify props = {
+		.clr_port_cap_mask = IB_PORT_SM
+	};
+	int ret = 0;
+
+	mutex_lock(&port->file_mutex);
+	if (port->ib_dev)
+		ret = ib_modify_port(port->ib_dev, port->port_num, 0, &props);
+	mutex_unlock(&port->file_mutex);
+
+	up(&port->sm_sem);
+
+	kobject_put(&port->umad_dev->kobj);
+
+	return ret;
+}
+
+static const struct file_operations umad_sm_fops = {
+	.owner	 = THIS_MODULE,
+	.open	 = ib_umad_sm_open,
+	.release = ib_umad_sm_close,
+	.llseek	 = no_llseek,
+};
+
+static struct ib_client umad_client = {
+	.name   = "umad",
+	.add    = ib_umad_add_one,
+	.remove = ib_umad_remove_one
+};
+
+static ssize_t show_ibdev(struct device *dev, struct device_attribute *attr,
+			  char *buf)
+{
+	struct ib_umad_port *port = dev_get_drvdata(dev);
+
+	if (!port)
+		return -ENODEV;
+
+	return sprintf(buf, "%s\n", port->ib_dev->name);
+}
+static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL);
+
+static ssize_t show_port(struct device *dev, struct device_attribute *attr,
+			 char *buf)
+{
+	struct ib_umad_port *port = dev_get_drvdata(dev);
+
+	if (!port)
+		return -ENODEV;
+
+	return sprintf(buf, "%d\n", port->port_num);
+}
+static DEVICE_ATTR(port, S_IRUGO, show_port, NULL);
+
+static CLASS_ATTR_STRING(abi_version, S_IRUGO,
+			 __stringify(IB_USER_MAD_ABI_VERSION));
+
+static dev_t overflow_maj;
+static DECLARE_BITMAP(overflow_map, IB_UMAD_MAX_PORTS);
+static int find_overflow_devnum(struct ib_device *device)
+{
+	int ret;
+
+	if (!overflow_maj) {
+		ret = alloc_chrdev_region(&overflow_maj, 0, IB_UMAD_MAX_PORTS * 2,
+					  "infiniband_mad");
+		if (ret) {
+			dev_err(&device->dev,
+				"couldn't register dynamic device number\n");
+			return ret;
+		}
+	}
+
+	ret = find_first_zero_bit(overflow_map, IB_UMAD_MAX_PORTS);
+	if (ret >= IB_UMAD_MAX_PORTS)
+		return -1;
+
+	return ret;
+}
+
+static int ib_umad_init_port(struct ib_device *device, int port_num,
+			     struct ib_umad_device *umad_dev,
+			     struct ib_umad_port *port)
+{
+	int devnum;
+	dev_t base;
+
+	spin_lock(&port_lock);
+	devnum = find_first_zero_bit(dev_map, IB_UMAD_MAX_PORTS);
+	if (devnum >= IB_UMAD_MAX_PORTS) {
+		spin_unlock(&port_lock);
+		devnum = find_overflow_devnum(device);
+		if (devnum < 0)
+			return -1;
+
+		spin_lock(&port_lock);
+		port->dev_num = devnum + IB_UMAD_MAX_PORTS;
+		base = devnum + overflow_maj;
+		set_bit(devnum, overflow_map);
+	} else {
+		port->dev_num = devnum;
+		base = devnum + base_dev;
+		set_bit(devnum, dev_map);
+	}
+	spin_unlock(&port_lock);
+
+	port->ib_dev   = device;
+	port->port_num = port_num;
+	sema_init(&port->sm_sem, 1);
+	mutex_init(&port->file_mutex);
+	INIT_LIST_HEAD(&port->file_list);
+
+	cdev_init(&port->cdev, &umad_fops);
+	port->cdev.owner = THIS_MODULE;
+	port->cdev.kobj.parent = &umad_dev->kobj;
+	kobject_set_name(&port->cdev.kobj, "umad%d", port->dev_num);
+	if (cdev_add(&port->cdev, base, 1))
+		goto err_cdev;
+
+	port->dev = device_create(umad_class, device->dma_device,
+				  port->cdev.dev, port,
+				  "umad%d", port->dev_num);
+	if (IS_ERR(port->dev))
+		goto err_cdev;
+
+	if (device_create_file(port->dev, &dev_attr_ibdev))
+		goto err_dev;
+	if (device_create_file(port->dev, &dev_attr_port))
+		goto err_dev;
+
+	base += IB_UMAD_MAX_PORTS;
+	cdev_init(&port->sm_cdev, &umad_sm_fops);
+	port->sm_cdev.owner = THIS_MODULE;
+	port->sm_cdev.kobj.parent = &umad_dev->kobj;
+	kobject_set_name(&port->sm_cdev.kobj, "issm%d", port->dev_num);
+	if (cdev_add(&port->sm_cdev, base, 1))
+		goto err_sm_cdev;
+
+	port->sm_dev = device_create(umad_class, device->dma_device,
+				     port->sm_cdev.dev, port,
+				     "issm%d", port->dev_num);
+	if (IS_ERR(port->sm_dev))
+		goto err_sm_cdev;
+
+	if (device_create_file(port->sm_dev, &dev_attr_ibdev))
+		goto err_sm_dev;
+	if (device_create_file(port->sm_dev, &dev_attr_port))
+		goto err_sm_dev;
+
+	return 0;
+
+err_sm_dev:
+	device_destroy(umad_class, port->sm_cdev.dev);
+
+err_sm_cdev:
+	cdev_del(&port->sm_cdev);
+
+err_dev:
+	device_destroy(umad_class, port->cdev.dev);
+
+err_cdev:
+	cdev_del(&port->cdev);
+	if (port->dev_num < IB_UMAD_MAX_PORTS)
+		clear_bit(devnum, dev_map);
+	else
+		clear_bit(devnum, overflow_map);
+
+	return -1;
+}
+
+static void ib_umad_kill_port(struct ib_umad_port *port)
+{
+	struct ib_umad_file *file;
+	int id;
+
+	dev_set_drvdata(port->dev,    NULL);
+	dev_set_drvdata(port->sm_dev, NULL);
+
+	device_destroy(umad_class, port->cdev.dev);
+	device_destroy(umad_class, port->sm_cdev.dev);
+
+	cdev_del(&port->cdev);
+	cdev_del(&port->sm_cdev);
+
+	mutex_lock(&port->file_mutex);
+
+	port->ib_dev = NULL;
+
+	list_for_each_entry(file, &port->file_list, port_list) {
+		mutex_lock(&file->mutex);
+		file->agents_dead = 1;
+		mutex_unlock(&file->mutex);
+
+		for (id = 0; id < IB_UMAD_MAX_AGENTS; ++id)
+			if (file->agent[id])
+				ib_unregister_mad_agent(file->agent[id]);
+	}
+
+	mutex_unlock(&port->file_mutex);
+
+	if (port->dev_num < IB_UMAD_MAX_PORTS)
+		clear_bit(port->dev_num, dev_map);
+	else
+		clear_bit(port->dev_num - IB_UMAD_MAX_PORTS, overflow_map);
+}
+
+static void ib_umad_add_one(struct ib_device *device)
+{
+	struct ib_umad_device *umad_dev;
+	int s, e, i;
+
+	if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
+		return;
+
+	if (device->node_type == RDMA_NODE_IB_SWITCH)
+		s = e = 0;
+	else {
+		s = 1;
+		e = device->phys_port_cnt;
+	}
+
+	umad_dev = kzalloc(sizeof *umad_dev +
+			   (e - s + 1) * sizeof (struct ib_umad_port),
+			   GFP_KERNEL);
+	if (!umad_dev)
+		return;
+
+	kobject_init(&umad_dev->kobj, &ib_umad_dev_ktype);
+
+	umad_dev->start_port = s;
+	umad_dev->end_port   = e;
+
+	for (i = s; i <= e; ++i) {
+		umad_dev->port[i - s].umad_dev = umad_dev;
+
+		if (ib_umad_init_port(device, i, umad_dev,
+				      &umad_dev->port[i - s]))
+			goto err;
+	}
+
+	ib_set_client_data(device, &umad_client, umad_dev);
+
+	return;
+
+err:
+	while (--i >= s)
+		ib_umad_kill_port(&umad_dev->port[i - s]);
+
+	kobject_put(&umad_dev->kobj);
+}
+
+static void ib_umad_remove_one(struct ib_device *device)
+{
+	struct ib_umad_device *umad_dev = ib_get_client_data(device, &umad_client);
+	int i;
+
+	if (!umad_dev)
+		return;
+
+	for (i = 0; i <= umad_dev->end_port - umad_dev->start_port; ++i)
+		ib_umad_kill_port(&umad_dev->port[i]);
+
+	kobject_put(&umad_dev->kobj);
+}
+
+static char *umad_devnode(struct device *dev, umode_t *mode)
+{
+	return kasprintf(GFP_KERNEL, "infiniband/%s", dev_name(dev));
+}
+
+static int __init ib_umad_init(void)
+{
+	int ret;
+
+	ret = register_chrdev_region(base_dev, IB_UMAD_MAX_PORTS * 2,
+				     "infiniband_mad");
+	if (ret) {
+		pr_err("couldn't register device number\n");
+		goto out;
+	}
+
+	umad_class = class_create(THIS_MODULE, "infiniband_mad");
+	if (IS_ERR(umad_class)) {
+		ret = PTR_ERR(umad_class);
+		pr_err("couldn't create class infiniband_mad\n");
+		goto out_chrdev;
+	}
+
+	umad_class->devnode = umad_devnode;
+
+	ret = class_create_file(umad_class, &class_attr_abi_version.attr);
+	if (ret) {
+		pr_err("couldn't create abi_version attribute\n");
+		goto out_class;
+	}
+
+	ret = ib_register_client(&umad_client);
+	if (ret) {
+		pr_err("couldn't register ib_umad client\n");
+		goto out_class;
+	}
+
+	return 0;
+
+out_class:
+	class_destroy(umad_class);
+
+out_chrdev:
+	unregister_chrdev_region(base_dev, IB_UMAD_MAX_PORTS * 2);
+
+out:
+	return ret;
+}
+
+static void __exit ib_umad_cleanup(void)
+{
+	ib_unregister_client(&umad_client);
+	class_destroy(umad_class);
+	unregister_chrdev_region(base_dev, IB_UMAD_MAX_PORTS * 2);
+	if (overflow_maj)
+		unregister_chrdev_region(overflow_maj, IB_UMAD_MAX_PORTS * 2);
+}
+
+module_init(ib_umad_init);
+module_exit(ib_umad_cleanup);
diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h
new file mode 100644
index 0000000..643c08a
--- /dev/null
+++ b/drivers/infiniband/core/uverbs.h
@@ -0,0 +1,262 @@
+/*
+ * Copyright (c) 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005, 2006 Cisco Systems.  All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2005 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2005 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef UVERBS_H
+#define UVERBS_H
+
+#include <linux/kref.h>
+#include <linux/idr.h>
+#include <linux/mutex.h>
+#include <linux/completion.h>
+#include <linux/cdev.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_umem.h>
+#include <rdma/ib_user_verbs.h>
+
+#define INIT_UDATA(udata, ibuf, obuf, ilen, olen)			\
+	do {								\
+		(udata)->inbuf  = (const void __user *) (ibuf);		\
+		(udata)->outbuf = (void __user *) (obuf);		\
+		(udata)->inlen  = (ilen);				\
+		(udata)->outlen = (olen);				\
+	} while (0)
+
+#define INIT_UDATA_BUF_OR_NULL(udata, ibuf, obuf, ilen, olen)			\
+	do {									\
+		(udata)->inbuf  = (ilen) ? (const void __user *) (ibuf) : NULL;	\
+		(udata)->outbuf = (olen) ? (void __user *) (obuf) : NULL;	\
+		(udata)->inlen  = (ilen);					\
+		(udata)->outlen = (olen);					\
+	} while (0)
+
+/*
+ * Our lifetime rules for these structs are the following:
+ *
+ * struct ib_uverbs_device: One reference is held by the module and
+ * released in ib_uverbs_remove_one().  Another reference is taken by
+ * ib_uverbs_open() each time the character special file is opened,
+ * and released in ib_uverbs_release_file() when the file is released.
+ *
+ * struct ib_uverbs_file: One reference is held by the VFS and
+ * released when the file is closed.  Another reference is taken when
+ * an asynchronous event queue file is created and released when the
+ * event file is closed.
+ *
+ * struct ib_uverbs_event_file: One reference is held by the VFS and
+ * released when the file is closed.  For asynchronous event files,
+ * another reference is held by the corresponding main context file
+ * and released when that file is closed.  For completion event files,
+ * a reference is taken when a CQ is created that uses the file, and
+ * released when the CQ is destroyed.
+ */
+
+struct ib_uverbs_device {
+	struct kref				ref;
+	int					num_comp_vectors;
+	struct completion			comp;
+	struct device			       *dev;
+	struct ib_device		       *ib_dev;
+	int					devnum;
+	struct cdev			        cdev;
+	struct rb_root				xrcd_tree;
+	struct mutex				xrcd_tree_mutex;
+};
+
+struct ib_uverbs_event_file {
+	struct kref				ref;
+	int					is_async;
+	struct ib_uverbs_file		       *uverbs_file;
+	spinlock_t				lock;
+	int					is_closed;
+	wait_queue_head_t			poll_wait;
+	struct fasync_struct		       *async_queue;
+	struct list_head			event_list;
+};
+
+struct ib_uverbs_file {
+	struct kref				ref;
+	struct mutex				mutex;
+	struct ib_uverbs_device		       *device;
+	struct ib_ucontext		       *ucontext;
+	struct ib_event_handler			event_handler;
+	struct ib_uverbs_event_file	       *async_file;
+};
+
+struct ib_uverbs_event {
+	union {
+		struct ib_uverbs_async_event_desc	async;
+		struct ib_uverbs_comp_event_desc	comp;
+	}					desc;
+	struct list_head			list;
+	struct list_head			obj_list;
+	u32				       *counter;
+};
+
+struct ib_uverbs_mcast_entry {
+	struct list_head	list;
+	union ib_gid 		gid;
+	u16 			lid;
+};
+
+struct ib_uevent_object {
+	struct ib_uobject	uobject;
+	struct list_head	event_list;
+	u32			events_reported;
+};
+
+struct ib_uxrcd_object {
+	struct ib_uobject	uobject;
+	atomic_t		refcnt;
+};
+
+struct ib_usrq_object {
+	struct ib_uevent_object	uevent;
+	struct ib_uxrcd_object *uxrcd;
+};
+
+struct ib_uqp_object {
+	struct ib_uevent_object	uevent;
+	struct list_head 	mcast_list;
+	struct ib_uxrcd_object *uxrcd;
+};
+
+struct ib_ucq_object {
+	struct ib_uobject	uobject;
+	struct ib_uverbs_file  *uverbs_file;
+	struct list_head	comp_list;
+	struct list_head	async_list;
+	u32			comp_events_reported;
+	u32			async_events_reported;
+};
+
+extern spinlock_t ib_uverbs_idr_lock;
+extern struct idr ib_uverbs_pd_idr;
+extern struct idr ib_uverbs_mr_idr;
+extern struct idr ib_uverbs_mw_idr;
+extern struct idr ib_uverbs_ah_idr;
+extern struct idr ib_uverbs_cq_idr;
+extern struct idr ib_uverbs_qp_idr;
+extern struct idr ib_uverbs_srq_idr;
+extern struct idr ib_uverbs_xrcd_idr;
+extern struct idr ib_uverbs_rule_idr;
+
+void idr_remove_uobj(struct idr *idp, struct ib_uobject *uobj);
+
+struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file,
+					int is_async);
+struct ib_uverbs_event_file *ib_uverbs_lookup_comp_file(int fd);
+
+void ib_uverbs_release_ucq(struct ib_uverbs_file *file,
+			   struct ib_uverbs_event_file *ev_file,
+			   struct ib_ucq_object *uobj);
+void ib_uverbs_release_uevent(struct ib_uverbs_file *file,
+			      struct ib_uevent_object *uobj);
+
+void ib_uverbs_comp_handler(struct ib_cq *cq, void *cq_context);
+void ib_uverbs_cq_event_handler(struct ib_event *event, void *context_ptr);
+void ib_uverbs_qp_event_handler(struct ib_event *event, void *context_ptr);
+void ib_uverbs_srq_event_handler(struct ib_event *event, void *context_ptr);
+void ib_uverbs_event_handler(struct ib_event_handler *handler,
+			     struct ib_event *event);
+void ib_uverbs_dealloc_xrcd(struct ib_uverbs_device *dev, struct ib_xrcd *xrcd);
+
+struct ib_uverbs_flow_spec {
+	union {
+		union {
+			struct ib_uverbs_flow_spec_hdr hdr;
+			struct {
+				__u32 type;
+				__u16 size;
+				__u16 reserved;
+			};
+		};
+		struct ib_uverbs_flow_spec_eth     eth;
+		struct ib_uverbs_flow_spec_ipv4    ipv4;
+		struct ib_uverbs_flow_spec_tcp_udp tcp_udp;
+	};
+};
+
+#define IB_UVERBS_DECLARE_CMD(name)					\
+	ssize_t ib_uverbs_##name(struct ib_uverbs_file *file,		\
+				 const char __user *buf, int in_len,	\
+				 int out_len)
+
+IB_UVERBS_DECLARE_CMD(get_context);
+IB_UVERBS_DECLARE_CMD(query_device);
+IB_UVERBS_DECLARE_CMD(query_port);
+IB_UVERBS_DECLARE_CMD(alloc_pd);
+IB_UVERBS_DECLARE_CMD(dealloc_pd);
+IB_UVERBS_DECLARE_CMD(reg_mr);
+IB_UVERBS_DECLARE_CMD(rereg_mr);
+IB_UVERBS_DECLARE_CMD(dereg_mr);
+IB_UVERBS_DECLARE_CMD(alloc_mw);
+IB_UVERBS_DECLARE_CMD(dealloc_mw);
+IB_UVERBS_DECLARE_CMD(create_comp_channel);
+IB_UVERBS_DECLARE_CMD(create_cq);
+IB_UVERBS_DECLARE_CMD(resize_cq);
+IB_UVERBS_DECLARE_CMD(poll_cq);
+IB_UVERBS_DECLARE_CMD(req_notify_cq);
+IB_UVERBS_DECLARE_CMD(destroy_cq);
+IB_UVERBS_DECLARE_CMD(create_qp);
+IB_UVERBS_DECLARE_CMD(open_qp);
+IB_UVERBS_DECLARE_CMD(query_qp);
+IB_UVERBS_DECLARE_CMD(modify_qp);
+IB_UVERBS_DECLARE_CMD(destroy_qp);
+IB_UVERBS_DECLARE_CMD(post_send);
+IB_UVERBS_DECLARE_CMD(post_recv);
+IB_UVERBS_DECLARE_CMD(post_srq_recv);
+IB_UVERBS_DECLARE_CMD(create_ah);
+IB_UVERBS_DECLARE_CMD(destroy_ah);
+IB_UVERBS_DECLARE_CMD(attach_mcast);
+IB_UVERBS_DECLARE_CMD(detach_mcast);
+IB_UVERBS_DECLARE_CMD(create_srq);
+IB_UVERBS_DECLARE_CMD(modify_srq);
+IB_UVERBS_DECLARE_CMD(query_srq);
+IB_UVERBS_DECLARE_CMD(destroy_srq);
+IB_UVERBS_DECLARE_CMD(create_xsrq);
+IB_UVERBS_DECLARE_CMD(open_xrcd);
+IB_UVERBS_DECLARE_CMD(close_xrcd);
+
+#define IB_UVERBS_DECLARE_EX_CMD(name)				\
+	int ib_uverbs_ex_##name(struct ib_uverbs_file *file,	\
+				struct ib_udata *ucore,		\
+				struct ib_udata *uhw)
+
+IB_UVERBS_DECLARE_EX_CMD(create_flow);
+IB_UVERBS_DECLARE_EX_CMD(destroy_flow);
+
+#endif /* UVERBS_H */
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
new file mode 100644
index 0000000..5ba2a86
--- /dev/null
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -0,0 +1,3255 @@
+/*
+ * Copyright (c) 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005, 2006, 2007 Cisco Systems.  All rights reserved.
+ * Copyright (c) 2005 PathScale, Inc.  All rights reserved.
+ * Copyright (c) 2006 Mellanox Technologies.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+
+#include <asm/uaccess.h>
+
+#include "uverbs.h"
+#include "core_priv.h"
+
+struct uverbs_lock_class {
+	struct lock_class_key	key;
+	char			name[16];
+};
+
+static struct uverbs_lock_class pd_lock_class	= { .name = "PD-uobj" };
+static struct uverbs_lock_class mr_lock_class	= { .name = "MR-uobj" };
+static struct uverbs_lock_class mw_lock_class	= { .name = "MW-uobj" };
+static struct uverbs_lock_class cq_lock_class	= { .name = "CQ-uobj" };
+static struct uverbs_lock_class qp_lock_class	= { .name = "QP-uobj" };
+static struct uverbs_lock_class ah_lock_class	= { .name = "AH-uobj" };
+static struct uverbs_lock_class srq_lock_class	= { .name = "SRQ-uobj" };
+static struct uverbs_lock_class xrcd_lock_class = { .name = "XRCD-uobj" };
+static struct uverbs_lock_class rule_lock_class = { .name = "RULE-uobj" };
+
+/*
+ * The ib_uobject locking scheme is as follows:
+ *
+ * - ib_uverbs_idr_lock protects the uverbs idrs themselves, so it
+ *   needs to be held during all idr operations.  When an object is
+ *   looked up, a reference must be taken on the object's kref before
+ *   dropping this lock.
+ *
+ * - Each object also has an rwsem.  This rwsem must be held for
+ *   reading while an operation that uses the object is performed.
+ *   For example, while registering an MR, the associated PD's
+ *   uobject.mutex must be held for reading.  The rwsem must be held
+ *   for writing while initializing or destroying an object.
+ *
+ * - In addition, each object has a "live" flag.  If this flag is not
+ *   set, then lookups of the object will fail even if it is found in
+ *   the idr.  This handles a reader that blocks and does not acquire
+ *   the rwsem until after the object is destroyed.  The destroy
+ *   operation will set the live flag to 0 and then drop the rwsem;
+ *   this will allow the reader to acquire the rwsem, see that the
+ *   live flag is 0, and then drop the rwsem and its reference to
+ *   object.  The underlying storage will not be freed until the last
+ *   reference to the object is dropped.
+ */
+
+static void init_uobj(struct ib_uobject *uobj, u64 user_handle,
+		      struct ib_ucontext *context, struct uverbs_lock_class *c)
+{
+	uobj->user_handle = user_handle;
+	uobj->context     = context;
+	kref_init(&uobj->ref);
+	init_rwsem(&uobj->mutex);
+	lockdep_set_class_and_name(&uobj->mutex, &c->key, c->name);
+	uobj->live        = 0;
+}
+
+static void release_uobj(struct kref *kref)
+{
+	kfree(container_of(kref, struct ib_uobject, ref));
+}
+
+static void put_uobj(struct ib_uobject *uobj)
+{
+	kref_put(&uobj->ref, release_uobj);
+}
+
+static void put_uobj_read(struct ib_uobject *uobj)
+{
+	up_read(&uobj->mutex);
+	put_uobj(uobj);
+}
+
+static void put_uobj_write(struct ib_uobject *uobj)
+{
+	up_write(&uobj->mutex);
+	put_uobj(uobj);
+}
+
+static int idr_add_uobj(struct idr *idr, struct ib_uobject *uobj)
+{
+	int ret;
+
+	idr_preload(GFP_KERNEL);
+	spin_lock(&ib_uverbs_idr_lock);
+
+	ret = idr_alloc(idr, uobj, 0, 0, GFP_NOWAIT);
+	if (ret >= 0)
+		uobj->id = ret;
+
+	spin_unlock(&ib_uverbs_idr_lock);
+	idr_preload_end();
+
+	return ret < 0 ? ret : 0;
+}
+
+void idr_remove_uobj(struct idr *idr, struct ib_uobject *uobj)
+{
+	spin_lock(&ib_uverbs_idr_lock);
+	idr_remove(idr, uobj->id);
+	spin_unlock(&ib_uverbs_idr_lock);
+}
+
+static struct ib_uobject *__idr_get_uobj(struct idr *idr, int id,
+					 struct ib_ucontext *context)
+{
+	struct ib_uobject *uobj;
+
+	spin_lock(&ib_uverbs_idr_lock);
+	uobj = idr_find(idr, id);
+	if (uobj) {
+		if (uobj->context == context)
+			kref_get(&uobj->ref);
+		else
+			uobj = NULL;
+	}
+	spin_unlock(&ib_uverbs_idr_lock);
+
+	return uobj;
+}
+
+static struct ib_uobject *idr_read_uobj(struct idr *idr, int id,
+					struct ib_ucontext *context, int nested)
+{
+	struct ib_uobject *uobj;
+
+	uobj = __idr_get_uobj(idr, id, context);
+	if (!uobj)
+		return NULL;
+
+	if (nested)
+		down_read_nested(&uobj->mutex, SINGLE_DEPTH_NESTING);
+	else
+		down_read(&uobj->mutex);
+	if (!uobj->live) {
+		put_uobj_read(uobj);
+		return NULL;
+	}
+
+	return uobj;
+}
+
+static struct ib_uobject *idr_write_uobj(struct idr *idr, int id,
+					 struct ib_ucontext *context)
+{
+	struct ib_uobject *uobj;
+
+	uobj = __idr_get_uobj(idr, id, context);
+	if (!uobj)
+		return NULL;
+
+	down_write(&uobj->mutex);
+	if (!uobj->live) {
+		put_uobj_write(uobj);
+		return NULL;
+	}
+
+	return uobj;
+}
+
+static void *idr_read_obj(struct idr *idr, int id, struct ib_ucontext *context,
+			  int nested)
+{
+	struct ib_uobject *uobj;
+
+	uobj = idr_read_uobj(idr, id, context, nested);
+	return uobj ? uobj->object : NULL;
+}
+
+static struct ib_pd *idr_read_pd(int pd_handle, struct ib_ucontext *context)
+{
+	return idr_read_obj(&ib_uverbs_pd_idr, pd_handle, context, 0);
+}
+
+static void put_pd_read(struct ib_pd *pd)
+{
+	put_uobj_read(pd->uobject);
+}
+
+static struct ib_cq *idr_read_cq(int cq_handle, struct ib_ucontext *context, int nested)
+{
+	return idr_read_obj(&ib_uverbs_cq_idr, cq_handle, context, nested);
+}
+
+static void put_cq_read(struct ib_cq *cq)
+{
+	put_uobj_read(cq->uobject);
+}
+
+static struct ib_ah *idr_read_ah(int ah_handle, struct ib_ucontext *context)
+{
+	return idr_read_obj(&ib_uverbs_ah_idr, ah_handle, context, 0);
+}
+
+static void put_ah_read(struct ib_ah *ah)
+{
+	put_uobj_read(ah->uobject);
+}
+
+static struct ib_qp *idr_read_qp(int qp_handle, struct ib_ucontext *context)
+{
+	return idr_read_obj(&ib_uverbs_qp_idr, qp_handle, context, 0);
+}
+
+static struct ib_qp *idr_write_qp(int qp_handle, struct ib_ucontext *context)
+{
+	struct ib_uobject *uobj;
+
+	uobj = idr_write_uobj(&ib_uverbs_qp_idr, qp_handle, context);
+	return uobj ? uobj->object : NULL;
+}
+
+static void put_qp_read(struct ib_qp *qp)
+{
+	put_uobj_read(qp->uobject);
+}
+
+static void put_qp_write(struct ib_qp *qp)
+{
+	put_uobj_write(qp->uobject);
+}
+
+static struct ib_srq *idr_read_srq(int srq_handle, struct ib_ucontext *context)
+{
+	return idr_read_obj(&ib_uverbs_srq_idr, srq_handle, context, 0);
+}
+
+static void put_srq_read(struct ib_srq *srq)
+{
+	put_uobj_read(srq->uobject);
+}
+
+static struct ib_xrcd *idr_read_xrcd(int xrcd_handle, struct ib_ucontext *context,
+				     struct ib_uobject **uobj)
+{
+	*uobj = idr_read_uobj(&ib_uverbs_xrcd_idr, xrcd_handle, context, 0);
+	return *uobj ? (*uobj)->object : NULL;
+}
+
+static void put_xrcd_read(struct ib_uobject *uobj)
+{
+	put_uobj_read(uobj);
+}
+
+ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
+			      const char __user *buf,
+			      int in_len, int out_len)
+{
+	struct ib_uverbs_get_context      cmd;
+	struct ib_uverbs_get_context_resp resp;
+	struct ib_udata                   udata;
+	struct ib_device                 *ibdev = file->device->ib_dev;
+	struct ib_ucontext		 *ucontext;
+	struct file			 *filp;
+	int ret;
+
+	if (out_len < sizeof resp)
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	mutex_lock(&file->mutex);
+
+	if (file->ucontext) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	INIT_UDATA(&udata, buf + sizeof cmd,
+		   (unsigned long) cmd.response + sizeof resp,
+		   in_len - sizeof cmd, out_len - sizeof resp);
+
+	ucontext = ibdev->alloc_ucontext(ibdev, &udata);
+	if (IS_ERR(ucontext)) {
+		ret = PTR_ERR(ucontext);
+		goto err;
+	}
+
+	ucontext->device = ibdev;
+	INIT_LIST_HEAD(&ucontext->pd_list);
+	INIT_LIST_HEAD(&ucontext->mr_list);
+	INIT_LIST_HEAD(&ucontext->mw_list);
+	INIT_LIST_HEAD(&ucontext->cq_list);
+	INIT_LIST_HEAD(&ucontext->qp_list);
+	INIT_LIST_HEAD(&ucontext->srq_list);
+	INIT_LIST_HEAD(&ucontext->ah_list);
+	INIT_LIST_HEAD(&ucontext->xrcd_list);
+	INIT_LIST_HEAD(&ucontext->rule_list);
+	ucontext->closing = 0;
+
+	resp.num_comp_vectors = file->device->num_comp_vectors;
+
+	ret = get_unused_fd_flags(O_CLOEXEC);
+	if (ret < 0)
+		goto err_free;
+	resp.async_fd = ret;
+
+	filp = ib_uverbs_alloc_event_file(file, 1);
+	if (IS_ERR(filp)) {
+		ret = PTR_ERR(filp);
+		goto err_fd;
+	}
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+			 &resp, sizeof resp)) {
+		ret = -EFAULT;
+		goto err_file;
+	}
+
+	file->async_file = filp->private_data;
+
+	INIT_IB_EVENT_HANDLER(&file->event_handler, file->device->ib_dev,
+			      ib_uverbs_event_handler);
+	ret = ib_register_event_handler(&file->event_handler);
+	if (ret)
+		goto err_file;
+
+	kref_get(&file->async_file->ref);
+	kref_get(&file->ref);
+	file->ucontext = ucontext;
+
+	fd_install(resp.async_fd, filp);
+
+	mutex_unlock(&file->mutex);
+
+	return in_len;
+
+err_file:
+	fput(filp);
+
+err_fd:
+	put_unused_fd(resp.async_fd);
+
+err_free:
+	ibdev->dealloc_ucontext(ucontext);
+
+err:
+	mutex_unlock(&file->mutex);
+	return ret;
+}
+
+ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file,
+			       const char __user *buf,
+			       int in_len, int out_len)
+{
+	struct ib_uverbs_query_device      cmd;
+	struct ib_uverbs_query_device_resp resp;
+	struct ib_device_attr              attr;
+	int                                ret;
+
+	if (out_len < sizeof resp)
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	ret = ib_query_device(file->device->ib_dev, &attr);
+	if (ret)
+		return ret;
+
+	memset(&resp, 0, sizeof resp);
+
+	resp.fw_ver 		       = attr.fw_ver;
+	resp.node_guid 		       = file->device->ib_dev->node_guid;
+	resp.sys_image_guid 	       = attr.sys_image_guid;
+	resp.max_mr_size 	       = attr.max_mr_size;
+	resp.page_size_cap 	       = attr.page_size_cap;
+	resp.vendor_id 		       = attr.vendor_id;
+	resp.vendor_part_id 	       = attr.vendor_part_id;
+	resp.hw_ver 		       = attr.hw_ver;
+	resp.max_qp 		       = attr.max_qp;
+	resp.max_qp_wr 		       = attr.max_qp_wr;
+	resp.device_cap_flags 	       = attr.device_cap_flags;
+	resp.max_sge 		       = attr.max_sge;
+	resp.max_sge_rd 	       = attr.max_sge_rd;
+	resp.max_cq 		       = attr.max_cq;
+	resp.max_cqe 		       = attr.max_cqe;
+	resp.max_mr 		       = attr.max_mr;
+	resp.max_pd 		       = attr.max_pd;
+	resp.max_qp_rd_atom 	       = attr.max_qp_rd_atom;
+	resp.max_ee_rd_atom 	       = attr.max_ee_rd_atom;
+	resp.max_res_rd_atom 	       = attr.max_res_rd_atom;
+	resp.max_qp_init_rd_atom       = attr.max_qp_init_rd_atom;
+	resp.max_ee_init_rd_atom       = attr.max_ee_init_rd_atom;
+	resp.atomic_cap 	       = attr.atomic_cap;
+	resp.max_ee 		       = attr.max_ee;
+	resp.max_rdd 		       = attr.max_rdd;
+	resp.max_mw 		       = attr.max_mw;
+	resp.max_raw_ipv6_qp 	       = attr.max_raw_ipv6_qp;
+	resp.max_raw_ethy_qp 	       = attr.max_raw_ethy_qp;
+	resp.max_mcast_grp 	       = attr.max_mcast_grp;
+	resp.max_mcast_qp_attach       = attr.max_mcast_qp_attach;
+	resp.max_total_mcast_qp_attach = attr.max_total_mcast_qp_attach;
+	resp.max_ah 		       = attr.max_ah;
+	resp.max_fmr 		       = attr.max_fmr;
+	resp.max_map_per_fmr 	       = attr.max_map_per_fmr;
+	resp.max_srq 		       = attr.max_srq;
+	resp.max_srq_wr 	       = attr.max_srq_wr;
+	resp.max_srq_sge 	       = attr.max_srq_sge;
+	resp.max_pkeys 		       = attr.max_pkeys;
+	resp.local_ca_ack_delay        = attr.local_ca_ack_delay;
+	resp.phys_port_cnt	       = file->device->ib_dev->phys_port_cnt;
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+			 &resp, sizeof resp))
+		return -EFAULT;
+
+	return in_len;
+}
+
+ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file,
+			     const char __user *buf,
+			     int in_len, int out_len)
+{
+	struct ib_uverbs_query_port      cmd;
+	struct ib_uverbs_query_port_resp resp;
+	struct ib_port_attr              attr;
+	int                              ret;
+
+	if (out_len < sizeof resp)
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	ret = ib_query_port(file->device->ib_dev, cmd.port_num, &attr);
+	if (ret)
+		return ret;
+
+	memset(&resp, 0, sizeof resp);
+
+	resp.state 	     = attr.state;
+	resp.max_mtu 	     = attr.max_mtu;
+	resp.active_mtu      = attr.active_mtu;
+	resp.gid_tbl_len     = attr.gid_tbl_len;
+	resp.port_cap_flags  = attr.port_cap_flags;
+	resp.max_msg_sz      = attr.max_msg_sz;
+	resp.bad_pkey_cntr   = attr.bad_pkey_cntr;
+	resp.qkey_viol_cntr  = attr.qkey_viol_cntr;
+	resp.pkey_tbl_len    = attr.pkey_tbl_len;
+	resp.lid 	     = attr.lid;
+	resp.sm_lid 	     = attr.sm_lid;
+	resp.lmc 	     = attr.lmc;
+	resp.max_vl_num      = attr.max_vl_num;
+	resp.sm_sl 	     = attr.sm_sl;
+	resp.subnet_timeout  = attr.subnet_timeout;
+	resp.init_type_reply = attr.init_type_reply;
+	resp.active_width    = attr.active_width;
+	resp.active_speed    = attr.active_speed;
+	resp.phys_state      = attr.phys_state;
+	resp.link_layer      = rdma_port_get_link_layer(file->device->ib_dev,
+							cmd.port_num);
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+			 &resp, sizeof resp))
+		return -EFAULT;
+
+	return in_len;
+}
+
+ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file,
+			   const char __user *buf,
+			   int in_len, int out_len)
+{
+	struct ib_uverbs_alloc_pd      cmd;
+	struct ib_uverbs_alloc_pd_resp resp;
+	struct ib_udata                udata;
+	struct ib_uobject             *uobj;
+	struct ib_pd                  *pd;
+	int                            ret;
+
+	if (out_len < sizeof resp)
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	INIT_UDATA(&udata, buf + sizeof cmd,
+		   (unsigned long) cmd.response + sizeof resp,
+		   in_len - sizeof cmd, out_len - sizeof resp);
+
+	uobj = kmalloc(sizeof *uobj, GFP_KERNEL);
+	if (!uobj)
+		return -ENOMEM;
+
+	init_uobj(uobj, 0, file->ucontext, &pd_lock_class);
+	down_write(&uobj->mutex);
+
+	pd = file->device->ib_dev->alloc_pd(file->device->ib_dev,
+					    file->ucontext, &udata);
+	if (IS_ERR(pd)) {
+		ret = PTR_ERR(pd);
+		goto err;
+	}
+
+	pd->device  = file->device->ib_dev;
+	pd->uobject = uobj;
+	atomic_set(&pd->usecnt, 0);
+
+	uobj->object = pd;
+	ret = idr_add_uobj(&ib_uverbs_pd_idr, uobj);
+	if (ret)
+		goto err_idr;
+
+	memset(&resp, 0, sizeof resp);
+	resp.pd_handle = uobj->id;
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+			 &resp, sizeof resp)) {
+		ret = -EFAULT;
+		goto err_copy;
+	}
+
+	mutex_lock(&file->mutex);
+	list_add_tail(&uobj->list, &file->ucontext->pd_list);
+	mutex_unlock(&file->mutex);
+
+	uobj->live = 1;
+
+	up_write(&uobj->mutex);
+
+	return in_len;
+
+err_copy:
+	idr_remove_uobj(&ib_uverbs_pd_idr, uobj);
+
+err_idr:
+	ib_dealloc_pd(pd);
+
+err:
+	put_uobj_write(uobj);
+	return ret;
+}
+
+ssize_t ib_uverbs_dealloc_pd(struct ib_uverbs_file *file,
+			     const char __user *buf,
+			     int in_len, int out_len)
+{
+	struct ib_uverbs_dealloc_pd cmd;
+	struct ib_uobject          *uobj;
+	int                         ret;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	uobj = idr_write_uobj(&ib_uverbs_pd_idr, cmd.pd_handle, file->ucontext);
+	if (!uobj)
+		return -EINVAL;
+
+	ret = ib_dealloc_pd(uobj->object);
+	if (!ret)
+		uobj->live = 0;
+
+	put_uobj_write(uobj);
+
+	if (ret)
+		return ret;
+
+	idr_remove_uobj(&ib_uverbs_pd_idr, uobj);
+
+	mutex_lock(&file->mutex);
+	list_del(&uobj->list);
+	mutex_unlock(&file->mutex);
+
+	put_uobj(uobj);
+
+	return in_len;
+}
+
+struct xrcd_table_entry {
+	struct rb_node  node;
+	struct ib_xrcd *xrcd;
+	struct inode   *inode;
+};
+
+static int xrcd_table_insert(struct ib_uverbs_device *dev,
+			    struct inode *inode,
+			    struct ib_xrcd *xrcd)
+{
+	struct xrcd_table_entry *entry, *scan;
+	struct rb_node **p = &dev->xrcd_tree.rb_node;
+	struct rb_node *parent = NULL;
+
+	entry = kmalloc(sizeof *entry, GFP_KERNEL);
+	if (!entry)
+		return -ENOMEM;
+
+	entry->xrcd  = xrcd;
+	entry->inode = inode;
+
+	while (*p) {
+		parent = *p;
+		scan = rb_entry(parent, struct xrcd_table_entry, node);
+
+		if (inode < scan->inode) {
+			p = &(*p)->rb_left;
+		} else if (inode > scan->inode) {
+			p = &(*p)->rb_right;
+		} else {
+			kfree(entry);
+			return -EEXIST;
+		}
+	}
+
+	rb_link_node(&entry->node, parent, p);
+	rb_insert_color(&entry->node, &dev->xrcd_tree);
+	igrab(inode);
+	return 0;
+}
+
+static struct xrcd_table_entry *xrcd_table_search(struct ib_uverbs_device *dev,
+						  struct inode *inode)
+{
+	struct xrcd_table_entry *entry;
+	struct rb_node *p = dev->xrcd_tree.rb_node;
+
+	while (p) {
+		entry = rb_entry(p, struct xrcd_table_entry, node);
+
+		if (inode < entry->inode)
+			p = p->rb_left;
+		else if (inode > entry->inode)
+			p = p->rb_right;
+		else
+			return entry;
+	}
+
+	return NULL;
+}
+
+static struct ib_xrcd *find_xrcd(struct ib_uverbs_device *dev, struct inode *inode)
+{
+	struct xrcd_table_entry *entry;
+
+	entry = xrcd_table_search(dev, inode);
+	if (!entry)
+		return NULL;
+
+	return entry->xrcd;
+}
+
+static void xrcd_table_delete(struct ib_uverbs_device *dev,
+			      struct inode *inode)
+{
+	struct xrcd_table_entry *entry;
+
+	entry = xrcd_table_search(dev, inode);
+	if (entry) {
+		iput(inode);
+		rb_erase(&entry->node, &dev->xrcd_tree);
+		kfree(entry);
+	}
+}
+
+ssize_t ib_uverbs_open_xrcd(struct ib_uverbs_file *file,
+			    const char __user *buf, int in_len,
+			    int out_len)
+{
+	struct ib_uverbs_open_xrcd	cmd;
+	struct ib_uverbs_open_xrcd_resp	resp;
+	struct ib_udata			udata;
+	struct ib_uxrcd_object         *obj;
+	struct ib_xrcd                 *xrcd = NULL;
+	struct fd			f = {NULL, 0};
+	struct inode                   *inode = NULL;
+	int				ret = 0;
+	int				new_xrcd = 0;
+
+	if (out_len < sizeof resp)
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	INIT_UDATA(&udata, buf + sizeof cmd,
+		   (unsigned long) cmd.response + sizeof resp,
+		   in_len - sizeof cmd, out_len - sizeof  resp);
+
+	mutex_lock(&file->device->xrcd_tree_mutex);
+
+	if (cmd.fd != -1) {
+		/* search for file descriptor */
+		f = fdget(cmd.fd);
+		if (!f.file) {
+			ret = -EBADF;
+			goto err_tree_mutex_unlock;
+		}
+
+		inode = file_inode(f.file);
+		xrcd = find_xrcd(file->device, inode);
+		if (!xrcd && !(cmd.oflags & O_CREAT)) {
+			/* no file descriptor. Need CREATE flag */
+			ret = -EAGAIN;
+			goto err_tree_mutex_unlock;
+		}
+
+		if (xrcd && cmd.oflags & O_EXCL) {
+			ret = -EINVAL;
+			goto err_tree_mutex_unlock;
+		}
+	}
+
+	obj = kmalloc(sizeof *obj, GFP_KERNEL);
+	if (!obj) {
+		ret = -ENOMEM;
+		goto err_tree_mutex_unlock;
+	}
+
+	init_uobj(&obj->uobject, 0, file->ucontext, &xrcd_lock_class);
+
+	down_write(&obj->uobject.mutex);
+
+	if (!xrcd) {
+		xrcd = file->device->ib_dev->alloc_xrcd(file->device->ib_dev,
+							file->ucontext, &udata);
+		if (IS_ERR(xrcd)) {
+			ret = PTR_ERR(xrcd);
+			goto err;
+		}
+
+		xrcd->inode   = inode;
+		xrcd->device  = file->device->ib_dev;
+		atomic_set(&xrcd->usecnt, 0);
+		mutex_init(&xrcd->tgt_qp_mutex);
+		INIT_LIST_HEAD(&xrcd->tgt_qp_list);
+		new_xrcd = 1;
+	}
+
+	atomic_set(&obj->refcnt, 0);
+	obj->uobject.object = xrcd;
+	ret = idr_add_uobj(&ib_uverbs_xrcd_idr, &obj->uobject);
+	if (ret)
+		goto err_idr;
+
+	memset(&resp, 0, sizeof resp);
+	resp.xrcd_handle = obj->uobject.id;
+
+	if (inode) {
+		if (new_xrcd) {
+			/* create new inode/xrcd table entry */
+			ret = xrcd_table_insert(file->device, inode, xrcd);
+			if (ret)
+				goto err_insert_xrcd;
+		}
+		atomic_inc(&xrcd->usecnt);
+	}
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+			 &resp, sizeof resp)) {
+		ret = -EFAULT;
+		goto err_copy;
+	}
+
+	if (f.file)
+		fdput(f);
+
+	mutex_lock(&file->mutex);
+	list_add_tail(&obj->uobject.list, &file->ucontext->xrcd_list);
+	mutex_unlock(&file->mutex);
+
+	obj->uobject.live = 1;
+	up_write(&obj->uobject.mutex);
+
+	mutex_unlock(&file->device->xrcd_tree_mutex);
+	return in_len;
+
+err_copy:
+	if (inode) {
+		if (new_xrcd)
+			xrcd_table_delete(file->device, inode);
+		atomic_dec(&xrcd->usecnt);
+	}
+
+err_insert_xrcd:
+	idr_remove_uobj(&ib_uverbs_xrcd_idr, &obj->uobject);
+
+err_idr:
+	ib_dealloc_xrcd(xrcd);
+
+err:
+	put_uobj_write(&obj->uobject);
+
+err_tree_mutex_unlock:
+	if (f.file)
+		fdput(f);
+
+	mutex_unlock(&file->device->xrcd_tree_mutex);
+
+	return ret;
+}
+
+ssize_t ib_uverbs_close_xrcd(struct ib_uverbs_file *file,
+			     const char __user *buf, int in_len,
+			     int out_len)
+{
+	struct ib_uverbs_close_xrcd cmd;
+	struct ib_uobject           *uobj;
+	struct ib_xrcd              *xrcd = NULL;
+	struct inode                *inode = NULL;
+	struct ib_uxrcd_object      *obj;
+	int                         live;
+	int                         ret = 0;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	mutex_lock(&file->device->xrcd_tree_mutex);
+	uobj = idr_write_uobj(&ib_uverbs_xrcd_idr, cmd.xrcd_handle, file->ucontext);
+	if (!uobj) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	xrcd  = uobj->object;
+	inode = xrcd->inode;
+	obj   = container_of(uobj, struct ib_uxrcd_object, uobject);
+	if (atomic_read(&obj->refcnt)) {
+		put_uobj_write(uobj);
+		ret = -EBUSY;
+		goto out;
+	}
+
+	if (!inode || atomic_dec_and_test(&xrcd->usecnt)) {
+		ret = ib_dealloc_xrcd(uobj->object);
+		if (!ret)
+			uobj->live = 0;
+	}
+
+	live = uobj->live;
+	if (inode && ret)
+		atomic_inc(&xrcd->usecnt);
+
+	put_uobj_write(uobj);
+
+	if (ret)
+		goto out;
+
+	if (inode && !live)
+		xrcd_table_delete(file->device, inode);
+
+	idr_remove_uobj(&ib_uverbs_xrcd_idr, uobj);
+	mutex_lock(&file->mutex);
+	list_del(&uobj->list);
+	mutex_unlock(&file->mutex);
+
+	put_uobj(uobj);
+	ret = in_len;
+
+out:
+	mutex_unlock(&file->device->xrcd_tree_mutex);
+	return ret;
+}
+
+void ib_uverbs_dealloc_xrcd(struct ib_uverbs_device *dev,
+			    struct ib_xrcd *xrcd)
+{
+	struct inode *inode;
+
+	inode = xrcd->inode;
+	if (inode && !atomic_dec_and_test(&xrcd->usecnt))
+		return;
+
+	ib_dealloc_xrcd(xrcd);
+
+	if (inode)
+		xrcd_table_delete(dev, inode);
+}
+
+ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file,
+			 const char __user *buf, int in_len,
+			 int out_len)
+{
+	struct ib_uverbs_reg_mr      cmd;
+	struct ib_uverbs_reg_mr_resp resp;
+	struct ib_udata              udata;
+	struct ib_uobject           *uobj;
+	struct ib_pd                *pd;
+	struct ib_mr                *mr;
+	int                          ret;
+
+	if (out_len < sizeof resp)
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	INIT_UDATA(&udata, buf + sizeof cmd,
+		   (unsigned long) cmd.response + sizeof resp,
+		   in_len - sizeof cmd, out_len - sizeof resp);
+
+	if ((cmd.start & ~PAGE_MASK) != (cmd.hca_va & ~PAGE_MASK))
+		return -EINVAL;
+
+	ret = ib_check_mr_access(cmd.access_flags);
+	if (ret)
+		return ret;
+
+	uobj = kmalloc(sizeof *uobj, GFP_KERNEL);
+	if (!uobj)
+		return -ENOMEM;
+
+	init_uobj(uobj, 0, file->ucontext, &mr_lock_class);
+	down_write(&uobj->mutex);
+
+	pd = idr_read_pd(cmd.pd_handle, file->ucontext);
+	if (!pd) {
+		ret = -EINVAL;
+		goto err_free;
+	}
+
+	mr = pd->device->reg_user_mr(pd, cmd.start, cmd.length, cmd.hca_va,
+				     cmd.access_flags, &udata);
+	if (IS_ERR(mr)) {
+		ret = PTR_ERR(mr);
+		goto err_put;
+	}
+
+	mr->device  = pd->device;
+	mr->pd      = pd;
+	mr->uobject = uobj;
+	atomic_inc(&pd->usecnt);
+	atomic_set(&mr->usecnt, 0);
+
+	uobj->object = mr;
+	ret = idr_add_uobj(&ib_uverbs_mr_idr, uobj);
+	if (ret)
+		goto err_unreg;
+
+	memset(&resp, 0, sizeof resp);
+	resp.lkey      = mr->lkey;
+	resp.rkey      = mr->rkey;
+	resp.mr_handle = uobj->id;
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+			 &resp, sizeof resp)) {
+		ret = -EFAULT;
+		goto err_copy;
+	}
+
+	put_pd_read(pd);
+
+	mutex_lock(&file->mutex);
+	list_add_tail(&uobj->list, &file->ucontext->mr_list);
+	mutex_unlock(&file->mutex);
+
+	uobj->live = 1;
+
+	up_write(&uobj->mutex);
+
+	return in_len;
+
+err_copy:
+	idr_remove_uobj(&ib_uverbs_mr_idr, uobj);
+
+err_unreg:
+	ib_dereg_mr(mr);
+
+err_put:
+	put_pd_read(pd);
+
+err_free:
+	put_uobj_write(uobj);
+	return ret;
+}
+
+ssize_t ib_uverbs_rereg_mr(struct ib_uverbs_file *file,
+			   const char __user *buf, int in_len,
+			   int out_len)
+{
+	struct ib_uverbs_rereg_mr      cmd;
+	struct ib_uverbs_rereg_mr_resp resp;
+	struct ib_udata              udata;
+	struct ib_pd                *pd = NULL;
+	struct ib_mr                *mr;
+	struct ib_pd		    *old_pd;
+	int                          ret;
+	struct ib_uobject	    *uobj;
+
+	if (out_len < sizeof(resp))
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, buf, sizeof(cmd)))
+		return -EFAULT;
+
+	INIT_UDATA(&udata, buf + sizeof(cmd),
+		   (unsigned long) cmd.response + sizeof(resp),
+		   in_len - sizeof(cmd), out_len - sizeof(resp));
+
+	if (cmd.flags & ~IB_MR_REREG_SUPPORTED || !cmd.flags)
+		return -EINVAL;
+
+	if ((cmd.flags & IB_MR_REREG_TRANS) &&
+	    (!cmd.start || !cmd.hca_va || 0 >= cmd.length ||
+	     (cmd.start & ~PAGE_MASK) != (cmd.hca_va & ~PAGE_MASK)))
+			return -EINVAL;
+
+	uobj = idr_write_uobj(&ib_uverbs_mr_idr, cmd.mr_handle,
+			      file->ucontext);
+
+	if (!uobj)
+		return -EINVAL;
+
+	mr = uobj->object;
+
+	if (cmd.flags & IB_MR_REREG_ACCESS) {
+		ret = ib_check_mr_access(cmd.access_flags);
+		if (ret)
+			goto put_uobjs;
+	}
+
+	if (cmd.flags & IB_MR_REREG_PD) {
+		pd = idr_read_pd(cmd.pd_handle, file->ucontext);
+		if (!pd) {
+			ret = -EINVAL;
+			goto put_uobjs;
+		}
+	}
+
+	if (atomic_read(&mr->usecnt)) {
+		ret = -EBUSY;
+		goto put_uobj_pd;
+	}
+
+	old_pd = mr->pd;
+	ret = mr->device->rereg_user_mr(mr, cmd.flags, cmd.start,
+					cmd.length, cmd.hca_va,
+					cmd.access_flags, pd, &udata);
+	if (!ret) {
+		if (cmd.flags & IB_MR_REREG_PD) {
+			atomic_inc(&pd->usecnt);
+			mr->pd = pd;
+			atomic_dec(&old_pd->usecnt);
+		}
+	} else {
+		goto put_uobj_pd;
+	}
+
+	memset(&resp, 0, sizeof(resp));
+	resp.lkey      = mr->lkey;
+	resp.rkey      = mr->rkey;
+
+	if (copy_to_user((void __user *)(unsigned long)cmd.response,
+			 &resp, sizeof(resp)))
+		ret = -EFAULT;
+	else
+		ret = in_len;
+
+put_uobj_pd:
+	if (cmd.flags & IB_MR_REREG_PD)
+		put_pd_read(pd);
+
+put_uobjs:
+
+	put_uobj_write(mr->uobject);
+
+	return ret;
+}
+
+ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file,
+			   const char __user *buf, int in_len,
+			   int out_len)
+{
+	struct ib_uverbs_dereg_mr cmd;
+	struct ib_mr             *mr;
+	struct ib_uobject	 *uobj;
+	int                       ret = -EINVAL;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	uobj = idr_write_uobj(&ib_uverbs_mr_idr, cmd.mr_handle, file->ucontext);
+	if (!uobj)
+		return -EINVAL;
+
+	mr = uobj->object;
+
+	ret = ib_dereg_mr(mr);
+	if (!ret)
+		uobj->live = 0;
+
+	put_uobj_write(uobj);
+
+	if (ret)
+		return ret;
+
+	idr_remove_uobj(&ib_uverbs_mr_idr, uobj);
+
+	mutex_lock(&file->mutex);
+	list_del(&uobj->list);
+	mutex_unlock(&file->mutex);
+
+	put_uobj(uobj);
+
+	return in_len;
+}
+
+ssize_t ib_uverbs_alloc_mw(struct ib_uverbs_file *file,
+			 const char __user *buf, int in_len,
+			 int out_len)
+{
+	struct ib_uverbs_alloc_mw      cmd;
+	struct ib_uverbs_alloc_mw_resp resp;
+	struct ib_uobject             *uobj;
+	struct ib_pd                  *pd;
+	struct ib_mw                  *mw;
+	int                            ret;
+
+	if (out_len < sizeof(resp))
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, buf, sizeof(cmd)))
+		return -EFAULT;
+
+	uobj = kmalloc(sizeof(*uobj), GFP_KERNEL);
+	if (!uobj)
+		return -ENOMEM;
+
+	init_uobj(uobj, 0, file->ucontext, &mw_lock_class);
+	down_write(&uobj->mutex);
+
+	pd = idr_read_pd(cmd.pd_handle, file->ucontext);
+	if (!pd) {
+		ret = -EINVAL;
+		goto err_free;
+	}
+
+	mw = pd->device->alloc_mw(pd, cmd.mw_type);
+	if (IS_ERR(mw)) {
+		ret = PTR_ERR(mw);
+		goto err_put;
+	}
+
+	mw->device  = pd->device;
+	mw->pd      = pd;
+	mw->uobject = uobj;
+	atomic_inc(&pd->usecnt);
+
+	uobj->object = mw;
+	ret = idr_add_uobj(&ib_uverbs_mw_idr, uobj);
+	if (ret)
+		goto err_unalloc;
+
+	memset(&resp, 0, sizeof(resp));
+	resp.rkey      = mw->rkey;
+	resp.mw_handle = uobj->id;
+
+	if (copy_to_user((void __user *)(unsigned long)cmd.response,
+			 &resp, sizeof(resp))) {
+		ret = -EFAULT;
+		goto err_copy;
+	}
+
+	put_pd_read(pd);
+
+	mutex_lock(&file->mutex);
+	list_add_tail(&uobj->list, &file->ucontext->mw_list);
+	mutex_unlock(&file->mutex);
+
+	uobj->live = 1;
+
+	up_write(&uobj->mutex);
+
+	return in_len;
+
+err_copy:
+	idr_remove_uobj(&ib_uverbs_mw_idr, uobj);
+
+err_unalloc:
+	ib_dealloc_mw(mw);
+
+err_put:
+	put_pd_read(pd);
+
+err_free:
+	put_uobj_write(uobj);
+	return ret;
+}
+
+ssize_t ib_uverbs_dealloc_mw(struct ib_uverbs_file *file,
+			   const char __user *buf, int in_len,
+			   int out_len)
+{
+	struct ib_uverbs_dealloc_mw cmd;
+	struct ib_mw               *mw;
+	struct ib_uobject	   *uobj;
+	int                         ret = -EINVAL;
+
+	if (copy_from_user(&cmd, buf, sizeof(cmd)))
+		return -EFAULT;
+
+	uobj = idr_write_uobj(&ib_uverbs_mw_idr, cmd.mw_handle, file->ucontext);
+	if (!uobj)
+		return -EINVAL;
+
+	mw = uobj->object;
+
+	ret = ib_dealloc_mw(mw);
+	if (!ret)
+		uobj->live = 0;
+
+	put_uobj_write(uobj);
+
+	if (ret)
+		return ret;
+
+	idr_remove_uobj(&ib_uverbs_mw_idr, uobj);
+
+	mutex_lock(&file->mutex);
+	list_del(&uobj->list);
+	mutex_unlock(&file->mutex);
+
+	put_uobj(uobj);
+
+	return in_len;
+}
+
+ssize_t ib_uverbs_create_comp_channel(struct ib_uverbs_file *file,
+				      const char __user *buf, int in_len,
+				      int out_len)
+{
+	struct ib_uverbs_create_comp_channel	   cmd;
+	struct ib_uverbs_create_comp_channel_resp  resp;
+	struct file				  *filp;
+	int ret;
+
+	if (out_len < sizeof resp)
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	ret = get_unused_fd_flags(O_CLOEXEC);
+	if (ret < 0)
+		return ret;
+	resp.fd = ret;
+
+	filp = ib_uverbs_alloc_event_file(file, 0);
+	if (IS_ERR(filp)) {
+		put_unused_fd(resp.fd);
+		return PTR_ERR(filp);
+	}
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+			 &resp, sizeof resp)) {
+		put_unused_fd(resp.fd);
+		fput(filp);
+		return -EFAULT;
+	}
+
+	fd_install(resp.fd, filp);
+	return in_len;
+}
+
+ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file,
+			    const char __user *buf, int in_len,
+			    int out_len)
+{
+	struct ib_uverbs_create_cq      cmd;
+	struct ib_uverbs_create_cq_resp resp;
+	struct ib_udata                 udata;
+	struct ib_ucq_object           *obj;
+	struct ib_uverbs_event_file    *ev_file = NULL;
+	struct ib_cq                   *cq;
+	int                             ret;
+
+	if (out_len < sizeof resp)
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	INIT_UDATA(&udata, buf + sizeof cmd,
+		   (unsigned long) cmd.response + sizeof resp,
+		   in_len - sizeof cmd, out_len - sizeof resp);
+
+	if (cmd.comp_vector >= file->device->num_comp_vectors)
+		return -EINVAL;
+
+	obj = kmalloc(sizeof *obj, GFP_KERNEL);
+	if (!obj)
+		return -ENOMEM;
+
+	init_uobj(&obj->uobject, cmd.user_handle, file->ucontext, &cq_lock_class);
+	down_write(&obj->uobject.mutex);
+
+	if (cmd.comp_channel >= 0) {
+		ev_file = ib_uverbs_lookup_comp_file(cmd.comp_channel);
+		if (!ev_file) {
+			ret = -EINVAL;
+			goto err;
+		}
+	}
+
+	obj->uverbs_file	   = file;
+	obj->comp_events_reported  = 0;
+	obj->async_events_reported = 0;
+	INIT_LIST_HEAD(&obj->comp_list);
+	INIT_LIST_HEAD(&obj->async_list);
+
+	cq = file->device->ib_dev->create_cq(file->device->ib_dev, cmd.cqe,
+					     cmd.comp_vector,
+					     file->ucontext, &udata);
+	if (IS_ERR(cq)) {
+		ret = PTR_ERR(cq);
+		goto err_file;
+	}
+
+	cq->device        = file->device->ib_dev;
+	cq->uobject       = &obj->uobject;
+	cq->comp_handler  = ib_uverbs_comp_handler;
+	cq->event_handler = ib_uverbs_cq_event_handler;
+	cq->cq_context    = ev_file;
+	atomic_set(&cq->usecnt, 0);
+
+	obj->uobject.object = cq;
+	ret = idr_add_uobj(&ib_uverbs_cq_idr, &obj->uobject);
+	if (ret)
+		goto err_free;
+
+	memset(&resp, 0, sizeof resp);
+	resp.cq_handle = obj->uobject.id;
+	resp.cqe       = cq->cqe;
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+			 &resp, sizeof resp)) {
+		ret = -EFAULT;
+		goto err_copy;
+	}
+
+	mutex_lock(&file->mutex);
+	list_add_tail(&obj->uobject.list, &file->ucontext->cq_list);
+	mutex_unlock(&file->mutex);
+
+	obj->uobject.live = 1;
+
+	up_write(&obj->uobject.mutex);
+
+	return in_len;
+
+err_copy:
+	idr_remove_uobj(&ib_uverbs_cq_idr, &obj->uobject);
+
+err_free:
+	ib_destroy_cq(cq);
+
+err_file:
+	if (ev_file)
+		ib_uverbs_release_ucq(file, ev_file, obj);
+
+err:
+	put_uobj_write(&obj->uobject);
+	return ret;
+}
+
+ssize_t ib_uverbs_resize_cq(struct ib_uverbs_file *file,
+			    const char __user *buf, int in_len,
+			    int out_len)
+{
+	struct ib_uverbs_resize_cq	cmd;
+	struct ib_uverbs_resize_cq_resp	resp;
+	struct ib_udata                 udata;
+	struct ib_cq			*cq;
+	int				ret = -EINVAL;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	INIT_UDATA(&udata, buf + sizeof cmd,
+		   (unsigned long) cmd.response + sizeof resp,
+		   in_len - sizeof cmd, out_len - sizeof resp);
+
+	cq = idr_read_cq(cmd.cq_handle, file->ucontext, 0);
+	if (!cq)
+		return -EINVAL;
+
+	ret = cq->device->resize_cq(cq, cmd.cqe, &udata);
+	if (ret)
+		goto out;
+
+	resp.cqe = cq->cqe;
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+			 &resp, sizeof resp.cqe))
+		ret = -EFAULT;
+
+out:
+	put_cq_read(cq);
+
+	return ret ? ret : in_len;
+}
+
+static int copy_wc_to_user(void __user *dest, struct ib_wc *wc)
+{
+	struct ib_uverbs_wc tmp;
+
+	tmp.wr_id		= wc->wr_id;
+	tmp.status		= wc->status;
+	tmp.opcode		= wc->opcode;
+	tmp.vendor_err		= wc->vendor_err;
+	tmp.byte_len		= wc->byte_len;
+	tmp.ex.imm_data		= (__u32 __force) wc->ex.imm_data;
+	tmp.qp_num		= wc->qp->qp_num;
+	tmp.src_qp		= wc->src_qp;
+	tmp.wc_flags		= wc->wc_flags;
+	tmp.pkey_index		= wc->pkey_index;
+	tmp.slid		= wc->slid;
+	tmp.sl			= wc->sl;
+	tmp.dlid_path_bits	= wc->dlid_path_bits;
+	tmp.port_num		= wc->port_num;
+	tmp.reserved		= 0;
+
+	if (copy_to_user(dest, &tmp, sizeof tmp))
+		return -EFAULT;
+
+	return 0;
+}
+
+ssize_t ib_uverbs_poll_cq(struct ib_uverbs_file *file,
+			  const char __user *buf, int in_len,
+			  int out_len)
+{
+	struct ib_uverbs_poll_cq       cmd;
+	struct ib_uverbs_poll_cq_resp  resp;
+	u8 __user                     *header_ptr;
+	u8 __user                     *data_ptr;
+	struct ib_cq                  *cq;
+	struct ib_wc                   wc;
+	int                            ret;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	cq = idr_read_cq(cmd.cq_handle, file->ucontext, 0);
+	if (!cq)
+		return -EINVAL;
+
+	/* we copy a struct ib_uverbs_poll_cq_resp to user space */
+	header_ptr = (void __user *)(unsigned long) cmd.response;
+	data_ptr = header_ptr + sizeof resp;
+
+	memset(&resp, 0, sizeof resp);
+	while (resp.count < cmd.ne) {
+		ret = ib_poll_cq(cq, 1, &wc);
+		if (ret < 0)
+			goto out_put;
+		if (!ret)
+			break;
+
+		ret = copy_wc_to_user(data_ptr, &wc);
+		if (ret)
+			goto out_put;
+
+		data_ptr += sizeof(struct ib_uverbs_wc);
+		++resp.count;
+	}
+
+	if (copy_to_user(header_ptr, &resp, sizeof resp)) {
+		ret = -EFAULT;
+		goto out_put;
+	}
+
+	ret = in_len;
+
+out_put:
+	put_cq_read(cq);
+	return ret;
+}
+
+ssize_t ib_uverbs_req_notify_cq(struct ib_uverbs_file *file,
+				const char __user *buf, int in_len,
+				int out_len)
+{
+	struct ib_uverbs_req_notify_cq cmd;
+	struct ib_cq                  *cq;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	cq = idr_read_cq(cmd.cq_handle, file->ucontext, 0);
+	if (!cq)
+		return -EINVAL;
+
+	ib_req_notify_cq(cq, cmd.solicited_only ?
+			 IB_CQ_SOLICITED : IB_CQ_NEXT_COMP);
+
+	put_cq_read(cq);
+
+	return in_len;
+}
+
+ssize_t ib_uverbs_destroy_cq(struct ib_uverbs_file *file,
+			     const char __user *buf, int in_len,
+			     int out_len)
+{
+	struct ib_uverbs_destroy_cq      cmd;
+	struct ib_uverbs_destroy_cq_resp resp;
+	struct ib_uobject		*uobj;
+	struct ib_cq               	*cq;
+	struct ib_ucq_object        	*obj;
+	struct ib_uverbs_event_file	*ev_file;
+	int                        	 ret = -EINVAL;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	uobj = idr_write_uobj(&ib_uverbs_cq_idr, cmd.cq_handle, file->ucontext);
+	if (!uobj)
+		return -EINVAL;
+	cq      = uobj->object;
+	ev_file = cq->cq_context;
+	obj     = container_of(cq->uobject, struct ib_ucq_object, uobject);
+
+	ret = ib_destroy_cq(cq);
+	if (!ret)
+		uobj->live = 0;
+
+	put_uobj_write(uobj);
+
+	if (ret)
+		return ret;
+
+	idr_remove_uobj(&ib_uverbs_cq_idr, uobj);
+
+	mutex_lock(&file->mutex);
+	list_del(&uobj->list);
+	mutex_unlock(&file->mutex);
+
+	ib_uverbs_release_ucq(file, ev_file, obj);
+
+	memset(&resp, 0, sizeof resp);
+	resp.comp_events_reported  = obj->comp_events_reported;
+	resp.async_events_reported = obj->async_events_reported;
+
+	put_uobj(uobj);
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+			 &resp, sizeof resp))
+		return -EFAULT;
+
+	return in_len;
+}
+
+ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file,
+			    const char __user *buf, int in_len,
+			    int out_len)
+{
+	struct ib_uverbs_create_qp      cmd;
+	struct ib_uverbs_create_qp_resp resp;
+	struct ib_udata                 udata;
+	struct ib_uqp_object           *obj;
+	struct ib_device	       *device;
+	struct ib_pd                   *pd = NULL;
+	struct ib_xrcd		       *xrcd = NULL;
+	struct ib_uobject	       *uninitialized_var(xrcd_uobj);
+	struct ib_cq                   *scq = NULL, *rcq = NULL;
+	struct ib_srq                  *srq = NULL;
+	struct ib_qp                   *qp;
+	struct ib_qp_init_attr          attr;
+	int ret;
+
+	if (out_len < sizeof resp)
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	if (cmd.qp_type == IB_QPT_RAW_PACKET && !capable(CAP_NET_RAW))
+		return -EPERM;
+
+	INIT_UDATA(&udata, buf + sizeof cmd,
+		   (unsigned long) cmd.response + sizeof resp,
+		   in_len - sizeof cmd, out_len - sizeof resp);
+
+	obj = kzalloc(sizeof *obj, GFP_KERNEL);
+	if (!obj)
+		return -ENOMEM;
+
+	init_uobj(&obj->uevent.uobject, cmd.user_handle, file->ucontext, &qp_lock_class);
+	down_write(&obj->uevent.uobject.mutex);
+
+	if (cmd.qp_type == IB_QPT_XRC_TGT) {
+		xrcd = idr_read_xrcd(cmd.pd_handle, file->ucontext, &xrcd_uobj);
+		if (!xrcd) {
+			ret = -EINVAL;
+			goto err_put;
+		}
+		device = xrcd->device;
+	} else {
+		if (cmd.qp_type == IB_QPT_XRC_INI) {
+			cmd.max_recv_wr = cmd.max_recv_sge = 0;
+		} else {
+			if (cmd.is_srq) {
+				srq = idr_read_srq(cmd.srq_handle, file->ucontext);
+				if (!srq || srq->srq_type != IB_SRQT_BASIC) {
+					ret = -EINVAL;
+					goto err_put;
+				}
+			}
+
+			if (cmd.recv_cq_handle != cmd.send_cq_handle) {
+				rcq = idr_read_cq(cmd.recv_cq_handle, file->ucontext, 0);
+				if (!rcq) {
+					ret = -EINVAL;
+					goto err_put;
+				}
+			}
+		}
+
+		scq = idr_read_cq(cmd.send_cq_handle, file->ucontext, !!rcq);
+		rcq = rcq ?: scq;
+		pd  = idr_read_pd(cmd.pd_handle, file->ucontext);
+		if (!pd || !scq) {
+			ret = -EINVAL;
+			goto err_put;
+		}
+
+		device = pd->device;
+	}
+
+	attr.event_handler = ib_uverbs_qp_event_handler;
+	attr.qp_context    = file;
+	attr.send_cq       = scq;
+	attr.recv_cq       = rcq;
+	attr.srq           = srq;
+	attr.xrcd	   = xrcd;
+	attr.sq_sig_type   = cmd.sq_sig_all ? IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR;
+	attr.qp_type       = cmd.qp_type;
+	attr.create_flags  = 0;
+
+	attr.cap.max_send_wr     = cmd.max_send_wr;
+	attr.cap.max_recv_wr     = cmd.max_recv_wr;
+	attr.cap.max_send_sge    = cmd.max_send_sge;
+	attr.cap.max_recv_sge    = cmd.max_recv_sge;
+	attr.cap.max_inline_data = cmd.max_inline_data;
+
+	obj->uevent.events_reported     = 0;
+	INIT_LIST_HEAD(&obj->uevent.event_list);
+	INIT_LIST_HEAD(&obj->mcast_list);
+
+	if (cmd.qp_type == IB_QPT_XRC_TGT)
+		qp = ib_create_qp(pd, &attr);
+	else
+		qp = device->create_qp(pd, &attr, &udata);
+
+	if (IS_ERR(qp)) {
+		ret = PTR_ERR(qp);
+		goto err_put;
+	}
+
+	if (cmd.qp_type != IB_QPT_XRC_TGT) {
+		qp->real_qp	  = qp;
+		qp->device	  = device;
+		qp->pd		  = pd;
+		qp->send_cq	  = attr.send_cq;
+		qp->recv_cq	  = attr.recv_cq;
+		qp->srq		  = attr.srq;
+		qp->event_handler = attr.event_handler;
+		qp->qp_context	  = attr.qp_context;
+		qp->qp_type	  = attr.qp_type;
+		atomic_set(&qp->usecnt, 0);
+		atomic_inc(&pd->usecnt);
+		atomic_inc(&attr.send_cq->usecnt);
+		if (attr.recv_cq)
+			atomic_inc(&attr.recv_cq->usecnt);
+		if (attr.srq)
+			atomic_inc(&attr.srq->usecnt);
+	}
+	qp->uobject = &obj->uevent.uobject;
+
+	obj->uevent.uobject.object = qp;
+	ret = idr_add_uobj(&ib_uverbs_qp_idr, &obj->uevent.uobject);
+	if (ret)
+		goto err_destroy;
+
+	memset(&resp, 0, sizeof resp);
+	resp.qpn             = qp->qp_num;
+	resp.qp_handle       = obj->uevent.uobject.id;
+	resp.max_recv_sge    = attr.cap.max_recv_sge;
+	resp.max_send_sge    = attr.cap.max_send_sge;
+	resp.max_recv_wr     = attr.cap.max_recv_wr;
+	resp.max_send_wr     = attr.cap.max_send_wr;
+	resp.max_inline_data = attr.cap.max_inline_data;
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+			 &resp, sizeof resp)) {
+		ret = -EFAULT;
+		goto err_copy;
+	}
+
+	if (xrcd) {
+		obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object,
+					  uobject);
+		atomic_inc(&obj->uxrcd->refcnt);
+		put_xrcd_read(xrcd_uobj);
+	}
+
+	if (pd)
+		put_pd_read(pd);
+	if (scq)
+		put_cq_read(scq);
+	if (rcq && rcq != scq)
+		put_cq_read(rcq);
+	if (srq)
+		put_srq_read(srq);
+
+	mutex_lock(&file->mutex);
+	list_add_tail(&obj->uevent.uobject.list, &file->ucontext->qp_list);
+	mutex_unlock(&file->mutex);
+
+	obj->uevent.uobject.live = 1;
+
+	up_write(&obj->uevent.uobject.mutex);
+
+	return in_len;
+
+err_copy:
+	idr_remove_uobj(&ib_uverbs_qp_idr, &obj->uevent.uobject);
+
+err_destroy:
+	ib_destroy_qp(qp);
+
+err_put:
+	if (xrcd)
+		put_xrcd_read(xrcd_uobj);
+	if (pd)
+		put_pd_read(pd);
+	if (scq)
+		put_cq_read(scq);
+	if (rcq && rcq != scq)
+		put_cq_read(rcq);
+	if (srq)
+		put_srq_read(srq);
+
+	put_uobj_write(&obj->uevent.uobject);
+	return ret;
+}
+
+ssize_t ib_uverbs_open_qp(struct ib_uverbs_file *file,
+			  const char __user *buf, int in_len, int out_len)
+{
+	struct ib_uverbs_open_qp        cmd;
+	struct ib_uverbs_create_qp_resp resp;
+	struct ib_udata                 udata;
+	struct ib_uqp_object           *obj;
+	struct ib_xrcd		       *xrcd;
+	struct ib_uobject	       *uninitialized_var(xrcd_uobj);
+	struct ib_qp                   *qp;
+	struct ib_qp_open_attr          attr;
+	int ret;
+
+	if (out_len < sizeof resp)
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	INIT_UDATA(&udata, buf + sizeof cmd,
+		   (unsigned long) cmd.response + sizeof resp,
+		   in_len - sizeof cmd, out_len - sizeof resp);
+
+	obj = kmalloc(sizeof *obj, GFP_KERNEL);
+	if (!obj)
+		return -ENOMEM;
+
+	init_uobj(&obj->uevent.uobject, cmd.user_handle, file->ucontext, &qp_lock_class);
+	down_write(&obj->uevent.uobject.mutex);
+
+	xrcd = idr_read_xrcd(cmd.pd_handle, file->ucontext, &xrcd_uobj);
+	if (!xrcd) {
+		ret = -EINVAL;
+		goto err_put;
+	}
+
+	attr.event_handler = ib_uverbs_qp_event_handler;
+	attr.qp_context    = file;
+	attr.qp_num        = cmd.qpn;
+	attr.qp_type       = cmd.qp_type;
+
+	obj->uevent.events_reported = 0;
+	INIT_LIST_HEAD(&obj->uevent.event_list);
+	INIT_LIST_HEAD(&obj->mcast_list);
+
+	qp = ib_open_qp(xrcd, &attr);
+	if (IS_ERR(qp)) {
+		ret = PTR_ERR(qp);
+		goto err_put;
+	}
+
+	qp->uobject = &obj->uevent.uobject;
+
+	obj->uevent.uobject.object = qp;
+	ret = idr_add_uobj(&ib_uverbs_qp_idr, &obj->uevent.uobject);
+	if (ret)
+		goto err_destroy;
+
+	memset(&resp, 0, sizeof resp);
+	resp.qpn       = qp->qp_num;
+	resp.qp_handle = obj->uevent.uobject.id;
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+			 &resp, sizeof resp)) {
+		ret = -EFAULT;
+		goto err_remove;
+	}
+
+	obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, uobject);
+	atomic_inc(&obj->uxrcd->refcnt);
+	put_xrcd_read(xrcd_uobj);
+
+	mutex_lock(&file->mutex);
+	list_add_tail(&obj->uevent.uobject.list, &file->ucontext->qp_list);
+	mutex_unlock(&file->mutex);
+
+	obj->uevent.uobject.live = 1;
+
+	up_write(&obj->uevent.uobject.mutex);
+
+	return in_len;
+
+err_remove:
+	idr_remove_uobj(&ib_uverbs_qp_idr, &obj->uevent.uobject);
+
+err_destroy:
+	ib_destroy_qp(qp);
+
+err_put:
+	put_xrcd_read(xrcd_uobj);
+	put_uobj_write(&obj->uevent.uobject);
+	return ret;
+}
+
+ssize_t ib_uverbs_query_qp(struct ib_uverbs_file *file,
+			   const char __user *buf, int in_len,
+			   int out_len)
+{
+	struct ib_uverbs_query_qp      cmd;
+	struct ib_uverbs_query_qp_resp resp;
+	struct ib_qp                   *qp;
+	struct ib_qp_attr              *attr;
+	struct ib_qp_init_attr         *init_attr;
+	int                            ret;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	attr      = kmalloc(sizeof *attr, GFP_KERNEL);
+	init_attr = kmalloc(sizeof *init_attr, GFP_KERNEL);
+	if (!attr || !init_attr) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	qp = idr_read_qp(cmd.qp_handle, file->ucontext);
+	if (!qp) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = ib_query_qp(qp, attr, cmd.attr_mask, init_attr);
+
+	put_qp_read(qp);
+
+	if (ret)
+		goto out;
+
+	memset(&resp, 0, sizeof resp);
+
+	resp.qp_state               = attr->qp_state;
+	resp.cur_qp_state           = attr->cur_qp_state;
+	resp.path_mtu               = attr->path_mtu;
+	resp.path_mig_state         = attr->path_mig_state;
+	resp.qkey                   = attr->qkey;
+	resp.rq_psn                 = attr->rq_psn;
+	resp.sq_psn                 = attr->sq_psn;
+	resp.dest_qp_num            = attr->dest_qp_num;
+	resp.qp_access_flags        = attr->qp_access_flags;
+	resp.pkey_index             = attr->pkey_index;
+	resp.alt_pkey_index         = attr->alt_pkey_index;
+	resp.sq_draining            = attr->sq_draining;
+	resp.max_rd_atomic          = attr->max_rd_atomic;
+	resp.max_dest_rd_atomic     = attr->max_dest_rd_atomic;
+	resp.min_rnr_timer          = attr->min_rnr_timer;
+	resp.port_num               = attr->port_num;
+	resp.timeout                = attr->timeout;
+	resp.retry_cnt              = attr->retry_cnt;
+	resp.rnr_retry              = attr->rnr_retry;
+	resp.alt_port_num           = attr->alt_port_num;
+	resp.alt_timeout            = attr->alt_timeout;
+
+	memcpy(resp.dest.dgid, attr->ah_attr.grh.dgid.raw, 16);
+	resp.dest.flow_label        = attr->ah_attr.grh.flow_label;
+	resp.dest.sgid_index        = attr->ah_attr.grh.sgid_index;
+	resp.dest.hop_limit         = attr->ah_attr.grh.hop_limit;
+	resp.dest.traffic_class     = attr->ah_attr.grh.traffic_class;
+	resp.dest.dlid              = attr->ah_attr.dlid;
+	resp.dest.sl                = attr->ah_attr.sl;
+	resp.dest.src_path_bits     = attr->ah_attr.src_path_bits;
+	resp.dest.static_rate       = attr->ah_attr.static_rate;
+	resp.dest.is_global         = !!(attr->ah_attr.ah_flags & IB_AH_GRH);
+	resp.dest.port_num          = attr->ah_attr.port_num;
+
+	memcpy(resp.alt_dest.dgid, attr->alt_ah_attr.grh.dgid.raw, 16);
+	resp.alt_dest.flow_label    = attr->alt_ah_attr.grh.flow_label;
+	resp.alt_dest.sgid_index    = attr->alt_ah_attr.grh.sgid_index;
+	resp.alt_dest.hop_limit     = attr->alt_ah_attr.grh.hop_limit;
+	resp.alt_dest.traffic_class = attr->alt_ah_attr.grh.traffic_class;
+	resp.alt_dest.dlid          = attr->alt_ah_attr.dlid;
+	resp.alt_dest.sl            = attr->alt_ah_attr.sl;
+	resp.alt_dest.src_path_bits = attr->alt_ah_attr.src_path_bits;
+	resp.alt_dest.static_rate   = attr->alt_ah_attr.static_rate;
+	resp.alt_dest.is_global     = !!(attr->alt_ah_attr.ah_flags & IB_AH_GRH);
+	resp.alt_dest.port_num      = attr->alt_ah_attr.port_num;
+
+	resp.max_send_wr            = init_attr->cap.max_send_wr;
+	resp.max_recv_wr            = init_attr->cap.max_recv_wr;
+	resp.max_send_sge           = init_attr->cap.max_send_sge;
+	resp.max_recv_sge           = init_attr->cap.max_recv_sge;
+	resp.max_inline_data        = init_attr->cap.max_inline_data;
+	resp.sq_sig_all             = init_attr->sq_sig_type == IB_SIGNAL_ALL_WR;
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+			 &resp, sizeof resp))
+		ret = -EFAULT;
+
+out:
+	kfree(attr);
+	kfree(init_attr);
+
+	return ret ? ret : in_len;
+}
+
+/* Remove ignored fields set in the attribute mask */
+static int modify_qp_mask(enum ib_qp_type qp_type, int mask)
+{
+	switch (qp_type) {
+	case IB_QPT_XRC_INI:
+		return mask & ~(IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER);
+	case IB_QPT_XRC_TGT:
+		return mask & ~(IB_QP_MAX_QP_RD_ATOMIC | IB_QP_RETRY_CNT |
+				IB_QP_RNR_RETRY);
+	default:
+		return mask;
+	}
+}
+
+ssize_t ib_uverbs_modify_qp(struct ib_uverbs_file *file,
+			    const char __user *buf, int in_len,
+			    int out_len)
+{
+	struct ib_uverbs_modify_qp cmd;
+	struct ib_udata            udata;
+	struct ib_qp              *qp;
+	struct ib_qp_attr         *attr;
+	int                        ret;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	INIT_UDATA(&udata, buf + sizeof cmd, NULL, in_len - sizeof cmd,
+		   out_len);
+
+	attr = kmalloc(sizeof *attr, GFP_KERNEL);
+	if (!attr)
+		return -ENOMEM;
+
+	qp = idr_read_qp(cmd.qp_handle, file->ucontext);
+	if (!qp) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	attr->qp_state 		  = cmd.qp_state;
+	attr->cur_qp_state 	  = cmd.cur_qp_state;
+	attr->path_mtu 		  = cmd.path_mtu;
+	attr->path_mig_state 	  = cmd.path_mig_state;
+	attr->qkey 		  = cmd.qkey;
+	attr->rq_psn 		  = cmd.rq_psn;
+	attr->sq_psn 		  = cmd.sq_psn;
+	attr->dest_qp_num 	  = cmd.dest_qp_num;
+	attr->qp_access_flags 	  = cmd.qp_access_flags;
+	attr->pkey_index 	  = cmd.pkey_index;
+	attr->alt_pkey_index 	  = cmd.alt_pkey_index;
+	attr->en_sqd_async_notify = cmd.en_sqd_async_notify;
+	attr->max_rd_atomic 	  = cmd.max_rd_atomic;
+	attr->max_dest_rd_atomic  = cmd.max_dest_rd_atomic;
+	attr->min_rnr_timer 	  = cmd.min_rnr_timer;
+	attr->port_num 		  = cmd.port_num;
+	attr->timeout 		  = cmd.timeout;
+	attr->retry_cnt 	  = cmd.retry_cnt;
+	attr->rnr_retry 	  = cmd.rnr_retry;
+	attr->alt_port_num 	  = cmd.alt_port_num;
+	attr->alt_timeout 	  = cmd.alt_timeout;
+
+	memcpy(attr->ah_attr.grh.dgid.raw, cmd.dest.dgid, 16);
+	attr->ah_attr.grh.flow_label        = cmd.dest.flow_label;
+	attr->ah_attr.grh.sgid_index        = cmd.dest.sgid_index;
+	attr->ah_attr.grh.hop_limit         = cmd.dest.hop_limit;
+	attr->ah_attr.grh.traffic_class     = cmd.dest.traffic_class;
+	attr->ah_attr.dlid 	    	    = cmd.dest.dlid;
+	attr->ah_attr.sl   	    	    = cmd.dest.sl;
+	attr->ah_attr.src_path_bits 	    = cmd.dest.src_path_bits;
+	attr->ah_attr.static_rate   	    = cmd.dest.static_rate;
+	attr->ah_attr.ah_flags 	    	    = cmd.dest.is_global ? IB_AH_GRH : 0;
+	attr->ah_attr.port_num 	    	    = cmd.dest.port_num;
+
+	memcpy(attr->alt_ah_attr.grh.dgid.raw, cmd.alt_dest.dgid, 16);
+	attr->alt_ah_attr.grh.flow_label    = cmd.alt_dest.flow_label;
+	attr->alt_ah_attr.grh.sgid_index    = cmd.alt_dest.sgid_index;
+	attr->alt_ah_attr.grh.hop_limit     = cmd.alt_dest.hop_limit;
+	attr->alt_ah_attr.grh.traffic_class = cmd.alt_dest.traffic_class;
+	attr->alt_ah_attr.dlid 	    	    = cmd.alt_dest.dlid;
+	attr->alt_ah_attr.sl   	    	    = cmd.alt_dest.sl;
+	attr->alt_ah_attr.src_path_bits     = cmd.alt_dest.src_path_bits;
+	attr->alt_ah_attr.static_rate       = cmd.alt_dest.static_rate;
+	attr->alt_ah_attr.ah_flags 	    = cmd.alt_dest.is_global ? IB_AH_GRH : 0;
+	attr->alt_ah_attr.port_num 	    = cmd.alt_dest.port_num;
+
+	if (qp->real_qp == qp) {
+		ret = ib_resolve_eth_l2_attrs(qp, attr, &cmd.attr_mask);
+		if (ret)
+			goto out;
+		ret = qp->device->modify_qp(qp, attr,
+			modify_qp_mask(qp->qp_type, cmd.attr_mask), &udata);
+	} else {
+		ret = ib_modify_qp(qp, attr, modify_qp_mask(qp->qp_type, cmd.attr_mask));
+	}
+
+	put_qp_read(qp);
+
+	if (ret)
+		goto out;
+
+	ret = in_len;
+
+out:
+	kfree(attr);
+
+	return ret;
+}
+
+ssize_t ib_uverbs_destroy_qp(struct ib_uverbs_file *file,
+			     const char __user *buf, int in_len,
+			     int out_len)
+{
+	struct ib_uverbs_destroy_qp      cmd;
+	struct ib_uverbs_destroy_qp_resp resp;
+	struct ib_uobject		*uobj;
+	struct ib_qp               	*qp;
+	struct ib_uqp_object        	*obj;
+	int                        	 ret = -EINVAL;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	memset(&resp, 0, sizeof resp);
+
+	uobj = idr_write_uobj(&ib_uverbs_qp_idr, cmd.qp_handle, file->ucontext);
+	if (!uobj)
+		return -EINVAL;
+	qp  = uobj->object;
+	obj = container_of(uobj, struct ib_uqp_object, uevent.uobject);
+
+	if (!list_empty(&obj->mcast_list)) {
+		put_uobj_write(uobj);
+		return -EBUSY;
+	}
+
+	ret = ib_destroy_qp(qp);
+	if (!ret)
+		uobj->live = 0;
+
+	put_uobj_write(uobj);
+
+	if (ret)
+		return ret;
+
+	if (obj->uxrcd)
+		atomic_dec(&obj->uxrcd->refcnt);
+
+	idr_remove_uobj(&ib_uverbs_qp_idr, uobj);
+
+	mutex_lock(&file->mutex);
+	list_del(&uobj->list);
+	mutex_unlock(&file->mutex);
+
+	ib_uverbs_release_uevent(file, &obj->uevent);
+
+	resp.events_reported = obj->uevent.events_reported;
+
+	put_uobj(uobj);
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+			 &resp, sizeof resp))
+		return -EFAULT;
+
+	return in_len;
+}
+
+ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file,
+			    const char __user *buf, int in_len,
+			    int out_len)
+{
+	struct ib_uverbs_post_send      cmd;
+	struct ib_uverbs_post_send_resp resp;
+	struct ib_uverbs_send_wr       *user_wr;
+	struct ib_send_wr              *wr = NULL, *last, *next, *bad_wr;
+	struct ib_qp                   *qp;
+	int                             i, sg_ind;
+	int				is_ud;
+	ssize_t                         ret = -EINVAL;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	if (in_len < sizeof cmd + cmd.wqe_size * cmd.wr_count +
+	    cmd.sge_count * sizeof (struct ib_uverbs_sge))
+		return -EINVAL;
+
+	if (cmd.wqe_size < sizeof (struct ib_uverbs_send_wr))
+		return -EINVAL;
+
+	user_wr = kmalloc(cmd.wqe_size, GFP_KERNEL);
+	if (!user_wr)
+		return -ENOMEM;
+
+	qp = idr_read_qp(cmd.qp_handle, file->ucontext);
+	if (!qp)
+		goto out;
+
+	is_ud = qp->qp_type == IB_QPT_UD;
+	sg_ind = 0;
+	last = NULL;
+	for (i = 0; i < cmd.wr_count; ++i) {
+		if (copy_from_user(user_wr,
+				   buf + sizeof cmd + i * cmd.wqe_size,
+				   cmd.wqe_size)) {
+			ret = -EFAULT;
+			goto out_put;
+		}
+
+		if (user_wr->num_sge + sg_ind > cmd.sge_count) {
+			ret = -EINVAL;
+			goto out_put;
+		}
+
+		next = kmalloc(ALIGN(sizeof *next, sizeof (struct ib_sge)) +
+			       user_wr->num_sge * sizeof (struct ib_sge),
+			       GFP_KERNEL);
+		if (!next) {
+			ret = -ENOMEM;
+			goto out_put;
+		}
+
+		if (!last)
+			wr = next;
+		else
+			last->next = next;
+		last = next;
+
+		next->next       = NULL;
+		next->wr_id      = user_wr->wr_id;
+		next->num_sge    = user_wr->num_sge;
+		next->opcode     = user_wr->opcode;
+		next->send_flags = user_wr->send_flags;
+
+		if (is_ud) {
+			next->wr.ud.ah = idr_read_ah(user_wr->wr.ud.ah,
+						     file->ucontext);
+			if (!next->wr.ud.ah) {
+				ret = -EINVAL;
+				goto out_put;
+			}
+			next->wr.ud.remote_qpn  = user_wr->wr.ud.remote_qpn;
+			next->wr.ud.remote_qkey = user_wr->wr.ud.remote_qkey;
+			if (next->opcode == IB_WR_SEND_WITH_IMM)
+				next->ex.imm_data =
+					(__be32 __force) user_wr->ex.imm_data;
+		} else {
+			switch (next->opcode) {
+			case IB_WR_RDMA_WRITE_WITH_IMM:
+				next->ex.imm_data =
+					(__be32 __force) user_wr->ex.imm_data;
+			case IB_WR_RDMA_WRITE:
+			case IB_WR_RDMA_READ:
+				next->wr.rdma.remote_addr =
+					user_wr->wr.rdma.remote_addr;
+				next->wr.rdma.rkey        =
+					user_wr->wr.rdma.rkey;
+				break;
+			case IB_WR_SEND_WITH_IMM:
+				next->ex.imm_data =
+					(__be32 __force) user_wr->ex.imm_data;
+				break;
+			case IB_WR_SEND_WITH_INV:
+				next->ex.invalidate_rkey =
+					user_wr->ex.invalidate_rkey;
+				break;
+			case IB_WR_ATOMIC_CMP_AND_SWP:
+			case IB_WR_ATOMIC_FETCH_AND_ADD:
+				next->wr.atomic.remote_addr =
+					user_wr->wr.atomic.remote_addr;
+				next->wr.atomic.compare_add =
+					user_wr->wr.atomic.compare_add;
+				next->wr.atomic.swap = user_wr->wr.atomic.swap;
+				next->wr.atomic.rkey = user_wr->wr.atomic.rkey;
+				break;
+			default:
+				break;
+			}
+		}
+
+		if (next->num_sge) {
+			next->sg_list = (void *) next +
+				ALIGN(sizeof *next, sizeof (struct ib_sge));
+			if (copy_from_user(next->sg_list,
+					   buf + sizeof cmd +
+					   cmd.wr_count * cmd.wqe_size +
+					   sg_ind * sizeof (struct ib_sge),
+					   next->num_sge * sizeof (struct ib_sge))) {
+				ret = -EFAULT;
+				goto out_put;
+			}
+			sg_ind += next->num_sge;
+		} else
+			next->sg_list = NULL;
+	}
+
+	resp.bad_wr = 0;
+	ret = qp->device->post_send(qp->real_qp, wr, &bad_wr);
+	if (ret)
+		for (next = wr; next; next = next->next) {
+			++resp.bad_wr;
+			if (next == bad_wr)
+				break;
+		}
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+			 &resp, sizeof resp))
+		ret = -EFAULT;
+
+out_put:
+	put_qp_read(qp);
+
+	while (wr) {
+		if (is_ud && wr->wr.ud.ah)
+			put_ah_read(wr->wr.ud.ah);
+		next = wr->next;
+		kfree(wr);
+		wr = next;
+	}
+
+out:
+	kfree(user_wr);
+
+	return ret ? ret : in_len;
+}
+
+static struct ib_recv_wr *ib_uverbs_unmarshall_recv(const char __user *buf,
+						    int in_len,
+						    u32 wr_count,
+						    u32 sge_count,
+						    u32 wqe_size)
+{
+	struct ib_uverbs_recv_wr *user_wr;
+	struct ib_recv_wr        *wr = NULL, *last, *next;
+	int                       sg_ind;
+	int                       i;
+	int                       ret;
+
+	if (in_len < wqe_size * wr_count +
+	    sge_count * sizeof (struct ib_uverbs_sge))
+		return ERR_PTR(-EINVAL);
+
+	if (wqe_size < sizeof (struct ib_uverbs_recv_wr))
+		return ERR_PTR(-EINVAL);
+
+	user_wr = kmalloc(wqe_size, GFP_KERNEL);
+	if (!user_wr)
+		return ERR_PTR(-ENOMEM);
+
+	sg_ind = 0;
+	last = NULL;
+	for (i = 0; i < wr_count; ++i) {
+		if (copy_from_user(user_wr, buf + i * wqe_size,
+				   wqe_size)) {
+			ret = -EFAULT;
+			goto err;
+		}
+
+		if (user_wr->num_sge + sg_ind > sge_count) {
+			ret = -EINVAL;
+			goto err;
+		}
+
+		next = kmalloc(ALIGN(sizeof *next, sizeof (struct ib_sge)) +
+			       user_wr->num_sge * sizeof (struct ib_sge),
+			       GFP_KERNEL);
+		if (!next) {
+			ret = -ENOMEM;
+			goto err;
+		}
+
+		if (!last)
+			wr = next;
+		else
+			last->next = next;
+		last = next;
+
+		next->next       = NULL;
+		next->wr_id      = user_wr->wr_id;
+		next->num_sge    = user_wr->num_sge;
+
+		if (next->num_sge) {
+			next->sg_list = (void *) next +
+				ALIGN(sizeof *next, sizeof (struct ib_sge));
+			if (copy_from_user(next->sg_list,
+					   buf + wr_count * wqe_size +
+					   sg_ind * sizeof (struct ib_sge),
+					   next->num_sge * sizeof (struct ib_sge))) {
+				ret = -EFAULT;
+				goto err;
+			}
+			sg_ind += next->num_sge;
+		} else
+			next->sg_list = NULL;
+	}
+
+	kfree(user_wr);
+	return wr;
+
+err:
+	kfree(user_wr);
+
+	while (wr) {
+		next = wr->next;
+		kfree(wr);
+		wr = next;
+	}
+
+	return ERR_PTR(ret);
+}
+
+ssize_t ib_uverbs_post_recv(struct ib_uverbs_file *file,
+			    const char __user *buf, int in_len,
+			    int out_len)
+{
+	struct ib_uverbs_post_recv      cmd;
+	struct ib_uverbs_post_recv_resp resp;
+	struct ib_recv_wr              *wr, *next, *bad_wr;
+	struct ib_qp                   *qp;
+	ssize_t                         ret = -EINVAL;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	wr = ib_uverbs_unmarshall_recv(buf + sizeof cmd,
+				       in_len - sizeof cmd, cmd.wr_count,
+				       cmd.sge_count, cmd.wqe_size);
+	if (IS_ERR(wr))
+		return PTR_ERR(wr);
+
+	qp = idr_read_qp(cmd.qp_handle, file->ucontext);
+	if (!qp)
+		goto out;
+
+	resp.bad_wr = 0;
+	ret = qp->device->post_recv(qp->real_qp, wr, &bad_wr);
+
+	put_qp_read(qp);
+
+	if (ret)
+		for (next = wr; next; next = next->next) {
+			++resp.bad_wr;
+			if (next == bad_wr)
+				break;
+		}
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+			 &resp, sizeof resp))
+		ret = -EFAULT;
+
+out:
+	while (wr) {
+		next = wr->next;
+		kfree(wr);
+		wr = next;
+	}
+
+	return ret ? ret : in_len;
+}
+
+ssize_t ib_uverbs_post_srq_recv(struct ib_uverbs_file *file,
+				const char __user *buf, int in_len,
+				int out_len)
+{
+	struct ib_uverbs_post_srq_recv      cmd;
+	struct ib_uverbs_post_srq_recv_resp resp;
+	struct ib_recv_wr                  *wr, *next, *bad_wr;
+	struct ib_srq                      *srq;
+	ssize_t                             ret = -EINVAL;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	wr = ib_uverbs_unmarshall_recv(buf + sizeof cmd,
+				       in_len - sizeof cmd, cmd.wr_count,
+				       cmd.sge_count, cmd.wqe_size);
+	if (IS_ERR(wr))
+		return PTR_ERR(wr);
+
+	srq = idr_read_srq(cmd.srq_handle, file->ucontext);
+	if (!srq)
+		goto out;
+
+	resp.bad_wr = 0;
+	ret = srq->device->post_srq_recv(srq, wr, &bad_wr);
+
+	put_srq_read(srq);
+
+	if (ret)
+		for (next = wr; next; next = next->next) {
+			++resp.bad_wr;
+			if (next == bad_wr)
+				break;
+		}
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+			 &resp, sizeof resp))
+		ret = -EFAULT;
+
+out:
+	while (wr) {
+		next = wr->next;
+		kfree(wr);
+		wr = next;
+	}
+
+	return ret ? ret : in_len;
+}
+
+ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file,
+			    const char __user *buf, int in_len,
+			    int out_len)
+{
+	struct ib_uverbs_create_ah	 cmd;
+	struct ib_uverbs_create_ah_resp	 resp;
+	struct ib_uobject		*uobj;
+	struct ib_pd			*pd;
+	struct ib_ah			*ah;
+	struct ib_ah_attr		attr;
+	int ret;
+
+	if (out_len < sizeof resp)
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	uobj = kmalloc(sizeof *uobj, GFP_KERNEL);
+	if (!uobj)
+		return -ENOMEM;
+
+	init_uobj(uobj, cmd.user_handle, file->ucontext, &ah_lock_class);
+	down_write(&uobj->mutex);
+
+	pd = idr_read_pd(cmd.pd_handle, file->ucontext);
+	if (!pd) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	attr.dlid 	       = cmd.attr.dlid;
+	attr.sl 	       = cmd.attr.sl;
+	attr.src_path_bits     = cmd.attr.src_path_bits;
+	attr.static_rate       = cmd.attr.static_rate;
+	attr.ah_flags          = cmd.attr.is_global ? IB_AH_GRH : 0;
+	attr.port_num 	       = cmd.attr.port_num;
+	attr.grh.flow_label    = cmd.attr.grh.flow_label;
+	attr.grh.sgid_index    = cmd.attr.grh.sgid_index;
+	attr.grh.hop_limit     = cmd.attr.grh.hop_limit;
+	attr.grh.traffic_class = cmd.attr.grh.traffic_class;
+	attr.vlan_id           = 0;
+	memset(&attr.dmac, 0, sizeof(attr.dmac));
+	memcpy(attr.grh.dgid.raw, cmd.attr.grh.dgid, 16);
+
+	ah = ib_create_ah(pd, &attr);
+	if (IS_ERR(ah)) {
+		ret = PTR_ERR(ah);
+		goto err_put;
+	}
+
+	ah->uobject  = uobj;
+	uobj->object = ah;
+
+	ret = idr_add_uobj(&ib_uverbs_ah_idr, uobj);
+	if (ret)
+		goto err_destroy;
+
+	resp.ah_handle = uobj->id;
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+			 &resp, sizeof resp)) {
+		ret = -EFAULT;
+		goto err_copy;
+	}
+
+	put_pd_read(pd);
+
+	mutex_lock(&file->mutex);
+	list_add_tail(&uobj->list, &file->ucontext->ah_list);
+	mutex_unlock(&file->mutex);
+
+	uobj->live = 1;
+
+	up_write(&uobj->mutex);
+
+	return in_len;
+
+err_copy:
+	idr_remove_uobj(&ib_uverbs_ah_idr, uobj);
+
+err_destroy:
+	ib_destroy_ah(ah);
+
+err_put:
+	put_pd_read(pd);
+
+err:
+	put_uobj_write(uobj);
+	return ret;
+}
+
+ssize_t ib_uverbs_destroy_ah(struct ib_uverbs_file *file,
+			     const char __user *buf, int in_len, int out_len)
+{
+	struct ib_uverbs_destroy_ah cmd;
+	struct ib_ah		   *ah;
+	struct ib_uobject	   *uobj;
+	int			    ret;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	uobj = idr_write_uobj(&ib_uverbs_ah_idr, cmd.ah_handle, file->ucontext);
+	if (!uobj)
+		return -EINVAL;
+	ah = uobj->object;
+
+	ret = ib_destroy_ah(ah);
+	if (!ret)
+		uobj->live = 0;
+
+	put_uobj_write(uobj);
+
+	if (ret)
+		return ret;
+
+	idr_remove_uobj(&ib_uverbs_ah_idr, uobj);
+
+	mutex_lock(&file->mutex);
+	list_del(&uobj->list);
+	mutex_unlock(&file->mutex);
+
+	put_uobj(uobj);
+
+	return in_len;
+}
+
+ssize_t ib_uverbs_attach_mcast(struct ib_uverbs_file *file,
+			       const char __user *buf, int in_len,
+			       int out_len)
+{
+	struct ib_uverbs_attach_mcast cmd;
+	struct ib_qp                 *qp;
+	struct ib_uqp_object         *obj;
+	struct ib_uverbs_mcast_entry *mcast;
+	int                           ret;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	qp = idr_write_qp(cmd.qp_handle, file->ucontext);
+	if (!qp)
+		return -EINVAL;
+
+	obj = container_of(qp->uobject, struct ib_uqp_object, uevent.uobject);
+
+	list_for_each_entry(mcast, &obj->mcast_list, list)
+		if (cmd.mlid == mcast->lid &&
+		    !memcmp(cmd.gid, mcast->gid.raw, sizeof mcast->gid.raw)) {
+			ret = 0;
+			goto out_put;
+		}
+
+	mcast = kmalloc(sizeof *mcast, GFP_KERNEL);
+	if (!mcast) {
+		ret = -ENOMEM;
+		goto out_put;
+	}
+
+	mcast->lid = cmd.mlid;
+	memcpy(mcast->gid.raw, cmd.gid, sizeof mcast->gid.raw);
+
+	ret = ib_attach_mcast(qp, &mcast->gid, cmd.mlid);
+	if (!ret)
+		list_add_tail(&mcast->list, &obj->mcast_list);
+	else
+		kfree(mcast);
+
+out_put:
+	put_qp_write(qp);
+
+	return ret ? ret : in_len;
+}
+
+ssize_t ib_uverbs_detach_mcast(struct ib_uverbs_file *file,
+			       const char __user *buf, int in_len,
+			       int out_len)
+{
+	struct ib_uverbs_detach_mcast cmd;
+	struct ib_uqp_object         *obj;
+	struct ib_qp                 *qp;
+	struct ib_uverbs_mcast_entry *mcast;
+	int                           ret = -EINVAL;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	qp = idr_write_qp(cmd.qp_handle, file->ucontext);
+	if (!qp)
+		return -EINVAL;
+
+	ret = ib_detach_mcast(qp, (union ib_gid *) cmd.gid, cmd.mlid);
+	if (ret)
+		goto out_put;
+
+	obj = container_of(qp->uobject, struct ib_uqp_object, uevent.uobject);
+
+	list_for_each_entry(mcast, &obj->mcast_list, list)
+		if (cmd.mlid == mcast->lid &&
+		    !memcmp(cmd.gid, mcast->gid.raw, sizeof mcast->gid.raw)) {
+			list_del(&mcast->list);
+			kfree(mcast);
+			break;
+		}
+
+out_put:
+	put_qp_write(qp);
+
+	return ret ? ret : in_len;
+}
+
+static int kern_spec_to_ib_spec(struct ib_uverbs_flow_spec *kern_spec,
+				union ib_flow_spec *ib_spec)
+{
+	if (kern_spec->reserved)
+		return -EINVAL;
+
+	ib_spec->type = kern_spec->type;
+
+	switch (ib_spec->type) {
+	case IB_FLOW_SPEC_ETH:
+		ib_spec->eth.size = sizeof(struct ib_flow_spec_eth);
+		if (ib_spec->eth.size != kern_spec->eth.size)
+			return -EINVAL;
+		memcpy(&ib_spec->eth.val, &kern_spec->eth.val,
+		       sizeof(struct ib_flow_eth_filter));
+		memcpy(&ib_spec->eth.mask, &kern_spec->eth.mask,
+		       sizeof(struct ib_flow_eth_filter));
+		break;
+	case IB_FLOW_SPEC_IPV4:
+		ib_spec->ipv4.size = sizeof(struct ib_flow_spec_ipv4);
+		if (ib_spec->ipv4.size != kern_spec->ipv4.size)
+			return -EINVAL;
+		memcpy(&ib_spec->ipv4.val, &kern_spec->ipv4.val,
+		       sizeof(struct ib_flow_ipv4_filter));
+		memcpy(&ib_spec->ipv4.mask, &kern_spec->ipv4.mask,
+		       sizeof(struct ib_flow_ipv4_filter));
+		break;
+	case IB_FLOW_SPEC_TCP:
+	case IB_FLOW_SPEC_UDP:
+		ib_spec->tcp_udp.size = sizeof(struct ib_flow_spec_tcp_udp);
+		if (ib_spec->tcp_udp.size != kern_spec->tcp_udp.size)
+			return -EINVAL;
+		memcpy(&ib_spec->tcp_udp.val, &kern_spec->tcp_udp.val,
+		       sizeof(struct ib_flow_tcp_udp_filter));
+		memcpy(&ib_spec->tcp_udp.mask, &kern_spec->tcp_udp.mask,
+		       sizeof(struct ib_flow_tcp_udp_filter));
+		break;
+	default:
+		return -EINVAL;
+	}
+	return 0;
+}
+
+int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file,
+			     struct ib_udata *ucore,
+			     struct ib_udata *uhw)
+{
+	struct ib_uverbs_create_flow	  cmd;
+	struct ib_uverbs_create_flow_resp resp;
+	struct ib_uobject		  *uobj;
+	struct ib_flow			  *flow_id;
+	struct ib_uverbs_flow_attr	  *kern_flow_attr;
+	struct ib_flow_attr		  *flow_attr;
+	struct ib_qp			  *qp;
+	int err = 0;
+	void *kern_spec;
+	void *ib_spec;
+	int i;
+
+	if (ucore->inlen < sizeof(cmd))
+		return -EINVAL;
+
+	if (ucore->outlen < sizeof(resp))
+		return -ENOSPC;
+
+	err = ib_copy_from_udata(&cmd, ucore, sizeof(cmd));
+	if (err)
+		return err;
+
+	ucore->inbuf += sizeof(cmd);
+	ucore->inlen -= sizeof(cmd);
+
+	if (cmd.comp_mask)
+		return -EINVAL;
+
+	if ((cmd.flow_attr.type == IB_FLOW_ATTR_SNIFFER &&
+	     !capable(CAP_NET_ADMIN)) || !capable(CAP_NET_RAW))
+		return -EPERM;
+
+	if (cmd.flow_attr.num_of_specs > IB_FLOW_SPEC_SUPPORT_LAYERS)
+		return -EINVAL;
+
+	if (cmd.flow_attr.size > ucore->inlen ||
+	    cmd.flow_attr.size >
+	    (cmd.flow_attr.num_of_specs * sizeof(struct ib_uverbs_flow_spec)))
+		return -EINVAL;
+
+	if (cmd.flow_attr.reserved[0] ||
+	    cmd.flow_attr.reserved[1])
+		return -EINVAL;
+
+	if (cmd.flow_attr.num_of_specs) {
+		kern_flow_attr = kmalloc(sizeof(*kern_flow_attr) + cmd.flow_attr.size,
+					 GFP_KERNEL);
+		if (!kern_flow_attr)
+			return -ENOMEM;
+
+		memcpy(kern_flow_attr, &cmd.flow_attr, sizeof(*kern_flow_attr));
+		err = ib_copy_from_udata(kern_flow_attr + 1, ucore,
+					 cmd.flow_attr.size);
+		if (err)
+			goto err_free_attr;
+	} else {
+		kern_flow_attr = &cmd.flow_attr;
+	}
+
+	uobj = kmalloc(sizeof(*uobj), GFP_KERNEL);
+	if (!uobj) {
+		err = -ENOMEM;
+		goto err_free_attr;
+	}
+	init_uobj(uobj, 0, file->ucontext, &rule_lock_class);
+	down_write(&uobj->mutex);
+
+	qp = idr_read_qp(cmd.qp_handle, file->ucontext);
+	if (!qp) {
+		err = -EINVAL;
+		goto err_uobj;
+	}
+
+	flow_attr = kmalloc(sizeof(*flow_attr) + cmd.flow_attr.size, GFP_KERNEL);
+	if (!flow_attr) {
+		err = -ENOMEM;
+		goto err_put;
+	}
+
+	flow_attr->type = kern_flow_attr->type;
+	flow_attr->priority = kern_flow_attr->priority;
+	flow_attr->num_of_specs = kern_flow_attr->num_of_specs;
+	flow_attr->port = kern_flow_attr->port;
+	flow_attr->flags = kern_flow_attr->flags;
+	flow_attr->size = sizeof(*flow_attr);
+
+	kern_spec = kern_flow_attr + 1;
+	ib_spec = flow_attr + 1;
+	for (i = 0; i < flow_attr->num_of_specs &&
+	     cmd.flow_attr.size > offsetof(struct ib_uverbs_flow_spec, reserved) &&
+	     cmd.flow_attr.size >=
+	     ((struct ib_uverbs_flow_spec *)kern_spec)->size; i++) {
+		err = kern_spec_to_ib_spec(kern_spec, ib_spec);
+		if (err)
+			goto err_free;
+		flow_attr->size +=
+			((union ib_flow_spec *) ib_spec)->size;
+		cmd.flow_attr.size -= ((struct ib_uverbs_flow_spec *)kern_spec)->size;
+		kern_spec += ((struct ib_uverbs_flow_spec *) kern_spec)->size;
+		ib_spec += ((union ib_flow_spec *) ib_spec)->size;
+	}
+	if (cmd.flow_attr.size || (i != flow_attr->num_of_specs)) {
+		pr_warn("create flow failed, flow %d: %d bytes left from uverb cmd\n",
+			i, cmd.flow_attr.size);
+		err = -EINVAL;
+		goto err_free;
+	}
+	flow_id = ib_create_flow(qp, flow_attr, IB_FLOW_DOMAIN_USER);
+	if (IS_ERR(flow_id)) {
+		err = PTR_ERR(flow_id);
+		goto err_free;
+	}
+	flow_id->qp = qp;
+	flow_id->uobject = uobj;
+	uobj->object = flow_id;
+
+	err = idr_add_uobj(&ib_uverbs_rule_idr, uobj);
+	if (err)
+		goto destroy_flow;
+
+	memset(&resp, 0, sizeof(resp));
+	resp.flow_handle = uobj->id;
+
+	err = ib_copy_to_udata(ucore,
+			       &resp, sizeof(resp));
+	if (err)
+		goto err_copy;
+
+	put_qp_read(qp);
+	mutex_lock(&file->mutex);
+	list_add_tail(&uobj->list, &file->ucontext->rule_list);
+	mutex_unlock(&file->mutex);
+
+	uobj->live = 1;
+
+	up_write(&uobj->mutex);
+	kfree(flow_attr);
+	if (cmd.flow_attr.num_of_specs)
+		kfree(kern_flow_attr);
+	return 0;
+err_copy:
+	idr_remove_uobj(&ib_uverbs_rule_idr, uobj);
+destroy_flow:
+	ib_destroy_flow(flow_id);
+err_free:
+	kfree(flow_attr);
+err_put:
+	put_qp_read(qp);
+err_uobj:
+	put_uobj_write(uobj);
+err_free_attr:
+	if (cmd.flow_attr.num_of_specs)
+		kfree(kern_flow_attr);
+	return err;
+}
+
+int ib_uverbs_ex_destroy_flow(struct ib_uverbs_file *file,
+			      struct ib_udata *ucore,
+			      struct ib_udata *uhw)
+{
+	struct ib_uverbs_destroy_flow	cmd;
+	struct ib_flow			*flow_id;
+	struct ib_uobject		*uobj;
+	int				ret;
+
+	if (ucore->inlen < sizeof(cmd))
+		return -EINVAL;
+
+	ret = ib_copy_from_udata(&cmd, ucore, sizeof(cmd));
+	if (ret)
+		return ret;
+
+	if (cmd.comp_mask)
+		return -EINVAL;
+
+	uobj = idr_write_uobj(&ib_uverbs_rule_idr, cmd.flow_handle,
+			      file->ucontext);
+	if (!uobj)
+		return -EINVAL;
+	flow_id = uobj->object;
+
+	ret = ib_destroy_flow(flow_id);
+	if (!ret)
+		uobj->live = 0;
+
+	put_uobj_write(uobj);
+
+	idr_remove_uobj(&ib_uverbs_rule_idr, uobj);
+
+	mutex_lock(&file->mutex);
+	list_del(&uobj->list);
+	mutex_unlock(&file->mutex);
+
+	put_uobj(uobj);
+
+	return ret;
+}
+
+static int __uverbs_create_xsrq(struct ib_uverbs_file *file,
+				struct ib_uverbs_create_xsrq *cmd,
+				struct ib_udata *udata)
+{
+	struct ib_uverbs_create_srq_resp resp;
+	struct ib_usrq_object           *obj;
+	struct ib_pd                    *pd;
+	struct ib_srq                   *srq;
+	struct ib_uobject               *uninitialized_var(xrcd_uobj);
+	struct ib_srq_init_attr          attr;
+	int ret;
+
+	obj = kmalloc(sizeof *obj, GFP_KERNEL);
+	if (!obj)
+		return -ENOMEM;
+
+	init_uobj(&obj->uevent.uobject, cmd->user_handle, file->ucontext, &srq_lock_class);
+	down_write(&obj->uevent.uobject.mutex);
+
+	if (cmd->srq_type == IB_SRQT_XRC) {
+		attr.ext.xrc.xrcd  = idr_read_xrcd(cmd->xrcd_handle, file->ucontext, &xrcd_uobj);
+		if (!attr.ext.xrc.xrcd) {
+			ret = -EINVAL;
+			goto err;
+		}
+
+		obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, uobject);
+		atomic_inc(&obj->uxrcd->refcnt);
+
+		attr.ext.xrc.cq  = idr_read_cq(cmd->cq_handle, file->ucontext, 0);
+		if (!attr.ext.xrc.cq) {
+			ret = -EINVAL;
+			goto err_put_xrcd;
+		}
+	}
+
+	pd  = idr_read_pd(cmd->pd_handle, file->ucontext);
+	if (!pd) {
+		ret = -EINVAL;
+		goto err_put_cq;
+	}
+
+	attr.event_handler  = ib_uverbs_srq_event_handler;
+	attr.srq_context    = file;
+	attr.srq_type       = cmd->srq_type;
+	attr.attr.max_wr    = cmd->max_wr;
+	attr.attr.max_sge   = cmd->max_sge;
+	attr.attr.srq_limit = cmd->srq_limit;
+
+	obj->uevent.events_reported = 0;
+	INIT_LIST_HEAD(&obj->uevent.event_list);
+
+	srq = pd->device->create_srq(pd, &attr, udata);
+	if (IS_ERR(srq)) {
+		ret = PTR_ERR(srq);
+		goto err_put;
+	}
+
+	srq->device        = pd->device;
+	srq->pd            = pd;
+	srq->srq_type	   = cmd->srq_type;
+	srq->uobject       = &obj->uevent.uobject;
+	srq->event_handler = attr.event_handler;
+	srq->srq_context   = attr.srq_context;
+
+	if (cmd->srq_type == IB_SRQT_XRC) {
+		srq->ext.xrc.cq   = attr.ext.xrc.cq;
+		srq->ext.xrc.xrcd = attr.ext.xrc.xrcd;
+		atomic_inc(&attr.ext.xrc.cq->usecnt);
+		atomic_inc(&attr.ext.xrc.xrcd->usecnt);
+	}
+
+	atomic_inc(&pd->usecnt);
+	atomic_set(&srq->usecnt, 0);
+
+	obj->uevent.uobject.object = srq;
+	ret = idr_add_uobj(&ib_uverbs_srq_idr, &obj->uevent.uobject);
+	if (ret)
+		goto err_destroy;
+
+	memset(&resp, 0, sizeof resp);
+	resp.srq_handle = obj->uevent.uobject.id;
+	resp.max_wr     = attr.attr.max_wr;
+	resp.max_sge    = attr.attr.max_sge;
+	if (cmd->srq_type == IB_SRQT_XRC)
+		resp.srqn = srq->ext.xrc.srq_num;
+
+	if (copy_to_user((void __user *) (unsigned long) cmd->response,
+			 &resp, sizeof resp)) {
+		ret = -EFAULT;
+		goto err_copy;
+	}
+
+	if (cmd->srq_type == IB_SRQT_XRC) {
+		put_uobj_read(xrcd_uobj);
+		put_cq_read(attr.ext.xrc.cq);
+	}
+	put_pd_read(pd);
+
+	mutex_lock(&file->mutex);
+	list_add_tail(&obj->uevent.uobject.list, &file->ucontext->srq_list);
+	mutex_unlock(&file->mutex);
+
+	obj->uevent.uobject.live = 1;
+
+	up_write(&obj->uevent.uobject.mutex);
+
+	return 0;
+
+err_copy:
+	idr_remove_uobj(&ib_uverbs_srq_idr, &obj->uevent.uobject);
+
+err_destroy:
+	ib_destroy_srq(srq);
+
+err_put:
+	put_pd_read(pd);
+
+err_put_cq:
+	if (cmd->srq_type == IB_SRQT_XRC)
+		put_cq_read(attr.ext.xrc.cq);
+
+err_put_xrcd:
+	if (cmd->srq_type == IB_SRQT_XRC) {
+		atomic_dec(&obj->uxrcd->refcnt);
+		put_uobj_read(xrcd_uobj);
+	}
+
+err:
+	put_uobj_write(&obj->uevent.uobject);
+	return ret;
+}
+
+ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file,
+			     const char __user *buf, int in_len,
+			     int out_len)
+{
+	struct ib_uverbs_create_srq      cmd;
+	struct ib_uverbs_create_xsrq     xcmd;
+	struct ib_uverbs_create_srq_resp resp;
+	struct ib_udata                  udata;
+	int ret;
+
+	if (out_len < sizeof resp)
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	xcmd.response	 = cmd.response;
+	xcmd.user_handle = cmd.user_handle;
+	xcmd.srq_type	 = IB_SRQT_BASIC;
+	xcmd.pd_handle	 = cmd.pd_handle;
+	xcmd.max_wr	 = cmd.max_wr;
+	xcmd.max_sge	 = cmd.max_sge;
+	xcmd.srq_limit	 = cmd.srq_limit;
+
+	INIT_UDATA(&udata, buf + sizeof cmd,
+		   (unsigned long) cmd.response + sizeof resp,
+		   in_len - sizeof cmd, out_len - sizeof resp);
+
+	ret = __uverbs_create_xsrq(file, &xcmd, &udata);
+	if (ret)
+		return ret;
+
+	return in_len;
+}
+
+ssize_t ib_uverbs_create_xsrq(struct ib_uverbs_file *file,
+			      const char __user *buf, int in_len, int out_len)
+{
+	struct ib_uverbs_create_xsrq     cmd;
+	struct ib_uverbs_create_srq_resp resp;
+	struct ib_udata                  udata;
+	int ret;
+
+	if (out_len < sizeof resp)
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	INIT_UDATA(&udata, buf + sizeof cmd,
+		   (unsigned long) cmd.response + sizeof resp,
+		   in_len - sizeof cmd, out_len - sizeof resp);
+
+	ret = __uverbs_create_xsrq(file, &cmd, &udata);
+	if (ret)
+		return ret;
+
+	return in_len;
+}
+
+ssize_t ib_uverbs_modify_srq(struct ib_uverbs_file *file,
+			     const char __user *buf, int in_len,
+			     int out_len)
+{
+	struct ib_uverbs_modify_srq cmd;
+	struct ib_udata             udata;
+	struct ib_srq              *srq;
+	struct ib_srq_attr          attr;
+	int                         ret;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	INIT_UDATA(&udata, buf + sizeof cmd, NULL, in_len - sizeof cmd,
+		   out_len);
+
+	srq = idr_read_srq(cmd.srq_handle, file->ucontext);
+	if (!srq)
+		return -EINVAL;
+
+	attr.max_wr    = cmd.max_wr;
+	attr.srq_limit = cmd.srq_limit;
+
+	ret = srq->device->modify_srq(srq, &attr, cmd.attr_mask, &udata);
+
+	put_srq_read(srq);
+
+	return ret ? ret : in_len;
+}
+
+ssize_t ib_uverbs_query_srq(struct ib_uverbs_file *file,
+			    const char __user *buf,
+			    int in_len, int out_len)
+{
+	struct ib_uverbs_query_srq      cmd;
+	struct ib_uverbs_query_srq_resp resp;
+	struct ib_srq_attr              attr;
+	struct ib_srq                   *srq;
+	int                             ret;
+
+	if (out_len < sizeof resp)
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	srq = idr_read_srq(cmd.srq_handle, file->ucontext);
+	if (!srq)
+		return -EINVAL;
+
+	ret = ib_query_srq(srq, &attr);
+
+	put_srq_read(srq);
+
+	if (ret)
+		return ret;
+
+	memset(&resp, 0, sizeof resp);
+
+	resp.max_wr    = attr.max_wr;
+	resp.max_sge   = attr.max_sge;
+	resp.srq_limit = attr.srq_limit;
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+			 &resp, sizeof resp))
+		return -EFAULT;
+
+	return in_len;
+}
+
+ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file,
+			      const char __user *buf, int in_len,
+			      int out_len)
+{
+	struct ib_uverbs_destroy_srq      cmd;
+	struct ib_uverbs_destroy_srq_resp resp;
+	struct ib_uobject		 *uobj;
+	struct ib_srq               	 *srq;
+	struct ib_uevent_object        	 *obj;
+	int                         	  ret = -EINVAL;
+	struct ib_usrq_object		 *us;
+	enum ib_srq_type		  srq_type;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	uobj = idr_write_uobj(&ib_uverbs_srq_idr, cmd.srq_handle, file->ucontext);
+	if (!uobj)
+		return -EINVAL;
+	srq = uobj->object;
+	obj = container_of(uobj, struct ib_uevent_object, uobject);
+	srq_type = srq->srq_type;
+
+	ret = ib_destroy_srq(srq);
+	if (!ret)
+		uobj->live = 0;
+
+	put_uobj_write(uobj);
+
+	if (ret)
+		return ret;
+
+	if (srq_type == IB_SRQT_XRC) {
+		us = container_of(obj, struct ib_usrq_object, uevent);
+		atomic_dec(&us->uxrcd->refcnt);
+	}
+
+	idr_remove_uobj(&ib_uverbs_srq_idr, uobj);
+
+	mutex_lock(&file->mutex);
+	list_del(&uobj->list);
+	mutex_unlock(&file->mutex);
+
+	ib_uverbs_release_uevent(file, obj);
+
+	memset(&resp, 0, sizeof resp);
+	resp.events_reported = obj->events_reported;
+
+	put_uobj(uobj);
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+			 &resp, sizeof resp))
+		ret = -EFAULT;
+
+	return ret ? ret : in_len;
+}
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
new file mode 100644
index 0000000..71ab83f
--- /dev/null
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -0,0 +1,1036 @@
+/*
+ * Copyright (c) 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005, 2006 Cisco Systems.  All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2005 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2005 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/fs.h>
+#include <linux/poll.h>
+#include <linux/sched.h>
+#include <linux/file.h>
+#include <linux/cdev.h>
+#include <linux/anon_inodes.h>
+#include <linux/slab.h>
+
+#include <asm/uaccess.h>
+
+#include "uverbs.h"
+
+MODULE_AUTHOR("Roland Dreier");
+MODULE_DESCRIPTION("InfiniBand userspace verbs access");
+MODULE_LICENSE("Dual BSD/GPL");
+
+enum {
+	IB_UVERBS_MAJOR       = 231,
+	IB_UVERBS_BASE_MINOR  = 192,
+	IB_UVERBS_MAX_DEVICES = 32
+};
+
+#define IB_UVERBS_BASE_DEV	MKDEV(IB_UVERBS_MAJOR, IB_UVERBS_BASE_MINOR)
+
+static struct class *uverbs_class;
+
+DEFINE_SPINLOCK(ib_uverbs_idr_lock);
+DEFINE_IDR(ib_uverbs_pd_idr);
+DEFINE_IDR(ib_uverbs_mr_idr);
+DEFINE_IDR(ib_uverbs_mw_idr);
+DEFINE_IDR(ib_uverbs_ah_idr);
+DEFINE_IDR(ib_uverbs_cq_idr);
+DEFINE_IDR(ib_uverbs_qp_idr);
+DEFINE_IDR(ib_uverbs_srq_idr);
+DEFINE_IDR(ib_uverbs_xrcd_idr);
+DEFINE_IDR(ib_uverbs_rule_idr);
+
+static DEFINE_SPINLOCK(map_lock);
+static DECLARE_BITMAP(dev_map, IB_UVERBS_MAX_DEVICES);
+
+static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file,
+				     const char __user *buf, int in_len,
+				     int out_len) = {
+	[IB_USER_VERBS_CMD_GET_CONTEXT]		= ib_uverbs_get_context,
+	[IB_USER_VERBS_CMD_QUERY_DEVICE]	= ib_uverbs_query_device,
+	[IB_USER_VERBS_CMD_QUERY_PORT]		= ib_uverbs_query_port,
+	[IB_USER_VERBS_CMD_ALLOC_PD]		= ib_uverbs_alloc_pd,
+	[IB_USER_VERBS_CMD_DEALLOC_PD]		= ib_uverbs_dealloc_pd,
+	[IB_USER_VERBS_CMD_REG_MR]		= ib_uverbs_reg_mr,
+	[IB_USER_VERBS_CMD_REREG_MR]		= ib_uverbs_rereg_mr,
+	[IB_USER_VERBS_CMD_DEREG_MR]		= ib_uverbs_dereg_mr,
+	[IB_USER_VERBS_CMD_ALLOC_MW]		= ib_uverbs_alloc_mw,
+	[IB_USER_VERBS_CMD_DEALLOC_MW]		= ib_uverbs_dealloc_mw,
+	[IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL] = ib_uverbs_create_comp_channel,
+	[IB_USER_VERBS_CMD_CREATE_CQ]		= ib_uverbs_create_cq,
+	[IB_USER_VERBS_CMD_RESIZE_CQ]		= ib_uverbs_resize_cq,
+	[IB_USER_VERBS_CMD_POLL_CQ]		= ib_uverbs_poll_cq,
+	[IB_USER_VERBS_CMD_REQ_NOTIFY_CQ]	= ib_uverbs_req_notify_cq,
+	[IB_USER_VERBS_CMD_DESTROY_CQ]		= ib_uverbs_destroy_cq,
+	[IB_USER_VERBS_CMD_CREATE_QP]		= ib_uverbs_create_qp,
+	[IB_USER_VERBS_CMD_QUERY_QP]		= ib_uverbs_query_qp,
+	[IB_USER_VERBS_CMD_MODIFY_QP]		= ib_uverbs_modify_qp,
+	[IB_USER_VERBS_CMD_DESTROY_QP]		= ib_uverbs_destroy_qp,
+	[IB_USER_VERBS_CMD_POST_SEND]		= ib_uverbs_post_send,
+	[IB_USER_VERBS_CMD_POST_RECV]		= ib_uverbs_post_recv,
+	[IB_USER_VERBS_CMD_POST_SRQ_RECV]	= ib_uverbs_post_srq_recv,
+	[IB_USER_VERBS_CMD_CREATE_AH]		= ib_uverbs_create_ah,
+	[IB_USER_VERBS_CMD_DESTROY_AH]		= ib_uverbs_destroy_ah,
+	[IB_USER_VERBS_CMD_ATTACH_MCAST]	= ib_uverbs_attach_mcast,
+	[IB_USER_VERBS_CMD_DETACH_MCAST]	= ib_uverbs_detach_mcast,
+	[IB_USER_VERBS_CMD_CREATE_SRQ]		= ib_uverbs_create_srq,
+	[IB_USER_VERBS_CMD_MODIFY_SRQ]		= ib_uverbs_modify_srq,
+	[IB_USER_VERBS_CMD_QUERY_SRQ]		= ib_uverbs_query_srq,
+	[IB_USER_VERBS_CMD_DESTROY_SRQ]		= ib_uverbs_destroy_srq,
+	[IB_USER_VERBS_CMD_OPEN_XRCD]		= ib_uverbs_open_xrcd,
+	[IB_USER_VERBS_CMD_CLOSE_XRCD]		= ib_uverbs_close_xrcd,
+	[IB_USER_VERBS_CMD_CREATE_XSRQ]		= ib_uverbs_create_xsrq,
+	[IB_USER_VERBS_CMD_OPEN_QP]		= ib_uverbs_open_qp,
+};
+
+static int (*uverbs_ex_cmd_table[])(struct ib_uverbs_file *file,
+				    struct ib_udata *ucore,
+				    struct ib_udata *uhw) = {
+	[IB_USER_VERBS_EX_CMD_CREATE_FLOW]	= ib_uverbs_ex_create_flow,
+	[IB_USER_VERBS_EX_CMD_DESTROY_FLOW]	= ib_uverbs_ex_destroy_flow
+};
+
+static void ib_uverbs_add_one(struct ib_device *device);
+static void ib_uverbs_remove_one(struct ib_device *device);
+
+static void ib_uverbs_release_dev(struct kref *ref)
+{
+	struct ib_uverbs_device *dev =
+		container_of(ref, struct ib_uverbs_device, ref);
+
+	complete(&dev->comp);
+}
+
+static void ib_uverbs_release_event_file(struct kref *ref)
+{
+	struct ib_uverbs_event_file *file =
+		container_of(ref, struct ib_uverbs_event_file, ref);
+
+	kfree(file);
+}
+
+void ib_uverbs_release_ucq(struct ib_uverbs_file *file,
+			  struct ib_uverbs_event_file *ev_file,
+			  struct ib_ucq_object *uobj)
+{
+	struct ib_uverbs_event *evt, *tmp;
+
+	if (ev_file) {
+		spin_lock_irq(&ev_file->lock);
+		list_for_each_entry_safe(evt, tmp, &uobj->comp_list, obj_list) {
+			list_del(&evt->list);
+			kfree(evt);
+		}
+		spin_unlock_irq(&ev_file->lock);
+
+		kref_put(&ev_file->ref, ib_uverbs_release_event_file);
+	}
+
+	spin_lock_irq(&file->async_file->lock);
+	list_for_each_entry_safe(evt, tmp, &uobj->async_list, obj_list) {
+		list_del(&evt->list);
+		kfree(evt);
+	}
+	spin_unlock_irq(&file->async_file->lock);
+}
+
+void ib_uverbs_release_uevent(struct ib_uverbs_file *file,
+			      struct ib_uevent_object *uobj)
+{
+	struct ib_uverbs_event *evt, *tmp;
+
+	spin_lock_irq(&file->async_file->lock);
+	list_for_each_entry_safe(evt, tmp, &uobj->event_list, obj_list) {
+		list_del(&evt->list);
+		kfree(evt);
+	}
+	spin_unlock_irq(&file->async_file->lock);
+}
+
+static void ib_uverbs_detach_umcast(struct ib_qp *qp,
+				    struct ib_uqp_object *uobj)
+{
+	struct ib_uverbs_mcast_entry *mcast, *tmp;
+
+	list_for_each_entry_safe(mcast, tmp, &uobj->mcast_list, list) {
+		ib_detach_mcast(qp, &mcast->gid, mcast->lid);
+		list_del(&mcast->list);
+		kfree(mcast);
+	}
+}
+
+static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
+				      struct ib_ucontext *context)
+{
+	struct ib_uobject *uobj, *tmp;
+
+	if (!context)
+		return 0;
+
+	context->closing = 1;
+
+	list_for_each_entry_safe(uobj, tmp, &context->ah_list, list) {
+		struct ib_ah *ah = uobj->object;
+
+		idr_remove_uobj(&ib_uverbs_ah_idr, uobj);
+		ib_destroy_ah(ah);
+		kfree(uobj);
+	}
+
+	/* Remove MWs before QPs, in order to support type 2A MWs. */
+	list_for_each_entry_safe(uobj, tmp, &context->mw_list, list) {
+		struct ib_mw *mw = uobj->object;
+
+		idr_remove_uobj(&ib_uverbs_mw_idr, uobj);
+		ib_dealloc_mw(mw);
+		kfree(uobj);
+	}
+
+	list_for_each_entry_safe(uobj, tmp, &context->rule_list, list) {
+		struct ib_flow *flow_id = uobj->object;
+
+		idr_remove_uobj(&ib_uverbs_rule_idr, uobj);
+		ib_destroy_flow(flow_id);
+		kfree(uobj);
+	}
+
+	list_for_each_entry_safe(uobj, tmp, &context->qp_list, list) {
+		struct ib_qp *qp = uobj->object;
+		struct ib_uqp_object *uqp =
+			container_of(uobj, struct ib_uqp_object, uevent.uobject);
+
+		idr_remove_uobj(&ib_uverbs_qp_idr, uobj);
+		if (qp != qp->real_qp) {
+			ib_close_qp(qp);
+		} else {
+			ib_uverbs_detach_umcast(qp, uqp);
+			ib_destroy_qp(qp);
+		}
+		ib_uverbs_release_uevent(file, &uqp->uevent);
+		kfree(uqp);
+	}
+
+	list_for_each_entry_safe(uobj, tmp, &context->cq_list, list) {
+		struct ib_cq *cq = uobj->object;
+		struct ib_uverbs_event_file *ev_file = cq->cq_context;
+		struct ib_ucq_object *ucq =
+			container_of(uobj, struct ib_ucq_object, uobject);
+
+		idr_remove_uobj(&ib_uverbs_cq_idr, uobj);
+		ib_destroy_cq(cq);
+		ib_uverbs_release_ucq(file, ev_file, ucq);
+		kfree(ucq);
+	}
+
+	list_for_each_entry_safe(uobj, tmp, &context->srq_list, list) {
+		struct ib_srq *srq = uobj->object;
+		struct ib_uevent_object *uevent =
+			container_of(uobj, struct ib_uevent_object, uobject);
+
+		idr_remove_uobj(&ib_uverbs_srq_idr, uobj);
+		ib_destroy_srq(srq);
+		ib_uverbs_release_uevent(file, uevent);
+		kfree(uevent);
+	}
+
+	list_for_each_entry_safe(uobj, tmp, &context->mr_list, list) {
+		struct ib_mr *mr = uobj->object;
+
+		idr_remove_uobj(&ib_uverbs_mr_idr, uobj);
+		ib_dereg_mr(mr);
+		kfree(uobj);
+	}
+
+	mutex_lock(&file->device->xrcd_tree_mutex);
+	list_for_each_entry_safe(uobj, tmp, &context->xrcd_list, list) {
+		struct ib_xrcd *xrcd = uobj->object;
+		struct ib_uxrcd_object *uxrcd =
+			container_of(uobj, struct ib_uxrcd_object, uobject);
+
+		idr_remove_uobj(&ib_uverbs_xrcd_idr, uobj);
+		ib_uverbs_dealloc_xrcd(file->device, xrcd);
+		kfree(uxrcd);
+	}
+	mutex_unlock(&file->device->xrcd_tree_mutex);
+
+	list_for_each_entry_safe(uobj, tmp, &context->pd_list, list) {
+		struct ib_pd *pd = uobj->object;
+
+		idr_remove_uobj(&ib_uverbs_pd_idr, uobj);
+		ib_dealloc_pd(pd);
+		kfree(uobj);
+	}
+
+	return context->device->dealloc_ucontext(context);
+}
+
+static void ib_uverbs_release_file(struct kref *ref)
+{
+	struct ib_uverbs_file *file =
+		container_of(ref, struct ib_uverbs_file, ref);
+
+	module_put(file->device->ib_dev->owner);
+	kref_put(&file->device->ref, ib_uverbs_release_dev);
+
+	kfree(file);
+}
+
+static ssize_t ib_uverbs_event_read(struct file *filp, char __user *buf,
+				    size_t count, loff_t *pos)
+{
+	struct ib_uverbs_event_file *file = filp->private_data;
+	struct ib_uverbs_event *event;
+	int eventsz;
+	int ret = 0;
+
+	spin_lock_irq(&file->lock);
+
+	while (list_empty(&file->event_list)) {
+		spin_unlock_irq(&file->lock);
+
+		if (filp->f_flags & O_NONBLOCK)
+			return -EAGAIN;
+
+		if (wait_event_interruptible(file->poll_wait,
+					     !list_empty(&file->event_list)))
+			return -ERESTARTSYS;
+
+		spin_lock_irq(&file->lock);
+	}
+
+	event = list_entry(file->event_list.next, struct ib_uverbs_event, list);
+
+	if (file->is_async)
+		eventsz = sizeof (struct ib_uverbs_async_event_desc);
+	else
+		eventsz = sizeof (struct ib_uverbs_comp_event_desc);
+
+	if (eventsz > count) {
+		ret   = -EINVAL;
+		event = NULL;
+	} else {
+		list_del(file->event_list.next);
+		if (event->counter) {
+			++(*event->counter);
+			list_del(&event->obj_list);
+		}
+	}
+
+	spin_unlock_irq(&file->lock);
+
+	if (event) {
+		if (copy_to_user(buf, event, eventsz))
+			ret = -EFAULT;
+		else
+			ret = eventsz;
+	}
+
+	kfree(event);
+
+	return ret;
+}
+
+static unsigned int ib_uverbs_event_poll(struct file *filp,
+					 struct poll_table_struct *wait)
+{
+	unsigned int pollflags = 0;
+	struct ib_uverbs_event_file *file = filp->private_data;
+
+	poll_wait(filp, &file->poll_wait, wait);
+
+	spin_lock_irq(&file->lock);
+	if (!list_empty(&file->event_list))
+		pollflags = POLLIN | POLLRDNORM;
+	spin_unlock_irq(&file->lock);
+
+	return pollflags;
+}
+
+static int ib_uverbs_event_fasync(int fd, struct file *filp, int on)
+{
+	struct ib_uverbs_event_file *file = filp->private_data;
+
+	return fasync_helper(fd, filp, on, &file->async_queue);
+}
+
+static int ib_uverbs_event_close(struct inode *inode, struct file *filp)
+{
+	struct ib_uverbs_event_file *file = filp->private_data;
+	struct ib_uverbs_event *entry, *tmp;
+
+	spin_lock_irq(&file->lock);
+	file->is_closed = 1;
+	list_for_each_entry_safe(entry, tmp, &file->event_list, list) {
+		if (entry->counter)
+			list_del(&entry->obj_list);
+		kfree(entry);
+	}
+	spin_unlock_irq(&file->lock);
+
+	if (file->is_async) {
+		ib_unregister_event_handler(&file->uverbs_file->event_handler);
+		kref_put(&file->uverbs_file->ref, ib_uverbs_release_file);
+	}
+	kref_put(&file->ref, ib_uverbs_release_event_file);
+
+	return 0;
+}
+
+static const struct file_operations uverbs_event_fops = {
+	.owner	 = THIS_MODULE,
+	.read	 = ib_uverbs_event_read,
+	.poll    = ib_uverbs_event_poll,
+	.release = ib_uverbs_event_close,
+	.fasync  = ib_uverbs_event_fasync,
+	.llseek	 = no_llseek,
+};
+
+void ib_uverbs_comp_handler(struct ib_cq *cq, void *cq_context)
+{
+	struct ib_uverbs_event_file    *file = cq_context;
+	struct ib_ucq_object	       *uobj;
+	struct ib_uverbs_event	       *entry;
+	unsigned long			flags;
+
+	if (!file)
+		return;
+
+	spin_lock_irqsave(&file->lock, flags);
+	if (file->is_closed) {
+		spin_unlock_irqrestore(&file->lock, flags);
+		return;
+	}
+
+	entry = kmalloc(sizeof *entry, GFP_ATOMIC);
+	if (!entry) {
+		spin_unlock_irqrestore(&file->lock, flags);
+		return;
+	}
+
+	uobj = container_of(cq->uobject, struct ib_ucq_object, uobject);
+
+	entry->desc.comp.cq_handle = cq->uobject->user_handle;
+	entry->counter		   = &uobj->comp_events_reported;
+
+	list_add_tail(&entry->list, &file->event_list);
+	list_add_tail(&entry->obj_list, &uobj->comp_list);
+	spin_unlock_irqrestore(&file->lock, flags);
+
+	wake_up_interruptible(&file->poll_wait);
+	kill_fasync(&file->async_queue, SIGIO, POLL_IN);
+}
+
+static void ib_uverbs_async_handler(struct ib_uverbs_file *file,
+				    __u64 element, __u64 event,
+				    struct list_head *obj_list,
+				    u32 *counter)
+{
+	struct ib_uverbs_event *entry;
+	unsigned long flags;
+
+	spin_lock_irqsave(&file->async_file->lock, flags);
+	if (file->async_file->is_closed) {
+		spin_unlock_irqrestore(&file->async_file->lock, flags);
+		return;
+	}
+
+	entry = kmalloc(sizeof *entry, GFP_ATOMIC);
+	if (!entry) {
+		spin_unlock_irqrestore(&file->async_file->lock, flags);
+		return;
+	}
+
+	entry->desc.async.element    = element;
+	entry->desc.async.event_type = event;
+	entry->desc.async.reserved   = 0;
+	entry->counter               = counter;
+
+	list_add_tail(&entry->list, &file->async_file->event_list);
+	if (obj_list)
+		list_add_tail(&entry->obj_list, obj_list);
+	spin_unlock_irqrestore(&file->async_file->lock, flags);
+
+	wake_up_interruptible(&file->async_file->poll_wait);
+	kill_fasync(&file->async_file->async_queue, SIGIO, POLL_IN);
+}
+
+void ib_uverbs_cq_event_handler(struct ib_event *event, void *context_ptr)
+{
+	struct ib_ucq_object *uobj = container_of(event->element.cq->uobject,
+						  struct ib_ucq_object, uobject);
+
+	ib_uverbs_async_handler(uobj->uverbs_file, uobj->uobject.user_handle,
+				event->event, &uobj->async_list,
+				&uobj->async_events_reported);
+}
+
+void ib_uverbs_qp_event_handler(struct ib_event *event, void *context_ptr)
+{
+	struct ib_uevent_object *uobj;
+
+	/* for XRC target qp's, check that qp is live */
+	if (!event->element.qp->uobject || !event->element.qp->uobject->live)
+		return;
+
+	uobj = container_of(event->element.qp->uobject,
+			    struct ib_uevent_object, uobject);
+
+	ib_uverbs_async_handler(context_ptr, uobj->uobject.user_handle,
+				event->event, &uobj->event_list,
+				&uobj->events_reported);
+}
+
+void ib_uverbs_srq_event_handler(struct ib_event *event, void *context_ptr)
+{
+	struct ib_uevent_object *uobj;
+
+	uobj = container_of(event->element.srq->uobject,
+			    struct ib_uevent_object, uobject);
+
+	ib_uverbs_async_handler(context_ptr, uobj->uobject.user_handle,
+				event->event, &uobj->event_list,
+				&uobj->events_reported);
+}
+
+void ib_uverbs_event_handler(struct ib_event_handler *handler,
+			     struct ib_event *event)
+{
+	struct ib_uverbs_file *file =
+		container_of(handler, struct ib_uverbs_file, event_handler);
+
+	ib_uverbs_async_handler(file, event->element.port_num, event->event,
+				NULL, NULL);
+}
+
+struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file,
+					int is_async)
+{
+	struct ib_uverbs_event_file *ev_file;
+	struct file *filp;
+
+	ev_file = kmalloc(sizeof *ev_file, GFP_KERNEL);
+	if (!ev_file)
+		return ERR_PTR(-ENOMEM);
+
+	kref_init(&ev_file->ref);
+	spin_lock_init(&ev_file->lock);
+	INIT_LIST_HEAD(&ev_file->event_list);
+	init_waitqueue_head(&ev_file->poll_wait);
+	ev_file->uverbs_file = uverbs_file;
+	ev_file->async_queue = NULL;
+	ev_file->is_async    = is_async;
+	ev_file->is_closed   = 0;
+
+	filp = anon_inode_getfile("[infinibandevent]", &uverbs_event_fops,
+				  ev_file, O_RDONLY);
+	if (IS_ERR(filp))
+		kfree(ev_file);
+
+	return filp;
+}
+
+/*
+ * Look up a completion event file by FD.  If lookup is successful,
+ * takes a ref to the event file struct that it returns; if
+ * unsuccessful, returns NULL.
+ */
+struct ib_uverbs_event_file *ib_uverbs_lookup_comp_file(int fd)
+{
+	struct ib_uverbs_event_file *ev_file = NULL;
+	struct fd f = fdget(fd);
+
+	if (!f.file)
+		return NULL;
+
+	if (f.file->f_op != &uverbs_event_fops)
+		goto out;
+
+	ev_file = f.file->private_data;
+	if (ev_file->is_async) {
+		ev_file = NULL;
+		goto out;
+	}
+
+	kref_get(&ev_file->ref);
+
+out:
+	fdput(f);
+	return ev_file;
+}
+
+static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,
+			     size_t count, loff_t *pos)
+{
+	struct ib_uverbs_file *file = filp->private_data;
+	struct ib_uverbs_cmd_hdr hdr;
+	__u32 flags;
+
+	if (count < sizeof hdr)
+		return -EINVAL;
+
+	if (copy_from_user(&hdr, buf, sizeof hdr))
+		return -EFAULT;
+
+	flags = (hdr.command &
+		 IB_USER_VERBS_CMD_FLAGS_MASK) >> IB_USER_VERBS_CMD_FLAGS_SHIFT;
+
+	if (!flags) {
+		__u32 command;
+
+		if (hdr.command & ~(__u32)(IB_USER_VERBS_CMD_FLAGS_MASK |
+					   IB_USER_VERBS_CMD_COMMAND_MASK))
+			return -EINVAL;
+
+		command = hdr.command & IB_USER_VERBS_CMD_COMMAND_MASK;
+
+		if (command >= ARRAY_SIZE(uverbs_cmd_table) ||
+		    !uverbs_cmd_table[command])
+			return -EINVAL;
+
+		if (!file->ucontext &&
+		    command != IB_USER_VERBS_CMD_GET_CONTEXT)
+			return -EINVAL;
+
+		if (!(file->device->ib_dev->uverbs_cmd_mask & (1ull << command)))
+			return -ENOSYS;
+
+		if (hdr.in_words * 4 != count)
+			return -EINVAL;
+
+		return uverbs_cmd_table[command](file,
+						 buf + sizeof(hdr),
+						 hdr.in_words * 4,
+						 hdr.out_words * 4);
+
+	} else if (flags == IB_USER_VERBS_CMD_FLAG_EXTENDED) {
+		__u32 command;
+
+		struct ib_uverbs_ex_cmd_hdr ex_hdr;
+		struct ib_udata ucore;
+		struct ib_udata uhw;
+		int err;
+		size_t written_count = count;
+
+		if (hdr.command & ~(__u32)(IB_USER_VERBS_CMD_FLAGS_MASK |
+					   IB_USER_VERBS_CMD_COMMAND_MASK))
+			return -EINVAL;
+
+		command = hdr.command & IB_USER_VERBS_CMD_COMMAND_MASK;
+
+		if (command >= ARRAY_SIZE(uverbs_ex_cmd_table) ||
+		    !uverbs_ex_cmd_table[command])
+			return -ENOSYS;
+
+		if (!file->ucontext)
+			return -EINVAL;
+
+		if (!(file->device->ib_dev->uverbs_ex_cmd_mask & (1ull << command)))
+			return -ENOSYS;
+
+		if (count < (sizeof(hdr) + sizeof(ex_hdr)))
+			return -EINVAL;
+
+		if (copy_from_user(&ex_hdr, buf + sizeof(hdr), sizeof(ex_hdr)))
+			return -EFAULT;
+
+		count -= sizeof(hdr) + sizeof(ex_hdr);
+		buf += sizeof(hdr) + sizeof(ex_hdr);
+
+		if ((hdr.in_words + ex_hdr.provider_in_words) * 8 != count)
+			return -EINVAL;
+
+		if (ex_hdr.cmd_hdr_reserved)
+			return -EINVAL;
+
+		if (ex_hdr.response) {
+			if (!hdr.out_words && !ex_hdr.provider_out_words)
+				return -EINVAL;
+
+			if (!access_ok(VERIFY_WRITE,
+				       (void __user *) (unsigned long) ex_hdr.response,
+				       (hdr.out_words + ex_hdr.provider_out_words) * 8))
+				return -EFAULT;
+		} else {
+			if (hdr.out_words || ex_hdr.provider_out_words)
+				return -EINVAL;
+		}
+
+		INIT_UDATA_BUF_OR_NULL(&ucore, buf, (unsigned long) ex_hdr.response,
+				       hdr.in_words * 8, hdr.out_words * 8);
+
+		INIT_UDATA_BUF_OR_NULL(&uhw,
+				       buf + ucore.inlen,
+				       (unsigned long) ex_hdr.response + ucore.outlen,
+				       ex_hdr.provider_in_words * 8,
+				       ex_hdr.provider_out_words * 8);
+
+		err = uverbs_ex_cmd_table[command](file,
+						   &ucore,
+						   &uhw);
+
+		if (err)
+			return err;
+
+		return written_count;
+	}
+
+	return -ENOSYS;
+}
+
+static int ib_uverbs_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+	struct ib_uverbs_file *file = filp->private_data;
+
+	if (!file->ucontext)
+		return -ENODEV;
+	else
+		return file->device->ib_dev->mmap(file->ucontext, vma);
+}
+
+/*
+ * ib_uverbs_open() does not need the BKL:
+ *
+ *  - the ib_uverbs_device structures are properly reference counted and
+ *    everything else is purely local to the file being created, so
+ *    races against other open calls are not a problem;
+ *  - there is no ioctl method to race against;
+ *  - the open method will either immediately run -ENXIO, or all
+ *    required initialization will be done.
+ */
+static int ib_uverbs_open(struct inode *inode, struct file *filp)
+{
+	struct ib_uverbs_device *dev;
+	struct ib_uverbs_file *file;
+	int ret;
+
+	dev = container_of(inode->i_cdev, struct ib_uverbs_device, cdev);
+	if (dev)
+		kref_get(&dev->ref);
+	else
+		return -ENXIO;
+
+	if (!try_module_get(dev->ib_dev->owner)) {
+		ret = -ENODEV;
+		goto err;
+	}
+
+	file = kmalloc(sizeof *file, GFP_KERNEL);
+	if (!file) {
+		ret = -ENOMEM;
+		goto err_module;
+	}
+
+	file->device	 = dev;
+	file->ucontext	 = NULL;
+	file->async_file = NULL;
+	kref_init(&file->ref);
+	mutex_init(&file->mutex);
+
+	filp->private_data = file;
+
+	return nonseekable_open(inode, filp);
+
+err_module:
+	module_put(dev->ib_dev->owner);
+
+err:
+	kref_put(&dev->ref, ib_uverbs_release_dev);
+	return ret;
+}
+
+static int ib_uverbs_close(struct inode *inode, struct file *filp)
+{
+	struct ib_uverbs_file *file = filp->private_data;
+
+	ib_uverbs_cleanup_ucontext(file, file->ucontext);
+
+	if (file->async_file)
+		kref_put(&file->async_file->ref, ib_uverbs_release_event_file);
+
+	kref_put(&file->ref, ib_uverbs_release_file);
+
+	return 0;
+}
+
+static const struct file_operations uverbs_fops = {
+	.owner	 = THIS_MODULE,
+	.write	 = ib_uverbs_write,
+	.open	 = ib_uverbs_open,
+	.release = ib_uverbs_close,
+	.llseek	 = no_llseek,
+};
+
+static const struct file_operations uverbs_mmap_fops = {
+	.owner	 = THIS_MODULE,
+	.write	 = ib_uverbs_write,
+	.mmap    = ib_uverbs_mmap,
+	.open	 = ib_uverbs_open,
+	.release = ib_uverbs_close,
+	.llseek	 = no_llseek,
+};
+
+static struct ib_client uverbs_client = {
+	.name   = "uverbs",
+	.add    = ib_uverbs_add_one,
+	.remove = ib_uverbs_remove_one
+};
+
+static ssize_t show_ibdev(struct device *device, struct device_attribute *attr,
+			  char *buf)
+{
+	struct ib_uverbs_device *dev = dev_get_drvdata(device);
+
+	if (!dev)
+		return -ENODEV;
+
+	return sprintf(buf, "%s\n", dev->ib_dev->name);
+}
+static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL);
+
+static ssize_t show_dev_abi_version(struct device *device,
+				    struct device_attribute *attr, char *buf)
+{
+	struct ib_uverbs_device *dev = dev_get_drvdata(device);
+
+	if (!dev)
+		return -ENODEV;
+
+	return sprintf(buf, "%d\n", dev->ib_dev->uverbs_abi_ver);
+}
+static DEVICE_ATTR(abi_version, S_IRUGO, show_dev_abi_version, NULL);
+
+static CLASS_ATTR_STRING(abi_version, S_IRUGO,
+			 __stringify(IB_USER_VERBS_ABI_VERSION));
+
+static dev_t overflow_maj;
+static DECLARE_BITMAP(overflow_map, IB_UVERBS_MAX_DEVICES);
+
+/*
+ * If we have more than IB_UVERBS_MAX_DEVICES, dynamically overflow by
+ * requesting a new major number and doubling the number of max devices we
+ * support. It's stupid, but simple.
+ */
+static int find_overflow_devnum(void)
+{
+	int ret;
+
+	if (!overflow_maj) {
+		ret = alloc_chrdev_region(&overflow_maj, 0, IB_UVERBS_MAX_DEVICES,
+					  "infiniband_verbs");
+		if (ret) {
+			printk(KERN_ERR "user_verbs: couldn't register dynamic device number\n");
+			return ret;
+		}
+	}
+
+	ret = find_first_zero_bit(overflow_map, IB_UVERBS_MAX_DEVICES);
+	if (ret >= IB_UVERBS_MAX_DEVICES)
+		return -1;
+
+	return ret;
+}
+
+static void ib_uverbs_add_one(struct ib_device *device)
+{
+	int devnum;
+	dev_t base;
+	struct ib_uverbs_device *uverbs_dev;
+
+	if (!device->alloc_ucontext)
+		return;
+
+	uverbs_dev = kzalloc(sizeof *uverbs_dev, GFP_KERNEL);
+	if (!uverbs_dev)
+		return;
+
+	kref_init(&uverbs_dev->ref);
+	init_completion(&uverbs_dev->comp);
+	uverbs_dev->xrcd_tree = RB_ROOT;
+	mutex_init(&uverbs_dev->xrcd_tree_mutex);
+
+	spin_lock(&map_lock);
+	devnum = find_first_zero_bit(dev_map, IB_UVERBS_MAX_DEVICES);
+	if (devnum >= IB_UVERBS_MAX_DEVICES) {
+		spin_unlock(&map_lock);
+		devnum = find_overflow_devnum();
+		if (devnum < 0)
+			goto err;
+
+		spin_lock(&map_lock);
+		uverbs_dev->devnum = devnum + IB_UVERBS_MAX_DEVICES;
+		base = devnum + overflow_maj;
+		set_bit(devnum, overflow_map);
+	} else {
+		uverbs_dev->devnum = devnum;
+		base = devnum + IB_UVERBS_BASE_DEV;
+		set_bit(devnum, dev_map);
+	}
+	spin_unlock(&map_lock);
+
+	uverbs_dev->ib_dev           = device;
+	uverbs_dev->num_comp_vectors = device->num_comp_vectors;
+
+	cdev_init(&uverbs_dev->cdev, NULL);
+	uverbs_dev->cdev.owner = THIS_MODULE;
+	uverbs_dev->cdev.ops = device->mmap ? &uverbs_mmap_fops : &uverbs_fops;
+	kobject_set_name(&uverbs_dev->cdev.kobj, "uverbs%d", uverbs_dev->devnum);
+	if (cdev_add(&uverbs_dev->cdev, base, 1))
+		goto err_cdev;
+
+	uverbs_dev->dev = device_create(uverbs_class, device->dma_device,
+					uverbs_dev->cdev.dev, uverbs_dev,
+					"uverbs%d", uverbs_dev->devnum);
+	if (IS_ERR(uverbs_dev->dev))
+		goto err_cdev;
+
+	if (device_create_file(uverbs_dev->dev, &dev_attr_ibdev))
+		goto err_class;
+	if (device_create_file(uverbs_dev->dev, &dev_attr_abi_version))
+		goto err_class;
+
+	ib_set_client_data(device, &uverbs_client, uverbs_dev);
+
+	return;
+
+err_class:
+	device_destroy(uverbs_class, uverbs_dev->cdev.dev);
+
+err_cdev:
+	cdev_del(&uverbs_dev->cdev);
+	if (uverbs_dev->devnum < IB_UVERBS_MAX_DEVICES)
+		clear_bit(devnum, dev_map);
+	else
+		clear_bit(devnum, overflow_map);
+
+err:
+	kref_put(&uverbs_dev->ref, ib_uverbs_release_dev);
+	wait_for_completion(&uverbs_dev->comp);
+	kfree(uverbs_dev);
+	return;
+}
+
+static void ib_uverbs_remove_one(struct ib_device *device)
+{
+	struct ib_uverbs_device *uverbs_dev = ib_get_client_data(device, &uverbs_client);
+
+	if (!uverbs_dev)
+		return;
+
+	dev_set_drvdata(uverbs_dev->dev, NULL);
+	device_destroy(uverbs_class, uverbs_dev->cdev.dev);
+	cdev_del(&uverbs_dev->cdev);
+
+	if (uverbs_dev->devnum < IB_UVERBS_MAX_DEVICES)
+		clear_bit(uverbs_dev->devnum, dev_map);
+	else
+		clear_bit(uverbs_dev->devnum - IB_UVERBS_MAX_DEVICES, overflow_map);
+
+	kref_put(&uverbs_dev->ref, ib_uverbs_release_dev);
+	wait_for_completion(&uverbs_dev->comp);
+	kfree(uverbs_dev);
+}
+
+static char *uverbs_devnode(struct device *dev, umode_t *mode)
+{
+	if (mode)
+		*mode = 0666;
+	return kasprintf(GFP_KERNEL, "infiniband/%s", dev_name(dev));
+}
+
+static int __init ib_uverbs_init(void)
+{
+	int ret;
+
+	ret = register_chrdev_region(IB_UVERBS_BASE_DEV, IB_UVERBS_MAX_DEVICES,
+				     "infiniband_verbs");
+	if (ret) {
+		printk(KERN_ERR "user_verbs: couldn't register device number\n");
+		goto out;
+	}
+
+	uverbs_class = class_create(THIS_MODULE, "infiniband_verbs");
+	if (IS_ERR(uverbs_class)) {
+		ret = PTR_ERR(uverbs_class);
+		printk(KERN_ERR "user_verbs: couldn't create class infiniband_verbs\n");
+		goto out_chrdev;
+	}
+
+	uverbs_class->devnode = uverbs_devnode;
+
+	ret = class_create_file(uverbs_class, &class_attr_abi_version.attr);
+	if (ret) {
+		printk(KERN_ERR "user_verbs: couldn't create abi_version attribute\n");
+		goto out_class;
+	}
+
+	ret = ib_register_client(&uverbs_client);
+	if (ret) {
+		printk(KERN_ERR "user_verbs: couldn't register client\n");
+		goto out_class;
+	}
+
+	return 0;
+
+out_class:
+	class_destroy(uverbs_class);
+
+out_chrdev:
+	unregister_chrdev_region(IB_UVERBS_BASE_DEV, IB_UVERBS_MAX_DEVICES);
+
+out:
+	return ret;
+}
+
+static void __exit ib_uverbs_cleanup(void)
+{
+	ib_unregister_client(&uverbs_client);
+	class_destroy(uverbs_class);
+	unregister_chrdev_region(IB_UVERBS_BASE_DEV, IB_UVERBS_MAX_DEVICES);
+	if (overflow_maj)
+		unregister_chrdev_region(overflow_maj, IB_UVERBS_MAX_DEVICES);
+	idr_destroy(&ib_uverbs_pd_idr);
+	idr_destroy(&ib_uverbs_mr_idr);
+	idr_destroy(&ib_uverbs_mw_idr);
+	idr_destroy(&ib_uverbs_ah_idr);
+	idr_destroy(&ib_uverbs_cq_idr);
+	idr_destroy(&ib_uverbs_qp_idr);
+	idr_destroy(&ib_uverbs_srq_idr);
+}
+
+module_init(ib_uverbs_init);
+module_exit(ib_uverbs_cleanup);
diff --git a/drivers/infiniband/core/uverbs_marshall.c b/drivers/infiniband/core/uverbs_marshall.c
new file mode 100644
index 0000000..abd9724
--- /dev/null
+++ b/drivers/infiniband/core/uverbs_marshall.c
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2005 Intel Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/export.h>
+#include <rdma/ib_marshall.h>
+
+void ib_copy_ah_attr_to_user(struct ib_uverbs_ah_attr *dst,
+			     struct ib_ah_attr *src)
+{
+	memcpy(dst->grh.dgid, src->grh.dgid.raw, sizeof src->grh.dgid);
+	dst->grh.flow_label        = src->grh.flow_label;
+	dst->grh.sgid_index        = src->grh.sgid_index;
+	dst->grh.hop_limit         = src->grh.hop_limit;
+	dst->grh.traffic_class     = src->grh.traffic_class;
+	memset(&dst->grh.reserved, 0, sizeof(dst->grh.reserved));
+	dst->dlid 	    	   = src->dlid;
+	dst->sl   	    	   = src->sl;
+	dst->src_path_bits 	   = src->src_path_bits;
+	dst->static_rate   	   = src->static_rate;
+	dst->is_global             = src->ah_flags & IB_AH_GRH ? 1 : 0;
+	dst->port_num 	    	   = src->port_num;
+	dst->reserved 		   = 0;
+}
+EXPORT_SYMBOL(ib_copy_ah_attr_to_user);
+
+void ib_copy_qp_attr_to_user(struct ib_uverbs_qp_attr *dst,
+			     struct ib_qp_attr *src)
+{
+	dst->qp_state	        = src->qp_state;
+	dst->cur_qp_state	= src->cur_qp_state;
+	dst->path_mtu		= src->path_mtu;
+	dst->path_mig_state	= src->path_mig_state;
+	dst->qkey		= src->qkey;
+	dst->rq_psn		= src->rq_psn;
+	dst->sq_psn		= src->sq_psn;
+	dst->dest_qp_num	= src->dest_qp_num;
+	dst->qp_access_flags	= src->qp_access_flags;
+
+	dst->max_send_wr	= src->cap.max_send_wr;
+	dst->max_recv_wr	= src->cap.max_recv_wr;
+	dst->max_send_sge	= src->cap.max_send_sge;
+	dst->max_recv_sge	= src->cap.max_recv_sge;
+	dst->max_inline_data	= src->cap.max_inline_data;
+
+	ib_copy_ah_attr_to_user(&dst->ah_attr, &src->ah_attr);
+	ib_copy_ah_attr_to_user(&dst->alt_ah_attr, &src->alt_ah_attr);
+
+	dst->pkey_index		= src->pkey_index;
+	dst->alt_pkey_index	= src->alt_pkey_index;
+	dst->en_sqd_async_notify = src->en_sqd_async_notify;
+	dst->sq_draining	= src->sq_draining;
+	dst->max_rd_atomic	= src->max_rd_atomic;
+	dst->max_dest_rd_atomic	= src->max_dest_rd_atomic;
+	dst->min_rnr_timer	= src->min_rnr_timer;
+	dst->port_num		= src->port_num;
+	dst->timeout		= src->timeout;
+	dst->retry_cnt		= src->retry_cnt;
+	dst->rnr_retry		= src->rnr_retry;
+	dst->alt_port_num	= src->alt_port_num;
+	dst->alt_timeout	= src->alt_timeout;
+	memset(dst->reserved, 0, sizeof(dst->reserved));
+}
+EXPORT_SYMBOL(ib_copy_qp_attr_to_user);
+
+void ib_copy_path_rec_to_user(struct ib_user_path_rec *dst,
+			      struct ib_sa_path_rec *src)
+{
+	memcpy(dst->dgid, src->dgid.raw, sizeof src->dgid);
+	memcpy(dst->sgid, src->sgid.raw, sizeof src->sgid);
+
+	dst->dlid		= src->dlid;
+	dst->slid		= src->slid;
+	dst->raw_traffic	= src->raw_traffic;
+	dst->flow_label		= src->flow_label;
+	dst->hop_limit		= src->hop_limit;
+	dst->traffic_class	= src->traffic_class;
+	dst->reversible		= src->reversible;
+	dst->numb_path		= src->numb_path;
+	dst->pkey		= src->pkey;
+	dst->sl			= src->sl;
+	dst->mtu_selector	= src->mtu_selector;
+	dst->mtu		= src->mtu;
+	dst->rate_selector	= src->rate_selector;
+	dst->rate		= src->rate;
+	dst->packet_life_time	= src->packet_life_time;
+	dst->preference		= src->preference;
+	dst->packet_life_time_selector = src->packet_life_time_selector;
+}
+EXPORT_SYMBOL(ib_copy_path_rec_to_user);
+
+void ib_copy_path_rec_from_user(struct ib_sa_path_rec *dst,
+				struct ib_user_path_rec *src)
+{
+	memcpy(dst->dgid.raw, src->dgid, sizeof dst->dgid);
+	memcpy(dst->sgid.raw, src->sgid, sizeof dst->sgid);
+
+	dst->dlid		= src->dlid;
+	dst->slid		= src->slid;
+	dst->raw_traffic	= src->raw_traffic;
+	dst->flow_label		= src->flow_label;
+	dst->hop_limit		= src->hop_limit;
+	dst->traffic_class	= src->traffic_class;
+	dst->reversible		= src->reversible;
+	dst->numb_path		= src->numb_path;
+	dst->pkey		= src->pkey;
+	dst->sl			= src->sl;
+	dst->mtu_selector	= src->mtu_selector;
+	dst->mtu		= src->mtu;
+	dst->rate_selector	= src->rate_selector;
+	dst->rate		= src->rate;
+	dst->packet_life_time	= src->packet_life_time;
+	dst->preference		= src->preference;
+	dst->packet_life_time_selector = src->packet_life_time_selector;
+
+	memset(dst->smac, 0, sizeof(dst->smac));
+	memset(dst->dmac, 0, sizeof(dst->dmac));
+	dst->vlan_id = 0xffff;
+}
+EXPORT_SYMBOL(ib_copy_path_rec_from_user);
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
new file mode 100644
index 0000000..c2b89cc
--- /dev/null
+++ b/drivers/infiniband/core/verbs.c
@@ -0,0 +1,1447 @@
+/*
+ * Copyright (c) 2004 Mellanox Technologies Ltd.  All rights reserved.
+ * Copyright (c) 2004 Infinicon Corporation.  All rights reserved.
+ * Copyright (c) 2004 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation.  All rights reserved.
+ * Copyright (c) 2004 Voltaire Corporation.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005, 2006 Cisco Systems.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/export.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_cache.h>
+#include <rdma/ib_addr.h>
+
+#include "core_priv.h"
+
+__attribute_const__ int ib_rate_to_mult(enum ib_rate rate)
+{
+	switch (rate) {
+	case IB_RATE_2_5_GBPS: return  1;
+	case IB_RATE_5_GBPS:   return  2;
+	case IB_RATE_10_GBPS:  return  4;
+	case IB_RATE_20_GBPS:  return  8;
+	case IB_RATE_30_GBPS:  return 12;
+	case IB_RATE_40_GBPS:  return 16;
+	case IB_RATE_60_GBPS:  return 24;
+	case IB_RATE_80_GBPS:  return 32;
+	case IB_RATE_120_GBPS: return 48;
+	default:	       return -1;
+	}
+}
+EXPORT_SYMBOL(ib_rate_to_mult);
+
+__attribute_const__ enum ib_rate mult_to_ib_rate(int mult)
+{
+	switch (mult) {
+	case 1:  return IB_RATE_2_5_GBPS;
+	case 2:  return IB_RATE_5_GBPS;
+	case 4:  return IB_RATE_10_GBPS;
+	case 8:  return IB_RATE_20_GBPS;
+	case 12: return IB_RATE_30_GBPS;
+	case 16: return IB_RATE_40_GBPS;
+	case 24: return IB_RATE_60_GBPS;
+	case 32: return IB_RATE_80_GBPS;
+	case 48: return IB_RATE_120_GBPS;
+	default: return IB_RATE_PORT_CURRENT;
+	}
+}
+EXPORT_SYMBOL(mult_to_ib_rate);
+
+__attribute_const__ int ib_rate_to_mbps(enum ib_rate rate)
+{
+	switch (rate) {
+	case IB_RATE_2_5_GBPS: return 2500;
+	case IB_RATE_5_GBPS:   return 5000;
+	case IB_RATE_10_GBPS:  return 10000;
+	case IB_RATE_20_GBPS:  return 20000;
+	case IB_RATE_30_GBPS:  return 30000;
+	case IB_RATE_40_GBPS:  return 40000;
+	case IB_RATE_60_GBPS:  return 60000;
+	case IB_RATE_80_GBPS:  return 80000;
+	case IB_RATE_120_GBPS: return 120000;
+	case IB_RATE_14_GBPS:  return 14062;
+	case IB_RATE_56_GBPS:  return 56250;
+	case IB_RATE_112_GBPS: return 112500;
+	case IB_RATE_168_GBPS: return 168750;
+	case IB_RATE_25_GBPS:  return 25781;
+	case IB_RATE_100_GBPS: return 103125;
+	case IB_RATE_200_GBPS: return 206250;
+	case IB_RATE_300_GBPS: return 309375;
+	default:	       return -1;
+	}
+}
+EXPORT_SYMBOL(ib_rate_to_mbps);
+
+__attribute_const__ enum rdma_transport_type
+rdma_node_get_transport(enum rdma_node_type node_type)
+{
+	switch (node_type) {
+	case RDMA_NODE_IB_CA:
+	case RDMA_NODE_IB_SWITCH:
+	case RDMA_NODE_IB_ROUTER:
+		return RDMA_TRANSPORT_IB;
+	case RDMA_NODE_RNIC:
+		return RDMA_TRANSPORT_IWARP;
+	case RDMA_NODE_USNIC:
+		return RDMA_TRANSPORT_USNIC;
+	case RDMA_NODE_USNIC_UDP:
+		return RDMA_TRANSPORT_USNIC_UDP;
+	default:
+		BUG();
+		return 0;
+	}
+}
+EXPORT_SYMBOL(rdma_node_get_transport);
+
+enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device, u8 port_num)
+{
+	if (device->get_link_layer)
+		return device->get_link_layer(device, port_num);
+
+	switch (rdma_node_get_transport(device->node_type)) {
+	case RDMA_TRANSPORT_IB:
+		return IB_LINK_LAYER_INFINIBAND;
+	case RDMA_TRANSPORT_IWARP:
+	case RDMA_TRANSPORT_USNIC:
+	case RDMA_TRANSPORT_USNIC_UDP:
+		return IB_LINK_LAYER_ETHERNET;
+	default:
+		return IB_LINK_LAYER_UNSPECIFIED;
+	}
+}
+EXPORT_SYMBOL(rdma_port_get_link_layer);
+
+/* Protection domains */
+
+struct ib_pd *ib_alloc_pd(struct ib_device *device)
+{
+	struct ib_pd *pd;
+
+	pd = device->alloc_pd(device, NULL, NULL);
+
+	if (!IS_ERR(pd)) {
+		pd->device  = device;
+		pd->uobject = NULL;
+		atomic_set(&pd->usecnt, 0);
+	}
+
+	return pd;
+}
+EXPORT_SYMBOL(ib_alloc_pd);
+
+int ib_dealloc_pd(struct ib_pd *pd)
+{
+	if (atomic_read(&pd->usecnt))
+		return -EBUSY;
+
+	return pd->device->dealloc_pd(pd);
+}
+EXPORT_SYMBOL(ib_dealloc_pd);
+
+/* Address handles */
+
+struct ib_ah *ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr)
+{
+	struct ib_ah *ah;
+
+	ah = pd->device->create_ah(pd, ah_attr);
+
+	if (!IS_ERR(ah)) {
+		ah->device  = pd->device;
+		ah->pd      = pd;
+		ah->uobject = NULL;
+		atomic_inc(&pd->usecnt);
+	}
+
+	return ah;
+}
+EXPORT_SYMBOL(ib_create_ah);
+
+int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, struct ib_wc *wc,
+		       struct ib_grh *grh, struct ib_ah_attr *ah_attr)
+{
+	u32 flow_class;
+	u16 gid_index;
+	int ret;
+	int is_eth = (rdma_port_get_link_layer(device, port_num) ==
+			IB_LINK_LAYER_ETHERNET);
+
+	memset(ah_attr, 0, sizeof *ah_attr);
+	if (is_eth) {
+		if (!(wc->wc_flags & IB_WC_GRH))
+			return -EPROTOTYPE;
+
+		if (wc->wc_flags & IB_WC_WITH_SMAC &&
+		    wc->wc_flags & IB_WC_WITH_VLAN) {
+			memcpy(ah_attr->dmac, wc->smac, ETH_ALEN);
+			ah_attr->vlan_id = wc->vlan_id;
+		} else {
+			ret = rdma_addr_find_dmac_by_grh(&grh->dgid, &grh->sgid,
+					ah_attr->dmac, &ah_attr->vlan_id);
+			if (ret)
+				return ret;
+		}
+	} else {
+		ah_attr->vlan_id = 0xffff;
+	}
+
+	ah_attr->dlid = wc->slid;
+	ah_attr->sl = wc->sl;
+	ah_attr->src_path_bits = wc->dlid_path_bits;
+	ah_attr->port_num = port_num;
+
+	if (wc->wc_flags & IB_WC_GRH) {
+		ah_attr->ah_flags = IB_AH_GRH;
+		ah_attr->grh.dgid = grh->sgid;
+
+		ret = ib_find_cached_gid(device, &grh->dgid, &port_num,
+					 &gid_index);
+		if (ret)
+			return ret;
+
+		ah_attr->grh.sgid_index = (u8) gid_index;
+		flow_class = be32_to_cpu(grh->version_tclass_flow);
+		ah_attr->grh.flow_label = flow_class & 0xFFFFF;
+		ah_attr->grh.hop_limit = 0xFF;
+		ah_attr->grh.traffic_class = (flow_class >> 20) & 0xFF;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(ib_init_ah_from_wc);
+
+struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, struct ib_wc *wc,
+				   struct ib_grh *grh, u8 port_num)
+{
+	struct ib_ah_attr ah_attr;
+	int ret;
+
+	ret = ib_init_ah_from_wc(pd->device, port_num, wc, grh, &ah_attr);
+	if (ret)
+		return ERR_PTR(ret);
+
+	return ib_create_ah(pd, &ah_attr);
+}
+EXPORT_SYMBOL(ib_create_ah_from_wc);
+
+int ib_modify_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr)
+{
+	return ah->device->modify_ah ?
+		ah->device->modify_ah(ah, ah_attr) :
+		-ENOSYS;
+}
+EXPORT_SYMBOL(ib_modify_ah);
+
+int ib_query_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr)
+{
+	return ah->device->query_ah ?
+		ah->device->query_ah(ah, ah_attr) :
+		-ENOSYS;
+}
+EXPORT_SYMBOL(ib_query_ah);
+
+int ib_destroy_ah(struct ib_ah *ah)
+{
+	struct ib_pd *pd;
+	int ret;
+
+	pd = ah->pd;
+	ret = ah->device->destroy_ah(ah);
+	if (!ret)
+		atomic_dec(&pd->usecnt);
+
+	return ret;
+}
+EXPORT_SYMBOL(ib_destroy_ah);
+
+/* Shared receive queues */
+
+struct ib_srq *ib_create_srq(struct ib_pd *pd,
+			     struct ib_srq_init_attr *srq_init_attr)
+{
+	struct ib_srq *srq;
+
+	if (!pd->device->create_srq)
+		return ERR_PTR(-ENOSYS);
+
+	srq = pd->device->create_srq(pd, srq_init_attr, NULL);
+
+	if (!IS_ERR(srq)) {
+		srq->device    	   = pd->device;
+		srq->pd        	   = pd;
+		srq->uobject       = NULL;
+		srq->event_handler = srq_init_attr->event_handler;
+		srq->srq_context   = srq_init_attr->srq_context;
+		srq->srq_type      = srq_init_attr->srq_type;
+		if (srq->srq_type == IB_SRQT_XRC) {
+			srq->ext.xrc.xrcd = srq_init_attr->ext.xrc.xrcd;
+			srq->ext.xrc.cq   = srq_init_attr->ext.xrc.cq;
+			atomic_inc(&srq->ext.xrc.xrcd->usecnt);
+			atomic_inc(&srq->ext.xrc.cq->usecnt);
+		}
+		atomic_inc(&pd->usecnt);
+		atomic_set(&srq->usecnt, 0);
+	}
+
+	return srq;
+}
+EXPORT_SYMBOL(ib_create_srq);
+
+int ib_modify_srq(struct ib_srq *srq,
+		  struct ib_srq_attr *srq_attr,
+		  enum ib_srq_attr_mask srq_attr_mask)
+{
+	return srq->device->modify_srq ?
+		srq->device->modify_srq(srq, srq_attr, srq_attr_mask, NULL) :
+		-ENOSYS;
+}
+EXPORT_SYMBOL(ib_modify_srq);
+
+int ib_query_srq(struct ib_srq *srq,
+		 struct ib_srq_attr *srq_attr)
+{
+	return srq->device->query_srq ?
+		srq->device->query_srq(srq, srq_attr) : -ENOSYS;
+}
+EXPORT_SYMBOL(ib_query_srq);
+
+int ib_destroy_srq(struct ib_srq *srq)
+{
+	struct ib_pd *pd;
+	enum ib_srq_type srq_type;
+	struct ib_xrcd *uninitialized_var(xrcd);
+	struct ib_cq *uninitialized_var(cq);
+	int ret;
+
+	if (atomic_read(&srq->usecnt))
+		return -EBUSY;
+
+	pd = srq->pd;
+	srq_type = srq->srq_type;
+	if (srq_type == IB_SRQT_XRC) {
+		xrcd = srq->ext.xrc.xrcd;
+		cq = srq->ext.xrc.cq;
+	}
+
+	ret = srq->device->destroy_srq(srq);
+	if (!ret) {
+		atomic_dec(&pd->usecnt);
+		if (srq_type == IB_SRQT_XRC) {
+			atomic_dec(&xrcd->usecnt);
+			atomic_dec(&cq->usecnt);
+		}
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(ib_destroy_srq);
+
+/* Queue pairs */
+
+static void __ib_shared_qp_event_handler(struct ib_event *event, void *context)
+{
+	struct ib_qp *qp = context;
+	unsigned long flags;
+
+	spin_lock_irqsave(&qp->device->event_handler_lock, flags);
+	list_for_each_entry(event->element.qp, &qp->open_list, open_list)
+		if (event->element.qp->event_handler)
+			event->element.qp->event_handler(event, event->element.qp->qp_context);
+	spin_unlock_irqrestore(&qp->device->event_handler_lock, flags);
+}
+
+static void __ib_insert_xrcd_qp(struct ib_xrcd *xrcd, struct ib_qp *qp)
+{
+	mutex_lock(&xrcd->tgt_qp_mutex);
+	list_add(&qp->xrcd_list, &xrcd->tgt_qp_list);
+	mutex_unlock(&xrcd->tgt_qp_mutex);
+}
+
+static struct ib_qp *__ib_open_qp(struct ib_qp *real_qp,
+				  void (*event_handler)(struct ib_event *, void *),
+				  void *qp_context)
+{
+	struct ib_qp *qp;
+	unsigned long flags;
+
+	qp = kzalloc(sizeof *qp, GFP_KERNEL);
+	if (!qp)
+		return ERR_PTR(-ENOMEM);
+
+	qp->real_qp = real_qp;
+	atomic_inc(&real_qp->usecnt);
+	qp->device = real_qp->device;
+	qp->event_handler = event_handler;
+	qp->qp_context = qp_context;
+	qp->qp_num = real_qp->qp_num;
+	qp->qp_type = real_qp->qp_type;
+
+	spin_lock_irqsave(&real_qp->device->event_handler_lock, flags);
+	list_add(&qp->open_list, &real_qp->open_list);
+	spin_unlock_irqrestore(&real_qp->device->event_handler_lock, flags);
+
+	return qp;
+}
+
+struct ib_qp *ib_open_qp(struct ib_xrcd *xrcd,
+			 struct ib_qp_open_attr *qp_open_attr)
+{
+	struct ib_qp *qp, *real_qp;
+
+	if (qp_open_attr->qp_type != IB_QPT_XRC_TGT)
+		return ERR_PTR(-EINVAL);
+
+	qp = ERR_PTR(-EINVAL);
+	mutex_lock(&xrcd->tgt_qp_mutex);
+	list_for_each_entry(real_qp, &xrcd->tgt_qp_list, xrcd_list) {
+		if (real_qp->qp_num == qp_open_attr->qp_num) {
+			qp = __ib_open_qp(real_qp, qp_open_attr->event_handler,
+					  qp_open_attr->qp_context);
+			break;
+		}
+	}
+	mutex_unlock(&xrcd->tgt_qp_mutex);
+	return qp;
+}
+EXPORT_SYMBOL(ib_open_qp);
+
+struct ib_qp *ib_create_qp(struct ib_pd *pd,
+			   struct ib_qp_init_attr *qp_init_attr)
+{
+	struct ib_qp *qp, *real_qp;
+	struct ib_device *device;
+
+	device = pd ? pd->device : qp_init_attr->xrcd->device;
+	qp = device->create_qp(pd, qp_init_attr, NULL);
+
+	if (!IS_ERR(qp)) {
+		qp->device     = device;
+		qp->real_qp    = qp;
+		qp->uobject    = NULL;
+		qp->qp_type    = qp_init_attr->qp_type;
+
+		atomic_set(&qp->usecnt, 0);
+		if (qp_init_attr->qp_type == IB_QPT_XRC_TGT) {
+			qp->event_handler = __ib_shared_qp_event_handler;
+			qp->qp_context = qp;
+			qp->pd = NULL;
+			qp->send_cq = qp->recv_cq = NULL;
+			qp->srq = NULL;
+			qp->xrcd = qp_init_attr->xrcd;
+			atomic_inc(&qp_init_attr->xrcd->usecnt);
+			INIT_LIST_HEAD(&qp->open_list);
+
+			real_qp = qp;
+			qp = __ib_open_qp(real_qp, qp_init_attr->event_handler,
+					  qp_init_attr->qp_context);
+			if (!IS_ERR(qp))
+				__ib_insert_xrcd_qp(qp_init_attr->xrcd, real_qp);
+			else
+				real_qp->device->destroy_qp(real_qp);
+		} else {
+			qp->event_handler = qp_init_attr->event_handler;
+			qp->qp_context = qp_init_attr->qp_context;
+			if (qp_init_attr->qp_type == IB_QPT_XRC_INI) {
+				qp->recv_cq = NULL;
+				qp->srq = NULL;
+			} else {
+				qp->recv_cq = qp_init_attr->recv_cq;
+				atomic_inc(&qp_init_attr->recv_cq->usecnt);
+				qp->srq = qp_init_attr->srq;
+				if (qp->srq)
+					atomic_inc(&qp_init_attr->srq->usecnt);
+			}
+
+			qp->pd	    = pd;
+			qp->send_cq = qp_init_attr->send_cq;
+			qp->xrcd    = NULL;
+
+			atomic_inc(&pd->usecnt);
+			atomic_inc(&qp_init_attr->send_cq->usecnt);
+		}
+	}
+
+	return qp;
+}
+EXPORT_SYMBOL(ib_create_qp);
+
+static const struct {
+	int			valid;
+	enum ib_qp_attr_mask	req_param[IB_QPT_MAX];
+	enum ib_qp_attr_mask	req_param_add_eth[IB_QPT_MAX];
+	enum ib_qp_attr_mask	opt_param[IB_QPT_MAX];
+	enum ib_qp_attr_mask	opt_param_add_eth[IB_QPT_MAX];
+} qp_state_table[IB_QPS_ERR + 1][IB_QPS_ERR + 1] = {
+	[IB_QPS_RESET] = {
+		[IB_QPS_RESET] = { .valid = 1 },
+		[IB_QPS_INIT]  = {
+			.valid = 1,
+			.req_param = {
+				[IB_QPT_UD]  = (IB_QP_PKEY_INDEX		|
+						IB_QP_PORT			|
+						IB_QP_QKEY),
+				[IB_QPT_RAW_PACKET] = IB_QP_PORT,
+				[IB_QPT_UC]  = (IB_QP_PKEY_INDEX		|
+						IB_QP_PORT			|
+						IB_QP_ACCESS_FLAGS),
+				[IB_QPT_RC]  = (IB_QP_PKEY_INDEX		|
+						IB_QP_PORT			|
+						IB_QP_ACCESS_FLAGS),
+				[IB_QPT_XRC_INI] = (IB_QP_PKEY_INDEX		|
+						IB_QP_PORT			|
+						IB_QP_ACCESS_FLAGS),
+				[IB_QPT_XRC_TGT] = (IB_QP_PKEY_INDEX		|
+						IB_QP_PORT			|
+						IB_QP_ACCESS_FLAGS),
+				[IB_QPT_SMI] = (IB_QP_PKEY_INDEX		|
+						IB_QP_QKEY),
+				[IB_QPT_GSI] = (IB_QP_PKEY_INDEX		|
+						IB_QP_QKEY),
+			}
+		},
+	},
+	[IB_QPS_INIT]  = {
+		[IB_QPS_RESET] = { .valid = 1 },
+		[IB_QPS_ERR] =   { .valid = 1 },
+		[IB_QPS_INIT]  = {
+			.valid = 1,
+			.opt_param = {
+				[IB_QPT_UD]  = (IB_QP_PKEY_INDEX		|
+						IB_QP_PORT			|
+						IB_QP_QKEY),
+				[IB_QPT_UC]  = (IB_QP_PKEY_INDEX		|
+						IB_QP_PORT			|
+						IB_QP_ACCESS_FLAGS),
+				[IB_QPT_RC]  = (IB_QP_PKEY_INDEX		|
+						IB_QP_PORT			|
+						IB_QP_ACCESS_FLAGS),
+				[IB_QPT_XRC_INI] = (IB_QP_PKEY_INDEX		|
+						IB_QP_PORT			|
+						IB_QP_ACCESS_FLAGS),
+				[IB_QPT_XRC_TGT] = (IB_QP_PKEY_INDEX		|
+						IB_QP_PORT			|
+						IB_QP_ACCESS_FLAGS),
+				[IB_QPT_SMI] = (IB_QP_PKEY_INDEX		|
+						IB_QP_QKEY),
+				[IB_QPT_GSI] = (IB_QP_PKEY_INDEX		|
+						IB_QP_QKEY),
+			}
+		},
+		[IB_QPS_RTR]   = {
+			.valid = 1,
+			.req_param = {
+				[IB_QPT_UC]  = (IB_QP_AV			|
+						IB_QP_PATH_MTU			|
+						IB_QP_DEST_QPN			|
+						IB_QP_RQ_PSN),
+				[IB_QPT_RC]  = (IB_QP_AV			|
+						IB_QP_PATH_MTU			|
+						IB_QP_DEST_QPN			|
+						IB_QP_RQ_PSN			|
+						IB_QP_MAX_DEST_RD_ATOMIC	|
+						IB_QP_MIN_RNR_TIMER),
+				[IB_QPT_XRC_INI] = (IB_QP_AV			|
+						IB_QP_PATH_MTU			|
+						IB_QP_DEST_QPN			|
+						IB_QP_RQ_PSN),
+				[IB_QPT_XRC_TGT] = (IB_QP_AV			|
+						IB_QP_PATH_MTU			|
+						IB_QP_DEST_QPN			|
+						IB_QP_RQ_PSN			|
+						IB_QP_MAX_DEST_RD_ATOMIC	|
+						IB_QP_MIN_RNR_TIMER),
+			},
+			.req_param_add_eth = {
+				[IB_QPT_RC]  = (IB_QP_SMAC),
+				[IB_QPT_UC]  = (IB_QP_SMAC),
+				[IB_QPT_XRC_INI]  = (IB_QP_SMAC),
+				[IB_QPT_XRC_TGT]  = (IB_QP_SMAC)
+			},
+			.opt_param = {
+				 [IB_QPT_UD]  = (IB_QP_PKEY_INDEX		|
+						 IB_QP_QKEY),
+				 [IB_QPT_UC]  = (IB_QP_ALT_PATH			|
+						 IB_QP_ACCESS_FLAGS		|
+						 IB_QP_PKEY_INDEX),
+				 [IB_QPT_RC]  = (IB_QP_ALT_PATH			|
+						 IB_QP_ACCESS_FLAGS		|
+						 IB_QP_PKEY_INDEX),
+				 [IB_QPT_XRC_INI] = (IB_QP_ALT_PATH		|
+						 IB_QP_ACCESS_FLAGS		|
+						 IB_QP_PKEY_INDEX),
+				 [IB_QPT_XRC_TGT] = (IB_QP_ALT_PATH		|
+						 IB_QP_ACCESS_FLAGS		|
+						 IB_QP_PKEY_INDEX),
+				 [IB_QPT_SMI] = (IB_QP_PKEY_INDEX		|
+						 IB_QP_QKEY),
+				 [IB_QPT_GSI] = (IB_QP_PKEY_INDEX		|
+						 IB_QP_QKEY),
+			 },
+			.opt_param_add_eth = {
+				[IB_QPT_RC]  = (IB_QP_ALT_SMAC			|
+						IB_QP_VID			|
+						IB_QP_ALT_VID),
+				[IB_QPT_UC]  = (IB_QP_ALT_SMAC			|
+						IB_QP_VID			|
+						IB_QP_ALT_VID),
+				[IB_QPT_XRC_INI]  = (IB_QP_ALT_SMAC			|
+						IB_QP_VID			|
+						IB_QP_ALT_VID),
+				[IB_QPT_XRC_TGT]  = (IB_QP_ALT_SMAC			|
+						IB_QP_VID			|
+						IB_QP_ALT_VID)
+			}
+		}
+	},
+	[IB_QPS_RTR]   = {
+		[IB_QPS_RESET] = { .valid = 1 },
+		[IB_QPS_ERR] =   { .valid = 1 },
+		[IB_QPS_RTS]   = {
+			.valid = 1,
+			.req_param = {
+				[IB_QPT_UD]  = IB_QP_SQ_PSN,
+				[IB_QPT_UC]  = IB_QP_SQ_PSN,
+				[IB_QPT_RC]  = (IB_QP_TIMEOUT			|
+						IB_QP_RETRY_CNT			|
+						IB_QP_RNR_RETRY			|
+						IB_QP_SQ_PSN			|
+						IB_QP_MAX_QP_RD_ATOMIC),
+				[IB_QPT_XRC_INI] = (IB_QP_TIMEOUT		|
+						IB_QP_RETRY_CNT			|
+						IB_QP_RNR_RETRY			|
+						IB_QP_SQ_PSN			|
+						IB_QP_MAX_QP_RD_ATOMIC),
+				[IB_QPT_XRC_TGT] = (IB_QP_TIMEOUT		|
+						IB_QP_SQ_PSN),
+				[IB_QPT_SMI] = IB_QP_SQ_PSN,
+				[IB_QPT_GSI] = IB_QP_SQ_PSN,
+			},
+			.opt_param = {
+				 [IB_QPT_UD]  = (IB_QP_CUR_STATE		|
+						 IB_QP_QKEY),
+				 [IB_QPT_UC]  = (IB_QP_CUR_STATE		|
+						 IB_QP_ALT_PATH			|
+						 IB_QP_ACCESS_FLAGS		|
+						 IB_QP_PATH_MIG_STATE),
+				 [IB_QPT_RC]  = (IB_QP_CUR_STATE		|
+						 IB_QP_ALT_PATH			|
+						 IB_QP_ACCESS_FLAGS		|
+						 IB_QP_MIN_RNR_TIMER		|
+						 IB_QP_PATH_MIG_STATE),
+				 [IB_QPT_XRC_INI] = (IB_QP_CUR_STATE		|
+						 IB_QP_ALT_PATH			|
+						 IB_QP_ACCESS_FLAGS		|
+						 IB_QP_PATH_MIG_STATE),
+				 [IB_QPT_XRC_TGT] = (IB_QP_CUR_STATE		|
+						 IB_QP_ALT_PATH			|
+						 IB_QP_ACCESS_FLAGS		|
+						 IB_QP_MIN_RNR_TIMER		|
+						 IB_QP_PATH_MIG_STATE),
+				 [IB_QPT_SMI] = (IB_QP_CUR_STATE		|
+						 IB_QP_QKEY),
+				 [IB_QPT_GSI] = (IB_QP_CUR_STATE		|
+						 IB_QP_QKEY),
+			 }
+		}
+	},
+	[IB_QPS_RTS]   = {
+		[IB_QPS_RESET] = { .valid = 1 },
+		[IB_QPS_ERR] =   { .valid = 1 },
+		[IB_QPS_RTS]   = {
+			.valid = 1,
+			.opt_param = {
+				[IB_QPT_UD]  = (IB_QP_CUR_STATE			|
+						IB_QP_QKEY),
+				[IB_QPT_UC]  = (IB_QP_CUR_STATE			|
+						IB_QP_ACCESS_FLAGS		|
+						IB_QP_ALT_PATH			|
+						IB_QP_PATH_MIG_STATE),
+				[IB_QPT_RC]  = (IB_QP_CUR_STATE			|
+						IB_QP_ACCESS_FLAGS		|
+						IB_QP_ALT_PATH			|
+						IB_QP_PATH_MIG_STATE		|
+						IB_QP_MIN_RNR_TIMER),
+				[IB_QPT_XRC_INI] = (IB_QP_CUR_STATE		|
+						IB_QP_ACCESS_FLAGS		|
+						IB_QP_ALT_PATH			|
+						IB_QP_PATH_MIG_STATE),
+				[IB_QPT_XRC_TGT] = (IB_QP_CUR_STATE		|
+						IB_QP_ACCESS_FLAGS		|
+						IB_QP_ALT_PATH			|
+						IB_QP_PATH_MIG_STATE		|
+						IB_QP_MIN_RNR_TIMER),
+				[IB_QPT_SMI] = (IB_QP_CUR_STATE			|
+						IB_QP_QKEY),
+				[IB_QPT_GSI] = (IB_QP_CUR_STATE			|
+						IB_QP_QKEY),
+			}
+		},
+		[IB_QPS_SQD]   = {
+			.valid = 1,
+			.opt_param = {
+				[IB_QPT_UD]  = IB_QP_EN_SQD_ASYNC_NOTIFY,
+				[IB_QPT_UC]  = IB_QP_EN_SQD_ASYNC_NOTIFY,
+				[IB_QPT_RC]  = IB_QP_EN_SQD_ASYNC_NOTIFY,
+				[IB_QPT_XRC_INI] = IB_QP_EN_SQD_ASYNC_NOTIFY,
+				[IB_QPT_XRC_TGT] = IB_QP_EN_SQD_ASYNC_NOTIFY, /* ??? */
+				[IB_QPT_SMI] = IB_QP_EN_SQD_ASYNC_NOTIFY,
+				[IB_QPT_GSI] = IB_QP_EN_SQD_ASYNC_NOTIFY
+			}
+		},
+	},
+	[IB_QPS_SQD]   = {
+		[IB_QPS_RESET] = { .valid = 1 },
+		[IB_QPS_ERR] =   { .valid = 1 },
+		[IB_QPS_RTS]   = {
+			.valid = 1,
+			.opt_param = {
+				[IB_QPT_UD]  = (IB_QP_CUR_STATE			|
+						IB_QP_QKEY),
+				[IB_QPT_UC]  = (IB_QP_CUR_STATE			|
+						IB_QP_ALT_PATH			|
+						IB_QP_ACCESS_FLAGS		|
+						IB_QP_PATH_MIG_STATE),
+				[IB_QPT_RC]  = (IB_QP_CUR_STATE			|
+						IB_QP_ALT_PATH			|
+						IB_QP_ACCESS_FLAGS		|
+						IB_QP_MIN_RNR_TIMER		|
+						IB_QP_PATH_MIG_STATE),
+				[IB_QPT_XRC_INI] = (IB_QP_CUR_STATE		|
+						IB_QP_ALT_PATH			|
+						IB_QP_ACCESS_FLAGS		|
+						IB_QP_PATH_MIG_STATE),
+				[IB_QPT_XRC_TGT] = (IB_QP_CUR_STATE		|
+						IB_QP_ALT_PATH			|
+						IB_QP_ACCESS_FLAGS		|
+						IB_QP_MIN_RNR_TIMER		|
+						IB_QP_PATH_MIG_STATE),
+				[IB_QPT_SMI] = (IB_QP_CUR_STATE			|
+						IB_QP_QKEY),
+				[IB_QPT_GSI] = (IB_QP_CUR_STATE			|
+						IB_QP_QKEY),
+			}
+		},
+		[IB_QPS_SQD]   = {
+			.valid = 1,
+			.opt_param = {
+				[IB_QPT_UD]  = (IB_QP_PKEY_INDEX		|
+						IB_QP_QKEY),
+				[IB_QPT_UC]  = (IB_QP_AV			|
+						IB_QP_ALT_PATH			|
+						IB_QP_ACCESS_FLAGS		|
+						IB_QP_PKEY_INDEX		|
+						IB_QP_PATH_MIG_STATE),
+				[IB_QPT_RC]  = (IB_QP_PORT			|
+						IB_QP_AV			|
+						IB_QP_TIMEOUT			|
+						IB_QP_RETRY_CNT			|
+						IB_QP_RNR_RETRY			|
+						IB_QP_MAX_QP_RD_ATOMIC		|
+						IB_QP_MAX_DEST_RD_ATOMIC	|
+						IB_QP_ALT_PATH			|
+						IB_QP_ACCESS_FLAGS		|
+						IB_QP_PKEY_INDEX		|
+						IB_QP_MIN_RNR_TIMER		|
+						IB_QP_PATH_MIG_STATE),
+				[IB_QPT_XRC_INI] = (IB_QP_PORT			|
+						IB_QP_AV			|
+						IB_QP_TIMEOUT			|
+						IB_QP_RETRY_CNT			|
+						IB_QP_RNR_RETRY			|
+						IB_QP_MAX_QP_RD_ATOMIC		|
+						IB_QP_ALT_PATH			|
+						IB_QP_ACCESS_FLAGS		|
+						IB_QP_PKEY_INDEX		|
+						IB_QP_PATH_MIG_STATE),
+				[IB_QPT_XRC_TGT] = (IB_QP_PORT			|
+						IB_QP_AV			|
+						IB_QP_TIMEOUT			|
+						IB_QP_MAX_DEST_RD_ATOMIC	|
+						IB_QP_ALT_PATH			|
+						IB_QP_ACCESS_FLAGS		|
+						IB_QP_PKEY_INDEX		|
+						IB_QP_MIN_RNR_TIMER		|
+						IB_QP_PATH_MIG_STATE),
+				[IB_QPT_SMI] = (IB_QP_PKEY_INDEX		|
+						IB_QP_QKEY),
+				[IB_QPT_GSI] = (IB_QP_PKEY_INDEX		|
+						IB_QP_QKEY),
+			}
+		}
+	},
+	[IB_QPS_SQE]   = {
+		[IB_QPS_RESET] = { .valid = 1 },
+		[IB_QPS_ERR] =   { .valid = 1 },
+		[IB_QPS_RTS]   = {
+			.valid = 1,
+			.opt_param = {
+				[IB_QPT_UD]  = (IB_QP_CUR_STATE			|
+						IB_QP_QKEY),
+				[IB_QPT_UC]  = (IB_QP_CUR_STATE			|
+						IB_QP_ACCESS_FLAGS),
+				[IB_QPT_SMI] = (IB_QP_CUR_STATE			|
+						IB_QP_QKEY),
+				[IB_QPT_GSI] = (IB_QP_CUR_STATE			|
+						IB_QP_QKEY),
+			}
+		}
+	},
+	[IB_QPS_ERR] = {
+		[IB_QPS_RESET] = { .valid = 1 },
+		[IB_QPS_ERR] =   { .valid = 1 }
+	}
+};
+
+int ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state,
+		       enum ib_qp_type type, enum ib_qp_attr_mask mask,
+		       enum rdma_link_layer ll)
+{
+	enum ib_qp_attr_mask req_param, opt_param;
+
+	if (cur_state  < 0 || cur_state  > IB_QPS_ERR ||
+	    next_state < 0 || next_state > IB_QPS_ERR)
+		return 0;
+
+	if (mask & IB_QP_CUR_STATE  &&
+	    cur_state != IB_QPS_RTR && cur_state != IB_QPS_RTS &&
+	    cur_state != IB_QPS_SQD && cur_state != IB_QPS_SQE)
+		return 0;
+
+	if (!qp_state_table[cur_state][next_state].valid)
+		return 0;
+
+	req_param = qp_state_table[cur_state][next_state].req_param[type];
+	opt_param = qp_state_table[cur_state][next_state].opt_param[type];
+
+	if (ll == IB_LINK_LAYER_ETHERNET) {
+		req_param |= qp_state_table[cur_state][next_state].
+			req_param_add_eth[type];
+		opt_param |= qp_state_table[cur_state][next_state].
+			opt_param_add_eth[type];
+	}
+
+	if ((mask & req_param) != req_param)
+		return 0;
+
+	if (mask & ~(req_param | opt_param | IB_QP_STATE))
+		return 0;
+
+	return 1;
+}
+EXPORT_SYMBOL(ib_modify_qp_is_ok);
+
+int ib_resolve_eth_l2_attrs(struct ib_qp *qp,
+			    struct ib_qp_attr *qp_attr, int *qp_attr_mask)
+{
+	int           ret = 0;
+	union ib_gid  sgid;
+
+	if ((*qp_attr_mask & IB_QP_AV)  &&
+	    (rdma_port_get_link_layer(qp->device, qp_attr->ah_attr.port_num) == IB_LINK_LAYER_ETHERNET)) {
+		ret = ib_query_gid(qp->device, qp_attr->ah_attr.port_num,
+				   qp_attr->ah_attr.grh.sgid_index, &sgid);
+		if (ret)
+			goto out;
+		if (rdma_link_local_addr((struct in6_addr *)qp_attr->ah_attr.grh.dgid.raw)) {
+			rdma_get_ll_mac((struct in6_addr *)qp_attr->ah_attr.grh.dgid.raw, qp_attr->ah_attr.dmac);
+			rdma_get_ll_mac((struct in6_addr *)sgid.raw, qp_attr->smac);
+			qp_attr->vlan_id = rdma_get_vlan_id(&sgid);
+		} else {
+			ret = rdma_addr_find_dmac_by_grh(&sgid, &qp_attr->ah_attr.grh.dgid,
+					qp_attr->ah_attr.dmac, &qp_attr->vlan_id);
+			if (ret)
+				goto out;
+			ret = rdma_addr_find_smac_by_sgid(&sgid, qp_attr->smac, NULL);
+			if (ret)
+				goto out;
+		}
+		*qp_attr_mask |= IB_QP_SMAC;
+		if (qp_attr->vlan_id < 0xFFFF)
+			*qp_attr_mask |= IB_QP_VID;
+	}
+out:
+	return ret;
+}
+EXPORT_SYMBOL(ib_resolve_eth_l2_attrs);
+
+
+int ib_modify_qp(struct ib_qp *qp,
+		 struct ib_qp_attr *qp_attr,
+		 int qp_attr_mask)
+{
+	int ret;
+
+	ret = ib_resolve_eth_l2_attrs(qp, qp_attr, &qp_attr_mask);
+	if (ret)
+		return ret;
+
+	return qp->device->modify_qp(qp->real_qp, qp_attr, qp_attr_mask, NULL);
+}
+EXPORT_SYMBOL(ib_modify_qp);
+
+int ib_query_qp(struct ib_qp *qp,
+		struct ib_qp_attr *qp_attr,
+		int qp_attr_mask,
+		struct ib_qp_init_attr *qp_init_attr)
+{
+	return qp->device->query_qp ?
+		qp->device->query_qp(qp->real_qp, qp_attr, qp_attr_mask, qp_init_attr) :
+		-ENOSYS;
+}
+EXPORT_SYMBOL(ib_query_qp);
+
+int ib_close_qp(struct ib_qp *qp)
+{
+	struct ib_qp *real_qp;
+	unsigned long flags;
+
+	real_qp = qp->real_qp;
+	if (real_qp == qp)
+		return -EINVAL;
+
+	spin_lock_irqsave(&real_qp->device->event_handler_lock, flags);
+	list_del(&qp->open_list);
+	spin_unlock_irqrestore(&real_qp->device->event_handler_lock, flags);
+
+	atomic_dec(&real_qp->usecnt);
+	kfree(qp);
+
+	return 0;
+}
+EXPORT_SYMBOL(ib_close_qp);
+
+static int __ib_destroy_shared_qp(struct ib_qp *qp)
+{
+	struct ib_xrcd *xrcd;
+	struct ib_qp *real_qp;
+	int ret;
+
+	real_qp = qp->real_qp;
+	xrcd = real_qp->xrcd;
+
+	mutex_lock(&xrcd->tgt_qp_mutex);
+	ib_close_qp(qp);
+	if (atomic_read(&real_qp->usecnt) == 0)
+		list_del(&real_qp->xrcd_list);
+	else
+		real_qp = NULL;
+	mutex_unlock(&xrcd->tgt_qp_mutex);
+
+	if (real_qp) {
+		ret = ib_destroy_qp(real_qp);
+		if (!ret)
+			atomic_dec(&xrcd->usecnt);
+		else
+			__ib_insert_xrcd_qp(xrcd, real_qp);
+	}
+
+	return 0;
+}
+
+int ib_destroy_qp(struct ib_qp *qp)
+{
+	struct ib_pd *pd;
+	struct ib_cq *scq, *rcq;
+	struct ib_srq *srq;
+	int ret;
+
+	if (atomic_read(&qp->usecnt))
+		return -EBUSY;
+
+	if (qp->real_qp != qp)
+		return __ib_destroy_shared_qp(qp);
+
+	pd   = qp->pd;
+	scq  = qp->send_cq;
+	rcq  = qp->recv_cq;
+	srq  = qp->srq;
+
+	ret = qp->device->destroy_qp(qp);
+	if (!ret) {
+		if (pd)
+			atomic_dec(&pd->usecnt);
+		if (scq)
+			atomic_dec(&scq->usecnt);
+		if (rcq)
+			atomic_dec(&rcq->usecnt);
+		if (srq)
+			atomic_dec(&srq->usecnt);
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(ib_destroy_qp);
+
+/* Completion queues */
+
+struct ib_cq *ib_create_cq(struct ib_device *device,
+			   ib_comp_handler comp_handler,
+			   void (*event_handler)(struct ib_event *, void *),
+			   void *cq_context, int cqe, int comp_vector)
+{
+	struct ib_cq *cq;
+
+	cq = device->create_cq(device, cqe, comp_vector, NULL, NULL);
+
+	if (!IS_ERR(cq)) {
+		cq->device        = device;
+		cq->uobject       = NULL;
+		cq->comp_handler  = comp_handler;
+		cq->event_handler = event_handler;
+		cq->cq_context    = cq_context;
+		atomic_set(&cq->usecnt, 0);
+	}
+
+	return cq;
+}
+EXPORT_SYMBOL(ib_create_cq);
+
+int ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period)
+{
+	return cq->device->modify_cq ?
+		cq->device->modify_cq(cq, cq_count, cq_period) : -ENOSYS;
+}
+EXPORT_SYMBOL(ib_modify_cq);
+
+int ib_destroy_cq(struct ib_cq *cq)
+{
+	if (atomic_read(&cq->usecnt))
+		return -EBUSY;
+
+	return cq->device->destroy_cq(cq);
+}
+EXPORT_SYMBOL(ib_destroy_cq);
+
+int ib_resize_cq(struct ib_cq *cq, int cqe)
+{
+	return cq->device->resize_cq ?
+		cq->device->resize_cq(cq, cqe, NULL) : -ENOSYS;
+}
+EXPORT_SYMBOL(ib_resize_cq);
+
+/* Memory regions */
+
+struct ib_mr *ib_get_dma_mr(struct ib_pd *pd, int mr_access_flags)
+{
+	struct ib_mr *mr;
+	int err;
+
+	err = ib_check_mr_access(mr_access_flags);
+	if (err)
+		return ERR_PTR(err);
+
+	mr = pd->device->get_dma_mr(pd, mr_access_flags);
+
+	if (!IS_ERR(mr)) {
+		mr->device  = pd->device;
+		mr->pd      = pd;
+		mr->uobject = NULL;
+		atomic_inc(&pd->usecnt);
+		atomic_set(&mr->usecnt, 0);
+	}
+
+	return mr;
+}
+EXPORT_SYMBOL(ib_get_dma_mr);
+
+struct ib_mr *ib_reg_phys_mr(struct ib_pd *pd,
+			     struct ib_phys_buf *phys_buf_array,
+			     int num_phys_buf,
+			     int mr_access_flags,
+			     u64 *iova_start)
+{
+	struct ib_mr *mr;
+	int err;
+
+	err = ib_check_mr_access(mr_access_flags);
+	if (err)
+		return ERR_PTR(err);
+
+	if (!pd->device->reg_phys_mr)
+		return ERR_PTR(-ENOSYS);
+
+	mr = pd->device->reg_phys_mr(pd, phys_buf_array, num_phys_buf,
+				     mr_access_flags, iova_start);
+
+	if (!IS_ERR(mr)) {
+		mr->device  = pd->device;
+		mr->pd      = pd;
+		mr->uobject = NULL;
+		atomic_inc(&pd->usecnt);
+		atomic_set(&mr->usecnt, 0);
+	}
+
+	return mr;
+}
+EXPORT_SYMBOL(ib_reg_phys_mr);
+
+int ib_rereg_phys_mr(struct ib_mr *mr,
+		     int mr_rereg_mask,
+		     struct ib_pd *pd,
+		     struct ib_phys_buf *phys_buf_array,
+		     int num_phys_buf,
+		     int mr_access_flags,
+		     u64 *iova_start)
+{
+	struct ib_pd *old_pd;
+	int ret;
+
+	ret = ib_check_mr_access(mr_access_flags);
+	if (ret)
+		return ret;
+
+	if (!mr->device->rereg_phys_mr)
+		return -ENOSYS;
+
+	if (atomic_read(&mr->usecnt))
+		return -EBUSY;
+
+	old_pd = mr->pd;
+
+	ret = mr->device->rereg_phys_mr(mr, mr_rereg_mask, pd,
+					phys_buf_array, num_phys_buf,
+					mr_access_flags, iova_start);
+
+	if (!ret && (mr_rereg_mask & IB_MR_REREG_PD)) {
+		atomic_dec(&old_pd->usecnt);
+		atomic_inc(&pd->usecnt);
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(ib_rereg_phys_mr);
+
+int ib_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr)
+{
+	return mr->device->query_mr ?
+		mr->device->query_mr(mr, mr_attr) : -ENOSYS;
+}
+EXPORT_SYMBOL(ib_query_mr);
+
+int ib_dereg_mr(struct ib_mr *mr)
+{
+	struct ib_pd *pd;
+	int ret;
+
+	if (atomic_read(&mr->usecnt))
+		return -EBUSY;
+
+	pd = mr->pd;
+	ret = mr->device->dereg_mr(mr);
+	if (!ret)
+		atomic_dec(&pd->usecnt);
+
+	return ret;
+}
+EXPORT_SYMBOL(ib_dereg_mr);
+
+struct ib_mr *ib_create_mr(struct ib_pd *pd,
+			   struct ib_mr_init_attr *mr_init_attr)
+{
+	struct ib_mr *mr;
+
+	if (!pd->device->create_mr)
+		return ERR_PTR(-ENOSYS);
+
+	mr = pd->device->create_mr(pd, mr_init_attr);
+
+	if (!IS_ERR(mr)) {
+		mr->device  = pd->device;
+		mr->pd      = pd;
+		mr->uobject = NULL;
+		atomic_inc(&pd->usecnt);
+		atomic_set(&mr->usecnt, 0);
+	}
+
+	return mr;
+}
+EXPORT_SYMBOL(ib_create_mr);
+
+int ib_destroy_mr(struct ib_mr *mr)
+{
+	struct ib_pd *pd;
+	int ret;
+
+	if (atomic_read(&mr->usecnt))
+		return -EBUSY;
+
+	pd = mr->pd;
+	ret = mr->device->destroy_mr(mr);
+	if (!ret)
+		atomic_dec(&pd->usecnt);
+
+	return ret;
+}
+EXPORT_SYMBOL(ib_destroy_mr);
+
+struct ib_mr *ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len)
+{
+	struct ib_mr *mr;
+
+	if (!pd->device->alloc_fast_reg_mr)
+		return ERR_PTR(-ENOSYS);
+
+	mr = pd->device->alloc_fast_reg_mr(pd, max_page_list_len);
+
+	if (!IS_ERR(mr)) {
+		mr->device  = pd->device;
+		mr->pd      = pd;
+		mr->uobject = NULL;
+		atomic_inc(&pd->usecnt);
+		atomic_set(&mr->usecnt, 0);
+	}
+
+	return mr;
+}
+EXPORT_SYMBOL(ib_alloc_fast_reg_mr);
+
+struct ib_fast_reg_page_list *ib_alloc_fast_reg_page_list(struct ib_device *device,
+							  int max_page_list_len)
+{
+	struct ib_fast_reg_page_list *page_list;
+
+	if (!device->alloc_fast_reg_page_list)
+		return ERR_PTR(-ENOSYS);
+
+	page_list = device->alloc_fast_reg_page_list(device, max_page_list_len);
+
+	if (!IS_ERR(page_list)) {
+		page_list->device = device;
+		page_list->max_page_list_len = max_page_list_len;
+	}
+
+	return page_list;
+}
+EXPORT_SYMBOL(ib_alloc_fast_reg_page_list);
+
+void ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list)
+{
+	page_list->device->free_fast_reg_page_list(page_list);
+}
+EXPORT_SYMBOL(ib_free_fast_reg_page_list);
+
+/* Memory windows */
+
+struct ib_mw *ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type)
+{
+	struct ib_mw *mw;
+
+	if (!pd->device->alloc_mw)
+		return ERR_PTR(-ENOSYS);
+
+	mw = pd->device->alloc_mw(pd, type);
+	if (!IS_ERR(mw)) {
+		mw->device  = pd->device;
+		mw->pd      = pd;
+		mw->uobject = NULL;
+		mw->type    = type;
+		atomic_inc(&pd->usecnt);
+	}
+
+	return mw;
+}
+EXPORT_SYMBOL(ib_alloc_mw);
+
+int ib_dealloc_mw(struct ib_mw *mw)
+{
+	struct ib_pd *pd;
+	int ret;
+
+	pd = mw->pd;
+	ret = mw->device->dealloc_mw(mw);
+	if (!ret)
+		atomic_dec(&pd->usecnt);
+
+	return ret;
+}
+EXPORT_SYMBOL(ib_dealloc_mw);
+
+/* "Fast" memory regions */
+
+struct ib_fmr *ib_alloc_fmr(struct ib_pd *pd,
+			    int mr_access_flags,
+			    struct ib_fmr_attr *fmr_attr)
+{
+	struct ib_fmr *fmr;
+
+	if (!pd->device->alloc_fmr)
+		return ERR_PTR(-ENOSYS);
+
+	fmr = pd->device->alloc_fmr(pd, mr_access_flags, fmr_attr);
+	if (!IS_ERR(fmr)) {
+		fmr->device = pd->device;
+		fmr->pd     = pd;
+		atomic_inc(&pd->usecnt);
+	}
+
+	return fmr;
+}
+EXPORT_SYMBOL(ib_alloc_fmr);
+
+int ib_unmap_fmr(struct list_head *fmr_list)
+{
+	struct ib_fmr *fmr;
+
+	if (list_empty(fmr_list))
+		return 0;
+
+	fmr = list_entry(fmr_list->next, struct ib_fmr, list);
+	return fmr->device->unmap_fmr(fmr_list);
+}
+EXPORT_SYMBOL(ib_unmap_fmr);
+
+int ib_dealloc_fmr(struct ib_fmr *fmr)
+{
+	struct ib_pd *pd;
+	int ret;
+
+	pd = fmr->pd;
+	ret = fmr->device->dealloc_fmr(fmr);
+	if (!ret)
+		atomic_dec(&pd->usecnt);
+
+	return ret;
+}
+EXPORT_SYMBOL(ib_dealloc_fmr);
+
+/* Multicast groups */
+
+int ib_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid)
+{
+	int ret;
+
+	if (!qp->device->attach_mcast)
+		return -ENOSYS;
+	if (gid->raw[0] != 0xff || qp->qp_type != IB_QPT_UD)
+		return -EINVAL;
+
+	ret = qp->device->attach_mcast(qp, gid, lid);
+	if (!ret)
+		atomic_inc(&qp->usecnt);
+	return ret;
+}
+EXPORT_SYMBOL(ib_attach_mcast);
+
+int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid)
+{
+	int ret;
+
+	if (!qp->device->detach_mcast)
+		return -ENOSYS;
+	if (gid->raw[0] != 0xff || qp->qp_type != IB_QPT_UD)
+		return -EINVAL;
+
+	ret = qp->device->detach_mcast(qp, gid, lid);
+	if (!ret)
+		atomic_dec(&qp->usecnt);
+	return ret;
+}
+EXPORT_SYMBOL(ib_detach_mcast);
+
+struct ib_xrcd *ib_alloc_xrcd(struct ib_device *device)
+{
+	struct ib_xrcd *xrcd;
+
+	if (!device->alloc_xrcd)
+		return ERR_PTR(-ENOSYS);
+
+	xrcd = device->alloc_xrcd(device, NULL, NULL);
+	if (!IS_ERR(xrcd)) {
+		xrcd->device = device;
+		xrcd->inode = NULL;
+		atomic_set(&xrcd->usecnt, 0);
+		mutex_init(&xrcd->tgt_qp_mutex);
+		INIT_LIST_HEAD(&xrcd->tgt_qp_list);
+	}
+
+	return xrcd;
+}
+EXPORT_SYMBOL(ib_alloc_xrcd);
+
+int ib_dealloc_xrcd(struct ib_xrcd *xrcd)
+{
+	struct ib_qp *qp;
+	int ret;
+
+	if (atomic_read(&xrcd->usecnt))
+		return -EBUSY;
+
+	while (!list_empty(&xrcd->tgt_qp_list)) {
+		qp = list_entry(xrcd->tgt_qp_list.next, struct ib_qp, xrcd_list);
+		ret = ib_destroy_qp(qp);
+		if (ret)
+			return ret;
+	}
+
+	return xrcd->device->dealloc_xrcd(xrcd);
+}
+EXPORT_SYMBOL(ib_dealloc_xrcd);
+
+struct ib_flow *ib_create_flow(struct ib_qp *qp,
+			       struct ib_flow_attr *flow_attr,
+			       int domain)
+{
+	struct ib_flow *flow_id;
+	if (!qp->device->create_flow)
+		return ERR_PTR(-ENOSYS);
+
+	flow_id = qp->device->create_flow(qp, flow_attr, domain);
+	if (!IS_ERR(flow_id))
+		atomic_inc(&qp->usecnt);
+	return flow_id;
+}
+EXPORT_SYMBOL(ib_create_flow);
+
+int ib_destroy_flow(struct ib_flow *flow_id)
+{
+	int err;
+	struct ib_qp *qp = flow_id->qp;
+
+	err = qp->device->destroy_flow(flow_id);
+	if (!err)
+		atomic_dec(&qp->usecnt);
+	return err;
+}
+EXPORT_SYMBOL(ib_destroy_flow);
+
+int ib_check_mr_status(struct ib_mr *mr, u32 check_mask,
+		       struct ib_mr_status *mr_status)
+{
+	return mr->device->check_mr_status ?
+		mr->device->check_mr_status(mr, check_mask, mr_status) : -ENOSYS;
+}
+EXPORT_SYMBOL(ib_check_mr_status);
diff --git a/drivers/infiniband/hw/Makefile b/drivers/infiniband/hw/Makefile
new file mode 100644
index 0000000..e900b03
--- /dev/null
+++ b/drivers/infiniband/hw/Makefile
@@ -0,0 +1,12 @@
+obj-$(CONFIG_INFINIBAND_MTHCA)		+= mthca/
+obj-$(CONFIG_INFINIBAND_IPATH)		+= ipath/
+obj-$(CONFIG_INFINIBAND_QIB)		+= qib/
+obj-$(CONFIG_INFINIBAND_EHCA)		+= ehca/
+obj-$(CONFIG_INFINIBAND_AMSO1100)	+= amso1100/
+obj-$(CONFIG_INFINIBAND_CXGB3)		+= cxgb3/
+obj-$(CONFIG_INFINIBAND_CXGB4)		+= cxgb4/
+obj-$(CONFIG_MLX4_INFINIBAND)		+= mlx4/
+obj-$(CONFIG_MLX5_INFINIBAND)		+= mlx5/
+obj-$(CONFIG_INFINIBAND_NES)		+= nes/
+obj-$(CONFIG_INFINIBAND_OCRDMA)		+= ocrdma/
+obj-$(CONFIG_INFINIBAND_USNIC)		+= usnic/
diff --git a/drivers/infiniband/hw/amso1100/Kbuild b/drivers/infiniband/hw/amso1100/Kbuild
new file mode 100644
index 0000000..950dfab
--- /dev/null
+++ b/drivers/infiniband/hw/amso1100/Kbuild
@@ -0,0 +1,6 @@
+ccflags-$(CONFIG_INFINIBAND_AMSO1100_DEBUG) := -DDEBUG
+
+obj-$(CONFIG_INFINIBAND_AMSO1100) += iw_c2.o
+
+iw_c2-y := c2.o c2_provider.o c2_rnic.o c2_alloc.o c2_mq.o c2_ae.o c2_vq.o \
+	c2_intr.o c2_cq.o c2_qp.o c2_cm.o c2_mm.o c2_pd.o
diff --git a/drivers/infiniband/hw/amso1100/Kconfig b/drivers/infiniband/hw/amso1100/Kconfig
new file mode 100644
index 0000000..e6ce5f2
--- /dev/null
+++ b/drivers/infiniband/hw/amso1100/Kconfig
@@ -0,0 +1,15 @@
+config INFINIBAND_AMSO1100
+	tristate "Ammasso 1100 HCA support"
+	depends on PCI && INET
+	---help---
+	  This is a low-level driver for the Ammasso 1100 host
+	  channel adapter (HCA).
+
+config INFINIBAND_AMSO1100_DEBUG
+	bool "Verbose debugging output"
+	depends on INFINIBAND_AMSO1100
+	default n
+	---help---
+	  This option causes the amso1100 driver to produce a bunch of
+	  debug messages.  Select this if you are developing the driver
+	  or trying to diagnose a problem.
diff --git a/drivers/infiniband/hw/amso1100/c2.c b/drivers/infiniband/hw/amso1100/c2.c
new file mode 100644
index 0000000..766a71c
--- /dev/null
+++ b/drivers/infiniband/hw/amso1100/c2.c
@@ -0,0 +1,1241 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/pci.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/interrupt.h>
+#include <linux/delay.h>
+#include <linux/ethtool.h>
+#include <linux/mii.h>
+#include <linux/if_vlan.h>
+#include <linux/crc32.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/init.h>
+#include <linux/dma-mapping.h>
+#include <linux/slab.h>
+#include <linux/prefetch.h>
+
+#include <asm/io.h>
+#include <asm/irq.h>
+#include <asm/byteorder.h>
+
+#include <rdma/ib_smi.h>
+#include "c2.h"
+#include "c2_provider.h"
+
+MODULE_AUTHOR("Tom Tucker <tom@opengridcomputing.com>");
+MODULE_DESCRIPTION("Ammasso AMSO1100 Low-level iWARP Driver");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_VERSION(DRV_VERSION);
+
+static const u32 default_msg = NETIF_MSG_DRV | NETIF_MSG_PROBE | NETIF_MSG_LINK
+    | NETIF_MSG_IFUP | NETIF_MSG_IFDOWN;
+
+static int debug = -1;		/* defaults above */
+module_param(debug, int, 0);
+MODULE_PARM_DESC(debug, "Debug level (0=none,...,16=all)");
+
+static int c2_up(struct net_device *netdev);
+static int c2_down(struct net_device *netdev);
+static int c2_xmit_frame(struct sk_buff *skb, struct net_device *netdev);
+static void c2_tx_interrupt(struct net_device *netdev);
+static void c2_rx_interrupt(struct net_device *netdev);
+static irqreturn_t c2_interrupt(int irq, void *dev_id);
+static void c2_tx_timeout(struct net_device *netdev);
+static int c2_change_mtu(struct net_device *netdev, int new_mtu);
+static void c2_reset(struct c2_port *c2_port);
+
+static struct pci_device_id c2_pci_table[] = {
+	{ PCI_DEVICE(0x18b8, 0xb001) },
+	{ 0 }
+};
+
+MODULE_DEVICE_TABLE(pci, c2_pci_table);
+
+static void c2_print_macaddr(struct net_device *netdev)
+{
+	pr_debug("%s: MAC %pM, IRQ %u\n", netdev->name, netdev->dev_addr, netdev->irq);
+}
+
+static void c2_set_rxbufsize(struct c2_port *c2_port)
+{
+	struct net_device *netdev = c2_port->netdev;
+
+	if (netdev->mtu > RX_BUF_SIZE)
+		c2_port->rx_buf_size =
+		    netdev->mtu + ETH_HLEN + sizeof(struct c2_rxp_hdr) +
+		    NET_IP_ALIGN;
+	else
+		c2_port->rx_buf_size = sizeof(struct c2_rxp_hdr) + RX_BUF_SIZE;
+}
+
+/*
+ * Allocate TX ring elements and chain them together.
+ * One-to-one association of adapter descriptors with ring elements.
+ */
+static int c2_tx_ring_alloc(struct c2_ring *tx_ring, void *vaddr,
+			    dma_addr_t base, void __iomem * mmio_txp_ring)
+{
+	struct c2_tx_desc *tx_desc;
+	struct c2_txp_desc __iomem *txp_desc;
+	struct c2_element *elem;
+	int i;
+
+	tx_ring->start = kmalloc(sizeof(*elem) * tx_ring->count, GFP_KERNEL);
+	if (!tx_ring->start)
+		return -ENOMEM;
+
+	elem = tx_ring->start;
+	tx_desc = vaddr;
+	txp_desc = mmio_txp_ring;
+	for (i = 0; i < tx_ring->count; i++, elem++, tx_desc++, txp_desc++) {
+		tx_desc->len = 0;
+		tx_desc->status = 0;
+
+		/* Set TXP_HTXD_UNINIT */
+		__raw_writeq((__force u64) cpu_to_be64(0x1122334455667788ULL),
+			     (void __iomem *) txp_desc + C2_TXP_ADDR);
+		__raw_writew(0, (void __iomem *) txp_desc + C2_TXP_LEN);
+		__raw_writew((__force u16) cpu_to_be16(TXP_HTXD_UNINIT),
+			     (void __iomem *) txp_desc + C2_TXP_FLAGS);
+
+		elem->skb = NULL;
+		elem->ht_desc = tx_desc;
+		elem->hw_desc = txp_desc;
+
+		if (i == tx_ring->count - 1) {
+			elem->next = tx_ring->start;
+			tx_desc->next_offset = base;
+		} else {
+			elem->next = elem + 1;
+			tx_desc->next_offset =
+			    base + (i + 1) * sizeof(*tx_desc);
+		}
+	}
+
+	tx_ring->to_use = tx_ring->to_clean = tx_ring->start;
+
+	return 0;
+}
+
+/*
+ * Allocate RX ring elements and chain them together.
+ * One-to-one association of adapter descriptors with ring elements.
+ */
+static int c2_rx_ring_alloc(struct c2_ring *rx_ring, void *vaddr,
+			    dma_addr_t base, void __iomem * mmio_rxp_ring)
+{
+	struct c2_rx_desc *rx_desc;
+	struct c2_rxp_desc __iomem *rxp_desc;
+	struct c2_element *elem;
+	int i;
+
+	rx_ring->start = kmalloc(sizeof(*elem) * rx_ring->count, GFP_KERNEL);
+	if (!rx_ring->start)
+		return -ENOMEM;
+
+	elem = rx_ring->start;
+	rx_desc = vaddr;
+	rxp_desc = mmio_rxp_ring;
+	for (i = 0; i < rx_ring->count; i++, elem++, rx_desc++, rxp_desc++) {
+		rx_desc->len = 0;
+		rx_desc->status = 0;
+
+		/* Set RXP_HRXD_UNINIT */
+		__raw_writew((__force u16) cpu_to_be16(RXP_HRXD_OK),
+		       (void __iomem *) rxp_desc + C2_RXP_STATUS);
+		__raw_writew(0, (void __iomem *) rxp_desc + C2_RXP_COUNT);
+		__raw_writew(0, (void __iomem *) rxp_desc + C2_RXP_LEN);
+		__raw_writeq((__force u64) cpu_to_be64(0x99aabbccddeeffULL),
+			     (void __iomem *) rxp_desc + C2_RXP_ADDR);
+		__raw_writew((__force u16) cpu_to_be16(RXP_HRXD_UNINIT),
+			     (void __iomem *) rxp_desc + C2_RXP_FLAGS);
+
+		elem->skb = NULL;
+		elem->ht_desc = rx_desc;
+		elem->hw_desc = rxp_desc;
+
+		if (i == rx_ring->count - 1) {
+			elem->next = rx_ring->start;
+			rx_desc->next_offset = base;
+		} else {
+			elem->next = elem + 1;
+			rx_desc->next_offset =
+			    base + (i + 1) * sizeof(*rx_desc);
+		}
+	}
+
+	rx_ring->to_use = rx_ring->to_clean = rx_ring->start;
+
+	return 0;
+}
+
+/* Setup buffer for receiving */
+static inline int c2_rx_alloc(struct c2_port *c2_port, struct c2_element *elem)
+{
+	struct c2_dev *c2dev = c2_port->c2dev;
+	struct c2_rx_desc *rx_desc = elem->ht_desc;
+	struct sk_buff *skb;
+	dma_addr_t mapaddr;
+	u32 maplen;
+	struct c2_rxp_hdr *rxp_hdr;
+
+	skb = dev_alloc_skb(c2_port->rx_buf_size);
+	if (unlikely(!skb)) {
+		pr_debug("%s: out of memory for receive\n",
+			c2_port->netdev->name);
+		return -ENOMEM;
+	}
+
+	/* Zero out the rxp hdr in the sk_buff */
+	memset(skb->data, 0, sizeof(*rxp_hdr));
+
+	skb->dev = c2_port->netdev;
+
+	maplen = c2_port->rx_buf_size;
+	mapaddr =
+	    pci_map_single(c2dev->pcidev, skb->data, maplen,
+			   PCI_DMA_FROMDEVICE);
+
+	/* Set the sk_buff RXP_header to RXP_HRXD_READY */
+	rxp_hdr = (struct c2_rxp_hdr *) skb->data;
+	rxp_hdr->flags = RXP_HRXD_READY;
+
+	__raw_writew(0, elem->hw_desc + C2_RXP_STATUS);
+	__raw_writew((__force u16) cpu_to_be16((u16) maplen - sizeof(*rxp_hdr)),
+		     elem->hw_desc + C2_RXP_LEN);
+	__raw_writeq((__force u64) cpu_to_be64(mapaddr), elem->hw_desc + C2_RXP_ADDR);
+	__raw_writew((__force u16) cpu_to_be16(RXP_HRXD_READY),
+		     elem->hw_desc + C2_RXP_FLAGS);
+
+	elem->skb = skb;
+	elem->mapaddr = mapaddr;
+	elem->maplen = maplen;
+	rx_desc->len = maplen;
+
+	return 0;
+}
+
+/*
+ * Allocate buffers for the Rx ring
+ * For receive:  rx_ring.to_clean is next received frame
+ */
+static int c2_rx_fill(struct c2_port *c2_port)
+{
+	struct c2_ring *rx_ring = &c2_port->rx_ring;
+	struct c2_element *elem;
+	int ret = 0;
+
+	elem = rx_ring->start;
+	do {
+		if (c2_rx_alloc(c2_port, elem)) {
+			ret = 1;
+			break;
+		}
+	} while ((elem = elem->next) != rx_ring->start);
+
+	rx_ring->to_clean = rx_ring->start;
+	return ret;
+}
+
+/* Free all buffers in RX ring, assumes receiver stopped */
+static void c2_rx_clean(struct c2_port *c2_port)
+{
+	struct c2_dev *c2dev = c2_port->c2dev;
+	struct c2_ring *rx_ring = &c2_port->rx_ring;
+	struct c2_element *elem;
+	struct c2_rx_desc *rx_desc;
+
+	elem = rx_ring->start;
+	do {
+		rx_desc = elem->ht_desc;
+		rx_desc->len = 0;
+
+		__raw_writew(0, elem->hw_desc + C2_RXP_STATUS);
+		__raw_writew(0, elem->hw_desc + C2_RXP_COUNT);
+		__raw_writew(0, elem->hw_desc + C2_RXP_LEN);
+		__raw_writeq((__force u64) cpu_to_be64(0x99aabbccddeeffULL),
+			     elem->hw_desc + C2_RXP_ADDR);
+		__raw_writew((__force u16) cpu_to_be16(RXP_HRXD_UNINIT),
+			     elem->hw_desc + C2_RXP_FLAGS);
+
+		if (elem->skb) {
+			pci_unmap_single(c2dev->pcidev, elem->mapaddr,
+					 elem->maplen, PCI_DMA_FROMDEVICE);
+			dev_kfree_skb(elem->skb);
+			elem->skb = NULL;
+		}
+	} while ((elem = elem->next) != rx_ring->start);
+}
+
+static inline int c2_tx_free(struct c2_dev *c2dev, struct c2_element *elem)
+{
+	struct c2_tx_desc *tx_desc = elem->ht_desc;
+
+	tx_desc->len = 0;
+
+	pci_unmap_single(c2dev->pcidev, elem->mapaddr, elem->maplen,
+			 PCI_DMA_TODEVICE);
+
+	if (elem->skb) {
+		dev_kfree_skb_any(elem->skb);
+		elem->skb = NULL;
+	}
+
+	return 0;
+}
+
+/* Free all buffers in TX ring, assumes transmitter stopped */
+static void c2_tx_clean(struct c2_port *c2_port)
+{
+	struct c2_ring *tx_ring = &c2_port->tx_ring;
+	struct c2_element *elem;
+	struct c2_txp_desc txp_htxd;
+	int retry;
+	unsigned long flags;
+
+	spin_lock_irqsave(&c2_port->tx_lock, flags);
+
+	elem = tx_ring->start;
+
+	do {
+		retry = 0;
+		do {
+			txp_htxd.flags =
+			    readw(elem->hw_desc + C2_TXP_FLAGS);
+
+			if (txp_htxd.flags == TXP_HTXD_READY) {
+				retry = 1;
+				__raw_writew(0,
+					     elem->hw_desc + C2_TXP_LEN);
+				__raw_writeq(0,
+					     elem->hw_desc + C2_TXP_ADDR);
+				__raw_writew((__force u16) cpu_to_be16(TXP_HTXD_DONE),
+					     elem->hw_desc + C2_TXP_FLAGS);
+				c2_port->netdev->stats.tx_dropped++;
+				break;
+			} else {
+				__raw_writew(0,
+					     elem->hw_desc + C2_TXP_LEN);
+				__raw_writeq((__force u64) cpu_to_be64(0x1122334455667788ULL),
+					     elem->hw_desc + C2_TXP_ADDR);
+				__raw_writew((__force u16) cpu_to_be16(TXP_HTXD_UNINIT),
+					     elem->hw_desc + C2_TXP_FLAGS);
+			}
+
+			c2_tx_free(c2_port->c2dev, elem);
+
+		} while ((elem = elem->next) != tx_ring->start);
+	} while (retry);
+
+	c2_port->tx_avail = c2_port->tx_ring.count - 1;
+	c2_port->c2dev->cur_tx = tx_ring->to_use - tx_ring->start;
+
+	if (c2_port->tx_avail > MAX_SKB_FRAGS + 1)
+		netif_wake_queue(c2_port->netdev);
+
+	spin_unlock_irqrestore(&c2_port->tx_lock, flags);
+}
+
+/*
+ * Process transmit descriptors marked 'DONE' by the firmware,
+ * freeing up their unneeded sk_buffs.
+ */
+static void c2_tx_interrupt(struct net_device *netdev)
+{
+	struct c2_port *c2_port = netdev_priv(netdev);
+	struct c2_dev *c2dev = c2_port->c2dev;
+	struct c2_ring *tx_ring = &c2_port->tx_ring;
+	struct c2_element *elem;
+	struct c2_txp_desc txp_htxd;
+
+	spin_lock(&c2_port->tx_lock);
+
+	for (elem = tx_ring->to_clean; elem != tx_ring->to_use;
+	     elem = elem->next) {
+		txp_htxd.flags =
+		    be16_to_cpu((__force __be16) readw(elem->hw_desc + C2_TXP_FLAGS));
+
+		if (txp_htxd.flags != TXP_HTXD_DONE)
+			break;
+
+		if (netif_msg_tx_done(c2_port)) {
+			/* PCI reads are expensive in fast path */
+			txp_htxd.len =
+			    be16_to_cpu((__force __be16) readw(elem->hw_desc + C2_TXP_LEN));
+			pr_debug("%s: tx done slot %3Zu status 0x%x len "
+				"%5u bytes\n",
+				netdev->name, elem - tx_ring->start,
+				txp_htxd.flags, txp_htxd.len);
+		}
+
+		c2_tx_free(c2dev, elem);
+		++(c2_port->tx_avail);
+	}
+
+	tx_ring->to_clean = elem;
+
+	if (netif_queue_stopped(netdev)
+	    && c2_port->tx_avail > MAX_SKB_FRAGS + 1)
+		netif_wake_queue(netdev);
+
+	spin_unlock(&c2_port->tx_lock);
+}
+
+static void c2_rx_error(struct c2_port *c2_port, struct c2_element *elem)
+{
+	struct c2_rx_desc *rx_desc = elem->ht_desc;
+	struct c2_rxp_hdr *rxp_hdr = (struct c2_rxp_hdr *) elem->skb->data;
+
+	if (rxp_hdr->status != RXP_HRXD_OK ||
+	    rxp_hdr->len > (rx_desc->len - sizeof(*rxp_hdr))) {
+		pr_debug("BAD RXP_HRXD\n");
+		pr_debug("  rx_desc : %p\n", rx_desc);
+		pr_debug("    index : %Zu\n",
+			elem - c2_port->rx_ring.start);
+		pr_debug("    len   : %u\n", rx_desc->len);
+		pr_debug("  rxp_hdr : %p [PA %p]\n", rxp_hdr,
+			(void *) __pa((unsigned long) rxp_hdr));
+		pr_debug("    flags : 0x%x\n", rxp_hdr->flags);
+		pr_debug("    status: 0x%x\n", rxp_hdr->status);
+		pr_debug("    len   : %u\n", rxp_hdr->len);
+		pr_debug("    rsvd  : 0x%x\n", rxp_hdr->rsvd);
+	}
+
+	/* Setup the skb for reuse since we're dropping this pkt */
+	elem->skb->data = elem->skb->head;
+	skb_reset_tail_pointer(elem->skb);
+
+	/* Zero out the rxp hdr in the sk_buff */
+	memset(elem->skb->data, 0, sizeof(*rxp_hdr));
+
+	/* Write the descriptor to the adapter's rx ring */
+	__raw_writew(0, elem->hw_desc + C2_RXP_STATUS);
+	__raw_writew(0, elem->hw_desc + C2_RXP_COUNT);
+	__raw_writew((__force u16) cpu_to_be16((u16) elem->maplen - sizeof(*rxp_hdr)),
+		     elem->hw_desc + C2_RXP_LEN);
+	__raw_writeq((__force u64) cpu_to_be64(elem->mapaddr),
+		     elem->hw_desc + C2_RXP_ADDR);
+	__raw_writew((__force u16) cpu_to_be16(RXP_HRXD_READY),
+		     elem->hw_desc + C2_RXP_FLAGS);
+
+	pr_debug("packet dropped\n");
+	c2_port->netdev->stats.rx_dropped++;
+}
+
+static void c2_rx_interrupt(struct net_device *netdev)
+{
+	struct c2_port *c2_port = netdev_priv(netdev);
+	struct c2_dev *c2dev = c2_port->c2dev;
+	struct c2_ring *rx_ring = &c2_port->rx_ring;
+	struct c2_element *elem;
+	struct c2_rx_desc *rx_desc;
+	struct c2_rxp_hdr *rxp_hdr;
+	struct sk_buff *skb;
+	dma_addr_t mapaddr;
+	u32 maplen, buflen;
+	unsigned long flags;
+
+	spin_lock_irqsave(&c2dev->lock, flags);
+
+	/* Begin where we left off */
+	rx_ring->to_clean = rx_ring->start + c2dev->cur_rx;
+
+	for (elem = rx_ring->to_clean; elem->next != rx_ring->to_clean;
+	     elem = elem->next) {
+		rx_desc = elem->ht_desc;
+		mapaddr = elem->mapaddr;
+		maplen = elem->maplen;
+		skb = elem->skb;
+		rxp_hdr = (struct c2_rxp_hdr *) skb->data;
+
+		if (rxp_hdr->flags != RXP_HRXD_DONE)
+			break;
+		buflen = rxp_hdr->len;
+
+		/* Sanity check the RXP header */
+		if (rxp_hdr->status != RXP_HRXD_OK ||
+		    buflen > (rx_desc->len - sizeof(*rxp_hdr))) {
+			c2_rx_error(c2_port, elem);
+			continue;
+		}
+
+		/*
+		 * Allocate and map a new skb for replenishing the host
+		 * RX desc
+		 */
+		if (c2_rx_alloc(c2_port, elem)) {
+			c2_rx_error(c2_port, elem);
+			continue;
+		}
+
+		/* Unmap the old skb */
+		pci_unmap_single(c2dev->pcidev, mapaddr, maplen,
+				 PCI_DMA_FROMDEVICE);
+
+		prefetch(skb->data);
+
+		/*
+		 * Skip past the leading 8 bytes comprising of the
+		 * "struct c2_rxp_hdr", prepended by the adapter
+		 * to the usual Ethernet header ("struct ethhdr"),
+		 * to the start of the raw Ethernet packet.
+		 *
+		 * Fix up the various fields in the sk_buff before
+		 * passing it up to netif_rx(). The transfer size
+		 * (in bytes) specified by the adapter len field of
+		 * the "struct rxp_hdr_t" does NOT include the
+		 * "sizeof(struct c2_rxp_hdr)".
+		 */
+		skb->data += sizeof(*rxp_hdr);
+		skb_set_tail_pointer(skb, buflen);
+		skb->len = buflen;
+		skb->protocol = eth_type_trans(skb, netdev);
+
+		netif_rx(skb);
+
+		netdev->stats.rx_packets++;
+		netdev->stats.rx_bytes += buflen;
+	}
+
+	/* Save where we left off */
+	rx_ring->to_clean = elem;
+	c2dev->cur_rx = elem - rx_ring->start;
+	C2_SET_CUR_RX(c2dev, c2dev->cur_rx);
+
+	spin_unlock_irqrestore(&c2dev->lock, flags);
+}
+
+/*
+ * Handle netisr0 TX & RX interrupts.
+ */
+static irqreturn_t c2_interrupt(int irq, void *dev_id)
+{
+	unsigned int netisr0, dmaisr;
+	int handled = 0;
+	struct c2_dev *c2dev = (struct c2_dev *) dev_id;
+
+	/* Process CCILNET interrupts */
+	netisr0 = readl(c2dev->regs + C2_NISR0);
+	if (netisr0) {
+
+		/*
+		 * There is an issue with the firmware that always
+		 * provides the status of RX for both TX & RX
+		 * interrupts.  So process both queues here.
+		 */
+		c2_rx_interrupt(c2dev->netdev);
+		c2_tx_interrupt(c2dev->netdev);
+
+		/* Clear the interrupt */
+		writel(netisr0, c2dev->regs + C2_NISR0);
+		handled++;
+	}
+
+	/* Process RNIC interrupts */
+	dmaisr = readl(c2dev->regs + C2_DISR);
+	if (dmaisr) {
+		writel(dmaisr, c2dev->regs + C2_DISR);
+		c2_rnic_interrupt(c2dev);
+		handled++;
+	}
+
+	if (handled) {
+		return IRQ_HANDLED;
+	} else {
+		return IRQ_NONE;
+	}
+}
+
+static int c2_up(struct net_device *netdev)
+{
+	struct c2_port *c2_port = netdev_priv(netdev);
+	struct c2_dev *c2dev = c2_port->c2dev;
+	struct c2_element *elem;
+	struct c2_rxp_hdr *rxp_hdr;
+	struct in_device *in_dev;
+	size_t rx_size, tx_size;
+	int ret, i;
+	unsigned int netimr0;
+
+	if (netif_msg_ifup(c2_port))
+		pr_debug("%s: enabling interface\n", netdev->name);
+
+	/* Set the Rx buffer size based on MTU */
+	c2_set_rxbufsize(c2_port);
+
+	/* Allocate DMA'able memory for Tx/Rx host descriptor rings */
+	rx_size = c2_port->rx_ring.count * sizeof(struct c2_rx_desc);
+	tx_size = c2_port->tx_ring.count * sizeof(struct c2_tx_desc);
+
+	c2_port->mem_size = tx_size + rx_size;
+	c2_port->mem = pci_zalloc_consistent(c2dev->pcidev, c2_port->mem_size,
+					     &c2_port->dma);
+	if (c2_port->mem == NULL) {
+		pr_debug("Unable to allocate memory for "
+			"host descriptor rings\n");
+		return -ENOMEM;
+	}
+
+	/* Create the Rx host descriptor ring */
+	if ((ret =
+	     c2_rx_ring_alloc(&c2_port->rx_ring, c2_port->mem, c2_port->dma,
+			      c2dev->mmio_rxp_ring))) {
+		pr_debug("Unable to create RX ring\n");
+		goto bail0;
+	}
+
+	/* Allocate Rx buffers for the host descriptor ring */
+	if (c2_rx_fill(c2_port)) {
+		pr_debug("Unable to fill RX ring\n");
+		goto bail1;
+	}
+
+	/* Create the Tx host descriptor ring */
+	if ((ret = c2_tx_ring_alloc(&c2_port->tx_ring, c2_port->mem + rx_size,
+				    c2_port->dma + rx_size,
+				    c2dev->mmio_txp_ring))) {
+		pr_debug("Unable to create TX ring\n");
+		goto bail1;
+	}
+
+	/* Set the TX pointer to where we left off */
+	c2_port->tx_avail = c2_port->tx_ring.count - 1;
+	c2_port->tx_ring.to_use = c2_port->tx_ring.to_clean =
+	    c2_port->tx_ring.start + c2dev->cur_tx;
+
+	/* missing: Initialize MAC */
+
+	BUG_ON(c2_port->tx_ring.to_use != c2_port->tx_ring.to_clean);
+
+	/* Reset the adapter, ensures the driver is in sync with the RXP */
+	c2_reset(c2_port);
+
+	/* Reset the READY bit in the sk_buff RXP headers & adapter HRXDQ */
+	for (i = 0, elem = c2_port->rx_ring.start; i < c2_port->rx_ring.count;
+	     i++, elem++) {
+		rxp_hdr = (struct c2_rxp_hdr *) elem->skb->data;
+		rxp_hdr->flags = 0;
+		__raw_writew((__force u16) cpu_to_be16(RXP_HRXD_READY),
+			     elem->hw_desc + C2_RXP_FLAGS);
+	}
+
+	/* Enable network packets */
+	netif_start_queue(netdev);
+
+	/* Enable IRQ */
+	writel(0, c2dev->regs + C2_IDIS);
+	netimr0 = readl(c2dev->regs + C2_NIMR0);
+	netimr0 &= ~(C2_PCI_HTX_INT | C2_PCI_HRX_INT);
+	writel(netimr0, c2dev->regs + C2_NIMR0);
+
+	/* Tell the stack to ignore arp requests for ipaddrs bound to
+	 * other interfaces.  This is needed to prevent the host stack
+	 * from responding to arp requests to the ipaddr bound on the
+	 * rdma interface.
+	 */
+	in_dev = in_dev_get(netdev);
+	IN_DEV_CONF_SET(in_dev, ARP_IGNORE, 1);
+	in_dev_put(in_dev);
+
+	return 0;
+
+      bail1:
+	c2_rx_clean(c2_port);
+	kfree(c2_port->rx_ring.start);
+
+      bail0:
+	pci_free_consistent(c2dev->pcidev, c2_port->mem_size, c2_port->mem,
+			    c2_port->dma);
+
+	return ret;
+}
+
+static int c2_down(struct net_device *netdev)
+{
+	struct c2_port *c2_port = netdev_priv(netdev);
+	struct c2_dev *c2dev = c2_port->c2dev;
+
+	if (netif_msg_ifdown(c2_port))
+		pr_debug("%s: disabling interface\n",
+			netdev->name);
+
+	/* Wait for all the queued packets to get sent */
+	c2_tx_interrupt(netdev);
+
+	/* Disable network packets */
+	netif_stop_queue(netdev);
+
+	/* Disable IRQs by clearing the interrupt mask */
+	writel(1, c2dev->regs + C2_IDIS);
+	writel(0, c2dev->regs + C2_NIMR0);
+
+	/* missing: Stop transmitter */
+
+	/* missing: Stop receiver */
+
+	/* Reset the adapter, ensures the driver is in sync with the RXP */
+	c2_reset(c2_port);
+
+	/* missing: Turn off LEDs here */
+
+	/* Free all buffers in the host descriptor rings */
+	c2_tx_clean(c2_port);
+	c2_rx_clean(c2_port);
+
+	/* Free the host descriptor rings */
+	kfree(c2_port->rx_ring.start);
+	kfree(c2_port->tx_ring.start);
+	pci_free_consistent(c2dev->pcidev, c2_port->mem_size, c2_port->mem,
+			    c2_port->dma);
+
+	return 0;
+}
+
+static void c2_reset(struct c2_port *c2_port)
+{
+	struct c2_dev *c2dev = c2_port->c2dev;
+	unsigned int cur_rx = c2dev->cur_rx;
+
+	/* Tell the hardware to quiesce */
+	C2_SET_CUR_RX(c2dev, cur_rx | C2_PCI_HRX_QUI);
+
+	/*
+	 * The hardware will reset the C2_PCI_HRX_QUI bit once
+	 * the RXP is quiesced.  Wait 2 seconds for this.
+	 */
+	ssleep(2);
+
+	cur_rx = C2_GET_CUR_RX(c2dev);
+
+	if (cur_rx & C2_PCI_HRX_QUI)
+		pr_debug("c2_reset: failed to quiesce the hardware!\n");
+
+	cur_rx &= ~C2_PCI_HRX_QUI;
+
+	c2dev->cur_rx = cur_rx;
+
+	pr_debug("Current RX: %u\n", c2dev->cur_rx);
+}
+
+static int c2_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
+{
+	struct c2_port *c2_port = netdev_priv(netdev);
+	struct c2_dev *c2dev = c2_port->c2dev;
+	struct c2_ring *tx_ring = &c2_port->tx_ring;
+	struct c2_element *elem;
+	dma_addr_t mapaddr;
+	u32 maplen;
+	unsigned long flags;
+	unsigned int i;
+
+	spin_lock_irqsave(&c2_port->tx_lock, flags);
+
+	if (unlikely(c2_port->tx_avail < (skb_shinfo(skb)->nr_frags + 1))) {
+		netif_stop_queue(netdev);
+		spin_unlock_irqrestore(&c2_port->tx_lock, flags);
+
+		pr_debug("%s: Tx ring full when queue awake!\n",
+			netdev->name);
+		return NETDEV_TX_BUSY;
+	}
+
+	maplen = skb_headlen(skb);
+	mapaddr =
+	    pci_map_single(c2dev->pcidev, skb->data, maplen, PCI_DMA_TODEVICE);
+
+	elem = tx_ring->to_use;
+	elem->skb = skb;
+	elem->mapaddr = mapaddr;
+	elem->maplen = maplen;
+
+	/* Tell HW to xmit */
+	__raw_writeq((__force u64) cpu_to_be64(mapaddr),
+		     elem->hw_desc + C2_TXP_ADDR);
+	__raw_writew((__force u16) cpu_to_be16(maplen),
+		     elem->hw_desc + C2_TXP_LEN);
+	__raw_writew((__force u16) cpu_to_be16(TXP_HTXD_READY),
+		     elem->hw_desc + C2_TXP_FLAGS);
+
+	netdev->stats.tx_packets++;
+	netdev->stats.tx_bytes += maplen;
+
+	/* Loop thru additional data fragments and queue them */
+	if (skb_shinfo(skb)->nr_frags) {
+		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+			const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+			maplen = skb_frag_size(frag);
+			mapaddr = skb_frag_dma_map(&c2dev->pcidev->dev, frag,
+						   0, maplen, DMA_TO_DEVICE);
+			elem = elem->next;
+			elem->skb = NULL;
+			elem->mapaddr = mapaddr;
+			elem->maplen = maplen;
+
+			/* Tell HW to xmit */
+			__raw_writeq((__force u64) cpu_to_be64(mapaddr),
+				     elem->hw_desc + C2_TXP_ADDR);
+			__raw_writew((__force u16) cpu_to_be16(maplen),
+				     elem->hw_desc + C2_TXP_LEN);
+			__raw_writew((__force u16) cpu_to_be16(TXP_HTXD_READY),
+				     elem->hw_desc + C2_TXP_FLAGS);
+
+			netdev->stats.tx_packets++;
+			netdev->stats.tx_bytes += maplen;
+		}
+	}
+
+	tx_ring->to_use = elem->next;
+	c2_port->tx_avail -= (skb_shinfo(skb)->nr_frags + 1);
+
+	if (c2_port->tx_avail <= MAX_SKB_FRAGS + 1) {
+		netif_stop_queue(netdev);
+		if (netif_msg_tx_queued(c2_port))
+			pr_debug("%s: transmit queue full\n",
+				netdev->name);
+	}
+
+	spin_unlock_irqrestore(&c2_port->tx_lock, flags);
+
+	netdev->trans_start = jiffies;
+
+	return NETDEV_TX_OK;
+}
+
+static void c2_tx_timeout(struct net_device *netdev)
+{
+	struct c2_port *c2_port = netdev_priv(netdev);
+
+	if (netif_msg_timer(c2_port))
+		pr_debug("%s: tx timeout\n", netdev->name);
+
+	c2_tx_clean(c2_port);
+}
+
+static int c2_change_mtu(struct net_device *netdev, int new_mtu)
+{
+	int ret = 0;
+
+	if (new_mtu < ETH_ZLEN || new_mtu > ETH_JUMBO_MTU)
+		return -EINVAL;
+
+	netdev->mtu = new_mtu;
+
+	if (netif_running(netdev)) {
+		c2_down(netdev);
+
+		c2_up(netdev);
+	}
+
+	return ret;
+}
+
+static const struct net_device_ops c2_netdev = {
+	.ndo_open 		= c2_up,
+	.ndo_stop 		= c2_down,
+	.ndo_start_xmit		= c2_xmit_frame,
+	.ndo_tx_timeout		= c2_tx_timeout,
+	.ndo_change_mtu		= c2_change_mtu,
+	.ndo_set_mac_address 	= eth_mac_addr,
+	.ndo_validate_addr	= eth_validate_addr,
+};
+
+/* Initialize network device */
+static struct net_device *c2_devinit(struct c2_dev *c2dev,
+				     void __iomem * mmio_addr)
+{
+	struct c2_port *c2_port = NULL;
+	struct net_device *netdev = alloc_etherdev(sizeof(*c2_port));
+
+	if (!netdev) {
+		pr_debug("c2_port etherdev alloc failed");
+		return NULL;
+	}
+
+	SET_NETDEV_DEV(netdev, &c2dev->pcidev->dev);
+
+	netdev->netdev_ops = &c2_netdev;
+	netdev->watchdog_timeo = C2_TX_TIMEOUT;
+	netdev->irq = c2dev->pcidev->irq;
+
+	c2_port = netdev_priv(netdev);
+	c2_port->netdev = netdev;
+	c2_port->c2dev = c2dev;
+	c2_port->msg_enable = netif_msg_init(debug, default_msg);
+	c2_port->tx_ring.count = C2_NUM_TX_DESC;
+	c2_port->rx_ring.count = C2_NUM_RX_DESC;
+
+	spin_lock_init(&c2_port->tx_lock);
+
+	/* Copy our 48-bit ethernet hardware address */
+	memcpy_fromio(netdev->dev_addr, mmio_addr + C2_REGS_ENADDR, 6);
+
+	/* Validate the MAC address */
+	if (!is_valid_ether_addr(netdev->dev_addr)) {
+		pr_debug("Invalid MAC Address\n");
+		c2_print_macaddr(netdev);
+		free_netdev(netdev);
+		return NULL;
+	}
+
+	c2dev->netdev = netdev;
+
+	return netdev;
+}
+
+static int c2_probe(struct pci_dev *pcidev, const struct pci_device_id *ent)
+{
+	int ret = 0, i;
+	unsigned long reg0_start, reg0_flags, reg0_len;
+	unsigned long reg2_start, reg2_flags, reg2_len;
+	unsigned long reg4_start, reg4_flags, reg4_len;
+	unsigned kva_map_size;
+	struct net_device *netdev = NULL;
+	struct c2_dev *c2dev = NULL;
+	void __iomem *mmio_regs = NULL;
+
+	printk(KERN_INFO PFX "AMSO1100 Gigabit Ethernet driver v%s loaded\n",
+		DRV_VERSION);
+
+	/* Enable PCI device */
+	ret = pci_enable_device(pcidev);
+	if (ret) {
+		printk(KERN_ERR PFX "%s: Unable to enable PCI device\n",
+			pci_name(pcidev));
+		goto bail0;
+	}
+
+	reg0_start = pci_resource_start(pcidev, BAR_0);
+	reg0_len = pci_resource_len(pcidev, BAR_0);
+	reg0_flags = pci_resource_flags(pcidev, BAR_0);
+
+	reg2_start = pci_resource_start(pcidev, BAR_2);
+	reg2_len = pci_resource_len(pcidev, BAR_2);
+	reg2_flags = pci_resource_flags(pcidev, BAR_2);
+
+	reg4_start = pci_resource_start(pcidev, BAR_4);
+	reg4_len = pci_resource_len(pcidev, BAR_4);
+	reg4_flags = pci_resource_flags(pcidev, BAR_4);
+
+	pr_debug("BAR0 size = 0x%lX bytes\n", reg0_len);
+	pr_debug("BAR2 size = 0x%lX bytes\n", reg2_len);
+	pr_debug("BAR4 size = 0x%lX bytes\n", reg4_len);
+
+	/* Make sure PCI base addr are MMIO */
+	if (!(reg0_flags & IORESOURCE_MEM) ||
+	    !(reg2_flags & IORESOURCE_MEM) || !(reg4_flags & IORESOURCE_MEM)) {
+		printk(KERN_ERR PFX "PCI regions not an MMIO resource\n");
+		ret = -ENODEV;
+		goto bail1;
+	}
+
+	/* Check for weird/broken PCI region reporting */
+	if ((reg0_len < C2_REG0_SIZE) ||
+	    (reg2_len < C2_REG2_SIZE) || (reg4_len < C2_REG4_SIZE)) {
+		printk(KERN_ERR PFX "Invalid PCI region sizes\n");
+		ret = -ENODEV;
+		goto bail1;
+	}
+
+	/* Reserve PCI I/O and memory resources */
+	ret = pci_request_regions(pcidev, DRV_NAME);
+	if (ret) {
+		printk(KERN_ERR PFX "%s: Unable to request regions\n",
+			pci_name(pcidev));
+		goto bail1;
+	}
+
+	if ((sizeof(dma_addr_t) > 4)) {
+		ret = pci_set_dma_mask(pcidev, DMA_BIT_MASK(64));
+		if (ret < 0) {
+			printk(KERN_ERR PFX "64b DMA configuration failed\n");
+			goto bail2;
+		}
+	} else {
+		ret = pci_set_dma_mask(pcidev, DMA_BIT_MASK(32));
+		if (ret < 0) {
+			printk(KERN_ERR PFX "32b DMA configuration failed\n");
+			goto bail2;
+		}
+	}
+
+	/* Enables bus-mastering on the device */
+	pci_set_master(pcidev);
+
+	/* Remap the adapter PCI registers in BAR4 */
+	mmio_regs = ioremap_nocache(reg4_start + C2_PCI_REGS_OFFSET,
+				    sizeof(struct c2_adapter_pci_regs));
+	if (!mmio_regs) {
+		printk(KERN_ERR PFX
+			"Unable to remap adapter PCI registers in BAR4\n");
+		ret = -EIO;
+		goto bail2;
+	}
+
+	/* Validate PCI regs magic */
+	for (i = 0; i < sizeof(c2_magic); i++) {
+		if (c2_magic[i] != readb(mmio_regs + C2_REGS_MAGIC + i)) {
+			printk(KERN_ERR PFX "Downlevel Firmware boot loader "
+				"[%d/%Zd: got 0x%x, exp 0x%x]. Use the cc_flash "
+			       "utility to update your boot loader\n",
+				i + 1, sizeof(c2_magic),
+				readb(mmio_regs + C2_REGS_MAGIC + i),
+				c2_magic[i]);
+			printk(KERN_ERR PFX "Adapter not claimed\n");
+			iounmap(mmio_regs);
+			ret = -EIO;
+			goto bail2;
+		}
+	}
+
+	/* Validate the adapter version */
+	if (be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_VERS)) != C2_VERSION) {
+		printk(KERN_ERR PFX "Version mismatch "
+			"[fw=%u, c2=%u], Adapter not claimed\n",
+			be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_VERS)),
+			C2_VERSION);
+		ret = -EINVAL;
+		iounmap(mmio_regs);
+		goto bail2;
+	}
+
+	/* Validate the adapter IVN */
+	if (be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_IVN)) != C2_IVN) {
+		printk(KERN_ERR PFX "Downlevel FIrmware level. You should be using "
+		       "the OpenIB device support kit. "
+		       "[fw=0x%x, c2=0x%x], Adapter not claimed\n",
+		       be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_IVN)),
+		       C2_IVN);
+		ret = -EINVAL;
+		iounmap(mmio_regs);
+		goto bail2;
+	}
+
+	/* Allocate hardware structure */
+	c2dev = (struct c2_dev *) ib_alloc_device(sizeof(*c2dev));
+	if (!c2dev) {
+		printk(KERN_ERR PFX "%s: Unable to alloc hardware struct\n",
+			pci_name(pcidev));
+		ret = -ENOMEM;
+		iounmap(mmio_regs);
+		goto bail2;
+	}
+
+	memset(c2dev, 0, sizeof(*c2dev));
+	spin_lock_init(&c2dev->lock);
+	c2dev->pcidev = pcidev;
+	c2dev->cur_tx = 0;
+
+	/* Get the last RX index */
+	c2dev->cur_rx =
+	    (be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_HRX_CUR)) -
+	     0xffffc000) / sizeof(struct c2_rxp_desc);
+
+	/* Request an interrupt line for the driver */
+	ret = request_irq(pcidev->irq, c2_interrupt, IRQF_SHARED, DRV_NAME, c2dev);
+	if (ret) {
+		printk(KERN_ERR PFX "%s: requested IRQ %u is busy\n",
+			pci_name(pcidev), pcidev->irq);
+		iounmap(mmio_regs);
+		goto bail3;
+	}
+
+	/* Set driver specific data */
+	pci_set_drvdata(pcidev, c2dev);
+
+	/* Initialize network device */
+	if ((netdev = c2_devinit(c2dev, mmio_regs)) == NULL) {
+		ret = -ENOMEM;
+		iounmap(mmio_regs);
+		goto bail4;
+	}
+
+	/* Save off the actual size prior to unmapping mmio_regs */
+	kva_map_size = be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_PCI_WINSIZE));
+
+	/* Unmap the adapter PCI registers in BAR4 */
+	iounmap(mmio_regs);
+
+	/* Register network device */
+	ret = register_netdev(netdev);
+	if (ret) {
+		printk(KERN_ERR PFX "Unable to register netdev, ret = %d\n",
+			ret);
+		goto bail5;
+	}
+
+	/* Disable network packets */
+	netif_stop_queue(netdev);
+
+	/* Remap the adapter HRXDQ PA space to kernel VA space */
+	c2dev->mmio_rxp_ring = ioremap_nocache(reg4_start + C2_RXP_HRXDQ_OFFSET,
+					       C2_RXP_HRXDQ_SIZE);
+	if (!c2dev->mmio_rxp_ring) {
+		printk(KERN_ERR PFX "Unable to remap MMIO HRXDQ region\n");
+		ret = -EIO;
+		goto bail6;
+	}
+
+	/* Remap the adapter HTXDQ PA space to kernel VA space */
+	c2dev->mmio_txp_ring = ioremap_nocache(reg4_start + C2_TXP_HTXDQ_OFFSET,
+					       C2_TXP_HTXDQ_SIZE);
+	if (!c2dev->mmio_txp_ring) {
+		printk(KERN_ERR PFX "Unable to remap MMIO HTXDQ region\n");
+		ret = -EIO;
+		goto bail7;
+	}
+
+	/* Save off the current RX index in the last 4 bytes of the TXP Ring */
+	C2_SET_CUR_RX(c2dev, c2dev->cur_rx);
+
+	/* Remap the PCI registers in adapter BAR0 to kernel VA space */
+	c2dev->regs = ioremap_nocache(reg0_start, reg0_len);
+	if (!c2dev->regs) {
+		printk(KERN_ERR PFX "Unable to remap BAR0\n");
+		ret = -EIO;
+		goto bail8;
+	}
+
+	/* Remap the PCI registers in adapter BAR4 to kernel VA space */
+	c2dev->pa = reg4_start + C2_PCI_REGS_OFFSET;
+	c2dev->kva = ioremap_nocache(reg4_start + C2_PCI_REGS_OFFSET,
+				     kva_map_size);
+	if (!c2dev->kva) {
+		printk(KERN_ERR PFX "Unable to remap BAR4\n");
+		ret = -EIO;
+		goto bail9;
+	}
+
+	/* Print out the MAC address */
+	c2_print_macaddr(netdev);
+
+	ret = c2_rnic_init(c2dev);
+	if (ret) {
+		printk(KERN_ERR PFX "c2_rnic_init failed: %d\n", ret);
+		goto bail10;
+	}
+
+	ret = c2_register_device(c2dev);
+	if (ret)
+		goto bail10;
+
+	return 0;
+
+ bail10:
+	iounmap(c2dev->kva);
+
+ bail9:
+	iounmap(c2dev->regs);
+
+ bail8:
+	iounmap(c2dev->mmio_txp_ring);
+
+ bail7:
+	iounmap(c2dev->mmio_rxp_ring);
+
+ bail6:
+	unregister_netdev(netdev);
+
+ bail5:
+	free_netdev(netdev);
+
+ bail4:
+	free_irq(pcidev->irq, c2dev);
+
+ bail3:
+	ib_dealloc_device(&c2dev->ibdev);
+
+ bail2:
+	pci_release_regions(pcidev);
+
+ bail1:
+	pci_disable_device(pcidev);
+
+ bail0:
+	return ret;
+}
+
+static void c2_remove(struct pci_dev *pcidev)
+{
+	struct c2_dev *c2dev = pci_get_drvdata(pcidev);
+	struct net_device *netdev = c2dev->netdev;
+
+	/* Unregister with OpenIB */
+	c2_unregister_device(c2dev);
+
+	/* Clean up the RNIC resources */
+	c2_rnic_term(c2dev);
+
+	/* Remove network device from the kernel */
+	unregister_netdev(netdev);
+
+	/* Free network device */
+	free_netdev(netdev);
+
+	/* Free the interrupt line */
+	free_irq(pcidev->irq, c2dev);
+
+	/* missing: Turn LEDs off here */
+
+	/* Unmap adapter PA space */
+	iounmap(c2dev->kva);
+	iounmap(c2dev->regs);
+	iounmap(c2dev->mmio_txp_ring);
+	iounmap(c2dev->mmio_rxp_ring);
+
+	/* Free the hardware structure */
+	ib_dealloc_device(&c2dev->ibdev);
+
+	/* Release reserved PCI I/O and memory resources */
+	pci_release_regions(pcidev);
+
+	/* Disable PCI device */
+	pci_disable_device(pcidev);
+
+	/* Clear driver specific data */
+	pci_set_drvdata(pcidev, NULL);
+}
+
+static struct pci_driver c2_pci_driver = {
+	.name = DRV_NAME,
+	.id_table = c2_pci_table,
+	.probe = c2_probe,
+	.remove = c2_remove,
+};
+
+module_pci_driver(c2_pci_driver);
diff --git a/drivers/infiniband/hw/amso1100/c2.h b/drivers/infiniband/hw/amso1100/c2.h
new file mode 100644
index 0000000..d619d73
--- /dev/null
+++ b/drivers/infiniband/hw/amso1100/c2.h
@@ -0,0 +1,547 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __C2_H
+#define __C2_H
+
+#include <linux/netdevice.h>
+#include <linux/spinlock.h>
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <linux/idr.h>
+
+#include "c2_provider.h"
+#include "c2_mq.h"
+#include "c2_status.h"
+
+#define DRV_NAME     "c2"
+#define DRV_VERSION  "1.1"
+#define PFX          DRV_NAME ": "
+
+#define BAR_0                0
+#define BAR_2                2
+#define BAR_4                4
+
+#define RX_BUF_SIZE         (1536 + 8)
+#define ETH_JUMBO_MTU        9000
+#define C2_MAGIC            "CEPHEUS"
+#define C2_VERSION           4
+#define C2_IVN              (18 & 0x7fffffff)
+
+#define C2_REG0_SIZE        (16 * 1024)
+#define C2_REG2_SIZE        (2 * 1024 * 1024)
+#define C2_REG4_SIZE        (256 * 1024 * 1024)
+#define C2_NUM_TX_DESC       341
+#define C2_NUM_RX_DESC       256
+#define C2_PCI_REGS_OFFSET  (0x10000)
+#define C2_RXP_HRXDQ_OFFSET (((C2_REG4_SIZE)/2))
+#define C2_RXP_HRXDQ_SIZE   (4096)
+#define C2_TXP_HTXDQ_OFFSET (((C2_REG4_SIZE)/2) + C2_RXP_HRXDQ_SIZE)
+#define C2_TXP_HTXDQ_SIZE   (4096)
+#define C2_TX_TIMEOUT	    (6*HZ)
+
+/* CEPHEUS */
+static const u8 c2_magic[] = {
+	0x43, 0x45, 0x50, 0x48, 0x45, 0x55, 0x53
+};
+
+enum adapter_pci_regs {
+	C2_REGS_MAGIC = 0x0000,
+	C2_REGS_VERS = 0x0008,
+	C2_REGS_IVN = 0x000C,
+	C2_REGS_PCI_WINSIZE = 0x0010,
+	C2_REGS_Q0_QSIZE = 0x0014,
+	C2_REGS_Q0_MSGSIZE = 0x0018,
+	C2_REGS_Q0_POOLSTART = 0x001C,
+	C2_REGS_Q0_SHARED = 0x0020,
+	C2_REGS_Q1_QSIZE = 0x0024,
+	C2_REGS_Q1_MSGSIZE = 0x0028,
+	C2_REGS_Q1_SHARED = 0x0030,
+	C2_REGS_Q2_QSIZE = 0x0034,
+	C2_REGS_Q2_MSGSIZE = 0x0038,
+	C2_REGS_Q2_SHARED = 0x0040,
+	C2_REGS_ENADDR = 0x004C,
+	C2_REGS_RDMA_ENADDR = 0x0054,
+	C2_REGS_HRX_CUR = 0x006C,
+};
+
+struct c2_adapter_pci_regs {
+	char reg_magic[8];
+	u32 version;
+	u32 ivn;
+	u32 pci_window_size;
+	u32 q0_q_size;
+	u32 q0_msg_size;
+	u32 q0_pool_start;
+	u32 q0_shared;
+	u32 q1_q_size;
+	u32 q1_msg_size;
+	u32 q1_pool_start;
+	u32 q1_shared;
+	u32 q2_q_size;
+	u32 q2_msg_size;
+	u32 q2_pool_start;
+	u32 q2_shared;
+	u32 log_start;
+	u32 log_size;
+	u8 host_enaddr[8];
+	u8 rdma_enaddr[8];
+	u32 crash_entry;
+	u32 crash_ready[2];
+	u32 fw_txd_cur;
+	u32 fw_hrxd_cur;
+	u32 fw_rxd_cur;
+};
+
+enum pci_regs {
+	C2_HISR = 0x0000,
+	C2_DISR = 0x0004,
+	C2_HIMR = 0x0008,
+	C2_DIMR = 0x000C,
+	C2_NISR0 = 0x0010,
+	C2_NISR1 = 0x0014,
+	C2_NIMR0 = 0x0018,
+	C2_NIMR1 = 0x001C,
+	C2_IDIS = 0x0020,
+};
+
+enum {
+	C2_PCI_HRX_INT = 1 << 8,
+	C2_PCI_HTX_INT = 1 << 17,
+	C2_PCI_HRX_QUI = 1 << 31,
+};
+
+/*
+ * Cepheus registers in BAR0.
+ */
+struct c2_pci_regs {
+	u32 hostisr;
+	u32 dmaisr;
+	u32 hostimr;
+	u32 dmaimr;
+	u32 netisr0;
+	u32 netisr1;
+	u32 netimr0;
+	u32 netimr1;
+	u32 int_disable;
+};
+
+/* TXP flags */
+enum c2_txp_flags {
+	TXP_HTXD_DONE = 0,
+	TXP_HTXD_READY = 1 << 0,
+	TXP_HTXD_UNINIT = 1 << 1,
+};
+
+/* RXP flags */
+enum c2_rxp_flags {
+	RXP_HRXD_UNINIT = 0,
+	RXP_HRXD_READY = 1 << 0,
+	RXP_HRXD_DONE = 1 << 1,
+};
+
+/* RXP status */
+enum c2_rxp_status {
+	RXP_HRXD_ZERO = 0,
+	RXP_HRXD_OK = 1 << 0,
+	RXP_HRXD_BUF_OV = 1 << 1,
+};
+
+/* TXP descriptor fields */
+enum txp_desc {
+	C2_TXP_FLAGS = 0x0000,
+	C2_TXP_LEN = 0x0002,
+	C2_TXP_ADDR = 0x0004,
+};
+
+/* RXP descriptor fields */
+enum rxp_desc {
+	C2_RXP_FLAGS = 0x0000,
+	C2_RXP_STATUS = 0x0002,
+	C2_RXP_COUNT = 0x0004,
+	C2_RXP_LEN = 0x0006,
+	C2_RXP_ADDR = 0x0008,
+};
+
+struct c2_txp_desc {
+	u16 flags;
+	u16 len;
+	u64 addr;
+} __attribute__ ((packed));
+
+struct c2_rxp_desc {
+	u16 flags;
+	u16 status;
+	u16 count;
+	u16 len;
+	u64 addr;
+} __attribute__ ((packed));
+
+struct c2_rxp_hdr {
+	u16 flags;
+	u16 status;
+	u16 len;
+	u16 rsvd;
+} __attribute__ ((packed));
+
+struct c2_tx_desc {
+	u32 len;
+	u32 status;
+	dma_addr_t next_offset;
+};
+
+struct c2_rx_desc {
+	u32 len;
+	u32 status;
+	dma_addr_t next_offset;
+};
+
+struct c2_alloc {
+	u32 last;
+	u32 max;
+	spinlock_t lock;
+	unsigned long *table;
+};
+
+struct c2_array {
+	struct {
+		void **page;
+		int used;
+	} *page_list;
+};
+
+/*
+ * The MQ shared pointer pool is organized as a linked list of
+ * chunks. Each chunk contains a linked list of free shared pointers
+ * that can be allocated to a given user mode client.
+ *
+ */
+struct sp_chunk {
+	struct sp_chunk *next;
+	dma_addr_t dma_addr;
+	DEFINE_DMA_UNMAP_ADDR(mapping);
+	u16 head;
+	u16 shared_ptr[0];
+};
+
+struct c2_pd_table {
+	u32 last;
+	u32 max;
+	spinlock_t lock;
+	unsigned long *table;
+};
+
+struct c2_qp_table {
+	struct idr idr;
+	spinlock_t lock;
+};
+
+struct c2_element {
+	struct c2_element *next;
+	void *ht_desc;		/* host     descriptor */
+	void __iomem *hw_desc;	/* hardware descriptor */
+	struct sk_buff *skb;
+	dma_addr_t mapaddr;
+	u32 maplen;
+};
+
+struct c2_ring {
+	struct c2_element *to_clean;
+	struct c2_element *to_use;
+	struct c2_element *start;
+	unsigned long count;
+};
+
+struct c2_dev {
+	struct ib_device ibdev;
+	void __iomem *regs;
+	void __iomem *mmio_txp_ring; /* remapped adapter memory for hw rings */
+	void __iomem *mmio_rxp_ring;
+	spinlock_t lock;
+	struct pci_dev *pcidev;
+	struct net_device *netdev;
+	struct net_device *pseudo_netdev;
+	unsigned int cur_tx;
+	unsigned int cur_rx;
+	u32 adapter_handle;
+	int device_cap_flags;
+	void __iomem *kva;	/* KVA device memory */
+	unsigned long pa;	/* PA device memory */
+	void **qptr_array;
+
+	struct kmem_cache *host_msg_cache;
+
+	struct list_head cca_link;		/* adapter list */
+	struct list_head eh_wakeup_list;	/* event wakeup list */
+	wait_queue_head_t req_vq_wo;
+
+	/* Cached RNIC properties */
+	struct ib_device_attr props;
+
+	struct c2_pd_table pd_table;
+	struct c2_qp_table qp_table;
+	int ports;		/* num of GigE ports */
+	int devnum;
+	spinlock_t vqlock;	/* sync vbs req MQ */
+
+	/* Verbs Queues */
+	struct c2_mq req_vq;	/* Verbs Request MQ */
+	struct c2_mq rep_vq;	/* Verbs Reply MQ */
+	struct c2_mq aeq;	/* Async Events MQ */
+
+	/* Kernel client MQs */
+	struct sp_chunk *kern_mqsp_pool;
+
+	/* Device updates these values when posting messages to a host
+	 * target queue */
+	u16 req_vq_shared;
+	u16 rep_vq_shared;
+	u16 aeq_shared;
+	u16 irq_claimed;
+
+	/*
+	 * Shared host target pages for user-accessible MQs.
+	 */
+	int hthead;		/* index of first free entry */
+	void *htpages;		/* kernel vaddr */
+	int htlen;		/* length of htpages memory */
+	void *htuva;		/* user mapped vaddr */
+	spinlock_t htlock;	/* serialize allocation */
+
+	u64 adapter_hint_uva;	/* access to the activity FIFO */
+
+	//	spinlock_t aeq_lock;
+	//	spinlock_t rnic_lock;
+
+	__be16 *hint_count;
+	dma_addr_t hint_count_dma;
+	u16 hints_read;
+
+	int init;		/* TRUE if it's ready */
+	char ae_cache_name[16];
+	char vq_cache_name[16];
+};
+
+struct c2_port {
+	u32 msg_enable;
+	struct c2_dev *c2dev;
+	struct net_device *netdev;
+
+	spinlock_t tx_lock;
+	u32 tx_avail;
+	struct c2_ring tx_ring;
+	struct c2_ring rx_ring;
+
+	void *mem;		/* PCI memory for host rings */
+	dma_addr_t dma;
+	unsigned long mem_size;
+
+	u32 rx_buf_size;
+};
+
+/*
+ * Activity FIFO registers in BAR0.
+ */
+#define PCI_BAR0_HOST_HINT	0x100
+#define PCI_BAR0_ADAPTER_HINT	0x2000
+
+/*
+ * Ammasso PCI vendor id and Cepheus PCI device id.
+ */
+#define CQ_ARMED 	0x01
+#define CQ_WAIT_FOR_DMA	0x80
+
+/*
+ * The format of a hint is as follows:
+ * Lower 16 bits are the count of hints for the queue.
+ * Next 15 bits are the qp_index
+ * Upper most bit depends on who reads it:
+ *    If read by producer, then it means Full (1) or Not-Full (0)
+ *    If read by consumer, then it means Empty (1) or Not-Empty (0)
+ */
+#define C2_HINT_MAKE(q_index, hint_count) (((q_index) << 16) | hint_count)
+#define C2_HINT_GET_INDEX(hint) (((hint) & 0x7FFF0000) >> 16)
+#define C2_HINT_GET_COUNT(hint) ((hint) & 0x0000FFFF)
+
+
+/*
+ * The following defines the offset in SDRAM for the c2_adapter_pci_regs_t
+ * struct.
+ */
+#define C2_ADAPTER_PCI_REGS_OFFSET 0x10000
+
+#ifndef readq
+static inline u64 readq(const void __iomem * addr)
+{
+	u64 ret = readl(addr + 4);
+	ret <<= 32;
+	ret |= readl(addr);
+
+	return ret;
+}
+#endif
+
+#ifndef writeq
+static inline void __raw_writeq(u64 val, void __iomem * addr)
+{
+	__raw_writel((u32) (val), addr);
+	__raw_writel((u32) (val >> 32), (addr + 4));
+}
+#endif
+
+#define C2_SET_CUR_RX(c2dev, cur_rx) \
+	__raw_writel((__force u32) cpu_to_be32(cur_rx), c2dev->mmio_txp_ring + 4092)
+
+#define C2_GET_CUR_RX(c2dev) \
+	be32_to_cpu((__force __be32) readl(c2dev->mmio_txp_ring + 4092))
+
+static inline struct c2_dev *to_c2dev(struct ib_device *ibdev)
+{
+	return container_of(ibdev, struct c2_dev, ibdev);
+}
+
+static inline int c2_errno(void *reply)
+{
+	switch (c2_wr_get_result(reply)) {
+	case C2_OK:
+		return 0;
+	case CCERR_NO_BUFS:
+	case CCERR_INSUFFICIENT_RESOURCES:
+	case CCERR_ZERO_RDMA_READ_RESOURCES:
+		return -ENOMEM;
+	case CCERR_MR_IN_USE:
+	case CCERR_QP_IN_USE:
+		return -EBUSY;
+	case CCERR_ADDR_IN_USE:
+		return -EADDRINUSE;
+	case CCERR_ADDR_NOT_AVAIL:
+		return -EADDRNOTAVAIL;
+	case CCERR_CONN_RESET:
+		return -ECONNRESET;
+	case CCERR_NOT_IMPLEMENTED:
+	case CCERR_INVALID_WQE:
+		return -ENOSYS;
+	case CCERR_QP_NOT_PRIVILEGED:
+		return -EPERM;
+	case CCERR_STACK_ERROR:
+		return -EPROTO;
+	case CCERR_ACCESS_VIOLATION:
+	case CCERR_BASE_AND_BOUNDS_VIOLATION:
+		return -EFAULT;
+	case CCERR_STAG_STATE_NOT_INVALID:
+	case CCERR_INVALID_ADDRESS:
+	case CCERR_INVALID_CQ:
+	case CCERR_INVALID_EP:
+	case CCERR_INVALID_MODIFIER:
+	case CCERR_INVALID_MTU:
+	case CCERR_INVALID_PD_ID:
+	case CCERR_INVALID_QP:
+	case CCERR_INVALID_RNIC:
+	case CCERR_INVALID_STAG:
+		return -EINVAL;
+	default:
+		return -EAGAIN;
+	}
+}
+
+/* Device */
+extern int c2_register_device(struct c2_dev *c2dev);
+extern void c2_unregister_device(struct c2_dev *c2dev);
+extern int c2_rnic_init(struct c2_dev *c2dev);
+extern void c2_rnic_term(struct c2_dev *c2dev);
+extern void c2_rnic_interrupt(struct c2_dev *c2dev);
+extern int c2_del_addr(struct c2_dev *c2dev, __be32 inaddr, __be32 inmask);
+extern int c2_add_addr(struct c2_dev *c2dev, __be32 inaddr, __be32 inmask);
+
+/* QPs */
+extern int c2_alloc_qp(struct c2_dev *c2dev, struct c2_pd *pd,
+		       struct ib_qp_init_attr *qp_attrs, struct c2_qp *qp);
+extern void c2_free_qp(struct c2_dev *c2dev, struct c2_qp *qp);
+extern struct ib_qp *c2_get_qp(struct ib_device *device, int qpn);
+extern int c2_qp_modify(struct c2_dev *c2dev, struct c2_qp *qp,
+			struct ib_qp_attr *attr, int attr_mask);
+extern int c2_qp_set_read_limits(struct c2_dev *c2dev, struct c2_qp *qp,
+				 int ord, int ird);
+extern int c2_post_send(struct ib_qp *ibqp, struct ib_send_wr *ib_wr,
+			struct ib_send_wr **bad_wr);
+extern int c2_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *ib_wr,
+			   struct ib_recv_wr **bad_wr);
+extern void c2_init_qp_table(struct c2_dev *c2dev);
+extern void c2_cleanup_qp_table(struct c2_dev *c2dev);
+extern void c2_set_qp_state(struct c2_qp *, int);
+extern struct c2_qp *c2_find_qpn(struct c2_dev *c2dev, int qpn);
+
+/* PDs */
+extern int c2_pd_alloc(struct c2_dev *c2dev, int privileged, struct c2_pd *pd);
+extern void c2_pd_free(struct c2_dev *c2dev, struct c2_pd *pd);
+extern int c2_init_pd_table(struct c2_dev *c2dev);
+extern void c2_cleanup_pd_table(struct c2_dev *c2dev);
+
+/* CQs */
+extern int c2_init_cq(struct c2_dev *c2dev, int entries,
+		      struct c2_ucontext *ctx, struct c2_cq *cq);
+extern void c2_free_cq(struct c2_dev *c2dev, struct c2_cq *cq);
+extern void c2_cq_event(struct c2_dev *c2dev, u32 mq_index);
+extern void c2_cq_clean(struct c2_dev *c2dev, struct c2_qp *qp, u32 mq_index);
+extern int c2_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry);
+extern int c2_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags);
+
+/* CM */
+extern int c2_llp_connect(struct iw_cm_id *cm_id,
+			  struct iw_cm_conn_param *iw_param);
+extern int c2_llp_accept(struct iw_cm_id *cm_id,
+			 struct iw_cm_conn_param *iw_param);
+extern int c2_llp_reject(struct iw_cm_id *cm_id, const void *pdata,
+			 u8 pdata_len);
+extern int c2_llp_service_create(struct iw_cm_id *cm_id, int backlog);
+extern int c2_llp_service_destroy(struct iw_cm_id *cm_id);
+
+/* MM */
+extern int c2_nsmr_register_phys_kern(struct c2_dev *c2dev, u64 *addr_list,
+ 				      int page_size, int pbl_depth, u32 length,
+ 				      u32 off, u64 *va, enum c2_acf acf,
+				      struct c2_mr *mr);
+extern int c2_stag_dealloc(struct c2_dev *c2dev, u32 stag_index);
+
+/* AE */
+extern void c2_ae_event(struct c2_dev *c2dev, u32 mq_index);
+
+/* MQSP Allocator */
+extern int c2_init_mqsp_pool(struct c2_dev *c2dev, gfp_t gfp_mask,
+			     struct sp_chunk **root);
+extern void c2_free_mqsp_pool(struct c2_dev *c2dev, struct sp_chunk *root);
+extern __be16 *c2_alloc_mqsp(struct c2_dev *c2dev, struct sp_chunk *head,
+			     dma_addr_t *dma_addr, gfp_t gfp_mask);
+extern void c2_free_mqsp(__be16* mqsp);
+#endif
diff --git a/drivers/infiniband/hw/amso1100/c2_ae.c b/drivers/infiniband/hw/amso1100/c2_ae.c
new file mode 100644
index 0000000..cedda25
--- /dev/null
+++ b/drivers/infiniband/hw/amso1100/c2_ae.c
@@ -0,0 +1,327 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "c2.h"
+#include <rdma/iw_cm.h>
+#include "c2_status.h"
+#include "c2_ae.h"
+
+static int c2_convert_cm_status(u32 c2_status)
+{
+	switch (c2_status) {
+	case C2_CONN_STATUS_SUCCESS:
+		return 0;
+	case C2_CONN_STATUS_REJECTED:
+		return -ENETRESET;
+	case C2_CONN_STATUS_REFUSED:
+		return -ECONNREFUSED;
+	case C2_CONN_STATUS_TIMEDOUT:
+		return -ETIMEDOUT;
+	case C2_CONN_STATUS_NETUNREACH:
+		return -ENETUNREACH;
+	case C2_CONN_STATUS_HOSTUNREACH:
+		return -EHOSTUNREACH;
+	case C2_CONN_STATUS_INVALID_RNIC:
+		return -EINVAL;
+	case C2_CONN_STATUS_INVALID_QP:
+		return -EINVAL;
+	case C2_CONN_STATUS_INVALID_QP_STATE:
+		return -EINVAL;
+	case C2_CONN_STATUS_ADDR_NOT_AVAIL:
+		return -EADDRNOTAVAIL;
+	default:
+		printk(KERN_ERR PFX
+		       "%s - Unable to convert CM status: %d\n",
+		       __func__, c2_status);
+		return -EIO;
+	}
+}
+
+static const char* to_event_str(int event)
+{
+	static const char* event_str[] = {
+		"CCAE_REMOTE_SHUTDOWN",
+		"CCAE_ACTIVE_CONNECT_RESULTS",
+		"CCAE_CONNECTION_REQUEST",
+		"CCAE_LLP_CLOSE_COMPLETE",
+		"CCAE_TERMINATE_MESSAGE_RECEIVED",
+		"CCAE_LLP_CONNECTION_RESET",
+		"CCAE_LLP_CONNECTION_LOST",
+		"CCAE_LLP_SEGMENT_SIZE_INVALID",
+		"CCAE_LLP_INVALID_CRC",
+		"CCAE_LLP_BAD_FPDU",
+		"CCAE_INVALID_DDP_VERSION",
+		"CCAE_INVALID_RDMA_VERSION",
+		"CCAE_UNEXPECTED_OPCODE",
+		"CCAE_INVALID_DDP_QUEUE_NUMBER",
+		"CCAE_RDMA_READ_NOT_ENABLED",
+		"CCAE_RDMA_WRITE_NOT_ENABLED",
+		"CCAE_RDMA_READ_TOO_SMALL",
+		"CCAE_NO_L_BIT",
+		"CCAE_TAGGED_INVALID_STAG",
+		"CCAE_TAGGED_BASE_BOUNDS_VIOLATION",
+		"CCAE_TAGGED_ACCESS_RIGHTS_VIOLATION",
+		"CCAE_TAGGED_INVALID_PD",
+		"CCAE_WRAP_ERROR",
+		"CCAE_BAD_CLOSE",
+		"CCAE_BAD_LLP_CLOSE",
+		"CCAE_INVALID_MSN_RANGE",
+		"CCAE_INVALID_MSN_GAP",
+		"CCAE_IRRQ_OVERFLOW",
+		"CCAE_IRRQ_MSN_GAP",
+		"CCAE_IRRQ_MSN_RANGE",
+		"CCAE_IRRQ_INVALID_STAG",
+		"CCAE_IRRQ_BASE_BOUNDS_VIOLATION",
+		"CCAE_IRRQ_ACCESS_RIGHTS_VIOLATION",
+		"CCAE_IRRQ_INVALID_PD",
+		"CCAE_IRRQ_WRAP_ERROR",
+		"CCAE_CQ_SQ_COMPLETION_OVERFLOW",
+		"CCAE_CQ_RQ_COMPLETION_ERROR",
+		"CCAE_QP_SRQ_WQE_ERROR",
+		"CCAE_QP_LOCAL_CATASTROPHIC_ERROR",
+		"CCAE_CQ_OVERFLOW",
+		"CCAE_CQ_OPERATION_ERROR",
+		"CCAE_SRQ_LIMIT_REACHED",
+		"CCAE_QP_RQ_LIMIT_REACHED",
+		"CCAE_SRQ_CATASTROPHIC_ERROR",
+		"CCAE_RNIC_CATASTROPHIC_ERROR"
+	};
+
+	if (event < CCAE_REMOTE_SHUTDOWN ||
+	    event > CCAE_RNIC_CATASTROPHIC_ERROR)
+		return "<invalid event>";
+
+	event -= CCAE_REMOTE_SHUTDOWN;
+	return event_str[event];
+}
+
+static const char *to_qp_state_str(int state)
+{
+	switch (state) {
+	case C2_QP_STATE_IDLE:
+		return "C2_QP_STATE_IDLE";
+	case C2_QP_STATE_CONNECTING:
+		return "C2_QP_STATE_CONNECTING";
+	case C2_QP_STATE_RTS:
+		return "C2_QP_STATE_RTS";
+	case C2_QP_STATE_CLOSING:
+		return "C2_QP_STATE_CLOSING";
+	case C2_QP_STATE_TERMINATE:
+		return "C2_QP_STATE_TERMINATE";
+	case C2_QP_STATE_ERROR:
+		return "C2_QP_STATE_ERROR";
+	default:
+		return "<invalid QP state>";
+	}
+}
+
+void c2_ae_event(struct c2_dev *c2dev, u32 mq_index)
+{
+	struct c2_mq *mq = c2dev->qptr_array[mq_index];
+	union c2wr *wr;
+	void *resource_user_context;
+	struct iw_cm_event cm_event;
+	struct ib_event ib_event;
+	enum c2_resource_indicator resource_indicator;
+	enum c2_event_id event_id;
+	unsigned long flags;
+	int status;
+	struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_event.local_addr;
+	struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_event.remote_addr;
+
+	/*
+	 * retrieve the message
+	 */
+	wr = c2_mq_consume(mq);
+	if (!wr)
+		return;
+
+	memset(&ib_event, 0, sizeof(ib_event));
+	memset(&cm_event, 0, sizeof(cm_event));
+
+	event_id = c2_wr_get_id(wr);
+	resource_indicator = be32_to_cpu(wr->ae.ae_generic.resource_type);
+	resource_user_context =
+	    (void *) (unsigned long) wr->ae.ae_generic.user_context;
+
+	status = cm_event.status = c2_convert_cm_status(c2_wr_get_result(wr));
+
+	pr_debug("event received c2_dev=%p, event_id=%d, "
+		"resource_indicator=%d, user_context=%p, status = %d\n",
+		c2dev, event_id, resource_indicator, resource_user_context,
+		status);
+
+	switch (resource_indicator) {
+	case C2_RES_IND_QP:{
+
+		struct c2_qp *qp = (struct c2_qp *)resource_user_context;
+		struct iw_cm_id *cm_id = qp->cm_id;
+		struct c2wr_ae_active_connect_results *res;
+
+		if (!cm_id) {
+			pr_debug("event received, but cm_id is <nul>, qp=%p!\n",
+				qp);
+			goto ignore_it;
+		}
+		pr_debug("%s: event = %s, user_context=%llx, "
+			"resource_type=%x, "
+			"resource=%x, qp_state=%s\n",
+			__func__,
+			to_event_str(event_id),
+			(unsigned long long) wr->ae.ae_generic.user_context,
+			be32_to_cpu(wr->ae.ae_generic.resource_type),
+			be32_to_cpu(wr->ae.ae_generic.resource),
+			to_qp_state_str(be32_to_cpu(wr->ae.ae_generic.qp_state)));
+
+		c2_set_qp_state(qp, be32_to_cpu(wr->ae.ae_generic.qp_state));
+
+		switch (event_id) {
+		case CCAE_ACTIVE_CONNECT_RESULTS:
+			res = &wr->ae.ae_active_connect_results;
+			cm_event.event = IW_CM_EVENT_CONNECT_REPLY;
+			laddr->sin_addr.s_addr = res->laddr;
+			raddr->sin_addr.s_addr = res->raddr;
+			laddr->sin_port = res->lport;
+			raddr->sin_port = res->rport;
+			if (status == 0) {
+				cm_event.private_data_len =
+					be32_to_cpu(res->private_data_length);
+				cm_event.private_data = res->private_data;
+			} else {
+				spin_lock_irqsave(&qp->lock, flags);
+				if (qp->cm_id) {
+					qp->cm_id->rem_ref(qp->cm_id);
+					qp->cm_id = NULL;
+				}
+				spin_unlock_irqrestore(&qp->lock, flags);
+				cm_event.private_data_len = 0;
+				cm_event.private_data = NULL;
+			}
+			if (cm_id->event_handler)
+				cm_id->event_handler(cm_id, &cm_event);
+			break;
+		case CCAE_TERMINATE_MESSAGE_RECEIVED:
+		case CCAE_CQ_SQ_COMPLETION_OVERFLOW:
+			ib_event.device = &c2dev->ibdev;
+			ib_event.element.qp = &qp->ibqp;
+			ib_event.event = IB_EVENT_QP_REQ_ERR;
+
+			if (qp->ibqp.event_handler)
+				qp->ibqp.event_handler(&ib_event,
+						       qp->ibqp.
+						       qp_context);
+			break;
+		case CCAE_BAD_CLOSE:
+		case CCAE_LLP_CLOSE_COMPLETE:
+		case CCAE_LLP_CONNECTION_RESET:
+		case CCAE_LLP_CONNECTION_LOST:
+			BUG_ON(cm_id->event_handler==(void*)0x6b6b6b6b);
+
+			spin_lock_irqsave(&qp->lock, flags);
+			if (qp->cm_id) {
+				qp->cm_id->rem_ref(qp->cm_id);
+				qp->cm_id = NULL;
+			}
+			spin_unlock_irqrestore(&qp->lock, flags);
+			cm_event.event = IW_CM_EVENT_CLOSE;
+			cm_event.status = 0;
+			if (cm_id->event_handler)
+				cm_id->event_handler(cm_id, &cm_event);
+			break;
+		default:
+			BUG_ON(1);
+			pr_debug("%s:%d Unexpected event_id=%d on QP=%p, "
+				"CM_ID=%p\n",
+				__func__, __LINE__,
+				event_id, qp, cm_id);
+			break;
+		}
+		break;
+	}
+
+	case C2_RES_IND_EP:{
+
+		struct c2wr_ae_connection_request *req =
+			&wr->ae.ae_connection_request;
+		struct iw_cm_id *cm_id =
+			(struct iw_cm_id *)resource_user_context;
+
+		pr_debug("C2_RES_IND_EP event_id=%d\n", event_id);
+		if (event_id != CCAE_CONNECTION_REQUEST) {
+			pr_debug("%s: Invalid event_id: %d\n",
+				__func__, event_id);
+			break;
+		}
+		cm_event.event = IW_CM_EVENT_CONNECT_REQUEST;
+		cm_event.provider_data = (void*)(unsigned long)req->cr_handle;
+		laddr->sin_addr.s_addr = req->laddr;
+		raddr->sin_addr.s_addr = req->raddr;
+		laddr->sin_port = req->lport;
+		raddr->sin_port = req->rport;
+		cm_event.private_data_len =
+			be32_to_cpu(req->private_data_length);
+		cm_event.private_data = req->private_data;
+		/*
+		 * Until ird/ord negotiation via MPAv2 support is added, send
+		 * max supported values
+		 */
+		cm_event.ird = cm_event.ord = 128;
+
+		if (cm_id->event_handler)
+			cm_id->event_handler(cm_id, &cm_event);
+		break;
+	}
+
+	case C2_RES_IND_CQ:{
+		struct c2_cq *cq =
+		    (struct c2_cq *) resource_user_context;
+
+		pr_debug("IB_EVENT_CQ_ERR\n");
+		ib_event.device = &c2dev->ibdev;
+		ib_event.element.cq = &cq->ibcq;
+		ib_event.event = IB_EVENT_CQ_ERR;
+
+		if (cq->ibcq.event_handler)
+			cq->ibcq.event_handler(&ib_event,
+					       cq->ibcq.cq_context);
+		break;
+	}
+
+	default:
+		printk("Bad resource indicator = %d\n",
+		       resource_indicator);
+		break;
+	}
+
+ ignore_it:
+	c2_mq_free(mq);
+}
diff --git a/drivers/infiniband/hw/amso1100/c2_ae.h b/drivers/infiniband/hw/amso1100/c2_ae.h
new file mode 100644
index 0000000..3a065c3
--- /dev/null
+++ b/drivers/infiniband/hw/amso1100/c2_ae.h
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef _C2_AE_H_
+#define _C2_AE_H_
+
+/*
+ * WARNING: If you change this file, also bump C2_IVN_BASE
+ * in common/include/clustercore/c2_ivn.h.
+ */
+
+/*
+ * Asynchronous Event Identifiers
+ *
+ * These start at 0x80 only so it's obvious from inspection that
+ * they are not work-request statuses.  This isn't critical.
+ *
+ * NOTE: these event id's must fit in eight bits.
+ */
+enum c2_event_id {
+	CCAE_REMOTE_SHUTDOWN = 0x80,
+	CCAE_ACTIVE_CONNECT_RESULTS,
+	CCAE_CONNECTION_REQUEST,
+	CCAE_LLP_CLOSE_COMPLETE,
+	CCAE_TERMINATE_MESSAGE_RECEIVED,
+	CCAE_LLP_CONNECTION_RESET,
+	CCAE_LLP_CONNECTION_LOST,
+	CCAE_LLP_SEGMENT_SIZE_INVALID,
+	CCAE_LLP_INVALID_CRC,
+	CCAE_LLP_BAD_FPDU,
+	CCAE_INVALID_DDP_VERSION,
+	CCAE_INVALID_RDMA_VERSION,
+	CCAE_UNEXPECTED_OPCODE,
+	CCAE_INVALID_DDP_QUEUE_NUMBER,
+	CCAE_RDMA_READ_NOT_ENABLED,
+	CCAE_RDMA_WRITE_NOT_ENABLED,
+	CCAE_RDMA_READ_TOO_SMALL,
+	CCAE_NO_L_BIT,
+	CCAE_TAGGED_INVALID_STAG,
+	CCAE_TAGGED_BASE_BOUNDS_VIOLATION,
+	CCAE_TAGGED_ACCESS_RIGHTS_VIOLATION,
+	CCAE_TAGGED_INVALID_PD,
+	CCAE_WRAP_ERROR,
+	CCAE_BAD_CLOSE,
+	CCAE_BAD_LLP_CLOSE,
+	CCAE_INVALID_MSN_RANGE,
+	CCAE_INVALID_MSN_GAP,
+	CCAE_IRRQ_OVERFLOW,
+	CCAE_IRRQ_MSN_GAP,
+	CCAE_IRRQ_MSN_RANGE,
+	CCAE_IRRQ_INVALID_STAG,
+	CCAE_IRRQ_BASE_BOUNDS_VIOLATION,
+	CCAE_IRRQ_ACCESS_RIGHTS_VIOLATION,
+	CCAE_IRRQ_INVALID_PD,
+	CCAE_IRRQ_WRAP_ERROR,
+	CCAE_CQ_SQ_COMPLETION_OVERFLOW,
+	CCAE_CQ_RQ_COMPLETION_ERROR,
+	CCAE_QP_SRQ_WQE_ERROR,
+	CCAE_QP_LOCAL_CATASTROPHIC_ERROR,
+	CCAE_CQ_OVERFLOW,
+	CCAE_CQ_OPERATION_ERROR,
+	CCAE_SRQ_LIMIT_REACHED,
+	CCAE_QP_RQ_LIMIT_REACHED,
+	CCAE_SRQ_CATASTROPHIC_ERROR,
+	CCAE_RNIC_CATASTROPHIC_ERROR
+/* WARNING If you add more id's, make sure their values fit in eight bits. */
+};
+
+/*
+ * Resource Indicators and Identifiers
+ */
+enum c2_resource_indicator {
+	C2_RES_IND_QP = 1,
+	C2_RES_IND_EP,
+	C2_RES_IND_CQ,
+	C2_RES_IND_SRQ,
+};
+
+#endif /* _C2_AE_H_ */
diff --git a/drivers/infiniband/hw/amso1100/c2_alloc.c b/drivers/infiniband/hw/amso1100/c2_alloc.c
new file mode 100644
index 0000000..78d247e
--- /dev/null
+++ b/drivers/infiniband/hw/amso1100/c2_alloc.c
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/bitmap.h>
+
+#include "c2.h"
+
+static int c2_alloc_mqsp_chunk(struct c2_dev *c2dev, gfp_t gfp_mask,
+			       struct sp_chunk **head)
+{
+	int i;
+	struct sp_chunk *new_head;
+	dma_addr_t dma_addr;
+
+	new_head = dma_alloc_coherent(&c2dev->pcidev->dev, PAGE_SIZE,
+				      &dma_addr, gfp_mask);
+	if (new_head == NULL)
+		return -ENOMEM;
+
+	new_head->dma_addr = dma_addr;
+	dma_unmap_addr_set(new_head, mapping, new_head->dma_addr);
+
+	new_head->next = NULL;
+	new_head->head = 0;
+
+	/* build list where each index is the next free slot */
+	for (i = 0;
+	     i < (PAGE_SIZE - sizeof(struct sp_chunk) -
+		  sizeof(u16)) / sizeof(u16) - 1;
+	     i++) {
+		new_head->shared_ptr[i] = i + 1;
+	}
+	/* terminate list */
+	new_head->shared_ptr[i] = 0xFFFF;
+
+	*head = new_head;
+	return 0;
+}
+
+int c2_init_mqsp_pool(struct c2_dev *c2dev, gfp_t gfp_mask,
+		      struct sp_chunk **root)
+{
+	return c2_alloc_mqsp_chunk(c2dev, gfp_mask, root);
+}
+
+void c2_free_mqsp_pool(struct c2_dev *c2dev, struct sp_chunk *root)
+{
+	struct sp_chunk *next;
+
+	while (root) {
+		next = root->next;
+		dma_free_coherent(&c2dev->pcidev->dev, PAGE_SIZE, root,
+				  dma_unmap_addr(root, mapping));
+		root = next;
+	}
+}
+
+__be16 *c2_alloc_mqsp(struct c2_dev *c2dev, struct sp_chunk *head,
+		      dma_addr_t *dma_addr, gfp_t gfp_mask)
+{
+	u16 mqsp;
+
+	while (head) {
+		mqsp = head->head;
+		if (mqsp != 0xFFFF) {
+			head->head = head->shared_ptr[mqsp];
+			break;
+		} else if (head->next == NULL) {
+			if (c2_alloc_mqsp_chunk(c2dev, gfp_mask, &head->next) ==
+			    0) {
+				head = head->next;
+				mqsp = head->head;
+				head->head = head->shared_ptr[mqsp];
+				break;
+			} else
+				return NULL;
+		} else
+			head = head->next;
+	}
+	if (head) {
+		*dma_addr = head->dma_addr +
+			    ((unsigned long) &(head->shared_ptr[mqsp]) -
+			     (unsigned long) head);
+		pr_debug("%s addr %p dma_addr %llx\n", __func__,
+			 &(head->shared_ptr[mqsp]), (unsigned long long) *dma_addr);
+		return (__force __be16 *) &(head->shared_ptr[mqsp]);
+	}
+	return NULL;
+}
+
+void c2_free_mqsp(__be16 *mqsp)
+{
+	struct sp_chunk *head;
+	u16 idx;
+
+	/* The chunk containing this ptr begins at the page boundary */
+	head = (struct sp_chunk *) ((unsigned long) mqsp & PAGE_MASK);
+
+	/* Link head to new mqsp */
+	*mqsp = (__force __be16) head->head;
+
+	/* Compute the shared_ptr index */
+	idx = ((unsigned long) mqsp & ~PAGE_MASK) >> 1;
+	idx -= (unsigned long) &(((struct sp_chunk *) 0)->shared_ptr[0]) >> 1;
+
+	/* Point this index at the head */
+	head->shared_ptr[idx] = head->head;
+
+	/* Point head at this index */
+	head->head = idx;
+}
diff --git a/drivers/infiniband/hw/amso1100/c2_cm.c b/drivers/infiniband/hw/amso1100/c2_cm.c
new file mode 100644
index 0000000..23bfa94
--- /dev/null
+++ b/drivers/infiniband/hw/amso1100/c2_cm.c
@@ -0,0 +1,461 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc.  All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/slab.h>
+
+#include "c2.h"
+#include "c2_wr.h"
+#include "c2_vq.h"
+#include <rdma/iw_cm.h>
+
+int c2_llp_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param)
+{
+	struct c2_dev *c2dev = to_c2dev(cm_id->device);
+	struct ib_qp *ibqp;
+	struct c2_qp *qp;
+	struct c2wr_qp_connect_req *wr;	/* variable size needs a malloc. */
+	struct c2_vq_req *vq_req;
+	int err;
+	struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_id->remote_addr;
+
+	if (cm_id->remote_addr.ss_family != AF_INET)
+		return -ENOSYS;
+
+	ibqp = c2_get_qp(cm_id->device, iw_param->qpn);
+	if (!ibqp)
+		return -EINVAL;
+	qp = to_c2qp(ibqp);
+
+	/* Associate QP <--> CM_ID */
+	cm_id->provider_data = qp;
+	cm_id->add_ref(cm_id);
+	qp->cm_id = cm_id;
+
+	/*
+	 * only support the max private_data length
+	 */
+	if (iw_param->private_data_len > C2_MAX_PRIVATE_DATA_SIZE) {
+		err = -EINVAL;
+		goto bail0;
+	}
+	/*
+	 * Set the rdma read limits
+	 */
+	err = c2_qp_set_read_limits(c2dev, qp, iw_param->ord, iw_param->ird);
+	if (err)
+		goto bail0;
+
+	/*
+	 * Create and send a WR_QP_CONNECT...
+	 */
+	wr = kmalloc(c2dev->req_vq.msg_size, GFP_KERNEL);
+	if (!wr) {
+		err = -ENOMEM;
+		goto bail0;
+	}
+
+	vq_req = vq_req_alloc(c2dev);
+	if (!vq_req) {
+		err = -ENOMEM;
+		goto bail1;
+	}
+
+	c2_wr_set_id(wr, CCWR_QP_CONNECT);
+	wr->hdr.context = 0;
+	wr->rnic_handle = c2dev->adapter_handle;
+	wr->qp_handle = qp->adapter_handle;
+
+	wr->remote_addr = raddr->sin_addr.s_addr;
+	wr->remote_port = raddr->sin_port;
+
+	/*
+	 * Move any private data from the callers's buf into
+	 * the WR.
+	 */
+	if (iw_param->private_data) {
+		wr->private_data_length =
+			cpu_to_be32(iw_param->private_data_len);
+		memcpy(&wr->private_data[0], iw_param->private_data,
+		       iw_param->private_data_len);
+	} else
+		wr->private_data_length = 0;
+
+	/*
+	 * Send WR to adapter.  NOTE: There is no synch reply from
+	 * the adapter.
+	 */
+	err = vq_send_wr(c2dev, (union c2wr *) wr);
+	vq_req_free(c2dev, vq_req);
+
+ bail1:
+	kfree(wr);
+ bail0:
+	if (err) {
+		/*
+		 * If we fail, release reference on QP and
+		 * disassociate QP from CM_ID
+		 */
+		cm_id->provider_data = NULL;
+		qp->cm_id = NULL;
+		cm_id->rem_ref(cm_id);
+	}
+	return err;
+}
+
+int c2_llp_service_create(struct iw_cm_id *cm_id, int backlog)
+{
+	struct c2_dev *c2dev;
+	struct c2wr_ep_listen_create_req wr;
+	struct c2wr_ep_listen_create_rep *reply;
+	struct c2_vq_req *vq_req;
+	int err;
+	struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_id->local_addr;
+
+	if (cm_id->local_addr.ss_family != AF_INET)
+		return -ENOSYS;
+
+	c2dev = to_c2dev(cm_id->device);
+	if (c2dev == NULL)
+		return -EINVAL;
+
+	/*
+	 * Allocate verbs request.
+	 */
+	vq_req = vq_req_alloc(c2dev);
+	if (!vq_req)
+		return -ENOMEM;
+
+	/*
+	 * Build the WR
+	 */
+	c2_wr_set_id(&wr, CCWR_EP_LISTEN_CREATE);
+	wr.hdr.context = (u64) (unsigned long) vq_req;
+	wr.rnic_handle = c2dev->adapter_handle;
+	wr.local_addr = laddr->sin_addr.s_addr;
+	wr.local_port = laddr->sin_port;
+	wr.backlog = cpu_to_be32(backlog);
+	wr.user_context = (u64) (unsigned long) cm_id;
+
+	/*
+	 * Reference the request struct.  Dereferenced in the int handler.
+	 */
+	vq_req_get(c2dev, vq_req);
+
+	/*
+	 * Send WR to adapter
+	 */
+	err = vq_send_wr(c2dev, (union c2wr *) & wr);
+	if (err) {
+		vq_req_put(c2dev, vq_req);
+		goto bail0;
+	}
+
+	/*
+	 * Wait for reply from adapter
+	 */
+	err = vq_wait_for_reply(c2dev, vq_req);
+	if (err)
+		goto bail0;
+
+	/*
+	 * Process reply
+	 */
+	reply =
+	    (struct c2wr_ep_listen_create_rep *) (unsigned long) vq_req->reply_msg;
+	if (!reply) {
+		err = -ENOMEM;
+		goto bail1;
+	}
+
+	if ((err = c2_errno(reply)) != 0)
+		goto bail1;
+
+	/*
+	 * Keep the adapter handle. Used in subsequent destroy
+	 */
+	cm_id->provider_data = (void*)(unsigned long) reply->ep_handle;
+
+	/*
+	 * free vq stuff
+	 */
+	vq_repbuf_free(c2dev, reply);
+	vq_req_free(c2dev, vq_req);
+
+	return 0;
+
+ bail1:
+	vq_repbuf_free(c2dev, reply);
+ bail0:
+	vq_req_free(c2dev, vq_req);
+	return err;
+}
+
+
+int c2_llp_service_destroy(struct iw_cm_id *cm_id)
+{
+
+	struct c2_dev *c2dev;
+	struct c2wr_ep_listen_destroy_req wr;
+	struct c2wr_ep_listen_destroy_rep *reply;
+	struct c2_vq_req *vq_req;
+	int err;
+
+	c2dev = to_c2dev(cm_id->device);
+	if (c2dev == NULL)
+		return -EINVAL;
+
+	/*
+	 * Allocate verbs request.
+	 */
+	vq_req = vq_req_alloc(c2dev);
+	if (!vq_req)
+		return -ENOMEM;
+
+	/*
+	 * Build the WR
+	 */
+	c2_wr_set_id(&wr, CCWR_EP_LISTEN_DESTROY);
+	wr.hdr.context = (unsigned long) vq_req;
+	wr.rnic_handle = c2dev->adapter_handle;
+	wr.ep_handle = (u32)(unsigned long)cm_id->provider_data;
+
+	/*
+	 * reference the request struct.  dereferenced in the int handler.
+	 */
+	vq_req_get(c2dev, vq_req);
+
+	/*
+	 * Send WR to adapter
+	 */
+	err = vq_send_wr(c2dev, (union c2wr *) & wr);
+	if (err) {
+		vq_req_put(c2dev, vq_req);
+		goto bail0;
+	}
+
+	/*
+	 * Wait for reply from adapter
+	 */
+	err = vq_wait_for_reply(c2dev, vq_req);
+	if (err)
+		goto bail0;
+
+	/*
+	 * Process reply
+	 */
+	reply=(struct c2wr_ep_listen_destroy_rep *)(unsigned long)vq_req->reply_msg;
+	if (!reply) {
+		err = -ENOMEM;
+		goto bail0;
+	}
+	if ((err = c2_errno(reply)) != 0)
+		goto bail1;
+
+ bail1:
+	vq_repbuf_free(c2dev, reply);
+ bail0:
+	vq_req_free(c2dev, vq_req);
+	return err;
+}
+
+int c2_llp_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param)
+{
+	struct c2_dev *c2dev = to_c2dev(cm_id->device);
+	struct c2_qp *qp;
+	struct ib_qp *ibqp;
+	struct c2wr_cr_accept_req *wr;	/* variable length WR */
+	struct c2_vq_req *vq_req;
+	struct c2wr_cr_accept_rep *reply;	/* VQ Reply msg ptr. */
+	int err;
+
+	ibqp = c2_get_qp(cm_id->device, iw_param->qpn);
+	if (!ibqp)
+		return -EINVAL;
+	qp = to_c2qp(ibqp);
+
+	/* Set the RDMA read limits */
+	err = c2_qp_set_read_limits(c2dev, qp, iw_param->ord, iw_param->ird);
+	if (err)
+		goto bail0;
+
+	/* Allocate verbs request. */
+	vq_req = vq_req_alloc(c2dev);
+	if (!vq_req) {
+		err = -ENOMEM;
+		goto bail0;
+	}
+	vq_req->qp = qp;
+	vq_req->cm_id = cm_id;
+	vq_req->event = IW_CM_EVENT_ESTABLISHED;
+
+	wr = kmalloc(c2dev->req_vq.msg_size, GFP_KERNEL);
+	if (!wr) {
+		err = -ENOMEM;
+		goto bail1;
+	}
+
+	/* Build the WR */
+	c2_wr_set_id(wr, CCWR_CR_ACCEPT);
+	wr->hdr.context = (unsigned long) vq_req;
+	wr->rnic_handle = c2dev->adapter_handle;
+	wr->ep_handle = (u32) (unsigned long) cm_id->provider_data;
+	wr->qp_handle = qp->adapter_handle;
+
+	/* Replace the cr_handle with the QP after accept */
+	cm_id->provider_data = qp;
+	cm_id->add_ref(cm_id);
+	qp->cm_id = cm_id;
+
+	cm_id->provider_data = qp;
+
+	/* Validate private_data length */
+	if (iw_param->private_data_len > C2_MAX_PRIVATE_DATA_SIZE) {
+		err = -EINVAL;
+		goto bail1;
+	}
+
+	if (iw_param->private_data) {
+		wr->private_data_length = cpu_to_be32(iw_param->private_data_len);
+		memcpy(&wr->private_data[0],
+		       iw_param->private_data, iw_param->private_data_len);
+	} else
+		wr->private_data_length = 0;
+
+	/* Reference the request struct.  Dereferenced in the int handler. */
+	vq_req_get(c2dev, vq_req);
+
+	/* Send WR to adapter */
+	err = vq_send_wr(c2dev, (union c2wr *) wr);
+	if (err) {
+		vq_req_put(c2dev, vq_req);
+		goto bail1;
+	}
+
+	/* Wait for reply from adapter */
+	err = vq_wait_for_reply(c2dev, vq_req);
+	if (err)
+		goto bail1;
+
+	/* Check that reply is present */
+	reply = (struct c2wr_cr_accept_rep *) (unsigned long) vq_req->reply_msg;
+	if (!reply) {
+		err = -ENOMEM;
+		goto bail1;
+	}
+
+	err = c2_errno(reply);
+	vq_repbuf_free(c2dev, reply);
+
+	if (!err)
+		c2_set_qp_state(qp, C2_QP_STATE_RTS);
+ bail1:
+	kfree(wr);
+	vq_req_free(c2dev, vq_req);
+ bail0:
+	if (err) {
+		/*
+		 * If we fail, release reference on QP and
+		 * disassociate QP from CM_ID
+		 */
+		cm_id->provider_data = NULL;
+		qp->cm_id = NULL;
+		cm_id->rem_ref(cm_id);
+	}
+	return err;
+}
+
+int c2_llp_reject(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len)
+{
+	struct c2_dev *c2dev;
+	struct c2wr_cr_reject_req wr;
+	struct c2_vq_req *vq_req;
+	struct c2wr_cr_reject_rep *reply;
+	int err;
+
+	c2dev = to_c2dev(cm_id->device);
+
+	/*
+	 * Allocate verbs request.
+	 */
+	vq_req = vq_req_alloc(c2dev);
+	if (!vq_req)
+		return -ENOMEM;
+
+	/*
+	 * Build the WR
+	 */
+	c2_wr_set_id(&wr, CCWR_CR_REJECT);
+	wr.hdr.context = (unsigned long) vq_req;
+	wr.rnic_handle = c2dev->adapter_handle;
+	wr.ep_handle = (u32) (unsigned long) cm_id->provider_data;
+
+	/*
+	 * reference the request struct.  dereferenced in the int handler.
+	 */
+	vq_req_get(c2dev, vq_req);
+
+	/*
+	 * Send WR to adapter
+	 */
+	err = vq_send_wr(c2dev, (union c2wr *) & wr);
+	if (err) {
+		vq_req_put(c2dev, vq_req);
+		goto bail0;
+	}
+
+	/*
+	 * Wait for reply from adapter
+	 */
+	err = vq_wait_for_reply(c2dev, vq_req);
+	if (err)
+		goto bail0;
+
+	/*
+	 * Process reply
+	 */
+	reply = (struct c2wr_cr_reject_rep *) (unsigned long)
+		vq_req->reply_msg;
+	if (!reply) {
+		err = -ENOMEM;
+		goto bail0;
+	}
+	err = c2_errno(reply);
+	/*
+	 * free vq stuff
+	 */
+	vq_repbuf_free(c2dev, reply);
+
+ bail0:
+	vq_req_free(c2dev, vq_req);
+	return err;
+}
diff --git a/drivers/infiniband/hw/amso1100/c2_cq.c b/drivers/infiniband/hw/amso1100/c2_cq.c
new file mode 100644
index 0000000..1b63185
--- /dev/null
+++ b/drivers/infiniband/hw/amso1100/c2_cq.c
@@ -0,0 +1,440 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/gfp.h>
+
+#include "c2.h"
+#include "c2_vq.h"
+#include "c2_status.h"
+
+#define C2_CQ_MSG_SIZE ((sizeof(struct c2wr_ce) + 32-1) & ~(32-1))
+
+static struct c2_cq *c2_cq_get(struct c2_dev *c2dev, int cqn)
+{
+	struct c2_cq *cq;
+	unsigned long flags;
+
+	spin_lock_irqsave(&c2dev->lock, flags);
+	cq = c2dev->qptr_array[cqn];
+	if (!cq) {
+		spin_unlock_irqrestore(&c2dev->lock, flags);
+		return NULL;
+	}
+	atomic_inc(&cq->refcount);
+	spin_unlock_irqrestore(&c2dev->lock, flags);
+	return cq;
+}
+
+static void c2_cq_put(struct c2_cq *cq)
+{
+	if (atomic_dec_and_test(&cq->refcount))
+		wake_up(&cq->wait);
+}
+
+void c2_cq_event(struct c2_dev *c2dev, u32 mq_index)
+{
+	struct c2_cq *cq;
+
+	cq = c2_cq_get(c2dev, mq_index);
+	if (!cq) {
+		printk("discarding events on destroyed CQN=%d\n", mq_index);
+		return;
+	}
+
+	(*cq->ibcq.comp_handler) (&cq->ibcq, cq->ibcq.cq_context);
+	c2_cq_put(cq);
+}
+
+void c2_cq_clean(struct c2_dev *c2dev, struct c2_qp *qp, u32 mq_index)
+{
+	struct c2_cq *cq;
+	struct c2_mq *q;
+
+	cq = c2_cq_get(c2dev, mq_index);
+	if (!cq)
+		return;
+
+	spin_lock_irq(&cq->lock);
+	q = &cq->mq;
+	if (q && !c2_mq_empty(q)) {
+		u16 priv = q->priv;
+		struct c2wr_ce *msg;
+
+		while (priv != be16_to_cpu(*q->shared)) {
+			msg = (struct c2wr_ce *)
+				(q->msg_pool.host + priv * q->msg_size);
+			if (msg->qp_user_context == (u64) (unsigned long) qp) {
+				msg->qp_user_context = (u64) 0;
+			}
+			priv = (priv + 1) % q->q_size;
+		}
+	}
+	spin_unlock_irq(&cq->lock);
+	c2_cq_put(cq);
+}
+
+static inline enum ib_wc_status c2_cqe_status_to_openib(u8 status)
+{
+	switch (status) {
+	case C2_OK:
+		return IB_WC_SUCCESS;
+	case CCERR_FLUSHED:
+		return IB_WC_WR_FLUSH_ERR;
+	case CCERR_BASE_AND_BOUNDS_VIOLATION:
+		return IB_WC_LOC_PROT_ERR;
+	case CCERR_ACCESS_VIOLATION:
+		return IB_WC_LOC_ACCESS_ERR;
+	case CCERR_TOTAL_LENGTH_TOO_BIG:
+		return IB_WC_LOC_LEN_ERR;
+	case CCERR_INVALID_WINDOW:
+		return IB_WC_MW_BIND_ERR;
+	default:
+		return IB_WC_GENERAL_ERR;
+	}
+}
+
+
+static inline int c2_poll_one(struct c2_dev *c2dev,
+			      struct c2_cq *cq, struct ib_wc *entry)
+{
+	struct c2wr_ce *ce;
+	struct c2_qp *qp;
+	int is_recv = 0;
+
+	ce = c2_mq_consume(&cq->mq);
+	if (!ce) {
+		return -EAGAIN;
+	}
+
+	/*
+	 * if the qp returned is null then this qp has already
+	 * been freed and we are unable process the completion.
+	 * try pulling the next message
+	 */
+	while ((qp =
+		(struct c2_qp *) (unsigned long) ce->qp_user_context) == NULL) {
+		c2_mq_free(&cq->mq);
+		ce = c2_mq_consume(&cq->mq);
+		if (!ce)
+			return -EAGAIN;
+	}
+
+	entry->status = c2_cqe_status_to_openib(c2_wr_get_result(ce));
+	entry->wr_id = ce->hdr.context;
+	entry->qp = &qp->ibqp;
+	entry->wc_flags = 0;
+	entry->slid = 0;
+	entry->sl = 0;
+	entry->src_qp = 0;
+	entry->dlid_path_bits = 0;
+	entry->pkey_index = 0;
+
+	switch (c2_wr_get_id(ce)) {
+	case C2_WR_TYPE_SEND:
+		entry->opcode = IB_WC_SEND;
+		break;
+	case C2_WR_TYPE_RDMA_WRITE:
+		entry->opcode = IB_WC_RDMA_WRITE;
+		break;
+	case C2_WR_TYPE_RDMA_READ:
+		entry->opcode = IB_WC_RDMA_READ;
+		break;
+	case C2_WR_TYPE_BIND_MW:
+		entry->opcode = IB_WC_BIND_MW;
+		break;
+	case C2_WR_TYPE_RECV:
+		entry->byte_len = be32_to_cpu(ce->bytes_rcvd);
+		entry->opcode = IB_WC_RECV;
+		is_recv = 1;
+		break;
+	default:
+		break;
+	}
+
+	/* consume the WQEs */
+	if (is_recv)
+		c2_mq_lconsume(&qp->rq_mq, 1);
+	else
+		c2_mq_lconsume(&qp->sq_mq,
+			       be32_to_cpu(c2_wr_get_wqe_count(ce)) + 1);
+
+	/* free the message */
+	c2_mq_free(&cq->mq);
+
+	return 0;
+}
+
+int c2_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry)
+{
+	struct c2_dev *c2dev = to_c2dev(ibcq->device);
+	struct c2_cq *cq = to_c2cq(ibcq);
+	unsigned long flags;
+	int npolled, err;
+
+	spin_lock_irqsave(&cq->lock, flags);
+
+	for (npolled = 0; npolled < num_entries; ++npolled) {
+
+		err = c2_poll_one(c2dev, cq, entry + npolled);
+		if (err)
+			break;
+	}
+
+	spin_unlock_irqrestore(&cq->lock, flags);
+
+	return npolled;
+}
+
+int c2_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags)
+{
+	struct c2_mq_shared __iomem *shared;
+	struct c2_cq *cq;
+	unsigned long flags;
+	int ret = 0;
+
+	cq = to_c2cq(ibcq);
+	shared = cq->mq.peer;
+
+	if ((notify_flags & IB_CQ_SOLICITED_MASK) == IB_CQ_NEXT_COMP)
+		writeb(C2_CQ_NOTIFICATION_TYPE_NEXT, &shared->notification_type);
+	else if ((notify_flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED)
+		writeb(C2_CQ_NOTIFICATION_TYPE_NEXT_SE, &shared->notification_type);
+	else
+		return -EINVAL;
+
+	writeb(CQ_WAIT_FOR_DMA | CQ_ARMED, &shared->armed);
+
+	/*
+	 * Now read back shared->armed to make the PCI
+	 * write synchronous.  This is necessary for
+	 * correct cq notification semantics.
+	 */
+	readb(&shared->armed);
+
+	if (notify_flags & IB_CQ_REPORT_MISSED_EVENTS) {
+		spin_lock_irqsave(&cq->lock, flags);
+		ret = !c2_mq_empty(&cq->mq);
+		spin_unlock_irqrestore(&cq->lock, flags);
+	}
+
+	return ret;
+}
+
+static void c2_free_cq_buf(struct c2_dev *c2dev, struct c2_mq *mq)
+{
+	dma_free_coherent(&c2dev->pcidev->dev, mq->q_size * mq->msg_size,
+			  mq->msg_pool.host, dma_unmap_addr(mq, mapping));
+}
+
+static int c2_alloc_cq_buf(struct c2_dev *c2dev, struct c2_mq *mq,
+			   size_t q_size, size_t msg_size)
+{
+	u8 *pool_start;
+
+	if (q_size > SIZE_MAX / msg_size)
+		return -EINVAL;
+
+	pool_start = dma_alloc_coherent(&c2dev->pcidev->dev, q_size * msg_size,
+					&mq->host_dma, GFP_KERNEL);
+	if (!pool_start)
+		return -ENOMEM;
+
+	c2_mq_rep_init(mq,
+		       0,		/* index (currently unknown) */
+		       q_size,
+		       msg_size,
+		       pool_start,
+		       NULL,	/* peer (currently unknown) */
+		       C2_MQ_HOST_TARGET);
+
+	dma_unmap_addr_set(mq, mapping, mq->host_dma);
+
+	return 0;
+}
+
+int c2_init_cq(struct c2_dev *c2dev, int entries,
+	       struct c2_ucontext *ctx, struct c2_cq *cq)
+{
+	struct c2wr_cq_create_req wr;
+	struct c2wr_cq_create_rep *reply;
+	unsigned long peer_pa;
+	struct c2_vq_req *vq_req;
+	int err;
+
+	might_sleep();
+
+	cq->ibcq.cqe = entries - 1;
+	cq->is_kernel = !ctx;
+
+	/* Allocate a shared pointer */
+	cq->mq.shared = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool,
+				      &cq->mq.shared_dma, GFP_KERNEL);
+	if (!cq->mq.shared)
+		return -ENOMEM;
+
+	/* Allocate pages for the message pool */
+	err = c2_alloc_cq_buf(c2dev, &cq->mq, entries + 1, C2_CQ_MSG_SIZE);
+	if (err)
+		goto bail0;
+
+	vq_req = vq_req_alloc(c2dev);
+	if (!vq_req) {
+		err = -ENOMEM;
+		goto bail1;
+	}
+
+	memset(&wr, 0, sizeof(wr));
+	c2_wr_set_id(&wr, CCWR_CQ_CREATE);
+	wr.hdr.context = (unsigned long) vq_req;
+	wr.rnic_handle = c2dev->adapter_handle;
+	wr.msg_size = cpu_to_be32(cq->mq.msg_size);
+	wr.depth = cpu_to_be32(cq->mq.q_size);
+	wr.shared_ht = cpu_to_be64(cq->mq.shared_dma);
+	wr.msg_pool = cpu_to_be64(cq->mq.host_dma);
+	wr.user_context = (u64) (unsigned long) (cq);
+
+	vq_req_get(c2dev, vq_req);
+
+	err = vq_send_wr(c2dev, (union c2wr *) & wr);
+	if (err) {
+		vq_req_put(c2dev, vq_req);
+		goto bail2;
+	}
+
+	err = vq_wait_for_reply(c2dev, vq_req);
+	if (err)
+		goto bail2;
+
+	reply = (struct c2wr_cq_create_rep *) (unsigned long) (vq_req->reply_msg);
+	if (!reply) {
+		err = -ENOMEM;
+		goto bail2;
+	}
+
+	if ((err = c2_errno(reply)) != 0)
+		goto bail3;
+
+	cq->adapter_handle = reply->cq_handle;
+	cq->mq.index = be32_to_cpu(reply->mq_index);
+
+	peer_pa = c2dev->pa + be32_to_cpu(reply->adapter_shared);
+	cq->mq.peer = ioremap_nocache(peer_pa, PAGE_SIZE);
+	if (!cq->mq.peer) {
+		err = -ENOMEM;
+		goto bail3;
+	}
+
+	vq_repbuf_free(c2dev, reply);
+	vq_req_free(c2dev, vq_req);
+
+	spin_lock_init(&cq->lock);
+	atomic_set(&cq->refcount, 1);
+	init_waitqueue_head(&cq->wait);
+
+	/*
+	 * Use the MQ index allocated by the adapter to
+	 * store the CQ in the qptr_array
+	 */
+	cq->cqn = cq->mq.index;
+	c2dev->qptr_array[cq->cqn] = cq;
+
+	return 0;
+
+      bail3:
+	vq_repbuf_free(c2dev, reply);
+      bail2:
+	vq_req_free(c2dev, vq_req);
+      bail1:
+	c2_free_cq_buf(c2dev, &cq->mq);
+      bail0:
+	c2_free_mqsp(cq->mq.shared);
+
+	return err;
+}
+
+void c2_free_cq(struct c2_dev *c2dev, struct c2_cq *cq)
+{
+	int err;
+	struct c2_vq_req *vq_req;
+	struct c2wr_cq_destroy_req wr;
+	struct c2wr_cq_destroy_rep *reply;
+
+	might_sleep();
+
+	/* Clear CQ from the qptr array */
+	spin_lock_irq(&c2dev->lock);
+	c2dev->qptr_array[cq->mq.index] = NULL;
+	atomic_dec(&cq->refcount);
+	spin_unlock_irq(&c2dev->lock);
+
+	wait_event(cq->wait, !atomic_read(&cq->refcount));
+
+	vq_req = vq_req_alloc(c2dev);
+	if (!vq_req) {
+		goto bail0;
+	}
+
+	memset(&wr, 0, sizeof(wr));
+	c2_wr_set_id(&wr, CCWR_CQ_DESTROY);
+	wr.hdr.context = (unsigned long) vq_req;
+	wr.rnic_handle = c2dev->adapter_handle;
+	wr.cq_handle = cq->adapter_handle;
+
+	vq_req_get(c2dev, vq_req);
+
+	err = vq_send_wr(c2dev, (union c2wr *) & wr);
+	if (err) {
+		vq_req_put(c2dev, vq_req);
+		goto bail1;
+	}
+
+	err = vq_wait_for_reply(c2dev, vq_req);
+	if (err)
+		goto bail1;
+
+	reply = (struct c2wr_cq_destroy_rep *) (unsigned long) (vq_req->reply_msg);
+	if (reply)
+		vq_repbuf_free(c2dev, reply);
+      bail1:
+	vq_req_free(c2dev, vq_req);
+      bail0:
+	if (cq->is_kernel) {
+		c2_free_cq_buf(c2dev, &cq->mq);
+	}
+
+	return;
+}
diff --git a/drivers/infiniband/hw/amso1100/c2_intr.c b/drivers/infiniband/hw/amso1100/c2_intr.c
new file mode 100644
index 0000000..3a17d9b
--- /dev/null
+++ b/drivers/infiniband/hw/amso1100/c2_intr.c
@@ -0,0 +1,219 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "c2.h"
+#include <rdma/iw_cm.h>
+#include "c2_vq.h"
+
+static void handle_mq(struct c2_dev *c2dev, u32 index);
+static void handle_vq(struct c2_dev *c2dev, u32 mq_index);
+
+/*
+ * Handle RNIC interrupts
+ */
+void c2_rnic_interrupt(struct c2_dev *c2dev)
+{
+	unsigned int mq_index;
+
+	while (c2dev->hints_read != be16_to_cpu(*c2dev->hint_count)) {
+		mq_index = readl(c2dev->regs + PCI_BAR0_HOST_HINT);
+		if (mq_index & 0x80000000) {
+			break;
+		}
+
+		c2dev->hints_read++;
+		handle_mq(c2dev, mq_index);
+	}
+
+}
+
+/*
+ * Top level MQ handler
+ */
+static void handle_mq(struct c2_dev *c2dev, u32 mq_index)
+{
+	if (c2dev->qptr_array[mq_index] == NULL) {
+		pr_debug("handle_mq: stray activity for mq_index=%d\n",
+			 mq_index);
+		return;
+	}
+
+	switch (mq_index) {
+	case (0):
+		/*
+		 * An index of 0 in the activity queue
+		 * indicates the req vq now has messages
+		 * available...
+		 *
+		 * Wake up any waiters waiting on req VQ
+		 * message availability.
+		 */
+		wake_up(&c2dev->req_vq_wo);
+		break;
+	case (1):
+		handle_vq(c2dev, mq_index);
+		break;
+	case (2):
+		/* We have to purge the VQ in case there are pending
+		 * accept reply requests that would result in the
+		 * generation of an ESTABLISHED event. If we don't
+		 * generate these first, a CLOSE event could end up
+		 * being delivered before the ESTABLISHED event.
+		 */
+		handle_vq(c2dev, 1);
+
+		c2_ae_event(c2dev, mq_index);
+		break;
+	default:
+		/* There is no event synchronization between CQ events
+		 * and AE or CM events. In fact, CQE could be
+		 * delivered for all of the I/O up to and including the
+		 * FLUSH for a peer disconenct prior to the ESTABLISHED
+		 * event being delivered to the app. The reason for this
+		 * is that CM events are delivered on a thread, while AE
+		 * and CM events are delivered on interrupt context.
+		 */
+		c2_cq_event(c2dev, mq_index);
+		break;
+	}
+
+	return;
+}
+
+/*
+ * Handles verbs WR replies.
+ */
+static void handle_vq(struct c2_dev *c2dev, u32 mq_index)
+{
+	void *adapter_msg, *reply_msg;
+	struct c2wr_hdr *host_msg;
+	struct c2wr_hdr tmp;
+	struct c2_mq *reply_vq;
+	struct c2_vq_req *req;
+	struct iw_cm_event cm_event;
+	int err;
+
+	reply_vq = (struct c2_mq *) c2dev->qptr_array[mq_index];
+
+	/*
+	 * get next msg from mq_index into adapter_msg.
+	 * don't free it yet.
+	 */
+	adapter_msg = c2_mq_consume(reply_vq);
+	if (adapter_msg == NULL) {
+		return;
+	}
+
+	host_msg = vq_repbuf_alloc(c2dev);
+
+	/*
+	 * If we can't get a host buffer, then we'll still
+	 * wakeup the waiter, we just won't give him the msg.
+	 * It is assumed the waiter will deal with this...
+	 */
+	if (!host_msg) {
+		pr_debug("handle_vq: no repbufs!\n");
+
+		/*
+		 * just copy the WR header into a local variable.
+		 * this allows us to still demux on the context
+		 */
+		host_msg = &tmp;
+		memcpy(host_msg, adapter_msg, sizeof(tmp));
+		reply_msg = NULL;
+	} else {
+		memcpy(host_msg, adapter_msg, reply_vq->msg_size);
+		reply_msg = host_msg;
+	}
+
+	/*
+	 * consume the msg from the MQ
+	 */
+	c2_mq_free(reply_vq);
+
+	/*
+	 * wakeup the waiter.
+	 */
+	req = (struct c2_vq_req *) (unsigned long) host_msg->context;
+	if (req == NULL) {
+		/*
+		 * We should never get here, as the adapter should
+		 * never send us a reply that we're not expecting.
+		 */
+		if (reply_msg != NULL)
+			vq_repbuf_free(c2dev, host_msg);
+		pr_debug("handle_vq: UNEXPECTEDLY got NULL req\n");
+		return;
+	}
+
+	if (reply_msg)
+		err = c2_errno(reply_msg);
+	else
+		err = -ENOMEM;
+
+	if (!err) switch (req->event) {
+	case IW_CM_EVENT_ESTABLISHED:
+		c2_set_qp_state(req->qp,
+				C2_QP_STATE_RTS);
+		/*
+		 * Until ird/ord negotiation via MPAv2 support is added, send
+		 * max supported values
+		 */
+		cm_event.ird = cm_event.ord = 128;
+	case IW_CM_EVENT_CLOSE:
+
+		/*
+		 * Move the QP to RTS if this is
+		 * the established event
+		 */
+		cm_event.event = req->event;
+		cm_event.status = 0;
+		cm_event.local_addr = req->cm_id->local_addr;
+		cm_event.remote_addr = req->cm_id->remote_addr;
+		cm_event.private_data = NULL;
+		cm_event.private_data_len = 0;
+		req->cm_id->event_handler(req->cm_id, &cm_event);
+		break;
+	default:
+		break;
+	}
+
+	req->reply_msg = (u64) (unsigned long) (reply_msg);
+	atomic_set(&req->reply_ready, 1);
+	wake_up(&req->wait_object);
+
+	/*
+	 * If the request was cancelled, then this put will
+	 * free the vq_req memory...and reply_msg!!!
+	 */
+	vq_req_put(c2dev, req);
+}
diff --git a/drivers/infiniband/hw/amso1100/c2_mm.c b/drivers/infiniband/hw/amso1100/c2_mm.c
new file mode 100644
index 0000000..119c4f3
--- /dev/null
+++ b/drivers/infiniband/hw/amso1100/c2_mm.c
@@ -0,0 +1,377 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/slab.h>
+
+#include "c2.h"
+#include "c2_vq.h"
+
+#define PBL_VIRT 1
+#define PBL_PHYS 2
+
+/*
+ * Send all the PBL messages to convey the remainder of the PBL
+ * Wait for the adapter's reply on the last one.
+ * This is indicated by setting the MEM_PBL_COMPLETE in the flags.
+ *
+ * NOTE:  vq_req is _not_ freed by this function.  The VQ Host
+ *	  Reply buffer _is_ freed by this function.
+ */
+static int
+send_pbl_messages(struct c2_dev *c2dev, __be32 stag_index,
+		  unsigned long va, u32 pbl_depth,
+		  struct c2_vq_req *vq_req, int pbl_type)
+{
+	u32 pbe_count;		/* amt that fits in a PBL msg */
+	u32 count;		/* amt in this PBL MSG. */
+	struct c2wr_nsmr_pbl_req *wr;	/* PBL WR ptr */
+	struct c2wr_nsmr_pbl_rep *reply;	/* reply ptr */
+ 	int err, pbl_virt, pbl_index, i;
+
+	switch (pbl_type) {
+	case PBL_VIRT:
+		pbl_virt = 1;
+		break;
+	case PBL_PHYS:
+		pbl_virt = 0;
+		break;
+	default:
+		return -EINVAL;
+		break;
+	}
+
+	pbe_count = (c2dev->req_vq.msg_size -
+		     sizeof(struct c2wr_nsmr_pbl_req)) / sizeof(u64);
+	wr = kmalloc(c2dev->req_vq.msg_size, GFP_KERNEL);
+	if (!wr) {
+		return -ENOMEM;
+	}
+	c2_wr_set_id(wr, CCWR_NSMR_PBL);
+
+	/*
+	 * Only the last PBL message will generate a reply from the verbs,
+	 * so we set the context to 0 indicating there is no kernel verbs
+	 * handler blocked awaiting this reply.
+	 */
+	wr->hdr.context = 0;
+	wr->rnic_handle = c2dev->adapter_handle;
+	wr->stag_index = stag_index;	/* already swapped */
+	wr->flags = 0;
+	pbl_index = 0;
+	while (pbl_depth) {
+		count = min(pbe_count, pbl_depth);
+		wr->addrs_length = cpu_to_be32(count);
+
+		/*
+		 *  If this is the last message, then reference the
+		 *  vq request struct cuz we're gonna wait for a reply.
+		 *  also make this PBL msg as the last one.
+		 */
+		if (count == pbl_depth) {
+			/*
+			 * reference the request struct.  dereferenced in the
+			 * int handler.
+			 */
+			vq_req_get(c2dev, vq_req);
+			wr->flags = cpu_to_be32(MEM_PBL_COMPLETE);
+
+			/*
+			 * This is the last PBL message.
+			 * Set the context to our VQ Request Object so we can
+			 * wait for the reply.
+			 */
+			wr->hdr.context = (unsigned long) vq_req;
+		}
+
+		/*
+		 * If pbl_virt is set then va is a virtual address
+		 * that describes a virtually contiguous memory
+		 * allocation. The wr needs the start of each virtual page
+		 * to be converted to the corresponding physical address
+		 * of the page. If pbl_virt is not set then va is an array
+		 * of physical addresses and there is no conversion to do.
+		 * Just fill in the wr with what is in the array.
+		 */
+		for (i = 0; i < count; i++) {
+			if (pbl_virt) {
+				va += PAGE_SIZE;
+			} else {
+ 				wr->paddrs[i] =
+				    cpu_to_be64(((u64 *)va)[pbl_index + i]);
+			}
+		}
+
+		/*
+		 * Send WR to adapter
+		 */
+		err = vq_send_wr(c2dev, (union c2wr *) wr);
+		if (err) {
+			if (count <= pbe_count) {
+				vq_req_put(c2dev, vq_req);
+			}
+			goto bail0;
+		}
+		pbl_depth -= count;
+		pbl_index += count;
+	}
+
+	/*
+	 *  Now wait for the reply...
+	 */
+	err = vq_wait_for_reply(c2dev, vq_req);
+	if (err) {
+		goto bail0;
+	}
+
+	/*
+	 * Process reply
+	 */
+	reply = (struct c2wr_nsmr_pbl_rep *) (unsigned long) vq_req->reply_msg;
+	if (!reply) {
+		err = -ENOMEM;
+		goto bail0;
+	}
+
+	err = c2_errno(reply);
+
+	vq_repbuf_free(c2dev, reply);
+      bail0:
+	kfree(wr);
+	return err;
+}
+
+#define C2_PBL_MAX_DEPTH 131072
+int
+c2_nsmr_register_phys_kern(struct c2_dev *c2dev, u64 *addr_list,
+ 			   int page_size, int pbl_depth, u32 length,
+ 			   u32 offset, u64 *va, enum c2_acf acf,
+			   struct c2_mr *mr)
+{
+	struct c2_vq_req *vq_req;
+	struct c2wr_nsmr_register_req *wr;
+	struct c2wr_nsmr_register_rep *reply;
+	u16 flags;
+	int i, pbe_count, count;
+	int err;
+
+	if (!va || !length || !addr_list || !pbl_depth)
+		return -EINTR;
+
+	/*
+	 * Verify PBL depth is within rnic max
+	 */
+	if (pbl_depth > C2_PBL_MAX_DEPTH) {
+		return -EINTR;
+	}
+
+	/*
+	 * allocate verbs request object
+	 */
+	vq_req = vq_req_alloc(c2dev);
+	if (!vq_req)
+		return -ENOMEM;
+
+	wr = kmalloc(c2dev->req_vq.msg_size, GFP_KERNEL);
+	if (!wr) {
+		err = -ENOMEM;
+		goto bail0;
+	}
+
+	/*
+	 * build the WR
+	 */
+	c2_wr_set_id(wr, CCWR_NSMR_REGISTER);
+	wr->hdr.context = (unsigned long) vq_req;
+	wr->rnic_handle = c2dev->adapter_handle;
+
+	flags = (acf | MEM_VA_BASED | MEM_REMOTE);
+
+	/*
+	 * compute how many pbes can fit in the message
+	 */
+	pbe_count = (c2dev->req_vq.msg_size -
+		     sizeof(struct c2wr_nsmr_register_req)) / sizeof(u64);
+
+	if (pbl_depth <= pbe_count) {
+		flags |= MEM_PBL_COMPLETE;
+	}
+	wr->flags = cpu_to_be16(flags);
+	wr->stag_key = 0;	//stag_key;
+	wr->va = cpu_to_be64(*va);
+	wr->pd_id = mr->pd->pd_id;
+	wr->pbe_size = cpu_to_be32(page_size);
+	wr->length = cpu_to_be32(length);
+	wr->pbl_depth = cpu_to_be32(pbl_depth);
+	wr->fbo = cpu_to_be32(offset);
+	count = min(pbl_depth, pbe_count);
+	wr->addrs_length = cpu_to_be32(count);
+
+	/*
+	 * fill out the PBL for this message
+	 */
+	for (i = 0; i < count; i++) {
+		wr->paddrs[i] = cpu_to_be64(addr_list[i]);
+	}
+
+	/*
+	 * regerence the request struct
+	 */
+	vq_req_get(c2dev, vq_req);
+
+	/*
+	 * send the WR to the adapter
+	 */
+	err = vq_send_wr(c2dev, (union c2wr *) wr);
+	if (err) {
+		vq_req_put(c2dev, vq_req);
+		goto bail1;
+	}
+
+	/*
+	 * wait for reply from adapter
+	 */
+	err = vq_wait_for_reply(c2dev, vq_req);
+	if (err) {
+		goto bail1;
+	}
+
+	/*
+	 * process reply
+	 */
+	reply =
+	    (struct c2wr_nsmr_register_rep *) (unsigned long) (vq_req->reply_msg);
+	if (!reply) {
+		err = -ENOMEM;
+		goto bail1;
+	}
+	if ((err = c2_errno(reply))) {
+		goto bail2;
+	}
+	//*p_pb_entries = be32_to_cpu(reply->pbl_depth);
+	mr->ibmr.lkey = mr->ibmr.rkey = be32_to_cpu(reply->stag_index);
+	vq_repbuf_free(c2dev, reply);
+
+	/*
+	 * if there are still more PBEs we need to send them to
+	 * the adapter and wait for a reply on the final one.
+	 * reuse vq_req for this purpose.
+	 */
+	pbl_depth -= count;
+	if (pbl_depth) {
+
+		vq_req->reply_msg = (unsigned long) NULL;
+		atomic_set(&vq_req->reply_ready, 0);
+		err = send_pbl_messages(c2dev,
+					cpu_to_be32(mr->ibmr.lkey),
+					(unsigned long) &addr_list[i],
+					pbl_depth, vq_req, PBL_PHYS);
+		if (err) {
+			goto bail1;
+		}
+	}
+
+	vq_req_free(c2dev, vq_req);
+	kfree(wr);
+
+	return err;
+
+      bail2:
+	vq_repbuf_free(c2dev, reply);
+      bail1:
+	kfree(wr);
+      bail0:
+	vq_req_free(c2dev, vq_req);
+	return err;
+}
+
+int c2_stag_dealloc(struct c2_dev *c2dev, u32 stag_index)
+{
+	struct c2_vq_req *vq_req;	/* verbs request object */
+	struct c2wr_stag_dealloc_req wr;	/* work request */
+	struct c2wr_stag_dealloc_rep *reply;	/* WR reply  */
+	int err;
+
+
+	/*
+	 * allocate verbs request object
+	 */
+	vq_req = vq_req_alloc(c2dev);
+	if (!vq_req) {
+		return -ENOMEM;
+	}
+
+	/*
+	 * Build the WR
+	 */
+	c2_wr_set_id(&wr, CCWR_STAG_DEALLOC);
+	wr.hdr.context = (u64) (unsigned long) vq_req;
+	wr.rnic_handle = c2dev->adapter_handle;
+	wr.stag_index = cpu_to_be32(stag_index);
+
+	/*
+	 * reference the request struct.  dereferenced in the int handler.
+	 */
+	vq_req_get(c2dev, vq_req);
+
+	/*
+	 * Send WR to adapter
+	 */
+	err = vq_send_wr(c2dev, (union c2wr *) & wr);
+	if (err) {
+		vq_req_put(c2dev, vq_req);
+		goto bail0;
+	}
+
+	/*
+	 * Wait for reply from adapter
+	 */
+	err = vq_wait_for_reply(c2dev, vq_req);
+	if (err) {
+		goto bail0;
+	}
+
+	/*
+	 * Process reply
+	 */
+	reply = (struct c2wr_stag_dealloc_rep *) (unsigned long) vq_req->reply_msg;
+	if (!reply) {
+		err = -ENOMEM;
+		goto bail0;
+	}
+
+	err = c2_errno(reply);
+
+	vq_repbuf_free(c2dev, reply);
+      bail0:
+	vq_req_free(c2dev, vq_req);
+	return err;
+}
diff --git a/drivers/infiniband/hw/amso1100/c2_mq.c b/drivers/infiniband/hw/amso1100/c2_mq.c
new file mode 100644
index 0000000..0cddc49
--- /dev/null
+++ b/drivers/infiniband/hw/amso1100/c2_mq.c
@@ -0,0 +1,174 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "c2.h"
+#include "c2_mq.h"
+
+void *c2_mq_alloc(struct c2_mq *q)
+{
+	BUG_ON(q->magic != C2_MQ_MAGIC);
+	BUG_ON(q->type != C2_MQ_ADAPTER_TARGET);
+
+	if (c2_mq_full(q)) {
+		return NULL;
+	} else {
+#ifdef DEBUG
+		struct c2wr_hdr *m =
+		    (struct c2wr_hdr *) (q->msg_pool.host + q->priv * q->msg_size);
+#ifdef CCMSGMAGIC
+		BUG_ON(m->magic != be32_to_cpu(~CCWR_MAGIC));
+		m->magic = cpu_to_be32(CCWR_MAGIC);
+#endif
+		return m;
+#else
+		return q->msg_pool.host + q->priv * q->msg_size;
+#endif
+	}
+}
+
+void c2_mq_produce(struct c2_mq *q)
+{
+	BUG_ON(q->magic != C2_MQ_MAGIC);
+	BUG_ON(q->type != C2_MQ_ADAPTER_TARGET);
+
+	if (!c2_mq_full(q)) {
+		q->priv = (q->priv + 1) % q->q_size;
+		q->hint_count++;
+		/* Update peer's offset. */
+		__raw_writew((__force u16) cpu_to_be16(q->priv), &q->peer->shared);
+	}
+}
+
+void *c2_mq_consume(struct c2_mq *q)
+{
+	BUG_ON(q->magic != C2_MQ_MAGIC);
+	BUG_ON(q->type != C2_MQ_HOST_TARGET);
+
+	if (c2_mq_empty(q)) {
+		return NULL;
+	} else {
+#ifdef DEBUG
+		struct c2wr_hdr *m = (struct c2wr_hdr *)
+		    (q->msg_pool.host + q->priv * q->msg_size);
+#ifdef CCMSGMAGIC
+		BUG_ON(m->magic != be32_to_cpu(CCWR_MAGIC));
+#endif
+		return m;
+#else
+		return q->msg_pool.host + q->priv * q->msg_size;
+#endif
+	}
+}
+
+void c2_mq_free(struct c2_mq *q)
+{
+	BUG_ON(q->magic != C2_MQ_MAGIC);
+	BUG_ON(q->type != C2_MQ_HOST_TARGET);
+
+	if (!c2_mq_empty(q)) {
+
+#ifdef CCMSGMAGIC
+		{
+			struct c2wr_hdr __iomem *m = (struct c2wr_hdr __iomem *)
+			    (q->msg_pool.adapter + q->priv * q->msg_size);
+			__raw_writel(cpu_to_be32(~CCWR_MAGIC), &m->magic);
+		}
+#endif
+		q->priv = (q->priv + 1) % q->q_size;
+		/* Update peer's offset. */
+		__raw_writew((__force u16) cpu_to_be16(q->priv), &q->peer->shared);
+	}
+}
+
+
+void c2_mq_lconsume(struct c2_mq *q, u32 wqe_count)
+{
+	BUG_ON(q->magic != C2_MQ_MAGIC);
+	BUG_ON(q->type != C2_MQ_ADAPTER_TARGET);
+
+	while (wqe_count--) {
+		BUG_ON(c2_mq_empty(q));
+		*q->shared = cpu_to_be16((be16_to_cpu(*q->shared)+1) % q->q_size);
+	}
+}
+
+#if 0
+u32 c2_mq_count(struct c2_mq *q)
+{
+	s32 count;
+
+	if (q->type == C2_MQ_HOST_TARGET)
+		count = be16_to_cpu(*q->shared) - q->priv;
+	else
+		count = q->priv - be16_to_cpu(*q->shared);
+
+	if (count < 0)
+		count += q->q_size;
+
+	return (u32) count;
+}
+#endif  /*  0  */
+
+void c2_mq_req_init(struct c2_mq *q, u32 index, u32 q_size, u32 msg_size,
+		    u8 __iomem *pool_start, u16 __iomem *peer, u32 type)
+{
+	BUG_ON(!q->shared);
+
+	/* This code assumes the byte swapping has already been done! */
+	q->index = index;
+	q->q_size = q_size;
+	q->msg_size = msg_size;
+	q->msg_pool.adapter = pool_start;
+	q->peer = (struct c2_mq_shared __iomem *) peer;
+	q->magic = C2_MQ_MAGIC;
+	q->type = type;
+	q->priv = 0;
+	q->hint_count = 0;
+	return;
+}
+void c2_mq_rep_init(struct c2_mq *q, u32 index, u32 q_size, u32 msg_size,
+		    u8 *pool_start, u16 __iomem *peer, u32 type)
+{
+	BUG_ON(!q->shared);
+
+	/* This code assumes the byte swapping has already been done! */
+	q->index = index;
+	q->q_size = q_size;
+	q->msg_size = msg_size;
+	q->msg_pool.host = pool_start;
+	q->peer = (struct c2_mq_shared __iomem *) peer;
+	q->magic = C2_MQ_MAGIC;
+	q->type = type;
+	q->priv = 0;
+	q->hint_count = 0;
+	return;
+}
diff --git a/drivers/infiniband/hw/amso1100/c2_mq.h b/drivers/infiniband/hw/amso1100/c2_mq.h
new file mode 100644
index 0000000..fc1b9a7
--- /dev/null
+++ b/drivers/infiniband/hw/amso1100/c2_mq.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _C2_MQ_H_
+#define _C2_MQ_H_
+#include <linux/kernel.h>
+#include <linux/dma-mapping.h>
+#include "c2_wr.h"
+
+enum c2_shared_regs {
+
+	C2_SHARED_ARMED = 0x10,
+	C2_SHARED_NOTIFY = 0x18,
+	C2_SHARED_SHARED = 0x40,
+};
+
+struct c2_mq_shared {
+	u16 unused1;
+	u8 armed;
+	u8 notification_type;
+	u32 unused2;
+	u16 shared;
+	/* Pad to 64 bytes. */
+	u8 pad[64 - sizeof(u16) - 2 * sizeof(u8) - sizeof(u32) - sizeof(u16)];
+};
+
+enum c2_mq_type {
+	C2_MQ_HOST_TARGET = 1,
+	C2_MQ_ADAPTER_TARGET = 2,
+};
+
+/*
+ * c2_mq_t is for kernel-mode MQs like the VQs Cand the AEQ.
+ * c2_user_mq_t (which is the same format) is for user-mode MQs...
+ */
+#define C2_MQ_MAGIC 0x4d512020	/* 'MQ  ' */
+struct c2_mq {
+	u32 magic;
+	union {
+		u8 *host;
+		u8 __iomem *adapter;
+	} msg_pool;
+	dma_addr_t host_dma;
+	DEFINE_DMA_UNMAP_ADDR(mapping);
+	u16 hint_count;
+	u16 priv;
+	struct c2_mq_shared __iomem *peer;
+	__be16 *shared;
+	dma_addr_t shared_dma;
+	u32 q_size;
+	u32 msg_size;
+	u32 index;
+	enum c2_mq_type type;
+};
+
+static __inline__ int c2_mq_empty(struct c2_mq *q)
+{
+	return q->priv == be16_to_cpu(*q->shared);
+}
+
+static __inline__ int c2_mq_full(struct c2_mq *q)
+{
+	return q->priv == (be16_to_cpu(*q->shared) + q->q_size - 1) % q->q_size;
+}
+
+extern void c2_mq_lconsume(struct c2_mq *q, u32 wqe_count);
+extern void *c2_mq_alloc(struct c2_mq *q);
+extern void c2_mq_produce(struct c2_mq *q);
+extern void *c2_mq_consume(struct c2_mq *q);
+extern void c2_mq_free(struct c2_mq *q);
+extern void c2_mq_req_init(struct c2_mq *q, u32 index, u32 q_size, u32 msg_size,
+		       u8 __iomem *pool_start, u16 __iomem *peer, u32 type);
+extern void c2_mq_rep_init(struct c2_mq *q, u32 index, u32 q_size, u32 msg_size,
+			   u8 *pool_start, u16 __iomem *peer, u32 type);
+
+#endif				/* _C2_MQ_H_ */
diff --git a/drivers/infiniband/hw/amso1100/c2_pd.c b/drivers/infiniband/hw/amso1100/c2_pd.c
new file mode 100644
index 0000000..f3e81dc
--- /dev/null
+++ b/drivers/infiniband/hw/amso1100/c2_pd.c
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Cisco Systems.  All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/errno.h>
+
+#include "c2.h"
+#include "c2_provider.h"
+
+int c2_pd_alloc(struct c2_dev *c2dev, int privileged, struct c2_pd *pd)
+{
+	u32 obj;
+	int ret = 0;
+
+	spin_lock(&c2dev->pd_table.lock);
+	obj = find_next_zero_bit(c2dev->pd_table.table, c2dev->pd_table.max,
+				 c2dev->pd_table.last);
+	if (obj >= c2dev->pd_table.max)
+		obj = find_first_zero_bit(c2dev->pd_table.table,
+					  c2dev->pd_table.max);
+	if (obj < c2dev->pd_table.max) {
+		pd->pd_id = obj;
+		__set_bit(obj, c2dev->pd_table.table);
+		c2dev->pd_table.last = obj+1;
+		if (c2dev->pd_table.last >= c2dev->pd_table.max)
+			c2dev->pd_table.last = 0;
+	} else
+		ret = -ENOMEM;
+	spin_unlock(&c2dev->pd_table.lock);
+	return ret;
+}
+
+void c2_pd_free(struct c2_dev *c2dev, struct c2_pd *pd)
+{
+	spin_lock(&c2dev->pd_table.lock);
+	__clear_bit(pd->pd_id, c2dev->pd_table.table);
+	spin_unlock(&c2dev->pd_table.lock);
+}
+
+int c2_init_pd_table(struct c2_dev *c2dev)
+{
+
+	c2dev->pd_table.last = 0;
+	c2dev->pd_table.max = c2dev->props.max_pd;
+	spin_lock_init(&c2dev->pd_table.lock);
+	c2dev->pd_table.table = kmalloc(BITS_TO_LONGS(c2dev->props.max_pd) *
+					sizeof(long), GFP_KERNEL);
+	if (!c2dev->pd_table.table)
+		return -ENOMEM;
+	bitmap_zero(c2dev->pd_table.table, c2dev->props.max_pd);
+	return 0;
+}
+
+void c2_cleanup_pd_table(struct c2_dev *c2dev)
+{
+	kfree(c2dev->pd_table.table);
+}
diff --git a/drivers/infiniband/hw/amso1100/c2_provider.c b/drivers/infiniband/hw/amso1100/c2_provider.c
new file mode 100644
index 0000000..2d5cbf4
--- /dev/null
+++ b/drivers/infiniband/hw/amso1100/c2_provider.c
@@ -0,0 +1,882 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/pci.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/delay.h>
+#include <linux/ethtool.h>
+#include <linux/mii.h>
+#include <linux/if_vlan.h>
+#include <linux/crc32.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/init.h>
+#include <linux/dma-mapping.h>
+#include <linux/if_arp.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+
+#include <asm/io.h>
+#include <asm/irq.h>
+#include <asm/byteorder.h>
+
+#include <rdma/ib_smi.h>
+#include <rdma/ib_umem.h>
+#include <rdma/ib_user_verbs.h>
+#include "c2.h"
+#include "c2_provider.h"
+#include "c2_user.h"
+
+static int c2_query_device(struct ib_device *ibdev,
+			   struct ib_device_attr *props)
+{
+	struct c2_dev *c2dev = to_c2dev(ibdev);
+
+	pr_debug("%s:%u\n", __func__, __LINE__);
+
+	*props = c2dev->props;
+	return 0;
+}
+
+static int c2_query_port(struct ib_device *ibdev,
+			 u8 port, struct ib_port_attr *props)
+{
+	pr_debug("%s:%u\n", __func__, __LINE__);
+
+	props->max_mtu = IB_MTU_4096;
+	props->lid = 0;
+	props->lmc = 0;
+	props->sm_lid = 0;
+	props->sm_sl = 0;
+	props->state = IB_PORT_ACTIVE;
+	props->phys_state = 0;
+	props->port_cap_flags =
+	    IB_PORT_CM_SUP |
+	    IB_PORT_REINIT_SUP |
+	    IB_PORT_VENDOR_CLASS_SUP | IB_PORT_BOOT_MGMT_SUP;
+	props->gid_tbl_len = 1;
+	props->pkey_tbl_len = 1;
+	props->qkey_viol_cntr = 0;
+	props->active_width = 1;
+	props->active_speed = IB_SPEED_SDR;
+
+	return 0;
+}
+
+static int c2_query_pkey(struct ib_device *ibdev,
+			 u8 port, u16 index, u16 * pkey)
+{
+	pr_debug("%s:%u\n", __func__, __LINE__);
+	*pkey = 0;
+	return 0;
+}
+
+static int c2_query_gid(struct ib_device *ibdev, u8 port,
+			int index, union ib_gid *gid)
+{
+	struct c2_dev *c2dev = to_c2dev(ibdev);
+
+	pr_debug("%s:%u\n", __func__, __LINE__);
+	memset(&(gid->raw[0]), 0, sizeof(gid->raw));
+	memcpy(&(gid->raw[0]), c2dev->pseudo_netdev->dev_addr, 6);
+
+	return 0;
+}
+
+/* Allocate the user context data structure. This keeps track
+ * of all objects associated with a particular user-mode client.
+ */
+static struct ib_ucontext *c2_alloc_ucontext(struct ib_device *ibdev,
+					     struct ib_udata *udata)
+{
+	struct c2_ucontext *context;
+
+	pr_debug("%s:%u\n", __func__, __LINE__);
+	context = kmalloc(sizeof(*context), GFP_KERNEL);
+	if (!context)
+		return ERR_PTR(-ENOMEM);
+
+	return &context->ibucontext;
+}
+
+static int c2_dealloc_ucontext(struct ib_ucontext *context)
+{
+	pr_debug("%s:%u\n", __func__, __LINE__);
+	kfree(context);
+	return 0;
+}
+
+static int c2_mmap_uar(struct ib_ucontext *context, struct vm_area_struct *vma)
+{
+	pr_debug("%s:%u\n", __func__, __LINE__);
+	return -ENOSYS;
+}
+
+static struct ib_pd *c2_alloc_pd(struct ib_device *ibdev,
+				 struct ib_ucontext *context,
+				 struct ib_udata *udata)
+{
+	struct c2_pd *pd;
+	int err;
+
+	pr_debug("%s:%u\n", __func__, __LINE__);
+
+	pd = kmalloc(sizeof(*pd), GFP_KERNEL);
+	if (!pd)
+		return ERR_PTR(-ENOMEM);
+
+	err = c2_pd_alloc(to_c2dev(ibdev), !context, pd);
+	if (err) {
+		kfree(pd);
+		return ERR_PTR(err);
+	}
+
+	if (context) {
+		if (ib_copy_to_udata(udata, &pd->pd_id, sizeof(__u32))) {
+			c2_pd_free(to_c2dev(ibdev), pd);
+			kfree(pd);
+			return ERR_PTR(-EFAULT);
+		}
+	}
+
+	return &pd->ibpd;
+}
+
+static int c2_dealloc_pd(struct ib_pd *pd)
+{
+	pr_debug("%s:%u\n", __func__, __LINE__);
+	c2_pd_free(to_c2dev(pd->device), to_c2pd(pd));
+	kfree(pd);
+
+	return 0;
+}
+
+static struct ib_ah *c2_ah_create(struct ib_pd *pd, struct ib_ah_attr *ah_attr)
+{
+	pr_debug("%s:%u\n", __func__, __LINE__);
+	return ERR_PTR(-ENOSYS);
+}
+
+static int c2_ah_destroy(struct ib_ah *ah)
+{
+	pr_debug("%s:%u\n", __func__, __LINE__);
+	return -ENOSYS;
+}
+
+static void c2_add_ref(struct ib_qp *ibqp)
+{
+	struct c2_qp *qp;
+	BUG_ON(!ibqp);
+	qp = to_c2qp(ibqp);
+	atomic_inc(&qp->refcount);
+}
+
+static void c2_rem_ref(struct ib_qp *ibqp)
+{
+	struct c2_qp *qp;
+	BUG_ON(!ibqp);
+	qp = to_c2qp(ibqp);
+	if (atomic_dec_and_test(&qp->refcount))
+		wake_up(&qp->wait);
+}
+
+struct ib_qp *c2_get_qp(struct ib_device *device, int qpn)
+{
+	struct c2_dev* c2dev = to_c2dev(device);
+	struct c2_qp *qp;
+
+	qp = c2_find_qpn(c2dev, qpn);
+	pr_debug("%s Returning QP=%p for QPN=%d, device=%p, refcount=%d\n",
+		__func__, qp, qpn, device,
+		(qp?atomic_read(&qp->refcount):0));
+
+	return (qp?&qp->ibqp:NULL);
+}
+
+static struct ib_qp *c2_create_qp(struct ib_pd *pd,
+				  struct ib_qp_init_attr *init_attr,
+				  struct ib_udata *udata)
+{
+	struct c2_qp *qp;
+	int err;
+
+	pr_debug("%s:%u\n", __func__, __LINE__);
+
+	if (init_attr->create_flags)
+		return ERR_PTR(-EINVAL);
+
+	switch (init_attr->qp_type) {
+	case IB_QPT_RC:
+		qp = kzalloc(sizeof(*qp), GFP_KERNEL);
+		if (!qp) {
+			pr_debug("%s: Unable to allocate QP\n", __func__);
+			return ERR_PTR(-ENOMEM);
+		}
+		spin_lock_init(&qp->lock);
+		if (pd->uobject) {
+			/* userspace specific */
+		}
+
+		err = c2_alloc_qp(to_c2dev(pd->device),
+				  to_c2pd(pd), init_attr, qp);
+
+		if (err && pd->uobject) {
+			/* userspace specific */
+		}
+
+		break;
+	default:
+		pr_debug("%s: Invalid QP type: %d\n", __func__,
+			init_attr->qp_type);
+		return ERR_PTR(-EINVAL);
+	}
+
+	if (err) {
+		kfree(qp);
+		return ERR_PTR(err);
+	}
+
+	return &qp->ibqp;
+}
+
+static int c2_destroy_qp(struct ib_qp *ib_qp)
+{
+	struct c2_qp *qp = to_c2qp(ib_qp);
+
+	pr_debug("%s:%u qp=%p,qp->state=%d\n",
+		__func__, __LINE__, ib_qp, qp->state);
+	c2_free_qp(to_c2dev(ib_qp->device), qp);
+	kfree(qp);
+	return 0;
+}
+
+static struct ib_cq *c2_create_cq(struct ib_device *ibdev, int entries, int vector,
+				  struct ib_ucontext *context,
+				  struct ib_udata *udata)
+{
+	struct c2_cq *cq;
+	int err;
+
+	cq = kmalloc(sizeof(*cq), GFP_KERNEL);
+	if (!cq) {
+		pr_debug("%s: Unable to allocate CQ\n", __func__);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	err = c2_init_cq(to_c2dev(ibdev), entries, NULL, cq);
+	if (err) {
+		pr_debug("%s: error initializing CQ\n", __func__);
+		kfree(cq);
+		return ERR_PTR(err);
+	}
+
+	return &cq->ibcq;
+}
+
+static int c2_destroy_cq(struct ib_cq *ib_cq)
+{
+	struct c2_cq *cq = to_c2cq(ib_cq);
+
+	pr_debug("%s:%u\n", __func__, __LINE__);
+
+	c2_free_cq(to_c2dev(ib_cq->device), cq);
+	kfree(cq);
+
+	return 0;
+}
+
+static inline u32 c2_convert_access(int acc)
+{
+	return (acc & IB_ACCESS_REMOTE_WRITE ? C2_ACF_REMOTE_WRITE : 0) |
+	    (acc & IB_ACCESS_REMOTE_READ ? C2_ACF_REMOTE_READ : 0) |
+	    (acc & IB_ACCESS_LOCAL_WRITE ? C2_ACF_LOCAL_WRITE : 0) |
+	    C2_ACF_LOCAL_READ | C2_ACF_WINDOW_BIND;
+}
+
+static struct ib_mr *c2_reg_phys_mr(struct ib_pd *ib_pd,
+				    struct ib_phys_buf *buffer_list,
+				    int num_phys_buf, int acc, u64 * iova_start)
+{
+	struct c2_mr *mr;
+	u64 *page_list;
+	u32 total_len;
+	int err, i, j, k, page_shift, pbl_depth;
+
+	pbl_depth = 0;
+	total_len = 0;
+
+	page_shift = PAGE_SHIFT;
+	/*
+	 * If there is only 1 buffer we assume this could
+	 * be a map of all phy mem...use a 32k page_shift.
+	 */
+	if (num_phys_buf == 1)
+		page_shift += 3;
+
+	for (i = 0; i < num_phys_buf; i++) {
+
+		if (buffer_list[i].addr & ~PAGE_MASK) {
+			pr_debug("Unaligned Memory Buffer: 0x%x\n",
+				(unsigned int) buffer_list[i].addr);
+			return ERR_PTR(-EINVAL);
+		}
+
+		if (!buffer_list[i].size) {
+			pr_debug("Invalid Buffer Size\n");
+			return ERR_PTR(-EINVAL);
+		}
+
+		total_len += buffer_list[i].size;
+		pbl_depth += ALIGN(buffer_list[i].size,
+				   (1 << page_shift)) >> page_shift;
+	}
+
+	page_list = vmalloc(sizeof(u64) * pbl_depth);
+	if (!page_list) {
+		pr_debug("couldn't vmalloc page_list of size %zd\n",
+			(sizeof(u64) * pbl_depth));
+		return ERR_PTR(-ENOMEM);
+	}
+
+	for (i = 0, j = 0; i < num_phys_buf; i++) {
+
+		int naddrs;
+
+ 		naddrs = ALIGN(buffer_list[i].size,
+			       (1 << page_shift)) >> page_shift;
+		for (k = 0; k < naddrs; k++)
+			page_list[j++] = (buffer_list[i].addr +
+						     (k << page_shift));
+	}
+
+	mr = kmalloc(sizeof(*mr), GFP_KERNEL);
+	if (!mr) {
+		vfree(page_list);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	mr->pd = to_c2pd(ib_pd);
+	mr->umem = NULL;
+	pr_debug("%s - page shift %d, pbl_depth %d, total_len %u, "
+		"*iova_start %llx, first pa %llx, last pa %llx\n",
+		__func__, page_shift, pbl_depth, total_len,
+		(unsigned long long) *iova_start,
+	       	(unsigned long long) page_list[0],
+	       	(unsigned long long) page_list[pbl_depth-1]);
+  	err = c2_nsmr_register_phys_kern(to_c2dev(ib_pd->device), page_list,
+ 					 (1 << page_shift), pbl_depth,
+					 total_len, 0, iova_start,
+					 c2_convert_access(acc), mr);
+	vfree(page_list);
+	if (err) {
+		kfree(mr);
+		return ERR_PTR(err);
+	}
+
+	return &mr->ibmr;
+}
+
+static struct ib_mr *c2_get_dma_mr(struct ib_pd *pd, int acc)
+{
+	struct ib_phys_buf bl;
+	u64 kva = 0;
+
+	pr_debug("%s:%u\n", __func__, __LINE__);
+
+	/* AMSO1100 limit */
+	bl.size = 0xffffffff;
+	bl.addr = 0;
+	return c2_reg_phys_mr(pd, &bl, 1, acc, &kva);
+}
+
+static struct ib_mr *c2_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+				    u64 virt, int acc, struct ib_udata *udata)
+{
+	u64 *pages;
+	u64 kva = 0;
+	int shift, n, len;
+	int i, k, entry;
+	int err = 0;
+	struct scatterlist *sg;
+	struct c2_pd *c2pd = to_c2pd(pd);
+	struct c2_mr *c2mr;
+
+	pr_debug("%s:%u\n", __func__, __LINE__);
+
+	c2mr = kmalloc(sizeof(*c2mr), GFP_KERNEL);
+	if (!c2mr)
+		return ERR_PTR(-ENOMEM);
+	c2mr->pd = c2pd;
+
+	c2mr->umem = ib_umem_get(pd->uobject->context, start, length, acc, 0);
+	if (IS_ERR(c2mr->umem)) {
+		err = PTR_ERR(c2mr->umem);
+		kfree(c2mr);
+		return ERR_PTR(err);
+	}
+
+	shift = ffs(c2mr->umem->page_size) - 1;
+	n = c2mr->umem->nmap;
+
+	pages = kmalloc(n * sizeof(u64), GFP_KERNEL);
+	if (!pages) {
+		err = -ENOMEM;
+		goto err;
+	}
+
+	i = 0;
+	for_each_sg(c2mr->umem->sg_head.sgl, sg, c2mr->umem->nmap, entry) {
+		len = sg_dma_len(sg) >> shift;
+		for (k = 0; k < len; ++k) {
+			pages[i++] =
+				sg_dma_address(sg) +
+				(c2mr->umem->page_size * k);
+		}
+	}
+
+	kva = virt;
+  	err = c2_nsmr_register_phys_kern(to_c2dev(pd->device),
+					 pages,
+					 c2mr->umem->page_size,
+					 i,
+					 length,
+					 c2mr->umem->offset,
+					 &kva,
+					 c2_convert_access(acc),
+					 c2mr);
+	kfree(pages);
+	if (err)
+		goto err;
+	return &c2mr->ibmr;
+
+err:
+	ib_umem_release(c2mr->umem);
+	kfree(c2mr);
+	return ERR_PTR(err);
+}
+
+static int c2_dereg_mr(struct ib_mr *ib_mr)
+{
+	struct c2_mr *mr = to_c2mr(ib_mr);
+	int err;
+
+	pr_debug("%s:%u\n", __func__, __LINE__);
+
+	err = c2_stag_dealloc(to_c2dev(ib_mr->device), ib_mr->lkey);
+	if (err)
+		pr_debug("c2_stag_dealloc failed: %d\n", err);
+	else {
+		if (mr->umem)
+			ib_umem_release(mr->umem);
+		kfree(mr);
+	}
+
+	return err;
+}
+
+static ssize_t show_rev(struct device *dev, struct device_attribute *attr,
+			char *buf)
+{
+	struct c2_dev *c2dev = container_of(dev, struct c2_dev, ibdev.dev);
+	pr_debug("%s:%u\n", __func__, __LINE__);
+	return sprintf(buf, "%x\n", c2dev->props.hw_ver);
+}
+
+static ssize_t show_fw_ver(struct device *dev, struct device_attribute *attr,
+			   char *buf)
+{
+	struct c2_dev *c2dev = container_of(dev, struct c2_dev, ibdev.dev);
+	pr_debug("%s:%u\n", __func__, __LINE__);
+	return sprintf(buf, "%x.%x.%x\n",
+		       (int) (c2dev->props.fw_ver >> 32),
+		       (int) (c2dev->props.fw_ver >> 16) & 0xffff,
+		       (int) (c2dev->props.fw_ver & 0xffff));
+}
+
+static ssize_t show_hca(struct device *dev, struct device_attribute *attr,
+			char *buf)
+{
+	pr_debug("%s:%u\n", __func__, __LINE__);
+	return sprintf(buf, "AMSO1100\n");
+}
+
+static ssize_t show_board(struct device *dev, struct device_attribute *attr,
+			  char *buf)
+{
+	pr_debug("%s:%u\n", __func__, __LINE__);
+	return sprintf(buf, "%.*s\n", 32, "AMSO1100 Board ID");
+}
+
+static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
+static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL);
+static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL);
+static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL);
+
+static struct device_attribute *c2_dev_attributes[] = {
+	&dev_attr_hw_rev,
+	&dev_attr_fw_ver,
+	&dev_attr_hca_type,
+	&dev_attr_board_id
+};
+
+static int c2_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+			int attr_mask, struct ib_udata *udata)
+{
+	int err;
+
+	err =
+	    c2_qp_modify(to_c2dev(ibqp->device), to_c2qp(ibqp), attr,
+			 attr_mask);
+
+	return err;
+}
+
+static int c2_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+	pr_debug("%s:%u\n", __func__, __LINE__);
+	return -ENOSYS;
+}
+
+static int c2_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+	pr_debug("%s:%u\n", __func__, __LINE__);
+	return -ENOSYS;
+}
+
+static int c2_process_mad(struct ib_device *ibdev,
+			  int mad_flags,
+			  u8 port_num,
+			  struct ib_wc *in_wc,
+			  struct ib_grh *in_grh,
+			  struct ib_mad *in_mad, struct ib_mad *out_mad)
+{
+	pr_debug("%s:%u\n", __func__, __LINE__);
+	return -ENOSYS;
+}
+
+static int c2_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param)
+{
+	pr_debug("%s:%u\n", __func__, __LINE__);
+
+	/* Request a connection */
+	return c2_llp_connect(cm_id, iw_param);
+}
+
+static int c2_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param)
+{
+	pr_debug("%s:%u\n", __func__, __LINE__);
+
+	/* Accept the new connection */
+	return c2_llp_accept(cm_id, iw_param);
+}
+
+static int c2_reject(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len)
+{
+	int err;
+
+	pr_debug("%s:%u\n", __func__, __LINE__);
+
+	err = c2_llp_reject(cm_id, pdata, pdata_len);
+	return err;
+}
+
+static int c2_service_create(struct iw_cm_id *cm_id, int backlog)
+{
+	int err;
+
+	pr_debug("%s:%u\n", __func__, __LINE__);
+	err = c2_llp_service_create(cm_id, backlog);
+	pr_debug("%s:%u err=%d\n",
+		__func__, __LINE__,
+		err);
+	return err;
+}
+
+static int c2_service_destroy(struct iw_cm_id *cm_id)
+{
+	int err;
+	pr_debug("%s:%u\n", __func__, __LINE__);
+
+	err = c2_llp_service_destroy(cm_id);
+
+	return err;
+}
+
+static int c2_pseudo_up(struct net_device *netdev)
+{
+	struct in_device *ind;
+	struct c2_dev *c2dev = netdev->ml_priv;
+
+	ind = in_dev_get(netdev);
+	if (!ind)
+		return 0;
+
+	pr_debug("adding...\n");
+	for_ifa(ind) {
+#ifdef DEBUG
+		u8 *ip = (u8 *) & ifa->ifa_address;
+
+		pr_debug("%s: %d.%d.%d.%d\n",
+		       ifa->ifa_label, ip[0], ip[1], ip[2], ip[3]);
+#endif
+		c2_add_addr(c2dev, ifa->ifa_address, ifa->ifa_mask);
+	}
+	endfor_ifa(ind);
+	in_dev_put(ind);
+
+	return 0;
+}
+
+static int c2_pseudo_down(struct net_device *netdev)
+{
+	struct in_device *ind;
+	struct c2_dev *c2dev = netdev->ml_priv;
+
+	ind = in_dev_get(netdev);
+	if (!ind)
+		return 0;
+
+	pr_debug("deleting...\n");
+	for_ifa(ind) {
+#ifdef DEBUG
+		u8 *ip = (u8 *) & ifa->ifa_address;
+
+		pr_debug("%s: %d.%d.%d.%d\n",
+		       ifa->ifa_label, ip[0], ip[1], ip[2], ip[3]);
+#endif
+		c2_del_addr(c2dev, ifa->ifa_address, ifa->ifa_mask);
+	}
+	endfor_ifa(ind);
+	in_dev_put(ind);
+
+	return 0;
+}
+
+static int c2_pseudo_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
+{
+	kfree_skb(skb);
+	return NETDEV_TX_OK;
+}
+
+static int c2_pseudo_change_mtu(struct net_device *netdev, int new_mtu)
+{
+	if (new_mtu < ETH_ZLEN || new_mtu > ETH_JUMBO_MTU)
+		return -EINVAL;
+
+	netdev->mtu = new_mtu;
+
+	/* TODO: Tell rnic about new rmda interface mtu */
+	return 0;
+}
+
+static const struct net_device_ops c2_pseudo_netdev_ops = {
+	.ndo_open 		= c2_pseudo_up,
+	.ndo_stop 		= c2_pseudo_down,
+	.ndo_start_xmit 	= c2_pseudo_xmit_frame,
+	.ndo_change_mtu 	= c2_pseudo_change_mtu,
+	.ndo_validate_addr	= eth_validate_addr,
+};
+
+static void setup(struct net_device *netdev)
+{
+	netdev->netdev_ops = &c2_pseudo_netdev_ops;
+
+	netdev->watchdog_timeo = 0;
+	netdev->type = ARPHRD_ETHER;
+	netdev->mtu = 1500;
+	netdev->hard_header_len = ETH_HLEN;
+	netdev->addr_len = ETH_ALEN;
+	netdev->tx_queue_len = 0;
+	netdev->flags |= IFF_NOARP;
+}
+
+static struct net_device *c2_pseudo_netdev_init(struct c2_dev *c2dev)
+{
+	char name[IFNAMSIZ];
+	struct net_device *netdev;
+
+	/* change ethxxx to iwxxx */
+	strcpy(name, "iw");
+	strcat(name, &c2dev->netdev->name[3]);
+	netdev = alloc_netdev(0, name, NET_NAME_UNKNOWN, setup);
+	if (!netdev) {
+		printk(KERN_ERR PFX "%s -  etherdev alloc failed",
+			__func__);
+		return NULL;
+	}
+
+	netdev->ml_priv = c2dev;
+
+	SET_NETDEV_DEV(netdev, &c2dev->pcidev->dev);
+
+	memcpy_fromio(netdev->dev_addr, c2dev->kva + C2_REGS_RDMA_ENADDR, 6);
+
+	/* Print out the MAC address */
+	pr_debug("%s: MAC %pM\n", netdev->name, netdev->dev_addr);
+
+#if 0
+	/* Disable network packets */
+	netif_stop_queue(netdev);
+#endif
+	return netdev;
+}
+
+int c2_register_device(struct c2_dev *dev)
+{
+	int ret = -ENOMEM;
+	int i;
+
+	/* Register pseudo network device */
+	dev->pseudo_netdev = c2_pseudo_netdev_init(dev);
+	if (!dev->pseudo_netdev)
+		goto out;
+
+	ret = register_netdev(dev->pseudo_netdev);
+	if (ret)
+		goto out_free_netdev;
+
+	pr_debug("%s:%u\n", __func__, __LINE__);
+	strlcpy(dev->ibdev.name, "amso%d", IB_DEVICE_NAME_MAX);
+	dev->ibdev.owner = THIS_MODULE;
+	dev->ibdev.uverbs_cmd_mask =
+	    (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) |
+	    (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) |
+	    (1ull << IB_USER_VERBS_CMD_QUERY_PORT) |
+	    (1ull << IB_USER_VERBS_CMD_ALLOC_PD) |
+	    (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) |
+	    (1ull << IB_USER_VERBS_CMD_REG_MR) |
+	    (1ull << IB_USER_VERBS_CMD_DEREG_MR) |
+	    (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
+	    (1ull << IB_USER_VERBS_CMD_CREATE_CQ) |
+	    (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) |
+	    (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) |
+	    (1ull << IB_USER_VERBS_CMD_CREATE_QP) |
+	    (1ull << IB_USER_VERBS_CMD_MODIFY_QP) |
+	    (1ull << IB_USER_VERBS_CMD_POLL_CQ) |
+	    (1ull << IB_USER_VERBS_CMD_DESTROY_QP) |
+	    (1ull << IB_USER_VERBS_CMD_POST_SEND) |
+	    (1ull << IB_USER_VERBS_CMD_POST_RECV);
+
+	dev->ibdev.node_type = RDMA_NODE_RNIC;
+	memset(&dev->ibdev.node_guid, 0, sizeof(dev->ibdev.node_guid));
+	memcpy(&dev->ibdev.node_guid, dev->pseudo_netdev->dev_addr, 6);
+	dev->ibdev.phys_port_cnt = 1;
+	dev->ibdev.num_comp_vectors = 1;
+	dev->ibdev.dma_device = &dev->pcidev->dev;
+	dev->ibdev.query_device = c2_query_device;
+	dev->ibdev.query_port = c2_query_port;
+	dev->ibdev.query_pkey = c2_query_pkey;
+	dev->ibdev.query_gid = c2_query_gid;
+	dev->ibdev.alloc_ucontext = c2_alloc_ucontext;
+	dev->ibdev.dealloc_ucontext = c2_dealloc_ucontext;
+	dev->ibdev.mmap = c2_mmap_uar;
+	dev->ibdev.alloc_pd = c2_alloc_pd;
+	dev->ibdev.dealloc_pd = c2_dealloc_pd;
+	dev->ibdev.create_ah = c2_ah_create;
+	dev->ibdev.destroy_ah = c2_ah_destroy;
+	dev->ibdev.create_qp = c2_create_qp;
+	dev->ibdev.modify_qp = c2_modify_qp;
+	dev->ibdev.destroy_qp = c2_destroy_qp;
+	dev->ibdev.create_cq = c2_create_cq;
+	dev->ibdev.destroy_cq = c2_destroy_cq;
+	dev->ibdev.poll_cq = c2_poll_cq;
+	dev->ibdev.get_dma_mr = c2_get_dma_mr;
+	dev->ibdev.reg_phys_mr = c2_reg_phys_mr;
+	dev->ibdev.reg_user_mr = c2_reg_user_mr;
+	dev->ibdev.dereg_mr = c2_dereg_mr;
+
+	dev->ibdev.alloc_fmr = NULL;
+	dev->ibdev.unmap_fmr = NULL;
+	dev->ibdev.dealloc_fmr = NULL;
+	dev->ibdev.map_phys_fmr = NULL;
+
+	dev->ibdev.attach_mcast = c2_multicast_attach;
+	dev->ibdev.detach_mcast = c2_multicast_detach;
+	dev->ibdev.process_mad = c2_process_mad;
+
+	dev->ibdev.req_notify_cq = c2_arm_cq;
+	dev->ibdev.post_send = c2_post_send;
+	dev->ibdev.post_recv = c2_post_receive;
+
+	dev->ibdev.iwcm = kmalloc(sizeof(*dev->ibdev.iwcm), GFP_KERNEL);
+	if (dev->ibdev.iwcm == NULL) {
+		ret = -ENOMEM;
+		goto out_unregister_netdev;
+	}
+	dev->ibdev.iwcm->add_ref = c2_add_ref;
+	dev->ibdev.iwcm->rem_ref = c2_rem_ref;
+	dev->ibdev.iwcm->get_qp = c2_get_qp;
+	dev->ibdev.iwcm->connect = c2_connect;
+	dev->ibdev.iwcm->accept = c2_accept;
+	dev->ibdev.iwcm->reject = c2_reject;
+	dev->ibdev.iwcm->create_listen = c2_service_create;
+	dev->ibdev.iwcm->destroy_listen = c2_service_destroy;
+
+	ret = ib_register_device(&dev->ibdev, NULL);
+	if (ret)
+		goto out_free_iwcm;
+
+	for (i = 0; i < ARRAY_SIZE(c2_dev_attributes); ++i) {
+		ret = device_create_file(&dev->ibdev.dev,
+					       c2_dev_attributes[i]);
+		if (ret)
+			goto out_unregister_ibdev;
+	}
+	goto out;
+
+out_unregister_ibdev:
+	ib_unregister_device(&dev->ibdev);
+out_free_iwcm:
+	kfree(dev->ibdev.iwcm);
+out_unregister_netdev:
+	unregister_netdev(dev->pseudo_netdev);
+out_free_netdev:
+	free_netdev(dev->pseudo_netdev);
+out:
+	pr_debug("%s:%u ret=%d\n", __func__, __LINE__, ret);
+	return ret;
+}
+
+void c2_unregister_device(struct c2_dev *dev)
+{
+	pr_debug("%s:%u\n", __func__, __LINE__);
+	unregister_netdev(dev->pseudo_netdev);
+	free_netdev(dev->pseudo_netdev);
+	ib_unregister_device(&dev->ibdev);
+}
diff --git a/drivers/infiniband/hw/amso1100/c2_provider.h b/drivers/infiniband/hw/amso1100/c2_provider.h
new file mode 100644
index 0000000..bf18998
--- /dev/null
+++ b/drivers/infiniband/hw/amso1100/c2_provider.h
@@ -0,0 +1,182 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef C2_PROVIDER_H
+#define C2_PROVIDER_H
+#include <linux/inetdevice.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_pack.h>
+
+#include "c2_mq.h"
+#include <rdma/iw_cm.h>
+
+#define C2_MPT_FLAG_ATOMIC        (1 << 14)
+#define C2_MPT_FLAG_REMOTE_WRITE  (1 << 13)
+#define C2_MPT_FLAG_REMOTE_READ   (1 << 12)
+#define C2_MPT_FLAG_LOCAL_WRITE   (1 << 11)
+#define C2_MPT_FLAG_LOCAL_READ    (1 << 10)
+
+struct c2_buf_list {
+	void *buf;
+	DEFINE_DMA_UNMAP_ADDR(mapping);
+};
+
+
+/* The user context keeps track of objects allocated for a
+ * particular user-mode client. */
+struct c2_ucontext {
+	struct ib_ucontext ibucontext;
+};
+
+struct c2_mtt;
+
+/* All objects associated with a PD are kept in the
+ * associated user context if present.
+ */
+struct c2_pd {
+	struct ib_pd ibpd;
+	u32 pd_id;
+};
+
+struct c2_mr {
+	struct ib_mr ibmr;
+	struct c2_pd *pd;
+	struct ib_umem *umem;
+};
+
+struct c2_av;
+
+enum c2_ah_type {
+	C2_AH_ON_HCA,
+	C2_AH_PCI_POOL,
+	C2_AH_KMALLOC
+};
+
+struct c2_ah {
+	struct ib_ah ibah;
+};
+
+struct c2_cq {
+	struct ib_cq ibcq;
+	spinlock_t lock;
+	atomic_t refcount;
+	int cqn;
+	int is_kernel;
+	wait_queue_head_t wait;
+
+	u32 adapter_handle;
+	struct c2_mq mq;
+};
+
+struct c2_wq {
+	spinlock_t lock;
+};
+struct iw_cm_id;
+struct c2_qp {
+	struct ib_qp ibqp;
+	struct iw_cm_id *cm_id;
+	spinlock_t lock;
+	atomic_t refcount;
+	wait_queue_head_t wait;
+	int qpn;
+
+	u32 adapter_handle;
+	u32 send_sgl_depth;
+	u32 recv_sgl_depth;
+	u32 rdma_write_sgl_depth;
+	u8 state;
+
+	struct c2_mq sq_mq;
+	struct c2_mq rq_mq;
+};
+
+struct c2_cr_query_attrs {
+	u32 local_addr;
+	u32 remote_addr;
+	u16 local_port;
+	u16 remote_port;
+};
+
+static inline struct c2_pd *to_c2pd(struct ib_pd *ibpd)
+{
+	return container_of(ibpd, struct c2_pd, ibpd);
+}
+
+static inline struct c2_ucontext *to_c2ucontext(struct ib_ucontext *ibucontext)
+{
+	return container_of(ibucontext, struct c2_ucontext, ibucontext);
+}
+
+static inline struct c2_mr *to_c2mr(struct ib_mr *ibmr)
+{
+	return container_of(ibmr, struct c2_mr, ibmr);
+}
+
+
+static inline struct c2_ah *to_c2ah(struct ib_ah *ibah)
+{
+	return container_of(ibah, struct c2_ah, ibah);
+}
+
+static inline struct c2_cq *to_c2cq(struct ib_cq *ibcq)
+{
+	return container_of(ibcq, struct c2_cq, ibcq);
+}
+
+static inline struct c2_qp *to_c2qp(struct ib_qp *ibqp)
+{
+	return container_of(ibqp, struct c2_qp, ibqp);
+}
+
+static inline int is_rnic_addr(struct net_device *netdev, u32 addr)
+{
+	struct in_device *ind;
+	int ret = 0;
+
+	ind = in_dev_get(netdev);
+	if (!ind)
+		return 0;
+
+	for_ifa(ind) {
+		if (ifa->ifa_address == addr) {
+			ret = 1;
+			break;
+		}
+	}
+	endfor_ifa(ind);
+	in_dev_put(ind);
+	return ret;
+}
+#endif				/* C2_PROVIDER_H */
diff --git a/drivers/infiniband/hw/amso1100/c2_qp.c b/drivers/infiniband/hw/amso1100/c2_qp.c
new file mode 100644
index 0000000..86708de
--- /dev/null
+++ b/drivers/infiniband/hw/amso1100/c2_qp.c
@@ -0,0 +1,1024 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Cisco Systems. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/delay.h>
+#include <linux/gfp.h>
+
+#include "c2.h"
+#include "c2_vq.h"
+#include "c2_status.h"
+
+#define C2_MAX_ORD_PER_QP 128
+#define C2_MAX_IRD_PER_QP 128
+
+#define C2_HINT_MAKE(q_index, hint_count) (((q_index) << 16) | hint_count)
+#define C2_HINT_GET_INDEX(hint) (((hint) & 0x7FFF0000) >> 16)
+#define C2_HINT_GET_COUNT(hint) ((hint) & 0x0000FFFF)
+
+#define NO_SUPPORT -1
+static const u8 c2_opcode[] = {
+	[IB_WR_SEND] = C2_WR_TYPE_SEND,
+	[IB_WR_SEND_WITH_IMM] = NO_SUPPORT,
+	[IB_WR_RDMA_WRITE] = C2_WR_TYPE_RDMA_WRITE,
+	[IB_WR_RDMA_WRITE_WITH_IMM] = NO_SUPPORT,
+	[IB_WR_RDMA_READ] = C2_WR_TYPE_RDMA_READ,
+	[IB_WR_ATOMIC_CMP_AND_SWP] = NO_SUPPORT,
+	[IB_WR_ATOMIC_FETCH_AND_ADD] = NO_SUPPORT,
+};
+
+static int to_c2_state(enum ib_qp_state ib_state)
+{
+	switch (ib_state) {
+	case IB_QPS_RESET:
+		return C2_QP_STATE_IDLE;
+	case IB_QPS_RTS:
+		return C2_QP_STATE_RTS;
+	case IB_QPS_SQD:
+		return C2_QP_STATE_CLOSING;
+	case IB_QPS_SQE:
+		return C2_QP_STATE_CLOSING;
+	case IB_QPS_ERR:
+		return C2_QP_STATE_ERROR;
+	default:
+		return -1;
+	}
+}
+
+static int to_ib_state(enum c2_qp_state c2_state)
+{
+	switch (c2_state) {
+	case C2_QP_STATE_IDLE:
+		return IB_QPS_RESET;
+	case C2_QP_STATE_CONNECTING:
+		return IB_QPS_RTR;
+	case C2_QP_STATE_RTS:
+		return IB_QPS_RTS;
+	case C2_QP_STATE_CLOSING:
+		return IB_QPS_SQD;
+	case C2_QP_STATE_ERROR:
+		return IB_QPS_ERR;
+	case C2_QP_STATE_TERMINATE:
+		return IB_QPS_SQE;
+	default:
+		return -1;
+	}
+}
+
+static const char *to_ib_state_str(int ib_state)
+{
+	static const char *state_str[] = {
+		"IB_QPS_RESET",
+		"IB_QPS_INIT",
+		"IB_QPS_RTR",
+		"IB_QPS_RTS",
+		"IB_QPS_SQD",
+		"IB_QPS_SQE",
+		"IB_QPS_ERR"
+	};
+	if (ib_state < IB_QPS_RESET ||
+	    ib_state > IB_QPS_ERR)
+		return "<invalid IB QP state>";
+
+	ib_state -= IB_QPS_RESET;
+	return state_str[ib_state];
+}
+
+void c2_set_qp_state(struct c2_qp *qp, int c2_state)
+{
+	int new_state = to_ib_state(c2_state);
+
+	pr_debug("%s: qp[%p] state modify %s --> %s\n",
+	       __func__,
+		qp,
+		to_ib_state_str(qp->state),
+		to_ib_state_str(new_state));
+	qp->state = new_state;
+}
+
+#define C2_QP_NO_ATTR_CHANGE 0xFFFFFFFF
+
+int c2_qp_modify(struct c2_dev *c2dev, struct c2_qp *qp,
+		 struct ib_qp_attr *attr, int attr_mask)
+{
+	struct c2wr_qp_modify_req wr;
+	struct c2wr_qp_modify_rep *reply;
+	struct c2_vq_req *vq_req;
+	unsigned long flags;
+	u8 next_state;
+	int err;
+
+	pr_debug("%s:%d qp=%p, %s --> %s\n",
+		__func__, __LINE__,
+		qp,
+		to_ib_state_str(qp->state),
+		to_ib_state_str(attr->qp_state));
+
+	vq_req = vq_req_alloc(c2dev);
+	if (!vq_req)
+		return -ENOMEM;
+
+	c2_wr_set_id(&wr, CCWR_QP_MODIFY);
+	wr.hdr.context = (unsigned long) vq_req;
+	wr.rnic_handle = c2dev->adapter_handle;
+	wr.qp_handle = qp->adapter_handle;
+	wr.ord = cpu_to_be32(C2_QP_NO_ATTR_CHANGE);
+	wr.ird = cpu_to_be32(C2_QP_NO_ATTR_CHANGE);
+	wr.sq_depth = cpu_to_be32(C2_QP_NO_ATTR_CHANGE);
+	wr.rq_depth = cpu_to_be32(C2_QP_NO_ATTR_CHANGE);
+
+	if (attr_mask & IB_QP_STATE) {
+		/* Ensure the state is valid */
+		if (attr->qp_state < 0 || attr->qp_state > IB_QPS_ERR) {
+			err = -EINVAL;
+			goto bail0;
+		}
+
+		wr.next_qp_state = cpu_to_be32(to_c2_state(attr->qp_state));
+
+		if (attr->qp_state == IB_QPS_ERR) {
+			spin_lock_irqsave(&qp->lock, flags);
+			if (qp->cm_id && qp->state == IB_QPS_RTS) {
+				pr_debug("Generating CLOSE event for QP-->ERR, "
+					"qp=%p, cm_id=%p\n",qp,qp->cm_id);
+				/* Generate an CLOSE event */
+				vq_req->cm_id = qp->cm_id;
+				vq_req->event = IW_CM_EVENT_CLOSE;
+			}
+			spin_unlock_irqrestore(&qp->lock, flags);
+		}
+		next_state =  attr->qp_state;
+
+	} else if (attr_mask & IB_QP_CUR_STATE) {
+
+		if (attr->cur_qp_state != IB_QPS_RTR &&
+		    attr->cur_qp_state != IB_QPS_RTS &&
+		    attr->cur_qp_state != IB_QPS_SQD &&
+		    attr->cur_qp_state != IB_QPS_SQE) {
+			err = -EINVAL;
+			goto bail0;
+		} else
+			wr.next_qp_state =
+			    cpu_to_be32(to_c2_state(attr->cur_qp_state));
+
+		next_state = attr->cur_qp_state;
+
+	} else {
+		err = 0;
+		goto bail0;
+	}
+
+	/* reference the request struct */
+	vq_req_get(c2dev, vq_req);
+
+	err = vq_send_wr(c2dev, (union c2wr *) & wr);
+	if (err) {
+		vq_req_put(c2dev, vq_req);
+		goto bail0;
+	}
+
+	err = vq_wait_for_reply(c2dev, vq_req);
+	if (err)
+		goto bail0;
+
+	reply = (struct c2wr_qp_modify_rep *) (unsigned long) vq_req->reply_msg;
+	if (!reply) {
+		err = -ENOMEM;
+		goto bail0;
+	}
+
+	err = c2_errno(reply);
+	if (!err)
+		qp->state = next_state;
+#ifdef DEBUG
+	else
+		pr_debug("%s: c2_errno=%d\n", __func__, err);
+#endif
+	/*
+	 * If we're going to error and generating the event here, then
+	 * we need to remove the reference because there will be no
+	 * close event generated by the adapter
+	*/
+	spin_lock_irqsave(&qp->lock, flags);
+	if (vq_req->event==IW_CM_EVENT_CLOSE && qp->cm_id) {
+		qp->cm_id->rem_ref(qp->cm_id);
+		qp->cm_id = NULL;
+	}
+	spin_unlock_irqrestore(&qp->lock, flags);
+
+	vq_repbuf_free(c2dev, reply);
+      bail0:
+	vq_req_free(c2dev, vq_req);
+
+	pr_debug("%s:%d qp=%p, cur_state=%s\n",
+		__func__, __LINE__,
+		qp,
+		to_ib_state_str(qp->state));
+	return err;
+}
+
+int c2_qp_set_read_limits(struct c2_dev *c2dev, struct c2_qp *qp,
+			  int ord, int ird)
+{
+	struct c2wr_qp_modify_req wr;
+	struct c2wr_qp_modify_rep *reply;
+	struct c2_vq_req *vq_req;
+	int err;
+
+	vq_req = vq_req_alloc(c2dev);
+	if (!vq_req)
+		return -ENOMEM;
+
+	c2_wr_set_id(&wr, CCWR_QP_MODIFY);
+	wr.hdr.context = (unsigned long) vq_req;
+	wr.rnic_handle = c2dev->adapter_handle;
+	wr.qp_handle = qp->adapter_handle;
+	wr.ord = cpu_to_be32(ord);
+	wr.ird = cpu_to_be32(ird);
+	wr.sq_depth = cpu_to_be32(C2_QP_NO_ATTR_CHANGE);
+	wr.rq_depth = cpu_to_be32(C2_QP_NO_ATTR_CHANGE);
+	wr.next_qp_state = cpu_to_be32(C2_QP_NO_ATTR_CHANGE);
+
+	/* reference the request struct */
+	vq_req_get(c2dev, vq_req);
+
+	err = vq_send_wr(c2dev, (union c2wr *) & wr);
+	if (err) {
+		vq_req_put(c2dev, vq_req);
+		goto bail0;
+	}
+
+	err = vq_wait_for_reply(c2dev, vq_req);
+	if (err)
+		goto bail0;
+
+	reply = (struct c2wr_qp_modify_rep *) (unsigned long)
+		vq_req->reply_msg;
+	if (!reply) {
+		err = -ENOMEM;
+		goto bail0;
+	}
+
+	err = c2_errno(reply);
+	vq_repbuf_free(c2dev, reply);
+      bail0:
+	vq_req_free(c2dev, vq_req);
+	return err;
+}
+
+static int destroy_qp(struct c2_dev *c2dev, struct c2_qp *qp)
+{
+	struct c2_vq_req *vq_req;
+	struct c2wr_qp_destroy_req wr;
+	struct c2wr_qp_destroy_rep *reply;
+	unsigned long flags;
+	int err;
+
+	/*
+	 * Allocate a verb request message
+	 */
+	vq_req = vq_req_alloc(c2dev);
+	if (!vq_req) {
+		return -ENOMEM;
+	}
+
+	/*
+	 * Initialize the WR
+	 */
+	c2_wr_set_id(&wr, CCWR_QP_DESTROY);
+	wr.hdr.context = (unsigned long) vq_req;
+	wr.rnic_handle = c2dev->adapter_handle;
+	wr.qp_handle = qp->adapter_handle;
+
+	/*
+	 * reference the request struct.  dereferenced in the int handler.
+	 */
+	vq_req_get(c2dev, vq_req);
+
+	spin_lock_irqsave(&qp->lock, flags);
+	if (qp->cm_id && qp->state == IB_QPS_RTS) {
+		pr_debug("destroy_qp: generating CLOSE event for QP-->ERR, "
+			"qp=%p, cm_id=%p\n",qp,qp->cm_id);
+		/* Generate an CLOSE event */
+		vq_req->qp = qp;
+		vq_req->cm_id = qp->cm_id;
+		vq_req->event = IW_CM_EVENT_CLOSE;
+	}
+	spin_unlock_irqrestore(&qp->lock, flags);
+
+	/*
+	 * Send WR to adapter
+	 */
+	err = vq_send_wr(c2dev, (union c2wr *) & wr);
+	if (err) {
+		vq_req_put(c2dev, vq_req);
+		goto bail0;
+	}
+
+	/*
+	 * Wait for reply from adapter
+	 */
+	err = vq_wait_for_reply(c2dev, vq_req);
+	if (err) {
+		goto bail0;
+	}
+
+	/*
+	 * Process reply
+	 */
+	reply = (struct c2wr_qp_destroy_rep *) (unsigned long) (vq_req->reply_msg);
+	if (!reply) {
+		err = -ENOMEM;
+		goto bail0;
+	}
+
+	spin_lock_irqsave(&qp->lock, flags);
+	if (qp->cm_id) {
+		qp->cm_id->rem_ref(qp->cm_id);
+		qp->cm_id = NULL;
+	}
+	spin_unlock_irqrestore(&qp->lock, flags);
+
+	vq_repbuf_free(c2dev, reply);
+      bail0:
+	vq_req_free(c2dev, vq_req);
+	return err;
+}
+
+static int c2_alloc_qpn(struct c2_dev *c2dev, struct c2_qp *qp)
+{
+	int ret;
+
+	idr_preload(GFP_KERNEL);
+	spin_lock_irq(&c2dev->qp_table.lock);
+
+	ret = idr_alloc_cyclic(&c2dev->qp_table.idr, qp, 0, 0, GFP_NOWAIT);
+	if (ret >= 0)
+		qp->qpn = ret;
+
+	spin_unlock_irq(&c2dev->qp_table.lock);
+	idr_preload_end();
+	return ret < 0 ? ret : 0;
+}
+
+static void c2_free_qpn(struct c2_dev *c2dev, int qpn)
+{
+	spin_lock_irq(&c2dev->qp_table.lock);
+	idr_remove(&c2dev->qp_table.idr, qpn);
+	spin_unlock_irq(&c2dev->qp_table.lock);
+}
+
+struct c2_qp *c2_find_qpn(struct c2_dev *c2dev, int qpn)
+{
+	unsigned long flags;
+	struct c2_qp *qp;
+
+	spin_lock_irqsave(&c2dev->qp_table.lock, flags);
+	qp = idr_find(&c2dev->qp_table.idr, qpn);
+	spin_unlock_irqrestore(&c2dev->qp_table.lock, flags);
+	return qp;
+}
+
+int c2_alloc_qp(struct c2_dev *c2dev,
+		struct c2_pd *pd,
+		struct ib_qp_init_attr *qp_attrs, struct c2_qp *qp)
+{
+	struct c2wr_qp_create_req wr;
+	struct c2wr_qp_create_rep *reply;
+	struct c2_vq_req *vq_req;
+	struct c2_cq *send_cq = to_c2cq(qp_attrs->send_cq);
+	struct c2_cq *recv_cq = to_c2cq(qp_attrs->recv_cq);
+	unsigned long peer_pa;
+	u32 q_size, msg_size, mmap_size;
+	void __iomem *mmap;
+	int err;
+
+	err = c2_alloc_qpn(c2dev, qp);
+	if (err)
+		return err;
+	qp->ibqp.qp_num = qp->qpn;
+	qp->ibqp.qp_type = IB_QPT_RC;
+
+	/* Allocate the SQ and RQ shared pointers */
+	qp->sq_mq.shared = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool,
+					 &qp->sq_mq.shared_dma, GFP_KERNEL);
+	if (!qp->sq_mq.shared) {
+		err = -ENOMEM;
+		goto bail0;
+	}
+
+	qp->rq_mq.shared = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool,
+					 &qp->rq_mq.shared_dma, GFP_KERNEL);
+	if (!qp->rq_mq.shared) {
+		err = -ENOMEM;
+		goto bail1;
+	}
+
+	/* Allocate the verbs request */
+	vq_req = vq_req_alloc(c2dev);
+	if (vq_req == NULL) {
+		err = -ENOMEM;
+		goto bail2;
+	}
+
+	/* Initialize the work request */
+	memset(&wr, 0, sizeof(wr));
+	c2_wr_set_id(&wr, CCWR_QP_CREATE);
+	wr.hdr.context = (unsigned long) vq_req;
+	wr.rnic_handle = c2dev->adapter_handle;
+	wr.sq_cq_handle = send_cq->adapter_handle;
+	wr.rq_cq_handle = recv_cq->adapter_handle;
+	wr.sq_depth = cpu_to_be32(qp_attrs->cap.max_send_wr + 1);
+	wr.rq_depth = cpu_to_be32(qp_attrs->cap.max_recv_wr + 1);
+	wr.srq_handle = 0;
+	wr.flags = cpu_to_be32(QP_RDMA_READ | QP_RDMA_WRITE | QP_MW_BIND |
+			       QP_ZERO_STAG | QP_RDMA_READ_RESPONSE);
+	wr.send_sgl_depth = cpu_to_be32(qp_attrs->cap.max_send_sge);
+	wr.recv_sgl_depth = cpu_to_be32(qp_attrs->cap.max_recv_sge);
+	wr.rdma_write_sgl_depth = cpu_to_be32(qp_attrs->cap.max_send_sge);
+	wr.shared_sq_ht = cpu_to_be64(qp->sq_mq.shared_dma);
+	wr.shared_rq_ht = cpu_to_be64(qp->rq_mq.shared_dma);
+	wr.ord = cpu_to_be32(C2_MAX_ORD_PER_QP);
+	wr.ird = cpu_to_be32(C2_MAX_IRD_PER_QP);
+	wr.pd_id = pd->pd_id;
+	wr.user_context = (unsigned long) qp;
+
+	vq_req_get(c2dev, vq_req);
+
+	/* Send the WR to the adapter */
+	err = vq_send_wr(c2dev, (union c2wr *) & wr);
+	if (err) {
+		vq_req_put(c2dev, vq_req);
+		goto bail3;
+	}
+
+	/* Wait for the verb reply  */
+	err = vq_wait_for_reply(c2dev, vq_req);
+	if (err) {
+		goto bail3;
+	}
+
+	/* Process the reply */
+	reply = (struct c2wr_qp_create_rep *) (unsigned long) (vq_req->reply_msg);
+	if (!reply) {
+		err = -ENOMEM;
+		goto bail3;
+	}
+
+	if ((err = c2_wr_get_result(reply)) != 0) {
+		goto bail4;
+	}
+
+	/* Fill in the kernel QP struct */
+	atomic_set(&qp->refcount, 1);
+	qp->adapter_handle = reply->qp_handle;
+	qp->state = IB_QPS_RESET;
+	qp->send_sgl_depth = qp_attrs->cap.max_send_sge;
+	qp->rdma_write_sgl_depth = qp_attrs->cap.max_send_sge;
+	qp->recv_sgl_depth = qp_attrs->cap.max_recv_sge;
+	init_waitqueue_head(&qp->wait);
+
+	/* Initialize the SQ MQ */
+	q_size = be32_to_cpu(reply->sq_depth);
+	msg_size = be32_to_cpu(reply->sq_msg_size);
+	peer_pa = c2dev->pa + be32_to_cpu(reply->sq_mq_start);
+	mmap_size = PAGE_ALIGN(sizeof(struct c2_mq_shared) + msg_size * q_size);
+	mmap = ioremap_nocache(peer_pa, mmap_size);
+	if (!mmap) {
+		err = -ENOMEM;
+		goto bail5;
+	}
+
+	c2_mq_req_init(&qp->sq_mq,
+		       be32_to_cpu(reply->sq_mq_index),
+		       q_size,
+		       msg_size,
+		       mmap + sizeof(struct c2_mq_shared),	/* pool start */
+		       mmap,				/* peer */
+		       C2_MQ_ADAPTER_TARGET);
+
+	/* Initialize the RQ mq */
+	q_size = be32_to_cpu(reply->rq_depth);
+	msg_size = be32_to_cpu(reply->rq_msg_size);
+	peer_pa = c2dev->pa + be32_to_cpu(reply->rq_mq_start);
+	mmap_size = PAGE_ALIGN(sizeof(struct c2_mq_shared) + msg_size * q_size);
+	mmap = ioremap_nocache(peer_pa, mmap_size);
+	if (!mmap) {
+		err = -ENOMEM;
+		goto bail6;
+	}
+
+	c2_mq_req_init(&qp->rq_mq,
+		       be32_to_cpu(reply->rq_mq_index),
+		       q_size,
+		       msg_size,
+		       mmap + sizeof(struct c2_mq_shared),	/* pool start */
+		       mmap,				/* peer */
+		       C2_MQ_ADAPTER_TARGET);
+
+	vq_repbuf_free(c2dev, reply);
+	vq_req_free(c2dev, vq_req);
+
+	return 0;
+
+      bail6:
+	iounmap(qp->sq_mq.peer);
+      bail5:
+	destroy_qp(c2dev, qp);
+      bail4:
+	vq_repbuf_free(c2dev, reply);
+      bail3:
+	vq_req_free(c2dev, vq_req);
+      bail2:
+	c2_free_mqsp(qp->rq_mq.shared);
+      bail1:
+	c2_free_mqsp(qp->sq_mq.shared);
+      bail0:
+	c2_free_qpn(c2dev, qp->qpn);
+	return err;
+}
+
+static inline void c2_lock_cqs(struct c2_cq *send_cq, struct c2_cq *recv_cq)
+{
+	if (send_cq == recv_cq)
+		spin_lock_irq(&send_cq->lock);
+	else if (send_cq > recv_cq) {
+		spin_lock_irq(&send_cq->lock);
+		spin_lock_nested(&recv_cq->lock, SINGLE_DEPTH_NESTING);
+	} else {
+		spin_lock_irq(&recv_cq->lock);
+		spin_lock_nested(&send_cq->lock, SINGLE_DEPTH_NESTING);
+	}
+}
+
+static inline void c2_unlock_cqs(struct c2_cq *send_cq, struct c2_cq *recv_cq)
+{
+	if (send_cq == recv_cq)
+		spin_unlock_irq(&send_cq->lock);
+	else if (send_cq > recv_cq) {
+		spin_unlock(&recv_cq->lock);
+		spin_unlock_irq(&send_cq->lock);
+	} else {
+		spin_unlock(&send_cq->lock);
+		spin_unlock_irq(&recv_cq->lock);
+	}
+}
+
+void c2_free_qp(struct c2_dev *c2dev, struct c2_qp *qp)
+{
+	struct c2_cq *send_cq;
+	struct c2_cq *recv_cq;
+
+	send_cq = to_c2cq(qp->ibqp.send_cq);
+	recv_cq = to_c2cq(qp->ibqp.recv_cq);
+
+	/*
+	 * Lock CQs here, so that CQ polling code can do QP lookup
+	 * without taking a lock.
+	 */
+	c2_lock_cqs(send_cq, recv_cq);
+	c2_free_qpn(c2dev, qp->qpn);
+	c2_unlock_cqs(send_cq, recv_cq);
+
+	/*
+	 * Destroy qp in the rnic...
+	 */
+	destroy_qp(c2dev, qp);
+
+	/*
+	 * Mark any unreaped CQEs as null and void.
+	 */
+	c2_cq_clean(c2dev, qp, send_cq->cqn);
+	if (send_cq != recv_cq)
+		c2_cq_clean(c2dev, qp, recv_cq->cqn);
+	/*
+	 * Unmap the MQs and return the shared pointers
+	 * to the message pool.
+	 */
+	iounmap(qp->sq_mq.peer);
+	iounmap(qp->rq_mq.peer);
+	c2_free_mqsp(qp->sq_mq.shared);
+	c2_free_mqsp(qp->rq_mq.shared);
+
+	atomic_dec(&qp->refcount);
+	wait_event(qp->wait, !atomic_read(&qp->refcount));
+}
+
+/*
+ * Function: move_sgl
+ *
+ * Description:
+ * Move an SGL from the user's work request struct into a CCIL Work Request
+ * message, swapping to WR byte order and ensure the total length doesn't
+ * overflow.
+ *
+ * IN:
+ * dst		- ptr to CCIL Work Request message SGL memory.
+ * src		- ptr to the consumers SGL memory.
+ *
+ * OUT: none
+ *
+ * Return:
+ * CCIL status codes.
+ */
+static int
+move_sgl(struct c2_data_addr * dst, struct ib_sge *src, int count, u32 * p_len,
+	 u8 * actual_count)
+{
+	u32 tot = 0;		/* running total */
+	u8 acount = 0;		/* running total non-0 len sge's */
+
+	while (count > 0) {
+		/*
+		 * If the addition of this SGE causes the
+		 * total SGL length to exceed 2^32-1, then
+		 * fail-n-bail.
+		 *
+		 * If the current total plus the next element length
+		 * wraps, then it will go negative and be less than the
+		 * current total...
+		 */
+		if ((tot + src->length) < tot) {
+			return -EINVAL;
+		}
+		/*
+		 * Bug: 1456 (as well as 1498 & 1643)
+		 * Skip over any sge's supplied with len=0
+		 */
+		if (src->length) {
+			tot += src->length;
+			dst->stag = cpu_to_be32(src->lkey);
+			dst->to = cpu_to_be64(src->addr);
+			dst->length = cpu_to_be32(src->length);
+			dst++;
+			acount++;
+		}
+		src++;
+		count--;
+	}
+
+	if (acount == 0) {
+		/*
+		 * Bug: 1476 (as well as 1498, 1456 and 1643)
+		 * Setup the SGL in the WR to make it easier for the RNIC.
+		 * This way, the FW doesn't have to deal with special cases.
+		 * Setting length=0 should be sufficient.
+		 */
+		dst->stag = 0;
+		dst->to = 0;
+		dst->length = 0;
+	}
+
+	*p_len = tot;
+	*actual_count = acount;
+	return 0;
+}
+
+/*
+ * Function: c2_activity (private function)
+ *
+ * Description:
+ * Post an mq index to the host->adapter activity fifo.
+ *
+ * IN:
+ * c2dev	- ptr to c2dev structure
+ * mq_index	- mq index to post
+ * shared	- value most recently written to shared
+ *
+ * OUT:
+ *
+ * Return:
+ * none
+ */
+static inline void c2_activity(struct c2_dev *c2dev, u32 mq_index, u16 shared)
+{
+	/*
+	 * First read the register to see if the FIFO is full, and if so,
+	 * spin until it's not.  This isn't perfect -- there is no
+	 * synchronization among the clients of the register, but in
+	 * practice it prevents multiple CPU from hammering the bus
+	 * with PCI RETRY. Note that when this does happen, the card
+	 * cannot get on the bus and the card and system hang in a
+	 * deadlock -- thus the need for this code. [TOT]
+	 */
+	while (readl(c2dev->regs + PCI_BAR0_ADAPTER_HINT) & 0x80000000)
+		udelay(10);
+
+	__raw_writel(C2_HINT_MAKE(mq_index, shared),
+		     c2dev->regs + PCI_BAR0_ADAPTER_HINT);
+}
+
+/*
+ * Function: qp_wr_post
+ *
+ * Description:
+ * This in-line function allocates a MQ msg, then moves the host-copy of
+ * the completed WR into msg.  Then it posts the message.
+ *
+ * IN:
+ * q		- ptr to user MQ.
+ * wr		- ptr to host-copy of the WR.
+ * qp		- ptr to user qp
+ * size		- Number of bytes to post.  Assumed to be divisible by 4.
+ *
+ * OUT: none
+ *
+ * Return:
+ * CCIL status codes.
+ */
+static int qp_wr_post(struct c2_mq *q, union c2wr * wr, struct c2_qp *qp, u32 size)
+{
+	union c2wr *msg;
+
+	msg = c2_mq_alloc(q);
+	if (msg == NULL) {
+		return -EINVAL;
+	}
+#ifdef CCMSGMAGIC
+	((c2wr_hdr_t *) wr)->magic = cpu_to_be32(CCWR_MAGIC);
+#endif
+
+	/*
+	 * Since all header fields in the WR are the same as the
+	 * CQE, set the following so the adapter need not.
+	 */
+	c2_wr_set_result(wr, CCERR_PENDING);
+
+	/*
+	 * Copy the wr down to the adapter
+	 */
+	memcpy((void *) msg, (void *) wr, size);
+
+	c2_mq_produce(q);
+	return 0;
+}
+
+
+int c2_post_send(struct ib_qp *ibqp, struct ib_send_wr *ib_wr,
+		 struct ib_send_wr **bad_wr)
+{
+	struct c2_dev *c2dev = to_c2dev(ibqp->device);
+	struct c2_qp *qp = to_c2qp(ibqp);
+	union c2wr wr;
+	unsigned long lock_flags;
+	int err = 0;
+
+	u32 flags;
+	u32 tot_len;
+	u8 actual_sge_count;
+	u32 msg_size;
+
+	if (qp->state > IB_QPS_RTS) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	while (ib_wr) {
+
+		flags = 0;
+		wr.sqwr.sq_hdr.user_hdr.hdr.context = ib_wr->wr_id;
+		if (ib_wr->send_flags & IB_SEND_SIGNALED) {
+			flags |= SQ_SIGNALED;
+		}
+
+		switch (ib_wr->opcode) {
+		case IB_WR_SEND:
+		case IB_WR_SEND_WITH_INV:
+			if (ib_wr->opcode == IB_WR_SEND) {
+				if (ib_wr->send_flags & IB_SEND_SOLICITED)
+					c2_wr_set_id(&wr, C2_WR_TYPE_SEND_SE);
+				else
+					c2_wr_set_id(&wr, C2_WR_TYPE_SEND);
+				wr.sqwr.send.remote_stag = 0;
+			} else {
+				if (ib_wr->send_flags & IB_SEND_SOLICITED)
+					c2_wr_set_id(&wr, C2_WR_TYPE_SEND_SE_INV);
+				else
+					c2_wr_set_id(&wr, C2_WR_TYPE_SEND_INV);
+				wr.sqwr.send.remote_stag =
+					cpu_to_be32(ib_wr->ex.invalidate_rkey);
+			}
+
+			msg_size = sizeof(struct c2wr_send_req) +
+				sizeof(struct c2_data_addr) * ib_wr->num_sge;
+			if (ib_wr->num_sge > qp->send_sgl_depth) {
+				err = -EINVAL;
+				break;
+			}
+			if (ib_wr->send_flags & IB_SEND_FENCE) {
+				flags |= SQ_READ_FENCE;
+			}
+			err = move_sgl((struct c2_data_addr *) & (wr.sqwr.send.data),
+				       ib_wr->sg_list,
+				       ib_wr->num_sge,
+				       &tot_len, &actual_sge_count);
+			wr.sqwr.send.sge_len = cpu_to_be32(tot_len);
+			c2_wr_set_sge_count(&wr, actual_sge_count);
+			break;
+		case IB_WR_RDMA_WRITE:
+			c2_wr_set_id(&wr, C2_WR_TYPE_RDMA_WRITE);
+			msg_size = sizeof(struct c2wr_rdma_write_req) +
+			    (sizeof(struct c2_data_addr) * ib_wr->num_sge);
+			if (ib_wr->num_sge > qp->rdma_write_sgl_depth) {
+				err = -EINVAL;
+				break;
+			}
+			if (ib_wr->send_flags & IB_SEND_FENCE) {
+				flags |= SQ_READ_FENCE;
+			}
+			wr.sqwr.rdma_write.remote_stag =
+			    cpu_to_be32(ib_wr->wr.rdma.rkey);
+			wr.sqwr.rdma_write.remote_to =
+			    cpu_to_be64(ib_wr->wr.rdma.remote_addr);
+			err = move_sgl((struct c2_data_addr *)
+				       & (wr.sqwr.rdma_write.data),
+				       ib_wr->sg_list,
+				       ib_wr->num_sge,
+				       &tot_len, &actual_sge_count);
+			wr.sqwr.rdma_write.sge_len = cpu_to_be32(tot_len);
+			c2_wr_set_sge_count(&wr, actual_sge_count);
+			break;
+		case IB_WR_RDMA_READ:
+			c2_wr_set_id(&wr, C2_WR_TYPE_RDMA_READ);
+			msg_size = sizeof(struct c2wr_rdma_read_req);
+
+			/* IWarp only suppots 1 sge for RDMA reads */
+			if (ib_wr->num_sge > 1) {
+				err = -EINVAL;
+				break;
+			}
+
+			/*
+			 * Move the local and remote stag/to/len into the WR.
+			 */
+			wr.sqwr.rdma_read.local_stag =
+			    cpu_to_be32(ib_wr->sg_list->lkey);
+			wr.sqwr.rdma_read.local_to =
+			    cpu_to_be64(ib_wr->sg_list->addr);
+			wr.sqwr.rdma_read.remote_stag =
+			    cpu_to_be32(ib_wr->wr.rdma.rkey);
+			wr.sqwr.rdma_read.remote_to =
+			    cpu_to_be64(ib_wr->wr.rdma.remote_addr);
+			wr.sqwr.rdma_read.length =
+			    cpu_to_be32(ib_wr->sg_list->length);
+			break;
+		default:
+			/* error */
+			msg_size = 0;
+			err = -EINVAL;
+			break;
+		}
+
+		/*
+		 * If we had an error on the last wr build, then
+		 * break out.  Possible errors include bogus WR
+		 * type, and a bogus SGL length...
+		 */
+		if (err) {
+			break;
+		}
+
+		/*
+		 * Store flags
+		 */
+		c2_wr_set_flags(&wr, flags);
+
+		/*
+		 * Post the puppy!
+		 */
+		spin_lock_irqsave(&qp->lock, lock_flags);
+		err = qp_wr_post(&qp->sq_mq, &wr, qp, msg_size);
+		if (err) {
+			spin_unlock_irqrestore(&qp->lock, lock_flags);
+			break;
+		}
+
+		/*
+		 * Enqueue mq index to activity FIFO.
+		 */
+		c2_activity(c2dev, qp->sq_mq.index, qp->sq_mq.hint_count);
+		spin_unlock_irqrestore(&qp->lock, lock_flags);
+
+		ib_wr = ib_wr->next;
+	}
+
+out:
+	if (err)
+		*bad_wr = ib_wr;
+	return err;
+}
+
+int c2_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *ib_wr,
+		    struct ib_recv_wr **bad_wr)
+{
+	struct c2_dev *c2dev = to_c2dev(ibqp->device);
+	struct c2_qp *qp = to_c2qp(ibqp);
+	union c2wr wr;
+	unsigned long lock_flags;
+	int err = 0;
+
+	if (qp->state > IB_QPS_RTS) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	/*
+	 * Try and post each work request
+	 */
+	while (ib_wr) {
+		u32 tot_len;
+		u8 actual_sge_count;
+
+		if (ib_wr->num_sge > qp->recv_sgl_depth) {
+			err = -EINVAL;
+			break;
+		}
+
+		/*
+		 * Create local host-copy of the WR
+		 */
+		wr.rqwr.rq_hdr.user_hdr.hdr.context = ib_wr->wr_id;
+		c2_wr_set_id(&wr, CCWR_RECV);
+		c2_wr_set_flags(&wr, 0);
+
+		/* sge_count is limited to eight bits. */
+		BUG_ON(ib_wr->num_sge >= 256);
+		err = move_sgl((struct c2_data_addr *) & (wr.rqwr.data),
+			       ib_wr->sg_list,
+			       ib_wr->num_sge, &tot_len, &actual_sge_count);
+		c2_wr_set_sge_count(&wr, actual_sge_count);
+
+		/*
+		 * If we had an error on the last wr build, then
+		 * break out.  Possible errors include bogus WR
+		 * type, and a bogus SGL length...
+		 */
+		if (err) {
+			break;
+		}
+
+		spin_lock_irqsave(&qp->lock, lock_flags);
+		err = qp_wr_post(&qp->rq_mq, &wr, qp, qp->rq_mq.msg_size);
+		if (err) {
+			spin_unlock_irqrestore(&qp->lock, lock_flags);
+			break;
+		}
+
+		/*
+		 * Enqueue mq index to activity FIFO
+		 */
+		c2_activity(c2dev, qp->rq_mq.index, qp->rq_mq.hint_count);
+		spin_unlock_irqrestore(&qp->lock, lock_flags);
+
+		ib_wr = ib_wr->next;
+	}
+
+out:
+	if (err)
+		*bad_wr = ib_wr;
+	return err;
+}
+
+void c2_init_qp_table(struct c2_dev *c2dev)
+{
+	spin_lock_init(&c2dev->qp_table.lock);
+	idr_init(&c2dev->qp_table.idr);
+}
+
+void c2_cleanup_qp_table(struct c2_dev *c2dev)
+{
+	idr_destroy(&c2dev->qp_table.idr);
+}
diff --git a/drivers/infiniband/hw/amso1100/c2_rnic.c b/drivers/infiniband/hw/amso1100/c2_rnic.c
new file mode 100644
index 0000000..d2a6d96
--- /dev/null
+++ b/drivers/infiniband/hw/amso1100/c2_rnic.c
@@ -0,0 +1,655 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/pci.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/delay.h>
+#include <linux/ethtool.h>
+#include <linux/mii.h>
+#include <linux/if_vlan.h>
+#include <linux/crc32.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/init.h>
+#include <linux/dma-mapping.h>
+#include <linux/mm.h>
+#include <linux/inet.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+
+#include <linux/route.h>
+
+#include <asm/io.h>
+#include <asm/irq.h>
+#include <asm/byteorder.h>
+#include <rdma/ib_smi.h>
+#include "c2.h"
+#include "c2_vq.h"
+
+/* Device capabilities */
+#define C2_MIN_PAGESIZE  1024
+
+#define C2_MAX_MRS       32768
+#define C2_MAX_QPS       16000
+#define C2_MAX_WQE_SZ    256
+#define C2_MAX_QP_WR     ((128*1024)/C2_MAX_WQE_SZ)
+#define C2_MAX_SGES      4
+#define C2_MAX_SGE_RD    1
+#define C2_MAX_CQS       32768
+#define C2_MAX_CQES      4096
+#define C2_MAX_PDS       16384
+
+/*
+ * Send the adapter INIT message to the amso1100
+ */
+static int c2_adapter_init(struct c2_dev *c2dev)
+{
+	struct c2wr_init_req wr;
+	int err;
+
+	memset(&wr, 0, sizeof(wr));
+	c2_wr_set_id(&wr, CCWR_INIT);
+	wr.hdr.context = 0;
+	wr.hint_count = cpu_to_be64(c2dev->hint_count_dma);
+	wr.q0_host_shared = cpu_to_be64(c2dev->req_vq.shared_dma);
+	wr.q1_host_shared = cpu_to_be64(c2dev->rep_vq.shared_dma);
+	wr.q1_host_msg_pool = cpu_to_be64(c2dev->rep_vq.host_dma);
+	wr.q2_host_shared = cpu_to_be64(c2dev->aeq.shared_dma);
+	wr.q2_host_msg_pool = cpu_to_be64(c2dev->aeq.host_dma);
+
+	/* Post the init message */
+	err = vq_send_wr(c2dev, (union c2wr *) & wr);
+
+	return err;
+}
+
+/*
+ * Send the adapter TERM message to the amso1100
+ */
+static void c2_adapter_term(struct c2_dev *c2dev)
+{
+	struct c2wr_init_req wr;
+
+	memset(&wr, 0, sizeof(wr));
+	c2_wr_set_id(&wr, CCWR_TERM);
+	wr.hdr.context = 0;
+
+	/* Post the init message */
+	vq_send_wr(c2dev, (union c2wr *) & wr);
+	c2dev->init = 0;
+
+	return;
+}
+
+/*
+ * Query the adapter
+ */
+static int c2_rnic_query(struct c2_dev *c2dev, struct ib_device_attr *props)
+{
+	struct c2_vq_req *vq_req;
+	struct c2wr_rnic_query_req wr;
+	struct c2wr_rnic_query_rep *reply;
+	int err;
+
+	vq_req = vq_req_alloc(c2dev);
+	if (!vq_req)
+		return -ENOMEM;
+
+	c2_wr_set_id(&wr, CCWR_RNIC_QUERY);
+	wr.hdr.context = (unsigned long) vq_req;
+	wr.rnic_handle = c2dev->adapter_handle;
+
+	vq_req_get(c2dev, vq_req);
+
+	err = vq_send_wr(c2dev, (union c2wr *) &wr);
+	if (err) {
+		vq_req_put(c2dev, vq_req);
+		goto bail1;
+	}
+
+	err = vq_wait_for_reply(c2dev, vq_req);
+	if (err)
+		goto bail1;
+
+	reply =
+	    (struct c2wr_rnic_query_rep *) (unsigned long) (vq_req->reply_msg);
+	if (!reply)
+		err = -ENOMEM;
+	else
+		err = c2_errno(reply);
+	if (err)
+		goto bail2;
+
+	props->fw_ver =
+		((u64)be32_to_cpu(reply->fw_ver_major) << 32) |
+		((be32_to_cpu(reply->fw_ver_minor) & 0xFFFF) << 16) |
+		(be32_to_cpu(reply->fw_ver_patch) & 0xFFFF);
+	memcpy(&props->sys_image_guid, c2dev->netdev->dev_addr, 6);
+	props->max_mr_size         = 0xFFFFFFFF;
+	props->page_size_cap       = ~(C2_MIN_PAGESIZE-1);
+	props->vendor_id           = be32_to_cpu(reply->vendor_id);
+	props->vendor_part_id      = be32_to_cpu(reply->part_number);
+	props->hw_ver              = be32_to_cpu(reply->hw_version);
+	props->max_qp              = be32_to_cpu(reply->max_qps);
+	props->max_qp_wr           = be32_to_cpu(reply->max_qp_depth);
+	props->device_cap_flags    = c2dev->device_cap_flags;
+	props->max_sge             = C2_MAX_SGES;
+	props->max_sge_rd          = C2_MAX_SGE_RD;
+	props->max_cq              = be32_to_cpu(reply->max_cqs);
+	props->max_cqe             = be32_to_cpu(reply->max_cq_depth);
+	props->max_mr              = be32_to_cpu(reply->max_mrs);
+	props->max_pd              = be32_to_cpu(reply->max_pds);
+	props->max_qp_rd_atom      = be32_to_cpu(reply->max_qp_ird);
+	props->max_ee_rd_atom      = 0;
+	props->max_res_rd_atom     = be32_to_cpu(reply->max_global_ird);
+	props->max_qp_init_rd_atom = be32_to_cpu(reply->max_qp_ord);
+	props->max_ee_init_rd_atom = 0;
+	props->atomic_cap          = IB_ATOMIC_NONE;
+	props->max_ee              = 0;
+	props->max_rdd             = 0;
+	props->max_mw              = be32_to_cpu(reply->max_mws);
+	props->max_raw_ipv6_qp     = 0;
+	props->max_raw_ethy_qp     = 0;
+	props->max_mcast_grp       = 0;
+	props->max_mcast_qp_attach = 0;
+	props->max_total_mcast_qp_attach = 0;
+	props->max_ah              = 0;
+	props->max_fmr             = 0;
+	props->max_map_per_fmr     = 0;
+	props->max_srq             = 0;
+	props->max_srq_wr          = 0;
+	props->max_srq_sge         = 0;
+	props->max_pkeys           = 0;
+	props->local_ca_ack_delay  = 0;
+
+ bail2:
+	vq_repbuf_free(c2dev, reply);
+
+ bail1:
+	vq_req_free(c2dev, vq_req);
+	return err;
+}
+
+/*
+ * Add an IP address to the RNIC interface
+ */
+int c2_add_addr(struct c2_dev *c2dev, __be32 inaddr, __be32 inmask)
+{
+	struct c2_vq_req *vq_req;
+	struct c2wr_rnic_setconfig_req *wr;
+	struct c2wr_rnic_setconfig_rep *reply;
+	struct c2_netaddr netaddr;
+	int err, len;
+
+	vq_req = vq_req_alloc(c2dev);
+	if (!vq_req)
+		return -ENOMEM;
+
+	len = sizeof(struct c2_netaddr);
+	wr = kmalloc(c2dev->req_vq.msg_size, GFP_KERNEL);
+	if (!wr) {
+		err = -ENOMEM;
+		goto bail0;
+	}
+
+	c2_wr_set_id(wr, CCWR_RNIC_SETCONFIG);
+	wr->hdr.context = (unsigned long) vq_req;
+	wr->rnic_handle = c2dev->adapter_handle;
+	wr->option = cpu_to_be32(C2_CFG_ADD_ADDR);
+
+	netaddr.ip_addr = inaddr;
+	netaddr.netmask = inmask;
+	netaddr.mtu = 0;
+
+	memcpy(wr->data, &netaddr, len);
+
+	vq_req_get(c2dev, vq_req);
+
+	err = vq_send_wr(c2dev, (union c2wr *) wr);
+	if (err) {
+		vq_req_put(c2dev, vq_req);
+		goto bail1;
+	}
+
+	err = vq_wait_for_reply(c2dev, vq_req);
+	if (err)
+		goto bail1;
+
+	reply =
+	    (struct c2wr_rnic_setconfig_rep *) (unsigned long) (vq_req->reply_msg);
+	if (!reply) {
+		err = -ENOMEM;
+		goto bail1;
+	}
+
+	err = c2_errno(reply);
+	vq_repbuf_free(c2dev, reply);
+
+      bail1:
+	kfree(wr);
+      bail0:
+	vq_req_free(c2dev, vq_req);
+	return err;
+}
+
+/*
+ * Delete an IP address from the RNIC interface
+ */
+int c2_del_addr(struct c2_dev *c2dev, __be32 inaddr, __be32 inmask)
+{
+	struct c2_vq_req *vq_req;
+	struct c2wr_rnic_setconfig_req *wr;
+	struct c2wr_rnic_setconfig_rep *reply;
+	struct c2_netaddr netaddr;
+	int err, len;
+
+	vq_req = vq_req_alloc(c2dev);
+	if (!vq_req)
+		return -ENOMEM;
+
+	len = sizeof(struct c2_netaddr);
+	wr = kmalloc(c2dev->req_vq.msg_size, GFP_KERNEL);
+	if (!wr) {
+		err = -ENOMEM;
+		goto bail0;
+	}
+
+	c2_wr_set_id(wr, CCWR_RNIC_SETCONFIG);
+	wr->hdr.context = (unsigned long) vq_req;
+	wr->rnic_handle = c2dev->adapter_handle;
+	wr->option = cpu_to_be32(C2_CFG_DEL_ADDR);
+
+	netaddr.ip_addr = inaddr;
+	netaddr.netmask = inmask;
+	netaddr.mtu = 0;
+
+	memcpy(wr->data, &netaddr, len);
+
+	vq_req_get(c2dev, vq_req);
+
+	err = vq_send_wr(c2dev, (union c2wr *) wr);
+	if (err) {
+		vq_req_put(c2dev, vq_req);
+		goto bail1;
+	}
+
+	err = vq_wait_for_reply(c2dev, vq_req);
+	if (err)
+		goto bail1;
+
+	reply =
+	    (struct c2wr_rnic_setconfig_rep *) (unsigned long) (vq_req->reply_msg);
+	if (!reply) {
+		err = -ENOMEM;
+		goto bail1;
+	}
+
+	err = c2_errno(reply);
+	vq_repbuf_free(c2dev, reply);
+
+      bail1:
+	kfree(wr);
+      bail0:
+	vq_req_free(c2dev, vq_req);
+	return err;
+}
+
+/*
+ * Open a single RNIC instance to use with all
+ * low level openib calls
+ */
+static int c2_rnic_open(struct c2_dev *c2dev)
+{
+	struct c2_vq_req *vq_req;
+	union c2wr wr;
+	struct c2wr_rnic_open_rep *reply;
+	int err;
+
+	vq_req = vq_req_alloc(c2dev);
+	if (vq_req == NULL) {
+		return -ENOMEM;
+	}
+
+	memset(&wr, 0, sizeof(wr));
+	c2_wr_set_id(&wr, CCWR_RNIC_OPEN);
+	wr.rnic_open.req.hdr.context = (unsigned long) (vq_req);
+	wr.rnic_open.req.flags = cpu_to_be16(RNIC_PRIV_MODE);
+	wr.rnic_open.req.port_num = cpu_to_be16(0);
+	wr.rnic_open.req.user_context = (unsigned long) c2dev;
+
+	vq_req_get(c2dev, vq_req);
+
+	err = vq_send_wr(c2dev, &wr);
+	if (err) {
+		vq_req_put(c2dev, vq_req);
+		goto bail0;
+	}
+
+	err = vq_wait_for_reply(c2dev, vq_req);
+	if (err) {
+		goto bail0;
+	}
+
+	reply = (struct c2wr_rnic_open_rep *) (unsigned long) (vq_req->reply_msg);
+	if (!reply) {
+		err = -ENOMEM;
+		goto bail0;
+	}
+
+	if ((err = c2_errno(reply)) != 0) {
+		goto bail1;
+	}
+
+	c2dev->adapter_handle = reply->rnic_handle;
+
+      bail1:
+	vq_repbuf_free(c2dev, reply);
+      bail0:
+	vq_req_free(c2dev, vq_req);
+	return err;
+}
+
+/*
+ * Close the RNIC instance
+ */
+static int c2_rnic_close(struct c2_dev *c2dev)
+{
+	struct c2_vq_req *vq_req;
+	union c2wr wr;
+	struct c2wr_rnic_close_rep *reply;
+	int err;
+
+	vq_req = vq_req_alloc(c2dev);
+	if (vq_req == NULL) {
+		return -ENOMEM;
+	}
+
+	memset(&wr, 0, sizeof(wr));
+	c2_wr_set_id(&wr, CCWR_RNIC_CLOSE);
+	wr.rnic_close.req.hdr.context = (unsigned long) vq_req;
+	wr.rnic_close.req.rnic_handle = c2dev->adapter_handle;
+
+	vq_req_get(c2dev, vq_req);
+
+	err = vq_send_wr(c2dev, &wr);
+	if (err) {
+		vq_req_put(c2dev, vq_req);
+		goto bail0;
+	}
+
+	err = vq_wait_for_reply(c2dev, vq_req);
+	if (err) {
+		goto bail0;
+	}
+
+	reply = (struct c2wr_rnic_close_rep *) (unsigned long) (vq_req->reply_msg);
+	if (!reply) {
+		err = -ENOMEM;
+		goto bail0;
+	}
+
+	if ((err = c2_errno(reply)) != 0) {
+		goto bail1;
+	}
+
+	c2dev->adapter_handle = 0;
+
+      bail1:
+	vq_repbuf_free(c2dev, reply);
+      bail0:
+	vq_req_free(c2dev, vq_req);
+	return err;
+}
+
+/*
+ * Called by c2_probe to initialize the RNIC. This principally
+ * involves initializing the various limits and resource pools that
+ * comprise the RNIC instance.
+ */
+int c2_rnic_init(struct c2_dev *c2dev)
+{
+	int err;
+	u32 qsize, msgsize;
+	void *q1_pages;
+	void *q2_pages;
+	void __iomem *mmio_regs;
+
+	/* Device capabilities */
+	c2dev->device_cap_flags =
+	    (IB_DEVICE_RESIZE_MAX_WR |
+	     IB_DEVICE_CURR_QP_STATE_MOD |
+	     IB_DEVICE_SYS_IMAGE_GUID |
+	     IB_DEVICE_LOCAL_DMA_LKEY |
+	     IB_DEVICE_MEM_WINDOW);
+
+	/* Allocate the qptr_array */
+	c2dev->qptr_array = vzalloc(C2_MAX_CQS * sizeof(void *));
+	if (!c2dev->qptr_array) {
+		return -ENOMEM;
+	}
+
+	/* Initialize the qptr_array */
+	c2dev->qptr_array[0] = (void *) &c2dev->req_vq;
+	c2dev->qptr_array[1] = (void *) &c2dev->rep_vq;
+	c2dev->qptr_array[2] = (void *) &c2dev->aeq;
+
+	/* Initialize data structures */
+	init_waitqueue_head(&c2dev->req_vq_wo);
+	spin_lock_init(&c2dev->vqlock);
+	spin_lock_init(&c2dev->lock);
+
+	/* Allocate MQ shared pointer pool for kernel clients. User
+	 * mode client pools are hung off the user context
+	 */
+	err = c2_init_mqsp_pool(c2dev, GFP_KERNEL, &c2dev->kern_mqsp_pool);
+	if (err) {
+		goto bail0;
+	}
+
+	/* Allocate shared pointers for Q0, Q1, and Q2 from
+	 * the shared pointer pool.
+	 */
+
+	c2dev->hint_count = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool,
+					     &c2dev->hint_count_dma,
+					     GFP_KERNEL);
+	c2dev->req_vq.shared = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool,
+					     &c2dev->req_vq.shared_dma,
+					     GFP_KERNEL);
+	c2dev->rep_vq.shared = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool,
+					     &c2dev->rep_vq.shared_dma,
+					     GFP_KERNEL);
+	c2dev->aeq.shared = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool,
+					  &c2dev->aeq.shared_dma, GFP_KERNEL);
+	if (!c2dev->hint_count || !c2dev->req_vq.shared ||
+	    !c2dev->rep_vq.shared || !c2dev->aeq.shared) {
+		err = -ENOMEM;
+		goto bail1;
+	}
+
+	mmio_regs = c2dev->kva;
+	/* Initialize the Verbs Request Queue */
+	c2_mq_req_init(&c2dev->req_vq, 0,
+		       be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q0_QSIZE)),
+		       be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q0_MSGSIZE)),
+		       mmio_regs +
+		       be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q0_POOLSTART)),
+		       mmio_regs +
+		       be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q0_SHARED)),
+		       C2_MQ_ADAPTER_TARGET);
+
+	/* Initialize the Verbs Reply Queue */
+	qsize = be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q1_QSIZE));
+	msgsize = be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q1_MSGSIZE));
+	q1_pages = dma_alloc_coherent(&c2dev->pcidev->dev, qsize * msgsize,
+				      &c2dev->rep_vq.host_dma, GFP_KERNEL);
+	if (!q1_pages) {
+		err = -ENOMEM;
+		goto bail1;
+	}
+	dma_unmap_addr_set(&c2dev->rep_vq, mapping, c2dev->rep_vq.host_dma);
+	pr_debug("%s rep_vq va %p dma %llx\n", __func__, q1_pages,
+		 (unsigned long long) c2dev->rep_vq.host_dma);
+	c2_mq_rep_init(&c2dev->rep_vq,
+		   1,
+		   qsize,
+		   msgsize,
+		   q1_pages,
+		   mmio_regs +
+		   be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q1_SHARED)),
+		   C2_MQ_HOST_TARGET);
+
+	/* Initialize the Asynchronus Event Queue */
+	qsize = be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q2_QSIZE));
+	msgsize = be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q2_MSGSIZE));
+	q2_pages = dma_alloc_coherent(&c2dev->pcidev->dev, qsize * msgsize,
+				      &c2dev->aeq.host_dma, GFP_KERNEL);
+	if (!q2_pages) {
+		err = -ENOMEM;
+		goto bail2;
+	}
+	dma_unmap_addr_set(&c2dev->aeq, mapping, c2dev->aeq.host_dma);
+	pr_debug("%s aeq va %p dma %llx\n", __func__, q2_pages,
+		 (unsigned long long) c2dev->aeq.host_dma);
+	c2_mq_rep_init(&c2dev->aeq,
+		       2,
+		       qsize,
+		       msgsize,
+		       q2_pages,
+		       mmio_regs +
+		       be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q2_SHARED)),
+		       C2_MQ_HOST_TARGET);
+
+	/* Initialize the verbs request allocator */
+	err = vq_init(c2dev);
+	if (err)
+		goto bail3;
+
+	/* Enable interrupts on the adapter */
+	writel(0, c2dev->regs + C2_IDIS);
+
+	/* create the WR init message */
+	err = c2_adapter_init(c2dev);
+	if (err)
+		goto bail4;
+	c2dev->init++;
+
+	/* open an adapter instance */
+	err = c2_rnic_open(c2dev);
+	if (err)
+		goto bail4;
+
+	/* Initialize cached the adapter limits */
+	err = c2_rnic_query(c2dev, &c2dev->props);
+	if (err)
+		goto bail5;
+
+	/* Initialize the PD pool */
+	err = c2_init_pd_table(c2dev);
+	if (err)
+		goto bail5;
+
+	/* Initialize the QP pool */
+	c2_init_qp_table(c2dev);
+	return 0;
+
+      bail5:
+	c2_rnic_close(c2dev);
+      bail4:
+	vq_term(c2dev);
+      bail3:
+	dma_free_coherent(&c2dev->pcidev->dev,
+			  c2dev->aeq.q_size * c2dev->aeq.msg_size,
+			  q2_pages, dma_unmap_addr(&c2dev->aeq, mapping));
+      bail2:
+	dma_free_coherent(&c2dev->pcidev->dev,
+			  c2dev->rep_vq.q_size * c2dev->rep_vq.msg_size,
+			  q1_pages, dma_unmap_addr(&c2dev->rep_vq, mapping));
+      bail1:
+	c2_free_mqsp_pool(c2dev, c2dev->kern_mqsp_pool);
+      bail0:
+	vfree(c2dev->qptr_array);
+
+	return err;
+}
+
+/*
+ * Called by c2_remove to cleanup the RNIC resources.
+ */
+void c2_rnic_term(struct c2_dev *c2dev)
+{
+
+	/* Close the open adapter instance */
+	c2_rnic_close(c2dev);
+
+	/* Send the TERM message to the adapter */
+	c2_adapter_term(c2dev);
+
+	/* Disable interrupts on the adapter */
+	writel(1, c2dev->regs + C2_IDIS);
+
+	/* Free the QP pool */
+	c2_cleanup_qp_table(c2dev);
+
+	/* Free the PD pool */
+	c2_cleanup_pd_table(c2dev);
+
+	/* Free the verbs request allocator */
+	vq_term(c2dev);
+
+	/* Free the asynchronus event queue */
+	dma_free_coherent(&c2dev->pcidev->dev,
+			  c2dev->aeq.q_size * c2dev->aeq.msg_size,
+			  c2dev->aeq.msg_pool.host,
+			  dma_unmap_addr(&c2dev->aeq, mapping));
+
+	/* Free the verbs reply queue */
+	dma_free_coherent(&c2dev->pcidev->dev,
+			  c2dev->rep_vq.q_size * c2dev->rep_vq.msg_size,
+			  c2dev->rep_vq.msg_pool.host,
+			  dma_unmap_addr(&c2dev->rep_vq, mapping));
+
+	/* Free the MQ shared pointer pool */
+	c2_free_mqsp_pool(c2dev, c2dev->kern_mqsp_pool);
+
+	/* Free the qptr_array */
+	vfree(c2dev->qptr_array);
+
+	return;
+}
diff --git a/drivers/infiniband/hw/amso1100/c2_status.h b/drivers/infiniband/hw/amso1100/c2_status.h
new file mode 100644
index 0000000..6ee4aa9
--- /dev/null
+++ b/drivers/infiniband/hw/amso1100/c2_status.h
@@ -0,0 +1,158 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef	_C2_STATUS_H_
+#define _C2_STATUS_H_
+
+/*
+ * Verbs Status Codes
+ */
+enum c2_status {
+	C2_OK = 0,		/* This must be zero */
+	CCERR_INSUFFICIENT_RESOURCES = 1,
+	CCERR_INVALID_MODIFIER = 2,
+	CCERR_INVALID_MODE = 3,
+	CCERR_IN_USE = 4,
+	CCERR_INVALID_RNIC = 5,
+	CCERR_INTERRUPTED_OPERATION = 6,
+	CCERR_INVALID_EH = 7,
+	CCERR_INVALID_CQ = 8,
+	CCERR_CQ_EMPTY = 9,
+	CCERR_NOT_IMPLEMENTED = 10,
+	CCERR_CQ_DEPTH_TOO_SMALL = 11,
+	CCERR_PD_IN_USE = 12,
+	CCERR_INVALID_PD = 13,
+	CCERR_INVALID_SRQ = 14,
+	CCERR_INVALID_ADDRESS = 15,
+	CCERR_INVALID_NETMASK = 16,
+	CCERR_INVALID_QP = 17,
+	CCERR_INVALID_QP_STATE = 18,
+	CCERR_TOO_MANY_WRS_POSTED = 19,
+	CCERR_INVALID_WR_TYPE = 20,
+	CCERR_INVALID_SGL_LENGTH = 21,
+	CCERR_INVALID_SQ_DEPTH = 22,
+	CCERR_INVALID_RQ_DEPTH = 23,
+	CCERR_INVALID_ORD = 24,
+	CCERR_INVALID_IRD = 25,
+	CCERR_QP_ATTR_CANNOT_CHANGE = 26,
+	CCERR_INVALID_STAG = 27,
+	CCERR_QP_IN_USE = 28,
+	CCERR_OUTSTANDING_WRS = 29,
+	CCERR_STAG_IN_USE = 30,
+	CCERR_INVALID_STAG_INDEX = 31,
+	CCERR_INVALID_SGL_FORMAT = 32,
+	CCERR_ADAPTER_TIMEOUT = 33,
+	CCERR_INVALID_CQ_DEPTH = 34,
+	CCERR_INVALID_PRIVATE_DATA_LENGTH = 35,
+	CCERR_INVALID_EP = 36,
+	CCERR_MR_IN_USE = CCERR_STAG_IN_USE,
+	CCERR_FLUSHED = 38,
+	CCERR_INVALID_WQE = 39,
+	CCERR_LOCAL_QP_CATASTROPHIC_ERROR = 40,
+	CCERR_REMOTE_TERMINATION_ERROR = 41,
+	CCERR_BASE_AND_BOUNDS_VIOLATION = 42,
+	CCERR_ACCESS_VIOLATION = 43,
+	CCERR_INVALID_PD_ID = 44,
+	CCERR_WRAP_ERROR = 45,
+	CCERR_INV_STAG_ACCESS_ERROR = 46,
+	CCERR_ZERO_RDMA_READ_RESOURCES = 47,
+	CCERR_QP_NOT_PRIVILEGED = 48,
+	CCERR_STAG_STATE_NOT_INVALID = 49,
+	CCERR_INVALID_PAGE_SIZE = 50,
+	CCERR_INVALID_BUFFER_SIZE = 51,
+	CCERR_INVALID_PBE = 52,
+	CCERR_INVALID_FBO = 53,
+	CCERR_INVALID_LENGTH = 54,
+	CCERR_INVALID_ACCESS_RIGHTS = 55,
+	CCERR_PBL_TOO_BIG = 56,
+	CCERR_INVALID_VA = 57,
+	CCERR_INVALID_REGION = 58,
+	CCERR_INVALID_WINDOW = 59,
+	CCERR_TOTAL_LENGTH_TOO_BIG = 60,
+	CCERR_INVALID_QP_ID = 61,
+	CCERR_ADDR_IN_USE = 62,
+	CCERR_ADDR_NOT_AVAIL = 63,
+	CCERR_NET_DOWN = 64,
+	CCERR_NET_UNREACHABLE = 65,
+	CCERR_CONN_ABORTED = 66,
+	CCERR_CONN_RESET = 67,
+	CCERR_NO_BUFS = 68,
+	CCERR_CONN_TIMEDOUT = 69,
+	CCERR_CONN_REFUSED = 70,
+	CCERR_HOST_UNREACHABLE = 71,
+	CCERR_INVALID_SEND_SGL_DEPTH = 72,
+	CCERR_INVALID_RECV_SGL_DEPTH = 73,
+	CCERR_INVALID_RDMA_WRITE_SGL_DEPTH = 74,
+	CCERR_INSUFFICIENT_PRIVILEGES = 75,
+	CCERR_STACK_ERROR = 76,
+	CCERR_INVALID_VERSION = 77,
+	CCERR_INVALID_MTU = 78,
+	CCERR_INVALID_IMAGE = 79,
+	CCERR_PENDING = 98,	/* not an error; user internally by adapter */
+	CCERR_DEFER = 99,	/* not an error; used internally by adapter */
+	CCERR_FAILED_WRITE = 100,
+	CCERR_FAILED_ERASE = 101,
+	CCERR_FAILED_VERIFICATION = 102,
+	CCERR_NOT_FOUND = 103,
+
+};
+
+/*
+ * CCAE_ACTIVE_CONNECT_RESULTS status result codes.
+ */
+enum c2_connect_status {
+	C2_CONN_STATUS_SUCCESS = C2_OK,
+	C2_CONN_STATUS_NO_MEM = CCERR_INSUFFICIENT_RESOURCES,
+	C2_CONN_STATUS_TIMEDOUT = CCERR_CONN_TIMEDOUT,
+	C2_CONN_STATUS_REFUSED = CCERR_CONN_REFUSED,
+	C2_CONN_STATUS_NETUNREACH = CCERR_NET_UNREACHABLE,
+	C2_CONN_STATUS_HOSTUNREACH = CCERR_HOST_UNREACHABLE,
+	C2_CONN_STATUS_INVALID_RNIC = CCERR_INVALID_RNIC,
+	C2_CONN_STATUS_INVALID_QP = CCERR_INVALID_QP,
+	C2_CONN_STATUS_INVALID_QP_STATE = CCERR_INVALID_QP_STATE,
+	C2_CONN_STATUS_REJECTED = CCERR_CONN_RESET,
+	C2_CONN_STATUS_ADDR_NOT_AVAIL = CCERR_ADDR_NOT_AVAIL,
+};
+
+/*
+ * Flash programming status codes.
+ */
+enum c2_flash_status {
+	C2_FLASH_STATUS_SUCCESS = 0x0000,
+	C2_FLASH_STATUS_VERIFY_ERR = 0x0002,
+	C2_FLASH_STATUS_IMAGE_ERR = 0x0004,
+	C2_FLASH_STATUS_ECLBS = 0x0400,
+	C2_FLASH_STATUS_PSLBS = 0x0800,
+	C2_FLASH_STATUS_VPENS = 0x1000,
+};
+
+#endif				/* _C2_STATUS_H_ */
diff --git a/drivers/infiniband/hw/amso1100/c2_user.h b/drivers/infiniband/hw/amso1100/c2_user.h
new file mode 100644
index 0000000..7e9e7ad
--- /dev/null
+++ b/drivers/infiniband/hw/amso1100/c2_user.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Cisco Systems.  All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef C2_USER_H
+#define C2_USER_H
+
+#include <linux/types.h>
+
+/*
+ * Make sure that all structs defined in this file remain laid out so
+ * that they pack the same way on 32-bit and 64-bit architectures (to
+ * avoid incompatibility between 32-bit userspace and 64-bit kernels).
+ * In particular do not use pointer types -- pass pointers in __u64
+ * instead.
+ */
+
+struct c2_alloc_ucontext_resp {
+	__u32 qp_tab_size;
+	__u32 uarc_size;
+};
+
+struct c2_alloc_pd_resp {
+	__u32 pdn;
+	__u32 reserved;
+};
+
+struct c2_create_cq {
+	__u32 lkey;
+	__u32 pdn;
+	__u64 arm_db_page;
+	__u64 set_db_page;
+	__u32 arm_db_index;
+	__u32 set_db_index;
+};
+
+struct c2_create_cq_resp {
+	__u32 cqn;
+	__u32 reserved;
+};
+
+struct c2_create_qp {
+	__u32 lkey;
+	__u32 reserved;
+	__u64 sq_db_page;
+	__u64 rq_db_page;
+	__u32 sq_db_index;
+	__u32 rq_db_index;
+};
+
+#endif				/* C2_USER_H */
diff --git a/drivers/infiniband/hw/amso1100/c2_vq.c b/drivers/infiniband/hw/amso1100/c2_vq.c
new file mode 100644
index 0000000..2ec716f
--- /dev/null
+++ b/drivers/infiniband/hw/amso1100/c2_vq.c
@@ -0,0 +1,260 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+
+#include "c2_vq.h"
+#include "c2_provider.h"
+
+/*
+ * Verbs Request Objects:
+ *
+ * VQ Request Objects are allocated by the kernel verbs handlers.
+ * They contain a wait object, a refcnt, an atomic bool indicating that the
+ * adapter has replied, and a copy of the verb reply work request.
+ * A pointer to the VQ Request Object is passed down in the context
+ * field of the work request message, and reflected back by the adapter
+ * in the verbs reply message.  The function handle_vq() in the interrupt
+ * path will use this pointer to:
+ * 	1) append a copy of the verbs reply message
+ * 	2) mark that the reply is ready
+ * 	3) wake up the kernel verbs handler blocked awaiting the reply.
+ *
+ *
+ * The kernel verbs handlers do a "get" to put a 2nd reference on the
+ * VQ Request object.  If the kernel verbs handler exits before the adapter
+ * can respond, this extra reference will keep the VQ Request object around
+ * until the adapter's reply can be processed.  The reason we need this is
+ * because a pointer to this object is stuffed into the context field of
+ * the verbs work request message, and reflected back in the reply message.
+ * It is used in the interrupt handler (handle_vq()) to wake up the appropriate
+ * kernel verb handler that is blocked awaiting the verb reply.
+ * So handle_vq() will do a "put" on the object when it's done accessing it.
+ * NOTE:  If we guarantee that the kernel verb handler will never bail before
+ *        getting the reply, then we don't need these refcnts.
+ *
+ *
+ * VQ Request objects are freed by the kernel verbs handlers only
+ * after the verb has been processed, or when the adapter fails and
+ * does not reply.
+ *
+ *
+ * Verbs Reply Buffers:
+ *
+ * VQ Reply bufs are local host memory copies of a
+ * outstanding Verb Request reply
+ * message.  The are always allocated by the kernel verbs handlers, and _may_ be
+ * freed by either the kernel verbs handler -or- the interrupt handler.  The
+ * kernel verbs handler _must_ free the repbuf, then free the vq request object
+ * in that order.
+ */
+
+int vq_init(struct c2_dev *c2dev)
+{
+	sprintf(c2dev->vq_cache_name, "c2-vq:dev%c",
+		(char) ('0' + c2dev->devnum));
+	c2dev->host_msg_cache =
+	    kmem_cache_create(c2dev->vq_cache_name, c2dev->rep_vq.msg_size, 0,
+			      SLAB_HWCACHE_ALIGN, NULL);
+	if (c2dev->host_msg_cache == NULL) {
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+void vq_term(struct c2_dev *c2dev)
+{
+	kmem_cache_destroy(c2dev->host_msg_cache);
+}
+
+/* vq_req_alloc - allocate a VQ Request Object and initialize it.
+ * The refcnt is set to 1.
+ */
+struct c2_vq_req *vq_req_alloc(struct c2_dev *c2dev)
+{
+	struct c2_vq_req *r;
+
+	r = kmalloc(sizeof(struct c2_vq_req), GFP_KERNEL);
+	if (r) {
+		init_waitqueue_head(&r->wait_object);
+		r->reply_msg = 0;
+		r->event = 0;
+		r->cm_id = NULL;
+		r->qp = NULL;
+		atomic_set(&r->refcnt, 1);
+		atomic_set(&r->reply_ready, 0);
+	}
+	return r;
+}
+
+
+/* vq_req_free - free the VQ Request Object.  It is assumed the verbs handler
+ * has already free the VQ Reply Buffer if it existed.
+ */
+void vq_req_free(struct c2_dev *c2dev, struct c2_vq_req *r)
+{
+	r->reply_msg = 0;
+	if (atomic_dec_and_test(&r->refcnt)) {
+		kfree(r);
+	}
+}
+
+/* vq_req_get - reference a VQ Request Object.  Done
+ * only in the kernel verbs handlers.
+ */
+void vq_req_get(struct c2_dev *c2dev, struct c2_vq_req *r)
+{
+	atomic_inc(&r->refcnt);
+}
+
+
+/* vq_req_put - dereference and potentially free a VQ Request Object.
+ *
+ * This is only called by handle_vq() on the
+ * interrupt when it is done processing
+ * a verb reply message.  If the associated
+ * kernel verbs handler has already bailed,
+ * then this put will actually free the VQ
+ * Request object _and_ the VQ Reply Buffer
+ * if it exists.
+ */
+void vq_req_put(struct c2_dev *c2dev, struct c2_vq_req *r)
+{
+	if (atomic_dec_and_test(&r->refcnt)) {
+		if (r->reply_msg != 0)
+			vq_repbuf_free(c2dev,
+				       (void *) (unsigned long) r->reply_msg);
+		kfree(r);
+	}
+}
+
+
+/*
+ * vq_repbuf_alloc - allocate a VQ Reply Buffer.
+ */
+void *vq_repbuf_alloc(struct c2_dev *c2dev)
+{
+	return kmem_cache_alloc(c2dev->host_msg_cache, GFP_ATOMIC);
+}
+
+/*
+ * vq_send_wr - post a verbs request message to the Verbs Request Queue.
+ * If a message is not available in the MQ, then block until one is available.
+ * NOTE: handle_mq() on the interrupt context will wake up threads blocked here.
+ * When the adapter drains the Verbs Request Queue,
+ * it inserts MQ index 0 in to the
+ * adapter->host activity fifo and interrupts the host.
+ */
+int vq_send_wr(struct c2_dev *c2dev, union c2wr *wr)
+{
+	void *msg;
+	wait_queue_t __wait;
+
+	/*
+	 * grab adapter vq lock
+	 */
+	spin_lock(&c2dev->vqlock);
+
+	/*
+	 * allocate msg
+	 */
+	msg = c2_mq_alloc(&c2dev->req_vq);
+
+	/*
+	 * If we cannot get a msg, then we'll wait
+	 * When a messages are available, the int handler will wake_up()
+	 * any waiters.
+	 */
+	while (msg == NULL) {
+		pr_debug("%s:%d no available msg in VQ, waiting...\n",
+		       __func__, __LINE__);
+		init_waitqueue_entry(&__wait, current);
+		add_wait_queue(&c2dev->req_vq_wo, &__wait);
+		spin_unlock(&c2dev->vqlock);
+		for (;;) {
+			set_current_state(TASK_INTERRUPTIBLE);
+			if (!c2_mq_full(&c2dev->req_vq)) {
+				break;
+			}
+			if (!signal_pending(current)) {
+				schedule_timeout(1 * HZ);	/* 1 second... */
+				continue;
+			}
+			set_current_state(TASK_RUNNING);
+			remove_wait_queue(&c2dev->req_vq_wo, &__wait);
+			return -EINTR;
+		}
+		set_current_state(TASK_RUNNING);
+		remove_wait_queue(&c2dev->req_vq_wo, &__wait);
+		spin_lock(&c2dev->vqlock);
+		msg = c2_mq_alloc(&c2dev->req_vq);
+	}
+
+	/*
+	 * copy wr into adapter msg
+	 */
+	memcpy(msg, wr, c2dev->req_vq.msg_size);
+
+	/*
+	 * post msg
+	 */
+	c2_mq_produce(&c2dev->req_vq);
+
+	/*
+	 * release adapter vq lock
+	 */
+	spin_unlock(&c2dev->vqlock);
+	return 0;
+}
+
+
+/*
+ * vq_wait_for_reply - block until the adapter posts a Verb Reply Message.
+ */
+int vq_wait_for_reply(struct c2_dev *c2dev, struct c2_vq_req *req)
+{
+	if (!wait_event_timeout(req->wait_object,
+				atomic_read(&req->reply_ready),
+				60*HZ))
+		return -ETIMEDOUT;
+
+	return 0;
+}
+
+/*
+ * vq_repbuf_free - Free a Verbs Reply Buffer.
+ */
+void vq_repbuf_free(struct c2_dev *c2dev, void *reply)
+{
+	kmem_cache_free(c2dev->host_msg_cache, reply);
+}
diff --git a/drivers/infiniband/hw/amso1100/c2_vq.h b/drivers/infiniband/hw/amso1100/c2_vq.h
new file mode 100644
index 0000000..3380562
--- /dev/null
+++ b/drivers/infiniband/hw/amso1100/c2_vq.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef _C2_VQ_H_
+#define _C2_VQ_H_
+#include <linux/sched.h>
+#include "c2.h"
+#include "c2_wr.h"
+#include "c2_provider.h"
+
+struct c2_vq_req {
+	u64 reply_msg;		/* ptr to reply msg */
+	wait_queue_head_t wait_object;	/* wait object for vq reqs */
+	atomic_t reply_ready;	/* set when reply is ready */
+	atomic_t refcnt;	/* used to cancel WRs... */
+	int event;
+	struct iw_cm_id *cm_id;
+	struct c2_qp *qp;
+};
+
+extern int vq_init(struct c2_dev *c2dev);
+extern void vq_term(struct c2_dev *c2dev);
+
+extern struct c2_vq_req *vq_req_alloc(struct c2_dev *c2dev);
+extern void vq_req_free(struct c2_dev *c2dev, struct c2_vq_req *req);
+extern void vq_req_get(struct c2_dev *c2dev, struct c2_vq_req *req);
+extern void vq_req_put(struct c2_dev *c2dev, struct c2_vq_req *req);
+extern int vq_send_wr(struct c2_dev *c2dev, union c2wr * wr);
+
+extern void *vq_repbuf_alloc(struct c2_dev *c2dev);
+extern void vq_repbuf_free(struct c2_dev *c2dev, void *reply);
+
+extern int vq_wait_for_reply(struct c2_dev *c2dev, struct c2_vq_req *req);
+#endif				/* _C2_VQ_H_ */
diff --git a/drivers/infiniband/hw/amso1100/c2_wr.h b/drivers/infiniband/hw/amso1100/c2_wr.h
new file mode 100644
index 0000000..8d4b4ca
--- /dev/null
+++ b/drivers/infiniband/hw/amso1100/c2_wr.h
@@ -0,0 +1,1520 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef _C2_WR_H_
+#define _C2_WR_H_
+
+#ifdef CCDEBUG
+#define CCWR_MAGIC		0xb07700b0
+#endif
+
+#define C2_QP_NO_ATTR_CHANGE 0xFFFFFFFF
+
+/* Maximum allowed size in bytes of private_data exchange
+ * on connect.
+ */
+#define C2_MAX_PRIVATE_DATA_SIZE 200
+
+/*
+ * These types are shared among the adapter, host, and CCIL consumer.
+ */
+enum c2_cq_notification_type {
+	C2_CQ_NOTIFICATION_TYPE_NONE = 1,
+	C2_CQ_NOTIFICATION_TYPE_NEXT,
+	C2_CQ_NOTIFICATION_TYPE_NEXT_SE
+};
+
+enum c2_setconfig_cmd {
+	C2_CFG_ADD_ADDR = 1,
+	C2_CFG_DEL_ADDR = 2,
+	C2_CFG_ADD_ROUTE = 3,
+	C2_CFG_DEL_ROUTE = 4
+};
+
+enum c2_getconfig_cmd {
+	C2_GETCONFIG_ROUTES = 1,
+	C2_GETCONFIG_ADDRS
+};
+
+/*
+ *  CCIL Work Request Identifiers
+ */
+enum c2wr_ids {
+	CCWR_RNIC_OPEN = 1,
+	CCWR_RNIC_QUERY,
+	CCWR_RNIC_SETCONFIG,
+	CCWR_RNIC_GETCONFIG,
+	CCWR_RNIC_CLOSE,
+	CCWR_CQ_CREATE,
+	CCWR_CQ_QUERY,
+	CCWR_CQ_MODIFY,
+	CCWR_CQ_DESTROY,
+	CCWR_QP_CONNECT,
+	CCWR_PD_ALLOC,
+	CCWR_PD_DEALLOC,
+	CCWR_SRQ_CREATE,
+	CCWR_SRQ_QUERY,
+	CCWR_SRQ_MODIFY,
+	CCWR_SRQ_DESTROY,
+	CCWR_QP_CREATE,
+	CCWR_QP_QUERY,
+	CCWR_QP_MODIFY,
+	CCWR_QP_DESTROY,
+	CCWR_NSMR_STAG_ALLOC,
+	CCWR_NSMR_REGISTER,
+	CCWR_NSMR_PBL,
+	CCWR_STAG_DEALLOC,
+	CCWR_NSMR_REREGISTER,
+	CCWR_SMR_REGISTER,
+	CCWR_MR_QUERY,
+	CCWR_MW_ALLOC,
+	CCWR_MW_QUERY,
+	CCWR_EP_CREATE,
+	CCWR_EP_GETOPT,
+	CCWR_EP_SETOPT,
+	CCWR_EP_DESTROY,
+	CCWR_EP_BIND,
+	CCWR_EP_CONNECT,
+	CCWR_EP_LISTEN,
+	CCWR_EP_SHUTDOWN,
+	CCWR_EP_LISTEN_CREATE,
+	CCWR_EP_LISTEN_DESTROY,
+	CCWR_EP_QUERY,
+	CCWR_CR_ACCEPT,
+	CCWR_CR_REJECT,
+	CCWR_CONSOLE,
+	CCWR_TERM,
+	CCWR_FLASH_INIT,
+	CCWR_FLASH,
+	CCWR_BUF_ALLOC,
+	CCWR_BUF_FREE,
+	CCWR_FLASH_WRITE,
+	CCWR_INIT,		/* WARNING: Don't move this ever again! */
+
+
+
+	/* Add new IDs here */
+
+
+
+	/*
+	 * WARNING: CCWR_LAST must always be the last verbs id defined!
+	 *          All the preceding IDs are fixed, and must not change.
+	 *          You can add new IDs, but must not remove or reorder
+	 *          any IDs. If you do, YOU will ruin any hope of
+	 *          compatibility between versions.
+	 */
+	CCWR_LAST,
+
+	/*
+	 * Start over at 1 so that arrays indexed by user wr id's
+	 * begin at 1.  This is OK since the verbs and user wr id's
+	 * are always used on disjoint sets of queues.
+	 */
+	/*
+	 * The order of the CCWR_SEND_XX verbs must
+	 * match the order of the RDMA_OPs
+	 */
+	CCWR_SEND = 1,
+	CCWR_SEND_INV,
+	CCWR_SEND_SE,
+	CCWR_SEND_SE_INV,
+	CCWR_RDMA_WRITE,
+	CCWR_RDMA_READ,
+	CCWR_RDMA_READ_INV,
+	CCWR_MW_BIND,
+	CCWR_NSMR_FASTREG,
+	CCWR_STAG_INVALIDATE,
+	CCWR_RECV,
+	CCWR_NOP,
+	CCWR_UNIMPL,
+/* WARNING: This must always be the last user wr id defined! */
+};
+#define RDMA_SEND_OPCODE_FROM_WR_ID(x)   (x+2)
+
+/*
+ * SQ/RQ Work Request Types
+ */
+enum c2_wr_type {
+	C2_WR_TYPE_SEND = CCWR_SEND,
+	C2_WR_TYPE_SEND_SE = CCWR_SEND_SE,
+	C2_WR_TYPE_SEND_INV = CCWR_SEND_INV,
+	C2_WR_TYPE_SEND_SE_INV = CCWR_SEND_SE_INV,
+	C2_WR_TYPE_RDMA_WRITE = CCWR_RDMA_WRITE,
+	C2_WR_TYPE_RDMA_READ = CCWR_RDMA_READ,
+	C2_WR_TYPE_RDMA_READ_INV_STAG = CCWR_RDMA_READ_INV,
+	C2_WR_TYPE_BIND_MW = CCWR_MW_BIND,
+	C2_WR_TYPE_FASTREG_NSMR = CCWR_NSMR_FASTREG,
+	C2_WR_TYPE_INV_STAG = CCWR_STAG_INVALIDATE,
+	C2_WR_TYPE_RECV = CCWR_RECV,
+	C2_WR_TYPE_NOP = CCWR_NOP,
+};
+
+struct c2_netaddr {
+	__be32 ip_addr;
+	__be32 netmask;
+	u32 mtu;
+};
+
+struct c2_route {
+	u32 ip_addr;		/* 0 indicates the default route */
+	u32 netmask;		/* netmask associated with dst */
+	u32 flags;
+	union {
+		u32 ipaddr;	/* address of the nexthop interface */
+		u8 enaddr[6];
+	} nexthop;
+};
+
+/*
+ * A Scatter Gather Entry.
+ */
+struct c2_data_addr {
+	__be32 stag;
+	__be32 length;
+	__be64 to;
+};
+
+/*
+ * MR and MW flags used by the consumer, RI, and RNIC.
+ */
+enum c2_mm_flags {
+	MEM_REMOTE = 0x0001,	/* allow mw binds with remote access. */
+	MEM_VA_BASED = 0x0002,	/* Not Zero-based */
+	MEM_PBL_COMPLETE = 0x0004,	/* PBL array is complete in this msg */
+	MEM_LOCAL_READ = 0x0008,	/* allow local reads */
+	MEM_LOCAL_WRITE = 0x0010,	/* allow local writes */
+	MEM_REMOTE_READ = 0x0020,	/* allow remote reads */
+	MEM_REMOTE_WRITE = 0x0040,	/* allow remote writes */
+	MEM_WINDOW_BIND = 0x0080,	/* binds allowed */
+	MEM_SHARED = 0x0100,	/* set if MR is shared */
+	MEM_STAG_VALID = 0x0200	/* set if STAG is in valid state */
+};
+
+/*
+ * CCIL API ACF flags defined in terms of the low level mem flags.
+ * This minimizes translation needed in the user API
+ */
+enum c2_acf {
+	C2_ACF_LOCAL_READ = MEM_LOCAL_READ,
+	C2_ACF_LOCAL_WRITE = MEM_LOCAL_WRITE,
+	C2_ACF_REMOTE_READ = MEM_REMOTE_READ,
+	C2_ACF_REMOTE_WRITE = MEM_REMOTE_WRITE,
+	C2_ACF_WINDOW_BIND = MEM_WINDOW_BIND
+};
+
+/*
+ * Image types of objects written to flash
+ */
+#define C2_FLASH_IMG_BITFILE 1
+#define C2_FLASH_IMG_OPTION_ROM 2
+#define C2_FLASH_IMG_VPD 3
+
+/*
+ *  to fix bug 1815 we define the max size allowable of the
+ *  terminate message (per the IETF spec).Refer to the IETF
+ *  protocol specification, section 12.1.6, page 64)
+ *  The message is prefixed by 20 types of DDP info.
+ *
+ *  Then the message has 6 bytes for the terminate control
+ *  and DDP segment length info plus a DDP header (either
+ *  14 or 18 byts) plus 28 bytes for the RDMA header.
+ *  Thus the max size in:
+ *  20 + (6 + 18 + 28) = 72
+ */
+#define C2_MAX_TERMINATE_MESSAGE_SIZE (72)
+
+/*
+ * Build String Length.  It must be the same as C2_BUILD_STR_LEN in ccil_api.h
+ */
+#define WR_BUILD_STR_LEN 64
+
+/*
+ * WARNING:  All of these structs need to align any 64bit types on
+ * 64 bit boundaries!  64bit types include u64 and u64.
+ */
+
+/*
+ * Clustercore Work Request Header.  Be sensitive to field layout
+ * and alignment.
+ */
+struct c2wr_hdr {
+	/* wqe_count is part of the cqe.  It is put here so the
+	 * adapter can write to it while the wr is pending without
+	 * clobbering part of the wr.  This word need not be dma'd
+	 * from the host to adapter by libccil, but we copy it anyway
+	 * to make the memcpy to the adapter better aligned.
+	 */
+	__be32 wqe_count;
+
+	/* Put these fields next so that later 32- and 64-bit
+	 * quantities are naturally aligned.
+	 */
+	u8 id;
+	u8 result;		/* adapter -> host */
+	u8 sge_count;		/* host -> adapter */
+	u8 flags;		/* host -> adapter */
+
+	u64 context;
+#ifdef CCMSGMAGIC
+	u32 magic;
+	u32 pad;
+#endif
+} __attribute__((packed));
+
+/*
+ *------------------------ RNIC ------------------------
+ */
+
+/*
+ * WR_RNIC_OPEN
+ */
+
+/*
+ * Flags for the RNIC WRs
+ */
+enum c2_rnic_flags {
+	RNIC_IRD_STATIC = 0x0001,
+	RNIC_ORD_STATIC = 0x0002,
+	RNIC_QP_STATIC = 0x0004,
+	RNIC_SRQ_SUPPORTED = 0x0008,
+	RNIC_PBL_BLOCK_MODE = 0x0010,
+	RNIC_SRQ_MODEL_ARRIVAL = 0x0020,
+	RNIC_CQ_OVF_DETECTED = 0x0040,
+	RNIC_PRIV_MODE = 0x0080
+};
+
+struct c2wr_rnic_open_req {
+	struct c2wr_hdr hdr;
+	u64 user_context;
+	__be16 flags;		/* See enum c2_rnic_flags */
+	__be16 port_num;
+} __attribute__((packed));
+
+struct c2wr_rnic_open_rep {
+	struct c2wr_hdr hdr;
+	u32 rnic_handle;
+} __attribute__((packed));
+
+union c2wr_rnic_open {
+	struct c2wr_rnic_open_req req;
+	struct c2wr_rnic_open_rep rep;
+} __attribute__((packed));
+
+struct c2wr_rnic_query_req {
+	struct c2wr_hdr hdr;
+	u32 rnic_handle;
+} __attribute__((packed));
+
+/*
+ * WR_RNIC_QUERY
+ */
+struct c2wr_rnic_query_rep {
+	struct c2wr_hdr hdr;
+	u64 user_context;
+	__be32 vendor_id;
+	__be32 part_number;
+	__be32 hw_version;
+	__be32 fw_ver_major;
+	__be32 fw_ver_minor;
+	__be32 fw_ver_patch;
+	char fw_ver_build_str[WR_BUILD_STR_LEN];
+	__be32 max_qps;
+	__be32 max_qp_depth;
+	u32 max_srq_depth;
+	u32 max_send_sgl_depth;
+	u32 max_rdma_sgl_depth;
+	__be32 max_cqs;
+	__be32 max_cq_depth;
+	u32 max_cq_event_handlers;
+	__be32 max_mrs;
+	u32 max_pbl_depth;
+	__be32 max_pds;
+	__be32 max_global_ird;
+	u32 max_global_ord;
+	__be32 max_qp_ird;
+	__be32 max_qp_ord;
+	u32 flags;
+	__be32 max_mws;
+	u32 pbe_range_low;
+	u32 pbe_range_high;
+	u32 max_srqs;
+	u32 page_size;
+} __attribute__((packed));
+
+union c2wr_rnic_query {
+	struct c2wr_rnic_query_req req;
+	struct c2wr_rnic_query_rep rep;
+} __attribute__((packed));
+
+/*
+ * WR_RNIC_GETCONFIG
+ */
+
+struct c2wr_rnic_getconfig_req {
+	struct c2wr_hdr hdr;
+	u32 rnic_handle;
+	u32 option;		/* see c2_getconfig_cmd_t */
+	u64 reply_buf;
+	u32 reply_buf_len;
+} __attribute__((packed)) ;
+
+struct c2wr_rnic_getconfig_rep {
+	struct c2wr_hdr hdr;
+	u32 option;		/* see c2_getconfig_cmd_t */
+	u32 count_len;		/* length of the number of addresses configured */
+} __attribute__((packed)) ;
+
+union c2wr_rnic_getconfig {
+	struct c2wr_rnic_getconfig_req req;
+	struct c2wr_rnic_getconfig_rep rep;
+} __attribute__((packed)) ;
+
+/*
+ * WR_RNIC_SETCONFIG
+ */
+struct c2wr_rnic_setconfig_req {
+	struct c2wr_hdr hdr;
+	u32 rnic_handle;
+	__be32 option;		/* See c2_setconfig_cmd_t */
+	/* variable data and pad. See c2_netaddr and c2_route */
+	u8 data[0];
+} __attribute__((packed)) ;
+
+struct c2wr_rnic_setconfig_rep {
+	struct c2wr_hdr hdr;
+} __attribute__((packed)) ;
+
+union c2wr_rnic_setconfig {
+	struct c2wr_rnic_setconfig_req req;
+	struct c2wr_rnic_setconfig_rep rep;
+} __attribute__((packed)) ;
+
+/*
+ * WR_RNIC_CLOSE
+ */
+struct c2wr_rnic_close_req {
+	struct c2wr_hdr hdr;
+	u32 rnic_handle;
+} __attribute__((packed)) ;
+
+struct c2wr_rnic_close_rep {
+	struct c2wr_hdr hdr;
+} __attribute__((packed)) ;
+
+union c2wr_rnic_close {
+	struct c2wr_rnic_close_req req;
+	struct c2wr_rnic_close_rep rep;
+} __attribute__((packed)) ;
+
+/*
+ *------------------------ CQ ------------------------
+ */
+struct c2wr_cq_create_req {
+	struct c2wr_hdr hdr;
+	__be64 shared_ht;
+	u64 user_context;
+	__be64 msg_pool;
+	u32 rnic_handle;
+	__be32 msg_size;
+	__be32 depth;
+} __attribute__((packed)) ;
+
+struct c2wr_cq_create_rep {
+	struct c2wr_hdr hdr;
+	__be32 mq_index;
+	__be32 adapter_shared;
+	u32 cq_handle;
+} __attribute__((packed)) ;
+
+union c2wr_cq_create {
+	struct c2wr_cq_create_req req;
+	struct c2wr_cq_create_rep rep;
+} __attribute__((packed)) ;
+
+struct c2wr_cq_modify_req {
+	struct c2wr_hdr hdr;
+	u32 rnic_handle;
+	u32 cq_handle;
+	u32 new_depth;
+	u64 new_msg_pool;
+} __attribute__((packed)) ;
+
+struct c2wr_cq_modify_rep {
+	struct c2wr_hdr hdr;
+} __attribute__((packed)) ;
+
+union c2wr_cq_modify {
+	struct c2wr_cq_modify_req req;
+	struct c2wr_cq_modify_rep rep;
+} __attribute__((packed)) ;
+
+struct c2wr_cq_destroy_req {
+	struct c2wr_hdr hdr;
+	u32 rnic_handle;
+	u32 cq_handle;
+} __attribute__((packed)) ;
+
+struct c2wr_cq_destroy_rep {
+	struct c2wr_hdr hdr;
+} __attribute__((packed)) ;
+
+union c2wr_cq_destroy {
+	struct c2wr_cq_destroy_req req;
+	struct c2wr_cq_destroy_rep rep;
+} __attribute__((packed)) ;
+
+/*
+ *------------------------ PD ------------------------
+ */
+struct c2wr_pd_alloc_req {
+	struct c2wr_hdr hdr;
+	u32 rnic_handle;
+	u32 pd_id;
+} __attribute__((packed)) ;
+
+struct c2wr_pd_alloc_rep {
+	struct c2wr_hdr hdr;
+} __attribute__((packed)) ;
+
+union c2wr_pd_alloc {
+	struct c2wr_pd_alloc_req req;
+	struct c2wr_pd_alloc_rep rep;
+} __attribute__((packed)) ;
+
+struct c2wr_pd_dealloc_req {
+	struct c2wr_hdr hdr;
+	u32 rnic_handle;
+	u32 pd_id;
+} __attribute__((packed)) ;
+
+struct c2wr_pd_dealloc_rep {
+	struct c2wr_hdr hdr;
+} __attribute__((packed)) ;
+
+union c2wr_pd_dealloc {
+	struct c2wr_pd_dealloc_req req;
+	struct c2wr_pd_dealloc_rep rep;
+} __attribute__((packed)) ;
+
+/*
+ *------------------------ SRQ ------------------------
+ */
+struct c2wr_srq_create_req {
+	struct c2wr_hdr hdr;
+	u64 shared_ht;
+	u64 user_context;
+	u32 rnic_handle;
+	u32 srq_depth;
+	u32 srq_limit;
+	u32 sgl_depth;
+	u32 pd_id;
+} __attribute__((packed)) ;
+
+struct c2wr_srq_create_rep {
+	struct c2wr_hdr hdr;
+	u32 srq_depth;
+	u32 sgl_depth;
+	u32 msg_size;
+	u32 mq_index;
+	u32 mq_start;
+	u32 srq_handle;
+} __attribute__((packed)) ;
+
+union c2wr_srq_create {
+	struct c2wr_srq_create_req req;
+	struct c2wr_srq_create_rep rep;
+} __attribute__((packed)) ;
+
+struct c2wr_srq_destroy_req {
+	struct c2wr_hdr hdr;
+	u32 rnic_handle;
+	u32 srq_handle;
+} __attribute__((packed)) ;
+
+struct c2wr_srq_destroy_rep {
+	struct c2wr_hdr hdr;
+} __attribute__((packed)) ;
+
+union c2wr_srq_destroy {
+	struct c2wr_srq_destroy_req req;
+	struct c2wr_srq_destroy_rep rep;
+} __attribute__((packed)) ;
+
+/*
+ *------------------------ QP ------------------------
+ */
+enum c2wr_qp_flags {
+	QP_RDMA_READ = 0x00000001,	/* RDMA read enabled? */
+	QP_RDMA_WRITE = 0x00000002,	/* RDMA write enabled? */
+	QP_MW_BIND = 0x00000004,	/* MWs enabled */
+	QP_ZERO_STAG = 0x00000008,	/* enabled? */
+	QP_REMOTE_TERMINATION = 0x00000010,	/* remote end terminated */
+	QP_RDMA_READ_RESPONSE = 0x00000020	/* Remote RDMA read  */
+	    /* enabled? */
+};
+
+struct c2wr_qp_create_req {
+	struct c2wr_hdr hdr;
+	__be64 shared_sq_ht;
+	__be64 shared_rq_ht;
+	u64 user_context;
+	u32 rnic_handle;
+	u32 sq_cq_handle;
+	u32 rq_cq_handle;
+	__be32 sq_depth;
+	__be32 rq_depth;
+	u32 srq_handle;
+	u32 srq_limit;
+	__be32 flags;		/* see enum c2wr_qp_flags */
+	__be32 send_sgl_depth;
+	__be32 recv_sgl_depth;
+	__be32 rdma_write_sgl_depth;
+	__be32 ord;
+	__be32 ird;
+	u32 pd_id;
+} __attribute__((packed)) ;
+
+struct c2wr_qp_create_rep {
+	struct c2wr_hdr hdr;
+	__be32 sq_depth;
+	__be32 rq_depth;
+	u32 send_sgl_depth;
+	u32 recv_sgl_depth;
+	u32 rdma_write_sgl_depth;
+	u32 ord;
+	u32 ird;
+	__be32 sq_msg_size;
+	__be32 sq_mq_index;
+	__be32 sq_mq_start;
+	__be32 rq_msg_size;
+	__be32 rq_mq_index;
+	__be32 rq_mq_start;
+	u32 qp_handle;
+} __attribute__((packed)) ;
+
+union c2wr_qp_create {
+	struct c2wr_qp_create_req req;
+	struct c2wr_qp_create_rep rep;
+} __attribute__((packed)) ;
+
+struct c2wr_qp_query_req {
+	struct c2wr_hdr hdr;
+	u32 rnic_handle;
+	u32 qp_handle;
+} __attribute__((packed)) ;
+
+struct c2wr_qp_query_rep {
+	struct c2wr_hdr hdr;
+	u64 user_context;
+	u32 rnic_handle;
+	u32 sq_depth;
+	u32 rq_depth;
+	u32 send_sgl_depth;
+	u32 rdma_write_sgl_depth;
+	u32 recv_sgl_depth;
+	u32 ord;
+	u32 ird;
+	u16 qp_state;
+	u16 flags;		/* see c2wr_qp_flags_t */
+	u32 qp_id;
+	u32 local_addr;
+	u32 remote_addr;
+	u16 local_port;
+	u16 remote_port;
+	u32 terminate_msg_length;	/* 0 if not present */
+	u8 data[0];
+	/* Terminate Message in-line here. */
+} __attribute__((packed)) ;
+
+union c2wr_qp_query {
+	struct c2wr_qp_query_req req;
+	struct c2wr_qp_query_rep rep;
+} __attribute__((packed)) ;
+
+struct c2wr_qp_modify_req {
+	struct c2wr_hdr hdr;
+	u64 stream_msg;
+	u32 stream_msg_length;
+	u32 rnic_handle;
+	u32 qp_handle;
+	__be32 next_qp_state;
+	__be32 ord;
+	__be32 ird;
+	__be32 sq_depth;
+	__be32 rq_depth;
+	u32 llp_ep_handle;
+} __attribute__((packed)) ;
+
+struct c2wr_qp_modify_rep {
+	struct c2wr_hdr hdr;
+	u32 ord;
+	u32 ird;
+	u32 sq_depth;
+	u32 rq_depth;
+	u32 sq_msg_size;
+	u32 sq_mq_index;
+	u32 sq_mq_start;
+	u32 rq_msg_size;
+	u32 rq_mq_index;
+	u32 rq_mq_start;
+} __attribute__((packed)) ;
+
+union c2wr_qp_modify {
+	struct c2wr_qp_modify_req req;
+	struct c2wr_qp_modify_rep rep;
+} __attribute__((packed)) ;
+
+struct c2wr_qp_destroy_req {
+	struct c2wr_hdr hdr;
+	u32 rnic_handle;
+	u32 qp_handle;
+} __attribute__((packed)) ;
+
+struct c2wr_qp_destroy_rep {
+	struct c2wr_hdr hdr;
+} __attribute__((packed)) ;
+
+union c2wr_qp_destroy {
+	struct c2wr_qp_destroy_req req;
+	struct c2wr_qp_destroy_rep rep;
+} __attribute__((packed)) ;
+
+/*
+ * The CCWR_QP_CONNECT msg is posted on the verbs request queue.  It can
+ * only be posted when a QP is in IDLE state.  After the connect request is
+ * submitted to the LLP, the adapter moves the QP to CONNECT_PENDING state.
+ * No synchronous reply from adapter to this WR.  The results of
+ * connection are passed back in an async event CCAE_ACTIVE_CONNECT_RESULTS
+ * See c2wr_ae_active_connect_results_t
+ */
+struct c2wr_qp_connect_req {
+	struct c2wr_hdr hdr;
+	u32 rnic_handle;
+	u32 qp_handle;
+	__be32 remote_addr;
+	__be16 remote_port;
+	u16 pad;
+	__be32 private_data_length;
+	u8 private_data[0];	/* Private data in-line. */
+} __attribute__((packed)) ;
+
+struct c2wr_qp_connect {
+	struct c2wr_qp_connect_req req;
+	/* no synchronous reply.         */
+} __attribute__((packed)) ;
+
+
+/*
+ *------------------------ MM ------------------------
+ */
+
+struct c2wr_nsmr_stag_alloc_req {
+	struct c2wr_hdr hdr;
+	u32 rnic_handle;
+	u32 pbl_depth;
+	u32 pd_id;
+	u32 flags;
+} __attribute__((packed)) ;
+
+struct c2wr_nsmr_stag_alloc_rep {
+	struct c2wr_hdr hdr;
+	u32 pbl_depth;
+	u32 stag_index;
+} __attribute__((packed)) ;
+
+union c2wr_nsmr_stag_alloc {
+	struct c2wr_nsmr_stag_alloc_req req;
+	struct c2wr_nsmr_stag_alloc_rep rep;
+} __attribute__((packed)) ;
+
+struct c2wr_nsmr_register_req {
+	struct c2wr_hdr hdr;
+	__be64 va;
+	u32 rnic_handle;
+	__be16 flags;
+	u8 stag_key;
+	u8 pad;
+	u32 pd_id;
+	__be32 pbl_depth;
+	__be32 pbe_size;
+	__be32 fbo;
+	__be32 length;
+	__be32 addrs_length;
+	/* array of paddrs (must be aligned on a 64bit boundary) */
+	__be64 paddrs[0];
+} __attribute__((packed)) ;
+
+struct c2wr_nsmr_register_rep {
+	struct c2wr_hdr hdr;
+	u32 pbl_depth;
+	__be32 stag_index;
+} __attribute__((packed)) ;
+
+union c2wr_nsmr_register {
+	struct c2wr_nsmr_register_req req;
+	struct c2wr_nsmr_register_rep rep;
+} __attribute__((packed)) ;
+
+struct c2wr_nsmr_pbl_req {
+	struct c2wr_hdr hdr;
+	u32 rnic_handle;
+	__be32 flags;
+	__be32 stag_index;
+	__be32 addrs_length;
+	/* array of paddrs (must be aligned on a 64bit boundary) */
+	__be64 paddrs[0];
+} __attribute__((packed)) ;
+
+struct c2wr_nsmr_pbl_rep {
+	struct c2wr_hdr hdr;
+} __attribute__((packed)) ;
+
+union c2wr_nsmr_pbl {
+	struct c2wr_nsmr_pbl_req req;
+	struct c2wr_nsmr_pbl_rep rep;
+} __attribute__((packed)) ;
+
+struct c2wr_mr_query_req {
+	struct c2wr_hdr hdr;
+	u32 rnic_handle;
+	u32 stag_index;
+} __attribute__((packed)) ;
+
+struct c2wr_mr_query_rep {
+	struct c2wr_hdr hdr;
+	u8 stag_key;
+	u8 pad[3];
+	u32 pd_id;
+	u32 flags;
+	u32 pbl_depth;
+} __attribute__((packed)) ;
+
+union c2wr_mr_query {
+	struct c2wr_mr_query_req req;
+	struct c2wr_mr_query_rep rep;
+} __attribute__((packed)) ;
+
+struct c2wr_mw_query_req {
+	struct c2wr_hdr hdr;
+	u32 rnic_handle;
+	u32 stag_index;
+} __attribute__((packed)) ;
+
+struct c2wr_mw_query_rep {
+	struct c2wr_hdr hdr;
+	u8 stag_key;
+	u8 pad[3];
+	u32 pd_id;
+	u32 flags;
+} __attribute__((packed)) ;
+
+union c2wr_mw_query {
+	struct c2wr_mw_query_req req;
+	struct c2wr_mw_query_rep rep;
+} __attribute__((packed)) ;
+
+
+struct c2wr_stag_dealloc_req {
+	struct c2wr_hdr hdr;
+	u32 rnic_handle;
+	__be32 stag_index;
+} __attribute__((packed)) ;
+
+struct c2wr_stag_dealloc_rep {
+	struct c2wr_hdr hdr;
+} __attribute__((packed)) ;
+
+union c2wr_stag_dealloc {
+	struct c2wr_stag_dealloc_req req;
+	struct c2wr_stag_dealloc_rep rep;
+} __attribute__((packed)) ;
+
+struct c2wr_nsmr_reregister_req {
+	struct c2wr_hdr hdr;
+	u64 va;
+	u32 rnic_handle;
+	u16 flags;
+	u8 stag_key;
+	u8 pad;
+	u32 stag_index;
+	u32 pd_id;
+	u32 pbl_depth;
+	u32 pbe_size;
+	u32 fbo;
+	u32 length;
+	u32 addrs_length;
+	u32 pad1;
+	/* array of paddrs (must be aligned on a 64bit boundary) */
+	u64 paddrs[0];
+} __attribute__((packed)) ;
+
+struct c2wr_nsmr_reregister_rep {
+	struct c2wr_hdr hdr;
+	u32 pbl_depth;
+	u32 stag_index;
+} __attribute__((packed)) ;
+
+union c2wr_nsmr_reregister {
+	struct c2wr_nsmr_reregister_req req;
+	struct c2wr_nsmr_reregister_rep rep;
+} __attribute__((packed)) ;
+
+struct c2wr_smr_register_req {
+	struct c2wr_hdr hdr;
+	u64 va;
+	u32 rnic_handle;
+	u16 flags;
+	u8 stag_key;
+	u8 pad;
+	u32 stag_index;
+	u32 pd_id;
+} __attribute__((packed)) ;
+
+struct c2wr_smr_register_rep {
+	struct c2wr_hdr hdr;
+	u32 stag_index;
+} __attribute__((packed)) ;
+
+union c2wr_smr_register {
+	struct c2wr_smr_register_req req;
+	struct c2wr_smr_register_rep rep;
+} __attribute__((packed)) ;
+
+struct c2wr_mw_alloc_req {
+	struct c2wr_hdr hdr;
+	u32 rnic_handle;
+	u32 pd_id;
+} __attribute__((packed)) ;
+
+struct c2wr_mw_alloc_rep {
+	struct c2wr_hdr hdr;
+	u32 stag_index;
+} __attribute__((packed)) ;
+
+union c2wr_mw_alloc {
+	struct c2wr_mw_alloc_req req;
+	struct c2wr_mw_alloc_rep rep;
+} __attribute__((packed)) ;
+
+/*
+ *------------------------ WRs -----------------------
+ */
+
+struct c2wr_user_hdr {
+	struct c2wr_hdr hdr;		/* Has status and WR Type */
+} __attribute__((packed)) ;
+
+enum c2_qp_state {
+	C2_QP_STATE_IDLE = 0x01,
+	C2_QP_STATE_CONNECTING = 0x02,
+	C2_QP_STATE_RTS = 0x04,
+	C2_QP_STATE_CLOSING = 0x08,
+	C2_QP_STATE_TERMINATE = 0x10,
+	C2_QP_STATE_ERROR = 0x20,
+};
+
+/* Completion queue entry. */
+struct c2wr_ce {
+	struct c2wr_hdr hdr;		/* Has status and WR Type */
+	u64 qp_user_context;	/* c2_user_qp_t * */
+	u32 qp_state;		/* Current QP State */
+	u32 handle;		/* QPID or EP Handle */
+	__be32 bytes_rcvd;		/* valid for RECV WCs */
+	u32 stag;
+} __attribute__((packed)) ;
+
+
+/*
+ * Flags used for all post-sq WRs.  These must fit in the flags
+ * field of the struct c2wr_hdr (eight bits).
+ */
+enum {
+	SQ_SIGNALED = 0x01,
+	SQ_READ_FENCE = 0x02,
+	SQ_FENCE = 0x04,
+};
+
+/*
+ * Common fields for all post-sq WRs.  Namely the standard header and a
+ * secondary header with fields common to all post-sq WRs.
+ */
+struct c2_sq_hdr {
+	struct c2wr_user_hdr user_hdr;
+} __attribute__((packed));
+
+/*
+ * Same as above but for post-rq WRs.
+ */
+struct c2_rq_hdr {
+	struct c2wr_user_hdr user_hdr;
+} __attribute__((packed));
+
+/*
+ * use the same struct for all sends.
+ */
+struct c2wr_send_req {
+	struct c2_sq_hdr sq_hdr;
+	__be32 sge_len;
+	__be32 remote_stag;
+	u8 data[0];		/* SGE array */
+} __attribute__((packed));
+
+union c2wr_send {
+	struct c2wr_send_req req;
+	struct c2wr_ce rep;
+} __attribute__((packed));
+
+struct c2wr_rdma_write_req {
+	struct c2_sq_hdr sq_hdr;
+	__be64 remote_to;
+	__be32 remote_stag;
+	__be32 sge_len;
+	u8 data[0];		/* SGE array */
+} __attribute__((packed));
+
+union c2wr_rdma_write {
+	struct c2wr_rdma_write_req req;
+	struct c2wr_ce rep;
+} __attribute__((packed));
+
+struct c2wr_rdma_read_req {
+	struct c2_sq_hdr sq_hdr;
+	__be64 local_to;
+	__be64 remote_to;
+	__be32 local_stag;
+	__be32 remote_stag;
+	__be32 length;
+} __attribute__((packed));
+
+union c2wr_rdma_read {
+	struct c2wr_rdma_read_req req;
+	struct c2wr_ce rep;
+} __attribute__((packed));
+
+struct c2wr_mw_bind_req {
+	struct c2_sq_hdr sq_hdr;
+	u64 va;
+	u8 stag_key;
+	u8 pad[3];
+	u32 mw_stag_index;
+	u32 mr_stag_index;
+	u32 length;
+	u32 flags;
+} __attribute__((packed));
+
+union c2wr_mw_bind {
+	struct c2wr_mw_bind_req req;
+	struct c2wr_ce rep;
+} __attribute__((packed));
+
+struct c2wr_nsmr_fastreg_req {
+	struct c2_sq_hdr sq_hdr;
+	u64 va;
+	u8 stag_key;
+	u8 pad[3];
+	u32 stag_index;
+	u32 pbe_size;
+	u32 fbo;
+	u32 length;
+	u32 addrs_length;
+	/* array of paddrs (must be aligned on a 64bit boundary) */
+	u64 paddrs[0];
+} __attribute__((packed));
+
+union c2wr_nsmr_fastreg {
+	struct c2wr_nsmr_fastreg_req req;
+	struct c2wr_ce rep;
+} __attribute__((packed));
+
+struct c2wr_stag_invalidate_req {
+	struct c2_sq_hdr sq_hdr;
+	u8 stag_key;
+	u8 pad[3];
+	u32 stag_index;
+} __attribute__((packed));
+
+union c2wr_stag_invalidate {
+	struct c2wr_stag_invalidate_req req;
+	struct c2wr_ce rep;
+} __attribute__((packed));
+
+union c2wr_sqwr {
+	struct c2_sq_hdr sq_hdr;
+	struct c2wr_send_req send;
+	struct c2wr_send_req send_se;
+	struct c2wr_send_req send_inv;
+	struct c2wr_send_req send_se_inv;
+	struct c2wr_rdma_write_req rdma_write;
+	struct c2wr_rdma_read_req rdma_read;
+	struct c2wr_mw_bind_req mw_bind;
+	struct c2wr_nsmr_fastreg_req nsmr_fastreg;
+	struct c2wr_stag_invalidate_req stag_inv;
+} __attribute__((packed));
+
+
+/*
+ * RQ WRs
+ */
+struct c2wr_rqwr {
+	struct c2_rq_hdr rq_hdr;
+	u8 data[0];		/* array of SGEs */
+} __attribute__((packed));
+
+union c2wr_recv {
+	struct c2wr_rqwr req;
+	struct c2wr_ce rep;
+} __attribute__((packed));
+
+/*
+ * All AEs start with this header.  Most AEs only need to convey the
+ * information in the header.  Some, like LLP connection events, need
+ * more info.  The union typdef c2wr_ae_t has all the possible AEs.
+ *
+ * hdr.context is the user_context from the rnic_open WR.  NULL If this
+ * is not affiliated with an rnic
+ *
+ * hdr.id is the AE identifier (eg;  CCAE_REMOTE_SHUTDOWN,
+ * CCAE_LLP_CLOSE_COMPLETE)
+ *
+ * resource_type is one of:  C2_RES_IND_QP, C2_RES_IND_CQ, C2_RES_IND_SRQ
+ *
+ * user_context is the context passed down when the host created the resource.
+ */
+struct c2wr_ae_hdr {
+	struct c2wr_hdr hdr;
+	u64 user_context;	/* user context for this res. */
+	__be32 resource_type;	/* see enum c2_resource_indicator */
+	__be32 resource;	/* handle for resource */
+	__be32 qp_state;	/* current QP State */
+} __attribute__((packed));
+
+/*
+ * After submitting the CCAE_ACTIVE_CONNECT_RESULTS message on the AEQ,
+ * the adapter moves the QP into RTS state
+ */
+struct c2wr_ae_active_connect_results {
+	struct c2wr_ae_hdr ae_hdr;
+	__be32 laddr;
+	__be32 raddr;
+	__be16 lport;
+	__be16 rport;
+	__be32 private_data_length;
+	u8 private_data[0];	/* data is in-line in the msg. */
+} __attribute__((packed));
+
+/*
+ * When connections are established by the stack (and the private data
+ * MPA frame is received), the adapter will generate an event to the host.
+ * The details of the connection, any private data, and the new connection
+ * request handle is passed up via the CCAE_CONNECTION_REQUEST msg on the
+ * AE queue:
+ */
+struct c2wr_ae_connection_request {
+	struct c2wr_ae_hdr ae_hdr;
+	u32 cr_handle;		/* connreq handle (sock ptr) */
+	__be32 laddr;
+	__be32 raddr;
+	__be16 lport;
+	__be16 rport;
+	__be32 private_data_length;
+	u8 private_data[0];	/* data is in-line in the msg. */
+} __attribute__((packed));
+
+union c2wr_ae {
+	struct c2wr_ae_hdr ae_generic;
+	struct c2wr_ae_active_connect_results ae_active_connect_results;
+	struct c2wr_ae_connection_request ae_connection_request;
+} __attribute__((packed));
+
+struct c2wr_init_req {
+	struct c2wr_hdr hdr;
+	__be64 hint_count;
+	__be64 q0_host_shared;
+	__be64 q1_host_shared;
+	__be64 q1_host_msg_pool;
+	__be64 q2_host_shared;
+	__be64 q2_host_msg_pool;
+} __attribute__((packed));
+
+struct c2wr_init_rep {
+	struct c2wr_hdr hdr;
+} __attribute__((packed));
+
+union c2wr_init {
+	struct c2wr_init_req req;
+	struct c2wr_init_rep rep;
+} __attribute__((packed));
+
+/*
+ * For upgrading flash.
+ */
+
+struct c2wr_flash_init_req {
+	struct c2wr_hdr hdr;
+	u32 rnic_handle;
+} __attribute__((packed));
+
+struct c2wr_flash_init_rep {
+	struct c2wr_hdr hdr;
+	u32 adapter_flash_buf_offset;
+	u32 adapter_flash_len;
+} __attribute__((packed));
+
+union c2wr_flash_init {
+	struct c2wr_flash_init_req req;
+	struct c2wr_flash_init_rep rep;
+} __attribute__((packed));
+
+struct c2wr_flash_req {
+	struct c2wr_hdr hdr;
+	u32 rnic_handle;
+	u32 len;
+} __attribute__((packed));
+
+struct c2wr_flash_rep {
+	struct c2wr_hdr hdr;
+	u32 status;
+} __attribute__((packed));
+
+union c2wr_flash {
+	struct c2wr_flash_req req;
+	struct c2wr_flash_rep rep;
+} __attribute__((packed));
+
+struct c2wr_buf_alloc_req {
+	struct c2wr_hdr hdr;
+	u32 rnic_handle;
+	u32 size;
+} __attribute__((packed));
+
+struct c2wr_buf_alloc_rep {
+	struct c2wr_hdr hdr;
+	u32 offset;		/* 0 if mem not available */
+	u32 size;		/* 0 if mem not available */
+} __attribute__((packed));
+
+union c2wr_buf_alloc {
+	struct c2wr_buf_alloc_req req;
+	struct c2wr_buf_alloc_rep rep;
+} __attribute__((packed));
+
+struct c2wr_buf_free_req {
+	struct c2wr_hdr hdr;
+	u32 rnic_handle;
+	u32 offset;		/* Must match value from alloc */
+	u32 size;		/* Must match value from alloc */
+} __attribute__((packed));
+
+struct c2wr_buf_free_rep {
+	struct c2wr_hdr hdr;
+} __attribute__((packed));
+
+union c2wr_buf_free {
+	struct c2wr_buf_free_req req;
+	struct c2wr_ce rep;
+} __attribute__((packed));
+
+struct c2wr_flash_write_req {
+	struct c2wr_hdr hdr;
+	u32 rnic_handle;
+	u32 offset;
+	u32 size;
+	u32 type;
+	u32 flags;
+} __attribute__((packed));
+
+struct c2wr_flash_write_rep {
+	struct c2wr_hdr hdr;
+	u32 status;
+} __attribute__((packed));
+
+union c2wr_flash_write {
+	struct c2wr_flash_write_req req;
+	struct c2wr_flash_write_rep rep;
+} __attribute__((packed));
+
+/*
+ * Messages for LLP connection setup.
+ */
+
+/*
+ * Listen Request.  This allocates a listening endpoint to allow passive
+ * connection setup.  Newly established LLP connections are passed up
+ * via an AE.  See c2wr_ae_connection_request_t
+ */
+struct c2wr_ep_listen_create_req {
+	struct c2wr_hdr hdr;
+	u64 user_context;	/* returned in AEs. */
+	u32 rnic_handle;
+	__be32 local_addr;		/* local addr, or 0  */
+	__be16 local_port;		/* 0 means "pick one" */
+	u16 pad;
+	__be32 backlog;		/* tradional tcp listen bl */
+} __attribute__((packed));
+
+struct c2wr_ep_listen_create_rep {
+	struct c2wr_hdr hdr;
+	u32 ep_handle;		/* handle to new listening ep */
+	u16 local_port;		/* resulting port... */
+	u16 pad;
+} __attribute__((packed));
+
+union c2wr_ep_listen_create {
+	struct c2wr_ep_listen_create_req req;
+	struct c2wr_ep_listen_create_rep rep;
+} __attribute__((packed));
+
+struct c2wr_ep_listen_destroy_req {
+	struct c2wr_hdr hdr;
+	u32 rnic_handle;
+	u32 ep_handle;
+} __attribute__((packed));
+
+struct c2wr_ep_listen_destroy_rep {
+	struct c2wr_hdr hdr;
+} __attribute__((packed));
+
+union c2wr_ep_listen_destroy {
+	struct c2wr_ep_listen_destroy_req req;
+	struct c2wr_ep_listen_destroy_rep rep;
+} __attribute__((packed));
+
+struct c2wr_ep_query_req {
+	struct c2wr_hdr hdr;
+	u32 rnic_handle;
+	u32 ep_handle;
+} __attribute__((packed));
+
+struct c2wr_ep_query_rep {
+	struct c2wr_hdr hdr;
+	u32 rnic_handle;
+	u32 local_addr;
+	u32 remote_addr;
+	u16 local_port;
+	u16 remote_port;
+} __attribute__((packed));
+
+union c2wr_ep_query {
+	struct c2wr_ep_query_req req;
+	struct c2wr_ep_query_rep rep;
+} __attribute__((packed));
+
+
+/*
+ * The host passes this down to indicate acceptance of a pending iWARP
+ * connection.  The cr_handle was obtained from the CONNECTION_REQUEST
+ * AE passed up by the adapter.  See c2wr_ae_connection_request_t.
+ */
+struct c2wr_cr_accept_req {
+	struct c2wr_hdr hdr;
+	u32 rnic_handle;
+	u32 qp_handle;		/* QP to bind to this LLP conn */
+	u32 ep_handle;		/* LLP  handle to accept */
+	__be32 private_data_length;
+	u8 private_data[0];	/* data in-line in msg. */
+} __attribute__((packed));
+
+/*
+ * adapter sends reply when private data is successfully submitted to
+ * the LLP.
+ */
+struct c2wr_cr_accept_rep {
+	struct c2wr_hdr hdr;
+} __attribute__((packed));
+
+union c2wr_cr_accept {
+	struct c2wr_cr_accept_req req;
+	struct c2wr_cr_accept_rep rep;
+} __attribute__((packed));
+
+/*
+ * The host sends this down if a given iWARP connection request was
+ * rejected by the consumer.  The cr_handle was obtained from a
+ * previous c2wr_ae_connection_request_t AE sent by the adapter.
+ */
+struct  c2wr_cr_reject_req {
+	struct c2wr_hdr hdr;
+	u32 rnic_handle;
+	u32 ep_handle;		/* LLP handle to reject */
+} __attribute__((packed));
+
+/*
+ * Dunno if this is needed, but we'll add it for now.  The adapter will
+ * send the reject_reply after the LLP endpoint has been destroyed.
+ */
+struct  c2wr_cr_reject_rep {
+	struct c2wr_hdr hdr;
+} __attribute__((packed));
+
+union c2wr_cr_reject {
+	struct c2wr_cr_reject_req req;
+	struct c2wr_cr_reject_rep rep;
+} __attribute__((packed));
+
+/*
+ * console command.  Used to implement a debug console over the verbs
+ * request and reply queues.
+ */
+
+/*
+ * Console request message.  It contains:
+ *	- message hdr with id = CCWR_CONSOLE
+ *	- the physaddr/len of host memory to be used for the reply.
+ *	- the command string.  eg:  "netstat -s" or "zoneinfo"
+ */
+struct c2wr_console_req {
+	struct c2wr_hdr hdr;		/* id = CCWR_CONSOLE */
+	u64 reply_buf;		/* pinned host buf for reply */
+	u32 reply_buf_len;	/* length of reply buffer */
+	u8 command[0];		/* NUL terminated ascii string */
+	/* containing the command req */
+} __attribute__((packed));
+
+/*
+ * flags used in the console reply.
+ */
+enum c2_console_flags {
+	CONS_REPLY_TRUNCATED = 0x00000001	/* reply was truncated */
+} __attribute__((packed));
+
+/*
+ * Console reply message.
+ * hdr.result contains the c2_status_t error if the reply was _not_ generated,
+ * or C2_OK if the reply was generated.
+ */
+struct c2wr_console_rep {
+	struct c2wr_hdr hdr;		/* id = CCWR_CONSOLE */
+	u32 flags;
+} __attribute__((packed));
+
+union c2wr_console {
+	struct c2wr_console_req req;
+	struct c2wr_console_rep rep;
+} __attribute__((packed));
+
+
+/*
+ * Giant union with all WRs.  Makes life easier...
+ */
+union c2wr {
+	struct c2wr_hdr hdr;
+	struct c2wr_user_hdr user_hdr;
+	union c2wr_rnic_open rnic_open;
+	union c2wr_rnic_query rnic_query;
+	union c2wr_rnic_getconfig rnic_getconfig;
+	union c2wr_rnic_setconfig rnic_setconfig;
+	union c2wr_rnic_close rnic_close;
+	union c2wr_cq_create cq_create;
+	union c2wr_cq_modify cq_modify;
+	union c2wr_cq_destroy cq_destroy;
+	union c2wr_pd_alloc pd_alloc;
+	union c2wr_pd_dealloc pd_dealloc;
+	union c2wr_srq_create srq_create;
+	union c2wr_srq_destroy srq_destroy;
+	union c2wr_qp_create qp_create;
+	union c2wr_qp_query qp_query;
+	union c2wr_qp_modify qp_modify;
+	union c2wr_qp_destroy qp_destroy;
+	struct c2wr_qp_connect qp_connect;
+	union c2wr_nsmr_stag_alloc nsmr_stag_alloc;
+	union c2wr_nsmr_register nsmr_register;
+	union c2wr_nsmr_pbl nsmr_pbl;
+	union c2wr_mr_query mr_query;
+	union c2wr_mw_query mw_query;
+	union c2wr_stag_dealloc stag_dealloc;
+	union c2wr_sqwr sqwr;
+	struct c2wr_rqwr rqwr;
+	struct c2wr_ce ce;
+	union c2wr_ae ae;
+	union c2wr_init init;
+	union c2wr_ep_listen_create ep_listen_create;
+	union c2wr_ep_listen_destroy ep_listen_destroy;
+	union c2wr_cr_accept cr_accept;
+	union c2wr_cr_reject cr_reject;
+	union c2wr_console console;
+	union c2wr_flash_init flash_init;
+	union c2wr_flash flash;
+	union c2wr_buf_alloc buf_alloc;
+	union c2wr_buf_free buf_free;
+	union c2wr_flash_write flash_write;
+} __attribute__((packed));
+
+
+/*
+ * Accessors for the wr fields that are packed together tightly to
+ * reduce the wr message size.  The wr arguments are void* so that
+ * either a struct c2wr*, a struct c2wr_hdr*, or a pointer to any of the types
+ * in the struct c2wr union can be passed in.
+ */
+static __inline__ u8 c2_wr_get_id(void *wr)
+{
+	return ((struct c2wr_hdr *) wr)->id;
+}
+static __inline__ void c2_wr_set_id(void *wr, u8 id)
+{
+	((struct c2wr_hdr *) wr)->id = id;
+}
+static __inline__ u8 c2_wr_get_result(void *wr)
+{
+	return ((struct c2wr_hdr *) wr)->result;
+}
+static __inline__ void c2_wr_set_result(void *wr, u8 result)
+{
+	((struct c2wr_hdr *) wr)->result = result;
+}
+static __inline__ u8 c2_wr_get_flags(void *wr)
+{
+	return ((struct c2wr_hdr *) wr)->flags;
+}
+static __inline__ void c2_wr_set_flags(void *wr, u8 flags)
+{
+	((struct c2wr_hdr *) wr)->flags = flags;
+}
+static __inline__ u8 c2_wr_get_sge_count(void *wr)
+{
+	return ((struct c2wr_hdr *) wr)->sge_count;
+}
+static __inline__ void c2_wr_set_sge_count(void *wr, u8 sge_count)
+{
+	((struct c2wr_hdr *) wr)->sge_count = sge_count;
+}
+static __inline__ __be32 c2_wr_get_wqe_count(void *wr)
+{
+	return ((struct c2wr_hdr *) wr)->wqe_count;
+}
+static __inline__ void c2_wr_set_wqe_count(void *wr, u32 wqe_count)
+{
+	((struct c2wr_hdr *) wr)->wqe_count = wqe_count;
+}
+
+#endif				/* _C2_WR_H_ */
diff --git a/drivers/infiniband/hw/cxgb3/Kconfig b/drivers/infiniband/hw/cxgb3/Kconfig
new file mode 100644
index 0000000..2b6352b
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb3/Kconfig
@@ -0,0 +1,27 @@
+config INFINIBAND_CXGB3
+	tristate "Chelsio RDMA Driver"
+	depends on CHELSIO_T3 && INET
+	select GENERIC_ALLOCATOR
+	---help---
+	  This is an iWARP/RDMA driver for the Chelsio T3 1GbE and
+	  10GbE adapters.
+
+	  For general information about Chelsio and our products, visit
+	  our website at <http://www.chelsio.com>.
+
+	  For customer support, please visit our customer support page at
+	  <http://www.chelsio.com/support.html>.
+
+	  Please send feedback to <linux-bugs@chelsio.com>.
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called iw_cxgb3.
+
+config INFINIBAND_CXGB3_DEBUG
+	bool "Verbose debugging output"
+	depends on INFINIBAND_CXGB3
+	default n
+	---help---
+	  This option causes the Chelsio RDMA driver to produce copious
+	  amounts of debug messages.  Select this if you are developing
+	  the driver or trying to diagnose a problem.
diff --git a/drivers/infiniband/hw/cxgb3/Makefile b/drivers/infiniband/hw/cxgb3/Makefile
new file mode 100644
index 0000000..2761364
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb3/Makefile
@@ -0,0 +1,8 @@
+ccflags-y := -Idrivers/net/ethernet/chelsio/cxgb3
+
+obj-$(CONFIG_INFINIBAND_CXGB3) += iw_cxgb3.o
+
+iw_cxgb3-y :=  iwch_cm.o iwch_ev.o iwch_cq.o iwch_qp.o iwch_mem.o \
+	       iwch_provider.o iwch.o cxio_hal.o cxio_resource.o
+
+ccflags-$(CONFIG_INFINIBAND_CXGB3_DEBUG) += -DDEBUG
diff --git a/drivers/infiniband/hw/cxgb3/cxio_dbg.c b/drivers/infiniband/hw/cxgb3/cxio_dbg.c
new file mode 100644
index 0000000..8bca6b4
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb3/cxio_dbg.c
@@ -0,0 +1,207 @@
+/*
+ * Copyright (c) 2006 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef DEBUG
+#include <linux/types.h>
+#include <linux/slab.h>
+#include "common.h"
+#include "cxgb3_ioctl.h"
+#include "cxio_hal.h"
+#include "cxio_wr.h"
+
+void cxio_dump_tpt(struct cxio_rdev *rdev, u32 stag)
+{
+	struct ch_mem_range *m;
+	u64 *data;
+	int rc;
+	int size = 32;
+
+	m = kmalloc(sizeof(*m) + size, GFP_ATOMIC);
+	if (!m) {
+		PDBG("%s couldn't allocate memory.\n", __func__);
+		return;
+	}
+	m->mem_id = MEM_PMRX;
+	m->addr = (stag>>8) * 32 + rdev->rnic_info.tpt_base;
+	m->len = size;
+	PDBG("%s TPT addr 0x%x len %d\n", __func__, m->addr, m->len);
+	rc = rdev->t3cdev_p->ctl(rdev->t3cdev_p, RDMA_GET_MEM, m);
+	if (rc) {
+		PDBG("%s toectl returned error %d\n", __func__, rc);
+		kfree(m);
+		return;
+	}
+
+	data = (u64 *)m->buf;
+	while (size > 0) {
+		PDBG("TPT %08x: %016llx\n", m->addr, (unsigned long long) *data);
+		size -= 8;
+		data++;
+		m->addr += 8;
+	}
+	kfree(m);
+}
+
+void cxio_dump_pbl(struct cxio_rdev *rdev, u32 pbl_addr, uint len, u8 shift)
+{
+	struct ch_mem_range *m;
+	u64 *data;
+	int rc;
+	int size, npages;
+
+	shift += 12;
+	npages = (len + (1ULL << shift) - 1) >> shift;
+	size = npages * sizeof(u64);
+
+	m = kmalloc(sizeof(*m) + size, GFP_ATOMIC);
+	if (!m) {
+		PDBG("%s couldn't allocate memory.\n", __func__);
+		return;
+	}
+	m->mem_id = MEM_PMRX;
+	m->addr = pbl_addr;
+	m->len = size;
+	PDBG("%s PBL addr 0x%x len %d depth %d\n",
+		__func__, m->addr, m->len, npages);
+	rc = rdev->t3cdev_p->ctl(rdev->t3cdev_p, RDMA_GET_MEM, m);
+	if (rc) {
+		PDBG("%s toectl returned error %d\n", __func__, rc);
+		kfree(m);
+		return;
+	}
+
+	data = (u64 *)m->buf;
+	while (size > 0) {
+		PDBG("PBL %08x: %016llx\n", m->addr, (unsigned long long) *data);
+		size -= 8;
+		data++;
+		m->addr += 8;
+	}
+	kfree(m);
+}
+
+void cxio_dump_wqe(union t3_wr *wqe)
+{
+	__be64 *data = (__be64 *)wqe;
+	uint size = (uint)(be64_to_cpu(*data) & 0xff);
+
+	if (size == 0)
+		size = 8;
+	while (size > 0) {
+		PDBG("WQE %p: %016llx\n", data,
+		     (unsigned long long) be64_to_cpu(*data));
+		size--;
+		data++;
+	}
+}
+
+void cxio_dump_wce(struct t3_cqe *wce)
+{
+	__be64 *data = (__be64 *)wce;
+	int size = sizeof(*wce);
+
+	while (size > 0) {
+		PDBG("WCE %p: %016llx\n", data,
+		     (unsigned long long) be64_to_cpu(*data));
+		size -= 8;
+		data++;
+	}
+}
+
+void cxio_dump_rqt(struct cxio_rdev *rdev, u32 hwtid, int nents)
+{
+	struct ch_mem_range *m;
+	int size = nents * 64;
+	u64 *data;
+	int rc;
+
+	m = kmalloc(sizeof(*m) + size, GFP_ATOMIC);
+	if (!m) {
+		PDBG("%s couldn't allocate memory.\n", __func__);
+		return;
+	}
+	m->mem_id = MEM_PMRX;
+	m->addr = ((hwtid)<<10) + rdev->rnic_info.rqt_base;
+	m->len = size;
+	PDBG("%s RQT addr 0x%x len %d\n", __func__, m->addr, m->len);
+	rc = rdev->t3cdev_p->ctl(rdev->t3cdev_p, RDMA_GET_MEM, m);
+	if (rc) {
+		PDBG("%s toectl returned error %d\n", __func__, rc);
+		kfree(m);
+		return;
+	}
+
+	data = (u64 *)m->buf;
+	while (size > 0) {
+		PDBG("RQT %08x: %016llx\n", m->addr, (unsigned long long) *data);
+		size -= 8;
+		data++;
+		m->addr += 8;
+	}
+	kfree(m);
+}
+
+void cxio_dump_tcb(struct cxio_rdev *rdev, u32 hwtid)
+{
+	struct ch_mem_range *m;
+	int size = TCB_SIZE;
+	u32 *data;
+	int rc;
+
+	m = kmalloc(sizeof(*m) + size, GFP_ATOMIC);
+	if (!m) {
+		PDBG("%s couldn't allocate memory.\n", __func__);
+		return;
+	}
+	m->mem_id = MEM_CM;
+	m->addr = hwtid * size;
+	m->len = size;
+	PDBG("%s TCB %d len %d\n", __func__, m->addr, m->len);
+	rc = rdev->t3cdev_p->ctl(rdev->t3cdev_p, RDMA_GET_MEM, m);
+	if (rc) {
+		PDBG("%s toectl returned error %d\n", __func__, rc);
+		kfree(m);
+		return;
+	}
+
+	data = (u32 *)m->buf;
+	while (size > 0) {
+		printk("%2u: %08x %08x %08x %08x %08x %08x %08x %08x\n",
+			m->addr,
+			*(data+2), *(data+3), *(data),*(data+1),
+			*(data+6), *(data+7), *(data+4), *(data+5));
+		size -= 32;
+		data += 8;
+		m->addr += 32;
+	}
+	kfree(m);
+}
+#endif
diff --git a/drivers/infiniband/hw/cxgb3/cxio_hal.c b/drivers/infiniband/hw/cxgb3/cxio_hal.c
new file mode 100644
index 0000000..de1c61b
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb3/cxio_hal.c
@@ -0,0 +1,1343 @@
+/*
+ * Copyright (c) 2006 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <asm/delay.h>
+
+#include <linux/mutex.h>
+#include <linux/netdevice.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <linux/slab.h>
+#include <net/net_namespace.h>
+
+#include "cxio_resource.h"
+#include "cxio_hal.h"
+#include "cxgb3_offload.h"
+#include "sge_defs.h"
+
+static LIST_HEAD(rdev_list);
+static cxio_hal_ev_callback_func_t cxio_ev_cb = NULL;
+
+static struct cxio_rdev *cxio_hal_find_rdev_by_name(char *dev_name)
+{
+	struct cxio_rdev *rdev;
+
+	list_for_each_entry(rdev, &rdev_list, entry)
+		if (!strcmp(rdev->dev_name, dev_name))
+			return rdev;
+	return NULL;
+}
+
+static struct cxio_rdev *cxio_hal_find_rdev_by_t3cdev(struct t3cdev *tdev)
+{
+	struct cxio_rdev *rdev;
+
+	list_for_each_entry(rdev, &rdev_list, entry)
+		if (rdev->t3cdev_p == tdev)
+			return rdev;
+	return NULL;
+}
+
+int cxio_hal_cq_op(struct cxio_rdev *rdev_p, struct t3_cq *cq,
+		   enum t3_cq_opcode op, u32 credit)
+{
+	int ret;
+	struct t3_cqe *cqe;
+	u32 rptr;
+
+	struct rdma_cq_op setup;
+	setup.id = cq->cqid;
+	setup.credits = (op == CQ_CREDIT_UPDATE) ? credit : 0;
+	setup.op = op;
+	ret = rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_CQ_OP, &setup);
+
+	if ((ret < 0) || (op == CQ_CREDIT_UPDATE))
+		return ret;
+
+	/*
+	 * If the rearm returned an index other than our current index,
+	 * then there might be CQE's in flight (being DMA'd).  We must wait
+	 * here for them to complete or the consumer can miss a notification.
+	 */
+	if (Q_PTR2IDX((cq->rptr), cq->size_log2) != ret) {
+		int i=0;
+
+		rptr = cq->rptr;
+
+		/*
+		 * Keep the generation correct by bumping rptr until it
+		 * matches the index returned by the rearm - 1.
+		 */
+		while (Q_PTR2IDX((rptr+1), cq->size_log2) != ret)
+			rptr++;
+
+		/*
+		 * Now rptr is the index for the (last) cqe that was
+		 * in-flight at the time the HW rearmed the CQ.  We
+		 * spin until that CQE is valid.
+		 */
+		cqe = cq->queue + Q_PTR2IDX(rptr, cq->size_log2);
+		while (!CQ_VLD_ENTRY(rptr, cq->size_log2, cqe)) {
+			udelay(1);
+			if (i++ > 1000000) {
+				printk(KERN_ERR "%s: stalled rnic\n",
+				       rdev_p->dev_name);
+				return -EIO;
+			}
+		}
+
+		return 1;
+	}
+
+	return 0;
+}
+
+static int cxio_hal_clear_cq_ctx(struct cxio_rdev *rdev_p, u32 cqid)
+{
+	struct rdma_cq_setup setup;
+	setup.id = cqid;
+	setup.base_addr = 0;	/* NULL address */
+	setup.size = 0;		/* disaable the CQ */
+	setup.credits = 0;
+	setup.credit_thres = 0;
+	setup.ovfl_mode = 0;
+	return (rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_CQ_SETUP, &setup));
+}
+
+static int cxio_hal_clear_qp_ctx(struct cxio_rdev *rdev_p, u32 qpid)
+{
+	u64 sge_cmd;
+	struct t3_modify_qp_wr *wqe;
+	struct sk_buff *skb = alloc_skb(sizeof(*wqe), GFP_KERNEL);
+	if (!skb) {
+		PDBG("%s alloc_skb failed\n", __func__);
+		return -ENOMEM;
+	}
+	wqe = (struct t3_modify_qp_wr *) skb_put(skb, sizeof(*wqe));
+	memset(wqe, 0, sizeof(*wqe));
+	build_fw_riwrh((struct fw_riwrh *) wqe, T3_WR_QP_MOD,
+		       T3_COMPLETION_FLAG | T3_NOTIFY_FLAG, 0, qpid, 7,
+		       T3_SOPEOP);
+	wqe->flags = cpu_to_be32(MODQP_WRITE_EC);
+	sge_cmd = qpid << 8 | 3;
+	wqe->sge_cmd = cpu_to_be64(sge_cmd);
+	skb->priority = CPL_PRIORITY_CONTROL;
+	return iwch_cxgb3_ofld_send(rdev_p->t3cdev_p, skb);
+}
+
+int cxio_create_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq, int kernel)
+{
+	struct rdma_cq_setup setup;
+	int size = (1UL << (cq->size_log2)) * sizeof(struct t3_cqe);
+
+	size += 1; /* one extra page for storing cq-in-err state */
+	cq->cqid = cxio_hal_get_cqid(rdev_p->rscp);
+	if (!cq->cqid)
+		return -ENOMEM;
+	if (kernel) {
+		cq->sw_queue = kzalloc(size, GFP_KERNEL);
+		if (!cq->sw_queue)
+			return -ENOMEM;
+	}
+	cq->queue = dma_alloc_coherent(&(rdev_p->rnic_info.pdev->dev), size,
+					     &(cq->dma_addr), GFP_KERNEL);
+	if (!cq->queue) {
+		kfree(cq->sw_queue);
+		return -ENOMEM;
+	}
+	dma_unmap_addr_set(cq, mapping, cq->dma_addr);
+	memset(cq->queue, 0, size);
+	setup.id = cq->cqid;
+	setup.base_addr = (u64) (cq->dma_addr);
+	setup.size = 1UL << cq->size_log2;
+	setup.credits = 65535;
+	setup.credit_thres = 1;
+	if (rdev_p->t3cdev_p->type != T3A)
+		setup.ovfl_mode = 0;
+	else
+		setup.ovfl_mode = 1;
+	return (rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_CQ_SETUP, &setup));
+}
+
+#ifdef notyet
+int cxio_resize_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq)
+{
+	struct rdma_cq_setup setup;
+	setup.id = cq->cqid;
+	setup.base_addr = (u64) (cq->dma_addr);
+	setup.size = 1UL << cq->size_log2;
+	setup.credits = setup.size;
+	setup.credit_thres = setup.size;	/* TBD: overflow recovery */
+	setup.ovfl_mode = 1;
+	return (rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_CQ_SETUP, &setup));
+}
+#endif
+
+static u32 get_qpid(struct cxio_rdev *rdev_p, struct cxio_ucontext *uctx)
+{
+	struct cxio_qpid_list *entry;
+	u32 qpid;
+	int i;
+
+	mutex_lock(&uctx->lock);
+	if (!list_empty(&uctx->qpids)) {
+		entry = list_entry(uctx->qpids.next, struct cxio_qpid_list,
+				   entry);
+		list_del(&entry->entry);
+		qpid = entry->qpid;
+		kfree(entry);
+	} else {
+		qpid = cxio_hal_get_qpid(rdev_p->rscp);
+		if (!qpid)
+			goto out;
+		for (i = qpid+1; i & rdev_p->qpmask; i++) {
+			entry = kmalloc(sizeof *entry, GFP_KERNEL);
+			if (!entry)
+				break;
+			entry->qpid = i;
+			list_add_tail(&entry->entry, &uctx->qpids);
+		}
+	}
+out:
+	mutex_unlock(&uctx->lock);
+	PDBG("%s qpid 0x%x\n", __func__, qpid);
+	return qpid;
+}
+
+static void put_qpid(struct cxio_rdev *rdev_p, u32 qpid,
+		     struct cxio_ucontext *uctx)
+{
+	struct cxio_qpid_list *entry;
+
+	entry = kmalloc(sizeof *entry, GFP_KERNEL);
+	if (!entry)
+		return;
+	PDBG("%s qpid 0x%x\n", __func__, qpid);
+	entry->qpid = qpid;
+	mutex_lock(&uctx->lock);
+	list_add_tail(&entry->entry, &uctx->qpids);
+	mutex_unlock(&uctx->lock);
+}
+
+void cxio_release_ucontext(struct cxio_rdev *rdev_p, struct cxio_ucontext *uctx)
+{
+	struct list_head *pos, *nxt;
+	struct cxio_qpid_list *entry;
+
+	mutex_lock(&uctx->lock);
+	list_for_each_safe(pos, nxt, &uctx->qpids) {
+		entry = list_entry(pos, struct cxio_qpid_list, entry);
+		list_del_init(&entry->entry);
+		if (!(entry->qpid & rdev_p->qpmask))
+			cxio_hal_put_qpid(rdev_p->rscp, entry->qpid);
+		kfree(entry);
+	}
+	mutex_unlock(&uctx->lock);
+}
+
+void cxio_init_ucontext(struct cxio_rdev *rdev_p, struct cxio_ucontext *uctx)
+{
+	INIT_LIST_HEAD(&uctx->qpids);
+	mutex_init(&uctx->lock);
+}
+
+int cxio_create_qp(struct cxio_rdev *rdev_p, u32 kernel_domain,
+		   struct t3_wq *wq, struct cxio_ucontext *uctx)
+{
+	int depth = 1UL << wq->size_log2;
+	int rqsize = 1UL << wq->rq_size_log2;
+
+	wq->qpid = get_qpid(rdev_p, uctx);
+	if (!wq->qpid)
+		return -ENOMEM;
+
+	wq->rq = kzalloc(depth * sizeof(struct t3_swrq), GFP_KERNEL);
+	if (!wq->rq)
+		goto err1;
+
+	wq->rq_addr = cxio_hal_rqtpool_alloc(rdev_p, rqsize);
+	if (!wq->rq_addr)
+		goto err2;
+
+	wq->sq = kzalloc(depth * sizeof(struct t3_swsq), GFP_KERNEL);
+	if (!wq->sq)
+		goto err3;
+
+	wq->queue = dma_alloc_coherent(&(rdev_p->rnic_info.pdev->dev),
+					     depth * sizeof(union t3_wr),
+					     &(wq->dma_addr), GFP_KERNEL);
+	if (!wq->queue)
+		goto err4;
+
+	memset(wq->queue, 0, depth * sizeof(union t3_wr));
+	dma_unmap_addr_set(wq, mapping, wq->dma_addr);
+	wq->doorbell = (void __iomem *)rdev_p->rnic_info.kdb_addr;
+	if (!kernel_domain)
+		wq->udb = (u64)rdev_p->rnic_info.udbell_physbase +
+					(wq->qpid << rdev_p->qpshift);
+	wq->rdev = rdev_p;
+	PDBG("%s qpid 0x%x doorbell 0x%p udb 0x%llx\n", __func__,
+	     wq->qpid, wq->doorbell, (unsigned long long) wq->udb);
+	return 0;
+err4:
+	kfree(wq->sq);
+err3:
+	cxio_hal_rqtpool_free(rdev_p, wq->rq_addr, rqsize);
+err2:
+	kfree(wq->rq);
+err1:
+	put_qpid(rdev_p, wq->qpid, uctx);
+	return -ENOMEM;
+}
+
+int cxio_destroy_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq)
+{
+	int err;
+	err = cxio_hal_clear_cq_ctx(rdev_p, cq->cqid);
+	kfree(cq->sw_queue);
+	dma_free_coherent(&(rdev_p->rnic_info.pdev->dev),
+			  (1UL << (cq->size_log2))
+			  * sizeof(struct t3_cqe), cq->queue,
+			  dma_unmap_addr(cq, mapping));
+	cxio_hal_put_cqid(rdev_p->rscp, cq->cqid);
+	return err;
+}
+
+int cxio_destroy_qp(struct cxio_rdev *rdev_p, struct t3_wq *wq,
+		    struct cxio_ucontext *uctx)
+{
+	dma_free_coherent(&(rdev_p->rnic_info.pdev->dev),
+			  (1UL << (wq->size_log2))
+			  * sizeof(union t3_wr), wq->queue,
+			  dma_unmap_addr(wq, mapping));
+	kfree(wq->sq);
+	cxio_hal_rqtpool_free(rdev_p, wq->rq_addr, (1UL << wq->rq_size_log2));
+	kfree(wq->rq);
+	put_qpid(rdev_p, wq->qpid, uctx);
+	return 0;
+}
+
+static void insert_recv_cqe(struct t3_wq *wq, struct t3_cq *cq)
+{
+	struct t3_cqe cqe;
+
+	PDBG("%s wq %p cq %p sw_rptr 0x%x sw_wptr 0x%x\n", __func__,
+	     wq, cq, cq->sw_rptr, cq->sw_wptr);
+	memset(&cqe, 0, sizeof(cqe));
+	cqe.header = cpu_to_be32(V_CQE_STATUS(TPT_ERR_SWFLUSH) |
+			         V_CQE_OPCODE(T3_SEND) |
+				 V_CQE_TYPE(0) |
+				 V_CQE_SWCQE(1) |
+				 V_CQE_QPID(wq->qpid) |
+				 V_CQE_GENBIT(Q_GENBIT(cq->sw_wptr,
+						       cq->size_log2)));
+	*(cq->sw_queue + Q_PTR2IDX(cq->sw_wptr, cq->size_log2)) = cqe;
+	cq->sw_wptr++;
+}
+
+int cxio_flush_rq(struct t3_wq *wq, struct t3_cq *cq, int count)
+{
+	u32 ptr;
+	int flushed = 0;
+
+	PDBG("%s wq %p cq %p\n", __func__, wq, cq);
+
+	/* flush RQ */
+	PDBG("%s rq_rptr %u rq_wptr %u skip count %u\n", __func__,
+	    wq->rq_rptr, wq->rq_wptr, count);
+	ptr = wq->rq_rptr + count;
+	while (ptr++ != wq->rq_wptr) {
+		insert_recv_cqe(wq, cq);
+		flushed++;
+	}
+	return flushed;
+}
+
+static void insert_sq_cqe(struct t3_wq *wq, struct t3_cq *cq,
+		          struct t3_swsq *sqp)
+{
+	struct t3_cqe cqe;
+
+	PDBG("%s wq %p cq %p sw_rptr 0x%x sw_wptr 0x%x\n", __func__,
+	     wq, cq, cq->sw_rptr, cq->sw_wptr);
+	memset(&cqe, 0, sizeof(cqe));
+	cqe.header = cpu_to_be32(V_CQE_STATUS(TPT_ERR_SWFLUSH) |
+			         V_CQE_OPCODE(sqp->opcode) |
+			         V_CQE_TYPE(1) |
+			         V_CQE_SWCQE(1) |
+			         V_CQE_QPID(wq->qpid) |
+			         V_CQE_GENBIT(Q_GENBIT(cq->sw_wptr,
+						       cq->size_log2)));
+	cqe.u.scqe.wrid_hi = sqp->sq_wptr;
+
+	*(cq->sw_queue + Q_PTR2IDX(cq->sw_wptr, cq->size_log2)) = cqe;
+	cq->sw_wptr++;
+}
+
+int cxio_flush_sq(struct t3_wq *wq, struct t3_cq *cq, int count)
+{
+	__u32 ptr;
+	int flushed = 0;
+	struct t3_swsq *sqp = wq->sq + Q_PTR2IDX(wq->sq_rptr, wq->sq_size_log2);
+
+	ptr = wq->sq_rptr + count;
+	sqp = wq->sq + Q_PTR2IDX(ptr, wq->sq_size_log2);
+	while (ptr != wq->sq_wptr) {
+		sqp->signaled = 0;
+		insert_sq_cqe(wq, cq, sqp);
+		ptr++;
+		sqp = wq->sq + Q_PTR2IDX(ptr, wq->sq_size_log2);
+		flushed++;
+	}
+	return flushed;
+}
+
+/*
+ * Move all CQEs from the HWCQ into the SWCQ.
+ */
+void cxio_flush_hw_cq(struct t3_cq *cq)
+{
+	struct t3_cqe *cqe, *swcqe;
+
+	PDBG("%s cq %p cqid 0x%x\n", __func__, cq, cq->cqid);
+	cqe = cxio_next_hw_cqe(cq);
+	while (cqe) {
+		PDBG("%s flushing hwcq rptr 0x%x to swcq wptr 0x%x\n",
+		     __func__, cq->rptr, cq->sw_wptr);
+		swcqe = cq->sw_queue + Q_PTR2IDX(cq->sw_wptr, cq->size_log2);
+		*swcqe = *cqe;
+		swcqe->header |= cpu_to_be32(V_CQE_SWCQE(1));
+		cq->sw_wptr++;
+		cq->rptr++;
+		cqe = cxio_next_hw_cqe(cq);
+	}
+}
+
+static int cqe_completes_wr(struct t3_cqe *cqe, struct t3_wq *wq)
+{
+	if (CQE_OPCODE(*cqe) == T3_TERMINATE)
+		return 0;
+
+	if ((CQE_OPCODE(*cqe) == T3_RDMA_WRITE) && RQ_TYPE(*cqe))
+		return 0;
+
+	if ((CQE_OPCODE(*cqe) == T3_READ_RESP) && SQ_TYPE(*cqe))
+		return 0;
+
+	if (CQE_SEND_OPCODE(*cqe) && RQ_TYPE(*cqe) &&
+	    Q_EMPTY(wq->rq_rptr, wq->rq_wptr))
+		return 0;
+
+	return 1;
+}
+
+void cxio_count_scqes(struct t3_cq *cq, struct t3_wq *wq, int *count)
+{
+	struct t3_cqe *cqe;
+	u32 ptr;
+
+	*count = 0;
+	ptr = cq->sw_rptr;
+	while (!Q_EMPTY(ptr, cq->sw_wptr)) {
+		cqe = cq->sw_queue + (Q_PTR2IDX(ptr, cq->size_log2));
+		if ((SQ_TYPE(*cqe) ||
+		     ((CQE_OPCODE(*cqe) == T3_READ_RESP) && wq->oldest_read)) &&
+		    (CQE_QPID(*cqe) == wq->qpid))
+			(*count)++;
+		ptr++;
+	}
+	PDBG("%s cq %p count %d\n", __func__, cq, *count);
+}
+
+void cxio_count_rcqes(struct t3_cq *cq, struct t3_wq *wq, int *count)
+{
+	struct t3_cqe *cqe;
+	u32 ptr;
+
+	*count = 0;
+	PDBG("%s count zero %d\n", __func__, *count);
+	ptr = cq->sw_rptr;
+	while (!Q_EMPTY(ptr, cq->sw_wptr)) {
+		cqe = cq->sw_queue + (Q_PTR2IDX(ptr, cq->size_log2));
+		if (RQ_TYPE(*cqe) && (CQE_OPCODE(*cqe) != T3_READ_RESP) &&
+		    (CQE_QPID(*cqe) == wq->qpid) && cqe_completes_wr(cqe, wq))
+			(*count)++;
+		ptr++;
+	}
+	PDBG("%s cq %p count %d\n", __func__, cq, *count);
+}
+
+static int cxio_hal_init_ctrl_cq(struct cxio_rdev *rdev_p)
+{
+	struct rdma_cq_setup setup;
+	setup.id = 0;
+	setup.base_addr = 0;	/* NULL address */
+	setup.size = 1;		/* enable the CQ */
+	setup.credits = 0;
+
+	/* force SGE to redirect to RspQ and interrupt */
+	setup.credit_thres = 0;
+	setup.ovfl_mode = 1;
+	return (rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_CQ_SETUP, &setup));
+}
+
+static int cxio_hal_init_ctrl_qp(struct cxio_rdev *rdev_p)
+{
+	int err;
+	u64 sge_cmd, ctx0, ctx1;
+	u64 base_addr;
+	struct t3_modify_qp_wr *wqe;
+	struct sk_buff *skb;
+
+	skb = alloc_skb(sizeof(*wqe), GFP_KERNEL);
+	if (!skb) {
+		PDBG("%s alloc_skb failed\n", __func__);
+		return -ENOMEM;
+	}
+	err = cxio_hal_init_ctrl_cq(rdev_p);
+	if (err) {
+		PDBG("%s err %d initializing ctrl_cq\n", __func__, err);
+		goto err;
+	}
+	rdev_p->ctrl_qp.workq = dma_alloc_coherent(
+					&(rdev_p->rnic_info.pdev->dev),
+					(1 << T3_CTRL_QP_SIZE_LOG2) *
+					sizeof(union t3_wr),
+					&(rdev_p->ctrl_qp.dma_addr),
+					GFP_KERNEL);
+	if (!rdev_p->ctrl_qp.workq) {
+		PDBG("%s dma_alloc_coherent failed\n", __func__);
+		err = -ENOMEM;
+		goto err;
+	}
+	dma_unmap_addr_set(&rdev_p->ctrl_qp, mapping,
+			   rdev_p->ctrl_qp.dma_addr);
+	rdev_p->ctrl_qp.doorbell = (void __iomem *)rdev_p->rnic_info.kdb_addr;
+	memset(rdev_p->ctrl_qp.workq, 0,
+	       (1 << T3_CTRL_QP_SIZE_LOG2) * sizeof(union t3_wr));
+
+	mutex_init(&rdev_p->ctrl_qp.lock);
+	init_waitqueue_head(&rdev_p->ctrl_qp.waitq);
+
+	/* update HW Ctrl QP context */
+	base_addr = rdev_p->ctrl_qp.dma_addr;
+	base_addr >>= 12;
+	ctx0 = (V_EC_SIZE((1 << T3_CTRL_QP_SIZE_LOG2)) |
+		V_EC_BASE_LO((u32) base_addr & 0xffff));
+	ctx0 <<= 32;
+	ctx0 |= V_EC_CREDITS(FW_WR_NUM);
+	base_addr >>= 16;
+	ctx1 = (u32) base_addr;
+	base_addr >>= 32;
+	ctx1 |= ((u64) (V_EC_BASE_HI((u32) base_addr & 0xf) | V_EC_RESPQ(0) |
+			V_EC_TYPE(0) | V_EC_GEN(1) |
+			V_EC_UP_TOKEN(T3_CTL_QP_TID) | F_EC_VALID)) << 32;
+	wqe = (struct t3_modify_qp_wr *) skb_put(skb, sizeof(*wqe));
+	memset(wqe, 0, sizeof(*wqe));
+	build_fw_riwrh((struct fw_riwrh *) wqe, T3_WR_QP_MOD, 0, 0,
+		       T3_CTL_QP_TID, 7, T3_SOPEOP);
+	wqe->flags = cpu_to_be32(MODQP_WRITE_EC);
+	sge_cmd = (3ULL << 56) | FW_RI_SGEEC_START << 8 | 3;
+	wqe->sge_cmd = cpu_to_be64(sge_cmd);
+	wqe->ctx1 = cpu_to_be64(ctx1);
+	wqe->ctx0 = cpu_to_be64(ctx0);
+	PDBG("CtrlQP dma_addr 0x%llx workq %p size %d\n",
+	     (unsigned long long) rdev_p->ctrl_qp.dma_addr,
+	     rdev_p->ctrl_qp.workq, 1 << T3_CTRL_QP_SIZE_LOG2);
+	skb->priority = CPL_PRIORITY_CONTROL;
+	return iwch_cxgb3_ofld_send(rdev_p->t3cdev_p, skb);
+err:
+	kfree_skb(skb);
+	return err;
+}
+
+static int cxio_hal_destroy_ctrl_qp(struct cxio_rdev *rdev_p)
+{
+	dma_free_coherent(&(rdev_p->rnic_info.pdev->dev),
+			  (1UL << T3_CTRL_QP_SIZE_LOG2)
+			  * sizeof(union t3_wr), rdev_p->ctrl_qp.workq,
+			  dma_unmap_addr(&rdev_p->ctrl_qp, mapping));
+	return cxio_hal_clear_qp_ctx(rdev_p, T3_CTRL_QP_ID);
+}
+
+/* write len bytes of data into addr (32B aligned address)
+ * If data is NULL, clear len byte of memory to zero.
+ * caller acquires the ctrl_qp lock before the call
+ */
+static int cxio_hal_ctrl_qp_write_mem(struct cxio_rdev *rdev_p, u32 addr,
+				      u32 len, void *data)
+{
+	u32 i, nr_wqe, copy_len;
+	u8 *copy_data;
+	u8 wr_len, utx_len;	/* length in 8 byte flit */
+	enum t3_wr_flags flag;
+	__be64 *wqe;
+	u64 utx_cmd;
+	addr &= 0x7FFFFFF;
+	nr_wqe = len % 96 ? len / 96 + 1 : len / 96;	/* 96B max per WQE */
+	PDBG("%s wptr 0x%x rptr 0x%x len %d, nr_wqe %d data %p addr 0x%0x\n",
+	     __func__, rdev_p->ctrl_qp.wptr, rdev_p->ctrl_qp.rptr, len,
+	     nr_wqe, data, addr);
+	utx_len = 3;		/* in 32B unit */
+	for (i = 0; i < nr_wqe; i++) {
+		if (Q_FULL(rdev_p->ctrl_qp.rptr, rdev_p->ctrl_qp.wptr,
+		           T3_CTRL_QP_SIZE_LOG2)) {
+			PDBG("%s ctrl_qp full wtpr 0x%0x rptr 0x%0x, "
+			     "wait for more space i %d\n", __func__,
+			     rdev_p->ctrl_qp.wptr, rdev_p->ctrl_qp.rptr, i);
+			if (wait_event_interruptible(rdev_p->ctrl_qp.waitq,
+					     !Q_FULL(rdev_p->ctrl_qp.rptr,
+						     rdev_p->ctrl_qp.wptr,
+						     T3_CTRL_QP_SIZE_LOG2))) {
+				PDBG("%s ctrl_qp workq interrupted\n",
+				     __func__);
+				return -ERESTARTSYS;
+			}
+			PDBG("%s ctrl_qp wakeup, continue posting work request "
+			     "i %d\n", __func__, i);
+		}
+		wqe = (__be64 *)(rdev_p->ctrl_qp.workq + (rdev_p->ctrl_qp.wptr %
+						(1 << T3_CTRL_QP_SIZE_LOG2)));
+		flag = 0;
+		if (i == (nr_wqe - 1)) {
+			/* last WQE */
+			flag = T3_COMPLETION_FLAG;
+			if (len % 32)
+				utx_len = len / 32 + 1;
+			else
+				utx_len = len / 32;
+		}
+
+		/*
+		 * Force a CQE to return the credit to the workq in case
+		 * we posted more than half the max QP size of WRs
+		 */
+		if ((i != 0) &&
+		    (i % (((1 << T3_CTRL_QP_SIZE_LOG2)) >> 1) == 0)) {
+			flag = T3_COMPLETION_FLAG;
+			PDBG("%s force completion at i %d\n", __func__, i);
+		}
+
+		/* build the utx mem command */
+		wqe += (sizeof(struct t3_bypass_wr) >> 3);
+		utx_cmd = (T3_UTX_MEM_WRITE << 28) | (addr + i * 3);
+		utx_cmd <<= 32;
+		utx_cmd |= (utx_len << 28) | ((utx_len << 2) + 1);
+		*wqe = cpu_to_be64(utx_cmd);
+		wqe++;
+		copy_data = (u8 *) data + i * 96;
+		copy_len = len > 96 ? 96 : len;
+
+		/* clear memory content if data is NULL */
+		if (data)
+			memcpy(wqe, copy_data, copy_len);
+		else
+			memset(wqe, 0, copy_len);
+		if (copy_len % 32)
+			memset(((u8 *) wqe) + copy_len, 0,
+			       32 - (copy_len % 32));
+		wr_len = ((sizeof(struct t3_bypass_wr)) >> 3) + 1 +
+			 (utx_len << 2);
+		wqe = (__be64 *)(rdev_p->ctrl_qp.workq + (rdev_p->ctrl_qp.wptr %
+			      (1 << T3_CTRL_QP_SIZE_LOG2)));
+
+		/* wptr in the WRID[31:0] */
+		((union t3_wrid *)(wqe+1))->id0.low = rdev_p->ctrl_qp.wptr;
+
+		/*
+		 * This must be the last write with a memory barrier
+		 * for the genbit
+		 */
+		build_fw_riwrh((struct fw_riwrh *) wqe, T3_WR_BP, flag,
+			       Q_GENBIT(rdev_p->ctrl_qp.wptr,
+					T3_CTRL_QP_SIZE_LOG2), T3_CTRL_QP_ID,
+			       wr_len, T3_SOPEOP);
+		if (flag == T3_COMPLETION_FLAG)
+			ring_doorbell(rdev_p->ctrl_qp.doorbell, T3_CTRL_QP_ID);
+		len -= 96;
+		rdev_p->ctrl_qp.wptr++;
+	}
+	return 0;
+}
+
+/* IN: stag key, pdid, perm, zbva, to, len, page_size, pbl_size and pbl_addr
+ * OUT: stag index
+ * TBD: shared memory region support
+ */
+static int __cxio_tpt_op(struct cxio_rdev *rdev_p, u32 reset_tpt_entry,
+			 u32 *stag, u8 stag_state, u32 pdid,
+			 enum tpt_mem_type type, enum tpt_mem_perm perm,
+			 u32 zbva, u64 to, u32 len, u8 page_size,
+			 u32 pbl_size, u32 pbl_addr)
+{
+	int err;
+	struct tpt_entry tpt;
+	u32 stag_idx;
+	u32 wptr;
+
+	if (cxio_fatal_error(rdev_p))
+		return -EIO;
+
+	stag_state = stag_state > 0;
+	stag_idx = (*stag) >> 8;
+
+	if ((!reset_tpt_entry) && !(*stag != T3_STAG_UNSET)) {
+		stag_idx = cxio_hal_get_stag(rdev_p->rscp);
+		if (!stag_idx)
+			return -ENOMEM;
+		*stag = (stag_idx << 8) | ((*stag) & 0xFF);
+	}
+	PDBG("%s stag_state 0x%0x type 0x%0x pdid 0x%0x, stag_idx 0x%x\n",
+	     __func__, stag_state, type, pdid, stag_idx);
+
+	mutex_lock(&rdev_p->ctrl_qp.lock);
+
+	/* write TPT entry */
+	if (reset_tpt_entry)
+		memset(&tpt, 0, sizeof(tpt));
+	else {
+		tpt.valid_stag_pdid = cpu_to_be32(F_TPT_VALID |
+				V_TPT_STAG_KEY((*stag) & M_TPT_STAG_KEY) |
+				V_TPT_STAG_STATE(stag_state) |
+				V_TPT_STAG_TYPE(type) | V_TPT_PDID(pdid));
+		BUG_ON(page_size >= 28);
+		tpt.flags_pagesize_qpid = cpu_to_be32(V_TPT_PERM(perm) |
+			((perm & TPT_MW_BIND) ? F_TPT_MW_BIND_ENABLE : 0) |
+			V_TPT_ADDR_TYPE((zbva ? TPT_ZBTO : TPT_VATO)) |
+			V_TPT_PAGE_SIZE(page_size));
+		tpt.rsvd_pbl_addr = cpu_to_be32(V_TPT_PBL_ADDR(PBL_OFF(rdev_p, pbl_addr)>>3));
+		tpt.len = cpu_to_be32(len);
+		tpt.va_hi = cpu_to_be32((u32) (to >> 32));
+		tpt.va_low_or_fbo = cpu_to_be32((u32) (to & 0xFFFFFFFFULL));
+		tpt.rsvd_bind_cnt_or_pstag = 0;
+		tpt.rsvd_pbl_size = cpu_to_be32(V_TPT_PBL_SIZE(pbl_size >> 2));
+	}
+	err = cxio_hal_ctrl_qp_write_mem(rdev_p,
+				       stag_idx +
+				       (rdev_p->rnic_info.tpt_base >> 5),
+				       sizeof(tpt), &tpt);
+
+	/* release the stag index to free pool */
+	if (reset_tpt_entry)
+		cxio_hal_put_stag(rdev_p->rscp, stag_idx);
+
+	wptr = rdev_p->ctrl_qp.wptr;
+	mutex_unlock(&rdev_p->ctrl_qp.lock);
+	if (!err)
+		if (wait_event_interruptible(rdev_p->ctrl_qp.waitq,
+					     SEQ32_GE(rdev_p->ctrl_qp.rptr,
+						      wptr)))
+			return -ERESTARTSYS;
+	return err;
+}
+
+int cxio_write_pbl(struct cxio_rdev *rdev_p, __be64 *pbl,
+		   u32 pbl_addr, u32 pbl_size)
+{
+	u32 wptr;
+	int err;
+
+	PDBG("%s *pdb_addr 0x%x, pbl_base 0x%x, pbl_size %d\n",
+	     __func__, pbl_addr, rdev_p->rnic_info.pbl_base,
+	     pbl_size);
+
+	mutex_lock(&rdev_p->ctrl_qp.lock);
+	err = cxio_hal_ctrl_qp_write_mem(rdev_p, pbl_addr >> 5, pbl_size << 3,
+					 pbl);
+	wptr = rdev_p->ctrl_qp.wptr;
+	mutex_unlock(&rdev_p->ctrl_qp.lock);
+	if (err)
+		return err;
+
+	if (wait_event_interruptible(rdev_p->ctrl_qp.waitq,
+				     SEQ32_GE(rdev_p->ctrl_qp.rptr,
+					      wptr)))
+		return -ERESTARTSYS;
+
+	return 0;
+}
+
+int cxio_register_phys_mem(struct cxio_rdev *rdev_p, u32 *stag, u32 pdid,
+			   enum tpt_mem_perm perm, u32 zbva, u64 to, u32 len,
+			   u8 page_size, u32 pbl_size, u32 pbl_addr)
+{
+	*stag = T3_STAG_UNSET;
+	return __cxio_tpt_op(rdev_p, 0, stag, 1, pdid, TPT_NON_SHARED_MR, perm,
+			     zbva, to, len, page_size, pbl_size, pbl_addr);
+}
+
+int cxio_reregister_phys_mem(struct cxio_rdev *rdev_p, u32 *stag, u32 pdid,
+			   enum tpt_mem_perm perm, u32 zbva, u64 to, u32 len,
+			   u8 page_size, u32 pbl_size, u32 pbl_addr)
+{
+	return __cxio_tpt_op(rdev_p, 0, stag, 1, pdid, TPT_NON_SHARED_MR, perm,
+			     zbva, to, len, page_size, pbl_size, pbl_addr);
+}
+
+int cxio_dereg_mem(struct cxio_rdev *rdev_p, u32 stag, u32 pbl_size,
+		   u32 pbl_addr)
+{
+	return __cxio_tpt_op(rdev_p, 1, &stag, 0, 0, 0, 0, 0, 0ULL, 0, 0,
+			     pbl_size, pbl_addr);
+}
+
+int cxio_allocate_window(struct cxio_rdev *rdev_p, u32 * stag, u32 pdid)
+{
+	*stag = T3_STAG_UNSET;
+	return __cxio_tpt_op(rdev_p, 0, stag, 0, pdid, TPT_MW, 0, 0, 0ULL, 0, 0,
+			     0, 0);
+}
+
+int cxio_deallocate_window(struct cxio_rdev *rdev_p, u32 stag)
+{
+	return __cxio_tpt_op(rdev_p, 1, &stag, 0, 0, 0, 0, 0, 0ULL, 0, 0,
+			     0, 0);
+}
+
+int cxio_allocate_stag(struct cxio_rdev *rdev_p, u32 *stag, u32 pdid, u32 pbl_size, u32 pbl_addr)
+{
+	*stag = T3_STAG_UNSET;
+	return __cxio_tpt_op(rdev_p, 0, stag, 0, pdid, TPT_NON_SHARED_MR,
+			     0, 0, 0ULL, 0, 0, pbl_size, pbl_addr);
+}
+
+int cxio_rdma_init(struct cxio_rdev *rdev_p, struct t3_rdma_init_attr *attr)
+{
+	struct t3_rdma_init_wr *wqe;
+	struct sk_buff *skb = alloc_skb(sizeof(*wqe), GFP_ATOMIC);
+	if (!skb)
+		return -ENOMEM;
+	PDBG("%s rdev_p %p\n", __func__, rdev_p);
+	wqe = (struct t3_rdma_init_wr *) __skb_put(skb, sizeof(*wqe));
+	wqe->wrh.op_seop_flags = cpu_to_be32(V_FW_RIWR_OP(T3_WR_INIT));
+	wqe->wrh.gen_tid_len = cpu_to_be32(V_FW_RIWR_TID(attr->tid) |
+					   V_FW_RIWR_LEN(sizeof(*wqe) >> 3));
+	wqe->wrid.id1 = 0;
+	wqe->qpid = cpu_to_be32(attr->qpid);
+	wqe->pdid = cpu_to_be32(attr->pdid);
+	wqe->scqid = cpu_to_be32(attr->scqid);
+	wqe->rcqid = cpu_to_be32(attr->rcqid);
+	wqe->rq_addr = cpu_to_be32(attr->rq_addr - rdev_p->rnic_info.rqt_base);
+	wqe->rq_size = cpu_to_be32(attr->rq_size);
+	wqe->mpaattrs = attr->mpaattrs;
+	wqe->qpcaps = attr->qpcaps;
+	wqe->ulpdu_size = cpu_to_be16(attr->tcp_emss);
+	wqe->rqe_count = cpu_to_be16(attr->rqe_count);
+	wqe->flags_rtr_type = cpu_to_be16(attr->flags |
+					  V_RTR_TYPE(attr->rtr_type) |
+					  V_CHAN(attr->chan));
+	wqe->ord = cpu_to_be32(attr->ord);
+	wqe->ird = cpu_to_be32(attr->ird);
+	wqe->qp_dma_addr = cpu_to_be64(attr->qp_dma_addr);
+	wqe->qp_dma_size = cpu_to_be32(attr->qp_dma_size);
+	wqe->irs = cpu_to_be32(attr->irs);
+	skb->priority = 0;	/* 0=>ToeQ; 1=>CtrlQ */
+	return iwch_cxgb3_ofld_send(rdev_p->t3cdev_p, skb);
+}
+
+void cxio_register_ev_cb(cxio_hal_ev_callback_func_t ev_cb)
+{
+	cxio_ev_cb = ev_cb;
+}
+
+void cxio_unregister_ev_cb(cxio_hal_ev_callback_func_t ev_cb)
+{
+	cxio_ev_cb = NULL;
+}
+
+static int cxio_hal_ev_handler(struct t3cdev *t3cdev_p, struct sk_buff *skb)
+{
+	static int cnt;
+	struct cxio_rdev *rdev_p = NULL;
+	struct respQ_msg_t *rsp_msg = (struct respQ_msg_t *) skb->data;
+	PDBG("%d: %s cq_id 0x%x cq_ptr 0x%x genbit %0x overflow %0x an %0x"
+	     " se %0x notify %0x cqbranch %0x creditth %0x\n",
+	     cnt, __func__, RSPQ_CQID(rsp_msg), RSPQ_CQPTR(rsp_msg),
+	     RSPQ_GENBIT(rsp_msg), RSPQ_OVERFLOW(rsp_msg), RSPQ_AN(rsp_msg),
+	     RSPQ_SE(rsp_msg), RSPQ_NOTIFY(rsp_msg), RSPQ_CQBRANCH(rsp_msg),
+	     RSPQ_CREDIT_THRESH(rsp_msg));
+	PDBG("CQE: QPID 0x%0x genbit %0x type 0x%0x status 0x%0x opcode %d "
+	     "len 0x%0x wrid_hi_stag 0x%x wrid_low_msn 0x%x\n",
+	     CQE_QPID(rsp_msg->cqe), CQE_GENBIT(rsp_msg->cqe),
+	     CQE_TYPE(rsp_msg->cqe), CQE_STATUS(rsp_msg->cqe),
+	     CQE_OPCODE(rsp_msg->cqe), CQE_LEN(rsp_msg->cqe),
+	     CQE_WRID_HI(rsp_msg->cqe), CQE_WRID_LOW(rsp_msg->cqe));
+	rdev_p = (struct cxio_rdev *)t3cdev_p->ulp;
+	if (!rdev_p) {
+		PDBG("%s called by t3cdev %p with null ulp\n", __func__,
+		     t3cdev_p);
+		return 0;
+	}
+	if (CQE_QPID(rsp_msg->cqe) == T3_CTRL_QP_ID) {
+		rdev_p->ctrl_qp.rptr = CQE_WRID_LOW(rsp_msg->cqe) + 1;
+		wake_up_interruptible(&rdev_p->ctrl_qp.waitq);
+		dev_kfree_skb_irq(skb);
+	} else if (CQE_QPID(rsp_msg->cqe) == 0xfff8)
+		dev_kfree_skb_irq(skb);
+	else if (cxio_ev_cb)
+		(*cxio_ev_cb) (rdev_p, skb);
+	else
+		dev_kfree_skb_irq(skb);
+	cnt++;
+	return 0;
+}
+
+/* Caller takes care of locking if needed */
+int cxio_rdev_open(struct cxio_rdev *rdev_p)
+{
+	struct net_device *netdev_p = NULL;
+	int err = 0;
+	if (strlen(rdev_p->dev_name)) {
+		if (cxio_hal_find_rdev_by_name(rdev_p->dev_name)) {
+			return -EBUSY;
+		}
+		netdev_p = dev_get_by_name(&init_net, rdev_p->dev_name);
+		if (!netdev_p) {
+			return -EINVAL;
+		}
+		dev_put(netdev_p);
+	} else if (rdev_p->t3cdev_p) {
+		if (cxio_hal_find_rdev_by_t3cdev(rdev_p->t3cdev_p)) {
+			return -EBUSY;
+		}
+		netdev_p = rdev_p->t3cdev_p->lldev;
+		strncpy(rdev_p->dev_name, rdev_p->t3cdev_p->name,
+			T3_MAX_DEV_NAME_LEN);
+	} else {
+		PDBG("%s t3cdev_p or dev_name must be set\n", __func__);
+		return -EINVAL;
+	}
+
+	list_add_tail(&rdev_p->entry, &rdev_list);
+
+	PDBG("%s opening rnic dev %s\n", __func__, rdev_p->dev_name);
+	memset(&rdev_p->ctrl_qp, 0, sizeof(rdev_p->ctrl_qp));
+	if (!rdev_p->t3cdev_p)
+		rdev_p->t3cdev_p = dev2t3cdev(netdev_p);
+	rdev_p->t3cdev_p->ulp = (void *) rdev_p;
+
+	err = rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, GET_EMBEDDED_INFO,
+					 &(rdev_p->fw_info));
+	if (err) {
+		printk(KERN_ERR "%s t3cdev_p(%p)->ctl returned error %d.\n",
+		     __func__, rdev_p->t3cdev_p, err);
+		goto err1;
+	}
+	if (G_FW_VERSION_MAJOR(rdev_p->fw_info.fw_vers) != CXIO_FW_MAJ) {
+		printk(KERN_ERR MOD "fatal firmware version mismatch: "
+		       "need version %u but adapter has version %u\n",
+		       CXIO_FW_MAJ,
+		       G_FW_VERSION_MAJOR(rdev_p->fw_info.fw_vers));
+		err = -EINVAL;
+		goto err1;
+	}
+
+	err = rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_GET_PARAMS,
+					 &(rdev_p->rnic_info));
+	if (err) {
+		printk(KERN_ERR "%s t3cdev_p(%p)->ctl returned error %d.\n",
+		     __func__, rdev_p->t3cdev_p, err);
+		goto err1;
+	}
+	err = rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, GET_PORTS,
+				    &(rdev_p->port_info));
+	if (err) {
+		printk(KERN_ERR "%s t3cdev_p(%p)->ctl returned error %d.\n",
+		     __func__, rdev_p->t3cdev_p, err);
+		goto err1;
+	}
+
+	/*
+	 * qpshift is the number of bits to shift the qpid left in order
+	 * to get the correct address of the doorbell for that qp.
+	 */
+	cxio_init_ucontext(rdev_p, &rdev_p->uctx);
+	rdev_p->qpshift = PAGE_SHIFT -
+			  ilog2(65536 >>
+			            ilog2(rdev_p->rnic_info.udbell_len >>
+					      PAGE_SHIFT));
+	rdev_p->qpnr = rdev_p->rnic_info.udbell_len >> PAGE_SHIFT;
+	rdev_p->qpmask = (65536 >> ilog2(rdev_p->qpnr)) - 1;
+	PDBG("%s rnic %s info: tpt_base 0x%0x tpt_top 0x%0x num stags %d "
+	     "pbl_base 0x%0x pbl_top 0x%0x rqt_base 0x%0x, rqt_top 0x%0x\n",
+	     __func__, rdev_p->dev_name, rdev_p->rnic_info.tpt_base,
+	     rdev_p->rnic_info.tpt_top, cxio_num_stags(rdev_p),
+	     rdev_p->rnic_info.pbl_base,
+	     rdev_p->rnic_info.pbl_top, rdev_p->rnic_info.rqt_base,
+	     rdev_p->rnic_info.rqt_top);
+	PDBG("udbell_len 0x%0x udbell_physbase 0x%lx kdb_addr %p qpshift %lu "
+	     "qpnr %d qpmask 0x%x\n",
+	     rdev_p->rnic_info.udbell_len,
+	     rdev_p->rnic_info.udbell_physbase, rdev_p->rnic_info.kdb_addr,
+	     rdev_p->qpshift, rdev_p->qpnr, rdev_p->qpmask);
+
+	err = cxio_hal_init_ctrl_qp(rdev_p);
+	if (err) {
+		printk(KERN_ERR "%s error %d initializing ctrl_qp.\n",
+		       __func__, err);
+		goto err1;
+	}
+	err = cxio_hal_init_resource(rdev_p, cxio_num_stags(rdev_p), 0,
+				     0, T3_MAX_NUM_QP, T3_MAX_NUM_CQ,
+				     T3_MAX_NUM_PD);
+	if (err) {
+		printk(KERN_ERR "%s error %d initializing hal resources.\n",
+		       __func__, err);
+		goto err2;
+	}
+	err = cxio_hal_pblpool_create(rdev_p);
+	if (err) {
+		printk(KERN_ERR "%s error %d initializing pbl mem pool.\n",
+		       __func__, err);
+		goto err3;
+	}
+	err = cxio_hal_rqtpool_create(rdev_p);
+	if (err) {
+		printk(KERN_ERR "%s error %d initializing rqt mem pool.\n",
+		       __func__, err);
+		goto err4;
+	}
+	return 0;
+err4:
+	cxio_hal_pblpool_destroy(rdev_p);
+err3:
+	cxio_hal_destroy_resource(rdev_p->rscp);
+err2:
+	cxio_hal_destroy_ctrl_qp(rdev_p);
+err1:
+	rdev_p->t3cdev_p->ulp = NULL;
+	list_del(&rdev_p->entry);
+	return err;
+}
+
+void cxio_rdev_close(struct cxio_rdev *rdev_p)
+{
+	if (rdev_p) {
+		cxio_hal_pblpool_destroy(rdev_p);
+		cxio_hal_rqtpool_destroy(rdev_p);
+		list_del(&rdev_p->entry);
+		cxio_hal_destroy_ctrl_qp(rdev_p);
+		cxio_hal_destroy_resource(rdev_p->rscp);
+		rdev_p->t3cdev_p->ulp = NULL;
+	}
+}
+
+int __init cxio_hal_init(void)
+{
+	if (cxio_hal_init_rhdl_resource(T3_MAX_NUM_RI))
+		return -ENOMEM;
+	t3_register_cpl_handler(CPL_ASYNC_NOTIF, cxio_hal_ev_handler);
+	return 0;
+}
+
+void __exit cxio_hal_exit(void)
+{
+	struct cxio_rdev *rdev, *tmp;
+
+	t3_register_cpl_handler(CPL_ASYNC_NOTIF, NULL);
+	list_for_each_entry_safe(rdev, tmp, &rdev_list, entry)
+		cxio_rdev_close(rdev);
+	cxio_hal_destroy_rhdl_resource();
+}
+
+static void flush_completed_wrs(struct t3_wq *wq, struct t3_cq *cq)
+{
+	struct t3_swsq *sqp;
+	__u32 ptr = wq->sq_rptr;
+	int count = Q_COUNT(wq->sq_rptr, wq->sq_wptr);
+
+	sqp = wq->sq + Q_PTR2IDX(ptr, wq->sq_size_log2);
+	while (count--)
+		if (!sqp->signaled) {
+			ptr++;
+			sqp = wq->sq + Q_PTR2IDX(ptr,  wq->sq_size_log2);
+		} else if (sqp->complete) {
+
+			/*
+			 * Insert this completed cqe into the swcq.
+			 */
+			PDBG("%s moving cqe into swcq sq idx %ld cq idx %ld\n",
+			     __func__, Q_PTR2IDX(ptr,  wq->sq_size_log2),
+			     Q_PTR2IDX(cq->sw_wptr, cq->size_log2));
+			sqp->cqe.header |= htonl(V_CQE_SWCQE(1));
+			*(cq->sw_queue + Q_PTR2IDX(cq->sw_wptr, cq->size_log2))
+				= sqp->cqe;
+			cq->sw_wptr++;
+			sqp->signaled = 0;
+			break;
+		} else
+			break;
+}
+
+static void create_read_req_cqe(struct t3_wq *wq, struct t3_cqe *hw_cqe,
+				struct t3_cqe *read_cqe)
+{
+	read_cqe->u.scqe.wrid_hi = wq->oldest_read->sq_wptr;
+	read_cqe->len = wq->oldest_read->read_len;
+	read_cqe->header = htonl(V_CQE_QPID(CQE_QPID(*hw_cqe)) |
+				 V_CQE_SWCQE(SW_CQE(*hw_cqe)) |
+				 V_CQE_OPCODE(T3_READ_REQ) |
+				 V_CQE_TYPE(1));
+}
+
+/*
+ * Return a ptr to the next read wr in the SWSQ or NULL.
+ */
+static void advance_oldest_read(struct t3_wq *wq)
+{
+
+	u32 rptr = wq->oldest_read - wq->sq + 1;
+	u32 wptr = Q_PTR2IDX(wq->sq_wptr, wq->sq_size_log2);
+
+	while (Q_PTR2IDX(rptr, wq->sq_size_log2) != wptr) {
+		wq->oldest_read = wq->sq + Q_PTR2IDX(rptr, wq->sq_size_log2);
+
+		if (wq->oldest_read->opcode == T3_READ_REQ)
+			return;
+		rptr++;
+	}
+	wq->oldest_read = NULL;
+}
+
+/*
+ * cxio_poll_cq
+ *
+ * Caller must:
+ *     check the validity of the first CQE,
+ *     supply the wq assicated with the qpid.
+ *
+ * credit: cq credit to return to sge.
+ * cqe_flushed: 1 iff the CQE is flushed.
+ * cqe: copy of the polled CQE.
+ *
+ * return value:
+ *     0       CQE returned,
+ *    -1       CQE skipped, try again.
+ */
+int cxio_poll_cq(struct t3_wq *wq, struct t3_cq *cq, struct t3_cqe *cqe,
+		     u8 *cqe_flushed, u64 *cookie, u32 *credit)
+{
+	int ret = 0;
+	struct t3_cqe *hw_cqe, read_cqe;
+
+	*cqe_flushed = 0;
+	*credit = 0;
+	hw_cqe = cxio_next_cqe(cq);
+
+	PDBG("%s CQE OOO %d qpid 0x%0x genbit %d type %d status 0x%0x"
+	     " opcode 0x%0x len 0x%0x wrid_hi_stag 0x%x wrid_low_msn 0x%x\n",
+	     __func__, CQE_OOO(*hw_cqe), CQE_QPID(*hw_cqe),
+	     CQE_GENBIT(*hw_cqe), CQE_TYPE(*hw_cqe), CQE_STATUS(*hw_cqe),
+	     CQE_OPCODE(*hw_cqe), CQE_LEN(*hw_cqe), CQE_WRID_HI(*hw_cqe),
+	     CQE_WRID_LOW(*hw_cqe));
+
+	/*
+	 * skip cqe's not affiliated with a QP.
+	 */
+	if (wq == NULL) {
+		ret = -1;
+		goto skip_cqe;
+	}
+
+	/*
+	 * Gotta tweak READ completions:
+	 *	1) the cqe doesn't contain the sq_wptr from the wr.
+	 *	2) opcode not reflected from the wr.
+	 *	3) read_len not reflected from the wr.
+	 *	4) cq_type is RQ_TYPE not SQ_TYPE.
+	 */
+	if (RQ_TYPE(*hw_cqe) && (CQE_OPCODE(*hw_cqe) == T3_READ_RESP)) {
+
+		/*
+		 * If this is an unsolicited read response, then the read
+		 * was generated by the kernel driver as part of peer-2-peer
+		 * connection setup.  So ignore the completion.
+		 */
+		if (!wq->oldest_read) {
+			if (CQE_STATUS(*hw_cqe))
+				wq->error = 1;
+			ret = -1;
+			goto skip_cqe;
+		}
+
+		/*
+		 * Don't write to the HWCQ, so create a new read req CQE
+		 * in local memory.
+		 */
+		create_read_req_cqe(wq, hw_cqe, &read_cqe);
+		hw_cqe = &read_cqe;
+		advance_oldest_read(wq);
+	}
+
+	/*
+	 * T3A: Discard TERMINATE CQEs.
+	 */
+	if (CQE_OPCODE(*hw_cqe) == T3_TERMINATE) {
+		ret = -1;
+		wq->error = 1;
+		goto skip_cqe;
+	}
+
+	if (CQE_STATUS(*hw_cqe) || wq->error) {
+		*cqe_flushed = wq->error;
+		wq->error = 1;
+
+		/*
+		 * T3A inserts errors into the CQE.  We cannot return
+		 * these as work completions.
+		 */
+		/* incoming write failures */
+		if ((CQE_OPCODE(*hw_cqe) == T3_RDMA_WRITE)
+		     && RQ_TYPE(*hw_cqe)) {
+			ret = -1;
+			goto skip_cqe;
+		}
+		/* incoming read request failures */
+		if ((CQE_OPCODE(*hw_cqe) == T3_READ_RESP) && SQ_TYPE(*hw_cqe)) {
+			ret = -1;
+			goto skip_cqe;
+		}
+
+		/* incoming SEND with no receive posted failures */
+		if (CQE_SEND_OPCODE(*hw_cqe) && RQ_TYPE(*hw_cqe) &&
+		    Q_EMPTY(wq->rq_rptr, wq->rq_wptr)) {
+			ret = -1;
+			goto skip_cqe;
+		}
+		BUG_ON((*cqe_flushed == 0) && !SW_CQE(*hw_cqe));
+		goto proc_cqe;
+	}
+
+	/*
+	 * RECV completion.
+	 */
+	if (RQ_TYPE(*hw_cqe)) {
+
+		/*
+		 * HW only validates 4 bits of MSN.  So we must validate that
+		 * the MSN in the SEND is the next expected MSN.  If its not,
+		 * then we complete this with TPT_ERR_MSN and mark the wq in
+		 * error.
+		 */
+
+		if (Q_EMPTY(wq->rq_rptr, wq->rq_wptr)) {
+			wq->error = 1;
+			ret = -1;
+			goto skip_cqe;
+		}
+
+		if (unlikely((CQE_WRID_MSN(*hw_cqe) != (wq->rq_rptr + 1)))) {
+			wq->error = 1;
+			hw_cqe->header |= htonl(V_CQE_STATUS(TPT_ERR_MSN));
+			goto proc_cqe;
+		}
+		goto proc_cqe;
+	}
+
+	/*
+	 * If we get here its a send completion.
+	 *
+	 * Handle out of order completion. These get stuffed
+	 * in the SW SQ. Then the SW SQ is walked to move any
+	 * now in-order completions into the SW CQ.  This handles
+	 * 2 cases:
+	 *	1) reaping unsignaled WRs when the first subsequent
+	 *	   signaled WR is completed.
+	 *	2) out of order read completions.
+	 */
+	if (!SW_CQE(*hw_cqe) && (CQE_WRID_SQ_WPTR(*hw_cqe) != wq->sq_rptr)) {
+		struct t3_swsq *sqp;
+
+		PDBG("%s out of order completion going in swsq at idx %ld\n",
+		     __func__,
+		     Q_PTR2IDX(CQE_WRID_SQ_WPTR(*hw_cqe), wq->sq_size_log2));
+		sqp = wq->sq +
+		      Q_PTR2IDX(CQE_WRID_SQ_WPTR(*hw_cqe), wq->sq_size_log2);
+		sqp->cqe = *hw_cqe;
+		sqp->complete = 1;
+		ret = -1;
+		goto flush_wq;
+	}
+
+proc_cqe:
+	*cqe = *hw_cqe;
+
+	/*
+	 * Reap the associated WR(s) that are freed up with this
+	 * completion.
+	 */
+	if (SQ_TYPE(*hw_cqe)) {
+		wq->sq_rptr = CQE_WRID_SQ_WPTR(*hw_cqe);
+		PDBG("%s completing sq idx %ld\n", __func__,
+		     Q_PTR2IDX(wq->sq_rptr, wq->sq_size_log2));
+		*cookie = wq->sq[Q_PTR2IDX(wq->sq_rptr, wq->sq_size_log2)].wr_id;
+		wq->sq_rptr++;
+	} else {
+		PDBG("%s completing rq idx %ld\n", __func__,
+		     Q_PTR2IDX(wq->rq_rptr, wq->rq_size_log2));
+		*cookie = wq->rq[Q_PTR2IDX(wq->rq_rptr, wq->rq_size_log2)].wr_id;
+		if (wq->rq[Q_PTR2IDX(wq->rq_rptr, wq->rq_size_log2)].pbl_addr)
+			cxio_hal_pblpool_free(wq->rdev,
+				wq->rq[Q_PTR2IDX(wq->rq_rptr,
+				wq->rq_size_log2)].pbl_addr, T3_STAG0_PBL_SIZE);
+		BUG_ON(Q_EMPTY(wq->rq_rptr, wq->rq_wptr));
+		wq->rq_rptr++;
+	}
+
+flush_wq:
+	/*
+	 * Flush any completed cqes that are now in-order.
+	 */
+	flush_completed_wrs(wq, cq);
+
+skip_cqe:
+	if (SW_CQE(*hw_cqe)) {
+		PDBG("%s cq %p cqid 0x%x skip sw cqe sw_rptr 0x%x\n",
+		     __func__, cq, cq->cqid, cq->sw_rptr);
+		++cq->sw_rptr;
+	} else {
+		PDBG("%s cq %p cqid 0x%x skip hw cqe rptr 0x%x\n",
+		     __func__, cq, cq->cqid, cq->rptr);
+		++cq->rptr;
+
+		/*
+		 * T3A: compute credits.
+		 */
+		if (((cq->rptr - cq->wptr) > (1 << (cq->size_log2 - 1)))
+		    || ((cq->rptr - cq->wptr) >= 128)) {
+			*credit = cq->rptr - cq->wptr;
+			cq->wptr = cq->rptr;
+		}
+	}
+	return ret;
+}
diff --git a/drivers/infiniband/hw/cxgb3/cxio_hal.h b/drivers/infiniband/hw/cxgb3/cxio_hal.h
new file mode 100644
index 0000000..78fbe9f
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb3/cxio_hal.h
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2006 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef  __CXIO_HAL_H__
+#define  __CXIO_HAL_H__
+
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/kfifo.h>
+
+#include "t3_cpl.h"
+#include "t3cdev.h"
+#include "cxgb3_ctl_defs.h"
+#include "cxio_wr.h"
+
+#define T3_CTRL_QP_ID    FW_RI_SGEEC_START
+#define T3_CTL_QP_TID	 FW_RI_TID_START
+#define T3_CTRL_QP_SIZE_LOG2  8
+#define T3_CTRL_CQ_ID    0
+
+#define T3_MAX_NUM_RI (1<<15)
+#define T3_MAX_NUM_QP (1<<15)
+#define T3_MAX_NUM_CQ (1<<15)
+#define T3_MAX_NUM_PD (1<<15)
+#define T3_MAX_PBL_SIZE 256
+#define T3_MAX_RQ_SIZE 1024
+#define T3_MAX_QP_DEPTH (T3_MAX_RQ_SIZE-1)
+#define T3_MAX_CQ_DEPTH 65536
+#define T3_MAX_NUM_STAG (1<<15)
+#define T3_MAX_MR_SIZE 0x100000000ULL
+#define T3_PAGESIZE_MASK 0xffff000  /* 4KB-128MB */
+
+#define T3_STAG_UNSET 0xffffffff
+
+#define T3_MAX_DEV_NAME_LEN 32
+
+#define CXIO_FW_MAJ 7
+
+struct cxio_hal_ctrl_qp {
+	u32 wptr;
+	u32 rptr;
+	struct mutex lock;	/* for the wtpr, can sleep */
+	wait_queue_head_t waitq;/* wait for RspQ/CQE msg */
+	union t3_wr *workq;	/* the work request queue */
+	dma_addr_t dma_addr;	/* pci bus address of the workq */
+	DEFINE_DMA_UNMAP_ADDR(mapping);
+	void __iomem *doorbell;
+};
+
+struct cxio_hal_resource {
+	struct kfifo tpt_fifo;
+	spinlock_t tpt_fifo_lock;
+	struct kfifo qpid_fifo;
+	spinlock_t qpid_fifo_lock;
+	struct kfifo cqid_fifo;
+	spinlock_t cqid_fifo_lock;
+	struct kfifo pdid_fifo;
+	spinlock_t pdid_fifo_lock;
+};
+
+struct cxio_qpid_list {
+	struct list_head entry;
+	u32 qpid;
+};
+
+struct cxio_ucontext {
+	struct list_head qpids;
+	struct mutex lock;
+};
+
+struct cxio_rdev {
+	char dev_name[T3_MAX_DEV_NAME_LEN];
+	struct t3cdev *t3cdev_p;
+	struct rdma_info rnic_info;
+	struct adap_ports port_info;
+	struct cxio_hal_resource *rscp;
+	struct cxio_hal_ctrl_qp ctrl_qp;
+	void *ulp;
+	unsigned long qpshift;
+	u32 qpnr;
+	u32 qpmask;
+	struct cxio_ucontext uctx;
+	struct gen_pool *pbl_pool;
+	struct gen_pool *rqt_pool;
+	struct list_head entry;
+	struct ch_embedded_info fw_info;
+	u32	flags;
+#define	CXIO_ERROR_FATAL	1
+};
+
+static inline int cxio_fatal_error(struct cxio_rdev *rdev_p)
+{
+	return rdev_p->flags & CXIO_ERROR_FATAL;
+}
+
+static inline int cxio_num_stags(struct cxio_rdev *rdev_p)
+{
+	return min((int)T3_MAX_NUM_STAG, (int)((rdev_p->rnic_info.tpt_top - rdev_p->rnic_info.tpt_base) >> 5));
+}
+
+typedef void (*cxio_hal_ev_callback_func_t) (struct cxio_rdev * rdev_p,
+					     struct sk_buff * skb);
+
+#define RSPQ_CQID(rsp) (be32_to_cpu(rsp->cq_ptrid) & 0xffff)
+#define RSPQ_CQPTR(rsp) ((be32_to_cpu(rsp->cq_ptrid) >> 16) & 0xffff)
+#define RSPQ_GENBIT(rsp) ((be32_to_cpu(rsp->flags) >> 16) & 1)
+#define RSPQ_OVERFLOW(rsp) ((be32_to_cpu(rsp->flags) >> 17) & 1)
+#define RSPQ_AN(rsp) ((be32_to_cpu(rsp->flags) >> 18) & 1)
+#define RSPQ_SE(rsp) ((be32_to_cpu(rsp->flags) >> 19) & 1)
+#define RSPQ_NOTIFY(rsp) ((be32_to_cpu(rsp->flags) >> 20) & 1)
+#define RSPQ_CQBRANCH(rsp) ((be32_to_cpu(rsp->flags) >> 21) & 1)
+#define RSPQ_CREDIT_THRESH(rsp) ((be32_to_cpu(rsp->flags) >> 22) & 1)
+
+struct respQ_msg_t {
+	__be32 flags;		/* flit 0 */
+	__be32 cq_ptrid;
+	__be64 rsvd;		/* flit 1 */
+	struct t3_cqe cqe;	/* flits 2-3 */
+};
+
+enum t3_cq_opcode {
+	CQ_ARM_AN = 0x2,
+	CQ_ARM_SE = 0x6,
+	CQ_FORCE_AN = 0x3,
+	CQ_CREDIT_UPDATE = 0x7
+};
+
+int cxio_rdev_open(struct cxio_rdev *rdev);
+void cxio_rdev_close(struct cxio_rdev *rdev);
+int cxio_hal_cq_op(struct cxio_rdev *rdev, struct t3_cq *cq,
+		   enum t3_cq_opcode op, u32 credit);
+int cxio_create_cq(struct cxio_rdev *rdev, struct t3_cq *cq, int kernel);
+int cxio_destroy_cq(struct cxio_rdev *rdev, struct t3_cq *cq);
+int cxio_resize_cq(struct cxio_rdev *rdev, struct t3_cq *cq);
+void cxio_release_ucontext(struct cxio_rdev *rdev, struct cxio_ucontext *uctx);
+void cxio_init_ucontext(struct cxio_rdev *rdev, struct cxio_ucontext *uctx);
+int cxio_create_qp(struct cxio_rdev *rdev, u32 kernel_domain, struct t3_wq *wq,
+		   struct cxio_ucontext *uctx);
+int cxio_destroy_qp(struct cxio_rdev *rdev, struct t3_wq *wq,
+		    struct cxio_ucontext *uctx);
+int cxio_peek_cq(struct t3_wq *wr, struct t3_cq *cq, int opcode);
+int cxio_write_pbl(struct cxio_rdev *rdev_p, __be64 *pbl,
+		   u32 pbl_addr, u32 pbl_size);
+int cxio_register_phys_mem(struct cxio_rdev *rdev, u32 * stag, u32 pdid,
+			   enum tpt_mem_perm perm, u32 zbva, u64 to, u32 len,
+			   u8 page_size, u32 pbl_size, u32 pbl_addr);
+int cxio_reregister_phys_mem(struct cxio_rdev *rdev, u32 * stag, u32 pdid,
+			   enum tpt_mem_perm perm, u32 zbva, u64 to, u32 len,
+			   u8 page_size, u32 pbl_size, u32 pbl_addr);
+int cxio_dereg_mem(struct cxio_rdev *rdev, u32 stag, u32 pbl_size,
+		   u32 pbl_addr);
+int cxio_allocate_window(struct cxio_rdev *rdev, u32 * stag, u32 pdid);
+int cxio_allocate_stag(struct cxio_rdev *rdev, u32 *stag, u32 pdid, u32 pbl_size, u32 pbl_addr);
+int cxio_deallocate_window(struct cxio_rdev *rdev, u32 stag);
+int cxio_rdma_init(struct cxio_rdev *rdev, struct t3_rdma_init_attr *attr);
+void cxio_register_ev_cb(cxio_hal_ev_callback_func_t ev_cb);
+void cxio_unregister_ev_cb(cxio_hal_ev_callback_func_t ev_cb);
+u32 cxio_hal_get_pdid(struct cxio_hal_resource *rscp);
+void cxio_hal_put_pdid(struct cxio_hal_resource *rscp, u32 pdid);
+int __init cxio_hal_init(void);
+void __exit cxio_hal_exit(void);
+int cxio_flush_rq(struct t3_wq *wq, struct t3_cq *cq, int count);
+int cxio_flush_sq(struct t3_wq *wq, struct t3_cq *cq, int count);
+void cxio_count_rcqes(struct t3_cq *cq, struct t3_wq *wq, int *count);
+void cxio_count_scqes(struct t3_cq *cq, struct t3_wq *wq, int *count);
+void cxio_flush_hw_cq(struct t3_cq *cq);
+int cxio_poll_cq(struct t3_wq *wq, struct t3_cq *cq, struct t3_cqe *cqe,
+		     u8 *cqe_flushed, u64 *cookie, u32 *credit);
+int iwch_cxgb3_ofld_send(struct t3cdev *tdev, struct sk_buff *skb);
+
+#define MOD "iw_cxgb3: "
+#define PDBG(fmt, args...) pr_debug(MOD fmt, ## args)
+
+#ifdef DEBUG
+void cxio_dump_tpt(struct cxio_rdev *rev, u32 stag);
+void cxio_dump_pbl(struct cxio_rdev *rev, u32 pbl_addr, uint len, u8 shift);
+void cxio_dump_wqe(union t3_wr *wqe);
+void cxio_dump_wce(struct t3_cqe *wce);
+void cxio_dump_rqt(struct cxio_rdev *rdev, u32 hwtid, int nents);
+void cxio_dump_tcb(struct cxio_rdev *rdev, u32 hwtid);
+#endif
+
+#endif
diff --git a/drivers/infiniband/hw/cxgb3/cxio_resource.c b/drivers/infiniband/hw/cxgb3/cxio_resource.c
new file mode 100644
index 0000000..c40088e
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb3/cxio_resource.c
@@ -0,0 +1,343 @@
+/*
+ * Copyright (c) 2006 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/* Crude resource management */
+#include <linux/kernel.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+#include <linux/kfifo.h>
+#include <linux/spinlock.h>
+#include <linux/errno.h>
+#include "cxio_resource.h"
+#include "cxio_hal.h"
+
+static struct kfifo rhdl_fifo;
+static spinlock_t rhdl_fifo_lock;
+
+#define RANDOM_SIZE 16
+
+static int __cxio_init_resource_fifo(struct kfifo *fifo,
+				   spinlock_t *fifo_lock,
+				   u32 nr, u32 skip_low,
+				   u32 skip_high,
+				   int random)
+{
+	u32 i, j, entry = 0, idx;
+	u32 random_bytes;
+	u32 rarray[16];
+	spin_lock_init(fifo_lock);
+
+	if (kfifo_alloc(fifo, nr * sizeof(u32), GFP_KERNEL))
+		return -ENOMEM;
+
+	for (i = 0; i < skip_low + skip_high; i++)
+		kfifo_in(fifo, (unsigned char *) &entry, sizeof(u32));
+	if (random) {
+		j = 0;
+		random_bytes = prandom_u32();
+		for (i = 0; i < RANDOM_SIZE; i++)
+			rarray[i] = i + skip_low;
+		for (i = skip_low + RANDOM_SIZE; i < nr - skip_high; i++) {
+			if (j >= RANDOM_SIZE) {
+				j = 0;
+				random_bytes = prandom_u32();
+			}
+			idx = (random_bytes >> (j * 2)) & 0xF;
+			kfifo_in(fifo,
+				(unsigned char *) &rarray[idx],
+				sizeof(u32));
+			rarray[idx] = i;
+			j++;
+		}
+		for (i = 0; i < RANDOM_SIZE; i++)
+			kfifo_in(fifo,
+				(unsigned char *) &rarray[i],
+				sizeof(u32));
+	} else
+		for (i = skip_low; i < nr - skip_high; i++)
+			kfifo_in(fifo, (unsigned char *) &i, sizeof(u32));
+
+	for (i = 0; i < skip_low + skip_high; i++)
+		if (kfifo_out_locked(fifo, (unsigned char *) &entry,
+				sizeof(u32), fifo_lock) != sizeof(u32))
+					break;
+	return 0;
+}
+
+static int cxio_init_resource_fifo(struct kfifo *fifo, spinlock_t * fifo_lock,
+				   u32 nr, u32 skip_low, u32 skip_high)
+{
+	return (__cxio_init_resource_fifo(fifo, fifo_lock, nr, skip_low,
+					  skip_high, 0));
+}
+
+static int cxio_init_resource_fifo_random(struct kfifo *fifo,
+				   spinlock_t * fifo_lock,
+				   u32 nr, u32 skip_low, u32 skip_high)
+{
+
+	return (__cxio_init_resource_fifo(fifo, fifo_lock, nr, skip_low,
+					  skip_high, 1));
+}
+
+static int cxio_init_qpid_fifo(struct cxio_rdev *rdev_p)
+{
+	u32 i;
+
+	spin_lock_init(&rdev_p->rscp->qpid_fifo_lock);
+
+	if (kfifo_alloc(&rdev_p->rscp->qpid_fifo, T3_MAX_NUM_QP * sizeof(u32),
+					      GFP_KERNEL))
+		return -ENOMEM;
+
+	for (i = 16; i < T3_MAX_NUM_QP; i++)
+		if (!(i & rdev_p->qpmask))
+			kfifo_in(&rdev_p->rscp->qpid_fifo,
+				    (unsigned char *) &i, sizeof(u32));
+	return 0;
+}
+
+int cxio_hal_init_rhdl_resource(u32 nr_rhdl)
+{
+	return cxio_init_resource_fifo(&rhdl_fifo, &rhdl_fifo_lock, nr_rhdl, 1,
+				       0);
+}
+
+void cxio_hal_destroy_rhdl_resource(void)
+{
+	kfifo_free(&rhdl_fifo);
+}
+
+/* nr_* must be power of 2 */
+int cxio_hal_init_resource(struct cxio_rdev *rdev_p,
+			   u32 nr_tpt, u32 nr_pbl,
+			   u32 nr_rqt, u32 nr_qpid, u32 nr_cqid, u32 nr_pdid)
+{
+	int err = 0;
+	struct cxio_hal_resource *rscp;
+
+	rscp = kmalloc(sizeof(*rscp), GFP_KERNEL);
+	if (!rscp)
+		return -ENOMEM;
+	rdev_p->rscp = rscp;
+	err = cxio_init_resource_fifo_random(&rscp->tpt_fifo,
+				      &rscp->tpt_fifo_lock,
+				      nr_tpt, 1, 0);
+	if (err)
+		goto tpt_err;
+	err = cxio_init_qpid_fifo(rdev_p);
+	if (err)
+		goto qpid_err;
+	err = cxio_init_resource_fifo(&rscp->cqid_fifo, &rscp->cqid_fifo_lock,
+				      nr_cqid, 1, 0);
+	if (err)
+		goto cqid_err;
+	err = cxio_init_resource_fifo(&rscp->pdid_fifo, &rscp->pdid_fifo_lock,
+				      nr_pdid, 1, 0);
+	if (err)
+		goto pdid_err;
+	return 0;
+pdid_err:
+	kfifo_free(&rscp->cqid_fifo);
+cqid_err:
+	kfifo_free(&rscp->qpid_fifo);
+qpid_err:
+	kfifo_free(&rscp->tpt_fifo);
+tpt_err:
+	return -ENOMEM;
+}
+
+/*
+ * returns 0 if no resource available
+ */
+static u32 cxio_hal_get_resource(struct kfifo *fifo, spinlock_t * lock)
+{
+	u32 entry;
+	if (kfifo_out_locked(fifo, (unsigned char *) &entry, sizeof(u32), lock))
+		return entry;
+	else
+		return 0;	/* fifo emptry */
+}
+
+static void cxio_hal_put_resource(struct kfifo *fifo, spinlock_t * lock,
+		u32 entry)
+{
+	BUG_ON(
+	kfifo_in_locked(fifo, (unsigned char *) &entry, sizeof(u32), lock)
+	== 0);
+}
+
+u32 cxio_hal_get_stag(struct cxio_hal_resource *rscp)
+{
+	return cxio_hal_get_resource(&rscp->tpt_fifo, &rscp->tpt_fifo_lock);
+}
+
+void cxio_hal_put_stag(struct cxio_hal_resource *rscp, u32 stag)
+{
+	cxio_hal_put_resource(&rscp->tpt_fifo, &rscp->tpt_fifo_lock, stag);
+}
+
+u32 cxio_hal_get_qpid(struct cxio_hal_resource *rscp)
+{
+	u32 qpid = cxio_hal_get_resource(&rscp->qpid_fifo,
+			&rscp->qpid_fifo_lock);
+	PDBG("%s qpid 0x%x\n", __func__, qpid);
+	return qpid;
+}
+
+void cxio_hal_put_qpid(struct cxio_hal_resource *rscp, u32 qpid)
+{
+	PDBG("%s qpid 0x%x\n", __func__, qpid);
+	cxio_hal_put_resource(&rscp->qpid_fifo, &rscp->qpid_fifo_lock, qpid);
+}
+
+u32 cxio_hal_get_cqid(struct cxio_hal_resource *rscp)
+{
+	return cxio_hal_get_resource(&rscp->cqid_fifo, &rscp->cqid_fifo_lock);
+}
+
+void cxio_hal_put_cqid(struct cxio_hal_resource *rscp, u32 cqid)
+{
+	cxio_hal_put_resource(&rscp->cqid_fifo, &rscp->cqid_fifo_lock, cqid);
+}
+
+u32 cxio_hal_get_pdid(struct cxio_hal_resource *rscp)
+{
+	return cxio_hal_get_resource(&rscp->pdid_fifo, &rscp->pdid_fifo_lock);
+}
+
+void cxio_hal_put_pdid(struct cxio_hal_resource *rscp, u32 pdid)
+{
+	cxio_hal_put_resource(&rscp->pdid_fifo, &rscp->pdid_fifo_lock, pdid);
+}
+
+void cxio_hal_destroy_resource(struct cxio_hal_resource *rscp)
+{
+	kfifo_free(&rscp->tpt_fifo);
+	kfifo_free(&rscp->cqid_fifo);
+	kfifo_free(&rscp->qpid_fifo);
+	kfifo_free(&rscp->pdid_fifo);
+	kfree(rscp);
+}
+
+/*
+ * PBL Memory Manager.  Uses Linux generic allocator.
+ */
+
+#define MIN_PBL_SHIFT 8			/* 256B == min PBL size (32 entries) */
+
+u32 cxio_hal_pblpool_alloc(struct cxio_rdev *rdev_p, int size)
+{
+	unsigned long addr = gen_pool_alloc(rdev_p->pbl_pool, size);
+	PDBG("%s addr 0x%x size %d\n", __func__, (u32)addr, size);
+	return (u32)addr;
+}
+
+void cxio_hal_pblpool_free(struct cxio_rdev *rdev_p, u32 addr, int size)
+{
+	PDBG("%s addr 0x%x size %d\n", __func__, addr, size);
+	gen_pool_free(rdev_p->pbl_pool, (unsigned long)addr, size);
+}
+
+int cxio_hal_pblpool_create(struct cxio_rdev *rdev_p)
+{
+	unsigned pbl_start, pbl_chunk;
+
+	rdev_p->pbl_pool = gen_pool_create(MIN_PBL_SHIFT, -1);
+	if (!rdev_p->pbl_pool)
+		return -ENOMEM;
+
+	pbl_start = rdev_p->rnic_info.pbl_base;
+	pbl_chunk = rdev_p->rnic_info.pbl_top - pbl_start + 1;
+
+	while (pbl_start < rdev_p->rnic_info.pbl_top) {
+		pbl_chunk = min(rdev_p->rnic_info.pbl_top - pbl_start + 1,
+				pbl_chunk);
+		if (gen_pool_add(rdev_p->pbl_pool, pbl_start, pbl_chunk, -1)) {
+			PDBG("%s failed to add PBL chunk (%x/%x)\n",
+			     __func__, pbl_start, pbl_chunk);
+			if (pbl_chunk <= 1024 << MIN_PBL_SHIFT) {
+				printk(KERN_WARNING MOD "%s: Failed to add all PBL chunks (%x/%x)\n",
+				       __func__, pbl_start, rdev_p->rnic_info.pbl_top - pbl_start);
+				return 0;
+			}
+			pbl_chunk >>= 1;
+		} else {
+			PDBG("%s added PBL chunk (%x/%x)\n",
+			     __func__, pbl_start, pbl_chunk);
+			pbl_start += pbl_chunk;
+		}
+	}
+
+	return 0;
+}
+
+void cxio_hal_pblpool_destroy(struct cxio_rdev *rdev_p)
+{
+	gen_pool_destroy(rdev_p->pbl_pool);
+}
+
+/*
+ * RQT Memory Manager.  Uses Linux generic allocator.
+ */
+
+#define MIN_RQT_SHIFT 10	/* 1KB == mini RQT size (16 entries) */
+#define RQT_CHUNK 2*1024*1024
+
+u32 cxio_hal_rqtpool_alloc(struct cxio_rdev *rdev_p, int size)
+{
+	unsigned long addr = gen_pool_alloc(rdev_p->rqt_pool, size << 6);
+	PDBG("%s addr 0x%x size %d\n", __func__, (u32)addr, size << 6);
+	return (u32)addr;
+}
+
+void cxio_hal_rqtpool_free(struct cxio_rdev *rdev_p, u32 addr, int size)
+{
+	PDBG("%s addr 0x%x size %d\n", __func__, addr, size << 6);
+	gen_pool_free(rdev_p->rqt_pool, (unsigned long)addr, size << 6);
+}
+
+int cxio_hal_rqtpool_create(struct cxio_rdev *rdev_p)
+{
+	unsigned long i;
+	rdev_p->rqt_pool = gen_pool_create(MIN_RQT_SHIFT, -1);
+	if (rdev_p->rqt_pool)
+		for (i = rdev_p->rnic_info.rqt_base;
+		     i <= rdev_p->rnic_info.rqt_top - RQT_CHUNK + 1;
+		     i += RQT_CHUNK)
+			gen_pool_add(rdev_p->rqt_pool, i, RQT_CHUNK, -1);
+	return rdev_p->rqt_pool ? 0 : -ENOMEM;
+}
+
+void cxio_hal_rqtpool_destroy(struct cxio_rdev *rdev_p)
+{
+	gen_pool_destroy(rdev_p->rqt_pool);
+}
diff --git a/drivers/infiniband/hw/cxgb3/cxio_resource.h b/drivers/infiniband/hw/cxgb3/cxio_resource.h
new file mode 100644
index 0000000..a2703a3
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb3/cxio_resource.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2006 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __CXIO_RESOURCE_H__
+#define __CXIO_RESOURCE_H__
+
+#include <linux/kernel.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+#include <linux/kfifo.h>
+#include <linux/spinlock.h>
+#include <linux/errno.h>
+#include <linux/genalloc.h>
+#include "cxio_hal.h"
+
+extern int cxio_hal_init_rhdl_resource(u32 nr_rhdl);
+extern void cxio_hal_destroy_rhdl_resource(void);
+extern int cxio_hal_init_resource(struct cxio_rdev *rdev_p,
+				  u32 nr_tpt, u32 nr_pbl,
+				  u32 nr_rqt, u32 nr_qpid, u32 nr_cqid,
+				  u32 nr_pdid);
+extern u32 cxio_hal_get_stag(struct cxio_hal_resource *rscp);
+extern void cxio_hal_put_stag(struct cxio_hal_resource *rscp, u32 stag);
+extern u32 cxio_hal_get_qpid(struct cxio_hal_resource *rscp);
+extern void cxio_hal_put_qpid(struct cxio_hal_resource *rscp, u32 qpid);
+extern u32 cxio_hal_get_cqid(struct cxio_hal_resource *rscp);
+extern void cxio_hal_put_cqid(struct cxio_hal_resource *rscp, u32 cqid);
+extern void cxio_hal_destroy_resource(struct cxio_hal_resource *rscp);
+
+#define PBL_OFF(rdev_p, a) ( (a) - (rdev_p)->rnic_info.pbl_base )
+extern int cxio_hal_pblpool_create(struct cxio_rdev *rdev_p);
+extern void cxio_hal_pblpool_destroy(struct cxio_rdev *rdev_p);
+extern u32 cxio_hal_pblpool_alloc(struct cxio_rdev *rdev_p, int size);
+extern void cxio_hal_pblpool_free(struct cxio_rdev *rdev_p, u32 addr, int size);
+
+#define RQT_OFF(rdev_p, a) ( (a) - (rdev_p)->rnic_info.rqt_base )
+extern int cxio_hal_rqtpool_create(struct cxio_rdev *rdev_p);
+extern void cxio_hal_rqtpool_destroy(struct cxio_rdev *rdev_p);
+extern u32 cxio_hal_rqtpool_alloc(struct cxio_rdev *rdev_p, int size);
+extern void cxio_hal_rqtpool_free(struct cxio_rdev *rdev_p, u32 addr, int size);
+#endif
diff --git a/drivers/infiniband/hw/cxgb3/cxio_wr.h b/drivers/infiniband/hw/cxgb3/cxio_wr.h
new file mode 100644
index 0000000..83d2e19
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb3/cxio_wr.h
@@ -0,0 +1,802 @@
+/*
+ * Copyright (c) 2006 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __CXIO_WR_H__
+#define __CXIO_WR_H__
+
+#include <asm/io.h>
+#include <linux/pci.h>
+#include <linux/timer.h>
+#include "firmware_exports.h"
+
+#define T3_MAX_SGE      4
+#define T3_MAX_INLINE	64
+#define T3_STAG0_PBL_SIZE (2 * T3_MAX_SGE << 3)
+#define T3_STAG0_MAX_PBE_LEN (128 * 1024 * 1024)
+#define T3_STAG0_PAGE_SHIFT 15
+
+#define Q_EMPTY(rptr,wptr) ((rptr)==(wptr))
+#define Q_FULL(rptr,wptr,size_log2)  ( (((wptr)-(rptr))>>(size_log2)) && \
+				       ((rptr)!=(wptr)) )
+#define Q_GENBIT(ptr,size_log2) (!(((ptr)>>size_log2)&0x1))
+#define Q_FREECNT(rptr,wptr,size_log2) ((1UL<<size_log2)-((wptr)-(rptr)))
+#define Q_COUNT(rptr,wptr) ((wptr)-(rptr))
+#define Q_PTR2IDX(ptr,size_log2) (ptr & ((1UL<<size_log2)-1))
+
+static inline void ring_doorbell(void __iomem *doorbell, u32 qpid)
+{
+	writel(((1<<31) | qpid), doorbell);
+}
+
+#define SEQ32_GE(x,y) (!( (((u32) (x)) - ((u32) (y))) & 0x80000000 ))
+
+enum t3_wr_flags {
+	T3_COMPLETION_FLAG = 0x01,
+	T3_NOTIFY_FLAG = 0x02,
+	T3_SOLICITED_EVENT_FLAG = 0x04,
+	T3_READ_FENCE_FLAG = 0x08,
+	T3_LOCAL_FENCE_FLAG = 0x10
+} __attribute__ ((packed));
+
+enum t3_wr_opcode {
+	T3_WR_BP = FW_WROPCODE_RI_BYPASS,
+	T3_WR_SEND = FW_WROPCODE_RI_SEND,
+	T3_WR_WRITE = FW_WROPCODE_RI_RDMA_WRITE,
+	T3_WR_READ = FW_WROPCODE_RI_RDMA_READ,
+	T3_WR_INV_STAG = FW_WROPCODE_RI_LOCAL_INV,
+	T3_WR_BIND = FW_WROPCODE_RI_BIND_MW,
+	T3_WR_RCV = FW_WROPCODE_RI_RECEIVE,
+	T3_WR_INIT = FW_WROPCODE_RI_RDMA_INIT,
+	T3_WR_QP_MOD = FW_WROPCODE_RI_MODIFY_QP,
+	T3_WR_FASTREG = FW_WROPCODE_RI_FASTREGISTER_MR
+} __attribute__ ((packed));
+
+enum t3_rdma_opcode {
+	T3_RDMA_WRITE,		/* IETF RDMAP v1.0 ... */
+	T3_READ_REQ,
+	T3_READ_RESP,
+	T3_SEND,
+	T3_SEND_WITH_INV,
+	T3_SEND_WITH_SE,
+	T3_SEND_WITH_SE_INV,
+	T3_TERMINATE,
+	T3_RDMA_INIT,		/* CHELSIO RI specific ... */
+	T3_BIND_MW,
+	T3_FAST_REGISTER,
+	T3_LOCAL_INV,
+	T3_QP_MOD,
+	T3_BYPASS,
+	T3_RDMA_READ_REQ_WITH_INV,
+} __attribute__ ((packed));
+
+static inline enum t3_rdma_opcode wr2opcode(enum t3_wr_opcode wrop)
+{
+	switch (wrop) {
+		case T3_WR_BP: return T3_BYPASS;
+		case T3_WR_SEND: return T3_SEND;
+		case T3_WR_WRITE: return T3_RDMA_WRITE;
+		case T3_WR_READ: return T3_READ_REQ;
+		case T3_WR_INV_STAG: return T3_LOCAL_INV;
+		case T3_WR_BIND: return T3_BIND_MW;
+		case T3_WR_INIT: return T3_RDMA_INIT;
+		case T3_WR_QP_MOD: return T3_QP_MOD;
+		case T3_WR_FASTREG: return T3_FAST_REGISTER;
+		default: break;
+	}
+	return -1;
+}
+
+
+/* Work request id */
+union t3_wrid {
+	struct {
+		u32 hi;
+		u32 low;
+	} id0;
+	u64 id1;
+};
+
+#define WRID(wrid)		(wrid.id1)
+#define WRID_GEN(wrid)		(wrid.id0.wr_gen)
+#define WRID_IDX(wrid)		(wrid.id0.wr_idx)
+#define WRID_LO(wrid)		(wrid.id0.wr_lo)
+
+struct fw_riwrh {
+	__be32 op_seop_flags;
+	__be32 gen_tid_len;
+};
+
+#define S_FW_RIWR_OP		24
+#define M_FW_RIWR_OP		0xff
+#define V_FW_RIWR_OP(x)		((x) << S_FW_RIWR_OP)
+#define G_FW_RIWR_OP(x)	((((x) >> S_FW_RIWR_OP)) & M_FW_RIWR_OP)
+
+#define S_FW_RIWR_SOPEOP	22
+#define M_FW_RIWR_SOPEOP	0x3
+#define V_FW_RIWR_SOPEOP(x)	((x) << S_FW_RIWR_SOPEOP)
+
+#define S_FW_RIWR_FLAGS		8
+#define M_FW_RIWR_FLAGS		0x3fffff
+#define V_FW_RIWR_FLAGS(x)	((x) << S_FW_RIWR_FLAGS)
+#define G_FW_RIWR_FLAGS(x)	((((x) >> S_FW_RIWR_FLAGS)) & M_FW_RIWR_FLAGS)
+
+#define S_FW_RIWR_TID		8
+#define V_FW_RIWR_TID(x)	((x) << S_FW_RIWR_TID)
+
+#define S_FW_RIWR_LEN		0
+#define V_FW_RIWR_LEN(x)	((x) << S_FW_RIWR_LEN)
+
+#define S_FW_RIWR_GEN           31
+#define V_FW_RIWR_GEN(x)        ((x)  << S_FW_RIWR_GEN)
+
+struct t3_sge {
+	__be32 stag;
+	__be32 len;
+	__be64 to;
+};
+
+/* If num_sgle is zero, flit 5+ contains immediate data.*/
+struct t3_send_wr {
+	struct fw_riwrh wrh;	/* 0 */
+	union t3_wrid wrid;	/* 1 */
+
+	u8 rdmaop;		/* 2 */
+	u8 reserved[3];
+	__be32 rem_stag;
+	__be32 plen;		/* 3 */
+	__be32 num_sgle;
+	struct t3_sge sgl[T3_MAX_SGE];	/* 4+ */
+};
+
+#define T3_MAX_FASTREG_DEPTH 10
+#define T3_MAX_FASTREG_FRAG 10
+
+struct t3_fastreg_wr {
+	struct fw_riwrh wrh;	/* 0 */
+	union t3_wrid wrid;	/* 1 */
+	__be32 stag;		/* 2 */
+	__be32 len;
+	__be32 va_base_hi;	/* 3 */
+	__be32 va_base_lo_fbo;
+	__be32 page_type_perms; /* 4 */
+	__be32 reserved1;
+	__be64 pbl_addrs[0];	/* 5+ */
+};
+
+/*
+ * If a fastreg wr spans multiple wqes, then the 2nd fragment look like this.
+ */
+struct t3_pbl_frag {
+	struct fw_riwrh wrh;	/* 0 */
+	__be64 pbl_addrs[14];	/* 1..14 */
+};
+
+#define S_FR_PAGE_COUNT		24
+#define M_FR_PAGE_COUNT		0xff
+#define V_FR_PAGE_COUNT(x)	((x) << S_FR_PAGE_COUNT)
+#define G_FR_PAGE_COUNT(x)	((((x) >> S_FR_PAGE_COUNT)) & M_FR_PAGE_COUNT)
+
+#define S_FR_PAGE_SIZE		16
+#define M_FR_PAGE_SIZE		0x1f
+#define V_FR_PAGE_SIZE(x)	((x) << S_FR_PAGE_SIZE)
+#define G_FR_PAGE_SIZE(x)	((((x) >> S_FR_PAGE_SIZE)) & M_FR_PAGE_SIZE)
+
+#define S_FR_TYPE		8
+#define M_FR_TYPE		0x1
+#define V_FR_TYPE(x)		((x) << S_FR_TYPE)
+#define G_FR_TYPE(x)		((((x) >> S_FR_TYPE)) & M_FR_TYPE)
+
+#define S_FR_PERMS		0
+#define M_FR_PERMS		0xff
+#define V_FR_PERMS(x)		((x) << S_FR_PERMS)
+#define G_FR_PERMS(x)		((((x) >> S_FR_PERMS)) & M_FR_PERMS)
+
+struct t3_local_inv_wr {
+	struct fw_riwrh wrh;	/* 0 */
+	union t3_wrid wrid;	/* 1 */
+	__be32 stag;		/* 2 */
+	__be32 reserved;
+};
+
+struct t3_rdma_write_wr {
+	struct fw_riwrh wrh;	/* 0 */
+	union t3_wrid wrid;	/* 1 */
+	u8 rdmaop;		/* 2 */
+	u8 reserved[3];
+	__be32 stag_sink;
+	__be64 to_sink;		/* 3 */
+	__be32 plen;		/* 4 */
+	__be32 num_sgle;
+	struct t3_sge sgl[T3_MAX_SGE];	/* 5+ */
+};
+
+struct t3_rdma_read_wr {
+	struct fw_riwrh wrh;	/* 0 */
+	union t3_wrid wrid;	/* 1 */
+	u8 rdmaop;		/* 2 */
+	u8 local_inv;
+	u8 reserved[2];
+	__be32 rem_stag;
+	__be64 rem_to;		/* 3 */
+	__be32 local_stag;	/* 4 */
+	__be32 local_len;
+	__be64 local_to;	/* 5 */
+};
+
+struct t3_bind_mw_wr {
+	struct fw_riwrh wrh;	/* 0 */
+	union t3_wrid wrid;	/* 1 */
+	u16 reserved;		/* 2 */
+	u8 type;
+	u8 perms;
+	__be32 mr_stag;
+	__be32 mw_stag;		/* 3 */
+	__be32 mw_len;
+	__be64 mw_va;		/* 4 */
+	__be32 mr_pbl_addr;	/* 5 */
+	u8 reserved2[3];
+	u8 mr_pagesz;
+};
+
+struct t3_receive_wr {
+	struct fw_riwrh wrh;	/* 0 */
+	union t3_wrid wrid;	/* 1 */
+	u8 pagesz[T3_MAX_SGE];
+	__be32 num_sgle;		/* 2 */
+	struct t3_sge sgl[T3_MAX_SGE];	/* 3+ */
+	__be32 pbl_addr[T3_MAX_SGE];
+};
+
+struct t3_bypass_wr {
+	struct fw_riwrh wrh;
+	union t3_wrid wrid;	/* 1 */
+};
+
+struct t3_modify_qp_wr {
+	struct fw_riwrh wrh;	/* 0 */
+	union t3_wrid wrid;	/* 1 */
+	__be32 flags;		/* 2 */
+	__be32 quiesce;		/* 2 */
+	__be32 max_ird;		/* 3 */
+	__be32 max_ord;		/* 3 */
+	__be64 sge_cmd;		/* 4 */
+	__be64 ctx1;		/* 5 */
+	__be64 ctx0;		/* 6 */
+};
+
+enum t3_modify_qp_flags {
+	MODQP_QUIESCE  = 0x01,
+	MODQP_MAX_IRD  = 0x02,
+	MODQP_MAX_ORD  = 0x04,
+	MODQP_WRITE_EC = 0x08,
+	MODQP_READ_EC  = 0x10,
+};
+
+
+enum t3_mpa_attrs {
+	uP_RI_MPA_RX_MARKER_ENABLE = 0x1,
+	uP_RI_MPA_TX_MARKER_ENABLE = 0x2,
+	uP_RI_MPA_CRC_ENABLE = 0x4,
+	uP_RI_MPA_IETF_ENABLE = 0x8
+} __attribute__ ((packed));
+
+enum t3_qp_caps {
+	uP_RI_QP_RDMA_READ_ENABLE = 0x01,
+	uP_RI_QP_RDMA_WRITE_ENABLE = 0x02,
+	uP_RI_QP_BIND_ENABLE = 0x04,
+	uP_RI_QP_FAST_REGISTER_ENABLE = 0x08,
+	uP_RI_QP_STAG0_ENABLE = 0x10
+} __attribute__ ((packed));
+
+enum rdma_init_rtr_types {
+	RTR_READ = 1,
+	RTR_WRITE = 2,
+	RTR_SEND = 3,
+};
+
+#define S_RTR_TYPE	2
+#define M_RTR_TYPE	0x3
+#define V_RTR_TYPE(x)	((x) << S_RTR_TYPE)
+#define G_RTR_TYPE(x)	((((x) >> S_RTR_TYPE)) & M_RTR_TYPE)
+
+#define S_CHAN		4
+#define M_CHAN		0x3
+#define V_CHAN(x)	((x) << S_CHAN)
+#define G_CHAN(x)	((((x) >> S_CHAN)) & M_CHAN)
+
+struct t3_rdma_init_attr {
+	u32 tid;
+	u32 qpid;
+	u32 pdid;
+	u32 scqid;
+	u32 rcqid;
+	u32 rq_addr;
+	u32 rq_size;
+	enum t3_mpa_attrs mpaattrs;
+	enum t3_qp_caps qpcaps;
+	u16 tcp_emss;
+	u32 ord;
+	u32 ird;
+	u64 qp_dma_addr;
+	u32 qp_dma_size;
+	enum rdma_init_rtr_types rtr_type;
+	u16 flags;
+	u16 rqe_count;
+	u32 irs;
+	u32 chan;
+};
+
+struct t3_rdma_init_wr {
+	struct fw_riwrh wrh;	/* 0 */
+	union t3_wrid wrid;	/* 1 */
+	__be32 qpid;		/* 2 */
+	__be32 pdid;
+	__be32 scqid;		/* 3 */
+	__be32 rcqid;
+	__be32 rq_addr;		/* 4 */
+	__be32 rq_size;
+	u8 mpaattrs;		/* 5 */
+	u8 qpcaps;
+	__be16 ulpdu_size;
+	__be16 flags_rtr_type;
+	__be16 rqe_count;
+	__be32 ord;		/* 6 */
+	__be32 ird;
+	__be64 qp_dma_addr;	/* 7 */
+	__be32 qp_dma_size;	/* 8 */
+	__be32 irs;
+};
+
+struct t3_genbit {
+	u64 flit[15];
+	__be64 genbit;
+};
+
+struct t3_wq_in_err {
+	u64 flit[13];
+	u64 err;
+};
+
+enum rdma_init_wr_flags {
+	MPA_INITIATOR = (1<<0),
+	PRIV_QP = (1<<1),
+};
+
+union t3_wr {
+	struct t3_send_wr send;
+	struct t3_rdma_write_wr write;
+	struct t3_rdma_read_wr read;
+	struct t3_receive_wr recv;
+	struct t3_fastreg_wr fastreg;
+	struct t3_pbl_frag pbl_frag;
+	struct t3_local_inv_wr local_inv;
+	struct t3_bind_mw_wr bind;
+	struct t3_bypass_wr bypass;
+	struct t3_rdma_init_wr init;
+	struct t3_modify_qp_wr qp_mod;
+	struct t3_genbit genbit;
+	struct t3_wq_in_err wq_in_err;
+	__be64 flit[16];
+};
+
+#define T3_SQ_CQE_FLIT	  13
+#define T3_SQ_COOKIE_FLIT 14
+
+#define T3_RQ_COOKIE_FLIT 13
+#define T3_RQ_CQE_FLIT	  14
+
+static inline enum t3_wr_opcode fw_riwrh_opcode(struct fw_riwrh *wqe)
+{
+	return G_FW_RIWR_OP(be32_to_cpu(wqe->op_seop_flags));
+}
+
+enum t3_wr_hdr_bits {
+	T3_EOP = 1,
+	T3_SOP = 2,
+	T3_SOPEOP = T3_EOP|T3_SOP,
+};
+
+static inline void build_fw_riwrh(struct fw_riwrh *wqe, enum t3_wr_opcode op,
+				  enum t3_wr_flags flags, u8 genbit, u32 tid,
+				  u8 len, u8 sopeop)
+{
+	wqe->op_seop_flags = cpu_to_be32(V_FW_RIWR_OP(op) |
+					 V_FW_RIWR_SOPEOP(sopeop) |
+					 V_FW_RIWR_FLAGS(flags));
+	wmb();
+	wqe->gen_tid_len = cpu_to_be32(V_FW_RIWR_GEN(genbit) |
+				       V_FW_RIWR_TID(tid) |
+				       V_FW_RIWR_LEN(len));
+	/* 2nd gen bit... */
+	((union t3_wr *)wqe)->genbit.genbit = cpu_to_be64(genbit);
+}
+
+/*
+ * T3 ULP2_TX commands
+ */
+enum t3_utx_mem_op {
+	T3_UTX_MEM_READ = 2,
+	T3_UTX_MEM_WRITE = 3
+};
+
+/* T3 MC7 RDMA TPT entry format */
+
+enum tpt_mem_type {
+	TPT_NON_SHARED_MR = 0x0,
+	TPT_SHARED_MR = 0x1,
+	TPT_MW = 0x2,
+	TPT_MW_RELAXED_PROTECTION = 0x3
+};
+
+enum tpt_addr_type {
+	TPT_ZBTO = 0,
+	TPT_VATO = 1
+};
+
+enum tpt_mem_perm {
+	TPT_MW_BIND = 0x10,
+	TPT_LOCAL_READ = 0x8,
+	TPT_LOCAL_WRITE = 0x4,
+	TPT_REMOTE_READ = 0x2,
+	TPT_REMOTE_WRITE = 0x1
+};
+
+struct tpt_entry {
+	__be32 valid_stag_pdid;
+	__be32 flags_pagesize_qpid;
+
+	__be32 rsvd_pbl_addr;
+	__be32 len;
+	__be32 va_hi;
+	__be32 va_low_or_fbo;
+
+	__be32 rsvd_bind_cnt_or_pstag;
+	__be32 rsvd_pbl_size;
+};
+
+#define S_TPT_VALID		31
+#define V_TPT_VALID(x)		((x) << S_TPT_VALID)
+#define F_TPT_VALID		V_TPT_VALID(1U)
+
+#define S_TPT_STAG_KEY		23
+#define M_TPT_STAG_KEY		0xFF
+#define V_TPT_STAG_KEY(x)	((x) << S_TPT_STAG_KEY)
+#define G_TPT_STAG_KEY(x)	(((x) >> S_TPT_STAG_KEY) & M_TPT_STAG_KEY)
+
+#define S_TPT_STAG_STATE	22
+#define V_TPT_STAG_STATE(x)	((x) << S_TPT_STAG_STATE)
+#define F_TPT_STAG_STATE	V_TPT_STAG_STATE(1U)
+
+#define S_TPT_STAG_TYPE		20
+#define M_TPT_STAG_TYPE		0x3
+#define V_TPT_STAG_TYPE(x)	((x) << S_TPT_STAG_TYPE)
+#define G_TPT_STAG_TYPE(x)	(((x) >> S_TPT_STAG_TYPE) & M_TPT_STAG_TYPE)
+
+#define S_TPT_PDID		0
+#define M_TPT_PDID		0xFFFFF
+#define V_TPT_PDID(x)		((x) << S_TPT_PDID)
+#define G_TPT_PDID(x)		(((x) >> S_TPT_PDID) & M_TPT_PDID)
+
+#define S_TPT_PERM		28
+#define M_TPT_PERM		0xF
+#define V_TPT_PERM(x)		((x) << S_TPT_PERM)
+#define G_TPT_PERM(x)		(((x) >> S_TPT_PERM) & M_TPT_PERM)
+
+#define S_TPT_REM_INV_DIS	27
+#define V_TPT_REM_INV_DIS(x)	((x) << S_TPT_REM_INV_DIS)
+#define F_TPT_REM_INV_DIS	V_TPT_REM_INV_DIS(1U)
+
+#define S_TPT_ADDR_TYPE		26
+#define V_TPT_ADDR_TYPE(x)	((x) << S_TPT_ADDR_TYPE)
+#define F_TPT_ADDR_TYPE		V_TPT_ADDR_TYPE(1U)
+
+#define S_TPT_MW_BIND_ENABLE	25
+#define V_TPT_MW_BIND_ENABLE(x)	((x) << S_TPT_MW_BIND_ENABLE)
+#define F_TPT_MW_BIND_ENABLE    V_TPT_MW_BIND_ENABLE(1U)
+
+#define S_TPT_PAGE_SIZE		20
+#define M_TPT_PAGE_SIZE		0x1F
+#define V_TPT_PAGE_SIZE(x)	((x) << S_TPT_PAGE_SIZE)
+#define G_TPT_PAGE_SIZE(x)	(((x) >> S_TPT_PAGE_SIZE) & M_TPT_PAGE_SIZE)
+
+#define S_TPT_PBL_ADDR		0
+#define M_TPT_PBL_ADDR		0x1FFFFFFF
+#define V_TPT_PBL_ADDR(x)	((x) << S_TPT_PBL_ADDR)
+#define G_TPT_PBL_ADDR(x)       (((x) >> S_TPT_PBL_ADDR) & M_TPT_PBL_ADDR)
+
+#define S_TPT_QPID		0
+#define M_TPT_QPID		0xFFFFF
+#define V_TPT_QPID(x)		((x) << S_TPT_QPID)
+#define G_TPT_QPID(x)		(((x) >> S_TPT_QPID) & M_TPT_QPID)
+
+#define S_TPT_PSTAG		0
+#define M_TPT_PSTAG		0xFFFFFF
+#define V_TPT_PSTAG(x)		((x) << S_TPT_PSTAG)
+#define G_TPT_PSTAG(x)		(((x) >> S_TPT_PSTAG) & M_TPT_PSTAG)
+
+#define S_TPT_PBL_SIZE		0
+#define M_TPT_PBL_SIZE		0xFFFFF
+#define V_TPT_PBL_SIZE(x)	((x) << S_TPT_PBL_SIZE)
+#define G_TPT_PBL_SIZE(x)	(((x) >> S_TPT_PBL_SIZE) & M_TPT_PBL_SIZE)
+
+/*
+ * CQE defs
+ */
+struct t3_cqe {
+	__be32 header;
+	__be32 len;
+	union {
+		struct {
+			__be32 stag;
+			__be32 msn;
+		} rcqe;
+		struct {
+			u32 wrid_hi;
+			u32 wrid_low;
+		} scqe;
+	} u;
+};
+
+#define S_CQE_OOO	  31
+#define M_CQE_OOO	  0x1
+#define G_CQE_OOO(x)	  ((((x) >> S_CQE_OOO)) & M_CQE_OOO)
+#define V_CEQ_OOO(x)	  ((x)<<S_CQE_OOO)
+
+#define S_CQE_QPID        12
+#define M_CQE_QPID        0x7FFFF
+#define G_CQE_QPID(x)     ((((x) >> S_CQE_QPID)) & M_CQE_QPID)
+#define V_CQE_QPID(x)	  ((x)<<S_CQE_QPID)
+
+#define S_CQE_SWCQE       11
+#define M_CQE_SWCQE       0x1
+#define G_CQE_SWCQE(x)    ((((x) >> S_CQE_SWCQE)) & M_CQE_SWCQE)
+#define V_CQE_SWCQE(x)	  ((x)<<S_CQE_SWCQE)
+
+#define S_CQE_GENBIT      10
+#define M_CQE_GENBIT      0x1
+#define G_CQE_GENBIT(x)   (((x) >> S_CQE_GENBIT) & M_CQE_GENBIT)
+#define V_CQE_GENBIT(x)	  ((x)<<S_CQE_GENBIT)
+
+#define S_CQE_STATUS      5
+#define M_CQE_STATUS      0x1F
+#define G_CQE_STATUS(x)   ((((x) >> S_CQE_STATUS)) & M_CQE_STATUS)
+#define V_CQE_STATUS(x)   ((x)<<S_CQE_STATUS)
+
+#define S_CQE_TYPE        4
+#define M_CQE_TYPE        0x1
+#define G_CQE_TYPE(x)     ((((x) >> S_CQE_TYPE)) & M_CQE_TYPE)
+#define V_CQE_TYPE(x)     ((x)<<S_CQE_TYPE)
+
+#define S_CQE_OPCODE      0
+#define M_CQE_OPCODE      0xF
+#define G_CQE_OPCODE(x)   ((((x) >> S_CQE_OPCODE)) & M_CQE_OPCODE)
+#define V_CQE_OPCODE(x)   ((x)<<S_CQE_OPCODE)
+
+#define SW_CQE(x)         (G_CQE_SWCQE(be32_to_cpu((x).header)))
+#define CQE_OOO(x)        (G_CQE_OOO(be32_to_cpu((x).header)))
+#define CQE_QPID(x)       (G_CQE_QPID(be32_to_cpu((x).header)))
+#define CQE_GENBIT(x)     (G_CQE_GENBIT(be32_to_cpu((x).header)))
+#define CQE_TYPE(x)       (G_CQE_TYPE(be32_to_cpu((x).header)))
+#define SQ_TYPE(x)	  (CQE_TYPE((x)))
+#define RQ_TYPE(x)	  (!CQE_TYPE((x)))
+#define CQE_STATUS(x)     (G_CQE_STATUS(be32_to_cpu((x).header)))
+#define CQE_OPCODE(x)     (G_CQE_OPCODE(be32_to_cpu((x).header)))
+
+#define CQE_SEND_OPCODE(x)( \
+	(G_CQE_OPCODE(be32_to_cpu((x).header)) == T3_SEND) || \
+	(G_CQE_OPCODE(be32_to_cpu((x).header)) == T3_SEND_WITH_SE) || \
+	(G_CQE_OPCODE(be32_to_cpu((x).header)) == T3_SEND_WITH_INV) || \
+	(G_CQE_OPCODE(be32_to_cpu((x).header)) == T3_SEND_WITH_SE_INV))
+
+#define CQE_LEN(x)        (be32_to_cpu((x).len))
+
+/* used for RQ completion processing */
+#define CQE_WRID_STAG(x)  (be32_to_cpu((x).u.rcqe.stag))
+#define CQE_WRID_MSN(x)   (be32_to_cpu((x).u.rcqe.msn))
+
+/* used for SQ completion processing */
+#define CQE_WRID_SQ_WPTR(x)	((x).u.scqe.wrid_hi)
+#define CQE_WRID_WPTR(x)	((x).u.scqe.wrid_low)
+
+/* generic accessor macros */
+#define CQE_WRID_HI(x)		((x).u.scqe.wrid_hi)
+#define CQE_WRID_LOW(x)		((x).u.scqe.wrid_low)
+
+#define TPT_ERR_SUCCESS                     0x0
+#define TPT_ERR_STAG                        0x1	 /* STAG invalid: either the */
+						 /* STAG is offlimt, being 0, */
+						 /* or STAG_key mismatch */
+#define TPT_ERR_PDID                        0x2	 /* PDID mismatch */
+#define TPT_ERR_QPID                        0x3	 /* QPID mismatch */
+#define TPT_ERR_ACCESS                      0x4	 /* Invalid access right */
+#define TPT_ERR_WRAP                        0x5	 /* Wrap error */
+#define TPT_ERR_BOUND                       0x6	 /* base and bounds voilation */
+#define TPT_ERR_INVALIDATE_SHARED_MR        0x7	 /* attempt to invalidate a  */
+						 /* shared memory region */
+#define TPT_ERR_INVALIDATE_MR_WITH_MW_BOUND 0x8	 /* attempt to invalidate a  */
+						 /* shared memory region */
+#define TPT_ERR_ECC                         0x9	 /* ECC error detected */
+#define TPT_ERR_ECC_PSTAG                   0xA	 /* ECC error detected when  */
+						 /* reading PSTAG for a MW  */
+						 /* Invalidate */
+#define TPT_ERR_PBL_ADDR_BOUND              0xB	 /* pbl addr out of bounds:  */
+						 /* software error */
+#define TPT_ERR_SWFLUSH			    0xC	 /* SW FLUSHED */
+#define TPT_ERR_CRC                         0x10 /* CRC error */
+#define TPT_ERR_MARKER                      0x11 /* Marker error */
+#define TPT_ERR_PDU_LEN_ERR                 0x12 /* invalid PDU length */
+#define TPT_ERR_OUT_OF_RQE                  0x13 /* out of RQE */
+#define TPT_ERR_DDP_VERSION                 0x14 /* wrong DDP version */
+#define TPT_ERR_RDMA_VERSION                0x15 /* wrong RDMA version */
+#define TPT_ERR_OPCODE                      0x16 /* invalid rdma opcode */
+#define TPT_ERR_DDP_QUEUE_NUM               0x17 /* invalid ddp queue number */
+#define TPT_ERR_MSN                         0x18 /* MSN error */
+#define TPT_ERR_TBIT                        0x19 /* tag bit not set correctly */
+#define TPT_ERR_MO                          0x1A /* MO not 0 for TERMINATE  */
+						 /* or READ_REQ */
+#define TPT_ERR_MSN_GAP                     0x1B
+#define TPT_ERR_MSN_RANGE                   0x1C
+#define TPT_ERR_IRD_OVERFLOW                0x1D
+#define TPT_ERR_RQE_ADDR_BOUND              0x1E /* RQE addr out of bounds:  */
+						 /* software error */
+#define TPT_ERR_INTERNAL_ERR                0x1F /* internal error (opcode  */
+						 /* mismatch) */
+
+struct t3_swsq {
+	__u64			wr_id;
+	struct t3_cqe		cqe;
+	__u32			sq_wptr;
+	__be32			read_len;
+	int			opcode;
+	int			complete;
+	int			signaled;
+};
+
+struct t3_swrq {
+	__u64			wr_id;
+	__u32			pbl_addr;
+};
+
+/*
+ * A T3 WQ implements both the SQ and RQ.
+ */
+struct t3_wq {
+	union t3_wr *queue;		/* DMA accessible memory */
+	dma_addr_t dma_addr;		/* DMA address for HW */
+	DEFINE_DMA_UNMAP_ADDR(mapping); /* unmap kruft */
+	u32 error;			/* 1 once we go to ERROR */
+	u32 qpid;
+	u32 wptr;			/* idx to next available WR slot */
+	u32 size_log2;			/* total wq size */
+	struct t3_swsq *sq;		/* SW SQ */
+	struct t3_swsq *oldest_read;	/* tracks oldest pending read */
+	u32 sq_wptr;			/* sq_wptr - sq_rptr == count of */
+	u32 sq_rptr;			/* pending wrs */
+	u32 sq_size_log2;		/* sq size */
+	struct t3_swrq *rq;		/* SW RQ (holds consumer wr_ids */
+	u32 rq_wptr;			/* rq_wptr - rq_rptr == count of */
+	u32 rq_rptr;			/* pending wrs */
+	struct t3_swrq *rq_oldest_wr;	/* oldest wr on the SW RQ */
+	u32 rq_size_log2;		/* rq size */
+	u32 rq_addr;			/* rq adapter address */
+	void __iomem *doorbell;		/* kernel db */
+	u64 udb;			/* user db if any */
+	struct cxio_rdev *rdev;
+};
+
+struct t3_cq {
+	u32 cqid;
+	u32 rptr;
+	u32 wptr;
+	u32 size_log2;
+	dma_addr_t dma_addr;
+	DEFINE_DMA_UNMAP_ADDR(mapping);
+	struct t3_cqe *queue;
+	struct t3_cqe *sw_queue;
+	u32 sw_rptr;
+	u32 sw_wptr;
+};
+
+#define CQ_VLD_ENTRY(ptr,size_log2,cqe) (Q_GENBIT(ptr,size_log2) == \
+					 CQE_GENBIT(*cqe))
+
+struct t3_cq_status_page {
+	u32 cq_err;
+};
+
+static inline int cxio_cq_in_error(struct t3_cq *cq)
+{
+	return ((struct t3_cq_status_page *)
+		&cq->queue[1 << cq->size_log2])->cq_err;
+}
+
+static inline void cxio_set_cq_in_error(struct t3_cq *cq)
+{
+	((struct t3_cq_status_page *)
+	 &cq->queue[1 << cq->size_log2])->cq_err = 1;
+}
+
+static inline void cxio_set_wq_in_error(struct t3_wq *wq)
+{
+	wq->queue->wq_in_err.err |= 1;
+}
+
+static inline void cxio_disable_wq_db(struct t3_wq *wq)
+{
+	wq->queue->wq_in_err.err |= 2;
+}
+
+static inline void cxio_enable_wq_db(struct t3_wq *wq)
+{
+	wq->queue->wq_in_err.err &= ~2;
+}
+
+static inline int cxio_wq_db_enabled(struct t3_wq *wq)
+{
+	return !(wq->queue->wq_in_err.err & 2);
+}
+
+static inline struct t3_cqe *cxio_next_hw_cqe(struct t3_cq *cq)
+{
+	struct t3_cqe *cqe;
+
+	cqe = cq->queue + (Q_PTR2IDX(cq->rptr, cq->size_log2));
+	if (CQ_VLD_ENTRY(cq->rptr, cq->size_log2, cqe))
+		return cqe;
+	return NULL;
+}
+
+static inline struct t3_cqe *cxio_next_sw_cqe(struct t3_cq *cq)
+{
+	struct t3_cqe *cqe;
+
+	if (!Q_EMPTY(cq->sw_rptr, cq->sw_wptr)) {
+		cqe = cq->sw_queue + (Q_PTR2IDX(cq->sw_rptr, cq->size_log2));
+		return cqe;
+	}
+	return NULL;
+}
+
+static inline struct t3_cqe *cxio_next_cqe(struct t3_cq *cq)
+{
+	struct t3_cqe *cqe;
+
+	if (!Q_EMPTY(cq->sw_rptr, cq->sw_wptr)) {
+		cqe = cq->sw_queue + (Q_PTR2IDX(cq->sw_rptr, cq->size_log2));
+		return cqe;
+	}
+	cqe = cq->queue + (Q_PTR2IDX(cq->rptr, cq->size_log2));
+	if (CQ_VLD_ENTRY(cq->rptr, cq->size_log2, cqe))
+		return cqe;
+	return NULL;
+}
+
+#endif
diff --git a/drivers/infiniband/hw/cxgb3/iwch.c b/drivers/infiniband/hw/cxgb3/iwch.c
new file mode 100644
index 0000000..8e77dc5
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb3/iwch.c
@@ -0,0 +1,292 @@
+/*
+ * Copyright (c) 2006 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+
+#include <rdma/ib_verbs.h>
+
+#include "cxgb3_offload.h"
+#include "iwch_provider.h"
+#include "iwch_user.h"
+#include "iwch.h"
+#include "iwch_cm.h"
+
+#define DRV_VERSION "1.1"
+
+MODULE_AUTHOR("Boyd Faulkner, Steve Wise");
+MODULE_DESCRIPTION("Chelsio T3 RDMA Driver");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_VERSION(DRV_VERSION);
+
+static void open_rnic_dev(struct t3cdev *);
+static void close_rnic_dev(struct t3cdev *);
+static void iwch_event_handler(struct t3cdev *, u32, u32);
+
+struct cxgb3_client t3c_client = {
+	.name = "iw_cxgb3",
+	.add = open_rnic_dev,
+	.remove = close_rnic_dev,
+	.handlers = t3c_handlers,
+	.redirect = iwch_ep_redirect,
+	.event_handler = iwch_event_handler
+};
+
+static LIST_HEAD(dev_list);
+static DEFINE_MUTEX(dev_mutex);
+
+static int disable_qp_db(int id, void *p, void *data)
+{
+	struct iwch_qp *qhp = p;
+
+	cxio_disable_wq_db(&qhp->wq);
+	return 0;
+}
+
+static int enable_qp_db(int id, void *p, void *data)
+{
+	struct iwch_qp *qhp = p;
+
+	if (data)
+		ring_doorbell(qhp->rhp->rdev.ctrl_qp.doorbell, qhp->wq.qpid);
+	cxio_enable_wq_db(&qhp->wq);
+	return 0;
+}
+
+static void disable_dbs(struct iwch_dev *rnicp)
+{
+	spin_lock_irq(&rnicp->lock);
+	idr_for_each(&rnicp->qpidr, disable_qp_db, NULL);
+	spin_unlock_irq(&rnicp->lock);
+}
+
+static void enable_dbs(struct iwch_dev *rnicp, int ring_db)
+{
+	spin_lock_irq(&rnicp->lock);
+	idr_for_each(&rnicp->qpidr, enable_qp_db,
+		     (void *)(unsigned long)ring_db);
+	spin_unlock_irq(&rnicp->lock);
+}
+
+static void iwch_db_drop_task(struct work_struct *work)
+{
+	struct iwch_dev *rnicp = container_of(work, struct iwch_dev,
+					      db_drop_task.work);
+	enable_dbs(rnicp, 1);
+}
+
+static void rnic_init(struct iwch_dev *rnicp)
+{
+	PDBG("%s iwch_dev %p\n", __func__,  rnicp);
+	idr_init(&rnicp->cqidr);
+	idr_init(&rnicp->qpidr);
+	idr_init(&rnicp->mmidr);
+	spin_lock_init(&rnicp->lock);
+	INIT_DELAYED_WORK(&rnicp->db_drop_task, iwch_db_drop_task);
+
+	rnicp->attr.max_qps = T3_MAX_NUM_QP - 32;
+	rnicp->attr.max_wrs = T3_MAX_QP_DEPTH;
+	rnicp->attr.max_sge_per_wr = T3_MAX_SGE;
+	rnicp->attr.max_sge_per_rdma_write_wr = T3_MAX_SGE;
+	rnicp->attr.max_cqs = T3_MAX_NUM_CQ - 1;
+	rnicp->attr.max_cqes_per_cq = T3_MAX_CQ_DEPTH;
+	rnicp->attr.max_mem_regs = cxio_num_stags(&rnicp->rdev);
+	rnicp->attr.max_phys_buf_entries = T3_MAX_PBL_SIZE;
+	rnicp->attr.max_pds = T3_MAX_NUM_PD - 1;
+	rnicp->attr.mem_pgsizes_bitmask = T3_PAGESIZE_MASK;
+	rnicp->attr.max_mr_size = T3_MAX_MR_SIZE;
+	rnicp->attr.can_resize_wq = 0;
+	rnicp->attr.max_rdma_reads_per_qp = 8;
+	rnicp->attr.max_rdma_read_resources =
+	    rnicp->attr.max_rdma_reads_per_qp * rnicp->attr.max_qps;
+	rnicp->attr.max_rdma_read_qp_depth = 8;	/* IRD */
+	rnicp->attr.max_rdma_read_depth =
+	    rnicp->attr.max_rdma_read_qp_depth * rnicp->attr.max_qps;
+	rnicp->attr.rq_overflow_handled = 0;
+	rnicp->attr.can_modify_ird = 0;
+	rnicp->attr.can_modify_ord = 0;
+	rnicp->attr.max_mem_windows = rnicp->attr.max_mem_regs - 1;
+	rnicp->attr.stag0_value = 1;
+	rnicp->attr.zbva_support = 1;
+	rnicp->attr.local_invalidate_fence = 1;
+	rnicp->attr.cq_overflow_detection = 1;
+	return;
+}
+
+static void open_rnic_dev(struct t3cdev *tdev)
+{
+	struct iwch_dev *rnicp;
+
+	PDBG("%s t3cdev %p\n", __func__,  tdev);
+	printk_once(KERN_INFO MOD "Chelsio T3 RDMA Driver - version %s\n",
+		       DRV_VERSION);
+	rnicp = (struct iwch_dev *)ib_alloc_device(sizeof(*rnicp));
+	if (!rnicp) {
+		printk(KERN_ERR MOD "Cannot allocate ib device\n");
+		return;
+	}
+	rnicp->rdev.ulp = rnicp;
+	rnicp->rdev.t3cdev_p = tdev;
+
+	mutex_lock(&dev_mutex);
+
+	if (cxio_rdev_open(&rnicp->rdev)) {
+		mutex_unlock(&dev_mutex);
+		printk(KERN_ERR MOD "Unable to open CXIO rdev\n");
+		ib_dealloc_device(&rnicp->ibdev);
+		return;
+	}
+
+	rnic_init(rnicp);
+
+	list_add_tail(&rnicp->entry, &dev_list);
+	mutex_unlock(&dev_mutex);
+
+	if (iwch_register_device(rnicp)) {
+		printk(KERN_ERR MOD "Unable to register device\n");
+		close_rnic_dev(tdev);
+	}
+	printk(KERN_INFO MOD "Initialized device %s\n",
+	       pci_name(rnicp->rdev.rnic_info.pdev));
+	return;
+}
+
+static void close_rnic_dev(struct t3cdev *tdev)
+{
+	struct iwch_dev *dev, *tmp;
+	PDBG("%s t3cdev %p\n", __func__,  tdev);
+	mutex_lock(&dev_mutex);
+	list_for_each_entry_safe(dev, tmp, &dev_list, entry) {
+		if (dev->rdev.t3cdev_p == tdev) {
+			dev->rdev.flags = CXIO_ERROR_FATAL;
+			synchronize_net();
+			cancel_delayed_work_sync(&dev->db_drop_task);
+			list_del(&dev->entry);
+			iwch_unregister_device(dev);
+			cxio_rdev_close(&dev->rdev);
+			idr_destroy(&dev->cqidr);
+			idr_destroy(&dev->qpidr);
+			idr_destroy(&dev->mmidr);
+			ib_dealloc_device(&dev->ibdev);
+			break;
+		}
+	}
+	mutex_unlock(&dev_mutex);
+}
+
+static void iwch_event_handler(struct t3cdev *tdev, u32 evt, u32 port_id)
+{
+	struct cxio_rdev *rdev = tdev->ulp;
+	struct iwch_dev *rnicp;
+	struct ib_event event;
+	u32 portnum = port_id + 1;
+	int dispatch = 0;
+
+	if (!rdev)
+		return;
+	rnicp = rdev_to_iwch_dev(rdev);
+	switch (evt) {
+	case OFFLOAD_STATUS_DOWN: {
+		rdev->flags = CXIO_ERROR_FATAL;
+		synchronize_net();
+		event.event  = IB_EVENT_DEVICE_FATAL;
+		dispatch = 1;
+		break;
+		}
+	case OFFLOAD_PORT_DOWN: {
+		event.event  = IB_EVENT_PORT_ERR;
+		dispatch = 1;
+		break;
+		}
+	case OFFLOAD_PORT_UP: {
+		event.event  = IB_EVENT_PORT_ACTIVE;
+		dispatch = 1;
+		break;
+		}
+	case OFFLOAD_DB_FULL: {
+		disable_dbs(rnicp);
+		break;
+		}
+	case OFFLOAD_DB_EMPTY: {
+		enable_dbs(rnicp, 1);
+		break;
+		}
+	case OFFLOAD_DB_DROP: {
+		unsigned long delay = 1000;
+		unsigned short r;
+
+		disable_dbs(rnicp);
+		get_random_bytes(&r, 2);
+		delay += r & 1023;
+
+		/*
+		 * delay is between 1000-2023 usecs.
+		 */
+		schedule_delayed_work(&rnicp->db_drop_task,
+			usecs_to_jiffies(delay));
+		break;
+		}
+	}
+
+	if (dispatch) {
+		event.device = &rnicp->ibdev;
+		event.element.port_num = portnum;
+		ib_dispatch_event(&event);
+	}
+
+	return;
+}
+
+static int __init iwch_init_module(void)
+{
+	int err;
+
+	err = cxio_hal_init();
+	if (err)
+		return err;
+	err = iwch_cm_init();
+	if (err)
+		return err;
+	cxio_register_ev_cb(iwch_ev_dispatch);
+	cxgb3_register_client(&t3c_client);
+	return 0;
+}
+
+static void __exit iwch_exit_module(void)
+{
+	cxgb3_unregister_client(&t3c_client);
+	cxio_unregister_ev_cb(iwch_ev_dispatch);
+	iwch_cm_term();
+	cxio_hal_exit();
+}
+
+module_init(iwch_init_module);
+module_exit(iwch_exit_module);
diff --git a/drivers/infiniband/hw/cxgb3/iwch.h b/drivers/infiniband/hw/cxgb3/iwch.h
new file mode 100644
index 0000000..8378622
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb3/iwch.h
@@ -0,0 +1,180 @@
+/*
+ * Copyright (c) 2006 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __IWCH_H__
+#define __IWCH_H__
+
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/idr.h>
+#include <linux/workqueue.h>
+
+#include <rdma/ib_verbs.h>
+
+#include "cxio_hal.h"
+#include "cxgb3_offload.h"
+
+struct iwch_pd;
+struct iwch_cq;
+struct iwch_qp;
+struct iwch_mr;
+
+struct iwch_rnic_attributes {
+	u32 max_qps;
+	u32 max_wrs;				/* Max for any SQ/RQ */
+	u32 max_sge_per_wr;
+	u32 max_sge_per_rdma_write_wr;	/* for RDMA Write WR */
+	u32 max_cqs;
+	u32 max_cqes_per_cq;
+	u32 max_mem_regs;
+	u32 max_phys_buf_entries;		/* for phys buf list */
+	u32 max_pds;
+
+	/*
+	 * The memory page sizes supported by this RNIC.
+	 * Bit position i in bitmap indicates page of
+	 * size (4k)^i.  Phys block list mode unsupported.
+	 */
+	u32 mem_pgsizes_bitmask;
+	u64 max_mr_size;
+	u8 can_resize_wq;
+
+	/*
+	 * The maximum number of RDMA Reads that can be outstanding
+	 * per QP with this RNIC as the target.
+	 */
+	u32 max_rdma_reads_per_qp;
+
+	/*
+	 * The maximum number of resources used for RDMA Reads
+	 * by this RNIC with this RNIC as the target.
+	 */
+	u32 max_rdma_read_resources;
+
+	/*
+	 * The max depth per QP for initiation of RDMA Read
+	 * by this RNIC.
+	 */
+	u32 max_rdma_read_qp_depth;
+
+	/*
+	 * The maximum depth for initiation of RDMA Read
+	 * operations by this RNIC on all QPs
+	 */
+	u32 max_rdma_read_depth;
+	u8 rq_overflow_handled;
+	u32 can_modify_ird;
+	u32 can_modify_ord;
+	u32 max_mem_windows;
+	u32 stag0_value;
+	u8 zbva_support;
+	u8 local_invalidate_fence;
+	u32 cq_overflow_detection;
+};
+
+struct iwch_dev {
+	struct ib_device ibdev;
+	struct cxio_rdev rdev;
+	u32 device_cap_flags;
+	struct iwch_rnic_attributes attr;
+	struct idr cqidr;
+	struct idr qpidr;
+	struct idr mmidr;
+	spinlock_t lock;
+	struct list_head entry;
+	struct delayed_work db_drop_task;
+};
+
+static inline struct iwch_dev *to_iwch_dev(struct ib_device *ibdev)
+{
+	return container_of(ibdev, struct iwch_dev, ibdev);
+}
+
+static inline struct iwch_dev *rdev_to_iwch_dev(struct cxio_rdev *rdev)
+{
+	return container_of(rdev, struct iwch_dev, rdev);
+}
+
+static inline int t3b_device(const struct iwch_dev *rhp)
+{
+	return rhp->rdev.t3cdev_p->type == T3B;
+}
+
+static inline int t3a_device(const struct iwch_dev *rhp)
+{
+	return rhp->rdev.t3cdev_p->type == T3A;
+}
+
+static inline struct iwch_cq *get_chp(struct iwch_dev *rhp, u32 cqid)
+{
+	return idr_find(&rhp->cqidr, cqid);
+}
+
+static inline struct iwch_qp *get_qhp(struct iwch_dev *rhp, u32 qpid)
+{
+	return idr_find(&rhp->qpidr, qpid);
+}
+
+static inline struct iwch_mr *get_mhp(struct iwch_dev *rhp, u32 mmid)
+{
+	return idr_find(&rhp->mmidr, mmid);
+}
+
+static inline int insert_handle(struct iwch_dev *rhp, struct idr *idr,
+				void *handle, u32 id)
+{
+	int ret;
+
+	idr_preload(GFP_KERNEL);
+	spin_lock_irq(&rhp->lock);
+
+	ret = idr_alloc(idr, handle, id, id + 1, GFP_NOWAIT);
+
+	spin_unlock_irq(&rhp->lock);
+	idr_preload_end();
+
+	BUG_ON(ret == -ENOSPC);
+	return ret < 0 ? ret : 0;
+}
+
+static inline void remove_handle(struct iwch_dev *rhp, struct idr *idr, u32 id)
+{
+	spin_lock_irq(&rhp->lock);
+	idr_remove(idr, id);
+	spin_unlock_irq(&rhp->lock);
+}
+
+extern struct cxgb3_client t3c_client;
+extern cxgb3_cpl_handler_func t3c_handlers[NUM_CPL_CMDS];
+extern void iwch_ev_dispatch(struct cxio_rdev *rdev_p, struct sk_buff *skb);
+
+#endif
diff --git a/drivers/infiniband/hw/cxgb3/iwch_cm.c b/drivers/infiniband/hw/cxgb3/iwch_cm.c
new file mode 100644
index 0000000..cb78b1e
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb3/iwch_cm.c
@@ -0,0 +1,2272 @@
+/*
+ * Copyright (c) 2006 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+#include <linux/skbuff.h>
+#include <linux/timer.h>
+#include <linux/notifier.h>
+#include <linux/inetdevice.h>
+
+#include <net/neighbour.h>
+#include <net/netevent.h>
+#include <net/route.h>
+
+#include "tcb.h"
+#include "cxgb3_offload.h"
+#include "iwch.h"
+#include "iwch_provider.h"
+#include "iwch_cm.h"
+
+static char *states[] = {
+	"idle",
+	"listen",
+	"connecting",
+	"mpa_wait_req",
+	"mpa_req_sent",
+	"mpa_req_rcvd",
+	"mpa_rep_sent",
+	"fpdu_mode",
+	"aborting",
+	"closing",
+	"moribund",
+	"dead",
+	NULL,
+};
+
+int peer2peer = 0;
+module_param(peer2peer, int, 0644);
+MODULE_PARM_DESC(peer2peer, "Support peer2peer ULPs (default=0)");
+
+static int ep_timeout_secs = 60;
+module_param(ep_timeout_secs, int, 0644);
+MODULE_PARM_DESC(ep_timeout_secs, "CM Endpoint operation timeout "
+				   "in seconds (default=60)");
+
+static int mpa_rev = 1;
+module_param(mpa_rev, int, 0644);
+MODULE_PARM_DESC(mpa_rev, "MPA Revision, 0 supports amso1100, "
+		 "1 is spec compliant. (default=1)");
+
+static int markers_enabled = 0;
+module_param(markers_enabled, int, 0644);
+MODULE_PARM_DESC(markers_enabled, "Enable MPA MARKERS (default(0)=disabled)");
+
+static int crc_enabled = 1;
+module_param(crc_enabled, int, 0644);
+MODULE_PARM_DESC(crc_enabled, "Enable MPA CRC (default(1)=enabled)");
+
+static int rcv_win = 256 * 1024;
+module_param(rcv_win, int, 0644);
+MODULE_PARM_DESC(rcv_win, "TCP receive window in bytes (default=256)");
+
+static int snd_win = 32 * 1024;
+module_param(snd_win, int, 0644);
+MODULE_PARM_DESC(snd_win, "TCP send window in bytes (default=32KB)");
+
+static unsigned int nocong = 0;
+module_param(nocong, uint, 0644);
+MODULE_PARM_DESC(nocong, "Turn off congestion control (default=0)");
+
+static unsigned int cong_flavor = 1;
+module_param(cong_flavor, uint, 0644);
+MODULE_PARM_DESC(cong_flavor, "TCP Congestion control flavor (default=1)");
+
+static struct workqueue_struct *workq;
+
+static struct sk_buff_head rxq;
+
+static struct sk_buff *get_skb(struct sk_buff *skb, int len, gfp_t gfp);
+static void ep_timeout(unsigned long arg);
+static void connect_reply_upcall(struct iwch_ep *ep, int status);
+
+static void start_ep_timer(struct iwch_ep *ep)
+{
+	PDBG("%s ep %p\n", __func__, ep);
+	if (timer_pending(&ep->timer)) {
+		PDBG("%s stopped / restarted timer ep %p\n", __func__, ep);
+		del_timer_sync(&ep->timer);
+	} else
+		get_ep(&ep->com);
+	ep->timer.expires = jiffies + ep_timeout_secs * HZ;
+	ep->timer.data = (unsigned long)ep;
+	ep->timer.function = ep_timeout;
+	add_timer(&ep->timer);
+}
+
+static void stop_ep_timer(struct iwch_ep *ep)
+{
+	PDBG("%s ep %p\n", __func__, ep);
+	if (!timer_pending(&ep->timer)) {
+		WARN(1, "%s timer stopped when its not running!  ep %p state %u\n",
+			__func__, ep, ep->com.state);
+		return;
+	}
+	del_timer_sync(&ep->timer);
+	put_ep(&ep->com);
+}
+
+static int iwch_l2t_send(struct t3cdev *tdev, struct sk_buff *skb, struct l2t_entry *l2e)
+{
+	int	error = 0;
+	struct cxio_rdev *rdev;
+
+	rdev = (struct cxio_rdev *)tdev->ulp;
+	if (cxio_fatal_error(rdev)) {
+		kfree_skb(skb);
+		return -EIO;
+	}
+	error = l2t_send(tdev, skb, l2e);
+	if (error < 0)
+		kfree_skb(skb);
+	return error;
+}
+
+int iwch_cxgb3_ofld_send(struct t3cdev *tdev, struct sk_buff *skb)
+{
+	int	error = 0;
+	struct cxio_rdev *rdev;
+
+	rdev = (struct cxio_rdev *)tdev->ulp;
+	if (cxio_fatal_error(rdev)) {
+		kfree_skb(skb);
+		return -EIO;
+	}
+	error = cxgb3_ofld_send(tdev, skb);
+	if (error < 0)
+		kfree_skb(skb);
+	return error;
+}
+
+static void release_tid(struct t3cdev *tdev, u32 hwtid, struct sk_buff *skb)
+{
+	struct cpl_tid_release *req;
+
+	skb = get_skb(skb, sizeof *req, GFP_KERNEL);
+	if (!skb)
+		return;
+	req = (struct cpl_tid_release *) skb_put(skb, sizeof(*req));
+	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, hwtid));
+	skb->priority = CPL_PRIORITY_SETUP;
+	iwch_cxgb3_ofld_send(tdev, skb);
+	return;
+}
+
+int iwch_quiesce_tid(struct iwch_ep *ep)
+{
+	struct cpl_set_tcb_field *req;
+	struct sk_buff *skb = get_skb(NULL, sizeof(*req), GFP_KERNEL);
+
+	if (!skb)
+		return -ENOMEM;
+	req = (struct cpl_set_tcb_field *) skb_put(skb, sizeof(*req));
+	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+	req->wr.wr_lo = htonl(V_WR_TID(ep->hwtid));
+	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, ep->hwtid));
+	req->reply = 0;
+	req->cpu_idx = 0;
+	req->word = htons(W_TCB_RX_QUIESCE);
+	req->mask = cpu_to_be64(1ULL << S_TCB_RX_QUIESCE);
+	req->val = cpu_to_be64(1 << S_TCB_RX_QUIESCE);
+
+	skb->priority = CPL_PRIORITY_DATA;
+	return iwch_cxgb3_ofld_send(ep->com.tdev, skb);
+}
+
+int iwch_resume_tid(struct iwch_ep *ep)
+{
+	struct cpl_set_tcb_field *req;
+	struct sk_buff *skb = get_skb(NULL, sizeof(*req), GFP_KERNEL);
+
+	if (!skb)
+		return -ENOMEM;
+	req = (struct cpl_set_tcb_field *) skb_put(skb, sizeof(*req));
+	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+	req->wr.wr_lo = htonl(V_WR_TID(ep->hwtid));
+	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, ep->hwtid));
+	req->reply = 0;
+	req->cpu_idx = 0;
+	req->word = htons(W_TCB_RX_QUIESCE);
+	req->mask = cpu_to_be64(1ULL << S_TCB_RX_QUIESCE);
+	req->val = 0;
+
+	skb->priority = CPL_PRIORITY_DATA;
+	return iwch_cxgb3_ofld_send(ep->com.tdev, skb);
+}
+
+static void set_emss(struct iwch_ep *ep, u16 opt)
+{
+	PDBG("%s ep %p opt %u\n", __func__, ep, opt);
+	ep->emss = T3C_DATA(ep->com.tdev)->mtus[G_TCPOPT_MSS(opt)] - 40;
+	if (G_TCPOPT_TSTAMP(opt))
+		ep->emss -= 12;
+	if (ep->emss < 128)
+		ep->emss = 128;
+	PDBG("emss=%d\n", ep->emss);
+}
+
+static enum iwch_ep_state state_read(struct iwch_ep_common *epc)
+{
+	unsigned long flags;
+	enum iwch_ep_state state;
+
+	spin_lock_irqsave(&epc->lock, flags);
+	state = epc->state;
+	spin_unlock_irqrestore(&epc->lock, flags);
+	return state;
+}
+
+static void __state_set(struct iwch_ep_common *epc, enum iwch_ep_state new)
+{
+	epc->state = new;
+}
+
+static void state_set(struct iwch_ep_common *epc, enum iwch_ep_state new)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&epc->lock, flags);
+	PDBG("%s - %s -> %s\n", __func__, states[epc->state], states[new]);
+	__state_set(epc, new);
+	spin_unlock_irqrestore(&epc->lock, flags);
+	return;
+}
+
+static void *alloc_ep(int size, gfp_t gfp)
+{
+	struct iwch_ep_common *epc;
+
+	epc = kzalloc(size, gfp);
+	if (epc) {
+		kref_init(&epc->kref);
+		spin_lock_init(&epc->lock);
+		init_waitqueue_head(&epc->waitq);
+	}
+	PDBG("%s alloc ep %p\n", __func__, epc);
+	return epc;
+}
+
+void __free_ep(struct kref *kref)
+{
+	struct iwch_ep *ep;
+	ep = container_of(container_of(kref, struct iwch_ep_common, kref),
+			  struct iwch_ep, com);
+	PDBG("%s ep %p state %s\n", __func__, ep, states[state_read(&ep->com)]);
+	if (test_bit(RELEASE_RESOURCES, &ep->com.flags)) {
+		cxgb3_remove_tid(ep->com.tdev, (void *)ep, ep->hwtid);
+		dst_release(ep->dst);
+		l2t_release(ep->com.tdev, ep->l2t);
+	}
+	kfree(ep);
+}
+
+static void release_ep_resources(struct iwch_ep *ep)
+{
+	PDBG("%s ep %p tid %d\n", __func__, ep, ep->hwtid);
+	set_bit(RELEASE_RESOURCES, &ep->com.flags);
+	put_ep(&ep->com);
+}
+
+static int status2errno(int status)
+{
+	switch (status) {
+	case CPL_ERR_NONE:
+		return 0;
+	case CPL_ERR_CONN_RESET:
+		return -ECONNRESET;
+	case CPL_ERR_ARP_MISS:
+		return -EHOSTUNREACH;
+	case CPL_ERR_CONN_TIMEDOUT:
+		return -ETIMEDOUT;
+	case CPL_ERR_TCAM_FULL:
+		return -ENOMEM;
+	case CPL_ERR_CONN_EXIST:
+		return -EADDRINUSE;
+	default:
+		return -EIO;
+	}
+}
+
+/*
+ * Try and reuse skbs already allocated...
+ */
+static struct sk_buff *get_skb(struct sk_buff *skb, int len, gfp_t gfp)
+{
+	if (skb && !skb_is_nonlinear(skb) && !skb_cloned(skb)) {
+		skb_trim(skb, 0);
+		skb_get(skb);
+	} else {
+		skb = alloc_skb(len, gfp);
+	}
+	return skb;
+}
+
+static struct rtable *find_route(struct t3cdev *dev, __be32 local_ip,
+				 __be32 peer_ip, __be16 local_port,
+				 __be16 peer_port, u8 tos)
+{
+	struct rtable *rt;
+	struct flowi4 fl4;
+
+	rt = ip_route_output_ports(&init_net, &fl4, NULL, peer_ip, local_ip,
+				   peer_port, local_port, IPPROTO_TCP,
+				   tos, 0);
+	if (IS_ERR(rt))
+		return NULL;
+	return rt;
+}
+
+static unsigned int find_best_mtu(const struct t3c_data *d, unsigned short mtu)
+{
+	int i = 0;
+
+	while (i < d->nmtus - 1 && d->mtus[i + 1] <= mtu)
+		++i;
+	return i;
+}
+
+static void arp_failure_discard(struct t3cdev *dev, struct sk_buff *skb)
+{
+	PDBG("%s t3cdev %p\n", __func__, dev);
+	kfree_skb(skb);
+}
+
+/*
+ * Handle an ARP failure for an active open.
+ */
+static void act_open_req_arp_failure(struct t3cdev *dev, struct sk_buff *skb)
+{
+	printk(KERN_ERR MOD "ARP failure duing connect\n");
+	kfree_skb(skb);
+}
+
+/*
+ * Handle an ARP failure for a CPL_ABORT_REQ.  Change it into a no RST variant
+ * and send it along.
+ */
+static void abort_arp_failure(struct t3cdev *dev, struct sk_buff *skb)
+{
+	struct cpl_abort_req *req = cplhdr(skb);
+
+	PDBG("%s t3cdev %p\n", __func__, dev);
+	req->cmd = CPL_ABORT_NO_RST;
+	iwch_cxgb3_ofld_send(dev, skb);
+}
+
+static int send_halfclose(struct iwch_ep *ep, gfp_t gfp)
+{
+	struct cpl_close_con_req *req;
+	struct sk_buff *skb;
+
+	PDBG("%s ep %p\n", __func__, ep);
+	skb = get_skb(NULL, sizeof(*req), gfp);
+	if (!skb) {
+		printk(KERN_ERR MOD "%s - failed to alloc skb\n", __func__);
+		return -ENOMEM;
+	}
+	skb->priority = CPL_PRIORITY_DATA;
+	set_arp_failure_handler(skb, arp_failure_discard);
+	req = (struct cpl_close_con_req *) skb_put(skb, sizeof(*req));
+	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON));
+	req->wr.wr_lo = htonl(V_WR_TID(ep->hwtid));
+	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, ep->hwtid));
+	return iwch_l2t_send(ep->com.tdev, skb, ep->l2t);
+}
+
+static int send_abort(struct iwch_ep *ep, struct sk_buff *skb, gfp_t gfp)
+{
+	struct cpl_abort_req *req;
+
+	PDBG("%s ep %p\n", __func__, ep);
+	skb = get_skb(skb, sizeof(*req), gfp);
+	if (!skb) {
+		printk(KERN_ERR MOD "%s - failed to alloc skb.\n",
+		       __func__);
+		return -ENOMEM;
+	}
+	skb->priority = CPL_PRIORITY_DATA;
+	set_arp_failure_handler(skb, abort_arp_failure);
+	req = (struct cpl_abort_req *) skb_put(skb, sizeof(*req));
+	memset(req, 0, sizeof(*req));
+	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ));
+	req->wr.wr_lo = htonl(V_WR_TID(ep->hwtid));
+	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, ep->hwtid));
+	req->cmd = CPL_ABORT_SEND_RST;
+	return iwch_l2t_send(ep->com.tdev, skb, ep->l2t);
+}
+
+static int send_connect(struct iwch_ep *ep)
+{
+	struct cpl_act_open_req *req;
+	struct sk_buff *skb;
+	u32 opt0h, opt0l, opt2;
+	unsigned int mtu_idx;
+	int wscale;
+
+	PDBG("%s ep %p\n", __func__, ep);
+
+	skb = get_skb(NULL, sizeof(*req), GFP_KERNEL);
+	if (!skb) {
+		printk(KERN_ERR MOD "%s - failed to alloc skb.\n",
+		       __func__);
+		return -ENOMEM;
+	}
+	mtu_idx = find_best_mtu(T3C_DATA(ep->com.tdev), dst_mtu(ep->dst));
+	wscale = compute_wscale(rcv_win);
+	opt0h = V_NAGLE(0) |
+	    V_NO_CONG(nocong) |
+	    V_KEEP_ALIVE(1) |
+	    F_TCAM_BYPASS |
+	    V_WND_SCALE(wscale) |
+	    V_MSS_IDX(mtu_idx) |
+	    V_L2T_IDX(ep->l2t->idx) | V_TX_CHANNEL(ep->l2t->smt_idx);
+	opt0l = V_TOS((ep->tos >> 2) & M_TOS) | V_RCV_BUFSIZ(rcv_win>>10);
+	opt2 = F_RX_COALESCE_VALID | V_RX_COALESCE(0) | V_FLAVORS_VALID(1) |
+	       V_CONG_CONTROL_FLAVOR(cong_flavor);
+	skb->priority = CPL_PRIORITY_SETUP;
+	set_arp_failure_handler(skb, act_open_req_arp_failure);
+
+	req = (struct cpl_act_open_req *) skb_put(skb, sizeof(*req));
+	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, ep->atid));
+	req->local_port = ep->com.local_addr.sin_port;
+	req->peer_port = ep->com.remote_addr.sin_port;
+	req->local_ip = ep->com.local_addr.sin_addr.s_addr;
+	req->peer_ip = ep->com.remote_addr.sin_addr.s_addr;
+	req->opt0h = htonl(opt0h);
+	req->opt0l = htonl(opt0l);
+	req->params = 0;
+	req->opt2 = htonl(opt2);
+	return iwch_l2t_send(ep->com.tdev, skb, ep->l2t);
+}
+
+static void send_mpa_req(struct iwch_ep *ep, struct sk_buff *skb)
+{
+	int mpalen;
+	struct tx_data_wr *req;
+	struct mpa_message *mpa;
+	int len;
+
+	PDBG("%s ep %p pd_len %d\n", __func__, ep, ep->plen);
+
+	BUG_ON(skb_cloned(skb));
+
+	mpalen = sizeof(*mpa) + ep->plen;
+	if (skb->data + mpalen + sizeof(*req) > skb_end_pointer(skb)) {
+		kfree_skb(skb);
+		skb=alloc_skb(mpalen + sizeof(*req), GFP_KERNEL);
+		if (!skb) {
+			connect_reply_upcall(ep, -ENOMEM);
+			return;
+		}
+	}
+	skb_trim(skb, 0);
+	skb_reserve(skb, sizeof(*req));
+	skb_put(skb, mpalen);
+	skb->priority = CPL_PRIORITY_DATA;
+	mpa = (struct mpa_message *) skb->data;
+	memset(mpa, 0, sizeof(*mpa));
+	memcpy(mpa->key, MPA_KEY_REQ, sizeof(mpa->key));
+	mpa->flags = (crc_enabled ? MPA_CRC : 0) |
+		     (markers_enabled ? MPA_MARKERS : 0);
+	mpa->private_data_size = htons(ep->plen);
+	mpa->revision = mpa_rev;
+
+	if (ep->plen)
+		memcpy(mpa->private_data, ep->mpa_pkt + sizeof(*mpa), ep->plen);
+
+	/*
+	 * Reference the mpa skb.  This ensures the data area
+	 * will remain in memory until the hw acks the tx.
+	 * Function tx_ack() will deref it.
+	 */
+	skb_get(skb);
+	set_arp_failure_handler(skb, arp_failure_discard);
+	skb_reset_transport_header(skb);
+	len = skb->len;
+	req = (struct tx_data_wr *) skb_push(skb, sizeof(*req));
+	req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA)|F_WR_COMPL);
+	req->wr_lo = htonl(V_WR_TID(ep->hwtid));
+	req->len = htonl(len);
+	req->param = htonl(V_TX_PORT(ep->l2t->smt_idx) |
+			   V_TX_SNDBUF(snd_win>>15));
+	req->flags = htonl(F_TX_INIT);
+	req->sndseq = htonl(ep->snd_seq);
+	BUG_ON(ep->mpa_skb);
+	ep->mpa_skb = skb;
+	iwch_l2t_send(ep->com.tdev, skb, ep->l2t);
+	start_ep_timer(ep);
+	state_set(&ep->com, MPA_REQ_SENT);
+	return;
+}
+
+static int send_mpa_reject(struct iwch_ep *ep, const void *pdata, u8 plen)
+{
+	int mpalen;
+	struct tx_data_wr *req;
+	struct mpa_message *mpa;
+	struct sk_buff *skb;
+
+	PDBG("%s ep %p plen %d\n", __func__, ep, plen);
+
+	mpalen = sizeof(*mpa) + plen;
+
+	skb = get_skb(NULL, mpalen + sizeof(*req), GFP_KERNEL);
+	if (!skb) {
+		printk(KERN_ERR MOD "%s - cannot alloc skb!\n", __func__);
+		return -ENOMEM;
+	}
+	skb_reserve(skb, sizeof(*req));
+	mpa = (struct mpa_message *) skb_put(skb, mpalen);
+	memset(mpa, 0, sizeof(*mpa));
+	memcpy(mpa->key, MPA_KEY_REP, sizeof(mpa->key));
+	mpa->flags = MPA_REJECT;
+	mpa->revision = mpa_rev;
+	mpa->private_data_size = htons(plen);
+	if (plen)
+		memcpy(mpa->private_data, pdata, plen);
+
+	/*
+	 * Reference the mpa skb again.  This ensures the data area
+	 * will remain in memory until the hw acks the tx.
+	 * Function tx_ack() will deref it.
+	 */
+	skb_get(skb);
+	skb->priority = CPL_PRIORITY_DATA;
+	set_arp_failure_handler(skb, arp_failure_discard);
+	skb_reset_transport_header(skb);
+	req = (struct tx_data_wr *) skb_push(skb, sizeof(*req));
+	req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA)|F_WR_COMPL);
+	req->wr_lo = htonl(V_WR_TID(ep->hwtid));
+	req->len = htonl(mpalen);
+	req->param = htonl(V_TX_PORT(ep->l2t->smt_idx) |
+			   V_TX_SNDBUF(snd_win>>15));
+	req->flags = htonl(F_TX_INIT);
+	req->sndseq = htonl(ep->snd_seq);
+	BUG_ON(ep->mpa_skb);
+	ep->mpa_skb = skb;
+	return iwch_l2t_send(ep->com.tdev, skb, ep->l2t);
+}
+
+static int send_mpa_reply(struct iwch_ep *ep, const void *pdata, u8 plen)
+{
+	int mpalen;
+	struct tx_data_wr *req;
+	struct mpa_message *mpa;
+	int len;
+	struct sk_buff *skb;
+
+	PDBG("%s ep %p plen %d\n", __func__, ep, plen);
+
+	mpalen = sizeof(*mpa) + plen;
+
+	skb = get_skb(NULL, mpalen + sizeof(*req), GFP_KERNEL);
+	if (!skb) {
+		printk(KERN_ERR MOD "%s - cannot alloc skb!\n", __func__);
+		return -ENOMEM;
+	}
+	skb->priority = CPL_PRIORITY_DATA;
+	skb_reserve(skb, sizeof(*req));
+	mpa = (struct mpa_message *) skb_put(skb, mpalen);
+	memset(mpa, 0, sizeof(*mpa));
+	memcpy(mpa->key, MPA_KEY_REP, sizeof(mpa->key));
+	mpa->flags = (ep->mpa_attr.crc_enabled ? MPA_CRC : 0) |
+		     (markers_enabled ? MPA_MARKERS : 0);
+	mpa->revision = mpa_rev;
+	mpa->private_data_size = htons(plen);
+	if (plen)
+		memcpy(mpa->private_data, pdata, plen);
+
+	/*
+	 * Reference the mpa skb.  This ensures the data area
+	 * will remain in memory until the hw acks the tx.
+	 * Function tx_ack() will deref it.
+	 */
+	skb_get(skb);
+	set_arp_failure_handler(skb, arp_failure_discard);
+	skb_reset_transport_header(skb);
+	len = skb->len;
+	req = (struct tx_data_wr *) skb_push(skb, sizeof(*req));
+	req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA)|F_WR_COMPL);
+	req->wr_lo = htonl(V_WR_TID(ep->hwtid));
+	req->len = htonl(len);
+	req->param = htonl(V_TX_PORT(ep->l2t->smt_idx) |
+			   V_TX_SNDBUF(snd_win>>15));
+	req->flags = htonl(F_TX_INIT);
+	req->sndseq = htonl(ep->snd_seq);
+	ep->mpa_skb = skb;
+	state_set(&ep->com, MPA_REP_SENT);
+	return iwch_l2t_send(ep->com.tdev, skb, ep->l2t);
+}
+
+static int act_establish(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
+{
+	struct iwch_ep *ep = ctx;
+	struct cpl_act_establish *req = cplhdr(skb);
+	unsigned int tid = GET_TID(req);
+
+	PDBG("%s ep %p tid %d\n", __func__, ep, tid);
+
+	dst_confirm(ep->dst);
+
+	/* setup the hwtid for this connection */
+	ep->hwtid = tid;
+	cxgb3_insert_tid(ep->com.tdev, &t3c_client, ep, tid);
+
+	ep->snd_seq = ntohl(req->snd_isn);
+	ep->rcv_seq = ntohl(req->rcv_isn);
+
+	set_emss(ep, ntohs(req->tcp_opt));
+
+	/* dealloc the atid */
+	cxgb3_free_atid(ep->com.tdev, ep->atid);
+
+	/* start MPA negotiation */
+	send_mpa_req(ep, skb);
+
+	return 0;
+}
+
+static void abort_connection(struct iwch_ep *ep, struct sk_buff *skb, gfp_t gfp)
+{
+	PDBG("%s ep %p\n", __FILE__, ep);
+	state_set(&ep->com, ABORTING);
+	send_abort(ep, skb, gfp);
+}
+
+static void close_complete_upcall(struct iwch_ep *ep)
+{
+	struct iw_cm_event event;
+
+	PDBG("%s ep %p\n", __func__, ep);
+	memset(&event, 0, sizeof(event));
+	event.event = IW_CM_EVENT_CLOSE;
+	if (ep->com.cm_id) {
+		PDBG("close complete delivered ep %p cm_id %p tid %d\n",
+		     ep, ep->com.cm_id, ep->hwtid);
+		ep->com.cm_id->event_handler(ep->com.cm_id, &event);
+		ep->com.cm_id->rem_ref(ep->com.cm_id);
+		ep->com.cm_id = NULL;
+		ep->com.qp = NULL;
+	}
+}
+
+static void peer_close_upcall(struct iwch_ep *ep)
+{
+	struct iw_cm_event event;
+
+	PDBG("%s ep %p\n", __func__, ep);
+	memset(&event, 0, sizeof(event));
+	event.event = IW_CM_EVENT_DISCONNECT;
+	if (ep->com.cm_id) {
+		PDBG("peer close delivered ep %p cm_id %p tid %d\n",
+		     ep, ep->com.cm_id, ep->hwtid);
+		ep->com.cm_id->event_handler(ep->com.cm_id, &event);
+	}
+}
+
+static void peer_abort_upcall(struct iwch_ep *ep)
+{
+	struct iw_cm_event event;
+
+	PDBG("%s ep %p\n", __func__, ep);
+	memset(&event, 0, sizeof(event));
+	event.event = IW_CM_EVENT_CLOSE;
+	event.status = -ECONNRESET;
+	if (ep->com.cm_id) {
+		PDBG("abort delivered ep %p cm_id %p tid %d\n", ep,
+		     ep->com.cm_id, ep->hwtid);
+		ep->com.cm_id->event_handler(ep->com.cm_id, &event);
+		ep->com.cm_id->rem_ref(ep->com.cm_id);
+		ep->com.cm_id = NULL;
+		ep->com.qp = NULL;
+	}
+}
+
+static void connect_reply_upcall(struct iwch_ep *ep, int status)
+{
+	struct iw_cm_event event;
+
+	PDBG("%s ep %p status %d\n", __func__, ep, status);
+	memset(&event, 0, sizeof(event));
+	event.event = IW_CM_EVENT_CONNECT_REPLY;
+	event.status = status;
+	memcpy(&event.local_addr, &ep->com.local_addr,
+	       sizeof(ep->com.local_addr));
+	memcpy(&event.remote_addr, &ep->com.remote_addr,
+	       sizeof(ep->com.remote_addr));
+
+	if ((status == 0) || (status == -ECONNREFUSED)) {
+		event.private_data_len = ep->plen;
+		event.private_data = ep->mpa_pkt + sizeof(struct mpa_message);
+	}
+	if (ep->com.cm_id) {
+		PDBG("%s ep %p tid %d status %d\n", __func__, ep,
+		     ep->hwtid, status);
+		ep->com.cm_id->event_handler(ep->com.cm_id, &event);
+	}
+	if (status < 0) {
+		ep->com.cm_id->rem_ref(ep->com.cm_id);
+		ep->com.cm_id = NULL;
+		ep->com.qp = NULL;
+	}
+}
+
+static void connect_request_upcall(struct iwch_ep *ep)
+{
+	struct iw_cm_event event;
+
+	PDBG("%s ep %p tid %d\n", __func__, ep, ep->hwtid);
+	memset(&event, 0, sizeof(event));
+	event.event = IW_CM_EVENT_CONNECT_REQUEST;
+	memcpy(&event.local_addr, &ep->com.local_addr,
+	       sizeof(ep->com.local_addr));
+	memcpy(&event.remote_addr, &ep->com.remote_addr,
+	       sizeof(ep->com.local_addr));
+	event.private_data_len = ep->plen;
+	event.private_data = ep->mpa_pkt + sizeof(struct mpa_message);
+	event.provider_data = ep;
+	/*
+	 * Until ird/ord negotiation via MPAv2 support is added, send max
+	 * supported values
+	 */
+	event.ird = event.ord = 8;
+	if (state_read(&ep->parent_ep->com) != DEAD) {
+		get_ep(&ep->com);
+		ep->parent_ep->com.cm_id->event_handler(
+						ep->parent_ep->com.cm_id,
+						&event);
+	}
+	put_ep(&ep->parent_ep->com);
+	ep->parent_ep = NULL;
+}
+
+static void established_upcall(struct iwch_ep *ep)
+{
+	struct iw_cm_event event;
+
+	PDBG("%s ep %p\n", __func__, ep);
+	memset(&event, 0, sizeof(event));
+	event.event = IW_CM_EVENT_ESTABLISHED;
+	/*
+	 * Until ird/ord negotiation via MPAv2 support is added, send max
+	 * supported values
+	 */
+	event.ird = event.ord = 8;
+	if (ep->com.cm_id) {
+		PDBG("%s ep %p tid %d\n", __func__, ep, ep->hwtid);
+		ep->com.cm_id->event_handler(ep->com.cm_id, &event);
+	}
+}
+
+static int update_rx_credits(struct iwch_ep *ep, u32 credits)
+{
+	struct cpl_rx_data_ack *req;
+	struct sk_buff *skb;
+
+	PDBG("%s ep %p credits %u\n", __func__, ep, credits);
+	skb = get_skb(NULL, sizeof(*req), GFP_KERNEL);
+	if (!skb) {
+		printk(KERN_ERR MOD "update_rx_credits - cannot alloc skb!\n");
+		return 0;
+	}
+
+	req = (struct cpl_rx_data_ack *) skb_put(skb, sizeof(*req));
+	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, ep->hwtid));
+	req->credit_dack = htonl(V_RX_CREDITS(credits) | V_RX_FORCE_ACK(1));
+	skb->priority = CPL_PRIORITY_ACK;
+	iwch_cxgb3_ofld_send(ep->com.tdev, skb);
+	return credits;
+}
+
+static void process_mpa_reply(struct iwch_ep *ep, struct sk_buff *skb)
+{
+	struct mpa_message *mpa;
+	u16 plen;
+	struct iwch_qp_attributes attrs;
+	enum iwch_qp_attr_mask mask;
+	int err;
+
+	PDBG("%s ep %p\n", __func__, ep);
+
+	/*
+	 * Stop mpa timer.  If it expired, then the state has
+	 * changed and we bail since ep_timeout already aborted
+	 * the connection.
+	 */
+	stop_ep_timer(ep);
+	if (state_read(&ep->com) != MPA_REQ_SENT)
+		return;
+
+	/*
+	 * If we get more than the supported amount of private data
+	 * then we must fail this connection.
+	 */
+	if (ep->mpa_pkt_len + skb->len > sizeof(ep->mpa_pkt)) {
+		err = -EINVAL;
+		goto err;
+	}
+
+	/*
+	 * copy the new data into our accumulation buffer.
+	 */
+	skb_copy_from_linear_data(skb, &(ep->mpa_pkt[ep->mpa_pkt_len]),
+				  skb->len);
+	ep->mpa_pkt_len += skb->len;
+
+	/*
+	 * if we don't even have the mpa message, then bail.
+	 */
+	if (ep->mpa_pkt_len < sizeof(*mpa))
+		return;
+	mpa = (struct mpa_message *) ep->mpa_pkt;
+
+	/* Validate MPA header. */
+	if (mpa->revision != mpa_rev) {
+		err = -EPROTO;
+		goto err;
+	}
+	if (memcmp(mpa->key, MPA_KEY_REP, sizeof(mpa->key))) {
+		err = -EPROTO;
+		goto err;
+	}
+
+	plen = ntohs(mpa->private_data_size);
+
+	/*
+	 * Fail if there's too much private data.
+	 */
+	if (plen > MPA_MAX_PRIVATE_DATA) {
+		err = -EPROTO;
+		goto err;
+	}
+
+	/*
+	 * If plen does not account for pkt size
+	 */
+	if (ep->mpa_pkt_len > (sizeof(*mpa) + plen)) {
+		err = -EPROTO;
+		goto err;
+	}
+
+	ep->plen = (u8) plen;
+
+	/*
+	 * If we don't have all the pdata yet, then bail.
+	 * We'll continue process when more data arrives.
+	 */
+	if (ep->mpa_pkt_len < (sizeof(*mpa) + plen))
+		return;
+
+	if (mpa->flags & MPA_REJECT) {
+		err = -ECONNREFUSED;
+		goto err;
+	}
+
+	/*
+	 * If we get here we have accumulated the entire mpa
+	 * start reply message including private data. And
+	 * the MPA header is valid.
+	 */
+	state_set(&ep->com, FPDU_MODE);
+	ep->mpa_attr.initiator = 1;
+	ep->mpa_attr.crc_enabled = (mpa->flags & MPA_CRC) | crc_enabled ? 1 : 0;
+	ep->mpa_attr.recv_marker_enabled = markers_enabled;
+	ep->mpa_attr.xmit_marker_enabled = mpa->flags & MPA_MARKERS ? 1 : 0;
+	ep->mpa_attr.version = mpa_rev;
+	PDBG("%s - crc_enabled=%d, recv_marker_enabled=%d, "
+	     "xmit_marker_enabled=%d, version=%d\n", __func__,
+	     ep->mpa_attr.crc_enabled, ep->mpa_attr.recv_marker_enabled,
+	     ep->mpa_attr.xmit_marker_enabled, ep->mpa_attr.version);
+
+	attrs.mpa_attr = ep->mpa_attr;
+	attrs.max_ird = ep->ird;
+	attrs.max_ord = ep->ord;
+	attrs.llp_stream_handle = ep;
+	attrs.next_state = IWCH_QP_STATE_RTS;
+
+	mask = IWCH_QP_ATTR_NEXT_STATE |
+	    IWCH_QP_ATTR_LLP_STREAM_HANDLE | IWCH_QP_ATTR_MPA_ATTR |
+	    IWCH_QP_ATTR_MAX_IRD | IWCH_QP_ATTR_MAX_ORD;
+
+	/* bind QP and TID with INIT_WR */
+	err = iwch_modify_qp(ep->com.qp->rhp,
+			     ep->com.qp, mask, &attrs, 1);
+	if (err)
+		goto err;
+
+	if (peer2peer && iwch_rqes_posted(ep->com.qp) == 0) {
+		iwch_post_zb_read(ep);
+	}
+
+	goto out;
+err:
+	abort_connection(ep, skb, GFP_KERNEL);
+out:
+	connect_reply_upcall(ep, err);
+	return;
+}
+
+static void process_mpa_request(struct iwch_ep *ep, struct sk_buff *skb)
+{
+	struct mpa_message *mpa;
+	u16 plen;
+
+	PDBG("%s ep %p\n", __func__, ep);
+
+	/*
+	 * Stop mpa timer.  If it expired, then the state has
+	 * changed and we bail since ep_timeout already aborted
+	 * the connection.
+	 */
+	stop_ep_timer(ep);
+	if (state_read(&ep->com) != MPA_REQ_WAIT)
+		return;
+
+	/*
+	 * If we get more than the supported amount of private data
+	 * then we must fail this connection.
+	 */
+	if (ep->mpa_pkt_len + skb->len > sizeof(ep->mpa_pkt)) {
+		abort_connection(ep, skb, GFP_KERNEL);
+		return;
+	}
+
+	PDBG("%s enter (%s line %u)\n", __func__, __FILE__, __LINE__);
+
+	/*
+	 * Copy the new data into our accumulation buffer.
+	 */
+	skb_copy_from_linear_data(skb, &(ep->mpa_pkt[ep->mpa_pkt_len]),
+				  skb->len);
+	ep->mpa_pkt_len += skb->len;
+
+	/*
+	 * If we don't even have the mpa message, then bail.
+	 * We'll continue process when more data arrives.
+	 */
+	if (ep->mpa_pkt_len < sizeof(*mpa))
+		return;
+	PDBG("%s enter (%s line %u)\n", __func__, __FILE__, __LINE__);
+	mpa = (struct mpa_message *) ep->mpa_pkt;
+
+	/*
+	 * Validate MPA Header.
+	 */
+	if (mpa->revision != mpa_rev) {
+		abort_connection(ep, skb, GFP_KERNEL);
+		return;
+	}
+
+	if (memcmp(mpa->key, MPA_KEY_REQ, sizeof(mpa->key))) {
+		abort_connection(ep, skb, GFP_KERNEL);
+		return;
+	}
+
+	plen = ntohs(mpa->private_data_size);
+
+	/*
+	 * Fail if there's too much private data.
+	 */
+	if (plen > MPA_MAX_PRIVATE_DATA) {
+		abort_connection(ep, skb, GFP_KERNEL);
+		return;
+	}
+
+	/*
+	 * If plen does not account for pkt size
+	 */
+	if (ep->mpa_pkt_len > (sizeof(*mpa) + plen)) {
+		abort_connection(ep, skb, GFP_KERNEL);
+		return;
+	}
+	ep->plen = (u8) plen;
+
+	/*
+	 * If we don't have all the pdata yet, then bail.
+	 */
+	if (ep->mpa_pkt_len < (sizeof(*mpa) + plen))
+		return;
+
+	/*
+	 * If we get here we have accumulated the entire mpa
+	 * start reply message including private data.
+	 */
+	ep->mpa_attr.initiator = 0;
+	ep->mpa_attr.crc_enabled = (mpa->flags & MPA_CRC) | crc_enabled ? 1 : 0;
+	ep->mpa_attr.recv_marker_enabled = markers_enabled;
+	ep->mpa_attr.xmit_marker_enabled = mpa->flags & MPA_MARKERS ? 1 : 0;
+	ep->mpa_attr.version = mpa_rev;
+	PDBG("%s - crc_enabled=%d, recv_marker_enabled=%d, "
+	     "xmit_marker_enabled=%d, version=%d\n", __func__,
+	     ep->mpa_attr.crc_enabled, ep->mpa_attr.recv_marker_enabled,
+	     ep->mpa_attr.xmit_marker_enabled, ep->mpa_attr.version);
+
+	state_set(&ep->com, MPA_REQ_RCVD);
+
+	/* drive upcall */
+	connect_request_upcall(ep);
+	return;
+}
+
+static int rx_data(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
+{
+	struct iwch_ep *ep = ctx;
+	struct cpl_rx_data *hdr = cplhdr(skb);
+	unsigned int dlen = ntohs(hdr->len);
+
+	PDBG("%s ep %p dlen %u\n", __func__, ep, dlen);
+
+	skb_pull(skb, sizeof(*hdr));
+	skb_trim(skb, dlen);
+
+	ep->rcv_seq += dlen;
+	BUG_ON(ep->rcv_seq != (ntohl(hdr->seq) + dlen));
+
+	switch (state_read(&ep->com)) {
+	case MPA_REQ_SENT:
+		process_mpa_reply(ep, skb);
+		break;
+	case MPA_REQ_WAIT:
+		process_mpa_request(ep, skb);
+		break;
+	case MPA_REP_SENT:
+		break;
+	default:
+		printk(KERN_ERR MOD "%s Unexpected streaming data."
+		       " ep %p state %d tid %d\n",
+		       __func__, ep, state_read(&ep->com), ep->hwtid);
+
+		/*
+		 * The ep will timeout and inform the ULP of the failure.
+		 * See ep_timeout().
+		 */
+		break;
+	}
+
+	/* update RX credits */
+	update_rx_credits(ep, dlen);
+
+	return CPL_RET_BUF_DONE;
+}
+
+/*
+ * Upcall from the adapter indicating data has been transmitted.
+ * For us its just the single MPA request or reply.  We can now free
+ * the skb holding the mpa message.
+ */
+static int tx_ack(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
+{
+	struct iwch_ep *ep = ctx;
+	struct cpl_wr_ack *hdr = cplhdr(skb);
+	unsigned int credits = ntohs(hdr->credits);
+	unsigned long flags;
+	int post_zb = 0;
+
+	PDBG("%s ep %p credits %u\n", __func__, ep, credits);
+
+	if (credits == 0) {
+		PDBG("%s 0 credit ack  ep %p state %u\n",
+		     __func__, ep, state_read(&ep->com));
+		return CPL_RET_BUF_DONE;
+	}
+
+	spin_lock_irqsave(&ep->com.lock, flags);
+	BUG_ON(credits != 1);
+	dst_confirm(ep->dst);
+	if (!ep->mpa_skb) {
+		PDBG("%s rdma_init wr_ack ep %p state %u\n",
+			__func__, ep, ep->com.state);
+		if (ep->mpa_attr.initiator) {
+			PDBG("%s initiator ep %p state %u\n",
+				__func__, ep, ep->com.state);
+			if (peer2peer && ep->com.state == FPDU_MODE)
+				post_zb = 1;
+		} else {
+			PDBG("%s responder ep %p state %u\n",
+				__func__, ep, ep->com.state);
+			if (ep->com.state == MPA_REQ_RCVD) {
+				ep->com.rpl_done = 1;
+				wake_up(&ep->com.waitq);
+			}
+		}
+	} else {
+		PDBG("%s lsm ack ep %p state %u freeing skb\n",
+			__func__, ep, ep->com.state);
+		kfree_skb(ep->mpa_skb);
+		ep->mpa_skb = NULL;
+	}
+	spin_unlock_irqrestore(&ep->com.lock, flags);
+	if (post_zb)
+		iwch_post_zb_read(ep);
+	return CPL_RET_BUF_DONE;
+}
+
+static int abort_rpl(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
+{
+	struct iwch_ep *ep = ctx;
+	unsigned long flags;
+	int release = 0;
+
+	PDBG("%s ep %p\n", __func__, ep);
+	BUG_ON(!ep);
+
+	/*
+	 * We get 2 abort replies from the HW.  The first one must
+	 * be ignored except for scribbling that we need one more.
+	 */
+	if (!test_and_set_bit(ABORT_REQ_IN_PROGRESS, &ep->com.flags)) {
+		return CPL_RET_BUF_DONE;
+	}
+
+	spin_lock_irqsave(&ep->com.lock, flags);
+	switch (ep->com.state) {
+	case ABORTING:
+		close_complete_upcall(ep);
+		__state_set(&ep->com, DEAD);
+		release = 1;
+		break;
+	default:
+		printk(KERN_ERR "%s ep %p state %d\n",
+		     __func__, ep, ep->com.state);
+		break;
+	}
+	spin_unlock_irqrestore(&ep->com.lock, flags);
+
+	if (release)
+		release_ep_resources(ep);
+	return CPL_RET_BUF_DONE;
+}
+
+/*
+ * Return whether a failed active open has allocated a TID
+ */
+static inline int act_open_has_tid(int status)
+{
+	return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST &&
+	       status != CPL_ERR_ARP_MISS;
+}
+
+static int act_open_rpl(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
+{
+	struct iwch_ep *ep = ctx;
+	struct cpl_act_open_rpl *rpl = cplhdr(skb);
+
+	PDBG("%s ep %p status %u errno %d\n", __func__, ep, rpl->status,
+	     status2errno(rpl->status));
+	connect_reply_upcall(ep, status2errno(rpl->status));
+	state_set(&ep->com, DEAD);
+	if (ep->com.tdev->type != T3A && act_open_has_tid(rpl->status))
+		release_tid(ep->com.tdev, GET_TID(rpl), NULL);
+	cxgb3_free_atid(ep->com.tdev, ep->atid);
+	dst_release(ep->dst);
+	l2t_release(ep->com.tdev, ep->l2t);
+	put_ep(&ep->com);
+	return CPL_RET_BUF_DONE;
+}
+
+static int listen_start(struct iwch_listen_ep *ep)
+{
+	struct sk_buff *skb;
+	struct cpl_pass_open_req *req;
+
+	PDBG("%s ep %p\n", __func__, ep);
+	skb = get_skb(NULL, sizeof(*req), GFP_KERNEL);
+	if (!skb) {
+		printk(KERN_ERR MOD "t3c_listen_start failed to alloc skb!\n");
+		return -ENOMEM;
+	}
+
+	req = (struct cpl_pass_open_req *) skb_put(skb, sizeof(*req));
+	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_PASS_OPEN_REQ, ep->stid));
+	req->local_port = ep->com.local_addr.sin_port;
+	req->local_ip = ep->com.local_addr.sin_addr.s_addr;
+	req->peer_port = 0;
+	req->peer_ip = 0;
+	req->peer_netmask = 0;
+	req->opt0h = htonl(F_DELACK | F_TCAM_BYPASS);
+	req->opt0l = htonl(V_RCV_BUFSIZ(rcv_win>>10));
+	req->opt1 = htonl(V_CONN_POLICY(CPL_CONN_POLICY_ASK));
+
+	skb->priority = 1;
+	return iwch_cxgb3_ofld_send(ep->com.tdev, skb);
+}
+
+static int pass_open_rpl(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
+{
+	struct iwch_listen_ep *ep = ctx;
+	struct cpl_pass_open_rpl *rpl = cplhdr(skb);
+
+	PDBG("%s ep %p status %d error %d\n", __func__, ep,
+	     rpl->status, status2errno(rpl->status));
+	ep->com.rpl_err = status2errno(rpl->status);
+	ep->com.rpl_done = 1;
+	wake_up(&ep->com.waitq);
+
+	return CPL_RET_BUF_DONE;
+}
+
+static int listen_stop(struct iwch_listen_ep *ep)
+{
+	struct sk_buff *skb;
+	struct cpl_close_listserv_req *req;
+
+	PDBG("%s ep %p\n", __func__, ep);
+	skb = get_skb(NULL, sizeof(*req), GFP_KERNEL);
+	if (!skb) {
+		printk(KERN_ERR MOD "%s - failed to alloc skb\n", __func__);
+		return -ENOMEM;
+	}
+	req = (struct cpl_close_listserv_req *) skb_put(skb, sizeof(*req));
+	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+	req->cpu_idx = 0;
+	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ, ep->stid));
+	skb->priority = 1;
+	return iwch_cxgb3_ofld_send(ep->com.tdev, skb);
+}
+
+static int close_listsrv_rpl(struct t3cdev *tdev, struct sk_buff *skb,
+			     void *ctx)
+{
+	struct iwch_listen_ep *ep = ctx;
+	struct cpl_close_listserv_rpl *rpl = cplhdr(skb);
+
+	PDBG("%s ep %p\n", __func__, ep);
+	ep->com.rpl_err = status2errno(rpl->status);
+	ep->com.rpl_done = 1;
+	wake_up(&ep->com.waitq);
+	return CPL_RET_BUF_DONE;
+}
+
+static void accept_cr(struct iwch_ep *ep, __be32 peer_ip, struct sk_buff *skb)
+{
+	struct cpl_pass_accept_rpl *rpl;
+	unsigned int mtu_idx;
+	u32 opt0h, opt0l, opt2;
+	int wscale;
+
+	PDBG("%s ep %p\n", __func__, ep);
+	BUG_ON(skb_cloned(skb));
+	skb_trim(skb, sizeof(*rpl));
+	skb_get(skb);
+	mtu_idx = find_best_mtu(T3C_DATA(ep->com.tdev), dst_mtu(ep->dst));
+	wscale = compute_wscale(rcv_win);
+	opt0h = V_NAGLE(0) |
+	    V_NO_CONG(nocong) |
+	    V_KEEP_ALIVE(1) |
+	    F_TCAM_BYPASS |
+	    V_WND_SCALE(wscale) |
+	    V_MSS_IDX(mtu_idx) |
+	    V_L2T_IDX(ep->l2t->idx) | V_TX_CHANNEL(ep->l2t->smt_idx);
+	opt0l = V_TOS((ep->tos >> 2) & M_TOS) | V_RCV_BUFSIZ(rcv_win>>10);
+	opt2 = F_RX_COALESCE_VALID | V_RX_COALESCE(0) | V_FLAVORS_VALID(1) |
+	       V_CONG_CONTROL_FLAVOR(cong_flavor);
+
+	rpl = cplhdr(skb);
+	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, ep->hwtid));
+	rpl->peer_ip = peer_ip;
+	rpl->opt0h = htonl(opt0h);
+	rpl->opt0l_status = htonl(opt0l | CPL_PASS_OPEN_ACCEPT);
+	rpl->opt2 = htonl(opt2);
+	rpl->rsvd = rpl->opt2;	/* workaround for HW bug */
+	skb->priority = CPL_PRIORITY_SETUP;
+	iwch_l2t_send(ep->com.tdev, skb, ep->l2t);
+
+	return;
+}
+
+static void reject_cr(struct t3cdev *tdev, u32 hwtid, __be32 peer_ip,
+		      struct sk_buff *skb)
+{
+	PDBG("%s t3cdev %p tid %u peer_ip %x\n", __func__, tdev, hwtid,
+	     peer_ip);
+	BUG_ON(skb_cloned(skb));
+	skb_trim(skb, sizeof(struct cpl_tid_release));
+	skb_get(skb);
+
+	if (tdev->type != T3A)
+		release_tid(tdev, hwtid, skb);
+	else {
+		struct cpl_pass_accept_rpl *rpl;
+
+		rpl = cplhdr(skb);
+		skb->priority = CPL_PRIORITY_SETUP;
+		rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+		OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL,
+						      hwtid));
+		rpl->peer_ip = peer_ip;
+		rpl->opt0h = htonl(F_TCAM_BYPASS);
+		rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
+		rpl->opt2 = 0;
+		rpl->rsvd = rpl->opt2;
+		iwch_cxgb3_ofld_send(tdev, skb);
+	}
+}
+
+static int pass_accept_req(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
+{
+	struct iwch_ep *child_ep, *parent_ep = ctx;
+	struct cpl_pass_accept_req *req = cplhdr(skb);
+	unsigned int hwtid = GET_TID(req);
+	struct dst_entry *dst;
+	struct l2t_entry *l2t;
+	struct rtable *rt;
+	struct iff_mac tim;
+
+	PDBG("%s parent ep %p tid %u\n", __func__, parent_ep, hwtid);
+
+	if (state_read(&parent_ep->com) != LISTEN) {
+		printk(KERN_ERR "%s - listening ep not in LISTEN\n",
+		       __func__);
+		goto reject;
+	}
+
+	/*
+	 * Find the netdev for this connection request.
+	 */
+	tim.mac_addr = req->dst_mac;
+	tim.vlan_tag = ntohs(req->vlan_tag);
+	if (tdev->ctl(tdev, GET_IFF_FROM_MAC, &tim) < 0 || !tim.dev) {
+		printk(KERN_ERR "%s bad dst mac %pM\n",
+			__func__, req->dst_mac);
+		goto reject;
+	}
+
+	/* Find output route */
+	rt = find_route(tdev,
+			req->local_ip,
+			req->peer_ip,
+			req->local_port,
+			req->peer_port, G_PASS_OPEN_TOS(ntohl(req->tos_tid)));
+	if (!rt) {
+		printk(KERN_ERR MOD "%s - failed to find dst entry!\n",
+		       __func__);
+		goto reject;
+	}
+	dst = &rt->dst;
+	l2t = t3_l2t_get(tdev, dst, NULL, &req->peer_ip);
+	if (!l2t) {
+		printk(KERN_ERR MOD "%s - failed to allocate l2t entry!\n",
+		       __func__);
+		dst_release(dst);
+		goto reject;
+	}
+	child_ep = alloc_ep(sizeof(*child_ep), GFP_KERNEL);
+	if (!child_ep) {
+		printk(KERN_ERR MOD "%s - failed to allocate ep entry!\n",
+		       __func__);
+		l2t_release(tdev, l2t);
+		dst_release(dst);
+		goto reject;
+	}
+	state_set(&child_ep->com, CONNECTING);
+	child_ep->com.tdev = tdev;
+	child_ep->com.cm_id = NULL;
+	child_ep->com.local_addr.sin_family = PF_INET;
+	child_ep->com.local_addr.sin_port = req->local_port;
+	child_ep->com.local_addr.sin_addr.s_addr = req->local_ip;
+	child_ep->com.remote_addr.sin_family = PF_INET;
+	child_ep->com.remote_addr.sin_port = req->peer_port;
+	child_ep->com.remote_addr.sin_addr.s_addr = req->peer_ip;
+	get_ep(&parent_ep->com);
+	child_ep->parent_ep = parent_ep;
+	child_ep->tos = G_PASS_OPEN_TOS(ntohl(req->tos_tid));
+	child_ep->l2t = l2t;
+	child_ep->dst = dst;
+	child_ep->hwtid = hwtid;
+	init_timer(&child_ep->timer);
+	cxgb3_insert_tid(tdev, &t3c_client, child_ep, hwtid);
+	accept_cr(child_ep, req->peer_ip, skb);
+	goto out;
+reject:
+	reject_cr(tdev, hwtid, req->peer_ip, skb);
+out:
+	return CPL_RET_BUF_DONE;
+}
+
+static int pass_establish(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
+{
+	struct iwch_ep *ep = ctx;
+	struct cpl_pass_establish *req = cplhdr(skb);
+
+	PDBG("%s ep %p\n", __func__, ep);
+	ep->snd_seq = ntohl(req->snd_isn);
+	ep->rcv_seq = ntohl(req->rcv_isn);
+
+	set_emss(ep, ntohs(req->tcp_opt));
+
+	dst_confirm(ep->dst);
+	state_set(&ep->com, MPA_REQ_WAIT);
+	start_ep_timer(ep);
+
+	return CPL_RET_BUF_DONE;
+}
+
+static int peer_close(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
+{
+	struct iwch_ep *ep = ctx;
+	struct iwch_qp_attributes attrs;
+	unsigned long flags;
+	int disconnect = 1;
+	int release = 0;
+
+	PDBG("%s ep %p\n", __func__, ep);
+	dst_confirm(ep->dst);
+
+	spin_lock_irqsave(&ep->com.lock, flags);
+	switch (ep->com.state) {
+	case MPA_REQ_WAIT:
+		__state_set(&ep->com, CLOSING);
+		break;
+	case MPA_REQ_SENT:
+		__state_set(&ep->com, CLOSING);
+		connect_reply_upcall(ep, -ECONNRESET);
+		break;
+	case MPA_REQ_RCVD:
+
+		/*
+		 * We're gonna mark this puppy DEAD, but keep
+		 * the reference on it until the ULP accepts or
+		 * rejects the CR. Also wake up anyone waiting
+		 * in rdma connection migration (see iwch_accept_cr()).
+		 */
+		__state_set(&ep->com, CLOSING);
+		ep->com.rpl_done = 1;
+		ep->com.rpl_err = -ECONNRESET;
+		PDBG("waking up ep %p\n", ep);
+		wake_up(&ep->com.waitq);
+		break;
+	case MPA_REP_SENT:
+		__state_set(&ep->com, CLOSING);
+		ep->com.rpl_done = 1;
+		ep->com.rpl_err = -ECONNRESET;
+		PDBG("waking up ep %p\n", ep);
+		wake_up(&ep->com.waitq);
+		break;
+	case FPDU_MODE:
+		start_ep_timer(ep);
+		__state_set(&ep->com, CLOSING);
+		attrs.next_state = IWCH_QP_STATE_CLOSING;
+		iwch_modify_qp(ep->com.qp->rhp, ep->com.qp,
+			       IWCH_QP_ATTR_NEXT_STATE, &attrs, 1);
+		peer_close_upcall(ep);
+		break;
+	case ABORTING:
+		disconnect = 0;
+		break;
+	case CLOSING:
+		__state_set(&ep->com, MORIBUND);
+		disconnect = 0;
+		break;
+	case MORIBUND:
+		stop_ep_timer(ep);
+		if (ep->com.cm_id && ep->com.qp) {
+			attrs.next_state = IWCH_QP_STATE_IDLE;
+			iwch_modify_qp(ep->com.qp->rhp, ep->com.qp,
+				       IWCH_QP_ATTR_NEXT_STATE, &attrs, 1);
+		}
+		close_complete_upcall(ep);
+		__state_set(&ep->com, DEAD);
+		release = 1;
+		disconnect = 0;
+		break;
+	case DEAD:
+		disconnect = 0;
+		break;
+	default:
+		BUG_ON(1);
+	}
+	spin_unlock_irqrestore(&ep->com.lock, flags);
+	if (disconnect)
+		iwch_ep_disconnect(ep, 0, GFP_KERNEL);
+	if (release)
+		release_ep_resources(ep);
+	return CPL_RET_BUF_DONE;
+}
+
+/*
+ * Returns whether an ABORT_REQ_RSS message is a negative advice.
+ */
+static int is_neg_adv_abort(unsigned int status)
+{
+	return status == CPL_ERR_RTX_NEG_ADVICE ||
+	       status == CPL_ERR_PERSIST_NEG_ADVICE;
+}
+
+static int peer_abort(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
+{
+	struct cpl_abort_req_rss *req = cplhdr(skb);
+	struct iwch_ep *ep = ctx;
+	struct cpl_abort_rpl *rpl;
+	struct sk_buff *rpl_skb;
+	struct iwch_qp_attributes attrs;
+	int ret;
+	int release = 0;
+	unsigned long flags;
+
+	if (is_neg_adv_abort(req->status)) {
+		PDBG("%s neg_adv_abort ep %p tid %d\n", __func__, ep,
+		     ep->hwtid);
+		t3_l2t_send_event(ep->com.tdev, ep->l2t);
+		return CPL_RET_BUF_DONE;
+	}
+
+	/*
+	 * We get 2 peer aborts from the HW.  The first one must
+	 * be ignored except for scribbling that we need one more.
+	 */
+	if (!test_and_set_bit(PEER_ABORT_IN_PROGRESS, &ep->com.flags)) {
+		return CPL_RET_BUF_DONE;
+	}
+
+	spin_lock_irqsave(&ep->com.lock, flags);
+	PDBG("%s ep %p state %u\n", __func__, ep, ep->com.state);
+	switch (ep->com.state) {
+	case CONNECTING:
+		break;
+	case MPA_REQ_WAIT:
+		stop_ep_timer(ep);
+		break;
+	case MPA_REQ_SENT:
+		stop_ep_timer(ep);
+		connect_reply_upcall(ep, -ECONNRESET);
+		break;
+	case MPA_REP_SENT:
+		ep->com.rpl_done = 1;
+		ep->com.rpl_err = -ECONNRESET;
+		PDBG("waking up ep %p\n", ep);
+		wake_up(&ep->com.waitq);
+		break;
+	case MPA_REQ_RCVD:
+
+		/*
+		 * We're gonna mark this puppy DEAD, but keep
+		 * the reference on it until the ULP accepts or
+		 * rejects the CR. Also wake up anyone waiting
+		 * in rdma connection migration (see iwch_accept_cr()).
+		 */
+		ep->com.rpl_done = 1;
+		ep->com.rpl_err = -ECONNRESET;
+		PDBG("waking up ep %p\n", ep);
+		wake_up(&ep->com.waitq);
+		break;
+	case MORIBUND:
+	case CLOSING:
+		stop_ep_timer(ep);
+		/*FALLTHROUGH*/
+	case FPDU_MODE:
+		if (ep->com.cm_id && ep->com.qp) {
+			attrs.next_state = IWCH_QP_STATE_ERROR;
+			ret = iwch_modify_qp(ep->com.qp->rhp,
+				     ep->com.qp, IWCH_QP_ATTR_NEXT_STATE,
+				     &attrs, 1);
+			if (ret)
+				printk(KERN_ERR MOD
+				       "%s - qp <- error failed!\n",
+				       __func__);
+		}
+		peer_abort_upcall(ep);
+		break;
+	case ABORTING:
+		break;
+	case DEAD:
+		PDBG("%s PEER_ABORT IN DEAD STATE!!!!\n", __func__);
+		spin_unlock_irqrestore(&ep->com.lock, flags);
+		return CPL_RET_BUF_DONE;
+	default:
+		BUG_ON(1);
+		break;
+	}
+	dst_confirm(ep->dst);
+	if (ep->com.state != ABORTING) {
+		__state_set(&ep->com, DEAD);
+		release = 1;
+	}
+	spin_unlock_irqrestore(&ep->com.lock, flags);
+
+	rpl_skb = get_skb(skb, sizeof(*rpl), GFP_KERNEL);
+	if (!rpl_skb) {
+		printk(KERN_ERR MOD "%s - cannot allocate skb!\n",
+		       __func__);
+		release = 1;
+		goto out;
+	}
+	rpl_skb->priority = CPL_PRIORITY_DATA;
+	rpl = (struct cpl_abort_rpl *) skb_put(rpl_skb, sizeof(*rpl));
+	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL));
+	rpl->wr.wr_lo = htonl(V_WR_TID(ep->hwtid));
+	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, ep->hwtid));
+	rpl->cmd = CPL_ABORT_NO_RST;
+	iwch_cxgb3_ofld_send(ep->com.tdev, rpl_skb);
+out:
+	if (release)
+		release_ep_resources(ep);
+	return CPL_RET_BUF_DONE;
+}
+
+static int close_con_rpl(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
+{
+	struct iwch_ep *ep = ctx;
+	struct iwch_qp_attributes attrs;
+	unsigned long flags;
+	int release = 0;
+
+	PDBG("%s ep %p\n", __func__, ep);
+	BUG_ON(!ep);
+
+	/* The cm_id may be null if we failed to connect */
+	spin_lock_irqsave(&ep->com.lock, flags);
+	switch (ep->com.state) {
+	case CLOSING:
+		__state_set(&ep->com, MORIBUND);
+		break;
+	case MORIBUND:
+		stop_ep_timer(ep);
+		if ((ep->com.cm_id) && (ep->com.qp)) {
+			attrs.next_state = IWCH_QP_STATE_IDLE;
+			iwch_modify_qp(ep->com.qp->rhp,
+					     ep->com.qp,
+					     IWCH_QP_ATTR_NEXT_STATE,
+					     &attrs, 1);
+		}
+		close_complete_upcall(ep);
+		__state_set(&ep->com, DEAD);
+		release = 1;
+		break;
+	case ABORTING:
+	case DEAD:
+		break;
+	default:
+		BUG_ON(1);
+		break;
+	}
+	spin_unlock_irqrestore(&ep->com.lock, flags);
+	if (release)
+		release_ep_resources(ep);
+	return CPL_RET_BUF_DONE;
+}
+
+/*
+ * T3A does 3 things when a TERM is received:
+ * 1) send up a CPL_RDMA_TERMINATE message with the TERM packet
+ * 2) generate an async event on the QP with the TERMINATE opcode
+ * 3) post a TERMINATE opcode cqe into the associated CQ.
+ *
+ * For (1), we save the message in the qp for later consumer consumption.
+ * For (2), we move the QP into TERMINATE, post a QP event and disconnect.
+ * For (3), we toss the CQE in cxio_poll_cq().
+ *
+ * terminate() handles case (1)...
+ */
+static int terminate(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
+{
+	struct iwch_ep *ep = ctx;
+
+	if (state_read(&ep->com) != FPDU_MODE)
+		return CPL_RET_BUF_DONE;
+
+	PDBG("%s ep %p\n", __func__, ep);
+	skb_pull(skb, sizeof(struct cpl_rdma_terminate));
+	PDBG("%s saving %d bytes of term msg\n", __func__, skb->len);
+	skb_copy_from_linear_data(skb, ep->com.qp->attr.terminate_buffer,
+				  skb->len);
+	ep->com.qp->attr.terminate_msg_len = skb->len;
+	ep->com.qp->attr.is_terminate_local = 0;
+	return CPL_RET_BUF_DONE;
+}
+
+static int ec_status(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
+{
+	struct cpl_rdma_ec_status *rep = cplhdr(skb);
+	struct iwch_ep *ep = ctx;
+
+	PDBG("%s ep %p tid %u status %d\n", __func__, ep, ep->hwtid,
+	     rep->status);
+	if (rep->status) {
+		struct iwch_qp_attributes attrs;
+
+		printk(KERN_ERR MOD "%s BAD CLOSE - Aborting tid %u\n",
+		       __func__, ep->hwtid);
+		stop_ep_timer(ep);
+		attrs.next_state = IWCH_QP_STATE_ERROR;
+		iwch_modify_qp(ep->com.qp->rhp,
+			       ep->com.qp, IWCH_QP_ATTR_NEXT_STATE,
+			       &attrs, 1);
+		abort_connection(ep, NULL, GFP_KERNEL);
+	}
+	return CPL_RET_BUF_DONE;
+}
+
+static void ep_timeout(unsigned long arg)
+{
+	struct iwch_ep *ep = (struct iwch_ep *)arg;
+	struct iwch_qp_attributes attrs;
+	unsigned long flags;
+	int abort = 1;
+
+	spin_lock_irqsave(&ep->com.lock, flags);
+	PDBG("%s ep %p tid %u state %d\n", __func__, ep, ep->hwtid,
+	     ep->com.state);
+	switch (ep->com.state) {
+	case MPA_REQ_SENT:
+		__state_set(&ep->com, ABORTING);
+		connect_reply_upcall(ep, -ETIMEDOUT);
+		break;
+	case MPA_REQ_WAIT:
+		__state_set(&ep->com, ABORTING);
+		break;
+	case CLOSING:
+	case MORIBUND:
+		if (ep->com.cm_id && ep->com.qp) {
+			attrs.next_state = IWCH_QP_STATE_ERROR;
+			iwch_modify_qp(ep->com.qp->rhp,
+				     ep->com.qp, IWCH_QP_ATTR_NEXT_STATE,
+				     &attrs, 1);
+		}
+		__state_set(&ep->com, ABORTING);
+		break;
+	default:
+		WARN(1, "%s unexpected state ep %p state %u\n",
+			__func__, ep, ep->com.state);
+		abort = 0;
+	}
+	spin_unlock_irqrestore(&ep->com.lock, flags);
+	if (abort)
+		abort_connection(ep, NULL, GFP_ATOMIC);
+	put_ep(&ep->com);
+}
+
+int iwch_reject_cr(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len)
+{
+	int err;
+	struct iwch_ep *ep = to_ep(cm_id);
+	PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
+
+	if (state_read(&ep->com) == DEAD) {
+		put_ep(&ep->com);
+		return -ECONNRESET;
+	}
+	BUG_ON(state_read(&ep->com) != MPA_REQ_RCVD);
+	if (mpa_rev == 0)
+		abort_connection(ep, NULL, GFP_KERNEL);
+	else {
+		err = send_mpa_reject(ep, pdata, pdata_len);
+		err = iwch_ep_disconnect(ep, 0, GFP_KERNEL);
+	}
+	put_ep(&ep->com);
+	return 0;
+}
+
+int iwch_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
+{
+	int err;
+	struct iwch_qp_attributes attrs;
+	enum iwch_qp_attr_mask mask;
+	struct iwch_ep *ep = to_ep(cm_id);
+	struct iwch_dev *h = to_iwch_dev(cm_id->device);
+	struct iwch_qp *qp = get_qhp(h, conn_param->qpn);
+
+	PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
+	if (state_read(&ep->com) == DEAD) {
+		err = -ECONNRESET;
+		goto err;
+	}
+
+	BUG_ON(state_read(&ep->com) != MPA_REQ_RCVD);
+	BUG_ON(!qp);
+
+	if ((conn_param->ord > qp->rhp->attr.max_rdma_read_qp_depth) ||
+	    (conn_param->ird > qp->rhp->attr.max_rdma_reads_per_qp)) {
+		abort_connection(ep, NULL, GFP_KERNEL);
+		err = -EINVAL;
+		goto err;
+	}
+
+	cm_id->add_ref(cm_id);
+	ep->com.cm_id = cm_id;
+	ep->com.qp = qp;
+
+	ep->ird = conn_param->ird;
+	ep->ord = conn_param->ord;
+
+	if (peer2peer && ep->ird == 0)
+		ep->ird = 1;
+
+	PDBG("%s %d ird %d ord %d\n", __func__, __LINE__, ep->ird, ep->ord);
+
+	/* bind QP to EP and move to RTS */
+	attrs.mpa_attr = ep->mpa_attr;
+	attrs.max_ird = ep->ird;
+	attrs.max_ord = ep->ord;
+	attrs.llp_stream_handle = ep;
+	attrs.next_state = IWCH_QP_STATE_RTS;
+
+	/* bind QP and TID with INIT_WR */
+	mask = IWCH_QP_ATTR_NEXT_STATE |
+			     IWCH_QP_ATTR_LLP_STREAM_HANDLE |
+			     IWCH_QP_ATTR_MPA_ATTR |
+			     IWCH_QP_ATTR_MAX_IRD |
+			     IWCH_QP_ATTR_MAX_ORD;
+
+	err = iwch_modify_qp(ep->com.qp->rhp,
+			     ep->com.qp, mask, &attrs, 1);
+	if (err)
+		goto err1;
+
+	/* if needed, wait for wr_ack */
+	if (iwch_rqes_posted(qp)) {
+		wait_event(ep->com.waitq, ep->com.rpl_done);
+		err = ep->com.rpl_err;
+		if (err)
+			goto err1;
+	}
+
+	err = send_mpa_reply(ep, conn_param->private_data,
+			     conn_param->private_data_len);
+	if (err)
+		goto err1;
+
+
+	state_set(&ep->com, FPDU_MODE);
+	established_upcall(ep);
+	put_ep(&ep->com);
+	return 0;
+err1:
+	ep->com.cm_id = NULL;
+	ep->com.qp = NULL;
+	cm_id->rem_ref(cm_id);
+err:
+	put_ep(&ep->com);
+	return err;
+}
+
+static int is_loopback_dst(struct iw_cm_id *cm_id)
+{
+	struct net_device *dev;
+	struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_id->remote_addr;
+
+	dev = ip_dev_find(&init_net, raddr->sin_addr.s_addr);
+	if (!dev)
+		return 0;
+	dev_put(dev);
+	return 1;
+}
+
+int iwch_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
+{
+	struct iwch_dev *h = to_iwch_dev(cm_id->device);
+	struct iwch_ep *ep;
+	struct rtable *rt;
+	int err = 0;
+	struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_id->local_addr;
+	struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_id->remote_addr;
+
+	if (cm_id->remote_addr.ss_family != PF_INET) {
+		err = -ENOSYS;
+		goto out;
+	}
+
+	if (is_loopback_dst(cm_id)) {
+		err = -ENOSYS;
+		goto out;
+	}
+
+	ep = alloc_ep(sizeof(*ep), GFP_KERNEL);
+	if (!ep) {
+		printk(KERN_ERR MOD "%s - cannot alloc ep.\n", __func__);
+		err = -ENOMEM;
+		goto out;
+	}
+	init_timer(&ep->timer);
+	ep->plen = conn_param->private_data_len;
+	if (ep->plen)
+		memcpy(ep->mpa_pkt + sizeof(struct mpa_message),
+		       conn_param->private_data, ep->plen);
+	ep->ird = conn_param->ird;
+	ep->ord = conn_param->ord;
+
+	if (peer2peer && ep->ord == 0)
+		ep->ord = 1;
+
+	ep->com.tdev = h->rdev.t3cdev_p;
+
+	cm_id->add_ref(cm_id);
+	ep->com.cm_id = cm_id;
+	ep->com.qp = get_qhp(h, conn_param->qpn);
+	BUG_ON(!ep->com.qp);
+	PDBG("%s qpn 0x%x qp %p cm_id %p\n", __func__, conn_param->qpn,
+	     ep->com.qp, cm_id);
+
+	/*
+	 * Allocate an active TID to initiate a TCP connection.
+	 */
+	ep->atid = cxgb3_alloc_atid(h->rdev.t3cdev_p, &t3c_client, ep);
+	if (ep->atid == -1) {
+		printk(KERN_ERR MOD "%s - cannot alloc atid.\n", __func__);
+		err = -ENOMEM;
+		goto fail2;
+	}
+
+	/* find a route */
+	rt = find_route(h->rdev.t3cdev_p, laddr->sin_addr.s_addr,
+			raddr->sin_addr.s_addr, laddr->sin_port,
+			raddr->sin_port, IPTOS_LOWDELAY);
+	if (!rt) {
+		printk(KERN_ERR MOD "%s - cannot find route.\n", __func__);
+		err = -EHOSTUNREACH;
+		goto fail3;
+	}
+	ep->dst = &rt->dst;
+	ep->l2t = t3_l2t_get(ep->com.tdev, ep->dst, NULL,
+			     &raddr->sin_addr.s_addr);
+	if (!ep->l2t) {
+		printk(KERN_ERR MOD "%s - cannot alloc l2e.\n", __func__);
+		err = -ENOMEM;
+		goto fail4;
+	}
+
+	state_set(&ep->com, CONNECTING);
+	ep->tos = IPTOS_LOWDELAY;
+	memcpy(&ep->com.local_addr, &cm_id->local_addr,
+	       sizeof(ep->com.local_addr));
+	memcpy(&ep->com.remote_addr, &cm_id->remote_addr,
+	       sizeof(ep->com.remote_addr));
+
+	/* send connect request to rnic */
+	err = send_connect(ep);
+	if (!err)
+		goto out;
+
+	l2t_release(h->rdev.t3cdev_p, ep->l2t);
+fail4:
+	dst_release(ep->dst);
+fail3:
+	cxgb3_free_atid(ep->com.tdev, ep->atid);
+fail2:
+	cm_id->rem_ref(cm_id);
+	put_ep(&ep->com);
+out:
+	return err;
+}
+
+int iwch_create_listen(struct iw_cm_id *cm_id, int backlog)
+{
+	int err = 0;
+	struct iwch_dev *h = to_iwch_dev(cm_id->device);
+	struct iwch_listen_ep *ep;
+
+
+	might_sleep();
+
+	if (cm_id->local_addr.ss_family != PF_INET) {
+		err = -ENOSYS;
+		goto fail1;
+	}
+
+	ep = alloc_ep(sizeof(*ep), GFP_KERNEL);
+	if (!ep) {
+		printk(KERN_ERR MOD "%s - cannot alloc ep.\n", __func__);
+		err = -ENOMEM;
+		goto fail1;
+	}
+	PDBG("%s ep %p\n", __func__, ep);
+	ep->com.tdev = h->rdev.t3cdev_p;
+	cm_id->add_ref(cm_id);
+	ep->com.cm_id = cm_id;
+	ep->backlog = backlog;
+	memcpy(&ep->com.local_addr, &cm_id->local_addr,
+	       sizeof(ep->com.local_addr));
+
+	/*
+	 * Allocate a server TID.
+	 */
+	ep->stid = cxgb3_alloc_stid(h->rdev.t3cdev_p, &t3c_client, ep);
+	if (ep->stid == -1) {
+		printk(KERN_ERR MOD "%s - cannot alloc atid.\n", __func__);
+		err = -ENOMEM;
+		goto fail2;
+	}
+
+	state_set(&ep->com, LISTEN);
+	err = listen_start(ep);
+	if (err)
+		goto fail3;
+
+	/* wait for pass_open_rpl */
+	wait_event(ep->com.waitq, ep->com.rpl_done);
+	err = ep->com.rpl_err;
+	if (!err) {
+		cm_id->provider_data = ep;
+		goto out;
+	}
+fail3:
+	cxgb3_free_stid(ep->com.tdev, ep->stid);
+fail2:
+	cm_id->rem_ref(cm_id);
+	put_ep(&ep->com);
+fail1:
+out:
+	return err;
+}
+
+int iwch_destroy_listen(struct iw_cm_id *cm_id)
+{
+	int err;
+	struct iwch_listen_ep *ep = to_listen_ep(cm_id);
+
+	PDBG("%s ep %p\n", __func__, ep);
+
+	might_sleep();
+	state_set(&ep->com, DEAD);
+	ep->com.rpl_done = 0;
+	ep->com.rpl_err = 0;
+	err = listen_stop(ep);
+	if (err)
+		goto done;
+	wait_event(ep->com.waitq, ep->com.rpl_done);
+	cxgb3_free_stid(ep->com.tdev, ep->stid);
+done:
+	err = ep->com.rpl_err;
+	cm_id->rem_ref(cm_id);
+	put_ep(&ep->com);
+	return err;
+}
+
+int iwch_ep_disconnect(struct iwch_ep *ep, int abrupt, gfp_t gfp)
+{
+	int ret=0;
+	unsigned long flags;
+	int close = 0;
+	int fatal = 0;
+	struct t3cdev *tdev;
+	struct cxio_rdev *rdev;
+
+	spin_lock_irqsave(&ep->com.lock, flags);
+
+	PDBG("%s ep %p state %s, abrupt %d\n", __func__, ep,
+	     states[ep->com.state], abrupt);
+
+	tdev = (struct t3cdev *)ep->com.tdev;
+	rdev = (struct cxio_rdev *)tdev->ulp;
+	if (cxio_fatal_error(rdev)) {
+		fatal = 1;
+		close_complete_upcall(ep);
+		ep->com.state = DEAD;
+	}
+	switch (ep->com.state) {
+	case MPA_REQ_WAIT:
+	case MPA_REQ_SENT:
+	case MPA_REQ_RCVD:
+	case MPA_REP_SENT:
+	case FPDU_MODE:
+		close = 1;
+		if (abrupt)
+			ep->com.state = ABORTING;
+		else {
+			ep->com.state = CLOSING;
+			start_ep_timer(ep);
+		}
+		set_bit(CLOSE_SENT, &ep->com.flags);
+		break;
+	case CLOSING:
+		if (!test_and_set_bit(CLOSE_SENT, &ep->com.flags)) {
+			close = 1;
+			if (abrupt) {
+				stop_ep_timer(ep);
+				ep->com.state = ABORTING;
+			} else
+				ep->com.state = MORIBUND;
+		}
+		break;
+	case MORIBUND:
+	case ABORTING:
+	case DEAD:
+		PDBG("%s ignoring disconnect ep %p state %u\n",
+		     __func__, ep, ep->com.state);
+		break;
+	default:
+		BUG();
+		break;
+	}
+
+	spin_unlock_irqrestore(&ep->com.lock, flags);
+	if (close) {
+		if (abrupt)
+			ret = send_abort(ep, NULL, gfp);
+		else
+			ret = send_halfclose(ep, gfp);
+		if (ret)
+			fatal = 1;
+	}
+	if (fatal)
+		release_ep_resources(ep);
+	return ret;
+}
+
+int iwch_ep_redirect(void *ctx, struct dst_entry *old, struct dst_entry *new,
+		     struct l2t_entry *l2t)
+{
+	struct iwch_ep *ep = ctx;
+
+	if (ep->dst != old)
+		return 0;
+
+	PDBG("%s ep %p redirect to dst %p l2t %p\n", __func__, ep, new,
+	     l2t);
+	dst_hold(new);
+	l2t_release(ep->com.tdev, ep->l2t);
+	ep->l2t = l2t;
+	dst_release(old);
+	ep->dst = new;
+	return 1;
+}
+
+/*
+ * All the CM events are handled on a work queue to have a safe context.
+ * These are the real handlers that are called from the work queue.
+ */
+static const cxgb3_cpl_handler_func work_handlers[NUM_CPL_CMDS] = {
+	[CPL_ACT_ESTABLISH]	= act_establish,
+	[CPL_ACT_OPEN_RPL]	= act_open_rpl,
+	[CPL_RX_DATA]		= rx_data,
+	[CPL_TX_DMA_ACK]	= tx_ack,
+	[CPL_ABORT_RPL_RSS]	= abort_rpl,
+	[CPL_ABORT_RPL]		= abort_rpl,
+	[CPL_PASS_OPEN_RPL]	= pass_open_rpl,
+	[CPL_CLOSE_LISTSRV_RPL]	= close_listsrv_rpl,
+	[CPL_PASS_ACCEPT_REQ]	= pass_accept_req,
+	[CPL_PASS_ESTABLISH]	= pass_establish,
+	[CPL_PEER_CLOSE]	= peer_close,
+	[CPL_ABORT_REQ_RSS]	= peer_abort,
+	[CPL_CLOSE_CON_RPL]	= close_con_rpl,
+	[CPL_RDMA_TERMINATE]	= terminate,
+	[CPL_RDMA_EC_STATUS]	= ec_status,
+};
+
+static void process_work(struct work_struct *work)
+{
+	struct sk_buff *skb = NULL;
+	void *ep;
+	struct t3cdev *tdev;
+	int ret;
+
+	while ((skb = skb_dequeue(&rxq))) {
+		ep = *((void **) (skb->cb));
+		tdev = *((struct t3cdev **) (skb->cb + sizeof(void *)));
+		ret = work_handlers[G_OPCODE(ntohl((__force __be32)skb->csum))](tdev, skb, ep);
+		if (ret & CPL_RET_BUF_DONE)
+			kfree_skb(skb);
+
+		/*
+		 * ep was referenced in sched(), and is freed here.
+		 */
+		put_ep((struct iwch_ep_common *)ep);
+	}
+}
+
+static DECLARE_WORK(skb_work, process_work);
+
+static int sched(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
+{
+	struct iwch_ep_common *epc = ctx;
+
+	get_ep(epc);
+
+	/*
+	 * Save ctx and tdev in the skb->cb area.
+	 */
+	*((void **) skb->cb) = ctx;
+	*((struct t3cdev **) (skb->cb + sizeof(void *))) = tdev;
+
+	/*
+	 * Queue the skb and schedule the worker thread.
+	 */
+	skb_queue_tail(&rxq, skb);
+	queue_work(workq, &skb_work);
+	return 0;
+}
+
+static int set_tcb_rpl(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
+{
+	struct cpl_set_tcb_rpl *rpl = cplhdr(skb);
+
+	if (rpl->status != CPL_ERR_NONE) {
+		printk(KERN_ERR MOD "Unexpected SET_TCB_RPL status %u "
+		       "for tid %u\n", rpl->status, GET_TID(rpl));
+	}
+	return CPL_RET_BUF_DONE;
+}
+
+/*
+ * All upcalls from the T3 Core go to sched() to schedule the
+ * processing on a work queue.
+ */
+cxgb3_cpl_handler_func t3c_handlers[NUM_CPL_CMDS] = {
+	[CPL_ACT_ESTABLISH]	= sched,
+	[CPL_ACT_OPEN_RPL]	= sched,
+	[CPL_RX_DATA]		= sched,
+	[CPL_TX_DMA_ACK]	= sched,
+	[CPL_ABORT_RPL_RSS]	= sched,
+	[CPL_ABORT_RPL]		= sched,
+	[CPL_PASS_OPEN_RPL]	= sched,
+	[CPL_CLOSE_LISTSRV_RPL]	= sched,
+	[CPL_PASS_ACCEPT_REQ]	= sched,
+	[CPL_PASS_ESTABLISH]	= sched,
+	[CPL_PEER_CLOSE]	= sched,
+	[CPL_CLOSE_CON_RPL]	= sched,
+	[CPL_ABORT_REQ_RSS]	= sched,
+	[CPL_RDMA_TERMINATE]	= sched,
+	[CPL_RDMA_EC_STATUS]	= sched,
+	[CPL_SET_TCB_RPL]	= set_tcb_rpl,
+};
+
+int __init iwch_cm_init(void)
+{
+	skb_queue_head_init(&rxq);
+
+	workq = create_singlethread_workqueue("iw_cxgb3");
+	if (!workq)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void __exit iwch_cm_term(void)
+{
+	flush_workqueue(workq);
+	destroy_workqueue(workq);
+}
diff --git a/drivers/infiniband/hw/cxgb3/iwch_cm.h b/drivers/infiniband/hw/cxgb3/iwch_cm.h
new file mode 100644
index 0000000..b9efadf
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb3/iwch_cm.h
@@ -0,0 +1,233 @@
+/*
+ * Copyright (c) 2006 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef _IWCH_CM_H_
+#define _IWCH_CM_H_
+
+#include <linux/inet.h>
+#include <linux/wait.h>
+#include <linux/spinlock.h>
+#include <linux/kref.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/iw_cm.h>
+
+#include "cxgb3_offload.h"
+#include "iwch_provider.h"
+
+#define MPA_KEY_REQ "MPA ID Req Frame"
+#define MPA_KEY_REP "MPA ID Rep Frame"
+
+#define MPA_MAX_PRIVATE_DATA	256
+#define MPA_REV		0	/* XXX - amso1100 uses rev 0 ! */
+#define MPA_REJECT		0x20
+#define MPA_CRC			0x40
+#define MPA_MARKERS		0x80
+#define MPA_FLAGS_MASK		0xE0
+
+#define put_ep(ep) { \
+	PDBG("put_ep (via %s:%u) ep %p refcnt %d\n", __func__, __LINE__,  \
+	     ep, atomic_read(&((ep)->kref.refcount))); \
+	WARN_ON(atomic_read(&((ep)->kref.refcount)) < 1); \
+	kref_put(&((ep)->kref), __free_ep); \
+}
+
+#define get_ep(ep) { \
+	PDBG("get_ep (via %s:%u) ep %p, refcnt %d\n", __func__, __LINE__, \
+	     ep, atomic_read(&((ep)->kref.refcount))); \
+	kref_get(&((ep)->kref));  \
+}
+
+struct mpa_message {
+	u8 key[16];
+	u8 flags;
+	u8 revision;
+	__be16 private_data_size;
+	u8 private_data[0];
+};
+
+struct terminate_message {
+	u8 layer_etype;
+	u8 ecode;
+	__be16 hdrct_rsvd;
+	u8 len_hdrs[0];
+};
+
+#define TERM_MAX_LENGTH (sizeof(struct terminate_message) + 2 + 18 + 28)
+
+enum iwch_layers_types {
+	LAYER_RDMAP		= 0x00,
+	LAYER_DDP		= 0x10,
+	LAYER_MPA		= 0x20,
+	RDMAP_LOCAL_CATA	= 0x00,
+	RDMAP_REMOTE_PROT	= 0x01,
+	RDMAP_REMOTE_OP		= 0x02,
+	DDP_LOCAL_CATA		= 0x00,
+	DDP_TAGGED_ERR		= 0x01,
+	DDP_UNTAGGED_ERR	= 0x02,
+	DDP_LLP			= 0x03
+};
+
+enum iwch_rdma_ecodes {
+	RDMAP_INV_STAG		= 0x00,
+	RDMAP_BASE_BOUNDS	= 0x01,
+	RDMAP_ACC_VIOL		= 0x02,
+	RDMAP_STAG_NOT_ASSOC	= 0x03,
+	RDMAP_TO_WRAP		= 0x04,
+	RDMAP_INV_VERS		= 0x05,
+	RDMAP_INV_OPCODE	= 0x06,
+	RDMAP_STREAM_CATA	= 0x07,
+	RDMAP_GLOBAL_CATA	= 0x08,
+	RDMAP_CANT_INV_STAG	= 0x09,
+	RDMAP_UNSPECIFIED	= 0xff
+};
+
+enum iwch_ddp_ecodes {
+	DDPT_INV_STAG		= 0x00,
+	DDPT_BASE_BOUNDS	= 0x01,
+	DDPT_STAG_NOT_ASSOC	= 0x02,
+	DDPT_TO_WRAP		= 0x03,
+	DDPT_INV_VERS		= 0x04,
+	DDPU_INV_QN		= 0x01,
+	DDPU_INV_MSN_NOBUF	= 0x02,
+	DDPU_INV_MSN_RANGE	= 0x03,
+	DDPU_INV_MO		= 0x04,
+	DDPU_MSG_TOOBIG		= 0x05,
+	DDPU_INV_VERS		= 0x06
+};
+
+enum iwch_mpa_ecodes {
+	MPA_CRC_ERR		= 0x02,
+	MPA_MARKER_ERR		= 0x03
+};
+
+enum iwch_ep_state {
+	IDLE = 0,
+	LISTEN,
+	CONNECTING,
+	MPA_REQ_WAIT,
+	MPA_REQ_SENT,
+	MPA_REQ_RCVD,
+	MPA_REP_SENT,
+	FPDU_MODE,
+	ABORTING,
+	CLOSING,
+	MORIBUND,
+	DEAD,
+};
+
+enum iwch_ep_flags {
+	PEER_ABORT_IN_PROGRESS	= 0,
+	ABORT_REQ_IN_PROGRESS	= 1,
+	RELEASE_RESOURCES	= 2,
+	CLOSE_SENT		= 3,
+};
+
+struct iwch_ep_common {
+	struct iw_cm_id *cm_id;
+	struct iwch_qp *qp;
+	struct t3cdev *tdev;
+	enum iwch_ep_state state;
+	struct kref kref;
+	spinlock_t lock;
+	struct sockaddr_in local_addr;
+	struct sockaddr_in remote_addr;
+	wait_queue_head_t waitq;
+	int rpl_done;
+	int rpl_err;
+	unsigned long flags;
+};
+
+struct iwch_listen_ep {
+	struct iwch_ep_common com;
+	unsigned int stid;
+	int backlog;
+};
+
+struct iwch_ep {
+	struct iwch_ep_common com;
+	struct iwch_ep *parent_ep;
+	struct timer_list timer;
+	unsigned int atid;
+	u32 hwtid;
+	u32 snd_seq;
+	u32 rcv_seq;
+	struct l2t_entry *l2t;
+	struct dst_entry *dst;
+	struct sk_buff *mpa_skb;
+	struct iwch_mpa_attributes mpa_attr;
+	unsigned int mpa_pkt_len;
+	u8 mpa_pkt[sizeof(struct mpa_message) + MPA_MAX_PRIVATE_DATA];
+	u8 tos;
+	u16 emss;
+	u16 plen;
+	u32 ird;
+	u32 ord;
+};
+
+static inline struct iwch_ep *to_ep(struct iw_cm_id *cm_id)
+{
+	return cm_id->provider_data;
+}
+
+static inline struct iwch_listen_ep *to_listen_ep(struct iw_cm_id *cm_id)
+{
+	return cm_id->provider_data;
+}
+
+static inline int compute_wscale(int win)
+{
+	int wscale = 0;
+
+	while (wscale < 14 && (65535<<wscale) < win)
+		wscale++;
+	return wscale;
+}
+
+/* CM prototypes */
+
+int iwch_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param);
+int iwch_create_listen(struct iw_cm_id *cm_id, int backlog);
+int iwch_destroy_listen(struct iw_cm_id *cm_id);
+int iwch_reject_cr(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len);
+int iwch_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param);
+int iwch_ep_disconnect(struct iwch_ep *ep, int abrupt, gfp_t gfp);
+int iwch_quiesce_tid(struct iwch_ep *ep);
+int iwch_resume_tid(struct iwch_ep *ep);
+void __free_ep(struct kref *kref);
+void iwch_rearp(struct iwch_ep *ep);
+int iwch_ep_redirect(void *ctx, struct dst_entry *old, struct dst_entry *new, struct l2t_entry *l2t);
+
+int __init iwch_cm_init(void);
+void __exit iwch_cm_term(void);
+extern int peer2peer;
+
+#endif				/* _IWCH_CM_H_ */
diff --git a/drivers/infiniband/hw/cxgb3/iwch_cq.c b/drivers/infiniband/hw/cxgb3/iwch_cq.c
new file mode 100644
index 0000000..cf5474a
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb3/iwch_cq.c
@@ -0,0 +1,233 @@
+/*
+ * Copyright (c) 2006 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "iwch_provider.h"
+#include "iwch.h"
+
+/*
+ * Get one cq entry from cxio and map it to openib.
+ *
+ * Returns:
+ *	0			EMPTY;
+ *	1			cqe returned
+ *	-EAGAIN		caller must try again
+ *	any other -errno	fatal error
+ */
+static int iwch_poll_cq_one(struct iwch_dev *rhp, struct iwch_cq *chp,
+			    struct ib_wc *wc)
+{
+	struct iwch_qp *qhp = NULL;
+	struct t3_cqe cqe, *rd_cqe;
+	struct t3_wq *wq;
+	u32 credit = 0;
+	u8 cqe_flushed;
+	u64 cookie;
+	int ret = 1;
+
+	rd_cqe = cxio_next_cqe(&chp->cq);
+
+	if (!rd_cqe)
+		return 0;
+
+	qhp = get_qhp(rhp, CQE_QPID(*rd_cqe));
+	if (!qhp)
+		wq = NULL;
+	else {
+		spin_lock(&qhp->lock);
+		wq = &(qhp->wq);
+	}
+	ret = cxio_poll_cq(wq, &(chp->cq), &cqe, &cqe_flushed, &cookie,
+				   &credit);
+	if (t3a_device(chp->rhp) && credit) {
+		PDBG("%s updating %d cq credits on id %d\n", __func__,
+		     credit, chp->cq.cqid);
+		cxio_hal_cq_op(&rhp->rdev, &chp->cq, CQ_CREDIT_UPDATE, credit);
+	}
+
+	if (ret) {
+		ret = -EAGAIN;
+		goto out;
+	}
+	ret = 1;
+
+	wc->wr_id = cookie;
+	wc->qp = &qhp->ibqp;
+	wc->vendor_err = CQE_STATUS(cqe);
+	wc->wc_flags = 0;
+
+	PDBG("%s qpid 0x%x type %d opcode %d status 0x%x wrid hi 0x%x "
+	     "lo 0x%x cookie 0x%llx\n", __func__,
+	     CQE_QPID(cqe), CQE_TYPE(cqe),
+	     CQE_OPCODE(cqe), CQE_STATUS(cqe), CQE_WRID_HI(cqe),
+	     CQE_WRID_LOW(cqe), (unsigned long long) cookie);
+
+	if (CQE_TYPE(cqe) == 0) {
+		if (!CQE_STATUS(cqe))
+			wc->byte_len = CQE_LEN(cqe);
+		else
+			wc->byte_len = 0;
+		wc->opcode = IB_WC_RECV;
+		if (CQE_OPCODE(cqe) == T3_SEND_WITH_INV ||
+		    CQE_OPCODE(cqe) == T3_SEND_WITH_SE_INV) {
+			wc->ex.invalidate_rkey = CQE_WRID_STAG(cqe);
+			wc->wc_flags |= IB_WC_WITH_INVALIDATE;
+		}
+	} else {
+		switch (CQE_OPCODE(cqe)) {
+		case T3_RDMA_WRITE:
+			wc->opcode = IB_WC_RDMA_WRITE;
+			break;
+		case T3_READ_REQ:
+			wc->opcode = IB_WC_RDMA_READ;
+			wc->byte_len = CQE_LEN(cqe);
+			break;
+		case T3_SEND:
+		case T3_SEND_WITH_SE:
+		case T3_SEND_WITH_INV:
+		case T3_SEND_WITH_SE_INV:
+			wc->opcode = IB_WC_SEND;
+			break;
+		case T3_BIND_MW:
+			wc->opcode = IB_WC_BIND_MW;
+			break;
+
+		case T3_LOCAL_INV:
+			wc->opcode = IB_WC_LOCAL_INV;
+			break;
+		case T3_FAST_REGISTER:
+			wc->opcode = IB_WC_FAST_REG_MR;
+			break;
+		default:
+			printk(KERN_ERR MOD "Unexpected opcode %d "
+			       "in the CQE received for QPID=0x%0x\n",
+			       CQE_OPCODE(cqe), CQE_QPID(cqe));
+			ret = -EINVAL;
+			goto out;
+		}
+	}
+
+	if (cqe_flushed)
+		wc->status = IB_WC_WR_FLUSH_ERR;
+	else {
+
+		switch (CQE_STATUS(cqe)) {
+		case TPT_ERR_SUCCESS:
+			wc->status = IB_WC_SUCCESS;
+			break;
+		case TPT_ERR_STAG:
+			wc->status = IB_WC_LOC_ACCESS_ERR;
+			break;
+		case TPT_ERR_PDID:
+			wc->status = IB_WC_LOC_PROT_ERR;
+			break;
+		case TPT_ERR_QPID:
+		case TPT_ERR_ACCESS:
+			wc->status = IB_WC_LOC_ACCESS_ERR;
+			break;
+		case TPT_ERR_WRAP:
+			wc->status = IB_WC_GENERAL_ERR;
+			break;
+		case TPT_ERR_BOUND:
+			wc->status = IB_WC_LOC_LEN_ERR;
+			break;
+		case TPT_ERR_INVALIDATE_SHARED_MR:
+		case TPT_ERR_INVALIDATE_MR_WITH_MW_BOUND:
+			wc->status = IB_WC_MW_BIND_ERR;
+			break;
+		case TPT_ERR_CRC:
+		case TPT_ERR_MARKER:
+		case TPT_ERR_PDU_LEN_ERR:
+		case TPT_ERR_OUT_OF_RQE:
+		case TPT_ERR_DDP_VERSION:
+		case TPT_ERR_RDMA_VERSION:
+		case TPT_ERR_DDP_QUEUE_NUM:
+		case TPT_ERR_MSN:
+		case TPT_ERR_TBIT:
+		case TPT_ERR_MO:
+		case TPT_ERR_MSN_RANGE:
+		case TPT_ERR_IRD_OVERFLOW:
+		case TPT_ERR_OPCODE:
+			wc->status = IB_WC_FATAL_ERR;
+			break;
+		case TPT_ERR_SWFLUSH:
+			wc->status = IB_WC_WR_FLUSH_ERR;
+			break;
+		default:
+			printk(KERN_ERR MOD "Unexpected cqe_status 0x%x for "
+			       "QPID=0x%0x\n", CQE_STATUS(cqe), CQE_QPID(cqe));
+			ret = -EINVAL;
+		}
+	}
+out:
+	if (wq)
+		spin_unlock(&qhp->lock);
+	return ret;
+}
+
+int iwch_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc)
+{
+	struct iwch_dev *rhp;
+	struct iwch_cq *chp;
+	unsigned long flags;
+	int npolled;
+	int err = 0;
+
+	chp = to_iwch_cq(ibcq);
+	rhp = chp->rhp;
+
+	spin_lock_irqsave(&chp->lock, flags);
+	for (npolled = 0; npolled < num_entries; ++npolled) {
+#ifdef DEBUG
+		int i=0;
+#endif
+
+		/*
+		 * Because T3 can post CQEs that are _not_ associated
+		 * with a WR, we might have to poll again after removing
+		 * one of these.
+		 */
+		do {
+			err = iwch_poll_cq_one(rhp, chp, wc + npolled);
+#ifdef DEBUG
+			BUG_ON(++i > 1000);
+#endif
+		} while (err == -EAGAIN);
+		if (err <= 0)
+			break;
+	}
+	spin_unlock_irqrestore(&chp->lock, flags);
+
+	if (err < 0)
+		return err;
+	else {
+		return npolled;
+	}
+}
diff --git a/drivers/infiniband/hw/cxgb3/iwch_ev.c b/drivers/infiniband/hw/cxgb3/iwch_ev.c
new file mode 100644
index 0000000..abcc9e7
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb3/iwch_ev.c
@@ -0,0 +1,232 @@
+/*
+ * Copyright (c) 2006 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/gfp.h>
+#include <linux/mman.h>
+#include <net/sock.h>
+#include "iwch_provider.h"
+#include "iwch.h"
+#include "iwch_cm.h"
+#include "cxio_hal.h"
+#include "cxio_wr.h"
+
+static void post_qp_event(struct iwch_dev *rnicp, struct iwch_cq *chp,
+			  struct respQ_msg_t *rsp_msg,
+			  enum ib_event_type ib_event,
+			  int send_term)
+{
+	struct ib_event event;
+	struct iwch_qp_attributes attrs;
+	struct iwch_qp *qhp;
+	unsigned long flag;
+
+	spin_lock(&rnicp->lock);
+	qhp = get_qhp(rnicp, CQE_QPID(rsp_msg->cqe));
+
+	if (!qhp) {
+		printk(KERN_ERR "%s unaffiliated error 0x%x qpid 0x%x\n",
+		       __func__, CQE_STATUS(rsp_msg->cqe),
+		       CQE_QPID(rsp_msg->cqe));
+		spin_unlock(&rnicp->lock);
+		return;
+	}
+
+	if ((qhp->attr.state == IWCH_QP_STATE_ERROR) ||
+	    (qhp->attr.state == IWCH_QP_STATE_TERMINATE)) {
+		PDBG("%s AE received after RTS - "
+		     "qp state %d qpid 0x%x status 0x%x\n", __func__,
+		     qhp->attr.state, qhp->wq.qpid, CQE_STATUS(rsp_msg->cqe));
+		spin_unlock(&rnicp->lock);
+		return;
+	}
+
+	printk(KERN_ERR "%s - AE qpid 0x%x opcode %d status 0x%x "
+	       "type %d wrid.hi 0x%x wrid.lo 0x%x \n", __func__,
+	       CQE_QPID(rsp_msg->cqe), CQE_OPCODE(rsp_msg->cqe),
+	       CQE_STATUS(rsp_msg->cqe), CQE_TYPE(rsp_msg->cqe),
+	       CQE_WRID_HI(rsp_msg->cqe), CQE_WRID_LOW(rsp_msg->cqe));
+
+	atomic_inc(&qhp->refcnt);
+	spin_unlock(&rnicp->lock);
+
+	if (qhp->attr.state == IWCH_QP_STATE_RTS) {
+		attrs.next_state = IWCH_QP_STATE_TERMINATE;
+		iwch_modify_qp(qhp->rhp, qhp, IWCH_QP_ATTR_NEXT_STATE,
+			       &attrs, 1);
+		if (send_term)
+			iwch_post_terminate(qhp, rsp_msg);
+	}
+
+	event.event = ib_event;
+	event.device = chp->ibcq.device;
+	if (ib_event == IB_EVENT_CQ_ERR)
+		event.element.cq = &chp->ibcq;
+	else
+		event.element.qp = &qhp->ibqp;
+
+	if (qhp->ibqp.event_handler)
+		(*qhp->ibqp.event_handler)(&event, qhp->ibqp.qp_context);
+
+	spin_lock_irqsave(&chp->comp_handler_lock, flag);
+	(*chp->ibcq.comp_handler)(&chp->ibcq, chp->ibcq.cq_context);
+	spin_unlock_irqrestore(&chp->comp_handler_lock, flag);
+
+	if (atomic_dec_and_test(&qhp->refcnt))
+		wake_up(&qhp->wait);
+}
+
+void iwch_ev_dispatch(struct cxio_rdev *rdev_p, struct sk_buff *skb)
+{
+	struct iwch_dev *rnicp;
+	struct respQ_msg_t *rsp_msg = (struct respQ_msg_t *) skb->data;
+	struct iwch_cq *chp;
+	struct iwch_qp *qhp;
+	u32 cqid = RSPQ_CQID(rsp_msg);
+	unsigned long flag;
+
+	rnicp = (struct iwch_dev *) rdev_p->ulp;
+	spin_lock(&rnicp->lock);
+	chp = get_chp(rnicp, cqid);
+	qhp = get_qhp(rnicp, CQE_QPID(rsp_msg->cqe));
+	if (!chp || !qhp) {
+		printk(KERN_ERR MOD "BAD AE cqid 0x%x qpid 0x%x opcode %d "
+		       "status 0x%x type %d wrid.hi 0x%x wrid.lo 0x%x \n",
+		       cqid, CQE_QPID(rsp_msg->cqe),
+		       CQE_OPCODE(rsp_msg->cqe), CQE_STATUS(rsp_msg->cqe),
+		       CQE_TYPE(rsp_msg->cqe), CQE_WRID_HI(rsp_msg->cqe),
+		       CQE_WRID_LOW(rsp_msg->cqe));
+		spin_unlock(&rnicp->lock);
+		goto out;
+	}
+	iwch_qp_add_ref(&qhp->ibqp);
+	atomic_inc(&chp->refcnt);
+	spin_unlock(&rnicp->lock);
+
+	/*
+	 * 1) completion of our sending a TERMINATE.
+	 * 2) incoming TERMINATE message.
+	 */
+	if ((CQE_OPCODE(rsp_msg->cqe) == T3_TERMINATE) &&
+	    (CQE_STATUS(rsp_msg->cqe) == 0)) {
+		if (SQ_TYPE(rsp_msg->cqe)) {
+			PDBG("%s QPID 0x%x ep %p disconnecting\n",
+			     __func__, qhp->wq.qpid, qhp->ep);
+			iwch_ep_disconnect(qhp->ep, 0, GFP_ATOMIC);
+		} else {
+			PDBG("%s post REQ_ERR AE QPID 0x%x\n", __func__,
+			     qhp->wq.qpid);
+			post_qp_event(rnicp, chp, rsp_msg,
+				      IB_EVENT_QP_REQ_ERR, 0);
+			iwch_ep_disconnect(qhp->ep, 0, GFP_ATOMIC);
+		}
+		goto done;
+	}
+
+	/* Bad incoming Read request */
+	if (SQ_TYPE(rsp_msg->cqe) &&
+	    (CQE_OPCODE(rsp_msg->cqe) == T3_READ_RESP)) {
+		post_qp_event(rnicp, chp, rsp_msg, IB_EVENT_QP_REQ_ERR, 1);
+		goto done;
+	}
+
+	/* Bad incoming write */
+	if (RQ_TYPE(rsp_msg->cqe) &&
+	    (CQE_OPCODE(rsp_msg->cqe) == T3_RDMA_WRITE)) {
+		post_qp_event(rnicp, chp, rsp_msg, IB_EVENT_QP_REQ_ERR, 1);
+		goto done;
+	}
+
+	switch (CQE_STATUS(rsp_msg->cqe)) {
+
+	/* Completion Events */
+	case TPT_ERR_SUCCESS:
+
+		/*
+		 * Confirm the destination entry if this is a RECV completion.
+		 */
+		if (qhp->ep && SQ_TYPE(rsp_msg->cqe))
+			dst_confirm(qhp->ep->dst);
+		spin_lock_irqsave(&chp->comp_handler_lock, flag);
+		(*chp->ibcq.comp_handler)(&chp->ibcq, chp->ibcq.cq_context);
+		spin_unlock_irqrestore(&chp->comp_handler_lock, flag);
+		break;
+
+	case TPT_ERR_STAG:
+	case TPT_ERR_PDID:
+	case TPT_ERR_QPID:
+	case TPT_ERR_ACCESS:
+	case TPT_ERR_WRAP:
+	case TPT_ERR_BOUND:
+	case TPT_ERR_INVALIDATE_SHARED_MR:
+	case TPT_ERR_INVALIDATE_MR_WITH_MW_BOUND:
+		post_qp_event(rnicp, chp, rsp_msg, IB_EVENT_QP_ACCESS_ERR, 1);
+		break;
+
+	/* Device Fatal Errors */
+	case TPT_ERR_ECC:
+	case TPT_ERR_ECC_PSTAG:
+	case TPT_ERR_INTERNAL_ERR:
+		post_qp_event(rnicp, chp, rsp_msg, IB_EVENT_DEVICE_FATAL, 1);
+		break;
+
+	/* QP Fatal Errors */
+	case TPT_ERR_OUT_OF_RQE:
+	case TPT_ERR_PBL_ADDR_BOUND:
+	case TPT_ERR_CRC:
+	case TPT_ERR_MARKER:
+	case TPT_ERR_PDU_LEN_ERR:
+	case TPT_ERR_DDP_VERSION:
+	case TPT_ERR_RDMA_VERSION:
+	case TPT_ERR_OPCODE:
+	case TPT_ERR_DDP_QUEUE_NUM:
+	case TPT_ERR_MSN:
+	case TPT_ERR_TBIT:
+	case TPT_ERR_MO:
+	case TPT_ERR_MSN_GAP:
+	case TPT_ERR_MSN_RANGE:
+	case TPT_ERR_RQE_ADDR_BOUND:
+	case TPT_ERR_IRD_OVERFLOW:
+		post_qp_event(rnicp, chp, rsp_msg, IB_EVENT_QP_FATAL, 1);
+		break;
+
+	default:
+		printk(KERN_ERR MOD "Unknown T3 status 0x%x QPID 0x%x\n",
+		       CQE_STATUS(rsp_msg->cqe), qhp->wq.qpid);
+		post_qp_event(rnicp, chp, rsp_msg, IB_EVENT_QP_FATAL, 1);
+		break;
+	}
+done:
+	if (atomic_dec_and_test(&chp->refcnt))
+	        wake_up(&chp->wait);
+	iwch_qp_rem_ref(&qhp->ibqp);
+out:
+	dev_kfree_skb_irq(skb);
+}
diff --git a/drivers/infiniband/hw/cxgb3/iwch_mem.c b/drivers/infiniband/hw/cxgb3/iwch_mem.c
new file mode 100644
index 0000000..5c36ee2
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb3/iwch_mem.c
@@ -0,0 +1,203 @@
+/*
+ * Copyright (c) 2006 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/slab.h>
+#include <asm/byteorder.h>
+
+#include <rdma/iw_cm.h>
+#include <rdma/ib_verbs.h>
+
+#include "cxio_hal.h"
+#include "cxio_resource.h"
+#include "iwch.h"
+#include "iwch_provider.h"
+
+static int iwch_finish_mem_reg(struct iwch_mr *mhp, u32 stag)
+{
+	u32 mmid;
+
+	mhp->attr.state = 1;
+	mhp->attr.stag = stag;
+	mmid = stag >> 8;
+	mhp->ibmr.rkey = mhp->ibmr.lkey = stag;
+	PDBG("%s mmid 0x%x mhp %p\n", __func__, mmid, mhp);
+	return insert_handle(mhp->rhp, &mhp->rhp->mmidr, mhp, mmid);
+}
+
+int iwch_register_mem(struct iwch_dev *rhp, struct iwch_pd *php,
+		      struct iwch_mr *mhp, int shift)
+{
+	u32 stag;
+	int ret;
+
+	if (cxio_register_phys_mem(&rhp->rdev,
+				   &stag, mhp->attr.pdid,
+				   mhp->attr.perms,
+				   mhp->attr.zbva,
+				   mhp->attr.va_fbo,
+				   mhp->attr.len,
+				   shift - 12,
+				   mhp->attr.pbl_size, mhp->attr.pbl_addr))
+		return -ENOMEM;
+
+	ret = iwch_finish_mem_reg(mhp, stag);
+	if (ret)
+		cxio_dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size,
+		       mhp->attr.pbl_addr);
+	return ret;
+}
+
+int iwch_reregister_mem(struct iwch_dev *rhp, struct iwch_pd *php,
+					struct iwch_mr *mhp,
+					int shift,
+					int npages)
+{
+	u32 stag;
+	int ret;
+
+	/* We could support this... */
+	if (npages > mhp->attr.pbl_size)
+		return -ENOMEM;
+
+	stag = mhp->attr.stag;
+	if (cxio_reregister_phys_mem(&rhp->rdev,
+				   &stag, mhp->attr.pdid,
+				   mhp->attr.perms,
+				   mhp->attr.zbva,
+				   mhp->attr.va_fbo,
+				   mhp->attr.len,
+				   shift - 12,
+				   mhp->attr.pbl_size, mhp->attr.pbl_addr))
+		return -ENOMEM;
+
+	ret = iwch_finish_mem_reg(mhp, stag);
+	if (ret)
+		cxio_dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size,
+		       mhp->attr.pbl_addr);
+
+	return ret;
+}
+
+int iwch_alloc_pbl(struct iwch_mr *mhp, int npages)
+{
+	mhp->attr.pbl_addr = cxio_hal_pblpool_alloc(&mhp->rhp->rdev,
+						    npages << 3);
+
+	if (!mhp->attr.pbl_addr)
+		return -ENOMEM;
+
+	mhp->attr.pbl_size = npages;
+
+	return 0;
+}
+
+void iwch_free_pbl(struct iwch_mr *mhp)
+{
+	cxio_hal_pblpool_free(&mhp->rhp->rdev, mhp->attr.pbl_addr,
+			      mhp->attr.pbl_size << 3);
+}
+
+int iwch_write_pbl(struct iwch_mr *mhp, __be64 *pages, int npages, int offset)
+{
+	return cxio_write_pbl(&mhp->rhp->rdev, pages,
+			      mhp->attr.pbl_addr + (offset << 3), npages);
+}
+
+int build_phys_page_list(struct ib_phys_buf *buffer_list,
+					int num_phys_buf,
+					u64 *iova_start,
+					u64 *total_size,
+					int *npages,
+					int *shift,
+					__be64 **page_list)
+{
+	u64 mask;
+	int i, j, n;
+
+	mask = 0;
+	*total_size = 0;
+	for (i = 0; i < num_phys_buf; ++i) {
+		if (i != 0 && buffer_list[i].addr & ~PAGE_MASK)
+			return -EINVAL;
+		if (i != 0 && i != num_phys_buf - 1 &&
+		    (buffer_list[i].size & ~PAGE_MASK))
+			return -EINVAL;
+		*total_size += buffer_list[i].size;
+		if (i > 0)
+			mask |= buffer_list[i].addr;
+		else
+			mask |= buffer_list[i].addr & PAGE_MASK;
+		if (i != num_phys_buf - 1)
+			mask |= buffer_list[i].addr + buffer_list[i].size;
+		else
+			mask |= (buffer_list[i].addr + buffer_list[i].size +
+				PAGE_SIZE - 1) & PAGE_MASK;
+	}
+
+	if (*total_size > 0xFFFFFFFFULL)
+		return -ENOMEM;
+
+	/* Find largest page shift we can use to cover buffers */
+	for (*shift = PAGE_SHIFT; *shift < 27; ++(*shift))
+		if ((1ULL << *shift) & mask)
+			break;
+
+	buffer_list[0].size += buffer_list[0].addr & ((1ULL << *shift) - 1);
+	buffer_list[0].addr &= ~0ull << *shift;
+
+	*npages = 0;
+	for (i = 0; i < num_phys_buf; ++i)
+		*npages += (buffer_list[i].size +
+			(1ULL << *shift) - 1) >> *shift;
+
+	if (!*npages)
+		return -EINVAL;
+
+	*page_list = kmalloc(sizeof(u64) * *npages, GFP_KERNEL);
+	if (!*page_list)
+		return -ENOMEM;
+
+	n = 0;
+	for (i = 0; i < num_phys_buf; ++i)
+		for (j = 0;
+		     j < (buffer_list[i].size + (1ULL << *shift) - 1) >> *shift;
+		     ++j)
+			(*page_list)[n++] = cpu_to_be64(buffer_list[i].addr +
+			    ((u64) j << *shift));
+
+	PDBG("%s va 0x%llx mask 0x%llx shift %d len %lld pbl_size %d\n",
+	     __func__, (unsigned long long) *iova_start,
+	     (unsigned long long) mask, *shift, (unsigned long long) *total_size,
+	     *npages);
+
+	return 0;
+
+}
diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c
new file mode 100644
index 0000000..811b24a
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c
@@ -0,0 +1,1467 @@
+/*
+ * Copyright (c) 2006 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/device.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/delay.h>
+#include <linux/errno.h>
+#include <linux/list.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/ethtool.h>
+#include <linux/rtnetlink.h>
+#include <linux/inetdevice.h>
+#include <linux/slab.h>
+
+#include <asm/io.h>
+#include <asm/irq.h>
+#include <asm/byteorder.h>
+
+#include <rdma/iw_cm.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_smi.h>
+#include <rdma/ib_umem.h>
+#include <rdma/ib_user_verbs.h>
+
+#include "cxio_hal.h"
+#include "iwch.h"
+#include "iwch_provider.h"
+#include "iwch_cm.h"
+#include "iwch_user.h"
+#include "common.h"
+
+static struct ib_ah *iwch_ah_create(struct ib_pd *pd,
+				    struct ib_ah_attr *ah_attr)
+{
+	return ERR_PTR(-ENOSYS);
+}
+
+static int iwch_ah_destroy(struct ib_ah *ah)
+{
+	return -ENOSYS;
+}
+
+static int iwch_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+	return -ENOSYS;
+}
+
+static int iwch_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+	return -ENOSYS;
+}
+
+static int iwch_process_mad(struct ib_device *ibdev,
+			    int mad_flags,
+			    u8 port_num,
+			    struct ib_wc *in_wc,
+			    struct ib_grh *in_grh,
+			    struct ib_mad *in_mad, struct ib_mad *out_mad)
+{
+	return -ENOSYS;
+}
+
+static int iwch_dealloc_ucontext(struct ib_ucontext *context)
+{
+	struct iwch_dev *rhp = to_iwch_dev(context->device);
+	struct iwch_ucontext *ucontext = to_iwch_ucontext(context);
+	struct iwch_mm_entry *mm, *tmp;
+
+	PDBG("%s context %p\n", __func__, context);
+	list_for_each_entry_safe(mm, tmp, &ucontext->mmaps, entry)
+		kfree(mm);
+	cxio_release_ucontext(&rhp->rdev, &ucontext->uctx);
+	kfree(ucontext);
+	return 0;
+}
+
+static struct ib_ucontext *iwch_alloc_ucontext(struct ib_device *ibdev,
+					struct ib_udata *udata)
+{
+	struct iwch_ucontext *context;
+	struct iwch_dev *rhp = to_iwch_dev(ibdev);
+
+	PDBG("%s ibdev %p\n", __func__, ibdev);
+	context = kzalloc(sizeof(*context), GFP_KERNEL);
+	if (!context)
+		return ERR_PTR(-ENOMEM);
+	cxio_init_ucontext(&rhp->rdev, &context->uctx);
+	INIT_LIST_HEAD(&context->mmaps);
+	spin_lock_init(&context->mmap_lock);
+	return &context->ibucontext;
+}
+
+static int iwch_destroy_cq(struct ib_cq *ib_cq)
+{
+	struct iwch_cq *chp;
+
+	PDBG("%s ib_cq %p\n", __func__, ib_cq);
+	chp = to_iwch_cq(ib_cq);
+
+	remove_handle(chp->rhp, &chp->rhp->cqidr, chp->cq.cqid);
+	atomic_dec(&chp->refcnt);
+	wait_event(chp->wait, !atomic_read(&chp->refcnt));
+
+	cxio_destroy_cq(&chp->rhp->rdev, &chp->cq);
+	kfree(chp);
+	return 0;
+}
+
+static struct ib_cq *iwch_create_cq(struct ib_device *ibdev, int entries, int vector,
+			     struct ib_ucontext *ib_context,
+			     struct ib_udata *udata)
+{
+	struct iwch_dev *rhp;
+	struct iwch_cq *chp;
+	struct iwch_create_cq_resp uresp;
+	struct iwch_create_cq_req ureq;
+	struct iwch_ucontext *ucontext = NULL;
+	static int warned;
+	size_t resplen;
+
+	PDBG("%s ib_dev %p entries %d\n", __func__, ibdev, entries);
+	rhp = to_iwch_dev(ibdev);
+	chp = kzalloc(sizeof(*chp), GFP_KERNEL);
+	if (!chp)
+		return ERR_PTR(-ENOMEM);
+
+	if (ib_context) {
+		ucontext = to_iwch_ucontext(ib_context);
+		if (!t3a_device(rhp)) {
+			if (ib_copy_from_udata(&ureq, udata, sizeof (ureq))) {
+				kfree(chp);
+				return ERR_PTR(-EFAULT);
+			}
+			chp->user_rptr_addr = (u32 __user *)(unsigned long)ureq.user_rptr_addr;
+		}
+	}
+
+	if (t3a_device(rhp)) {
+
+		/*
+		 * T3A: Add some fluff to handle extra CQEs inserted
+		 * for various errors.
+		 * Additional CQE possibilities:
+		 *      TERMINATE,
+		 *      incoming RDMA WRITE Failures
+		 *      incoming RDMA READ REQUEST FAILUREs
+		 * NOTE: We cannot ensure the CQ won't overflow.
+		 */
+		entries += 16;
+	}
+	entries = roundup_pow_of_two(entries);
+	chp->cq.size_log2 = ilog2(entries);
+
+	if (cxio_create_cq(&rhp->rdev, &chp->cq, !ucontext)) {
+		kfree(chp);
+		return ERR_PTR(-ENOMEM);
+	}
+	chp->rhp = rhp;
+	chp->ibcq.cqe = 1 << chp->cq.size_log2;
+	spin_lock_init(&chp->lock);
+	spin_lock_init(&chp->comp_handler_lock);
+	atomic_set(&chp->refcnt, 1);
+	init_waitqueue_head(&chp->wait);
+	if (insert_handle(rhp, &rhp->cqidr, chp, chp->cq.cqid)) {
+		cxio_destroy_cq(&chp->rhp->rdev, &chp->cq);
+		kfree(chp);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	if (ucontext) {
+		struct iwch_mm_entry *mm;
+
+		mm = kmalloc(sizeof *mm, GFP_KERNEL);
+		if (!mm) {
+			iwch_destroy_cq(&chp->ibcq);
+			return ERR_PTR(-ENOMEM);
+		}
+		uresp.cqid = chp->cq.cqid;
+		uresp.size_log2 = chp->cq.size_log2;
+		spin_lock(&ucontext->mmap_lock);
+		uresp.key = ucontext->key;
+		ucontext->key += PAGE_SIZE;
+		spin_unlock(&ucontext->mmap_lock);
+		mm->key = uresp.key;
+		mm->addr = virt_to_phys(chp->cq.queue);
+		if (udata->outlen < sizeof uresp) {
+			if (!warned++)
+				printk(KERN_WARNING MOD "Warning - "
+				       "downlevel libcxgb3 (non-fatal).\n");
+			mm->len = PAGE_ALIGN((1UL << uresp.size_log2) *
+					     sizeof(struct t3_cqe));
+			resplen = sizeof(struct iwch_create_cq_resp_v0);
+		} else {
+			mm->len = PAGE_ALIGN(((1UL << uresp.size_log2) + 1) *
+					     sizeof(struct t3_cqe));
+			uresp.memsize = mm->len;
+			uresp.reserved = 0;
+			resplen = sizeof uresp;
+		}
+		if (ib_copy_to_udata(udata, &uresp, resplen)) {
+			kfree(mm);
+			iwch_destroy_cq(&chp->ibcq);
+			return ERR_PTR(-EFAULT);
+		}
+		insert_mmap(ucontext, mm);
+	}
+	PDBG("created cqid 0x%0x chp %p size 0x%0x, dma_addr 0x%0llx\n",
+	     chp->cq.cqid, chp, (1 << chp->cq.size_log2),
+	     (unsigned long long) chp->cq.dma_addr);
+	return &chp->ibcq;
+}
+
+static int iwch_resize_cq(struct ib_cq *cq, int cqe, struct ib_udata *udata)
+{
+#ifdef notyet
+	struct iwch_cq *chp = to_iwch_cq(cq);
+	struct t3_cq oldcq, newcq;
+	int ret;
+
+	PDBG("%s ib_cq %p cqe %d\n", __func__, cq, cqe);
+
+	/* We don't downsize... */
+	if (cqe <= cq->cqe)
+		return 0;
+
+	/* create new t3_cq with new size */
+	cqe = roundup_pow_of_two(cqe+1);
+	newcq.size_log2 = ilog2(cqe);
+
+	/* Dont allow resize to less than the current wce count */
+	if (cqe < Q_COUNT(chp->cq.rptr, chp->cq.wptr)) {
+		return -ENOMEM;
+	}
+
+	/* Quiesce all QPs using this CQ */
+	ret = iwch_quiesce_qps(chp);
+	if (ret) {
+		return ret;
+	}
+
+	ret = cxio_create_cq(&chp->rhp->rdev, &newcq);
+	if (ret) {
+		return ret;
+	}
+
+	/* copy CQEs */
+	memcpy(newcq.queue, chp->cq.queue, (1 << chp->cq.size_log2) *
+				        sizeof(struct t3_cqe));
+
+	/* old iwch_qp gets new t3_cq but keeps old cqid */
+	oldcq = chp->cq;
+	chp->cq = newcq;
+	chp->cq.cqid = oldcq.cqid;
+
+	/* resize new t3_cq to update the HW context */
+	ret = cxio_resize_cq(&chp->rhp->rdev, &chp->cq);
+	if (ret) {
+		chp->cq = oldcq;
+		return ret;
+	}
+	chp->ibcq.cqe = (1<<chp->cq.size_log2) - 1;
+
+	/* destroy old t3_cq */
+	oldcq.cqid = newcq.cqid;
+	ret = cxio_destroy_cq(&chp->rhp->rdev, &oldcq);
+	if (ret) {
+		printk(KERN_ERR MOD "%s - cxio_destroy_cq failed %d\n",
+			__func__, ret);
+	}
+
+	/* add user hooks here */
+
+	/* resume qps */
+	ret = iwch_resume_qps(chp);
+	return ret;
+#else
+	return -ENOSYS;
+#endif
+}
+
+static int iwch_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags)
+{
+	struct iwch_dev *rhp;
+	struct iwch_cq *chp;
+	enum t3_cq_opcode cq_op;
+	int err;
+	unsigned long flag;
+	u32 rptr;
+
+	chp = to_iwch_cq(ibcq);
+	rhp = chp->rhp;
+	if ((flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED)
+		cq_op = CQ_ARM_SE;
+	else
+		cq_op = CQ_ARM_AN;
+	if (chp->user_rptr_addr) {
+		if (get_user(rptr, chp->user_rptr_addr))
+			return -EFAULT;
+		spin_lock_irqsave(&chp->lock, flag);
+		chp->cq.rptr = rptr;
+	} else
+		spin_lock_irqsave(&chp->lock, flag);
+	PDBG("%s rptr 0x%x\n", __func__, chp->cq.rptr);
+	err = cxio_hal_cq_op(&rhp->rdev, &chp->cq, cq_op, 0);
+	spin_unlock_irqrestore(&chp->lock, flag);
+	if (err < 0)
+		printk(KERN_ERR MOD "Error %d rearming CQID 0x%x\n", err,
+		       chp->cq.cqid);
+	if (err > 0 && !(flags & IB_CQ_REPORT_MISSED_EVENTS))
+		err = 0;
+	return err;
+}
+
+static int iwch_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
+{
+	int len = vma->vm_end - vma->vm_start;
+	u32 key = vma->vm_pgoff << PAGE_SHIFT;
+	struct cxio_rdev *rdev_p;
+	int ret = 0;
+	struct iwch_mm_entry *mm;
+	struct iwch_ucontext *ucontext;
+	u64 addr;
+
+	PDBG("%s pgoff 0x%lx key 0x%x len %d\n", __func__, vma->vm_pgoff,
+	     key, len);
+
+	if (vma->vm_start & (PAGE_SIZE-1)) {
+	        return -EINVAL;
+	}
+
+	rdev_p = &(to_iwch_dev(context->device)->rdev);
+	ucontext = to_iwch_ucontext(context);
+
+	mm = remove_mmap(ucontext, key, len);
+	if (!mm)
+		return -EINVAL;
+	addr = mm->addr;
+	kfree(mm);
+
+	if ((addr >= rdev_p->rnic_info.udbell_physbase) &&
+	    (addr < (rdev_p->rnic_info.udbell_physbase +
+		       rdev_p->rnic_info.udbell_len))) {
+
+		/*
+		 * Map T3 DB register.
+		 */
+		if (vma->vm_flags & VM_READ) {
+			return -EPERM;
+		}
+
+		vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+		vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND;
+		vma->vm_flags &= ~VM_MAYREAD;
+		ret = io_remap_pfn_range(vma, vma->vm_start,
+					 addr >> PAGE_SHIFT,
+				         len, vma->vm_page_prot);
+	} else {
+
+		/*
+		 * Map WQ or CQ contig dma memory...
+		 */
+		ret = remap_pfn_range(vma, vma->vm_start,
+				      addr >> PAGE_SHIFT,
+				      len, vma->vm_page_prot);
+	}
+
+	return ret;
+}
+
+static int iwch_deallocate_pd(struct ib_pd *pd)
+{
+	struct iwch_dev *rhp;
+	struct iwch_pd *php;
+
+	php = to_iwch_pd(pd);
+	rhp = php->rhp;
+	PDBG("%s ibpd %p pdid 0x%x\n", __func__, pd, php->pdid);
+	cxio_hal_put_pdid(rhp->rdev.rscp, php->pdid);
+	kfree(php);
+	return 0;
+}
+
+static struct ib_pd *iwch_allocate_pd(struct ib_device *ibdev,
+			       struct ib_ucontext *context,
+			       struct ib_udata *udata)
+{
+	struct iwch_pd *php;
+	u32 pdid;
+	struct iwch_dev *rhp;
+
+	PDBG("%s ibdev %p\n", __func__, ibdev);
+	rhp = (struct iwch_dev *) ibdev;
+	pdid = cxio_hal_get_pdid(rhp->rdev.rscp);
+	if (!pdid)
+		return ERR_PTR(-EINVAL);
+	php = kzalloc(sizeof(*php), GFP_KERNEL);
+	if (!php) {
+		cxio_hal_put_pdid(rhp->rdev.rscp, pdid);
+		return ERR_PTR(-ENOMEM);
+	}
+	php->pdid = pdid;
+	php->rhp = rhp;
+	if (context) {
+		if (ib_copy_to_udata(udata, &php->pdid, sizeof (__u32))) {
+			iwch_deallocate_pd(&php->ibpd);
+			return ERR_PTR(-EFAULT);
+		}
+	}
+	PDBG("%s pdid 0x%0x ptr 0x%p\n", __func__, pdid, php);
+	return &php->ibpd;
+}
+
+static int iwch_dereg_mr(struct ib_mr *ib_mr)
+{
+	struct iwch_dev *rhp;
+	struct iwch_mr *mhp;
+	u32 mmid;
+
+	PDBG("%s ib_mr %p\n", __func__, ib_mr);
+	/* There can be no memory windows */
+	if (atomic_read(&ib_mr->usecnt))
+		return -EINVAL;
+
+	mhp = to_iwch_mr(ib_mr);
+	rhp = mhp->rhp;
+	mmid = mhp->attr.stag >> 8;
+	cxio_dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size,
+		       mhp->attr.pbl_addr);
+	iwch_free_pbl(mhp);
+	remove_handle(rhp, &rhp->mmidr, mmid);
+	if (mhp->kva)
+		kfree((void *) (unsigned long) mhp->kva);
+	if (mhp->umem)
+		ib_umem_release(mhp->umem);
+	PDBG("%s mmid 0x%x ptr %p\n", __func__, mmid, mhp);
+	kfree(mhp);
+	return 0;
+}
+
+static struct ib_mr *iwch_register_phys_mem(struct ib_pd *pd,
+					struct ib_phys_buf *buffer_list,
+					int num_phys_buf,
+					int acc,
+					u64 *iova_start)
+{
+	__be64 *page_list;
+	int shift;
+	u64 total_size;
+	int npages;
+	struct iwch_dev *rhp;
+	struct iwch_pd *php;
+	struct iwch_mr *mhp;
+	int ret;
+
+	PDBG("%s ib_pd %p\n", __func__, pd);
+	php = to_iwch_pd(pd);
+	rhp = php->rhp;
+
+	mhp = kzalloc(sizeof(*mhp), GFP_KERNEL);
+	if (!mhp)
+		return ERR_PTR(-ENOMEM);
+
+	mhp->rhp = rhp;
+
+	/* First check that we have enough alignment */
+	if ((*iova_start & ~PAGE_MASK) != (buffer_list[0].addr & ~PAGE_MASK)) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	if (num_phys_buf > 1 &&
+	    ((buffer_list[0].addr + buffer_list[0].size) & ~PAGE_MASK)) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	ret = build_phys_page_list(buffer_list, num_phys_buf, iova_start,
+				   &total_size, &npages, &shift, &page_list);
+	if (ret)
+		goto err;
+
+	ret = iwch_alloc_pbl(mhp, npages);
+	if (ret) {
+		kfree(page_list);
+		goto err_pbl;
+	}
+
+	ret = iwch_write_pbl(mhp, page_list, npages, 0);
+	kfree(page_list);
+	if (ret)
+		goto err_pbl;
+
+	mhp->attr.pdid = php->pdid;
+	mhp->attr.zbva = 0;
+
+	mhp->attr.perms = iwch_ib_to_tpt_access(acc);
+	mhp->attr.va_fbo = *iova_start;
+	mhp->attr.page_size = shift - 12;
+
+	mhp->attr.len = (u32) total_size;
+	mhp->attr.pbl_size = npages;
+	ret = iwch_register_mem(rhp, php, mhp, shift);
+	if (ret)
+		goto err_pbl;
+
+	return &mhp->ibmr;
+
+err_pbl:
+	iwch_free_pbl(mhp);
+
+err:
+	kfree(mhp);
+	return ERR_PTR(ret);
+
+}
+
+static int iwch_reregister_phys_mem(struct ib_mr *mr,
+				     int mr_rereg_mask,
+				     struct ib_pd *pd,
+	                             struct ib_phys_buf *buffer_list,
+	                             int num_phys_buf,
+	                             int acc, u64 * iova_start)
+{
+
+	struct iwch_mr mh, *mhp;
+	struct iwch_pd *php;
+	struct iwch_dev *rhp;
+	__be64 *page_list = NULL;
+	int shift = 0;
+	u64 total_size;
+	int npages = 0;
+	int ret;
+
+	PDBG("%s ib_mr %p ib_pd %p\n", __func__, mr, pd);
+
+	/* There can be no memory windows */
+	if (atomic_read(&mr->usecnt))
+		return -EINVAL;
+
+	mhp = to_iwch_mr(mr);
+	rhp = mhp->rhp;
+	php = to_iwch_pd(mr->pd);
+
+	/* make sure we are on the same adapter */
+	if (rhp != php->rhp)
+		return -EINVAL;
+
+	memcpy(&mh, mhp, sizeof *mhp);
+
+	if (mr_rereg_mask & IB_MR_REREG_PD)
+		php = to_iwch_pd(pd);
+	if (mr_rereg_mask & IB_MR_REREG_ACCESS)
+		mh.attr.perms = iwch_ib_to_tpt_access(acc);
+	if (mr_rereg_mask & IB_MR_REREG_TRANS) {
+		ret = build_phys_page_list(buffer_list, num_phys_buf,
+					   iova_start,
+					   &total_size, &npages,
+					   &shift, &page_list);
+		if (ret)
+			return ret;
+	}
+
+	ret = iwch_reregister_mem(rhp, php, &mh, shift, npages);
+	kfree(page_list);
+	if (ret) {
+		return ret;
+	}
+	if (mr_rereg_mask & IB_MR_REREG_PD)
+		mhp->attr.pdid = php->pdid;
+	if (mr_rereg_mask & IB_MR_REREG_ACCESS)
+		mhp->attr.perms = iwch_ib_to_tpt_access(acc);
+	if (mr_rereg_mask & IB_MR_REREG_TRANS) {
+		mhp->attr.zbva = 0;
+		mhp->attr.va_fbo = *iova_start;
+		mhp->attr.page_size = shift - 12;
+		mhp->attr.len = (u32) total_size;
+		mhp->attr.pbl_size = npages;
+	}
+
+	return 0;
+}
+
+
+static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+				      u64 virt, int acc, struct ib_udata *udata)
+{
+	__be64 *pages;
+	int shift, n, len;
+	int i, k, entry;
+	int err = 0;
+	struct iwch_dev *rhp;
+	struct iwch_pd *php;
+	struct iwch_mr *mhp;
+	struct iwch_reg_user_mr_resp uresp;
+	struct scatterlist *sg;
+	PDBG("%s ib_pd %p\n", __func__, pd);
+
+	php = to_iwch_pd(pd);
+	rhp = php->rhp;
+	mhp = kzalloc(sizeof(*mhp), GFP_KERNEL);
+	if (!mhp)
+		return ERR_PTR(-ENOMEM);
+
+	mhp->rhp = rhp;
+
+	mhp->umem = ib_umem_get(pd->uobject->context, start, length, acc, 0);
+	if (IS_ERR(mhp->umem)) {
+		err = PTR_ERR(mhp->umem);
+		kfree(mhp);
+		return ERR_PTR(err);
+	}
+
+	shift = ffs(mhp->umem->page_size) - 1;
+
+	n = mhp->umem->nmap;
+
+	err = iwch_alloc_pbl(mhp, n);
+	if (err)
+		goto err;
+
+	pages = (__be64 *) __get_free_page(GFP_KERNEL);
+	if (!pages) {
+		err = -ENOMEM;
+		goto err_pbl;
+	}
+
+	i = n = 0;
+
+	for_each_sg(mhp->umem->sg_head.sgl, sg, mhp->umem->nmap, entry) {
+			len = sg_dma_len(sg) >> shift;
+			for (k = 0; k < len; ++k) {
+				pages[i++] = cpu_to_be64(sg_dma_address(sg) +
+					mhp->umem->page_size * k);
+				if (i == PAGE_SIZE / sizeof *pages) {
+					err = iwch_write_pbl(mhp, pages, i, n);
+					if (err)
+						goto pbl_done;
+					n += i;
+					i = 0;
+				}
+			}
+	}
+
+	if (i)
+		err = iwch_write_pbl(mhp, pages, i, n);
+
+pbl_done:
+	free_page((unsigned long) pages);
+	if (err)
+		goto err_pbl;
+
+	mhp->attr.pdid = php->pdid;
+	mhp->attr.zbva = 0;
+	mhp->attr.perms = iwch_ib_to_tpt_access(acc);
+	mhp->attr.va_fbo = virt;
+	mhp->attr.page_size = shift - 12;
+	mhp->attr.len = (u32) length;
+
+	err = iwch_register_mem(rhp, php, mhp, shift);
+	if (err)
+		goto err_pbl;
+
+	if (udata && !t3a_device(rhp)) {
+		uresp.pbl_addr = (mhp->attr.pbl_addr -
+				 rhp->rdev.rnic_info.pbl_base) >> 3;
+		PDBG("%s user resp pbl_addr 0x%x\n", __func__,
+		     uresp.pbl_addr);
+
+		if (ib_copy_to_udata(udata, &uresp, sizeof (uresp))) {
+			iwch_dereg_mr(&mhp->ibmr);
+			err = -EFAULT;
+			goto err;
+		}
+	}
+
+	return &mhp->ibmr;
+
+err_pbl:
+	iwch_free_pbl(mhp);
+
+err:
+	ib_umem_release(mhp->umem);
+	kfree(mhp);
+	return ERR_PTR(err);
+}
+
+static struct ib_mr *iwch_get_dma_mr(struct ib_pd *pd, int acc)
+{
+	struct ib_phys_buf bl;
+	u64 kva;
+	struct ib_mr *ibmr;
+
+	PDBG("%s ib_pd %p\n", __func__, pd);
+
+	/*
+	 * T3 only supports 32 bits of size.
+	 */
+	bl.size = 0xffffffff;
+	bl.addr = 0;
+	kva = 0;
+	ibmr = iwch_register_phys_mem(pd, &bl, 1, acc, &kva);
+	return ibmr;
+}
+
+static struct ib_mw *iwch_alloc_mw(struct ib_pd *pd, enum ib_mw_type type)
+{
+	struct iwch_dev *rhp;
+	struct iwch_pd *php;
+	struct iwch_mw *mhp;
+	u32 mmid;
+	u32 stag = 0;
+	int ret;
+
+	if (type != IB_MW_TYPE_1)
+		return ERR_PTR(-EINVAL);
+
+	php = to_iwch_pd(pd);
+	rhp = php->rhp;
+	mhp = kzalloc(sizeof(*mhp), GFP_KERNEL);
+	if (!mhp)
+		return ERR_PTR(-ENOMEM);
+	ret = cxio_allocate_window(&rhp->rdev, &stag, php->pdid);
+	if (ret) {
+		kfree(mhp);
+		return ERR_PTR(ret);
+	}
+	mhp->rhp = rhp;
+	mhp->attr.pdid = php->pdid;
+	mhp->attr.type = TPT_MW;
+	mhp->attr.stag = stag;
+	mmid = (stag) >> 8;
+	mhp->ibmw.rkey = stag;
+	if (insert_handle(rhp, &rhp->mmidr, mhp, mmid)) {
+		cxio_deallocate_window(&rhp->rdev, mhp->attr.stag);
+		kfree(mhp);
+		return ERR_PTR(-ENOMEM);
+	}
+	PDBG("%s mmid 0x%x mhp %p stag 0x%x\n", __func__, mmid, mhp, stag);
+	return &(mhp->ibmw);
+}
+
+static int iwch_dealloc_mw(struct ib_mw *mw)
+{
+	struct iwch_dev *rhp;
+	struct iwch_mw *mhp;
+	u32 mmid;
+
+	mhp = to_iwch_mw(mw);
+	rhp = mhp->rhp;
+	mmid = (mw->rkey) >> 8;
+	cxio_deallocate_window(&rhp->rdev, mhp->attr.stag);
+	remove_handle(rhp, &rhp->mmidr, mmid);
+	PDBG("%s ib_mw %p mmid 0x%x ptr %p\n", __func__, mw, mmid, mhp);
+	kfree(mhp);
+	return 0;
+}
+
+static struct ib_mr *iwch_alloc_fast_reg_mr(struct ib_pd *pd, int pbl_depth)
+{
+	struct iwch_dev *rhp;
+	struct iwch_pd *php;
+	struct iwch_mr *mhp;
+	u32 mmid;
+	u32 stag = 0;
+	int ret = 0;
+
+	php = to_iwch_pd(pd);
+	rhp = php->rhp;
+	mhp = kzalloc(sizeof(*mhp), GFP_KERNEL);
+	if (!mhp)
+		goto err;
+
+	mhp->rhp = rhp;
+	ret = iwch_alloc_pbl(mhp, pbl_depth);
+	if (ret)
+		goto err1;
+	mhp->attr.pbl_size = pbl_depth;
+	ret = cxio_allocate_stag(&rhp->rdev, &stag, php->pdid,
+				 mhp->attr.pbl_size, mhp->attr.pbl_addr);
+	if (ret)
+		goto err2;
+	mhp->attr.pdid = php->pdid;
+	mhp->attr.type = TPT_NON_SHARED_MR;
+	mhp->attr.stag = stag;
+	mhp->attr.state = 1;
+	mmid = (stag) >> 8;
+	mhp->ibmr.rkey = mhp->ibmr.lkey = stag;
+	if (insert_handle(rhp, &rhp->mmidr, mhp, mmid))
+		goto err3;
+
+	PDBG("%s mmid 0x%x mhp %p stag 0x%x\n", __func__, mmid, mhp, stag);
+	return &(mhp->ibmr);
+err3:
+	cxio_dereg_mem(&rhp->rdev, stag, mhp->attr.pbl_size,
+		       mhp->attr.pbl_addr);
+err2:
+	iwch_free_pbl(mhp);
+err1:
+	kfree(mhp);
+err:
+	return ERR_PTR(ret);
+}
+
+static struct ib_fast_reg_page_list *iwch_alloc_fastreg_pbl(
+					struct ib_device *device,
+					int page_list_len)
+{
+	struct ib_fast_reg_page_list *page_list;
+
+	page_list = kmalloc(sizeof *page_list + page_list_len * sizeof(u64),
+			    GFP_KERNEL);
+	if (!page_list)
+		return ERR_PTR(-ENOMEM);
+
+	page_list->page_list = (u64 *)(page_list + 1);
+	page_list->max_page_list_len = page_list_len;
+
+	return page_list;
+}
+
+static void iwch_free_fastreg_pbl(struct ib_fast_reg_page_list *page_list)
+{
+	kfree(page_list);
+}
+
+static int iwch_destroy_qp(struct ib_qp *ib_qp)
+{
+	struct iwch_dev *rhp;
+	struct iwch_qp *qhp;
+	struct iwch_qp_attributes attrs;
+	struct iwch_ucontext *ucontext;
+
+	qhp = to_iwch_qp(ib_qp);
+	rhp = qhp->rhp;
+
+	attrs.next_state = IWCH_QP_STATE_ERROR;
+	iwch_modify_qp(rhp, qhp, IWCH_QP_ATTR_NEXT_STATE, &attrs, 0);
+	wait_event(qhp->wait, !qhp->ep);
+
+	remove_handle(rhp, &rhp->qpidr, qhp->wq.qpid);
+
+	atomic_dec(&qhp->refcnt);
+	wait_event(qhp->wait, !atomic_read(&qhp->refcnt));
+
+	ucontext = ib_qp->uobject ? to_iwch_ucontext(ib_qp->uobject->context)
+				  : NULL;
+	cxio_destroy_qp(&rhp->rdev, &qhp->wq,
+			ucontext ? &ucontext->uctx : &rhp->rdev.uctx);
+
+	PDBG("%s ib_qp %p qpid 0x%0x qhp %p\n", __func__,
+	     ib_qp, qhp->wq.qpid, qhp);
+	kfree(qhp);
+	return 0;
+}
+
+static struct ib_qp *iwch_create_qp(struct ib_pd *pd,
+			     struct ib_qp_init_attr *attrs,
+			     struct ib_udata *udata)
+{
+	struct iwch_dev *rhp;
+	struct iwch_qp *qhp;
+	struct iwch_pd *php;
+	struct iwch_cq *schp;
+	struct iwch_cq *rchp;
+	struct iwch_create_qp_resp uresp;
+	int wqsize, sqsize, rqsize;
+	struct iwch_ucontext *ucontext;
+
+	PDBG("%s ib_pd %p\n", __func__, pd);
+	if (attrs->qp_type != IB_QPT_RC)
+		return ERR_PTR(-EINVAL);
+	php = to_iwch_pd(pd);
+	rhp = php->rhp;
+	schp = get_chp(rhp, ((struct iwch_cq *) attrs->send_cq)->cq.cqid);
+	rchp = get_chp(rhp, ((struct iwch_cq *) attrs->recv_cq)->cq.cqid);
+	if (!schp || !rchp)
+		return ERR_PTR(-EINVAL);
+
+	/* The RQT size must be # of entries + 1 rounded up to a power of two */
+	rqsize = roundup_pow_of_two(attrs->cap.max_recv_wr);
+	if (rqsize == attrs->cap.max_recv_wr)
+		rqsize = roundup_pow_of_two(attrs->cap.max_recv_wr+1);
+
+	/* T3 doesn't support RQT depth < 16 */
+	if (rqsize < 16)
+		rqsize = 16;
+
+	if (rqsize > T3_MAX_RQ_SIZE)
+		return ERR_PTR(-EINVAL);
+
+	if (attrs->cap.max_inline_data > T3_MAX_INLINE)
+		return ERR_PTR(-EINVAL);
+
+	/*
+	 * NOTE: The SQ and total WQ sizes don't need to be
+	 * a power of two.  However, all the code assumes
+	 * they are. EG: Q_FREECNT() and friends.
+	 */
+	sqsize = roundup_pow_of_two(attrs->cap.max_send_wr);
+	wqsize = roundup_pow_of_two(rqsize + sqsize);
+
+	/*
+	 * Kernel users need more wq space for fastreg WRs which can take
+	 * 2 WR fragments.
+	 */
+	ucontext = pd->uobject ? to_iwch_ucontext(pd->uobject->context) : NULL;
+	if (!ucontext && wqsize < (rqsize + (2 * sqsize)))
+		wqsize = roundup_pow_of_two(rqsize +
+				roundup_pow_of_two(attrs->cap.max_send_wr * 2));
+	PDBG("%s wqsize %d sqsize %d rqsize %d\n", __func__,
+	     wqsize, sqsize, rqsize);
+	qhp = kzalloc(sizeof(*qhp), GFP_KERNEL);
+	if (!qhp)
+		return ERR_PTR(-ENOMEM);
+	qhp->wq.size_log2 = ilog2(wqsize);
+	qhp->wq.rq_size_log2 = ilog2(rqsize);
+	qhp->wq.sq_size_log2 = ilog2(sqsize);
+	if (cxio_create_qp(&rhp->rdev, !udata, &qhp->wq,
+			   ucontext ? &ucontext->uctx : &rhp->rdev.uctx)) {
+		kfree(qhp);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	attrs->cap.max_recv_wr = rqsize - 1;
+	attrs->cap.max_send_wr = sqsize;
+	attrs->cap.max_inline_data = T3_MAX_INLINE;
+
+	qhp->rhp = rhp;
+	qhp->attr.pd = php->pdid;
+	qhp->attr.scq = ((struct iwch_cq *) attrs->send_cq)->cq.cqid;
+	qhp->attr.rcq = ((struct iwch_cq *) attrs->recv_cq)->cq.cqid;
+	qhp->attr.sq_num_entries = attrs->cap.max_send_wr;
+	qhp->attr.rq_num_entries = attrs->cap.max_recv_wr;
+	qhp->attr.sq_max_sges = attrs->cap.max_send_sge;
+	qhp->attr.sq_max_sges_rdma_write = attrs->cap.max_send_sge;
+	qhp->attr.rq_max_sges = attrs->cap.max_recv_sge;
+	qhp->attr.state = IWCH_QP_STATE_IDLE;
+	qhp->attr.next_state = IWCH_QP_STATE_IDLE;
+
+	/*
+	 * XXX - These don't get passed in from the openib user
+	 * at create time.  The CM sets them via a QP modify.
+	 * Need to fix...  I think the CM should
+	 */
+	qhp->attr.enable_rdma_read = 1;
+	qhp->attr.enable_rdma_write = 1;
+	qhp->attr.enable_bind = 1;
+	qhp->attr.max_ord = 1;
+	qhp->attr.max_ird = 1;
+
+	spin_lock_init(&qhp->lock);
+	init_waitqueue_head(&qhp->wait);
+	atomic_set(&qhp->refcnt, 1);
+
+	if (insert_handle(rhp, &rhp->qpidr, qhp, qhp->wq.qpid)) {
+		cxio_destroy_qp(&rhp->rdev, &qhp->wq,
+			ucontext ? &ucontext->uctx : &rhp->rdev.uctx);
+		kfree(qhp);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	if (udata) {
+
+		struct iwch_mm_entry *mm1, *mm2;
+
+		mm1 = kmalloc(sizeof *mm1, GFP_KERNEL);
+		if (!mm1) {
+			iwch_destroy_qp(&qhp->ibqp);
+			return ERR_PTR(-ENOMEM);
+		}
+
+		mm2 = kmalloc(sizeof *mm2, GFP_KERNEL);
+		if (!mm2) {
+			kfree(mm1);
+			iwch_destroy_qp(&qhp->ibqp);
+			return ERR_PTR(-ENOMEM);
+		}
+
+		uresp.qpid = qhp->wq.qpid;
+		uresp.size_log2 = qhp->wq.size_log2;
+		uresp.sq_size_log2 = qhp->wq.sq_size_log2;
+		uresp.rq_size_log2 = qhp->wq.rq_size_log2;
+		spin_lock(&ucontext->mmap_lock);
+		uresp.key = ucontext->key;
+		ucontext->key += PAGE_SIZE;
+		uresp.db_key = ucontext->key;
+		ucontext->key += PAGE_SIZE;
+		spin_unlock(&ucontext->mmap_lock);
+		if (ib_copy_to_udata(udata, &uresp, sizeof (uresp))) {
+			kfree(mm1);
+			kfree(mm2);
+			iwch_destroy_qp(&qhp->ibqp);
+			return ERR_PTR(-EFAULT);
+		}
+		mm1->key = uresp.key;
+		mm1->addr = virt_to_phys(qhp->wq.queue);
+		mm1->len = PAGE_ALIGN(wqsize * sizeof (union t3_wr));
+		insert_mmap(ucontext, mm1);
+		mm2->key = uresp.db_key;
+		mm2->addr = qhp->wq.udb & PAGE_MASK;
+		mm2->len = PAGE_SIZE;
+		insert_mmap(ucontext, mm2);
+	}
+	qhp->ibqp.qp_num = qhp->wq.qpid;
+	init_timer(&(qhp->timer));
+	PDBG("%s sq_num_entries %d, rq_num_entries %d "
+	     "qpid 0x%0x qhp %p dma_addr 0x%llx size %d rq_addr 0x%x\n",
+	     __func__, qhp->attr.sq_num_entries, qhp->attr.rq_num_entries,
+	     qhp->wq.qpid, qhp, (unsigned long long) qhp->wq.dma_addr,
+	     1 << qhp->wq.size_log2, qhp->wq.rq_addr);
+	return &qhp->ibqp;
+}
+
+static int iwch_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+		      int attr_mask, struct ib_udata *udata)
+{
+	struct iwch_dev *rhp;
+	struct iwch_qp *qhp;
+	enum iwch_qp_attr_mask mask = 0;
+	struct iwch_qp_attributes attrs;
+
+	PDBG("%s ib_qp %p\n", __func__, ibqp);
+
+	/* iwarp does not support the RTR state */
+	if ((attr_mask & IB_QP_STATE) && (attr->qp_state == IB_QPS_RTR))
+		attr_mask &= ~IB_QP_STATE;
+
+	/* Make sure we still have something left to do */
+	if (!attr_mask)
+		return 0;
+
+	memset(&attrs, 0, sizeof attrs);
+	qhp = to_iwch_qp(ibqp);
+	rhp = qhp->rhp;
+
+	attrs.next_state = iwch_convert_state(attr->qp_state);
+	attrs.enable_rdma_read = (attr->qp_access_flags &
+			       IB_ACCESS_REMOTE_READ) ?  1 : 0;
+	attrs.enable_rdma_write = (attr->qp_access_flags &
+				IB_ACCESS_REMOTE_WRITE) ? 1 : 0;
+	attrs.enable_bind = (attr->qp_access_flags & IB_ACCESS_MW_BIND) ? 1 : 0;
+
+
+	mask |= (attr_mask & IB_QP_STATE) ? IWCH_QP_ATTR_NEXT_STATE : 0;
+	mask |= (attr_mask & IB_QP_ACCESS_FLAGS) ?
+			(IWCH_QP_ATTR_ENABLE_RDMA_READ |
+			 IWCH_QP_ATTR_ENABLE_RDMA_WRITE |
+			 IWCH_QP_ATTR_ENABLE_RDMA_BIND) : 0;
+
+	return iwch_modify_qp(rhp, qhp, mask, &attrs, 0);
+}
+
+void iwch_qp_add_ref(struct ib_qp *qp)
+{
+	PDBG("%s ib_qp %p\n", __func__, qp);
+	atomic_inc(&(to_iwch_qp(qp)->refcnt));
+}
+
+void iwch_qp_rem_ref(struct ib_qp *qp)
+{
+	PDBG("%s ib_qp %p\n", __func__, qp);
+	if (atomic_dec_and_test(&(to_iwch_qp(qp)->refcnt)))
+	        wake_up(&(to_iwch_qp(qp)->wait));
+}
+
+static struct ib_qp *iwch_get_qp(struct ib_device *dev, int qpn)
+{
+	PDBG("%s ib_dev %p qpn 0x%x\n", __func__, dev, qpn);
+	return (struct ib_qp *)get_qhp(to_iwch_dev(dev), qpn);
+}
+
+
+static int iwch_query_pkey(struct ib_device *ibdev,
+			   u8 port, u16 index, u16 * pkey)
+{
+	PDBG("%s ibdev %p\n", __func__, ibdev);
+	*pkey = 0;
+	return 0;
+}
+
+static int iwch_query_gid(struct ib_device *ibdev, u8 port,
+			  int index, union ib_gid *gid)
+{
+	struct iwch_dev *dev;
+
+	PDBG("%s ibdev %p, port %d, index %d, gid %p\n",
+	       __func__, ibdev, port, index, gid);
+	dev = to_iwch_dev(ibdev);
+	BUG_ON(port == 0 || port > 2);
+	memset(&(gid->raw[0]), 0, sizeof(gid->raw));
+	memcpy(&(gid->raw[0]), dev->rdev.port_info.lldevs[port-1]->dev_addr, 6);
+	return 0;
+}
+
+static u64 fw_vers_string_to_u64(struct iwch_dev *iwch_dev)
+{
+	struct ethtool_drvinfo info;
+	struct net_device *lldev = iwch_dev->rdev.t3cdev_p->lldev;
+	char *cp, *next;
+	unsigned fw_maj, fw_min, fw_mic;
+
+	lldev->ethtool_ops->get_drvinfo(lldev, &info);
+
+	next = info.fw_version + 1;
+	cp = strsep(&next, ".");
+	sscanf(cp, "%i", &fw_maj);
+	cp = strsep(&next, ".");
+	sscanf(cp, "%i", &fw_min);
+	cp = strsep(&next, ".");
+	sscanf(cp, "%i", &fw_mic);
+
+	return (((u64)fw_maj & 0xffff) << 32) | ((fw_min & 0xffff) << 16) |
+	       (fw_mic & 0xffff);
+}
+
+static int iwch_query_device(struct ib_device *ibdev,
+			     struct ib_device_attr *props)
+{
+
+	struct iwch_dev *dev;
+	PDBG("%s ibdev %p\n", __func__, ibdev);
+
+	dev = to_iwch_dev(ibdev);
+	memset(props, 0, sizeof *props);
+	memcpy(&props->sys_image_guid, dev->rdev.t3cdev_p->lldev->dev_addr, 6);
+	props->hw_ver = dev->rdev.t3cdev_p->type;
+	props->fw_ver = fw_vers_string_to_u64(dev);
+	props->device_cap_flags = dev->device_cap_flags;
+	props->page_size_cap = dev->attr.mem_pgsizes_bitmask;
+	props->vendor_id = (u32)dev->rdev.rnic_info.pdev->vendor;
+	props->vendor_part_id = (u32)dev->rdev.rnic_info.pdev->device;
+	props->max_mr_size = dev->attr.max_mr_size;
+	props->max_qp = dev->attr.max_qps;
+	props->max_qp_wr = dev->attr.max_wrs;
+	props->max_sge = dev->attr.max_sge_per_wr;
+	props->max_sge_rd = 1;
+	props->max_qp_rd_atom = dev->attr.max_rdma_reads_per_qp;
+	props->max_qp_init_rd_atom = dev->attr.max_rdma_reads_per_qp;
+	props->max_cq = dev->attr.max_cqs;
+	props->max_cqe = dev->attr.max_cqes_per_cq;
+	props->max_mr = dev->attr.max_mem_regs;
+	props->max_pd = dev->attr.max_pds;
+	props->local_ca_ack_delay = 0;
+	props->max_fast_reg_page_list_len = T3_MAX_FASTREG_DEPTH;
+
+	return 0;
+}
+
+static int iwch_query_port(struct ib_device *ibdev,
+			   u8 port, struct ib_port_attr *props)
+{
+	struct iwch_dev *dev;
+	struct net_device *netdev;
+	struct in_device *inetdev;
+
+	PDBG("%s ibdev %p\n", __func__, ibdev);
+
+	dev = to_iwch_dev(ibdev);
+	netdev = dev->rdev.port_info.lldevs[port-1];
+
+	memset(props, 0, sizeof(struct ib_port_attr));
+	props->max_mtu = IB_MTU_4096;
+	if (netdev->mtu >= 4096)
+		props->active_mtu = IB_MTU_4096;
+	else if (netdev->mtu >= 2048)
+		props->active_mtu = IB_MTU_2048;
+	else if (netdev->mtu >= 1024)
+		props->active_mtu = IB_MTU_1024;
+	else if (netdev->mtu >= 512)
+		props->active_mtu = IB_MTU_512;
+	else
+		props->active_mtu = IB_MTU_256;
+
+	if (!netif_carrier_ok(netdev))
+		props->state = IB_PORT_DOWN;
+	else {
+		inetdev = in_dev_get(netdev);
+		if (inetdev) {
+			if (inetdev->ifa_list)
+				props->state = IB_PORT_ACTIVE;
+			else
+				props->state = IB_PORT_INIT;
+			in_dev_put(inetdev);
+		} else
+			props->state = IB_PORT_INIT;
+	}
+
+	props->port_cap_flags =
+	    IB_PORT_CM_SUP |
+	    IB_PORT_SNMP_TUNNEL_SUP |
+	    IB_PORT_REINIT_SUP |
+	    IB_PORT_DEVICE_MGMT_SUP |
+	    IB_PORT_VENDOR_CLASS_SUP | IB_PORT_BOOT_MGMT_SUP;
+	props->gid_tbl_len = 1;
+	props->pkey_tbl_len = 1;
+	props->active_width = 2;
+	props->active_speed = IB_SPEED_DDR;
+	props->max_msg_sz = -1;
+
+	return 0;
+}
+
+static ssize_t show_rev(struct device *dev, struct device_attribute *attr,
+			char *buf)
+{
+	struct iwch_dev *iwch_dev = container_of(dev, struct iwch_dev,
+						 ibdev.dev);
+	PDBG("%s dev 0x%p\n", __func__, dev);
+	return sprintf(buf, "%d\n", iwch_dev->rdev.t3cdev_p->type);
+}
+
+static ssize_t show_fw_ver(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct iwch_dev *iwch_dev = container_of(dev, struct iwch_dev,
+						 ibdev.dev);
+	struct ethtool_drvinfo info;
+	struct net_device *lldev = iwch_dev->rdev.t3cdev_p->lldev;
+
+	PDBG("%s dev 0x%p\n", __func__, dev);
+	lldev->ethtool_ops->get_drvinfo(lldev, &info);
+	return sprintf(buf, "%s\n", info.fw_version);
+}
+
+static ssize_t show_hca(struct device *dev, struct device_attribute *attr,
+			char *buf)
+{
+	struct iwch_dev *iwch_dev = container_of(dev, struct iwch_dev,
+						 ibdev.dev);
+	struct ethtool_drvinfo info;
+	struct net_device *lldev = iwch_dev->rdev.t3cdev_p->lldev;
+
+	PDBG("%s dev 0x%p\n", __func__, dev);
+	lldev->ethtool_ops->get_drvinfo(lldev, &info);
+	return sprintf(buf, "%s\n", info.driver);
+}
+
+static ssize_t show_board(struct device *dev, struct device_attribute *attr,
+			  char *buf)
+{
+	struct iwch_dev *iwch_dev = container_of(dev, struct iwch_dev,
+						 ibdev.dev);
+	PDBG("%s dev 0x%p\n", __func__, dev);
+	return sprintf(buf, "%x.%x\n", iwch_dev->rdev.rnic_info.pdev->vendor,
+		       iwch_dev->rdev.rnic_info.pdev->device);
+}
+
+static int iwch_get_mib(struct ib_device *ibdev,
+			union rdma_protocol_stats *stats)
+{
+	struct iwch_dev *dev;
+	struct tp_mib_stats m;
+	int ret;
+
+	PDBG("%s ibdev %p\n", __func__, ibdev);
+	dev = to_iwch_dev(ibdev);
+	ret = dev->rdev.t3cdev_p->ctl(dev->rdev.t3cdev_p, RDMA_GET_MIB, &m);
+	if (ret)
+		return -ENOSYS;
+
+	memset(stats, 0, sizeof *stats);
+	stats->iw.ipInReceives = ((u64) m.ipInReceive_hi << 32) +
+				m.ipInReceive_lo;
+	stats->iw.ipInHdrErrors = ((u64) m.ipInHdrErrors_hi << 32) +
+				  m.ipInHdrErrors_lo;
+	stats->iw.ipInAddrErrors = ((u64) m.ipInAddrErrors_hi << 32) +
+				   m.ipInAddrErrors_lo;
+	stats->iw.ipInUnknownProtos = ((u64) m.ipInUnknownProtos_hi << 32) +
+				      m.ipInUnknownProtos_lo;
+	stats->iw.ipInDiscards = ((u64) m.ipInDiscards_hi << 32) +
+				 m.ipInDiscards_lo;
+	stats->iw.ipInDelivers = ((u64) m.ipInDelivers_hi << 32) +
+				 m.ipInDelivers_lo;
+	stats->iw.ipOutRequests = ((u64) m.ipOutRequests_hi << 32) +
+				  m.ipOutRequests_lo;
+	stats->iw.ipOutDiscards = ((u64) m.ipOutDiscards_hi << 32) +
+				  m.ipOutDiscards_lo;
+	stats->iw.ipOutNoRoutes = ((u64) m.ipOutNoRoutes_hi << 32) +
+				  m.ipOutNoRoutes_lo;
+	stats->iw.ipReasmTimeout = (u64) m.ipReasmTimeout;
+	stats->iw.ipReasmReqds = (u64) m.ipReasmReqds;
+	stats->iw.ipReasmOKs = (u64) m.ipReasmOKs;
+	stats->iw.ipReasmFails = (u64) m.ipReasmFails;
+	stats->iw.tcpActiveOpens = (u64) m.tcpActiveOpens;
+	stats->iw.tcpPassiveOpens = (u64) m.tcpPassiveOpens;
+	stats->iw.tcpAttemptFails = (u64) m.tcpAttemptFails;
+	stats->iw.tcpEstabResets = (u64) m.tcpEstabResets;
+	stats->iw.tcpOutRsts = (u64) m.tcpOutRsts;
+	stats->iw.tcpCurrEstab = (u64) m.tcpCurrEstab;
+	stats->iw.tcpInSegs = ((u64) m.tcpInSegs_hi << 32) +
+			      m.tcpInSegs_lo;
+	stats->iw.tcpOutSegs = ((u64) m.tcpOutSegs_hi << 32) +
+			       m.tcpOutSegs_lo;
+	stats->iw.tcpRetransSegs = ((u64) m.tcpRetransSeg_hi << 32) +
+				  m.tcpRetransSeg_lo;
+	stats->iw.tcpInErrs = ((u64) m.tcpInErrs_hi << 32) +
+			      m.tcpInErrs_lo;
+	stats->iw.tcpRtoMin = (u64) m.tcpRtoMin;
+	stats->iw.tcpRtoMax = (u64) m.tcpRtoMax;
+	return 0;
+}
+
+static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
+static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL);
+static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL);
+static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL);
+
+static struct device_attribute *iwch_class_attributes[] = {
+	&dev_attr_hw_rev,
+	&dev_attr_fw_ver,
+	&dev_attr_hca_type,
+	&dev_attr_board_id,
+};
+
+int iwch_register_device(struct iwch_dev *dev)
+{
+	int ret;
+	int i;
+
+	PDBG("%s iwch_dev %p\n", __func__, dev);
+	strlcpy(dev->ibdev.name, "cxgb3_%d", IB_DEVICE_NAME_MAX);
+	memset(&dev->ibdev.node_guid, 0, sizeof(dev->ibdev.node_guid));
+	memcpy(&dev->ibdev.node_guid, dev->rdev.t3cdev_p->lldev->dev_addr, 6);
+	dev->ibdev.owner = THIS_MODULE;
+	dev->device_cap_flags = IB_DEVICE_LOCAL_DMA_LKEY |
+				IB_DEVICE_MEM_WINDOW |
+				IB_DEVICE_MEM_MGT_EXTENSIONS;
+
+	/* cxgb3 supports STag 0. */
+	dev->ibdev.local_dma_lkey = 0;
+
+	dev->ibdev.uverbs_cmd_mask =
+	    (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) |
+	    (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) |
+	    (1ull << IB_USER_VERBS_CMD_QUERY_PORT) |
+	    (1ull << IB_USER_VERBS_CMD_ALLOC_PD) |
+	    (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) |
+	    (1ull << IB_USER_VERBS_CMD_REG_MR) |
+	    (1ull << IB_USER_VERBS_CMD_DEREG_MR) |
+	    (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
+	    (1ull << IB_USER_VERBS_CMD_CREATE_CQ) |
+	    (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) |
+	    (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) |
+	    (1ull << IB_USER_VERBS_CMD_CREATE_QP) |
+	    (1ull << IB_USER_VERBS_CMD_MODIFY_QP) |
+	    (1ull << IB_USER_VERBS_CMD_POLL_CQ) |
+	    (1ull << IB_USER_VERBS_CMD_DESTROY_QP) |
+	    (1ull << IB_USER_VERBS_CMD_POST_SEND) |
+	    (1ull << IB_USER_VERBS_CMD_POST_RECV);
+	dev->ibdev.node_type = RDMA_NODE_RNIC;
+	memcpy(dev->ibdev.node_desc, IWCH_NODE_DESC, sizeof(IWCH_NODE_DESC));
+	dev->ibdev.phys_port_cnt = dev->rdev.port_info.nports;
+	dev->ibdev.num_comp_vectors = 1;
+	dev->ibdev.dma_device = &(dev->rdev.rnic_info.pdev->dev);
+	dev->ibdev.query_device = iwch_query_device;
+	dev->ibdev.query_port = iwch_query_port;
+	dev->ibdev.query_pkey = iwch_query_pkey;
+	dev->ibdev.query_gid = iwch_query_gid;
+	dev->ibdev.alloc_ucontext = iwch_alloc_ucontext;
+	dev->ibdev.dealloc_ucontext = iwch_dealloc_ucontext;
+	dev->ibdev.mmap = iwch_mmap;
+	dev->ibdev.alloc_pd = iwch_allocate_pd;
+	dev->ibdev.dealloc_pd = iwch_deallocate_pd;
+	dev->ibdev.create_ah = iwch_ah_create;
+	dev->ibdev.destroy_ah = iwch_ah_destroy;
+	dev->ibdev.create_qp = iwch_create_qp;
+	dev->ibdev.modify_qp = iwch_ib_modify_qp;
+	dev->ibdev.destroy_qp = iwch_destroy_qp;
+	dev->ibdev.create_cq = iwch_create_cq;
+	dev->ibdev.destroy_cq = iwch_destroy_cq;
+	dev->ibdev.resize_cq = iwch_resize_cq;
+	dev->ibdev.poll_cq = iwch_poll_cq;
+	dev->ibdev.get_dma_mr = iwch_get_dma_mr;
+	dev->ibdev.reg_phys_mr = iwch_register_phys_mem;
+	dev->ibdev.rereg_phys_mr = iwch_reregister_phys_mem;
+	dev->ibdev.reg_user_mr = iwch_reg_user_mr;
+	dev->ibdev.dereg_mr = iwch_dereg_mr;
+	dev->ibdev.alloc_mw = iwch_alloc_mw;
+	dev->ibdev.bind_mw = iwch_bind_mw;
+	dev->ibdev.dealloc_mw = iwch_dealloc_mw;
+	dev->ibdev.alloc_fast_reg_mr = iwch_alloc_fast_reg_mr;
+	dev->ibdev.alloc_fast_reg_page_list = iwch_alloc_fastreg_pbl;
+	dev->ibdev.free_fast_reg_page_list = iwch_free_fastreg_pbl;
+	dev->ibdev.attach_mcast = iwch_multicast_attach;
+	dev->ibdev.detach_mcast = iwch_multicast_detach;
+	dev->ibdev.process_mad = iwch_process_mad;
+	dev->ibdev.req_notify_cq = iwch_arm_cq;
+	dev->ibdev.post_send = iwch_post_send;
+	dev->ibdev.post_recv = iwch_post_receive;
+	dev->ibdev.get_protocol_stats = iwch_get_mib;
+	dev->ibdev.uverbs_abi_ver = IWCH_UVERBS_ABI_VERSION;
+
+	dev->ibdev.iwcm = kmalloc(sizeof(struct iw_cm_verbs), GFP_KERNEL);
+	if (!dev->ibdev.iwcm)
+		return -ENOMEM;
+
+	dev->ibdev.iwcm->connect = iwch_connect;
+	dev->ibdev.iwcm->accept = iwch_accept_cr;
+	dev->ibdev.iwcm->reject = iwch_reject_cr;
+	dev->ibdev.iwcm->create_listen = iwch_create_listen;
+	dev->ibdev.iwcm->destroy_listen = iwch_destroy_listen;
+	dev->ibdev.iwcm->add_ref = iwch_qp_add_ref;
+	dev->ibdev.iwcm->rem_ref = iwch_qp_rem_ref;
+	dev->ibdev.iwcm->get_qp = iwch_get_qp;
+
+	ret = ib_register_device(&dev->ibdev, NULL);
+	if (ret)
+		goto bail1;
+
+	for (i = 0; i < ARRAY_SIZE(iwch_class_attributes); ++i) {
+		ret = device_create_file(&dev->ibdev.dev,
+					 iwch_class_attributes[i]);
+		if (ret) {
+			goto bail2;
+		}
+	}
+	return 0;
+bail2:
+	ib_unregister_device(&dev->ibdev);
+bail1:
+	kfree(dev->ibdev.iwcm);
+	return ret;
+}
+
+void iwch_unregister_device(struct iwch_dev *dev)
+{
+	int i;
+
+	PDBG("%s iwch_dev %p\n", __func__, dev);
+	for (i = 0; i < ARRAY_SIZE(iwch_class_attributes); ++i)
+		device_remove_file(&dev->ibdev.dev,
+				   iwch_class_attributes[i]);
+	ib_unregister_device(&dev->ibdev);
+	kfree(dev->ibdev.iwcm);
+	return;
+}
diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.h b/drivers/infiniband/hw/cxgb3/iwch_provider.h
new file mode 100644
index 0000000..87c14b0
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb3/iwch_provider.h
@@ -0,0 +1,360 @@
+/*
+ * Copyright (c) 2006 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __IWCH_PROVIDER_H__
+#define __IWCH_PROVIDER_H__
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <rdma/ib_verbs.h>
+#include <asm/types.h>
+#include "t3cdev.h"
+#include "iwch.h"
+#include "cxio_wr.h"
+#include "cxio_hal.h"
+
+struct iwch_pd {
+	struct ib_pd ibpd;
+	u32 pdid;
+	struct iwch_dev *rhp;
+};
+
+static inline struct iwch_pd *to_iwch_pd(struct ib_pd *ibpd)
+{
+	return container_of(ibpd, struct iwch_pd, ibpd);
+}
+
+struct tpt_attributes {
+	u32 stag;
+	u32 state:1;
+	u32 type:2;
+	u32 rsvd:1;
+	enum tpt_mem_perm perms;
+	u32 remote_invaliate_disable:1;
+	u32 zbva:1;
+	u32 mw_bind_enable:1;
+	u32 page_size:5;
+
+	u32 pdid;
+	u32 qpid;
+	u32 pbl_addr;
+	u32 len;
+	u64 va_fbo;
+	u32 pbl_size;
+};
+
+struct iwch_mr {
+	struct ib_mr ibmr;
+	struct ib_umem *umem;
+	struct iwch_dev *rhp;
+	u64 kva;
+	struct tpt_attributes attr;
+};
+
+typedef struct iwch_mw iwch_mw_handle;
+
+static inline struct iwch_mr *to_iwch_mr(struct ib_mr *ibmr)
+{
+	return container_of(ibmr, struct iwch_mr, ibmr);
+}
+
+struct iwch_mw {
+	struct ib_mw ibmw;
+	struct iwch_dev *rhp;
+	u64 kva;
+	struct tpt_attributes attr;
+};
+
+static inline struct iwch_mw *to_iwch_mw(struct ib_mw *ibmw)
+{
+	return container_of(ibmw, struct iwch_mw, ibmw);
+}
+
+struct iwch_cq {
+	struct ib_cq ibcq;
+	struct iwch_dev *rhp;
+	struct t3_cq cq;
+	spinlock_t lock;
+	spinlock_t comp_handler_lock;
+	atomic_t refcnt;
+	wait_queue_head_t wait;
+	u32 __user *user_rptr_addr;
+};
+
+static inline struct iwch_cq *to_iwch_cq(struct ib_cq *ibcq)
+{
+	return container_of(ibcq, struct iwch_cq, ibcq);
+}
+
+enum IWCH_QP_FLAGS {
+	QP_QUIESCED = 0x01
+};
+
+struct iwch_mpa_attributes {
+	u8 initiator;
+	u8 recv_marker_enabled;
+	u8 xmit_marker_enabled;	/* iWARP: enable inbound Read Resp. */
+	u8 crc_enabled;
+	u8 version;	/* 0 or 1 */
+};
+
+struct iwch_qp_attributes {
+	u32 scq;
+	u32 rcq;
+	u32 sq_num_entries;
+	u32 rq_num_entries;
+	u32 sq_max_sges;
+	u32 sq_max_sges_rdma_write;
+	u32 rq_max_sges;
+	u32 state;
+	u8 enable_rdma_read;
+	u8 enable_rdma_write;	/* enable inbound Read Resp. */
+	u8 enable_bind;
+	u8 enable_mmid0_fastreg;	/* Enable STAG0 + Fast-register */
+	/*
+	 * Next QP state. If specify the current state, only the
+	 * QP attributes will be modified.
+	 */
+	u32 max_ord;
+	u32 max_ird;
+	u32 pd;	/* IN */
+	u32 next_state;
+	char terminate_buffer[52];
+	u32 terminate_msg_len;
+	u8 is_terminate_local;
+	struct iwch_mpa_attributes mpa_attr;	/* IN-OUT */
+	struct iwch_ep *llp_stream_handle;
+	char *stream_msg_buf;	/* Last stream msg. before Idle -> RTS */
+	u32 stream_msg_buf_len;	/* Only on Idle -> RTS */
+};
+
+struct iwch_qp {
+	struct ib_qp ibqp;
+	struct iwch_dev *rhp;
+	struct iwch_ep *ep;
+	struct iwch_qp_attributes attr;
+	struct t3_wq wq;
+	spinlock_t lock;
+	atomic_t refcnt;
+	wait_queue_head_t wait;
+	enum IWCH_QP_FLAGS flags;
+	struct timer_list timer;
+};
+
+static inline int qp_quiesced(struct iwch_qp *qhp)
+{
+	return qhp->flags & QP_QUIESCED;
+}
+
+static inline struct iwch_qp *to_iwch_qp(struct ib_qp *ibqp)
+{
+	return container_of(ibqp, struct iwch_qp, ibqp);
+}
+
+void iwch_qp_add_ref(struct ib_qp *qp);
+void iwch_qp_rem_ref(struct ib_qp *qp);
+
+struct iwch_ucontext {
+	struct ib_ucontext ibucontext;
+	struct cxio_ucontext uctx;
+	u32 key;
+	spinlock_t mmap_lock;
+	struct list_head mmaps;
+};
+
+static inline struct iwch_ucontext *to_iwch_ucontext(struct ib_ucontext *c)
+{
+	return container_of(c, struct iwch_ucontext, ibucontext);
+}
+
+struct iwch_mm_entry {
+	struct list_head entry;
+	u64 addr;
+	u32 key;
+	unsigned len;
+};
+
+static inline struct iwch_mm_entry *remove_mmap(struct iwch_ucontext *ucontext,
+						u32 key, unsigned len)
+{
+	struct list_head *pos, *nxt;
+	struct iwch_mm_entry *mm;
+
+	spin_lock(&ucontext->mmap_lock);
+	list_for_each_safe(pos, nxt, &ucontext->mmaps) {
+
+		mm = list_entry(pos, struct iwch_mm_entry, entry);
+		if (mm->key == key && mm->len == len) {
+			list_del_init(&mm->entry);
+			spin_unlock(&ucontext->mmap_lock);
+			PDBG("%s key 0x%x addr 0x%llx len %d\n", __func__,
+			     key, (unsigned long long) mm->addr, mm->len);
+			return mm;
+		}
+	}
+	spin_unlock(&ucontext->mmap_lock);
+	return NULL;
+}
+
+static inline void insert_mmap(struct iwch_ucontext *ucontext,
+			       struct iwch_mm_entry *mm)
+{
+	spin_lock(&ucontext->mmap_lock);
+	PDBG("%s key 0x%x addr 0x%llx len %d\n", __func__,
+	     mm->key, (unsigned long long) mm->addr, mm->len);
+	list_add_tail(&mm->entry, &ucontext->mmaps);
+	spin_unlock(&ucontext->mmap_lock);
+}
+
+enum iwch_qp_attr_mask {
+	IWCH_QP_ATTR_NEXT_STATE = 1 << 0,
+	IWCH_QP_ATTR_ENABLE_RDMA_READ = 1 << 7,
+	IWCH_QP_ATTR_ENABLE_RDMA_WRITE = 1 << 8,
+	IWCH_QP_ATTR_ENABLE_RDMA_BIND = 1 << 9,
+	IWCH_QP_ATTR_MAX_ORD = 1 << 11,
+	IWCH_QP_ATTR_MAX_IRD = 1 << 12,
+	IWCH_QP_ATTR_LLP_STREAM_HANDLE = 1 << 22,
+	IWCH_QP_ATTR_STREAM_MSG_BUFFER = 1 << 23,
+	IWCH_QP_ATTR_MPA_ATTR = 1 << 24,
+	IWCH_QP_ATTR_QP_CONTEXT_ACTIVATE = 1 << 25,
+	IWCH_QP_ATTR_VALID_MODIFY = (IWCH_QP_ATTR_ENABLE_RDMA_READ |
+				     IWCH_QP_ATTR_ENABLE_RDMA_WRITE |
+				     IWCH_QP_ATTR_MAX_ORD |
+				     IWCH_QP_ATTR_MAX_IRD |
+				     IWCH_QP_ATTR_LLP_STREAM_HANDLE |
+				     IWCH_QP_ATTR_STREAM_MSG_BUFFER |
+				     IWCH_QP_ATTR_MPA_ATTR |
+				     IWCH_QP_ATTR_QP_CONTEXT_ACTIVATE)
+};
+
+int iwch_modify_qp(struct iwch_dev *rhp,
+				struct iwch_qp *qhp,
+				enum iwch_qp_attr_mask mask,
+				struct iwch_qp_attributes *attrs,
+				int internal);
+
+enum iwch_qp_state {
+	IWCH_QP_STATE_IDLE,
+	IWCH_QP_STATE_RTS,
+	IWCH_QP_STATE_ERROR,
+	IWCH_QP_STATE_TERMINATE,
+	IWCH_QP_STATE_CLOSING,
+	IWCH_QP_STATE_TOT
+};
+
+static inline int iwch_convert_state(enum ib_qp_state ib_state)
+{
+	switch (ib_state) {
+	case IB_QPS_RESET:
+	case IB_QPS_INIT:
+		return IWCH_QP_STATE_IDLE;
+	case IB_QPS_RTS:
+		return IWCH_QP_STATE_RTS;
+	case IB_QPS_SQD:
+		return IWCH_QP_STATE_CLOSING;
+	case IB_QPS_SQE:
+		return IWCH_QP_STATE_TERMINATE;
+	case IB_QPS_ERR:
+		return IWCH_QP_STATE_ERROR;
+	default:
+		return -1;
+	}
+}
+
+static inline u32 iwch_ib_to_tpt_access(int acc)
+{
+	return (acc & IB_ACCESS_REMOTE_WRITE ? TPT_REMOTE_WRITE : 0) |
+	       (acc & IB_ACCESS_REMOTE_READ ? TPT_REMOTE_READ : 0) |
+	       (acc & IB_ACCESS_LOCAL_WRITE ? TPT_LOCAL_WRITE : 0) |
+	       (acc & IB_ACCESS_MW_BIND ? TPT_MW_BIND : 0) |
+	       TPT_LOCAL_READ;
+}
+
+static inline u32 iwch_ib_to_tpt_bind_access(int acc)
+{
+	return (acc & IB_ACCESS_REMOTE_WRITE ? TPT_REMOTE_WRITE : 0) |
+	       (acc & IB_ACCESS_REMOTE_READ ? TPT_REMOTE_READ : 0);
+}
+
+enum iwch_mmid_state {
+	IWCH_STAG_STATE_VALID,
+	IWCH_STAG_STATE_INVALID
+};
+
+enum iwch_qp_query_flags {
+	IWCH_QP_QUERY_CONTEXT_NONE = 0x0,	/* No ctx; Only attrs */
+	IWCH_QP_QUERY_CONTEXT_GET = 0x1,	/* Get ctx + attrs */
+	IWCH_QP_QUERY_CONTEXT_SUSPEND = 0x2,	/* Not Supported */
+
+	/*
+	 * Quiesce QP context; Consumer
+	 * will NOT replay outstanding WR
+	 */
+	IWCH_QP_QUERY_CONTEXT_QUIESCE = 0x4,
+	IWCH_QP_QUERY_CONTEXT_REMOVE = 0x8,
+	IWCH_QP_QUERY_TEST_USERWRITE = 0x32	/* Test special */
+};
+
+u16 iwch_rqes_posted(struct iwch_qp *qhp);
+int iwch_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+		      struct ib_send_wr **bad_wr);
+int iwch_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+		      struct ib_recv_wr **bad_wr);
+int iwch_bind_mw(struct ib_qp *qp,
+			     struct ib_mw *mw,
+			     struct ib_mw_bind *mw_bind);
+int iwch_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc);
+int iwch_post_terminate(struct iwch_qp *qhp, struct respQ_msg_t *rsp_msg);
+int iwch_post_zb_read(struct iwch_ep *ep);
+int iwch_register_device(struct iwch_dev *dev);
+void iwch_unregister_device(struct iwch_dev *dev);
+void stop_read_rep_timer(struct iwch_qp *qhp);
+int iwch_register_mem(struct iwch_dev *rhp, struct iwch_pd *php,
+		      struct iwch_mr *mhp, int shift);
+int iwch_reregister_mem(struct iwch_dev *rhp, struct iwch_pd *php,
+					struct iwch_mr *mhp,
+					int shift,
+					int npages);
+int iwch_alloc_pbl(struct iwch_mr *mhp, int npages);
+void iwch_free_pbl(struct iwch_mr *mhp);
+int iwch_write_pbl(struct iwch_mr *mhp, __be64 *pages, int npages, int offset);
+int build_phys_page_list(struct ib_phys_buf *buffer_list,
+					int num_phys_buf,
+					u64 *iova_start,
+					u64 *total_size,
+					int *npages,
+					int *shift,
+					__be64 **page_list);
+
+
+#define IWCH_NODE_DESC "cxgb3 Chelsio Communications"
+
+#endif
diff --git a/drivers/infiniband/hw/cxgb3/iwch_qp.c b/drivers/infiniband/hw/cxgb3/iwch_qp.c
new file mode 100644
index 0000000..b57c0be
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb3/iwch_qp.c
@@ -0,0 +1,1163 @@
+/*
+ * Copyright (c) 2006 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/sched.h>
+#include <linux/gfp.h>
+#include "iwch_provider.h"
+#include "iwch.h"
+#include "iwch_cm.h"
+#include "cxio_hal.h"
+#include "cxio_resource.h"
+
+#define NO_SUPPORT -1
+
+static int build_rdma_send(union t3_wr *wqe, struct ib_send_wr *wr,
+				u8 * flit_cnt)
+{
+	int i;
+	u32 plen;
+
+	switch (wr->opcode) {
+	case IB_WR_SEND:
+		if (wr->send_flags & IB_SEND_SOLICITED)
+			wqe->send.rdmaop = T3_SEND_WITH_SE;
+		else
+			wqe->send.rdmaop = T3_SEND;
+		wqe->send.rem_stag = 0;
+		break;
+	case IB_WR_SEND_WITH_INV:
+		if (wr->send_flags & IB_SEND_SOLICITED)
+			wqe->send.rdmaop = T3_SEND_WITH_SE_INV;
+		else
+			wqe->send.rdmaop = T3_SEND_WITH_INV;
+		wqe->send.rem_stag = cpu_to_be32(wr->ex.invalidate_rkey);
+		break;
+	default:
+		return -EINVAL;
+	}
+	if (wr->num_sge > T3_MAX_SGE)
+		return -EINVAL;
+	wqe->send.reserved[0] = 0;
+	wqe->send.reserved[1] = 0;
+	wqe->send.reserved[2] = 0;
+	plen = 0;
+	for (i = 0; i < wr->num_sge; i++) {
+		if ((plen + wr->sg_list[i].length) < plen)
+			return -EMSGSIZE;
+
+		plen += wr->sg_list[i].length;
+		wqe->send.sgl[i].stag = cpu_to_be32(wr->sg_list[i].lkey);
+		wqe->send.sgl[i].len = cpu_to_be32(wr->sg_list[i].length);
+		wqe->send.sgl[i].to = cpu_to_be64(wr->sg_list[i].addr);
+	}
+	wqe->send.num_sgle = cpu_to_be32(wr->num_sge);
+	*flit_cnt = 4 + ((wr->num_sge) << 1);
+	wqe->send.plen = cpu_to_be32(plen);
+	return 0;
+}
+
+static int build_rdma_write(union t3_wr *wqe, struct ib_send_wr *wr,
+				 u8 *flit_cnt)
+{
+	int i;
+	u32 plen;
+	if (wr->num_sge > T3_MAX_SGE)
+		return -EINVAL;
+	wqe->write.rdmaop = T3_RDMA_WRITE;
+	wqe->write.reserved[0] = 0;
+	wqe->write.reserved[1] = 0;
+	wqe->write.reserved[2] = 0;
+	wqe->write.stag_sink = cpu_to_be32(wr->wr.rdma.rkey);
+	wqe->write.to_sink = cpu_to_be64(wr->wr.rdma.remote_addr);
+
+	if (wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM) {
+		plen = 4;
+		wqe->write.sgl[0].stag = wr->ex.imm_data;
+		wqe->write.sgl[0].len = cpu_to_be32(0);
+		wqe->write.num_sgle = cpu_to_be32(0);
+		*flit_cnt = 6;
+	} else {
+		plen = 0;
+		for (i = 0; i < wr->num_sge; i++) {
+			if ((plen + wr->sg_list[i].length) < plen) {
+				return -EMSGSIZE;
+			}
+			plen += wr->sg_list[i].length;
+			wqe->write.sgl[i].stag =
+			    cpu_to_be32(wr->sg_list[i].lkey);
+			wqe->write.sgl[i].len =
+			    cpu_to_be32(wr->sg_list[i].length);
+			wqe->write.sgl[i].to =
+			    cpu_to_be64(wr->sg_list[i].addr);
+		}
+		wqe->write.num_sgle = cpu_to_be32(wr->num_sge);
+		*flit_cnt = 5 + ((wr->num_sge) << 1);
+	}
+	wqe->write.plen = cpu_to_be32(plen);
+	return 0;
+}
+
+static int build_rdma_read(union t3_wr *wqe, struct ib_send_wr *wr,
+				u8 *flit_cnt)
+{
+	if (wr->num_sge > 1)
+		return -EINVAL;
+	wqe->read.rdmaop = T3_READ_REQ;
+	if (wr->opcode == IB_WR_RDMA_READ_WITH_INV)
+		wqe->read.local_inv = 1;
+	else
+		wqe->read.local_inv = 0;
+	wqe->read.reserved[0] = 0;
+	wqe->read.reserved[1] = 0;
+	wqe->read.rem_stag = cpu_to_be32(wr->wr.rdma.rkey);
+	wqe->read.rem_to = cpu_to_be64(wr->wr.rdma.remote_addr);
+	wqe->read.local_stag = cpu_to_be32(wr->sg_list[0].lkey);
+	wqe->read.local_len = cpu_to_be32(wr->sg_list[0].length);
+	wqe->read.local_to = cpu_to_be64(wr->sg_list[0].addr);
+	*flit_cnt = sizeof(struct t3_rdma_read_wr) >> 3;
+	return 0;
+}
+
+static int build_fastreg(union t3_wr *wqe, struct ib_send_wr *wr,
+				u8 *flit_cnt, int *wr_cnt, struct t3_wq *wq)
+{
+	int i;
+	__be64 *p;
+
+	if (wr->wr.fast_reg.page_list_len > T3_MAX_FASTREG_DEPTH)
+		return -EINVAL;
+	*wr_cnt = 1;
+	wqe->fastreg.stag = cpu_to_be32(wr->wr.fast_reg.rkey);
+	wqe->fastreg.len = cpu_to_be32(wr->wr.fast_reg.length);
+	wqe->fastreg.va_base_hi = cpu_to_be32(wr->wr.fast_reg.iova_start >> 32);
+	wqe->fastreg.va_base_lo_fbo =
+				cpu_to_be32(wr->wr.fast_reg.iova_start & 0xffffffff);
+	wqe->fastreg.page_type_perms = cpu_to_be32(
+		V_FR_PAGE_COUNT(wr->wr.fast_reg.page_list_len) |
+		V_FR_PAGE_SIZE(wr->wr.fast_reg.page_shift-12) |
+		V_FR_TYPE(TPT_VATO) |
+		V_FR_PERMS(iwch_ib_to_tpt_access(wr->wr.fast_reg.access_flags)));
+	p = &wqe->fastreg.pbl_addrs[0];
+	for (i = 0; i < wr->wr.fast_reg.page_list_len; i++, p++) {
+
+		/* If we need a 2nd WR, then set it up */
+		if (i == T3_MAX_FASTREG_FRAG) {
+			*wr_cnt = 2;
+			wqe = (union t3_wr *)(wq->queue +
+				Q_PTR2IDX((wq->wptr+1), wq->size_log2));
+			build_fw_riwrh((void *)wqe, T3_WR_FASTREG, 0,
+			       Q_GENBIT(wq->wptr + 1, wq->size_log2),
+			       0, 1 + wr->wr.fast_reg.page_list_len - T3_MAX_FASTREG_FRAG,
+			       T3_EOP);
+
+			p = &wqe->pbl_frag.pbl_addrs[0];
+		}
+		*p = cpu_to_be64((u64)wr->wr.fast_reg.page_list->page_list[i]);
+	}
+	*flit_cnt = 5 + wr->wr.fast_reg.page_list_len;
+	if (*flit_cnt > 15)
+		*flit_cnt = 15;
+	return 0;
+}
+
+static int build_inv_stag(union t3_wr *wqe, struct ib_send_wr *wr,
+				u8 *flit_cnt)
+{
+	wqe->local_inv.stag = cpu_to_be32(wr->ex.invalidate_rkey);
+	wqe->local_inv.reserved = 0;
+	*flit_cnt = sizeof(struct t3_local_inv_wr) >> 3;
+	return 0;
+}
+
+static int iwch_sgl2pbl_map(struct iwch_dev *rhp, struct ib_sge *sg_list,
+			    u32 num_sgle, u32 * pbl_addr, u8 * page_size)
+{
+	int i;
+	struct iwch_mr *mhp;
+	u64 offset;
+	for (i = 0; i < num_sgle; i++) {
+
+		mhp = get_mhp(rhp, (sg_list[i].lkey) >> 8);
+		if (!mhp) {
+			PDBG("%s %d\n", __func__, __LINE__);
+			return -EIO;
+		}
+		if (!mhp->attr.state) {
+			PDBG("%s %d\n", __func__, __LINE__);
+			return -EIO;
+		}
+		if (mhp->attr.zbva) {
+			PDBG("%s %d\n", __func__, __LINE__);
+			return -EIO;
+		}
+
+		if (sg_list[i].addr < mhp->attr.va_fbo) {
+			PDBG("%s %d\n", __func__, __LINE__);
+			return -EINVAL;
+		}
+		if (sg_list[i].addr + ((u64) sg_list[i].length) <
+		    sg_list[i].addr) {
+			PDBG("%s %d\n", __func__, __LINE__);
+			return -EINVAL;
+		}
+		if (sg_list[i].addr + ((u64) sg_list[i].length) >
+		    mhp->attr.va_fbo + ((u64) mhp->attr.len)) {
+			PDBG("%s %d\n", __func__, __LINE__);
+			return -EINVAL;
+		}
+		offset = sg_list[i].addr - mhp->attr.va_fbo;
+		offset += mhp->attr.va_fbo &
+			  ((1UL << (12 + mhp->attr.page_size)) - 1);
+		pbl_addr[i] = ((mhp->attr.pbl_addr -
+			        rhp->rdev.rnic_info.pbl_base) >> 3) +
+			      (offset >> (12 + mhp->attr.page_size));
+		page_size[i] = mhp->attr.page_size;
+	}
+	return 0;
+}
+
+static int build_rdma_recv(struct iwch_qp *qhp, union t3_wr *wqe,
+				struct ib_recv_wr *wr)
+{
+	int i, err = 0;
+	u32 pbl_addr[T3_MAX_SGE];
+	u8 page_size[T3_MAX_SGE];
+
+	err = iwch_sgl2pbl_map(qhp->rhp, wr->sg_list, wr->num_sge, pbl_addr,
+			       page_size);
+	if (err)
+		return err;
+	wqe->recv.pagesz[0] = page_size[0];
+	wqe->recv.pagesz[1] = page_size[1];
+	wqe->recv.pagesz[2] = page_size[2];
+	wqe->recv.pagesz[3] = page_size[3];
+	wqe->recv.num_sgle = cpu_to_be32(wr->num_sge);
+	for (i = 0; i < wr->num_sge; i++) {
+		wqe->recv.sgl[i].stag = cpu_to_be32(wr->sg_list[i].lkey);
+		wqe->recv.sgl[i].len = cpu_to_be32(wr->sg_list[i].length);
+
+		/* to in the WQE == the offset into the page */
+		wqe->recv.sgl[i].to = cpu_to_be64(((u32)wr->sg_list[i].addr) &
+				((1UL << (12 + page_size[i])) - 1));
+
+		/* pbl_addr is the adapters address in the PBL */
+		wqe->recv.pbl_addr[i] = cpu_to_be32(pbl_addr[i]);
+	}
+	for (; i < T3_MAX_SGE; i++) {
+		wqe->recv.sgl[i].stag = 0;
+		wqe->recv.sgl[i].len = 0;
+		wqe->recv.sgl[i].to = 0;
+		wqe->recv.pbl_addr[i] = 0;
+	}
+	qhp->wq.rq[Q_PTR2IDX(qhp->wq.rq_wptr,
+			     qhp->wq.rq_size_log2)].wr_id = wr->wr_id;
+	qhp->wq.rq[Q_PTR2IDX(qhp->wq.rq_wptr,
+			     qhp->wq.rq_size_log2)].pbl_addr = 0;
+	return 0;
+}
+
+static int build_zero_stag_recv(struct iwch_qp *qhp, union t3_wr *wqe,
+				struct ib_recv_wr *wr)
+{
+	int i;
+	u32 pbl_addr;
+	u32 pbl_offset;
+
+
+	/*
+	 * The T3 HW requires the PBL in the HW recv descriptor to reference
+	 * a PBL entry.  So we allocate the max needed PBL memory here and pass
+	 * it to the uP in the recv WR.  The uP will build the PBL and setup
+	 * the HW recv descriptor.
+	 */
+	pbl_addr = cxio_hal_pblpool_alloc(&qhp->rhp->rdev, T3_STAG0_PBL_SIZE);
+	if (!pbl_addr)
+		return -ENOMEM;
+
+	/*
+	 * Compute the 8B aligned offset.
+	 */
+	pbl_offset = (pbl_addr - qhp->rhp->rdev.rnic_info.pbl_base) >> 3;
+
+	wqe->recv.num_sgle = cpu_to_be32(wr->num_sge);
+
+	for (i = 0; i < wr->num_sge; i++) {
+
+		/*
+		 * Use a 128MB page size. This and an imposed 128MB
+		 * sge length limit allows us to require only a 2-entry HW
+		 * PBL for each SGE.  This restriction is acceptable since
+		 * since it is not possible to allocate 128MB of contiguous
+		 * DMA coherent memory!
+		 */
+		if (wr->sg_list[i].length > T3_STAG0_MAX_PBE_LEN)
+			return -EINVAL;
+		wqe->recv.pagesz[i] = T3_STAG0_PAGE_SHIFT;
+
+		/*
+		 * T3 restricts a recv to all zero-stag or all non-zero-stag.
+		 */
+		if (wr->sg_list[i].lkey != 0)
+			return -EINVAL;
+		wqe->recv.sgl[i].stag = 0;
+		wqe->recv.sgl[i].len = cpu_to_be32(wr->sg_list[i].length);
+		wqe->recv.sgl[i].to = cpu_to_be64(wr->sg_list[i].addr);
+		wqe->recv.pbl_addr[i] = cpu_to_be32(pbl_offset);
+		pbl_offset += 2;
+	}
+	for (; i < T3_MAX_SGE; i++) {
+		wqe->recv.pagesz[i] = 0;
+		wqe->recv.sgl[i].stag = 0;
+		wqe->recv.sgl[i].len = 0;
+		wqe->recv.sgl[i].to = 0;
+		wqe->recv.pbl_addr[i] = 0;
+	}
+	qhp->wq.rq[Q_PTR2IDX(qhp->wq.rq_wptr,
+			     qhp->wq.rq_size_log2)].wr_id = wr->wr_id;
+	qhp->wq.rq[Q_PTR2IDX(qhp->wq.rq_wptr,
+			     qhp->wq.rq_size_log2)].pbl_addr = pbl_addr;
+	return 0;
+}
+
+int iwch_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+		      struct ib_send_wr **bad_wr)
+{
+	int err = 0;
+	u8 uninitialized_var(t3_wr_flit_cnt);
+	enum t3_wr_opcode t3_wr_opcode = 0;
+	enum t3_wr_flags t3_wr_flags;
+	struct iwch_qp *qhp;
+	u32 idx;
+	union t3_wr *wqe;
+	u32 num_wrs;
+	unsigned long flag;
+	struct t3_swsq *sqp;
+	int wr_cnt = 1;
+
+	qhp = to_iwch_qp(ibqp);
+	spin_lock_irqsave(&qhp->lock, flag);
+	if (qhp->attr.state > IWCH_QP_STATE_RTS) {
+		spin_unlock_irqrestore(&qhp->lock, flag);
+		err = -EINVAL;
+		goto out;
+	}
+	num_wrs = Q_FREECNT(qhp->wq.sq_rptr, qhp->wq.sq_wptr,
+		  qhp->wq.sq_size_log2);
+	if (num_wrs == 0) {
+		spin_unlock_irqrestore(&qhp->lock, flag);
+		err = -ENOMEM;
+		goto out;
+	}
+	while (wr) {
+		if (num_wrs == 0) {
+			err = -ENOMEM;
+			break;
+		}
+		idx = Q_PTR2IDX(qhp->wq.wptr, qhp->wq.size_log2);
+		wqe = (union t3_wr *) (qhp->wq.queue + idx);
+		t3_wr_flags = 0;
+		if (wr->send_flags & IB_SEND_SOLICITED)
+			t3_wr_flags |= T3_SOLICITED_EVENT_FLAG;
+		if (wr->send_flags & IB_SEND_SIGNALED)
+			t3_wr_flags |= T3_COMPLETION_FLAG;
+		sqp = qhp->wq.sq +
+		      Q_PTR2IDX(qhp->wq.sq_wptr, qhp->wq.sq_size_log2);
+		switch (wr->opcode) {
+		case IB_WR_SEND:
+		case IB_WR_SEND_WITH_INV:
+			if (wr->send_flags & IB_SEND_FENCE)
+				t3_wr_flags |= T3_READ_FENCE_FLAG;
+			t3_wr_opcode = T3_WR_SEND;
+			err = build_rdma_send(wqe, wr, &t3_wr_flit_cnt);
+			break;
+		case IB_WR_RDMA_WRITE:
+		case IB_WR_RDMA_WRITE_WITH_IMM:
+			t3_wr_opcode = T3_WR_WRITE;
+			err = build_rdma_write(wqe, wr, &t3_wr_flit_cnt);
+			break;
+		case IB_WR_RDMA_READ:
+		case IB_WR_RDMA_READ_WITH_INV:
+			t3_wr_opcode = T3_WR_READ;
+			t3_wr_flags = 0; /* T3 reads are always signaled */
+			err = build_rdma_read(wqe, wr, &t3_wr_flit_cnt);
+			if (err)
+				break;
+			sqp->read_len = wqe->read.local_len;
+			if (!qhp->wq.oldest_read)
+				qhp->wq.oldest_read = sqp;
+			break;
+		case IB_WR_FAST_REG_MR:
+			t3_wr_opcode = T3_WR_FASTREG;
+			err = build_fastreg(wqe, wr, &t3_wr_flit_cnt,
+						 &wr_cnt, &qhp->wq);
+			break;
+		case IB_WR_LOCAL_INV:
+			if (wr->send_flags & IB_SEND_FENCE)
+				t3_wr_flags |= T3_LOCAL_FENCE_FLAG;
+			t3_wr_opcode = T3_WR_INV_STAG;
+			err = build_inv_stag(wqe, wr, &t3_wr_flit_cnt);
+			break;
+		default:
+			PDBG("%s post of type=%d TBD!\n", __func__,
+			     wr->opcode);
+			err = -EINVAL;
+		}
+		if (err)
+			break;
+		wqe->send.wrid.id0.hi = qhp->wq.sq_wptr;
+		sqp->wr_id = wr->wr_id;
+		sqp->opcode = wr2opcode(t3_wr_opcode);
+		sqp->sq_wptr = qhp->wq.sq_wptr;
+		sqp->complete = 0;
+		sqp->signaled = (wr->send_flags & IB_SEND_SIGNALED);
+
+		build_fw_riwrh((void *) wqe, t3_wr_opcode, t3_wr_flags,
+			       Q_GENBIT(qhp->wq.wptr, qhp->wq.size_log2),
+			       0, t3_wr_flit_cnt,
+			       (wr_cnt == 1) ? T3_SOPEOP : T3_SOP);
+		PDBG("%s cookie 0x%llx wq idx 0x%x swsq idx %ld opcode %d\n",
+		     __func__, (unsigned long long) wr->wr_id, idx,
+		     Q_PTR2IDX(qhp->wq.sq_wptr, qhp->wq.sq_size_log2),
+		     sqp->opcode);
+		wr = wr->next;
+		num_wrs--;
+		qhp->wq.wptr += wr_cnt;
+		++(qhp->wq.sq_wptr);
+	}
+	spin_unlock_irqrestore(&qhp->lock, flag);
+	if (cxio_wq_db_enabled(&qhp->wq))
+		ring_doorbell(qhp->wq.doorbell, qhp->wq.qpid);
+
+out:
+	if (err)
+		*bad_wr = wr;
+	return err;
+}
+
+int iwch_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+		      struct ib_recv_wr **bad_wr)
+{
+	int err = 0;
+	struct iwch_qp *qhp;
+	u32 idx;
+	union t3_wr *wqe;
+	u32 num_wrs;
+	unsigned long flag;
+
+	qhp = to_iwch_qp(ibqp);
+	spin_lock_irqsave(&qhp->lock, flag);
+	if (qhp->attr.state > IWCH_QP_STATE_RTS) {
+		spin_unlock_irqrestore(&qhp->lock, flag);
+		err = -EINVAL;
+		goto out;
+	}
+	num_wrs = Q_FREECNT(qhp->wq.rq_rptr, qhp->wq.rq_wptr,
+			    qhp->wq.rq_size_log2) - 1;
+	if (!wr) {
+		spin_unlock_irqrestore(&qhp->lock, flag);
+		err = -ENOMEM;
+		goto out;
+	}
+	while (wr) {
+		if (wr->num_sge > T3_MAX_SGE) {
+			err = -EINVAL;
+			break;
+		}
+		idx = Q_PTR2IDX(qhp->wq.wptr, qhp->wq.size_log2);
+		wqe = (union t3_wr *) (qhp->wq.queue + idx);
+		if (num_wrs)
+			if (wr->sg_list[0].lkey)
+				err = build_rdma_recv(qhp, wqe, wr);
+			else
+				err = build_zero_stag_recv(qhp, wqe, wr);
+		else
+			err = -ENOMEM;
+
+		if (err)
+			break;
+
+		build_fw_riwrh((void *) wqe, T3_WR_RCV, T3_COMPLETION_FLAG,
+			       Q_GENBIT(qhp->wq.wptr, qhp->wq.size_log2),
+			       0, sizeof(struct t3_receive_wr) >> 3, T3_SOPEOP);
+		PDBG("%s cookie 0x%llx idx 0x%x rq_wptr 0x%x rw_rptr 0x%x "
+		     "wqe %p \n", __func__, (unsigned long long) wr->wr_id,
+		     idx, qhp->wq.rq_wptr, qhp->wq.rq_rptr, wqe);
+		++(qhp->wq.rq_wptr);
+		++(qhp->wq.wptr);
+		wr = wr->next;
+		num_wrs--;
+	}
+	spin_unlock_irqrestore(&qhp->lock, flag);
+	if (cxio_wq_db_enabled(&qhp->wq))
+		ring_doorbell(qhp->wq.doorbell, qhp->wq.qpid);
+
+out:
+	if (err)
+		*bad_wr = wr;
+	return err;
+}
+
+int iwch_bind_mw(struct ib_qp *qp,
+			     struct ib_mw *mw,
+			     struct ib_mw_bind *mw_bind)
+{
+	struct iwch_dev *rhp;
+	struct iwch_mw *mhp;
+	struct iwch_qp *qhp;
+	union t3_wr *wqe;
+	u32 pbl_addr;
+	u8 page_size;
+	u32 num_wrs;
+	unsigned long flag;
+	struct ib_sge sgl;
+	int err=0;
+	enum t3_wr_flags t3_wr_flags;
+	u32 idx;
+	struct t3_swsq *sqp;
+
+	qhp = to_iwch_qp(qp);
+	mhp = to_iwch_mw(mw);
+	rhp = qhp->rhp;
+
+	spin_lock_irqsave(&qhp->lock, flag);
+	if (qhp->attr.state > IWCH_QP_STATE_RTS) {
+		spin_unlock_irqrestore(&qhp->lock, flag);
+		return -EINVAL;
+	}
+	num_wrs = Q_FREECNT(qhp->wq.sq_rptr, qhp->wq.sq_wptr,
+			    qhp->wq.sq_size_log2);
+	if (num_wrs == 0) {
+		spin_unlock_irqrestore(&qhp->lock, flag);
+		return -ENOMEM;
+	}
+	idx = Q_PTR2IDX(qhp->wq.wptr, qhp->wq.size_log2);
+	PDBG("%s: idx 0x%0x, mw 0x%p, mw_bind 0x%p\n", __func__, idx,
+	     mw, mw_bind);
+	wqe = (union t3_wr *) (qhp->wq.queue + idx);
+
+	t3_wr_flags = 0;
+	if (mw_bind->send_flags & IB_SEND_SIGNALED)
+		t3_wr_flags = T3_COMPLETION_FLAG;
+
+	sgl.addr = mw_bind->bind_info.addr;
+	sgl.lkey = mw_bind->bind_info.mr->lkey;
+	sgl.length = mw_bind->bind_info.length;
+	wqe->bind.reserved = 0;
+	wqe->bind.type = TPT_VATO;
+
+	/* TBD: check perms */
+	wqe->bind.perms = iwch_ib_to_tpt_bind_access(
+		mw_bind->bind_info.mw_access_flags);
+	wqe->bind.mr_stag = cpu_to_be32(mw_bind->bind_info.mr->lkey);
+	wqe->bind.mw_stag = cpu_to_be32(mw->rkey);
+	wqe->bind.mw_len = cpu_to_be32(mw_bind->bind_info.length);
+	wqe->bind.mw_va = cpu_to_be64(mw_bind->bind_info.addr);
+	err = iwch_sgl2pbl_map(rhp, &sgl, 1, &pbl_addr, &page_size);
+	if (err) {
+		spin_unlock_irqrestore(&qhp->lock, flag);
+		return err;
+	}
+	wqe->send.wrid.id0.hi = qhp->wq.sq_wptr;
+	sqp = qhp->wq.sq + Q_PTR2IDX(qhp->wq.sq_wptr, qhp->wq.sq_size_log2);
+	sqp->wr_id = mw_bind->wr_id;
+	sqp->opcode = T3_BIND_MW;
+	sqp->sq_wptr = qhp->wq.sq_wptr;
+	sqp->complete = 0;
+	sqp->signaled = (mw_bind->send_flags & IB_SEND_SIGNALED);
+	wqe->bind.mr_pbl_addr = cpu_to_be32(pbl_addr);
+	wqe->bind.mr_pagesz = page_size;
+	build_fw_riwrh((void *)wqe, T3_WR_BIND, t3_wr_flags,
+		       Q_GENBIT(qhp->wq.wptr, qhp->wq.size_log2), 0,
+		       sizeof(struct t3_bind_mw_wr) >> 3, T3_SOPEOP);
+	++(qhp->wq.wptr);
+	++(qhp->wq.sq_wptr);
+	spin_unlock_irqrestore(&qhp->lock, flag);
+
+	if (cxio_wq_db_enabled(&qhp->wq))
+		ring_doorbell(qhp->wq.doorbell, qhp->wq.qpid);
+
+	return err;
+}
+
+static inline void build_term_codes(struct respQ_msg_t *rsp_msg,
+				    u8 *layer_type, u8 *ecode)
+{
+	int status = TPT_ERR_INTERNAL_ERR;
+	int tagged = 0;
+	int opcode = -1;
+	int rqtype = 0;
+	int send_inv = 0;
+
+	if (rsp_msg) {
+		status = CQE_STATUS(rsp_msg->cqe);
+		opcode = CQE_OPCODE(rsp_msg->cqe);
+		rqtype = RQ_TYPE(rsp_msg->cqe);
+		send_inv = (opcode == T3_SEND_WITH_INV) ||
+		           (opcode == T3_SEND_WITH_SE_INV);
+		tagged = (opcode == T3_RDMA_WRITE) ||
+			 (rqtype && (opcode == T3_READ_RESP));
+	}
+
+	switch (status) {
+	case TPT_ERR_STAG:
+		if (send_inv) {
+			*layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP;
+			*ecode = RDMAP_CANT_INV_STAG;
+		} else {
+			*layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT;
+			*ecode = RDMAP_INV_STAG;
+		}
+		break;
+	case TPT_ERR_PDID:
+		*layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT;
+		if ((opcode == T3_SEND_WITH_INV) ||
+		    (opcode == T3_SEND_WITH_SE_INV))
+			*ecode = RDMAP_CANT_INV_STAG;
+		else
+			*ecode = RDMAP_STAG_NOT_ASSOC;
+		break;
+	case TPT_ERR_QPID:
+		*layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT;
+		*ecode = RDMAP_STAG_NOT_ASSOC;
+		break;
+	case TPT_ERR_ACCESS:
+		*layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT;
+		*ecode = RDMAP_ACC_VIOL;
+		break;
+	case TPT_ERR_WRAP:
+		*layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT;
+		*ecode = RDMAP_TO_WRAP;
+		break;
+	case TPT_ERR_BOUND:
+		if (tagged) {
+			*layer_type = LAYER_DDP|DDP_TAGGED_ERR;
+			*ecode = DDPT_BASE_BOUNDS;
+		} else {
+			*layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT;
+			*ecode = RDMAP_BASE_BOUNDS;
+		}
+		break;
+	case TPT_ERR_INVALIDATE_SHARED_MR:
+	case TPT_ERR_INVALIDATE_MR_WITH_MW_BOUND:
+		*layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP;
+		*ecode = RDMAP_CANT_INV_STAG;
+		break;
+	case TPT_ERR_ECC:
+	case TPT_ERR_ECC_PSTAG:
+	case TPT_ERR_INTERNAL_ERR:
+		*layer_type = LAYER_RDMAP|RDMAP_LOCAL_CATA;
+		*ecode = 0;
+		break;
+	case TPT_ERR_OUT_OF_RQE:
+		*layer_type = LAYER_DDP|DDP_UNTAGGED_ERR;
+		*ecode = DDPU_INV_MSN_NOBUF;
+		break;
+	case TPT_ERR_PBL_ADDR_BOUND:
+		*layer_type = LAYER_DDP|DDP_TAGGED_ERR;
+		*ecode = DDPT_BASE_BOUNDS;
+		break;
+	case TPT_ERR_CRC:
+		*layer_type = LAYER_MPA|DDP_LLP;
+		*ecode = MPA_CRC_ERR;
+		break;
+	case TPT_ERR_MARKER:
+		*layer_type = LAYER_MPA|DDP_LLP;
+		*ecode = MPA_MARKER_ERR;
+		break;
+	case TPT_ERR_PDU_LEN_ERR:
+		*layer_type = LAYER_DDP|DDP_UNTAGGED_ERR;
+		*ecode = DDPU_MSG_TOOBIG;
+		break;
+	case TPT_ERR_DDP_VERSION:
+		if (tagged) {
+			*layer_type = LAYER_DDP|DDP_TAGGED_ERR;
+			*ecode = DDPT_INV_VERS;
+		} else {
+			*layer_type = LAYER_DDP|DDP_UNTAGGED_ERR;
+			*ecode = DDPU_INV_VERS;
+		}
+		break;
+	case TPT_ERR_RDMA_VERSION:
+		*layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP;
+		*ecode = RDMAP_INV_VERS;
+		break;
+	case TPT_ERR_OPCODE:
+		*layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP;
+		*ecode = RDMAP_INV_OPCODE;
+		break;
+	case TPT_ERR_DDP_QUEUE_NUM:
+		*layer_type = LAYER_DDP|DDP_UNTAGGED_ERR;
+		*ecode = DDPU_INV_QN;
+		break;
+	case TPT_ERR_MSN:
+	case TPT_ERR_MSN_GAP:
+	case TPT_ERR_MSN_RANGE:
+	case TPT_ERR_IRD_OVERFLOW:
+		*layer_type = LAYER_DDP|DDP_UNTAGGED_ERR;
+		*ecode = DDPU_INV_MSN_RANGE;
+		break;
+	case TPT_ERR_TBIT:
+		*layer_type = LAYER_DDP|DDP_LOCAL_CATA;
+		*ecode = 0;
+		break;
+	case TPT_ERR_MO:
+		*layer_type = LAYER_DDP|DDP_UNTAGGED_ERR;
+		*ecode = DDPU_INV_MO;
+		break;
+	default:
+		*layer_type = LAYER_RDMAP|DDP_LOCAL_CATA;
+		*ecode = 0;
+		break;
+	}
+}
+
+int iwch_post_zb_read(struct iwch_ep *ep)
+{
+	union t3_wr *wqe;
+	struct sk_buff *skb;
+	u8 flit_cnt = sizeof(struct t3_rdma_read_wr) >> 3;
+
+	PDBG("%s enter\n", __func__);
+	skb = alloc_skb(40, GFP_KERNEL);
+	if (!skb) {
+		printk(KERN_ERR "%s cannot send zb_read!!\n", __func__);
+		return -ENOMEM;
+	}
+	wqe = (union t3_wr *)skb_put(skb, sizeof(struct t3_rdma_read_wr));
+	memset(wqe, 0, sizeof(struct t3_rdma_read_wr));
+	wqe->read.rdmaop = T3_READ_REQ;
+	wqe->read.reserved[0] = 0;
+	wqe->read.reserved[1] = 0;
+	wqe->read.rem_stag = cpu_to_be32(1);
+	wqe->read.rem_to = cpu_to_be64(1);
+	wqe->read.local_stag = cpu_to_be32(1);
+	wqe->read.local_len = cpu_to_be32(0);
+	wqe->read.local_to = cpu_to_be64(1);
+	wqe->send.wrh.op_seop_flags = cpu_to_be32(V_FW_RIWR_OP(T3_WR_READ));
+	wqe->send.wrh.gen_tid_len = cpu_to_be32(V_FW_RIWR_TID(ep->hwtid)|
+						V_FW_RIWR_LEN(flit_cnt));
+	skb->priority = CPL_PRIORITY_DATA;
+	return iwch_cxgb3_ofld_send(ep->com.qp->rhp->rdev.t3cdev_p, skb);
+}
+
+/*
+ * This posts a TERMINATE with layer=RDMA, type=catastrophic.
+ */
+int iwch_post_terminate(struct iwch_qp *qhp, struct respQ_msg_t *rsp_msg)
+{
+	union t3_wr *wqe;
+	struct terminate_message *term;
+	struct sk_buff *skb;
+
+	PDBG("%s %d\n", __func__, __LINE__);
+	skb = alloc_skb(40, GFP_ATOMIC);
+	if (!skb) {
+		printk(KERN_ERR "%s cannot send TERMINATE!\n", __func__);
+		return -ENOMEM;
+	}
+	wqe = (union t3_wr *)skb_put(skb, 40);
+	memset(wqe, 0, 40);
+	wqe->send.rdmaop = T3_TERMINATE;
+
+	/* immediate data length */
+	wqe->send.plen = htonl(4);
+
+	/* immediate data starts here. */
+	term = (struct terminate_message *)wqe->send.sgl;
+	build_term_codes(rsp_msg, &term->layer_etype, &term->ecode);
+	wqe->send.wrh.op_seop_flags = cpu_to_be32(V_FW_RIWR_OP(T3_WR_SEND) |
+			 V_FW_RIWR_FLAGS(T3_COMPLETION_FLAG | T3_NOTIFY_FLAG));
+	wqe->send.wrh.gen_tid_len = cpu_to_be32(V_FW_RIWR_TID(qhp->ep->hwtid));
+	skb->priority = CPL_PRIORITY_DATA;
+	return iwch_cxgb3_ofld_send(qhp->rhp->rdev.t3cdev_p, skb);
+}
+
+/*
+ * Assumes qhp lock is held.
+ */
+static void __flush_qp(struct iwch_qp *qhp, struct iwch_cq *rchp,
+				struct iwch_cq *schp)
+{
+	int count;
+	int flushed;
+
+
+	PDBG("%s qhp %p rchp %p schp %p\n", __func__, qhp, rchp, schp);
+	/* take a ref on the qhp since we must release the lock */
+	atomic_inc(&qhp->refcnt);
+	spin_unlock(&qhp->lock);
+
+	/* locking hierarchy: cq lock first, then qp lock. */
+	spin_lock(&rchp->lock);
+	spin_lock(&qhp->lock);
+	cxio_flush_hw_cq(&rchp->cq);
+	cxio_count_rcqes(&rchp->cq, &qhp->wq, &count);
+	flushed = cxio_flush_rq(&qhp->wq, &rchp->cq, count);
+	spin_unlock(&qhp->lock);
+	spin_unlock(&rchp->lock);
+	if (flushed) {
+		spin_lock(&rchp->comp_handler_lock);
+		(*rchp->ibcq.comp_handler)(&rchp->ibcq, rchp->ibcq.cq_context);
+		spin_unlock(&rchp->comp_handler_lock);
+	}
+
+	/* locking hierarchy: cq lock first, then qp lock. */
+	spin_lock(&schp->lock);
+	spin_lock(&qhp->lock);
+	cxio_flush_hw_cq(&schp->cq);
+	cxio_count_scqes(&schp->cq, &qhp->wq, &count);
+	flushed = cxio_flush_sq(&qhp->wq, &schp->cq, count);
+	spin_unlock(&qhp->lock);
+	spin_unlock(&schp->lock);
+	if (flushed) {
+		spin_lock(&schp->comp_handler_lock);
+		(*schp->ibcq.comp_handler)(&schp->ibcq, schp->ibcq.cq_context);
+		spin_unlock(&schp->comp_handler_lock);
+	}
+
+	/* deref */
+	if (atomic_dec_and_test(&qhp->refcnt))
+	        wake_up(&qhp->wait);
+
+	spin_lock(&qhp->lock);
+}
+
+static void flush_qp(struct iwch_qp *qhp)
+{
+	struct iwch_cq *rchp, *schp;
+
+	rchp = get_chp(qhp->rhp, qhp->attr.rcq);
+	schp = get_chp(qhp->rhp, qhp->attr.scq);
+
+	if (qhp->ibqp.uobject) {
+		cxio_set_wq_in_error(&qhp->wq);
+		cxio_set_cq_in_error(&rchp->cq);
+		spin_lock(&rchp->comp_handler_lock);
+		(*rchp->ibcq.comp_handler)(&rchp->ibcq, rchp->ibcq.cq_context);
+		spin_unlock(&rchp->comp_handler_lock);
+		if (schp != rchp) {
+			cxio_set_cq_in_error(&schp->cq);
+			spin_lock(&schp->comp_handler_lock);
+			(*schp->ibcq.comp_handler)(&schp->ibcq,
+						   schp->ibcq.cq_context);
+			spin_unlock(&schp->comp_handler_lock);
+		}
+		return;
+	}
+	__flush_qp(qhp, rchp, schp);
+}
+
+
+/*
+ * Return count of RECV WRs posted
+ */
+u16 iwch_rqes_posted(struct iwch_qp *qhp)
+{
+	union t3_wr *wqe = qhp->wq.queue;
+	u16 count = 0;
+
+	while (count < USHRT_MAX && fw_riwrh_opcode((struct fw_riwrh *)wqe) == T3_WR_RCV) {
+		count++;
+		wqe++;
+	}
+	PDBG("%s qhp %p count %u\n", __func__, qhp, count);
+	return count;
+}
+
+static int rdma_init(struct iwch_dev *rhp, struct iwch_qp *qhp,
+				enum iwch_qp_attr_mask mask,
+				struct iwch_qp_attributes *attrs)
+{
+	struct t3_rdma_init_attr init_attr;
+	int ret;
+
+	init_attr.tid = qhp->ep->hwtid;
+	init_attr.qpid = qhp->wq.qpid;
+	init_attr.pdid = qhp->attr.pd;
+	init_attr.scqid = qhp->attr.scq;
+	init_attr.rcqid = qhp->attr.rcq;
+	init_attr.rq_addr = qhp->wq.rq_addr;
+	init_attr.rq_size = 1 << qhp->wq.rq_size_log2;
+	init_attr.mpaattrs = uP_RI_MPA_IETF_ENABLE |
+		qhp->attr.mpa_attr.recv_marker_enabled |
+		(qhp->attr.mpa_attr.xmit_marker_enabled << 1) |
+		(qhp->attr.mpa_attr.crc_enabled << 2);
+
+	init_attr.qpcaps = uP_RI_QP_RDMA_READ_ENABLE |
+			   uP_RI_QP_RDMA_WRITE_ENABLE |
+			   uP_RI_QP_BIND_ENABLE;
+	if (!qhp->ibqp.uobject)
+		init_attr.qpcaps |= uP_RI_QP_STAG0_ENABLE |
+				    uP_RI_QP_FAST_REGISTER_ENABLE;
+
+	init_attr.tcp_emss = qhp->ep->emss;
+	init_attr.ord = qhp->attr.max_ord;
+	init_attr.ird = qhp->attr.max_ird;
+	init_attr.qp_dma_addr = qhp->wq.dma_addr;
+	init_attr.qp_dma_size = (1UL << qhp->wq.size_log2);
+	init_attr.rqe_count = iwch_rqes_posted(qhp);
+	init_attr.flags = qhp->attr.mpa_attr.initiator ? MPA_INITIATOR : 0;
+	init_attr.chan = qhp->ep->l2t->smt_idx;
+	if (peer2peer) {
+		init_attr.rtr_type = RTR_READ;
+		if (init_attr.ord == 0 && qhp->attr.mpa_attr.initiator)
+			init_attr.ord = 1;
+		if (init_attr.ird == 0 && !qhp->attr.mpa_attr.initiator)
+			init_attr.ird = 1;
+	} else
+		init_attr.rtr_type = 0;
+	init_attr.irs = qhp->ep->rcv_seq;
+	PDBG("%s init_attr.rq_addr 0x%x init_attr.rq_size = %d "
+	     "flags 0x%x qpcaps 0x%x\n", __func__,
+	     init_attr.rq_addr, init_attr.rq_size,
+	     init_attr.flags, init_attr.qpcaps);
+	ret = cxio_rdma_init(&rhp->rdev, &init_attr);
+	PDBG("%s ret %d\n", __func__, ret);
+	return ret;
+}
+
+int iwch_modify_qp(struct iwch_dev *rhp, struct iwch_qp *qhp,
+				enum iwch_qp_attr_mask mask,
+				struct iwch_qp_attributes *attrs,
+				int internal)
+{
+	int ret = 0;
+	struct iwch_qp_attributes newattr = qhp->attr;
+	unsigned long flag;
+	int disconnect = 0;
+	int terminate = 0;
+	int abort = 0;
+	int free = 0;
+	struct iwch_ep *ep = NULL;
+
+	PDBG("%s qhp %p qpid 0x%x ep %p state %d -> %d\n", __func__,
+	     qhp, qhp->wq.qpid, qhp->ep, qhp->attr.state,
+	     (mask & IWCH_QP_ATTR_NEXT_STATE) ? attrs->next_state : -1);
+
+	spin_lock_irqsave(&qhp->lock, flag);
+
+	/* Process attr changes if in IDLE */
+	if (mask & IWCH_QP_ATTR_VALID_MODIFY) {
+		if (qhp->attr.state != IWCH_QP_STATE_IDLE) {
+			ret = -EIO;
+			goto out;
+		}
+		if (mask & IWCH_QP_ATTR_ENABLE_RDMA_READ)
+			newattr.enable_rdma_read = attrs->enable_rdma_read;
+		if (mask & IWCH_QP_ATTR_ENABLE_RDMA_WRITE)
+			newattr.enable_rdma_write = attrs->enable_rdma_write;
+		if (mask & IWCH_QP_ATTR_ENABLE_RDMA_BIND)
+			newattr.enable_bind = attrs->enable_bind;
+		if (mask & IWCH_QP_ATTR_MAX_ORD) {
+			if (attrs->max_ord >
+			    rhp->attr.max_rdma_read_qp_depth) {
+				ret = -EINVAL;
+				goto out;
+			}
+			newattr.max_ord = attrs->max_ord;
+		}
+		if (mask & IWCH_QP_ATTR_MAX_IRD) {
+			if (attrs->max_ird >
+			    rhp->attr.max_rdma_reads_per_qp) {
+				ret = -EINVAL;
+				goto out;
+			}
+			newattr.max_ird = attrs->max_ird;
+		}
+		qhp->attr = newattr;
+	}
+
+	if (!(mask & IWCH_QP_ATTR_NEXT_STATE))
+		goto out;
+	if (qhp->attr.state == attrs->next_state)
+		goto out;
+
+	switch (qhp->attr.state) {
+	case IWCH_QP_STATE_IDLE:
+		switch (attrs->next_state) {
+		case IWCH_QP_STATE_RTS:
+			if (!(mask & IWCH_QP_ATTR_LLP_STREAM_HANDLE)) {
+				ret = -EINVAL;
+				goto out;
+			}
+			if (!(mask & IWCH_QP_ATTR_MPA_ATTR)) {
+				ret = -EINVAL;
+				goto out;
+			}
+			qhp->attr.mpa_attr = attrs->mpa_attr;
+			qhp->attr.llp_stream_handle = attrs->llp_stream_handle;
+			qhp->ep = qhp->attr.llp_stream_handle;
+			qhp->attr.state = IWCH_QP_STATE_RTS;
+
+			/*
+			 * Ref the endpoint here and deref when we
+			 * disassociate the endpoint from the QP.  This
+			 * happens in CLOSING->IDLE transition or *->ERROR
+			 * transition.
+			 */
+			get_ep(&qhp->ep->com);
+			spin_unlock_irqrestore(&qhp->lock, flag);
+			ret = rdma_init(rhp, qhp, mask, attrs);
+			spin_lock_irqsave(&qhp->lock, flag);
+			if (ret)
+				goto err;
+			break;
+		case IWCH_QP_STATE_ERROR:
+			qhp->attr.state = IWCH_QP_STATE_ERROR;
+			flush_qp(qhp);
+			break;
+		default:
+			ret = -EINVAL;
+			goto out;
+		}
+		break;
+	case IWCH_QP_STATE_RTS:
+		switch (attrs->next_state) {
+		case IWCH_QP_STATE_CLOSING:
+			BUG_ON(atomic_read(&qhp->ep->com.kref.refcount) < 2);
+			qhp->attr.state = IWCH_QP_STATE_CLOSING;
+			if (!internal) {
+				abort=0;
+				disconnect = 1;
+				ep = qhp->ep;
+				get_ep(&ep->com);
+			}
+			break;
+		case IWCH_QP_STATE_TERMINATE:
+			qhp->attr.state = IWCH_QP_STATE_TERMINATE;
+			if (qhp->ibqp.uobject)
+				cxio_set_wq_in_error(&qhp->wq);
+			if (!internal)
+				terminate = 1;
+			break;
+		case IWCH_QP_STATE_ERROR:
+			qhp->attr.state = IWCH_QP_STATE_ERROR;
+			if (!internal) {
+				abort=1;
+				disconnect = 1;
+				ep = qhp->ep;
+				get_ep(&ep->com);
+			}
+			goto err;
+			break;
+		default:
+			ret = -EINVAL;
+			goto out;
+		}
+		break;
+	case IWCH_QP_STATE_CLOSING:
+		if (!internal) {
+			ret = -EINVAL;
+			goto out;
+		}
+		switch (attrs->next_state) {
+			case IWCH_QP_STATE_IDLE:
+				flush_qp(qhp);
+				qhp->attr.state = IWCH_QP_STATE_IDLE;
+				qhp->attr.llp_stream_handle = NULL;
+				put_ep(&qhp->ep->com);
+				qhp->ep = NULL;
+				wake_up(&qhp->wait);
+				break;
+			case IWCH_QP_STATE_ERROR:
+				goto err;
+			default:
+				ret = -EINVAL;
+				goto err;
+		}
+		break;
+	case IWCH_QP_STATE_ERROR:
+		if (attrs->next_state != IWCH_QP_STATE_IDLE) {
+			ret = -EINVAL;
+			goto out;
+		}
+
+		if (!Q_EMPTY(qhp->wq.sq_rptr, qhp->wq.sq_wptr) ||
+		    !Q_EMPTY(qhp->wq.rq_rptr, qhp->wq.rq_wptr)) {
+			ret = -EINVAL;
+			goto out;
+		}
+		qhp->attr.state = IWCH_QP_STATE_IDLE;
+		break;
+	case IWCH_QP_STATE_TERMINATE:
+		if (!internal) {
+			ret = -EINVAL;
+			goto out;
+		}
+		goto err;
+		break;
+	default:
+		printk(KERN_ERR "%s in a bad state %d\n",
+		       __func__, qhp->attr.state);
+		ret = -EINVAL;
+		goto err;
+		break;
+	}
+	goto out;
+err:
+	PDBG("%s disassociating ep %p qpid 0x%x\n", __func__, qhp->ep,
+	     qhp->wq.qpid);
+
+	/* disassociate the LLP connection */
+	qhp->attr.llp_stream_handle = NULL;
+	ep = qhp->ep;
+	qhp->ep = NULL;
+	qhp->attr.state = IWCH_QP_STATE_ERROR;
+	free=1;
+	wake_up(&qhp->wait);
+	BUG_ON(!ep);
+	flush_qp(qhp);
+out:
+	spin_unlock_irqrestore(&qhp->lock, flag);
+
+	if (terminate)
+		iwch_post_terminate(qhp, NULL);
+
+	/*
+	 * If disconnect is 1, then we need to initiate a disconnect
+	 * on the EP.  This can be a normal close (RTS->CLOSING) or
+	 * an abnormal close (RTS/CLOSING->ERROR).
+	 */
+	if (disconnect) {
+		iwch_ep_disconnect(ep, abort, GFP_KERNEL);
+		put_ep(&ep->com);
+	}
+
+	/*
+	 * If free is 1, then we've disassociated the EP from the QP
+	 * and we need to dereference the EP.
+	 */
+	if (free)
+		put_ep(&ep->com);
+
+	PDBG("%s exit state %d\n", __func__, qhp->attr.state);
+	return ret;
+}
diff --git a/drivers/infiniband/hw/cxgb3/iwch_user.h b/drivers/infiniband/hw/cxgb3/iwch_user.h
new file mode 100644
index 0000000..a277c31
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb3/iwch_user.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2006 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __IWCH_USER_H__
+#define __IWCH_USER_H__
+
+#define IWCH_UVERBS_ABI_VERSION	1
+
+/*
+ * Make sure that all structs defined in this file remain laid out so
+ * that they pack the same way on 32-bit and 64-bit architectures (to
+ * avoid incompatibility between 32-bit userspace and 64-bit kernels).
+ * In particular do not use pointer types -- pass pointers in __u64
+ * instead.
+ */
+struct iwch_create_cq_req {
+	__u64 user_rptr_addr;
+};
+
+struct iwch_create_cq_resp_v0 {
+	__u64 key;
+	__u32 cqid;
+	__u32 size_log2;
+};
+
+struct iwch_create_cq_resp {
+	__u64 key;
+	__u32 cqid;
+	__u32 size_log2;
+	__u32 memsize;
+	__u32 reserved;
+};
+
+struct iwch_create_qp_resp {
+	__u64 key;
+	__u64 db_key;
+	__u32 qpid;
+	__u32 size_log2;
+	__u32 sq_size_log2;
+	__u32 rq_size_log2;
+};
+
+struct iwch_reg_user_mr_resp {
+	__u32 pbl_addr;
+};
+#endif
diff --git a/drivers/infiniband/hw/cxgb3/tcb.h b/drivers/infiniband/hw/cxgb3/tcb.h
new file mode 100644
index 0000000..c702dc1
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb3/tcb.h
@@ -0,0 +1,632 @@
+/*
+ * Copyright (c) 2007 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef _TCB_DEFS_H
+#define _TCB_DEFS_H
+
+#define W_TCB_T_STATE    0
+#define S_TCB_T_STATE    0
+#define M_TCB_T_STATE    0xfULL
+#define V_TCB_T_STATE(x) ((x) << S_TCB_T_STATE)
+
+#define W_TCB_TIMER    0
+#define S_TCB_TIMER    4
+#define M_TCB_TIMER    0x1ULL
+#define V_TCB_TIMER(x) ((x) << S_TCB_TIMER)
+
+#define W_TCB_DACK_TIMER    0
+#define S_TCB_DACK_TIMER    5
+#define M_TCB_DACK_TIMER    0x1ULL
+#define V_TCB_DACK_TIMER(x) ((x) << S_TCB_DACK_TIMER)
+
+#define W_TCB_DEL_FLAG    0
+#define S_TCB_DEL_FLAG    6
+#define M_TCB_DEL_FLAG    0x1ULL
+#define V_TCB_DEL_FLAG(x) ((x) << S_TCB_DEL_FLAG)
+
+#define W_TCB_L2T_IX    0
+#define S_TCB_L2T_IX    7
+#define M_TCB_L2T_IX    0x7ffULL
+#define V_TCB_L2T_IX(x) ((x) << S_TCB_L2T_IX)
+
+#define W_TCB_SMAC_SEL    0
+#define S_TCB_SMAC_SEL    18
+#define M_TCB_SMAC_SEL    0x3ULL
+#define V_TCB_SMAC_SEL(x) ((x) << S_TCB_SMAC_SEL)
+
+#define W_TCB_TOS    0
+#define S_TCB_TOS    20
+#define M_TCB_TOS    0x3fULL
+#define V_TCB_TOS(x) ((x) << S_TCB_TOS)
+
+#define W_TCB_MAX_RT    0
+#define S_TCB_MAX_RT    26
+#define M_TCB_MAX_RT    0xfULL
+#define V_TCB_MAX_RT(x) ((x) << S_TCB_MAX_RT)
+
+#define W_TCB_T_RXTSHIFT    0
+#define S_TCB_T_RXTSHIFT    30
+#define M_TCB_T_RXTSHIFT    0xfULL
+#define V_TCB_T_RXTSHIFT(x) ((x) << S_TCB_T_RXTSHIFT)
+
+#define W_TCB_T_DUPACKS    1
+#define S_TCB_T_DUPACKS    2
+#define M_TCB_T_DUPACKS    0xfULL
+#define V_TCB_T_DUPACKS(x) ((x) << S_TCB_T_DUPACKS)
+
+#define W_TCB_T_MAXSEG    1
+#define S_TCB_T_MAXSEG    6
+#define M_TCB_T_MAXSEG    0xfULL
+#define V_TCB_T_MAXSEG(x) ((x) << S_TCB_T_MAXSEG)
+
+#define W_TCB_T_FLAGS1    1
+#define S_TCB_T_FLAGS1    10
+#define M_TCB_T_FLAGS1    0xffffffffULL
+#define V_TCB_T_FLAGS1(x) ((x) << S_TCB_T_FLAGS1)
+
+#define W_TCB_T_MIGRATION    1
+#define S_TCB_T_MIGRATION    20
+#define M_TCB_T_MIGRATION    0x1ULL
+#define V_TCB_T_MIGRATION(x) ((x) << S_TCB_T_MIGRATION)
+
+#define W_TCB_T_FLAGS2    2
+#define S_TCB_T_FLAGS2    10
+#define M_TCB_T_FLAGS2    0x7fULL
+#define V_TCB_T_FLAGS2(x) ((x) << S_TCB_T_FLAGS2)
+
+#define W_TCB_SND_SCALE    2
+#define S_TCB_SND_SCALE    17
+#define M_TCB_SND_SCALE    0xfULL
+#define V_TCB_SND_SCALE(x) ((x) << S_TCB_SND_SCALE)
+
+#define W_TCB_RCV_SCALE    2
+#define S_TCB_RCV_SCALE    21
+#define M_TCB_RCV_SCALE    0xfULL
+#define V_TCB_RCV_SCALE(x) ((x) << S_TCB_RCV_SCALE)
+
+#define W_TCB_SND_UNA_RAW    2
+#define S_TCB_SND_UNA_RAW    25
+#define M_TCB_SND_UNA_RAW    0x7ffffffULL
+#define V_TCB_SND_UNA_RAW(x) ((x) << S_TCB_SND_UNA_RAW)
+
+#define W_TCB_SND_NXT_RAW    3
+#define S_TCB_SND_NXT_RAW    20
+#define M_TCB_SND_NXT_RAW    0x7ffffffULL
+#define V_TCB_SND_NXT_RAW(x) ((x) << S_TCB_SND_NXT_RAW)
+
+#define W_TCB_RCV_NXT    4
+#define S_TCB_RCV_NXT    15
+#define M_TCB_RCV_NXT    0xffffffffULL
+#define V_TCB_RCV_NXT(x) ((x) << S_TCB_RCV_NXT)
+
+#define W_TCB_RCV_ADV    5
+#define S_TCB_RCV_ADV    15
+#define M_TCB_RCV_ADV    0xffffULL
+#define V_TCB_RCV_ADV(x) ((x) << S_TCB_RCV_ADV)
+
+#define W_TCB_SND_MAX_RAW    5
+#define S_TCB_SND_MAX_RAW    31
+#define M_TCB_SND_MAX_RAW    0x7ffffffULL
+#define V_TCB_SND_MAX_RAW(x) ((x) << S_TCB_SND_MAX_RAW)
+
+#define W_TCB_SND_CWND    6
+#define S_TCB_SND_CWND    26
+#define M_TCB_SND_CWND    0x7ffffffULL
+#define V_TCB_SND_CWND(x) ((x) << S_TCB_SND_CWND)
+
+#define W_TCB_SND_SSTHRESH    7
+#define S_TCB_SND_SSTHRESH    21
+#define M_TCB_SND_SSTHRESH    0x7ffffffULL
+#define V_TCB_SND_SSTHRESH(x) ((x) << S_TCB_SND_SSTHRESH)
+
+#define W_TCB_T_RTT_TS_RECENT_AGE    8
+#define S_TCB_T_RTT_TS_RECENT_AGE    16
+#define M_TCB_T_RTT_TS_RECENT_AGE    0xffffffffULL
+#define V_TCB_T_RTT_TS_RECENT_AGE(x) ((x) << S_TCB_T_RTT_TS_RECENT_AGE)
+
+#define W_TCB_T_RTSEQ_RECENT    9
+#define S_TCB_T_RTSEQ_RECENT    16
+#define M_TCB_T_RTSEQ_RECENT    0xffffffffULL
+#define V_TCB_T_RTSEQ_RECENT(x) ((x) << S_TCB_T_RTSEQ_RECENT)
+
+#define W_TCB_T_SRTT    10
+#define S_TCB_T_SRTT    16
+#define M_TCB_T_SRTT    0xffffULL
+#define V_TCB_T_SRTT(x) ((x) << S_TCB_T_SRTT)
+
+#define W_TCB_T_RTTVAR    11
+#define S_TCB_T_RTTVAR    0
+#define M_TCB_T_RTTVAR    0xffffULL
+#define V_TCB_T_RTTVAR(x) ((x) << S_TCB_T_RTTVAR)
+
+#define W_TCB_TS_LAST_ACK_SENT_RAW    11
+#define S_TCB_TS_LAST_ACK_SENT_RAW    16
+#define M_TCB_TS_LAST_ACK_SENT_RAW    0x7ffffffULL
+#define V_TCB_TS_LAST_ACK_SENT_RAW(x) ((x) << S_TCB_TS_LAST_ACK_SENT_RAW)
+
+#define W_TCB_DIP    12
+#define S_TCB_DIP    11
+#define M_TCB_DIP    0xffffffffULL
+#define V_TCB_DIP(x) ((x) << S_TCB_DIP)
+
+#define W_TCB_SIP    13
+#define S_TCB_SIP    11
+#define M_TCB_SIP    0xffffffffULL
+#define V_TCB_SIP(x) ((x) << S_TCB_SIP)
+
+#define W_TCB_DP    14
+#define S_TCB_DP    11
+#define M_TCB_DP    0xffffULL
+#define V_TCB_DP(x) ((x) << S_TCB_DP)
+
+#define W_TCB_SP    14
+#define S_TCB_SP    27
+#define M_TCB_SP    0xffffULL
+#define V_TCB_SP(x) ((x) << S_TCB_SP)
+
+#define W_TCB_TIMESTAMP    15
+#define S_TCB_TIMESTAMP    11
+#define M_TCB_TIMESTAMP    0xffffffffULL
+#define V_TCB_TIMESTAMP(x) ((x) << S_TCB_TIMESTAMP)
+
+#define W_TCB_TIMESTAMP_OFFSET    16
+#define S_TCB_TIMESTAMP_OFFSET    11
+#define M_TCB_TIMESTAMP_OFFSET    0xfULL
+#define V_TCB_TIMESTAMP_OFFSET(x) ((x) << S_TCB_TIMESTAMP_OFFSET)
+
+#define W_TCB_TX_MAX    16
+#define S_TCB_TX_MAX    15
+#define M_TCB_TX_MAX    0xffffffffULL
+#define V_TCB_TX_MAX(x) ((x) << S_TCB_TX_MAX)
+
+#define W_TCB_TX_HDR_PTR_RAW    17
+#define S_TCB_TX_HDR_PTR_RAW    15
+#define M_TCB_TX_HDR_PTR_RAW    0x1ffffULL
+#define V_TCB_TX_HDR_PTR_RAW(x) ((x) << S_TCB_TX_HDR_PTR_RAW)
+
+#define W_TCB_TX_LAST_PTR_RAW    18
+#define S_TCB_TX_LAST_PTR_RAW    0
+#define M_TCB_TX_LAST_PTR_RAW    0x1ffffULL
+#define V_TCB_TX_LAST_PTR_RAW(x) ((x) << S_TCB_TX_LAST_PTR_RAW)
+
+#define W_TCB_TX_COMPACT    18
+#define S_TCB_TX_COMPACT    17
+#define M_TCB_TX_COMPACT    0x1ULL
+#define V_TCB_TX_COMPACT(x) ((x) << S_TCB_TX_COMPACT)
+
+#define W_TCB_RX_COMPACT    18
+#define S_TCB_RX_COMPACT    18
+#define M_TCB_RX_COMPACT    0x1ULL
+#define V_TCB_RX_COMPACT(x) ((x) << S_TCB_RX_COMPACT)
+
+#define W_TCB_RCV_WND    18
+#define S_TCB_RCV_WND    19
+#define M_TCB_RCV_WND    0x7ffffffULL
+#define V_TCB_RCV_WND(x) ((x) << S_TCB_RCV_WND)
+
+#define W_TCB_RX_HDR_OFFSET    19
+#define S_TCB_RX_HDR_OFFSET    14
+#define M_TCB_RX_HDR_OFFSET    0x7ffffffULL
+#define V_TCB_RX_HDR_OFFSET(x) ((x) << S_TCB_RX_HDR_OFFSET)
+
+#define W_TCB_RX_FRAG0_START_IDX_RAW    20
+#define S_TCB_RX_FRAG0_START_IDX_RAW    9
+#define M_TCB_RX_FRAG0_START_IDX_RAW    0x7ffffffULL
+#define V_TCB_RX_FRAG0_START_IDX_RAW(x) ((x) << S_TCB_RX_FRAG0_START_IDX_RAW)
+
+#define W_TCB_RX_FRAG1_START_IDX_OFFSET    21
+#define S_TCB_RX_FRAG1_START_IDX_OFFSET    4
+#define M_TCB_RX_FRAG1_START_IDX_OFFSET    0x7ffffffULL
+#define V_TCB_RX_FRAG1_START_IDX_OFFSET(x) ((x) << S_TCB_RX_FRAG1_START_IDX_OFFSET)
+
+#define W_TCB_RX_FRAG0_LEN    21
+#define S_TCB_RX_FRAG0_LEN    31
+#define M_TCB_RX_FRAG0_LEN    0x7ffffffULL
+#define V_TCB_RX_FRAG0_LEN(x) ((x) << S_TCB_RX_FRAG0_LEN)
+
+#define W_TCB_RX_FRAG1_LEN    22
+#define S_TCB_RX_FRAG1_LEN    26
+#define M_TCB_RX_FRAG1_LEN    0x7ffffffULL
+#define V_TCB_RX_FRAG1_LEN(x) ((x) << S_TCB_RX_FRAG1_LEN)
+
+#define W_TCB_NEWRENO_RECOVER    23
+#define S_TCB_NEWRENO_RECOVER    21
+#define M_TCB_NEWRENO_RECOVER    0x7ffffffULL
+#define V_TCB_NEWRENO_RECOVER(x) ((x) << S_TCB_NEWRENO_RECOVER)
+
+#define W_TCB_PDU_HAVE_LEN    24
+#define S_TCB_PDU_HAVE_LEN    16
+#define M_TCB_PDU_HAVE_LEN    0x1ULL
+#define V_TCB_PDU_HAVE_LEN(x) ((x) << S_TCB_PDU_HAVE_LEN)
+
+#define W_TCB_PDU_LEN    24
+#define S_TCB_PDU_LEN    17
+#define M_TCB_PDU_LEN    0xffffULL
+#define V_TCB_PDU_LEN(x) ((x) << S_TCB_PDU_LEN)
+
+#define W_TCB_RX_QUIESCE    25
+#define S_TCB_RX_QUIESCE    1
+#define M_TCB_RX_QUIESCE    0x1ULL
+#define V_TCB_RX_QUIESCE(x) ((x) << S_TCB_RX_QUIESCE)
+
+#define W_TCB_RX_PTR_RAW    25
+#define S_TCB_RX_PTR_RAW    2
+#define M_TCB_RX_PTR_RAW    0x1ffffULL
+#define V_TCB_RX_PTR_RAW(x) ((x) << S_TCB_RX_PTR_RAW)
+
+#define W_TCB_CPU_NO    25
+#define S_TCB_CPU_NO    19
+#define M_TCB_CPU_NO    0x7fULL
+#define V_TCB_CPU_NO(x) ((x) << S_TCB_CPU_NO)
+
+#define W_TCB_ULP_TYPE    25
+#define S_TCB_ULP_TYPE    26
+#define M_TCB_ULP_TYPE    0xfULL
+#define V_TCB_ULP_TYPE(x) ((x) << S_TCB_ULP_TYPE)
+
+#define W_TCB_RX_FRAG1_PTR_RAW    25
+#define S_TCB_RX_FRAG1_PTR_RAW    30
+#define M_TCB_RX_FRAG1_PTR_RAW    0x1ffffULL
+#define V_TCB_RX_FRAG1_PTR_RAW(x) ((x) << S_TCB_RX_FRAG1_PTR_RAW)
+
+#define W_TCB_RX_FRAG2_START_IDX_OFFSET_RAW    26
+#define S_TCB_RX_FRAG2_START_IDX_OFFSET_RAW    15
+#define M_TCB_RX_FRAG2_START_IDX_OFFSET_RAW    0x7ffffffULL
+#define V_TCB_RX_FRAG2_START_IDX_OFFSET_RAW(x) ((x) << S_TCB_RX_FRAG2_START_IDX_OFFSET_RAW)
+
+#define W_TCB_RX_FRAG2_PTR_RAW    27
+#define S_TCB_RX_FRAG2_PTR_RAW    10
+#define M_TCB_RX_FRAG2_PTR_RAW    0x1ffffULL
+#define V_TCB_RX_FRAG2_PTR_RAW(x) ((x) << S_TCB_RX_FRAG2_PTR_RAW)
+
+#define W_TCB_RX_FRAG2_LEN_RAW    27
+#define S_TCB_RX_FRAG2_LEN_RAW    27
+#define M_TCB_RX_FRAG2_LEN_RAW    0x7ffffffULL
+#define V_TCB_RX_FRAG2_LEN_RAW(x) ((x) << S_TCB_RX_FRAG2_LEN_RAW)
+
+#define W_TCB_RX_FRAG3_PTR_RAW    28
+#define S_TCB_RX_FRAG3_PTR_RAW    22
+#define M_TCB_RX_FRAG3_PTR_RAW    0x1ffffULL
+#define V_TCB_RX_FRAG3_PTR_RAW(x) ((x) << S_TCB_RX_FRAG3_PTR_RAW)
+
+#define W_TCB_RX_FRAG3_LEN_RAW    29
+#define S_TCB_RX_FRAG3_LEN_RAW    7
+#define M_TCB_RX_FRAG3_LEN_RAW    0x7ffffffULL
+#define V_TCB_RX_FRAG3_LEN_RAW(x) ((x) << S_TCB_RX_FRAG3_LEN_RAW)
+
+#define W_TCB_RX_FRAG3_START_IDX_OFFSET_RAW    30
+#define S_TCB_RX_FRAG3_START_IDX_OFFSET_RAW    2
+#define M_TCB_RX_FRAG3_START_IDX_OFFSET_RAW    0x7ffffffULL
+#define V_TCB_RX_FRAG3_START_IDX_OFFSET_RAW(x) ((x) << S_TCB_RX_FRAG3_START_IDX_OFFSET_RAW)
+
+#define W_TCB_PDU_HDR_LEN    30
+#define S_TCB_PDU_HDR_LEN    29
+#define M_TCB_PDU_HDR_LEN    0xffULL
+#define V_TCB_PDU_HDR_LEN(x) ((x) << S_TCB_PDU_HDR_LEN)
+
+#define W_TCB_SLUSH1    31
+#define S_TCB_SLUSH1    5
+#define M_TCB_SLUSH1    0x7ffffULL
+#define V_TCB_SLUSH1(x) ((x) << S_TCB_SLUSH1)
+
+#define W_TCB_ULP_RAW    31
+#define S_TCB_ULP_RAW    24
+#define M_TCB_ULP_RAW    0xffULL
+#define V_TCB_ULP_RAW(x) ((x) << S_TCB_ULP_RAW)
+
+#define W_TCB_DDP_RDMAP_VERSION    25
+#define S_TCB_DDP_RDMAP_VERSION    30
+#define M_TCB_DDP_RDMAP_VERSION    0x1ULL
+#define V_TCB_DDP_RDMAP_VERSION(x) ((x) << S_TCB_DDP_RDMAP_VERSION)
+
+#define W_TCB_MARKER_ENABLE_RX    25
+#define S_TCB_MARKER_ENABLE_RX    31
+#define M_TCB_MARKER_ENABLE_RX    0x1ULL
+#define V_TCB_MARKER_ENABLE_RX(x) ((x) << S_TCB_MARKER_ENABLE_RX)
+
+#define W_TCB_MARKER_ENABLE_TX    26
+#define S_TCB_MARKER_ENABLE_TX    0
+#define M_TCB_MARKER_ENABLE_TX    0x1ULL
+#define V_TCB_MARKER_ENABLE_TX(x) ((x) << S_TCB_MARKER_ENABLE_TX)
+
+#define W_TCB_CRC_ENABLE    26
+#define S_TCB_CRC_ENABLE    1
+#define M_TCB_CRC_ENABLE    0x1ULL
+#define V_TCB_CRC_ENABLE(x) ((x) << S_TCB_CRC_ENABLE)
+
+#define W_TCB_IRS_ULP    26
+#define S_TCB_IRS_ULP    2
+#define M_TCB_IRS_ULP    0x1ffULL
+#define V_TCB_IRS_ULP(x) ((x) << S_TCB_IRS_ULP)
+
+#define W_TCB_ISS_ULP    26
+#define S_TCB_ISS_ULP    11
+#define M_TCB_ISS_ULP    0x1ffULL
+#define V_TCB_ISS_ULP(x) ((x) << S_TCB_ISS_ULP)
+
+#define W_TCB_TX_PDU_LEN    26
+#define S_TCB_TX_PDU_LEN    20
+#define M_TCB_TX_PDU_LEN    0x3fffULL
+#define V_TCB_TX_PDU_LEN(x) ((x) << S_TCB_TX_PDU_LEN)
+
+#define W_TCB_TX_PDU_OUT    27
+#define S_TCB_TX_PDU_OUT    2
+#define M_TCB_TX_PDU_OUT    0x1ULL
+#define V_TCB_TX_PDU_OUT(x) ((x) << S_TCB_TX_PDU_OUT)
+
+#define W_TCB_CQ_IDX_SQ    27
+#define S_TCB_CQ_IDX_SQ    3
+#define M_TCB_CQ_IDX_SQ    0xffffULL
+#define V_TCB_CQ_IDX_SQ(x) ((x) << S_TCB_CQ_IDX_SQ)
+
+#define W_TCB_CQ_IDX_RQ    27
+#define S_TCB_CQ_IDX_RQ    19
+#define M_TCB_CQ_IDX_RQ    0xffffULL
+#define V_TCB_CQ_IDX_RQ(x) ((x) << S_TCB_CQ_IDX_RQ)
+
+#define W_TCB_QP_ID    28
+#define S_TCB_QP_ID    3
+#define M_TCB_QP_ID    0xffffULL
+#define V_TCB_QP_ID(x) ((x) << S_TCB_QP_ID)
+
+#define W_TCB_PD_ID    28
+#define S_TCB_PD_ID    19
+#define M_TCB_PD_ID    0xffffULL
+#define V_TCB_PD_ID(x) ((x) << S_TCB_PD_ID)
+
+#define W_TCB_STAG    29
+#define S_TCB_STAG    3
+#define M_TCB_STAG    0xffffffffULL
+#define V_TCB_STAG(x) ((x) << S_TCB_STAG)
+
+#define W_TCB_RQ_START    30
+#define S_TCB_RQ_START    3
+#define M_TCB_RQ_START    0x3ffffffULL
+#define V_TCB_RQ_START(x) ((x) << S_TCB_RQ_START)
+
+#define W_TCB_RQ_MSN    30
+#define S_TCB_RQ_MSN    29
+#define M_TCB_RQ_MSN    0x3ffULL
+#define V_TCB_RQ_MSN(x) ((x) << S_TCB_RQ_MSN)
+
+#define W_TCB_RQ_MAX_OFFSET    31
+#define S_TCB_RQ_MAX_OFFSET    7
+#define M_TCB_RQ_MAX_OFFSET    0xfULL
+#define V_TCB_RQ_MAX_OFFSET(x) ((x) << S_TCB_RQ_MAX_OFFSET)
+
+#define W_TCB_RQ_WRITE_PTR    31
+#define S_TCB_RQ_WRITE_PTR    11
+#define M_TCB_RQ_WRITE_PTR    0x3ffULL
+#define V_TCB_RQ_WRITE_PTR(x) ((x) << S_TCB_RQ_WRITE_PTR)
+
+#define W_TCB_INB_WRITE_PERM    31
+#define S_TCB_INB_WRITE_PERM    21
+#define M_TCB_INB_WRITE_PERM    0x1ULL
+#define V_TCB_INB_WRITE_PERM(x) ((x) << S_TCB_INB_WRITE_PERM)
+
+#define W_TCB_INB_READ_PERM    31
+#define S_TCB_INB_READ_PERM    22
+#define M_TCB_INB_READ_PERM    0x1ULL
+#define V_TCB_INB_READ_PERM(x) ((x) << S_TCB_INB_READ_PERM)
+
+#define W_TCB_ORD_L_BIT_VLD    31
+#define S_TCB_ORD_L_BIT_VLD    23
+#define M_TCB_ORD_L_BIT_VLD    0x1ULL
+#define V_TCB_ORD_L_BIT_VLD(x) ((x) << S_TCB_ORD_L_BIT_VLD)
+
+#define W_TCB_RDMAP_OPCODE    31
+#define S_TCB_RDMAP_OPCODE    24
+#define M_TCB_RDMAP_OPCODE    0xfULL
+#define V_TCB_RDMAP_OPCODE(x) ((x) << S_TCB_RDMAP_OPCODE)
+
+#define W_TCB_TX_FLUSH    31
+#define S_TCB_TX_FLUSH    28
+#define M_TCB_TX_FLUSH    0x1ULL
+#define V_TCB_TX_FLUSH(x) ((x) << S_TCB_TX_FLUSH)
+
+#define W_TCB_TX_OOS_RXMT    31
+#define S_TCB_TX_OOS_RXMT    29
+#define M_TCB_TX_OOS_RXMT    0x1ULL
+#define V_TCB_TX_OOS_RXMT(x) ((x) << S_TCB_TX_OOS_RXMT)
+
+#define W_TCB_TX_OOS_TXMT    31
+#define S_TCB_TX_OOS_TXMT    30
+#define M_TCB_TX_OOS_TXMT    0x1ULL
+#define V_TCB_TX_OOS_TXMT(x) ((x) << S_TCB_TX_OOS_TXMT)
+
+#define W_TCB_SLUSH_AUX2    31
+#define S_TCB_SLUSH_AUX2    31
+#define M_TCB_SLUSH_AUX2    0x1ULL
+#define V_TCB_SLUSH_AUX2(x) ((x) << S_TCB_SLUSH_AUX2)
+
+#define W_TCB_RX_FRAG1_PTR_RAW2    25
+#define S_TCB_RX_FRAG1_PTR_RAW2    30
+#define M_TCB_RX_FRAG1_PTR_RAW2    0x1ffffULL
+#define V_TCB_RX_FRAG1_PTR_RAW2(x) ((x) << S_TCB_RX_FRAG1_PTR_RAW2)
+
+#define W_TCB_RX_DDP_FLAGS    26
+#define S_TCB_RX_DDP_FLAGS    15
+#define M_TCB_RX_DDP_FLAGS    0x3ffULL
+#define V_TCB_RX_DDP_FLAGS(x) ((x) << S_TCB_RX_DDP_FLAGS)
+
+#define W_TCB_SLUSH_AUX3    26
+#define S_TCB_SLUSH_AUX3    31
+#define M_TCB_SLUSH_AUX3    0x1ffULL
+#define V_TCB_SLUSH_AUX3(x) ((x) << S_TCB_SLUSH_AUX3)
+
+#define W_TCB_RX_DDP_BUF0_OFFSET    27
+#define S_TCB_RX_DDP_BUF0_OFFSET    8
+#define M_TCB_RX_DDP_BUF0_OFFSET    0x3fffffULL
+#define V_TCB_RX_DDP_BUF0_OFFSET(x) ((x) << S_TCB_RX_DDP_BUF0_OFFSET)
+
+#define W_TCB_RX_DDP_BUF0_LEN    27
+#define S_TCB_RX_DDP_BUF0_LEN    30
+#define M_TCB_RX_DDP_BUF0_LEN    0x3fffffULL
+#define V_TCB_RX_DDP_BUF0_LEN(x) ((x) << S_TCB_RX_DDP_BUF0_LEN)
+
+#define W_TCB_RX_DDP_BUF1_OFFSET    28
+#define S_TCB_RX_DDP_BUF1_OFFSET    20
+#define M_TCB_RX_DDP_BUF1_OFFSET    0x3fffffULL
+#define V_TCB_RX_DDP_BUF1_OFFSET(x) ((x) << S_TCB_RX_DDP_BUF1_OFFSET)
+
+#define W_TCB_RX_DDP_BUF1_LEN    29
+#define S_TCB_RX_DDP_BUF1_LEN    10
+#define M_TCB_RX_DDP_BUF1_LEN    0x3fffffULL
+#define V_TCB_RX_DDP_BUF1_LEN(x) ((x) << S_TCB_RX_DDP_BUF1_LEN)
+
+#define W_TCB_RX_DDP_BUF0_TAG    30
+#define S_TCB_RX_DDP_BUF0_TAG    0
+#define M_TCB_RX_DDP_BUF0_TAG    0xffffffffULL
+#define V_TCB_RX_DDP_BUF0_TAG(x) ((x) << S_TCB_RX_DDP_BUF0_TAG)
+
+#define W_TCB_RX_DDP_BUF1_TAG    31
+#define S_TCB_RX_DDP_BUF1_TAG    0
+#define M_TCB_RX_DDP_BUF1_TAG    0xffffffffULL
+#define V_TCB_RX_DDP_BUF1_TAG(x) ((x) << S_TCB_RX_DDP_BUF1_TAG)
+
+#define S_TF_DACK    10
+#define V_TF_DACK(x) ((x) << S_TF_DACK)
+
+#define S_TF_NAGLE    11
+#define V_TF_NAGLE(x) ((x) << S_TF_NAGLE)
+
+#define S_TF_RECV_SCALE    12
+#define V_TF_RECV_SCALE(x) ((x) << S_TF_RECV_SCALE)
+
+#define S_TF_RECV_TSTMP    13
+#define V_TF_RECV_TSTMP(x) ((x) << S_TF_RECV_TSTMP)
+
+#define S_TF_RECV_SACK    14
+#define V_TF_RECV_SACK(x) ((x) << S_TF_RECV_SACK)
+
+#define S_TF_TURBO    15
+#define V_TF_TURBO(x) ((x) << S_TF_TURBO)
+
+#define S_TF_KEEPALIVE    16
+#define V_TF_KEEPALIVE(x) ((x) << S_TF_KEEPALIVE)
+
+#define S_TF_TCAM_BYPASS    17
+#define V_TF_TCAM_BYPASS(x) ((x) << S_TF_TCAM_BYPASS)
+
+#define S_TF_CORE_FIN    18
+#define V_TF_CORE_FIN(x) ((x) << S_TF_CORE_FIN)
+
+#define S_TF_CORE_MORE    19
+#define V_TF_CORE_MORE(x) ((x) << S_TF_CORE_MORE)
+
+#define S_TF_MIGRATING    20
+#define V_TF_MIGRATING(x) ((x) << S_TF_MIGRATING)
+
+#define S_TF_ACTIVE_OPEN    21
+#define V_TF_ACTIVE_OPEN(x) ((x) << S_TF_ACTIVE_OPEN)
+
+#define S_TF_ASK_MODE    22
+#define V_TF_ASK_MODE(x) ((x) << S_TF_ASK_MODE)
+
+#define S_TF_NON_OFFLOAD    23
+#define V_TF_NON_OFFLOAD(x) ((x) << S_TF_NON_OFFLOAD)
+
+#define S_TF_MOD_SCHD    24
+#define V_TF_MOD_SCHD(x) ((x) << S_TF_MOD_SCHD)
+
+#define S_TF_MOD_SCHD_REASON0    25
+#define V_TF_MOD_SCHD_REASON0(x) ((x) << S_TF_MOD_SCHD_REASON0)
+
+#define S_TF_MOD_SCHD_REASON1    26
+#define V_TF_MOD_SCHD_REASON1(x) ((x) << S_TF_MOD_SCHD_REASON1)
+
+#define S_TF_MOD_SCHD_RX    27
+#define V_TF_MOD_SCHD_RX(x) ((x) << S_TF_MOD_SCHD_RX)
+
+#define S_TF_CORE_PUSH    28
+#define V_TF_CORE_PUSH(x) ((x) << S_TF_CORE_PUSH)
+
+#define S_TF_RCV_COALESCE_ENABLE    29
+#define V_TF_RCV_COALESCE_ENABLE(x) ((x) << S_TF_RCV_COALESCE_ENABLE)
+
+#define S_TF_RCV_COALESCE_PUSH    30
+#define V_TF_RCV_COALESCE_PUSH(x) ((x) << S_TF_RCV_COALESCE_PUSH)
+
+#define S_TF_RCV_COALESCE_LAST_PSH    31
+#define V_TF_RCV_COALESCE_LAST_PSH(x) ((x) << S_TF_RCV_COALESCE_LAST_PSH)
+
+#define S_TF_RCV_COALESCE_HEARTBEAT    32
+#define V_TF_RCV_COALESCE_HEARTBEAT(x) ((x) << S_TF_RCV_COALESCE_HEARTBEAT)
+
+#define S_TF_HALF_CLOSE    33
+#define V_TF_HALF_CLOSE(x) ((x) << S_TF_HALF_CLOSE)
+
+#define S_TF_DACK_MSS    34
+#define V_TF_DACK_MSS(x) ((x) << S_TF_DACK_MSS)
+
+#define S_TF_CCTRL_SEL0    35
+#define V_TF_CCTRL_SEL0(x) ((x) << S_TF_CCTRL_SEL0)
+
+#define S_TF_CCTRL_SEL1    36
+#define V_TF_CCTRL_SEL1(x) ((x) << S_TF_CCTRL_SEL1)
+
+#define S_TF_TCP_NEWRENO_FAST_RECOVERY    37
+#define V_TF_TCP_NEWRENO_FAST_RECOVERY(x) ((x) << S_TF_TCP_NEWRENO_FAST_RECOVERY)
+
+#define S_TF_TX_PACE_AUTO    38
+#define V_TF_TX_PACE_AUTO(x) ((x) << S_TF_TX_PACE_AUTO)
+
+#define S_TF_PEER_FIN_HELD    39
+#define V_TF_PEER_FIN_HELD(x) ((x) << S_TF_PEER_FIN_HELD)
+
+#define S_TF_CORE_URG    40
+#define V_TF_CORE_URG(x) ((x) << S_TF_CORE_URG)
+
+#define S_TF_RDMA_ERROR    41
+#define V_TF_RDMA_ERROR(x) ((x) << S_TF_RDMA_ERROR)
+
+#define S_TF_SSWS_DISABLED    42
+#define V_TF_SSWS_DISABLED(x) ((x) << S_TF_SSWS_DISABLED)
+
+#define S_TF_DUPACK_COUNT_ODD    43
+#define V_TF_DUPACK_COUNT_ODD(x) ((x) << S_TF_DUPACK_COUNT_ODD)
+
+#define S_TF_TX_CHANNEL    44
+#define V_TF_TX_CHANNEL(x) ((x) << S_TF_TX_CHANNEL)
+
+#define S_TF_RX_CHANNEL    45
+#define V_TF_RX_CHANNEL(x) ((x) << S_TF_RX_CHANNEL)
+
+#define S_TF_TX_PACE_FIXED    46
+#define V_TF_TX_PACE_FIXED(x) ((x) << S_TF_TX_PACE_FIXED)
+
+#define S_TF_RDMA_FLM_ERROR    47
+#define V_TF_RDMA_FLM_ERROR(x) ((x) << S_TF_RDMA_FLM_ERROR)
+
+#define S_TF_RX_FLOW_CONTROL_DISABLE    48
+#define V_TF_RX_FLOW_CONTROL_DISABLE(x) ((x) << S_TF_RX_FLOW_CONTROL_DISABLE)
+
+#endif /* _TCB_DEFS_H */
diff --git a/drivers/infiniband/hw/cxgb4/Kconfig b/drivers/infiniband/hw/cxgb4/Kconfig
new file mode 100644
index 0000000..23f38cf
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb4/Kconfig
@@ -0,0 +1,18 @@
+config INFINIBAND_CXGB4
+	tristate "Chelsio T4/T5 RDMA Driver"
+	depends on CHELSIO_T4 && INET && (IPV6 || IPV6=n)
+	select GENERIC_ALLOCATOR
+	---help---
+	  This is an iWARP/RDMA driver for the Chelsio T4 and T5
+	  1GbE, 10GbE adapters and T5 40GbE adapter.
+
+	  For general information about Chelsio and our products, visit
+	  our website at <http://www.chelsio.com>.
+
+	  For customer support, please visit our customer support page at
+	  <http://www.chelsio.com/support.html>.
+
+	  Please send feedback to <linux-bugs@chelsio.com>.
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called iw_cxgb4.
diff --git a/drivers/infiniband/hw/cxgb4/Makefile b/drivers/infiniband/hw/cxgb4/Makefile
new file mode 100644
index 0000000..e11cf72
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb4/Makefile
@@ -0,0 +1,5 @@
+ccflags-y := -Idrivers/net/ethernet/chelsio/cxgb4
+
+obj-$(CONFIG_INFINIBAND_CXGB4) += iw_cxgb4.o
+
+iw_cxgb4-y :=  device.o cm.o provider.o mem.o cq.o qp.o resource.o ev.o id_table.o
diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c
new file mode 100644
index 0000000..fb61f66
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb4/cm.c
@@ -0,0 +1,3993 @@
+/*
+ * Copyright (c) 2009-2014 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer in the documentation and/or other materials
+ *	  provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/workqueue.h>
+#include <linux/skbuff.h>
+#include <linux/timer.h>
+#include <linux/notifier.h>
+#include <linux/inetdevice.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/if_vlan.h>
+
+#include <net/neighbour.h>
+#include <net/netevent.h>
+#include <net/route.h>
+#include <net/tcp.h>
+#include <net/ip6_route.h>
+#include <net/addrconf.h>
+
+#include <rdma/ib_addr.h>
+
+#include "iw_cxgb4.h"
+
+static char *states[] = {
+	"idle",
+	"listen",
+	"connecting",
+	"mpa_wait_req",
+	"mpa_req_sent",
+	"mpa_req_rcvd",
+	"mpa_rep_sent",
+	"fpdu_mode",
+	"aborting",
+	"closing",
+	"moribund",
+	"dead",
+	NULL,
+};
+
+static int nocong;
+module_param(nocong, int, 0644);
+MODULE_PARM_DESC(nocong, "Turn of congestion control (default=0)");
+
+static int enable_ecn;
+module_param(enable_ecn, int, 0644);
+MODULE_PARM_DESC(enable_ecn, "Enable ECN (default=0/disabled)");
+
+static int dack_mode = 1;
+module_param(dack_mode, int, 0644);
+MODULE_PARM_DESC(dack_mode, "Delayed ack mode (default=1)");
+
+uint c4iw_max_read_depth = 32;
+module_param(c4iw_max_read_depth, int, 0644);
+MODULE_PARM_DESC(c4iw_max_read_depth,
+		 "Per-connection max ORD/IRD (default=32)");
+
+static int enable_tcp_timestamps;
+module_param(enable_tcp_timestamps, int, 0644);
+MODULE_PARM_DESC(enable_tcp_timestamps, "Enable tcp timestamps (default=0)");
+
+static int enable_tcp_sack;
+module_param(enable_tcp_sack, int, 0644);
+MODULE_PARM_DESC(enable_tcp_sack, "Enable tcp SACK (default=0)");
+
+static int enable_tcp_window_scaling = 1;
+module_param(enable_tcp_window_scaling, int, 0644);
+MODULE_PARM_DESC(enable_tcp_window_scaling,
+		 "Enable tcp window scaling (default=1)");
+
+int c4iw_debug;
+module_param(c4iw_debug, int, 0644);
+MODULE_PARM_DESC(c4iw_debug, "Enable debug logging (default=0)");
+
+static int peer2peer = 1;
+module_param(peer2peer, int, 0644);
+MODULE_PARM_DESC(peer2peer, "Support peer2peer ULPs (default=1)");
+
+static int p2p_type = FW_RI_INIT_P2PTYPE_READ_REQ;
+module_param(p2p_type, int, 0644);
+MODULE_PARM_DESC(p2p_type, "RDMAP opcode to use for the RTR message: "
+			   "1=RDMA_READ 0=RDMA_WRITE (default 1)");
+
+static int ep_timeout_secs = 60;
+module_param(ep_timeout_secs, int, 0644);
+MODULE_PARM_DESC(ep_timeout_secs, "CM Endpoint operation timeout "
+				   "in seconds (default=60)");
+
+static int mpa_rev = 1;
+module_param(mpa_rev, int, 0644);
+MODULE_PARM_DESC(mpa_rev, "MPA Revision, 0 supports amso1100, "
+		"1 is RFC0544 spec compliant, 2 is IETF MPA Peer Connect Draft"
+		" compliant (default=1)");
+
+static int markers_enabled;
+module_param(markers_enabled, int, 0644);
+MODULE_PARM_DESC(markers_enabled, "Enable MPA MARKERS (default(0)=disabled)");
+
+static int crc_enabled = 1;
+module_param(crc_enabled, int, 0644);
+MODULE_PARM_DESC(crc_enabled, "Enable MPA CRC (default(1)=enabled)");
+
+static int rcv_win = 256 * 1024;
+module_param(rcv_win, int, 0644);
+MODULE_PARM_DESC(rcv_win, "TCP receive window in bytes (default=256KB)");
+
+static int snd_win = 128 * 1024;
+module_param(snd_win, int, 0644);
+MODULE_PARM_DESC(snd_win, "TCP send window in bytes (default=128KB)");
+
+static struct workqueue_struct *workq;
+
+static struct sk_buff_head rxq;
+
+static struct sk_buff *get_skb(struct sk_buff *skb, int len, gfp_t gfp);
+static void ep_timeout(unsigned long arg);
+static void connect_reply_upcall(struct c4iw_ep *ep, int status);
+
+static LIST_HEAD(timeout_list);
+static spinlock_t timeout_lock;
+
+static void deref_qp(struct c4iw_ep *ep)
+{
+	c4iw_qp_rem_ref(&ep->com.qp->ibqp);
+	clear_bit(QP_REFERENCED, &ep->com.flags);
+}
+
+static void ref_qp(struct c4iw_ep *ep)
+{
+	set_bit(QP_REFERENCED, &ep->com.flags);
+	c4iw_qp_add_ref(&ep->com.qp->ibqp);
+}
+
+static void start_ep_timer(struct c4iw_ep *ep)
+{
+	PDBG("%s ep %p\n", __func__, ep);
+	if (timer_pending(&ep->timer)) {
+		pr_err("%s timer already started! ep %p\n",
+		       __func__, ep);
+		return;
+	}
+	clear_bit(TIMEOUT, &ep->com.flags);
+	c4iw_get_ep(&ep->com);
+	ep->timer.expires = jiffies + ep_timeout_secs * HZ;
+	ep->timer.data = (unsigned long)ep;
+	ep->timer.function = ep_timeout;
+	add_timer(&ep->timer);
+}
+
+static int stop_ep_timer(struct c4iw_ep *ep)
+{
+	PDBG("%s ep %p stopping\n", __func__, ep);
+	del_timer_sync(&ep->timer);
+	if (!test_and_set_bit(TIMEOUT, &ep->com.flags)) {
+		c4iw_put_ep(&ep->com);
+		return 0;
+	}
+	return 1;
+}
+
+static int c4iw_l2t_send(struct c4iw_rdev *rdev, struct sk_buff *skb,
+		  struct l2t_entry *l2e)
+{
+	int	error = 0;
+
+	if (c4iw_fatal_error(rdev)) {
+		kfree_skb(skb);
+		PDBG("%s - device in error state - dropping\n", __func__);
+		return -EIO;
+	}
+	error = cxgb4_l2t_send(rdev->lldi.ports[0], skb, l2e);
+	if (error < 0)
+		kfree_skb(skb);
+	return error < 0 ? error : 0;
+}
+
+int c4iw_ofld_send(struct c4iw_rdev *rdev, struct sk_buff *skb)
+{
+	int	error = 0;
+
+	if (c4iw_fatal_error(rdev)) {
+		kfree_skb(skb);
+		PDBG("%s - device in error state - dropping\n", __func__);
+		return -EIO;
+	}
+	error = cxgb4_ofld_send(rdev->lldi.ports[0], skb);
+	if (error < 0)
+		kfree_skb(skb);
+	return error < 0 ? error : 0;
+}
+
+static void release_tid(struct c4iw_rdev *rdev, u32 hwtid, struct sk_buff *skb)
+{
+	struct cpl_tid_release *req;
+
+	skb = get_skb(skb, sizeof *req, GFP_KERNEL);
+	if (!skb)
+		return;
+	req = (struct cpl_tid_release *) skb_put(skb, sizeof(*req));
+	INIT_TP_WR(req, hwtid);
+	OPCODE_TID(req) = cpu_to_be32(MK_OPCODE_TID(CPL_TID_RELEASE, hwtid));
+	set_wr_txq(skb, CPL_PRIORITY_SETUP, 0);
+	c4iw_ofld_send(rdev, skb);
+	return;
+}
+
+static void set_emss(struct c4iw_ep *ep, u16 opt)
+{
+	ep->emss = ep->com.dev->rdev.lldi.mtus[GET_TCPOPT_MSS(opt)] -
+		   ((AF_INET == ep->com.remote_addr.ss_family) ?
+		    sizeof(struct iphdr) : sizeof(struct ipv6hdr)) -
+		   sizeof(struct tcphdr);
+	ep->mss = ep->emss;
+	if (GET_TCPOPT_TSTAMP(opt))
+		ep->emss -= round_up(TCPOLEN_TIMESTAMP, 4);
+	if (ep->emss < 128)
+		ep->emss = 128;
+	if (ep->emss & 7)
+		PDBG("Warning: misaligned mtu idx %u mss %u emss=%u\n",
+		     GET_TCPOPT_MSS(opt), ep->mss, ep->emss);
+	PDBG("%s mss_idx %u mss %u emss=%u\n", __func__, GET_TCPOPT_MSS(opt),
+	     ep->mss, ep->emss);
+}
+
+static enum c4iw_ep_state state_read(struct c4iw_ep_common *epc)
+{
+	enum c4iw_ep_state state;
+
+	mutex_lock(&epc->mutex);
+	state = epc->state;
+	mutex_unlock(&epc->mutex);
+	return state;
+}
+
+static void __state_set(struct c4iw_ep_common *epc, enum c4iw_ep_state new)
+{
+	epc->state = new;
+}
+
+static void state_set(struct c4iw_ep_common *epc, enum c4iw_ep_state new)
+{
+	mutex_lock(&epc->mutex);
+	PDBG("%s - %s -> %s\n", __func__, states[epc->state], states[new]);
+	__state_set(epc, new);
+	mutex_unlock(&epc->mutex);
+	return;
+}
+
+static void *alloc_ep(int size, gfp_t gfp)
+{
+	struct c4iw_ep_common *epc;
+
+	epc = kzalloc(size, gfp);
+	if (epc) {
+		kref_init(&epc->kref);
+		mutex_init(&epc->mutex);
+		c4iw_init_wr_wait(&epc->wr_wait);
+	}
+	PDBG("%s alloc ep %p\n", __func__, epc);
+	return epc;
+}
+
+void _c4iw_free_ep(struct kref *kref)
+{
+	struct c4iw_ep *ep;
+
+	ep = container_of(kref, struct c4iw_ep, com.kref);
+	PDBG("%s ep %p state %s\n", __func__, ep, states[state_read(&ep->com)]);
+	if (test_bit(QP_REFERENCED, &ep->com.flags))
+		deref_qp(ep);
+	if (test_bit(RELEASE_RESOURCES, &ep->com.flags)) {
+		remove_handle(ep->com.dev, &ep->com.dev->hwtid_idr, ep->hwtid);
+		cxgb4_remove_tid(ep->com.dev->rdev.lldi.tids, 0, ep->hwtid);
+		dst_release(ep->dst);
+		cxgb4_l2t_release(ep->l2t);
+	}
+	if (test_bit(RELEASE_MAPINFO, &ep->com.flags)) {
+		print_addr(&ep->com, __func__, "remove_mapinfo/mapping");
+		iwpm_remove_mapinfo(&ep->com.local_addr,
+				    &ep->com.mapped_local_addr);
+		iwpm_remove_mapping(&ep->com.local_addr, RDMA_NL_C4IW);
+	}
+	kfree(ep);
+}
+
+static void release_ep_resources(struct c4iw_ep *ep)
+{
+	set_bit(RELEASE_RESOURCES, &ep->com.flags);
+	c4iw_put_ep(&ep->com);
+}
+
+static int status2errno(int status)
+{
+	switch (status) {
+	case CPL_ERR_NONE:
+		return 0;
+	case CPL_ERR_CONN_RESET:
+		return -ECONNRESET;
+	case CPL_ERR_ARP_MISS:
+		return -EHOSTUNREACH;
+	case CPL_ERR_CONN_TIMEDOUT:
+		return -ETIMEDOUT;
+	case CPL_ERR_TCAM_FULL:
+		return -ENOMEM;
+	case CPL_ERR_CONN_EXIST:
+		return -EADDRINUSE;
+	default:
+		return -EIO;
+	}
+}
+
+/*
+ * Try and reuse skbs already allocated...
+ */
+static struct sk_buff *get_skb(struct sk_buff *skb, int len, gfp_t gfp)
+{
+	if (skb && !skb_is_nonlinear(skb) && !skb_cloned(skb)) {
+		skb_trim(skb, 0);
+		skb_get(skb);
+		skb_reset_transport_header(skb);
+	} else {
+		skb = alloc_skb(len, gfp);
+	}
+	t4_set_arp_err_handler(skb, NULL, NULL);
+	return skb;
+}
+
+static struct net_device *get_real_dev(struct net_device *egress_dev)
+{
+	return rdma_vlan_dev_real_dev(egress_dev) ? : egress_dev;
+}
+
+static int our_interface(struct c4iw_dev *dev, struct net_device *egress_dev)
+{
+	int i;
+
+	egress_dev = get_real_dev(egress_dev);
+	for (i = 0; i < dev->rdev.lldi.nports; i++)
+		if (dev->rdev.lldi.ports[i] == egress_dev)
+			return 1;
+	return 0;
+}
+
+static struct dst_entry *find_route6(struct c4iw_dev *dev, __u8 *local_ip,
+				     __u8 *peer_ip, __be16 local_port,
+				     __be16 peer_port, u8 tos,
+				     __u32 sin6_scope_id)
+{
+	struct dst_entry *dst = NULL;
+
+	if (IS_ENABLED(CONFIG_IPV6)) {
+		struct flowi6 fl6;
+
+		memset(&fl6, 0, sizeof(fl6));
+		memcpy(&fl6.daddr, peer_ip, 16);
+		memcpy(&fl6.saddr, local_ip, 16);
+		if (ipv6_addr_type(&fl6.daddr) & IPV6_ADDR_LINKLOCAL)
+			fl6.flowi6_oif = sin6_scope_id;
+		dst = ip6_route_output(&init_net, NULL, &fl6);
+		if (!dst)
+			goto out;
+		if (!our_interface(dev, ip6_dst_idev(dst)->dev) &&
+		    !(ip6_dst_idev(dst)->dev->flags & IFF_LOOPBACK)) {
+			dst_release(dst);
+			dst = NULL;
+		}
+	}
+
+out:
+	return dst;
+}
+
+static struct dst_entry *find_route(struct c4iw_dev *dev, __be32 local_ip,
+				 __be32 peer_ip, __be16 local_port,
+				 __be16 peer_port, u8 tos)
+{
+	struct rtable *rt;
+	struct flowi4 fl4;
+	struct neighbour *n;
+
+	rt = ip_route_output_ports(&init_net, &fl4, NULL, peer_ip, local_ip,
+				   peer_port, local_port, IPPROTO_TCP,
+				   tos, 0);
+	if (IS_ERR(rt))
+		return NULL;
+	n = dst_neigh_lookup(&rt->dst, &peer_ip);
+	if (!n)
+		return NULL;
+	if (!our_interface(dev, n->dev) &&
+	    !(n->dev->flags & IFF_LOOPBACK)) {
+		neigh_release(n);
+		dst_release(&rt->dst);
+		return NULL;
+	}
+	neigh_release(n);
+	return &rt->dst;
+}
+
+static void arp_failure_discard(void *handle, struct sk_buff *skb)
+{
+	PDBG("%s c4iw_dev %p\n", __func__, handle);
+	kfree_skb(skb);
+}
+
+/*
+ * Handle an ARP failure for an active open.
+ */
+static void act_open_req_arp_failure(void *handle, struct sk_buff *skb)
+{
+	struct c4iw_ep *ep = handle;
+
+	printk(KERN_ERR MOD "ARP failure duing connect\n");
+	kfree_skb(skb);
+	connect_reply_upcall(ep, -EHOSTUNREACH);
+	state_set(&ep->com, DEAD);
+	remove_handle(ep->com.dev, &ep->com.dev->atid_idr, ep->atid);
+	cxgb4_free_atid(ep->com.dev->rdev.lldi.tids, ep->atid);
+	dst_release(ep->dst);
+	cxgb4_l2t_release(ep->l2t);
+	c4iw_put_ep(&ep->com);
+}
+
+/*
+ * Handle an ARP failure for a CPL_ABORT_REQ.  Change it into a no RST variant
+ * and send it along.
+ */
+static void abort_arp_failure(void *handle, struct sk_buff *skb)
+{
+	struct c4iw_rdev *rdev = handle;
+	struct cpl_abort_req *req = cplhdr(skb);
+
+	PDBG("%s rdev %p\n", __func__, rdev);
+	req->cmd = CPL_ABORT_NO_RST;
+	c4iw_ofld_send(rdev, skb);
+}
+
+static void send_flowc(struct c4iw_ep *ep, struct sk_buff *skb)
+{
+	unsigned int flowclen = 80;
+	struct fw_flowc_wr *flowc;
+	int i;
+
+	skb = get_skb(skb, flowclen, GFP_KERNEL);
+	flowc = (struct fw_flowc_wr *)__skb_put(skb, flowclen);
+
+	flowc->op_to_nparams = cpu_to_be32(FW_WR_OP(FW_FLOWC_WR) |
+					   FW_FLOWC_WR_NPARAMS(8));
+	flowc->flowid_len16 = cpu_to_be32(FW_WR_LEN16(DIV_ROUND_UP(flowclen,
+					  16)) | FW_WR_FLOWID(ep->hwtid));
+
+	flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN;
+	flowc->mnemval[0].val = cpu_to_be32(FW_PFVF_CMD_PFN
+					    (ep->com.dev->rdev.lldi.pf));
+	flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH;
+	flowc->mnemval[1].val = cpu_to_be32(ep->tx_chan);
+	flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT;
+	flowc->mnemval[2].val = cpu_to_be32(ep->tx_chan);
+	flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID;
+	flowc->mnemval[3].val = cpu_to_be32(ep->rss_qid);
+	flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_SNDNXT;
+	flowc->mnemval[4].val = cpu_to_be32(ep->snd_seq);
+	flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_RCVNXT;
+	flowc->mnemval[5].val = cpu_to_be32(ep->rcv_seq);
+	flowc->mnemval[6].mnemonic = FW_FLOWC_MNEM_SNDBUF;
+	flowc->mnemval[6].val = cpu_to_be32(ep->snd_win);
+	flowc->mnemval[7].mnemonic = FW_FLOWC_MNEM_MSS;
+	flowc->mnemval[7].val = cpu_to_be32(ep->emss);
+	/* Pad WR to 16 byte boundary */
+	flowc->mnemval[8].mnemonic = 0;
+	flowc->mnemval[8].val = 0;
+	for (i = 0; i < 9; i++) {
+		flowc->mnemval[i].r4[0] = 0;
+		flowc->mnemval[i].r4[1] = 0;
+		flowc->mnemval[i].r4[2] = 0;
+	}
+
+	set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx);
+	c4iw_ofld_send(&ep->com.dev->rdev, skb);
+}
+
+static int send_halfclose(struct c4iw_ep *ep, gfp_t gfp)
+{
+	struct cpl_close_con_req *req;
+	struct sk_buff *skb;
+	int wrlen = roundup(sizeof *req, 16);
+
+	PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
+	skb = get_skb(NULL, wrlen, gfp);
+	if (!skb) {
+		printk(KERN_ERR MOD "%s - failed to alloc skb\n", __func__);
+		return -ENOMEM;
+	}
+	set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx);
+	t4_set_arp_err_handler(skb, NULL, arp_failure_discard);
+	req = (struct cpl_close_con_req *) skb_put(skb, wrlen);
+	memset(req, 0, wrlen);
+	INIT_TP_WR(req, ep->hwtid);
+	OPCODE_TID(req) = cpu_to_be32(MK_OPCODE_TID(CPL_CLOSE_CON_REQ,
+						    ep->hwtid));
+	return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t);
+}
+
+static int send_abort(struct c4iw_ep *ep, struct sk_buff *skb, gfp_t gfp)
+{
+	struct cpl_abort_req *req;
+	int wrlen = roundup(sizeof *req, 16);
+
+	PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
+	skb = get_skb(skb, wrlen, gfp);
+	if (!skb) {
+		printk(KERN_ERR MOD "%s - failed to alloc skb.\n",
+		       __func__);
+		return -ENOMEM;
+	}
+	set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx);
+	t4_set_arp_err_handler(skb, &ep->com.dev->rdev, abort_arp_failure);
+	req = (struct cpl_abort_req *) skb_put(skb, wrlen);
+	memset(req, 0, wrlen);
+	INIT_TP_WR(req, ep->hwtid);
+	OPCODE_TID(req) = cpu_to_be32(MK_OPCODE_TID(CPL_ABORT_REQ, ep->hwtid));
+	req->cmd = CPL_ABORT_SEND_RST;
+	return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t);
+}
+
+/*
+ * c4iw_form_pm_msg - Form a port mapper message with mapping info
+ */
+static void c4iw_form_pm_msg(struct c4iw_ep *ep,
+				struct iwpm_sa_data *pm_msg)
+{
+	memcpy(&pm_msg->loc_addr, &ep->com.local_addr,
+		sizeof(ep->com.local_addr));
+	memcpy(&pm_msg->rem_addr, &ep->com.remote_addr,
+		sizeof(ep->com.remote_addr));
+}
+
+/*
+ * c4iw_form_reg_msg - Form a port mapper message with dev info
+ */
+static void c4iw_form_reg_msg(struct c4iw_dev *dev,
+				struct iwpm_dev_data *pm_msg)
+{
+	memcpy(pm_msg->dev_name, dev->ibdev.name, IWPM_DEVNAME_SIZE);
+	memcpy(pm_msg->if_name, dev->rdev.lldi.ports[0]->name,
+				IWPM_IFNAME_SIZE);
+}
+
+static void c4iw_record_pm_msg(struct c4iw_ep *ep,
+			struct iwpm_sa_data *pm_msg)
+{
+	memcpy(&ep->com.mapped_local_addr, &pm_msg->mapped_loc_addr,
+		sizeof(ep->com.mapped_local_addr));
+	memcpy(&ep->com.mapped_remote_addr, &pm_msg->mapped_rem_addr,
+		sizeof(ep->com.mapped_remote_addr));
+}
+
+static void best_mtu(const unsigned short *mtus, unsigned short mtu,
+		     unsigned int *idx, int use_ts, int ipv6)
+{
+	unsigned short hdr_size = (ipv6 ?
+				   sizeof(struct ipv6hdr) :
+				   sizeof(struct iphdr)) +
+				  sizeof(struct tcphdr) +
+				  (use_ts ?
+				   round_up(TCPOLEN_TIMESTAMP, 4) : 0);
+	unsigned short data_size = mtu - hdr_size;
+
+	cxgb4_best_aligned_mtu(mtus, hdr_size, data_size, 8, idx);
+}
+
+static int send_connect(struct c4iw_ep *ep)
+{
+	struct cpl_act_open_req *req;
+	struct cpl_t5_act_open_req *t5_req;
+	struct cpl_act_open_req6 *req6;
+	struct cpl_t5_act_open_req6 *t5_req6;
+	struct sk_buff *skb;
+	u64 opt0;
+	u32 opt2;
+	unsigned int mtu_idx;
+	int wscale;
+	int wrlen;
+	int sizev4 = is_t4(ep->com.dev->rdev.lldi.adapter_type) ?
+				sizeof(struct cpl_act_open_req) :
+				sizeof(struct cpl_t5_act_open_req);
+	int sizev6 = is_t4(ep->com.dev->rdev.lldi.adapter_type) ?
+				sizeof(struct cpl_act_open_req6) :
+				sizeof(struct cpl_t5_act_open_req6);
+	struct sockaddr_in *la = (struct sockaddr_in *)
+				 &ep->com.mapped_local_addr;
+	struct sockaddr_in *ra = (struct sockaddr_in *)
+				 &ep->com.mapped_remote_addr;
+	struct sockaddr_in6 *la6 = (struct sockaddr_in6 *)
+				   &ep->com.mapped_local_addr;
+	struct sockaddr_in6 *ra6 = (struct sockaddr_in6 *)
+				   &ep->com.mapped_remote_addr;
+	int win;
+
+	wrlen = (ep->com.remote_addr.ss_family == AF_INET) ?
+			roundup(sizev4, 16) :
+			roundup(sizev6, 16);
+
+	PDBG("%s ep %p atid %u\n", __func__, ep, ep->atid);
+
+	skb = get_skb(NULL, wrlen, GFP_KERNEL);
+	if (!skb) {
+		printk(KERN_ERR MOD "%s - failed to alloc skb.\n",
+		       __func__);
+		return -ENOMEM;
+	}
+	set_wr_txq(skb, CPL_PRIORITY_SETUP, ep->ctrlq_idx);
+
+	best_mtu(ep->com.dev->rdev.lldi.mtus, ep->mtu, &mtu_idx,
+		 enable_tcp_timestamps,
+		 (AF_INET == ep->com.remote_addr.ss_family) ? 0 : 1);
+	wscale = compute_wscale(rcv_win);
+
+	/*
+	 * Specify the largest window that will fit in opt0. The
+	 * remainder will be specified in the rx_data_ack.
+	 */
+	win = ep->rcv_win >> 10;
+	if (win > RCV_BUFSIZ_MASK)
+		win = RCV_BUFSIZ_MASK;
+
+	opt0 = (nocong ? NO_CONG(1) : 0) |
+	       KEEP_ALIVE(1) |
+	       DELACK(1) |
+	       WND_SCALE(wscale) |
+	       MSS_IDX(mtu_idx) |
+	       L2T_IDX(ep->l2t->idx) |
+	       TX_CHAN(ep->tx_chan) |
+	       SMAC_SEL(ep->smac_idx) |
+	       DSCP(ep->tos) |
+	       ULP_MODE(ULP_MODE_TCPDDP) |
+	       RCV_BUFSIZ(win);
+	opt2 = RX_CHANNEL(0) |
+	       CCTRL_ECN(enable_ecn) |
+	       RSS_QUEUE_VALID | RSS_QUEUE(ep->rss_qid);
+	if (enable_tcp_timestamps)
+		opt2 |= TSTAMPS_EN(1);
+	if (enable_tcp_sack)
+		opt2 |= SACK_EN(1);
+	if (wscale && enable_tcp_window_scaling)
+		opt2 |= WND_SCALE_EN(1);
+	if (is_t5(ep->com.dev->rdev.lldi.adapter_type)) {
+		opt2 |= T5_OPT_2_VALID;
+		opt2 |= V_CONG_CNTRL(CONG_ALG_TAHOE);
+		opt2 |= CONG_CNTRL_VALID; /* OPT_2_ISS for T5 */
+	}
+	t4_set_arp_err_handler(skb, ep, act_open_req_arp_failure);
+
+	if (is_t4(ep->com.dev->rdev.lldi.adapter_type)) {
+		if (ep->com.remote_addr.ss_family == AF_INET) {
+			req = (struct cpl_act_open_req *) skb_put(skb, wrlen);
+			INIT_TP_WR(req, 0);
+			OPCODE_TID(req) = cpu_to_be32(
+					MK_OPCODE_TID(CPL_ACT_OPEN_REQ,
+					((ep->rss_qid << 14) | ep->atid)));
+			req->local_port = la->sin_port;
+			req->peer_port = ra->sin_port;
+			req->local_ip = la->sin_addr.s_addr;
+			req->peer_ip = ra->sin_addr.s_addr;
+			req->opt0 = cpu_to_be64(opt0);
+			req->params = cpu_to_be32(cxgb4_select_ntuple(
+						ep->com.dev->rdev.lldi.ports[0],
+						ep->l2t));
+			req->opt2 = cpu_to_be32(opt2);
+		} else {
+			req6 = (struct cpl_act_open_req6 *)skb_put(skb, wrlen);
+
+			INIT_TP_WR(req6, 0);
+			OPCODE_TID(req6) = cpu_to_be32(
+					   MK_OPCODE_TID(CPL_ACT_OPEN_REQ6,
+					   ((ep->rss_qid<<14)|ep->atid)));
+			req6->local_port = la6->sin6_port;
+			req6->peer_port = ra6->sin6_port;
+			req6->local_ip_hi = *((__be64 *)
+						(la6->sin6_addr.s6_addr));
+			req6->local_ip_lo = *((__be64 *)
+						(la6->sin6_addr.s6_addr + 8));
+			req6->peer_ip_hi = *((__be64 *)
+						(ra6->sin6_addr.s6_addr));
+			req6->peer_ip_lo = *((__be64 *)
+						(ra6->sin6_addr.s6_addr + 8));
+			req6->opt0 = cpu_to_be64(opt0);
+			req6->params = cpu_to_be32(cxgb4_select_ntuple(
+						ep->com.dev->rdev.lldi.ports[0],
+						ep->l2t));
+			req6->opt2 = cpu_to_be32(opt2);
+		}
+	} else {
+		u32 isn = (prandom_u32() & ~7UL) - 1;
+
+		if (peer2peer)
+			isn += 4;
+
+		if (ep->com.remote_addr.ss_family == AF_INET) {
+			t5_req = (struct cpl_t5_act_open_req *)
+				 skb_put(skb, wrlen);
+			INIT_TP_WR(t5_req, 0);
+			OPCODE_TID(t5_req) = cpu_to_be32(
+					MK_OPCODE_TID(CPL_ACT_OPEN_REQ,
+					((ep->rss_qid << 14) | ep->atid)));
+			t5_req->local_port = la->sin_port;
+			t5_req->peer_port = ra->sin_port;
+			t5_req->local_ip = la->sin_addr.s_addr;
+			t5_req->peer_ip = ra->sin_addr.s_addr;
+			t5_req->opt0 = cpu_to_be64(opt0);
+			t5_req->params = cpu_to_be64(V_FILTER_TUPLE(
+						     cxgb4_select_ntuple(
+					     ep->com.dev->rdev.lldi.ports[0],
+					     ep->l2t)));
+			t5_req->rsvd = cpu_to_be32(isn);
+			PDBG("%s snd_isn %u\n", __func__,
+			     be32_to_cpu(t5_req->rsvd));
+			t5_req->opt2 = cpu_to_be32(opt2);
+		} else {
+			t5_req6 = (struct cpl_t5_act_open_req6 *)
+				  skb_put(skb, wrlen);
+			INIT_TP_WR(t5_req6, 0);
+			OPCODE_TID(t5_req6) = cpu_to_be32(
+					      MK_OPCODE_TID(CPL_ACT_OPEN_REQ6,
+					      ((ep->rss_qid<<14)|ep->atid)));
+			t5_req6->local_port = la6->sin6_port;
+			t5_req6->peer_port = ra6->sin6_port;
+			t5_req6->local_ip_hi = *((__be64 *)
+						(la6->sin6_addr.s6_addr));
+			t5_req6->local_ip_lo = *((__be64 *)
+						(la6->sin6_addr.s6_addr + 8));
+			t5_req6->peer_ip_hi = *((__be64 *)
+						(ra6->sin6_addr.s6_addr));
+			t5_req6->peer_ip_lo = *((__be64 *)
+						(ra6->sin6_addr.s6_addr + 8));
+			t5_req6->opt0 = cpu_to_be64(opt0);
+			t5_req6->params = cpu_to_be64(V_FILTER_TUPLE(
+							cxgb4_select_ntuple(
+						ep->com.dev->rdev.lldi.ports[0],
+						ep->l2t)));
+			t5_req6->rsvd = cpu_to_be32(isn);
+			PDBG("%s snd_isn %u\n", __func__,
+			     be32_to_cpu(t5_req6->rsvd));
+			t5_req6->opt2 = cpu_to_be32(opt2);
+		}
+	}
+
+	set_bit(ACT_OPEN_REQ, &ep->com.history);
+	return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t);
+}
+
+static void send_mpa_req(struct c4iw_ep *ep, struct sk_buff *skb,
+		u8 mpa_rev_to_use)
+{
+	int mpalen, wrlen;
+	struct fw_ofld_tx_data_wr *req;
+	struct mpa_message *mpa;
+	struct mpa_v2_conn_params mpa_v2_params;
+
+	PDBG("%s ep %p tid %u pd_len %d\n", __func__, ep, ep->hwtid, ep->plen);
+
+	BUG_ON(skb_cloned(skb));
+
+	mpalen = sizeof(*mpa) + ep->plen;
+	if (mpa_rev_to_use == 2)
+		mpalen += sizeof(struct mpa_v2_conn_params);
+	wrlen = roundup(mpalen + sizeof *req, 16);
+	skb = get_skb(skb, wrlen, GFP_KERNEL);
+	if (!skb) {
+		connect_reply_upcall(ep, -ENOMEM);
+		return;
+	}
+	set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx);
+
+	req = (struct fw_ofld_tx_data_wr *)skb_put(skb, wrlen);
+	memset(req, 0, wrlen);
+	req->op_to_immdlen = cpu_to_be32(
+		FW_WR_OP(FW_OFLD_TX_DATA_WR) |
+		FW_WR_COMPL(1) |
+		FW_WR_IMMDLEN(mpalen));
+	req->flowid_len16 = cpu_to_be32(
+		FW_WR_FLOWID(ep->hwtid) |
+		FW_WR_LEN16(wrlen >> 4));
+	req->plen = cpu_to_be32(mpalen);
+	req->tunnel_to_proxy = cpu_to_be32(
+		FW_OFLD_TX_DATA_WR_FLUSH(1) |
+		FW_OFLD_TX_DATA_WR_SHOVE(1));
+
+	mpa = (struct mpa_message *)(req + 1);
+	memcpy(mpa->key, MPA_KEY_REQ, sizeof(mpa->key));
+	mpa->flags = (crc_enabled ? MPA_CRC : 0) |
+		     (markers_enabled ? MPA_MARKERS : 0) |
+		     (mpa_rev_to_use == 2 ? MPA_ENHANCED_RDMA_CONN : 0);
+	mpa->private_data_size = htons(ep->plen);
+	mpa->revision = mpa_rev_to_use;
+	if (mpa_rev_to_use == 1) {
+		ep->tried_with_mpa_v1 = 1;
+		ep->retry_with_mpa_v1 = 0;
+	}
+
+	if (mpa_rev_to_use == 2) {
+		mpa->private_data_size = htons(ntohs(mpa->private_data_size) +
+					       sizeof (struct mpa_v2_conn_params));
+		PDBG("%s initiator ird %u ord %u\n", __func__, ep->ird,
+		     ep->ord);
+		mpa_v2_params.ird = htons((u16)ep->ird);
+		mpa_v2_params.ord = htons((u16)ep->ord);
+
+		if (peer2peer) {
+			mpa_v2_params.ird |= htons(MPA_V2_PEER2PEER_MODEL);
+			if (p2p_type == FW_RI_INIT_P2PTYPE_RDMA_WRITE)
+				mpa_v2_params.ord |=
+					htons(MPA_V2_RDMA_WRITE_RTR);
+			else if (p2p_type == FW_RI_INIT_P2PTYPE_READ_REQ)
+				mpa_v2_params.ord |=
+					htons(MPA_V2_RDMA_READ_RTR);
+		}
+		memcpy(mpa->private_data, &mpa_v2_params,
+		       sizeof(struct mpa_v2_conn_params));
+
+		if (ep->plen)
+			memcpy(mpa->private_data +
+			       sizeof(struct mpa_v2_conn_params),
+			       ep->mpa_pkt + sizeof(*mpa), ep->plen);
+	} else
+		if (ep->plen)
+			memcpy(mpa->private_data,
+					ep->mpa_pkt + sizeof(*mpa), ep->plen);
+
+	/*
+	 * Reference the mpa skb.  This ensures the data area
+	 * will remain in memory until the hw acks the tx.
+	 * Function fw4_ack() will deref it.
+	 */
+	skb_get(skb);
+	t4_set_arp_err_handler(skb, NULL, arp_failure_discard);
+	BUG_ON(ep->mpa_skb);
+	ep->mpa_skb = skb;
+	c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t);
+	start_ep_timer(ep);
+	__state_set(&ep->com, MPA_REQ_SENT);
+	ep->mpa_attr.initiator = 1;
+	ep->snd_seq += mpalen;
+	return;
+}
+
+static int send_mpa_reject(struct c4iw_ep *ep, const void *pdata, u8 plen)
+{
+	int mpalen, wrlen;
+	struct fw_ofld_tx_data_wr *req;
+	struct mpa_message *mpa;
+	struct sk_buff *skb;
+	struct mpa_v2_conn_params mpa_v2_params;
+
+	PDBG("%s ep %p tid %u pd_len %d\n", __func__, ep, ep->hwtid, ep->plen);
+
+	mpalen = sizeof(*mpa) + plen;
+	if (ep->mpa_attr.version == 2 && ep->mpa_attr.enhanced_rdma_conn)
+		mpalen += sizeof(struct mpa_v2_conn_params);
+	wrlen = roundup(mpalen + sizeof *req, 16);
+
+	skb = get_skb(NULL, wrlen, GFP_KERNEL);
+	if (!skb) {
+		printk(KERN_ERR MOD "%s - cannot alloc skb!\n", __func__);
+		return -ENOMEM;
+	}
+	set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx);
+
+	req = (struct fw_ofld_tx_data_wr *)skb_put(skb, wrlen);
+	memset(req, 0, wrlen);
+	req->op_to_immdlen = cpu_to_be32(
+		FW_WR_OP(FW_OFLD_TX_DATA_WR) |
+		FW_WR_COMPL(1) |
+		FW_WR_IMMDLEN(mpalen));
+	req->flowid_len16 = cpu_to_be32(
+		FW_WR_FLOWID(ep->hwtid) |
+		FW_WR_LEN16(wrlen >> 4));
+	req->plen = cpu_to_be32(mpalen);
+	req->tunnel_to_proxy = cpu_to_be32(
+		FW_OFLD_TX_DATA_WR_FLUSH(1) |
+		FW_OFLD_TX_DATA_WR_SHOVE(1));
+
+	mpa = (struct mpa_message *)(req + 1);
+	memset(mpa, 0, sizeof(*mpa));
+	memcpy(mpa->key, MPA_KEY_REP, sizeof(mpa->key));
+	mpa->flags = MPA_REJECT;
+	mpa->revision = ep->mpa_attr.version;
+	mpa->private_data_size = htons(plen);
+
+	if (ep->mpa_attr.version == 2 && ep->mpa_attr.enhanced_rdma_conn) {
+		mpa->flags |= MPA_ENHANCED_RDMA_CONN;
+		mpa->private_data_size = htons(ntohs(mpa->private_data_size) +
+					       sizeof (struct mpa_v2_conn_params));
+		mpa_v2_params.ird = htons(((u16)ep->ird) |
+					  (peer2peer ? MPA_V2_PEER2PEER_MODEL :
+					   0));
+		mpa_v2_params.ord = htons(((u16)ep->ord) | (peer2peer ?
+					  (p2p_type ==
+					   FW_RI_INIT_P2PTYPE_RDMA_WRITE ?
+					   MPA_V2_RDMA_WRITE_RTR : p2p_type ==
+					   FW_RI_INIT_P2PTYPE_READ_REQ ?
+					   MPA_V2_RDMA_READ_RTR : 0) : 0));
+		memcpy(mpa->private_data, &mpa_v2_params,
+		       sizeof(struct mpa_v2_conn_params));
+
+		if (ep->plen)
+			memcpy(mpa->private_data +
+			       sizeof(struct mpa_v2_conn_params), pdata, plen);
+	} else
+		if (plen)
+			memcpy(mpa->private_data, pdata, plen);
+
+	/*
+	 * Reference the mpa skb again.  This ensures the data area
+	 * will remain in memory until the hw acks the tx.
+	 * Function fw4_ack() will deref it.
+	 */
+	skb_get(skb);
+	set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx);
+	t4_set_arp_err_handler(skb, NULL, arp_failure_discard);
+	BUG_ON(ep->mpa_skb);
+	ep->mpa_skb = skb;
+	ep->snd_seq += mpalen;
+	return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t);
+}
+
+static int send_mpa_reply(struct c4iw_ep *ep, const void *pdata, u8 plen)
+{
+	int mpalen, wrlen;
+	struct fw_ofld_tx_data_wr *req;
+	struct mpa_message *mpa;
+	struct sk_buff *skb;
+	struct mpa_v2_conn_params mpa_v2_params;
+
+	PDBG("%s ep %p tid %u pd_len %d\n", __func__, ep, ep->hwtid, ep->plen);
+
+	mpalen = sizeof(*mpa) + plen;
+	if (ep->mpa_attr.version == 2 && ep->mpa_attr.enhanced_rdma_conn)
+		mpalen += sizeof(struct mpa_v2_conn_params);
+	wrlen = roundup(mpalen + sizeof *req, 16);
+
+	skb = get_skb(NULL, wrlen, GFP_KERNEL);
+	if (!skb) {
+		printk(KERN_ERR MOD "%s - cannot alloc skb!\n", __func__);
+		return -ENOMEM;
+	}
+	set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx);
+
+	req = (struct fw_ofld_tx_data_wr *) skb_put(skb, wrlen);
+	memset(req, 0, wrlen);
+	req->op_to_immdlen = cpu_to_be32(
+		FW_WR_OP(FW_OFLD_TX_DATA_WR) |
+		FW_WR_COMPL(1) |
+		FW_WR_IMMDLEN(mpalen));
+	req->flowid_len16 = cpu_to_be32(
+		FW_WR_FLOWID(ep->hwtid) |
+		FW_WR_LEN16(wrlen >> 4));
+	req->plen = cpu_to_be32(mpalen);
+	req->tunnel_to_proxy = cpu_to_be32(
+		FW_OFLD_TX_DATA_WR_FLUSH(1) |
+		FW_OFLD_TX_DATA_WR_SHOVE(1));
+
+	mpa = (struct mpa_message *)(req + 1);
+	memset(mpa, 0, sizeof(*mpa));
+	memcpy(mpa->key, MPA_KEY_REP, sizeof(mpa->key));
+	mpa->flags = (ep->mpa_attr.crc_enabled ? MPA_CRC : 0) |
+		     (markers_enabled ? MPA_MARKERS : 0);
+	mpa->revision = ep->mpa_attr.version;
+	mpa->private_data_size = htons(plen);
+
+	if (ep->mpa_attr.version == 2 && ep->mpa_attr.enhanced_rdma_conn) {
+		mpa->flags |= MPA_ENHANCED_RDMA_CONN;
+		mpa->private_data_size = htons(ntohs(mpa->private_data_size) +
+					       sizeof (struct mpa_v2_conn_params));
+		mpa_v2_params.ird = htons((u16)ep->ird);
+		mpa_v2_params.ord = htons((u16)ep->ord);
+		if (peer2peer && (ep->mpa_attr.p2p_type !=
+					FW_RI_INIT_P2PTYPE_DISABLED)) {
+			mpa_v2_params.ird |= htons(MPA_V2_PEER2PEER_MODEL);
+
+			if (p2p_type == FW_RI_INIT_P2PTYPE_RDMA_WRITE)
+				mpa_v2_params.ord |=
+					htons(MPA_V2_RDMA_WRITE_RTR);
+			else if (p2p_type == FW_RI_INIT_P2PTYPE_READ_REQ)
+				mpa_v2_params.ord |=
+					htons(MPA_V2_RDMA_READ_RTR);
+		}
+
+		memcpy(mpa->private_data, &mpa_v2_params,
+		       sizeof(struct mpa_v2_conn_params));
+
+		if (ep->plen)
+			memcpy(mpa->private_data +
+			       sizeof(struct mpa_v2_conn_params), pdata, plen);
+	} else
+		if (plen)
+			memcpy(mpa->private_data, pdata, plen);
+
+	/*
+	 * Reference the mpa skb.  This ensures the data area
+	 * will remain in memory until the hw acks the tx.
+	 * Function fw4_ack() will deref it.
+	 */
+	skb_get(skb);
+	t4_set_arp_err_handler(skb, NULL, arp_failure_discard);
+	ep->mpa_skb = skb;
+	__state_set(&ep->com, MPA_REP_SENT);
+	ep->snd_seq += mpalen;
+	return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t);
+}
+
+static int act_establish(struct c4iw_dev *dev, struct sk_buff *skb)
+{
+	struct c4iw_ep *ep;
+	struct cpl_act_establish *req = cplhdr(skb);
+	unsigned int tid = GET_TID(req);
+	unsigned int atid = GET_TID_TID(ntohl(req->tos_atid));
+	struct tid_info *t = dev->rdev.lldi.tids;
+
+	ep = lookup_atid(t, atid);
+
+	PDBG("%s ep %p tid %u snd_isn %u rcv_isn %u\n", __func__, ep, tid,
+	     be32_to_cpu(req->snd_isn), be32_to_cpu(req->rcv_isn));
+
+	mutex_lock(&ep->com.mutex);
+	dst_confirm(ep->dst);
+
+	/* setup the hwtid for this connection */
+	ep->hwtid = tid;
+	cxgb4_insert_tid(t, ep, tid);
+	insert_handle(dev, &dev->hwtid_idr, ep, ep->hwtid);
+
+	ep->snd_seq = be32_to_cpu(req->snd_isn);
+	ep->rcv_seq = be32_to_cpu(req->rcv_isn);
+
+	set_emss(ep, ntohs(req->tcp_opt));
+
+	/* dealloc the atid */
+	remove_handle(ep->com.dev, &ep->com.dev->atid_idr, atid);
+	cxgb4_free_atid(t, atid);
+	set_bit(ACT_ESTAB, &ep->com.history);
+
+	/* start MPA negotiation */
+	send_flowc(ep, NULL);
+	if (ep->retry_with_mpa_v1)
+		send_mpa_req(ep, skb, 1);
+	else
+		send_mpa_req(ep, skb, mpa_rev);
+	mutex_unlock(&ep->com.mutex);
+	return 0;
+}
+
+static void close_complete_upcall(struct c4iw_ep *ep, int status)
+{
+	struct iw_cm_event event;
+
+	PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
+	memset(&event, 0, sizeof(event));
+	event.event = IW_CM_EVENT_CLOSE;
+	event.status = status;
+	if (ep->com.cm_id) {
+		PDBG("close complete delivered ep %p cm_id %p tid %u\n",
+		     ep, ep->com.cm_id, ep->hwtid);
+		ep->com.cm_id->event_handler(ep->com.cm_id, &event);
+		ep->com.cm_id->rem_ref(ep->com.cm_id);
+		ep->com.cm_id = NULL;
+		set_bit(CLOSE_UPCALL, &ep->com.history);
+	}
+}
+
+static int abort_connection(struct c4iw_ep *ep, struct sk_buff *skb, gfp_t gfp)
+{
+	PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
+	__state_set(&ep->com, ABORTING);
+	set_bit(ABORT_CONN, &ep->com.history);
+	return send_abort(ep, skb, gfp);
+}
+
+static void peer_close_upcall(struct c4iw_ep *ep)
+{
+	struct iw_cm_event event;
+
+	PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
+	memset(&event, 0, sizeof(event));
+	event.event = IW_CM_EVENT_DISCONNECT;
+	if (ep->com.cm_id) {
+		PDBG("peer close delivered ep %p cm_id %p tid %u\n",
+		     ep, ep->com.cm_id, ep->hwtid);
+		ep->com.cm_id->event_handler(ep->com.cm_id, &event);
+		set_bit(DISCONN_UPCALL, &ep->com.history);
+	}
+}
+
+static void peer_abort_upcall(struct c4iw_ep *ep)
+{
+	struct iw_cm_event event;
+
+	PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
+	memset(&event, 0, sizeof(event));
+	event.event = IW_CM_EVENT_CLOSE;
+	event.status = -ECONNRESET;
+	if (ep->com.cm_id) {
+		PDBG("abort delivered ep %p cm_id %p tid %u\n", ep,
+		     ep->com.cm_id, ep->hwtid);
+		ep->com.cm_id->event_handler(ep->com.cm_id, &event);
+		ep->com.cm_id->rem_ref(ep->com.cm_id);
+		ep->com.cm_id = NULL;
+		set_bit(ABORT_UPCALL, &ep->com.history);
+	}
+}
+
+static void connect_reply_upcall(struct c4iw_ep *ep, int status)
+{
+	struct iw_cm_event event;
+
+	PDBG("%s ep %p tid %u status %d\n", __func__, ep, ep->hwtid, status);
+	memset(&event, 0, sizeof(event));
+	event.event = IW_CM_EVENT_CONNECT_REPLY;
+	event.status = status;
+	memcpy(&event.local_addr, &ep->com.local_addr,
+	       sizeof(ep->com.local_addr));
+	memcpy(&event.remote_addr, &ep->com.remote_addr,
+	       sizeof(ep->com.remote_addr));
+
+	if ((status == 0) || (status == -ECONNREFUSED)) {
+		if (!ep->tried_with_mpa_v1) {
+			/* this means MPA_v2 is used */
+			event.private_data_len = ep->plen -
+				sizeof(struct mpa_v2_conn_params);
+			event.private_data = ep->mpa_pkt +
+				sizeof(struct mpa_message) +
+				sizeof(struct mpa_v2_conn_params);
+		} else {
+			/* this means MPA_v1 is used */
+			event.private_data_len = ep->plen;
+			event.private_data = ep->mpa_pkt +
+				sizeof(struct mpa_message);
+		}
+	}
+
+	PDBG("%s ep %p tid %u status %d\n", __func__, ep,
+	     ep->hwtid, status);
+	set_bit(CONN_RPL_UPCALL, &ep->com.history);
+	ep->com.cm_id->event_handler(ep->com.cm_id, &event);
+
+	if (status < 0) {
+		ep->com.cm_id->rem_ref(ep->com.cm_id);
+		ep->com.cm_id = NULL;
+	}
+}
+
+static int connect_request_upcall(struct c4iw_ep *ep)
+{
+	struct iw_cm_event event;
+	int ret;
+
+	PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
+	memset(&event, 0, sizeof(event));
+	event.event = IW_CM_EVENT_CONNECT_REQUEST;
+	memcpy(&event.local_addr, &ep->com.local_addr,
+	       sizeof(ep->com.local_addr));
+	memcpy(&event.remote_addr, &ep->com.remote_addr,
+	       sizeof(ep->com.remote_addr));
+	event.provider_data = ep;
+	if (!ep->tried_with_mpa_v1) {
+		/* this means MPA_v2 is used */
+		event.ord = ep->ord;
+		event.ird = ep->ird;
+		event.private_data_len = ep->plen -
+			sizeof(struct mpa_v2_conn_params);
+		event.private_data = ep->mpa_pkt + sizeof(struct mpa_message) +
+			sizeof(struct mpa_v2_conn_params);
+	} else {
+		/* this means MPA_v1 is used. Send max supported */
+		event.ord = cur_max_read_depth(ep->com.dev);
+		event.ird = cur_max_read_depth(ep->com.dev);
+		event.private_data_len = ep->plen;
+		event.private_data = ep->mpa_pkt + sizeof(struct mpa_message);
+	}
+	c4iw_get_ep(&ep->com);
+	ret = ep->parent_ep->com.cm_id->event_handler(ep->parent_ep->com.cm_id,
+						      &event);
+	if (ret)
+		c4iw_put_ep(&ep->com);
+	set_bit(CONNREQ_UPCALL, &ep->com.history);
+	c4iw_put_ep(&ep->parent_ep->com);
+	return ret;
+}
+
+static void established_upcall(struct c4iw_ep *ep)
+{
+	struct iw_cm_event event;
+
+	PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
+	memset(&event, 0, sizeof(event));
+	event.event = IW_CM_EVENT_ESTABLISHED;
+	event.ird = ep->ird;
+	event.ord = ep->ord;
+	if (ep->com.cm_id) {
+		PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
+		ep->com.cm_id->event_handler(ep->com.cm_id, &event);
+		set_bit(ESTAB_UPCALL, &ep->com.history);
+	}
+}
+
+static int update_rx_credits(struct c4iw_ep *ep, u32 credits)
+{
+	struct cpl_rx_data_ack *req;
+	struct sk_buff *skb;
+	int wrlen = roundup(sizeof *req, 16);
+
+	PDBG("%s ep %p tid %u credits %u\n", __func__, ep, ep->hwtid, credits);
+	skb = get_skb(NULL, wrlen, GFP_KERNEL);
+	if (!skb) {
+		printk(KERN_ERR MOD "update_rx_credits - cannot alloc skb!\n");
+		return 0;
+	}
+
+	/*
+	 * If we couldn't specify the entire rcv window at connection setup
+	 * due to the limit in the number of bits in the RCV_BUFSIZ field,
+	 * then add the overage in to the credits returned.
+	 */
+	if (ep->rcv_win > RCV_BUFSIZ_MASK * 1024)
+		credits += ep->rcv_win - RCV_BUFSIZ_MASK * 1024;
+
+	req = (struct cpl_rx_data_ack *) skb_put(skb, wrlen);
+	memset(req, 0, wrlen);
+	INIT_TP_WR(req, ep->hwtid);
+	OPCODE_TID(req) = cpu_to_be32(MK_OPCODE_TID(CPL_RX_DATA_ACK,
+						    ep->hwtid));
+	req->credit_dack = cpu_to_be32(credits | RX_FORCE_ACK(1) |
+				       F_RX_DACK_CHANGE |
+				       V_RX_DACK_MODE(dack_mode));
+	set_wr_txq(skb, CPL_PRIORITY_ACK, ep->ctrlq_idx);
+	c4iw_ofld_send(&ep->com.dev->rdev, skb);
+	return credits;
+}
+
+#define RELAXED_IRD_NEGOTIATION 1
+
+static int process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb)
+{
+	struct mpa_message *mpa;
+	struct mpa_v2_conn_params *mpa_v2_params;
+	u16 plen;
+	u16 resp_ird, resp_ord;
+	u8 rtr_mismatch = 0, insuff_ird = 0;
+	struct c4iw_qp_attributes attrs;
+	enum c4iw_qp_attr_mask mask;
+	int err;
+	int disconnect = 0;
+
+	PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
+
+	/*
+	 * Stop mpa timer.  If it expired, then
+	 * we ignore the MPA reply.  process_timeout()
+	 * will abort the connection.
+	 */
+	if (stop_ep_timer(ep))
+		return 0;
+
+	/*
+	 * If we get more than the supported amount of private data
+	 * then we must fail this connection.
+	 */
+	if (ep->mpa_pkt_len + skb->len > sizeof(ep->mpa_pkt)) {
+		err = -EINVAL;
+		goto err;
+	}
+
+	/*
+	 * copy the new data into our accumulation buffer.
+	 */
+	skb_copy_from_linear_data(skb, &(ep->mpa_pkt[ep->mpa_pkt_len]),
+				  skb->len);
+	ep->mpa_pkt_len += skb->len;
+
+	/*
+	 * if we don't even have the mpa message, then bail.
+	 */
+	if (ep->mpa_pkt_len < sizeof(*mpa))
+		return 0;
+	mpa = (struct mpa_message *) ep->mpa_pkt;
+
+	/* Validate MPA header. */
+	if (mpa->revision > mpa_rev) {
+		printk(KERN_ERR MOD "%s MPA version mismatch. Local = %d,"
+		       " Received = %d\n", __func__, mpa_rev, mpa->revision);
+		err = -EPROTO;
+		goto err;
+	}
+	if (memcmp(mpa->key, MPA_KEY_REP, sizeof(mpa->key))) {
+		err = -EPROTO;
+		goto err;
+	}
+
+	plen = ntohs(mpa->private_data_size);
+
+	/*
+	 * Fail if there's too much private data.
+	 */
+	if (plen > MPA_MAX_PRIVATE_DATA) {
+		err = -EPROTO;
+		goto err;
+	}
+
+	/*
+	 * If plen does not account for pkt size
+	 */
+	if (ep->mpa_pkt_len > (sizeof(*mpa) + plen)) {
+		err = -EPROTO;
+		goto err;
+	}
+
+	ep->plen = (u8) plen;
+
+	/*
+	 * If we don't have all the pdata yet, then bail.
+	 * We'll continue process when more data arrives.
+	 */
+	if (ep->mpa_pkt_len < (sizeof(*mpa) + plen))
+		return 0;
+
+	if (mpa->flags & MPA_REJECT) {
+		err = -ECONNREFUSED;
+		goto err;
+	}
+
+	/*
+	 * If we get here we have accumulated the entire mpa
+	 * start reply message including private data. And
+	 * the MPA header is valid.
+	 */
+	__state_set(&ep->com, FPDU_MODE);
+	ep->mpa_attr.crc_enabled = (mpa->flags & MPA_CRC) | crc_enabled ? 1 : 0;
+	ep->mpa_attr.recv_marker_enabled = markers_enabled;
+	ep->mpa_attr.xmit_marker_enabled = mpa->flags & MPA_MARKERS ? 1 : 0;
+	ep->mpa_attr.version = mpa->revision;
+	ep->mpa_attr.p2p_type = FW_RI_INIT_P2PTYPE_DISABLED;
+
+	if (mpa->revision == 2) {
+		ep->mpa_attr.enhanced_rdma_conn =
+			mpa->flags & MPA_ENHANCED_RDMA_CONN ? 1 : 0;
+		if (ep->mpa_attr.enhanced_rdma_conn) {
+			mpa_v2_params = (struct mpa_v2_conn_params *)
+				(ep->mpa_pkt + sizeof(*mpa));
+			resp_ird = ntohs(mpa_v2_params->ird) &
+				MPA_V2_IRD_ORD_MASK;
+			resp_ord = ntohs(mpa_v2_params->ord) &
+				MPA_V2_IRD_ORD_MASK;
+			PDBG("%s responder ird %u ord %u ep ird %u ord %u\n",
+			     __func__, resp_ird, resp_ord, ep->ird, ep->ord);
+
+			/*
+			 * This is a double-check. Ideally, below checks are
+			 * not required since ird/ord stuff has been taken
+			 * care of in c4iw_accept_cr
+			 */
+			if (ep->ird < resp_ord) {
+				if (RELAXED_IRD_NEGOTIATION && resp_ord <=
+				    ep->com.dev->rdev.lldi.max_ordird_qp)
+					ep->ird = resp_ord;
+				else
+					insuff_ird = 1;
+			} else if (ep->ird > resp_ord) {
+				ep->ird = resp_ord;
+			}
+			if (ep->ord > resp_ird) {
+				if (RELAXED_IRD_NEGOTIATION)
+					ep->ord = resp_ird;
+				else
+					insuff_ird = 1;
+			}
+			if (insuff_ird) {
+				err = -ENOMEM;
+				ep->ird = resp_ord;
+				ep->ord = resp_ird;
+			}
+
+			if (ntohs(mpa_v2_params->ird) &
+					MPA_V2_PEER2PEER_MODEL) {
+				if (ntohs(mpa_v2_params->ord) &
+						MPA_V2_RDMA_WRITE_RTR)
+					ep->mpa_attr.p2p_type =
+						FW_RI_INIT_P2PTYPE_RDMA_WRITE;
+				else if (ntohs(mpa_v2_params->ord) &
+						MPA_V2_RDMA_READ_RTR)
+					ep->mpa_attr.p2p_type =
+						FW_RI_INIT_P2PTYPE_READ_REQ;
+			}
+		}
+	} else if (mpa->revision == 1)
+		if (peer2peer)
+			ep->mpa_attr.p2p_type = p2p_type;
+
+	PDBG("%s - crc_enabled=%d, recv_marker_enabled=%d, "
+	     "xmit_marker_enabled=%d, version=%d p2p_type=%d local-p2p_type = "
+	     "%d\n", __func__, ep->mpa_attr.crc_enabled,
+	     ep->mpa_attr.recv_marker_enabled,
+	     ep->mpa_attr.xmit_marker_enabled, ep->mpa_attr.version,
+	     ep->mpa_attr.p2p_type, p2p_type);
+
+	/*
+	 * If responder's RTR does not match with that of initiator, assign
+	 * FW_RI_INIT_P2PTYPE_DISABLED in mpa attributes so that RTR is not
+	 * generated when moving QP to RTS state.
+	 * A TERM message will be sent after QP has moved to RTS state
+	 */
+	if ((ep->mpa_attr.version == 2) && peer2peer &&
+			(ep->mpa_attr.p2p_type != p2p_type)) {
+		ep->mpa_attr.p2p_type = FW_RI_INIT_P2PTYPE_DISABLED;
+		rtr_mismatch = 1;
+	}
+
+	attrs.mpa_attr = ep->mpa_attr;
+	attrs.max_ird = ep->ird;
+	attrs.max_ord = ep->ord;
+	attrs.llp_stream_handle = ep;
+	attrs.next_state = C4IW_QP_STATE_RTS;
+
+	mask = C4IW_QP_ATTR_NEXT_STATE |
+	    C4IW_QP_ATTR_LLP_STREAM_HANDLE | C4IW_QP_ATTR_MPA_ATTR |
+	    C4IW_QP_ATTR_MAX_IRD | C4IW_QP_ATTR_MAX_ORD;
+
+	/* bind QP and TID with INIT_WR */
+	err = c4iw_modify_qp(ep->com.qp->rhp,
+			     ep->com.qp, mask, &attrs, 1);
+	if (err)
+		goto err;
+
+	/*
+	 * If responder's RTR requirement did not match with what initiator
+	 * supports, generate TERM message
+	 */
+	if (rtr_mismatch) {
+		printk(KERN_ERR "%s: RTR mismatch, sending TERM\n", __func__);
+		attrs.layer_etype = LAYER_MPA | DDP_LLP;
+		attrs.ecode = MPA_NOMATCH_RTR;
+		attrs.next_state = C4IW_QP_STATE_TERMINATE;
+		attrs.send_term = 1;
+		err = c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp,
+				C4IW_QP_ATTR_NEXT_STATE, &attrs, 1);
+		err = -ENOMEM;
+		disconnect = 1;
+		goto out;
+	}
+
+	/*
+	 * Generate TERM if initiator IRD is not sufficient for responder
+	 * provided ORD. Currently, we do the same behaviour even when
+	 * responder provided IRD is also not sufficient as regards to
+	 * initiator ORD.
+	 */
+	if (insuff_ird) {
+		printk(KERN_ERR "%s: Insufficient IRD, sending TERM\n",
+				__func__);
+		attrs.layer_etype = LAYER_MPA | DDP_LLP;
+		attrs.ecode = MPA_INSUFF_IRD;
+		attrs.next_state = C4IW_QP_STATE_TERMINATE;
+		attrs.send_term = 1;
+		err = c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp,
+				C4IW_QP_ATTR_NEXT_STATE, &attrs, 1);
+		err = -ENOMEM;
+		disconnect = 1;
+		goto out;
+	}
+	goto out;
+err:
+	__state_set(&ep->com, ABORTING);
+	send_abort(ep, skb, GFP_KERNEL);
+out:
+	connect_reply_upcall(ep, err);
+	return disconnect;
+}
+
+static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb)
+{
+	struct mpa_message *mpa;
+	struct mpa_v2_conn_params *mpa_v2_params;
+	u16 plen;
+
+	PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
+
+	/*
+	 * If we get more than the supported amount of private data
+	 * then we must fail this connection.
+	 */
+	if (ep->mpa_pkt_len + skb->len > sizeof(ep->mpa_pkt)) {
+		(void)stop_ep_timer(ep);
+		abort_connection(ep, skb, GFP_KERNEL);
+		return;
+	}
+
+	PDBG("%s enter (%s line %u)\n", __func__, __FILE__, __LINE__);
+
+	/*
+	 * Copy the new data into our accumulation buffer.
+	 */
+	skb_copy_from_linear_data(skb, &(ep->mpa_pkt[ep->mpa_pkt_len]),
+				  skb->len);
+	ep->mpa_pkt_len += skb->len;
+
+	/*
+	 * If we don't even have the mpa message, then bail.
+	 * We'll continue process when more data arrives.
+	 */
+	if (ep->mpa_pkt_len < sizeof(*mpa))
+		return;
+
+	PDBG("%s enter (%s line %u)\n", __func__, __FILE__, __LINE__);
+	mpa = (struct mpa_message *) ep->mpa_pkt;
+
+	/*
+	 * Validate MPA Header.
+	 */
+	if (mpa->revision > mpa_rev) {
+		printk(KERN_ERR MOD "%s MPA version mismatch. Local = %d,"
+		       " Received = %d\n", __func__, mpa_rev, mpa->revision);
+		(void)stop_ep_timer(ep);
+		abort_connection(ep, skb, GFP_KERNEL);
+		return;
+	}
+
+	if (memcmp(mpa->key, MPA_KEY_REQ, sizeof(mpa->key))) {
+		(void)stop_ep_timer(ep);
+		abort_connection(ep, skb, GFP_KERNEL);
+		return;
+	}
+
+	plen = ntohs(mpa->private_data_size);
+
+	/*
+	 * Fail if there's too much private data.
+	 */
+	if (plen > MPA_MAX_PRIVATE_DATA) {
+		(void)stop_ep_timer(ep);
+		abort_connection(ep, skb, GFP_KERNEL);
+		return;
+	}
+
+	/*
+	 * If plen does not account for pkt size
+	 */
+	if (ep->mpa_pkt_len > (sizeof(*mpa) + plen)) {
+		(void)stop_ep_timer(ep);
+		abort_connection(ep, skb, GFP_KERNEL);
+		return;
+	}
+	ep->plen = (u8) plen;
+
+	/*
+	 * If we don't have all the pdata yet, then bail.
+	 */
+	if (ep->mpa_pkt_len < (sizeof(*mpa) + plen))
+		return;
+
+	/*
+	 * If we get here we have accumulated the entire mpa
+	 * start reply message including private data.
+	 */
+	ep->mpa_attr.initiator = 0;
+	ep->mpa_attr.crc_enabled = (mpa->flags & MPA_CRC) | crc_enabled ? 1 : 0;
+	ep->mpa_attr.recv_marker_enabled = markers_enabled;
+	ep->mpa_attr.xmit_marker_enabled = mpa->flags & MPA_MARKERS ? 1 : 0;
+	ep->mpa_attr.version = mpa->revision;
+	if (mpa->revision == 1)
+		ep->tried_with_mpa_v1 = 1;
+	ep->mpa_attr.p2p_type = FW_RI_INIT_P2PTYPE_DISABLED;
+
+	if (mpa->revision == 2) {
+		ep->mpa_attr.enhanced_rdma_conn =
+			mpa->flags & MPA_ENHANCED_RDMA_CONN ? 1 : 0;
+		if (ep->mpa_attr.enhanced_rdma_conn) {
+			mpa_v2_params = (struct mpa_v2_conn_params *)
+				(ep->mpa_pkt + sizeof(*mpa));
+			ep->ird = ntohs(mpa_v2_params->ird) &
+				MPA_V2_IRD_ORD_MASK;
+			ep->ord = ntohs(mpa_v2_params->ord) &
+				MPA_V2_IRD_ORD_MASK;
+			PDBG("%s initiator ird %u ord %u\n", __func__, ep->ird,
+			     ep->ord);
+			if (ntohs(mpa_v2_params->ird) & MPA_V2_PEER2PEER_MODEL)
+				if (peer2peer) {
+					if (ntohs(mpa_v2_params->ord) &
+							MPA_V2_RDMA_WRITE_RTR)
+						ep->mpa_attr.p2p_type =
+						FW_RI_INIT_P2PTYPE_RDMA_WRITE;
+					else if (ntohs(mpa_v2_params->ord) &
+							MPA_V2_RDMA_READ_RTR)
+						ep->mpa_attr.p2p_type =
+						FW_RI_INIT_P2PTYPE_READ_REQ;
+				}
+		}
+	} else if (mpa->revision == 1)
+		if (peer2peer)
+			ep->mpa_attr.p2p_type = p2p_type;
+
+	PDBG("%s - crc_enabled=%d, recv_marker_enabled=%d, "
+	     "xmit_marker_enabled=%d, version=%d p2p_type=%d\n", __func__,
+	     ep->mpa_attr.crc_enabled, ep->mpa_attr.recv_marker_enabled,
+	     ep->mpa_attr.xmit_marker_enabled, ep->mpa_attr.version,
+	     ep->mpa_attr.p2p_type);
+
+	/*
+	 * If the endpoint timer already expired, then we ignore
+	 * the start request.  process_timeout() will abort
+	 * the connection.
+	 */
+	if (!stop_ep_timer(ep)) {
+		__state_set(&ep->com, MPA_REQ_RCVD);
+
+		/* drive upcall */
+		mutex_lock(&ep->parent_ep->com.mutex);
+		if (ep->parent_ep->com.state != DEAD) {
+			if (connect_request_upcall(ep))
+				abort_connection(ep, skb, GFP_KERNEL);
+		} else {
+			abort_connection(ep, skb, GFP_KERNEL);
+		}
+		mutex_unlock(&ep->parent_ep->com.mutex);
+	}
+	return;
+}
+
+static int rx_data(struct c4iw_dev *dev, struct sk_buff *skb)
+{
+	struct c4iw_ep *ep;
+	struct cpl_rx_data *hdr = cplhdr(skb);
+	unsigned int dlen = ntohs(hdr->len);
+	unsigned int tid = GET_TID(hdr);
+	struct tid_info *t = dev->rdev.lldi.tids;
+	__u8 status = hdr->status;
+	int disconnect = 0;
+
+	ep = lookup_tid(t, tid);
+	if (!ep)
+		return 0;
+	PDBG("%s ep %p tid %u dlen %u\n", __func__, ep, ep->hwtid, dlen);
+	skb_pull(skb, sizeof(*hdr));
+	skb_trim(skb, dlen);
+	mutex_lock(&ep->com.mutex);
+
+	/* update RX credits */
+	update_rx_credits(ep, dlen);
+
+	switch (ep->com.state) {
+	case MPA_REQ_SENT:
+		ep->rcv_seq += dlen;
+		disconnect = process_mpa_reply(ep, skb);
+		break;
+	case MPA_REQ_WAIT:
+		ep->rcv_seq += dlen;
+		process_mpa_request(ep, skb);
+		break;
+	case FPDU_MODE: {
+		struct c4iw_qp_attributes attrs;
+		BUG_ON(!ep->com.qp);
+		if (status)
+			pr_err("%s Unexpected streaming data." \
+			       " qpid %u ep %p state %d tid %u status %d\n",
+			       __func__, ep->com.qp->wq.sq.qid, ep,
+			       ep->com.state, ep->hwtid, status);
+		attrs.next_state = C4IW_QP_STATE_TERMINATE;
+		c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp,
+			       C4IW_QP_ATTR_NEXT_STATE, &attrs, 1);
+		disconnect = 1;
+		break;
+	}
+	default:
+		break;
+	}
+	mutex_unlock(&ep->com.mutex);
+	if (disconnect)
+		c4iw_ep_disconnect(ep, 0, GFP_KERNEL);
+	return 0;
+}
+
+static int abort_rpl(struct c4iw_dev *dev, struct sk_buff *skb)
+{
+	struct c4iw_ep *ep;
+	struct cpl_abort_rpl_rss *rpl = cplhdr(skb);
+	int release = 0;
+	unsigned int tid = GET_TID(rpl);
+	struct tid_info *t = dev->rdev.lldi.tids;
+
+	ep = lookup_tid(t, tid);
+	if (!ep) {
+		printk(KERN_WARNING MOD "Abort rpl to freed endpoint\n");
+		return 0;
+	}
+	PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
+	mutex_lock(&ep->com.mutex);
+	switch (ep->com.state) {
+	case ABORTING:
+		c4iw_wake_up(&ep->com.wr_wait, -ECONNRESET);
+		__state_set(&ep->com, DEAD);
+		release = 1;
+		break;
+	default:
+		printk(KERN_ERR "%s ep %p state %d\n",
+		     __func__, ep, ep->com.state);
+		break;
+	}
+	mutex_unlock(&ep->com.mutex);
+
+	if (release)
+		release_ep_resources(ep);
+	return 0;
+}
+
+static void send_fw_act_open_req(struct c4iw_ep *ep, unsigned int atid)
+{
+	struct sk_buff *skb;
+	struct fw_ofld_connection_wr *req;
+	unsigned int mtu_idx;
+	int wscale;
+	struct sockaddr_in *sin;
+	int win;
+
+	skb = get_skb(NULL, sizeof(*req), GFP_KERNEL);
+	req = (struct fw_ofld_connection_wr *)__skb_put(skb, sizeof(*req));
+	memset(req, 0, sizeof(*req));
+	req->op_compl = htonl(V_WR_OP(FW_OFLD_CONNECTION_WR));
+	req->len16_pkd = htonl(FW_WR_LEN16(DIV_ROUND_UP(sizeof(*req), 16)));
+	req->le.filter = cpu_to_be32(cxgb4_select_ntuple(
+				     ep->com.dev->rdev.lldi.ports[0],
+				     ep->l2t));
+	sin = (struct sockaddr_in *)&ep->com.mapped_local_addr;
+	req->le.lport = sin->sin_port;
+	req->le.u.ipv4.lip = sin->sin_addr.s_addr;
+	sin = (struct sockaddr_in *)&ep->com.mapped_remote_addr;
+	req->le.pport = sin->sin_port;
+	req->le.u.ipv4.pip = sin->sin_addr.s_addr;
+	req->tcb.t_state_to_astid =
+			htonl(V_FW_OFLD_CONNECTION_WR_T_STATE(TCP_SYN_SENT) |
+			V_FW_OFLD_CONNECTION_WR_ASTID(atid));
+	req->tcb.cplrxdataack_cplpassacceptrpl =
+			htons(F_FW_OFLD_CONNECTION_WR_CPLRXDATAACK);
+	req->tcb.tx_max = (__force __be32) jiffies;
+	req->tcb.rcv_adv = htons(1);
+	best_mtu(ep->com.dev->rdev.lldi.mtus, ep->mtu, &mtu_idx,
+		 enable_tcp_timestamps,
+		 (AF_INET == ep->com.remote_addr.ss_family) ? 0 : 1);
+	wscale = compute_wscale(rcv_win);
+
+	/*
+	 * Specify the largest window that will fit in opt0. The
+	 * remainder will be specified in the rx_data_ack.
+	 */
+	win = ep->rcv_win >> 10;
+	if (win > RCV_BUFSIZ_MASK)
+		win = RCV_BUFSIZ_MASK;
+
+	req->tcb.opt0 = (__force __be64) (TCAM_BYPASS(1) |
+		(nocong ? NO_CONG(1) : 0) |
+		KEEP_ALIVE(1) |
+		DELACK(1) |
+		WND_SCALE(wscale) |
+		MSS_IDX(mtu_idx) |
+		L2T_IDX(ep->l2t->idx) |
+		TX_CHAN(ep->tx_chan) |
+		SMAC_SEL(ep->smac_idx) |
+		DSCP(ep->tos) |
+		ULP_MODE(ULP_MODE_TCPDDP) |
+		RCV_BUFSIZ(win));
+	req->tcb.opt2 = (__force __be32) (PACE(1) |
+		TX_QUEUE(ep->com.dev->rdev.lldi.tx_modq[ep->tx_chan]) |
+		RX_CHANNEL(0) |
+		CCTRL_ECN(enable_ecn) |
+		RSS_QUEUE_VALID | RSS_QUEUE(ep->rss_qid));
+	if (enable_tcp_timestamps)
+		req->tcb.opt2 |= (__force __be32) TSTAMPS_EN(1);
+	if (enable_tcp_sack)
+		req->tcb.opt2 |= (__force __be32) SACK_EN(1);
+	if (wscale && enable_tcp_window_scaling)
+		req->tcb.opt2 |= (__force __be32) WND_SCALE_EN(1);
+	req->tcb.opt0 = cpu_to_be64((__force u64) req->tcb.opt0);
+	req->tcb.opt2 = cpu_to_be32((__force u32) req->tcb.opt2);
+	set_wr_txq(skb, CPL_PRIORITY_CONTROL, ep->ctrlq_idx);
+	set_bit(ACT_OFLD_CONN, &ep->com.history);
+	c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t);
+}
+
+/*
+ * Return whether a failed active open has allocated a TID
+ */
+static inline int act_open_has_tid(int status)
+{
+	return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST &&
+	       status != CPL_ERR_ARP_MISS;
+}
+
+/* Returns whether a CPL status conveys negative advice.
+ */
+static int is_neg_adv(unsigned int status)
+{
+	return status == CPL_ERR_RTX_NEG_ADVICE ||
+	       status == CPL_ERR_PERSIST_NEG_ADVICE ||
+	       status == CPL_ERR_KEEPALV_NEG_ADVICE;
+}
+
+static char *neg_adv_str(unsigned int status)
+{
+	switch (status) {
+	case CPL_ERR_RTX_NEG_ADVICE:
+		return "Retransmit timeout";
+	case CPL_ERR_PERSIST_NEG_ADVICE:
+		return "Persist timeout";
+	case CPL_ERR_KEEPALV_NEG_ADVICE:
+		return "Keepalive timeout";
+	default:
+		return "Unknown";
+	}
+}
+
+static void set_tcp_window(struct c4iw_ep *ep, struct port_info *pi)
+{
+	ep->snd_win = snd_win;
+	ep->rcv_win = rcv_win;
+	PDBG("%s snd_win %d rcv_win %d\n", __func__, ep->snd_win, ep->rcv_win);
+}
+
+#define ACT_OPEN_RETRY_COUNT 2
+
+static int import_ep(struct c4iw_ep *ep, int iptype, __u8 *peer_ip,
+		     struct dst_entry *dst, struct c4iw_dev *cdev,
+		     bool clear_mpa_v1)
+{
+	struct neighbour *n;
+	int err, step;
+	struct net_device *pdev;
+
+	n = dst_neigh_lookup(dst, peer_ip);
+	if (!n)
+		return -ENODEV;
+
+	rcu_read_lock();
+	err = -ENOMEM;
+	if (n->dev->flags & IFF_LOOPBACK) {
+		if (iptype == 4)
+			pdev = ip_dev_find(&init_net, *(__be32 *)peer_ip);
+		else if (IS_ENABLED(CONFIG_IPV6))
+			for_each_netdev(&init_net, pdev) {
+				if (ipv6_chk_addr(&init_net,
+						  (struct in6_addr *)peer_ip,
+						  pdev, 1))
+					break;
+			}
+		else
+			pdev = NULL;
+
+		if (!pdev) {
+			err = -ENODEV;
+			goto out;
+		}
+		ep->l2t = cxgb4_l2t_get(cdev->rdev.lldi.l2t,
+					n, pdev, 0);
+		if (!ep->l2t)
+			goto out;
+		ep->mtu = pdev->mtu;
+		ep->tx_chan = cxgb4_port_chan(pdev);
+		ep->smac_idx = (cxgb4_port_viid(pdev) & 0x7F) << 1;
+		step = cdev->rdev.lldi.ntxq /
+			cdev->rdev.lldi.nchan;
+		ep->txq_idx = cxgb4_port_idx(pdev) * step;
+		step = cdev->rdev.lldi.nrxq /
+			cdev->rdev.lldi.nchan;
+		ep->ctrlq_idx = cxgb4_port_idx(pdev);
+		ep->rss_qid = cdev->rdev.lldi.rxq_ids[
+			cxgb4_port_idx(pdev) * step];
+		set_tcp_window(ep, (struct port_info *)netdev_priv(pdev));
+		dev_put(pdev);
+	} else {
+		pdev = get_real_dev(n->dev);
+		ep->l2t = cxgb4_l2t_get(cdev->rdev.lldi.l2t,
+					n, pdev, 0);
+		if (!ep->l2t)
+			goto out;
+		ep->mtu = dst_mtu(dst);
+		ep->tx_chan = cxgb4_port_chan(pdev);
+		ep->smac_idx = (cxgb4_port_viid(pdev) & 0x7F) << 1;
+		step = cdev->rdev.lldi.ntxq /
+			cdev->rdev.lldi.nchan;
+		ep->txq_idx = cxgb4_port_idx(pdev) * step;
+		ep->ctrlq_idx = cxgb4_port_idx(pdev);
+		step = cdev->rdev.lldi.nrxq /
+			cdev->rdev.lldi.nchan;
+		ep->rss_qid = cdev->rdev.lldi.rxq_ids[
+			cxgb4_port_idx(pdev) * step];
+		set_tcp_window(ep, (struct port_info *)netdev_priv(pdev));
+
+		if (clear_mpa_v1) {
+			ep->retry_with_mpa_v1 = 0;
+			ep->tried_with_mpa_v1 = 0;
+		}
+	}
+	err = 0;
+out:
+	rcu_read_unlock();
+
+	neigh_release(n);
+
+	return err;
+}
+
+static int c4iw_reconnect(struct c4iw_ep *ep)
+{
+	int err = 0;
+	struct sockaddr_in *laddr = (struct sockaddr_in *)
+				    &ep->com.cm_id->local_addr;
+	struct sockaddr_in *raddr = (struct sockaddr_in *)
+				    &ep->com.cm_id->remote_addr;
+	struct sockaddr_in6 *laddr6 = (struct sockaddr_in6 *)
+				      &ep->com.cm_id->local_addr;
+	struct sockaddr_in6 *raddr6 = (struct sockaddr_in6 *)
+				      &ep->com.cm_id->remote_addr;
+	int iptype;
+	__u8 *ra;
+
+	PDBG("%s qp %p cm_id %p\n", __func__, ep->com.qp, ep->com.cm_id);
+	init_timer(&ep->timer);
+
+	/*
+	 * Allocate an active TID to initiate a TCP connection.
+	 */
+	ep->atid = cxgb4_alloc_atid(ep->com.dev->rdev.lldi.tids, ep);
+	if (ep->atid == -1) {
+		pr_err("%s - cannot alloc atid.\n", __func__);
+		err = -ENOMEM;
+		goto fail2;
+	}
+	insert_handle(ep->com.dev, &ep->com.dev->atid_idr, ep, ep->atid);
+
+	/* find a route */
+	if (ep->com.cm_id->local_addr.ss_family == AF_INET) {
+		ep->dst = find_route(ep->com.dev, laddr->sin_addr.s_addr,
+				     raddr->sin_addr.s_addr, laddr->sin_port,
+				     raddr->sin_port, 0);
+		iptype = 4;
+		ra = (__u8 *)&raddr->sin_addr;
+	} else {
+		ep->dst = find_route6(ep->com.dev, laddr6->sin6_addr.s6_addr,
+				      raddr6->sin6_addr.s6_addr,
+				      laddr6->sin6_port, raddr6->sin6_port, 0,
+				      raddr6->sin6_scope_id);
+		iptype = 6;
+		ra = (__u8 *)&raddr6->sin6_addr;
+	}
+	if (!ep->dst) {
+		pr_err("%s - cannot find route.\n", __func__);
+		err = -EHOSTUNREACH;
+		goto fail3;
+	}
+	err = import_ep(ep, iptype, ra, ep->dst, ep->com.dev, false);
+	if (err) {
+		pr_err("%s - cannot alloc l2e.\n", __func__);
+		goto fail4;
+	}
+
+	PDBG("%s txq_idx %u tx_chan %u smac_idx %u rss_qid %u l2t_idx %u\n",
+	     __func__, ep->txq_idx, ep->tx_chan, ep->smac_idx, ep->rss_qid,
+	     ep->l2t->idx);
+
+	state_set(&ep->com, CONNECTING);
+	ep->tos = 0;
+
+	/* send connect request to rnic */
+	err = send_connect(ep);
+	if (!err)
+		goto out;
+
+	cxgb4_l2t_release(ep->l2t);
+fail4:
+	dst_release(ep->dst);
+fail3:
+	remove_handle(ep->com.dev, &ep->com.dev->atid_idr, ep->atid);
+	cxgb4_free_atid(ep->com.dev->rdev.lldi.tids, ep->atid);
+fail2:
+	/*
+	 * remember to send notification to upper layer.
+	 * We are in here so the upper layer is not aware that this is
+	 * re-connect attempt and so, upper layer is still waiting for
+	 * response of 1st connect request.
+	 */
+	connect_reply_upcall(ep, -ECONNRESET);
+	c4iw_put_ep(&ep->com);
+out:
+	return err;
+}
+
+static int act_open_rpl(struct c4iw_dev *dev, struct sk_buff *skb)
+{
+	struct c4iw_ep *ep;
+	struct cpl_act_open_rpl *rpl = cplhdr(skb);
+	unsigned int atid = GET_TID_TID(GET_AOPEN_ATID(
+					ntohl(rpl->atid_status)));
+	struct tid_info *t = dev->rdev.lldi.tids;
+	int status = GET_AOPEN_STATUS(ntohl(rpl->atid_status));
+	struct sockaddr_in *la;
+	struct sockaddr_in *ra;
+	struct sockaddr_in6 *la6;
+	struct sockaddr_in6 *ra6;
+
+	ep = lookup_atid(t, atid);
+	la = (struct sockaddr_in *)&ep->com.mapped_local_addr;
+	ra = (struct sockaddr_in *)&ep->com.mapped_remote_addr;
+	la6 = (struct sockaddr_in6 *)&ep->com.mapped_local_addr;
+	ra6 = (struct sockaddr_in6 *)&ep->com.mapped_remote_addr;
+
+	PDBG("%s ep %p atid %u status %u errno %d\n", __func__, ep, atid,
+	     status, status2errno(status));
+
+	if (is_neg_adv(status)) {
+		dev_warn(&dev->rdev.lldi.pdev->dev,
+			 "Connection problems for atid %u status %u (%s)\n",
+			 atid, status, neg_adv_str(status));
+		return 0;
+	}
+
+	set_bit(ACT_OPEN_RPL, &ep->com.history);
+
+	/*
+	 * Log interesting failures.
+	 */
+	switch (status) {
+	case CPL_ERR_CONN_RESET:
+	case CPL_ERR_CONN_TIMEDOUT:
+		break;
+	case CPL_ERR_TCAM_FULL:
+		mutex_lock(&dev->rdev.stats.lock);
+		dev->rdev.stats.tcam_full++;
+		mutex_unlock(&dev->rdev.stats.lock);
+		if (ep->com.local_addr.ss_family == AF_INET &&
+		    dev->rdev.lldi.enable_fw_ofld_conn) {
+			send_fw_act_open_req(ep,
+					     GET_TID_TID(GET_AOPEN_ATID(
+					     ntohl(rpl->atid_status))));
+			return 0;
+		}
+		break;
+	case CPL_ERR_CONN_EXIST:
+		if (ep->retry_count++ < ACT_OPEN_RETRY_COUNT) {
+			set_bit(ACT_RETRY_INUSE, &ep->com.history);
+			remove_handle(ep->com.dev, &ep->com.dev->atid_idr,
+					atid);
+			cxgb4_free_atid(t, atid);
+			dst_release(ep->dst);
+			cxgb4_l2t_release(ep->l2t);
+			c4iw_reconnect(ep);
+			return 0;
+		}
+		break;
+	default:
+		if (ep->com.local_addr.ss_family == AF_INET) {
+			pr_info("Active open failure - atid %u status %u errno %d %pI4:%u->%pI4:%u\n",
+				atid, status, status2errno(status),
+				&la->sin_addr.s_addr, ntohs(la->sin_port),
+				&ra->sin_addr.s_addr, ntohs(ra->sin_port));
+		} else {
+			pr_info("Active open failure - atid %u status %u errno %d %pI6:%u->%pI6:%u\n",
+				atid, status, status2errno(status),
+				la6->sin6_addr.s6_addr, ntohs(la6->sin6_port),
+				ra6->sin6_addr.s6_addr, ntohs(ra6->sin6_port));
+		}
+		break;
+	}
+
+	connect_reply_upcall(ep, status2errno(status));
+	state_set(&ep->com, DEAD);
+
+	if (status && act_open_has_tid(status))
+		cxgb4_remove_tid(ep->com.dev->rdev.lldi.tids, 0, GET_TID(rpl));
+
+	remove_handle(ep->com.dev, &ep->com.dev->atid_idr, atid);
+	cxgb4_free_atid(t, atid);
+	dst_release(ep->dst);
+	cxgb4_l2t_release(ep->l2t);
+	c4iw_put_ep(&ep->com);
+
+	return 0;
+}
+
+static int pass_open_rpl(struct c4iw_dev *dev, struct sk_buff *skb)
+{
+	struct cpl_pass_open_rpl *rpl = cplhdr(skb);
+	struct tid_info *t = dev->rdev.lldi.tids;
+	unsigned int stid = GET_TID(rpl);
+	struct c4iw_listen_ep *ep = lookup_stid(t, stid);
+
+	if (!ep) {
+		PDBG("%s stid %d lookup failure!\n", __func__, stid);
+		goto out;
+	}
+	PDBG("%s ep %p status %d error %d\n", __func__, ep,
+	     rpl->status, status2errno(rpl->status));
+	c4iw_wake_up(&ep->com.wr_wait, status2errno(rpl->status));
+
+out:
+	return 0;
+}
+
+static int close_listsrv_rpl(struct c4iw_dev *dev, struct sk_buff *skb)
+{
+	struct cpl_close_listsvr_rpl *rpl = cplhdr(skb);
+	struct tid_info *t = dev->rdev.lldi.tids;
+	unsigned int stid = GET_TID(rpl);
+	struct c4iw_listen_ep *ep = lookup_stid(t, stid);
+
+	PDBG("%s ep %p\n", __func__, ep);
+	c4iw_wake_up(&ep->com.wr_wait, status2errno(rpl->status));
+	return 0;
+}
+
+static void accept_cr(struct c4iw_ep *ep, struct sk_buff *skb,
+		      struct cpl_pass_accept_req *req)
+{
+	struct cpl_pass_accept_rpl *rpl;
+	unsigned int mtu_idx;
+	u64 opt0;
+	u32 opt2;
+	int wscale;
+	struct cpl_t5_pass_accept_rpl *rpl5 = NULL;
+	int win;
+
+	PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
+	BUG_ON(skb_cloned(skb));
+
+	skb_get(skb);
+	rpl = cplhdr(skb);
+	if (is_t5(ep->com.dev->rdev.lldi.adapter_type)) {
+		skb_trim(skb, roundup(sizeof(*rpl5), 16));
+		rpl5 = (void *)rpl;
+		INIT_TP_WR(rpl5, ep->hwtid);
+	} else {
+		skb_trim(skb, sizeof(*rpl));
+		INIT_TP_WR(rpl, ep->hwtid);
+	}
+	OPCODE_TID(rpl) = cpu_to_be32(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL,
+						    ep->hwtid));
+
+	best_mtu(ep->com.dev->rdev.lldi.mtus, ep->mtu, &mtu_idx,
+		 enable_tcp_timestamps && req->tcpopt.tstamp,
+		 (AF_INET == ep->com.remote_addr.ss_family) ? 0 : 1);
+	wscale = compute_wscale(rcv_win);
+
+	/*
+	 * Specify the largest window that will fit in opt0. The
+	 * remainder will be specified in the rx_data_ack.
+	 */
+	win = ep->rcv_win >> 10;
+	if (win > RCV_BUFSIZ_MASK)
+		win = RCV_BUFSIZ_MASK;
+	opt0 = (nocong ? NO_CONG(1) : 0) |
+	       KEEP_ALIVE(1) |
+	       DELACK(1) |
+	       WND_SCALE(wscale) |
+	       MSS_IDX(mtu_idx) |
+	       L2T_IDX(ep->l2t->idx) |
+	       TX_CHAN(ep->tx_chan) |
+	       SMAC_SEL(ep->smac_idx) |
+	       DSCP(ep->tos >> 2) |
+	       ULP_MODE(ULP_MODE_TCPDDP) |
+	       RCV_BUFSIZ(win);
+	opt2 = RX_CHANNEL(0) |
+	       RSS_QUEUE_VALID | RSS_QUEUE(ep->rss_qid);
+
+	if (enable_tcp_timestamps && req->tcpopt.tstamp)
+		opt2 |= TSTAMPS_EN(1);
+	if (enable_tcp_sack && req->tcpopt.sack)
+		opt2 |= SACK_EN(1);
+	if (wscale && enable_tcp_window_scaling)
+		opt2 |= WND_SCALE_EN(1);
+	if (enable_ecn) {
+		const struct tcphdr *tcph;
+		u32 hlen = ntohl(req->hdr_len);
+
+		tcph = (const void *)(req + 1) + G_ETH_HDR_LEN(hlen) +
+			G_IP_HDR_LEN(hlen);
+		if (tcph->ece && tcph->cwr)
+			opt2 |= CCTRL_ECN(1);
+	}
+	if (is_t5(ep->com.dev->rdev.lldi.adapter_type)) {
+		u32 isn = (prandom_u32() & ~7UL) - 1;
+		opt2 |= T5_OPT_2_VALID;
+		opt2 |= V_CONG_CNTRL(CONG_ALG_TAHOE);
+		opt2 |= CONG_CNTRL_VALID; /* OPT_2_ISS for T5 */
+		rpl5 = (void *)rpl;
+		memset(&rpl5->iss, 0, roundup(sizeof(*rpl5)-sizeof(*rpl), 16));
+		if (peer2peer)
+			isn += 4;
+		rpl5->iss = cpu_to_be32(isn);
+		PDBG("%s iss %u\n", __func__, be32_to_cpu(rpl5->iss));
+	}
+
+	rpl->opt0 = cpu_to_be64(opt0);
+	rpl->opt2 = cpu_to_be32(opt2);
+	set_wr_txq(skb, CPL_PRIORITY_SETUP, ep->ctrlq_idx);
+	t4_set_arp_err_handler(skb, NULL, arp_failure_discard);
+	c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t);
+
+	return;
+}
+
+static void reject_cr(struct c4iw_dev *dev, u32 hwtid, struct sk_buff *skb)
+{
+	PDBG("%s c4iw_dev %p tid %u\n", __func__, dev, hwtid);
+	BUG_ON(skb_cloned(skb));
+	skb_trim(skb, sizeof(struct cpl_tid_release));
+	release_tid(&dev->rdev, hwtid, skb);
+	return;
+}
+
+static void get_4tuple(struct cpl_pass_accept_req *req, int *iptype,
+		       __u8 *local_ip, __u8 *peer_ip,
+		       __be16 *local_port, __be16 *peer_port)
+{
+	int eth_len = G_ETH_HDR_LEN(be32_to_cpu(req->hdr_len));
+	int ip_len = G_IP_HDR_LEN(be32_to_cpu(req->hdr_len));
+	struct iphdr *ip = (struct iphdr *)((u8 *)(req + 1) + eth_len);
+	struct ipv6hdr *ip6 = (struct ipv6hdr *)((u8 *)(req + 1) + eth_len);
+	struct tcphdr *tcp = (struct tcphdr *)
+			     ((u8 *)(req + 1) + eth_len + ip_len);
+
+	if (ip->version == 4) {
+		PDBG("%s saddr 0x%x daddr 0x%x sport %u dport %u\n", __func__,
+		     ntohl(ip->saddr), ntohl(ip->daddr), ntohs(tcp->source),
+		     ntohs(tcp->dest));
+		*iptype = 4;
+		memcpy(peer_ip, &ip->saddr, 4);
+		memcpy(local_ip, &ip->daddr, 4);
+	} else {
+		PDBG("%s saddr %pI6 daddr %pI6 sport %u dport %u\n", __func__,
+		     ip6->saddr.s6_addr, ip6->daddr.s6_addr, ntohs(tcp->source),
+		     ntohs(tcp->dest));
+		*iptype = 6;
+		memcpy(peer_ip, ip6->saddr.s6_addr, 16);
+		memcpy(local_ip, ip6->daddr.s6_addr, 16);
+	}
+	*peer_port = tcp->source;
+	*local_port = tcp->dest;
+
+	return;
+}
+
+static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb)
+{
+	struct c4iw_ep *child_ep = NULL, *parent_ep;
+	struct cpl_pass_accept_req *req = cplhdr(skb);
+	unsigned int stid = GET_POPEN_TID(ntohl(req->tos_stid));
+	struct tid_info *t = dev->rdev.lldi.tids;
+	unsigned int hwtid = GET_TID(req);
+	struct dst_entry *dst;
+	__u8 local_ip[16], peer_ip[16];
+	__be16 local_port, peer_port;
+	int err;
+	u16 peer_mss = ntohs(req->tcpopt.mss);
+	int iptype;
+	unsigned short hdrs;
+
+	parent_ep = lookup_stid(t, stid);
+	if (!parent_ep) {
+		PDBG("%s connect request on invalid stid %d\n", __func__, stid);
+		goto reject;
+	}
+
+	if (state_read(&parent_ep->com) != LISTEN) {
+		printk(KERN_ERR "%s - listening ep not in LISTEN\n",
+		       __func__);
+		goto reject;
+	}
+
+	get_4tuple(req, &iptype, local_ip, peer_ip, &local_port, &peer_port);
+
+	/* Find output route */
+	if (iptype == 4)  {
+		PDBG("%s parent ep %p hwtid %u laddr %pI4 raddr %pI4 lport %d rport %d peer_mss %d\n"
+		     , __func__, parent_ep, hwtid,
+		     local_ip, peer_ip, ntohs(local_port),
+		     ntohs(peer_port), peer_mss);
+		dst = find_route(dev, *(__be32 *)local_ip, *(__be32 *)peer_ip,
+				 local_port, peer_port,
+				 GET_POPEN_TOS(ntohl(req->tos_stid)));
+	} else {
+		PDBG("%s parent ep %p hwtid %u laddr %pI6 raddr %pI6 lport %d rport %d peer_mss %d\n"
+		     , __func__, parent_ep, hwtid,
+		     local_ip, peer_ip, ntohs(local_port),
+		     ntohs(peer_port), peer_mss);
+		dst = find_route6(dev, local_ip, peer_ip, local_port, peer_port,
+				  PASS_OPEN_TOS(ntohl(req->tos_stid)),
+				  ((struct sockaddr_in6 *)
+				  &parent_ep->com.local_addr)->sin6_scope_id);
+	}
+	if (!dst) {
+		printk(KERN_ERR MOD "%s - failed to find dst entry!\n",
+		       __func__);
+		goto reject;
+	}
+
+	child_ep = alloc_ep(sizeof(*child_ep), GFP_KERNEL);
+	if (!child_ep) {
+		printk(KERN_ERR MOD "%s - failed to allocate ep entry!\n",
+		       __func__);
+		dst_release(dst);
+		goto reject;
+	}
+
+	err = import_ep(child_ep, iptype, peer_ip, dst, dev, false);
+	if (err) {
+		printk(KERN_ERR MOD "%s - failed to allocate l2t entry!\n",
+		       __func__);
+		dst_release(dst);
+		kfree(child_ep);
+		goto reject;
+	}
+
+	hdrs = sizeof(struct iphdr) + sizeof(struct tcphdr) +
+	       ((enable_tcp_timestamps && req->tcpopt.tstamp) ? 12 : 0);
+	if (peer_mss && child_ep->mtu > (peer_mss + hdrs))
+		child_ep->mtu = peer_mss + hdrs;
+
+	state_set(&child_ep->com, CONNECTING);
+	child_ep->com.dev = dev;
+	child_ep->com.cm_id = NULL;
+	if (iptype == 4) {
+		struct sockaddr_in *sin = (struct sockaddr_in *)
+			&child_ep->com.local_addr;
+		sin->sin_family = PF_INET;
+		sin->sin_port = local_port;
+		sin->sin_addr.s_addr = *(__be32 *)local_ip;
+		sin = (struct sockaddr_in *)&child_ep->com.remote_addr;
+		sin->sin_family = PF_INET;
+		sin->sin_port = peer_port;
+		sin->sin_addr.s_addr = *(__be32 *)peer_ip;
+	} else {
+		struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)
+			&child_ep->com.local_addr;
+		sin6->sin6_family = PF_INET6;
+		sin6->sin6_port = local_port;
+		memcpy(sin6->sin6_addr.s6_addr, local_ip, 16);
+		sin6 = (struct sockaddr_in6 *)&child_ep->com.remote_addr;
+		sin6->sin6_family = PF_INET6;
+		sin6->sin6_port = peer_port;
+		memcpy(sin6->sin6_addr.s6_addr, peer_ip, 16);
+	}
+	c4iw_get_ep(&parent_ep->com);
+	child_ep->parent_ep = parent_ep;
+	child_ep->tos = GET_POPEN_TOS(ntohl(req->tos_stid));
+	child_ep->dst = dst;
+	child_ep->hwtid = hwtid;
+
+	PDBG("%s tx_chan %u smac_idx %u rss_qid %u\n", __func__,
+	     child_ep->tx_chan, child_ep->smac_idx, child_ep->rss_qid);
+
+	init_timer(&child_ep->timer);
+	cxgb4_insert_tid(t, child_ep, hwtid);
+	insert_handle(dev, &dev->hwtid_idr, child_ep, child_ep->hwtid);
+	accept_cr(child_ep, skb, req);
+	set_bit(PASS_ACCEPT_REQ, &child_ep->com.history);
+	goto out;
+reject:
+	reject_cr(dev, hwtid, skb);
+out:
+	return 0;
+}
+
+static int pass_establish(struct c4iw_dev *dev, struct sk_buff *skb)
+{
+	struct c4iw_ep *ep;
+	struct cpl_pass_establish *req = cplhdr(skb);
+	struct tid_info *t = dev->rdev.lldi.tids;
+	unsigned int tid = GET_TID(req);
+
+	ep = lookup_tid(t, tid);
+	PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
+	ep->snd_seq = be32_to_cpu(req->snd_isn);
+	ep->rcv_seq = be32_to_cpu(req->rcv_isn);
+
+	PDBG("%s ep %p hwtid %u tcp_opt 0x%02x\n", __func__, ep, tid,
+	     ntohs(req->tcp_opt));
+
+	set_emss(ep, ntohs(req->tcp_opt));
+
+	dst_confirm(ep->dst);
+	state_set(&ep->com, MPA_REQ_WAIT);
+	start_ep_timer(ep);
+	send_flowc(ep, skb);
+	set_bit(PASS_ESTAB, &ep->com.history);
+
+	return 0;
+}
+
+static int peer_close(struct c4iw_dev *dev, struct sk_buff *skb)
+{
+	struct cpl_peer_close *hdr = cplhdr(skb);
+	struct c4iw_ep *ep;
+	struct c4iw_qp_attributes attrs;
+	int disconnect = 1;
+	int release = 0;
+	struct tid_info *t = dev->rdev.lldi.tids;
+	unsigned int tid = GET_TID(hdr);
+	int ret;
+
+	ep = lookup_tid(t, tid);
+	PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
+	dst_confirm(ep->dst);
+
+	set_bit(PEER_CLOSE, &ep->com.history);
+	mutex_lock(&ep->com.mutex);
+	switch (ep->com.state) {
+	case MPA_REQ_WAIT:
+		__state_set(&ep->com, CLOSING);
+		break;
+	case MPA_REQ_SENT:
+		__state_set(&ep->com, CLOSING);
+		connect_reply_upcall(ep, -ECONNRESET);
+		break;
+	case MPA_REQ_RCVD:
+
+		/*
+		 * We're gonna mark this puppy DEAD, but keep
+		 * the reference on it until the ULP accepts or
+		 * rejects the CR. Also wake up anyone waiting
+		 * in rdma connection migration (see c4iw_accept_cr()).
+		 */
+		__state_set(&ep->com, CLOSING);
+		PDBG("waking up ep %p tid %u\n", ep, ep->hwtid);
+		c4iw_wake_up(&ep->com.wr_wait, -ECONNRESET);
+		break;
+	case MPA_REP_SENT:
+		__state_set(&ep->com, CLOSING);
+		PDBG("waking up ep %p tid %u\n", ep, ep->hwtid);
+		c4iw_wake_up(&ep->com.wr_wait, -ECONNRESET);
+		break;
+	case FPDU_MODE:
+		start_ep_timer(ep);
+		__state_set(&ep->com, CLOSING);
+		attrs.next_state = C4IW_QP_STATE_CLOSING;
+		ret = c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp,
+				       C4IW_QP_ATTR_NEXT_STATE, &attrs, 1);
+		if (ret != -ECONNRESET) {
+			peer_close_upcall(ep);
+			disconnect = 1;
+		}
+		break;
+	case ABORTING:
+		disconnect = 0;
+		break;
+	case CLOSING:
+		__state_set(&ep->com, MORIBUND);
+		disconnect = 0;
+		break;
+	case MORIBUND:
+		(void)stop_ep_timer(ep);
+		if (ep->com.cm_id && ep->com.qp) {
+			attrs.next_state = C4IW_QP_STATE_IDLE;
+			c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp,
+				       C4IW_QP_ATTR_NEXT_STATE, &attrs, 1);
+		}
+		close_complete_upcall(ep, 0);
+		__state_set(&ep->com, DEAD);
+		release = 1;
+		disconnect = 0;
+		break;
+	case DEAD:
+		disconnect = 0;
+		break;
+	default:
+		BUG_ON(1);
+	}
+	mutex_unlock(&ep->com.mutex);
+	if (disconnect)
+		c4iw_ep_disconnect(ep, 0, GFP_KERNEL);
+	if (release)
+		release_ep_resources(ep);
+	return 0;
+}
+
+static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb)
+{
+	struct cpl_abort_req_rss *req = cplhdr(skb);
+	struct c4iw_ep *ep;
+	struct cpl_abort_rpl *rpl;
+	struct sk_buff *rpl_skb;
+	struct c4iw_qp_attributes attrs;
+	int ret;
+	int release = 0;
+	struct tid_info *t = dev->rdev.lldi.tids;
+	unsigned int tid = GET_TID(req);
+
+	ep = lookup_tid(t, tid);
+	if (is_neg_adv(req->status)) {
+		dev_warn(&dev->rdev.lldi.pdev->dev,
+			 "Negative advice on abort - tid %u status %d (%s)\n",
+			 ep->hwtid, req->status, neg_adv_str(req->status));
+		return 0;
+	}
+	PDBG("%s ep %p tid %u state %u\n", __func__, ep, ep->hwtid,
+	     ep->com.state);
+	set_bit(PEER_ABORT, &ep->com.history);
+
+	/*
+	 * Wake up any threads in rdma_init() or rdma_fini().
+	 * However, this is not needed if com state is just
+	 * MPA_REQ_SENT
+	 */
+	if (ep->com.state != MPA_REQ_SENT)
+		c4iw_wake_up(&ep->com.wr_wait, -ECONNRESET);
+
+	mutex_lock(&ep->com.mutex);
+	switch (ep->com.state) {
+	case CONNECTING:
+		break;
+	case MPA_REQ_WAIT:
+		(void)stop_ep_timer(ep);
+		break;
+	case MPA_REQ_SENT:
+		(void)stop_ep_timer(ep);
+		if (mpa_rev == 1 || (mpa_rev == 2 && ep->tried_with_mpa_v1))
+			connect_reply_upcall(ep, -ECONNRESET);
+		else {
+			/*
+			 * we just don't send notification upwards because we
+			 * want to retry with mpa_v1 without upper layers even
+			 * knowing it.
+			 *
+			 * do some housekeeping so as to re-initiate the
+			 * connection
+			 */
+			PDBG("%s: mpa_rev=%d. Retrying with mpav1\n", __func__,
+			     mpa_rev);
+			ep->retry_with_mpa_v1 = 1;
+		}
+		break;
+	case MPA_REP_SENT:
+		break;
+	case MPA_REQ_RCVD:
+		break;
+	case MORIBUND:
+	case CLOSING:
+		stop_ep_timer(ep);
+		/*FALLTHROUGH*/
+	case FPDU_MODE:
+		if (ep->com.cm_id && ep->com.qp) {
+			attrs.next_state = C4IW_QP_STATE_ERROR;
+			ret = c4iw_modify_qp(ep->com.qp->rhp,
+				     ep->com.qp, C4IW_QP_ATTR_NEXT_STATE,
+				     &attrs, 1);
+			if (ret)
+				printk(KERN_ERR MOD
+				       "%s - qp <- error failed!\n",
+				       __func__);
+		}
+		peer_abort_upcall(ep);
+		break;
+	case ABORTING:
+		break;
+	case DEAD:
+		PDBG("%s PEER_ABORT IN DEAD STATE!!!!\n", __func__);
+		mutex_unlock(&ep->com.mutex);
+		return 0;
+	default:
+		BUG_ON(1);
+		break;
+	}
+	dst_confirm(ep->dst);
+	if (ep->com.state != ABORTING) {
+		__state_set(&ep->com, DEAD);
+		/* we don't release if we want to retry with mpa_v1 */
+		if (!ep->retry_with_mpa_v1)
+			release = 1;
+	}
+	mutex_unlock(&ep->com.mutex);
+
+	rpl_skb = get_skb(skb, sizeof(*rpl), GFP_KERNEL);
+	if (!rpl_skb) {
+		printk(KERN_ERR MOD "%s - cannot allocate skb!\n",
+		       __func__);
+		release = 1;
+		goto out;
+	}
+	set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx);
+	rpl = (struct cpl_abort_rpl *) skb_put(rpl_skb, sizeof(*rpl));
+	INIT_TP_WR(rpl, ep->hwtid);
+	OPCODE_TID(rpl) = cpu_to_be32(MK_OPCODE_TID(CPL_ABORT_RPL, ep->hwtid));
+	rpl->cmd = CPL_ABORT_NO_RST;
+	c4iw_ofld_send(&ep->com.dev->rdev, rpl_skb);
+out:
+	if (release)
+		release_ep_resources(ep);
+	else if (ep->retry_with_mpa_v1) {
+		remove_handle(ep->com.dev, &ep->com.dev->hwtid_idr, ep->hwtid);
+		cxgb4_remove_tid(ep->com.dev->rdev.lldi.tids, 0, ep->hwtid);
+		dst_release(ep->dst);
+		cxgb4_l2t_release(ep->l2t);
+		c4iw_reconnect(ep);
+	}
+
+	return 0;
+}
+
+static int close_con_rpl(struct c4iw_dev *dev, struct sk_buff *skb)
+{
+	struct c4iw_ep *ep;
+	struct c4iw_qp_attributes attrs;
+	struct cpl_close_con_rpl *rpl = cplhdr(skb);
+	int release = 0;
+	struct tid_info *t = dev->rdev.lldi.tids;
+	unsigned int tid = GET_TID(rpl);
+
+	ep = lookup_tid(t, tid);
+
+	PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
+	BUG_ON(!ep);
+
+	/* The cm_id may be null if we failed to connect */
+	mutex_lock(&ep->com.mutex);
+	switch (ep->com.state) {
+	case CLOSING:
+		__state_set(&ep->com, MORIBUND);
+		break;
+	case MORIBUND:
+		(void)stop_ep_timer(ep);
+		if ((ep->com.cm_id) && (ep->com.qp)) {
+			attrs.next_state = C4IW_QP_STATE_IDLE;
+			c4iw_modify_qp(ep->com.qp->rhp,
+					     ep->com.qp,
+					     C4IW_QP_ATTR_NEXT_STATE,
+					     &attrs, 1);
+		}
+		close_complete_upcall(ep, 0);
+		__state_set(&ep->com, DEAD);
+		release = 1;
+		break;
+	case ABORTING:
+	case DEAD:
+		break;
+	default:
+		BUG_ON(1);
+		break;
+	}
+	mutex_unlock(&ep->com.mutex);
+	if (release)
+		release_ep_resources(ep);
+	return 0;
+}
+
+static int terminate(struct c4iw_dev *dev, struct sk_buff *skb)
+{
+	struct cpl_rdma_terminate *rpl = cplhdr(skb);
+	struct tid_info *t = dev->rdev.lldi.tids;
+	unsigned int tid = GET_TID(rpl);
+	struct c4iw_ep *ep;
+	struct c4iw_qp_attributes attrs;
+
+	ep = lookup_tid(t, tid);
+	BUG_ON(!ep);
+
+	if (ep && ep->com.qp) {
+		printk(KERN_WARNING MOD "TERM received tid %u qpid %u\n", tid,
+		       ep->com.qp->wq.sq.qid);
+		attrs.next_state = C4IW_QP_STATE_TERMINATE;
+		c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp,
+			       C4IW_QP_ATTR_NEXT_STATE, &attrs, 1);
+	} else
+		printk(KERN_WARNING MOD "TERM received tid %u no ep/qp\n", tid);
+
+	return 0;
+}
+
+/*
+ * Upcall from the adapter indicating data has been transmitted.
+ * For us its just the single MPA request or reply.  We can now free
+ * the skb holding the mpa message.
+ */
+static int fw4_ack(struct c4iw_dev *dev, struct sk_buff *skb)
+{
+	struct c4iw_ep *ep;
+	struct cpl_fw4_ack *hdr = cplhdr(skb);
+	u8 credits = hdr->credits;
+	unsigned int tid = GET_TID(hdr);
+	struct tid_info *t = dev->rdev.lldi.tids;
+
+
+	ep = lookup_tid(t, tid);
+	PDBG("%s ep %p tid %u credits %u\n", __func__, ep, ep->hwtid, credits);
+	if (credits == 0) {
+		PDBG("%s 0 credit ack ep %p tid %u state %u\n",
+		     __func__, ep, ep->hwtid, state_read(&ep->com));
+		return 0;
+	}
+
+	dst_confirm(ep->dst);
+	if (ep->mpa_skb) {
+		PDBG("%s last streaming msg ack ep %p tid %u state %u "
+		     "initiator %u freeing skb\n", __func__, ep, ep->hwtid,
+		     state_read(&ep->com), ep->mpa_attr.initiator ? 1 : 0);
+		kfree_skb(ep->mpa_skb);
+		ep->mpa_skb = NULL;
+	}
+	return 0;
+}
+
+int c4iw_reject_cr(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len)
+{
+	int err = 0;
+	int disconnect = 0;
+	struct c4iw_ep *ep = to_ep(cm_id);
+	PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
+
+	mutex_lock(&ep->com.mutex);
+	if (ep->com.state == DEAD) {
+		mutex_unlock(&ep->com.mutex);
+		c4iw_put_ep(&ep->com);
+		return -ECONNRESET;
+	}
+	set_bit(ULP_REJECT, &ep->com.history);
+	BUG_ON(ep->com.state != MPA_REQ_RCVD);
+	if (mpa_rev == 0)
+		abort_connection(ep, NULL, GFP_KERNEL);
+	else {
+		err = send_mpa_reject(ep, pdata, pdata_len);
+		disconnect = 1;
+	}
+	mutex_unlock(&ep->com.mutex);
+	if (disconnect)
+		err = c4iw_ep_disconnect(ep, 0, GFP_KERNEL);
+	c4iw_put_ep(&ep->com);
+	return 0;
+}
+
+int c4iw_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
+{
+	int err;
+	struct c4iw_qp_attributes attrs;
+	enum c4iw_qp_attr_mask mask;
+	struct c4iw_ep *ep = to_ep(cm_id);
+	struct c4iw_dev *h = to_c4iw_dev(cm_id->device);
+	struct c4iw_qp *qp = get_qhp(h, conn_param->qpn);
+
+	PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
+
+	mutex_lock(&ep->com.mutex);
+	if (ep->com.state == DEAD) {
+		err = -ECONNRESET;
+		goto err;
+	}
+
+	BUG_ON(ep->com.state != MPA_REQ_RCVD);
+	BUG_ON(!qp);
+
+	set_bit(ULP_ACCEPT, &ep->com.history);
+	if ((conn_param->ord > cur_max_read_depth(ep->com.dev)) ||
+	    (conn_param->ird > cur_max_read_depth(ep->com.dev))) {
+		abort_connection(ep, NULL, GFP_KERNEL);
+		err = -EINVAL;
+		goto err;
+	}
+
+	if (ep->mpa_attr.version == 2 && ep->mpa_attr.enhanced_rdma_conn) {
+		if (conn_param->ord > ep->ird) {
+			if (RELAXED_IRD_NEGOTIATION) {
+				ep->ord = ep->ird;
+			} else {
+				ep->ird = conn_param->ird;
+				ep->ord = conn_param->ord;
+				send_mpa_reject(ep, conn_param->private_data,
+						conn_param->private_data_len);
+				abort_connection(ep, NULL, GFP_KERNEL);
+				err = -ENOMEM;
+				goto err;
+			}
+		}
+		if (conn_param->ird < ep->ord) {
+			if (RELAXED_IRD_NEGOTIATION &&
+			    ep->ord <= h->rdev.lldi.max_ordird_qp) {
+				conn_param->ird = ep->ord;
+			} else {
+				abort_connection(ep, NULL, GFP_KERNEL);
+				err = -ENOMEM;
+				goto err;
+			}
+		}
+	}
+	ep->ird = conn_param->ird;
+	ep->ord = conn_param->ord;
+
+	if (ep->mpa_attr.version == 1) {
+		if (peer2peer && ep->ird == 0)
+			ep->ird = 1;
+	} else {
+		if (peer2peer &&
+		    (ep->mpa_attr.p2p_type != FW_RI_INIT_P2PTYPE_DISABLED) &&
+		    (p2p_type == FW_RI_INIT_P2PTYPE_READ_REQ) && ep->ord == 0)
+			ep->ird = 1;
+	}
+
+	PDBG("%s %d ird %d ord %d\n", __func__, __LINE__, ep->ird, ep->ord);
+
+	cm_id->add_ref(cm_id);
+	ep->com.cm_id = cm_id;
+	ep->com.qp = qp;
+	ref_qp(ep);
+
+	/* bind QP to EP and move to RTS */
+	attrs.mpa_attr = ep->mpa_attr;
+	attrs.max_ird = ep->ird;
+	attrs.max_ord = ep->ord;
+	attrs.llp_stream_handle = ep;
+	attrs.next_state = C4IW_QP_STATE_RTS;
+
+	/* bind QP and TID with INIT_WR */
+	mask = C4IW_QP_ATTR_NEXT_STATE |
+			     C4IW_QP_ATTR_LLP_STREAM_HANDLE |
+			     C4IW_QP_ATTR_MPA_ATTR |
+			     C4IW_QP_ATTR_MAX_IRD |
+			     C4IW_QP_ATTR_MAX_ORD;
+
+	err = c4iw_modify_qp(ep->com.qp->rhp,
+			     ep->com.qp, mask, &attrs, 1);
+	if (err)
+		goto err1;
+	err = send_mpa_reply(ep, conn_param->private_data,
+			     conn_param->private_data_len);
+	if (err)
+		goto err1;
+
+	__state_set(&ep->com, FPDU_MODE);
+	established_upcall(ep);
+	mutex_unlock(&ep->com.mutex);
+	c4iw_put_ep(&ep->com);
+	return 0;
+err1:
+	ep->com.cm_id = NULL;
+	abort_connection(ep, NULL, GFP_KERNEL);
+	cm_id->rem_ref(cm_id);
+err:
+	mutex_unlock(&ep->com.mutex);
+	c4iw_put_ep(&ep->com);
+	return err;
+}
+
+static int pick_local_ipaddrs(struct c4iw_dev *dev, struct iw_cm_id *cm_id)
+{
+	struct in_device *ind;
+	int found = 0;
+	struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_id->local_addr;
+	struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_id->remote_addr;
+
+	ind = in_dev_get(dev->rdev.lldi.ports[0]);
+	if (!ind)
+		return -EADDRNOTAVAIL;
+	for_primary_ifa(ind) {
+		laddr->sin_addr.s_addr = ifa->ifa_address;
+		raddr->sin_addr.s_addr = ifa->ifa_address;
+		found = 1;
+		break;
+	}
+	endfor_ifa(ind);
+	in_dev_put(ind);
+	return found ? 0 : -EADDRNOTAVAIL;
+}
+
+static int get_lladdr(struct net_device *dev, struct in6_addr *addr,
+		      unsigned char banned_flags)
+{
+	struct inet6_dev *idev;
+	int err = -EADDRNOTAVAIL;
+
+	rcu_read_lock();
+	idev = __in6_dev_get(dev);
+	if (idev != NULL) {
+		struct inet6_ifaddr *ifp;
+
+		read_lock_bh(&idev->lock);
+		list_for_each_entry(ifp, &idev->addr_list, if_list) {
+			if (ifp->scope == IFA_LINK &&
+			    !(ifp->flags & banned_flags)) {
+				memcpy(addr, &ifp->addr, 16);
+				err = 0;
+				break;
+			}
+		}
+		read_unlock_bh(&idev->lock);
+	}
+	rcu_read_unlock();
+	return err;
+}
+
+static int pick_local_ip6addrs(struct c4iw_dev *dev, struct iw_cm_id *cm_id)
+{
+	struct in6_addr uninitialized_var(addr);
+	struct sockaddr_in6 *la6 = (struct sockaddr_in6 *)&cm_id->local_addr;
+	struct sockaddr_in6 *ra6 = (struct sockaddr_in6 *)&cm_id->remote_addr;
+
+	if (get_lladdr(dev->rdev.lldi.ports[0], &addr, IFA_F_TENTATIVE)) {
+		memcpy(la6->sin6_addr.s6_addr, &addr, 16);
+		memcpy(ra6->sin6_addr.s6_addr, &addr, 16);
+		return 0;
+	}
+	return -EADDRNOTAVAIL;
+}
+
+int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
+{
+	struct c4iw_dev *dev = to_c4iw_dev(cm_id->device);
+	struct c4iw_ep *ep;
+	int err = 0;
+	struct sockaddr_in *laddr;
+	struct sockaddr_in *raddr;
+	struct sockaddr_in6 *laddr6;
+	struct sockaddr_in6 *raddr6;
+	struct iwpm_dev_data pm_reg_msg;
+	struct iwpm_sa_data pm_msg;
+	__u8 *ra;
+	int iptype;
+	int iwpm_err = 0;
+
+	if ((conn_param->ord > cur_max_read_depth(dev)) ||
+	    (conn_param->ird > cur_max_read_depth(dev))) {
+		err = -EINVAL;
+		goto out;
+	}
+	ep = alloc_ep(sizeof(*ep), GFP_KERNEL);
+	if (!ep) {
+		printk(KERN_ERR MOD "%s - cannot alloc ep.\n", __func__);
+		err = -ENOMEM;
+		goto out;
+	}
+	init_timer(&ep->timer);
+	ep->plen = conn_param->private_data_len;
+	if (ep->plen)
+		memcpy(ep->mpa_pkt + sizeof(struct mpa_message),
+		       conn_param->private_data, ep->plen);
+	ep->ird = conn_param->ird;
+	ep->ord = conn_param->ord;
+
+	if (peer2peer && ep->ord == 0)
+		ep->ord = 1;
+
+	cm_id->add_ref(cm_id);
+	ep->com.dev = dev;
+	ep->com.cm_id = cm_id;
+	ep->com.qp = get_qhp(dev, conn_param->qpn);
+	if (!ep->com.qp) {
+		PDBG("%s qpn 0x%x not found!\n", __func__, conn_param->qpn);
+		err = -EINVAL;
+		goto fail1;
+	}
+	ref_qp(ep);
+	PDBG("%s qpn 0x%x qp %p cm_id %p\n", __func__, conn_param->qpn,
+	     ep->com.qp, cm_id);
+
+	/*
+	 * Allocate an active TID to initiate a TCP connection.
+	 */
+	ep->atid = cxgb4_alloc_atid(dev->rdev.lldi.tids, ep);
+	if (ep->atid == -1) {
+		printk(KERN_ERR MOD "%s - cannot alloc atid.\n", __func__);
+		err = -ENOMEM;
+		goto fail1;
+	}
+	insert_handle(dev, &dev->atid_idr, ep, ep->atid);
+
+	memcpy(&ep->com.local_addr, &cm_id->local_addr,
+	       sizeof(ep->com.local_addr));
+	memcpy(&ep->com.remote_addr, &cm_id->remote_addr,
+	       sizeof(ep->com.remote_addr));
+
+	/* No port mapper available, go with the specified peer information */
+	memcpy(&ep->com.mapped_local_addr, &cm_id->local_addr,
+	       sizeof(ep->com.mapped_local_addr));
+	memcpy(&ep->com.mapped_remote_addr, &cm_id->remote_addr,
+	       sizeof(ep->com.mapped_remote_addr));
+
+	c4iw_form_reg_msg(dev, &pm_reg_msg);
+	iwpm_err = iwpm_register_pid(&pm_reg_msg, RDMA_NL_C4IW);
+	if (iwpm_err) {
+		PDBG("%s: Port Mapper reg pid fail (err = %d).\n",
+			__func__, iwpm_err);
+	}
+	if (iwpm_valid_pid() && !iwpm_err) {
+		c4iw_form_pm_msg(ep, &pm_msg);
+		iwpm_err = iwpm_add_and_query_mapping(&pm_msg, RDMA_NL_C4IW);
+		if (iwpm_err)
+			PDBG("%s: Port Mapper query fail (err = %d).\n",
+				__func__, iwpm_err);
+		else
+			c4iw_record_pm_msg(ep, &pm_msg);
+	}
+	if (iwpm_create_mapinfo(&ep->com.local_addr,
+				&ep->com.mapped_local_addr, RDMA_NL_C4IW)) {
+		iwpm_remove_mapping(&ep->com.local_addr, RDMA_NL_C4IW);
+		err = -ENOMEM;
+		goto fail1;
+	}
+	print_addr(&ep->com, __func__, "add_query/create_mapinfo");
+	set_bit(RELEASE_MAPINFO, &ep->com.flags);
+
+	laddr = (struct sockaddr_in *)&ep->com.mapped_local_addr;
+	raddr = (struct sockaddr_in *)&ep->com.mapped_remote_addr;
+	laddr6 = (struct sockaddr_in6 *)&ep->com.mapped_local_addr;
+	raddr6 = (struct sockaddr_in6 *) &ep->com.mapped_remote_addr;
+
+	if (cm_id->remote_addr.ss_family == AF_INET) {
+		iptype = 4;
+		ra = (__u8 *)&raddr->sin_addr;
+
+		/*
+		 * Handle loopback requests to INADDR_ANY.
+		 */
+		if ((__force int)raddr->sin_addr.s_addr == INADDR_ANY) {
+			err = pick_local_ipaddrs(dev, cm_id);
+			if (err)
+				goto fail1;
+		}
+
+		/* find a route */
+		PDBG("%s saddr %pI4 sport 0x%x raddr %pI4 rport 0x%x\n",
+		     __func__, &laddr->sin_addr, ntohs(laddr->sin_port),
+		     ra, ntohs(raddr->sin_port));
+		ep->dst = find_route(dev, laddr->sin_addr.s_addr,
+				     raddr->sin_addr.s_addr, laddr->sin_port,
+				     raddr->sin_port, 0);
+	} else {
+		iptype = 6;
+		ra = (__u8 *)&raddr6->sin6_addr;
+
+		/*
+		 * Handle loopback requests to INADDR_ANY.
+		 */
+		if (ipv6_addr_type(&raddr6->sin6_addr) == IPV6_ADDR_ANY) {
+			err = pick_local_ip6addrs(dev, cm_id);
+			if (err)
+				goto fail1;
+		}
+
+		/* find a route */
+		PDBG("%s saddr %pI6 sport 0x%x raddr %pI6 rport 0x%x\n",
+		     __func__, laddr6->sin6_addr.s6_addr,
+		     ntohs(laddr6->sin6_port),
+		     raddr6->sin6_addr.s6_addr, ntohs(raddr6->sin6_port));
+		ep->dst = find_route6(dev, laddr6->sin6_addr.s6_addr,
+				      raddr6->sin6_addr.s6_addr,
+				      laddr6->sin6_port, raddr6->sin6_port, 0,
+				      raddr6->sin6_scope_id);
+	}
+	if (!ep->dst) {
+		printk(KERN_ERR MOD "%s - cannot find route.\n", __func__);
+		err = -EHOSTUNREACH;
+		goto fail2;
+	}
+
+	err = import_ep(ep, iptype, ra, ep->dst, ep->com.dev, true);
+	if (err) {
+		printk(KERN_ERR MOD "%s - cannot alloc l2e.\n", __func__);
+		goto fail3;
+	}
+
+	PDBG("%s txq_idx %u tx_chan %u smac_idx %u rss_qid %u l2t_idx %u\n",
+		__func__, ep->txq_idx, ep->tx_chan, ep->smac_idx, ep->rss_qid,
+		ep->l2t->idx);
+
+	state_set(&ep->com, CONNECTING);
+	ep->tos = 0;
+
+	/* send connect request to rnic */
+	err = send_connect(ep);
+	if (!err)
+		goto out;
+
+	cxgb4_l2t_release(ep->l2t);
+fail3:
+	dst_release(ep->dst);
+fail2:
+	remove_handle(ep->com.dev, &ep->com.dev->atid_idr, ep->atid);
+	cxgb4_free_atid(ep->com.dev->rdev.lldi.tids, ep->atid);
+fail1:
+	cm_id->rem_ref(cm_id);
+	c4iw_put_ep(&ep->com);
+out:
+	return err;
+}
+
+static int create_server6(struct c4iw_dev *dev, struct c4iw_listen_ep *ep)
+{
+	int err;
+	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)
+				    &ep->com.mapped_local_addr;
+
+	c4iw_init_wr_wait(&ep->com.wr_wait);
+	err = cxgb4_create_server6(ep->com.dev->rdev.lldi.ports[0],
+				   ep->stid, &sin6->sin6_addr,
+				   sin6->sin6_port,
+				   ep->com.dev->rdev.lldi.rxq_ids[0]);
+	if (!err)
+		err = c4iw_wait_for_reply(&ep->com.dev->rdev,
+					  &ep->com.wr_wait,
+					  0, 0, __func__);
+	if (err)
+		pr_err("cxgb4_create_server6/filter failed err %d stid %d laddr %pI6 lport %d\n",
+		       err, ep->stid,
+		       sin6->sin6_addr.s6_addr, ntohs(sin6->sin6_port));
+	return err;
+}
+
+static int create_server4(struct c4iw_dev *dev, struct c4iw_listen_ep *ep)
+{
+	int err;
+	struct sockaddr_in *sin = (struct sockaddr_in *)
+				  &ep->com.mapped_local_addr;
+
+	if (dev->rdev.lldi.enable_fw_ofld_conn) {
+		do {
+			err = cxgb4_create_server_filter(
+				ep->com.dev->rdev.lldi.ports[0], ep->stid,
+				sin->sin_addr.s_addr, sin->sin_port, 0,
+				ep->com.dev->rdev.lldi.rxq_ids[0], 0, 0);
+			if (err == -EBUSY) {
+				set_current_state(TASK_UNINTERRUPTIBLE);
+				schedule_timeout(usecs_to_jiffies(100));
+			}
+		} while (err == -EBUSY);
+	} else {
+		c4iw_init_wr_wait(&ep->com.wr_wait);
+		err = cxgb4_create_server(ep->com.dev->rdev.lldi.ports[0],
+				ep->stid, sin->sin_addr.s_addr, sin->sin_port,
+				0, ep->com.dev->rdev.lldi.rxq_ids[0]);
+		if (!err)
+			err = c4iw_wait_for_reply(&ep->com.dev->rdev,
+						  &ep->com.wr_wait,
+						  0, 0, __func__);
+	}
+	if (err)
+		pr_err("cxgb4_create_server/filter failed err %d stid %d laddr %pI4 lport %d\n"
+		       , err, ep->stid,
+		       &sin->sin_addr, ntohs(sin->sin_port));
+	return err;
+}
+
+int c4iw_create_listen(struct iw_cm_id *cm_id, int backlog)
+{
+	int err = 0;
+	struct c4iw_dev *dev = to_c4iw_dev(cm_id->device);
+	struct c4iw_listen_ep *ep;
+	struct iwpm_dev_data pm_reg_msg;
+	struct iwpm_sa_data pm_msg;
+	int iwpm_err = 0;
+
+	might_sleep();
+
+	ep = alloc_ep(sizeof(*ep), GFP_KERNEL);
+	if (!ep) {
+		printk(KERN_ERR MOD "%s - cannot alloc ep.\n", __func__);
+		err = -ENOMEM;
+		goto fail1;
+	}
+	PDBG("%s ep %p\n", __func__, ep);
+	cm_id->add_ref(cm_id);
+	ep->com.cm_id = cm_id;
+	ep->com.dev = dev;
+	ep->backlog = backlog;
+	memcpy(&ep->com.local_addr, &cm_id->local_addr,
+	       sizeof(ep->com.local_addr));
+
+	/*
+	 * Allocate a server TID.
+	 */
+	if (dev->rdev.lldi.enable_fw_ofld_conn &&
+	    ep->com.local_addr.ss_family == AF_INET)
+		ep->stid = cxgb4_alloc_sftid(dev->rdev.lldi.tids,
+					     cm_id->local_addr.ss_family, ep);
+	else
+		ep->stid = cxgb4_alloc_stid(dev->rdev.lldi.tids,
+					    cm_id->local_addr.ss_family, ep);
+
+	if (ep->stid == -1) {
+		printk(KERN_ERR MOD "%s - cannot alloc stid.\n", __func__);
+		err = -ENOMEM;
+		goto fail2;
+	}
+	insert_handle(dev, &dev->stid_idr, ep, ep->stid);
+
+	/* No port mapper available, go with the specified info */
+	memcpy(&ep->com.mapped_local_addr, &cm_id->local_addr,
+	       sizeof(ep->com.mapped_local_addr));
+
+	c4iw_form_reg_msg(dev, &pm_reg_msg);
+	iwpm_err = iwpm_register_pid(&pm_reg_msg, RDMA_NL_C4IW);
+	if (iwpm_err) {
+		PDBG("%s: Port Mapper reg pid fail (err = %d).\n",
+			__func__, iwpm_err);
+	}
+	if (iwpm_valid_pid() && !iwpm_err) {
+		memcpy(&pm_msg.loc_addr, &ep->com.local_addr,
+				sizeof(ep->com.local_addr));
+		iwpm_err = iwpm_add_mapping(&pm_msg, RDMA_NL_C4IW);
+		if (iwpm_err)
+			PDBG("%s: Port Mapper query fail (err = %d).\n",
+				__func__, iwpm_err);
+		else
+			memcpy(&ep->com.mapped_local_addr,
+				&pm_msg.mapped_loc_addr,
+				sizeof(ep->com.mapped_local_addr));
+	}
+	if (iwpm_create_mapinfo(&ep->com.local_addr,
+				&ep->com.mapped_local_addr, RDMA_NL_C4IW)) {
+		err = -ENOMEM;
+		goto fail3;
+	}
+	print_addr(&ep->com, __func__, "add_mapping/create_mapinfo");
+
+	set_bit(RELEASE_MAPINFO, &ep->com.flags);
+	state_set(&ep->com, LISTEN);
+	if (ep->com.local_addr.ss_family == AF_INET)
+		err = create_server4(dev, ep);
+	else
+		err = create_server6(dev, ep);
+	if (!err) {
+		cm_id->provider_data = ep;
+		goto out;
+	}
+
+fail3:
+	cxgb4_free_stid(ep->com.dev->rdev.lldi.tids, ep->stid,
+			ep->com.local_addr.ss_family);
+fail2:
+	cm_id->rem_ref(cm_id);
+	c4iw_put_ep(&ep->com);
+fail1:
+out:
+	return err;
+}
+
+int c4iw_destroy_listen(struct iw_cm_id *cm_id)
+{
+	int err;
+	struct c4iw_listen_ep *ep = to_listen_ep(cm_id);
+
+	PDBG("%s ep %p\n", __func__, ep);
+
+	might_sleep();
+	state_set(&ep->com, DEAD);
+	if (ep->com.dev->rdev.lldi.enable_fw_ofld_conn &&
+	    ep->com.local_addr.ss_family == AF_INET) {
+		err = cxgb4_remove_server_filter(
+			ep->com.dev->rdev.lldi.ports[0], ep->stid,
+			ep->com.dev->rdev.lldi.rxq_ids[0], 0);
+	} else {
+		c4iw_init_wr_wait(&ep->com.wr_wait);
+		err = cxgb4_remove_server(
+				ep->com.dev->rdev.lldi.ports[0], ep->stid,
+				ep->com.dev->rdev.lldi.rxq_ids[0], 0);
+		if (err)
+			goto done;
+		err = c4iw_wait_for_reply(&ep->com.dev->rdev, &ep->com.wr_wait,
+					  0, 0, __func__);
+	}
+	remove_handle(ep->com.dev, &ep->com.dev->stid_idr, ep->stid);
+	cxgb4_free_stid(ep->com.dev->rdev.lldi.tids, ep->stid,
+			ep->com.local_addr.ss_family);
+done:
+	cm_id->rem_ref(cm_id);
+	c4iw_put_ep(&ep->com);
+	return err;
+}
+
+int c4iw_ep_disconnect(struct c4iw_ep *ep, int abrupt, gfp_t gfp)
+{
+	int ret = 0;
+	int close = 0;
+	int fatal = 0;
+	struct c4iw_rdev *rdev;
+
+	mutex_lock(&ep->com.mutex);
+
+	PDBG("%s ep %p state %s, abrupt %d\n", __func__, ep,
+	     states[ep->com.state], abrupt);
+
+	rdev = &ep->com.dev->rdev;
+	if (c4iw_fatal_error(rdev)) {
+		fatal = 1;
+		close_complete_upcall(ep, -EIO);
+		ep->com.state = DEAD;
+	}
+	switch (ep->com.state) {
+	case MPA_REQ_WAIT:
+	case MPA_REQ_SENT:
+	case MPA_REQ_RCVD:
+	case MPA_REP_SENT:
+	case FPDU_MODE:
+		close = 1;
+		if (abrupt)
+			ep->com.state = ABORTING;
+		else {
+			ep->com.state = CLOSING;
+			start_ep_timer(ep);
+		}
+		set_bit(CLOSE_SENT, &ep->com.flags);
+		break;
+	case CLOSING:
+		if (!test_and_set_bit(CLOSE_SENT, &ep->com.flags)) {
+			close = 1;
+			if (abrupt) {
+				(void)stop_ep_timer(ep);
+				ep->com.state = ABORTING;
+			} else
+				ep->com.state = MORIBUND;
+		}
+		break;
+	case MORIBUND:
+	case ABORTING:
+	case DEAD:
+		PDBG("%s ignoring disconnect ep %p state %u\n",
+		     __func__, ep, ep->com.state);
+		break;
+	default:
+		BUG();
+		break;
+	}
+
+	if (close) {
+		if (abrupt) {
+			set_bit(EP_DISC_ABORT, &ep->com.history);
+			close_complete_upcall(ep, -ECONNRESET);
+			ret = send_abort(ep, NULL, gfp);
+		} else {
+			set_bit(EP_DISC_CLOSE, &ep->com.history);
+			ret = send_halfclose(ep, gfp);
+		}
+		if (ret)
+			fatal = 1;
+	}
+	mutex_unlock(&ep->com.mutex);
+	if (fatal)
+		release_ep_resources(ep);
+	return ret;
+}
+
+static void active_ofld_conn_reply(struct c4iw_dev *dev, struct sk_buff *skb,
+			struct cpl_fw6_msg_ofld_connection_wr_rpl *req)
+{
+	struct c4iw_ep *ep;
+	int atid = be32_to_cpu(req->tid);
+
+	ep = (struct c4iw_ep *)lookup_atid(dev->rdev.lldi.tids,
+					   (__force u32) req->tid);
+	if (!ep)
+		return;
+
+	switch (req->retval) {
+	case FW_ENOMEM:
+		set_bit(ACT_RETRY_NOMEM, &ep->com.history);
+		if (ep->retry_count++ < ACT_OPEN_RETRY_COUNT) {
+			send_fw_act_open_req(ep, atid);
+			return;
+		}
+	case FW_EADDRINUSE:
+		set_bit(ACT_RETRY_INUSE, &ep->com.history);
+		if (ep->retry_count++ < ACT_OPEN_RETRY_COUNT) {
+			send_fw_act_open_req(ep, atid);
+			return;
+		}
+		break;
+	default:
+		pr_info("%s unexpected ofld conn wr retval %d\n",
+		       __func__, req->retval);
+		break;
+	}
+	pr_err("active ofld_connect_wr failure %d atid %d\n",
+	       req->retval, atid);
+	mutex_lock(&dev->rdev.stats.lock);
+	dev->rdev.stats.act_ofld_conn_fails++;
+	mutex_unlock(&dev->rdev.stats.lock);
+	connect_reply_upcall(ep, status2errno(req->retval));
+	state_set(&ep->com, DEAD);
+	remove_handle(dev, &dev->atid_idr, atid);
+	cxgb4_free_atid(dev->rdev.lldi.tids, atid);
+	dst_release(ep->dst);
+	cxgb4_l2t_release(ep->l2t);
+	c4iw_put_ep(&ep->com);
+}
+
+static void passive_ofld_conn_reply(struct c4iw_dev *dev, struct sk_buff *skb,
+			struct cpl_fw6_msg_ofld_connection_wr_rpl *req)
+{
+	struct sk_buff *rpl_skb;
+	struct cpl_pass_accept_req *cpl;
+	int ret;
+
+	rpl_skb = (struct sk_buff *)(unsigned long)req->cookie;
+	BUG_ON(!rpl_skb);
+	if (req->retval) {
+		PDBG("%s passive open failure %d\n", __func__, req->retval);
+		mutex_lock(&dev->rdev.stats.lock);
+		dev->rdev.stats.pas_ofld_conn_fails++;
+		mutex_unlock(&dev->rdev.stats.lock);
+		kfree_skb(rpl_skb);
+	} else {
+		cpl = (struct cpl_pass_accept_req *)cplhdr(rpl_skb);
+		OPCODE_TID(cpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_REQ,
+					(__force u32) htonl(
+					(__force u32) req->tid)));
+		ret = pass_accept_req(dev, rpl_skb);
+		if (!ret)
+			kfree_skb(rpl_skb);
+	}
+	return;
+}
+
+static int deferred_fw6_msg(struct c4iw_dev *dev, struct sk_buff *skb)
+{
+	struct cpl_fw6_msg *rpl = cplhdr(skb);
+	struct cpl_fw6_msg_ofld_connection_wr_rpl *req;
+
+	switch (rpl->type) {
+	case FW6_TYPE_CQE:
+		c4iw_ev_dispatch(dev, (struct t4_cqe *)&rpl->data[0]);
+		break;
+	case FW6_TYPE_OFLD_CONNECTION_WR_RPL:
+		req = (struct cpl_fw6_msg_ofld_connection_wr_rpl *)rpl->data;
+		switch (req->t_state) {
+		case TCP_SYN_SENT:
+			active_ofld_conn_reply(dev, skb, req);
+			break;
+		case TCP_SYN_RECV:
+			passive_ofld_conn_reply(dev, skb, req);
+			break;
+		default:
+			pr_err("%s unexpected ofld conn wr state %d\n",
+			       __func__, req->t_state);
+			break;
+		}
+		break;
+	}
+	return 0;
+}
+
+static void build_cpl_pass_accept_req(struct sk_buff *skb, int stid , u8 tos)
+{
+	u32 l2info;
+	u16 vlantag, len, hdr_len, eth_hdr_len;
+	u8 intf;
+	struct cpl_rx_pkt *cpl = cplhdr(skb);
+	struct cpl_pass_accept_req *req;
+	struct tcp_options_received tmp_opt;
+	struct c4iw_dev *dev;
+
+	dev = *((struct c4iw_dev **) (skb->cb + sizeof(void *)));
+	/* Store values from cpl_rx_pkt in temporary location. */
+	vlantag = (__force u16) cpl->vlan;
+	len = (__force u16) cpl->len;
+	l2info  = (__force u32) cpl->l2info;
+	hdr_len = (__force u16) cpl->hdr_len;
+	intf = cpl->iff;
+
+	__skb_pull(skb, sizeof(*req) + sizeof(struct rss_header));
+
+	/*
+	 * We need to parse the TCP options from SYN packet.
+	 * to generate cpl_pass_accept_req.
+	 */
+	memset(&tmp_opt, 0, sizeof(tmp_opt));
+	tcp_clear_options(&tmp_opt);
+	tcp_parse_options(skb, &tmp_opt, 0, NULL);
+
+	req = (struct cpl_pass_accept_req *)__skb_push(skb, sizeof(*req));
+	memset(req, 0, sizeof(*req));
+	req->l2info = cpu_to_be16(V_SYN_INTF(intf) |
+			 V_SYN_MAC_IDX(G_RX_MACIDX(
+			 (__force int) htonl(l2info))) |
+			 F_SYN_XACT_MATCH);
+	eth_hdr_len = is_t4(dev->rdev.lldi.adapter_type) ?
+			    G_RX_ETHHDR_LEN((__force int) htonl(l2info)) :
+			    G_RX_T5_ETHHDR_LEN((__force int) htonl(l2info));
+	req->hdr_len = cpu_to_be32(V_SYN_RX_CHAN(G_RX_CHAN(
+					(__force int) htonl(l2info))) |
+				   V_TCP_HDR_LEN(G_RX_TCPHDR_LEN(
+					(__force int) htons(hdr_len))) |
+				   V_IP_HDR_LEN(G_RX_IPHDR_LEN(
+					(__force int) htons(hdr_len))) |
+				   V_ETH_HDR_LEN(G_RX_ETHHDR_LEN(eth_hdr_len)));
+	req->vlan = (__force __be16) vlantag;
+	req->len = (__force __be16) len;
+	req->tos_stid = cpu_to_be32(PASS_OPEN_TID(stid) |
+				    PASS_OPEN_TOS(tos));
+	req->tcpopt.mss = htons(tmp_opt.mss_clamp);
+	if (tmp_opt.wscale_ok)
+		req->tcpopt.wsf = tmp_opt.snd_wscale;
+	req->tcpopt.tstamp = tmp_opt.saw_tstamp;
+	if (tmp_opt.sack_ok)
+		req->tcpopt.sack = 1;
+	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_REQ, 0));
+	return;
+}
+
+static void send_fw_pass_open_req(struct c4iw_dev *dev, struct sk_buff *skb,
+				  __be32 laddr, __be16 lport,
+				  __be32 raddr, __be16 rport,
+				  u32 rcv_isn, u32 filter, u16 window,
+				  u32 rss_qid, u8 port_id)
+{
+	struct sk_buff *req_skb;
+	struct fw_ofld_connection_wr *req;
+	struct cpl_pass_accept_req *cpl = cplhdr(skb);
+	int ret;
+
+	req_skb = alloc_skb(sizeof(struct fw_ofld_connection_wr), GFP_KERNEL);
+	req = (struct fw_ofld_connection_wr *)__skb_put(req_skb, sizeof(*req));
+	memset(req, 0, sizeof(*req));
+	req->op_compl = htonl(V_WR_OP(FW_OFLD_CONNECTION_WR) | FW_WR_COMPL(1));
+	req->len16_pkd = htonl(FW_WR_LEN16(DIV_ROUND_UP(sizeof(*req), 16)));
+	req->le.version_cpl = htonl(F_FW_OFLD_CONNECTION_WR_CPL);
+	req->le.filter = (__force __be32) filter;
+	req->le.lport = lport;
+	req->le.pport = rport;
+	req->le.u.ipv4.lip = laddr;
+	req->le.u.ipv4.pip = raddr;
+	req->tcb.rcv_nxt = htonl(rcv_isn + 1);
+	req->tcb.rcv_adv = htons(window);
+	req->tcb.t_state_to_astid =
+		 htonl(V_FW_OFLD_CONNECTION_WR_T_STATE(TCP_SYN_RECV) |
+			V_FW_OFLD_CONNECTION_WR_RCV_SCALE(cpl->tcpopt.wsf) |
+			V_FW_OFLD_CONNECTION_WR_ASTID(
+			GET_PASS_OPEN_TID(ntohl(cpl->tos_stid))));
+
+	/*
+	 * We store the qid in opt2 which will be used by the firmware
+	 * to send us the wr response.
+	 */
+	req->tcb.opt2 = htonl(V_RSS_QUEUE(rss_qid));
+
+	/*
+	 * We initialize the MSS index in TCB to 0xF.
+	 * So that when driver sends cpl_pass_accept_rpl
+	 * TCB picks up the correct value. If this was 0
+	 * TP will ignore any value > 0 for MSS index.
+	 */
+	req->tcb.opt0 = cpu_to_be64(V_MSS_IDX(0xF));
+	req->cookie = (unsigned long)skb;
+
+	set_wr_txq(req_skb, CPL_PRIORITY_CONTROL, port_id);
+	ret = cxgb4_ofld_send(dev->rdev.lldi.ports[0], req_skb);
+	if (ret < 0) {
+		pr_err("%s - cxgb4_ofld_send error %d - dropping\n", __func__,
+		       ret);
+		kfree_skb(skb);
+		kfree_skb(req_skb);
+	}
+}
+
+/*
+ * Handler for CPL_RX_PKT message. Need to handle cpl_rx_pkt
+ * messages when a filter is being used instead of server to
+ * redirect a syn packet. When packets hit filter they are redirected
+ * to the offload queue and driver tries to establish the connection
+ * using firmware work request.
+ */
+static int rx_pkt(struct c4iw_dev *dev, struct sk_buff *skb)
+{
+	int stid;
+	unsigned int filter;
+	struct ethhdr *eh = NULL;
+	struct vlan_ethhdr *vlan_eh = NULL;
+	struct iphdr *iph;
+	struct tcphdr *tcph;
+	struct rss_header *rss = (void *)skb->data;
+	struct cpl_rx_pkt *cpl = (void *)skb->data;
+	struct cpl_pass_accept_req *req = (void *)(rss + 1);
+	struct l2t_entry *e;
+	struct dst_entry *dst;
+	struct c4iw_ep *lep;
+	u16 window;
+	struct port_info *pi;
+	struct net_device *pdev;
+	u16 rss_qid, eth_hdr_len;
+	int step;
+	u32 tx_chan;
+	struct neighbour *neigh;
+
+	/* Drop all non-SYN packets */
+	if (!(cpl->l2info & cpu_to_be32(F_RXF_SYN)))
+		goto reject;
+
+	/*
+	 * Drop all packets which did not hit the filter.
+	 * Unlikely to happen.
+	 */
+	if (!(rss->filter_hit && rss->filter_tid))
+		goto reject;
+
+	/*
+	 * Calculate the server tid from filter hit index from cpl_rx_pkt.
+	 */
+	stid = (__force int) cpu_to_be32((__force u32) rss->hash_val);
+
+	lep = (struct c4iw_ep *)lookup_stid(dev->rdev.lldi.tids, stid);
+	if (!lep) {
+		PDBG("%s connect request on invalid stid %d\n", __func__, stid);
+		goto reject;
+	}
+
+	eth_hdr_len = is_t4(dev->rdev.lldi.adapter_type) ?
+			    G_RX_ETHHDR_LEN(htonl(cpl->l2info)) :
+			    G_RX_T5_ETHHDR_LEN(htonl(cpl->l2info));
+	if (eth_hdr_len == ETH_HLEN) {
+		eh = (struct ethhdr *)(req + 1);
+		iph = (struct iphdr *)(eh + 1);
+	} else {
+		vlan_eh = (struct vlan_ethhdr *)(req + 1);
+		iph = (struct iphdr *)(vlan_eh + 1);
+		skb->vlan_tci = ntohs(cpl->vlan);
+	}
+
+	if (iph->version != 0x4)
+		goto reject;
+
+	tcph = (struct tcphdr *)(iph + 1);
+	skb_set_network_header(skb, (void *)iph - (void *)rss);
+	skb_set_transport_header(skb, (void *)tcph - (void *)rss);
+	skb_get(skb);
+
+	PDBG("%s lip 0x%x lport %u pip 0x%x pport %u tos %d\n", __func__,
+	     ntohl(iph->daddr), ntohs(tcph->dest), ntohl(iph->saddr),
+	     ntohs(tcph->source), iph->tos);
+
+	dst = find_route(dev, iph->daddr, iph->saddr, tcph->dest, tcph->source,
+			 iph->tos);
+	if (!dst) {
+		pr_err("%s - failed to find dst entry!\n",
+		       __func__);
+		goto reject;
+	}
+	neigh = dst_neigh_lookup_skb(dst, skb);
+
+	if (!neigh) {
+		pr_err("%s - failed to allocate neigh!\n",
+		       __func__);
+		goto free_dst;
+	}
+
+	if (neigh->dev->flags & IFF_LOOPBACK) {
+		pdev = ip_dev_find(&init_net, iph->daddr);
+		e = cxgb4_l2t_get(dev->rdev.lldi.l2t, neigh,
+				    pdev, 0);
+		pi = (struct port_info *)netdev_priv(pdev);
+		tx_chan = cxgb4_port_chan(pdev);
+		dev_put(pdev);
+	} else {
+		pdev = get_real_dev(neigh->dev);
+		e = cxgb4_l2t_get(dev->rdev.lldi.l2t, neigh,
+					pdev, 0);
+		pi = (struct port_info *)netdev_priv(pdev);
+		tx_chan = cxgb4_port_chan(pdev);
+	}
+	neigh_release(neigh);
+	if (!e) {
+		pr_err("%s - failed to allocate l2t entry!\n",
+		       __func__);
+		goto free_dst;
+	}
+
+	step = dev->rdev.lldi.nrxq / dev->rdev.lldi.nchan;
+	rss_qid = dev->rdev.lldi.rxq_ids[pi->port_id * step];
+	window = (__force u16) htons((__force u16)tcph->window);
+
+	/* Calcuate filter portion for LE region. */
+	filter = (__force unsigned int) cpu_to_be32(cxgb4_select_ntuple(
+						    dev->rdev.lldi.ports[0],
+						    e));
+
+	/*
+	 * Synthesize the cpl_pass_accept_req. We have everything except the
+	 * TID. Once firmware sends a reply with TID we update the TID field
+	 * in cpl and pass it through the regular cpl_pass_accept_req path.
+	 */
+	build_cpl_pass_accept_req(skb, stid, iph->tos);
+	send_fw_pass_open_req(dev, skb, iph->daddr, tcph->dest, iph->saddr,
+			      tcph->source, ntohl(tcph->seq), filter, window,
+			      rss_qid, pi->port_id);
+	cxgb4_l2t_release(e);
+free_dst:
+	dst_release(dst);
+reject:
+	return 0;
+}
+
+/*
+ * These are the real handlers that are called from a
+ * work queue.
+ */
+static c4iw_handler_func work_handlers[NUM_CPL_CMDS] = {
+	[CPL_ACT_ESTABLISH] = act_establish,
+	[CPL_ACT_OPEN_RPL] = act_open_rpl,
+	[CPL_RX_DATA] = rx_data,
+	[CPL_ABORT_RPL_RSS] = abort_rpl,
+	[CPL_ABORT_RPL] = abort_rpl,
+	[CPL_PASS_OPEN_RPL] = pass_open_rpl,
+	[CPL_CLOSE_LISTSRV_RPL] = close_listsrv_rpl,
+	[CPL_PASS_ACCEPT_REQ] = pass_accept_req,
+	[CPL_PASS_ESTABLISH] = pass_establish,
+	[CPL_PEER_CLOSE] = peer_close,
+	[CPL_ABORT_REQ_RSS] = peer_abort,
+	[CPL_CLOSE_CON_RPL] = close_con_rpl,
+	[CPL_RDMA_TERMINATE] = terminate,
+	[CPL_FW4_ACK] = fw4_ack,
+	[CPL_FW6_MSG] = deferred_fw6_msg,
+	[CPL_RX_PKT] = rx_pkt
+};
+
+static void process_timeout(struct c4iw_ep *ep)
+{
+	struct c4iw_qp_attributes attrs;
+	int abort = 1;
+
+	mutex_lock(&ep->com.mutex);
+	PDBG("%s ep %p tid %u state %d\n", __func__, ep, ep->hwtid,
+	     ep->com.state);
+	set_bit(TIMEDOUT, &ep->com.history);
+	switch (ep->com.state) {
+	case MPA_REQ_SENT:
+		__state_set(&ep->com, ABORTING);
+		connect_reply_upcall(ep, -ETIMEDOUT);
+		break;
+	case MPA_REQ_WAIT:
+		__state_set(&ep->com, ABORTING);
+		break;
+	case CLOSING:
+	case MORIBUND:
+		if (ep->com.cm_id && ep->com.qp) {
+			attrs.next_state = C4IW_QP_STATE_ERROR;
+			c4iw_modify_qp(ep->com.qp->rhp,
+				     ep->com.qp, C4IW_QP_ATTR_NEXT_STATE,
+				     &attrs, 1);
+		}
+		__state_set(&ep->com, ABORTING);
+		close_complete_upcall(ep, -ETIMEDOUT);
+		break;
+	case ABORTING:
+	case DEAD:
+
+		/*
+		 * These states are expected if the ep timed out at the same
+		 * time as another thread was calling stop_ep_timer().
+		 * So we silently do nothing for these states.
+		 */
+		abort = 0;
+		break;
+	default:
+		WARN(1, "%s unexpected state ep %p tid %u state %u\n",
+			__func__, ep, ep->hwtid, ep->com.state);
+		abort = 0;
+	}
+	if (abort)
+		abort_connection(ep, NULL, GFP_KERNEL);
+	mutex_unlock(&ep->com.mutex);
+	c4iw_put_ep(&ep->com);
+}
+
+static void process_timedout_eps(void)
+{
+	struct c4iw_ep *ep;
+
+	spin_lock_irq(&timeout_lock);
+	while (!list_empty(&timeout_list)) {
+		struct list_head *tmp;
+
+		tmp = timeout_list.next;
+		list_del(tmp);
+		tmp->next = NULL;
+		tmp->prev = NULL;
+		spin_unlock_irq(&timeout_lock);
+		ep = list_entry(tmp, struct c4iw_ep, entry);
+		process_timeout(ep);
+		spin_lock_irq(&timeout_lock);
+	}
+	spin_unlock_irq(&timeout_lock);
+}
+
+static void process_work(struct work_struct *work)
+{
+	struct sk_buff *skb = NULL;
+	struct c4iw_dev *dev;
+	struct cpl_act_establish *rpl;
+	unsigned int opcode;
+	int ret;
+
+	process_timedout_eps();
+	while ((skb = skb_dequeue(&rxq))) {
+		rpl = cplhdr(skb);
+		dev = *((struct c4iw_dev **) (skb->cb + sizeof(void *)));
+		opcode = rpl->ot.opcode;
+
+		BUG_ON(!work_handlers[opcode]);
+		ret = work_handlers[opcode](dev, skb);
+		if (!ret)
+			kfree_skb(skb);
+		process_timedout_eps();
+	}
+}
+
+static DECLARE_WORK(skb_work, process_work);
+
+static void ep_timeout(unsigned long arg)
+{
+	struct c4iw_ep *ep = (struct c4iw_ep *)arg;
+	int kickit = 0;
+
+	spin_lock(&timeout_lock);
+	if (!test_and_set_bit(TIMEOUT, &ep->com.flags)) {
+		/*
+		 * Only insert if it is not already on the list.
+		 */
+		if (!ep->entry.next) {
+			list_add_tail(&ep->entry, &timeout_list);
+			kickit = 1;
+		}
+	}
+	spin_unlock(&timeout_lock);
+	if (kickit)
+		queue_work(workq, &skb_work);
+}
+
+/*
+ * All the CM events are handled on a work queue to have a safe context.
+ */
+static int sched(struct c4iw_dev *dev, struct sk_buff *skb)
+{
+
+	/*
+	 * Save dev in the skb->cb area.
+	 */
+	*((struct c4iw_dev **) (skb->cb + sizeof(void *))) = dev;
+
+	/*
+	 * Queue the skb and schedule the worker thread.
+	 */
+	skb_queue_tail(&rxq, skb);
+	queue_work(workq, &skb_work);
+	return 0;
+}
+
+static int set_tcb_rpl(struct c4iw_dev *dev, struct sk_buff *skb)
+{
+	struct cpl_set_tcb_rpl *rpl = cplhdr(skb);
+
+	if (rpl->status != CPL_ERR_NONE) {
+		printk(KERN_ERR MOD "Unexpected SET_TCB_RPL status %u "
+		       "for tid %u\n", rpl->status, GET_TID(rpl));
+	}
+	kfree_skb(skb);
+	return 0;
+}
+
+static int fw6_msg(struct c4iw_dev *dev, struct sk_buff *skb)
+{
+	struct cpl_fw6_msg *rpl = cplhdr(skb);
+	struct c4iw_wr_wait *wr_waitp;
+	int ret;
+
+	PDBG("%s type %u\n", __func__, rpl->type);
+
+	switch (rpl->type) {
+	case FW6_TYPE_WR_RPL:
+		ret = (int)((be64_to_cpu(rpl->data[0]) >> 8) & 0xff);
+		wr_waitp = (struct c4iw_wr_wait *)(__force unsigned long) rpl->data[1];
+		PDBG("%s wr_waitp %p ret %u\n", __func__, wr_waitp, ret);
+		if (wr_waitp)
+			c4iw_wake_up(wr_waitp, ret ? -ret : 0);
+		kfree_skb(skb);
+		break;
+	case FW6_TYPE_CQE:
+	case FW6_TYPE_OFLD_CONNECTION_WR_RPL:
+		sched(dev, skb);
+		break;
+	default:
+		printk(KERN_ERR MOD "%s unexpected fw6 msg type %u\n", __func__,
+		       rpl->type);
+		kfree_skb(skb);
+		break;
+	}
+	return 0;
+}
+
+static int peer_abort_intr(struct c4iw_dev *dev, struct sk_buff *skb)
+{
+	struct cpl_abort_req_rss *req = cplhdr(skb);
+	struct c4iw_ep *ep;
+	struct tid_info *t = dev->rdev.lldi.tids;
+	unsigned int tid = GET_TID(req);
+
+	ep = lookup_tid(t, tid);
+	if (!ep) {
+		printk(KERN_WARNING MOD
+		       "Abort on non-existent endpoint, tid %d\n", tid);
+		kfree_skb(skb);
+		return 0;
+	}
+	if (is_neg_adv(req->status)) {
+		dev_warn(&dev->rdev.lldi.pdev->dev,
+			 "Negative advice on abort - tid %u status %d (%s)\n",
+			 ep->hwtid, req->status, neg_adv_str(req->status));
+		kfree_skb(skb);
+		return 0;
+	}
+	PDBG("%s ep %p tid %u state %u\n", __func__, ep, ep->hwtid,
+	     ep->com.state);
+
+	/*
+	 * Wake up any threads in rdma_init() or rdma_fini().
+	 * However, if we are on MPAv2 and want to retry with MPAv1
+	 * then, don't wake up yet.
+	 */
+	if (mpa_rev == 2 && !ep->tried_with_mpa_v1) {
+		if (ep->com.state != MPA_REQ_SENT)
+			c4iw_wake_up(&ep->com.wr_wait, -ECONNRESET);
+	} else
+		c4iw_wake_up(&ep->com.wr_wait, -ECONNRESET);
+	sched(dev, skb);
+	return 0;
+}
+
+/*
+ * Most upcalls from the T4 Core go to sched() to
+ * schedule the processing on a work queue.
+ */
+c4iw_handler_func c4iw_handlers[NUM_CPL_CMDS] = {
+	[CPL_ACT_ESTABLISH] = sched,
+	[CPL_ACT_OPEN_RPL] = sched,
+	[CPL_RX_DATA] = sched,
+	[CPL_ABORT_RPL_RSS] = sched,
+	[CPL_ABORT_RPL] = sched,
+	[CPL_PASS_OPEN_RPL] = sched,
+	[CPL_CLOSE_LISTSRV_RPL] = sched,
+	[CPL_PASS_ACCEPT_REQ] = sched,
+	[CPL_PASS_ESTABLISH] = sched,
+	[CPL_PEER_CLOSE] = sched,
+	[CPL_CLOSE_CON_RPL] = sched,
+	[CPL_ABORT_REQ_RSS] = peer_abort_intr,
+	[CPL_RDMA_TERMINATE] = sched,
+	[CPL_FW4_ACK] = sched,
+	[CPL_SET_TCB_RPL] = set_tcb_rpl,
+	[CPL_FW6_MSG] = fw6_msg,
+	[CPL_RX_PKT] = sched
+};
+
+int __init c4iw_cm_init(void)
+{
+	spin_lock_init(&timeout_lock);
+	skb_queue_head_init(&rxq);
+
+	workq = create_singlethread_workqueue("iw_cxgb4");
+	if (!workq)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void c4iw_cm_term(void)
+{
+	WARN_ON(!list_empty(&timeout_list));
+	flush_workqueue(workq);
+	destroy_workqueue(workq);
+}
diff --git a/drivers/infiniband/hw/cxgb4/cq.c b/drivers/infiniband/hw/cxgb4/cq.c
new file mode 100644
index 0000000..0f773e7
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb4/cq.c
@@ -0,0 +1,1009 @@
+/*
+ * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer in the documentation and/or other materials
+ *	  provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "iw_cxgb4.h"
+
+static int destroy_cq(struct c4iw_rdev *rdev, struct t4_cq *cq,
+		      struct c4iw_dev_ucontext *uctx)
+{
+	struct fw_ri_res_wr *res_wr;
+	struct fw_ri_res *res;
+	int wr_len;
+	struct c4iw_wr_wait wr_wait;
+	struct sk_buff *skb;
+	int ret;
+
+	wr_len = sizeof *res_wr + sizeof *res;
+	skb = alloc_skb(wr_len, GFP_KERNEL);
+	if (!skb)
+		return -ENOMEM;
+	set_wr_txq(skb, CPL_PRIORITY_CONTROL, 0);
+
+	res_wr = (struct fw_ri_res_wr *)__skb_put(skb, wr_len);
+	memset(res_wr, 0, wr_len);
+	res_wr->op_nres = cpu_to_be32(
+			FW_WR_OP(FW_RI_RES_WR) |
+			V_FW_RI_RES_WR_NRES(1) |
+			FW_WR_COMPL(1));
+	res_wr->len16_pkd = cpu_to_be32(DIV_ROUND_UP(wr_len, 16));
+	res_wr->cookie = (unsigned long) &wr_wait;
+	res = res_wr->res;
+	res->u.cq.restype = FW_RI_RES_TYPE_CQ;
+	res->u.cq.op = FW_RI_RES_OP_RESET;
+	res->u.cq.iqid = cpu_to_be32(cq->cqid);
+
+	c4iw_init_wr_wait(&wr_wait);
+	ret = c4iw_ofld_send(rdev, skb);
+	if (!ret) {
+		ret = c4iw_wait_for_reply(rdev, &wr_wait, 0, 0, __func__);
+	}
+
+	kfree(cq->sw_queue);
+	dma_free_coherent(&(rdev->lldi.pdev->dev),
+			  cq->memsize, cq->queue,
+			  dma_unmap_addr(cq, mapping));
+	c4iw_put_cqid(rdev, cq->cqid, uctx);
+	return ret;
+}
+
+static int create_cq(struct c4iw_rdev *rdev, struct t4_cq *cq,
+		     struct c4iw_dev_ucontext *uctx)
+{
+	struct fw_ri_res_wr *res_wr;
+	struct fw_ri_res *res;
+	int wr_len;
+	int user = (uctx != &rdev->uctx);
+	struct c4iw_wr_wait wr_wait;
+	int ret;
+	struct sk_buff *skb;
+
+	cq->cqid = c4iw_get_cqid(rdev, uctx);
+	if (!cq->cqid) {
+		ret = -ENOMEM;
+		goto err1;
+	}
+
+	if (!user) {
+		cq->sw_queue = kzalloc(cq->memsize, GFP_KERNEL);
+		if (!cq->sw_queue) {
+			ret = -ENOMEM;
+			goto err2;
+		}
+	}
+	cq->queue = dma_alloc_coherent(&rdev->lldi.pdev->dev, cq->memsize,
+				       &cq->dma_addr, GFP_KERNEL);
+	if (!cq->queue) {
+		ret = -ENOMEM;
+		goto err3;
+	}
+	dma_unmap_addr_set(cq, mapping, cq->dma_addr);
+	memset(cq->queue, 0, cq->memsize);
+
+	/* build fw_ri_res_wr */
+	wr_len = sizeof *res_wr + sizeof *res;
+
+	skb = alloc_skb(wr_len, GFP_KERNEL);
+	if (!skb) {
+		ret = -ENOMEM;
+		goto err4;
+	}
+	set_wr_txq(skb, CPL_PRIORITY_CONTROL, 0);
+
+	res_wr = (struct fw_ri_res_wr *)__skb_put(skb, wr_len);
+	memset(res_wr, 0, wr_len);
+	res_wr->op_nres = cpu_to_be32(
+			FW_WR_OP(FW_RI_RES_WR) |
+			V_FW_RI_RES_WR_NRES(1) |
+			FW_WR_COMPL(1));
+	res_wr->len16_pkd = cpu_to_be32(DIV_ROUND_UP(wr_len, 16));
+	res_wr->cookie = (unsigned long) &wr_wait;
+	res = res_wr->res;
+	res->u.cq.restype = FW_RI_RES_TYPE_CQ;
+	res->u.cq.op = FW_RI_RES_OP_WRITE;
+	res->u.cq.iqid = cpu_to_be32(cq->cqid);
+	res->u.cq.iqandst_to_iqandstindex = cpu_to_be32(
+			V_FW_RI_RES_WR_IQANUS(0) |
+			V_FW_RI_RES_WR_IQANUD(1) |
+			F_FW_RI_RES_WR_IQANDST |
+			V_FW_RI_RES_WR_IQANDSTINDEX(
+				rdev->lldi.ciq_ids[cq->vector]));
+	res->u.cq.iqdroprss_to_iqesize = cpu_to_be16(
+			F_FW_RI_RES_WR_IQDROPRSS |
+			V_FW_RI_RES_WR_IQPCIECH(2) |
+			V_FW_RI_RES_WR_IQINTCNTTHRESH(0) |
+			F_FW_RI_RES_WR_IQO |
+			V_FW_RI_RES_WR_IQESIZE(1));
+	res->u.cq.iqsize = cpu_to_be16(cq->size);
+	res->u.cq.iqaddr = cpu_to_be64(cq->dma_addr);
+
+	c4iw_init_wr_wait(&wr_wait);
+
+	ret = c4iw_ofld_send(rdev, skb);
+	if (ret)
+		goto err4;
+	PDBG("%s wait_event wr_wait %p\n", __func__, &wr_wait);
+	ret = c4iw_wait_for_reply(rdev, &wr_wait, 0, 0, __func__);
+	if (ret)
+		goto err4;
+
+	cq->gen = 1;
+	cq->gts = rdev->lldi.gts_reg;
+	cq->rdev = rdev;
+	if (user) {
+		cq->ugts = (u64)pci_resource_start(rdev->lldi.pdev, 2) +
+					(cq->cqid << rdev->cqshift);
+		cq->ugts &= PAGE_MASK;
+	}
+	return 0;
+err4:
+	dma_free_coherent(&rdev->lldi.pdev->dev, cq->memsize, cq->queue,
+			  dma_unmap_addr(cq, mapping));
+err3:
+	kfree(cq->sw_queue);
+err2:
+	c4iw_put_cqid(rdev, cq->cqid, uctx);
+err1:
+	return ret;
+}
+
+static void insert_recv_cqe(struct t4_wq *wq, struct t4_cq *cq)
+{
+	struct t4_cqe cqe;
+
+	PDBG("%s wq %p cq %p sw_cidx %u sw_pidx %u\n", __func__,
+	     wq, cq, cq->sw_cidx, cq->sw_pidx);
+	memset(&cqe, 0, sizeof(cqe));
+	cqe.header = cpu_to_be32(V_CQE_STATUS(T4_ERR_SWFLUSH) |
+				 V_CQE_OPCODE(FW_RI_SEND) |
+				 V_CQE_TYPE(0) |
+				 V_CQE_SWCQE(1) |
+				 V_CQE_QPID(wq->sq.qid));
+	cqe.bits_type_ts = cpu_to_be64(V_CQE_GENBIT((u64)cq->gen));
+	cq->sw_queue[cq->sw_pidx] = cqe;
+	t4_swcq_produce(cq);
+}
+
+int c4iw_flush_rq(struct t4_wq *wq, struct t4_cq *cq, int count)
+{
+	int flushed = 0;
+	int in_use = wq->rq.in_use - count;
+
+	BUG_ON(in_use < 0);
+	PDBG("%s wq %p cq %p rq.in_use %u skip count %u\n", __func__,
+	     wq, cq, wq->rq.in_use, count);
+	while (in_use--) {
+		insert_recv_cqe(wq, cq);
+		flushed++;
+	}
+	return flushed;
+}
+
+static void insert_sq_cqe(struct t4_wq *wq, struct t4_cq *cq,
+			  struct t4_swsqe *swcqe)
+{
+	struct t4_cqe cqe;
+
+	PDBG("%s wq %p cq %p sw_cidx %u sw_pidx %u\n", __func__,
+	     wq, cq, cq->sw_cidx, cq->sw_pidx);
+	memset(&cqe, 0, sizeof(cqe));
+	cqe.header = cpu_to_be32(V_CQE_STATUS(T4_ERR_SWFLUSH) |
+				 V_CQE_OPCODE(swcqe->opcode) |
+				 V_CQE_TYPE(1) |
+				 V_CQE_SWCQE(1) |
+				 V_CQE_QPID(wq->sq.qid));
+	CQE_WRID_SQ_IDX(&cqe) = swcqe->idx;
+	cqe.bits_type_ts = cpu_to_be64(V_CQE_GENBIT((u64)cq->gen));
+	cq->sw_queue[cq->sw_pidx] = cqe;
+	t4_swcq_produce(cq);
+}
+
+static void advance_oldest_read(struct t4_wq *wq);
+
+int c4iw_flush_sq(struct c4iw_qp *qhp)
+{
+	int flushed = 0;
+	struct t4_wq *wq = &qhp->wq;
+	struct c4iw_cq *chp = to_c4iw_cq(qhp->ibqp.send_cq);
+	struct t4_cq *cq = &chp->cq;
+	int idx;
+	struct t4_swsqe *swsqe;
+
+	if (wq->sq.flush_cidx == -1)
+		wq->sq.flush_cidx = wq->sq.cidx;
+	idx = wq->sq.flush_cidx;
+	BUG_ON(idx >= wq->sq.size);
+	while (idx != wq->sq.pidx) {
+		swsqe = &wq->sq.sw_sq[idx];
+		BUG_ON(swsqe->flushed);
+		swsqe->flushed = 1;
+		insert_sq_cqe(wq, cq, swsqe);
+		if (wq->sq.oldest_read == swsqe) {
+			BUG_ON(swsqe->opcode != FW_RI_READ_REQ);
+			advance_oldest_read(wq);
+		}
+		flushed++;
+		if (++idx == wq->sq.size)
+			idx = 0;
+	}
+	wq->sq.flush_cidx += flushed;
+	if (wq->sq.flush_cidx >= wq->sq.size)
+		wq->sq.flush_cidx -= wq->sq.size;
+	return flushed;
+}
+
+static void flush_completed_wrs(struct t4_wq *wq, struct t4_cq *cq)
+{
+	struct t4_swsqe *swsqe;
+	int cidx;
+
+	if (wq->sq.flush_cidx == -1)
+		wq->sq.flush_cidx = wq->sq.cidx;
+	cidx = wq->sq.flush_cidx;
+	BUG_ON(cidx > wq->sq.size);
+
+	while (cidx != wq->sq.pidx) {
+		swsqe = &wq->sq.sw_sq[cidx];
+		if (!swsqe->signaled) {
+			if (++cidx == wq->sq.size)
+				cidx = 0;
+		} else if (swsqe->complete) {
+
+			BUG_ON(swsqe->flushed);
+
+			/*
+			 * Insert this completed cqe into the swcq.
+			 */
+			PDBG("%s moving cqe into swcq sq idx %u cq idx %u\n",
+					__func__, cidx, cq->sw_pidx);
+			swsqe->cqe.header |= htonl(V_CQE_SWCQE(1));
+			cq->sw_queue[cq->sw_pidx] = swsqe->cqe;
+			t4_swcq_produce(cq);
+			swsqe->flushed = 1;
+			if (++cidx == wq->sq.size)
+				cidx = 0;
+			wq->sq.flush_cidx = cidx;
+		} else
+			break;
+	}
+}
+
+static void create_read_req_cqe(struct t4_wq *wq, struct t4_cqe *hw_cqe,
+		struct t4_cqe *read_cqe)
+{
+	read_cqe->u.scqe.cidx = wq->sq.oldest_read->idx;
+	read_cqe->len = htonl(wq->sq.oldest_read->read_len);
+	read_cqe->header = htonl(V_CQE_QPID(CQE_QPID(hw_cqe)) |
+			V_CQE_SWCQE(SW_CQE(hw_cqe)) |
+			V_CQE_OPCODE(FW_RI_READ_REQ) |
+			V_CQE_TYPE(1));
+	read_cqe->bits_type_ts = hw_cqe->bits_type_ts;
+}
+
+static void advance_oldest_read(struct t4_wq *wq)
+{
+
+	u32 rptr = wq->sq.oldest_read - wq->sq.sw_sq + 1;
+
+	if (rptr == wq->sq.size)
+		rptr = 0;
+	while (rptr != wq->sq.pidx) {
+		wq->sq.oldest_read = &wq->sq.sw_sq[rptr];
+
+		if (wq->sq.oldest_read->opcode == FW_RI_READ_REQ)
+			return;
+		if (++rptr == wq->sq.size)
+			rptr = 0;
+	}
+	wq->sq.oldest_read = NULL;
+}
+
+/*
+ * Move all CQEs from the HWCQ into the SWCQ.
+ * Deal with out-of-order and/or completions that complete
+ * prior unsignalled WRs.
+ */
+void c4iw_flush_hw_cq(struct c4iw_cq *chp)
+{
+	struct t4_cqe *hw_cqe, *swcqe, read_cqe;
+	struct c4iw_qp *qhp;
+	struct t4_swsqe *swsqe;
+	int ret;
+
+	PDBG("%s  cqid 0x%x\n", __func__, chp->cq.cqid);
+	ret = t4_next_hw_cqe(&chp->cq, &hw_cqe);
+
+	/*
+	 * This logic is similar to poll_cq(), but not quite the same
+	 * unfortunately.  Need to move pertinent HW CQEs to the SW CQ but
+	 * also do any translation magic that poll_cq() normally does.
+	 */
+	while (!ret) {
+		qhp = get_qhp(chp->rhp, CQE_QPID(hw_cqe));
+
+		/*
+		 * drop CQEs with no associated QP
+		 */
+		if (qhp == NULL)
+			goto next_cqe;
+
+		if (CQE_OPCODE(hw_cqe) == FW_RI_TERMINATE)
+			goto next_cqe;
+
+		if (CQE_OPCODE(hw_cqe) == FW_RI_READ_RESP) {
+
+			/* If we have reached here because of async
+			 * event or other error, and have egress error
+			 * then drop
+			 */
+			if (CQE_TYPE(hw_cqe) == 1)
+				goto next_cqe;
+
+			/* drop peer2peer RTR reads.
+			 */
+			if (CQE_WRID_STAG(hw_cqe) == 1)
+				goto next_cqe;
+
+			/*
+			 * Eat completions for unsignaled read WRs.
+			 */
+			if (!qhp->wq.sq.oldest_read->signaled) {
+				advance_oldest_read(&qhp->wq);
+				goto next_cqe;
+			}
+
+			/*
+			 * Don't write to the HWCQ, create a new read req CQE
+			 * in local memory and move it into the swcq.
+			 */
+			create_read_req_cqe(&qhp->wq, hw_cqe, &read_cqe);
+			hw_cqe = &read_cqe;
+			advance_oldest_read(&qhp->wq);
+		}
+
+		/* if its a SQ completion, then do the magic to move all the
+		 * unsignaled and now in-order completions into the swcq.
+		 */
+		if (SQ_TYPE(hw_cqe)) {
+			swsqe = &qhp->wq.sq.sw_sq[CQE_WRID_SQ_IDX(hw_cqe)];
+			swsqe->cqe = *hw_cqe;
+			swsqe->complete = 1;
+			flush_completed_wrs(&qhp->wq, &chp->cq);
+		} else {
+			swcqe = &chp->cq.sw_queue[chp->cq.sw_pidx];
+			*swcqe = *hw_cqe;
+			swcqe->header |= cpu_to_be32(V_CQE_SWCQE(1));
+			t4_swcq_produce(&chp->cq);
+		}
+next_cqe:
+		t4_hwcq_consume(&chp->cq);
+		ret = t4_next_hw_cqe(&chp->cq, &hw_cqe);
+	}
+}
+
+static int cqe_completes_wr(struct t4_cqe *cqe, struct t4_wq *wq)
+{
+	if (CQE_OPCODE(cqe) == FW_RI_TERMINATE)
+		return 0;
+
+	if ((CQE_OPCODE(cqe) == FW_RI_RDMA_WRITE) && RQ_TYPE(cqe))
+		return 0;
+
+	if ((CQE_OPCODE(cqe) == FW_RI_READ_RESP) && SQ_TYPE(cqe))
+		return 0;
+
+	if (CQE_SEND_OPCODE(cqe) && RQ_TYPE(cqe) && t4_rq_empty(wq))
+		return 0;
+	return 1;
+}
+
+void c4iw_count_rcqes(struct t4_cq *cq, struct t4_wq *wq, int *count)
+{
+	struct t4_cqe *cqe;
+	u32 ptr;
+
+	*count = 0;
+	PDBG("%s count zero %d\n", __func__, *count);
+	ptr = cq->sw_cidx;
+	while (ptr != cq->sw_pidx) {
+		cqe = &cq->sw_queue[ptr];
+		if (RQ_TYPE(cqe) && (CQE_OPCODE(cqe) != FW_RI_READ_RESP) &&
+		    (CQE_QPID(cqe) == wq->sq.qid) && cqe_completes_wr(cqe, wq))
+			(*count)++;
+		if (++ptr == cq->size)
+			ptr = 0;
+	}
+	PDBG("%s cq %p count %d\n", __func__, cq, *count);
+}
+
+/*
+ * poll_cq
+ *
+ * Caller must:
+ *     check the validity of the first CQE,
+ *     supply the wq assicated with the qpid.
+ *
+ * credit: cq credit to return to sge.
+ * cqe_flushed: 1 iff the CQE is flushed.
+ * cqe: copy of the polled CQE.
+ *
+ * return value:
+ *    0		    CQE returned ok.
+ *    -EAGAIN       CQE skipped, try again.
+ *    -EOVERFLOW    CQ overflow detected.
+ */
+static int poll_cq(struct t4_wq *wq, struct t4_cq *cq, struct t4_cqe *cqe,
+		   u8 *cqe_flushed, u64 *cookie, u32 *credit)
+{
+	int ret = 0;
+	struct t4_cqe *hw_cqe, read_cqe;
+
+	*cqe_flushed = 0;
+	*credit = 0;
+	ret = t4_next_cqe(cq, &hw_cqe);
+	if (ret)
+		return ret;
+
+	PDBG("%s CQE OVF %u qpid 0x%0x genbit %u type %u status 0x%0x"
+	     " opcode 0x%0x len 0x%0x wrid_hi_stag 0x%x wrid_low_msn 0x%x\n",
+	     __func__, CQE_OVFBIT(hw_cqe), CQE_QPID(hw_cqe),
+	     CQE_GENBIT(hw_cqe), CQE_TYPE(hw_cqe), CQE_STATUS(hw_cqe),
+	     CQE_OPCODE(hw_cqe), CQE_LEN(hw_cqe), CQE_WRID_HI(hw_cqe),
+	     CQE_WRID_LOW(hw_cqe));
+
+	/*
+	 * skip cqe's not affiliated with a QP.
+	 */
+	if (wq == NULL) {
+		ret = -EAGAIN;
+		goto skip_cqe;
+	}
+
+	/*
+	* skip hw cqe's if the wq is flushed.
+	*/
+	if (wq->flushed && !SW_CQE(hw_cqe)) {
+		ret = -EAGAIN;
+		goto skip_cqe;
+	}
+
+	/*
+	 * skip TERMINATE cqes...
+	 */
+	if (CQE_OPCODE(hw_cqe) == FW_RI_TERMINATE) {
+		ret = -EAGAIN;
+		goto skip_cqe;
+	}
+
+	/*
+	 * Gotta tweak READ completions:
+	 *	1) the cqe doesn't contain the sq_wptr from the wr.
+	 *	2) opcode not reflected from the wr.
+	 *	3) read_len not reflected from the wr.
+	 *	4) cq_type is RQ_TYPE not SQ_TYPE.
+	 */
+	if (RQ_TYPE(hw_cqe) && (CQE_OPCODE(hw_cqe) == FW_RI_READ_RESP)) {
+
+		/* If we have reached here because of async
+		 * event or other error, and have egress error
+		 * then drop
+		 */
+		if (CQE_TYPE(hw_cqe) == 1) {
+			if (CQE_STATUS(hw_cqe))
+				t4_set_wq_in_error(wq);
+			ret = -EAGAIN;
+			goto skip_cqe;
+		}
+
+		/* If this is an unsolicited read response, then the read
+		 * was generated by the kernel driver as part of peer-2-peer
+		 * connection setup.  So ignore the completion.
+		 */
+		if (CQE_WRID_STAG(hw_cqe) == 1) {
+			if (CQE_STATUS(hw_cqe))
+				t4_set_wq_in_error(wq);
+			ret = -EAGAIN;
+			goto skip_cqe;
+		}
+
+		/*
+		 * Eat completions for unsignaled read WRs.
+		 */
+		if (!wq->sq.oldest_read->signaled) {
+			advance_oldest_read(wq);
+			ret = -EAGAIN;
+			goto skip_cqe;
+		}
+
+		/*
+		 * Don't write to the HWCQ, so create a new read req CQE
+		 * in local memory.
+		 */
+		create_read_req_cqe(wq, hw_cqe, &read_cqe);
+		hw_cqe = &read_cqe;
+		advance_oldest_read(wq);
+	}
+
+	if (CQE_STATUS(hw_cqe) || t4_wq_in_error(wq)) {
+		*cqe_flushed = (CQE_STATUS(hw_cqe) == T4_ERR_SWFLUSH);
+		t4_set_wq_in_error(wq);
+	}
+
+	/*
+	 * RECV completion.
+	 */
+	if (RQ_TYPE(hw_cqe)) {
+
+		/*
+		 * HW only validates 4 bits of MSN.  So we must validate that
+		 * the MSN in the SEND is the next expected MSN.  If its not,
+		 * then we complete this with T4_ERR_MSN and mark the wq in
+		 * error.
+		 */
+
+		if (t4_rq_empty(wq)) {
+			t4_set_wq_in_error(wq);
+			ret = -EAGAIN;
+			goto skip_cqe;
+		}
+		if (unlikely((CQE_WRID_MSN(hw_cqe) != (wq->rq.msn)))) {
+			t4_set_wq_in_error(wq);
+			hw_cqe->header |= htonl(V_CQE_STATUS(T4_ERR_MSN));
+			goto proc_cqe;
+		}
+		goto proc_cqe;
+	}
+
+	/*
+	 * If we get here its a send completion.
+	 *
+	 * Handle out of order completion. These get stuffed
+	 * in the SW SQ. Then the SW SQ is walked to move any
+	 * now in-order completions into the SW CQ.  This handles
+	 * 2 cases:
+	 *	1) reaping unsignaled WRs when the first subsequent
+	 *	   signaled WR is completed.
+	 *	2) out of order read completions.
+	 */
+	if (!SW_CQE(hw_cqe) && (CQE_WRID_SQ_IDX(hw_cqe) != wq->sq.cidx)) {
+		struct t4_swsqe *swsqe;
+
+		PDBG("%s out of order completion going in sw_sq at idx %u\n",
+		     __func__, CQE_WRID_SQ_IDX(hw_cqe));
+		swsqe = &wq->sq.sw_sq[CQE_WRID_SQ_IDX(hw_cqe)];
+		swsqe->cqe = *hw_cqe;
+		swsqe->complete = 1;
+		ret = -EAGAIN;
+		goto flush_wq;
+	}
+
+proc_cqe:
+	*cqe = *hw_cqe;
+
+	/*
+	 * Reap the associated WR(s) that are freed up with this
+	 * completion.
+	 */
+	if (SQ_TYPE(hw_cqe)) {
+		int idx = CQE_WRID_SQ_IDX(hw_cqe);
+		BUG_ON(idx >= wq->sq.size);
+
+		/*
+		* Account for any unsignaled completions completed by
+		* this signaled completion.  In this case, cidx points
+		* to the first unsignaled one, and idx points to the
+		* signaled one.  So adjust in_use based on this delta.
+		* if this is not completing any unsigned wrs, then the
+		* delta will be 0. Handle wrapping also!
+		*/
+		if (idx < wq->sq.cidx)
+			wq->sq.in_use -= wq->sq.size + idx - wq->sq.cidx;
+		else
+			wq->sq.in_use -= idx - wq->sq.cidx;
+		BUG_ON(wq->sq.in_use <= 0 && wq->sq.in_use >= wq->sq.size);
+
+		wq->sq.cidx = (uint16_t)idx;
+		PDBG("%s completing sq idx %u\n", __func__, wq->sq.cidx);
+		*cookie = wq->sq.sw_sq[wq->sq.cidx].wr_id;
+		if (c4iw_wr_log)
+			c4iw_log_wr_stats(wq, hw_cqe);
+		t4_sq_consume(wq);
+	} else {
+		PDBG("%s completing rq idx %u\n", __func__, wq->rq.cidx);
+		*cookie = wq->rq.sw_rq[wq->rq.cidx].wr_id;
+		BUG_ON(t4_rq_empty(wq));
+		if (c4iw_wr_log)
+			c4iw_log_wr_stats(wq, hw_cqe);
+		t4_rq_consume(wq);
+		goto skip_cqe;
+	}
+
+flush_wq:
+	/*
+	 * Flush any completed cqes that are now in-order.
+	 */
+	flush_completed_wrs(wq, cq);
+
+skip_cqe:
+	if (SW_CQE(hw_cqe)) {
+		PDBG("%s cq %p cqid 0x%x skip sw cqe cidx %u\n",
+		     __func__, cq, cq->cqid, cq->sw_cidx);
+		t4_swcq_consume(cq);
+	} else {
+		PDBG("%s cq %p cqid 0x%x skip hw cqe cidx %u\n",
+		     __func__, cq, cq->cqid, cq->cidx);
+		t4_hwcq_consume(cq);
+	}
+	return ret;
+}
+
+/*
+ * Get one cq entry from c4iw and map it to openib.
+ *
+ * Returns:
+ *	0			cqe returned
+ *	-ENODATA		EMPTY;
+ *	-EAGAIN			caller must try again
+ *	any other -errno	fatal error
+ */
+static int c4iw_poll_cq_one(struct c4iw_cq *chp, struct ib_wc *wc)
+{
+	struct c4iw_qp *qhp = NULL;
+	struct t4_cqe uninitialized_var(cqe), *rd_cqe;
+	struct t4_wq *wq;
+	u32 credit = 0;
+	u8 cqe_flushed;
+	u64 cookie = 0;
+	int ret;
+
+	ret = t4_next_cqe(&chp->cq, &rd_cqe);
+
+	if (ret)
+		return ret;
+
+	qhp = get_qhp(chp->rhp, CQE_QPID(rd_cqe));
+	if (!qhp)
+		wq = NULL;
+	else {
+		spin_lock(&qhp->lock);
+		wq = &(qhp->wq);
+	}
+	ret = poll_cq(wq, &(chp->cq), &cqe, &cqe_flushed, &cookie, &credit);
+	if (ret)
+		goto out;
+
+	wc->wr_id = cookie;
+	wc->qp = &qhp->ibqp;
+	wc->vendor_err = CQE_STATUS(&cqe);
+	wc->wc_flags = 0;
+
+	PDBG("%s qpid 0x%x type %d opcode %d status 0x%x len %u wrid hi 0x%x "
+	     "lo 0x%x cookie 0x%llx\n", __func__, CQE_QPID(&cqe),
+	     CQE_TYPE(&cqe), CQE_OPCODE(&cqe), CQE_STATUS(&cqe), CQE_LEN(&cqe),
+	     CQE_WRID_HI(&cqe), CQE_WRID_LOW(&cqe), (unsigned long long)cookie);
+
+	if (CQE_TYPE(&cqe) == 0) {
+		if (!CQE_STATUS(&cqe))
+			wc->byte_len = CQE_LEN(&cqe);
+		else
+			wc->byte_len = 0;
+		wc->opcode = IB_WC_RECV;
+		if (CQE_OPCODE(&cqe) == FW_RI_SEND_WITH_INV ||
+		    CQE_OPCODE(&cqe) == FW_RI_SEND_WITH_SE_INV) {
+			wc->ex.invalidate_rkey = CQE_WRID_STAG(&cqe);
+			wc->wc_flags |= IB_WC_WITH_INVALIDATE;
+		}
+	} else {
+		switch (CQE_OPCODE(&cqe)) {
+		case FW_RI_RDMA_WRITE:
+			wc->opcode = IB_WC_RDMA_WRITE;
+			break;
+		case FW_RI_READ_REQ:
+			wc->opcode = IB_WC_RDMA_READ;
+			wc->byte_len = CQE_LEN(&cqe);
+			break;
+		case FW_RI_SEND_WITH_INV:
+		case FW_RI_SEND_WITH_SE_INV:
+			wc->opcode = IB_WC_SEND;
+			wc->wc_flags |= IB_WC_WITH_INVALIDATE;
+			break;
+		case FW_RI_SEND:
+		case FW_RI_SEND_WITH_SE:
+			wc->opcode = IB_WC_SEND;
+			break;
+		case FW_RI_BIND_MW:
+			wc->opcode = IB_WC_BIND_MW;
+			break;
+
+		case FW_RI_LOCAL_INV:
+			wc->opcode = IB_WC_LOCAL_INV;
+			break;
+		case FW_RI_FAST_REGISTER:
+			wc->opcode = IB_WC_FAST_REG_MR;
+			break;
+		default:
+			printk(KERN_ERR MOD "Unexpected opcode %d "
+			       "in the CQE received for QPID=0x%0x\n",
+			       CQE_OPCODE(&cqe), CQE_QPID(&cqe));
+			ret = -EINVAL;
+			goto out;
+		}
+	}
+
+	if (cqe_flushed)
+		wc->status = IB_WC_WR_FLUSH_ERR;
+	else {
+
+		switch (CQE_STATUS(&cqe)) {
+		case T4_ERR_SUCCESS:
+			wc->status = IB_WC_SUCCESS;
+			break;
+		case T4_ERR_STAG:
+			wc->status = IB_WC_LOC_ACCESS_ERR;
+			break;
+		case T4_ERR_PDID:
+			wc->status = IB_WC_LOC_PROT_ERR;
+			break;
+		case T4_ERR_QPID:
+		case T4_ERR_ACCESS:
+			wc->status = IB_WC_LOC_ACCESS_ERR;
+			break;
+		case T4_ERR_WRAP:
+			wc->status = IB_WC_GENERAL_ERR;
+			break;
+		case T4_ERR_BOUND:
+			wc->status = IB_WC_LOC_LEN_ERR;
+			break;
+		case T4_ERR_INVALIDATE_SHARED_MR:
+		case T4_ERR_INVALIDATE_MR_WITH_MW_BOUND:
+			wc->status = IB_WC_MW_BIND_ERR;
+			break;
+		case T4_ERR_CRC:
+		case T4_ERR_MARKER:
+		case T4_ERR_PDU_LEN_ERR:
+		case T4_ERR_OUT_OF_RQE:
+		case T4_ERR_DDP_VERSION:
+		case T4_ERR_RDMA_VERSION:
+		case T4_ERR_DDP_QUEUE_NUM:
+		case T4_ERR_MSN:
+		case T4_ERR_TBIT:
+		case T4_ERR_MO:
+		case T4_ERR_MSN_RANGE:
+		case T4_ERR_IRD_OVERFLOW:
+		case T4_ERR_OPCODE:
+		case T4_ERR_INTERNAL_ERR:
+			wc->status = IB_WC_FATAL_ERR;
+			break;
+		case T4_ERR_SWFLUSH:
+			wc->status = IB_WC_WR_FLUSH_ERR;
+			break;
+		default:
+			printk(KERN_ERR MOD
+			       "Unexpected cqe_status 0x%x for QPID=0x%0x\n",
+			       CQE_STATUS(&cqe), CQE_QPID(&cqe));
+			ret = -EINVAL;
+		}
+	}
+out:
+	if (wq)
+		spin_unlock(&qhp->lock);
+	return ret;
+}
+
+int c4iw_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc)
+{
+	struct c4iw_cq *chp;
+	unsigned long flags;
+	int npolled;
+	int err = 0;
+
+	chp = to_c4iw_cq(ibcq);
+
+	spin_lock_irqsave(&chp->lock, flags);
+	for (npolled = 0; npolled < num_entries; ++npolled) {
+		do {
+			err = c4iw_poll_cq_one(chp, wc + npolled);
+		} while (err == -EAGAIN);
+		if (err)
+			break;
+	}
+	spin_unlock_irqrestore(&chp->lock, flags);
+	return !err || err == -ENODATA ? npolled : err;
+}
+
+int c4iw_destroy_cq(struct ib_cq *ib_cq)
+{
+	struct c4iw_cq *chp;
+	struct c4iw_ucontext *ucontext;
+
+	PDBG("%s ib_cq %p\n", __func__, ib_cq);
+	chp = to_c4iw_cq(ib_cq);
+
+	remove_handle(chp->rhp, &chp->rhp->cqidr, chp->cq.cqid);
+	atomic_dec(&chp->refcnt);
+	wait_event(chp->wait, !atomic_read(&chp->refcnt));
+
+	ucontext = ib_cq->uobject ? to_c4iw_ucontext(ib_cq->uobject->context)
+				  : NULL;
+	destroy_cq(&chp->rhp->rdev, &chp->cq,
+		   ucontext ? &ucontext->uctx : &chp->cq.rdev->uctx);
+	kfree(chp);
+	return 0;
+}
+
+struct ib_cq *c4iw_create_cq(struct ib_device *ibdev, int entries,
+			     int vector, struct ib_ucontext *ib_context,
+			     struct ib_udata *udata)
+{
+	struct c4iw_dev *rhp;
+	struct c4iw_cq *chp;
+	struct c4iw_create_cq_resp uresp;
+	struct c4iw_ucontext *ucontext = NULL;
+	int ret;
+	size_t memsize, hwentries;
+	struct c4iw_mm_entry *mm, *mm2;
+
+	PDBG("%s ib_dev %p entries %d\n", __func__, ibdev, entries);
+
+	rhp = to_c4iw_dev(ibdev);
+
+	if (vector >= rhp->rdev.lldi.nciq)
+		return ERR_PTR(-EINVAL);
+
+	chp = kzalloc(sizeof(*chp), GFP_KERNEL);
+	if (!chp)
+		return ERR_PTR(-ENOMEM);
+
+	if (ib_context)
+		ucontext = to_c4iw_ucontext(ib_context);
+
+	/* account for the status page. */
+	entries++;
+
+	/* IQ needs one extra entry to differentiate full vs empty. */
+	entries++;
+
+	/*
+	 * entries must be multiple of 16 for HW.
+	 */
+	entries = roundup(entries, 16);
+
+	/*
+	 * Make actual HW queue 2x to avoid cdix_inc overflows.
+	 */
+	hwentries = min(entries * 2, rhp->rdev.hw_queue.t4_max_iq_size);
+
+	/*
+	 * Make HW queue at least 64 entries so GTS updates aren't too
+	 * frequent.
+	 */
+	if (hwentries < 64)
+		hwentries = 64;
+
+	memsize = hwentries * sizeof *chp->cq.queue;
+
+	/*
+	 * memsize must be a multiple of the page size if its a user cq.
+	 */
+	if (ucontext)
+		memsize = roundup(memsize, PAGE_SIZE);
+	chp->cq.size = hwentries;
+	chp->cq.memsize = memsize;
+	chp->cq.vector = vector;
+
+	ret = create_cq(&rhp->rdev, &chp->cq,
+			ucontext ? &ucontext->uctx : &rhp->rdev.uctx);
+	if (ret)
+		goto err1;
+
+	chp->rhp = rhp;
+	chp->cq.size--;				/* status page */
+	chp->ibcq.cqe = entries - 2;
+	spin_lock_init(&chp->lock);
+	spin_lock_init(&chp->comp_handler_lock);
+	atomic_set(&chp->refcnt, 1);
+	init_waitqueue_head(&chp->wait);
+	ret = insert_handle(rhp, &rhp->cqidr, chp, chp->cq.cqid);
+	if (ret)
+		goto err2;
+
+	if (ucontext) {
+		mm = kmalloc(sizeof *mm, GFP_KERNEL);
+		if (!mm)
+			goto err3;
+		mm2 = kmalloc(sizeof *mm2, GFP_KERNEL);
+		if (!mm2)
+			goto err4;
+
+		uresp.qid_mask = rhp->rdev.cqmask;
+		uresp.cqid = chp->cq.cqid;
+		uresp.size = chp->cq.size;
+		uresp.memsize = chp->cq.memsize;
+		spin_lock(&ucontext->mmap_lock);
+		uresp.key = ucontext->key;
+		ucontext->key += PAGE_SIZE;
+		uresp.gts_key = ucontext->key;
+		ucontext->key += PAGE_SIZE;
+		spin_unlock(&ucontext->mmap_lock);
+		ret = ib_copy_to_udata(udata, &uresp,
+				       sizeof(uresp) - sizeof(uresp.reserved));
+		if (ret)
+			goto err5;
+
+		mm->key = uresp.key;
+		mm->addr = virt_to_phys(chp->cq.queue);
+		mm->len = chp->cq.memsize;
+		insert_mmap(ucontext, mm);
+
+		mm2->key = uresp.gts_key;
+		mm2->addr = chp->cq.ugts;
+		mm2->len = PAGE_SIZE;
+		insert_mmap(ucontext, mm2);
+	}
+	PDBG("%s cqid 0x%0x chp %p size %u memsize %zu, dma_addr 0x%0llx\n",
+	     __func__, chp->cq.cqid, chp, chp->cq.size,
+	     chp->cq.memsize,
+	     (unsigned long long) chp->cq.dma_addr);
+	return &chp->ibcq;
+err5:
+	kfree(mm2);
+err4:
+	kfree(mm);
+err3:
+	remove_handle(rhp, &rhp->cqidr, chp->cq.cqid);
+err2:
+	destroy_cq(&chp->rhp->rdev, &chp->cq,
+		   ucontext ? &ucontext->uctx : &rhp->rdev.uctx);
+err1:
+	kfree(chp);
+	return ERR_PTR(ret);
+}
+
+int c4iw_resize_cq(struct ib_cq *cq, int cqe, struct ib_udata *udata)
+{
+	return -ENOSYS;
+}
+
+int c4iw_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags)
+{
+	struct c4iw_cq *chp;
+	int ret;
+	unsigned long flag;
+
+	chp = to_c4iw_cq(ibcq);
+	spin_lock_irqsave(&chp->lock, flag);
+	ret = t4_arm_cq(&chp->cq,
+			(flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED);
+	spin_unlock_irqrestore(&chp->lock, flag);
+	if (ret && !(flags & IB_CQ_REPORT_MISSED_EVENTS))
+		ret = 0;
+	return ret;
+}
diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c
new file mode 100644
index 0000000..72f1f05
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb4/device.c
@@ -0,0 +1,1546 @@
+/*
+ * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer in the documentation and/or other materials
+ *	  provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/debugfs.h>
+#include <linux/vmalloc.h>
+#include <linux/math64.h>
+
+#include <rdma/ib_verbs.h>
+
+#include "iw_cxgb4.h"
+
+#define DRV_VERSION "0.1"
+
+MODULE_AUTHOR("Steve Wise");
+MODULE_DESCRIPTION("Chelsio T4/T5 RDMA Driver");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_VERSION(DRV_VERSION);
+
+static int allow_db_fc_on_t5;
+module_param(allow_db_fc_on_t5, int, 0644);
+MODULE_PARM_DESC(allow_db_fc_on_t5,
+		 "Allow DB Flow Control on T5 (default = 0)");
+
+static int allow_db_coalescing_on_t5;
+module_param(allow_db_coalescing_on_t5, int, 0644);
+MODULE_PARM_DESC(allow_db_coalescing_on_t5,
+		 "Allow DB Coalescing on T5 (default = 0)");
+
+int c4iw_wr_log = 0;
+module_param(c4iw_wr_log, int, 0444);
+MODULE_PARM_DESC(c4iw_wr_log, "Enables logging of work request timing data.");
+
+static int c4iw_wr_log_size_order = 12;
+module_param(c4iw_wr_log_size_order, int, 0444);
+MODULE_PARM_DESC(c4iw_wr_log_size_order,
+		 "Number of entries (log2) in the work request timing log.");
+
+struct uld_ctx {
+	struct list_head entry;
+	struct cxgb4_lld_info lldi;
+	struct c4iw_dev *dev;
+};
+
+static LIST_HEAD(uld_ctx_list);
+static DEFINE_MUTEX(dev_mutex);
+
+#define DB_FC_RESUME_SIZE 64
+#define DB_FC_RESUME_DELAY 1
+#define DB_FC_DRAIN_THRESH 0
+
+static struct dentry *c4iw_debugfs_root;
+
+struct c4iw_debugfs_data {
+	struct c4iw_dev *devp;
+	char *buf;
+	int bufsize;
+	int pos;
+};
+
+/* registered cxgb4 netlink callbacks */
+static struct ibnl_client_cbs c4iw_nl_cb_table[] = {
+	[RDMA_NL_IWPM_REG_PID] = {.dump = iwpm_register_pid_cb},
+	[RDMA_NL_IWPM_ADD_MAPPING] = {.dump = iwpm_add_mapping_cb},
+	[RDMA_NL_IWPM_QUERY_MAPPING] = {.dump = iwpm_add_and_query_mapping_cb},
+	[RDMA_NL_IWPM_HANDLE_ERR] = {.dump = iwpm_mapping_error_cb},
+	[RDMA_NL_IWPM_MAPINFO] = {.dump = iwpm_mapping_info_cb},
+	[RDMA_NL_IWPM_MAPINFO_NUM] = {.dump = iwpm_ack_mapping_info_cb}
+};
+
+static int count_idrs(int id, void *p, void *data)
+{
+	int *countp = data;
+
+	*countp = *countp + 1;
+	return 0;
+}
+
+static ssize_t debugfs_read(struct file *file, char __user *buf, size_t count,
+			    loff_t *ppos)
+{
+	struct c4iw_debugfs_data *d = file->private_data;
+
+	return simple_read_from_buffer(buf, count, ppos, d->buf, d->pos);
+}
+
+void c4iw_log_wr_stats(struct t4_wq *wq, struct t4_cqe *cqe)
+{
+	struct wr_log_entry le;
+	int idx;
+
+	if (!wq->rdev->wr_log)
+		return;
+
+	idx = (atomic_inc_return(&wq->rdev->wr_log_idx) - 1) &
+		(wq->rdev->wr_log_size - 1);
+	le.poll_sge_ts = cxgb4_read_sge_timestamp(wq->rdev->lldi.ports[0]);
+	getnstimeofday(&le.poll_host_ts);
+	le.valid = 1;
+	le.cqe_sge_ts = CQE_TS(cqe);
+	if (SQ_TYPE(cqe)) {
+		le.qid = wq->sq.qid;
+		le.opcode = CQE_OPCODE(cqe);
+		le.post_host_ts = wq->sq.sw_sq[wq->sq.cidx].host_ts;
+		le.post_sge_ts = wq->sq.sw_sq[wq->sq.cidx].sge_ts;
+		le.wr_id = CQE_WRID_SQ_IDX(cqe);
+	} else {
+		le.qid = wq->rq.qid;
+		le.opcode = FW_RI_RECEIVE;
+		le.post_host_ts = wq->rq.sw_rq[wq->rq.cidx].host_ts;
+		le.post_sge_ts = wq->rq.sw_rq[wq->rq.cidx].sge_ts;
+		le.wr_id = CQE_WRID_MSN(cqe);
+	}
+	wq->rdev->wr_log[idx] = le;
+}
+
+static int wr_log_show(struct seq_file *seq, void *v)
+{
+	struct c4iw_dev *dev = seq->private;
+	struct timespec prev_ts = {0, 0};
+	struct wr_log_entry *lep;
+	int prev_ts_set = 0;
+	int idx, end;
+
+#define ts2ns(ts) div64_ul((ts) * dev->rdev.lldi.cclk_ps, 1000)
+
+	idx = atomic_read(&dev->rdev.wr_log_idx) &
+		(dev->rdev.wr_log_size - 1);
+	end = idx - 1;
+	if (end < 0)
+		end = dev->rdev.wr_log_size - 1;
+	lep = &dev->rdev.wr_log[idx];
+	while (idx != end) {
+		if (lep->valid) {
+			if (!prev_ts_set) {
+				prev_ts_set = 1;
+				prev_ts = lep->poll_host_ts;
+			}
+			seq_printf(seq, "%04u: sec %lu nsec %lu qid %u opcode "
+				   "%u %s 0x%x host_wr_delta sec %lu nsec %lu "
+				   "post_sge_ts 0x%llx cqe_sge_ts 0x%llx "
+				   "poll_sge_ts 0x%llx post_poll_delta_ns %llu "
+				   "cqe_poll_delta_ns %llu\n",
+				   idx,
+				   timespec_sub(lep->poll_host_ts,
+						prev_ts).tv_sec,
+				   timespec_sub(lep->poll_host_ts,
+						prev_ts).tv_nsec,
+				   lep->qid, lep->opcode,
+				   lep->opcode == FW_RI_RECEIVE ?
+							"msn" : "wrid",
+				   lep->wr_id,
+				   timespec_sub(lep->poll_host_ts,
+						lep->post_host_ts).tv_sec,
+				   timespec_sub(lep->poll_host_ts,
+						lep->post_host_ts).tv_nsec,
+				   lep->post_sge_ts, lep->cqe_sge_ts,
+				   lep->poll_sge_ts,
+				   ts2ns(lep->poll_sge_ts - lep->post_sge_ts),
+				   ts2ns(lep->poll_sge_ts - lep->cqe_sge_ts));
+			prev_ts = lep->poll_host_ts;
+		}
+		idx++;
+		if (idx > (dev->rdev.wr_log_size - 1))
+			idx = 0;
+		lep = &dev->rdev.wr_log[idx];
+	}
+#undef ts2ns
+	return 0;
+}
+
+static int wr_log_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, wr_log_show, inode->i_private);
+}
+
+static ssize_t wr_log_clear(struct file *file, const char __user *buf,
+			    size_t count, loff_t *pos)
+{
+	struct c4iw_dev *dev = ((struct seq_file *)file->private_data)->private;
+	int i;
+
+	if (dev->rdev.wr_log)
+		for (i = 0; i < dev->rdev.wr_log_size; i++)
+			dev->rdev.wr_log[i].valid = 0;
+	return count;
+}
+
+static const struct file_operations wr_log_debugfs_fops = {
+	.owner   = THIS_MODULE,
+	.open    = wr_log_open,
+	.release = single_release,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.write   = wr_log_clear,
+};
+
+static int dump_qp(int id, void *p, void *data)
+{
+	struct c4iw_qp *qp = p;
+	struct c4iw_debugfs_data *qpd = data;
+	int space;
+	int cc;
+
+	if (id != qp->wq.sq.qid)
+		return 0;
+
+	space = qpd->bufsize - qpd->pos - 1;
+	if (space == 0)
+		return 1;
+
+	if (qp->ep) {
+		if (qp->ep->com.local_addr.ss_family == AF_INET) {
+			struct sockaddr_in *lsin = (struct sockaddr_in *)
+				&qp->ep->com.local_addr;
+			struct sockaddr_in *rsin = (struct sockaddr_in *)
+				&qp->ep->com.remote_addr;
+			struct sockaddr_in *mapped_lsin = (struct sockaddr_in *)
+				&qp->ep->com.mapped_local_addr;
+			struct sockaddr_in *mapped_rsin = (struct sockaddr_in *)
+				&qp->ep->com.mapped_remote_addr;
+
+			cc = snprintf(qpd->buf + qpd->pos, space,
+				      "rc qp sq id %u rq id %u state %u "
+				      "onchip %u ep tid %u state %u "
+				      "%pI4:%u/%u->%pI4:%u/%u\n",
+				      qp->wq.sq.qid, qp->wq.rq.qid,
+				      (int)qp->attr.state,
+				      qp->wq.sq.flags & T4_SQ_ONCHIP,
+				      qp->ep->hwtid, (int)qp->ep->com.state,
+				      &lsin->sin_addr, ntohs(lsin->sin_port),
+				      ntohs(mapped_lsin->sin_port),
+				      &rsin->sin_addr, ntohs(rsin->sin_port),
+				      ntohs(mapped_rsin->sin_port));
+		} else {
+			struct sockaddr_in6 *lsin6 = (struct sockaddr_in6 *)
+				&qp->ep->com.local_addr;
+			struct sockaddr_in6 *rsin6 = (struct sockaddr_in6 *)
+				&qp->ep->com.remote_addr;
+			struct sockaddr_in6 *mapped_lsin6 =
+				(struct sockaddr_in6 *)
+				&qp->ep->com.mapped_local_addr;
+			struct sockaddr_in6 *mapped_rsin6 =
+				(struct sockaddr_in6 *)
+				&qp->ep->com.mapped_remote_addr;
+
+			cc = snprintf(qpd->buf + qpd->pos, space,
+				      "rc qp sq id %u rq id %u state %u "
+				      "onchip %u ep tid %u state %u "
+				      "%pI6:%u/%u->%pI6:%u/%u\n",
+				      qp->wq.sq.qid, qp->wq.rq.qid,
+				      (int)qp->attr.state,
+				      qp->wq.sq.flags & T4_SQ_ONCHIP,
+				      qp->ep->hwtid, (int)qp->ep->com.state,
+				      &lsin6->sin6_addr,
+				      ntohs(lsin6->sin6_port),
+				      ntohs(mapped_lsin6->sin6_port),
+				      &rsin6->sin6_addr,
+				      ntohs(rsin6->sin6_port),
+				      ntohs(mapped_rsin6->sin6_port));
+		}
+	} else
+		cc = snprintf(qpd->buf + qpd->pos, space,
+			     "qp sq id %u rq id %u state %u onchip %u\n",
+			      qp->wq.sq.qid, qp->wq.rq.qid,
+			      (int)qp->attr.state,
+			      qp->wq.sq.flags & T4_SQ_ONCHIP);
+	if (cc < space)
+		qpd->pos += cc;
+	return 0;
+}
+
+static int qp_release(struct inode *inode, struct file *file)
+{
+	struct c4iw_debugfs_data *qpd = file->private_data;
+	if (!qpd) {
+		printk(KERN_INFO "%s null qpd?\n", __func__);
+		return 0;
+	}
+	vfree(qpd->buf);
+	kfree(qpd);
+	return 0;
+}
+
+static int qp_open(struct inode *inode, struct file *file)
+{
+	struct c4iw_debugfs_data *qpd;
+	int ret = 0;
+	int count = 1;
+
+	qpd = kmalloc(sizeof *qpd, GFP_KERNEL);
+	if (!qpd) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	qpd->devp = inode->i_private;
+	qpd->pos = 0;
+
+	spin_lock_irq(&qpd->devp->lock);
+	idr_for_each(&qpd->devp->qpidr, count_idrs, &count);
+	spin_unlock_irq(&qpd->devp->lock);
+
+	qpd->bufsize = count * 128;
+	qpd->buf = vmalloc(qpd->bufsize);
+	if (!qpd->buf) {
+		ret = -ENOMEM;
+		goto err1;
+	}
+
+	spin_lock_irq(&qpd->devp->lock);
+	idr_for_each(&qpd->devp->qpidr, dump_qp, qpd);
+	spin_unlock_irq(&qpd->devp->lock);
+
+	qpd->buf[qpd->pos++] = 0;
+	file->private_data = qpd;
+	goto out;
+err1:
+	kfree(qpd);
+out:
+	return ret;
+}
+
+static const struct file_operations qp_debugfs_fops = {
+	.owner   = THIS_MODULE,
+	.open    = qp_open,
+	.release = qp_release,
+	.read    = debugfs_read,
+	.llseek  = default_llseek,
+};
+
+static int dump_stag(int id, void *p, void *data)
+{
+	struct c4iw_debugfs_data *stagd = data;
+	int space;
+	int cc;
+	struct fw_ri_tpte tpte;
+	int ret;
+
+	space = stagd->bufsize - stagd->pos - 1;
+	if (space == 0)
+		return 1;
+
+	ret = cxgb4_read_tpte(stagd->devp->rdev.lldi.ports[0], (u32)id<<8,
+			      (__be32 *)&tpte);
+	if (ret) {
+		dev_err(&stagd->devp->rdev.lldi.pdev->dev,
+			"%s cxgb4_read_tpte err %d\n", __func__, ret);
+		return ret;
+	}
+	cc = snprintf(stagd->buf + stagd->pos, space,
+		      "stag: idx 0x%x valid %d key 0x%x state %d pdid %d "
+		      "perm 0x%x ps %d len 0x%llx va 0x%llx\n",
+		      (u32)id<<8,
+		      G_FW_RI_TPTE_VALID(ntohl(tpte.valid_to_pdid)),
+		      G_FW_RI_TPTE_STAGKEY(ntohl(tpte.valid_to_pdid)),
+		      G_FW_RI_TPTE_STAGSTATE(ntohl(tpte.valid_to_pdid)),
+		      G_FW_RI_TPTE_PDID(ntohl(tpte.valid_to_pdid)),
+		      G_FW_RI_TPTE_PERM(ntohl(tpte.locread_to_qpid)),
+		      G_FW_RI_TPTE_PS(ntohl(tpte.locread_to_qpid)),
+		      ((u64)ntohl(tpte.len_hi) << 32) | ntohl(tpte.len_lo),
+		      ((u64)ntohl(tpte.va_hi) << 32) | ntohl(tpte.va_lo_fbo));
+	if (cc < space)
+		stagd->pos += cc;
+	return 0;
+}
+
+static int stag_release(struct inode *inode, struct file *file)
+{
+	struct c4iw_debugfs_data *stagd = file->private_data;
+	if (!stagd) {
+		printk(KERN_INFO "%s null stagd?\n", __func__);
+		return 0;
+	}
+	vfree(stagd->buf);
+	kfree(stagd);
+	return 0;
+}
+
+static int stag_open(struct inode *inode, struct file *file)
+{
+	struct c4iw_debugfs_data *stagd;
+	int ret = 0;
+	int count = 1;
+
+	stagd = kmalloc(sizeof *stagd, GFP_KERNEL);
+	if (!stagd) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	stagd->devp = inode->i_private;
+	stagd->pos = 0;
+
+	spin_lock_irq(&stagd->devp->lock);
+	idr_for_each(&stagd->devp->mmidr, count_idrs, &count);
+	spin_unlock_irq(&stagd->devp->lock);
+
+	stagd->bufsize = count * 256;
+	stagd->buf = vmalloc(stagd->bufsize);
+	if (!stagd->buf) {
+		ret = -ENOMEM;
+		goto err1;
+	}
+
+	spin_lock_irq(&stagd->devp->lock);
+	idr_for_each(&stagd->devp->mmidr, dump_stag, stagd);
+	spin_unlock_irq(&stagd->devp->lock);
+
+	stagd->buf[stagd->pos++] = 0;
+	file->private_data = stagd;
+	goto out;
+err1:
+	kfree(stagd);
+out:
+	return ret;
+}
+
+static const struct file_operations stag_debugfs_fops = {
+	.owner   = THIS_MODULE,
+	.open    = stag_open,
+	.release = stag_release,
+	.read    = debugfs_read,
+	.llseek  = default_llseek,
+};
+
+static char *db_state_str[] = {"NORMAL", "FLOW_CONTROL", "RECOVERY", "STOPPED"};
+
+static int stats_show(struct seq_file *seq, void *v)
+{
+	struct c4iw_dev *dev = seq->private;
+
+	seq_printf(seq, "   Object: %10s %10s %10s %10s\n", "Total", "Current",
+		   "Max", "Fail");
+	seq_printf(seq, "     PDID: %10llu %10llu %10llu %10llu\n",
+			dev->rdev.stats.pd.total, dev->rdev.stats.pd.cur,
+			dev->rdev.stats.pd.max, dev->rdev.stats.pd.fail);
+	seq_printf(seq, "      QID: %10llu %10llu %10llu %10llu\n",
+			dev->rdev.stats.qid.total, dev->rdev.stats.qid.cur,
+			dev->rdev.stats.qid.max, dev->rdev.stats.qid.fail);
+	seq_printf(seq, "   TPTMEM: %10llu %10llu %10llu %10llu\n",
+			dev->rdev.stats.stag.total, dev->rdev.stats.stag.cur,
+			dev->rdev.stats.stag.max, dev->rdev.stats.stag.fail);
+	seq_printf(seq, "   PBLMEM: %10llu %10llu %10llu %10llu\n",
+			dev->rdev.stats.pbl.total, dev->rdev.stats.pbl.cur,
+			dev->rdev.stats.pbl.max, dev->rdev.stats.pbl.fail);
+	seq_printf(seq, "   RQTMEM: %10llu %10llu %10llu %10llu\n",
+			dev->rdev.stats.rqt.total, dev->rdev.stats.rqt.cur,
+			dev->rdev.stats.rqt.max, dev->rdev.stats.rqt.fail);
+	seq_printf(seq, "  OCQPMEM: %10llu %10llu %10llu %10llu\n",
+			dev->rdev.stats.ocqp.total, dev->rdev.stats.ocqp.cur,
+			dev->rdev.stats.ocqp.max, dev->rdev.stats.ocqp.fail);
+	seq_printf(seq, "  DB FULL: %10llu\n", dev->rdev.stats.db_full);
+	seq_printf(seq, " DB EMPTY: %10llu\n", dev->rdev.stats.db_empty);
+	seq_printf(seq, "  DB DROP: %10llu\n", dev->rdev.stats.db_drop);
+	seq_printf(seq, " DB State: %s Transitions %llu FC Interruptions %llu\n",
+		   db_state_str[dev->db_state],
+		   dev->rdev.stats.db_state_transitions,
+		   dev->rdev.stats.db_fc_interruptions);
+	seq_printf(seq, "TCAM_FULL: %10llu\n", dev->rdev.stats.tcam_full);
+	seq_printf(seq, "ACT_OFLD_CONN_FAILS: %10llu\n",
+		   dev->rdev.stats.act_ofld_conn_fails);
+	seq_printf(seq, "PAS_OFLD_CONN_FAILS: %10llu\n",
+		   dev->rdev.stats.pas_ofld_conn_fails);
+	seq_printf(seq, "AVAILABLE IRD: %10u\n", dev->avail_ird);
+	return 0;
+}
+
+static int stats_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, stats_show, inode->i_private);
+}
+
+static ssize_t stats_clear(struct file *file, const char __user *buf,
+		size_t count, loff_t *pos)
+{
+	struct c4iw_dev *dev = ((struct seq_file *)file->private_data)->private;
+
+	mutex_lock(&dev->rdev.stats.lock);
+	dev->rdev.stats.pd.max = 0;
+	dev->rdev.stats.pd.fail = 0;
+	dev->rdev.stats.qid.max = 0;
+	dev->rdev.stats.qid.fail = 0;
+	dev->rdev.stats.stag.max = 0;
+	dev->rdev.stats.stag.fail = 0;
+	dev->rdev.stats.pbl.max = 0;
+	dev->rdev.stats.pbl.fail = 0;
+	dev->rdev.stats.rqt.max = 0;
+	dev->rdev.stats.rqt.fail = 0;
+	dev->rdev.stats.ocqp.max = 0;
+	dev->rdev.stats.ocqp.fail = 0;
+	dev->rdev.stats.db_full = 0;
+	dev->rdev.stats.db_empty = 0;
+	dev->rdev.stats.db_drop = 0;
+	dev->rdev.stats.db_state_transitions = 0;
+	dev->rdev.stats.tcam_full = 0;
+	dev->rdev.stats.act_ofld_conn_fails = 0;
+	dev->rdev.stats.pas_ofld_conn_fails = 0;
+	mutex_unlock(&dev->rdev.stats.lock);
+	return count;
+}
+
+static const struct file_operations stats_debugfs_fops = {
+	.owner   = THIS_MODULE,
+	.open    = stats_open,
+	.release = single_release,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.write   = stats_clear,
+};
+
+static int dump_ep(int id, void *p, void *data)
+{
+	struct c4iw_ep *ep = p;
+	struct c4iw_debugfs_data *epd = data;
+	int space;
+	int cc;
+
+	space = epd->bufsize - epd->pos - 1;
+	if (space == 0)
+		return 1;
+
+	if (ep->com.local_addr.ss_family == AF_INET) {
+		struct sockaddr_in *lsin = (struct sockaddr_in *)
+			&ep->com.local_addr;
+		struct sockaddr_in *rsin = (struct sockaddr_in *)
+			&ep->com.remote_addr;
+		struct sockaddr_in *mapped_lsin = (struct sockaddr_in *)
+			&ep->com.mapped_local_addr;
+		struct sockaddr_in *mapped_rsin = (struct sockaddr_in *)
+			&ep->com.mapped_remote_addr;
+
+		cc = snprintf(epd->buf + epd->pos, space,
+			      "ep %p cm_id %p qp %p state %d flags 0x%lx "
+			      "history 0x%lx hwtid %d atid %d "
+			      "%pI4:%d/%d <-> %pI4:%d/%d\n",
+			      ep, ep->com.cm_id, ep->com.qp,
+			      (int)ep->com.state, ep->com.flags,
+			      ep->com.history, ep->hwtid, ep->atid,
+			      &lsin->sin_addr, ntohs(lsin->sin_port),
+			      ntohs(mapped_lsin->sin_port),
+			      &rsin->sin_addr, ntohs(rsin->sin_port),
+			      ntohs(mapped_rsin->sin_port));
+	} else {
+		struct sockaddr_in6 *lsin6 = (struct sockaddr_in6 *)
+			&ep->com.local_addr;
+		struct sockaddr_in6 *rsin6 = (struct sockaddr_in6 *)
+			&ep->com.remote_addr;
+		struct sockaddr_in6 *mapped_lsin6 = (struct sockaddr_in6 *)
+			&ep->com.mapped_local_addr;
+		struct sockaddr_in6 *mapped_rsin6 = (struct sockaddr_in6 *)
+			&ep->com.mapped_remote_addr;
+
+		cc = snprintf(epd->buf + epd->pos, space,
+			      "ep %p cm_id %p qp %p state %d flags 0x%lx "
+			      "history 0x%lx hwtid %d atid %d "
+			      "%pI6:%d/%d <-> %pI6:%d/%d\n",
+			      ep, ep->com.cm_id, ep->com.qp,
+			      (int)ep->com.state, ep->com.flags,
+			      ep->com.history, ep->hwtid, ep->atid,
+			      &lsin6->sin6_addr, ntohs(lsin6->sin6_port),
+			      ntohs(mapped_lsin6->sin6_port),
+			      &rsin6->sin6_addr, ntohs(rsin6->sin6_port),
+			      ntohs(mapped_rsin6->sin6_port));
+	}
+	if (cc < space)
+		epd->pos += cc;
+	return 0;
+}
+
+static int dump_listen_ep(int id, void *p, void *data)
+{
+	struct c4iw_listen_ep *ep = p;
+	struct c4iw_debugfs_data *epd = data;
+	int space;
+	int cc;
+
+	space = epd->bufsize - epd->pos - 1;
+	if (space == 0)
+		return 1;
+
+	if (ep->com.local_addr.ss_family == AF_INET) {
+		struct sockaddr_in *lsin = (struct sockaddr_in *)
+			&ep->com.local_addr;
+		struct sockaddr_in *mapped_lsin = (struct sockaddr_in *)
+			&ep->com.mapped_local_addr;
+
+		cc = snprintf(epd->buf + epd->pos, space,
+			      "ep %p cm_id %p state %d flags 0x%lx stid %d "
+			      "backlog %d %pI4:%d/%d\n",
+			      ep, ep->com.cm_id, (int)ep->com.state,
+			      ep->com.flags, ep->stid, ep->backlog,
+			      &lsin->sin_addr, ntohs(lsin->sin_port),
+			      ntohs(mapped_lsin->sin_port));
+	} else {
+		struct sockaddr_in6 *lsin6 = (struct sockaddr_in6 *)
+			&ep->com.local_addr;
+		struct sockaddr_in6 *mapped_lsin6 = (struct sockaddr_in6 *)
+			&ep->com.mapped_local_addr;
+
+		cc = snprintf(epd->buf + epd->pos, space,
+			      "ep %p cm_id %p state %d flags 0x%lx stid %d "
+			      "backlog %d %pI6:%d/%d\n",
+			      ep, ep->com.cm_id, (int)ep->com.state,
+			      ep->com.flags, ep->stid, ep->backlog,
+			      &lsin6->sin6_addr, ntohs(lsin6->sin6_port),
+			      ntohs(mapped_lsin6->sin6_port));
+	}
+	if (cc < space)
+		epd->pos += cc;
+	return 0;
+}
+
+static int ep_release(struct inode *inode, struct file *file)
+{
+	struct c4iw_debugfs_data *epd = file->private_data;
+	if (!epd) {
+		pr_info("%s null qpd?\n", __func__);
+		return 0;
+	}
+	vfree(epd->buf);
+	kfree(epd);
+	return 0;
+}
+
+static int ep_open(struct inode *inode, struct file *file)
+{
+	struct c4iw_debugfs_data *epd;
+	int ret = 0;
+	int count = 1;
+
+	epd = kmalloc(sizeof(*epd), GFP_KERNEL);
+	if (!epd) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	epd->devp = inode->i_private;
+	epd->pos = 0;
+
+	spin_lock_irq(&epd->devp->lock);
+	idr_for_each(&epd->devp->hwtid_idr, count_idrs, &count);
+	idr_for_each(&epd->devp->atid_idr, count_idrs, &count);
+	idr_for_each(&epd->devp->stid_idr, count_idrs, &count);
+	spin_unlock_irq(&epd->devp->lock);
+
+	epd->bufsize = count * 160;
+	epd->buf = vmalloc(epd->bufsize);
+	if (!epd->buf) {
+		ret = -ENOMEM;
+		goto err1;
+	}
+
+	spin_lock_irq(&epd->devp->lock);
+	idr_for_each(&epd->devp->hwtid_idr, dump_ep, epd);
+	idr_for_each(&epd->devp->atid_idr, dump_ep, epd);
+	idr_for_each(&epd->devp->stid_idr, dump_listen_ep, epd);
+	spin_unlock_irq(&epd->devp->lock);
+
+	file->private_data = epd;
+	goto out;
+err1:
+	kfree(epd);
+out:
+	return ret;
+}
+
+static const struct file_operations ep_debugfs_fops = {
+	.owner   = THIS_MODULE,
+	.open    = ep_open,
+	.release = ep_release,
+	.read    = debugfs_read,
+};
+
+static int setup_debugfs(struct c4iw_dev *devp)
+{
+	struct dentry *de;
+
+	if (!devp->debugfs_root)
+		return -1;
+
+	de = debugfs_create_file("qps", S_IWUSR, devp->debugfs_root,
+				 (void *)devp, &qp_debugfs_fops);
+	if (de && de->d_inode)
+		de->d_inode->i_size = 4096;
+
+	de = debugfs_create_file("stags", S_IWUSR, devp->debugfs_root,
+				 (void *)devp, &stag_debugfs_fops);
+	if (de && de->d_inode)
+		de->d_inode->i_size = 4096;
+
+	de = debugfs_create_file("stats", S_IWUSR, devp->debugfs_root,
+			(void *)devp, &stats_debugfs_fops);
+	if (de && de->d_inode)
+		de->d_inode->i_size = 4096;
+
+	de = debugfs_create_file("eps", S_IWUSR, devp->debugfs_root,
+			(void *)devp, &ep_debugfs_fops);
+	if (de && de->d_inode)
+		de->d_inode->i_size = 4096;
+
+	if (c4iw_wr_log) {
+		de = debugfs_create_file("wr_log", S_IWUSR, devp->debugfs_root,
+					 (void *)devp, &wr_log_debugfs_fops);
+		if (de && de->d_inode)
+			de->d_inode->i_size = 4096;
+	}
+	return 0;
+}
+
+void c4iw_release_dev_ucontext(struct c4iw_rdev *rdev,
+			       struct c4iw_dev_ucontext *uctx)
+{
+	struct list_head *pos, *nxt;
+	struct c4iw_qid_list *entry;
+
+	mutex_lock(&uctx->lock);
+	list_for_each_safe(pos, nxt, &uctx->qpids) {
+		entry = list_entry(pos, struct c4iw_qid_list, entry);
+		list_del_init(&entry->entry);
+		if (!(entry->qid & rdev->qpmask)) {
+			c4iw_put_resource(&rdev->resource.qid_table,
+					  entry->qid);
+			mutex_lock(&rdev->stats.lock);
+			rdev->stats.qid.cur -= rdev->qpmask + 1;
+			mutex_unlock(&rdev->stats.lock);
+		}
+		kfree(entry);
+	}
+
+	list_for_each_safe(pos, nxt, &uctx->qpids) {
+		entry = list_entry(pos, struct c4iw_qid_list, entry);
+		list_del_init(&entry->entry);
+		kfree(entry);
+	}
+	mutex_unlock(&uctx->lock);
+}
+
+void c4iw_init_dev_ucontext(struct c4iw_rdev *rdev,
+			    struct c4iw_dev_ucontext *uctx)
+{
+	INIT_LIST_HEAD(&uctx->qpids);
+	INIT_LIST_HEAD(&uctx->cqids);
+	mutex_init(&uctx->lock);
+}
+
+/* Caller takes care of locking if needed */
+static int c4iw_rdev_open(struct c4iw_rdev *rdev)
+{
+	int err;
+
+	c4iw_init_dev_ucontext(rdev, &rdev->uctx);
+
+	/*
+	 * qpshift is the number of bits to shift the qpid left in order
+	 * to get the correct address of the doorbell for that qp.
+	 */
+	rdev->qpshift = PAGE_SHIFT - ilog2(rdev->lldi.udb_density);
+	rdev->qpmask = rdev->lldi.udb_density - 1;
+	rdev->cqshift = PAGE_SHIFT - ilog2(rdev->lldi.ucq_density);
+	rdev->cqmask = rdev->lldi.ucq_density - 1;
+	PDBG("%s dev %s stag start 0x%0x size 0x%0x num stags %d "
+	     "pbl start 0x%0x size 0x%0x rq start 0x%0x size 0x%0x "
+	     "qp qid start %u size %u cq qid start %u size %u\n",
+	     __func__, pci_name(rdev->lldi.pdev), rdev->lldi.vr->stag.start,
+	     rdev->lldi.vr->stag.size, c4iw_num_stags(rdev),
+	     rdev->lldi.vr->pbl.start,
+	     rdev->lldi.vr->pbl.size, rdev->lldi.vr->rq.start,
+	     rdev->lldi.vr->rq.size,
+	     rdev->lldi.vr->qp.start,
+	     rdev->lldi.vr->qp.size,
+	     rdev->lldi.vr->cq.start,
+	     rdev->lldi.vr->cq.size);
+	PDBG("udb len 0x%x udb base %llx db_reg %p gts_reg %p qpshift %lu "
+	     "qpmask 0x%x cqshift %lu cqmask 0x%x\n",
+	     (unsigned)pci_resource_len(rdev->lldi.pdev, 2),
+	     (u64)pci_resource_start(rdev->lldi.pdev, 2),
+	     rdev->lldi.db_reg,
+	     rdev->lldi.gts_reg,
+	     rdev->qpshift, rdev->qpmask,
+	     rdev->cqshift, rdev->cqmask);
+
+	if (c4iw_num_stags(rdev) == 0) {
+		err = -EINVAL;
+		goto err1;
+	}
+
+	rdev->stats.pd.total = T4_MAX_NUM_PD;
+	rdev->stats.stag.total = rdev->lldi.vr->stag.size;
+	rdev->stats.pbl.total = rdev->lldi.vr->pbl.size;
+	rdev->stats.rqt.total = rdev->lldi.vr->rq.size;
+	rdev->stats.ocqp.total = rdev->lldi.vr->ocq.size;
+	rdev->stats.qid.total = rdev->lldi.vr->qp.size;
+
+	err = c4iw_init_resource(rdev, c4iw_num_stags(rdev), T4_MAX_NUM_PD);
+	if (err) {
+		printk(KERN_ERR MOD "error %d initializing resources\n", err);
+		goto err1;
+	}
+	err = c4iw_pblpool_create(rdev);
+	if (err) {
+		printk(KERN_ERR MOD "error %d initializing pbl pool\n", err);
+		goto err2;
+	}
+	err = c4iw_rqtpool_create(rdev);
+	if (err) {
+		printk(KERN_ERR MOD "error %d initializing rqt pool\n", err);
+		goto err3;
+	}
+	err = c4iw_ocqp_pool_create(rdev);
+	if (err) {
+		printk(KERN_ERR MOD "error %d initializing ocqp pool\n", err);
+		goto err4;
+	}
+	rdev->status_page = (struct t4_dev_status_page *)
+			    __get_free_page(GFP_KERNEL);
+	if (!rdev->status_page) {
+		pr_err(MOD "error allocating status page\n");
+		goto err4;
+	}
+
+	if (c4iw_wr_log) {
+		rdev->wr_log = kzalloc((1 << c4iw_wr_log_size_order) *
+				       sizeof(*rdev->wr_log), GFP_KERNEL);
+		if (rdev->wr_log) {
+			rdev->wr_log_size = 1 << c4iw_wr_log_size_order;
+			atomic_set(&rdev->wr_log_idx, 0);
+		} else {
+			pr_err(MOD "error allocating wr_log. Logging disabled\n");
+		}
+	}
+
+	rdev->status_page->db_off = 0;
+
+	return 0;
+err4:
+	c4iw_rqtpool_destroy(rdev);
+err3:
+	c4iw_pblpool_destroy(rdev);
+err2:
+	c4iw_destroy_resource(&rdev->resource);
+err1:
+	return err;
+}
+
+static void c4iw_rdev_close(struct c4iw_rdev *rdev)
+{
+	kfree(rdev->wr_log);
+	free_page((unsigned long)rdev->status_page);
+	c4iw_pblpool_destroy(rdev);
+	c4iw_rqtpool_destroy(rdev);
+	c4iw_destroy_resource(&rdev->resource);
+}
+
+static void c4iw_dealloc(struct uld_ctx *ctx)
+{
+	c4iw_rdev_close(&ctx->dev->rdev);
+	idr_destroy(&ctx->dev->cqidr);
+	idr_destroy(&ctx->dev->qpidr);
+	idr_destroy(&ctx->dev->mmidr);
+	idr_destroy(&ctx->dev->hwtid_idr);
+	idr_destroy(&ctx->dev->stid_idr);
+	idr_destroy(&ctx->dev->atid_idr);
+	if (ctx->dev->rdev.bar2_kva)
+		iounmap(ctx->dev->rdev.bar2_kva);
+	if (ctx->dev->rdev.oc_mw_kva)
+		iounmap(ctx->dev->rdev.oc_mw_kva);
+	ib_dealloc_device(&ctx->dev->ibdev);
+	ctx->dev = NULL;
+}
+
+static void c4iw_remove(struct uld_ctx *ctx)
+{
+	PDBG("%s c4iw_dev %p\n", __func__,  ctx->dev);
+	c4iw_unregister_device(ctx->dev);
+	c4iw_dealloc(ctx);
+}
+
+static int rdma_supported(const struct cxgb4_lld_info *infop)
+{
+	return infop->vr->stag.size > 0 && infop->vr->pbl.size > 0 &&
+	       infop->vr->rq.size > 0 && infop->vr->qp.size > 0 &&
+	       infop->vr->cq.size > 0;
+}
+
+static struct c4iw_dev *c4iw_alloc(const struct cxgb4_lld_info *infop)
+{
+	struct c4iw_dev *devp;
+	int ret;
+
+	if (!rdma_supported(infop)) {
+		printk(KERN_INFO MOD "%s: RDMA not supported on this device.\n",
+		       pci_name(infop->pdev));
+		return ERR_PTR(-ENOSYS);
+	}
+	if (!ocqp_supported(infop))
+		pr_info("%s: On-Chip Queues not supported on this device.\n",
+			pci_name(infop->pdev));
+
+	devp = (struct c4iw_dev *)ib_alloc_device(sizeof(*devp));
+	if (!devp) {
+		printk(KERN_ERR MOD "Cannot allocate ib device\n");
+		return ERR_PTR(-ENOMEM);
+	}
+	devp->rdev.lldi = *infop;
+
+	/* init various hw-queue params based on lld info */
+	PDBG("%s: Ing. padding boundary is %d, egrsstatuspagesize = %d\n",
+	     __func__, devp->rdev.lldi.sge_ingpadboundary,
+	     devp->rdev.lldi.sge_egrstatuspagesize);
+
+	devp->rdev.hw_queue.t4_eq_status_entries =
+		devp->rdev.lldi.sge_ingpadboundary > 64 ? 2 : 1;
+	devp->rdev.hw_queue.t4_max_eq_size = 65520;
+	devp->rdev.hw_queue.t4_max_iq_size = 65520;
+	devp->rdev.hw_queue.t4_max_rq_size = 8192 -
+		devp->rdev.hw_queue.t4_eq_status_entries - 1;
+	devp->rdev.hw_queue.t4_max_sq_size =
+		devp->rdev.hw_queue.t4_max_eq_size -
+		devp->rdev.hw_queue.t4_eq_status_entries - 1;
+	devp->rdev.hw_queue.t4_max_qp_depth =
+		devp->rdev.hw_queue.t4_max_rq_size;
+	devp->rdev.hw_queue.t4_max_cq_depth =
+		devp->rdev.hw_queue.t4_max_iq_size - 2;
+	devp->rdev.hw_queue.t4_stat_len =
+		devp->rdev.lldi.sge_egrstatuspagesize;
+
+	/*
+	 * For T5 devices, we map all of BAR2 with WC.
+	 * For T4 devices with onchip qp mem, we map only that part
+	 * of BAR2 with WC.
+	 */
+	devp->rdev.bar2_pa = pci_resource_start(devp->rdev.lldi.pdev, 2);
+	if (is_t5(devp->rdev.lldi.adapter_type)) {
+		devp->rdev.bar2_kva = ioremap_wc(devp->rdev.bar2_pa,
+			pci_resource_len(devp->rdev.lldi.pdev, 2));
+		if (!devp->rdev.bar2_kva) {
+			pr_err(MOD "Unable to ioremap BAR2\n");
+			ib_dealloc_device(&devp->ibdev);
+			return ERR_PTR(-EINVAL);
+		}
+	} else if (ocqp_supported(infop)) {
+		devp->rdev.oc_mw_pa =
+			pci_resource_start(devp->rdev.lldi.pdev, 2) +
+			pci_resource_len(devp->rdev.lldi.pdev, 2) -
+			roundup_pow_of_two(devp->rdev.lldi.vr->ocq.size);
+		devp->rdev.oc_mw_kva = ioremap_wc(devp->rdev.oc_mw_pa,
+			devp->rdev.lldi.vr->ocq.size);
+		if (!devp->rdev.oc_mw_kva) {
+			pr_err(MOD "Unable to ioremap onchip mem\n");
+			ib_dealloc_device(&devp->ibdev);
+			return ERR_PTR(-EINVAL);
+		}
+	}
+
+	PDBG(KERN_INFO MOD "ocq memory: "
+	       "hw_start 0x%x size %u mw_pa 0x%lx mw_kva %p\n",
+	       devp->rdev.lldi.vr->ocq.start, devp->rdev.lldi.vr->ocq.size,
+	       devp->rdev.oc_mw_pa, devp->rdev.oc_mw_kva);
+
+	ret = c4iw_rdev_open(&devp->rdev);
+	if (ret) {
+		printk(KERN_ERR MOD "Unable to open CXIO rdev err %d\n", ret);
+		ib_dealloc_device(&devp->ibdev);
+		return ERR_PTR(ret);
+	}
+
+	idr_init(&devp->cqidr);
+	idr_init(&devp->qpidr);
+	idr_init(&devp->mmidr);
+	idr_init(&devp->hwtid_idr);
+	idr_init(&devp->stid_idr);
+	idr_init(&devp->atid_idr);
+	spin_lock_init(&devp->lock);
+	mutex_init(&devp->rdev.stats.lock);
+	mutex_init(&devp->db_mutex);
+	INIT_LIST_HEAD(&devp->db_fc_list);
+	devp->avail_ird = devp->rdev.lldi.max_ird_adapter;
+
+	if (c4iw_debugfs_root) {
+		devp->debugfs_root = debugfs_create_dir(
+					pci_name(devp->rdev.lldi.pdev),
+					c4iw_debugfs_root);
+		setup_debugfs(devp);
+	}
+
+
+	return devp;
+}
+
+static void *c4iw_uld_add(const struct cxgb4_lld_info *infop)
+{
+	struct uld_ctx *ctx;
+	static int vers_printed;
+	int i;
+
+	if (!vers_printed++)
+		pr_info("Chelsio T4/T5 RDMA Driver - version %s\n",
+			DRV_VERSION);
+
+	ctx = kzalloc(sizeof *ctx, GFP_KERNEL);
+	if (!ctx) {
+		ctx = ERR_PTR(-ENOMEM);
+		goto out;
+	}
+	ctx->lldi = *infop;
+
+	PDBG("%s found device %s nchan %u nrxq %u ntxq %u nports %u\n",
+	     __func__, pci_name(ctx->lldi.pdev),
+	     ctx->lldi.nchan, ctx->lldi.nrxq,
+	     ctx->lldi.ntxq, ctx->lldi.nports);
+
+	mutex_lock(&dev_mutex);
+	list_add_tail(&ctx->entry, &uld_ctx_list);
+	mutex_unlock(&dev_mutex);
+
+	for (i = 0; i < ctx->lldi.nrxq; i++)
+		PDBG("rxqid[%u] %u\n", i, ctx->lldi.rxq_ids[i]);
+out:
+	return ctx;
+}
+
+static inline struct sk_buff *copy_gl_to_skb_pkt(const struct pkt_gl *gl,
+						 const __be64 *rsp,
+						 u32 pktshift)
+{
+	struct sk_buff *skb;
+
+	/*
+	 * Allocate space for cpl_pass_accept_req which will be synthesized by
+	 * driver. Once the driver synthesizes the request the skb will go
+	 * through the regular cpl_pass_accept_req processing.
+	 * The math here assumes sizeof cpl_pass_accept_req >= sizeof
+	 * cpl_rx_pkt.
+	 */
+	skb = alloc_skb(gl->tot_len + sizeof(struct cpl_pass_accept_req) +
+			sizeof(struct rss_header) - pktshift, GFP_ATOMIC);
+	if (unlikely(!skb))
+		return NULL;
+
+	 __skb_put(skb, gl->tot_len + sizeof(struct cpl_pass_accept_req) +
+		   sizeof(struct rss_header) - pktshift);
+
+	/*
+	 * This skb will contain:
+	 *   rss_header from the rspq descriptor (1 flit)
+	 *   cpl_rx_pkt struct from the rspq descriptor (2 flits)
+	 *   space for the difference between the size of an
+	 *      rx_pkt and pass_accept_req cpl (1 flit)
+	 *   the packet data from the gl
+	 */
+	skb_copy_to_linear_data(skb, rsp, sizeof(struct cpl_pass_accept_req) +
+				sizeof(struct rss_header));
+	skb_copy_to_linear_data_offset(skb, sizeof(struct rss_header) +
+				       sizeof(struct cpl_pass_accept_req),
+				       gl->va + pktshift,
+				       gl->tot_len - pktshift);
+	return skb;
+}
+
+static inline int recv_rx_pkt(struct c4iw_dev *dev, const struct pkt_gl *gl,
+			   const __be64 *rsp)
+{
+	unsigned int opcode = *(u8 *)rsp;
+	struct sk_buff *skb;
+
+	if (opcode != CPL_RX_PKT)
+		goto out;
+
+	skb = copy_gl_to_skb_pkt(gl , rsp, dev->rdev.lldi.sge_pktshift);
+	if (skb == NULL)
+		goto out;
+
+	if (c4iw_handlers[opcode] == NULL) {
+		pr_info("%s no handler opcode 0x%x...\n", __func__,
+		       opcode);
+		kfree_skb(skb);
+		goto out;
+	}
+	c4iw_handlers[opcode](dev, skb);
+	return 1;
+out:
+	return 0;
+}
+
+static int c4iw_uld_rx_handler(void *handle, const __be64 *rsp,
+			const struct pkt_gl *gl)
+{
+	struct uld_ctx *ctx = handle;
+	struct c4iw_dev *dev = ctx->dev;
+	struct sk_buff *skb;
+	u8 opcode;
+
+	if (gl == NULL) {
+		/* omit RSS and rsp_ctrl at end of descriptor */
+		unsigned int len = 64 - sizeof(struct rsp_ctrl) - 8;
+
+		skb = alloc_skb(256, GFP_ATOMIC);
+		if (!skb)
+			goto nomem;
+		__skb_put(skb, len);
+		skb_copy_to_linear_data(skb, &rsp[1], len);
+	} else if (gl == CXGB4_MSG_AN) {
+		const struct rsp_ctrl *rc = (void *)rsp;
+
+		u32 qid = be32_to_cpu(rc->pldbuflen_qid);
+		c4iw_ev_handler(dev, qid);
+		return 0;
+	} else if (unlikely(*(u8 *)rsp != *(u8 *)gl->va)) {
+		if (recv_rx_pkt(dev, gl, rsp))
+			return 0;
+
+		pr_info("%s: unexpected FL contents at %p, " \
+		       "RSS %#llx, FL %#llx, len %u\n",
+		       pci_name(ctx->lldi.pdev), gl->va,
+		       (unsigned long long)be64_to_cpu(*rsp),
+		       (unsigned long long)be64_to_cpu(
+		       *(__force __be64 *)gl->va),
+		       gl->tot_len);
+
+		return 0;
+	} else {
+		skb = cxgb4_pktgl_to_skb(gl, 128, 128);
+		if (unlikely(!skb))
+			goto nomem;
+	}
+
+	opcode = *(u8 *)rsp;
+	if (c4iw_handlers[opcode]) {
+		c4iw_handlers[opcode](dev, skb);
+	} else {
+		pr_info("%s no handler opcode 0x%x...\n", __func__,
+		       opcode);
+		kfree_skb(skb);
+	}
+
+	return 0;
+nomem:
+	return -1;
+}
+
+static int c4iw_uld_state_change(void *handle, enum cxgb4_state new_state)
+{
+	struct uld_ctx *ctx = handle;
+
+	PDBG("%s new_state %u\n", __func__, new_state);
+	switch (new_state) {
+	case CXGB4_STATE_UP:
+		printk(KERN_INFO MOD "%s: Up\n", pci_name(ctx->lldi.pdev));
+		if (!ctx->dev) {
+			int ret;
+
+			ctx->dev = c4iw_alloc(&ctx->lldi);
+			if (IS_ERR(ctx->dev)) {
+				printk(KERN_ERR MOD
+				       "%s: initialization failed: %ld\n",
+				       pci_name(ctx->lldi.pdev),
+				       PTR_ERR(ctx->dev));
+				ctx->dev = NULL;
+				break;
+			}
+			ret = c4iw_register_device(ctx->dev);
+			if (ret) {
+				printk(KERN_ERR MOD
+				       "%s: RDMA registration failed: %d\n",
+				       pci_name(ctx->lldi.pdev), ret);
+				c4iw_dealloc(ctx);
+			}
+		}
+		break;
+	case CXGB4_STATE_DOWN:
+		printk(KERN_INFO MOD "%s: Down\n",
+		       pci_name(ctx->lldi.pdev));
+		if (ctx->dev)
+			c4iw_remove(ctx);
+		break;
+	case CXGB4_STATE_START_RECOVERY:
+		printk(KERN_INFO MOD "%s: Fatal Error\n",
+		       pci_name(ctx->lldi.pdev));
+		if (ctx->dev) {
+			struct ib_event event;
+
+			ctx->dev->rdev.flags |= T4_FATAL_ERROR;
+			memset(&event, 0, sizeof event);
+			event.event  = IB_EVENT_DEVICE_FATAL;
+			event.device = &ctx->dev->ibdev;
+			ib_dispatch_event(&event);
+			c4iw_remove(ctx);
+		}
+		break;
+	case CXGB4_STATE_DETACH:
+		printk(KERN_INFO MOD "%s: Detach\n",
+		       pci_name(ctx->lldi.pdev));
+		if (ctx->dev)
+			c4iw_remove(ctx);
+		break;
+	}
+	return 0;
+}
+
+static int disable_qp_db(int id, void *p, void *data)
+{
+	struct c4iw_qp *qp = p;
+
+	t4_disable_wq_db(&qp->wq);
+	return 0;
+}
+
+static void stop_queues(struct uld_ctx *ctx)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&ctx->dev->lock, flags);
+	ctx->dev->rdev.stats.db_state_transitions++;
+	ctx->dev->db_state = STOPPED;
+	if (ctx->dev->rdev.flags & T4_STATUS_PAGE_DISABLED)
+		idr_for_each(&ctx->dev->qpidr, disable_qp_db, NULL);
+	else
+		ctx->dev->rdev.status_page->db_off = 1;
+	spin_unlock_irqrestore(&ctx->dev->lock, flags);
+}
+
+static int enable_qp_db(int id, void *p, void *data)
+{
+	struct c4iw_qp *qp = p;
+
+	t4_enable_wq_db(&qp->wq);
+	return 0;
+}
+
+static void resume_rc_qp(struct c4iw_qp *qp)
+{
+	spin_lock(&qp->lock);
+	t4_ring_sq_db(&qp->wq, qp->wq.sq.wq_pidx_inc,
+		      is_t5(qp->rhp->rdev.lldi.adapter_type), NULL);
+	qp->wq.sq.wq_pidx_inc = 0;
+	t4_ring_rq_db(&qp->wq, qp->wq.rq.wq_pidx_inc,
+		      is_t5(qp->rhp->rdev.lldi.adapter_type), NULL);
+	qp->wq.rq.wq_pidx_inc = 0;
+	spin_unlock(&qp->lock);
+}
+
+static void resume_a_chunk(struct uld_ctx *ctx)
+{
+	int i;
+	struct c4iw_qp *qp;
+
+	for (i = 0; i < DB_FC_RESUME_SIZE; i++) {
+		qp = list_first_entry(&ctx->dev->db_fc_list, struct c4iw_qp,
+				      db_fc_entry);
+		list_del_init(&qp->db_fc_entry);
+		resume_rc_qp(qp);
+		if (list_empty(&ctx->dev->db_fc_list))
+			break;
+	}
+}
+
+static void resume_queues(struct uld_ctx *ctx)
+{
+	spin_lock_irq(&ctx->dev->lock);
+	if (ctx->dev->db_state != STOPPED)
+		goto out;
+	ctx->dev->db_state = FLOW_CONTROL;
+	while (1) {
+		if (list_empty(&ctx->dev->db_fc_list)) {
+			WARN_ON(ctx->dev->db_state != FLOW_CONTROL);
+			ctx->dev->db_state = NORMAL;
+			ctx->dev->rdev.stats.db_state_transitions++;
+			if (ctx->dev->rdev.flags & T4_STATUS_PAGE_DISABLED) {
+				idr_for_each(&ctx->dev->qpidr, enable_qp_db,
+					     NULL);
+			} else {
+				ctx->dev->rdev.status_page->db_off = 0;
+			}
+			break;
+		} else {
+			if (cxgb4_dbfifo_count(ctx->dev->rdev.lldi.ports[0], 1)
+			    < (ctx->dev->rdev.lldi.dbfifo_int_thresh <<
+			       DB_FC_DRAIN_THRESH)) {
+				resume_a_chunk(ctx);
+			}
+			if (!list_empty(&ctx->dev->db_fc_list)) {
+				spin_unlock_irq(&ctx->dev->lock);
+				if (DB_FC_RESUME_DELAY) {
+					set_current_state(TASK_UNINTERRUPTIBLE);
+					schedule_timeout(DB_FC_RESUME_DELAY);
+				}
+				spin_lock_irq(&ctx->dev->lock);
+				if (ctx->dev->db_state != FLOW_CONTROL)
+					break;
+			}
+		}
+	}
+out:
+	if (ctx->dev->db_state != NORMAL)
+		ctx->dev->rdev.stats.db_fc_interruptions++;
+	spin_unlock_irq(&ctx->dev->lock);
+}
+
+struct qp_list {
+	unsigned idx;
+	struct c4iw_qp **qps;
+};
+
+static int add_and_ref_qp(int id, void *p, void *data)
+{
+	struct qp_list *qp_listp = data;
+	struct c4iw_qp *qp = p;
+
+	c4iw_qp_add_ref(&qp->ibqp);
+	qp_listp->qps[qp_listp->idx++] = qp;
+	return 0;
+}
+
+static int count_qps(int id, void *p, void *data)
+{
+	unsigned *countp = data;
+	(*countp)++;
+	return 0;
+}
+
+static void deref_qps(struct qp_list *qp_list)
+{
+	int idx;
+
+	for (idx = 0; idx < qp_list->idx; idx++)
+		c4iw_qp_rem_ref(&qp_list->qps[idx]->ibqp);
+}
+
+static void recover_lost_dbs(struct uld_ctx *ctx, struct qp_list *qp_list)
+{
+	int idx;
+	int ret;
+
+	for (idx = 0; idx < qp_list->idx; idx++) {
+		struct c4iw_qp *qp = qp_list->qps[idx];
+
+		spin_lock_irq(&qp->rhp->lock);
+		spin_lock(&qp->lock);
+		ret = cxgb4_sync_txq_pidx(qp->rhp->rdev.lldi.ports[0],
+					  qp->wq.sq.qid,
+					  t4_sq_host_wq_pidx(&qp->wq),
+					  t4_sq_wq_size(&qp->wq));
+		if (ret) {
+			pr_err(KERN_ERR MOD "%s: Fatal error - "
+			       "DB overflow recovery failed - "
+			       "error syncing SQ qid %u\n",
+			       pci_name(ctx->lldi.pdev), qp->wq.sq.qid);
+			spin_unlock(&qp->lock);
+			spin_unlock_irq(&qp->rhp->lock);
+			return;
+		}
+		qp->wq.sq.wq_pidx_inc = 0;
+
+		ret = cxgb4_sync_txq_pidx(qp->rhp->rdev.lldi.ports[0],
+					  qp->wq.rq.qid,
+					  t4_rq_host_wq_pidx(&qp->wq),
+					  t4_rq_wq_size(&qp->wq));
+
+		if (ret) {
+			pr_err(KERN_ERR MOD "%s: Fatal error - "
+			       "DB overflow recovery failed - "
+			       "error syncing RQ qid %u\n",
+			       pci_name(ctx->lldi.pdev), qp->wq.rq.qid);
+			spin_unlock(&qp->lock);
+			spin_unlock_irq(&qp->rhp->lock);
+			return;
+		}
+		qp->wq.rq.wq_pidx_inc = 0;
+		spin_unlock(&qp->lock);
+		spin_unlock_irq(&qp->rhp->lock);
+
+		/* Wait for the dbfifo to drain */
+		while (cxgb4_dbfifo_count(qp->rhp->rdev.lldi.ports[0], 1) > 0) {
+			set_current_state(TASK_UNINTERRUPTIBLE);
+			schedule_timeout(usecs_to_jiffies(10));
+		}
+	}
+}
+
+static void recover_queues(struct uld_ctx *ctx)
+{
+	int count = 0;
+	struct qp_list qp_list;
+	int ret;
+
+	/* slow everybody down */
+	set_current_state(TASK_UNINTERRUPTIBLE);
+	schedule_timeout(usecs_to_jiffies(1000));
+
+	/* flush the SGE contexts */
+	ret = cxgb4_flush_eq_cache(ctx->dev->rdev.lldi.ports[0]);
+	if (ret) {
+		printk(KERN_ERR MOD "%s: Fatal error - DB overflow recovery failed\n",
+		       pci_name(ctx->lldi.pdev));
+		return;
+	}
+
+	/* Count active queues so we can build a list of queues to recover */
+	spin_lock_irq(&ctx->dev->lock);
+	WARN_ON(ctx->dev->db_state != STOPPED);
+	ctx->dev->db_state = RECOVERY;
+	idr_for_each(&ctx->dev->qpidr, count_qps, &count);
+
+	qp_list.qps = kzalloc(count * sizeof *qp_list.qps, GFP_ATOMIC);
+	if (!qp_list.qps) {
+		printk(KERN_ERR MOD "%s: Fatal error - DB overflow recovery failed\n",
+		       pci_name(ctx->lldi.pdev));
+		spin_unlock_irq(&ctx->dev->lock);
+		return;
+	}
+	qp_list.idx = 0;
+
+	/* add and ref each qp so it doesn't get freed */
+	idr_for_each(&ctx->dev->qpidr, add_and_ref_qp, &qp_list);
+
+	spin_unlock_irq(&ctx->dev->lock);
+
+	/* now traverse the list in a safe context to recover the db state*/
+	recover_lost_dbs(ctx, &qp_list);
+
+	/* we're almost done!  deref the qps and clean up */
+	deref_qps(&qp_list);
+	kfree(qp_list.qps);
+
+	spin_lock_irq(&ctx->dev->lock);
+	WARN_ON(ctx->dev->db_state != RECOVERY);
+	ctx->dev->db_state = STOPPED;
+	spin_unlock_irq(&ctx->dev->lock);
+}
+
+static int c4iw_uld_control(void *handle, enum cxgb4_control control, ...)
+{
+	struct uld_ctx *ctx = handle;
+
+	switch (control) {
+	case CXGB4_CONTROL_DB_FULL:
+		stop_queues(ctx);
+		ctx->dev->rdev.stats.db_full++;
+		break;
+	case CXGB4_CONTROL_DB_EMPTY:
+		resume_queues(ctx);
+		mutex_lock(&ctx->dev->rdev.stats.lock);
+		ctx->dev->rdev.stats.db_empty++;
+		mutex_unlock(&ctx->dev->rdev.stats.lock);
+		break;
+	case CXGB4_CONTROL_DB_DROP:
+		recover_queues(ctx);
+		mutex_lock(&ctx->dev->rdev.stats.lock);
+		ctx->dev->rdev.stats.db_drop++;
+		mutex_unlock(&ctx->dev->rdev.stats.lock);
+		break;
+	default:
+		printk(KERN_WARNING MOD "%s: unknown control cmd %u\n",
+		       pci_name(ctx->lldi.pdev), control);
+		break;
+	}
+	return 0;
+}
+
+static struct cxgb4_uld_info c4iw_uld_info = {
+	.name = DRV_NAME,
+	.add = c4iw_uld_add,
+	.rx_handler = c4iw_uld_rx_handler,
+	.state_change = c4iw_uld_state_change,
+	.control = c4iw_uld_control,
+};
+
+static int __init c4iw_init_module(void)
+{
+	int err;
+
+	err = c4iw_cm_init();
+	if (err)
+		return err;
+
+	c4iw_debugfs_root = debugfs_create_dir(DRV_NAME, NULL);
+	if (!c4iw_debugfs_root)
+		printk(KERN_WARNING MOD
+		       "could not create debugfs entry, continuing\n");
+
+	if (ibnl_add_client(RDMA_NL_C4IW, RDMA_NL_IWPM_NUM_OPS,
+			    c4iw_nl_cb_table))
+		pr_err("%s[%u]: Failed to add netlink callback\n"
+		       , __func__, __LINE__);
+
+	err = iwpm_init(RDMA_NL_C4IW);
+	if (err) {
+		pr_err("port mapper initialization failed with %d\n", err);
+		ibnl_remove_client(RDMA_NL_C4IW);
+		c4iw_cm_term();
+		debugfs_remove_recursive(c4iw_debugfs_root);
+		return err;
+	}
+
+	cxgb4_register_uld(CXGB4_ULD_RDMA, &c4iw_uld_info);
+
+	return 0;
+}
+
+static void __exit c4iw_exit_module(void)
+{
+	struct uld_ctx *ctx, *tmp;
+
+	mutex_lock(&dev_mutex);
+	list_for_each_entry_safe(ctx, tmp, &uld_ctx_list, entry) {
+		if (ctx->dev)
+			c4iw_remove(ctx);
+		kfree(ctx);
+	}
+	mutex_unlock(&dev_mutex);
+	cxgb4_unregister_uld(CXGB4_ULD_RDMA);
+	iwpm_exit(RDMA_NL_C4IW);
+	ibnl_remove_client(RDMA_NL_C4IW);
+	c4iw_cm_term();
+	debugfs_remove_recursive(c4iw_debugfs_root);
+}
+
+module_init(c4iw_init_module);
+module_exit(c4iw_exit_module);
diff --git a/drivers/infiniband/hw/cxgb4/ev.c b/drivers/infiniband/hw/cxgb4/ev.c
new file mode 100644
index 0000000..c9df054
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb4/ev.c
@@ -0,0 +1,237 @@
+/*
+ * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/slab.h>
+#include <linux/mman.h>
+#include <net/sock.h>
+
+#include "iw_cxgb4.h"
+
+static void print_tpte(struct c4iw_dev *dev, u32 stag)
+{
+	int ret;
+	struct fw_ri_tpte tpte;
+
+	ret = cxgb4_read_tpte(dev->rdev.lldi.ports[0], stag,
+			      (__be32 *)&tpte);
+	if (ret) {
+		dev_err(&dev->rdev.lldi.pdev->dev,
+			"%s cxgb4_read_tpte err %d\n", __func__, ret);
+		return;
+	}
+	PDBG("stag idx 0x%x valid %d key 0x%x state %d pdid %d "
+	       "perm 0x%x ps %d len 0x%llx va 0x%llx\n",
+	       stag & 0xffffff00,
+	       G_FW_RI_TPTE_VALID(ntohl(tpte.valid_to_pdid)),
+	       G_FW_RI_TPTE_STAGKEY(ntohl(tpte.valid_to_pdid)),
+	       G_FW_RI_TPTE_STAGSTATE(ntohl(tpte.valid_to_pdid)),
+	       G_FW_RI_TPTE_PDID(ntohl(tpte.valid_to_pdid)),
+	       G_FW_RI_TPTE_PERM(ntohl(tpte.locread_to_qpid)),
+	       G_FW_RI_TPTE_PS(ntohl(tpte.locread_to_qpid)),
+	       ((u64)ntohl(tpte.len_hi) << 32) | ntohl(tpte.len_lo),
+	       ((u64)ntohl(tpte.va_hi) << 32) | ntohl(tpte.va_lo_fbo));
+}
+
+static void dump_err_cqe(struct c4iw_dev *dev, struct t4_cqe *err_cqe)
+{
+	__be64 *p = (void *)err_cqe;
+
+	dev_err(&dev->rdev.lldi.pdev->dev,
+		"AE qpid %d opcode %d status 0x%x "
+		"type %d len 0x%x wrid.hi 0x%x wrid.lo 0x%x\n",
+		CQE_QPID(err_cqe), CQE_OPCODE(err_cqe),
+		CQE_STATUS(err_cqe), CQE_TYPE(err_cqe), ntohl(err_cqe->len),
+		CQE_WRID_HI(err_cqe), CQE_WRID_LOW(err_cqe));
+
+	PDBG("%016llx %016llx %016llx %016llx\n",
+	     be64_to_cpu(p[0]), be64_to_cpu(p[1]), be64_to_cpu(p[2]),
+	     be64_to_cpu(p[3]));
+
+	/*
+	 * Ingress WRITE and READ_RESP errors provide
+	 * the offending stag, so parse and log it.
+	 */
+	if (RQ_TYPE(err_cqe) && (CQE_OPCODE(err_cqe) == FW_RI_RDMA_WRITE ||
+				 CQE_OPCODE(err_cqe) == FW_RI_READ_RESP))
+		print_tpte(dev, CQE_WRID_STAG(err_cqe));
+}
+
+static void post_qp_event(struct c4iw_dev *dev, struct c4iw_cq *chp,
+			  struct c4iw_qp *qhp,
+			  struct t4_cqe *err_cqe,
+			  enum ib_event_type ib_event)
+{
+	struct ib_event event;
+	struct c4iw_qp_attributes attrs;
+	unsigned long flag;
+
+	dump_err_cqe(dev, err_cqe);
+
+	if (qhp->attr.state == C4IW_QP_STATE_RTS) {
+		attrs.next_state = C4IW_QP_STATE_TERMINATE;
+		c4iw_modify_qp(qhp->rhp, qhp, C4IW_QP_ATTR_NEXT_STATE,
+			       &attrs, 0);
+	}
+
+	event.event = ib_event;
+	event.device = chp->ibcq.device;
+	if (ib_event == IB_EVENT_CQ_ERR)
+		event.element.cq = &chp->ibcq;
+	else
+		event.element.qp = &qhp->ibqp;
+	if (qhp->ibqp.event_handler)
+		(*qhp->ibqp.event_handler)(&event, qhp->ibqp.qp_context);
+
+	spin_lock_irqsave(&chp->comp_handler_lock, flag);
+	(*chp->ibcq.comp_handler)(&chp->ibcq, chp->ibcq.cq_context);
+	spin_unlock_irqrestore(&chp->comp_handler_lock, flag);
+}
+
+void c4iw_ev_dispatch(struct c4iw_dev *dev, struct t4_cqe *err_cqe)
+{
+	struct c4iw_cq *chp;
+	struct c4iw_qp *qhp;
+	u32 cqid;
+
+	spin_lock_irq(&dev->lock);
+	qhp = get_qhp(dev, CQE_QPID(err_cqe));
+	if (!qhp) {
+		printk(KERN_ERR MOD "BAD AE qpid 0x%x opcode %d "
+		       "status 0x%x type %d wrid.hi 0x%x wrid.lo 0x%x\n",
+		       CQE_QPID(err_cqe),
+		       CQE_OPCODE(err_cqe), CQE_STATUS(err_cqe),
+		       CQE_TYPE(err_cqe), CQE_WRID_HI(err_cqe),
+		       CQE_WRID_LOW(err_cqe));
+		spin_unlock_irq(&dev->lock);
+		goto out;
+	}
+
+	if (SQ_TYPE(err_cqe))
+		cqid = qhp->attr.scq;
+	else
+		cqid = qhp->attr.rcq;
+	chp = get_chp(dev, cqid);
+	if (!chp) {
+		printk(KERN_ERR MOD "BAD AE cqid 0x%x qpid 0x%x opcode %d "
+		       "status 0x%x type %d wrid.hi 0x%x wrid.lo 0x%x\n",
+		       cqid, CQE_QPID(err_cqe),
+		       CQE_OPCODE(err_cqe), CQE_STATUS(err_cqe),
+		       CQE_TYPE(err_cqe), CQE_WRID_HI(err_cqe),
+		       CQE_WRID_LOW(err_cqe));
+		spin_unlock_irq(&dev->lock);
+		goto out;
+	}
+
+	c4iw_qp_add_ref(&qhp->ibqp);
+	atomic_inc(&chp->refcnt);
+	spin_unlock_irq(&dev->lock);
+
+	/* Bad incoming write */
+	if (RQ_TYPE(err_cqe) &&
+	    (CQE_OPCODE(err_cqe) == FW_RI_RDMA_WRITE)) {
+		post_qp_event(dev, chp, qhp, err_cqe, IB_EVENT_QP_REQ_ERR);
+		goto done;
+	}
+
+	switch (CQE_STATUS(err_cqe)) {
+
+	/* Completion Events */
+	case T4_ERR_SUCCESS:
+		printk(KERN_ERR MOD "AE with status 0!\n");
+		break;
+
+	case T4_ERR_STAG:
+	case T4_ERR_PDID:
+	case T4_ERR_QPID:
+	case T4_ERR_ACCESS:
+	case T4_ERR_WRAP:
+	case T4_ERR_BOUND:
+	case T4_ERR_INVALIDATE_SHARED_MR:
+	case T4_ERR_INVALIDATE_MR_WITH_MW_BOUND:
+		post_qp_event(dev, chp, qhp, err_cqe, IB_EVENT_QP_ACCESS_ERR);
+		break;
+
+	/* Device Fatal Errors */
+	case T4_ERR_ECC:
+	case T4_ERR_ECC_PSTAG:
+	case T4_ERR_INTERNAL_ERR:
+		post_qp_event(dev, chp, qhp, err_cqe, IB_EVENT_DEVICE_FATAL);
+		break;
+
+	/* QP Fatal Errors */
+	case T4_ERR_OUT_OF_RQE:
+	case T4_ERR_PBL_ADDR_BOUND:
+	case T4_ERR_CRC:
+	case T4_ERR_MARKER:
+	case T4_ERR_PDU_LEN_ERR:
+	case T4_ERR_DDP_VERSION:
+	case T4_ERR_RDMA_VERSION:
+	case T4_ERR_OPCODE:
+	case T4_ERR_DDP_QUEUE_NUM:
+	case T4_ERR_MSN:
+	case T4_ERR_TBIT:
+	case T4_ERR_MO:
+	case T4_ERR_MSN_GAP:
+	case T4_ERR_MSN_RANGE:
+	case T4_ERR_RQE_ADDR_BOUND:
+	case T4_ERR_IRD_OVERFLOW:
+		post_qp_event(dev, chp, qhp, err_cqe, IB_EVENT_QP_FATAL);
+		break;
+
+	default:
+		printk(KERN_ERR MOD "Unknown T4 status 0x%x QPID 0x%x\n",
+		       CQE_STATUS(err_cqe), qhp->wq.sq.qid);
+		post_qp_event(dev, chp, qhp, err_cqe, IB_EVENT_QP_FATAL);
+		break;
+	}
+done:
+	if (atomic_dec_and_test(&chp->refcnt))
+		wake_up(&chp->wait);
+	c4iw_qp_rem_ref(&qhp->ibqp);
+out:
+	return;
+}
+
+int c4iw_ev_handler(struct c4iw_dev *dev, u32 qid)
+{
+	struct c4iw_cq *chp;
+	unsigned long flag;
+
+	chp = get_chp(dev, qid);
+	if (chp) {
+		t4_clear_cq_armed(&chp->cq);
+		spin_lock_irqsave(&chp->comp_handler_lock, flag);
+		(*chp->ibcq.comp_handler)(&chp->ibcq, chp->ibcq.cq_context);
+		spin_unlock_irqrestore(&chp->comp_handler_lock, flag);
+	} else
+		PDBG("%s unknown cqid 0x%x\n", __func__, qid);
+	return 0;
+}
diff --git a/drivers/infiniband/hw/cxgb4/id_table.c b/drivers/infiniband/hw/cxgb4/id_table.c
new file mode 100644
index 0000000..0161ae6
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb4/id_table.c
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2011 Chelsio Communications.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/kernel.h>
+#include <linux/random.h>
+#include "iw_cxgb4.h"
+
+#define RANDOM_SKIP 16
+
+/*
+ * Trivial bitmap-based allocator. If the random flag is set, the
+ * allocator is designed to:
+ * - pseudo-randomize the id returned such that it is not trivially predictable.
+ * - avoid reuse of recently used id (at the expense of predictability)
+ */
+u32 c4iw_id_alloc(struct c4iw_id_table *alloc)
+{
+	unsigned long flags;
+	u32 obj;
+
+	spin_lock_irqsave(&alloc->lock, flags);
+
+	obj = find_next_zero_bit(alloc->table, alloc->max, alloc->last);
+	if (obj >= alloc->max)
+		obj = find_first_zero_bit(alloc->table, alloc->max);
+
+	if (obj < alloc->max) {
+		if (alloc->flags & C4IW_ID_TABLE_F_RANDOM)
+			alloc->last += prandom_u32() % RANDOM_SKIP;
+		else
+			alloc->last = obj + 1;
+		if (alloc->last >= alloc->max)
+			alloc->last = 0;
+		set_bit(obj, alloc->table);
+		obj += alloc->start;
+	} else
+		obj = -1;
+
+	spin_unlock_irqrestore(&alloc->lock, flags);
+	return obj;
+}
+
+void c4iw_id_free(struct c4iw_id_table *alloc, u32 obj)
+{
+	unsigned long flags;
+
+	obj -= alloc->start;
+	BUG_ON((int)obj < 0);
+
+	spin_lock_irqsave(&alloc->lock, flags);
+	clear_bit(obj, alloc->table);
+	spin_unlock_irqrestore(&alloc->lock, flags);
+}
+
+int c4iw_id_table_alloc(struct c4iw_id_table *alloc, u32 start, u32 num,
+			u32 reserved, u32 flags)
+{
+	int i;
+
+	alloc->start = start;
+	alloc->flags = flags;
+	if (flags & C4IW_ID_TABLE_F_RANDOM)
+		alloc->last = prandom_u32() % RANDOM_SKIP;
+	else
+		alloc->last = 0;
+	alloc->max  = num;
+	spin_lock_init(&alloc->lock);
+	alloc->table = kmalloc(BITS_TO_LONGS(num) * sizeof(long),
+				GFP_KERNEL);
+	if (!alloc->table)
+		return -ENOMEM;
+
+	bitmap_zero(alloc->table, num);
+	if (!(alloc->flags & C4IW_ID_TABLE_F_EMPTY))
+		for (i = 0; i < reserved; ++i)
+			set_bit(i, alloc->table);
+
+	return 0;
+}
+
+void c4iw_id_table_free(struct c4iw_id_table *alloc)
+{
+	kfree(alloc->table);
+}
diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
new file mode 100644
index 0000000..b5678ac
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
@@ -0,0 +1,1036 @@
+/*
+ * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer.
+ *      - Redistributions in binary form must reproduce the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer in the documentation and/or other materials
+ *	  provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __IW_CXGB4_H__
+#define __IW_CXGB4_H__
+
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/idr.h>
+#include <linux/completion.h>
+#include <linux/netdevice.h>
+#include <linux/sched.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <linux/inet.h>
+#include <linux/wait.h>
+#include <linux/kref.h>
+#include <linux/timer.h>
+#include <linux/io.h>
+
+#include <asm/byteorder.h>
+
+#include <net/net_namespace.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/iw_cm.h>
+#include <rdma/rdma_netlink.h>
+#include <rdma/iw_portmap.h>
+
+#include "cxgb4.h"
+#include "cxgb4_uld.h"
+#include "l2t.h"
+#include "user.h"
+
+#define DRV_NAME "iw_cxgb4"
+#define MOD DRV_NAME ":"
+
+extern int c4iw_debug;
+#define PDBG(fmt, args...) \
+do { \
+	if (c4iw_debug) \
+		printk(MOD fmt, ## args); \
+} while (0)
+
+#include "t4.h"
+
+#define PBL_OFF(rdev_p, a) ((a) - (rdev_p)->lldi.vr->pbl.start)
+#define RQT_OFF(rdev_p, a) ((a) - (rdev_p)->lldi.vr->rq.start)
+
+static inline void *cplhdr(struct sk_buff *skb)
+{
+	return skb->data;
+}
+
+#define C4IW_ID_TABLE_F_RANDOM 1       /* Pseudo-randomize the id's returned */
+#define C4IW_ID_TABLE_F_EMPTY  2       /* Table is initially empty */
+
+struct c4iw_id_table {
+	u32 flags;
+	u32 start;              /* logical minimal id */
+	u32 last;               /* hint for find */
+	u32 max;
+	spinlock_t lock;
+	unsigned long *table;
+};
+
+struct c4iw_resource {
+	struct c4iw_id_table tpt_table;
+	struct c4iw_id_table qid_table;
+	struct c4iw_id_table pdid_table;
+};
+
+struct c4iw_qid_list {
+	struct list_head entry;
+	u32 qid;
+};
+
+struct c4iw_dev_ucontext {
+	struct list_head qpids;
+	struct list_head cqids;
+	struct mutex lock;
+};
+
+enum c4iw_rdev_flags {
+	T4_FATAL_ERROR = (1<<0),
+	T4_STATUS_PAGE_DISABLED = (1<<1),
+};
+
+struct c4iw_stat {
+	u64 total;
+	u64 cur;
+	u64 max;
+	u64 fail;
+};
+
+struct c4iw_stats {
+	struct mutex lock;
+	struct c4iw_stat qid;
+	struct c4iw_stat pd;
+	struct c4iw_stat stag;
+	struct c4iw_stat pbl;
+	struct c4iw_stat rqt;
+	struct c4iw_stat ocqp;
+	u64  db_full;
+	u64  db_empty;
+	u64  db_drop;
+	u64  db_state_transitions;
+	u64  db_fc_interruptions;
+	u64  tcam_full;
+	u64  act_ofld_conn_fails;
+	u64  pas_ofld_conn_fails;
+};
+
+struct c4iw_hw_queue {
+	int t4_eq_status_entries;
+	int t4_max_eq_size;
+	int t4_max_iq_size;
+	int t4_max_rq_size;
+	int t4_max_sq_size;
+	int t4_max_qp_depth;
+	int t4_max_cq_depth;
+	int t4_stat_len;
+};
+
+struct wr_log_entry {
+	struct timespec post_host_ts;
+	struct timespec poll_host_ts;
+	u64 post_sge_ts;
+	u64 cqe_sge_ts;
+	u64 poll_sge_ts;
+	u16 qid;
+	u16 wr_id;
+	u8 opcode;
+	u8 valid;
+};
+
+struct c4iw_rdev {
+	struct c4iw_resource resource;
+	unsigned long qpshift;
+	u32 qpmask;
+	unsigned long cqshift;
+	u32 cqmask;
+	struct c4iw_dev_ucontext uctx;
+	struct gen_pool *pbl_pool;
+	struct gen_pool *rqt_pool;
+	struct gen_pool *ocqp_pool;
+	u32 flags;
+	struct cxgb4_lld_info lldi;
+	unsigned long bar2_pa;
+	void __iomem *bar2_kva;
+	unsigned long oc_mw_pa;
+	void __iomem *oc_mw_kva;
+	struct c4iw_stats stats;
+	struct c4iw_hw_queue hw_queue;
+	struct t4_dev_status_page *status_page;
+	atomic_t wr_log_idx;
+	struct wr_log_entry *wr_log;
+	int wr_log_size;
+};
+
+static inline int c4iw_fatal_error(struct c4iw_rdev *rdev)
+{
+	return rdev->flags & T4_FATAL_ERROR;
+}
+
+static inline int c4iw_num_stags(struct c4iw_rdev *rdev)
+{
+	return (int)(rdev->lldi.vr->stag.size >> 5);
+}
+
+#define C4IW_WR_TO (30*HZ)
+
+struct c4iw_wr_wait {
+	struct completion completion;
+	int ret;
+};
+
+static inline void c4iw_init_wr_wait(struct c4iw_wr_wait *wr_waitp)
+{
+	wr_waitp->ret = 0;
+	init_completion(&wr_waitp->completion);
+}
+
+static inline void c4iw_wake_up(struct c4iw_wr_wait *wr_waitp, int ret)
+{
+	wr_waitp->ret = ret;
+	complete(&wr_waitp->completion);
+}
+
+static inline int c4iw_wait_for_reply(struct c4iw_rdev *rdev,
+				 struct c4iw_wr_wait *wr_waitp,
+				 u32 hwtid, u32 qpid,
+				 const char *func)
+{
+	unsigned to = C4IW_WR_TO;
+	int ret;
+
+	do {
+		ret = wait_for_completion_timeout(&wr_waitp->completion, to);
+		if (!ret) {
+			printk(KERN_ERR MOD "%s - Device %s not responding - "
+			       "tid %u qpid %u\n", func,
+			       pci_name(rdev->lldi.pdev), hwtid, qpid);
+			if (c4iw_fatal_error(rdev)) {
+				wr_waitp->ret = -EIO;
+				break;
+			}
+			to = to << 2;
+		}
+	} while (!ret);
+	if (wr_waitp->ret)
+		PDBG("%s: FW reply %d tid %u qpid %u\n",
+		     pci_name(rdev->lldi.pdev), wr_waitp->ret, hwtid, qpid);
+	return wr_waitp->ret;
+}
+
+enum db_state {
+	NORMAL = 0,
+	FLOW_CONTROL = 1,
+	RECOVERY = 2,
+	STOPPED = 3
+};
+
+struct c4iw_dev {
+	struct ib_device ibdev;
+	struct c4iw_rdev rdev;
+	u32 device_cap_flags;
+	struct idr cqidr;
+	struct idr qpidr;
+	struct idr mmidr;
+	spinlock_t lock;
+	struct mutex db_mutex;
+	struct dentry *debugfs_root;
+	enum db_state db_state;
+	struct idr hwtid_idr;
+	struct idr atid_idr;
+	struct idr stid_idr;
+	struct list_head db_fc_list;
+	u32 avail_ird;
+};
+
+static inline struct c4iw_dev *to_c4iw_dev(struct ib_device *ibdev)
+{
+	return container_of(ibdev, struct c4iw_dev, ibdev);
+}
+
+static inline struct c4iw_dev *rdev_to_c4iw_dev(struct c4iw_rdev *rdev)
+{
+	return container_of(rdev, struct c4iw_dev, rdev);
+}
+
+static inline struct c4iw_cq *get_chp(struct c4iw_dev *rhp, u32 cqid)
+{
+	return idr_find(&rhp->cqidr, cqid);
+}
+
+static inline struct c4iw_qp *get_qhp(struct c4iw_dev *rhp, u32 qpid)
+{
+	return idr_find(&rhp->qpidr, qpid);
+}
+
+static inline struct c4iw_mr *get_mhp(struct c4iw_dev *rhp, u32 mmid)
+{
+	return idr_find(&rhp->mmidr, mmid);
+}
+
+static inline int _insert_handle(struct c4iw_dev *rhp, struct idr *idr,
+				 void *handle, u32 id, int lock)
+{
+	int ret;
+
+	if (lock) {
+		idr_preload(GFP_KERNEL);
+		spin_lock_irq(&rhp->lock);
+	}
+
+	ret = idr_alloc(idr, handle, id, id + 1, GFP_ATOMIC);
+
+	if (lock) {
+		spin_unlock_irq(&rhp->lock);
+		idr_preload_end();
+	}
+
+	BUG_ON(ret == -ENOSPC);
+	return ret < 0 ? ret : 0;
+}
+
+static inline int insert_handle(struct c4iw_dev *rhp, struct idr *idr,
+				void *handle, u32 id)
+{
+	return _insert_handle(rhp, idr, handle, id, 1);
+}
+
+static inline int insert_handle_nolock(struct c4iw_dev *rhp, struct idr *idr,
+				       void *handle, u32 id)
+{
+	return _insert_handle(rhp, idr, handle, id, 0);
+}
+
+static inline void _remove_handle(struct c4iw_dev *rhp, struct idr *idr,
+				   u32 id, int lock)
+{
+	if (lock)
+		spin_lock_irq(&rhp->lock);
+	idr_remove(idr, id);
+	if (lock)
+		spin_unlock_irq(&rhp->lock);
+}
+
+static inline void remove_handle(struct c4iw_dev *rhp, struct idr *idr, u32 id)
+{
+	_remove_handle(rhp, idr, id, 1);
+}
+
+static inline void remove_handle_nolock(struct c4iw_dev *rhp,
+					 struct idr *idr, u32 id)
+{
+	_remove_handle(rhp, idr, id, 0);
+}
+
+extern uint c4iw_max_read_depth;
+
+static inline int cur_max_read_depth(struct c4iw_dev *dev)
+{
+	return min(dev->rdev.lldi.max_ordird_qp, c4iw_max_read_depth);
+}
+
+struct c4iw_pd {
+	struct ib_pd ibpd;
+	u32 pdid;
+	struct c4iw_dev *rhp;
+};
+
+static inline struct c4iw_pd *to_c4iw_pd(struct ib_pd *ibpd)
+{
+	return container_of(ibpd, struct c4iw_pd, ibpd);
+}
+
+struct tpt_attributes {
+	u64 len;
+	u64 va_fbo;
+	enum fw_ri_mem_perms perms;
+	u32 stag;
+	u32 pdid;
+	u32 qpid;
+	u32 pbl_addr;
+	u32 pbl_size;
+	u32 state:1;
+	u32 type:2;
+	u32 rsvd:1;
+	u32 remote_invaliate_disable:1;
+	u32 zbva:1;
+	u32 mw_bind_enable:1;
+	u32 page_size:5;
+};
+
+struct c4iw_mr {
+	struct ib_mr ibmr;
+	struct ib_umem *umem;
+	struct c4iw_dev *rhp;
+	u64 kva;
+	struct tpt_attributes attr;
+};
+
+static inline struct c4iw_mr *to_c4iw_mr(struct ib_mr *ibmr)
+{
+	return container_of(ibmr, struct c4iw_mr, ibmr);
+}
+
+struct c4iw_mw {
+	struct ib_mw ibmw;
+	struct c4iw_dev *rhp;
+	u64 kva;
+	struct tpt_attributes attr;
+};
+
+static inline struct c4iw_mw *to_c4iw_mw(struct ib_mw *ibmw)
+{
+	return container_of(ibmw, struct c4iw_mw, ibmw);
+}
+
+struct c4iw_fr_page_list {
+	struct ib_fast_reg_page_list ibpl;
+	DEFINE_DMA_UNMAP_ADDR(mapping);
+	dma_addr_t dma_addr;
+	struct c4iw_dev *dev;
+	int pll_len;
+};
+
+static inline struct c4iw_fr_page_list *to_c4iw_fr_page_list(
+					struct ib_fast_reg_page_list *ibpl)
+{
+	return container_of(ibpl, struct c4iw_fr_page_list, ibpl);
+}
+
+struct c4iw_cq {
+	struct ib_cq ibcq;
+	struct c4iw_dev *rhp;
+	struct t4_cq cq;
+	spinlock_t lock;
+	spinlock_t comp_handler_lock;
+	atomic_t refcnt;
+	wait_queue_head_t wait;
+};
+
+static inline struct c4iw_cq *to_c4iw_cq(struct ib_cq *ibcq)
+{
+	return container_of(ibcq, struct c4iw_cq, ibcq);
+}
+
+struct c4iw_mpa_attributes {
+	u8 initiator;
+	u8 recv_marker_enabled;
+	u8 xmit_marker_enabled;
+	u8 crc_enabled;
+	u8 enhanced_rdma_conn;
+	u8 version;
+	u8 p2p_type;
+};
+
+struct c4iw_qp_attributes {
+	u32 scq;
+	u32 rcq;
+	u32 sq_num_entries;
+	u32 rq_num_entries;
+	u32 sq_max_sges;
+	u32 sq_max_sges_rdma_write;
+	u32 rq_max_sges;
+	u32 state;
+	u8 enable_rdma_read;
+	u8 enable_rdma_write;
+	u8 enable_bind;
+	u8 enable_mmid0_fastreg;
+	u32 max_ord;
+	u32 max_ird;
+	u32 pd;
+	u32 next_state;
+	char terminate_buffer[52];
+	u32 terminate_msg_len;
+	u8 is_terminate_local;
+	struct c4iw_mpa_attributes mpa_attr;
+	struct c4iw_ep *llp_stream_handle;
+	u8 layer_etype;
+	u8 ecode;
+	u16 sq_db_inc;
+	u16 rq_db_inc;
+	u8 send_term;
+};
+
+struct c4iw_qp {
+	struct ib_qp ibqp;
+	struct list_head db_fc_entry;
+	struct c4iw_dev *rhp;
+	struct c4iw_ep *ep;
+	struct c4iw_qp_attributes attr;
+	struct t4_wq wq;
+	spinlock_t lock;
+	struct mutex mutex;
+	atomic_t refcnt;
+	wait_queue_head_t wait;
+	struct timer_list timer;
+	int sq_sig_all;
+};
+
+static inline struct c4iw_qp *to_c4iw_qp(struct ib_qp *ibqp)
+{
+	return container_of(ibqp, struct c4iw_qp, ibqp);
+}
+
+struct c4iw_ucontext {
+	struct ib_ucontext ibucontext;
+	struct c4iw_dev_ucontext uctx;
+	u32 key;
+	spinlock_t mmap_lock;
+	struct list_head mmaps;
+};
+
+static inline struct c4iw_ucontext *to_c4iw_ucontext(struct ib_ucontext *c)
+{
+	return container_of(c, struct c4iw_ucontext, ibucontext);
+}
+
+struct c4iw_mm_entry {
+	struct list_head entry;
+	u64 addr;
+	u32 key;
+	unsigned len;
+};
+
+static inline struct c4iw_mm_entry *remove_mmap(struct c4iw_ucontext *ucontext,
+						u32 key, unsigned len)
+{
+	struct list_head *pos, *nxt;
+	struct c4iw_mm_entry *mm;
+
+	spin_lock(&ucontext->mmap_lock);
+	list_for_each_safe(pos, nxt, &ucontext->mmaps) {
+
+		mm = list_entry(pos, struct c4iw_mm_entry, entry);
+		if (mm->key == key && mm->len == len) {
+			list_del_init(&mm->entry);
+			spin_unlock(&ucontext->mmap_lock);
+			PDBG("%s key 0x%x addr 0x%llx len %d\n", __func__,
+			     key, (unsigned long long) mm->addr, mm->len);
+			return mm;
+		}
+	}
+	spin_unlock(&ucontext->mmap_lock);
+	return NULL;
+}
+
+static inline void insert_mmap(struct c4iw_ucontext *ucontext,
+			       struct c4iw_mm_entry *mm)
+{
+	spin_lock(&ucontext->mmap_lock);
+	PDBG("%s key 0x%x addr 0x%llx len %d\n", __func__,
+	     mm->key, (unsigned long long) mm->addr, mm->len);
+	list_add_tail(&mm->entry, &ucontext->mmaps);
+	spin_unlock(&ucontext->mmap_lock);
+}
+
+enum c4iw_qp_attr_mask {
+	C4IW_QP_ATTR_NEXT_STATE = 1 << 0,
+	C4IW_QP_ATTR_SQ_DB = 1<<1,
+	C4IW_QP_ATTR_RQ_DB = 1<<2,
+	C4IW_QP_ATTR_ENABLE_RDMA_READ = 1 << 7,
+	C4IW_QP_ATTR_ENABLE_RDMA_WRITE = 1 << 8,
+	C4IW_QP_ATTR_ENABLE_RDMA_BIND = 1 << 9,
+	C4IW_QP_ATTR_MAX_ORD = 1 << 11,
+	C4IW_QP_ATTR_MAX_IRD = 1 << 12,
+	C4IW_QP_ATTR_LLP_STREAM_HANDLE = 1 << 22,
+	C4IW_QP_ATTR_STREAM_MSG_BUFFER = 1 << 23,
+	C4IW_QP_ATTR_MPA_ATTR = 1 << 24,
+	C4IW_QP_ATTR_QP_CONTEXT_ACTIVATE = 1 << 25,
+	C4IW_QP_ATTR_VALID_MODIFY = (C4IW_QP_ATTR_ENABLE_RDMA_READ |
+				     C4IW_QP_ATTR_ENABLE_RDMA_WRITE |
+				     C4IW_QP_ATTR_MAX_ORD |
+				     C4IW_QP_ATTR_MAX_IRD |
+				     C4IW_QP_ATTR_LLP_STREAM_HANDLE |
+				     C4IW_QP_ATTR_STREAM_MSG_BUFFER |
+				     C4IW_QP_ATTR_MPA_ATTR |
+				     C4IW_QP_ATTR_QP_CONTEXT_ACTIVATE)
+};
+
+int c4iw_modify_qp(struct c4iw_dev *rhp,
+				struct c4iw_qp *qhp,
+				enum c4iw_qp_attr_mask mask,
+				struct c4iw_qp_attributes *attrs,
+				int internal);
+
+enum c4iw_qp_state {
+	C4IW_QP_STATE_IDLE,
+	C4IW_QP_STATE_RTS,
+	C4IW_QP_STATE_ERROR,
+	C4IW_QP_STATE_TERMINATE,
+	C4IW_QP_STATE_CLOSING,
+	C4IW_QP_STATE_TOT
+};
+
+static inline int c4iw_convert_state(enum ib_qp_state ib_state)
+{
+	switch (ib_state) {
+	case IB_QPS_RESET:
+	case IB_QPS_INIT:
+		return C4IW_QP_STATE_IDLE;
+	case IB_QPS_RTS:
+		return C4IW_QP_STATE_RTS;
+	case IB_QPS_SQD:
+		return C4IW_QP_STATE_CLOSING;
+	case IB_QPS_SQE:
+		return C4IW_QP_STATE_TERMINATE;
+	case IB_QPS_ERR:
+		return C4IW_QP_STATE_ERROR;
+	default:
+		return -1;
+	}
+}
+
+static inline int to_ib_qp_state(int c4iw_qp_state)
+{
+	switch (c4iw_qp_state) {
+	case C4IW_QP_STATE_IDLE:
+		return IB_QPS_INIT;
+	case C4IW_QP_STATE_RTS:
+		return IB_QPS_RTS;
+	case C4IW_QP_STATE_CLOSING:
+		return IB_QPS_SQD;
+	case C4IW_QP_STATE_TERMINATE:
+		return IB_QPS_SQE;
+	case C4IW_QP_STATE_ERROR:
+		return IB_QPS_ERR;
+	}
+	return IB_QPS_ERR;
+}
+
+static inline u32 c4iw_ib_to_tpt_access(int a)
+{
+	return (a & IB_ACCESS_REMOTE_WRITE ? FW_RI_MEM_ACCESS_REM_WRITE : 0) |
+	       (a & IB_ACCESS_REMOTE_READ ? FW_RI_MEM_ACCESS_REM_READ : 0) |
+	       (a & IB_ACCESS_LOCAL_WRITE ? FW_RI_MEM_ACCESS_LOCAL_WRITE : 0) |
+	       FW_RI_MEM_ACCESS_LOCAL_READ;
+}
+
+static inline u32 c4iw_ib_to_tpt_bind_access(int acc)
+{
+	return (acc & IB_ACCESS_REMOTE_WRITE ? FW_RI_MEM_ACCESS_REM_WRITE : 0) |
+	       (acc & IB_ACCESS_REMOTE_READ ? FW_RI_MEM_ACCESS_REM_READ : 0);
+}
+
+enum c4iw_mmid_state {
+	C4IW_STAG_STATE_VALID,
+	C4IW_STAG_STATE_INVALID
+};
+
+#define C4IW_NODE_DESC "cxgb4 Chelsio Communications"
+
+#define MPA_KEY_REQ "MPA ID Req Frame"
+#define MPA_KEY_REP "MPA ID Rep Frame"
+
+#define MPA_MAX_PRIVATE_DATA	256
+#define MPA_ENHANCED_RDMA_CONN	0x10
+#define MPA_REJECT		0x20
+#define MPA_CRC			0x40
+#define MPA_MARKERS		0x80
+#define MPA_FLAGS_MASK		0xE0
+
+#define MPA_V2_PEER2PEER_MODEL          0x8000
+#define MPA_V2_ZERO_LEN_FPDU_RTR        0x4000
+#define MPA_V2_RDMA_WRITE_RTR           0x8000
+#define MPA_V2_RDMA_READ_RTR            0x4000
+#define MPA_V2_IRD_ORD_MASK             0x3FFF
+
+#define c4iw_put_ep(ep) { \
+	PDBG("put_ep (via %s:%u) ep %p refcnt %d\n", __func__, __LINE__,  \
+	     ep, atomic_read(&((ep)->kref.refcount))); \
+	WARN_ON(atomic_read(&((ep)->kref.refcount)) < 1); \
+	kref_put(&((ep)->kref), _c4iw_free_ep); \
+}
+
+#define c4iw_get_ep(ep) { \
+	PDBG("get_ep (via %s:%u) ep %p, refcnt %d\n", __func__, __LINE__, \
+	     ep, atomic_read(&((ep)->kref.refcount))); \
+	kref_get(&((ep)->kref));  \
+}
+void _c4iw_free_ep(struct kref *kref);
+
+struct mpa_message {
+	u8 key[16];
+	u8 flags;
+	u8 revision;
+	__be16 private_data_size;
+	u8 private_data[0];
+};
+
+struct mpa_v2_conn_params {
+	__be16 ird;
+	__be16 ord;
+};
+
+struct terminate_message {
+	u8 layer_etype;
+	u8 ecode;
+	__be16 hdrct_rsvd;
+	u8 len_hdrs[0];
+};
+
+#define TERM_MAX_LENGTH (sizeof(struct terminate_message) + 2 + 18 + 28)
+
+enum c4iw_layers_types {
+	LAYER_RDMAP		= 0x00,
+	LAYER_DDP		= 0x10,
+	LAYER_MPA		= 0x20,
+	RDMAP_LOCAL_CATA	= 0x00,
+	RDMAP_REMOTE_PROT	= 0x01,
+	RDMAP_REMOTE_OP		= 0x02,
+	DDP_LOCAL_CATA		= 0x00,
+	DDP_TAGGED_ERR		= 0x01,
+	DDP_UNTAGGED_ERR	= 0x02,
+	DDP_LLP			= 0x03
+};
+
+enum c4iw_rdma_ecodes {
+	RDMAP_INV_STAG		= 0x00,
+	RDMAP_BASE_BOUNDS	= 0x01,
+	RDMAP_ACC_VIOL		= 0x02,
+	RDMAP_STAG_NOT_ASSOC	= 0x03,
+	RDMAP_TO_WRAP		= 0x04,
+	RDMAP_INV_VERS		= 0x05,
+	RDMAP_INV_OPCODE	= 0x06,
+	RDMAP_STREAM_CATA	= 0x07,
+	RDMAP_GLOBAL_CATA	= 0x08,
+	RDMAP_CANT_INV_STAG	= 0x09,
+	RDMAP_UNSPECIFIED	= 0xff
+};
+
+enum c4iw_ddp_ecodes {
+	DDPT_INV_STAG		= 0x00,
+	DDPT_BASE_BOUNDS	= 0x01,
+	DDPT_STAG_NOT_ASSOC	= 0x02,
+	DDPT_TO_WRAP		= 0x03,
+	DDPT_INV_VERS		= 0x04,
+	DDPU_INV_QN		= 0x01,
+	DDPU_INV_MSN_NOBUF	= 0x02,
+	DDPU_INV_MSN_RANGE	= 0x03,
+	DDPU_INV_MO		= 0x04,
+	DDPU_MSG_TOOBIG		= 0x05,
+	DDPU_INV_VERS		= 0x06
+};
+
+enum c4iw_mpa_ecodes {
+	MPA_CRC_ERR		= 0x02,
+	MPA_MARKER_ERR          = 0x03,
+	MPA_LOCAL_CATA          = 0x05,
+	MPA_INSUFF_IRD          = 0x06,
+	MPA_NOMATCH_RTR         = 0x07,
+};
+
+enum c4iw_ep_state {
+	IDLE = 0,
+	LISTEN,
+	CONNECTING,
+	MPA_REQ_WAIT,
+	MPA_REQ_SENT,
+	MPA_REQ_RCVD,
+	MPA_REP_SENT,
+	FPDU_MODE,
+	ABORTING,
+	CLOSING,
+	MORIBUND,
+	DEAD,
+};
+
+enum c4iw_ep_flags {
+	PEER_ABORT_IN_PROGRESS	= 0,
+	ABORT_REQ_IN_PROGRESS	= 1,
+	RELEASE_RESOURCES	= 2,
+	CLOSE_SENT		= 3,
+	TIMEOUT                 = 4,
+	QP_REFERENCED           = 5,
+	RELEASE_MAPINFO		= 6,
+};
+
+enum c4iw_ep_history {
+	ACT_OPEN_REQ            = 0,
+	ACT_OFLD_CONN           = 1,
+	ACT_OPEN_RPL            = 2,
+	ACT_ESTAB               = 3,
+	PASS_ACCEPT_REQ         = 4,
+	PASS_ESTAB              = 5,
+	ABORT_UPCALL            = 6,
+	ESTAB_UPCALL            = 7,
+	CLOSE_UPCALL            = 8,
+	ULP_ACCEPT              = 9,
+	ULP_REJECT              = 10,
+	TIMEDOUT                = 11,
+	PEER_ABORT              = 12,
+	PEER_CLOSE              = 13,
+	CONNREQ_UPCALL          = 14,
+	ABORT_CONN              = 15,
+	DISCONN_UPCALL          = 16,
+	EP_DISC_CLOSE           = 17,
+	EP_DISC_ABORT           = 18,
+	CONN_RPL_UPCALL         = 19,
+	ACT_RETRY_NOMEM         = 20,
+	ACT_RETRY_INUSE         = 21
+};
+
+struct c4iw_ep_common {
+	struct iw_cm_id *cm_id;
+	struct c4iw_qp *qp;
+	struct c4iw_dev *dev;
+	enum c4iw_ep_state state;
+	struct kref kref;
+	struct mutex mutex;
+	struct sockaddr_storage local_addr;
+	struct sockaddr_storage remote_addr;
+	struct sockaddr_storage mapped_local_addr;
+	struct sockaddr_storage mapped_remote_addr;
+	struct c4iw_wr_wait wr_wait;
+	unsigned long flags;
+	unsigned long history;
+};
+
+struct c4iw_listen_ep {
+	struct c4iw_ep_common com;
+	unsigned int stid;
+	int backlog;
+};
+
+struct c4iw_ep {
+	struct c4iw_ep_common com;
+	struct c4iw_ep *parent_ep;
+	struct timer_list timer;
+	struct list_head entry;
+	unsigned int atid;
+	u32 hwtid;
+	u32 snd_seq;
+	u32 rcv_seq;
+	struct l2t_entry *l2t;
+	struct dst_entry *dst;
+	struct sk_buff *mpa_skb;
+	struct c4iw_mpa_attributes mpa_attr;
+	u8 mpa_pkt[sizeof(struct mpa_message) + MPA_MAX_PRIVATE_DATA];
+	unsigned int mpa_pkt_len;
+	u32 ird;
+	u32 ord;
+	u32 smac_idx;
+	u32 tx_chan;
+	u32 mtu;
+	u16 mss;
+	u16 emss;
+	u16 plen;
+	u16 rss_qid;
+	u16 txq_idx;
+	u16 ctrlq_idx;
+	u8 tos;
+	u8 retry_with_mpa_v1;
+	u8 tried_with_mpa_v1;
+	unsigned int retry_count;
+	int snd_win;
+	int rcv_win;
+};
+
+static inline void print_addr(struct c4iw_ep_common *epc, const char *func,
+			      const char *msg)
+{
+
+#define SINA(a) (&(((struct sockaddr_in *)(a))->sin_addr.s_addr))
+#define SINP(a) ntohs(((struct sockaddr_in *)(a))->sin_port)
+#define SIN6A(a) (&(((struct sockaddr_in6 *)(a))->sin6_addr))
+#define SIN6P(a) ntohs(((struct sockaddr_in6 *)(a))->sin6_port)
+
+	if (c4iw_debug) {
+		switch (epc->local_addr.ss_family) {
+		case AF_INET:
+			PDBG("%s %s %pI4:%u/%u <-> %pI4:%u/%u\n",
+			     func, msg, SINA(&epc->local_addr),
+			     SINP(&epc->local_addr),
+			     SINP(&epc->mapped_local_addr),
+			     SINA(&epc->remote_addr),
+			     SINP(&epc->remote_addr),
+			     SINP(&epc->mapped_remote_addr));
+			break;
+		case AF_INET6:
+			PDBG("%s %s %pI6:%u/%u <-> %pI6:%u/%u\n",
+			     func, msg, SIN6A(&epc->local_addr),
+			     SIN6P(&epc->local_addr),
+			     SIN6P(&epc->mapped_local_addr),
+			     SIN6A(&epc->remote_addr),
+			     SIN6P(&epc->remote_addr),
+			     SIN6P(&epc->mapped_remote_addr));
+			break;
+		default:
+			break;
+		}
+	}
+#undef SINA
+#undef SINP
+#undef SIN6A
+#undef SIN6P
+}
+
+static inline struct c4iw_ep *to_ep(struct iw_cm_id *cm_id)
+{
+	return cm_id->provider_data;
+}
+
+static inline struct c4iw_listen_ep *to_listen_ep(struct iw_cm_id *cm_id)
+{
+	return cm_id->provider_data;
+}
+
+static inline int compute_wscale(int win)
+{
+	int wscale = 0;
+
+	while (wscale < 14 && (65535<<wscale) < win)
+		wscale++;
+	return wscale;
+}
+
+static inline int ocqp_supported(const struct cxgb4_lld_info *infop)
+{
+#if defined(__i386__) || defined(__x86_64__) || defined(CONFIG_PPC64)
+	return infop->vr->ocq.size > 0;
+#else
+	return 0;
+#endif
+}
+
+u32 c4iw_id_alloc(struct c4iw_id_table *alloc);
+void c4iw_id_free(struct c4iw_id_table *alloc, u32 obj);
+int c4iw_id_table_alloc(struct c4iw_id_table *alloc, u32 start, u32 num,
+			u32 reserved, u32 flags);
+void c4iw_id_table_free(struct c4iw_id_table *alloc);
+
+typedef int (*c4iw_handler_func)(struct c4iw_dev *dev, struct sk_buff *skb);
+
+int c4iw_ep_redirect(void *ctx, struct dst_entry *old, struct dst_entry *new,
+		     struct l2t_entry *l2t);
+void c4iw_put_qpid(struct c4iw_rdev *rdev, u32 qpid,
+		   struct c4iw_dev_ucontext *uctx);
+u32 c4iw_get_resource(struct c4iw_id_table *id_table);
+void c4iw_put_resource(struct c4iw_id_table *id_table, u32 entry);
+int c4iw_init_resource(struct c4iw_rdev *rdev, u32 nr_tpt, u32 nr_pdid);
+int c4iw_init_ctrl_qp(struct c4iw_rdev *rdev);
+int c4iw_pblpool_create(struct c4iw_rdev *rdev);
+int c4iw_rqtpool_create(struct c4iw_rdev *rdev);
+int c4iw_ocqp_pool_create(struct c4iw_rdev *rdev);
+void c4iw_pblpool_destroy(struct c4iw_rdev *rdev);
+void c4iw_rqtpool_destroy(struct c4iw_rdev *rdev);
+void c4iw_ocqp_pool_destroy(struct c4iw_rdev *rdev);
+void c4iw_destroy_resource(struct c4iw_resource *rscp);
+int c4iw_destroy_ctrl_qp(struct c4iw_rdev *rdev);
+int c4iw_register_device(struct c4iw_dev *dev);
+void c4iw_unregister_device(struct c4iw_dev *dev);
+int __init c4iw_cm_init(void);
+void c4iw_cm_term(void);
+void c4iw_release_dev_ucontext(struct c4iw_rdev *rdev,
+			       struct c4iw_dev_ucontext *uctx);
+void c4iw_init_dev_ucontext(struct c4iw_rdev *rdev,
+			    struct c4iw_dev_ucontext *uctx);
+int c4iw_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc);
+int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+		      struct ib_send_wr **bad_wr);
+int c4iw_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+		      struct ib_recv_wr **bad_wr);
+int c4iw_bind_mw(struct ib_qp *qp, struct ib_mw *mw,
+		 struct ib_mw_bind *mw_bind);
+int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param);
+int c4iw_create_listen(struct iw_cm_id *cm_id, int backlog);
+int c4iw_destroy_listen(struct iw_cm_id *cm_id);
+int c4iw_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param);
+int c4iw_reject_cr(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len);
+void c4iw_qp_add_ref(struct ib_qp *qp);
+void c4iw_qp_rem_ref(struct ib_qp *qp);
+void c4iw_free_fastreg_pbl(struct ib_fast_reg_page_list *page_list);
+struct ib_fast_reg_page_list *c4iw_alloc_fastreg_pbl(
+					struct ib_device *device,
+					int page_list_len);
+struct ib_mr *c4iw_alloc_fast_reg_mr(struct ib_pd *pd, int pbl_depth);
+int c4iw_dealloc_mw(struct ib_mw *mw);
+struct ib_mw *c4iw_alloc_mw(struct ib_pd *pd, enum ib_mw_type type);
+struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start,
+					   u64 length, u64 virt, int acc,
+					   struct ib_udata *udata);
+struct ib_mr *c4iw_get_dma_mr(struct ib_pd *pd, int acc);
+struct ib_mr *c4iw_register_phys_mem(struct ib_pd *pd,
+					struct ib_phys_buf *buffer_list,
+					int num_phys_buf,
+					int acc,
+					u64 *iova_start);
+int c4iw_reregister_phys_mem(struct ib_mr *mr,
+				     int mr_rereg_mask,
+				     struct ib_pd *pd,
+				     struct ib_phys_buf *buffer_list,
+				     int num_phys_buf,
+				     int acc, u64 *iova_start);
+int c4iw_dereg_mr(struct ib_mr *ib_mr);
+int c4iw_destroy_cq(struct ib_cq *ib_cq);
+struct ib_cq *c4iw_create_cq(struct ib_device *ibdev, int entries,
+					int vector,
+					struct ib_ucontext *ib_context,
+					struct ib_udata *udata);
+int c4iw_resize_cq(struct ib_cq *cq, int cqe, struct ib_udata *udata);
+int c4iw_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags);
+int c4iw_destroy_qp(struct ib_qp *ib_qp);
+struct ib_qp *c4iw_create_qp(struct ib_pd *pd,
+			     struct ib_qp_init_attr *attrs,
+			     struct ib_udata *udata);
+int c4iw_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+				 int attr_mask, struct ib_udata *udata);
+int c4iw_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+		     int attr_mask, struct ib_qp_init_attr *init_attr);
+struct ib_qp *c4iw_get_qp(struct ib_device *dev, int qpn);
+u32 c4iw_rqtpool_alloc(struct c4iw_rdev *rdev, int size);
+void c4iw_rqtpool_free(struct c4iw_rdev *rdev, u32 addr, int size);
+u32 c4iw_pblpool_alloc(struct c4iw_rdev *rdev, int size);
+void c4iw_pblpool_free(struct c4iw_rdev *rdev, u32 addr, int size);
+u32 c4iw_ocqp_pool_alloc(struct c4iw_rdev *rdev, int size);
+void c4iw_ocqp_pool_free(struct c4iw_rdev *rdev, u32 addr, int size);
+int c4iw_ofld_send(struct c4iw_rdev *rdev, struct sk_buff *skb);
+void c4iw_flush_hw_cq(struct c4iw_cq *chp);
+void c4iw_count_rcqes(struct t4_cq *cq, struct t4_wq *wq, int *count);
+int c4iw_ep_disconnect(struct c4iw_ep *ep, int abrupt, gfp_t gfp);
+int c4iw_flush_rq(struct t4_wq *wq, struct t4_cq *cq, int count);
+int c4iw_flush_sq(struct c4iw_qp *qhp);
+int c4iw_ev_handler(struct c4iw_dev *rnicp, u32 qid);
+u16 c4iw_rqes_posted(struct c4iw_qp *qhp);
+int c4iw_post_terminate(struct c4iw_qp *qhp, struct t4_cqe *err_cqe);
+u32 c4iw_get_cqid(struct c4iw_rdev *rdev, struct c4iw_dev_ucontext *uctx);
+void c4iw_put_cqid(struct c4iw_rdev *rdev, u32 qid,
+		struct c4iw_dev_ucontext *uctx);
+u32 c4iw_get_qpid(struct c4iw_rdev *rdev, struct c4iw_dev_ucontext *uctx);
+void c4iw_put_qpid(struct c4iw_rdev *rdev, u32 qid,
+		struct c4iw_dev_ucontext *uctx);
+void c4iw_ev_dispatch(struct c4iw_dev *dev, struct t4_cqe *err_cqe);
+
+extern struct cxgb4_client t4c_client;
+extern c4iw_handler_func c4iw_handlers[NUM_CPL_CMDS];
+extern void c4iw_log_wr_stats(struct t4_wq *wq, struct t4_cqe *cqe);
+extern int c4iw_wr_log;
+extern int db_fc_threshold;
+extern int db_coalescing_threshold;
+extern int use_dsgl;
+
+
+#endif
diff --git a/drivers/infiniband/hw/cxgb4/mem.c b/drivers/infiniband/hw/cxgb4/mem.c
new file mode 100644
index 0000000..ec7a298
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb4/mem.c
@@ -0,0 +1,955 @@
+/*
+ * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <rdma/ib_umem.h>
+#include <linux/atomic.h>
+
+#include "iw_cxgb4.h"
+
+int use_dsgl = 0;
+module_param(use_dsgl, int, 0644);
+MODULE_PARM_DESC(use_dsgl, "Use DSGL for PBL/FastReg (default=0)");
+
+#define T4_ULPTX_MIN_IO 32
+#define C4IW_MAX_INLINE_SIZE 96
+#define T4_ULPTX_MAX_DMA 1024
+#define C4IW_INLINE_THRESHOLD 128
+
+static int inline_threshold = C4IW_INLINE_THRESHOLD;
+module_param(inline_threshold, int, 0644);
+MODULE_PARM_DESC(inline_threshold, "inline vs dsgl threshold (default=128)");
+
+static int _c4iw_write_mem_dma_aligned(struct c4iw_rdev *rdev, u32 addr,
+				       u32 len, dma_addr_t data, int wait)
+{
+	struct sk_buff *skb;
+	struct ulp_mem_io *req;
+	struct ulptx_sgl *sgl;
+	u8 wr_len;
+	int ret = 0;
+	struct c4iw_wr_wait wr_wait;
+
+	addr &= 0x7FFFFFF;
+
+	if (wait)
+		c4iw_init_wr_wait(&wr_wait);
+	wr_len = roundup(sizeof(*req) + sizeof(*sgl), 16);
+
+	skb = alloc_skb(wr_len, GFP_KERNEL | __GFP_NOFAIL);
+	if (!skb)
+		return -ENOMEM;
+	set_wr_txq(skb, CPL_PRIORITY_CONTROL, 0);
+
+	req = (struct ulp_mem_io *)__skb_put(skb, wr_len);
+	memset(req, 0, wr_len);
+	INIT_ULPTX_WR(req, wr_len, 0, 0);
+	req->wr.wr_hi = cpu_to_be32(FW_WR_OP(FW_ULPTX_WR) |
+			(wait ? FW_WR_COMPL(1) : 0));
+	req->wr.wr_lo = wait ? (__force __be64)(unsigned long) &wr_wait : 0L;
+	req->wr.wr_mid = cpu_to_be32(FW_WR_LEN16(DIV_ROUND_UP(wr_len, 16)));
+	req->cmd = cpu_to_be32(ULPTX_CMD(ULP_TX_MEM_WRITE));
+	req->cmd |= cpu_to_be32(V_T5_ULP_MEMIO_ORDER(1));
+	req->dlen = cpu_to_be32(ULP_MEMIO_DATA_LEN(len>>5));
+	req->len16 = cpu_to_be32(DIV_ROUND_UP(wr_len-sizeof(req->wr), 16));
+	req->lock_addr = cpu_to_be32(ULP_MEMIO_ADDR(addr));
+
+	sgl = (struct ulptx_sgl *)(req + 1);
+	sgl->cmd_nsge = cpu_to_be32(ULPTX_CMD(ULP_TX_SC_DSGL) |
+				    ULPTX_NSGE(1));
+	sgl->len0 = cpu_to_be32(len);
+	sgl->addr0 = cpu_to_be64(data);
+
+	ret = c4iw_ofld_send(rdev, skb);
+	if (ret)
+		return ret;
+	if (wait)
+		ret = c4iw_wait_for_reply(rdev, &wr_wait, 0, 0, __func__);
+	return ret;
+}
+
+static int _c4iw_write_mem_inline(struct c4iw_rdev *rdev, u32 addr, u32 len,
+				  void *data)
+{
+	struct sk_buff *skb;
+	struct ulp_mem_io *req;
+	struct ulptx_idata *sc;
+	u8 wr_len, *to_dp, *from_dp;
+	int copy_len, num_wqe, i, ret = 0;
+	struct c4iw_wr_wait wr_wait;
+	__be32 cmd = cpu_to_be32(ULPTX_CMD(ULP_TX_MEM_WRITE));
+
+	if (is_t4(rdev->lldi.adapter_type))
+		cmd |= cpu_to_be32(ULP_MEMIO_ORDER(1));
+	else
+		cmd |= cpu_to_be32(V_T5_ULP_MEMIO_IMM(1));
+
+	addr &= 0x7FFFFFF;
+	PDBG("%s addr 0x%x len %u\n", __func__, addr, len);
+	num_wqe = DIV_ROUND_UP(len, C4IW_MAX_INLINE_SIZE);
+	c4iw_init_wr_wait(&wr_wait);
+	for (i = 0; i < num_wqe; i++) {
+
+		copy_len = len > C4IW_MAX_INLINE_SIZE ? C4IW_MAX_INLINE_SIZE :
+			   len;
+		wr_len = roundup(sizeof *req + sizeof *sc +
+				 roundup(copy_len, T4_ULPTX_MIN_IO), 16);
+
+		skb = alloc_skb(wr_len, GFP_KERNEL);
+		if (!skb)
+			return -ENOMEM;
+		set_wr_txq(skb, CPL_PRIORITY_CONTROL, 0);
+
+		req = (struct ulp_mem_io *)__skb_put(skb, wr_len);
+		memset(req, 0, wr_len);
+		INIT_ULPTX_WR(req, wr_len, 0, 0);
+
+		if (i == (num_wqe-1)) {
+			req->wr.wr_hi = cpu_to_be32(FW_WR_OP(FW_ULPTX_WR) |
+						    FW_WR_COMPL(1));
+			req->wr.wr_lo = (__force __be64)(unsigned long) &wr_wait;
+		} else
+			req->wr.wr_hi = cpu_to_be32(FW_WR_OP(FW_ULPTX_WR));
+		req->wr.wr_mid = cpu_to_be32(
+				       FW_WR_LEN16(DIV_ROUND_UP(wr_len, 16)));
+
+		req->cmd = cmd;
+		req->dlen = cpu_to_be32(ULP_MEMIO_DATA_LEN(
+				DIV_ROUND_UP(copy_len, T4_ULPTX_MIN_IO)));
+		req->len16 = cpu_to_be32(DIV_ROUND_UP(wr_len-sizeof(req->wr),
+						      16));
+		req->lock_addr = cpu_to_be32(ULP_MEMIO_ADDR(addr + i * 3));
+
+		sc = (struct ulptx_idata *)(req + 1);
+		sc->cmd_more = cpu_to_be32(ULPTX_CMD(ULP_TX_SC_IMM));
+		sc->len = cpu_to_be32(roundup(copy_len, T4_ULPTX_MIN_IO));
+
+		to_dp = (u8 *)(sc + 1);
+		from_dp = (u8 *)data + i * C4IW_MAX_INLINE_SIZE;
+		if (data)
+			memcpy(to_dp, from_dp, copy_len);
+		else
+			memset(to_dp, 0, copy_len);
+		if (copy_len % T4_ULPTX_MIN_IO)
+			memset(to_dp + copy_len, 0, T4_ULPTX_MIN_IO -
+			       (copy_len % T4_ULPTX_MIN_IO));
+		ret = c4iw_ofld_send(rdev, skb);
+		if (ret)
+			return ret;
+		len -= C4IW_MAX_INLINE_SIZE;
+	}
+
+	ret = c4iw_wait_for_reply(rdev, &wr_wait, 0, 0, __func__);
+	return ret;
+}
+
+static int _c4iw_write_mem_dma(struct c4iw_rdev *rdev, u32 addr, u32 len, void *data)
+{
+	u32 remain = len;
+	u32 dmalen;
+	int ret = 0;
+	dma_addr_t daddr;
+	dma_addr_t save;
+
+	daddr = dma_map_single(&rdev->lldi.pdev->dev, data, len, DMA_TO_DEVICE);
+	if (dma_mapping_error(&rdev->lldi.pdev->dev, daddr))
+		return -1;
+	save = daddr;
+
+	while (remain > inline_threshold) {
+		if (remain < T4_ULPTX_MAX_DMA) {
+			if (remain & ~T4_ULPTX_MIN_IO)
+				dmalen = remain & ~(T4_ULPTX_MIN_IO-1);
+			else
+				dmalen = remain;
+		} else
+			dmalen = T4_ULPTX_MAX_DMA;
+		remain -= dmalen;
+		ret = _c4iw_write_mem_dma_aligned(rdev, addr, dmalen, daddr,
+						 !remain);
+		if (ret)
+			goto out;
+		addr += dmalen >> 5;
+		data += dmalen;
+		daddr += dmalen;
+	}
+	if (remain)
+		ret = _c4iw_write_mem_inline(rdev, addr, remain, data);
+out:
+	dma_unmap_single(&rdev->lldi.pdev->dev, save, len, DMA_TO_DEVICE);
+	return ret;
+}
+
+/*
+ * write len bytes of data into addr (32B aligned address)
+ * If data is NULL, clear len byte of memory to zero.
+ */
+static int write_adapter_mem(struct c4iw_rdev *rdev, u32 addr, u32 len,
+			     void *data)
+{
+	if (is_t5(rdev->lldi.adapter_type) && use_dsgl) {
+		if (len > inline_threshold) {
+			if (_c4iw_write_mem_dma(rdev, addr, len, data)) {
+				printk_ratelimited(KERN_WARNING
+						   "%s: dma map"
+						   " failure (non fatal)\n",
+						   pci_name(rdev->lldi.pdev));
+				return _c4iw_write_mem_inline(rdev, addr, len,
+							      data);
+			} else
+				return 0;
+		} else
+			return _c4iw_write_mem_inline(rdev, addr, len, data);
+	} else
+		return _c4iw_write_mem_inline(rdev, addr, len, data);
+}
+
+/*
+ * Build and write a TPT entry.
+ * IN: stag key, pdid, perm, bind_enabled, zbva, to, len, page_size,
+ *     pbl_size and pbl_addr
+ * OUT: stag index
+ */
+static int write_tpt_entry(struct c4iw_rdev *rdev, u32 reset_tpt_entry,
+			   u32 *stag, u8 stag_state, u32 pdid,
+			   enum fw_ri_stag_type type, enum fw_ri_mem_perms perm,
+			   int bind_enabled, u32 zbva, u64 to,
+			   u64 len, u8 page_size, u32 pbl_size, u32 pbl_addr)
+{
+	int err;
+	struct fw_ri_tpte tpt;
+	u32 stag_idx;
+	static atomic_t key;
+
+	if (c4iw_fatal_error(rdev))
+		return -EIO;
+
+	stag_state = stag_state > 0;
+	stag_idx = (*stag) >> 8;
+
+	if ((!reset_tpt_entry) && (*stag == T4_STAG_UNSET)) {
+		stag_idx = c4iw_get_resource(&rdev->resource.tpt_table);
+		if (!stag_idx) {
+			mutex_lock(&rdev->stats.lock);
+			rdev->stats.stag.fail++;
+			mutex_unlock(&rdev->stats.lock);
+			return -ENOMEM;
+		}
+		mutex_lock(&rdev->stats.lock);
+		rdev->stats.stag.cur += 32;
+		if (rdev->stats.stag.cur > rdev->stats.stag.max)
+			rdev->stats.stag.max = rdev->stats.stag.cur;
+		mutex_unlock(&rdev->stats.lock);
+		*stag = (stag_idx << 8) | (atomic_inc_return(&key) & 0xff);
+	}
+	PDBG("%s stag_state 0x%0x type 0x%0x pdid 0x%0x, stag_idx 0x%x\n",
+	     __func__, stag_state, type, pdid, stag_idx);
+
+	/* write TPT entry */
+	if (reset_tpt_entry)
+		memset(&tpt, 0, sizeof(tpt));
+	else {
+		tpt.valid_to_pdid = cpu_to_be32(F_FW_RI_TPTE_VALID |
+			V_FW_RI_TPTE_STAGKEY((*stag & M_FW_RI_TPTE_STAGKEY)) |
+			V_FW_RI_TPTE_STAGSTATE(stag_state) |
+			V_FW_RI_TPTE_STAGTYPE(type) | V_FW_RI_TPTE_PDID(pdid));
+		tpt.locread_to_qpid = cpu_to_be32(V_FW_RI_TPTE_PERM(perm) |
+			(bind_enabled ? F_FW_RI_TPTE_MWBINDEN : 0) |
+			V_FW_RI_TPTE_ADDRTYPE((zbva ? FW_RI_ZERO_BASED_TO :
+						      FW_RI_VA_BASED_TO))|
+			V_FW_RI_TPTE_PS(page_size));
+		tpt.nosnoop_pbladdr = !pbl_size ? 0 : cpu_to_be32(
+			V_FW_RI_TPTE_PBLADDR(PBL_OFF(rdev, pbl_addr)>>3));
+		tpt.len_lo = cpu_to_be32((u32)(len & 0xffffffffUL));
+		tpt.va_hi = cpu_to_be32((u32)(to >> 32));
+		tpt.va_lo_fbo = cpu_to_be32((u32)(to & 0xffffffffUL));
+		tpt.dca_mwbcnt_pstag = cpu_to_be32(0);
+		tpt.len_hi = cpu_to_be32((u32)(len >> 32));
+	}
+	err = write_adapter_mem(rdev, stag_idx +
+				(rdev->lldi.vr->stag.start >> 5),
+				sizeof(tpt), &tpt);
+
+	if (reset_tpt_entry) {
+		c4iw_put_resource(&rdev->resource.tpt_table, stag_idx);
+		mutex_lock(&rdev->stats.lock);
+		rdev->stats.stag.cur -= 32;
+		mutex_unlock(&rdev->stats.lock);
+	}
+	return err;
+}
+
+static int write_pbl(struct c4iw_rdev *rdev, __be64 *pbl,
+		     u32 pbl_addr, u32 pbl_size)
+{
+	int err;
+
+	PDBG("%s *pdb_addr 0x%x, pbl_base 0x%x, pbl_size %d\n",
+	     __func__, pbl_addr, rdev->lldi.vr->pbl.start,
+	     pbl_size);
+
+	err = write_adapter_mem(rdev, pbl_addr >> 5, pbl_size << 3, pbl);
+	return err;
+}
+
+static int dereg_mem(struct c4iw_rdev *rdev, u32 stag, u32 pbl_size,
+		     u32 pbl_addr)
+{
+	return write_tpt_entry(rdev, 1, &stag, 0, 0, 0, 0, 0, 0, 0UL, 0, 0,
+			       pbl_size, pbl_addr);
+}
+
+static int allocate_window(struct c4iw_rdev *rdev, u32 * stag, u32 pdid)
+{
+	*stag = T4_STAG_UNSET;
+	return write_tpt_entry(rdev, 0, stag, 0, pdid, FW_RI_STAG_MW, 0, 0, 0,
+			       0UL, 0, 0, 0, 0);
+}
+
+static int deallocate_window(struct c4iw_rdev *rdev, u32 stag)
+{
+	return write_tpt_entry(rdev, 1, &stag, 0, 0, 0, 0, 0, 0, 0UL, 0, 0, 0,
+			       0);
+}
+
+static int allocate_stag(struct c4iw_rdev *rdev, u32 *stag, u32 pdid,
+			 u32 pbl_size, u32 pbl_addr)
+{
+	*stag = T4_STAG_UNSET;
+	return write_tpt_entry(rdev, 0, stag, 0, pdid, FW_RI_STAG_NSMR, 0, 0, 0,
+			       0UL, 0, 0, pbl_size, pbl_addr);
+}
+
+static int finish_mem_reg(struct c4iw_mr *mhp, u32 stag)
+{
+	u32 mmid;
+
+	mhp->attr.state = 1;
+	mhp->attr.stag = stag;
+	mmid = stag >> 8;
+	mhp->ibmr.rkey = mhp->ibmr.lkey = stag;
+	PDBG("%s mmid 0x%x mhp %p\n", __func__, mmid, mhp);
+	return insert_handle(mhp->rhp, &mhp->rhp->mmidr, mhp, mmid);
+}
+
+static int register_mem(struct c4iw_dev *rhp, struct c4iw_pd *php,
+		      struct c4iw_mr *mhp, int shift)
+{
+	u32 stag = T4_STAG_UNSET;
+	int ret;
+
+	ret = write_tpt_entry(&rhp->rdev, 0, &stag, 1, mhp->attr.pdid,
+			      FW_RI_STAG_NSMR, mhp->attr.perms,
+			      mhp->attr.mw_bind_enable, mhp->attr.zbva,
+			      mhp->attr.va_fbo, mhp->attr.len, shift - 12,
+			      mhp->attr.pbl_size, mhp->attr.pbl_addr);
+	if (ret)
+		return ret;
+
+	ret = finish_mem_reg(mhp, stag);
+	if (ret)
+		dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size,
+		       mhp->attr.pbl_addr);
+	return ret;
+}
+
+static int reregister_mem(struct c4iw_dev *rhp, struct c4iw_pd *php,
+			  struct c4iw_mr *mhp, int shift, int npages)
+{
+	u32 stag;
+	int ret;
+
+	if (npages > mhp->attr.pbl_size)
+		return -ENOMEM;
+
+	stag = mhp->attr.stag;
+	ret = write_tpt_entry(&rhp->rdev, 0, &stag, 1, mhp->attr.pdid,
+			      FW_RI_STAG_NSMR, mhp->attr.perms,
+			      mhp->attr.mw_bind_enable, mhp->attr.zbva,
+			      mhp->attr.va_fbo, mhp->attr.len, shift - 12,
+			      mhp->attr.pbl_size, mhp->attr.pbl_addr);
+	if (ret)
+		return ret;
+
+	ret = finish_mem_reg(mhp, stag);
+	if (ret)
+		dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size,
+		       mhp->attr.pbl_addr);
+
+	return ret;
+}
+
+static int alloc_pbl(struct c4iw_mr *mhp, int npages)
+{
+	mhp->attr.pbl_addr = c4iw_pblpool_alloc(&mhp->rhp->rdev,
+						    npages << 3);
+
+	if (!mhp->attr.pbl_addr)
+		return -ENOMEM;
+
+	mhp->attr.pbl_size = npages;
+
+	return 0;
+}
+
+static int build_phys_page_list(struct ib_phys_buf *buffer_list,
+				int num_phys_buf, u64 *iova_start,
+				u64 *total_size, int *npages,
+				int *shift, __be64 **page_list)
+{
+	u64 mask;
+	int i, j, n;
+
+	mask = 0;
+	*total_size = 0;
+	for (i = 0; i < num_phys_buf; ++i) {
+		if (i != 0 && buffer_list[i].addr & ~PAGE_MASK)
+			return -EINVAL;
+		if (i != 0 && i != num_phys_buf - 1 &&
+		    (buffer_list[i].size & ~PAGE_MASK))
+			return -EINVAL;
+		*total_size += buffer_list[i].size;
+		if (i > 0)
+			mask |= buffer_list[i].addr;
+		else
+			mask |= buffer_list[i].addr & PAGE_MASK;
+		if (i != num_phys_buf - 1)
+			mask |= buffer_list[i].addr + buffer_list[i].size;
+		else
+			mask |= (buffer_list[i].addr + buffer_list[i].size +
+				PAGE_SIZE - 1) & PAGE_MASK;
+	}
+
+	if (*total_size > 0xFFFFFFFFULL)
+		return -ENOMEM;
+
+	/* Find largest page shift we can use to cover buffers */
+	for (*shift = PAGE_SHIFT; *shift < 27; ++(*shift))
+		if ((1ULL << *shift) & mask)
+			break;
+
+	buffer_list[0].size += buffer_list[0].addr & ((1ULL << *shift) - 1);
+	buffer_list[0].addr &= ~0ull << *shift;
+
+	*npages = 0;
+	for (i = 0; i < num_phys_buf; ++i)
+		*npages += (buffer_list[i].size +
+			(1ULL << *shift) - 1) >> *shift;
+
+	if (!*npages)
+		return -EINVAL;
+
+	*page_list = kmalloc(sizeof(u64) * *npages, GFP_KERNEL);
+	if (!*page_list)
+		return -ENOMEM;
+
+	n = 0;
+	for (i = 0; i < num_phys_buf; ++i)
+		for (j = 0;
+		     j < (buffer_list[i].size + (1ULL << *shift) - 1) >> *shift;
+		     ++j)
+			(*page_list)[n++] = cpu_to_be64(buffer_list[i].addr +
+			    ((u64) j << *shift));
+
+	PDBG("%s va 0x%llx mask 0x%llx shift %d len %lld pbl_size %d\n",
+	     __func__, (unsigned long long)*iova_start,
+	     (unsigned long long)mask, *shift, (unsigned long long)*total_size,
+	     *npages);
+
+	return 0;
+
+}
+
+int c4iw_reregister_phys_mem(struct ib_mr *mr, int mr_rereg_mask,
+			     struct ib_pd *pd, struct ib_phys_buf *buffer_list,
+			     int num_phys_buf, int acc, u64 *iova_start)
+{
+
+	struct c4iw_mr mh, *mhp;
+	struct c4iw_pd *php;
+	struct c4iw_dev *rhp;
+	__be64 *page_list = NULL;
+	int shift = 0;
+	u64 total_size;
+	int npages;
+	int ret;
+
+	PDBG("%s ib_mr %p ib_pd %p\n", __func__, mr, pd);
+
+	/* There can be no memory windows */
+	if (atomic_read(&mr->usecnt))
+		return -EINVAL;
+
+	mhp = to_c4iw_mr(mr);
+	rhp = mhp->rhp;
+	php = to_c4iw_pd(mr->pd);
+
+	/* make sure we are on the same adapter */
+	if (rhp != php->rhp)
+		return -EINVAL;
+
+	memcpy(&mh, mhp, sizeof *mhp);
+
+	if (mr_rereg_mask & IB_MR_REREG_PD)
+		php = to_c4iw_pd(pd);
+	if (mr_rereg_mask & IB_MR_REREG_ACCESS) {
+		mh.attr.perms = c4iw_ib_to_tpt_access(acc);
+		mh.attr.mw_bind_enable = (acc & IB_ACCESS_MW_BIND) ==
+					 IB_ACCESS_MW_BIND;
+	}
+	if (mr_rereg_mask & IB_MR_REREG_TRANS) {
+		ret = build_phys_page_list(buffer_list, num_phys_buf,
+						iova_start,
+						&total_size, &npages,
+						&shift, &page_list);
+		if (ret)
+			return ret;
+	}
+
+	ret = reregister_mem(rhp, php, &mh, shift, npages);
+	kfree(page_list);
+	if (ret)
+		return ret;
+	if (mr_rereg_mask & IB_MR_REREG_PD)
+		mhp->attr.pdid = php->pdid;
+	if (mr_rereg_mask & IB_MR_REREG_ACCESS)
+		mhp->attr.perms = c4iw_ib_to_tpt_access(acc);
+	if (mr_rereg_mask & IB_MR_REREG_TRANS) {
+		mhp->attr.zbva = 0;
+		mhp->attr.va_fbo = *iova_start;
+		mhp->attr.page_size = shift - 12;
+		mhp->attr.len = (u32) total_size;
+		mhp->attr.pbl_size = npages;
+	}
+
+	return 0;
+}
+
+struct ib_mr *c4iw_register_phys_mem(struct ib_pd *pd,
+				     struct ib_phys_buf *buffer_list,
+				     int num_phys_buf, int acc, u64 *iova_start)
+{
+	__be64 *page_list;
+	int shift;
+	u64 total_size;
+	int npages;
+	struct c4iw_dev *rhp;
+	struct c4iw_pd *php;
+	struct c4iw_mr *mhp;
+	int ret;
+
+	PDBG("%s ib_pd %p\n", __func__, pd);
+	php = to_c4iw_pd(pd);
+	rhp = php->rhp;
+
+	mhp = kzalloc(sizeof(*mhp), GFP_KERNEL);
+	if (!mhp)
+		return ERR_PTR(-ENOMEM);
+
+	mhp->rhp = rhp;
+
+	/* First check that we have enough alignment */
+	if ((*iova_start & ~PAGE_MASK) != (buffer_list[0].addr & ~PAGE_MASK)) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	if (num_phys_buf > 1 &&
+	    ((buffer_list[0].addr + buffer_list[0].size) & ~PAGE_MASK)) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	ret = build_phys_page_list(buffer_list, num_phys_buf, iova_start,
+					&total_size, &npages, &shift,
+					&page_list);
+	if (ret)
+		goto err;
+
+	ret = alloc_pbl(mhp, npages);
+	if (ret) {
+		kfree(page_list);
+		goto err;
+	}
+
+	ret = write_pbl(&mhp->rhp->rdev, page_list, mhp->attr.pbl_addr,
+			     npages);
+	kfree(page_list);
+	if (ret)
+		goto err_pbl;
+
+	mhp->attr.pdid = php->pdid;
+	mhp->attr.zbva = 0;
+
+	mhp->attr.perms = c4iw_ib_to_tpt_access(acc);
+	mhp->attr.va_fbo = *iova_start;
+	mhp->attr.page_size = shift - 12;
+
+	mhp->attr.len = (u32) total_size;
+	mhp->attr.pbl_size = npages;
+	ret = register_mem(rhp, php, mhp, shift);
+	if (ret)
+		goto err_pbl;
+
+	return &mhp->ibmr;
+
+err_pbl:
+	c4iw_pblpool_free(&mhp->rhp->rdev, mhp->attr.pbl_addr,
+			      mhp->attr.pbl_size << 3);
+
+err:
+	kfree(mhp);
+	return ERR_PTR(ret);
+
+}
+
+struct ib_mr *c4iw_get_dma_mr(struct ib_pd *pd, int acc)
+{
+	struct c4iw_dev *rhp;
+	struct c4iw_pd *php;
+	struct c4iw_mr *mhp;
+	int ret;
+	u32 stag = T4_STAG_UNSET;
+
+	PDBG("%s ib_pd %p\n", __func__, pd);
+	php = to_c4iw_pd(pd);
+	rhp = php->rhp;
+
+	mhp = kzalloc(sizeof(*mhp), GFP_KERNEL);
+	if (!mhp)
+		return ERR_PTR(-ENOMEM);
+
+	mhp->rhp = rhp;
+	mhp->attr.pdid = php->pdid;
+	mhp->attr.perms = c4iw_ib_to_tpt_access(acc);
+	mhp->attr.mw_bind_enable = (acc&IB_ACCESS_MW_BIND) == IB_ACCESS_MW_BIND;
+	mhp->attr.zbva = 0;
+	mhp->attr.va_fbo = 0;
+	mhp->attr.page_size = 0;
+	mhp->attr.len = ~0UL;
+	mhp->attr.pbl_size = 0;
+
+	ret = write_tpt_entry(&rhp->rdev, 0, &stag, 1, php->pdid,
+			      FW_RI_STAG_NSMR, mhp->attr.perms,
+			      mhp->attr.mw_bind_enable, 0, 0, ~0UL, 0, 0, 0);
+	if (ret)
+		goto err1;
+
+	ret = finish_mem_reg(mhp, stag);
+	if (ret)
+		goto err2;
+	return &mhp->ibmr;
+err2:
+	dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size,
+		  mhp->attr.pbl_addr);
+err1:
+	kfree(mhp);
+	return ERR_PTR(ret);
+}
+
+struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+			       u64 virt, int acc, struct ib_udata *udata)
+{
+	__be64 *pages;
+	int shift, n, len;
+	int i, k, entry;
+	int err = 0;
+	struct scatterlist *sg;
+	struct c4iw_dev *rhp;
+	struct c4iw_pd *php;
+	struct c4iw_mr *mhp;
+
+	PDBG("%s ib_pd %p\n", __func__, pd);
+
+	if (length == ~0ULL)
+		return ERR_PTR(-EINVAL);
+
+	if ((length + start) < start)
+		return ERR_PTR(-EINVAL);
+
+	php = to_c4iw_pd(pd);
+	rhp = php->rhp;
+	mhp = kzalloc(sizeof(*mhp), GFP_KERNEL);
+	if (!mhp)
+		return ERR_PTR(-ENOMEM);
+
+	mhp->rhp = rhp;
+
+	mhp->umem = ib_umem_get(pd->uobject->context, start, length, acc, 0);
+	if (IS_ERR(mhp->umem)) {
+		err = PTR_ERR(mhp->umem);
+		kfree(mhp);
+		return ERR_PTR(err);
+	}
+
+	shift = ffs(mhp->umem->page_size) - 1;
+
+	n = mhp->umem->nmap;
+	err = alloc_pbl(mhp, n);
+	if (err)
+		goto err;
+
+	pages = (__be64 *) __get_free_page(GFP_KERNEL);
+	if (!pages) {
+		err = -ENOMEM;
+		goto err_pbl;
+	}
+
+	i = n = 0;
+
+	for_each_sg(mhp->umem->sg_head.sgl, sg, mhp->umem->nmap, entry) {
+		len = sg_dma_len(sg) >> shift;
+		for (k = 0; k < len; ++k) {
+			pages[i++] = cpu_to_be64(sg_dma_address(sg) +
+				mhp->umem->page_size * k);
+			if (i == PAGE_SIZE / sizeof *pages) {
+				err = write_pbl(&mhp->rhp->rdev,
+				      pages,
+				      mhp->attr.pbl_addr + (n << 3), i);
+				if (err)
+					goto pbl_done;
+				n += i;
+				i = 0;
+			}
+		}
+	}
+
+	if (i)
+		err = write_pbl(&mhp->rhp->rdev, pages,
+				     mhp->attr.pbl_addr + (n << 3), i);
+
+pbl_done:
+	free_page((unsigned long) pages);
+	if (err)
+		goto err_pbl;
+
+	mhp->attr.pdid = php->pdid;
+	mhp->attr.zbva = 0;
+	mhp->attr.perms = c4iw_ib_to_tpt_access(acc);
+	mhp->attr.va_fbo = virt;
+	mhp->attr.page_size = shift - 12;
+	mhp->attr.len = length;
+
+	err = register_mem(rhp, php, mhp, shift);
+	if (err)
+		goto err_pbl;
+
+	return &mhp->ibmr;
+
+err_pbl:
+	c4iw_pblpool_free(&mhp->rhp->rdev, mhp->attr.pbl_addr,
+			      mhp->attr.pbl_size << 3);
+
+err:
+	ib_umem_release(mhp->umem);
+	kfree(mhp);
+	return ERR_PTR(err);
+}
+
+struct ib_mw *c4iw_alloc_mw(struct ib_pd *pd, enum ib_mw_type type)
+{
+	struct c4iw_dev *rhp;
+	struct c4iw_pd *php;
+	struct c4iw_mw *mhp;
+	u32 mmid;
+	u32 stag = 0;
+	int ret;
+
+	if (type != IB_MW_TYPE_1)
+		return ERR_PTR(-EINVAL);
+
+	php = to_c4iw_pd(pd);
+	rhp = php->rhp;
+	mhp = kzalloc(sizeof(*mhp), GFP_KERNEL);
+	if (!mhp)
+		return ERR_PTR(-ENOMEM);
+	ret = allocate_window(&rhp->rdev, &stag, php->pdid);
+	if (ret) {
+		kfree(mhp);
+		return ERR_PTR(ret);
+	}
+	mhp->rhp = rhp;
+	mhp->attr.pdid = php->pdid;
+	mhp->attr.type = FW_RI_STAG_MW;
+	mhp->attr.stag = stag;
+	mmid = (stag) >> 8;
+	mhp->ibmw.rkey = stag;
+	if (insert_handle(rhp, &rhp->mmidr, mhp, mmid)) {
+		deallocate_window(&rhp->rdev, mhp->attr.stag);
+		kfree(mhp);
+		return ERR_PTR(-ENOMEM);
+	}
+	PDBG("%s mmid 0x%x mhp %p stag 0x%x\n", __func__, mmid, mhp, stag);
+	return &(mhp->ibmw);
+}
+
+int c4iw_dealloc_mw(struct ib_mw *mw)
+{
+	struct c4iw_dev *rhp;
+	struct c4iw_mw *mhp;
+	u32 mmid;
+
+	mhp = to_c4iw_mw(mw);
+	rhp = mhp->rhp;
+	mmid = (mw->rkey) >> 8;
+	remove_handle(rhp, &rhp->mmidr, mmid);
+	deallocate_window(&rhp->rdev, mhp->attr.stag);
+	kfree(mhp);
+	PDBG("%s ib_mw %p mmid 0x%x ptr %p\n", __func__, mw, mmid, mhp);
+	return 0;
+}
+
+struct ib_mr *c4iw_alloc_fast_reg_mr(struct ib_pd *pd, int pbl_depth)
+{
+	struct c4iw_dev *rhp;
+	struct c4iw_pd *php;
+	struct c4iw_mr *mhp;
+	u32 mmid;
+	u32 stag = 0;
+	int ret = 0;
+
+	php = to_c4iw_pd(pd);
+	rhp = php->rhp;
+	mhp = kzalloc(sizeof(*mhp), GFP_KERNEL);
+	if (!mhp) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	mhp->rhp = rhp;
+	ret = alloc_pbl(mhp, pbl_depth);
+	if (ret)
+		goto err1;
+	mhp->attr.pbl_size = pbl_depth;
+	ret = allocate_stag(&rhp->rdev, &stag, php->pdid,
+				 mhp->attr.pbl_size, mhp->attr.pbl_addr);
+	if (ret)
+		goto err2;
+	mhp->attr.pdid = php->pdid;
+	mhp->attr.type = FW_RI_STAG_NSMR;
+	mhp->attr.stag = stag;
+	mhp->attr.state = 1;
+	mmid = (stag) >> 8;
+	mhp->ibmr.rkey = mhp->ibmr.lkey = stag;
+	if (insert_handle(rhp, &rhp->mmidr, mhp, mmid)) {
+		ret = -ENOMEM;
+		goto err3;
+	}
+
+	PDBG("%s mmid 0x%x mhp %p stag 0x%x\n", __func__, mmid, mhp, stag);
+	return &(mhp->ibmr);
+err3:
+	dereg_mem(&rhp->rdev, stag, mhp->attr.pbl_size,
+		       mhp->attr.pbl_addr);
+err2:
+	c4iw_pblpool_free(&mhp->rhp->rdev, mhp->attr.pbl_addr,
+			      mhp->attr.pbl_size << 3);
+err1:
+	kfree(mhp);
+err:
+	return ERR_PTR(ret);
+}
+
+struct ib_fast_reg_page_list *c4iw_alloc_fastreg_pbl(struct ib_device *device,
+						     int page_list_len)
+{
+	struct c4iw_fr_page_list *c4pl;
+	struct c4iw_dev *dev = to_c4iw_dev(device);
+	dma_addr_t dma_addr;
+	int pll_len = roundup(page_list_len * sizeof(u64), 32);
+
+	c4pl = kmalloc(sizeof(*c4pl), GFP_KERNEL);
+	if (!c4pl)
+		return ERR_PTR(-ENOMEM);
+
+	c4pl->ibpl.page_list = dma_alloc_coherent(&dev->rdev.lldi.pdev->dev,
+						  pll_len, &dma_addr,
+						  GFP_KERNEL);
+	if (!c4pl->ibpl.page_list) {
+		kfree(c4pl);
+		return ERR_PTR(-ENOMEM);
+	}
+	dma_unmap_addr_set(c4pl, mapping, dma_addr);
+	c4pl->dma_addr = dma_addr;
+	c4pl->dev = dev;
+	c4pl->pll_len = pll_len;
+
+	PDBG("%s c4pl %p pll_len %u page_list %p dma_addr %pad\n",
+	     __func__, c4pl, c4pl->pll_len, c4pl->ibpl.page_list,
+	     &c4pl->dma_addr);
+
+	return &c4pl->ibpl;
+}
+
+void c4iw_free_fastreg_pbl(struct ib_fast_reg_page_list *ibpl)
+{
+	struct c4iw_fr_page_list *c4pl = to_c4iw_fr_page_list(ibpl);
+
+	PDBG("%s c4pl %p pll_len %u page_list %p dma_addr %pad\n",
+	     __func__, c4pl, c4pl->pll_len, c4pl->ibpl.page_list,
+	     &c4pl->dma_addr);
+
+	dma_free_coherent(&c4pl->dev->rdev.lldi.pdev->dev,
+			  c4pl->pll_len,
+			  c4pl->ibpl.page_list, dma_unmap_addr(c4pl, mapping));
+	kfree(c4pl);
+}
+
+int c4iw_dereg_mr(struct ib_mr *ib_mr)
+{
+	struct c4iw_dev *rhp;
+	struct c4iw_mr *mhp;
+	u32 mmid;
+
+	PDBG("%s ib_mr %p\n", __func__, ib_mr);
+	/* There can be no memory windows */
+	if (atomic_read(&ib_mr->usecnt))
+		return -EINVAL;
+
+	mhp = to_c4iw_mr(ib_mr);
+	rhp = mhp->rhp;
+	mmid = mhp->attr.stag >> 8;
+	remove_handle(rhp, &rhp->mmidr, mmid);
+	dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size,
+		       mhp->attr.pbl_addr);
+	if (mhp->attr.pbl_size)
+		c4iw_pblpool_free(&mhp->rhp->rdev, mhp->attr.pbl_addr,
+				  mhp->attr.pbl_size << 3);
+	if (mhp->kva)
+		kfree((void *) (unsigned long) mhp->kva);
+	if (mhp->umem)
+		ib_umem_release(mhp->umem);
+	PDBG("%s mmid 0x%x ptr %p\n", __func__, mmid, mhp);
+	kfree(mhp);
+	return 0;
+}
diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c
new file mode 100644
index 0000000..72e3b69
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb4/provider.c
@@ -0,0 +1,588 @@
+/*
+ * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/device.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/delay.h>
+#include <linux/errno.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/ethtool.h>
+#include <linux/rtnetlink.h>
+#include <linux/inetdevice.h>
+#include <linux/io.h>
+
+#include <asm/irq.h>
+#include <asm/byteorder.h>
+
+#include <rdma/iw_cm.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_smi.h>
+#include <rdma/ib_umem.h>
+#include <rdma/ib_user_verbs.h>
+
+#include "iw_cxgb4.h"
+
+static int fastreg_support = 1;
+module_param(fastreg_support, int, 0644);
+MODULE_PARM_DESC(fastreg_support, "Advertise fastreg support (default=1)");
+
+static struct ib_ah *c4iw_ah_create(struct ib_pd *pd,
+				    struct ib_ah_attr *ah_attr)
+{
+	return ERR_PTR(-ENOSYS);
+}
+
+static int c4iw_ah_destroy(struct ib_ah *ah)
+{
+	return -ENOSYS;
+}
+
+static int c4iw_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+	return -ENOSYS;
+}
+
+static int c4iw_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+	return -ENOSYS;
+}
+
+static int c4iw_process_mad(struct ib_device *ibdev, int mad_flags,
+			    u8 port_num, struct ib_wc *in_wc,
+			    struct ib_grh *in_grh, struct ib_mad *in_mad,
+			    struct ib_mad *out_mad)
+{
+	return -ENOSYS;
+}
+
+static int c4iw_dealloc_ucontext(struct ib_ucontext *context)
+{
+	struct c4iw_dev *rhp = to_c4iw_dev(context->device);
+	struct c4iw_ucontext *ucontext = to_c4iw_ucontext(context);
+	struct c4iw_mm_entry *mm, *tmp;
+
+	PDBG("%s context %p\n", __func__, context);
+	list_for_each_entry_safe(mm, tmp, &ucontext->mmaps, entry)
+		kfree(mm);
+	c4iw_release_dev_ucontext(&rhp->rdev, &ucontext->uctx);
+	kfree(ucontext);
+	return 0;
+}
+
+static struct ib_ucontext *c4iw_alloc_ucontext(struct ib_device *ibdev,
+					       struct ib_udata *udata)
+{
+	struct c4iw_ucontext *context;
+	struct c4iw_dev *rhp = to_c4iw_dev(ibdev);
+	static int warned;
+	struct c4iw_alloc_ucontext_resp uresp;
+	int ret = 0;
+	struct c4iw_mm_entry *mm = NULL;
+
+	PDBG("%s ibdev %p\n", __func__, ibdev);
+	context = kzalloc(sizeof(*context), GFP_KERNEL);
+	if (!context) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	c4iw_init_dev_ucontext(&rhp->rdev, &context->uctx);
+	INIT_LIST_HEAD(&context->mmaps);
+	spin_lock_init(&context->mmap_lock);
+
+	if (udata->outlen < sizeof(uresp) - sizeof(uresp.reserved)) {
+		if (!warned++)
+			pr_err(MOD "Warning - downlevel libcxgb4 (non-fatal), device status page disabled.");
+		rhp->rdev.flags |= T4_STATUS_PAGE_DISABLED;
+	} else {
+		mm = kmalloc(sizeof(*mm), GFP_KERNEL);
+		if (!mm) {
+			ret = -ENOMEM;
+			goto err_free;
+		}
+
+		uresp.status_page_size = PAGE_SIZE;
+
+		spin_lock(&context->mmap_lock);
+		uresp.status_page_key = context->key;
+		context->key += PAGE_SIZE;
+		spin_unlock(&context->mmap_lock);
+
+		ret = ib_copy_to_udata(udata, &uresp,
+				       sizeof(uresp) - sizeof(uresp.reserved));
+		if (ret)
+			goto err_mm;
+
+		mm->key = uresp.status_page_key;
+		mm->addr = virt_to_phys(rhp->rdev.status_page);
+		mm->len = PAGE_SIZE;
+		insert_mmap(context, mm);
+	}
+	return &context->ibucontext;
+err_mm:
+	kfree(mm);
+err_free:
+	kfree(context);
+err:
+	return ERR_PTR(ret);
+}
+
+static int c4iw_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
+{
+	int len = vma->vm_end - vma->vm_start;
+	u32 key = vma->vm_pgoff << PAGE_SHIFT;
+	struct c4iw_rdev *rdev;
+	int ret = 0;
+	struct c4iw_mm_entry *mm;
+	struct c4iw_ucontext *ucontext;
+	u64 addr;
+
+	PDBG("%s pgoff 0x%lx key 0x%x len %d\n", __func__, vma->vm_pgoff,
+	     key, len);
+
+	if (vma->vm_start & (PAGE_SIZE-1))
+		return -EINVAL;
+
+	rdev = &(to_c4iw_dev(context->device)->rdev);
+	ucontext = to_c4iw_ucontext(context);
+
+	mm = remove_mmap(ucontext, key, len);
+	if (!mm)
+		return -EINVAL;
+	addr = mm->addr;
+	kfree(mm);
+
+	if ((addr >= pci_resource_start(rdev->lldi.pdev, 0)) &&
+	    (addr < (pci_resource_start(rdev->lldi.pdev, 0) +
+		    pci_resource_len(rdev->lldi.pdev, 0)))) {
+
+		/*
+		 * MA_SYNC register...
+		 */
+		vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+		ret = io_remap_pfn_range(vma, vma->vm_start,
+					 addr >> PAGE_SHIFT,
+					 len, vma->vm_page_prot);
+	} else if ((addr >= pci_resource_start(rdev->lldi.pdev, 2)) &&
+		   (addr < (pci_resource_start(rdev->lldi.pdev, 2) +
+		    pci_resource_len(rdev->lldi.pdev, 2)))) {
+
+		/*
+		 * Map user DB or OCQP memory...
+		 */
+		if (addr >= rdev->oc_mw_pa)
+			vma->vm_page_prot = t4_pgprot_wc(vma->vm_page_prot);
+		else {
+			if (is_t5(rdev->lldi.adapter_type))
+				vma->vm_page_prot =
+					t4_pgprot_wc(vma->vm_page_prot);
+			else
+				vma->vm_page_prot =
+					pgprot_noncached(vma->vm_page_prot);
+		}
+		ret = io_remap_pfn_range(vma, vma->vm_start,
+					 addr >> PAGE_SHIFT,
+					 len, vma->vm_page_prot);
+	} else {
+
+		/*
+		 * Map WQ or CQ contig dma memory...
+		 */
+		ret = remap_pfn_range(vma, vma->vm_start,
+				      addr >> PAGE_SHIFT,
+				      len, vma->vm_page_prot);
+	}
+
+	return ret;
+}
+
+static int c4iw_deallocate_pd(struct ib_pd *pd)
+{
+	struct c4iw_dev *rhp;
+	struct c4iw_pd *php;
+
+	php = to_c4iw_pd(pd);
+	rhp = php->rhp;
+	PDBG("%s ibpd %p pdid 0x%x\n", __func__, pd, php->pdid);
+	c4iw_put_resource(&rhp->rdev.resource.pdid_table, php->pdid);
+	mutex_lock(&rhp->rdev.stats.lock);
+	rhp->rdev.stats.pd.cur--;
+	mutex_unlock(&rhp->rdev.stats.lock);
+	kfree(php);
+	return 0;
+}
+
+static struct ib_pd *c4iw_allocate_pd(struct ib_device *ibdev,
+				      struct ib_ucontext *context,
+				      struct ib_udata *udata)
+{
+	struct c4iw_pd *php;
+	u32 pdid;
+	struct c4iw_dev *rhp;
+
+	PDBG("%s ibdev %p\n", __func__, ibdev);
+	rhp = (struct c4iw_dev *) ibdev;
+	pdid =  c4iw_get_resource(&rhp->rdev.resource.pdid_table);
+	if (!pdid)
+		return ERR_PTR(-EINVAL);
+	php = kzalloc(sizeof(*php), GFP_KERNEL);
+	if (!php) {
+		c4iw_put_resource(&rhp->rdev.resource.pdid_table, pdid);
+		return ERR_PTR(-ENOMEM);
+	}
+	php->pdid = pdid;
+	php->rhp = rhp;
+	if (context) {
+		if (ib_copy_to_udata(udata, &php->pdid, sizeof(u32))) {
+			c4iw_deallocate_pd(&php->ibpd);
+			return ERR_PTR(-EFAULT);
+		}
+	}
+	mutex_lock(&rhp->rdev.stats.lock);
+	rhp->rdev.stats.pd.cur++;
+	if (rhp->rdev.stats.pd.cur > rhp->rdev.stats.pd.max)
+		rhp->rdev.stats.pd.max = rhp->rdev.stats.pd.cur;
+	mutex_unlock(&rhp->rdev.stats.lock);
+	PDBG("%s pdid 0x%0x ptr 0x%p\n", __func__, pdid, php);
+	return &php->ibpd;
+}
+
+static int c4iw_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
+			   u16 *pkey)
+{
+	PDBG("%s ibdev %p\n", __func__, ibdev);
+	*pkey = 0;
+	return 0;
+}
+
+static int c4iw_query_gid(struct ib_device *ibdev, u8 port, int index,
+			  union ib_gid *gid)
+{
+	struct c4iw_dev *dev;
+
+	PDBG("%s ibdev %p, port %d, index %d, gid %p\n",
+	       __func__, ibdev, port, index, gid);
+	dev = to_c4iw_dev(ibdev);
+	BUG_ON(port == 0);
+	memset(&(gid->raw[0]), 0, sizeof(gid->raw));
+	memcpy(&(gid->raw[0]), dev->rdev.lldi.ports[port-1]->dev_addr, 6);
+	return 0;
+}
+
+static int c4iw_query_device(struct ib_device *ibdev,
+			     struct ib_device_attr *props)
+{
+
+	struct c4iw_dev *dev;
+	PDBG("%s ibdev %p\n", __func__, ibdev);
+
+	dev = to_c4iw_dev(ibdev);
+	memset(props, 0, sizeof *props);
+	memcpy(&props->sys_image_guid, dev->rdev.lldi.ports[0]->dev_addr, 6);
+	props->hw_ver = CHELSIO_CHIP_RELEASE(dev->rdev.lldi.adapter_type);
+	props->fw_ver = dev->rdev.lldi.fw_vers;
+	props->device_cap_flags = dev->device_cap_flags;
+	props->page_size_cap = T4_PAGESIZE_MASK;
+	props->vendor_id = (u32)dev->rdev.lldi.pdev->vendor;
+	props->vendor_part_id = (u32)dev->rdev.lldi.pdev->device;
+	props->max_mr_size = T4_MAX_MR_SIZE;
+	props->max_qp = dev->rdev.lldi.vr->qp.size / 2;
+	props->max_qp_wr = dev->rdev.hw_queue.t4_max_qp_depth;
+	props->max_sge = T4_MAX_RECV_SGE;
+	props->max_sge_rd = 1;
+	props->max_res_rd_atom = dev->rdev.lldi.max_ird_adapter;
+	props->max_qp_rd_atom = min(dev->rdev.lldi.max_ordird_qp,
+				    c4iw_max_read_depth);
+	props->max_qp_init_rd_atom = props->max_qp_rd_atom;
+	props->max_cq = dev->rdev.lldi.vr->qp.size;
+	props->max_cqe = dev->rdev.hw_queue.t4_max_cq_depth;
+	props->max_mr = c4iw_num_stags(&dev->rdev);
+	props->max_pd = T4_MAX_NUM_PD;
+	props->local_ca_ack_delay = 0;
+	props->max_fast_reg_page_list_len = t4_max_fr_depth(use_dsgl);
+
+	return 0;
+}
+
+static int c4iw_query_port(struct ib_device *ibdev, u8 port,
+			   struct ib_port_attr *props)
+{
+	struct c4iw_dev *dev;
+	struct net_device *netdev;
+	struct in_device *inetdev;
+
+	PDBG("%s ibdev %p\n", __func__, ibdev);
+
+	dev = to_c4iw_dev(ibdev);
+	netdev = dev->rdev.lldi.ports[port-1];
+
+	memset(props, 0, sizeof(struct ib_port_attr));
+	props->max_mtu = IB_MTU_4096;
+	if (netdev->mtu >= 4096)
+		props->active_mtu = IB_MTU_4096;
+	else if (netdev->mtu >= 2048)
+		props->active_mtu = IB_MTU_2048;
+	else if (netdev->mtu >= 1024)
+		props->active_mtu = IB_MTU_1024;
+	else if (netdev->mtu >= 512)
+		props->active_mtu = IB_MTU_512;
+	else
+		props->active_mtu = IB_MTU_256;
+
+	if (!netif_carrier_ok(netdev))
+		props->state = IB_PORT_DOWN;
+	else {
+		inetdev = in_dev_get(netdev);
+		if (inetdev) {
+			if (inetdev->ifa_list)
+				props->state = IB_PORT_ACTIVE;
+			else
+				props->state = IB_PORT_INIT;
+			in_dev_put(inetdev);
+		} else
+			props->state = IB_PORT_INIT;
+	}
+
+	props->port_cap_flags =
+	    IB_PORT_CM_SUP |
+	    IB_PORT_SNMP_TUNNEL_SUP |
+	    IB_PORT_REINIT_SUP |
+	    IB_PORT_DEVICE_MGMT_SUP |
+	    IB_PORT_VENDOR_CLASS_SUP | IB_PORT_BOOT_MGMT_SUP;
+	props->gid_tbl_len = 1;
+	props->pkey_tbl_len = 1;
+	props->active_width = 2;
+	props->active_speed = IB_SPEED_DDR;
+	props->max_msg_sz = -1;
+
+	return 0;
+}
+
+static ssize_t show_rev(struct device *dev, struct device_attribute *attr,
+			char *buf)
+{
+	struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev,
+						 ibdev.dev);
+	PDBG("%s dev 0x%p\n", __func__, dev);
+	return sprintf(buf, "%d\n",
+		       CHELSIO_CHIP_RELEASE(c4iw_dev->rdev.lldi.adapter_type));
+}
+
+static ssize_t show_fw_ver(struct device *dev, struct device_attribute *attr,
+			   char *buf)
+{
+	struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev,
+						 ibdev.dev);
+	PDBG("%s dev 0x%p\n", __func__, dev);
+
+	return sprintf(buf, "%u.%u.%u.%u\n",
+			FW_HDR_FW_VER_MAJOR_GET(c4iw_dev->rdev.lldi.fw_vers),
+			FW_HDR_FW_VER_MINOR_GET(c4iw_dev->rdev.lldi.fw_vers),
+			FW_HDR_FW_VER_MICRO_GET(c4iw_dev->rdev.lldi.fw_vers),
+			FW_HDR_FW_VER_BUILD_GET(c4iw_dev->rdev.lldi.fw_vers));
+}
+
+static ssize_t show_hca(struct device *dev, struct device_attribute *attr,
+			char *buf)
+{
+	struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev,
+						 ibdev.dev);
+	struct ethtool_drvinfo info;
+	struct net_device *lldev = c4iw_dev->rdev.lldi.ports[0];
+
+	PDBG("%s dev 0x%p\n", __func__, dev);
+	lldev->ethtool_ops->get_drvinfo(lldev, &info);
+	return sprintf(buf, "%s\n", info.driver);
+}
+
+static ssize_t show_board(struct device *dev, struct device_attribute *attr,
+			  char *buf)
+{
+	struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev,
+						 ibdev.dev);
+	PDBG("%s dev 0x%p\n", __func__, dev);
+	return sprintf(buf, "%x.%x\n", c4iw_dev->rdev.lldi.pdev->vendor,
+		       c4iw_dev->rdev.lldi.pdev->device);
+}
+
+static int c4iw_get_mib(struct ib_device *ibdev,
+			union rdma_protocol_stats *stats)
+{
+	struct tp_tcp_stats v4, v6;
+	struct c4iw_dev *c4iw_dev = to_c4iw_dev(ibdev);
+
+	cxgb4_get_tcp_stats(c4iw_dev->rdev.lldi.pdev, &v4, &v6);
+	memset(stats, 0, sizeof *stats);
+	stats->iw.tcpInSegs = v4.tcpInSegs + v6.tcpInSegs;
+	stats->iw.tcpOutSegs = v4.tcpOutSegs + v6.tcpOutSegs;
+	stats->iw.tcpRetransSegs = v4.tcpRetransSegs + v6.tcpRetransSegs;
+	stats->iw.tcpOutRsts = v4.tcpOutRsts + v6.tcpOutSegs;
+
+	return 0;
+}
+
+static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
+static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL);
+static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL);
+static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL);
+
+static struct device_attribute *c4iw_class_attributes[] = {
+	&dev_attr_hw_rev,
+	&dev_attr_fw_ver,
+	&dev_attr_hca_type,
+	&dev_attr_board_id,
+};
+
+int c4iw_register_device(struct c4iw_dev *dev)
+{
+	int ret;
+	int i;
+
+	PDBG("%s c4iw_dev %p\n", __func__, dev);
+	BUG_ON(!dev->rdev.lldi.ports[0]);
+	strlcpy(dev->ibdev.name, "cxgb4_%d", IB_DEVICE_NAME_MAX);
+	memset(&dev->ibdev.node_guid, 0, sizeof(dev->ibdev.node_guid));
+	memcpy(&dev->ibdev.node_guid, dev->rdev.lldi.ports[0]->dev_addr, 6);
+	dev->ibdev.owner = THIS_MODULE;
+	dev->device_cap_flags = IB_DEVICE_LOCAL_DMA_LKEY | IB_DEVICE_MEM_WINDOW;
+	if (fastreg_support)
+		dev->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS;
+	dev->ibdev.local_dma_lkey = 0;
+	dev->ibdev.uverbs_cmd_mask =
+	    (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) |
+	    (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) |
+	    (1ull << IB_USER_VERBS_CMD_QUERY_PORT) |
+	    (1ull << IB_USER_VERBS_CMD_ALLOC_PD) |
+	    (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) |
+	    (1ull << IB_USER_VERBS_CMD_REG_MR) |
+	    (1ull << IB_USER_VERBS_CMD_DEREG_MR) |
+	    (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
+	    (1ull << IB_USER_VERBS_CMD_CREATE_CQ) |
+	    (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) |
+	    (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) |
+	    (1ull << IB_USER_VERBS_CMD_CREATE_QP) |
+	    (1ull << IB_USER_VERBS_CMD_MODIFY_QP) |
+	    (1ull << IB_USER_VERBS_CMD_QUERY_QP) |
+	    (1ull << IB_USER_VERBS_CMD_POLL_CQ) |
+	    (1ull << IB_USER_VERBS_CMD_DESTROY_QP) |
+	    (1ull << IB_USER_VERBS_CMD_POST_SEND) |
+	    (1ull << IB_USER_VERBS_CMD_POST_RECV);
+	dev->ibdev.node_type = RDMA_NODE_RNIC;
+	memcpy(dev->ibdev.node_desc, C4IW_NODE_DESC, sizeof(C4IW_NODE_DESC));
+	dev->ibdev.phys_port_cnt = dev->rdev.lldi.nports;
+	dev->ibdev.num_comp_vectors =  dev->rdev.lldi.nciq;
+	dev->ibdev.dma_device = &(dev->rdev.lldi.pdev->dev);
+	dev->ibdev.query_device = c4iw_query_device;
+	dev->ibdev.query_port = c4iw_query_port;
+	dev->ibdev.query_pkey = c4iw_query_pkey;
+	dev->ibdev.query_gid = c4iw_query_gid;
+	dev->ibdev.alloc_ucontext = c4iw_alloc_ucontext;
+	dev->ibdev.dealloc_ucontext = c4iw_dealloc_ucontext;
+	dev->ibdev.mmap = c4iw_mmap;
+	dev->ibdev.alloc_pd = c4iw_allocate_pd;
+	dev->ibdev.dealloc_pd = c4iw_deallocate_pd;
+	dev->ibdev.create_ah = c4iw_ah_create;
+	dev->ibdev.destroy_ah = c4iw_ah_destroy;
+	dev->ibdev.create_qp = c4iw_create_qp;
+	dev->ibdev.modify_qp = c4iw_ib_modify_qp;
+	dev->ibdev.query_qp = c4iw_ib_query_qp;
+	dev->ibdev.destroy_qp = c4iw_destroy_qp;
+	dev->ibdev.create_cq = c4iw_create_cq;
+	dev->ibdev.destroy_cq = c4iw_destroy_cq;
+	dev->ibdev.resize_cq = c4iw_resize_cq;
+	dev->ibdev.poll_cq = c4iw_poll_cq;
+	dev->ibdev.get_dma_mr = c4iw_get_dma_mr;
+	dev->ibdev.reg_phys_mr = c4iw_register_phys_mem;
+	dev->ibdev.rereg_phys_mr = c4iw_reregister_phys_mem;
+	dev->ibdev.reg_user_mr = c4iw_reg_user_mr;
+	dev->ibdev.dereg_mr = c4iw_dereg_mr;
+	dev->ibdev.alloc_mw = c4iw_alloc_mw;
+	dev->ibdev.bind_mw = c4iw_bind_mw;
+	dev->ibdev.dealloc_mw = c4iw_dealloc_mw;
+	dev->ibdev.alloc_fast_reg_mr = c4iw_alloc_fast_reg_mr;
+	dev->ibdev.alloc_fast_reg_page_list = c4iw_alloc_fastreg_pbl;
+	dev->ibdev.free_fast_reg_page_list = c4iw_free_fastreg_pbl;
+	dev->ibdev.attach_mcast = c4iw_multicast_attach;
+	dev->ibdev.detach_mcast = c4iw_multicast_detach;
+	dev->ibdev.process_mad = c4iw_process_mad;
+	dev->ibdev.req_notify_cq = c4iw_arm_cq;
+	dev->ibdev.post_send = c4iw_post_send;
+	dev->ibdev.post_recv = c4iw_post_receive;
+	dev->ibdev.get_protocol_stats = c4iw_get_mib;
+	dev->ibdev.uverbs_abi_ver = C4IW_UVERBS_ABI_VERSION;
+
+	dev->ibdev.iwcm = kmalloc(sizeof(struct iw_cm_verbs), GFP_KERNEL);
+	if (!dev->ibdev.iwcm)
+		return -ENOMEM;
+
+	dev->ibdev.iwcm->connect = c4iw_connect;
+	dev->ibdev.iwcm->accept = c4iw_accept_cr;
+	dev->ibdev.iwcm->reject = c4iw_reject_cr;
+	dev->ibdev.iwcm->create_listen = c4iw_create_listen;
+	dev->ibdev.iwcm->destroy_listen = c4iw_destroy_listen;
+	dev->ibdev.iwcm->add_ref = c4iw_qp_add_ref;
+	dev->ibdev.iwcm->rem_ref = c4iw_qp_rem_ref;
+	dev->ibdev.iwcm->get_qp = c4iw_get_qp;
+
+	ret = ib_register_device(&dev->ibdev, NULL);
+	if (ret)
+		goto bail1;
+
+	for (i = 0; i < ARRAY_SIZE(c4iw_class_attributes); ++i) {
+		ret = device_create_file(&dev->ibdev.dev,
+					 c4iw_class_attributes[i]);
+		if (ret)
+			goto bail2;
+	}
+	return 0;
+bail2:
+	ib_unregister_device(&dev->ibdev);
+bail1:
+	kfree(dev->ibdev.iwcm);
+	return ret;
+}
+
+void c4iw_unregister_device(struct c4iw_dev *dev)
+{
+	int i;
+
+	PDBG("%s c4iw_dev %p\n", __func__, dev);
+	for (i = 0; i < ARRAY_SIZE(c4iw_class_attributes); ++i)
+		device_remove_file(&dev->ibdev.dev,
+				   c4iw_class_attributes[i]);
+	ib_unregister_device(&dev->ibdev);
+	kfree(dev->ibdev.iwcm);
+	return;
+}
diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c
new file mode 100644
index 0000000..41cd688
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb4/qp.c
@@ -0,0 +1,1886 @@
+/*
+ * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+
+#include "iw_cxgb4.h"
+
+static int db_delay_usecs = 1;
+module_param(db_delay_usecs, int, 0644);
+MODULE_PARM_DESC(db_delay_usecs, "Usecs to delay awaiting db fifo to drain");
+
+static int ocqp_support = 1;
+module_param(ocqp_support, int, 0644);
+MODULE_PARM_DESC(ocqp_support, "Support on-chip SQs (default=1)");
+
+int db_fc_threshold = 1000;
+module_param(db_fc_threshold, int, 0644);
+MODULE_PARM_DESC(db_fc_threshold,
+		 "QP count/threshold that triggers"
+		 " automatic db flow control mode (default = 1000)");
+
+int db_coalescing_threshold;
+module_param(db_coalescing_threshold, int, 0644);
+MODULE_PARM_DESC(db_coalescing_threshold,
+		 "QP count/threshold that triggers"
+		 " disabling db coalescing (default = 0)");
+
+static int max_fr_immd = T4_MAX_FR_IMMD;
+module_param(max_fr_immd, int, 0644);
+MODULE_PARM_DESC(max_fr_immd, "fastreg threshold for using DSGL instead of immedate");
+
+static int alloc_ird(struct c4iw_dev *dev, u32 ird)
+{
+	int ret = 0;
+
+	spin_lock_irq(&dev->lock);
+	if (ird <= dev->avail_ird)
+		dev->avail_ird -= ird;
+	else
+		ret = -ENOMEM;
+	spin_unlock_irq(&dev->lock);
+
+	if (ret)
+		dev_warn(&dev->rdev.lldi.pdev->dev,
+			 "device IRD resources exhausted\n");
+
+	return ret;
+}
+
+static void free_ird(struct c4iw_dev *dev, int ird)
+{
+	spin_lock_irq(&dev->lock);
+	dev->avail_ird += ird;
+	spin_unlock_irq(&dev->lock);
+}
+
+static void set_state(struct c4iw_qp *qhp, enum c4iw_qp_state state)
+{
+	unsigned long flag;
+	spin_lock_irqsave(&qhp->lock, flag);
+	qhp->attr.state = state;
+	spin_unlock_irqrestore(&qhp->lock, flag);
+}
+
+static void dealloc_oc_sq(struct c4iw_rdev *rdev, struct t4_sq *sq)
+{
+	c4iw_ocqp_pool_free(rdev, sq->dma_addr, sq->memsize);
+}
+
+static void dealloc_host_sq(struct c4iw_rdev *rdev, struct t4_sq *sq)
+{
+	dma_free_coherent(&(rdev->lldi.pdev->dev), sq->memsize, sq->queue,
+			  pci_unmap_addr(sq, mapping));
+}
+
+static void dealloc_sq(struct c4iw_rdev *rdev, struct t4_sq *sq)
+{
+	if (t4_sq_onchip(sq))
+		dealloc_oc_sq(rdev, sq);
+	else
+		dealloc_host_sq(rdev, sq);
+}
+
+static int alloc_oc_sq(struct c4iw_rdev *rdev, struct t4_sq *sq)
+{
+	if (!ocqp_support || !ocqp_supported(&rdev->lldi))
+		return -ENOSYS;
+	sq->dma_addr = c4iw_ocqp_pool_alloc(rdev, sq->memsize);
+	if (!sq->dma_addr)
+		return -ENOMEM;
+	sq->phys_addr = rdev->oc_mw_pa + sq->dma_addr -
+			rdev->lldi.vr->ocq.start;
+	sq->queue = (__force union t4_wr *)(rdev->oc_mw_kva + sq->dma_addr -
+					    rdev->lldi.vr->ocq.start);
+	sq->flags |= T4_SQ_ONCHIP;
+	return 0;
+}
+
+static int alloc_host_sq(struct c4iw_rdev *rdev, struct t4_sq *sq)
+{
+	sq->queue = dma_alloc_coherent(&(rdev->lldi.pdev->dev), sq->memsize,
+				       &(sq->dma_addr), GFP_KERNEL);
+	if (!sq->queue)
+		return -ENOMEM;
+	sq->phys_addr = virt_to_phys(sq->queue);
+	pci_unmap_addr_set(sq, mapping, sq->dma_addr);
+	return 0;
+}
+
+static int alloc_sq(struct c4iw_rdev *rdev, struct t4_sq *sq, int user)
+{
+	int ret = -ENOSYS;
+	if (user)
+		ret = alloc_oc_sq(rdev, sq);
+	if (ret)
+		ret = alloc_host_sq(rdev, sq);
+	return ret;
+}
+
+static int destroy_qp(struct c4iw_rdev *rdev, struct t4_wq *wq,
+		      struct c4iw_dev_ucontext *uctx)
+{
+	/*
+	 * uP clears EQ contexts when the connection exits rdma mode,
+	 * so no need to post a RESET WR for these EQs.
+	 */
+	dma_free_coherent(&(rdev->lldi.pdev->dev),
+			  wq->rq.memsize, wq->rq.queue,
+			  dma_unmap_addr(&wq->rq, mapping));
+	dealloc_sq(rdev, &wq->sq);
+	c4iw_rqtpool_free(rdev, wq->rq.rqt_hwaddr, wq->rq.rqt_size);
+	kfree(wq->rq.sw_rq);
+	kfree(wq->sq.sw_sq);
+	c4iw_put_qpid(rdev, wq->rq.qid, uctx);
+	c4iw_put_qpid(rdev, wq->sq.qid, uctx);
+	return 0;
+}
+
+static int create_qp(struct c4iw_rdev *rdev, struct t4_wq *wq,
+		     struct t4_cq *rcq, struct t4_cq *scq,
+		     struct c4iw_dev_ucontext *uctx)
+{
+	int user = (uctx != &rdev->uctx);
+	struct fw_ri_res_wr *res_wr;
+	struct fw_ri_res *res;
+	int wr_len;
+	struct c4iw_wr_wait wr_wait;
+	struct sk_buff *skb;
+	int ret = 0;
+	int eqsize;
+
+	wq->sq.qid = c4iw_get_qpid(rdev, uctx);
+	if (!wq->sq.qid)
+		return -ENOMEM;
+
+	wq->rq.qid = c4iw_get_qpid(rdev, uctx);
+	if (!wq->rq.qid) {
+		ret = -ENOMEM;
+		goto free_sq_qid;
+	}
+
+	if (!user) {
+		wq->sq.sw_sq = kzalloc(wq->sq.size * sizeof *wq->sq.sw_sq,
+				 GFP_KERNEL);
+		if (!wq->sq.sw_sq) {
+			ret = -ENOMEM;
+			goto free_rq_qid;
+		}
+
+		wq->rq.sw_rq = kzalloc(wq->rq.size * sizeof *wq->rq.sw_rq,
+				 GFP_KERNEL);
+		if (!wq->rq.sw_rq) {
+			ret = -ENOMEM;
+			goto free_sw_sq;
+		}
+	}
+
+	/*
+	 * RQT must be a power of 2 and at least 16 deep.
+	 */
+	wq->rq.rqt_size = roundup_pow_of_two(max_t(u16, wq->rq.size, 16));
+	wq->rq.rqt_hwaddr = c4iw_rqtpool_alloc(rdev, wq->rq.rqt_size);
+	if (!wq->rq.rqt_hwaddr) {
+		ret = -ENOMEM;
+		goto free_sw_rq;
+	}
+
+	ret = alloc_sq(rdev, &wq->sq, user);
+	if (ret)
+		goto free_hwaddr;
+	memset(wq->sq.queue, 0, wq->sq.memsize);
+	dma_unmap_addr_set(&wq->sq, mapping, wq->sq.dma_addr);
+
+	wq->rq.queue = dma_alloc_coherent(&(rdev->lldi.pdev->dev),
+					  wq->rq.memsize, &(wq->rq.dma_addr),
+					  GFP_KERNEL);
+	if (!wq->rq.queue) {
+		ret = -ENOMEM;
+		goto free_sq;
+	}
+	PDBG("%s sq base va 0x%p pa 0x%llx rq base va 0x%p pa 0x%llx\n",
+		__func__, wq->sq.queue,
+		(unsigned long long)virt_to_phys(wq->sq.queue),
+		wq->rq.queue,
+		(unsigned long long)virt_to_phys(wq->rq.queue));
+	memset(wq->rq.queue, 0, wq->rq.memsize);
+	dma_unmap_addr_set(&wq->rq, mapping, wq->rq.dma_addr);
+
+	wq->db = rdev->lldi.db_reg;
+	wq->gts = rdev->lldi.gts_reg;
+	if (user || is_t5(rdev->lldi.adapter_type)) {
+		u32 off;
+
+		off = (wq->sq.qid << rdev->qpshift) & PAGE_MASK;
+		if (user) {
+			wq->sq.udb = (u64 __iomem *)(rdev->bar2_pa + off);
+		} else {
+			off += 128 * (wq->sq.qid & rdev->qpmask) + 8;
+			wq->sq.udb = (u64 __iomem *)(rdev->bar2_kva + off);
+		}
+		off = (wq->rq.qid << rdev->qpshift) & PAGE_MASK;
+		if (user) {
+			wq->rq.udb = (u64 __iomem *)(rdev->bar2_pa + off);
+		} else {
+			off += 128 * (wq->rq.qid & rdev->qpmask) + 8;
+			wq->rq.udb = (u64 __iomem *)(rdev->bar2_kva + off);
+		}
+	}
+	wq->rdev = rdev;
+	wq->rq.msn = 1;
+
+	/* build fw_ri_res_wr */
+	wr_len = sizeof *res_wr + 2 * sizeof *res;
+
+	skb = alloc_skb(wr_len, GFP_KERNEL);
+	if (!skb) {
+		ret = -ENOMEM;
+		goto free_dma;
+	}
+	set_wr_txq(skb, CPL_PRIORITY_CONTROL, 0);
+
+	res_wr = (struct fw_ri_res_wr *)__skb_put(skb, wr_len);
+	memset(res_wr, 0, wr_len);
+	res_wr->op_nres = cpu_to_be32(
+			FW_WR_OP(FW_RI_RES_WR) |
+			V_FW_RI_RES_WR_NRES(2) |
+			FW_WR_COMPL(1));
+	res_wr->len16_pkd = cpu_to_be32(DIV_ROUND_UP(wr_len, 16));
+	res_wr->cookie = (unsigned long) &wr_wait;
+	res = res_wr->res;
+	res->u.sqrq.restype = FW_RI_RES_TYPE_SQ;
+	res->u.sqrq.op = FW_RI_RES_OP_WRITE;
+
+	/*
+	 * eqsize is the number of 64B entries plus the status page size.
+	 */
+	eqsize = wq->sq.size * T4_SQ_NUM_SLOTS +
+		rdev->hw_queue.t4_eq_status_entries;
+
+	res->u.sqrq.fetchszm_to_iqid = cpu_to_be32(
+		V_FW_RI_RES_WR_HOSTFCMODE(0) |	/* no host cidx updates */
+		V_FW_RI_RES_WR_CPRIO(0) |	/* don't keep in chip cache */
+		V_FW_RI_RES_WR_PCIECHN(0) |	/* set by uP at ri_init time */
+		(t4_sq_onchip(&wq->sq) ? F_FW_RI_RES_WR_ONCHIP : 0) |
+		V_FW_RI_RES_WR_IQID(scq->cqid));
+	res->u.sqrq.dcaen_to_eqsize = cpu_to_be32(
+		V_FW_RI_RES_WR_DCAEN(0) |
+		V_FW_RI_RES_WR_DCACPU(0) |
+		V_FW_RI_RES_WR_FBMIN(2) |
+		V_FW_RI_RES_WR_FBMAX(2) |
+		V_FW_RI_RES_WR_CIDXFTHRESHO(0) |
+		V_FW_RI_RES_WR_CIDXFTHRESH(0) |
+		V_FW_RI_RES_WR_EQSIZE(eqsize));
+	res->u.sqrq.eqid = cpu_to_be32(wq->sq.qid);
+	res->u.sqrq.eqaddr = cpu_to_be64(wq->sq.dma_addr);
+	res++;
+	res->u.sqrq.restype = FW_RI_RES_TYPE_RQ;
+	res->u.sqrq.op = FW_RI_RES_OP_WRITE;
+
+	/*
+	 * eqsize is the number of 64B entries plus the status page size.
+	 */
+	eqsize = wq->rq.size * T4_RQ_NUM_SLOTS +
+		rdev->hw_queue.t4_eq_status_entries;
+	res->u.sqrq.fetchszm_to_iqid = cpu_to_be32(
+		V_FW_RI_RES_WR_HOSTFCMODE(0) |	/* no host cidx updates */
+		V_FW_RI_RES_WR_CPRIO(0) |	/* don't keep in chip cache */
+		V_FW_RI_RES_WR_PCIECHN(0) |	/* set by uP at ri_init time */
+		V_FW_RI_RES_WR_IQID(rcq->cqid));
+	res->u.sqrq.dcaen_to_eqsize = cpu_to_be32(
+		V_FW_RI_RES_WR_DCAEN(0) |
+		V_FW_RI_RES_WR_DCACPU(0) |
+		V_FW_RI_RES_WR_FBMIN(2) |
+		V_FW_RI_RES_WR_FBMAX(2) |
+		V_FW_RI_RES_WR_CIDXFTHRESHO(0) |
+		V_FW_RI_RES_WR_CIDXFTHRESH(0) |
+		V_FW_RI_RES_WR_EQSIZE(eqsize));
+	res->u.sqrq.eqid = cpu_to_be32(wq->rq.qid);
+	res->u.sqrq.eqaddr = cpu_to_be64(wq->rq.dma_addr);
+
+	c4iw_init_wr_wait(&wr_wait);
+
+	ret = c4iw_ofld_send(rdev, skb);
+	if (ret)
+		goto free_dma;
+	ret = c4iw_wait_for_reply(rdev, &wr_wait, 0, wq->sq.qid, __func__);
+	if (ret)
+		goto free_dma;
+
+	PDBG("%s sqid 0x%x rqid 0x%x kdb 0x%p squdb 0x%lx rqudb 0x%lx\n",
+	     __func__, wq->sq.qid, wq->rq.qid, wq->db,
+	     (__force unsigned long) wq->sq.udb,
+	     (__force unsigned long) wq->rq.udb);
+
+	return 0;
+free_dma:
+	dma_free_coherent(&(rdev->lldi.pdev->dev),
+			  wq->rq.memsize, wq->rq.queue,
+			  dma_unmap_addr(&wq->rq, mapping));
+free_sq:
+	dealloc_sq(rdev, &wq->sq);
+free_hwaddr:
+	c4iw_rqtpool_free(rdev, wq->rq.rqt_hwaddr, wq->rq.rqt_size);
+free_sw_rq:
+	kfree(wq->rq.sw_rq);
+free_sw_sq:
+	kfree(wq->sq.sw_sq);
+free_rq_qid:
+	c4iw_put_qpid(rdev, wq->rq.qid, uctx);
+free_sq_qid:
+	c4iw_put_qpid(rdev, wq->sq.qid, uctx);
+	return ret;
+}
+
+static int build_immd(struct t4_sq *sq, struct fw_ri_immd *immdp,
+		      struct ib_send_wr *wr, int max, u32 *plenp)
+{
+	u8 *dstp, *srcp;
+	u32 plen = 0;
+	int i;
+	int rem, len;
+
+	dstp = (u8 *)immdp->data;
+	for (i = 0; i < wr->num_sge; i++) {
+		if ((plen + wr->sg_list[i].length) > max)
+			return -EMSGSIZE;
+		srcp = (u8 *)(unsigned long)wr->sg_list[i].addr;
+		plen += wr->sg_list[i].length;
+		rem = wr->sg_list[i].length;
+		while (rem) {
+			if (dstp == (u8 *)&sq->queue[sq->size])
+				dstp = (u8 *)sq->queue;
+			if (rem <= (u8 *)&sq->queue[sq->size] - dstp)
+				len = rem;
+			else
+				len = (u8 *)&sq->queue[sq->size] - dstp;
+			memcpy(dstp, srcp, len);
+			dstp += len;
+			srcp += len;
+			rem -= len;
+		}
+	}
+	len = roundup(plen + sizeof *immdp, 16) - (plen + sizeof *immdp);
+	if (len)
+		memset(dstp, 0, len);
+	immdp->op = FW_RI_DATA_IMMD;
+	immdp->r1 = 0;
+	immdp->r2 = 0;
+	immdp->immdlen = cpu_to_be32(plen);
+	*plenp = plen;
+	return 0;
+}
+
+static int build_isgl(__be64 *queue_start, __be64 *queue_end,
+		      struct fw_ri_isgl *isglp, struct ib_sge *sg_list,
+		      int num_sge, u32 *plenp)
+
+{
+	int i;
+	u32 plen = 0;
+	__be64 *flitp = (__be64 *)isglp->sge;
+
+	for (i = 0; i < num_sge; i++) {
+		if ((plen + sg_list[i].length) < plen)
+			return -EMSGSIZE;
+		plen += sg_list[i].length;
+		*flitp = cpu_to_be64(((u64)sg_list[i].lkey << 32) |
+				     sg_list[i].length);
+		if (++flitp == queue_end)
+			flitp = queue_start;
+		*flitp = cpu_to_be64(sg_list[i].addr);
+		if (++flitp == queue_end)
+			flitp = queue_start;
+	}
+	*flitp = (__force __be64)0;
+	isglp->op = FW_RI_DATA_ISGL;
+	isglp->r1 = 0;
+	isglp->nsge = cpu_to_be16(num_sge);
+	isglp->r2 = 0;
+	if (plenp)
+		*plenp = plen;
+	return 0;
+}
+
+static int build_rdma_send(struct t4_sq *sq, union t4_wr *wqe,
+			   struct ib_send_wr *wr, u8 *len16)
+{
+	u32 plen;
+	int size;
+	int ret;
+
+	if (wr->num_sge > T4_MAX_SEND_SGE)
+		return -EINVAL;
+	switch (wr->opcode) {
+	case IB_WR_SEND:
+		if (wr->send_flags & IB_SEND_SOLICITED)
+			wqe->send.sendop_pkd = cpu_to_be32(
+				V_FW_RI_SEND_WR_SENDOP(FW_RI_SEND_WITH_SE));
+		else
+			wqe->send.sendop_pkd = cpu_to_be32(
+				V_FW_RI_SEND_WR_SENDOP(FW_RI_SEND));
+		wqe->send.stag_inv = 0;
+		break;
+	case IB_WR_SEND_WITH_INV:
+		if (wr->send_flags & IB_SEND_SOLICITED)
+			wqe->send.sendop_pkd = cpu_to_be32(
+				V_FW_RI_SEND_WR_SENDOP(FW_RI_SEND_WITH_SE_INV));
+		else
+			wqe->send.sendop_pkd = cpu_to_be32(
+				V_FW_RI_SEND_WR_SENDOP(FW_RI_SEND_WITH_INV));
+		wqe->send.stag_inv = cpu_to_be32(wr->ex.invalidate_rkey);
+		break;
+
+	default:
+		return -EINVAL;
+	}
+	wqe->send.r3 = 0;
+	wqe->send.r4 = 0;
+
+	plen = 0;
+	if (wr->num_sge) {
+		if (wr->send_flags & IB_SEND_INLINE) {
+			ret = build_immd(sq, wqe->send.u.immd_src, wr,
+					 T4_MAX_SEND_INLINE, &plen);
+			if (ret)
+				return ret;
+			size = sizeof wqe->send + sizeof(struct fw_ri_immd) +
+			       plen;
+		} else {
+			ret = build_isgl((__be64 *)sq->queue,
+					 (__be64 *)&sq->queue[sq->size],
+					 wqe->send.u.isgl_src,
+					 wr->sg_list, wr->num_sge, &plen);
+			if (ret)
+				return ret;
+			size = sizeof wqe->send + sizeof(struct fw_ri_isgl) +
+			       wr->num_sge * sizeof(struct fw_ri_sge);
+		}
+	} else {
+		wqe->send.u.immd_src[0].op = FW_RI_DATA_IMMD;
+		wqe->send.u.immd_src[0].r1 = 0;
+		wqe->send.u.immd_src[0].r2 = 0;
+		wqe->send.u.immd_src[0].immdlen = 0;
+		size = sizeof wqe->send + sizeof(struct fw_ri_immd);
+		plen = 0;
+	}
+	*len16 = DIV_ROUND_UP(size, 16);
+	wqe->send.plen = cpu_to_be32(plen);
+	return 0;
+}
+
+static int build_rdma_write(struct t4_sq *sq, union t4_wr *wqe,
+			    struct ib_send_wr *wr, u8 *len16)
+{
+	u32 plen;
+	int size;
+	int ret;
+
+	if (wr->num_sge > T4_MAX_SEND_SGE)
+		return -EINVAL;
+	wqe->write.r2 = 0;
+	wqe->write.stag_sink = cpu_to_be32(wr->wr.rdma.rkey);
+	wqe->write.to_sink = cpu_to_be64(wr->wr.rdma.remote_addr);
+	if (wr->num_sge) {
+		if (wr->send_flags & IB_SEND_INLINE) {
+			ret = build_immd(sq, wqe->write.u.immd_src, wr,
+					 T4_MAX_WRITE_INLINE, &plen);
+			if (ret)
+				return ret;
+			size = sizeof wqe->write + sizeof(struct fw_ri_immd) +
+			       plen;
+		} else {
+			ret = build_isgl((__be64 *)sq->queue,
+					 (__be64 *)&sq->queue[sq->size],
+					 wqe->write.u.isgl_src,
+					 wr->sg_list, wr->num_sge, &plen);
+			if (ret)
+				return ret;
+			size = sizeof wqe->write + sizeof(struct fw_ri_isgl) +
+			       wr->num_sge * sizeof(struct fw_ri_sge);
+		}
+	} else {
+		wqe->write.u.immd_src[0].op = FW_RI_DATA_IMMD;
+		wqe->write.u.immd_src[0].r1 = 0;
+		wqe->write.u.immd_src[0].r2 = 0;
+		wqe->write.u.immd_src[0].immdlen = 0;
+		size = sizeof wqe->write + sizeof(struct fw_ri_immd);
+		plen = 0;
+	}
+	*len16 = DIV_ROUND_UP(size, 16);
+	wqe->write.plen = cpu_to_be32(plen);
+	return 0;
+}
+
+static int build_rdma_read(union t4_wr *wqe, struct ib_send_wr *wr, u8 *len16)
+{
+	if (wr->num_sge > 1)
+		return -EINVAL;
+	if (wr->num_sge) {
+		wqe->read.stag_src = cpu_to_be32(wr->wr.rdma.rkey);
+		wqe->read.to_src_hi = cpu_to_be32((u32)(wr->wr.rdma.remote_addr
+							>> 32));
+		wqe->read.to_src_lo = cpu_to_be32((u32)wr->wr.rdma.remote_addr);
+		wqe->read.stag_sink = cpu_to_be32(wr->sg_list[0].lkey);
+		wqe->read.plen = cpu_to_be32(wr->sg_list[0].length);
+		wqe->read.to_sink_hi = cpu_to_be32((u32)(wr->sg_list[0].addr
+							 >> 32));
+		wqe->read.to_sink_lo = cpu_to_be32((u32)(wr->sg_list[0].addr));
+	} else {
+		wqe->read.stag_src = cpu_to_be32(2);
+		wqe->read.to_src_hi = 0;
+		wqe->read.to_src_lo = 0;
+		wqe->read.stag_sink = cpu_to_be32(2);
+		wqe->read.plen = 0;
+		wqe->read.to_sink_hi = 0;
+		wqe->read.to_sink_lo = 0;
+	}
+	wqe->read.r2 = 0;
+	wqe->read.r5 = 0;
+	*len16 = DIV_ROUND_UP(sizeof wqe->read, 16);
+	return 0;
+}
+
+static int build_rdma_recv(struct c4iw_qp *qhp, union t4_recv_wr *wqe,
+			   struct ib_recv_wr *wr, u8 *len16)
+{
+	int ret;
+
+	ret = build_isgl((__be64 *)qhp->wq.rq.queue,
+			 (__be64 *)&qhp->wq.rq.queue[qhp->wq.rq.size],
+			 &wqe->recv.isgl, wr->sg_list, wr->num_sge, NULL);
+	if (ret)
+		return ret;
+	*len16 = DIV_ROUND_UP(sizeof wqe->recv +
+			      wr->num_sge * sizeof(struct fw_ri_sge), 16);
+	return 0;
+}
+
+static int build_fastreg(struct t4_sq *sq, union t4_wr *wqe,
+			 struct ib_send_wr *wr, u8 *len16, u8 t5dev)
+{
+
+	struct fw_ri_immd *imdp;
+	__be64 *p;
+	int i;
+	int pbllen = roundup(wr->wr.fast_reg.page_list_len * sizeof(u64), 32);
+	int rem;
+
+	if (wr->wr.fast_reg.page_list_len >
+	    t4_max_fr_depth(use_dsgl))
+		return -EINVAL;
+
+	wqe->fr.qpbinde_to_dcacpu = 0;
+	wqe->fr.pgsz_shift = wr->wr.fast_reg.page_shift - 12;
+	wqe->fr.addr_type = FW_RI_VA_BASED_TO;
+	wqe->fr.mem_perms = c4iw_ib_to_tpt_access(wr->wr.fast_reg.access_flags);
+	wqe->fr.len_hi = 0;
+	wqe->fr.len_lo = cpu_to_be32(wr->wr.fast_reg.length);
+	wqe->fr.stag = cpu_to_be32(wr->wr.fast_reg.rkey);
+	wqe->fr.va_hi = cpu_to_be32(wr->wr.fast_reg.iova_start >> 32);
+	wqe->fr.va_lo_fbo = cpu_to_be32(wr->wr.fast_reg.iova_start &
+					0xffffffff);
+
+	if (t5dev && use_dsgl && (pbllen > max_fr_immd)) {
+		struct c4iw_fr_page_list *c4pl =
+			to_c4iw_fr_page_list(wr->wr.fast_reg.page_list);
+		struct fw_ri_dsgl *sglp;
+
+		for (i = 0; i < wr->wr.fast_reg.page_list_len; i++) {
+			wr->wr.fast_reg.page_list->page_list[i] = (__force u64)
+				cpu_to_be64((u64)
+				wr->wr.fast_reg.page_list->page_list[i]);
+		}
+
+		sglp = (struct fw_ri_dsgl *)(&wqe->fr + 1);
+		sglp->op = FW_RI_DATA_DSGL;
+		sglp->r1 = 0;
+		sglp->nsge = cpu_to_be16(1);
+		sglp->addr0 = cpu_to_be64(c4pl->dma_addr);
+		sglp->len0 = cpu_to_be32(pbllen);
+
+		*len16 = DIV_ROUND_UP(sizeof(wqe->fr) + sizeof(*sglp), 16);
+	} else {
+		imdp = (struct fw_ri_immd *)(&wqe->fr + 1);
+		imdp->op = FW_RI_DATA_IMMD;
+		imdp->r1 = 0;
+		imdp->r2 = 0;
+		imdp->immdlen = cpu_to_be32(pbllen);
+		p = (__be64 *)(imdp + 1);
+		rem = pbllen;
+		for (i = 0; i < wr->wr.fast_reg.page_list_len; i++) {
+			*p = cpu_to_be64(
+				(u64)wr->wr.fast_reg.page_list->page_list[i]);
+			rem -= sizeof(*p);
+			if (++p == (__be64 *)&sq->queue[sq->size])
+				p = (__be64 *)sq->queue;
+		}
+		BUG_ON(rem < 0);
+		while (rem) {
+			*p = 0;
+			rem -= sizeof(*p);
+			if (++p == (__be64 *)&sq->queue[sq->size])
+				p = (__be64 *)sq->queue;
+		}
+		*len16 = DIV_ROUND_UP(sizeof(wqe->fr) + sizeof(*imdp)
+				      + pbllen, 16);
+	}
+	return 0;
+}
+
+static int build_inv_stag(union t4_wr *wqe, struct ib_send_wr *wr,
+			  u8 *len16)
+{
+	wqe->inv.stag_inv = cpu_to_be32(wr->ex.invalidate_rkey);
+	wqe->inv.r2 = 0;
+	*len16 = DIV_ROUND_UP(sizeof wqe->inv, 16);
+	return 0;
+}
+
+void c4iw_qp_add_ref(struct ib_qp *qp)
+{
+	PDBG("%s ib_qp %p\n", __func__, qp);
+	atomic_inc(&(to_c4iw_qp(qp)->refcnt));
+}
+
+void c4iw_qp_rem_ref(struct ib_qp *qp)
+{
+	PDBG("%s ib_qp %p\n", __func__, qp);
+	if (atomic_dec_and_test(&(to_c4iw_qp(qp)->refcnt)))
+		wake_up(&(to_c4iw_qp(qp)->wait));
+}
+
+static void add_to_fc_list(struct list_head *head, struct list_head *entry)
+{
+	if (list_empty(entry))
+		list_add_tail(entry, head);
+}
+
+static int ring_kernel_sq_db(struct c4iw_qp *qhp, u16 inc)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&qhp->rhp->lock, flags);
+	spin_lock(&qhp->lock);
+	if (qhp->rhp->db_state == NORMAL)
+		t4_ring_sq_db(&qhp->wq, inc,
+			      is_t5(qhp->rhp->rdev.lldi.adapter_type), NULL);
+	else {
+		add_to_fc_list(&qhp->rhp->db_fc_list, &qhp->db_fc_entry);
+		qhp->wq.sq.wq_pidx_inc += inc;
+	}
+	spin_unlock(&qhp->lock);
+	spin_unlock_irqrestore(&qhp->rhp->lock, flags);
+	return 0;
+}
+
+static int ring_kernel_rq_db(struct c4iw_qp *qhp, u16 inc)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&qhp->rhp->lock, flags);
+	spin_lock(&qhp->lock);
+	if (qhp->rhp->db_state == NORMAL)
+		t4_ring_rq_db(&qhp->wq, inc,
+			      is_t5(qhp->rhp->rdev.lldi.adapter_type), NULL);
+	else {
+		add_to_fc_list(&qhp->rhp->db_fc_list, &qhp->db_fc_entry);
+		qhp->wq.rq.wq_pidx_inc += inc;
+	}
+	spin_unlock(&qhp->lock);
+	spin_unlock_irqrestore(&qhp->rhp->lock, flags);
+	return 0;
+}
+
+int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+		   struct ib_send_wr **bad_wr)
+{
+	int err = 0;
+	u8 len16 = 0;
+	enum fw_wr_opcodes fw_opcode = 0;
+	enum fw_ri_wr_flags fw_flags;
+	struct c4iw_qp *qhp;
+	union t4_wr *wqe = NULL;
+	u32 num_wrs;
+	struct t4_swsqe *swsqe;
+	unsigned long flag;
+	u16 idx = 0;
+
+	qhp = to_c4iw_qp(ibqp);
+	spin_lock_irqsave(&qhp->lock, flag);
+	if (t4_wq_in_error(&qhp->wq)) {
+		spin_unlock_irqrestore(&qhp->lock, flag);
+		return -EINVAL;
+	}
+	num_wrs = t4_sq_avail(&qhp->wq);
+	if (num_wrs == 0) {
+		spin_unlock_irqrestore(&qhp->lock, flag);
+		return -ENOMEM;
+	}
+	while (wr) {
+		if (num_wrs == 0) {
+			err = -ENOMEM;
+			*bad_wr = wr;
+			break;
+		}
+		wqe = (union t4_wr *)((u8 *)qhp->wq.sq.queue +
+		      qhp->wq.sq.wq_pidx * T4_EQ_ENTRY_SIZE);
+
+		fw_flags = 0;
+		if (wr->send_flags & IB_SEND_SOLICITED)
+			fw_flags |= FW_RI_SOLICITED_EVENT_FLAG;
+		if (wr->send_flags & IB_SEND_SIGNALED || qhp->sq_sig_all)
+			fw_flags |= FW_RI_COMPLETION_FLAG;
+		swsqe = &qhp->wq.sq.sw_sq[qhp->wq.sq.pidx];
+		switch (wr->opcode) {
+		case IB_WR_SEND_WITH_INV:
+		case IB_WR_SEND:
+			if (wr->send_flags & IB_SEND_FENCE)
+				fw_flags |= FW_RI_READ_FENCE_FLAG;
+			fw_opcode = FW_RI_SEND_WR;
+			if (wr->opcode == IB_WR_SEND)
+				swsqe->opcode = FW_RI_SEND;
+			else
+				swsqe->opcode = FW_RI_SEND_WITH_INV;
+			err = build_rdma_send(&qhp->wq.sq, wqe, wr, &len16);
+			break;
+		case IB_WR_RDMA_WRITE:
+			fw_opcode = FW_RI_RDMA_WRITE_WR;
+			swsqe->opcode = FW_RI_RDMA_WRITE;
+			err = build_rdma_write(&qhp->wq.sq, wqe, wr, &len16);
+			break;
+		case IB_WR_RDMA_READ:
+		case IB_WR_RDMA_READ_WITH_INV:
+			fw_opcode = FW_RI_RDMA_READ_WR;
+			swsqe->opcode = FW_RI_READ_REQ;
+			if (wr->opcode == IB_WR_RDMA_READ_WITH_INV)
+				fw_flags = FW_RI_RDMA_READ_INVALIDATE;
+			else
+				fw_flags = 0;
+			err = build_rdma_read(wqe, wr, &len16);
+			if (err)
+				break;
+			swsqe->read_len = wr->sg_list[0].length;
+			if (!qhp->wq.sq.oldest_read)
+				qhp->wq.sq.oldest_read = swsqe;
+			break;
+		case IB_WR_FAST_REG_MR:
+			fw_opcode = FW_RI_FR_NSMR_WR;
+			swsqe->opcode = FW_RI_FAST_REGISTER;
+			err = build_fastreg(&qhp->wq.sq, wqe, wr, &len16,
+					    is_t5(
+					    qhp->rhp->rdev.lldi.adapter_type) ?
+					    1 : 0);
+			break;
+		case IB_WR_LOCAL_INV:
+			if (wr->send_flags & IB_SEND_FENCE)
+				fw_flags |= FW_RI_LOCAL_FENCE_FLAG;
+			fw_opcode = FW_RI_INV_LSTAG_WR;
+			swsqe->opcode = FW_RI_LOCAL_INV;
+			err = build_inv_stag(wqe, wr, &len16);
+			break;
+		default:
+			PDBG("%s post of type=%d TBD!\n", __func__,
+			     wr->opcode);
+			err = -EINVAL;
+		}
+		if (err) {
+			*bad_wr = wr;
+			break;
+		}
+		swsqe->idx = qhp->wq.sq.pidx;
+		swsqe->complete = 0;
+		swsqe->signaled = (wr->send_flags & IB_SEND_SIGNALED) ||
+				  qhp->sq_sig_all;
+		swsqe->flushed = 0;
+		swsqe->wr_id = wr->wr_id;
+		if (c4iw_wr_log) {
+			swsqe->sge_ts = cxgb4_read_sge_timestamp(
+					qhp->rhp->rdev.lldi.ports[0]);
+			getnstimeofday(&swsqe->host_ts);
+		}
+
+		init_wr_hdr(wqe, qhp->wq.sq.pidx, fw_opcode, fw_flags, len16);
+
+		PDBG("%s cookie 0x%llx pidx 0x%x opcode 0x%x read_len %u\n",
+		     __func__, (unsigned long long)wr->wr_id, qhp->wq.sq.pidx,
+		     swsqe->opcode, swsqe->read_len);
+		wr = wr->next;
+		num_wrs--;
+		t4_sq_produce(&qhp->wq, len16);
+		idx += DIV_ROUND_UP(len16*16, T4_EQ_ENTRY_SIZE);
+	}
+	if (!qhp->rhp->rdev.status_page->db_off) {
+		t4_ring_sq_db(&qhp->wq, idx,
+			      is_t5(qhp->rhp->rdev.lldi.adapter_type), wqe);
+		spin_unlock_irqrestore(&qhp->lock, flag);
+	} else {
+		spin_unlock_irqrestore(&qhp->lock, flag);
+		ring_kernel_sq_db(qhp, idx);
+	}
+	return err;
+}
+
+int c4iw_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+		      struct ib_recv_wr **bad_wr)
+{
+	int err = 0;
+	struct c4iw_qp *qhp;
+	union t4_recv_wr *wqe = NULL;
+	u32 num_wrs;
+	u8 len16 = 0;
+	unsigned long flag;
+	u16 idx = 0;
+
+	qhp = to_c4iw_qp(ibqp);
+	spin_lock_irqsave(&qhp->lock, flag);
+	if (t4_wq_in_error(&qhp->wq)) {
+		spin_unlock_irqrestore(&qhp->lock, flag);
+		return -EINVAL;
+	}
+	num_wrs = t4_rq_avail(&qhp->wq);
+	if (num_wrs == 0) {
+		spin_unlock_irqrestore(&qhp->lock, flag);
+		return -ENOMEM;
+	}
+	while (wr) {
+		if (wr->num_sge > T4_MAX_RECV_SGE) {
+			err = -EINVAL;
+			*bad_wr = wr;
+			break;
+		}
+		wqe = (union t4_recv_wr *)((u8 *)qhp->wq.rq.queue +
+					   qhp->wq.rq.wq_pidx *
+					   T4_EQ_ENTRY_SIZE);
+		if (num_wrs)
+			err = build_rdma_recv(qhp, wqe, wr, &len16);
+		else
+			err = -ENOMEM;
+		if (err) {
+			*bad_wr = wr;
+			break;
+		}
+
+		qhp->wq.rq.sw_rq[qhp->wq.rq.pidx].wr_id = wr->wr_id;
+		if (c4iw_wr_log) {
+			qhp->wq.rq.sw_rq[qhp->wq.rq.pidx].sge_ts =
+				cxgb4_read_sge_timestamp(
+						qhp->rhp->rdev.lldi.ports[0]);
+			getnstimeofday(
+				&qhp->wq.rq.sw_rq[qhp->wq.rq.pidx].host_ts);
+		}
+
+		wqe->recv.opcode = FW_RI_RECV_WR;
+		wqe->recv.r1 = 0;
+		wqe->recv.wrid = qhp->wq.rq.pidx;
+		wqe->recv.r2[0] = 0;
+		wqe->recv.r2[1] = 0;
+		wqe->recv.r2[2] = 0;
+		wqe->recv.len16 = len16;
+		PDBG("%s cookie 0x%llx pidx %u\n", __func__,
+		     (unsigned long long) wr->wr_id, qhp->wq.rq.pidx);
+		t4_rq_produce(&qhp->wq, len16);
+		idx += DIV_ROUND_UP(len16*16, T4_EQ_ENTRY_SIZE);
+		wr = wr->next;
+		num_wrs--;
+	}
+	if (!qhp->rhp->rdev.status_page->db_off) {
+		t4_ring_rq_db(&qhp->wq, idx,
+			      is_t5(qhp->rhp->rdev.lldi.adapter_type), wqe);
+		spin_unlock_irqrestore(&qhp->lock, flag);
+	} else {
+		spin_unlock_irqrestore(&qhp->lock, flag);
+		ring_kernel_rq_db(qhp, idx);
+	}
+	return err;
+}
+
+int c4iw_bind_mw(struct ib_qp *qp, struct ib_mw *mw, struct ib_mw_bind *mw_bind)
+{
+	return -ENOSYS;
+}
+
+static inline void build_term_codes(struct t4_cqe *err_cqe, u8 *layer_type,
+				    u8 *ecode)
+{
+	int status;
+	int tagged;
+	int opcode;
+	int rqtype;
+	int send_inv;
+
+	if (!err_cqe) {
+		*layer_type = LAYER_RDMAP|DDP_LOCAL_CATA;
+		*ecode = 0;
+		return;
+	}
+
+	status = CQE_STATUS(err_cqe);
+	opcode = CQE_OPCODE(err_cqe);
+	rqtype = RQ_TYPE(err_cqe);
+	send_inv = (opcode == FW_RI_SEND_WITH_INV) ||
+		   (opcode == FW_RI_SEND_WITH_SE_INV);
+	tagged = (opcode == FW_RI_RDMA_WRITE) ||
+		 (rqtype && (opcode == FW_RI_READ_RESP));
+
+	switch (status) {
+	case T4_ERR_STAG:
+		if (send_inv) {
+			*layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP;
+			*ecode = RDMAP_CANT_INV_STAG;
+		} else {
+			*layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT;
+			*ecode = RDMAP_INV_STAG;
+		}
+		break;
+	case T4_ERR_PDID:
+		*layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT;
+		if ((opcode == FW_RI_SEND_WITH_INV) ||
+		    (opcode == FW_RI_SEND_WITH_SE_INV))
+			*ecode = RDMAP_CANT_INV_STAG;
+		else
+			*ecode = RDMAP_STAG_NOT_ASSOC;
+		break;
+	case T4_ERR_QPID:
+		*layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT;
+		*ecode = RDMAP_STAG_NOT_ASSOC;
+		break;
+	case T4_ERR_ACCESS:
+		*layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT;
+		*ecode = RDMAP_ACC_VIOL;
+		break;
+	case T4_ERR_WRAP:
+		*layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT;
+		*ecode = RDMAP_TO_WRAP;
+		break;
+	case T4_ERR_BOUND:
+		if (tagged) {
+			*layer_type = LAYER_DDP|DDP_TAGGED_ERR;
+			*ecode = DDPT_BASE_BOUNDS;
+		} else {
+			*layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT;
+			*ecode = RDMAP_BASE_BOUNDS;
+		}
+		break;
+	case T4_ERR_INVALIDATE_SHARED_MR:
+	case T4_ERR_INVALIDATE_MR_WITH_MW_BOUND:
+		*layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP;
+		*ecode = RDMAP_CANT_INV_STAG;
+		break;
+	case T4_ERR_ECC:
+	case T4_ERR_ECC_PSTAG:
+	case T4_ERR_INTERNAL_ERR:
+		*layer_type = LAYER_RDMAP|RDMAP_LOCAL_CATA;
+		*ecode = 0;
+		break;
+	case T4_ERR_OUT_OF_RQE:
+		*layer_type = LAYER_DDP|DDP_UNTAGGED_ERR;
+		*ecode = DDPU_INV_MSN_NOBUF;
+		break;
+	case T4_ERR_PBL_ADDR_BOUND:
+		*layer_type = LAYER_DDP|DDP_TAGGED_ERR;
+		*ecode = DDPT_BASE_BOUNDS;
+		break;
+	case T4_ERR_CRC:
+		*layer_type = LAYER_MPA|DDP_LLP;
+		*ecode = MPA_CRC_ERR;
+		break;
+	case T4_ERR_MARKER:
+		*layer_type = LAYER_MPA|DDP_LLP;
+		*ecode = MPA_MARKER_ERR;
+		break;
+	case T4_ERR_PDU_LEN_ERR:
+		*layer_type = LAYER_DDP|DDP_UNTAGGED_ERR;
+		*ecode = DDPU_MSG_TOOBIG;
+		break;
+	case T4_ERR_DDP_VERSION:
+		if (tagged) {
+			*layer_type = LAYER_DDP|DDP_TAGGED_ERR;
+			*ecode = DDPT_INV_VERS;
+		} else {
+			*layer_type = LAYER_DDP|DDP_UNTAGGED_ERR;
+			*ecode = DDPU_INV_VERS;
+		}
+		break;
+	case T4_ERR_RDMA_VERSION:
+		*layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP;
+		*ecode = RDMAP_INV_VERS;
+		break;
+	case T4_ERR_OPCODE:
+		*layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP;
+		*ecode = RDMAP_INV_OPCODE;
+		break;
+	case T4_ERR_DDP_QUEUE_NUM:
+		*layer_type = LAYER_DDP|DDP_UNTAGGED_ERR;
+		*ecode = DDPU_INV_QN;
+		break;
+	case T4_ERR_MSN:
+	case T4_ERR_MSN_GAP:
+	case T4_ERR_MSN_RANGE:
+	case T4_ERR_IRD_OVERFLOW:
+		*layer_type = LAYER_DDP|DDP_UNTAGGED_ERR;
+		*ecode = DDPU_INV_MSN_RANGE;
+		break;
+	case T4_ERR_TBIT:
+		*layer_type = LAYER_DDP|DDP_LOCAL_CATA;
+		*ecode = 0;
+		break;
+	case T4_ERR_MO:
+		*layer_type = LAYER_DDP|DDP_UNTAGGED_ERR;
+		*ecode = DDPU_INV_MO;
+		break;
+	default:
+		*layer_type = LAYER_RDMAP|DDP_LOCAL_CATA;
+		*ecode = 0;
+		break;
+	}
+}
+
+static void post_terminate(struct c4iw_qp *qhp, struct t4_cqe *err_cqe,
+			   gfp_t gfp)
+{
+	struct fw_ri_wr *wqe;
+	struct sk_buff *skb;
+	struct terminate_message *term;
+
+	PDBG("%s qhp %p qid 0x%x tid %u\n", __func__, qhp, qhp->wq.sq.qid,
+	     qhp->ep->hwtid);
+
+	skb = alloc_skb(sizeof *wqe, gfp);
+	if (!skb)
+		return;
+	set_wr_txq(skb, CPL_PRIORITY_DATA, qhp->ep->txq_idx);
+
+	wqe = (struct fw_ri_wr *)__skb_put(skb, sizeof(*wqe));
+	memset(wqe, 0, sizeof *wqe);
+	wqe->op_compl = cpu_to_be32(FW_WR_OP(FW_RI_INIT_WR));
+	wqe->flowid_len16 = cpu_to_be32(
+		FW_WR_FLOWID(qhp->ep->hwtid) |
+		FW_WR_LEN16(DIV_ROUND_UP(sizeof *wqe, 16)));
+
+	wqe->u.terminate.type = FW_RI_TYPE_TERMINATE;
+	wqe->u.terminate.immdlen = cpu_to_be32(sizeof *term);
+	term = (struct terminate_message *)wqe->u.terminate.termmsg;
+	if (qhp->attr.layer_etype == (LAYER_MPA|DDP_LLP)) {
+		term->layer_etype = qhp->attr.layer_etype;
+		term->ecode = qhp->attr.ecode;
+	} else
+		build_term_codes(err_cqe, &term->layer_etype, &term->ecode);
+	c4iw_ofld_send(&qhp->rhp->rdev, skb);
+}
+
+/*
+ * Assumes qhp lock is held.
+ */
+static void __flush_qp(struct c4iw_qp *qhp, struct c4iw_cq *rchp,
+		       struct c4iw_cq *schp)
+{
+	int count;
+	int rq_flushed, sq_flushed;
+	unsigned long flag;
+
+	PDBG("%s qhp %p rchp %p schp %p\n", __func__, qhp, rchp, schp);
+
+	/* locking hierarchy: cq lock first, then qp lock. */
+	spin_lock_irqsave(&rchp->lock, flag);
+	spin_lock(&qhp->lock);
+
+	if (qhp->wq.flushed) {
+		spin_unlock(&qhp->lock);
+		spin_unlock_irqrestore(&rchp->lock, flag);
+		return;
+	}
+	qhp->wq.flushed = 1;
+
+	c4iw_flush_hw_cq(rchp);
+	c4iw_count_rcqes(&rchp->cq, &qhp->wq, &count);
+	rq_flushed = c4iw_flush_rq(&qhp->wq, &rchp->cq, count);
+	spin_unlock(&qhp->lock);
+	spin_unlock_irqrestore(&rchp->lock, flag);
+
+	/* locking hierarchy: cq lock first, then qp lock. */
+	spin_lock_irqsave(&schp->lock, flag);
+	spin_lock(&qhp->lock);
+	if (schp != rchp)
+		c4iw_flush_hw_cq(schp);
+	sq_flushed = c4iw_flush_sq(qhp);
+	spin_unlock(&qhp->lock);
+	spin_unlock_irqrestore(&schp->lock, flag);
+
+	if (schp == rchp) {
+		if (t4_clear_cq_armed(&rchp->cq) &&
+		    (rq_flushed || sq_flushed)) {
+			spin_lock_irqsave(&rchp->comp_handler_lock, flag);
+			(*rchp->ibcq.comp_handler)(&rchp->ibcq,
+						   rchp->ibcq.cq_context);
+			spin_unlock_irqrestore(&rchp->comp_handler_lock, flag);
+		}
+	} else {
+		if (t4_clear_cq_armed(&rchp->cq) && rq_flushed) {
+			spin_lock_irqsave(&rchp->comp_handler_lock, flag);
+			(*rchp->ibcq.comp_handler)(&rchp->ibcq,
+						   rchp->ibcq.cq_context);
+			spin_unlock_irqrestore(&rchp->comp_handler_lock, flag);
+		}
+		if (t4_clear_cq_armed(&schp->cq) && sq_flushed) {
+			spin_lock_irqsave(&schp->comp_handler_lock, flag);
+			(*schp->ibcq.comp_handler)(&schp->ibcq,
+						   schp->ibcq.cq_context);
+			spin_unlock_irqrestore(&schp->comp_handler_lock, flag);
+		}
+	}
+}
+
+static void flush_qp(struct c4iw_qp *qhp)
+{
+	struct c4iw_cq *rchp, *schp;
+	unsigned long flag;
+
+	rchp = to_c4iw_cq(qhp->ibqp.recv_cq);
+	schp = to_c4iw_cq(qhp->ibqp.send_cq);
+
+	t4_set_wq_in_error(&qhp->wq);
+	if (qhp->ibqp.uobject) {
+		t4_set_cq_in_error(&rchp->cq);
+		spin_lock_irqsave(&rchp->comp_handler_lock, flag);
+		(*rchp->ibcq.comp_handler)(&rchp->ibcq, rchp->ibcq.cq_context);
+		spin_unlock_irqrestore(&rchp->comp_handler_lock, flag);
+		if (schp != rchp) {
+			t4_set_cq_in_error(&schp->cq);
+			spin_lock_irqsave(&schp->comp_handler_lock, flag);
+			(*schp->ibcq.comp_handler)(&schp->ibcq,
+					schp->ibcq.cq_context);
+			spin_unlock_irqrestore(&schp->comp_handler_lock, flag);
+		}
+		return;
+	}
+	__flush_qp(qhp, rchp, schp);
+}
+
+static int rdma_fini(struct c4iw_dev *rhp, struct c4iw_qp *qhp,
+		     struct c4iw_ep *ep)
+{
+	struct fw_ri_wr *wqe;
+	int ret;
+	struct sk_buff *skb;
+
+	PDBG("%s qhp %p qid 0x%x tid %u\n", __func__, qhp, qhp->wq.sq.qid,
+	     ep->hwtid);
+
+	skb = alloc_skb(sizeof *wqe, GFP_KERNEL);
+	if (!skb)
+		return -ENOMEM;
+	set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx);
+
+	wqe = (struct fw_ri_wr *)__skb_put(skb, sizeof(*wqe));
+	memset(wqe, 0, sizeof *wqe);
+	wqe->op_compl = cpu_to_be32(
+		FW_WR_OP(FW_RI_INIT_WR) |
+		FW_WR_COMPL(1));
+	wqe->flowid_len16 = cpu_to_be32(
+		FW_WR_FLOWID(ep->hwtid) |
+		FW_WR_LEN16(DIV_ROUND_UP(sizeof *wqe, 16)));
+	wqe->cookie = (unsigned long) &ep->com.wr_wait;
+
+	wqe->u.fini.type = FW_RI_TYPE_FINI;
+	ret = c4iw_ofld_send(&rhp->rdev, skb);
+	if (ret)
+		goto out;
+
+	ret = c4iw_wait_for_reply(&rhp->rdev, &ep->com.wr_wait, qhp->ep->hwtid,
+			     qhp->wq.sq.qid, __func__);
+out:
+	PDBG("%s ret %d\n", __func__, ret);
+	return ret;
+}
+
+static void build_rtr_msg(u8 p2p_type, struct fw_ri_init *init)
+{
+	PDBG("%s p2p_type = %d\n", __func__, p2p_type);
+	memset(&init->u, 0, sizeof init->u);
+	switch (p2p_type) {
+	case FW_RI_INIT_P2PTYPE_RDMA_WRITE:
+		init->u.write.opcode = FW_RI_RDMA_WRITE_WR;
+		init->u.write.stag_sink = cpu_to_be32(1);
+		init->u.write.to_sink = cpu_to_be64(1);
+		init->u.write.u.immd_src[0].op = FW_RI_DATA_IMMD;
+		init->u.write.len16 = DIV_ROUND_UP(sizeof init->u.write +
+						   sizeof(struct fw_ri_immd),
+						   16);
+		break;
+	case FW_RI_INIT_P2PTYPE_READ_REQ:
+		init->u.write.opcode = FW_RI_RDMA_READ_WR;
+		init->u.read.stag_src = cpu_to_be32(1);
+		init->u.read.to_src_lo = cpu_to_be32(1);
+		init->u.read.stag_sink = cpu_to_be32(1);
+		init->u.read.to_sink_lo = cpu_to_be32(1);
+		init->u.read.len16 = DIV_ROUND_UP(sizeof init->u.read, 16);
+		break;
+	}
+}
+
+static int rdma_init(struct c4iw_dev *rhp, struct c4iw_qp *qhp)
+{
+	struct fw_ri_wr *wqe;
+	int ret;
+	struct sk_buff *skb;
+
+	PDBG("%s qhp %p qid 0x%x tid %u ird %u ord %u\n", __func__, qhp,
+	     qhp->wq.sq.qid, qhp->ep->hwtid, qhp->ep->ird, qhp->ep->ord);
+
+	skb = alloc_skb(sizeof *wqe, GFP_KERNEL);
+	if (!skb) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	ret = alloc_ird(rhp, qhp->attr.max_ird);
+	if (ret) {
+		qhp->attr.max_ird = 0;
+		kfree_skb(skb);
+		goto out;
+	}
+	set_wr_txq(skb, CPL_PRIORITY_DATA, qhp->ep->txq_idx);
+
+	wqe = (struct fw_ri_wr *)__skb_put(skb, sizeof(*wqe));
+	memset(wqe, 0, sizeof *wqe);
+	wqe->op_compl = cpu_to_be32(
+		FW_WR_OP(FW_RI_INIT_WR) |
+		FW_WR_COMPL(1));
+	wqe->flowid_len16 = cpu_to_be32(
+		FW_WR_FLOWID(qhp->ep->hwtid) |
+		FW_WR_LEN16(DIV_ROUND_UP(sizeof *wqe, 16)));
+
+	wqe->cookie = (unsigned long) &qhp->ep->com.wr_wait;
+
+	wqe->u.init.type = FW_RI_TYPE_INIT;
+	wqe->u.init.mpareqbit_p2ptype =
+		V_FW_RI_WR_MPAREQBIT(qhp->attr.mpa_attr.initiator) |
+		V_FW_RI_WR_P2PTYPE(qhp->attr.mpa_attr.p2p_type);
+	wqe->u.init.mpa_attrs = FW_RI_MPA_IETF_ENABLE;
+	if (qhp->attr.mpa_attr.recv_marker_enabled)
+		wqe->u.init.mpa_attrs |= FW_RI_MPA_RX_MARKER_ENABLE;
+	if (qhp->attr.mpa_attr.xmit_marker_enabled)
+		wqe->u.init.mpa_attrs |= FW_RI_MPA_TX_MARKER_ENABLE;
+	if (qhp->attr.mpa_attr.crc_enabled)
+		wqe->u.init.mpa_attrs |= FW_RI_MPA_CRC_ENABLE;
+
+	wqe->u.init.qp_caps = FW_RI_QP_RDMA_READ_ENABLE |
+			    FW_RI_QP_RDMA_WRITE_ENABLE |
+			    FW_RI_QP_BIND_ENABLE;
+	if (!qhp->ibqp.uobject)
+		wqe->u.init.qp_caps |= FW_RI_QP_FAST_REGISTER_ENABLE |
+				     FW_RI_QP_STAG0_ENABLE;
+	wqe->u.init.nrqe = cpu_to_be16(t4_rqes_posted(&qhp->wq));
+	wqe->u.init.pdid = cpu_to_be32(qhp->attr.pd);
+	wqe->u.init.qpid = cpu_to_be32(qhp->wq.sq.qid);
+	wqe->u.init.sq_eqid = cpu_to_be32(qhp->wq.sq.qid);
+	wqe->u.init.rq_eqid = cpu_to_be32(qhp->wq.rq.qid);
+	wqe->u.init.scqid = cpu_to_be32(qhp->attr.scq);
+	wqe->u.init.rcqid = cpu_to_be32(qhp->attr.rcq);
+	wqe->u.init.ord_max = cpu_to_be32(qhp->attr.max_ord);
+	wqe->u.init.ird_max = cpu_to_be32(qhp->attr.max_ird);
+	wqe->u.init.iss = cpu_to_be32(qhp->ep->snd_seq);
+	wqe->u.init.irs = cpu_to_be32(qhp->ep->rcv_seq);
+	wqe->u.init.hwrqsize = cpu_to_be32(qhp->wq.rq.rqt_size);
+	wqe->u.init.hwrqaddr = cpu_to_be32(qhp->wq.rq.rqt_hwaddr -
+					 rhp->rdev.lldi.vr->rq.start);
+	if (qhp->attr.mpa_attr.initiator)
+		build_rtr_msg(qhp->attr.mpa_attr.p2p_type, &wqe->u.init);
+
+	ret = c4iw_ofld_send(&rhp->rdev, skb);
+	if (ret)
+		goto err1;
+
+	ret = c4iw_wait_for_reply(&rhp->rdev, &qhp->ep->com.wr_wait,
+				  qhp->ep->hwtid, qhp->wq.sq.qid, __func__);
+	if (!ret)
+		goto out;
+err1:
+	free_ird(rhp, qhp->attr.max_ird);
+out:
+	PDBG("%s ret %d\n", __func__, ret);
+	return ret;
+}
+
+int c4iw_modify_qp(struct c4iw_dev *rhp, struct c4iw_qp *qhp,
+		   enum c4iw_qp_attr_mask mask,
+		   struct c4iw_qp_attributes *attrs,
+		   int internal)
+{
+	int ret = 0;
+	struct c4iw_qp_attributes newattr = qhp->attr;
+	int disconnect = 0;
+	int terminate = 0;
+	int abort = 0;
+	int free = 0;
+	struct c4iw_ep *ep = NULL;
+
+	PDBG("%s qhp %p sqid 0x%x rqid 0x%x ep %p state %d -> %d\n", __func__,
+	     qhp, qhp->wq.sq.qid, qhp->wq.rq.qid, qhp->ep, qhp->attr.state,
+	     (mask & C4IW_QP_ATTR_NEXT_STATE) ? attrs->next_state : -1);
+
+	mutex_lock(&qhp->mutex);
+
+	/* Process attr changes if in IDLE */
+	if (mask & C4IW_QP_ATTR_VALID_MODIFY) {
+		if (qhp->attr.state != C4IW_QP_STATE_IDLE) {
+			ret = -EIO;
+			goto out;
+		}
+		if (mask & C4IW_QP_ATTR_ENABLE_RDMA_READ)
+			newattr.enable_rdma_read = attrs->enable_rdma_read;
+		if (mask & C4IW_QP_ATTR_ENABLE_RDMA_WRITE)
+			newattr.enable_rdma_write = attrs->enable_rdma_write;
+		if (mask & C4IW_QP_ATTR_ENABLE_RDMA_BIND)
+			newattr.enable_bind = attrs->enable_bind;
+		if (mask & C4IW_QP_ATTR_MAX_ORD) {
+			if (attrs->max_ord > c4iw_max_read_depth) {
+				ret = -EINVAL;
+				goto out;
+			}
+			newattr.max_ord = attrs->max_ord;
+		}
+		if (mask & C4IW_QP_ATTR_MAX_IRD) {
+			if (attrs->max_ird > cur_max_read_depth(rhp)) {
+				ret = -EINVAL;
+				goto out;
+			}
+			newattr.max_ird = attrs->max_ird;
+		}
+		qhp->attr = newattr;
+	}
+
+	if (mask & C4IW_QP_ATTR_SQ_DB) {
+		ret = ring_kernel_sq_db(qhp, attrs->sq_db_inc);
+		goto out;
+	}
+	if (mask & C4IW_QP_ATTR_RQ_DB) {
+		ret = ring_kernel_rq_db(qhp, attrs->rq_db_inc);
+		goto out;
+	}
+
+	if (!(mask & C4IW_QP_ATTR_NEXT_STATE))
+		goto out;
+	if (qhp->attr.state == attrs->next_state)
+		goto out;
+
+	switch (qhp->attr.state) {
+	case C4IW_QP_STATE_IDLE:
+		switch (attrs->next_state) {
+		case C4IW_QP_STATE_RTS:
+			if (!(mask & C4IW_QP_ATTR_LLP_STREAM_HANDLE)) {
+				ret = -EINVAL;
+				goto out;
+			}
+			if (!(mask & C4IW_QP_ATTR_MPA_ATTR)) {
+				ret = -EINVAL;
+				goto out;
+			}
+			qhp->attr.mpa_attr = attrs->mpa_attr;
+			qhp->attr.llp_stream_handle = attrs->llp_stream_handle;
+			qhp->ep = qhp->attr.llp_stream_handle;
+			set_state(qhp, C4IW_QP_STATE_RTS);
+
+			/*
+			 * Ref the endpoint here and deref when we
+			 * disassociate the endpoint from the QP.  This
+			 * happens in CLOSING->IDLE transition or *->ERROR
+			 * transition.
+			 */
+			c4iw_get_ep(&qhp->ep->com);
+			ret = rdma_init(rhp, qhp);
+			if (ret)
+				goto err;
+			break;
+		case C4IW_QP_STATE_ERROR:
+			set_state(qhp, C4IW_QP_STATE_ERROR);
+			flush_qp(qhp);
+			break;
+		default:
+			ret = -EINVAL;
+			goto out;
+		}
+		break;
+	case C4IW_QP_STATE_RTS:
+		switch (attrs->next_state) {
+		case C4IW_QP_STATE_CLOSING:
+			BUG_ON(atomic_read(&qhp->ep->com.kref.refcount) < 2);
+			t4_set_wq_in_error(&qhp->wq);
+			set_state(qhp, C4IW_QP_STATE_CLOSING);
+			ep = qhp->ep;
+			if (!internal) {
+				abort = 0;
+				disconnect = 1;
+				c4iw_get_ep(&qhp->ep->com);
+			}
+			ret = rdma_fini(rhp, qhp, ep);
+			if (ret)
+				goto err;
+			break;
+		case C4IW_QP_STATE_TERMINATE:
+			t4_set_wq_in_error(&qhp->wq);
+			set_state(qhp, C4IW_QP_STATE_TERMINATE);
+			qhp->attr.layer_etype = attrs->layer_etype;
+			qhp->attr.ecode = attrs->ecode;
+			ep = qhp->ep;
+			if (!internal) {
+				c4iw_get_ep(&qhp->ep->com);
+				terminate = 1;
+				disconnect = 1;
+			} else {
+				terminate = qhp->attr.send_term;
+				ret = rdma_fini(rhp, qhp, ep);
+				if (ret)
+					goto err;
+			}
+			break;
+		case C4IW_QP_STATE_ERROR:
+			t4_set_wq_in_error(&qhp->wq);
+			set_state(qhp, C4IW_QP_STATE_ERROR);
+			if (!internal) {
+				abort = 1;
+				disconnect = 1;
+				ep = qhp->ep;
+				c4iw_get_ep(&qhp->ep->com);
+			}
+			goto err;
+			break;
+		default:
+			ret = -EINVAL;
+			goto out;
+		}
+		break;
+	case C4IW_QP_STATE_CLOSING:
+		if (!internal) {
+			ret = -EINVAL;
+			goto out;
+		}
+		switch (attrs->next_state) {
+		case C4IW_QP_STATE_IDLE:
+			flush_qp(qhp);
+			set_state(qhp, C4IW_QP_STATE_IDLE);
+			qhp->attr.llp_stream_handle = NULL;
+			c4iw_put_ep(&qhp->ep->com);
+			qhp->ep = NULL;
+			wake_up(&qhp->wait);
+			break;
+		case C4IW_QP_STATE_ERROR:
+			goto err;
+		default:
+			ret = -EINVAL;
+			goto err;
+		}
+		break;
+	case C4IW_QP_STATE_ERROR:
+		if (attrs->next_state != C4IW_QP_STATE_IDLE) {
+			ret = -EINVAL;
+			goto out;
+		}
+		if (!t4_sq_empty(&qhp->wq) || !t4_rq_empty(&qhp->wq)) {
+			ret = -EINVAL;
+			goto out;
+		}
+		set_state(qhp, C4IW_QP_STATE_IDLE);
+		break;
+	case C4IW_QP_STATE_TERMINATE:
+		if (!internal) {
+			ret = -EINVAL;
+			goto out;
+		}
+		goto err;
+		break;
+	default:
+		printk(KERN_ERR "%s in a bad state %d\n",
+		       __func__, qhp->attr.state);
+		ret = -EINVAL;
+		goto err;
+		break;
+	}
+	goto out;
+err:
+	PDBG("%s disassociating ep %p qpid 0x%x\n", __func__, qhp->ep,
+	     qhp->wq.sq.qid);
+
+	/* disassociate the LLP connection */
+	qhp->attr.llp_stream_handle = NULL;
+	if (!ep)
+		ep = qhp->ep;
+	qhp->ep = NULL;
+	set_state(qhp, C4IW_QP_STATE_ERROR);
+	free = 1;
+	abort = 1;
+	wake_up(&qhp->wait);
+	BUG_ON(!ep);
+	flush_qp(qhp);
+out:
+	mutex_unlock(&qhp->mutex);
+
+	if (terminate)
+		post_terminate(qhp, NULL, internal ? GFP_ATOMIC : GFP_KERNEL);
+
+	/*
+	 * If disconnect is 1, then we need to initiate a disconnect
+	 * on the EP.  This can be a normal close (RTS->CLOSING) or
+	 * an abnormal close (RTS/CLOSING->ERROR).
+	 */
+	if (disconnect) {
+		c4iw_ep_disconnect(ep, abort, internal ? GFP_ATOMIC :
+							 GFP_KERNEL);
+		c4iw_put_ep(&ep->com);
+	}
+
+	/*
+	 * If free is 1, then we've disassociated the EP from the QP
+	 * and we need to dereference the EP.
+	 */
+	if (free)
+		c4iw_put_ep(&ep->com);
+	PDBG("%s exit state %d\n", __func__, qhp->attr.state);
+	return ret;
+}
+
+int c4iw_destroy_qp(struct ib_qp *ib_qp)
+{
+	struct c4iw_dev *rhp;
+	struct c4iw_qp *qhp;
+	struct c4iw_qp_attributes attrs;
+	struct c4iw_ucontext *ucontext;
+
+	qhp = to_c4iw_qp(ib_qp);
+	rhp = qhp->rhp;
+
+	attrs.next_state = C4IW_QP_STATE_ERROR;
+	if (qhp->attr.state == C4IW_QP_STATE_TERMINATE)
+		c4iw_modify_qp(rhp, qhp, C4IW_QP_ATTR_NEXT_STATE, &attrs, 1);
+	else
+		c4iw_modify_qp(rhp, qhp, C4IW_QP_ATTR_NEXT_STATE, &attrs, 0);
+	wait_event(qhp->wait, !qhp->ep);
+
+	remove_handle(rhp, &rhp->qpidr, qhp->wq.sq.qid);
+	atomic_dec(&qhp->refcnt);
+	wait_event(qhp->wait, !atomic_read(&qhp->refcnt));
+
+	spin_lock_irq(&rhp->lock);
+	if (!list_empty(&qhp->db_fc_entry))
+		list_del_init(&qhp->db_fc_entry);
+	spin_unlock_irq(&rhp->lock);
+	free_ird(rhp, qhp->attr.max_ird);
+
+	ucontext = ib_qp->uobject ?
+		   to_c4iw_ucontext(ib_qp->uobject->context) : NULL;
+	destroy_qp(&rhp->rdev, &qhp->wq,
+		   ucontext ? &ucontext->uctx : &rhp->rdev.uctx);
+
+	PDBG("%s ib_qp %p qpid 0x%0x\n", __func__, ib_qp, qhp->wq.sq.qid);
+	kfree(qhp);
+	return 0;
+}
+
+struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs,
+			     struct ib_udata *udata)
+{
+	struct c4iw_dev *rhp;
+	struct c4iw_qp *qhp;
+	struct c4iw_pd *php;
+	struct c4iw_cq *schp;
+	struct c4iw_cq *rchp;
+	struct c4iw_create_qp_resp uresp;
+	unsigned int sqsize, rqsize;
+	struct c4iw_ucontext *ucontext;
+	int ret;
+	struct c4iw_mm_entry *mm1, *mm2, *mm3, *mm4, *mm5 = NULL;
+
+	PDBG("%s ib_pd %p\n", __func__, pd);
+
+	if (attrs->qp_type != IB_QPT_RC)
+		return ERR_PTR(-EINVAL);
+
+	php = to_c4iw_pd(pd);
+	rhp = php->rhp;
+	schp = get_chp(rhp, ((struct c4iw_cq *)attrs->send_cq)->cq.cqid);
+	rchp = get_chp(rhp, ((struct c4iw_cq *)attrs->recv_cq)->cq.cqid);
+	if (!schp || !rchp)
+		return ERR_PTR(-EINVAL);
+
+	if (attrs->cap.max_inline_data > T4_MAX_SEND_INLINE)
+		return ERR_PTR(-EINVAL);
+
+	if (attrs->cap.max_recv_wr > rhp->rdev.hw_queue.t4_max_rq_size)
+		return ERR_PTR(-E2BIG);
+	rqsize = attrs->cap.max_recv_wr + 1;
+	if (rqsize < 8)
+		rqsize = 8;
+
+	if (attrs->cap.max_send_wr > rhp->rdev.hw_queue.t4_max_sq_size)
+		return ERR_PTR(-E2BIG);
+	sqsize = attrs->cap.max_send_wr + 1;
+	if (sqsize < 8)
+		sqsize = 8;
+
+	ucontext = pd->uobject ? to_c4iw_ucontext(pd->uobject->context) : NULL;
+
+	qhp = kzalloc(sizeof(*qhp), GFP_KERNEL);
+	if (!qhp)
+		return ERR_PTR(-ENOMEM);
+	qhp->wq.sq.size = sqsize;
+	qhp->wq.sq.memsize =
+		(sqsize + rhp->rdev.hw_queue.t4_eq_status_entries) *
+		sizeof(*qhp->wq.sq.queue) + 16 * sizeof(__be64);
+	qhp->wq.sq.flush_cidx = -1;
+	qhp->wq.rq.size = rqsize;
+	qhp->wq.rq.memsize =
+		(rqsize + rhp->rdev.hw_queue.t4_eq_status_entries) *
+		sizeof(*qhp->wq.rq.queue);
+
+	if (ucontext) {
+		qhp->wq.sq.memsize = roundup(qhp->wq.sq.memsize, PAGE_SIZE);
+		qhp->wq.rq.memsize = roundup(qhp->wq.rq.memsize, PAGE_SIZE);
+	}
+
+	ret = create_qp(&rhp->rdev, &qhp->wq, &schp->cq, &rchp->cq,
+			ucontext ? &ucontext->uctx : &rhp->rdev.uctx);
+	if (ret)
+		goto err1;
+
+	attrs->cap.max_recv_wr = rqsize - 1;
+	attrs->cap.max_send_wr = sqsize - 1;
+	attrs->cap.max_inline_data = T4_MAX_SEND_INLINE;
+
+	qhp->rhp = rhp;
+	qhp->attr.pd = php->pdid;
+	qhp->attr.scq = ((struct c4iw_cq *) attrs->send_cq)->cq.cqid;
+	qhp->attr.rcq = ((struct c4iw_cq *) attrs->recv_cq)->cq.cqid;
+	qhp->attr.sq_num_entries = attrs->cap.max_send_wr;
+	qhp->attr.rq_num_entries = attrs->cap.max_recv_wr;
+	qhp->attr.sq_max_sges = attrs->cap.max_send_sge;
+	qhp->attr.sq_max_sges_rdma_write = attrs->cap.max_send_sge;
+	qhp->attr.rq_max_sges = attrs->cap.max_recv_sge;
+	qhp->attr.state = C4IW_QP_STATE_IDLE;
+	qhp->attr.next_state = C4IW_QP_STATE_IDLE;
+	qhp->attr.enable_rdma_read = 1;
+	qhp->attr.enable_rdma_write = 1;
+	qhp->attr.enable_bind = 1;
+	qhp->attr.max_ord = 0;
+	qhp->attr.max_ird = 0;
+	qhp->sq_sig_all = attrs->sq_sig_type == IB_SIGNAL_ALL_WR;
+	spin_lock_init(&qhp->lock);
+	mutex_init(&qhp->mutex);
+	init_waitqueue_head(&qhp->wait);
+	atomic_set(&qhp->refcnt, 1);
+
+	ret = insert_handle(rhp, &rhp->qpidr, qhp, qhp->wq.sq.qid);
+	if (ret)
+		goto err2;
+
+	if (udata) {
+		mm1 = kmalloc(sizeof *mm1, GFP_KERNEL);
+		if (!mm1) {
+			ret = -ENOMEM;
+			goto err3;
+		}
+		mm2 = kmalloc(sizeof *mm2, GFP_KERNEL);
+		if (!mm2) {
+			ret = -ENOMEM;
+			goto err4;
+		}
+		mm3 = kmalloc(sizeof *mm3, GFP_KERNEL);
+		if (!mm3) {
+			ret = -ENOMEM;
+			goto err5;
+		}
+		mm4 = kmalloc(sizeof *mm4, GFP_KERNEL);
+		if (!mm4) {
+			ret = -ENOMEM;
+			goto err6;
+		}
+		if (t4_sq_onchip(&qhp->wq.sq)) {
+			mm5 = kmalloc(sizeof *mm5, GFP_KERNEL);
+			if (!mm5) {
+				ret = -ENOMEM;
+				goto err7;
+			}
+			uresp.flags = C4IW_QPF_ONCHIP;
+		} else
+			uresp.flags = 0;
+		uresp.qid_mask = rhp->rdev.qpmask;
+		uresp.sqid = qhp->wq.sq.qid;
+		uresp.sq_size = qhp->wq.sq.size;
+		uresp.sq_memsize = qhp->wq.sq.memsize;
+		uresp.rqid = qhp->wq.rq.qid;
+		uresp.rq_size = qhp->wq.rq.size;
+		uresp.rq_memsize = qhp->wq.rq.memsize;
+		spin_lock(&ucontext->mmap_lock);
+		if (mm5) {
+			uresp.ma_sync_key = ucontext->key;
+			ucontext->key += PAGE_SIZE;
+		} else {
+			uresp.ma_sync_key =  0;
+		}
+		uresp.sq_key = ucontext->key;
+		ucontext->key += PAGE_SIZE;
+		uresp.rq_key = ucontext->key;
+		ucontext->key += PAGE_SIZE;
+		uresp.sq_db_gts_key = ucontext->key;
+		ucontext->key += PAGE_SIZE;
+		uresp.rq_db_gts_key = ucontext->key;
+		ucontext->key += PAGE_SIZE;
+		spin_unlock(&ucontext->mmap_lock);
+		ret = ib_copy_to_udata(udata, &uresp, sizeof uresp);
+		if (ret)
+			goto err8;
+		mm1->key = uresp.sq_key;
+		mm1->addr = qhp->wq.sq.phys_addr;
+		mm1->len = PAGE_ALIGN(qhp->wq.sq.memsize);
+		insert_mmap(ucontext, mm1);
+		mm2->key = uresp.rq_key;
+		mm2->addr = virt_to_phys(qhp->wq.rq.queue);
+		mm2->len = PAGE_ALIGN(qhp->wq.rq.memsize);
+		insert_mmap(ucontext, mm2);
+		mm3->key = uresp.sq_db_gts_key;
+		mm3->addr = (__force unsigned long) qhp->wq.sq.udb;
+		mm3->len = PAGE_SIZE;
+		insert_mmap(ucontext, mm3);
+		mm4->key = uresp.rq_db_gts_key;
+		mm4->addr = (__force unsigned long) qhp->wq.rq.udb;
+		mm4->len = PAGE_SIZE;
+		insert_mmap(ucontext, mm4);
+		if (mm5) {
+			mm5->key = uresp.ma_sync_key;
+			mm5->addr = (pci_resource_start(rhp->rdev.lldi.pdev, 0)
+				    + A_PCIE_MA_SYNC) & PAGE_MASK;
+			mm5->len = PAGE_SIZE;
+			insert_mmap(ucontext, mm5);
+		}
+	}
+	qhp->ibqp.qp_num = qhp->wq.sq.qid;
+	init_timer(&(qhp->timer));
+	INIT_LIST_HEAD(&qhp->db_fc_entry);
+	PDBG("%s sq id %u size %u memsize %zu num_entries %u "
+	     "rq id %u size %u memsize %zu num_entries %u\n", __func__,
+	     qhp->wq.sq.qid, qhp->wq.sq.size, qhp->wq.sq.memsize,
+	     attrs->cap.max_send_wr, qhp->wq.rq.qid, qhp->wq.rq.size,
+	     qhp->wq.rq.memsize, attrs->cap.max_recv_wr);
+	return &qhp->ibqp;
+err8:
+	kfree(mm5);
+err7:
+	kfree(mm4);
+err6:
+	kfree(mm3);
+err5:
+	kfree(mm2);
+err4:
+	kfree(mm1);
+err3:
+	remove_handle(rhp, &rhp->qpidr, qhp->wq.sq.qid);
+err2:
+	destroy_qp(&rhp->rdev, &qhp->wq,
+		   ucontext ? &ucontext->uctx : &rhp->rdev.uctx);
+err1:
+	kfree(qhp);
+	return ERR_PTR(ret);
+}
+
+int c4iw_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+		      int attr_mask, struct ib_udata *udata)
+{
+	struct c4iw_dev *rhp;
+	struct c4iw_qp *qhp;
+	enum c4iw_qp_attr_mask mask = 0;
+	struct c4iw_qp_attributes attrs;
+
+	PDBG("%s ib_qp %p\n", __func__, ibqp);
+
+	/* iwarp does not support the RTR state */
+	if ((attr_mask & IB_QP_STATE) && (attr->qp_state == IB_QPS_RTR))
+		attr_mask &= ~IB_QP_STATE;
+
+	/* Make sure we still have something left to do */
+	if (!attr_mask)
+		return 0;
+
+	memset(&attrs, 0, sizeof attrs);
+	qhp = to_c4iw_qp(ibqp);
+	rhp = qhp->rhp;
+
+	attrs.next_state = c4iw_convert_state(attr->qp_state);
+	attrs.enable_rdma_read = (attr->qp_access_flags &
+			       IB_ACCESS_REMOTE_READ) ?  1 : 0;
+	attrs.enable_rdma_write = (attr->qp_access_flags &
+				IB_ACCESS_REMOTE_WRITE) ? 1 : 0;
+	attrs.enable_bind = (attr->qp_access_flags & IB_ACCESS_MW_BIND) ? 1 : 0;
+
+
+	mask |= (attr_mask & IB_QP_STATE) ? C4IW_QP_ATTR_NEXT_STATE : 0;
+	mask |= (attr_mask & IB_QP_ACCESS_FLAGS) ?
+			(C4IW_QP_ATTR_ENABLE_RDMA_READ |
+			 C4IW_QP_ATTR_ENABLE_RDMA_WRITE |
+			 C4IW_QP_ATTR_ENABLE_RDMA_BIND) : 0;
+
+	/*
+	 * Use SQ_PSN and RQ_PSN to pass in IDX_INC values for
+	 * ringing the queue db when we're in DB_FULL mode.
+	 * Only allow this on T4 devices.
+	 */
+	attrs.sq_db_inc = attr->sq_psn;
+	attrs.rq_db_inc = attr->rq_psn;
+	mask |= (attr_mask & IB_QP_SQ_PSN) ? C4IW_QP_ATTR_SQ_DB : 0;
+	mask |= (attr_mask & IB_QP_RQ_PSN) ? C4IW_QP_ATTR_RQ_DB : 0;
+	if (is_t5(to_c4iw_qp(ibqp)->rhp->rdev.lldi.adapter_type) &&
+	    (mask & (C4IW_QP_ATTR_SQ_DB|C4IW_QP_ATTR_RQ_DB)))
+		return -EINVAL;
+
+	return c4iw_modify_qp(rhp, qhp, mask, &attrs, 0);
+}
+
+struct ib_qp *c4iw_get_qp(struct ib_device *dev, int qpn)
+{
+	PDBG("%s ib_dev %p qpn 0x%x\n", __func__, dev, qpn);
+	return (struct ib_qp *)get_qhp(to_c4iw_dev(dev), qpn);
+}
+
+int c4iw_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+		     int attr_mask, struct ib_qp_init_attr *init_attr)
+{
+	struct c4iw_qp *qhp = to_c4iw_qp(ibqp);
+
+	memset(attr, 0, sizeof *attr);
+	memset(init_attr, 0, sizeof *init_attr);
+	attr->qp_state = to_ib_qp_state(qhp->attr.state);
+	init_attr->cap.max_send_wr = qhp->attr.sq_num_entries;
+	init_attr->cap.max_recv_wr = qhp->attr.rq_num_entries;
+	init_attr->cap.max_send_sge = qhp->attr.sq_max_sges;
+	init_attr->cap.max_recv_sge = qhp->attr.sq_max_sges;
+	init_attr->cap.max_inline_data = T4_MAX_SEND_INLINE;
+	init_attr->sq_sig_type = qhp->sq_sig_all ? IB_SIGNAL_ALL_WR : 0;
+	return 0;
+}
diff --git a/drivers/infiniband/hw/cxgb4/resource.c b/drivers/infiniband/hw/cxgb4/resource.c
new file mode 100644
index 0000000..67df71a
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb4/resource.c
@@ -0,0 +1,453 @@
+/*
+ * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/* Crude resource management */
+#include <linux/spinlock.h>
+#include <linux/genalloc.h>
+#include <linux/ratelimit.h>
+#include "iw_cxgb4.h"
+
+static int c4iw_init_qid_table(struct c4iw_rdev *rdev)
+{
+	u32 i;
+
+	if (c4iw_id_table_alloc(&rdev->resource.qid_table,
+				rdev->lldi.vr->qp.start,
+				rdev->lldi.vr->qp.size,
+				rdev->lldi.vr->qp.size, 0))
+		return -ENOMEM;
+
+	for (i = rdev->lldi.vr->qp.start;
+		i < rdev->lldi.vr->qp.start + rdev->lldi.vr->qp.size; i++)
+		if (!(i & rdev->qpmask))
+			c4iw_id_free(&rdev->resource.qid_table, i);
+	return 0;
+}
+
+/* nr_* must be power of 2 */
+int c4iw_init_resource(struct c4iw_rdev *rdev, u32 nr_tpt, u32 nr_pdid)
+{
+	int err = 0;
+	err = c4iw_id_table_alloc(&rdev->resource.tpt_table, 0, nr_tpt, 1,
+					C4IW_ID_TABLE_F_RANDOM);
+	if (err)
+		goto tpt_err;
+	err = c4iw_init_qid_table(rdev);
+	if (err)
+		goto qid_err;
+	err = c4iw_id_table_alloc(&rdev->resource.pdid_table, 0,
+					nr_pdid, 1, 0);
+	if (err)
+		goto pdid_err;
+	return 0;
+ pdid_err:
+	c4iw_id_table_free(&rdev->resource.qid_table);
+ qid_err:
+	c4iw_id_table_free(&rdev->resource.tpt_table);
+ tpt_err:
+	return -ENOMEM;
+}
+
+/*
+ * returns 0 if no resource available
+ */
+u32 c4iw_get_resource(struct c4iw_id_table *id_table)
+{
+	u32 entry;
+	entry = c4iw_id_alloc(id_table);
+	if (entry == (u32)(-1))
+		return 0;
+	return entry;
+}
+
+void c4iw_put_resource(struct c4iw_id_table *id_table, u32 entry)
+{
+	PDBG("%s entry 0x%x\n", __func__, entry);
+	c4iw_id_free(id_table, entry);
+}
+
+u32 c4iw_get_cqid(struct c4iw_rdev *rdev, struct c4iw_dev_ucontext *uctx)
+{
+	struct c4iw_qid_list *entry;
+	u32 qid;
+	int i;
+
+	mutex_lock(&uctx->lock);
+	if (!list_empty(&uctx->cqids)) {
+		entry = list_entry(uctx->cqids.next, struct c4iw_qid_list,
+				   entry);
+		list_del(&entry->entry);
+		qid = entry->qid;
+		kfree(entry);
+	} else {
+		qid = c4iw_get_resource(&rdev->resource.qid_table);
+		if (!qid)
+			goto out;
+		mutex_lock(&rdev->stats.lock);
+		rdev->stats.qid.cur += rdev->qpmask + 1;
+		mutex_unlock(&rdev->stats.lock);
+		for (i = qid+1; i & rdev->qpmask; i++) {
+			entry = kmalloc(sizeof *entry, GFP_KERNEL);
+			if (!entry)
+				goto out;
+			entry->qid = i;
+			list_add_tail(&entry->entry, &uctx->cqids);
+		}
+
+		/*
+		 * now put the same ids on the qp list since they all
+		 * map to the same db/gts page.
+		 */
+		entry = kmalloc(sizeof *entry, GFP_KERNEL);
+		if (!entry)
+			goto out;
+		entry->qid = qid;
+		list_add_tail(&entry->entry, &uctx->qpids);
+		for (i = qid+1; i & rdev->qpmask; i++) {
+			entry = kmalloc(sizeof *entry, GFP_KERNEL);
+			if (!entry)
+				goto out;
+			entry->qid = i;
+			list_add_tail(&entry->entry, &uctx->qpids);
+		}
+	}
+out:
+	mutex_unlock(&uctx->lock);
+	PDBG("%s qid 0x%x\n", __func__, qid);
+	mutex_lock(&rdev->stats.lock);
+	if (rdev->stats.qid.cur > rdev->stats.qid.max)
+		rdev->stats.qid.max = rdev->stats.qid.cur;
+	mutex_unlock(&rdev->stats.lock);
+	return qid;
+}
+
+void c4iw_put_cqid(struct c4iw_rdev *rdev, u32 qid,
+		   struct c4iw_dev_ucontext *uctx)
+{
+	struct c4iw_qid_list *entry;
+
+	entry = kmalloc(sizeof *entry, GFP_KERNEL);
+	if (!entry)
+		return;
+	PDBG("%s qid 0x%x\n", __func__, qid);
+	entry->qid = qid;
+	mutex_lock(&uctx->lock);
+	list_add_tail(&entry->entry, &uctx->cqids);
+	mutex_unlock(&uctx->lock);
+}
+
+u32 c4iw_get_qpid(struct c4iw_rdev *rdev, struct c4iw_dev_ucontext *uctx)
+{
+	struct c4iw_qid_list *entry;
+	u32 qid;
+	int i;
+
+	mutex_lock(&uctx->lock);
+	if (!list_empty(&uctx->qpids)) {
+		entry = list_entry(uctx->qpids.next, struct c4iw_qid_list,
+				   entry);
+		list_del(&entry->entry);
+		qid = entry->qid;
+		kfree(entry);
+	} else {
+		qid = c4iw_get_resource(&rdev->resource.qid_table);
+		if (!qid) {
+			mutex_lock(&rdev->stats.lock);
+			rdev->stats.qid.fail++;
+			mutex_unlock(&rdev->stats.lock);
+			goto out;
+		}
+		mutex_lock(&rdev->stats.lock);
+		rdev->stats.qid.cur += rdev->qpmask + 1;
+		mutex_unlock(&rdev->stats.lock);
+		for (i = qid+1; i & rdev->qpmask; i++) {
+			entry = kmalloc(sizeof *entry, GFP_KERNEL);
+			if (!entry)
+				goto out;
+			entry->qid = i;
+			list_add_tail(&entry->entry, &uctx->qpids);
+		}
+
+		/*
+		 * now put the same ids on the cq list since they all
+		 * map to the same db/gts page.
+		 */
+		entry = kmalloc(sizeof *entry, GFP_KERNEL);
+		if (!entry)
+			goto out;
+		entry->qid = qid;
+		list_add_tail(&entry->entry, &uctx->cqids);
+		for (i = qid; i & rdev->qpmask; i++) {
+			entry = kmalloc(sizeof *entry, GFP_KERNEL);
+			if (!entry)
+				goto out;
+			entry->qid = i;
+			list_add_tail(&entry->entry, &uctx->cqids);
+		}
+	}
+out:
+	mutex_unlock(&uctx->lock);
+	PDBG("%s qid 0x%x\n", __func__, qid);
+	mutex_lock(&rdev->stats.lock);
+	if (rdev->stats.qid.cur > rdev->stats.qid.max)
+		rdev->stats.qid.max = rdev->stats.qid.cur;
+	mutex_unlock(&rdev->stats.lock);
+	return qid;
+}
+
+void c4iw_put_qpid(struct c4iw_rdev *rdev, u32 qid,
+		   struct c4iw_dev_ucontext *uctx)
+{
+	struct c4iw_qid_list *entry;
+
+	entry = kmalloc(sizeof *entry, GFP_KERNEL);
+	if (!entry)
+		return;
+	PDBG("%s qid 0x%x\n", __func__, qid);
+	entry->qid = qid;
+	mutex_lock(&uctx->lock);
+	list_add_tail(&entry->entry, &uctx->qpids);
+	mutex_unlock(&uctx->lock);
+}
+
+void c4iw_destroy_resource(struct c4iw_resource *rscp)
+{
+	c4iw_id_table_free(&rscp->tpt_table);
+	c4iw_id_table_free(&rscp->qid_table);
+	c4iw_id_table_free(&rscp->pdid_table);
+}
+
+/*
+ * PBL Memory Manager.  Uses Linux generic allocator.
+ */
+
+#define MIN_PBL_SHIFT 8			/* 256B == min PBL size (32 entries) */
+
+u32 c4iw_pblpool_alloc(struct c4iw_rdev *rdev, int size)
+{
+	unsigned long addr = gen_pool_alloc(rdev->pbl_pool, size);
+	PDBG("%s addr 0x%x size %d\n", __func__, (u32)addr, size);
+	mutex_lock(&rdev->stats.lock);
+	if (addr) {
+		rdev->stats.pbl.cur += roundup(size, 1 << MIN_PBL_SHIFT);
+		if (rdev->stats.pbl.cur > rdev->stats.pbl.max)
+			rdev->stats.pbl.max = rdev->stats.pbl.cur;
+	} else
+		rdev->stats.pbl.fail++;
+	mutex_unlock(&rdev->stats.lock);
+	return (u32)addr;
+}
+
+void c4iw_pblpool_free(struct c4iw_rdev *rdev, u32 addr, int size)
+{
+	PDBG("%s addr 0x%x size %d\n", __func__, addr, size);
+	mutex_lock(&rdev->stats.lock);
+	rdev->stats.pbl.cur -= roundup(size, 1 << MIN_PBL_SHIFT);
+	mutex_unlock(&rdev->stats.lock);
+	gen_pool_free(rdev->pbl_pool, (unsigned long)addr, size);
+}
+
+int c4iw_pblpool_create(struct c4iw_rdev *rdev)
+{
+	unsigned pbl_start, pbl_chunk, pbl_top;
+
+	rdev->pbl_pool = gen_pool_create(MIN_PBL_SHIFT, -1);
+	if (!rdev->pbl_pool)
+		return -ENOMEM;
+
+	pbl_start = rdev->lldi.vr->pbl.start;
+	pbl_chunk = rdev->lldi.vr->pbl.size;
+	pbl_top = pbl_start + pbl_chunk;
+
+	while (pbl_start < pbl_top) {
+		pbl_chunk = min(pbl_top - pbl_start + 1, pbl_chunk);
+		if (gen_pool_add(rdev->pbl_pool, pbl_start, pbl_chunk, -1)) {
+			PDBG("%s failed to add PBL chunk (%x/%x)\n",
+			     __func__, pbl_start, pbl_chunk);
+			if (pbl_chunk <= 1024 << MIN_PBL_SHIFT) {
+				printk(KERN_WARNING MOD
+				       "Failed to add all PBL chunks (%x/%x)\n",
+				       pbl_start,
+				       pbl_top - pbl_start);
+				return 0;
+			}
+			pbl_chunk >>= 1;
+		} else {
+			PDBG("%s added PBL chunk (%x/%x)\n",
+			     __func__, pbl_start, pbl_chunk);
+			pbl_start += pbl_chunk;
+		}
+	}
+
+	return 0;
+}
+
+void c4iw_pblpool_destroy(struct c4iw_rdev *rdev)
+{
+	gen_pool_destroy(rdev->pbl_pool);
+}
+
+/*
+ * RQT Memory Manager.  Uses Linux generic allocator.
+ */
+
+#define MIN_RQT_SHIFT 10	/* 1KB == min RQT size (16 entries) */
+
+u32 c4iw_rqtpool_alloc(struct c4iw_rdev *rdev, int size)
+{
+	unsigned long addr = gen_pool_alloc(rdev->rqt_pool, size << 6);
+	PDBG("%s addr 0x%x size %d\n", __func__, (u32)addr, size << 6);
+	if (!addr)
+		pr_warn_ratelimited(MOD "%s: Out of RQT memory\n",
+				    pci_name(rdev->lldi.pdev));
+	mutex_lock(&rdev->stats.lock);
+	if (addr) {
+		rdev->stats.rqt.cur += roundup(size << 6, 1 << MIN_RQT_SHIFT);
+		if (rdev->stats.rqt.cur > rdev->stats.rqt.max)
+			rdev->stats.rqt.max = rdev->stats.rqt.cur;
+	} else
+		rdev->stats.rqt.fail++;
+	mutex_unlock(&rdev->stats.lock);
+	return (u32)addr;
+}
+
+void c4iw_rqtpool_free(struct c4iw_rdev *rdev, u32 addr, int size)
+{
+	PDBG("%s addr 0x%x size %d\n", __func__, addr, size << 6);
+	mutex_lock(&rdev->stats.lock);
+	rdev->stats.rqt.cur -= roundup(size << 6, 1 << MIN_RQT_SHIFT);
+	mutex_unlock(&rdev->stats.lock);
+	gen_pool_free(rdev->rqt_pool, (unsigned long)addr, size << 6);
+}
+
+int c4iw_rqtpool_create(struct c4iw_rdev *rdev)
+{
+	unsigned rqt_start, rqt_chunk, rqt_top;
+
+	rdev->rqt_pool = gen_pool_create(MIN_RQT_SHIFT, -1);
+	if (!rdev->rqt_pool)
+		return -ENOMEM;
+
+	rqt_start = rdev->lldi.vr->rq.start;
+	rqt_chunk = rdev->lldi.vr->rq.size;
+	rqt_top = rqt_start + rqt_chunk;
+
+	while (rqt_start < rqt_top) {
+		rqt_chunk = min(rqt_top - rqt_start + 1, rqt_chunk);
+		if (gen_pool_add(rdev->rqt_pool, rqt_start, rqt_chunk, -1)) {
+			PDBG("%s failed to add RQT chunk (%x/%x)\n",
+			     __func__, rqt_start, rqt_chunk);
+			if (rqt_chunk <= 1024 << MIN_RQT_SHIFT) {
+				printk(KERN_WARNING MOD
+				       "Failed to add all RQT chunks (%x/%x)\n",
+				       rqt_start, rqt_top - rqt_start);
+				return 0;
+			}
+			rqt_chunk >>= 1;
+		} else {
+			PDBG("%s added RQT chunk (%x/%x)\n",
+			     __func__, rqt_start, rqt_chunk);
+			rqt_start += rqt_chunk;
+		}
+	}
+	return 0;
+}
+
+void c4iw_rqtpool_destroy(struct c4iw_rdev *rdev)
+{
+	gen_pool_destroy(rdev->rqt_pool);
+}
+
+/*
+ * On-Chip QP Memory.
+ */
+#define MIN_OCQP_SHIFT 12	/* 4KB == min ocqp size */
+
+u32 c4iw_ocqp_pool_alloc(struct c4iw_rdev *rdev, int size)
+{
+	unsigned long addr = gen_pool_alloc(rdev->ocqp_pool, size);
+	PDBG("%s addr 0x%x size %d\n", __func__, (u32)addr, size);
+	if (addr) {
+		mutex_lock(&rdev->stats.lock);
+		rdev->stats.ocqp.cur += roundup(size, 1 << MIN_OCQP_SHIFT);
+		if (rdev->stats.ocqp.cur > rdev->stats.ocqp.max)
+			rdev->stats.ocqp.max = rdev->stats.ocqp.cur;
+		mutex_unlock(&rdev->stats.lock);
+	}
+	return (u32)addr;
+}
+
+void c4iw_ocqp_pool_free(struct c4iw_rdev *rdev, u32 addr, int size)
+{
+	PDBG("%s addr 0x%x size %d\n", __func__, addr, size);
+	mutex_lock(&rdev->stats.lock);
+	rdev->stats.ocqp.cur -= roundup(size, 1 << MIN_OCQP_SHIFT);
+	mutex_unlock(&rdev->stats.lock);
+	gen_pool_free(rdev->ocqp_pool, (unsigned long)addr, size);
+}
+
+int c4iw_ocqp_pool_create(struct c4iw_rdev *rdev)
+{
+	unsigned start, chunk, top;
+
+	rdev->ocqp_pool = gen_pool_create(MIN_OCQP_SHIFT, -1);
+	if (!rdev->ocqp_pool)
+		return -ENOMEM;
+
+	start = rdev->lldi.vr->ocq.start;
+	chunk = rdev->lldi.vr->ocq.size;
+	top = start + chunk;
+
+	while (start < top) {
+		chunk = min(top - start + 1, chunk);
+		if (gen_pool_add(rdev->ocqp_pool, start, chunk, -1)) {
+			PDBG("%s failed to add OCQP chunk (%x/%x)\n",
+			     __func__, start, chunk);
+			if (chunk <= 1024 << MIN_OCQP_SHIFT) {
+				printk(KERN_WARNING MOD
+				       "Failed to add all OCQP chunks (%x/%x)\n",
+				       start, top - start);
+				return 0;
+			}
+			chunk >>= 1;
+		} else {
+			PDBG("%s added OCQP chunk (%x/%x)\n",
+			     __func__, start, chunk);
+			start += chunk;
+		}
+	}
+	return 0;
+}
+
+void c4iw_ocqp_pool_destroy(struct c4iw_rdev *rdev)
+{
+	gen_pool_destroy(rdev->ocqp_pool);
+}
diff --git a/drivers/infiniband/hw/cxgb4/t4.h b/drivers/infiniband/hw/cxgb4/t4.h
new file mode 100644
index 0000000..c04e513
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb4/t4.h
@@ -0,0 +1,684 @@
+/*
+ * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __T4_H__
+#define __T4_H__
+
+#include "t4_hw.h"
+#include "t4_regs.h"
+#include "t4_msg.h"
+#include "t4fw_ri_api.h"
+
+#define T4_MAX_NUM_PD 65536
+#define T4_MAX_MR_SIZE (~0ULL)
+#define T4_PAGESIZE_MASK 0xffff000  /* 4KB-128MB */
+#define T4_STAG_UNSET 0xffffffff
+#define T4_FW_MAJ 0
+#define A_PCIE_MA_SYNC 0x30b4
+
+struct t4_status_page {
+	__be32 rsvd1;	/* flit 0 - hw owns */
+	__be16 rsvd2;
+	__be16 qid;
+	__be16 cidx;
+	__be16 pidx;
+	u8 qp_err;	/* flit 1 - sw owns */
+	u8 db_off;
+	u8 pad;
+	u16 host_wq_pidx;
+	u16 host_cidx;
+	u16 host_pidx;
+};
+
+#define T4_EQ_ENTRY_SIZE 64
+
+#define T4_SQ_NUM_SLOTS 5
+#define T4_SQ_NUM_BYTES (T4_EQ_ENTRY_SIZE * T4_SQ_NUM_SLOTS)
+#define T4_MAX_SEND_SGE ((T4_SQ_NUM_BYTES - sizeof(struct fw_ri_send_wr) - \
+			sizeof(struct fw_ri_isgl)) / sizeof(struct fw_ri_sge))
+#define T4_MAX_SEND_INLINE ((T4_SQ_NUM_BYTES - sizeof(struct fw_ri_send_wr) - \
+			sizeof(struct fw_ri_immd)))
+#define T4_MAX_WRITE_INLINE ((T4_SQ_NUM_BYTES - \
+			sizeof(struct fw_ri_rdma_write_wr) - \
+			sizeof(struct fw_ri_immd)))
+#define T4_MAX_WRITE_SGE ((T4_SQ_NUM_BYTES - \
+			sizeof(struct fw_ri_rdma_write_wr) - \
+			sizeof(struct fw_ri_isgl)) / sizeof(struct fw_ri_sge))
+#define T4_MAX_FR_IMMD ((T4_SQ_NUM_BYTES - sizeof(struct fw_ri_fr_nsmr_wr) - \
+			sizeof(struct fw_ri_immd)) & ~31UL)
+#define T4_MAX_FR_IMMD_DEPTH (T4_MAX_FR_IMMD / sizeof(u64))
+#define T4_MAX_FR_DSGL 1024
+#define T4_MAX_FR_DSGL_DEPTH (T4_MAX_FR_DSGL / sizeof(u64))
+
+static inline int t4_max_fr_depth(int use_dsgl)
+{
+	return use_dsgl ? T4_MAX_FR_DSGL_DEPTH : T4_MAX_FR_IMMD_DEPTH;
+}
+
+#define T4_RQ_NUM_SLOTS 2
+#define T4_RQ_NUM_BYTES (T4_EQ_ENTRY_SIZE * T4_RQ_NUM_SLOTS)
+#define T4_MAX_RECV_SGE 4
+
+union t4_wr {
+	struct fw_ri_res_wr res;
+	struct fw_ri_wr ri;
+	struct fw_ri_rdma_write_wr write;
+	struct fw_ri_send_wr send;
+	struct fw_ri_rdma_read_wr read;
+	struct fw_ri_bind_mw_wr bind;
+	struct fw_ri_fr_nsmr_wr fr;
+	struct fw_ri_inv_lstag_wr inv;
+	struct t4_status_page status;
+	__be64 flits[T4_EQ_ENTRY_SIZE / sizeof(__be64) * T4_SQ_NUM_SLOTS];
+};
+
+union t4_recv_wr {
+	struct fw_ri_recv_wr recv;
+	struct t4_status_page status;
+	__be64 flits[T4_EQ_ENTRY_SIZE / sizeof(__be64) * T4_RQ_NUM_SLOTS];
+};
+
+static inline void init_wr_hdr(union t4_wr *wqe, u16 wrid,
+			       enum fw_wr_opcodes opcode, u8 flags, u8 len16)
+{
+	wqe->send.opcode = (u8)opcode;
+	wqe->send.flags = flags;
+	wqe->send.wrid = wrid;
+	wqe->send.r1[0] = 0;
+	wqe->send.r1[1] = 0;
+	wqe->send.r1[2] = 0;
+	wqe->send.len16 = len16;
+}
+
+/* CQE/AE status codes */
+#define T4_ERR_SUCCESS                     0x0
+#define T4_ERR_STAG                        0x1	/* STAG invalid: either the */
+						/* STAG is offlimt, being 0, */
+						/* or STAG_key mismatch */
+#define T4_ERR_PDID                        0x2	/* PDID mismatch */
+#define T4_ERR_QPID                        0x3	/* QPID mismatch */
+#define T4_ERR_ACCESS                      0x4	/* Invalid access right */
+#define T4_ERR_WRAP                        0x5	/* Wrap error */
+#define T4_ERR_BOUND                       0x6	/* base and bounds voilation */
+#define T4_ERR_INVALIDATE_SHARED_MR        0x7	/* attempt to invalidate a  */
+						/* shared memory region */
+#define T4_ERR_INVALIDATE_MR_WITH_MW_BOUND 0x8	/* attempt to invalidate a  */
+						/* shared memory region */
+#define T4_ERR_ECC                         0x9	/* ECC error detected */
+#define T4_ERR_ECC_PSTAG                   0xA	/* ECC error detected when  */
+						/* reading PSTAG for a MW  */
+						/* Invalidate */
+#define T4_ERR_PBL_ADDR_BOUND              0xB	/* pbl addr out of bounds:  */
+						/* software error */
+#define T4_ERR_SWFLUSH			   0xC	/* SW FLUSHED */
+#define T4_ERR_CRC                         0x10 /* CRC error */
+#define T4_ERR_MARKER                      0x11 /* Marker error */
+#define T4_ERR_PDU_LEN_ERR                 0x12 /* invalid PDU length */
+#define T4_ERR_OUT_OF_RQE                  0x13 /* out of RQE */
+#define T4_ERR_DDP_VERSION                 0x14 /* wrong DDP version */
+#define T4_ERR_RDMA_VERSION                0x15 /* wrong RDMA version */
+#define T4_ERR_OPCODE                      0x16 /* invalid rdma opcode */
+#define T4_ERR_DDP_QUEUE_NUM               0x17 /* invalid ddp queue number */
+#define T4_ERR_MSN                         0x18 /* MSN error */
+#define T4_ERR_TBIT                        0x19 /* tag bit not set correctly */
+#define T4_ERR_MO                          0x1A /* MO not 0 for TERMINATE  */
+						/* or READ_REQ */
+#define T4_ERR_MSN_GAP                     0x1B
+#define T4_ERR_MSN_RANGE                   0x1C
+#define T4_ERR_IRD_OVERFLOW                0x1D
+#define T4_ERR_RQE_ADDR_BOUND              0x1E /* RQE addr out of bounds:  */
+						/* software error */
+#define T4_ERR_INTERNAL_ERR                0x1F /* internal error (opcode  */
+						/* mismatch) */
+/*
+ * CQE defs
+ */
+struct t4_cqe {
+	__be32 header;
+	__be32 len;
+	union {
+		struct {
+			__be32 stag;
+			__be32 msn;
+		} rcqe;
+		struct {
+			u32 nada1;
+			u16 nada2;
+			u16 cidx;
+		} scqe;
+		struct {
+			__be32 wrid_hi;
+			__be32 wrid_low;
+		} gen;
+	} u;
+	__be64 reserved;
+	__be64 bits_type_ts;
+};
+
+/* macros for flit 0 of the cqe */
+
+#define S_CQE_QPID        12
+#define M_CQE_QPID        0xFFFFF
+#define G_CQE_QPID(x)     ((((x) >> S_CQE_QPID)) & M_CQE_QPID)
+#define V_CQE_QPID(x)	  ((x)<<S_CQE_QPID)
+
+#define S_CQE_SWCQE       11
+#define M_CQE_SWCQE       0x1
+#define G_CQE_SWCQE(x)    ((((x) >> S_CQE_SWCQE)) & M_CQE_SWCQE)
+#define V_CQE_SWCQE(x)	  ((x)<<S_CQE_SWCQE)
+
+#define S_CQE_STATUS      5
+#define M_CQE_STATUS      0x1F
+#define G_CQE_STATUS(x)   ((((x) >> S_CQE_STATUS)) & M_CQE_STATUS)
+#define V_CQE_STATUS(x)   ((x)<<S_CQE_STATUS)
+
+#define S_CQE_TYPE        4
+#define M_CQE_TYPE        0x1
+#define G_CQE_TYPE(x)     ((((x) >> S_CQE_TYPE)) & M_CQE_TYPE)
+#define V_CQE_TYPE(x)     ((x)<<S_CQE_TYPE)
+
+#define S_CQE_OPCODE      0
+#define M_CQE_OPCODE      0xF
+#define G_CQE_OPCODE(x)   ((((x) >> S_CQE_OPCODE)) & M_CQE_OPCODE)
+#define V_CQE_OPCODE(x)   ((x)<<S_CQE_OPCODE)
+
+#define SW_CQE(x)         (G_CQE_SWCQE(be32_to_cpu((x)->header)))
+#define CQE_QPID(x)       (G_CQE_QPID(be32_to_cpu((x)->header)))
+#define CQE_TYPE(x)       (G_CQE_TYPE(be32_to_cpu((x)->header)))
+#define SQ_TYPE(x)	  (CQE_TYPE((x)))
+#define RQ_TYPE(x)	  (!CQE_TYPE((x)))
+#define CQE_STATUS(x)     (G_CQE_STATUS(be32_to_cpu((x)->header)))
+#define CQE_OPCODE(x)     (G_CQE_OPCODE(be32_to_cpu((x)->header)))
+
+#define CQE_SEND_OPCODE(x)( \
+	(G_CQE_OPCODE(be32_to_cpu((x)->header)) == FW_RI_SEND) || \
+	(G_CQE_OPCODE(be32_to_cpu((x)->header)) == FW_RI_SEND_WITH_SE) || \
+	(G_CQE_OPCODE(be32_to_cpu((x)->header)) == FW_RI_SEND_WITH_INV) || \
+	(G_CQE_OPCODE(be32_to_cpu((x)->header)) == FW_RI_SEND_WITH_SE_INV))
+
+#define CQE_LEN(x)        (be32_to_cpu((x)->len))
+
+/* used for RQ completion processing */
+#define CQE_WRID_STAG(x)  (be32_to_cpu((x)->u.rcqe.stag))
+#define CQE_WRID_MSN(x)   (be32_to_cpu((x)->u.rcqe.msn))
+
+/* used for SQ completion processing */
+#define CQE_WRID_SQ_IDX(x)	((x)->u.scqe.cidx)
+
+/* generic accessor macros */
+#define CQE_WRID_HI(x)		(be32_to_cpu((x)->u.gen.wrid_hi))
+#define CQE_WRID_LOW(x)		(be32_to_cpu((x)->u.gen.wrid_low))
+
+/* macros for flit 3 of the cqe */
+#define S_CQE_GENBIT	63
+#define M_CQE_GENBIT	0x1
+#define G_CQE_GENBIT(x)	(((x) >> S_CQE_GENBIT) & M_CQE_GENBIT)
+#define V_CQE_GENBIT(x) ((x)<<S_CQE_GENBIT)
+
+#define S_CQE_OVFBIT	62
+#define M_CQE_OVFBIT	0x1
+#define G_CQE_OVFBIT(x)	((((x) >> S_CQE_OVFBIT)) & M_CQE_OVFBIT)
+
+#define S_CQE_IQTYPE	60
+#define M_CQE_IQTYPE	0x3
+#define G_CQE_IQTYPE(x)	((((x) >> S_CQE_IQTYPE)) & M_CQE_IQTYPE)
+
+#define M_CQE_TS	0x0fffffffffffffffULL
+#define G_CQE_TS(x)	((x) & M_CQE_TS)
+
+#define CQE_OVFBIT(x)	((unsigned)G_CQE_OVFBIT(be64_to_cpu((x)->bits_type_ts)))
+#define CQE_GENBIT(x)	((unsigned)G_CQE_GENBIT(be64_to_cpu((x)->bits_type_ts)))
+#define CQE_TS(x)	(G_CQE_TS(be64_to_cpu((x)->bits_type_ts)))
+
+struct t4_swsqe {
+	u64			wr_id;
+	struct t4_cqe		cqe;
+	int			read_len;
+	int			opcode;
+	int			complete;
+	int			signaled;
+	u16			idx;
+	int                     flushed;
+	struct timespec         host_ts;
+	u64                     sge_ts;
+};
+
+static inline pgprot_t t4_pgprot_wc(pgprot_t prot)
+{
+#if defined(__i386__) || defined(__x86_64__) || defined(CONFIG_PPC64)
+	return pgprot_writecombine(prot);
+#else
+	return pgprot_noncached(prot);
+#endif
+}
+
+enum {
+	T4_SQ_ONCHIP = (1<<0),
+};
+
+struct t4_sq {
+	union t4_wr *queue;
+	dma_addr_t dma_addr;
+	DEFINE_DMA_UNMAP_ADDR(mapping);
+	unsigned long phys_addr;
+	struct t4_swsqe *sw_sq;
+	struct t4_swsqe *oldest_read;
+	u64 __iomem *udb;
+	size_t memsize;
+	u32 qid;
+	u16 in_use;
+	u16 size;
+	u16 cidx;
+	u16 pidx;
+	u16 wq_pidx;
+	u16 wq_pidx_inc;
+	u16 flags;
+	short flush_cidx;
+};
+
+struct t4_swrqe {
+	u64 wr_id;
+	struct timespec host_ts;
+	u64 sge_ts;
+};
+
+struct t4_rq {
+	union  t4_recv_wr *queue;
+	dma_addr_t dma_addr;
+	DEFINE_DMA_UNMAP_ADDR(mapping);
+	struct t4_swrqe *sw_rq;
+	u64 __iomem *udb;
+	size_t memsize;
+	u32 qid;
+	u32 msn;
+	u32 rqt_hwaddr;
+	u16 rqt_size;
+	u16 in_use;
+	u16 size;
+	u16 cidx;
+	u16 pidx;
+	u16 wq_pidx;
+	u16 wq_pidx_inc;
+};
+
+struct t4_wq {
+	struct t4_sq sq;
+	struct t4_rq rq;
+	void __iomem *db;
+	void __iomem *gts;
+	struct c4iw_rdev *rdev;
+	int flushed;
+};
+
+static inline int t4_rqes_posted(struct t4_wq *wq)
+{
+	return wq->rq.in_use;
+}
+
+static inline int t4_rq_empty(struct t4_wq *wq)
+{
+	return wq->rq.in_use == 0;
+}
+
+static inline int t4_rq_full(struct t4_wq *wq)
+{
+	return wq->rq.in_use == (wq->rq.size - 1);
+}
+
+static inline u32 t4_rq_avail(struct t4_wq *wq)
+{
+	return wq->rq.size - 1 - wq->rq.in_use;
+}
+
+static inline void t4_rq_produce(struct t4_wq *wq, u8 len16)
+{
+	wq->rq.in_use++;
+	if (++wq->rq.pidx == wq->rq.size)
+		wq->rq.pidx = 0;
+	wq->rq.wq_pidx += DIV_ROUND_UP(len16*16, T4_EQ_ENTRY_SIZE);
+	if (wq->rq.wq_pidx >= wq->rq.size * T4_RQ_NUM_SLOTS)
+		wq->rq.wq_pidx %= wq->rq.size * T4_RQ_NUM_SLOTS;
+}
+
+static inline void t4_rq_consume(struct t4_wq *wq)
+{
+	wq->rq.in_use--;
+	wq->rq.msn++;
+	if (++wq->rq.cidx == wq->rq.size)
+		wq->rq.cidx = 0;
+}
+
+static inline u16 t4_rq_host_wq_pidx(struct t4_wq *wq)
+{
+	return wq->rq.queue[wq->rq.size].status.host_wq_pidx;
+}
+
+static inline u16 t4_rq_wq_size(struct t4_wq *wq)
+{
+		return wq->rq.size * T4_RQ_NUM_SLOTS;
+}
+
+static inline int t4_sq_onchip(struct t4_sq *sq)
+{
+	return sq->flags & T4_SQ_ONCHIP;
+}
+
+static inline int t4_sq_empty(struct t4_wq *wq)
+{
+	return wq->sq.in_use == 0;
+}
+
+static inline int t4_sq_full(struct t4_wq *wq)
+{
+	return wq->sq.in_use == (wq->sq.size - 1);
+}
+
+static inline u32 t4_sq_avail(struct t4_wq *wq)
+{
+	return wq->sq.size - 1 - wq->sq.in_use;
+}
+
+static inline void t4_sq_produce(struct t4_wq *wq, u8 len16)
+{
+	wq->sq.in_use++;
+	if (++wq->sq.pidx == wq->sq.size)
+		wq->sq.pidx = 0;
+	wq->sq.wq_pidx += DIV_ROUND_UP(len16*16, T4_EQ_ENTRY_SIZE);
+	if (wq->sq.wq_pidx >= wq->sq.size * T4_SQ_NUM_SLOTS)
+		wq->sq.wq_pidx %= wq->sq.size * T4_SQ_NUM_SLOTS;
+}
+
+static inline void t4_sq_consume(struct t4_wq *wq)
+{
+	BUG_ON(wq->sq.in_use < 1);
+	if (wq->sq.cidx == wq->sq.flush_cidx)
+		wq->sq.flush_cidx = -1;
+	wq->sq.in_use--;
+	if (++wq->sq.cidx == wq->sq.size)
+		wq->sq.cidx = 0;
+}
+
+static inline u16 t4_sq_host_wq_pidx(struct t4_wq *wq)
+{
+	return wq->sq.queue[wq->sq.size].status.host_wq_pidx;
+}
+
+static inline u16 t4_sq_wq_size(struct t4_wq *wq)
+{
+		return wq->sq.size * T4_SQ_NUM_SLOTS;
+}
+
+/* This function copies 64 byte coalesced work request to memory
+ * mapped BAR2 space. For coalesced WRs, the SGE fetches data
+ * from the FIFO instead of from Host.
+ */
+static inline void pio_copy(u64 __iomem *dst, u64 *src)
+{
+	int count = 8;
+
+	while (count) {
+		writeq(*src, dst);
+		src++;
+		dst++;
+		count--;
+	}
+}
+
+static inline void t4_ring_sq_db(struct t4_wq *wq, u16 inc, u8 t5,
+				 union t4_wr *wqe)
+{
+
+	/* Flush host queue memory writes. */
+	wmb();
+	if (t5) {
+		if (inc == 1 && wqe) {
+			PDBG("%s: WC wq->sq.pidx = %d\n",
+			     __func__, wq->sq.pidx);
+			pio_copy(wq->sq.udb + 7, (void *)wqe);
+		} else {
+			PDBG("%s: DB wq->sq.pidx = %d\n",
+			     __func__, wq->sq.pidx);
+			writel(PIDX_T5(inc), wq->sq.udb);
+		}
+
+		/* Flush user doorbell area writes. */
+		wmb();
+		return;
+	}
+	writel(QID(wq->sq.qid) | PIDX(inc), wq->db);
+}
+
+static inline void t4_ring_rq_db(struct t4_wq *wq, u16 inc, u8 t5,
+				 union t4_recv_wr *wqe)
+{
+
+	/* Flush host queue memory writes. */
+	wmb();
+	if (t5) {
+		if (inc == 1 && wqe) {
+			PDBG("%s: WC wq->rq.pidx = %d\n",
+			     __func__, wq->rq.pidx);
+			pio_copy(wq->rq.udb + 7, (void *)wqe);
+		} else {
+			PDBG("%s: DB wq->rq.pidx = %d\n",
+			     __func__, wq->rq.pidx);
+			writel(PIDX_T5(inc), wq->rq.udb);
+		}
+
+		/* Flush user doorbell area writes. */
+		wmb();
+		return;
+	}
+	writel(QID(wq->rq.qid) | PIDX(inc), wq->db);
+}
+
+static inline int t4_wq_in_error(struct t4_wq *wq)
+{
+	return wq->rq.queue[wq->rq.size].status.qp_err;
+}
+
+static inline void t4_set_wq_in_error(struct t4_wq *wq)
+{
+	wq->rq.queue[wq->rq.size].status.qp_err = 1;
+}
+
+static inline void t4_disable_wq_db(struct t4_wq *wq)
+{
+	wq->rq.queue[wq->rq.size].status.db_off = 1;
+}
+
+static inline void t4_enable_wq_db(struct t4_wq *wq)
+{
+	wq->rq.queue[wq->rq.size].status.db_off = 0;
+}
+
+static inline int t4_wq_db_enabled(struct t4_wq *wq)
+{
+	return !wq->rq.queue[wq->rq.size].status.db_off;
+}
+
+enum t4_cq_flags {
+	CQ_ARMED	= 1,
+};
+
+struct t4_cq {
+	struct t4_cqe *queue;
+	dma_addr_t dma_addr;
+	DEFINE_DMA_UNMAP_ADDR(mapping);
+	struct t4_cqe *sw_queue;
+	void __iomem *gts;
+	struct c4iw_rdev *rdev;
+	u64 ugts;
+	size_t memsize;
+	__be64 bits_type_ts;
+	u32 cqid;
+	int vector;
+	u16 size; /* including status page */
+	u16 cidx;
+	u16 sw_pidx;
+	u16 sw_cidx;
+	u16 sw_in_use;
+	u16 cidx_inc;
+	u8 gen;
+	u8 error;
+	unsigned long flags;
+};
+
+static inline int t4_clear_cq_armed(struct t4_cq *cq)
+{
+	return test_and_clear_bit(CQ_ARMED, &cq->flags);
+}
+
+static inline int t4_arm_cq(struct t4_cq *cq, int se)
+{
+	u32 val;
+
+	set_bit(CQ_ARMED, &cq->flags);
+	while (cq->cidx_inc > CIDXINC_MASK) {
+		val = SEINTARM(0) | CIDXINC(CIDXINC_MASK) | TIMERREG(7) |
+		      INGRESSQID(cq->cqid);
+		writel(val, cq->gts);
+		cq->cidx_inc -= CIDXINC_MASK;
+	}
+	val = SEINTARM(se) | CIDXINC(cq->cidx_inc) | TIMERREG(6) |
+	      INGRESSQID(cq->cqid);
+	writel(val, cq->gts);
+	cq->cidx_inc = 0;
+	return 0;
+}
+
+static inline void t4_swcq_produce(struct t4_cq *cq)
+{
+	cq->sw_in_use++;
+	if (cq->sw_in_use == cq->size) {
+		PDBG("%s cxgb4 sw cq overflow cqid %u\n", __func__, cq->cqid);
+		cq->error = 1;
+		BUG_ON(1);
+	}
+	if (++cq->sw_pidx == cq->size)
+		cq->sw_pidx = 0;
+}
+
+static inline void t4_swcq_consume(struct t4_cq *cq)
+{
+	BUG_ON(cq->sw_in_use < 1);
+	cq->sw_in_use--;
+	if (++cq->sw_cidx == cq->size)
+		cq->sw_cidx = 0;
+}
+
+static inline void t4_hwcq_consume(struct t4_cq *cq)
+{
+	cq->bits_type_ts = cq->queue[cq->cidx].bits_type_ts;
+	if (++cq->cidx_inc == (cq->size >> 4) || cq->cidx_inc == CIDXINC_MASK) {
+		u32 val;
+
+		val = SEINTARM(0) | CIDXINC(cq->cidx_inc) | TIMERREG(7) |
+		      INGRESSQID(cq->cqid);
+		writel(val, cq->gts);
+		cq->cidx_inc = 0;
+	}
+	if (++cq->cidx == cq->size) {
+		cq->cidx = 0;
+		cq->gen ^= 1;
+	}
+}
+
+static inline int t4_valid_cqe(struct t4_cq *cq, struct t4_cqe *cqe)
+{
+	return (CQE_GENBIT(cqe) == cq->gen);
+}
+
+static inline int t4_next_hw_cqe(struct t4_cq *cq, struct t4_cqe **cqe)
+{
+	int ret;
+	u16 prev_cidx;
+
+	if (cq->cidx == 0)
+		prev_cidx = cq->size - 1;
+	else
+		prev_cidx = cq->cidx - 1;
+
+	if (cq->queue[prev_cidx].bits_type_ts != cq->bits_type_ts) {
+		ret = -EOVERFLOW;
+		cq->error = 1;
+		printk(KERN_ERR MOD "cq overflow cqid %u\n", cq->cqid);
+		BUG_ON(1);
+	} else if (t4_valid_cqe(cq, &cq->queue[cq->cidx])) {
+
+		/* Ensure CQE is flushed to memory */
+		rmb();
+		*cqe = &cq->queue[cq->cidx];
+		ret = 0;
+	} else
+		ret = -ENODATA;
+	return ret;
+}
+
+static inline struct t4_cqe *t4_next_sw_cqe(struct t4_cq *cq)
+{
+	if (cq->sw_in_use == cq->size) {
+		PDBG("%s cxgb4 sw cq overflow cqid %u\n", __func__, cq->cqid);
+		cq->error = 1;
+		BUG_ON(1);
+		return NULL;
+	}
+	if (cq->sw_in_use)
+		return &cq->sw_queue[cq->sw_cidx];
+	return NULL;
+}
+
+static inline int t4_next_cqe(struct t4_cq *cq, struct t4_cqe **cqe)
+{
+	int ret = 0;
+
+	if (cq->error)
+		ret = -ENODATA;
+	else if (cq->sw_in_use)
+		*cqe = &cq->sw_queue[cq->sw_cidx];
+	else
+		ret = t4_next_hw_cqe(cq, cqe);
+	return ret;
+}
+
+static inline int t4_cq_in_error(struct t4_cq *cq)
+{
+	return ((struct t4_status_page *)&cq->queue[cq->size])->qp_err;
+}
+
+static inline void t4_set_cq_in_error(struct t4_cq *cq)
+{
+	((struct t4_status_page *)&cq->queue[cq->size])->qp_err = 1;
+}
+#endif
+
+struct t4_dev_status_page {
+	u8 db_off;
+};
diff --git a/drivers/infiniband/hw/cxgb4/t4fw_ri_api.h b/drivers/infiniband/hw/cxgb4/t4fw_ri_api.h
new file mode 100644
index 0000000..5709e77
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb4/t4fw_ri_api.h
@@ -0,0 +1,853 @@
+/*
+ * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef _T4FW_RI_API_H_
+#define _T4FW_RI_API_H_
+
+#include "t4fw_api.h"
+
+enum fw_ri_wr_opcode {
+	FW_RI_RDMA_WRITE		= 0x0,	/* IETF RDMAP v1.0 ... */
+	FW_RI_READ_REQ			= 0x1,
+	FW_RI_READ_RESP			= 0x2,
+	FW_RI_SEND			= 0x3,
+	FW_RI_SEND_WITH_INV		= 0x4,
+	FW_RI_SEND_WITH_SE		= 0x5,
+	FW_RI_SEND_WITH_SE_INV		= 0x6,
+	FW_RI_TERMINATE			= 0x7,
+	FW_RI_RDMA_INIT			= 0x8,	/* CHELSIO RI specific ... */
+	FW_RI_BIND_MW			= 0x9,
+	FW_RI_FAST_REGISTER		= 0xa,
+	FW_RI_LOCAL_INV			= 0xb,
+	FW_RI_QP_MODIFY			= 0xc,
+	FW_RI_BYPASS			= 0xd,
+	FW_RI_RECEIVE			= 0xe,
+
+	FW_RI_SGE_EC_CR_RETURN		= 0xf
+};
+
+enum fw_ri_wr_flags {
+	FW_RI_COMPLETION_FLAG		= 0x01,
+	FW_RI_NOTIFICATION_FLAG		= 0x02,
+	FW_RI_SOLICITED_EVENT_FLAG	= 0x04,
+	FW_RI_READ_FENCE_FLAG		= 0x08,
+	FW_RI_LOCAL_FENCE_FLAG		= 0x10,
+	FW_RI_RDMA_READ_INVALIDATE	= 0x20
+};
+
+enum fw_ri_mpa_attrs {
+	FW_RI_MPA_RX_MARKER_ENABLE	= 0x01,
+	FW_RI_MPA_TX_MARKER_ENABLE	= 0x02,
+	FW_RI_MPA_CRC_ENABLE		= 0x04,
+	FW_RI_MPA_IETF_ENABLE		= 0x08
+};
+
+enum fw_ri_qp_caps {
+	FW_RI_QP_RDMA_READ_ENABLE	= 0x01,
+	FW_RI_QP_RDMA_WRITE_ENABLE	= 0x02,
+	FW_RI_QP_BIND_ENABLE		= 0x04,
+	FW_RI_QP_FAST_REGISTER_ENABLE	= 0x08,
+	FW_RI_QP_STAG0_ENABLE		= 0x10
+};
+
+enum fw_ri_addr_type {
+	FW_RI_ZERO_BASED_TO		= 0x00,
+	FW_RI_VA_BASED_TO		= 0x01
+};
+
+enum fw_ri_mem_perms {
+	FW_RI_MEM_ACCESS_REM_WRITE	= 0x01,
+	FW_RI_MEM_ACCESS_REM_READ	= 0x02,
+	FW_RI_MEM_ACCESS_REM		= 0x03,
+	FW_RI_MEM_ACCESS_LOCAL_WRITE	= 0x04,
+	FW_RI_MEM_ACCESS_LOCAL_READ	= 0x08,
+	FW_RI_MEM_ACCESS_LOCAL		= 0x0C
+};
+
+enum fw_ri_stag_type {
+	FW_RI_STAG_NSMR			= 0x00,
+	FW_RI_STAG_SMR			= 0x01,
+	FW_RI_STAG_MW			= 0x02,
+	FW_RI_STAG_MW_RELAXED		= 0x03
+};
+
+enum fw_ri_data_op {
+	FW_RI_DATA_IMMD			= 0x81,
+	FW_RI_DATA_DSGL			= 0x82,
+	FW_RI_DATA_ISGL			= 0x83
+};
+
+enum fw_ri_sgl_depth {
+	FW_RI_SGL_DEPTH_MAX_SQ		= 16,
+	FW_RI_SGL_DEPTH_MAX_RQ		= 4
+};
+
+struct fw_ri_dsge_pair {
+	__be32	len[2];
+	__be64	addr[2];
+};
+
+struct fw_ri_dsgl {
+	__u8	op;
+	__u8	r1;
+	__be16	nsge;
+	__be32	len0;
+	__be64	addr0;
+#ifndef C99_NOT_SUPPORTED
+	struct fw_ri_dsge_pair sge[0];
+#endif
+};
+
+struct fw_ri_sge {
+	__be32 stag;
+	__be32 len;
+	__be64 to;
+};
+
+struct fw_ri_isgl {
+	__u8	op;
+	__u8	r1;
+	__be16	nsge;
+	__be32	r2;
+#ifndef C99_NOT_SUPPORTED
+	struct fw_ri_sge sge[0];
+#endif
+};
+
+struct fw_ri_immd {
+	__u8	op;
+	__u8	r1;
+	__be16	r2;
+	__be32	immdlen;
+#ifndef C99_NOT_SUPPORTED
+	__u8	data[0];
+#endif
+};
+
+struct fw_ri_tpte {
+	__be32 valid_to_pdid;
+	__be32 locread_to_qpid;
+	__be32 nosnoop_pbladdr;
+	__be32 len_lo;
+	__be32 va_hi;
+	__be32 va_lo_fbo;
+	__be32 dca_mwbcnt_pstag;
+	__be32 len_hi;
+};
+
+#define S_FW_RI_TPTE_VALID		31
+#define M_FW_RI_TPTE_VALID		0x1
+#define V_FW_RI_TPTE_VALID(x)		((x) << S_FW_RI_TPTE_VALID)
+#define G_FW_RI_TPTE_VALID(x)		\
+    (((x) >> S_FW_RI_TPTE_VALID) & M_FW_RI_TPTE_VALID)
+#define F_FW_RI_TPTE_VALID		V_FW_RI_TPTE_VALID(1U)
+
+#define S_FW_RI_TPTE_STAGKEY		23
+#define M_FW_RI_TPTE_STAGKEY		0xff
+#define V_FW_RI_TPTE_STAGKEY(x)		((x) << S_FW_RI_TPTE_STAGKEY)
+#define G_FW_RI_TPTE_STAGKEY(x)		\
+    (((x) >> S_FW_RI_TPTE_STAGKEY) & M_FW_RI_TPTE_STAGKEY)
+
+#define S_FW_RI_TPTE_STAGSTATE		22
+#define M_FW_RI_TPTE_STAGSTATE		0x1
+#define V_FW_RI_TPTE_STAGSTATE(x)	((x) << S_FW_RI_TPTE_STAGSTATE)
+#define G_FW_RI_TPTE_STAGSTATE(x)	\
+    (((x) >> S_FW_RI_TPTE_STAGSTATE) & M_FW_RI_TPTE_STAGSTATE)
+#define F_FW_RI_TPTE_STAGSTATE		V_FW_RI_TPTE_STAGSTATE(1U)
+
+#define S_FW_RI_TPTE_STAGTYPE		20
+#define M_FW_RI_TPTE_STAGTYPE		0x3
+#define V_FW_RI_TPTE_STAGTYPE(x)	((x) << S_FW_RI_TPTE_STAGTYPE)
+#define G_FW_RI_TPTE_STAGTYPE(x)	\
+    (((x) >> S_FW_RI_TPTE_STAGTYPE) & M_FW_RI_TPTE_STAGTYPE)
+
+#define S_FW_RI_TPTE_PDID		0
+#define M_FW_RI_TPTE_PDID		0xfffff
+#define V_FW_RI_TPTE_PDID(x)		((x) << S_FW_RI_TPTE_PDID)
+#define G_FW_RI_TPTE_PDID(x)		\
+    (((x) >> S_FW_RI_TPTE_PDID) & M_FW_RI_TPTE_PDID)
+
+#define S_FW_RI_TPTE_PERM		28
+#define M_FW_RI_TPTE_PERM		0xf
+#define V_FW_RI_TPTE_PERM(x)		((x) << S_FW_RI_TPTE_PERM)
+#define G_FW_RI_TPTE_PERM(x)		\
+    (((x) >> S_FW_RI_TPTE_PERM) & M_FW_RI_TPTE_PERM)
+
+#define S_FW_RI_TPTE_REMINVDIS		27
+#define M_FW_RI_TPTE_REMINVDIS		0x1
+#define V_FW_RI_TPTE_REMINVDIS(x)	((x) << S_FW_RI_TPTE_REMINVDIS)
+#define G_FW_RI_TPTE_REMINVDIS(x)	\
+    (((x) >> S_FW_RI_TPTE_REMINVDIS) & M_FW_RI_TPTE_REMINVDIS)
+#define F_FW_RI_TPTE_REMINVDIS		V_FW_RI_TPTE_REMINVDIS(1U)
+
+#define S_FW_RI_TPTE_ADDRTYPE		26
+#define M_FW_RI_TPTE_ADDRTYPE		1
+#define V_FW_RI_TPTE_ADDRTYPE(x)	((x) << S_FW_RI_TPTE_ADDRTYPE)
+#define G_FW_RI_TPTE_ADDRTYPE(x)	\
+    (((x) >> S_FW_RI_TPTE_ADDRTYPE) & M_FW_RI_TPTE_ADDRTYPE)
+#define F_FW_RI_TPTE_ADDRTYPE		V_FW_RI_TPTE_ADDRTYPE(1U)
+
+#define S_FW_RI_TPTE_MWBINDEN		25
+#define M_FW_RI_TPTE_MWBINDEN		0x1
+#define V_FW_RI_TPTE_MWBINDEN(x)	((x) << S_FW_RI_TPTE_MWBINDEN)
+#define G_FW_RI_TPTE_MWBINDEN(x)	\
+    (((x) >> S_FW_RI_TPTE_MWBINDEN) & M_FW_RI_TPTE_MWBINDEN)
+#define F_FW_RI_TPTE_MWBINDEN		V_FW_RI_TPTE_MWBINDEN(1U)
+
+#define S_FW_RI_TPTE_PS			20
+#define M_FW_RI_TPTE_PS			0x1f
+#define V_FW_RI_TPTE_PS(x)		((x) << S_FW_RI_TPTE_PS)
+#define G_FW_RI_TPTE_PS(x)		\
+    (((x) >> S_FW_RI_TPTE_PS) & M_FW_RI_TPTE_PS)
+
+#define S_FW_RI_TPTE_QPID		0
+#define M_FW_RI_TPTE_QPID		0xfffff
+#define V_FW_RI_TPTE_QPID(x)		((x) << S_FW_RI_TPTE_QPID)
+#define G_FW_RI_TPTE_QPID(x)		\
+    (((x) >> S_FW_RI_TPTE_QPID) & M_FW_RI_TPTE_QPID)
+
+#define S_FW_RI_TPTE_NOSNOOP		30
+#define M_FW_RI_TPTE_NOSNOOP		0x1
+#define V_FW_RI_TPTE_NOSNOOP(x)		((x) << S_FW_RI_TPTE_NOSNOOP)
+#define G_FW_RI_TPTE_NOSNOOP(x)		\
+    (((x) >> S_FW_RI_TPTE_NOSNOOP) & M_FW_RI_TPTE_NOSNOOP)
+#define F_FW_RI_TPTE_NOSNOOP		V_FW_RI_TPTE_NOSNOOP(1U)
+
+#define S_FW_RI_TPTE_PBLADDR		0
+#define M_FW_RI_TPTE_PBLADDR		0x1fffffff
+#define V_FW_RI_TPTE_PBLADDR(x)		((x) << S_FW_RI_TPTE_PBLADDR)
+#define G_FW_RI_TPTE_PBLADDR(x)		\
+    (((x) >> S_FW_RI_TPTE_PBLADDR) & M_FW_RI_TPTE_PBLADDR)
+
+#define S_FW_RI_TPTE_DCA		24
+#define M_FW_RI_TPTE_DCA		0x1f
+#define V_FW_RI_TPTE_DCA(x)		((x) << S_FW_RI_TPTE_DCA)
+#define G_FW_RI_TPTE_DCA(x)		\
+    (((x) >> S_FW_RI_TPTE_DCA) & M_FW_RI_TPTE_DCA)
+
+#define S_FW_RI_TPTE_MWBCNT_PSTAG	0
+#define M_FW_RI_TPTE_MWBCNT_PSTAG	0xffffff
+#define V_FW_RI_TPTE_MWBCNT_PSTAT(x)	\
+    ((x) << S_FW_RI_TPTE_MWBCNT_PSTAG)
+#define G_FW_RI_TPTE_MWBCNT_PSTAG(x)	\
+    (((x) >> S_FW_RI_TPTE_MWBCNT_PSTAG) & M_FW_RI_TPTE_MWBCNT_PSTAG)
+
+enum fw_ri_res_type {
+	FW_RI_RES_TYPE_SQ,
+	FW_RI_RES_TYPE_RQ,
+	FW_RI_RES_TYPE_CQ,
+};
+
+enum fw_ri_res_op {
+	FW_RI_RES_OP_WRITE,
+	FW_RI_RES_OP_RESET,
+};
+
+struct fw_ri_res {
+	union fw_ri_restype {
+		struct fw_ri_res_sqrq {
+			__u8   restype;
+			__u8   op;
+			__be16 r3;
+			__be32 eqid;
+			__be32 r4[2];
+			__be32 fetchszm_to_iqid;
+			__be32 dcaen_to_eqsize;
+			__be64 eqaddr;
+		} sqrq;
+		struct fw_ri_res_cq {
+			__u8   restype;
+			__u8   op;
+			__be16 r3;
+			__be32 iqid;
+			__be32 r4[2];
+			__be32 iqandst_to_iqandstindex;
+			__be16 iqdroprss_to_iqesize;
+			__be16 iqsize;
+			__be64 iqaddr;
+			__be32 iqns_iqro;
+			__be32 r6_lo;
+			__be64 r7;
+		} cq;
+	} u;
+};
+
+struct fw_ri_res_wr {
+	__be32 op_nres;
+	__be32 len16_pkd;
+	__u64  cookie;
+#ifndef C99_NOT_SUPPORTED
+	struct fw_ri_res res[0];
+#endif
+};
+
+#define S_FW_RI_RES_WR_NRES	0
+#define M_FW_RI_RES_WR_NRES	0xff
+#define V_FW_RI_RES_WR_NRES(x)	((x) << S_FW_RI_RES_WR_NRES)
+#define G_FW_RI_RES_WR_NRES(x)	\
+    (((x) >> S_FW_RI_RES_WR_NRES) & M_FW_RI_RES_WR_NRES)
+
+#define S_FW_RI_RES_WR_FETCHSZM		26
+#define M_FW_RI_RES_WR_FETCHSZM		0x1
+#define V_FW_RI_RES_WR_FETCHSZM(x)	((x) << S_FW_RI_RES_WR_FETCHSZM)
+#define G_FW_RI_RES_WR_FETCHSZM(x)	\
+    (((x) >> S_FW_RI_RES_WR_FETCHSZM) & M_FW_RI_RES_WR_FETCHSZM)
+#define F_FW_RI_RES_WR_FETCHSZM	V_FW_RI_RES_WR_FETCHSZM(1U)
+
+#define S_FW_RI_RES_WR_STATUSPGNS	25
+#define M_FW_RI_RES_WR_STATUSPGNS	0x1
+#define V_FW_RI_RES_WR_STATUSPGNS(x)	((x) << S_FW_RI_RES_WR_STATUSPGNS)
+#define G_FW_RI_RES_WR_STATUSPGNS(x)	\
+    (((x) >> S_FW_RI_RES_WR_STATUSPGNS) & M_FW_RI_RES_WR_STATUSPGNS)
+#define F_FW_RI_RES_WR_STATUSPGNS	V_FW_RI_RES_WR_STATUSPGNS(1U)
+
+#define S_FW_RI_RES_WR_STATUSPGRO	24
+#define M_FW_RI_RES_WR_STATUSPGRO	0x1
+#define V_FW_RI_RES_WR_STATUSPGRO(x)	((x) << S_FW_RI_RES_WR_STATUSPGRO)
+#define G_FW_RI_RES_WR_STATUSPGRO(x)	\
+    (((x) >> S_FW_RI_RES_WR_STATUSPGRO) & M_FW_RI_RES_WR_STATUSPGRO)
+#define F_FW_RI_RES_WR_STATUSPGRO	V_FW_RI_RES_WR_STATUSPGRO(1U)
+
+#define S_FW_RI_RES_WR_FETCHNS		23
+#define M_FW_RI_RES_WR_FETCHNS		0x1
+#define V_FW_RI_RES_WR_FETCHNS(x)	((x) << S_FW_RI_RES_WR_FETCHNS)
+#define G_FW_RI_RES_WR_FETCHNS(x)	\
+    (((x) >> S_FW_RI_RES_WR_FETCHNS) & M_FW_RI_RES_WR_FETCHNS)
+#define F_FW_RI_RES_WR_FETCHNS	V_FW_RI_RES_WR_FETCHNS(1U)
+
+#define S_FW_RI_RES_WR_FETCHRO		22
+#define M_FW_RI_RES_WR_FETCHRO		0x1
+#define V_FW_RI_RES_WR_FETCHRO(x)	((x) << S_FW_RI_RES_WR_FETCHRO)
+#define G_FW_RI_RES_WR_FETCHRO(x)	\
+    (((x) >> S_FW_RI_RES_WR_FETCHRO) & M_FW_RI_RES_WR_FETCHRO)
+#define F_FW_RI_RES_WR_FETCHRO	V_FW_RI_RES_WR_FETCHRO(1U)
+
+#define S_FW_RI_RES_WR_HOSTFCMODE	20
+#define M_FW_RI_RES_WR_HOSTFCMODE	0x3
+#define V_FW_RI_RES_WR_HOSTFCMODE(x)	((x) << S_FW_RI_RES_WR_HOSTFCMODE)
+#define G_FW_RI_RES_WR_HOSTFCMODE(x)	\
+    (((x) >> S_FW_RI_RES_WR_HOSTFCMODE) & M_FW_RI_RES_WR_HOSTFCMODE)
+
+#define S_FW_RI_RES_WR_CPRIO	19
+#define M_FW_RI_RES_WR_CPRIO	0x1
+#define V_FW_RI_RES_WR_CPRIO(x)	((x) << S_FW_RI_RES_WR_CPRIO)
+#define G_FW_RI_RES_WR_CPRIO(x)	\
+    (((x) >> S_FW_RI_RES_WR_CPRIO) & M_FW_RI_RES_WR_CPRIO)
+#define F_FW_RI_RES_WR_CPRIO	V_FW_RI_RES_WR_CPRIO(1U)
+
+#define S_FW_RI_RES_WR_ONCHIP		18
+#define M_FW_RI_RES_WR_ONCHIP		0x1
+#define V_FW_RI_RES_WR_ONCHIP(x)	((x) << S_FW_RI_RES_WR_ONCHIP)
+#define G_FW_RI_RES_WR_ONCHIP(x)	\
+    (((x) >> S_FW_RI_RES_WR_ONCHIP) & M_FW_RI_RES_WR_ONCHIP)
+#define F_FW_RI_RES_WR_ONCHIP	V_FW_RI_RES_WR_ONCHIP(1U)
+
+#define S_FW_RI_RES_WR_PCIECHN		16
+#define M_FW_RI_RES_WR_PCIECHN		0x3
+#define V_FW_RI_RES_WR_PCIECHN(x)	((x) << S_FW_RI_RES_WR_PCIECHN)
+#define G_FW_RI_RES_WR_PCIECHN(x)	\
+    (((x) >> S_FW_RI_RES_WR_PCIECHN) & M_FW_RI_RES_WR_PCIECHN)
+
+#define S_FW_RI_RES_WR_IQID	0
+#define M_FW_RI_RES_WR_IQID	0xffff
+#define V_FW_RI_RES_WR_IQID(x)	((x) << S_FW_RI_RES_WR_IQID)
+#define G_FW_RI_RES_WR_IQID(x)	\
+    (((x) >> S_FW_RI_RES_WR_IQID) & M_FW_RI_RES_WR_IQID)
+
+#define S_FW_RI_RES_WR_DCAEN	31
+#define M_FW_RI_RES_WR_DCAEN	0x1
+#define V_FW_RI_RES_WR_DCAEN(x)	((x) << S_FW_RI_RES_WR_DCAEN)
+#define G_FW_RI_RES_WR_DCAEN(x)	\
+    (((x) >> S_FW_RI_RES_WR_DCAEN) & M_FW_RI_RES_WR_DCAEN)
+#define F_FW_RI_RES_WR_DCAEN	V_FW_RI_RES_WR_DCAEN(1U)
+
+#define S_FW_RI_RES_WR_DCACPU		26
+#define M_FW_RI_RES_WR_DCACPU		0x1f
+#define V_FW_RI_RES_WR_DCACPU(x)	((x) << S_FW_RI_RES_WR_DCACPU)
+#define G_FW_RI_RES_WR_DCACPU(x)	\
+    (((x) >> S_FW_RI_RES_WR_DCACPU) & M_FW_RI_RES_WR_DCACPU)
+
+#define S_FW_RI_RES_WR_FBMIN	23
+#define M_FW_RI_RES_WR_FBMIN	0x7
+#define V_FW_RI_RES_WR_FBMIN(x)	((x) << S_FW_RI_RES_WR_FBMIN)
+#define G_FW_RI_RES_WR_FBMIN(x)	\
+    (((x) >> S_FW_RI_RES_WR_FBMIN) & M_FW_RI_RES_WR_FBMIN)
+
+#define S_FW_RI_RES_WR_FBMAX	20
+#define M_FW_RI_RES_WR_FBMAX	0x7
+#define V_FW_RI_RES_WR_FBMAX(x)	((x) << S_FW_RI_RES_WR_FBMAX)
+#define G_FW_RI_RES_WR_FBMAX(x)	\
+    (((x) >> S_FW_RI_RES_WR_FBMAX) & M_FW_RI_RES_WR_FBMAX)
+
+#define S_FW_RI_RES_WR_CIDXFTHRESHO	19
+#define M_FW_RI_RES_WR_CIDXFTHRESHO	0x1
+#define V_FW_RI_RES_WR_CIDXFTHRESHO(x)	((x) << S_FW_RI_RES_WR_CIDXFTHRESHO)
+#define G_FW_RI_RES_WR_CIDXFTHRESHO(x)	\
+    (((x) >> S_FW_RI_RES_WR_CIDXFTHRESHO) & M_FW_RI_RES_WR_CIDXFTHRESHO)
+#define F_FW_RI_RES_WR_CIDXFTHRESHO	V_FW_RI_RES_WR_CIDXFTHRESHO(1U)
+
+#define S_FW_RI_RES_WR_CIDXFTHRESH	16
+#define M_FW_RI_RES_WR_CIDXFTHRESH	0x7
+#define V_FW_RI_RES_WR_CIDXFTHRESH(x)	((x) << S_FW_RI_RES_WR_CIDXFTHRESH)
+#define G_FW_RI_RES_WR_CIDXFTHRESH(x)	\
+    (((x) >> S_FW_RI_RES_WR_CIDXFTHRESH) & M_FW_RI_RES_WR_CIDXFTHRESH)
+
+#define S_FW_RI_RES_WR_EQSIZE		0
+#define M_FW_RI_RES_WR_EQSIZE		0xffff
+#define V_FW_RI_RES_WR_EQSIZE(x)	((x) << S_FW_RI_RES_WR_EQSIZE)
+#define G_FW_RI_RES_WR_EQSIZE(x)	\
+    (((x) >> S_FW_RI_RES_WR_EQSIZE) & M_FW_RI_RES_WR_EQSIZE)
+
+#define S_FW_RI_RES_WR_IQANDST		15
+#define M_FW_RI_RES_WR_IQANDST		0x1
+#define V_FW_RI_RES_WR_IQANDST(x)	((x) << S_FW_RI_RES_WR_IQANDST)
+#define G_FW_RI_RES_WR_IQANDST(x)	\
+    (((x) >> S_FW_RI_RES_WR_IQANDST) & M_FW_RI_RES_WR_IQANDST)
+#define F_FW_RI_RES_WR_IQANDST	V_FW_RI_RES_WR_IQANDST(1U)
+
+#define S_FW_RI_RES_WR_IQANUS		14
+#define M_FW_RI_RES_WR_IQANUS		0x1
+#define V_FW_RI_RES_WR_IQANUS(x)	((x) << S_FW_RI_RES_WR_IQANUS)
+#define G_FW_RI_RES_WR_IQANUS(x)	\
+    (((x) >> S_FW_RI_RES_WR_IQANUS) & M_FW_RI_RES_WR_IQANUS)
+#define F_FW_RI_RES_WR_IQANUS	V_FW_RI_RES_WR_IQANUS(1U)
+
+#define S_FW_RI_RES_WR_IQANUD		12
+#define M_FW_RI_RES_WR_IQANUD		0x3
+#define V_FW_RI_RES_WR_IQANUD(x)	((x) << S_FW_RI_RES_WR_IQANUD)
+#define G_FW_RI_RES_WR_IQANUD(x)	\
+    (((x) >> S_FW_RI_RES_WR_IQANUD) & M_FW_RI_RES_WR_IQANUD)
+
+#define S_FW_RI_RES_WR_IQANDSTINDEX	0
+#define M_FW_RI_RES_WR_IQANDSTINDEX	0xfff
+#define V_FW_RI_RES_WR_IQANDSTINDEX(x)	((x) << S_FW_RI_RES_WR_IQANDSTINDEX)
+#define G_FW_RI_RES_WR_IQANDSTINDEX(x)	\
+    (((x) >> S_FW_RI_RES_WR_IQANDSTINDEX) & M_FW_RI_RES_WR_IQANDSTINDEX)
+
+#define S_FW_RI_RES_WR_IQDROPRSS	15
+#define M_FW_RI_RES_WR_IQDROPRSS	0x1
+#define V_FW_RI_RES_WR_IQDROPRSS(x)	((x) << S_FW_RI_RES_WR_IQDROPRSS)
+#define G_FW_RI_RES_WR_IQDROPRSS(x)	\
+    (((x) >> S_FW_RI_RES_WR_IQDROPRSS) & M_FW_RI_RES_WR_IQDROPRSS)
+#define F_FW_RI_RES_WR_IQDROPRSS	V_FW_RI_RES_WR_IQDROPRSS(1U)
+
+#define S_FW_RI_RES_WR_IQGTSMODE	14
+#define M_FW_RI_RES_WR_IQGTSMODE	0x1
+#define V_FW_RI_RES_WR_IQGTSMODE(x)	((x) << S_FW_RI_RES_WR_IQGTSMODE)
+#define G_FW_RI_RES_WR_IQGTSMODE(x)	\
+    (((x) >> S_FW_RI_RES_WR_IQGTSMODE) & M_FW_RI_RES_WR_IQGTSMODE)
+#define F_FW_RI_RES_WR_IQGTSMODE	V_FW_RI_RES_WR_IQGTSMODE(1U)
+
+#define S_FW_RI_RES_WR_IQPCIECH		12
+#define M_FW_RI_RES_WR_IQPCIECH		0x3
+#define V_FW_RI_RES_WR_IQPCIECH(x)	((x) << S_FW_RI_RES_WR_IQPCIECH)
+#define G_FW_RI_RES_WR_IQPCIECH(x)	\
+    (((x) >> S_FW_RI_RES_WR_IQPCIECH) & M_FW_RI_RES_WR_IQPCIECH)
+
+#define S_FW_RI_RES_WR_IQDCAEN		11
+#define M_FW_RI_RES_WR_IQDCAEN		0x1
+#define V_FW_RI_RES_WR_IQDCAEN(x)	((x) << S_FW_RI_RES_WR_IQDCAEN)
+#define G_FW_RI_RES_WR_IQDCAEN(x)	\
+    (((x) >> S_FW_RI_RES_WR_IQDCAEN) & M_FW_RI_RES_WR_IQDCAEN)
+#define F_FW_RI_RES_WR_IQDCAEN	V_FW_RI_RES_WR_IQDCAEN(1U)
+
+#define S_FW_RI_RES_WR_IQDCACPU		6
+#define M_FW_RI_RES_WR_IQDCACPU		0x1f
+#define V_FW_RI_RES_WR_IQDCACPU(x)	((x) << S_FW_RI_RES_WR_IQDCACPU)
+#define G_FW_RI_RES_WR_IQDCACPU(x)	\
+    (((x) >> S_FW_RI_RES_WR_IQDCACPU) & M_FW_RI_RES_WR_IQDCACPU)
+
+#define S_FW_RI_RES_WR_IQINTCNTTHRESH		4
+#define M_FW_RI_RES_WR_IQINTCNTTHRESH		0x3
+#define V_FW_RI_RES_WR_IQINTCNTTHRESH(x)	\
+    ((x) << S_FW_RI_RES_WR_IQINTCNTTHRESH)
+#define G_FW_RI_RES_WR_IQINTCNTTHRESH(x)	\
+    (((x) >> S_FW_RI_RES_WR_IQINTCNTTHRESH) & M_FW_RI_RES_WR_IQINTCNTTHRESH)
+
+#define S_FW_RI_RES_WR_IQO	3
+#define M_FW_RI_RES_WR_IQO	0x1
+#define V_FW_RI_RES_WR_IQO(x)	((x) << S_FW_RI_RES_WR_IQO)
+#define G_FW_RI_RES_WR_IQO(x)	\
+    (((x) >> S_FW_RI_RES_WR_IQO) & M_FW_RI_RES_WR_IQO)
+#define F_FW_RI_RES_WR_IQO	V_FW_RI_RES_WR_IQO(1U)
+
+#define S_FW_RI_RES_WR_IQCPRIO		2
+#define M_FW_RI_RES_WR_IQCPRIO		0x1
+#define V_FW_RI_RES_WR_IQCPRIO(x)	((x) << S_FW_RI_RES_WR_IQCPRIO)
+#define G_FW_RI_RES_WR_IQCPRIO(x)	\
+    (((x) >> S_FW_RI_RES_WR_IQCPRIO) & M_FW_RI_RES_WR_IQCPRIO)
+#define F_FW_RI_RES_WR_IQCPRIO	V_FW_RI_RES_WR_IQCPRIO(1U)
+
+#define S_FW_RI_RES_WR_IQESIZE		0
+#define M_FW_RI_RES_WR_IQESIZE		0x3
+#define V_FW_RI_RES_WR_IQESIZE(x)	((x) << S_FW_RI_RES_WR_IQESIZE)
+#define G_FW_RI_RES_WR_IQESIZE(x)	\
+    (((x) >> S_FW_RI_RES_WR_IQESIZE) & M_FW_RI_RES_WR_IQESIZE)
+
+#define S_FW_RI_RES_WR_IQNS	31
+#define M_FW_RI_RES_WR_IQNS	0x1
+#define V_FW_RI_RES_WR_IQNS(x)	((x) << S_FW_RI_RES_WR_IQNS)
+#define G_FW_RI_RES_WR_IQNS(x)	\
+    (((x) >> S_FW_RI_RES_WR_IQNS) & M_FW_RI_RES_WR_IQNS)
+#define F_FW_RI_RES_WR_IQNS	V_FW_RI_RES_WR_IQNS(1U)
+
+#define S_FW_RI_RES_WR_IQRO	30
+#define M_FW_RI_RES_WR_IQRO	0x1
+#define V_FW_RI_RES_WR_IQRO(x)	((x) << S_FW_RI_RES_WR_IQRO)
+#define G_FW_RI_RES_WR_IQRO(x)	\
+    (((x) >> S_FW_RI_RES_WR_IQRO) & M_FW_RI_RES_WR_IQRO)
+#define F_FW_RI_RES_WR_IQRO	V_FW_RI_RES_WR_IQRO(1U)
+
+struct fw_ri_rdma_write_wr {
+	__u8   opcode;
+	__u8   flags;
+	__u16  wrid;
+	__u8   r1[3];
+	__u8   len16;
+	__be64 r2;
+	__be32 plen;
+	__be32 stag_sink;
+	__be64 to_sink;
+#ifndef C99_NOT_SUPPORTED
+	union {
+		struct fw_ri_immd immd_src[0];
+		struct fw_ri_isgl isgl_src[0];
+	} u;
+#endif
+};
+
+struct fw_ri_send_wr {
+	__u8   opcode;
+	__u8   flags;
+	__u16  wrid;
+	__u8   r1[3];
+	__u8   len16;
+	__be32 sendop_pkd;
+	__be32 stag_inv;
+	__be32 plen;
+	__be32 r3;
+	__be64 r4;
+#ifndef C99_NOT_SUPPORTED
+	union {
+		struct fw_ri_immd immd_src[0];
+		struct fw_ri_isgl isgl_src[0];
+	} u;
+#endif
+};
+
+#define S_FW_RI_SEND_WR_SENDOP		0
+#define M_FW_RI_SEND_WR_SENDOP		0xf
+#define V_FW_RI_SEND_WR_SENDOP(x)	((x) << S_FW_RI_SEND_WR_SENDOP)
+#define G_FW_RI_SEND_WR_SENDOP(x)	\
+    (((x) >> S_FW_RI_SEND_WR_SENDOP) & M_FW_RI_SEND_WR_SENDOP)
+
+struct fw_ri_rdma_read_wr {
+	__u8   opcode;
+	__u8   flags;
+	__u16  wrid;
+	__u8   r1[3];
+	__u8   len16;
+	__be64 r2;
+	__be32 stag_sink;
+	__be32 to_sink_hi;
+	__be32 to_sink_lo;
+	__be32 plen;
+	__be32 stag_src;
+	__be32 to_src_hi;
+	__be32 to_src_lo;
+	__be32 r5;
+};
+
+struct fw_ri_recv_wr {
+	__u8   opcode;
+	__u8   r1;
+	__u16  wrid;
+	__u8   r2[3];
+	__u8   len16;
+	struct fw_ri_isgl isgl;
+};
+
+struct fw_ri_bind_mw_wr {
+	__u8   opcode;
+	__u8   flags;
+	__u16  wrid;
+	__u8   r1[3];
+	__u8   len16;
+	__u8   qpbinde_to_dcacpu;
+	__u8   pgsz_shift;
+	__u8   addr_type;
+	__u8   mem_perms;
+	__be32 stag_mr;
+	__be32 stag_mw;
+	__be32 r3;
+	__be64 len_mw;
+	__be64 va_fbo;
+	__be64 r4;
+};
+
+#define S_FW_RI_BIND_MW_WR_QPBINDE	6
+#define M_FW_RI_BIND_MW_WR_QPBINDE	0x1
+#define V_FW_RI_BIND_MW_WR_QPBINDE(x)	((x) << S_FW_RI_BIND_MW_WR_QPBINDE)
+#define G_FW_RI_BIND_MW_WR_QPBINDE(x)	\
+    (((x) >> S_FW_RI_BIND_MW_WR_QPBINDE) & M_FW_RI_BIND_MW_WR_QPBINDE)
+#define F_FW_RI_BIND_MW_WR_QPBINDE	V_FW_RI_BIND_MW_WR_QPBINDE(1U)
+
+#define S_FW_RI_BIND_MW_WR_NS		5
+#define M_FW_RI_BIND_MW_WR_NS		0x1
+#define V_FW_RI_BIND_MW_WR_NS(x)	((x) << S_FW_RI_BIND_MW_WR_NS)
+#define G_FW_RI_BIND_MW_WR_NS(x)	\
+    (((x) >> S_FW_RI_BIND_MW_WR_NS) & M_FW_RI_BIND_MW_WR_NS)
+#define F_FW_RI_BIND_MW_WR_NS	V_FW_RI_BIND_MW_WR_NS(1U)
+
+#define S_FW_RI_BIND_MW_WR_DCACPU	0
+#define M_FW_RI_BIND_MW_WR_DCACPU	0x1f
+#define V_FW_RI_BIND_MW_WR_DCACPU(x)	((x) << S_FW_RI_BIND_MW_WR_DCACPU)
+#define G_FW_RI_BIND_MW_WR_DCACPU(x)	\
+    (((x) >> S_FW_RI_BIND_MW_WR_DCACPU) & M_FW_RI_BIND_MW_WR_DCACPU)
+
+struct fw_ri_fr_nsmr_wr {
+	__u8   opcode;
+	__u8   flags;
+	__u16  wrid;
+	__u8   r1[3];
+	__u8   len16;
+	__u8   qpbinde_to_dcacpu;
+	__u8   pgsz_shift;
+	__u8   addr_type;
+	__u8   mem_perms;
+	__be32 stag;
+	__be32 len_hi;
+	__be32 len_lo;
+	__be32 va_hi;
+	__be32 va_lo_fbo;
+};
+
+#define S_FW_RI_FR_NSMR_WR_QPBINDE	6
+#define M_FW_RI_FR_NSMR_WR_QPBINDE	0x1
+#define V_FW_RI_FR_NSMR_WR_QPBINDE(x)	((x) << S_FW_RI_FR_NSMR_WR_QPBINDE)
+#define G_FW_RI_FR_NSMR_WR_QPBINDE(x)	\
+    (((x) >> S_FW_RI_FR_NSMR_WR_QPBINDE) & M_FW_RI_FR_NSMR_WR_QPBINDE)
+#define F_FW_RI_FR_NSMR_WR_QPBINDE	V_FW_RI_FR_NSMR_WR_QPBINDE(1U)
+
+#define S_FW_RI_FR_NSMR_WR_NS		5
+#define M_FW_RI_FR_NSMR_WR_NS		0x1
+#define V_FW_RI_FR_NSMR_WR_NS(x)	((x) << S_FW_RI_FR_NSMR_WR_NS)
+#define G_FW_RI_FR_NSMR_WR_NS(x)	\
+    (((x) >> S_FW_RI_FR_NSMR_WR_NS) & M_FW_RI_FR_NSMR_WR_NS)
+#define F_FW_RI_FR_NSMR_WR_NS	V_FW_RI_FR_NSMR_WR_NS(1U)
+
+#define S_FW_RI_FR_NSMR_WR_DCACPU	0
+#define M_FW_RI_FR_NSMR_WR_DCACPU	0x1f
+#define V_FW_RI_FR_NSMR_WR_DCACPU(x)	((x) << S_FW_RI_FR_NSMR_WR_DCACPU)
+#define G_FW_RI_FR_NSMR_WR_DCACPU(x)	\
+    (((x) >> S_FW_RI_FR_NSMR_WR_DCACPU) & M_FW_RI_FR_NSMR_WR_DCACPU)
+
+struct fw_ri_inv_lstag_wr {
+	__u8   opcode;
+	__u8   flags;
+	__u16  wrid;
+	__u8   r1[3];
+	__u8   len16;
+	__be32 r2;
+	__be32 stag_inv;
+};
+
+enum fw_ri_type {
+	FW_RI_TYPE_INIT,
+	FW_RI_TYPE_FINI,
+	FW_RI_TYPE_TERMINATE
+};
+
+enum fw_ri_init_p2ptype {
+	FW_RI_INIT_P2PTYPE_RDMA_WRITE		= FW_RI_RDMA_WRITE,
+	FW_RI_INIT_P2PTYPE_READ_REQ		= FW_RI_READ_REQ,
+	FW_RI_INIT_P2PTYPE_SEND			= FW_RI_SEND,
+	FW_RI_INIT_P2PTYPE_SEND_WITH_INV	= FW_RI_SEND_WITH_INV,
+	FW_RI_INIT_P2PTYPE_SEND_WITH_SE		= FW_RI_SEND_WITH_SE,
+	FW_RI_INIT_P2PTYPE_SEND_WITH_SE_INV	= FW_RI_SEND_WITH_SE_INV,
+	FW_RI_INIT_P2PTYPE_DISABLED		= 0xf,
+};
+
+struct fw_ri_wr {
+	__be32 op_compl;
+	__be32 flowid_len16;
+	__u64  cookie;
+	union fw_ri {
+		struct fw_ri_init {
+			__u8   type;
+			__u8   mpareqbit_p2ptype;
+			__u8   r4[2];
+			__u8   mpa_attrs;
+			__u8   qp_caps;
+			__be16 nrqe;
+			__be32 pdid;
+			__be32 qpid;
+			__be32 sq_eqid;
+			__be32 rq_eqid;
+			__be32 scqid;
+			__be32 rcqid;
+			__be32 ord_max;
+			__be32 ird_max;
+			__be32 iss;
+			__be32 irs;
+			__be32 hwrqsize;
+			__be32 hwrqaddr;
+			__be64 r5;
+			union fw_ri_init_p2p {
+				struct fw_ri_rdma_write_wr write;
+				struct fw_ri_rdma_read_wr read;
+				struct fw_ri_send_wr send;
+			} u;
+		} init;
+		struct fw_ri_fini {
+			__u8   type;
+			__u8   r3[7];
+			__be64 r4;
+		} fini;
+		struct fw_ri_terminate {
+			__u8   type;
+			__u8   r3[3];
+			__be32 immdlen;
+			__u8   termmsg[40];
+		} terminate;
+	} u;
+};
+
+#define S_FW_RI_WR_MPAREQBIT	7
+#define M_FW_RI_WR_MPAREQBIT	0x1
+#define V_FW_RI_WR_MPAREQBIT(x)	((x) << S_FW_RI_WR_MPAREQBIT)
+#define G_FW_RI_WR_MPAREQBIT(x)	\
+    (((x) >> S_FW_RI_WR_MPAREQBIT) & M_FW_RI_WR_MPAREQBIT)
+#define F_FW_RI_WR_MPAREQBIT	V_FW_RI_WR_MPAREQBIT(1U)
+
+#define S_FW_RI_WR_P2PTYPE	0
+#define M_FW_RI_WR_P2PTYPE	0xf
+#define V_FW_RI_WR_P2PTYPE(x)	((x) << S_FW_RI_WR_P2PTYPE)
+#define G_FW_RI_WR_P2PTYPE(x)	\
+    (((x) >> S_FW_RI_WR_P2PTYPE) & M_FW_RI_WR_P2PTYPE)
+
+struct tcp_options {
+	__be16 mss;
+	__u8 wsf;
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	__u8:4;
+	__u8 unknown:1;
+	__u8:1;
+	__u8 sack:1;
+	__u8 tstamp:1;
+#else
+	__u8 tstamp:1;
+	__u8 sack:1;
+	__u8:1;
+	__u8 unknown:1;
+	__u8:4;
+#endif
+};
+
+struct cpl_pass_accept_req {
+	union opcode_tid ot;
+	__be16 rsvd;
+	__be16 len;
+	__be32 hdr_len;
+	__be16 vlan;
+	__be16 l2info;
+	__be32 tos_stid;
+	struct tcp_options tcpopt;
+};
+
+/* cpl_pass_accept_req.hdr_len fields */
+#define S_SYN_RX_CHAN    0
+#define M_SYN_RX_CHAN    0xF
+#define V_SYN_RX_CHAN(x) ((x) << S_SYN_RX_CHAN)
+#define G_SYN_RX_CHAN(x) (((x) >> S_SYN_RX_CHAN) & M_SYN_RX_CHAN)
+
+#define S_TCP_HDR_LEN    10
+#define M_TCP_HDR_LEN    0x3F
+#define V_TCP_HDR_LEN(x) ((x) << S_TCP_HDR_LEN)
+#define G_TCP_HDR_LEN(x) (((x) >> S_TCP_HDR_LEN) & M_TCP_HDR_LEN)
+
+#define S_IP_HDR_LEN    16
+#define M_IP_HDR_LEN    0x3FF
+#define V_IP_HDR_LEN(x) ((x) << S_IP_HDR_LEN)
+#define G_IP_HDR_LEN(x) (((x) >> S_IP_HDR_LEN) & M_IP_HDR_LEN)
+
+#define S_ETH_HDR_LEN    26
+#define M_ETH_HDR_LEN    0x1F
+#define V_ETH_HDR_LEN(x) ((x) << S_ETH_HDR_LEN)
+#define G_ETH_HDR_LEN(x) (((x) >> S_ETH_HDR_LEN) & M_ETH_HDR_LEN)
+
+/* cpl_pass_accept_req.l2info fields */
+#define S_SYN_MAC_IDX    0
+#define M_SYN_MAC_IDX    0x1FF
+#define V_SYN_MAC_IDX(x) ((x) << S_SYN_MAC_IDX)
+#define G_SYN_MAC_IDX(x) (((x) >> S_SYN_MAC_IDX) & M_SYN_MAC_IDX)
+
+#define S_SYN_XACT_MATCH    9
+#define V_SYN_XACT_MATCH(x) ((x) << S_SYN_XACT_MATCH)
+#define F_SYN_XACT_MATCH    V_SYN_XACT_MATCH(1U)
+
+#define S_SYN_INTF    12
+#define M_SYN_INTF    0xF
+#define V_SYN_INTF(x) ((x) << S_SYN_INTF)
+#define G_SYN_INTF(x) (((x) >> S_SYN_INTF) & M_SYN_INTF)
+
+struct ulptx_idata {
+	__be32 cmd_more;
+	__be32 len;
+};
+
+#define S_ULPTX_NSGE    0
+#define M_ULPTX_NSGE    0xFFFF
+#define V_ULPTX_NSGE(x) ((x) << S_ULPTX_NSGE)
+
+#define S_RX_DACK_MODE    29
+#define M_RX_DACK_MODE    0x3
+#define V_RX_DACK_MODE(x) ((x) << S_RX_DACK_MODE)
+#define G_RX_DACK_MODE(x) (((x) >> S_RX_DACK_MODE) & M_RX_DACK_MODE)
+
+#define S_RX_DACK_CHANGE    31
+#define V_RX_DACK_CHANGE(x) ((x) << S_RX_DACK_CHANGE)
+#define F_RX_DACK_CHANGE    V_RX_DACK_CHANGE(1U)
+
+enum {                     /* TCP congestion control algorithms */
+	CONG_ALG_RENO,
+	CONG_ALG_TAHOE,
+	CONG_ALG_NEWRENO,
+	CONG_ALG_HIGHSPEED
+};
+
+#define S_CONG_CNTRL    14
+#define M_CONG_CNTRL    0x3
+#define V_CONG_CNTRL(x) ((x) << S_CONG_CNTRL)
+#define G_CONG_CNTRL(x) (((x) >> S_CONG_CNTRL) & M_CONG_CNTRL)
+
+#define CONG_CNTRL_VALID   (1 << 18)
+
+#endif /* _T4FW_RI_API_H_ */
diff --git a/drivers/infiniband/hw/cxgb4/user.h b/drivers/infiniband/hw/cxgb4/user.h
new file mode 100644
index 0000000..cbd0ce1
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb4/user.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __C4IW_USER_H__
+#define __C4IW_USER_H__
+
+#define C4IW_UVERBS_ABI_VERSION	2
+
+/*
+ * Make sure that all structs defined in this file remain laid out so
+ * that they pack the same way on 32-bit and 64-bit architectures (to
+ * avoid incompatibility between 32-bit userspace and 64-bit kernels).
+ * In particular do not use pointer types -- pass pointers in __u64
+ * instead.
+ */
+struct c4iw_create_cq_resp {
+	__u64 key;
+	__u64 gts_key;
+	__u64 memsize;
+	__u32 cqid;
+	__u32 size;
+	__u32 qid_mask;
+	__u32 reserved; /* explicit padding (optional for i386) */
+};
+
+
+enum {
+	C4IW_QPF_ONCHIP = (1<<0)
+};
+
+struct c4iw_create_qp_resp {
+	__u64 ma_sync_key;
+	__u64 sq_key;
+	__u64 rq_key;
+	__u64 sq_db_gts_key;
+	__u64 rq_db_gts_key;
+	__u64 sq_memsize;
+	__u64 rq_memsize;
+	__u32 sqid;
+	__u32 rqid;
+	__u32 sq_size;
+	__u32 rq_size;
+	__u32 qid_mask;
+	__u32 flags;
+};
+
+struct c4iw_alloc_ucontext_resp {
+	__u64 status_page_key;
+	__u32 status_page_size;
+	__u32 reserved; /* explicit padding (optional for i386) */
+};
+#endif
diff --git a/drivers/infiniband/hw/ehca/Kconfig b/drivers/infiniband/hw/ehca/Kconfig
new file mode 100644
index 0000000..59f807d
--- /dev/null
+++ b/drivers/infiniband/hw/ehca/Kconfig
@@ -0,0 +1,9 @@
+config INFINIBAND_EHCA
+	tristate "eHCA support"
+	depends on IBMEBUS
+	---help---
+	This driver supports the IBM pSeries eHCA InfiniBand adapter.
+
+	To compile the driver as a module, choose M here. The module
+	will be called ib_ehca.
+
diff --git a/drivers/infiniband/hw/ehca/Makefile b/drivers/infiniband/hw/ehca/Makefile
new file mode 100644
index 0000000..74d284e
--- /dev/null
+++ b/drivers/infiniband/hw/ehca/Makefile
@@ -0,0 +1,16 @@
+#  Authors: Heiko J Schick <schickhj@de.ibm.com>
+#           Christoph Raisch <raisch@de.ibm.com>
+#           Joachim Fenkes <fenkes@de.ibm.com>
+#
+#  Copyright (c) 2005 IBM Corporation
+#
+#  All rights reserved.
+#
+#  This source code is distributed under a dual license of GPL v2.0 and OpenIB BSD.
+
+obj-$(CONFIG_INFINIBAND_EHCA) += ib_ehca.o
+
+ib_ehca-objs  = ehca_main.o ehca_hca.o ehca_mcast.o ehca_pd.o ehca_av.o ehca_eq.o \
+		ehca_cq.o ehca_qp.o ehca_sqp.o ehca_mrmw.o ehca_reqs.o ehca_irq.o \
+		ehca_uverbs.o ipz_pt_fn.o hcp_if.o hcp_phyp.o
+
diff --git a/drivers/infiniband/hw/ehca/ehca_av.c b/drivers/infiniband/hw/ehca/ehca_av.c
new file mode 100644
index 0000000..4659263
--- /dev/null
+++ b/drivers/infiniband/hw/ehca/ehca_av.c
@@ -0,0 +1,277 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  address vector functions
+ *
+ *  Authors: Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ *           Khadija Souissi <souissik@de.ibm.com>
+ *           Reinhard Ernst <rernst@de.ibm.com>
+ *           Christoph Raisch <raisch@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/slab.h>
+
+#include "ehca_tools.h"
+#include "ehca_iverbs.h"
+#include "hcp_if.h"
+
+static struct kmem_cache *av_cache;
+
+int ehca_calc_ipd(struct ehca_shca *shca, int port,
+		  enum ib_rate path_rate, u32 *ipd)
+{
+	int path = ib_rate_to_mult(path_rate);
+	int link, ret;
+	struct ib_port_attr pa;
+
+	if (path_rate == IB_RATE_PORT_CURRENT) {
+		*ipd = 0;
+		return 0;
+	}
+
+	if (unlikely(path < 0)) {
+		ehca_err(&shca->ib_device, "Invalid static rate! path_rate=%x",
+			 path_rate);
+		return -EINVAL;
+	}
+
+	ret = ehca_query_port(&shca->ib_device, port, &pa);
+	if (unlikely(ret < 0)) {
+		ehca_err(&shca->ib_device, "Failed to query port  ret=%i", ret);
+		return ret;
+	}
+
+	link = ib_width_enum_to_int(pa.active_width) * pa.active_speed;
+
+	if (path >= link)
+		/* no need to throttle if path faster than link */
+		*ipd = 0;
+	else
+		/* IPD = round((link / path) - 1) */
+		*ipd = ((link + (path >> 1)) / path) - 1;
+
+	return 0;
+}
+
+struct ib_ah *ehca_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr)
+{
+	int ret;
+	struct ehca_av *av;
+	struct ehca_shca *shca = container_of(pd->device, struct ehca_shca,
+					      ib_device);
+
+	av = kmem_cache_alloc(av_cache, GFP_KERNEL);
+	if (!av) {
+		ehca_err(pd->device, "Out of memory pd=%p ah_attr=%p",
+			 pd, ah_attr);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	av->av.sl = ah_attr->sl;
+	av->av.dlid = ah_attr->dlid;
+	av->av.slid_path_bits = ah_attr->src_path_bits;
+
+	if (ehca_static_rate < 0) {
+		u32 ipd;
+		if (ehca_calc_ipd(shca, ah_attr->port_num,
+				  ah_attr->static_rate, &ipd)) {
+			ret = -EINVAL;
+			goto create_ah_exit1;
+		}
+		av->av.ipd = ipd;
+	} else
+		av->av.ipd = ehca_static_rate;
+
+	av->av.lnh = ah_attr->ah_flags;
+	av->av.grh.word_0 = EHCA_BMASK_SET(GRH_IPVERSION_MASK, 6);
+	av->av.grh.word_0 |= EHCA_BMASK_SET(GRH_TCLASS_MASK,
+					    ah_attr->grh.traffic_class);
+	av->av.grh.word_0 |= EHCA_BMASK_SET(GRH_FLOWLABEL_MASK,
+					    ah_attr->grh.flow_label);
+	av->av.grh.word_0 |= EHCA_BMASK_SET(GRH_HOPLIMIT_MASK,
+					    ah_attr->grh.hop_limit);
+	av->av.grh.word_0 |= EHCA_BMASK_SET(GRH_NEXTHEADER_MASK, 0x1B);
+	/* set sgid in grh.word_1 */
+	if (ah_attr->ah_flags & IB_AH_GRH) {
+		int rc;
+		struct ib_port_attr port_attr;
+		union ib_gid gid;
+		memset(&port_attr, 0, sizeof(port_attr));
+		rc = ehca_query_port(pd->device, ah_attr->port_num,
+				     &port_attr);
+		if (rc) { /* invalid port number */
+			ret = -EINVAL;
+			ehca_err(pd->device, "Invalid port number "
+				 "ehca_query_port() returned %x "
+				 "pd=%p ah_attr=%p", rc, pd, ah_attr);
+			goto create_ah_exit1;
+		}
+		memset(&gid, 0, sizeof(gid));
+		rc = ehca_query_gid(pd->device,
+				    ah_attr->port_num,
+				    ah_attr->grh.sgid_index, &gid);
+		if (rc) {
+			ret = -EINVAL;
+			ehca_err(pd->device, "Failed to retrieve sgid "
+				 "ehca_query_gid() returned %x "
+				 "pd=%p ah_attr=%p", rc, pd, ah_attr);
+			goto create_ah_exit1;
+		}
+		memcpy(&av->av.grh.word_1, &gid, sizeof(gid));
+	}
+	av->av.pmtu = shca->max_mtu;
+
+	/* dgid comes in grh.word_3 */
+	memcpy(&av->av.grh.word_3, &ah_attr->grh.dgid,
+	       sizeof(ah_attr->grh.dgid));
+
+	return &av->ib_ah;
+
+create_ah_exit1:
+	kmem_cache_free(av_cache, av);
+
+	return ERR_PTR(ret);
+}
+
+int ehca_modify_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr)
+{
+	struct ehca_av *av;
+	struct ehca_ud_av new_ehca_av;
+	struct ehca_shca *shca = container_of(ah->pd->device, struct ehca_shca,
+					      ib_device);
+
+	memset(&new_ehca_av, 0, sizeof(new_ehca_av));
+	new_ehca_av.sl = ah_attr->sl;
+	new_ehca_av.dlid = ah_attr->dlid;
+	new_ehca_av.slid_path_bits = ah_attr->src_path_bits;
+	new_ehca_av.ipd = ah_attr->static_rate;
+	new_ehca_av.lnh = EHCA_BMASK_SET(GRH_FLAG_MASK,
+					 (ah_attr->ah_flags & IB_AH_GRH) > 0);
+	new_ehca_av.grh.word_0 = EHCA_BMASK_SET(GRH_TCLASS_MASK,
+						ah_attr->grh.traffic_class);
+	new_ehca_av.grh.word_0 |= EHCA_BMASK_SET(GRH_FLOWLABEL_MASK,
+						 ah_attr->grh.flow_label);
+	new_ehca_av.grh.word_0 |= EHCA_BMASK_SET(GRH_HOPLIMIT_MASK,
+						 ah_attr->grh.hop_limit);
+	new_ehca_av.grh.word_0 |= EHCA_BMASK_SET(GRH_NEXTHEADER_MASK, 0x1b);
+
+	/* set sgid in grh.word_1 */
+	if (ah_attr->ah_flags & IB_AH_GRH) {
+		int rc;
+		struct ib_port_attr port_attr;
+		union ib_gid gid;
+		memset(&port_attr, 0, sizeof(port_attr));
+		rc = ehca_query_port(ah->device, ah_attr->port_num,
+				     &port_attr);
+		if (rc) { /* invalid port number */
+			ehca_err(ah->device, "Invalid port number "
+				 "ehca_query_port() returned %x "
+				 "ah=%p ah_attr=%p port_num=%x",
+				 rc, ah, ah_attr, ah_attr->port_num);
+			return -EINVAL;
+		}
+		memset(&gid, 0, sizeof(gid));
+		rc = ehca_query_gid(ah->device,
+				    ah_attr->port_num,
+				    ah_attr->grh.sgid_index, &gid);
+		if (rc) {
+			ehca_err(ah->device, "Failed to retrieve sgid "
+				 "ehca_query_gid() returned %x "
+				 "ah=%p ah_attr=%p port_num=%x "
+				 "sgid_index=%x",
+				 rc, ah, ah_attr, ah_attr->port_num,
+				 ah_attr->grh.sgid_index);
+			return -EINVAL;
+		}
+		memcpy(&new_ehca_av.grh.word_1, &gid, sizeof(gid));
+	}
+
+	new_ehca_av.pmtu = shca->max_mtu;
+
+	memcpy(&new_ehca_av.grh.word_3, &ah_attr->grh.dgid,
+	       sizeof(ah_attr->grh.dgid));
+
+	av = container_of(ah, struct ehca_av, ib_ah);
+	av->av = new_ehca_av;
+
+	return 0;
+}
+
+int ehca_query_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr)
+{
+	struct ehca_av *av = container_of(ah, struct ehca_av, ib_ah);
+
+	memcpy(&ah_attr->grh.dgid, &av->av.grh.word_3,
+	       sizeof(ah_attr->grh.dgid));
+	ah_attr->sl = av->av.sl;
+
+	ah_attr->dlid = av->av.dlid;
+
+	ah_attr->src_path_bits = av->av.slid_path_bits;
+	ah_attr->static_rate = av->av.ipd;
+	ah_attr->ah_flags = EHCA_BMASK_GET(GRH_FLAG_MASK, av->av.lnh);
+	ah_attr->grh.traffic_class = EHCA_BMASK_GET(GRH_TCLASS_MASK,
+						    av->av.grh.word_0);
+	ah_attr->grh.hop_limit = EHCA_BMASK_GET(GRH_HOPLIMIT_MASK,
+						av->av.grh.word_0);
+	ah_attr->grh.flow_label = EHCA_BMASK_GET(GRH_FLOWLABEL_MASK,
+						 av->av.grh.word_0);
+
+	return 0;
+}
+
+int ehca_destroy_ah(struct ib_ah *ah)
+{
+	kmem_cache_free(av_cache, container_of(ah, struct ehca_av, ib_ah));
+
+	return 0;
+}
+
+int ehca_init_av_cache(void)
+{
+	av_cache = kmem_cache_create("ehca_cache_av",
+				   sizeof(struct ehca_av), 0,
+				   SLAB_HWCACHE_ALIGN,
+				   NULL);
+	if (!av_cache)
+		return -ENOMEM;
+	return 0;
+}
+
+void ehca_cleanup_av_cache(void)
+{
+	if (av_cache)
+		kmem_cache_destroy(av_cache);
+}
diff --git a/drivers/infiniband/hw/ehca/ehca_classes.h b/drivers/infiniband/hw/ehca/ehca_classes.h
new file mode 100644
index 0000000..bd45e0f
--- /dev/null
+++ b/drivers/infiniband/hw/ehca/ehca_classes.h
@@ -0,0 +1,482 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  Struct definition for eHCA internal structures
+ *
+ *  Authors: Heiko J Schick <schickhj@de.ibm.com>
+ *           Christoph Raisch <raisch@de.ibm.com>
+ *           Joachim Fenkes <fenkes@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __EHCA_CLASSES_H__
+#define __EHCA_CLASSES_H__
+
+struct ehca_module;
+struct ehca_qp;
+struct ehca_cq;
+struct ehca_eq;
+struct ehca_mr;
+struct ehca_mw;
+struct ehca_pd;
+struct ehca_av;
+
+#include <linux/wait.h>
+#include <linux/mutex.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_user_verbs.h>
+
+#ifdef CONFIG_PPC64
+#include "ehca_classes_pSeries.h"
+#endif
+#include "ipz_pt_fn.h"
+#include "ehca_qes.h"
+#include "ehca_irq.h"
+
+#define EHCA_EQE_CACHE_SIZE 20
+#define EHCA_MAX_NUM_QUEUES 0xffff
+
+struct ehca_eqe_cache_entry {
+	struct ehca_eqe *eqe;
+	struct ehca_cq *cq;
+};
+
+struct ehca_eq {
+	u32 length;
+	struct ipz_queue ipz_queue;
+	struct ipz_eq_handle ipz_eq_handle;
+	struct work_struct work;
+	struct h_galpas galpas;
+	int is_initialized;
+	struct ehca_pfeq pf;
+	spinlock_t spinlock;
+	struct tasklet_struct interrupt_task;
+	u32 ist;
+	spinlock_t irq_spinlock;
+	struct ehca_eqe_cache_entry eqe_cache[EHCA_EQE_CACHE_SIZE];
+};
+
+struct ehca_sma_attr {
+	u16 lid, lmc, sm_sl, sm_lid;
+	u16 pkey_tbl_len, pkeys[16];
+};
+
+struct ehca_sport {
+	struct ib_cq *ibcq_aqp1;
+	struct ib_qp *ibqp_sqp[2];
+	/* lock to serialze modify_qp() calls for sqp in normal
+	 * and irq path (when event PORT_ACTIVE is received first time)
+	 */
+	spinlock_t mod_sqp_lock;
+	enum ib_port_state port_state;
+	struct ehca_sma_attr saved_attr;
+	u32 pma_qp_nr;
+};
+
+#define HCA_CAP_MR_PGSIZE_4K  0x80000000
+#define HCA_CAP_MR_PGSIZE_64K 0x40000000
+#define HCA_CAP_MR_PGSIZE_1M  0x20000000
+#define HCA_CAP_MR_PGSIZE_16M 0x10000000
+
+struct ehca_shca {
+	struct ib_device ib_device;
+	struct platform_device *ofdev;
+	u8 num_ports;
+	int hw_level;
+	struct list_head shca_list;
+	struct ipz_adapter_handle ipz_hca_handle;
+	struct ehca_sport sport[2];
+	struct ehca_eq eq;
+	struct ehca_eq neq;
+	struct ehca_mr *maxmr;
+	struct ehca_pd *pd;
+	struct h_galpas galpas;
+	struct mutex modify_mutex;
+	u64 hca_cap;
+	/* MR pgsize: bit 0-3 means 4K, 64K, 1M, 16M respectively */
+	u32 hca_cap_mr_pgsize;
+	int max_mtu;
+	int max_num_qps;
+	int max_num_cqs;
+	atomic_t num_cqs;
+	atomic_t num_qps;
+};
+
+struct ehca_pd {
+	struct ib_pd ib_pd;
+	struct ipz_pd fw_pd;
+	/* small queue mgmt */
+	struct mutex lock;
+	struct list_head free[2];
+	struct list_head full[2];
+};
+
+enum ehca_ext_qp_type {
+	EQPT_NORMAL    = 0,
+	EQPT_LLQP      = 1,
+	EQPT_SRQBASE   = 2,
+	EQPT_SRQ       = 3,
+};
+
+/* struct to cache modify_qp()'s parms for GSI/SMI qp */
+struct ehca_mod_qp_parm {
+	int mask;
+	struct ib_qp_attr attr;
+};
+
+#define EHCA_MOD_QP_PARM_MAX 4
+
+#define QMAP_IDX_MASK 0xFFFFULL
+
+/* struct for tracking if cqes have been reported to the application */
+struct ehca_qmap_entry {
+	u16 app_wr_id;
+	u8 reported;
+	u8 cqe_req;
+};
+
+struct ehca_queue_map {
+	struct ehca_qmap_entry *map;
+	unsigned int entries;
+	unsigned int tail;
+	unsigned int left_to_poll;
+	unsigned int next_wqe_idx;   /* Idx to first wqe to be flushed */
+};
+
+/* function to calculate the next index for the qmap */
+static inline unsigned int next_index(unsigned int cur_index, unsigned int limit)
+{
+	unsigned int temp = cur_index + 1;
+	return (temp == limit) ? 0 : temp;
+}
+
+struct ehca_qp {
+	union {
+		struct ib_qp ib_qp;
+		struct ib_srq ib_srq;
+	};
+	u32 qp_type;
+	enum ehca_ext_qp_type ext_type;
+	enum ib_qp_state state;
+	struct ipz_queue ipz_squeue;
+	struct ehca_queue_map sq_map;
+	struct ipz_queue ipz_rqueue;
+	struct ehca_queue_map rq_map;
+	struct h_galpas galpas;
+	u32 qkey;
+	u32 real_qp_num;
+	u32 token;
+	spinlock_t spinlock_s;
+	spinlock_t spinlock_r;
+	u32 sq_max_inline_data_size;
+	struct ipz_qp_handle ipz_qp_handle;
+	struct ehca_pfqp pf;
+	struct ib_qp_init_attr init_attr;
+	struct ehca_cq *send_cq;
+	struct ehca_cq *recv_cq;
+	unsigned int sqerr_purgeflag;
+	struct hlist_node list_entries;
+	/* array to cache modify_qp()'s parms for GSI/SMI qp */
+	struct ehca_mod_qp_parm *mod_qp_parm;
+	int mod_qp_parm_idx;
+	/* mmap counter for resources mapped into user space */
+	u32 mm_count_squeue;
+	u32 mm_count_rqueue;
+	u32 mm_count_galpa;
+	/* unsolicited ack circumvention */
+	int unsol_ack_circ;
+	int mtu_shift;
+	u32 message_count;
+	u32 packet_count;
+	atomic_t nr_events; /* events seen */
+	wait_queue_head_t wait_completion;
+	int mig_armed;
+	struct list_head sq_err_node;
+	struct list_head rq_err_node;
+};
+
+#define IS_SRQ(qp) (qp->ext_type == EQPT_SRQ)
+#define HAS_SQ(qp) (qp->ext_type != EQPT_SRQ)
+#define HAS_RQ(qp) (qp->ext_type != EQPT_SRQBASE)
+
+/* must be power of 2 */
+#define QP_HASHTAB_LEN 8
+
+struct ehca_cq {
+	struct ib_cq ib_cq;
+	struct ipz_queue ipz_queue;
+	struct h_galpas galpas;
+	spinlock_t spinlock;
+	u32 cq_number;
+	u32 token;
+	u32 nr_of_entries;
+	struct ipz_cq_handle ipz_cq_handle;
+	struct ehca_pfcq pf;
+	spinlock_t cb_lock;
+	struct hlist_head qp_hashtab[QP_HASHTAB_LEN];
+	struct list_head entry;
+	u32 nr_callbacks;   /* #events assigned to cpu by scaling code */
+	atomic_t nr_events; /* #events seen */
+	wait_queue_head_t wait_completion;
+	spinlock_t task_lock;
+	/* mmap counter for resources mapped into user space */
+	u32 mm_count_queue;
+	u32 mm_count_galpa;
+	struct list_head sqp_err_list;
+	struct list_head rqp_err_list;
+};
+
+enum ehca_mr_flag {
+	EHCA_MR_FLAG_FMR = 0x80000000,	 /* FMR, created with ehca_alloc_fmr */
+	EHCA_MR_FLAG_MAXMR = 0x40000000, /* max-MR                           */
+};
+
+struct ehca_mr {
+	union {
+		struct ib_mr ib_mr;	/* must always be first in ehca_mr */
+		struct ib_fmr ib_fmr;	/* must always be first in ehca_mr */
+	} ib;
+	struct ib_umem *umem;
+	spinlock_t mrlock;
+
+	enum ehca_mr_flag flags;
+	u32 num_kpages;		/* number of kernel pages */
+	u32 num_hwpages;	/* number of hw pages to form MR */
+	u64 hwpage_size;	/* hw page size used for this MR */
+	int acl;		/* ACL (stored here for usage in reregister) */
+	u64 *start;		/* virtual start address (stored here for */
+				/* usage in reregister) */
+	u64 size;		/* size (stored here for usage in reregister) */
+	u32 fmr_page_size;	/* page size for FMR */
+	u32 fmr_max_pages;	/* max pages for FMR */
+	u32 fmr_max_maps;	/* max outstanding maps for FMR */
+	u32 fmr_map_cnt;	/* map counter for FMR */
+	/* fw specific data */
+	struct ipz_mrmw_handle ipz_mr_handle;	/* MR handle for h-calls */
+	struct h_galpas galpas;
+};
+
+struct ehca_mw {
+	struct ib_mw ib_mw;	/* gen2 mw, must always be first in ehca_mw */
+	spinlock_t mwlock;
+
+	u8 never_bound;		/* indication MW was never bound */
+	struct ipz_mrmw_handle ipz_mw_handle;	/* MW handle for h-calls */
+	struct h_galpas galpas;
+};
+
+enum ehca_mr_pgi_type {
+	EHCA_MR_PGI_PHYS   = 1,  /* type of ehca_reg_phys_mr,
+				  * ehca_rereg_phys_mr,
+				  * ehca_reg_internal_maxmr */
+	EHCA_MR_PGI_USER   = 2,  /* type of ehca_reg_user_mr */
+	EHCA_MR_PGI_FMR    = 3   /* type of ehca_map_phys_fmr */
+};
+
+struct ehca_mr_pginfo {
+	enum ehca_mr_pgi_type type;
+	u64 num_kpages;
+	u64 kpage_cnt;
+	u64 hwpage_size;     /* hw page size used for this MR */
+	u64 num_hwpages;     /* number of hw pages */
+	u64 hwpage_cnt;      /* counter for hw pages */
+	u64 next_hwpage;     /* next hw page in buffer/chunk/listelem */
+
+	union {
+		struct { /* type EHCA_MR_PGI_PHYS section */
+			int num_phys_buf;
+			struct ib_phys_buf *phys_buf_array;
+			u64 next_buf;
+		} phy;
+		struct { /* type EHCA_MR_PGI_USER section */
+			struct ib_umem *region;
+			struct scatterlist *next_sg;
+			u64 next_nmap;
+		} usr;
+		struct { /* type EHCA_MR_PGI_FMR section */
+			u64 fmr_pgsize;
+			u64 *page_list;
+			u64 next_listelem;
+		} fmr;
+	} u;
+};
+
+/* output parameters for MR/FMR hipz calls */
+struct ehca_mr_hipzout_parms {
+	struct ipz_mrmw_handle handle;
+	u32 lkey;
+	u32 rkey;
+	u64 len;
+	u64 vaddr;
+	u32 acl;
+};
+
+/* output parameters for MW hipz calls */
+struct ehca_mw_hipzout_parms {
+	struct ipz_mrmw_handle handle;
+	u32 rkey;
+};
+
+struct ehca_av {
+	struct ib_ah ib_ah;
+	struct ehca_ud_av av;
+};
+
+struct ehca_ucontext {
+	struct ib_ucontext ib_ucontext;
+};
+
+int ehca_init_pd_cache(void);
+void ehca_cleanup_pd_cache(void);
+int ehca_init_cq_cache(void);
+void ehca_cleanup_cq_cache(void);
+int ehca_init_qp_cache(void);
+void ehca_cleanup_qp_cache(void);
+int ehca_init_av_cache(void);
+void ehca_cleanup_av_cache(void);
+int ehca_init_mrmw_cache(void);
+void ehca_cleanup_mrmw_cache(void);
+int ehca_init_small_qp_cache(void);
+void ehca_cleanup_small_qp_cache(void);
+
+extern rwlock_t ehca_qp_idr_lock;
+extern rwlock_t ehca_cq_idr_lock;
+extern struct idr ehca_qp_idr;
+extern struct idr ehca_cq_idr;
+extern spinlock_t shca_list_lock;
+
+extern int ehca_static_rate;
+extern int ehca_port_act_time;
+extern bool ehca_use_hp_mr;
+extern bool ehca_scaling_code;
+extern int ehca_lock_hcalls;
+extern int ehca_nr_ports;
+extern int ehca_max_cq;
+extern int ehca_max_qp;
+
+struct ipzu_queue_resp {
+	u32 qe_size;      /* queue entry size */
+	u32 act_nr_of_sg;
+	u32 queue_length; /* queue length allocated in bytes */
+	u32 pagesize;
+	u32 toggle_state;
+	u32 offset; /* save offset within a page for small_qp */
+};
+
+struct ehca_create_cq_resp {
+	u32 cq_number;
+	u32 token;
+	struct ipzu_queue_resp ipz_queue;
+	u32 fw_handle_ofs;
+	u32 dummy;
+};
+
+struct ehca_create_qp_resp {
+	u32 qp_num;
+	u32 token;
+	u32 qp_type;
+	u32 ext_type;
+	u32 qkey;
+	/* qp_num assigned by ehca: sqp0/1 may have got different numbers */
+	u32 real_qp_num;
+	u32 fw_handle_ofs;
+	u32 dummy;
+	struct ipzu_queue_resp ipz_squeue;
+	struct ipzu_queue_resp ipz_rqueue;
+};
+
+struct ehca_alloc_cq_parms {
+	u32 nr_cqe;
+	u32 act_nr_of_entries;
+	u32 act_pages;
+	struct ipz_eq_handle eq_handle;
+};
+
+enum ehca_service_type {
+	ST_RC  = 0,
+	ST_UC  = 1,
+	ST_RD  = 2,
+	ST_UD  = 3,
+};
+
+enum ehca_ll_comp_flags {
+	LLQP_SEND_COMP = 0x20,
+	LLQP_RECV_COMP = 0x40,
+	LLQP_COMP_MASK = 0x60,
+};
+
+struct ehca_alloc_queue_parms {
+	/* input parameters */
+	int max_wr;
+	int max_sge;
+	int page_size;
+	int is_small;
+
+	/* output parameters */
+	u16 act_nr_wqes;
+	u8  act_nr_sges;
+	u32 queue_size; /* bytes for small queues, pages otherwise */
+};
+
+struct ehca_alloc_qp_parms {
+	struct ehca_alloc_queue_parms squeue;
+	struct ehca_alloc_queue_parms rqueue;
+
+	/* input parameters */
+	enum ehca_service_type servicetype;
+	int qp_storage;
+	int sigtype;
+	enum ehca_ext_qp_type ext_type;
+	enum ehca_ll_comp_flags ll_comp_flags;
+	int ud_av_l_key_ctl;
+
+	u32 token;
+	struct ipz_eq_handle eq_handle;
+	struct ipz_pd pd;
+	struct ipz_cq_handle send_cq_handle, recv_cq_handle;
+
+	u32 srq_qpn, srq_token, srq_limit;
+
+	/* output parameters */
+	u32 real_qp_num;
+	struct ipz_qp_handle qp_handle;
+	struct h_galpas galpas;
+};
+
+int ehca_cq_assign_qp(struct ehca_cq *cq, struct ehca_qp *qp);
+int ehca_cq_unassign_qp(struct ehca_cq *cq, unsigned int qp_num);
+struct ehca_qp *ehca_cq_get_qp(struct ehca_cq *cq, int qp_num);
+
+#endif
diff --git a/drivers/infiniband/hw/ehca/ehca_classes_pSeries.h b/drivers/infiniband/hw/ehca/ehca_classes_pSeries.h
new file mode 100644
index 0000000..689c357
--- /dev/null
+++ b/drivers/infiniband/hw/ehca/ehca_classes_pSeries.h
@@ -0,0 +1,208 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  pSeries interface definitions
+ *
+ *  Authors: Waleri Fomin <fomin@de.ibm.com>
+ *           Christoph Raisch <raisch@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __EHCA_CLASSES_PSERIES_H__
+#define __EHCA_CLASSES_PSERIES_H__
+
+#include "hcp_phyp.h"
+#include "ipz_pt_fn.h"
+
+
+struct ehca_pfqp {
+	struct ipz_qpt sqpt;
+	struct ipz_qpt rqpt;
+};
+
+struct ehca_pfcq {
+	struct ipz_qpt qpt;
+	u32 cqnr;
+};
+
+struct ehca_pfeq {
+	struct ipz_qpt qpt;
+	struct h_galpa galpa;
+	u32 eqnr;
+};
+
+struct ipz_adapter_handle {
+	u64 handle;
+};
+
+struct ipz_cq_handle {
+	u64 handle;
+};
+
+struct ipz_eq_handle {
+	u64 handle;
+};
+
+struct ipz_qp_handle {
+	u64 handle;
+};
+struct ipz_mrmw_handle {
+	u64 handle;
+};
+
+struct ipz_pd {
+	u32 value;
+};
+
+struct hcp_modify_qp_control_block {
+	u32 qkey;                      /* 00 */
+	u32 rdd;                       /* reliable datagram domain */
+	u32 send_psn;                  /* 02 */
+	u32 receive_psn;               /* 03 */
+	u32 prim_phys_port;            /* 04 */
+	u32 alt_phys_port;             /* 05 */
+	u32 prim_p_key_idx;            /* 06 */
+	u32 alt_p_key_idx;             /* 07 */
+	u32 rdma_atomic_ctrl;          /* 08 */
+	u32 qp_state;                  /* 09 */
+	u32 reserved_10;               /* 10 */
+	u32 rdma_nr_atomic_resp_res;   /* 11 */
+	u32 path_migration_state;      /* 12 */
+	u32 rdma_atomic_outst_dest_qp; /* 13 */
+	u32 dest_qp_nr;                /* 14 */
+	u32 min_rnr_nak_timer_field;   /* 15 */
+	u32 service_level;             /* 16 */
+	u32 send_grh_flag;             /* 17 */
+	u32 retry_count;               /* 18 */
+	u32 timeout;                   /* 19 */
+	u32 path_mtu;                  /* 20 */
+	u32 max_static_rate;           /* 21 */
+	u32 dlid;                      /* 22 */
+	u32 rnr_retry_count;           /* 23 */
+	u32 source_path_bits;          /* 24 */
+	u32 traffic_class;             /* 25 */
+	u32 hop_limit;                 /* 26 */
+	u32 source_gid_idx;            /* 27 */
+	u32 flow_label;                /* 28 */
+	u32 reserved_29;               /* 29 */
+	union {                        /* 30 */
+		u64 dw[2];
+		u8 byte[16];
+	} dest_gid;
+	u32 service_level_al;          /* 34 */
+	u32 send_grh_flag_al;          /* 35 */
+	u32 retry_count_al;            /* 36 */
+	u32 timeout_al;                /* 37 */
+	u32 max_static_rate_al;        /* 38 */
+	u32 dlid_al;                   /* 39 */
+	u32 rnr_retry_count_al;        /* 40 */
+	u32 source_path_bits_al;       /* 41 */
+	u32 traffic_class_al;          /* 42 */
+	u32 hop_limit_al;              /* 43 */
+	u32 source_gid_idx_al;         /* 44 */
+	u32 flow_label_al;             /* 45 */
+	u32 reserved_46;               /* 46 */
+	u32 reserved_47;               /* 47 */
+	union {                        /* 48 */
+		u64 dw[2];
+		u8 byte[16];
+	} dest_gid_al;
+	u32 max_nr_outst_send_wr;      /* 52 */
+	u32 max_nr_outst_recv_wr;      /* 53 */
+	u32 disable_ete_credit_check;  /* 54 */
+	u32 qp_number;                 /* 55 */
+	u64 send_queue_handle;         /* 56 */
+	u64 recv_queue_handle;         /* 58 */
+	u32 actual_nr_sges_in_sq_wqe;  /* 60 */
+	u32 actual_nr_sges_in_rq_wqe;  /* 61 */
+	u32 qp_enable;                 /* 62 */
+	u32 curr_srq_limit;            /* 63 */
+	u64 qp_aff_asyn_ev_log_reg;    /* 64 */
+	u64 shared_rq_hndl;            /* 66 */
+	u64 trigg_doorbell_qp_hndl;    /* 68 */
+	u32 reserved_70_127[58];       /* 70 */
+};
+
+#define MQPCB_MASK_QKEY                         EHCA_BMASK_IBM( 0,  0)
+#define MQPCB_MASK_SEND_PSN                     EHCA_BMASK_IBM( 2,  2)
+#define MQPCB_MASK_RECEIVE_PSN                  EHCA_BMASK_IBM( 3,  3)
+#define MQPCB_MASK_PRIM_PHYS_PORT               EHCA_BMASK_IBM( 4,  4)
+#define MQPCB_PRIM_PHYS_PORT                    EHCA_BMASK_IBM(24, 31)
+#define MQPCB_MASK_ALT_PHYS_PORT                EHCA_BMASK_IBM( 5,  5)
+#define MQPCB_MASK_PRIM_P_KEY_IDX               EHCA_BMASK_IBM( 6,  6)
+#define MQPCB_PRIM_P_KEY_IDX                    EHCA_BMASK_IBM(24, 31)
+#define MQPCB_MASK_ALT_P_KEY_IDX                EHCA_BMASK_IBM( 7,  7)
+#define MQPCB_MASK_RDMA_ATOMIC_CTRL             EHCA_BMASK_IBM( 8,  8)
+#define MQPCB_MASK_QP_STATE                     EHCA_BMASK_IBM( 9,  9)
+#define MQPCB_MASK_RDMA_NR_ATOMIC_RESP_RES      EHCA_BMASK_IBM(11, 11)
+#define MQPCB_MASK_PATH_MIGRATION_STATE         EHCA_BMASK_IBM(12, 12)
+#define MQPCB_MASK_RDMA_ATOMIC_OUTST_DEST_QP    EHCA_BMASK_IBM(13, 13)
+#define MQPCB_MASK_DEST_QP_NR                   EHCA_BMASK_IBM(14, 14)
+#define MQPCB_MASK_MIN_RNR_NAK_TIMER_FIELD      EHCA_BMASK_IBM(15, 15)
+#define MQPCB_MASK_SERVICE_LEVEL                EHCA_BMASK_IBM(16, 16)
+#define MQPCB_MASK_SEND_GRH_FLAG                EHCA_BMASK_IBM(17, 17)
+#define MQPCB_MASK_RETRY_COUNT                  EHCA_BMASK_IBM(18, 18)
+#define MQPCB_MASK_TIMEOUT                      EHCA_BMASK_IBM(19, 19)
+#define MQPCB_MASK_PATH_MTU                     EHCA_BMASK_IBM(20, 20)
+#define MQPCB_MASK_MAX_STATIC_RATE              EHCA_BMASK_IBM(21, 21)
+#define MQPCB_MASK_DLID                         EHCA_BMASK_IBM(22, 22)
+#define MQPCB_MASK_RNR_RETRY_COUNT              EHCA_BMASK_IBM(23, 23)
+#define MQPCB_MASK_SOURCE_PATH_BITS             EHCA_BMASK_IBM(24, 24)
+#define MQPCB_MASK_TRAFFIC_CLASS                EHCA_BMASK_IBM(25, 25)
+#define MQPCB_MASK_HOP_LIMIT                    EHCA_BMASK_IBM(26, 26)
+#define MQPCB_MASK_SOURCE_GID_IDX               EHCA_BMASK_IBM(27, 27)
+#define MQPCB_MASK_FLOW_LABEL                   EHCA_BMASK_IBM(28, 28)
+#define MQPCB_MASK_DEST_GID                     EHCA_BMASK_IBM(30, 30)
+#define MQPCB_MASK_SERVICE_LEVEL_AL             EHCA_BMASK_IBM(31, 31)
+#define MQPCB_MASK_SEND_GRH_FLAG_AL             EHCA_BMASK_IBM(32, 32)
+#define MQPCB_MASK_RETRY_COUNT_AL               EHCA_BMASK_IBM(33, 33)
+#define MQPCB_MASK_TIMEOUT_AL                   EHCA_BMASK_IBM(34, 34)
+#define MQPCB_MASK_MAX_STATIC_RATE_AL           EHCA_BMASK_IBM(35, 35)
+#define MQPCB_MASK_DLID_AL                      EHCA_BMASK_IBM(36, 36)
+#define MQPCB_MASK_RNR_RETRY_COUNT_AL           EHCA_BMASK_IBM(37, 37)
+#define MQPCB_MASK_SOURCE_PATH_BITS_AL          EHCA_BMASK_IBM(38, 38)
+#define MQPCB_MASK_TRAFFIC_CLASS_AL             EHCA_BMASK_IBM(39, 39)
+#define MQPCB_MASK_HOP_LIMIT_AL                 EHCA_BMASK_IBM(40, 40)
+#define MQPCB_MASK_SOURCE_GID_IDX_AL            EHCA_BMASK_IBM(41, 41)
+#define MQPCB_MASK_FLOW_LABEL_AL                EHCA_BMASK_IBM(42, 42)
+#define MQPCB_MASK_DEST_GID_AL                  EHCA_BMASK_IBM(44, 44)
+#define MQPCB_MASK_MAX_NR_OUTST_SEND_WR         EHCA_BMASK_IBM(45, 45)
+#define MQPCB_MASK_MAX_NR_OUTST_RECV_WR         EHCA_BMASK_IBM(46, 46)
+#define MQPCB_MASK_DISABLE_ETE_CREDIT_CHECK     EHCA_BMASK_IBM(47, 47)
+#define MQPCB_MASK_QP_ENABLE                    EHCA_BMASK_IBM(48, 48)
+#define MQPCB_MASK_CURR_SRQ_LIMIT               EHCA_BMASK_IBM(49, 49)
+#define MQPCB_MASK_QP_AFF_ASYN_EV_LOG_REG       EHCA_BMASK_IBM(50, 50)
+#define MQPCB_MASK_SHARED_RQ_HNDL               EHCA_BMASK_IBM(51, 51)
+
+#endif /* __EHCA_CLASSES_PSERIES_H__ */
diff --git a/drivers/infiniband/hw/ehca/ehca_cq.c b/drivers/infiniband/hw/ehca/ehca_cq.c
new file mode 100644
index 0000000..8cc8375
--- /dev/null
+++ b/drivers/infiniband/hw/ehca/ehca_cq.c
@@ -0,0 +1,392 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  Completion queue handling
+ *
+ *  Authors: Waleri Fomin <fomin@de.ibm.com>
+ *           Khadija Souissi <souissi@de.ibm.com>
+ *           Reinhard Ernst <rernst@de.ibm.com>
+ *           Heiko J Schick <schickhj@de.ibm.com>
+ *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ *
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/slab.h>
+
+#include "ehca_iverbs.h"
+#include "ehca_classes.h"
+#include "ehca_irq.h"
+#include "hcp_if.h"
+
+static struct kmem_cache *cq_cache;
+
+int ehca_cq_assign_qp(struct ehca_cq *cq, struct ehca_qp *qp)
+{
+	unsigned int qp_num = qp->real_qp_num;
+	unsigned int key = qp_num & (QP_HASHTAB_LEN-1);
+	unsigned long flags;
+
+	spin_lock_irqsave(&cq->spinlock, flags);
+	hlist_add_head(&qp->list_entries, &cq->qp_hashtab[key]);
+	spin_unlock_irqrestore(&cq->spinlock, flags);
+
+	ehca_dbg(cq->ib_cq.device, "cq_num=%x real_qp_num=%x",
+		 cq->cq_number, qp_num);
+
+	return 0;
+}
+
+int ehca_cq_unassign_qp(struct ehca_cq *cq, unsigned int real_qp_num)
+{
+	int ret = -EINVAL;
+	unsigned int key = real_qp_num & (QP_HASHTAB_LEN-1);
+	struct hlist_node *iter;
+	struct ehca_qp *qp;
+	unsigned long flags;
+
+	spin_lock_irqsave(&cq->spinlock, flags);
+	hlist_for_each(iter, &cq->qp_hashtab[key]) {
+		qp = hlist_entry(iter, struct ehca_qp, list_entries);
+		if (qp->real_qp_num == real_qp_num) {
+			hlist_del(iter);
+			ehca_dbg(cq->ib_cq.device,
+				 "removed qp from cq .cq_num=%x real_qp_num=%x",
+				 cq->cq_number, real_qp_num);
+			ret = 0;
+			break;
+		}
+	}
+	spin_unlock_irqrestore(&cq->spinlock, flags);
+	if (ret)
+		ehca_err(cq->ib_cq.device,
+			 "qp not found cq_num=%x real_qp_num=%x",
+			 cq->cq_number, real_qp_num);
+
+	return ret;
+}
+
+struct ehca_qp *ehca_cq_get_qp(struct ehca_cq *cq, int real_qp_num)
+{
+	struct ehca_qp *ret = NULL;
+	unsigned int key = real_qp_num & (QP_HASHTAB_LEN-1);
+	struct hlist_node *iter;
+	struct ehca_qp *qp;
+	hlist_for_each(iter, &cq->qp_hashtab[key]) {
+		qp = hlist_entry(iter, struct ehca_qp, list_entries);
+		if (qp->real_qp_num == real_qp_num) {
+			ret = qp;
+			break;
+		}
+	}
+	return ret;
+}
+
+struct ib_cq *ehca_create_cq(struct ib_device *device, int cqe, int comp_vector,
+			     struct ib_ucontext *context,
+			     struct ib_udata *udata)
+{
+	static const u32 additional_cqe = 20;
+	struct ib_cq *cq;
+	struct ehca_cq *my_cq;
+	struct ehca_shca *shca =
+		container_of(device, struct ehca_shca, ib_device);
+	struct ipz_adapter_handle adapter_handle;
+	struct ehca_alloc_cq_parms param; /* h_call's out parameters */
+	struct h_galpa gal;
+	void *vpage;
+	u32 counter;
+	u64 rpage, cqx_fec, h_ret;
+	int ipz_rc, i;
+	unsigned long flags;
+
+	if (cqe >= 0xFFFFFFFF - 64 - additional_cqe)
+		return ERR_PTR(-EINVAL);
+
+	if (!atomic_add_unless(&shca->num_cqs, 1, shca->max_num_cqs)) {
+		ehca_err(device, "Unable to create CQ, max number of %i "
+			"CQs reached.", shca->max_num_cqs);
+		ehca_err(device, "To increase the maximum number of CQs "
+			"use the number_of_cqs module parameter.\n");
+		return ERR_PTR(-ENOSPC);
+	}
+
+	my_cq = kmem_cache_zalloc(cq_cache, GFP_KERNEL);
+	if (!my_cq) {
+		ehca_err(device, "Out of memory for ehca_cq struct device=%p",
+			 device);
+		atomic_dec(&shca->num_cqs);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	memset(&param, 0, sizeof(struct ehca_alloc_cq_parms));
+
+	spin_lock_init(&my_cq->spinlock);
+	spin_lock_init(&my_cq->cb_lock);
+	spin_lock_init(&my_cq->task_lock);
+	atomic_set(&my_cq->nr_events, 0);
+	init_waitqueue_head(&my_cq->wait_completion);
+
+	cq = &my_cq->ib_cq;
+
+	adapter_handle = shca->ipz_hca_handle;
+	param.eq_handle = shca->eq.ipz_eq_handle;
+
+	idr_preload(GFP_KERNEL);
+	write_lock_irqsave(&ehca_cq_idr_lock, flags);
+	my_cq->token = idr_alloc(&ehca_cq_idr, my_cq, 0, 0x2000000, GFP_NOWAIT);
+	write_unlock_irqrestore(&ehca_cq_idr_lock, flags);
+	idr_preload_end();
+
+	if (my_cq->token < 0) {
+		cq = ERR_PTR(-ENOMEM);
+		ehca_err(device, "Can't allocate new idr entry. device=%p",
+			 device);
+		goto create_cq_exit1;
+	}
+
+	/*
+	 * CQs maximum depth is 4GB-64, but we need additional 20 as buffer
+	 * for receiving errors CQEs.
+	 */
+	param.nr_cqe = cqe + additional_cqe;
+	h_ret = hipz_h_alloc_resource_cq(adapter_handle, my_cq, &param);
+
+	if (h_ret != H_SUCCESS) {
+		ehca_err(device, "hipz_h_alloc_resource_cq() failed "
+			 "h_ret=%lli device=%p", h_ret, device);
+		cq = ERR_PTR(ehca2ib_return_code(h_ret));
+		goto create_cq_exit2;
+	}
+
+	ipz_rc = ipz_queue_ctor(NULL, &my_cq->ipz_queue, param.act_pages,
+				EHCA_PAGESIZE, sizeof(struct ehca_cqe), 0, 0);
+	if (!ipz_rc) {
+		ehca_err(device, "ipz_queue_ctor() failed ipz_rc=%i device=%p",
+			 ipz_rc, device);
+		cq = ERR_PTR(-EINVAL);
+		goto create_cq_exit3;
+	}
+
+	for (counter = 0; counter < param.act_pages; counter++) {
+		vpage = ipz_qpageit_get_inc(&my_cq->ipz_queue);
+		if (!vpage) {
+			ehca_err(device, "ipz_qpageit_get_inc() "
+				 "returns NULL device=%p", device);
+			cq = ERR_PTR(-EAGAIN);
+			goto create_cq_exit4;
+		}
+		rpage = __pa(vpage);
+
+		h_ret = hipz_h_register_rpage_cq(adapter_handle,
+						 my_cq->ipz_cq_handle,
+						 &my_cq->pf,
+						 0,
+						 0,
+						 rpage,
+						 1,
+						 my_cq->galpas.
+						 kernel);
+
+		if (h_ret < H_SUCCESS) {
+			ehca_err(device, "hipz_h_register_rpage_cq() failed "
+				 "ehca_cq=%p cq_num=%x h_ret=%lli counter=%i "
+				 "act_pages=%i", my_cq, my_cq->cq_number,
+				 h_ret, counter, param.act_pages);
+			cq = ERR_PTR(-EINVAL);
+			goto create_cq_exit4;
+		}
+
+		if (counter == (param.act_pages - 1)) {
+			vpage = ipz_qpageit_get_inc(&my_cq->ipz_queue);
+			if ((h_ret != H_SUCCESS) || vpage) {
+				ehca_err(device, "Registration of pages not "
+					 "complete ehca_cq=%p cq_num=%x "
+					 "h_ret=%lli", my_cq, my_cq->cq_number,
+					 h_ret);
+				cq = ERR_PTR(-EAGAIN);
+				goto create_cq_exit4;
+			}
+		} else {
+			if (h_ret != H_PAGE_REGISTERED) {
+				ehca_err(device, "Registration of page failed "
+					 "ehca_cq=%p cq_num=%x h_ret=%lli "
+					 "counter=%i act_pages=%i",
+					 my_cq, my_cq->cq_number,
+					 h_ret, counter, param.act_pages);
+				cq = ERR_PTR(-ENOMEM);
+				goto create_cq_exit4;
+			}
+		}
+	}
+
+	ipz_qeit_reset(&my_cq->ipz_queue);
+
+	gal = my_cq->galpas.kernel;
+	cqx_fec = hipz_galpa_load(gal, CQTEMM_OFFSET(cqx_fec));
+	ehca_dbg(device, "ehca_cq=%p cq_num=%x CQX_FEC=%llx",
+		 my_cq, my_cq->cq_number, cqx_fec);
+
+	my_cq->ib_cq.cqe = my_cq->nr_of_entries =
+		param.act_nr_of_entries - additional_cqe;
+	my_cq->cq_number = (my_cq->ipz_cq_handle.handle) & 0xffff;
+
+	for (i = 0; i < QP_HASHTAB_LEN; i++)
+		INIT_HLIST_HEAD(&my_cq->qp_hashtab[i]);
+
+	INIT_LIST_HEAD(&my_cq->sqp_err_list);
+	INIT_LIST_HEAD(&my_cq->rqp_err_list);
+
+	if (context) {
+		struct ipz_queue *ipz_queue = &my_cq->ipz_queue;
+		struct ehca_create_cq_resp resp;
+		memset(&resp, 0, sizeof(resp));
+		resp.cq_number = my_cq->cq_number;
+		resp.token = my_cq->token;
+		resp.ipz_queue.qe_size = ipz_queue->qe_size;
+		resp.ipz_queue.act_nr_of_sg = ipz_queue->act_nr_of_sg;
+		resp.ipz_queue.queue_length = ipz_queue->queue_length;
+		resp.ipz_queue.pagesize = ipz_queue->pagesize;
+		resp.ipz_queue.toggle_state = ipz_queue->toggle_state;
+		resp.fw_handle_ofs = (u32)
+			(my_cq->galpas.user.fw_handle & (PAGE_SIZE - 1));
+		if (ib_copy_to_udata(udata, &resp, sizeof(resp))) {
+			ehca_err(device, "Copy to udata failed.");
+			cq = ERR_PTR(-EFAULT);
+			goto create_cq_exit4;
+		}
+	}
+
+	return cq;
+
+create_cq_exit4:
+	ipz_queue_dtor(NULL, &my_cq->ipz_queue);
+
+create_cq_exit3:
+	h_ret = hipz_h_destroy_cq(adapter_handle, my_cq, 1);
+	if (h_ret != H_SUCCESS)
+		ehca_err(device, "hipz_h_destroy_cq() failed ehca_cq=%p "
+			 "cq_num=%x h_ret=%lli", my_cq, my_cq->cq_number, h_ret);
+
+create_cq_exit2:
+	write_lock_irqsave(&ehca_cq_idr_lock, flags);
+	idr_remove(&ehca_cq_idr, my_cq->token);
+	write_unlock_irqrestore(&ehca_cq_idr_lock, flags);
+
+create_cq_exit1:
+	kmem_cache_free(cq_cache, my_cq);
+
+	atomic_dec(&shca->num_cqs);
+	return cq;
+}
+
+int ehca_destroy_cq(struct ib_cq *cq)
+{
+	u64 h_ret;
+	struct ehca_cq *my_cq = container_of(cq, struct ehca_cq, ib_cq);
+	int cq_num = my_cq->cq_number;
+	struct ib_device *device = cq->device;
+	struct ehca_shca *shca = container_of(device, struct ehca_shca,
+					      ib_device);
+	struct ipz_adapter_handle adapter_handle = shca->ipz_hca_handle;
+	unsigned long flags;
+
+	if (cq->uobject) {
+		if (my_cq->mm_count_galpa || my_cq->mm_count_queue) {
+			ehca_err(device, "Resources still referenced in "
+				 "user space cq_num=%x", my_cq->cq_number);
+			return -EINVAL;
+		}
+	}
+
+	/*
+	 * remove the CQ from the idr first to make sure
+	 * no more interrupt tasklets will touch this CQ
+	 */
+	write_lock_irqsave(&ehca_cq_idr_lock, flags);
+	idr_remove(&ehca_cq_idr, my_cq->token);
+	write_unlock_irqrestore(&ehca_cq_idr_lock, flags);
+
+	/* now wait until all pending events have completed */
+	wait_event(my_cq->wait_completion, !atomic_read(&my_cq->nr_events));
+
+	/* nobody's using our CQ any longer -- we can destroy it */
+	h_ret = hipz_h_destroy_cq(adapter_handle, my_cq, 0);
+	if (h_ret == H_R_STATE) {
+		/* cq in err: read err data and destroy it forcibly */
+		ehca_dbg(device, "ehca_cq=%p cq_num=%x resource=%llx in err "
+			 "state. Try to delete it forcibly.",
+			 my_cq, cq_num, my_cq->ipz_cq_handle.handle);
+		ehca_error_data(shca, my_cq, my_cq->ipz_cq_handle.handle);
+		h_ret = hipz_h_destroy_cq(adapter_handle, my_cq, 1);
+		if (h_ret == H_SUCCESS)
+			ehca_dbg(device, "cq_num=%x deleted successfully.",
+				 cq_num);
+	}
+	if (h_ret != H_SUCCESS) {
+		ehca_err(device, "hipz_h_destroy_cq() failed h_ret=%lli "
+			 "ehca_cq=%p cq_num=%x", h_ret, my_cq, cq_num);
+		return ehca2ib_return_code(h_ret);
+	}
+	ipz_queue_dtor(NULL, &my_cq->ipz_queue);
+	kmem_cache_free(cq_cache, my_cq);
+
+	atomic_dec(&shca->num_cqs);
+	return 0;
+}
+
+int ehca_resize_cq(struct ib_cq *cq, int cqe, struct ib_udata *udata)
+{
+	/* TODO: proper resize needs to be done */
+	ehca_err(cq->device, "not implemented yet");
+
+	return -EFAULT;
+}
+
+int ehca_init_cq_cache(void)
+{
+	cq_cache = kmem_cache_create("ehca_cache_cq",
+				     sizeof(struct ehca_cq), 0,
+				     SLAB_HWCACHE_ALIGN,
+				     NULL);
+	if (!cq_cache)
+		return -ENOMEM;
+	return 0;
+}
+
+void ehca_cleanup_cq_cache(void)
+{
+	if (cq_cache)
+		kmem_cache_destroy(cq_cache);
+}
diff --git a/drivers/infiniband/hw/ehca/ehca_eq.c b/drivers/infiniband/hw/ehca/ehca_eq.c
new file mode 100644
index 0000000..90da674
--- /dev/null
+++ b/drivers/infiniband/hw/ehca/ehca_eq.c
@@ -0,0 +1,189 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  Event queue handling
+ *
+ *  Authors: Waleri Fomin <fomin@de.ibm.com>
+ *           Khadija Souissi <souissi@de.ibm.com>
+ *           Reinhard Ernst <rernst@de.ibm.com>
+ *           Heiko J Schick <schickhj@de.ibm.com>
+ *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ *
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "ehca_classes.h"
+#include "ehca_irq.h"
+#include "ehca_iverbs.h"
+#include "ehca_qes.h"
+#include "hcp_if.h"
+#include "ipz_pt_fn.h"
+
+int ehca_create_eq(struct ehca_shca *shca,
+		   struct ehca_eq *eq,
+		   const enum ehca_eq_type type, const u32 length)
+{
+	int ret;
+	u64 h_ret;
+	u32 nr_pages;
+	u32 i;
+	void *vpage;
+	struct ib_device *ib_dev = &shca->ib_device;
+
+	spin_lock_init(&eq->spinlock);
+	spin_lock_init(&eq->irq_spinlock);
+	eq->is_initialized = 0;
+
+	if (type != EHCA_EQ && type != EHCA_NEQ) {
+		ehca_err(ib_dev, "Invalid EQ type %x. eq=%p", type, eq);
+		return -EINVAL;
+	}
+	if (!length) {
+		ehca_err(ib_dev, "EQ length must not be zero. eq=%p", eq);
+		return -EINVAL;
+	}
+
+	h_ret = hipz_h_alloc_resource_eq(shca->ipz_hca_handle,
+					 &eq->pf,
+					 type,
+					 length,
+					 &eq->ipz_eq_handle,
+					 &eq->length,
+					 &nr_pages, &eq->ist);
+
+	if (h_ret != H_SUCCESS) {
+		ehca_err(ib_dev, "Can't allocate EQ/NEQ. eq=%p", eq);
+		return -EINVAL;
+	}
+
+	ret = ipz_queue_ctor(NULL, &eq->ipz_queue, nr_pages,
+			     EHCA_PAGESIZE, sizeof(struct ehca_eqe), 0, 0);
+	if (!ret) {
+		ehca_err(ib_dev, "Can't allocate EQ pages eq=%p", eq);
+		goto create_eq_exit1;
+	}
+
+	for (i = 0; i < nr_pages; i++) {
+		u64 rpage;
+
+		vpage = ipz_qpageit_get_inc(&eq->ipz_queue);
+		if (!vpage)
+			goto create_eq_exit2;
+
+		rpage = __pa(vpage);
+		h_ret = hipz_h_register_rpage_eq(shca->ipz_hca_handle,
+						 eq->ipz_eq_handle,
+						 &eq->pf,
+						 0, 0, rpage, 1);
+
+		if (i == (nr_pages - 1)) {
+			/* last page */
+			vpage = ipz_qpageit_get_inc(&eq->ipz_queue);
+			if (h_ret != H_SUCCESS || vpage)
+				goto create_eq_exit2;
+		} else {
+			if (h_ret != H_PAGE_REGISTERED)
+				goto create_eq_exit2;
+		}
+	}
+
+	ipz_qeit_reset(&eq->ipz_queue);
+
+	/* register interrupt handlers and initialize work queues */
+	if (type == EHCA_EQ) {
+		tasklet_init(&eq->interrupt_task, ehca_tasklet_eq, (long)shca);
+
+		ret = ibmebus_request_irq(eq->ist, ehca_interrupt_eq,
+					  0, "ehca_eq",
+					  (void *)shca);
+		if (ret < 0)
+			ehca_err(ib_dev, "Can't map interrupt handler.");
+	} else if (type == EHCA_NEQ) {
+		tasklet_init(&eq->interrupt_task, ehca_tasklet_neq, (long)shca);
+
+		ret = ibmebus_request_irq(eq->ist, ehca_interrupt_neq,
+					  0, "ehca_neq",
+					  (void *)shca);
+		if (ret < 0)
+			ehca_err(ib_dev, "Can't map interrupt handler.");
+	}
+
+	eq->is_initialized = 1;
+
+	return 0;
+
+create_eq_exit2:
+	ipz_queue_dtor(NULL, &eq->ipz_queue);
+
+create_eq_exit1:
+	hipz_h_destroy_eq(shca->ipz_hca_handle, eq);
+
+	return -EINVAL;
+}
+
+void *ehca_poll_eq(struct ehca_shca *shca, struct ehca_eq *eq)
+{
+	unsigned long flags;
+	void *eqe;
+
+	spin_lock_irqsave(&eq->spinlock, flags);
+	eqe = ipz_eqit_eq_get_inc_valid(&eq->ipz_queue);
+	spin_unlock_irqrestore(&eq->spinlock, flags);
+
+	return eqe;
+}
+
+int ehca_destroy_eq(struct ehca_shca *shca, struct ehca_eq *eq)
+{
+	unsigned long flags;
+	u64 h_ret;
+
+	ibmebus_free_irq(eq->ist, (void *)shca);
+
+	spin_lock_irqsave(&shca_list_lock, flags);
+	eq->is_initialized = 0;
+	spin_unlock_irqrestore(&shca_list_lock, flags);
+
+	tasklet_kill(&eq->interrupt_task);
+
+	h_ret = hipz_h_destroy_eq(shca->ipz_hca_handle, eq);
+
+	if (h_ret != H_SUCCESS) {
+		ehca_err(&shca->ib_device, "Can't free EQ resources.");
+		return -EINVAL;
+	}
+	ipz_queue_dtor(NULL, &eq->ipz_queue);
+
+	return 0;
+}
diff --git a/drivers/infiniband/hw/ehca/ehca_hca.c b/drivers/infiniband/hw/ehca/ehca_hca.c
new file mode 100644
index 0000000..9ed4d25
--- /dev/null
+++ b/drivers/infiniband/hw/ehca/ehca_hca.c
@@ -0,0 +1,410 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  HCA query functions
+ *
+ *  Authors: Heiko J Schick <schickhj@de.ibm.com>
+ *           Christoph Raisch <raisch@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/gfp.h>
+
+#include "ehca_tools.h"
+#include "ehca_iverbs.h"
+#include "hcp_if.h"
+
+static unsigned int limit_uint(unsigned int value)
+{
+	return min_t(unsigned int, value, INT_MAX);
+}
+
+int ehca_query_device(struct ib_device *ibdev, struct ib_device_attr *props)
+{
+	int i, ret = 0;
+	struct ehca_shca *shca = container_of(ibdev, struct ehca_shca,
+					      ib_device);
+	struct hipz_query_hca *rblock;
+
+	static const u32 cap_mapping[] = {
+		IB_DEVICE_RESIZE_MAX_WR,      HCA_CAP_WQE_RESIZE,
+		IB_DEVICE_BAD_PKEY_CNTR,      HCA_CAP_BAD_P_KEY_CTR,
+		IB_DEVICE_BAD_QKEY_CNTR,      HCA_CAP_Q_KEY_VIOL_CTR,
+		IB_DEVICE_RAW_MULTI,          HCA_CAP_RAW_PACKET_MCAST,
+		IB_DEVICE_AUTO_PATH_MIG,      HCA_CAP_AUTO_PATH_MIG,
+		IB_DEVICE_CHANGE_PHY_PORT,    HCA_CAP_SQD_RTS_PORT_CHANGE,
+		IB_DEVICE_UD_AV_PORT_ENFORCE, HCA_CAP_AH_PORT_NR_CHECK,
+		IB_DEVICE_CURR_QP_STATE_MOD,  HCA_CAP_CUR_QP_STATE_MOD,
+		IB_DEVICE_SHUTDOWN_PORT,      HCA_CAP_SHUTDOWN_PORT,
+		IB_DEVICE_INIT_TYPE,          HCA_CAP_INIT_TYPE,
+		IB_DEVICE_PORT_ACTIVE_EVENT,  HCA_CAP_PORT_ACTIVE_EVENT,
+	};
+
+	rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+	if (!rblock) {
+		ehca_err(&shca->ib_device, "Can't allocate rblock memory.");
+		return -ENOMEM;
+	}
+
+	if (hipz_h_query_hca(shca->ipz_hca_handle, rblock) != H_SUCCESS) {
+		ehca_err(&shca->ib_device, "Can't query device properties");
+		ret = -EINVAL;
+		goto query_device1;
+	}
+
+	memset(props, 0, sizeof(struct ib_device_attr));
+	props->page_size_cap   = shca->hca_cap_mr_pgsize;
+	props->fw_ver          = rblock->hw_ver;
+	props->max_mr_size     = rblock->max_mr_size;
+	props->vendor_id       = rblock->vendor_id >> 8;
+	props->vendor_part_id  = rblock->vendor_part_id >> 16;
+	props->hw_ver          = rblock->hw_ver;
+	props->max_qp          = limit_uint(rblock->max_qp);
+	props->max_qp_wr       = limit_uint(rblock->max_wqes_wq);
+	props->max_sge         = limit_uint(rblock->max_sge);
+	props->max_sge_rd      = limit_uint(rblock->max_sge_rd);
+	props->max_cq          = limit_uint(rblock->max_cq);
+	props->max_cqe         = limit_uint(rblock->max_cqe);
+	props->max_mr          = limit_uint(rblock->max_mr);
+	props->max_mw          = limit_uint(rblock->max_mw);
+	props->max_pd          = limit_uint(rblock->max_pd);
+	props->max_ah          = limit_uint(rblock->max_ah);
+	props->max_ee          = limit_uint(rblock->max_rd_ee_context);
+	props->max_rdd         = limit_uint(rblock->max_rd_domain);
+	props->max_fmr         = limit_uint(rblock->max_mr);
+	props->max_qp_rd_atom  = limit_uint(rblock->max_rr_qp);
+	props->max_ee_rd_atom  = limit_uint(rblock->max_rr_ee_context);
+	props->max_res_rd_atom = limit_uint(rblock->max_rr_hca);
+	props->max_qp_init_rd_atom = limit_uint(rblock->max_act_wqs_qp);
+	props->max_ee_init_rd_atom = limit_uint(rblock->max_act_wqs_ee_context);
+
+	if (EHCA_BMASK_GET(HCA_CAP_SRQ, shca->hca_cap)) {
+		props->max_srq         = limit_uint(props->max_qp);
+		props->max_srq_wr      = limit_uint(props->max_qp_wr);
+		props->max_srq_sge     = 3;
+	}
+
+	props->max_pkeys           = 16;
+	/* Some FW versions say 0 here; insert sensible value in that case */
+	props->local_ca_ack_delay  = rblock->local_ca_ack_delay ?
+		min_t(u8, rblock->local_ca_ack_delay, 255) : 12;
+	props->max_raw_ipv6_qp     = limit_uint(rblock->max_raw_ipv6_qp);
+	props->max_raw_ethy_qp     = limit_uint(rblock->max_raw_ethy_qp);
+	props->max_mcast_grp       = limit_uint(rblock->max_mcast_grp);
+	props->max_mcast_qp_attach = limit_uint(rblock->max_mcast_qp_attach);
+	props->max_total_mcast_qp_attach
+		= limit_uint(rblock->max_total_mcast_qp_attach);
+
+	/* translate device capabilities */
+	props->device_cap_flags = IB_DEVICE_SYS_IMAGE_GUID |
+		IB_DEVICE_RC_RNR_NAK_GEN | IB_DEVICE_N_NOTIFY_CQ;
+	for (i = 0; i < ARRAY_SIZE(cap_mapping); i += 2)
+		if (rblock->hca_cap_indicators & cap_mapping[i + 1])
+			props->device_cap_flags |= cap_mapping[i];
+
+query_device1:
+	ehca_free_fw_ctrlblock(rblock);
+
+	return ret;
+}
+
+static enum ib_mtu map_mtu(struct ehca_shca *shca, u32 fw_mtu)
+{
+	switch (fw_mtu) {
+	case 0x1:
+		return IB_MTU_256;
+	case 0x2:
+		return IB_MTU_512;
+	case 0x3:
+		return IB_MTU_1024;
+	case 0x4:
+		return IB_MTU_2048;
+	case 0x5:
+		return IB_MTU_4096;
+	default:
+		ehca_err(&shca->ib_device, "Unknown MTU size: %x.",
+			 fw_mtu);
+		return 0;
+	}
+}
+
+static u8 map_number_of_vls(struct ehca_shca *shca, u32 vl_cap)
+{
+	switch (vl_cap) {
+	case 0x1:
+		return 1;
+	case 0x2:
+		return 2;
+	case 0x3:
+		return 4;
+	case 0x4:
+		return 8;
+	case 0x5:
+		return 15;
+	default:
+		ehca_err(&shca->ib_device, "invalid Vl Capability: %x.",
+			 vl_cap);
+		return 0;
+	}
+}
+
+int ehca_query_port(struct ib_device *ibdev,
+		    u8 port, struct ib_port_attr *props)
+{
+	int ret = 0;
+	u64 h_ret;
+	struct ehca_shca *shca = container_of(ibdev, struct ehca_shca,
+					      ib_device);
+	struct hipz_query_port *rblock;
+
+	rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+	if (!rblock) {
+		ehca_err(&shca->ib_device, "Can't allocate rblock memory.");
+		return -ENOMEM;
+	}
+
+	h_ret = hipz_h_query_port(shca->ipz_hca_handle, port, rblock);
+	if (h_ret != H_SUCCESS) {
+		ehca_err(&shca->ib_device, "Can't query port properties");
+		ret = -EINVAL;
+		goto query_port1;
+	}
+
+	memset(props, 0, sizeof(struct ib_port_attr));
+
+	props->active_mtu = props->max_mtu = map_mtu(shca, rblock->max_mtu);
+	props->port_cap_flags  = rblock->capability_mask;
+	props->gid_tbl_len     = rblock->gid_tbl_len;
+	if (rblock->max_msg_sz)
+		props->max_msg_sz      = rblock->max_msg_sz;
+	else
+		props->max_msg_sz      = 0x1 << 31;
+	props->bad_pkey_cntr   = rblock->bad_pkey_cntr;
+	props->qkey_viol_cntr  = rblock->qkey_viol_cntr;
+	props->pkey_tbl_len    = rblock->pkey_tbl_len;
+	props->lid             = rblock->lid;
+	props->sm_lid          = rblock->sm_lid;
+	props->lmc             = rblock->lmc;
+	props->sm_sl           = rblock->sm_sl;
+	props->subnet_timeout  = rblock->subnet_timeout;
+	props->init_type_reply = rblock->init_type_reply;
+	props->max_vl_num      = map_number_of_vls(shca, rblock->vl_cap);
+
+	if (rblock->state && rblock->phys_width) {
+		props->phys_state      = rblock->phys_pstate;
+		props->state           = rblock->phys_state;
+		props->active_width    = rblock->phys_width;
+		props->active_speed    = rblock->phys_speed;
+	} else {
+		/* old firmware releases don't report physical
+		 * port info, so use default values
+		 */
+		props->phys_state      = 5;
+		props->state           = rblock->state;
+		props->active_width    = IB_WIDTH_12X;
+		props->active_speed    = IB_SPEED_SDR;
+	}
+
+query_port1:
+	ehca_free_fw_ctrlblock(rblock);
+
+	return ret;
+}
+
+int ehca_query_sma_attr(struct ehca_shca *shca,
+			u8 port, struct ehca_sma_attr *attr)
+{
+	int ret = 0;
+	u64 h_ret;
+	struct hipz_query_port *rblock;
+
+	rblock = ehca_alloc_fw_ctrlblock(GFP_ATOMIC);
+	if (!rblock) {
+		ehca_err(&shca->ib_device, "Can't allocate rblock memory.");
+		return -ENOMEM;
+	}
+
+	h_ret = hipz_h_query_port(shca->ipz_hca_handle, port, rblock);
+	if (h_ret != H_SUCCESS) {
+		ehca_err(&shca->ib_device, "Can't query port properties");
+		ret = -EINVAL;
+		goto query_sma_attr1;
+	}
+
+	memset(attr, 0, sizeof(struct ehca_sma_attr));
+
+	attr->lid    = rblock->lid;
+	attr->lmc    = rblock->lmc;
+	attr->sm_sl  = rblock->sm_sl;
+	attr->sm_lid = rblock->sm_lid;
+
+	attr->pkey_tbl_len = rblock->pkey_tbl_len;
+	memcpy(attr->pkeys, rblock->pkey_entries, sizeof(attr->pkeys));
+
+query_sma_attr1:
+	ehca_free_fw_ctrlblock(rblock);
+
+	return ret;
+}
+
+int ehca_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey)
+{
+	int ret = 0;
+	u64 h_ret;
+	struct ehca_shca *shca;
+	struct hipz_query_port *rblock;
+
+	shca = container_of(ibdev, struct ehca_shca, ib_device);
+	if (index > 16) {
+		ehca_err(&shca->ib_device, "Invalid index: %x.", index);
+		return -EINVAL;
+	}
+
+	rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+	if (!rblock) {
+		ehca_err(&shca->ib_device,  "Can't allocate rblock memory.");
+		return -ENOMEM;
+	}
+
+	h_ret = hipz_h_query_port(shca->ipz_hca_handle, port, rblock);
+	if (h_ret != H_SUCCESS) {
+		ehca_err(&shca->ib_device, "Can't query port properties");
+		ret = -EINVAL;
+		goto query_pkey1;
+	}
+
+	memcpy(pkey, &rblock->pkey_entries + index, sizeof(u16));
+
+query_pkey1:
+	ehca_free_fw_ctrlblock(rblock);
+
+	return ret;
+}
+
+int ehca_query_gid(struct ib_device *ibdev, u8 port,
+		   int index, union ib_gid *gid)
+{
+	int ret = 0;
+	u64 h_ret;
+	struct ehca_shca *shca = container_of(ibdev, struct ehca_shca,
+					      ib_device);
+	struct hipz_query_port *rblock;
+
+	if (index < 0 || index > 255) {
+		ehca_err(&shca->ib_device, "Invalid index: %x.", index);
+		return -EINVAL;
+	}
+
+	rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+	if (!rblock) {
+		ehca_err(&shca->ib_device, "Can't allocate rblock memory.");
+		return -ENOMEM;
+	}
+
+	h_ret = hipz_h_query_port(shca->ipz_hca_handle, port, rblock);
+	if (h_ret != H_SUCCESS) {
+		ehca_err(&shca->ib_device, "Can't query port properties");
+		ret = -EINVAL;
+		goto query_gid1;
+	}
+
+	memcpy(&gid->raw[0], &rblock->gid_prefix, sizeof(u64));
+	memcpy(&gid->raw[8], &rblock->guid_entries[index], sizeof(u64));
+
+query_gid1:
+	ehca_free_fw_ctrlblock(rblock);
+
+	return ret;
+}
+
+static const u32 allowed_port_caps = (
+	IB_PORT_SM | IB_PORT_LED_INFO_SUP | IB_PORT_CM_SUP |
+	IB_PORT_SNMP_TUNNEL_SUP | IB_PORT_DEVICE_MGMT_SUP |
+	IB_PORT_VENDOR_CLASS_SUP);
+
+int ehca_modify_port(struct ib_device *ibdev,
+		     u8 port, int port_modify_mask,
+		     struct ib_port_modify *props)
+{
+	int ret = 0;
+	struct ehca_shca *shca;
+	struct hipz_query_port *rblock;
+	u32 cap;
+	u64 hret;
+
+	shca = container_of(ibdev, struct ehca_shca, ib_device);
+	if ((props->set_port_cap_mask | props->clr_port_cap_mask)
+	    & ~allowed_port_caps) {
+		ehca_err(&shca->ib_device, "Non-changeable bits set in masks  "
+			 "set=%x  clr=%x  allowed=%x", props->set_port_cap_mask,
+			 props->clr_port_cap_mask, allowed_port_caps);
+		return -EINVAL;
+	}
+
+	if (mutex_lock_interruptible(&shca->modify_mutex))
+		return -ERESTARTSYS;
+
+	rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+	if (!rblock) {
+		ehca_err(&shca->ib_device,  "Can't allocate rblock memory.");
+		ret = -ENOMEM;
+		goto modify_port1;
+	}
+
+	hret = hipz_h_query_port(shca->ipz_hca_handle, port, rblock);
+	if (hret != H_SUCCESS) {
+		ehca_err(&shca->ib_device, "Can't query port properties");
+		ret = -EINVAL;
+		goto modify_port2;
+	}
+
+	cap = (rblock->capability_mask | props->set_port_cap_mask)
+		& ~props->clr_port_cap_mask;
+
+	hret = hipz_h_modify_port(shca->ipz_hca_handle, port,
+				  cap, props->init_type, port_modify_mask);
+	if (hret != H_SUCCESS) {
+		ehca_err(&shca->ib_device, "Modify port failed  h_ret=%lli",
+			 hret);
+		ret = -EINVAL;
+	}
+
+modify_port2:
+	ehca_free_fw_ctrlblock(rblock);
+
+modify_port1:
+	mutex_unlock(&shca->modify_mutex);
+
+	return ret;
+}
diff --git a/drivers/infiniband/hw/ehca/ehca_irq.c b/drivers/infiniband/hw/ehca/ehca_irq.c
new file mode 100644
index 0000000..8615d7c
--- /dev/null
+++ b/drivers/infiniband/hw/ehca/ehca_irq.c
@@ -0,0 +1,870 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  Functions for EQs, NEQs and interrupts
+ *
+ *  Authors: Heiko J Schick <schickhj@de.ibm.com>
+ *           Khadija Souissi <souissi@de.ibm.com>
+ *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ *           Joachim Fenkes <fenkes@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/slab.h>
+#include <linux/smpboot.h>
+
+#include "ehca_classes.h"
+#include "ehca_irq.h"
+#include "ehca_iverbs.h"
+#include "ehca_tools.h"
+#include "hcp_if.h"
+#include "hipz_fns.h"
+#include "ipz_pt_fn.h"
+
+#define EQE_COMPLETION_EVENT   EHCA_BMASK_IBM( 1,  1)
+#define EQE_CQ_QP_NUMBER       EHCA_BMASK_IBM( 8, 31)
+#define EQE_EE_IDENTIFIER      EHCA_BMASK_IBM( 2,  7)
+#define EQE_CQ_NUMBER          EHCA_BMASK_IBM( 8, 31)
+#define EQE_QP_NUMBER          EHCA_BMASK_IBM( 8, 31)
+#define EQE_QP_TOKEN           EHCA_BMASK_IBM(32, 63)
+#define EQE_CQ_TOKEN           EHCA_BMASK_IBM(32, 63)
+
+#define NEQE_COMPLETION_EVENT  EHCA_BMASK_IBM( 1,  1)
+#define NEQE_EVENT_CODE        EHCA_BMASK_IBM( 2,  7)
+#define NEQE_PORT_NUMBER       EHCA_BMASK_IBM( 8, 15)
+#define NEQE_PORT_AVAILABILITY EHCA_BMASK_IBM(16, 16)
+#define NEQE_DISRUPTIVE        EHCA_BMASK_IBM(16, 16)
+#define NEQE_SPECIFIC_EVENT    EHCA_BMASK_IBM(16, 23)
+
+#define ERROR_DATA_LENGTH      EHCA_BMASK_IBM(52, 63)
+#define ERROR_DATA_TYPE        EHCA_BMASK_IBM( 0,  7)
+
+static void queue_comp_task(struct ehca_cq *__cq);
+
+static struct ehca_comp_pool *pool;
+
+static inline void comp_event_callback(struct ehca_cq *cq)
+{
+	if (!cq->ib_cq.comp_handler)
+		return;
+
+	spin_lock(&cq->cb_lock);
+	cq->ib_cq.comp_handler(&cq->ib_cq, cq->ib_cq.cq_context);
+	spin_unlock(&cq->cb_lock);
+
+	return;
+}
+
+static void print_error_data(struct ehca_shca *shca, void *data,
+			     u64 *rblock, int length)
+{
+	u64 type = EHCA_BMASK_GET(ERROR_DATA_TYPE, rblock[2]);
+	u64 resource = rblock[1];
+
+	switch (type) {
+	case 0x1: /* Queue Pair */
+	{
+		struct ehca_qp *qp = (struct ehca_qp *)data;
+
+		/* only print error data if AER is set */
+		if (rblock[6] == 0)
+			return;
+
+		ehca_err(&shca->ib_device,
+			 "QP 0x%x (resource=%llx) has errors.",
+			 qp->ib_qp.qp_num, resource);
+		break;
+	}
+	case 0x4: /* Completion Queue */
+	{
+		struct ehca_cq *cq = (struct ehca_cq *)data;
+
+		ehca_err(&shca->ib_device,
+			 "CQ 0x%x (resource=%llx) has errors.",
+			 cq->cq_number, resource);
+		break;
+	}
+	default:
+		ehca_err(&shca->ib_device,
+			 "Unknown error type: %llx on %s.",
+			 type, shca->ib_device.name);
+		break;
+	}
+
+	ehca_err(&shca->ib_device, "Error data is available: %llx.", resource);
+	ehca_err(&shca->ib_device, "EHCA ----- error data begin "
+		 "---------------------------------------------------");
+	ehca_dmp(rblock, length, "resource=%llx", resource);
+	ehca_err(&shca->ib_device, "EHCA ----- error data end "
+		 "----------------------------------------------------");
+
+	return;
+}
+
+int ehca_error_data(struct ehca_shca *shca, void *data,
+		    u64 resource)
+{
+
+	unsigned long ret;
+	u64 *rblock;
+	unsigned long block_count;
+
+	rblock = ehca_alloc_fw_ctrlblock(GFP_ATOMIC);
+	if (!rblock) {
+		ehca_err(&shca->ib_device, "Cannot allocate rblock memory.");
+		ret = -ENOMEM;
+		goto error_data1;
+	}
+
+	/* rblock must be 4K aligned and should be 4K large */
+	ret = hipz_h_error_data(shca->ipz_hca_handle,
+				resource,
+				rblock,
+				&block_count);
+
+	if (ret == H_R_STATE)
+		ehca_err(&shca->ib_device,
+			 "No error data is available: %llx.", resource);
+	else if (ret == H_SUCCESS) {
+		int length;
+
+		length = EHCA_BMASK_GET(ERROR_DATA_LENGTH, rblock[0]);
+
+		if (length > EHCA_PAGESIZE)
+			length = EHCA_PAGESIZE;
+
+		print_error_data(shca, data, rblock, length);
+	} else
+		ehca_err(&shca->ib_device,
+			 "Error data could not be fetched: %llx", resource);
+
+	ehca_free_fw_ctrlblock(rblock);
+
+error_data1:
+	return ret;
+
+}
+
+static void dispatch_qp_event(struct ehca_shca *shca, struct ehca_qp *qp,
+			      enum ib_event_type event_type)
+{
+	struct ib_event event;
+
+	/* PATH_MIG without the QP ever having been armed is false alarm */
+	if (event_type == IB_EVENT_PATH_MIG && !qp->mig_armed)
+		return;
+
+	event.device = &shca->ib_device;
+	event.event = event_type;
+
+	if (qp->ext_type == EQPT_SRQ) {
+		if (!qp->ib_srq.event_handler)
+			return;
+
+		event.element.srq = &qp->ib_srq;
+		qp->ib_srq.event_handler(&event, qp->ib_srq.srq_context);
+	} else {
+		if (!qp->ib_qp.event_handler)
+			return;
+
+		event.element.qp = &qp->ib_qp;
+		qp->ib_qp.event_handler(&event, qp->ib_qp.qp_context);
+	}
+}
+
+static void qp_event_callback(struct ehca_shca *shca, u64 eqe,
+			      enum ib_event_type event_type, int fatal)
+{
+	struct ehca_qp *qp;
+	u32 token = EHCA_BMASK_GET(EQE_QP_TOKEN, eqe);
+
+	read_lock(&ehca_qp_idr_lock);
+	qp = idr_find(&ehca_qp_idr, token);
+	if (qp)
+		atomic_inc(&qp->nr_events);
+	read_unlock(&ehca_qp_idr_lock);
+
+	if (!qp)
+		return;
+
+	if (fatal)
+		ehca_error_data(shca, qp, qp->ipz_qp_handle.handle);
+
+	dispatch_qp_event(shca, qp, fatal && qp->ext_type == EQPT_SRQ ?
+			  IB_EVENT_SRQ_ERR : event_type);
+
+	/*
+	 * eHCA only processes one WQE at a time for SRQ base QPs,
+	 * so the last WQE has been processed as soon as the QP enters
+	 * error state.
+	 */
+	if (fatal && qp->ext_type == EQPT_SRQBASE)
+		dispatch_qp_event(shca, qp, IB_EVENT_QP_LAST_WQE_REACHED);
+
+	if (atomic_dec_and_test(&qp->nr_events))
+		wake_up(&qp->wait_completion);
+	return;
+}
+
+static void cq_event_callback(struct ehca_shca *shca,
+			      u64 eqe)
+{
+	struct ehca_cq *cq;
+	u32 token = EHCA_BMASK_GET(EQE_CQ_TOKEN, eqe);
+
+	read_lock(&ehca_cq_idr_lock);
+	cq = idr_find(&ehca_cq_idr, token);
+	if (cq)
+		atomic_inc(&cq->nr_events);
+	read_unlock(&ehca_cq_idr_lock);
+
+	if (!cq)
+		return;
+
+	ehca_error_data(shca, cq, cq->ipz_cq_handle.handle);
+
+	if (atomic_dec_and_test(&cq->nr_events))
+		wake_up(&cq->wait_completion);
+
+	return;
+}
+
+static void parse_identifier(struct ehca_shca *shca, u64 eqe)
+{
+	u8 identifier = EHCA_BMASK_GET(EQE_EE_IDENTIFIER, eqe);
+
+	switch (identifier) {
+	case 0x02: /* path migrated */
+		qp_event_callback(shca, eqe, IB_EVENT_PATH_MIG, 0);
+		break;
+	case 0x03: /* communication established */
+		qp_event_callback(shca, eqe, IB_EVENT_COMM_EST, 0);
+		break;
+	case 0x04: /* send queue drained */
+		qp_event_callback(shca, eqe, IB_EVENT_SQ_DRAINED, 0);
+		break;
+	case 0x05: /* QP error */
+	case 0x06: /* QP error */
+		qp_event_callback(shca, eqe, IB_EVENT_QP_FATAL, 1);
+		break;
+	case 0x07: /* CQ error */
+	case 0x08: /* CQ error */
+		cq_event_callback(shca, eqe);
+		break;
+	case 0x09: /* MRMWPTE error */
+		ehca_err(&shca->ib_device, "MRMWPTE error.");
+		break;
+	case 0x0A: /* port event */
+		ehca_err(&shca->ib_device, "Port event.");
+		break;
+	case 0x0B: /* MR access error */
+		ehca_err(&shca->ib_device, "MR access error.");
+		break;
+	case 0x0C: /* EQ error */
+		ehca_err(&shca->ib_device, "EQ error.");
+		break;
+	case 0x0D: /* P/Q_Key mismatch */
+		ehca_err(&shca->ib_device, "P/Q_Key mismatch.");
+		break;
+	case 0x10: /* sampling complete */
+		ehca_err(&shca->ib_device, "Sampling complete.");
+		break;
+	case 0x11: /* unaffiliated access error */
+		ehca_err(&shca->ib_device, "Unaffiliated access error.");
+		break;
+	case 0x12: /* path migrating */
+		ehca_err(&shca->ib_device, "Path migrating.");
+		break;
+	case 0x13: /* interface trace stopped */
+		ehca_err(&shca->ib_device, "Interface trace stopped.");
+		break;
+	case 0x14: /* first error capture info available */
+		ehca_info(&shca->ib_device, "First error capture available");
+		break;
+	case 0x15: /* SRQ limit reached */
+		qp_event_callback(shca, eqe, IB_EVENT_SRQ_LIMIT_REACHED, 0);
+		break;
+	default:
+		ehca_err(&shca->ib_device, "Unknown identifier: %x on %s.",
+			 identifier, shca->ib_device.name);
+		break;
+	}
+
+	return;
+}
+
+static void dispatch_port_event(struct ehca_shca *shca, int port_num,
+				enum ib_event_type type, const char *msg)
+{
+	struct ib_event event;
+
+	ehca_info(&shca->ib_device, "port %d %s.", port_num, msg);
+	event.device = &shca->ib_device;
+	event.event = type;
+	event.element.port_num = port_num;
+	ib_dispatch_event(&event);
+}
+
+static void notify_port_conf_change(struct ehca_shca *shca, int port_num)
+{
+	struct ehca_sma_attr  new_attr;
+	struct ehca_sma_attr *old_attr = &shca->sport[port_num - 1].saved_attr;
+
+	ehca_query_sma_attr(shca, port_num, &new_attr);
+
+	if (new_attr.sm_sl  != old_attr->sm_sl ||
+	    new_attr.sm_lid != old_attr->sm_lid)
+		dispatch_port_event(shca, port_num, IB_EVENT_SM_CHANGE,
+				    "SM changed");
+
+	if (new_attr.lid != old_attr->lid ||
+	    new_attr.lmc != old_attr->lmc)
+		dispatch_port_event(shca, port_num, IB_EVENT_LID_CHANGE,
+				    "LID changed");
+
+	if (new_attr.pkey_tbl_len != old_attr->pkey_tbl_len ||
+	    memcmp(new_attr.pkeys, old_attr->pkeys,
+		   sizeof(u16) * new_attr.pkey_tbl_len))
+		dispatch_port_event(shca, port_num, IB_EVENT_PKEY_CHANGE,
+				    "P_Key changed");
+
+	*old_attr = new_attr;
+}
+
+/* replay modify_qp for sqps -- return 0 if all is well, 1 if AQP1 destroyed */
+static int replay_modify_qp(struct ehca_sport *sport)
+{
+	int aqp1_destroyed;
+	unsigned long flags;
+
+	spin_lock_irqsave(&sport->mod_sqp_lock, flags);
+
+	aqp1_destroyed = !sport->ibqp_sqp[IB_QPT_GSI];
+
+	if (sport->ibqp_sqp[IB_QPT_SMI])
+		ehca_recover_sqp(sport->ibqp_sqp[IB_QPT_SMI]);
+	if (!aqp1_destroyed)
+		ehca_recover_sqp(sport->ibqp_sqp[IB_QPT_GSI]);
+
+	spin_unlock_irqrestore(&sport->mod_sqp_lock, flags);
+
+	return aqp1_destroyed;
+}
+
+static void parse_ec(struct ehca_shca *shca, u64 eqe)
+{
+	u8 ec   = EHCA_BMASK_GET(NEQE_EVENT_CODE, eqe);
+	u8 port = EHCA_BMASK_GET(NEQE_PORT_NUMBER, eqe);
+	u8 spec_event;
+	struct ehca_sport *sport = &shca->sport[port - 1];
+
+	switch (ec) {
+	case 0x30: /* port availability change */
+		if (EHCA_BMASK_GET(NEQE_PORT_AVAILABILITY, eqe)) {
+			/* only replay modify_qp calls in autodetect mode;
+			 * if AQP1 was destroyed, the port is already down
+			 * again and we can drop the event.
+			 */
+			if (ehca_nr_ports < 0)
+				if (replay_modify_qp(sport))
+					break;
+
+			sport->port_state = IB_PORT_ACTIVE;
+			dispatch_port_event(shca, port, IB_EVENT_PORT_ACTIVE,
+					    "is active");
+			ehca_query_sma_attr(shca, port, &sport->saved_attr);
+		} else {
+			sport->port_state = IB_PORT_DOWN;
+			dispatch_port_event(shca, port, IB_EVENT_PORT_ERR,
+					    "is inactive");
+		}
+		break;
+	case 0x31:
+		/* port configuration change
+		 * disruptive change is caused by
+		 * LID, PKEY or SM change
+		 */
+		if (EHCA_BMASK_GET(NEQE_DISRUPTIVE, eqe)) {
+			ehca_warn(&shca->ib_device, "disruptive port "
+				  "%d configuration change", port);
+
+			sport->port_state = IB_PORT_DOWN;
+			dispatch_port_event(shca, port, IB_EVENT_PORT_ERR,
+					    "is inactive");
+
+			sport->port_state = IB_PORT_ACTIVE;
+			dispatch_port_event(shca, port, IB_EVENT_PORT_ACTIVE,
+					    "is active");
+			ehca_query_sma_attr(shca, port,
+					    &sport->saved_attr);
+		} else
+			notify_port_conf_change(shca, port);
+		break;
+	case 0x32: /* adapter malfunction */
+		ehca_err(&shca->ib_device, "Adapter malfunction.");
+		break;
+	case 0x33:  /* trace stopped */
+		ehca_err(&shca->ib_device, "Traced stopped.");
+		break;
+	case 0x34: /* util async event */
+		spec_event = EHCA_BMASK_GET(NEQE_SPECIFIC_EVENT, eqe);
+		if (spec_event == 0x80) /* client reregister required */
+			dispatch_port_event(shca, port,
+					    IB_EVENT_CLIENT_REREGISTER,
+					    "client reregister req.");
+		else
+			ehca_warn(&shca->ib_device, "Unknown util async "
+				  "event %x on port %x", spec_event, port);
+		break;
+	default:
+		ehca_err(&shca->ib_device, "Unknown event code: %x on %s.",
+			 ec, shca->ib_device.name);
+		break;
+	}
+
+	return;
+}
+
+static inline void reset_eq_pending(struct ehca_cq *cq)
+{
+	u64 CQx_EP;
+	struct h_galpa gal = cq->galpas.kernel;
+
+	hipz_galpa_store_cq(gal, cqx_ep, 0x0);
+	CQx_EP = hipz_galpa_load(gal, CQTEMM_OFFSET(cqx_ep));
+
+	return;
+}
+
+irqreturn_t ehca_interrupt_neq(int irq, void *dev_id)
+{
+	struct ehca_shca *shca = (struct ehca_shca*)dev_id;
+
+	tasklet_hi_schedule(&shca->neq.interrupt_task);
+
+	return IRQ_HANDLED;
+}
+
+void ehca_tasklet_neq(unsigned long data)
+{
+	struct ehca_shca *shca = (struct ehca_shca*)data;
+	struct ehca_eqe *eqe;
+	u64 ret;
+
+	eqe = ehca_poll_eq(shca, &shca->neq);
+
+	while (eqe) {
+		if (!EHCA_BMASK_GET(NEQE_COMPLETION_EVENT, eqe->entry))
+			parse_ec(shca, eqe->entry);
+
+		eqe = ehca_poll_eq(shca, &shca->neq);
+	}
+
+	ret = hipz_h_reset_event(shca->ipz_hca_handle,
+				 shca->neq.ipz_eq_handle, 0xFFFFFFFFFFFFFFFFL);
+
+	if (ret != H_SUCCESS)
+		ehca_err(&shca->ib_device, "Can't clear notification events.");
+
+	return;
+}
+
+irqreturn_t ehca_interrupt_eq(int irq, void *dev_id)
+{
+	struct ehca_shca *shca = (struct ehca_shca*)dev_id;
+
+	tasklet_hi_schedule(&shca->eq.interrupt_task);
+
+	return IRQ_HANDLED;
+}
+
+
+static inline void process_eqe(struct ehca_shca *shca, struct ehca_eqe *eqe)
+{
+	u64 eqe_value;
+	u32 token;
+	struct ehca_cq *cq;
+
+	eqe_value = eqe->entry;
+	ehca_dbg(&shca->ib_device, "eqe_value=%llx", eqe_value);
+	if (EHCA_BMASK_GET(EQE_COMPLETION_EVENT, eqe_value)) {
+		ehca_dbg(&shca->ib_device, "Got completion event");
+		token = EHCA_BMASK_GET(EQE_CQ_TOKEN, eqe_value);
+		read_lock(&ehca_cq_idr_lock);
+		cq = idr_find(&ehca_cq_idr, token);
+		if (cq)
+			atomic_inc(&cq->nr_events);
+		read_unlock(&ehca_cq_idr_lock);
+		if (cq == NULL) {
+			ehca_err(&shca->ib_device,
+				 "Invalid eqe for non-existing cq token=%x",
+				 token);
+			return;
+		}
+		reset_eq_pending(cq);
+		if (ehca_scaling_code)
+			queue_comp_task(cq);
+		else {
+			comp_event_callback(cq);
+			if (atomic_dec_and_test(&cq->nr_events))
+				wake_up(&cq->wait_completion);
+		}
+	} else {
+		ehca_dbg(&shca->ib_device, "Got non completion event");
+		parse_identifier(shca, eqe_value);
+	}
+}
+
+void ehca_process_eq(struct ehca_shca *shca, int is_irq)
+{
+	struct ehca_eq *eq = &shca->eq;
+	struct ehca_eqe_cache_entry *eqe_cache = eq->eqe_cache;
+	u64 eqe_value, ret;
+	int eqe_cnt, i;
+	int eq_empty = 0;
+
+	spin_lock(&eq->irq_spinlock);
+	if (is_irq) {
+		const int max_query_cnt = 100;
+		int query_cnt = 0;
+		int int_state = 1;
+		do {
+			int_state = hipz_h_query_int_state(
+				shca->ipz_hca_handle, eq->ist);
+			query_cnt++;
+			iosync();
+		} while (int_state && query_cnt < max_query_cnt);
+		if (unlikely((query_cnt == max_query_cnt)))
+			ehca_dbg(&shca->ib_device, "int_state=%x query_cnt=%x",
+				 int_state, query_cnt);
+	}
+
+	/* read out all eqes */
+	eqe_cnt = 0;
+	do {
+		u32 token;
+		eqe_cache[eqe_cnt].eqe = ehca_poll_eq(shca, eq);
+		if (!eqe_cache[eqe_cnt].eqe)
+			break;
+		eqe_value = eqe_cache[eqe_cnt].eqe->entry;
+		if (EHCA_BMASK_GET(EQE_COMPLETION_EVENT, eqe_value)) {
+			token = EHCA_BMASK_GET(EQE_CQ_TOKEN, eqe_value);
+			read_lock(&ehca_cq_idr_lock);
+			eqe_cache[eqe_cnt].cq = idr_find(&ehca_cq_idr, token);
+			if (eqe_cache[eqe_cnt].cq)
+				atomic_inc(&eqe_cache[eqe_cnt].cq->nr_events);
+			read_unlock(&ehca_cq_idr_lock);
+			if (!eqe_cache[eqe_cnt].cq) {
+				ehca_err(&shca->ib_device,
+					 "Invalid eqe for non-existing cq "
+					 "token=%x", token);
+				continue;
+			}
+		} else
+			eqe_cache[eqe_cnt].cq = NULL;
+		eqe_cnt++;
+	} while (eqe_cnt < EHCA_EQE_CACHE_SIZE);
+	if (!eqe_cnt) {
+		if (is_irq)
+			ehca_dbg(&shca->ib_device,
+				 "No eqe found for irq event");
+		goto unlock_irq_spinlock;
+	} else if (!is_irq) {
+		ret = hipz_h_eoi(eq->ist);
+		if (ret != H_SUCCESS)
+			ehca_err(&shca->ib_device,
+				 "bad return code EOI -rc = %lld\n", ret);
+		ehca_dbg(&shca->ib_device, "deadman found %x eqe", eqe_cnt);
+	}
+	if (unlikely(eqe_cnt == EHCA_EQE_CACHE_SIZE))
+		ehca_dbg(&shca->ib_device, "too many eqes for one irq event");
+	/* enable irq for new packets */
+	for (i = 0; i < eqe_cnt; i++) {
+		if (eq->eqe_cache[i].cq)
+			reset_eq_pending(eq->eqe_cache[i].cq);
+	}
+	/* check eq */
+	spin_lock(&eq->spinlock);
+	eq_empty = (!ipz_eqit_eq_peek_valid(&shca->eq.ipz_queue));
+	spin_unlock(&eq->spinlock);
+	/* call completion handler for cached eqes */
+	for (i = 0; i < eqe_cnt; i++)
+		if (eq->eqe_cache[i].cq) {
+			if (ehca_scaling_code)
+				queue_comp_task(eq->eqe_cache[i].cq);
+			else {
+				struct ehca_cq *cq = eq->eqe_cache[i].cq;
+				comp_event_callback(cq);
+				if (atomic_dec_and_test(&cq->nr_events))
+					wake_up(&cq->wait_completion);
+			}
+		} else {
+			ehca_dbg(&shca->ib_device, "Got non completion event");
+			parse_identifier(shca, eq->eqe_cache[i].eqe->entry);
+		}
+	/* poll eq if not empty */
+	if (eq_empty)
+		goto unlock_irq_spinlock;
+	do {
+		struct ehca_eqe *eqe;
+		eqe = ehca_poll_eq(shca, &shca->eq);
+		if (!eqe)
+			break;
+		process_eqe(shca, eqe);
+	} while (1);
+
+unlock_irq_spinlock:
+	spin_unlock(&eq->irq_spinlock);
+}
+
+void ehca_tasklet_eq(unsigned long data)
+{
+	ehca_process_eq((struct ehca_shca*)data, 1);
+}
+
+static int find_next_online_cpu(struct ehca_comp_pool *pool)
+{
+	int cpu;
+	unsigned long flags;
+
+	WARN_ON_ONCE(!in_interrupt());
+	if (ehca_debug_level >= 3)
+		ehca_dmp(cpu_online_mask, cpumask_size(), "");
+
+	spin_lock_irqsave(&pool->last_cpu_lock, flags);
+	do {
+		cpu = cpumask_next(pool->last_cpu, cpu_online_mask);
+		if (cpu >= nr_cpu_ids)
+			cpu = cpumask_first(cpu_online_mask);
+		pool->last_cpu = cpu;
+	} while (!per_cpu_ptr(pool->cpu_comp_tasks, cpu)->active);
+	spin_unlock_irqrestore(&pool->last_cpu_lock, flags);
+
+	return cpu;
+}
+
+static void __queue_comp_task(struct ehca_cq *__cq,
+			      struct ehca_cpu_comp_task *cct,
+			      struct task_struct *thread)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&cct->task_lock, flags);
+	spin_lock(&__cq->task_lock);
+
+	if (__cq->nr_callbacks == 0) {
+		__cq->nr_callbacks++;
+		list_add_tail(&__cq->entry, &cct->cq_list);
+		cct->cq_jobs++;
+		wake_up_process(thread);
+	} else
+		__cq->nr_callbacks++;
+
+	spin_unlock(&__cq->task_lock);
+	spin_unlock_irqrestore(&cct->task_lock, flags);
+}
+
+static void queue_comp_task(struct ehca_cq *__cq)
+{
+	int cpu_id;
+	struct ehca_cpu_comp_task *cct;
+	struct task_struct *thread;
+	int cq_jobs;
+	unsigned long flags;
+
+	cpu_id = find_next_online_cpu(pool);
+	BUG_ON(!cpu_online(cpu_id));
+
+	cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu_id);
+	thread = *per_cpu_ptr(pool->cpu_comp_threads, cpu_id);
+	BUG_ON(!cct || !thread);
+
+	spin_lock_irqsave(&cct->task_lock, flags);
+	cq_jobs = cct->cq_jobs;
+	spin_unlock_irqrestore(&cct->task_lock, flags);
+	if (cq_jobs > 0) {
+		cpu_id = find_next_online_cpu(pool);
+		cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu_id);
+		thread = *per_cpu_ptr(pool->cpu_comp_threads, cpu_id);
+		BUG_ON(!cct || !thread);
+	}
+	__queue_comp_task(__cq, cct, thread);
+}
+
+static void run_comp_task(struct ehca_cpu_comp_task *cct)
+{
+	struct ehca_cq *cq;
+
+	while (!list_empty(&cct->cq_list)) {
+		cq = list_entry(cct->cq_list.next, struct ehca_cq, entry);
+		spin_unlock_irq(&cct->task_lock);
+
+		comp_event_callback(cq);
+		if (atomic_dec_and_test(&cq->nr_events))
+			wake_up(&cq->wait_completion);
+
+		spin_lock_irq(&cct->task_lock);
+		spin_lock(&cq->task_lock);
+		cq->nr_callbacks--;
+		if (!cq->nr_callbacks) {
+			list_del_init(cct->cq_list.next);
+			cct->cq_jobs--;
+		}
+		spin_unlock(&cq->task_lock);
+	}
+}
+
+static void comp_task_park(unsigned int cpu)
+{
+	struct ehca_cpu_comp_task *cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
+	struct ehca_cpu_comp_task *target;
+	struct task_struct *thread;
+	struct ehca_cq *cq, *tmp;
+	LIST_HEAD(list);
+
+	spin_lock_irq(&cct->task_lock);
+	cct->cq_jobs = 0;
+	cct->active = 0;
+	list_splice_init(&cct->cq_list, &list);
+	spin_unlock_irq(&cct->task_lock);
+
+	cpu = find_next_online_cpu(pool);
+	target = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
+	thread = *per_cpu_ptr(pool->cpu_comp_threads, cpu);
+	spin_lock_irq(&target->task_lock);
+	list_for_each_entry_safe(cq, tmp, &list, entry) {
+		list_del(&cq->entry);
+		__queue_comp_task(cq, target, thread);
+	}
+	spin_unlock_irq(&target->task_lock);
+}
+
+static void comp_task_stop(unsigned int cpu, bool online)
+{
+	struct ehca_cpu_comp_task *cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
+
+	spin_lock_irq(&cct->task_lock);
+	cct->cq_jobs = 0;
+	cct->active = 0;
+	WARN_ON(!list_empty(&cct->cq_list));
+	spin_unlock_irq(&cct->task_lock);
+}
+
+static int comp_task_should_run(unsigned int cpu)
+{
+	struct ehca_cpu_comp_task *cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
+
+	return cct->cq_jobs;
+}
+
+static void comp_task(unsigned int cpu)
+{
+	struct ehca_cpu_comp_task *cct = this_cpu_ptr(pool->cpu_comp_tasks);
+	int cql_empty;
+
+	spin_lock_irq(&cct->task_lock);
+	cql_empty = list_empty(&cct->cq_list);
+	if (!cql_empty) {
+		__set_current_state(TASK_RUNNING);
+		run_comp_task(cct);
+	}
+	spin_unlock_irq(&cct->task_lock);
+}
+
+static struct smp_hotplug_thread comp_pool_threads = {
+	.thread_should_run	= comp_task_should_run,
+	.thread_fn		= comp_task,
+	.thread_comm		= "ehca_comp/%u",
+	.cleanup		= comp_task_stop,
+	.park			= comp_task_park,
+};
+
+int ehca_create_comp_pool(void)
+{
+	int cpu, ret = -ENOMEM;
+
+	if (!ehca_scaling_code)
+		return 0;
+
+	pool = kzalloc(sizeof(struct ehca_comp_pool), GFP_KERNEL);
+	if (pool == NULL)
+		return -ENOMEM;
+
+	spin_lock_init(&pool->last_cpu_lock);
+	pool->last_cpu = cpumask_any(cpu_online_mask);
+
+	pool->cpu_comp_tasks = alloc_percpu(struct ehca_cpu_comp_task);
+	if (!pool->cpu_comp_tasks)
+		goto out_pool;
+
+	pool->cpu_comp_threads = alloc_percpu(struct task_struct *);
+	if (!pool->cpu_comp_threads)
+		goto out_tasks;
+
+	for_each_present_cpu(cpu) {
+		struct ehca_cpu_comp_task *cct;
+
+		cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
+		spin_lock_init(&cct->task_lock);
+		INIT_LIST_HEAD(&cct->cq_list);
+	}
+
+	comp_pool_threads.store = pool->cpu_comp_threads;
+	ret = smpboot_register_percpu_thread(&comp_pool_threads);
+	if (ret)
+		goto out_threads;
+
+	pr_info("eHCA scaling code enabled\n");
+	return ret;
+
+out_threads:
+	free_percpu(pool->cpu_comp_threads);
+out_tasks:
+	free_percpu(pool->cpu_comp_tasks);
+out_pool:
+	kfree(pool);
+	return ret;
+}
+
+void ehca_destroy_comp_pool(void)
+{
+	if (!ehca_scaling_code)
+		return;
+
+	smpboot_unregister_percpu_thread(&comp_pool_threads);
+
+	free_percpu(pool->cpu_comp_threads);
+	free_percpu(pool->cpu_comp_tasks);
+	kfree(pool);
+}
diff --git a/drivers/infiniband/hw/ehca/ehca_irq.h b/drivers/infiniband/hw/ehca/ehca_irq.h
new file mode 100644
index 0000000..5370199
--- /dev/null
+++ b/drivers/infiniband/hw/ehca/ehca_irq.h
@@ -0,0 +1,77 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  Function definitions and structs for EQs, NEQs and interrupts
+ *
+ *  Authors: Heiko J Schick <schickhj@de.ibm.com>
+ *           Khadija Souissi <souissi@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __EHCA_IRQ_H
+#define __EHCA_IRQ_H
+
+
+struct ehca_shca;
+
+#include <linux/interrupt.h>
+#include <linux/types.h>
+
+int ehca_error_data(struct ehca_shca *shca, void *data, u64 resource);
+
+irqreturn_t ehca_interrupt_neq(int irq, void *dev_id);
+void ehca_tasklet_neq(unsigned long data);
+
+irqreturn_t ehca_interrupt_eq(int irq, void *dev_id);
+void ehca_tasklet_eq(unsigned long data);
+void ehca_process_eq(struct ehca_shca *shca, int is_irq);
+
+struct ehca_cpu_comp_task {
+	struct list_head cq_list;
+	spinlock_t task_lock;
+	int cq_jobs;
+	int active;
+};
+
+struct ehca_comp_pool {
+	struct ehca_cpu_comp_task __percpu *cpu_comp_tasks;
+	struct task_struct * __percpu *cpu_comp_threads;
+	int last_cpu;
+	spinlock_t last_cpu_lock;
+};
+
+int ehca_create_comp_pool(void);
+void ehca_destroy_comp_pool(void);
+
+#endif
diff --git a/drivers/infiniband/hw/ehca/ehca_iverbs.h b/drivers/infiniband/hw/ehca/ehca_iverbs.h
new file mode 100644
index 0000000..22f79af
--- /dev/null
+++ b/drivers/infiniband/hw/ehca/ehca_iverbs.h
@@ -0,0 +1,212 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  Function definitions for internal functions
+ *
+ *  Authors: Heiko J Schick <schickhj@de.ibm.com>
+ *           Dietmar Decker <ddecker@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __EHCA_IVERBS_H__
+#define __EHCA_IVERBS_H__
+
+#include "ehca_classes.h"
+
+int ehca_query_device(struct ib_device *ibdev, struct ib_device_attr *props);
+
+int ehca_query_port(struct ib_device *ibdev, u8 port,
+		    struct ib_port_attr *props);
+
+int ehca_query_sma_attr(struct ehca_shca *shca, u8 port,
+			struct ehca_sma_attr *attr);
+
+int ehca_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 * pkey);
+
+int ehca_query_gid(struct ib_device *ibdev, u8 port, int index,
+		   union ib_gid *gid);
+
+int ehca_modify_port(struct ib_device *ibdev, u8 port, int port_modify_mask,
+		     struct ib_port_modify *props);
+
+struct ib_pd *ehca_alloc_pd(struct ib_device *device,
+			    struct ib_ucontext *context,
+			    struct ib_udata *udata);
+
+int ehca_dealloc_pd(struct ib_pd *pd);
+
+struct ib_ah *ehca_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr);
+
+int ehca_modify_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr);
+
+int ehca_query_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr);
+
+int ehca_destroy_ah(struct ib_ah *ah);
+
+struct ib_mr *ehca_get_dma_mr(struct ib_pd *pd, int mr_access_flags);
+
+struct ib_mr *ehca_reg_phys_mr(struct ib_pd *pd,
+			       struct ib_phys_buf *phys_buf_array,
+			       int num_phys_buf,
+			       int mr_access_flags, u64 *iova_start);
+
+struct ib_mr *ehca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+			       u64 virt, int mr_access_flags,
+			       struct ib_udata *udata);
+
+int ehca_rereg_phys_mr(struct ib_mr *mr,
+		       int mr_rereg_mask,
+		       struct ib_pd *pd,
+		       struct ib_phys_buf *phys_buf_array,
+		       int num_phys_buf, int mr_access_flags, u64 *iova_start);
+
+int ehca_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr);
+
+int ehca_dereg_mr(struct ib_mr *mr);
+
+struct ib_mw *ehca_alloc_mw(struct ib_pd *pd, enum ib_mw_type type);
+
+int ehca_bind_mw(struct ib_qp *qp, struct ib_mw *mw,
+		 struct ib_mw_bind *mw_bind);
+
+int ehca_dealloc_mw(struct ib_mw *mw);
+
+struct ib_fmr *ehca_alloc_fmr(struct ib_pd *pd,
+			      int mr_access_flags,
+			      struct ib_fmr_attr *fmr_attr);
+
+int ehca_map_phys_fmr(struct ib_fmr *fmr,
+		      u64 *page_list, int list_len, u64 iova);
+
+int ehca_unmap_fmr(struct list_head *fmr_list);
+
+int ehca_dealloc_fmr(struct ib_fmr *fmr);
+
+enum ehca_eq_type {
+	EHCA_EQ = 0, /* Event Queue              */
+	EHCA_NEQ     /* Notification Event Queue */
+};
+
+int ehca_create_eq(struct ehca_shca *shca, struct ehca_eq *eq,
+		   enum ehca_eq_type type, const u32 length);
+
+int ehca_destroy_eq(struct ehca_shca *shca, struct ehca_eq *eq);
+
+void *ehca_poll_eq(struct ehca_shca *shca, struct ehca_eq *eq);
+
+
+struct ib_cq *ehca_create_cq(struct ib_device *device, int cqe, int comp_vector,
+			     struct ib_ucontext *context,
+			     struct ib_udata *udata);
+
+int ehca_destroy_cq(struct ib_cq *cq);
+
+int ehca_resize_cq(struct ib_cq *cq, int cqe, struct ib_udata *udata);
+
+int ehca_poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc);
+
+int ehca_peek_cq(struct ib_cq *cq, int wc_cnt);
+
+int ehca_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags notify_flags);
+
+struct ib_qp *ehca_create_qp(struct ib_pd *pd,
+			     struct ib_qp_init_attr *init_attr,
+			     struct ib_udata *udata);
+
+int ehca_destroy_qp(struct ib_qp *qp);
+
+int ehca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask,
+		   struct ib_udata *udata);
+
+int ehca_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr,
+		  int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr);
+
+int ehca_post_send(struct ib_qp *qp, struct ib_send_wr *send_wr,
+		   struct ib_send_wr **bad_send_wr);
+
+int ehca_post_recv(struct ib_qp *qp, struct ib_recv_wr *recv_wr,
+		   struct ib_recv_wr **bad_recv_wr);
+
+int ehca_post_srq_recv(struct ib_srq *srq,
+		       struct ib_recv_wr *recv_wr,
+		       struct ib_recv_wr **bad_recv_wr);
+
+struct ib_srq *ehca_create_srq(struct ib_pd *pd,
+			       struct ib_srq_init_attr *init_attr,
+			       struct ib_udata *udata);
+
+int ehca_modify_srq(struct ib_srq *srq, struct ib_srq_attr *attr,
+		    enum ib_srq_attr_mask attr_mask, struct ib_udata *udata);
+
+int ehca_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr);
+
+int ehca_destroy_srq(struct ib_srq *srq);
+
+u64 ehca_define_sqp(struct ehca_shca *shca, struct ehca_qp *ibqp,
+		    struct ib_qp_init_attr *qp_init_attr);
+
+int ehca_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid);
+
+int ehca_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid);
+
+struct ib_ucontext *ehca_alloc_ucontext(struct ib_device *device,
+					struct ib_udata *udata);
+
+int ehca_dealloc_ucontext(struct ib_ucontext *context);
+
+int ehca_mmap(struct ib_ucontext *context, struct vm_area_struct *vma);
+
+int ehca_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
+		     struct ib_wc *in_wc, struct ib_grh *in_grh,
+		     struct ib_mad *in_mad,
+		     struct ib_mad *out_mad);
+
+void ehca_poll_eqs(unsigned long data);
+
+int ehca_calc_ipd(struct ehca_shca *shca, int port,
+		  enum ib_rate path_rate, u32 *ipd);
+
+void ehca_add_to_err_list(struct ehca_qp *qp, int on_sq);
+
+#ifdef CONFIG_PPC_64K_PAGES
+void *ehca_alloc_fw_ctrlblock(gfp_t flags);
+void ehca_free_fw_ctrlblock(void *ptr);
+#else
+#define ehca_alloc_fw_ctrlblock(flags) ((void *)get_zeroed_page(flags))
+#define ehca_free_fw_ctrlblock(ptr) free_page((unsigned long)(ptr))
+#endif
+
+void ehca_recover_sqp(struct ib_qp *sqp);
+
+#endif
diff --git a/drivers/infiniband/hw/ehca/ehca_main.c b/drivers/infiniband/hw/ehca/ehca_main.c
new file mode 100644
index 0000000..cd8d290
--- /dev/null
+++ b/drivers/infiniband/hw/ehca/ehca_main.c
@@ -0,0 +1,1100 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  module start stop, hca detection
+ *
+ *  Authors: Heiko J Schick <schickhj@de.ibm.com>
+ *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ *           Joachim Fenkes <fenkes@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef CONFIG_PPC_64K_PAGES
+#include <linux/slab.h>
+#endif
+
+#include <linux/notifier.h>
+#include <linux/memory.h>
+#include "ehca_classes.h"
+#include "ehca_iverbs.h"
+#include "ehca_mrmw.h"
+#include "ehca_tools.h"
+#include "hcp_if.h"
+
+#define HCAD_VERSION "0029"
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_AUTHOR("Christoph Raisch <raisch@de.ibm.com>");
+MODULE_DESCRIPTION("IBM eServer HCA InfiniBand Device Driver");
+MODULE_VERSION(HCAD_VERSION);
+
+static bool ehca_open_aqp1    = 0;
+static int ehca_hw_level      = 0;
+static bool ehca_poll_all_eqs = 1;
+
+int ehca_debug_level   = 0;
+int ehca_nr_ports      = -1;
+bool ehca_use_hp_mr    = 0;
+int ehca_port_act_time = 30;
+int ehca_static_rate   = -1;
+bool ehca_scaling_code = 0;
+int ehca_lock_hcalls   = -1;
+int ehca_max_cq        = -1;
+int ehca_max_qp        = -1;
+
+module_param_named(open_aqp1,     ehca_open_aqp1,     bool, S_IRUGO);
+module_param_named(debug_level,   ehca_debug_level,   int,  S_IRUGO);
+module_param_named(hw_level,      ehca_hw_level,      int,  S_IRUGO);
+module_param_named(nr_ports,      ehca_nr_ports,      int,  S_IRUGO);
+module_param_named(use_hp_mr,     ehca_use_hp_mr,     bool, S_IRUGO);
+module_param_named(port_act_time, ehca_port_act_time, int,  S_IRUGO);
+module_param_named(poll_all_eqs,  ehca_poll_all_eqs,  bool, S_IRUGO);
+module_param_named(static_rate,   ehca_static_rate,   int,  S_IRUGO);
+module_param_named(scaling_code,  ehca_scaling_code,  bool, S_IRUGO);
+module_param_named(lock_hcalls,   ehca_lock_hcalls,   bint, S_IRUGO);
+module_param_named(number_of_cqs, ehca_max_cq,        int,  S_IRUGO);
+module_param_named(number_of_qps, ehca_max_qp,        int,  S_IRUGO);
+
+MODULE_PARM_DESC(open_aqp1,
+		 "Open AQP1 on startup (default: no)");
+MODULE_PARM_DESC(debug_level,
+		 "Amount of debug output (0: none (default), 1: traces, "
+		 "2: some dumps, 3: lots)");
+MODULE_PARM_DESC(hw_level,
+		 "Hardware level (0: autosensing (default), "
+		 "0x10..0x14: eHCA, 0x20..0x23: eHCA2)");
+MODULE_PARM_DESC(nr_ports,
+		 "number of connected ports (-1: autodetect (default), "
+		 "1: port one only, 2: two ports)");
+MODULE_PARM_DESC(use_hp_mr,
+		 "Use high performance MRs (default: no)");
+MODULE_PARM_DESC(port_act_time,
+		 "Time to wait for port activation (default: 30 sec)");
+MODULE_PARM_DESC(poll_all_eqs,
+		 "Poll all event queues periodically (default: yes)");
+MODULE_PARM_DESC(static_rate,
+		 "Set permanent static rate (default: no static rate)");
+MODULE_PARM_DESC(scaling_code,
+		 "Enable scaling code (default: no)");
+MODULE_PARM_DESC(lock_hcalls,
+		 "Serialize all hCalls made by the driver "
+		 "(default: autodetect)");
+MODULE_PARM_DESC(number_of_cqs,
+		"Max number of CQs which can be allocated "
+		"(default: autodetect)");
+MODULE_PARM_DESC(number_of_qps,
+		"Max number of QPs which can be allocated "
+		"(default: autodetect)");
+
+DEFINE_RWLOCK(ehca_qp_idr_lock);
+DEFINE_RWLOCK(ehca_cq_idr_lock);
+DEFINE_IDR(ehca_qp_idr);
+DEFINE_IDR(ehca_cq_idr);
+
+static LIST_HEAD(shca_list); /* list of all registered ehcas */
+DEFINE_SPINLOCK(shca_list_lock);
+
+static struct timer_list poll_eqs_timer;
+
+#ifdef CONFIG_PPC_64K_PAGES
+static struct kmem_cache *ctblk_cache;
+
+void *ehca_alloc_fw_ctrlblock(gfp_t flags)
+{
+	void *ret = kmem_cache_zalloc(ctblk_cache, flags);
+	if (!ret)
+		ehca_gen_err("Out of memory for ctblk");
+	return ret;
+}
+
+void ehca_free_fw_ctrlblock(void *ptr)
+{
+	if (ptr)
+		kmem_cache_free(ctblk_cache, ptr);
+
+}
+#endif
+
+int ehca2ib_return_code(u64 ehca_rc)
+{
+	switch (ehca_rc) {
+	case H_SUCCESS:
+		return 0;
+	case H_RESOURCE:             /* Resource in use */
+	case H_BUSY:
+		return -EBUSY;
+	case H_NOT_ENOUGH_RESOURCES: /* insufficient resources */
+	case H_CONSTRAINED:          /* resource constraint */
+	case H_NO_MEM:
+		return -ENOMEM;
+	default:
+		return -EINVAL;
+	}
+}
+
+static int ehca_create_slab_caches(void)
+{
+	int ret;
+
+	ret = ehca_init_pd_cache();
+	if (ret) {
+		ehca_gen_err("Cannot create PD SLAB cache.");
+		return ret;
+	}
+
+	ret = ehca_init_cq_cache();
+	if (ret) {
+		ehca_gen_err("Cannot create CQ SLAB cache.");
+		goto create_slab_caches2;
+	}
+
+	ret = ehca_init_qp_cache();
+	if (ret) {
+		ehca_gen_err("Cannot create QP SLAB cache.");
+		goto create_slab_caches3;
+	}
+
+	ret = ehca_init_av_cache();
+	if (ret) {
+		ehca_gen_err("Cannot create AV SLAB cache.");
+		goto create_slab_caches4;
+	}
+
+	ret = ehca_init_mrmw_cache();
+	if (ret) {
+		ehca_gen_err("Cannot create MR&MW SLAB cache.");
+		goto create_slab_caches5;
+	}
+
+	ret = ehca_init_small_qp_cache();
+	if (ret) {
+		ehca_gen_err("Cannot create small queue SLAB cache.");
+		goto create_slab_caches6;
+	}
+
+#ifdef CONFIG_PPC_64K_PAGES
+	ctblk_cache = kmem_cache_create("ehca_cache_ctblk",
+					EHCA_PAGESIZE, H_CB_ALIGNMENT,
+					SLAB_HWCACHE_ALIGN,
+					NULL);
+	if (!ctblk_cache) {
+		ehca_gen_err("Cannot create ctblk SLAB cache.");
+		ehca_cleanup_small_qp_cache();
+		ret = -ENOMEM;
+		goto create_slab_caches6;
+	}
+#endif
+	return 0;
+
+create_slab_caches6:
+	ehca_cleanup_mrmw_cache();
+
+create_slab_caches5:
+	ehca_cleanup_av_cache();
+
+create_slab_caches4:
+	ehca_cleanup_qp_cache();
+
+create_slab_caches3:
+	ehca_cleanup_cq_cache();
+
+create_slab_caches2:
+	ehca_cleanup_pd_cache();
+
+	return ret;
+}
+
+static void ehca_destroy_slab_caches(void)
+{
+	ehca_cleanup_small_qp_cache();
+	ehca_cleanup_mrmw_cache();
+	ehca_cleanup_av_cache();
+	ehca_cleanup_qp_cache();
+	ehca_cleanup_cq_cache();
+	ehca_cleanup_pd_cache();
+#ifdef CONFIG_PPC_64K_PAGES
+	if (ctblk_cache)
+		kmem_cache_destroy(ctblk_cache);
+#endif
+}
+
+#define EHCA_HCAAVER  EHCA_BMASK_IBM(32, 39)
+#define EHCA_REVID    EHCA_BMASK_IBM(40, 63)
+
+static struct cap_descr {
+	u64 mask;
+	char *descr;
+} hca_cap_descr[] = {
+	{ HCA_CAP_AH_PORT_NR_CHECK, "HCA_CAP_AH_PORT_NR_CHECK" },
+	{ HCA_CAP_ATOMIC, "HCA_CAP_ATOMIC" },
+	{ HCA_CAP_AUTO_PATH_MIG, "HCA_CAP_AUTO_PATH_MIG" },
+	{ HCA_CAP_BAD_P_KEY_CTR, "HCA_CAP_BAD_P_KEY_CTR" },
+	{ HCA_CAP_SQD_RTS_PORT_CHANGE, "HCA_CAP_SQD_RTS_PORT_CHANGE" },
+	{ HCA_CAP_CUR_QP_STATE_MOD, "HCA_CAP_CUR_QP_STATE_MOD" },
+	{ HCA_CAP_INIT_TYPE, "HCA_CAP_INIT_TYPE" },
+	{ HCA_CAP_PORT_ACTIVE_EVENT, "HCA_CAP_PORT_ACTIVE_EVENT" },
+	{ HCA_CAP_Q_KEY_VIOL_CTR, "HCA_CAP_Q_KEY_VIOL_CTR" },
+	{ HCA_CAP_WQE_RESIZE, "HCA_CAP_WQE_RESIZE" },
+	{ HCA_CAP_RAW_PACKET_MCAST, "HCA_CAP_RAW_PACKET_MCAST" },
+	{ HCA_CAP_SHUTDOWN_PORT, "HCA_CAP_SHUTDOWN_PORT" },
+	{ HCA_CAP_RC_LL_QP, "HCA_CAP_RC_LL_QP" },
+	{ HCA_CAP_SRQ, "HCA_CAP_SRQ" },
+	{ HCA_CAP_UD_LL_QP, "HCA_CAP_UD_LL_QP" },
+	{ HCA_CAP_RESIZE_MR, "HCA_CAP_RESIZE_MR" },
+	{ HCA_CAP_MINI_QP, "HCA_CAP_MINI_QP" },
+	{ HCA_CAP_H_ALLOC_RES_SYNC, "HCA_CAP_H_ALLOC_RES_SYNC" },
+};
+
+static int ehca_sense_attributes(struct ehca_shca *shca)
+{
+	int i, ret = 0;
+	u64 h_ret;
+	struct hipz_query_hca *rblock;
+	struct hipz_query_port *port;
+	const char *loc_code;
+
+	static const u32 pgsize_map[] = {
+		HCA_CAP_MR_PGSIZE_4K,  0x1000,
+		HCA_CAP_MR_PGSIZE_64K, 0x10000,
+		HCA_CAP_MR_PGSIZE_1M,  0x100000,
+		HCA_CAP_MR_PGSIZE_16M, 0x1000000,
+	};
+
+	ehca_gen_dbg("Probing adapter %s...",
+		     shca->ofdev->dev.of_node->full_name);
+	loc_code = of_get_property(shca->ofdev->dev.of_node, "ibm,loc-code",
+				   NULL);
+	if (loc_code)
+		ehca_gen_dbg(" ... location lode=%s", loc_code);
+
+	rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+	if (!rblock) {
+		ehca_gen_err("Cannot allocate rblock memory.");
+		return -ENOMEM;
+	}
+
+	h_ret = hipz_h_query_hca(shca->ipz_hca_handle, rblock);
+	if (h_ret != H_SUCCESS) {
+		ehca_gen_err("Cannot query device properties. h_ret=%lli",
+			     h_ret);
+		ret = -EPERM;
+		goto sense_attributes1;
+	}
+
+	if (ehca_nr_ports == 1)
+		shca->num_ports = 1;
+	else
+		shca->num_ports = (u8)rblock->num_ports;
+
+	ehca_gen_dbg(" ... found %x ports", rblock->num_ports);
+
+	if (ehca_hw_level == 0) {
+		u32 hcaaver;
+		u32 revid;
+
+		hcaaver = EHCA_BMASK_GET(EHCA_HCAAVER, rblock->hw_ver);
+		revid   = EHCA_BMASK_GET(EHCA_REVID, rblock->hw_ver);
+
+		ehca_gen_dbg(" ... hardware version=%x:%x", hcaaver, revid);
+
+		if (hcaaver == 1) {
+			if (revid <= 3)
+				shca->hw_level = 0x10 | (revid + 1);
+			else
+				shca->hw_level = 0x14;
+		} else if (hcaaver == 2) {
+			if (revid == 0)
+				shca->hw_level = 0x21;
+			else if (revid == 0x10)
+				shca->hw_level = 0x22;
+			else if (revid == 0x20 || revid == 0x21)
+				shca->hw_level = 0x23;
+		}
+
+		if (!shca->hw_level) {
+			ehca_gen_warn("unknown hardware version"
+				      " - assuming default level");
+			shca->hw_level = 0x22;
+		}
+	} else
+		shca->hw_level = ehca_hw_level;
+	ehca_gen_dbg(" ... hardware level=%x", shca->hw_level);
+
+	shca->hca_cap = rblock->hca_cap_indicators;
+	ehca_gen_dbg(" ... HCA capabilities:");
+	for (i = 0; i < ARRAY_SIZE(hca_cap_descr); i++)
+		if (EHCA_BMASK_GET(hca_cap_descr[i].mask, shca->hca_cap))
+			ehca_gen_dbg("   %s", hca_cap_descr[i].descr);
+
+	/* Autodetect hCall locking -- the "H_ALLOC_RESOURCE synced" flag is
+	 * a firmware property, so it's valid across all adapters
+	 */
+	if (ehca_lock_hcalls == -1)
+		ehca_lock_hcalls = !EHCA_BMASK_GET(HCA_CAP_H_ALLOC_RES_SYNC,
+					shca->hca_cap);
+
+	/* translate supported MR page sizes; always support 4K */
+	shca->hca_cap_mr_pgsize = EHCA_PAGESIZE;
+	for (i = 0; i < ARRAY_SIZE(pgsize_map); i += 2)
+		if (rblock->memory_page_size_supported & pgsize_map[i])
+			shca->hca_cap_mr_pgsize |= pgsize_map[i + 1];
+
+	/* Set maximum number of CQs and QPs to calculate EQ size */
+	if (shca->max_num_qps == -1)
+		shca->max_num_qps = min_t(int, rblock->max_qp,
+					  EHCA_MAX_NUM_QUEUES);
+	else if (shca->max_num_qps < 1 || shca->max_num_qps > rblock->max_qp) {
+		ehca_gen_warn("The requested number of QPs is out of range "
+			      "(1 - %i) specified by HW. Value is set to %i",
+			      rblock->max_qp, rblock->max_qp);
+		shca->max_num_qps = rblock->max_qp;
+	}
+
+	if (shca->max_num_cqs == -1)
+		shca->max_num_cqs = min_t(int, rblock->max_cq,
+					  EHCA_MAX_NUM_QUEUES);
+	else if (shca->max_num_cqs < 1 || shca->max_num_cqs > rblock->max_cq) {
+		ehca_gen_warn("The requested number of CQs is out of range "
+			      "(1 - %i) specified by HW. Value is set to %i",
+			      rblock->max_cq, rblock->max_cq);
+	}
+
+	/* query max MTU from first port -- it's the same for all ports */
+	port = (struct hipz_query_port *)rblock;
+	h_ret = hipz_h_query_port(shca->ipz_hca_handle, 1, port);
+	if (h_ret != H_SUCCESS) {
+		ehca_gen_err("Cannot query port properties. h_ret=%lli",
+			     h_ret);
+		ret = -EPERM;
+		goto sense_attributes1;
+	}
+
+	shca->max_mtu = port->max_mtu;
+
+sense_attributes1:
+	ehca_free_fw_ctrlblock(rblock);
+	return ret;
+}
+
+static int init_node_guid(struct ehca_shca *shca)
+{
+	int ret = 0;
+	struct hipz_query_hca *rblock;
+
+	rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+	if (!rblock) {
+		ehca_err(&shca->ib_device, "Can't allocate rblock memory.");
+		return -ENOMEM;
+	}
+
+	if (hipz_h_query_hca(shca->ipz_hca_handle, rblock) != H_SUCCESS) {
+		ehca_err(&shca->ib_device, "Can't query device properties");
+		ret = -EINVAL;
+		goto init_node_guid1;
+	}
+
+	memcpy(&shca->ib_device.node_guid, &rblock->node_guid, sizeof(u64));
+
+init_node_guid1:
+	ehca_free_fw_ctrlblock(rblock);
+	return ret;
+}
+
+static int ehca_init_device(struct ehca_shca *shca)
+{
+	int ret;
+
+	ret = init_node_guid(shca);
+	if (ret)
+		return ret;
+
+	strlcpy(shca->ib_device.name, "ehca%d", IB_DEVICE_NAME_MAX);
+	shca->ib_device.owner               = THIS_MODULE;
+
+	shca->ib_device.uverbs_abi_ver	    = 8;
+	shca->ib_device.uverbs_cmd_mask	    =
+		(1ull << IB_USER_VERBS_CMD_GET_CONTEXT)		|
+		(1ull << IB_USER_VERBS_CMD_QUERY_DEVICE)	|
+		(1ull << IB_USER_VERBS_CMD_QUERY_PORT)		|
+		(1ull << IB_USER_VERBS_CMD_ALLOC_PD)		|
+		(1ull << IB_USER_VERBS_CMD_DEALLOC_PD)		|
+		(1ull << IB_USER_VERBS_CMD_REG_MR)		|
+		(1ull << IB_USER_VERBS_CMD_DEREG_MR)		|
+		(1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL)	|
+		(1ull << IB_USER_VERBS_CMD_CREATE_CQ)		|
+		(1ull << IB_USER_VERBS_CMD_DESTROY_CQ)		|
+		(1ull << IB_USER_VERBS_CMD_CREATE_QP)		|
+		(1ull << IB_USER_VERBS_CMD_MODIFY_QP)		|
+		(1ull << IB_USER_VERBS_CMD_QUERY_QP)		|
+		(1ull << IB_USER_VERBS_CMD_DESTROY_QP)		|
+		(1ull << IB_USER_VERBS_CMD_ATTACH_MCAST)	|
+		(1ull << IB_USER_VERBS_CMD_DETACH_MCAST);
+
+	shca->ib_device.node_type           = RDMA_NODE_IB_CA;
+	shca->ib_device.phys_port_cnt       = shca->num_ports;
+	shca->ib_device.num_comp_vectors    = 1;
+	shca->ib_device.dma_device          = &shca->ofdev->dev;
+	shca->ib_device.query_device        = ehca_query_device;
+	shca->ib_device.query_port          = ehca_query_port;
+	shca->ib_device.query_gid           = ehca_query_gid;
+	shca->ib_device.query_pkey          = ehca_query_pkey;
+	/* shca->in_device.modify_device    = ehca_modify_device    */
+	shca->ib_device.modify_port         = ehca_modify_port;
+	shca->ib_device.alloc_ucontext      = ehca_alloc_ucontext;
+	shca->ib_device.dealloc_ucontext    = ehca_dealloc_ucontext;
+	shca->ib_device.alloc_pd            = ehca_alloc_pd;
+	shca->ib_device.dealloc_pd          = ehca_dealloc_pd;
+	shca->ib_device.create_ah	    = ehca_create_ah;
+	/* shca->ib_device.modify_ah	    = ehca_modify_ah;	    */
+	shca->ib_device.query_ah	    = ehca_query_ah;
+	shca->ib_device.destroy_ah	    = ehca_destroy_ah;
+	shca->ib_device.create_qp	    = ehca_create_qp;
+	shca->ib_device.modify_qp	    = ehca_modify_qp;
+	shca->ib_device.query_qp	    = ehca_query_qp;
+	shca->ib_device.destroy_qp	    = ehca_destroy_qp;
+	shca->ib_device.post_send	    = ehca_post_send;
+	shca->ib_device.post_recv	    = ehca_post_recv;
+	shca->ib_device.create_cq	    = ehca_create_cq;
+	shca->ib_device.destroy_cq	    = ehca_destroy_cq;
+	shca->ib_device.resize_cq	    = ehca_resize_cq;
+	shca->ib_device.poll_cq		    = ehca_poll_cq;
+	/* shca->ib_device.peek_cq	    = ehca_peek_cq;	    */
+	shca->ib_device.req_notify_cq	    = ehca_req_notify_cq;
+	/* shca->ib_device.req_ncomp_notif  = ehca_req_ncomp_notif; */
+	shca->ib_device.get_dma_mr	    = ehca_get_dma_mr;
+	shca->ib_device.reg_phys_mr	    = ehca_reg_phys_mr;
+	shca->ib_device.reg_user_mr	    = ehca_reg_user_mr;
+	shca->ib_device.query_mr	    = ehca_query_mr;
+	shca->ib_device.dereg_mr	    = ehca_dereg_mr;
+	shca->ib_device.rereg_phys_mr	    = ehca_rereg_phys_mr;
+	shca->ib_device.alloc_mw	    = ehca_alloc_mw;
+	shca->ib_device.bind_mw		    = ehca_bind_mw;
+	shca->ib_device.dealloc_mw	    = ehca_dealloc_mw;
+	shca->ib_device.alloc_fmr	    = ehca_alloc_fmr;
+	shca->ib_device.map_phys_fmr	    = ehca_map_phys_fmr;
+	shca->ib_device.unmap_fmr	    = ehca_unmap_fmr;
+	shca->ib_device.dealloc_fmr	    = ehca_dealloc_fmr;
+	shca->ib_device.attach_mcast	    = ehca_attach_mcast;
+	shca->ib_device.detach_mcast	    = ehca_detach_mcast;
+	shca->ib_device.process_mad	    = ehca_process_mad;
+	shca->ib_device.mmap		    = ehca_mmap;
+	shca->ib_device.dma_ops		    = &ehca_dma_mapping_ops;
+
+	if (EHCA_BMASK_GET(HCA_CAP_SRQ, shca->hca_cap)) {
+		shca->ib_device.uverbs_cmd_mask |=
+			(1ull << IB_USER_VERBS_CMD_CREATE_SRQ) |
+			(1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) |
+			(1ull << IB_USER_VERBS_CMD_QUERY_SRQ) |
+			(1ull << IB_USER_VERBS_CMD_DESTROY_SRQ);
+
+		shca->ib_device.create_srq          = ehca_create_srq;
+		shca->ib_device.modify_srq          = ehca_modify_srq;
+		shca->ib_device.query_srq           = ehca_query_srq;
+		shca->ib_device.destroy_srq         = ehca_destroy_srq;
+		shca->ib_device.post_srq_recv       = ehca_post_srq_recv;
+	}
+
+	return ret;
+}
+
+static int ehca_create_aqp1(struct ehca_shca *shca, u32 port)
+{
+	struct ehca_sport *sport = &shca->sport[port - 1];
+	struct ib_cq *ibcq;
+	struct ib_qp *ibqp;
+	struct ib_qp_init_attr qp_init_attr;
+	int ret;
+
+	if (sport->ibcq_aqp1) {
+		ehca_err(&shca->ib_device, "AQP1 CQ is already created.");
+		return -EPERM;
+	}
+
+	ibcq = ib_create_cq(&shca->ib_device, NULL, NULL, (void *)(-1), 10, 0);
+	if (IS_ERR(ibcq)) {
+		ehca_err(&shca->ib_device, "Cannot create AQP1 CQ.");
+		return PTR_ERR(ibcq);
+	}
+	sport->ibcq_aqp1 = ibcq;
+
+	if (sport->ibqp_sqp[IB_QPT_GSI]) {
+		ehca_err(&shca->ib_device, "AQP1 QP is already created.");
+		ret = -EPERM;
+		goto create_aqp1;
+	}
+
+	memset(&qp_init_attr, 0, sizeof(struct ib_qp_init_attr));
+	qp_init_attr.send_cq          = ibcq;
+	qp_init_attr.recv_cq          = ibcq;
+	qp_init_attr.sq_sig_type      = IB_SIGNAL_ALL_WR;
+	qp_init_attr.cap.max_send_wr  = 100;
+	qp_init_attr.cap.max_recv_wr  = 100;
+	qp_init_attr.cap.max_send_sge = 2;
+	qp_init_attr.cap.max_recv_sge = 1;
+	qp_init_attr.qp_type          = IB_QPT_GSI;
+	qp_init_attr.port_num         = port;
+	qp_init_attr.qp_context       = NULL;
+	qp_init_attr.event_handler    = NULL;
+	qp_init_attr.srq              = NULL;
+
+	ibqp = ib_create_qp(&shca->pd->ib_pd, &qp_init_attr);
+	if (IS_ERR(ibqp)) {
+		ehca_err(&shca->ib_device, "Cannot create AQP1 QP.");
+		ret = PTR_ERR(ibqp);
+		goto create_aqp1;
+	}
+	sport->ibqp_sqp[IB_QPT_GSI] = ibqp;
+
+	return 0;
+
+create_aqp1:
+	ib_destroy_cq(sport->ibcq_aqp1);
+	return ret;
+}
+
+static int ehca_destroy_aqp1(struct ehca_sport *sport)
+{
+	int ret;
+
+	ret = ib_destroy_qp(sport->ibqp_sqp[IB_QPT_GSI]);
+	if (ret) {
+		ehca_gen_err("Cannot destroy AQP1 QP. ret=%i", ret);
+		return ret;
+	}
+
+	ret = ib_destroy_cq(sport->ibcq_aqp1);
+	if (ret)
+		ehca_gen_err("Cannot destroy AQP1 CQ. ret=%i", ret);
+
+	return ret;
+}
+
+static ssize_t ehca_show_debug_level(struct device_driver *ddp, char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%d\n", ehca_debug_level);
+}
+
+static ssize_t ehca_store_debug_level(struct device_driver *ddp,
+				      const char *buf, size_t count)
+{
+	int value = (*buf) - '0';
+	if (value >= 0 && value <= 9)
+		ehca_debug_level = value;
+	return 1;
+}
+
+static DRIVER_ATTR(debug_level, S_IRUSR | S_IWUSR,
+		   ehca_show_debug_level, ehca_store_debug_level);
+
+static struct attribute *ehca_drv_attrs[] = {
+	&driver_attr_debug_level.attr,
+	NULL
+};
+
+static struct attribute_group ehca_drv_attr_grp = {
+	.attrs = ehca_drv_attrs
+};
+
+static const struct attribute_group *ehca_drv_attr_groups[] = {
+	&ehca_drv_attr_grp,
+	NULL,
+};
+
+#define EHCA_RESOURCE_ATTR(name)                                           \
+static ssize_t  ehca_show_##name(struct device *dev,                       \
+				 struct device_attribute *attr,            \
+				 char *buf)                                \
+{									   \
+	struct ehca_shca *shca;						   \
+	struct hipz_query_hca *rblock;				           \
+	int data;                                                          \
+									   \
+	shca = dev_get_drvdata(dev);					   \
+									   \
+	rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL);			   \
+	if (!rblock) {						           \
+		dev_err(dev, "Can't allocate rblock memory.\n");           \
+		return 0;						   \
+	}								   \
+									   \
+	if (hipz_h_query_hca(shca->ipz_hca_handle, rblock) != H_SUCCESS) { \
+		dev_err(dev, "Can't query device properties\n");           \
+		ehca_free_fw_ctrlblock(rblock);			   	   \
+		return 0;					   	   \
+	}								   \
+									   \
+	data = rblock->name;                                               \
+	ehca_free_fw_ctrlblock(rblock);                                    \
+									   \
+	if ((strcmp(#name, "num_ports") == 0) && (ehca_nr_ports == 1))	   \
+		return snprintf(buf, 256, "1\n");			   \
+	else								   \
+		return snprintf(buf, 256, "%d\n", data);		   \
+									   \
+}									   \
+static DEVICE_ATTR(name, S_IRUGO, ehca_show_##name, NULL);
+
+EHCA_RESOURCE_ATTR(num_ports);
+EHCA_RESOURCE_ATTR(hw_ver);
+EHCA_RESOURCE_ATTR(max_eq);
+EHCA_RESOURCE_ATTR(cur_eq);
+EHCA_RESOURCE_ATTR(max_cq);
+EHCA_RESOURCE_ATTR(cur_cq);
+EHCA_RESOURCE_ATTR(max_qp);
+EHCA_RESOURCE_ATTR(cur_qp);
+EHCA_RESOURCE_ATTR(max_mr);
+EHCA_RESOURCE_ATTR(cur_mr);
+EHCA_RESOURCE_ATTR(max_mw);
+EHCA_RESOURCE_ATTR(cur_mw);
+EHCA_RESOURCE_ATTR(max_pd);
+EHCA_RESOURCE_ATTR(max_ah);
+
+static ssize_t ehca_show_adapter_handle(struct device *dev,
+					struct device_attribute *attr,
+					char *buf)
+{
+	struct ehca_shca *shca = dev_get_drvdata(dev);
+
+	return sprintf(buf, "%llx\n", shca->ipz_hca_handle.handle);
+
+}
+static DEVICE_ATTR(adapter_handle, S_IRUGO, ehca_show_adapter_handle, NULL);
+
+static struct attribute *ehca_dev_attrs[] = {
+	&dev_attr_adapter_handle.attr,
+	&dev_attr_num_ports.attr,
+	&dev_attr_hw_ver.attr,
+	&dev_attr_max_eq.attr,
+	&dev_attr_cur_eq.attr,
+	&dev_attr_max_cq.attr,
+	&dev_attr_cur_cq.attr,
+	&dev_attr_max_qp.attr,
+	&dev_attr_cur_qp.attr,
+	&dev_attr_max_mr.attr,
+	&dev_attr_cur_mr.attr,
+	&dev_attr_max_mw.attr,
+	&dev_attr_cur_mw.attr,
+	&dev_attr_max_pd.attr,
+	&dev_attr_max_ah.attr,
+	NULL
+};
+
+static struct attribute_group ehca_dev_attr_grp = {
+	.attrs = ehca_dev_attrs
+};
+
+static int ehca_probe(struct platform_device *dev)
+{
+	struct ehca_shca *shca;
+	const u64 *handle;
+	struct ib_pd *ibpd;
+	int ret, i, eq_size;
+	unsigned long flags;
+
+	handle = of_get_property(dev->dev.of_node, "ibm,hca-handle", NULL);
+	if (!handle) {
+		ehca_gen_err("Cannot get eHCA handle for adapter: %s.",
+			     dev->dev.of_node->full_name);
+		return -ENODEV;
+	}
+
+	if (!(*handle)) {
+		ehca_gen_err("Wrong eHCA handle for adapter: %s.",
+			     dev->dev.of_node->full_name);
+		return -ENODEV;
+	}
+
+	shca = (struct ehca_shca *)ib_alloc_device(sizeof(*shca));
+	if (!shca) {
+		ehca_gen_err("Cannot allocate shca memory.");
+		return -ENOMEM;
+	}
+
+	mutex_init(&shca->modify_mutex);
+	atomic_set(&shca->num_cqs, 0);
+	atomic_set(&shca->num_qps, 0);
+	shca->max_num_qps = ehca_max_qp;
+	shca->max_num_cqs = ehca_max_cq;
+
+	for (i = 0; i < ARRAY_SIZE(shca->sport); i++)
+		spin_lock_init(&shca->sport[i].mod_sqp_lock);
+
+	shca->ofdev = dev;
+	shca->ipz_hca_handle.handle = *handle;
+	dev_set_drvdata(&dev->dev, shca);
+
+	ret = ehca_sense_attributes(shca);
+	if (ret < 0) {
+		ehca_gen_err("Cannot sense eHCA attributes.");
+		goto probe1;
+	}
+
+	ret = ehca_init_device(shca);
+	if (ret) {
+		ehca_gen_err("Cannot init ehca  device struct");
+		goto probe1;
+	}
+
+	eq_size = 2 * shca->max_num_cqs + 4 * shca->max_num_qps;
+	/* create event queues */
+	ret = ehca_create_eq(shca, &shca->eq, EHCA_EQ, eq_size);
+	if (ret) {
+		ehca_err(&shca->ib_device, "Cannot create EQ.");
+		goto probe1;
+	}
+
+	ret = ehca_create_eq(shca, &shca->neq, EHCA_NEQ, 513);
+	if (ret) {
+		ehca_err(&shca->ib_device, "Cannot create NEQ.");
+		goto probe3;
+	}
+
+	/* create internal protection domain */
+	ibpd = ehca_alloc_pd(&shca->ib_device, (void *)(-1), NULL);
+	if (IS_ERR(ibpd)) {
+		ehca_err(&shca->ib_device, "Cannot create internal PD.");
+		ret = PTR_ERR(ibpd);
+		goto probe4;
+	}
+
+	shca->pd = container_of(ibpd, struct ehca_pd, ib_pd);
+	shca->pd->ib_pd.device = &shca->ib_device;
+
+	/* create internal max MR */
+	ret = ehca_reg_internal_maxmr(shca, shca->pd, &shca->maxmr);
+
+	if (ret) {
+		ehca_err(&shca->ib_device, "Cannot create internal MR ret=%i",
+			 ret);
+		goto probe5;
+	}
+
+	ret = ib_register_device(&shca->ib_device, NULL);
+	if (ret) {
+		ehca_err(&shca->ib_device,
+			 "ib_register_device() failed ret=%i", ret);
+		goto probe6;
+	}
+
+	/* create AQP1 for port 1 */
+	if (ehca_open_aqp1 == 1) {
+		shca->sport[0].port_state = IB_PORT_DOWN;
+		ret = ehca_create_aqp1(shca, 1);
+		if (ret) {
+			ehca_err(&shca->ib_device,
+				 "Cannot create AQP1 for port 1.");
+			goto probe7;
+		}
+	}
+
+	/* create AQP1 for port 2 */
+	if ((ehca_open_aqp1 == 1) && (shca->num_ports == 2)) {
+		shca->sport[1].port_state = IB_PORT_DOWN;
+		ret = ehca_create_aqp1(shca, 2);
+		if (ret) {
+			ehca_err(&shca->ib_device,
+				 "Cannot create AQP1 for port 2.");
+			goto probe8;
+		}
+	}
+
+	ret = sysfs_create_group(&dev->dev.kobj, &ehca_dev_attr_grp);
+	if (ret) /* only complain; we can live without attributes */
+		ehca_err(&shca->ib_device,
+			 "Cannot create device attributes  ret=%d", ret);
+
+	spin_lock_irqsave(&shca_list_lock, flags);
+	list_add(&shca->shca_list, &shca_list);
+	spin_unlock_irqrestore(&shca_list_lock, flags);
+
+	return 0;
+
+probe8:
+	ret = ehca_destroy_aqp1(&shca->sport[0]);
+	if (ret)
+		ehca_err(&shca->ib_device,
+			 "Cannot destroy AQP1 for port 1. ret=%i", ret);
+
+probe7:
+	ib_unregister_device(&shca->ib_device);
+
+probe6:
+	ret = ehca_dereg_internal_maxmr(shca);
+	if (ret)
+		ehca_err(&shca->ib_device,
+			 "Cannot destroy internal MR. ret=%x", ret);
+
+probe5:
+	ret = ehca_dealloc_pd(&shca->pd->ib_pd);
+	if (ret)
+		ehca_err(&shca->ib_device,
+			 "Cannot destroy internal PD. ret=%x", ret);
+
+probe4:
+	ret = ehca_destroy_eq(shca, &shca->neq);
+	if (ret)
+		ehca_err(&shca->ib_device,
+			 "Cannot destroy NEQ. ret=%x", ret);
+
+probe3:
+	ret = ehca_destroy_eq(shca, &shca->eq);
+	if (ret)
+		ehca_err(&shca->ib_device,
+			 "Cannot destroy EQ. ret=%x", ret);
+
+probe1:
+	ib_dealloc_device(&shca->ib_device);
+
+	return -EINVAL;
+}
+
+static int ehca_remove(struct platform_device *dev)
+{
+	struct ehca_shca *shca = dev_get_drvdata(&dev->dev);
+	unsigned long flags;
+	int ret;
+
+	sysfs_remove_group(&dev->dev.kobj, &ehca_dev_attr_grp);
+
+	if (ehca_open_aqp1 == 1) {
+		int i;
+		for (i = 0; i < shca->num_ports; i++) {
+			ret = ehca_destroy_aqp1(&shca->sport[i]);
+			if (ret)
+				ehca_err(&shca->ib_device,
+					 "Cannot destroy AQP1 for port %x "
+					 "ret=%i", ret, i);
+		}
+	}
+
+	ib_unregister_device(&shca->ib_device);
+
+	ret = ehca_dereg_internal_maxmr(shca);
+	if (ret)
+		ehca_err(&shca->ib_device,
+			 "Cannot destroy internal MR. ret=%i", ret);
+
+	ret = ehca_dealloc_pd(&shca->pd->ib_pd);
+	if (ret)
+		ehca_err(&shca->ib_device,
+			 "Cannot destroy internal PD. ret=%i", ret);
+
+	ret = ehca_destroy_eq(shca, &shca->eq);
+	if (ret)
+		ehca_err(&shca->ib_device, "Cannot destroy EQ. ret=%i", ret);
+
+	ret = ehca_destroy_eq(shca, &shca->neq);
+	if (ret)
+		ehca_err(&shca->ib_device, "Canot destroy NEQ. ret=%i", ret);
+
+	ib_dealloc_device(&shca->ib_device);
+
+	spin_lock_irqsave(&shca_list_lock, flags);
+	list_del(&shca->shca_list);
+	spin_unlock_irqrestore(&shca_list_lock, flags);
+
+	return ret;
+}
+
+static struct of_device_id ehca_device_table[] =
+{
+	{
+		.name       = "lhca",
+		.compatible = "IBM,lhca",
+	},
+	{},
+};
+MODULE_DEVICE_TABLE(of, ehca_device_table);
+
+static struct platform_driver ehca_driver = {
+	.probe       = ehca_probe,
+	.remove      = ehca_remove,
+	.driver = {
+		.name = "ehca",
+		.owner = THIS_MODULE,
+		.groups = ehca_drv_attr_groups,
+		.of_match_table = ehca_device_table,
+	},
+};
+
+void ehca_poll_eqs(unsigned long data)
+{
+	struct ehca_shca *shca;
+
+	spin_lock(&shca_list_lock);
+	list_for_each_entry(shca, &shca_list, shca_list) {
+		if (shca->eq.is_initialized) {
+			/* call deadman proc only if eq ptr does not change */
+			struct ehca_eq *eq = &shca->eq;
+			int max = 3;
+			volatile u64 q_ofs, q_ofs2;
+			unsigned long flags;
+			spin_lock_irqsave(&eq->spinlock, flags);
+			q_ofs = eq->ipz_queue.current_q_offset;
+			spin_unlock_irqrestore(&eq->spinlock, flags);
+			do {
+				spin_lock_irqsave(&eq->spinlock, flags);
+				q_ofs2 = eq->ipz_queue.current_q_offset;
+				spin_unlock_irqrestore(&eq->spinlock, flags);
+				max--;
+			} while (q_ofs == q_ofs2 && max > 0);
+			if (q_ofs == q_ofs2)
+				ehca_process_eq(shca, 0);
+		}
+	}
+	mod_timer(&poll_eqs_timer, round_jiffies(jiffies + HZ));
+	spin_unlock(&shca_list_lock);
+}
+
+static int ehca_mem_notifier(struct notifier_block *nb,
+			     unsigned long action, void *data)
+{
+	static unsigned long ehca_dmem_warn_time;
+	unsigned long flags;
+
+	switch (action) {
+	case MEM_CANCEL_OFFLINE:
+	case MEM_CANCEL_ONLINE:
+	case MEM_ONLINE:
+	case MEM_OFFLINE:
+		return NOTIFY_OK;
+	case MEM_GOING_ONLINE:
+	case MEM_GOING_OFFLINE:
+		/* only ok if no hca is attached to the lpar */
+		spin_lock_irqsave(&shca_list_lock, flags);
+		if (list_empty(&shca_list)) {
+			spin_unlock_irqrestore(&shca_list_lock, flags);
+			return NOTIFY_OK;
+		} else {
+			spin_unlock_irqrestore(&shca_list_lock, flags);
+			if (printk_timed_ratelimit(&ehca_dmem_warn_time,
+						   30 * 1000))
+				ehca_gen_err("DMEM operations are not allowed"
+					     "in conjunction with eHCA");
+			return NOTIFY_BAD;
+		}
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block ehca_mem_nb = {
+	.notifier_call = ehca_mem_notifier,
+};
+
+static int __init ehca_module_init(void)
+{
+	int ret;
+
+	printk(KERN_INFO "eHCA Infiniband Device Driver "
+	       "(Version " HCAD_VERSION ")\n");
+
+	ret = ehca_create_comp_pool();
+	if (ret) {
+		ehca_gen_err("Cannot create comp pool.");
+		return ret;
+	}
+
+	ret = ehca_create_slab_caches();
+	if (ret) {
+		ehca_gen_err("Cannot create SLAB caches");
+		ret = -ENOMEM;
+		goto module_init1;
+	}
+
+	ret = ehca_create_busmap();
+	if (ret) {
+		ehca_gen_err("Cannot create busmap.");
+		goto module_init2;
+	}
+
+	ret = ibmebus_register_driver(&ehca_driver);
+	if (ret) {
+		ehca_gen_err("Cannot register eHCA device driver");
+		ret = -EINVAL;
+		goto module_init3;
+	}
+
+	ret = register_memory_notifier(&ehca_mem_nb);
+	if (ret) {
+		ehca_gen_err("Failed registering memory add/remove notifier");
+		goto module_init4;
+	}
+
+	if (ehca_poll_all_eqs != 1) {
+		ehca_gen_err("WARNING!!!");
+		ehca_gen_err("It is possible to lose interrupts.");
+	} else {
+		init_timer(&poll_eqs_timer);
+		poll_eqs_timer.function = ehca_poll_eqs;
+		poll_eqs_timer.expires = jiffies + HZ;
+		add_timer(&poll_eqs_timer);
+	}
+
+	return 0;
+
+module_init4:
+	ibmebus_unregister_driver(&ehca_driver);
+
+module_init3:
+	ehca_destroy_busmap();
+
+module_init2:
+	ehca_destroy_slab_caches();
+
+module_init1:
+	ehca_destroy_comp_pool();
+	return ret;
+};
+
+static void __exit ehca_module_exit(void)
+{
+	if (ehca_poll_all_eqs == 1)
+		del_timer_sync(&poll_eqs_timer);
+
+	ibmebus_unregister_driver(&ehca_driver);
+
+	unregister_memory_notifier(&ehca_mem_nb);
+
+	ehca_destroy_busmap();
+
+	ehca_destroy_slab_caches();
+
+	ehca_destroy_comp_pool();
+
+	idr_destroy(&ehca_cq_idr);
+	idr_destroy(&ehca_qp_idr);
+};
+
+module_init(ehca_module_init);
+module_exit(ehca_module_exit);
diff --git a/drivers/infiniband/hw/ehca/ehca_mcast.c b/drivers/infiniband/hw/ehca/ehca_mcast.c
new file mode 100644
index 0000000..120aedf
--- /dev/null
+++ b/drivers/infiniband/hw/ehca/ehca_mcast.c
@@ -0,0 +1,131 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  mcast  functions
+ *
+ *  Authors: Khadija Souissi <souissik@de.ibm.com>
+ *           Waleri Fomin <fomin@de.ibm.com>
+ *           Reinhard Ernst <rernst@de.ibm.com>
+ *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ *           Heiko J Schick <schickhj@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/module.h>
+#include <linux/err.h>
+#include "ehca_classes.h"
+#include "ehca_tools.h"
+#include "ehca_qes.h"
+#include "ehca_iverbs.h"
+#include "hcp_if.h"
+
+#define MAX_MC_LID 0xFFFE
+#define MIN_MC_LID 0xC000	/* Multicast limits */
+#define EHCA_VALID_MULTICAST_GID(gid)  ((gid)[0] == 0xFF)
+#define EHCA_VALID_MULTICAST_LID(lid) \
+	(((lid) >= MIN_MC_LID) && ((lid) <= MAX_MC_LID))
+
+int ehca_attach_mcast(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+	struct ehca_qp *my_qp = container_of(ibqp, struct ehca_qp, ib_qp);
+	struct ehca_shca *shca = container_of(ibqp->device, struct ehca_shca,
+					      ib_device);
+	union ib_gid my_gid;
+	u64 subnet_prefix, interface_id, h_ret;
+
+	if (ibqp->qp_type != IB_QPT_UD) {
+		ehca_err(ibqp->device, "invalid qp_type=%x", ibqp->qp_type);
+		return -EINVAL;
+	}
+
+	if (!(EHCA_VALID_MULTICAST_GID(gid->raw))) {
+		ehca_err(ibqp->device, "invalid mulitcast gid");
+		return -EINVAL;
+	} else if ((lid < MIN_MC_LID) || (lid > MAX_MC_LID)) {
+		ehca_err(ibqp->device, "invalid mulitcast lid=%x", lid);
+		return -EINVAL;
+	}
+
+	memcpy(&my_gid.raw, gid->raw, sizeof(union ib_gid));
+
+	subnet_prefix = be64_to_cpu(my_gid.global.subnet_prefix);
+	interface_id = be64_to_cpu(my_gid.global.interface_id);
+	h_ret = hipz_h_attach_mcqp(shca->ipz_hca_handle,
+				   my_qp->ipz_qp_handle,
+				   my_qp->galpas.kernel,
+				   lid, subnet_prefix, interface_id);
+	if (h_ret != H_SUCCESS)
+		ehca_err(ibqp->device,
+			 "ehca_qp=%p qp_num=%x hipz_h_attach_mcqp() failed "
+			 "h_ret=%lli", my_qp, ibqp->qp_num, h_ret);
+
+	return ehca2ib_return_code(h_ret);
+}
+
+int ehca_detach_mcast(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+	struct ehca_qp *my_qp = container_of(ibqp, struct ehca_qp, ib_qp);
+	struct ehca_shca *shca = container_of(ibqp->pd->device,
+					      struct ehca_shca, ib_device);
+	union ib_gid my_gid;
+	u64 subnet_prefix, interface_id, h_ret;
+
+	if (ibqp->qp_type != IB_QPT_UD) {
+		ehca_err(ibqp->device, "invalid qp_type %x", ibqp->qp_type);
+		return -EINVAL;
+	}
+
+	if (!(EHCA_VALID_MULTICAST_GID(gid->raw))) {
+		ehca_err(ibqp->device, "invalid mulitcast gid");
+		return -EINVAL;
+	} else if ((lid < MIN_MC_LID) || (lid > MAX_MC_LID)) {
+		ehca_err(ibqp->device, "invalid mulitcast lid=%x", lid);
+		return -EINVAL;
+	}
+
+	memcpy(&my_gid.raw, gid->raw, sizeof(union ib_gid));
+
+	subnet_prefix = be64_to_cpu(my_gid.global.subnet_prefix);
+	interface_id = be64_to_cpu(my_gid.global.interface_id);
+	h_ret = hipz_h_detach_mcqp(shca->ipz_hca_handle,
+				   my_qp->ipz_qp_handle,
+				   my_qp->galpas.kernel,
+				   lid, subnet_prefix, interface_id);
+	if (h_ret != H_SUCCESS)
+		ehca_err(ibqp->device,
+			 "ehca_qp=%p qp_num=%x hipz_h_detach_mcqp() failed "
+			 "h_ret=%lli", my_qp, ibqp->qp_num, h_ret);
+
+	return ehca2ib_return_code(h_ret);
+}
diff --git a/drivers/infiniband/hw/ehca/ehca_mrmw.c b/drivers/infiniband/hw/ehca/ehca_mrmw.c
new file mode 100644
index 0000000..3488e8c
--- /dev/null
+++ b/drivers/infiniband/hw/ehca/ehca_mrmw.c
@@ -0,0 +1,2593 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  MR/MW functions
+ *
+ *  Authors: Dietmar Decker <ddecker@de.ibm.com>
+ *           Christoph Raisch <raisch@de.ibm.com>
+ *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/slab.h>
+#include <rdma/ib_umem.h>
+
+#include "ehca_iverbs.h"
+#include "ehca_mrmw.h"
+#include "hcp_if.h"
+#include "hipz_hw.h"
+
+#define NUM_CHUNKS(length, chunk_size) \
+	(((length) + (chunk_size - 1)) / (chunk_size))
+
+/* max number of rpages (per hcall register_rpages) */
+#define MAX_RPAGES 512
+
+/* DMEM toleration management */
+#define EHCA_SECTSHIFT        SECTION_SIZE_BITS
+#define EHCA_SECTSIZE          (1UL << EHCA_SECTSHIFT)
+#define EHCA_HUGEPAGESHIFT     34
+#define EHCA_HUGEPAGE_SIZE     (1UL << EHCA_HUGEPAGESHIFT)
+#define EHCA_HUGEPAGE_PFN_MASK ((EHCA_HUGEPAGE_SIZE - 1) >> PAGE_SHIFT)
+#define EHCA_INVAL_ADDR        0xFFFFFFFFFFFFFFFFULL
+#define EHCA_DIR_INDEX_SHIFT 13                   /* 8k Entries in 64k block */
+#define EHCA_TOP_INDEX_SHIFT (EHCA_DIR_INDEX_SHIFT * 2)
+#define EHCA_MAP_ENTRIES (1 << EHCA_DIR_INDEX_SHIFT)
+#define EHCA_TOP_MAP_SIZE (0x10000)               /* currently fixed map size */
+#define EHCA_DIR_MAP_SIZE (0x10000)
+#define EHCA_ENT_MAP_SIZE (0x10000)
+#define EHCA_INDEX_MASK (EHCA_MAP_ENTRIES - 1)
+
+static unsigned long ehca_mr_len;
+
+/*
+ * Memory map data structures
+ */
+struct ehca_dir_bmap {
+	u64 ent[EHCA_MAP_ENTRIES];
+};
+struct ehca_top_bmap {
+	struct ehca_dir_bmap *dir[EHCA_MAP_ENTRIES];
+};
+struct ehca_bmap {
+	struct ehca_top_bmap *top[EHCA_MAP_ENTRIES];
+};
+
+static struct ehca_bmap *ehca_bmap;
+
+static struct kmem_cache *mr_cache;
+static struct kmem_cache *mw_cache;
+
+enum ehca_mr_pgsize {
+	EHCA_MR_PGSIZE4K  = 0x1000L,
+	EHCA_MR_PGSIZE64K = 0x10000L,
+	EHCA_MR_PGSIZE1M  = 0x100000L,
+	EHCA_MR_PGSIZE16M = 0x1000000L
+};
+
+#define EHCA_MR_PGSHIFT4K  12
+#define EHCA_MR_PGSHIFT64K 16
+#define EHCA_MR_PGSHIFT1M  20
+#define EHCA_MR_PGSHIFT16M 24
+
+static u64 ehca_map_vaddr(void *caddr);
+
+static u32 ehca_encode_hwpage_size(u32 pgsize)
+{
+	int log = ilog2(pgsize);
+	WARN_ON(log < 12 || log > 24 || log & 3);
+	return (log - 12) / 4;
+}
+
+static u64 ehca_get_max_hwpage_size(struct ehca_shca *shca)
+{
+	return rounddown_pow_of_two(shca->hca_cap_mr_pgsize);
+}
+
+static struct ehca_mr *ehca_mr_new(void)
+{
+	struct ehca_mr *me;
+
+	me = kmem_cache_zalloc(mr_cache, GFP_KERNEL);
+	if (me)
+		spin_lock_init(&me->mrlock);
+	else
+		ehca_gen_err("alloc failed");
+
+	return me;
+}
+
+static void ehca_mr_delete(struct ehca_mr *me)
+{
+	kmem_cache_free(mr_cache, me);
+}
+
+static struct ehca_mw *ehca_mw_new(void)
+{
+	struct ehca_mw *me;
+
+	me = kmem_cache_zalloc(mw_cache, GFP_KERNEL);
+	if (me)
+		spin_lock_init(&me->mwlock);
+	else
+		ehca_gen_err("alloc failed");
+
+	return me;
+}
+
+static void ehca_mw_delete(struct ehca_mw *me)
+{
+	kmem_cache_free(mw_cache, me);
+}
+
+/*----------------------------------------------------------------------*/
+
+struct ib_mr *ehca_get_dma_mr(struct ib_pd *pd, int mr_access_flags)
+{
+	struct ib_mr *ib_mr;
+	int ret;
+	struct ehca_mr *e_maxmr;
+	struct ehca_pd *e_pd = container_of(pd, struct ehca_pd, ib_pd);
+	struct ehca_shca *shca =
+		container_of(pd->device, struct ehca_shca, ib_device);
+
+	if (shca->maxmr) {
+		e_maxmr = ehca_mr_new();
+		if (!e_maxmr) {
+			ehca_err(&shca->ib_device, "out of memory");
+			ib_mr = ERR_PTR(-ENOMEM);
+			goto get_dma_mr_exit0;
+		}
+
+		ret = ehca_reg_maxmr(shca, e_maxmr,
+				     (void *)ehca_map_vaddr((void *)(KERNELBASE + PHYSICAL_START)),
+				     mr_access_flags, e_pd,
+				     &e_maxmr->ib.ib_mr.lkey,
+				     &e_maxmr->ib.ib_mr.rkey);
+		if (ret) {
+			ehca_mr_delete(e_maxmr);
+			ib_mr = ERR_PTR(ret);
+			goto get_dma_mr_exit0;
+		}
+		ib_mr = &e_maxmr->ib.ib_mr;
+	} else {
+		ehca_err(&shca->ib_device, "no internal max-MR exist!");
+		ib_mr = ERR_PTR(-EINVAL);
+		goto get_dma_mr_exit0;
+	}
+
+get_dma_mr_exit0:
+	if (IS_ERR(ib_mr))
+		ehca_err(&shca->ib_device, "h_ret=%li pd=%p mr_access_flags=%x",
+			 PTR_ERR(ib_mr), pd, mr_access_flags);
+	return ib_mr;
+} /* end ehca_get_dma_mr() */
+
+/*----------------------------------------------------------------------*/
+
+struct ib_mr *ehca_reg_phys_mr(struct ib_pd *pd,
+			       struct ib_phys_buf *phys_buf_array,
+			       int num_phys_buf,
+			       int mr_access_flags,
+			       u64 *iova_start)
+{
+	struct ib_mr *ib_mr;
+	int ret;
+	struct ehca_mr *e_mr;
+	struct ehca_shca *shca =
+		container_of(pd->device, struct ehca_shca, ib_device);
+	struct ehca_pd *e_pd = container_of(pd, struct ehca_pd, ib_pd);
+
+	u64 size;
+
+	if ((num_phys_buf <= 0) || !phys_buf_array) {
+		ehca_err(pd->device, "bad input values: num_phys_buf=%x "
+			 "phys_buf_array=%p", num_phys_buf, phys_buf_array);
+		ib_mr = ERR_PTR(-EINVAL);
+		goto reg_phys_mr_exit0;
+	}
+	if (((mr_access_flags & IB_ACCESS_REMOTE_WRITE) &&
+	     !(mr_access_flags & IB_ACCESS_LOCAL_WRITE)) ||
+	    ((mr_access_flags & IB_ACCESS_REMOTE_ATOMIC) &&
+	     !(mr_access_flags & IB_ACCESS_LOCAL_WRITE))) {
+		/*
+		 * Remote Write Access requires Local Write Access
+		 * Remote Atomic Access requires Local Write Access
+		 */
+		ehca_err(pd->device, "bad input values: mr_access_flags=%x",
+			 mr_access_flags);
+		ib_mr = ERR_PTR(-EINVAL);
+		goto reg_phys_mr_exit0;
+	}
+
+	/* check physical buffer list and calculate size */
+	ret = ehca_mr_chk_buf_and_calc_size(phys_buf_array, num_phys_buf,
+					    iova_start, &size);
+	if (ret) {
+		ib_mr = ERR_PTR(ret);
+		goto reg_phys_mr_exit0;
+	}
+	if ((size == 0) ||
+	    (((u64)iova_start + size) < (u64)iova_start)) {
+		ehca_err(pd->device, "bad input values: size=%llx iova_start=%p",
+			 size, iova_start);
+		ib_mr = ERR_PTR(-EINVAL);
+		goto reg_phys_mr_exit0;
+	}
+
+	e_mr = ehca_mr_new();
+	if (!e_mr) {
+		ehca_err(pd->device, "out of memory");
+		ib_mr = ERR_PTR(-ENOMEM);
+		goto reg_phys_mr_exit0;
+	}
+
+	/* register MR on HCA */
+	if (ehca_mr_is_maxmr(size, iova_start)) {
+		e_mr->flags |= EHCA_MR_FLAG_MAXMR;
+		ret = ehca_reg_maxmr(shca, e_mr, iova_start, mr_access_flags,
+				     e_pd, &e_mr->ib.ib_mr.lkey,
+				     &e_mr->ib.ib_mr.rkey);
+		if (ret) {
+			ib_mr = ERR_PTR(ret);
+			goto reg_phys_mr_exit1;
+		}
+	} else {
+		struct ehca_mr_pginfo pginfo;
+		u32 num_kpages;
+		u32 num_hwpages;
+		u64 hw_pgsize;
+
+		num_kpages = NUM_CHUNKS(((u64)iova_start % PAGE_SIZE) + size,
+					PAGE_SIZE);
+		/* for kernel space we try most possible pgsize */
+		hw_pgsize = ehca_get_max_hwpage_size(shca);
+		num_hwpages = NUM_CHUNKS(((u64)iova_start % hw_pgsize) + size,
+					 hw_pgsize);
+		memset(&pginfo, 0, sizeof(pginfo));
+		pginfo.type = EHCA_MR_PGI_PHYS;
+		pginfo.num_kpages = num_kpages;
+		pginfo.hwpage_size = hw_pgsize;
+		pginfo.num_hwpages = num_hwpages;
+		pginfo.u.phy.num_phys_buf = num_phys_buf;
+		pginfo.u.phy.phys_buf_array = phys_buf_array;
+		pginfo.next_hwpage =
+			((u64)iova_start & ~PAGE_MASK) / hw_pgsize;
+
+		ret = ehca_reg_mr(shca, e_mr, iova_start, size, mr_access_flags,
+				  e_pd, &pginfo, &e_mr->ib.ib_mr.lkey,
+				  &e_mr->ib.ib_mr.rkey, EHCA_REG_MR);
+		if (ret) {
+			ib_mr = ERR_PTR(ret);
+			goto reg_phys_mr_exit1;
+		}
+	}
+
+	/* successful registration of all pages */
+	return &e_mr->ib.ib_mr;
+
+reg_phys_mr_exit1:
+	ehca_mr_delete(e_mr);
+reg_phys_mr_exit0:
+	if (IS_ERR(ib_mr))
+		ehca_err(pd->device, "h_ret=%li pd=%p phys_buf_array=%p "
+			 "num_phys_buf=%x mr_access_flags=%x iova_start=%p",
+			 PTR_ERR(ib_mr), pd, phys_buf_array,
+			 num_phys_buf, mr_access_flags, iova_start);
+	return ib_mr;
+} /* end ehca_reg_phys_mr() */
+
+/*----------------------------------------------------------------------*/
+
+struct ib_mr *ehca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+			       u64 virt, int mr_access_flags,
+			       struct ib_udata *udata)
+{
+	struct ib_mr *ib_mr;
+	struct ehca_mr *e_mr;
+	struct ehca_shca *shca =
+		container_of(pd->device, struct ehca_shca, ib_device);
+	struct ehca_pd *e_pd = container_of(pd, struct ehca_pd, ib_pd);
+	struct ehca_mr_pginfo pginfo;
+	int ret, page_shift;
+	u32 num_kpages;
+	u32 num_hwpages;
+	u64 hwpage_size;
+
+	if (!pd) {
+		ehca_gen_err("bad pd=%p", pd);
+		return ERR_PTR(-EFAULT);
+	}
+
+	if (((mr_access_flags & IB_ACCESS_REMOTE_WRITE) &&
+	     !(mr_access_flags & IB_ACCESS_LOCAL_WRITE)) ||
+	    ((mr_access_flags & IB_ACCESS_REMOTE_ATOMIC) &&
+	     !(mr_access_flags & IB_ACCESS_LOCAL_WRITE))) {
+		/*
+		 * Remote Write Access requires Local Write Access
+		 * Remote Atomic Access requires Local Write Access
+		 */
+		ehca_err(pd->device, "bad input values: mr_access_flags=%x",
+			 mr_access_flags);
+		ib_mr = ERR_PTR(-EINVAL);
+		goto reg_user_mr_exit0;
+	}
+
+	if (length == 0 || virt + length < virt) {
+		ehca_err(pd->device, "bad input values: length=%llx "
+			 "virt_base=%llx", length, virt);
+		ib_mr = ERR_PTR(-EINVAL);
+		goto reg_user_mr_exit0;
+	}
+
+	e_mr = ehca_mr_new();
+	if (!e_mr) {
+		ehca_err(pd->device, "out of memory");
+		ib_mr = ERR_PTR(-ENOMEM);
+		goto reg_user_mr_exit0;
+	}
+
+	e_mr->umem = ib_umem_get(pd->uobject->context, start, length,
+				 mr_access_flags, 0);
+	if (IS_ERR(e_mr->umem)) {
+		ib_mr = (void *)e_mr->umem;
+		goto reg_user_mr_exit1;
+	}
+
+	if (e_mr->umem->page_size != PAGE_SIZE) {
+		ehca_err(pd->device, "page size not supported, "
+			 "e_mr->umem->page_size=%x", e_mr->umem->page_size);
+		ib_mr = ERR_PTR(-EINVAL);
+		goto reg_user_mr_exit2;
+	}
+
+	/* determine number of MR pages */
+	num_kpages = NUM_CHUNKS((virt % PAGE_SIZE) + length, PAGE_SIZE);
+	/* select proper hw_pgsize */
+	page_shift = PAGE_SHIFT;
+	if (e_mr->umem->hugetlb) {
+		/* determine page_shift, clamp between 4K and 16M */
+		page_shift = (fls64(length - 1) + 3) & ~3;
+		page_shift = min(max(page_shift, EHCA_MR_PGSHIFT4K),
+				 EHCA_MR_PGSHIFT16M);
+	}
+	hwpage_size = 1UL << page_shift;
+
+	/* now that we have the desired page size, shift until it's
+	 * supported, too. 4K is always supported, so this terminates.
+	 */
+	while (!(hwpage_size & shca->hca_cap_mr_pgsize))
+		hwpage_size >>= 4;
+
+reg_user_mr_fallback:
+	num_hwpages = NUM_CHUNKS((virt % hwpage_size) + length, hwpage_size);
+	/* register MR on HCA */
+	memset(&pginfo, 0, sizeof(pginfo));
+	pginfo.type = EHCA_MR_PGI_USER;
+	pginfo.hwpage_size = hwpage_size;
+	pginfo.num_kpages = num_kpages;
+	pginfo.num_hwpages = num_hwpages;
+	pginfo.u.usr.region = e_mr->umem;
+	pginfo.next_hwpage = e_mr->umem->offset / hwpage_size;
+	pginfo.u.usr.next_sg = pginfo.u.usr.region->sg_head.sgl;
+	ret = ehca_reg_mr(shca, e_mr, (u64 *)virt, length, mr_access_flags,
+			  e_pd, &pginfo, &e_mr->ib.ib_mr.lkey,
+			  &e_mr->ib.ib_mr.rkey, EHCA_REG_MR);
+	if (ret == -EINVAL && pginfo.hwpage_size > PAGE_SIZE) {
+		ehca_warn(pd->device, "failed to register mr "
+			  "with hwpage_size=%llx", hwpage_size);
+		ehca_info(pd->device, "try to register mr with "
+			  "kpage_size=%lx", PAGE_SIZE);
+		/*
+		 * this means kpages are not contiguous for a hw page
+		 * try kernel page size as fallback solution
+		 */
+		hwpage_size = PAGE_SIZE;
+		goto reg_user_mr_fallback;
+	}
+	if (ret) {
+		ib_mr = ERR_PTR(ret);
+		goto reg_user_mr_exit2;
+	}
+
+	/* successful registration of all pages */
+	return &e_mr->ib.ib_mr;
+
+reg_user_mr_exit2:
+	ib_umem_release(e_mr->umem);
+reg_user_mr_exit1:
+	ehca_mr_delete(e_mr);
+reg_user_mr_exit0:
+	if (IS_ERR(ib_mr))
+		ehca_err(pd->device, "rc=%li pd=%p mr_access_flags=%x udata=%p",
+			 PTR_ERR(ib_mr), pd, mr_access_flags, udata);
+	return ib_mr;
+} /* end ehca_reg_user_mr() */
+
+/*----------------------------------------------------------------------*/
+
+int ehca_rereg_phys_mr(struct ib_mr *mr,
+		       int mr_rereg_mask,
+		       struct ib_pd *pd,
+		       struct ib_phys_buf *phys_buf_array,
+		       int num_phys_buf,
+		       int mr_access_flags,
+		       u64 *iova_start)
+{
+	int ret;
+
+	struct ehca_shca *shca =
+		container_of(mr->device, struct ehca_shca, ib_device);
+	struct ehca_mr *e_mr = container_of(mr, struct ehca_mr, ib.ib_mr);
+	u64 new_size;
+	u64 *new_start;
+	u32 new_acl;
+	struct ehca_pd *new_pd;
+	u32 tmp_lkey, tmp_rkey;
+	unsigned long sl_flags;
+	u32 num_kpages = 0;
+	u32 num_hwpages = 0;
+	struct ehca_mr_pginfo pginfo;
+
+	if (!(mr_rereg_mask & IB_MR_REREG_TRANS)) {
+		/* TODO not supported, because PHYP rereg hCall needs pages */
+		ehca_err(mr->device, "rereg without IB_MR_REREG_TRANS not "
+			 "supported yet, mr_rereg_mask=%x", mr_rereg_mask);
+		ret = -EINVAL;
+		goto rereg_phys_mr_exit0;
+	}
+
+	if (mr_rereg_mask & IB_MR_REREG_PD) {
+		if (!pd) {
+			ehca_err(mr->device, "rereg with bad pd, pd=%p "
+				 "mr_rereg_mask=%x", pd, mr_rereg_mask);
+			ret = -EINVAL;
+			goto rereg_phys_mr_exit0;
+		}
+	}
+
+	if ((mr_rereg_mask &
+	     ~(IB_MR_REREG_TRANS | IB_MR_REREG_PD | IB_MR_REREG_ACCESS)) ||
+	    (mr_rereg_mask == 0)) {
+		ret = -EINVAL;
+		goto rereg_phys_mr_exit0;
+	}
+
+	/* check other parameters */
+	if (e_mr == shca->maxmr) {
+		/* should be impossible, however reject to be sure */
+		ehca_err(mr->device, "rereg internal max-MR impossible, mr=%p "
+			 "shca->maxmr=%p mr->lkey=%x",
+			 mr, shca->maxmr, mr->lkey);
+		ret = -EINVAL;
+		goto rereg_phys_mr_exit0;
+	}
+	if (mr_rereg_mask & IB_MR_REREG_TRANS) { /* transl., i.e. addr/size */
+		if (e_mr->flags & EHCA_MR_FLAG_FMR) {
+			ehca_err(mr->device, "not supported for FMR, mr=%p "
+				 "flags=%x", mr, e_mr->flags);
+			ret = -EINVAL;
+			goto rereg_phys_mr_exit0;
+		}
+		if (!phys_buf_array || num_phys_buf <= 0) {
+			ehca_err(mr->device, "bad input values mr_rereg_mask=%x"
+				 " phys_buf_array=%p num_phys_buf=%x",
+				 mr_rereg_mask, phys_buf_array, num_phys_buf);
+			ret = -EINVAL;
+			goto rereg_phys_mr_exit0;
+		}
+	}
+	if ((mr_rereg_mask & IB_MR_REREG_ACCESS) &&	/* change ACL */
+	    (((mr_access_flags & IB_ACCESS_REMOTE_WRITE) &&
+	      !(mr_access_flags & IB_ACCESS_LOCAL_WRITE)) ||
+	     ((mr_access_flags & IB_ACCESS_REMOTE_ATOMIC) &&
+	      !(mr_access_flags & IB_ACCESS_LOCAL_WRITE)))) {
+		/*
+		 * Remote Write Access requires Local Write Access
+		 * Remote Atomic Access requires Local Write Access
+		 */
+		ehca_err(mr->device, "bad input values: mr_rereg_mask=%x "
+			 "mr_access_flags=%x", mr_rereg_mask, mr_access_flags);
+		ret = -EINVAL;
+		goto rereg_phys_mr_exit0;
+	}
+
+	/* set requested values dependent on rereg request */
+	spin_lock_irqsave(&e_mr->mrlock, sl_flags);
+	new_start = e_mr->start;
+	new_size = e_mr->size;
+	new_acl = e_mr->acl;
+	new_pd = container_of(mr->pd, struct ehca_pd, ib_pd);
+
+	if (mr_rereg_mask & IB_MR_REREG_TRANS) {
+		u64 hw_pgsize = ehca_get_max_hwpage_size(shca);
+
+		new_start = iova_start;	/* change address */
+		/* check physical buffer list and calculate size */
+		ret = ehca_mr_chk_buf_and_calc_size(phys_buf_array,
+						    num_phys_buf, iova_start,
+						    &new_size);
+		if (ret)
+			goto rereg_phys_mr_exit1;
+		if ((new_size == 0) ||
+		    (((u64)iova_start + new_size) < (u64)iova_start)) {
+			ehca_err(mr->device, "bad input values: new_size=%llx "
+				 "iova_start=%p", new_size, iova_start);
+			ret = -EINVAL;
+			goto rereg_phys_mr_exit1;
+		}
+		num_kpages = NUM_CHUNKS(((u64)new_start % PAGE_SIZE) +
+					new_size, PAGE_SIZE);
+		num_hwpages = NUM_CHUNKS(((u64)new_start % hw_pgsize) +
+					 new_size, hw_pgsize);
+		memset(&pginfo, 0, sizeof(pginfo));
+		pginfo.type = EHCA_MR_PGI_PHYS;
+		pginfo.num_kpages = num_kpages;
+		pginfo.hwpage_size = hw_pgsize;
+		pginfo.num_hwpages = num_hwpages;
+		pginfo.u.phy.num_phys_buf = num_phys_buf;
+		pginfo.u.phy.phys_buf_array = phys_buf_array;
+		pginfo.next_hwpage =
+			((u64)iova_start & ~PAGE_MASK) / hw_pgsize;
+	}
+	if (mr_rereg_mask & IB_MR_REREG_ACCESS)
+		new_acl = mr_access_flags;
+	if (mr_rereg_mask & IB_MR_REREG_PD)
+		new_pd = container_of(pd, struct ehca_pd, ib_pd);
+
+	ret = ehca_rereg_mr(shca, e_mr, new_start, new_size, new_acl,
+			    new_pd, &pginfo, &tmp_lkey, &tmp_rkey);
+	if (ret)
+		goto rereg_phys_mr_exit1;
+
+	/* successful reregistration */
+	if (mr_rereg_mask & IB_MR_REREG_PD)
+		mr->pd = pd;
+	mr->lkey = tmp_lkey;
+	mr->rkey = tmp_rkey;
+
+rereg_phys_mr_exit1:
+	spin_unlock_irqrestore(&e_mr->mrlock, sl_flags);
+rereg_phys_mr_exit0:
+	if (ret)
+		ehca_err(mr->device, "ret=%i mr=%p mr_rereg_mask=%x pd=%p "
+			 "phys_buf_array=%p num_phys_buf=%x mr_access_flags=%x "
+			 "iova_start=%p",
+			 ret, mr, mr_rereg_mask, pd, phys_buf_array,
+			 num_phys_buf, mr_access_flags, iova_start);
+	return ret;
+} /* end ehca_rereg_phys_mr() */
+
+/*----------------------------------------------------------------------*/
+
+int ehca_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr)
+{
+	int ret = 0;
+	u64 h_ret;
+	struct ehca_shca *shca =
+		container_of(mr->device, struct ehca_shca, ib_device);
+	struct ehca_mr *e_mr = container_of(mr, struct ehca_mr, ib.ib_mr);
+	unsigned long sl_flags;
+	struct ehca_mr_hipzout_parms hipzout;
+
+	if ((e_mr->flags & EHCA_MR_FLAG_FMR)) {
+		ehca_err(mr->device, "not supported for FMR, mr=%p e_mr=%p "
+			 "e_mr->flags=%x", mr, e_mr, e_mr->flags);
+		ret = -EINVAL;
+		goto query_mr_exit0;
+	}
+
+	memset(mr_attr, 0, sizeof(struct ib_mr_attr));
+	spin_lock_irqsave(&e_mr->mrlock, sl_flags);
+
+	h_ret = hipz_h_query_mr(shca->ipz_hca_handle, e_mr, &hipzout);
+	if (h_ret != H_SUCCESS) {
+		ehca_err(mr->device, "hipz_mr_query failed, h_ret=%lli mr=%p "
+			 "hca_hndl=%llx mr_hndl=%llx lkey=%x",
+			 h_ret, mr, shca->ipz_hca_handle.handle,
+			 e_mr->ipz_mr_handle.handle, mr->lkey);
+		ret = ehca2ib_return_code(h_ret);
+		goto query_mr_exit1;
+	}
+	mr_attr->pd = mr->pd;
+	mr_attr->device_virt_addr = hipzout.vaddr;
+	mr_attr->size = hipzout.len;
+	mr_attr->lkey = hipzout.lkey;
+	mr_attr->rkey = hipzout.rkey;
+	ehca_mrmw_reverse_map_acl(&hipzout.acl, &mr_attr->mr_access_flags);
+
+query_mr_exit1:
+	spin_unlock_irqrestore(&e_mr->mrlock, sl_flags);
+query_mr_exit0:
+	if (ret)
+		ehca_err(mr->device, "ret=%i mr=%p mr_attr=%p",
+			 ret, mr, mr_attr);
+	return ret;
+} /* end ehca_query_mr() */
+
+/*----------------------------------------------------------------------*/
+
+int ehca_dereg_mr(struct ib_mr *mr)
+{
+	int ret = 0;
+	u64 h_ret;
+	struct ehca_shca *shca =
+		container_of(mr->device, struct ehca_shca, ib_device);
+	struct ehca_mr *e_mr = container_of(mr, struct ehca_mr, ib.ib_mr);
+
+	if ((e_mr->flags & EHCA_MR_FLAG_FMR)) {
+		ehca_err(mr->device, "not supported for FMR, mr=%p e_mr=%p "
+			 "e_mr->flags=%x", mr, e_mr, e_mr->flags);
+		ret = -EINVAL;
+		goto dereg_mr_exit0;
+	} else if (e_mr == shca->maxmr) {
+		/* should be impossible, however reject to be sure */
+		ehca_err(mr->device, "dereg internal max-MR impossible, mr=%p "
+			 "shca->maxmr=%p mr->lkey=%x",
+			 mr, shca->maxmr, mr->lkey);
+		ret = -EINVAL;
+		goto dereg_mr_exit0;
+	}
+
+	/* TODO: BUSY: MR still has bound window(s) */
+	h_ret = hipz_h_free_resource_mr(shca->ipz_hca_handle, e_mr);
+	if (h_ret != H_SUCCESS) {
+		ehca_err(mr->device, "hipz_free_mr failed, h_ret=%lli shca=%p "
+			 "e_mr=%p hca_hndl=%llx mr_hndl=%llx mr->lkey=%x",
+			 h_ret, shca, e_mr, shca->ipz_hca_handle.handle,
+			 e_mr->ipz_mr_handle.handle, mr->lkey);
+		ret = ehca2ib_return_code(h_ret);
+		goto dereg_mr_exit0;
+	}
+
+	if (e_mr->umem)
+		ib_umem_release(e_mr->umem);
+
+	/* successful deregistration */
+	ehca_mr_delete(e_mr);
+
+dereg_mr_exit0:
+	if (ret)
+		ehca_err(mr->device, "ret=%i mr=%p", ret, mr);
+	return ret;
+} /* end ehca_dereg_mr() */
+
+/*----------------------------------------------------------------------*/
+
+struct ib_mw *ehca_alloc_mw(struct ib_pd *pd, enum ib_mw_type type)
+{
+	struct ib_mw *ib_mw;
+	u64 h_ret;
+	struct ehca_mw *e_mw;
+	struct ehca_pd *e_pd = container_of(pd, struct ehca_pd, ib_pd);
+	struct ehca_shca *shca =
+		container_of(pd->device, struct ehca_shca, ib_device);
+	struct ehca_mw_hipzout_parms hipzout;
+
+	if (type != IB_MW_TYPE_1)
+		return ERR_PTR(-EINVAL);
+
+	e_mw = ehca_mw_new();
+	if (!e_mw) {
+		ib_mw = ERR_PTR(-ENOMEM);
+		goto alloc_mw_exit0;
+	}
+
+	h_ret = hipz_h_alloc_resource_mw(shca->ipz_hca_handle, e_mw,
+					 e_pd->fw_pd, &hipzout);
+	if (h_ret != H_SUCCESS) {
+		ehca_err(pd->device, "hipz_mw_allocate failed, h_ret=%lli "
+			 "shca=%p hca_hndl=%llx mw=%p",
+			 h_ret, shca, shca->ipz_hca_handle.handle, e_mw);
+		ib_mw = ERR_PTR(ehca2ib_return_code(h_ret));
+		goto alloc_mw_exit1;
+	}
+	/* successful MW allocation */
+	e_mw->ipz_mw_handle = hipzout.handle;
+	e_mw->ib_mw.rkey    = hipzout.rkey;
+	return &e_mw->ib_mw;
+
+alloc_mw_exit1:
+	ehca_mw_delete(e_mw);
+alloc_mw_exit0:
+	if (IS_ERR(ib_mw))
+		ehca_err(pd->device, "h_ret=%li pd=%p", PTR_ERR(ib_mw), pd);
+	return ib_mw;
+} /* end ehca_alloc_mw() */
+
+/*----------------------------------------------------------------------*/
+
+int ehca_bind_mw(struct ib_qp *qp,
+		 struct ib_mw *mw,
+		 struct ib_mw_bind *mw_bind)
+{
+	/* TODO: not supported up to now */
+	ehca_gen_err("bind MW currently not supported by HCAD");
+
+	return -EPERM;
+} /* end ehca_bind_mw() */
+
+/*----------------------------------------------------------------------*/
+
+int ehca_dealloc_mw(struct ib_mw *mw)
+{
+	u64 h_ret;
+	struct ehca_shca *shca =
+		container_of(mw->device, struct ehca_shca, ib_device);
+	struct ehca_mw *e_mw = container_of(mw, struct ehca_mw, ib_mw);
+
+	h_ret = hipz_h_free_resource_mw(shca->ipz_hca_handle, e_mw);
+	if (h_ret != H_SUCCESS) {
+		ehca_err(mw->device, "hipz_free_mw failed, h_ret=%lli shca=%p "
+			 "mw=%p rkey=%x hca_hndl=%llx mw_hndl=%llx",
+			 h_ret, shca, mw, mw->rkey, shca->ipz_hca_handle.handle,
+			 e_mw->ipz_mw_handle.handle);
+		return ehca2ib_return_code(h_ret);
+	}
+	/* successful deallocation */
+	ehca_mw_delete(e_mw);
+	return 0;
+} /* end ehca_dealloc_mw() */
+
+/*----------------------------------------------------------------------*/
+
+struct ib_fmr *ehca_alloc_fmr(struct ib_pd *pd,
+			      int mr_access_flags,
+			      struct ib_fmr_attr *fmr_attr)
+{
+	struct ib_fmr *ib_fmr;
+	struct ehca_shca *shca =
+		container_of(pd->device, struct ehca_shca, ib_device);
+	struct ehca_pd *e_pd = container_of(pd, struct ehca_pd, ib_pd);
+	struct ehca_mr *e_fmr;
+	int ret;
+	u32 tmp_lkey, tmp_rkey;
+	struct ehca_mr_pginfo pginfo;
+	u64 hw_pgsize;
+
+	/* check other parameters */
+	if (((mr_access_flags & IB_ACCESS_REMOTE_WRITE) &&
+	     !(mr_access_flags & IB_ACCESS_LOCAL_WRITE)) ||
+	    ((mr_access_flags & IB_ACCESS_REMOTE_ATOMIC) &&
+	     !(mr_access_flags & IB_ACCESS_LOCAL_WRITE))) {
+		/*
+		 * Remote Write Access requires Local Write Access
+		 * Remote Atomic Access requires Local Write Access
+		 */
+		ehca_err(pd->device, "bad input values: mr_access_flags=%x",
+			 mr_access_flags);
+		ib_fmr = ERR_PTR(-EINVAL);
+		goto alloc_fmr_exit0;
+	}
+	if (mr_access_flags & IB_ACCESS_MW_BIND) {
+		ehca_err(pd->device, "bad input values: mr_access_flags=%x",
+			 mr_access_flags);
+		ib_fmr = ERR_PTR(-EINVAL);
+		goto alloc_fmr_exit0;
+	}
+	if ((fmr_attr->max_pages == 0) || (fmr_attr->max_maps == 0)) {
+		ehca_err(pd->device, "bad input values: fmr_attr->max_pages=%x "
+			 "fmr_attr->max_maps=%x fmr_attr->page_shift=%x",
+			 fmr_attr->max_pages, fmr_attr->max_maps,
+			 fmr_attr->page_shift);
+		ib_fmr = ERR_PTR(-EINVAL);
+		goto alloc_fmr_exit0;
+	}
+
+	hw_pgsize = 1 << fmr_attr->page_shift;
+	if (!(hw_pgsize & shca->hca_cap_mr_pgsize)) {
+		ehca_err(pd->device, "unsupported fmr_attr->page_shift=%x",
+			 fmr_attr->page_shift);
+		ib_fmr = ERR_PTR(-EINVAL);
+		goto alloc_fmr_exit0;
+	}
+
+	e_fmr = ehca_mr_new();
+	if (!e_fmr) {
+		ib_fmr = ERR_PTR(-ENOMEM);
+		goto alloc_fmr_exit0;
+	}
+	e_fmr->flags |= EHCA_MR_FLAG_FMR;
+
+	/* register MR on HCA */
+	memset(&pginfo, 0, sizeof(pginfo));
+	pginfo.hwpage_size = hw_pgsize;
+	/*
+	 * pginfo.num_hwpages==0, ie register_rpages() will not be called
+	 * but deferred to map_phys_fmr()
+	 */
+	ret = ehca_reg_mr(shca, e_fmr, NULL,
+			  fmr_attr->max_pages * (1 << fmr_attr->page_shift),
+			  mr_access_flags, e_pd, &pginfo,
+			  &tmp_lkey, &tmp_rkey, EHCA_REG_MR);
+	if (ret) {
+		ib_fmr = ERR_PTR(ret);
+		goto alloc_fmr_exit1;
+	}
+
+	/* successful */
+	e_fmr->hwpage_size = hw_pgsize;
+	e_fmr->fmr_page_size = 1 << fmr_attr->page_shift;
+	e_fmr->fmr_max_pages = fmr_attr->max_pages;
+	e_fmr->fmr_max_maps = fmr_attr->max_maps;
+	e_fmr->fmr_map_cnt = 0;
+	return &e_fmr->ib.ib_fmr;
+
+alloc_fmr_exit1:
+	ehca_mr_delete(e_fmr);
+alloc_fmr_exit0:
+	return ib_fmr;
+} /* end ehca_alloc_fmr() */
+
+/*----------------------------------------------------------------------*/
+
+int ehca_map_phys_fmr(struct ib_fmr *fmr,
+		      u64 *page_list,
+		      int list_len,
+		      u64 iova)
+{
+	int ret;
+	struct ehca_shca *shca =
+		container_of(fmr->device, struct ehca_shca, ib_device);
+	struct ehca_mr *e_fmr = container_of(fmr, struct ehca_mr, ib.ib_fmr);
+	struct ehca_pd *e_pd = container_of(fmr->pd, struct ehca_pd, ib_pd);
+	struct ehca_mr_pginfo pginfo;
+	u32 tmp_lkey, tmp_rkey;
+
+	if (!(e_fmr->flags & EHCA_MR_FLAG_FMR)) {
+		ehca_err(fmr->device, "not a FMR, e_fmr=%p e_fmr->flags=%x",
+			 e_fmr, e_fmr->flags);
+		ret = -EINVAL;
+		goto map_phys_fmr_exit0;
+	}
+	ret = ehca_fmr_check_page_list(e_fmr, page_list, list_len);
+	if (ret)
+		goto map_phys_fmr_exit0;
+	if (iova % e_fmr->fmr_page_size) {
+		/* only whole-numbered pages */
+		ehca_err(fmr->device, "bad iova, iova=%llx fmr_page_size=%x",
+			 iova, e_fmr->fmr_page_size);
+		ret = -EINVAL;
+		goto map_phys_fmr_exit0;
+	}
+	if (e_fmr->fmr_map_cnt >= e_fmr->fmr_max_maps) {
+		/* HCAD does not limit the maps, however trace this anyway */
+		ehca_info(fmr->device, "map limit exceeded, fmr=%p "
+			  "e_fmr->fmr_map_cnt=%x e_fmr->fmr_max_maps=%x",
+			  fmr, e_fmr->fmr_map_cnt, e_fmr->fmr_max_maps);
+	}
+
+	memset(&pginfo, 0, sizeof(pginfo));
+	pginfo.type = EHCA_MR_PGI_FMR;
+	pginfo.num_kpages = list_len;
+	pginfo.hwpage_size = e_fmr->hwpage_size;
+	pginfo.num_hwpages =
+		list_len * e_fmr->fmr_page_size / pginfo.hwpage_size;
+	pginfo.u.fmr.page_list = page_list;
+	pginfo.next_hwpage =
+		(iova & (e_fmr->fmr_page_size-1)) / pginfo.hwpage_size;
+	pginfo.u.fmr.fmr_pgsize = e_fmr->fmr_page_size;
+
+	ret = ehca_rereg_mr(shca, e_fmr, (u64 *)iova,
+			    list_len * e_fmr->fmr_page_size,
+			    e_fmr->acl, e_pd, &pginfo, &tmp_lkey, &tmp_rkey);
+	if (ret)
+		goto map_phys_fmr_exit0;
+
+	/* successful reregistration */
+	e_fmr->fmr_map_cnt++;
+	e_fmr->ib.ib_fmr.lkey = tmp_lkey;
+	e_fmr->ib.ib_fmr.rkey = tmp_rkey;
+	return 0;
+
+map_phys_fmr_exit0:
+	if (ret)
+		ehca_err(fmr->device, "ret=%i fmr=%p page_list=%p list_len=%x "
+			 "iova=%llx", ret, fmr, page_list, list_len, iova);
+	return ret;
+} /* end ehca_map_phys_fmr() */
+
+/*----------------------------------------------------------------------*/
+
+int ehca_unmap_fmr(struct list_head *fmr_list)
+{
+	int ret = 0;
+	struct ib_fmr *ib_fmr;
+	struct ehca_shca *shca = NULL;
+	struct ehca_shca *prev_shca;
+	struct ehca_mr *e_fmr;
+	u32 num_fmr = 0;
+	u32 unmap_fmr_cnt = 0;
+
+	/* check all FMR belong to same SHCA, and check internal flag */
+	list_for_each_entry(ib_fmr, fmr_list, list) {
+		prev_shca = shca;
+		shca = container_of(ib_fmr->device, struct ehca_shca,
+				    ib_device);
+		e_fmr = container_of(ib_fmr, struct ehca_mr, ib.ib_fmr);
+		if ((shca != prev_shca) && prev_shca) {
+			ehca_err(&shca->ib_device, "SHCA mismatch, shca=%p "
+				 "prev_shca=%p e_fmr=%p",
+				 shca, prev_shca, e_fmr);
+			ret = -EINVAL;
+			goto unmap_fmr_exit0;
+		}
+		if (!(e_fmr->flags & EHCA_MR_FLAG_FMR)) {
+			ehca_err(&shca->ib_device, "not a FMR, e_fmr=%p "
+				 "e_fmr->flags=%x", e_fmr, e_fmr->flags);
+			ret = -EINVAL;
+			goto unmap_fmr_exit0;
+		}
+		num_fmr++;
+	}
+
+	/* loop over all FMRs to unmap */
+	list_for_each_entry(ib_fmr, fmr_list, list) {
+		unmap_fmr_cnt++;
+		e_fmr = container_of(ib_fmr, struct ehca_mr, ib.ib_fmr);
+		shca = container_of(ib_fmr->device, struct ehca_shca,
+				    ib_device);
+		ret = ehca_unmap_one_fmr(shca, e_fmr);
+		if (ret) {
+			/* unmap failed, stop unmapping of rest of FMRs */
+			ehca_err(&shca->ib_device, "unmap of one FMR failed, "
+				 "stop rest, e_fmr=%p num_fmr=%x "
+				 "unmap_fmr_cnt=%x lkey=%x", e_fmr, num_fmr,
+				 unmap_fmr_cnt, e_fmr->ib.ib_fmr.lkey);
+			goto unmap_fmr_exit0;
+		}
+	}
+
+unmap_fmr_exit0:
+	if (ret)
+		ehca_gen_err("ret=%i fmr_list=%p num_fmr=%x unmap_fmr_cnt=%x",
+			     ret, fmr_list, num_fmr, unmap_fmr_cnt);
+	return ret;
+} /* end ehca_unmap_fmr() */
+
+/*----------------------------------------------------------------------*/
+
+int ehca_dealloc_fmr(struct ib_fmr *fmr)
+{
+	int ret;
+	u64 h_ret;
+	struct ehca_shca *shca =
+		container_of(fmr->device, struct ehca_shca, ib_device);
+	struct ehca_mr *e_fmr = container_of(fmr, struct ehca_mr, ib.ib_fmr);
+
+	if (!(e_fmr->flags & EHCA_MR_FLAG_FMR)) {
+		ehca_err(fmr->device, "not a FMR, e_fmr=%p e_fmr->flags=%x",
+			 e_fmr, e_fmr->flags);
+		ret = -EINVAL;
+		goto free_fmr_exit0;
+	}
+
+	h_ret = hipz_h_free_resource_mr(shca->ipz_hca_handle, e_fmr);
+	if (h_ret != H_SUCCESS) {
+		ehca_err(fmr->device, "hipz_free_mr failed, h_ret=%lli e_fmr=%p "
+			 "hca_hndl=%llx fmr_hndl=%llx fmr->lkey=%x",
+			 h_ret, e_fmr, shca->ipz_hca_handle.handle,
+			 e_fmr->ipz_mr_handle.handle, fmr->lkey);
+		ret = ehca2ib_return_code(h_ret);
+		goto free_fmr_exit0;
+	}
+	/* successful deregistration */
+	ehca_mr_delete(e_fmr);
+	return 0;
+
+free_fmr_exit0:
+	if (ret)
+		ehca_err(&shca->ib_device, "ret=%i fmr=%p", ret, fmr);
+	return ret;
+} /* end ehca_dealloc_fmr() */
+
+/*----------------------------------------------------------------------*/
+
+static int ehca_reg_bmap_mr_rpages(struct ehca_shca *shca,
+				   struct ehca_mr *e_mr,
+				   struct ehca_mr_pginfo *pginfo);
+
+int ehca_reg_mr(struct ehca_shca *shca,
+		struct ehca_mr *e_mr,
+		u64 *iova_start,
+		u64 size,
+		int acl,
+		struct ehca_pd *e_pd,
+		struct ehca_mr_pginfo *pginfo,
+		u32 *lkey, /*OUT*/
+		u32 *rkey, /*OUT*/
+		enum ehca_reg_type reg_type)
+{
+	int ret;
+	u64 h_ret;
+	u32 hipz_acl;
+	struct ehca_mr_hipzout_parms hipzout;
+
+	ehca_mrmw_map_acl(acl, &hipz_acl);
+	ehca_mrmw_set_pgsize_hipz_acl(pginfo->hwpage_size, &hipz_acl);
+	if (ehca_use_hp_mr == 1)
+		hipz_acl |= 0x00000001;
+
+	h_ret = hipz_h_alloc_resource_mr(shca->ipz_hca_handle, e_mr,
+					 (u64)iova_start, size, hipz_acl,
+					 e_pd->fw_pd, &hipzout);
+	if (h_ret != H_SUCCESS) {
+		ehca_err(&shca->ib_device, "hipz_alloc_mr failed, h_ret=%lli "
+			 "hca_hndl=%llx", h_ret, shca->ipz_hca_handle.handle);
+		ret = ehca2ib_return_code(h_ret);
+		goto ehca_reg_mr_exit0;
+	}
+
+	e_mr->ipz_mr_handle = hipzout.handle;
+
+	if (reg_type == EHCA_REG_BUSMAP_MR)
+		ret = ehca_reg_bmap_mr_rpages(shca, e_mr, pginfo);
+	else if (reg_type == EHCA_REG_MR)
+		ret = ehca_reg_mr_rpages(shca, e_mr, pginfo);
+	else
+		ret = -EINVAL;
+
+	if (ret)
+		goto ehca_reg_mr_exit1;
+
+	/* successful registration */
+	e_mr->num_kpages = pginfo->num_kpages;
+	e_mr->num_hwpages = pginfo->num_hwpages;
+	e_mr->hwpage_size = pginfo->hwpage_size;
+	e_mr->start = iova_start;
+	e_mr->size = size;
+	e_mr->acl = acl;
+	*lkey = hipzout.lkey;
+	*rkey = hipzout.rkey;
+	return 0;
+
+ehca_reg_mr_exit1:
+	h_ret = hipz_h_free_resource_mr(shca->ipz_hca_handle, e_mr);
+	if (h_ret != H_SUCCESS) {
+		ehca_err(&shca->ib_device, "h_ret=%lli shca=%p e_mr=%p "
+			 "iova_start=%p size=%llx acl=%x e_pd=%p lkey=%x "
+			 "pginfo=%p num_kpages=%llx num_hwpages=%llx ret=%i",
+			 h_ret, shca, e_mr, iova_start, size, acl, e_pd,
+			 hipzout.lkey, pginfo, pginfo->num_kpages,
+			 pginfo->num_hwpages, ret);
+		ehca_err(&shca->ib_device, "internal error in ehca_reg_mr, "
+			 "not recoverable");
+	}
+ehca_reg_mr_exit0:
+	if (ret)
+		ehca_err(&shca->ib_device, "ret=%i shca=%p e_mr=%p "
+			 "iova_start=%p size=%llx acl=%x e_pd=%p pginfo=%p "
+			 "num_kpages=%llx num_hwpages=%llx",
+			 ret, shca, e_mr, iova_start, size, acl, e_pd, pginfo,
+			 pginfo->num_kpages, pginfo->num_hwpages);
+	return ret;
+} /* end ehca_reg_mr() */
+
+/*----------------------------------------------------------------------*/
+
+int ehca_reg_mr_rpages(struct ehca_shca *shca,
+		       struct ehca_mr *e_mr,
+		       struct ehca_mr_pginfo *pginfo)
+{
+	int ret = 0;
+	u64 h_ret;
+	u32 rnum;
+	u64 rpage;
+	u32 i;
+	u64 *kpage;
+
+	if (!pginfo->num_hwpages) /* in case of fmr */
+		return 0;
+
+	kpage = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+	if (!kpage) {
+		ehca_err(&shca->ib_device, "kpage alloc failed");
+		ret = -ENOMEM;
+		goto ehca_reg_mr_rpages_exit0;
+	}
+
+	/* max MAX_RPAGES ehca mr pages per register call */
+	for (i = 0; i < NUM_CHUNKS(pginfo->num_hwpages, MAX_RPAGES); i++) {
+
+		if (i == NUM_CHUNKS(pginfo->num_hwpages, MAX_RPAGES) - 1) {
+			rnum = pginfo->num_hwpages % MAX_RPAGES; /* last shot */
+			if (rnum == 0)
+				rnum = MAX_RPAGES;      /* last shot is full */
+		} else
+			rnum = MAX_RPAGES;
+
+		ret = ehca_set_pagebuf(pginfo, rnum, kpage);
+		if (ret) {
+			ehca_err(&shca->ib_device, "ehca_set_pagebuf "
+				 "bad rc, ret=%i rnum=%x kpage=%p",
+				 ret, rnum, kpage);
+			goto ehca_reg_mr_rpages_exit1;
+		}
+
+		if (rnum > 1) {
+			rpage = __pa(kpage);
+			if (!rpage) {
+				ehca_err(&shca->ib_device, "kpage=%p i=%x",
+					 kpage, i);
+				ret = -EFAULT;
+				goto ehca_reg_mr_rpages_exit1;
+			}
+		} else
+			rpage = *kpage;
+
+		h_ret = hipz_h_register_rpage_mr(
+			shca->ipz_hca_handle, e_mr,
+			ehca_encode_hwpage_size(pginfo->hwpage_size),
+			0, rpage, rnum);
+
+		if (i == NUM_CHUNKS(pginfo->num_hwpages, MAX_RPAGES) - 1) {
+			/*
+			 * check for 'registration complete'==H_SUCCESS
+			 * and for 'page registered'==H_PAGE_REGISTERED
+			 */
+			if (h_ret != H_SUCCESS) {
+				ehca_err(&shca->ib_device, "last "
+					 "hipz_reg_rpage_mr failed, h_ret=%lli "
+					 "e_mr=%p i=%x hca_hndl=%llx mr_hndl=%llx"
+					 " lkey=%x", h_ret, e_mr, i,
+					 shca->ipz_hca_handle.handle,
+					 e_mr->ipz_mr_handle.handle,
+					 e_mr->ib.ib_mr.lkey);
+				ret = ehca2ib_return_code(h_ret);
+				break;
+			} else
+				ret = 0;
+		} else if (h_ret != H_PAGE_REGISTERED) {
+			ehca_err(&shca->ib_device, "hipz_reg_rpage_mr failed, "
+				 "h_ret=%lli e_mr=%p i=%x lkey=%x hca_hndl=%llx "
+				 "mr_hndl=%llx", h_ret, e_mr, i,
+				 e_mr->ib.ib_mr.lkey,
+				 shca->ipz_hca_handle.handle,
+				 e_mr->ipz_mr_handle.handle);
+			ret = ehca2ib_return_code(h_ret);
+			break;
+		} else
+			ret = 0;
+	} /* end for(i) */
+
+
+ehca_reg_mr_rpages_exit1:
+	ehca_free_fw_ctrlblock(kpage);
+ehca_reg_mr_rpages_exit0:
+	if (ret)
+		ehca_err(&shca->ib_device, "ret=%i shca=%p e_mr=%p pginfo=%p "
+			 "num_kpages=%llx num_hwpages=%llx", ret, shca, e_mr,
+			 pginfo, pginfo->num_kpages, pginfo->num_hwpages);
+	return ret;
+} /* end ehca_reg_mr_rpages() */
+
+/*----------------------------------------------------------------------*/
+
+inline int ehca_rereg_mr_rereg1(struct ehca_shca *shca,
+				struct ehca_mr *e_mr,
+				u64 *iova_start,
+				u64 size,
+				u32 acl,
+				struct ehca_pd *e_pd,
+				struct ehca_mr_pginfo *pginfo,
+				u32 *lkey, /*OUT*/
+				u32 *rkey) /*OUT*/
+{
+	int ret;
+	u64 h_ret;
+	u32 hipz_acl;
+	u64 *kpage;
+	u64 rpage;
+	struct ehca_mr_pginfo pginfo_save;
+	struct ehca_mr_hipzout_parms hipzout;
+
+	ehca_mrmw_map_acl(acl, &hipz_acl);
+	ehca_mrmw_set_pgsize_hipz_acl(pginfo->hwpage_size, &hipz_acl);
+
+	kpage = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+	if (!kpage) {
+		ehca_err(&shca->ib_device, "kpage alloc failed");
+		ret = -ENOMEM;
+		goto ehca_rereg_mr_rereg1_exit0;
+	}
+
+	pginfo_save = *pginfo;
+	ret = ehca_set_pagebuf(pginfo, pginfo->num_hwpages, kpage);
+	if (ret) {
+		ehca_err(&shca->ib_device, "set pagebuf failed, e_mr=%p "
+			 "pginfo=%p type=%x num_kpages=%llx num_hwpages=%llx "
+			 "kpage=%p", e_mr, pginfo, pginfo->type,
+			 pginfo->num_kpages, pginfo->num_hwpages, kpage);
+		goto ehca_rereg_mr_rereg1_exit1;
+	}
+	rpage = __pa(kpage);
+	if (!rpage) {
+		ehca_err(&shca->ib_device, "kpage=%p", kpage);
+		ret = -EFAULT;
+		goto ehca_rereg_mr_rereg1_exit1;
+	}
+	h_ret = hipz_h_reregister_pmr(shca->ipz_hca_handle, e_mr,
+				      (u64)iova_start, size, hipz_acl,
+				      e_pd->fw_pd, rpage, &hipzout);
+	if (h_ret != H_SUCCESS) {
+		/*
+		 * reregistration unsuccessful, try it again with the 3 hCalls,
+		 * e.g. this is required in case H_MR_CONDITION
+		 * (MW bound or MR is shared)
+		 */
+		ehca_warn(&shca->ib_device, "hipz_h_reregister_pmr failed "
+			  "(Rereg1), h_ret=%lli e_mr=%p", h_ret, e_mr);
+		*pginfo = pginfo_save;
+		ret = -EAGAIN;
+	} else if ((u64 *)hipzout.vaddr != iova_start) {
+		ehca_err(&shca->ib_device, "PHYP changed iova_start in "
+			 "rereg_pmr, iova_start=%p iova_start_out=%llx e_mr=%p "
+			 "mr_handle=%llx lkey=%x lkey_out=%x", iova_start,
+			 hipzout.vaddr, e_mr, e_mr->ipz_mr_handle.handle,
+			 e_mr->ib.ib_mr.lkey, hipzout.lkey);
+		ret = -EFAULT;
+	} else {
+		/*
+		 * successful reregistration
+		 * note: start and start_out are identical for eServer HCAs
+		 */
+		e_mr->num_kpages = pginfo->num_kpages;
+		e_mr->num_hwpages = pginfo->num_hwpages;
+		e_mr->hwpage_size = pginfo->hwpage_size;
+		e_mr->start = iova_start;
+		e_mr->size = size;
+		e_mr->acl = acl;
+		*lkey = hipzout.lkey;
+		*rkey = hipzout.rkey;
+	}
+
+ehca_rereg_mr_rereg1_exit1:
+	ehca_free_fw_ctrlblock(kpage);
+ehca_rereg_mr_rereg1_exit0:
+	if ( ret && (ret != -EAGAIN) )
+		ehca_err(&shca->ib_device, "ret=%i lkey=%x rkey=%x "
+			 "pginfo=%p num_kpages=%llx num_hwpages=%llx",
+			 ret, *lkey, *rkey, pginfo, pginfo->num_kpages,
+			 pginfo->num_hwpages);
+	return ret;
+} /* end ehca_rereg_mr_rereg1() */
+
+/*----------------------------------------------------------------------*/
+
+int ehca_rereg_mr(struct ehca_shca *shca,
+		  struct ehca_mr *e_mr,
+		  u64 *iova_start,
+		  u64 size,
+		  int acl,
+		  struct ehca_pd *e_pd,
+		  struct ehca_mr_pginfo *pginfo,
+		  u32 *lkey,
+		  u32 *rkey)
+{
+	int ret = 0;
+	u64 h_ret;
+	int rereg_1_hcall = 1; /* 1: use hipz_h_reregister_pmr directly */
+	int rereg_3_hcall = 0; /* 1: use 3 hipz calls for reregistration */
+
+	/* first determine reregistration hCall(s) */
+	if ((pginfo->num_hwpages > MAX_RPAGES) ||
+	    (e_mr->num_hwpages > MAX_RPAGES) ||
+	    (pginfo->num_hwpages > e_mr->num_hwpages)) {
+		ehca_dbg(&shca->ib_device, "Rereg3 case, "
+			 "pginfo->num_hwpages=%llx e_mr->num_hwpages=%x",
+			 pginfo->num_hwpages, e_mr->num_hwpages);
+		rereg_1_hcall = 0;
+		rereg_3_hcall = 1;
+	}
+
+	if (e_mr->flags & EHCA_MR_FLAG_MAXMR) {	/* check for max-MR */
+		rereg_1_hcall = 0;
+		rereg_3_hcall = 1;
+		e_mr->flags &= ~EHCA_MR_FLAG_MAXMR;
+		ehca_err(&shca->ib_device, "Rereg MR for max-MR! e_mr=%p",
+			 e_mr);
+	}
+
+	if (rereg_1_hcall) {
+		ret = ehca_rereg_mr_rereg1(shca, e_mr, iova_start, size,
+					   acl, e_pd, pginfo, lkey, rkey);
+		if (ret) {
+			if (ret == -EAGAIN)
+				rereg_3_hcall = 1;
+			else
+				goto ehca_rereg_mr_exit0;
+		}
+	}
+
+	if (rereg_3_hcall) {
+		struct ehca_mr save_mr;
+
+		/* first deregister old MR */
+		h_ret = hipz_h_free_resource_mr(shca->ipz_hca_handle, e_mr);
+		if (h_ret != H_SUCCESS) {
+			ehca_err(&shca->ib_device, "hipz_free_mr failed, "
+				 "h_ret=%lli e_mr=%p hca_hndl=%llx mr_hndl=%llx "
+				 "mr->lkey=%x",
+				 h_ret, e_mr, shca->ipz_hca_handle.handle,
+				 e_mr->ipz_mr_handle.handle,
+				 e_mr->ib.ib_mr.lkey);
+			ret = ehca2ib_return_code(h_ret);
+			goto ehca_rereg_mr_exit0;
+		}
+		/* clean ehca_mr_t, without changing struct ib_mr and lock */
+		save_mr = *e_mr;
+		ehca_mr_deletenew(e_mr);
+
+		/* set some MR values */
+		e_mr->flags = save_mr.flags;
+		e_mr->hwpage_size = save_mr.hwpage_size;
+		e_mr->fmr_page_size = save_mr.fmr_page_size;
+		e_mr->fmr_max_pages = save_mr.fmr_max_pages;
+		e_mr->fmr_max_maps = save_mr.fmr_max_maps;
+		e_mr->fmr_map_cnt = save_mr.fmr_map_cnt;
+
+		ret = ehca_reg_mr(shca, e_mr, iova_start, size, acl,
+				  e_pd, pginfo, lkey, rkey, EHCA_REG_MR);
+		if (ret) {
+			u32 offset = (u64)(&e_mr->flags) - (u64)e_mr;
+			memcpy(&e_mr->flags, &(save_mr.flags),
+			       sizeof(struct ehca_mr) - offset);
+			goto ehca_rereg_mr_exit0;
+		}
+	}
+
+ehca_rereg_mr_exit0:
+	if (ret)
+		ehca_err(&shca->ib_device, "ret=%i shca=%p e_mr=%p "
+			 "iova_start=%p size=%llx acl=%x e_pd=%p pginfo=%p "
+			 "num_kpages=%llx lkey=%x rkey=%x rereg_1_hcall=%x "
+			 "rereg_3_hcall=%x", ret, shca, e_mr, iova_start, size,
+			 acl, e_pd, pginfo, pginfo->num_kpages, *lkey, *rkey,
+			 rereg_1_hcall, rereg_3_hcall);
+	return ret;
+} /* end ehca_rereg_mr() */
+
+/*----------------------------------------------------------------------*/
+
+int ehca_unmap_one_fmr(struct ehca_shca *shca,
+		       struct ehca_mr *e_fmr)
+{
+	int ret = 0;
+	u64 h_ret;
+	struct ehca_pd *e_pd =
+		container_of(e_fmr->ib.ib_fmr.pd, struct ehca_pd, ib_pd);
+	struct ehca_mr save_fmr;
+	u32 tmp_lkey, tmp_rkey;
+	struct ehca_mr_pginfo pginfo;
+	struct ehca_mr_hipzout_parms hipzout;
+	struct ehca_mr save_mr;
+
+	if (e_fmr->fmr_max_pages <= MAX_RPAGES) {
+		/*
+		 * note: after using rereg hcall with len=0,
+		 * rereg hcall must be used again for registering pages
+		 */
+		h_ret = hipz_h_reregister_pmr(shca->ipz_hca_handle, e_fmr, 0,
+					      0, 0, e_pd->fw_pd, 0, &hipzout);
+		if (h_ret == H_SUCCESS) {
+			/* successful reregistration */
+			e_fmr->start = NULL;
+			e_fmr->size = 0;
+			tmp_lkey = hipzout.lkey;
+			tmp_rkey = hipzout.rkey;
+			return 0;
+		}
+		/*
+		 * should not happen, because length checked above,
+		 * FMRs are not shared and no MW bound to FMRs
+		 */
+		ehca_err(&shca->ib_device, "hipz_reregister_pmr failed "
+			 "(Rereg1), h_ret=%lli e_fmr=%p hca_hndl=%llx "
+			 "mr_hndl=%llx lkey=%x lkey_out=%x",
+			 h_ret, e_fmr, shca->ipz_hca_handle.handle,
+			 e_fmr->ipz_mr_handle.handle,
+			 e_fmr->ib.ib_fmr.lkey, hipzout.lkey);
+		/* try free and rereg */
+	}
+
+	/* first free old FMR */
+	h_ret = hipz_h_free_resource_mr(shca->ipz_hca_handle, e_fmr);
+	if (h_ret != H_SUCCESS) {
+		ehca_err(&shca->ib_device, "hipz_free_mr failed, "
+			 "h_ret=%lli e_fmr=%p hca_hndl=%llx mr_hndl=%llx "
+			 "lkey=%x",
+			 h_ret, e_fmr, shca->ipz_hca_handle.handle,
+			 e_fmr->ipz_mr_handle.handle,
+			 e_fmr->ib.ib_fmr.lkey);
+		ret = ehca2ib_return_code(h_ret);
+		goto ehca_unmap_one_fmr_exit0;
+	}
+	/* clean ehca_mr_t, without changing lock */
+	save_fmr = *e_fmr;
+	ehca_mr_deletenew(e_fmr);
+
+	/* set some MR values */
+	e_fmr->flags = save_fmr.flags;
+	e_fmr->hwpage_size = save_fmr.hwpage_size;
+	e_fmr->fmr_page_size = save_fmr.fmr_page_size;
+	e_fmr->fmr_max_pages = save_fmr.fmr_max_pages;
+	e_fmr->fmr_max_maps = save_fmr.fmr_max_maps;
+	e_fmr->fmr_map_cnt = save_fmr.fmr_map_cnt;
+	e_fmr->acl = save_fmr.acl;
+
+	memset(&pginfo, 0, sizeof(pginfo));
+	pginfo.type = EHCA_MR_PGI_FMR;
+	ret = ehca_reg_mr(shca, e_fmr, NULL,
+			  (e_fmr->fmr_max_pages * e_fmr->fmr_page_size),
+			  e_fmr->acl, e_pd, &pginfo, &tmp_lkey,
+			  &tmp_rkey, EHCA_REG_MR);
+	if (ret) {
+		u32 offset = (u64)(&e_fmr->flags) - (u64)e_fmr;
+		memcpy(&e_fmr->flags, &(save_mr.flags),
+		       sizeof(struct ehca_mr) - offset);
+	}
+
+ehca_unmap_one_fmr_exit0:
+	if (ret)
+		ehca_err(&shca->ib_device, "ret=%i tmp_lkey=%x tmp_rkey=%x "
+			 "fmr_max_pages=%x",
+			 ret, tmp_lkey, tmp_rkey, e_fmr->fmr_max_pages);
+	return ret;
+} /* end ehca_unmap_one_fmr() */
+
+/*----------------------------------------------------------------------*/
+
+int ehca_reg_smr(struct ehca_shca *shca,
+		 struct ehca_mr *e_origmr,
+		 struct ehca_mr *e_newmr,
+		 u64 *iova_start,
+		 int acl,
+		 struct ehca_pd *e_pd,
+		 u32 *lkey, /*OUT*/
+		 u32 *rkey) /*OUT*/
+{
+	int ret = 0;
+	u64 h_ret;
+	u32 hipz_acl;
+	struct ehca_mr_hipzout_parms hipzout;
+
+	ehca_mrmw_map_acl(acl, &hipz_acl);
+	ehca_mrmw_set_pgsize_hipz_acl(e_origmr->hwpage_size, &hipz_acl);
+
+	h_ret = hipz_h_register_smr(shca->ipz_hca_handle, e_newmr, e_origmr,
+				    (u64)iova_start, hipz_acl, e_pd->fw_pd,
+				    &hipzout);
+	if (h_ret != H_SUCCESS) {
+		ehca_err(&shca->ib_device, "hipz_reg_smr failed, h_ret=%lli "
+			 "shca=%p e_origmr=%p e_newmr=%p iova_start=%p acl=%x "
+			 "e_pd=%p hca_hndl=%llx mr_hndl=%llx lkey=%x",
+			 h_ret, shca, e_origmr, e_newmr, iova_start, acl, e_pd,
+			 shca->ipz_hca_handle.handle,
+			 e_origmr->ipz_mr_handle.handle,
+			 e_origmr->ib.ib_mr.lkey);
+		ret = ehca2ib_return_code(h_ret);
+		goto ehca_reg_smr_exit0;
+	}
+	/* successful registration */
+	e_newmr->num_kpages = e_origmr->num_kpages;
+	e_newmr->num_hwpages = e_origmr->num_hwpages;
+	e_newmr->hwpage_size   = e_origmr->hwpage_size;
+	e_newmr->start = iova_start;
+	e_newmr->size = e_origmr->size;
+	e_newmr->acl = acl;
+	e_newmr->ipz_mr_handle = hipzout.handle;
+	*lkey = hipzout.lkey;
+	*rkey = hipzout.rkey;
+	return 0;
+
+ehca_reg_smr_exit0:
+	if (ret)
+		ehca_err(&shca->ib_device, "ret=%i shca=%p e_origmr=%p "
+			 "e_newmr=%p iova_start=%p acl=%x e_pd=%p",
+			 ret, shca, e_origmr, e_newmr, iova_start, acl, e_pd);
+	return ret;
+} /* end ehca_reg_smr() */
+
+/*----------------------------------------------------------------------*/
+static inline void *ehca_calc_sectbase(int top, int dir, int idx)
+{
+	unsigned long ret = idx;
+	ret |= dir << EHCA_DIR_INDEX_SHIFT;
+	ret |= top << EHCA_TOP_INDEX_SHIFT;
+	return __va(ret << SECTION_SIZE_BITS);
+}
+
+#define ehca_bmap_valid(entry) \
+	((u64)entry != (u64)EHCA_INVAL_ADDR)
+
+static u64 ehca_reg_mr_section(int top, int dir, int idx, u64 *kpage,
+			       struct ehca_shca *shca, struct ehca_mr *mr,
+			       struct ehca_mr_pginfo *pginfo)
+{
+	u64 h_ret = 0;
+	unsigned long page = 0;
+	u64 rpage = __pa(kpage);
+	int page_count;
+
+	void *sectbase = ehca_calc_sectbase(top, dir, idx);
+	if ((unsigned long)sectbase & (pginfo->hwpage_size - 1)) {
+		ehca_err(&shca->ib_device, "reg_mr_section will probably fail:"
+					   "hwpage_size does not fit to "
+					   "section start address");
+	}
+	page_count = EHCA_SECTSIZE / pginfo->hwpage_size;
+
+	while (page < page_count) {
+		u64 rnum;
+		for (rnum = 0; (rnum < MAX_RPAGES) && (page < page_count);
+		     rnum++) {
+			void *pg = sectbase + ((page++) * pginfo->hwpage_size);
+			kpage[rnum] = __pa(pg);
+		}
+
+		h_ret = hipz_h_register_rpage_mr(shca->ipz_hca_handle, mr,
+			ehca_encode_hwpage_size(pginfo->hwpage_size),
+			0, rpage, rnum);
+
+		if ((h_ret != H_SUCCESS) && (h_ret != H_PAGE_REGISTERED)) {
+			ehca_err(&shca->ib_device, "register_rpage_mr failed");
+			return h_ret;
+		}
+	}
+	return h_ret;
+}
+
+static u64 ehca_reg_mr_sections(int top, int dir, u64 *kpage,
+				struct ehca_shca *shca, struct ehca_mr *mr,
+				struct ehca_mr_pginfo *pginfo)
+{
+	u64 hret = H_SUCCESS;
+	int idx;
+
+	for (idx = 0; idx < EHCA_MAP_ENTRIES; idx++) {
+		if (!ehca_bmap_valid(ehca_bmap->top[top]->dir[dir]->ent[idx]))
+			continue;
+
+		hret = ehca_reg_mr_section(top, dir, idx, kpage, shca, mr,
+					   pginfo);
+		if ((hret != H_SUCCESS) && (hret != H_PAGE_REGISTERED))
+				return hret;
+	}
+	return hret;
+}
+
+static u64 ehca_reg_mr_dir_sections(int top, u64 *kpage, struct ehca_shca *shca,
+				    struct ehca_mr *mr,
+				    struct ehca_mr_pginfo *pginfo)
+{
+	u64 hret = H_SUCCESS;
+	int dir;
+
+	for (dir = 0; dir < EHCA_MAP_ENTRIES; dir++) {
+		if (!ehca_bmap_valid(ehca_bmap->top[top]->dir[dir]))
+			continue;
+
+		hret = ehca_reg_mr_sections(top, dir, kpage, shca, mr, pginfo);
+		if ((hret != H_SUCCESS) && (hret != H_PAGE_REGISTERED))
+				return hret;
+	}
+	return hret;
+}
+
+/* register internal max-MR to internal SHCA */
+int ehca_reg_internal_maxmr(
+	struct ehca_shca *shca,
+	struct ehca_pd *e_pd,
+	struct ehca_mr **e_maxmr)  /*OUT*/
+{
+	int ret;
+	struct ehca_mr *e_mr;
+	u64 *iova_start;
+	u64 size_maxmr;
+	struct ehca_mr_pginfo pginfo;
+	struct ib_phys_buf ib_pbuf;
+	u32 num_kpages;
+	u32 num_hwpages;
+	u64 hw_pgsize;
+
+	if (!ehca_bmap) {
+		ret = -EFAULT;
+		goto ehca_reg_internal_maxmr_exit0;
+	}
+
+	e_mr = ehca_mr_new();
+	if (!e_mr) {
+		ehca_err(&shca->ib_device, "out of memory");
+		ret = -ENOMEM;
+		goto ehca_reg_internal_maxmr_exit0;
+	}
+	e_mr->flags |= EHCA_MR_FLAG_MAXMR;
+
+	/* register internal max-MR on HCA */
+	size_maxmr = ehca_mr_len;
+	iova_start = (u64 *)ehca_map_vaddr((void *)(KERNELBASE + PHYSICAL_START));
+	ib_pbuf.addr = 0;
+	ib_pbuf.size = size_maxmr;
+	num_kpages = NUM_CHUNKS(((u64)iova_start % PAGE_SIZE) + size_maxmr,
+				PAGE_SIZE);
+	hw_pgsize = ehca_get_max_hwpage_size(shca);
+	num_hwpages = NUM_CHUNKS(((u64)iova_start % hw_pgsize) + size_maxmr,
+				 hw_pgsize);
+
+	memset(&pginfo, 0, sizeof(pginfo));
+	pginfo.type = EHCA_MR_PGI_PHYS;
+	pginfo.num_kpages = num_kpages;
+	pginfo.num_hwpages = num_hwpages;
+	pginfo.hwpage_size = hw_pgsize;
+	pginfo.u.phy.num_phys_buf = 1;
+	pginfo.u.phy.phys_buf_array = &ib_pbuf;
+
+	ret = ehca_reg_mr(shca, e_mr, iova_start, size_maxmr, 0, e_pd,
+			  &pginfo, &e_mr->ib.ib_mr.lkey,
+			  &e_mr->ib.ib_mr.rkey, EHCA_REG_BUSMAP_MR);
+	if (ret) {
+		ehca_err(&shca->ib_device, "reg of internal max MR failed, "
+			 "e_mr=%p iova_start=%p size_maxmr=%llx num_kpages=%x "
+			 "num_hwpages=%x", e_mr, iova_start, size_maxmr,
+			 num_kpages, num_hwpages);
+		goto ehca_reg_internal_maxmr_exit1;
+	}
+
+	/* successful registration of all pages */
+	e_mr->ib.ib_mr.device = e_pd->ib_pd.device;
+	e_mr->ib.ib_mr.pd = &e_pd->ib_pd;
+	e_mr->ib.ib_mr.uobject = NULL;
+	atomic_inc(&(e_pd->ib_pd.usecnt));
+	atomic_set(&(e_mr->ib.ib_mr.usecnt), 0);
+	*e_maxmr = e_mr;
+	return 0;
+
+ehca_reg_internal_maxmr_exit1:
+	ehca_mr_delete(e_mr);
+ehca_reg_internal_maxmr_exit0:
+	if (ret)
+		ehca_err(&shca->ib_device, "ret=%i shca=%p e_pd=%p e_maxmr=%p",
+			 ret, shca, e_pd, e_maxmr);
+	return ret;
+} /* end ehca_reg_internal_maxmr() */
+
+/*----------------------------------------------------------------------*/
+
+int ehca_reg_maxmr(struct ehca_shca *shca,
+		   struct ehca_mr *e_newmr,
+		   u64 *iova_start,
+		   int acl,
+		   struct ehca_pd *e_pd,
+		   u32 *lkey,
+		   u32 *rkey)
+{
+	u64 h_ret;
+	struct ehca_mr *e_origmr = shca->maxmr;
+	u32 hipz_acl;
+	struct ehca_mr_hipzout_parms hipzout;
+
+	ehca_mrmw_map_acl(acl, &hipz_acl);
+	ehca_mrmw_set_pgsize_hipz_acl(e_origmr->hwpage_size, &hipz_acl);
+
+	h_ret = hipz_h_register_smr(shca->ipz_hca_handle, e_newmr, e_origmr,
+				    (u64)iova_start, hipz_acl, e_pd->fw_pd,
+				    &hipzout);
+	if (h_ret != H_SUCCESS) {
+		ehca_err(&shca->ib_device, "hipz_reg_smr failed, h_ret=%lli "
+			 "e_origmr=%p hca_hndl=%llx mr_hndl=%llx lkey=%x",
+			 h_ret, e_origmr, shca->ipz_hca_handle.handle,
+			 e_origmr->ipz_mr_handle.handle,
+			 e_origmr->ib.ib_mr.lkey);
+		return ehca2ib_return_code(h_ret);
+	}
+	/* successful registration */
+	e_newmr->num_kpages = e_origmr->num_kpages;
+	e_newmr->num_hwpages = e_origmr->num_hwpages;
+	e_newmr->hwpage_size = e_origmr->hwpage_size;
+	e_newmr->start = iova_start;
+	e_newmr->size = e_origmr->size;
+	e_newmr->acl = acl;
+	e_newmr->ipz_mr_handle = hipzout.handle;
+	*lkey = hipzout.lkey;
+	*rkey = hipzout.rkey;
+	return 0;
+} /* end ehca_reg_maxmr() */
+
+/*----------------------------------------------------------------------*/
+
+int ehca_dereg_internal_maxmr(struct ehca_shca *shca)
+{
+	int ret;
+	struct ehca_mr *e_maxmr;
+	struct ib_pd *ib_pd;
+
+	if (!shca->maxmr) {
+		ehca_err(&shca->ib_device, "bad call, shca=%p", shca);
+		ret = -EINVAL;
+		goto ehca_dereg_internal_maxmr_exit0;
+	}
+
+	e_maxmr = shca->maxmr;
+	ib_pd = e_maxmr->ib.ib_mr.pd;
+	shca->maxmr = NULL; /* remove internal max-MR indication from SHCA */
+
+	ret = ehca_dereg_mr(&e_maxmr->ib.ib_mr);
+	if (ret) {
+		ehca_err(&shca->ib_device, "dereg internal max-MR failed, "
+			 "ret=%i e_maxmr=%p shca=%p lkey=%x",
+			 ret, e_maxmr, shca, e_maxmr->ib.ib_mr.lkey);
+		shca->maxmr = e_maxmr;
+		goto ehca_dereg_internal_maxmr_exit0;
+	}
+
+	atomic_dec(&ib_pd->usecnt);
+
+ehca_dereg_internal_maxmr_exit0:
+	if (ret)
+		ehca_err(&shca->ib_device, "ret=%i shca=%p shca->maxmr=%p",
+			 ret, shca, shca->maxmr);
+	return ret;
+} /* end ehca_dereg_internal_maxmr() */
+
+/*----------------------------------------------------------------------*/
+
+/*
+ * check physical buffer array of MR verbs for validness and
+ * calculates MR size
+ */
+int ehca_mr_chk_buf_and_calc_size(struct ib_phys_buf *phys_buf_array,
+				  int num_phys_buf,
+				  u64 *iova_start,
+				  u64 *size)
+{
+	struct ib_phys_buf *pbuf = phys_buf_array;
+	u64 size_count = 0;
+	u32 i;
+
+	if (num_phys_buf == 0) {
+		ehca_gen_err("bad phys buf array len, num_phys_buf=0");
+		return -EINVAL;
+	}
+	/* check first buffer */
+	if (((u64)iova_start & ~PAGE_MASK) != (pbuf->addr & ~PAGE_MASK)) {
+		ehca_gen_err("iova_start/addr mismatch, iova_start=%p "
+			     "pbuf->addr=%llx pbuf->size=%llx",
+			     iova_start, pbuf->addr, pbuf->size);
+		return -EINVAL;
+	}
+	if (((pbuf->addr + pbuf->size) % PAGE_SIZE) &&
+	    (num_phys_buf > 1)) {
+		ehca_gen_err("addr/size mismatch in 1st buf, pbuf->addr=%llx "
+			     "pbuf->size=%llx", pbuf->addr, pbuf->size);
+		return -EINVAL;
+	}
+
+	for (i = 0; i < num_phys_buf; i++) {
+		if ((i > 0) && (pbuf->addr % PAGE_SIZE)) {
+			ehca_gen_err("bad address, i=%x pbuf->addr=%llx "
+				     "pbuf->size=%llx",
+				     i, pbuf->addr, pbuf->size);
+			return -EINVAL;
+		}
+		if (((i > 0) &&	/* not 1st */
+		     (i < (num_phys_buf - 1)) &&	/* not last */
+		     (pbuf->size % PAGE_SIZE)) || (pbuf->size == 0)) {
+			ehca_gen_err("bad size, i=%x pbuf->size=%llx",
+				     i, pbuf->size);
+			return -EINVAL;
+		}
+		size_count += pbuf->size;
+		pbuf++;
+	}
+
+	*size = size_count;
+	return 0;
+} /* end ehca_mr_chk_buf_and_calc_size() */
+
+/*----------------------------------------------------------------------*/
+
+/* check page list of map FMR verb for validness */
+int ehca_fmr_check_page_list(struct ehca_mr *e_fmr,
+			     u64 *page_list,
+			     int list_len)
+{
+	u32 i;
+	u64 *page;
+
+	if ((list_len == 0) || (list_len > e_fmr->fmr_max_pages)) {
+		ehca_gen_err("bad list_len, list_len=%x "
+			     "e_fmr->fmr_max_pages=%x fmr=%p",
+			     list_len, e_fmr->fmr_max_pages, e_fmr);
+		return -EINVAL;
+	}
+
+	/* each page must be aligned */
+	page = page_list;
+	for (i = 0; i < list_len; i++) {
+		if (*page % e_fmr->fmr_page_size) {
+			ehca_gen_err("bad page, i=%x *page=%llx page=%p fmr=%p "
+				     "fmr_page_size=%x", i, *page, page, e_fmr,
+				     e_fmr->fmr_page_size);
+			return -EINVAL;
+		}
+		page++;
+	}
+
+	return 0;
+} /* end ehca_fmr_check_page_list() */
+
+/*----------------------------------------------------------------------*/
+
+/* PAGE_SIZE >= pginfo->hwpage_size */
+static int ehca_set_pagebuf_user1(struct ehca_mr_pginfo *pginfo,
+				  u32 number,
+				  u64 *kpage)
+{
+	int ret = 0;
+	u64 pgaddr;
+	u32 j = 0;
+	int hwpages_per_kpage = PAGE_SIZE / pginfo->hwpage_size;
+	struct scatterlist **sg = &pginfo->u.usr.next_sg;
+
+	while (*sg != NULL) {
+		pgaddr = page_to_pfn(sg_page(*sg))
+			<< PAGE_SHIFT;
+		*kpage = pgaddr + (pginfo->next_hwpage *
+				   pginfo->hwpage_size);
+		if (!(*kpage)) {
+			ehca_gen_err("pgaddr=%llx "
+				     "sg_dma_address=%llx "
+				     "entry=%llx next_hwpage=%llx",
+				     pgaddr, (u64)sg_dma_address(*sg),
+				     pginfo->u.usr.next_nmap,
+				     pginfo->next_hwpage);
+			return -EFAULT;
+		}
+		(pginfo->hwpage_cnt)++;
+		(pginfo->next_hwpage)++;
+		kpage++;
+		if (pginfo->next_hwpage % hwpages_per_kpage == 0) {
+			(pginfo->kpage_cnt)++;
+			(pginfo->u.usr.next_nmap)++;
+			pginfo->next_hwpage = 0;
+			*sg = sg_next(*sg);
+		}
+		j++;
+		if (j >= number)
+			break;
+	}
+
+	return ret;
+}
+
+/*
+ * check given pages for contiguous layout
+ * last page addr is returned in prev_pgaddr for further check
+ */
+static int ehca_check_kpages_per_ate(struct scatterlist **sg,
+				     int num_pages,
+				     u64 *prev_pgaddr)
+{
+	for (; *sg && num_pages > 0; *sg = sg_next(*sg), num_pages--) {
+		u64 pgaddr = page_to_pfn(sg_page(*sg)) << PAGE_SHIFT;
+		if (ehca_debug_level >= 3)
+			ehca_gen_dbg("chunk_page=%llx value=%016llx", pgaddr,
+				     *(u64 *)__va(pgaddr));
+		if (pgaddr - PAGE_SIZE != *prev_pgaddr) {
+			ehca_gen_err("uncontiguous page found pgaddr=%llx "
+				     "prev_pgaddr=%llx entries_left_in_hwpage=%x",
+				     pgaddr, *prev_pgaddr, num_pages);
+			return -EINVAL;
+		}
+		*prev_pgaddr = pgaddr;
+	}
+	return 0;
+}
+
+/* PAGE_SIZE < pginfo->hwpage_size */
+static int ehca_set_pagebuf_user2(struct ehca_mr_pginfo *pginfo,
+				  u32 number,
+				  u64 *kpage)
+{
+	int ret = 0;
+	u64 pgaddr, prev_pgaddr;
+	u32 j = 0;
+	int kpages_per_hwpage = pginfo->hwpage_size / PAGE_SIZE;
+	int nr_kpages = kpages_per_hwpage;
+	struct scatterlist **sg = &pginfo->u.usr.next_sg;
+
+	while (*sg != NULL) {
+
+		if (nr_kpages == kpages_per_hwpage) {
+			pgaddr = (page_to_pfn(sg_page(*sg))
+				   << PAGE_SHIFT);
+			*kpage = pgaddr;
+			if (!(*kpage)) {
+				ehca_gen_err("pgaddr=%llx entry=%llx",
+					     pgaddr, pginfo->u.usr.next_nmap);
+				ret = -EFAULT;
+				return ret;
+			}
+			/*
+			 * The first page in a hwpage must be aligned;
+			 * the first MR page is exempt from this rule.
+			 */
+			if (pgaddr & (pginfo->hwpage_size - 1)) {
+				if (pginfo->hwpage_cnt) {
+					ehca_gen_err(
+						"invalid alignment "
+						"pgaddr=%llx entry=%llx "
+						"mr_pgsize=%llx",
+						pgaddr, pginfo->u.usr.next_nmap,
+						pginfo->hwpage_size);
+					ret = -EFAULT;
+					return ret;
+				}
+				/* first MR page */
+				pginfo->kpage_cnt =
+					(pgaddr &
+					 (pginfo->hwpage_size - 1)) >>
+					PAGE_SHIFT;
+				nr_kpages -= pginfo->kpage_cnt;
+				*kpage = pgaddr &
+					 ~(pginfo->hwpage_size - 1);
+			}
+			if (ehca_debug_level >= 3) {
+				u64 val = *(u64 *)__va(pgaddr);
+				ehca_gen_dbg("kpage=%llx page=%llx "
+					     "value=%016llx",
+					     *kpage, pgaddr, val);
+			}
+			prev_pgaddr = pgaddr;
+			*sg = sg_next(*sg);
+			pginfo->kpage_cnt++;
+			pginfo->u.usr.next_nmap++;
+			nr_kpages--;
+			if (!nr_kpages)
+				goto next_kpage;
+			continue;
+		}
+
+		ret = ehca_check_kpages_per_ate(sg, nr_kpages,
+						&prev_pgaddr);
+		if (ret)
+			return ret;
+		pginfo->kpage_cnt += nr_kpages;
+		pginfo->u.usr.next_nmap += nr_kpages;
+
+next_kpage:
+		nr_kpages = kpages_per_hwpage;
+		(pginfo->hwpage_cnt)++;
+		kpage++;
+		j++;
+		if (j >= number)
+			break;
+	}
+
+	return ret;
+}
+
+static int ehca_set_pagebuf_phys(struct ehca_mr_pginfo *pginfo,
+				 u32 number, u64 *kpage)
+{
+	int ret = 0;
+	struct ib_phys_buf *pbuf;
+	u64 num_hw, offs_hw;
+	u32 i = 0;
+
+	/* loop over desired phys_buf_array entries */
+	while (i < number) {
+		pbuf   = pginfo->u.phy.phys_buf_array + pginfo->u.phy.next_buf;
+		num_hw  = NUM_CHUNKS((pbuf->addr % pginfo->hwpage_size) +
+				     pbuf->size, pginfo->hwpage_size);
+		offs_hw = (pbuf->addr & ~(pginfo->hwpage_size - 1)) /
+			pginfo->hwpage_size;
+		while (pginfo->next_hwpage < offs_hw + num_hw) {
+			/* sanity check */
+			if ((pginfo->kpage_cnt >= pginfo->num_kpages) ||
+			    (pginfo->hwpage_cnt >= pginfo->num_hwpages)) {
+				ehca_gen_err("kpage_cnt >= num_kpages, "
+					     "kpage_cnt=%llx num_kpages=%llx "
+					     "hwpage_cnt=%llx "
+					     "num_hwpages=%llx i=%x",
+					     pginfo->kpage_cnt,
+					     pginfo->num_kpages,
+					     pginfo->hwpage_cnt,
+					     pginfo->num_hwpages, i);
+				return -EFAULT;
+			}
+			*kpage = (pbuf->addr & ~(pginfo->hwpage_size - 1)) +
+				 (pginfo->next_hwpage * pginfo->hwpage_size);
+			if ( !(*kpage) && pbuf->addr ) {
+				ehca_gen_err("pbuf->addr=%llx pbuf->size=%llx "
+					     "next_hwpage=%llx", pbuf->addr,
+					     pbuf->size, pginfo->next_hwpage);
+				return -EFAULT;
+			}
+			(pginfo->hwpage_cnt)++;
+			(pginfo->next_hwpage)++;
+			if (PAGE_SIZE >= pginfo->hwpage_size) {
+				if (pginfo->next_hwpage %
+				    (PAGE_SIZE / pginfo->hwpage_size) == 0)
+					(pginfo->kpage_cnt)++;
+			} else
+				pginfo->kpage_cnt += pginfo->hwpage_size /
+					PAGE_SIZE;
+			kpage++;
+			i++;
+			if (i >= number) break;
+		}
+		if (pginfo->next_hwpage >= offs_hw + num_hw) {
+			(pginfo->u.phy.next_buf)++;
+			pginfo->next_hwpage = 0;
+		}
+	}
+	return ret;
+}
+
+static int ehca_set_pagebuf_fmr(struct ehca_mr_pginfo *pginfo,
+				u32 number, u64 *kpage)
+{
+	int ret = 0;
+	u64 *fmrlist;
+	u32 i;
+
+	/* loop over desired page_list entries */
+	fmrlist = pginfo->u.fmr.page_list + pginfo->u.fmr.next_listelem;
+	for (i = 0; i < number; i++) {
+		*kpage = (*fmrlist & ~(pginfo->hwpage_size - 1)) +
+			   pginfo->next_hwpage * pginfo->hwpage_size;
+		if ( !(*kpage) ) {
+			ehca_gen_err("*fmrlist=%llx fmrlist=%p "
+				     "next_listelem=%llx next_hwpage=%llx",
+				     *fmrlist, fmrlist,
+				     pginfo->u.fmr.next_listelem,
+				     pginfo->next_hwpage);
+			return -EFAULT;
+		}
+		(pginfo->hwpage_cnt)++;
+		if (pginfo->u.fmr.fmr_pgsize >= pginfo->hwpage_size) {
+			if (pginfo->next_hwpage %
+			    (pginfo->u.fmr.fmr_pgsize /
+			     pginfo->hwpage_size) == 0) {
+				(pginfo->kpage_cnt)++;
+				(pginfo->u.fmr.next_listelem)++;
+				fmrlist++;
+				pginfo->next_hwpage = 0;
+			} else
+				(pginfo->next_hwpage)++;
+		} else {
+			unsigned int cnt_per_hwpage = pginfo->hwpage_size /
+				pginfo->u.fmr.fmr_pgsize;
+			unsigned int j;
+			u64 prev = *kpage;
+			/* check if adrs are contiguous */
+			for (j = 1; j < cnt_per_hwpage; j++) {
+				u64 p = fmrlist[j] & ~(pginfo->hwpage_size - 1);
+				if (prev + pginfo->u.fmr.fmr_pgsize != p) {
+					ehca_gen_err("uncontiguous fmr pages "
+						     "found prev=%llx p=%llx "
+						     "idx=%x", prev, p, i + j);
+					return -EINVAL;
+				}
+				prev = p;
+			}
+			pginfo->kpage_cnt += cnt_per_hwpage;
+			pginfo->u.fmr.next_listelem += cnt_per_hwpage;
+			fmrlist += cnt_per_hwpage;
+		}
+		kpage++;
+	}
+	return ret;
+}
+
+/* setup page buffer from page info */
+int ehca_set_pagebuf(struct ehca_mr_pginfo *pginfo,
+		     u32 number,
+		     u64 *kpage)
+{
+	int ret;
+
+	switch (pginfo->type) {
+	case EHCA_MR_PGI_PHYS:
+		ret = ehca_set_pagebuf_phys(pginfo, number, kpage);
+		break;
+	case EHCA_MR_PGI_USER:
+		ret = PAGE_SIZE >= pginfo->hwpage_size ?
+			ehca_set_pagebuf_user1(pginfo, number, kpage) :
+			ehca_set_pagebuf_user2(pginfo, number, kpage);
+		break;
+	case EHCA_MR_PGI_FMR:
+		ret = ehca_set_pagebuf_fmr(pginfo, number, kpage);
+		break;
+	default:
+		ehca_gen_err("bad pginfo->type=%x", pginfo->type);
+		ret = -EFAULT;
+		break;
+	}
+	return ret;
+} /* end ehca_set_pagebuf() */
+
+/*----------------------------------------------------------------------*/
+
+/*
+ * check MR if it is a max-MR, i.e. uses whole memory
+ * in case it's a max-MR 1 is returned, else 0
+ */
+int ehca_mr_is_maxmr(u64 size,
+		     u64 *iova_start)
+{
+	/* a MR is treated as max-MR only if it fits following: */
+	if ((size == ehca_mr_len) &&
+	    (iova_start == (void *)ehca_map_vaddr((void *)(KERNELBASE + PHYSICAL_START)))) {
+		ehca_gen_dbg("this is a max-MR");
+		return 1;
+	} else
+		return 0;
+} /* end ehca_mr_is_maxmr() */
+
+/*----------------------------------------------------------------------*/
+
+/* map access control for MR/MW. This routine is used for MR and MW. */
+void ehca_mrmw_map_acl(int ib_acl,
+		       u32 *hipz_acl)
+{
+	*hipz_acl = 0;
+	if (ib_acl & IB_ACCESS_REMOTE_READ)
+		*hipz_acl |= HIPZ_ACCESSCTRL_R_READ;
+	if (ib_acl & IB_ACCESS_REMOTE_WRITE)
+		*hipz_acl |= HIPZ_ACCESSCTRL_R_WRITE;
+	if (ib_acl & IB_ACCESS_REMOTE_ATOMIC)
+		*hipz_acl |= HIPZ_ACCESSCTRL_R_ATOMIC;
+	if (ib_acl & IB_ACCESS_LOCAL_WRITE)
+		*hipz_acl |= HIPZ_ACCESSCTRL_L_WRITE;
+	if (ib_acl & IB_ACCESS_MW_BIND)
+		*hipz_acl |= HIPZ_ACCESSCTRL_MW_BIND;
+} /* end ehca_mrmw_map_acl() */
+
+/*----------------------------------------------------------------------*/
+
+/* sets page size in hipz access control for MR/MW. */
+void ehca_mrmw_set_pgsize_hipz_acl(u32 pgsize, u32 *hipz_acl) /*INOUT*/
+{
+	*hipz_acl |= (ehca_encode_hwpage_size(pgsize) << 24);
+} /* end ehca_mrmw_set_pgsize_hipz_acl() */
+
+/*----------------------------------------------------------------------*/
+
+/*
+ * reverse map access control for MR/MW.
+ * This routine is used for MR and MW.
+ */
+void ehca_mrmw_reverse_map_acl(const u32 *hipz_acl,
+			       int *ib_acl) /*OUT*/
+{
+	*ib_acl = 0;
+	if (*hipz_acl & HIPZ_ACCESSCTRL_R_READ)
+		*ib_acl |= IB_ACCESS_REMOTE_READ;
+	if (*hipz_acl & HIPZ_ACCESSCTRL_R_WRITE)
+		*ib_acl |= IB_ACCESS_REMOTE_WRITE;
+	if (*hipz_acl & HIPZ_ACCESSCTRL_R_ATOMIC)
+		*ib_acl |= IB_ACCESS_REMOTE_ATOMIC;
+	if (*hipz_acl & HIPZ_ACCESSCTRL_L_WRITE)
+		*ib_acl |= IB_ACCESS_LOCAL_WRITE;
+	if (*hipz_acl & HIPZ_ACCESSCTRL_MW_BIND)
+		*ib_acl |= IB_ACCESS_MW_BIND;
+} /* end ehca_mrmw_reverse_map_acl() */
+
+
+/*----------------------------------------------------------------------*/
+
+/*
+ * MR destructor and constructor
+ * used in Reregister MR verb, sets all fields in ehca_mr_t to 0,
+ * except struct ib_mr and spinlock
+ */
+void ehca_mr_deletenew(struct ehca_mr *mr)
+{
+	mr->flags = 0;
+	mr->num_kpages = 0;
+	mr->num_hwpages = 0;
+	mr->acl = 0;
+	mr->start = NULL;
+	mr->fmr_page_size = 0;
+	mr->fmr_max_pages = 0;
+	mr->fmr_max_maps = 0;
+	mr->fmr_map_cnt = 0;
+	memset(&mr->ipz_mr_handle, 0, sizeof(mr->ipz_mr_handle));
+	memset(&mr->galpas, 0, sizeof(mr->galpas));
+} /* end ehca_mr_deletenew() */
+
+int ehca_init_mrmw_cache(void)
+{
+	mr_cache = kmem_cache_create("ehca_cache_mr",
+				     sizeof(struct ehca_mr), 0,
+				     SLAB_HWCACHE_ALIGN,
+				     NULL);
+	if (!mr_cache)
+		return -ENOMEM;
+	mw_cache = kmem_cache_create("ehca_cache_mw",
+				     sizeof(struct ehca_mw), 0,
+				     SLAB_HWCACHE_ALIGN,
+				     NULL);
+	if (!mw_cache) {
+		kmem_cache_destroy(mr_cache);
+		mr_cache = NULL;
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+void ehca_cleanup_mrmw_cache(void)
+{
+	if (mr_cache)
+		kmem_cache_destroy(mr_cache);
+	if (mw_cache)
+		kmem_cache_destroy(mw_cache);
+}
+
+static inline int ehca_init_top_bmap(struct ehca_top_bmap *ehca_top_bmap,
+				     int dir)
+{
+	if (!ehca_bmap_valid(ehca_top_bmap->dir[dir])) {
+		ehca_top_bmap->dir[dir] =
+			kmalloc(sizeof(struct ehca_dir_bmap), GFP_KERNEL);
+		if (!ehca_top_bmap->dir[dir])
+			return -ENOMEM;
+		/* Set map block to 0xFF according to EHCA_INVAL_ADDR */
+		memset(ehca_top_bmap->dir[dir], 0xFF, EHCA_ENT_MAP_SIZE);
+	}
+	return 0;
+}
+
+static inline int ehca_init_bmap(struct ehca_bmap *ehca_bmap, int top, int dir)
+{
+	if (!ehca_bmap_valid(ehca_bmap->top[top])) {
+		ehca_bmap->top[top] =
+			kmalloc(sizeof(struct ehca_top_bmap), GFP_KERNEL);
+		if (!ehca_bmap->top[top])
+			return -ENOMEM;
+		/* Set map block to 0xFF according to EHCA_INVAL_ADDR */
+		memset(ehca_bmap->top[top], 0xFF, EHCA_DIR_MAP_SIZE);
+	}
+	return ehca_init_top_bmap(ehca_bmap->top[top], dir);
+}
+
+static inline int ehca_calc_index(unsigned long i, unsigned long s)
+{
+	return (i >> s) & EHCA_INDEX_MASK;
+}
+
+void ehca_destroy_busmap(void)
+{
+	int top, dir;
+
+	if (!ehca_bmap)
+		return;
+
+	for (top = 0; top < EHCA_MAP_ENTRIES; top++) {
+		if (!ehca_bmap_valid(ehca_bmap->top[top]))
+			continue;
+		for (dir = 0; dir < EHCA_MAP_ENTRIES; dir++) {
+			if (!ehca_bmap_valid(ehca_bmap->top[top]->dir[dir]))
+				continue;
+
+			kfree(ehca_bmap->top[top]->dir[dir]);
+		}
+
+		kfree(ehca_bmap->top[top]);
+	}
+
+	kfree(ehca_bmap);
+	ehca_bmap = NULL;
+}
+
+static int ehca_update_busmap(unsigned long pfn, unsigned long nr_pages)
+{
+	unsigned long i, start_section, end_section;
+	int top, dir, idx;
+
+	if (!nr_pages)
+		return 0;
+
+	if (!ehca_bmap) {
+		ehca_bmap = kmalloc(sizeof(struct ehca_bmap), GFP_KERNEL);
+		if (!ehca_bmap)
+			return -ENOMEM;
+		/* Set map block to 0xFF according to EHCA_INVAL_ADDR */
+		memset(ehca_bmap, 0xFF, EHCA_TOP_MAP_SIZE);
+	}
+
+	start_section = (pfn * PAGE_SIZE) / EHCA_SECTSIZE;
+	end_section = ((pfn + nr_pages) * PAGE_SIZE) / EHCA_SECTSIZE;
+	for (i = start_section; i < end_section; i++) {
+		int ret;
+		top = ehca_calc_index(i, EHCA_TOP_INDEX_SHIFT);
+		dir = ehca_calc_index(i, EHCA_DIR_INDEX_SHIFT);
+		idx = i & EHCA_INDEX_MASK;
+
+		ret = ehca_init_bmap(ehca_bmap, top, dir);
+		if (ret) {
+			ehca_destroy_busmap();
+			return ret;
+		}
+		ehca_bmap->top[top]->dir[dir]->ent[idx] = ehca_mr_len;
+		ehca_mr_len += EHCA_SECTSIZE;
+	}
+	return 0;
+}
+
+static int ehca_is_hugepage(unsigned long pfn)
+{
+	int page_order;
+
+	if (pfn & EHCA_HUGEPAGE_PFN_MASK)
+		return 0;
+
+	page_order = compound_order(pfn_to_page(pfn));
+	if (page_order + PAGE_SHIFT != EHCA_HUGEPAGESHIFT)
+		return 0;
+
+	return 1;
+}
+
+static int ehca_create_busmap_callback(unsigned long initial_pfn,
+				       unsigned long total_nr_pages, void *arg)
+{
+	int ret;
+	unsigned long pfn, start_pfn, end_pfn, nr_pages;
+
+	if ((total_nr_pages * PAGE_SIZE) < EHCA_HUGEPAGE_SIZE)
+		return ehca_update_busmap(initial_pfn, total_nr_pages);
+
+	/* Given chunk is >= 16GB -> check for hugepages */
+	start_pfn = initial_pfn;
+	end_pfn = initial_pfn + total_nr_pages;
+	pfn = start_pfn;
+
+	while (pfn < end_pfn) {
+		if (ehca_is_hugepage(pfn)) {
+			/* Add mem found in front of the hugepage */
+			nr_pages = pfn - start_pfn;
+			ret = ehca_update_busmap(start_pfn, nr_pages);
+			if (ret)
+				return ret;
+			/* Skip the hugepage */
+			pfn += (EHCA_HUGEPAGE_SIZE / PAGE_SIZE);
+			start_pfn = pfn;
+		} else
+			pfn += (EHCA_SECTSIZE / PAGE_SIZE);
+	}
+
+	/* Add mem found behind the hugepage(s)  */
+	nr_pages = pfn - start_pfn;
+	return ehca_update_busmap(start_pfn, nr_pages);
+}
+
+int ehca_create_busmap(void)
+{
+	int ret;
+
+	ehca_mr_len = 0;
+	ret = walk_system_ram_range(0, 1ULL << MAX_PHYSMEM_BITS, NULL,
+				   ehca_create_busmap_callback);
+	return ret;
+}
+
+static int ehca_reg_bmap_mr_rpages(struct ehca_shca *shca,
+				   struct ehca_mr *e_mr,
+				   struct ehca_mr_pginfo *pginfo)
+{
+	int top;
+	u64 hret, *kpage;
+
+	kpage = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+	if (!kpage) {
+		ehca_err(&shca->ib_device, "kpage alloc failed");
+		return -ENOMEM;
+	}
+	for (top = 0; top < EHCA_MAP_ENTRIES; top++) {
+		if (!ehca_bmap_valid(ehca_bmap->top[top]))
+			continue;
+		hret = ehca_reg_mr_dir_sections(top, kpage, shca, e_mr, pginfo);
+		if ((hret != H_PAGE_REGISTERED) && (hret != H_SUCCESS))
+			break;
+	}
+
+	ehca_free_fw_ctrlblock(kpage);
+
+	if (hret == H_SUCCESS)
+		return 0; /* Everything is fine */
+	else {
+		ehca_err(&shca->ib_device, "ehca_reg_bmap_mr_rpages failed, "
+				 "h_ret=%lli e_mr=%p top=%x lkey=%x "
+				 "hca_hndl=%llx mr_hndl=%llx", hret, e_mr, top,
+				 e_mr->ib.ib_mr.lkey,
+				 shca->ipz_hca_handle.handle,
+				 e_mr->ipz_mr_handle.handle);
+		return ehca2ib_return_code(hret);
+	}
+}
+
+static u64 ehca_map_vaddr(void *caddr)
+{
+	int top, dir, idx;
+	unsigned long abs_addr, offset;
+	u64 entry;
+
+	if (!ehca_bmap)
+		return EHCA_INVAL_ADDR;
+
+	abs_addr = __pa(caddr);
+	top = ehca_calc_index(abs_addr, EHCA_TOP_INDEX_SHIFT + EHCA_SECTSHIFT);
+	if (!ehca_bmap_valid(ehca_bmap->top[top]))
+		return EHCA_INVAL_ADDR;
+
+	dir = ehca_calc_index(abs_addr, EHCA_DIR_INDEX_SHIFT + EHCA_SECTSHIFT);
+	if (!ehca_bmap_valid(ehca_bmap->top[top]->dir[dir]))
+		return EHCA_INVAL_ADDR;
+
+	idx = ehca_calc_index(abs_addr, EHCA_SECTSHIFT);
+
+	entry = ehca_bmap->top[top]->dir[dir]->ent[idx];
+	if (ehca_bmap_valid(entry)) {
+		offset = (unsigned long)caddr & (EHCA_SECTSIZE - 1);
+		return entry | offset;
+	} else
+		return EHCA_INVAL_ADDR;
+}
+
+static int ehca_dma_mapping_error(struct ib_device *dev, u64 dma_addr)
+{
+	return dma_addr == EHCA_INVAL_ADDR;
+}
+
+static u64 ehca_dma_map_single(struct ib_device *dev, void *cpu_addr,
+			       size_t size, enum dma_data_direction direction)
+{
+	if (cpu_addr)
+		return ehca_map_vaddr(cpu_addr);
+	else
+		return EHCA_INVAL_ADDR;
+}
+
+static void ehca_dma_unmap_single(struct ib_device *dev, u64 addr, size_t size,
+				  enum dma_data_direction direction)
+{
+	/* This is only a stub; nothing to be done here */
+}
+
+static u64 ehca_dma_map_page(struct ib_device *dev, struct page *page,
+			     unsigned long offset, size_t size,
+			     enum dma_data_direction direction)
+{
+	u64 addr;
+
+	if (offset + size > PAGE_SIZE)
+		return EHCA_INVAL_ADDR;
+
+	addr = ehca_map_vaddr(page_address(page));
+	if (!ehca_dma_mapping_error(dev, addr))
+		addr += offset;
+
+	return addr;
+}
+
+static void ehca_dma_unmap_page(struct ib_device *dev, u64 addr, size_t size,
+				enum dma_data_direction direction)
+{
+	/* This is only a stub; nothing to be done here */
+}
+
+static int ehca_dma_map_sg(struct ib_device *dev, struct scatterlist *sgl,
+			   int nents, enum dma_data_direction direction)
+{
+	struct scatterlist *sg;
+	int i;
+
+	for_each_sg(sgl, sg, nents, i) {
+		u64 addr;
+		addr = ehca_map_vaddr(sg_virt(sg));
+		if (ehca_dma_mapping_error(dev, addr))
+			return 0;
+
+		sg->dma_address = addr;
+		sg->dma_length = sg->length;
+	}
+	return nents;
+}
+
+static void ehca_dma_unmap_sg(struct ib_device *dev, struct scatterlist *sg,
+			      int nents, enum dma_data_direction direction)
+{
+	/* This is only a stub; nothing to be done here */
+}
+
+static void ehca_dma_sync_single_for_cpu(struct ib_device *dev, u64 addr,
+					 size_t size,
+					 enum dma_data_direction dir)
+{
+	dma_sync_single_for_cpu(dev->dma_device, addr, size, dir);
+}
+
+static void ehca_dma_sync_single_for_device(struct ib_device *dev, u64 addr,
+					    size_t size,
+					    enum dma_data_direction dir)
+{
+	dma_sync_single_for_device(dev->dma_device, addr, size, dir);
+}
+
+static void *ehca_dma_alloc_coherent(struct ib_device *dev, size_t size,
+				     u64 *dma_handle, gfp_t flag)
+{
+	struct page *p;
+	void *addr = NULL;
+	u64 dma_addr;
+
+	p = alloc_pages(flag, get_order(size));
+	if (p) {
+		addr = page_address(p);
+		dma_addr = ehca_map_vaddr(addr);
+		if (ehca_dma_mapping_error(dev, dma_addr)) {
+			free_pages((unsigned long)addr,	get_order(size));
+			return NULL;
+		}
+		if (dma_handle)
+			*dma_handle = dma_addr;
+		return addr;
+	}
+	return NULL;
+}
+
+static void ehca_dma_free_coherent(struct ib_device *dev, size_t size,
+				   void *cpu_addr, u64 dma_handle)
+{
+	if (cpu_addr && size)
+		free_pages((unsigned long)cpu_addr, get_order(size));
+}
+
+
+struct ib_dma_mapping_ops ehca_dma_mapping_ops = {
+	.mapping_error          = ehca_dma_mapping_error,
+	.map_single             = ehca_dma_map_single,
+	.unmap_single           = ehca_dma_unmap_single,
+	.map_page               = ehca_dma_map_page,
+	.unmap_page             = ehca_dma_unmap_page,
+	.map_sg                 = ehca_dma_map_sg,
+	.unmap_sg               = ehca_dma_unmap_sg,
+	.sync_single_for_cpu    = ehca_dma_sync_single_for_cpu,
+	.sync_single_for_device = ehca_dma_sync_single_for_device,
+	.alloc_coherent         = ehca_dma_alloc_coherent,
+	.free_coherent          = ehca_dma_free_coherent,
+};
diff --git a/drivers/infiniband/hw/ehca/ehca_mrmw.h b/drivers/infiniband/hw/ehca/ehca_mrmw.h
new file mode 100644
index 0000000..50d8b51
--- /dev/null
+++ b/drivers/infiniband/hw/ehca/ehca_mrmw.h
@@ -0,0 +1,132 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  MR/MW declarations and inline functions
+ *
+ *  Authors: Dietmar Decker <ddecker@de.ibm.com>
+ *           Christoph Raisch <raisch@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _EHCA_MRMW_H_
+#define _EHCA_MRMW_H_
+
+enum ehca_reg_type {
+	EHCA_REG_MR,
+	EHCA_REG_BUSMAP_MR
+};
+
+int ehca_reg_mr(struct ehca_shca *shca,
+		struct ehca_mr *e_mr,
+		u64 *iova_start,
+		u64 size,
+		int acl,
+		struct ehca_pd *e_pd,
+		struct ehca_mr_pginfo *pginfo,
+		u32 *lkey,
+		u32 *rkey,
+		enum ehca_reg_type reg_type);
+
+int ehca_reg_mr_rpages(struct ehca_shca *shca,
+		       struct ehca_mr *e_mr,
+		       struct ehca_mr_pginfo *pginfo);
+
+int ehca_rereg_mr(struct ehca_shca *shca,
+		  struct ehca_mr *e_mr,
+		  u64 *iova_start,
+		  u64 size,
+		  int mr_access_flags,
+		  struct ehca_pd *e_pd,
+		  struct ehca_mr_pginfo *pginfo,
+		  u32 *lkey,
+		  u32 *rkey);
+
+int ehca_unmap_one_fmr(struct ehca_shca *shca,
+		       struct ehca_mr *e_fmr);
+
+int ehca_reg_smr(struct ehca_shca *shca,
+		 struct ehca_mr *e_origmr,
+		 struct ehca_mr *e_newmr,
+		 u64 *iova_start,
+		 int acl,
+		 struct ehca_pd *e_pd,
+		 u32 *lkey,
+		 u32 *rkey);
+
+int ehca_reg_internal_maxmr(struct ehca_shca *shca,
+			    struct ehca_pd *e_pd,
+			    struct ehca_mr **maxmr);
+
+int ehca_reg_maxmr(struct ehca_shca *shca,
+		   struct ehca_mr *e_newmr,
+		   u64 *iova_start,
+		   int acl,
+		   struct ehca_pd *e_pd,
+		   u32 *lkey,
+		   u32 *rkey);
+
+int ehca_dereg_internal_maxmr(struct ehca_shca *shca);
+
+int ehca_mr_chk_buf_and_calc_size(struct ib_phys_buf *phys_buf_array,
+				  int num_phys_buf,
+				  u64 *iova_start,
+				  u64 *size);
+
+int ehca_fmr_check_page_list(struct ehca_mr *e_fmr,
+			     u64 *page_list,
+			     int list_len);
+
+int ehca_set_pagebuf(struct ehca_mr_pginfo *pginfo,
+		     u32 number,
+		     u64 *kpage);
+
+int ehca_mr_is_maxmr(u64 size,
+		     u64 *iova_start);
+
+void ehca_mrmw_map_acl(int ib_acl,
+		       u32 *hipz_acl);
+
+void ehca_mrmw_set_pgsize_hipz_acl(u32 pgsize, u32 *hipz_acl);
+
+void ehca_mrmw_reverse_map_acl(const u32 *hipz_acl,
+			       int *ib_acl);
+
+void ehca_mr_deletenew(struct ehca_mr *mr);
+
+int ehca_create_busmap(void);
+
+void ehca_destroy_busmap(void);
+
+extern struct ib_dma_mapping_ops ehca_dma_mapping_ops;
+#endif  /*_EHCA_MRMW_H_*/
diff --git a/drivers/infiniband/hw/ehca/ehca_pd.c b/drivers/infiniband/hw/ehca/ehca_pd.c
new file mode 100644
index 0000000..351577a
--- /dev/null
+++ b/drivers/infiniband/hw/ehca/ehca_pd.c
@@ -0,0 +1,124 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  PD functions
+ *
+ *  Authors: Christoph Raisch <raisch@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/slab.h>
+
+#include "ehca_tools.h"
+#include "ehca_iverbs.h"
+
+static struct kmem_cache *pd_cache;
+
+struct ib_pd *ehca_alloc_pd(struct ib_device *device,
+			    struct ib_ucontext *context, struct ib_udata *udata)
+{
+	struct ehca_pd *pd;
+	int i;
+
+	pd = kmem_cache_zalloc(pd_cache, GFP_KERNEL);
+	if (!pd) {
+		ehca_err(device, "device=%p context=%p out of memory",
+			 device, context);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	for (i = 0; i < 2; i++) {
+		INIT_LIST_HEAD(&pd->free[i]);
+		INIT_LIST_HEAD(&pd->full[i]);
+	}
+	mutex_init(&pd->lock);
+
+	/*
+	 * Kernel PD: when device = -1, 0
+	 * User   PD: when context != -1
+	 */
+	if (!context) {
+		/*
+		 * Kernel PDs after init reuses always
+		 * the one created in ehca_shca_reopen()
+		 */
+		struct ehca_shca *shca = container_of(device, struct ehca_shca,
+						      ib_device);
+		pd->fw_pd.value = shca->pd->fw_pd.value;
+	} else
+		pd->fw_pd.value = (u64)pd;
+
+	return &pd->ib_pd;
+}
+
+int ehca_dealloc_pd(struct ib_pd *pd)
+{
+	struct ehca_pd *my_pd = container_of(pd, struct ehca_pd, ib_pd);
+	int i, leftovers = 0;
+	struct ipz_small_queue_page *page, *tmp;
+
+	for (i = 0; i < 2; i++) {
+		list_splice(&my_pd->full[i], &my_pd->free[i]);
+		list_for_each_entry_safe(page, tmp, &my_pd->free[i], list) {
+			leftovers = 1;
+			free_page(page->page);
+			kmem_cache_free(small_qp_cache, page);
+		}
+	}
+
+	if (leftovers)
+		ehca_warn(pd->device,
+			  "Some small queue pages were not freed");
+
+	kmem_cache_free(pd_cache, my_pd);
+
+	return 0;
+}
+
+int ehca_init_pd_cache(void)
+{
+	pd_cache = kmem_cache_create("ehca_cache_pd",
+				     sizeof(struct ehca_pd), 0,
+				     SLAB_HWCACHE_ALIGN,
+				     NULL);
+	if (!pd_cache)
+		return -ENOMEM;
+	return 0;
+}
+
+void ehca_cleanup_pd_cache(void)
+{
+	if (pd_cache)
+		kmem_cache_destroy(pd_cache);
+}
diff --git a/drivers/infiniband/hw/ehca/ehca_qes.h b/drivers/infiniband/hw/ehca/ehca_qes.h
new file mode 100644
index 0000000..90c4efa
--- /dev/null
+++ b/drivers/infiniband/hw/ehca/ehca_qes.h
@@ -0,0 +1,260 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  Hardware request structures
+ *
+ *  Authors: Waleri Fomin <fomin@de.ibm.com>
+ *           Reinhard Ernst <rernst@de.ibm.com>
+ *           Christoph Raisch <raisch@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#ifndef _EHCA_QES_H_
+#define _EHCA_QES_H_
+
+#include "ehca_tools.h"
+
+/* virtual scatter gather entry to specify remote addresses with length */
+struct ehca_vsgentry {
+	u64 vaddr;
+	u32 lkey;
+	u32 length;
+};
+
+#define GRH_FLAG_MASK        EHCA_BMASK_IBM( 7,  7)
+#define GRH_IPVERSION_MASK   EHCA_BMASK_IBM( 0,  3)
+#define GRH_TCLASS_MASK      EHCA_BMASK_IBM( 4, 12)
+#define GRH_FLOWLABEL_MASK   EHCA_BMASK_IBM(13, 31)
+#define GRH_PAYLEN_MASK      EHCA_BMASK_IBM(32, 47)
+#define GRH_NEXTHEADER_MASK  EHCA_BMASK_IBM(48, 55)
+#define GRH_HOPLIMIT_MASK    EHCA_BMASK_IBM(56, 63)
+
+/*
+ * Unreliable Datagram Address Vector Format
+ * see IBTA Vol1 chapter 8.3 Global Routing Header
+ */
+struct ehca_ud_av {
+	u8 sl;
+	u8 lnh;
+	u16 dlid;
+	u8 reserved1;
+	u8 reserved2;
+	u8 reserved3;
+	u8 slid_path_bits;
+	u8 reserved4;
+	u8 ipd;
+	u8 reserved5;
+	u8 pmtu;
+	u32 reserved6;
+	u64 reserved7;
+	union {
+		struct {
+			u64 word_0; /* always set to 6  */
+			/*should be 0x1B for IB transport */
+			u64 word_1;
+			u64 word_2;
+			u64 word_3;
+			u64 word_4;
+		} grh;
+		struct {
+			u32 wd_0;
+			u32 wd_1;
+			/* DWord_1 --> SGID */
+
+			u32 sgid_wd3;
+			u32 sgid_wd2;
+
+			u32 sgid_wd1;
+			u32 sgid_wd0;
+			/* DWord_3 --> DGID */
+
+			u32 dgid_wd3;
+			u32 dgid_wd2;
+
+			u32 dgid_wd1;
+			u32 dgid_wd0;
+		} grh_l;
+	};
+};
+
+/* maximum number of sg entries allowed in a WQE */
+#define MAX_WQE_SG_ENTRIES 252
+
+#define WQE_OPTYPE_SEND             0x80
+#define WQE_OPTYPE_RDMAREAD         0x40
+#define WQE_OPTYPE_RDMAWRITE        0x20
+#define WQE_OPTYPE_CMPSWAP          0x10
+#define WQE_OPTYPE_FETCHADD         0x08
+#define WQE_OPTYPE_BIND             0x04
+
+#define WQE_WRFLAG_REQ_SIGNAL_COM   0x80
+#define WQE_WRFLAG_FENCE            0x40
+#define WQE_WRFLAG_IMM_DATA_PRESENT 0x20
+#define WQE_WRFLAG_SOLIC_EVENT      0x10
+
+#define WQEF_CACHE_HINT             0x80
+#define WQEF_CACHE_HINT_RD_WR       0x40
+#define WQEF_TIMED_WQE              0x20
+#define WQEF_PURGE                  0x08
+#define WQEF_HIGH_NIBBLE            0xF0
+
+#define MW_BIND_ACCESSCTRL_R_WRITE   0x40
+#define MW_BIND_ACCESSCTRL_R_READ    0x20
+#define MW_BIND_ACCESSCTRL_R_ATOMIC  0x10
+
+struct ehca_wqe {
+	u64 work_request_id;
+	u8 optype;
+	u8 wr_flag;
+	u16 pkeyi;
+	u8 wqef;
+	u8 nr_of_data_seg;
+	u16 wqe_provided_slid;
+	u32 destination_qp_number;
+	u32 resync_psn_sqp;
+	u32 local_ee_context_qkey;
+	u32 immediate_data;
+	union {
+		struct {
+			u64 remote_virtual_address;
+			u32 rkey;
+			u32 reserved;
+			u64 atomic_1st_op_dma_len;
+			u64 atomic_2nd_op;
+			struct ehca_vsgentry sg_list[MAX_WQE_SG_ENTRIES];
+
+		} nud;
+		struct {
+			u64 ehca_ud_av_ptr;
+			u64 reserved1;
+			u64 reserved2;
+			u64 reserved3;
+			struct ehca_vsgentry sg_list[MAX_WQE_SG_ENTRIES];
+		} ud_avp;
+		struct {
+			struct ehca_ud_av ud_av;
+			struct ehca_vsgentry sg_list[MAX_WQE_SG_ENTRIES -
+						     2];
+		} ud_av;
+		struct {
+			u64 reserved0;
+			u64 reserved1;
+			u64 reserved2;
+			u64 reserved3;
+			struct ehca_vsgentry sg_list[MAX_WQE_SG_ENTRIES];
+		} all_rcv;
+
+		struct {
+			u64 reserved;
+			u32 rkey;
+			u32 old_rkey;
+			u64 reserved1;
+			u64 reserved2;
+			u64 virtual_address;
+			u32 reserved3;
+			u32 length;
+			u32 reserved4;
+			u16 reserved5;
+			u8 reserved6;
+			u8 lr_ctl;
+			u32 lkey;
+			u32 reserved7;
+			u64 reserved8;
+			u64 reserved9;
+			u64 reserved10;
+			u64 reserved11;
+		} bind;
+		struct {
+			u64 reserved12;
+			u64 reserved13;
+			u32 size;
+			u32 start;
+		} inline_data;
+	} u;
+
+};
+
+#define WC_SEND_RECEIVE EHCA_BMASK_IBM(0, 0)
+#define WC_IMM_DATA     EHCA_BMASK_IBM(1, 1)
+#define WC_GRH_PRESENT  EHCA_BMASK_IBM(2, 2)
+#define WC_SE_BIT       EHCA_BMASK_IBM(3, 3)
+#define WC_STATUS_ERROR_BIT 0x80000000
+#define WC_STATUS_REMOTE_ERROR_FLAGS 0x0000F800
+#define WC_STATUS_PURGE_BIT 0x10
+#define WC_SEND_RECEIVE_BIT 0x80
+
+struct ehca_cqe {
+	u64 work_request_id;
+	u8 optype;
+	u8 w_completion_flags;
+	u16 reserved1;
+	u32 nr_bytes_transferred;
+	u32 immediate_data;
+	u32 local_qp_number;
+	u8 freed_resource_count;
+	u8 service_level;
+	u16 wqe_count;
+	u32 qp_token;
+	u32 qkey_ee_token;
+	u32 remote_qp_number;
+	u16 dlid;
+	u16 rlid;
+	u16 reserved2;
+	u16 pkey_index;
+	u32 cqe_timestamp;
+	u32 wqe_timestamp;
+	u8 wqe_timestamp_valid;
+	u8 reserved3;
+	u8 reserved4;
+	u8 cqe_flags;
+	u32 status;
+};
+
+struct ehca_eqe {
+	u64 entry;
+};
+
+struct ehca_mrte {
+	u64 starting_va;
+	u64 length; /* length of memory region in bytes*/
+	u32 pd;
+	u8 key_instance;
+	u8 pagesize;
+	u8 mr_control;
+	u8 local_remote_access_ctrl;
+	u8 reserved[0x20 - 0x18];
+	u64 at_pointer[4];
+};
+#endif /*_EHCA_QES_H_*/
diff --git a/drivers/infiniband/hw/ehca/ehca_qp.c b/drivers/infiniband/hw/ehca/ehca_qp.c
new file mode 100644
index 0000000..2e89356
--- /dev/null
+++ b/drivers/infiniband/hw/ehca/ehca_qp.c
@@ -0,0 +1,2257 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  QP functions
+ *
+ *  Authors: Joachim Fenkes <fenkes@de.ibm.com>
+ *           Stefan Roscher <stefan.roscher@de.ibm.com>
+ *           Waleri Fomin <fomin@de.ibm.com>
+ *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ *           Reinhard Ernst <rernst@de.ibm.com>
+ *           Heiko J Schick <schickhj@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/slab.h>
+
+#include "ehca_classes.h"
+#include "ehca_tools.h"
+#include "ehca_qes.h"
+#include "ehca_iverbs.h"
+#include "hcp_if.h"
+#include "hipz_fns.h"
+
+static struct kmem_cache *qp_cache;
+
+/*
+ * attributes not supported by query qp
+ */
+#define QP_ATTR_QUERY_NOT_SUPPORTED (IB_QP_ACCESS_FLAGS       | \
+				     IB_QP_EN_SQD_ASYNC_NOTIFY)
+
+/*
+ * ehca (internal) qp state values
+ */
+enum ehca_qp_state {
+	EHCA_QPS_RESET = 1,
+	EHCA_QPS_INIT = 2,
+	EHCA_QPS_RTR = 3,
+	EHCA_QPS_RTS = 5,
+	EHCA_QPS_SQD = 6,
+	EHCA_QPS_SQE = 8,
+	EHCA_QPS_ERR = 128
+};
+
+/*
+ * qp state transitions as defined by IB Arch Rel 1.1 page 431
+ */
+enum ib_qp_statetrans {
+	IB_QPST_ANY2RESET,
+	IB_QPST_ANY2ERR,
+	IB_QPST_RESET2INIT,
+	IB_QPST_INIT2RTR,
+	IB_QPST_INIT2INIT,
+	IB_QPST_RTR2RTS,
+	IB_QPST_RTS2SQD,
+	IB_QPST_RTS2RTS,
+	IB_QPST_SQD2RTS,
+	IB_QPST_SQE2RTS,
+	IB_QPST_SQD2SQD,
+	IB_QPST_MAX	/* nr of transitions, this must be last!!! */
+};
+
+/*
+ * ib2ehca_qp_state maps IB to ehca qp_state
+ * returns ehca qp state corresponding to given ib qp state
+ */
+static inline enum ehca_qp_state ib2ehca_qp_state(enum ib_qp_state ib_qp_state)
+{
+	switch (ib_qp_state) {
+	case IB_QPS_RESET:
+		return EHCA_QPS_RESET;
+	case IB_QPS_INIT:
+		return EHCA_QPS_INIT;
+	case IB_QPS_RTR:
+		return EHCA_QPS_RTR;
+	case IB_QPS_RTS:
+		return EHCA_QPS_RTS;
+	case IB_QPS_SQD:
+		return EHCA_QPS_SQD;
+	case IB_QPS_SQE:
+		return EHCA_QPS_SQE;
+	case IB_QPS_ERR:
+		return EHCA_QPS_ERR;
+	default:
+		ehca_gen_err("invalid ib_qp_state=%x", ib_qp_state);
+		return -EINVAL;
+	}
+}
+
+/*
+ * ehca2ib_qp_state maps ehca to IB qp_state
+ * returns ib qp state corresponding to given ehca qp state
+ */
+static inline enum ib_qp_state ehca2ib_qp_state(enum ehca_qp_state
+						ehca_qp_state)
+{
+	switch (ehca_qp_state) {
+	case EHCA_QPS_RESET:
+		return IB_QPS_RESET;
+	case EHCA_QPS_INIT:
+		return IB_QPS_INIT;
+	case EHCA_QPS_RTR:
+		return IB_QPS_RTR;
+	case EHCA_QPS_RTS:
+		return IB_QPS_RTS;
+	case EHCA_QPS_SQD:
+		return IB_QPS_SQD;
+	case EHCA_QPS_SQE:
+		return IB_QPS_SQE;
+	case EHCA_QPS_ERR:
+		return IB_QPS_ERR;
+	default:
+		ehca_gen_err("invalid ehca_qp_state=%x", ehca_qp_state);
+		return -EINVAL;
+	}
+}
+
+/*
+ * ehca_qp_type used as index for req_attr and opt_attr of
+ * struct ehca_modqp_statetrans
+ */
+enum ehca_qp_type {
+	QPT_RC = 0,
+	QPT_UC = 1,
+	QPT_UD = 2,
+	QPT_SQP = 3,
+	QPT_MAX
+};
+
+/*
+ * ib2ehcaqptype maps Ib to ehca qp_type
+ * returns ehca qp type corresponding to ib qp type
+ */
+static inline enum ehca_qp_type ib2ehcaqptype(enum ib_qp_type ibqptype)
+{
+	switch (ibqptype) {
+	case IB_QPT_SMI:
+	case IB_QPT_GSI:
+		return QPT_SQP;
+	case IB_QPT_RC:
+		return QPT_RC;
+	case IB_QPT_UC:
+		return QPT_UC;
+	case IB_QPT_UD:
+		return QPT_UD;
+	default:
+		ehca_gen_err("Invalid ibqptype=%x", ibqptype);
+		return -EINVAL;
+	}
+}
+
+static inline enum ib_qp_statetrans get_modqp_statetrans(int ib_fromstate,
+							 int ib_tostate)
+{
+	int index = -EINVAL;
+	switch (ib_tostate) {
+	case IB_QPS_RESET:
+		index = IB_QPST_ANY2RESET;
+		break;
+	case IB_QPS_INIT:
+		switch (ib_fromstate) {
+		case IB_QPS_RESET:
+			index = IB_QPST_RESET2INIT;
+			break;
+		case IB_QPS_INIT:
+			index = IB_QPST_INIT2INIT;
+			break;
+		}
+		break;
+	case IB_QPS_RTR:
+		if (ib_fromstate == IB_QPS_INIT)
+			index = IB_QPST_INIT2RTR;
+		break;
+	case IB_QPS_RTS:
+		switch (ib_fromstate) {
+		case IB_QPS_RTR:
+			index = IB_QPST_RTR2RTS;
+			break;
+		case IB_QPS_RTS:
+			index = IB_QPST_RTS2RTS;
+			break;
+		case IB_QPS_SQD:
+			index = IB_QPST_SQD2RTS;
+			break;
+		case IB_QPS_SQE:
+			index = IB_QPST_SQE2RTS;
+			break;
+		}
+		break;
+	case IB_QPS_SQD:
+		if (ib_fromstate == IB_QPS_RTS)
+			index = IB_QPST_RTS2SQD;
+		break;
+	case IB_QPS_SQE:
+		break;
+	case IB_QPS_ERR:
+		index = IB_QPST_ANY2ERR;
+		break;
+	default:
+		break;
+	}
+	return index;
+}
+
+/*
+ * ibqptype2servicetype returns hcp service type corresponding to given
+ * ib qp type used by create_qp()
+ */
+static inline int ibqptype2servicetype(enum ib_qp_type ibqptype)
+{
+	switch (ibqptype) {
+	case IB_QPT_SMI:
+	case IB_QPT_GSI:
+		return ST_UD;
+	case IB_QPT_RC:
+		return ST_RC;
+	case IB_QPT_UC:
+		return ST_UC;
+	case IB_QPT_UD:
+		return ST_UD;
+	case IB_QPT_RAW_IPV6:
+		return -EINVAL;
+	case IB_QPT_RAW_ETHERTYPE:
+		return -EINVAL;
+	default:
+		ehca_gen_err("Invalid ibqptype=%x", ibqptype);
+		return -EINVAL;
+	}
+}
+
+/*
+ * init userspace queue info from ipz_queue data
+ */
+static inline void queue2resp(struct ipzu_queue_resp *resp,
+			      struct ipz_queue *queue)
+{
+	resp->qe_size = queue->qe_size;
+	resp->act_nr_of_sg = queue->act_nr_of_sg;
+	resp->queue_length = queue->queue_length;
+	resp->pagesize = queue->pagesize;
+	resp->toggle_state = queue->toggle_state;
+	resp->offset = queue->offset;
+}
+
+/*
+ * init_qp_queue initializes/constructs r/squeue and registers queue pages.
+ */
+static inline int init_qp_queue(struct ehca_shca *shca,
+				struct ehca_pd *pd,
+				struct ehca_qp *my_qp,
+				struct ipz_queue *queue,
+				int q_type,
+				u64 expected_hret,
+				struct ehca_alloc_queue_parms *parms,
+				int wqe_size)
+{
+	int ret, cnt, ipz_rc, nr_q_pages;
+	void *vpage;
+	u64 rpage, h_ret;
+	struct ib_device *ib_dev = &shca->ib_device;
+	struct ipz_adapter_handle ipz_hca_handle = shca->ipz_hca_handle;
+
+	if (!parms->queue_size)
+		return 0;
+
+	if (parms->is_small) {
+		nr_q_pages = 1;
+		ipz_rc = ipz_queue_ctor(pd, queue, nr_q_pages,
+					128 << parms->page_size,
+					wqe_size, parms->act_nr_sges, 1);
+	} else {
+		nr_q_pages = parms->queue_size;
+		ipz_rc = ipz_queue_ctor(pd, queue, nr_q_pages,
+					EHCA_PAGESIZE, wqe_size,
+					parms->act_nr_sges, 0);
+	}
+
+	if (!ipz_rc) {
+		ehca_err(ib_dev, "Cannot allocate page for queue. ipz_rc=%i",
+			 ipz_rc);
+		return -EBUSY;
+	}
+
+	/* register queue pages */
+	for (cnt = 0; cnt < nr_q_pages; cnt++) {
+		vpage = ipz_qpageit_get_inc(queue);
+		if (!vpage) {
+			ehca_err(ib_dev, "ipz_qpageit_get_inc() "
+				 "failed p_vpage= %p", vpage);
+			ret = -EINVAL;
+			goto init_qp_queue1;
+		}
+		rpage = __pa(vpage);
+
+		h_ret = hipz_h_register_rpage_qp(ipz_hca_handle,
+						 my_qp->ipz_qp_handle,
+						 NULL, 0, q_type,
+						 rpage, parms->is_small ? 0 : 1,
+						 my_qp->galpas.kernel);
+		if (cnt == (nr_q_pages - 1)) {	/* last page! */
+			if (h_ret != expected_hret) {
+				ehca_err(ib_dev, "hipz_qp_register_rpage() "
+					 "h_ret=%lli", h_ret);
+				ret = ehca2ib_return_code(h_ret);
+				goto init_qp_queue1;
+			}
+			vpage = ipz_qpageit_get_inc(&my_qp->ipz_rqueue);
+			if (vpage) {
+				ehca_err(ib_dev, "ipz_qpageit_get_inc() "
+					 "should not succeed vpage=%p", vpage);
+				ret = -EINVAL;
+				goto init_qp_queue1;
+			}
+		} else {
+			if (h_ret != H_PAGE_REGISTERED) {
+				ehca_err(ib_dev, "hipz_qp_register_rpage() "
+					 "h_ret=%lli", h_ret);
+				ret = ehca2ib_return_code(h_ret);
+				goto init_qp_queue1;
+			}
+		}
+	}
+
+	ipz_qeit_reset(queue);
+
+	return 0;
+
+init_qp_queue1:
+	ipz_queue_dtor(pd, queue);
+	return ret;
+}
+
+static inline int ehca_calc_wqe_size(int act_nr_sge, int is_llqp)
+{
+	if (is_llqp)
+		return 128 << act_nr_sge;
+	else
+		return offsetof(struct ehca_wqe,
+				u.nud.sg_list[act_nr_sge]);
+}
+
+static void ehca_determine_small_queue(struct ehca_alloc_queue_parms *queue,
+				       int req_nr_sge, int is_llqp)
+{
+	u32 wqe_size, q_size;
+	int act_nr_sge = req_nr_sge;
+
+	if (!is_llqp)
+		/* round up #SGEs so WQE size is a power of 2 */
+		for (act_nr_sge = 4; act_nr_sge <= 252;
+		     act_nr_sge = 4 + 2 * act_nr_sge)
+			if (act_nr_sge >= req_nr_sge)
+				break;
+
+	wqe_size = ehca_calc_wqe_size(act_nr_sge, is_llqp);
+	q_size = wqe_size * (queue->max_wr + 1);
+
+	if (q_size <= 512)
+		queue->page_size = 2;
+	else if (q_size <= 1024)
+		queue->page_size = 3;
+	else
+		queue->page_size = 0;
+
+	queue->is_small = (queue->page_size != 0);
+}
+
+/* needs to be called with cq->spinlock held */
+void ehca_add_to_err_list(struct ehca_qp *qp, int on_sq)
+{
+	struct list_head *list, *node;
+
+	/* TODO: support low latency QPs */
+	if (qp->ext_type == EQPT_LLQP)
+		return;
+
+	if (on_sq) {
+		list = &qp->send_cq->sqp_err_list;
+		node = &qp->sq_err_node;
+	} else {
+		list = &qp->recv_cq->rqp_err_list;
+		node = &qp->rq_err_node;
+	}
+
+	if (list_empty(node))
+		list_add_tail(node, list);
+
+	return;
+}
+
+static void del_from_err_list(struct ehca_cq *cq, struct list_head *node)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&cq->spinlock, flags);
+
+	if (!list_empty(node))
+		list_del_init(node);
+
+	spin_unlock_irqrestore(&cq->spinlock, flags);
+}
+
+static void reset_queue_map(struct ehca_queue_map *qmap)
+{
+	int i;
+
+	qmap->tail = qmap->entries - 1;
+	qmap->left_to_poll = 0;
+	qmap->next_wqe_idx = 0;
+	for (i = 0; i < qmap->entries; i++) {
+		qmap->map[i].reported = 1;
+		qmap->map[i].cqe_req = 0;
+	}
+}
+
+/*
+ * Create an ib_qp struct that is either a QP or an SRQ, depending on
+ * the value of the is_srq parameter. If init_attr and srq_init_attr share
+ * fields, the field out of init_attr is used.
+ */
+static struct ehca_qp *internal_create_qp(
+	struct ib_pd *pd,
+	struct ib_qp_init_attr *init_attr,
+	struct ib_srq_init_attr *srq_init_attr,
+	struct ib_udata *udata, int is_srq)
+{
+	struct ehca_qp *my_qp, *my_srq = NULL;
+	struct ehca_pd *my_pd = container_of(pd, struct ehca_pd, ib_pd);
+	struct ehca_shca *shca = container_of(pd->device, struct ehca_shca,
+					      ib_device);
+	struct ib_ucontext *context = NULL;
+	u64 h_ret;
+	int is_llqp = 0, has_srq = 0, is_user = 0;
+	int qp_type, max_send_sge, max_recv_sge, ret;
+
+	/* h_call's out parameters */
+	struct ehca_alloc_qp_parms parms;
+	u32 swqe_size = 0, rwqe_size = 0, ib_qp_num;
+	unsigned long flags;
+
+	if (!atomic_add_unless(&shca->num_qps, 1, shca->max_num_qps)) {
+		ehca_err(pd->device, "Unable to create QP, max number of %i "
+			 "QPs reached.", shca->max_num_qps);
+		ehca_err(pd->device, "To increase the maximum number of QPs "
+			 "use the number_of_qps module parameter.\n");
+		return ERR_PTR(-ENOSPC);
+	}
+
+	if (init_attr->create_flags) {
+		atomic_dec(&shca->num_qps);
+		return ERR_PTR(-EINVAL);
+	}
+
+	memset(&parms, 0, sizeof(parms));
+	qp_type = init_attr->qp_type;
+
+	if (init_attr->sq_sig_type != IB_SIGNAL_REQ_WR &&
+		init_attr->sq_sig_type != IB_SIGNAL_ALL_WR) {
+		ehca_err(pd->device, "init_attr->sg_sig_type=%x not allowed",
+			 init_attr->sq_sig_type);
+		atomic_dec(&shca->num_qps);
+		return ERR_PTR(-EINVAL);
+	}
+
+	/* save LLQP info */
+	if (qp_type & 0x80) {
+		is_llqp = 1;
+		parms.ext_type = EQPT_LLQP;
+		parms.ll_comp_flags = qp_type & LLQP_COMP_MASK;
+	}
+	qp_type &= 0x1F;
+	init_attr->qp_type &= 0x1F;
+
+	/* handle SRQ base QPs */
+	if (init_attr->srq) {
+		my_srq = container_of(init_attr->srq, struct ehca_qp, ib_srq);
+
+		if (qp_type == IB_QPT_UC) {
+			ehca_err(pd->device, "UC with SRQ not supported");
+			atomic_dec(&shca->num_qps);
+			return ERR_PTR(-EINVAL);
+		}
+
+		has_srq = 1;
+		parms.ext_type = EQPT_SRQBASE;
+		parms.srq_qpn = my_srq->real_qp_num;
+	}
+
+	if (is_llqp && has_srq) {
+		ehca_err(pd->device, "LLQPs can't have an SRQ");
+		atomic_dec(&shca->num_qps);
+		return ERR_PTR(-EINVAL);
+	}
+
+	/* handle SRQs */
+	if (is_srq) {
+		parms.ext_type = EQPT_SRQ;
+		parms.srq_limit = srq_init_attr->attr.srq_limit;
+		if (init_attr->cap.max_recv_sge > 3) {
+			ehca_err(pd->device, "no more than three SGEs "
+				 "supported for SRQ  pd=%p  max_sge=%x",
+				 pd, init_attr->cap.max_recv_sge);
+			atomic_dec(&shca->num_qps);
+			return ERR_PTR(-EINVAL);
+		}
+	}
+
+	/* check QP type */
+	if (qp_type != IB_QPT_UD &&
+	    qp_type != IB_QPT_UC &&
+	    qp_type != IB_QPT_RC &&
+	    qp_type != IB_QPT_SMI &&
+	    qp_type != IB_QPT_GSI) {
+		ehca_err(pd->device, "wrong QP Type=%x", qp_type);
+		atomic_dec(&shca->num_qps);
+		return ERR_PTR(-EINVAL);
+	}
+
+	if (is_llqp) {
+		switch (qp_type) {
+		case IB_QPT_RC:
+			if ((init_attr->cap.max_send_wr > 255) ||
+			    (init_attr->cap.max_recv_wr > 255)) {
+				ehca_err(pd->device,
+					 "Invalid Number of max_sq_wr=%x "
+					 "or max_rq_wr=%x for RC LLQP",
+					 init_attr->cap.max_send_wr,
+					 init_attr->cap.max_recv_wr);
+				atomic_dec(&shca->num_qps);
+				return ERR_PTR(-EINVAL);
+			}
+			break;
+		case IB_QPT_UD:
+			if (!EHCA_BMASK_GET(HCA_CAP_UD_LL_QP, shca->hca_cap)) {
+				ehca_err(pd->device, "UD LLQP not supported "
+					 "by this adapter");
+				atomic_dec(&shca->num_qps);
+				return ERR_PTR(-ENOSYS);
+			}
+			if (!(init_attr->cap.max_send_sge <= 5
+			    && init_attr->cap.max_send_sge >= 1
+			    && init_attr->cap.max_recv_sge <= 5
+			    && init_attr->cap.max_recv_sge >= 1)) {
+				ehca_err(pd->device,
+					 "Invalid Number of max_send_sge=%x "
+					 "or max_recv_sge=%x for UD LLQP",
+					 init_attr->cap.max_send_sge,
+					 init_attr->cap.max_recv_sge);
+				atomic_dec(&shca->num_qps);
+				return ERR_PTR(-EINVAL);
+			} else if (init_attr->cap.max_send_wr > 255) {
+				ehca_err(pd->device,
+					 "Invalid Number of "
+					 "max_send_wr=%x for UD QP_TYPE=%x",
+					 init_attr->cap.max_send_wr, qp_type);
+				atomic_dec(&shca->num_qps);
+				return ERR_PTR(-EINVAL);
+			}
+			break;
+		default:
+			ehca_err(pd->device, "unsupported LL QP Type=%x",
+				 qp_type);
+			atomic_dec(&shca->num_qps);
+			return ERR_PTR(-EINVAL);
+		}
+	} else {
+		int max_sge = (qp_type == IB_QPT_UD || qp_type == IB_QPT_SMI
+			       || qp_type == IB_QPT_GSI) ? 250 : 252;
+
+		if (init_attr->cap.max_send_sge > max_sge
+		    || init_attr->cap.max_recv_sge > max_sge) {
+			ehca_err(pd->device, "Invalid number of SGEs requested "
+				 "send_sge=%x recv_sge=%x max_sge=%x",
+				 init_attr->cap.max_send_sge,
+				 init_attr->cap.max_recv_sge, max_sge);
+			atomic_dec(&shca->num_qps);
+			return ERR_PTR(-EINVAL);
+		}
+	}
+
+	my_qp = kmem_cache_zalloc(qp_cache, GFP_KERNEL);
+	if (!my_qp) {
+		ehca_err(pd->device, "pd=%p not enough memory to alloc qp", pd);
+		atomic_dec(&shca->num_qps);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	if (pd->uobject && udata) {
+		is_user = 1;
+		context = pd->uobject->context;
+	}
+
+	atomic_set(&my_qp->nr_events, 0);
+	init_waitqueue_head(&my_qp->wait_completion);
+	spin_lock_init(&my_qp->spinlock_s);
+	spin_lock_init(&my_qp->spinlock_r);
+	my_qp->qp_type = qp_type;
+	my_qp->ext_type = parms.ext_type;
+	my_qp->state = IB_QPS_RESET;
+
+	if (init_attr->recv_cq)
+		my_qp->recv_cq =
+			container_of(init_attr->recv_cq, struct ehca_cq, ib_cq);
+	if (init_attr->send_cq)
+		my_qp->send_cq =
+			container_of(init_attr->send_cq, struct ehca_cq, ib_cq);
+
+	idr_preload(GFP_KERNEL);
+	write_lock_irqsave(&ehca_qp_idr_lock, flags);
+
+	ret = idr_alloc(&ehca_qp_idr, my_qp, 0, 0x2000000, GFP_NOWAIT);
+	if (ret >= 0)
+		my_qp->token = ret;
+
+	write_unlock_irqrestore(&ehca_qp_idr_lock, flags);
+	idr_preload_end();
+	if (ret < 0) {
+		if (ret == -ENOSPC) {
+			ret = -EINVAL;
+			ehca_err(pd->device, "Invalid number of qp");
+		} else {
+			ret = -ENOMEM;
+			ehca_err(pd->device, "Can't allocate new idr entry.");
+		}
+		goto create_qp_exit0;
+	}
+
+	if (has_srq)
+		parms.srq_token = my_qp->token;
+
+	parms.servicetype = ibqptype2servicetype(qp_type);
+	if (parms.servicetype < 0) {
+		ret = -EINVAL;
+		ehca_err(pd->device, "Invalid qp_type=%x", qp_type);
+		goto create_qp_exit1;
+	}
+
+	/* Always signal by WQE so we can hide circ. WQEs */
+	parms.sigtype = HCALL_SIGT_BY_WQE;
+
+	/* UD_AV CIRCUMVENTION */
+	max_send_sge = init_attr->cap.max_send_sge;
+	max_recv_sge = init_attr->cap.max_recv_sge;
+	if (parms.servicetype == ST_UD && !is_llqp) {
+		max_send_sge += 2;
+		max_recv_sge += 2;
+	}
+
+	parms.token = my_qp->token;
+	parms.eq_handle = shca->eq.ipz_eq_handle;
+	parms.pd = my_pd->fw_pd;
+	if (my_qp->send_cq)
+		parms.send_cq_handle = my_qp->send_cq->ipz_cq_handle;
+	if (my_qp->recv_cq)
+		parms.recv_cq_handle = my_qp->recv_cq->ipz_cq_handle;
+
+	parms.squeue.max_wr = init_attr->cap.max_send_wr;
+	parms.rqueue.max_wr = init_attr->cap.max_recv_wr;
+	parms.squeue.max_sge = max_send_sge;
+	parms.rqueue.max_sge = max_recv_sge;
+
+	/* RC QPs need one more SWQE for unsolicited ack circumvention */
+	if (qp_type == IB_QPT_RC)
+		parms.squeue.max_wr++;
+
+	if (EHCA_BMASK_GET(HCA_CAP_MINI_QP, shca->hca_cap)) {
+		if (HAS_SQ(my_qp))
+			ehca_determine_small_queue(
+				&parms.squeue, max_send_sge, is_llqp);
+		if (HAS_RQ(my_qp))
+			ehca_determine_small_queue(
+				&parms.rqueue, max_recv_sge, is_llqp);
+		parms.qp_storage =
+			(parms.squeue.is_small || parms.rqueue.is_small);
+	}
+
+	h_ret = hipz_h_alloc_resource_qp(shca->ipz_hca_handle, &parms, is_user);
+	if (h_ret != H_SUCCESS) {
+		ehca_err(pd->device, "h_alloc_resource_qp() failed h_ret=%lli",
+			 h_ret);
+		ret = ehca2ib_return_code(h_ret);
+		goto create_qp_exit1;
+	}
+
+	ib_qp_num = my_qp->real_qp_num = parms.real_qp_num;
+	my_qp->ipz_qp_handle = parms.qp_handle;
+	my_qp->galpas = parms.galpas;
+
+	swqe_size = ehca_calc_wqe_size(parms.squeue.act_nr_sges, is_llqp);
+	rwqe_size = ehca_calc_wqe_size(parms.rqueue.act_nr_sges, is_llqp);
+
+	switch (qp_type) {
+	case IB_QPT_RC:
+		if (is_llqp) {
+			parms.squeue.act_nr_sges = 1;
+			parms.rqueue.act_nr_sges = 1;
+		}
+		/* hide the extra WQE */
+		parms.squeue.act_nr_wqes--;
+		break;
+	case IB_QPT_UD:
+	case IB_QPT_GSI:
+	case IB_QPT_SMI:
+		/* UD circumvention */
+		if (is_llqp) {
+			parms.squeue.act_nr_sges = 1;
+			parms.rqueue.act_nr_sges = 1;
+		} else {
+			parms.squeue.act_nr_sges -= 2;
+			parms.rqueue.act_nr_sges -= 2;
+		}
+
+		if (IB_QPT_GSI == qp_type || IB_QPT_SMI == qp_type) {
+			parms.squeue.act_nr_wqes = init_attr->cap.max_send_wr;
+			parms.rqueue.act_nr_wqes = init_attr->cap.max_recv_wr;
+			parms.squeue.act_nr_sges = init_attr->cap.max_send_sge;
+			parms.rqueue.act_nr_sges = init_attr->cap.max_recv_sge;
+			ib_qp_num = (qp_type == IB_QPT_SMI) ? 0 : 1;
+		}
+
+		break;
+
+	default:
+		break;
+	}
+
+	/* initialize r/squeue and register queue pages */
+	if (HAS_SQ(my_qp)) {
+		ret = init_qp_queue(
+			shca, my_pd, my_qp, &my_qp->ipz_squeue, 0,
+			HAS_RQ(my_qp) ? H_PAGE_REGISTERED : H_SUCCESS,
+			&parms.squeue, swqe_size);
+		if (ret) {
+			ehca_err(pd->device, "Couldn't initialize squeue "
+				 "and pages ret=%i", ret);
+			goto create_qp_exit2;
+		}
+
+		if (!is_user) {
+			my_qp->sq_map.entries = my_qp->ipz_squeue.queue_length /
+				my_qp->ipz_squeue.qe_size;
+			my_qp->sq_map.map = vmalloc(my_qp->sq_map.entries *
+						    sizeof(struct ehca_qmap_entry));
+			if (!my_qp->sq_map.map) {
+				ehca_err(pd->device, "Couldn't allocate squeue "
+					 "map ret=%i", ret);
+				goto create_qp_exit3;
+			}
+			INIT_LIST_HEAD(&my_qp->sq_err_node);
+			/* to avoid the generation of bogus flush CQEs */
+			reset_queue_map(&my_qp->sq_map);
+		}
+	}
+
+	if (HAS_RQ(my_qp)) {
+		ret = init_qp_queue(
+			shca, my_pd, my_qp, &my_qp->ipz_rqueue, 1,
+			H_SUCCESS, &parms.rqueue, rwqe_size);
+		if (ret) {
+			ehca_err(pd->device, "Couldn't initialize rqueue "
+				 "and pages ret=%i", ret);
+			goto create_qp_exit4;
+		}
+		if (!is_user) {
+			my_qp->rq_map.entries = my_qp->ipz_rqueue.queue_length /
+				my_qp->ipz_rqueue.qe_size;
+			my_qp->rq_map.map = vmalloc(my_qp->rq_map.entries *
+						    sizeof(struct ehca_qmap_entry));
+			if (!my_qp->rq_map.map) {
+				ehca_err(pd->device, "Couldn't allocate squeue "
+					 "map ret=%i", ret);
+				goto create_qp_exit5;
+			}
+			INIT_LIST_HEAD(&my_qp->rq_err_node);
+			/* to avoid the generation of bogus flush CQEs */
+			reset_queue_map(&my_qp->rq_map);
+		}
+	} else if (init_attr->srq && !is_user) {
+		/* this is a base QP, use the queue map of the SRQ */
+		my_qp->rq_map = my_srq->rq_map;
+		INIT_LIST_HEAD(&my_qp->rq_err_node);
+
+		my_qp->ipz_rqueue = my_srq->ipz_rqueue;
+	}
+
+	if (is_srq) {
+		my_qp->ib_srq.pd = &my_pd->ib_pd;
+		my_qp->ib_srq.device = my_pd->ib_pd.device;
+
+		my_qp->ib_srq.srq_context = init_attr->qp_context;
+		my_qp->ib_srq.event_handler = init_attr->event_handler;
+	} else {
+		my_qp->ib_qp.qp_num = ib_qp_num;
+		my_qp->ib_qp.pd = &my_pd->ib_pd;
+		my_qp->ib_qp.device = my_pd->ib_pd.device;
+
+		my_qp->ib_qp.recv_cq = init_attr->recv_cq;
+		my_qp->ib_qp.send_cq = init_attr->send_cq;
+
+		my_qp->ib_qp.qp_type = qp_type;
+		my_qp->ib_qp.srq = init_attr->srq;
+
+		my_qp->ib_qp.qp_context = init_attr->qp_context;
+		my_qp->ib_qp.event_handler = init_attr->event_handler;
+	}
+
+	init_attr->cap.max_inline_data = 0; /* not supported yet */
+	init_attr->cap.max_recv_sge = parms.rqueue.act_nr_sges;
+	init_attr->cap.max_recv_wr = parms.rqueue.act_nr_wqes;
+	init_attr->cap.max_send_sge = parms.squeue.act_nr_sges;
+	init_attr->cap.max_send_wr = parms.squeue.act_nr_wqes;
+	my_qp->init_attr = *init_attr;
+
+	if (qp_type == IB_QPT_SMI || qp_type == IB_QPT_GSI) {
+		shca->sport[init_attr->port_num - 1].ibqp_sqp[qp_type] =
+			&my_qp->ib_qp;
+		if (ehca_nr_ports < 0) {
+			/* alloc array to cache subsequent modify qp parms
+			 * for autodetect mode
+			 */
+			my_qp->mod_qp_parm =
+				kzalloc(EHCA_MOD_QP_PARM_MAX *
+					sizeof(*my_qp->mod_qp_parm),
+					GFP_KERNEL);
+			if (!my_qp->mod_qp_parm) {
+				ehca_err(pd->device,
+					 "Could not alloc mod_qp_parm");
+				goto create_qp_exit5;
+			}
+		}
+	}
+
+	/* NOTE: define_apq0() not supported yet */
+	if (qp_type == IB_QPT_GSI) {
+		h_ret = ehca_define_sqp(shca, my_qp, init_attr);
+		if (h_ret != H_SUCCESS) {
+			kfree(my_qp->mod_qp_parm);
+			my_qp->mod_qp_parm = NULL;
+			/* the QP pointer is no longer valid */
+			shca->sport[init_attr->port_num - 1].ibqp_sqp[qp_type] =
+				NULL;
+			ret = ehca2ib_return_code(h_ret);
+			goto create_qp_exit6;
+		}
+	}
+
+	if (my_qp->send_cq) {
+		ret = ehca_cq_assign_qp(my_qp->send_cq, my_qp);
+		if (ret) {
+			ehca_err(pd->device,
+				 "Couldn't assign qp to send_cq ret=%i", ret);
+			goto create_qp_exit7;
+		}
+	}
+
+	/* copy queues, galpa data to user space */
+	if (context && udata) {
+		struct ehca_create_qp_resp resp;
+		memset(&resp, 0, sizeof(resp));
+
+		resp.qp_num = my_qp->real_qp_num;
+		resp.token = my_qp->token;
+		resp.qp_type = my_qp->qp_type;
+		resp.ext_type = my_qp->ext_type;
+		resp.qkey = my_qp->qkey;
+		resp.real_qp_num = my_qp->real_qp_num;
+
+		if (HAS_SQ(my_qp))
+			queue2resp(&resp.ipz_squeue, &my_qp->ipz_squeue);
+		if (HAS_RQ(my_qp))
+			queue2resp(&resp.ipz_rqueue, &my_qp->ipz_rqueue);
+		resp.fw_handle_ofs = (u32)
+			(my_qp->galpas.user.fw_handle & (PAGE_SIZE - 1));
+
+		if (ib_copy_to_udata(udata, &resp, sizeof resp)) {
+			ehca_err(pd->device, "Copy to udata failed");
+			ret = -EINVAL;
+			goto create_qp_exit8;
+		}
+	}
+
+	return my_qp;
+
+create_qp_exit8:
+	ehca_cq_unassign_qp(my_qp->send_cq, my_qp->real_qp_num);
+
+create_qp_exit7:
+	kfree(my_qp->mod_qp_parm);
+
+create_qp_exit6:
+	if (HAS_RQ(my_qp) && !is_user)
+		vfree(my_qp->rq_map.map);
+
+create_qp_exit5:
+	if (HAS_RQ(my_qp))
+		ipz_queue_dtor(my_pd, &my_qp->ipz_rqueue);
+
+create_qp_exit4:
+	if (HAS_SQ(my_qp) && !is_user)
+		vfree(my_qp->sq_map.map);
+
+create_qp_exit3:
+	if (HAS_SQ(my_qp))
+		ipz_queue_dtor(my_pd, &my_qp->ipz_squeue);
+
+create_qp_exit2:
+	hipz_h_destroy_qp(shca->ipz_hca_handle, my_qp);
+
+create_qp_exit1:
+	write_lock_irqsave(&ehca_qp_idr_lock, flags);
+	idr_remove(&ehca_qp_idr, my_qp->token);
+	write_unlock_irqrestore(&ehca_qp_idr_lock, flags);
+
+create_qp_exit0:
+	kmem_cache_free(qp_cache, my_qp);
+	atomic_dec(&shca->num_qps);
+	return ERR_PTR(ret);
+}
+
+struct ib_qp *ehca_create_qp(struct ib_pd *pd,
+			     struct ib_qp_init_attr *qp_init_attr,
+			     struct ib_udata *udata)
+{
+	struct ehca_qp *ret;
+
+	ret = internal_create_qp(pd, qp_init_attr, NULL, udata, 0);
+	return IS_ERR(ret) ? (struct ib_qp *)ret : &ret->ib_qp;
+}
+
+static int internal_destroy_qp(struct ib_device *dev, struct ehca_qp *my_qp,
+			       struct ib_uobject *uobject);
+
+struct ib_srq *ehca_create_srq(struct ib_pd *pd,
+			       struct ib_srq_init_attr *srq_init_attr,
+			       struct ib_udata *udata)
+{
+	struct ib_qp_init_attr qp_init_attr;
+	struct ehca_qp *my_qp;
+	struct ib_srq *ret;
+	struct ehca_shca *shca = container_of(pd->device, struct ehca_shca,
+					      ib_device);
+	struct hcp_modify_qp_control_block *mqpcb;
+	u64 hret, update_mask;
+
+	if (srq_init_attr->srq_type != IB_SRQT_BASIC)
+		return ERR_PTR(-ENOSYS);
+
+	/* For common attributes, internal_create_qp() takes its info
+	 * out of qp_init_attr, so copy all common attrs there.
+	 */
+	memset(&qp_init_attr, 0, sizeof(qp_init_attr));
+	qp_init_attr.event_handler = srq_init_attr->event_handler;
+	qp_init_attr.qp_context = srq_init_attr->srq_context;
+	qp_init_attr.sq_sig_type = IB_SIGNAL_ALL_WR;
+	qp_init_attr.qp_type = IB_QPT_RC;
+	qp_init_attr.cap.max_recv_wr = srq_init_attr->attr.max_wr;
+	qp_init_attr.cap.max_recv_sge = srq_init_attr->attr.max_sge;
+
+	my_qp = internal_create_qp(pd, &qp_init_attr, srq_init_attr, udata, 1);
+	if (IS_ERR(my_qp))
+		return (struct ib_srq *)my_qp;
+
+	/* copy back return values */
+	srq_init_attr->attr.max_wr = qp_init_attr.cap.max_recv_wr;
+	srq_init_attr->attr.max_sge = 3;
+
+	/* drive SRQ into RTR state */
+	mqpcb = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+	if (!mqpcb) {
+		ehca_err(pd->device, "Could not get zeroed page for mqpcb "
+			 "ehca_qp=%p qp_num=%x ", my_qp, my_qp->real_qp_num);
+		ret = ERR_PTR(-ENOMEM);
+		goto create_srq1;
+	}
+
+	mqpcb->qp_state = EHCA_QPS_INIT;
+	mqpcb->prim_phys_port = 1;
+	update_mask = EHCA_BMASK_SET(MQPCB_MASK_QP_STATE, 1);
+	hret = hipz_h_modify_qp(shca->ipz_hca_handle,
+				my_qp->ipz_qp_handle,
+				&my_qp->pf,
+				update_mask,
+				mqpcb, my_qp->galpas.kernel);
+	if (hret != H_SUCCESS) {
+		ehca_err(pd->device, "Could not modify SRQ to INIT "
+			 "ehca_qp=%p qp_num=%x h_ret=%lli",
+			 my_qp, my_qp->real_qp_num, hret);
+		goto create_srq2;
+	}
+
+	mqpcb->qp_enable = 1;
+	update_mask = EHCA_BMASK_SET(MQPCB_MASK_QP_ENABLE, 1);
+	hret = hipz_h_modify_qp(shca->ipz_hca_handle,
+				my_qp->ipz_qp_handle,
+				&my_qp->pf,
+				update_mask,
+				mqpcb, my_qp->galpas.kernel);
+	if (hret != H_SUCCESS) {
+		ehca_err(pd->device, "Could not enable SRQ "
+			 "ehca_qp=%p qp_num=%x h_ret=%lli",
+			 my_qp, my_qp->real_qp_num, hret);
+		goto create_srq2;
+	}
+
+	mqpcb->qp_state  = EHCA_QPS_RTR;
+	update_mask = EHCA_BMASK_SET(MQPCB_MASK_QP_STATE, 1);
+	hret = hipz_h_modify_qp(shca->ipz_hca_handle,
+				my_qp->ipz_qp_handle,
+				&my_qp->pf,
+				update_mask,
+				mqpcb, my_qp->galpas.kernel);
+	if (hret != H_SUCCESS) {
+		ehca_err(pd->device, "Could not modify SRQ to RTR "
+			 "ehca_qp=%p qp_num=%x h_ret=%lli",
+			 my_qp, my_qp->real_qp_num, hret);
+		goto create_srq2;
+	}
+
+	ehca_free_fw_ctrlblock(mqpcb);
+
+	return &my_qp->ib_srq;
+
+create_srq2:
+	ret = ERR_PTR(ehca2ib_return_code(hret));
+	ehca_free_fw_ctrlblock(mqpcb);
+
+create_srq1:
+	internal_destroy_qp(pd->device, my_qp, my_qp->ib_srq.uobject);
+
+	return ret;
+}
+
+/*
+ * prepare_sqe_rts called by internal_modify_qp() at trans sqe -> rts
+ * set purge bit of bad wqe and subsequent wqes to avoid reentering sqe
+ * returns total number of bad wqes in bad_wqe_cnt
+ */
+static int prepare_sqe_rts(struct ehca_qp *my_qp, struct ehca_shca *shca,
+			   int *bad_wqe_cnt)
+{
+	u64 h_ret;
+	struct ipz_queue *squeue;
+	void *bad_send_wqe_p, *bad_send_wqe_v;
+	u64 q_ofs;
+	struct ehca_wqe *wqe;
+	int qp_num = my_qp->ib_qp.qp_num;
+
+	/* get send wqe pointer */
+	h_ret = hipz_h_disable_and_get_wqe(shca->ipz_hca_handle,
+					   my_qp->ipz_qp_handle, &my_qp->pf,
+					   &bad_send_wqe_p, NULL, 2);
+	if (h_ret != H_SUCCESS) {
+		ehca_err(&shca->ib_device, "hipz_h_disable_and_get_wqe() failed"
+			 " ehca_qp=%p qp_num=%x h_ret=%lli",
+			 my_qp, qp_num, h_ret);
+		return ehca2ib_return_code(h_ret);
+	}
+	bad_send_wqe_p = (void *)((u64)bad_send_wqe_p & (~(1L << 63)));
+	ehca_dbg(&shca->ib_device, "qp_num=%x bad_send_wqe_p=%p",
+		 qp_num, bad_send_wqe_p);
+	/* convert wqe pointer to vadr */
+	bad_send_wqe_v = __va((u64)bad_send_wqe_p);
+	if (ehca_debug_level >= 2)
+		ehca_dmp(bad_send_wqe_v, 32, "qp_num=%x bad_wqe", qp_num);
+	squeue = &my_qp->ipz_squeue;
+	if (ipz_queue_abs_to_offset(squeue, (u64)bad_send_wqe_p, &q_ofs)) {
+		ehca_err(&shca->ib_device, "failed to get wqe offset qp_num=%x"
+			 " bad_send_wqe_p=%p", qp_num, bad_send_wqe_p);
+		return -EFAULT;
+	}
+
+	/* loop sets wqe's purge bit */
+	wqe = (struct ehca_wqe *)ipz_qeit_calc(squeue, q_ofs);
+	*bad_wqe_cnt = 0;
+	while (wqe->optype != 0xff && wqe->wqef != 0xff) {
+		if (ehca_debug_level >= 2)
+			ehca_dmp(wqe, 32, "qp_num=%x wqe", qp_num);
+		wqe->nr_of_data_seg = 0; /* suppress data access */
+		wqe->wqef = WQEF_PURGE; /* WQE to be purged */
+		q_ofs = ipz_queue_advance_offset(squeue, q_ofs);
+		wqe = (struct ehca_wqe *)ipz_qeit_calc(squeue, q_ofs);
+		*bad_wqe_cnt = (*bad_wqe_cnt)+1;
+	}
+	/*
+	 * bad wqe will be reprocessed and ignored when pol_cq() is called,
+	 *  i.e. nr of wqes with flush error status is one less
+	 */
+	ehca_dbg(&shca->ib_device, "qp_num=%x flusherr_wqe_cnt=%x",
+		 qp_num, (*bad_wqe_cnt)-1);
+	wqe->wqef = 0;
+
+	return 0;
+}
+
+static int calc_left_cqes(u64 wqe_p, struct ipz_queue *ipz_queue,
+			  struct ehca_queue_map *qmap)
+{
+	void *wqe_v;
+	u64 q_ofs;
+	u32 wqe_idx;
+	unsigned int tail_idx;
+
+	/* convert real to abs address */
+	wqe_p = wqe_p & (~(1UL << 63));
+
+	wqe_v = __va(wqe_p);
+
+	if (ipz_queue_abs_to_offset(ipz_queue, wqe_p, &q_ofs)) {
+		ehca_gen_err("Invalid offset for calculating left cqes "
+				"wqe_p=%#llx wqe_v=%p\n", wqe_p, wqe_v);
+		return -EFAULT;
+	}
+
+	tail_idx = next_index(qmap->tail, qmap->entries);
+	wqe_idx = q_ofs / ipz_queue->qe_size;
+
+	/* check all processed wqes, whether a cqe is requested or not */
+	while (tail_idx != wqe_idx) {
+		if (qmap->map[tail_idx].cqe_req)
+			qmap->left_to_poll++;
+		tail_idx = next_index(tail_idx, qmap->entries);
+	}
+	/* save index in queue, where we have to start flushing */
+	qmap->next_wqe_idx = wqe_idx;
+	return 0;
+}
+
+static int check_for_left_cqes(struct ehca_qp *my_qp, struct ehca_shca *shca)
+{
+	u64 h_ret;
+	void *send_wqe_p, *recv_wqe_p;
+	int ret;
+	unsigned long flags;
+	int qp_num = my_qp->ib_qp.qp_num;
+
+	/* this hcall is not supported on base QPs */
+	if (my_qp->ext_type != EQPT_SRQBASE) {
+		/* get send and receive wqe pointer */
+		h_ret = hipz_h_disable_and_get_wqe(shca->ipz_hca_handle,
+				my_qp->ipz_qp_handle, &my_qp->pf,
+				&send_wqe_p, &recv_wqe_p, 4);
+		if (h_ret != H_SUCCESS) {
+			ehca_err(&shca->ib_device, "disable_and_get_wqe() "
+				 "failed ehca_qp=%p qp_num=%x h_ret=%lli",
+				 my_qp, qp_num, h_ret);
+			return ehca2ib_return_code(h_ret);
+		}
+
+		/*
+		 * acquire lock to ensure that nobody is polling the cq which
+		 * could mean that the qmap->tail pointer is in an
+		 * inconsistent state.
+		 */
+		spin_lock_irqsave(&my_qp->send_cq->spinlock, flags);
+		ret = calc_left_cqes((u64)send_wqe_p, &my_qp->ipz_squeue,
+				&my_qp->sq_map);
+		spin_unlock_irqrestore(&my_qp->send_cq->spinlock, flags);
+		if (ret)
+			return ret;
+
+
+		spin_lock_irqsave(&my_qp->recv_cq->spinlock, flags);
+		ret = calc_left_cqes((u64)recv_wqe_p, &my_qp->ipz_rqueue,
+				&my_qp->rq_map);
+		spin_unlock_irqrestore(&my_qp->recv_cq->spinlock, flags);
+		if (ret)
+			return ret;
+	} else {
+		spin_lock_irqsave(&my_qp->send_cq->spinlock, flags);
+		my_qp->sq_map.left_to_poll = 0;
+		my_qp->sq_map.next_wqe_idx = next_index(my_qp->sq_map.tail,
+							my_qp->sq_map.entries);
+		spin_unlock_irqrestore(&my_qp->send_cq->spinlock, flags);
+
+		spin_lock_irqsave(&my_qp->recv_cq->spinlock, flags);
+		my_qp->rq_map.left_to_poll = 0;
+		my_qp->rq_map.next_wqe_idx = next_index(my_qp->rq_map.tail,
+							my_qp->rq_map.entries);
+		spin_unlock_irqrestore(&my_qp->recv_cq->spinlock, flags);
+	}
+
+	/* this assures flush cqes being generated only for pending wqes */
+	if ((my_qp->sq_map.left_to_poll == 0) &&
+				(my_qp->rq_map.left_to_poll == 0)) {
+		spin_lock_irqsave(&my_qp->send_cq->spinlock, flags);
+		ehca_add_to_err_list(my_qp, 1);
+		spin_unlock_irqrestore(&my_qp->send_cq->spinlock, flags);
+
+		if (HAS_RQ(my_qp)) {
+			spin_lock_irqsave(&my_qp->recv_cq->spinlock, flags);
+			ehca_add_to_err_list(my_qp, 0);
+			spin_unlock_irqrestore(&my_qp->recv_cq->spinlock,
+					flags);
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * internal_modify_qp with circumvention to handle aqp0 properly
+ * smi_reset2init indicates if this is an internal reset-to-init-call for
+ * smi. This flag must always be zero if called from ehca_modify_qp()!
+ * This internal func was intorduced to avoid recursion of ehca_modify_qp()!
+ */
+static int internal_modify_qp(struct ib_qp *ibqp,
+			      struct ib_qp_attr *attr,
+			      int attr_mask, int smi_reset2init)
+{
+	enum ib_qp_state qp_cur_state, qp_new_state;
+	int cnt, qp_attr_idx, ret = 0;
+	enum ib_qp_statetrans statetrans;
+	struct hcp_modify_qp_control_block *mqpcb;
+	struct ehca_qp *my_qp = container_of(ibqp, struct ehca_qp, ib_qp);
+	struct ehca_shca *shca =
+		container_of(ibqp->pd->device, struct ehca_shca, ib_device);
+	u64 update_mask;
+	u64 h_ret;
+	int bad_wqe_cnt = 0;
+	int is_user = 0;
+	int squeue_locked = 0;
+	unsigned long flags = 0;
+
+	/* do query_qp to obtain current attr values */
+	mqpcb = ehca_alloc_fw_ctrlblock(GFP_ATOMIC);
+	if (!mqpcb) {
+		ehca_err(ibqp->device, "Could not get zeroed page for mqpcb "
+			 "ehca_qp=%p qp_num=%x ", my_qp, ibqp->qp_num);
+		return -ENOMEM;
+	}
+
+	h_ret = hipz_h_query_qp(shca->ipz_hca_handle,
+				my_qp->ipz_qp_handle,
+				&my_qp->pf,
+				mqpcb, my_qp->galpas.kernel);
+	if (h_ret != H_SUCCESS) {
+		ehca_err(ibqp->device, "hipz_h_query_qp() failed "
+			 "ehca_qp=%p qp_num=%x h_ret=%lli",
+			 my_qp, ibqp->qp_num, h_ret);
+		ret = ehca2ib_return_code(h_ret);
+		goto modify_qp_exit1;
+	}
+	if (ibqp->uobject)
+		is_user = 1;
+
+	qp_cur_state = ehca2ib_qp_state(mqpcb->qp_state);
+
+	if (qp_cur_state == -EINVAL) {	/* invalid qp state */
+		ret = -EINVAL;
+		ehca_err(ibqp->device, "Invalid current ehca_qp_state=%x "
+			 "ehca_qp=%p qp_num=%x",
+			 mqpcb->qp_state, my_qp, ibqp->qp_num);
+		goto modify_qp_exit1;
+	}
+	/*
+	 * circumvention to set aqp0 initial state to init
+	 * as expected by IB spec
+	 */
+	if (smi_reset2init == 0 &&
+	    ibqp->qp_type == IB_QPT_SMI &&
+	    qp_cur_state == IB_QPS_RESET &&
+	    (attr_mask & IB_QP_STATE) &&
+	    attr->qp_state == IB_QPS_INIT) { /* RESET -> INIT */
+		struct ib_qp_attr smiqp_attr = {
+			.qp_state = IB_QPS_INIT,
+			.port_num = my_qp->init_attr.port_num,
+			.pkey_index = 0,
+			.qkey = 0
+		};
+		int smiqp_attr_mask = IB_QP_STATE | IB_QP_PORT |
+			IB_QP_PKEY_INDEX | IB_QP_QKEY;
+		int smirc = internal_modify_qp(
+			ibqp, &smiqp_attr, smiqp_attr_mask, 1);
+		if (smirc) {
+			ehca_err(ibqp->device, "SMI RESET -> INIT failed. "
+				 "ehca_modify_qp() rc=%i", smirc);
+			ret = H_PARAMETER;
+			goto modify_qp_exit1;
+		}
+		qp_cur_state = IB_QPS_INIT;
+		ehca_dbg(ibqp->device, "SMI RESET -> INIT succeeded");
+	}
+	/* is transmitted current state  equal to "real" current state */
+	if ((attr_mask & IB_QP_CUR_STATE) &&
+	    qp_cur_state != attr->cur_qp_state) {
+		ret = -EINVAL;
+		ehca_err(ibqp->device,
+			 "Invalid IB_QP_CUR_STATE attr->curr_qp_state=%x <>"
+			 " actual cur_qp_state=%x. ehca_qp=%p qp_num=%x",
+			 attr->cur_qp_state, qp_cur_state, my_qp, ibqp->qp_num);
+		goto modify_qp_exit1;
+	}
+
+	ehca_dbg(ibqp->device, "ehca_qp=%p qp_num=%x current qp_state=%x "
+		 "new qp_state=%x attribute_mask=%x",
+		 my_qp, ibqp->qp_num, qp_cur_state, attr->qp_state, attr_mask);
+
+	qp_new_state = attr_mask & IB_QP_STATE ? attr->qp_state : qp_cur_state;
+	if (!smi_reset2init &&
+	    !ib_modify_qp_is_ok(qp_cur_state, qp_new_state, ibqp->qp_type,
+				attr_mask, IB_LINK_LAYER_UNSPECIFIED)) {
+		ret = -EINVAL;
+		ehca_err(ibqp->device,
+			 "Invalid qp transition new_state=%x cur_state=%x "
+			 "ehca_qp=%p qp_num=%x attr_mask=%x", qp_new_state,
+			 qp_cur_state, my_qp, ibqp->qp_num, attr_mask);
+		goto modify_qp_exit1;
+	}
+
+	mqpcb->qp_state = ib2ehca_qp_state(qp_new_state);
+	if (mqpcb->qp_state)
+		update_mask = EHCA_BMASK_SET(MQPCB_MASK_QP_STATE, 1);
+	else {
+		ret = -EINVAL;
+		ehca_err(ibqp->device, "Invalid new qp state=%x "
+			 "ehca_qp=%p qp_num=%x",
+			 qp_new_state, my_qp, ibqp->qp_num);
+		goto modify_qp_exit1;
+	}
+
+	/* retrieve state transition struct to get req and opt attrs */
+	statetrans = get_modqp_statetrans(qp_cur_state, qp_new_state);
+	if (statetrans < 0) {
+		ret = -EINVAL;
+		ehca_err(ibqp->device, "<INVALID STATE CHANGE> qp_cur_state=%x "
+			 "new_qp_state=%x State_xsition=%x ehca_qp=%p "
+			 "qp_num=%x", qp_cur_state, qp_new_state,
+			 statetrans, my_qp, ibqp->qp_num);
+		goto modify_qp_exit1;
+	}
+
+	qp_attr_idx = ib2ehcaqptype(ibqp->qp_type);
+
+	if (qp_attr_idx < 0) {
+		ret = qp_attr_idx;
+		ehca_err(ibqp->device,
+			 "Invalid QP type=%x ehca_qp=%p qp_num=%x",
+			 ibqp->qp_type, my_qp, ibqp->qp_num);
+		goto modify_qp_exit1;
+	}
+
+	ehca_dbg(ibqp->device,
+		 "ehca_qp=%p qp_num=%x <VALID STATE CHANGE> qp_state_xsit=%x",
+		 my_qp, ibqp->qp_num, statetrans);
+
+	/* eHCA2 rev2 and higher require the SEND_GRH_FLAG to be set
+	 * in non-LL UD QPs.
+	 */
+	if ((my_qp->qp_type == IB_QPT_UD) &&
+	    (my_qp->ext_type != EQPT_LLQP) &&
+	    (statetrans == IB_QPST_INIT2RTR) &&
+	    (shca->hw_level >= 0x22)) {
+		update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SEND_GRH_FLAG, 1);
+		mqpcb->send_grh_flag = 1;
+	}
+
+	/* sqe -> rts: set purge bit of bad wqe before actual trans */
+	if ((my_qp->qp_type == IB_QPT_UD ||
+	     my_qp->qp_type == IB_QPT_GSI ||
+	     my_qp->qp_type == IB_QPT_SMI) &&
+	    statetrans == IB_QPST_SQE2RTS) {
+		/* mark next free wqe if kernel */
+		if (!ibqp->uobject) {
+			struct ehca_wqe *wqe;
+			/* lock send queue */
+			spin_lock_irqsave(&my_qp->spinlock_s, flags);
+			squeue_locked = 1;
+			/* mark next free wqe */
+			wqe = (struct ehca_wqe *)
+				ipz_qeit_get(&my_qp->ipz_squeue);
+			wqe->optype = wqe->wqef = 0xff;
+			ehca_dbg(ibqp->device, "qp_num=%x next_free_wqe=%p",
+				 ibqp->qp_num, wqe);
+		}
+		ret = prepare_sqe_rts(my_qp, shca, &bad_wqe_cnt);
+		if (ret) {
+			ehca_err(ibqp->device, "prepare_sqe_rts() failed "
+				 "ehca_qp=%p qp_num=%x ret=%i",
+				 my_qp, ibqp->qp_num, ret);
+			goto modify_qp_exit2;
+		}
+	}
+
+	/*
+	 * enable RDMA_Atomic_Control if reset->init und reliable con
+	 * this is necessary since gen2 does not provide that flag,
+	 * but pHyp requires it
+	 */
+	if (statetrans == IB_QPST_RESET2INIT &&
+	    (ibqp->qp_type == IB_QPT_RC || ibqp->qp_type == IB_QPT_UC)) {
+		mqpcb->rdma_atomic_ctrl = 3;
+		update_mask |= EHCA_BMASK_SET(MQPCB_MASK_RDMA_ATOMIC_CTRL, 1);
+	}
+	/* circ. pHyp requires #RDMA/Atomic Resp Res for UC INIT -> RTR */
+	if (statetrans == IB_QPST_INIT2RTR &&
+	    (ibqp->qp_type == IB_QPT_UC) &&
+	    !(attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)) {
+		mqpcb->rdma_nr_atomic_resp_res = 1; /* default to 1 */
+		update_mask |=
+			EHCA_BMASK_SET(MQPCB_MASK_RDMA_NR_ATOMIC_RESP_RES, 1);
+	}
+
+	if (attr_mask & IB_QP_PKEY_INDEX) {
+		if (attr->pkey_index >= 16) {
+			ret = -EINVAL;
+			ehca_err(ibqp->device, "Invalid pkey_index=%x. "
+				 "ehca_qp=%p qp_num=%x max_pkey_index=f",
+				 attr->pkey_index, my_qp, ibqp->qp_num);
+			goto modify_qp_exit2;
+		}
+		mqpcb->prim_p_key_idx = attr->pkey_index;
+		update_mask |= EHCA_BMASK_SET(MQPCB_MASK_PRIM_P_KEY_IDX, 1);
+	}
+	if (attr_mask & IB_QP_PORT) {
+		struct ehca_sport *sport;
+		struct ehca_qp *aqp1;
+		if (attr->port_num < 1 || attr->port_num > shca->num_ports) {
+			ret = -EINVAL;
+			ehca_err(ibqp->device, "Invalid port=%x. "
+				 "ehca_qp=%p qp_num=%x num_ports=%x",
+				 attr->port_num, my_qp, ibqp->qp_num,
+				 shca->num_ports);
+			goto modify_qp_exit2;
+		}
+		sport = &shca->sport[attr->port_num - 1];
+		if (!sport->ibqp_sqp[IB_QPT_GSI]) {
+			/* should not occur */
+			ret = -EFAULT;
+			ehca_err(ibqp->device, "AQP1 was not created for "
+				 "port=%x", attr->port_num);
+			goto modify_qp_exit2;
+		}
+		aqp1 = container_of(sport->ibqp_sqp[IB_QPT_GSI],
+				    struct ehca_qp, ib_qp);
+		if (ibqp->qp_type != IB_QPT_GSI &&
+		    ibqp->qp_type != IB_QPT_SMI &&
+		    aqp1->mod_qp_parm) {
+			/*
+			 * firmware will reject this modify_qp() because
+			 * port is not activated/initialized fully
+			 */
+			ret = -EFAULT;
+			ehca_warn(ibqp->device, "Couldn't modify qp port=%x: "
+				  "either port is being activated (try again) "
+				  "or cabling issue", attr->port_num);
+			goto modify_qp_exit2;
+		}
+		mqpcb->prim_phys_port = attr->port_num;
+		update_mask |= EHCA_BMASK_SET(MQPCB_MASK_PRIM_PHYS_PORT, 1);
+	}
+	if (attr_mask & IB_QP_QKEY) {
+		mqpcb->qkey = attr->qkey;
+		update_mask |= EHCA_BMASK_SET(MQPCB_MASK_QKEY, 1);
+	}
+	if (attr_mask & IB_QP_AV) {
+		mqpcb->dlid = attr->ah_attr.dlid;
+		update_mask |= EHCA_BMASK_SET(MQPCB_MASK_DLID, 1);
+		mqpcb->source_path_bits = attr->ah_attr.src_path_bits;
+		update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SOURCE_PATH_BITS, 1);
+		mqpcb->service_level = attr->ah_attr.sl;
+		update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SERVICE_LEVEL, 1);
+
+		if (ehca_calc_ipd(shca, mqpcb->prim_phys_port,
+				  attr->ah_attr.static_rate,
+				  &mqpcb->max_static_rate)) {
+			ret = -EINVAL;
+			goto modify_qp_exit2;
+		}
+		update_mask |= EHCA_BMASK_SET(MQPCB_MASK_MAX_STATIC_RATE, 1);
+
+		/*
+		 * Always supply the GRH flag, even if it's zero, to give the
+		 * hypervisor a clear "yes" or "no" instead of a "perhaps"
+		 */
+		update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SEND_GRH_FLAG, 1);
+
+		/*
+		 * only if GRH is TRUE we might consider SOURCE_GID_IDX
+		 * and DEST_GID otherwise phype will return H_ATTR_PARM!!!
+		 */
+		if (attr->ah_attr.ah_flags == IB_AH_GRH) {
+			mqpcb->send_grh_flag = 1;
+
+			mqpcb->source_gid_idx = attr->ah_attr.grh.sgid_index;
+			update_mask |=
+				EHCA_BMASK_SET(MQPCB_MASK_SOURCE_GID_IDX, 1);
+
+			for (cnt = 0; cnt < 16; cnt++)
+				mqpcb->dest_gid.byte[cnt] =
+					attr->ah_attr.grh.dgid.raw[cnt];
+
+			update_mask |= EHCA_BMASK_SET(MQPCB_MASK_DEST_GID, 1);
+			mqpcb->flow_label = attr->ah_attr.grh.flow_label;
+			update_mask |= EHCA_BMASK_SET(MQPCB_MASK_FLOW_LABEL, 1);
+			mqpcb->hop_limit = attr->ah_attr.grh.hop_limit;
+			update_mask |= EHCA_BMASK_SET(MQPCB_MASK_HOP_LIMIT, 1);
+			mqpcb->traffic_class = attr->ah_attr.grh.traffic_class;
+			update_mask |=
+				EHCA_BMASK_SET(MQPCB_MASK_TRAFFIC_CLASS, 1);
+		}
+	}
+
+	if (attr_mask & IB_QP_PATH_MTU) {
+		/* store ld(MTU) */
+		my_qp->mtu_shift = attr->path_mtu + 7;
+		mqpcb->path_mtu = attr->path_mtu;
+		update_mask |= EHCA_BMASK_SET(MQPCB_MASK_PATH_MTU, 1);
+	}
+	if (attr_mask & IB_QP_TIMEOUT) {
+		mqpcb->timeout = attr->timeout;
+		update_mask |= EHCA_BMASK_SET(MQPCB_MASK_TIMEOUT, 1);
+	}
+	if (attr_mask & IB_QP_RETRY_CNT) {
+		mqpcb->retry_count = attr->retry_cnt;
+		update_mask |= EHCA_BMASK_SET(MQPCB_MASK_RETRY_COUNT, 1);
+	}
+	if (attr_mask & IB_QP_RNR_RETRY) {
+		mqpcb->rnr_retry_count = attr->rnr_retry;
+		update_mask |= EHCA_BMASK_SET(MQPCB_MASK_RNR_RETRY_COUNT, 1);
+	}
+	if (attr_mask & IB_QP_RQ_PSN) {
+		mqpcb->receive_psn = attr->rq_psn;
+		update_mask |= EHCA_BMASK_SET(MQPCB_MASK_RECEIVE_PSN, 1);
+	}
+	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) {
+		mqpcb->rdma_nr_atomic_resp_res = attr->max_dest_rd_atomic < 3 ?
+			attr->max_dest_rd_atomic : 2;
+		update_mask |=
+			EHCA_BMASK_SET(MQPCB_MASK_RDMA_NR_ATOMIC_RESP_RES, 1);
+	}
+	if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) {
+		mqpcb->rdma_atomic_outst_dest_qp = attr->max_rd_atomic < 3 ?
+			attr->max_rd_atomic : 2;
+		update_mask |=
+			EHCA_BMASK_SET
+			(MQPCB_MASK_RDMA_ATOMIC_OUTST_DEST_QP, 1);
+	}
+	if (attr_mask & IB_QP_ALT_PATH) {
+		if (attr->alt_port_num < 1
+		    || attr->alt_port_num > shca->num_ports) {
+			ret = -EINVAL;
+			ehca_err(ibqp->device, "Invalid alt_port=%x. "
+				 "ehca_qp=%p qp_num=%x num_ports=%x",
+				 attr->alt_port_num, my_qp, ibqp->qp_num,
+				 shca->num_ports);
+			goto modify_qp_exit2;
+		}
+		mqpcb->alt_phys_port = attr->alt_port_num;
+
+		if (attr->alt_pkey_index >= 16) {
+			ret = -EINVAL;
+			ehca_err(ibqp->device, "Invalid alt_pkey_index=%x. "
+				 "ehca_qp=%p qp_num=%x max_pkey_index=f",
+				 attr->pkey_index, my_qp, ibqp->qp_num);
+			goto modify_qp_exit2;
+		}
+		mqpcb->alt_p_key_idx = attr->alt_pkey_index;
+
+		mqpcb->timeout_al = attr->alt_timeout;
+		mqpcb->dlid_al = attr->alt_ah_attr.dlid;
+		mqpcb->source_path_bits_al = attr->alt_ah_attr.src_path_bits;
+		mqpcb->service_level_al = attr->alt_ah_attr.sl;
+
+		if (ehca_calc_ipd(shca, mqpcb->alt_phys_port,
+				  attr->alt_ah_attr.static_rate,
+				  &mqpcb->max_static_rate_al)) {
+			ret = -EINVAL;
+			goto modify_qp_exit2;
+		}
+
+		/* OpenIB doesn't support alternate retry counts - copy them */
+		mqpcb->retry_count_al = mqpcb->retry_count;
+		mqpcb->rnr_retry_count_al = mqpcb->rnr_retry_count;
+
+		update_mask |= EHCA_BMASK_SET(MQPCB_MASK_ALT_PHYS_PORT, 1)
+			| EHCA_BMASK_SET(MQPCB_MASK_ALT_P_KEY_IDX, 1)
+			| EHCA_BMASK_SET(MQPCB_MASK_TIMEOUT_AL, 1)
+			| EHCA_BMASK_SET(MQPCB_MASK_DLID_AL, 1)
+			| EHCA_BMASK_SET(MQPCB_MASK_SOURCE_PATH_BITS_AL, 1)
+			| EHCA_BMASK_SET(MQPCB_MASK_SERVICE_LEVEL_AL, 1)
+			| EHCA_BMASK_SET(MQPCB_MASK_MAX_STATIC_RATE_AL, 1)
+			| EHCA_BMASK_SET(MQPCB_MASK_RETRY_COUNT_AL, 1)
+			| EHCA_BMASK_SET(MQPCB_MASK_RNR_RETRY_COUNT_AL, 1);
+
+		/*
+		 * Always supply the GRH flag, even if it's zero, to give the
+		 * hypervisor a clear "yes" or "no" instead of a "perhaps"
+		 */
+		update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SEND_GRH_FLAG_AL, 1);
+
+		/*
+		 * only if GRH is TRUE we might consider SOURCE_GID_IDX
+		 * and DEST_GID otherwise phype will return H_ATTR_PARM!!!
+		 */
+		if (attr->alt_ah_attr.ah_flags == IB_AH_GRH) {
+			mqpcb->send_grh_flag_al = 1;
+
+			for (cnt = 0; cnt < 16; cnt++)
+				mqpcb->dest_gid_al.byte[cnt] =
+					attr->alt_ah_attr.grh.dgid.raw[cnt];
+			mqpcb->source_gid_idx_al =
+				attr->alt_ah_attr.grh.sgid_index;
+			mqpcb->flow_label_al = attr->alt_ah_attr.grh.flow_label;
+			mqpcb->hop_limit_al = attr->alt_ah_attr.grh.hop_limit;
+			mqpcb->traffic_class_al =
+				attr->alt_ah_attr.grh.traffic_class;
+
+			update_mask |=
+				EHCA_BMASK_SET(MQPCB_MASK_SOURCE_GID_IDX_AL, 1)
+				| EHCA_BMASK_SET(MQPCB_MASK_DEST_GID_AL, 1)
+				| EHCA_BMASK_SET(MQPCB_MASK_FLOW_LABEL_AL, 1)
+				| EHCA_BMASK_SET(MQPCB_MASK_HOP_LIMIT_AL, 1) |
+				EHCA_BMASK_SET(MQPCB_MASK_TRAFFIC_CLASS_AL, 1);
+		}
+	}
+
+	if (attr_mask & IB_QP_MIN_RNR_TIMER) {
+		mqpcb->min_rnr_nak_timer_field = attr->min_rnr_timer;
+		update_mask |=
+			EHCA_BMASK_SET(MQPCB_MASK_MIN_RNR_NAK_TIMER_FIELD, 1);
+	}
+
+	if (attr_mask & IB_QP_SQ_PSN) {
+		mqpcb->send_psn = attr->sq_psn;
+		update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SEND_PSN, 1);
+	}
+
+	if (attr_mask & IB_QP_DEST_QPN) {
+		mqpcb->dest_qp_nr = attr->dest_qp_num;
+		update_mask |= EHCA_BMASK_SET(MQPCB_MASK_DEST_QP_NR, 1);
+	}
+
+	if (attr_mask & IB_QP_PATH_MIG_STATE) {
+		if (attr->path_mig_state != IB_MIG_REARM
+		    && attr->path_mig_state != IB_MIG_MIGRATED) {
+			ret = -EINVAL;
+			ehca_err(ibqp->device, "Invalid mig_state=%x",
+				 attr->path_mig_state);
+			goto modify_qp_exit2;
+		}
+		mqpcb->path_migration_state = attr->path_mig_state + 1;
+		if (attr->path_mig_state == IB_MIG_REARM)
+			my_qp->mig_armed = 1;
+		update_mask |=
+			EHCA_BMASK_SET(MQPCB_MASK_PATH_MIGRATION_STATE, 1);
+	}
+
+	if (attr_mask & IB_QP_CAP) {
+		mqpcb->max_nr_outst_send_wr = attr->cap.max_send_wr+1;
+		update_mask |=
+			EHCA_BMASK_SET(MQPCB_MASK_MAX_NR_OUTST_SEND_WR, 1);
+		mqpcb->max_nr_outst_recv_wr = attr->cap.max_recv_wr+1;
+		update_mask |=
+			EHCA_BMASK_SET(MQPCB_MASK_MAX_NR_OUTST_RECV_WR, 1);
+		/* no support for max_send/recv_sge yet */
+	}
+
+	if (ehca_debug_level >= 2)
+		ehca_dmp(mqpcb, 4*70, "qp_num=%x", ibqp->qp_num);
+
+	h_ret = hipz_h_modify_qp(shca->ipz_hca_handle,
+				 my_qp->ipz_qp_handle,
+				 &my_qp->pf,
+				 update_mask,
+				 mqpcb, my_qp->galpas.kernel);
+
+	if (h_ret != H_SUCCESS) {
+		ret = ehca2ib_return_code(h_ret);
+		ehca_err(ibqp->device, "hipz_h_modify_qp() failed h_ret=%lli "
+			 "ehca_qp=%p qp_num=%x", h_ret, my_qp, ibqp->qp_num);
+		goto modify_qp_exit2;
+	}
+
+	if ((my_qp->qp_type == IB_QPT_UD ||
+	     my_qp->qp_type == IB_QPT_GSI ||
+	     my_qp->qp_type == IB_QPT_SMI) &&
+	    statetrans == IB_QPST_SQE2RTS) {
+		/* doorbell to reprocessing wqes */
+		iosync(); /* serialize GAL register access */
+		hipz_update_sqa(my_qp, bad_wqe_cnt-1);
+		ehca_gen_dbg("doorbell for %x wqes", bad_wqe_cnt);
+	}
+
+	if (statetrans == IB_QPST_RESET2INIT ||
+	    statetrans == IB_QPST_INIT2INIT) {
+		mqpcb->qp_enable = 1;
+		mqpcb->qp_state = EHCA_QPS_INIT;
+		update_mask = 0;
+		update_mask = EHCA_BMASK_SET(MQPCB_MASK_QP_ENABLE, 1);
+
+		h_ret = hipz_h_modify_qp(shca->ipz_hca_handle,
+					 my_qp->ipz_qp_handle,
+					 &my_qp->pf,
+					 update_mask,
+					 mqpcb,
+					 my_qp->galpas.kernel);
+
+		if (h_ret != H_SUCCESS) {
+			ret = ehca2ib_return_code(h_ret);
+			ehca_err(ibqp->device, "ENABLE in context of "
+				 "RESET_2_INIT failed! Maybe you didn't get "
+				 "a LID h_ret=%lli ehca_qp=%p qp_num=%x",
+				 h_ret, my_qp, ibqp->qp_num);
+			goto modify_qp_exit2;
+		}
+	}
+	if ((qp_new_state == IB_QPS_ERR) && (qp_cur_state != IB_QPS_ERR)
+	    && !is_user) {
+		ret = check_for_left_cqes(my_qp, shca);
+		if (ret)
+			goto modify_qp_exit2;
+	}
+
+	if (statetrans == IB_QPST_ANY2RESET) {
+		ipz_qeit_reset(&my_qp->ipz_rqueue);
+		ipz_qeit_reset(&my_qp->ipz_squeue);
+
+		if (qp_cur_state == IB_QPS_ERR && !is_user) {
+			del_from_err_list(my_qp->send_cq, &my_qp->sq_err_node);
+
+			if (HAS_RQ(my_qp))
+				del_from_err_list(my_qp->recv_cq,
+						  &my_qp->rq_err_node);
+		}
+		if (!is_user)
+			reset_queue_map(&my_qp->sq_map);
+
+		if (HAS_RQ(my_qp) && !is_user)
+			reset_queue_map(&my_qp->rq_map);
+	}
+
+	if (attr_mask & IB_QP_QKEY)
+		my_qp->qkey = attr->qkey;
+
+modify_qp_exit2:
+	if (squeue_locked) { /* this means: sqe -> rts */
+		spin_unlock_irqrestore(&my_qp->spinlock_s, flags);
+		my_qp->sqerr_purgeflag = 1;
+	}
+
+modify_qp_exit1:
+	ehca_free_fw_ctrlblock(mqpcb);
+
+	return ret;
+}
+
+int ehca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask,
+		   struct ib_udata *udata)
+{
+	int ret = 0;
+
+	struct ehca_shca *shca = container_of(ibqp->device, struct ehca_shca,
+					      ib_device);
+	struct ehca_qp *my_qp = container_of(ibqp, struct ehca_qp, ib_qp);
+
+	/* The if-block below caches qp_attr to be modified for GSI and SMI
+	 * qps during the initialization by ib_mad. When the respective port
+	 * is activated, ie we got an event PORT_ACTIVE, we'll replay the
+	 * cached modify calls sequence, see ehca_recover_sqs() below.
+	 * Why that is required:
+	 * 1) If one port is connected, older code requires that port one
+	 *    to be connected and module option nr_ports=1 to be given by
+	 *    user, which is very inconvenient for end user.
+	 * 2) Firmware accepts modify_qp() only if respective port has become
+	 *    active. Older code had a wait loop of 30sec create_qp()/
+	 *    define_aqp1(), which is not appropriate in practice. This
+	 *    code now removes that wait loop, see define_aqp1(), and always
+	 *    reports all ports to ib_mad resp. users. Only activated ports
+	 *    will then usable for the users.
+	 */
+	if (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI) {
+		int port = my_qp->init_attr.port_num;
+		struct ehca_sport *sport = &shca->sport[port - 1];
+		unsigned long flags;
+		spin_lock_irqsave(&sport->mod_sqp_lock, flags);
+		/* cache qp_attr only during init */
+		if (my_qp->mod_qp_parm) {
+			struct ehca_mod_qp_parm *p;
+			if (my_qp->mod_qp_parm_idx >= EHCA_MOD_QP_PARM_MAX) {
+				ehca_err(&shca->ib_device,
+					 "mod_qp_parm overflow state=%x port=%x"
+					 " type=%x", attr->qp_state,
+					 my_qp->init_attr.port_num,
+					 ibqp->qp_type);
+				spin_unlock_irqrestore(&sport->mod_sqp_lock,
+						       flags);
+				return -EINVAL;
+			}
+			p = &my_qp->mod_qp_parm[my_qp->mod_qp_parm_idx];
+			p->mask = attr_mask;
+			p->attr = *attr;
+			my_qp->mod_qp_parm_idx++;
+			ehca_dbg(&shca->ib_device,
+				 "Saved qp_attr for state=%x port=%x type=%x",
+				 attr->qp_state, my_qp->init_attr.port_num,
+				 ibqp->qp_type);
+			spin_unlock_irqrestore(&sport->mod_sqp_lock, flags);
+			goto out;
+		}
+		spin_unlock_irqrestore(&sport->mod_sqp_lock, flags);
+	}
+
+	ret = internal_modify_qp(ibqp, attr, attr_mask, 0);
+
+out:
+	if ((ret == 0) && (attr_mask & IB_QP_STATE))
+		my_qp->state = attr->qp_state;
+
+	return ret;
+}
+
+void ehca_recover_sqp(struct ib_qp *sqp)
+{
+	struct ehca_qp *my_sqp = container_of(sqp, struct ehca_qp, ib_qp);
+	int port = my_sqp->init_attr.port_num;
+	struct ib_qp_attr attr;
+	struct ehca_mod_qp_parm *qp_parm;
+	int i, qp_parm_idx, ret;
+	unsigned long flags, wr_cnt;
+
+	if (!my_sqp->mod_qp_parm)
+		return;
+	ehca_dbg(sqp->device, "SQP port=%x qp_num=%x", port, sqp->qp_num);
+
+	qp_parm = my_sqp->mod_qp_parm;
+	qp_parm_idx = my_sqp->mod_qp_parm_idx;
+	for (i = 0; i < qp_parm_idx; i++) {
+		attr = qp_parm[i].attr;
+		ret = internal_modify_qp(sqp, &attr, qp_parm[i].mask, 0);
+		if (ret) {
+			ehca_err(sqp->device, "Could not modify SQP port=%x "
+				 "qp_num=%x ret=%x", port, sqp->qp_num, ret);
+			goto free_qp_parm;
+		}
+		ehca_dbg(sqp->device, "SQP port=%x qp_num=%x in state=%x",
+			 port, sqp->qp_num, attr.qp_state);
+	}
+
+	/* re-trigger posted recv wrs */
+	wr_cnt =  my_sqp->ipz_rqueue.current_q_offset /
+		my_sqp->ipz_rqueue.qe_size;
+	if (wr_cnt) {
+		spin_lock_irqsave(&my_sqp->spinlock_r, flags);
+		hipz_update_rqa(my_sqp, wr_cnt);
+		spin_unlock_irqrestore(&my_sqp->spinlock_r, flags);
+		ehca_dbg(sqp->device, "doorbell port=%x qp_num=%x wr_cnt=%lx",
+			 port, sqp->qp_num, wr_cnt);
+	}
+
+free_qp_parm:
+	kfree(qp_parm);
+	/* this prevents subsequent calls to modify_qp() to cache qp_attr */
+	my_sqp->mod_qp_parm = NULL;
+}
+
+int ehca_query_qp(struct ib_qp *qp,
+		  struct ib_qp_attr *qp_attr,
+		  int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr)
+{
+	struct ehca_qp *my_qp = container_of(qp, struct ehca_qp, ib_qp);
+	struct ehca_shca *shca = container_of(qp->device, struct ehca_shca,
+					      ib_device);
+	struct ipz_adapter_handle adapter_handle = shca->ipz_hca_handle;
+	struct hcp_modify_qp_control_block *qpcb;
+	int cnt, ret = 0;
+	u64 h_ret;
+
+	if (qp_attr_mask & QP_ATTR_QUERY_NOT_SUPPORTED) {
+		ehca_err(qp->device, "Invalid attribute mask "
+			 "ehca_qp=%p qp_num=%x qp_attr_mask=%x ",
+			 my_qp, qp->qp_num, qp_attr_mask);
+		return -EINVAL;
+	}
+
+	qpcb = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+	if (!qpcb) {
+		ehca_err(qp->device, "Out of memory for qpcb "
+			 "ehca_qp=%p qp_num=%x", my_qp, qp->qp_num);
+		return -ENOMEM;
+	}
+
+	h_ret = hipz_h_query_qp(adapter_handle,
+				my_qp->ipz_qp_handle,
+				&my_qp->pf,
+				qpcb, my_qp->galpas.kernel);
+
+	if (h_ret != H_SUCCESS) {
+		ret = ehca2ib_return_code(h_ret);
+		ehca_err(qp->device, "hipz_h_query_qp() failed "
+			 "ehca_qp=%p qp_num=%x h_ret=%lli",
+			 my_qp, qp->qp_num, h_ret);
+		goto query_qp_exit1;
+	}
+
+	qp_attr->cur_qp_state = ehca2ib_qp_state(qpcb->qp_state);
+	qp_attr->qp_state = qp_attr->cur_qp_state;
+
+	if (qp_attr->cur_qp_state == -EINVAL) {
+		ret = -EINVAL;
+		ehca_err(qp->device, "Got invalid ehca_qp_state=%x "
+			 "ehca_qp=%p qp_num=%x",
+			 qpcb->qp_state, my_qp, qp->qp_num);
+		goto query_qp_exit1;
+	}
+
+	if (qp_attr->qp_state == IB_QPS_SQD)
+		qp_attr->sq_draining = 1;
+
+	qp_attr->qkey = qpcb->qkey;
+	qp_attr->path_mtu = qpcb->path_mtu;
+	qp_attr->path_mig_state = qpcb->path_migration_state - 1;
+	qp_attr->rq_psn = qpcb->receive_psn;
+	qp_attr->sq_psn = qpcb->send_psn;
+	qp_attr->min_rnr_timer = qpcb->min_rnr_nak_timer_field;
+	qp_attr->cap.max_send_wr = qpcb->max_nr_outst_send_wr-1;
+	qp_attr->cap.max_recv_wr = qpcb->max_nr_outst_recv_wr-1;
+	/* UD_AV CIRCUMVENTION */
+	if (my_qp->qp_type == IB_QPT_UD) {
+		qp_attr->cap.max_send_sge =
+			qpcb->actual_nr_sges_in_sq_wqe - 2;
+		qp_attr->cap.max_recv_sge =
+			qpcb->actual_nr_sges_in_rq_wqe - 2;
+	} else {
+		qp_attr->cap.max_send_sge =
+			qpcb->actual_nr_sges_in_sq_wqe;
+		qp_attr->cap.max_recv_sge =
+			qpcb->actual_nr_sges_in_rq_wqe;
+	}
+
+	qp_attr->cap.max_inline_data = my_qp->sq_max_inline_data_size;
+	qp_attr->dest_qp_num = qpcb->dest_qp_nr;
+
+	qp_attr->pkey_index = qpcb->prim_p_key_idx;
+	qp_attr->port_num = qpcb->prim_phys_port;
+	qp_attr->timeout = qpcb->timeout;
+	qp_attr->retry_cnt = qpcb->retry_count;
+	qp_attr->rnr_retry = qpcb->rnr_retry_count;
+
+	qp_attr->alt_pkey_index = qpcb->alt_p_key_idx;
+	qp_attr->alt_port_num = qpcb->alt_phys_port;
+	qp_attr->alt_timeout = qpcb->timeout_al;
+
+	qp_attr->max_dest_rd_atomic = qpcb->rdma_nr_atomic_resp_res;
+	qp_attr->max_rd_atomic = qpcb->rdma_atomic_outst_dest_qp;
+
+	/* primary av */
+	qp_attr->ah_attr.sl = qpcb->service_level;
+
+	if (qpcb->send_grh_flag) {
+		qp_attr->ah_attr.ah_flags = IB_AH_GRH;
+	}
+
+	qp_attr->ah_attr.static_rate = qpcb->max_static_rate;
+	qp_attr->ah_attr.dlid = qpcb->dlid;
+	qp_attr->ah_attr.src_path_bits = qpcb->source_path_bits;
+	qp_attr->ah_attr.port_num = qp_attr->port_num;
+
+	/* primary GRH */
+	qp_attr->ah_attr.grh.traffic_class = qpcb->traffic_class;
+	qp_attr->ah_attr.grh.hop_limit = qpcb->hop_limit;
+	qp_attr->ah_attr.grh.sgid_index = qpcb->source_gid_idx;
+	qp_attr->ah_attr.grh.flow_label = qpcb->flow_label;
+
+	for (cnt = 0; cnt < 16; cnt++)
+		qp_attr->ah_attr.grh.dgid.raw[cnt] =
+			qpcb->dest_gid.byte[cnt];
+
+	/* alternate AV */
+	qp_attr->alt_ah_attr.sl = qpcb->service_level_al;
+	if (qpcb->send_grh_flag_al) {
+		qp_attr->alt_ah_attr.ah_flags = IB_AH_GRH;
+	}
+
+	qp_attr->alt_ah_attr.static_rate = qpcb->max_static_rate_al;
+	qp_attr->alt_ah_attr.dlid = qpcb->dlid_al;
+	qp_attr->alt_ah_attr.src_path_bits = qpcb->source_path_bits_al;
+
+	/* alternate GRH */
+	qp_attr->alt_ah_attr.grh.traffic_class = qpcb->traffic_class_al;
+	qp_attr->alt_ah_attr.grh.hop_limit = qpcb->hop_limit_al;
+	qp_attr->alt_ah_attr.grh.sgid_index = qpcb->source_gid_idx_al;
+	qp_attr->alt_ah_attr.grh.flow_label = qpcb->flow_label_al;
+
+	for (cnt = 0; cnt < 16; cnt++)
+		qp_attr->alt_ah_attr.grh.dgid.raw[cnt] =
+			qpcb->dest_gid_al.byte[cnt];
+
+	/* return init attributes given in ehca_create_qp */
+	if (qp_init_attr)
+		*qp_init_attr = my_qp->init_attr;
+
+	if (ehca_debug_level >= 2)
+		ehca_dmp(qpcb, 4*70, "qp_num=%x", qp->qp_num);
+
+query_qp_exit1:
+	ehca_free_fw_ctrlblock(qpcb);
+
+	return ret;
+}
+
+int ehca_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
+		    enum ib_srq_attr_mask attr_mask, struct ib_udata *udata)
+{
+	struct ehca_qp *my_qp =
+		container_of(ibsrq, struct ehca_qp, ib_srq);
+	struct ehca_shca *shca =
+		container_of(ibsrq->pd->device, struct ehca_shca, ib_device);
+	struct hcp_modify_qp_control_block *mqpcb;
+	u64 update_mask;
+	u64 h_ret;
+	int ret = 0;
+
+	mqpcb = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+	if (!mqpcb) {
+		ehca_err(ibsrq->device, "Could not get zeroed page for mqpcb "
+			 "ehca_qp=%p qp_num=%x ", my_qp, my_qp->real_qp_num);
+		return -ENOMEM;
+	}
+
+	update_mask = 0;
+	if (attr_mask & IB_SRQ_LIMIT) {
+		attr_mask &= ~IB_SRQ_LIMIT;
+		update_mask |=
+			EHCA_BMASK_SET(MQPCB_MASK_CURR_SRQ_LIMIT, 1)
+			| EHCA_BMASK_SET(MQPCB_MASK_QP_AFF_ASYN_EV_LOG_REG, 1);
+		mqpcb->curr_srq_limit = attr->srq_limit;
+		mqpcb->qp_aff_asyn_ev_log_reg =
+			EHCA_BMASK_SET(QPX_AAELOG_RESET_SRQ_LIMIT, 1);
+	}
+
+	/* by now, all bits in attr_mask should have been cleared */
+	if (attr_mask) {
+		ehca_err(ibsrq->device, "invalid attribute mask bits set  "
+			 "attr_mask=%x", attr_mask);
+		ret = -EINVAL;
+		goto modify_srq_exit0;
+	}
+
+	if (ehca_debug_level >= 2)
+		ehca_dmp(mqpcb, 4*70, "qp_num=%x", my_qp->real_qp_num);
+
+	h_ret = hipz_h_modify_qp(shca->ipz_hca_handle, my_qp->ipz_qp_handle,
+				 NULL, update_mask, mqpcb,
+				 my_qp->galpas.kernel);
+
+	if (h_ret != H_SUCCESS) {
+		ret = ehca2ib_return_code(h_ret);
+		ehca_err(ibsrq->device, "hipz_h_modify_qp() failed h_ret=%lli "
+			 "ehca_qp=%p qp_num=%x",
+			 h_ret, my_qp, my_qp->real_qp_num);
+	}
+
+modify_srq_exit0:
+	ehca_free_fw_ctrlblock(mqpcb);
+
+	return ret;
+}
+
+int ehca_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr)
+{
+	struct ehca_qp *my_qp = container_of(srq, struct ehca_qp, ib_srq);
+	struct ehca_shca *shca = container_of(srq->device, struct ehca_shca,
+					      ib_device);
+	struct ipz_adapter_handle adapter_handle = shca->ipz_hca_handle;
+	struct hcp_modify_qp_control_block *qpcb;
+	int ret = 0;
+	u64 h_ret;
+
+	qpcb = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+	if (!qpcb) {
+		ehca_err(srq->device, "Out of memory for qpcb "
+			 "ehca_qp=%p qp_num=%x", my_qp, my_qp->real_qp_num);
+		return -ENOMEM;
+	}
+
+	h_ret = hipz_h_query_qp(adapter_handle, my_qp->ipz_qp_handle,
+				NULL, qpcb, my_qp->galpas.kernel);
+
+	if (h_ret != H_SUCCESS) {
+		ret = ehca2ib_return_code(h_ret);
+		ehca_err(srq->device, "hipz_h_query_qp() failed "
+			 "ehca_qp=%p qp_num=%x h_ret=%lli",
+			 my_qp, my_qp->real_qp_num, h_ret);
+		goto query_srq_exit1;
+	}
+
+	srq_attr->max_wr = qpcb->max_nr_outst_recv_wr - 1;
+	srq_attr->max_sge = 3;
+	srq_attr->srq_limit = qpcb->curr_srq_limit;
+
+	if (ehca_debug_level >= 2)
+		ehca_dmp(qpcb, 4*70, "qp_num=%x", my_qp->real_qp_num);
+
+query_srq_exit1:
+	ehca_free_fw_ctrlblock(qpcb);
+
+	return ret;
+}
+
+static int internal_destroy_qp(struct ib_device *dev, struct ehca_qp *my_qp,
+			       struct ib_uobject *uobject)
+{
+	struct ehca_shca *shca = container_of(dev, struct ehca_shca, ib_device);
+	struct ehca_pd *my_pd = container_of(my_qp->ib_qp.pd, struct ehca_pd,
+					     ib_pd);
+	struct ehca_sport *sport = &shca->sport[my_qp->init_attr.port_num - 1];
+	u32 qp_num = my_qp->real_qp_num;
+	int ret;
+	u64 h_ret;
+	u8 port_num;
+	int is_user = 0;
+	enum ib_qp_type	qp_type;
+	unsigned long flags;
+
+	if (uobject) {
+		is_user = 1;
+		if (my_qp->mm_count_galpa ||
+		    my_qp->mm_count_rqueue || my_qp->mm_count_squeue) {
+			ehca_err(dev, "Resources still referenced in "
+				 "user space qp_num=%x", qp_num);
+			return -EINVAL;
+		}
+	}
+
+	if (my_qp->send_cq) {
+		ret = ehca_cq_unassign_qp(my_qp->send_cq, qp_num);
+		if (ret) {
+			ehca_err(dev, "Couldn't unassign qp from "
+				 "send_cq ret=%i qp_num=%x cq_num=%x", ret,
+				 qp_num, my_qp->send_cq->cq_number);
+			return ret;
+		}
+	}
+
+	write_lock_irqsave(&ehca_qp_idr_lock, flags);
+	idr_remove(&ehca_qp_idr, my_qp->token);
+	write_unlock_irqrestore(&ehca_qp_idr_lock, flags);
+
+	/*
+	 * SRQs will never get into an error list and do not have a recv_cq,
+	 * so we need to skip them here.
+	 */
+	if (HAS_RQ(my_qp) && !IS_SRQ(my_qp) && !is_user)
+		del_from_err_list(my_qp->recv_cq, &my_qp->rq_err_node);
+
+	if (HAS_SQ(my_qp) && !is_user)
+		del_from_err_list(my_qp->send_cq, &my_qp->sq_err_node);
+
+	/* now wait until all pending events have completed */
+	wait_event(my_qp->wait_completion, !atomic_read(&my_qp->nr_events));
+
+	h_ret = hipz_h_destroy_qp(shca->ipz_hca_handle, my_qp);
+	if (h_ret != H_SUCCESS) {
+		ehca_err(dev, "hipz_h_destroy_qp() failed h_ret=%lli "
+			 "ehca_qp=%p qp_num=%x", h_ret, my_qp, qp_num);
+		return ehca2ib_return_code(h_ret);
+	}
+
+	port_num = my_qp->init_attr.port_num;
+	qp_type  = my_qp->init_attr.qp_type;
+
+	if (qp_type == IB_QPT_SMI || qp_type == IB_QPT_GSI) {
+		spin_lock_irqsave(&sport->mod_sqp_lock, flags);
+		kfree(my_qp->mod_qp_parm);
+		my_qp->mod_qp_parm = NULL;
+		shca->sport[port_num - 1].ibqp_sqp[qp_type] = NULL;
+		spin_unlock_irqrestore(&sport->mod_sqp_lock, flags);
+	}
+
+	/* no support for IB_QPT_SMI yet */
+	if (qp_type == IB_QPT_GSI) {
+		struct ib_event event;
+		ehca_info(dev, "device %s: port %x is inactive.",
+				shca->ib_device.name, port_num);
+		event.device = &shca->ib_device;
+		event.event = IB_EVENT_PORT_ERR;
+		event.element.port_num = port_num;
+		shca->sport[port_num - 1].port_state = IB_PORT_DOWN;
+		ib_dispatch_event(&event);
+	}
+
+	if (HAS_RQ(my_qp)) {
+		ipz_queue_dtor(my_pd, &my_qp->ipz_rqueue);
+		if (!is_user)
+			vfree(my_qp->rq_map.map);
+	}
+	if (HAS_SQ(my_qp)) {
+		ipz_queue_dtor(my_pd, &my_qp->ipz_squeue);
+		if (!is_user)
+			vfree(my_qp->sq_map.map);
+	}
+	kmem_cache_free(qp_cache, my_qp);
+	atomic_dec(&shca->num_qps);
+	return 0;
+}
+
+int ehca_destroy_qp(struct ib_qp *qp)
+{
+	return internal_destroy_qp(qp->device,
+				   container_of(qp, struct ehca_qp, ib_qp),
+				   qp->uobject);
+}
+
+int ehca_destroy_srq(struct ib_srq *srq)
+{
+	return internal_destroy_qp(srq->device,
+				   container_of(srq, struct ehca_qp, ib_srq),
+				   srq->uobject);
+}
+
+int ehca_init_qp_cache(void)
+{
+	qp_cache = kmem_cache_create("ehca_cache_qp",
+				     sizeof(struct ehca_qp), 0,
+				     SLAB_HWCACHE_ALIGN,
+				     NULL);
+	if (!qp_cache)
+		return -ENOMEM;
+	return 0;
+}
+
+void ehca_cleanup_qp_cache(void)
+{
+	if (qp_cache)
+		kmem_cache_destroy(qp_cache);
+}
diff --git a/drivers/infiniband/hw/ehca/ehca_reqs.c b/drivers/infiniband/hw/ehca/ehca_reqs.c
new file mode 100644
index 0000000..47f9498
--- /dev/null
+++ b/drivers/infiniband/hw/ehca/ehca_reqs.c
@@ -0,0 +1,953 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  post_send/recv, poll_cq, req_notify
+ *
+ *  Authors: Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ *           Waleri Fomin <fomin@de.ibm.com>
+ *           Joachim Fenkes <fenkes@de.ibm.com>
+ *           Reinhard Ernst <rernst@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#include "ehca_classes.h"
+#include "ehca_tools.h"
+#include "ehca_qes.h"
+#include "ehca_iverbs.h"
+#include "hcp_if.h"
+#include "hipz_fns.h"
+
+/* in RC traffic, insert an empty RDMA READ every this many packets */
+#define ACK_CIRC_THRESHOLD 2000000
+
+static u64 replace_wr_id(u64 wr_id, u16 idx)
+{
+	u64 ret;
+
+	ret = wr_id & ~QMAP_IDX_MASK;
+	ret |= idx & QMAP_IDX_MASK;
+
+	return ret;
+}
+
+static u16 get_app_wr_id(u64 wr_id)
+{
+	return wr_id & QMAP_IDX_MASK;
+}
+
+static inline int ehca_write_rwqe(struct ipz_queue *ipz_rqueue,
+				  struct ehca_wqe *wqe_p,
+				  struct ib_recv_wr *recv_wr,
+				  u32 rq_map_idx)
+{
+	u8 cnt_ds;
+	if (unlikely((recv_wr->num_sge < 0) ||
+		     (recv_wr->num_sge > ipz_rqueue->act_nr_of_sg))) {
+		ehca_gen_err("Invalid number of WQE SGE. "
+			 "num_sqe=%x max_nr_of_sg=%x",
+			 recv_wr->num_sge, ipz_rqueue->act_nr_of_sg);
+		return -EINVAL; /* invalid SG list length */
+	}
+
+	/* clear wqe header until sglist */
+	memset(wqe_p, 0, offsetof(struct ehca_wqe, u.ud_av.sg_list));
+
+	wqe_p->work_request_id = replace_wr_id(recv_wr->wr_id, rq_map_idx);
+	wqe_p->nr_of_data_seg = recv_wr->num_sge;
+
+	for (cnt_ds = 0; cnt_ds < recv_wr->num_sge; cnt_ds++) {
+		wqe_p->u.all_rcv.sg_list[cnt_ds].vaddr =
+			recv_wr->sg_list[cnt_ds].addr;
+		wqe_p->u.all_rcv.sg_list[cnt_ds].lkey =
+			recv_wr->sg_list[cnt_ds].lkey;
+		wqe_p->u.all_rcv.sg_list[cnt_ds].length =
+			recv_wr->sg_list[cnt_ds].length;
+	}
+
+	if (ehca_debug_level >= 3) {
+		ehca_gen_dbg("RECEIVE WQE written into ipz_rqueue=%p",
+			     ipz_rqueue);
+		ehca_dmp(wqe_p, 16*(6 + wqe_p->nr_of_data_seg), "recv wqe");
+	}
+
+	return 0;
+}
+
+#if defined(DEBUG_GSI_SEND_WR)
+
+/* need ib_mad struct */
+#include <rdma/ib_mad.h>
+
+static void trace_send_wr_ud(const struct ib_send_wr *send_wr)
+{
+	int idx;
+	int j;
+	while (send_wr) {
+		struct ib_mad_hdr *mad_hdr = send_wr->wr.ud.mad_hdr;
+		struct ib_sge *sge = send_wr->sg_list;
+		ehca_gen_dbg("send_wr#%x wr_id=%lx num_sge=%x "
+			     "send_flags=%x opcode=%x", idx, send_wr->wr_id,
+			     send_wr->num_sge, send_wr->send_flags,
+			     send_wr->opcode);
+		if (mad_hdr) {
+			ehca_gen_dbg("send_wr#%x mad_hdr base_version=%x "
+				     "mgmt_class=%x class_version=%x method=%x "
+				     "status=%x class_specific=%x tid=%lx "
+				     "attr_id=%x resv=%x attr_mod=%x",
+				     idx, mad_hdr->base_version,
+				     mad_hdr->mgmt_class,
+				     mad_hdr->class_version, mad_hdr->method,
+				     mad_hdr->status, mad_hdr->class_specific,
+				     mad_hdr->tid, mad_hdr->attr_id,
+				     mad_hdr->resv,
+				     mad_hdr->attr_mod);
+		}
+		for (j = 0; j < send_wr->num_sge; j++) {
+			u8 *data = __va(sge->addr);
+			ehca_gen_dbg("send_wr#%x sge#%x addr=%p length=%x "
+				     "lkey=%x",
+				     idx, j, data, sge->length, sge->lkey);
+			/* assume length is n*16 */
+			ehca_dmp(data, sge->length, "send_wr#%x sge#%x",
+				 idx, j);
+			sge++;
+		} /* eof for j */
+		idx++;
+		send_wr = send_wr->next;
+	} /* eof while send_wr */
+}
+
+#endif /* DEBUG_GSI_SEND_WR */
+
+static inline int ehca_write_swqe(struct ehca_qp *qp,
+				  struct ehca_wqe *wqe_p,
+				  const struct ib_send_wr *send_wr,
+				  u32 sq_map_idx,
+				  int hidden)
+{
+	u32 idx;
+	u64 dma_length;
+	struct ehca_av *my_av;
+	u32 remote_qkey = send_wr->wr.ud.remote_qkey;
+	struct ehca_qmap_entry *qmap_entry = &qp->sq_map.map[sq_map_idx];
+
+	if (unlikely((send_wr->num_sge < 0) ||
+		     (send_wr->num_sge > qp->ipz_squeue.act_nr_of_sg))) {
+		ehca_gen_err("Invalid number of WQE SGE. "
+			 "num_sqe=%x max_nr_of_sg=%x",
+			 send_wr->num_sge, qp->ipz_squeue.act_nr_of_sg);
+		return -EINVAL; /* invalid SG list length */
+	}
+
+	/* clear wqe header until sglist */
+	memset(wqe_p, 0, offsetof(struct ehca_wqe, u.ud_av.sg_list));
+
+	wqe_p->work_request_id = replace_wr_id(send_wr->wr_id, sq_map_idx);
+
+	qmap_entry->app_wr_id = get_app_wr_id(send_wr->wr_id);
+	qmap_entry->reported = 0;
+	qmap_entry->cqe_req = 0;
+
+	switch (send_wr->opcode) {
+	case IB_WR_SEND:
+	case IB_WR_SEND_WITH_IMM:
+		wqe_p->optype = WQE_OPTYPE_SEND;
+		break;
+	case IB_WR_RDMA_WRITE:
+	case IB_WR_RDMA_WRITE_WITH_IMM:
+		wqe_p->optype = WQE_OPTYPE_RDMAWRITE;
+		break;
+	case IB_WR_RDMA_READ:
+		wqe_p->optype = WQE_OPTYPE_RDMAREAD;
+		break;
+	default:
+		ehca_gen_err("Invalid opcode=%x", send_wr->opcode);
+		return -EINVAL; /* invalid opcode */
+	}
+
+	wqe_p->wqef = (send_wr->opcode) & WQEF_HIGH_NIBBLE;
+
+	wqe_p->wr_flag = 0;
+
+	if ((send_wr->send_flags & IB_SEND_SIGNALED ||
+	    qp->init_attr.sq_sig_type == IB_SIGNAL_ALL_WR)
+	    && !hidden) {
+		wqe_p->wr_flag |= WQE_WRFLAG_REQ_SIGNAL_COM;
+		qmap_entry->cqe_req = 1;
+	}
+
+	if (send_wr->opcode == IB_WR_SEND_WITH_IMM ||
+	    send_wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM) {
+		/* this might not work as long as HW does not support it */
+		wqe_p->immediate_data = be32_to_cpu(send_wr->ex.imm_data);
+		wqe_p->wr_flag |= WQE_WRFLAG_IMM_DATA_PRESENT;
+	}
+
+	wqe_p->nr_of_data_seg = send_wr->num_sge;
+
+	switch (qp->qp_type) {
+	case IB_QPT_SMI:
+	case IB_QPT_GSI:
+		/* no break is intential here */
+	case IB_QPT_UD:
+		/* IB 1.2 spec C10-15 compliance */
+		if (send_wr->wr.ud.remote_qkey & 0x80000000)
+			remote_qkey = qp->qkey;
+
+		wqe_p->destination_qp_number = send_wr->wr.ud.remote_qpn << 8;
+		wqe_p->local_ee_context_qkey = remote_qkey;
+		if (unlikely(!send_wr->wr.ud.ah)) {
+			ehca_gen_err("wr.ud.ah is NULL. qp=%p", qp);
+			return -EINVAL;
+		}
+		if (unlikely(send_wr->wr.ud.remote_qpn == 0)) {
+			ehca_gen_err("dest QP# is 0. qp=%x", qp->real_qp_num);
+			return -EINVAL;
+		}
+		my_av = container_of(send_wr->wr.ud.ah, struct ehca_av, ib_ah);
+		wqe_p->u.ud_av.ud_av = my_av->av;
+
+		/*
+		 * omitted check of IB_SEND_INLINE
+		 * since HW does not support it
+		 */
+		for (idx = 0; idx < send_wr->num_sge; idx++) {
+			wqe_p->u.ud_av.sg_list[idx].vaddr =
+				send_wr->sg_list[idx].addr;
+			wqe_p->u.ud_av.sg_list[idx].lkey =
+				send_wr->sg_list[idx].lkey;
+			wqe_p->u.ud_av.sg_list[idx].length =
+				send_wr->sg_list[idx].length;
+		} /* eof for idx */
+		if (qp->qp_type == IB_QPT_SMI ||
+		    qp->qp_type == IB_QPT_GSI)
+			wqe_p->u.ud_av.ud_av.pmtu = 1;
+		if (qp->qp_type == IB_QPT_GSI) {
+			wqe_p->pkeyi = send_wr->wr.ud.pkey_index;
+#ifdef DEBUG_GSI_SEND_WR
+			trace_send_wr_ud(send_wr);
+#endif /* DEBUG_GSI_SEND_WR */
+		}
+		break;
+
+	case IB_QPT_UC:
+		if (send_wr->send_flags & IB_SEND_FENCE)
+			wqe_p->wr_flag |= WQE_WRFLAG_FENCE;
+		/* no break is intentional here */
+	case IB_QPT_RC:
+		/* TODO: atomic not implemented */
+		wqe_p->u.nud.remote_virtual_address =
+			send_wr->wr.rdma.remote_addr;
+		wqe_p->u.nud.rkey = send_wr->wr.rdma.rkey;
+
+		/*
+		 * omitted checking of IB_SEND_INLINE
+		 * since HW does not support it
+		 */
+		dma_length = 0;
+		for (idx = 0; idx < send_wr->num_sge; idx++) {
+			wqe_p->u.nud.sg_list[idx].vaddr =
+				send_wr->sg_list[idx].addr;
+			wqe_p->u.nud.sg_list[idx].lkey =
+				send_wr->sg_list[idx].lkey;
+			wqe_p->u.nud.sg_list[idx].length =
+				send_wr->sg_list[idx].length;
+			dma_length += send_wr->sg_list[idx].length;
+		} /* eof idx */
+		wqe_p->u.nud.atomic_1st_op_dma_len = dma_length;
+
+		/* unsolicited ack circumvention */
+		if (send_wr->opcode == IB_WR_RDMA_READ) {
+			/* on RDMA read, switch on and reset counters */
+			qp->message_count = qp->packet_count = 0;
+			qp->unsol_ack_circ = 1;
+		} else
+			/* else estimate #packets */
+			qp->packet_count += (dma_length >> qp->mtu_shift) + 1;
+
+		break;
+
+	default:
+		ehca_gen_err("Invalid qptype=%x", qp->qp_type);
+		return -EINVAL;
+	}
+
+	if (ehca_debug_level >= 3) {
+		ehca_gen_dbg("SEND WQE written into queue qp=%p ", qp);
+		ehca_dmp( wqe_p, 16*(6 + wqe_p->nr_of_data_seg), "send wqe");
+	}
+	return 0;
+}
+
+/* map_ib_wc_status converts raw cqe_status to ib_wc_status */
+static inline void map_ib_wc_status(u32 cqe_status,
+				    enum ib_wc_status *wc_status)
+{
+	if (unlikely(cqe_status & WC_STATUS_ERROR_BIT)) {
+		switch (cqe_status & 0x3F) {
+		case 0x01:
+		case 0x21:
+			*wc_status = IB_WC_LOC_LEN_ERR;
+			break;
+		case 0x02:
+		case 0x22:
+			*wc_status = IB_WC_LOC_QP_OP_ERR;
+			break;
+		case 0x03:
+		case 0x23:
+			*wc_status = IB_WC_LOC_EEC_OP_ERR;
+			break;
+		case 0x04:
+		case 0x24:
+			*wc_status = IB_WC_LOC_PROT_ERR;
+			break;
+		case 0x05:
+		case 0x25:
+			*wc_status = IB_WC_WR_FLUSH_ERR;
+			break;
+		case 0x06:
+			*wc_status = IB_WC_MW_BIND_ERR;
+			break;
+		case 0x07: /* remote error - look into bits 20:24 */
+			switch ((cqe_status
+				 & WC_STATUS_REMOTE_ERROR_FLAGS) >> 11) {
+			case 0x0:
+				/*
+				 * PSN Sequence Error!
+				 * couldn't find a matching status!
+				 */
+				*wc_status = IB_WC_GENERAL_ERR;
+				break;
+			case 0x1:
+				*wc_status = IB_WC_REM_INV_REQ_ERR;
+				break;
+			case 0x2:
+				*wc_status = IB_WC_REM_ACCESS_ERR;
+				break;
+			case 0x3:
+				*wc_status = IB_WC_REM_OP_ERR;
+				break;
+			case 0x4:
+				*wc_status = IB_WC_REM_INV_RD_REQ_ERR;
+				break;
+			}
+			break;
+		case 0x08:
+			*wc_status = IB_WC_RETRY_EXC_ERR;
+			break;
+		case 0x09:
+			*wc_status = IB_WC_RNR_RETRY_EXC_ERR;
+			break;
+		case 0x0A:
+		case 0x2D:
+			*wc_status = IB_WC_REM_ABORT_ERR;
+			break;
+		case 0x0B:
+		case 0x2E:
+			*wc_status = IB_WC_INV_EECN_ERR;
+			break;
+		case 0x0C:
+		case 0x2F:
+			*wc_status = IB_WC_INV_EEC_STATE_ERR;
+			break;
+		case 0x0D:
+			*wc_status = IB_WC_BAD_RESP_ERR;
+			break;
+		case 0x10:
+			/* WQE purged */
+			*wc_status = IB_WC_WR_FLUSH_ERR;
+			break;
+		default:
+			*wc_status = IB_WC_FATAL_ERR;
+
+		}
+	} else
+		*wc_status = IB_WC_SUCCESS;
+}
+
+static inline int post_one_send(struct ehca_qp *my_qp,
+			 struct ib_send_wr *cur_send_wr,
+			 int hidden)
+{
+	struct ehca_wqe *wqe_p;
+	int ret;
+	u32 sq_map_idx;
+	u64 start_offset = my_qp->ipz_squeue.current_q_offset;
+
+	/* get pointer next to free WQE */
+	wqe_p = ipz_qeit_get_inc(&my_qp->ipz_squeue);
+	if (unlikely(!wqe_p)) {
+		/* too many posted work requests: queue overflow */
+		ehca_err(my_qp->ib_qp.device, "Too many posted WQEs "
+			 "qp_num=%x", my_qp->ib_qp.qp_num);
+		return -ENOMEM;
+	}
+
+	/*
+	 * Get the index of the WQE in the send queue. The same index is used
+	 * for writing into the sq_map.
+	 */
+	sq_map_idx = start_offset / my_qp->ipz_squeue.qe_size;
+
+	/* write a SEND WQE into the QUEUE */
+	ret = ehca_write_swqe(my_qp, wqe_p, cur_send_wr, sq_map_idx, hidden);
+	/*
+	 * if something failed,
+	 * reset the free entry pointer to the start value
+	 */
+	if (unlikely(ret)) {
+		my_qp->ipz_squeue.current_q_offset = start_offset;
+		ehca_err(my_qp->ib_qp.device, "Could not write WQE "
+			 "qp_num=%x", my_qp->ib_qp.qp_num);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+int ehca_post_send(struct ib_qp *qp,
+		   struct ib_send_wr *send_wr,
+		   struct ib_send_wr **bad_send_wr)
+{
+	struct ehca_qp *my_qp = container_of(qp, struct ehca_qp, ib_qp);
+	int wqe_cnt = 0;
+	int ret = 0;
+	unsigned long flags;
+
+	/* Reject WR if QP is in RESET, INIT or RTR state */
+	if (unlikely(my_qp->state < IB_QPS_RTS)) {
+		ehca_err(qp->device, "Invalid QP state  qp_state=%d qpn=%x",
+			 my_qp->state, qp->qp_num);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* LOCK the QUEUE */
+	spin_lock_irqsave(&my_qp->spinlock_s, flags);
+
+	/* Send an empty extra RDMA read if:
+	 *  1) there has been an RDMA read on this connection before
+	 *  2) no RDMA read occurred for ACK_CIRC_THRESHOLD link packets
+	 *  3) we can be sure that any previous extra RDMA read has been
+	 *     processed so we don't overflow the SQ
+	 */
+	if (unlikely(my_qp->unsol_ack_circ &&
+		     my_qp->packet_count > ACK_CIRC_THRESHOLD &&
+		     my_qp->message_count > my_qp->init_attr.cap.max_send_wr)) {
+		/* insert an empty RDMA READ to fix up the remote QP state */
+		struct ib_send_wr circ_wr;
+		memset(&circ_wr, 0, sizeof(circ_wr));
+		circ_wr.opcode = IB_WR_RDMA_READ;
+		post_one_send(my_qp, &circ_wr, 1); /* ignore retcode */
+		wqe_cnt++;
+		ehca_dbg(qp->device, "posted circ wr  qp_num=%x", qp->qp_num);
+		my_qp->message_count = my_qp->packet_count = 0;
+	}
+
+	/* loop processes list of send reqs */
+	while (send_wr) {
+		ret = post_one_send(my_qp, send_wr, 0);
+		if (unlikely(ret)) {
+			goto post_send_exit0;
+		}
+		wqe_cnt++;
+		send_wr = send_wr->next;
+	}
+
+post_send_exit0:
+	iosync(); /* serialize GAL register access */
+	hipz_update_sqa(my_qp, wqe_cnt);
+	if (unlikely(ret || ehca_debug_level >= 2))
+		ehca_dbg(qp->device, "ehca_qp=%p qp_num=%x wqe_cnt=%d ret=%i",
+			 my_qp, qp->qp_num, wqe_cnt, ret);
+	my_qp->message_count += wqe_cnt;
+	spin_unlock_irqrestore(&my_qp->spinlock_s, flags);
+
+out:
+	if (ret)
+		*bad_send_wr = send_wr;
+	return ret;
+}
+
+static int internal_post_recv(struct ehca_qp *my_qp,
+			      struct ib_device *dev,
+			      struct ib_recv_wr *recv_wr,
+			      struct ib_recv_wr **bad_recv_wr)
+{
+	struct ehca_wqe *wqe_p;
+	int wqe_cnt = 0;
+	int ret = 0;
+	u32 rq_map_idx;
+	unsigned long flags;
+	struct ehca_qmap_entry *qmap_entry;
+
+	if (unlikely(!HAS_RQ(my_qp))) {
+		ehca_err(dev, "QP has no RQ  ehca_qp=%p qp_num=%x ext_type=%d",
+			 my_qp, my_qp->real_qp_num, my_qp->ext_type);
+		ret = -ENODEV;
+		goto out;
+	}
+
+	/* LOCK the QUEUE */
+	spin_lock_irqsave(&my_qp->spinlock_r, flags);
+
+	/* loop processes list of recv reqs */
+	while (recv_wr) {
+		u64 start_offset = my_qp->ipz_rqueue.current_q_offset;
+		/* get pointer next to free WQE */
+		wqe_p = ipz_qeit_get_inc(&my_qp->ipz_rqueue);
+		if (unlikely(!wqe_p)) {
+			/* too many posted work requests: queue overflow */
+			ret = -ENOMEM;
+			ehca_err(dev, "Too many posted WQEs "
+				"qp_num=%x", my_qp->real_qp_num);
+			goto post_recv_exit0;
+		}
+		/*
+		 * Get the index of the WQE in the recv queue. The same index
+		 * is used for writing into the rq_map.
+		 */
+		rq_map_idx = start_offset / my_qp->ipz_rqueue.qe_size;
+
+		/* write a RECV WQE into the QUEUE */
+		ret = ehca_write_rwqe(&my_qp->ipz_rqueue, wqe_p, recv_wr,
+				rq_map_idx);
+		/*
+		 * if something failed,
+		 * reset the free entry pointer to the start value
+		 */
+		if (unlikely(ret)) {
+			my_qp->ipz_rqueue.current_q_offset = start_offset;
+			ret = -EINVAL;
+			ehca_err(dev, "Could not write WQE "
+				"qp_num=%x", my_qp->real_qp_num);
+			goto post_recv_exit0;
+		}
+
+		qmap_entry = &my_qp->rq_map.map[rq_map_idx];
+		qmap_entry->app_wr_id = get_app_wr_id(recv_wr->wr_id);
+		qmap_entry->reported = 0;
+		qmap_entry->cqe_req = 1;
+
+		wqe_cnt++;
+		recv_wr = recv_wr->next;
+	} /* eof for recv_wr */
+
+post_recv_exit0:
+	iosync(); /* serialize GAL register access */
+	hipz_update_rqa(my_qp, wqe_cnt);
+	if (unlikely(ret || ehca_debug_level >= 2))
+	    ehca_dbg(dev, "ehca_qp=%p qp_num=%x wqe_cnt=%d ret=%i",
+		     my_qp, my_qp->real_qp_num, wqe_cnt, ret);
+	spin_unlock_irqrestore(&my_qp->spinlock_r, flags);
+
+out:
+	if (ret)
+		*bad_recv_wr = recv_wr;
+
+	return ret;
+}
+
+int ehca_post_recv(struct ib_qp *qp,
+		   struct ib_recv_wr *recv_wr,
+		   struct ib_recv_wr **bad_recv_wr)
+{
+	struct ehca_qp *my_qp = container_of(qp, struct ehca_qp, ib_qp);
+
+	/* Reject WR if QP is in RESET state */
+	if (unlikely(my_qp->state == IB_QPS_RESET)) {
+		ehca_err(qp->device, "Invalid QP state  qp_state=%d qpn=%x",
+			 my_qp->state, qp->qp_num);
+		*bad_recv_wr = recv_wr;
+		return -EINVAL;
+	}
+
+	return internal_post_recv(my_qp, qp->device, recv_wr, bad_recv_wr);
+}
+
+int ehca_post_srq_recv(struct ib_srq *srq,
+		       struct ib_recv_wr *recv_wr,
+		       struct ib_recv_wr **bad_recv_wr)
+{
+	return internal_post_recv(container_of(srq, struct ehca_qp, ib_srq),
+				  srq->device, recv_wr, bad_recv_wr);
+}
+
+/*
+ * ib_wc_opcode table converts ehca wc opcode to ib
+ * Since we use zero to indicate invalid opcode, the actual ib opcode must
+ * be decremented!!!
+ */
+static const u8 ib_wc_opcode[255] = {
+	[0x01] = IB_WC_RECV+1,
+	[0x02] = IB_WC_RECV_RDMA_WITH_IMM+1,
+	[0x04] = IB_WC_BIND_MW+1,
+	[0x08] = IB_WC_FETCH_ADD+1,
+	[0x10] = IB_WC_COMP_SWAP+1,
+	[0x20] = IB_WC_RDMA_WRITE+1,
+	[0x40] = IB_WC_RDMA_READ+1,
+	[0x80] = IB_WC_SEND+1
+};
+
+/* internal function to poll one entry of cq */
+static inline int ehca_poll_cq_one(struct ib_cq *cq, struct ib_wc *wc)
+{
+	int ret = 0, qmap_tail_idx;
+	struct ehca_cq *my_cq = container_of(cq, struct ehca_cq, ib_cq);
+	struct ehca_cqe *cqe;
+	struct ehca_qp *my_qp;
+	struct ehca_qmap_entry *qmap_entry;
+	struct ehca_queue_map *qmap;
+	int cqe_count = 0, is_error;
+
+repoll:
+	cqe = (struct ehca_cqe *)
+		ipz_qeit_get_inc_valid(&my_cq->ipz_queue);
+	if (!cqe) {
+		ret = -EAGAIN;
+		if (ehca_debug_level >= 3)
+			ehca_dbg(cq->device, "Completion queue is empty  "
+				 "my_cq=%p cq_num=%x", my_cq, my_cq->cq_number);
+		goto poll_cq_one_exit0;
+	}
+
+	/* prevents loads being reordered across this point */
+	rmb();
+
+	cqe_count++;
+	if (unlikely(cqe->status & WC_STATUS_PURGE_BIT)) {
+		struct ehca_qp *qp;
+		int purgeflag;
+		unsigned long flags;
+
+		qp = ehca_cq_get_qp(my_cq, cqe->local_qp_number);
+		if (!qp) {
+			ehca_err(cq->device, "cq_num=%x qp_num=%x "
+				 "could not find qp -> ignore cqe",
+				 my_cq->cq_number, cqe->local_qp_number);
+			ehca_dmp(cqe, 64, "cq_num=%x qp_num=%x",
+				 my_cq->cq_number, cqe->local_qp_number);
+			/* ignore this purged cqe */
+			goto repoll;
+		}
+		spin_lock_irqsave(&qp->spinlock_s, flags);
+		purgeflag = qp->sqerr_purgeflag;
+		spin_unlock_irqrestore(&qp->spinlock_s, flags);
+
+		if (purgeflag) {
+			ehca_dbg(cq->device,
+				 "Got CQE with purged bit qp_num=%x src_qp=%x",
+				 cqe->local_qp_number, cqe->remote_qp_number);
+			if (ehca_debug_level >= 2)
+				ehca_dmp(cqe, 64, "qp_num=%x src_qp=%x",
+					 cqe->local_qp_number,
+					 cqe->remote_qp_number);
+			/*
+			 * ignore this to avoid double cqes of bad wqe
+			 * that caused sqe and turn off purge flag
+			 */
+			qp->sqerr_purgeflag = 0;
+			goto repoll;
+		}
+	}
+
+	is_error = cqe->status & WC_STATUS_ERROR_BIT;
+
+	/* trace error CQEs if debug_level >= 1, trace all CQEs if >= 3 */
+	if (unlikely(ehca_debug_level >= 3 || (ehca_debug_level && is_error))) {
+		ehca_dbg(cq->device,
+			 "Received %sCOMPLETION ehca_cq=%p cq_num=%x -----",
+			 is_error ? "ERROR " : "", my_cq, my_cq->cq_number);
+		ehca_dmp(cqe, 64, "ehca_cq=%p cq_num=%x",
+			 my_cq, my_cq->cq_number);
+		ehca_dbg(cq->device,
+			 "ehca_cq=%p cq_num=%x -------------------------",
+			 my_cq, my_cq->cq_number);
+	}
+
+	read_lock(&ehca_qp_idr_lock);
+	my_qp = idr_find(&ehca_qp_idr, cqe->qp_token);
+	read_unlock(&ehca_qp_idr_lock);
+	if (!my_qp)
+		goto repoll;
+	wc->qp = &my_qp->ib_qp;
+
+	qmap_tail_idx = get_app_wr_id(cqe->work_request_id);
+	if (!(cqe->w_completion_flags & WC_SEND_RECEIVE_BIT))
+		/* We got a send completion. */
+		qmap = &my_qp->sq_map;
+	else
+		/* We got a receive completion. */
+		qmap = &my_qp->rq_map;
+
+	/* advance the tail pointer */
+	qmap->tail = qmap_tail_idx;
+
+	if (is_error) {
+		/*
+		 * set left_to_poll to 0 because in error state, we will not
+		 * get any additional CQEs
+		 */
+		my_qp->sq_map.next_wqe_idx = next_index(my_qp->sq_map.tail,
+							my_qp->sq_map.entries);
+		my_qp->sq_map.left_to_poll = 0;
+		ehca_add_to_err_list(my_qp, 1);
+
+		my_qp->rq_map.next_wqe_idx = next_index(my_qp->rq_map.tail,
+							my_qp->rq_map.entries);
+		my_qp->rq_map.left_to_poll = 0;
+		if (HAS_RQ(my_qp))
+			ehca_add_to_err_list(my_qp, 0);
+	}
+
+	qmap_entry = &qmap->map[qmap_tail_idx];
+	if (qmap_entry->reported) {
+		ehca_warn(cq->device, "Double cqe on qp_num=%#x",
+				my_qp->real_qp_num);
+		/* found a double cqe, discard it and read next one */
+		goto repoll;
+	}
+
+	wc->wr_id = replace_wr_id(cqe->work_request_id, qmap_entry->app_wr_id);
+	qmap_entry->reported = 1;
+
+	/* if left_to_poll is decremented to 0, add the QP to the error list */
+	if (qmap->left_to_poll > 0) {
+		qmap->left_to_poll--;
+		if ((my_qp->sq_map.left_to_poll == 0) &&
+				(my_qp->rq_map.left_to_poll == 0)) {
+			ehca_add_to_err_list(my_qp, 1);
+			if (HAS_RQ(my_qp))
+				ehca_add_to_err_list(my_qp, 0);
+		}
+	}
+
+	/* eval ib_wc_opcode */
+	wc->opcode = ib_wc_opcode[cqe->optype]-1;
+	if (unlikely(wc->opcode == -1)) {
+		ehca_err(cq->device, "Invalid cqe->OPType=%x cqe->status=%x "
+			 "ehca_cq=%p cq_num=%x",
+			 cqe->optype, cqe->status, my_cq, my_cq->cq_number);
+		/* dump cqe for other infos */
+		ehca_dmp(cqe, 64, "ehca_cq=%p cq_num=%x",
+			 my_cq, my_cq->cq_number);
+		/* update also queue adder to throw away this entry!!! */
+		goto repoll;
+	}
+
+	/* eval ib_wc_status */
+	if (unlikely(is_error)) {
+		/* complete with errors */
+		map_ib_wc_status(cqe->status, &wc->status);
+		wc->vendor_err = wc->status;
+	} else
+		wc->status = IB_WC_SUCCESS;
+
+	wc->byte_len = cqe->nr_bytes_transferred;
+	wc->pkey_index = cqe->pkey_index;
+	wc->slid = cqe->rlid;
+	wc->dlid_path_bits = cqe->dlid;
+	wc->src_qp = cqe->remote_qp_number;
+	/*
+	 * HW has "Immed data present" and "GRH present" in bits 6 and 5.
+	 * SW defines those in bits 1 and 0, so we can just shift and mask.
+	 */
+	wc->wc_flags = (cqe->w_completion_flags >> 5) & 3;
+	wc->ex.imm_data = cpu_to_be32(cqe->immediate_data);
+	wc->sl = cqe->service_level;
+
+poll_cq_one_exit0:
+	if (cqe_count > 0)
+		hipz_update_feca(my_cq, cqe_count);
+
+	return ret;
+}
+
+static int generate_flush_cqes(struct ehca_qp *my_qp, struct ib_cq *cq,
+			       struct ib_wc *wc, int num_entries,
+			       struct ipz_queue *ipz_queue, int on_sq)
+{
+	int nr = 0;
+	struct ehca_wqe *wqe;
+	u64 offset;
+	struct ehca_queue_map *qmap;
+	struct ehca_qmap_entry *qmap_entry;
+
+	if (on_sq)
+		qmap = &my_qp->sq_map;
+	else
+		qmap = &my_qp->rq_map;
+
+	qmap_entry = &qmap->map[qmap->next_wqe_idx];
+
+	while ((nr < num_entries) && (qmap_entry->reported == 0)) {
+		/* generate flush CQE */
+
+		memset(wc, 0, sizeof(*wc));
+
+		offset = qmap->next_wqe_idx * ipz_queue->qe_size;
+		wqe = (struct ehca_wqe *)ipz_qeit_calc(ipz_queue, offset);
+		if (!wqe) {
+			ehca_err(cq->device, "Invalid wqe offset=%#llx on "
+				 "qp_num=%#x", offset, my_qp->real_qp_num);
+			return nr;
+		}
+
+		wc->wr_id = replace_wr_id(wqe->work_request_id,
+					  qmap_entry->app_wr_id);
+
+		if (on_sq) {
+			switch (wqe->optype) {
+			case WQE_OPTYPE_SEND:
+				wc->opcode = IB_WC_SEND;
+				break;
+			case WQE_OPTYPE_RDMAWRITE:
+				wc->opcode = IB_WC_RDMA_WRITE;
+				break;
+			case WQE_OPTYPE_RDMAREAD:
+				wc->opcode = IB_WC_RDMA_READ;
+				break;
+			default:
+				ehca_err(cq->device, "Invalid optype=%x",
+						wqe->optype);
+				return nr;
+			}
+		} else
+			wc->opcode = IB_WC_RECV;
+
+		if (wqe->wr_flag & WQE_WRFLAG_IMM_DATA_PRESENT) {
+			wc->ex.imm_data = wqe->immediate_data;
+			wc->wc_flags |= IB_WC_WITH_IMM;
+		}
+
+		wc->status = IB_WC_WR_FLUSH_ERR;
+
+		wc->qp = &my_qp->ib_qp;
+
+		/* mark as reported and advance next_wqe pointer */
+		qmap_entry->reported = 1;
+		qmap->next_wqe_idx = next_index(qmap->next_wqe_idx,
+						qmap->entries);
+		qmap_entry = &qmap->map[qmap->next_wqe_idx];
+
+		wc++; nr++;
+	}
+
+	return nr;
+
+}
+
+int ehca_poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc)
+{
+	struct ehca_cq *my_cq = container_of(cq, struct ehca_cq, ib_cq);
+	int nr;
+	struct ehca_qp *err_qp;
+	struct ib_wc *current_wc = wc;
+	int ret = 0;
+	unsigned long flags;
+	int entries_left = num_entries;
+
+	if (num_entries < 1) {
+		ehca_err(cq->device, "Invalid num_entries=%d ehca_cq=%p "
+			 "cq_num=%x", num_entries, my_cq, my_cq->cq_number);
+		ret = -EINVAL;
+		goto poll_cq_exit0;
+	}
+
+	spin_lock_irqsave(&my_cq->spinlock, flags);
+
+	/* generate flush cqes for send queues */
+	list_for_each_entry(err_qp, &my_cq->sqp_err_list, sq_err_node) {
+		nr = generate_flush_cqes(err_qp, cq, current_wc, entries_left,
+				&err_qp->ipz_squeue, 1);
+		entries_left -= nr;
+		current_wc += nr;
+
+		if (entries_left == 0)
+			break;
+	}
+
+	/* generate flush cqes for receive queues */
+	list_for_each_entry(err_qp, &my_cq->rqp_err_list, rq_err_node) {
+		nr = generate_flush_cqes(err_qp, cq, current_wc, entries_left,
+				&err_qp->ipz_rqueue, 0);
+		entries_left -= nr;
+		current_wc += nr;
+
+		if (entries_left == 0)
+			break;
+	}
+
+	for (nr = 0; nr < entries_left; nr++) {
+		ret = ehca_poll_cq_one(cq, current_wc);
+		if (ret)
+			break;
+		current_wc++;
+	} /* eof for nr */
+	entries_left -= nr;
+
+	spin_unlock_irqrestore(&my_cq->spinlock, flags);
+	if (ret == -EAGAIN  || !ret)
+		ret = num_entries - entries_left;
+
+poll_cq_exit0:
+	return ret;
+}
+
+int ehca_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags notify_flags)
+{
+	struct ehca_cq *my_cq = container_of(cq, struct ehca_cq, ib_cq);
+	int ret = 0;
+
+	switch (notify_flags & IB_CQ_SOLICITED_MASK) {
+	case IB_CQ_SOLICITED:
+		hipz_set_cqx_n0(my_cq, 1);
+		break;
+	case IB_CQ_NEXT_COMP:
+		hipz_set_cqx_n1(my_cq, 1);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (notify_flags & IB_CQ_REPORT_MISSED_EVENTS) {
+		unsigned long spl_flags;
+		spin_lock_irqsave(&my_cq->spinlock, spl_flags);
+		ret = ipz_qeit_is_valid(&my_cq->ipz_queue);
+		spin_unlock_irqrestore(&my_cq->spinlock, spl_flags);
+	}
+
+	return ret;
+}
diff --git a/drivers/infiniband/hw/ehca/ehca_sqp.c b/drivers/infiniband/hw/ehca/ehca_sqp.c
new file mode 100644
index 0000000..dba8f9f
--- /dev/null
+++ b/drivers/infiniband/hw/ehca/ehca_sqp.c
@@ -0,0 +1,237 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  SQP functions
+ *
+ *  Authors: Khadija Souissi <souissi@de.ibm.com>
+ *           Heiko J Schick <schickhj@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <rdma/ib_mad.h>
+
+#include "ehca_classes.h"
+#include "ehca_tools.h"
+#include "ehca_iverbs.h"
+#include "hcp_if.h"
+
+#define IB_MAD_STATUS_REDIRECT		cpu_to_be16(0x0002)
+#define IB_MAD_STATUS_UNSUP_VERSION	cpu_to_be16(0x0004)
+#define IB_MAD_STATUS_UNSUP_METHOD	cpu_to_be16(0x0008)
+
+#define IB_PMA_CLASS_PORT_INFO		cpu_to_be16(0x0001)
+
+/**
+ * ehca_define_sqp - Defines special queue pair 1 (GSI QP). When special queue
+ * pair is created successfully, the corresponding port gets active.
+ *
+ * Define Special Queue pair 0 (SMI QP) is still not supported.
+ *
+ * @qp_init_attr: Queue pair init attributes with port and queue pair type
+ */
+
+u64 ehca_define_sqp(struct ehca_shca *shca,
+		    struct ehca_qp *ehca_qp,
+		    struct ib_qp_init_attr *qp_init_attr)
+{
+	u32 pma_qp_nr, bma_qp_nr;
+	u64 ret;
+	u8 port = qp_init_attr->port_num;
+	int counter;
+
+	shca->sport[port - 1].port_state = IB_PORT_DOWN;
+
+	switch (qp_init_attr->qp_type) {
+	case IB_QPT_SMI:
+		/* function not supported yet */
+		break;
+	case IB_QPT_GSI:
+		ret = hipz_h_define_aqp1(shca->ipz_hca_handle,
+					 ehca_qp->ipz_qp_handle,
+					 ehca_qp->galpas.kernel,
+					 (u32) qp_init_attr->port_num,
+					 &pma_qp_nr, &bma_qp_nr);
+
+		if (ret != H_SUCCESS) {
+			ehca_err(&shca->ib_device,
+				 "Can't define AQP1 for port %x. h_ret=%lli",
+				 port, ret);
+			return ret;
+		}
+		shca->sport[port - 1].pma_qp_nr = pma_qp_nr;
+		ehca_dbg(&shca->ib_device, "port=%x pma_qp_nr=%x",
+			 port, pma_qp_nr);
+		break;
+	default:
+		ehca_err(&shca->ib_device, "invalid qp_type=%x",
+			 qp_init_attr->qp_type);
+		return H_PARAMETER;
+	}
+
+	if (ehca_nr_ports < 0) /* autodetect mode */
+		return H_SUCCESS;
+
+	for (counter = 0;
+	     shca->sport[port - 1].port_state != IB_PORT_ACTIVE &&
+		     counter < ehca_port_act_time;
+	     counter++) {
+		ehca_dbg(&shca->ib_device, "... wait until port %x is active",
+			 port);
+		msleep_interruptible(1000);
+	}
+
+	if (counter == ehca_port_act_time) {
+		ehca_err(&shca->ib_device, "Port %x is not active.", port);
+		return H_HARDWARE;
+	}
+
+	return H_SUCCESS;
+}
+
+struct ib_perf {
+	struct ib_mad_hdr mad_hdr;
+	u8 reserved[40];
+	u8 data[192];
+} __attribute__ ((packed));
+
+/* TC/SL/FL packed into 32 bits, as in ClassPortInfo */
+struct tcslfl {
+	u32 tc:8;
+	u32 sl:4;
+	u32 fl:20;
+} __attribute__ ((packed));
+
+/* IP Version/TC/FL packed into 32 bits, as in GRH */
+struct vertcfl {
+	u32 ver:4;
+	u32 tc:8;
+	u32 fl:20;
+} __attribute__ ((packed));
+
+static int ehca_process_perf(struct ib_device *ibdev, u8 port_num,
+			     struct ib_wc *in_wc, struct ib_grh *in_grh,
+			     struct ib_mad *in_mad, struct ib_mad *out_mad)
+{
+	struct ib_perf *in_perf = (struct ib_perf *)in_mad;
+	struct ib_perf *out_perf = (struct ib_perf *)out_mad;
+	struct ib_class_port_info *poi =
+		(struct ib_class_port_info *)out_perf->data;
+	struct tcslfl *tcslfl =
+		(struct tcslfl *)&poi->redirect_tcslfl;
+	struct ehca_shca *shca =
+		container_of(ibdev, struct ehca_shca, ib_device);
+	struct ehca_sport *sport = &shca->sport[port_num - 1];
+
+	ehca_dbg(ibdev, "method=%x", in_perf->mad_hdr.method);
+
+	*out_mad = *in_mad;
+
+	if (in_perf->mad_hdr.class_version != 1) {
+		ehca_warn(ibdev, "Unsupported class_version=%x",
+			  in_perf->mad_hdr.class_version);
+		out_perf->mad_hdr.status = IB_MAD_STATUS_UNSUP_VERSION;
+		goto perf_reply;
+	}
+
+	switch (in_perf->mad_hdr.method) {
+	case IB_MGMT_METHOD_GET:
+	case IB_MGMT_METHOD_SET:
+		/* set class port info for redirection */
+		out_perf->mad_hdr.attr_id = IB_PMA_CLASS_PORT_INFO;
+		out_perf->mad_hdr.status = IB_MAD_STATUS_REDIRECT;
+		memset(poi, 0, sizeof(*poi));
+		poi->base_version = 1;
+		poi->class_version = 1;
+		poi->resp_time_value = 18;
+
+		/* copy local routing information from WC where applicable */
+		tcslfl->sl         = in_wc->sl;
+		poi->redirect_lid  =
+			sport->saved_attr.lid | in_wc->dlid_path_bits;
+		poi->redirect_qp   = sport->pma_qp_nr;
+		poi->redirect_qkey = IB_QP1_QKEY;
+
+		ehca_query_pkey(ibdev, port_num, in_wc->pkey_index,
+				&poi->redirect_pkey);
+
+		/* if request was globally routed, copy route info */
+		if (in_grh) {
+			struct vertcfl *vertcfl =
+				(struct vertcfl *)&in_grh->version_tclass_flow;
+			memcpy(poi->redirect_gid, in_grh->dgid.raw,
+			       sizeof(poi->redirect_gid));
+			tcslfl->tc        = vertcfl->tc;
+			tcslfl->fl        = vertcfl->fl;
+		} else
+			/* else only fill in default GID */
+			ehca_query_gid(ibdev, port_num, 0,
+				       (union ib_gid *)&poi->redirect_gid);
+
+		ehca_dbg(ibdev, "ehca_pma_lid=%x ehca_pma_qp=%x",
+			 sport->saved_attr.lid, sport->pma_qp_nr);
+		break;
+
+	case IB_MGMT_METHOD_GET_RESP:
+		return IB_MAD_RESULT_FAILURE;
+
+	default:
+		out_perf->mad_hdr.status = IB_MAD_STATUS_UNSUP_METHOD;
+		break;
+	}
+
+perf_reply:
+	out_perf->mad_hdr.method = IB_MGMT_METHOD_GET_RESP;
+
+	return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
+}
+
+int ehca_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
+		     struct ib_wc *in_wc, struct ib_grh *in_grh,
+		     struct ib_mad *in_mad, struct ib_mad *out_mad)
+{
+	int ret;
+
+	if (!port_num || port_num > ibdev->phys_port_cnt || !in_wc)
+		return IB_MAD_RESULT_FAILURE;
+
+	/* accept only pma request */
+	if (in_mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_PERF_MGMT)
+		return IB_MAD_RESULT_SUCCESS;
+
+	ehca_dbg(ibdev, "port_num=%x src_qp=%x", port_num, in_wc->src_qp);
+	ret = ehca_process_perf(ibdev, port_num, in_wc, in_grh,
+				in_mad, out_mad);
+
+	return ret;
+}
diff --git a/drivers/infiniband/hw/ehca/ehca_tools.h b/drivers/infiniband/hw/ehca/ehca_tools.h
new file mode 100644
index 0000000..d280b12
--- /dev/null
+++ b/drivers/infiniband/hw/ehca/ehca_tools.h
@@ -0,0 +1,155 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  auxiliary functions
+ *
+ *  Authors: Christoph Raisch <raisch@de.ibm.com>
+ *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ *           Khadija Souissi <souissik@de.ibm.com>
+ *           Waleri Fomin <fomin@de.ibm.com>
+ *           Heiko J Schick <schickhj@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#ifndef EHCA_TOOLS_H
+#define EHCA_TOOLS_H
+
+#include <linux/kernel.h>
+#include <linux/spinlock.h>
+#include <linux/delay.h>
+#include <linux/idr.h>
+#include <linux/kthread.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/vmalloc.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <linux/device.h>
+
+#include <linux/atomic.h>
+#include <asm/ibmebus.h>
+#include <asm/io.h>
+#include <asm/pgtable.h>
+#include <asm/hvcall.h>
+
+extern int ehca_debug_level;
+
+#define ehca_dbg(ib_dev, format, arg...) \
+	do { \
+		if (unlikely(ehca_debug_level)) \
+			dev_printk(KERN_DEBUG, (ib_dev)->dma_device, \
+				   "PU%04x EHCA_DBG:%s " format "\n", \
+				   raw_smp_processor_id(), __func__, \
+				   ## arg); \
+	} while (0)
+
+#define ehca_info(ib_dev, format, arg...) \
+	dev_info((ib_dev)->dma_device, "PU%04x EHCA_INFO:%s " format "\n", \
+		 raw_smp_processor_id(), __func__, ## arg)
+
+#define ehca_warn(ib_dev, format, arg...) \
+	dev_warn((ib_dev)->dma_device, "PU%04x EHCA_WARN:%s " format "\n", \
+		 raw_smp_processor_id(), __func__, ## arg)
+
+#define ehca_err(ib_dev, format, arg...) \
+	dev_err((ib_dev)->dma_device, "PU%04x EHCA_ERR:%s " format "\n", \
+		raw_smp_processor_id(), __func__, ## arg)
+
+/* use this one only if no ib_dev available */
+#define ehca_gen_dbg(format, arg...) \
+	do { \
+		if (unlikely(ehca_debug_level)) \
+			printk(KERN_DEBUG "PU%04x EHCA_DBG:%s " format "\n", \
+			       raw_smp_processor_id(), __func__, ## arg); \
+	} while (0)
+
+#define ehca_gen_warn(format, arg...) \
+	printk(KERN_INFO "PU%04x EHCA_WARN:%s " format "\n", \
+	       raw_smp_processor_id(), __func__, ## arg)
+
+#define ehca_gen_err(format, arg...) \
+	printk(KERN_ERR "PU%04x EHCA_ERR:%s " format "\n", \
+	       raw_smp_processor_id(), __func__, ## arg)
+
+/**
+ * ehca_dmp - printk a memory block, whose length is n*8 bytes.
+ * Each line has the following layout:
+ * <format string> adr=X ofs=Y <8 bytes hex> <8 bytes hex>
+ */
+#define ehca_dmp(adr, len, format, args...) \
+	do { \
+		unsigned int x; \
+		unsigned int l = (unsigned int)(len); \
+		unsigned char *deb = (unsigned char *)(adr); \
+		for (x = 0; x < l; x += 16) { \
+			printk(KERN_INFO "EHCA_DMP:%s " format \
+			       " adr=%p ofs=%04x %016llx %016llx\n", \
+			       __func__, ##args, deb, x, \
+			       *((u64 *)&deb[0]), *((u64 *)&deb[8])); \
+			deb += 16; \
+		} \
+	} while (0)
+
+/* define a bitmask, little endian version */
+#define EHCA_BMASK(pos, length) (((pos) << 16) + (length))
+
+/* define a bitmask, the ibm way... */
+#define EHCA_BMASK_IBM(from, to) (((63 - to) << 16) + ((to) - (from) + 1))
+
+/* internal function, don't use */
+#define EHCA_BMASK_SHIFTPOS(mask) (((mask) >> 16) & 0xffff)
+
+/* internal function, don't use */
+#define EHCA_BMASK_MASK(mask) (~0ULL >> ((64 - (mask)) & 0xffff))
+
+/**
+ * EHCA_BMASK_SET - return value shifted and masked by mask
+ * variable|=EHCA_BMASK_SET(MY_MASK,0x4711) ORs the bits in variable
+ * variable&=~EHCA_BMASK_SET(MY_MASK,-1) clears the bits from the mask
+ * in variable
+ */
+#define EHCA_BMASK_SET(mask, value) \
+	((EHCA_BMASK_MASK(mask) & ((u64)(value))) << EHCA_BMASK_SHIFTPOS(mask))
+
+/**
+ * EHCA_BMASK_GET - extract a parameter from value by mask
+ */
+#define EHCA_BMASK_GET(mask, value) \
+	(EHCA_BMASK_MASK(mask) & (((u64)(value)) >> EHCA_BMASK_SHIFTPOS(mask)))
+
+/* Converts ehca to ib return code */
+int ehca2ib_return_code(u64 ehca_rc);
+
+#endif /* EHCA_TOOLS_H */
diff --git a/drivers/infiniband/hw/ehca/ehca_uverbs.c b/drivers/infiniband/hw/ehca/ehca_uverbs.c
new file mode 100644
index 0000000..1a1d5d9
--- /dev/null
+++ b/drivers/infiniband/hw/ehca/ehca_uverbs.c
@@ -0,0 +1,309 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  userspace support verbs
+ *
+ *  Authors: Christoph Raisch <raisch@de.ibm.com>
+ *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ *           Heiko J Schick <schickhj@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/slab.h>
+
+#include "ehca_classes.h"
+#include "ehca_iverbs.h"
+#include "ehca_mrmw.h"
+#include "ehca_tools.h"
+#include "hcp_if.h"
+
+struct ib_ucontext *ehca_alloc_ucontext(struct ib_device *device,
+					struct ib_udata *udata)
+{
+	struct ehca_ucontext *my_context;
+
+	my_context = kzalloc(sizeof *my_context, GFP_KERNEL);
+	if (!my_context) {
+		ehca_err(device, "Out of memory device=%p", device);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	return &my_context->ib_ucontext;
+}
+
+int ehca_dealloc_ucontext(struct ib_ucontext *context)
+{
+	kfree(container_of(context, struct ehca_ucontext, ib_ucontext));
+	return 0;
+}
+
+static void ehca_mm_open(struct vm_area_struct *vma)
+{
+	u32 *count = (u32 *)vma->vm_private_data;
+	if (!count) {
+		ehca_gen_err("Invalid vma struct vm_start=%lx vm_end=%lx",
+			     vma->vm_start, vma->vm_end);
+		return;
+	}
+	(*count)++;
+	if (!(*count))
+		ehca_gen_err("Use count overflow vm_start=%lx vm_end=%lx",
+			     vma->vm_start, vma->vm_end);
+	ehca_gen_dbg("vm_start=%lx vm_end=%lx count=%x",
+		     vma->vm_start, vma->vm_end, *count);
+}
+
+static void ehca_mm_close(struct vm_area_struct *vma)
+{
+	u32 *count = (u32 *)vma->vm_private_data;
+	if (!count) {
+		ehca_gen_err("Invalid vma struct vm_start=%lx vm_end=%lx",
+			     vma->vm_start, vma->vm_end);
+		return;
+	}
+	(*count)--;
+	ehca_gen_dbg("vm_start=%lx vm_end=%lx count=%x",
+		     vma->vm_start, vma->vm_end, *count);
+}
+
+static const struct vm_operations_struct vm_ops = {
+	.open =	ehca_mm_open,
+	.close = ehca_mm_close,
+};
+
+static int ehca_mmap_fw(struct vm_area_struct *vma, struct h_galpas *galpas,
+			u32 *mm_count)
+{
+	int ret;
+	u64 vsize, physical;
+
+	vsize = vma->vm_end - vma->vm_start;
+	if (vsize < EHCA_PAGESIZE) {
+		ehca_gen_err("invalid vsize=%lx", vma->vm_end - vma->vm_start);
+		return -EINVAL;
+	}
+
+	physical = galpas->user.fw_handle;
+	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+	ehca_gen_dbg("vsize=%llx physical=%llx", vsize, physical);
+	/* VM_IO | VM_DONTEXPAND | VM_DONTDUMP are set by remap_pfn_range() */
+	ret = remap_4k_pfn(vma, vma->vm_start, physical >> EHCA_PAGESHIFT,
+			   vma->vm_page_prot);
+	if (unlikely(ret)) {
+		ehca_gen_err("remap_pfn_range() failed ret=%i", ret);
+		return -ENOMEM;
+	}
+
+	vma->vm_private_data = mm_count;
+	(*mm_count)++;
+	vma->vm_ops = &vm_ops;
+
+	return 0;
+}
+
+static int ehca_mmap_queue(struct vm_area_struct *vma, struct ipz_queue *queue,
+			   u32 *mm_count)
+{
+	int ret;
+	u64 start, ofs;
+	struct page *page;
+
+	vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
+	start = vma->vm_start;
+	for (ofs = 0; ofs < queue->queue_length; ofs += PAGE_SIZE) {
+		u64 virt_addr = (u64)ipz_qeit_calc(queue, ofs);
+		page = virt_to_page(virt_addr);
+		ret = vm_insert_page(vma, start, page);
+		if (unlikely(ret)) {
+			ehca_gen_err("vm_insert_page() failed rc=%i", ret);
+			return ret;
+		}
+		start += PAGE_SIZE;
+	}
+	vma->vm_private_data = mm_count;
+	(*mm_count)++;
+	vma->vm_ops = &vm_ops;
+
+	return 0;
+}
+
+static int ehca_mmap_cq(struct vm_area_struct *vma, struct ehca_cq *cq,
+			u32 rsrc_type)
+{
+	int ret;
+
+	switch (rsrc_type) {
+	case 0: /* galpa fw handle */
+		ehca_dbg(cq->ib_cq.device, "cq_num=%x fw", cq->cq_number);
+		ret = ehca_mmap_fw(vma, &cq->galpas, &cq->mm_count_galpa);
+		if (unlikely(ret)) {
+			ehca_err(cq->ib_cq.device,
+				 "ehca_mmap_fw() failed rc=%i cq_num=%x",
+				 ret, cq->cq_number);
+			return ret;
+		}
+		break;
+
+	case 1: /* cq queue_addr */
+		ehca_dbg(cq->ib_cq.device, "cq_num=%x queue", cq->cq_number);
+		ret = ehca_mmap_queue(vma, &cq->ipz_queue, &cq->mm_count_queue);
+		if (unlikely(ret)) {
+			ehca_err(cq->ib_cq.device,
+				 "ehca_mmap_queue() failed rc=%i cq_num=%x",
+				 ret, cq->cq_number);
+			return ret;
+		}
+		break;
+
+	default:
+		ehca_err(cq->ib_cq.device, "bad resource type=%x cq_num=%x",
+			 rsrc_type, cq->cq_number);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int ehca_mmap_qp(struct vm_area_struct *vma, struct ehca_qp *qp,
+			u32 rsrc_type)
+{
+	int ret;
+
+	switch (rsrc_type) {
+	case 0: /* galpa fw handle */
+		ehca_dbg(qp->ib_qp.device, "qp_num=%x fw", qp->ib_qp.qp_num);
+		ret = ehca_mmap_fw(vma, &qp->galpas, &qp->mm_count_galpa);
+		if (unlikely(ret)) {
+			ehca_err(qp->ib_qp.device,
+				 "remap_pfn_range() failed ret=%i qp_num=%x",
+				 ret, qp->ib_qp.qp_num);
+			return -ENOMEM;
+		}
+		break;
+
+	case 1: /* qp rqueue_addr */
+		ehca_dbg(qp->ib_qp.device, "qp_num=%x rq", qp->ib_qp.qp_num);
+		ret = ehca_mmap_queue(vma, &qp->ipz_rqueue,
+				      &qp->mm_count_rqueue);
+		if (unlikely(ret)) {
+			ehca_err(qp->ib_qp.device,
+				 "ehca_mmap_queue(rq) failed rc=%i qp_num=%x",
+				 ret, qp->ib_qp.qp_num);
+			return ret;
+		}
+		break;
+
+	case 2: /* qp squeue_addr */
+		ehca_dbg(qp->ib_qp.device, "qp_num=%x sq", qp->ib_qp.qp_num);
+		ret = ehca_mmap_queue(vma, &qp->ipz_squeue,
+				      &qp->mm_count_squeue);
+		if (unlikely(ret)) {
+			ehca_err(qp->ib_qp.device,
+				 "ehca_mmap_queue(sq) failed rc=%i qp_num=%x",
+				 ret, qp->ib_qp.qp_num);
+			return ret;
+		}
+		break;
+
+	default:
+		ehca_err(qp->ib_qp.device, "bad resource type=%x qp=num=%x",
+			 rsrc_type, qp->ib_qp.qp_num);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+int ehca_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
+{
+	u64 fileoffset = vma->vm_pgoff;
+	u32 idr_handle = fileoffset & 0x1FFFFFF;
+	u32 q_type = (fileoffset >> 27) & 0x1;	  /* CQ, QP,...        */
+	u32 rsrc_type = (fileoffset >> 25) & 0x3; /* sq,rq,cmnd_window */
+	u32 ret;
+	struct ehca_cq *cq;
+	struct ehca_qp *qp;
+	struct ib_uobject *uobject;
+
+	switch (q_type) {
+	case  0: /* CQ */
+		read_lock(&ehca_cq_idr_lock);
+		cq = idr_find(&ehca_cq_idr, idr_handle);
+		read_unlock(&ehca_cq_idr_lock);
+
+		/* make sure this mmap really belongs to the authorized user */
+		if (!cq)
+			return -EINVAL;
+
+		if (!cq->ib_cq.uobject || cq->ib_cq.uobject->context != context)
+			return -EINVAL;
+
+		ret = ehca_mmap_cq(vma, cq, rsrc_type);
+		if (unlikely(ret)) {
+			ehca_err(cq->ib_cq.device,
+				 "ehca_mmap_cq() failed rc=%i cq_num=%x",
+				 ret, cq->cq_number);
+			return ret;
+		}
+		break;
+
+	case 1: /* QP */
+		read_lock(&ehca_qp_idr_lock);
+		qp = idr_find(&ehca_qp_idr, idr_handle);
+		read_unlock(&ehca_qp_idr_lock);
+
+		/* make sure this mmap really belongs to the authorized user */
+		if (!qp)
+			return -EINVAL;
+
+		uobject = IS_SRQ(qp) ? qp->ib_srq.uobject : qp->ib_qp.uobject;
+		if (!uobject || uobject->context != context)
+			return -EINVAL;
+
+		ret = ehca_mmap_qp(vma, qp, rsrc_type);
+		if (unlikely(ret)) {
+			ehca_err(qp->ib_qp.device,
+				 "ehca_mmap_qp() failed rc=%i qp_num=%x",
+				 ret, qp->ib_qp.qp_num);
+			return ret;
+		}
+		break;
+
+	default:
+		ehca_gen_err("bad queue type %x", q_type);
+		return -EINVAL;
+	}
+
+	return 0;
+}
diff --git a/drivers/infiniband/hw/ehca/hcp_if.c b/drivers/infiniband/hw/ehca/hcp_if.c
new file mode 100644
index 0000000..89517ff
--- /dev/null
+++ b/drivers/infiniband/hw/ehca/hcp_if.c
@@ -0,0 +1,949 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  Firmware Infiniband Interface code for POWER
+ *
+ *  Authors: Christoph Raisch <raisch@de.ibm.com>
+ *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ *           Joachim Fenkes <fenkes@de.ibm.com>
+ *           Gerd Bayer <gerd.bayer@de.ibm.com>
+ *           Waleri Fomin <fomin@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <asm/hvcall.h>
+#include "ehca_tools.h"
+#include "hcp_if.h"
+#include "hcp_phyp.h"
+#include "hipz_fns.h"
+#include "ipz_pt_fn.h"
+
+#define H_ALL_RES_QP_ENHANCED_OPS       EHCA_BMASK_IBM(9, 11)
+#define H_ALL_RES_QP_PTE_PIN            EHCA_BMASK_IBM(12, 12)
+#define H_ALL_RES_QP_SERVICE_TYPE       EHCA_BMASK_IBM(13, 15)
+#define H_ALL_RES_QP_STORAGE            EHCA_BMASK_IBM(16, 17)
+#define H_ALL_RES_QP_LL_RQ_CQE_POSTING  EHCA_BMASK_IBM(18, 18)
+#define H_ALL_RES_QP_LL_SQ_CQE_POSTING  EHCA_BMASK_IBM(19, 21)
+#define H_ALL_RES_QP_SIGNALING_TYPE     EHCA_BMASK_IBM(22, 23)
+#define H_ALL_RES_QP_UD_AV_LKEY_CTRL    EHCA_BMASK_IBM(31, 31)
+#define H_ALL_RES_QP_SMALL_SQ_PAGE_SIZE EHCA_BMASK_IBM(32, 35)
+#define H_ALL_RES_QP_SMALL_RQ_PAGE_SIZE EHCA_BMASK_IBM(36, 39)
+#define H_ALL_RES_QP_RESOURCE_TYPE      EHCA_BMASK_IBM(56, 63)
+
+#define H_ALL_RES_QP_MAX_OUTST_SEND_WR  EHCA_BMASK_IBM(0, 15)
+#define H_ALL_RES_QP_MAX_OUTST_RECV_WR  EHCA_BMASK_IBM(16, 31)
+#define H_ALL_RES_QP_MAX_SEND_SGE       EHCA_BMASK_IBM(32, 39)
+#define H_ALL_RES_QP_MAX_RECV_SGE       EHCA_BMASK_IBM(40, 47)
+
+#define H_ALL_RES_QP_UD_AV_LKEY         EHCA_BMASK_IBM(32, 63)
+#define H_ALL_RES_QP_SRQ_QP_TOKEN       EHCA_BMASK_IBM(0, 31)
+#define H_ALL_RES_QP_SRQ_QP_HANDLE      EHCA_BMASK_IBM(0, 64)
+#define H_ALL_RES_QP_SRQ_LIMIT          EHCA_BMASK_IBM(48, 63)
+#define H_ALL_RES_QP_SRQ_QPN            EHCA_BMASK_IBM(40, 63)
+
+#define H_ALL_RES_QP_ACT_OUTST_SEND_WR  EHCA_BMASK_IBM(16, 31)
+#define H_ALL_RES_QP_ACT_OUTST_RECV_WR  EHCA_BMASK_IBM(48, 63)
+#define H_ALL_RES_QP_ACT_SEND_SGE       EHCA_BMASK_IBM(8, 15)
+#define H_ALL_RES_QP_ACT_RECV_SGE       EHCA_BMASK_IBM(24, 31)
+
+#define H_ALL_RES_QP_SQUEUE_SIZE_PAGES  EHCA_BMASK_IBM(0, 31)
+#define H_ALL_RES_QP_RQUEUE_SIZE_PAGES  EHCA_BMASK_IBM(32, 63)
+
+#define H_MP_INIT_TYPE                  EHCA_BMASK_IBM(44, 47)
+#define H_MP_SHUTDOWN                   EHCA_BMASK_IBM(48, 48)
+#define H_MP_RESET_QKEY_CTR             EHCA_BMASK_IBM(49, 49)
+
+#define HCALL4_REGS_FORMAT "r4=%lx r5=%lx r6=%lx r7=%lx"
+#define HCALL7_REGS_FORMAT HCALL4_REGS_FORMAT " r8=%lx r9=%lx r10=%lx"
+#define HCALL9_REGS_FORMAT HCALL7_REGS_FORMAT " r11=%lx r12=%lx"
+
+static DEFINE_SPINLOCK(hcall_lock);
+
+static long ehca_plpar_hcall_norets(unsigned long opcode,
+				    unsigned long arg1,
+				    unsigned long arg2,
+				    unsigned long arg3,
+				    unsigned long arg4,
+				    unsigned long arg5,
+				    unsigned long arg6,
+				    unsigned long arg7)
+{
+	long ret;
+	int i, sleep_msecs;
+	unsigned long flags = 0;
+
+	if (unlikely(ehca_debug_level >= 2))
+		ehca_gen_dbg("opcode=%lx " HCALL7_REGS_FORMAT,
+			     opcode, arg1, arg2, arg3, arg4, arg5, arg6, arg7);
+
+	for (i = 0; i < 5; i++) {
+		/* serialize hCalls to work around firmware issue */
+		if (ehca_lock_hcalls)
+			spin_lock_irqsave(&hcall_lock, flags);
+
+		ret = plpar_hcall_norets(opcode, arg1, arg2, arg3, arg4,
+					 arg5, arg6, arg7);
+
+		if (ehca_lock_hcalls)
+			spin_unlock_irqrestore(&hcall_lock, flags);
+
+		if (H_IS_LONG_BUSY(ret)) {
+			sleep_msecs = get_longbusy_msecs(ret);
+			msleep_interruptible(sleep_msecs);
+			continue;
+		}
+
+		if (ret < H_SUCCESS)
+			ehca_gen_err("opcode=%lx ret=%li " HCALL7_REGS_FORMAT,
+				     opcode, ret, arg1, arg2, arg3,
+				     arg4, arg5, arg6, arg7);
+		else
+			if (unlikely(ehca_debug_level >= 2))
+				ehca_gen_dbg("opcode=%lx ret=%li", opcode, ret);
+
+		return ret;
+	}
+
+	return H_BUSY;
+}
+
+static long ehca_plpar_hcall9(unsigned long opcode,
+			      unsigned long *outs, /* array of 9 outputs */
+			      unsigned long arg1,
+			      unsigned long arg2,
+			      unsigned long arg3,
+			      unsigned long arg4,
+			      unsigned long arg5,
+			      unsigned long arg6,
+			      unsigned long arg7,
+			      unsigned long arg8,
+			      unsigned long arg9)
+{
+	long ret;
+	int i, sleep_msecs;
+	unsigned long flags = 0;
+
+	if (unlikely(ehca_debug_level >= 2))
+		ehca_gen_dbg("INPUT -- opcode=%lx " HCALL9_REGS_FORMAT, opcode,
+			     arg1, arg2, arg3, arg4, arg5,
+			     arg6, arg7, arg8, arg9);
+
+	for (i = 0; i < 5; i++) {
+		/* serialize hCalls to work around firmware issue */
+		if (ehca_lock_hcalls)
+			spin_lock_irqsave(&hcall_lock, flags);
+
+		ret = plpar_hcall9(opcode, outs,
+				   arg1, arg2, arg3, arg4, arg5,
+				   arg6, arg7, arg8, arg9);
+
+		if (ehca_lock_hcalls)
+			spin_unlock_irqrestore(&hcall_lock, flags);
+
+		if (H_IS_LONG_BUSY(ret)) {
+			sleep_msecs = get_longbusy_msecs(ret);
+			msleep_interruptible(sleep_msecs);
+			continue;
+		}
+
+		if (ret < H_SUCCESS) {
+			ehca_gen_err("INPUT -- opcode=%lx " HCALL9_REGS_FORMAT,
+				     opcode, arg1, arg2, arg3, arg4, arg5,
+				     arg6, arg7, arg8, arg9);
+			ehca_gen_err("OUTPUT -- ret=%li " HCALL9_REGS_FORMAT,
+				     ret, outs[0], outs[1], outs[2], outs[3],
+				     outs[4], outs[5], outs[6], outs[7],
+				     outs[8]);
+		} else if (unlikely(ehca_debug_level >= 2))
+			ehca_gen_dbg("OUTPUT -- ret=%li " HCALL9_REGS_FORMAT,
+				     ret, outs[0], outs[1], outs[2], outs[3],
+				     outs[4], outs[5], outs[6], outs[7],
+				     outs[8]);
+		return ret;
+	}
+
+	return H_BUSY;
+}
+
+u64 hipz_h_alloc_resource_eq(const struct ipz_adapter_handle adapter_handle,
+			     struct ehca_pfeq *pfeq,
+			     const u32 neq_control,
+			     const u32 number_of_entries,
+			     struct ipz_eq_handle *eq_handle,
+			     u32 *act_nr_of_entries,
+			     u32 *act_pages,
+			     u32 *eq_ist)
+{
+	u64 ret;
+	unsigned long outs[PLPAR_HCALL9_BUFSIZE];
+	u64 allocate_controls;
+
+	/* resource type */
+	allocate_controls = 3ULL;
+
+	/* ISN is associated */
+	if (neq_control != 1)
+		allocate_controls = (1ULL << (63 - 7)) | allocate_controls;
+	else /* notification event queue */
+		allocate_controls = (1ULL << 63) | allocate_controls;
+
+	ret = ehca_plpar_hcall9(H_ALLOC_RESOURCE, outs,
+				adapter_handle.handle,  /* r4 */
+				allocate_controls,      /* r5 */
+				number_of_entries,      /* r6 */
+				0, 0, 0, 0, 0, 0);
+	eq_handle->handle = outs[0];
+	*act_nr_of_entries = (u32)outs[3];
+	*act_pages = (u32)outs[4];
+	*eq_ist = (u32)outs[5];
+
+	if (ret == H_NOT_ENOUGH_RESOURCES)
+		ehca_gen_err("Not enough resource - ret=%lli ", ret);
+
+	return ret;
+}
+
+u64 hipz_h_reset_event(const struct ipz_adapter_handle adapter_handle,
+		       struct ipz_eq_handle eq_handle,
+		       const u64 event_mask)
+{
+	return ehca_plpar_hcall_norets(H_RESET_EVENTS,
+				       adapter_handle.handle, /* r4 */
+				       eq_handle.handle,      /* r5 */
+				       event_mask,	      /* r6 */
+				       0, 0, 0, 0);
+}
+
+u64 hipz_h_alloc_resource_cq(const struct ipz_adapter_handle adapter_handle,
+			     struct ehca_cq *cq,
+			     struct ehca_alloc_cq_parms *param)
+{
+	int rc;
+	u64 ret;
+	unsigned long outs[PLPAR_HCALL9_BUFSIZE];
+
+	ret = ehca_plpar_hcall9(H_ALLOC_RESOURCE, outs,
+				adapter_handle.handle,   /* r4  */
+				2,	                 /* r5  */
+				param->eq_handle.handle, /* r6  */
+				cq->token,	         /* r7  */
+				param->nr_cqe,           /* r8  */
+				0, 0, 0, 0);
+	cq->ipz_cq_handle.handle = outs[0];
+	param->act_nr_of_entries = (u32)outs[3];
+	param->act_pages = (u32)outs[4];
+
+	if (ret == H_SUCCESS) {
+		rc = hcp_galpas_ctor(&cq->galpas, 0, outs[5], outs[6]);
+		if (rc) {
+			ehca_gen_err("Could not establish HW access. rc=%d paddr=%#lx",
+				     rc, outs[5]);
+
+			ehca_plpar_hcall_norets(H_FREE_RESOURCE,
+						adapter_handle.handle,     /* r4 */
+						cq->ipz_cq_handle.handle,  /* r5 */
+						0, 0, 0, 0, 0);
+			ret = H_NO_MEM;
+		}
+	}
+
+	if (ret == H_NOT_ENOUGH_RESOURCES)
+		ehca_gen_err("Not enough resources. ret=%lli", ret);
+
+	return ret;
+}
+
+u64 hipz_h_alloc_resource_qp(const struct ipz_adapter_handle adapter_handle,
+			     struct ehca_alloc_qp_parms *parms, int is_user)
+{
+	int rc;
+	u64 ret;
+	u64 allocate_controls, max_r10_reg, r11, r12;
+	unsigned long outs[PLPAR_HCALL9_BUFSIZE];
+
+	allocate_controls =
+		EHCA_BMASK_SET(H_ALL_RES_QP_ENHANCED_OPS, parms->ext_type)
+		| EHCA_BMASK_SET(H_ALL_RES_QP_PTE_PIN, 0)
+		| EHCA_BMASK_SET(H_ALL_RES_QP_SERVICE_TYPE, parms->servicetype)
+		| EHCA_BMASK_SET(H_ALL_RES_QP_SIGNALING_TYPE, parms->sigtype)
+		| EHCA_BMASK_SET(H_ALL_RES_QP_STORAGE, parms->qp_storage)
+		| EHCA_BMASK_SET(H_ALL_RES_QP_SMALL_SQ_PAGE_SIZE,
+				 parms->squeue.page_size)
+		| EHCA_BMASK_SET(H_ALL_RES_QP_SMALL_RQ_PAGE_SIZE,
+				 parms->rqueue.page_size)
+		| EHCA_BMASK_SET(H_ALL_RES_QP_LL_RQ_CQE_POSTING,
+				 !!(parms->ll_comp_flags & LLQP_RECV_COMP))
+		| EHCA_BMASK_SET(H_ALL_RES_QP_LL_SQ_CQE_POSTING,
+				 !!(parms->ll_comp_flags & LLQP_SEND_COMP))
+		| EHCA_BMASK_SET(H_ALL_RES_QP_UD_AV_LKEY_CTRL,
+				 parms->ud_av_l_key_ctl)
+		| EHCA_BMASK_SET(H_ALL_RES_QP_RESOURCE_TYPE, 1);
+
+	max_r10_reg =
+		EHCA_BMASK_SET(H_ALL_RES_QP_MAX_OUTST_SEND_WR,
+			       parms->squeue.max_wr + 1)
+		| EHCA_BMASK_SET(H_ALL_RES_QP_MAX_OUTST_RECV_WR,
+				 parms->rqueue.max_wr + 1)
+		| EHCA_BMASK_SET(H_ALL_RES_QP_MAX_SEND_SGE,
+				 parms->squeue.max_sge)
+		| EHCA_BMASK_SET(H_ALL_RES_QP_MAX_RECV_SGE,
+				 parms->rqueue.max_sge);
+
+	r11 = EHCA_BMASK_SET(H_ALL_RES_QP_SRQ_QP_TOKEN, parms->srq_token);
+
+	if (parms->ext_type == EQPT_SRQ)
+		r12 = EHCA_BMASK_SET(H_ALL_RES_QP_SRQ_LIMIT, parms->srq_limit);
+	else
+		r12 = EHCA_BMASK_SET(H_ALL_RES_QP_SRQ_QPN, parms->srq_qpn);
+
+	ret = ehca_plpar_hcall9(H_ALLOC_RESOURCE, outs,
+				adapter_handle.handle,	           /* r4  */
+				allocate_controls,	           /* r5  */
+				parms->send_cq_handle.handle,
+				parms->recv_cq_handle.handle,
+				parms->eq_handle.handle,
+				((u64)parms->token << 32) | parms->pd.value,
+				max_r10_reg, r11, r12);
+
+	parms->qp_handle.handle = outs[0];
+	parms->real_qp_num = (u32)outs[1];
+	parms->squeue.act_nr_wqes =
+		(u16)EHCA_BMASK_GET(H_ALL_RES_QP_ACT_OUTST_SEND_WR, outs[2]);
+	parms->rqueue.act_nr_wqes =
+		(u16)EHCA_BMASK_GET(H_ALL_RES_QP_ACT_OUTST_RECV_WR, outs[2]);
+	parms->squeue.act_nr_sges =
+		(u8)EHCA_BMASK_GET(H_ALL_RES_QP_ACT_SEND_SGE, outs[3]);
+	parms->rqueue.act_nr_sges =
+		(u8)EHCA_BMASK_GET(H_ALL_RES_QP_ACT_RECV_SGE, outs[3]);
+	parms->squeue.queue_size =
+		(u32)EHCA_BMASK_GET(H_ALL_RES_QP_SQUEUE_SIZE_PAGES, outs[4]);
+	parms->rqueue.queue_size =
+		(u32)EHCA_BMASK_GET(H_ALL_RES_QP_RQUEUE_SIZE_PAGES, outs[4]);
+
+	if (ret == H_SUCCESS) {
+		rc = hcp_galpas_ctor(&parms->galpas, is_user, outs[6], outs[6]);
+		if (rc) {
+			ehca_gen_err("Could not establish HW access. rc=%d paddr=%#lx",
+				     rc, outs[6]);
+
+			ehca_plpar_hcall_norets(H_FREE_RESOURCE,
+						adapter_handle.handle,     /* r4 */
+						parms->qp_handle.handle,  /* r5 */
+						0, 0, 0, 0, 0);
+			ret = H_NO_MEM;
+		}
+	}
+
+	if (ret == H_NOT_ENOUGH_RESOURCES)
+		ehca_gen_err("Not enough resources. ret=%lli", ret);
+
+	return ret;
+}
+
+u64 hipz_h_query_port(const struct ipz_adapter_handle adapter_handle,
+		      const u8 port_id,
+		      struct hipz_query_port *query_port_response_block)
+{
+	u64 ret;
+	u64 r_cb = __pa(query_port_response_block);
+
+	if (r_cb & (EHCA_PAGESIZE-1)) {
+		ehca_gen_err("response block not page aligned");
+		return H_PARAMETER;
+	}
+
+	ret = ehca_plpar_hcall_norets(H_QUERY_PORT,
+				      adapter_handle.handle, /* r4 */
+				      port_id,	             /* r5 */
+				      r_cb,	             /* r6 */
+				      0, 0, 0, 0);
+
+	if (ehca_debug_level >= 2)
+		ehca_dmp(query_port_response_block, 64, "response_block");
+
+	return ret;
+}
+
+u64 hipz_h_modify_port(const struct ipz_adapter_handle adapter_handle,
+		       const u8 port_id, const u32 port_cap,
+		       const u8 init_type, const int modify_mask)
+{
+	u64 port_attributes = port_cap;
+
+	if (modify_mask & IB_PORT_SHUTDOWN)
+		port_attributes |= EHCA_BMASK_SET(H_MP_SHUTDOWN, 1);
+	if (modify_mask & IB_PORT_INIT_TYPE)
+		port_attributes |= EHCA_BMASK_SET(H_MP_INIT_TYPE, init_type);
+	if (modify_mask & IB_PORT_RESET_QKEY_CNTR)
+		port_attributes |= EHCA_BMASK_SET(H_MP_RESET_QKEY_CTR, 1);
+
+	return ehca_plpar_hcall_norets(H_MODIFY_PORT,
+				       adapter_handle.handle, /* r4 */
+				       port_id,               /* r5 */
+				       port_attributes,       /* r6 */
+				       0, 0, 0, 0);
+}
+
+u64 hipz_h_query_hca(const struct ipz_adapter_handle adapter_handle,
+		     struct hipz_query_hca *query_hca_rblock)
+{
+	u64 r_cb = __pa(query_hca_rblock);
+
+	if (r_cb & (EHCA_PAGESIZE-1)) {
+		ehca_gen_err("response_block=%p not page aligned",
+			     query_hca_rblock);
+		return H_PARAMETER;
+	}
+
+	return ehca_plpar_hcall_norets(H_QUERY_HCA,
+				       adapter_handle.handle, /* r4 */
+				       r_cb,                  /* r5 */
+				       0, 0, 0, 0, 0);
+}
+
+u64 hipz_h_register_rpage(const struct ipz_adapter_handle adapter_handle,
+			  const u8 pagesize,
+			  const u8 queue_type,
+			  const u64 resource_handle,
+			  const u64 logical_address_of_page,
+			  u64 count)
+{
+	return ehca_plpar_hcall_norets(H_REGISTER_RPAGES,
+				       adapter_handle.handle,      /* r4  */
+				       (u64)queue_type | ((u64)pagesize) << 8,
+				       /* r5  */
+				       resource_handle,	           /* r6  */
+				       logical_address_of_page,    /* r7  */
+				       count,	                   /* r8  */
+				       0, 0);
+}
+
+u64 hipz_h_register_rpage_eq(const struct ipz_adapter_handle adapter_handle,
+			     const struct ipz_eq_handle eq_handle,
+			     struct ehca_pfeq *pfeq,
+			     const u8 pagesize,
+			     const u8 queue_type,
+			     const u64 logical_address_of_page,
+			     const u64 count)
+{
+	if (count != 1) {
+		ehca_gen_err("Ppage counter=%llx", count);
+		return H_PARAMETER;
+	}
+	return hipz_h_register_rpage(adapter_handle,
+				     pagesize,
+				     queue_type,
+				     eq_handle.handle,
+				     logical_address_of_page, count);
+}
+
+u64 hipz_h_query_int_state(const struct ipz_adapter_handle adapter_handle,
+			   u32 ist)
+{
+	u64 ret;
+	ret = ehca_plpar_hcall_norets(H_QUERY_INT_STATE,
+				      adapter_handle.handle, /* r4 */
+				      ist,                   /* r5 */
+				      0, 0, 0, 0, 0);
+
+	if (ret != H_SUCCESS && ret != H_BUSY)
+		ehca_gen_err("Could not query interrupt state.");
+
+	return ret;
+}
+
+u64 hipz_h_register_rpage_cq(const struct ipz_adapter_handle adapter_handle,
+			     const struct ipz_cq_handle cq_handle,
+			     struct ehca_pfcq *pfcq,
+			     const u8 pagesize,
+			     const u8 queue_type,
+			     const u64 logical_address_of_page,
+			     const u64 count,
+			     const struct h_galpa gal)
+{
+	if (count != 1) {
+		ehca_gen_err("Page counter=%llx", count);
+		return H_PARAMETER;
+	}
+
+	return hipz_h_register_rpage(adapter_handle, pagesize, queue_type,
+				     cq_handle.handle, logical_address_of_page,
+				     count);
+}
+
+u64 hipz_h_register_rpage_qp(const struct ipz_adapter_handle adapter_handle,
+			     const struct ipz_qp_handle qp_handle,
+			     struct ehca_pfqp *pfqp,
+			     const u8 pagesize,
+			     const u8 queue_type,
+			     const u64 logical_address_of_page,
+			     const u64 count,
+			     const struct h_galpa galpa)
+{
+	if (count > 1) {
+		ehca_gen_err("Page counter=%llx", count);
+		return H_PARAMETER;
+	}
+
+	return hipz_h_register_rpage(adapter_handle, pagesize, queue_type,
+				     qp_handle.handle, logical_address_of_page,
+				     count);
+}
+
+u64 hipz_h_disable_and_get_wqe(const struct ipz_adapter_handle adapter_handle,
+			       const struct ipz_qp_handle qp_handle,
+			       struct ehca_pfqp *pfqp,
+			       void **log_addr_next_sq_wqe2processed,
+			       void **log_addr_next_rq_wqe2processed,
+			       int dis_and_get_function_code)
+{
+	u64 ret;
+	unsigned long outs[PLPAR_HCALL9_BUFSIZE];
+
+	ret = ehca_plpar_hcall9(H_DISABLE_AND_GETC, outs,
+				adapter_handle.handle,     /* r4 */
+				dis_and_get_function_code, /* r5 */
+				qp_handle.handle,	   /* r6 */
+				0, 0, 0, 0, 0, 0);
+	if (log_addr_next_sq_wqe2processed)
+		*log_addr_next_sq_wqe2processed = (void *)outs[0];
+	if (log_addr_next_rq_wqe2processed)
+		*log_addr_next_rq_wqe2processed = (void *)outs[1];
+
+	return ret;
+}
+
+u64 hipz_h_modify_qp(const struct ipz_adapter_handle adapter_handle,
+		     const struct ipz_qp_handle qp_handle,
+		     struct ehca_pfqp *pfqp,
+		     const u64 update_mask,
+		     struct hcp_modify_qp_control_block *mqpcb,
+		     struct h_galpa gal)
+{
+	u64 ret;
+	unsigned long outs[PLPAR_HCALL9_BUFSIZE];
+	ret = ehca_plpar_hcall9(H_MODIFY_QP, outs,
+				adapter_handle.handle, /* r4 */
+				qp_handle.handle,      /* r5 */
+				update_mask,	       /* r6 */
+				__pa(mqpcb),	       /* r7 */
+				0, 0, 0, 0, 0);
+
+	if (ret == H_NOT_ENOUGH_RESOURCES)
+		ehca_gen_err("Insufficient resources ret=%lli", ret);
+
+	return ret;
+}
+
+u64 hipz_h_query_qp(const struct ipz_adapter_handle adapter_handle,
+		    const struct ipz_qp_handle qp_handle,
+		    struct ehca_pfqp *pfqp,
+		    struct hcp_modify_qp_control_block *qqpcb,
+		    struct h_galpa gal)
+{
+	return ehca_plpar_hcall_norets(H_QUERY_QP,
+				       adapter_handle.handle, /* r4 */
+				       qp_handle.handle,      /* r5 */
+				       __pa(qqpcb),	      /* r6 */
+				       0, 0, 0, 0);
+}
+
+u64 hipz_h_destroy_qp(const struct ipz_adapter_handle adapter_handle,
+		      struct ehca_qp *qp)
+{
+	u64 ret;
+	unsigned long outs[PLPAR_HCALL9_BUFSIZE];
+
+	ret = hcp_galpas_dtor(&qp->galpas);
+	if (ret) {
+		ehca_gen_err("Could not destruct qp->galpas");
+		return H_RESOURCE;
+	}
+	ret = ehca_plpar_hcall9(H_DISABLE_AND_GETC, outs,
+				adapter_handle.handle,     /* r4 */
+				/* function code */
+				1,	                   /* r5 */
+				qp->ipz_qp_handle.handle,  /* r6 */
+				0, 0, 0, 0, 0, 0);
+	if (ret == H_HARDWARE)
+		ehca_gen_err("HCA not operational. ret=%lli", ret);
+
+	ret = ehca_plpar_hcall_norets(H_FREE_RESOURCE,
+				      adapter_handle.handle,     /* r4 */
+				      qp->ipz_qp_handle.handle,  /* r5 */
+				      0, 0, 0, 0, 0);
+
+	if (ret == H_RESOURCE)
+		ehca_gen_err("Resource still in use. ret=%lli", ret);
+
+	return ret;
+}
+
+u64 hipz_h_define_aqp0(const struct ipz_adapter_handle adapter_handle,
+		       const struct ipz_qp_handle qp_handle,
+		       struct h_galpa gal,
+		       u32 port)
+{
+	return ehca_plpar_hcall_norets(H_DEFINE_AQP0,
+				       adapter_handle.handle, /* r4 */
+				       qp_handle.handle,      /* r5 */
+				       port,                  /* r6 */
+				       0, 0, 0, 0);
+}
+
+u64 hipz_h_define_aqp1(const struct ipz_adapter_handle adapter_handle,
+		       const struct ipz_qp_handle qp_handle,
+		       struct h_galpa gal,
+		       u32 port, u32 * pma_qp_nr,
+		       u32 * bma_qp_nr)
+{
+	u64 ret;
+	unsigned long outs[PLPAR_HCALL9_BUFSIZE];
+
+	ret = ehca_plpar_hcall9(H_DEFINE_AQP1, outs,
+				adapter_handle.handle, /* r4 */
+				qp_handle.handle,      /* r5 */
+				port,	               /* r6 */
+				0, 0, 0, 0, 0, 0);
+	*pma_qp_nr = (u32)outs[0];
+	*bma_qp_nr = (u32)outs[1];
+
+	if (ret == H_ALIAS_EXIST)
+		ehca_gen_err("AQP1 already exists. ret=%lli", ret);
+
+	return ret;
+}
+
+u64 hipz_h_attach_mcqp(const struct ipz_adapter_handle adapter_handle,
+		       const struct ipz_qp_handle qp_handle,
+		       struct h_galpa gal,
+		       u16 mcg_dlid,
+		       u64 subnet_prefix, u64 interface_id)
+{
+	u64 ret;
+
+	ret = ehca_plpar_hcall_norets(H_ATTACH_MCQP,
+				      adapter_handle.handle,  /* r4 */
+				      qp_handle.handle,       /* r5 */
+				      mcg_dlid,               /* r6 */
+				      interface_id,           /* r7 */
+				      subnet_prefix,          /* r8 */
+				      0, 0);
+
+	if (ret == H_NOT_ENOUGH_RESOURCES)
+		ehca_gen_err("Not enough resources. ret=%lli", ret);
+
+	return ret;
+}
+
+u64 hipz_h_detach_mcqp(const struct ipz_adapter_handle adapter_handle,
+		       const struct ipz_qp_handle qp_handle,
+		       struct h_galpa gal,
+		       u16 mcg_dlid,
+		       u64 subnet_prefix, u64 interface_id)
+{
+	return ehca_plpar_hcall_norets(H_DETACH_MCQP,
+				       adapter_handle.handle, /* r4 */
+				       qp_handle.handle,      /* r5 */
+				       mcg_dlid,              /* r6 */
+				       interface_id,          /* r7 */
+				       subnet_prefix,         /* r8 */
+				       0, 0);
+}
+
+u64 hipz_h_destroy_cq(const struct ipz_adapter_handle adapter_handle,
+		      struct ehca_cq *cq,
+		      u8 force_flag)
+{
+	u64 ret;
+
+	ret = hcp_galpas_dtor(&cq->galpas);
+	if (ret) {
+		ehca_gen_err("Could not destruct cp->galpas");
+		return H_RESOURCE;
+	}
+
+	ret = ehca_plpar_hcall_norets(H_FREE_RESOURCE,
+				      adapter_handle.handle,     /* r4 */
+				      cq->ipz_cq_handle.handle,  /* r5 */
+				      force_flag != 0 ? 1L : 0L, /* r6 */
+				      0, 0, 0, 0);
+
+	if (ret == H_RESOURCE)
+		ehca_gen_err("H_FREE_RESOURCE failed ret=%lli ", ret);
+
+	return ret;
+}
+
+u64 hipz_h_destroy_eq(const struct ipz_adapter_handle adapter_handle,
+		      struct ehca_eq *eq)
+{
+	u64 ret;
+
+	ret = hcp_galpas_dtor(&eq->galpas);
+	if (ret) {
+		ehca_gen_err("Could not destruct eq->galpas");
+		return H_RESOURCE;
+	}
+
+	ret = ehca_plpar_hcall_norets(H_FREE_RESOURCE,
+				      adapter_handle.handle,     /* r4 */
+				      eq->ipz_eq_handle.handle,  /* r5 */
+				      0, 0, 0, 0, 0);
+
+	if (ret == H_RESOURCE)
+		ehca_gen_err("Resource in use. ret=%lli ", ret);
+
+	return ret;
+}
+
+u64 hipz_h_alloc_resource_mr(const struct ipz_adapter_handle adapter_handle,
+			     const struct ehca_mr *mr,
+			     const u64 vaddr,
+			     const u64 length,
+			     const u32 access_ctrl,
+			     const struct ipz_pd pd,
+			     struct ehca_mr_hipzout_parms *outparms)
+{
+	u64 ret;
+	unsigned long outs[PLPAR_HCALL9_BUFSIZE];
+
+	ret = ehca_plpar_hcall9(H_ALLOC_RESOURCE, outs,
+				adapter_handle.handle,            /* r4 */
+				5,                                /* r5 */
+				vaddr,                            /* r6 */
+				length,                           /* r7 */
+				(((u64)access_ctrl) << 32ULL),    /* r8 */
+				pd.value,                         /* r9 */
+				0, 0, 0);
+	outparms->handle.handle = outs[0];
+	outparms->lkey = (u32)outs[2];
+	outparms->rkey = (u32)outs[3];
+
+	return ret;
+}
+
+u64 hipz_h_register_rpage_mr(const struct ipz_adapter_handle adapter_handle,
+			     const struct ehca_mr *mr,
+			     const u8 pagesize,
+			     const u8 queue_type,
+			     const u64 logical_address_of_page,
+			     const u64 count)
+{
+	u64 ret;
+
+	if (unlikely(ehca_debug_level >= 3)) {
+		if (count > 1) {
+			u64 *kpage;
+			int i;
+			kpage = __va(logical_address_of_page);
+			for (i = 0; i < count; i++)
+				ehca_gen_dbg("kpage[%d]=%p",
+					     i, (void *)kpage[i]);
+		} else
+			ehca_gen_dbg("kpage=%p",
+				     (void *)logical_address_of_page);
+	}
+
+	if ((count > 1) && (logical_address_of_page & (EHCA_PAGESIZE-1))) {
+		ehca_gen_err("logical_address_of_page not on a 4k boundary "
+			     "adapter_handle=%llx mr=%p mr_handle=%llx "
+			     "pagesize=%x queue_type=%x "
+			     "logical_address_of_page=%llx count=%llx",
+			     adapter_handle.handle, mr,
+			     mr->ipz_mr_handle.handle, pagesize, queue_type,
+			     logical_address_of_page, count);
+		ret = H_PARAMETER;
+	} else
+		ret = hipz_h_register_rpage(adapter_handle, pagesize,
+					    queue_type,
+					    mr->ipz_mr_handle.handle,
+					    logical_address_of_page, count);
+	return ret;
+}
+
+u64 hipz_h_query_mr(const struct ipz_adapter_handle adapter_handle,
+		    const struct ehca_mr *mr,
+		    struct ehca_mr_hipzout_parms *outparms)
+{
+	u64 ret;
+	unsigned long outs[PLPAR_HCALL9_BUFSIZE];
+
+	ret = ehca_plpar_hcall9(H_QUERY_MR, outs,
+				adapter_handle.handle,     /* r4 */
+				mr->ipz_mr_handle.handle,  /* r5 */
+				0, 0, 0, 0, 0, 0, 0);
+	outparms->len = outs[0];
+	outparms->vaddr = outs[1];
+	outparms->acl  = outs[4] >> 32;
+	outparms->lkey = (u32)(outs[5] >> 32);
+	outparms->rkey = (u32)(outs[5] & (0xffffffff));
+
+	return ret;
+}
+
+u64 hipz_h_free_resource_mr(const struct ipz_adapter_handle adapter_handle,
+			    const struct ehca_mr *mr)
+{
+	return ehca_plpar_hcall_norets(H_FREE_RESOURCE,
+				       adapter_handle.handle,    /* r4 */
+				       mr->ipz_mr_handle.handle, /* r5 */
+				       0, 0, 0, 0, 0);
+}
+
+u64 hipz_h_reregister_pmr(const struct ipz_adapter_handle adapter_handle,
+			  const struct ehca_mr *mr,
+			  const u64 vaddr_in,
+			  const u64 length,
+			  const u32 access_ctrl,
+			  const struct ipz_pd pd,
+			  const u64 mr_addr_cb,
+			  struct ehca_mr_hipzout_parms *outparms)
+{
+	u64 ret;
+	unsigned long outs[PLPAR_HCALL9_BUFSIZE];
+
+	ret = ehca_plpar_hcall9(H_REREGISTER_PMR, outs,
+				adapter_handle.handle,    /* r4 */
+				mr->ipz_mr_handle.handle, /* r5 */
+				vaddr_in,	          /* r6 */
+				length,                   /* r7 */
+				/* r8 */
+				((((u64)access_ctrl) << 32ULL) | pd.value),
+				mr_addr_cb,               /* r9 */
+				0, 0, 0);
+	outparms->vaddr = outs[1];
+	outparms->lkey = (u32)outs[2];
+	outparms->rkey = (u32)outs[3];
+
+	return ret;
+}
+
+u64 hipz_h_register_smr(const struct ipz_adapter_handle adapter_handle,
+			const struct ehca_mr *mr,
+			const struct ehca_mr *orig_mr,
+			const u64 vaddr_in,
+			const u32 access_ctrl,
+			const struct ipz_pd pd,
+			struct ehca_mr_hipzout_parms *outparms)
+{
+	u64 ret;
+	unsigned long outs[PLPAR_HCALL9_BUFSIZE];
+
+	ret = ehca_plpar_hcall9(H_REGISTER_SMR, outs,
+				adapter_handle.handle,            /* r4 */
+				orig_mr->ipz_mr_handle.handle,    /* r5 */
+				vaddr_in,                         /* r6 */
+				(((u64)access_ctrl) << 32ULL),    /* r7 */
+				pd.value,                         /* r8 */
+				0, 0, 0, 0);
+	outparms->handle.handle = outs[0];
+	outparms->lkey = (u32)outs[2];
+	outparms->rkey = (u32)outs[3];
+
+	return ret;
+}
+
+u64 hipz_h_alloc_resource_mw(const struct ipz_adapter_handle adapter_handle,
+			     const struct ehca_mw *mw,
+			     const struct ipz_pd pd,
+			     struct ehca_mw_hipzout_parms *outparms)
+{
+	u64 ret;
+	unsigned long outs[PLPAR_HCALL9_BUFSIZE];
+
+	ret = ehca_plpar_hcall9(H_ALLOC_RESOURCE, outs,
+				adapter_handle.handle,      /* r4 */
+				6,                          /* r5 */
+				pd.value,                   /* r6 */
+				0, 0, 0, 0, 0, 0);
+	outparms->handle.handle = outs[0];
+	outparms->rkey = (u32)outs[3];
+
+	return ret;
+}
+
+u64 hipz_h_query_mw(const struct ipz_adapter_handle adapter_handle,
+		    const struct ehca_mw *mw,
+		    struct ehca_mw_hipzout_parms *outparms)
+{
+	u64 ret;
+	unsigned long outs[PLPAR_HCALL9_BUFSIZE];
+
+	ret = ehca_plpar_hcall9(H_QUERY_MW, outs,
+				adapter_handle.handle,    /* r4 */
+				mw->ipz_mw_handle.handle, /* r5 */
+				0, 0, 0, 0, 0, 0, 0);
+	outparms->rkey = (u32)outs[3];
+
+	return ret;
+}
+
+u64 hipz_h_free_resource_mw(const struct ipz_adapter_handle adapter_handle,
+			    const struct ehca_mw *mw)
+{
+	return ehca_plpar_hcall_norets(H_FREE_RESOURCE,
+				       adapter_handle.handle,    /* r4 */
+				       mw->ipz_mw_handle.handle, /* r5 */
+				       0, 0, 0, 0, 0);
+}
+
+u64 hipz_h_error_data(const struct ipz_adapter_handle adapter_handle,
+		      const u64 ressource_handle,
+		      void *rblock,
+		      unsigned long *byte_count)
+{
+	u64 r_cb = __pa(rblock);
+
+	if (r_cb & (EHCA_PAGESIZE-1)) {
+		ehca_gen_err("rblock not page aligned.");
+		return H_PARAMETER;
+	}
+
+	return ehca_plpar_hcall_norets(H_ERROR_DATA,
+				       adapter_handle.handle,
+				       ressource_handle,
+				       r_cb,
+				       0, 0, 0, 0);
+}
+
+u64 hipz_h_eoi(int irq)
+{
+	unsigned long xirr;
+
+	iosync();
+	xirr = (0xffULL << 24) | irq;
+
+	return plpar_hcall_norets(H_EOI, xirr);
+}
diff --git a/drivers/infiniband/hw/ehca/hcp_if.h b/drivers/infiniband/hw/ehca/hcp_if.h
new file mode 100644
index 0000000..a46e514
--- /dev/null
+++ b/drivers/infiniband/hw/ehca/hcp_if.h
@@ -0,0 +1,265 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  Firmware Infiniband Interface code for POWER
+ *
+ *  Authors: Christoph Raisch <raisch@de.ibm.com>
+ *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ *           Gerd Bayer <gerd.bayer@de.ibm.com>
+ *           Waleri Fomin <fomin@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __HCP_IF_H__
+#define __HCP_IF_H__
+
+#include "ehca_classes.h"
+#include "ehca_tools.h"
+#include "hipz_hw.h"
+
+/*
+ * hipz_h_alloc_resource_eq allocates EQ resources in HW and FW, initialize
+ * resources, create the empty EQPT (ring).
+ */
+u64 hipz_h_alloc_resource_eq(const struct ipz_adapter_handle adapter_handle,
+			     struct ehca_pfeq *pfeq,
+			     const u32 neq_control,
+			     const u32 number_of_entries,
+			     struct ipz_eq_handle *eq_handle,
+			     u32 * act_nr_of_entries,
+			     u32 * act_pages,
+			     u32 * eq_ist);
+
+u64 hipz_h_reset_event(const struct ipz_adapter_handle adapter_handle,
+		       struct ipz_eq_handle eq_handle,
+		       const u64 event_mask);
+/*
+ * hipz_h_allocate_resource_cq allocates CQ resources in HW and FW, initialize
+ * resources, create the empty CQPT (ring).
+ */
+u64 hipz_h_alloc_resource_cq(const struct ipz_adapter_handle adapter_handle,
+			     struct ehca_cq *cq,
+			     struct ehca_alloc_cq_parms *param);
+
+
+/*
+ * hipz_h_alloc_resource_qp allocates QP resources in HW and FW,
+ * initialize resources, create empty QPPTs (2 rings).
+ */
+u64 hipz_h_alloc_resource_qp(const struct ipz_adapter_handle adapter_handle,
+			     struct ehca_alloc_qp_parms *parms, int is_user);
+
+u64 hipz_h_query_port(const struct ipz_adapter_handle adapter_handle,
+		      const u8 port_id,
+		      struct hipz_query_port *query_port_response_block);
+
+u64 hipz_h_modify_port(const struct ipz_adapter_handle adapter_handle,
+		       const u8 port_id, const u32 port_cap,
+		       const u8 init_type, const int modify_mask);
+
+u64 hipz_h_query_hca(const struct ipz_adapter_handle adapter_handle,
+		     struct hipz_query_hca *query_hca_rblock);
+
+/*
+ * hipz_h_register_rpage internal function in hcp_if.h for all
+ * hcp_H_REGISTER_RPAGE calls.
+ */
+u64 hipz_h_register_rpage(const struct ipz_adapter_handle adapter_handle,
+			  const u8 pagesize,
+			  const u8 queue_type,
+			  const u64 resource_handle,
+			  const u64 logical_address_of_page,
+			  u64 count);
+
+u64 hipz_h_register_rpage_eq(const struct ipz_adapter_handle adapter_handle,
+			     const struct ipz_eq_handle eq_handle,
+			     struct ehca_pfeq *pfeq,
+			     const u8 pagesize,
+			     const u8 queue_type,
+			     const u64 logical_address_of_page,
+			     const u64 count);
+
+u64 hipz_h_query_int_state(const struct ipz_adapter_handle
+			   hcp_adapter_handle,
+			   u32 ist);
+
+u64 hipz_h_register_rpage_cq(const struct ipz_adapter_handle adapter_handle,
+			     const struct ipz_cq_handle cq_handle,
+			     struct ehca_pfcq *pfcq,
+			     const u8 pagesize,
+			     const u8 queue_type,
+			     const u64 logical_address_of_page,
+			     const u64 count,
+			     const struct h_galpa gal);
+
+u64 hipz_h_register_rpage_qp(const struct ipz_adapter_handle adapter_handle,
+			     const struct ipz_qp_handle qp_handle,
+			     struct ehca_pfqp *pfqp,
+			     const u8 pagesize,
+			     const u8 queue_type,
+			     const u64 logical_address_of_page,
+			     const u64 count,
+			     const struct h_galpa galpa);
+
+u64 hipz_h_disable_and_get_wqe(const struct ipz_adapter_handle adapter_handle,
+			       const struct ipz_qp_handle qp_handle,
+			       struct ehca_pfqp *pfqp,
+			       void **log_addr_next_sq_wqe_tb_processed,
+			       void **log_addr_next_rq_wqe_tb_processed,
+			       int dis_and_get_function_code);
+enum hcall_sigt {
+	HCALL_SIGT_NO_CQE = 0,
+	HCALL_SIGT_BY_WQE = 1,
+	HCALL_SIGT_EVERY = 2
+};
+
+u64 hipz_h_modify_qp(const struct ipz_adapter_handle adapter_handle,
+		     const struct ipz_qp_handle qp_handle,
+		     struct ehca_pfqp *pfqp,
+		     const u64 update_mask,
+		     struct hcp_modify_qp_control_block *mqpcb,
+		     struct h_galpa gal);
+
+u64 hipz_h_query_qp(const struct ipz_adapter_handle adapter_handle,
+		    const struct ipz_qp_handle qp_handle,
+		    struct ehca_pfqp *pfqp,
+		    struct hcp_modify_qp_control_block *qqpcb,
+		    struct h_galpa gal);
+
+u64 hipz_h_destroy_qp(const struct ipz_adapter_handle adapter_handle,
+		      struct ehca_qp *qp);
+
+u64 hipz_h_define_aqp0(const struct ipz_adapter_handle adapter_handle,
+		       const struct ipz_qp_handle qp_handle,
+		       struct h_galpa gal,
+		       u32 port);
+
+u64 hipz_h_define_aqp1(const struct ipz_adapter_handle adapter_handle,
+		       const struct ipz_qp_handle qp_handle,
+		       struct h_galpa gal,
+		       u32 port, u32 * pma_qp_nr,
+		       u32 * bma_qp_nr);
+
+u64 hipz_h_attach_mcqp(const struct ipz_adapter_handle adapter_handle,
+		       const struct ipz_qp_handle qp_handle,
+		       struct h_galpa gal,
+		       u16 mcg_dlid,
+		       u64 subnet_prefix, u64 interface_id);
+
+u64 hipz_h_detach_mcqp(const struct ipz_adapter_handle adapter_handle,
+		       const struct ipz_qp_handle qp_handle,
+		       struct h_galpa gal,
+		       u16 mcg_dlid,
+		       u64 subnet_prefix, u64 interface_id);
+
+u64 hipz_h_destroy_cq(const struct ipz_adapter_handle adapter_handle,
+		      struct ehca_cq *cq,
+		      u8 force_flag);
+
+u64 hipz_h_destroy_eq(const struct ipz_adapter_handle adapter_handle,
+		      struct ehca_eq *eq);
+
+/*
+ * hipz_h_alloc_resource_mr allocates MR resources in HW and FW, initialize
+ * resources.
+ */
+u64 hipz_h_alloc_resource_mr(const struct ipz_adapter_handle adapter_handle,
+			     const struct ehca_mr *mr,
+			     const u64 vaddr,
+			     const u64 length,
+			     const u32 access_ctrl,
+			     const struct ipz_pd pd,
+			     struct ehca_mr_hipzout_parms *outparms);
+
+/* hipz_h_register_rpage_mr registers MR resource pages in HW and FW */
+u64 hipz_h_register_rpage_mr(const struct ipz_adapter_handle adapter_handle,
+			     const struct ehca_mr *mr,
+			     const u8 pagesize,
+			     const u8 queue_type,
+			     const u64 logical_address_of_page,
+			     const u64 count);
+
+/* hipz_h_query_mr queries MR in HW and FW */
+u64 hipz_h_query_mr(const struct ipz_adapter_handle adapter_handle,
+		    const struct ehca_mr *mr,
+		    struct ehca_mr_hipzout_parms *outparms);
+
+/* hipz_h_free_resource_mr frees MR resources in HW and FW */
+u64 hipz_h_free_resource_mr(const struct ipz_adapter_handle adapter_handle,
+			    const struct ehca_mr *mr);
+
+/* hipz_h_reregister_pmr reregisters MR in HW and FW */
+u64 hipz_h_reregister_pmr(const struct ipz_adapter_handle adapter_handle,
+			  const struct ehca_mr *mr,
+			  const u64 vaddr_in,
+			  const u64 length,
+			  const u32 access_ctrl,
+			  const struct ipz_pd pd,
+			  const u64 mr_addr_cb,
+			  struct ehca_mr_hipzout_parms *outparms);
+
+/* hipz_h_register_smr register shared MR in HW and FW */
+u64 hipz_h_register_smr(const struct ipz_adapter_handle adapter_handle,
+			const struct ehca_mr *mr,
+			const struct ehca_mr *orig_mr,
+			const u64 vaddr_in,
+			const u32 access_ctrl,
+			const struct ipz_pd pd,
+			struct ehca_mr_hipzout_parms *outparms);
+
+/*
+ * hipz_h_alloc_resource_mw allocates MW resources in HW and FW, initialize
+ * resources.
+ */
+u64 hipz_h_alloc_resource_mw(const struct ipz_adapter_handle adapter_handle,
+			     const struct ehca_mw *mw,
+			     const struct ipz_pd pd,
+			     struct ehca_mw_hipzout_parms *outparms);
+
+/* hipz_h_query_mw queries MW in HW and FW */
+u64 hipz_h_query_mw(const struct ipz_adapter_handle adapter_handle,
+		    const struct ehca_mw *mw,
+		    struct ehca_mw_hipzout_parms *outparms);
+
+/* hipz_h_free_resource_mw frees MW resources in HW and FW */
+u64 hipz_h_free_resource_mw(const struct ipz_adapter_handle adapter_handle,
+			    const struct ehca_mw *mw);
+
+u64 hipz_h_error_data(const struct ipz_adapter_handle adapter_handle,
+		      const u64 ressource_handle,
+		      void *rblock,
+		      unsigned long *byte_count);
+u64 hipz_h_eoi(int irq);
+
+#endif /* __HCP_IF_H__ */
diff --git a/drivers/infiniband/hw/ehca/hcp_phyp.c b/drivers/infiniband/hw/ehca/hcp_phyp.c
new file mode 100644
index 0000000..077376f
--- /dev/null
+++ b/drivers/infiniband/hw/ehca/hcp_phyp.c
@@ -0,0 +1,82 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *   load store abstraction for ehca register access with tracing
+ *
+ *  Authors: Christoph Raisch <raisch@de.ibm.com>
+ *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "ehca_classes.h"
+#include "hipz_hw.h"
+
+u64 hcall_map_page(u64 physaddr)
+{
+	return (u64)ioremap(physaddr, EHCA_PAGESIZE);
+}
+
+int hcall_unmap_page(u64 mapaddr)
+{
+	iounmap((volatile void __iomem *) mapaddr);
+	return 0;
+}
+
+int hcp_galpas_ctor(struct h_galpas *galpas, int is_user,
+		    u64 paddr_kernel, u64 paddr_user)
+{
+	if (!is_user) {
+		galpas->kernel.fw_handle = hcall_map_page(paddr_kernel);
+		if (!galpas->kernel.fw_handle)
+			return -ENOMEM;
+	} else
+		galpas->kernel.fw_handle = 0;
+
+	galpas->user.fw_handle = paddr_user;
+
+	return 0;
+}
+
+int hcp_galpas_dtor(struct h_galpas *galpas)
+{
+	if (galpas->kernel.fw_handle) {
+		int ret = hcall_unmap_page(galpas->kernel.fw_handle);
+		if (ret)
+			return ret;
+	}
+
+	galpas->user.fw_handle = galpas->kernel.fw_handle = 0;
+
+	return 0;
+}
diff --git a/drivers/infiniband/hw/ehca/hcp_phyp.h b/drivers/infiniband/hw/ehca/hcp_phyp.h
new file mode 100644
index 0000000..d1b0299
--- /dev/null
+++ b/drivers/infiniband/hw/ehca/hcp_phyp.h
@@ -0,0 +1,90 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  Firmware calls
+ *
+ *  Authors: Christoph Raisch <raisch@de.ibm.com>
+ *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ *           Waleri Fomin <fomin@de.ibm.com>
+ *           Gerd Bayer <gerd.bayer@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __HCP_PHYP_H__
+#define __HCP_PHYP_H__
+
+
+/*
+ * eHCA page (mapped into memory)
+ * resource to access eHCA register pages in CPU address space
+*/
+struct h_galpa {
+	u64 fw_handle;
+	/* for pSeries this is a 64bit memory address where
+	   I/O memory is mapped into CPU address space (kv) */
+};
+
+/*
+ * resource to access eHCA address space registers, all types
+ */
+struct h_galpas {
+	u32 pid;		/*PID of userspace galpa checking */
+	struct h_galpa user;	/* user space accessible resource,
+				   set to 0 if unused */
+	struct h_galpa kernel;	/* kernel space accessible resource,
+				   set to 0 if unused */
+};
+
+static inline u64 hipz_galpa_load(struct h_galpa galpa, u32 offset)
+{
+	u64 addr = galpa.fw_handle + offset;
+	return *(volatile u64 __force *)addr;
+}
+
+static inline void hipz_galpa_store(struct h_galpa galpa, u32 offset, u64 value)
+{
+	u64 addr = galpa.fw_handle + offset;
+	*(volatile u64 __force *)addr = value;
+}
+
+int hcp_galpas_ctor(struct h_galpas *galpas, int is_user,
+		    u64 paddr_kernel, u64 paddr_user);
+
+int hcp_galpas_dtor(struct h_galpas *galpas);
+
+u64 hcall_map_page(u64 physaddr);
+
+int hcall_unmap_page(u64 mapaddr);
+
+#endif
diff --git a/drivers/infiniband/hw/ehca/hipz_fns.h b/drivers/infiniband/hw/ehca/hipz_fns.h
new file mode 100644
index 0000000..9dac93d
--- /dev/null
+++ b/drivers/infiniband/hw/ehca/hipz_fns.h
@@ -0,0 +1,68 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  HW abstraction register functions
+ *
+ *  Authors: Christoph Raisch <raisch@de.ibm.com>
+ *           Reinhard Ernst <rernst@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __HIPZ_FNS_H__
+#define __HIPZ_FNS_H__
+
+#include "ehca_classes.h"
+#include "hipz_hw.h"
+
+#include "hipz_fns_core.h"
+
+#define hipz_galpa_store_eq(gal, offset, value) \
+	hipz_galpa_store(gal, EQTEMM_OFFSET(offset), value)
+
+#define hipz_galpa_load_eq(gal, offset) \
+	hipz_galpa_load(gal, EQTEMM_OFFSET(offset))
+
+#define hipz_galpa_store_qped(gal, offset, value) \
+	hipz_galpa_store(gal, QPEDMM_OFFSET(offset), value)
+
+#define hipz_galpa_load_qped(gal, offset) \
+	hipz_galpa_load(gal, QPEDMM_OFFSET(offset))
+
+#define hipz_galpa_store_mrmw(gal, offset, value) \
+	hipz_galpa_store(gal, MRMWMM_OFFSET(offset), value)
+
+#define hipz_galpa_load_mrmw(gal, offset) \
+	hipz_galpa_load(gal, MRMWMM_OFFSET(offset))
+
+#endif
diff --git a/drivers/infiniband/hw/ehca/hipz_fns_core.h b/drivers/infiniband/hw/ehca/hipz_fns_core.h
new file mode 100644
index 0000000..868735f
--- /dev/null
+++ b/drivers/infiniband/hw/ehca/hipz_fns_core.h
@@ -0,0 +1,100 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  HW abstraction register functions
+ *
+ *  Authors: Christoph Raisch <raisch@de.ibm.com>
+ *           Heiko J Schick <schickhj@de.ibm.com>
+ *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ *           Reinhard Ernst <rernst@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __HIPZ_FNS_CORE_H__
+#define __HIPZ_FNS_CORE_H__
+
+#include "hcp_phyp.h"
+#include "hipz_hw.h"
+
+#define hipz_galpa_store_cq(gal, offset, value) \
+	hipz_galpa_store(gal, CQTEMM_OFFSET(offset), value)
+
+#define hipz_galpa_load_cq(gal, offset) \
+	hipz_galpa_load(gal, CQTEMM_OFFSET(offset))
+
+#define hipz_galpa_store_qp(gal, offset, value) \
+	hipz_galpa_store(gal, QPTEMM_OFFSET(offset), value)
+#define hipz_galpa_load_qp(gal, offset) \
+	hipz_galpa_load(gal, QPTEMM_OFFSET(offset))
+
+static inline void hipz_update_sqa(struct ehca_qp *qp, u16 nr_wqes)
+{
+	/*  ringing doorbell :-) */
+	hipz_galpa_store_qp(qp->galpas.kernel, qpx_sqa,
+			    EHCA_BMASK_SET(QPX_SQADDER, nr_wqes));
+}
+
+static inline void hipz_update_rqa(struct ehca_qp *qp, u16 nr_wqes)
+{
+	/*  ringing doorbell :-) */
+	hipz_galpa_store_qp(qp->galpas.kernel, qpx_rqa,
+			    EHCA_BMASK_SET(QPX_RQADDER, nr_wqes));
+}
+
+static inline void hipz_update_feca(struct ehca_cq *cq, u32 nr_cqes)
+{
+	hipz_galpa_store_cq(cq->galpas.kernel, cqx_feca,
+			    EHCA_BMASK_SET(CQX_FECADDER, nr_cqes));
+}
+
+static inline void hipz_set_cqx_n0(struct ehca_cq *cq, u32 value)
+{
+	u64 cqx_n0_reg;
+
+	hipz_galpa_store_cq(cq->galpas.kernel, cqx_n0,
+			    EHCA_BMASK_SET(CQX_N0_GENERATE_SOLICITED_COMP_EVENT,
+					   value));
+	cqx_n0_reg = hipz_galpa_load_cq(cq->galpas.kernel, cqx_n0);
+}
+
+static inline void hipz_set_cqx_n1(struct ehca_cq *cq, u32 value)
+{
+	u64 cqx_n1_reg;
+
+	hipz_galpa_store_cq(cq->galpas.kernel, cqx_n1,
+			    EHCA_BMASK_SET(CQX_N1_GENERATE_COMP_EVENT, value));
+	cqx_n1_reg = hipz_galpa_load_cq(cq->galpas.kernel, cqx_n1);
+}
+
+#endif /* __HIPZ_FNC_CORE_H__ */
diff --git a/drivers/infiniband/hw/ehca/hipz_hw.h b/drivers/infiniband/hw/ehca/hipz_hw.h
new file mode 100644
index 0000000..bf996c7
--- /dev/null
+++ b/drivers/infiniband/hw/ehca/hipz_hw.h
@@ -0,0 +1,414 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  eHCA register definitions
+ *
+ *  Authors: Waleri Fomin <fomin@de.ibm.com>
+ *           Christoph Raisch <raisch@de.ibm.com>
+ *           Reinhard Ernst <rernst@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __HIPZ_HW_H__
+#define __HIPZ_HW_H__
+
+#include "ehca_tools.h"
+
+#define EHCA_MAX_MTU 4
+
+/* QP Table Entry Memory Map */
+struct hipz_qptemm {
+	u64 qpx_hcr;
+	u64 qpx_c;
+	u64 qpx_herr;
+	u64 qpx_aer;
+/* 0x20*/
+	u64 qpx_sqa;
+	u64 qpx_sqc;
+	u64 qpx_rqa;
+	u64 qpx_rqc;
+/* 0x40*/
+	u64 qpx_st;
+	u64 qpx_pmstate;
+	u64 qpx_pmfa;
+	u64 qpx_pkey;
+/* 0x60*/
+	u64 qpx_pkeya;
+	u64 qpx_pkeyb;
+	u64 qpx_pkeyc;
+	u64 qpx_pkeyd;
+/* 0x80*/
+	u64 qpx_qkey;
+	u64 qpx_dqp;
+	u64 qpx_dlidp;
+	u64 qpx_portp;
+/* 0xa0*/
+	u64 qpx_slidp;
+	u64 qpx_slidpp;
+	u64 qpx_dlida;
+	u64 qpx_porta;
+/* 0xc0*/
+	u64 qpx_slida;
+	u64 qpx_slidpa;
+	u64 qpx_slvl;
+	u64 qpx_ipd;
+/* 0xe0*/
+	u64 qpx_mtu;
+	u64 qpx_lato;
+	u64 qpx_rlimit;
+	u64 qpx_rnrlimit;
+/* 0x100*/
+	u64 qpx_t;
+	u64 qpx_sqhp;
+	u64 qpx_sqptp;
+	u64 qpx_nspsn;
+/* 0x120*/
+	u64 qpx_nspsnhwm;
+	u64 reserved1;
+	u64 qpx_sdsi;
+	u64 qpx_sdsbc;
+/* 0x140*/
+	u64 qpx_sqwsize;
+	u64 qpx_sqwts;
+	u64 qpx_lsn;
+	u64 qpx_nssn;
+/* 0x160 */
+	u64 qpx_mor;
+	u64 qpx_cor;
+	u64 qpx_sqsize;
+	u64 qpx_erc;
+/* 0x180*/
+	u64 qpx_rnrrc;
+	u64 qpx_ernrwt;
+	u64 qpx_rnrresp;
+	u64 qpx_lmsna;
+/* 0x1a0 */
+	u64 qpx_sqhpc;
+	u64 qpx_sqcptp;
+	u64 qpx_sigt;
+	u64 qpx_wqecnt;
+/* 0x1c0*/
+	u64 qpx_rqhp;
+	u64 qpx_rqptp;
+	u64 qpx_rqsize;
+	u64 qpx_nrr;
+/* 0x1e0*/
+	u64 qpx_rdmac;
+	u64 qpx_nrpsn;
+	u64 qpx_lapsn;
+	u64 qpx_lcr;
+/* 0x200*/
+	u64 qpx_rwc;
+	u64 qpx_rwva;
+	u64 qpx_rdsi;
+	u64 qpx_rdsbc;
+/* 0x220*/
+	u64 qpx_rqwsize;
+	u64 qpx_crmsn;
+	u64 qpx_rdd;
+	u64 qpx_larpsn;
+/* 0x240*/
+	u64 qpx_pd;
+	u64 qpx_scqn;
+	u64 qpx_rcqn;
+	u64 qpx_aeqn;
+/* 0x260*/
+	u64 qpx_aaelog;
+	u64 qpx_ram;
+	u64 qpx_rdmaqe0;
+	u64 qpx_rdmaqe1;
+/* 0x280*/
+	u64 qpx_rdmaqe2;
+	u64 qpx_rdmaqe3;
+	u64 qpx_nrpsnhwm;
+/* 0x298*/
+	u64 reserved[(0x400 - 0x298) / 8];
+/* 0x400 extended data */
+	u64 reserved_ext[(0x500 - 0x400) / 8];
+/* 0x500 */
+	u64 reserved2[(0x1000 - 0x500) / 8];
+/* 0x1000      */
+};
+
+#define QPX_SQADDER EHCA_BMASK_IBM(48, 63)
+#define QPX_RQADDER EHCA_BMASK_IBM(48, 63)
+#define QPX_AAELOG_RESET_SRQ_LIMIT EHCA_BMASK_IBM(3, 3)
+
+#define QPTEMM_OFFSET(x) offsetof(struct hipz_qptemm, x)
+
+/* MRMWPT Entry Memory Map */
+struct hipz_mrmwmm {
+	/* 0x00 */
+	u64 mrx_hcr;
+
+	u64 mrx_c;
+	u64 mrx_herr;
+	u64 mrx_aer;
+	/* 0x20 */
+	u64 mrx_pp;
+	u64 reserved1;
+	u64 reserved2;
+	u64 reserved3;
+	/* 0x40 */
+	u64 reserved4[(0x200 - 0x40) / 8];
+	/* 0x200 */
+	u64 mrx_ctl[64];
+
+};
+
+#define MRMWMM_OFFSET(x) offsetof(struct hipz_mrmwmm, x)
+
+struct hipz_qpedmm {
+	/* 0x00 */
+	u64 reserved0[(0x400) / 8];
+	/* 0x400 */
+	u64 qpedx_phh;
+	u64 qpedx_ppsgp;
+	/* 0x410 */
+	u64 qpedx_ppsgu;
+	u64 qpedx_ppdgp;
+	/* 0x420 */
+	u64 qpedx_ppdgu;
+	u64 qpedx_aph;
+	/* 0x430 */
+	u64 qpedx_apsgp;
+	u64 qpedx_apsgu;
+	/* 0x440 */
+	u64 qpedx_apdgp;
+	u64 qpedx_apdgu;
+	/* 0x450 */
+	u64 qpedx_apav;
+	u64 qpedx_apsav;
+	/* 0x460  */
+	u64 qpedx_hcr;
+	u64 reserved1[4];
+	/* 0x488 */
+	u64 qpedx_rrl0;
+	/* 0x490 */
+	u64 qpedx_rrrkey0;
+	u64 qpedx_rrva0;
+	/* 0x4a0 */
+	u64 reserved2;
+	u64 qpedx_rrl1;
+	/* 0x4b0 */
+	u64 qpedx_rrrkey1;
+	u64 qpedx_rrva1;
+	/* 0x4c0 */
+	u64 reserved3;
+	u64 qpedx_rrl2;
+	/* 0x4d0 */
+	u64 qpedx_rrrkey2;
+	u64 qpedx_rrva2;
+	/* 0x4e0 */
+	u64 reserved4;
+	u64 qpedx_rrl3;
+	/* 0x4f0 */
+	u64 qpedx_rrrkey3;
+	u64 qpedx_rrva3;
+};
+
+#define QPEDMM_OFFSET(x) offsetof(struct hipz_qpedmm, x)
+
+/* CQ Table Entry Memory Map */
+struct hipz_cqtemm {
+	u64 cqx_hcr;
+	u64 cqx_c;
+	u64 cqx_herr;
+	u64 cqx_aer;
+/* 0x20  */
+	u64 cqx_ptp;
+	u64 cqx_tp;
+	u64 cqx_fec;
+	u64 cqx_feca;
+/* 0x40  */
+	u64 cqx_ep;
+	u64 cqx_eq;
+/* 0x50  */
+	u64 reserved1;
+	u64 cqx_n0;
+/* 0x60  */
+	u64 cqx_n1;
+	u64 reserved2[(0x1000 - 0x60) / 8];
+/* 0x1000 */
+};
+
+#define CQX_FEC_CQE_CNT           EHCA_BMASK_IBM(32, 63)
+#define CQX_FECADDER              EHCA_BMASK_IBM(32, 63)
+#define CQX_N0_GENERATE_SOLICITED_COMP_EVENT EHCA_BMASK_IBM(0, 0)
+#define CQX_N1_GENERATE_COMP_EVENT EHCA_BMASK_IBM(0, 0)
+
+#define CQTEMM_OFFSET(x) offsetof(struct hipz_cqtemm, x)
+
+/* EQ Table Entry Memory Map */
+struct hipz_eqtemm {
+	u64 eqx_hcr;
+	u64 eqx_c;
+
+	u64 eqx_herr;
+	u64 eqx_aer;
+/* 0x20 */
+	u64 eqx_ptp;
+	u64 eqx_tp;
+	u64 eqx_ssba;
+	u64 eqx_psba;
+
+/* 0x40 */
+	u64 eqx_cec;
+	u64 eqx_meql;
+	u64 eqx_xisbi;
+	u64 eqx_xisc;
+/* 0x60 */
+	u64 eqx_it;
+
+};
+
+#define EQTEMM_OFFSET(x) offsetof(struct hipz_eqtemm, x)
+
+/* access control defines for MR/MW */
+#define HIPZ_ACCESSCTRL_L_WRITE  0x00800000
+#define HIPZ_ACCESSCTRL_R_WRITE  0x00400000
+#define HIPZ_ACCESSCTRL_R_READ   0x00200000
+#define HIPZ_ACCESSCTRL_R_ATOMIC 0x00100000
+#define HIPZ_ACCESSCTRL_MW_BIND  0x00080000
+
+/* query hca response block */
+struct hipz_query_hca {
+	u32 cur_reliable_dg;
+	u32 cur_qp;
+	u32 cur_cq;
+	u32 cur_eq;
+	u32 cur_mr;
+	u32 cur_mw;
+	u32 cur_ee_context;
+	u32 cur_mcast_grp;
+	u32 cur_qp_attached_mcast_grp;
+	u32 reserved1;
+	u32 cur_ipv6_qp;
+	u32 cur_eth_qp;
+	u32 cur_hp_mr;
+	u32 reserved2[3];
+	u32 max_rd_domain;
+	u32 max_qp;
+	u32 max_cq;
+	u32 max_eq;
+	u32 max_mr;
+	u32 max_hp_mr;
+	u32 max_mw;
+	u32 max_mrwpte;
+	u32 max_special_mrwpte;
+	u32 max_rd_ee_context;
+	u32 max_mcast_grp;
+	u32 max_total_mcast_qp_attach;
+	u32 max_mcast_qp_attach;
+	u32 max_raw_ipv6_qp;
+	u32 max_raw_ethy_qp;
+	u32 internal_clock_frequency;
+	u32 max_pd;
+	u32 max_ah;
+	u32 max_cqe;
+	u32 max_wqes_wq;
+	u32 max_partitions;
+	u32 max_rr_ee_context;
+	u32 max_rr_qp;
+	u32 max_rr_hca;
+	u32 max_act_wqs_ee_context;
+	u32 max_act_wqs_qp;
+	u32 max_sge;
+	u32 max_sge_rd;
+	u32 memory_page_size_supported;
+	u64 max_mr_size;
+	u32 local_ca_ack_delay;
+	u32 num_ports;
+	u32 vendor_id;
+	u32 vendor_part_id;
+	u32 hw_ver;
+	u64 node_guid;
+	u64 hca_cap_indicators;
+	u32 data_counter_register_size;
+	u32 max_shared_rq;
+	u32 max_isns_eq;
+	u32 max_neq;
+} __attribute__ ((packed));
+
+#define HCA_CAP_AH_PORT_NR_CHECK      EHCA_BMASK_IBM( 0,  0)
+#define HCA_CAP_ATOMIC                EHCA_BMASK_IBM( 1,  1)
+#define HCA_CAP_AUTO_PATH_MIG         EHCA_BMASK_IBM( 2,  2)
+#define HCA_CAP_BAD_P_KEY_CTR         EHCA_BMASK_IBM( 3,  3)
+#define HCA_CAP_SQD_RTS_PORT_CHANGE   EHCA_BMASK_IBM( 4,  4)
+#define HCA_CAP_CUR_QP_STATE_MOD      EHCA_BMASK_IBM( 5,  5)
+#define HCA_CAP_INIT_TYPE             EHCA_BMASK_IBM( 6,  6)
+#define HCA_CAP_PORT_ACTIVE_EVENT     EHCA_BMASK_IBM( 7,  7)
+#define HCA_CAP_Q_KEY_VIOL_CTR        EHCA_BMASK_IBM( 8,  8)
+#define HCA_CAP_WQE_RESIZE            EHCA_BMASK_IBM( 9,  9)
+#define HCA_CAP_RAW_PACKET_MCAST      EHCA_BMASK_IBM(10, 10)
+#define HCA_CAP_SHUTDOWN_PORT         EHCA_BMASK_IBM(11, 11)
+#define HCA_CAP_RC_LL_QP              EHCA_BMASK_IBM(12, 12)
+#define HCA_CAP_SRQ                   EHCA_BMASK_IBM(13, 13)
+#define HCA_CAP_UD_LL_QP              EHCA_BMASK_IBM(16, 16)
+#define HCA_CAP_RESIZE_MR             EHCA_BMASK_IBM(17, 17)
+#define HCA_CAP_MINI_QP               EHCA_BMASK_IBM(18, 18)
+#define HCA_CAP_H_ALLOC_RES_SYNC      EHCA_BMASK_IBM(19, 19)
+
+/* query port response block */
+struct hipz_query_port {
+	u32 state;
+	u32 bad_pkey_cntr;
+	u32 lmc;
+	u32 lid;
+	u32 subnet_timeout;
+	u32 qkey_viol_cntr;
+	u32 sm_sl;
+	u32 sm_lid;
+	u32 capability_mask;
+	u32 init_type_reply;
+	u32 pkey_tbl_len;
+	u32 gid_tbl_len;
+	u64 gid_prefix;
+	u32 port_nr;
+	u16 pkey_entries[16];
+	u8  reserved1[32];
+	u32 trent_size;
+	u32 trbuf_size;
+	u64 max_msg_sz;
+	u32 max_mtu;
+	u32 vl_cap;
+	u32 phys_pstate;
+	u32 phys_state;
+	u32 phys_speed;
+	u32 phys_width;
+	u8  reserved2[1884];
+	u64 guid_entries[255];
+} __attribute__ ((packed));
+
+#endif
diff --git a/drivers/infiniband/hw/ehca/ipz_pt_fn.c b/drivers/infiniband/hw/ehca/ipz_pt_fn.c
new file mode 100644
index 0000000..8d59451
--- /dev/null
+++ b/drivers/infiniband/hw/ehca/ipz_pt_fn.c
@@ -0,0 +1,295 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  internal queue handling
+ *
+ *  Authors: Waleri Fomin <fomin@de.ibm.com>
+ *           Reinhard Ernst <rernst@de.ibm.com>
+ *           Christoph Raisch <raisch@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/slab.h>
+
+#include "ehca_tools.h"
+#include "ipz_pt_fn.h"
+#include "ehca_classes.h"
+
+#define PAGES_PER_KPAGE (PAGE_SIZE >> EHCA_PAGESHIFT)
+
+struct kmem_cache *small_qp_cache;
+
+void *ipz_qpageit_get_inc(struct ipz_queue *queue)
+{
+	void *ret = ipz_qeit_get(queue);
+	queue->current_q_offset += queue->pagesize;
+	if (queue->current_q_offset > queue->queue_length) {
+		queue->current_q_offset -= queue->pagesize;
+		ret = NULL;
+	}
+	if (((u64)ret) % queue->pagesize) {
+		ehca_gen_err("ERROR!! not at PAGE-Boundary");
+		return NULL;
+	}
+	return ret;
+}
+
+void *ipz_qeit_eq_get_inc(struct ipz_queue *queue)
+{
+	void *ret = ipz_qeit_get(queue);
+	u64 last_entry_in_q = queue->queue_length - queue->qe_size;
+
+	queue->current_q_offset += queue->qe_size;
+	if (queue->current_q_offset > last_entry_in_q) {
+		queue->current_q_offset = 0;
+		queue->toggle_state = (~queue->toggle_state) & 1;
+	}
+
+	return ret;
+}
+
+int ipz_queue_abs_to_offset(struct ipz_queue *queue, u64 addr, u64 *q_offset)
+{
+	int i;
+	for (i = 0; i < queue->queue_length / queue->pagesize; i++) {
+		u64 page = __pa(queue->queue_pages[i]);
+		if (addr >= page && addr < page + queue->pagesize) {
+			*q_offset = addr - page + i * queue->pagesize;
+			return 0;
+		}
+	}
+	return -EINVAL;
+}
+
+#if PAGE_SHIFT < EHCA_PAGESHIFT
+#error Kernel pages must be at least as large than eHCA pages (4K) !
+#endif
+
+/*
+ * allocate pages for queue:
+ * outer loop allocates whole kernel pages (page aligned) and
+ * inner loop divides a kernel page into smaller hca queue pages
+ */
+static int alloc_queue_pages(struct ipz_queue *queue, const u32 nr_of_pages)
+{
+	int k, f = 0;
+	u8 *kpage;
+
+	while (f < nr_of_pages) {
+		kpage = (u8 *)get_zeroed_page(GFP_KERNEL);
+		if (!kpage)
+			goto out;
+
+		for (k = 0; k < PAGES_PER_KPAGE && f < nr_of_pages; k++) {
+			queue->queue_pages[f] = (struct ipz_page *)kpage;
+			kpage += EHCA_PAGESIZE;
+			f++;
+		}
+	}
+	return 1;
+
+out:
+	for (f = 0; f < nr_of_pages && queue->queue_pages[f];
+	     f += PAGES_PER_KPAGE)
+		free_page((unsigned long)(queue->queue_pages)[f]);
+	return 0;
+}
+
+static int alloc_small_queue_page(struct ipz_queue *queue, struct ehca_pd *pd)
+{
+	int order = ilog2(queue->pagesize) - 9;
+	struct ipz_small_queue_page *page;
+	unsigned long bit;
+
+	mutex_lock(&pd->lock);
+
+	if (!list_empty(&pd->free[order]))
+		page = list_entry(pd->free[order].next,
+				  struct ipz_small_queue_page, list);
+	else {
+		page = kmem_cache_zalloc(small_qp_cache, GFP_KERNEL);
+		if (!page)
+			goto out;
+
+		page->page = get_zeroed_page(GFP_KERNEL);
+		if (!page->page) {
+			kmem_cache_free(small_qp_cache, page);
+			goto out;
+		}
+
+		list_add(&page->list, &pd->free[order]);
+	}
+
+	bit = find_first_zero_bit(page->bitmap, IPZ_SPAGE_PER_KPAGE >> order);
+	__set_bit(bit, page->bitmap);
+	page->fill++;
+
+	if (page->fill == IPZ_SPAGE_PER_KPAGE >> order)
+		list_move(&page->list, &pd->full[order]);
+
+	mutex_unlock(&pd->lock);
+
+	queue->queue_pages[0] = (void *)(page->page | (bit << (order + 9)));
+	queue->small_page = page;
+	queue->offset = bit << (order + 9);
+	return 1;
+
+out:
+	ehca_err(pd->ib_pd.device, "failed to allocate small queue page");
+	mutex_unlock(&pd->lock);
+	return 0;
+}
+
+static void free_small_queue_page(struct ipz_queue *queue, struct ehca_pd *pd)
+{
+	int order = ilog2(queue->pagesize) - 9;
+	struct ipz_small_queue_page *page = queue->small_page;
+	unsigned long bit;
+	int free_page = 0;
+
+	bit = ((unsigned long)queue->queue_pages[0] & ~PAGE_MASK)
+		>> (order + 9);
+
+	mutex_lock(&pd->lock);
+
+	__clear_bit(bit, page->bitmap);
+	page->fill--;
+
+	if (page->fill == 0) {
+		list_del(&page->list);
+		free_page = 1;
+	}
+
+	if (page->fill == (IPZ_SPAGE_PER_KPAGE >> order) - 1)
+		/* the page was full until we freed the chunk */
+		list_move_tail(&page->list, &pd->free[order]);
+
+	mutex_unlock(&pd->lock);
+
+	if (free_page) {
+		free_page(page->page);
+		kmem_cache_free(small_qp_cache, page);
+	}
+}
+
+int ipz_queue_ctor(struct ehca_pd *pd, struct ipz_queue *queue,
+		   const u32 nr_of_pages, const u32 pagesize,
+		   const u32 qe_size, const u32 nr_of_sg,
+		   int is_small)
+{
+	if (pagesize > PAGE_SIZE) {
+		ehca_gen_err("FATAL ERROR: pagesize=%x "
+			     "is greater than kernel page size", pagesize);
+		return 0;
+	}
+
+	/* init queue fields */
+	queue->queue_length = nr_of_pages * pagesize;
+	queue->pagesize = pagesize;
+	queue->qe_size = qe_size;
+	queue->act_nr_of_sg = nr_of_sg;
+	queue->current_q_offset = 0;
+	queue->toggle_state = 1;
+	queue->small_page = NULL;
+
+	/* allocate queue page pointers */
+	queue->queue_pages = kzalloc(nr_of_pages * sizeof(void *),
+				     GFP_KERNEL | __GFP_NOWARN);
+	if (!queue->queue_pages) {
+		queue->queue_pages = vzalloc(nr_of_pages * sizeof(void *));
+		if (!queue->queue_pages) {
+			ehca_gen_err("Couldn't allocate queue page list");
+			return 0;
+		}
+	}
+
+	/* allocate actual queue pages */
+	if (is_small) {
+		if (!alloc_small_queue_page(queue, pd))
+			goto ipz_queue_ctor_exit0;
+	} else
+		if (!alloc_queue_pages(queue, nr_of_pages))
+			goto ipz_queue_ctor_exit0;
+
+	return 1;
+
+ipz_queue_ctor_exit0:
+	ehca_gen_err("Couldn't alloc pages queue=%p "
+		 "nr_of_pages=%x",  queue, nr_of_pages);
+	if (is_vmalloc_addr(queue->queue_pages))
+		vfree(queue->queue_pages);
+	else
+		kfree(queue->queue_pages);
+
+	return 0;
+}
+
+int ipz_queue_dtor(struct ehca_pd *pd, struct ipz_queue *queue)
+{
+	int i, nr_pages;
+
+	if (!queue || !queue->queue_pages) {
+		ehca_gen_dbg("queue or queue_pages is NULL");
+		return 0;
+	}
+
+	if (queue->small_page)
+		free_small_queue_page(queue, pd);
+	else {
+		nr_pages = queue->queue_length / queue->pagesize;
+		for (i = 0; i < nr_pages; i += PAGES_PER_KPAGE)
+			free_page((unsigned long)queue->queue_pages[i]);
+	}
+
+	if (is_vmalloc_addr(queue->queue_pages))
+		vfree(queue->queue_pages);
+	else
+		kfree(queue->queue_pages);
+
+	return 1;
+}
+
+int ehca_init_small_qp_cache(void)
+{
+	small_qp_cache = kmem_cache_create("ehca_cache_small_qp",
+					   sizeof(struct ipz_small_queue_page),
+					   0, SLAB_HWCACHE_ALIGN, NULL);
+	if (!small_qp_cache)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void ehca_cleanup_small_qp_cache(void)
+{
+	kmem_cache_destroy(small_qp_cache);
+}
diff --git a/drivers/infiniband/hw/ehca/ipz_pt_fn.h b/drivers/infiniband/hw/ehca/ipz_pt_fn.h
new file mode 100644
index 0000000..a801274
--- /dev/null
+++ b/drivers/infiniband/hw/ehca/ipz_pt_fn.h
@@ -0,0 +1,289 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  internal queue handling
+ *
+ *  Authors: Waleri Fomin <fomin@de.ibm.com>
+ *           Reinhard Ernst <rernst@de.ibm.com>
+ *           Christoph Raisch <raisch@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __IPZ_PT_FN_H__
+#define __IPZ_PT_FN_H__
+
+#define EHCA_PAGESHIFT   12
+#define EHCA_PAGESIZE   4096UL
+#define EHCA_PAGEMASK   (~(EHCA_PAGESIZE-1))
+#define EHCA_PT_ENTRIES 512UL
+
+#include "ehca_tools.h"
+#include "ehca_qes.h"
+
+struct ehca_pd;
+struct ipz_small_queue_page;
+
+extern struct kmem_cache *small_qp_cache;
+
+/* struct generic ehca page */
+struct ipz_page {
+	u8 entries[EHCA_PAGESIZE];
+};
+
+#define IPZ_SPAGE_PER_KPAGE (PAGE_SIZE / 512)
+
+struct ipz_small_queue_page {
+	unsigned long page;
+	unsigned long bitmap[IPZ_SPAGE_PER_KPAGE / BITS_PER_LONG];
+	int fill;
+	void *mapped_addr;
+	u32 mmap_count;
+	struct list_head list;
+};
+
+/* struct generic queue in linux kernel virtual memory (kv) */
+struct ipz_queue {
+	u64 current_q_offset;	/* current queue entry */
+
+	struct ipz_page **queue_pages;	/* array of pages belonging to queue */
+	u32 qe_size;		/* queue entry size */
+	u32 act_nr_of_sg;
+	u32 queue_length;	/* queue length allocated in bytes */
+	u32 pagesize;
+	u32 toggle_state;	/* toggle flag - per page */
+	u32 offset; /* save offset within page for small_qp */
+	struct ipz_small_queue_page *small_page;
+};
+
+/*
+ * return current Queue Entry for a certain q_offset
+ * returns address (kv) of Queue Entry
+ */
+static inline void *ipz_qeit_calc(struct ipz_queue *queue, u64 q_offset)
+{
+	struct ipz_page *current_page;
+	if (q_offset >= queue->queue_length)
+		return NULL;
+	current_page = (queue->queue_pages)[q_offset >> EHCA_PAGESHIFT];
+	return &current_page->entries[q_offset & (EHCA_PAGESIZE - 1)];
+}
+
+/*
+ * return current Queue Entry
+ * returns address (kv) of Queue Entry
+ */
+static inline void *ipz_qeit_get(struct ipz_queue *queue)
+{
+	return ipz_qeit_calc(queue, queue->current_q_offset);
+}
+
+/*
+ * return current Queue Page , increment Queue Page iterator from
+ * page to page in struct ipz_queue, last increment will return 0! and
+ * NOT wrap
+ * returns address (kv) of Queue Page
+ * warning don't use in parallel with ipz_QE_get_inc()
+ */
+void *ipz_qpageit_get_inc(struct ipz_queue *queue);
+
+/*
+ * return current Queue Entry, increment Queue Entry iterator by one
+ * step in struct ipz_queue, will wrap in ringbuffer
+ * returns address (kv) of Queue Entry BEFORE increment
+ * warning don't use in parallel with ipz_qpageit_get_inc()
+ */
+static inline void *ipz_qeit_get_inc(struct ipz_queue *queue)
+{
+	void *ret = ipz_qeit_get(queue);
+	queue->current_q_offset += queue->qe_size;
+	if (queue->current_q_offset >= queue->queue_length) {
+		queue->current_q_offset = 0;
+		/* toggle the valid flag */
+		queue->toggle_state = (~queue->toggle_state) & 1;
+	}
+
+	return ret;
+}
+
+/*
+ * return a bool indicating whether current Queue Entry is valid
+ */
+static inline int ipz_qeit_is_valid(struct ipz_queue *queue)
+{
+	struct ehca_cqe *cqe = ipz_qeit_get(queue);
+	return ((cqe->cqe_flags >> 7) == (queue->toggle_state & 1));
+}
+
+/*
+ * return current Queue Entry, increment Queue Entry iterator by one
+ * step in struct ipz_queue, will wrap in ringbuffer
+ * returns address (kv) of Queue Entry BEFORE increment
+ * returns 0 and does not increment, if wrong valid state
+ * warning don't use in parallel with ipz_qpageit_get_inc()
+ */
+static inline void *ipz_qeit_get_inc_valid(struct ipz_queue *queue)
+{
+	return ipz_qeit_is_valid(queue) ? ipz_qeit_get_inc(queue) : NULL;
+}
+
+/*
+ * returns and resets Queue Entry iterator
+ * returns address (kv) of first Queue Entry
+ */
+static inline void *ipz_qeit_reset(struct ipz_queue *queue)
+{
+	queue->current_q_offset = 0;
+	return ipz_qeit_get(queue);
+}
+
+/*
+ * return the q_offset corresponding to an absolute address
+ */
+int ipz_queue_abs_to_offset(struct ipz_queue *queue, u64 addr, u64 *q_offset);
+
+/*
+ * return the next queue offset. don't modify the queue.
+ */
+static inline u64 ipz_queue_advance_offset(struct ipz_queue *queue, u64 offset)
+{
+	offset += queue->qe_size;
+	if (offset >= queue->queue_length) offset = 0;
+	return offset;
+}
+
+/* struct generic page table */
+struct ipz_pt {
+	u64 entries[EHCA_PT_ENTRIES];
+};
+
+/* struct page table for a queue, only to be used in pf */
+struct ipz_qpt {
+	/* queue page tables (kv), use u64 because we know the element length */
+	u64 *qpts;
+	u32 n_qpts;
+	u32 n_ptes;       /*  number of page table entries */
+	u64 *current_pte_addr;
+};
+
+/*
+ * constructor for a ipz_queue_t, placement new for ipz_queue_t,
+ * new for all dependent datastructors
+ * all QP Tables are the same
+ * flow:
+ *    allocate+pin queue
+ * see ipz_qpt_ctor()
+ * returns true if ok, false if out of memory
+ */
+int ipz_queue_ctor(struct ehca_pd *pd, struct ipz_queue *queue,
+		   const u32 nr_of_pages, const u32 pagesize,
+		   const u32 qe_size, const u32 nr_of_sg,
+		   int is_small);
+
+/*
+ * destructor for a ipz_queue_t
+ *  -# free queue
+ *  see ipz_queue_ctor()
+ *  returns true if ok, false if queue was NULL-ptr of free failed
+ */
+int ipz_queue_dtor(struct ehca_pd *pd, struct ipz_queue *queue);
+
+/*
+ * constructor for a ipz_qpt_t,
+ * placement new for struct ipz_queue, new for all dependent datastructors
+ * all QP Tables are the same,
+ * flow:
+ * -# allocate+pin queue
+ * -# initialise ptcb
+ * -# allocate+pin PTs
+ * -# link PTs to a ring, according to HCA Arch, set bit62 id needed
+ * -# the ring must have room for exactly nr_of_PTEs
+ * see ipz_qpt_ctor()
+ */
+void ipz_qpt_ctor(struct ipz_qpt *qpt,
+		  const u32 nr_of_qes,
+		  const u32 pagesize,
+		  const u32 qe_size,
+		  const u8 lowbyte, const u8 toggle,
+		  u32 * act_nr_of_QEs, u32 * act_nr_of_pages);
+
+/*
+ * return current Queue Entry, increment Queue Entry iterator by one
+ * step in struct ipz_queue, will wrap in ringbuffer
+ * returns address (kv) of Queue Entry BEFORE increment
+ * warning don't use in parallel with ipz_qpageit_get_inc()
+ * warning unpredictable results may occur if steps>act_nr_of_queue_entries
+ * fix EQ page problems
+ */
+void *ipz_qeit_eq_get_inc(struct ipz_queue *queue);
+
+/*
+ * return current Event Queue Entry, increment Queue Entry iterator
+ * by one step in struct ipz_queue if valid, will wrap in ringbuffer
+ * returns address (kv) of Queue Entry BEFORE increment
+ * returns 0 and does not increment, if wrong valid state
+ * warning don't use in parallel with ipz_queue_QPageit_get_inc()
+ * warning unpredictable results may occur if steps>act_nr_of_queue_entries
+ */
+static inline void *ipz_eqit_eq_get_inc_valid(struct ipz_queue *queue)
+{
+	void *ret = ipz_qeit_get(queue);
+	u32 qe = *(u8 *)ret;
+	if ((qe >> 7) != (queue->toggle_state & 1))
+		return NULL;
+	ipz_qeit_eq_get_inc(queue); /* this is a good one */
+	return ret;
+}
+
+static inline void *ipz_eqit_eq_peek_valid(struct ipz_queue *queue)
+{
+	void *ret = ipz_qeit_get(queue);
+	u32 qe = *(u8 *)ret;
+	if ((qe >> 7) != (queue->toggle_state & 1))
+		return NULL;
+	return ret;
+}
+
+/* returns address (GX) of first queue entry */
+static inline u64 ipz_qpt_get_firstpage(struct ipz_qpt *qpt)
+{
+	return be64_to_cpu(qpt->qpts[0]);
+}
+
+/* returns address (kv) of first page of queue page table */
+static inline void *ipz_qpt_get_qpt(struct ipz_qpt *qpt)
+{
+	return qpt->qpts;
+}
+
+#endif				/* __IPZ_PT_FN_H__ */
diff --git a/drivers/infiniband/hw/ipath/Kconfig b/drivers/infiniband/hw/ipath/Kconfig
new file mode 100644
index 0000000..1d9bb11
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/Kconfig
@@ -0,0 +1,11 @@
+config INFINIBAND_IPATH
+	tristate "QLogic HTX HCA support"
+	depends on 64BIT && NET && HT_IRQ
+	---help---
+	This is a driver for the obsolete QLogic Hyper-Transport
+	IB host channel adapter (model QHT7140),
+	including InfiniBand verbs support.  This driver allows these
+	devices to be used with both kernel upper level protocols such
+	as IP-over-InfiniBand as well as with userspace applications
+	(in conjunction with InfiniBand userspace access).
+	For QLogic PCIe QLE based cards, use the QIB driver instead.
diff --git a/drivers/infiniband/hw/ipath/Makefile b/drivers/infiniband/hw/ipath/Makefile
new file mode 100644
index 0000000..4496f28
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/Makefile
@@ -0,0 +1,37 @@
+ccflags-y := -DIPATH_IDSTR='"QLogic kernel.org driver"' \
+	-DIPATH_KERN_TYPE=0
+
+obj-$(CONFIG_INFINIBAND_IPATH) += ib_ipath.o
+
+ib_ipath-y := \
+	ipath_cq.o \
+	ipath_diag.o \
+	ipath_dma.o \
+	ipath_driver.o \
+	ipath_eeprom.o \
+	ipath_file_ops.o \
+	ipath_fs.o \
+	ipath_init_chip.o \
+	ipath_intr.o \
+	ipath_keys.o \
+	ipath_mad.o \
+	ipath_mmap.o \
+	ipath_mr.o \
+	ipath_qp.o \
+	ipath_rc.o \
+	ipath_ruc.o \
+	ipath_sdma.o \
+	ipath_srq.o \
+	ipath_stats.o \
+	ipath_sysfs.o \
+	ipath_uc.o \
+	ipath_ud.o \
+	ipath_user_pages.o \
+	ipath_user_sdma.o \
+	ipath_verbs_mcast.o \
+	ipath_verbs.o
+
+ib_ipath-$(CONFIG_HT_IRQ) += ipath_iba6110.o
+
+ib_ipath-$(CONFIG_X86_64) += ipath_wc_x86_64.o
+ib_ipath-$(CONFIG_PPC64) += ipath_wc_ppc64.o
diff --git a/drivers/infiniband/hw/ipath/ipath_common.h b/drivers/infiniband/hw/ipath/ipath_common.h
new file mode 100644
index 0000000..28cfe97
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_common.h
@@ -0,0 +1,851 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _IPATH_COMMON_H
+#define _IPATH_COMMON_H
+
+/*
+ * This file contains defines, structures, etc. that are used
+ * to communicate between kernel and user code.
+ */
+
+
+/* This is the IEEE-assigned OUI for QLogic Inc. InfiniPath */
+#define IPATH_SRC_OUI_1 0x00
+#define IPATH_SRC_OUI_2 0x11
+#define IPATH_SRC_OUI_3 0x75
+
+/* version of protocol header (known to chip also). In the long run,
+ * we should be able to generate and accept a range of version numbers;
+ * for now we only accept one, and it's compiled in.
+ */
+#define IPS_PROTO_VERSION 2
+
+/*
+ * These are compile time constants that you may want to enable or disable
+ * if you are trying to debug problems with code or performance.
+ * IPATH_VERBOSE_TRACING define as 1 if you want additional tracing in
+ * fastpath code
+ * IPATH_TRACE_REGWRITES define as 1 if you want register writes to be
+ * traced in faspath code
+ * _IPATH_TRACING define as 0 if you want to remove all tracing in a
+ * compilation unit
+ * _IPATH_DEBUGGING define as 0 if you want to remove debug prints
+ */
+
+/*
+ * The value in the BTH QP field that InfiniPath uses to differentiate
+ * an infinipath protocol IB packet vs standard IB transport
+ */
+#define IPATH_KD_QP 0x656b79
+
+/*
+ * valid states passed to ipath_set_linkstate() user call
+ */
+#define IPATH_IB_LINKDOWN		0
+#define IPATH_IB_LINKARM		1
+#define IPATH_IB_LINKACTIVE		2
+#define IPATH_IB_LINKDOWN_ONLY		3
+#define IPATH_IB_LINKDOWN_SLEEP		4
+#define IPATH_IB_LINKDOWN_DISABLE	5
+#define IPATH_IB_LINK_LOOPBACK	6 /* enable local loopback */
+#define IPATH_IB_LINK_EXTERNAL	7 /* normal, disable local loopback */
+#define IPATH_IB_LINK_NO_HRTBT	8 /* disable Heartbeat, e.g. for loopback */
+#define IPATH_IB_LINK_HRTBT	9 /* enable heartbeat, normal, non-loopback */
+
+/*
+ * These 3 values (SDR and DDR may be ORed for auto-speed
+ * negotiation) are used for the 3rd argument to path_f_set_ib_cfg
+ * with cmd IPATH_IB_CFG_SPD_ENB, by direct calls or via sysfs.  They
+ * are also the the possible values for ipath_link_speed_enabled and active
+ * The values were chosen to match values used within the IB spec.
+ */
+#define IPATH_IB_SDR 1
+#define IPATH_IB_DDR 2
+
+/*
+ * stats maintained by the driver.  For now, at least, this is global
+ * to all minor devices.
+ */
+struct infinipath_stats {
+	/* number of interrupts taken */
+	__u64 sps_ints;
+	/* number of interrupts for errors */
+	__u64 sps_errints;
+	/* number of errors from chip (not incl. packet errors or CRC) */
+	__u64 sps_errs;
+	/* number of packet errors from chip other than CRC */
+	__u64 sps_pkterrs;
+	/* number of packets with CRC errors (ICRC and VCRC) */
+	__u64 sps_crcerrs;
+	/* number of hardware errors reported (parity, etc.) */
+	__u64 sps_hwerrs;
+	/* number of times IB link changed state unexpectedly */
+	__u64 sps_iblink;
+	__u64 sps_unused; /* was fastrcvint, no longer implemented */
+	/* number of kernel (port0) packets received */
+	__u64 sps_port0pkts;
+	/* number of "ethernet" packets sent by driver */
+	__u64 sps_ether_spkts;
+	/* number of "ethernet" packets received by driver */
+	__u64 sps_ether_rpkts;
+	/* number of SMA packets sent by driver. Obsolete. */
+	__u64 sps_sma_spkts;
+	/* number of SMA packets received by driver. Obsolete. */
+	__u64 sps_sma_rpkts;
+	/* number of times all ports rcvhdrq was full and packet dropped */
+	__u64 sps_hdrqfull;
+	/* number of times all ports egrtid was full and packet dropped */
+	__u64 sps_etidfull;
+	/*
+	 * number of times we tried to send from driver, but no pio buffers
+	 * avail
+	 */
+	__u64 sps_nopiobufs;
+	/* number of ports currently open */
+	__u64 sps_ports;
+	/* list of pkeys (other than default) accepted (0 means not set) */
+	__u16 sps_pkeys[4];
+	__u16 sps_unused16[4]; /* available; maintaining compatible layout */
+	/* number of user ports per chip (not IB ports) */
+	__u32 sps_nports;
+	/* not our interrupt, or already handled */
+	__u32 sps_nullintr;
+	/* max number of packets handled per receive call */
+	__u32 sps_maxpkts_call;
+	/* avg number of packets handled per receive call */
+	__u32 sps_avgpkts_call;
+	/* total number of pages locked */
+	__u64 sps_pagelocks;
+	/* total number of pages unlocked */
+	__u64 sps_pageunlocks;
+	/*
+	 * Number of packets dropped in kernel other than errors (ether
+	 * packets if ipath not configured, etc.)
+	 */
+	__u64 sps_krdrops;
+	__u64 sps_txeparity; /* PIO buffer parity error, recovered */
+	/* pad for future growth */
+	__u64 __sps_pad[45];
+};
+
+/*
+ * These are the status bits readable (in ascii form, 64bit value)
+ * from the "status" sysfs file.
+ */
+#define IPATH_STATUS_INITTED       0x1	/* basic initialization done */
+#define IPATH_STATUS_DISABLED      0x2	/* hardware disabled */
+/* Device has been disabled via admin request */
+#define IPATH_STATUS_ADMIN_DISABLED    0x4
+/* Chip has been found and initted */
+#define IPATH_STATUS_CHIP_PRESENT 0x20
+/* IB link is at ACTIVE, usable for data traffic */
+#define IPATH_STATUS_IB_READY     0x40
+/* link is configured, LID, MTU, etc. have been set */
+#define IPATH_STATUS_IB_CONF      0x80
+/* no link established, probably no cable */
+#define IPATH_STATUS_IB_NOCABLE  0x100
+/* A Fatal hardware error has occurred. */
+#define IPATH_STATUS_HWERROR     0x200
+
+/*
+ * The list of usermode accessible registers.  Also see Reg_* later in file.
+ */
+typedef enum _ipath_ureg {
+	/* (RO)  DMA RcvHdr to be used next. */
+	ur_rcvhdrtail = 0,
+	/* (RW)  RcvHdr entry to be processed next by host. */
+	ur_rcvhdrhead = 1,
+	/* (RO)  Index of next Eager index to use. */
+	ur_rcvegrindextail = 2,
+	/* (RW)  Eager TID to be processed next */
+	ur_rcvegrindexhead = 3,
+	/* For internal use only; max register number. */
+	_IPATH_UregMax
+} ipath_ureg;
+
+/* bit values for spi_runtime_flags */
+#define IPATH_RUNTIME_HT	0x1
+#define IPATH_RUNTIME_PCIE	0x2
+#define IPATH_RUNTIME_FORCE_WC_ORDER	0x4
+#define IPATH_RUNTIME_RCVHDR_COPY	0x8
+#define IPATH_RUNTIME_MASTER	0x10
+#define IPATH_RUNTIME_NODMA_RTAIL 0x80
+#define IPATH_RUNTIME_SDMA	      0x200
+#define IPATH_RUNTIME_FORCE_PIOAVAIL 0x400
+#define IPATH_RUNTIME_PIO_REGSWAPPED 0x800
+
+/*
+ * This structure is returned by ipath_userinit() immediately after
+ * open to get implementation-specific info, and info specific to this
+ * instance.
+ *
+ * This struct must have explict pad fields where type sizes
+ * may result in different alignments between 32 and 64 bit
+ * programs, since the 64 bit * bit kernel requires the user code
+ * to have matching offsets
+ */
+struct ipath_base_info {
+	/* version of hardware, for feature checking. */
+	__u32 spi_hw_version;
+	/* version of software, for feature checking. */
+	__u32 spi_sw_version;
+	/* InfiniPath port assigned, goes into sent packets */
+	__u16 spi_port;
+	__u16 spi_subport;
+	/*
+	 * IB MTU, packets IB data must be less than this.
+	 * The MTU is in bytes, and will be a multiple of 4 bytes.
+	 */
+	__u32 spi_mtu;
+	/*
+	 * Size of a PIO buffer.  Any given packet's total size must be less
+	 * than this (in words).  Included is the starting control word, so
+	 * if 513 is returned, then total pkt size is 512 words or less.
+	 */
+	__u32 spi_piosize;
+	/* size of the TID cache in infinipath, in entries */
+	__u32 spi_tidcnt;
+	/* size of the TID Eager list in infinipath, in entries */
+	__u32 spi_tidegrcnt;
+	/* size of a single receive header queue entry in words. */
+	__u32 spi_rcvhdrent_size;
+	/*
+	 * Count of receive header queue entries allocated.
+	 * This may be less than the spu_rcvhdrcnt passed in!.
+	 */
+	__u32 spi_rcvhdr_cnt;
+
+	/* per-chip and other runtime features bitmap (IPATH_RUNTIME_*) */
+	__u32 spi_runtime_flags;
+
+	/* address where receive buffer queue is mapped into */
+	__u64 spi_rcvhdr_base;
+
+	/* user program. */
+
+	/* base address of eager TID receive buffers. */
+	__u64 spi_rcv_egrbufs;
+
+	/* Allocated by initialization code, not by protocol. */
+
+	/*
+	 * Size of each TID buffer in host memory, starting at
+	 * spi_rcv_egrbufs.  The buffers are virtually contiguous.
+	 */
+	__u32 spi_rcv_egrbufsize;
+	/*
+	 * The special QP (queue pair) value that identifies an infinipath
+	 * protocol packet from standard IB packets.  More, probably much
+	 * more, to be added.
+	 */
+	__u32 spi_qpair;
+
+	/*
+	 * User register base for init code, not to be used directly by
+	 * protocol or applications.
+	 */
+	__u64 __spi_uregbase;
+	/*
+	 * Maximum buffer size in bytes that can be used in a single TID
+	 * entry (assuming the buffer is aligned to this boundary).  This is
+	 * the minimum of what the hardware and software support Guaranteed
+	 * to be a power of 2.
+	 */
+	__u32 spi_tid_maxsize;
+	/*
+	 * alignment of each pio send buffer (byte count
+	 * to add to spi_piobufbase to get to second buffer)
+	 */
+	__u32 spi_pioalign;
+	/*
+	 * The index of the first pio buffer available to this process;
+	 * needed to do lookup in spi_pioavailaddr; not added to
+	 * spi_piobufbase.
+	 */
+	__u32 spi_pioindex;
+	 /* number of buffers mapped for this process */
+	__u32 spi_piocnt;
+
+	/*
+	 * Base address of writeonly pio buffers for this process.
+	 * Each buffer has spi_piosize words, and is aligned on spi_pioalign
+	 * boundaries.  spi_piocnt buffers are mapped from this address
+	 */
+	__u64 spi_piobufbase;
+
+	/*
+	 * Base address of readonly memory copy of the pioavail registers.
+	 * There are 2 bits for each buffer.
+	 */
+	__u64 spi_pioavailaddr;
+
+	/*
+	 * Address where driver updates a copy of the interface and driver
+	 * status (IPATH_STATUS_*) as a 64 bit value.  It's followed by a
+	 * string indicating hardware error, if there was one.
+	 */
+	__u64 spi_status;
+
+	/* number of chip ports available to user processes */
+	__u32 spi_nports;
+	/* unit number of chip we are using */
+	__u32 spi_unit;
+	/* num bufs in each contiguous set */
+	__u32 spi_rcv_egrperchunk;
+	/* size in bytes of each contiguous set */
+	__u32 spi_rcv_egrchunksize;
+	/* total size of mmap to cover full rcvegrbuffers */
+	__u32 spi_rcv_egrbuftotlen;
+	__u32 spi_filler_for_align;
+	/* address of readonly memory copy of the rcvhdrq tail register. */
+	__u64 spi_rcvhdr_tailaddr;
+
+	/* shared memory pages for subports if port is shared */
+	__u64 spi_subport_uregbase;
+	__u64 spi_subport_rcvegrbuf;
+	__u64 spi_subport_rcvhdr_base;
+
+	/* shared memory page for hardware port if it is shared */
+	__u64 spi_port_uregbase;
+	__u64 spi_port_rcvegrbuf;
+	__u64 spi_port_rcvhdr_base;
+	__u64 spi_port_rcvhdr_tailaddr;
+
+} __attribute__ ((aligned(8)));
+
+
+/*
+ * This version number is given to the driver by the user code during
+ * initialization in the spu_userversion field of ipath_user_info, so
+ * the driver can check for compatibility with user code.
+ *
+ * The major version changes when data structures
+ * change in an incompatible way.  The driver must be the same or higher
+ * for initialization to succeed.  In some cases, a higher version
+ * driver will not interoperate with older software, and initialization
+ * will return an error.
+ */
+#define IPATH_USER_SWMAJOR 1
+
+/*
+ * Minor version differences are always compatible
+ * a within a major version, however if user software is larger
+ * than driver software, some new features and/or structure fields
+ * may not be implemented; the user code must deal with this if it
+ * cares, or it must abort after initialization reports the difference.
+ */
+#define IPATH_USER_SWMINOR 6
+
+#define IPATH_USER_SWVERSION ((IPATH_USER_SWMAJOR<<16) | IPATH_USER_SWMINOR)
+
+#define IPATH_KERN_TYPE 0
+
+/*
+ * Similarly, this is the kernel version going back to the user.  It's
+ * slightly different, in that we want to tell if the driver was built as
+ * part of a QLogic release, or from the driver from openfabrics.org,
+ * kernel.org, or a standard distribution, for support reasons.
+ * The high bit is 0 for non-QLogic and 1 for QLogic-built/supplied.
+ *
+ * It's returned by the driver to the user code during initialization in the
+ * spi_sw_version field of ipath_base_info, so the user code can in turn
+ * check for compatibility with the kernel.
+*/
+#define IPATH_KERN_SWVERSION ((IPATH_KERN_TYPE<<31) | IPATH_USER_SWVERSION)
+
+/*
+ * This structure is passed to ipath_userinit() to tell the driver where
+ * user code buffers are, sizes, etc.   The offsets and sizes of the
+ * fields must remain unchanged, for binary compatibility.  It can
+ * be extended, if userversion is changed so user code can tell, if needed
+ */
+struct ipath_user_info {
+	/*
+	 * version of user software, to detect compatibility issues.
+	 * Should be set to IPATH_USER_SWVERSION.
+	 */
+	__u32 spu_userversion;
+
+	/* desired number of receive header queue entries */
+	__u32 spu_rcvhdrcnt;
+
+	/* size of struct base_info to write to */
+	__u32 spu_base_info_size;
+
+	/*
+	 * number of words in KD protocol header
+	 * This tells InfiniPath how many words to copy to rcvhdrq.  If 0,
+	 * kernel uses a default.  Once set, attempts to set any other value
+	 * are an error (EAGAIN) until driver is reloaded.
+	 */
+	__u32 spu_rcvhdrsize;
+
+	/*
+	 * If two or more processes wish to share a port, each process
+	 * must set the spu_subport_cnt and spu_subport_id to the same
+	 * values.  The only restriction on the spu_subport_id is that
+	 * it be unique for a given node.
+	 */
+	__u16 spu_subport_cnt;
+	__u16 spu_subport_id;
+
+	__u32 spu_unused; /* kept for compatible layout */
+
+	/*
+	 * address of struct base_info to write to
+	 */
+	__u64 spu_base_info;
+
+} __attribute__ ((aligned(8)));
+
+/* User commands. */
+
+#define IPATH_CMD_MIN		16
+
+#define __IPATH_CMD_USER_INIT	16	/* old set up userspace (for old user code) */
+#define IPATH_CMD_PORT_INFO	17	/* find out what resources we got */
+#define IPATH_CMD_RECV_CTRL	18	/* control receipt of packets */
+#define IPATH_CMD_TID_UPDATE	19	/* update expected TID entries */
+#define IPATH_CMD_TID_FREE	20	/* free expected TID entries */
+#define IPATH_CMD_SET_PART_KEY	21	/* add partition key */
+#define __IPATH_CMD_SLAVE_INFO	22	/* return info on slave processes (for old user code) */
+#define IPATH_CMD_ASSIGN_PORT	23	/* allocate HCA and port */
+#define IPATH_CMD_USER_INIT 	24	/* set up userspace */
+#define IPATH_CMD_UNUSED_1	25
+#define IPATH_CMD_UNUSED_2	26
+#define IPATH_CMD_PIOAVAILUPD	27	/* force an update of PIOAvail reg */
+#define IPATH_CMD_POLL_TYPE	28	/* set the kind of polling we want */
+#define IPATH_CMD_ARMLAUNCH_CTRL	29 /* armlaunch detection control */
+/* 30 is unused */
+#define IPATH_CMD_SDMA_INFLIGHT 31	/* sdma inflight counter request */
+#define IPATH_CMD_SDMA_COMPLETE 32	/* sdma completion counter request */
+
+/*
+ * Poll types
+ */
+#define IPATH_POLL_TYPE_URGENT	 0x01
+#define IPATH_POLL_TYPE_OVERFLOW 0x02
+
+struct ipath_port_info {
+	__u32 num_active;	/* number of active units */
+	__u32 unit;		/* unit (chip) assigned to caller */
+	__u16 port;		/* port on unit assigned to caller */
+	__u16 subport;		/* subport on unit assigned to caller */
+	__u16 num_ports;	/* number of ports available on unit */
+	__u16 num_subports;	/* number of subports opened on port */
+};
+
+struct ipath_tid_info {
+	__u32 tidcnt;
+	/* make structure same size in 32 and 64 bit */
+	__u32 tid__unused;
+	/* virtual address of first page in transfer */
+	__u64 tidvaddr;
+	/* pointer (same size 32/64 bit) to __u16 tid array */
+	__u64 tidlist;
+
+	/*
+	 * pointer (same size 32/64 bit) to bitmap of TIDs used
+	 * for this call; checked for being large enough at open
+	 */
+	__u64 tidmap;
+};
+
+struct ipath_cmd {
+	__u32 type;			/* command type */
+	union {
+		struct ipath_tid_info tid_info;
+		struct ipath_user_info user_info;
+
+		/*
+		 * address in userspace where we should put the sdma
+		 * inflight counter
+		 */
+		__u64 sdma_inflight;
+		/*
+		 * address in userspace where we should put the sdma
+		 * completion counter
+		 */
+		__u64 sdma_complete;
+		/* address in userspace of struct ipath_port_info to
+		   write result to */
+		__u64 port_info;
+		/* enable/disable receipt of packets */
+		__u32 recv_ctrl;
+		/* enable/disable armlaunch errors (non-zero to enable) */
+		__u32 armlaunch_ctrl;
+		/* partition key to set */
+		__u16 part_key;
+		/* user address of __u32 bitmask of active slaves */
+		__u64 slave_mask_addr;
+		/* type of polling we want */
+		__u16 poll_type;
+	} cmd;
+};
+
+struct ipath_iovec {
+	/* Pointer to data, but same size 32 and 64 bit */
+	__u64 iov_base;
+
+	/*
+	 * Length of data; don't need 64 bits, but want
+	 * ipath_sendpkt to remain same size as before 32 bit changes, so...
+	 */
+	__u64 iov_len;
+};
+
+/*
+ * Describes a single packet for send.  Each packet can have one or more
+ * buffers, but the total length (exclusive of IB headers) must be less
+ * than the MTU, and if using the PIO method, entire packet length,
+ * including IB headers, must be less than the ipath_piosize value (words).
+ * Use of this necessitates including sys/uio.h
+ */
+struct __ipath_sendpkt {
+	__u32 sps_flags;	/* flags for packet (TBD) */
+	__u32 sps_cnt;		/* number of entries to use in sps_iov */
+	/* array of iov's describing packet. TEMPORARY */
+	struct ipath_iovec sps_iov[4];
+};
+
+/*
+ * diagnostics can send a packet by "writing" one of the following
+ * two structs to diag data special file
+ * The first is the legacy version for backward compatibility
+ */
+struct ipath_diag_pkt {
+	__u32 unit;
+	__u64 data;
+	__u32 len;
+};
+
+/* The second diag_pkt struct is the expanded version that allows
+ * more control over the packet, specifically, by allowing a custom
+ * pbc (+ static rate) qword, so that special modes and deliberate
+ * changes to CRCs can be used. The elements were also re-ordered
+ * for better alignment and to avoid padding issues.
+ */
+struct ipath_diag_xpkt {
+	__u64 data;
+	__u64 pbc_wd;
+	__u32 unit;
+	__u32 len;
+};
+
+/*
+ * Data layout in I2C flash (for GUID, etc.)
+ * All fields are little-endian binary unless otherwise stated
+ */
+#define IPATH_FLASH_VERSION 2
+struct ipath_flash {
+	/* flash layout version (IPATH_FLASH_VERSION) */
+	__u8 if_fversion;
+	/* checksum protecting if_length bytes */
+	__u8 if_csum;
+	/*
+	 * valid length (in use, protected by if_csum), including
+	 * if_fversion and if_csum themselves)
+	 */
+	__u8 if_length;
+	/* the GUID, in network order */
+	__u8 if_guid[8];
+	/* number of GUIDs to use, starting from if_guid */
+	__u8 if_numguid;
+	/* the (last 10 characters of) board serial number, in ASCII */
+	char if_serial[12];
+	/* board mfg date (YYYYMMDD ASCII) */
+	char if_mfgdate[8];
+	/* last board rework/test date (YYYYMMDD ASCII) */
+	char if_testdate[8];
+	/* logging of error counts, TBD */
+	__u8 if_errcntp[4];
+	/* powered on hours, updated at driver unload */
+	__u8 if_powerhour[2];
+	/* ASCII free-form comment field */
+	char if_comment[32];
+	/* Backwards compatible prefix for longer QLogic Serial Numbers */
+	char if_sprefix[4];
+	/* 82 bytes used, min flash size is 128 bytes */
+	__u8 if_future[46];
+};
+
+/*
+ * These are the counters implemented in the chip, and are listed in order.
+ * The InterCaps naming is taken straight from the chip spec.
+ */
+struct infinipath_counters {
+	__u64 LBIntCnt;
+	__u64 LBFlowStallCnt;
+	__u64 TxSDmaDescCnt;	/* was Reserved1 */
+	__u64 TxUnsupVLErrCnt;
+	__u64 TxDataPktCnt;
+	__u64 TxFlowPktCnt;
+	__u64 TxDwordCnt;
+	__u64 TxLenErrCnt;
+	__u64 TxMaxMinLenErrCnt;
+	__u64 TxUnderrunCnt;
+	__u64 TxFlowStallCnt;
+	__u64 TxDroppedPktCnt;
+	__u64 RxDroppedPktCnt;
+	__u64 RxDataPktCnt;
+	__u64 RxFlowPktCnt;
+	__u64 RxDwordCnt;
+	__u64 RxLenErrCnt;
+	__u64 RxMaxMinLenErrCnt;
+	__u64 RxICRCErrCnt;
+	__u64 RxVCRCErrCnt;
+	__u64 RxFlowCtrlErrCnt;
+	__u64 RxBadFormatCnt;
+	__u64 RxLinkProblemCnt;
+	__u64 RxEBPCnt;
+	__u64 RxLPCRCErrCnt;
+	__u64 RxBufOvflCnt;
+	__u64 RxTIDFullErrCnt;
+	__u64 RxTIDValidErrCnt;
+	__u64 RxPKeyMismatchCnt;
+	__u64 RxP0HdrEgrOvflCnt;
+	__u64 RxP1HdrEgrOvflCnt;
+	__u64 RxP2HdrEgrOvflCnt;
+	__u64 RxP3HdrEgrOvflCnt;
+	__u64 RxP4HdrEgrOvflCnt;
+	__u64 RxP5HdrEgrOvflCnt;
+	__u64 RxP6HdrEgrOvflCnt;
+	__u64 RxP7HdrEgrOvflCnt;
+	__u64 RxP8HdrEgrOvflCnt;
+	__u64 RxP9HdrEgrOvflCnt;	/* was Reserved6 */
+	__u64 RxP10HdrEgrOvflCnt;	/* was Reserved7 */
+	__u64 RxP11HdrEgrOvflCnt;	/* new for IBA7220 */
+	__u64 RxP12HdrEgrOvflCnt;	/* new for IBA7220 */
+	__u64 RxP13HdrEgrOvflCnt;	/* new for IBA7220 */
+	__u64 RxP14HdrEgrOvflCnt;	/* new for IBA7220 */
+	__u64 RxP15HdrEgrOvflCnt;	/* new for IBA7220 */
+	__u64 RxP16HdrEgrOvflCnt;	/* new for IBA7220 */
+	__u64 IBStatusChangeCnt;
+	__u64 IBLinkErrRecoveryCnt;
+	__u64 IBLinkDownedCnt;
+	__u64 IBSymbolErrCnt;
+	/* The following are new for IBA7220 */
+	__u64 RxVL15DroppedPktCnt;
+	__u64 RxOtherLocalPhyErrCnt;
+	__u64 PcieRetryBufDiagQwordCnt;
+	__u64 ExcessBufferOvflCnt;
+	__u64 LocalLinkIntegrityErrCnt;
+	__u64 RxVlErrCnt;
+	__u64 RxDlidFltrCnt;
+};
+
+/*
+ * The next set of defines are for packet headers, and chip register
+ * and memory bits that are visible to and/or used by user-mode software
+ * The other bits that are used only by the driver or diags are in
+ * ipath_registers.h
+ */
+
+/* RcvHdrFlags bits */
+#define INFINIPATH_RHF_LENGTH_MASK 0x7FF
+#define INFINIPATH_RHF_LENGTH_SHIFT 0
+#define INFINIPATH_RHF_RCVTYPE_MASK 0x7
+#define INFINIPATH_RHF_RCVTYPE_SHIFT 11
+#define INFINIPATH_RHF_EGRINDEX_MASK 0xFFF
+#define INFINIPATH_RHF_EGRINDEX_SHIFT 16
+#define INFINIPATH_RHF_SEQ_MASK 0xF
+#define INFINIPATH_RHF_SEQ_SHIFT 0
+#define INFINIPATH_RHF_HDRQ_OFFSET_MASK 0x7FF
+#define INFINIPATH_RHF_HDRQ_OFFSET_SHIFT 4
+#define INFINIPATH_RHF_H_ICRCERR   0x80000000
+#define INFINIPATH_RHF_H_VCRCERR   0x40000000
+#define INFINIPATH_RHF_H_PARITYERR 0x20000000
+#define INFINIPATH_RHF_H_LENERR    0x10000000
+#define INFINIPATH_RHF_H_MTUERR    0x08000000
+#define INFINIPATH_RHF_H_IHDRERR   0x04000000
+#define INFINIPATH_RHF_H_TIDERR    0x02000000
+#define INFINIPATH_RHF_H_MKERR     0x01000000
+#define INFINIPATH_RHF_H_IBERR     0x00800000
+#define INFINIPATH_RHF_H_ERR_MASK  0xFF800000
+#define INFINIPATH_RHF_L_USE_EGR   0x80000000
+#define INFINIPATH_RHF_L_SWA       0x00008000
+#define INFINIPATH_RHF_L_SWB       0x00004000
+
+/* infinipath header fields */
+#define INFINIPATH_I_VERS_MASK 0xF
+#define INFINIPATH_I_VERS_SHIFT 28
+#define INFINIPATH_I_PORT_MASK 0xF
+#define INFINIPATH_I_PORT_SHIFT 24
+#define INFINIPATH_I_TID_MASK 0x7FF
+#define INFINIPATH_I_TID_SHIFT 13
+#define INFINIPATH_I_OFFSET_MASK 0x1FFF
+#define INFINIPATH_I_OFFSET_SHIFT 0
+
+/* K_PktFlags bits */
+#define INFINIPATH_KPF_INTR 0x1
+#define INFINIPATH_KPF_SUBPORT_MASK 0x3
+#define INFINIPATH_KPF_SUBPORT_SHIFT 1
+
+#define INFINIPATH_MAX_SUBPORT	4
+
+/* SendPIO per-buffer control */
+#define INFINIPATH_SP_TEST    0x40
+#define INFINIPATH_SP_TESTEBP 0x20
+#define INFINIPATH_SP_TRIGGER_SHIFT  15
+
+/* SendPIOAvail bits */
+#define INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT 1
+#define INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT 0
+
+/* infinipath header format */
+struct ipath_header {
+	/*
+	 * Version - 4 bits, Port - 4 bits, TID - 10 bits and Offset -
+	 * 14 bits before ECO change ~28 Dec 03.  After that, Vers 4,
+	 * Port 4, TID 11, offset 13.
+	 */
+	__le32 ver_port_tid_offset;
+	__le16 chksum;
+	__le16 pkt_flags;
+};
+
+/* infinipath user message header format.
+ * This structure contains the first 4 fields common to all protocols
+ * that employ infinipath.
+ */
+struct ipath_message_header {
+	__be16 lrh[4];
+	__be32 bth[3];
+	/* fields below this point are in host byte order */
+	struct ipath_header iph;
+	__u8 sub_opcode;
+};
+
+/* infinipath ethernet header format */
+struct ether_header {
+	__be16 lrh[4];
+	__be32 bth[3];
+	struct ipath_header iph;
+	__u8 sub_opcode;
+	__u8 cmd;
+	__be16 lid;
+	__u16 mac[3];
+	__u8 frag_num;
+	__u8 seq_num;
+	__le32 len;
+	/* MUST be of word size due to PIO write requirements */
+	__le32 csum;
+	__le16 csum_offset;
+	__le16 flags;
+	__u16 first_2_bytes;
+	__u8 unused[2];		/* currently unused */
+};
+
+
+/* IB - LRH header consts */
+#define IPATH_LRH_GRH 0x0003	/* 1. word of IB LRH - next header: GRH */
+#define IPATH_LRH_BTH 0x0002	/* 1. word of IB LRH - next header: BTH */
+
+/* misc. */
+#define SIZE_OF_CRC 1
+
+#define IPATH_DEFAULT_P_KEY 0xFFFF
+#define IPATH_PERMISSIVE_LID 0xFFFF
+#define IPATH_AETH_CREDIT_SHIFT 24
+#define IPATH_AETH_CREDIT_MASK 0x1F
+#define IPATH_AETH_CREDIT_INVAL 0x1F
+#define IPATH_PSN_MASK 0xFFFFFF
+#define IPATH_MSN_MASK 0xFFFFFF
+#define IPATH_QPN_MASK 0xFFFFFF
+#define IPATH_MULTICAST_LID_BASE 0xC000
+#define IPATH_EAGER_TID_ID INFINIPATH_I_TID_MASK
+#define IPATH_MULTICAST_QPN 0xFFFFFF
+
+/* Receive Header Queue: receive type (from infinipath) */
+#define RCVHQ_RCV_TYPE_EXPECTED  0
+#define RCVHQ_RCV_TYPE_EAGER     1
+#define RCVHQ_RCV_TYPE_NON_KD    2
+#define RCVHQ_RCV_TYPE_ERROR     3
+
+
+/* sub OpCodes - ith4x  */
+#define IPATH_ITH4X_OPCODE_ENCAP 0x81
+#define IPATH_ITH4X_OPCODE_LID_ARP 0x82
+
+#define IPATH_HEADER_QUEUE_WORDS 9
+
+/* functions for extracting fields from rcvhdrq entries for the driver.
+ */
+static inline __u32 ipath_hdrget_err_flags(const __le32 * rbuf)
+{
+	return __le32_to_cpu(rbuf[1]) & INFINIPATH_RHF_H_ERR_MASK;
+}
+
+static inline __u32 ipath_hdrget_rcv_type(const __le32 * rbuf)
+{
+	return (__le32_to_cpu(rbuf[0]) >> INFINIPATH_RHF_RCVTYPE_SHIFT)
+	    & INFINIPATH_RHF_RCVTYPE_MASK;
+}
+
+static inline __u32 ipath_hdrget_length_in_bytes(const __le32 * rbuf)
+{
+	return ((__le32_to_cpu(rbuf[0]) >> INFINIPATH_RHF_LENGTH_SHIFT)
+		& INFINIPATH_RHF_LENGTH_MASK) << 2;
+}
+
+static inline __u32 ipath_hdrget_index(const __le32 * rbuf)
+{
+	return (__le32_to_cpu(rbuf[0]) >> INFINIPATH_RHF_EGRINDEX_SHIFT)
+	    & INFINIPATH_RHF_EGRINDEX_MASK;
+}
+
+static inline __u32 ipath_hdrget_seq(const __le32 *rbuf)
+{
+	return (__le32_to_cpu(rbuf[1]) >> INFINIPATH_RHF_SEQ_SHIFT)
+		& INFINIPATH_RHF_SEQ_MASK;
+}
+
+static inline __u32 ipath_hdrget_offset(const __le32 *rbuf)
+{
+	return (__le32_to_cpu(rbuf[1]) >> INFINIPATH_RHF_HDRQ_OFFSET_SHIFT)
+		& INFINIPATH_RHF_HDRQ_OFFSET_MASK;
+}
+
+static inline __u32 ipath_hdrget_use_egr_buf(const __le32 *rbuf)
+{
+	return __le32_to_cpu(rbuf[0]) & INFINIPATH_RHF_L_USE_EGR;
+}
+
+static inline __u32 ipath_hdrget_ipath_ver(__le32 hdrword)
+{
+	return (__le32_to_cpu(hdrword) >> INFINIPATH_I_VERS_SHIFT)
+	    & INFINIPATH_I_VERS_MASK;
+}
+
+#endif				/* _IPATH_COMMON_H */
diff --git a/drivers/infiniband/hw/ipath/ipath_cq.c b/drivers/infiniband/hw/ipath/ipath_cq.c
new file mode 100644
index 0000000..0416c6c
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_cq.c
@@ -0,0 +1,478 @@
+/*
+ * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+#include "ipath_verbs.h"
+
+/**
+ * ipath_cq_enter - add a new entry to the completion queue
+ * @cq: completion queue
+ * @entry: work completion entry to add
+ * @sig: true if @entry is a solicitated entry
+ *
+ * This may be called with qp->s_lock held.
+ */
+void ipath_cq_enter(struct ipath_cq *cq, struct ib_wc *entry, int solicited)
+{
+	struct ipath_cq_wc *wc;
+	unsigned long flags;
+	u32 head;
+	u32 next;
+
+	spin_lock_irqsave(&cq->lock, flags);
+
+	/*
+	 * Note that the head pointer might be writable by user processes.
+	 * Take care to verify it is a sane value.
+	 */
+	wc = cq->queue;
+	head = wc->head;
+	if (head >= (unsigned) cq->ibcq.cqe) {
+		head = cq->ibcq.cqe;
+		next = 0;
+	} else
+		next = head + 1;
+	if (unlikely(next == wc->tail)) {
+		spin_unlock_irqrestore(&cq->lock, flags);
+		if (cq->ibcq.event_handler) {
+			struct ib_event ev;
+
+			ev.device = cq->ibcq.device;
+			ev.element.cq = &cq->ibcq;
+			ev.event = IB_EVENT_CQ_ERR;
+			cq->ibcq.event_handler(&ev, cq->ibcq.cq_context);
+		}
+		return;
+	}
+	if (cq->ip) {
+		wc->uqueue[head].wr_id = entry->wr_id;
+		wc->uqueue[head].status = entry->status;
+		wc->uqueue[head].opcode = entry->opcode;
+		wc->uqueue[head].vendor_err = entry->vendor_err;
+		wc->uqueue[head].byte_len = entry->byte_len;
+		wc->uqueue[head].ex.imm_data = (__u32 __force) entry->ex.imm_data;
+		wc->uqueue[head].qp_num = entry->qp->qp_num;
+		wc->uqueue[head].src_qp = entry->src_qp;
+		wc->uqueue[head].wc_flags = entry->wc_flags;
+		wc->uqueue[head].pkey_index = entry->pkey_index;
+		wc->uqueue[head].slid = entry->slid;
+		wc->uqueue[head].sl = entry->sl;
+		wc->uqueue[head].dlid_path_bits = entry->dlid_path_bits;
+		wc->uqueue[head].port_num = entry->port_num;
+		/* Make sure entry is written before the head index. */
+		smp_wmb();
+	} else
+		wc->kqueue[head] = *entry;
+	wc->head = next;
+
+	if (cq->notify == IB_CQ_NEXT_COMP ||
+	    (cq->notify == IB_CQ_SOLICITED && solicited)) {
+		cq->notify = IB_CQ_NONE;
+		cq->triggered++;
+		/*
+		 * This will cause send_complete() to be called in
+		 * another thread.
+		 */
+		tasklet_hi_schedule(&cq->comptask);
+	}
+
+	spin_unlock_irqrestore(&cq->lock, flags);
+
+	if (entry->status != IB_WC_SUCCESS)
+		to_idev(cq->ibcq.device)->n_wqe_errs++;
+}
+
+/**
+ * ipath_poll_cq - poll for work completion entries
+ * @ibcq: the completion queue to poll
+ * @num_entries: the maximum number of entries to return
+ * @entry: pointer to array where work completions are placed
+ *
+ * Returns the number of completion entries polled.
+ *
+ * This may be called from interrupt context.  Also called by ib_poll_cq()
+ * in the generic verbs code.
+ */
+int ipath_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry)
+{
+	struct ipath_cq *cq = to_icq(ibcq);
+	struct ipath_cq_wc *wc;
+	unsigned long flags;
+	int npolled;
+	u32 tail;
+
+	/* The kernel can only poll a kernel completion queue */
+	if (cq->ip) {
+		npolled = -EINVAL;
+		goto bail;
+	}
+
+	spin_lock_irqsave(&cq->lock, flags);
+
+	wc = cq->queue;
+	tail = wc->tail;
+	if (tail > (u32) cq->ibcq.cqe)
+		tail = (u32) cq->ibcq.cqe;
+	for (npolled = 0; npolled < num_entries; ++npolled, ++entry) {
+		if (tail == wc->head)
+			break;
+		/* The kernel doesn't need a RMB since it has the lock. */
+		*entry = wc->kqueue[tail];
+		if (tail >= cq->ibcq.cqe)
+			tail = 0;
+		else
+			tail++;
+	}
+	wc->tail = tail;
+
+	spin_unlock_irqrestore(&cq->lock, flags);
+
+bail:
+	return npolled;
+}
+
+static void send_complete(unsigned long data)
+{
+	struct ipath_cq *cq = (struct ipath_cq *)data;
+
+	/*
+	 * The completion handler will most likely rearm the notification
+	 * and poll for all pending entries.  If a new completion entry
+	 * is added while we are in this routine, tasklet_hi_schedule()
+	 * won't call us again until we return so we check triggered to
+	 * see if we need to call the handler again.
+	 */
+	for (;;) {
+		u8 triggered = cq->triggered;
+
+		cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context);
+
+		if (cq->triggered == triggered)
+			return;
+	}
+}
+
+/**
+ * ipath_create_cq - create a completion queue
+ * @ibdev: the device this completion queue is attached to
+ * @entries: the minimum size of the completion queue
+ * @context: unused by the InfiniPath driver
+ * @udata: unused by the InfiniPath driver
+ *
+ * Returns a pointer to the completion queue or negative errno values
+ * for failure.
+ *
+ * Called by ib_create_cq() in the generic verbs code.
+ */
+struct ib_cq *ipath_create_cq(struct ib_device *ibdev, int entries, int comp_vector,
+			      struct ib_ucontext *context,
+			      struct ib_udata *udata)
+{
+	struct ipath_ibdev *dev = to_idev(ibdev);
+	struct ipath_cq *cq;
+	struct ipath_cq_wc *wc;
+	struct ib_cq *ret;
+	u32 sz;
+
+	if (entries < 1 || entries > ib_ipath_max_cqes) {
+		ret = ERR_PTR(-EINVAL);
+		goto done;
+	}
+
+	/* Allocate the completion queue structure. */
+	cq = kmalloc(sizeof(*cq), GFP_KERNEL);
+	if (!cq) {
+		ret = ERR_PTR(-ENOMEM);
+		goto done;
+	}
+
+	/*
+	 * Allocate the completion queue entries and head/tail pointers.
+	 * This is allocated separately so that it can be resized and
+	 * also mapped into user space.
+	 * We need to use vmalloc() in order to support mmap and large
+	 * numbers of entries.
+	 */
+	sz = sizeof(*wc);
+	if (udata && udata->outlen >= sizeof(__u64))
+		sz += sizeof(struct ib_uverbs_wc) * (entries + 1);
+	else
+		sz += sizeof(struct ib_wc) * (entries + 1);
+	wc = vmalloc_user(sz);
+	if (!wc) {
+		ret = ERR_PTR(-ENOMEM);
+		goto bail_cq;
+	}
+
+	/*
+	 * Return the address of the WC as the offset to mmap.
+	 * See ipath_mmap() for details.
+	 */
+	if (udata && udata->outlen >= sizeof(__u64)) {
+		int err;
+
+		cq->ip = ipath_create_mmap_info(dev, sz, context, wc);
+		if (!cq->ip) {
+			ret = ERR_PTR(-ENOMEM);
+			goto bail_wc;
+		}
+
+		err = ib_copy_to_udata(udata, &cq->ip->offset,
+				       sizeof(cq->ip->offset));
+		if (err) {
+			ret = ERR_PTR(err);
+			goto bail_ip;
+		}
+	} else
+		cq->ip = NULL;
+
+	spin_lock(&dev->n_cqs_lock);
+	if (dev->n_cqs_allocated == ib_ipath_max_cqs) {
+		spin_unlock(&dev->n_cqs_lock);
+		ret = ERR_PTR(-ENOMEM);
+		goto bail_ip;
+	}
+
+	dev->n_cqs_allocated++;
+	spin_unlock(&dev->n_cqs_lock);
+
+	if (cq->ip) {
+		spin_lock_irq(&dev->pending_lock);
+		list_add(&cq->ip->pending_mmaps, &dev->pending_mmaps);
+		spin_unlock_irq(&dev->pending_lock);
+	}
+
+	/*
+	 * ib_create_cq() will initialize cq->ibcq except for cq->ibcq.cqe.
+	 * The number of entries should be >= the number requested or return
+	 * an error.
+	 */
+	cq->ibcq.cqe = entries;
+	cq->notify = IB_CQ_NONE;
+	cq->triggered = 0;
+	spin_lock_init(&cq->lock);
+	tasklet_init(&cq->comptask, send_complete, (unsigned long)cq);
+	wc->head = 0;
+	wc->tail = 0;
+	cq->queue = wc;
+
+	ret = &cq->ibcq;
+
+	goto done;
+
+bail_ip:
+	kfree(cq->ip);
+bail_wc:
+	vfree(wc);
+bail_cq:
+	kfree(cq);
+done:
+	return ret;
+}
+
+/**
+ * ipath_destroy_cq - destroy a completion queue
+ * @ibcq: the completion queue to destroy.
+ *
+ * Returns 0 for success.
+ *
+ * Called by ib_destroy_cq() in the generic verbs code.
+ */
+int ipath_destroy_cq(struct ib_cq *ibcq)
+{
+	struct ipath_ibdev *dev = to_idev(ibcq->device);
+	struct ipath_cq *cq = to_icq(ibcq);
+
+	tasklet_kill(&cq->comptask);
+	spin_lock(&dev->n_cqs_lock);
+	dev->n_cqs_allocated--;
+	spin_unlock(&dev->n_cqs_lock);
+	if (cq->ip)
+		kref_put(&cq->ip->ref, ipath_release_mmap_info);
+	else
+		vfree(cq->queue);
+	kfree(cq);
+
+	return 0;
+}
+
+/**
+ * ipath_req_notify_cq - change the notification type for a completion queue
+ * @ibcq: the completion queue
+ * @notify_flags: the type of notification to request
+ *
+ * Returns 0 for success.
+ *
+ * This may be called from interrupt context.  Also called by
+ * ib_req_notify_cq() in the generic verbs code.
+ */
+int ipath_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags)
+{
+	struct ipath_cq *cq = to_icq(ibcq);
+	unsigned long flags;
+	int ret = 0;
+
+	spin_lock_irqsave(&cq->lock, flags);
+	/*
+	 * Don't change IB_CQ_NEXT_COMP to IB_CQ_SOLICITED but allow
+	 * any other transitions (see C11-31 and C11-32 in ch. 11.4.2.2).
+	 */
+	if (cq->notify != IB_CQ_NEXT_COMP)
+		cq->notify = notify_flags & IB_CQ_SOLICITED_MASK;
+
+	if ((notify_flags & IB_CQ_REPORT_MISSED_EVENTS) &&
+	    cq->queue->head != cq->queue->tail)
+		ret = 1;
+
+	spin_unlock_irqrestore(&cq->lock, flags);
+
+	return ret;
+}
+
+/**
+ * ipath_resize_cq - change the size of the CQ
+ * @ibcq: the completion queue
+ *
+ * Returns 0 for success.
+ */
+int ipath_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata)
+{
+	struct ipath_cq *cq = to_icq(ibcq);
+	struct ipath_cq_wc *old_wc;
+	struct ipath_cq_wc *wc;
+	u32 head, tail, n;
+	int ret;
+	u32 sz;
+
+	if (cqe < 1 || cqe > ib_ipath_max_cqes) {
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	/*
+	 * Need to use vmalloc() if we want to support large #s of entries.
+	 */
+	sz = sizeof(*wc);
+	if (udata && udata->outlen >= sizeof(__u64))
+		sz += sizeof(struct ib_uverbs_wc) * (cqe + 1);
+	else
+		sz += sizeof(struct ib_wc) * (cqe + 1);
+	wc = vmalloc_user(sz);
+	if (!wc) {
+		ret = -ENOMEM;
+		goto bail;
+	}
+
+	/* Check that we can write the offset to mmap. */
+	if (udata && udata->outlen >= sizeof(__u64)) {
+		__u64 offset = 0;
+
+		ret = ib_copy_to_udata(udata, &offset, sizeof(offset));
+		if (ret)
+			goto bail_free;
+	}
+
+	spin_lock_irq(&cq->lock);
+	/*
+	 * Make sure head and tail are sane since they
+	 * might be user writable.
+	 */
+	old_wc = cq->queue;
+	head = old_wc->head;
+	if (head > (u32) cq->ibcq.cqe)
+		head = (u32) cq->ibcq.cqe;
+	tail = old_wc->tail;
+	if (tail > (u32) cq->ibcq.cqe)
+		tail = (u32) cq->ibcq.cqe;
+	if (head < tail)
+		n = cq->ibcq.cqe + 1 + head - tail;
+	else
+		n = head - tail;
+	if (unlikely((u32)cqe < n)) {
+		ret = -EINVAL;
+		goto bail_unlock;
+	}
+	for (n = 0; tail != head; n++) {
+		if (cq->ip)
+			wc->uqueue[n] = old_wc->uqueue[tail];
+		else
+			wc->kqueue[n] = old_wc->kqueue[tail];
+		if (tail == (u32) cq->ibcq.cqe)
+			tail = 0;
+		else
+			tail++;
+	}
+	cq->ibcq.cqe = cqe;
+	wc->head = n;
+	wc->tail = 0;
+	cq->queue = wc;
+	spin_unlock_irq(&cq->lock);
+
+	vfree(old_wc);
+
+	if (cq->ip) {
+		struct ipath_ibdev *dev = to_idev(ibcq->device);
+		struct ipath_mmap_info *ip = cq->ip;
+
+		ipath_update_mmap_info(dev, ip, sz, wc);
+
+		/*
+		 * Return the offset to mmap.
+		 * See ipath_mmap() for details.
+		 */
+		if (udata && udata->outlen >= sizeof(__u64)) {
+			ret = ib_copy_to_udata(udata, &ip->offset,
+					       sizeof(ip->offset));
+			if (ret)
+				goto bail;
+		}
+
+		spin_lock_irq(&dev->pending_lock);
+		if (list_empty(&ip->pending_mmaps))
+			list_add(&ip->pending_mmaps, &dev->pending_mmaps);
+		spin_unlock_irq(&dev->pending_lock);
+	}
+
+	ret = 0;
+	goto bail;
+
+bail_unlock:
+	spin_unlock_irq(&cq->lock);
+bail_free:
+	vfree(wc);
+bail:
+	return ret;
+}
diff --git a/drivers/infiniband/hw/ipath/ipath_debug.h b/drivers/infiniband/hw/ipath/ipath_debug.h
new file mode 100644
index 0000000..65926cd
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_debug.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _IPATH_DEBUG_H
+#define _IPATH_DEBUG_H
+
+#ifndef _IPATH_DEBUGGING	/* debugging enabled or not */
+#define _IPATH_DEBUGGING 1
+#endif
+
+#if _IPATH_DEBUGGING
+
+/*
+ * Mask values for debugging.  The scheme allows us to compile out any
+ * of the debug tracing stuff, and if compiled in, to enable or disable
+ * dynamically.  This can be set at modprobe time also:
+ *      modprobe infinipath.ko infinipath_debug=7
+ */
+
+#define __IPATH_INFO        0x1	/* generic low verbosity stuff */
+#define __IPATH_DBG         0x2	/* generic debug */
+#define __IPATH_TRSAMPLE    0x8	/* generate trace buffer sample entries */
+/* leave some low verbosity spots open */
+#define __IPATH_VERBDBG     0x40	/* very verbose debug */
+#define __IPATH_PKTDBG      0x80	/* print packet data */
+/* print process startup (init)/exit messages */
+#define __IPATH_PROCDBG     0x100
+/* print mmap/fault stuff, not using VDBG any more */
+#define __IPATH_MMDBG       0x200
+#define __IPATH_ERRPKTDBG   0x400
+#define __IPATH_USER_SEND   0x1000	/* use user mode send */
+#define __IPATH_KERNEL_SEND 0x2000	/* use kernel mode send */
+#define __IPATH_EPKTDBG     0x4000	/* print ethernet packet data */
+#define __IPATH_IPATHDBG    0x10000	/* Ethernet (IPATH) gen debug */
+#define __IPATH_IPATHWARN   0x20000	/* Ethernet (IPATH) warnings */
+#define __IPATH_IPATHERR    0x40000	/* Ethernet (IPATH) errors */
+#define __IPATH_IPATHPD     0x80000	/* Ethernet (IPATH) packet dump */
+#define __IPATH_IPATHTABLE  0x100000	/* Ethernet (IPATH) table dump */
+#define __IPATH_LINKVERBDBG 0x200000	/* very verbose linkchange debug */
+
+#else				/* _IPATH_DEBUGGING */
+
+/*
+ * define all of these even with debugging off, for the few places that do
+ * if(infinipath_debug & _IPATH_xyzzy), but in a way that will make the
+ * compiler eliminate the code
+ */
+
+#define __IPATH_INFO      0x0	/* generic low verbosity stuff */
+#define __IPATH_DBG       0x0	/* generic debug */
+#define __IPATH_TRSAMPLE  0x0	/* generate trace buffer sample entries */
+#define __IPATH_VERBDBG   0x0	/* very verbose debug */
+#define __IPATH_PKTDBG    0x0	/* print packet data */
+#define __IPATH_PROCDBG   0x0	/* process startup (init)/exit messages */
+/* print mmap/fault stuff, not using VDBG any more */
+#define __IPATH_MMDBG     0x0
+#define __IPATH_EPKTDBG   0x0	/* print ethernet packet data */
+#define __IPATH_IPATHDBG  0x0	/* Ethernet (IPATH) table dump on */
+#define __IPATH_IPATHWARN 0x0	/* Ethernet (IPATH) warnings on   */
+#define __IPATH_IPATHERR  0x0	/* Ethernet (IPATH) errors on   */
+#define __IPATH_IPATHPD   0x0	/* Ethernet (IPATH) packet dump on   */
+#define __IPATH_IPATHTABLE 0x0	/* Ethernet (IPATH) packet dump on   */
+#define __IPATH_LINKVERBDBG 0x0	/* very verbose linkchange debug */
+
+#endif				/* _IPATH_DEBUGGING */
+
+#define __IPATH_VERBOSEDBG __IPATH_VERBDBG
+
+#endif				/* _IPATH_DEBUG_H */
diff --git a/drivers/infiniband/hw/ipath/ipath_diag.c b/drivers/infiniband/hw/ipath/ipath_diag.c
new file mode 100644
index 0000000..45802e9
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_diag.c
@@ -0,0 +1,551 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * This file contains support for diagnostic functions.  It is accessed by
+ * opening the ipath_diag device, normally minor number 129.  Diagnostic use
+ * of the InfiniPath chip may render the chip or board unusable until the
+ * driver is unloaded, or in some cases, until the system is rebooted.
+ *
+ * Accesses to the chip through this interface are not similar to going
+ * through the /sys/bus/pci resource mmap interface.
+ */
+
+#include <linux/io.h>
+#include <linux/pci.h>
+#include <linux/vmalloc.h>
+#include <linux/fs.h>
+#include <linux/export.h>
+#include <asm/uaccess.h>
+
+#include "ipath_kernel.h"
+#include "ipath_common.h"
+
+int ipath_diag_inuse;
+static int diag_set_link;
+
+static int ipath_diag_open(struct inode *in, struct file *fp);
+static int ipath_diag_release(struct inode *in, struct file *fp);
+static ssize_t ipath_diag_read(struct file *fp, char __user *data,
+			       size_t count, loff_t *off);
+static ssize_t ipath_diag_write(struct file *fp, const char __user *data,
+				size_t count, loff_t *off);
+
+static const struct file_operations diag_file_ops = {
+	.owner = THIS_MODULE,
+	.write = ipath_diag_write,
+	.read = ipath_diag_read,
+	.open = ipath_diag_open,
+	.release = ipath_diag_release,
+	.llseek = default_llseek,
+};
+
+static ssize_t ipath_diagpkt_write(struct file *fp,
+				   const char __user *data,
+				   size_t count, loff_t *off);
+
+static const struct file_operations diagpkt_file_ops = {
+	.owner = THIS_MODULE,
+	.write = ipath_diagpkt_write,
+	.llseek = noop_llseek,
+};
+
+static atomic_t diagpkt_count = ATOMIC_INIT(0);
+static struct cdev *diagpkt_cdev;
+static struct device *diagpkt_dev;
+
+int ipath_diag_add(struct ipath_devdata *dd)
+{
+	char name[16];
+	int ret = 0;
+
+	if (atomic_inc_return(&diagpkt_count) == 1) {
+		ret = ipath_cdev_init(IPATH_DIAGPKT_MINOR,
+				      "ipath_diagpkt", &diagpkt_file_ops,
+				      &diagpkt_cdev, &diagpkt_dev);
+
+		if (ret) {
+			ipath_dev_err(dd, "Couldn't create ipath_diagpkt "
+				      "device: %d", ret);
+			goto done;
+		}
+	}
+
+	snprintf(name, sizeof(name), "ipath_diag%d", dd->ipath_unit);
+
+	ret = ipath_cdev_init(IPATH_DIAG_MINOR_BASE + dd->ipath_unit, name,
+			      &diag_file_ops, &dd->diag_cdev,
+			      &dd->diag_dev);
+	if (ret)
+		ipath_dev_err(dd, "Couldn't create %s device: %d",
+			      name, ret);
+
+done:
+	return ret;
+}
+
+void ipath_diag_remove(struct ipath_devdata *dd)
+{
+	if (atomic_dec_and_test(&diagpkt_count))
+		ipath_cdev_cleanup(&diagpkt_cdev, &diagpkt_dev);
+
+	ipath_cdev_cleanup(&dd->diag_cdev, &dd->diag_dev);
+}
+
+/**
+ * ipath_read_umem64 - read a 64-bit quantity from the chip into user space
+ * @dd: the infinipath device
+ * @uaddr: the location to store the data in user memory
+ * @caddr: the source chip address (full pointer, not offset)
+ * @count: number of bytes to copy (multiple of 32 bits)
+ *
+ * This function also localizes all chip memory accesses.
+ * The copy should be written such that we read full cacheline packets
+ * from the chip.  This is usually used for a single qword
+ *
+ * NOTE:  This assumes the chip address is 64-bit aligned.
+ */
+static int ipath_read_umem64(struct ipath_devdata *dd, void __user *uaddr,
+			     const void __iomem *caddr, size_t count)
+{
+	const u64 __iomem *reg_addr = caddr;
+	const u64 __iomem *reg_end = reg_addr + (count / sizeof(u64));
+	int ret;
+
+	/* not very efficient, but it works for now */
+	if (reg_addr < dd->ipath_kregbase || reg_end > dd->ipath_kregend) {
+		ret = -EINVAL;
+		goto bail;
+	}
+	while (reg_addr < reg_end) {
+		u64 data = readq(reg_addr);
+		if (copy_to_user(uaddr, &data, sizeof(u64))) {
+			ret = -EFAULT;
+			goto bail;
+		}
+		reg_addr++;
+		uaddr += sizeof(u64);
+	}
+	ret = 0;
+bail:
+	return ret;
+}
+
+/**
+ * ipath_write_umem64 - write a 64-bit quantity to the chip from user space
+ * @dd: the infinipath device
+ * @caddr: the destination chip address (full pointer, not offset)
+ * @uaddr: the source of the data in user memory
+ * @count: the number of bytes to copy (multiple of 32 bits)
+ *
+ * This is usually used for a single qword
+ * NOTE:  This assumes the chip address is 64-bit aligned.
+ */
+
+static int ipath_write_umem64(struct ipath_devdata *dd, void __iomem *caddr,
+			      const void __user *uaddr, size_t count)
+{
+	u64 __iomem *reg_addr = caddr;
+	const u64 __iomem *reg_end = reg_addr + (count / sizeof(u64));
+	int ret;
+
+	/* not very efficient, but it works for now */
+	if (reg_addr < dd->ipath_kregbase || reg_end > dd->ipath_kregend) {
+		ret = -EINVAL;
+		goto bail;
+	}
+	while (reg_addr < reg_end) {
+		u64 data;
+		if (copy_from_user(&data, uaddr, sizeof(data))) {
+			ret = -EFAULT;
+			goto bail;
+		}
+		writeq(data, reg_addr);
+
+		reg_addr++;
+		uaddr += sizeof(u64);
+	}
+	ret = 0;
+bail:
+	return ret;
+}
+
+/**
+ * ipath_read_umem32 - read a 32-bit quantity from the chip into user space
+ * @dd: the infinipath device
+ * @uaddr: the location to store the data in user memory
+ * @caddr: the source chip address (full pointer, not offset)
+ * @count: number of bytes to copy
+ *
+ * read 32 bit values, not 64 bit; for memories that only
+ * support 32 bit reads; usually a single dword.
+ */
+static int ipath_read_umem32(struct ipath_devdata *dd, void __user *uaddr,
+			     const void __iomem *caddr, size_t count)
+{
+	const u32 __iomem *reg_addr = caddr;
+	const u32 __iomem *reg_end = reg_addr + (count / sizeof(u32));
+	int ret;
+
+	if (reg_addr < (u32 __iomem *) dd->ipath_kregbase ||
+	    reg_end > (u32 __iomem *) dd->ipath_kregend) {
+		ret = -EINVAL;
+		goto bail;
+	}
+	/* not very efficient, but it works for now */
+	while (reg_addr < reg_end) {
+		u32 data = readl(reg_addr);
+		if (copy_to_user(uaddr, &data, sizeof(data))) {
+			ret = -EFAULT;
+			goto bail;
+		}
+
+		reg_addr++;
+		uaddr += sizeof(u32);
+
+	}
+	ret = 0;
+bail:
+	return ret;
+}
+
+/**
+ * ipath_write_umem32 - write a 32-bit quantity to the chip from user space
+ * @dd: the infinipath device
+ * @caddr: the destination chip address (full pointer, not offset)
+ * @uaddr: the source of the data in user memory
+ * @count: number of bytes to copy
+ *
+ * write 32 bit values, not 64 bit; for memories that only
+ * support 32 bit write; usually a single dword.
+ */
+
+static int ipath_write_umem32(struct ipath_devdata *dd, void __iomem *caddr,
+			      const void __user *uaddr, size_t count)
+{
+	u32 __iomem *reg_addr = caddr;
+	const u32 __iomem *reg_end = reg_addr + (count / sizeof(u32));
+	int ret;
+
+	if (reg_addr < (u32 __iomem *) dd->ipath_kregbase ||
+	    reg_end > (u32 __iomem *) dd->ipath_kregend) {
+		ret = -EINVAL;
+		goto bail;
+	}
+	while (reg_addr < reg_end) {
+		u32 data;
+		if (copy_from_user(&data, uaddr, sizeof(data))) {
+			ret = -EFAULT;
+			goto bail;
+		}
+		writel(data, reg_addr);
+
+		reg_addr++;
+		uaddr += sizeof(u32);
+	}
+	ret = 0;
+bail:
+	return ret;
+}
+
+static int ipath_diag_open(struct inode *in, struct file *fp)
+{
+	int unit = iminor(in) - IPATH_DIAG_MINOR_BASE;
+	struct ipath_devdata *dd;
+	int ret;
+
+	mutex_lock(&ipath_mutex);
+
+	if (ipath_diag_inuse) {
+		ret = -EBUSY;
+		goto bail;
+	}
+
+	dd = ipath_lookup(unit);
+
+	if (dd == NULL || !(dd->ipath_flags & IPATH_PRESENT) ||
+	    !dd->ipath_kregbase) {
+		ret = -ENODEV;
+		goto bail;
+	}
+
+	fp->private_data = dd;
+	ipath_diag_inuse = -2;
+	diag_set_link = 0;
+	ret = 0;
+
+	/* Only expose a way to reset the device if we
+	   make it into diag mode. */
+	ipath_expose_reset(&dd->pcidev->dev);
+
+bail:
+	mutex_unlock(&ipath_mutex);
+
+	return ret;
+}
+
+/**
+ * ipath_diagpkt_write - write an IB packet
+ * @fp: the diag data device file pointer
+ * @data: ipath_diag_pkt structure saying where to get the packet
+ * @count: size of data to write
+ * @off: unused by this code
+ */
+static ssize_t ipath_diagpkt_write(struct file *fp,
+				   const char __user *data,
+				   size_t count, loff_t *off)
+{
+	u32 __iomem *piobuf;
+	u32 plen, pbufn, maxlen_reserve;
+	struct ipath_diag_pkt odp;
+	struct ipath_diag_xpkt dp;
+	u32 *tmpbuf = NULL;
+	struct ipath_devdata *dd;
+	ssize_t ret = 0;
+	u64 val;
+	u32 l_state, lt_state; /* LinkState, LinkTrainingState */
+
+
+	if (count == sizeof(dp)) {
+		if (copy_from_user(&dp, data, sizeof(dp))) {
+			ret = -EFAULT;
+			goto bail;
+		}
+	} else if (count == sizeof(odp)) {
+		if (copy_from_user(&odp, data, sizeof(odp))) {
+			ret = -EFAULT;
+			goto bail;
+		}
+		dp.len = odp.len;
+		dp.unit = odp.unit;
+		dp.data = odp.data;
+		dp.pbc_wd = 0;
+	} else {
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	/* send count must be an exact number of dwords */
+	if (dp.len & 3) {
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	plen = dp.len >> 2;
+
+	dd = ipath_lookup(dp.unit);
+	if (!dd || !(dd->ipath_flags & IPATH_PRESENT) ||
+	    !dd->ipath_kregbase) {
+		ipath_cdbg(VERBOSE, "illegal unit %u for diag data send\n",
+			   dp.unit);
+		ret = -ENODEV;
+		goto bail;
+	}
+
+	if (ipath_diag_inuse && !diag_set_link &&
+	    !(dd->ipath_flags & IPATH_LINKACTIVE)) {
+		diag_set_link = 1;
+		ipath_cdbg(VERBOSE, "Trying to set to set link active for "
+			   "diag pkt\n");
+		ipath_set_linkstate(dd, IPATH_IB_LINKARM);
+		ipath_set_linkstate(dd, IPATH_IB_LINKACTIVE);
+	}
+
+	if (!(dd->ipath_flags & IPATH_INITTED)) {
+		/* no hardware, freeze, etc. */
+		ipath_cdbg(VERBOSE, "unit %u not usable\n", dd->ipath_unit);
+		ret = -ENODEV;
+		goto bail;
+	}
+	/*
+	 * Want to skip check for l_state if using custom PBC,
+	 * because we might be trying to force an SM packet out.
+	 * first-cut, skip _all_ state checking in that case.
+	 */
+	val = ipath_ib_state(dd, dd->ipath_lastibcstat);
+	lt_state = ipath_ib_linktrstate(dd, dd->ipath_lastibcstat);
+	l_state = ipath_ib_linkstate(dd, dd->ipath_lastibcstat);
+	if (!dp.pbc_wd && (lt_state != INFINIPATH_IBCS_LT_STATE_LINKUP ||
+	    (val != dd->ib_init && val != dd->ib_arm &&
+	    val != dd->ib_active))) {
+		ipath_cdbg(VERBOSE, "unit %u not ready (state %llx)\n",
+			   dd->ipath_unit, (unsigned long long) val);
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	/*
+	 * need total length before first word written, plus 2 Dwords. One Dword
+	 * is for padding so we get the full user data when not aligned on
+	 * a word boundary. The other Dword is to make sure we have room for the
+	 * ICRC which gets tacked on later.
+	 */
+	maxlen_reserve = 2 * sizeof(u32);
+	if (dp.len > dd->ipath_ibmaxlen - maxlen_reserve) {
+		ipath_dbg("Pkt len 0x%x > ibmaxlen %x\n",
+			  dp.len, dd->ipath_ibmaxlen);
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	plen = sizeof(u32) + dp.len;
+
+	tmpbuf = vmalloc(plen);
+	if (!tmpbuf) {
+		dev_info(&dd->pcidev->dev, "Unable to allocate tmp buffer, "
+			 "failing\n");
+		ret = -ENOMEM;
+		goto bail;
+	}
+
+	if (copy_from_user(tmpbuf,
+			   (const void __user *) (unsigned long) dp.data,
+			   dp.len)) {
+		ret = -EFAULT;
+		goto bail;
+	}
+
+	plen >>= 2;		/* in dwords */
+
+	piobuf = ipath_getpiobuf(dd, plen, &pbufn);
+	if (!piobuf) {
+		ipath_cdbg(VERBOSE, "No PIO buffers avail unit for %u\n",
+			   dd->ipath_unit);
+		ret = -EBUSY;
+		goto bail;
+	}
+	/* disarm it just to be extra sure */
+	ipath_disarm_piobufs(dd, pbufn, 1);
+
+	if (ipath_debug & __IPATH_PKTDBG)
+		ipath_cdbg(VERBOSE, "unit %u 0x%x+1w pio%d\n",
+			   dd->ipath_unit, plen - 1, pbufn);
+
+	if (dp.pbc_wd == 0)
+		dp.pbc_wd = plen;
+	writeq(dp.pbc_wd, piobuf);
+	/*
+	 * Copy all by the trigger word, then flush, so it's written
+	 * to chip before trigger word, then write trigger word, then
+	 * flush again, so packet is sent.
+	 */
+	if (dd->ipath_flags & IPATH_PIO_FLUSH_WC) {
+		ipath_flush_wc();
+		__iowrite32_copy(piobuf + 2, tmpbuf, plen - 1);
+		ipath_flush_wc();
+		__raw_writel(tmpbuf[plen - 1], piobuf + plen + 1);
+	} else
+		__iowrite32_copy(piobuf + 2, tmpbuf, plen);
+
+	ipath_flush_wc();
+
+	ret = sizeof(dp);
+
+bail:
+	vfree(tmpbuf);
+	return ret;
+}
+
+static int ipath_diag_release(struct inode *in, struct file *fp)
+{
+	mutex_lock(&ipath_mutex);
+	ipath_diag_inuse = 0;
+	fp->private_data = NULL;
+	mutex_unlock(&ipath_mutex);
+	return 0;
+}
+
+static ssize_t ipath_diag_read(struct file *fp, char __user *data,
+			       size_t count, loff_t *off)
+{
+	struct ipath_devdata *dd = fp->private_data;
+	void __iomem *kreg_base;
+	ssize_t ret;
+
+	kreg_base = dd->ipath_kregbase;
+
+	if (count == 0)
+		ret = 0;
+	else if ((count % 4) || (*off % 4))
+		/* address or length is not 32-bit aligned, hence invalid */
+		ret = -EINVAL;
+	else if (ipath_diag_inuse < 1 && (*off || count != 8))
+		ret = -EINVAL;  /* prevent cat /dev/ipath_diag* */
+	else if ((count % 8) || (*off % 8))
+		/* address or length not 64-bit aligned; do 32-bit reads */
+		ret = ipath_read_umem32(dd, data, kreg_base + *off, count);
+	else
+		ret = ipath_read_umem64(dd, data, kreg_base + *off, count);
+
+	if (ret >= 0) {
+		*off += count;
+		ret = count;
+		if (ipath_diag_inuse == -2)
+			ipath_diag_inuse++;
+	}
+
+	return ret;
+}
+
+static ssize_t ipath_diag_write(struct file *fp, const char __user *data,
+				size_t count, loff_t *off)
+{
+	struct ipath_devdata *dd = fp->private_data;
+	void __iomem *kreg_base;
+	ssize_t ret;
+
+	kreg_base = dd->ipath_kregbase;
+
+	if (count == 0)
+		ret = 0;
+	else if ((count % 4) || (*off % 4))
+		/* address or length is not 32-bit aligned, hence invalid */
+		ret = -EINVAL;
+	else if ((ipath_diag_inuse == -1 && (*off || count != 8)) ||
+		 ipath_diag_inuse == -2)  /* read qw off 0, write qw off 0 */
+		ret = -EINVAL;  /* before any other write allowed */
+	else if ((count % 8) || (*off % 8))
+		/* address or length not 64-bit aligned; do 32-bit writes */
+		ret = ipath_write_umem32(dd, kreg_base + *off, data, count);
+	else
+		ret = ipath_write_umem64(dd, kreg_base + *off, data, count);
+
+	if (ret >= 0) {
+		*off += count;
+		ret = count;
+		if (ipath_diag_inuse == -1)
+			ipath_diag_inuse = 1; /* all read/write OK now */
+	}
+
+	return ret;
+}
diff --git a/drivers/infiniband/hw/ipath/ipath_dma.c b/drivers/infiniband/hw/ipath/ipath_dma.c
new file mode 100644
index 0000000..123a8c0
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_dma.c
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) 2006 QLogic, Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/scatterlist.h>
+#include <linux/gfp.h>
+#include <rdma/ib_verbs.h>
+
+#include "ipath_verbs.h"
+
+#define BAD_DMA_ADDRESS ((u64) 0)
+
+/*
+ * The following functions implement driver specific replacements
+ * for the ib_dma_*() functions.
+ *
+ * These functions return kernel virtual addresses instead of
+ * device bus addresses since the driver uses the CPU to copy
+ * data instead of using hardware DMA.
+ */
+
+static int ipath_mapping_error(struct ib_device *dev, u64 dma_addr)
+{
+	return dma_addr == BAD_DMA_ADDRESS;
+}
+
+static u64 ipath_dma_map_single(struct ib_device *dev,
+			        void *cpu_addr, size_t size,
+			        enum dma_data_direction direction)
+{
+	BUG_ON(!valid_dma_direction(direction));
+	return (u64) cpu_addr;
+}
+
+static void ipath_dma_unmap_single(struct ib_device *dev,
+				   u64 addr, size_t size,
+				   enum dma_data_direction direction)
+{
+	BUG_ON(!valid_dma_direction(direction));
+}
+
+static u64 ipath_dma_map_page(struct ib_device *dev,
+			      struct page *page,
+			      unsigned long offset,
+			      size_t size,
+			      enum dma_data_direction direction)
+{
+	u64 addr;
+
+	BUG_ON(!valid_dma_direction(direction));
+
+	if (offset + size > PAGE_SIZE) {
+		addr = BAD_DMA_ADDRESS;
+		goto done;
+	}
+
+	addr = (u64) page_address(page);
+	if (addr)
+		addr += offset;
+	/* TODO: handle highmem pages */
+
+done:
+	return addr;
+}
+
+static void ipath_dma_unmap_page(struct ib_device *dev,
+				 u64 addr, size_t size,
+				 enum dma_data_direction direction)
+{
+	BUG_ON(!valid_dma_direction(direction));
+}
+
+static int ipath_map_sg(struct ib_device *dev, struct scatterlist *sgl,
+			int nents, enum dma_data_direction direction)
+{
+	struct scatterlist *sg;
+	u64 addr;
+	int i;
+	int ret = nents;
+
+	BUG_ON(!valid_dma_direction(direction));
+
+	for_each_sg(sgl, sg, nents, i) {
+		addr = (u64) page_address(sg_page(sg));
+		/* TODO: handle highmem pages */
+		if (!addr) {
+			ret = 0;
+			break;
+		}
+		sg->dma_address = addr + sg->offset;
+#ifdef CONFIG_NEED_SG_DMA_LENGTH
+		sg->dma_length = sg->length;
+#endif
+	}
+	return ret;
+}
+
+static void ipath_unmap_sg(struct ib_device *dev,
+			   struct scatterlist *sg, int nents,
+			   enum dma_data_direction direction)
+{
+	BUG_ON(!valid_dma_direction(direction));
+}
+
+static void ipath_sync_single_for_cpu(struct ib_device *dev,
+				      u64 addr,
+				      size_t size,
+				      enum dma_data_direction dir)
+{
+}
+
+static void ipath_sync_single_for_device(struct ib_device *dev,
+					 u64 addr,
+					 size_t size,
+					 enum dma_data_direction dir)
+{
+}
+
+static void *ipath_dma_alloc_coherent(struct ib_device *dev, size_t size,
+				      u64 *dma_handle, gfp_t flag)
+{
+	struct page *p;
+	void *addr = NULL;
+
+	p = alloc_pages(flag, get_order(size));
+	if (p)
+		addr = page_address(p);
+	if (dma_handle)
+		*dma_handle = (u64) addr;
+	return addr;
+}
+
+static void ipath_dma_free_coherent(struct ib_device *dev, size_t size,
+				    void *cpu_addr, u64 dma_handle)
+{
+	free_pages((unsigned long) cpu_addr, get_order(size));
+}
+
+struct ib_dma_mapping_ops ipath_dma_mapping_ops = {
+	.mapping_error = ipath_mapping_error,
+	.map_single = ipath_dma_map_single,
+	.unmap_single = ipath_dma_unmap_single,
+	.map_page = ipath_dma_map_page,
+	.unmap_page = ipath_dma_unmap_page,
+	.map_sg = ipath_map_sg,
+	.unmap_sg = ipath_unmap_sg,
+	.sync_single_for_cpu = ipath_sync_single_for_cpu,
+	.sync_single_for_device = ipath_sync_single_for_device,
+	.alloc_coherent = ipath_dma_alloc_coherent,
+	.free_coherent = ipath_dma_free_coherent
+};
diff --git a/drivers/infiniband/hw/ipath/ipath_driver.c b/drivers/infiniband/hw/ipath/ipath_driver.c
new file mode 100644
index 0000000..bd0caed
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_driver.c
@@ -0,0 +1,2779 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/idr.h>
+#include <linux/pci.h>
+#include <linux/io.h>
+#include <linux/delay.h>
+#include <linux/netdevice.h>
+#include <linux/vmalloc.h>
+#include <linux/bitmap.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+
+#include "ipath_kernel.h"
+#include "ipath_verbs.h"
+
+static void ipath_update_pio_bufs(struct ipath_devdata *);
+
+const char *ipath_get_unit_name(int unit)
+{
+	static char iname[16];
+	snprintf(iname, sizeof iname, "infinipath%u", unit);
+	return iname;
+}
+
+#define DRIVER_LOAD_MSG "QLogic " IPATH_DRV_NAME " loaded: "
+#define PFX IPATH_DRV_NAME ": "
+
+/*
+ * The size has to be longer than this string, so we can append
+ * board/chip information to it in the init code.
+ */
+const char ib_ipath_version[] = IPATH_IDSTR "\n";
+
+static struct idr unit_table;
+DEFINE_SPINLOCK(ipath_devs_lock);
+LIST_HEAD(ipath_dev_list);
+
+wait_queue_head_t ipath_state_wait;
+
+unsigned ipath_debug = __IPATH_INFO;
+
+module_param_named(debug, ipath_debug, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(debug, "mask for debug prints");
+EXPORT_SYMBOL_GPL(ipath_debug);
+
+unsigned ipath_mtu4096 = 1; /* max 4KB IB mtu by default, if supported */
+module_param_named(mtu4096, ipath_mtu4096, uint, S_IRUGO);
+MODULE_PARM_DESC(mtu4096, "enable MTU of 4096 bytes, if supported");
+
+static unsigned ipath_hol_timeout_ms = 13000;
+module_param_named(hol_timeout_ms, ipath_hol_timeout_ms, uint, S_IRUGO);
+MODULE_PARM_DESC(hol_timeout_ms,
+	"duration of user app suspension after link failure");
+
+unsigned ipath_linkrecovery = 1;
+module_param_named(linkrecovery, ipath_linkrecovery, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(linkrecovery, "enable workaround for link recovery issue");
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("QLogic <support@qlogic.com>");
+MODULE_DESCRIPTION("QLogic InfiniPath driver");
+
+/*
+ * Table to translate the LINKTRAININGSTATE portion of
+ * IBCStatus to a human-readable form.
+ */
+const char *ipath_ibcstatus_str[] = {
+	"Disabled",
+	"LinkUp",
+	"PollActive",
+	"PollQuiet",
+	"SleepDelay",
+	"SleepQuiet",
+	"LState6",		/* unused */
+	"LState7",		/* unused */
+	"CfgDebounce",
+	"CfgRcvfCfg",
+	"CfgWaitRmt",
+	"CfgIdle",
+	"RecovRetrain",
+	"CfgTxRevLane",		/* unused before IBA7220 */
+	"RecovWaitRmt",
+	"RecovIdle",
+	/* below were added for IBA7220 */
+	"CfgEnhanced",
+	"CfgTest",
+	"CfgWaitRmtTest",
+	"CfgWaitCfgEnhanced",
+	"SendTS_T",
+	"SendTstIdles",
+	"RcvTS_T",
+	"SendTst_TS1s",
+	"LTState18", "LTState19", "LTState1A", "LTState1B",
+	"LTState1C", "LTState1D", "LTState1E", "LTState1F"
+};
+
+static void ipath_remove_one(struct pci_dev *);
+static int ipath_init_one(struct pci_dev *, const struct pci_device_id *);
+
+/* Only needed for registration, nothing else needs this info */
+#define PCI_VENDOR_ID_PATHSCALE 0x1fc1
+#define PCI_DEVICE_ID_INFINIPATH_HT 0xd
+
+/* Number of seconds before our card status check...  */
+#define STATUS_TIMEOUT 60
+
+static const struct pci_device_id ipath_pci_tbl[] = {
+	{ PCI_DEVICE(PCI_VENDOR_ID_PATHSCALE, PCI_DEVICE_ID_INFINIPATH_HT) },
+	{ 0, }
+};
+
+MODULE_DEVICE_TABLE(pci, ipath_pci_tbl);
+
+static struct pci_driver ipath_driver = {
+	.name = IPATH_DRV_NAME,
+	.probe = ipath_init_one,
+	.remove = ipath_remove_one,
+	.id_table = ipath_pci_tbl,
+	.driver = {
+		.groups = ipath_driver_attr_groups,
+	},
+};
+
+static inline void read_bars(struct ipath_devdata *dd, struct pci_dev *dev,
+			     u32 *bar0, u32 *bar1)
+{
+	int ret;
+
+	ret = pci_read_config_dword(dev, PCI_BASE_ADDRESS_0, bar0);
+	if (ret)
+		ipath_dev_err(dd, "failed to read bar0 before enable: "
+			      "error %d\n", -ret);
+
+	ret = pci_read_config_dword(dev, PCI_BASE_ADDRESS_1, bar1);
+	if (ret)
+		ipath_dev_err(dd, "failed to read bar1 before enable: "
+			      "error %d\n", -ret);
+
+	ipath_dbg("Read bar0 %x bar1 %x\n", *bar0, *bar1);
+}
+
+static void ipath_free_devdata(struct pci_dev *pdev,
+			       struct ipath_devdata *dd)
+{
+	unsigned long flags;
+
+	pci_set_drvdata(pdev, NULL);
+
+	if (dd->ipath_unit != -1) {
+		spin_lock_irqsave(&ipath_devs_lock, flags);
+		idr_remove(&unit_table, dd->ipath_unit);
+		list_del(&dd->ipath_list);
+		spin_unlock_irqrestore(&ipath_devs_lock, flags);
+	}
+	vfree(dd);
+}
+
+static struct ipath_devdata *ipath_alloc_devdata(struct pci_dev *pdev)
+{
+	unsigned long flags;
+	struct ipath_devdata *dd;
+	int ret;
+
+	dd = vzalloc(sizeof(*dd));
+	if (!dd) {
+		dd = ERR_PTR(-ENOMEM);
+		goto bail;
+	}
+	dd->ipath_unit = -1;
+
+	idr_preload(GFP_KERNEL);
+	spin_lock_irqsave(&ipath_devs_lock, flags);
+
+	ret = idr_alloc(&unit_table, dd, 0, 0, GFP_NOWAIT);
+	if (ret < 0) {
+		printk(KERN_ERR IPATH_DRV_NAME
+		       ": Could not allocate unit ID: error %d\n", -ret);
+		ipath_free_devdata(pdev, dd);
+		dd = ERR_PTR(ret);
+		goto bail_unlock;
+	}
+	dd->ipath_unit = ret;
+
+	dd->pcidev = pdev;
+	pci_set_drvdata(pdev, dd);
+
+	list_add(&dd->ipath_list, &ipath_dev_list);
+
+bail_unlock:
+	spin_unlock_irqrestore(&ipath_devs_lock, flags);
+	idr_preload_end();
+bail:
+	return dd;
+}
+
+static inline struct ipath_devdata *__ipath_lookup(int unit)
+{
+	return idr_find(&unit_table, unit);
+}
+
+struct ipath_devdata *ipath_lookup(int unit)
+{
+	struct ipath_devdata *dd;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ipath_devs_lock, flags);
+	dd = __ipath_lookup(unit);
+	spin_unlock_irqrestore(&ipath_devs_lock, flags);
+
+	return dd;
+}
+
+int ipath_count_units(int *npresentp, int *nupp, int *maxportsp)
+{
+	int nunits, npresent, nup;
+	struct ipath_devdata *dd;
+	unsigned long flags;
+	int maxports;
+
+	nunits = npresent = nup = maxports = 0;
+
+	spin_lock_irqsave(&ipath_devs_lock, flags);
+
+	list_for_each_entry(dd, &ipath_dev_list, ipath_list) {
+		nunits++;
+		if ((dd->ipath_flags & IPATH_PRESENT) && dd->ipath_kregbase)
+			npresent++;
+		if (dd->ipath_lid &&
+		    !(dd->ipath_flags & (IPATH_DISABLED | IPATH_LINKDOWN
+					 | IPATH_LINKUNK)))
+			nup++;
+		if (dd->ipath_cfgports > maxports)
+			maxports = dd->ipath_cfgports;
+	}
+
+	spin_unlock_irqrestore(&ipath_devs_lock, flags);
+
+	if (npresentp)
+		*npresentp = npresent;
+	if (nupp)
+		*nupp = nup;
+	if (maxportsp)
+		*maxportsp = maxports;
+
+	return nunits;
+}
+
+/*
+ * These next two routines are placeholders in case we don't have per-arch
+ * code for controlling write combining.  If explicit control of write
+ * combining is not available, performance will probably be awful.
+ */
+
+int __attribute__((weak)) ipath_enable_wc(struct ipath_devdata *dd)
+{
+	return -EOPNOTSUPP;
+}
+
+void __attribute__((weak)) ipath_disable_wc(struct ipath_devdata *dd)
+{
+}
+
+/*
+ * Perform a PIO buffer bandwidth write test, to verify proper system
+ * configuration.  Even when all the setup calls work, occasionally
+ * BIOS or other issues can prevent write combining from working, or
+ * can cause other bandwidth problems to the chip.
+ *
+ * This test simply writes the same buffer over and over again, and
+ * measures close to the peak bandwidth to the chip (not testing
+ * data bandwidth to the wire).   On chips that use an address-based
+ * trigger to send packets to the wire, this is easy.  On chips that
+ * use a count to trigger, we want to make sure that the packet doesn't
+ * go out on the wire, or trigger flow control checks.
+ */
+static void ipath_verify_pioperf(struct ipath_devdata *dd)
+{
+	u32 pbnum, cnt, lcnt;
+	u32 __iomem *piobuf;
+	u32 *addr;
+	u64 msecs, emsecs;
+
+	piobuf = ipath_getpiobuf(dd, 0, &pbnum);
+	if (!piobuf) {
+		dev_info(&dd->pcidev->dev,
+			"No PIObufs for checking perf, skipping\n");
+		return;
+	}
+
+	/*
+	 * Enough to give us a reasonable test, less than piobuf size, and
+	 * likely multiple of store buffer length.
+	 */
+	cnt = 1024;
+
+	addr = vmalloc(cnt);
+	if (!addr) {
+		dev_info(&dd->pcidev->dev,
+			"Couldn't get memory for checking PIO perf,"
+			" skipping\n");
+		goto done;
+	}
+
+	preempt_disable();  /* we want reasonably accurate elapsed time */
+	msecs = 1 + jiffies_to_msecs(jiffies);
+	for (lcnt = 0; lcnt < 10000U; lcnt++) {
+		/* wait until we cross msec boundary */
+		if (jiffies_to_msecs(jiffies) >= msecs)
+			break;
+		udelay(1);
+	}
+
+	ipath_disable_armlaunch(dd);
+
+	/*
+	 * length 0, no dwords actually sent, and mark as VL15
+	 * on chips where that may matter (due to IB flowcontrol)
+	 */
+	if ((dd->ipath_flags & IPATH_HAS_PBC_CNT))
+		writeq(1UL << 63, piobuf);
+	else
+		writeq(0, piobuf);
+	ipath_flush_wc();
+
+	/*
+	 * this is only roughly accurate, since even with preempt we
+	 * still take interrupts that could take a while.   Running for
+	 * >= 5 msec seems to get us "close enough" to accurate values
+	 */
+	msecs = jiffies_to_msecs(jiffies);
+	for (emsecs = lcnt = 0; emsecs <= 5UL; lcnt++) {
+		__iowrite32_copy(piobuf + 64, addr, cnt >> 2);
+		emsecs = jiffies_to_msecs(jiffies) - msecs;
+	}
+
+	/* 1 GiB/sec, slightly over IB SDR line rate */
+	if (lcnt < (emsecs * 1024U))
+		ipath_dev_err(dd,
+			"Performance problem: bandwidth to PIO buffers is "
+			"only %u MiB/sec\n",
+			lcnt / (u32) emsecs);
+	else
+		ipath_dbg("PIO buffer bandwidth %u MiB/sec is OK\n",
+			lcnt / (u32) emsecs);
+
+	preempt_enable();
+
+	vfree(addr);
+
+done:
+	/* disarm piobuf, so it's available again */
+	ipath_disarm_piobufs(dd, pbnum, 1);
+	ipath_enable_armlaunch(dd);
+}
+
+static void cleanup_device(struct ipath_devdata *dd);
+
+static int ipath_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
+{
+	int ret, len, j;
+	struct ipath_devdata *dd;
+	unsigned long long addr;
+	u32 bar0 = 0, bar1 = 0;
+
+	dd = ipath_alloc_devdata(pdev);
+	if (IS_ERR(dd)) {
+		ret = PTR_ERR(dd);
+		printk(KERN_ERR IPATH_DRV_NAME
+		       ": Could not allocate devdata: error %d\n", -ret);
+		goto bail;
+	}
+
+	ipath_cdbg(VERBOSE, "initializing unit #%u\n", dd->ipath_unit);
+
+	ret = pci_enable_device(pdev);
+	if (ret) {
+		/* This can happen iff:
+		 *
+		 * We did a chip reset, and then failed to reprogram the
+		 * BAR, or the chip reset due to an internal error.  We then
+		 * unloaded the driver and reloaded it.
+		 *
+		 * Both reset cases set the BAR back to initial state.  For
+		 * the latter case, the AER sticky error bit at offset 0x718
+		 * should be set, but the Linux kernel doesn't yet know
+		 * about that, it appears.  If the original BAR was retained
+		 * in the kernel data structures, this may be OK.
+		 */
+		ipath_dev_err(dd, "enable unit %d failed: error %d\n",
+			      dd->ipath_unit, -ret);
+		goto bail_devdata;
+	}
+	addr = pci_resource_start(pdev, 0);
+	len = pci_resource_len(pdev, 0);
+	ipath_cdbg(VERBOSE, "regbase (0) %llx len %d irq %d, vend %x/%x "
+		   "driver_data %lx\n", addr, len, pdev->irq, ent->vendor,
+		   ent->device, ent->driver_data);
+
+	read_bars(dd, pdev, &bar0, &bar1);
+
+	if (!bar1 && !(bar0 & ~0xf)) {
+		if (addr) {
+			dev_info(&pdev->dev, "BAR is 0 (probable RESET), "
+				 "rewriting as %llx\n", addr);
+			ret = pci_write_config_dword(
+				pdev, PCI_BASE_ADDRESS_0, addr);
+			if (ret) {
+				ipath_dev_err(dd, "rewrite of BAR0 "
+					      "failed: err %d\n", -ret);
+				goto bail_disable;
+			}
+			ret = pci_write_config_dword(
+				pdev, PCI_BASE_ADDRESS_1, addr >> 32);
+			if (ret) {
+				ipath_dev_err(dd, "rewrite of BAR1 "
+					      "failed: err %d\n", -ret);
+				goto bail_disable;
+			}
+		} else {
+			ipath_dev_err(dd, "BAR is 0 (probable RESET), "
+				      "not usable until reboot\n");
+			ret = -ENODEV;
+			goto bail_disable;
+		}
+	}
+
+	ret = pci_request_regions(pdev, IPATH_DRV_NAME);
+	if (ret) {
+		dev_info(&pdev->dev, "pci_request_regions unit %u fails: "
+			 "err %d\n", dd->ipath_unit, -ret);
+		goto bail_disable;
+	}
+
+	ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
+	if (ret) {
+		/*
+		 * if the 64 bit setup fails, try 32 bit.  Some systems
+		 * do not setup 64 bit maps on systems with 2GB or less
+		 * memory installed.
+		 */
+		ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
+		if (ret) {
+			dev_info(&pdev->dev,
+				"Unable to set DMA mask for unit %u: %d\n",
+				dd->ipath_unit, ret);
+			goto bail_regions;
+		}
+		else {
+			ipath_dbg("No 64bit DMA mask, used 32 bit mask\n");
+			ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32));
+			if (ret)
+				dev_info(&pdev->dev,
+					"Unable to set DMA consistent mask "
+					"for unit %u: %d\n",
+					dd->ipath_unit, ret);
+
+		}
+	}
+	else {
+		ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
+		if (ret)
+			dev_info(&pdev->dev,
+				"Unable to set DMA consistent mask "
+				"for unit %u: %d\n",
+				dd->ipath_unit, ret);
+	}
+
+	pci_set_master(pdev);
+
+	/*
+	 * Save BARs to rewrite after device reset.  Save all 64 bits of
+	 * BAR, just in case.
+	 */
+	dd->ipath_pcibar0 = addr;
+	dd->ipath_pcibar1 = addr >> 32;
+	dd->ipath_deviceid = ent->device;	/* save for later use */
+	dd->ipath_vendorid = ent->vendor;
+
+	/* setup the chip-specific functions, as early as possible. */
+	switch (ent->device) {
+	case PCI_DEVICE_ID_INFINIPATH_HT:
+		ipath_init_iba6110_funcs(dd);
+		break;
+
+	default:
+		ipath_dev_err(dd, "Found unknown QLogic deviceid 0x%x, "
+			      "failing\n", ent->device);
+		return -ENODEV;
+	}
+
+	for (j = 0; j < 6; j++) {
+		if (!pdev->resource[j].start)
+			continue;
+		ipath_cdbg(VERBOSE, "BAR %d %pR, len %llx\n",
+			   j, &pdev->resource[j],
+			   (unsigned long long)pci_resource_len(pdev, j));
+	}
+
+	if (!addr) {
+		ipath_dev_err(dd, "No valid address in BAR 0!\n");
+		ret = -ENODEV;
+		goto bail_regions;
+	}
+
+	dd->ipath_pcirev = pdev->revision;
+
+#if defined(__powerpc__)
+	/* There isn't a generic way to specify writethrough mappings */
+	dd->ipath_kregbase = __ioremap(addr, len,
+		(_PAGE_NO_CACHE|_PAGE_WRITETHRU));
+#else
+	dd->ipath_kregbase = ioremap_nocache(addr, len);
+#endif
+
+	if (!dd->ipath_kregbase) {
+		ipath_dbg("Unable to map io addr %llx to kvirt, failing\n",
+			  addr);
+		ret = -ENOMEM;
+		goto bail_iounmap;
+	}
+	dd->ipath_kregend = (u64 __iomem *)
+		((void __iomem *)dd->ipath_kregbase + len);
+	dd->ipath_physaddr = addr;	/* used for io_remap, etc. */
+	/* for user mmap */
+	ipath_cdbg(VERBOSE, "mapped io addr %llx to kregbase %p\n",
+		   addr, dd->ipath_kregbase);
+
+	if (dd->ipath_f_bus(dd, pdev))
+		ipath_dev_err(dd, "Failed to setup config space; "
+			      "continuing anyway\n");
+
+	/*
+	 * set up our interrupt handler; IRQF_SHARED probably not needed,
+	 * since MSI interrupts shouldn't be shared but won't  hurt for now.
+	 * check 0 irq after we return from chip-specific bus setup, since
+	 * that can affect this due to setup
+	 */
+	if (!dd->ipath_irq)
+		ipath_dev_err(dd, "irq is 0, BIOS error?  Interrupts won't "
+			      "work\n");
+	else {
+		ret = request_irq(dd->ipath_irq, ipath_intr, IRQF_SHARED,
+				  IPATH_DRV_NAME, dd);
+		if (ret) {
+			ipath_dev_err(dd, "Couldn't setup irq handler, "
+				      "irq=%d: %d\n", dd->ipath_irq, ret);
+			goto bail_iounmap;
+		}
+	}
+
+	ret = ipath_init_chip(dd, 0);	/* do the chip-specific init */
+	if (ret)
+		goto bail_irqsetup;
+
+	ret = ipath_enable_wc(dd);
+
+	if (ret) {
+		ipath_dev_err(dd, "Write combining not enabled "
+			      "(err %d): performance may be poor\n",
+			      -ret);
+		ret = 0;
+	}
+
+	ipath_verify_pioperf(dd);
+
+	ipath_device_create_group(&pdev->dev, dd);
+	ipathfs_add_device(dd);
+	ipath_user_add(dd);
+	ipath_diag_add(dd);
+	ipath_register_ib_device(dd);
+
+	goto bail;
+
+bail_irqsetup:
+	cleanup_device(dd);
+
+	if (dd->ipath_irq)
+		dd->ipath_f_free_irq(dd);
+
+	if (dd->ipath_f_cleanup)
+		dd->ipath_f_cleanup(dd);
+
+bail_iounmap:
+	iounmap((volatile void __iomem *) dd->ipath_kregbase);
+
+bail_regions:
+	pci_release_regions(pdev);
+
+bail_disable:
+	pci_disable_device(pdev);
+
+bail_devdata:
+	ipath_free_devdata(pdev, dd);
+
+bail:
+	return ret;
+}
+
+static void cleanup_device(struct ipath_devdata *dd)
+{
+	int port;
+	struct ipath_portdata **tmp;
+	unsigned long flags;
+
+	if (*dd->ipath_statusp & IPATH_STATUS_CHIP_PRESENT) {
+		/* can't do anything more with chip; needs re-init */
+		*dd->ipath_statusp &= ~IPATH_STATUS_CHIP_PRESENT;
+		if (dd->ipath_kregbase) {
+			/*
+			 * if we haven't already cleaned up before these are
+			 * to ensure any register reads/writes "fail" until
+			 * re-init
+			 */
+			dd->ipath_kregbase = NULL;
+			dd->ipath_uregbase = 0;
+			dd->ipath_sregbase = 0;
+			dd->ipath_cregbase = 0;
+			dd->ipath_kregsize = 0;
+		}
+		ipath_disable_wc(dd);
+	}
+
+	if (dd->ipath_spectriggerhit)
+		dev_info(&dd->pcidev->dev, "%lu special trigger hits\n",
+			 dd->ipath_spectriggerhit);
+
+	if (dd->ipath_pioavailregs_dma) {
+		dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE,
+				  (void *) dd->ipath_pioavailregs_dma,
+				  dd->ipath_pioavailregs_phys);
+		dd->ipath_pioavailregs_dma = NULL;
+	}
+	if (dd->ipath_dummy_hdrq) {
+		dma_free_coherent(&dd->pcidev->dev,
+			dd->ipath_pd[0]->port_rcvhdrq_size,
+			dd->ipath_dummy_hdrq, dd->ipath_dummy_hdrq_phys);
+		dd->ipath_dummy_hdrq = NULL;
+	}
+
+	if (dd->ipath_pageshadow) {
+		struct page **tmpp = dd->ipath_pageshadow;
+		dma_addr_t *tmpd = dd->ipath_physshadow;
+		int i, cnt = 0;
+
+		ipath_cdbg(VERBOSE, "Unlocking any expTID pages still "
+			   "locked\n");
+		for (port = 0; port < dd->ipath_cfgports; port++) {
+			int port_tidbase = port * dd->ipath_rcvtidcnt;
+			int maxtid = port_tidbase + dd->ipath_rcvtidcnt;
+			for (i = port_tidbase; i < maxtid; i++) {
+				if (!tmpp[i])
+					continue;
+				pci_unmap_page(dd->pcidev, tmpd[i],
+					PAGE_SIZE, PCI_DMA_FROMDEVICE);
+				ipath_release_user_pages(&tmpp[i], 1);
+				tmpp[i] = NULL;
+				cnt++;
+			}
+		}
+		if (cnt) {
+			ipath_stats.sps_pageunlocks += cnt;
+			ipath_cdbg(VERBOSE, "There were still %u expTID "
+				   "entries locked\n", cnt);
+		}
+		if (ipath_stats.sps_pagelocks ||
+		    ipath_stats.sps_pageunlocks)
+			ipath_cdbg(VERBOSE, "%llu pages locked, %llu "
+				   "unlocked via ipath_m{un}lock\n",
+				   (unsigned long long)
+				   ipath_stats.sps_pagelocks,
+				   (unsigned long long)
+				   ipath_stats.sps_pageunlocks);
+
+		ipath_cdbg(VERBOSE, "Free shadow page tid array at %p\n",
+			   dd->ipath_pageshadow);
+		tmpp = dd->ipath_pageshadow;
+		dd->ipath_pageshadow = NULL;
+		vfree(tmpp);
+
+		dd->ipath_egrtidbase = NULL;
+	}
+
+	/*
+	 * free any resources still in use (usually just kernel ports)
+	 * at unload; we do for portcnt, because that's what we allocate.
+	 * We acquire lock to be really paranoid that ipath_pd isn't being
+	 * accessed from some interrupt-related code (that should not happen,
+	 * but best to be sure).
+	 */
+	spin_lock_irqsave(&dd->ipath_uctxt_lock, flags);
+	tmp = dd->ipath_pd;
+	dd->ipath_pd = NULL;
+	spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags);
+	for (port = 0; port < dd->ipath_portcnt; port++) {
+		struct ipath_portdata *pd = tmp[port];
+		tmp[port] = NULL; /* debugging paranoia */
+		ipath_free_pddata(dd, pd);
+	}
+	kfree(tmp);
+}
+
+static void ipath_remove_one(struct pci_dev *pdev)
+{
+	struct ipath_devdata *dd = pci_get_drvdata(pdev);
+
+	ipath_cdbg(VERBOSE, "removing, pdev=%p, dd=%p\n", pdev, dd);
+
+	/*
+	 * disable the IB link early, to be sure no new packets arrive, which
+	 * complicates the shutdown process
+	 */
+	ipath_shutdown_device(dd);
+
+	flush_workqueue(ib_wq);
+
+	if (dd->verbs_dev)
+		ipath_unregister_ib_device(dd->verbs_dev);
+
+	ipath_diag_remove(dd);
+	ipath_user_remove(dd);
+	ipathfs_remove_device(dd);
+	ipath_device_remove_group(&pdev->dev, dd);
+
+	ipath_cdbg(VERBOSE, "Releasing pci memory regions, dd %p, "
+		   "unit %u\n", dd, (u32) dd->ipath_unit);
+
+	cleanup_device(dd);
+
+	/*
+	 * turn off rcv, send, and interrupts for all ports, all drivers
+	 * should also hard reset the chip here?
+	 * free up port 0 (kernel) rcvhdr, egr bufs, and eventually tid bufs
+	 * for all versions of the driver, if they were allocated
+	 */
+	if (dd->ipath_irq) {
+		ipath_cdbg(VERBOSE, "unit %u free irq %d\n",
+			   dd->ipath_unit, dd->ipath_irq);
+		dd->ipath_f_free_irq(dd);
+	} else
+		ipath_dbg("irq is 0, not doing free_irq "
+			  "for unit %u\n", dd->ipath_unit);
+	/*
+	 * we check for NULL here, because it's outside
+	 * the kregbase check, and we need to call it
+	 * after the free_irq.	Thus it's possible that
+	 * the function pointers were never initialized.
+	 */
+	if (dd->ipath_f_cleanup)
+		/* clean up chip-specific stuff */
+		dd->ipath_f_cleanup(dd);
+
+	ipath_cdbg(VERBOSE, "Unmapping kregbase %p\n", dd->ipath_kregbase);
+	iounmap((volatile void __iomem *) dd->ipath_kregbase);
+	pci_release_regions(pdev);
+	ipath_cdbg(VERBOSE, "calling pci_disable_device\n");
+	pci_disable_device(pdev);
+
+	ipath_free_devdata(pdev, dd);
+}
+
+/* general driver use */
+DEFINE_MUTEX(ipath_mutex);
+
+static DEFINE_SPINLOCK(ipath_pioavail_lock);
+
+/**
+ * ipath_disarm_piobufs - cancel a range of PIO buffers
+ * @dd: the infinipath device
+ * @first: the first PIO buffer to cancel
+ * @cnt: the number of PIO buffers to cancel
+ *
+ * cancel a range of PIO buffers, used when they might be armed, but
+ * not triggered.  Used at init to ensure buffer state, and also user
+ * process close, in case it died while writing to a PIO buffer
+ * Also after errors.
+ */
+void ipath_disarm_piobufs(struct ipath_devdata *dd, unsigned first,
+			  unsigned cnt)
+{
+	unsigned i, last = first + cnt;
+	unsigned long flags;
+
+	ipath_cdbg(PKT, "disarm %u PIObufs first=%u\n", cnt, first);
+	for (i = first; i < last; i++) {
+		spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+		/*
+		 * The disarm-related bits are write-only, so it
+		 * is ok to OR them in with our copy of sendctrl
+		 * while we hold the lock.
+		 */
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
+			dd->ipath_sendctrl | INFINIPATH_S_DISARM |
+			(i << INFINIPATH_S_DISARMPIOBUF_SHIFT));
+		/* can't disarm bufs back-to-back per iba7220 spec */
+		ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+		spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+	}
+	/* on some older chips, update may not happen after cancel */
+	ipath_force_pio_avail_update(dd);
+}
+
+/**
+ * ipath_wait_linkstate - wait for an IB link state change to occur
+ * @dd: the infinipath device
+ * @state: the state to wait for
+ * @msecs: the number of milliseconds to wait
+ *
+ * wait up to msecs milliseconds for IB link state change to occur for
+ * now, take the easy polling route.  Currently used only by
+ * ipath_set_linkstate.  Returns 0 if state reached, otherwise
+ * -ETIMEDOUT state can have multiple states set, for any of several
+ * transitions.
+ */
+int ipath_wait_linkstate(struct ipath_devdata *dd, u32 state, int msecs)
+{
+	dd->ipath_state_wanted = state;
+	wait_event_interruptible_timeout(ipath_state_wait,
+					 (dd->ipath_flags & state),
+					 msecs_to_jiffies(msecs));
+	dd->ipath_state_wanted = 0;
+
+	if (!(dd->ipath_flags & state)) {
+		u64 val;
+		ipath_cdbg(VERBOSE, "Didn't reach linkstate %s within %u"
+			   " ms\n",
+			   /* test INIT ahead of DOWN, both can be set */
+			   (state & IPATH_LINKINIT) ? "INIT" :
+			   ((state & IPATH_LINKDOWN) ? "DOWN" :
+			    ((state & IPATH_LINKARMED) ? "ARM" : "ACTIVE")),
+			   msecs);
+		val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus);
+		ipath_cdbg(VERBOSE, "ibcc=%llx ibcstatus=%llx (%s)\n",
+			   (unsigned long long) ipath_read_kreg64(
+				   dd, dd->ipath_kregs->kr_ibcctrl),
+			   (unsigned long long) val,
+			   ipath_ibcstatus_str[val & dd->ibcs_lts_mask]);
+	}
+	return (dd->ipath_flags & state) ? 0 : -ETIMEDOUT;
+}
+
+static void decode_sdma_errs(struct ipath_devdata *dd, ipath_err_t err,
+	char *buf, size_t blen)
+{
+	static const struct {
+		ipath_err_t err;
+		const char *msg;
+	} errs[] = {
+		{ INFINIPATH_E_SDMAGENMISMATCH, "SDmaGenMismatch" },
+		{ INFINIPATH_E_SDMAOUTOFBOUND, "SDmaOutOfBound" },
+		{ INFINIPATH_E_SDMATAILOUTOFBOUND, "SDmaTailOutOfBound" },
+		{ INFINIPATH_E_SDMABASE, "SDmaBase" },
+		{ INFINIPATH_E_SDMA1STDESC, "SDma1stDesc" },
+		{ INFINIPATH_E_SDMARPYTAG, "SDmaRpyTag" },
+		{ INFINIPATH_E_SDMADWEN, "SDmaDwEn" },
+		{ INFINIPATH_E_SDMAMISSINGDW, "SDmaMissingDw" },
+		{ INFINIPATH_E_SDMAUNEXPDATA, "SDmaUnexpData" },
+		{ INFINIPATH_E_SDMADESCADDRMISALIGN, "SDmaDescAddrMisalign" },
+		{ INFINIPATH_E_SENDBUFMISUSE, "SendBufMisuse" },
+		{ INFINIPATH_E_SDMADISABLED, "SDmaDisabled" },
+	};
+	int i;
+	int expected;
+	size_t bidx = 0;
+
+	for (i = 0; i < ARRAY_SIZE(errs); i++) {
+		expected = (errs[i].err != INFINIPATH_E_SDMADISABLED) ? 0 :
+			test_bit(IPATH_SDMA_ABORTING, &dd->ipath_sdma_status);
+		if ((err & errs[i].err) && !expected)
+			bidx += snprintf(buf + bidx, blen - bidx,
+					 "%s ", errs[i].msg);
+	}
+}
+
+/*
+ * Decode the error status into strings, deciding whether to always
+ * print * it or not depending on "normal packet errors" vs everything
+ * else.   Return 1 if "real" errors, otherwise 0 if only packet
+ * errors, so caller can decide what to print with the string.
+ */
+int ipath_decode_err(struct ipath_devdata *dd, char *buf, size_t blen,
+	ipath_err_t err)
+{
+	int iserr = 1;
+	*buf = '\0';
+	if (err & INFINIPATH_E_PKTERRS) {
+		if (!(err & ~INFINIPATH_E_PKTERRS))
+			iserr = 0; // if only packet errors.
+		if (ipath_debug & __IPATH_ERRPKTDBG) {
+			if (err & INFINIPATH_E_REBP)
+				strlcat(buf, "EBP ", blen);
+			if (err & INFINIPATH_E_RVCRC)
+				strlcat(buf, "VCRC ", blen);
+			if (err & INFINIPATH_E_RICRC) {
+				strlcat(buf, "CRC ", blen);
+				// clear for check below, so only once
+				err &= INFINIPATH_E_RICRC;
+			}
+			if (err & INFINIPATH_E_RSHORTPKTLEN)
+				strlcat(buf, "rshortpktlen ", blen);
+			if (err & INFINIPATH_E_SDROPPEDDATAPKT)
+				strlcat(buf, "sdroppeddatapkt ", blen);
+			if (err & INFINIPATH_E_SPKTLEN)
+				strlcat(buf, "spktlen ", blen);
+		}
+		if ((err & INFINIPATH_E_RICRC) &&
+			!(err&(INFINIPATH_E_RVCRC|INFINIPATH_E_REBP)))
+			strlcat(buf, "CRC ", blen);
+		if (!iserr)
+			goto done;
+	}
+	if (err & INFINIPATH_E_RHDRLEN)
+		strlcat(buf, "rhdrlen ", blen);
+	if (err & INFINIPATH_E_RBADTID)
+		strlcat(buf, "rbadtid ", blen);
+	if (err & INFINIPATH_E_RBADVERSION)
+		strlcat(buf, "rbadversion ", blen);
+	if (err & INFINIPATH_E_RHDR)
+		strlcat(buf, "rhdr ", blen);
+	if (err & INFINIPATH_E_SENDSPECIALTRIGGER)
+		strlcat(buf, "sendspecialtrigger ", blen);
+	if (err & INFINIPATH_E_RLONGPKTLEN)
+		strlcat(buf, "rlongpktlen ", blen);
+	if (err & INFINIPATH_E_RMAXPKTLEN)
+		strlcat(buf, "rmaxpktlen ", blen);
+	if (err & INFINIPATH_E_RMINPKTLEN)
+		strlcat(buf, "rminpktlen ", blen);
+	if (err & INFINIPATH_E_SMINPKTLEN)
+		strlcat(buf, "sminpktlen ", blen);
+	if (err & INFINIPATH_E_RFORMATERR)
+		strlcat(buf, "rformaterr ", blen);
+	if (err & INFINIPATH_E_RUNSUPVL)
+		strlcat(buf, "runsupvl ", blen);
+	if (err & INFINIPATH_E_RUNEXPCHAR)
+		strlcat(buf, "runexpchar ", blen);
+	if (err & INFINIPATH_E_RIBFLOW)
+		strlcat(buf, "ribflow ", blen);
+	if (err & INFINIPATH_E_SUNDERRUN)
+		strlcat(buf, "sunderrun ", blen);
+	if (err & INFINIPATH_E_SPIOARMLAUNCH)
+		strlcat(buf, "spioarmlaunch ", blen);
+	if (err & INFINIPATH_E_SUNEXPERRPKTNUM)
+		strlcat(buf, "sunexperrpktnum ", blen);
+	if (err & INFINIPATH_E_SDROPPEDSMPPKT)
+		strlcat(buf, "sdroppedsmppkt ", blen);
+	if (err & INFINIPATH_E_SMAXPKTLEN)
+		strlcat(buf, "smaxpktlen ", blen);
+	if (err & INFINIPATH_E_SUNSUPVL)
+		strlcat(buf, "sunsupVL ", blen);
+	if (err & INFINIPATH_E_INVALIDADDR)
+		strlcat(buf, "invalidaddr ", blen);
+	if (err & INFINIPATH_E_RRCVEGRFULL)
+		strlcat(buf, "rcvegrfull ", blen);
+	if (err & INFINIPATH_E_RRCVHDRFULL)
+		strlcat(buf, "rcvhdrfull ", blen);
+	if (err & INFINIPATH_E_IBSTATUSCHANGED)
+		strlcat(buf, "ibcstatuschg ", blen);
+	if (err & INFINIPATH_E_RIBLOSTLINK)
+		strlcat(buf, "riblostlink ", blen);
+	if (err & INFINIPATH_E_HARDWARE)
+		strlcat(buf, "hardware ", blen);
+	if (err & INFINIPATH_E_RESET)
+		strlcat(buf, "reset ", blen);
+	if (err & INFINIPATH_E_SDMAERRS)
+		decode_sdma_errs(dd, err, buf, blen);
+	if (err & INFINIPATH_E_INVALIDEEPCMD)
+		strlcat(buf, "invalideepromcmd ", blen);
+done:
+	return iserr;
+}
+
+/**
+ * get_rhf_errstring - decode RHF errors
+ * @err: the err number
+ * @msg: the output buffer
+ * @len: the length of the output buffer
+ *
+ * only used one place now, may want more later
+ */
+static void get_rhf_errstring(u32 err, char *msg, size_t len)
+{
+	/* if no errors, and so don't need to check what's first */
+	*msg = '\0';
+
+	if (err & INFINIPATH_RHF_H_ICRCERR)
+		strlcat(msg, "icrcerr ", len);
+	if (err & INFINIPATH_RHF_H_VCRCERR)
+		strlcat(msg, "vcrcerr ", len);
+	if (err & INFINIPATH_RHF_H_PARITYERR)
+		strlcat(msg, "parityerr ", len);
+	if (err & INFINIPATH_RHF_H_LENERR)
+		strlcat(msg, "lenerr ", len);
+	if (err & INFINIPATH_RHF_H_MTUERR)
+		strlcat(msg, "mtuerr ", len);
+	if (err & INFINIPATH_RHF_H_IHDRERR)
+		/* infinipath hdr checksum error */
+		strlcat(msg, "ipathhdrerr ", len);
+	if (err & INFINIPATH_RHF_H_TIDERR)
+		strlcat(msg, "tiderr ", len);
+	if (err & INFINIPATH_RHF_H_MKERR)
+		/* bad port, offset, etc. */
+		strlcat(msg, "invalid ipathhdr ", len);
+	if (err & INFINIPATH_RHF_H_IBERR)
+		strlcat(msg, "iberr ", len);
+	if (err & INFINIPATH_RHF_L_SWA)
+		strlcat(msg, "swA ", len);
+	if (err & INFINIPATH_RHF_L_SWB)
+		strlcat(msg, "swB ", len);
+}
+
+/**
+ * ipath_get_egrbuf - get an eager buffer
+ * @dd: the infinipath device
+ * @bufnum: the eager buffer to get
+ *
+ * must only be called if ipath_pd[port] is known to be allocated
+ */
+static inline void *ipath_get_egrbuf(struct ipath_devdata *dd, u32 bufnum)
+{
+	return dd->ipath_port0_skbinfo ?
+		(void *) dd->ipath_port0_skbinfo[bufnum].skb->data : NULL;
+}
+
+/**
+ * ipath_alloc_skb - allocate an skb and buffer with possible constraints
+ * @dd: the infinipath device
+ * @gfp_mask: the sk_buff SFP mask
+ */
+struct sk_buff *ipath_alloc_skb(struct ipath_devdata *dd,
+				gfp_t gfp_mask)
+{
+	struct sk_buff *skb;
+	u32 len;
+
+	/*
+	 * Only fully supported way to handle this is to allocate lots
+	 * extra, align as needed, and then do skb_reserve().  That wastes
+	 * a lot of memory...  I'll have to hack this into infinipath_copy
+	 * also.
+	 */
+
+	/*
+	 * We need 2 extra bytes for ipath_ether data sent in the
+	 * key header.  In order to keep everything dword aligned,
+	 * we'll reserve 4 bytes.
+	 */
+	len = dd->ipath_ibmaxlen + 4;
+
+	if (dd->ipath_flags & IPATH_4BYTE_TID) {
+		/* We need a 2KB multiple alignment, and there is no way
+		 * to do it except to allocate extra and then skb_reserve
+		 * enough to bring it up to the right alignment.
+		 */
+		len += 2047;
+	}
+
+	skb = __dev_alloc_skb(len, gfp_mask);
+	if (!skb) {
+		ipath_dev_err(dd, "Failed to allocate skbuff, length %u\n",
+			      len);
+		goto bail;
+	}
+
+	skb_reserve(skb, 4);
+
+	if (dd->ipath_flags & IPATH_4BYTE_TID) {
+		u32 una = (unsigned long)skb->data & 2047;
+		if (una)
+			skb_reserve(skb, 2048 - una);
+	}
+
+bail:
+	return skb;
+}
+
+static void ipath_rcv_hdrerr(struct ipath_devdata *dd,
+			     u32 eflags,
+			     u32 l,
+			     u32 etail,
+			     __le32 *rhf_addr,
+			     struct ipath_message_header *hdr)
+{
+	char emsg[128];
+
+	get_rhf_errstring(eflags, emsg, sizeof emsg);
+	ipath_cdbg(PKT, "RHFerrs %x hdrqtail=%x typ=%u "
+		   "tlen=%x opcode=%x egridx=%x: %s\n",
+		   eflags, l,
+		   ipath_hdrget_rcv_type(rhf_addr),
+		   ipath_hdrget_length_in_bytes(rhf_addr),
+		   be32_to_cpu(hdr->bth[0]) >> 24,
+		   etail, emsg);
+
+	/* Count local link integrity errors. */
+	if (eflags & (INFINIPATH_RHF_H_ICRCERR | INFINIPATH_RHF_H_VCRCERR)) {
+		u8 n = (dd->ipath_ibcctrl >>
+			INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT) &
+			INFINIPATH_IBCC_PHYERRTHRESHOLD_MASK;
+
+		if (++dd->ipath_lli_counter > n) {
+			dd->ipath_lli_counter = 0;
+			dd->ipath_lli_errors++;
+		}
+	}
+}
+
+/*
+ * ipath_kreceive - receive a packet
+ * @pd: the infinipath port
+ *
+ * called from interrupt handler for errors or receive interrupt
+ */
+void ipath_kreceive(struct ipath_portdata *pd)
+{
+	struct ipath_devdata *dd = pd->port_dd;
+	__le32 *rhf_addr;
+	void *ebuf;
+	const u32 rsize = dd->ipath_rcvhdrentsize;	/* words */
+	const u32 maxcnt = dd->ipath_rcvhdrcnt * rsize;	/* words */
+	u32 etail = -1, l, hdrqtail;
+	struct ipath_message_header *hdr;
+	u32 eflags, i, etype, tlen, pkttot = 0, updegr = 0, reloop = 0;
+	static u64 totcalls;	/* stats, may eventually remove */
+	int last;
+
+	l = pd->port_head;
+	rhf_addr = (__le32 *) pd->port_rcvhdrq + l + dd->ipath_rhf_offset;
+	if (dd->ipath_flags & IPATH_NODMA_RTAIL) {
+		u32 seq = ipath_hdrget_seq(rhf_addr);
+
+		if (seq != pd->port_seq_cnt)
+			goto bail;
+		hdrqtail = 0;
+	} else {
+		hdrqtail = ipath_get_rcvhdrtail(pd);
+		if (l == hdrqtail)
+			goto bail;
+		smp_rmb();
+	}
+
+reloop:
+	for (last = 0, i = 1; !last; i += !last) {
+		hdr = dd->ipath_f_get_msgheader(dd, rhf_addr);
+		eflags = ipath_hdrget_err_flags(rhf_addr);
+		etype = ipath_hdrget_rcv_type(rhf_addr);
+		/* total length */
+		tlen = ipath_hdrget_length_in_bytes(rhf_addr);
+		ebuf = NULL;
+		if ((dd->ipath_flags & IPATH_NODMA_RTAIL) ?
+		    ipath_hdrget_use_egr_buf(rhf_addr) :
+		    (etype != RCVHQ_RCV_TYPE_EXPECTED)) {
+			/*
+			 * It turns out that the chip uses an eager buffer
+			 * for all non-expected packets, whether it "needs"
+			 * one or not.  So always get the index, but don't
+			 * set ebuf (so we try to copy data) unless the
+			 * length requires it.
+			 */
+			etail = ipath_hdrget_index(rhf_addr);
+			updegr = 1;
+			if (tlen > sizeof(*hdr) ||
+			    etype == RCVHQ_RCV_TYPE_NON_KD)
+				ebuf = ipath_get_egrbuf(dd, etail);
+		}
+
+		/*
+		 * both tiderr and ipathhdrerr are set for all plain IB
+		 * packets; only ipathhdrerr should be set.
+		 */
+
+		if (etype != RCVHQ_RCV_TYPE_NON_KD &&
+		    etype != RCVHQ_RCV_TYPE_ERROR &&
+		    ipath_hdrget_ipath_ver(hdr->iph.ver_port_tid_offset) !=
+		    IPS_PROTO_VERSION)
+			ipath_cdbg(PKT, "Bad InfiniPath protocol version "
+				   "%x\n", etype);
+
+		if (unlikely(eflags))
+			ipath_rcv_hdrerr(dd, eflags, l, etail, rhf_addr, hdr);
+		else if (etype == RCVHQ_RCV_TYPE_NON_KD) {
+			ipath_ib_rcv(dd->verbs_dev, (u32 *)hdr, ebuf, tlen);
+			if (dd->ipath_lli_counter)
+				dd->ipath_lli_counter--;
+		} else if (etype == RCVHQ_RCV_TYPE_EAGER) {
+			u8 opcode = be32_to_cpu(hdr->bth[0]) >> 24;
+			u32 qp = be32_to_cpu(hdr->bth[1]) & 0xffffff;
+			ipath_cdbg(PKT, "typ %x, opcode %x (eager, "
+				   "qp=%x), len %x; ignored\n",
+				   etype, opcode, qp, tlen);
+		}
+		else if (etype == RCVHQ_RCV_TYPE_EXPECTED)
+			ipath_dbg("Bug: Expected TID, opcode %x; ignored\n",
+				  be32_to_cpu(hdr->bth[0]) >> 24);
+		else {
+			/*
+			 * error packet, type of error unknown.
+			 * Probably type 3, but we don't know, so don't
+			 * even try to print the opcode, etc.
+			 * Usually caused by a "bad packet", that has no
+			 * BTH, when the LRH says it should.
+			 */
+			ipath_cdbg(ERRPKT, "Error Pkt, but no eflags! egrbuf"
+				  " %x, len %x hdrq+%x rhf: %Lx\n",
+				  etail, tlen, l, (unsigned long long)
+				  le64_to_cpu(*(__le64 *) rhf_addr));
+			if (ipath_debug & __IPATH_ERRPKTDBG) {
+				u32 j, *d, dw = rsize-2;
+				if (rsize > (tlen>>2))
+					dw = tlen>>2;
+				d = (u32 *)hdr;
+				printk(KERN_DEBUG "EPkt rcvhdr(%x dw):\n",
+					dw);
+				for (j = 0; j < dw; j++)
+					printk(KERN_DEBUG "%8x%s", d[j],
+						(j%8) == 7 ? "\n" : " ");
+				printk(KERN_DEBUG ".\n");
+			}
+		}
+		l += rsize;
+		if (l >= maxcnt)
+			l = 0;
+		rhf_addr = (__le32 *) pd->port_rcvhdrq +
+			l + dd->ipath_rhf_offset;
+		if (dd->ipath_flags & IPATH_NODMA_RTAIL) {
+			u32 seq = ipath_hdrget_seq(rhf_addr);
+
+			if (++pd->port_seq_cnt > 13)
+				pd->port_seq_cnt = 1;
+			if (seq != pd->port_seq_cnt)
+				last = 1;
+		} else if (l == hdrqtail)
+			last = 1;
+		/*
+		 * update head regs on last packet, and every 16 packets.
+		 * Reduce bus traffic, while still trying to prevent
+		 * rcvhdrq overflows, for when the queue is nearly full
+		 */
+		if (last || !(i & 0xf)) {
+			u64 lval = l;
+
+			/* request IBA6120 and 7220 interrupt only on last */
+			if (last)
+				lval |= dd->ipath_rhdrhead_intr_off;
+			ipath_write_ureg(dd, ur_rcvhdrhead, lval,
+				pd->port_port);
+			if (updegr) {
+				ipath_write_ureg(dd, ur_rcvegrindexhead,
+						 etail, pd->port_port);
+				updegr = 0;
+			}
+		}
+	}
+
+	if (!dd->ipath_rhdrhead_intr_off && !reloop &&
+	    !(dd->ipath_flags & IPATH_NODMA_RTAIL)) {
+		/* IBA6110 workaround; we can have a race clearing chip
+		 * interrupt with another interrupt about to be delivered,
+		 * and can clear it before it is delivered on the GPIO
+		 * workaround.  By doing the extra check here for the
+		 * in-memory tail register updating while we were doing
+		 * earlier packets, we "almost" guarantee we have covered
+		 * that case.
+		 */
+		u32 hqtail = ipath_get_rcvhdrtail(pd);
+		if (hqtail != hdrqtail) {
+			hdrqtail = hqtail;
+			reloop = 1; /* loop 1 extra time at most */
+			goto reloop;
+		}
+	}
+
+	pkttot += i;
+
+	pd->port_head = l;
+
+	if (pkttot > ipath_stats.sps_maxpkts_call)
+		ipath_stats.sps_maxpkts_call = pkttot;
+	ipath_stats.sps_port0pkts += pkttot;
+	ipath_stats.sps_avgpkts_call =
+		ipath_stats.sps_port0pkts / ++totcalls;
+
+bail:;
+}
+
+/**
+ * ipath_update_pio_bufs - update shadow copy of the PIO availability map
+ * @dd: the infinipath device
+ *
+ * called whenever our local copy indicates we have run out of send buffers
+ * NOTE: This can be called from interrupt context by some code
+ * and from non-interrupt context by ipath_getpiobuf().
+ */
+
+static void ipath_update_pio_bufs(struct ipath_devdata *dd)
+{
+	unsigned long flags;
+	int i;
+	const unsigned piobregs = (unsigned)dd->ipath_pioavregs;
+
+	/* If the generation (check) bits have changed, then we update the
+	 * busy bit for the corresponding PIO buffer.  This algorithm will
+	 * modify positions to the value they already have in some cases
+	 * (i.e., no change), but it's faster than changing only the bits
+	 * that have changed.
+	 *
+	 * We would like to do this atomicly, to avoid spinlocks in the
+	 * critical send path, but that's not really possible, given the
+	 * type of changes, and that this routine could be called on
+	 * multiple cpu's simultaneously, so we lock in this routine only,
+	 * to avoid conflicting updates; all we change is the shadow, and
+	 * it's a single 64 bit memory location, so by definition the update
+	 * is atomic in terms of what other cpu's can see in testing the
+	 * bits.  The spin_lock overhead isn't too bad, since it only
+	 * happens when all buffers are in use, so only cpu overhead, not
+	 * latency or bandwidth is affected.
+	 */
+	if (!dd->ipath_pioavailregs_dma) {
+		ipath_dbg("Update shadow pioavail, but regs_dma NULL!\n");
+		return;
+	}
+	if (ipath_debug & __IPATH_VERBDBG) {
+		/* only if packet debug and verbose */
+		volatile __le64 *dma = dd->ipath_pioavailregs_dma;
+		unsigned long *shadow = dd->ipath_pioavailshadow;
+
+		ipath_cdbg(PKT, "Refill avail, dma0=%llx shad0=%lx, "
+			   "d1=%llx s1=%lx, d2=%llx s2=%lx, d3=%llx "
+			   "s3=%lx\n",
+			   (unsigned long long) le64_to_cpu(dma[0]),
+			   shadow[0],
+			   (unsigned long long) le64_to_cpu(dma[1]),
+			   shadow[1],
+			   (unsigned long long) le64_to_cpu(dma[2]),
+			   shadow[2],
+			   (unsigned long long) le64_to_cpu(dma[3]),
+			   shadow[3]);
+		if (piobregs > 4)
+			ipath_cdbg(
+				PKT, "2nd group, dma4=%llx shad4=%lx, "
+				"d5=%llx s5=%lx, d6=%llx s6=%lx, "
+				"d7=%llx s7=%lx\n",
+				(unsigned long long) le64_to_cpu(dma[4]),
+				shadow[4],
+				(unsigned long long) le64_to_cpu(dma[5]),
+				shadow[5],
+				(unsigned long long) le64_to_cpu(dma[6]),
+				shadow[6],
+				(unsigned long long) le64_to_cpu(dma[7]),
+				shadow[7]);
+	}
+	spin_lock_irqsave(&ipath_pioavail_lock, flags);
+	for (i = 0; i < piobregs; i++) {
+		u64 pchbusy, pchg, piov, pnew;
+		/*
+		 * Chip Errata: bug 6641; even and odd qwords>3 are swapped
+		 */
+		if (i > 3 && (dd->ipath_flags & IPATH_SWAP_PIOBUFS))
+			piov = le64_to_cpu(dd->ipath_pioavailregs_dma[i ^ 1]);
+		else
+			piov = le64_to_cpu(dd->ipath_pioavailregs_dma[i]);
+		pchg = dd->ipath_pioavailkernel[i] &
+			~(dd->ipath_pioavailshadow[i] ^ piov);
+		pchbusy = pchg << INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT;
+		if (pchg && (pchbusy & dd->ipath_pioavailshadow[i])) {
+			pnew = dd->ipath_pioavailshadow[i] & ~pchbusy;
+			pnew |= piov & pchbusy;
+			dd->ipath_pioavailshadow[i] = pnew;
+		}
+	}
+	spin_unlock_irqrestore(&ipath_pioavail_lock, flags);
+}
+
+/*
+ * used to force update of pioavailshadow if we can't get a pio buffer.
+ * Needed primarily due to exitting freeze mode after recovering
+ * from errors.  Done lazily, because it's safer (known to not
+ * be writing pio buffers).
+ */
+static void ipath_reset_availshadow(struct ipath_devdata *dd)
+{
+	int i, im;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ipath_pioavail_lock, flags);
+	for (i = 0; i < dd->ipath_pioavregs; i++) {
+		u64 val, oldval;
+		/* deal with 6110 chip bug on high register #s */
+		im = (i > 3 && (dd->ipath_flags & IPATH_SWAP_PIOBUFS)) ?
+			i ^ 1 : i;
+		val = le64_to_cpu(dd->ipath_pioavailregs_dma[im]);
+		/*
+		 * busy out the buffers not in the kernel avail list,
+		 * without changing the generation bits.
+		 */
+		oldval = dd->ipath_pioavailshadow[i];
+		dd->ipath_pioavailshadow[i] = val |
+			((~dd->ipath_pioavailkernel[i] <<
+			INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT) &
+			0xaaaaaaaaaaaaaaaaULL); /* All BUSY bits in qword */
+		if (oldval != dd->ipath_pioavailshadow[i])
+			ipath_dbg("shadow[%d] was %Lx, now %lx\n",
+				i, (unsigned long long) oldval,
+				dd->ipath_pioavailshadow[i]);
+	}
+	spin_unlock_irqrestore(&ipath_pioavail_lock, flags);
+}
+
+/**
+ * ipath_setrcvhdrsize - set the receive header size
+ * @dd: the infinipath device
+ * @rhdrsize: the receive header size
+ *
+ * called from user init code, and also layered driver init
+ */
+int ipath_setrcvhdrsize(struct ipath_devdata *dd, unsigned rhdrsize)
+{
+	int ret = 0;
+
+	if (dd->ipath_flags & IPATH_RCVHDRSZ_SET) {
+		if (dd->ipath_rcvhdrsize != rhdrsize) {
+			dev_info(&dd->pcidev->dev,
+				 "Error: can't set protocol header "
+				 "size %u, already %u\n",
+				 rhdrsize, dd->ipath_rcvhdrsize);
+			ret = -EAGAIN;
+		} else
+			ipath_cdbg(VERBOSE, "Reuse same protocol header "
+				   "size %u\n", dd->ipath_rcvhdrsize);
+	} else if (rhdrsize > (dd->ipath_rcvhdrentsize -
+			       (sizeof(u64) / sizeof(u32)))) {
+		ipath_dbg("Error: can't set protocol header size %u "
+			  "(> max %u)\n", rhdrsize,
+			  dd->ipath_rcvhdrentsize -
+			  (u32) (sizeof(u64) / sizeof(u32)));
+		ret = -EOVERFLOW;
+	} else {
+		dd->ipath_flags |= IPATH_RCVHDRSZ_SET;
+		dd->ipath_rcvhdrsize = rhdrsize;
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvhdrsize,
+				 dd->ipath_rcvhdrsize);
+		ipath_cdbg(VERBOSE, "Set protocol header size to %u\n",
+			   dd->ipath_rcvhdrsize);
+	}
+	return ret;
+}
+
+/*
+ * debugging code and stats updates if no pio buffers available.
+ */
+static noinline void no_pio_bufs(struct ipath_devdata *dd)
+{
+	unsigned long *shadow = dd->ipath_pioavailshadow;
+	__le64 *dma = (__le64 *)dd->ipath_pioavailregs_dma;
+
+	dd->ipath_upd_pio_shadow = 1;
+
+	/*
+	 * not atomic, but if we lose a stat count in a while, that's OK
+	 */
+	ipath_stats.sps_nopiobufs++;
+	if (!(++dd->ipath_consec_nopiobuf % 100000)) {
+		ipath_force_pio_avail_update(dd); /* at start */
+		ipath_dbg("%u tries no piobufavail ts%lx; dmacopy: "
+			"%llx %llx %llx %llx\n"
+			"ipath  shadow:  %lx %lx %lx %lx\n",
+			dd->ipath_consec_nopiobuf,
+			(unsigned long)get_cycles(),
+			(unsigned long long) le64_to_cpu(dma[0]),
+			(unsigned long long) le64_to_cpu(dma[1]),
+			(unsigned long long) le64_to_cpu(dma[2]),
+			(unsigned long long) le64_to_cpu(dma[3]),
+			shadow[0], shadow[1], shadow[2], shadow[3]);
+		/*
+		 * 4 buffers per byte, 4 registers above, cover rest
+		 * below
+		 */
+		if ((dd->ipath_piobcnt2k + dd->ipath_piobcnt4k) >
+		    (sizeof(shadow[0]) * 4 * 4))
+			ipath_dbg("2nd group: dmacopy: "
+				  "%llx %llx %llx %llx\n"
+				  "ipath  shadow:  %lx %lx %lx %lx\n",
+				  (unsigned long long)le64_to_cpu(dma[4]),
+				  (unsigned long long)le64_to_cpu(dma[5]),
+				  (unsigned long long)le64_to_cpu(dma[6]),
+				  (unsigned long long)le64_to_cpu(dma[7]),
+				  shadow[4], shadow[5], shadow[6], shadow[7]);
+
+		/* at end, so update likely happened */
+		ipath_reset_availshadow(dd);
+	}
+}
+
+/*
+ * common code for normal driver pio buffer allocation, and reserved
+ * allocation.
+ *
+ * do appropriate marking as busy, etc.
+ * returns buffer number if one found (>=0), negative number is error.
+ */
+static u32 __iomem *ipath_getpiobuf_range(struct ipath_devdata *dd,
+	u32 *pbufnum, u32 first, u32 last, u32 firsti)
+{
+	int i, j, updated = 0;
+	unsigned piobcnt;
+	unsigned long flags;
+	unsigned long *shadow = dd->ipath_pioavailshadow;
+	u32 __iomem *buf;
+
+	piobcnt = last - first;
+	if (dd->ipath_upd_pio_shadow) {
+		/*
+		 * Minor optimization.  If we had no buffers on last call,
+		 * start out by doing the update; continue and do scan even
+		 * if no buffers were updated, to be paranoid
+		 */
+		ipath_update_pio_bufs(dd);
+		updated++;
+		i = first;
+	} else
+		i = firsti;
+rescan:
+	/*
+	 * while test_and_set_bit() is atomic, we do that and then the
+	 * change_bit(), and the pair is not.  See if this is the cause
+	 * of the remaining armlaunch errors.
+	 */
+	spin_lock_irqsave(&ipath_pioavail_lock, flags);
+	for (j = 0; j < piobcnt; j++, i++) {
+		if (i >= last)
+			i = first;
+		if (__test_and_set_bit((2 * i) + 1, shadow))
+			continue;
+		/* flip generation bit */
+		__change_bit(2 * i, shadow);
+		break;
+	}
+	spin_unlock_irqrestore(&ipath_pioavail_lock, flags);
+
+	if (j == piobcnt) {
+		if (!updated) {
+			/*
+			 * first time through; shadow exhausted, but may be
+			 * buffers available, try an update and then rescan.
+			 */
+			ipath_update_pio_bufs(dd);
+			updated++;
+			i = first;
+			goto rescan;
+		} else if (updated == 1 && piobcnt <=
+			((dd->ipath_sendctrl
+			>> INFINIPATH_S_UPDTHRESH_SHIFT) &
+			INFINIPATH_S_UPDTHRESH_MASK)) {
+			/*
+			 * for chips supporting and using the update
+			 * threshold we need to force an update of the
+			 * in-memory copy if the count is less than the
+			 * thershold, then check one more time.
+			 */
+			ipath_force_pio_avail_update(dd);
+			ipath_update_pio_bufs(dd);
+			updated++;
+			i = first;
+			goto rescan;
+		}
+
+		no_pio_bufs(dd);
+		buf = NULL;
+	} else {
+		if (i < dd->ipath_piobcnt2k)
+			buf = (u32 __iomem *) (dd->ipath_pio2kbase +
+					       i * dd->ipath_palign);
+		else
+			buf = (u32 __iomem *)
+				(dd->ipath_pio4kbase +
+				 (i - dd->ipath_piobcnt2k) * dd->ipath_4kalign);
+		if (pbufnum)
+			*pbufnum = i;
+	}
+
+	return buf;
+}
+
+/**
+ * ipath_getpiobuf - find an available pio buffer
+ * @dd: the infinipath device
+ * @plen: the size of the PIO buffer needed in 32-bit words
+ * @pbufnum: the buffer number is placed here
+ */
+u32 __iomem *ipath_getpiobuf(struct ipath_devdata *dd, u32 plen, u32 *pbufnum)
+{
+	u32 __iomem *buf;
+	u32 pnum, nbufs;
+	u32 first, lasti;
+
+	if (plen + 1 >= IPATH_SMALLBUF_DWORDS) {
+		first = dd->ipath_piobcnt2k;
+		lasti = dd->ipath_lastpioindexl;
+	} else {
+		first = 0;
+		lasti = dd->ipath_lastpioindex;
+	}
+	nbufs = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k;
+	buf = ipath_getpiobuf_range(dd, &pnum, first, nbufs, lasti);
+
+	if (buf) {
+		/*
+		 * Set next starting place.  It's just an optimization,
+		 * it doesn't matter who wins on this, so no locking
+		 */
+		if (plen + 1 >= IPATH_SMALLBUF_DWORDS)
+			dd->ipath_lastpioindexl = pnum + 1;
+		else
+			dd->ipath_lastpioindex = pnum + 1;
+		if (dd->ipath_upd_pio_shadow)
+			dd->ipath_upd_pio_shadow = 0;
+		if (dd->ipath_consec_nopiobuf)
+			dd->ipath_consec_nopiobuf = 0;
+		ipath_cdbg(VERBOSE, "Return piobuf%u %uk @ %p\n",
+			   pnum, (pnum < dd->ipath_piobcnt2k) ? 2 : 4, buf);
+		if (pbufnum)
+			*pbufnum = pnum;
+
+	}
+	return buf;
+}
+
+/**
+ * ipath_chg_pioavailkernel - change which send buffers are available for kernel
+ * @dd: the infinipath device
+ * @start: the starting send buffer number
+ * @len: the number of send buffers
+ * @avail: true if the buffers are available for kernel use, false otherwise
+ */
+void ipath_chg_pioavailkernel(struct ipath_devdata *dd, unsigned start,
+			      unsigned len, int avail)
+{
+	unsigned long flags;
+	unsigned end, cnt = 0;
+
+	/* There are two bits per send buffer (busy and generation) */
+	start *= 2;
+	end = start + len * 2;
+
+	spin_lock_irqsave(&ipath_pioavail_lock, flags);
+	/* Set or clear the busy bit in the shadow. */
+	while (start < end) {
+		if (avail) {
+			unsigned long dma;
+			int i, im;
+			/*
+			 * the BUSY bit will never be set, because we disarm
+			 * the user buffers before we hand them back to the
+			 * kernel.  We do have to make sure the generation
+			 * bit is set correctly in shadow, since it could
+			 * have changed many times while allocated to user.
+			 * We can't use the bitmap functions on the full
+			 * dma array because it is always little-endian, so
+			 * we have to flip to host-order first.
+			 * BITS_PER_LONG is slightly wrong, since it's
+			 * always 64 bits per register in chip...
+			 * We only work on 64 bit kernels, so that's OK.
+			 */
+			/* deal with 6110 chip bug on high register #s */
+			i = start / BITS_PER_LONG;
+			im = (i > 3 && (dd->ipath_flags & IPATH_SWAP_PIOBUFS)) ?
+				i ^ 1 : i;
+			__clear_bit(INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT
+				+ start, dd->ipath_pioavailshadow);
+			dma = (unsigned long) le64_to_cpu(
+				dd->ipath_pioavailregs_dma[im]);
+			if (test_bit((INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT
+				+ start) % BITS_PER_LONG, &dma))
+				__set_bit(INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT
+					+ start, dd->ipath_pioavailshadow);
+			else
+				__clear_bit(INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT
+					+ start, dd->ipath_pioavailshadow);
+			__set_bit(start, dd->ipath_pioavailkernel);
+		} else {
+			__set_bit(start + INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT,
+				dd->ipath_pioavailshadow);
+			__clear_bit(start, dd->ipath_pioavailkernel);
+		}
+		start += 2;
+	}
+
+	if (dd->ipath_pioupd_thresh) {
+		end = 2 * (dd->ipath_piobcnt2k + dd->ipath_piobcnt4k);
+		cnt = bitmap_weight(dd->ipath_pioavailkernel, end);
+	}
+	spin_unlock_irqrestore(&ipath_pioavail_lock, flags);
+
+	/*
+	 * When moving buffers from kernel to user, if number assigned to
+	 * the user is less than the pio update threshold, and threshold
+	 * is supported (cnt was computed > 0), drop the update threshold
+	 * so we update at least once per allocated number of buffers.
+	 * In any case, if the kernel buffers are less than the threshold,
+	 * drop the threshold.  We don't bother increasing it, having once
+	 * decreased it, since it would typically just cycle back and forth.
+	 * If we don't decrease below buffers in use, we can wait a long
+	 * time for an update, until some other context uses PIO buffers.
+	 */
+	if (!avail && len < cnt)
+		cnt = len;
+	if (cnt < dd->ipath_pioupd_thresh) {
+		dd->ipath_pioupd_thresh = cnt;
+		ipath_dbg("Decreased pio update threshold to %u\n",
+			dd->ipath_pioupd_thresh);
+		spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+		dd->ipath_sendctrl &= ~(INFINIPATH_S_UPDTHRESH_MASK
+			<< INFINIPATH_S_UPDTHRESH_SHIFT);
+		dd->ipath_sendctrl |= dd->ipath_pioupd_thresh
+			<< INFINIPATH_S_UPDTHRESH_SHIFT;
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
+			dd->ipath_sendctrl);
+		spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+	}
+}
+
+/**
+ * ipath_create_rcvhdrq - create a receive header queue
+ * @dd: the infinipath device
+ * @pd: the port data
+ *
+ * this must be contiguous memory (from an i/o perspective), and must be
+ * DMA'able (which means for some systems, it will go through an IOMMU,
+ * or be forced into a low address range).
+ */
+int ipath_create_rcvhdrq(struct ipath_devdata *dd,
+			 struct ipath_portdata *pd)
+{
+	int ret = 0;
+
+	if (!pd->port_rcvhdrq) {
+		dma_addr_t phys_hdrqtail;
+		gfp_t gfp_flags = GFP_USER | __GFP_COMP;
+		int amt = ALIGN(dd->ipath_rcvhdrcnt * dd->ipath_rcvhdrentsize *
+				sizeof(u32), PAGE_SIZE);
+
+		pd->port_rcvhdrq = dma_alloc_coherent(
+			&dd->pcidev->dev, amt, &pd->port_rcvhdrq_phys,
+			gfp_flags);
+
+		if (!pd->port_rcvhdrq) {
+			ipath_dev_err(dd, "attempt to allocate %d bytes "
+				      "for port %u rcvhdrq failed\n",
+				      amt, pd->port_port);
+			ret = -ENOMEM;
+			goto bail;
+		}
+
+		if (!(dd->ipath_flags & IPATH_NODMA_RTAIL)) {
+			pd->port_rcvhdrtail_kvaddr = dma_alloc_coherent(
+				&dd->pcidev->dev, PAGE_SIZE, &phys_hdrqtail,
+				GFP_KERNEL);
+			if (!pd->port_rcvhdrtail_kvaddr) {
+				ipath_dev_err(dd, "attempt to allocate 1 page "
+					"for port %u rcvhdrqtailaddr "
+					"failed\n", pd->port_port);
+				ret = -ENOMEM;
+				dma_free_coherent(&dd->pcidev->dev, amt,
+					pd->port_rcvhdrq,
+					pd->port_rcvhdrq_phys);
+				pd->port_rcvhdrq = NULL;
+				goto bail;
+			}
+			pd->port_rcvhdrqtailaddr_phys = phys_hdrqtail;
+			ipath_cdbg(VERBOSE, "port %d hdrtailaddr, %llx "
+				   "physical\n", pd->port_port,
+				   (unsigned long long) phys_hdrqtail);
+		}
+
+		pd->port_rcvhdrq_size = amt;
+
+		ipath_cdbg(VERBOSE, "%d pages at %p (phys %lx) size=%lu "
+			   "for port %u rcvhdr Q\n",
+			   amt >> PAGE_SHIFT, pd->port_rcvhdrq,
+			   (unsigned long) pd->port_rcvhdrq_phys,
+			   (unsigned long) pd->port_rcvhdrq_size,
+			   pd->port_port);
+	}
+	else
+		ipath_cdbg(VERBOSE, "reuse port %d rcvhdrq @%p %llx phys; "
+			   "hdrtailaddr@%p %llx physical\n",
+			   pd->port_port, pd->port_rcvhdrq,
+			   (unsigned long long) pd->port_rcvhdrq_phys,
+			   pd->port_rcvhdrtail_kvaddr, (unsigned long long)
+			   pd->port_rcvhdrqtailaddr_phys);
+
+	/* clear for security and sanity on each use */
+	memset(pd->port_rcvhdrq, 0, pd->port_rcvhdrq_size);
+	if (pd->port_rcvhdrtail_kvaddr)
+		memset(pd->port_rcvhdrtail_kvaddr, 0, PAGE_SIZE);
+
+	/*
+	 * tell chip each time we init it, even if we are re-using previous
+	 * memory (we zero the register at process close)
+	 */
+	ipath_write_kreg_port(dd, dd->ipath_kregs->kr_rcvhdrtailaddr,
+			      pd->port_port, pd->port_rcvhdrqtailaddr_phys);
+	ipath_write_kreg_port(dd, dd->ipath_kregs->kr_rcvhdraddr,
+			      pd->port_port, pd->port_rcvhdrq_phys);
+
+bail:
+	return ret;
+}
+
+
+/*
+ * Flush all sends that might be in the ready to send state, as well as any
+ * that are in the process of being sent.   Used whenever we need to be
+ * sure the send side is idle.  Cleans up all buffer state by canceling
+ * all pio buffers, and issuing an abort, which cleans up anything in the
+ * launch fifo.  The cancel is superfluous on some chip versions, but
+ * it's safer to always do it.
+ * PIOAvail bits are updated by the chip as if normal send had happened.
+ */
+void ipath_cancel_sends(struct ipath_devdata *dd, int restore_sendctrl)
+{
+	unsigned long flags;
+
+	if (dd->ipath_flags & IPATH_IB_AUTONEG_INPROG) {
+		ipath_cdbg(VERBOSE, "Ignore while in autonegotiation\n");
+		goto bail;
+	}
+	/*
+	 * If we have SDMA, and it's not disabled, we have to kick off the
+	 * abort state machine, provided we aren't already aborting.
+	 * If we are in the process of aborting SDMA (!DISABLED, but ABORTING),
+	 * we skip the rest of this routine. It is already "in progress"
+	 */
+	if (dd->ipath_flags & IPATH_HAS_SEND_DMA) {
+		int skip_cancel;
+		unsigned long *statp = &dd->ipath_sdma_status;
+
+		spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
+		skip_cancel =
+			test_and_set_bit(IPATH_SDMA_ABORTING, statp)
+			&& !test_bit(IPATH_SDMA_DISABLED, statp);
+		spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+		if (skip_cancel)
+			goto bail;
+	}
+
+	ipath_dbg("Cancelling all in-progress send buffers\n");
+
+	/* skip armlaunch errs for a while */
+	dd->ipath_lastcancel = jiffies + HZ / 2;
+
+	/*
+	 * The abort bit is auto-clearing.  We also don't want pioavail
+	 * update happening during this, and we don't want any other
+	 * sends going out, so turn those off for the duration.  We read
+	 * the scratch register to be sure that cancels and the abort
+	 * have taken effect in the chip.  Otherwise two parts are same
+	 * as ipath_force_pio_avail_update()
+	 */
+	spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+	dd->ipath_sendctrl &= ~(INFINIPATH_S_PIOBUFAVAILUPD
+		| INFINIPATH_S_PIOENABLE);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
+		dd->ipath_sendctrl | INFINIPATH_S_ABORT);
+	ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+	spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+
+	/* disarm all send buffers */
+	ipath_disarm_piobufs(dd, 0,
+		dd->ipath_piobcnt2k + dd->ipath_piobcnt4k);
+
+	if (dd->ipath_flags & IPATH_HAS_SEND_DMA)
+		set_bit(IPATH_SDMA_DISARMED, &dd->ipath_sdma_status);
+
+	if (restore_sendctrl) {
+		/* else done by caller later if needed */
+		spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+		dd->ipath_sendctrl |= INFINIPATH_S_PIOBUFAVAILUPD |
+			INFINIPATH_S_PIOENABLE;
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
+			dd->ipath_sendctrl);
+		/* and again, be sure all have hit the chip */
+		ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+		spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+	}
+
+	if ((dd->ipath_flags & IPATH_HAS_SEND_DMA) &&
+	    !test_bit(IPATH_SDMA_DISABLED, &dd->ipath_sdma_status) &&
+	    test_bit(IPATH_SDMA_RUNNING, &dd->ipath_sdma_status)) {
+		spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
+		/* only wait so long for intr */
+		dd->ipath_sdma_abort_intr_timeout = jiffies + HZ;
+		dd->ipath_sdma_reset_wait = 200;
+		if (!test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status))
+			tasklet_hi_schedule(&dd->ipath_sdma_abort_task);
+		spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+	}
+bail:;
+}
+
+/*
+ * Force an update of in-memory copy of the pioavail registers, when
+ * needed for any of a variety of reasons.  We read the scratch register
+ * to make it highly likely that the update will have happened by the
+ * time we return.  If already off (as in cancel_sends above), this
+ * routine is a nop, on the assumption that the caller will "do the
+ * right thing".
+ */
+void ipath_force_pio_avail_update(struct ipath_devdata *dd)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+	if (dd->ipath_sendctrl & INFINIPATH_S_PIOBUFAVAILUPD) {
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
+			dd->ipath_sendctrl & ~INFINIPATH_S_PIOBUFAVAILUPD);
+		ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
+			dd->ipath_sendctrl);
+		ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+	}
+	spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+}
+
+static void ipath_set_ib_lstate(struct ipath_devdata *dd, int linkcmd,
+				int linitcmd)
+{
+	u64 mod_wd;
+	static const char *what[4] = {
+		[0] = "NOP",
+		[INFINIPATH_IBCC_LINKCMD_DOWN] = "DOWN",
+		[INFINIPATH_IBCC_LINKCMD_ARMED] = "ARMED",
+		[INFINIPATH_IBCC_LINKCMD_ACTIVE] = "ACTIVE"
+	};
+
+	if (linitcmd == INFINIPATH_IBCC_LINKINITCMD_DISABLE) {
+		/*
+		 * If we are told to disable, note that so link-recovery
+		 * code does not attempt to bring us back up.
+		 */
+		preempt_disable();
+		dd->ipath_flags |= IPATH_IB_LINK_DISABLED;
+		preempt_enable();
+	} else if (linitcmd) {
+		/*
+		 * Any other linkinitcmd will lead to LINKDOWN and then
+		 * to INIT (if all is well), so clear flag to let
+		 * link-recovery code attempt to bring us back up.
+		 */
+		preempt_disable();
+		dd->ipath_flags &= ~IPATH_IB_LINK_DISABLED;
+		preempt_enable();
+	}
+
+	mod_wd = (linkcmd << dd->ibcc_lc_shift) |
+		(linitcmd << INFINIPATH_IBCC_LINKINITCMD_SHIFT);
+	ipath_cdbg(VERBOSE,
+		"Moving unit %u to %s (initcmd=0x%x), current ltstate is %s\n",
+		dd->ipath_unit, what[linkcmd], linitcmd,
+		ipath_ibcstatus_str[ipath_ib_linktrstate(dd,
+			ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus))]);
+
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
+			 dd->ipath_ibcctrl | mod_wd);
+	/* read from chip so write is flushed */
+	(void) ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus);
+}
+
+int ipath_set_linkstate(struct ipath_devdata *dd, u8 newstate)
+{
+	u32 lstate;
+	int ret;
+
+	switch (newstate) {
+	case IPATH_IB_LINKDOWN_ONLY:
+		ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_DOWN, 0);
+		/* don't wait */
+		ret = 0;
+		goto bail;
+
+	case IPATH_IB_LINKDOWN:
+		ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_DOWN,
+					INFINIPATH_IBCC_LINKINITCMD_POLL);
+		/* don't wait */
+		ret = 0;
+		goto bail;
+
+	case IPATH_IB_LINKDOWN_SLEEP:
+		ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_DOWN,
+					INFINIPATH_IBCC_LINKINITCMD_SLEEP);
+		/* don't wait */
+		ret = 0;
+		goto bail;
+
+	case IPATH_IB_LINKDOWN_DISABLE:
+		ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_DOWN,
+					INFINIPATH_IBCC_LINKINITCMD_DISABLE);
+		/* don't wait */
+		ret = 0;
+		goto bail;
+
+	case IPATH_IB_LINKARM:
+		if (dd->ipath_flags & IPATH_LINKARMED) {
+			ret = 0;
+			goto bail;
+		}
+		if (!(dd->ipath_flags &
+		      (IPATH_LINKINIT | IPATH_LINKACTIVE))) {
+			ret = -EINVAL;
+			goto bail;
+		}
+		ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_ARMED, 0);
+
+		/*
+		 * Since the port can transition to ACTIVE by receiving
+		 * a non VL 15 packet, wait for either state.
+		 */
+		lstate = IPATH_LINKARMED | IPATH_LINKACTIVE;
+		break;
+
+	case IPATH_IB_LINKACTIVE:
+		if (dd->ipath_flags & IPATH_LINKACTIVE) {
+			ret = 0;
+			goto bail;
+		}
+		if (!(dd->ipath_flags & IPATH_LINKARMED)) {
+			ret = -EINVAL;
+			goto bail;
+		}
+		ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_ACTIVE, 0);
+		lstate = IPATH_LINKACTIVE;
+		break;
+
+	case IPATH_IB_LINK_LOOPBACK:
+		dev_info(&dd->pcidev->dev, "Enabling IB local loopback\n");
+		dd->ipath_ibcctrl |= INFINIPATH_IBCC_LOOPBACK;
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
+				 dd->ipath_ibcctrl);
+
+		/* turn heartbeat off, as it causes loopback to fail */
+		dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT,
+				       IPATH_IB_HRTBT_OFF);
+		/* don't wait */
+		ret = 0;
+		goto bail;
+
+	case IPATH_IB_LINK_EXTERNAL:
+		dev_info(&dd->pcidev->dev,
+			"Disabling IB local loopback (normal)\n");
+		dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT,
+				       IPATH_IB_HRTBT_ON);
+		dd->ipath_ibcctrl &= ~INFINIPATH_IBCC_LOOPBACK;
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
+				 dd->ipath_ibcctrl);
+		/* don't wait */
+		ret = 0;
+		goto bail;
+
+	/*
+	 * Heartbeat can be explicitly enabled by the user via
+	 * "hrtbt_enable" "file", and if disabled, trying to enable here
+	 * will have no effect.  Implicit changes (heartbeat off when
+	 * loopback on, and vice versa) are included to ease testing.
+	 */
+	case IPATH_IB_LINK_HRTBT:
+		ret = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT,
+			IPATH_IB_HRTBT_ON);
+		goto bail;
+
+	case IPATH_IB_LINK_NO_HRTBT:
+		ret = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT,
+			IPATH_IB_HRTBT_OFF);
+		goto bail;
+
+	default:
+		ipath_dbg("Invalid linkstate 0x%x requested\n", newstate);
+		ret = -EINVAL;
+		goto bail;
+	}
+	ret = ipath_wait_linkstate(dd, lstate, 2000);
+
+bail:
+	return ret;
+}
+
+/**
+ * ipath_set_mtu - set the MTU
+ * @dd: the infinipath device
+ * @arg: the new MTU
+ *
+ * we can handle "any" incoming size, the issue here is whether we
+ * need to restrict our outgoing size.   For now, we don't do any
+ * sanity checking on this, and we don't deal with what happens to
+ * programs that are already running when the size changes.
+ * NOTE: changing the MTU will usually cause the IBC to go back to
+ * link INIT state...
+ */
+int ipath_set_mtu(struct ipath_devdata *dd, u16 arg)
+{
+	u32 piosize;
+	int changed = 0;
+	int ret;
+
+	/*
+	 * mtu is IB data payload max.  It's the largest power of 2 less
+	 * than piosize (or even larger, since it only really controls the
+	 * largest we can receive; we can send the max of the mtu and
+	 * piosize).  We check that it's one of the valid IB sizes.
+	 */
+	if (arg != 256 && arg != 512 && arg != 1024 && arg != 2048 &&
+	    (arg != 4096 || !ipath_mtu4096)) {
+		ipath_dbg("Trying to set invalid mtu %u, failing\n", arg);
+		ret = -EINVAL;
+		goto bail;
+	}
+	if (dd->ipath_ibmtu == arg) {
+		ret = 0;        /* same as current */
+		goto bail;
+	}
+
+	piosize = dd->ipath_ibmaxlen;
+	dd->ipath_ibmtu = arg;
+
+	if (arg >= (piosize - IPATH_PIO_MAXIBHDR)) {
+		/* Only if it's not the initial value (or reset to it) */
+		if (piosize != dd->ipath_init_ibmaxlen) {
+			if (arg > piosize && arg <= dd->ipath_init_ibmaxlen)
+				piosize = dd->ipath_init_ibmaxlen;
+			dd->ipath_ibmaxlen = piosize;
+			changed = 1;
+		}
+	} else if ((arg + IPATH_PIO_MAXIBHDR) != dd->ipath_ibmaxlen) {
+		piosize = arg + IPATH_PIO_MAXIBHDR;
+		ipath_cdbg(VERBOSE, "ibmaxlen was 0x%x, setting to 0x%x "
+			   "(mtu 0x%x)\n", dd->ipath_ibmaxlen, piosize,
+			   arg);
+		dd->ipath_ibmaxlen = piosize;
+		changed = 1;
+	}
+
+	if (changed) {
+		u64 ibc = dd->ipath_ibcctrl, ibdw;
+		/*
+		 * update our housekeeping variables, and set IBC max
+		 * size, same as init code; max IBC is max we allow in
+		 * buffer, less the qword pbc, plus 1 for ICRC, in dwords
+		 */
+		dd->ipath_ibmaxlen = piosize - 2 * sizeof(u32);
+		ibdw = (dd->ipath_ibmaxlen >> 2) + 1;
+		ibc &= ~(INFINIPATH_IBCC_MAXPKTLEN_MASK <<
+			 dd->ibcc_mpl_shift);
+		ibc |= ibdw << dd->ibcc_mpl_shift;
+		dd->ipath_ibcctrl = ibc;
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
+				 dd->ipath_ibcctrl);
+		dd->ipath_f_tidtemplate(dd);
+	}
+
+	ret = 0;
+
+bail:
+	return ret;
+}
+
+int ipath_set_lid(struct ipath_devdata *dd, u32 lid, u8 lmc)
+{
+	dd->ipath_lid = lid;
+	dd->ipath_lmc = lmc;
+
+	dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_LIDLMC, lid |
+		(~((1U << lmc) - 1)) << 16);
+
+	dev_info(&dd->pcidev->dev, "We got a lid: 0x%x\n", lid);
+
+	return 0;
+}
+
+
+/**
+ * ipath_write_kreg_port - write a device's per-port 64-bit kernel register
+ * @dd: the infinipath device
+ * @regno: the register number to write
+ * @port: the port containing the register
+ * @value: the value to write
+ *
+ * Registers that vary with the chip implementation constants (port)
+ * use this routine.
+ */
+void ipath_write_kreg_port(const struct ipath_devdata *dd, ipath_kreg regno,
+			  unsigned port, u64 value)
+{
+	u16 where;
+
+	if (port < dd->ipath_portcnt &&
+	    (regno == dd->ipath_kregs->kr_rcvhdraddr ||
+	     regno == dd->ipath_kregs->kr_rcvhdrtailaddr))
+		where = regno + port;
+	else
+		where = -1;
+
+	ipath_write_kreg(dd, where, value);
+}
+
+/*
+ * Following deal with the "obviously simple" task of overriding the state
+ * of the LEDS, which normally indicate link physical and logical status.
+ * The complications arise in dealing with different hardware mappings
+ * and the board-dependent routine being called from interrupts.
+ * and then there's the requirement to _flash_ them.
+ */
+#define LED_OVER_FREQ_SHIFT 8
+#define LED_OVER_FREQ_MASK (0xFF<<LED_OVER_FREQ_SHIFT)
+/* Below is "non-zero" to force override, but both actual LEDs are off */
+#define LED_OVER_BOTH_OFF (8)
+
+static void ipath_run_led_override(unsigned long opaque)
+{
+	struct ipath_devdata *dd = (struct ipath_devdata *)opaque;
+	int timeoff;
+	int pidx;
+	u64 lstate, ltstate, val;
+
+	if (!(dd->ipath_flags & IPATH_INITTED))
+		return;
+
+	pidx = dd->ipath_led_override_phase++ & 1;
+	dd->ipath_led_override = dd->ipath_led_override_vals[pidx];
+	timeoff = dd->ipath_led_override_timeoff;
+
+	/*
+	 * below potentially restores the LED values per current status,
+	 * should also possibly setup the traffic-blink register,
+	 * but leave that to per-chip functions.
+	 */
+	val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus);
+	ltstate = ipath_ib_linktrstate(dd, val);
+	lstate = ipath_ib_linkstate(dd, val);
+
+	dd->ipath_f_setextled(dd, lstate, ltstate);
+	mod_timer(&dd->ipath_led_override_timer, jiffies + timeoff);
+}
+
+void ipath_set_led_override(struct ipath_devdata *dd, unsigned int val)
+{
+	int timeoff, freq;
+
+	if (!(dd->ipath_flags & IPATH_INITTED))
+		return;
+
+	/* First check if we are blinking. If not, use 1HZ polling */
+	timeoff = HZ;
+	freq = (val & LED_OVER_FREQ_MASK) >> LED_OVER_FREQ_SHIFT;
+
+	if (freq) {
+		/* For blink, set each phase from one nybble of val */
+		dd->ipath_led_override_vals[0] = val & 0xF;
+		dd->ipath_led_override_vals[1] = (val >> 4) & 0xF;
+		timeoff = (HZ << 4)/freq;
+	} else {
+		/* Non-blink set both phases the same. */
+		dd->ipath_led_override_vals[0] = val & 0xF;
+		dd->ipath_led_override_vals[1] = val & 0xF;
+	}
+	dd->ipath_led_override_timeoff = timeoff;
+
+	/*
+	 * If the timer has not already been started, do so. Use a "quick"
+	 * timeout so the function will be called soon, to look at our request.
+	 */
+	if (atomic_inc_return(&dd->ipath_led_override_timer_active) == 1) {
+		/* Need to start timer */
+		init_timer(&dd->ipath_led_override_timer);
+		dd->ipath_led_override_timer.function =
+						 ipath_run_led_override;
+		dd->ipath_led_override_timer.data = (unsigned long) dd;
+		dd->ipath_led_override_timer.expires = jiffies + 1;
+		add_timer(&dd->ipath_led_override_timer);
+	} else
+		atomic_dec(&dd->ipath_led_override_timer_active);
+}
+
+/**
+ * ipath_shutdown_device - shut down a device
+ * @dd: the infinipath device
+ *
+ * This is called to make the device quiet when we are about to
+ * unload the driver, and also when the device is administratively
+ * disabled.   It does not free any data structures.
+ * Everything it does has to be setup again by ipath_init_chip(dd,1)
+ */
+void ipath_shutdown_device(struct ipath_devdata *dd)
+{
+	unsigned long flags;
+
+	ipath_dbg("Shutting down the device\n");
+
+	ipath_hol_up(dd); /* make sure user processes aren't suspended */
+
+	dd->ipath_flags |= IPATH_LINKUNK;
+	dd->ipath_flags &= ~(IPATH_INITTED | IPATH_LINKDOWN |
+			     IPATH_LINKINIT | IPATH_LINKARMED |
+			     IPATH_LINKACTIVE);
+	*dd->ipath_statusp &= ~(IPATH_STATUS_IB_CONF |
+				IPATH_STATUS_IB_READY);
+
+	/* mask interrupts, but not errors */
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, 0ULL);
+
+	dd->ipath_rcvctrl = 0;
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
+			 dd->ipath_rcvctrl);
+
+	if (dd->ipath_flags & IPATH_HAS_SEND_DMA)
+		teardown_sdma(dd);
+
+	/*
+	 * gracefully stop all sends allowing any in progress to trickle out
+	 * first.
+	 */
+	spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+	dd->ipath_sendctrl = 0;
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl);
+	/* flush it */
+	ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+	spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+
+	/*
+	 * enough for anything that's going to trickle out to have actually
+	 * done so.
+	 */
+	udelay(5);
+
+	dd->ipath_f_setextled(dd, 0, 0); /* make sure LEDs are off */
+
+	ipath_set_ib_lstate(dd, 0, INFINIPATH_IBCC_LINKINITCMD_DISABLE);
+	ipath_cancel_sends(dd, 0);
+
+	/*
+	 * we are shutting down, so tell components that care.  We don't do
+	 * this on just a link state change, much like ethernet, a cable
+	 * unplug, etc. doesn't change driver state
+	 */
+	signal_ib_event(dd, IB_EVENT_PORT_ERR);
+
+	/* disable IBC */
+	dd->ipath_control &= ~INFINIPATH_C_LINKENABLE;
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_control,
+			 dd->ipath_control | INFINIPATH_C_FREEZEMODE);
+
+	/*
+	 * clear SerdesEnable and turn the leds off; do this here because
+	 * we are unloading, so don't count on interrupts to move along
+	 * Turn the LEDs off explicitly for the same reason.
+	 */
+	dd->ipath_f_quiet_serdes(dd);
+
+	/* stop all the timers that might still be running */
+	del_timer_sync(&dd->ipath_hol_timer);
+	if (dd->ipath_stats_timer_active) {
+		del_timer_sync(&dd->ipath_stats_timer);
+		dd->ipath_stats_timer_active = 0;
+	}
+	if (dd->ipath_intrchk_timer.data) {
+		del_timer_sync(&dd->ipath_intrchk_timer);
+		dd->ipath_intrchk_timer.data = 0;
+	}
+	if (atomic_read(&dd->ipath_led_override_timer_active)) {
+		del_timer_sync(&dd->ipath_led_override_timer);
+		atomic_set(&dd->ipath_led_override_timer_active, 0);
+	}
+
+	/*
+	 * clear all interrupts and errors, so that the next time the driver
+	 * is loaded or device is enabled, we know that whatever is set
+	 * happened while we were unloaded
+	 */
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear,
+			 ~0ULL & ~INFINIPATH_HWE_MEMBISTFAILED);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, -1LL);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, -1LL);
+
+	ipath_cdbg(VERBOSE, "Flush time and errors to EEPROM\n");
+	ipath_update_eeprom_log(dd);
+}
+
+/**
+ * ipath_free_pddata - free a port's allocated data
+ * @dd: the infinipath device
+ * @pd: the portdata structure
+ *
+ * free up any allocated data for a port
+ * This should not touch anything that would affect a simultaneous
+ * re-allocation of port data, because it is called after ipath_mutex
+ * is released (and can be called from reinit as well).
+ * It should never change any chip state, or global driver state.
+ * (The only exception to global state is freeing the port0 port0_skbs.)
+ */
+void ipath_free_pddata(struct ipath_devdata *dd, struct ipath_portdata *pd)
+{
+	if (!pd)
+		return;
+
+	if (pd->port_rcvhdrq) {
+		ipath_cdbg(VERBOSE, "free closed port %d rcvhdrq @ %p "
+			   "(size=%lu)\n", pd->port_port, pd->port_rcvhdrq,
+			   (unsigned long) pd->port_rcvhdrq_size);
+		dma_free_coherent(&dd->pcidev->dev, pd->port_rcvhdrq_size,
+				  pd->port_rcvhdrq, pd->port_rcvhdrq_phys);
+		pd->port_rcvhdrq = NULL;
+		if (pd->port_rcvhdrtail_kvaddr) {
+			dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE,
+					 pd->port_rcvhdrtail_kvaddr,
+					 pd->port_rcvhdrqtailaddr_phys);
+			pd->port_rcvhdrtail_kvaddr = NULL;
+		}
+	}
+	if (pd->port_port && pd->port_rcvegrbuf) {
+		unsigned e;
+
+		for (e = 0; e < pd->port_rcvegrbuf_chunks; e++) {
+			void *base = pd->port_rcvegrbuf[e];
+			size_t size = pd->port_rcvegrbuf_size;
+
+			ipath_cdbg(VERBOSE, "egrbuf free(%p, %lu), "
+				   "chunk %u/%u\n", base,
+				   (unsigned long) size,
+				   e, pd->port_rcvegrbuf_chunks);
+			dma_free_coherent(&dd->pcidev->dev, size,
+				base, pd->port_rcvegrbuf_phys[e]);
+		}
+		kfree(pd->port_rcvegrbuf);
+		pd->port_rcvegrbuf = NULL;
+		kfree(pd->port_rcvegrbuf_phys);
+		pd->port_rcvegrbuf_phys = NULL;
+		pd->port_rcvegrbuf_chunks = 0;
+	} else if (pd->port_port == 0 && dd->ipath_port0_skbinfo) {
+		unsigned e;
+		struct ipath_skbinfo *skbinfo = dd->ipath_port0_skbinfo;
+
+		dd->ipath_port0_skbinfo = NULL;
+		ipath_cdbg(VERBOSE, "free closed port %d "
+			   "ipath_port0_skbinfo @ %p\n", pd->port_port,
+			   skbinfo);
+		for (e = 0; e < dd->ipath_p0_rcvegrcnt; e++)
+			if (skbinfo[e].skb) {
+				pci_unmap_single(dd->pcidev, skbinfo[e].phys,
+						 dd->ipath_ibmaxlen,
+						 PCI_DMA_FROMDEVICE);
+				dev_kfree_skb(skbinfo[e].skb);
+			}
+		vfree(skbinfo);
+	}
+	kfree(pd->port_tid_pg_list);
+	vfree(pd->subport_uregbase);
+	vfree(pd->subport_rcvegrbuf);
+	vfree(pd->subport_rcvhdr_base);
+	kfree(pd);
+}
+
+static int __init infinipath_init(void)
+{
+	int ret;
+
+	if (ipath_debug & __IPATH_DBG)
+		printk(KERN_INFO DRIVER_LOAD_MSG "%s", ib_ipath_version);
+
+	/*
+	 * These must be called before the driver is registered with
+	 * the PCI subsystem.
+	 */
+	idr_init(&unit_table);
+
+	ret = pci_register_driver(&ipath_driver);
+	if (ret < 0) {
+		printk(KERN_ERR IPATH_DRV_NAME
+		       ": Unable to register driver: error %d\n", -ret);
+		goto bail_unit;
+	}
+
+	ret = ipath_init_ipathfs();
+	if (ret < 0) {
+		printk(KERN_ERR IPATH_DRV_NAME ": Unable to create "
+		       "ipathfs: error %d\n", -ret);
+		goto bail_pci;
+	}
+
+	goto bail;
+
+bail_pci:
+	pci_unregister_driver(&ipath_driver);
+
+bail_unit:
+	idr_destroy(&unit_table);
+
+bail:
+	return ret;
+}
+
+static void __exit infinipath_cleanup(void)
+{
+	ipath_exit_ipathfs();
+
+	ipath_cdbg(VERBOSE, "Unregistering pci driver\n");
+	pci_unregister_driver(&ipath_driver);
+
+	idr_destroy(&unit_table);
+}
+
+/**
+ * ipath_reset_device - reset the chip if possible
+ * @unit: the device to reset
+ *
+ * Whether or not reset is successful, we attempt to re-initialize the chip
+ * (that is, much like a driver unload/reload).  We clear the INITTED flag
+ * so that the various entry points will fail until we reinitialize.  For
+ * now, we only allow this if no user ports are open that use chip resources
+ */
+int ipath_reset_device(int unit)
+{
+	int ret, i;
+	struct ipath_devdata *dd = ipath_lookup(unit);
+	unsigned long flags;
+
+	if (!dd) {
+		ret = -ENODEV;
+		goto bail;
+	}
+
+	if (atomic_read(&dd->ipath_led_override_timer_active)) {
+		/* Need to stop LED timer, _then_ shut off LEDs */
+		del_timer_sync(&dd->ipath_led_override_timer);
+		atomic_set(&dd->ipath_led_override_timer_active, 0);
+	}
+
+	/* Shut off LEDs after we are sure timer is not running */
+	dd->ipath_led_override = LED_OVER_BOTH_OFF;
+	dd->ipath_f_setextled(dd, 0, 0);
+
+	dev_info(&dd->pcidev->dev, "Reset on unit %u requested\n", unit);
+
+	if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT)) {
+		dev_info(&dd->pcidev->dev, "Invalid unit number %u or "
+			 "not initialized or not present\n", unit);
+		ret = -ENXIO;
+		goto bail;
+	}
+
+	spin_lock_irqsave(&dd->ipath_uctxt_lock, flags);
+	if (dd->ipath_pd)
+		for (i = 1; i < dd->ipath_cfgports; i++) {
+			if (!dd->ipath_pd[i] || !dd->ipath_pd[i]->port_cnt)
+				continue;
+			spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags);
+			ipath_dbg("unit %u port %d is in use "
+				  "(PID %u cmd %s), can't reset\n",
+				  unit, i,
+				  pid_nr(dd->ipath_pd[i]->port_pid),
+				  dd->ipath_pd[i]->port_comm);
+			ret = -EBUSY;
+			goto bail;
+		}
+	spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags);
+
+	if (dd->ipath_flags & IPATH_HAS_SEND_DMA)
+		teardown_sdma(dd);
+
+	dd->ipath_flags &= ~IPATH_INITTED;
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, 0ULL);
+	ret = dd->ipath_f_reset(dd);
+	if (ret == 1) {
+		ipath_dbg("Reinitializing unit %u after reset attempt\n",
+			  unit);
+		ret = ipath_init_chip(dd, 1);
+	} else
+		ret = -EAGAIN;
+	if (ret)
+		ipath_dev_err(dd, "Reinitialize unit %u after "
+			      "reset failed with %d\n", unit, ret);
+	else
+		dev_info(&dd->pcidev->dev, "Reinitialized unit %u after "
+			 "resetting\n", unit);
+
+bail:
+	return ret;
+}
+
+/*
+ * send a signal to all the processes that have the driver open
+ * through the normal interfaces (i.e., everything other than diags
+ * interface).  Returns number of signalled processes.
+ */
+static int ipath_signal_procs(struct ipath_devdata *dd, int sig)
+{
+	int i, sub, any = 0;
+	struct pid *pid;
+	unsigned long flags;
+
+	if (!dd->ipath_pd)
+		return 0;
+
+	spin_lock_irqsave(&dd->ipath_uctxt_lock, flags);
+	for (i = 1; i < dd->ipath_cfgports; i++) {
+		if (!dd->ipath_pd[i] || !dd->ipath_pd[i]->port_cnt)
+			continue;
+		pid = dd->ipath_pd[i]->port_pid;
+		if (!pid)
+			continue;
+
+		dev_info(&dd->pcidev->dev, "context %d in use "
+			  "(PID %u), sending signal %d\n",
+			  i, pid_nr(pid), sig);
+		kill_pid(pid, sig, 1);
+		any++;
+		for (sub = 0; sub < INFINIPATH_MAX_SUBPORT; sub++) {
+			pid = dd->ipath_pd[i]->port_subpid[sub];
+			if (!pid)
+				continue;
+			dev_info(&dd->pcidev->dev, "sub-context "
+				"%d:%d in use (PID %u), sending "
+				"signal %d\n", i, sub, pid_nr(pid), sig);
+			kill_pid(pid, sig, 1);
+			any++;
+		}
+	}
+	spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags);
+	return any;
+}
+
+static void ipath_hol_signal_down(struct ipath_devdata *dd)
+{
+	if (ipath_signal_procs(dd, SIGSTOP))
+		ipath_dbg("Stopped some processes\n");
+	ipath_cancel_sends(dd, 1);
+}
+
+
+static void ipath_hol_signal_up(struct ipath_devdata *dd)
+{
+	if (ipath_signal_procs(dd, SIGCONT))
+		ipath_dbg("Continued some processes\n");
+}
+
+/*
+ * link is down, stop any users processes, and flush pending sends
+ * to prevent HoL blocking, then start the HoL timer that
+ * periodically continues, then stop procs, so they can detect
+ * link down if they want, and do something about it.
+ * Timer may already be running, so use mod_timer, not add_timer.
+ */
+void ipath_hol_down(struct ipath_devdata *dd)
+{
+	dd->ipath_hol_state = IPATH_HOL_DOWN;
+	ipath_hol_signal_down(dd);
+	dd->ipath_hol_next = IPATH_HOL_DOWNCONT;
+	dd->ipath_hol_timer.expires = jiffies +
+		msecs_to_jiffies(ipath_hol_timeout_ms);
+	mod_timer(&dd->ipath_hol_timer, dd->ipath_hol_timer.expires);
+}
+
+/*
+ * link is up, continue any user processes, and ensure timer
+ * is a nop, if running.  Let timer keep running, if set; it
+ * will nop when it sees the link is up
+ */
+void ipath_hol_up(struct ipath_devdata *dd)
+{
+	ipath_hol_signal_up(dd);
+	dd->ipath_hol_state = IPATH_HOL_UP;
+}
+
+/*
+ * toggle the running/not running state of user proceses
+ * to prevent HoL blocking on chip resources, but still allow
+ * user processes to do link down special case handling.
+ * Should only be called via the timer
+ */
+void ipath_hol_event(unsigned long opaque)
+{
+	struct ipath_devdata *dd = (struct ipath_devdata *)opaque;
+
+	if (dd->ipath_hol_next == IPATH_HOL_DOWNSTOP
+		&& dd->ipath_hol_state != IPATH_HOL_UP) {
+		dd->ipath_hol_next = IPATH_HOL_DOWNCONT;
+		ipath_dbg("Stopping processes\n");
+		ipath_hol_signal_down(dd);
+	} else { /* may do "extra" if also in ipath_hol_up() */
+		dd->ipath_hol_next = IPATH_HOL_DOWNSTOP;
+		ipath_dbg("Continuing processes\n");
+		ipath_hol_signal_up(dd);
+	}
+	if (dd->ipath_hol_state == IPATH_HOL_UP)
+		ipath_dbg("link's up, don't resched timer\n");
+	else {
+		dd->ipath_hol_timer.expires = jiffies +
+			msecs_to_jiffies(ipath_hol_timeout_ms);
+		mod_timer(&dd->ipath_hol_timer,
+			dd->ipath_hol_timer.expires);
+	}
+}
+
+int ipath_set_rx_pol_inv(struct ipath_devdata *dd, u8 new_pol_inv)
+{
+	u64 val;
+
+	if (new_pol_inv > INFINIPATH_XGXS_RX_POL_MASK)
+		return -1;
+	if (dd->ipath_rx_pol_inv != new_pol_inv) {
+		dd->ipath_rx_pol_inv = new_pol_inv;
+		val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig);
+		val &= ~(INFINIPATH_XGXS_RX_POL_MASK <<
+			 INFINIPATH_XGXS_RX_POL_SHIFT);
+		val |= ((u64)dd->ipath_rx_pol_inv) <<
+			INFINIPATH_XGXS_RX_POL_SHIFT;
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_xgxsconfig, val);
+	}
+	return 0;
+}
+
+/*
+ * Disable and enable the armlaunch error.  Used for PIO bandwidth testing on
+ * the 7220, which is count-based, rather than trigger-based.  Safe for the
+ * driver check, since it's at init.   Not completely safe when used for
+ * user-mode checking, since some error checking can be lost, but not
+ * particularly risky, and only has problematic side-effects in the face of
+ * very buggy user code.  There is no reference counting, but that's also
+ * fine, given the intended use.
+ */
+void ipath_enable_armlaunch(struct ipath_devdata *dd)
+{
+	dd->ipath_lasterror &= ~INFINIPATH_E_SPIOARMLAUNCH;
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear,
+		INFINIPATH_E_SPIOARMLAUNCH);
+	dd->ipath_errormask |= INFINIPATH_E_SPIOARMLAUNCH;
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
+		dd->ipath_errormask);
+}
+
+void ipath_disable_armlaunch(struct ipath_devdata *dd)
+{
+	/* so don't re-enable if already set */
+	dd->ipath_maskederrs &= ~INFINIPATH_E_SPIOARMLAUNCH;
+	dd->ipath_errormask &= ~INFINIPATH_E_SPIOARMLAUNCH;
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
+		dd->ipath_errormask);
+}
+
+module_init(infinipath_init);
+module_exit(infinipath_cleanup);
diff --git a/drivers/infiniband/hw/ipath/ipath_eeprom.c b/drivers/infiniband/hw/ipath/ipath_eeprom.c
new file mode 100644
index 0000000..fc71819
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_eeprom.c
@@ -0,0 +1,1183 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/delay.h>
+#include <linux/pci.h>
+#include <linux/vmalloc.h>
+
+#include "ipath_kernel.h"
+
+/*
+ * InfiniPath I2C driver for a serial eeprom.  This is not a generic
+ * I2C interface.  For a start, the device we're using (Atmel AT24C11)
+ * doesn't work like a regular I2C device.  It looks like one
+ * electrically, but not logically.  Normal I2C devices have a single
+ * 7-bit or 10-bit I2C address that they respond to.  Valid 7-bit
+ * addresses range from 0x03 to 0x77.  Addresses 0x00 to 0x02 and 0x78
+ * to 0x7F are special reserved addresses (e.g. 0x00 is the "general
+ * call" address.)  The Atmel device, on the other hand, responds to ALL
+ * 7-bit addresses.  It's designed to be the only device on a given I2C
+ * bus.  A 7-bit address corresponds to the memory address within the
+ * Atmel device itself.
+ *
+ * Also, the timing requirements mean more than simple software
+ * bitbanging, with readbacks from chip to ensure timing (simple udelay
+ * is not enough).
+ *
+ * This all means that accessing the device is specialized enough
+ * that using the standard kernel I2C bitbanging interface would be
+ * impossible.  For example, the core I2C eeprom driver expects to find
+ * a device at one or more of a limited set of addresses only.  It doesn't
+ * allow writing to an eeprom.  It also doesn't provide any means of
+ * accessing eeprom contents from within the kernel, only via sysfs.
+ */
+
+/* Added functionality for IBA7220-based cards */
+#define IPATH_EEPROM_DEV_V1 0xA0
+#define IPATH_EEPROM_DEV_V2 0xA2
+#define IPATH_TEMP_DEV 0x98
+#define IPATH_BAD_DEV (IPATH_EEPROM_DEV_V2+2)
+#define IPATH_NO_DEV (0xFF)
+
+/*
+ * The number of I2C chains is proliferating. Table below brings
+ * some order to the madness. The basic principle is that the
+ * table is scanned from the top, and a "probe" is made to the
+ * device probe_dev. If that succeeds, the chain is considered
+ * to be of that type, and dd->i2c_chain_type is set to the index+1
+ * of the entry.
+ * The +1 is so static initialization can mean "unknown, do probe."
+ */
+static struct i2c_chain_desc {
+	u8 probe_dev;	/* If seen at probe, chain is this type */
+	u8 eeprom_dev;	/* Dev addr (if any) for EEPROM */
+	u8 temp_dev;	/* Dev Addr (if any) for Temp-sense */
+} i2c_chains[] = {
+	{ IPATH_BAD_DEV, IPATH_NO_DEV, IPATH_NO_DEV }, /* pre-iba7220 bds */
+	{ IPATH_EEPROM_DEV_V1, IPATH_EEPROM_DEV_V1, IPATH_TEMP_DEV}, /* V1 */
+	{ IPATH_EEPROM_DEV_V2, IPATH_EEPROM_DEV_V2, IPATH_TEMP_DEV}, /* V2 */
+	{ IPATH_NO_DEV }
+};
+
+enum i2c_type {
+	i2c_line_scl = 0,
+	i2c_line_sda
+};
+
+enum i2c_state {
+	i2c_line_low = 0,
+	i2c_line_high
+};
+
+#define READ_CMD 1
+#define WRITE_CMD 0
+
+/**
+ * i2c_gpio_set - set a GPIO line
+ * @dd: the infinipath device
+ * @line: the line to set
+ * @new_line_state: the state to set
+ *
+ * Returns 0 if the line was set to the new state successfully, non-zero
+ * on error.
+ */
+static int i2c_gpio_set(struct ipath_devdata *dd,
+			enum i2c_type line,
+			enum i2c_state new_line_state)
+{
+	u64 out_mask, dir_mask, *gpioval;
+	unsigned long flags = 0;
+
+	gpioval = &dd->ipath_gpio_out;
+
+	if (line == i2c_line_scl) {
+		dir_mask = dd->ipath_gpio_scl;
+		out_mask = (1UL << dd->ipath_gpio_scl_num);
+	} else {
+		dir_mask = dd->ipath_gpio_sda;
+		out_mask = (1UL << dd->ipath_gpio_sda_num);
+	}
+
+	spin_lock_irqsave(&dd->ipath_gpio_lock, flags);
+	if (new_line_state == i2c_line_high) {
+		/* tri-state the output rather than force high */
+		dd->ipath_extctrl &= ~dir_mask;
+	} else {
+		/* config line to be an output */
+		dd->ipath_extctrl |= dir_mask;
+	}
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_extctrl, dd->ipath_extctrl);
+
+	/* set output as well (no real verify) */
+	if (new_line_state == i2c_line_high)
+		*gpioval |= out_mask;
+	else
+		*gpioval &= ~out_mask;
+
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_out, *gpioval);
+	spin_unlock_irqrestore(&dd->ipath_gpio_lock, flags);
+
+	return 0;
+}
+
+/**
+ * i2c_gpio_get - get a GPIO line state
+ * @dd: the infinipath device
+ * @line: the line to get
+ * @curr_statep: where to put the line state
+ *
+ * Returns 0 if the line was set to the new state successfully, non-zero
+ * on error.  curr_state is not set on error.
+ */
+static int i2c_gpio_get(struct ipath_devdata *dd,
+			enum i2c_type line,
+			enum i2c_state *curr_statep)
+{
+	u64 read_val, mask;
+	int ret;
+	unsigned long flags = 0;
+
+	/* check args */
+	if (curr_statep == NULL) {
+		ret = 1;
+		goto bail;
+	}
+
+	/* config line to be an input */
+	if (line == i2c_line_scl)
+		mask = dd->ipath_gpio_scl;
+	else
+		mask = dd->ipath_gpio_sda;
+
+	spin_lock_irqsave(&dd->ipath_gpio_lock, flags);
+	dd->ipath_extctrl &= ~mask;
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_extctrl, dd->ipath_extctrl);
+	/*
+	 * Below is very unlikely to reflect true input state if Output
+	 * Enable actually changed.
+	 */
+	read_val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_extstatus);
+	spin_unlock_irqrestore(&dd->ipath_gpio_lock, flags);
+
+	if (read_val & mask)
+		*curr_statep = i2c_line_high;
+	else
+		*curr_statep = i2c_line_low;
+
+	ret = 0;
+
+bail:
+	return ret;
+}
+
+/**
+ * i2c_wait_for_writes - wait for a write
+ * @dd: the infinipath device
+ *
+ * We use this instead of udelay directly, so we can make sure
+ * that previous register writes have been flushed all the way
+ * to the chip.  Since we are delaying anyway, the cost doesn't
+ * hurt, and makes the bit twiddling more regular
+ */
+static void i2c_wait_for_writes(struct ipath_devdata *dd)
+{
+	(void)ipath_read_kreg32(dd, dd->ipath_kregs->kr_scratch);
+	rmb();
+}
+
+static void scl_out(struct ipath_devdata *dd, u8 bit)
+{
+	udelay(1);
+	i2c_gpio_set(dd, i2c_line_scl, bit ? i2c_line_high : i2c_line_low);
+
+	i2c_wait_for_writes(dd);
+}
+
+static void sda_out(struct ipath_devdata *dd, u8 bit)
+{
+	i2c_gpio_set(dd, i2c_line_sda, bit ? i2c_line_high : i2c_line_low);
+
+	i2c_wait_for_writes(dd);
+}
+
+static u8 sda_in(struct ipath_devdata *dd, int wait)
+{
+	enum i2c_state bit;
+
+	if (i2c_gpio_get(dd, i2c_line_sda, &bit))
+		ipath_dbg("get bit failed!\n");
+
+	if (wait)
+		i2c_wait_for_writes(dd);
+
+	return bit == i2c_line_high ? 1U : 0;
+}
+
+/**
+ * i2c_ackrcv - see if ack following write is true
+ * @dd: the infinipath device
+ */
+static int i2c_ackrcv(struct ipath_devdata *dd)
+{
+	u8 ack_received;
+
+	/* AT ENTRY SCL = LOW */
+	/* change direction, ignore data */
+	ack_received = sda_in(dd, 1);
+	scl_out(dd, i2c_line_high);
+	ack_received = sda_in(dd, 1) == 0;
+	scl_out(dd, i2c_line_low);
+	return ack_received;
+}
+
+/**
+ * rd_byte - read a byte, leaving ACK, STOP, etc up to caller
+ * @dd: the infinipath device
+ *
+ * Returns byte shifted out of device
+ */
+static int rd_byte(struct ipath_devdata *dd)
+{
+	int bit_cntr, data;
+
+	data = 0;
+
+	for (bit_cntr = 7; bit_cntr >= 0; --bit_cntr) {
+		data <<= 1;
+		scl_out(dd, i2c_line_high);
+		data |= sda_in(dd, 0);
+		scl_out(dd, i2c_line_low);
+	}
+	return data;
+}
+
+/**
+ * wr_byte - write a byte, one bit at a time
+ * @dd: the infinipath device
+ * @data: the byte to write
+ *
+ * Returns 0 if we got the following ack, otherwise 1
+ */
+static int wr_byte(struct ipath_devdata *dd, u8 data)
+{
+	int bit_cntr;
+	u8 bit;
+
+	for (bit_cntr = 7; bit_cntr >= 0; bit_cntr--) {
+		bit = (data >> bit_cntr) & 1;
+		sda_out(dd, bit);
+		scl_out(dd, i2c_line_high);
+		scl_out(dd, i2c_line_low);
+	}
+	return (!i2c_ackrcv(dd)) ? 1 : 0;
+}
+
+static void send_ack(struct ipath_devdata *dd)
+{
+	sda_out(dd, i2c_line_low);
+	scl_out(dd, i2c_line_high);
+	scl_out(dd, i2c_line_low);
+	sda_out(dd, i2c_line_high);
+}
+
+/**
+ * i2c_startcmd - transmit the start condition, followed by address/cmd
+ * @dd: the infinipath device
+ * @offset_dir: direction byte
+ *
+ *      (both clock/data high, clock high, data low while clock is high)
+ */
+static int i2c_startcmd(struct ipath_devdata *dd, u8 offset_dir)
+{
+	int res;
+
+	/* issue start sequence */
+	sda_out(dd, i2c_line_high);
+	scl_out(dd, i2c_line_high);
+	sda_out(dd, i2c_line_low);
+	scl_out(dd, i2c_line_low);
+
+	/* issue length and direction byte */
+	res = wr_byte(dd, offset_dir);
+
+	if (res)
+		ipath_cdbg(VERBOSE, "No ack to complete start\n");
+
+	return res;
+}
+
+/**
+ * stop_cmd - transmit the stop condition
+ * @dd: the infinipath device
+ *
+ * (both clock/data low, clock high, data high while clock is high)
+ */
+static void stop_cmd(struct ipath_devdata *dd)
+{
+	scl_out(dd, i2c_line_low);
+	sda_out(dd, i2c_line_low);
+	scl_out(dd, i2c_line_high);
+	sda_out(dd, i2c_line_high);
+	udelay(2);
+}
+
+/**
+ * eeprom_reset - reset I2C communication
+ * @dd: the infinipath device
+ */
+
+static int eeprom_reset(struct ipath_devdata *dd)
+{
+	int clock_cycles_left = 9;
+	u64 *gpioval = &dd->ipath_gpio_out;
+	int ret;
+	unsigned long flags;
+
+	spin_lock_irqsave(&dd->ipath_gpio_lock, flags);
+	/* Make sure shadows are consistent */
+	dd->ipath_extctrl = ipath_read_kreg64(dd, dd->ipath_kregs->kr_extctrl);
+	*gpioval = ipath_read_kreg64(dd, dd->ipath_kregs->kr_gpio_out);
+	spin_unlock_irqrestore(&dd->ipath_gpio_lock, flags);
+
+	ipath_cdbg(VERBOSE, "Resetting i2c eeprom; initial gpioout reg "
+		   "is %llx\n", (unsigned long long) *gpioval);
+
+	/*
+	 * This is to get the i2c into a known state, by first going low,
+	 * then tristate sda (and then tristate scl as first thing
+	 * in loop)
+	 */
+	scl_out(dd, i2c_line_low);
+	sda_out(dd, i2c_line_high);
+
+	/* Clock up to 9 cycles looking for SDA hi, then issue START and STOP */
+	while (clock_cycles_left--) {
+		scl_out(dd, i2c_line_high);
+
+		/* SDA seen high, issue START by dropping it while SCL high */
+		if (sda_in(dd, 0)) {
+			sda_out(dd, i2c_line_low);
+			scl_out(dd, i2c_line_low);
+			/* ATMEL spec says must be followed by STOP. */
+			scl_out(dd, i2c_line_high);
+			sda_out(dd, i2c_line_high);
+			ret = 0;
+			goto bail;
+		}
+
+		scl_out(dd, i2c_line_low);
+	}
+
+	ret = 1;
+
+bail:
+	return ret;
+}
+
+/*
+ * Probe for I2C device at specified address. Returns 0 for "success"
+ * to match rest of this file.
+ * Leave bus in "reasonable" state for further commands.
+ */
+static int i2c_probe(struct ipath_devdata *dd, int devaddr)
+{
+	int ret = 0;
+
+	ret = eeprom_reset(dd);
+	if (ret) {
+		ipath_dev_err(dd, "Failed reset probing device 0x%02X\n",
+			      devaddr);
+		return ret;
+	}
+	/*
+	 * Reset no longer leaves bus in start condition, so normal
+	 * i2c_startcmd() will do.
+	 */
+	ret = i2c_startcmd(dd, devaddr | READ_CMD);
+	if (ret)
+		ipath_cdbg(VERBOSE, "Failed startcmd for device 0x%02X\n",
+			   devaddr);
+	else {
+		/*
+		 * Device did respond. Complete a single-byte read, because some
+		 * devices apparently cannot handle STOP immediately after they
+		 * ACK the start-cmd.
+		 */
+		int data;
+		data = rd_byte(dd);
+		stop_cmd(dd);
+		ipath_cdbg(VERBOSE, "Response from device 0x%02X\n", devaddr);
+	}
+	return ret;
+}
+
+/*
+ * Returns the "i2c type". This is a pointer to a struct that describes
+ * the I2C chain on this board. To minimize impact on struct ipath_devdata,
+ * the (small integer) index into the table is actually memoized, rather
+ * then the pointer.
+ * Memoization is because the type is determined on the first call per chip.
+ * An alternative would be to move type determination to early
+ * init code.
+ */
+static struct i2c_chain_desc *ipath_i2c_type(struct ipath_devdata *dd)
+{
+	int idx;
+
+	/* Get memoized index, from previous successful probes */
+	idx = dd->ipath_i2c_chain_type - 1;
+	if (idx >= 0 && idx < (ARRAY_SIZE(i2c_chains) - 1))
+		goto done;
+
+	idx = 0;
+	while (i2c_chains[idx].probe_dev != IPATH_NO_DEV) {
+		/* if probe succeeds, this is type */
+		if (!i2c_probe(dd, i2c_chains[idx].probe_dev))
+			break;
+		++idx;
+	}
+
+	/*
+	 * Old EEPROM (first entry) may require a reset after probe,
+	 * rather than being able to "start" after "stop"
+	 */
+	if (idx == 0)
+		eeprom_reset(dd);
+
+	if (i2c_chains[idx].probe_dev == IPATH_NO_DEV)
+		idx = -1;
+	else
+		dd->ipath_i2c_chain_type = idx + 1;
+done:
+	return (idx >= 0) ? i2c_chains + idx : NULL;
+}
+
+static int ipath_eeprom_internal_read(struct ipath_devdata *dd,
+					u8 eeprom_offset, void *buffer, int len)
+{
+	int ret;
+	struct i2c_chain_desc *icd;
+	u8 *bp = buffer;
+
+	ret = 1;
+	icd = ipath_i2c_type(dd);
+	if (!icd)
+		goto bail;
+
+	if (icd->eeprom_dev == IPATH_NO_DEV) {
+		/* legacy not-really-I2C */
+		ipath_cdbg(VERBOSE, "Start command only address\n");
+		eeprom_offset = (eeprom_offset << 1) | READ_CMD;
+		ret = i2c_startcmd(dd, eeprom_offset);
+	} else {
+		/* Actual I2C */
+		ipath_cdbg(VERBOSE, "Start command uses devaddr\n");
+		if (i2c_startcmd(dd, icd->eeprom_dev | WRITE_CMD)) {
+			ipath_dbg("Failed EEPROM startcmd\n");
+			stop_cmd(dd);
+			ret = 1;
+			goto bail;
+		}
+		ret = wr_byte(dd, eeprom_offset);
+		stop_cmd(dd);
+		if (ret) {
+			ipath_dev_err(dd, "Failed to write EEPROM address\n");
+			ret = 1;
+			goto bail;
+		}
+		ret = i2c_startcmd(dd, icd->eeprom_dev | READ_CMD);
+	}
+	if (ret) {
+		ipath_dbg("Failed startcmd for dev %02X\n", icd->eeprom_dev);
+		stop_cmd(dd);
+		ret = 1;
+		goto bail;
+	}
+
+	/*
+	 * eeprom keeps clocking data out as long as we ack, automatically
+	 * incrementing the address.
+	 */
+	while (len-- > 0) {
+		/* get and store data */
+		*bp++ = rd_byte(dd);
+		/* send ack if not the last byte */
+		if (len)
+			send_ack(dd);
+	}
+
+	stop_cmd(dd);
+
+	ret = 0;
+
+bail:
+	return ret;
+}
+
+static int ipath_eeprom_internal_write(struct ipath_devdata *dd, u8 eeprom_offset,
+				       const void *buffer, int len)
+{
+	int sub_len;
+	const u8 *bp = buffer;
+	int max_wait_time, i;
+	int ret;
+	struct i2c_chain_desc *icd;
+
+	ret = 1;
+	icd = ipath_i2c_type(dd);
+	if (!icd)
+		goto bail;
+
+	while (len > 0) {
+		if (icd->eeprom_dev == IPATH_NO_DEV) {
+			if (i2c_startcmd(dd,
+					 (eeprom_offset << 1) | WRITE_CMD)) {
+				ipath_dbg("Failed to start cmd offset %u\n",
+					eeprom_offset);
+				goto failed_write;
+			}
+		} else {
+			/* Real I2C */
+			if (i2c_startcmd(dd, icd->eeprom_dev | WRITE_CMD)) {
+				ipath_dbg("Failed EEPROM startcmd\n");
+				goto failed_write;
+			}
+			ret = wr_byte(dd, eeprom_offset);
+			if (ret) {
+				ipath_dev_err(dd, "Failed to write EEPROM "
+					      "address\n");
+				goto failed_write;
+			}
+		}
+
+		sub_len = min(len, 4);
+		eeprom_offset += sub_len;
+		len -= sub_len;
+
+		for (i = 0; i < sub_len; i++) {
+			if (wr_byte(dd, *bp++)) {
+				ipath_dbg("no ack after byte %u/%u (%u "
+					  "total remain)\n", i, sub_len,
+					  len + sub_len - i);
+				goto failed_write;
+			}
+		}
+
+		stop_cmd(dd);
+
+		/*
+		 * wait for write complete by waiting for a successful
+		 * read (the chip replies with a zero after the write
+		 * cmd completes, and before it writes to the eeprom.
+		 * The startcmd for the read will fail the ack until
+		 * the writes have completed.   We do this inline to avoid
+		 * the debug prints that are in the real read routine
+		 * if the startcmd fails.
+		 * We also use the proper device address, so it doesn't matter
+		 * whether we have real eeprom_dev. legacy likes any address.
+		 */
+		max_wait_time = 100;
+		while (i2c_startcmd(dd, icd->eeprom_dev | READ_CMD)) {
+			stop_cmd(dd);
+			if (!--max_wait_time) {
+				ipath_dbg("Did not get successful read to "
+					  "complete write\n");
+				goto failed_write;
+			}
+		}
+		/* now read (and ignore) the resulting byte */
+		rd_byte(dd);
+		stop_cmd(dd);
+	}
+
+	ret = 0;
+	goto bail;
+
+failed_write:
+	stop_cmd(dd);
+	ret = 1;
+
+bail:
+	return ret;
+}
+
+/**
+ * ipath_eeprom_read - receives bytes from the eeprom via I2C
+ * @dd: the infinipath device
+ * @eeprom_offset: address to read from
+ * @buffer: where to store result
+ * @len: number of bytes to receive
+ */
+int ipath_eeprom_read(struct ipath_devdata *dd, u8 eeprom_offset,
+			void *buff, int len)
+{
+	int ret;
+
+	ret = mutex_lock_interruptible(&dd->ipath_eep_lock);
+	if (!ret) {
+		ret = ipath_eeprom_internal_read(dd, eeprom_offset, buff, len);
+		mutex_unlock(&dd->ipath_eep_lock);
+	}
+
+	return ret;
+}
+
+/**
+ * ipath_eeprom_write - writes data to the eeprom via I2C
+ * @dd: the infinipath device
+ * @eeprom_offset: where to place data
+ * @buffer: data to write
+ * @len: number of bytes to write
+ */
+int ipath_eeprom_write(struct ipath_devdata *dd, u8 eeprom_offset,
+			const void *buff, int len)
+{
+	int ret;
+
+	ret = mutex_lock_interruptible(&dd->ipath_eep_lock);
+	if (!ret) {
+		ret = ipath_eeprom_internal_write(dd, eeprom_offset, buff, len);
+		mutex_unlock(&dd->ipath_eep_lock);
+	}
+
+	return ret;
+}
+
+static u8 flash_csum(struct ipath_flash *ifp, int adjust)
+{
+	u8 *ip = (u8 *) ifp;
+	u8 csum = 0, len;
+
+	/*
+	 * Limit length checksummed to max length of actual data.
+	 * Checksum of erased eeprom will still be bad, but we avoid
+	 * reading past the end of the buffer we were passed.
+	 */
+	len = ifp->if_length;
+	if (len > sizeof(struct ipath_flash))
+		len = sizeof(struct ipath_flash);
+	while (len--)
+		csum += *ip++;
+	csum -= ifp->if_csum;
+	csum = ~csum;
+	if (adjust)
+		ifp->if_csum = csum;
+
+	return csum;
+}
+
+/**
+ * ipath_get_guid - get the GUID from the i2c device
+ * @dd: the infinipath device
+ *
+ * We have the capability to use the ipath_nguid field, and get
+ * the guid from the first chip's flash, to use for all of them.
+ */
+void ipath_get_eeprom_info(struct ipath_devdata *dd)
+{
+	void *buf;
+	struct ipath_flash *ifp;
+	__be64 guid;
+	int len, eep_stat;
+	u8 csum, *bguid;
+	int t = dd->ipath_unit;
+	struct ipath_devdata *dd0 = ipath_lookup(0);
+
+	if (t && dd0->ipath_nguid > 1 && t <= dd0->ipath_nguid) {
+		u8 oguid;
+		dd->ipath_guid = dd0->ipath_guid;
+		bguid = (u8 *) & dd->ipath_guid;
+
+		oguid = bguid[7];
+		bguid[7] += t;
+		if (oguid > bguid[7]) {
+			if (bguid[6] == 0xff) {
+				if (bguid[5] == 0xff) {
+					ipath_dev_err(
+						dd,
+						"Can't set %s GUID from "
+						"base, wraps to OUI!\n",
+						ipath_get_unit_name(t));
+					dd->ipath_guid = 0;
+					goto bail;
+				}
+				bguid[5]++;
+			}
+			bguid[6]++;
+		}
+		dd->ipath_nguid = 1;
+
+		ipath_dbg("nguid %u, so adding %u to device 0 guid, "
+			  "for %llx\n",
+			  dd0->ipath_nguid, t,
+			  (unsigned long long) be64_to_cpu(dd->ipath_guid));
+		goto bail;
+	}
+
+	/*
+	 * read full flash, not just currently used part, since it may have
+	 * been written with a newer definition
+	 * */
+	len = sizeof(struct ipath_flash);
+	buf = vmalloc(len);
+	if (!buf) {
+		ipath_dev_err(dd, "Couldn't allocate memory to read %u "
+			      "bytes from eeprom for GUID\n", len);
+		goto bail;
+	}
+
+	mutex_lock(&dd->ipath_eep_lock);
+	eep_stat = ipath_eeprom_internal_read(dd, 0, buf, len);
+	mutex_unlock(&dd->ipath_eep_lock);
+
+	if (eep_stat) {
+		ipath_dev_err(dd, "Failed reading GUID from eeprom\n");
+		goto done;
+	}
+	ifp = (struct ipath_flash *)buf;
+
+	csum = flash_csum(ifp, 0);
+	if (csum != ifp->if_csum) {
+		dev_info(&dd->pcidev->dev, "Bad I2C flash checksum: "
+			 "0x%x, not 0x%x\n", csum, ifp->if_csum);
+		goto done;
+	}
+	if (*(__be64 *) ifp->if_guid == cpu_to_be64(0) ||
+	    *(__be64 *) ifp->if_guid == ~cpu_to_be64(0)) {
+		ipath_dev_err(dd, "Invalid GUID %llx from flash; "
+			      "ignoring\n",
+			      *(unsigned long long *) ifp->if_guid);
+		/* don't allow GUID if all 0 or all 1's */
+		goto done;
+	}
+
+	/* complain, but allow it */
+	if (*(u64 *) ifp->if_guid == 0x100007511000000ULL)
+		dev_info(&dd->pcidev->dev, "Warning, GUID %llx is "
+			 "default, probably not correct!\n",
+			 *(unsigned long long *) ifp->if_guid);
+
+	bguid = ifp->if_guid;
+	if (!bguid[0] && !bguid[1] && !bguid[2]) {
+		/* original incorrect GUID format in flash; fix in
+		 * core copy, by shifting up 2 octets; don't need to
+		 * change top octet, since both it and shifted are
+		 * 0.. */
+		bguid[1] = bguid[3];
+		bguid[2] = bguid[4];
+		bguid[3] = bguid[4] = 0;
+		guid = *(__be64 *) ifp->if_guid;
+		ipath_cdbg(VERBOSE, "Old GUID format in flash, top 3 zero, "
+			   "shifting 2 octets\n");
+	} else
+		guid = *(__be64 *) ifp->if_guid;
+	dd->ipath_guid = guid;
+	dd->ipath_nguid = ifp->if_numguid;
+	/*
+	 * Things are slightly complicated by the desire to transparently
+	 * support both the Pathscale 10-digit serial number and the QLogic
+	 * 13-character version.
+	 */
+	if ((ifp->if_fversion > 1) && ifp->if_sprefix[0]
+		&& ((u8 *)ifp->if_sprefix)[0] != 0xFF) {
+		/* This board has a Serial-prefix, which is stored
+		 * elsewhere for backward-compatibility.
+		 */
+		char *snp = dd->ipath_serial;
+		memcpy(snp, ifp->if_sprefix, sizeof ifp->if_sprefix);
+		snp[sizeof ifp->if_sprefix] = '\0';
+		len = strlen(snp);
+		snp += len;
+		len = (sizeof dd->ipath_serial) - len;
+		if (len > sizeof ifp->if_serial) {
+			len = sizeof ifp->if_serial;
+		}
+		memcpy(snp, ifp->if_serial, len);
+	} else
+		memcpy(dd->ipath_serial, ifp->if_serial,
+		       sizeof ifp->if_serial);
+	if (!strstr(ifp->if_comment, "Tested successfully"))
+		ipath_dev_err(dd, "Board SN %s did not pass functional "
+			"test: %s\n", dd->ipath_serial,
+			ifp->if_comment);
+
+	ipath_cdbg(VERBOSE, "Initted GUID to %llx from eeprom\n",
+		   (unsigned long long) be64_to_cpu(dd->ipath_guid));
+
+	memcpy(&dd->ipath_eep_st_errs, &ifp->if_errcntp, IPATH_EEP_LOG_CNT);
+	/*
+	 * Power-on (actually "active") hours are kept as little-endian value
+	 * in EEPROM, but as seconds in a (possibly as small as 24-bit)
+	 * atomic_t while running.
+	 */
+	atomic_set(&dd->ipath_active_time, 0);
+	dd->ipath_eep_hrs = ifp->if_powerhour[0] | (ifp->if_powerhour[1] << 8);
+
+done:
+	vfree(buf);
+
+bail:;
+}
+
+/**
+ * ipath_update_eeprom_log - copy active-time and error counters to eeprom
+ * @dd: the infinipath device
+ *
+ * Although the time is kept as seconds in the ipath_devdata struct, it is
+ * rounded to hours for re-write, as we have only 16 bits in EEPROM.
+ * First-cut code reads whole (expected) struct ipath_flash, modifies,
+ * re-writes. Future direction: read/write only what we need, assuming
+ * that the EEPROM had to have been "good enough" for driver init, and
+ * if not, we aren't making it worse.
+ *
+ */
+
+int ipath_update_eeprom_log(struct ipath_devdata *dd)
+{
+	void *buf;
+	struct ipath_flash *ifp;
+	int len, hi_water;
+	uint32_t new_time, new_hrs;
+	u8 csum;
+	int ret, idx;
+	unsigned long flags;
+
+	/* first, check if we actually need to do anything. */
+	ret = 0;
+	for (idx = 0; idx < IPATH_EEP_LOG_CNT; ++idx) {
+		if (dd->ipath_eep_st_new_errs[idx]) {
+			ret = 1;
+			break;
+		}
+	}
+	new_time = atomic_read(&dd->ipath_active_time);
+
+	if (ret == 0 && new_time < 3600)
+		return 0;
+
+	/*
+	 * The quick-check above determined that there is something worthy
+	 * of logging, so get current contents and do a more detailed idea.
+	 * read full flash, not just currently used part, since it may have
+	 * been written with a newer definition
+	 */
+	len = sizeof(struct ipath_flash);
+	buf = vmalloc(len);
+	ret = 1;
+	if (!buf) {
+		ipath_dev_err(dd, "Couldn't allocate memory to read %u "
+				"bytes from eeprom for logging\n", len);
+		goto bail;
+	}
+
+	/* Grab semaphore and read current EEPROM. If we get an
+	 * error, let go, but if not, keep it until we finish write.
+	 */
+	ret = mutex_lock_interruptible(&dd->ipath_eep_lock);
+	if (ret) {
+		ipath_dev_err(dd, "Unable to acquire EEPROM for logging\n");
+		goto free_bail;
+	}
+	ret = ipath_eeprom_internal_read(dd, 0, buf, len);
+	if (ret) {
+		mutex_unlock(&dd->ipath_eep_lock);
+		ipath_dev_err(dd, "Unable read EEPROM for logging\n");
+		goto free_bail;
+	}
+	ifp = (struct ipath_flash *)buf;
+
+	csum = flash_csum(ifp, 0);
+	if (csum != ifp->if_csum) {
+		mutex_unlock(&dd->ipath_eep_lock);
+		ipath_dev_err(dd, "EEPROM cks err (0x%02X, S/B 0x%02X)\n",
+				csum, ifp->if_csum);
+		ret = 1;
+		goto free_bail;
+	}
+	hi_water = 0;
+	spin_lock_irqsave(&dd->ipath_eep_st_lock, flags);
+	for (idx = 0; idx < IPATH_EEP_LOG_CNT; ++idx) {
+		int new_val = dd->ipath_eep_st_new_errs[idx];
+		if (new_val) {
+			/*
+			 * If we have seen any errors, add to EEPROM values
+			 * We need to saturate at 0xFF (255) and we also
+			 * would need to adjust the checksum if we were
+			 * trying to minimize EEPROM traffic
+			 * Note that we add to actual current count in EEPROM,
+			 * in case it was altered while we were running.
+			 */
+			new_val += ifp->if_errcntp[idx];
+			if (new_val > 0xFF)
+				new_val = 0xFF;
+			if (ifp->if_errcntp[idx] != new_val) {
+				ifp->if_errcntp[idx] = new_val;
+				hi_water = offsetof(struct ipath_flash,
+						if_errcntp) + idx;
+			}
+			/*
+			 * update our shadow (used to minimize EEPROM
+			 * traffic), to match what we are about to write.
+			 */
+			dd->ipath_eep_st_errs[idx] = new_val;
+			dd->ipath_eep_st_new_errs[idx] = 0;
+		}
+	}
+	/*
+	 * now update active-time. We would like to round to the nearest hour
+	 * but unless atomic_t are sure to be proper signed ints we cannot,
+	 * because we need to account for what we "transfer" to EEPROM and
+	 * if we log an hour at 31 minutes, then we would need to set
+	 * active_time to -29 to accurately count the _next_ hour.
+	 */
+	if (new_time >= 3600) {
+		new_hrs = new_time / 3600;
+		atomic_sub((new_hrs * 3600), &dd->ipath_active_time);
+		new_hrs += dd->ipath_eep_hrs;
+		if (new_hrs > 0xFFFF)
+			new_hrs = 0xFFFF;
+		dd->ipath_eep_hrs = new_hrs;
+		if ((new_hrs & 0xFF) != ifp->if_powerhour[0]) {
+			ifp->if_powerhour[0] = new_hrs & 0xFF;
+			hi_water = offsetof(struct ipath_flash, if_powerhour);
+		}
+		if ((new_hrs >> 8) != ifp->if_powerhour[1]) {
+			ifp->if_powerhour[1] = new_hrs >> 8;
+			hi_water = offsetof(struct ipath_flash, if_powerhour)
+					+ 1;
+		}
+	}
+	/*
+	 * There is a tiny possibility that we could somehow fail to write
+	 * the EEPROM after updating our shadows, but problems from holding
+	 * the spinlock too long are a much bigger issue.
+	 */
+	spin_unlock_irqrestore(&dd->ipath_eep_st_lock, flags);
+	if (hi_water) {
+		/* we made some change to the data, uopdate cksum and write */
+		csum = flash_csum(ifp, 1);
+		ret = ipath_eeprom_internal_write(dd, 0, buf, hi_water + 1);
+	}
+	mutex_unlock(&dd->ipath_eep_lock);
+	if (ret)
+		ipath_dev_err(dd, "Failed updating EEPROM\n");
+
+free_bail:
+	vfree(buf);
+bail:
+	return ret;
+
+}
+
+/**
+ * ipath_inc_eeprom_err - increment one of the four error counters
+ * that are logged to EEPROM.
+ * @dd: the infinipath device
+ * @eidx: 0..3, the counter to increment
+ * @incr: how much to add
+ *
+ * Each counter is 8-bits, and saturates at 255 (0xFF). They
+ * are copied to the EEPROM (aka flash) whenever ipath_update_eeprom_log()
+ * is called, but it can only be called in a context that allows sleep.
+ * This function can be called even at interrupt level.
+ */
+
+void ipath_inc_eeprom_err(struct ipath_devdata *dd, u32 eidx, u32 incr)
+{
+	uint new_val;
+	unsigned long flags;
+
+	spin_lock_irqsave(&dd->ipath_eep_st_lock, flags);
+	new_val = dd->ipath_eep_st_new_errs[eidx] + incr;
+	if (new_val > 255)
+		new_val = 255;
+	dd->ipath_eep_st_new_errs[eidx] = new_val;
+	spin_unlock_irqrestore(&dd->ipath_eep_st_lock, flags);
+	return;
+}
+
+static int ipath_tempsense_internal_read(struct ipath_devdata *dd, u8 regnum)
+{
+	int ret;
+	struct i2c_chain_desc *icd;
+
+	ret = -ENOENT;
+
+	icd = ipath_i2c_type(dd);
+	if (!icd)
+		goto bail;
+
+	if (icd->temp_dev == IPATH_NO_DEV) {
+		/* tempsense only exists on new, real-I2C boards */
+		ret = -ENXIO;
+		goto bail;
+	}
+
+	if (i2c_startcmd(dd, icd->temp_dev | WRITE_CMD)) {
+		ipath_dbg("Failed tempsense startcmd\n");
+		stop_cmd(dd);
+		ret = -ENXIO;
+		goto bail;
+	}
+	ret = wr_byte(dd, regnum);
+	stop_cmd(dd);
+	if (ret) {
+		ipath_dev_err(dd, "Failed tempsense WR command %02X\n",
+			      regnum);
+		ret = -ENXIO;
+		goto bail;
+	}
+	if (i2c_startcmd(dd, icd->temp_dev | READ_CMD)) {
+		ipath_dbg("Failed tempsense RD startcmd\n");
+		stop_cmd(dd);
+		ret = -ENXIO;
+		goto bail;
+	}
+	/*
+	 * We can only clock out one byte per command, sensibly
+	 */
+	ret = rd_byte(dd);
+	stop_cmd(dd);
+
+bail:
+	return ret;
+}
+
+#define VALID_TS_RD_REG_MASK 0xBF
+
+/**
+ * ipath_tempsense_read - read register of temp sensor via I2C
+ * @dd: the infinipath device
+ * @regnum: register to read from
+ *
+ * returns reg contents (0..255) or < 0 for error
+ */
+int ipath_tempsense_read(struct ipath_devdata *dd, u8 regnum)
+{
+	int ret;
+
+	if (regnum > 7)
+		return -EINVAL;
+
+	/* return a bogus value for (the one) register we do not have */
+	if (!((1 << regnum) & VALID_TS_RD_REG_MASK))
+		return 0;
+
+	ret = mutex_lock_interruptible(&dd->ipath_eep_lock);
+	if (!ret) {
+		ret = ipath_tempsense_internal_read(dd, regnum);
+		mutex_unlock(&dd->ipath_eep_lock);
+	}
+
+	/*
+	 * There are three possibilities here:
+	 * ret is actual value (0..255)
+	 * ret is -ENXIO or -EINVAL from code in this file
+	 * ret is -EINTR from mutex_lock_interruptible.
+	 */
+	return ret;
+}
+
+static int ipath_tempsense_internal_write(struct ipath_devdata *dd,
+					  u8 regnum, u8 data)
+{
+	int ret = -ENOENT;
+	struct i2c_chain_desc *icd;
+
+	icd = ipath_i2c_type(dd);
+	if (!icd)
+		goto bail;
+
+	if (icd->temp_dev == IPATH_NO_DEV) {
+		/* tempsense only exists on new, real-I2C boards */
+		ret = -ENXIO;
+		goto bail;
+	}
+	if (i2c_startcmd(dd, icd->temp_dev | WRITE_CMD)) {
+		ipath_dbg("Failed tempsense startcmd\n");
+		stop_cmd(dd);
+		ret = -ENXIO;
+		goto bail;
+	}
+	ret = wr_byte(dd, regnum);
+	if (ret) {
+		stop_cmd(dd);
+		ipath_dev_err(dd, "Failed to write tempsense command %02X\n",
+			      regnum);
+		ret = -ENXIO;
+		goto bail;
+	}
+	ret = wr_byte(dd, data);
+	stop_cmd(dd);
+	ret = i2c_startcmd(dd, icd->temp_dev | READ_CMD);
+	if (ret) {
+		ipath_dev_err(dd, "Failed tempsense data wrt to %02X\n",
+			      regnum);
+		ret = -ENXIO;
+	}
+
+bail:
+	return ret;
+}
+
+#define VALID_TS_WR_REG_MASK ((1 << 9) | (1 << 0xB) | (1 << 0xD))
+
+/**
+ * ipath_tempsense_write - write register of temp sensor via I2C
+ * @dd: the infinipath device
+ * @regnum: register to write
+ * @data: data to write
+ *
+ * returns 0 for success or < 0 for error
+ */
+int ipath_tempsense_write(struct ipath_devdata *dd, u8 regnum, u8 data)
+{
+	int ret;
+
+	if (regnum > 15 || !((1 << regnum) & VALID_TS_WR_REG_MASK))
+		return -EINVAL;
+
+	ret = mutex_lock_interruptible(&dd->ipath_eep_lock);
+	if (!ret) {
+		ret = ipath_tempsense_internal_write(dd, regnum, data);
+		mutex_unlock(&dd->ipath_eep_lock);
+	}
+
+	/*
+	 * There are three possibilities here:
+	 * ret is 0 for success
+	 * ret is -ENXIO or -EINVAL from code in this file
+	 * ret is -EINTR from mutex_lock_interruptible.
+	 */
+	return ret;
+}
diff --git a/drivers/infiniband/hw/ipath/ipath_file_ops.c b/drivers/infiniband/hw/ipath/ipath_file_ops.c
new file mode 100644
index 0000000..6d7f453
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_file_ops.c
@@ -0,0 +1,2617 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/pci.h>
+#include <linux/poll.h>
+#include <linux/cdev.h>
+#include <linux/swap.h>
+#include <linux/export.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/io.h>
+#include <linux/aio.h>
+#include <linux/jiffies.h>
+#include <linux/cpu.h>
+#include <asm/pgtable.h>
+
+#include "ipath_kernel.h"
+#include "ipath_common.h"
+#include "ipath_user_sdma.h"
+
+static int ipath_open(struct inode *, struct file *);
+static int ipath_close(struct inode *, struct file *);
+static ssize_t ipath_write(struct file *, const char __user *, size_t,
+			   loff_t *);
+static ssize_t ipath_writev(struct kiocb *, const struct iovec *,
+			    unsigned long , loff_t);
+static unsigned int ipath_poll(struct file *, struct poll_table_struct *);
+static int ipath_mmap(struct file *, struct vm_area_struct *);
+
+static const struct file_operations ipath_file_ops = {
+	.owner = THIS_MODULE,
+	.write = ipath_write,
+	.aio_write = ipath_writev,
+	.open = ipath_open,
+	.release = ipath_close,
+	.poll = ipath_poll,
+	.mmap = ipath_mmap,
+	.llseek = noop_llseek,
+};
+
+/*
+ * Convert kernel virtual addresses to physical addresses so they don't
+ * potentially conflict with the chip addresses used as mmap offsets.
+ * It doesn't really matter what mmap offset we use as long as we can
+ * interpret it correctly.
+ */
+static u64 cvt_kvaddr(void *p)
+{
+	struct page *page;
+	u64 paddr = 0;
+
+	page = vmalloc_to_page(p);
+	if (page)
+		paddr = page_to_pfn(page) << PAGE_SHIFT;
+
+	return paddr;
+}
+
+static int ipath_get_base_info(struct file *fp,
+			       void __user *ubase, size_t ubase_size)
+{
+	struct ipath_portdata *pd = port_fp(fp);
+	int ret = 0;
+	struct ipath_base_info *kinfo = NULL;
+	struct ipath_devdata *dd = pd->port_dd;
+	unsigned subport_cnt;
+	int shared, master;
+	size_t sz;
+
+	subport_cnt = pd->port_subport_cnt;
+	if (!subport_cnt) {
+		shared = 0;
+		master = 0;
+		subport_cnt = 1;
+	} else {
+		shared = 1;
+		master = !subport_fp(fp);
+	}
+
+	sz = sizeof(*kinfo);
+	/* If port sharing is not requested, allow the old size structure */
+	if (!shared)
+		sz -= 7 * sizeof(u64);
+	if (ubase_size < sz) {
+		ipath_cdbg(PROC,
+			   "Base size %zu, need %zu (version mismatch?)\n",
+			   ubase_size, sz);
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	kinfo = kzalloc(sizeof(*kinfo), GFP_KERNEL);
+	if (kinfo == NULL) {
+		ret = -ENOMEM;
+		goto bail;
+	}
+
+	ret = dd->ipath_f_get_base_info(pd, kinfo);
+	if (ret < 0)
+		goto bail;
+
+	kinfo->spi_rcvhdr_cnt = dd->ipath_rcvhdrcnt;
+	kinfo->spi_rcvhdrent_size = dd->ipath_rcvhdrentsize;
+	kinfo->spi_tidegrcnt = dd->ipath_rcvegrcnt;
+	kinfo->spi_rcv_egrbufsize = dd->ipath_rcvegrbufsize;
+	/*
+	 * have to mmap whole thing
+	 */
+	kinfo->spi_rcv_egrbuftotlen =
+		pd->port_rcvegrbuf_chunks * pd->port_rcvegrbuf_size;
+	kinfo->spi_rcv_egrperchunk = pd->port_rcvegrbufs_perchunk;
+	kinfo->spi_rcv_egrchunksize = kinfo->spi_rcv_egrbuftotlen /
+		pd->port_rcvegrbuf_chunks;
+	kinfo->spi_tidcnt = dd->ipath_rcvtidcnt / subport_cnt;
+	if (master)
+		kinfo->spi_tidcnt += dd->ipath_rcvtidcnt % subport_cnt;
+	/*
+	 * for this use, may be ipath_cfgports summed over all chips that
+	 * are are configured and present
+	 */
+	kinfo->spi_nports = dd->ipath_cfgports;
+	/* unit (chip/board) our port is on */
+	kinfo->spi_unit = dd->ipath_unit;
+	/* for now, only a single page */
+	kinfo->spi_tid_maxsize = PAGE_SIZE;
+
+	/*
+	 * Doing this per port, and based on the skip value, etc.  This has
+	 * to be the actual buffer size, since the protocol code treats it
+	 * as an array.
+	 *
+	 * These have to be set to user addresses in the user code via mmap.
+	 * These values are used on return to user code for the mmap target
+	 * addresses only.  For 32 bit, same 44 bit address problem, so use
+	 * the physical address, not virtual.  Before 2.6.11, using the
+	 * page_address() macro worked, but in 2.6.11, even that returns the
+	 * full 64 bit address (upper bits all 1's).  So far, using the
+	 * physical addresses (or chip offsets, for chip mapping) works, but
+	 * no doubt some future kernel release will change that, and we'll be
+	 * on to yet another method of dealing with this.
+	 */
+	kinfo->spi_rcvhdr_base = (u64) pd->port_rcvhdrq_phys;
+	kinfo->spi_rcvhdr_tailaddr = (u64) pd->port_rcvhdrqtailaddr_phys;
+	kinfo->spi_rcv_egrbufs = (u64) pd->port_rcvegr_phys;
+	kinfo->spi_pioavailaddr = (u64) dd->ipath_pioavailregs_phys;
+	kinfo->spi_status = (u64) kinfo->spi_pioavailaddr +
+		(void *) dd->ipath_statusp -
+		(void *) dd->ipath_pioavailregs_dma;
+	if (!shared) {
+		kinfo->spi_piocnt = pd->port_piocnt;
+		kinfo->spi_piobufbase = (u64) pd->port_piobufs;
+		kinfo->__spi_uregbase = (u64) dd->ipath_uregbase +
+			dd->ipath_ureg_align * pd->port_port;
+	} else if (master) {
+		kinfo->spi_piocnt = (pd->port_piocnt / subport_cnt) +
+				    (pd->port_piocnt % subport_cnt);
+		/* Master's PIO buffers are after all the slave's */
+		kinfo->spi_piobufbase = (u64) pd->port_piobufs +
+			dd->ipath_palign *
+			(pd->port_piocnt - kinfo->spi_piocnt);
+	} else {
+		unsigned slave = subport_fp(fp) - 1;
+
+		kinfo->spi_piocnt = pd->port_piocnt / subport_cnt;
+		kinfo->spi_piobufbase = (u64) pd->port_piobufs +
+			dd->ipath_palign * kinfo->spi_piocnt * slave;
+	}
+
+	if (shared) {
+		kinfo->spi_port_uregbase = (u64) dd->ipath_uregbase +
+			dd->ipath_ureg_align * pd->port_port;
+		kinfo->spi_port_rcvegrbuf = kinfo->spi_rcv_egrbufs;
+		kinfo->spi_port_rcvhdr_base = kinfo->spi_rcvhdr_base;
+		kinfo->spi_port_rcvhdr_tailaddr = kinfo->spi_rcvhdr_tailaddr;
+
+		kinfo->__spi_uregbase = cvt_kvaddr(pd->subport_uregbase +
+			PAGE_SIZE * subport_fp(fp));
+
+		kinfo->spi_rcvhdr_base = cvt_kvaddr(pd->subport_rcvhdr_base +
+			pd->port_rcvhdrq_size * subport_fp(fp));
+		kinfo->spi_rcvhdr_tailaddr = 0;
+		kinfo->spi_rcv_egrbufs = cvt_kvaddr(pd->subport_rcvegrbuf +
+			pd->port_rcvegrbuf_chunks * pd->port_rcvegrbuf_size *
+			subport_fp(fp));
+
+		kinfo->spi_subport_uregbase =
+			cvt_kvaddr(pd->subport_uregbase);
+		kinfo->spi_subport_rcvegrbuf =
+			cvt_kvaddr(pd->subport_rcvegrbuf);
+		kinfo->spi_subport_rcvhdr_base =
+			cvt_kvaddr(pd->subport_rcvhdr_base);
+		ipath_cdbg(PROC, "port %u flags %x %llx %llx %llx\n",
+			kinfo->spi_port, kinfo->spi_runtime_flags,
+			(unsigned long long) kinfo->spi_subport_uregbase,
+			(unsigned long long) kinfo->spi_subport_rcvegrbuf,
+			(unsigned long long) kinfo->spi_subport_rcvhdr_base);
+	}
+
+	/*
+	 * All user buffers are 2KB buffers.  If we ever support
+	 * giving 4KB buffers to user processes, this will need some
+	 * work.
+	 */
+	kinfo->spi_pioindex = (kinfo->spi_piobufbase -
+		(dd->ipath_piobufbase & 0xffffffff)) / dd->ipath_palign;
+	kinfo->spi_pioalign = dd->ipath_palign;
+
+	kinfo->spi_qpair = IPATH_KD_QP;
+	/*
+	 * user mode PIO buffers are always 2KB, even when 4KB can
+	 * be received, and sent via the kernel; this is ibmaxlen
+	 * for 2K MTU.
+	 */
+	kinfo->spi_piosize = dd->ipath_piosize2k - 2 * sizeof(u32);
+	kinfo->spi_mtu = dd->ipath_ibmaxlen;	/* maxlen, not ibmtu */
+	kinfo->spi_port = pd->port_port;
+	kinfo->spi_subport = subport_fp(fp);
+	kinfo->spi_sw_version = IPATH_KERN_SWVERSION;
+	kinfo->spi_hw_version = dd->ipath_revision;
+
+	if (master) {
+		kinfo->spi_runtime_flags |= IPATH_RUNTIME_MASTER;
+	}
+
+	sz = (ubase_size < sizeof(*kinfo)) ? ubase_size : sizeof(*kinfo);
+	if (copy_to_user(ubase, kinfo, sz))
+		ret = -EFAULT;
+
+bail:
+	kfree(kinfo);
+	return ret;
+}
+
+/**
+ * ipath_tid_update - update a port TID
+ * @pd: the port
+ * @fp: the ipath device file
+ * @ti: the TID information
+ *
+ * The new implementation as of Oct 2004 is that the driver assigns
+ * the tid and returns it to the caller.   To make it easier to
+ * catch bugs, and to reduce search time, we keep a cursor for
+ * each port, walking the shadow tid array to find one that's not
+ * in use.
+ *
+ * For now, if we can't allocate the full list, we fail, although
+ * in the long run, we'll allocate as many as we can, and the
+ * caller will deal with that by trying the remaining pages later.
+ * That means that when we fail, we have to mark the tids as not in
+ * use again, in our shadow copy.
+ *
+ * It's up to the caller to free the tids when they are done.
+ * We'll unlock the pages as they free them.
+ *
+ * Also, right now we are locking one page at a time, but since
+ * the intended use of this routine is for a single group of
+ * virtually contiguous pages, that should change to improve
+ * performance.
+ */
+static int ipath_tid_update(struct ipath_portdata *pd, struct file *fp,
+			    const struct ipath_tid_info *ti)
+{
+	int ret = 0, ntids;
+	u32 tid, porttid, cnt, i, tidcnt, tidoff;
+	u16 *tidlist;
+	struct ipath_devdata *dd = pd->port_dd;
+	u64 physaddr;
+	unsigned long vaddr;
+	u64 __iomem *tidbase;
+	unsigned long tidmap[8];
+	struct page **pagep = NULL;
+	unsigned subport = subport_fp(fp);
+
+	if (!dd->ipath_pageshadow) {
+		ret = -ENOMEM;
+		goto done;
+	}
+
+	cnt = ti->tidcnt;
+	if (!cnt) {
+		ipath_dbg("After copyin, tidcnt 0, tidlist %llx\n",
+			  (unsigned long long) ti->tidlist);
+		/*
+		 * Should we treat as success?  likely a bug
+		 */
+		ret = -EFAULT;
+		goto done;
+	}
+	porttid = pd->port_port * dd->ipath_rcvtidcnt;
+	if (!pd->port_subport_cnt) {
+		tidcnt = dd->ipath_rcvtidcnt;
+		tid = pd->port_tidcursor;
+		tidoff = 0;
+	} else if (!subport) {
+		tidcnt = (dd->ipath_rcvtidcnt / pd->port_subport_cnt) +
+			 (dd->ipath_rcvtidcnt % pd->port_subport_cnt);
+		tidoff = dd->ipath_rcvtidcnt - tidcnt;
+		porttid += tidoff;
+		tid = tidcursor_fp(fp);
+	} else {
+		tidcnt = dd->ipath_rcvtidcnt / pd->port_subport_cnt;
+		tidoff = tidcnt * (subport - 1);
+		porttid += tidoff;
+		tid = tidcursor_fp(fp);
+	}
+	if (cnt > tidcnt) {
+		/* make sure it all fits in port_tid_pg_list */
+		dev_info(&dd->pcidev->dev, "Process tried to allocate %u "
+			 "TIDs, only trying max (%u)\n", cnt, tidcnt);
+		cnt = tidcnt;
+	}
+	pagep = &((struct page **) pd->port_tid_pg_list)[tidoff];
+	tidlist = &((u16 *) &pagep[dd->ipath_rcvtidcnt])[tidoff];
+
+	memset(tidmap, 0, sizeof(tidmap));
+	/* before decrement; chip actual # */
+	ntids = tidcnt;
+	tidbase = (u64 __iomem *) (((char __iomem *) dd->ipath_kregbase) +
+				   dd->ipath_rcvtidbase +
+				   porttid * sizeof(*tidbase));
+
+	ipath_cdbg(VERBOSE, "Port%u %u tids, cursor %u, tidbase %p\n",
+		   pd->port_port, cnt, tid, tidbase);
+
+	/* virtual address of first page in transfer */
+	vaddr = ti->tidvaddr;
+	if (!access_ok(VERIFY_WRITE, (void __user *) vaddr,
+		       cnt * PAGE_SIZE)) {
+		ipath_dbg("Fail vaddr %p, %u pages, !access_ok\n",
+			  (void *)vaddr, cnt);
+		ret = -EFAULT;
+		goto done;
+	}
+	ret = ipath_get_user_pages(vaddr, cnt, pagep);
+	if (ret) {
+		if (ret == -EBUSY) {
+			ipath_dbg("Failed to lock addr %p, %u pages "
+				  "(already locked)\n",
+				  (void *) vaddr, cnt);
+			/*
+			 * for now, continue, and see what happens but with
+			 * the new implementation, this should never happen,
+			 * unless perhaps the user has mpin'ed the pages
+			 * themselves (something we need to test)
+			 */
+			ret = 0;
+		} else {
+			dev_info(&dd->pcidev->dev,
+				 "Failed to lock addr %p, %u pages: "
+				 "errno %d\n", (void *) vaddr, cnt, -ret);
+			goto done;
+		}
+	}
+	for (i = 0; i < cnt; i++, vaddr += PAGE_SIZE) {
+		for (; ntids--; tid++) {
+			if (tid == tidcnt)
+				tid = 0;
+			if (!dd->ipath_pageshadow[porttid + tid])
+				break;
+		}
+		if (ntids < 0) {
+			/*
+			 * oops, wrapped all the way through their TIDs,
+			 * and didn't have enough free; see comments at
+			 * start of routine
+			 */
+			ipath_dbg("Not enough free TIDs for %u pages "
+				  "(index %d), failing\n", cnt, i);
+			i--;	/* last tidlist[i] not filled in */
+			ret = -ENOMEM;
+			break;
+		}
+		tidlist[i] = tid + tidoff;
+		ipath_cdbg(VERBOSE, "Updating idx %u to TID %u, "
+			   "vaddr %lx\n", i, tid + tidoff, vaddr);
+		/* we "know" system pages and TID pages are same size */
+		dd->ipath_pageshadow[porttid + tid] = pagep[i];
+		dd->ipath_physshadow[porttid + tid] = ipath_map_page(
+			dd->pcidev, pagep[i], 0, PAGE_SIZE,
+			PCI_DMA_FROMDEVICE);
+		/*
+		 * don't need atomic or it's overhead
+		 */
+		__set_bit(tid, tidmap);
+		physaddr = dd->ipath_physshadow[porttid + tid];
+		ipath_stats.sps_pagelocks++;
+		ipath_cdbg(VERBOSE,
+			   "TID %u, vaddr %lx, physaddr %llx pgp %p\n",
+			   tid, vaddr, (unsigned long long) physaddr,
+			   pagep[i]);
+		dd->ipath_f_put_tid(dd, &tidbase[tid], RCVHQ_RCV_TYPE_EXPECTED,
+				    physaddr);
+		/*
+		 * don't check this tid in ipath_portshadow, since we
+		 * just filled it in; start with the next one.
+		 */
+		tid++;
+	}
+
+	if (ret) {
+		u32 limit;
+	cleanup:
+		/* jump here if copy out of updated info failed... */
+		ipath_dbg("After failure (ret=%d), undo %d of %d entries\n",
+			  -ret, i, cnt);
+		/* same code that's in ipath_free_tid() */
+		limit = sizeof(tidmap) * BITS_PER_BYTE;
+		if (limit > tidcnt)
+			/* just in case size changes in future */
+			limit = tidcnt;
+		tid = find_first_bit((const unsigned long *)tidmap, limit);
+		for (; tid < limit; tid++) {
+			if (!test_bit(tid, tidmap))
+				continue;
+			if (dd->ipath_pageshadow[porttid + tid]) {
+				ipath_cdbg(VERBOSE, "Freeing TID %u\n",
+					   tid);
+				dd->ipath_f_put_tid(dd, &tidbase[tid],
+						    RCVHQ_RCV_TYPE_EXPECTED,
+						    dd->ipath_tidinvalid);
+				pci_unmap_page(dd->pcidev,
+					dd->ipath_physshadow[porttid + tid],
+					PAGE_SIZE, PCI_DMA_FROMDEVICE);
+				dd->ipath_pageshadow[porttid + tid] = NULL;
+				ipath_stats.sps_pageunlocks++;
+			}
+		}
+		ipath_release_user_pages(pagep, cnt);
+	} else {
+		/*
+		 * Copy the updated array, with ipath_tid's filled in, back
+		 * to user.  Since we did the copy in already, this "should
+		 * never fail" If it does, we have to clean up...
+		 */
+		if (copy_to_user((void __user *)
+				 (unsigned long) ti->tidlist,
+				 tidlist, cnt * sizeof(*tidlist))) {
+			ret = -EFAULT;
+			goto cleanup;
+		}
+		if (copy_to_user((void __user *) (unsigned long) ti->tidmap,
+				 tidmap, sizeof tidmap)) {
+			ret = -EFAULT;
+			goto cleanup;
+		}
+		if (tid == tidcnt)
+			tid = 0;
+		if (!pd->port_subport_cnt)
+			pd->port_tidcursor = tid;
+		else
+			tidcursor_fp(fp) = tid;
+	}
+
+done:
+	if (ret)
+		ipath_dbg("Failed to map %u TID pages, failing with %d\n",
+			  ti->tidcnt, -ret);
+	return ret;
+}
+
+/**
+ * ipath_tid_free - free a port TID
+ * @pd: the port
+ * @subport: the subport
+ * @ti: the TID info
+ *
+ * right now we are unlocking one page at a time, but since
+ * the intended use of this routine is for a single group of
+ * virtually contiguous pages, that should change to improve
+ * performance.  We check that the TID is in range for this port
+ * but otherwise don't check validity; if user has an error and
+ * frees the wrong tid, it's only their own data that can thereby
+ * be corrupted.  We do check that the TID was in use, for sanity
+ * We always use our idea of the saved address, not the address that
+ * they pass in to us.
+ */
+
+static int ipath_tid_free(struct ipath_portdata *pd, unsigned subport,
+			  const struct ipath_tid_info *ti)
+{
+	int ret = 0;
+	u32 tid, porttid, cnt, limit, tidcnt;
+	struct ipath_devdata *dd = pd->port_dd;
+	u64 __iomem *tidbase;
+	unsigned long tidmap[8];
+
+	if (!dd->ipath_pageshadow) {
+		ret = -ENOMEM;
+		goto done;
+	}
+
+	if (copy_from_user(tidmap, (void __user *)(unsigned long)ti->tidmap,
+			   sizeof tidmap)) {
+		ret = -EFAULT;
+		goto done;
+	}
+
+	porttid = pd->port_port * dd->ipath_rcvtidcnt;
+	if (!pd->port_subport_cnt)
+		tidcnt = dd->ipath_rcvtidcnt;
+	else if (!subport) {
+		tidcnt = (dd->ipath_rcvtidcnt / pd->port_subport_cnt) +
+			 (dd->ipath_rcvtidcnt % pd->port_subport_cnt);
+		porttid += dd->ipath_rcvtidcnt - tidcnt;
+	} else {
+		tidcnt = dd->ipath_rcvtidcnt / pd->port_subport_cnt;
+		porttid += tidcnt * (subport - 1);
+	}
+	tidbase = (u64 __iomem *) ((char __iomem *)(dd->ipath_kregbase) +
+				   dd->ipath_rcvtidbase +
+				   porttid * sizeof(*tidbase));
+
+	limit = sizeof(tidmap) * BITS_PER_BYTE;
+	if (limit > tidcnt)
+		/* just in case size changes in future */
+		limit = tidcnt;
+	tid = find_first_bit(tidmap, limit);
+	ipath_cdbg(VERBOSE, "Port%u free %u tids; first bit (max=%d) "
+		   "set is %d, porttid %u\n", pd->port_port, ti->tidcnt,
+		   limit, tid, porttid);
+	for (cnt = 0; tid < limit; tid++) {
+		/*
+		 * small optimization; if we detect a run of 3 or so without
+		 * any set, use find_first_bit again.  That's mainly to
+		 * accelerate the case where we wrapped, so we have some at
+		 * the beginning, and some at the end, and a big gap
+		 * in the middle.
+		 */
+		if (!test_bit(tid, tidmap))
+			continue;
+		cnt++;
+		if (dd->ipath_pageshadow[porttid + tid]) {
+			struct page *p;
+			p = dd->ipath_pageshadow[porttid + tid];
+			dd->ipath_pageshadow[porttid + tid] = NULL;
+			ipath_cdbg(VERBOSE, "PID %u freeing TID %u\n",
+				   pid_nr(pd->port_pid), tid);
+			dd->ipath_f_put_tid(dd, &tidbase[tid],
+					    RCVHQ_RCV_TYPE_EXPECTED,
+					    dd->ipath_tidinvalid);
+			pci_unmap_page(dd->pcidev,
+				dd->ipath_physshadow[porttid + tid],
+				PAGE_SIZE, PCI_DMA_FROMDEVICE);
+			ipath_release_user_pages(&p, 1);
+			ipath_stats.sps_pageunlocks++;
+		} else
+			ipath_dbg("Unused tid %u, ignoring\n", tid);
+	}
+	if (cnt != ti->tidcnt)
+		ipath_dbg("passed in tidcnt %d, only %d bits set in map\n",
+			  ti->tidcnt, cnt);
+done:
+	if (ret)
+		ipath_dbg("Failed to unmap %u TID pages, failing with %d\n",
+			  ti->tidcnt, -ret);
+	return ret;
+}
+
+/**
+ * ipath_set_part_key - set a partition key
+ * @pd: the port
+ * @key: the key
+ *
+ * We can have up to 4 active at a time (other than the default, which is
+ * always allowed).  This is somewhat tricky, since multiple ports may set
+ * the same key, so we reference count them, and clean up at exit.  All 4
+ * partition keys are packed into a single infinipath register.  It's an
+ * error for a process to set the same pkey multiple times.  We provide no
+ * mechanism to de-allocate a pkey at this time, we may eventually need to
+ * do that.  I've used the atomic operations, and no locking, and only make
+ * a single pass through what's available.  This should be more than
+ * adequate for some time. I'll think about spinlocks or the like if and as
+ * it's necessary.
+ */
+static int ipath_set_part_key(struct ipath_portdata *pd, u16 key)
+{
+	struct ipath_devdata *dd = pd->port_dd;
+	int i, any = 0, pidx = -1;
+	u16 lkey = key & 0x7FFF;
+	int ret;
+
+	if (lkey == (IPATH_DEFAULT_P_KEY & 0x7FFF)) {
+		/* nothing to do; this key always valid */
+		ret = 0;
+		goto bail;
+	}
+
+	ipath_cdbg(VERBOSE, "p%u try to set pkey %hx, current keys "
+		   "%hx:%x %hx:%x %hx:%x %hx:%x\n",
+		   pd->port_port, key, dd->ipath_pkeys[0],
+		   atomic_read(&dd->ipath_pkeyrefs[0]), dd->ipath_pkeys[1],
+		   atomic_read(&dd->ipath_pkeyrefs[1]), dd->ipath_pkeys[2],
+		   atomic_read(&dd->ipath_pkeyrefs[2]), dd->ipath_pkeys[3],
+		   atomic_read(&dd->ipath_pkeyrefs[3]));
+
+	if (!lkey) {
+		ipath_cdbg(PROC, "p%u tries to set key 0, not allowed\n",
+			   pd->port_port);
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	/*
+	 * Set the full membership bit, because it has to be
+	 * set in the register or the packet, and it seems
+	 * cleaner to set in the register than to force all
+	 * callers to set it. (see bug 4331)
+	 */
+	key |= 0x8000;
+
+	for (i = 0; i < ARRAY_SIZE(pd->port_pkeys); i++) {
+		if (!pd->port_pkeys[i] && pidx == -1)
+			pidx = i;
+		if (pd->port_pkeys[i] == key) {
+			ipath_cdbg(VERBOSE, "p%u tries to set same pkey "
+				   "(%x) more than once\n",
+				   pd->port_port, key);
+			ret = -EEXIST;
+			goto bail;
+		}
+	}
+	if (pidx == -1) {
+		ipath_dbg("All pkeys for port %u already in use, "
+			  "can't set %x\n", pd->port_port, key);
+		ret = -EBUSY;
+		goto bail;
+	}
+	for (any = i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) {
+		if (!dd->ipath_pkeys[i]) {
+			any++;
+			continue;
+		}
+		if (dd->ipath_pkeys[i] == key) {
+			atomic_t *pkrefs = &dd->ipath_pkeyrefs[i];
+
+			if (atomic_inc_return(pkrefs) > 1) {
+				pd->port_pkeys[pidx] = key;
+				ipath_cdbg(VERBOSE, "p%u set key %x "
+					   "matches #%d, count now %d\n",
+					   pd->port_port, key, i,
+					   atomic_read(pkrefs));
+				ret = 0;
+				goto bail;
+			} else {
+				/*
+				 * lost race, decrement count, catch below
+				 */
+				atomic_dec(pkrefs);
+				ipath_cdbg(VERBOSE, "Lost race, count was "
+					   "0, after dec, it's %d\n",
+					   atomic_read(pkrefs));
+				any++;
+			}
+		}
+		if ((dd->ipath_pkeys[i] & 0x7FFF) == lkey) {
+			/*
+			 * It makes no sense to have both the limited and
+			 * full membership PKEY set at the same time since
+			 * the unlimited one will disable the limited one.
+			 */
+			ret = -EEXIST;
+			goto bail;
+		}
+	}
+	if (!any) {
+		ipath_dbg("port %u, all pkeys already in use, "
+			  "can't set %x\n", pd->port_port, key);
+		ret = -EBUSY;
+		goto bail;
+	}
+	for (any = i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) {
+		if (!dd->ipath_pkeys[i] &&
+		    atomic_inc_return(&dd->ipath_pkeyrefs[i]) == 1) {
+			u64 pkey;
+
+			/* for ipathstats, etc. */
+			ipath_stats.sps_pkeys[i] = lkey;
+			pd->port_pkeys[pidx] = dd->ipath_pkeys[i] = key;
+			pkey =
+				(u64) dd->ipath_pkeys[0] |
+				((u64) dd->ipath_pkeys[1] << 16) |
+				((u64) dd->ipath_pkeys[2] << 32) |
+				((u64) dd->ipath_pkeys[3] << 48);
+			ipath_cdbg(PROC, "p%u set key %x in #%d, "
+				   "portidx %d, new pkey reg %llx\n",
+				   pd->port_port, key, i, pidx,
+				   (unsigned long long) pkey);
+			ipath_write_kreg(
+				dd, dd->ipath_kregs->kr_partitionkey, pkey);
+
+			ret = 0;
+			goto bail;
+		}
+	}
+	ipath_dbg("port %u, all pkeys already in use 2nd pass, "
+		  "can't set %x\n", pd->port_port, key);
+	ret = -EBUSY;
+
+bail:
+	return ret;
+}
+
+/**
+ * ipath_manage_rcvq - manage a port's receive queue
+ * @pd: the port
+ * @subport: the subport
+ * @start_stop: action to carry out
+ *
+ * start_stop == 0 disables receive on the port, for use in queue
+ * overflow conditions.  start_stop==1 re-enables, to be used to
+ * re-init the software copy of the head register
+ */
+static int ipath_manage_rcvq(struct ipath_portdata *pd, unsigned subport,
+			     int start_stop)
+{
+	struct ipath_devdata *dd = pd->port_dd;
+
+	ipath_cdbg(PROC, "%sabling rcv for unit %u port %u:%u\n",
+		   start_stop ? "en" : "dis", dd->ipath_unit,
+		   pd->port_port, subport);
+	if (subport)
+		goto bail;
+	/* atomically clear receive enable port. */
+	if (start_stop) {
+		/*
+		 * On enable, force in-memory copy of the tail register to
+		 * 0, so that protocol code doesn't have to worry about
+		 * whether or not the chip has yet updated the in-memory
+		 * copy or not on return from the system call. The chip
+		 * always resets it's tail register back to 0 on a
+		 * transition from disabled to enabled.  This could cause a
+		 * problem if software was broken, and did the enable w/o
+		 * the disable, but eventually the in-memory copy will be
+		 * updated and correct itself, even in the face of software
+		 * bugs.
+		 */
+		if (pd->port_rcvhdrtail_kvaddr)
+			ipath_clear_rcvhdrtail(pd);
+		set_bit(dd->ipath_r_portenable_shift + pd->port_port,
+			&dd->ipath_rcvctrl);
+	} else
+		clear_bit(dd->ipath_r_portenable_shift + pd->port_port,
+			  &dd->ipath_rcvctrl);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
+			 dd->ipath_rcvctrl);
+	/* now be sure chip saw it before we return */
+	ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+	if (start_stop) {
+		/*
+		 * And try to be sure that tail reg update has happened too.
+		 * This should in theory interlock with the RXE changes to
+		 * the tail register.  Don't assign it to the tail register
+		 * in memory copy, since we could overwrite an update by the
+		 * chip if we did.
+		 */
+		ipath_read_ureg32(dd, ur_rcvhdrtail, pd->port_port);
+	}
+	/* always; new head should be equal to new tail; see above */
+bail:
+	return 0;
+}
+
+static void ipath_clean_part_key(struct ipath_portdata *pd,
+				 struct ipath_devdata *dd)
+{
+	int i, j, pchanged = 0;
+	u64 oldpkey;
+
+	/* for debugging only */
+	oldpkey = (u64) dd->ipath_pkeys[0] |
+		((u64) dd->ipath_pkeys[1] << 16) |
+		((u64) dd->ipath_pkeys[2] << 32) |
+		((u64) dd->ipath_pkeys[3] << 48);
+
+	for (i = 0; i < ARRAY_SIZE(pd->port_pkeys); i++) {
+		if (!pd->port_pkeys[i])
+			continue;
+		ipath_cdbg(VERBOSE, "look for key[%d] %hx in pkeys\n", i,
+			   pd->port_pkeys[i]);
+		for (j = 0; j < ARRAY_SIZE(dd->ipath_pkeys); j++) {
+			/* check for match independent of the global bit */
+			if ((dd->ipath_pkeys[j] & 0x7fff) !=
+			    (pd->port_pkeys[i] & 0x7fff))
+				continue;
+			if (atomic_dec_and_test(&dd->ipath_pkeyrefs[j])) {
+				ipath_cdbg(VERBOSE, "p%u clear key "
+					   "%x matches #%d\n",
+					   pd->port_port,
+					   pd->port_pkeys[i], j);
+				ipath_stats.sps_pkeys[j] =
+					dd->ipath_pkeys[j] = 0;
+				pchanged++;
+			}
+			else ipath_cdbg(
+				VERBOSE, "p%u key %x matches #%d, "
+				"but ref still %d\n", pd->port_port,
+				pd->port_pkeys[i], j,
+				atomic_read(&dd->ipath_pkeyrefs[j]));
+			break;
+		}
+		pd->port_pkeys[i] = 0;
+	}
+	if (pchanged) {
+		u64 pkey = (u64) dd->ipath_pkeys[0] |
+			((u64) dd->ipath_pkeys[1] << 16) |
+			((u64) dd->ipath_pkeys[2] << 32) |
+			((u64) dd->ipath_pkeys[3] << 48);
+		ipath_cdbg(VERBOSE, "p%u old pkey reg %llx, "
+			   "new pkey reg %llx\n", pd->port_port,
+			   (unsigned long long) oldpkey,
+			   (unsigned long long) pkey);
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_partitionkey,
+				 pkey);
+	}
+}
+
+/*
+ * Initialize the port data with the receive buffer sizes
+ * so this can be done while the master port is locked.
+ * Otherwise, there is a race with a slave opening the port
+ * and seeing these fields uninitialized.
+ */
+static void init_user_egr_sizes(struct ipath_portdata *pd)
+{
+	struct ipath_devdata *dd = pd->port_dd;
+	unsigned egrperchunk, egrcnt, size;
+
+	/*
+	 * to avoid wasting a lot of memory, we allocate 32KB chunks of
+	 * physically contiguous memory, advance through it until used up
+	 * and then allocate more.  Of course, we need memory to store those
+	 * extra pointers, now.  Started out with 256KB, but under heavy
+	 * memory pressure (creating large files and then copying them over
+	 * NFS while doing lots of MPI jobs), we hit some allocation
+	 * failures, even though we can sleep...  (2.6.10) Still get
+	 * failures at 64K.  32K is the lowest we can go without wasting
+	 * additional memory.
+	 */
+	size = 0x8000;
+	egrperchunk = size / dd->ipath_rcvegrbufsize;
+	egrcnt = dd->ipath_rcvegrcnt;
+	pd->port_rcvegrbuf_chunks = (egrcnt + egrperchunk - 1) / egrperchunk;
+	pd->port_rcvegrbufs_perchunk = egrperchunk;
+	pd->port_rcvegrbuf_size = size;
+}
+
+/**
+ * ipath_create_user_egr - allocate eager TID buffers
+ * @pd: the port to allocate TID buffers for
+ *
+ * This routine is now quite different for user and kernel, because
+ * the kernel uses skb's, for the accelerated network performance
+ * This is the user port version
+ *
+ * Allocate the eager TID buffers and program them into infinipath
+ * They are no longer completely contiguous, we do multiple allocation
+ * calls.
+ */
+static int ipath_create_user_egr(struct ipath_portdata *pd)
+{
+	struct ipath_devdata *dd = pd->port_dd;
+	unsigned e, egrcnt, egrperchunk, chunk, egrsize, egroff;
+	size_t size;
+	int ret;
+	gfp_t gfp_flags;
+
+	/*
+	 * GFP_USER, but without GFP_FS, so buffer cache can be
+	 * coalesced (we hope); otherwise, even at order 4,
+	 * heavy filesystem activity makes these fail, and we can
+	 * use compound pages.
+	 */
+	gfp_flags = __GFP_WAIT | __GFP_IO | __GFP_COMP;
+
+	egrcnt = dd->ipath_rcvegrcnt;
+	/* TID number offset for this port */
+	egroff = (pd->port_port - 1) * egrcnt + dd->ipath_p0_rcvegrcnt;
+	egrsize = dd->ipath_rcvegrbufsize;
+	ipath_cdbg(VERBOSE, "Allocating %d egr buffers, at egrtid "
+		   "offset %x, egrsize %u\n", egrcnt, egroff, egrsize);
+
+	chunk = pd->port_rcvegrbuf_chunks;
+	egrperchunk = pd->port_rcvegrbufs_perchunk;
+	size = pd->port_rcvegrbuf_size;
+	pd->port_rcvegrbuf = kmalloc(chunk * sizeof(pd->port_rcvegrbuf[0]),
+				     GFP_KERNEL);
+	if (!pd->port_rcvegrbuf) {
+		ret = -ENOMEM;
+		goto bail;
+	}
+	pd->port_rcvegrbuf_phys =
+		kmalloc(chunk * sizeof(pd->port_rcvegrbuf_phys[0]),
+			GFP_KERNEL);
+	if (!pd->port_rcvegrbuf_phys) {
+		ret = -ENOMEM;
+		goto bail_rcvegrbuf;
+	}
+	for (e = 0; e < pd->port_rcvegrbuf_chunks; e++) {
+
+		pd->port_rcvegrbuf[e] = dma_alloc_coherent(
+			&dd->pcidev->dev, size, &pd->port_rcvegrbuf_phys[e],
+			gfp_flags);
+
+		if (!pd->port_rcvegrbuf[e]) {
+			ret = -ENOMEM;
+			goto bail_rcvegrbuf_phys;
+		}
+	}
+
+	pd->port_rcvegr_phys = pd->port_rcvegrbuf_phys[0];
+
+	for (e = chunk = 0; chunk < pd->port_rcvegrbuf_chunks; chunk++) {
+		dma_addr_t pa = pd->port_rcvegrbuf_phys[chunk];
+		unsigned i;
+
+		for (i = 0; e < egrcnt && i < egrperchunk; e++, i++) {
+			dd->ipath_f_put_tid(dd, e + egroff +
+					    (u64 __iomem *)
+					    ((char __iomem *)
+					     dd->ipath_kregbase +
+					     dd->ipath_rcvegrbase),
+					    RCVHQ_RCV_TYPE_EAGER, pa);
+			pa += egrsize;
+		}
+		cond_resched();	/* don't hog the cpu */
+	}
+
+	ret = 0;
+	goto bail;
+
+bail_rcvegrbuf_phys:
+	for (e = 0; e < pd->port_rcvegrbuf_chunks &&
+		pd->port_rcvegrbuf[e]; e++) {
+		dma_free_coherent(&dd->pcidev->dev, size,
+				  pd->port_rcvegrbuf[e],
+				  pd->port_rcvegrbuf_phys[e]);
+
+	}
+	kfree(pd->port_rcvegrbuf_phys);
+	pd->port_rcvegrbuf_phys = NULL;
+bail_rcvegrbuf:
+	kfree(pd->port_rcvegrbuf);
+	pd->port_rcvegrbuf = NULL;
+bail:
+	return ret;
+}
+
+
+/* common code for the mappings on dma_alloc_coherent mem */
+static int ipath_mmap_mem(struct vm_area_struct *vma,
+	struct ipath_portdata *pd, unsigned len, int write_ok,
+	void *kvaddr, char *what)
+{
+	struct ipath_devdata *dd = pd->port_dd;
+	unsigned long pfn;
+	int ret;
+
+	if ((vma->vm_end - vma->vm_start) > len) {
+		dev_info(&dd->pcidev->dev,
+		         "FAIL on %s: len %lx > %x\n", what,
+			 vma->vm_end - vma->vm_start, len);
+		ret = -EFAULT;
+		goto bail;
+	}
+
+	if (!write_ok) {
+		if (vma->vm_flags & VM_WRITE) {
+			dev_info(&dd->pcidev->dev,
+				 "%s must be mapped readonly\n", what);
+			ret = -EPERM;
+			goto bail;
+		}
+
+		/* don't allow them to later change with mprotect */
+		vma->vm_flags &= ~VM_MAYWRITE;
+	}
+
+	pfn = virt_to_phys(kvaddr) >> PAGE_SHIFT;
+	ret = remap_pfn_range(vma, vma->vm_start, pfn,
+			      len, vma->vm_page_prot);
+	if (ret)
+		dev_info(&dd->pcidev->dev, "%s port%u mmap of %lx, %x "
+			 "bytes r%c failed: %d\n", what, pd->port_port,
+			 pfn, len, write_ok?'w':'o', ret);
+	else
+		ipath_cdbg(VERBOSE, "%s port%u mmaped %lx, %x bytes "
+			   "r%c\n", what, pd->port_port, pfn, len,
+			   write_ok?'w':'o');
+bail:
+	return ret;
+}
+
+static int mmap_ureg(struct vm_area_struct *vma, struct ipath_devdata *dd,
+		     u64 ureg)
+{
+	unsigned long phys;
+	int ret;
+
+	/*
+	 * This is real hardware, so use io_remap.  This is the mechanism
+	 * for the user process to update the head registers for their port
+	 * in the chip.
+	 */
+	if ((vma->vm_end - vma->vm_start) > PAGE_SIZE) {
+		dev_info(&dd->pcidev->dev, "FAIL mmap userreg: reqlen "
+			 "%lx > PAGE\n", vma->vm_end - vma->vm_start);
+		ret = -EFAULT;
+	} else {
+		phys = dd->ipath_physaddr + ureg;
+		vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+
+		vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND;
+		ret = io_remap_pfn_range(vma, vma->vm_start,
+					 phys >> PAGE_SHIFT,
+					 vma->vm_end - vma->vm_start,
+					 vma->vm_page_prot);
+	}
+	return ret;
+}
+
+static int mmap_piobufs(struct vm_area_struct *vma,
+			struct ipath_devdata *dd,
+			struct ipath_portdata *pd,
+			unsigned piobufs, unsigned piocnt)
+{
+	unsigned long phys;
+	int ret;
+
+	/*
+	 * When we map the PIO buffers in the chip, we want to map them as
+	 * writeonly, no read possible.   This prevents access to previous
+	 * process data, and catches users who might try to read the i/o
+	 * space due to a bug.
+	 */
+	if ((vma->vm_end - vma->vm_start) > (piocnt * dd->ipath_palign)) {
+		dev_info(&dd->pcidev->dev, "FAIL mmap piobufs: "
+			 "reqlen %lx > PAGE\n",
+			 vma->vm_end - vma->vm_start);
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	phys = dd->ipath_physaddr + piobufs;
+
+#if defined(__powerpc__)
+	/* There isn't a generic way to specify writethrough mappings */
+	pgprot_val(vma->vm_page_prot) |= _PAGE_NO_CACHE;
+	pgprot_val(vma->vm_page_prot) |= _PAGE_WRITETHRU;
+	pgprot_val(vma->vm_page_prot) &= ~_PAGE_GUARDED;
+#endif
+
+	/*
+	 * don't allow them to later change to readable with mprotect (for when
+	 * not initially mapped readable, as is normally the case)
+	 */
+	vma->vm_flags &= ~VM_MAYREAD;
+	vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND;
+
+	ret = io_remap_pfn_range(vma, vma->vm_start, phys >> PAGE_SHIFT,
+				 vma->vm_end - vma->vm_start,
+				 vma->vm_page_prot);
+bail:
+	return ret;
+}
+
+static int mmap_rcvegrbufs(struct vm_area_struct *vma,
+			   struct ipath_portdata *pd)
+{
+	struct ipath_devdata *dd = pd->port_dd;
+	unsigned long start, size;
+	size_t total_size, i;
+	unsigned long pfn;
+	int ret;
+
+	size = pd->port_rcvegrbuf_size;
+	total_size = pd->port_rcvegrbuf_chunks * size;
+	if ((vma->vm_end - vma->vm_start) > total_size) {
+		dev_info(&dd->pcidev->dev, "FAIL on egr bufs: "
+			 "reqlen %lx > actual %lx\n",
+			 vma->vm_end - vma->vm_start,
+			 (unsigned long) total_size);
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	if (vma->vm_flags & VM_WRITE) {
+		dev_info(&dd->pcidev->dev, "Can't map eager buffers as "
+			 "writable (flags=%lx)\n", vma->vm_flags);
+		ret = -EPERM;
+		goto bail;
+	}
+	/* don't allow them to later change to writeable with mprotect */
+	vma->vm_flags &= ~VM_MAYWRITE;
+
+	start = vma->vm_start;
+
+	for (i = 0; i < pd->port_rcvegrbuf_chunks; i++, start += size) {
+		pfn = virt_to_phys(pd->port_rcvegrbuf[i]) >> PAGE_SHIFT;
+		ret = remap_pfn_range(vma, start, pfn, size,
+				      vma->vm_page_prot);
+		if (ret < 0)
+			goto bail;
+	}
+	ret = 0;
+
+bail:
+	return ret;
+}
+
+/*
+ * ipath_file_vma_fault - handle a VMA page fault.
+ */
+static int ipath_file_vma_fault(struct vm_area_struct *vma,
+					struct vm_fault *vmf)
+{
+	struct page *page;
+
+	page = vmalloc_to_page((void *)(vmf->pgoff << PAGE_SHIFT));
+	if (!page)
+		return VM_FAULT_SIGBUS;
+	get_page(page);
+	vmf->page = page;
+
+	return 0;
+}
+
+static const struct vm_operations_struct ipath_file_vm_ops = {
+	.fault = ipath_file_vma_fault,
+};
+
+static int mmap_kvaddr(struct vm_area_struct *vma, u64 pgaddr,
+		       struct ipath_portdata *pd, unsigned subport)
+{
+	unsigned long len;
+	struct ipath_devdata *dd;
+	void *addr;
+	size_t size;
+	int ret = 0;
+
+	/* If the port is not shared, all addresses should be physical */
+	if (!pd->port_subport_cnt)
+		goto bail;
+
+	dd = pd->port_dd;
+	size = pd->port_rcvegrbuf_chunks * pd->port_rcvegrbuf_size;
+
+	/*
+	 * Each process has all the subport uregbase, rcvhdrq, and
+	 * rcvegrbufs mmapped - as an array for all the processes,
+	 * and also separately for this process.
+	 */
+	if (pgaddr == cvt_kvaddr(pd->subport_uregbase)) {
+		addr = pd->subport_uregbase;
+		size = PAGE_SIZE * pd->port_subport_cnt;
+	} else if (pgaddr == cvt_kvaddr(pd->subport_rcvhdr_base)) {
+		addr = pd->subport_rcvhdr_base;
+		size = pd->port_rcvhdrq_size * pd->port_subport_cnt;
+	} else if (pgaddr == cvt_kvaddr(pd->subport_rcvegrbuf)) {
+		addr = pd->subport_rcvegrbuf;
+		size *= pd->port_subport_cnt;
+        } else if (pgaddr == cvt_kvaddr(pd->subport_uregbase +
+                                        PAGE_SIZE * subport)) {
+                addr = pd->subport_uregbase + PAGE_SIZE * subport;
+                size = PAGE_SIZE;
+        } else if (pgaddr == cvt_kvaddr(pd->subport_rcvhdr_base +
+                                pd->port_rcvhdrq_size * subport)) {
+                addr = pd->subport_rcvhdr_base +
+                        pd->port_rcvhdrq_size * subport;
+                size = pd->port_rcvhdrq_size;
+        } else if (pgaddr == cvt_kvaddr(pd->subport_rcvegrbuf +
+                               size * subport)) {
+                addr = pd->subport_rcvegrbuf + size * subport;
+                /* rcvegrbufs are read-only on the slave */
+                if (vma->vm_flags & VM_WRITE) {
+                        dev_info(&dd->pcidev->dev,
+                                 "Can't map eager buffers as "
+                                 "writable (flags=%lx)\n", vma->vm_flags);
+                        ret = -EPERM;
+                        goto bail;
+                }
+                /*
+                 * Don't allow permission to later change to writeable
+                 * with mprotect.
+                 */
+                vma->vm_flags &= ~VM_MAYWRITE;
+	} else {
+		goto bail;
+	}
+	len = vma->vm_end - vma->vm_start;
+	if (len > size) {
+		ipath_cdbg(MM, "FAIL: reqlen %lx > %zx\n", len, size);
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	vma->vm_pgoff = (unsigned long) addr >> PAGE_SHIFT;
+	vma->vm_ops = &ipath_file_vm_ops;
+	vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
+	ret = 1;
+
+bail:
+	return ret;
+}
+
+/**
+ * ipath_mmap - mmap various structures into user space
+ * @fp: the file pointer
+ * @vma: the VM area
+ *
+ * We use this to have a shared buffer between the kernel and the user code
+ * for the rcvhdr queue, egr buffers, and the per-port user regs and pio
+ * buffers in the chip.  We have the open and close entries so we can bump
+ * the ref count and keep the driver from being unloaded while still mapped.
+ */
+static int ipath_mmap(struct file *fp, struct vm_area_struct *vma)
+{
+	struct ipath_portdata *pd;
+	struct ipath_devdata *dd;
+	u64 pgaddr, ureg;
+	unsigned piobufs, piocnt;
+	int ret;
+
+	pd = port_fp(fp);
+	if (!pd) {
+		ret = -EINVAL;
+		goto bail;
+	}
+	dd = pd->port_dd;
+
+	/*
+	 * This is the ipath_do_user_init() code, mapping the shared buffers
+	 * into the user process. The address referred to by vm_pgoff is the
+	 * file offset passed via mmap().  For shared ports, this is the
+	 * kernel vmalloc() address of the pages to share with the master.
+	 * For non-shared or master ports, this is a physical address.
+	 * We only do one mmap for each space mapped.
+	 */
+	pgaddr = vma->vm_pgoff << PAGE_SHIFT;
+
+	/*
+	 * Check for 0 in case one of the allocations failed, but user
+	 * called mmap anyway.
+	 */
+	if (!pgaddr)  {
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	ipath_cdbg(MM, "pgaddr %llx vm_start=%lx len %lx port %u:%u:%u\n",
+		   (unsigned long long) pgaddr, vma->vm_start,
+		   vma->vm_end - vma->vm_start, dd->ipath_unit,
+		   pd->port_port, subport_fp(fp));
+
+	/*
+	 * Physical addresses must fit in 40 bits for our hardware.
+	 * Check for kernel virtual addresses first, anything else must
+	 * match a HW or memory address.
+	 */
+	ret = mmap_kvaddr(vma, pgaddr, pd, subport_fp(fp));
+	if (ret) {
+		if (ret > 0)
+			ret = 0;
+		goto bail;
+	}
+
+	ureg = dd->ipath_uregbase + dd->ipath_ureg_align * pd->port_port;
+	if (!pd->port_subport_cnt) {
+		/* port is not shared */
+		piocnt = pd->port_piocnt;
+		piobufs = pd->port_piobufs;
+	} else if (!subport_fp(fp)) {
+		/* caller is the master */
+		piocnt = (pd->port_piocnt / pd->port_subport_cnt) +
+			 (pd->port_piocnt % pd->port_subport_cnt);
+		piobufs = pd->port_piobufs +
+			dd->ipath_palign * (pd->port_piocnt - piocnt);
+	} else {
+		unsigned slave = subport_fp(fp) - 1;
+
+		/* caller is a slave */
+		piocnt = pd->port_piocnt / pd->port_subport_cnt;
+		piobufs = pd->port_piobufs + dd->ipath_palign * piocnt * slave;
+	}
+
+	if (pgaddr == ureg)
+		ret = mmap_ureg(vma, dd, ureg);
+	else if (pgaddr == piobufs)
+		ret = mmap_piobufs(vma, dd, pd, piobufs, piocnt);
+	else if (pgaddr == dd->ipath_pioavailregs_phys)
+		/* in-memory copy of pioavail registers */
+		ret = ipath_mmap_mem(vma, pd, PAGE_SIZE, 0,
+			      	     (void *) dd->ipath_pioavailregs_dma,
+				     "pioavail registers");
+	else if (pgaddr == pd->port_rcvegr_phys)
+		ret = mmap_rcvegrbufs(vma, pd);
+	else if (pgaddr == (u64) pd->port_rcvhdrq_phys)
+		/*
+		 * The rcvhdrq itself; readonly except on HT (so have
+		 * to allow writable mapping), multiple pages, contiguous
+		 * from an i/o perspective.
+		 */
+		ret = ipath_mmap_mem(vma, pd, pd->port_rcvhdrq_size, 1,
+				     pd->port_rcvhdrq,
+				     "rcvhdrq");
+	else if (pgaddr == (u64) pd->port_rcvhdrqtailaddr_phys)
+		/* in-memory copy of rcvhdrq tail register */
+		ret = ipath_mmap_mem(vma, pd, PAGE_SIZE, 0,
+				     pd->port_rcvhdrtail_kvaddr,
+				     "rcvhdrq tail");
+	else
+		ret = -EINVAL;
+
+	vma->vm_private_data = NULL;
+
+	if (ret < 0)
+		dev_info(&dd->pcidev->dev,
+			 "Failure %d on off %llx len %lx\n",
+			 -ret, (unsigned long long)pgaddr,
+			 vma->vm_end - vma->vm_start);
+bail:
+	return ret;
+}
+
+static unsigned ipath_poll_hdrqfull(struct ipath_portdata *pd)
+{
+	unsigned pollflag = 0;
+
+	if ((pd->poll_type & IPATH_POLL_TYPE_OVERFLOW) &&
+	    pd->port_hdrqfull != pd->port_hdrqfull_poll) {
+		pollflag |= POLLIN | POLLRDNORM;
+		pd->port_hdrqfull_poll = pd->port_hdrqfull;
+	}
+
+	return pollflag;
+}
+
+static unsigned int ipath_poll_urgent(struct ipath_portdata *pd,
+				      struct file *fp,
+				      struct poll_table_struct *pt)
+{
+	unsigned pollflag = 0;
+	struct ipath_devdata *dd;
+
+	dd = pd->port_dd;
+
+	/* variable access in ipath_poll_hdrqfull() needs this */
+	rmb();
+	pollflag = ipath_poll_hdrqfull(pd);
+
+	if (pd->port_urgent != pd->port_urgent_poll) {
+		pollflag |= POLLIN | POLLRDNORM;
+		pd->port_urgent_poll = pd->port_urgent;
+	}
+
+	if (!pollflag) {
+		/* this saves a spin_lock/unlock in interrupt handler... */
+		set_bit(IPATH_PORT_WAITING_URG, &pd->port_flag);
+		/* flush waiting flag so don't miss an event... */
+		wmb();
+		poll_wait(fp, &pd->port_wait, pt);
+	}
+
+	return pollflag;
+}
+
+static unsigned int ipath_poll_next(struct ipath_portdata *pd,
+				    struct file *fp,
+				    struct poll_table_struct *pt)
+{
+	u32 head;
+	u32 tail;
+	unsigned pollflag = 0;
+	struct ipath_devdata *dd;
+
+	dd = pd->port_dd;
+
+	/* variable access in ipath_poll_hdrqfull() needs this */
+	rmb();
+	pollflag = ipath_poll_hdrqfull(pd);
+
+	head = ipath_read_ureg32(dd, ur_rcvhdrhead, pd->port_port);
+	if (pd->port_rcvhdrtail_kvaddr)
+		tail = ipath_get_rcvhdrtail(pd);
+	else
+		tail = ipath_read_ureg32(dd, ur_rcvhdrtail, pd->port_port);
+
+	if (head != tail)
+		pollflag |= POLLIN | POLLRDNORM;
+	else {
+		/* this saves a spin_lock/unlock in interrupt handler */
+		set_bit(IPATH_PORT_WAITING_RCV, &pd->port_flag);
+		/* flush waiting flag so we don't miss an event */
+		wmb();
+
+		set_bit(pd->port_port + dd->ipath_r_intravail_shift,
+			&dd->ipath_rcvctrl);
+
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
+				 dd->ipath_rcvctrl);
+
+		if (dd->ipath_rhdrhead_intr_off) /* arm rcv interrupt */
+			ipath_write_ureg(dd, ur_rcvhdrhead,
+					 dd->ipath_rhdrhead_intr_off | head,
+					 pd->port_port);
+
+		poll_wait(fp, &pd->port_wait, pt);
+	}
+
+	return pollflag;
+}
+
+static unsigned int ipath_poll(struct file *fp,
+			       struct poll_table_struct *pt)
+{
+	struct ipath_portdata *pd;
+	unsigned pollflag;
+
+	pd = port_fp(fp);
+	if (!pd)
+		pollflag = 0;
+	else if (pd->poll_type & IPATH_POLL_TYPE_URGENT)
+		pollflag = ipath_poll_urgent(pd, fp, pt);
+	else
+		pollflag = ipath_poll_next(pd, fp, pt);
+
+	return pollflag;
+}
+
+static int ipath_supports_subports(int user_swmajor, int user_swminor)
+{
+	/* no subport implementation prior to software version 1.3 */
+	return (user_swmajor > 1) || (user_swminor >= 3);
+}
+
+static int ipath_compatible_subports(int user_swmajor, int user_swminor)
+{
+	/* this code is written long-hand for clarity */
+	if (IPATH_USER_SWMAJOR != user_swmajor) {
+		/* no promise of compatibility if major mismatch */
+		return 0;
+	}
+	if (IPATH_USER_SWMAJOR == 1) {
+		switch (IPATH_USER_SWMINOR) {
+		case 0:
+		case 1:
+		case 2:
+			/* no subport implementation so cannot be compatible */
+			return 0;
+		case 3:
+			/* 3 is only compatible with itself */
+			return user_swminor == 3;
+		default:
+			/* >= 4 are compatible (or are expected to be) */
+			return user_swminor >= 4;
+		}
+	}
+	/* make no promises yet for future major versions */
+	return 0;
+}
+
+static int init_subports(struct ipath_devdata *dd,
+			 struct ipath_portdata *pd,
+			 const struct ipath_user_info *uinfo)
+{
+	int ret = 0;
+	unsigned num_subports;
+	size_t size;
+
+	/*
+	 * If the user is requesting zero subports,
+	 * skip the subport allocation.
+	 */
+	if (uinfo->spu_subport_cnt <= 0)
+		goto bail;
+
+	/* Self-consistency check for ipath_compatible_subports() */
+	if (ipath_supports_subports(IPATH_USER_SWMAJOR, IPATH_USER_SWMINOR) &&
+	    !ipath_compatible_subports(IPATH_USER_SWMAJOR,
+				       IPATH_USER_SWMINOR)) {
+		dev_info(&dd->pcidev->dev,
+			 "Inconsistent ipath_compatible_subports()\n");
+		goto bail;
+	}
+
+	/* Check for subport compatibility */
+	if (!ipath_compatible_subports(uinfo->spu_userversion >> 16,
+				       uinfo->spu_userversion & 0xffff)) {
+		dev_info(&dd->pcidev->dev,
+			 "Mismatched user version (%d.%d) and driver "
+			 "version (%d.%d) while port sharing. Ensure "
+                         "that driver and library are from the same "
+                         "release.\n",
+			 (int) (uinfo->spu_userversion >> 16),
+                         (int) (uinfo->spu_userversion & 0xffff),
+			 IPATH_USER_SWMAJOR,
+	                 IPATH_USER_SWMINOR);
+		goto bail;
+	}
+	if (uinfo->spu_subport_cnt > INFINIPATH_MAX_SUBPORT) {
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	num_subports = uinfo->spu_subport_cnt;
+	pd->subport_uregbase = vzalloc(PAGE_SIZE * num_subports);
+	if (!pd->subport_uregbase) {
+		ret = -ENOMEM;
+		goto bail;
+	}
+	/* Note: pd->port_rcvhdrq_size isn't initialized yet. */
+	size = ALIGN(dd->ipath_rcvhdrcnt * dd->ipath_rcvhdrentsize *
+		     sizeof(u32), PAGE_SIZE) * num_subports;
+	pd->subport_rcvhdr_base = vzalloc(size);
+	if (!pd->subport_rcvhdr_base) {
+		ret = -ENOMEM;
+		goto bail_ureg;
+	}
+
+	pd->subport_rcvegrbuf = vzalloc(pd->port_rcvegrbuf_chunks *
+					pd->port_rcvegrbuf_size *
+					num_subports);
+	if (!pd->subport_rcvegrbuf) {
+		ret = -ENOMEM;
+		goto bail_rhdr;
+	}
+
+	pd->port_subport_cnt = uinfo->spu_subport_cnt;
+	pd->port_subport_id = uinfo->spu_subport_id;
+	pd->active_slaves = 1;
+	set_bit(IPATH_PORT_MASTER_UNINIT, &pd->port_flag);
+	goto bail;
+
+bail_rhdr:
+	vfree(pd->subport_rcvhdr_base);
+bail_ureg:
+	vfree(pd->subport_uregbase);
+	pd->subport_uregbase = NULL;
+bail:
+	return ret;
+}
+
+static int try_alloc_port(struct ipath_devdata *dd, int port,
+			  struct file *fp,
+			  const struct ipath_user_info *uinfo)
+{
+	struct ipath_portdata *pd;
+	int ret;
+
+	if (!(pd = dd->ipath_pd[port])) {
+		void *ptmp;
+
+		pd = kzalloc(sizeof(struct ipath_portdata), GFP_KERNEL);
+
+		/*
+		 * Allocate memory for use in ipath_tid_update() just once
+		 * at open, not per call.  Reduces cost of expected send
+		 * setup.
+		 */
+		ptmp = kmalloc(dd->ipath_rcvtidcnt * sizeof(u16) +
+			       dd->ipath_rcvtidcnt * sizeof(struct page **),
+			       GFP_KERNEL);
+		if (!pd || !ptmp) {
+			ipath_dev_err(dd, "Unable to allocate portdata "
+				      "memory, failing open\n");
+			ret = -ENOMEM;
+			kfree(pd);
+			kfree(ptmp);
+			goto bail;
+		}
+		dd->ipath_pd[port] = pd;
+		dd->ipath_pd[port]->port_port = port;
+		dd->ipath_pd[port]->port_dd = dd;
+		dd->ipath_pd[port]->port_tid_pg_list = ptmp;
+		init_waitqueue_head(&dd->ipath_pd[port]->port_wait);
+	}
+	if (!pd->port_cnt) {
+		pd->userversion = uinfo->spu_userversion;
+		init_user_egr_sizes(pd);
+		if ((ret = init_subports(dd, pd, uinfo)) != 0)
+			goto bail;
+		ipath_cdbg(PROC, "%s[%u] opened unit:port %u:%u\n",
+			   current->comm, current->pid, dd->ipath_unit,
+			   port);
+		pd->port_cnt = 1;
+		port_fp(fp) = pd;
+		pd->port_pid = get_pid(task_pid(current));
+		strlcpy(pd->port_comm, current->comm, sizeof(pd->port_comm));
+		ipath_stats.sps_ports++;
+		ret = 0;
+	} else
+		ret = -EBUSY;
+
+bail:
+	return ret;
+}
+
+static inline int usable(struct ipath_devdata *dd)
+{
+	return dd &&
+		(dd->ipath_flags & IPATH_PRESENT) &&
+		dd->ipath_kregbase &&
+		dd->ipath_lid &&
+		!(dd->ipath_flags & (IPATH_LINKDOWN | IPATH_DISABLED
+				     | IPATH_LINKUNK));
+}
+
+static int find_free_port(int unit, struct file *fp,
+			  const struct ipath_user_info *uinfo)
+{
+	struct ipath_devdata *dd = ipath_lookup(unit);
+	int ret, i;
+
+	if (!dd) {
+		ret = -ENODEV;
+		goto bail;
+	}
+
+	if (!usable(dd)) {
+		ret = -ENETDOWN;
+		goto bail;
+	}
+
+	for (i = 1; i < dd->ipath_cfgports; i++) {
+		ret = try_alloc_port(dd, i, fp, uinfo);
+		if (ret != -EBUSY)
+			goto bail;
+	}
+	ret = -EBUSY;
+
+bail:
+	return ret;
+}
+
+static int find_best_unit(struct file *fp,
+			  const struct ipath_user_info *uinfo)
+{
+	int ret = 0, i, prefunit = -1, devmax;
+	int maxofallports, npresent, nup;
+	int ndev;
+
+	devmax = ipath_count_units(&npresent, &nup, &maxofallports);
+
+	/*
+	 * This code is present to allow a knowledgeable person to
+	 * specify the layout of processes to processors before opening
+	 * this driver, and then we'll assign the process to the "closest"
+	 * InfiniPath chip to that processor (we assume reasonable connectivity,
+	 * for now).  This code assumes that if affinity has been set
+	 * before this point, that at most one cpu is set; for now this
+	 * is reasonable.  I check for both cpumask_empty() and cpumask_full(),
+	 * in case some kernel variant sets none of the bits when no
+	 * affinity is set.  2.6.11 and 12 kernels have all present
+	 * cpus set.  Some day we'll have to fix it up further to handle
+	 * a cpu subset.  This algorithm fails for two HT chips connected
+	 * in tunnel fashion.  Eventually this needs real topology
+	 * information.  There may be some issues with dual core numbering
+	 * as well.  This needs more work prior to release.
+	 */
+	if (!cpumask_empty(tsk_cpus_allowed(current)) &&
+	    !cpumask_full(tsk_cpus_allowed(current))) {
+		int ncpus = num_online_cpus(), curcpu = -1, nset = 0;
+		get_online_cpus();
+		for_each_online_cpu(i)
+			if (cpumask_test_cpu(i, tsk_cpus_allowed(current))) {
+				ipath_cdbg(PROC, "%s[%u] affinity set for "
+					   "cpu %d/%d\n", current->comm,
+					   current->pid, i, ncpus);
+				curcpu = i;
+				nset++;
+			}
+		put_online_cpus();
+		if (curcpu != -1 && nset != ncpus) {
+			if (npresent) {
+				prefunit = curcpu / (ncpus / npresent);
+				ipath_cdbg(PROC,"%s[%u] %d chips, %d cpus, "
+					  "%d cpus/chip, select unit %d\n",
+					  current->comm, current->pid,
+					  npresent, ncpus, ncpus / npresent,
+					  prefunit);
+			}
+		}
+	}
+
+	/*
+	 * user ports start at 1, kernel port is 0
+	 * For now, we do round-robin access across all chips
+	 */
+
+	if (prefunit != -1)
+		devmax = prefunit + 1;
+recheck:
+	for (i = 1; i < maxofallports; i++) {
+		for (ndev = prefunit != -1 ? prefunit : 0; ndev < devmax;
+		     ndev++) {
+			struct ipath_devdata *dd = ipath_lookup(ndev);
+
+			if (!usable(dd))
+				continue; /* can't use this unit */
+			if (i >= dd->ipath_cfgports)
+				/*
+				 * Maxed out on users of this unit. Try
+				 * next.
+				 */
+				continue;
+			ret = try_alloc_port(dd, i, fp, uinfo);
+			if (!ret)
+				goto done;
+		}
+	}
+
+	if (npresent) {
+		if (nup == 0) {
+			ret = -ENETDOWN;
+			ipath_dbg("No ports available (none initialized "
+				  "and ready)\n");
+		} else {
+			if (prefunit > 0) {
+				/* if started above 0, retry from 0 */
+				ipath_cdbg(PROC,
+					   "%s[%u] no ports on prefunit "
+					   "%d, clear and re-check\n",
+					   current->comm, current->pid,
+					   prefunit);
+				devmax = ipath_count_units(NULL, NULL,
+							   NULL);
+				prefunit = -1;
+				goto recheck;
+			}
+			ret = -EBUSY;
+			ipath_dbg("No ports available\n");
+		}
+	} else {
+		ret = -ENXIO;
+		ipath_dbg("No boards found\n");
+	}
+
+done:
+	return ret;
+}
+
+static int find_shared_port(struct file *fp,
+			    const struct ipath_user_info *uinfo)
+{
+	int devmax, ndev, i;
+	int ret = 0;
+
+	devmax = ipath_count_units(NULL, NULL, NULL);
+
+	for (ndev = 0; ndev < devmax; ndev++) {
+		struct ipath_devdata *dd = ipath_lookup(ndev);
+
+		if (!usable(dd))
+			continue;
+		for (i = 1; i < dd->ipath_cfgports; i++) {
+			struct ipath_portdata *pd = dd->ipath_pd[i];
+
+			/* Skip ports which are not yet open */
+			if (!pd || !pd->port_cnt)
+				continue;
+			/* Skip port if it doesn't match the requested one */
+			if (pd->port_subport_id != uinfo->spu_subport_id)
+				continue;
+			/* Verify the sharing process matches the master */
+			if (pd->port_subport_cnt != uinfo->spu_subport_cnt ||
+			    pd->userversion != uinfo->spu_userversion ||
+			    pd->port_cnt >= pd->port_subport_cnt) {
+				ret = -EINVAL;
+				goto done;
+			}
+			port_fp(fp) = pd;
+			subport_fp(fp) = pd->port_cnt++;
+			pd->port_subpid[subport_fp(fp)] =
+				get_pid(task_pid(current));
+			tidcursor_fp(fp) = 0;
+			pd->active_slaves |= 1 << subport_fp(fp);
+			ipath_cdbg(PROC,
+				   "%s[%u] %u sharing %s[%u] unit:port %u:%u\n",
+				   current->comm, current->pid,
+				   subport_fp(fp),
+				   pd->port_comm, pid_nr(pd->port_pid),
+				   dd->ipath_unit, pd->port_port);
+			ret = 1;
+			goto done;
+		}
+	}
+
+done:
+	return ret;
+}
+
+static int ipath_open(struct inode *in, struct file *fp)
+{
+	/* The real work is performed later in ipath_assign_port() */
+	fp->private_data = kzalloc(sizeof(struct ipath_filedata), GFP_KERNEL);
+	return fp->private_data ? 0 : -ENOMEM;
+}
+
+/* Get port early, so can set affinity prior to memory allocation */
+static int ipath_assign_port(struct file *fp,
+			      const struct ipath_user_info *uinfo)
+{
+	int ret;
+	int i_minor;
+	unsigned swmajor, swminor;
+
+	/* Check to be sure we haven't already initialized this file */
+	if (port_fp(fp)) {
+		ret = -EINVAL;
+		goto done;
+	}
+
+	/* for now, if major version is different, bail */
+	swmajor = uinfo->spu_userversion >> 16;
+	if (swmajor != IPATH_USER_SWMAJOR) {
+		ipath_dbg("User major version %d not same as driver "
+			  "major %d\n", uinfo->spu_userversion >> 16,
+			  IPATH_USER_SWMAJOR);
+		ret = -ENODEV;
+		goto done;
+	}
+
+	swminor = uinfo->spu_userversion & 0xffff;
+	if (swminor != IPATH_USER_SWMINOR)
+		ipath_dbg("User minor version %d not same as driver "
+			  "minor %d\n", swminor, IPATH_USER_SWMINOR);
+
+	mutex_lock(&ipath_mutex);
+
+	if (ipath_compatible_subports(swmajor, swminor) &&
+	    uinfo->spu_subport_cnt &&
+	    (ret = find_shared_port(fp, uinfo))) {
+		if (ret > 0)
+			ret = 0;
+		goto done_chk_sdma;
+	}
+
+	i_minor = iminor(file_inode(fp)) - IPATH_USER_MINOR_BASE;
+	ipath_cdbg(VERBOSE, "open on dev %lx (minor %d)\n",
+		   (long)file_inode(fp)->i_rdev, i_minor);
+
+	if (i_minor)
+		ret = find_free_port(i_minor - 1, fp, uinfo);
+	else
+		ret = find_best_unit(fp, uinfo);
+
+done_chk_sdma:
+	if (!ret) {
+		struct ipath_filedata *fd = fp->private_data;
+		const struct ipath_portdata *pd = fd->pd;
+		const struct ipath_devdata *dd = pd->port_dd;
+
+		fd->pq = ipath_user_sdma_queue_create(&dd->pcidev->dev,
+						      dd->ipath_unit,
+						      pd->port_port,
+						      fd->subport);
+
+		if (!fd->pq)
+			ret = -ENOMEM;
+	}
+
+	mutex_unlock(&ipath_mutex);
+
+done:
+	return ret;
+}
+
+
+static int ipath_do_user_init(struct file *fp,
+			      const struct ipath_user_info *uinfo)
+{
+	int ret;
+	struct ipath_portdata *pd = port_fp(fp);
+	struct ipath_devdata *dd;
+	u32 head32;
+
+	/* Subports don't need to initialize anything since master did it. */
+	if (subport_fp(fp)) {
+		ret = wait_event_interruptible(pd->port_wait,
+			!test_bit(IPATH_PORT_MASTER_UNINIT, &pd->port_flag));
+		goto done;
+	}
+
+	dd = pd->port_dd;
+
+	if (uinfo->spu_rcvhdrsize) {
+		ret = ipath_setrcvhdrsize(dd, uinfo->spu_rcvhdrsize);
+		if (ret)
+			goto done;
+	}
+
+	/* for now we do nothing with rcvhdrcnt: uinfo->spu_rcvhdrcnt */
+
+	/* some ports may get extra buffers, calculate that here */
+	if (pd->port_port <= dd->ipath_ports_extrabuf)
+		pd->port_piocnt = dd->ipath_pbufsport + 1;
+	else
+		pd->port_piocnt = dd->ipath_pbufsport;
+
+	/* for right now, kernel piobufs are at end, so port 1 is at 0 */
+	if (pd->port_port <= dd->ipath_ports_extrabuf)
+		pd->port_pio_base = (dd->ipath_pbufsport + 1)
+			* (pd->port_port - 1);
+	else
+		pd->port_pio_base = dd->ipath_ports_extrabuf +
+			dd->ipath_pbufsport * (pd->port_port - 1);
+	pd->port_piobufs = dd->ipath_piobufbase +
+		pd->port_pio_base * dd->ipath_palign;
+	ipath_cdbg(VERBOSE, "piobuf base for port %u is 0x%x, piocnt %u,"
+		" first pio %u\n", pd->port_port, pd->port_piobufs,
+		pd->port_piocnt, pd->port_pio_base);
+	ipath_chg_pioavailkernel(dd, pd->port_pio_base, pd->port_piocnt, 0);
+
+	/*
+	 * Now allocate the rcvhdr Q and eager TIDs; skip the TID
+	 * array for time being.  If pd->port_port > chip-supported,
+	 * we need to do extra stuff here to handle by handling overflow
+	 * through port 0, someday
+	 */
+	ret = ipath_create_rcvhdrq(dd, pd);
+	if (!ret)
+		ret = ipath_create_user_egr(pd);
+	if (ret)
+		goto done;
+
+	/*
+	 * set the eager head register for this port to the current values
+	 * of the tail pointers, since we don't know if they were
+	 * updated on last use of the port.
+	 */
+	head32 = ipath_read_ureg32(dd, ur_rcvegrindextail, pd->port_port);
+	ipath_write_ureg(dd, ur_rcvegrindexhead, head32, pd->port_port);
+	pd->port_lastrcvhdrqtail = -1;
+	ipath_cdbg(VERBOSE, "Wrote port%d egrhead %x from tail regs\n",
+		pd->port_port, head32);
+	pd->port_tidcursor = 0;	/* start at beginning after open */
+
+	/* initialize poll variables... */
+	pd->port_urgent = 0;
+	pd->port_urgent_poll = 0;
+	pd->port_hdrqfull_poll = pd->port_hdrqfull;
+
+	/*
+	 * Now enable the port for receive.
+	 * For chips that are set to DMA the tail register to memory
+	 * when they change (and when the update bit transitions from
+	 * 0 to 1.  So for those chips, we turn it off and then back on.
+	 * This will (very briefly) affect any other open ports, but the
+	 * duration is very short, and therefore isn't an issue.  We
+	 * explicitly set the in-memory tail copy to 0 beforehand, so we
+	 * don't have to wait to be sure the DMA update has happened
+	 * (chip resets head/tail to 0 on transition to enable).
+	 */
+	set_bit(dd->ipath_r_portenable_shift + pd->port_port,
+		&dd->ipath_rcvctrl);
+	if (!(dd->ipath_flags & IPATH_NODMA_RTAIL)) {
+		if (pd->port_rcvhdrtail_kvaddr)
+			ipath_clear_rcvhdrtail(pd);
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
+			dd->ipath_rcvctrl &
+			~(1ULL << dd->ipath_r_tailupd_shift));
+	}
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
+			 dd->ipath_rcvctrl);
+	/* Notify any waiting slaves */
+	if (pd->port_subport_cnt) {
+		clear_bit(IPATH_PORT_MASTER_UNINIT, &pd->port_flag);
+		wake_up(&pd->port_wait);
+	}
+done:
+	return ret;
+}
+
+/**
+ * unlock_exptid - unlock any expected TID entries port still had in use
+ * @pd: port
+ *
+ * We don't actually update the chip here, because we do a bulk update
+ * below, using ipath_f_clear_tids.
+ */
+static void unlock_expected_tids(struct ipath_portdata *pd)
+{
+	struct ipath_devdata *dd = pd->port_dd;
+	int port_tidbase = pd->port_port * dd->ipath_rcvtidcnt;
+	int i, cnt = 0, maxtid = port_tidbase + dd->ipath_rcvtidcnt;
+
+	ipath_cdbg(VERBOSE, "Port %u unlocking any locked expTID pages\n",
+		   pd->port_port);
+	for (i = port_tidbase; i < maxtid; i++) {
+		struct page *ps = dd->ipath_pageshadow[i];
+
+		if (!ps)
+			continue;
+
+		dd->ipath_pageshadow[i] = NULL;
+		pci_unmap_page(dd->pcidev, dd->ipath_physshadow[i],
+			PAGE_SIZE, PCI_DMA_FROMDEVICE);
+		ipath_release_user_pages_on_close(&ps, 1);
+		cnt++;
+		ipath_stats.sps_pageunlocks++;
+	}
+	if (cnt)
+		ipath_cdbg(VERBOSE, "Port %u locked %u expTID entries\n",
+			   pd->port_port, cnt);
+
+	if (ipath_stats.sps_pagelocks || ipath_stats.sps_pageunlocks)
+		ipath_cdbg(VERBOSE, "%llu pages locked, %llu unlocked\n",
+			   (unsigned long long) ipath_stats.sps_pagelocks,
+			   (unsigned long long)
+			   ipath_stats.sps_pageunlocks);
+}
+
+static int ipath_close(struct inode *in, struct file *fp)
+{
+	int ret = 0;
+	struct ipath_filedata *fd;
+	struct ipath_portdata *pd;
+	struct ipath_devdata *dd;
+	unsigned long flags;
+	unsigned port;
+	struct pid *pid;
+
+	ipath_cdbg(VERBOSE, "close on dev %lx, private data %p\n",
+		   (long)in->i_rdev, fp->private_data);
+
+	mutex_lock(&ipath_mutex);
+
+	fd = fp->private_data;
+	fp->private_data = NULL;
+	pd = fd->pd;
+	if (!pd) {
+		mutex_unlock(&ipath_mutex);
+		goto bail;
+	}
+
+	dd = pd->port_dd;
+
+	/* drain user sdma queue */
+	ipath_user_sdma_queue_drain(dd, fd->pq);
+	ipath_user_sdma_queue_destroy(fd->pq);
+
+	if (--pd->port_cnt) {
+		/*
+		 * XXX If the master closes the port before the slave(s),
+		 * revoke the mmap for the eager receive queue so
+		 * the slave(s) don't wait for receive data forever.
+		 */
+		pd->active_slaves &= ~(1 << fd->subport);
+		put_pid(pd->port_subpid[fd->subport]);
+		pd->port_subpid[fd->subport] = NULL;
+		mutex_unlock(&ipath_mutex);
+		goto bail;
+	}
+	/* early; no interrupt users after this */
+	spin_lock_irqsave(&dd->ipath_uctxt_lock, flags);
+	port = pd->port_port;
+	dd->ipath_pd[port] = NULL;
+	pid = pd->port_pid;
+	pd->port_pid = NULL;
+	spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags);
+
+	if (pd->port_rcvwait_to || pd->port_piowait_to
+	    || pd->port_rcvnowait || pd->port_pionowait) {
+		ipath_cdbg(VERBOSE, "port%u, %u rcv, %u pio wait timeo; "
+			   "%u rcv %u, pio already\n",
+			   pd->port_port, pd->port_rcvwait_to,
+			   pd->port_piowait_to, pd->port_rcvnowait,
+			   pd->port_pionowait);
+		pd->port_rcvwait_to = pd->port_piowait_to =
+			pd->port_rcvnowait = pd->port_pionowait = 0;
+	}
+	if (pd->port_flag) {
+		ipath_cdbg(PROC, "port %u port_flag set: 0x%lx\n",
+			  pd->port_port, pd->port_flag);
+		pd->port_flag = 0;
+	}
+
+	if (dd->ipath_kregbase) {
+		/* atomically clear receive enable port and intr avail. */
+		clear_bit(dd->ipath_r_portenable_shift + port,
+			  &dd->ipath_rcvctrl);
+		clear_bit(pd->port_port + dd->ipath_r_intravail_shift,
+			  &dd->ipath_rcvctrl);
+		ipath_write_kreg( dd, dd->ipath_kregs->kr_rcvctrl,
+			dd->ipath_rcvctrl);
+		/* and read back from chip to be sure that nothing
+		 * else is in flight when we do the rest */
+		(void)ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+
+		/* clean up the pkeys for this port user */
+		ipath_clean_part_key(pd, dd);
+		/*
+		 * be paranoid, and never write 0's to these, just use an
+		 * unused part of the port 0 tail page.  Of course,
+		 * rcvhdraddr points to a large chunk of memory, so this
+		 * could still trash things, but at least it won't trash
+		 * page 0, and by disabling the port, it should stop "soon",
+		 * even if a packet or two is in already in flight after we
+		 * disabled the port.
+		 */
+		ipath_write_kreg_port(dd,
+		        dd->ipath_kregs->kr_rcvhdrtailaddr, port,
+			dd->ipath_dummy_hdrq_phys);
+		ipath_write_kreg_port(dd, dd->ipath_kregs->kr_rcvhdraddr,
+			pd->port_port, dd->ipath_dummy_hdrq_phys);
+
+		ipath_disarm_piobufs(dd, pd->port_pio_base, pd->port_piocnt);
+		ipath_chg_pioavailkernel(dd, pd->port_pio_base,
+			pd->port_piocnt, 1);
+
+		dd->ipath_f_clear_tids(dd, pd->port_port);
+
+		if (dd->ipath_pageshadow)
+			unlock_expected_tids(pd);
+		ipath_stats.sps_ports--;
+		ipath_cdbg(PROC, "%s[%u] closed port %u:%u\n",
+			   pd->port_comm, pid_nr(pid),
+			   dd->ipath_unit, port);
+	}
+
+	put_pid(pid);
+	mutex_unlock(&ipath_mutex);
+	ipath_free_pddata(dd, pd); /* after releasing the mutex */
+
+bail:
+	kfree(fd);
+	return ret;
+}
+
+static int ipath_port_info(struct ipath_portdata *pd, u16 subport,
+			   struct ipath_port_info __user *uinfo)
+{
+	struct ipath_port_info info;
+	int nup;
+	int ret;
+	size_t sz;
+
+	(void) ipath_count_units(NULL, &nup, NULL);
+	info.num_active = nup;
+	info.unit = pd->port_dd->ipath_unit;
+	info.port = pd->port_port;
+	info.subport = subport;
+	/* Don't return new fields if old library opened the port. */
+	if (ipath_supports_subports(pd->userversion >> 16,
+				    pd->userversion & 0xffff)) {
+		/* Number of user ports available for this device. */
+		info.num_ports = pd->port_dd->ipath_cfgports - 1;
+		info.num_subports = pd->port_subport_cnt;
+		sz = sizeof(info);
+	} else
+		sz = sizeof(info) - 2 * sizeof(u16);
+
+	if (copy_to_user(uinfo, &info, sz)) {
+		ret = -EFAULT;
+		goto bail;
+	}
+	ret = 0;
+
+bail:
+	return ret;
+}
+
+static int ipath_get_slave_info(struct ipath_portdata *pd,
+				void __user *slave_mask_addr)
+{
+	int ret = 0;
+
+	if (copy_to_user(slave_mask_addr, &pd->active_slaves, sizeof(u32)))
+		ret = -EFAULT;
+	return ret;
+}
+
+static int ipath_sdma_get_inflight(struct ipath_user_sdma_queue *pq,
+				   u32 __user *inflightp)
+{
+	const u32 val = ipath_user_sdma_inflight_counter(pq);
+
+	if (put_user(val, inflightp))
+		return -EFAULT;
+
+	return 0;
+}
+
+static int ipath_sdma_get_complete(struct ipath_devdata *dd,
+				   struct ipath_user_sdma_queue *pq,
+				   u32 __user *completep)
+{
+	u32 val;
+	int err;
+
+	err = ipath_user_sdma_make_progress(dd, pq);
+	if (err < 0)
+		return err;
+
+	val = ipath_user_sdma_complete_counter(pq);
+	if (put_user(val, completep))
+		return -EFAULT;
+
+	return 0;
+}
+
+static ssize_t ipath_write(struct file *fp, const char __user *data,
+			   size_t count, loff_t *off)
+{
+	const struct ipath_cmd __user *ucmd;
+	struct ipath_portdata *pd;
+	const void __user *src;
+	size_t consumed, copy;
+	struct ipath_cmd cmd;
+	ssize_t ret = 0;
+	void *dest;
+
+	if (count < sizeof(cmd.type)) {
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	ucmd = (const struct ipath_cmd __user *) data;
+
+	if (copy_from_user(&cmd.type, &ucmd->type, sizeof(cmd.type))) {
+		ret = -EFAULT;
+		goto bail;
+	}
+
+	consumed = sizeof(cmd.type);
+
+	switch (cmd.type) {
+	case IPATH_CMD_ASSIGN_PORT:
+	case __IPATH_CMD_USER_INIT:
+	case IPATH_CMD_USER_INIT:
+		copy = sizeof(cmd.cmd.user_info);
+		dest = &cmd.cmd.user_info;
+		src = &ucmd->cmd.user_info;
+		break;
+	case IPATH_CMD_RECV_CTRL:
+		copy = sizeof(cmd.cmd.recv_ctrl);
+		dest = &cmd.cmd.recv_ctrl;
+		src = &ucmd->cmd.recv_ctrl;
+		break;
+	case IPATH_CMD_PORT_INFO:
+		copy = sizeof(cmd.cmd.port_info);
+		dest = &cmd.cmd.port_info;
+		src = &ucmd->cmd.port_info;
+		break;
+	case IPATH_CMD_TID_UPDATE:
+	case IPATH_CMD_TID_FREE:
+		copy = sizeof(cmd.cmd.tid_info);
+		dest = &cmd.cmd.tid_info;
+		src = &ucmd->cmd.tid_info;
+		break;
+	case IPATH_CMD_SET_PART_KEY:
+		copy = sizeof(cmd.cmd.part_key);
+		dest = &cmd.cmd.part_key;
+		src = &ucmd->cmd.part_key;
+		break;
+	case __IPATH_CMD_SLAVE_INFO:
+		copy = sizeof(cmd.cmd.slave_mask_addr);
+		dest = &cmd.cmd.slave_mask_addr;
+		src = &ucmd->cmd.slave_mask_addr;
+		break;
+	case IPATH_CMD_PIOAVAILUPD:	// force an update of PIOAvail reg
+		copy = 0;
+		src = NULL;
+		dest = NULL;
+		break;
+	case IPATH_CMD_POLL_TYPE:
+		copy = sizeof(cmd.cmd.poll_type);
+		dest = &cmd.cmd.poll_type;
+		src = &ucmd->cmd.poll_type;
+		break;
+	case IPATH_CMD_ARMLAUNCH_CTRL:
+		copy = sizeof(cmd.cmd.armlaunch_ctrl);
+		dest = &cmd.cmd.armlaunch_ctrl;
+		src = &ucmd->cmd.armlaunch_ctrl;
+		break;
+	case IPATH_CMD_SDMA_INFLIGHT:
+		copy = sizeof(cmd.cmd.sdma_inflight);
+		dest = &cmd.cmd.sdma_inflight;
+		src = &ucmd->cmd.sdma_inflight;
+		break;
+	case IPATH_CMD_SDMA_COMPLETE:
+		copy = sizeof(cmd.cmd.sdma_complete);
+		dest = &cmd.cmd.sdma_complete;
+		src = &ucmd->cmd.sdma_complete;
+		break;
+	default:
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	if (copy) {
+		if ((count - consumed) < copy) {
+			ret = -EINVAL;
+			goto bail;
+		}
+
+		if (copy_from_user(dest, src, copy)) {
+			ret = -EFAULT;
+			goto bail;
+		}
+
+		consumed += copy;
+	}
+
+	pd = port_fp(fp);
+	if (!pd && cmd.type != __IPATH_CMD_USER_INIT &&
+		cmd.type != IPATH_CMD_ASSIGN_PORT) {
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	switch (cmd.type) {
+	case IPATH_CMD_ASSIGN_PORT:
+		ret = ipath_assign_port(fp, &cmd.cmd.user_info);
+		if (ret)
+			goto bail;
+		break;
+	case __IPATH_CMD_USER_INIT:
+		/* backwards compatibility, get port first */
+		ret = ipath_assign_port(fp, &cmd.cmd.user_info);
+		if (ret)
+			goto bail;
+		/* and fall through to current version. */
+	case IPATH_CMD_USER_INIT:
+		ret = ipath_do_user_init(fp, &cmd.cmd.user_info);
+		if (ret)
+			goto bail;
+		ret = ipath_get_base_info(
+			fp, (void __user *) (unsigned long)
+			cmd.cmd.user_info.spu_base_info,
+			cmd.cmd.user_info.spu_base_info_size);
+		break;
+	case IPATH_CMD_RECV_CTRL:
+		ret = ipath_manage_rcvq(pd, subport_fp(fp), cmd.cmd.recv_ctrl);
+		break;
+	case IPATH_CMD_PORT_INFO:
+		ret = ipath_port_info(pd, subport_fp(fp),
+				      (struct ipath_port_info __user *)
+				      (unsigned long) cmd.cmd.port_info);
+		break;
+	case IPATH_CMD_TID_UPDATE:
+		ret = ipath_tid_update(pd, fp, &cmd.cmd.tid_info);
+		break;
+	case IPATH_CMD_TID_FREE:
+		ret = ipath_tid_free(pd, subport_fp(fp), &cmd.cmd.tid_info);
+		break;
+	case IPATH_CMD_SET_PART_KEY:
+		ret = ipath_set_part_key(pd, cmd.cmd.part_key);
+		break;
+	case __IPATH_CMD_SLAVE_INFO:
+		ret = ipath_get_slave_info(pd,
+					   (void __user *) (unsigned long)
+					   cmd.cmd.slave_mask_addr);
+		break;
+	case IPATH_CMD_PIOAVAILUPD:
+		ipath_force_pio_avail_update(pd->port_dd);
+		break;
+	case IPATH_CMD_POLL_TYPE:
+		pd->poll_type = cmd.cmd.poll_type;
+		break;
+	case IPATH_CMD_ARMLAUNCH_CTRL:
+		if (cmd.cmd.armlaunch_ctrl)
+			ipath_enable_armlaunch(pd->port_dd);
+		else
+			ipath_disable_armlaunch(pd->port_dd);
+		break;
+	case IPATH_CMD_SDMA_INFLIGHT:
+		ret = ipath_sdma_get_inflight(user_sdma_queue_fp(fp),
+					      (u32 __user *) (unsigned long)
+					      cmd.cmd.sdma_inflight);
+		break;
+	case IPATH_CMD_SDMA_COMPLETE:
+		ret = ipath_sdma_get_complete(pd->port_dd,
+					      user_sdma_queue_fp(fp),
+					      (u32 __user *) (unsigned long)
+					      cmd.cmd.sdma_complete);
+		break;
+	}
+
+	if (ret >= 0)
+		ret = consumed;
+
+bail:
+	return ret;
+}
+
+static ssize_t ipath_writev(struct kiocb *iocb, const struct iovec *iov,
+			    unsigned long dim, loff_t off)
+{
+	struct file *filp = iocb->ki_filp;
+	struct ipath_filedata *fp = filp->private_data;
+	struct ipath_portdata *pd = port_fp(filp);
+	struct ipath_user_sdma_queue *pq = fp->pq;
+
+	if (!dim)
+		return -EINVAL;
+
+	return ipath_user_sdma_writev(pd->port_dd, pq, iov, dim);
+}
+
+static struct class *ipath_class;
+
+static int init_cdev(int minor, char *name, const struct file_operations *fops,
+		     struct cdev **cdevp, struct device **devp)
+{
+	const dev_t dev = MKDEV(IPATH_MAJOR, minor);
+	struct cdev *cdev = NULL;
+	struct device *device = NULL;
+	int ret;
+
+	cdev = cdev_alloc();
+	if (!cdev) {
+		printk(KERN_ERR IPATH_DRV_NAME
+		       ": Could not allocate cdev for minor %d, %s\n",
+		       minor, name);
+		ret = -ENOMEM;
+		goto done;
+	}
+
+	cdev->owner = THIS_MODULE;
+	cdev->ops = fops;
+	kobject_set_name(&cdev->kobj, name);
+
+	ret = cdev_add(cdev, dev, 1);
+	if (ret < 0) {
+		printk(KERN_ERR IPATH_DRV_NAME
+		       ": Could not add cdev for minor %d, %s (err %d)\n",
+		       minor, name, -ret);
+		goto err_cdev;
+	}
+
+	device = device_create(ipath_class, NULL, dev, NULL, name);
+
+	if (IS_ERR(device)) {
+		ret = PTR_ERR(device);
+		printk(KERN_ERR IPATH_DRV_NAME ": Could not create "
+		       "device for minor %d, %s (err %d)\n",
+		       minor, name, -ret);
+		goto err_cdev;
+	}
+
+	goto done;
+
+err_cdev:
+	cdev_del(cdev);
+	cdev = NULL;
+
+done:
+	if (ret >= 0) {
+		*cdevp = cdev;
+		*devp = device;
+	} else {
+		*cdevp = NULL;
+		*devp = NULL;
+	}
+
+	return ret;
+}
+
+int ipath_cdev_init(int minor, char *name, const struct file_operations *fops,
+		    struct cdev **cdevp, struct device **devp)
+{
+	return init_cdev(minor, name, fops, cdevp, devp);
+}
+
+static void cleanup_cdev(struct cdev **cdevp,
+			 struct device **devp)
+{
+	struct device *dev = *devp;
+
+	if (dev) {
+		device_unregister(dev);
+		*devp = NULL;
+	}
+
+	if (*cdevp) {
+		cdev_del(*cdevp);
+		*cdevp = NULL;
+	}
+}
+
+void ipath_cdev_cleanup(struct cdev **cdevp,
+			struct device **devp)
+{
+	cleanup_cdev(cdevp, devp);
+}
+
+static struct cdev *wildcard_cdev;
+static struct device *wildcard_dev;
+
+static const dev_t dev = MKDEV(IPATH_MAJOR, 0);
+
+static int user_init(void)
+{
+	int ret;
+
+	ret = register_chrdev_region(dev, IPATH_NMINORS, IPATH_DRV_NAME);
+	if (ret < 0) {
+		printk(KERN_ERR IPATH_DRV_NAME ": Could not register "
+		       "chrdev region (err %d)\n", -ret);
+		goto done;
+	}
+
+	ipath_class = class_create(THIS_MODULE, IPATH_DRV_NAME);
+
+	if (IS_ERR(ipath_class)) {
+		ret = PTR_ERR(ipath_class);
+		printk(KERN_ERR IPATH_DRV_NAME ": Could not create "
+		       "device class (err %d)\n", -ret);
+		goto bail;
+	}
+
+	goto done;
+bail:
+	unregister_chrdev_region(dev, IPATH_NMINORS);
+done:
+	return ret;
+}
+
+static void user_cleanup(void)
+{
+	if (ipath_class) {
+		class_destroy(ipath_class);
+		ipath_class = NULL;
+	}
+
+	unregister_chrdev_region(dev, IPATH_NMINORS);
+}
+
+static atomic_t user_count = ATOMIC_INIT(0);
+static atomic_t user_setup = ATOMIC_INIT(0);
+
+int ipath_user_add(struct ipath_devdata *dd)
+{
+	char name[10];
+	int ret;
+
+	if (atomic_inc_return(&user_count) == 1) {
+		ret = user_init();
+		if (ret < 0) {
+			ipath_dev_err(dd, "Unable to set up user support: "
+				      "error %d\n", -ret);
+			goto bail;
+		}
+		ret = init_cdev(0, "ipath", &ipath_file_ops, &wildcard_cdev,
+				&wildcard_dev);
+		if (ret < 0) {
+			ipath_dev_err(dd, "Could not create wildcard "
+				      "minor: error %d\n", -ret);
+			goto bail_user;
+		}
+
+		atomic_set(&user_setup, 1);
+	}
+
+	snprintf(name, sizeof(name), "ipath%d", dd->ipath_unit);
+
+	ret = init_cdev(dd->ipath_unit + 1, name, &ipath_file_ops,
+			&dd->user_cdev, &dd->user_dev);
+	if (ret < 0)
+		ipath_dev_err(dd, "Could not create user minor %d, %s\n",
+			      dd->ipath_unit + 1, name);
+
+	goto bail;
+
+bail_user:
+	user_cleanup();
+bail:
+	return ret;
+}
+
+void ipath_user_remove(struct ipath_devdata *dd)
+{
+	cleanup_cdev(&dd->user_cdev, &dd->user_dev);
+
+	if (atomic_dec_return(&user_count) == 0) {
+		if (atomic_read(&user_setup) == 0)
+			goto bail;
+
+		cleanup_cdev(&wildcard_cdev, &wildcard_dev);
+		user_cleanup();
+
+		atomic_set(&user_setup, 0);
+	}
+bail:
+	return;
+}
diff --git a/drivers/infiniband/hw/ipath/ipath_fs.c b/drivers/infiniband/hw/ipath/ipath_fs.c
new file mode 100644
index 0000000..4977082
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_fs.c
@@ -0,0 +1,422 @@
+/*
+ * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/pagemap.h>
+#include <linux/init.h>
+#include <linux/namei.h>
+#include <linux/slab.h>
+
+#include "ipath_kernel.h"
+
+#define IPATHFS_MAGIC 0x726a77
+
+static struct super_block *ipath_super;
+
+static int ipathfs_mknod(struct inode *dir, struct dentry *dentry,
+			 umode_t mode, const struct file_operations *fops,
+			 void *data)
+{
+	int error;
+	struct inode *inode = new_inode(dir->i_sb);
+
+	if (!inode) {
+		error = -EPERM;
+		goto bail;
+	}
+
+	inode->i_ino = get_next_ino();
+	inode->i_mode = mode;
+	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	inode->i_private = data;
+	if (S_ISDIR(mode)) {
+		inode->i_op = &simple_dir_inode_operations;
+		inc_nlink(inode);
+		inc_nlink(dir);
+	}
+
+	inode->i_fop = fops;
+
+	d_instantiate(dentry, inode);
+	error = 0;
+
+bail:
+	return error;
+}
+
+static int create_file(const char *name, umode_t mode,
+		       struct dentry *parent, struct dentry **dentry,
+		       const struct file_operations *fops, void *data)
+{
+	int error;
+
+	mutex_lock(&parent->d_inode->i_mutex);
+	*dentry = lookup_one_len(name, parent, strlen(name));
+	if (!IS_ERR(*dentry))
+		error = ipathfs_mknod(parent->d_inode, *dentry,
+				      mode, fops, data);
+	else
+		error = PTR_ERR(*dentry);
+	mutex_unlock(&parent->d_inode->i_mutex);
+
+	return error;
+}
+
+static ssize_t atomic_stats_read(struct file *file, char __user *buf,
+				 size_t count, loff_t *ppos)
+{
+	return simple_read_from_buffer(buf, count, ppos, &ipath_stats,
+				       sizeof ipath_stats);
+}
+
+static const struct file_operations atomic_stats_ops = {
+	.read = atomic_stats_read,
+	.llseek = default_llseek,
+};
+
+static ssize_t atomic_counters_read(struct file *file, char __user *buf,
+				    size_t count, loff_t *ppos)
+{
+	struct infinipath_counters counters;
+	struct ipath_devdata *dd;
+
+	dd = file_inode(file)->i_private;
+	dd->ipath_f_read_counters(dd, &counters);
+
+	return simple_read_from_buffer(buf, count, ppos, &counters,
+				       sizeof counters);
+}
+
+static const struct file_operations atomic_counters_ops = {
+	.read = atomic_counters_read,
+	.llseek = default_llseek,
+};
+
+static ssize_t flash_read(struct file *file, char __user *buf,
+			  size_t count, loff_t *ppos)
+{
+	struct ipath_devdata *dd;
+	ssize_t ret;
+	loff_t pos;
+	char *tmp;
+
+	pos = *ppos;
+
+	if ( pos < 0) {
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	if (pos >= sizeof(struct ipath_flash)) {
+		ret = 0;
+		goto bail;
+	}
+
+	if (count > sizeof(struct ipath_flash) - pos)
+		count = sizeof(struct ipath_flash) - pos;
+
+	tmp = kmalloc(count, GFP_KERNEL);
+	if (!tmp) {
+		ret = -ENOMEM;
+		goto bail;
+	}
+
+	dd = file_inode(file)->i_private;
+	if (ipath_eeprom_read(dd, pos, tmp, count)) {
+		ipath_dev_err(dd, "failed to read from flash\n");
+		ret = -ENXIO;
+		goto bail_tmp;
+	}
+
+	if (copy_to_user(buf, tmp, count)) {
+		ret = -EFAULT;
+		goto bail_tmp;
+	}
+
+	*ppos = pos + count;
+	ret = count;
+
+bail_tmp:
+	kfree(tmp);
+
+bail:
+	return ret;
+}
+
+static ssize_t flash_write(struct file *file, const char __user *buf,
+			   size_t count, loff_t *ppos)
+{
+	struct ipath_devdata *dd;
+	ssize_t ret;
+	loff_t pos;
+	char *tmp;
+
+	pos = *ppos;
+
+	if (pos != 0) {
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	if (count != sizeof(struct ipath_flash)) {
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	tmp = kmalloc(count, GFP_KERNEL);
+	if (!tmp) {
+		ret = -ENOMEM;
+		goto bail;
+	}
+
+	if (copy_from_user(tmp, buf, count)) {
+		ret = -EFAULT;
+		goto bail_tmp;
+	}
+
+	dd = file_inode(file)->i_private;
+	if (ipath_eeprom_write(dd, pos, tmp, count)) {
+		ret = -ENXIO;
+		ipath_dev_err(dd, "failed to write to flash\n");
+		goto bail_tmp;
+	}
+
+	*ppos = pos + count;
+	ret = count;
+
+bail_tmp:
+	kfree(tmp);
+
+bail:
+	return ret;
+}
+
+static const struct file_operations flash_ops = {
+	.read = flash_read,
+	.write = flash_write,
+	.llseek = default_llseek,
+};
+
+static int create_device_files(struct super_block *sb,
+			       struct ipath_devdata *dd)
+{
+	struct dentry *dir, *tmp;
+	char unit[10];
+	int ret;
+
+	snprintf(unit, sizeof unit, "%02d", dd->ipath_unit);
+	ret = create_file(unit, S_IFDIR|S_IRUGO|S_IXUGO, sb->s_root, &dir,
+			  &simple_dir_operations, dd);
+	if (ret) {
+		printk(KERN_ERR "create_file(%s) failed: %d\n", unit, ret);
+		goto bail;
+	}
+
+	ret = create_file("atomic_counters", S_IFREG|S_IRUGO, dir, &tmp,
+			  &atomic_counters_ops, dd);
+	if (ret) {
+		printk(KERN_ERR "create_file(%s/atomic_counters) "
+		       "failed: %d\n", unit, ret);
+		goto bail;
+	}
+
+	ret = create_file("flash", S_IFREG|S_IWUSR|S_IRUGO, dir, &tmp,
+			  &flash_ops, dd);
+	if (ret) {
+		printk(KERN_ERR "create_file(%s/flash) "
+		       "failed: %d\n", unit, ret);
+		goto bail;
+	}
+
+bail:
+	return ret;
+}
+
+static int remove_file(struct dentry *parent, char *name)
+{
+	struct dentry *tmp;
+	int ret;
+
+	tmp = lookup_one_len(name, parent, strlen(name));
+
+	if (IS_ERR(tmp)) {
+		ret = PTR_ERR(tmp);
+		goto bail;
+	}
+
+	spin_lock(&tmp->d_lock);
+	if (!(d_unhashed(tmp) && tmp->d_inode)) {
+		dget_dlock(tmp);
+		__d_drop(tmp);
+		spin_unlock(&tmp->d_lock);
+		simple_unlink(parent->d_inode, tmp);
+	} else
+		spin_unlock(&tmp->d_lock);
+
+	ret = 0;
+bail:
+	/*
+	 * We don't expect clients to care about the return value, but
+	 * it's there if they need it.
+	 */
+	return ret;
+}
+
+static int remove_device_files(struct super_block *sb,
+			       struct ipath_devdata *dd)
+{
+	struct dentry *dir, *root;
+	char unit[10];
+	int ret;
+
+	root = dget(sb->s_root);
+	mutex_lock(&root->d_inode->i_mutex);
+	snprintf(unit, sizeof unit, "%02d", dd->ipath_unit);
+	dir = lookup_one_len(unit, root, strlen(unit));
+
+	if (IS_ERR(dir)) {
+		ret = PTR_ERR(dir);
+		printk(KERN_ERR "Lookup of %s failed\n", unit);
+		goto bail;
+	}
+
+	remove_file(dir, "flash");
+	remove_file(dir, "atomic_counters");
+	d_delete(dir);
+	ret = simple_rmdir(root->d_inode, dir);
+
+bail:
+	mutex_unlock(&root->d_inode->i_mutex);
+	dput(root);
+	return ret;
+}
+
+static int ipathfs_fill_super(struct super_block *sb, void *data,
+			      int silent)
+{
+	struct ipath_devdata *dd, *tmp;
+	unsigned long flags;
+	int ret;
+
+	static struct tree_descr files[] = {
+		[2] = {"atomic_stats", &atomic_stats_ops, S_IRUGO},
+		{""},
+	};
+
+	ret = simple_fill_super(sb, IPATHFS_MAGIC, files);
+	if (ret) {
+		printk(KERN_ERR "simple_fill_super failed: %d\n", ret);
+		goto bail;
+	}
+
+	spin_lock_irqsave(&ipath_devs_lock, flags);
+
+	list_for_each_entry_safe(dd, tmp, &ipath_dev_list, ipath_list) {
+		spin_unlock_irqrestore(&ipath_devs_lock, flags);
+		ret = create_device_files(sb, dd);
+		if (ret)
+			goto bail;
+		spin_lock_irqsave(&ipath_devs_lock, flags);
+	}
+
+	spin_unlock_irqrestore(&ipath_devs_lock, flags);
+
+bail:
+	return ret;
+}
+
+static struct dentry *ipathfs_mount(struct file_system_type *fs_type,
+			int flags, const char *dev_name, void *data)
+{
+	struct dentry *ret;
+	ret = mount_single(fs_type, flags, data, ipathfs_fill_super);
+	if (!IS_ERR(ret))
+		ipath_super = ret->d_sb;
+	return ret;
+}
+
+static void ipathfs_kill_super(struct super_block *s)
+{
+	kill_litter_super(s);
+	ipath_super = NULL;
+}
+
+int ipathfs_add_device(struct ipath_devdata *dd)
+{
+	int ret;
+
+	if (ipath_super == NULL) {
+		ret = 0;
+		goto bail;
+	}
+
+	ret = create_device_files(ipath_super, dd);
+
+bail:
+	return ret;
+}
+
+int ipathfs_remove_device(struct ipath_devdata *dd)
+{
+	int ret;
+
+	if (ipath_super == NULL) {
+		ret = 0;
+		goto bail;
+	}
+
+	ret = remove_device_files(ipath_super, dd);
+
+bail:
+	return ret;
+}
+
+static struct file_system_type ipathfs_fs_type = {
+	.owner =	THIS_MODULE,
+	.name =		"ipathfs",
+	.mount =	ipathfs_mount,
+	.kill_sb =	ipathfs_kill_super,
+};
+MODULE_ALIAS_FS("ipathfs");
+
+int __init ipath_init_ipathfs(void)
+{
+	return register_filesystem(&ipathfs_fs_type);
+}
+
+void __exit ipath_exit_ipathfs(void)
+{
+	unregister_filesystem(&ipathfs_fs_type);
+}
diff --git a/drivers/infiniband/hw/ipath/ipath_iba6110.c b/drivers/infiniband/hw/ipath/ipath_iba6110.c
new file mode 100644
index 0000000..7cc3054
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_iba6110.c
@@ -0,0 +1,1940 @@
+/*
+ * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * This file contains all of the code that is specific to the InfiniPath
+ * HT chip.
+ */
+
+#include <linux/vmalloc.h>
+#include <linux/pci.h>
+#include <linux/delay.h>
+#include <linux/htirq.h>
+#include <rdma/ib_verbs.h>
+
+#include "ipath_kernel.h"
+#include "ipath_registers.h"
+
+static void ipath_setup_ht_setextled(struct ipath_devdata *, u64, u64);
+
+
+/*
+ * This lists the InfiniPath registers, in the actual chip layout.
+ * This structure should never be directly accessed.
+ *
+ * The names are in InterCap form because they're taken straight from
+ * the chip specification.  Since they're only used in this file, they
+ * don't pollute the rest of the source.
+*/
+
+struct _infinipath_do_not_use_kernel_regs {
+	unsigned long long Revision;
+	unsigned long long Control;
+	unsigned long long PageAlign;
+	unsigned long long PortCnt;
+	unsigned long long DebugPortSelect;
+	unsigned long long DebugPort;
+	unsigned long long SendRegBase;
+	unsigned long long UserRegBase;
+	unsigned long long CounterRegBase;
+	unsigned long long Scratch;
+	unsigned long long ReservedMisc1;
+	unsigned long long InterruptConfig;
+	unsigned long long IntBlocked;
+	unsigned long long IntMask;
+	unsigned long long IntStatus;
+	unsigned long long IntClear;
+	unsigned long long ErrorMask;
+	unsigned long long ErrorStatus;
+	unsigned long long ErrorClear;
+	unsigned long long HwErrMask;
+	unsigned long long HwErrStatus;
+	unsigned long long HwErrClear;
+	unsigned long long HwDiagCtrl;
+	unsigned long long MDIO;
+	unsigned long long IBCStatus;
+	unsigned long long IBCCtrl;
+	unsigned long long ExtStatus;
+	unsigned long long ExtCtrl;
+	unsigned long long GPIOOut;
+	unsigned long long GPIOMask;
+	unsigned long long GPIOStatus;
+	unsigned long long GPIOClear;
+	unsigned long long RcvCtrl;
+	unsigned long long RcvBTHQP;
+	unsigned long long RcvHdrSize;
+	unsigned long long RcvHdrCnt;
+	unsigned long long RcvHdrEntSize;
+	unsigned long long RcvTIDBase;
+	unsigned long long RcvTIDCnt;
+	unsigned long long RcvEgrBase;
+	unsigned long long RcvEgrCnt;
+	unsigned long long RcvBufBase;
+	unsigned long long RcvBufSize;
+	unsigned long long RxIntMemBase;
+	unsigned long long RxIntMemSize;
+	unsigned long long RcvPartitionKey;
+	unsigned long long ReservedRcv[10];
+	unsigned long long SendCtrl;
+	unsigned long long SendPIOBufBase;
+	unsigned long long SendPIOSize;
+	unsigned long long SendPIOBufCnt;
+	unsigned long long SendPIOAvailAddr;
+	unsigned long long TxIntMemBase;
+	unsigned long long TxIntMemSize;
+	unsigned long long ReservedSend[9];
+	unsigned long long SendBufferError;
+	unsigned long long SendBufferErrorCONT1;
+	unsigned long long SendBufferErrorCONT2;
+	unsigned long long SendBufferErrorCONT3;
+	unsigned long long ReservedSBE[4];
+	unsigned long long RcvHdrAddr0;
+	unsigned long long RcvHdrAddr1;
+	unsigned long long RcvHdrAddr2;
+	unsigned long long RcvHdrAddr3;
+	unsigned long long RcvHdrAddr4;
+	unsigned long long RcvHdrAddr5;
+	unsigned long long RcvHdrAddr6;
+	unsigned long long RcvHdrAddr7;
+	unsigned long long RcvHdrAddr8;
+	unsigned long long ReservedRHA[7];
+	unsigned long long RcvHdrTailAddr0;
+	unsigned long long RcvHdrTailAddr1;
+	unsigned long long RcvHdrTailAddr2;
+	unsigned long long RcvHdrTailAddr3;
+	unsigned long long RcvHdrTailAddr4;
+	unsigned long long RcvHdrTailAddr5;
+	unsigned long long RcvHdrTailAddr6;
+	unsigned long long RcvHdrTailAddr7;
+	unsigned long long RcvHdrTailAddr8;
+	unsigned long long ReservedRHTA[7];
+	unsigned long long Sync;	/* Software only */
+	unsigned long long Dump;	/* Software only */
+	unsigned long long SimVer;	/* Software only */
+	unsigned long long ReservedSW[5];
+	unsigned long long SerdesConfig0;
+	unsigned long long SerdesConfig1;
+	unsigned long long SerdesStatus;
+	unsigned long long XGXSConfig;
+	unsigned long long ReservedSW2[4];
+};
+
+struct _infinipath_do_not_use_counters {
+	__u64 LBIntCnt;
+	__u64 LBFlowStallCnt;
+	__u64 Reserved1;
+	__u64 TxUnsupVLErrCnt;
+	__u64 TxDataPktCnt;
+	__u64 TxFlowPktCnt;
+	__u64 TxDwordCnt;
+	__u64 TxLenErrCnt;
+	__u64 TxMaxMinLenErrCnt;
+	__u64 TxUnderrunCnt;
+	__u64 TxFlowStallCnt;
+	__u64 TxDroppedPktCnt;
+	__u64 RxDroppedPktCnt;
+	__u64 RxDataPktCnt;
+	__u64 RxFlowPktCnt;
+	__u64 RxDwordCnt;
+	__u64 RxLenErrCnt;
+	__u64 RxMaxMinLenErrCnt;
+	__u64 RxICRCErrCnt;
+	__u64 RxVCRCErrCnt;
+	__u64 RxFlowCtrlErrCnt;
+	__u64 RxBadFormatCnt;
+	__u64 RxLinkProblemCnt;
+	__u64 RxEBPCnt;
+	__u64 RxLPCRCErrCnt;
+	__u64 RxBufOvflCnt;
+	__u64 RxTIDFullErrCnt;
+	__u64 RxTIDValidErrCnt;
+	__u64 RxPKeyMismatchCnt;
+	__u64 RxP0HdrEgrOvflCnt;
+	__u64 RxP1HdrEgrOvflCnt;
+	__u64 RxP2HdrEgrOvflCnt;
+	__u64 RxP3HdrEgrOvflCnt;
+	__u64 RxP4HdrEgrOvflCnt;
+	__u64 RxP5HdrEgrOvflCnt;
+	__u64 RxP6HdrEgrOvflCnt;
+	__u64 RxP7HdrEgrOvflCnt;
+	__u64 RxP8HdrEgrOvflCnt;
+	__u64 Reserved6;
+	__u64 Reserved7;
+	__u64 IBStatusChangeCnt;
+	__u64 IBLinkErrRecoveryCnt;
+	__u64 IBLinkDownedCnt;
+	__u64 IBSymbolErrCnt;
+};
+
+#define IPATH_KREG_OFFSET(field) (offsetof( \
+	struct _infinipath_do_not_use_kernel_regs, field) / sizeof(u64))
+#define IPATH_CREG_OFFSET(field) (offsetof( \
+	struct _infinipath_do_not_use_counters, field) / sizeof(u64))
+
+static const struct ipath_kregs ipath_ht_kregs = {
+	.kr_control = IPATH_KREG_OFFSET(Control),
+	.kr_counterregbase = IPATH_KREG_OFFSET(CounterRegBase),
+	.kr_debugport = IPATH_KREG_OFFSET(DebugPort),
+	.kr_debugportselect = IPATH_KREG_OFFSET(DebugPortSelect),
+	.kr_errorclear = IPATH_KREG_OFFSET(ErrorClear),
+	.kr_errormask = IPATH_KREG_OFFSET(ErrorMask),
+	.kr_errorstatus = IPATH_KREG_OFFSET(ErrorStatus),
+	.kr_extctrl = IPATH_KREG_OFFSET(ExtCtrl),
+	.kr_extstatus = IPATH_KREG_OFFSET(ExtStatus),
+	.kr_gpio_clear = IPATH_KREG_OFFSET(GPIOClear),
+	.kr_gpio_mask = IPATH_KREG_OFFSET(GPIOMask),
+	.kr_gpio_out = IPATH_KREG_OFFSET(GPIOOut),
+	.kr_gpio_status = IPATH_KREG_OFFSET(GPIOStatus),
+	.kr_hwdiagctrl = IPATH_KREG_OFFSET(HwDiagCtrl),
+	.kr_hwerrclear = IPATH_KREG_OFFSET(HwErrClear),
+	.kr_hwerrmask = IPATH_KREG_OFFSET(HwErrMask),
+	.kr_hwerrstatus = IPATH_KREG_OFFSET(HwErrStatus),
+	.kr_ibcctrl = IPATH_KREG_OFFSET(IBCCtrl),
+	.kr_ibcstatus = IPATH_KREG_OFFSET(IBCStatus),
+	.kr_intblocked = IPATH_KREG_OFFSET(IntBlocked),
+	.kr_intclear = IPATH_KREG_OFFSET(IntClear),
+	.kr_interruptconfig = IPATH_KREG_OFFSET(InterruptConfig),
+	.kr_intmask = IPATH_KREG_OFFSET(IntMask),
+	.kr_intstatus = IPATH_KREG_OFFSET(IntStatus),
+	.kr_mdio = IPATH_KREG_OFFSET(MDIO),
+	.kr_pagealign = IPATH_KREG_OFFSET(PageAlign),
+	.kr_partitionkey = IPATH_KREG_OFFSET(RcvPartitionKey),
+	.kr_portcnt = IPATH_KREG_OFFSET(PortCnt),
+	.kr_rcvbthqp = IPATH_KREG_OFFSET(RcvBTHQP),
+	.kr_rcvbufbase = IPATH_KREG_OFFSET(RcvBufBase),
+	.kr_rcvbufsize = IPATH_KREG_OFFSET(RcvBufSize),
+	.kr_rcvctrl = IPATH_KREG_OFFSET(RcvCtrl),
+	.kr_rcvegrbase = IPATH_KREG_OFFSET(RcvEgrBase),
+	.kr_rcvegrcnt = IPATH_KREG_OFFSET(RcvEgrCnt),
+	.kr_rcvhdrcnt = IPATH_KREG_OFFSET(RcvHdrCnt),
+	.kr_rcvhdrentsize = IPATH_KREG_OFFSET(RcvHdrEntSize),
+	.kr_rcvhdrsize = IPATH_KREG_OFFSET(RcvHdrSize),
+	.kr_rcvintmembase = IPATH_KREG_OFFSET(RxIntMemBase),
+	.kr_rcvintmemsize = IPATH_KREG_OFFSET(RxIntMemSize),
+	.kr_rcvtidbase = IPATH_KREG_OFFSET(RcvTIDBase),
+	.kr_rcvtidcnt = IPATH_KREG_OFFSET(RcvTIDCnt),
+	.kr_revision = IPATH_KREG_OFFSET(Revision),
+	.kr_scratch = IPATH_KREG_OFFSET(Scratch),
+	.kr_sendbuffererror = IPATH_KREG_OFFSET(SendBufferError),
+	.kr_sendctrl = IPATH_KREG_OFFSET(SendCtrl),
+	.kr_sendpioavailaddr = IPATH_KREG_OFFSET(SendPIOAvailAddr),
+	.kr_sendpiobufbase = IPATH_KREG_OFFSET(SendPIOBufBase),
+	.kr_sendpiobufcnt = IPATH_KREG_OFFSET(SendPIOBufCnt),
+	.kr_sendpiosize = IPATH_KREG_OFFSET(SendPIOSize),
+	.kr_sendregbase = IPATH_KREG_OFFSET(SendRegBase),
+	.kr_txintmembase = IPATH_KREG_OFFSET(TxIntMemBase),
+	.kr_txintmemsize = IPATH_KREG_OFFSET(TxIntMemSize),
+	.kr_userregbase = IPATH_KREG_OFFSET(UserRegBase),
+	.kr_serdesconfig0 = IPATH_KREG_OFFSET(SerdesConfig0),
+	.kr_serdesconfig1 = IPATH_KREG_OFFSET(SerdesConfig1),
+	.kr_serdesstatus = IPATH_KREG_OFFSET(SerdesStatus),
+	.kr_xgxsconfig = IPATH_KREG_OFFSET(XGXSConfig),
+	/*
+	 * These should not be used directly via ipath_write_kreg64(),
+	 * use them with ipath_write_kreg64_port(),
+	 */
+	.kr_rcvhdraddr = IPATH_KREG_OFFSET(RcvHdrAddr0),
+	.kr_rcvhdrtailaddr = IPATH_KREG_OFFSET(RcvHdrTailAddr0)
+};
+
+static const struct ipath_cregs ipath_ht_cregs = {
+	.cr_badformatcnt = IPATH_CREG_OFFSET(RxBadFormatCnt),
+	.cr_erricrccnt = IPATH_CREG_OFFSET(RxICRCErrCnt),
+	.cr_errlinkcnt = IPATH_CREG_OFFSET(RxLinkProblemCnt),
+	.cr_errlpcrccnt = IPATH_CREG_OFFSET(RxLPCRCErrCnt),
+	.cr_errpkey = IPATH_CREG_OFFSET(RxPKeyMismatchCnt),
+	.cr_errrcvflowctrlcnt = IPATH_CREG_OFFSET(RxFlowCtrlErrCnt),
+	.cr_err_rlencnt = IPATH_CREG_OFFSET(RxLenErrCnt),
+	.cr_errslencnt = IPATH_CREG_OFFSET(TxLenErrCnt),
+	.cr_errtidfull = IPATH_CREG_OFFSET(RxTIDFullErrCnt),
+	.cr_errtidvalid = IPATH_CREG_OFFSET(RxTIDValidErrCnt),
+	.cr_errvcrccnt = IPATH_CREG_OFFSET(RxVCRCErrCnt),
+	.cr_ibstatuschange = IPATH_CREG_OFFSET(IBStatusChangeCnt),
+	/* calc from Reg_CounterRegBase + offset */
+	.cr_intcnt = IPATH_CREG_OFFSET(LBIntCnt),
+	.cr_invalidrlencnt = IPATH_CREG_OFFSET(RxMaxMinLenErrCnt),
+	.cr_invalidslencnt = IPATH_CREG_OFFSET(TxMaxMinLenErrCnt),
+	.cr_lbflowstallcnt = IPATH_CREG_OFFSET(LBFlowStallCnt),
+	.cr_pktrcvcnt = IPATH_CREG_OFFSET(RxDataPktCnt),
+	.cr_pktrcvflowctrlcnt = IPATH_CREG_OFFSET(RxFlowPktCnt),
+	.cr_pktsendcnt = IPATH_CREG_OFFSET(TxDataPktCnt),
+	.cr_pktsendflowcnt = IPATH_CREG_OFFSET(TxFlowPktCnt),
+	.cr_portovflcnt = IPATH_CREG_OFFSET(RxP0HdrEgrOvflCnt),
+	.cr_rcvebpcnt = IPATH_CREG_OFFSET(RxEBPCnt),
+	.cr_rcvovflcnt = IPATH_CREG_OFFSET(RxBufOvflCnt),
+	.cr_senddropped = IPATH_CREG_OFFSET(TxDroppedPktCnt),
+	.cr_sendstallcnt = IPATH_CREG_OFFSET(TxFlowStallCnt),
+	.cr_sendunderruncnt = IPATH_CREG_OFFSET(TxUnderrunCnt),
+	.cr_wordrcvcnt = IPATH_CREG_OFFSET(RxDwordCnt),
+	.cr_wordsendcnt = IPATH_CREG_OFFSET(TxDwordCnt),
+	.cr_unsupvlcnt = IPATH_CREG_OFFSET(TxUnsupVLErrCnt),
+	.cr_rxdroppktcnt = IPATH_CREG_OFFSET(RxDroppedPktCnt),
+	.cr_iblinkerrrecovcnt = IPATH_CREG_OFFSET(IBLinkErrRecoveryCnt),
+	.cr_iblinkdowncnt = IPATH_CREG_OFFSET(IBLinkDownedCnt),
+	.cr_ibsymbolerrcnt = IPATH_CREG_OFFSET(IBSymbolErrCnt)
+};
+
+/* kr_intstatus, kr_intclear, kr_intmask bits */
+#define INFINIPATH_I_RCVURG_MASK ((1U<<9)-1)
+#define INFINIPATH_I_RCVURG_SHIFT 0
+#define INFINIPATH_I_RCVAVAIL_MASK ((1U<<9)-1)
+#define INFINIPATH_I_RCVAVAIL_SHIFT 12
+
+/* kr_hwerrclear, kr_hwerrmask, kr_hwerrstatus, bits */
+#define INFINIPATH_HWE_HTCMEMPARITYERR_SHIFT 0
+#define INFINIPATH_HWE_HTCMEMPARITYERR_MASK 0x3FFFFFULL
+#define INFINIPATH_HWE_HTCLNKABYTE0CRCERR   0x0000000000800000ULL
+#define INFINIPATH_HWE_HTCLNKABYTE1CRCERR   0x0000000001000000ULL
+#define INFINIPATH_HWE_HTCLNKBBYTE0CRCERR   0x0000000002000000ULL
+#define INFINIPATH_HWE_HTCLNKBBYTE1CRCERR   0x0000000004000000ULL
+#define INFINIPATH_HWE_HTCMISCERR4          0x0000000008000000ULL
+#define INFINIPATH_HWE_HTCMISCERR5          0x0000000010000000ULL
+#define INFINIPATH_HWE_HTCMISCERR6          0x0000000020000000ULL
+#define INFINIPATH_HWE_HTCMISCERR7          0x0000000040000000ULL
+#define INFINIPATH_HWE_HTCBUSTREQPARITYERR  0x0000000080000000ULL
+#define INFINIPATH_HWE_HTCBUSTRESPPARITYERR 0x0000000100000000ULL
+#define INFINIPATH_HWE_HTCBUSIREQPARITYERR  0x0000000200000000ULL
+#define INFINIPATH_HWE_COREPLL_FBSLIP       0x0080000000000000ULL
+#define INFINIPATH_HWE_COREPLL_RFSLIP       0x0100000000000000ULL
+#define INFINIPATH_HWE_HTBPLL_FBSLIP        0x0200000000000000ULL
+#define INFINIPATH_HWE_HTBPLL_RFSLIP        0x0400000000000000ULL
+#define INFINIPATH_HWE_HTAPLL_FBSLIP        0x0800000000000000ULL
+#define INFINIPATH_HWE_HTAPLL_RFSLIP        0x1000000000000000ULL
+#define INFINIPATH_HWE_SERDESPLLFAILED      0x2000000000000000ULL
+
+#define IBA6110_IBCS_LINKTRAININGSTATE_MASK 0xf
+#define IBA6110_IBCS_LINKSTATE_SHIFT 4
+
+/* kr_extstatus bits */
+#define INFINIPATH_EXTS_FREQSEL 0x2
+#define INFINIPATH_EXTS_SERDESSEL 0x4
+#define INFINIPATH_EXTS_MEMBIST_ENDTEST     0x0000000000004000
+#define INFINIPATH_EXTS_MEMBIST_CORRECT     0x0000000000008000
+
+
+/* TID entries (memory), HT-only */
+#define INFINIPATH_RT_ADDR_MASK 0xFFFFFFFFFFULL	/* 40 bits valid */
+#define INFINIPATH_RT_VALID 0x8000000000000000ULL
+#define INFINIPATH_RT_ADDR_SHIFT 0
+#define INFINIPATH_RT_BUFSIZE_MASK 0x3FFFULL
+#define INFINIPATH_RT_BUFSIZE_SHIFT 48
+
+#define INFINIPATH_R_INTRAVAIL_SHIFT 16
+#define INFINIPATH_R_TAILUPD_SHIFT 31
+
+/* kr_xgxsconfig bits */
+#define INFINIPATH_XGXS_RESET          0x7ULL
+
+/*
+ * masks and bits that are different in different chips, or present only
+ * in one
+ */
+static const ipath_err_t infinipath_hwe_htcmemparityerr_mask =
+    INFINIPATH_HWE_HTCMEMPARITYERR_MASK;
+static const ipath_err_t infinipath_hwe_htcmemparityerr_shift =
+    INFINIPATH_HWE_HTCMEMPARITYERR_SHIFT;
+
+static const ipath_err_t infinipath_hwe_htclnkabyte0crcerr =
+    INFINIPATH_HWE_HTCLNKABYTE0CRCERR;
+static const ipath_err_t infinipath_hwe_htclnkabyte1crcerr =
+    INFINIPATH_HWE_HTCLNKABYTE1CRCERR;
+static const ipath_err_t infinipath_hwe_htclnkbbyte0crcerr =
+    INFINIPATH_HWE_HTCLNKBBYTE0CRCERR;
+static const ipath_err_t infinipath_hwe_htclnkbbyte1crcerr =
+    INFINIPATH_HWE_HTCLNKBBYTE1CRCERR;
+
+#define _IPATH_GPIO_SDA_NUM 1
+#define _IPATH_GPIO_SCL_NUM 0
+
+#define IPATH_GPIO_SDA \
+	(1ULL << (_IPATH_GPIO_SDA_NUM+INFINIPATH_EXTC_GPIOOE_SHIFT))
+#define IPATH_GPIO_SCL \
+	(1ULL << (_IPATH_GPIO_SCL_NUM+INFINIPATH_EXTC_GPIOOE_SHIFT))
+
+/* keep the code below somewhat more readable; not used elsewhere */
+#define _IPATH_HTLINK0_CRCBITS (infinipath_hwe_htclnkabyte0crcerr |	\
+				infinipath_hwe_htclnkabyte1crcerr)
+#define _IPATH_HTLINK1_CRCBITS (infinipath_hwe_htclnkbbyte0crcerr |	\
+				infinipath_hwe_htclnkbbyte1crcerr)
+#define _IPATH_HTLANE0_CRCBITS (infinipath_hwe_htclnkabyte0crcerr |	\
+				infinipath_hwe_htclnkbbyte0crcerr)
+#define _IPATH_HTLANE1_CRCBITS (infinipath_hwe_htclnkabyte1crcerr |	\
+				infinipath_hwe_htclnkbbyte1crcerr)
+
+static void hwerr_crcbits(struct ipath_devdata *dd, ipath_err_t hwerrs,
+			  char *msg, size_t msgl)
+{
+	char bitsmsg[64];
+	ipath_err_t crcbits = hwerrs &
+		(_IPATH_HTLINK0_CRCBITS | _IPATH_HTLINK1_CRCBITS);
+	/* don't check if 8bit HT */
+	if (dd->ipath_flags & IPATH_8BIT_IN_HT0)
+		crcbits &= ~infinipath_hwe_htclnkabyte1crcerr;
+	/* don't check if 8bit HT */
+	if (dd->ipath_flags & IPATH_8BIT_IN_HT1)
+		crcbits &= ~infinipath_hwe_htclnkbbyte1crcerr;
+	/*
+	 * we'll want to ignore link errors on link that is
+	 * not in use, if any.  For now, complain about both
+	 */
+	if (crcbits) {
+		u16 ctrl0, ctrl1;
+		snprintf(bitsmsg, sizeof bitsmsg,
+			 "[HT%s lane %s CRC (%llx); powercycle to completely clear]",
+			 !(crcbits & _IPATH_HTLINK1_CRCBITS) ?
+			 "0 (A)" : (!(crcbits & _IPATH_HTLINK0_CRCBITS)
+				    ? "1 (B)" : "0+1 (A+B)"),
+			 !(crcbits & _IPATH_HTLANE1_CRCBITS) ? "0"
+			 : (!(crcbits & _IPATH_HTLANE0_CRCBITS) ? "1" :
+			    "0+1"), (unsigned long long) crcbits);
+		strlcat(msg, bitsmsg, msgl);
+
+		/*
+		 * print extra info for debugging.  slave/primary
+		 * config word 4, 8 (link control 0, 1)
+		 */
+
+		if (pci_read_config_word(dd->pcidev,
+					 dd->ipath_ht_slave_off + 0x4,
+					 &ctrl0))
+			dev_info(&dd->pcidev->dev, "Couldn't read "
+				 "linkctrl0 of slave/primary "
+				 "config block\n");
+		else if (!(ctrl0 & 1 << 6))
+			/* not if EOC bit set */
+			ipath_dbg("HT linkctrl0 0x%x%s%s\n", ctrl0,
+				  ((ctrl0 >> 8) & 7) ? " CRC" : "",
+				  ((ctrl0 >> 4) & 1) ? "linkfail" :
+				  "");
+		if (pci_read_config_word(dd->pcidev,
+					 dd->ipath_ht_slave_off + 0x8,
+					 &ctrl1))
+			dev_info(&dd->pcidev->dev, "Couldn't read "
+				 "linkctrl1 of slave/primary "
+				 "config block\n");
+		else if (!(ctrl1 & 1 << 6))
+			/* not if EOC bit set */
+			ipath_dbg("HT linkctrl1 0x%x%s%s\n", ctrl1,
+				  ((ctrl1 >> 8) & 7) ? " CRC" : "",
+				  ((ctrl1 >> 4) & 1) ? "linkfail" :
+				  "");
+
+		/* disable until driver reloaded */
+		dd->ipath_hwerrmask &= ~crcbits;
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask,
+				 dd->ipath_hwerrmask);
+		ipath_dbg("HT crc errs: %s\n", msg);
+	} else
+		ipath_dbg("ignoring HT crc errors 0x%llx, "
+			  "not in use\n", (unsigned long long)
+			  (hwerrs & (_IPATH_HTLINK0_CRCBITS |
+				     _IPATH_HTLINK1_CRCBITS)));
+}
+
+/* 6110 specific hardware errors... */
+static const struct ipath_hwerror_msgs ipath_6110_hwerror_msgs[] = {
+	INFINIPATH_HWE_MSG(HTCBUSIREQPARITYERR, "HTC Ireq Parity"),
+	INFINIPATH_HWE_MSG(HTCBUSTREQPARITYERR, "HTC Treq Parity"),
+	INFINIPATH_HWE_MSG(HTCBUSTRESPPARITYERR, "HTC Tresp Parity"),
+	INFINIPATH_HWE_MSG(HTCMISCERR5, "HT core Misc5"),
+	INFINIPATH_HWE_MSG(HTCMISCERR6, "HT core Misc6"),
+	INFINIPATH_HWE_MSG(HTCMISCERR7, "HT core Misc7"),
+	INFINIPATH_HWE_MSG(RXDSYNCMEMPARITYERR, "Rx Dsync"),
+	INFINIPATH_HWE_MSG(SERDESPLLFAILED, "SerDes PLL"),
+};
+
+#define TXE_PIO_PARITY ((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF | \
+		        INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC) \
+		        << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT)
+#define RXE_EAGER_PARITY (INFINIPATH_HWE_RXEMEMPARITYERR_EAGERTID \
+			  << INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT)
+
+static void ipath_ht_txe_recover(struct ipath_devdata *dd)
+{
+	++ipath_stats.sps_txeparity;
+	dev_info(&dd->pcidev->dev,
+		"Recovering from TXE PIO parity error\n");
+}
+
+
+/**
+ * ipath_ht_handle_hwerrors - display hardware errors.
+ * @dd: the infinipath device
+ * @msg: the output buffer
+ * @msgl: the size of the output buffer
+ *
+ * Use same msg buffer as regular errors to avoid excessive stack
+ * use.  Most hardware errors are catastrophic, but for right now,
+ * we'll print them and continue.  We reuse the same message buffer as
+ * ipath_handle_errors() to avoid excessive stack usage.
+ */
+static void ipath_ht_handle_hwerrors(struct ipath_devdata *dd, char *msg,
+				     size_t msgl)
+{
+	ipath_err_t hwerrs;
+	u32 bits, ctrl;
+	int isfatal = 0;
+	char bitsmsg[64];
+	int log_idx;
+
+	hwerrs = ipath_read_kreg64(dd, dd->ipath_kregs->kr_hwerrstatus);
+
+	if (!hwerrs) {
+		ipath_cdbg(VERBOSE, "Called but no hardware errors set\n");
+		/*
+		 * better than printing cofusing messages
+		 * This seems to be related to clearing the crc error, or
+		 * the pll error during init.
+		 */
+		goto bail;
+	} else if (hwerrs == -1LL) {
+		ipath_dev_err(dd, "Read of hardware error status failed "
+			      "(all bits set); ignoring\n");
+		goto bail;
+	}
+	ipath_stats.sps_hwerrs++;
+
+	/* Always clear the error status register, except MEMBISTFAIL,
+	 * regardless of whether we continue or stop using the chip.
+	 * We want that set so we know it failed, even across driver reload.
+	 * We'll still ignore it in the hwerrmask.  We do this partly for
+	 * diagnostics, but also for support */
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear,
+			 hwerrs&~INFINIPATH_HWE_MEMBISTFAILED);
+
+	hwerrs &= dd->ipath_hwerrmask;
+
+	/* We log some errors to EEPROM, check if we have any of those. */
+	for (log_idx = 0; log_idx < IPATH_EEP_LOG_CNT; ++log_idx)
+		if (hwerrs & dd->ipath_eep_st_masks[log_idx].hwerrs_to_log)
+			ipath_inc_eeprom_err(dd, log_idx, 1);
+
+	/*
+	 * make sure we get this much out, unless told to be quiet,
+	 * it's a parity error we may recover from,
+	 * or it's occurred within the last 5 seconds
+	 */
+	if ((hwerrs & ~(dd->ipath_lasthwerror | TXE_PIO_PARITY |
+		RXE_EAGER_PARITY)) ||
+		(ipath_debug & __IPATH_VERBDBG))
+		dev_info(&dd->pcidev->dev, "Hardware error: hwerr=0x%llx "
+			 "(cleared)\n", (unsigned long long) hwerrs);
+	dd->ipath_lasthwerror |= hwerrs;
+
+	if (hwerrs & ~dd->ipath_hwe_bitsextant)
+		ipath_dev_err(dd, "hwerror interrupt with unknown errors "
+			      "%llx set\n", (unsigned long long)
+			      (hwerrs & ~dd->ipath_hwe_bitsextant));
+
+	ctrl = ipath_read_kreg32(dd, dd->ipath_kregs->kr_control);
+	if ((ctrl & INFINIPATH_C_FREEZEMODE) && !ipath_diag_inuse) {
+		/*
+		 * parity errors in send memory are recoverable,
+		 * just cancel the send (if indicated in * sendbuffererror),
+		 * count the occurrence, unfreeze (if no other handled
+		 * hardware error bits are set), and continue. They can
+		 * occur if a processor speculative read is done to the PIO
+		 * buffer while we are sending a packet, for example.
+		 */
+		if (hwerrs & TXE_PIO_PARITY) {
+			ipath_ht_txe_recover(dd);
+			hwerrs &= ~TXE_PIO_PARITY;
+		}
+
+		if (!hwerrs) {
+			ipath_dbg("Clearing freezemode on ignored or "
+				  "recovered hardware error\n");
+			ipath_clear_freeze(dd);
+		}
+	}
+
+	*msg = '\0';
+
+	/*
+	 * may someday want to decode into which bits are which
+	 * functional area for parity errors, etc.
+	 */
+	if (hwerrs & (infinipath_hwe_htcmemparityerr_mask
+		      << INFINIPATH_HWE_HTCMEMPARITYERR_SHIFT)) {
+		bits = (u32) ((hwerrs >>
+			       INFINIPATH_HWE_HTCMEMPARITYERR_SHIFT) &
+			      INFINIPATH_HWE_HTCMEMPARITYERR_MASK);
+		snprintf(bitsmsg, sizeof bitsmsg, "[HTC Parity Errs %x] ",
+			 bits);
+		strlcat(msg, bitsmsg, msgl);
+	}
+
+	ipath_format_hwerrors(hwerrs,
+			      ipath_6110_hwerror_msgs,
+			      ARRAY_SIZE(ipath_6110_hwerror_msgs),
+			      msg, msgl);
+
+	if (hwerrs & (_IPATH_HTLINK0_CRCBITS | _IPATH_HTLINK1_CRCBITS))
+		hwerr_crcbits(dd, hwerrs, msg, msgl);
+
+	if (hwerrs & INFINIPATH_HWE_MEMBISTFAILED) {
+		strlcat(msg, "[Memory BIST test failed, InfiniPath hardware unusable]",
+			msgl);
+		/* ignore from now on, so disable until driver reloaded */
+		dd->ipath_hwerrmask &= ~INFINIPATH_HWE_MEMBISTFAILED;
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask,
+				 dd->ipath_hwerrmask);
+	}
+#define _IPATH_PLL_FAIL (INFINIPATH_HWE_COREPLL_FBSLIP |	\
+			 INFINIPATH_HWE_COREPLL_RFSLIP |	\
+			 INFINIPATH_HWE_HTBPLL_FBSLIP |		\
+			 INFINIPATH_HWE_HTBPLL_RFSLIP |		\
+			 INFINIPATH_HWE_HTAPLL_FBSLIP |		\
+			 INFINIPATH_HWE_HTAPLL_RFSLIP)
+
+	if (hwerrs & _IPATH_PLL_FAIL) {
+		snprintf(bitsmsg, sizeof bitsmsg,
+			 "[PLL failed (%llx), InfiniPath hardware unusable]",
+			 (unsigned long long) (hwerrs & _IPATH_PLL_FAIL));
+		strlcat(msg, bitsmsg, msgl);
+		/* ignore from now on, so disable until driver reloaded */
+		dd->ipath_hwerrmask &= ~(hwerrs & _IPATH_PLL_FAIL);
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask,
+				 dd->ipath_hwerrmask);
+	}
+
+	if (hwerrs & INFINIPATH_HWE_SERDESPLLFAILED) {
+		/*
+		 * If it occurs, it is left masked since the eternal
+		 * interface is unused
+		 */
+		dd->ipath_hwerrmask &= ~INFINIPATH_HWE_SERDESPLLFAILED;
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask,
+				 dd->ipath_hwerrmask);
+	}
+
+	if (hwerrs) {
+		/*
+		 * if any set that we aren't ignoring; only
+		 * make the complaint once, in case it's stuck
+		 * or recurring, and we get here multiple
+		 * times.
+		 * force link down, so switch knows, and
+		 * LEDs are turned off
+		 */
+		if (dd->ipath_flags & IPATH_INITTED) {
+			ipath_set_linkstate(dd, IPATH_IB_LINKDOWN);
+			ipath_setup_ht_setextled(dd,
+				INFINIPATH_IBCS_L_STATE_DOWN,
+				INFINIPATH_IBCS_LT_STATE_DISABLED);
+			ipath_dev_err(dd, "Fatal Hardware Error (freeze "
+					  "mode), no longer usable, SN %.16s\n",
+					  dd->ipath_serial);
+			isfatal = 1;
+		}
+		*dd->ipath_statusp &= ~IPATH_STATUS_IB_READY;
+		/* mark as having had error */
+		*dd->ipath_statusp |= IPATH_STATUS_HWERROR;
+		/*
+		 * mark as not usable, at a minimum until driver
+		 * is reloaded, probably until reboot, since no
+		 * other reset is possible.
+		 */
+		dd->ipath_flags &= ~IPATH_INITTED;
+	}
+	else
+		*msg = 0; /* recovered from all of them */
+	if (*msg)
+		ipath_dev_err(dd, "%s hardware error\n", msg);
+	if (isfatal && !ipath_diag_inuse && dd->ipath_freezemsg)
+		/*
+		 * for status file; if no trailing brace is copied,
+		 * we'll know it was truncated.
+		 */
+		snprintf(dd->ipath_freezemsg,
+			 dd->ipath_freezelen, "{%s}", msg);
+
+bail:;
+}
+
+/**
+ * ipath_ht_boardname - fill in the board name
+ * @dd: the infinipath device
+ * @name: the output buffer
+ * @namelen: the size of the output buffer
+ *
+ * fill in the board name, based on the board revision register
+ */
+static int ipath_ht_boardname(struct ipath_devdata *dd, char *name,
+			      size_t namelen)
+{
+	char *n = NULL;
+	u8 boardrev = dd->ipath_boardrev;
+	int ret = 0;
+
+	switch (boardrev) {
+	case 5:
+		/*
+		 * original production board; two production levels, with
+		 * different serial number ranges.   See ipath_ht_early_init() for
+		 * case where we enable IPATH_GPIO_INTR for later serial # range.
+		 * Original 112* serial number is no longer supported.
+		 */
+		n = "InfiniPath_QHT7040";
+		break;
+	case 7:
+		/* small form factor production board */
+		n = "InfiniPath_QHT7140";
+		break;
+	default:		/* don't know, just print the number */
+		ipath_dev_err(dd, "Don't yet know about board "
+			      "with ID %u\n", boardrev);
+		snprintf(name, namelen, "Unknown_InfiniPath_QHT7xxx_%u",
+			 boardrev);
+		break;
+	}
+	if (n)
+		snprintf(name, namelen, "%s", n);
+
+	if (ret) {
+		ipath_dev_err(dd, "Unsupported InfiniPath board %s!\n", name);
+		goto bail;
+	}
+	if (dd->ipath_majrev != 3 || (dd->ipath_minrev < 2 ||
+		dd->ipath_minrev > 4)) {
+		/*
+		 * This version of the driver only supports Rev 3.2 - 3.4
+		 */
+		ipath_dev_err(dd,
+			      "Unsupported InfiniPath hardware revision %u.%u!\n",
+			      dd->ipath_majrev, dd->ipath_minrev);
+		ret = 1;
+		goto bail;
+	}
+	/*
+	 * pkt/word counters are 32 bit, and therefore wrap fast enough
+	 * that we snapshot them from a timer, and maintain 64 bit shadow
+	 * copies
+	 */
+	dd->ipath_flags |= IPATH_32BITCOUNTERS;
+	dd->ipath_flags |= IPATH_GPIO_INTR;
+	if (dd->ipath_lbus_speed != 800)
+		ipath_dev_err(dd,
+			      "Incorrectly configured for HT @ %uMHz\n",
+			      dd->ipath_lbus_speed);
+
+	/*
+	 * set here, not in ipath_init_*_funcs because we have to do
+	 * it after we can read chip registers.
+	 */
+	dd->ipath_ureg_align =
+		ipath_read_kreg32(dd, dd->ipath_kregs->kr_pagealign);
+
+bail:
+	return ret;
+}
+
+static void ipath_check_htlink(struct ipath_devdata *dd)
+{
+	u8 linkerr, link_off, i;
+
+	for (i = 0; i < 2; i++) {
+		link_off = dd->ipath_ht_slave_off + i * 4 + 0xd;
+		if (pci_read_config_byte(dd->pcidev, link_off, &linkerr))
+			dev_info(&dd->pcidev->dev, "Couldn't read "
+				 "linkerror%d of HT slave/primary block\n",
+				 i);
+		else if (linkerr & 0xf0) {
+			ipath_cdbg(VERBOSE, "HT linkerr%d bits 0x%x set, "
+				   "clearing\n", linkerr >> 4, i);
+			/*
+			 * writing the linkerr bits that are set should
+			 * clear them
+			 */
+			if (pci_write_config_byte(dd->pcidev, link_off,
+						  linkerr))
+				ipath_dbg("Failed write to clear HT "
+					  "linkerror%d\n", i);
+			if (pci_read_config_byte(dd->pcidev, link_off,
+						 &linkerr))
+				dev_info(&dd->pcidev->dev,
+					 "Couldn't reread linkerror%d of "
+					 "HT slave/primary block\n", i);
+			else if (linkerr & 0xf0)
+				dev_info(&dd->pcidev->dev,
+					 "HT linkerror%d bits 0x%x "
+					 "couldn't be cleared\n",
+					 i, linkerr >> 4);
+		}
+	}
+}
+
+static int ipath_setup_ht_reset(struct ipath_devdata *dd)
+{
+	ipath_dbg("No reset possible for this InfiniPath hardware\n");
+	return 0;
+}
+
+#define HT_INTR_DISC_CONFIG  0x80	/* HT interrupt and discovery cap */
+#define HT_INTR_REG_INDEX    2	/* intconfig requires indirect accesses */
+
+/*
+ * Bits 13-15 of command==0 is slave/primary block.  Clear any HT CRC
+ * errors.  We only bother to do this at load time, because it's OK if
+ * it happened before we were loaded (first time after boot/reset),
+ * but any time after that, it's fatal anyway.  Also need to not check
+ * for upper byte errors if we are in 8 bit mode, so figure out
+ * our width.  For now, at least, also complain if it's 8 bit.
+ */
+static void slave_or_pri_blk(struct ipath_devdata *dd, struct pci_dev *pdev,
+			     int pos, u8 cap_type)
+{
+	u8 linkwidth = 0, linkerr, link_a_b_off, link_off;
+	u16 linkctrl = 0;
+	int i;
+
+	dd->ipath_ht_slave_off = pos;
+	/* command word, master_host bit */
+	/* master host || slave */
+	if ((cap_type >> 2) & 1)
+		link_a_b_off = 4;
+	else
+		link_a_b_off = 0;
+	ipath_cdbg(VERBOSE, "HT%u (Link %c) connected to processor\n",
+		   link_a_b_off ? 1 : 0,
+		   link_a_b_off ? 'B' : 'A');
+
+	link_a_b_off += pos;
+
+	/*
+	 * check both link control registers; clear both HT CRC sets if
+	 * necessary.
+	 */
+	for (i = 0; i < 2; i++) {
+		link_off = pos + i * 4 + 0x4;
+		if (pci_read_config_word(pdev, link_off, &linkctrl))
+			ipath_dev_err(dd, "Couldn't read HT link control%d "
+				      "register\n", i);
+		else if (linkctrl & (0xf << 8)) {
+			ipath_cdbg(VERBOSE, "Clear linkctrl%d CRC Error "
+				   "bits %x\n", i, linkctrl & (0xf << 8));
+			/*
+			 * now write them back to clear the error.
+			 */
+			pci_write_config_word(pdev, link_off,
+					      linkctrl & (0xf << 8));
+		}
+	}
+
+	/*
+	 * As with HT CRC bits, same for protocol errors that might occur
+	 * during boot.
+	 */
+	for (i = 0; i < 2; i++) {
+		link_off = pos + i * 4 + 0xd;
+		if (pci_read_config_byte(pdev, link_off, &linkerr))
+			dev_info(&pdev->dev, "Couldn't read linkerror%d "
+				 "of HT slave/primary block\n", i);
+		else if (linkerr & 0xf0) {
+			ipath_cdbg(VERBOSE, "HT linkerr%d bits 0x%x set, "
+				   "clearing\n", linkerr >> 4, i);
+			/*
+			 * writing the linkerr bits that are set will clear
+			 * them
+			 */
+			if (pci_write_config_byte
+			    (pdev, link_off, linkerr))
+				ipath_dbg("Failed write to clear HT "
+					  "linkerror%d\n", i);
+			if (pci_read_config_byte(pdev, link_off, &linkerr))
+				dev_info(&pdev->dev, "Couldn't reread "
+					 "linkerror%d of HT slave/primary "
+					 "block\n", i);
+			else if (linkerr & 0xf0)
+				dev_info(&pdev->dev, "HT linkerror%d bits "
+					 "0x%x couldn't be cleared\n",
+					 i, linkerr >> 4);
+		}
+	}
+
+	/*
+	 * this is just for our link to the host, not devices connected
+	 * through tunnel.
+	 */
+
+	if (pci_read_config_byte(pdev, link_a_b_off + 7, &linkwidth))
+		ipath_dev_err(dd, "Couldn't read HT link width "
+			      "config register\n");
+	else {
+		u32 width;
+		switch (linkwidth & 7) {
+		case 5:
+			width = 4;
+			break;
+		case 4:
+			width = 2;
+			break;
+		case 3:
+			width = 32;
+			break;
+		case 1:
+			width = 16;
+			break;
+		case 0:
+		default:	/* if wrong, assume 8 bit */
+			width = 8;
+			break;
+		}
+
+		dd->ipath_lbus_width = width;
+
+		if (linkwidth != 0x11) {
+			ipath_dev_err(dd, "Not configured for 16 bit HT "
+				      "(%x)\n", linkwidth);
+			if (!(linkwidth & 0xf)) {
+				ipath_dbg("Will ignore HT lane1 errors\n");
+				dd->ipath_flags |= IPATH_8BIT_IN_HT0;
+			}
+		}
+	}
+
+	/*
+	 * this is just for our link to the host, not devices connected
+	 * through tunnel.
+	 */
+	if (pci_read_config_byte(pdev, link_a_b_off + 0xd, &linkwidth))
+		ipath_dev_err(dd, "Couldn't read HT link frequency "
+			      "config register\n");
+	else {
+		u32 speed;
+		switch (linkwidth & 0xf) {
+		case 6:
+			speed = 1000;
+			break;
+		case 5:
+			speed = 800;
+			break;
+		case 4:
+			speed = 600;
+			break;
+		case 3:
+			speed = 500;
+			break;
+		case 2:
+			speed = 400;
+			break;
+		case 1:
+			speed = 300;
+			break;
+		default:
+			/*
+			 * assume reserved and vendor-specific are 200...
+			 */
+		case 0:
+			speed = 200;
+			break;
+		}
+		dd->ipath_lbus_speed = speed;
+	}
+
+	snprintf(dd->ipath_lbus_info, sizeof(dd->ipath_lbus_info),
+		"HyperTransport,%uMHz,x%u\n",
+		dd->ipath_lbus_speed,
+		dd->ipath_lbus_width);
+}
+
+static int ipath_ht_intconfig(struct ipath_devdata *dd)
+{
+	int ret;
+
+	if (dd->ipath_intconfig) {
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_interruptconfig,
+				 dd->ipath_intconfig);	/* interrupt address */
+		ret = 0;
+	} else {
+		ipath_dev_err(dd, "No interrupts enabled, couldn't setup "
+			      "interrupt address\n");
+		ret = -EINVAL;
+	}
+
+	return ret;
+}
+
+static void ipath_ht_irq_update(struct pci_dev *dev, int irq,
+				struct ht_irq_msg *msg)
+{
+	struct ipath_devdata *dd = pci_get_drvdata(dev);
+	u64 prev_intconfig = dd->ipath_intconfig;
+
+	dd->ipath_intconfig = msg->address_lo;
+	dd->ipath_intconfig |= ((u64) msg->address_hi) << 32;
+
+	/*
+	 * If the previous value of dd->ipath_intconfig is zero, we're
+	 * getting configured for the first time, and must not program the
+	 * intconfig register here (it will be programmed later, when the
+	 * hardware is ready).  Otherwise, we should.
+	 */
+	if (prev_intconfig)
+		ipath_ht_intconfig(dd);
+}
+
+/**
+ * ipath_setup_ht_config - setup the interruptconfig register
+ * @dd: the infinipath device
+ * @pdev: the PCI device
+ *
+ * setup the interruptconfig register from the HT config info.
+ * Also clear CRC errors in HT linkcontrol, if necessary.
+ * This is done only for the real hardware.  It is done before
+ * chip address space is initted, so can't touch infinipath registers
+ */
+static int ipath_setup_ht_config(struct ipath_devdata *dd,
+				 struct pci_dev *pdev)
+{
+	int pos, ret;
+
+	ret = __ht_create_irq(pdev, 0, ipath_ht_irq_update);
+	if (ret < 0) {
+		ipath_dev_err(dd, "Couldn't create interrupt handler: "
+			      "err %d\n", ret);
+		goto bail;
+	}
+	dd->ipath_irq = ret;
+	ret = 0;
+
+	/*
+	 * Handle clearing CRC errors in linkctrl register if necessary.  We
+	 * do this early, before we ever enable errors or hardware errors,
+	 * mostly to avoid causing the chip to enter freeze mode.
+	 */
+	pos = pci_find_capability(pdev, PCI_CAP_ID_HT);
+	if (!pos) {
+		ipath_dev_err(dd, "Couldn't find HyperTransport "
+			      "capability; no interrupts\n");
+		ret = -ENODEV;
+		goto bail;
+	}
+	do {
+		u8 cap_type;
+
+		/*
+		 * The HT capability type byte is 3 bytes after the
+		 * capability byte.
+		 */
+		if (pci_read_config_byte(pdev, pos + 3, &cap_type)) {
+			dev_info(&pdev->dev, "Couldn't read config "
+				 "command @ %d\n", pos);
+			continue;
+		}
+		if (!(cap_type & 0xE0))
+			slave_or_pri_blk(dd, pdev, pos, cap_type);
+	} while ((pos = pci_find_next_capability(pdev, pos,
+						 PCI_CAP_ID_HT)));
+
+	dd->ipath_flags |= IPATH_SWAP_PIOBUFS;
+
+bail:
+	return ret;
+}
+
+/**
+ * ipath_setup_ht_cleanup - clean up any per-chip chip-specific stuff
+ * @dd: the infinipath device
+ *
+ * Called during driver unload.
+ * This is currently a nop for the HT chip, not for all chips
+ */
+static void ipath_setup_ht_cleanup(struct ipath_devdata *dd)
+{
+}
+
+/**
+ * ipath_setup_ht_setextled - set the state of the two external LEDs
+ * @dd: the infinipath device
+ * @lst: the L state
+ * @ltst: the LT state
+ *
+ * Set the state of the two external LEDs, to indicate physical and
+ * logical state of IB link.   For this chip (at least with recommended
+ * board pinouts), LED1 is Green (physical state), and LED2 is Yellow
+ * (logical state)
+ *
+ * Note:  We try to match the Mellanox HCA LED behavior as best
+ * we can.  Green indicates physical link state is OK (something is
+ * plugged in, and we can train).
+ * Amber indicates the link is logically up (ACTIVE).
+ * Mellanox further blinks the amber LED to indicate data packet
+ * activity, but we have no hardware support for that, so it would
+ * require waking up every 10-20 msecs and checking the counters
+ * on the chip, and then turning the LED off if appropriate.  That's
+ * visible overhead, so not something we will do.
+ *
+ */
+static void ipath_setup_ht_setextled(struct ipath_devdata *dd,
+				     u64 lst, u64 ltst)
+{
+	u64 extctl;
+	unsigned long flags = 0;
+
+	/* the diags use the LED to indicate diag info, so we leave
+	 * the external LED alone when the diags are running */
+	if (ipath_diag_inuse)
+		return;
+
+	/* Allow override of LED display for, e.g. Locating system in rack */
+	if (dd->ipath_led_override) {
+		ltst = (dd->ipath_led_override & IPATH_LED_PHYS)
+			? INFINIPATH_IBCS_LT_STATE_LINKUP
+			: INFINIPATH_IBCS_LT_STATE_DISABLED;
+		lst = (dd->ipath_led_override & IPATH_LED_LOG)
+			? INFINIPATH_IBCS_L_STATE_ACTIVE
+			: INFINIPATH_IBCS_L_STATE_DOWN;
+	}
+
+	spin_lock_irqsave(&dd->ipath_gpio_lock, flags);
+	/*
+	 * start by setting both LED control bits to off, then turn
+	 * on the appropriate bit(s).
+	 */
+	if (dd->ipath_boardrev == 8) { /* LS/X-1 uses different pins */
+		/*
+		 * major difference is that INFINIPATH_EXTC_LEDGBLERR_OFF
+		 * is inverted,  because it is normally used to indicate
+		 * a hardware fault at reset, if there were errors
+		 */
+		extctl = (dd->ipath_extctrl & ~INFINIPATH_EXTC_LEDGBLOK_ON)
+			| INFINIPATH_EXTC_LEDGBLERR_OFF;
+		if (ltst == INFINIPATH_IBCS_LT_STATE_LINKUP)
+			extctl &= ~INFINIPATH_EXTC_LEDGBLERR_OFF;
+		if (lst == INFINIPATH_IBCS_L_STATE_ACTIVE)
+			extctl |= INFINIPATH_EXTC_LEDGBLOK_ON;
+	}
+	else {
+		extctl = dd->ipath_extctrl &
+			~(INFINIPATH_EXTC_LED1PRIPORT_ON |
+			  INFINIPATH_EXTC_LED2PRIPORT_ON);
+		if (ltst == INFINIPATH_IBCS_LT_STATE_LINKUP)
+			extctl |= INFINIPATH_EXTC_LED1PRIPORT_ON;
+		if (lst == INFINIPATH_IBCS_L_STATE_ACTIVE)
+			extctl |= INFINIPATH_EXTC_LED2PRIPORT_ON;
+	}
+	dd->ipath_extctrl = extctl;
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_extctrl, extctl);
+	spin_unlock_irqrestore(&dd->ipath_gpio_lock, flags);
+}
+
+static void ipath_init_ht_variables(struct ipath_devdata *dd)
+{
+	/*
+	 * setup the register offsets, since they are different for each
+	 * chip
+	 */
+	dd->ipath_kregs = &ipath_ht_kregs;
+	dd->ipath_cregs = &ipath_ht_cregs;
+
+	dd->ipath_gpio_sda_num = _IPATH_GPIO_SDA_NUM;
+	dd->ipath_gpio_scl_num = _IPATH_GPIO_SCL_NUM;
+	dd->ipath_gpio_sda = IPATH_GPIO_SDA;
+	dd->ipath_gpio_scl = IPATH_GPIO_SCL;
+
+	/*
+	 * Fill in data for field-values that change in newer chips.
+	 * We dynamically specify only the mask for LINKTRAININGSTATE
+	 * and only the shift for LINKSTATE, as they are the only ones
+	 * that change.  Also precalculate the 3 link states of interest
+	 * and the combined mask.
+	 */
+	dd->ibcs_ls_shift = IBA6110_IBCS_LINKSTATE_SHIFT;
+	dd->ibcs_lts_mask = IBA6110_IBCS_LINKTRAININGSTATE_MASK;
+	dd->ibcs_mask = (INFINIPATH_IBCS_LINKSTATE_MASK <<
+		dd->ibcs_ls_shift) | dd->ibcs_lts_mask;
+	dd->ib_init = (INFINIPATH_IBCS_LT_STATE_LINKUP <<
+		INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) |
+		(INFINIPATH_IBCS_L_STATE_INIT << dd->ibcs_ls_shift);
+	dd->ib_arm = (INFINIPATH_IBCS_LT_STATE_LINKUP <<
+		INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) |
+		(INFINIPATH_IBCS_L_STATE_ARM << dd->ibcs_ls_shift);
+	dd->ib_active = (INFINIPATH_IBCS_LT_STATE_LINKUP <<
+		INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) |
+		(INFINIPATH_IBCS_L_STATE_ACTIVE << dd->ibcs_ls_shift);
+
+	/*
+	 * Fill in data for ibcc field-values that change in newer chips.
+	 * We dynamically specify only the mask for LINKINITCMD
+	 * and only the shift for LINKCMD and MAXPKTLEN, as they are
+	 * the only ones that change.
+	 */
+	dd->ibcc_lic_mask = INFINIPATH_IBCC_LINKINITCMD_MASK;
+	dd->ibcc_lc_shift = INFINIPATH_IBCC_LINKCMD_SHIFT;
+	dd->ibcc_mpl_shift = INFINIPATH_IBCC_MAXPKTLEN_SHIFT;
+
+	/* Fill in shifts for RcvCtrl. */
+	dd->ipath_r_portenable_shift = INFINIPATH_R_PORTENABLE_SHIFT;
+	dd->ipath_r_intravail_shift = INFINIPATH_R_INTRAVAIL_SHIFT;
+	dd->ipath_r_tailupd_shift = INFINIPATH_R_TAILUPD_SHIFT;
+	dd->ipath_r_portcfg_shift = 0; /* Not on IBA6110 */
+
+	dd->ipath_i_bitsextant =
+		(INFINIPATH_I_RCVURG_MASK << INFINIPATH_I_RCVURG_SHIFT) |
+		(INFINIPATH_I_RCVAVAIL_MASK <<
+		 INFINIPATH_I_RCVAVAIL_SHIFT) |
+		INFINIPATH_I_ERROR | INFINIPATH_I_SPIOSENT |
+		INFINIPATH_I_SPIOBUFAVAIL | INFINIPATH_I_GPIO;
+
+	dd->ipath_e_bitsextant =
+		INFINIPATH_E_RFORMATERR | INFINIPATH_E_RVCRC |
+		INFINIPATH_E_RICRC | INFINIPATH_E_RMINPKTLEN |
+		INFINIPATH_E_RMAXPKTLEN | INFINIPATH_E_RLONGPKTLEN |
+		INFINIPATH_E_RSHORTPKTLEN | INFINIPATH_E_RUNEXPCHAR |
+		INFINIPATH_E_RUNSUPVL | INFINIPATH_E_REBP |
+		INFINIPATH_E_RIBFLOW | INFINIPATH_E_RBADVERSION |
+		INFINIPATH_E_RRCVEGRFULL | INFINIPATH_E_RRCVHDRFULL |
+		INFINIPATH_E_RBADTID | INFINIPATH_E_RHDRLEN |
+		INFINIPATH_E_RHDR | INFINIPATH_E_RIBLOSTLINK |
+		INFINIPATH_E_SMINPKTLEN | INFINIPATH_E_SMAXPKTLEN |
+		INFINIPATH_E_SUNDERRUN | INFINIPATH_E_SPKTLEN |
+		INFINIPATH_E_SDROPPEDSMPPKT | INFINIPATH_E_SDROPPEDDATAPKT |
+		INFINIPATH_E_SPIOARMLAUNCH | INFINIPATH_E_SUNEXPERRPKTNUM |
+		INFINIPATH_E_SUNSUPVL | INFINIPATH_E_IBSTATUSCHANGED |
+		INFINIPATH_E_INVALIDADDR | INFINIPATH_E_RESET |
+		INFINIPATH_E_HARDWARE;
+
+	dd->ipath_hwe_bitsextant =
+		(INFINIPATH_HWE_HTCMEMPARITYERR_MASK <<
+		 INFINIPATH_HWE_HTCMEMPARITYERR_SHIFT) |
+		(INFINIPATH_HWE_TXEMEMPARITYERR_MASK <<
+		 INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT) |
+		(INFINIPATH_HWE_RXEMEMPARITYERR_MASK <<
+		 INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT) |
+		INFINIPATH_HWE_HTCLNKABYTE0CRCERR |
+		INFINIPATH_HWE_HTCLNKABYTE1CRCERR |
+		INFINIPATH_HWE_HTCLNKBBYTE0CRCERR |
+		INFINIPATH_HWE_HTCLNKBBYTE1CRCERR |
+		INFINIPATH_HWE_HTCMISCERR4 |
+		INFINIPATH_HWE_HTCMISCERR5 | INFINIPATH_HWE_HTCMISCERR6 |
+		INFINIPATH_HWE_HTCMISCERR7 |
+		INFINIPATH_HWE_HTCBUSTREQPARITYERR |
+		INFINIPATH_HWE_HTCBUSTRESPPARITYERR |
+		INFINIPATH_HWE_HTCBUSIREQPARITYERR |
+		INFINIPATH_HWE_RXDSYNCMEMPARITYERR |
+		INFINIPATH_HWE_MEMBISTFAILED |
+		INFINIPATH_HWE_COREPLL_FBSLIP |
+		INFINIPATH_HWE_COREPLL_RFSLIP |
+		INFINIPATH_HWE_HTBPLL_FBSLIP |
+		INFINIPATH_HWE_HTBPLL_RFSLIP |
+		INFINIPATH_HWE_HTAPLL_FBSLIP |
+		INFINIPATH_HWE_HTAPLL_RFSLIP |
+		INFINIPATH_HWE_SERDESPLLFAILED |
+		INFINIPATH_HWE_IBCBUSTOSPCPARITYERR |
+		INFINIPATH_HWE_IBCBUSFRSPCPARITYERR;
+
+	dd->ipath_i_rcvavail_mask = INFINIPATH_I_RCVAVAIL_MASK;
+	dd->ipath_i_rcvurg_mask = INFINIPATH_I_RCVURG_MASK;
+	dd->ipath_i_rcvavail_shift = INFINIPATH_I_RCVAVAIL_SHIFT;
+	dd->ipath_i_rcvurg_shift = INFINIPATH_I_RCVURG_SHIFT;
+
+	/*
+	 * EEPROM error log 0 is TXE Parity errors. 1 is RXE Parity.
+	 * 2 is Some Misc, 3 is reserved for future.
+	 */
+	dd->ipath_eep_st_masks[0].hwerrs_to_log =
+		INFINIPATH_HWE_TXEMEMPARITYERR_MASK <<
+		INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT;
+
+	dd->ipath_eep_st_masks[1].hwerrs_to_log =
+		INFINIPATH_HWE_RXEMEMPARITYERR_MASK <<
+		INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT;
+
+	dd->ipath_eep_st_masks[2].errs_to_log = INFINIPATH_E_RESET;
+
+	dd->delay_mult = 2; /* SDR, 4X, can't change */
+
+	dd->ipath_link_width_supported = IB_WIDTH_1X | IB_WIDTH_4X;
+	dd->ipath_link_speed_supported = IPATH_IB_SDR;
+	dd->ipath_link_width_enabled = IB_WIDTH_4X;
+	dd->ipath_link_speed_enabled = dd->ipath_link_speed_supported;
+	/* these can't change for this chip, so set once */
+	dd->ipath_link_width_active = dd->ipath_link_width_enabled;
+	dd->ipath_link_speed_active = dd->ipath_link_speed_enabled;
+}
+
+/**
+ * ipath_ht_init_hwerrors - enable hardware errors
+ * @dd: the infinipath device
+ *
+ * now that we have finished initializing everything that might reasonably
+ * cause a hardware error, and cleared those errors bits as they occur,
+ * we can enable hardware errors in the mask (potentially enabling
+ * freeze mode), and enable hardware errors as errors (along with
+ * everything else) in errormask
+ */
+static void ipath_ht_init_hwerrors(struct ipath_devdata *dd)
+{
+	ipath_err_t val;
+	u64 extsval;
+
+	extsval = ipath_read_kreg64(dd, dd->ipath_kregs->kr_extstatus);
+
+	if (!(extsval & INFINIPATH_EXTS_MEMBIST_ENDTEST))
+		ipath_dev_err(dd, "MemBIST did not complete!\n");
+	if (extsval & INFINIPATH_EXTS_MEMBIST_CORRECT)
+		ipath_dbg("MemBIST corrected\n");
+
+	ipath_check_htlink(dd);
+
+	/* barring bugs, all hwerrors become interrupts, which can */
+	val = -1LL;
+	/* don't look at crc lane1 if 8 bit */
+	if (dd->ipath_flags & IPATH_8BIT_IN_HT0)
+		val &= ~infinipath_hwe_htclnkabyte1crcerr;
+	/* don't look at crc lane1 if 8 bit */
+	if (dd->ipath_flags & IPATH_8BIT_IN_HT1)
+		val &= ~infinipath_hwe_htclnkbbyte1crcerr;
+
+	/*
+	 * disable RXDSYNCMEMPARITY because external serdes is unused,
+	 * and therefore the logic will never be used or initialized,
+	 * and uninitialized state will normally result in this error
+	 * being asserted.  Similarly for the external serdess pll
+	 * lock signal.
+	 */
+	val &= ~(INFINIPATH_HWE_SERDESPLLFAILED |
+		 INFINIPATH_HWE_RXDSYNCMEMPARITYERR);
+
+	/*
+	 * Disable MISCERR4 because of an inversion in the HT core
+	 * logic checking for errors that cause this bit to be set.
+	 * The errata can also cause the protocol error bit to be set
+	 * in the HT config space linkerror register(s).
+	 */
+	val &= ~INFINIPATH_HWE_HTCMISCERR4;
+
+	/*
+	 * PLL ignored because unused MDIO interface has a logic problem
+	 */
+	if (dd->ipath_boardrev == 4 || dd->ipath_boardrev == 9)
+		val &= ~INFINIPATH_HWE_SERDESPLLFAILED;
+	dd->ipath_hwerrmask = val;
+}
+
+
+
+
+/**
+ * ipath_ht_bringup_serdes - bring up the serdes
+ * @dd: the infinipath device
+ */
+static int ipath_ht_bringup_serdes(struct ipath_devdata *dd)
+{
+	u64 val, config1;
+	int ret = 0, change = 0;
+
+	ipath_dbg("Trying to bringup serdes\n");
+
+	if (ipath_read_kreg64(dd, dd->ipath_kregs->kr_hwerrstatus) &
+	    INFINIPATH_HWE_SERDESPLLFAILED)
+	{
+		ipath_dbg("At start, serdes PLL failed bit set in "
+			  "hwerrstatus, clearing and continuing\n");
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear,
+				 INFINIPATH_HWE_SERDESPLLFAILED);
+	}
+
+	val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesconfig0);
+	config1 = ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesconfig1);
+
+	ipath_cdbg(VERBOSE, "Initial serdes status is config0=%llx "
+		   "config1=%llx, sstatus=%llx xgxs %llx\n",
+		   (unsigned long long) val, (unsigned long long) config1,
+		   (unsigned long long)
+		   ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesstatus),
+		   (unsigned long long)
+		   ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig));
+
+	/* force reset on */
+	val |= INFINIPATH_SERDC0_RESET_PLL
+		/* | INFINIPATH_SERDC0_RESET_MASK */
+		;
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig0, val);
+	udelay(15);		/* need pll reset set at least for a bit */
+
+	if (val & INFINIPATH_SERDC0_RESET_PLL) {
+		u64 val2 = val &= ~INFINIPATH_SERDC0_RESET_PLL;
+		/* set lane resets, and tx idle, during pll reset */
+		val2 |= INFINIPATH_SERDC0_RESET_MASK |
+			INFINIPATH_SERDC0_TXIDLE;
+		ipath_cdbg(VERBOSE, "Clearing serdes PLL reset (writing "
+			   "%llx)\n", (unsigned long long) val2);
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig0,
+				 val2);
+		/*
+		 * be sure chip saw it
+		 */
+		val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+		/*
+		 * need pll reset clear at least 11 usec before lane
+		 * resets cleared; give it a few more
+		 */
+		udelay(15);
+		val = val2;	/* for check below */
+	}
+
+	if (val & (INFINIPATH_SERDC0_RESET_PLL |
+		   INFINIPATH_SERDC0_RESET_MASK |
+		   INFINIPATH_SERDC0_TXIDLE)) {
+		val &= ~(INFINIPATH_SERDC0_RESET_PLL |
+			 INFINIPATH_SERDC0_RESET_MASK |
+			 INFINIPATH_SERDC0_TXIDLE);
+		/* clear them */
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig0,
+				 val);
+	}
+
+	val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig);
+	if (val & INFINIPATH_XGXS_RESET) {
+		/* normally true after boot */
+		val &= ~INFINIPATH_XGXS_RESET;
+		change = 1;
+	}
+	if (((val >> INFINIPATH_XGXS_RX_POL_SHIFT) &
+	     INFINIPATH_XGXS_RX_POL_MASK) != dd->ipath_rx_pol_inv ) {
+		/* need to compensate for Tx inversion in partner */
+		val &= ~(INFINIPATH_XGXS_RX_POL_MASK <<
+		         INFINIPATH_XGXS_RX_POL_SHIFT);
+		val |= dd->ipath_rx_pol_inv <<
+			INFINIPATH_XGXS_RX_POL_SHIFT;
+		change = 1;
+	}
+	if (change)
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_xgxsconfig, val);
+
+	val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesconfig0);
+
+	/* clear current and de-emphasis bits */
+	config1 &= ~0x0ffffffff00ULL;
+	/* set current to 20ma */
+	config1 |= 0x00000000000ULL;
+	/* set de-emphasis to -5.68dB */
+	config1 |= 0x0cccc000000ULL;
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig1, config1);
+
+	ipath_cdbg(VERBOSE, "After setup: serdes status is config0=%llx "
+		   "config1=%llx, sstatus=%llx xgxs %llx\n",
+		   (unsigned long long) val, (unsigned long long) config1,
+		   (unsigned long long)
+		   ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesstatus),
+		   (unsigned long long)
+		   ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig));
+
+	return ret;		/* for now, say we always succeeded */
+}
+
+/**
+ * ipath_ht_quiet_serdes - set serdes to txidle
+ * @dd: the infinipath device
+ * driver is being unloaded
+ */
+static void ipath_ht_quiet_serdes(struct ipath_devdata *dd)
+{
+	u64 val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesconfig0);
+
+	val |= INFINIPATH_SERDC0_TXIDLE;
+	ipath_dbg("Setting TxIdleEn on serdes (config0 = %llx)\n",
+		  (unsigned long long) val);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig0, val);
+}
+
+/**
+ * ipath_pe_put_tid - write a TID in chip
+ * @dd: the infinipath device
+ * @tidptr: pointer to the expected TID (in chip) to update
+ * @tidtype: RCVHQ_RCV_TYPE_EAGER (1) for eager, RCVHQ_RCV_TYPE_EXPECTED (0) for expected
+ * @pa: physical address of in memory buffer; ipath_tidinvalid if freeing
+ *
+ * This exists as a separate routine to allow for special locking etc.
+ * It's used for both the full cleanup on exit, as well as the normal
+ * setup and teardown.
+ */
+static void ipath_ht_put_tid(struct ipath_devdata *dd,
+			     u64 __iomem *tidptr, u32 type,
+			     unsigned long pa)
+{
+	if (!dd->ipath_kregbase)
+		return;
+
+	if (pa != dd->ipath_tidinvalid) {
+		if (unlikely((pa & ~INFINIPATH_RT_ADDR_MASK))) {
+			dev_info(&dd->pcidev->dev,
+				 "physaddr %lx has more than "
+				 "40 bits, using only 40!!!\n", pa);
+			pa &= INFINIPATH_RT_ADDR_MASK;
+		}
+		if (type == RCVHQ_RCV_TYPE_EAGER)
+			pa |= dd->ipath_tidtemplate;
+		else {
+			/* in words (fixed, full page).  */
+			u64 lenvalid = PAGE_SIZE >> 2;
+			lenvalid <<= INFINIPATH_RT_BUFSIZE_SHIFT;
+			pa |= lenvalid | INFINIPATH_RT_VALID;
+		}
+	}
+
+	writeq(pa, tidptr);
+}
+
+
+/**
+ * ipath_ht_clear_tid - clear all TID entries for a port, expected and eager
+ * @dd: the infinipath device
+ * @port: the port
+ *
+ * Used from ipath_close(), and at chip initialization.
+ */
+static void ipath_ht_clear_tids(struct ipath_devdata *dd, unsigned port)
+{
+	u64 __iomem *tidbase;
+	int i;
+
+	if (!dd->ipath_kregbase)
+		return;
+
+	ipath_cdbg(VERBOSE, "Invalidate TIDs for port %u\n", port);
+
+	/*
+	 * need to invalidate all of the expected TID entries for this
+	 * port, so we don't have valid entries that might somehow get
+	 * used (early in next use of this port, or through some bug)
+	 */
+	tidbase = (u64 __iomem *) ((char __iomem *)(dd->ipath_kregbase) +
+				   dd->ipath_rcvtidbase +
+				   port * dd->ipath_rcvtidcnt *
+				   sizeof(*tidbase));
+	for (i = 0; i < dd->ipath_rcvtidcnt; i++)
+		ipath_ht_put_tid(dd, &tidbase[i], RCVHQ_RCV_TYPE_EXPECTED,
+				 dd->ipath_tidinvalid);
+
+	tidbase = (u64 __iomem *) ((char __iomem *)(dd->ipath_kregbase) +
+				   dd->ipath_rcvegrbase +
+				   port * dd->ipath_rcvegrcnt *
+				   sizeof(*tidbase));
+
+	for (i = 0; i < dd->ipath_rcvegrcnt; i++)
+		ipath_ht_put_tid(dd, &tidbase[i], RCVHQ_RCV_TYPE_EAGER,
+				 dd->ipath_tidinvalid);
+}
+
+/**
+ * ipath_ht_tidtemplate - setup constants for TID updates
+ * @dd: the infinipath device
+ *
+ * We setup stuff that we use a lot, to avoid calculating each time
+ */
+static void ipath_ht_tidtemplate(struct ipath_devdata *dd)
+{
+	dd->ipath_tidtemplate = dd->ipath_ibmaxlen >> 2;
+	dd->ipath_tidtemplate <<= INFINIPATH_RT_BUFSIZE_SHIFT;
+	dd->ipath_tidtemplate |= INFINIPATH_RT_VALID;
+
+	/*
+	 * work around chip errata bug 7358, by marking invalid tids
+	 * as having max length
+	 */
+	dd->ipath_tidinvalid = (-1LL & INFINIPATH_RT_BUFSIZE_MASK) <<
+		INFINIPATH_RT_BUFSIZE_SHIFT;
+}
+
+static int ipath_ht_early_init(struct ipath_devdata *dd)
+{
+	u32 __iomem *piobuf;
+	u32 pioincr, val32;
+	int i;
+
+	/*
+	 * one cache line; long IB headers will spill over into received
+	 * buffer
+	 */
+	dd->ipath_rcvhdrentsize = 16;
+	dd->ipath_rcvhdrsize = IPATH_DFLT_RCVHDRSIZE;
+
+	/*
+	 * For HT, we allocate a somewhat overly large eager buffer,
+	 * such that we can guarantee that we can receive the largest
+	 * packet that we can send out.  To truly support a 4KB MTU,
+	 * we need to bump this to a large value.  To date, other than
+	 * testing, we have never encountered an HCA that can really
+	 * send 4KB MTU packets, so we do not handle that (we'll get
+	 * errors interrupts if we ever see one).
+	 */
+	dd->ipath_rcvegrbufsize = dd->ipath_piosize2k;
+
+	/*
+	 * the min() check here is currently a nop, but it may not
+	 * always be, depending on just how we do ipath_rcvegrbufsize
+	 */
+	dd->ipath_ibmaxlen = min(dd->ipath_piosize2k,
+				 dd->ipath_rcvegrbufsize);
+	dd->ipath_init_ibmaxlen = dd->ipath_ibmaxlen;
+	ipath_ht_tidtemplate(dd);
+
+	/*
+	 * zero all the TID entries at startup.  We do this for sanity,
+	 * in case of a previous driver crash of some kind, and also
+	 * because the chip powers up with these memories in an unknown
+	 * state.  Use portcnt, not cfgports, since this is for the
+	 * full chip, not for current (possibly different) configuration
+	 * value.
+	 * Chip Errata bug 6447
+	 */
+	for (val32 = 0; val32 < dd->ipath_portcnt; val32++)
+		ipath_ht_clear_tids(dd, val32);
+
+	/*
+	 * write the pbc of each buffer, to be sure it's initialized, then
+	 * cancel all the buffers, and also abort any packets that might
+	 * have been in flight for some reason (the latter is for driver
+	 * unload/reload, but isn't a bad idea at first init).	PIO send
+	 * isn't enabled at this point, so there is no danger of sending
+	 * these out on the wire.
+	 * Chip Errata bug 6610
+	 */
+	piobuf = (u32 __iomem *) (((char __iomem *)(dd->ipath_kregbase)) +
+				  dd->ipath_piobufbase);
+	pioincr = dd->ipath_palign / sizeof(*piobuf);
+	for (i = 0; i < dd->ipath_piobcnt2k; i++) {
+		/*
+		 * reasonable word count, just to init pbc
+		 */
+		writel(16, piobuf);
+		piobuf += pioincr;
+	}
+
+	ipath_get_eeprom_info(dd);
+	if (dd->ipath_boardrev == 5) {
+		/*
+		 * Later production QHT7040 has same changes as QHT7140, so
+		 * can use GPIO interrupts.  They have serial #'s starting
+		 * with 128, rather than 112.
+		 */
+		if (dd->ipath_serial[0] == '1' &&
+		    dd->ipath_serial[1] == '2' &&
+		    dd->ipath_serial[2] == '8')
+			dd->ipath_flags |= IPATH_GPIO_INTR;
+		else {
+			ipath_dev_err(dd, "Unsupported InfiniPath board "
+				"(serial number %.16s)!\n",
+				dd->ipath_serial);
+			return 1;
+		}
+	}
+
+	if (dd->ipath_minrev >= 4) {
+		/* Rev4+ reports extra errors via internal GPIO pins */
+		dd->ipath_flags |= IPATH_GPIO_ERRINTRS;
+		dd->ipath_gpio_mask |= IPATH_GPIO_ERRINTR_MASK;
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_mask,
+				 dd->ipath_gpio_mask);
+	}
+
+	return 0;
+}
+
+
+/**
+ * ipath_init_ht_get_base_info - set chip-specific flags for user code
+ * @dd: the infinipath device
+ * @kbase: ipath_base_info pointer
+ *
+ * We set the PCIE flag because the lower bandwidth on PCIe vs
+ * HyperTransport can affect some user packet algorithms.
+ */
+static int ipath_ht_get_base_info(struct ipath_portdata *pd, void *kbase)
+{
+	struct ipath_base_info *kinfo = kbase;
+
+	kinfo->spi_runtime_flags |= IPATH_RUNTIME_HT |
+		IPATH_RUNTIME_PIO_REGSWAPPED;
+
+	if (pd->port_dd->ipath_minrev < 4)
+		kinfo->spi_runtime_flags |= IPATH_RUNTIME_RCVHDR_COPY;
+
+	return 0;
+}
+
+static void ipath_ht_free_irq(struct ipath_devdata *dd)
+{
+	free_irq(dd->ipath_irq, dd);
+	ht_destroy_irq(dd->ipath_irq);
+	dd->ipath_irq = 0;
+	dd->ipath_intconfig = 0;
+}
+
+static struct ipath_message_header *
+ipath_ht_get_msgheader(struct ipath_devdata *dd, __le32 *rhf_addr)
+{
+	return (struct ipath_message_header *)
+		&rhf_addr[sizeof(u64) / sizeof(u32)];
+}
+
+static void ipath_ht_config_ports(struct ipath_devdata *dd, ushort cfgports)
+{
+	dd->ipath_portcnt =
+		ipath_read_kreg32(dd, dd->ipath_kregs->kr_portcnt);
+	dd->ipath_p0_rcvegrcnt =
+		ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvegrcnt);
+}
+
+static void ipath_ht_read_counters(struct ipath_devdata *dd,
+				   struct infinipath_counters *cntrs)
+{
+	cntrs->LBIntCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(LBIntCnt));
+	cntrs->LBFlowStallCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(LBFlowStallCnt));
+	cntrs->TxSDmaDescCnt = 0;
+	cntrs->TxUnsupVLErrCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxUnsupVLErrCnt));
+	cntrs->TxDataPktCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxDataPktCnt));
+	cntrs->TxFlowPktCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxFlowPktCnt));
+	cntrs->TxDwordCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxDwordCnt));
+	cntrs->TxLenErrCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxLenErrCnt));
+	cntrs->TxMaxMinLenErrCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxMaxMinLenErrCnt));
+	cntrs->TxUnderrunCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxUnderrunCnt));
+	cntrs->TxFlowStallCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxFlowStallCnt));
+	cntrs->TxDroppedPktCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxDroppedPktCnt));
+	cntrs->RxDroppedPktCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxDroppedPktCnt));
+	cntrs->RxDataPktCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxDataPktCnt));
+	cntrs->RxFlowPktCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxFlowPktCnt));
+	cntrs->RxDwordCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxDwordCnt));
+	cntrs->RxLenErrCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxLenErrCnt));
+	cntrs->RxMaxMinLenErrCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxMaxMinLenErrCnt));
+	cntrs->RxICRCErrCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxICRCErrCnt));
+	cntrs->RxVCRCErrCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxVCRCErrCnt));
+	cntrs->RxFlowCtrlErrCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxFlowCtrlErrCnt));
+	cntrs->RxBadFormatCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxBadFormatCnt));
+	cntrs->RxLinkProblemCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxLinkProblemCnt));
+	cntrs->RxEBPCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxEBPCnt));
+	cntrs->RxLPCRCErrCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxLPCRCErrCnt));
+	cntrs->RxBufOvflCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxBufOvflCnt));
+	cntrs->RxTIDFullErrCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxTIDFullErrCnt));
+	cntrs->RxTIDValidErrCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxTIDValidErrCnt));
+	cntrs->RxPKeyMismatchCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxPKeyMismatchCnt));
+	cntrs->RxP0HdrEgrOvflCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP0HdrEgrOvflCnt));
+	cntrs->RxP1HdrEgrOvflCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP1HdrEgrOvflCnt));
+	cntrs->RxP2HdrEgrOvflCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP2HdrEgrOvflCnt));
+	cntrs->RxP3HdrEgrOvflCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP3HdrEgrOvflCnt));
+	cntrs->RxP4HdrEgrOvflCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP4HdrEgrOvflCnt));
+	cntrs->RxP5HdrEgrOvflCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP5HdrEgrOvflCnt));
+	cntrs->RxP6HdrEgrOvflCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP6HdrEgrOvflCnt));
+	cntrs->RxP7HdrEgrOvflCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP7HdrEgrOvflCnt));
+	cntrs->RxP8HdrEgrOvflCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP8HdrEgrOvflCnt));
+	cntrs->RxP9HdrEgrOvflCnt = 0;
+	cntrs->RxP10HdrEgrOvflCnt = 0;
+	cntrs->RxP11HdrEgrOvflCnt = 0;
+	cntrs->RxP12HdrEgrOvflCnt = 0;
+	cntrs->RxP13HdrEgrOvflCnt = 0;
+	cntrs->RxP14HdrEgrOvflCnt = 0;
+	cntrs->RxP15HdrEgrOvflCnt = 0;
+	cntrs->RxP16HdrEgrOvflCnt = 0;
+	cntrs->IBStatusChangeCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(IBStatusChangeCnt));
+	cntrs->IBLinkErrRecoveryCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(IBLinkErrRecoveryCnt));
+	cntrs->IBLinkDownedCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(IBLinkDownedCnt));
+	cntrs->IBSymbolErrCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(IBSymbolErrCnt));
+	cntrs->RxVL15DroppedPktCnt = 0;
+	cntrs->RxOtherLocalPhyErrCnt = 0;
+	cntrs->PcieRetryBufDiagQwordCnt = 0;
+	cntrs->ExcessBufferOvflCnt = dd->ipath_overrun_thresh_errs;
+	cntrs->LocalLinkIntegrityErrCnt =
+		(dd->ipath_flags & IPATH_GPIO_ERRINTRS) ?
+		dd->ipath_lli_errs : dd->ipath_lli_errors;
+	cntrs->RxVlErrCnt = 0;
+	cntrs->RxDlidFltrCnt = 0;
+}
+
+
+/* no interrupt fallback for these chips */
+static int ipath_ht_nointr_fallback(struct ipath_devdata *dd)
+{
+	return 0;
+}
+
+
+/*
+ * reset the XGXS (between serdes and IBC).  Slightly less intrusive
+ * than resetting the IBC or external link state, and useful in some
+ * cases to cause some retraining.  To do this right, we reset IBC
+ * as well.
+ */
+static void ipath_ht_xgxs_reset(struct ipath_devdata *dd)
+{
+	u64 val, prev_val;
+
+	prev_val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig);
+	val = prev_val | INFINIPATH_XGXS_RESET;
+	prev_val &= ~INFINIPATH_XGXS_RESET; /* be sure */
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_control,
+			 dd->ipath_control & ~INFINIPATH_C_LINKENABLE);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_xgxsconfig, val);
+	ipath_read_kreg32(dd, dd->ipath_kregs->kr_scratch);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_xgxsconfig, prev_val);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_control,
+			 dd->ipath_control);
+}
+
+
+static int ipath_ht_get_ib_cfg(struct ipath_devdata *dd, int which)
+{
+	int ret;
+
+	switch (which) {
+	case IPATH_IB_CFG_LWID:
+		ret = dd->ipath_link_width_active;
+		break;
+	case IPATH_IB_CFG_SPD:
+		ret = dd->ipath_link_speed_active;
+		break;
+	case IPATH_IB_CFG_LWID_ENB:
+		ret = dd->ipath_link_width_enabled;
+		break;
+	case IPATH_IB_CFG_SPD_ENB:
+		ret = dd->ipath_link_speed_enabled;
+		break;
+	default:
+		ret =  -ENOTSUPP;
+		break;
+	}
+	return ret;
+}
+
+
+/* we assume range checking is already done, if needed */
+static int ipath_ht_set_ib_cfg(struct ipath_devdata *dd, int which, u32 val)
+{
+	int ret = 0;
+
+	if (which == IPATH_IB_CFG_LWID_ENB)
+		dd->ipath_link_width_enabled = val;
+	else if (which == IPATH_IB_CFG_SPD_ENB)
+		dd->ipath_link_speed_enabled = val;
+	else
+		ret = -ENOTSUPP;
+	return ret;
+}
+
+
+static void ipath_ht_config_jint(struct ipath_devdata *dd, u16 a, u16 b)
+{
+}
+
+
+static int ipath_ht_ib_updown(struct ipath_devdata *dd, int ibup, u64 ibcs)
+{
+	ipath_setup_ht_setextled(dd, ipath_ib_linkstate(dd, ibcs),
+		ipath_ib_linktrstate(dd, ibcs));
+	return 0;
+}
+
+
+/**
+ * ipath_init_iba6110_funcs - set up the chip-specific function pointers
+ * @dd: the infinipath device
+ *
+ * This is global, and is called directly at init to set up the
+ * chip-specific function pointers for later use.
+ */
+void ipath_init_iba6110_funcs(struct ipath_devdata *dd)
+{
+	dd->ipath_f_intrsetup = ipath_ht_intconfig;
+	dd->ipath_f_bus = ipath_setup_ht_config;
+	dd->ipath_f_reset = ipath_setup_ht_reset;
+	dd->ipath_f_get_boardname = ipath_ht_boardname;
+	dd->ipath_f_init_hwerrors = ipath_ht_init_hwerrors;
+	dd->ipath_f_early_init = ipath_ht_early_init;
+	dd->ipath_f_handle_hwerrors = ipath_ht_handle_hwerrors;
+	dd->ipath_f_quiet_serdes = ipath_ht_quiet_serdes;
+	dd->ipath_f_bringup_serdes = ipath_ht_bringup_serdes;
+	dd->ipath_f_clear_tids = ipath_ht_clear_tids;
+	dd->ipath_f_put_tid = ipath_ht_put_tid;
+	dd->ipath_f_cleanup = ipath_setup_ht_cleanup;
+	dd->ipath_f_setextled = ipath_setup_ht_setextled;
+	dd->ipath_f_get_base_info = ipath_ht_get_base_info;
+	dd->ipath_f_free_irq = ipath_ht_free_irq;
+	dd->ipath_f_tidtemplate = ipath_ht_tidtemplate;
+	dd->ipath_f_intr_fallback = ipath_ht_nointr_fallback;
+	dd->ipath_f_get_msgheader = ipath_ht_get_msgheader;
+	dd->ipath_f_config_ports = ipath_ht_config_ports;
+	dd->ipath_f_read_counters = ipath_ht_read_counters;
+	dd->ipath_f_xgxs_reset = ipath_ht_xgxs_reset;
+	dd->ipath_f_get_ib_cfg = ipath_ht_get_ib_cfg;
+	dd->ipath_f_set_ib_cfg = ipath_ht_set_ib_cfg;
+	dd->ipath_f_config_jint = ipath_ht_config_jint;
+	dd->ipath_f_ib_updown = ipath_ht_ib_updown;
+
+	/*
+	 * initialize chip-specific variables
+	 */
+	ipath_init_ht_variables(dd);
+}
diff --git a/drivers/infiniband/hw/ipath/ipath_init_chip.c b/drivers/infiniband/hw/ipath/ipath_init_chip.c
new file mode 100644
index 0000000..be2a60e
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_init_chip.c
@@ -0,0 +1,1066 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/pci.h>
+#include <linux/netdevice.h>
+#include <linux/moduleparam.h>
+#include <linux/slab.h>
+#include <linux/stat.h>
+#include <linux/vmalloc.h>
+
+#include "ipath_kernel.h"
+#include "ipath_common.h"
+
+/*
+ * min buffers we want to have per port, after driver
+ */
+#define IPATH_MIN_USER_PORT_BUFCNT 7
+
+/*
+ * Number of ports we are configured to use (to allow for more pio
+ * buffers per port, etc.)  Zero means use chip value.
+ */
+static ushort ipath_cfgports;
+
+module_param_named(cfgports, ipath_cfgports, ushort, S_IRUGO);
+MODULE_PARM_DESC(cfgports, "Set max number of ports to use");
+
+/*
+ * Number of buffers reserved for driver (verbs and layered drivers.)
+ * Initialized based on number of PIO buffers if not set via module interface.
+ * The problem with this is that it's global, but we'll use different
+ * numbers for different chip types.
+ */
+static ushort ipath_kpiobufs;
+
+static int ipath_set_kpiobufs(const char *val, struct kernel_param *kp);
+
+module_param_call(kpiobufs, ipath_set_kpiobufs, param_get_ushort,
+		  &ipath_kpiobufs, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(kpiobufs, "Set number of PIO buffers for driver");
+
+/**
+ * create_port0_egr - allocate the eager TID buffers
+ * @dd: the infinipath device
+ *
+ * This code is now quite different for user and kernel, because
+ * the kernel uses skb's, for the accelerated network performance.
+ * This is the kernel (port0) version.
+ *
+ * Allocate the eager TID buffers and program them into infinipath.
+ * We use the network layer alloc_skb() allocator to allocate the
+ * memory, and either use the buffers as is for things like verbs
+ * packets, or pass the buffers up to the ipath layered driver and
+ * thence the network layer, replacing them as we do so (see
+ * ipath_rcv_layer()).
+ */
+static int create_port0_egr(struct ipath_devdata *dd)
+{
+	unsigned e, egrcnt;
+	struct ipath_skbinfo *skbinfo;
+	int ret;
+
+	egrcnt = dd->ipath_p0_rcvegrcnt;
+
+	skbinfo = vmalloc(sizeof(*dd->ipath_port0_skbinfo) * egrcnt);
+	if (skbinfo == NULL) {
+		ipath_dev_err(dd, "allocation error for eager TID "
+			      "skb array\n");
+		ret = -ENOMEM;
+		goto bail;
+	}
+	for (e = 0; e < egrcnt; e++) {
+		/*
+		 * This is a bit tricky in that we allocate extra
+		 * space for 2 bytes of the 14 byte ethernet header.
+		 * These two bytes are passed in the ipath header so
+		 * the rest of the data is word aligned.  We allocate
+		 * 4 bytes so that the data buffer stays word aligned.
+		 * See ipath_kreceive() for more details.
+		 */
+		skbinfo[e].skb = ipath_alloc_skb(dd, GFP_KERNEL);
+		if (!skbinfo[e].skb) {
+			ipath_dev_err(dd, "SKB allocation error for "
+				      "eager TID %u\n", e);
+			while (e != 0)
+				dev_kfree_skb(skbinfo[--e].skb);
+			vfree(skbinfo);
+			ret = -ENOMEM;
+			goto bail;
+		}
+	}
+	/*
+	 * After loop above, so we can test non-NULL to see if ready
+	 * to use at receive, etc.
+	 */
+	dd->ipath_port0_skbinfo = skbinfo;
+
+	for (e = 0; e < egrcnt; e++) {
+		dd->ipath_port0_skbinfo[e].phys =
+		  ipath_map_single(dd->pcidev,
+				   dd->ipath_port0_skbinfo[e].skb->data,
+				   dd->ipath_ibmaxlen, PCI_DMA_FROMDEVICE);
+		dd->ipath_f_put_tid(dd, e + (u64 __iomem *)
+				    ((char __iomem *) dd->ipath_kregbase +
+				     dd->ipath_rcvegrbase),
+				    RCVHQ_RCV_TYPE_EAGER,
+				    dd->ipath_port0_skbinfo[e].phys);
+	}
+
+	ret = 0;
+
+bail:
+	return ret;
+}
+
+static int bringup_link(struct ipath_devdata *dd)
+{
+	u64 val, ibc;
+	int ret = 0;
+
+	/* hold IBC in reset */
+	dd->ipath_control &= ~INFINIPATH_C_LINKENABLE;
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_control,
+			 dd->ipath_control);
+
+	/*
+	 * set initial max size pkt IBC will send, including ICRC; it's the
+	 * PIO buffer size in dwords, less 1; also see ipath_set_mtu()
+	 */
+	val = (dd->ipath_ibmaxlen >> 2) + 1;
+	ibc = val << dd->ibcc_mpl_shift;
+
+	/* flowcontrolwatermark is in units of KBytes */
+	ibc |= 0x5ULL << INFINIPATH_IBCC_FLOWCTRLWATERMARK_SHIFT;
+	/*
+	 * How often flowctrl sent.  More or less in usecs; balance against
+	 * watermark value, so that in theory senders always get a flow
+	 * control update in time to not let the IB link go idle.
+	 */
+	ibc |= 0x3ULL << INFINIPATH_IBCC_FLOWCTRLPERIOD_SHIFT;
+	/* max error tolerance */
+	ibc |= 0xfULL << INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT;
+	/* use "real" buffer space for */
+	ibc |= 4ULL << INFINIPATH_IBCC_CREDITSCALE_SHIFT;
+	/* IB credit flow control. */
+	ibc |= 0xfULL << INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT;
+	/* initially come up waiting for TS1, without sending anything. */
+	dd->ipath_ibcctrl = ibc;
+	/*
+	 * Want to start out with both LINKCMD and LINKINITCMD in NOP
+	 * (0 and 0).  Don't put linkinitcmd in ipath_ibcctrl, want that
+	 * to stay a NOP. Flag that we are disabled, for the (unlikely)
+	 * case that some recovery path is trying to bring the link up
+	 * before we are ready.
+	 */
+	ibc |= INFINIPATH_IBCC_LINKINITCMD_DISABLE <<
+		INFINIPATH_IBCC_LINKINITCMD_SHIFT;
+	dd->ipath_flags |= IPATH_IB_LINK_DISABLED;
+	ipath_cdbg(VERBOSE, "Writing 0x%llx to ibcctrl\n",
+		   (unsigned long long) ibc);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl, ibc);
+
+	// be sure chip saw it
+	val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+
+	ret = dd->ipath_f_bringup_serdes(dd);
+
+	if (ret)
+		dev_info(&dd->pcidev->dev, "Could not initialize SerDes, "
+			 "not usable\n");
+	else {
+		/* enable IBC */
+		dd->ipath_control |= INFINIPATH_C_LINKENABLE;
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_control,
+				 dd->ipath_control);
+	}
+
+	return ret;
+}
+
+static struct ipath_portdata *create_portdata0(struct ipath_devdata *dd)
+{
+	struct ipath_portdata *pd = NULL;
+
+	pd = kzalloc(sizeof(*pd), GFP_KERNEL);
+	if (pd) {
+		pd->port_dd = dd;
+		pd->port_cnt = 1;
+		/* The port 0 pkey table is used by the layer interface. */
+		pd->port_pkeys[0] = IPATH_DEFAULT_P_KEY;
+		pd->port_seq_cnt = 1;
+	}
+	return pd;
+}
+
+static int init_chip_first(struct ipath_devdata *dd)
+{
+	struct ipath_portdata *pd;
+	int ret = 0;
+	u64 val;
+
+	spin_lock_init(&dd->ipath_kernel_tid_lock);
+	spin_lock_init(&dd->ipath_user_tid_lock);
+	spin_lock_init(&dd->ipath_sendctrl_lock);
+	spin_lock_init(&dd->ipath_uctxt_lock);
+	spin_lock_init(&dd->ipath_sdma_lock);
+	spin_lock_init(&dd->ipath_gpio_lock);
+	spin_lock_init(&dd->ipath_eep_st_lock);
+	spin_lock_init(&dd->ipath_sdepb_lock);
+	mutex_init(&dd->ipath_eep_lock);
+
+	/*
+	 * skip cfgports stuff because we are not allocating memory,
+	 * and we don't want problems if the portcnt changed due to
+	 * cfgports.  We do still check and report a difference, if
+	 * not same (should be impossible).
+	 */
+	dd->ipath_f_config_ports(dd, ipath_cfgports);
+	if (!ipath_cfgports)
+		dd->ipath_cfgports = dd->ipath_portcnt;
+	else if (ipath_cfgports <= dd->ipath_portcnt) {
+		dd->ipath_cfgports = ipath_cfgports;
+		ipath_dbg("Configured to use %u ports out of %u in chip\n",
+			  dd->ipath_cfgports, ipath_read_kreg32(dd,
+			  dd->ipath_kregs->kr_portcnt));
+	} else {
+		dd->ipath_cfgports = dd->ipath_portcnt;
+		ipath_dbg("Tried to configured to use %u ports; chip "
+			  "only supports %u\n", ipath_cfgports,
+			  ipath_read_kreg32(dd,
+				  dd->ipath_kregs->kr_portcnt));
+	}
+	/*
+	 * Allocate full portcnt array, rather than just cfgports, because
+	 * cleanup iterates across all possible ports.
+	 */
+	dd->ipath_pd = kzalloc(sizeof(*dd->ipath_pd) * dd->ipath_portcnt,
+			       GFP_KERNEL);
+
+	if (!dd->ipath_pd) {
+		ipath_dev_err(dd, "Unable to allocate portdata array, "
+			      "failing\n");
+		ret = -ENOMEM;
+		goto done;
+	}
+
+	pd = create_portdata0(dd);
+	if (!pd) {
+		ipath_dev_err(dd, "Unable to allocate portdata for port "
+			      "0, failing\n");
+		ret = -ENOMEM;
+		goto done;
+	}
+	dd->ipath_pd[0] = pd;
+
+	dd->ipath_rcvtidcnt =
+		ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvtidcnt);
+	dd->ipath_rcvtidbase =
+		ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvtidbase);
+	dd->ipath_rcvegrcnt =
+		ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvegrcnt);
+	dd->ipath_rcvegrbase =
+		ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvegrbase);
+	dd->ipath_palign =
+		ipath_read_kreg32(dd, dd->ipath_kregs->kr_pagealign);
+	dd->ipath_piobufbase =
+		ipath_read_kreg64(dd, dd->ipath_kregs->kr_sendpiobufbase);
+	val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_sendpiosize);
+	dd->ipath_piosize2k = val & ~0U;
+	dd->ipath_piosize4k = val >> 32;
+	if (dd->ipath_piosize4k == 0 && ipath_mtu4096)
+		ipath_mtu4096 = 0; /* 4KB not supported by this chip */
+	dd->ipath_ibmtu = ipath_mtu4096 ? 4096 : 2048;
+	val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_sendpiobufcnt);
+	dd->ipath_piobcnt2k = val & ~0U;
+	dd->ipath_piobcnt4k = val >> 32;
+	dd->ipath_pio2kbase =
+		(u32 __iomem *) (((char __iomem *) dd->ipath_kregbase) +
+				 (dd->ipath_piobufbase & 0xffffffff));
+	if (dd->ipath_piobcnt4k) {
+		dd->ipath_pio4kbase = (u32 __iomem *)
+			(((char __iomem *) dd->ipath_kregbase) +
+			 (dd->ipath_piobufbase >> 32));
+		/*
+		 * 4K buffers take 2 pages; we use roundup just to be
+		 * paranoid; we calculate it once here, rather than on
+		 * ever buf allocate
+		 */
+		dd->ipath_4kalign = ALIGN(dd->ipath_piosize4k,
+					  dd->ipath_palign);
+		ipath_dbg("%u 2k(%x) piobufs @ %p, %u 4k(%x) @ %p "
+			  "(%x aligned)\n",
+			  dd->ipath_piobcnt2k, dd->ipath_piosize2k,
+			  dd->ipath_pio2kbase, dd->ipath_piobcnt4k,
+			  dd->ipath_piosize4k, dd->ipath_pio4kbase,
+			  dd->ipath_4kalign);
+	}
+	else ipath_dbg("%u 2k piobufs @ %p\n",
+		       dd->ipath_piobcnt2k, dd->ipath_pio2kbase);
+
+done:
+	return ret;
+}
+
+/**
+ * init_chip_reset - re-initialize after a reset, or enable
+ * @dd: the infinipath device
+ *
+ * sanity check at least some of the values after reset, and
+ * ensure no receive or transmit (explicitly, in case reset
+ * failed
+ */
+static int init_chip_reset(struct ipath_devdata *dd)
+{
+	u32 rtmp;
+	int i;
+	unsigned long flags;
+
+	/*
+	 * ensure chip does no sends or receives, tail updates, or
+	 * pioavail updates while we re-initialize
+	 */
+	dd->ipath_rcvctrl &= ~(1ULL << dd->ipath_r_tailupd_shift);
+	for (i = 0; i < dd->ipath_portcnt; i++) {
+		clear_bit(dd->ipath_r_portenable_shift + i,
+			  &dd->ipath_rcvctrl);
+		clear_bit(dd->ipath_r_intravail_shift + i,
+			  &dd->ipath_rcvctrl);
+	}
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
+		dd->ipath_rcvctrl);
+
+	spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+	dd->ipath_sendctrl = 0U; /* no sdma, etc */
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl);
+	ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+	spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_control, 0ULL);
+
+	rtmp = ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvtidcnt);
+	if (rtmp != dd->ipath_rcvtidcnt)
+		dev_info(&dd->pcidev->dev, "tidcnt was %u before "
+			 "reset, now %u, using original\n",
+			 dd->ipath_rcvtidcnt, rtmp);
+	rtmp = ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvtidbase);
+	if (rtmp != dd->ipath_rcvtidbase)
+		dev_info(&dd->pcidev->dev, "tidbase was %u before "
+			 "reset, now %u, using original\n",
+			 dd->ipath_rcvtidbase, rtmp);
+	rtmp = ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvegrcnt);
+	if (rtmp != dd->ipath_rcvegrcnt)
+		dev_info(&dd->pcidev->dev, "egrcnt was %u before "
+			 "reset, now %u, using original\n",
+			 dd->ipath_rcvegrcnt, rtmp);
+	rtmp = ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvegrbase);
+	if (rtmp != dd->ipath_rcvegrbase)
+		dev_info(&dd->pcidev->dev, "egrbase was %u before "
+			 "reset, now %u, using original\n",
+			 dd->ipath_rcvegrbase, rtmp);
+
+	return 0;
+}
+
+static int init_pioavailregs(struct ipath_devdata *dd)
+{
+	int ret;
+
+	dd->ipath_pioavailregs_dma = dma_alloc_coherent(
+		&dd->pcidev->dev, PAGE_SIZE, &dd->ipath_pioavailregs_phys,
+		GFP_KERNEL);
+	if (!dd->ipath_pioavailregs_dma) {
+		ipath_dev_err(dd, "failed to allocate PIOavail reg area "
+			      "in memory\n");
+		ret = -ENOMEM;
+		goto done;
+	}
+
+	/*
+	 * we really want L2 cache aligned, but for current CPUs of
+	 * interest, they are the same.
+	 */
+	dd->ipath_statusp = (u64 *)
+		((char *)dd->ipath_pioavailregs_dma +
+		 ((2 * L1_CACHE_BYTES +
+		   dd->ipath_pioavregs * sizeof(u64)) & ~L1_CACHE_BYTES));
+	/* copy the current value now that it's really allocated */
+	*dd->ipath_statusp = dd->_ipath_status;
+	/*
+	 * setup buffer to hold freeze msg, accessible to apps,
+	 * following statusp
+	 */
+	dd->ipath_freezemsg = (char *)&dd->ipath_statusp[1];
+	/* and its length */
+	dd->ipath_freezelen = L1_CACHE_BYTES - sizeof(dd->ipath_statusp[0]);
+
+	ret = 0;
+
+done:
+	return ret;
+}
+
+/**
+ * init_shadow_tids - allocate the shadow TID array
+ * @dd: the infinipath device
+ *
+ * allocate the shadow TID array, so we can ipath_munlock previous
+ * entries.  It may make more sense to move the pageshadow to the
+ * port data structure, so we only allocate memory for ports actually
+ * in use, since we at 8k per port, now.
+ */
+static void init_shadow_tids(struct ipath_devdata *dd)
+{
+	struct page **pages;
+	dma_addr_t *addrs;
+
+	pages = vzalloc(dd->ipath_cfgports * dd->ipath_rcvtidcnt *
+			sizeof(struct page *));
+	if (!pages) {
+		ipath_dev_err(dd, "failed to allocate shadow page * "
+			      "array, no expected sends!\n");
+		dd->ipath_pageshadow = NULL;
+		return;
+	}
+
+	addrs = vmalloc(dd->ipath_cfgports * dd->ipath_rcvtidcnt *
+			sizeof(dma_addr_t));
+	if (!addrs) {
+		ipath_dev_err(dd, "failed to allocate shadow dma handle "
+			      "array, no expected sends!\n");
+		vfree(pages);
+		dd->ipath_pageshadow = NULL;
+		return;
+	}
+
+	dd->ipath_pageshadow = pages;
+	dd->ipath_physshadow = addrs;
+}
+
+static void enable_chip(struct ipath_devdata *dd, int reinit)
+{
+	u32 val;
+	u64 rcvmask;
+	unsigned long flags;
+	int i;
+
+	if (!reinit)
+		init_waitqueue_head(&ipath_state_wait);
+
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
+			 dd->ipath_rcvctrl);
+
+	spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+	/* Enable PIO send, and update of PIOavail regs to memory. */
+	dd->ipath_sendctrl = INFINIPATH_S_PIOENABLE |
+		INFINIPATH_S_PIOBUFAVAILUPD;
+
+	/*
+	 * Set the PIO avail update threshold to host memory
+	 * on chips that support it.
+	 */
+	if (dd->ipath_pioupd_thresh)
+		dd->ipath_sendctrl |= dd->ipath_pioupd_thresh
+			<< INFINIPATH_S_UPDTHRESH_SHIFT;
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl);
+	ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+	spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+
+	/*
+	 * Enable kernel ports' receive and receive interrupt.
+	 * Other ports done as user opens and inits them.
+	 */
+	rcvmask = 1ULL;
+	dd->ipath_rcvctrl |= (rcvmask << dd->ipath_r_portenable_shift) |
+		(rcvmask << dd->ipath_r_intravail_shift);
+	if (!(dd->ipath_flags & IPATH_NODMA_RTAIL))
+		dd->ipath_rcvctrl |= (1ULL << dd->ipath_r_tailupd_shift);
+
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
+			 dd->ipath_rcvctrl);
+
+	/*
+	 * now ready for use.  this should be cleared whenever we
+	 * detect a reset, or initiate one.
+	 */
+	dd->ipath_flags |= IPATH_INITTED;
+
+	/*
+	 * Init our shadow copies of head from tail values,
+	 * and write head values to match.
+	 */
+	val = ipath_read_ureg32(dd, ur_rcvegrindextail, 0);
+	ipath_write_ureg(dd, ur_rcvegrindexhead, val, 0);
+
+	/* Initialize so we interrupt on next packet received */
+	ipath_write_ureg(dd, ur_rcvhdrhead,
+			 dd->ipath_rhdrhead_intr_off |
+			 dd->ipath_pd[0]->port_head, 0);
+
+	/*
+	 * by now pioavail updates to memory should have occurred, so
+	 * copy them into our working/shadow registers; this is in
+	 * case something went wrong with abort, but mostly to get the
+	 * initial values of the generation bit correct.
+	 */
+	for (i = 0; i < dd->ipath_pioavregs; i++) {
+		__le64 pioavail;
+
+		/*
+		 * Chip Errata bug 6641; even and odd qwords>3 are swapped.
+		 */
+		if (i > 3 && (dd->ipath_flags & IPATH_SWAP_PIOBUFS))
+			pioavail = dd->ipath_pioavailregs_dma[i ^ 1];
+		else
+			pioavail = dd->ipath_pioavailregs_dma[i];
+		/*
+		 * don't need to worry about ipath_pioavailkernel here
+		 * because we will call ipath_chg_pioavailkernel() later
+		 * in initialization, to busy out buffers as needed
+		 */
+		dd->ipath_pioavailshadow[i] = le64_to_cpu(pioavail);
+	}
+	/* can get counters, stats, etc. */
+	dd->ipath_flags |= IPATH_PRESENT;
+}
+
+static int init_housekeeping(struct ipath_devdata *dd, int reinit)
+{
+	char boardn[40];
+	int ret = 0;
+
+	/*
+	 * have to clear shadow copies of registers at init that are
+	 * not otherwise set here, or all kinds of bizarre things
+	 * happen with driver on chip reset
+	 */
+	dd->ipath_rcvhdrsize = 0;
+
+	/*
+	 * Don't clear ipath_flags as 8bit mode was set before
+	 * entering this func. However, we do set the linkstate to
+	 * unknown, so we can watch for a transition.
+	 * PRESENT is set because we want register reads to work,
+	 * and the kernel infrastructure saw it in config space;
+	 * We clear it if we have failures.
+	 */
+	dd->ipath_flags |= IPATH_LINKUNK | IPATH_PRESENT;
+	dd->ipath_flags &= ~(IPATH_LINKACTIVE | IPATH_LINKARMED |
+			     IPATH_LINKDOWN | IPATH_LINKINIT);
+
+	ipath_cdbg(VERBOSE, "Try to read spc chip revision\n");
+	dd->ipath_revision =
+		ipath_read_kreg64(dd, dd->ipath_kregs->kr_revision);
+
+	/*
+	 * set up fundamental info we need to use the chip; we assume
+	 * if the revision reg and these regs are OK, we don't need to
+	 * special case the rest
+	 */
+	dd->ipath_sregbase =
+		ipath_read_kreg32(dd, dd->ipath_kregs->kr_sendregbase);
+	dd->ipath_cregbase =
+		ipath_read_kreg32(dd, dd->ipath_kregs->kr_counterregbase);
+	dd->ipath_uregbase =
+		ipath_read_kreg32(dd, dd->ipath_kregs->kr_userregbase);
+	ipath_cdbg(VERBOSE, "ipath_kregbase %p, sendbase %x usrbase %x, "
+		   "cntrbase %x\n", dd->ipath_kregbase, dd->ipath_sregbase,
+		   dd->ipath_uregbase, dd->ipath_cregbase);
+	if ((dd->ipath_revision & 0xffffffff) == 0xffffffff
+	    || (dd->ipath_sregbase & 0xffffffff) == 0xffffffff
+	    || (dd->ipath_cregbase & 0xffffffff) == 0xffffffff
+	    || (dd->ipath_uregbase & 0xffffffff) == 0xffffffff) {
+		ipath_dev_err(dd, "Register read failures from chip, "
+			      "giving up initialization\n");
+		dd->ipath_flags &= ~IPATH_PRESENT;
+		ret = -ENODEV;
+		goto done;
+	}
+
+
+	/* clear diagctrl register, in case diags were running and crashed */
+	ipath_write_kreg (dd, dd->ipath_kregs->kr_hwdiagctrl, 0);
+
+	/* clear the initial reset flag, in case first driver load */
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear,
+			 INFINIPATH_E_RESET);
+
+	ipath_cdbg(VERBOSE, "Revision %llx (PCI %x)\n",
+		   (unsigned long long) dd->ipath_revision,
+		   dd->ipath_pcirev);
+
+	if (((dd->ipath_revision >> INFINIPATH_R_SOFTWARE_SHIFT) &
+	     INFINIPATH_R_SOFTWARE_MASK) != IPATH_CHIP_SWVERSION) {
+		ipath_dev_err(dd, "Driver only handles version %d, "
+			      "chip swversion is %d (%llx), failng\n",
+			      IPATH_CHIP_SWVERSION,
+			      (int)(dd->ipath_revision >>
+				    INFINIPATH_R_SOFTWARE_SHIFT) &
+			      INFINIPATH_R_SOFTWARE_MASK,
+			      (unsigned long long) dd->ipath_revision);
+		ret = -ENOSYS;
+		goto done;
+	}
+	dd->ipath_majrev = (u8) ((dd->ipath_revision >>
+				  INFINIPATH_R_CHIPREVMAJOR_SHIFT) &
+				 INFINIPATH_R_CHIPREVMAJOR_MASK);
+	dd->ipath_minrev = (u8) ((dd->ipath_revision >>
+				  INFINIPATH_R_CHIPREVMINOR_SHIFT) &
+				 INFINIPATH_R_CHIPREVMINOR_MASK);
+	dd->ipath_boardrev = (u8) ((dd->ipath_revision >>
+				    INFINIPATH_R_BOARDID_SHIFT) &
+				   INFINIPATH_R_BOARDID_MASK);
+
+	ret = dd->ipath_f_get_boardname(dd, boardn, sizeof boardn);
+
+	snprintf(dd->ipath_boardversion, sizeof(dd->ipath_boardversion),
+		 "ChipABI %u.%u, %s, InfiniPath%u %u.%u, PCI %u, "
+		 "SW Compat %u\n",
+		 IPATH_CHIP_VERS_MAJ, IPATH_CHIP_VERS_MIN, boardn,
+		 (unsigned)(dd->ipath_revision >> INFINIPATH_R_ARCH_SHIFT) &
+		 INFINIPATH_R_ARCH_MASK,
+		 dd->ipath_majrev, dd->ipath_minrev, dd->ipath_pcirev,
+		 (unsigned)(dd->ipath_revision >>
+			    INFINIPATH_R_SOFTWARE_SHIFT) &
+		 INFINIPATH_R_SOFTWARE_MASK);
+
+	ipath_dbg("%s", dd->ipath_boardversion);
+
+	if (ret)
+		goto done;
+
+	if (reinit)
+		ret = init_chip_reset(dd);
+	else
+		ret = init_chip_first(dd);
+
+done:
+	return ret;
+}
+
+static void verify_interrupt(unsigned long opaque)
+{
+	struct ipath_devdata *dd = (struct ipath_devdata *) opaque;
+
+	if (!dd)
+		return; /* being torn down */
+
+	/*
+	 * If we don't have any interrupts, let the user know and
+	 * don't bother checking again.
+	 */
+	if (dd->ipath_int_counter == 0) {
+		if (!dd->ipath_f_intr_fallback(dd))
+			dev_err(&dd->pcidev->dev, "No interrupts detected, "
+				"not usable.\n");
+		else /* re-arm the timer to see if fallback works */
+			mod_timer(&dd->ipath_intrchk_timer, jiffies + HZ/2);
+	} else
+		ipath_cdbg(VERBOSE, "%u interrupts at timer check\n",
+			dd->ipath_int_counter);
+}
+
+/**
+ * ipath_init_chip - do the actual initialization sequence on the chip
+ * @dd: the infinipath device
+ * @reinit: reinitializing, so don't allocate new memory
+ *
+ * Do the actual initialization sequence on the chip.  This is done
+ * both from the init routine called from the PCI infrastructure, and
+ * when we reset the chip, or detect that it was reset internally,
+ * or it's administratively re-enabled.
+ *
+ * Memory allocation here and in called routines is only done in
+ * the first case (reinit == 0).  We have to be careful, because even
+ * without memory allocation, we need to re-write all the chip registers
+ * TIDs, etc. after the reset or enable has completed.
+ */
+int ipath_init_chip(struct ipath_devdata *dd, int reinit)
+{
+	int ret = 0;
+	u32 kpiobufs, defkbufs;
+	u32 piobufs, uports;
+	u64 val;
+	struct ipath_portdata *pd;
+	gfp_t gfp_flags = GFP_USER | __GFP_COMP;
+
+	ret = init_housekeeping(dd, reinit);
+	if (ret)
+		goto done;
+
+	/*
+	 * We could bump this to allow for full rcvegrcnt + rcvtidcnt,
+	 * but then it no longer nicely fits power of two, and since
+	 * we now use routines that backend onto __get_free_pages, the
+	 * rest would be wasted.
+	 */
+	dd->ipath_rcvhdrcnt = max(dd->ipath_p0_rcvegrcnt, dd->ipath_rcvegrcnt);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvhdrcnt,
+			 dd->ipath_rcvhdrcnt);
+
+	/*
+	 * Set up the shadow copies of the piobufavail registers,
+	 * which we compare against the chip registers for now, and
+	 * the in memory DMA'ed copies of the registers.  This has to
+	 * be done early, before we calculate lastport, etc.
+	 */
+	piobufs = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k;
+	/*
+	 * calc number of pioavail registers, and save it; we have 2
+	 * bits per buffer.
+	 */
+	dd->ipath_pioavregs = ALIGN(piobufs, sizeof(u64) * BITS_PER_BYTE / 2)
+		/ (sizeof(u64) * BITS_PER_BYTE / 2);
+	uports = dd->ipath_cfgports ? dd->ipath_cfgports - 1 : 0;
+	if (piobufs > 144)
+		defkbufs = 32 + dd->ipath_pioreserved;
+	else
+		defkbufs = 16 + dd->ipath_pioreserved;
+
+	if (ipath_kpiobufs && (ipath_kpiobufs +
+		(uports * IPATH_MIN_USER_PORT_BUFCNT)) > piobufs) {
+		int i = (int) piobufs -
+			(int) (uports * IPATH_MIN_USER_PORT_BUFCNT);
+		if (i < 1)
+			i = 1;
+		dev_info(&dd->pcidev->dev, "Allocating %d PIO bufs of "
+			 "%d for kernel leaves too few for %d user ports "
+			 "(%d each); using %u\n", ipath_kpiobufs,
+			 piobufs, uports, IPATH_MIN_USER_PORT_BUFCNT, i);
+		/*
+		 * shouldn't change ipath_kpiobufs, because could be
+		 * different for different devices...
+		 */
+		kpiobufs = i;
+	} else if (ipath_kpiobufs)
+		kpiobufs = ipath_kpiobufs;
+	else
+		kpiobufs = defkbufs;
+	dd->ipath_lastport_piobuf = piobufs - kpiobufs;
+	dd->ipath_pbufsport =
+		uports ? dd->ipath_lastport_piobuf / uports : 0;
+	/* if not an even divisor, some user ports get extra buffers */
+	dd->ipath_ports_extrabuf = dd->ipath_lastport_piobuf -
+		(dd->ipath_pbufsport * uports);
+	if (dd->ipath_ports_extrabuf)
+		ipath_dbg("%u pbufs/port leaves some unused, add 1 buffer to "
+			"ports <= %u\n", dd->ipath_pbufsport,
+			dd->ipath_ports_extrabuf);
+	dd->ipath_lastpioindex = 0;
+	dd->ipath_lastpioindexl = dd->ipath_piobcnt2k;
+	/* ipath_pioavailshadow initialized earlier */
+	ipath_cdbg(VERBOSE, "%d PIO bufs for kernel out of %d total %u "
+		   "each for %u user ports\n", kpiobufs,
+		   piobufs, dd->ipath_pbufsport, uports);
+	ret = dd->ipath_f_early_init(dd);
+	if (ret) {
+		ipath_dev_err(dd, "Early initialization failure\n");
+		goto done;
+	}
+
+	/*
+	 * Early_init sets rcvhdrentsize and rcvhdrsize, so this must be
+	 * done after early_init.
+	 */
+	dd->ipath_hdrqlast =
+		dd->ipath_rcvhdrentsize * (dd->ipath_rcvhdrcnt - 1);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvhdrentsize,
+			 dd->ipath_rcvhdrentsize);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvhdrsize,
+			 dd->ipath_rcvhdrsize);
+
+	if (!reinit) {
+		ret = init_pioavailregs(dd);
+		init_shadow_tids(dd);
+		if (ret)
+			goto done;
+	}
+
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_sendpioavailaddr,
+			 dd->ipath_pioavailregs_phys);
+
+	/*
+	 * this is to detect s/w errors, which the h/w works around by
+	 * ignoring the low 6 bits of address, if it wasn't aligned.
+	 */
+	val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_sendpioavailaddr);
+	if (val != dd->ipath_pioavailregs_phys) {
+		ipath_dev_err(dd, "Catastrophic software error, "
+			      "SendPIOAvailAddr written as %lx, "
+			      "read back as %llx\n",
+			      (unsigned long) dd->ipath_pioavailregs_phys,
+			      (unsigned long long) val);
+		ret = -EINVAL;
+		goto done;
+	}
+
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvbthqp, IPATH_KD_QP);
+
+	/*
+	 * make sure we are not in freeze, and PIO send enabled, so
+	 * writes to pbc happen
+	 */
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask, 0ULL);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear,
+			 ~0ULL&~INFINIPATH_HWE_MEMBISTFAILED);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_control, 0ULL);
+
+	/*
+	 * before error clears, since we expect serdes pll errors during
+	 * this, the first time after reset
+	 */
+	if (bringup_link(dd)) {
+		dev_info(&dd->pcidev->dev, "Failed to bringup IB link\n");
+		ret = -ENETDOWN;
+		goto done;
+	}
+
+	/*
+	 * clear any "expected" hwerrs from reset and/or initialization
+	 * clear any that aren't enabled (at least this once), and then
+	 * set the enable mask
+	 */
+	dd->ipath_f_init_hwerrors(dd);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear,
+			 ~0ULL&~INFINIPATH_HWE_MEMBISTFAILED);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask,
+			 dd->ipath_hwerrmask);
+
+	/* clear all */
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, -1LL);
+	/* enable errors that are masked, at least this first time. */
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
+			 ~dd->ipath_maskederrs);
+	dd->ipath_maskederrs = 0; /* don't re-enable ignored in timer */
+	dd->ipath_errormask =
+		ipath_read_kreg64(dd, dd->ipath_kregs->kr_errormask);
+	/* clear any interrupts up to this point (ints still not enabled) */
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, -1LL);
+
+	dd->ipath_f_tidtemplate(dd);
+
+	/*
+	 * Set up the port 0 (kernel) rcvhdr q and egr TIDs.  If doing
+	 * re-init, the simplest way to handle this is to free
+	 * existing, and re-allocate.
+	 * Need to re-create rest of port 0 portdata as well.
+	 */
+	pd = dd->ipath_pd[0];
+	if (reinit) {
+		struct ipath_portdata *npd;
+
+		/*
+		 * Alloc and init new ipath_portdata for port0,
+		 * Then free old pd. Could lead to fragmentation, but also
+		 * makes later support for hot-swap easier.
+		 */
+		npd = create_portdata0(dd);
+		if (npd) {
+			ipath_free_pddata(dd, pd);
+			dd->ipath_pd[0] = npd;
+			pd = npd;
+		} else {
+			ipath_dev_err(dd, "Unable to allocate portdata"
+				      " for port 0, failing\n");
+			ret = -ENOMEM;
+			goto done;
+		}
+	}
+	ret = ipath_create_rcvhdrq(dd, pd);
+	if (!ret)
+		ret = create_port0_egr(dd);
+	if (ret) {
+		ipath_dev_err(dd, "failed to allocate kernel port's "
+			      "rcvhdrq and/or egr bufs\n");
+		goto done;
+	}
+	else
+		enable_chip(dd, reinit);
+
+	/* after enable_chip, so pioavailshadow setup */
+	ipath_chg_pioavailkernel(dd, 0, piobufs, 1);
+
+	/*
+	 * Cancel any possible active sends from early driver load.
+	 * Follows early_init because some chips have to initialize
+	 * PIO buffers in early_init to avoid false parity errors.
+	 * After enable and ipath_chg_pioavailkernel so we can safely
+	 * enable pioavail updates and PIOENABLE; packets are now
+	 * ready to go out.
+	 */
+	ipath_cancel_sends(dd, 1);
+
+	if (!reinit) {
+		/*
+		 * Used when we close a port, for DMA already in flight
+		 * at close.
+		 */
+		dd->ipath_dummy_hdrq = dma_alloc_coherent(
+			&dd->pcidev->dev, dd->ipath_pd[0]->port_rcvhdrq_size,
+			&dd->ipath_dummy_hdrq_phys,
+			gfp_flags);
+		if (!dd->ipath_dummy_hdrq) {
+			dev_info(&dd->pcidev->dev,
+				"Couldn't allocate 0x%lx bytes for dummy hdrq\n",
+				dd->ipath_pd[0]->port_rcvhdrq_size);
+			/* fallback to just 0'ing */
+			dd->ipath_dummy_hdrq_phys = 0UL;
+		}
+	}
+
+	/*
+	 * cause retrigger of pending interrupts ignored during init,
+	 * even if we had errors
+	 */
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, 0ULL);
+
+	if (!dd->ipath_stats_timer_active) {
+		/*
+		 * first init, or after an admin disable/enable
+		 * set up stats retrieval timer, even if we had errors
+		 * in last portion of setup
+		 */
+		init_timer(&dd->ipath_stats_timer);
+		dd->ipath_stats_timer.function = ipath_get_faststats;
+		dd->ipath_stats_timer.data = (unsigned long) dd;
+		/* every 5 seconds; */
+		dd->ipath_stats_timer.expires = jiffies + 5 * HZ;
+		/* takes ~16 seconds to overflow at full IB 4x bandwdith */
+		add_timer(&dd->ipath_stats_timer);
+		dd->ipath_stats_timer_active = 1;
+	}
+
+	/* Set up SendDMA if chip supports it */
+	if (dd->ipath_flags & IPATH_HAS_SEND_DMA)
+		ret = setup_sdma(dd);
+
+	/* Set up HoL state */
+	init_timer(&dd->ipath_hol_timer);
+	dd->ipath_hol_timer.function = ipath_hol_event;
+	dd->ipath_hol_timer.data = (unsigned long)dd;
+	dd->ipath_hol_state = IPATH_HOL_UP;
+
+done:
+	if (!ret) {
+		*dd->ipath_statusp |= IPATH_STATUS_CHIP_PRESENT;
+		if (!dd->ipath_f_intrsetup(dd)) {
+			/* now we can enable all interrupts from the chip */
+			ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask,
+					 -1LL);
+			/* force re-interrupt of any pending interrupts. */
+			ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear,
+					 0ULL);
+			/* chip is usable; mark it as initialized */
+			*dd->ipath_statusp |= IPATH_STATUS_INITTED;
+
+			/*
+			 * setup to verify we get an interrupt, and fallback
+			 * to an alternate if necessary and possible
+			 */
+			if (!reinit) {
+				init_timer(&dd->ipath_intrchk_timer);
+				dd->ipath_intrchk_timer.function =
+					verify_interrupt;
+				dd->ipath_intrchk_timer.data =
+					(unsigned long) dd;
+			}
+			dd->ipath_intrchk_timer.expires = jiffies + HZ/2;
+			add_timer(&dd->ipath_intrchk_timer);
+		} else
+			ipath_dev_err(dd, "No interrupts enabled, couldn't "
+				      "setup interrupt address\n");
+
+		if (dd->ipath_cfgports > ipath_stats.sps_nports)
+			/*
+			 * sps_nports is a global, so, we set it to
+			 * the highest number of ports of any of the
+			 * chips we find; we never decrement it, at
+			 * least for now.  Since this might have changed
+			 * over disable/enable or prior to reset, always
+			 * do the check and potentially adjust.
+			 */
+			ipath_stats.sps_nports = dd->ipath_cfgports;
+	} else
+		ipath_dbg("Failed (%d) to initialize chip\n", ret);
+
+	/* if ret is non-zero, we probably should do some cleanup
+	   here... */
+	return ret;
+}
+
+static int ipath_set_kpiobufs(const char *str, struct kernel_param *kp)
+{
+	struct ipath_devdata *dd;
+	unsigned long flags;
+	unsigned short val;
+	int ret;
+
+	ret = ipath_parse_ushort(str, &val);
+
+	spin_lock_irqsave(&ipath_devs_lock, flags);
+
+	if (ret < 0)
+		goto bail;
+
+	if (val == 0) {
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	list_for_each_entry(dd, &ipath_dev_list, ipath_list) {
+		if (dd->ipath_kregbase)
+			continue;
+		if (val > (dd->ipath_piobcnt2k + dd->ipath_piobcnt4k -
+			   (dd->ipath_cfgports *
+			    IPATH_MIN_USER_PORT_BUFCNT)))
+		{
+			ipath_dev_err(
+				dd,
+				"Allocating %d PIO bufs for kernel leaves "
+				"too few for %d user ports (%d each)\n",
+				val, dd->ipath_cfgports - 1,
+				IPATH_MIN_USER_PORT_BUFCNT);
+			ret = -EINVAL;
+			goto bail;
+		}
+		dd->ipath_lastport_piobuf =
+			dd->ipath_piobcnt2k + dd->ipath_piobcnt4k - val;
+	}
+
+	ipath_kpiobufs = val;
+	ret = 0;
+bail:
+	spin_unlock_irqrestore(&ipath_devs_lock, flags);
+
+	return ret;
+}
diff --git a/drivers/infiniband/hw/ipath/ipath_intr.c b/drivers/infiniband/hw/ipath/ipath_intr.c
new file mode 100644
index 0000000..01ba792
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_intr.c
@@ -0,0 +1,1273 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/pci.h>
+#include <linux/delay.h>
+#include <linux/sched.h>
+
+#include "ipath_kernel.h"
+#include "ipath_verbs.h"
+#include "ipath_common.h"
+
+
+/*
+ * Called when we might have an error that is specific to a particular
+ * PIO buffer, and may need to cancel that buffer, so it can be re-used.
+ */
+void ipath_disarm_senderrbufs(struct ipath_devdata *dd)
+{
+	u32 piobcnt;
+	unsigned long sbuf[4];
+	/*
+	 * it's possible that sendbuffererror could have bits set; might
+	 * have already done this as a result of hardware error handling
+	 */
+	piobcnt = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k;
+	/* read these before writing errorclear */
+	sbuf[0] = ipath_read_kreg64(
+		dd, dd->ipath_kregs->kr_sendbuffererror);
+	sbuf[1] = ipath_read_kreg64(
+		dd, dd->ipath_kregs->kr_sendbuffererror + 1);
+	if (piobcnt > 128)
+		sbuf[2] = ipath_read_kreg64(
+			dd, dd->ipath_kregs->kr_sendbuffererror + 2);
+	if (piobcnt > 192)
+		sbuf[3] = ipath_read_kreg64(
+			dd, dd->ipath_kregs->kr_sendbuffererror + 3);
+	else
+		sbuf[3] = 0;
+
+	if (sbuf[0] || sbuf[1] || (piobcnt > 128 && (sbuf[2] || sbuf[3]))) {
+		int i;
+		if (ipath_debug & (__IPATH_PKTDBG|__IPATH_DBG) &&
+			time_after(dd->ipath_lastcancel, jiffies)) {
+			__IPATH_DBG_WHICH(__IPATH_PKTDBG|__IPATH_DBG,
+					  "SendbufErrs %lx %lx", sbuf[0],
+					  sbuf[1]);
+			if (ipath_debug & __IPATH_PKTDBG && piobcnt > 128)
+				printk(" %lx %lx ", sbuf[2], sbuf[3]);
+			printk("\n");
+		}
+
+		for (i = 0; i < piobcnt; i++)
+			if (test_bit(i, sbuf))
+				ipath_disarm_piobufs(dd, i, 1);
+		/* ignore armlaunch errs for a bit */
+		dd->ipath_lastcancel = jiffies+3;
+	}
+}
+
+
+/* These are all rcv-related errors which we want to count for stats */
+#define E_SUM_PKTERRS \
+	(INFINIPATH_E_RHDRLEN | INFINIPATH_E_RBADTID | \
+	 INFINIPATH_E_RBADVERSION | INFINIPATH_E_RHDR | \
+	 INFINIPATH_E_RLONGPKTLEN | INFINIPATH_E_RSHORTPKTLEN | \
+	 INFINIPATH_E_RMAXPKTLEN | INFINIPATH_E_RMINPKTLEN | \
+	 INFINIPATH_E_RFORMATERR | INFINIPATH_E_RUNSUPVL | \
+	 INFINIPATH_E_RUNEXPCHAR | INFINIPATH_E_REBP)
+
+/* These are all send-related errors which we want to count for stats */
+#define E_SUM_ERRS \
+	(INFINIPATH_E_SPIOARMLAUNCH | INFINIPATH_E_SUNEXPERRPKTNUM | \
+	 INFINIPATH_E_SDROPPEDDATAPKT | INFINIPATH_E_SDROPPEDSMPPKT | \
+	 INFINIPATH_E_SMAXPKTLEN | INFINIPATH_E_SUNSUPVL | \
+	 INFINIPATH_E_SMINPKTLEN | INFINIPATH_E_SPKTLEN | \
+	 INFINIPATH_E_INVALIDADDR)
+
+/*
+ * this is similar to E_SUM_ERRS, but can't ignore armlaunch, don't ignore
+ * errors not related to freeze and cancelling buffers.  Can't ignore
+ * armlaunch because could get more while still cleaning up, and need
+ * to cancel those as they happen.
+ */
+#define E_SPKT_ERRS_IGNORE \
+	 (INFINIPATH_E_SDROPPEDDATAPKT | INFINIPATH_E_SDROPPEDSMPPKT | \
+	 INFINIPATH_E_SMAXPKTLEN | INFINIPATH_E_SMINPKTLEN | \
+	 INFINIPATH_E_SPKTLEN)
+
+/*
+ * these are errors that can occur when the link changes state while
+ * a packet is being sent or received.  This doesn't cover things
+ * like EBP or VCRC that can be the result of a sending having the
+ * link change state, so we receive a "known bad" packet.
+ */
+#define E_SUM_LINK_PKTERRS \
+	(INFINIPATH_E_SDROPPEDDATAPKT | INFINIPATH_E_SDROPPEDSMPPKT | \
+	 INFINIPATH_E_SMINPKTLEN | INFINIPATH_E_SPKTLEN | \
+	 INFINIPATH_E_RSHORTPKTLEN | INFINIPATH_E_RMINPKTLEN | \
+	 INFINIPATH_E_RUNEXPCHAR)
+
+static u64 handle_e_sum_errs(struct ipath_devdata *dd, ipath_err_t errs)
+{
+	u64 ignore_this_time = 0;
+
+	ipath_disarm_senderrbufs(dd);
+	if ((errs & E_SUM_LINK_PKTERRS) &&
+	    !(dd->ipath_flags & IPATH_LINKACTIVE)) {
+		/*
+		 * This can happen when SMA is trying to bring the link
+		 * up, but the IB link changes state at the "wrong" time.
+		 * The IB logic then complains that the packet isn't
+		 * valid.  We don't want to confuse people, so we just
+		 * don't print them, except at debug
+		 */
+		ipath_dbg("Ignoring packet errors %llx, because link not "
+			  "ACTIVE\n", (unsigned long long) errs);
+		ignore_this_time = errs & E_SUM_LINK_PKTERRS;
+	}
+
+	return ignore_this_time;
+}
+
+/* generic hw error messages... */
+#define INFINIPATH_HWE_TXEMEMPARITYERR_MSG(a) \
+	{ \
+		.mask = ( INFINIPATH_HWE_TXEMEMPARITYERR_##a <<    \
+			  INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT ),   \
+		.msg = "TXE " #a " Memory Parity"	     \
+	}
+#define INFINIPATH_HWE_RXEMEMPARITYERR_MSG(a) \
+	{ \
+		.mask = ( INFINIPATH_HWE_RXEMEMPARITYERR_##a <<    \
+			  INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT ),   \
+		.msg = "RXE " #a " Memory Parity"	     \
+	}
+
+static const struct ipath_hwerror_msgs ipath_generic_hwerror_msgs[] = {
+	INFINIPATH_HWE_MSG(IBCBUSFRSPCPARITYERR, "IPATH2IB Parity"),
+	INFINIPATH_HWE_MSG(IBCBUSTOSPCPARITYERR, "IB2IPATH Parity"),
+
+	INFINIPATH_HWE_TXEMEMPARITYERR_MSG(PIOBUF),
+	INFINIPATH_HWE_TXEMEMPARITYERR_MSG(PIOPBC),
+	INFINIPATH_HWE_TXEMEMPARITYERR_MSG(PIOLAUNCHFIFO),
+
+	INFINIPATH_HWE_RXEMEMPARITYERR_MSG(RCVBUF),
+	INFINIPATH_HWE_RXEMEMPARITYERR_MSG(LOOKUPQ),
+	INFINIPATH_HWE_RXEMEMPARITYERR_MSG(EAGERTID),
+	INFINIPATH_HWE_RXEMEMPARITYERR_MSG(EXPTID),
+	INFINIPATH_HWE_RXEMEMPARITYERR_MSG(FLAGBUF),
+	INFINIPATH_HWE_RXEMEMPARITYERR_MSG(DATAINFO),
+	INFINIPATH_HWE_RXEMEMPARITYERR_MSG(HDRINFO),
+};
+
+/**
+ * ipath_format_hwmsg - format a single hwerror message
+ * @msg message buffer
+ * @msgl length of message buffer
+ * @hwmsg message to add to message buffer
+ */
+static void ipath_format_hwmsg(char *msg, size_t msgl, const char *hwmsg)
+{
+	strlcat(msg, "[", msgl);
+	strlcat(msg, hwmsg, msgl);
+	strlcat(msg, "]", msgl);
+}
+
+/**
+ * ipath_format_hwerrors - format hardware error messages for display
+ * @hwerrs hardware errors bit vector
+ * @hwerrmsgs hardware error descriptions
+ * @nhwerrmsgs number of hwerrmsgs
+ * @msg message buffer
+ * @msgl message buffer length
+ */
+void ipath_format_hwerrors(u64 hwerrs,
+			   const struct ipath_hwerror_msgs *hwerrmsgs,
+			   size_t nhwerrmsgs,
+			   char *msg, size_t msgl)
+{
+	int i;
+	const int glen =
+	    ARRAY_SIZE(ipath_generic_hwerror_msgs);
+
+	for (i=0; i<glen; i++) {
+		if (hwerrs & ipath_generic_hwerror_msgs[i].mask) {
+			ipath_format_hwmsg(msg, msgl,
+					   ipath_generic_hwerror_msgs[i].msg);
+		}
+	}
+
+	for (i=0; i<nhwerrmsgs; i++) {
+		if (hwerrs & hwerrmsgs[i].mask) {
+			ipath_format_hwmsg(msg, msgl, hwerrmsgs[i].msg);
+		}
+	}
+}
+
+/* return the strings for the most common link states */
+static char *ib_linkstate(struct ipath_devdata *dd, u64 ibcs)
+{
+	char *ret;
+	u32 state;
+
+	state = ipath_ib_state(dd, ibcs);
+	if (state == dd->ib_init)
+		ret = "Init";
+	else if (state == dd->ib_arm)
+		ret = "Arm";
+	else if (state == dd->ib_active)
+		ret = "Active";
+	else
+		ret = "Down";
+	return ret;
+}
+
+void signal_ib_event(struct ipath_devdata *dd, enum ib_event_type ev)
+{
+	struct ib_event event;
+
+	event.device = &dd->verbs_dev->ibdev;
+	event.element.port_num = 1;
+	event.event = ev;
+	ib_dispatch_event(&event);
+}
+
+static void handle_e_ibstatuschanged(struct ipath_devdata *dd,
+				     ipath_err_t errs)
+{
+	u32 ltstate, lstate, ibstate, lastlstate;
+	u32 init = dd->ib_init;
+	u32 arm = dd->ib_arm;
+	u32 active = dd->ib_active;
+	const u64 ibcs = ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus);
+
+	lstate = ipath_ib_linkstate(dd, ibcs); /* linkstate */
+	ibstate = ipath_ib_state(dd, ibcs);
+	/* linkstate at last interrupt */
+	lastlstate = ipath_ib_linkstate(dd, dd->ipath_lastibcstat);
+	ltstate = ipath_ib_linktrstate(dd, ibcs); /* linktrainingtate */
+
+	/*
+	 * Since going into a recovery state causes the link state to go
+	 * down and since recovery is transitory, it is better if we "miss"
+	 * ever seeing the link training state go into recovery (i.e.,
+	 * ignore this transition for link state special handling purposes)
+	 * without even updating ipath_lastibcstat.
+	 */
+	if ((ltstate == INFINIPATH_IBCS_LT_STATE_RECOVERRETRAIN) ||
+	    (ltstate == INFINIPATH_IBCS_LT_STATE_RECOVERWAITRMT) ||
+	    (ltstate == INFINIPATH_IBCS_LT_STATE_RECOVERIDLE))
+		goto done;
+
+	/*
+	 * if linkstate transitions into INIT from any of the various down
+	 * states, or if it transitions from any of the up (INIT or better)
+	 * states into any of the down states (except link recovery), then
+	 * call the chip-specific code to take appropriate actions.
+	 */
+	if (lstate >= INFINIPATH_IBCS_L_STATE_INIT &&
+		lastlstate == INFINIPATH_IBCS_L_STATE_DOWN) {
+		/* transitioned to UP */
+		if (dd->ipath_f_ib_updown(dd, 1, ibcs)) {
+			/* link came up, so we must no longer be disabled */
+			dd->ipath_flags &= ~IPATH_IB_LINK_DISABLED;
+			ipath_cdbg(LINKVERB, "LinkUp handled, skipped\n");
+			goto skip_ibchange; /* chip-code handled */
+		}
+	} else if ((lastlstate >= INFINIPATH_IBCS_L_STATE_INIT ||
+		(dd->ipath_flags & IPATH_IB_FORCE_NOTIFY)) &&
+		ltstate <= INFINIPATH_IBCS_LT_STATE_CFGWAITRMT &&
+		ltstate != INFINIPATH_IBCS_LT_STATE_LINKUP) {
+		int handled;
+		handled = dd->ipath_f_ib_updown(dd, 0, ibcs);
+		dd->ipath_flags &= ~IPATH_IB_FORCE_NOTIFY;
+		if (handled) {
+			ipath_cdbg(LINKVERB, "LinkDown handled, skipped\n");
+			goto skip_ibchange; /* chip-code handled */
+		}
+	}
+
+	/*
+	 * Significant enough to always print and get into logs, if it was
+	 * unexpected.  If it was a requested state change, we'll have
+	 * already cleared the flags, so we won't print this warning
+	 */
+	if ((ibstate != arm && ibstate != active) &&
+	    (dd->ipath_flags & (IPATH_LINKARMED | IPATH_LINKACTIVE))) {
+		dev_info(&dd->pcidev->dev, "Link state changed from %s "
+			 "to %s\n", (dd->ipath_flags & IPATH_LINKARMED) ?
+			 "ARM" : "ACTIVE", ib_linkstate(dd, ibcs));
+	}
+
+	if (ltstate == INFINIPATH_IBCS_LT_STATE_POLLACTIVE ||
+	    ltstate == INFINIPATH_IBCS_LT_STATE_POLLQUIET) {
+		u32 lastlts;
+		lastlts = ipath_ib_linktrstate(dd, dd->ipath_lastibcstat);
+		/*
+		 * Ignore cycling back and forth from Polling.Active to
+		 * Polling.Quiet while waiting for the other end of the link
+		 * to come up, except to try and decide if we are connected
+		 * to a live IB device or not.  We will cycle back and
+		 * forth between them if no cable is plugged in, the other
+		 * device is powered off or disabled, etc.
+		 */
+		if (lastlts == INFINIPATH_IBCS_LT_STATE_POLLACTIVE ||
+		    lastlts == INFINIPATH_IBCS_LT_STATE_POLLQUIET) {
+			if (!(dd->ipath_flags & IPATH_IB_AUTONEG_INPROG) &&
+			     (++dd->ipath_ibpollcnt == 40)) {
+				dd->ipath_flags |= IPATH_NOCABLE;
+				*dd->ipath_statusp |=
+					IPATH_STATUS_IB_NOCABLE;
+				ipath_cdbg(LINKVERB, "Set NOCABLE\n");
+			}
+			ipath_cdbg(LINKVERB, "POLL change to %s (%x)\n",
+				ipath_ibcstatus_str[ltstate], ibstate);
+			goto skip_ibchange;
+		}
+	}
+
+	dd->ipath_ibpollcnt = 0; /* not poll*, now */
+	ipath_stats.sps_iblink++;
+
+	if (ibstate != init && dd->ipath_lastlinkrecov && ipath_linkrecovery) {
+		u64 linkrecov;
+		linkrecov = ipath_snap_cntr(dd,
+			dd->ipath_cregs->cr_iblinkerrrecovcnt);
+		if (linkrecov != dd->ipath_lastlinkrecov) {
+			ipath_dbg("IB linkrecov up %Lx (%s %s) recov %Lu\n",
+				(unsigned long long) ibcs,
+				ib_linkstate(dd, ibcs),
+				ipath_ibcstatus_str[ltstate],
+				(unsigned long long) linkrecov);
+			/* and no more until active again */
+			dd->ipath_lastlinkrecov = 0;
+			ipath_set_linkstate(dd, IPATH_IB_LINKDOWN);
+			goto skip_ibchange;
+		}
+	}
+
+	if (ibstate == init || ibstate == arm || ibstate == active) {
+		*dd->ipath_statusp &= ~IPATH_STATUS_IB_NOCABLE;
+		if (ibstate == init || ibstate == arm) {
+			*dd->ipath_statusp &= ~IPATH_STATUS_IB_READY;
+			if (dd->ipath_flags & IPATH_LINKACTIVE)
+				signal_ib_event(dd, IB_EVENT_PORT_ERR);
+		}
+		if (ibstate == arm) {
+			dd->ipath_flags |= IPATH_LINKARMED;
+			dd->ipath_flags &= ~(IPATH_LINKUNK |
+				IPATH_LINKINIT | IPATH_LINKDOWN |
+				IPATH_LINKACTIVE | IPATH_NOCABLE);
+			ipath_hol_down(dd);
+		} else  if (ibstate == init) {
+			/*
+			 * set INIT and DOWN.  Down is checked by
+			 * most of the other code, but INIT is
+			 * useful to know in a few places.
+			 */
+			dd->ipath_flags |= IPATH_LINKINIT |
+				IPATH_LINKDOWN;
+			dd->ipath_flags &= ~(IPATH_LINKUNK |
+				IPATH_LINKARMED | IPATH_LINKACTIVE |
+				IPATH_NOCABLE);
+			ipath_hol_down(dd);
+		} else {  /* active */
+			dd->ipath_lastlinkrecov = ipath_snap_cntr(dd,
+				dd->ipath_cregs->cr_iblinkerrrecovcnt);
+			*dd->ipath_statusp |=
+				IPATH_STATUS_IB_READY | IPATH_STATUS_IB_CONF;
+			dd->ipath_flags |= IPATH_LINKACTIVE;
+			dd->ipath_flags &= ~(IPATH_LINKUNK | IPATH_LINKINIT
+				| IPATH_LINKDOWN | IPATH_LINKARMED |
+				IPATH_NOCABLE);
+			if (dd->ipath_flags & IPATH_HAS_SEND_DMA)
+				ipath_restart_sdma(dd);
+			signal_ib_event(dd, IB_EVENT_PORT_ACTIVE);
+			/* LED active not handled in chip _f_updown */
+			dd->ipath_f_setextled(dd, lstate, ltstate);
+			ipath_hol_up(dd);
+		}
+
+		/*
+		 * print after we've already done the work, so as not to
+		 * delay the state changes and notifications, for debugging
+		 */
+		if (lstate == lastlstate)
+			ipath_cdbg(LINKVERB, "Unchanged from last: %s "
+				"(%x)\n", ib_linkstate(dd, ibcs), ibstate);
+		else
+			ipath_cdbg(VERBOSE, "Unit %u: link up to %s %s (%x)\n",
+				  dd->ipath_unit, ib_linkstate(dd, ibcs),
+				  ipath_ibcstatus_str[ltstate],  ibstate);
+	} else { /* down */
+		if (dd->ipath_flags & IPATH_LINKACTIVE)
+			signal_ib_event(dd, IB_EVENT_PORT_ERR);
+		dd->ipath_flags |= IPATH_LINKDOWN;
+		dd->ipath_flags &= ~(IPATH_LINKUNK | IPATH_LINKINIT
+				     | IPATH_LINKACTIVE |
+				     IPATH_LINKARMED);
+		*dd->ipath_statusp &= ~IPATH_STATUS_IB_READY;
+		dd->ipath_lli_counter = 0;
+
+		if (lastlstate != INFINIPATH_IBCS_L_STATE_DOWN)
+			ipath_cdbg(VERBOSE, "Unit %u link state down "
+				   "(state 0x%x), from %s\n",
+				   dd->ipath_unit, lstate,
+				   ib_linkstate(dd, dd->ipath_lastibcstat));
+		else
+			ipath_cdbg(LINKVERB, "Unit %u link state changed "
+				   "to %s (0x%x) from down (%x)\n",
+				   dd->ipath_unit,
+				   ipath_ibcstatus_str[ltstate],
+				   ibstate, lastlstate);
+	}
+
+skip_ibchange:
+	dd->ipath_lastibcstat = ibcs;
+done:
+	return;
+}
+
+static void handle_supp_msgs(struct ipath_devdata *dd,
+			     unsigned supp_msgs, char *msg, u32 msgsz)
+{
+	/*
+	 * Print the message unless it's ibc status change only, which
+	 * happens so often we never want to count it.
+	 */
+	if (dd->ipath_lasterror & ~INFINIPATH_E_IBSTATUSCHANGED) {
+		int iserr;
+		ipath_err_t mask;
+		iserr = ipath_decode_err(dd, msg, msgsz,
+					 dd->ipath_lasterror &
+					 ~INFINIPATH_E_IBSTATUSCHANGED);
+
+		mask = INFINIPATH_E_RRCVEGRFULL | INFINIPATH_E_RRCVHDRFULL |
+			INFINIPATH_E_PKTERRS | INFINIPATH_E_SDMADISABLED;
+
+		/* if we're in debug, then don't mask SDMADISABLED msgs */
+		if (ipath_debug & __IPATH_DBG)
+			mask &= ~INFINIPATH_E_SDMADISABLED;
+
+		if (dd->ipath_lasterror & ~mask)
+			ipath_dev_err(dd, "Suppressed %u messages for "
+				      "fast-repeating errors (%s) (%llx)\n",
+				      supp_msgs, msg,
+				      (unsigned long long)
+				      dd->ipath_lasterror);
+		else {
+			/*
+			 * rcvegrfull and rcvhdrqfull are "normal", for some
+			 * types of processes (mostly benchmarks) that send
+			 * huge numbers of messages, while not processing
+			 * them. So only complain about these at debug
+			 * level.
+			 */
+			if (iserr)
+				ipath_dbg("Suppressed %u messages for %s\n",
+					  supp_msgs, msg);
+			else
+				ipath_cdbg(ERRPKT,
+					"Suppressed %u messages for %s\n",
+					  supp_msgs, msg);
+		}
+	}
+}
+
+static unsigned handle_frequent_errors(struct ipath_devdata *dd,
+				       ipath_err_t errs, char *msg,
+				       u32 msgsz, int *noprint)
+{
+	unsigned long nc;
+	static unsigned long nextmsg_time;
+	static unsigned nmsgs, supp_msgs;
+
+	/*
+	 * Throttle back "fast" messages to no more than 10 per 5 seconds.
+	 * This isn't perfect, but it's a reasonable heuristic. If we get
+	 * more than 10, give a 6x longer delay.
+	 */
+	nc = jiffies;
+	if (nmsgs > 10) {
+		if (time_before(nc, nextmsg_time)) {
+			*noprint = 1;
+			if (!supp_msgs++)
+				nextmsg_time = nc + HZ * 3;
+		}
+		else if (supp_msgs) {
+			handle_supp_msgs(dd, supp_msgs, msg, msgsz);
+			supp_msgs = 0;
+			nmsgs = 0;
+		}
+	}
+	else if (!nmsgs++ || time_after(nc, nextmsg_time))
+		nextmsg_time = nc + HZ / 2;
+
+	return supp_msgs;
+}
+
+static void handle_sdma_errors(struct ipath_devdata *dd, ipath_err_t errs)
+{
+	unsigned long flags;
+	int expected;
+
+	if (ipath_debug & __IPATH_DBG) {
+		char msg[128];
+		ipath_decode_err(dd, msg, sizeof msg, errs &
+			INFINIPATH_E_SDMAERRS);
+		ipath_dbg("errors %lx (%s)\n", (unsigned long)errs, msg);
+	}
+	if (ipath_debug & __IPATH_VERBDBG) {
+		unsigned long tl, hd, status, lengen;
+		tl = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmatail);
+		hd = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmahead);
+		status = ipath_read_kreg64(dd
+			, dd->ipath_kregs->kr_senddmastatus);
+		lengen = ipath_read_kreg64(dd,
+			dd->ipath_kregs->kr_senddmalengen);
+		ipath_cdbg(VERBOSE, "sdma tl 0x%lx hd 0x%lx status 0x%lx "
+			"lengen 0x%lx\n", tl, hd, status, lengen);
+	}
+
+	spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
+	__set_bit(IPATH_SDMA_DISABLED, &dd->ipath_sdma_status);
+	expected = test_bit(IPATH_SDMA_ABORTING, &dd->ipath_sdma_status);
+	spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+	if (!expected)
+		ipath_cancel_sends(dd, 1);
+}
+
+static void handle_sdma_intr(struct ipath_devdata *dd, u64 istat)
+{
+	unsigned long flags;
+	int expected;
+
+	if ((istat & INFINIPATH_I_SDMAINT) &&
+	    !test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status))
+		ipath_sdma_intr(dd);
+
+	if (istat & INFINIPATH_I_SDMADISABLED) {
+		expected = test_bit(IPATH_SDMA_ABORTING,
+			&dd->ipath_sdma_status);
+		ipath_dbg("%s SDmaDisabled intr\n",
+			expected ? "expected" : "unexpected");
+		spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
+		__set_bit(IPATH_SDMA_DISABLED, &dd->ipath_sdma_status);
+		spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+		if (!expected)
+			ipath_cancel_sends(dd, 1);
+		if (!test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status))
+			tasklet_hi_schedule(&dd->ipath_sdma_abort_task);
+	}
+}
+
+static int handle_hdrq_full(struct ipath_devdata *dd)
+{
+	int chkerrpkts = 0;
+	u32 hd, tl;
+	u32 i;
+
+	ipath_stats.sps_hdrqfull++;
+	for (i = 0; i < dd->ipath_cfgports; i++) {
+		struct ipath_portdata *pd = dd->ipath_pd[i];
+
+		if (i == 0) {
+			/*
+			 * For kernel receive queues, we just want to know
+			 * if there are packets in the queue that we can
+			 * process.
+			 */
+			if (pd->port_head != ipath_get_hdrqtail(pd))
+				chkerrpkts |= 1 << i;
+			continue;
+		}
+
+		/* Skip if user context is not open */
+		if (!pd || !pd->port_cnt)
+			continue;
+
+		/* Don't report the same point multiple times. */
+		if (dd->ipath_flags & IPATH_NODMA_RTAIL)
+			tl = ipath_read_ureg32(dd, ur_rcvhdrtail, i);
+		else
+			tl = ipath_get_rcvhdrtail(pd);
+		if (tl == pd->port_lastrcvhdrqtail)
+			continue;
+
+		hd = ipath_read_ureg32(dd, ur_rcvhdrhead, i);
+		if (hd == (tl + 1) || (!hd && tl == dd->ipath_hdrqlast)) {
+			pd->port_lastrcvhdrqtail = tl;
+			pd->port_hdrqfull++;
+			/* flush hdrqfull so that poll() sees it */
+			wmb();
+			wake_up_interruptible(&pd->port_wait);
+		}
+	}
+
+	return chkerrpkts;
+}
+
+static int handle_errors(struct ipath_devdata *dd, ipath_err_t errs)
+{
+	char msg[128];
+	u64 ignore_this_time = 0;
+	u64 iserr = 0;
+	int chkerrpkts = 0, noprint = 0;
+	unsigned supp_msgs;
+	int log_idx;
+
+	/*
+	 * don't report errors that are masked, either at init
+	 * (not set in ipath_errormask), or temporarily (set in
+	 * ipath_maskederrs)
+	 */
+	errs &= dd->ipath_errormask & ~dd->ipath_maskederrs;
+
+	supp_msgs = handle_frequent_errors(dd, errs, msg, (u32)sizeof msg,
+		&noprint);
+
+	/* do these first, they are most important */
+	if (errs & INFINIPATH_E_HARDWARE) {
+		/* reuse same msg buf */
+		dd->ipath_f_handle_hwerrors(dd, msg, sizeof msg);
+	} else {
+		u64 mask;
+		for (log_idx = 0; log_idx < IPATH_EEP_LOG_CNT; ++log_idx) {
+			mask = dd->ipath_eep_st_masks[log_idx].errs_to_log;
+			if (errs & mask)
+				ipath_inc_eeprom_err(dd, log_idx, 1);
+		}
+	}
+
+	if (errs & INFINIPATH_E_SDMAERRS)
+		handle_sdma_errors(dd, errs);
+
+	if (!noprint && (errs & ~dd->ipath_e_bitsextant))
+		ipath_dev_err(dd, "error interrupt with unknown errors "
+			      "%llx set\n", (unsigned long long)
+			      (errs & ~dd->ipath_e_bitsextant));
+
+	if (errs & E_SUM_ERRS)
+		ignore_this_time = handle_e_sum_errs(dd, errs);
+	else if ((errs & E_SUM_LINK_PKTERRS) &&
+	    !(dd->ipath_flags & IPATH_LINKACTIVE)) {
+		/*
+		 * This can happen when SMA is trying to bring the link
+		 * up, but the IB link changes state at the "wrong" time.
+		 * The IB logic then complains that the packet isn't
+		 * valid.  We don't want to confuse people, so we just
+		 * don't print them, except at debug
+		 */
+		ipath_dbg("Ignoring packet errors %llx, because link not "
+			  "ACTIVE\n", (unsigned long long) errs);
+		ignore_this_time = errs & E_SUM_LINK_PKTERRS;
+	}
+
+	if (supp_msgs == 250000) {
+		int s_iserr;
+		/*
+		 * It's not entirely reasonable assuming that the errors set
+		 * in the last clear period are all responsible for the
+		 * problem, but the alternative is to assume it's the only
+		 * ones on this particular interrupt, which also isn't great
+		 */
+		dd->ipath_maskederrs |= dd->ipath_lasterror | errs;
+
+		dd->ipath_errormask &= ~dd->ipath_maskederrs;
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
+				 dd->ipath_errormask);
+		s_iserr = ipath_decode_err(dd, msg, sizeof msg,
+					   dd->ipath_maskederrs);
+
+		if (dd->ipath_maskederrs &
+		    ~(INFINIPATH_E_RRCVEGRFULL |
+		      INFINIPATH_E_RRCVHDRFULL | INFINIPATH_E_PKTERRS))
+			ipath_dev_err(dd, "Temporarily disabling "
+			    "error(s) %llx reporting; too frequent (%s)\n",
+				(unsigned long long) dd->ipath_maskederrs,
+				msg);
+		else {
+			/*
+			 * rcvegrfull and rcvhdrqfull are "normal",
+			 * for some types of processes (mostly benchmarks)
+			 * that send huge numbers of messages, while not
+			 * processing them.  So only complain about
+			 * these at debug level.
+			 */
+			if (s_iserr)
+				ipath_dbg("Temporarily disabling reporting "
+				    "too frequent queue full errors (%s)\n",
+				    msg);
+			else
+				ipath_cdbg(ERRPKT,
+				    "Temporarily disabling reporting too"
+				    " frequent packet errors (%s)\n",
+				    msg);
+		}
+
+		/*
+		 * Re-enable the masked errors after around 3 minutes.  in
+		 * ipath_get_faststats().  If we have a series of fast
+		 * repeating but different errors, the interval will keep
+		 * stretching out, but that's OK, as that's pretty
+		 * catastrophic.
+		 */
+		dd->ipath_unmasktime = jiffies + HZ * 180;
+	}
+
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, errs);
+	if (ignore_this_time)
+		errs &= ~ignore_this_time;
+	if (errs & ~dd->ipath_lasterror) {
+		errs &= ~dd->ipath_lasterror;
+		/* never suppress duplicate hwerrors or ibstatuschange */
+		dd->ipath_lasterror |= errs &
+			~(INFINIPATH_E_HARDWARE |
+			  INFINIPATH_E_IBSTATUSCHANGED);
+	}
+
+	if (errs & INFINIPATH_E_SENDSPECIALTRIGGER) {
+		dd->ipath_spectriggerhit++;
+		ipath_dbg("%lu special trigger hits\n",
+			dd->ipath_spectriggerhit);
+	}
+
+	/* likely due to cancel; so suppress message unless verbose */
+	if ((errs & (INFINIPATH_E_SPKTLEN | INFINIPATH_E_SPIOARMLAUNCH)) &&
+		time_after(dd->ipath_lastcancel, jiffies)) {
+		/* armlaunch takes precedence; it often causes both. */
+		ipath_cdbg(VERBOSE,
+			"Suppressed %s error (%llx) after sendbuf cancel\n",
+			(errs &  INFINIPATH_E_SPIOARMLAUNCH) ?
+			"armlaunch" : "sendpktlen", (unsigned long long)errs);
+		errs &= ~(INFINIPATH_E_SPIOARMLAUNCH | INFINIPATH_E_SPKTLEN);
+	}
+
+	if (!errs)
+		return 0;
+
+	if (!noprint) {
+		ipath_err_t mask;
+		/*
+		 * The ones we mask off are handled specially below
+		 * or above.  Also mask SDMADISABLED by default as it
+		 * is too chatty.
+		 */
+		mask = INFINIPATH_E_IBSTATUSCHANGED |
+			INFINIPATH_E_RRCVEGRFULL | INFINIPATH_E_RRCVHDRFULL |
+			INFINIPATH_E_HARDWARE | INFINIPATH_E_SDMADISABLED;
+
+		/* if we're in debug, then don't mask SDMADISABLED msgs */
+		if (ipath_debug & __IPATH_DBG)
+			mask &= ~INFINIPATH_E_SDMADISABLED;
+
+		ipath_decode_err(dd, msg, sizeof msg, errs & ~mask);
+	} else
+		/* so we don't need if (!noprint) at strlcat's below */
+		*msg = 0;
+
+	if (errs & E_SUM_PKTERRS) {
+		ipath_stats.sps_pkterrs++;
+		chkerrpkts = 1;
+	}
+	if (errs & E_SUM_ERRS)
+		ipath_stats.sps_errs++;
+
+	if (errs & (INFINIPATH_E_RICRC | INFINIPATH_E_RVCRC)) {
+		ipath_stats.sps_crcerrs++;
+		chkerrpkts = 1;
+	}
+	iserr = errs & ~(E_SUM_PKTERRS | INFINIPATH_E_PKTERRS);
+
+
+	/*
+	 * We don't want to print these two as they happen, or we can make
+	 * the situation even worse, because it takes so long to print
+	 * messages to serial consoles.  Kernel ports get printed from
+	 * fast_stats, no more than every 5 seconds, user ports get printed
+	 * on close
+	 */
+	if (errs & INFINIPATH_E_RRCVHDRFULL)
+		chkerrpkts |= handle_hdrq_full(dd);
+	if (errs & INFINIPATH_E_RRCVEGRFULL) {
+		struct ipath_portdata *pd = dd->ipath_pd[0];
+
+		/*
+		 * since this is of less importance and not likely to
+		 * happen without also getting hdrfull, only count
+		 * occurrences; don't check each port (or even the kernel
+		 * vs user)
+		 */
+		ipath_stats.sps_etidfull++;
+		if (pd->port_head != ipath_get_hdrqtail(pd))
+			chkerrpkts |= 1;
+	}
+
+	/*
+	 * do this before IBSTATUSCHANGED, in case both bits set in a single
+	 * interrupt; we want the STATUSCHANGE to "win", so we do our
+	 * internal copy of state machine correctly
+	 */
+	if (errs & INFINIPATH_E_RIBLOSTLINK) {
+		/*
+		 * force through block below
+		 */
+		errs |= INFINIPATH_E_IBSTATUSCHANGED;
+		ipath_stats.sps_iblink++;
+		dd->ipath_flags |= IPATH_LINKDOWN;
+		dd->ipath_flags &= ~(IPATH_LINKUNK | IPATH_LINKINIT
+				     | IPATH_LINKARMED | IPATH_LINKACTIVE);
+		*dd->ipath_statusp &= ~IPATH_STATUS_IB_READY;
+
+		ipath_dbg("Lost link, link now down (%s)\n",
+			ipath_ibcstatus_str[ipath_read_kreg64(dd,
+			dd->ipath_kregs->kr_ibcstatus) & 0xf]);
+	}
+	if (errs & INFINIPATH_E_IBSTATUSCHANGED)
+		handle_e_ibstatuschanged(dd, errs);
+
+	if (errs & INFINIPATH_E_RESET) {
+		if (!noprint)
+			ipath_dev_err(dd, "Got reset, requires re-init "
+				      "(unload and reload driver)\n");
+		dd->ipath_flags &= ~IPATH_INITTED;	/* needs re-init */
+		/* mark as having had error */
+		*dd->ipath_statusp |= IPATH_STATUS_HWERROR;
+		*dd->ipath_statusp &= ~IPATH_STATUS_IB_CONF;
+	}
+
+	if (!noprint && *msg) {
+		if (iserr)
+			ipath_dev_err(dd, "%s error\n", msg);
+	}
+	if (dd->ipath_state_wanted & dd->ipath_flags) {
+		ipath_cdbg(VERBOSE, "driver wanted state %x, iflags now %x, "
+			   "waking\n", dd->ipath_state_wanted,
+			   dd->ipath_flags);
+		wake_up_interruptible(&ipath_state_wait);
+	}
+
+	return chkerrpkts;
+}
+
+/*
+ * try to cleanup as much as possible for anything that might have gone
+ * wrong while in freeze mode, such as pio buffers being written by user
+ * processes (causing armlaunch), send errors due to going into freeze mode,
+ * etc., and try to avoid causing extra interrupts while doing so.
+ * Forcibly update the in-memory pioavail register copies after cleanup
+ * because the chip won't do it while in freeze mode (the register values
+ * themselves are kept correct).
+ * Make sure that we don't lose any important interrupts by using the chip
+ * feature that says that writing 0 to a bit in *clear that is set in
+ * *status will cause an interrupt to be generated again (if allowed by
+ * the *mask value).
+ */
+void ipath_clear_freeze(struct ipath_devdata *dd)
+{
+	/* disable error interrupts, to avoid confusion */
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask, 0ULL);
+
+	/* also disable interrupts; errormask is sometimes overwriten */
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, 0ULL);
+
+	ipath_cancel_sends(dd, 1);
+
+	/* clear the freeze, and be sure chip saw it */
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_control,
+			 dd->ipath_control);
+	ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+
+	/* force in-memory update now we are out of freeze */
+	ipath_force_pio_avail_update(dd);
+
+	/*
+	 * force new interrupt if any hwerr, error or interrupt bits are
+	 * still set, and clear "safe" send packet errors related to freeze
+	 * and cancelling sends.  Re-enable error interrupts before possible
+	 * force of re-interrupt on pending interrupts.
+	 */
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear, 0ULL);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear,
+		E_SPKT_ERRS_IGNORE);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
+		dd->ipath_errormask);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, -1LL);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, 0ULL);
+}
+
+
+/* this is separate to allow for better optimization of ipath_intr() */
+
+static noinline void ipath_bad_intr(struct ipath_devdata *dd, u32 *unexpectp)
+{
+	/*
+	 * sometimes happen during driver init and unload, don't want
+	 * to process any interrupts at that point
+	 */
+
+	/* this is just a bandaid, not a fix, if something goes badly
+	 * wrong */
+	if (++*unexpectp > 100) {
+		if (++*unexpectp > 105) {
+			/*
+			 * ok, we must be taking somebody else's interrupts,
+			 * due to a messed up mptable and/or PIRQ table, so
+			 * unregister the interrupt.  We've seen this during
+			 * linuxbios development work, and it may happen in
+			 * the future again.
+			 */
+			if (dd->pcidev && dd->ipath_irq) {
+				ipath_dev_err(dd, "Now %u unexpected "
+					      "interrupts, unregistering "
+					      "interrupt handler\n",
+					      *unexpectp);
+				ipath_dbg("free_irq of irq %d\n",
+					  dd->ipath_irq);
+				dd->ipath_f_free_irq(dd);
+			}
+		}
+		if (ipath_read_ireg(dd, dd->ipath_kregs->kr_intmask)) {
+			ipath_dev_err(dd, "%u unexpected interrupts, "
+				      "disabling interrupts completely\n",
+				      *unexpectp);
+			/*
+			 * disable all interrupts, something is very wrong
+			 */
+			ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask,
+					 0ULL);
+		}
+	} else if (*unexpectp > 1)
+		ipath_dbg("Interrupt when not ready, should not happen, "
+			  "ignoring\n");
+}
+
+static noinline void ipath_bad_regread(struct ipath_devdata *dd)
+{
+	static int allbits;
+
+	/* separate routine, for better optimization of ipath_intr() */
+
+	/*
+	 * We print the message and disable interrupts, in hope of
+	 * having a better chance of debugging the problem.
+	 */
+	ipath_dev_err(dd,
+		      "Read of interrupt status failed (all bits set)\n");
+	if (allbits++) {
+		/* disable all interrupts, something is very wrong */
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, 0ULL);
+		if (allbits == 2) {
+			ipath_dev_err(dd, "Still bad interrupt status, "
+				      "unregistering interrupt\n");
+			dd->ipath_f_free_irq(dd);
+		} else if (allbits > 2) {
+			if ((allbits % 10000) == 0)
+				printk(".");
+		} else
+			ipath_dev_err(dd, "Disabling interrupts, "
+				      "multiple errors\n");
+	}
+}
+
+static void handle_layer_pioavail(struct ipath_devdata *dd)
+{
+	unsigned long flags;
+	int ret;
+
+	ret = ipath_ib_piobufavail(dd->verbs_dev);
+	if (ret > 0)
+		goto set;
+
+	return;
+set:
+	spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+	dd->ipath_sendctrl |= INFINIPATH_S_PIOINTBUFAVAIL;
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
+			 dd->ipath_sendctrl);
+	ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+	spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+}
+
+/*
+ * Handle receive interrupts for user ports; this means a user
+ * process was waiting for a packet to arrive, and didn't want
+ * to poll
+ */
+static void handle_urcv(struct ipath_devdata *dd, u64 istat)
+{
+	u64 portr;
+	int i;
+	int rcvdint = 0;
+
+	/*
+	 * test_and_clear_bit(IPATH_PORT_WAITING_RCV) and
+	 * test_and_clear_bit(IPATH_PORT_WAITING_URG) below
+	 * would both like timely updates of the bits so that
+	 * we don't pass them by unnecessarily.  the rmb()
+	 * here ensures that we see them promptly -- the
+	 * corresponding wmb()'s are in ipath_poll_urgent()
+	 * and ipath_poll_next()...
+	 */
+	rmb();
+	portr = ((istat >> dd->ipath_i_rcvavail_shift) &
+		 dd->ipath_i_rcvavail_mask) |
+		((istat >> dd->ipath_i_rcvurg_shift) &
+		 dd->ipath_i_rcvurg_mask);
+	for (i = 1; i < dd->ipath_cfgports; i++) {
+		struct ipath_portdata *pd = dd->ipath_pd[i];
+
+		if (portr & (1 << i) && pd && pd->port_cnt) {
+			if (test_and_clear_bit(IPATH_PORT_WAITING_RCV,
+					       &pd->port_flag)) {
+				clear_bit(i + dd->ipath_r_intravail_shift,
+					  &dd->ipath_rcvctrl);
+				wake_up_interruptible(&pd->port_wait);
+				rcvdint = 1;
+			} else if (test_and_clear_bit(IPATH_PORT_WAITING_URG,
+						      &pd->port_flag)) {
+				pd->port_urgent++;
+				wake_up_interruptible(&pd->port_wait);
+			}
+		}
+	}
+	if (rcvdint) {
+		/* only want to take one interrupt, so turn off the rcv
+		 * interrupt for all the ports that we set the rcv_waiting
+		 * (but never for kernel port)
+		 */
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
+				 dd->ipath_rcvctrl);
+	}
+}
+
+irqreturn_t ipath_intr(int irq, void *data)
+{
+	struct ipath_devdata *dd = data;
+	u64 istat, chk0rcv = 0;
+	ipath_err_t estat = 0;
+	irqreturn_t ret;
+	static unsigned unexpected = 0;
+	u64 kportrbits;
+
+	ipath_stats.sps_ints++;
+
+	if (dd->ipath_int_counter != (u32) -1)
+		dd->ipath_int_counter++;
+
+	if (!(dd->ipath_flags & IPATH_PRESENT)) {
+		/*
+		 * This return value is not great, but we do not want the
+		 * interrupt core code to remove our interrupt handler
+		 * because we don't appear to be handling an interrupt
+		 * during a chip reset.
+		 */
+		return IRQ_HANDLED;
+	}
+
+	/*
+	 * this needs to be flags&initted, not statusp, so we keep
+	 * taking interrupts even after link goes down, etc.
+	 * Also, we *must* clear the interrupt at some point, or we won't
+	 * take it again, which can be real bad for errors, etc...
+	 */
+
+	if (!(dd->ipath_flags & IPATH_INITTED)) {
+		ipath_bad_intr(dd, &unexpected);
+		ret = IRQ_NONE;
+		goto bail;
+	}
+
+	istat = ipath_read_ireg(dd, dd->ipath_kregs->kr_intstatus);
+
+	if (unlikely(!istat)) {
+		ipath_stats.sps_nullintr++;
+		ret = IRQ_NONE; /* not our interrupt, or already handled */
+		goto bail;
+	}
+	if (unlikely(istat == -1)) {
+		ipath_bad_regread(dd);
+		/* don't know if it was our interrupt or not */
+		ret = IRQ_NONE;
+		goto bail;
+	}
+
+	if (unexpected)
+		unexpected = 0;
+
+	if (unlikely(istat & ~dd->ipath_i_bitsextant))
+		ipath_dev_err(dd,
+			      "interrupt with unknown interrupts %Lx set\n",
+			      (unsigned long long)
+			      istat & ~dd->ipath_i_bitsextant);
+	else if (istat & ~INFINIPATH_I_ERROR) /* errors do own printing */
+		ipath_cdbg(VERBOSE, "intr stat=0x%Lx\n",
+			(unsigned long long) istat);
+
+	if (istat & INFINIPATH_I_ERROR) {
+		ipath_stats.sps_errints++;
+		estat = ipath_read_kreg64(dd,
+					  dd->ipath_kregs->kr_errorstatus);
+		if (!estat)
+			dev_info(&dd->pcidev->dev, "error interrupt (%Lx), "
+				 "but no error bits set!\n",
+				 (unsigned long long) istat);
+		else if (estat == -1LL)
+			/*
+			 * should we try clearing all, or hope next read
+			 * works?
+			 */
+			ipath_dev_err(dd, "Read of error status failed "
+				      "(all bits set); ignoring\n");
+		else
+			chk0rcv |= handle_errors(dd, estat);
+	}
+
+	if (istat & INFINIPATH_I_GPIO) {
+		/*
+		 * GPIO interrupts fall in two broad classes:
+		 * GPIO_2 indicates (on some HT4xx boards) that a packet
+		 *        has arrived for Port 0. Checking for this
+		 *        is controlled by flag IPATH_GPIO_INTR.
+		 * GPIO_3..5 on IBA6120 Rev2 and IBA6110 Rev4 chips indicate
+		 *        errors that we need to count. Checking for this
+		 *        is controlled by flag IPATH_GPIO_ERRINTRS.
+		 */
+		u32 gpiostatus;
+		u32 to_clear = 0;
+
+		gpiostatus = ipath_read_kreg32(
+			dd, dd->ipath_kregs->kr_gpio_status);
+		/* First the error-counter case. */
+		if ((gpiostatus & IPATH_GPIO_ERRINTR_MASK) &&
+		    (dd->ipath_flags & IPATH_GPIO_ERRINTRS)) {
+			/* want to clear the bits we see asserted. */
+			to_clear |= (gpiostatus & IPATH_GPIO_ERRINTR_MASK);
+
+			/*
+			 * Count appropriately, clear bits out of our copy,
+			 * as they have been "handled".
+			 */
+			if (gpiostatus & (1 << IPATH_GPIO_RXUVL_BIT)) {
+				ipath_dbg("FlowCtl on UnsupVL\n");
+				dd->ipath_rxfc_unsupvl_errs++;
+			}
+			if (gpiostatus & (1 << IPATH_GPIO_OVRUN_BIT)) {
+				ipath_dbg("Overrun Threshold exceeded\n");
+				dd->ipath_overrun_thresh_errs++;
+			}
+			if (gpiostatus & (1 << IPATH_GPIO_LLI_BIT)) {
+				ipath_dbg("Local Link Integrity error\n");
+				dd->ipath_lli_errs++;
+			}
+			gpiostatus &= ~IPATH_GPIO_ERRINTR_MASK;
+		}
+		/* Now the Port0 Receive case */
+		if ((gpiostatus & (1 << IPATH_GPIO_PORT0_BIT)) &&
+		    (dd->ipath_flags & IPATH_GPIO_INTR)) {
+			/*
+			 * GPIO status bit 2 is set, and we expected it.
+			 * clear it and indicate in p0bits.
+			 * This probably only happens if a Port0 pkt
+			 * arrives at _just_ the wrong time, and we
+			 * handle that by seting chk0rcv;
+			 */
+			to_clear |= (1 << IPATH_GPIO_PORT0_BIT);
+			gpiostatus &= ~(1 << IPATH_GPIO_PORT0_BIT);
+			chk0rcv = 1;
+		}
+		if (gpiostatus) {
+			/*
+			 * Some unexpected bits remain. If they could have
+			 * caused the interrupt, complain and clear.
+			 * To avoid repetition of this condition, also clear
+			 * the mask. It is almost certainly due to error.
+			 */
+			const u32 mask = (u32) dd->ipath_gpio_mask;
+
+			if (mask & gpiostatus) {
+				ipath_dbg("Unexpected GPIO IRQ bits %x\n",
+				  gpiostatus & mask);
+				to_clear |= (gpiostatus & mask);
+				dd->ipath_gpio_mask &= ~(gpiostatus & mask);
+				ipath_write_kreg(dd,
+					dd->ipath_kregs->kr_gpio_mask,
+					dd->ipath_gpio_mask);
+			}
+		}
+		if (to_clear) {
+			ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_clear,
+					(u64) to_clear);
+		}
+	}
+
+	/*
+	 * Clear the interrupt bits we found set, unless they are receive
+	 * related, in which case we already cleared them above, and don't
+	 * want to clear them again, because we might lose an interrupt.
+	 * Clear it early, so we "know" know the chip will have seen this by
+	 * the time we process the queue, and will re-interrupt if necessary.
+	 * The processor itself won't take the interrupt again until we return.
+	 */
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, istat);
+
+	/*
+	 * Handle kernel receive queues before checking for pio buffers
+	 * available since receives can overflow; piobuf waiters can afford
+	 * a few extra cycles, since they were waiting anyway, and user's
+	 * waiting for receive are at the bottom.
+	 */
+	kportrbits = (1ULL << dd->ipath_i_rcvavail_shift) |
+		(1ULL << dd->ipath_i_rcvurg_shift);
+	if (chk0rcv || (istat & kportrbits)) {
+		istat &= ~kportrbits;
+		ipath_kreceive(dd->ipath_pd[0]);
+	}
+
+	if (istat & ((dd->ipath_i_rcvavail_mask << dd->ipath_i_rcvavail_shift) |
+		     (dd->ipath_i_rcvurg_mask << dd->ipath_i_rcvurg_shift)))
+		handle_urcv(dd, istat);
+
+	if (istat & (INFINIPATH_I_SDMAINT | INFINIPATH_I_SDMADISABLED))
+		handle_sdma_intr(dd, istat);
+
+	if (istat & INFINIPATH_I_SPIOBUFAVAIL) {
+		unsigned long flags;
+
+		spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+		dd->ipath_sendctrl &= ~INFINIPATH_S_PIOINTBUFAVAIL;
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
+				 dd->ipath_sendctrl);
+		ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+		spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+
+		/* always process; sdma verbs uses PIO for acks and VL15  */
+		handle_layer_pioavail(dd);
+	}
+
+	ret = IRQ_HANDLED;
+
+bail:
+	return ret;
+}
diff --git a/drivers/infiniband/hw/ipath/ipath_kernel.h b/drivers/infiniband/hw/ipath/ipath_kernel.h
new file mode 100644
index 0000000..6559af6
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_kernel.h
@@ -0,0 +1,1378 @@
+#ifndef _IPATH_KERNEL_H
+#define _IPATH_KERNEL_H
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * This header file is the base header file for infinipath kernel code
+ * ipath_user.h serves a similar purpose for user code.
+ */
+
+#include <linux/interrupt.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/scatterlist.h>
+#include <asm/io.h>
+#include <rdma/ib_verbs.h>
+
+#include "ipath_common.h"
+#include "ipath_debug.h"
+#include "ipath_registers.h"
+
+/* only s/w major version of InfiniPath we can handle */
+#define IPATH_CHIP_VERS_MAJ 2U
+
+/* don't care about this except printing */
+#define IPATH_CHIP_VERS_MIN 0U
+
+/* temporary, maybe always */
+extern struct infinipath_stats ipath_stats;
+
+#define IPATH_CHIP_SWVERSION IPATH_CHIP_VERS_MAJ
+/*
+ * First-cut critierion for "device is active" is
+ * two thousand dwords combined Tx, Rx traffic per
+ * 5-second interval. SMA packets are 64 dwords,
+ * and occur "a few per second", presumably each way.
+ */
+#define IPATH_TRAFFIC_ACTIVE_THRESHOLD (2000)
+/*
+ * Struct used to indicate which errors are logged in each of the
+ * error-counters that are logged to EEPROM. A counter is incremented
+ * _once_ (saturating at 255) for each event with any bits set in
+ * the error or hwerror register masks below.
+ */
+#define IPATH_EEP_LOG_CNT (4)
+struct ipath_eep_log_mask {
+	u64 errs_to_log;
+	u64 hwerrs_to_log;
+};
+
+struct ipath_portdata {
+	void **port_rcvegrbuf;
+	dma_addr_t *port_rcvegrbuf_phys;
+	/* rcvhdrq base, needs mmap before useful */
+	void *port_rcvhdrq;
+	/* kernel virtual address where hdrqtail is updated */
+	void *port_rcvhdrtail_kvaddr;
+	/*
+	 * temp buffer for expected send setup, allocated at open, instead
+	 * of each setup call
+	 */
+	void *port_tid_pg_list;
+	/* when waiting for rcv or pioavail */
+	wait_queue_head_t port_wait;
+	/*
+	 * rcvegr bufs base, physical, must fit
+	 * in 44 bits so 32 bit programs mmap64 44 bit works)
+	 */
+	dma_addr_t port_rcvegr_phys;
+	/* mmap of hdrq, must fit in 44 bits */
+	dma_addr_t port_rcvhdrq_phys;
+	dma_addr_t port_rcvhdrqtailaddr_phys;
+	/*
+	 * number of opens (including slave subports) on this instance
+	 * (ignoring forks, dup, etc. for now)
+	 */
+	int port_cnt;
+	/*
+	 * how much space to leave at start of eager TID entries for
+	 * protocol use, on each TID
+	 */
+	/* instead of calculating it */
+	unsigned port_port;
+	/* non-zero if port is being shared. */
+	u16 port_subport_cnt;
+	/* non-zero if port is being shared. */
+	u16 port_subport_id;
+	/* number of pio bufs for this port (all procs, if shared) */
+	u32 port_piocnt;
+	/* first pio buffer for this port */
+	u32 port_pio_base;
+	/* chip offset of PIO buffers for this port */
+	u32 port_piobufs;
+	/* how many alloc_pages() chunks in port_rcvegrbuf_pages */
+	u32 port_rcvegrbuf_chunks;
+	/* how many egrbufs per chunk */
+	u32 port_rcvegrbufs_perchunk;
+	/* order for port_rcvegrbuf_pages */
+	size_t port_rcvegrbuf_size;
+	/* rcvhdrq size (for freeing) */
+	size_t port_rcvhdrq_size;
+	/* next expected TID to check when looking for free */
+	u32 port_tidcursor;
+	/* next expected TID to check */
+	unsigned long port_flag;
+	/* what happened */
+	unsigned long int_flag;
+	/* WAIT_RCV that timed out, no interrupt */
+	u32 port_rcvwait_to;
+	/* WAIT_PIO that timed out, no interrupt */
+	u32 port_piowait_to;
+	/* WAIT_RCV already happened, no wait */
+	u32 port_rcvnowait;
+	/* WAIT_PIO already happened, no wait */
+	u32 port_pionowait;
+	/* total number of rcvhdrqfull errors */
+	u32 port_hdrqfull;
+	/*
+	 * Used to suppress multiple instances of same
+	 * port staying stuck at same point.
+	 */
+	u32 port_lastrcvhdrqtail;
+	/* saved total number of rcvhdrqfull errors for poll edge trigger */
+	u32 port_hdrqfull_poll;
+	/* total number of polled urgent packets */
+	u32 port_urgent;
+	/* saved total number of polled urgent packets for poll edge trigger */
+	u32 port_urgent_poll;
+	/* pid of process using this port */
+	struct pid *port_pid;
+	struct pid *port_subpid[INFINIPATH_MAX_SUBPORT];
+	/* same size as task_struct .comm[] */
+	char port_comm[16];
+	/* pkeys set by this use of this port */
+	u16 port_pkeys[4];
+	/* so file ops can get at unit */
+	struct ipath_devdata *port_dd;
+	/* A page of memory for rcvhdrhead, rcvegrhead, rcvegrtail * N */
+	void *subport_uregbase;
+	/* An array of pages for the eager receive buffers * N */
+	void *subport_rcvegrbuf;
+	/* An array of pages for the eager header queue entries * N */
+	void *subport_rcvhdr_base;
+	/* The version of the library which opened this port */
+	u32 userversion;
+	/* Bitmask of active slaves */
+	u32 active_slaves;
+	/* Type of packets or conditions we want to poll for */
+	u16 poll_type;
+	/* port rcvhdrq head offset */
+	u32 port_head;
+	/* receive packet sequence counter */
+	u32 port_seq_cnt;
+};
+
+struct sk_buff;
+struct ipath_sge_state;
+struct ipath_verbs_txreq;
+
+/*
+ * control information for layered drivers
+ */
+struct _ipath_layer {
+	void *l_arg;
+};
+
+struct ipath_skbinfo {
+	struct sk_buff *skb;
+	dma_addr_t phys;
+};
+
+struct ipath_sdma_txreq {
+	int                 flags;
+	int                 sg_count;
+	union {
+		struct scatterlist *sg;
+		void *map_addr;
+	};
+	void              (*callback)(void *, int);
+	void               *callback_cookie;
+	int                 callback_status;
+	u16                 start_idx;  /* sdma private */
+	u16                 next_descq_idx;  /* sdma private */
+	struct list_head    list;       /* sdma private */
+};
+
+struct ipath_sdma_desc {
+	__le64 qw[2];
+};
+
+#define IPATH_SDMA_TXREQ_F_USELARGEBUF  0x1
+#define IPATH_SDMA_TXREQ_F_HEADTOHOST   0x2
+#define IPATH_SDMA_TXREQ_F_INTREQ       0x4
+#define IPATH_SDMA_TXREQ_F_FREEBUF      0x8
+#define IPATH_SDMA_TXREQ_F_FREEDESC     0x10
+#define IPATH_SDMA_TXREQ_F_VL15         0x20
+
+#define IPATH_SDMA_TXREQ_S_OK        0
+#define IPATH_SDMA_TXREQ_S_SENDERROR 1
+#define IPATH_SDMA_TXREQ_S_ABORTED   2
+#define IPATH_SDMA_TXREQ_S_SHUTDOWN  3
+
+#define IPATH_SDMA_STATUS_SCORE_BOARD_DRAIN_IN_PROG	(1ull << 63)
+#define IPATH_SDMA_STATUS_ABORT_IN_PROG			(1ull << 62)
+#define IPATH_SDMA_STATUS_INTERNAL_SDMA_ENABLE		(1ull << 61)
+#define IPATH_SDMA_STATUS_SCB_EMPTY			(1ull << 30)
+
+/* max dwords in small buffer packet */
+#define IPATH_SMALLBUF_DWORDS (dd->ipath_piosize2k >> 2)
+
+/*
+ * Possible IB config parameters for ipath_f_get/set_ib_cfg()
+ */
+#define IPATH_IB_CFG_LIDLMC 0 /* Get/set LID (LS16b) and Mask (MS16b) */
+#define IPATH_IB_CFG_HRTBT 1 /* Get/set Heartbeat off/enable/auto */
+#define IPATH_IB_HRTBT_ON 3 /* Heartbeat enabled, sent every 100msec */
+#define IPATH_IB_HRTBT_OFF 0 /* Heartbeat off */
+#define IPATH_IB_CFG_LWID_ENB 2 /* Get/set allowed Link-width */
+#define IPATH_IB_CFG_LWID 3 /* Get currently active Link-width */
+#define IPATH_IB_CFG_SPD_ENB 4 /* Get/set allowed Link speeds */
+#define IPATH_IB_CFG_SPD 5 /* Get current Link spd */
+#define IPATH_IB_CFG_RXPOL_ENB 6 /* Get/set Auto-RX-polarity enable */
+#define IPATH_IB_CFG_LREV_ENB 7 /* Get/set Auto-Lane-reversal enable */
+#define IPATH_IB_CFG_LINKLATENCY 8 /* Get Auto-Lane-reversal enable */
+
+
+struct ipath_devdata {
+	struct list_head ipath_list;
+
+	struct ipath_kregs const *ipath_kregs;
+	struct ipath_cregs const *ipath_cregs;
+
+	/* mem-mapped pointer to base of chip regs */
+	u64 __iomem *ipath_kregbase;
+	/* end of mem-mapped chip space; range checking */
+	u64 __iomem *ipath_kregend;
+	/* physical address of chip for io_remap, etc. */
+	unsigned long ipath_physaddr;
+	/* base of memory alloced for ipath_kregbase, for free */
+	u64 *ipath_kregalloc;
+	/* ipath_cfgports pointers */
+	struct ipath_portdata **ipath_pd;
+	/* sk_buffs used by port 0 eager receive queue */
+	struct ipath_skbinfo *ipath_port0_skbinfo;
+	/* kvirt address of 1st 2k pio buffer */
+	void __iomem *ipath_pio2kbase;
+	/* kvirt address of 1st 4k pio buffer */
+	void __iomem *ipath_pio4kbase;
+	/*
+	 * points to area where PIOavail registers will be DMA'ed.
+	 * Has to be on a page of it's own, because the page will be
+	 * mapped into user program space.  This copy is *ONLY* ever
+	 * written by DMA, not by the driver!  Need a copy per device
+	 * when we get to multiple devices
+	 */
+	volatile __le64 *ipath_pioavailregs_dma;
+	/* physical address where updates occur */
+	dma_addr_t ipath_pioavailregs_phys;
+	struct _ipath_layer ipath_layer;
+	/* setup intr */
+	int (*ipath_f_intrsetup)(struct ipath_devdata *);
+	/* fallback to alternate interrupt type if possible */
+	int (*ipath_f_intr_fallback)(struct ipath_devdata *);
+	/* setup on-chip bus config */
+	int (*ipath_f_bus)(struct ipath_devdata *, struct pci_dev *);
+	/* hard reset chip */
+	int (*ipath_f_reset)(struct ipath_devdata *);
+	int (*ipath_f_get_boardname)(struct ipath_devdata *, char *,
+				     size_t);
+	void (*ipath_f_init_hwerrors)(struct ipath_devdata *);
+	void (*ipath_f_handle_hwerrors)(struct ipath_devdata *, char *,
+					size_t);
+	void (*ipath_f_quiet_serdes)(struct ipath_devdata *);
+	int (*ipath_f_bringup_serdes)(struct ipath_devdata *);
+	int (*ipath_f_early_init)(struct ipath_devdata *);
+	void (*ipath_f_clear_tids)(struct ipath_devdata *, unsigned);
+	void (*ipath_f_put_tid)(struct ipath_devdata *, u64 __iomem*,
+				u32, unsigned long);
+	void (*ipath_f_tidtemplate)(struct ipath_devdata *);
+	void (*ipath_f_cleanup)(struct ipath_devdata *);
+	void (*ipath_f_setextled)(struct ipath_devdata *, u64, u64);
+	/* fill out chip-specific fields */
+	int (*ipath_f_get_base_info)(struct ipath_portdata *, void *);
+	/* free irq */
+	void (*ipath_f_free_irq)(struct ipath_devdata *);
+	struct ipath_message_header *(*ipath_f_get_msgheader)
+					(struct ipath_devdata *, __le32 *);
+	void (*ipath_f_config_ports)(struct ipath_devdata *, ushort);
+	int (*ipath_f_get_ib_cfg)(struct ipath_devdata *, int);
+	int (*ipath_f_set_ib_cfg)(struct ipath_devdata *, int, u32);
+	void (*ipath_f_config_jint)(struct ipath_devdata *, u16 , u16);
+	void (*ipath_f_read_counters)(struct ipath_devdata *,
+					struct infinipath_counters *);
+	void (*ipath_f_xgxs_reset)(struct ipath_devdata *);
+	/* per chip actions needed for IB Link up/down changes */
+	int (*ipath_f_ib_updown)(struct ipath_devdata *, int, u64);
+
+	unsigned ipath_lastegr_idx;
+	struct ipath_ibdev *verbs_dev;
+	struct timer_list verbs_timer;
+	/* total dwords sent (summed from counter) */
+	u64 ipath_sword;
+	/* total dwords rcvd (summed from counter) */
+	u64 ipath_rword;
+	/* total packets sent (summed from counter) */
+	u64 ipath_spkts;
+	/* total packets rcvd (summed from counter) */
+	u64 ipath_rpkts;
+	/* ipath_statusp initially points to this. */
+	u64 _ipath_status;
+	/* GUID for this interface, in network order */
+	__be64 ipath_guid;
+	/*
+	 * aggregrate of error bits reported since last cleared, for
+	 * limiting of error reporting
+	 */
+	ipath_err_t ipath_lasterror;
+	/*
+	 * aggregrate of error bits reported since last cleared, for
+	 * limiting of hwerror reporting
+	 */
+	ipath_err_t ipath_lasthwerror;
+	/* errors masked because they occur too fast */
+	ipath_err_t ipath_maskederrs;
+	u64 ipath_lastlinkrecov; /* link recoveries at last ACTIVE */
+	/* these 5 fields are used to establish deltas for IB Symbol
+	 * errors and linkrecovery errors. They can be reported on
+	 * some chips during link negotiation prior to INIT, and with
+	 * DDR when faking DDR negotiations with non-IBTA switches.
+	 * The chip counters are adjusted at driver unload if there is
+	 * a non-zero delta.
+	 */
+	u64 ibdeltainprog;
+	u64 ibsymdelta;
+	u64 ibsymsnap;
+	u64 iblnkerrdelta;
+	u64 iblnkerrsnap;
+
+	/* time in jiffies at which to re-enable maskederrs */
+	unsigned long ipath_unmasktime;
+	/* count of egrfull errors, combined for all ports */
+	u64 ipath_last_tidfull;
+	/* for ipath_qcheck() */
+	u64 ipath_lastport0rcv_cnt;
+	/* template for writing TIDs  */
+	u64 ipath_tidtemplate;
+	/* value to write to free TIDs */
+	u64 ipath_tidinvalid;
+	/* IBA6120 rcv interrupt setup */
+	u64 ipath_rhdrhead_intr_off;
+
+	/* size of memory at ipath_kregbase */
+	u32 ipath_kregsize;
+	/* number of registers used for pioavail */
+	u32 ipath_pioavregs;
+	/* IPATH_POLL, etc. */
+	u32 ipath_flags;
+	/* ipath_flags driver is waiting for */
+	u32 ipath_state_wanted;
+	/* last buffer for user use, first buf for kernel use is this
+	 * index. */
+	u32 ipath_lastport_piobuf;
+	/* is a stats timer active */
+	u32 ipath_stats_timer_active;
+	/* number of interrupts for this device -- saturates... */
+	u32 ipath_int_counter;
+	/* dwords sent read from counter */
+	u32 ipath_lastsword;
+	/* dwords received read from counter */
+	u32 ipath_lastrword;
+	/* sent packets read from counter */
+	u32 ipath_lastspkts;
+	/* received packets read from counter */
+	u32 ipath_lastrpkts;
+	/* pio bufs allocated per port */
+	u32 ipath_pbufsport;
+	/* if remainder on bufs/port, ports < extrabuf get 1 extra */
+	u32 ipath_ports_extrabuf;
+	u32 ipath_pioupd_thresh; /* update threshold, some chips */
+	/*
+	 * number of ports configured as max; zero is set to number chip
+	 * supports, less gives more pio bufs/port, etc.
+	 */
+	u32 ipath_cfgports;
+	/* count of port 0 hdrqfull errors */
+	u32 ipath_p0_hdrqfull;
+	/* port 0 number of receive eager buffers */
+	u32 ipath_p0_rcvegrcnt;
+
+	/*
+	 * index of last piobuffer we used.  Speeds up searching, by
+	 * starting at this point.  Doesn't matter if multiple cpu's use and
+	 * update, last updater is only write that matters.  Whenever it
+	 * wraps, we update shadow copies.  Need a copy per device when we
+	 * get to multiple devices
+	 */
+	u32 ipath_lastpioindex;
+	u32 ipath_lastpioindexl;
+	/* max length of freezemsg */
+	u32 ipath_freezelen;
+	/*
+	 * consecutive times we wanted a PIO buffer but were unable to
+	 * get one
+	 */
+	u32 ipath_consec_nopiobuf;
+	/*
+	 * hint that we should update ipath_pioavailshadow before
+	 * looking for a PIO buffer
+	 */
+	u32 ipath_upd_pio_shadow;
+	/* so we can rewrite it after a chip reset */
+	u32 ipath_pcibar0;
+	/* so we can rewrite it after a chip reset */
+	u32 ipath_pcibar1;
+	u32 ipath_x1_fix_tries;
+	u32 ipath_autoneg_tries;
+	u32 serdes_first_init_done;
+
+	struct ipath_relock {
+		atomic_t ipath_relock_timer_active;
+		struct timer_list ipath_relock_timer;
+		unsigned int ipath_relock_interval; /* in jiffies */
+	} ipath_relock_singleton;
+
+	/* interrupt number */
+	int ipath_irq;
+	/* HT/PCI Vendor ID (here for NodeInfo) */
+	u16 ipath_vendorid;
+	/* HT/PCI Device ID (here for NodeInfo) */
+	u16 ipath_deviceid;
+	/* offset in HT config space of slave/primary interface block */
+	u8 ipath_ht_slave_off;
+	/* for write combining settings */
+	unsigned long ipath_wc_cookie;
+	unsigned long ipath_wc_base;
+	unsigned long ipath_wc_len;
+	/* ref count for each pkey */
+	atomic_t ipath_pkeyrefs[4];
+	/* shadow copy of struct page *'s for exp tid pages */
+	struct page **ipath_pageshadow;
+	/* shadow copy of dma handles for exp tid pages */
+	dma_addr_t *ipath_physshadow;
+	u64 __iomem *ipath_egrtidbase;
+	/* lock to workaround chip bug 9437 and others */
+	spinlock_t ipath_kernel_tid_lock;
+	spinlock_t ipath_user_tid_lock;
+	spinlock_t ipath_sendctrl_lock;
+	/* around ipath_pd and (user ports) port_cnt use (intr vs free) */
+	spinlock_t ipath_uctxt_lock;
+
+	/*
+	 * IPATH_STATUS_*,
+	 * this address is mapped readonly into user processes so they can
+	 * get status cheaply, whenever they want.
+	 */
+	u64 *ipath_statusp;
+	/* freeze msg if hw error put chip in freeze */
+	char *ipath_freezemsg;
+	/* pci access data structure */
+	struct pci_dev *pcidev;
+	struct cdev *user_cdev;
+	struct cdev *diag_cdev;
+	struct device *user_dev;
+	struct device *diag_dev;
+	/* timer used to prevent stats overflow, error throttling, etc. */
+	struct timer_list ipath_stats_timer;
+	/* timer to verify interrupts work, and fallback if possible */
+	struct timer_list ipath_intrchk_timer;
+	void *ipath_dummy_hdrq;	/* used after port close */
+	dma_addr_t ipath_dummy_hdrq_phys;
+
+	/* SendDMA related entries */
+	spinlock_t            ipath_sdma_lock;
+	unsigned long         ipath_sdma_status;
+	unsigned long         ipath_sdma_abort_jiffies;
+	unsigned long         ipath_sdma_abort_intr_timeout;
+	unsigned long         ipath_sdma_buf_jiffies;
+	struct ipath_sdma_desc *ipath_sdma_descq;
+	u64		      ipath_sdma_descq_added;
+	u64		      ipath_sdma_descq_removed;
+	int		      ipath_sdma_desc_nreserved;
+	u16                   ipath_sdma_descq_cnt;
+	u16                   ipath_sdma_descq_tail;
+	u16                   ipath_sdma_descq_head;
+	u16                   ipath_sdma_next_intr;
+	u16                   ipath_sdma_reset_wait;
+	u8                    ipath_sdma_generation;
+	struct tasklet_struct ipath_sdma_abort_task;
+	struct tasklet_struct ipath_sdma_notify_task;
+	struct list_head      ipath_sdma_activelist;
+	struct list_head      ipath_sdma_notifylist;
+	atomic_t              ipath_sdma_vl15_count;
+	struct timer_list     ipath_sdma_vl15_timer;
+
+	dma_addr_t       ipath_sdma_descq_phys;
+	volatile __le64 *ipath_sdma_head_dma;
+	dma_addr_t       ipath_sdma_head_phys;
+
+	unsigned long ipath_ureg_align; /* user register alignment */
+
+	struct delayed_work ipath_autoneg_work;
+	wait_queue_head_t ipath_autoneg_wait;
+
+	/* HoL blocking / user app forward-progress state */
+	unsigned          ipath_hol_state;
+	unsigned          ipath_hol_next;
+	struct timer_list ipath_hol_timer;
+
+	/*
+	 * Shadow copies of registers; size indicates read access size.
+	 * Most of them are readonly, but some are write-only register,
+	 * where we manipulate the bits in the shadow copy, and then write
+	 * the shadow copy to infinipath.
+	 *
+	 * We deliberately make most of these 32 bits, since they have
+	 * restricted range.  For any that we read, we won't to generate 32
+	 * bit accesses, since Opteron will generate 2 separate 32 bit HT
+	 * transactions for a 64 bit read, and we want to avoid unnecessary
+	 * HT transactions.
+	 */
+
+	/* This is the 64 bit group */
+
+	/*
+	 * shadow of pioavail, check to be sure it's large enough at
+	 * init time.
+	 */
+	unsigned long ipath_pioavailshadow[8];
+	/* bitmap of send buffers available for the kernel to use with PIO. */
+	unsigned long ipath_pioavailkernel[8];
+	/* shadow of kr_gpio_out, for rmw ops */
+	u64 ipath_gpio_out;
+	/* shadow the gpio mask register */
+	u64 ipath_gpio_mask;
+	/* shadow the gpio output enable, etc... */
+	u64 ipath_extctrl;
+	/* kr_revision shadow */
+	u64 ipath_revision;
+	/*
+	 * shadow of ibcctrl, for interrupt handling of link changes,
+	 * etc.
+	 */
+	u64 ipath_ibcctrl;
+	/*
+	 * last ibcstatus, to suppress "duplicate" status change messages,
+	 * mostly from 2 to 3
+	 */
+	u64 ipath_lastibcstat;
+	/* hwerrmask shadow */
+	ipath_err_t ipath_hwerrmask;
+	ipath_err_t ipath_errormask; /* errormask shadow */
+	/* interrupt config reg shadow */
+	u64 ipath_intconfig;
+	/* kr_sendpiobufbase value */
+	u64 ipath_piobufbase;
+	/* kr_ibcddrctrl shadow */
+	u64 ipath_ibcddrctrl;
+
+	/* these are the "32 bit" regs */
+
+	/*
+	 * number of GUIDs in the flash for this interface; may need some
+	 * rethinking for setting on other ifaces
+	 */
+	u32 ipath_nguid;
+	/*
+	 * the following two are 32-bit bitmasks, but {test,clear,set}_bit
+	 * all expect bit fields to be "unsigned long"
+	 */
+	/* shadow kr_rcvctrl */
+	unsigned long ipath_rcvctrl;
+	/* shadow kr_sendctrl */
+	unsigned long ipath_sendctrl;
+	/* to not count armlaunch after cancel */
+	unsigned long ipath_lastcancel;
+	/* count cases where special trigger was needed (double write) */
+	unsigned long ipath_spectriggerhit;
+
+	/* value we put in kr_rcvhdrcnt */
+	u32 ipath_rcvhdrcnt;
+	/* value we put in kr_rcvhdrsize */
+	u32 ipath_rcvhdrsize;
+	/* value we put in kr_rcvhdrentsize */
+	u32 ipath_rcvhdrentsize;
+	/* offset of last entry in rcvhdrq */
+	u32 ipath_hdrqlast;
+	/* kr_portcnt value */
+	u32 ipath_portcnt;
+	/* kr_pagealign value */
+	u32 ipath_palign;
+	/* number of "2KB" PIO buffers */
+	u32 ipath_piobcnt2k;
+	/* size in bytes of "2KB" PIO buffers */
+	u32 ipath_piosize2k;
+	/* number of "4KB" PIO buffers */
+	u32 ipath_piobcnt4k;
+	/* size in bytes of "4KB" PIO buffers */
+	u32 ipath_piosize4k;
+	u32 ipath_pioreserved; /* reserved special-inkernel; */
+	/* kr_rcvegrbase value */
+	u32 ipath_rcvegrbase;
+	/* kr_rcvegrcnt value */
+	u32 ipath_rcvegrcnt;
+	/* kr_rcvtidbase value */
+	u32 ipath_rcvtidbase;
+	/* kr_rcvtidcnt value */
+	u32 ipath_rcvtidcnt;
+	/* kr_sendregbase */
+	u32 ipath_sregbase;
+	/* kr_userregbase */
+	u32 ipath_uregbase;
+	/* kr_counterregbase */
+	u32 ipath_cregbase;
+	/* shadow the control register contents */
+	u32 ipath_control;
+	/* PCI revision register (HTC rev on FPGA) */
+	u32 ipath_pcirev;
+
+	/* chip address space used by 4k pio buffers */
+	u32 ipath_4kalign;
+	/* The MTU programmed for this unit */
+	u32 ipath_ibmtu;
+	/*
+	 * The max size IB packet, included IB headers that we can send.
+	 * Starts same as ipath_piosize, but is affected when ibmtu is
+	 * changed, or by size of eager buffers
+	 */
+	u32 ipath_ibmaxlen;
+	/*
+	 * ibmaxlen at init time, limited by chip and by receive buffer
+	 * size.  Not changed after init.
+	 */
+	u32 ipath_init_ibmaxlen;
+	/* size of each rcvegrbuffer */
+	u32 ipath_rcvegrbufsize;
+	/* localbus width (1, 2,4,8,16,32) from config space  */
+	u32 ipath_lbus_width;
+	/* localbus speed (HT: 200,400,800,1000; PCIe 2500) */
+	u32 ipath_lbus_speed;
+	/*
+	 * number of sequential ibcstatus change for polling active/quiet
+	 * (i.e., link not coming up).
+	 */
+	u32 ipath_ibpollcnt;
+	/* low and high portions of MSI capability/vector */
+	u32 ipath_msi_lo;
+	/* saved after PCIe init for restore after reset */
+	u32 ipath_msi_hi;
+	/* MSI data (vector) saved for restore */
+	u16 ipath_msi_data;
+	/* MLID programmed for this instance */
+	u16 ipath_mlid;
+	/* LID programmed for this instance */
+	u16 ipath_lid;
+	/* list of pkeys programmed; 0 if not set */
+	u16 ipath_pkeys[4];
+	/*
+	 * ASCII serial number, from flash, large enough for original
+	 * all digit strings, and longer QLogic serial number format
+	 */
+	u8 ipath_serial[16];
+	/* human readable board version */
+	u8 ipath_boardversion[96];
+	u8 ipath_lbus_info[32]; /* human readable localbus info */
+	/* chip major rev, from ipath_revision */
+	u8 ipath_majrev;
+	/* chip minor rev, from ipath_revision */
+	u8 ipath_minrev;
+	/* board rev, from ipath_revision */
+	u8 ipath_boardrev;
+	/* saved for restore after reset */
+	u8 ipath_pci_cacheline;
+	/* LID mask control */
+	u8 ipath_lmc;
+	/* link width supported */
+	u8 ipath_link_width_supported;
+	/* link speed supported */
+	u8 ipath_link_speed_supported;
+	u8 ipath_link_width_enabled;
+	u8 ipath_link_speed_enabled;
+	u8 ipath_link_width_active;
+	u8 ipath_link_speed_active;
+	/* Rx Polarity inversion (compensate for ~tx on partner) */
+	u8 ipath_rx_pol_inv;
+
+	u8 ipath_r_portenable_shift;
+	u8 ipath_r_intravail_shift;
+	u8 ipath_r_tailupd_shift;
+	u8 ipath_r_portcfg_shift;
+
+	/* unit # of this chip, if present */
+	int ipath_unit;
+
+	/* local link integrity counter */
+	u32 ipath_lli_counter;
+	/* local link integrity errors */
+	u32 ipath_lli_errors;
+	/*
+	 * Above counts only cases where _successive_ LocalLinkIntegrity
+	 * errors were seen in the receive headers of kern-packets.
+	 * Below are the three (monotonically increasing) counters
+	 * maintained via GPIO interrupts on iba6120-rev2.
+	 */
+	u32 ipath_rxfc_unsupvl_errs;
+	u32 ipath_overrun_thresh_errs;
+	u32 ipath_lli_errs;
+
+	/*
+	 * Not all devices managed by a driver instance are the same
+	 * type, so these fields must be per-device.
+	 */
+	u64 ipath_i_bitsextant;
+	ipath_err_t ipath_e_bitsextant;
+	ipath_err_t ipath_hwe_bitsextant;
+
+	/*
+	 * Below should be computable from number of ports,
+	 * since they are never modified.
+	 */
+	u64 ipath_i_rcvavail_mask;
+	u64 ipath_i_rcvurg_mask;
+	u16 ipath_i_rcvurg_shift;
+	u16 ipath_i_rcvavail_shift;
+
+	/*
+	 * Register bits for selecting i2c direction and values, used for
+	 * I2C serial flash.
+	 */
+	u8 ipath_gpio_sda_num;
+	u8 ipath_gpio_scl_num;
+	u8 ipath_i2c_chain_type;
+	u64 ipath_gpio_sda;
+	u64 ipath_gpio_scl;
+
+	/* lock for doing RMW of shadows/regs for ExtCtrl and GPIO */
+	spinlock_t ipath_gpio_lock;
+
+	/*
+	 * IB link and linktraining states and masks that vary per chip in
+	 * some way.  Set at init, to avoid each IB status change interrupt
+	 */
+	u8 ibcs_ls_shift;
+	u8 ibcs_lts_mask;
+	u32 ibcs_mask;
+	u32 ib_init;
+	u32 ib_arm;
+	u32 ib_active;
+
+	u16 ipath_rhf_offset; /* offset of RHF within receive header entry */
+
+	/*
+	 * shift/mask for linkcmd, linkinitcmd, maxpktlen in ibccontol
+	 * reg. Changes for IBA7220
+	 */
+	u8 ibcc_lic_mask; /* LinkInitCmd */
+	u8 ibcc_lc_shift; /* LinkCmd */
+	u8 ibcc_mpl_shift; /* Maxpktlen */
+
+	u8 delay_mult;
+
+	/* used to override LED behavior */
+	u8 ipath_led_override;  /* Substituted for normal value, if non-zero */
+	u16 ipath_led_override_timeoff; /* delta to next timer event */
+	u8 ipath_led_override_vals[2]; /* Alternates per blink-frame */
+	u8 ipath_led_override_phase; /* Just counts, LSB picks from vals[] */
+	atomic_t ipath_led_override_timer_active;
+	/* Used to flash LEDs in override mode */
+	struct timer_list ipath_led_override_timer;
+
+	/* Support (including locks) for EEPROM logging of errors and time */
+	/* control access to actual counters, timer */
+	spinlock_t ipath_eep_st_lock;
+	/* control high-level access to EEPROM */
+	struct mutex ipath_eep_lock;
+	/* Below inc'd by ipath_snap_cntrs(), locked by ipath_eep_st_lock */
+	uint64_t ipath_traffic_wds;
+	/* active time is kept in seconds, but logged in hours */
+	atomic_t ipath_active_time;
+	/* Below are nominal shadow of EEPROM, new since last EEPROM update */
+	uint8_t ipath_eep_st_errs[IPATH_EEP_LOG_CNT];
+	uint8_t ipath_eep_st_new_errs[IPATH_EEP_LOG_CNT];
+	uint16_t ipath_eep_hrs;
+	/*
+	 * masks for which bits of errs, hwerrs that cause
+	 * each of the counters to increment.
+	 */
+	struct ipath_eep_log_mask ipath_eep_st_masks[IPATH_EEP_LOG_CNT];
+
+	/* interrupt mitigation reload register info */
+	u16 ipath_jint_idle_ticks;	/* idle clock ticks */
+	u16 ipath_jint_max_packets;	/* max packets across all ports */
+
+	/*
+	 * lock for access to SerDes, and flags to sequence preset
+	 * versus steady-state. 7220-only at the moment.
+	 */
+	spinlock_t ipath_sdepb_lock;
+	u8 ipath_presets_needed; /* Set if presets to be restored next DOWN */
+};
+
+/* ipath_hol_state values (stopping/starting user proc, send flushing) */
+#define IPATH_HOL_UP       0
+#define IPATH_HOL_DOWN     1
+/* ipath_hol_next toggle values, used when hol_state IPATH_HOL_DOWN */
+#define IPATH_HOL_DOWNSTOP 0
+#define IPATH_HOL_DOWNCONT 1
+
+/* bit positions for sdma_status */
+#define IPATH_SDMA_ABORTING  0
+#define IPATH_SDMA_DISARMED  1
+#define IPATH_SDMA_DISABLED  2
+#define IPATH_SDMA_LAYERBUF  3
+#define IPATH_SDMA_RUNNING  30
+#define IPATH_SDMA_SHUTDOWN 31
+
+/* bit combinations that correspond to abort states */
+#define IPATH_SDMA_ABORT_NONE 0
+#define IPATH_SDMA_ABORT_ABORTING (1UL << IPATH_SDMA_ABORTING)
+#define IPATH_SDMA_ABORT_DISARMED ((1UL << IPATH_SDMA_ABORTING) | \
+	(1UL << IPATH_SDMA_DISARMED))
+#define IPATH_SDMA_ABORT_DISABLED ((1UL << IPATH_SDMA_ABORTING) | \
+	(1UL << IPATH_SDMA_DISABLED))
+#define IPATH_SDMA_ABORT_ABORTED ((1UL << IPATH_SDMA_ABORTING) | \
+	(1UL << IPATH_SDMA_DISARMED) | (1UL << IPATH_SDMA_DISABLED))
+#define IPATH_SDMA_ABORT_MASK ((1UL<<IPATH_SDMA_ABORTING) | \
+	(1UL << IPATH_SDMA_DISARMED) | (1UL << IPATH_SDMA_DISABLED))
+
+#define IPATH_SDMA_BUF_NONE 0
+#define IPATH_SDMA_BUF_MASK (1UL<<IPATH_SDMA_LAYERBUF)
+
+/* Private data for file operations */
+struct ipath_filedata {
+	struct ipath_portdata *pd;
+	unsigned subport;
+	unsigned tidcursor;
+	struct ipath_user_sdma_queue *pq;
+};
+extern struct list_head ipath_dev_list;
+extern spinlock_t ipath_devs_lock;
+extern struct ipath_devdata *ipath_lookup(int unit);
+
+int ipath_init_chip(struct ipath_devdata *, int);
+int ipath_enable_wc(struct ipath_devdata *dd);
+void ipath_disable_wc(struct ipath_devdata *dd);
+int ipath_count_units(int *npresentp, int *nupp, int *maxportsp);
+void ipath_shutdown_device(struct ipath_devdata *);
+void ipath_clear_freeze(struct ipath_devdata *);
+
+struct file_operations;
+int ipath_cdev_init(int minor, char *name, const struct file_operations *fops,
+		    struct cdev **cdevp, struct device **devp);
+void ipath_cdev_cleanup(struct cdev **cdevp,
+			struct device **devp);
+
+int ipath_diag_add(struct ipath_devdata *);
+void ipath_diag_remove(struct ipath_devdata *);
+
+extern wait_queue_head_t ipath_state_wait;
+
+int ipath_user_add(struct ipath_devdata *dd);
+void ipath_user_remove(struct ipath_devdata *dd);
+
+struct sk_buff *ipath_alloc_skb(struct ipath_devdata *dd, gfp_t);
+
+extern int ipath_diag_inuse;
+
+irqreturn_t ipath_intr(int irq, void *devid);
+int ipath_decode_err(struct ipath_devdata *dd, char *buf, size_t blen,
+		     ipath_err_t err);
+#if __IPATH_INFO || __IPATH_DBG
+extern const char *ipath_ibcstatus_str[];
+#endif
+
+/* clean up any per-chip chip-specific stuff */
+void ipath_chip_cleanup(struct ipath_devdata *);
+/* clean up any chip type-specific stuff */
+void ipath_chip_done(void);
+
+/* check to see if we have to force ordering for write combining */
+int ipath_unordered_wc(void);
+
+void ipath_disarm_piobufs(struct ipath_devdata *, unsigned first,
+			  unsigned cnt);
+void ipath_cancel_sends(struct ipath_devdata *, int);
+
+int ipath_create_rcvhdrq(struct ipath_devdata *, struct ipath_portdata *);
+void ipath_free_pddata(struct ipath_devdata *, struct ipath_portdata *);
+
+int ipath_parse_ushort(const char *str, unsigned short *valp);
+
+void ipath_kreceive(struct ipath_portdata *);
+int ipath_setrcvhdrsize(struct ipath_devdata *, unsigned);
+int ipath_reset_device(int);
+void ipath_get_faststats(unsigned long);
+int ipath_wait_linkstate(struct ipath_devdata *, u32, int);
+int ipath_set_linkstate(struct ipath_devdata *, u8);
+int ipath_set_mtu(struct ipath_devdata *, u16);
+int ipath_set_lid(struct ipath_devdata *, u32, u8);
+int ipath_set_rx_pol_inv(struct ipath_devdata *dd, u8 new_pol_inv);
+void ipath_enable_armlaunch(struct ipath_devdata *);
+void ipath_disable_armlaunch(struct ipath_devdata *);
+void ipath_hol_down(struct ipath_devdata *);
+void ipath_hol_up(struct ipath_devdata *);
+void ipath_hol_event(unsigned long);
+void ipath_toggle_rclkrls(struct ipath_devdata *);
+void ipath_sd7220_clr_ibpar(struct ipath_devdata *);
+void ipath_set_relock_poll(struct ipath_devdata *, int);
+void ipath_shutdown_relock_poll(struct ipath_devdata *);
+
+/* for use in system calls, where we want to know device type, etc. */
+#define port_fp(fp) ((struct ipath_filedata *)(fp)->private_data)->pd
+#define subport_fp(fp) \
+	((struct ipath_filedata *)(fp)->private_data)->subport
+#define tidcursor_fp(fp) \
+	((struct ipath_filedata *)(fp)->private_data)->tidcursor
+#define user_sdma_queue_fp(fp) \
+	((struct ipath_filedata *)(fp)->private_data)->pq
+
+/*
+ * values for ipath_flags
+ */
+		/* chip can report link latency (IB 1.2) */
+#define IPATH_HAS_LINK_LATENCY 0x1
+		/* The chip is up and initted */
+#define IPATH_INITTED       0x2
+		/* set if any user code has set kr_rcvhdrsize */
+#define IPATH_RCVHDRSZ_SET  0x4
+		/* The chip is present and valid for accesses */
+#define IPATH_PRESENT       0x8
+		/* HT link0 is only 8 bits wide, ignore upper byte crc
+		 * errors, etc. */
+#define IPATH_8BIT_IN_HT0   0x10
+		/* HT link1 is only 8 bits wide, ignore upper byte crc
+		 * errors, etc. */
+#define IPATH_8BIT_IN_HT1   0x20
+		/* The link is down */
+#define IPATH_LINKDOWN      0x40
+		/* The link level is up (0x11) */
+#define IPATH_LINKINIT      0x80
+		/* The link is in the armed (0x21) state */
+#define IPATH_LINKARMED     0x100
+		/* The link is in the active (0x31) state */
+#define IPATH_LINKACTIVE    0x200
+		/* link current state is unknown */
+#define IPATH_LINKUNK       0x400
+		/* Write combining flush needed for PIO */
+#define IPATH_PIO_FLUSH_WC  0x1000
+		/* DMA Receive tail pointer */
+#define IPATH_NODMA_RTAIL   0x2000
+		/* no IB cable, or no device on IB cable */
+#define IPATH_NOCABLE       0x4000
+		/* Supports port zero per packet receive interrupts via
+		 * GPIO */
+#define IPATH_GPIO_INTR     0x8000
+		/* uses the coded 4byte TID, not 8 byte */
+#define IPATH_4BYTE_TID     0x10000
+		/* packet/word counters are 32 bit, else those 4 counters
+		 * are 64bit */
+#define IPATH_32BITCOUNTERS 0x20000
+		/* Interrupt register is 64 bits */
+#define IPATH_INTREG_64     0x40000
+		/* can miss port0 rx interrupts */
+#define IPATH_DISABLED      0x80000 /* administratively disabled */
+		/* Use GPIO interrupts for new counters */
+#define IPATH_GPIO_ERRINTRS 0x100000
+#define IPATH_SWAP_PIOBUFS  0x200000
+		/* Supports Send DMA */
+#define IPATH_HAS_SEND_DMA  0x400000
+		/* Supports Send Count (not just word count) in PBC */
+#define IPATH_HAS_PBC_CNT   0x800000
+		/* Suppress heartbeat, even if turning off loopback */
+#define IPATH_NO_HRTBT      0x1000000
+#define IPATH_HAS_THRESH_UPDATE 0x4000000
+#define IPATH_HAS_MULT_IB_SPEED 0x8000000
+#define IPATH_IB_AUTONEG_INPROG 0x10000000
+#define IPATH_IB_AUTONEG_FAILED 0x20000000
+		/* Linkdown-disable intentionally, Do not attempt to bring up */
+#define IPATH_IB_LINK_DISABLED 0x40000000
+#define IPATH_IB_FORCE_NOTIFY 0x80000000 /* force notify on next ib change */
+
+/* Bits in GPIO for the added interrupts */
+#define IPATH_GPIO_PORT0_BIT 2
+#define IPATH_GPIO_RXUVL_BIT 3
+#define IPATH_GPIO_OVRUN_BIT 4
+#define IPATH_GPIO_LLI_BIT 5
+#define IPATH_GPIO_ERRINTR_MASK 0x38
+
+/* portdata flag bit offsets */
+		/* waiting for a packet to arrive */
+#define IPATH_PORT_WAITING_RCV   2
+		/* master has not finished initializing */
+#define IPATH_PORT_MASTER_UNINIT 4
+		/* waiting for an urgent packet to arrive */
+#define IPATH_PORT_WAITING_URG 5
+
+/* free up any allocated data at closes */
+void ipath_free_data(struct ipath_portdata *dd);
+u32 __iomem *ipath_getpiobuf(struct ipath_devdata *, u32, u32 *);
+void ipath_chg_pioavailkernel(struct ipath_devdata *dd, unsigned start,
+				unsigned len, int avail);
+void ipath_init_iba6110_funcs(struct ipath_devdata *);
+void ipath_get_eeprom_info(struct ipath_devdata *);
+int ipath_update_eeprom_log(struct ipath_devdata *dd);
+void ipath_inc_eeprom_err(struct ipath_devdata *dd, u32 eidx, u32 incr);
+u64 ipath_snap_cntr(struct ipath_devdata *, ipath_creg);
+void ipath_disarm_senderrbufs(struct ipath_devdata *);
+void ipath_force_pio_avail_update(struct ipath_devdata *);
+void signal_ib_event(struct ipath_devdata *dd, enum ib_event_type ev);
+
+/*
+ * Set LED override, only the two LSBs have "public" meaning, but
+ * any non-zero value substitutes them for the Link and LinkTrain
+ * LED states.
+ */
+#define IPATH_LED_PHYS 1 /* Physical (linktraining) GREEN LED */
+#define IPATH_LED_LOG 2  /* Logical (link) YELLOW LED */
+void ipath_set_led_override(struct ipath_devdata *dd, unsigned int val);
+
+/* send dma routines */
+int setup_sdma(struct ipath_devdata *);
+void teardown_sdma(struct ipath_devdata *);
+void ipath_restart_sdma(struct ipath_devdata *);
+void ipath_sdma_intr(struct ipath_devdata *);
+int ipath_sdma_verbs_send(struct ipath_devdata *, struct ipath_sge_state *,
+			  u32, struct ipath_verbs_txreq *);
+/* ipath_sdma_lock should be locked before calling this. */
+int ipath_sdma_make_progress(struct ipath_devdata *dd);
+
+/* must be called under ipath_sdma_lock */
+static inline u16 ipath_sdma_descq_freecnt(const struct ipath_devdata *dd)
+{
+	return dd->ipath_sdma_descq_cnt -
+		(dd->ipath_sdma_descq_added - dd->ipath_sdma_descq_removed) -
+		1 - dd->ipath_sdma_desc_nreserved;
+}
+
+static inline void ipath_sdma_desc_reserve(struct ipath_devdata *dd, u16 cnt)
+{
+	dd->ipath_sdma_desc_nreserved += cnt;
+}
+
+static inline void ipath_sdma_desc_unreserve(struct ipath_devdata *dd, u16 cnt)
+{
+	dd->ipath_sdma_desc_nreserved -= cnt;
+}
+
+/*
+ * number of words used for protocol header if not set by ipath_userinit();
+ */
+#define IPATH_DFLT_RCVHDRSIZE 9
+
+int ipath_get_user_pages(unsigned long, size_t, struct page **);
+void ipath_release_user_pages(struct page **, size_t);
+void ipath_release_user_pages_on_close(struct page **, size_t);
+int ipath_eeprom_read(struct ipath_devdata *, u8, void *, int);
+int ipath_eeprom_write(struct ipath_devdata *, u8, const void *, int);
+int ipath_tempsense_read(struct ipath_devdata *, u8 regnum);
+int ipath_tempsense_write(struct ipath_devdata *, u8 regnum, u8 data);
+
+/* these are used for the registers that vary with port */
+void ipath_write_kreg_port(const struct ipath_devdata *, ipath_kreg,
+			   unsigned, u64);
+
+/*
+ * We could have a single register get/put routine, that takes a group type,
+ * but this is somewhat clearer and cleaner.  It also gives us some error
+ * checking.  64 bit register reads should always work, but are inefficient
+ * on opteron (the northbridge always generates 2 separate HT 32 bit reads),
+ * so we use kreg32 wherever possible.  User register and counter register
+ * reads are always 32 bit reads, so only one form of those routines.
+ */
+
+/*
+ * At the moment, none of the s-registers are writable, so no
+ * ipath_write_sreg().
+ */
+
+/**
+ * ipath_read_ureg32 - read 32-bit virtualized per-port register
+ * @dd: device
+ * @regno: register number
+ * @port: port number
+ *
+ * Return the contents of a register that is virtualized to be per port.
+ * Returns -1 on errors (not distinguishable from valid contents at
+ * runtime; we may add a separate error variable at some point).
+ */
+static inline u32 ipath_read_ureg32(const struct ipath_devdata *dd,
+				    ipath_ureg regno, int port)
+{
+	if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT))
+		return 0;
+
+	return readl(regno + (u64 __iomem *)
+		     (dd->ipath_uregbase +
+		      (char __iomem *)dd->ipath_kregbase +
+		      dd->ipath_ureg_align * port));
+}
+
+/**
+ * ipath_write_ureg - write 32-bit virtualized per-port register
+ * @dd: device
+ * @regno: register number
+ * @value: value
+ * @port: port
+ *
+ * Write the contents of a register that is virtualized to be per port.
+ */
+static inline void ipath_write_ureg(const struct ipath_devdata *dd,
+				    ipath_ureg regno, u64 value, int port)
+{
+	u64 __iomem *ubase = (u64 __iomem *)
+		(dd->ipath_uregbase + (char __iomem *) dd->ipath_kregbase +
+		 dd->ipath_ureg_align * port);
+	if (dd->ipath_kregbase)
+		writeq(value, &ubase[regno]);
+}
+
+static inline u32 ipath_read_kreg32(const struct ipath_devdata *dd,
+				    ipath_kreg regno)
+{
+	if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT))
+		return -1;
+	return readl((u32 __iomem *) & dd->ipath_kregbase[regno]);
+}
+
+static inline u64 ipath_read_kreg64(const struct ipath_devdata *dd,
+				    ipath_kreg regno)
+{
+	if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT))
+		return -1;
+
+	return readq(&dd->ipath_kregbase[regno]);
+}
+
+static inline void ipath_write_kreg(const struct ipath_devdata *dd,
+				    ipath_kreg regno, u64 value)
+{
+	if (dd->ipath_kregbase)
+		writeq(value, &dd->ipath_kregbase[regno]);
+}
+
+static inline u64 ipath_read_creg(const struct ipath_devdata *dd,
+				  ipath_sreg regno)
+{
+	if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT))
+		return 0;
+
+	return readq(regno + (u64 __iomem *)
+		     (dd->ipath_cregbase +
+		      (char __iomem *)dd->ipath_kregbase));
+}
+
+static inline u32 ipath_read_creg32(const struct ipath_devdata *dd,
+					 ipath_sreg regno)
+{
+	if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT))
+		return 0;
+	return readl(regno + (u64 __iomem *)
+		     (dd->ipath_cregbase +
+		      (char __iomem *)dd->ipath_kregbase));
+}
+
+static inline void ipath_write_creg(const struct ipath_devdata *dd,
+				    ipath_creg regno, u64 value)
+{
+	if (dd->ipath_kregbase)
+		writeq(value, regno + (u64 __iomem *)
+		       (dd->ipath_cregbase +
+			(char __iomem *)dd->ipath_kregbase));
+}
+
+static inline void ipath_clear_rcvhdrtail(const struct ipath_portdata *pd)
+{
+	*((u64 *) pd->port_rcvhdrtail_kvaddr) = 0ULL;
+}
+
+static inline u32 ipath_get_rcvhdrtail(const struct ipath_portdata *pd)
+{
+	return (u32) le64_to_cpu(*((volatile __le64 *)
+				pd->port_rcvhdrtail_kvaddr));
+}
+
+static inline u32 ipath_get_hdrqtail(const struct ipath_portdata *pd)
+{
+	const struct ipath_devdata *dd = pd->port_dd;
+	u32 hdrqtail;
+
+	if (dd->ipath_flags & IPATH_NODMA_RTAIL) {
+		__le32 *rhf_addr;
+		u32 seq;
+
+		rhf_addr = (__le32 *) pd->port_rcvhdrq +
+			pd->port_head + dd->ipath_rhf_offset;
+		seq = ipath_hdrget_seq(rhf_addr);
+		hdrqtail = pd->port_head;
+		if (seq == pd->port_seq_cnt)
+			hdrqtail++;
+	} else
+		hdrqtail = ipath_get_rcvhdrtail(pd);
+
+	return hdrqtail;
+}
+
+static inline u64 ipath_read_ireg(const struct ipath_devdata *dd, ipath_kreg r)
+{
+	return (dd->ipath_flags & IPATH_INTREG_64) ?
+		ipath_read_kreg64(dd, r) : ipath_read_kreg32(dd, r);
+}
+
+/*
+ * from contents of IBCStatus (or a saved copy), return linkstate
+ * Report ACTIVE_DEFER as ACTIVE, because we treat them the same
+ * everywhere, anyway (and should be, for almost all purposes).
+ */
+static inline u32 ipath_ib_linkstate(struct ipath_devdata *dd, u64 ibcs)
+{
+	u32 state = (u32)(ibcs >> dd->ibcs_ls_shift) &
+		INFINIPATH_IBCS_LINKSTATE_MASK;
+	if (state == INFINIPATH_IBCS_L_STATE_ACT_DEFER)
+		state = INFINIPATH_IBCS_L_STATE_ACTIVE;
+	return state;
+}
+
+/* from contents of IBCStatus (or a saved copy), return linktrainingstate */
+static inline u32 ipath_ib_linktrstate(struct ipath_devdata *dd, u64 ibcs)
+{
+	return (u32)(ibcs >> INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) &
+		dd->ibcs_lts_mask;
+}
+
+/*
+ * from contents of IBCStatus (or a saved copy), return logical link state
+ * combination of link state and linktraining state (down, active, init,
+ * arm, etc.
+ */
+static inline u32 ipath_ib_state(struct ipath_devdata *dd, u64 ibcs)
+{
+	u32 ibs;
+	ibs = (u32)(ibcs >> INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) &
+		dd->ibcs_lts_mask;
+	ibs |= (u32)(ibcs &
+		(INFINIPATH_IBCS_LINKSTATE_MASK << dd->ibcs_ls_shift));
+	return ibs;
+}
+
+/*
+ * sysfs interface.
+ */
+
+struct device_driver;
+
+extern const char ib_ipath_version[];
+
+extern const struct attribute_group *ipath_driver_attr_groups[];
+
+int ipath_device_create_group(struct device *, struct ipath_devdata *);
+void ipath_device_remove_group(struct device *, struct ipath_devdata *);
+int ipath_expose_reset(struct device *);
+
+int ipath_init_ipathfs(void);
+void ipath_exit_ipathfs(void);
+int ipathfs_add_device(struct ipath_devdata *);
+int ipathfs_remove_device(struct ipath_devdata *);
+
+/*
+ * dma_addr wrappers - all 0's invalid for hw
+ */
+dma_addr_t ipath_map_page(struct pci_dev *, struct page *, unsigned long,
+			  size_t, int);
+dma_addr_t ipath_map_single(struct pci_dev *, void *, size_t, int);
+const char *ipath_get_unit_name(int unit);
+
+/*
+ * Flush write combining store buffers (if present) and perform a write
+ * barrier.
+ */
+#if defined(CONFIG_X86_64)
+#define ipath_flush_wc() asm volatile("sfence" ::: "memory")
+#else
+#define ipath_flush_wc() wmb()
+#endif
+
+extern unsigned ipath_debug; /* debugging bit mask */
+extern unsigned ipath_linkrecovery;
+extern unsigned ipath_mtu4096;
+extern struct mutex ipath_mutex;
+
+#define IPATH_DRV_NAME		"ib_ipath"
+#define IPATH_MAJOR		233
+#define IPATH_USER_MINOR_BASE	0
+#define IPATH_DIAGPKT_MINOR	127
+#define IPATH_DIAG_MINOR_BASE	129
+#define IPATH_NMINORS		255
+
+#define ipath_dev_err(dd,fmt,...) \
+	do { \
+		const struct ipath_devdata *__dd = (dd); \
+		if (__dd->pcidev) \
+			dev_err(&__dd->pcidev->dev, "%s: " fmt, \
+				ipath_get_unit_name(__dd->ipath_unit), \
+				##__VA_ARGS__); \
+		else \
+			printk(KERN_ERR IPATH_DRV_NAME ": %s: " fmt, \
+			       ipath_get_unit_name(__dd->ipath_unit), \
+			       ##__VA_ARGS__); \
+	} while (0)
+
+#if _IPATH_DEBUGGING
+
+# define __IPATH_DBG_WHICH(which,fmt,...) \
+	do { \
+		if (unlikely(ipath_debug & (which))) \
+			printk(KERN_DEBUG IPATH_DRV_NAME ": %s: " fmt, \
+			       __func__,##__VA_ARGS__); \
+	} while(0)
+
+# define ipath_dbg(fmt,...) \
+	__IPATH_DBG_WHICH(__IPATH_DBG,fmt,##__VA_ARGS__)
+# define ipath_cdbg(which,fmt,...) \
+	__IPATH_DBG_WHICH(__IPATH_##which##DBG,fmt,##__VA_ARGS__)
+
+#else /* ! _IPATH_DEBUGGING */
+
+# define ipath_dbg(fmt,...)
+# define ipath_cdbg(which,fmt,...)
+
+#endif /* _IPATH_DEBUGGING */
+
+/*
+ * this is used for formatting hw error messages...
+ */
+struct ipath_hwerror_msgs {
+	u64 mask;
+	const char *msg;
+};
+
+#define INFINIPATH_HWE_MSG(a, b) { .mask = INFINIPATH_HWE_##a, .msg = b }
+
+/* in ipath_intr.c... */
+void ipath_format_hwerrors(u64 hwerrs,
+			   const struct ipath_hwerror_msgs *hwerrmsgs,
+			   size_t nhwerrmsgs,
+			   char *msg, size_t lmsg);
+
+#endif				/* _IPATH_KERNEL_H */
diff --git a/drivers/infiniband/hw/ipath/ipath_keys.c b/drivers/infiniband/hw/ipath/ipath_keys.c
new file mode 100644
index 0000000..c0e933f
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_keys.c
@@ -0,0 +1,270 @@
+/*
+ * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <asm/io.h>
+
+#include "ipath_verbs.h"
+#include "ipath_kernel.h"
+
+/**
+ * ipath_alloc_lkey - allocate an lkey
+ * @rkt: lkey table in which to allocate the lkey
+ * @mr: memory region that this lkey protects
+ *
+ * Returns 1 if successful, otherwise returns 0.
+ */
+
+int ipath_alloc_lkey(struct ipath_lkey_table *rkt, struct ipath_mregion *mr)
+{
+	unsigned long flags;
+	u32 r;
+	u32 n;
+	int ret;
+
+	spin_lock_irqsave(&rkt->lock, flags);
+
+	/* Find the next available LKEY */
+	r = n = rkt->next;
+	for (;;) {
+		if (rkt->table[r] == NULL)
+			break;
+		r = (r + 1) & (rkt->max - 1);
+		if (r == n) {
+			spin_unlock_irqrestore(&rkt->lock, flags);
+			ipath_dbg("LKEY table full\n");
+			ret = 0;
+			goto bail;
+		}
+	}
+	rkt->next = (r + 1) & (rkt->max - 1);
+	/*
+	 * Make sure lkey is never zero which is reserved to indicate an
+	 * unrestricted LKEY.
+	 */
+	rkt->gen++;
+	mr->lkey = (r << (32 - ib_ipath_lkey_table_size)) |
+		((((1 << (24 - ib_ipath_lkey_table_size)) - 1) & rkt->gen)
+		 << 8);
+	if (mr->lkey == 0) {
+		mr->lkey |= 1 << 8;
+		rkt->gen++;
+	}
+	rkt->table[r] = mr;
+	spin_unlock_irqrestore(&rkt->lock, flags);
+
+	ret = 1;
+
+bail:
+	return ret;
+}
+
+/**
+ * ipath_free_lkey - free an lkey
+ * @rkt: table from which to free the lkey
+ * @lkey: lkey id to free
+ */
+void ipath_free_lkey(struct ipath_lkey_table *rkt, u32 lkey)
+{
+	unsigned long flags;
+	u32 r;
+
+	if (lkey == 0)
+		return;
+	r = lkey >> (32 - ib_ipath_lkey_table_size);
+	spin_lock_irqsave(&rkt->lock, flags);
+	rkt->table[r] = NULL;
+	spin_unlock_irqrestore(&rkt->lock, flags);
+}
+
+/**
+ * ipath_lkey_ok - check IB SGE for validity and initialize
+ * @rkt: table containing lkey to check SGE against
+ * @isge: outgoing internal SGE
+ * @sge: SGE to check
+ * @acc: access flags
+ *
+ * Return 1 if valid and successful, otherwise returns 0.
+ *
+ * Check the IB SGE for validity and initialize our internal version
+ * of it.
+ */
+int ipath_lkey_ok(struct ipath_qp *qp, struct ipath_sge *isge,
+		  struct ib_sge *sge, int acc)
+{
+	struct ipath_lkey_table *rkt = &to_idev(qp->ibqp.device)->lk_table;
+	struct ipath_mregion *mr;
+	unsigned n, m;
+	size_t off;
+	int ret;
+
+	/*
+	 * We use LKEY == zero for kernel virtual addresses
+	 * (see ipath_get_dma_mr and ipath_dma.c).
+	 */
+	if (sge->lkey == 0) {
+		/* always a kernel port, no locking needed */
+		struct ipath_pd *pd = to_ipd(qp->ibqp.pd);
+
+		if (pd->user) {
+			ret = 0;
+			goto bail;
+		}
+		isge->mr = NULL;
+		isge->vaddr = (void *) sge->addr;
+		isge->length = sge->length;
+		isge->sge_length = sge->length;
+		ret = 1;
+		goto bail;
+	}
+	mr = rkt->table[(sge->lkey >> (32 - ib_ipath_lkey_table_size))];
+	if (unlikely(mr == NULL || mr->lkey != sge->lkey ||
+		     qp->ibqp.pd != mr->pd)) {
+		ret = 0;
+		goto bail;
+	}
+
+	off = sge->addr - mr->user_base;
+	if (unlikely(sge->addr < mr->user_base ||
+		     off + sge->length > mr->length ||
+		     (mr->access_flags & acc) != acc)) {
+		ret = 0;
+		goto bail;
+	}
+
+	off += mr->offset;
+	m = 0;
+	n = 0;
+	while (off >= mr->map[m]->segs[n].length) {
+		off -= mr->map[m]->segs[n].length;
+		n++;
+		if (n >= IPATH_SEGSZ) {
+			m++;
+			n = 0;
+		}
+	}
+	isge->mr = mr;
+	isge->vaddr = mr->map[m]->segs[n].vaddr + off;
+	isge->length = mr->map[m]->segs[n].length - off;
+	isge->sge_length = sge->length;
+	isge->m = m;
+	isge->n = n;
+
+	ret = 1;
+
+bail:
+	return ret;
+}
+
+/**
+ * ipath_rkey_ok - check the IB virtual address, length, and RKEY
+ * @dev: infiniband device
+ * @ss: SGE state
+ * @len: length of data
+ * @vaddr: virtual address to place data
+ * @rkey: rkey to check
+ * @acc: access flags
+ *
+ * Return 1 if successful, otherwise 0.
+ */
+int ipath_rkey_ok(struct ipath_qp *qp, struct ipath_sge_state *ss,
+		  u32 len, u64 vaddr, u32 rkey, int acc)
+{
+	struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+	struct ipath_lkey_table *rkt = &dev->lk_table;
+	struct ipath_sge *sge = &ss->sge;
+	struct ipath_mregion *mr;
+	unsigned n, m;
+	size_t off;
+	int ret;
+
+	/*
+	 * We use RKEY == zero for kernel virtual addresses
+	 * (see ipath_get_dma_mr and ipath_dma.c).
+	 */
+	if (rkey == 0) {
+		/* always a kernel port, no locking needed */
+		struct ipath_pd *pd = to_ipd(qp->ibqp.pd);
+
+		if (pd->user) {
+			ret = 0;
+			goto bail;
+		}
+		sge->mr = NULL;
+		sge->vaddr = (void *) vaddr;
+		sge->length = len;
+		sge->sge_length = len;
+		ss->sg_list = NULL;
+		ss->num_sge = 1;
+		ret = 1;
+		goto bail;
+	}
+
+	mr = rkt->table[(rkey >> (32 - ib_ipath_lkey_table_size))];
+	if (unlikely(mr == NULL || mr->lkey != rkey ||
+		     qp->ibqp.pd != mr->pd)) {
+		ret = 0;
+		goto bail;
+	}
+
+	off = vaddr - mr->iova;
+	if (unlikely(vaddr < mr->iova || off + len > mr->length ||
+		     (mr->access_flags & acc) == 0)) {
+		ret = 0;
+		goto bail;
+	}
+
+	off += mr->offset;
+	m = 0;
+	n = 0;
+	while (off >= mr->map[m]->segs[n].length) {
+		off -= mr->map[m]->segs[n].length;
+		n++;
+		if (n >= IPATH_SEGSZ) {
+			m++;
+			n = 0;
+		}
+	}
+	sge->mr = mr;
+	sge->vaddr = mr->map[m]->segs[n].vaddr + off;
+	sge->length = mr->map[m]->segs[n].length - off;
+	sge->sge_length = len;
+	sge->m = m;
+	sge->n = n;
+	ss->sg_list = NULL;
+	ss->num_sge = 1;
+
+	ret = 1;
+
+bail:
+	return ret;
+}
diff --git a/drivers/infiniband/hw/ipath/ipath_mad.c b/drivers/infiniband/hw/ipath/ipath_mad.c
new file mode 100644
index 0000000..e890e5b
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_mad.c
@@ -0,0 +1,1513 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <rdma/ib_smi.h>
+#include <rdma/ib_pma.h>
+
+#include "ipath_kernel.h"
+#include "ipath_verbs.h"
+#include "ipath_common.h"
+
+#define IB_SMP_UNSUP_VERSION	cpu_to_be16(0x0004)
+#define IB_SMP_UNSUP_METHOD	cpu_to_be16(0x0008)
+#define IB_SMP_UNSUP_METH_ATTR	cpu_to_be16(0x000C)
+#define IB_SMP_INVALID_FIELD	cpu_to_be16(0x001C)
+
+static int reply(struct ib_smp *smp)
+{
+	/*
+	 * The verbs framework will handle the directed/LID route
+	 * packet changes.
+	 */
+	smp->method = IB_MGMT_METHOD_GET_RESP;
+	if (smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+		smp->status |= IB_SMP_DIRECTION;
+	return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
+}
+
+static int recv_subn_get_nodedescription(struct ib_smp *smp,
+					 struct ib_device *ibdev)
+{
+	if (smp->attr_mod)
+		smp->status |= IB_SMP_INVALID_FIELD;
+
+	memcpy(smp->data, ibdev->node_desc, sizeof(smp->data));
+
+	return reply(smp);
+}
+
+struct nodeinfo {
+	u8 base_version;
+	u8 class_version;
+	u8 node_type;
+	u8 num_ports;
+	__be64 sys_guid;
+	__be64 node_guid;
+	__be64 port_guid;
+	__be16 partition_cap;
+	__be16 device_id;
+	__be32 revision;
+	u8 local_port_num;
+	u8 vendor_id[3];
+} __attribute__ ((packed));
+
+static int recv_subn_get_nodeinfo(struct ib_smp *smp,
+				  struct ib_device *ibdev, u8 port)
+{
+	struct nodeinfo *nip = (struct nodeinfo *)&smp->data;
+	struct ipath_devdata *dd = to_idev(ibdev)->dd;
+	u32 vendor, majrev, minrev;
+
+	/* GUID 0 is illegal */
+	if (smp->attr_mod || (dd->ipath_guid == 0))
+		smp->status |= IB_SMP_INVALID_FIELD;
+
+	nip->base_version = 1;
+	nip->class_version = 1;
+	nip->node_type = 1;	/* channel adapter */
+	/*
+	 * XXX The num_ports value will need a layer function to get
+	 * the value if we ever have more than one IB port on a chip.
+	 * We will also need to get the GUID for the port.
+	 */
+	nip->num_ports = ibdev->phys_port_cnt;
+	/* This is already in network order */
+	nip->sys_guid = to_idev(ibdev)->sys_image_guid;
+	nip->node_guid = dd->ipath_guid;
+	nip->port_guid = dd->ipath_guid;
+	nip->partition_cap = cpu_to_be16(ipath_get_npkeys(dd));
+	nip->device_id = cpu_to_be16(dd->ipath_deviceid);
+	majrev = dd->ipath_majrev;
+	minrev = dd->ipath_minrev;
+	nip->revision = cpu_to_be32((majrev << 16) | minrev);
+	nip->local_port_num = port;
+	vendor = dd->ipath_vendorid;
+	nip->vendor_id[0] = IPATH_SRC_OUI_1;
+	nip->vendor_id[1] = IPATH_SRC_OUI_2;
+	nip->vendor_id[2] = IPATH_SRC_OUI_3;
+
+	return reply(smp);
+}
+
+static int recv_subn_get_guidinfo(struct ib_smp *smp,
+				  struct ib_device *ibdev)
+{
+	u32 startgx = 8 * be32_to_cpu(smp->attr_mod);
+	__be64 *p = (__be64 *) smp->data;
+
+	/* 32 blocks of 8 64-bit GUIDs per block */
+
+	memset(smp->data, 0, sizeof(smp->data));
+
+	/*
+	 * We only support one GUID for now.  If this changes, the
+	 * portinfo.guid_cap field needs to be updated too.
+	 */
+	if (startgx == 0) {
+		__be64 g = to_idev(ibdev)->dd->ipath_guid;
+		if (g == 0)
+			/* GUID 0 is illegal */
+			smp->status |= IB_SMP_INVALID_FIELD;
+		else
+			/* The first is a copy of the read-only HW GUID. */
+			*p = g;
+	} else
+		smp->status |= IB_SMP_INVALID_FIELD;
+
+	return reply(smp);
+}
+
+static void set_link_width_enabled(struct ipath_devdata *dd, u32 w)
+{
+	(void) dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_LWID_ENB, w);
+}
+
+static void set_link_speed_enabled(struct ipath_devdata *dd, u32 s)
+{
+	(void) dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_SPD_ENB, s);
+}
+
+static int get_overrunthreshold(struct ipath_devdata *dd)
+{
+	return (dd->ipath_ibcctrl >>
+		INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT) &
+		INFINIPATH_IBCC_OVERRUNTHRESHOLD_MASK;
+}
+
+/**
+ * set_overrunthreshold - set the overrun threshold
+ * @dd: the infinipath device
+ * @n: the new threshold
+ *
+ * Note that this will only take effect when the link state changes.
+ */
+static int set_overrunthreshold(struct ipath_devdata *dd, unsigned n)
+{
+	unsigned v;
+
+	v = (dd->ipath_ibcctrl >> INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT) &
+		INFINIPATH_IBCC_OVERRUNTHRESHOLD_MASK;
+	if (v != n) {
+		dd->ipath_ibcctrl &=
+			~(INFINIPATH_IBCC_OVERRUNTHRESHOLD_MASK <<
+			  INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT);
+		dd->ipath_ibcctrl |=
+			(u64) n << INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT;
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
+				 dd->ipath_ibcctrl);
+	}
+	return 0;
+}
+
+static int get_phyerrthreshold(struct ipath_devdata *dd)
+{
+	return (dd->ipath_ibcctrl >>
+		INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT) &
+		INFINIPATH_IBCC_PHYERRTHRESHOLD_MASK;
+}
+
+/**
+ * set_phyerrthreshold - set the physical error threshold
+ * @dd: the infinipath device
+ * @n: the new threshold
+ *
+ * Note that this will only take effect when the link state changes.
+ */
+static int set_phyerrthreshold(struct ipath_devdata *dd, unsigned n)
+{
+	unsigned v;
+
+	v = (dd->ipath_ibcctrl >> INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT) &
+		INFINIPATH_IBCC_PHYERRTHRESHOLD_MASK;
+	if (v != n) {
+		dd->ipath_ibcctrl &=
+			~(INFINIPATH_IBCC_PHYERRTHRESHOLD_MASK <<
+			  INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT);
+		dd->ipath_ibcctrl |=
+			(u64) n << INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT;
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
+				 dd->ipath_ibcctrl);
+	}
+	return 0;
+}
+
+/**
+ * get_linkdowndefaultstate - get the default linkdown state
+ * @dd: the infinipath device
+ *
+ * Returns zero if the default is POLL, 1 if the default is SLEEP.
+ */
+static int get_linkdowndefaultstate(struct ipath_devdata *dd)
+{
+	return !!(dd->ipath_ibcctrl & INFINIPATH_IBCC_LINKDOWNDEFAULTSTATE);
+}
+
+static int recv_subn_get_portinfo(struct ib_smp *smp,
+				  struct ib_device *ibdev, u8 port)
+{
+	struct ipath_ibdev *dev;
+	struct ipath_devdata *dd;
+	struct ib_port_info *pip = (struct ib_port_info *)smp->data;
+	u16 lid;
+	u8 ibcstat;
+	u8 mtu;
+	int ret;
+
+	if (be32_to_cpu(smp->attr_mod) > ibdev->phys_port_cnt) {
+		smp->status |= IB_SMP_INVALID_FIELD;
+		ret = reply(smp);
+		goto bail;
+	}
+
+	dev = to_idev(ibdev);
+	dd = dev->dd;
+
+	/* Clear all fields.  Only set the non-zero fields. */
+	memset(smp->data, 0, sizeof(smp->data));
+
+	/* Only return the mkey if the protection field allows it. */
+	if (smp->method == IB_MGMT_METHOD_SET || dev->mkey == smp->mkey ||
+	    dev->mkeyprot == 0)
+		pip->mkey = dev->mkey;
+	pip->gid_prefix = dev->gid_prefix;
+	lid = dd->ipath_lid;
+	pip->lid = lid ? cpu_to_be16(lid) : IB_LID_PERMISSIVE;
+	pip->sm_lid = cpu_to_be16(dev->sm_lid);
+	pip->cap_mask = cpu_to_be32(dev->port_cap_flags);
+	/* pip->diag_code; */
+	pip->mkey_lease_period = cpu_to_be16(dev->mkey_lease_period);
+	pip->local_port_num = port;
+	pip->link_width_enabled = dd->ipath_link_width_enabled;
+	pip->link_width_supported = dd->ipath_link_width_supported;
+	pip->link_width_active = dd->ipath_link_width_active;
+	pip->linkspeed_portstate = dd->ipath_link_speed_supported << 4;
+	ibcstat = dd->ipath_lastibcstat;
+	/* map LinkState to IB portinfo values.  */
+	pip->linkspeed_portstate |= ipath_ib_linkstate(dd, ibcstat) + 1;
+
+	pip->portphysstate_linkdown =
+		(ipath_cvt_physportstate[ibcstat & dd->ibcs_lts_mask] << 4) |
+		(get_linkdowndefaultstate(dd) ? 1 : 2);
+	pip->mkeyprot_resv_lmc = (dev->mkeyprot << 6) | dd->ipath_lmc;
+	pip->linkspeedactive_enabled = (dd->ipath_link_speed_active << 4) |
+		dd->ipath_link_speed_enabled;
+	switch (dd->ipath_ibmtu) {
+	case 4096:
+		mtu = IB_MTU_4096;
+		break;
+	case 2048:
+		mtu = IB_MTU_2048;
+		break;
+	case 1024:
+		mtu = IB_MTU_1024;
+		break;
+	case 512:
+		mtu = IB_MTU_512;
+		break;
+	case 256:
+		mtu = IB_MTU_256;
+		break;
+	default:		/* oops, something is wrong */
+		mtu = IB_MTU_2048;
+		break;
+	}
+	pip->neighbormtu_mastersmsl = (mtu << 4) | dev->sm_sl;
+	pip->vlcap_inittype = 0x10;	/* VLCap = VL0, InitType = 0 */
+	pip->vl_high_limit = dev->vl_high_limit;
+	/* pip->vl_arb_high_cap; // only one VL */
+	/* pip->vl_arb_low_cap; // only one VL */
+	/* InitTypeReply = 0 */
+	/* our mtu cap depends on whether 4K MTU enabled or not */
+	pip->inittypereply_mtucap = ipath_mtu4096 ? IB_MTU_4096 : IB_MTU_2048;
+	/* HCAs ignore VLStallCount and HOQLife */
+	/* pip->vlstallcnt_hoqlife; */
+	pip->operationalvl_pei_peo_fpi_fpo = 0x10;	/* OVLs = 1 */
+	pip->mkey_violations = cpu_to_be16(dev->mkey_violations);
+	/* P_KeyViolations are counted by hardware. */
+	pip->pkey_violations =
+		cpu_to_be16((ipath_get_cr_errpkey(dd) -
+			     dev->z_pkey_violations) & 0xFFFF);
+	pip->qkey_violations = cpu_to_be16(dev->qkey_violations);
+	/* Only the hardware GUID is supported for now */
+	pip->guid_cap = 1;
+	pip->clientrereg_resv_subnetto = dev->subnet_timeout;
+	/* 32.768 usec. response time (guessing) */
+	pip->resv_resptimevalue = 3;
+	pip->localphyerrors_overrunerrors =
+		(get_phyerrthreshold(dd) << 4) |
+		get_overrunthreshold(dd);
+	/* pip->max_credit_hint; */
+	if (dev->port_cap_flags & IB_PORT_LINK_LATENCY_SUP) {
+		u32 v;
+
+		v = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_LINKLATENCY);
+		pip->link_roundtrip_latency[0] = v >> 16;
+		pip->link_roundtrip_latency[1] = v >> 8;
+		pip->link_roundtrip_latency[2] = v;
+	}
+
+	ret = reply(smp);
+
+bail:
+	return ret;
+}
+
+/**
+ * get_pkeys - return the PKEY table for port 0
+ * @dd: the infinipath device
+ * @pkeys: the pkey table is placed here
+ */
+static int get_pkeys(struct ipath_devdata *dd, u16 * pkeys)
+{
+	/* always a kernel port, no locking needed */
+	struct ipath_portdata *pd = dd->ipath_pd[0];
+
+	memcpy(pkeys, pd->port_pkeys, sizeof(pd->port_pkeys));
+
+	return 0;
+}
+
+static int recv_subn_get_pkeytable(struct ib_smp *smp,
+				   struct ib_device *ibdev)
+{
+	u32 startpx = 32 * (be32_to_cpu(smp->attr_mod) & 0xffff);
+	u16 *p = (u16 *) smp->data;
+	__be16 *q = (__be16 *) smp->data;
+
+	/* 64 blocks of 32 16-bit P_Key entries */
+
+	memset(smp->data, 0, sizeof(smp->data));
+	if (startpx == 0) {
+		struct ipath_ibdev *dev = to_idev(ibdev);
+		unsigned i, n = ipath_get_npkeys(dev->dd);
+
+		get_pkeys(dev->dd, p);
+
+		for (i = 0; i < n; i++)
+			q[i] = cpu_to_be16(p[i]);
+	} else
+		smp->status |= IB_SMP_INVALID_FIELD;
+
+	return reply(smp);
+}
+
+static int recv_subn_set_guidinfo(struct ib_smp *smp,
+				  struct ib_device *ibdev)
+{
+	/* The only GUID we support is the first read-only entry. */
+	return recv_subn_get_guidinfo(smp, ibdev);
+}
+
+/**
+ * set_linkdowndefaultstate - set the default linkdown state
+ * @dd: the infinipath device
+ * @sleep: the new state
+ *
+ * Note that this will only take effect when the link state changes.
+ */
+static int set_linkdowndefaultstate(struct ipath_devdata *dd, int sleep)
+{
+	if (sleep)
+		dd->ipath_ibcctrl |= INFINIPATH_IBCC_LINKDOWNDEFAULTSTATE;
+	else
+		dd->ipath_ibcctrl &= ~INFINIPATH_IBCC_LINKDOWNDEFAULTSTATE;
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
+			 dd->ipath_ibcctrl);
+	return 0;
+}
+
+/**
+ * recv_subn_set_portinfo - set port information
+ * @smp: the incoming SM packet
+ * @ibdev: the infiniband device
+ * @port: the port on the device
+ *
+ * Set Portinfo (see ch. 14.2.5.6).
+ */
+static int recv_subn_set_portinfo(struct ib_smp *smp,
+				  struct ib_device *ibdev, u8 port)
+{
+	struct ib_port_info *pip = (struct ib_port_info *)smp->data;
+	struct ib_event event;
+	struct ipath_ibdev *dev;
+	struct ipath_devdata *dd;
+	char clientrereg = 0;
+	u16 lid, smlid;
+	u8 lwe;
+	u8 lse;
+	u8 state;
+	u16 lstate;
+	u32 mtu;
+	int ret, ore;
+
+	if (be32_to_cpu(smp->attr_mod) > ibdev->phys_port_cnt)
+		goto err;
+
+	dev = to_idev(ibdev);
+	dd = dev->dd;
+	event.device = ibdev;
+	event.element.port_num = port;
+
+	dev->mkey = pip->mkey;
+	dev->gid_prefix = pip->gid_prefix;
+	dev->mkey_lease_period = be16_to_cpu(pip->mkey_lease_period);
+
+	lid = be16_to_cpu(pip->lid);
+	if (dd->ipath_lid != lid ||
+	    dd->ipath_lmc != (pip->mkeyprot_resv_lmc & 7)) {
+		/* Must be a valid unicast LID address. */
+		if (lid == 0 || lid >= IPATH_MULTICAST_LID_BASE)
+			goto err;
+		ipath_set_lid(dd, lid, pip->mkeyprot_resv_lmc & 7);
+		event.event = IB_EVENT_LID_CHANGE;
+		ib_dispatch_event(&event);
+	}
+
+	smlid = be16_to_cpu(pip->sm_lid);
+	if (smlid != dev->sm_lid) {
+		/* Must be a valid unicast LID address. */
+		if (smlid == 0 || smlid >= IPATH_MULTICAST_LID_BASE)
+			goto err;
+		dev->sm_lid = smlid;
+		event.event = IB_EVENT_SM_CHANGE;
+		ib_dispatch_event(&event);
+	}
+
+	/* Allow 1x or 4x to be set (see 14.2.6.6). */
+	lwe = pip->link_width_enabled;
+	if (lwe) {
+		if (lwe == 0xFF)
+			lwe = dd->ipath_link_width_supported;
+		else if (lwe >= 16 || (lwe & ~dd->ipath_link_width_supported))
+			goto err;
+		set_link_width_enabled(dd, lwe);
+	}
+
+	/* Allow 2.5 or 5.0 Gbs. */
+	lse = pip->linkspeedactive_enabled & 0xF;
+	if (lse) {
+		if (lse == 15)
+			lse = dd->ipath_link_speed_supported;
+		else if (lse >= 8 || (lse & ~dd->ipath_link_speed_supported))
+			goto err;
+		set_link_speed_enabled(dd, lse);
+	}
+
+	/* Set link down default state. */
+	switch (pip->portphysstate_linkdown & 0xF) {
+	case 0: /* NOP */
+		break;
+	case 1: /* SLEEP */
+		if (set_linkdowndefaultstate(dd, 1))
+			goto err;
+		break;
+	case 2: /* POLL */
+		if (set_linkdowndefaultstate(dd, 0))
+			goto err;
+		break;
+	default:
+		goto err;
+	}
+
+	dev->mkeyprot = pip->mkeyprot_resv_lmc >> 6;
+	dev->vl_high_limit = pip->vl_high_limit;
+
+	switch ((pip->neighbormtu_mastersmsl >> 4) & 0xF) {
+	case IB_MTU_256:
+		mtu = 256;
+		break;
+	case IB_MTU_512:
+		mtu = 512;
+		break;
+	case IB_MTU_1024:
+		mtu = 1024;
+		break;
+	case IB_MTU_2048:
+		mtu = 2048;
+		break;
+	case IB_MTU_4096:
+		if (!ipath_mtu4096)
+			goto err;
+		mtu = 4096;
+		break;
+	default:
+		/* XXX We have already partially updated our state! */
+		goto err;
+	}
+	ipath_set_mtu(dd, mtu);
+
+	dev->sm_sl = pip->neighbormtu_mastersmsl & 0xF;
+
+	/* We only support VL0 */
+	if (((pip->operationalvl_pei_peo_fpi_fpo >> 4) & 0xF) > 1)
+		goto err;
+
+	if (pip->mkey_violations == 0)
+		dev->mkey_violations = 0;
+
+	/*
+	 * Hardware counter can't be reset so snapshot and subtract
+	 * later.
+	 */
+	if (pip->pkey_violations == 0)
+		dev->z_pkey_violations = ipath_get_cr_errpkey(dd);
+
+	if (pip->qkey_violations == 0)
+		dev->qkey_violations = 0;
+
+	ore = pip->localphyerrors_overrunerrors;
+	if (set_phyerrthreshold(dd, (ore >> 4) & 0xF))
+		goto err;
+
+	if (set_overrunthreshold(dd, (ore & 0xF)))
+		goto err;
+
+	dev->subnet_timeout = pip->clientrereg_resv_subnetto & 0x1F;
+
+	if (pip->clientrereg_resv_subnetto & 0x80) {
+		clientrereg = 1;
+		event.event = IB_EVENT_CLIENT_REREGISTER;
+		ib_dispatch_event(&event);
+	}
+
+	/*
+	 * Do the port state change now that the other link parameters
+	 * have been set.
+	 * Changing the port physical state only makes sense if the link
+	 * is down or is being set to down.
+	 */
+	state = pip->linkspeed_portstate & 0xF;
+	lstate = (pip->portphysstate_linkdown >> 4) & 0xF;
+	if (lstate && !(state == IB_PORT_DOWN || state == IB_PORT_NOP))
+		goto err;
+
+	/*
+	 * Only state changes of DOWN, ARM, and ACTIVE are valid
+	 * and must be in the correct state to take effect (see 7.2.6).
+	 */
+	switch (state) {
+	case IB_PORT_NOP:
+		if (lstate == 0)
+			break;
+		/* FALLTHROUGH */
+	case IB_PORT_DOWN:
+		if (lstate == 0)
+			lstate = IPATH_IB_LINKDOWN_ONLY;
+		else if (lstate == 1)
+			lstate = IPATH_IB_LINKDOWN_SLEEP;
+		else if (lstate == 2)
+			lstate = IPATH_IB_LINKDOWN;
+		else if (lstate == 3)
+			lstate = IPATH_IB_LINKDOWN_DISABLE;
+		else
+			goto err;
+		ipath_set_linkstate(dd, lstate);
+		if (lstate == IPATH_IB_LINKDOWN_DISABLE) {
+			ret = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+			goto done;
+		}
+		ipath_wait_linkstate(dd, IPATH_LINKINIT | IPATH_LINKARMED |
+				IPATH_LINKACTIVE, 1000);
+		break;
+	case IB_PORT_ARMED:
+		ipath_set_linkstate(dd, IPATH_IB_LINKARM);
+		break;
+	case IB_PORT_ACTIVE:
+		ipath_set_linkstate(dd, IPATH_IB_LINKACTIVE);
+		break;
+	default:
+		/* XXX We have already partially updated our state! */
+		goto err;
+	}
+
+	ret = recv_subn_get_portinfo(smp, ibdev, port);
+
+	if (clientrereg)
+		pip->clientrereg_resv_subnetto |= 0x80;
+
+	goto done;
+
+err:
+	smp->status |= IB_SMP_INVALID_FIELD;
+	ret = recv_subn_get_portinfo(smp, ibdev, port);
+
+done:
+	return ret;
+}
+
+/**
+ * rm_pkey - decrecment the reference count for the given PKEY
+ * @dd: the infinipath device
+ * @key: the PKEY index
+ *
+ * Return true if this was the last reference and the hardware table entry
+ * needs to be changed.
+ */
+static int rm_pkey(struct ipath_devdata *dd, u16 key)
+{
+	int i;
+	int ret;
+
+	for (i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) {
+		if (dd->ipath_pkeys[i] != key)
+			continue;
+		if (atomic_dec_and_test(&dd->ipath_pkeyrefs[i])) {
+			dd->ipath_pkeys[i] = 0;
+			ret = 1;
+			goto bail;
+		}
+		break;
+	}
+
+	ret = 0;
+
+bail:
+	return ret;
+}
+
+/**
+ * add_pkey - add the given PKEY to the hardware table
+ * @dd: the infinipath device
+ * @key: the PKEY
+ *
+ * Return an error code if unable to add the entry, zero if no change,
+ * or 1 if the hardware PKEY register needs to be updated.
+ */
+static int add_pkey(struct ipath_devdata *dd, u16 key)
+{
+	int i;
+	u16 lkey = key & 0x7FFF;
+	int any = 0;
+	int ret;
+
+	if (lkey == 0x7FFF) {
+		ret = 0;
+		goto bail;
+	}
+
+	/* Look for an empty slot or a matching PKEY. */
+	for (i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) {
+		if (!dd->ipath_pkeys[i]) {
+			any++;
+			continue;
+		}
+		/* If it matches exactly, try to increment the ref count */
+		if (dd->ipath_pkeys[i] == key) {
+			if (atomic_inc_return(&dd->ipath_pkeyrefs[i]) > 1) {
+				ret = 0;
+				goto bail;
+			}
+			/* Lost the race. Look for an empty slot below. */
+			atomic_dec(&dd->ipath_pkeyrefs[i]);
+			any++;
+		}
+		/*
+		 * It makes no sense to have both the limited and unlimited
+		 * PKEY set at the same time since the unlimited one will
+		 * disable the limited one.
+		 */
+		if ((dd->ipath_pkeys[i] & 0x7FFF) == lkey) {
+			ret = -EEXIST;
+			goto bail;
+		}
+	}
+	if (!any) {
+		ret = -EBUSY;
+		goto bail;
+	}
+	for (i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) {
+		if (!dd->ipath_pkeys[i] &&
+		    atomic_inc_return(&dd->ipath_pkeyrefs[i]) == 1) {
+			/* for ipathstats, etc. */
+			ipath_stats.sps_pkeys[i] = lkey;
+			dd->ipath_pkeys[i] = key;
+			ret = 1;
+			goto bail;
+		}
+	}
+	ret = -EBUSY;
+
+bail:
+	return ret;
+}
+
+/**
+ * set_pkeys - set the PKEY table for port 0
+ * @dd: the infinipath device
+ * @pkeys: the PKEY table
+ */
+static int set_pkeys(struct ipath_devdata *dd, u16 *pkeys, u8 port)
+{
+	struct ipath_portdata *pd;
+	int i;
+	int changed = 0;
+
+	/* always a kernel port, no locking needed */
+	pd = dd->ipath_pd[0];
+
+	for (i = 0; i < ARRAY_SIZE(pd->port_pkeys); i++) {
+		u16 key = pkeys[i];
+		u16 okey = pd->port_pkeys[i];
+
+		if (key == okey)
+			continue;
+		/*
+		 * The value of this PKEY table entry is changing.
+		 * Remove the old entry in the hardware's array of PKEYs.
+		 */
+		if (okey & 0x7FFF)
+			changed |= rm_pkey(dd, okey);
+		if (key & 0x7FFF) {
+			int ret = add_pkey(dd, key);
+
+			if (ret < 0)
+				key = 0;
+			else
+				changed |= ret;
+		}
+		pd->port_pkeys[i] = key;
+	}
+	if (changed) {
+		u64 pkey;
+		struct ib_event event;
+
+		pkey = (u64) dd->ipath_pkeys[0] |
+			((u64) dd->ipath_pkeys[1] << 16) |
+			((u64) dd->ipath_pkeys[2] << 32) |
+			((u64) dd->ipath_pkeys[3] << 48);
+		ipath_cdbg(VERBOSE, "p0 new pkey reg %llx\n",
+			   (unsigned long long) pkey);
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_partitionkey,
+				 pkey);
+
+		event.event = IB_EVENT_PKEY_CHANGE;
+		event.device = &dd->verbs_dev->ibdev;
+		event.element.port_num = port;
+		ib_dispatch_event(&event);
+	}
+	return 0;
+}
+
+static int recv_subn_set_pkeytable(struct ib_smp *smp,
+				   struct ib_device *ibdev, u8 port)
+{
+	u32 startpx = 32 * (be32_to_cpu(smp->attr_mod) & 0xffff);
+	__be16 *p = (__be16 *) smp->data;
+	u16 *q = (u16 *) smp->data;
+	struct ipath_ibdev *dev = to_idev(ibdev);
+	unsigned i, n = ipath_get_npkeys(dev->dd);
+
+	for (i = 0; i < n; i++)
+		q[i] = be16_to_cpu(p[i]);
+
+	if (startpx != 0 || set_pkeys(dev->dd, q, port) != 0)
+		smp->status |= IB_SMP_INVALID_FIELD;
+
+	return recv_subn_get_pkeytable(smp, ibdev);
+}
+
+static int recv_pma_get_classportinfo(struct ib_pma_mad *pmp)
+{
+	struct ib_class_port_info *p =
+		(struct ib_class_port_info *)pmp->data;
+
+	memset(pmp->data, 0, sizeof(pmp->data));
+
+	if (pmp->mad_hdr.attr_mod != 0)
+		pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+
+	/* Indicate AllPortSelect is valid (only one port anyway) */
+	p->capability_mask = cpu_to_be16(1 << 8);
+	p->base_version = 1;
+	p->class_version = 1;
+	/*
+	 * Expected response time is 4.096 usec. * 2^18 == 1.073741824
+	 * sec.
+	 */
+	p->resp_time_value = 18;
+
+	return reply((struct ib_smp *) pmp);
+}
+
+/*
+ * The PortSamplesControl.CounterMasks field is an array of 3 bit fields
+ * which specify the N'th counter's capabilities. See ch. 16.1.3.2.
+ * We support 5 counters which only count the mandatory quantities.
+ */
+#define COUNTER_MASK(q, n) (q << ((9 - n) * 3))
+#define COUNTER_MASK0_9 cpu_to_be32(COUNTER_MASK(1, 0) | \
+				    COUNTER_MASK(1, 1) | \
+				    COUNTER_MASK(1, 2) | \
+				    COUNTER_MASK(1, 3) | \
+				    COUNTER_MASK(1, 4))
+
+static int recv_pma_get_portsamplescontrol(struct ib_pma_mad *pmp,
+					   struct ib_device *ibdev, u8 port)
+{
+	struct ib_pma_portsamplescontrol *p =
+		(struct ib_pma_portsamplescontrol *)pmp->data;
+	struct ipath_ibdev *dev = to_idev(ibdev);
+	struct ipath_cregs const *crp = dev->dd->ipath_cregs;
+	unsigned long flags;
+	u8 port_select = p->port_select;
+
+	memset(pmp->data, 0, sizeof(pmp->data));
+
+	p->port_select = port_select;
+	if (pmp->mad_hdr.attr_mod != 0 ||
+	    (port_select != port && port_select != 0xFF))
+		pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+	/*
+	 * Ticks are 10x the link transfer period which for 2.5Gbs is 4
+	 * nsec.  0 == 4 nsec., 1 == 8 nsec., ..., 255 == 1020 nsec.  Sample
+	 * intervals are counted in ticks.  Since we use Linux timers, that
+	 * count in jiffies, we can't sample for less than 1000 ticks if HZ
+	 * == 1000 (4000 ticks if HZ is 250).  link_speed_active returns 2 for
+	 * DDR, 1 for SDR, set the tick to 1 for DDR, 0 for SDR on chips that
+	 * have hardware support for delaying packets.
+	 */
+	if (crp->cr_psstat)
+		p->tick = dev->dd->ipath_link_speed_active - 1;
+	else
+		p->tick = 250;		/* 1 usec. */
+	p->counter_width = 4;	/* 32 bit counters */
+	p->counter_mask0_9 = COUNTER_MASK0_9;
+	spin_lock_irqsave(&dev->pending_lock, flags);
+	if (crp->cr_psstat)
+		p->sample_status = ipath_read_creg32(dev->dd, crp->cr_psstat);
+	else
+		p->sample_status = dev->pma_sample_status;
+	p->sample_start = cpu_to_be32(dev->pma_sample_start);
+	p->sample_interval = cpu_to_be32(dev->pma_sample_interval);
+	p->tag = cpu_to_be16(dev->pma_tag);
+	p->counter_select[0] = dev->pma_counter_select[0];
+	p->counter_select[1] = dev->pma_counter_select[1];
+	p->counter_select[2] = dev->pma_counter_select[2];
+	p->counter_select[3] = dev->pma_counter_select[3];
+	p->counter_select[4] = dev->pma_counter_select[4];
+	spin_unlock_irqrestore(&dev->pending_lock, flags);
+
+	return reply((struct ib_smp *) pmp);
+}
+
+static int recv_pma_set_portsamplescontrol(struct ib_pma_mad *pmp,
+					   struct ib_device *ibdev, u8 port)
+{
+	struct ib_pma_portsamplescontrol *p =
+		(struct ib_pma_portsamplescontrol *)pmp->data;
+	struct ipath_ibdev *dev = to_idev(ibdev);
+	struct ipath_cregs const *crp = dev->dd->ipath_cregs;
+	unsigned long flags;
+	u8 status;
+	int ret;
+
+	if (pmp->mad_hdr.attr_mod != 0 ||
+	    (p->port_select != port && p->port_select != 0xFF)) {
+		pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+		ret = reply((struct ib_smp *) pmp);
+		goto bail;
+	}
+
+	spin_lock_irqsave(&dev->pending_lock, flags);
+	if (crp->cr_psstat)
+		status = ipath_read_creg32(dev->dd, crp->cr_psstat);
+	else
+		status = dev->pma_sample_status;
+	if (status == IB_PMA_SAMPLE_STATUS_DONE) {
+		dev->pma_sample_start = be32_to_cpu(p->sample_start);
+		dev->pma_sample_interval = be32_to_cpu(p->sample_interval);
+		dev->pma_tag = be16_to_cpu(p->tag);
+		dev->pma_counter_select[0] = p->counter_select[0];
+		dev->pma_counter_select[1] = p->counter_select[1];
+		dev->pma_counter_select[2] = p->counter_select[2];
+		dev->pma_counter_select[3] = p->counter_select[3];
+		dev->pma_counter_select[4] = p->counter_select[4];
+		if (crp->cr_psstat) {
+			ipath_write_creg(dev->dd, crp->cr_psinterval,
+					 dev->pma_sample_interval);
+			ipath_write_creg(dev->dd, crp->cr_psstart,
+					 dev->pma_sample_start);
+		} else
+			dev->pma_sample_status = IB_PMA_SAMPLE_STATUS_STARTED;
+	}
+	spin_unlock_irqrestore(&dev->pending_lock, flags);
+
+	ret = recv_pma_get_portsamplescontrol(pmp, ibdev, port);
+
+bail:
+	return ret;
+}
+
+static u64 get_counter(struct ipath_ibdev *dev,
+		       struct ipath_cregs const *crp,
+		       __be16 sel)
+{
+	u64 ret;
+
+	switch (sel) {
+	case IB_PMA_PORT_XMIT_DATA:
+		ret = (crp->cr_psxmitdatacount) ?
+			ipath_read_creg32(dev->dd, crp->cr_psxmitdatacount) :
+			dev->ipath_sword;
+		break;
+	case IB_PMA_PORT_RCV_DATA:
+		ret = (crp->cr_psrcvdatacount) ?
+			ipath_read_creg32(dev->dd, crp->cr_psrcvdatacount) :
+			dev->ipath_rword;
+		break;
+	case IB_PMA_PORT_XMIT_PKTS:
+		ret = (crp->cr_psxmitpktscount) ?
+			ipath_read_creg32(dev->dd, crp->cr_psxmitpktscount) :
+			dev->ipath_spkts;
+		break;
+	case IB_PMA_PORT_RCV_PKTS:
+		ret = (crp->cr_psrcvpktscount) ?
+			ipath_read_creg32(dev->dd, crp->cr_psrcvpktscount) :
+			dev->ipath_rpkts;
+		break;
+	case IB_PMA_PORT_XMIT_WAIT:
+		ret = (crp->cr_psxmitwaitcount) ?
+			ipath_read_creg32(dev->dd, crp->cr_psxmitwaitcount) :
+			dev->ipath_xmit_wait;
+		break;
+	default:
+		ret = 0;
+	}
+
+	return ret;
+}
+
+static int recv_pma_get_portsamplesresult(struct ib_pma_mad *pmp,
+					  struct ib_device *ibdev)
+{
+	struct ib_pma_portsamplesresult *p =
+		(struct ib_pma_portsamplesresult *)pmp->data;
+	struct ipath_ibdev *dev = to_idev(ibdev);
+	struct ipath_cregs const *crp = dev->dd->ipath_cregs;
+	u8 status;
+	int i;
+
+	memset(pmp->data, 0, sizeof(pmp->data));
+	p->tag = cpu_to_be16(dev->pma_tag);
+	if (crp->cr_psstat)
+		status = ipath_read_creg32(dev->dd, crp->cr_psstat);
+	else
+		status = dev->pma_sample_status;
+	p->sample_status = cpu_to_be16(status);
+	for (i = 0; i < ARRAY_SIZE(dev->pma_counter_select); i++)
+		p->counter[i] = (status != IB_PMA_SAMPLE_STATUS_DONE) ? 0 :
+		    cpu_to_be32(
+			get_counter(dev, crp, dev->pma_counter_select[i]));
+
+	return reply((struct ib_smp *) pmp);
+}
+
+static int recv_pma_get_portsamplesresult_ext(struct ib_pma_mad *pmp,
+					      struct ib_device *ibdev)
+{
+	struct ib_pma_portsamplesresult_ext *p =
+		(struct ib_pma_portsamplesresult_ext *)pmp->data;
+	struct ipath_ibdev *dev = to_idev(ibdev);
+	struct ipath_cregs const *crp = dev->dd->ipath_cregs;
+	u8 status;
+	int i;
+
+	memset(pmp->data, 0, sizeof(pmp->data));
+	p->tag = cpu_to_be16(dev->pma_tag);
+	if (crp->cr_psstat)
+		status = ipath_read_creg32(dev->dd, crp->cr_psstat);
+	else
+		status = dev->pma_sample_status;
+	p->sample_status = cpu_to_be16(status);
+	/* 64 bits */
+	p->extended_width = cpu_to_be32(0x80000000);
+	for (i = 0; i < ARRAY_SIZE(dev->pma_counter_select); i++)
+		p->counter[i] = (status != IB_PMA_SAMPLE_STATUS_DONE) ? 0 :
+		    cpu_to_be64(
+			get_counter(dev, crp, dev->pma_counter_select[i]));
+
+	return reply((struct ib_smp *) pmp);
+}
+
+static int recv_pma_get_portcounters(struct ib_pma_mad *pmp,
+				     struct ib_device *ibdev, u8 port)
+{
+	struct ib_pma_portcounters *p = (struct ib_pma_portcounters *)
+		pmp->data;
+	struct ipath_ibdev *dev = to_idev(ibdev);
+	struct ipath_verbs_counters cntrs;
+	u8 port_select = p->port_select;
+
+	ipath_get_counters(dev->dd, &cntrs);
+
+	/* Adjust counters for any resets done. */
+	cntrs.symbol_error_counter -= dev->z_symbol_error_counter;
+	cntrs.link_error_recovery_counter -=
+		dev->z_link_error_recovery_counter;
+	cntrs.link_downed_counter -= dev->z_link_downed_counter;
+	cntrs.port_rcv_errors += dev->rcv_errors;
+	cntrs.port_rcv_errors -= dev->z_port_rcv_errors;
+	cntrs.port_rcv_remphys_errors -= dev->z_port_rcv_remphys_errors;
+	cntrs.port_xmit_discards -= dev->z_port_xmit_discards;
+	cntrs.port_xmit_data -= dev->z_port_xmit_data;
+	cntrs.port_rcv_data -= dev->z_port_rcv_data;
+	cntrs.port_xmit_packets -= dev->z_port_xmit_packets;
+	cntrs.port_rcv_packets -= dev->z_port_rcv_packets;
+	cntrs.local_link_integrity_errors -=
+		dev->z_local_link_integrity_errors;
+	cntrs.excessive_buffer_overrun_errors -=
+		dev->z_excessive_buffer_overrun_errors;
+	cntrs.vl15_dropped -= dev->z_vl15_dropped;
+	cntrs.vl15_dropped += dev->n_vl15_dropped;
+
+	memset(pmp->data, 0, sizeof(pmp->data));
+
+	p->port_select = port_select;
+	if (pmp->mad_hdr.attr_mod != 0 ||
+	    (port_select != port && port_select != 0xFF))
+		pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+
+	if (cntrs.symbol_error_counter > 0xFFFFUL)
+		p->symbol_error_counter = cpu_to_be16(0xFFFF);
+	else
+		p->symbol_error_counter =
+			cpu_to_be16((u16)cntrs.symbol_error_counter);
+	if (cntrs.link_error_recovery_counter > 0xFFUL)
+		p->link_error_recovery_counter = 0xFF;
+	else
+		p->link_error_recovery_counter =
+			(u8)cntrs.link_error_recovery_counter;
+	if (cntrs.link_downed_counter > 0xFFUL)
+		p->link_downed_counter = 0xFF;
+	else
+		p->link_downed_counter = (u8)cntrs.link_downed_counter;
+	if (cntrs.port_rcv_errors > 0xFFFFUL)
+		p->port_rcv_errors = cpu_to_be16(0xFFFF);
+	else
+		p->port_rcv_errors =
+			cpu_to_be16((u16) cntrs.port_rcv_errors);
+	if (cntrs.port_rcv_remphys_errors > 0xFFFFUL)
+		p->port_rcv_remphys_errors = cpu_to_be16(0xFFFF);
+	else
+		p->port_rcv_remphys_errors =
+			cpu_to_be16((u16)cntrs.port_rcv_remphys_errors);
+	if (cntrs.port_xmit_discards > 0xFFFFUL)
+		p->port_xmit_discards = cpu_to_be16(0xFFFF);
+	else
+		p->port_xmit_discards =
+			cpu_to_be16((u16)cntrs.port_xmit_discards);
+	if (cntrs.local_link_integrity_errors > 0xFUL)
+		cntrs.local_link_integrity_errors = 0xFUL;
+	if (cntrs.excessive_buffer_overrun_errors > 0xFUL)
+		cntrs.excessive_buffer_overrun_errors = 0xFUL;
+	p->link_overrun_errors = (cntrs.local_link_integrity_errors << 4) |
+		cntrs.excessive_buffer_overrun_errors;
+	if (cntrs.vl15_dropped > 0xFFFFUL)
+		p->vl15_dropped = cpu_to_be16(0xFFFF);
+	else
+		p->vl15_dropped = cpu_to_be16((u16)cntrs.vl15_dropped);
+	if (cntrs.port_xmit_data > 0xFFFFFFFFUL)
+		p->port_xmit_data = cpu_to_be32(0xFFFFFFFF);
+	else
+		p->port_xmit_data = cpu_to_be32((u32)cntrs.port_xmit_data);
+	if (cntrs.port_rcv_data > 0xFFFFFFFFUL)
+		p->port_rcv_data = cpu_to_be32(0xFFFFFFFF);
+	else
+		p->port_rcv_data = cpu_to_be32((u32)cntrs.port_rcv_data);
+	if (cntrs.port_xmit_packets > 0xFFFFFFFFUL)
+		p->port_xmit_packets = cpu_to_be32(0xFFFFFFFF);
+	else
+		p->port_xmit_packets =
+			cpu_to_be32((u32)cntrs.port_xmit_packets);
+	if (cntrs.port_rcv_packets > 0xFFFFFFFFUL)
+		p->port_rcv_packets = cpu_to_be32(0xFFFFFFFF);
+	else
+		p->port_rcv_packets =
+			cpu_to_be32((u32) cntrs.port_rcv_packets);
+
+	return reply((struct ib_smp *) pmp);
+}
+
+static int recv_pma_get_portcounters_ext(struct ib_pma_mad *pmp,
+					 struct ib_device *ibdev, u8 port)
+{
+	struct ib_pma_portcounters_ext *p =
+		(struct ib_pma_portcounters_ext *)pmp->data;
+	struct ipath_ibdev *dev = to_idev(ibdev);
+	u64 swords, rwords, spkts, rpkts, xwait;
+	u8 port_select = p->port_select;
+
+	ipath_snapshot_counters(dev->dd, &swords, &rwords, &spkts,
+				&rpkts, &xwait);
+
+	/* Adjust counters for any resets done. */
+	swords -= dev->z_port_xmit_data;
+	rwords -= dev->z_port_rcv_data;
+	spkts -= dev->z_port_xmit_packets;
+	rpkts -= dev->z_port_rcv_packets;
+
+	memset(pmp->data, 0, sizeof(pmp->data));
+
+	p->port_select = port_select;
+	if (pmp->mad_hdr.attr_mod != 0 ||
+	    (port_select != port && port_select != 0xFF))
+		pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+
+	p->port_xmit_data = cpu_to_be64(swords);
+	p->port_rcv_data = cpu_to_be64(rwords);
+	p->port_xmit_packets = cpu_to_be64(spkts);
+	p->port_rcv_packets = cpu_to_be64(rpkts);
+	p->port_unicast_xmit_packets = cpu_to_be64(dev->n_unicast_xmit);
+	p->port_unicast_rcv_packets = cpu_to_be64(dev->n_unicast_rcv);
+	p->port_multicast_xmit_packets = cpu_to_be64(dev->n_multicast_xmit);
+	p->port_multicast_rcv_packets = cpu_to_be64(dev->n_multicast_rcv);
+
+	return reply((struct ib_smp *) pmp);
+}
+
+static int recv_pma_set_portcounters(struct ib_pma_mad *pmp,
+				     struct ib_device *ibdev, u8 port)
+{
+	struct ib_pma_portcounters *p = (struct ib_pma_portcounters *)
+		pmp->data;
+	struct ipath_ibdev *dev = to_idev(ibdev);
+	struct ipath_verbs_counters cntrs;
+
+	/*
+	 * Since the HW doesn't support clearing counters, we save the
+	 * current count and subtract it from future responses.
+	 */
+	ipath_get_counters(dev->dd, &cntrs);
+
+	if (p->counter_select & IB_PMA_SEL_SYMBOL_ERROR)
+		dev->z_symbol_error_counter = cntrs.symbol_error_counter;
+
+	if (p->counter_select & IB_PMA_SEL_LINK_ERROR_RECOVERY)
+		dev->z_link_error_recovery_counter =
+			cntrs.link_error_recovery_counter;
+
+	if (p->counter_select & IB_PMA_SEL_LINK_DOWNED)
+		dev->z_link_downed_counter = cntrs.link_downed_counter;
+
+	if (p->counter_select & IB_PMA_SEL_PORT_RCV_ERRORS)
+		dev->z_port_rcv_errors =
+			cntrs.port_rcv_errors + dev->rcv_errors;
+
+	if (p->counter_select & IB_PMA_SEL_PORT_RCV_REMPHYS_ERRORS)
+		dev->z_port_rcv_remphys_errors =
+			cntrs.port_rcv_remphys_errors;
+
+	if (p->counter_select & IB_PMA_SEL_PORT_XMIT_DISCARDS)
+		dev->z_port_xmit_discards = cntrs.port_xmit_discards;
+
+	if (p->counter_select & IB_PMA_SEL_LOCAL_LINK_INTEGRITY_ERRORS)
+		dev->z_local_link_integrity_errors =
+			cntrs.local_link_integrity_errors;
+
+	if (p->counter_select & IB_PMA_SEL_EXCESSIVE_BUFFER_OVERRUNS)
+		dev->z_excessive_buffer_overrun_errors =
+			cntrs.excessive_buffer_overrun_errors;
+
+	if (p->counter_select & IB_PMA_SEL_PORT_VL15_DROPPED) {
+		dev->n_vl15_dropped = 0;
+		dev->z_vl15_dropped = cntrs.vl15_dropped;
+	}
+
+	if (p->counter_select & IB_PMA_SEL_PORT_XMIT_DATA)
+		dev->z_port_xmit_data = cntrs.port_xmit_data;
+
+	if (p->counter_select & IB_PMA_SEL_PORT_RCV_DATA)
+		dev->z_port_rcv_data = cntrs.port_rcv_data;
+
+	if (p->counter_select & IB_PMA_SEL_PORT_XMIT_PACKETS)
+		dev->z_port_xmit_packets = cntrs.port_xmit_packets;
+
+	if (p->counter_select & IB_PMA_SEL_PORT_RCV_PACKETS)
+		dev->z_port_rcv_packets = cntrs.port_rcv_packets;
+
+	return recv_pma_get_portcounters(pmp, ibdev, port);
+}
+
+static int recv_pma_set_portcounters_ext(struct ib_pma_mad *pmp,
+					 struct ib_device *ibdev, u8 port)
+{
+	struct ib_pma_portcounters *p = (struct ib_pma_portcounters *)
+		pmp->data;
+	struct ipath_ibdev *dev = to_idev(ibdev);
+	u64 swords, rwords, spkts, rpkts, xwait;
+
+	ipath_snapshot_counters(dev->dd, &swords, &rwords, &spkts,
+				&rpkts, &xwait);
+
+	if (p->counter_select & IB_PMA_SELX_PORT_XMIT_DATA)
+		dev->z_port_xmit_data = swords;
+
+	if (p->counter_select & IB_PMA_SELX_PORT_RCV_DATA)
+		dev->z_port_rcv_data = rwords;
+
+	if (p->counter_select & IB_PMA_SELX_PORT_XMIT_PACKETS)
+		dev->z_port_xmit_packets = spkts;
+
+	if (p->counter_select & IB_PMA_SELX_PORT_RCV_PACKETS)
+		dev->z_port_rcv_packets = rpkts;
+
+	if (p->counter_select & IB_PMA_SELX_PORT_UNI_XMIT_PACKETS)
+		dev->n_unicast_xmit = 0;
+
+	if (p->counter_select & IB_PMA_SELX_PORT_UNI_RCV_PACKETS)
+		dev->n_unicast_rcv = 0;
+
+	if (p->counter_select & IB_PMA_SELX_PORT_MULTI_XMIT_PACKETS)
+		dev->n_multicast_xmit = 0;
+
+	if (p->counter_select & IB_PMA_SELX_PORT_MULTI_RCV_PACKETS)
+		dev->n_multicast_rcv = 0;
+
+	return recv_pma_get_portcounters_ext(pmp, ibdev, port);
+}
+
+static int process_subn(struct ib_device *ibdev, int mad_flags,
+			u8 port_num, struct ib_mad *in_mad,
+			struct ib_mad *out_mad)
+{
+	struct ib_smp *smp = (struct ib_smp *)out_mad;
+	struct ipath_ibdev *dev = to_idev(ibdev);
+	int ret;
+
+	*out_mad = *in_mad;
+	if (smp->class_version != 1) {
+		smp->status |= IB_SMP_UNSUP_VERSION;
+		ret = reply(smp);
+		goto bail;
+	}
+
+	/* Is the mkey in the process of expiring? */
+	if (dev->mkey_lease_timeout &&
+	    time_after_eq(jiffies, dev->mkey_lease_timeout)) {
+		/* Clear timeout and mkey protection field. */
+		dev->mkey_lease_timeout = 0;
+		dev->mkeyprot = 0;
+	}
+
+	/*
+	 * M_Key checking depends on
+	 * Portinfo:M_Key_protect_bits
+	 */
+	if ((mad_flags & IB_MAD_IGNORE_MKEY) == 0 && dev->mkey != 0 &&
+	    dev->mkey != smp->mkey &&
+	    (smp->method == IB_MGMT_METHOD_SET ||
+	     (smp->method == IB_MGMT_METHOD_GET &&
+	      dev->mkeyprot >= 2))) {
+		if (dev->mkey_violations != 0xFFFF)
+			++dev->mkey_violations;
+		if (dev->mkey_lease_timeout ||
+		    dev->mkey_lease_period == 0) {
+			ret = IB_MAD_RESULT_SUCCESS |
+				IB_MAD_RESULT_CONSUMED;
+			goto bail;
+		}
+		dev->mkey_lease_timeout = jiffies +
+			dev->mkey_lease_period * HZ;
+		/* Future: Generate a trap notice. */
+		ret = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+		goto bail;
+	} else if (dev->mkey_lease_timeout)
+		dev->mkey_lease_timeout = 0;
+
+	switch (smp->method) {
+	case IB_MGMT_METHOD_GET:
+		switch (smp->attr_id) {
+		case IB_SMP_ATTR_NODE_DESC:
+			ret = recv_subn_get_nodedescription(smp, ibdev);
+			goto bail;
+		case IB_SMP_ATTR_NODE_INFO:
+			ret = recv_subn_get_nodeinfo(smp, ibdev, port_num);
+			goto bail;
+		case IB_SMP_ATTR_GUID_INFO:
+			ret = recv_subn_get_guidinfo(smp, ibdev);
+			goto bail;
+		case IB_SMP_ATTR_PORT_INFO:
+			ret = recv_subn_get_portinfo(smp, ibdev, port_num);
+			goto bail;
+		case IB_SMP_ATTR_PKEY_TABLE:
+			ret = recv_subn_get_pkeytable(smp, ibdev);
+			goto bail;
+		case IB_SMP_ATTR_SM_INFO:
+			if (dev->port_cap_flags & IB_PORT_SM_DISABLED) {
+				ret = IB_MAD_RESULT_SUCCESS |
+					IB_MAD_RESULT_CONSUMED;
+				goto bail;
+			}
+			if (dev->port_cap_flags & IB_PORT_SM) {
+				ret = IB_MAD_RESULT_SUCCESS;
+				goto bail;
+			}
+			/* FALLTHROUGH */
+		default:
+			smp->status |= IB_SMP_UNSUP_METH_ATTR;
+			ret = reply(smp);
+			goto bail;
+		}
+
+	case IB_MGMT_METHOD_SET:
+		switch (smp->attr_id) {
+		case IB_SMP_ATTR_GUID_INFO:
+			ret = recv_subn_set_guidinfo(smp, ibdev);
+			goto bail;
+		case IB_SMP_ATTR_PORT_INFO:
+			ret = recv_subn_set_portinfo(smp, ibdev, port_num);
+			goto bail;
+		case IB_SMP_ATTR_PKEY_TABLE:
+			ret = recv_subn_set_pkeytable(smp, ibdev, port_num);
+			goto bail;
+		case IB_SMP_ATTR_SM_INFO:
+			if (dev->port_cap_flags & IB_PORT_SM_DISABLED) {
+				ret = IB_MAD_RESULT_SUCCESS |
+					IB_MAD_RESULT_CONSUMED;
+				goto bail;
+			}
+			if (dev->port_cap_flags & IB_PORT_SM) {
+				ret = IB_MAD_RESULT_SUCCESS;
+				goto bail;
+			}
+			/* FALLTHROUGH */
+		default:
+			smp->status |= IB_SMP_UNSUP_METH_ATTR;
+			ret = reply(smp);
+			goto bail;
+		}
+
+	case IB_MGMT_METHOD_TRAP:
+	case IB_MGMT_METHOD_REPORT:
+	case IB_MGMT_METHOD_REPORT_RESP:
+	case IB_MGMT_METHOD_TRAP_REPRESS:
+	case IB_MGMT_METHOD_GET_RESP:
+		/*
+		 * The ib_mad module will call us to process responses
+		 * before checking for other consumers.
+		 * Just tell the caller to process it normally.
+		 */
+		ret = IB_MAD_RESULT_SUCCESS;
+		goto bail;
+	default:
+		smp->status |= IB_SMP_UNSUP_METHOD;
+		ret = reply(smp);
+	}
+
+bail:
+	return ret;
+}
+
+static int process_perf(struct ib_device *ibdev, u8 port_num,
+			struct ib_mad *in_mad,
+			struct ib_mad *out_mad)
+{
+	struct ib_pma_mad *pmp = (struct ib_pma_mad *)out_mad;
+	int ret;
+
+	*out_mad = *in_mad;
+	if (pmp->mad_hdr.class_version != 1) {
+		pmp->mad_hdr.status |= IB_SMP_UNSUP_VERSION;
+		ret = reply((struct ib_smp *) pmp);
+		goto bail;
+	}
+
+	switch (pmp->mad_hdr.method) {
+	case IB_MGMT_METHOD_GET:
+		switch (pmp->mad_hdr.attr_id) {
+		case IB_PMA_CLASS_PORT_INFO:
+			ret = recv_pma_get_classportinfo(pmp);
+			goto bail;
+		case IB_PMA_PORT_SAMPLES_CONTROL:
+			ret = recv_pma_get_portsamplescontrol(pmp, ibdev,
+							      port_num);
+			goto bail;
+		case IB_PMA_PORT_SAMPLES_RESULT:
+			ret = recv_pma_get_portsamplesresult(pmp, ibdev);
+			goto bail;
+		case IB_PMA_PORT_SAMPLES_RESULT_EXT:
+			ret = recv_pma_get_portsamplesresult_ext(pmp,
+								 ibdev);
+			goto bail;
+		case IB_PMA_PORT_COUNTERS:
+			ret = recv_pma_get_portcounters(pmp, ibdev,
+							port_num);
+			goto bail;
+		case IB_PMA_PORT_COUNTERS_EXT:
+			ret = recv_pma_get_portcounters_ext(pmp, ibdev,
+							    port_num);
+			goto bail;
+		default:
+			pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR;
+			ret = reply((struct ib_smp *) pmp);
+			goto bail;
+		}
+
+	case IB_MGMT_METHOD_SET:
+		switch (pmp->mad_hdr.attr_id) {
+		case IB_PMA_PORT_SAMPLES_CONTROL:
+			ret = recv_pma_set_portsamplescontrol(pmp, ibdev,
+							      port_num);
+			goto bail;
+		case IB_PMA_PORT_COUNTERS:
+			ret = recv_pma_set_portcounters(pmp, ibdev,
+							port_num);
+			goto bail;
+		case IB_PMA_PORT_COUNTERS_EXT:
+			ret = recv_pma_set_portcounters_ext(pmp, ibdev,
+							    port_num);
+			goto bail;
+		default:
+			pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR;
+			ret = reply((struct ib_smp *) pmp);
+			goto bail;
+		}
+
+	case IB_MGMT_METHOD_GET_RESP:
+		/*
+		 * The ib_mad module will call us to process responses
+		 * before checking for other consumers.
+		 * Just tell the caller to process it normally.
+		 */
+		ret = IB_MAD_RESULT_SUCCESS;
+		goto bail;
+	default:
+		pmp->mad_hdr.status |= IB_SMP_UNSUP_METHOD;
+		ret = reply((struct ib_smp *) pmp);
+	}
+
+bail:
+	return ret;
+}
+
+/**
+ * ipath_process_mad - process an incoming MAD packet
+ * @ibdev: the infiniband device this packet came in on
+ * @mad_flags: MAD flags
+ * @port_num: the port number this packet came in on
+ * @in_wc: the work completion entry for this packet
+ * @in_grh: the global route header for this packet
+ * @in_mad: the incoming MAD
+ * @out_mad: any outgoing MAD reply
+ *
+ * Returns IB_MAD_RESULT_SUCCESS if this is a MAD that we are not
+ * interested in processing.
+ *
+ * Note that the verbs framework has already done the MAD sanity checks,
+ * and hop count/pointer updating for IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE
+ * MADs.
+ *
+ * This is called by the ib_mad module.
+ */
+int ipath_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
+		      struct ib_wc *in_wc, struct ib_grh *in_grh,
+		      struct ib_mad *in_mad, struct ib_mad *out_mad)
+{
+	int ret;
+
+	switch (in_mad->mad_hdr.mgmt_class) {
+	case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE:
+	case IB_MGMT_CLASS_SUBN_LID_ROUTED:
+		ret = process_subn(ibdev, mad_flags, port_num,
+				   in_mad, out_mad);
+		goto bail;
+	case IB_MGMT_CLASS_PERF_MGMT:
+		ret = process_perf(ibdev, port_num, in_mad, out_mad);
+		goto bail;
+	default:
+		ret = IB_MAD_RESULT_SUCCESS;
+	}
+
+bail:
+	return ret;
+}
diff --git a/drivers/infiniband/hw/ipath/ipath_mmap.c b/drivers/infiniband/hw/ipath/ipath_mmap.c
new file mode 100644
index 0000000..e732742
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_mmap.c
@@ -0,0 +1,174 @@
+/*
+ * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <asm/pgtable.h>
+
+#include "ipath_verbs.h"
+
+/**
+ * ipath_release_mmap_info - free mmap info structure
+ * @ref: a pointer to the kref within struct ipath_mmap_info
+ */
+void ipath_release_mmap_info(struct kref *ref)
+{
+	struct ipath_mmap_info *ip =
+		container_of(ref, struct ipath_mmap_info, ref);
+	struct ipath_ibdev *dev = to_idev(ip->context->device);
+
+	spin_lock_irq(&dev->pending_lock);
+	list_del(&ip->pending_mmaps);
+	spin_unlock_irq(&dev->pending_lock);
+
+	vfree(ip->obj);
+	kfree(ip);
+}
+
+/*
+ * open and close keep track of how many times the CQ is mapped,
+ * to avoid releasing it.
+ */
+static void ipath_vma_open(struct vm_area_struct *vma)
+{
+	struct ipath_mmap_info *ip = vma->vm_private_data;
+
+	kref_get(&ip->ref);
+}
+
+static void ipath_vma_close(struct vm_area_struct *vma)
+{
+	struct ipath_mmap_info *ip = vma->vm_private_data;
+
+	kref_put(&ip->ref, ipath_release_mmap_info);
+}
+
+static const struct vm_operations_struct ipath_vm_ops = {
+	.open =     ipath_vma_open,
+	.close =    ipath_vma_close,
+};
+
+/**
+ * ipath_mmap - create a new mmap region
+ * @context: the IB user context of the process making the mmap() call
+ * @vma: the VMA to be initialized
+ * Return zero if the mmap is OK. Otherwise, return an errno.
+ */
+int ipath_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
+{
+	struct ipath_ibdev *dev = to_idev(context->device);
+	unsigned long offset = vma->vm_pgoff << PAGE_SHIFT;
+	unsigned long size = vma->vm_end - vma->vm_start;
+	struct ipath_mmap_info *ip, *pp;
+	int ret = -EINVAL;
+
+	/*
+	 * Search the device's list of objects waiting for a mmap call.
+	 * Normally, this list is very short since a call to create a
+	 * CQ, QP, or SRQ is soon followed by a call to mmap().
+	 */
+	spin_lock_irq(&dev->pending_lock);
+	list_for_each_entry_safe(ip, pp, &dev->pending_mmaps,
+				 pending_mmaps) {
+		/* Only the creator is allowed to mmap the object */
+		if (context != ip->context || (__u64) offset != ip->offset)
+			continue;
+		/* Don't allow a mmap larger than the object. */
+		if (size > ip->size)
+			break;
+
+		list_del_init(&ip->pending_mmaps);
+		spin_unlock_irq(&dev->pending_lock);
+
+		ret = remap_vmalloc_range(vma, ip->obj, 0);
+		if (ret)
+			goto done;
+		vma->vm_ops = &ipath_vm_ops;
+		vma->vm_private_data = ip;
+		ipath_vma_open(vma);
+		goto done;
+	}
+	spin_unlock_irq(&dev->pending_lock);
+done:
+	return ret;
+}
+
+/*
+ * Allocate information for ipath_mmap
+ */
+struct ipath_mmap_info *ipath_create_mmap_info(struct ipath_ibdev *dev,
+					       u32 size,
+					       struct ib_ucontext *context,
+					       void *obj) {
+	struct ipath_mmap_info *ip;
+
+	ip = kmalloc(sizeof *ip, GFP_KERNEL);
+	if (!ip)
+		goto bail;
+
+	size = PAGE_ALIGN(size);
+
+	spin_lock_irq(&dev->mmap_offset_lock);
+	if (dev->mmap_offset == 0)
+		dev->mmap_offset = PAGE_SIZE;
+	ip->offset = dev->mmap_offset;
+	dev->mmap_offset += size;
+	spin_unlock_irq(&dev->mmap_offset_lock);
+
+	INIT_LIST_HEAD(&ip->pending_mmaps);
+	ip->size = size;
+	ip->context = context;
+	ip->obj = obj;
+	kref_init(&ip->ref);
+
+bail:
+	return ip;
+}
+
+void ipath_update_mmap_info(struct ipath_ibdev *dev,
+			    struct ipath_mmap_info *ip,
+			    u32 size, void *obj) {
+	size = PAGE_ALIGN(size);
+
+	spin_lock_irq(&dev->mmap_offset_lock);
+	if (dev->mmap_offset == 0)
+		dev->mmap_offset = PAGE_SIZE;
+	ip->offset = dev->mmap_offset;
+	dev->mmap_offset += size;
+	spin_unlock_irq(&dev->mmap_offset_lock);
+
+	ip->size = size;
+	ip->obj = obj;
+}
diff --git a/drivers/infiniband/hw/ipath/ipath_mr.c b/drivers/infiniband/hw/ipath/ipath_mr.c
new file mode 100644
index 0000000..5e61e9b
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_mr.c
@@ -0,0 +1,425 @@
+/*
+ * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/slab.h>
+
+#include <rdma/ib_umem.h>
+#include <rdma/ib_pack.h>
+#include <rdma/ib_smi.h>
+
+#include "ipath_verbs.h"
+
+/* Fast memory region */
+struct ipath_fmr {
+	struct ib_fmr ibfmr;
+	u8 page_shift;
+	struct ipath_mregion mr;        /* must be last */
+};
+
+static inline struct ipath_fmr *to_ifmr(struct ib_fmr *ibfmr)
+{
+	return container_of(ibfmr, struct ipath_fmr, ibfmr);
+}
+
+/**
+ * ipath_get_dma_mr - get a DMA memory region
+ * @pd: protection domain for this memory region
+ * @acc: access flags
+ *
+ * Returns the memory region on success, otherwise returns an errno.
+ * Note that all DMA addresses should be created via the
+ * struct ib_dma_mapping_ops functions (see ipath_dma.c).
+ */
+struct ib_mr *ipath_get_dma_mr(struct ib_pd *pd, int acc)
+{
+	struct ipath_mr *mr;
+	struct ib_mr *ret;
+
+	mr = kzalloc(sizeof *mr, GFP_KERNEL);
+	if (!mr) {
+		ret = ERR_PTR(-ENOMEM);
+		goto bail;
+	}
+
+	mr->mr.access_flags = acc;
+	ret = &mr->ibmr;
+
+bail:
+	return ret;
+}
+
+static struct ipath_mr *alloc_mr(int count,
+				 struct ipath_lkey_table *lk_table)
+{
+	struct ipath_mr *mr;
+	int m, i = 0;
+
+	/* Allocate struct plus pointers to first level page tables. */
+	m = (count + IPATH_SEGSZ - 1) / IPATH_SEGSZ;
+	mr = kmalloc(sizeof *mr + m * sizeof mr->mr.map[0], GFP_KERNEL);
+	if (!mr)
+		goto done;
+
+	/* Allocate first level page tables. */
+	for (; i < m; i++) {
+		mr->mr.map[i] = kmalloc(sizeof *mr->mr.map[0], GFP_KERNEL);
+		if (!mr->mr.map[i])
+			goto bail;
+	}
+	mr->mr.mapsz = m;
+
+	/*
+	 * ib_reg_phys_mr() will initialize mr->ibmr except for
+	 * lkey and rkey.
+	 */
+	if (!ipath_alloc_lkey(lk_table, &mr->mr))
+		goto bail;
+	mr->ibmr.rkey = mr->ibmr.lkey = mr->mr.lkey;
+
+	goto done;
+
+bail:
+	while (i) {
+		i--;
+		kfree(mr->mr.map[i]);
+	}
+	kfree(mr);
+	mr = NULL;
+
+done:
+	return mr;
+}
+
+/**
+ * ipath_reg_phys_mr - register a physical memory region
+ * @pd: protection domain for this memory region
+ * @buffer_list: pointer to the list of physical buffers to register
+ * @num_phys_buf: the number of physical buffers to register
+ * @iova_start: the starting address passed over IB which maps to this MR
+ *
+ * Returns the memory region on success, otherwise returns an errno.
+ */
+struct ib_mr *ipath_reg_phys_mr(struct ib_pd *pd,
+				struct ib_phys_buf *buffer_list,
+				int num_phys_buf, int acc, u64 *iova_start)
+{
+	struct ipath_mr *mr;
+	int n, m, i;
+	struct ib_mr *ret;
+
+	mr = alloc_mr(num_phys_buf, &to_idev(pd->device)->lk_table);
+	if (mr == NULL) {
+		ret = ERR_PTR(-ENOMEM);
+		goto bail;
+	}
+
+	mr->mr.pd = pd;
+	mr->mr.user_base = *iova_start;
+	mr->mr.iova = *iova_start;
+	mr->mr.length = 0;
+	mr->mr.offset = 0;
+	mr->mr.access_flags = acc;
+	mr->mr.max_segs = num_phys_buf;
+	mr->umem = NULL;
+
+	m = 0;
+	n = 0;
+	for (i = 0; i < num_phys_buf; i++) {
+		mr->mr.map[m]->segs[n].vaddr = (void *) buffer_list[i].addr;
+		mr->mr.map[m]->segs[n].length = buffer_list[i].size;
+		mr->mr.length += buffer_list[i].size;
+		n++;
+		if (n == IPATH_SEGSZ) {
+			m++;
+			n = 0;
+		}
+	}
+
+	ret = &mr->ibmr;
+
+bail:
+	return ret;
+}
+
+/**
+ * ipath_reg_user_mr - register a userspace memory region
+ * @pd: protection domain for this memory region
+ * @start: starting userspace address
+ * @length: length of region to register
+ * @virt_addr: virtual address to use (from HCA's point of view)
+ * @mr_access_flags: access flags for this memory region
+ * @udata: unused by the InfiniPath driver
+ *
+ * Returns the memory region on success, otherwise returns an errno.
+ */
+struct ib_mr *ipath_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+				u64 virt_addr, int mr_access_flags,
+				struct ib_udata *udata)
+{
+	struct ipath_mr *mr;
+	struct ib_umem *umem;
+	int n, m, entry;
+	struct scatterlist *sg;
+	struct ib_mr *ret;
+
+	if (length == 0) {
+		ret = ERR_PTR(-EINVAL);
+		goto bail;
+	}
+
+	umem = ib_umem_get(pd->uobject->context, start, length,
+			   mr_access_flags, 0);
+	if (IS_ERR(umem))
+		return (void *) umem;
+
+	n = umem->nmap;
+	mr = alloc_mr(n, &to_idev(pd->device)->lk_table);
+	if (!mr) {
+		ret = ERR_PTR(-ENOMEM);
+		ib_umem_release(umem);
+		goto bail;
+	}
+
+	mr->mr.pd = pd;
+	mr->mr.user_base = start;
+	mr->mr.iova = virt_addr;
+	mr->mr.length = length;
+	mr->mr.offset = umem->offset;
+	mr->mr.access_flags = mr_access_flags;
+	mr->mr.max_segs = n;
+	mr->umem = umem;
+
+	m = 0;
+	n = 0;
+	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
+		void *vaddr;
+
+		vaddr = page_address(sg_page(sg));
+		if (!vaddr) {
+			ret = ERR_PTR(-EINVAL);
+			goto bail;
+		}
+		mr->mr.map[m]->segs[n].vaddr = vaddr;
+		mr->mr.map[m]->segs[n].length = umem->page_size;
+		n++;
+		if (n == IPATH_SEGSZ) {
+			m++;
+			n = 0;
+		}
+	}
+	ret = &mr->ibmr;
+
+bail:
+	return ret;
+}
+
+/**
+ * ipath_dereg_mr - unregister and free a memory region
+ * @ibmr: the memory region to free
+ *
+ * Returns 0 on success.
+ *
+ * Note that this is called to free MRs created by ipath_get_dma_mr()
+ * or ipath_reg_user_mr().
+ */
+int ipath_dereg_mr(struct ib_mr *ibmr)
+{
+	struct ipath_mr *mr = to_imr(ibmr);
+	int i;
+
+	ipath_free_lkey(&to_idev(ibmr->device)->lk_table, ibmr->lkey);
+	i = mr->mr.mapsz;
+	while (i) {
+		i--;
+		kfree(mr->mr.map[i]);
+	}
+
+	if (mr->umem)
+		ib_umem_release(mr->umem);
+
+	kfree(mr);
+	return 0;
+}
+
+/**
+ * ipath_alloc_fmr - allocate a fast memory region
+ * @pd: the protection domain for this memory region
+ * @mr_access_flags: access flags for this memory region
+ * @fmr_attr: fast memory region attributes
+ *
+ * Returns the memory region on success, otherwise returns an errno.
+ */
+struct ib_fmr *ipath_alloc_fmr(struct ib_pd *pd, int mr_access_flags,
+			       struct ib_fmr_attr *fmr_attr)
+{
+	struct ipath_fmr *fmr;
+	int m, i = 0;
+	struct ib_fmr *ret;
+
+	/* Allocate struct plus pointers to first level page tables. */
+	m = (fmr_attr->max_pages + IPATH_SEGSZ - 1) / IPATH_SEGSZ;
+	fmr = kmalloc(sizeof *fmr + m * sizeof fmr->mr.map[0], GFP_KERNEL);
+	if (!fmr)
+		goto bail;
+
+	/* Allocate first level page tables. */
+	for (; i < m; i++) {
+		fmr->mr.map[i] = kmalloc(sizeof *fmr->mr.map[0],
+					 GFP_KERNEL);
+		if (!fmr->mr.map[i])
+			goto bail;
+	}
+	fmr->mr.mapsz = m;
+
+	/*
+	 * ib_alloc_fmr() will initialize fmr->ibfmr except for lkey &
+	 * rkey.
+	 */
+	if (!ipath_alloc_lkey(&to_idev(pd->device)->lk_table, &fmr->mr))
+		goto bail;
+	fmr->ibfmr.rkey = fmr->ibfmr.lkey = fmr->mr.lkey;
+	/*
+	 * Resources are allocated but no valid mapping (RKEY can't be
+	 * used).
+	 */
+	fmr->mr.pd = pd;
+	fmr->mr.user_base = 0;
+	fmr->mr.iova = 0;
+	fmr->mr.length = 0;
+	fmr->mr.offset = 0;
+	fmr->mr.access_flags = mr_access_flags;
+	fmr->mr.max_segs = fmr_attr->max_pages;
+	fmr->page_shift = fmr_attr->page_shift;
+
+	ret = &fmr->ibfmr;
+	goto done;
+
+bail:
+	while (i)
+		kfree(fmr->mr.map[--i]);
+	kfree(fmr);
+	ret = ERR_PTR(-ENOMEM);
+
+done:
+	return ret;
+}
+
+/**
+ * ipath_map_phys_fmr - set up a fast memory region
+ * @ibmfr: the fast memory region to set up
+ * @page_list: the list of pages to associate with the fast memory region
+ * @list_len: the number of pages to associate with the fast memory region
+ * @iova: the virtual address of the start of the fast memory region
+ *
+ * This may be called from interrupt context.
+ */
+
+int ipath_map_phys_fmr(struct ib_fmr *ibfmr, u64 * page_list,
+		       int list_len, u64 iova)
+{
+	struct ipath_fmr *fmr = to_ifmr(ibfmr);
+	struct ipath_lkey_table *rkt;
+	unsigned long flags;
+	int m, n, i;
+	u32 ps;
+	int ret;
+
+	if (list_len > fmr->mr.max_segs) {
+		ret = -EINVAL;
+		goto bail;
+	}
+	rkt = &to_idev(ibfmr->device)->lk_table;
+	spin_lock_irqsave(&rkt->lock, flags);
+	fmr->mr.user_base = iova;
+	fmr->mr.iova = iova;
+	ps = 1 << fmr->page_shift;
+	fmr->mr.length = list_len * ps;
+	m = 0;
+	n = 0;
+	ps = 1 << fmr->page_shift;
+	for (i = 0; i < list_len; i++) {
+		fmr->mr.map[m]->segs[n].vaddr = (void *) page_list[i];
+		fmr->mr.map[m]->segs[n].length = ps;
+		if (++n == IPATH_SEGSZ) {
+			m++;
+			n = 0;
+		}
+	}
+	spin_unlock_irqrestore(&rkt->lock, flags);
+	ret = 0;
+
+bail:
+	return ret;
+}
+
+/**
+ * ipath_unmap_fmr - unmap fast memory regions
+ * @fmr_list: the list of fast memory regions to unmap
+ *
+ * Returns 0 on success.
+ */
+int ipath_unmap_fmr(struct list_head *fmr_list)
+{
+	struct ipath_fmr *fmr;
+	struct ipath_lkey_table *rkt;
+	unsigned long flags;
+
+	list_for_each_entry(fmr, fmr_list, ibfmr.list) {
+		rkt = &to_idev(fmr->ibfmr.device)->lk_table;
+		spin_lock_irqsave(&rkt->lock, flags);
+		fmr->mr.user_base = 0;
+		fmr->mr.iova = 0;
+		fmr->mr.length = 0;
+		spin_unlock_irqrestore(&rkt->lock, flags);
+	}
+	return 0;
+}
+
+/**
+ * ipath_dealloc_fmr - deallocate a fast memory region
+ * @ibfmr: the fast memory region to deallocate
+ *
+ * Returns 0 on success.
+ */
+int ipath_dealloc_fmr(struct ib_fmr *ibfmr)
+{
+	struct ipath_fmr *fmr = to_ifmr(ibfmr);
+	int i;
+
+	ipath_free_lkey(&to_idev(ibfmr->device)->lk_table, ibfmr->lkey);
+	i = fmr->mr.mapsz;
+	while (i)
+		kfree(fmr->mr.map[--i]);
+	kfree(fmr);
+	return 0;
+}
diff --git a/drivers/infiniband/hw/ipath/ipath_qp.c b/drivers/infiniband/hw/ipath/ipath_qp.c
new file mode 100644
index 0000000..face876
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_qp.c
@@ -0,0 +1,1080 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/err.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+#include "ipath_verbs.h"
+#include "ipath_kernel.h"
+
+#define BITS_PER_PAGE		(PAGE_SIZE*BITS_PER_BYTE)
+#define BITS_PER_PAGE_MASK	(BITS_PER_PAGE-1)
+#define mk_qpn(qpt, map, off)	(((map) - (qpt)->map) * BITS_PER_PAGE + \
+				 (off))
+#define find_next_offset(map, off) find_next_zero_bit((map)->page, \
+						      BITS_PER_PAGE, off)
+
+/*
+ * Convert the AETH credit code into the number of credits.
+ */
+static u32 credit_table[31] = {
+	0,			/* 0 */
+	1,			/* 1 */
+	2,			/* 2 */
+	3,			/* 3 */
+	4,			/* 4 */
+	6,			/* 5 */
+	8,			/* 6 */
+	12,			/* 7 */
+	16,			/* 8 */
+	24,			/* 9 */
+	32,			/* A */
+	48,			/* B */
+	64,			/* C */
+	96,			/* D */
+	128,			/* E */
+	192,			/* F */
+	256,			/* 10 */
+	384,			/* 11 */
+	512,			/* 12 */
+	768,			/* 13 */
+	1024,			/* 14 */
+	1536,			/* 15 */
+	2048,			/* 16 */
+	3072,			/* 17 */
+	4096,			/* 18 */
+	6144,			/* 19 */
+	8192,			/* 1A */
+	12288,			/* 1B */
+	16384,			/* 1C */
+	24576,			/* 1D */
+	32768			/* 1E */
+};
+
+
+static void get_map_page(struct ipath_qp_table *qpt, struct qpn_map *map)
+{
+	unsigned long page = get_zeroed_page(GFP_KERNEL);
+	unsigned long flags;
+
+	/*
+	 * Free the page if someone raced with us installing it.
+	 */
+
+	spin_lock_irqsave(&qpt->lock, flags);
+	if (map->page)
+		free_page(page);
+	else
+		map->page = (void *)page;
+	spin_unlock_irqrestore(&qpt->lock, flags);
+}
+
+
+static int alloc_qpn(struct ipath_qp_table *qpt, enum ib_qp_type type)
+{
+	u32 i, offset, max_scan, qpn;
+	struct qpn_map *map;
+	u32 ret = -1;
+
+	if (type == IB_QPT_SMI)
+		ret = 0;
+	else if (type == IB_QPT_GSI)
+		ret = 1;
+
+	if (ret != -1) {
+		map = &qpt->map[0];
+		if (unlikely(!map->page)) {
+			get_map_page(qpt, map);
+			if (unlikely(!map->page)) {
+				ret = -ENOMEM;
+				goto bail;
+			}
+		}
+		if (!test_and_set_bit(ret, map->page))
+			atomic_dec(&map->n_free);
+		else
+			ret = -EBUSY;
+		goto bail;
+	}
+
+	qpn = qpt->last + 1;
+	if (qpn >= QPN_MAX)
+		qpn = 2;
+	offset = qpn & BITS_PER_PAGE_MASK;
+	map = &qpt->map[qpn / BITS_PER_PAGE];
+	max_scan = qpt->nmaps - !offset;
+	for (i = 0;;) {
+		if (unlikely(!map->page)) {
+			get_map_page(qpt, map);
+			if (unlikely(!map->page))
+				break;
+		}
+		if (likely(atomic_read(&map->n_free))) {
+			do {
+				if (!test_and_set_bit(offset, map->page)) {
+					atomic_dec(&map->n_free);
+					qpt->last = qpn;
+					ret = qpn;
+					goto bail;
+				}
+				offset = find_next_offset(map, offset);
+				qpn = mk_qpn(qpt, map, offset);
+				/*
+				 * This test differs from alloc_pidmap().
+				 * If find_next_offset() does find a zero
+				 * bit, we don't need to check for QPN
+				 * wrapping around past our starting QPN.
+				 * We just need to be sure we don't loop
+				 * forever.
+				 */
+			} while (offset < BITS_PER_PAGE && qpn < QPN_MAX);
+		}
+		/*
+		 * In order to keep the number of pages allocated to a
+		 * minimum, we scan the all existing pages before increasing
+		 * the size of the bitmap table.
+		 */
+		if (++i > max_scan) {
+			if (qpt->nmaps == QPNMAP_ENTRIES)
+				break;
+			map = &qpt->map[qpt->nmaps++];
+			offset = 0;
+		} else if (map < &qpt->map[qpt->nmaps]) {
+			++map;
+			offset = 0;
+		} else {
+			map = &qpt->map[0];
+			offset = 2;
+		}
+		qpn = mk_qpn(qpt, map, offset);
+	}
+
+	ret = -ENOMEM;
+
+bail:
+	return ret;
+}
+
+static void free_qpn(struct ipath_qp_table *qpt, u32 qpn)
+{
+	struct qpn_map *map;
+
+	map = qpt->map + qpn / BITS_PER_PAGE;
+	if (map->page)
+		clear_bit(qpn & BITS_PER_PAGE_MASK, map->page);
+	atomic_inc(&map->n_free);
+}
+
+/**
+ * ipath_alloc_qpn - allocate a QP number
+ * @qpt: the QP table
+ * @qp: the QP
+ * @type: the QP type (IB_QPT_SMI and IB_QPT_GSI are special)
+ *
+ * Allocate the next available QPN and put the QP into the hash table.
+ * The hash table holds a reference to the QP.
+ */
+static int ipath_alloc_qpn(struct ipath_qp_table *qpt, struct ipath_qp *qp,
+			   enum ib_qp_type type)
+{
+	unsigned long flags;
+	int ret;
+
+	ret = alloc_qpn(qpt, type);
+	if (ret < 0)
+		goto bail;
+	qp->ibqp.qp_num = ret;
+
+	/* Add the QP to the hash table. */
+	spin_lock_irqsave(&qpt->lock, flags);
+
+	ret %= qpt->max;
+	qp->next = qpt->table[ret];
+	qpt->table[ret] = qp;
+	atomic_inc(&qp->refcount);
+
+	spin_unlock_irqrestore(&qpt->lock, flags);
+	ret = 0;
+
+bail:
+	return ret;
+}
+
+/**
+ * ipath_free_qp - remove a QP from the QP table
+ * @qpt: the QP table
+ * @qp: the QP to remove
+ *
+ * Remove the QP from the table so it can't be found asynchronously by
+ * the receive interrupt routine.
+ */
+static void ipath_free_qp(struct ipath_qp_table *qpt, struct ipath_qp *qp)
+{
+	struct ipath_qp *q, **qpp;
+	unsigned long flags;
+
+	spin_lock_irqsave(&qpt->lock, flags);
+
+	/* Remove QP from the hash table. */
+	qpp = &qpt->table[qp->ibqp.qp_num % qpt->max];
+	for (; (q = *qpp) != NULL; qpp = &q->next) {
+		if (q == qp) {
+			*qpp = qp->next;
+			qp->next = NULL;
+			atomic_dec(&qp->refcount);
+			break;
+		}
+	}
+
+	spin_unlock_irqrestore(&qpt->lock, flags);
+}
+
+/**
+ * ipath_free_all_qps - check for QPs still in use
+ * @qpt: the QP table to empty
+ *
+ * There should not be any QPs still in use.
+ * Free memory for table.
+ */
+unsigned ipath_free_all_qps(struct ipath_qp_table *qpt)
+{
+	unsigned long flags;
+	struct ipath_qp *qp;
+	u32 n, qp_inuse = 0;
+
+	spin_lock_irqsave(&qpt->lock, flags);
+	for (n = 0; n < qpt->max; n++) {
+		qp = qpt->table[n];
+		qpt->table[n] = NULL;
+
+		for (; qp; qp = qp->next)
+			qp_inuse++;
+	}
+	spin_unlock_irqrestore(&qpt->lock, flags);
+
+	for (n = 0; n < ARRAY_SIZE(qpt->map); n++)
+		if (qpt->map[n].page)
+			free_page((unsigned long) qpt->map[n].page);
+	return qp_inuse;
+}
+
+/**
+ * ipath_lookup_qpn - return the QP with the given QPN
+ * @qpt: the QP table
+ * @qpn: the QP number to look up
+ *
+ * The caller is responsible for decrementing the QP reference count
+ * when done.
+ */
+struct ipath_qp *ipath_lookup_qpn(struct ipath_qp_table *qpt, u32 qpn)
+{
+	unsigned long flags;
+	struct ipath_qp *qp;
+
+	spin_lock_irqsave(&qpt->lock, flags);
+
+	for (qp = qpt->table[qpn % qpt->max]; qp; qp = qp->next) {
+		if (qp->ibqp.qp_num == qpn) {
+			atomic_inc(&qp->refcount);
+			break;
+		}
+	}
+
+	spin_unlock_irqrestore(&qpt->lock, flags);
+	return qp;
+}
+
+/**
+ * ipath_reset_qp - initialize the QP state to the reset state
+ * @qp: the QP to reset
+ * @type: the QP type
+ */
+static void ipath_reset_qp(struct ipath_qp *qp, enum ib_qp_type type)
+{
+	qp->remote_qpn = 0;
+	qp->qkey = 0;
+	qp->qp_access_flags = 0;
+	atomic_set(&qp->s_dma_busy, 0);
+	qp->s_flags &= IPATH_S_SIGNAL_REQ_WR;
+	qp->s_hdrwords = 0;
+	qp->s_wqe = NULL;
+	qp->s_pkt_delay = 0;
+	qp->s_draining = 0;
+	qp->s_psn = 0;
+	qp->r_psn = 0;
+	qp->r_msn = 0;
+	if (type == IB_QPT_RC) {
+		qp->s_state = IB_OPCODE_RC_SEND_LAST;
+		qp->r_state = IB_OPCODE_RC_SEND_LAST;
+	} else {
+		qp->s_state = IB_OPCODE_UC_SEND_LAST;
+		qp->r_state = IB_OPCODE_UC_SEND_LAST;
+	}
+	qp->s_ack_state = IB_OPCODE_RC_ACKNOWLEDGE;
+	qp->r_nak_state = 0;
+	qp->r_aflags = 0;
+	qp->r_flags = 0;
+	qp->s_rnr_timeout = 0;
+	qp->s_head = 0;
+	qp->s_tail = 0;
+	qp->s_cur = 0;
+	qp->s_last = 0;
+	qp->s_ssn = 1;
+	qp->s_lsn = 0;
+	memset(qp->s_ack_queue, 0, sizeof(qp->s_ack_queue));
+	qp->r_head_ack_queue = 0;
+	qp->s_tail_ack_queue = 0;
+	qp->s_num_rd_atomic = 0;
+	if (qp->r_rq.wq) {
+		qp->r_rq.wq->head = 0;
+		qp->r_rq.wq->tail = 0;
+	}
+}
+
+/**
+ * ipath_error_qp - put a QP into the error state
+ * @qp: the QP to put into the error state
+ * @err: the receive completion error to signal if a RWQE is active
+ *
+ * Flushes both send and receive work queues.
+ * Returns true if last WQE event should be generated.
+ * The QP s_lock should be held and interrupts disabled.
+ * If we are already in error state, just return.
+ */
+
+int ipath_error_qp(struct ipath_qp *qp, enum ib_wc_status err)
+{
+	struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+	struct ib_wc wc;
+	int ret = 0;
+
+	if (qp->state == IB_QPS_ERR)
+		goto bail;
+
+	qp->state = IB_QPS_ERR;
+
+	spin_lock(&dev->pending_lock);
+	if (!list_empty(&qp->timerwait))
+		list_del_init(&qp->timerwait);
+	if (!list_empty(&qp->piowait))
+		list_del_init(&qp->piowait);
+	spin_unlock(&dev->pending_lock);
+
+	/* Schedule the sending tasklet to drain the send work queue. */
+	if (qp->s_last != qp->s_head)
+		ipath_schedule_send(qp);
+
+	memset(&wc, 0, sizeof(wc));
+	wc.qp = &qp->ibqp;
+	wc.opcode = IB_WC_RECV;
+
+	if (test_and_clear_bit(IPATH_R_WRID_VALID, &qp->r_aflags)) {
+		wc.wr_id = qp->r_wr_id;
+		wc.status = err;
+		ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1);
+	}
+	wc.status = IB_WC_WR_FLUSH_ERR;
+
+	if (qp->r_rq.wq) {
+		struct ipath_rwq *wq;
+		u32 head;
+		u32 tail;
+
+		spin_lock(&qp->r_rq.lock);
+
+		/* sanity check pointers before trusting them */
+		wq = qp->r_rq.wq;
+		head = wq->head;
+		if (head >= qp->r_rq.size)
+			head = 0;
+		tail = wq->tail;
+		if (tail >= qp->r_rq.size)
+			tail = 0;
+		while (tail != head) {
+			wc.wr_id = get_rwqe_ptr(&qp->r_rq, tail)->wr_id;
+			if (++tail >= qp->r_rq.size)
+				tail = 0;
+			ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1);
+		}
+		wq->tail = tail;
+
+		spin_unlock(&qp->r_rq.lock);
+	} else if (qp->ibqp.event_handler)
+		ret = 1;
+
+bail:
+	return ret;
+}
+
+/**
+ * ipath_modify_qp - modify the attributes of a queue pair
+ * @ibqp: the queue pair who's attributes we're modifying
+ * @attr: the new attributes
+ * @attr_mask: the mask of attributes to modify
+ * @udata: user data for ipathverbs.so
+ *
+ * Returns 0 on success, otherwise returns an errno.
+ */
+int ipath_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+		    int attr_mask, struct ib_udata *udata)
+{
+	struct ipath_ibdev *dev = to_idev(ibqp->device);
+	struct ipath_qp *qp = to_iqp(ibqp);
+	enum ib_qp_state cur_state, new_state;
+	int lastwqe = 0;
+	int ret;
+
+	spin_lock_irq(&qp->s_lock);
+
+	cur_state = attr_mask & IB_QP_CUR_STATE ?
+		attr->cur_qp_state : qp->state;
+	new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;
+
+	if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type,
+				attr_mask, IB_LINK_LAYER_UNSPECIFIED))
+		goto inval;
+
+	if (attr_mask & IB_QP_AV) {
+		if (attr->ah_attr.dlid == 0 ||
+		    attr->ah_attr.dlid >= IPATH_MULTICAST_LID_BASE)
+			goto inval;
+
+		if ((attr->ah_attr.ah_flags & IB_AH_GRH) &&
+		    (attr->ah_attr.grh.sgid_index > 1))
+			goto inval;
+	}
+
+	if (attr_mask & IB_QP_PKEY_INDEX)
+		if (attr->pkey_index >= ipath_get_npkeys(dev->dd))
+			goto inval;
+
+	if (attr_mask & IB_QP_MIN_RNR_TIMER)
+		if (attr->min_rnr_timer > 31)
+			goto inval;
+
+	if (attr_mask & IB_QP_PORT)
+		if (attr->port_num == 0 ||
+		    attr->port_num > ibqp->device->phys_port_cnt)
+			goto inval;
+
+	/*
+	 * don't allow invalid Path MTU values or greater than 2048
+	 * unless we are configured for a 4KB MTU
+	 */
+	if ((attr_mask & IB_QP_PATH_MTU) &&
+		(ib_mtu_enum_to_int(attr->path_mtu) == -1 ||
+		(attr->path_mtu > IB_MTU_2048 && !ipath_mtu4096)))
+		goto inval;
+
+	if (attr_mask & IB_QP_PATH_MIG_STATE)
+		if (attr->path_mig_state != IB_MIG_MIGRATED &&
+		    attr->path_mig_state != IB_MIG_REARM)
+			goto inval;
+
+	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
+		if (attr->max_dest_rd_atomic > IPATH_MAX_RDMA_ATOMIC)
+			goto inval;
+
+	switch (new_state) {
+	case IB_QPS_RESET:
+		if (qp->state != IB_QPS_RESET) {
+			qp->state = IB_QPS_RESET;
+			spin_lock(&dev->pending_lock);
+			if (!list_empty(&qp->timerwait))
+				list_del_init(&qp->timerwait);
+			if (!list_empty(&qp->piowait))
+				list_del_init(&qp->piowait);
+			spin_unlock(&dev->pending_lock);
+			qp->s_flags &= ~IPATH_S_ANY_WAIT;
+			spin_unlock_irq(&qp->s_lock);
+			/* Stop the sending tasklet */
+			tasklet_kill(&qp->s_task);
+			wait_event(qp->wait_dma, !atomic_read(&qp->s_dma_busy));
+			spin_lock_irq(&qp->s_lock);
+		}
+		ipath_reset_qp(qp, ibqp->qp_type);
+		break;
+
+	case IB_QPS_SQD:
+		qp->s_draining = qp->s_last != qp->s_cur;
+		qp->state = new_state;
+		break;
+
+	case IB_QPS_SQE:
+		if (qp->ibqp.qp_type == IB_QPT_RC)
+			goto inval;
+		qp->state = new_state;
+		break;
+
+	case IB_QPS_ERR:
+		lastwqe = ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+		break;
+
+	default:
+		qp->state = new_state;
+		break;
+	}
+
+	if (attr_mask & IB_QP_PKEY_INDEX)
+		qp->s_pkey_index = attr->pkey_index;
+
+	if (attr_mask & IB_QP_DEST_QPN)
+		qp->remote_qpn = attr->dest_qp_num;
+
+	if (attr_mask & IB_QP_SQ_PSN) {
+		qp->s_psn = qp->s_next_psn = attr->sq_psn;
+		qp->s_last_psn = qp->s_next_psn - 1;
+	}
+
+	if (attr_mask & IB_QP_RQ_PSN)
+		qp->r_psn = attr->rq_psn;
+
+	if (attr_mask & IB_QP_ACCESS_FLAGS)
+		qp->qp_access_flags = attr->qp_access_flags;
+
+	if (attr_mask & IB_QP_AV) {
+		qp->remote_ah_attr = attr->ah_attr;
+		qp->s_dmult = ipath_ib_rate_to_mult(attr->ah_attr.static_rate);
+	}
+
+	if (attr_mask & IB_QP_PATH_MTU)
+		qp->path_mtu = attr->path_mtu;
+
+	if (attr_mask & IB_QP_RETRY_CNT)
+		qp->s_retry = qp->s_retry_cnt = attr->retry_cnt;
+
+	if (attr_mask & IB_QP_RNR_RETRY) {
+		qp->s_rnr_retry = attr->rnr_retry;
+		if (qp->s_rnr_retry > 7)
+			qp->s_rnr_retry = 7;
+		qp->s_rnr_retry_cnt = qp->s_rnr_retry;
+	}
+
+	if (attr_mask & IB_QP_MIN_RNR_TIMER)
+		qp->r_min_rnr_timer = attr->min_rnr_timer;
+
+	if (attr_mask & IB_QP_TIMEOUT)
+		qp->timeout = attr->timeout;
+
+	if (attr_mask & IB_QP_QKEY)
+		qp->qkey = attr->qkey;
+
+	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
+		qp->r_max_rd_atomic = attr->max_dest_rd_atomic;
+
+	if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC)
+		qp->s_max_rd_atomic = attr->max_rd_atomic;
+
+	spin_unlock_irq(&qp->s_lock);
+
+	if (lastwqe) {
+		struct ib_event ev;
+
+		ev.device = qp->ibqp.device;
+		ev.element.qp = &qp->ibqp;
+		ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
+		qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
+	}
+	ret = 0;
+	goto bail;
+
+inval:
+	spin_unlock_irq(&qp->s_lock);
+	ret = -EINVAL;
+
+bail:
+	return ret;
+}
+
+int ipath_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+		   int attr_mask, struct ib_qp_init_attr *init_attr)
+{
+	struct ipath_qp *qp = to_iqp(ibqp);
+
+	attr->qp_state = qp->state;
+	attr->cur_qp_state = attr->qp_state;
+	attr->path_mtu = qp->path_mtu;
+	attr->path_mig_state = 0;
+	attr->qkey = qp->qkey;
+	attr->rq_psn = qp->r_psn;
+	attr->sq_psn = qp->s_next_psn;
+	attr->dest_qp_num = qp->remote_qpn;
+	attr->qp_access_flags = qp->qp_access_flags;
+	attr->cap.max_send_wr = qp->s_size - 1;
+	attr->cap.max_recv_wr = qp->ibqp.srq ? 0 : qp->r_rq.size - 1;
+	attr->cap.max_send_sge = qp->s_max_sge;
+	attr->cap.max_recv_sge = qp->r_rq.max_sge;
+	attr->cap.max_inline_data = 0;
+	attr->ah_attr = qp->remote_ah_attr;
+	memset(&attr->alt_ah_attr, 0, sizeof(attr->alt_ah_attr));
+	attr->pkey_index = qp->s_pkey_index;
+	attr->alt_pkey_index = 0;
+	attr->en_sqd_async_notify = 0;
+	attr->sq_draining = qp->s_draining;
+	attr->max_rd_atomic = qp->s_max_rd_atomic;
+	attr->max_dest_rd_atomic = qp->r_max_rd_atomic;
+	attr->min_rnr_timer = qp->r_min_rnr_timer;
+	attr->port_num = 1;
+	attr->timeout = qp->timeout;
+	attr->retry_cnt = qp->s_retry_cnt;
+	attr->rnr_retry = qp->s_rnr_retry_cnt;
+	attr->alt_port_num = 0;
+	attr->alt_timeout = 0;
+
+	init_attr->event_handler = qp->ibqp.event_handler;
+	init_attr->qp_context = qp->ibqp.qp_context;
+	init_attr->send_cq = qp->ibqp.send_cq;
+	init_attr->recv_cq = qp->ibqp.recv_cq;
+	init_attr->srq = qp->ibqp.srq;
+	init_attr->cap = attr->cap;
+	if (qp->s_flags & IPATH_S_SIGNAL_REQ_WR)
+		init_attr->sq_sig_type = IB_SIGNAL_REQ_WR;
+	else
+		init_attr->sq_sig_type = IB_SIGNAL_ALL_WR;
+	init_attr->qp_type = qp->ibqp.qp_type;
+	init_attr->port_num = 1;
+	return 0;
+}
+
+/**
+ * ipath_compute_aeth - compute the AETH (syndrome + MSN)
+ * @qp: the queue pair to compute the AETH for
+ *
+ * Returns the AETH.
+ */
+__be32 ipath_compute_aeth(struct ipath_qp *qp)
+{
+	u32 aeth = qp->r_msn & IPATH_MSN_MASK;
+
+	if (qp->ibqp.srq) {
+		/*
+		 * Shared receive queues don't generate credits.
+		 * Set the credit field to the invalid value.
+		 */
+		aeth |= IPATH_AETH_CREDIT_INVAL << IPATH_AETH_CREDIT_SHIFT;
+	} else {
+		u32 min, max, x;
+		u32 credits;
+		struct ipath_rwq *wq = qp->r_rq.wq;
+		u32 head;
+		u32 tail;
+
+		/* sanity check pointers before trusting them */
+		head = wq->head;
+		if (head >= qp->r_rq.size)
+			head = 0;
+		tail = wq->tail;
+		if (tail >= qp->r_rq.size)
+			tail = 0;
+		/*
+		 * Compute the number of credits available (RWQEs).
+		 * XXX Not holding the r_rq.lock here so there is a small
+		 * chance that the pair of reads are not atomic.
+		 */
+		credits = head - tail;
+		if ((int)credits < 0)
+			credits += qp->r_rq.size;
+		/*
+		 * Binary search the credit table to find the code to
+		 * use.
+		 */
+		min = 0;
+		max = 31;
+		for (;;) {
+			x = (min + max) / 2;
+			if (credit_table[x] == credits)
+				break;
+			if (credit_table[x] > credits)
+				max = x;
+			else if (min == x)
+				break;
+			else
+				min = x;
+		}
+		aeth |= x << IPATH_AETH_CREDIT_SHIFT;
+	}
+	return cpu_to_be32(aeth);
+}
+
+/**
+ * ipath_create_qp - create a queue pair for a device
+ * @ibpd: the protection domain who's device we create the queue pair for
+ * @init_attr: the attributes of the queue pair
+ * @udata: unused by InfiniPath
+ *
+ * Returns the queue pair on success, otherwise returns an errno.
+ *
+ * Called by the ib_create_qp() core verbs function.
+ */
+struct ib_qp *ipath_create_qp(struct ib_pd *ibpd,
+			      struct ib_qp_init_attr *init_attr,
+			      struct ib_udata *udata)
+{
+	struct ipath_qp *qp;
+	int err;
+	struct ipath_swqe *swq = NULL;
+	struct ipath_ibdev *dev;
+	size_t sz;
+	size_t sg_list_sz;
+	struct ib_qp *ret;
+
+	if (init_attr->create_flags) {
+		ret = ERR_PTR(-EINVAL);
+		goto bail;
+	}
+
+	if (init_attr->cap.max_send_sge > ib_ipath_max_sges ||
+	    init_attr->cap.max_send_wr > ib_ipath_max_qp_wrs) {
+		ret = ERR_PTR(-EINVAL);
+		goto bail;
+	}
+
+	/* Check receive queue parameters if no SRQ is specified. */
+	if (!init_attr->srq) {
+		if (init_attr->cap.max_recv_sge > ib_ipath_max_sges ||
+		    init_attr->cap.max_recv_wr > ib_ipath_max_qp_wrs) {
+			ret = ERR_PTR(-EINVAL);
+			goto bail;
+		}
+		if (init_attr->cap.max_send_sge +
+		    init_attr->cap.max_send_wr +
+		    init_attr->cap.max_recv_sge +
+		    init_attr->cap.max_recv_wr == 0) {
+			ret = ERR_PTR(-EINVAL);
+			goto bail;
+		}
+	}
+
+	switch (init_attr->qp_type) {
+	case IB_QPT_UC:
+	case IB_QPT_RC:
+	case IB_QPT_UD:
+	case IB_QPT_SMI:
+	case IB_QPT_GSI:
+		sz = sizeof(struct ipath_sge) *
+			init_attr->cap.max_send_sge +
+			sizeof(struct ipath_swqe);
+		swq = vmalloc((init_attr->cap.max_send_wr + 1) * sz);
+		if (swq == NULL) {
+			ret = ERR_PTR(-ENOMEM);
+			goto bail;
+		}
+		sz = sizeof(*qp);
+		sg_list_sz = 0;
+		if (init_attr->srq) {
+			struct ipath_srq *srq = to_isrq(init_attr->srq);
+
+			if (srq->rq.max_sge > 1)
+				sg_list_sz = sizeof(*qp->r_sg_list) *
+					(srq->rq.max_sge - 1);
+		} else if (init_attr->cap.max_recv_sge > 1)
+			sg_list_sz = sizeof(*qp->r_sg_list) *
+				(init_attr->cap.max_recv_sge - 1);
+		qp = kmalloc(sz + sg_list_sz, GFP_KERNEL);
+		if (!qp) {
+			ret = ERR_PTR(-ENOMEM);
+			goto bail_swq;
+		}
+		if (sg_list_sz && (init_attr->qp_type == IB_QPT_UD ||
+		    init_attr->qp_type == IB_QPT_SMI ||
+		    init_attr->qp_type == IB_QPT_GSI)) {
+			qp->r_ud_sg_list = kmalloc(sg_list_sz, GFP_KERNEL);
+			if (!qp->r_ud_sg_list) {
+				ret = ERR_PTR(-ENOMEM);
+				goto bail_qp;
+			}
+		} else
+			qp->r_ud_sg_list = NULL;
+		if (init_attr->srq) {
+			sz = 0;
+			qp->r_rq.size = 0;
+			qp->r_rq.max_sge = 0;
+			qp->r_rq.wq = NULL;
+			init_attr->cap.max_recv_wr = 0;
+			init_attr->cap.max_recv_sge = 0;
+		} else {
+			qp->r_rq.size = init_attr->cap.max_recv_wr + 1;
+			qp->r_rq.max_sge = init_attr->cap.max_recv_sge;
+			sz = (sizeof(struct ib_sge) * qp->r_rq.max_sge) +
+				sizeof(struct ipath_rwqe);
+			qp->r_rq.wq = vmalloc_user(sizeof(struct ipath_rwq) +
+					      qp->r_rq.size * sz);
+			if (!qp->r_rq.wq) {
+				ret = ERR_PTR(-ENOMEM);
+				goto bail_sg_list;
+			}
+		}
+
+		/*
+		 * ib_create_qp() will initialize qp->ibqp
+		 * except for qp->ibqp.qp_num.
+		 */
+		spin_lock_init(&qp->s_lock);
+		spin_lock_init(&qp->r_rq.lock);
+		atomic_set(&qp->refcount, 0);
+		init_waitqueue_head(&qp->wait);
+		init_waitqueue_head(&qp->wait_dma);
+		tasklet_init(&qp->s_task, ipath_do_send, (unsigned long)qp);
+		INIT_LIST_HEAD(&qp->piowait);
+		INIT_LIST_HEAD(&qp->timerwait);
+		qp->state = IB_QPS_RESET;
+		qp->s_wq = swq;
+		qp->s_size = init_attr->cap.max_send_wr + 1;
+		qp->s_max_sge = init_attr->cap.max_send_sge;
+		if (init_attr->sq_sig_type == IB_SIGNAL_REQ_WR)
+			qp->s_flags = IPATH_S_SIGNAL_REQ_WR;
+		else
+			qp->s_flags = 0;
+		dev = to_idev(ibpd->device);
+		err = ipath_alloc_qpn(&dev->qp_table, qp,
+				      init_attr->qp_type);
+		if (err) {
+			ret = ERR_PTR(err);
+			vfree(qp->r_rq.wq);
+			goto bail_sg_list;
+		}
+		qp->ip = NULL;
+		qp->s_tx = NULL;
+		ipath_reset_qp(qp, init_attr->qp_type);
+		break;
+
+	default:
+		/* Don't support raw QPs */
+		ret = ERR_PTR(-ENOSYS);
+		goto bail;
+	}
+
+	init_attr->cap.max_inline_data = 0;
+
+	/*
+	 * Return the address of the RWQ as the offset to mmap.
+	 * See ipath_mmap() for details.
+	 */
+	if (udata && udata->outlen >= sizeof(__u64)) {
+		if (!qp->r_rq.wq) {
+			__u64 offset = 0;
+
+			err = ib_copy_to_udata(udata, &offset,
+					       sizeof(offset));
+			if (err) {
+				ret = ERR_PTR(err);
+				goto bail_ip;
+			}
+		} else {
+			u32 s = sizeof(struct ipath_rwq) +
+				qp->r_rq.size * sz;
+
+			qp->ip =
+			    ipath_create_mmap_info(dev, s,
+						   ibpd->uobject->context,
+						   qp->r_rq.wq);
+			if (!qp->ip) {
+				ret = ERR_PTR(-ENOMEM);
+				goto bail_ip;
+			}
+
+			err = ib_copy_to_udata(udata, &(qp->ip->offset),
+					       sizeof(qp->ip->offset));
+			if (err) {
+				ret = ERR_PTR(err);
+				goto bail_ip;
+			}
+		}
+	}
+
+	spin_lock(&dev->n_qps_lock);
+	if (dev->n_qps_allocated == ib_ipath_max_qps) {
+		spin_unlock(&dev->n_qps_lock);
+		ret = ERR_PTR(-ENOMEM);
+		goto bail_ip;
+	}
+
+	dev->n_qps_allocated++;
+	spin_unlock(&dev->n_qps_lock);
+
+	if (qp->ip) {
+		spin_lock_irq(&dev->pending_lock);
+		list_add(&qp->ip->pending_mmaps, &dev->pending_mmaps);
+		spin_unlock_irq(&dev->pending_lock);
+	}
+
+	ret = &qp->ibqp;
+	goto bail;
+
+bail_ip:
+	if (qp->ip)
+		kref_put(&qp->ip->ref, ipath_release_mmap_info);
+	else
+		vfree(qp->r_rq.wq);
+	ipath_free_qp(&dev->qp_table, qp);
+	free_qpn(&dev->qp_table, qp->ibqp.qp_num);
+bail_sg_list:
+	kfree(qp->r_ud_sg_list);
+bail_qp:
+	kfree(qp);
+bail_swq:
+	vfree(swq);
+bail:
+	return ret;
+}
+
+/**
+ * ipath_destroy_qp - destroy a queue pair
+ * @ibqp: the queue pair to destroy
+ *
+ * Returns 0 on success.
+ *
+ * Note that this can be called while the QP is actively sending or
+ * receiving!
+ */
+int ipath_destroy_qp(struct ib_qp *ibqp)
+{
+	struct ipath_qp *qp = to_iqp(ibqp);
+	struct ipath_ibdev *dev = to_idev(ibqp->device);
+
+	/* Make sure HW and driver activity is stopped. */
+	spin_lock_irq(&qp->s_lock);
+	if (qp->state != IB_QPS_RESET) {
+		qp->state = IB_QPS_RESET;
+		spin_lock(&dev->pending_lock);
+		if (!list_empty(&qp->timerwait))
+			list_del_init(&qp->timerwait);
+		if (!list_empty(&qp->piowait))
+			list_del_init(&qp->piowait);
+		spin_unlock(&dev->pending_lock);
+		qp->s_flags &= ~IPATH_S_ANY_WAIT;
+		spin_unlock_irq(&qp->s_lock);
+		/* Stop the sending tasklet */
+		tasklet_kill(&qp->s_task);
+		wait_event(qp->wait_dma, !atomic_read(&qp->s_dma_busy));
+	} else
+		spin_unlock_irq(&qp->s_lock);
+
+	ipath_free_qp(&dev->qp_table, qp);
+
+	if (qp->s_tx) {
+		atomic_dec(&qp->refcount);
+		if (qp->s_tx->txreq.flags & IPATH_SDMA_TXREQ_F_FREEBUF)
+			kfree(qp->s_tx->txreq.map_addr);
+		spin_lock_irq(&dev->pending_lock);
+		list_add(&qp->s_tx->txreq.list, &dev->txreq_free);
+		spin_unlock_irq(&dev->pending_lock);
+		qp->s_tx = NULL;
+	}
+
+	wait_event(qp->wait, !atomic_read(&qp->refcount));
+
+	/* all user's cleaned up, mark it available */
+	free_qpn(&dev->qp_table, qp->ibqp.qp_num);
+	spin_lock(&dev->n_qps_lock);
+	dev->n_qps_allocated--;
+	spin_unlock(&dev->n_qps_lock);
+
+	if (qp->ip)
+		kref_put(&qp->ip->ref, ipath_release_mmap_info);
+	else
+		vfree(qp->r_rq.wq);
+	kfree(qp->r_ud_sg_list);
+	vfree(qp->s_wq);
+	kfree(qp);
+	return 0;
+}
+
+/**
+ * ipath_init_qp_table - initialize the QP table for a device
+ * @idev: the device who's QP table we're initializing
+ * @size: the size of the QP table
+ *
+ * Returns 0 on success, otherwise returns an errno.
+ */
+int ipath_init_qp_table(struct ipath_ibdev *idev, int size)
+{
+	int i;
+	int ret;
+
+	idev->qp_table.last = 1;	/* QPN 0 and 1 are special. */
+	idev->qp_table.max = size;
+	idev->qp_table.nmaps = 1;
+	idev->qp_table.table = kzalloc(size * sizeof(*idev->qp_table.table),
+				       GFP_KERNEL);
+	if (idev->qp_table.table == NULL) {
+		ret = -ENOMEM;
+		goto bail;
+	}
+
+	for (i = 0; i < ARRAY_SIZE(idev->qp_table.map); i++) {
+		atomic_set(&idev->qp_table.map[i].n_free, BITS_PER_PAGE);
+		idev->qp_table.map[i].page = NULL;
+	}
+
+	ret = 0;
+
+bail:
+	return ret;
+}
+
+/**
+ * ipath_get_credit - flush the send work queue of a QP
+ * @qp: the qp who's send work queue to flush
+ * @aeth: the Acknowledge Extended Transport Header
+ *
+ * The QP s_lock should be held.
+ */
+void ipath_get_credit(struct ipath_qp *qp, u32 aeth)
+{
+	u32 credit = (aeth >> IPATH_AETH_CREDIT_SHIFT) & IPATH_AETH_CREDIT_MASK;
+
+	/*
+	 * If the credit is invalid, we can send
+	 * as many packets as we like.  Otherwise, we have to
+	 * honor the credit field.
+	 */
+	if (credit == IPATH_AETH_CREDIT_INVAL)
+		qp->s_lsn = (u32) -1;
+	else if (qp->s_lsn != (u32) -1) {
+		/* Compute new LSN (i.e., MSN + credit) */
+		credit = (aeth + credit_table[credit]) & IPATH_MSN_MASK;
+		if (ipath_cmp24(credit, qp->s_lsn) > 0)
+			qp->s_lsn = credit;
+	}
+
+	/* Restart sending if it was blocked due to lack of credits. */
+	if ((qp->s_flags & IPATH_S_WAIT_SSN_CREDIT) &&
+	    qp->s_cur != qp->s_head &&
+	    (qp->s_lsn == (u32) -1 ||
+	     ipath_cmp24(get_swqe_ptr(qp, qp->s_cur)->ssn,
+			 qp->s_lsn + 1) <= 0))
+		ipath_schedule_send(qp);
+}
diff --git a/drivers/infiniband/hw/ipath/ipath_rc.c b/drivers/infiniband/hw/ipath/ipath_rc.c
new file mode 100644
index 0000000..79b3dbc
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_rc.c
@@ -0,0 +1,1969 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/io.h>
+
+#include "ipath_verbs.h"
+#include "ipath_kernel.h"
+
+/* cut down ridiculously long IB macro names */
+#define OP(x) IB_OPCODE_RC_##x
+
+static u32 restart_sge(struct ipath_sge_state *ss, struct ipath_swqe *wqe,
+		       u32 psn, u32 pmtu)
+{
+	u32 len;
+
+	len = ((psn - wqe->psn) & IPATH_PSN_MASK) * pmtu;
+	ss->sge = wqe->sg_list[0];
+	ss->sg_list = wqe->sg_list + 1;
+	ss->num_sge = wqe->wr.num_sge;
+	ipath_skip_sge(ss, len);
+	return wqe->length - len;
+}
+
+/**
+ * ipath_init_restart- initialize the qp->s_sge after a restart
+ * @qp: the QP who's SGE we're restarting
+ * @wqe: the work queue to initialize the QP's SGE from
+ *
+ * The QP s_lock should be held and interrupts disabled.
+ */
+static void ipath_init_restart(struct ipath_qp *qp, struct ipath_swqe *wqe)
+{
+	struct ipath_ibdev *dev;
+
+	qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn,
+				ib_mtu_enum_to_int(qp->path_mtu));
+	dev = to_idev(qp->ibqp.device);
+	spin_lock(&dev->pending_lock);
+	if (list_empty(&qp->timerwait))
+		list_add_tail(&qp->timerwait,
+			      &dev->pending[dev->pending_index]);
+	spin_unlock(&dev->pending_lock);
+}
+
+/**
+ * ipath_make_rc_ack - construct a response packet (ACK, NAK, or RDMA read)
+ * @qp: a pointer to the QP
+ * @ohdr: a pointer to the IB header being constructed
+ * @pmtu: the path MTU
+ *
+ * Return 1 if constructed; otherwise, return 0.
+ * Note that we are in the responder's side of the QP context.
+ * Note the QP s_lock must be held.
+ */
+static int ipath_make_rc_ack(struct ipath_ibdev *dev, struct ipath_qp *qp,
+			     struct ipath_other_headers *ohdr, u32 pmtu)
+{
+	struct ipath_ack_entry *e;
+	u32 hwords;
+	u32 len;
+	u32 bth0;
+	u32 bth2;
+
+	/* Don't send an ACK if we aren't supposed to. */
+	if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK))
+		goto bail;
+
+	/* header size in 32-bit words LRH+BTH = (8+12)/4. */
+	hwords = 5;
+
+	switch (qp->s_ack_state) {
+	case OP(RDMA_READ_RESPONSE_LAST):
+	case OP(RDMA_READ_RESPONSE_ONLY):
+	case OP(ATOMIC_ACKNOWLEDGE):
+		/*
+		 * We can increment the tail pointer now that the last
+		 * response has been sent instead of only being
+		 * constructed.
+		 */
+		if (++qp->s_tail_ack_queue > IPATH_MAX_RDMA_ATOMIC)
+			qp->s_tail_ack_queue = 0;
+		/* FALLTHROUGH */
+	case OP(SEND_ONLY):
+	case OP(ACKNOWLEDGE):
+		/* Check for no next entry in the queue. */
+		if (qp->r_head_ack_queue == qp->s_tail_ack_queue) {
+			if (qp->s_flags & IPATH_S_ACK_PENDING)
+				goto normal;
+			qp->s_ack_state = OP(ACKNOWLEDGE);
+			goto bail;
+		}
+
+		e = &qp->s_ack_queue[qp->s_tail_ack_queue];
+		if (e->opcode == OP(RDMA_READ_REQUEST)) {
+			/* Copy SGE state in case we need to resend */
+			qp->s_ack_rdma_sge = e->rdma_sge;
+			qp->s_cur_sge = &qp->s_ack_rdma_sge;
+			len = e->rdma_sge.sge.sge_length;
+			if (len > pmtu) {
+				len = pmtu;
+				qp->s_ack_state = OP(RDMA_READ_RESPONSE_FIRST);
+			} else {
+				qp->s_ack_state = OP(RDMA_READ_RESPONSE_ONLY);
+				e->sent = 1;
+			}
+			ohdr->u.aeth = ipath_compute_aeth(qp);
+			hwords++;
+			qp->s_ack_rdma_psn = e->psn;
+			bth2 = qp->s_ack_rdma_psn++ & IPATH_PSN_MASK;
+		} else {
+			/* COMPARE_SWAP or FETCH_ADD */
+			qp->s_cur_sge = NULL;
+			len = 0;
+			qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE);
+			ohdr->u.at.aeth = ipath_compute_aeth(qp);
+			ohdr->u.at.atomic_ack_eth[0] =
+				cpu_to_be32(e->atomic_data >> 32);
+			ohdr->u.at.atomic_ack_eth[1] =
+				cpu_to_be32(e->atomic_data);
+			hwords += sizeof(ohdr->u.at) / sizeof(u32);
+			bth2 = e->psn;
+			e->sent = 1;
+		}
+		bth0 = qp->s_ack_state << 24;
+		break;
+
+	case OP(RDMA_READ_RESPONSE_FIRST):
+		qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE);
+		/* FALLTHROUGH */
+	case OP(RDMA_READ_RESPONSE_MIDDLE):
+		len = qp->s_ack_rdma_sge.sge.sge_length;
+		if (len > pmtu)
+			len = pmtu;
+		else {
+			ohdr->u.aeth = ipath_compute_aeth(qp);
+			hwords++;
+			qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
+			qp->s_ack_queue[qp->s_tail_ack_queue].sent = 1;
+		}
+		bth0 = qp->s_ack_state << 24;
+		bth2 = qp->s_ack_rdma_psn++ & IPATH_PSN_MASK;
+		break;
+
+	default:
+	normal:
+		/*
+		 * Send a regular ACK.
+		 * Set the s_ack_state so we wait until after sending
+		 * the ACK before setting s_ack_state to ACKNOWLEDGE
+		 * (see above).
+		 */
+		qp->s_ack_state = OP(SEND_ONLY);
+		qp->s_flags &= ~IPATH_S_ACK_PENDING;
+		qp->s_cur_sge = NULL;
+		if (qp->s_nak_state)
+			ohdr->u.aeth =
+				cpu_to_be32((qp->r_msn & IPATH_MSN_MASK) |
+					    (qp->s_nak_state <<
+					     IPATH_AETH_CREDIT_SHIFT));
+		else
+			ohdr->u.aeth = ipath_compute_aeth(qp);
+		hwords++;
+		len = 0;
+		bth0 = OP(ACKNOWLEDGE) << 24;
+		bth2 = qp->s_ack_psn & IPATH_PSN_MASK;
+	}
+	qp->s_hdrwords = hwords;
+	qp->s_cur_size = len;
+	ipath_make_ruc_header(dev, qp, ohdr, bth0, bth2);
+	return 1;
+
+bail:
+	return 0;
+}
+
+/**
+ * ipath_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC)
+ * @qp: a pointer to the QP
+ *
+ * Return 1 if constructed; otherwise, return 0.
+ */
+int ipath_make_rc_req(struct ipath_qp *qp)
+{
+	struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+	struct ipath_other_headers *ohdr;
+	struct ipath_sge_state *ss;
+	struct ipath_swqe *wqe;
+	u32 hwords;
+	u32 len;
+	u32 bth0;
+	u32 bth2;
+	u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu);
+	char newreq;
+	unsigned long flags;
+	int ret = 0;
+
+	ohdr = &qp->s_hdr.u.oth;
+	if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
+		ohdr = &qp->s_hdr.u.l.oth;
+
+	/*
+	 * The lock is needed to synchronize between the sending tasklet,
+	 * the receive interrupt handler, and timeout resends.
+	 */
+	spin_lock_irqsave(&qp->s_lock, flags);
+
+	/* Sending responses has higher priority over sending requests. */
+	if ((qp->r_head_ack_queue != qp->s_tail_ack_queue ||
+	     (qp->s_flags & IPATH_S_ACK_PENDING) ||
+	     qp->s_ack_state != OP(ACKNOWLEDGE)) &&
+	    ipath_make_rc_ack(dev, qp, ohdr, pmtu))
+		goto done;
+
+	if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK)) {
+		if (!(ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND))
+			goto bail;
+		/* We are in the error state, flush the work request. */
+		if (qp->s_last == qp->s_head)
+			goto bail;
+		/* If DMAs are in progress, we can't flush immediately. */
+		if (atomic_read(&qp->s_dma_busy)) {
+			qp->s_flags |= IPATH_S_WAIT_DMA;
+			goto bail;
+		}
+		wqe = get_swqe_ptr(qp, qp->s_last);
+		ipath_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR);
+		goto done;
+	}
+
+	/* Leave BUSY set until RNR timeout. */
+	if (qp->s_rnr_timeout) {
+		qp->s_flags |= IPATH_S_WAITING;
+		goto bail;
+	}
+
+	/* header size in 32-bit words LRH+BTH = (8+12)/4. */
+	hwords = 5;
+	bth0 = 1 << 22; /* Set M bit */
+
+	/* Send a request. */
+	wqe = get_swqe_ptr(qp, qp->s_cur);
+	switch (qp->s_state) {
+	default:
+		if (!(ib_ipath_state_ops[qp->state] &
+		    IPATH_PROCESS_NEXT_SEND_OK))
+			goto bail;
+		/*
+		 * Resend an old request or start a new one.
+		 *
+		 * We keep track of the current SWQE so that
+		 * we don't reset the "furthest progress" state
+		 * if we need to back up.
+		 */
+		newreq = 0;
+		if (qp->s_cur == qp->s_tail) {
+			/* Check if send work queue is empty. */
+			if (qp->s_tail == qp->s_head)
+				goto bail;
+			/*
+			 * If a fence is requested, wait for previous
+			 * RDMA read and atomic operations to finish.
+			 */
+			if ((wqe->wr.send_flags & IB_SEND_FENCE) &&
+			    qp->s_num_rd_atomic) {
+				qp->s_flags |= IPATH_S_FENCE_PENDING;
+				goto bail;
+			}
+			wqe->psn = qp->s_next_psn;
+			newreq = 1;
+		}
+		/*
+		 * Note that we have to be careful not to modify the
+		 * original work request since we may need to resend
+		 * it.
+		 */
+		len = wqe->length;
+		ss = &qp->s_sge;
+		bth2 = 0;
+		switch (wqe->wr.opcode) {
+		case IB_WR_SEND:
+		case IB_WR_SEND_WITH_IMM:
+			/* If no credit, return. */
+			if (qp->s_lsn != (u32) -1 &&
+			    ipath_cmp24(wqe->ssn, qp->s_lsn + 1) > 0) {
+				qp->s_flags |= IPATH_S_WAIT_SSN_CREDIT;
+				goto bail;
+			}
+			wqe->lpsn = wqe->psn;
+			if (len > pmtu) {
+				wqe->lpsn += (len - 1) / pmtu;
+				qp->s_state = OP(SEND_FIRST);
+				len = pmtu;
+				break;
+			}
+			if (wqe->wr.opcode == IB_WR_SEND)
+				qp->s_state = OP(SEND_ONLY);
+			else {
+				qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE);
+				/* Immediate data comes after the BTH */
+				ohdr->u.imm_data = wqe->wr.ex.imm_data;
+				hwords += 1;
+			}
+			if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+				bth0 |= 1 << 23;
+			bth2 = 1 << 31;	/* Request ACK. */
+			if (++qp->s_cur == qp->s_size)
+				qp->s_cur = 0;
+			break;
+
+		case IB_WR_RDMA_WRITE:
+			if (newreq && qp->s_lsn != (u32) -1)
+				qp->s_lsn++;
+			/* FALLTHROUGH */
+		case IB_WR_RDMA_WRITE_WITH_IMM:
+			/* If no credit, return. */
+			if (qp->s_lsn != (u32) -1 &&
+			    ipath_cmp24(wqe->ssn, qp->s_lsn + 1) > 0) {
+				qp->s_flags |= IPATH_S_WAIT_SSN_CREDIT;
+				goto bail;
+			}
+			ohdr->u.rc.reth.vaddr =
+				cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
+			ohdr->u.rc.reth.rkey =
+				cpu_to_be32(wqe->wr.wr.rdma.rkey);
+			ohdr->u.rc.reth.length = cpu_to_be32(len);
+			hwords += sizeof(struct ib_reth) / sizeof(u32);
+			wqe->lpsn = wqe->psn;
+			if (len > pmtu) {
+				wqe->lpsn += (len - 1) / pmtu;
+				qp->s_state = OP(RDMA_WRITE_FIRST);
+				len = pmtu;
+				break;
+			}
+			if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
+				qp->s_state = OP(RDMA_WRITE_ONLY);
+			else {
+				qp->s_state =
+					OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE);
+				/* Immediate data comes after RETH */
+				ohdr->u.rc.imm_data = wqe->wr.ex.imm_data;
+				hwords += 1;
+				if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+					bth0 |= 1 << 23;
+			}
+			bth2 = 1 << 31;	/* Request ACK. */
+			if (++qp->s_cur == qp->s_size)
+				qp->s_cur = 0;
+			break;
+
+		case IB_WR_RDMA_READ:
+			/*
+			 * Don't allow more operations to be started
+			 * than the QP limits allow.
+			 */
+			if (newreq) {
+				if (qp->s_num_rd_atomic >=
+				    qp->s_max_rd_atomic) {
+					qp->s_flags |= IPATH_S_RDMAR_PENDING;
+					goto bail;
+				}
+				qp->s_num_rd_atomic++;
+				if (qp->s_lsn != (u32) -1)
+					qp->s_lsn++;
+				/*
+				 * Adjust s_next_psn to count the
+				 * expected number of responses.
+				 */
+				if (len > pmtu)
+					qp->s_next_psn += (len - 1) / pmtu;
+				wqe->lpsn = qp->s_next_psn++;
+			}
+			ohdr->u.rc.reth.vaddr =
+				cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
+			ohdr->u.rc.reth.rkey =
+				cpu_to_be32(wqe->wr.wr.rdma.rkey);
+			ohdr->u.rc.reth.length = cpu_to_be32(len);
+			qp->s_state = OP(RDMA_READ_REQUEST);
+			hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
+			ss = NULL;
+			len = 0;
+			if (++qp->s_cur == qp->s_size)
+				qp->s_cur = 0;
+			break;
+
+		case IB_WR_ATOMIC_CMP_AND_SWP:
+		case IB_WR_ATOMIC_FETCH_AND_ADD:
+			/*
+			 * Don't allow more operations to be started
+			 * than the QP limits allow.
+			 */
+			if (newreq) {
+				if (qp->s_num_rd_atomic >=
+				    qp->s_max_rd_atomic) {
+					qp->s_flags |= IPATH_S_RDMAR_PENDING;
+					goto bail;
+				}
+				qp->s_num_rd_atomic++;
+				if (qp->s_lsn != (u32) -1)
+					qp->s_lsn++;
+				wqe->lpsn = wqe->psn;
+			}
+			if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
+				qp->s_state = OP(COMPARE_SWAP);
+				ohdr->u.atomic_eth.swap_data = cpu_to_be64(
+					wqe->wr.wr.atomic.swap);
+				ohdr->u.atomic_eth.compare_data = cpu_to_be64(
+					wqe->wr.wr.atomic.compare_add);
+			} else {
+				qp->s_state = OP(FETCH_ADD);
+				ohdr->u.atomic_eth.swap_data = cpu_to_be64(
+					wqe->wr.wr.atomic.compare_add);
+				ohdr->u.atomic_eth.compare_data = 0;
+			}
+			ohdr->u.atomic_eth.vaddr[0] = cpu_to_be32(
+				wqe->wr.wr.atomic.remote_addr >> 32);
+			ohdr->u.atomic_eth.vaddr[1] = cpu_to_be32(
+				wqe->wr.wr.atomic.remote_addr);
+			ohdr->u.atomic_eth.rkey = cpu_to_be32(
+				wqe->wr.wr.atomic.rkey);
+			hwords += sizeof(struct ib_atomic_eth) / sizeof(u32);
+			ss = NULL;
+			len = 0;
+			if (++qp->s_cur == qp->s_size)
+				qp->s_cur = 0;
+			break;
+
+		default:
+			goto bail;
+		}
+		qp->s_sge.sge = wqe->sg_list[0];
+		qp->s_sge.sg_list = wqe->sg_list + 1;
+		qp->s_sge.num_sge = wqe->wr.num_sge;
+		qp->s_len = wqe->length;
+		if (newreq) {
+			qp->s_tail++;
+			if (qp->s_tail >= qp->s_size)
+				qp->s_tail = 0;
+		}
+		bth2 |= qp->s_psn & IPATH_PSN_MASK;
+		if (wqe->wr.opcode == IB_WR_RDMA_READ)
+			qp->s_psn = wqe->lpsn + 1;
+		else {
+			qp->s_psn++;
+			if (ipath_cmp24(qp->s_psn, qp->s_next_psn) > 0)
+				qp->s_next_psn = qp->s_psn;
+		}
+		/*
+		 * Put the QP on the pending list so lost ACKs will cause
+		 * a retry.  More than one request can be pending so the
+		 * QP may already be on the dev->pending list.
+		 */
+		spin_lock(&dev->pending_lock);
+		if (list_empty(&qp->timerwait))
+			list_add_tail(&qp->timerwait,
+				      &dev->pending[dev->pending_index]);
+		spin_unlock(&dev->pending_lock);
+		break;
+
+	case OP(RDMA_READ_RESPONSE_FIRST):
+		/*
+		 * This case can only happen if a send is restarted.
+		 * See ipath_restart_rc().
+		 */
+		ipath_init_restart(qp, wqe);
+		/* FALLTHROUGH */
+	case OP(SEND_FIRST):
+		qp->s_state = OP(SEND_MIDDLE);
+		/* FALLTHROUGH */
+	case OP(SEND_MIDDLE):
+		bth2 = qp->s_psn++ & IPATH_PSN_MASK;
+		if (ipath_cmp24(qp->s_psn, qp->s_next_psn) > 0)
+			qp->s_next_psn = qp->s_psn;
+		ss = &qp->s_sge;
+		len = qp->s_len;
+		if (len > pmtu) {
+			len = pmtu;
+			break;
+		}
+		if (wqe->wr.opcode == IB_WR_SEND)
+			qp->s_state = OP(SEND_LAST);
+		else {
+			qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);
+			/* Immediate data comes after the BTH */
+			ohdr->u.imm_data = wqe->wr.ex.imm_data;
+			hwords += 1;
+		}
+		if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+			bth0 |= 1 << 23;
+		bth2 |= 1 << 31;	/* Request ACK. */
+		qp->s_cur++;
+		if (qp->s_cur >= qp->s_size)
+			qp->s_cur = 0;
+		break;
+
+	case OP(RDMA_READ_RESPONSE_LAST):
+		/*
+		 * This case can only happen if a RDMA write is restarted.
+		 * See ipath_restart_rc().
+		 */
+		ipath_init_restart(qp, wqe);
+		/* FALLTHROUGH */
+	case OP(RDMA_WRITE_FIRST):
+		qp->s_state = OP(RDMA_WRITE_MIDDLE);
+		/* FALLTHROUGH */
+	case OP(RDMA_WRITE_MIDDLE):
+		bth2 = qp->s_psn++ & IPATH_PSN_MASK;
+		if (ipath_cmp24(qp->s_psn, qp->s_next_psn) > 0)
+			qp->s_next_psn = qp->s_psn;
+		ss = &qp->s_sge;
+		len = qp->s_len;
+		if (len > pmtu) {
+			len = pmtu;
+			break;
+		}
+		if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
+			qp->s_state = OP(RDMA_WRITE_LAST);
+		else {
+			qp->s_state = OP(RDMA_WRITE_LAST_WITH_IMMEDIATE);
+			/* Immediate data comes after the BTH */
+			ohdr->u.imm_data = wqe->wr.ex.imm_data;
+			hwords += 1;
+			if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+				bth0 |= 1 << 23;
+		}
+		bth2 |= 1 << 31;	/* Request ACK. */
+		qp->s_cur++;
+		if (qp->s_cur >= qp->s_size)
+			qp->s_cur = 0;
+		break;
+
+	case OP(RDMA_READ_RESPONSE_MIDDLE):
+		/*
+		 * This case can only happen if a RDMA read is restarted.
+		 * See ipath_restart_rc().
+		 */
+		ipath_init_restart(qp, wqe);
+		len = ((qp->s_psn - wqe->psn) & IPATH_PSN_MASK) * pmtu;
+		ohdr->u.rc.reth.vaddr =
+			cpu_to_be64(wqe->wr.wr.rdma.remote_addr + len);
+		ohdr->u.rc.reth.rkey =
+			cpu_to_be32(wqe->wr.wr.rdma.rkey);
+		ohdr->u.rc.reth.length = cpu_to_be32(qp->s_len);
+		qp->s_state = OP(RDMA_READ_REQUEST);
+		hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
+		bth2 = qp->s_psn & IPATH_PSN_MASK;
+		qp->s_psn = wqe->lpsn + 1;
+		ss = NULL;
+		len = 0;
+		qp->s_cur++;
+		if (qp->s_cur == qp->s_size)
+			qp->s_cur = 0;
+		break;
+	}
+	if (ipath_cmp24(qp->s_psn, qp->s_last_psn + IPATH_PSN_CREDIT - 1) >= 0)
+		bth2 |= 1 << 31;	/* Request ACK. */
+	qp->s_len -= len;
+	qp->s_hdrwords = hwords;
+	qp->s_cur_sge = ss;
+	qp->s_cur_size = len;
+	ipath_make_ruc_header(dev, qp, ohdr, bth0 | (qp->s_state << 24), bth2);
+done:
+	ret = 1;
+	goto unlock;
+
+bail:
+	qp->s_flags &= ~IPATH_S_BUSY;
+unlock:
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+	return ret;
+}
+
+/**
+ * send_rc_ack - Construct an ACK packet and send it
+ * @qp: a pointer to the QP
+ *
+ * This is called from ipath_rc_rcv() and only uses the receive
+ * side QP state.
+ * Note that RDMA reads and atomics are handled in the
+ * send side QP state and tasklet.
+ */
+static void send_rc_ack(struct ipath_qp *qp)
+{
+	struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+	struct ipath_devdata *dd;
+	u16 lrh0;
+	u32 bth0;
+	u32 hwords;
+	u32 __iomem *piobuf;
+	struct ipath_ib_header hdr;
+	struct ipath_other_headers *ohdr;
+	unsigned long flags;
+
+	spin_lock_irqsave(&qp->s_lock, flags);
+
+	/* Don't send ACK or NAK if a RDMA read or atomic is pending. */
+	if (qp->r_head_ack_queue != qp->s_tail_ack_queue ||
+	    (qp->s_flags & IPATH_S_ACK_PENDING) ||
+	    qp->s_ack_state != OP(ACKNOWLEDGE))
+		goto queue_ack;
+
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+
+	/* Don't try to send ACKs if the link isn't ACTIVE */
+	dd = dev->dd;
+	if (!(dd->ipath_flags & IPATH_LINKACTIVE))
+		goto done;
+
+	piobuf = ipath_getpiobuf(dd, 0, NULL);
+	if (!piobuf) {
+		/*
+		 * We are out of PIO buffers at the moment.
+		 * Pass responsibility for sending the ACK to the
+		 * send tasklet so that when a PIO buffer becomes
+		 * available, the ACK is sent ahead of other outgoing
+		 * packets.
+		 */
+		spin_lock_irqsave(&qp->s_lock, flags);
+		goto queue_ack;
+	}
+
+	/* Construct the header. */
+	ohdr = &hdr.u.oth;
+	lrh0 = IPATH_LRH_BTH;
+	/* header size in 32-bit words LRH+BTH+AETH = (8+12+4)/4. */
+	hwords = 6;
+	if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) {
+		hwords += ipath_make_grh(dev, &hdr.u.l.grh,
+					 &qp->remote_ah_attr.grh,
+					 hwords, 0);
+		ohdr = &hdr.u.l.oth;
+		lrh0 = IPATH_LRH_GRH;
+	}
+	/* read pkey_index w/o lock (its atomic) */
+	bth0 = ipath_get_pkey(dd, qp->s_pkey_index) |
+		(OP(ACKNOWLEDGE) << 24) | (1 << 22);
+	if (qp->r_nak_state)
+		ohdr->u.aeth = cpu_to_be32((qp->r_msn & IPATH_MSN_MASK) |
+					    (qp->r_nak_state <<
+					     IPATH_AETH_CREDIT_SHIFT));
+	else
+		ohdr->u.aeth = ipath_compute_aeth(qp);
+	lrh0 |= qp->remote_ah_attr.sl << 4;
+	hdr.lrh[0] = cpu_to_be16(lrh0);
+	hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid);
+	hdr.lrh[2] = cpu_to_be16(hwords + SIZE_OF_CRC);
+	hdr.lrh[3] = cpu_to_be16(dd->ipath_lid |
+				 qp->remote_ah_attr.src_path_bits);
+	ohdr->bth[0] = cpu_to_be32(bth0);
+	ohdr->bth[1] = cpu_to_be32(qp->remote_qpn);
+	ohdr->bth[2] = cpu_to_be32(qp->r_ack_psn & IPATH_PSN_MASK);
+
+	writeq(hwords + 1, piobuf);
+
+	if (dd->ipath_flags & IPATH_PIO_FLUSH_WC) {
+		u32 *hdrp = (u32 *) &hdr;
+
+		ipath_flush_wc();
+		__iowrite32_copy(piobuf + 2, hdrp, hwords - 1);
+		ipath_flush_wc();
+		__raw_writel(hdrp[hwords - 1], piobuf + hwords + 1);
+	} else
+		__iowrite32_copy(piobuf + 2, (u32 *) &hdr, hwords);
+
+	ipath_flush_wc();
+
+	dev->n_unicast_xmit++;
+	goto done;
+
+queue_ack:
+	if (ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK) {
+		dev->n_rc_qacks++;
+		qp->s_flags |= IPATH_S_ACK_PENDING;
+		qp->s_nak_state = qp->r_nak_state;
+		qp->s_ack_psn = qp->r_ack_psn;
+
+		/* Schedule the send tasklet. */
+		ipath_schedule_send(qp);
+	}
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+done:
+	return;
+}
+
+/**
+ * reset_psn - reset the QP state to send starting from PSN
+ * @qp: the QP
+ * @psn: the packet sequence number to restart at
+ *
+ * This is called from ipath_rc_rcv() to process an incoming RC ACK
+ * for the given QP.
+ * Called at interrupt level with the QP s_lock held.
+ */
+static void reset_psn(struct ipath_qp *qp, u32 psn)
+{
+	u32 n = qp->s_last;
+	struct ipath_swqe *wqe = get_swqe_ptr(qp, n);
+	u32 opcode;
+
+	qp->s_cur = n;
+
+	/*
+	 * If we are starting the request from the beginning,
+	 * let the normal send code handle initialization.
+	 */
+	if (ipath_cmp24(psn, wqe->psn) <= 0) {
+		qp->s_state = OP(SEND_LAST);
+		goto done;
+	}
+
+	/* Find the work request opcode corresponding to the given PSN. */
+	opcode = wqe->wr.opcode;
+	for (;;) {
+		int diff;
+
+		if (++n == qp->s_size)
+			n = 0;
+		if (n == qp->s_tail)
+			break;
+		wqe = get_swqe_ptr(qp, n);
+		diff = ipath_cmp24(psn, wqe->psn);
+		if (diff < 0)
+			break;
+		qp->s_cur = n;
+		/*
+		 * If we are starting the request from the beginning,
+		 * let the normal send code handle initialization.
+		 */
+		if (diff == 0) {
+			qp->s_state = OP(SEND_LAST);
+			goto done;
+		}
+		opcode = wqe->wr.opcode;
+	}
+
+	/*
+	 * Set the state to restart in the middle of a request.
+	 * Don't change the s_sge, s_cur_sge, or s_cur_size.
+	 * See ipath_make_rc_req().
+	 */
+	switch (opcode) {
+	case IB_WR_SEND:
+	case IB_WR_SEND_WITH_IMM:
+		qp->s_state = OP(RDMA_READ_RESPONSE_FIRST);
+		break;
+
+	case IB_WR_RDMA_WRITE:
+	case IB_WR_RDMA_WRITE_WITH_IMM:
+		qp->s_state = OP(RDMA_READ_RESPONSE_LAST);
+		break;
+
+	case IB_WR_RDMA_READ:
+		qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE);
+		break;
+
+	default:
+		/*
+		 * This case shouldn't happen since its only
+		 * one PSN per req.
+		 */
+		qp->s_state = OP(SEND_LAST);
+	}
+done:
+	qp->s_psn = psn;
+}
+
+/**
+ * ipath_restart_rc - back up requester to resend the last un-ACKed request
+ * @qp: the QP to restart
+ * @psn: packet sequence number for the request
+ * @wc: the work completion request
+ *
+ * The QP s_lock should be held and interrupts disabled.
+ */
+void ipath_restart_rc(struct ipath_qp *qp, u32 psn)
+{
+	struct ipath_swqe *wqe = get_swqe_ptr(qp, qp->s_last);
+	struct ipath_ibdev *dev;
+
+	if (qp->s_retry == 0) {
+		ipath_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR);
+		ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+		goto bail;
+	}
+	qp->s_retry--;
+
+	/*
+	 * Remove the QP from the timeout queue.
+	 * Note: it may already have been removed by ipath_ib_timer().
+	 */
+	dev = to_idev(qp->ibqp.device);
+	spin_lock(&dev->pending_lock);
+	if (!list_empty(&qp->timerwait))
+		list_del_init(&qp->timerwait);
+	if (!list_empty(&qp->piowait))
+		list_del_init(&qp->piowait);
+	spin_unlock(&dev->pending_lock);
+
+	if (wqe->wr.opcode == IB_WR_RDMA_READ)
+		dev->n_rc_resends++;
+	else
+		dev->n_rc_resends += (qp->s_psn - psn) & IPATH_PSN_MASK;
+
+	reset_psn(qp, psn);
+	ipath_schedule_send(qp);
+
+bail:
+	return;
+}
+
+static inline void update_last_psn(struct ipath_qp *qp, u32 psn)
+{
+	qp->s_last_psn = psn;
+}
+
+/**
+ * do_rc_ack - process an incoming RC ACK
+ * @qp: the QP the ACK came in on
+ * @psn: the packet sequence number of the ACK
+ * @opcode: the opcode of the request that resulted in the ACK
+ *
+ * This is called from ipath_rc_rcv_resp() to process an incoming RC ACK
+ * for the given QP.
+ * Called at interrupt level with the QP s_lock held and interrupts disabled.
+ * Returns 1 if OK, 0 if current operation should be aborted (NAK).
+ */
+static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode,
+		     u64 val)
+{
+	struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+	struct ib_wc wc;
+	enum ib_wc_status status;
+	struct ipath_swqe *wqe;
+	int ret = 0;
+	u32 ack_psn;
+	int diff;
+
+	/*
+	 * Remove the QP from the timeout queue (or RNR timeout queue).
+	 * If ipath_ib_timer() has already removed it,
+	 * it's OK since we hold the QP s_lock and ipath_restart_rc()
+	 * just won't find anything to restart if we ACK everything.
+	 */
+	spin_lock(&dev->pending_lock);
+	if (!list_empty(&qp->timerwait))
+		list_del_init(&qp->timerwait);
+	spin_unlock(&dev->pending_lock);
+
+	/*
+	 * Note that NAKs implicitly ACK outstanding SEND and RDMA write
+	 * requests and implicitly NAK RDMA read and atomic requests issued
+	 * before the NAK'ed request.  The MSN won't include the NAK'ed
+	 * request but will include an ACK'ed request(s).
+	 */
+	ack_psn = psn;
+	if (aeth >> 29)
+		ack_psn--;
+	wqe = get_swqe_ptr(qp, qp->s_last);
+
+	/*
+	 * The MSN might be for a later WQE than the PSN indicates so
+	 * only complete WQEs that the PSN finishes.
+	 */
+	while ((diff = ipath_cmp24(ack_psn, wqe->lpsn)) >= 0) {
+		/*
+		 * RDMA_READ_RESPONSE_ONLY is a special case since
+		 * we want to generate completion events for everything
+		 * before the RDMA read, copy the data, then generate
+		 * the completion for the read.
+		 */
+		if (wqe->wr.opcode == IB_WR_RDMA_READ &&
+		    opcode == OP(RDMA_READ_RESPONSE_ONLY) &&
+		    diff == 0) {
+			ret = 1;
+			goto bail;
+		}
+		/*
+		 * If this request is a RDMA read or atomic, and the ACK is
+		 * for a later operation, this ACK NAKs the RDMA read or
+		 * atomic.  In other words, only a RDMA_READ_LAST or ONLY
+		 * can ACK a RDMA read and likewise for atomic ops.  Note
+		 * that the NAK case can only happen if relaxed ordering is
+		 * used and requests are sent after an RDMA read or atomic
+		 * is sent but before the response is received.
+		 */
+		if ((wqe->wr.opcode == IB_WR_RDMA_READ &&
+		     (opcode != OP(RDMA_READ_RESPONSE_LAST) || diff != 0)) ||
+		    ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+		      wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) &&
+		     (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0))) {
+			/*
+			 * The last valid PSN seen is the previous
+			 * request's.
+			 */
+			update_last_psn(qp, wqe->psn - 1);
+			/* Retry this request. */
+			ipath_restart_rc(qp, wqe->psn);
+			/*
+			 * No need to process the ACK/NAK since we are
+			 * restarting an earlier request.
+			 */
+			goto bail;
+		}
+		if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+		    wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)
+			*(u64 *) wqe->sg_list[0].vaddr = val;
+		if (qp->s_num_rd_atomic &&
+		    (wqe->wr.opcode == IB_WR_RDMA_READ ||
+		     wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+		     wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)) {
+			qp->s_num_rd_atomic--;
+			/* Restart sending task if fence is complete */
+			if (((qp->s_flags & IPATH_S_FENCE_PENDING) &&
+			     !qp->s_num_rd_atomic) ||
+			    qp->s_flags & IPATH_S_RDMAR_PENDING)
+				ipath_schedule_send(qp);
+		}
+		/* Post a send completion queue entry if requested. */
+		if (!(qp->s_flags & IPATH_S_SIGNAL_REQ_WR) ||
+		    (wqe->wr.send_flags & IB_SEND_SIGNALED)) {
+			memset(&wc, 0, sizeof wc);
+			wc.wr_id = wqe->wr.wr_id;
+			wc.status = IB_WC_SUCCESS;
+			wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode];
+			wc.byte_len = wqe->length;
+			wc.qp = &qp->ibqp;
+			wc.src_qp = qp->remote_qpn;
+			wc.slid = qp->remote_ah_attr.dlid;
+			wc.sl = qp->remote_ah_attr.sl;
+			ipath_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0);
+		}
+		qp->s_retry = qp->s_retry_cnt;
+		/*
+		 * If we are completing a request which is in the process of
+		 * being resent, we can stop resending it since we know the
+		 * responder has already seen it.
+		 */
+		if (qp->s_last == qp->s_cur) {
+			if (++qp->s_cur >= qp->s_size)
+				qp->s_cur = 0;
+			qp->s_last = qp->s_cur;
+			if (qp->s_last == qp->s_tail)
+				break;
+			wqe = get_swqe_ptr(qp, qp->s_cur);
+			qp->s_state = OP(SEND_LAST);
+			qp->s_psn = wqe->psn;
+		} else {
+			if (++qp->s_last >= qp->s_size)
+				qp->s_last = 0;
+			if (qp->state == IB_QPS_SQD && qp->s_last == qp->s_cur)
+				qp->s_draining = 0;
+			if (qp->s_last == qp->s_tail)
+				break;
+			wqe = get_swqe_ptr(qp, qp->s_last);
+		}
+	}
+
+	switch (aeth >> 29) {
+	case 0:		/* ACK */
+		dev->n_rc_acks++;
+		/* If this is a partial ACK, reset the retransmit timer. */
+		if (qp->s_last != qp->s_tail) {
+			spin_lock(&dev->pending_lock);
+			if (list_empty(&qp->timerwait))
+				list_add_tail(&qp->timerwait,
+					&dev->pending[dev->pending_index]);
+			spin_unlock(&dev->pending_lock);
+			/*
+			 * If we get a partial ACK for a resent operation,
+			 * we can stop resending the earlier packets and
+			 * continue with the next packet the receiver wants.
+			 */
+			if (ipath_cmp24(qp->s_psn, psn) <= 0) {
+				reset_psn(qp, psn + 1);
+				ipath_schedule_send(qp);
+			}
+		} else if (ipath_cmp24(qp->s_psn, psn) <= 0) {
+			qp->s_state = OP(SEND_LAST);
+			qp->s_psn = psn + 1;
+		}
+		ipath_get_credit(qp, aeth);
+		qp->s_rnr_retry = qp->s_rnr_retry_cnt;
+		qp->s_retry = qp->s_retry_cnt;
+		update_last_psn(qp, psn);
+		ret = 1;
+		goto bail;
+
+	case 1:		/* RNR NAK */
+		dev->n_rnr_naks++;
+		if (qp->s_last == qp->s_tail)
+			goto bail;
+		if (qp->s_rnr_retry == 0) {
+			status = IB_WC_RNR_RETRY_EXC_ERR;
+			goto class_b;
+		}
+		if (qp->s_rnr_retry_cnt < 7)
+			qp->s_rnr_retry--;
+
+		/* The last valid PSN is the previous PSN. */
+		update_last_psn(qp, psn - 1);
+
+		if (wqe->wr.opcode == IB_WR_RDMA_READ)
+			dev->n_rc_resends++;
+		else
+			dev->n_rc_resends +=
+				(qp->s_psn - psn) & IPATH_PSN_MASK;
+
+		reset_psn(qp, psn);
+
+		qp->s_rnr_timeout =
+			ib_ipath_rnr_table[(aeth >> IPATH_AETH_CREDIT_SHIFT) &
+					   IPATH_AETH_CREDIT_MASK];
+		ipath_insert_rnr_queue(qp);
+		ipath_schedule_send(qp);
+		goto bail;
+
+	case 3:		/* NAK */
+		if (qp->s_last == qp->s_tail)
+			goto bail;
+		/* The last valid PSN is the previous PSN. */
+		update_last_psn(qp, psn - 1);
+		switch ((aeth >> IPATH_AETH_CREDIT_SHIFT) &
+			IPATH_AETH_CREDIT_MASK) {
+		case 0:	/* PSN sequence error */
+			dev->n_seq_naks++;
+			/*
+			 * Back up to the responder's expected PSN.
+			 * Note that we might get a NAK in the middle of an
+			 * RDMA READ response which terminates the RDMA
+			 * READ.
+			 */
+			ipath_restart_rc(qp, psn);
+			break;
+
+		case 1:	/* Invalid Request */
+			status = IB_WC_REM_INV_REQ_ERR;
+			dev->n_other_naks++;
+			goto class_b;
+
+		case 2:	/* Remote Access Error */
+			status = IB_WC_REM_ACCESS_ERR;
+			dev->n_other_naks++;
+			goto class_b;
+
+		case 3:	/* Remote Operation Error */
+			status = IB_WC_REM_OP_ERR;
+			dev->n_other_naks++;
+		class_b:
+			ipath_send_complete(qp, wqe, status);
+			ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+			break;
+
+		default:
+			/* Ignore other reserved NAK error codes */
+			goto reserved;
+		}
+		qp->s_rnr_retry = qp->s_rnr_retry_cnt;
+		goto bail;
+
+	default:		/* 2: reserved */
+	reserved:
+		/* Ignore reserved NAK codes. */
+		goto bail;
+	}
+
+bail:
+	return ret;
+}
+
+/**
+ * ipath_rc_rcv_resp - process an incoming RC response packet
+ * @dev: the device this packet came in on
+ * @ohdr: the other headers for this packet
+ * @data: the packet data
+ * @tlen: the packet length
+ * @qp: the QP for this packet
+ * @opcode: the opcode for this packet
+ * @psn: the packet sequence number for this packet
+ * @hdrsize: the header length
+ * @pmtu: the path MTU
+ * @header_in_data: true if part of the header data is in the data buffer
+ *
+ * This is called from ipath_rc_rcv() to process an incoming RC response
+ * packet for the given QP.
+ * Called at interrupt level.
+ */
+static inline void ipath_rc_rcv_resp(struct ipath_ibdev *dev,
+				     struct ipath_other_headers *ohdr,
+				     void *data, u32 tlen,
+				     struct ipath_qp *qp,
+				     u32 opcode,
+				     u32 psn, u32 hdrsize, u32 pmtu,
+				     int header_in_data)
+{
+	struct ipath_swqe *wqe;
+	enum ib_wc_status status;
+	unsigned long flags;
+	int diff;
+	u32 pad;
+	u32 aeth;
+	u64 val;
+
+	spin_lock_irqsave(&qp->s_lock, flags);
+
+	/* Double check we can process this now that we hold the s_lock. */
+	if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK))
+		goto ack_done;
+
+	/* Ignore invalid responses. */
+	if (ipath_cmp24(psn, qp->s_next_psn) >= 0)
+		goto ack_done;
+
+	/* Ignore duplicate responses. */
+	diff = ipath_cmp24(psn, qp->s_last_psn);
+	if (unlikely(diff <= 0)) {
+		/* Update credits for "ghost" ACKs */
+		if (diff == 0 && opcode == OP(ACKNOWLEDGE)) {
+			if (!header_in_data)
+				aeth = be32_to_cpu(ohdr->u.aeth);
+			else {
+				aeth = be32_to_cpu(((__be32 *) data)[0]);
+				data += sizeof(__be32);
+			}
+			if ((aeth >> 29) == 0)
+				ipath_get_credit(qp, aeth);
+		}
+		goto ack_done;
+	}
+
+	if (unlikely(qp->s_last == qp->s_tail))
+		goto ack_done;
+	wqe = get_swqe_ptr(qp, qp->s_last);
+	status = IB_WC_SUCCESS;
+
+	switch (opcode) {
+	case OP(ACKNOWLEDGE):
+	case OP(ATOMIC_ACKNOWLEDGE):
+	case OP(RDMA_READ_RESPONSE_FIRST):
+		if (!header_in_data)
+			aeth = be32_to_cpu(ohdr->u.aeth);
+		else {
+			aeth = be32_to_cpu(((__be32 *) data)[0]);
+			data += sizeof(__be32);
+		}
+		if (opcode == OP(ATOMIC_ACKNOWLEDGE)) {
+			if (!header_in_data) {
+				__be32 *p = ohdr->u.at.atomic_ack_eth;
+
+				val = ((u64) be32_to_cpu(p[0]) << 32) |
+					be32_to_cpu(p[1]);
+			} else
+				val = be64_to_cpu(((__be64 *) data)[0]);
+		} else
+			val = 0;
+		if (!do_rc_ack(qp, aeth, psn, opcode, val) ||
+		    opcode != OP(RDMA_READ_RESPONSE_FIRST))
+			goto ack_done;
+		hdrsize += 4;
+		wqe = get_swqe_ptr(qp, qp->s_last);
+		if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
+			goto ack_op_err;
+		qp->r_flags &= ~IPATH_R_RDMAR_SEQ;
+		/*
+		 * If this is a response to a resent RDMA read, we
+		 * have to be careful to copy the data to the right
+		 * location.
+		 */
+		qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
+						  wqe, psn, pmtu);
+		goto read_middle;
+
+	case OP(RDMA_READ_RESPONSE_MIDDLE):
+		/* no AETH, no ACK */
+		if (unlikely(ipath_cmp24(psn, qp->s_last_psn + 1))) {
+			dev->n_rdma_seq++;
+			if (qp->r_flags & IPATH_R_RDMAR_SEQ)
+				goto ack_done;
+			qp->r_flags |= IPATH_R_RDMAR_SEQ;
+			ipath_restart_rc(qp, qp->s_last_psn + 1);
+			goto ack_done;
+		}
+		if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
+			goto ack_op_err;
+	read_middle:
+		if (unlikely(tlen != (hdrsize + pmtu + 4)))
+			goto ack_len_err;
+		if (unlikely(pmtu >= qp->s_rdma_read_len))
+			goto ack_len_err;
+
+		/* We got a response so update the timeout. */
+		spin_lock(&dev->pending_lock);
+		if (qp->s_rnr_timeout == 0 && !list_empty(&qp->timerwait))
+			list_move_tail(&qp->timerwait,
+				       &dev->pending[dev->pending_index]);
+		spin_unlock(&dev->pending_lock);
+
+		if (opcode == OP(RDMA_READ_RESPONSE_MIDDLE))
+			qp->s_retry = qp->s_retry_cnt;
+
+		/*
+		 * Update the RDMA receive state but do the copy w/o
+		 * holding the locks and blocking interrupts.
+		 */
+		qp->s_rdma_read_len -= pmtu;
+		update_last_psn(qp, psn);
+		spin_unlock_irqrestore(&qp->s_lock, flags);
+		ipath_copy_sge(&qp->s_rdma_read_sge, data, pmtu);
+		goto bail;
+
+	case OP(RDMA_READ_RESPONSE_ONLY):
+		if (!header_in_data)
+			aeth = be32_to_cpu(ohdr->u.aeth);
+		else
+			aeth = be32_to_cpu(((__be32 *) data)[0]);
+		if (!do_rc_ack(qp, aeth, psn, opcode, 0))
+			goto ack_done;
+		/* Get the number of bytes the message was padded by. */
+		pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+		/*
+		 * Check that the data size is >= 0 && <= pmtu.
+		 * Remember to account for the AETH header (4) and
+		 * ICRC (4).
+		 */
+		if (unlikely(tlen < (hdrsize + pad + 8)))
+			goto ack_len_err;
+		/*
+		 * If this is a response to a resent RDMA read, we
+		 * have to be careful to copy the data to the right
+		 * location.
+		 */
+		wqe = get_swqe_ptr(qp, qp->s_last);
+		qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
+						  wqe, psn, pmtu);
+		goto read_last;
+
+	case OP(RDMA_READ_RESPONSE_LAST):
+		/* ACKs READ req. */
+		if (unlikely(ipath_cmp24(psn, qp->s_last_psn + 1))) {
+			dev->n_rdma_seq++;
+			if (qp->r_flags & IPATH_R_RDMAR_SEQ)
+				goto ack_done;
+			qp->r_flags |= IPATH_R_RDMAR_SEQ;
+			ipath_restart_rc(qp, qp->s_last_psn + 1);
+			goto ack_done;
+		}
+		if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
+			goto ack_op_err;
+		/* Get the number of bytes the message was padded by. */
+		pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+		/*
+		 * Check that the data size is >= 1 && <= pmtu.
+		 * Remember to account for the AETH header (4) and
+		 * ICRC (4).
+		 */
+		if (unlikely(tlen <= (hdrsize + pad + 8)))
+			goto ack_len_err;
+	read_last:
+		tlen -= hdrsize + pad + 8;
+		if (unlikely(tlen != qp->s_rdma_read_len))
+			goto ack_len_err;
+		if (!header_in_data)
+			aeth = be32_to_cpu(ohdr->u.aeth);
+		else {
+			aeth = be32_to_cpu(((__be32 *) data)[0]);
+			data += sizeof(__be32);
+		}
+		ipath_copy_sge(&qp->s_rdma_read_sge, data, tlen);
+		(void) do_rc_ack(qp, aeth, psn,
+				 OP(RDMA_READ_RESPONSE_LAST), 0);
+		goto ack_done;
+	}
+
+ack_op_err:
+	status = IB_WC_LOC_QP_OP_ERR;
+	goto ack_err;
+
+ack_len_err:
+	status = IB_WC_LOC_LEN_ERR;
+ack_err:
+	ipath_send_complete(qp, wqe, status);
+	ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+ack_done:
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+bail:
+	return;
+}
+
+/**
+ * ipath_rc_rcv_error - process an incoming duplicate or error RC packet
+ * @dev: the device this packet came in on
+ * @ohdr: the other headers for this packet
+ * @data: the packet data
+ * @qp: the QP for this packet
+ * @opcode: the opcode for this packet
+ * @psn: the packet sequence number for this packet
+ * @diff: the difference between the PSN and the expected PSN
+ * @header_in_data: true if part of the header data is in the data buffer
+ *
+ * This is called from ipath_rc_rcv() to process an unexpected
+ * incoming RC packet for the given QP.
+ * Called at interrupt level.
+ * Return 1 if no more processing is needed; otherwise return 0 to
+ * schedule a response to be sent.
+ */
+static inline int ipath_rc_rcv_error(struct ipath_ibdev *dev,
+				     struct ipath_other_headers *ohdr,
+				     void *data,
+				     struct ipath_qp *qp,
+				     u32 opcode,
+				     u32 psn,
+				     int diff,
+				     int header_in_data)
+{
+	struct ipath_ack_entry *e;
+	u8 i, prev;
+	int old_req;
+	unsigned long flags;
+
+	if (diff > 0) {
+		/*
+		 * Packet sequence error.
+		 * A NAK will ACK earlier sends and RDMA writes.
+		 * Don't queue the NAK if we already sent one.
+		 */
+		if (!qp->r_nak_state) {
+			qp->r_nak_state = IB_NAK_PSN_ERROR;
+			/* Use the expected PSN. */
+			qp->r_ack_psn = qp->r_psn;
+			goto send_ack;
+		}
+		goto done;
+	}
+
+	/*
+	 * Handle a duplicate request.  Don't re-execute SEND, RDMA
+	 * write or atomic op.  Don't NAK errors, just silently drop
+	 * the duplicate request.  Note that r_sge, r_len, and
+	 * r_rcv_len may be in use so don't modify them.
+	 *
+	 * We are supposed to ACK the earliest duplicate PSN but we
+	 * can coalesce an outstanding duplicate ACK.  We have to
+	 * send the earliest so that RDMA reads can be restarted at
+	 * the requester's expected PSN.
+	 *
+	 * First, find where this duplicate PSN falls within the
+	 * ACKs previously sent.
+	 */
+	psn &= IPATH_PSN_MASK;
+	e = NULL;
+	old_req = 1;
+
+	spin_lock_irqsave(&qp->s_lock, flags);
+	/* Double check we can process this now that we hold the s_lock. */
+	if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK))
+		goto unlock_done;
+
+	for (i = qp->r_head_ack_queue; ; i = prev) {
+		if (i == qp->s_tail_ack_queue)
+			old_req = 0;
+		if (i)
+			prev = i - 1;
+		else
+			prev = IPATH_MAX_RDMA_ATOMIC;
+		if (prev == qp->r_head_ack_queue) {
+			e = NULL;
+			break;
+		}
+		e = &qp->s_ack_queue[prev];
+		if (!e->opcode) {
+			e = NULL;
+			break;
+		}
+		if (ipath_cmp24(psn, e->psn) >= 0) {
+			if (prev == qp->s_tail_ack_queue)
+				old_req = 0;
+			break;
+		}
+	}
+	switch (opcode) {
+	case OP(RDMA_READ_REQUEST): {
+		struct ib_reth *reth;
+		u32 offset;
+		u32 len;
+
+		/*
+		 * If we didn't find the RDMA read request in the ack queue,
+		 * or the send tasklet is already backed up to send an
+		 * earlier entry, we can ignore this request.
+		 */
+		if (!e || e->opcode != OP(RDMA_READ_REQUEST) || old_req)
+			goto unlock_done;
+		/* RETH comes after BTH */
+		if (!header_in_data)
+			reth = &ohdr->u.rc.reth;
+		else {
+			reth = (struct ib_reth *)data;
+			data += sizeof(*reth);
+		}
+		/*
+		 * Address range must be a subset of the original
+		 * request and start on pmtu boundaries.
+		 * We reuse the old ack_queue slot since the requester
+		 * should not back up and request an earlier PSN for the
+		 * same request.
+		 */
+		offset = ((psn - e->psn) & IPATH_PSN_MASK) *
+			ib_mtu_enum_to_int(qp->path_mtu);
+		len = be32_to_cpu(reth->length);
+		if (unlikely(offset + len > e->rdma_sge.sge.sge_length))
+			goto unlock_done;
+		if (len != 0) {
+			u32 rkey = be32_to_cpu(reth->rkey);
+			u64 vaddr = be64_to_cpu(reth->vaddr);
+			int ok;
+
+			ok = ipath_rkey_ok(qp, &e->rdma_sge,
+					   len, vaddr, rkey,
+					   IB_ACCESS_REMOTE_READ);
+			if (unlikely(!ok))
+				goto unlock_done;
+		} else {
+			e->rdma_sge.sg_list = NULL;
+			e->rdma_sge.num_sge = 0;
+			e->rdma_sge.sge.mr = NULL;
+			e->rdma_sge.sge.vaddr = NULL;
+			e->rdma_sge.sge.length = 0;
+			e->rdma_sge.sge.sge_length = 0;
+		}
+		e->psn = psn;
+		qp->s_ack_state = OP(ACKNOWLEDGE);
+		qp->s_tail_ack_queue = prev;
+		break;
+	}
+
+	case OP(COMPARE_SWAP):
+	case OP(FETCH_ADD): {
+		/*
+		 * If we didn't find the atomic request in the ack queue
+		 * or the send tasklet is already backed up to send an
+		 * earlier entry, we can ignore this request.
+		 */
+		if (!e || e->opcode != (u8) opcode || old_req)
+			goto unlock_done;
+		qp->s_ack_state = OP(ACKNOWLEDGE);
+		qp->s_tail_ack_queue = prev;
+		break;
+	}
+
+	default:
+		if (old_req)
+			goto unlock_done;
+		/*
+		 * Resend the most recent ACK if this request is
+		 * after all the previous RDMA reads and atomics.
+		 */
+		if (i == qp->r_head_ack_queue) {
+			spin_unlock_irqrestore(&qp->s_lock, flags);
+			qp->r_nak_state = 0;
+			qp->r_ack_psn = qp->r_psn - 1;
+			goto send_ack;
+		}
+		/*
+		 * Try to send a simple ACK to work around a Mellanox bug
+		 * which doesn't accept a RDMA read response or atomic
+		 * response as an ACK for earlier SENDs or RDMA writes.
+		 */
+		if (qp->r_head_ack_queue == qp->s_tail_ack_queue &&
+		    !(qp->s_flags & IPATH_S_ACK_PENDING) &&
+		    qp->s_ack_state == OP(ACKNOWLEDGE)) {
+			spin_unlock_irqrestore(&qp->s_lock, flags);
+			qp->r_nak_state = 0;
+			qp->r_ack_psn = qp->s_ack_queue[i].psn - 1;
+			goto send_ack;
+		}
+		/*
+		 * Resend the RDMA read or atomic op which
+		 * ACKs this duplicate request.
+		 */
+		qp->s_ack_state = OP(ACKNOWLEDGE);
+		qp->s_tail_ack_queue = i;
+		break;
+	}
+	qp->r_nak_state = 0;
+	ipath_schedule_send(qp);
+
+unlock_done:
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+done:
+	return 1;
+
+send_ack:
+	return 0;
+}
+
+void ipath_rc_error(struct ipath_qp *qp, enum ib_wc_status err)
+{
+	unsigned long flags;
+	int lastwqe;
+
+	spin_lock_irqsave(&qp->s_lock, flags);
+	lastwqe = ipath_error_qp(qp, err);
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+
+	if (lastwqe) {
+		struct ib_event ev;
+
+		ev.device = qp->ibqp.device;
+		ev.element.qp = &qp->ibqp;
+		ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
+		qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
+	}
+}
+
+static inline void ipath_update_ack_queue(struct ipath_qp *qp, unsigned n)
+{
+	unsigned next;
+
+	next = n + 1;
+	if (next > IPATH_MAX_RDMA_ATOMIC)
+		next = 0;
+	if (n == qp->s_tail_ack_queue) {
+		qp->s_tail_ack_queue = next;
+		qp->s_ack_state = OP(ACKNOWLEDGE);
+	}
+}
+
+/**
+ * ipath_rc_rcv - process an incoming RC packet
+ * @dev: the device this packet came in on
+ * @hdr: the header of this packet
+ * @has_grh: true if the header has a GRH
+ * @data: the packet data
+ * @tlen: the packet length
+ * @qp: the QP for this packet
+ *
+ * This is called from ipath_qp_rcv() to process an incoming RC packet
+ * for the given QP.
+ * Called at interrupt level.
+ */
+void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
+		  int has_grh, void *data, u32 tlen, struct ipath_qp *qp)
+{
+	struct ipath_other_headers *ohdr;
+	u32 opcode;
+	u32 hdrsize;
+	u32 psn;
+	u32 pad;
+	struct ib_wc wc;
+	u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu);
+	int diff;
+	struct ib_reth *reth;
+	int header_in_data;
+	unsigned long flags;
+
+	/* Validate the SLID. See Ch. 9.6.1.5 */
+	if (unlikely(be16_to_cpu(hdr->lrh[3]) != qp->remote_ah_attr.dlid))
+		goto done;
+
+	/* Check for GRH */
+	if (!has_grh) {
+		ohdr = &hdr->u.oth;
+		hdrsize = 8 + 12;	/* LRH + BTH */
+		psn = be32_to_cpu(ohdr->bth[2]);
+		header_in_data = 0;
+	} else {
+		ohdr = &hdr->u.l.oth;
+		hdrsize = 8 + 40 + 12;	/* LRH + GRH + BTH */
+		/*
+		 * The header with GRH is 60 bytes and the core driver sets
+		 * the eager header buffer size to 56 bytes so the last 4
+		 * bytes of the BTH header (PSN) is in the data buffer.
+		 */
+		header_in_data = dev->dd->ipath_rcvhdrentsize == 16;
+		if (header_in_data) {
+			psn = be32_to_cpu(((__be32 *) data)[0]);
+			data += sizeof(__be32);
+		} else
+			psn = be32_to_cpu(ohdr->bth[2]);
+	}
+
+	/*
+	 * Process responses (ACKs) before anything else.  Note that the
+	 * packet sequence number will be for something in the send work
+	 * queue rather than the expected receive packet sequence number.
+	 * In other words, this QP is the requester.
+	 */
+	opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
+	if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
+	    opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
+		ipath_rc_rcv_resp(dev, ohdr, data, tlen, qp, opcode, psn,
+				  hdrsize, pmtu, header_in_data);
+		goto done;
+	}
+
+	/* Compute 24 bits worth of difference. */
+	diff = ipath_cmp24(psn, qp->r_psn);
+	if (unlikely(diff)) {
+		if (ipath_rc_rcv_error(dev, ohdr, data, qp, opcode,
+				       psn, diff, header_in_data))
+			goto done;
+		goto send_ack;
+	}
+
+	/* Check for opcode sequence errors. */
+	switch (qp->r_state) {
+	case OP(SEND_FIRST):
+	case OP(SEND_MIDDLE):
+		if (opcode == OP(SEND_MIDDLE) ||
+		    opcode == OP(SEND_LAST) ||
+		    opcode == OP(SEND_LAST_WITH_IMMEDIATE))
+			break;
+		goto nack_inv;
+
+	case OP(RDMA_WRITE_FIRST):
+	case OP(RDMA_WRITE_MIDDLE):
+		if (opcode == OP(RDMA_WRITE_MIDDLE) ||
+		    opcode == OP(RDMA_WRITE_LAST) ||
+		    opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
+			break;
+		goto nack_inv;
+
+	default:
+		if (opcode == OP(SEND_MIDDLE) ||
+		    opcode == OP(SEND_LAST) ||
+		    opcode == OP(SEND_LAST_WITH_IMMEDIATE) ||
+		    opcode == OP(RDMA_WRITE_MIDDLE) ||
+		    opcode == OP(RDMA_WRITE_LAST) ||
+		    opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
+			goto nack_inv;
+		/*
+		 * Note that it is up to the requester to not send a new
+		 * RDMA read or atomic operation before receiving an ACK
+		 * for the previous operation.
+		 */
+		break;
+	}
+
+	memset(&wc, 0, sizeof wc);
+
+	/* OK, process the packet. */
+	switch (opcode) {
+	case OP(SEND_FIRST):
+		if (!ipath_get_rwqe(qp, 0))
+			goto rnr_nak;
+		qp->r_rcv_len = 0;
+		/* FALLTHROUGH */
+	case OP(SEND_MIDDLE):
+	case OP(RDMA_WRITE_MIDDLE):
+	send_middle:
+		/* Check for invalid length PMTU or posted rwqe len. */
+		if (unlikely(tlen != (hdrsize + pmtu + 4)))
+			goto nack_inv;
+		qp->r_rcv_len += pmtu;
+		if (unlikely(qp->r_rcv_len > qp->r_len))
+			goto nack_inv;
+		ipath_copy_sge(&qp->r_sge, data, pmtu);
+		break;
+
+	case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
+		/* consume RWQE */
+		if (!ipath_get_rwqe(qp, 1))
+			goto rnr_nak;
+		goto send_last_imm;
+
+	case OP(SEND_ONLY):
+	case OP(SEND_ONLY_WITH_IMMEDIATE):
+		if (!ipath_get_rwqe(qp, 0))
+			goto rnr_nak;
+		qp->r_rcv_len = 0;
+		if (opcode == OP(SEND_ONLY))
+			goto send_last;
+		/* FALLTHROUGH */
+	case OP(SEND_LAST_WITH_IMMEDIATE):
+	send_last_imm:
+		if (header_in_data) {
+			wc.ex.imm_data = *(__be32 *) data;
+			data += sizeof(__be32);
+		} else {
+			/* Immediate data comes after BTH */
+			wc.ex.imm_data = ohdr->u.imm_data;
+		}
+		hdrsize += 4;
+		wc.wc_flags = IB_WC_WITH_IMM;
+		/* FALLTHROUGH */
+	case OP(SEND_LAST):
+	case OP(RDMA_WRITE_LAST):
+	send_last:
+		/* Get the number of bytes the message was padded by. */
+		pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+		/* Check for invalid length. */
+		/* XXX LAST len should be >= 1 */
+		if (unlikely(tlen < (hdrsize + pad + 4)))
+			goto nack_inv;
+		/* Don't count the CRC. */
+		tlen -= (hdrsize + pad + 4);
+		wc.byte_len = tlen + qp->r_rcv_len;
+		if (unlikely(wc.byte_len > qp->r_len))
+			goto nack_inv;
+		ipath_copy_sge(&qp->r_sge, data, tlen);
+		qp->r_msn++;
+		if (!test_and_clear_bit(IPATH_R_WRID_VALID, &qp->r_aflags))
+			break;
+		wc.wr_id = qp->r_wr_id;
+		wc.status = IB_WC_SUCCESS;
+		if (opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE) ||
+		    opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE))
+			wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
+		else
+			wc.opcode = IB_WC_RECV;
+		wc.qp = &qp->ibqp;
+		wc.src_qp = qp->remote_qpn;
+		wc.slid = qp->remote_ah_attr.dlid;
+		wc.sl = qp->remote_ah_attr.sl;
+		/* Signal completion event if the solicited bit is set. */
+		ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
+			       (ohdr->bth[0] &
+				cpu_to_be32(1 << 23)) != 0);
+		break;
+
+	case OP(RDMA_WRITE_FIRST):
+	case OP(RDMA_WRITE_ONLY):
+	case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):
+		if (unlikely(!(qp->qp_access_flags &
+			       IB_ACCESS_REMOTE_WRITE)))
+			goto nack_inv;
+		/* consume RWQE */
+		/* RETH comes after BTH */
+		if (!header_in_data)
+			reth = &ohdr->u.rc.reth;
+		else {
+			reth = (struct ib_reth *)data;
+			data += sizeof(*reth);
+		}
+		hdrsize += sizeof(*reth);
+		qp->r_len = be32_to_cpu(reth->length);
+		qp->r_rcv_len = 0;
+		if (qp->r_len != 0) {
+			u32 rkey = be32_to_cpu(reth->rkey);
+			u64 vaddr = be64_to_cpu(reth->vaddr);
+			int ok;
+
+			/* Check rkey & NAK */
+			ok = ipath_rkey_ok(qp, &qp->r_sge,
+					   qp->r_len, vaddr, rkey,
+					   IB_ACCESS_REMOTE_WRITE);
+			if (unlikely(!ok))
+				goto nack_acc;
+		} else {
+			qp->r_sge.sg_list = NULL;
+			qp->r_sge.sge.mr = NULL;
+			qp->r_sge.sge.vaddr = NULL;
+			qp->r_sge.sge.length = 0;
+			qp->r_sge.sge.sge_length = 0;
+		}
+		if (opcode == OP(RDMA_WRITE_FIRST))
+			goto send_middle;
+		else if (opcode == OP(RDMA_WRITE_ONLY))
+			goto send_last;
+		if (!ipath_get_rwqe(qp, 1))
+			goto rnr_nak;
+		goto send_last_imm;
+
+	case OP(RDMA_READ_REQUEST): {
+		struct ipath_ack_entry *e;
+		u32 len;
+		u8 next;
+
+		if (unlikely(!(qp->qp_access_flags &
+			       IB_ACCESS_REMOTE_READ)))
+			goto nack_inv;
+		next = qp->r_head_ack_queue + 1;
+		if (next > IPATH_MAX_RDMA_ATOMIC)
+			next = 0;
+		spin_lock_irqsave(&qp->s_lock, flags);
+		/* Double check we can process this while holding the s_lock. */
+		if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK))
+			goto unlock;
+		if (unlikely(next == qp->s_tail_ack_queue)) {
+			if (!qp->s_ack_queue[next].sent)
+				goto nack_inv_unlck;
+			ipath_update_ack_queue(qp, next);
+		}
+		e = &qp->s_ack_queue[qp->r_head_ack_queue];
+		/* RETH comes after BTH */
+		if (!header_in_data)
+			reth = &ohdr->u.rc.reth;
+		else {
+			reth = (struct ib_reth *)data;
+			data += sizeof(*reth);
+		}
+		len = be32_to_cpu(reth->length);
+		if (len) {
+			u32 rkey = be32_to_cpu(reth->rkey);
+			u64 vaddr = be64_to_cpu(reth->vaddr);
+			int ok;
+
+			/* Check rkey & NAK */
+			ok = ipath_rkey_ok(qp, &e->rdma_sge, len, vaddr,
+					   rkey, IB_ACCESS_REMOTE_READ);
+			if (unlikely(!ok))
+				goto nack_acc_unlck;
+			/*
+			 * Update the next expected PSN.  We add 1 later
+			 * below, so only add the remainder here.
+			 */
+			if (len > pmtu)
+				qp->r_psn += (len - 1) / pmtu;
+		} else {
+			e->rdma_sge.sg_list = NULL;
+			e->rdma_sge.num_sge = 0;
+			e->rdma_sge.sge.mr = NULL;
+			e->rdma_sge.sge.vaddr = NULL;
+			e->rdma_sge.sge.length = 0;
+			e->rdma_sge.sge.sge_length = 0;
+		}
+		e->opcode = opcode;
+		e->sent = 0;
+		e->psn = psn;
+		/*
+		 * We need to increment the MSN here instead of when we
+		 * finish sending the result since a duplicate request would
+		 * increment it more than once.
+		 */
+		qp->r_msn++;
+		qp->r_psn++;
+		qp->r_state = opcode;
+		qp->r_nak_state = 0;
+		qp->r_head_ack_queue = next;
+
+		/* Schedule the send tasklet. */
+		ipath_schedule_send(qp);
+
+		goto unlock;
+	}
+
+	case OP(COMPARE_SWAP):
+	case OP(FETCH_ADD): {
+		struct ib_atomic_eth *ateth;
+		struct ipath_ack_entry *e;
+		u64 vaddr;
+		atomic64_t *maddr;
+		u64 sdata;
+		u32 rkey;
+		u8 next;
+
+		if (unlikely(!(qp->qp_access_flags &
+			       IB_ACCESS_REMOTE_ATOMIC)))
+			goto nack_inv;
+		next = qp->r_head_ack_queue + 1;
+		if (next > IPATH_MAX_RDMA_ATOMIC)
+			next = 0;
+		spin_lock_irqsave(&qp->s_lock, flags);
+		/* Double check we can process this while holding the s_lock. */
+		if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK))
+			goto unlock;
+		if (unlikely(next == qp->s_tail_ack_queue)) {
+			if (!qp->s_ack_queue[next].sent)
+				goto nack_inv_unlck;
+			ipath_update_ack_queue(qp, next);
+		}
+		if (!header_in_data)
+			ateth = &ohdr->u.atomic_eth;
+		else
+			ateth = (struct ib_atomic_eth *)data;
+		vaddr = ((u64) be32_to_cpu(ateth->vaddr[0]) << 32) |
+			be32_to_cpu(ateth->vaddr[1]);
+		if (unlikely(vaddr & (sizeof(u64) - 1)))
+			goto nack_inv_unlck;
+		rkey = be32_to_cpu(ateth->rkey);
+		/* Check rkey & NAK */
+		if (unlikely(!ipath_rkey_ok(qp, &qp->r_sge,
+					    sizeof(u64), vaddr, rkey,
+					    IB_ACCESS_REMOTE_ATOMIC)))
+			goto nack_acc_unlck;
+		/* Perform atomic OP and save result. */
+		maddr = (atomic64_t *) qp->r_sge.sge.vaddr;
+		sdata = be64_to_cpu(ateth->swap_data);
+		e = &qp->s_ack_queue[qp->r_head_ack_queue];
+		e->atomic_data = (opcode == OP(FETCH_ADD)) ?
+			(u64) atomic64_add_return(sdata, maddr) - sdata :
+			(u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr,
+				      be64_to_cpu(ateth->compare_data),
+				      sdata);
+		e->opcode = opcode;
+		e->sent = 0;
+		e->psn = psn & IPATH_PSN_MASK;
+		qp->r_msn++;
+		qp->r_psn++;
+		qp->r_state = opcode;
+		qp->r_nak_state = 0;
+		qp->r_head_ack_queue = next;
+
+		/* Schedule the send tasklet. */
+		ipath_schedule_send(qp);
+
+		goto unlock;
+	}
+
+	default:
+		/* NAK unknown opcodes. */
+		goto nack_inv;
+	}
+	qp->r_psn++;
+	qp->r_state = opcode;
+	qp->r_ack_psn = psn;
+	qp->r_nak_state = 0;
+	/* Send an ACK if requested or required. */
+	if (psn & (1 << 31))
+		goto send_ack;
+	goto done;
+
+rnr_nak:
+	qp->r_nak_state = IB_RNR_NAK | qp->r_min_rnr_timer;
+	qp->r_ack_psn = qp->r_psn;
+	goto send_ack;
+
+nack_inv_unlck:
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+nack_inv:
+	ipath_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
+	qp->r_nak_state = IB_NAK_INVALID_REQUEST;
+	qp->r_ack_psn = qp->r_psn;
+	goto send_ack;
+
+nack_acc_unlck:
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+nack_acc:
+	ipath_rc_error(qp, IB_WC_LOC_PROT_ERR);
+	qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
+	qp->r_ack_psn = qp->r_psn;
+send_ack:
+	send_rc_ack(qp);
+	goto done;
+
+unlock:
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+done:
+	return;
+}
diff --git a/drivers/infiniband/hw/ipath/ipath_registers.h b/drivers/infiniband/hw/ipath/ipath_registers.h
new file mode 100644
index 0000000..8f44d0c
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_registers.h
@@ -0,0 +1,512 @@
+/*
+ * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _IPATH_REGISTERS_H
+#define _IPATH_REGISTERS_H
+
+/*
+ * This file should only be included by kernel source, and by the diags.  It
+ * defines the registers, and their contents, for InfiniPath chips.
+ */
+
+/*
+ * These are the InfiniPath register and buffer bit definitions,
+ * that are visible to software, and needed only by the kernel
+ * and diag code.  A few, that are visible to protocol and user
+ * code are in ipath_common.h.  Some bits are specific
+ * to a given chip implementation, and have been moved to the
+ * chip-specific source file
+ */
+
+/* kr_revision bits */
+#define INFINIPATH_R_CHIPREVMINOR_MASK 0xFF
+#define INFINIPATH_R_CHIPREVMINOR_SHIFT 0
+#define INFINIPATH_R_CHIPREVMAJOR_MASK 0xFF
+#define INFINIPATH_R_CHIPREVMAJOR_SHIFT 8
+#define INFINIPATH_R_ARCH_MASK 0xFF
+#define INFINIPATH_R_ARCH_SHIFT 16
+#define INFINIPATH_R_SOFTWARE_MASK 0xFF
+#define INFINIPATH_R_SOFTWARE_SHIFT 24
+#define INFINIPATH_R_BOARDID_MASK 0xFF
+#define INFINIPATH_R_BOARDID_SHIFT 32
+
+/* kr_control bits */
+#define INFINIPATH_C_FREEZEMODE 0x00000002
+#define INFINIPATH_C_LINKENABLE 0x00000004
+
+/* kr_sendctrl bits */
+#define INFINIPATH_S_DISARMPIOBUF_SHIFT 16
+#define INFINIPATH_S_UPDTHRESH_SHIFT 24
+#define INFINIPATH_S_UPDTHRESH_MASK 0x1f
+
+#define IPATH_S_ABORT		0
+#define IPATH_S_PIOINTBUFAVAIL	1
+#define IPATH_S_PIOBUFAVAILUPD	2
+#define IPATH_S_PIOENABLE	3
+#define IPATH_S_SDMAINTENABLE	9
+#define IPATH_S_SDMASINGLEDESCRIPTOR	10
+#define IPATH_S_SDMAENABLE	11
+#define IPATH_S_SDMAHALT	12
+#define IPATH_S_DISARM		31
+
+#define INFINIPATH_S_ABORT		(1U << IPATH_S_ABORT)
+#define INFINIPATH_S_PIOINTBUFAVAIL	(1U << IPATH_S_PIOINTBUFAVAIL)
+#define INFINIPATH_S_PIOBUFAVAILUPD	(1U << IPATH_S_PIOBUFAVAILUPD)
+#define INFINIPATH_S_PIOENABLE		(1U << IPATH_S_PIOENABLE)
+#define INFINIPATH_S_SDMAINTENABLE	(1U << IPATH_S_SDMAINTENABLE)
+#define INFINIPATH_S_SDMASINGLEDESCRIPTOR \
+					(1U << IPATH_S_SDMASINGLEDESCRIPTOR)
+#define INFINIPATH_S_SDMAENABLE		(1U << IPATH_S_SDMAENABLE)
+#define INFINIPATH_S_SDMAHALT		(1U << IPATH_S_SDMAHALT)
+#define INFINIPATH_S_DISARM		(1U << IPATH_S_DISARM)
+
+/* kr_rcvctrl bits that are the same on multiple chips */
+#define INFINIPATH_R_PORTENABLE_SHIFT 0
+#define INFINIPATH_R_QPMAP_ENABLE (1ULL << 38)
+
+/* kr_intstatus, kr_intclear, kr_intmask bits */
+#define INFINIPATH_I_SDMAINT		0x8000000000000000ULL
+#define INFINIPATH_I_SDMADISABLED	0x4000000000000000ULL
+#define INFINIPATH_I_ERROR		0x0000000080000000ULL
+#define INFINIPATH_I_SPIOSENT		0x0000000040000000ULL
+#define INFINIPATH_I_SPIOBUFAVAIL	0x0000000020000000ULL
+#define INFINIPATH_I_GPIO		0x0000000010000000ULL
+#define INFINIPATH_I_JINT		0x0000000004000000ULL
+
+/* kr_errorstatus, kr_errorclear, kr_errormask bits */
+#define INFINIPATH_E_RFORMATERR			0x0000000000000001ULL
+#define INFINIPATH_E_RVCRC			0x0000000000000002ULL
+#define INFINIPATH_E_RICRC			0x0000000000000004ULL
+#define INFINIPATH_E_RMINPKTLEN			0x0000000000000008ULL
+#define INFINIPATH_E_RMAXPKTLEN			0x0000000000000010ULL
+#define INFINIPATH_E_RLONGPKTLEN		0x0000000000000020ULL
+#define INFINIPATH_E_RSHORTPKTLEN		0x0000000000000040ULL
+#define INFINIPATH_E_RUNEXPCHAR			0x0000000000000080ULL
+#define INFINIPATH_E_RUNSUPVL			0x0000000000000100ULL
+#define INFINIPATH_E_REBP			0x0000000000000200ULL
+#define INFINIPATH_E_RIBFLOW			0x0000000000000400ULL
+#define INFINIPATH_E_RBADVERSION		0x0000000000000800ULL
+#define INFINIPATH_E_RRCVEGRFULL		0x0000000000001000ULL
+#define INFINIPATH_E_RRCVHDRFULL		0x0000000000002000ULL
+#define INFINIPATH_E_RBADTID			0x0000000000004000ULL
+#define INFINIPATH_E_RHDRLEN			0x0000000000008000ULL
+#define INFINIPATH_E_RHDR			0x0000000000010000ULL
+#define INFINIPATH_E_RIBLOSTLINK		0x0000000000020000ULL
+#define INFINIPATH_E_SENDSPECIALTRIGGER		0x0000000008000000ULL
+#define INFINIPATH_E_SDMADISABLED		0x0000000010000000ULL
+#define INFINIPATH_E_SMINPKTLEN			0x0000000020000000ULL
+#define INFINIPATH_E_SMAXPKTLEN			0x0000000040000000ULL
+#define INFINIPATH_E_SUNDERRUN			0x0000000080000000ULL
+#define INFINIPATH_E_SPKTLEN			0x0000000100000000ULL
+#define INFINIPATH_E_SDROPPEDSMPPKT		0x0000000200000000ULL
+#define INFINIPATH_E_SDROPPEDDATAPKT		0x0000000400000000ULL
+#define INFINIPATH_E_SPIOARMLAUNCH		0x0000000800000000ULL
+#define INFINIPATH_E_SUNEXPERRPKTNUM		0x0000001000000000ULL
+#define INFINIPATH_E_SUNSUPVL			0x0000002000000000ULL
+#define INFINIPATH_E_SENDBUFMISUSE		0x0000004000000000ULL
+#define INFINIPATH_E_SDMAGENMISMATCH		0x0000008000000000ULL
+#define INFINIPATH_E_SDMAOUTOFBOUND		0x0000010000000000ULL
+#define INFINIPATH_E_SDMATAILOUTOFBOUND		0x0000020000000000ULL
+#define INFINIPATH_E_SDMABASE			0x0000040000000000ULL
+#define INFINIPATH_E_SDMA1STDESC		0x0000080000000000ULL
+#define INFINIPATH_E_SDMARPYTAG			0x0000100000000000ULL
+#define INFINIPATH_E_SDMADWEN			0x0000200000000000ULL
+#define INFINIPATH_E_SDMAMISSINGDW		0x0000400000000000ULL
+#define INFINIPATH_E_SDMAUNEXPDATA		0x0000800000000000ULL
+#define INFINIPATH_E_IBSTATUSCHANGED		0x0001000000000000ULL
+#define INFINIPATH_E_INVALIDADDR		0x0002000000000000ULL
+#define INFINIPATH_E_RESET			0x0004000000000000ULL
+#define INFINIPATH_E_HARDWARE			0x0008000000000000ULL
+#define INFINIPATH_E_SDMADESCADDRMISALIGN	0x0010000000000000ULL
+#define INFINIPATH_E_INVALIDEEPCMD		0x0020000000000000ULL
+
+/*
+ * this is used to print "common" packet errors only when the
+ * __IPATH_ERRPKTDBG bit is set in ipath_debug.
+ */
+#define INFINIPATH_E_PKTERRS ( INFINIPATH_E_SPKTLEN \
+		| INFINIPATH_E_SDROPPEDDATAPKT | INFINIPATH_E_RVCRC \
+		| INFINIPATH_E_RICRC | INFINIPATH_E_RSHORTPKTLEN \
+		| INFINIPATH_E_REBP )
+
+/* Convenience for decoding Send DMA errors */
+#define INFINIPATH_E_SDMAERRS ( \
+	INFINIPATH_E_SDMAGENMISMATCH | INFINIPATH_E_SDMAOUTOFBOUND | \
+	INFINIPATH_E_SDMATAILOUTOFBOUND | INFINIPATH_E_SDMABASE | \
+	INFINIPATH_E_SDMA1STDESC | INFINIPATH_E_SDMARPYTAG | \
+	INFINIPATH_E_SDMADWEN | INFINIPATH_E_SDMAMISSINGDW | \
+	INFINIPATH_E_SDMAUNEXPDATA | \
+	INFINIPATH_E_SDMADESCADDRMISALIGN | \
+	INFINIPATH_E_SDMADISABLED | \
+	INFINIPATH_E_SENDBUFMISUSE)
+
+/* kr_hwerrclear, kr_hwerrmask, kr_hwerrstatus, bits */
+/* TXEMEMPARITYERR bit 0: PIObuf, 1: PIOpbc, 2: launchfifo
+ * RXEMEMPARITYERR bit 0: rcvbuf, 1: lookupq, 2:  expTID, 3: eagerTID
+ * 		bit 4: flag buffer, 5: datainfo, 6: header info */
+#define INFINIPATH_HWE_TXEMEMPARITYERR_MASK 0xFULL
+#define INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT 40
+#define INFINIPATH_HWE_RXEMEMPARITYERR_MASK 0x7FULL
+#define INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT 44
+#define INFINIPATH_HWE_IBCBUSTOSPCPARITYERR 0x4000000000000000ULL
+#define INFINIPATH_HWE_IBCBUSFRSPCPARITYERR 0x8000000000000000ULL
+/* txe mem parity errors (shift by INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT) */
+#define INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF	0x1ULL
+#define INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC	0x2ULL
+#define INFINIPATH_HWE_TXEMEMPARITYERR_PIOLAUNCHFIFO 0x4ULL
+/* rxe mem parity errors (shift by INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT) */
+#define INFINIPATH_HWE_RXEMEMPARITYERR_RCVBUF   0x01ULL
+#define INFINIPATH_HWE_RXEMEMPARITYERR_LOOKUPQ  0x02ULL
+#define INFINIPATH_HWE_RXEMEMPARITYERR_EXPTID   0x04ULL
+#define INFINIPATH_HWE_RXEMEMPARITYERR_EAGERTID 0x08ULL
+#define INFINIPATH_HWE_RXEMEMPARITYERR_FLAGBUF  0x10ULL
+#define INFINIPATH_HWE_RXEMEMPARITYERR_DATAINFO 0x20ULL
+#define INFINIPATH_HWE_RXEMEMPARITYERR_HDRINFO  0x40ULL
+/* waldo specific -- find the rest in ipath_6110.c */
+#define INFINIPATH_HWE_RXDSYNCMEMPARITYERR  0x0000000400000000ULL
+/* 6120/7220 specific -- find the rest in ipath_6120.c and ipath_7220.c */
+#define INFINIPATH_HWE_MEMBISTFAILED	0x0040000000000000ULL
+
+/* kr_hwdiagctrl bits */
+#define INFINIPATH_DC_FORCETXEMEMPARITYERR_MASK 0xFULL
+#define INFINIPATH_DC_FORCETXEMEMPARITYERR_SHIFT 40
+#define INFINIPATH_DC_FORCERXEMEMPARITYERR_MASK 0x7FULL
+#define INFINIPATH_DC_FORCERXEMEMPARITYERR_SHIFT 44
+#define INFINIPATH_DC_FORCERXDSYNCMEMPARITYERR  0x0000000400000000ULL
+#define INFINIPATH_DC_COUNTERDISABLE            0x1000000000000000ULL
+#define INFINIPATH_DC_COUNTERWREN               0x2000000000000000ULL
+#define INFINIPATH_DC_FORCEIBCBUSTOSPCPARITYERR 0x4000000000000000ULL
+#define INFINIPATH_DC_FORCEIBCBUSFRSPCPARITYERR 0x8000000000000000ULL
+
+/* kr_ibcctrl bits */
+#define INFINIPATH_IBCC_FLOWCTRLPERIOD_MASK 0xFFULL
+#define INFINIPATH_IBCC_FLOWCTRLPERIOD_SHIFT 0
+#define INFINIPATH_IBCC_FLOWCTRLWATERMARK_MASK 0xFFULL
+#define INFINIPATH_IBCC_FLOWCTRLWATERMARK_SHIFT 8
+#define INFINIPATH_IBCC_LINKINITCMD_MASK 0x3ULL
+#define INFINIPATH_IBCC_LINKINITCMD_DISABLE 1
+/* cycle through TS1/TS2 till OK */
+#define INFINIPATH_IBCC_LINKINITCMD_POLL 2
+/* wait for TS1, then go on */
+#define INFINIPATH_IBCC_LINKINITCMD_SLEEP 3
+#define INFINIPATH_IBCC_LINKINITCMD_SHIFT 16
+#define INFINIPATH_IBCC_LINKCMD_MASK 0x3ULL
+#define INFINIPATH_IBCC_LINKCMD_DOWN 1		/* move to 0x11 */
+#define INFINIPATH_IBCC_LINKCMD_ARMED 2		/* move to 0x21 */
+#define INFINIPATH_IBCC_LINKCMD_ACTIVE 3	/* move to 0x31 */
+#define INFINIPATH_IBCC_LINKCMD_SHIFT 18
+#define INFINIPATH_IBCC_MAXPKTLEN_MASK 0x7FFULL
+#define INFINIPATH_IBCC_MAXPKTLEN_SHIFT 20
+#define INFINIPATH_IBCC_PHYERRTHRESHOLD_MASK 0xFULL
+#define INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT 32
+#define INFINIPATH_IBCC_OVERRUNTHRESHOLD_MASK 0xFULL
+#define INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT 36
+#define INFINIPATH_IBCC_CREDITSCALE_MASK 0x7ULL
+#define INFINIPATH_IBCC_CREDITSCALE_SHIFT 40
+#define INFINIPATH_IBCC_LOOPBACK             0x8000000000000000ULL
+#define INFINIPATH_IBCC_LINKDOWNDEFAULTSTATE 0x4000000000000000ULL
+
+/* kr_ibcstatus bits */
+#define INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT 0
+#define INFINIPATH_IBCS_LINKSTATE_MASK 0x7
+
+#define INFINIPATH_IBCS_TXREADY       0x40000000
+#define INFINIPATH_IBCS_TXCREDITOK    0x80000000
+/* link training states (shift by
+   INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) */
+#define INFINIPATH_IBCS_LT_STATE_DISABLED	0x00
+#define INFINIPATH_IBCS_LT_STATE_LINKUP		0x01
+#define INFINIPATH_IBCS_LT_STATE_POLLACTIVE	0x02
+#define INFINIPATH_IBCS_LT_STATE_POLLQUIET	0x03
+#define INFINIPATH_IBCS_LT_STATE_SLEEPDELAY	0x04
+#define INFINIPATH_IBCS_LT_STATE_SLEEPQUIET	0x05
+#define INFINIPATH_IBCS_LT_STATE_CFGDEBOUNCE	0x08
+#define INFINIPATH_IBCS_LT_STATE_CFGRCVFCFG	0x09
+#define INFINIPATH_IBCS_LT_STATE_CFGWAITRMT	0x0a
+#define INFINIPATH_IBCS_LT_STATE_CFGIDLE	0x0b
+#define INFINIPATH_IBCS_LT_STATE_RECOVERRETRAIN	0x0c
+#define INFINIPATH_IBCS_LT_STATE_RECOVERWAITRMT	0x0e
+#define INFINIPATH_IBCS_LT_STATE_RECOVERIDLE	0x0f
+/* link state machine states (shift by ibcs_ls_shift) */
+#define INFINIPATH_IBCS_L_STATE_DOWN		0x0
+#define INFINIPATH_IBCS_L_STATE_INIT		0x1
+#define INFINIPATH_IBCS_L_STATE_ARM		0x2
+#define INFINIPATH_IBCS_L_STATE_ACTIVE		0x3
+#define INFINIPATH_IBCS_L_STATE_ACT_DEFER	0x4
+
+
+/* kr_extstatus bits */
+#define INFINIPATH_EXTS_SERDESPLLLOCK 0x1
+#define INFINIPATH_EXTS_GPIOIN_MASK 0xFFFFULL
+#define INFINIPATH_EXTS_GPIOIN_SHIFT 48
+
+/* kr_extctrl bits */
+#define INFINIPATH_EXTC_GPIOINVERT_MASK 0xFFFFULL
+#define INFINIPATH_EXTC_GPIOINVERT_SHIFT 32
+#define INFINIPATH_EXTC_GPIOOE_MASK 0xFFFFULL
+#define INFINIPATH_EXTC_GPIOOE_SHIFT 48
+#define INFINIPATH_EXTC_SERDESENABLE         0x80000000ULL
+#define INFINIPATH_EXTC_SERDESCONNECT        0x40000000ULL
+#define INFINIPATH_EXTC_SERDESENTRUNKING     0x20000000ULL
+#define INFINIPATH_EXTC_SERDESDISRXFIFO      0x10000000ULL
+#define INFINIPATH_EXTC_SERDESENPLPBK1       0x08000000ULL
+#define INFINIPATH_EXTC_SERDESENPLPBK2       0x04000000ULL
+#define INFINIPATH_EXTC_SERDESENENCDEC       0x02000000ULL
+#define INFINIPATH_EXTC_LED1SECPORT_ON       0x00000020ULL
+#define INFINIPATH_EXTC_LED2SECPORT_ON       0x00000010ULL
+#define INFINIPATH_EXTC_LED1PRIPORT_ON       0x00000008ULL
+#define INFINIPATH_EXTC_LED2PRIPORT_ON       0x00000004ULL
+#define INFINIPATH_EXTC_LEDGBLOK_ON          0x00000002ULL
+#define INFINIPATH_EXTC_LEDGBLERR_OFF        0x00000001ULL
+
+/* kr_partitionkey bits */
+#define INFINIPATH_PKEY_SIZE 16
+#define INFINIPATH_PKEY_MASK 0xFFFF
+#define INFINIPATH_PKEY_DEFAULT_PKEY 0xFFFF
+
+/* kr_serdesconfig0 bits */
+#define INFINIPATH_SERDC0_RESET_MASK  0xfULL	/* overal reset bits */
+#define INFINIPATH_SERDC0_RESET_PLL   0x10000000ULL	/* pll reset */
+/* tx idle enables (per lane) */
+#define INFINIPATH_SERDC0_TXIDLE      0xF000ULL
+/* rx detect enables (per lane) */
+#define INFINIPATH_SERDC0_RXDETECT_EN 0xF0000ULL
+/* L1 Power down; use with RXDETECT, Otherwise not used on IB side */
+#define INFINIPATH_SERDC0_L1PWR_DN	 0xF0ULL
+
+/* common kr_xgxsconfig bits (or safe in all, even if not implemented) */
+#define INFINIPATH_XGXS_RX_POL_SHIFT 19
+#define INFINIPATH_XGXS_RX_POL_MASK 0xfULL
+
+
+/*
+ * IPATH_PIO_MAXIBHDR is the max IB header size allowed for in our
+ * PIO send buffers.  This is well beyond anything currently
+ * defined in the InfiniBand spec.
+ */
+#define IPATH_PIO_MAXIBHDR 128
+
+typedef u64 ipath_err_t;
+
+/* The following change with the type of device, so
+ * need to be part of the ipath_devdata struct, or
+ * we could have problems plugging in devices of
+ * different types (e.g. one HT, one PCIE)
+ * in one system, to be managed by one driver.
+ * On the other hand, this file is may also be included
+ * by other code, so leave the declarations here
+ * temporarily. Minor footprint issue if common-model
+ * linker used, none if C89+ linker used.
+ */
+
+/* mask of defined bits for various registers */
+extern u64 infinipath_i_bitsextant;
+extern ipath_err_t infinipath_e_bitsextant, infinipath_hwe_bitsextant;
+
+/* masks that are different in various chips, or only exist in some chips */
+extern u32 infinipath_i_rcvavail_mask, infinipath_i_rcvurg_mask;
+
+/*
+ * These are the infinipath general register numbers (not offsets).
+ * The kernel registers are used directly, those beyond the kernel
+ * registers are calculated from one of the base registers.  The use of
+ * an integer type doesn't allow type-checking as thorough as, say,
+ * an enum but allows for better hiding of chip differences.
+ */
+typedef const u16 ipath_kreg,	/* infinipath general registers */
+ ipath_creg,			/* infinipath counter registers */
+ ipath_sreg;			/* kernel-only, infinipath send registers */
+
+/*
+ * These are the chip registers common to all infinipath chips, and
+ * used both by the kernel and the diagnostics or other user code.
+ * They are all implemented such that 64 bit accesses work.
+ * Some implement no more than 32 bits.  Because 64 bit reads
+ * require 2 HT cmds on opteron, we access those with 32 bit
+ * reads for efficiency (they are written as 64 bits, since
+ * the extra 32 bits are nearly free on writes, and it slightly reduces
+ * complexity).  The rest are all accessed as 64 bits.
+ */
+struct ipath_kregs {
+	/* These are the 32 bit group */
+	ipath_kreg kr_control;
+	ipath_kreg kr_counterregbase;
+	ipath_kreg kr_intmask;
+	ipath_kreg kr_intstatus;
+	ipath_kreg kr_pagealign;
+	ipath_kreg kr_portcnt;
+	ipath_kreg kr_rcvtidbase;
+	ipath_kreg kr_rcvtidcnt;
+	ipath_kreg kr_rcvegrbase;
+	ipath_kreg kr_rcvegrcnt;
+	ipath_kreg kr_scratch;
+	ipath_kreg kr_sendctrl;
+	ipath_kreg kr_sendpiobufbase;
+	ipath_kreg kr_sendpiobufcnt;
+	ipath_kreg kr_sendpiosize;
+	ipath_kreg kr_sendregbase;
+	ipath_kreg kr_userregbase;
+	/* These are the 64 bit group */
+	ipath_kreg kr_debugport;
+	ipath_kreg kr_debugportselect;
+	ipath_kreg kr_errorclear;
+	ipath_kreg kr_errormask;
+	ipath_kreg kr_errorstatus;
+	ipath_kreg kr_extctrl;
+	ipath_kreg kr_extstatus;
+	ipath_kreg kr_gpio_clear;
+	ipath_kreg kr_gpio_mask;
+	ipath_kreg kr_gpio_out;
+	ipath_kreg kr_gpio_status;
+	ipath_kreg kr_hwdiagctrl;
+	ipath_kreg kr_hwerrclear;
+	ipath_kreg kr_hwerrmask;
+	ipath_kreg kr_hwerrstatus;
+	ipath_kreg kr_ibcctrl;
+	ipath_kreg kr_ibcstatus;
+	ipath_kreg kr_intblocked;
+	ipath_kreg kr_intclear;
+	ipath_kreg kr_interruptconfig;
+	ipath_kreg kr_mdio;
+	ipath_kreg kr_partitionkey;
+	ipath_kreg kr_rcvbthqp;
+	ipath_kreg kr_rcvbufbase;
+	ipath_kreg kr_rcvbufsize;
+	ipath_kreg kr_rcvctrl;
+	ipath_kreg kr_rcvhdrcnt;
+	ipath_kreg kr_rcvhdrentsize;
+	ipath_kreg kr_rcvhdrsize;
+	ipath_kreg kr_rcvintmembase;
+	ipath_kreg kr_rcvintmemsize;
+	ipath_kreg kr_revision;
+	ipath_kreg kr_sendbuffererror;
+	ipath_kreg kr_sendpioavailaddr;
+	ipath_kreg kr_serdesconfig0;
+	ipath_kreg kr_serdesconfig1;
+	ipath_kreg kr_serdesstatus;
+	ipath_kreg kr_txintmembase;
+	ipath_kreg kr_txintmemsize;
+	ipath_kreg kr_xgxsconfig;
+	ipath_kreg kr_ibpllcfg;
+	/* use these two (and the following N ports) only with
+	 * ipath_k*_kreg64_port(); not *kreg64() */
+	ipath_kreg kr_rcvhdraddr;
+	ipath_kreg kr_rcvhdrtailaddr;
+
+	/* remaining registers are not present on all types of infinipath
+	   chips  */
+	ipath_kreg kr_rcvpktledcnt;
+	ipath_kreg kr_pcierbuftestreg0;
+	ipath_kreg kr_pcierbuftestreg1;
+	ipath_kreg kr_pcieq0serdesconfig0;
+	ipath_kreg kr_pcieq0serdesconfig1;
+	ipath_kreg kr_pcieq0serdesstatus;
+	ipath_kreg kr_pcieq1serdesconfig0;
+	ipath_kreg kr_pcieq1serdesconfig1;
+	ipath_kreg kr_pcieq1serdesstatus;
+	ipath_kreg kr_hrtbt_guid;
+	ipath_kreg kr_ibcddrctrl;
+	ipath_kreg kr_ibcddrstatus;
+	ipath_kreg kr_jintreload;
+
+	/* send dma related regs */
+	ipath_kreg kr_senddmabase;
+	ipath_kreg kr_senddmalengen;
+	ipath_kreg kr_senddmatail;
+	ipath_kreg kr_senddmahead;
+	ipath_kreg kr_senddmaheadaddr;
+	ipath_kreg kr_senddmabufmask0;
+	ipath_kreg kr_senddmabufmask1;
+	ipath_kreg kr_senddmabufmask2;
+	ipath_kreg kr_senddmastatus;
+
+	/* SerDes related regs (IBA7220-only) */
+	ipath_kreg kr_ibserdesctrl;
+	ipath_kreg kr_ib_epbacc;
+	ipath_kreg kr_ib_epbtrans;
+	ipath_kreg kr_pcie_epbacc;
+	ipath_kreg kr_pcie_epbtrans;
+	ipath_kreg kr_ib_ddsrxeq;
+};
+
+struct ipath_cregs {
+	ipath_creg cr_badformatcnt;
+	ipath_creg cr_erricrccnt;
+	ipath_creg cr_errlinkcnt;
+	ipath_creg cr_errlpcrccnt;
+	ipath_creg cr_errpkey;
+	ipath_creg cr_errrcvflowctrlcnt;
+	ipath_creg cr_err_rlencnt;
+	ipath_creg cr_errslencnt;
+	ipath_creg cr_errtidfull;
+	ipath_creg cr_errtidvalid;
+	ipath_creg cr_errvcrccnt;
+	ipath_creg cr_ibstatuschange;
+	ipath_creg cr_intcnt;
+	ipath_creg cr_invalidrlencnt;
+	ipath_creg cr_invalidslencnt;
+	ipath_creg cr_lbflowstallcnt;
+	ipath_creg cr_iblinkdowncnt;
+	ipath_creg cr_iblinkerrrecovcnt;
+	ipath_creg cr_ibsymbolerrcnt;
+	ipath_creg cr_pktrcvcnt;
+	ipath_creg cr_pktrcvflowctrlcnt;
+	ipath_creg cr_pktsendcnt;
+	ipath_creg cr_pktsendflowcnt;
+	ipath_creg cr_portovflcnt;
+	ipath_creg cr_rcvebpcnt;
+	ipath_creg cr_rcvovflcnt;
+	ipath_creg cr_rxdroppktcnt;
+	ipath_creg cr_senddropped;
+	ipath_creg cr_sendstallcnt;
+	ipath_creg cr_sendunderruncnt;
+	ipath_creg cr_unsupvlcnt;
+	ipath_creg cr_wordrcvcnt;
+	ipath_creg cr_wordsendcnt;
+	ipath_creg cr_vl15droppedpktcnt;
+	ipath_creg cr_rxotherlocalphyerrcnt;
+	ipath_creg cr_excessbufferovflcnt;
+	ipath_creg cr_locallinkintegrityerrcnt;
+	ipath_creg cr_rxvlerrcnt;
+	ipath_creg cr_rxdlidfltrcnt;
+	ipath_creg cr_psstat;
+	ipath_creg cr_psstart;
+	ipath_creg cr_psinterval;
+	ipath_creg cr_psrcvdatacount;
+	ipath_creg cr_psrcvpktscount;
+	ipath_creg cr_psxmitdatacount;
+	ipath_creg cr_psxmitpktscount;
+	ipath_creg cr_psxmitwaitcount;
+};
+
+#endif				/* _IPATH_REGISTERS_H */
diff --git a/drivers/infiniband/hw/ipath/ipath_ruc.c b/drivers/infiniband/hw/ipath/ipath_ruc.c
new file mode 100644
index 0000000..1f95bba
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_ruc.c
@@ -0,0 +1,734 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+
+#include "ipath_verbs.h"
+#include "ipath_kernel.h"
+
+/*
+ * Convert the AETH RNR timeout code into the number of milliseconds.
+ */
+const u32 ib_ipath_rnr_table[32] = {
+	656,			/* 0 */
+	1,			/* 1 */
+	1,			/* 2 */
+	1,			/* 3 */
+	1,			/* 4 */
+	1,			/* 5 */
+	1,			/* 6 */
+	1,			/* 7 */
+	1,			/* 8 */
+	1,			/* 9 */
+	1,			/* A */
+	1,			/* B */
+	1,			/* C */
+	1,			/* D */
+	2,			/* E */
+	2,			/* F */
+	3,			/* 10 */
+	4,			/* 11 */
+	6,			/* 12 */
+	8,			/* 13 */
+	11,			/* 14 */
+	16,			/* 15 */
+	21,			/* 16 */
+	31,			/* 17 */
+	41,			/* 18 */
+	62,			/* 19 */
+	82,			/* 1A */
+	123,			/* 1B */
+	164,			/* 1C */
+	246,			/* 1D */
+	328,			/* 1E */
+	492			/* 1F */
+};
+
+/**
+ * ipath_insert_rnr_queue - put QP on the RNR timeout list for the device
+ * @qp: the QP
+ *
+ * Called with the QP s_lock held and interrupts disabled.
+ * XXX Use a simple list for now.  We might need a priority
+ * queue if we have lots of QPs waiting for RNR timeouts
+ * but that should be rare.
+ */
+void ipath_insert_rnr_queue(struct ipath_qp *qp)
+{
+	struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+
+	/* We already did a spin_lock_irqsave(), so just use spin_lock */
+	spin_lock(&dev->pending_lock);
+	if (list_empty(&dev->rnrwait))
+		list_add(&qp->timerwait, &dev->rnrwait);
+	else {
+		struct list_head *l = &dev->rnrwait;
+		struct ipath_qp *nqp = list_entry(l->next, struct ipath_qp,
+						  timerwait);
+
+		while (qp->s_rnr_timeout >= nqp->s_rnr_timeout) {
+			qp->s_rnr_timeout -= nqp->s_rnr_timeout;
+			l = l->next;
+			if (l->next == &dev->rnrwait) {
+				nqp = NULL;
+				break;
+			}
+			nqp = list_entry(l->next, struct ipath_qp,
+					 timerwait);
+		}
+		if (nqp)
+			nqp->s_rnr_timeout -= qp->s_rnr_timeout;
+		list_add(&qp->timerwait, l);
+	}
+	spin_unlock(&dev->pending_lock);
+}
+
+/**
+ * ipath_init_sge - Validate a RWQE and fill in the SGE state
+ * @qp: the QP
+ *
+ * Return 1 if OK.
+ */
+int ipath_init_sge(struct ipath_qp *qp, struct ipath_rwqe *wqe,
+		   u32 *lengthp, struct ipath_sge_state *ss)
+{
+	int i, j, ret;
+	struct ib_wc wc;
+
+	*lengthp = 0;
+	for (i = j = 0; i < wqe->num_sge; i++) {
+		if (wqe->sg_list[i].length == 0)
+			continue;
+		/* Check LKEY */
+		if (!ipath_lkey_ok(qp, j ? &ss->sg_list[j - 1] : &ss->sge,
+				   &wqe->sg_list[i], IB_ACCESS_LOCAL_WRITE))
+			goto bad_lkey;
+		*lengthp += wqe->sg_list[i].length;
+		j++;
+	}
+	ss->num_sge = j;
+	ret = 1;
+	goto bail;
+
+bad_lkey:
+	memset(&wc, 0, sizeof(wc));
+	wc.wr_id = wqe->wr_id;
+	wc.status = IB_WC_LOC_PROT_ERR;
+	wc.opcode = IB_WC_RECV;
+	wc.qp = &qp->ibqp;
+	/* Signal solicited completion event. */
+	ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1);
+	ret = 0;
+bail:
+	return ret;
+}
+
+/**
+ * ipath_get_rwqe - copy the next RWQE into the QP's RWQE
+ * @qp: the QP
+ * @wr_id_only: update qp->r_wr_id only, not qp->r_sge
+ *
+ * Return 0 if no RWQE is available, otherwise return 1.
+ *
+ * Can be called from interrupt level.
+ */
+int ipath_get_rwqe(struct ipath_qp *qp, int wr_id_only)
+{
+	unsigned long flags;
+	struct ipath_rq *rq;
+	struct ipath_rwq *wq;
+	struct ipath_srq *srq;
+	struct ipath_rwqe *wqe;
+	void (*handler)(struct ib_event *, void *);
+	u32 tail;
+	int ret;
+
+	if (qp->ibqp.srq) {
+		srq = to_isrq(qp->ibqp.srq);
+		handler = srq->ibsrq.event_handler;
+		rq = &srq->rq;
+	} else {
+		srq = NULL;
+		handler = NULL;
+		rq = &qp->r_rq;
+	}
+
+	spin_lock_irqsave(&rq->lock, flags);
+	if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) {
+		ret = 0;
+		goto unlock;
+	}
+
+	wq = rq->wq;
+	tail = wq->tail;
+	/* Validate tail before using it since it is user writable. */
+	if (tail >= rq->size)
+		tail = 0;
+	do {
+		if (unlikely(tail == wq->head)) {
+			ret = 0;
+			goto unlock;
+		}
+		/* Make sure entry is read after head index is read. */
+		smp_rmb();
+		wqe = get_rwqe_ptr(rq, tail);
+		if (++tail >= rq->size)
+			tail = 0;
+		if (wr_id_only)
+			break;
+		qp->r_sge.sg_list = qp->r_sg_list;
+	} while (!ipath_init_sge(qp, wqe, &qp->r_len, &qp->r_sge));
+	qp->r_wr_id = wqe->wr_id;
+	wq->tail = tail;
+
+	ret = 1;
+	set_bit(IPATH_R_WRID_VALID, &qp->r_aflags);
+	if (handler) {
+		u32 n;
+
+		/*
+		 * validate head pointer value and compute
+		 * the number of remaining WQEs.
+		 */
+		n = wq->head;
+		if (n >= rq->size)
+			n = 0;
+		if (n < tail)
+			n += rq->size - tail;
+		else
+			n -= tail;
+		if (n < srq->limit) {
+			struct ib_event ev;
+
+			srq->limit = 0;
+			spin_unlock_irqrestore(&rq->lock, flags);
+			ev.device = qp->ibqp.device;
+			ev.element.srq = qp->ibqp.srq;
+			ev.event = IB_EVENT_SRQ_LIMIT_REACHED;
+			handler(&ev, srq->ibsrq.srq_context);
+			goto bail;
+		}
+	}
+unlock:
+	spin_unlock_irqrestore(&rq->lock, flags);
+bail:
+	return ret;
+}
+
+/**
+ * ipath_ruc_loopback - handle UC and RC lookback requests
+ * @sqp: the sending QP
+ *
+ * This is called from ipath_do_send() to
+ * forward a WQE addressed to the same HCA.
+ * Note that although we are single threaded due to the tasklet, we still
+ * have to protect against post_send().  We don't have to worry about
+ * receive interrupts since this is a connected protocol and all packets
+ * will pass through here.
+ */
+static void ipath_ruc_loopback(struct ipath_qp *sqp)
+{
+	struct ipath_ibdev *dev = to_idev(sqp->ibqp.device);
+	struct ipath_qp *qp;
+	struct ipath_swqe *wqe;
+	struct ipath_sge *sge;
+	unsigned long flags;
+	struct ib_wc wc;
+	u64 sdata;
+	atomic64_t *maddr;
+	enum ib_wc_status send_status;
+
+	/*
+	 * Note that we check the responder QP state after
+	 * checking the requester's state.
+	 */
+	qp = ipath_lookup_qpn(&dev->qp_table, sqp->remote_qpn);
+
+	spin_lock_irqsave(&sqp->s_lock, flags);
+
+	/* Return if we are already busy processing a work request. */
+	if ((sqp->s_flags & (IPATH_S_BUSY | IPATH_S_ANY_WAIT)) ||
+	    !(ib_ipath_state_ops[sqp->state] & IPATH_PROCESS_OR_FLUSH_SEND))
+		goto unlock;
+
+	sqp->s_flags |= IPATH_S_BUSY;
+
+again:
+	if (sqp->s_last == sqp->s_head)
+		goto clr_busy;
+	wqe = get_swqe_ptr(sqp, sqp->s_last);
+
+	/* Return if it is not OK to start a new work reqeust. */
+	if (!(ib_ipath_state_ops[sqp->state] & IPATH_PROCESS_NEXT_SEND_OK)) {
+		if (!(ib_ipath_state_ops[sqp->state] & IPATH_FLUSH_SEND))
+			goto clr_busy;
+		/* We are in the error state, flush the work request. */
+		send_status = IB_WC_WR_FLUSH_ERR;
+		goto flush_send;
+	}
+
+	/*
+	 * We can rely on the entry not changing without the s_lock
+	 * being held until we update s_last.
+	 * We increment s_cur to indicate s_last is in progress.
+	 */
+	if (sqp->s_last == sqp->s_cur) {
+		if (++sqp->s_cur >= sqp->s_size)
+			sqp->s_cur = 0;
+	}
+	spin_unlock_irqrestore(&sqp->s_lock, flags);
+
+	if (!qp || !(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) {
+		dev->n_pkt_drops++;
+		/*
+		 * For RC, the requester would timeout and retry so
+		 * shortcut the timeouts and just signal too many retries.
+		 */
+		if (sqp->ibqp.qp_type == IB_QPT_RC)
+			send_status = IB_WC_RETRY_EXC_ERR;
+		else
+			send_status = IB_WC_SUCCESS;
+		goto serr;
+	}
+
+	memset(&wc, 0, sizeof wc);
+	send_status = IB_WC_SUCCESS;
+
+	sqp->s_sge.sge = wqe->sg_list[0];
+	sqp->s_sge.sg_list = wqe->sg_list + 1;
+	sqp->s_sge.num_sge = wqe->wr.num_sge;
+	sqp->s_len = wqe->length;
+	switch (wqe->wr.opcode) {
+	case IB_WR_SEND_WITH_IMM:
+		wc.wc_flags = IB_WC_WITH_IMM;
+		wc.ex.imm_data = wqe->wr.ex.imm_data;
+		/* FALLTHROUGH */
+	case IB_WR_SEND:
+		if (!ipath_get_rwqe(qp, 0))
+			goto rnr_nak;
+		break;
+
+	case IB_WR_RDMA_WRITE_WITH_IMM:
+		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
+			goto inv_err;
+		wc.wc_flags = IB_WC_WITH_IMM;
+		wc.ex.imm_data = wqe->wr.ex.imm_data;
+		if (!ipath_get_rwqe(qp, 1))
+			goto rnr_nak;
+		/* FALLTHROUGH */
+	case IB_WR_RDMA_WRITE:
+		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
+			goto inv_err;
+		if (wqe->length == 0)
+			break;
+		if (unlikely(!ipath_rkey_ok(qp, &qp->r_sge, wqe->length,
+					    wqe->wr.wr.rdma.remote_addr,
+					    wqe->wr.wr.rdma.rkey,
+					    IB_ACCESS_REMOTE_WRITE)))
+			goto acc_err;
+		break;
+
+	case IB_WR_RDMA_READ:
+		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
+			goto inv_err;
+		if (unlikely(!ipath_rkey_ok(qp, &sqp->s_sge, wqe->length,
+					    wqe->wr.wr.rdma.remote_addr,
+					    wqe->wr.wr.rdma.rkey,
+					    IB_ACCESS_REMOTE_READ)))
+			goto acc_err;
+		qp->r_sge.sge = wqe->sg_list[0];
+		qp->r_sge.sg_list = wqe->sg_list + 1;
+		qp->r_sge.num_sge = wqe->wr.num_sge;
+		break;
+
+	case IB_WR_ATOMIC_CMP_AND_SWP:
+	case IB_WR_ATOMIC_FETCH_AND_ADD:
+		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC)))
+			goto inv_err;
+		if (unlikely(!ipath_rkey_ok(qp, &qp->r_sge, sizeof(u64),
+					    wqe->wr.wr.atomic.remote_addr,
+					    wqe->wr.wr.atomic.rkey,
+					    IB_ACCESS_REMOTE_ATOMIC)))
+			goto acc_err;
+		/* Perform atomic OP and save result. */
+		maddr = (atomic64_t *) qp->r_sge.sge.vaddr;
+		sdata = wqe->wr.wr.atomic.compare_add;
+		*(u64 *) sqp->s_sge.sge.vaddr =
+			(wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) ?
+			(u64) atomic64_add_return(sdata, maddr) - sdata :
+			(u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr,
+				      sdata, wqe->wr.wr.atomic.swap);
+		goto send_comp;
+
+	default:
+		send_status = IB_WC_LOC_QP_OP_ERR;
+		goto serr;
+	}
+
+	sge = &sqp->s_sge.sge;
+	while (sqp->s_len) {
+		u32 len = sqp->s_len;
+
+		if (len > sge->length)
+			len = sge->length;
+		if (len > sge->sge_length)
+			len = sge->sge_length;
+		BUG_ON(len == 0);
+		ipath_copy_sge(&qp->r_sge, sge->vaddr, len);
+		sge->vaddr += len;
+		sge->length -= len;
+		sge->sge_length -= len;
+		if (sge->sge_length == 0) {
+			if (--sqp->s_sge.num_sge)
+				*sge = *sqp->s_sge.sg_list++;
+		} else if (sge->length == 0 && sge->mr != NULL) {
+			if (++sge->n >= IPATH_SEGSZ) {
+				if (++sge->m >= sge->mr->mapsz)
+					break;
+				sge->n = 0;
+			}
+			sge->vaddr =
+				sge->mr->map[sge->m]->segs[sge->n].vaddr;
+			sge->length =
+				sge->mr->map[sge->m]->segs[sge->n].length;
+		}
+		sqp->s_len -= len;
+	}
+
+	if (!test_and_clear_bit(IPATH_R_WRID_VALID, &qp->r_aflags))
+		goto send_comp;
+
+	if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM)
+		wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
+	else
+		wc.opcode = IB_WC_RECV;
+	wc.wr_id = qp->r_wr_id;
+	wc.status = IB_WC_SUCCESS;
+	wc.byte_len = wqe->length;
+	wc.qp = &qp->ibqp;
+	wc.src_qp = qp->remote_qpn;
+	wc.slid = qp->remote_ah_attr.dlid;
+	wc.sl = qp->remote_ah_attr.sl;
+	wc.port_num = 1;
+	/* Signal completion event if the solicited bit is set. */
+	ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
+		       wqe->wr.send_flags & IB_SEND_SOLICITED);
+
+send_comp:
+	spin_lock_irqsave(&sqp->s_lock, flags);
+flush_send:
+	sqp->s_rnr_retry = sqp->s_rnr_retry_cnt;
+	ipath_send_complete(sqp, wqe, send_status);
+	goto again;
+
+rnr_nak:
+	/* Handle RNR NAK */
+	if (qp->ibqp.qp_type == IB_QPT_UC)
+		goto send_comp;
+	/*
+	 * Note: we don't need the s_lock held since the BUSY flag
+	 * makes this single threaded.
+	 */
+	if (sqp->s_rnr_retry == 0) {
+		send_status = IB_WC_RNR_RETRY_EXC_ERR;
+		goto serr;
+	}
+	if (sqp->s_rnr_retry_cnt < 7)
+		sqp->s_rnr_retry--;
+	spin_lock_irqsave(&sqp->s_lock, flags);
+	if (!(ib_ipath_state_ops[sqp->state] & IPATH_PROCESS_RECV_OK))
+		goto clr_busy;
+	sqp->s_flags |= IPATH_S_WAITING;
+	dev->n_rnr_naks++;
+	sqp->s_rnr_timeout = ib_ipath_rnr_table[qp->r_min_rnr_timer];
+	ipath_insert_rnr_queue(sqp);
+	goto clr_busy;
+
+inv_err:
+	send_status = IB_WC_REM_INV_REQ_ERR;
+	wc.status = IB_WC_LOC_QP_OP_ERR;
+	goto err;
+
+acc_err:
+	send_status = IB_WC_REM_ACCESS_ERR;
+	wc.status = IB_WC_LOC_PROT_ERR;
+err:
+	/* responder goes to error state */
+	ipath_rc_error(qp, wc.status);
+
+serr:
+	spin_lock_irqsave(&sqp->s_lock, flags);
+	ipath_send_complete(sqp, wqe, send_status);
+	if (sqp->ibqp.qp_type == IB_QPT_RC) {
+		int lastwqe = ipath_error_qp(sqp, IB_WC_WR_FLUSH_ERR);
+
+		sqp->s_flags &= ~IPATH_S_BUSY;
+		spin_unlock_irqrestore(&sqp->s_lock, flags);
+		if (lastwqe) {
+			struct ib_event ev;
+
+			ev.device = sqp->ibqp.device;
+			ev.element.qp = &sqp->ibqp;
+			ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
+			sqp->ibqp.event_handler(&ev, sqp->ibqp.qp_context);
+		}
+		goto done;
+	}
+clr_busy:
+	sqp->s_flags &= ~IPATH_S_BUSY;
+unlock:
+	spin_unlock_irqrestore(&sqp->s_lock, flags);
+done:
+	if (qp && atomic_dec_and_test(&qp->refcount))
+		wake_up(&qp->wait);
+}
+
+static void want_buffer(struct ipath_devdata *dd, struct ipath_qp *qp)
+{
+	if (!(dd->ipath_flags & IPATH_HAS_SEND_DMA) ||
+	    qp->ibqp.qp_type == IB_QPT_SMI) {
+		unsigned long flags;
+
+		spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+		dd->ipath_sendctrl |= INFINIPATH_S_PIOINTBUFAVAIL;
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
+				 dd->ipath_sendctrl);
+		ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+		spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+	}
+}
+
+/**
+ * ipath_no_bufs_available - tell the layer driver we need buffers
+ * @qp: the QP that caused the problem
+ * @dev: the device we ran out of buffers on
+ *
+ * Called when we run out of PIO buffers.
+ * If we are now in the error state, return zero to flush the
+ * send work request.
+ */
+static int ipath_no_bufs_available(struct ipath_qp *qp,
+				    struct ipath_ibdev *dev)
+{
+	unsigned long flags;
+	int ret = 1;
+
+	/*
+	 * Note that as soon as want_buffer() is called and
+	 * possibly before it returns, ipath_ib_piobufavail()
+	 * could be called. Therefore, put QP on the piowait list before
+	 * enabling the PIO avail interrupt.
+	 */
+	spin_lock_irqsave(&qp->s_lock, flags);
+	if (ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK) {
+		dev->n_piowait++;
+		qp->s_flags |= IPATH_S_WAITING;
+		qp->s_flags &= ~IPATH_S_BUSY;
+		spin_lock(&dev->pending_lock);
+		if (list_empty(&qp->piowait))
+			list_add_tail(&qp->piowait, &dev->piowait);
+		spin_unlock(&dev->pending_lock);
+	} else
+		ret = 0;
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+	if (ret)
+		want_buffer(dev->dd, qp);
+	return ret;
+}
+
+/**
+ * ipath_make_grh - construct a GRH header
+ * @dev: a pointer to the ipath device
+ * @hdr: a pointer to the GRH header being constructed
+ * @grh: the global route address to send to
+ * @hwords: the number of 32 bit words of header being sent
+ * @nwords: the number of 32 bit words of data being sent
+ *
+ * Return the size of the header in 32 bit words.
+ */
+u32 ipath_make_grh(struct ipath_ibdev *dev, struct ib_grh *hdr,
+		   struct ib_global_route *grh, u32 hwords, u32 nwords)
+{
+	hdr->version_tclass_flow =
+		cpu_to_be32((6 << 28) |
+			    (grh->traffic_class << 20) |
+			    grh->flow_label);
+	hdr->paylen = cpu_to_be16((hwords - 2 + nwords + SIZE_OF_CRC) << 2);
+	/* next_hdr is defined by C8-7 in ch. 8.4.1 */
+	hdr->next_hdr = 0x1B;
+	hdr->hop_limit = grh->hop_limit;
+	/* The SGID is 32-bit aligned. */
+	hdr->sgid.global.subnet_prefix = dev->gid_prefix;
+	hdr->sgid.global.interface_id = dev->dd->ipath_guid;
+	hdr->dgid = grh->dgid;
+
+	/* GRH header size in 32-bit words. */
+	return sizeof(struct ib_grh) / sizeof(u32);
+}
+
+void ipath_make_ruc_header(struct ipath_ibdev *dev, struct ipath_qp *qp,
+			   struct ipath_other_headers *ohdr,
+			   u32 bth0, u32 bth2)
+{
+	u16 lrh0;
+	u32 nwords;
+	u32 extra_bytes;
+
+	/* Construct the header. */
+	extra_bytes = -qp->s_cur_size & 3;
+	nwords = (qp->s_cur_size + extra_bytes) >> 2;
+	lrh0 = IPATH_LRH_BTH;
+	if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) {
+		qp->s_hdrwords += ipath_make_grh(dev, &qp->s_hdr.u.l.grh,
+						 &qp->remote_ah_attr.grh,
+						 qp->s_hdrwords, nwords);
+		lrh0 = IPATH_LRH_GRH;
+	}
+	lrh0 |= qp->remote_ah_attr.sl << 4;
+	qp->s_hdr.lrh[0] = cpu_to_be16(lrh0);
+	qp->s_hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid);
+	qp->s_hdr.lrh[2] = cpu_to_be16(qp->s_hdrwords + nwords + SIZE_OF_CRC);
+	qp->s_hdr.lrh[3] = cpu_to_be16(dev->dd->ipath_lid |
+				       qp->remote_ah_attr.src_path_bits);
+	bth0 |= ipath_get_pkey(dev->dd, qp->s_pkey_index);
+	bth0 |= extra_bytes << 20;
+	ohdr->bth[0] = cpu_to_be32(bth0 | (1 << 22));
+	ohdr->bth[1] = cpu_to_be32(qp->remote_qpn);
+	ohdr->bth[2] = cpu_to_be32(bth2);
+}
+
+/**
+ * ipath_do_send - perform a send on a QP
+ * @data: contains a pointer to the QP
+ *
+ * Process entries in the send work queue until credit or queue is
+ * exhausted.  Only allow one CPU to send a packet per QP (tasklet).
+ * Otherwise, two threads could send packets out of order.
+ */
+void ipath_do_send(unsigned long data)
+{
+	struct ipath_qp *qp = (struct ipath_qp *)data;
+	struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+	int (*make_req)(struct ipath_qp *qp);
+	unsigned long flags;
+
+	if ((qp->ibqp.qp_type == IB_QPT_RC ||
+	     qp->ibqp.qp_type == IB_QPT_UC) &&
+	    qp->remote_ah_attr.dlid == dev->dd->ipath_lid) {
+		ipath_ruc_loopback(qp);
+		goto bail;
+	}
+
+	if (qp->ibqp.qp_type == IB_QPT_RC)
+	       make_req = ipath_make_rc_req;
+	else if (qp->ibqp.qp_type == IB_QPT_UC)
+	       make_req = ipath_make_uc_req;
+	else
+	       make_req = ipath_make_ud_req;
+
+	spin_lock_irqsave(&qp->s_lock, flags);
+
+	/* Return if we are already busy processing a work request. */
+	if ((qp->s_flags & (IPATH_S_BUSY | IPATH_S_ANY_WAIT)) ||
+	    !(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_OR_FLUSH_SEND)) {
+		spin_unlock_irqrestore(&qp->s_lock, flags);
+		goto bail;
+	}
+
+	qp->s_flags |= IPATH_S_BUSY;
+
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+
+again:
+	/* Check for a constructed packet to be sent. */
+	if (qp->s_hdrwords != 0) {
+		/*
+		 * If no PIO bufs are available, return.  An interrupt will
+		 * call ipath_ib_piobufavail() when one is available.
+		 */
+		if (ipath_verbs_send(qp, &qp->s_hdr, qp->s_hdrwords,
+				     qp->s_cur_sge, qp->s_cur_size)) {
+			if (ipath_no_bufs_available(qp, dev))
+				goto bail;
+		}
+		dev->n_unicast_xmit++;
+		/* Record that we sent the packet and s_hdr is empty. */
+		qp->s_hdrwords = 0;
+	}
+
+	if (make_req(qp))
+		goto again;
+
+bail:;
+}
+
+/*
+ * This should be called with s_lock held.
+ */
+void ipath_send_complete(struct ipath_qp *qp, struct ipath_swqe *wqe,
+			 enum ib_wc_status status)
+{
+	u32 old_last, last;
+
+	if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_OR_FLUSH_SEND))
+		return;
+
+	/* See ch. 11.2.4.1 and 10.7.3.1 */
+	if (!(qp->s_flags & IPATH_S_SIGNAL_REQ_WR) ||
+	    (wqe->wr.send_flags & IB_SEND_SIGNALED) ||
+	    status != IB_WC_SUCCESS) {
+		struct ib_wc wc;
+
+		memset(&wc, 0, sizeof wc);
+		wc.wr_id = wqe->wr.wr_id;
+		wc.status = status;
+		wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode];
+		wc.qp = &qp->ibqp;
+		if (status == IB_WC_SUCCESS)
+			wc.byte_len = wqe->length;
+		ipath_cq_enter(to_icq(qp->ibqp.send_cq), &wc,
+			       status != IB_WC_SUCCESS);
+	}
+
+	old_last = last = qp->s_last;
+	if (++last >= qp->s_size)
+		last = 0;
+	qp->s_last = last;
+	if (qp->s_cur == old_last)
+		qp->s_cur = last;
+	if (qp->s_tail == old_last)
+		qp->s_tail = last;
+	if (qp->state == IB_QPS_SQD && last == qp->s_cur)
+		qp->s_draining = 0;
+}
diff --git a/drivers/infiniband/hw/ipath/ipath_sdma.c b/drivers/infiniband/hw/ipath/ipath_sdma.c
new file mode 100644
index 0000000..17a5177
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_sdma.c
@@ -0,0 +1,818 @@
+/*
+ * Copyright (c) 2007, 2008 QLogic Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/spinlock.h>
+#include <linux/gfp.h>
+
+#include "ipath_kernel.h"
+#include "ipath_verbs.h"
+#include "ipath_common.h"
+
+#define SDMA_DESCQ_SZ PAGE_SIZE /* 256 entries per 4KB page */
+
+static void vl15_watchdog_enq(struct ipath_devdata *dd)
+{
+	/* ipath_sdma_lock must already be held */
+	if (atomic_inc_return(&dd->ipath_sdma_vl15_count) == 1) {
+		unsigned long interval = (HZ + 19) / 20;
+		dd->ipath_sdma_vl15_timer.expires = jiffies + interval;
+		add_timer(&dd->ipath_sdma_vl15_timer);
+	}
+}
+
+static void vl15_watchdog_deq(struct ipath_devdata *dd)
+{
+	/* ipath_sdma_lock must already be held */
+	if (atomic_dec_return(&dd->ipath_sdma_vl15_count) != 0) {
+		unsigned long interval = (HZ + 19) / 20;
+		mod_timer(&dd->ipath_sdma_vl15_timer, jiffies + interval);
+	} else {
+		del_timer(&dd->ipath_sdma_vl15_timer);
+	}
+}
+
+static void vl15_watchdog_timeout(unsigned long opaque)
+{
+	struct ipath_devdata *dd = (struct ipath_devdata *)opaque;
+
+	if (atomic_read(&dd->ipath_sdma_vl15_count) != 0) {
+		ipath_dbg("vl15 watchdog timeout - clearing\n");
+		ipath_cancel_sends(dd, 1);
+		ipath_hol_down(dd);
+	} else {
+		ipath_dbg("vl15 watchdog timeout - "
+			  "condition already cleared\n");
+	}
+}
+
+static void unmap_desc(struct ipath_devdata *dd, unsigned head)
+{
+	__le64 *descqp = &dd->ipath_sdma_descq[head].qw[0];
+	u64 desc[2];
+	dma_addr_t addr;
+	size_t len;
+
+	desc[0] = le64_to_cpu(descqp[0]);
+	desc[1] = le64_to_cpu(descqp[1]);
+
+	addr = (desc[1] << 32) | (desc[0] >> 32);
+	len = (desc[0] >> 14) & (0x7ffULL << 2);
+	dma_unmap_single(&dd->pcidev->dev, addr, len, DMA_TO_DEVICE);
+}
+
+/*
+ * ipath_sdma_lock should be locked before calling this.
+ */
+int ipath_sdma_make_progress(struct ipath_devdata *dd)
+{
+	struct list_head *lp = NULL;
+	struct ipath_sdma_txreq *txp = NULL;
+	u16 dmahead;
+	u16 start_idx = 0;
+	int progress = 0;
+
+	if (!list_empty(&dd->ipath_sdma_activelist)) {
+		lp = dd->ipath_sdma_activelist.next;
+		txp = list_entry(lp, struct ipath_sdma_txreq, list);
+		start_idx = txp->start_idx;
+	}
+
+	/*
+	 * Read the SDMA head register in order to know that the
+	 * interrupt clear has been written to the chip.
+	 * Otherwise, we may not get an interrupt for the last
+	 * descriptor in the queue.
+	 */
+	dmahead = (u16)ipath_read_kreg32(dd, dd->ipath_kregs->kr_senddmahead);
+	/* sanity check return value for error handling (chip reset, etc.) */
+	if (dmahead >= dd->ipath_sdma_descq_cnt)
+		goto done;
+
+	while (dd->ipath_sdma_descq_head != dmahead) {
+		if (txp && txp->flags & IPATH_SDMA_TXREQ_F_FREEDESC &&
+		    dd->ipath_sdma_descq_head == start_idx) {
+			unmap_desc(dd, dd->ipath_sdma_descq_head);
+			start_idx++;
+			if (start_idx == dd->ipath_sdma_descq_cnt)
+				start_idx = 0;
+		}
+
+		/* increment free count and head */
+		dd->ipath_sdma_descq_removed++;
+		if (++dd->ipath_sdma_descq_head == dd->ipath_sdma_descq_cnt)
+			dd->ipath_sdma_descq_head = 0;
+
+		if (txp && txp->next_descq_idx == dd->ipath_sdma_descq_head) {
+			/* move to notify list */
+			if (txp->flags & IPATH_SDMA_TXREQ_F_VL15)
+				vl15_watchdog_deq(dd);
+			list_move_tail(lp, &dd->ipath_sdma_notifylist);
+			if (!list_empty(&dd->ipath_sdma_activelist)) {
+				lp = dd->ipath_sdma_activelist.next;
+				txp = list_entry(lp, struct ipath_sdma_txreq,
+						 list);
+				start_idx = txp->start_idx;
+			} else {
+				lp = NULL;
+				txp = NULL;
+			}
+		}
+		progress = 1;
+	}
+
+	if (progress)
+		tasklet_hi_schedule(&dd->ipath_sdma_notify_task);
+
+done:
+	return progress;
+}
+
+static void ipath_sdma_notify(struct ipath_devdata *dd, struct list_head *list)
+{
+	struct ipath_sdma_txreq *txp, *txp_next;
+
+	list_for_each_entry_safe(txp, txp_next, list, list) {
+		list_del_init(&txp->list);
+
+		if (txp->callback)
+			(*txp->callback)(txp->callback_cookie,
+					 txp->callback_status);
+	}
+}
+
+static void sdma_notify_taskbody(struct ipath_devdata *dd)
+{
+	unsigned long flags;
+	struct list_head list;
+
+	INIT_LIST_HEAD(&list);
+
+	spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
+
+	list_splice_init(&dd->ipath_sdma_notifylist, &list);
+
+	spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+
+	ipath_sdma_notify(dd, &list);
+
+	/*
+	 * The IB verbs layer needs to see the callback before getting
+	 * the call to ipath_ib_piobufavail() because the callback
+	 * handles releasing resources the next send will need.
+	 * Otherwise, we could do these calls in
+	 * ipath_sdma_make_progress().
+	 */
+	ipath_ib_piobufavail(dd->verbs_dev);
+}
+
+static void sdma_notify_task(unsigned long opaque)
+{
+	struct ipath_devdata *dd = (struct ipath_devdata *)opaque;
+
+	if (!test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status))
+		sdma_notify_taskbody(dd);
+}
+
+static void dump_sdma_state(struct ipath_devdata *dd)
+{
+	unsigned long reg;
+
+	reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmastatus);
+	ipath_cdbg(VERBOSE, "kr_senddmastatus: 0x%016lx\n", reg);
+
+	reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_sendctrl);
+	ipath_cdbg(VERBOSE, "kr_sendctrl: 0x%016lx\n", reg);
+
+	reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmabufmask0);
+	ipath_cdbg(VERBOSE, "kr_senddmabufmask0: 0x%016lx\n", reg);
+
+	reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmabufmask1);
+	ipath_cdbg(VERBOSE, "kr_senddmabufmask1: 0x%016lx\n", reg);
+
+	reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmabufmask2);
+	ipath_cdbg(VERBOSE, "kr_senddmabufmask2: 0x%016lx\n", reg);
+
+	reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmatail);
+	ipath_cdbg(VERBOSE, "kr_senddmatail: 0x%016lx\n", reg);
+
+	reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmahead);
+	ipath_cdbg(VERBOSE, "kr_senddmahead: 0x%016lx\n", reg);
+}
+
+static void sdma_abort_task(unsigned long opaque)
+{
+	struct ipath_devdata *dd = (struct ipath_devdata *) opaque;
+	u64 status;
+	unsigned long flags;
+
+	if (test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status))
+		return;
+
+	spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
+
+	status = dd->ipath_sdma_status & IPATH_SDMA_ABORT_MASK;
+
+	/* nothing to do */
+	if (status == IPATH_SDMA_ABORT_NONE)
+		goto unlock;
+
+	/* ipath_sdma_abort() is done, waiting for interrupt */
+	if (status == IPATH_SDMA_ABORT_DISARMED) {
+		if (time_before(jiffies, dd->ipath_sdma_abort_intr_timeout))
+			goto resched_noprint;
+		/* give up, intr got lost somewhere */
+		ipath_dbg("give up waiting for SDMADISABLED intr\n");
+		__set_bit(IPATH_SDMA_DISABLED, &dd->ipath_sdma_status);
+		status = IPATH_SDMA_ABORT_ABORTED;
+	}
+
+	/* everything is stopped, time to clean up and restart */
+	if (status == IPATH_SDMA_ABORT_ABORTED) {
+		struct ipath_sdma_txreq *txp, *txpnext;
+		u64 hwstatus;
+		int notify = 0;
+
+		hwstatus = ipath_read_kreg64(dd,
+				dd->ipath_kregs->kr_senddmastatus);
+
+		if ((hwstatus & (IPATH_SDMA_STATUS_SCORE_BOARD_DRAIN_IN_PROG |
+				 IPATH_SDMA_STATUS_ABORT_IN_PROG	     |
+				 IPATH_SDMA_STATUS_INTERNAL_SDMA_ENABLE)) ||
+		    !(hwstatus & IPATH_SDMA_STATUS_SCB_EMPTY)) {
+			if (dd->ipath_sdma_reset_wait > 0) {
+				/* not done shutting down sdma */
+				--dd->ipath_sdma_reset_wait;
+				goto resched;
+			}
+			ipath_cdbg(VERBOSE, "gave up waiting for quiescent "
+				"status after SDMA reset, continuing\n");
+			dump_sdma_state(dd);
+		}
+
+		/* dequeue all "sent" requests */
+		list_for_each_entry_safe(txp, txpnext,
+					 &dd->ipath_sdma_activelist, list) {
+			txp->callback_status = IPATH_SDMA_TXREQ_S_ABORTED;
+			if (txp->flags & IPATH_SDMA_TXREQ_F_VL15)
+				vl15_watchdog_deq(dd);
+			list_move_tail(&txp->list, &dd->ipath_sdma_notifylist);
+			notify = 1;
+		}
+		if (notify)
+			tasklet_hi_schedule(&dd->ipath_sdma_notify_task);
+
+		/* reset our notion of head and tail */
+		dd->ipath_sdma_descq_tail = 0;
+		dd->ipath_sdma_descq_head = 0;
+		dd->ipath_sdma_head_dma[0] = 0;
+		dd->ipath_sdma_generation = 0;
+		dd->ipath_sdma_descq_removed = dd->ipath_sdma_descq_added;
+
+		/* Reset SendDmaLenGen */
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmalengen,
+			(u64) dd->ipath_sdma_descq_cnt | (1ULL << 18));
+
+		/* done with sdma state for a bit */
+		spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+
+		/*
+		 * Don't restart sdma here (with the exception
+		 * below). Wait until link is up to ACTIVE.  VL15 MADs
+		 * used to bring the link up use PIO, and multiple link
+		 * transitions otherwise cause the sdma engine to be
+		 * stopped and started multiple times.
+		 * The disable is done here, including the shadow,
+		 * so the state is kept consistent.
+		 * See ipath_restart_sdma() for the actual starting
+		 * of sdma.
+		 */
+		spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+		dd->ipath_sendctrl &= ~INFINIPATH_S_SDMAENABLE;
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
+				 dd->ipath_sendctrl);
+		ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+		spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+
+		/* make sure I see next message */
+		dd->ipath_sdma_abort_jiffies = 0;
+
+		/*
+		 * Not everything that takes SDMA offline is a link
+		 * status change.  If the link was up, restart SDMA.
+		 */
+		if (dd->ipath_flags & IPATH_LINKACTIVE)
+			ipath_restart_sdma(dd);
+
+		goto done;
+	}
+
+resched:
+	/*
+	 * for now, keep spinning
+	 * JAG - this is bad to just have default be a loop without
+	 * state change
+	 */
+	if (time_after(jiffies, dd->ipath_sdma_abort_jiffies)) {
+		ipath_dbg("looping with status 0x%08lx\n",
+			  dd->ipath_sdma_status);
+		dd->ipath_sdma_abort_jiffies = jiffies + 5 * HZ;
+	}
+resched_noprint:
+	spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+	if (!test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status))
+		tasklet_hi_schedule(&dd->ipath_sdma_abort_task);
+	return;
+
+unlock:
+	spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+done:
+	return;
+}
+
+/*
+ * This is called from interrupt context.
+ */
+void ipath_sdma_intr(struct ipath_devdata *dd)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
+
+	(void) ipath_sdma_make_progress(dd);
+
+	spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+}
+
+static int alloc_sdma(struct ipath_devdata *dd)
+{
+	int ret = 0;
+
+	/* Allocate memory for SendDMA descriptor FIFO */
+	dd->ipath_sdma_descq = dma_alloc_coherent(&dd->pcidev->dev,
+		SDMA_DESCQ_SZ, &dd->ipath_sdma_descq_phys, GFP_KERNEL);
+
+	if (!dd->ipath_sdma_descq) {
+		ipath_dev_err(dd, "failed to allocate SendDMA descriptor "
+			"FIFO memory\n");
+		ret = -ENOMEM;
+		goto done;
+	}
+
+	dd->ipath_sdma_descq_cnt =
+		SDMA_DESCQ_SZ / sizeof(struct ipath_sdma_desc);
+
+	/* Allocate memory for DMA of head register to memory */
+	dd->ipath_sdma_head_dma = dma_alloc_coherent(&dd->pcidev->dev,
+		PAGE_SIZE, &dd->ipath_sdma_head_phys, GFP_KERNEL);
+	if (!dd->ipath_sdma_head_dma) {
+		ipath_dev_err(dd, "failed to allocate SendDMA head memory\n");
+		ret = -ENOMEM;
+		goto cleanup_descq;
+	}
+	dd->ipath_sdma_head_dma[0] = 0;
+
+	init_timer(&dd->ipath_sdma_vl15_timer);
+	dd->ipath_sdma_vl15_timer.function = vl15_watchdog_timeout;
+	dd->ipath_sdma_vl15_timer.data = (unsigned long)dd;
+	atomic_set(&dd->ipath_sdma_vl15_count, 0);
+
+	goto done;
+
+cleanup_descq:
+	dma_free_coherent(&dd->pcidev->dev, SDMA_DESCQ_SZ,
+		(void *)dd->ipath_sdma_descq, dd->ipath_sdma_descq_phys);
+	dd->ipath_sdma_descq = NULL;
+	dd->ipath_sdma_descq_phys = 0;
+done:
+	return ret;
+}
+
+int setup_sdma(struct ipath_devdata *dd)
+{
+	int ret = 0;
+	unsigned i, n;
+	u64 tmp64;
+	u64 senddmabufmask[3] = { 0 };
+	unsigned long flags;
+
+	ret = alloc_sdma(dd);
+	if (ret)
+		goto done;
+
+	if (!dd->ipath_sdma_descq) {
+		ipath_dev_err(dd, "SendDMA memory not allocated\n");
+		goto done;
+	}
+
+	/*
+	 * Set initial status as if we had been up, then gone down.
+	 * This lets initial start on transition to ACTIVE be the
+	 * same as restart after link flap.
+	 */
+	dd->ipath_sdma_status = IPATH_SDMA_ABORT_ABORTED;
+	dd->ipath_sdma_abort_jiffies = 0;
+	dd->ipath_sdma_generation = 0;
+	dd->ipath_sdma_descq_tail = 0;
+	dd->ipath_sdma_descq_head = 0;
+	dd->ipath_sdma_descq_removed = 0;
+	dd->ipath_sdma_descq_added = 0;
+
+	/* Set SendDmaBase */
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabase,
+			 dd->ipath_sdma_descq_phys);
+	/* Set SendDmaLenGen */
+	tmp64 = dd->ipath_sdma_descq_cnt;
+	tmp64 |= 1<<18; /* enable generation checking */
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmalengen, tmp64);
+	/* Set SendDmaTail */
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmatail,
+			 dd->ipath_sdma_descq_tail);
+	/* Set SendDmaHeadAddr */
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmaheadaddr,
+			 dd->ipath_sdma_head_phys);
+
+	/*
+	 * Reserve all the former "kernel" piobufs, using high number range
+	 * so we get as many 4K buffers as possible
+	 */
+	n = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k;
+	i = dd->ipath_lastport_piobuf + dd->ipath_pioreserved;
+	ipath_chg_pioavailkernel(dd, i, n - i , 0);
+	for (; i < n; ++i) {
+		unsigned word = i / 64;
+		unsigned bit = i & 63;
+		BUG_ON(word >= 3);
+		senddmabufmask[word] |= 1ULL << bit;
+	}
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask0,
+			 senddmabufmask[0]);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask1,
+			 senddmabufmask[1]);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask2,
+			 senddmabufmask[2]);
+
+	INIT_LIST_HEAD(&dd->ipath_sdma_activelist);
+	INIT_LIST_HEAD(&dd->ipath_sdma_notifylist);
+
+	tasklet_init(&dd->ipath_sdma_notify_task, sdma_notify_task,
+		     (unsigned long) dd);
+	tasklet_init(&dd->ipath_sdma_abort_task, sdma_abort_task,
+		     (unsigned long) dd);
+
+	/*
+	 * No use to turn on SDMA here, as link is probably not ACTIVE
+	 * Just mark it RUNNING and enable the interrupt, and let the
+	 * ipath_restart_sdma() on link transition to ACTIVE actually
+	 * enable it.
+	 */
+	spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+	dd->ipath_sendctrl |= INFINIPATH_S_SDMAINTENABLE;
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl);
+	ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+	__set_bit(IPATH_SDMA_RUNNING, &dd->ipath_sdma_status);
+	spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+
+done:
+	return ret;
+}
+
+void teardown_sdma(struct ipath_devdata *dd)
+{
+	struct ipath_sdma_txreq *txp, *txpnext;
+	unsigned long flags;
+	dma_addr_t sdma_head_phys = 0;
+	dma_addr_t sdma_descq_phys = 0;
+	void *sdma_descq = NULL;
+	void *sdma_head_dma = NULL;
+
+	spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
+	__clear_bit(IPATH_SDMA_RUNNING, &dd->ipath_sdma_status);
+	__set_bit(IPATH_SDMA_ABORTING, &dd->ipath_sdma_status);
+	__set_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status);
+	spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+
+	tasklet_kill(&dd->ipath_sdma_abort_task);
+	tasklet_kill(&dd->ipath_sdma_notify_task);
+
+	/* turn off sdma */
+	spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+	dd->ipath_sendctrl &= ~INFINIPATH_S_SDMAENABLE;
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
+		dd->ipath_sendctrl);
+	ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+	spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+
+	spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
+	/* dequeue all "sent" requests */
+	list_for_each_entry_safe(txp, txpnext, &dd->ipath_sdma_activelist,
+				 list) {
+		txp->callback_status = IPATH_SDMA_TXREQ_S_SHUTDOWN;
+		if (txp->flags & IPATH_SDMA_TXREQ_F_VL15)
+			vl15_watchdog_deq(dd);
+		list_move_tail(&txp->list, &dd->ipath_sdma_notifylist);
+	}
+	spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+
+	sdma_notify_taskbody(dd);
+
+	del_timer_sync(&dd->ipath_sdma_vl15_timer);
+
+	spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
+
+	dd->ipath_sdma_abort_jiffies = 0;
+
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabase, 0);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmalengen, 0);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmatail, 0);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmaheadaddr, 0);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask0, 0);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask1, 0);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask2, 0);
+
+	if (dd->ipath_sdma_head_dma) {
+		sdma_head_dma = (void *) dd->ipath_sdma_head_dma;
+		sdma_head_phys = dd->ipath_sdma_head_phys;
+		dd->ipath_sdma_head_dma = NULL;
+		dd->ipath_sdma_head_phys = 0;
+	}
+
+	if (dd->ipath_sdma_descq) {
+		sdma_descq = dd->ipath_sdma_descq;
+		sdma_descq_phys = dd->ipath_sdma_descq_phys;
+		dd->ipath_sdma_descq = NULL;
+		dd->ipath_sdma_descq_phys = 0;
+	}
+
+	spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+
+	if (sdma_head_dma)
+		dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE,
+				  sdma_head_dma, sdma_head_phys);
+
+	if (sdma_descq)
+		dma_free_coherent(&dd->pcidev->dev, SDMA_DESCQ_SZ,
+				  sdma_descq, sdma_descq_phys);
+}
+
+/*
+ * [Re]start SDMA, if we use it, and it's not already OK.
+ * This is called on transition to link ACTIVE, either the first or
+ * subsequent times.
+ */
+void ipath_restart_sdma(struct ipath_devdata *dd)
+{
+	unsigned long flags;
+	int needed = 1;
+
+	if (!(dd->ipath_flags & IPATH_HAS_SEND_DMA))
+		goto bail;
+
+	/*
+	 * First, make sure we should, which is to say,
+	 * check that we are "RUNNING" (not in teardown)
+	 * and not "SHUTDOWN"
+	 */
+	spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
+	if (!test_bit(IPATH_SDMA_RUNNING, &dd->ipath_sdma_status)
+		|| test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status))
+			needed = 0;
+	else {
+		__clear_bit(IPATH_SDMA_DISABLED, &dd->ipath_sdma_status);
+		__clear_bit(IPATH_SDMA_DISARMED, &dd->ipath_sdma_status);
+		__clear_bit(IPATH_SDMA_ABORTING, &dd->ipath_sdma_status);
+	}
+	spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+	if (!needed) {
+		ipath_dbg("invalid attempt to restart SDMA, status 0x%08lx\n",
+			dd->ipath_sdma_status);
+		goto bail;
+	}
+	spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+	/*
+	 * First clear, just to be safe. Enable is only done
+	 * in chip on 0->1 transition
+	 */
+	dd->ipath_sendctrl &= ~INFINIPATH_S_SDMAENABLE;
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl);
+	ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+	dd->ipath_sendctrl |= INFINIPATH_S_SDMAENABLE;
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl);
+	ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+	spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+
+	/* notify upper layers */
+	ipath_ib_piobufavail(dd->verbs_dev);
+
+bail:
+	return;
+}
+
+static inline void make_sdma_desc(struct ipath_devdata *dd,
+	u64 *sdmadesc, u64 addr, u64 dwlen, u64 dwoffset)
+{
+	WARN_ON(addr & 3);
+	/* SDmaPhyAddr[47:32] */
+	sdmadesc[1] = addr >> 32;
+	/* SDmaPhyAddr[31:0] */
+	sdmadesc[0] = (addr & 0xfffffffcULL) << 32;
+	/* SDmaGeneration[1:0] */
+	sdmadesc[0] |= (dd->ipath_sdma_generation & 3ULL) << 30;
+	/* SDmaDwordCount[10:0] */
+	sdmadesc[0] |= (dwlen & 0x7ffULL) << 16;
+	/* SDmaBufOffset[12:2] */
+	sdmadesc[0] |= dwoffset & 0x7ffULL;
+}
+
+/*
+ * This function queues one IB packet onto the send DMA queue per call.
+ * The caller is responsible for checking:
+ * 1) The number of send DMA descriptor entries is less than the size of
+ *    the descriptor queue.
+ * 2) The IB SGE addresses and lengths are 32-bit aligned
+ *    (except possibly the last SGE's length)
+ * 3) The SGE addresses are suitable for passing to dma_map_single().
+ */
+int ipath_sdma_verbs_send(struct ipath_devdata *dd,
+	struct ipath_sge_state *ss, u32 dwords,
+	struct ipath_verbs_txreq *tx)
+{
+
+	unsigned long flags;
+	struct ipath_sge *sge;
+	int ret = 0;
+	u16 tail;
+	__le64 *descqp;
+	u64 sdmadesc[2];
+	u32 dwoffset;
+	dma_addr_t addr;
+
+	if ((tx->map_len + (dwords<<2)) > dd->ipath_ibmaxlen) {
+		ipath_dbg("packet size %X > ibmax %X, fail\n",
+			tx->map_len + (dwords<<2), dd->ipath_ibmaxlen);
+		ret = -EMSGSIZE;
+		goto fail;
+	}
+
+	spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
+
+retry:
+	if (unlikely(test_bit(IPATH_SDMA_ABORTING, &dd->ipath_sdma_status))) {
+		ret = -EBUSY;
+		goto unlock;
+	}
+
+	if (tx->txreq.sg_count > ipath_sdma_descq_freecnt(dd)) {
+		if (ipath_sdma_make_progress(dd))
+			goto retry;
+		ret = -ENOBUFS;
+		goto unlock;
+	}
+
+	addr = dma_map_single(&dd->pcidev->dev, tx->txreq.map_addr,
+			      tx->map_len, DMA_TO_DEVICE);
+	if (dma_mapping_error(&dd->pcidev->dev, addr))
+		goto ioerr;
+
+	dwoffset = tx->map_len >> 2;
+	make_sdma_desc(dd, sdmadesc, (u64) addr, dwoffset, 0);
+
+	/* SDmaFirstDesc */
+	sdmadesc[0] |= 1ULL << 12;
+	if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_USELARGEBUF)
+		sdmadesc[0] |= 1ULL << 14;	/* SDmaUseLargeBuf */
+
+	/* write to the descq */
+	tail = dd->ipath_sdma_descq_tail;
+	descqp = &dd->ipath_sdma_descq[tail].qw[0];
+	*descqp++ = cpu_to_le64(sdmadesc[0]);
+	*descqp++ = cpu_to_le64(sdmadesc[1]);
+
+	if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_FREEDESC)
+		tx->txreq.start_idx = tail;
+
+	/* increment the tail */
+	if (++tail == dd->ipath_sdma_descq_cnt) {
+		tail = 0;
+		descqp = &dd->ipath_sdma_descq[0].qw[0];
+		++dd->ipath_sdma_generation;
+	}
+
+	sge = &ss->sge;
+	while (dwords) {
+		u32 dw;
+		u32 len;
+
+		len = dwords << 2;
+		if (len > sge->length)
+			len = sge->length;
+		if (len > sge->sge_length)
+			len = sge->sge_length;
+		BUG_ON(len == 0);
+		dw = (len + 3) >> 2;
+		addr = dma_map_single(&dd->pcidev->dev, sge->vaddr, dw << 2,
+				      DMA_TO_DEVICE);
+		if (dma_mapping_error(&dd->pcidev->dev, addr))
+			goto unmap;
+		make_sdma_desc(dd, sdmadesc, (u64) addr, dw, dwoffset);
+		/* SDmaUseLargeBuf has to be set in every descriptor */
+		if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_USELARGEBUF)
+			sdmadesc[0] |= 1ULL << 14;
+		/* write to the descq */
+		*descqp++ = cpu_to_le64(sdmadesc[0]);
+		*descqp++ = cpu_to_le64(sdmadesc[1]);
+
+		/* increment the tail */
+		if (++tail == dd->ipath_sdma_descq_cnt) {
+			tail = 0;
+			descqp = &dd->ipath_sdma_descq[0].qw[0];
+			++dd->ipath_sdma_generation;
+		}
+		sge->vaddr += len;
+		sge->length -= len;
+		sge->sge_length -= len;
+		if (sge->sge_length == 0) {
+			if (--ss->num_sge)
+				*sge = *ss->sg_list++;
+		} else if (sge->length == 0 && sge->mr != NULL) {
+			if (++sge->n >= IPATH_SEGSZ) {
+				if (++sge->m >= sge->mr->mapsz)
+					break;
+				sge->n = 0;
+			}
+			sge->vaddr =
+				sge->mr->map[sge->m]->segs[sge->n].vaddr;
+			sge->length =
+				sge->mr->map[sge->m]->segs[sge->n].length;
+		}
+
+		dwoffset += dw;
+		dwords -= dw;
+	}
+
+	if (!tail)
+		descqp = &dd->ipath_sdma_descq[dd->ipath_sdma_descq_cnt].qw[0];
+	descqp -= 2;
+	/* SDmaLastDesc */
+	descqp[0] |= cpu_to_le64(1ULL << 11);
+	if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_INTREQ) {
+		/* SDmaIntReq */
+		descqp[0] |= cpu_to_le64(1ULL << 15);
+	}
+
+	/* Commit writes to memory and advance the tail on the chip */
+	wmb();
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmatail, tail);
+
+	tx->txreq.next_descq_idx = tail;
+	tx->txreq.callback_status = IPATH_SDMA_TXREQ_S_OK;
+	dd->ipath_sdma_descq_tail = tail;
+	dd->ipath_sdma_descq_added += tx->txreq.sg_count;
+	list_add_tail(&tx->txreq.list, &dd->ipath_sdma_activelist);
+	if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_VL15)
+		vl15_watchdog_enq(dd);
+	goto unlock;
+
+unmap:
+	while (tail != dd->ipath_sdma_descq_tail) {
+		if (!tail)
+			tail = dd->ipath_sdma_descq_cnt - 1;
+		else
+			tail--;
+		unmap_desc(dd, tail);
+	}
+ioerr:
+	ret = -EIO;
+unlock:
+	spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+fail:
+	return ret;
+}
diff --git a/drivers/infiniband/hw/ipath/ipath_srq.c b/drivers/infiniband/hw/ipath/ipath_srq.c
new file mode 100644
index 0000000..2627198
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_srq.c
@@ -0,0 +1,380 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+#include "ipath_verbs.h"
+
+/**
+ * ipath_post_srq_receive - post a receive on a shared receive queue
+ * @ibsrq: the SRQ to post the receive on
+ * @wr: the list of work requests to post
+ * @bad_wr: the first WR to cause a problem is put here
+ *
+ * This may be called from interrupt context.
+ */
+int ipath_post_srq_receive(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
+			   struct ib_recv_wr **bad_wr)
+{
+	struct ipath_srq *srq = to_isrq(ibsrq);
+	struct ipath_rwq *wq;
+	unsigned long flags;
+	int ret;
+
+	for (; wr; wr = wr->next) {
+		struct ipath_rwqe *wqe;
+		u32 next;
+		int i;
+
+		if ((unsigned) wr->num_sge > srq->rq.max_sge) {
+			*bad_wr = wr;
+			ret = -EINVAL;
+			goto bail;
+		}
+
+		spin_lock_irqsave(&srq->rq.lock, flags);
+		wq = srq->rq.wq;
+		next = wq->head + 1;
+		if (next >= srq->rq.size)
+			next = 0;
+		if (next == wq->tail) {
+			spin_unlock_irqrestore(&srq->rq.lock, flags);
+			*bad_wr = wr;
+			ret = -ENOMEM;
+			goto bail;
+		}
+
+		wqe = get_rwqe_ptr(&srq->rq, wq->head);
+		wqe->wr_id = wr->wr_id;
+		wqe->num_sge = wr->num_sge;
+		for (i = 0; i < wr->num_sge; i++)
+			wqe->sg_list[i] = wr->sg_list[i];
+		/* Make sure queue entry is written before the head index. */
+		smp_wmb();
+		wq->head = next;
+		spin_unlock_irqrestore(&srq->rq.lock, flags);
+	}
+	ret = 0;
+
+bail:
+	return ret;
+}
+
+/**
+ * ipath_create_srq - create a shared receive queue
+ * @ibpd: the protection domain of the SRQ to create
+ * @srq_init_attr: the attributes of the SRQ
+ * @udata: data from libipathverbs when creating a user SRQ
+ */
+struct ib_srq *ipath_create_srq(struct ib_pd *ibpd,
+				struct ib_srq_init_attr *srq_init_attr,
+				struct ib_udata *udata)
+{
+	struct ipath_ibdev *dev = to_idev(ibpd->device);
+	struct ipath_srq *srq;
+	u32 sz;
+	struct ib_srq *ret;
+
+	if (srq_init_attr->srq_type != IB_SRQT_BASIC) {
+		ret = ERR_PTR(-ENOSYS);
+		goto done;
+	}
+
+	if (srq_init_attr->attr.max_wr == 0) {
+		ret = ERR_PTR(-EINVAL);
+		goto done;
+	}
+
+	if ((srq_init_attr->attr.max_sge > ib_ipath_max_srq_sges) ||
+	    (srq_init_attr->attr.max_wr > ib_ipath_max_srq_wrs)) {
+		ret = ERR_PTR(-EINVAL);
+		goto done;
+	}
+
+	srq = kmalloc(sizeof(*srq), GFP_KERNEL);
+	if (!srq) {
+		ret = ERR_PTR(-ENOMEM);
+		goto done;
+	}
+
+	/*
+	 * Need to use vmalloc() if we want to support large #s of entries.
+	 */
+	srq->rq.size = srq_init_attr->attr.max_wr + 1;
+	srq->rq.max_sge = srq_init_attr->attr.max_sge;
+	sz = sizeof(struct ib_sge) * srq->rq.max_sge +
+		sizeof(struct ipath_rwqe);
+	srq->rq.wq = vmalloc_user(sizeof(struct ipath_rwq) + srq->rq.size * sz);
+	if (!srq->rq.wq) {
+		ret = ERR_PTR(-ENOMEM);
+		goto bail_srq;
+	}
+
+	/*
+	 * Return the address of the RWQ as the offset to mmap.
+	 * See ipath_mmap() for details.
+	 */
+	if (udata && udata->outlen >= sizeof(__u64)) {
+		int err;
+		u32 s = sizeof(struct ipath_rwq) + srq->rq.size * sz;
+
+		srq->ip =
+		    ipath_create_mmap_info(dev, s,
+					   ibpd->uobject->context,
+					   srq->rq.wq);
+		if (!srq->ip) {
+			ret = ERR_PTR(-ENOMEM);
+			goto bail_wq;
+		}
+
+		err = ib_copy_to_udata(udata, &srq->ip->offset,
+				       sizeof(srq->ip->offset));
+		if (err) {
+			ret = ERR_PTR(err);
+			goto bail_ip;
+		}
+	} else
+		srq->ip = NULL;
+
+	/*
+	 * ib_create_srq() will initialize srq->ibsrq.
+	 */
+	spin_lock_init(&srq->rq.lock);
+	srq->rq.wq->head = 0;
+	srq->rq.wq->tail = 0;
+	srq->limit = srq_init_attr->attr.srq_limit;
+
+	spin_lock(&dev->n_srqs_lock);
+	if (dev->n_srqs_allocated == ib_ipath_max_srqs) {
+		spin_unlock(&dev->n_srqs_lock);
+		ret = ERR_PTR(-ENOMEM);
+		goto bail_ip;
+	}
+
+ 	dev->n_srqs_allocated++;
+	spin_unlock(&dev->n_srqs_lock);
+
+	if (srq->ip) {
+		spin_lock_irq(&dev->pending_lock);
+		list_add(&srq->ip->pending_mmaps, &dev->pending_mmaps);
+		spin_unlock_irq(&dev->pending_lock);
+	}
+
+	ret = &srq->ibsrq;
+	goto done;
+
+bail_ip:
+	kfree(srq->ip);
+bail_wq:
+	vfree(srq->rq.wq);
+bail_srq:
+	kfree(srq);
+done:
+	return ret;
+}
+
+/**
+ * ipath_modify_srq - modify a shared receive queue
+ * @ibsrq: the SRQ to modify
+ * @attr: the new attributes of the SRQ
+ * @attr_mask: indicates which attributes to modify
+ * @udata: user data for ipathverbs.so
+ */
+int ipath_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
+		     enum ib_srq_attr_mask attr_mask,
+		     struct ib_udata *udata)
+{
+	struct ipath_srq *srq = to_isrq(ibsrq);
+	struct ipath_rwq *wq;
+	int ret = 0;
+
+	if (attr_mask & IB_SRQ_MAX_WR) {
+		struct ipath_rwq *owq;
+		struct ipath_rwqe *p;
+		u32 sz, size, n, head, tail;
+
+		/* Check that the requested sizes are below the limits. */
+		if ((attr->max_wr > ib_ipath_max_srq_wrs) ||
+		    ((attr_mask & IB_SRQ_LIMIT) ?
+		     attr->srq_limit : srq->limit) > attr->max_wr) {
+			ret = -EINVAL;
+			goto bail;
+		}
+
+		sz = sizeof(struct ipath_rwqe) +
+			srq->rq.max_sge * sizeof(struct ib_sge);
+		size = attr->max_wr + 1;
+		wq = vmalloc_user(sizeof(struct ipath_rwq) + size * sz);
+		if (!wq) {
+			ret = -ENOMEM;
+			goto bail;
+		}
+
+		/* Check that we can write the offset to mmap. */
+		if (udata && udata->inlen >= sizeof(__u64)) {
+			__u64 offset_addr;
+			__u64 offset = 0;
+
+			ret = ib_copy_from_udata(&offset_addr, udata,
+						 sizeof(offset_addr));
+			if (ret)
+				goto bail_free;
+			udata->outbuf =
+				(void __user *) (unsigned long) offset_addr;
+			ret = ib_copy_to_udata(udata, &offset,
+					       sizeof(offset));
+			if (ret)
+				goto bail_free;
+		}
+
+		spin_lock_irq(&srq->rq.lock);
+		/*
+		 * validate head pointer value and compute
+		 * the number of remaining WQEs.
+		 */
+		owq = srq->rq.wq;
+		head = owq->head;
+		if (head >= srq->rq.size)
+			head = 0;
+		tail = owq->tail;
+		if (tail >= srq->rq.size)
+			tail = 0;
+		n = head;
+		if (n < tail)
+			n += srq->rq.size - tail;
+		else
+			n -= tail;
+		if (size <= n) {
+			ret = -EINVAL;
+			goto bail_unlock;
+		}
+		n = 0;
+		p = wq->wq;
+		while (tail != head) {
+			struct ipath_rwqe *wqe;
+			int i;
+
+			wqe = get_rwqe_ptr(&srq->rq, tail);
+			p->wr_id = wqe->wr_id;
+			p->num_sge = wqe->num_sge;
+			for (i = 0; i < wqe->num_sge; i++)
+				p->sg_list[i] = wqe->sg_list[i];
+			n++;
+			p = (struct ipath_rwqe *)((char *) p + sz);
+			if (++tail >= srq->rq.size)
+				tail = 0;
+		}
+		srq->rq.wq = wq;
+		srq->rq.size = size;
+		wq->head = n;
+		wq->tail = 0;
+		if (attr_mask & IB_SRQ_LIMIT)
+			srq->limit = attr->srq_limit;
+		spin_unlock_irq(&srq->rq.lock);
+
+		vfree(owq);
+
+		if (srq->ip) {
+			struct ipath_mmap_info *ip = srq->ip;
+			struct ipath_ibdev *dev = to_idev(srq->ibsrq.device);
+			u32 s = sizeof(struct ipath_rwq) + size * sz;
+
+			ipath_update_mmap_info(dev, ip, s, wq);
+
+			/*
+			 * Return the offset to mmap.
+			 * See ipath_mmap() for details.
+			 */
+			if (udata && udata->inlen >= sizeof(__u64)) {
+				ret = ib_copy_to_udata(udata, &ip->offset,
+						       sizeof(ip->offset));
+				if (ret)
+					goto bail;
+			}
+
+			spin_lock_irq(&dev->pending_lock);
+			if (list_empty(&ip->pending_mmaps))
+				list_add(&ip->pending_mmaps,
+					 &dev->pending_mmaps);
+			spin_unlock_irq(&dev->pending_lock);
+		}
+	} else if (attr_mask & IB_SRQ_LIMIT) {
+		spin_lock_irq(&srq->rq.lock);
+		if (attr->srq_limit >= srq->rq.size)
+			ret = -EINVAL;
+		else
+			srq->limit = attr->srq_limit;
+		spin_unlock_irq(&srq->rq.lock);
+	}
+	goto bail;
+
+bail_unlock:
+	spin_unlock_irq(&srq->rq.lock);
+bail_free:
+	vfree(wq);
+bail:
+	return ret;
+}
+
+int ipath_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr)
+{
+	struct ipath_srq *srq = to_isrq(ibsrq);
+
+	attr->max_wr = srq->rq.size - 1;
+	attr->max_sge = srq->rq.max_sge;
+	attr->srq_limit = srq->limit;
+	return 0;
+}
+
+/**
+ * ipath_destroy_srq - destroy a shared receive queue
+ * @ibsrq: the SRQ to destroy
+ */
+int ipath_destroy_srq(struct ib_srq *ibsrq)
+{
+	struct ipath_srq *srq = to_isrq(ibsrq);
+	struct ipath_ibdev *dev = to_idev(ibsrq->device);
+
+	spin_lock(&dev->n_srqs_lock);
+	dev->n_srqs_allocated--;
+	spin_unlock(&dev->n_srqs_lock);
+	if (srq->ip)
+		kref_put(&srq->ip->ref, ipath_release_mmap_info);
+	else
+		vfree(srq->rq.wq);
+	kfree(srq);
+
+	return 0;
+}
diff --git a/drivers/infiniband/hw/ipath/ipath_stats.c b/drivers/infiniband/hw/ipath/ipath_stats.c
new file mode 100644
index 0000000..f63e143
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_stats.c
@@ -0,0 +1,347 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "ipath_kernel.h"
+
+struct infinipath_stats ipath_stats;
+
+/**
+ * ipath_snap_cntr - snapshot a chip counter
+ * @dd: the infinipath device
+ * @creg: the counter to snapshot
+ *
+ * called from add_timer and user counter read calls, to deal with
+ * counters that wrap in "human time".  The words sent and received, and
+ * the packets sent and received are all that we worry about.  For now,
+ * at least, we don't worry about error counters, because if they wrap
+ * that quickly, we probably don't care.  We may eventually just make this
+ * handle all the counters.  word counters can wrap in about 20 seconds
+ * of full bandwidth traffic, packet counters in a few hours.
+ */
+
+u64 ipath_snap_cntr(struct ipath_devdata *dd, ipath_creg creg)
+{
+	u32 val, reg64 = 0;
+	u64 val64;
+	unsigned long t0, t1;
+	u64 ret;
+
+	t0 = jiffies;
+	/* If fast increment counters are only 32 bits, snapshot them,
+	 * and maintain them as 64bit values in the driver */
+	if (!(dd->ipath_flags & IPATH_32BITCOUNTERS) &&
+	    (creg == dd->ipath_cregs->cr_wordsendcnt ||
+	     creg == dd->ipath_cregs->cr_wordrcvcnt ||
+	     creg == dd->ipath_cregs->cr_pktsendcnt ||
+	     creg == dd->ipath_cregs->cr_pktrcvcnt)) {
+		val64 = ipath_read_creg(dd, creg);
+		val = val64 == ~0ULL ? ~0U : 0;
+		reg64 = 1;
+	} else			/* val64 just to keep gcc quiet... */
+		val64 = val = ipath_read_creg32(dd, creg);
+	/*
+	 * See if a second has passed.  This is just a way to detect things
+	 * that are quite broken.  Normally this should take just a few
+	 * cycles (the check is for long enough that we don't care if we get
+	 * pre-empted.)  An Opteron HT O read timeout is 4 seconds with
+	 * normal NB values
+	 */
+	t1 = jiffies;
+	if (time_before(t0 + HZ, t1) && val == -1) {
+		ipath_dev_err(dd, "Error!  Read counter 0x%x timed out\n",
+			      creg);
+		ret = 0ULL;
+		goto bail;
+	}
+	if (reg64) {
+		ret = val64;
+		goto bail;
+	}
+
+	if (creg == dd->ipath_cregs->cr_wordsendcnt) {
+		if (val != dd->ipath_lastsword) {
+			dd->ipath_sword += val - dd->ipath_lastsword;
+			dd->ipath_lastsword = val;
+		}
+		val64 = dd->ipath_sword;
+	} else if (creg == dd->ipath_cregs->cr_wordrcvcnt) {
+		if (val != dd->ipath_lastrword) {
+			dd->ipath_rword += val - dd->ipath_lastrword;
+			dd->ipath_lastrword = val;
+		}
+		val64 = dd->ipath_rword;
+	} else if (creg == dd->ipath_cregs->cr_pktsendcnt) {
+		if (val != dd->ipath_lastspkts) {
+			dd->ipath_spkts += val - dd->ipath_lastspkts;
+			dd->ipath_lastspkts = val;
+		}
+		val64 = dd->ipath_spkts;
+	} else if (creg == dd->ipath_cregs->cr_pktrcvcnt) {
+		if (val != dd->ipath_lastrpkts) {
+			dd->ipath_rpkts += val - dd->ipath_lastrpkts;
+			dd->ipath_lastrpkts = val;
+		}
+		val64 = dd->ipath_rpkts;
+	} else if (creg == dd->ipath_cregs->cr_ibsymbolerrcnt) {
+		if (dd->ibdeltainprog)
+			val64 -= val64 - dd->ibsymsnap;
+		val64 -= dd->ibsymdelta;
+	} else if (creg == dd->ipath_cregs->cr_iblinkerrrecovcnt) {
+		if (dd->ibdeltainprog)
+			val64 -= val64 - dd->iblnkerrsnap;
+		val64 -= dd->iblnkerrdelta;
+	} else
+		val64 = (u64) val;
+
+	ret = val64;
+
+bail:
+	return ret;
+}
+
+/**
+ * ipath_qcheck - print delta of egrfull/hdrqfull errors for kernel ports
+ * @dd: the infinipath device
+ *
+ * print the delta of egrfull/hdrqfull errors for kernel ports no more than
+ * every 5 seconds.  User processes are printed at close, but kernel doesn't
+ * close, so...  Separate routine so may call from other places someday, and
+ * so function name when printed by _IPATH_INFO is meaningfull
+ */
+static void ipath_qcheck(struct ipath_devdata *dd)
+{
+	static u64 last_tot_hdrqfull;
+	struct ipath_portdata *pd = dd->ipath_pd[0];
+	size_t blen = 0;
+	char buf[128];
+	u32 hdrqtail;
+
+	*buf = 0;
+	if (pd->port_hdrqfull != dd->ipath_p0_hdrqfull) {
+		blen = snprintf(buf, sizeof buf, "port 0 hdrqfull %u",
+				pd->port_hdrqfull -
+				dd->ipath_p0_hdrqfull);
+		dd->ipath_p0_hdrqfull = pd->port_hdrqfull;
+	}
+	if (ipath_stats.sps_etidfull != dd->ipath_last_tidfull) {
+		blen += snprintf(buf + blen, sizeof buf - blen,
+				 "%srcvegrfull %llu",
+				 blen ? ", " : "",
+				 (unsigned long long)
+				 (ipath_stats.sps_etidfull -
+				  dd->ipath_last_tidfull));
+		dd->ipath_last_tidfull = ipath_stats.sps_etidfull;
+	}
+
+	/*
+	 * this is actually the number of hdrq full interrupts, not actual
+	 * events, but at the moment that's mostly what I'm interested in.
+	 * Actual count, etc. is in the counters, if needed.  For production
+	 * users this won't ordinarily be printed.
+	 */
+
+	if ((ipath_debug & (__IPATH_PKTDBG | __IPATH_DBG)) &&
+	    ipath_stats.sps_hdrqfull != last_tot_hdrqfull) {
+		blen += snprintf(buf + blen, sizeof buf - blen,
+				 "%shdrqfull %llu (all ports)",
+				 blen ? ", " : "",
+				 (unsigned long long)
+				 (ipath_stats.sps_hdrqfull -
+				  last_tot_hdrqfull));
+		last_tot_hdrqfull = ipath_stats.sps_hdrqfull;
+	}
+	if (blen)
+		ipath_dbg("%s\n", buf);
+
+	hdrqtail = ipath_get_hdrqtail(pd);
+	if (pd->port_head != hdrqtail) {
+		if (dd->ipath_lastport0rcv_cnt ==
+		    ipath_stats.sps_port0pkts) {
+			ipath_cdbg(PKT, "missing rcv interrupts? "
+				   "port0 hd=%x tl=%x; port0pkts %llx; write"
+				   " hd (w/intr)\n",
+				   pd->port_head, hdrqtail,
+				   (unsigned long long)
+				   ipath_stats.sps_port0pkts);
+			ipath_write_ureg(dd, ur_rcvhdrhead, hdrqtail |
+				dd->ipath_rhdrhead_intr_off, pd->port_port);
+		}
+		dd->ipath_lastport0rcv_cnt = ipath_stats.sps_port0pkts;
+	}
+}
+
+static void ipath_chk_errormask(struct ipath_devdata *dd)
+{
+	static u32 fixed;
+	u32 ctrl;
+	unsigned long errormask;
+	unsigned long hwerrs;
+
+	if (!dd->ipath_errormask || !(dd->ipath_flags & IPATH_INITTED))
+		return;
+
+	errormask = ipath_read_kreg64(dd, dd->ipath_kregs->kr_errormask);
+
+	if (errormask == dd->ipath_errormask)
+		return;
+	fixed++;
+
+	hwerrs = ipath_read_kreg64(dd, dd->ipath_kregs->kr_hwerrstatus);
+	ctrl = ipath_read_kreg32(dd, dd->ipath_kregs->kr_control);
+
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
+		dd->ipath_errormask);
+
+	if ((hwerrs & dd->ipath_hwerrmask) ||
+		(ctrl & INFINIPATH_C_FREEZEMODE)) {
+		/* force re-interrupt of pending events, just in case */
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear, 0ULL);
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, 0ULL);
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, 0ULL);
+		dev_info(&dd->pcidev->dev,
+			"errormask fixed(%u) %lx -> %lx, ctrl %x hwerr %lx\n",
+			fixed, errormask, (unsigned long)dd->ipath_errormask,
+			ctrl, hwerrs);
+	} else
+		ipath_dbg("errormask fixed(%u) %lx -> %lx, no freeze\n",
+			fixed, errormask,
+			(unsigned long)dd->ipath_errormask);
+}
+
+
+/**
+ * ipath_get_faststats - get word counters from chip before they overflow
+ * @opaque - contains a pointer to the infinipath device ipath_devdata
+ *
+ * called from add_timer
+ */
+void ipath_get_faststats(unsigned long opaque)
+{
+	struct ipath_devdata *dd = (struct ipath_devdata *) opaque;
+	int i;
+	static unsigned cnt;
+	unsigned long flags;
+	u64 traffic_wds;
+
+	/*
+	 * don't access the chip while running diags, or memory diags can
+	 * fail
+	 */
+	if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_INITTED) ||
+	    ipath_diag_inuse)
+		/* but re-arm the timer, for diags case; won't hurt other */
+		goto done;
+
+	/*
+	 * We now try to maintain a "active timer", based on traffic
+	 * exceeding a threshold, so we need to check the word-counts
+	 * even if they are 64-bit.
+	 */
+	traffic_wds = ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordsendcnt) +
+		ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordrcvcnt);
+	spin_lock_irqsave(&dd->ipath_eep_st_lock, flags);
+	traffic_wds -= dd->ipath_traffic_wds;
+	dd->ipath_traffic_wds += traffic_wds;
+	if (traffic_wds  >= IPATH_TRAFFIC_ACTIVE_THRESHOLD)
+		atomic_add(5, &dd->ipath_active_time); /* S/B #define */
+	spin_unlock_irqrestore(&dd->ipath_eep_st_lock, flags);
+
+	if (dd->ipath_flags & IPATH_32BITCOUNTERS) {
+		ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktsendcnt);
+		ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktrcvcnt);
+	}
+
+	ipath_qcheck(dd);
+
+	/*
+	 * deal with repeat error suppression.  Doesn't really matter if
+	 * last error was almost a full interval ago, or just a few usecs
+	 * ago; still won't get more than 2 per interval.  We may want
+	 * longer intervals for this eventually, could do with mod, counter
+	 * or separate timer.  Also see code in ipath_handle_errors() and
+	 * ipath_handle_hwerrors().
+	 */
+
+	if (dd->ipath_lasterror)
+		dd->ipath_lasterror = 0;
+	if (dd->ipath_lasthwerror)
+		dd->ipath_lasthwerror = 0;
+	if (dd->ipath_maskederrs
+	    && time_after(jiffies, dd->ipath_unmasktime)) {
+		char ebuf[256];
+		int iserr;
+		iserr = ipath_decode_err(dd, ebuf, sizeof ebuf,
+					 dd->ipath_maskederrs);
+		if (dd->ipath_maskederrs &
+		    ~(INFINIPATH_E_RRCVEGRFULL | INFINIPATH_E_RRCVHDRFULL |
+		      INFINIPATH_E_PKTERRS))
+			ipath_dev_err(dd, "Re-enabling masked errors "
+				      "(%s)\n", ebuf);
+		else {
+			/*
+			 * rcvegrfull and rcvhdrqfull are "normal", for some
+			 * types of processes (mostly benchmarks) that send
+			 * huge numbers of messages, while not processing
+			 * them.  So only complain about these at debug
+			 * level.
+			 */
+			if (iserr)
+				ipath_dbg(
+					"Re-enabling queue full errors (%s)\n",
+					ebuf);
+			else
+				ipath_cdbg(ERRPKT, "Re-enabling packet"
+					" problem interrupt (%s)\n", ebuf);
+		}
+
+		/* re-enable masked errors */
+		dd->ipath_errormask |= dd->ipath_maskederrs;
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
+				 dd->ipath_errormask);
+		dd->ipath_maskederrs = 0;
+	}
+
+	/* limit qfull messages to ~one per minute per port */
+	if ((++cnt & 0x10)) {
+		for (i = (int) dd->ipath_cfgports; --i >= 0; ) {
+			struct ipath_portdata *pd = dd->ipath_pd[i];
+
+			if (pd && pd->port_lastrcvhdrqtail != -1)
+				pd->port_lastrcvhdrqtail = -1;
+		}
+	}
+
+	ipath_chk_errormask(dd);
+done:
+	mod_timer(&dd->ipath_stats_timer, jiffies + HZ * 5);
+}
diff --git a/drivers/infiniband/hw/ipath/ipath_sysfs.c b/drivers/infiniband/hw/ipath/ipath_sysfs.c
new file mode 100644
index 0000000..75558f3
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_sysfs.c
@@ -0,0 +1,1238 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/ctype.h>
+#include <linux/stat.h>
+
+#include "ipath_kernel.h"
+#include "ipath_verbs.h"
+#include "ipath_common.h"
+
+/**
+ * ipath_parse_ushort - parse an unsigned short value in an arbitrary base
+ * @str: the string containing the number
+ * @valp: where to put the result
+ *
+ * returns the number of bytes consumed, or negative value on error
+ */
+int ipath_parse_ushort(const char *str, unsigned short *valp)
+{
+	unsigned long val;
+	char *end;
+	int ret;
+
+	if (!isdigit(str[0])) {
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	val = simple_strtoul(str, &end, 0);
+
+	if (val > 0xffff) {
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	*valp = val;
+
+	ret = end + 1 - str;
+	if (ret == 0)
+		ret = -EINVAL;
+
+bail:
+	return ret;
+}
+
+static ssize_t show_version(struct device_driver *dev, char *buf)
+{
+	/* The string printed here is already newline-terminated. */
+	return scnprintf(buf, PAGE_SIZE, "%s", ib_ipath_version);
+}
+
+static ssize_t show_num_units(struct device_driver *dev, char *buf)
+{
+	return scnprintf(buf, PAGE_SIZE, "%d\n",
+			 ipath_count_units(NULL, NULL, NULL));
+}
+
+static ssize_t show_status(struct device *dev,
+			   struct device_attribute *attr,
+			   char *buf)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	ssize_t ret;
+
+	if (!dd->ipath_statusp) {
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	ret = scnprintf(buf, PAGE_SIZE, "0x%llx\n",
+			(unsigned long long) *(dd->ipath_statusp));
+
+bail:
+	return ret;
+}
+
+static const char *ipath_status_str[] = {
+	"Initted",
+	"Disabled",
+	"Admin_Disabled",
+	"", /* This used to be the old "OIB_SMA" status. */
+	"", /* This used to be the old "SMA" status. */
+	"Present",
+	"IB_link_up",
+	"IB_configured",
+	"NoIBcable",
+	"Fatal_Hardware_Error",
+	NULL,
+};
+
+static ssize_t show_status_str(struct device *dev,
+			       struct device_attribute *attr,
+			       char *buf)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	int i, any;
+	u64 s;
+	ssize_t ret;
+
+	if (!dd->ipath_statusp) {
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	s = *(dd->ipath_statusp);
+	*buf = '\0';
+	for (any = i = 0; s && ipath_status_str[i]; i++) {
+		if (s & 1) {
+			if (any && strlcat(buf, " ", PAGE_SIZE) >=
+			    PAGE_SIZE)
+				/* overflow */
+				break;
+			if (strlcat(buf, ipath_status_str[i],
+				    PAGE_SIZE) >= PAGE_SIZE)
+				break;
+			any = 1;
+		}
+		s >>= 1;
+	}
+	if (any)
+		strlcat(buf, "\n", PAGE_SIZE);
+
+	ret = strlen(buf);
+
+bail:
+	return ret;
+}
+
+static ssize_t show_boardversion(struct device *dev,
+			       struct device_attribute *attr,
+			       char *buf)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	/* The string printed here is already newline-terminated. */
+	return scnprintf(buf, PAGE_SIZE, "%s", dd->ipath_boardversion);
+}
+
+static ssize_t show_localbus_info(struct device *dev,
+			       struct device_attribute *attr,
+			       char *buf)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	/* The string printed here is already newline-terminated. */
+	return scnprintf(buf, PAGE_SIZE, "%s", dd->ipath_lbus_info);
+}
+
+static ssize_t show_lmc(struct device *dev,
+			struct device_attribute *attr,
+			char *buf)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+
+	return scnprintf(buf, PAGE_SIZE, "%u\n", dd->ipath_lmc);
+}
+
+static ssize_t store_lmc(struct device *dev,
+			 struct device_attribute *attr,
+			 const char *buf,
+			 size_t count)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	u16 lmc = 0;
+	int ret;
+
+	ret = ipath_parse_ushort(buf, &lmc);
+	if (ret < 0)
+		goto invalid;
+
+	if (lmc > 7) {
+		ret = -EINVAL;
+		goto invalid;
+	}
+
+	ipath_set_lid(dd, dd->ipath_lid, lmc);
+
+	goto bail;
+invalid:
+	ipath_dev_err(dd, "attempt to set invalid LMC %u\n", lmc);
+bail:
+	return ret;
+}
+
+static ssize_t show_lid(struct device *dev,
+			struct device_attribute *attr,
+			char *buf)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+
+	return scnprintf(buf, PAGE_SIZE, "0x%x\n", dd->ipath_lid);
+}
+
+static ssize_t store_lid(struct device *dev,
+			 struct device_attribute *attr,
+			  const char *buf,
+			  size_t count)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	u16 lid = 0;
+	int ret;
+
+	ret = ipath_parse_ushort(buf, &lid);
+	if (ret < 0)
+		goto invalid;
+
+	if (lid == 0 || lid >= IPATH_MULTICAST_LID_BASE) {
+		ret = -EINVAL;
+		goto invalid;
+	}
+
+	ipath_set_lid(dd, lid, dd->ipath_lmc);
+
+	goto bail;
+invalid:
+	ipath_dev_err(dd, "attempt to set invalid LID 0x%x\n", lid);
+bail:
+	return ret;
+}
+
+static ssize_t show_mlid(struct device *dev,
+			 struct device_attribute *attr,
+			 char *buf)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+
+	return scnprintf(buf, PAGE_SIZE, "0x%x\n", dd->ipath_mlid);
+}
+
+static ssize_t store_mlid(struct device *dev,
+			 struct device_attribute *attr,
+			  const char *buf,
+			  size_t count)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	u16 mlid;
+	int ret;
+
+	ret = ipath_parse_ushort(buf, &mlid);
+	if (ret < 0 || mlid < IPATH_MULTICAST_LID_BASE)
+		goto invalid;
+
+	dd->ipath_mlid = mlid;
+
+	goto bail;
+invalid:
+	ipath_dev_err(dd, "attempt to set invalid MLID\n");
+bail:
+	return ret;
+}
+
+static ssize_t show_guid(struct device *dev,
+			 struct device_attribute *attr,
+			 char *buf)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	u8 *guid;
+
+	guid = (u8 *) & (dd->ipath_guid);
+
+	return scnprintf(buf, PAGE_SIZE,
+			 "%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x\n",
+			 guid[0], guid[1], guid[2], guid[3],
+			 guid[4], guid[5], guid[6], guid[7]);
+}
+
+static ssize_t store_guid(struct device *dev,
+			 struct device_attribute *attr,
+			  const char *buf,
+			  size_t count)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	ssize_t ret;
+	unsigned short guid[8];
+	__be64 new_guid;
+	u8 *ng;
+	int i;
+
+	if (sscanf(buf, "%hx:%hx:%hx:%hx:%hx:%hx:%hx:%hx",
+		   &guid[0], &guid[1], &guid[2], &guid[3],
+		   &guid[4], &guid[5], &guid[6], &guid[7]) != 8)
+		goto invalid;
+
+	ng = (u8 *) &new_guid;
+
+	for (i = 0; i < 8; i++) {
+		if (guid[i] > 0xff)
+			goto invalid;
+		ng[i] = guid[i];
+	}
+
+	if (new_guid == 0)
+		goto invalid;
+
+	dd->ipath_guid = new_guid;
+	dd->ipath_nguid = 1;
+	if (dd->verbs_dev)
+		dd->verbs_dev->ibdev.node_guid = new_guid;
+
+	ret = strlen(buf);
+	goto bail;
+
+invalid:
+	ipath_dev_err(dd, "attempt to set invalid GUID\n");
+	ret = -EINVAL;
+
+bail:
+	return ret;
+}
+
+static ssize_t show_nguid(struct device *dev,
+			  struct device_attribute *attr,
+			  char *buf)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+
+	return scnprintf(buf, PAGE_SIZE, "%u\n", dd->ipath_nguid);
+}
+
+static ssize_t show_nports(struct device *dev,
+			   struct device_attribute *attr,
+			   char *buf)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+
+	/* Return the number of user ports available. */
+	return scnprintf(buf, PAGE_SIZE, "%u\n", dd->ipath_cfgports - 1);
+}
+
+static ssize_t show_serial(struct device *dev,
+			   struct device_attribute *attr,
+			   char *buf)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+
+	buf[sizeof dd->ipath_serial] = '\0';
+	memcpy(buf, dd->ipath_serial, sizeof dd->ipath_serial);
+	strcat(buf, "\n");
+	return strlen(buf);
+}
+
+static ssize_t show_unit(struct device *dev,
+			 struct device_attribute *attr,
+			 char *buf)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+
+	return scnprintf(buf, PAGE_SIZE, "%u\n", dd->ipath_unit);
+}
+
+static ssize_t show_jint_max_packets(struct device *dev,
+				     struct device_attribute *attr,
+				     char *buf)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+
+	return scnprintf(buf, PAGE_SIZE, "%hu\n", dd->ipath_jint_max_packets);
+}
+
+static ssize_t store_jint_max_packets(struct device *dev,
+				      struct device_attribute *attr,
+				      const char *buf,
+				      size_t count)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	u16 v = 0;
+	int ret;
+
+	ret = ipath_parse_ushort(buf, &v);
+	if (ret < 0)
+		ipath_dev_err(dd, "invalid jint_max_packets.\n");
+	else
+		dd->ipath_f_config_jint(dd, dd->ipath_jint_idle_ticks, v);
+
+	return ret;
+}
+
+static ssize_t show_jint_idle_ticks(struct device *dev,
+				    struct device_attribute *attr,
+				    char *buf)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+
+	return scnprintf(buf, PAGE_SIZE, "%hu\n", dd->ipath_jint_idle_ticks);
+}
+
+static ssize_t store_jint_idle_ticks(struct device *dev,
+				     struct device_attribute *attr,
+				     const char *buf,
+				     size_t count)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	u16 v = 0;
+	int ret;
+
+	ret = ipath_parse_ushort(buf, &v);
+	if (ret < 0)
+		ipath_dev_err(dd, "invalid jint_idle_ticks.\n");
+	else
+		dd->ipath_f_config_jint(dd, v, dd->ipath_jint_max_packets);
+
+	return ret;
+}
+
+#define DEVICE_COUNTER(name, attr) \
+	static ssize_t show_counter_##name(struct device *dev, \
+					   struct device_attribute *attr, \
+					   char *buf) \
+	{ \
+		struct ipath_devdata *dd = dev_get_drvdata(dev); \
+		return scnprintf(\
+			buf, PAGE_SIZE, "%llu\n", (unsigned long long) \
+			ipath_snap_cntr( \
+				dd, offsetof(struct infinipath_counters, \
+					     attr) / sizeof(u64)));	\
+	} \
+	static DEVICE_ATTR(name, S_IRUGO, show_counter_##name, NULL);
+
+DEVICE_COUNTER(ib_link_downeds, IBLinkDownedCnt);
+DEVICE_COUNTER(ib_link_err_recoveries, IBLinkErrRecoveryCnt);
+DEVICE_COUNTER(ib_status_changes, IBStatusChangeCnt);
+DEVICE_COUNTER(ib_symbol_errs, IBSymbolErrCnt);
+DEVICE_COUNTER(lb_flow_stalls, LBFlowStallCnt);
+DEVICE_COUNTER(lb_ints, LBIntCnt);
+DEVICE_COUNTER(rx_bad_formats, RxBadFormatCnt);
+DEVICE_COUNTER(rx_buf_ovfls, RxBufOvflCnt);
+DEVICE_COUNTER(rx_data_pkts, RxDataPktCnt);
+DEVICE_COUNTER(rx_dropped_pkts, RxDroppedPktCnt);
+DEVICE_COUNTER(rx_dwords, RxDwordCnt);
+DEVICE_COUNTER(rx_ebps, RxEBPCnt);
+DEVICE_COUNTER(rx_flow_ctrl_errs, RxFlowCtrlErrCnt);
+DEVICE_COUNTER(rx_flow_pkts, RxFlowPktCnt);
+DEVICE_COUNTER(rx_icrc_errs, RxICRCErrCnt);
+DEVICE_COUNTER(rx_len_errs, RxLenErrCnt);
+DEVICE_COUNTER(rx_link_problems, RxLinkProblemCnt);
+DEVICE_COUNTER(rx_lpcrc_errs, RxLPCRCErrCnt);
+DEVICE_COUNTER(rx_max_min_len_errs, RxMaxMinLenErrCnt);
+DEVICE_COUNTER(rx_p0_hdr_egr_ovfls, RxP0HdrEgrOvflCnt);
+DEVICE_COUNTER(rx_p1_hdr_egr_ovfls, RxP1HdrEgrOvflCnt);
+DEVICE_COUNTER(rx_p2_hdr_egr_ovfls, RxP2HdrEgrOvflCnt);
+DEVICE_COUNTER(rx_p3_hdr_egr_ovfls, RxP3HdrEgrOvflCnt);
+DEVICE_COUNTER(rx_p4_hdr_egr_ovfls, RxP4HdrEgrOvflCnt);
+DEVICE_COUNTER(rx_p5_hdr_egr_ovfls, RxP5HdrEgrOvflCnt);
+DEVICE_COUNTER(rx_p6_hdr_egr_ovfls, RxP6HdrEgrOvflCnt);
+DEVICE_COUNTER(rx_p7_hdr_egr_ovfls, RxP7HdrEgrOvflCnt);
+DEVICE_COUNTER(rx_p8_hdr_egr_ovfls, RxP8HdrEgrOvflCnt);
+DEVICE_COUNTER(rx_pkey_mismatches, RxPKeyMismatchCnt);
+DEVICE_COUNTER(rx_tid_full_errs, RxTIDFullErrCnt);
+DEVICE_COUNTER(rx_tid_valid_errs, RxTIDValidErrCnt);
+DEVICE_COUNTER(rx_vcrc_errs, RxVCRCErrCnt);
+DEVICE_COUNTER(tx_data_pkts, TxDataPktCnt);
+DEVICE_COUNTER(tx_dropped_pkts, TxDroppedPktCnt);
+DEVICE_COUNTER(tx_dwords, TxDwordCnt);
+DEVICE_COUNTER(tx_flow_pkts, TxFlowPktCnt);
+DEVICE_COUNTER(tx_flow_stalls, TxFlowStallCnt);
+DEVICE_COUNTER(tx_len_errs, TxLenErrCnt);
+DEVICE_COUNTER(tx_max_min_len_errs, TxMaxMinLenErrCnt);
+DEVICE_COUNTER(tx_underruns, TxUnderrunCnt);
+DEVICE_COUNTER(tx_unsup_vl_errs, TxUnsupVLErrCnt);
+
+static struct attribute *dev_counter_attributes[] = {
+	&dev_attr_ib_link_downeds.attr,
+	&dev_attr_ib_link_err_recoveries.attr,
+	&dev_attr_ib_status_changes.attr,
+	&dev_attr_ib_symbol_errs.attr,
+	&dev_attr_lb_flow_stalls.attr,
+	&dev_attr_lb_ints.attr,
+	&dev_attr_rx_bad_formats.attr,
+	&dev_attr_rx_buf_ovfls.attr,
+	&dev_attr_rx_data_pkts.attr,
+	&dev_attr_rx_dropped_pkts.attr,
+	&dev_attr_rx_dwords.attr,
+	&dev_attr_rx_ebps.attr,
+	&dev_attr_rx_flow_ctrl_errs.attr,
+	&dev_attr_rx_flow_pkts.attr,
+	&dev_attr_rx_icrc_errs.attr,
+	&dev_attr_rx_len_errs.attr,
+	&dev_attr_rx_link_problems.attr,
+	&dev_attr_rx_lpcrc_errs.attr,
+	&dev_attr_rx_max_min_len_errs.attr,
+	&dev_attr_rx_p0_hdr_egr_ovfls.attr,
+	&dev_attr_rx_p1_hdr_egr_ovfls.attr,
+	&dev_attr_rx_p2_hdr_egr_ovfls.attr,
+	&dev_attr_rx_p3_hdr_egr_ovfls.attr,
+	&dev_attr_rx_p4_hdr_egr_ovfls.attr,
+	&dev_attr_rx_p5_hdr_egr_ovfls.attr,
+	&dev_attr_rx_p6_hdr_egr_ovfls.attr,
+	&dev_attr_rx_p7_hdr_egr_ovfls.attr,
+	&dev_attr_rx_p8_hdr_egr_ovfls.attr,
+	&dev_attr_rx_pkey_mismatches.attr,
+	&dev_attr_rx_tid_full_errs.attr,
+	&dev_attr_rx_tid_valid_errs.attr,
+	&dev_attr_rx_vcrc_errs.attr,
+	&dev_attr_tx_data_pkts.attr,
+	&dev_attr_tx_dropped_pkts.attr,
+	&dev_attr_tx_dwords.attr,
+	&dev_attr_tx_flow_pkts.attr,
+	&dev_attr_tx_flow_stalls.attr,
+	&dev_attr_tx_len_errs.attr,
+	&dev_attr_tx_max_min_len_errs.attr,
+	&dev_attr_tx_underruns.attr,
+	&dev_attr_tx_unsup_vl_errs.attr,
+	NULL
+};
+
+static struct attribute_group dev_counter_attr_group = {
+	.name = "counters",
+	.attrs = dev_counter_attributes
+};
+
+static ssize_t store_reset(struct device *dev,
+			 struct device_attribute *attr,
+			  const char *buf,
+			  size_t count)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	int ret;
+
+	if (count < 5 || memcmp(buf, "reset", 5)) {
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	if (dd->ipath_flags & IPATH_DISABLED) {
+		/*
+		 * post-reset init would re-enable interrupts, etc.
+		 * so don't allow reset on disabled devices.  Not
+		 * perfect error, but about the best choice.
+		 */
+		dev_info(dev,"Unit %d is disabled, can't reset\n",
+			 dd->ipath_unit);
+		ret = -EINVAL;
+		goto bail;
+	}
+	ret = ipath_reset_device(dd->ipath_unit);
+bail:
+	return ret<0 ? ret : count;
+}
+
+static ssize_t store_link_state(struct device *dev,
+			 struct device_attribute *attr,
+			  const char *buf,
+			  size_t count)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	int ret, r;
+	u16 state;
+
+	ret = ipath_parse_ushort(buf, &state);
+	if (ret < 0)
+		goto invalid;
+
+	r = ipath_set_linkstate(dd, state);
+	if (r < 0) {
+		ret = r;
+		goto bail;
+	}
+
+	goto bail;
+invalid:
+	ipath_dev_err(dd, "attempt to set invalid link state\n");
+bail:
+	return ret;
+}
+
+static ssize_t show_mtu(struct device *dev,
+			 struct device_attribute *attr,
+			 char *buf)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	return scnprintf(buf, PAGE_SIZE, "%u\n", dd->ipath_ibmtu);
+}
+
+static ssize_t store_mtu(struct device *dev,
+			 struct device_attribute *attr,
+			  const char *buf,
+			  size_t count)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	ssize_t ret;
+	u16 mtu = 0;
+	int r;
+
+	ret = ipath_parse_ushort(buf, &mtu);
+	if (ret < 0)
+		goto invalid;
+
+	r = ipath_set_mtu(dd, mtu);
+	if (r < 0)
+		ret = r;
+
+	goto bail;
+invalid:
+	ipath_dev_err(dd, "attempt to set invalid MTU\n");
+bail:
+	return ret;
+}
+
+static ssize_t show_enabled(struct device *dev,
+			 struct device_attribute *attr,
+			 char *buf)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	return scnprintf(buf, PAGE_SIZE, "%u\n",
+			 (dd->ipath_flags & IPATH_DISABLED) ? 0 : 1);
+}
+
+static ssize_t store_enabled(struct device *dev,
+			 struct device_attribute *attr,
+			  const char *buf,
+			  size_t count)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	ssize_t ret;
+	u16 enable = 0;
+
+	ret = ipath_parse_ushort(buf, &enable);
+	if (ret < 0) {
+		ipath_dev_err(dd, "attempt to use non-numeric on enable\n");
+		goto bail;
+	}
+
+	if (enable) {
+		if (!(dd->ipath_flags & IPATH_DISABLED))
+			goto bail;
+
+		dev_info(dev, "Enabling unit %d\n", dd->ipath_unit);
+		/* same as post-reset */
+		ret = ipath_init_chip(dd, 1);
+		if (ret)
+			ipath_dev_err(dd, "Failed to enable unit %d\n",
+				      dd->ipath_unit);
+		else {
+			dd->ipath_flags &= ~IPATH_DISABLED;
+			*dd->ipath_statusp &= ~IPATH_STATUS_ADMIN_DISABLED;
+		}
+	}
+	else if (!(dd->ipath_flags & IPATH_DISABLED)) {
+		dev_info(dev, "Disabling unit %d\n", dd->ipath_unit);
+		ipath_shutdown_device(dd);
+		dd->ipath_flags |= IPATH_DISABLED;
+		*dd->ipath_statusp |= IPATH_STATUS_ADMIN_DISABLED;
+	}
+
+bail:
+	return ret;
+}
+
+static ssize_t store_rx_pol_inv(struct device *dev,
+			  struct device_attribute *attr,
+			  const char *buf,
+			  size_t count)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	int ret, r;
+	u16 val;
+
+	ret = ipath_parse_ushort(buf, &val);
+	if (ret < 0)
+		goto invalid;
+
+	r = ipath_set_rx_pol_inv(dd, val);
+	if (r < 0) {
+		ret = r;
+		goto bail;
+	}
+
+	goto bail;
+invalid:
+	ipath_dev_err(dd, "attempt to set invalid Rx Polarity invert\n");
+bail:
+	return ret;
+}
+
+static ssize_t store_led_override(struct device *dev,
+			  struct device_attribute *attr,
+			  const char *buf,
+			  size_t count)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	int ret;
+	u16 val;
+
+	ret = ipath_parse_ushort(buf, &val);
+	if (ret > 0)
+		ipath_set_led_override(dd, val);
+	else
+		ipath_dev_err(dd, "attempt to set invalid LED override\n");
+	return ret;
+}
+
+static ssize_t show_logged_errs(struct device *dev,
+				struct device_attribute *attr,
+				char *buf)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	int idx, count;
+
+	/* force consistency with actual EEPROM */
+	if (ipath_update_eeprom_log(dd) != 0)
+		return -ENXIO;
+
+	count = 0;
+	for (idx = 0; idx < IPATH_EEP_LOG_CNT; ++idx) {
+		count += scnprintf(buf + count, PAGE_SIZE - count, "%d%c",
+			dd->ipath_eep_st_errs[idx],
+			idx == (IPATH_EEP_LOG_CNT - 1) ? '\n' : ' ');
+	}
+
+	return count;
+}
+
+/*
+ * New sysfs entries to control various IB config. These all turn into
+ * accesses via ipath_f_get/set_ib_cfg.
+ *
+ * Get/Set heartbeat enable. Or of 1=enabled, 2=auto
+ */
+static ssize_t show_hrtbt_enb(struct device *dev,
+			 struct device_attribute *attr,
+			 char *buf)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	int ret;
+
+	ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_HRTBT);
+	if (ret >= 0)
+		ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret);
+	return ret;
+}
+
+static ssize_t store_hrtbt_enb(struct device *dev,
+			  struct device_attribute *attr,
+			  const char *buf,
+			  size_t count)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	int ret, r;
+	u16 val;
+
+	ret = ipath_parse_ushort(buf, &val);
+	if (ret >= 0 && val > 3)
+		ret = -EINVAL;
+	if (ret < 0) {
+		ipath_dev_err(dd, "attempt to set invalid Heartbeat enable\n");
+		goto bail;
+	}
+
+	/*
+	 * Set the "intentional" heartbeat enable per either of
+	 * "Enable" and "Auto", as these are normally set together.
+	 * This bit is consulted when leaving loopback mode,
+	 * because entering loopback mode overrides it and automatically
+	 * disables heartbeat.
+	 */
+	r = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT, val);
+	if (r < 0)
+		ret = r;
+	else if (val == IPATH_IB_HRTBT_OFF)
+		dd->ipath_flags |= IPATH_NO_HRTBT;
+	else
+		dd->ipath_flags &= ~IPATH_NO_HRTBT;
+
+bail:
+	return ret;
+}
+
+/*
+ * Get/Set Link-widths enabled. Or of 1=1x, 2=4x (this is human/IB centric,
+ * _not_ the particular encoding of any given chip)
+ */
+static ssize_t show_lwid_enb(struct device *dev,
+			 struct device_attribute *attr,
+			 char *buf)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	int ret;
+
+	ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_LWID_ENB);
+	if (ret >= 0)
+		ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret);
+	return ret;
+}
+
+static ssize_t store_lwid_enb(struct device *dev,
+			  struct device_attribute *attr,
+			  const char *buf,
+			  size_t count)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	int ret, r;
+	u16 val;
+
+	ret = ipath_parse_ushort(buf, &val);
+	if (ret >= 0 && (val == 0 || val > 3))
+		ret = -EINVAL;
+	if (ret < 0) {
+		ipath_dev_err(dd,
+			"attempt to set invalid Link Width (enable)\n");
+		goto bail;
+	}
+
+	r = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_LWID_ENB, val);
+	if (r < 0)
+		ret = r;
+
+bail:
+	return ret;
+}
+
+/* Get current link width */
+static ssize_t show_lwid(struct device *dev,
+			 struct device_attribute *attr,
+			 char *buf)
+
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	int ret;
+
+	ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_LWID);
+	if (ret >= 0)
+		ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret);
+	return ret;
+}
+
+/*
+ * Get/Set Link-speeds enabled. Or of 1=SDR 2=DDR.
+ */
+static ssize_t show_spd_enb(struct device *dev,
+			 struct device_attribute *attr,
+			 char *buf)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	int ret;
+
+	ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_SPD_ENB);
+	if (ret >= 0)
+		ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret);
+	return ret;
+}
+
+static ssize_t store_spd_enb(struct device *dev,
+			  struct device_attribute *attr,
+			  const char *buf,
+			  size_t count)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	int ret, r;
+	u16 val;
+
+	ret = ipath_parse_ushort(buf, &val);
+	if (ret >= 0 && (val == 0 || val > (IPATH_IB_SDR | IPATH_IB_DDR)))
+		ret = -EINVAL;
+	if (ret < 0) {
+		ipath_dev_err(dd,
+			"attempt to set invalid Link Speed (enable)\n");
+		goto bail;
+	}
+
+	r = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_SPD_ENB, val);
+	if (r < 0)
+		ret = r;
+
+bail:
+	return ret;
+}
+
+/* Get current link speed */
+static ssize_t show_spd(struct device *dev,
+			 struct device_attribute *attr,
+			 char *buf)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	int ret;
+
+	ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_SPD);
+	if (ret >= 0)
+		ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret);
+	return ret;
+}
+
+/*
+ * Get/Set RX polarity-invert enable. 0=no, 1=yes.
+ */
+static ssize_t show_rx_polinv_enb(struct device *dev,
+			 struct device_attribute *attr,
+			 char *buf)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	int ret;
+
+	ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_RXPOL_ENB);
+	if (ret >= 0)
+		ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret);
+	return ret;
+}
+
+static ssize_t store_rx_polinv_enb(struct device *dev,
+			  struct device_attribute *attr,
+			  const char *buf,
+			  size_t count)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	int ret, r;
+	u16 val;
+
+	ret = ipath_parse_ushort(buf, &val);
+	if (ret >= 0 && val > 1) {
+		ipath_dev_err(dd,
+			"attempt to set invalid Rx Polarity (enable)\n");
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	r = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_RXPOL_ENB, val);
+	if (r < 0)
+		ret = r;
+
+bail:
+	return ret;
+}
+
+/*
+ * Get/Set RX lane-reversal enable. 0=no, 1=yes.
+ */
+static ssize_t show_lanerev_enb(struct device *dev,
+			 struct device_attribute *attr,
+			 char *buf)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	int ret;
+
+	ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_LREV_ENB);
+	if (ret >= 0)
+		ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret);
+	return ret;
+}
+
+static ssize_t store_lanerev_enb(struct device *dev,
+			  struct device_attribute *attr,
+			  const char *buf,
+			  size_t count)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	int ret, r;
+	u16 val;
+
+	ret = ipath_parse_ushort(buf, &val);
+	if (ret >= 0 && val > 1) {
+		ret = -EINVAL;
+		ipath_dev_err(dd,
+			"attempt to set invalid Lane reversal (enable)\n");
+		goto bail;
+	}
+
+	r = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_LREV_ENB, val);
+	if (r < 0)
+		ret = r;
+
+bail:
+	return ret;
+}
+
+static DRIVER_ATTR(num_units, S_IRUGO, show_num_units, NULL);
+static DRIVER_ATTR(version, S_IRUGO, show_version, NULL);
+
+static struct attribute *driver_attributes[] = {
+	&driver_attr_num_units.attr,
+	&driver_attr_version.attr,
+	NULL
+};
+
+static struct attribute_group driver_attr_group = {
+	.attrs = driver_attributes
+};
+
+static ssize_t store_tempsense(struct device *dev,
+			       struct device_attribute *attr,
+			       const char *buf,
+			       size_t count)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	int ret, stat;
+	u16 val;
+
+	ret = ipath_parse_ushort(buf, &val);
+	if (ret <= 0) {
+		ipath_dev_err(dd, "attempt to set invalid tempsense config\n");
+		goto bail;
+	}
+	/* If anything but the highest limit, enable T_CRIT_A "interrupt" */
+	stat = ipath_tempsense_write(dd, 9, (val == 0x7f7f) ? 0x80 : 0);
+	if (stat) {
+		ipath_dev_err(dd, "Unable to set tempsense config\n");
+		ret = -1;
+		goto bail;
+	}
+	stat = ipath_tempsense_write(dd, 0xB, (u8) (val & 0xFF));
+	if (stat) {
+		ipath_dev_err(dd, "Unable to set local Tcrit\n");
+		ret = -1;
+		goto bail;
+	}
+	stat = ipath_tempsense_write(dd, 0xD, (u8) (val >> 8));
+	if (stat) {
+		ipath_dev_err(dd, "Unable to set remote Tcrit\n");
+		ret = -1;
+		goto bail;
+	}
+
+bail:
+	return ret;
+}
+
+/*
+ * dump tempsense regs. in decimal, to ease shell-scripts.
+ */
+static ssize_t show_tempsense(struct device *dev,
+			      struct device_attribute *attr,
+			      char *buf)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	int ret;
+	int idx;
+	u8 regvals[8];
+
+	ret = -ENXIO;
+	for (idx = 0; idx < 8; ++idx) {
+		if (idx == 6)
+			continue;
+		ret = ipath_tempsense_read(dd, idx);
+		if (ret < 0)
+			break;
+		regvals[idx] = ret;
+	}
+	if (idx == 8)
+		ret = scnprintf(buf, PAGE_SIZE, "%d %d %02X %02X %d %d\n",
+			*(signed char *)(regvals),
+			*(signed char *)(regvals + 1),
+			regvals[2], regvals[3],
+			*(signed char *)(regvals + 5),
+			*(signed char *)(regvals + 7));
+	return ret;
+}
+
+const struct attribute_group *ipath_driver_attr_groups[] = {
+	&driver_attr_group,
+	NULL,
+};
+
+static DEVICE_ATTR(guid, S_IWUSR | S_IRUGO, show_guid, store_guid);
+static DEVICE_ATTR(lmc, S_IWUSR | S_IRUGO, show_lmc, store_lmc);
+static DEVICE_ATTR(lid, S_IWUSR | S_IRUGO, show_lid, store_lid);
+static DEVICE_ATTR(link_state, S_IWUSR, NULL, store_link_state);
+static DEVICE_ATTR(mlid, S_IWUSR | S_IRUGO, show_mlid, store_mlid);
+static DEVICE_ATTR(mtu, S_IWUSR | S_IRUGO, show_mtu, store_mtu);
+static DEVICE_ATTR(enabled, S_IWUSR | S_IRUGO, show_enabled, store_enabled);
+static DEVICE_ATTR(nguid, S_IRUGO, show_nguid, NULL);
+static DEVICE_ATTR(nports, S_IRUGO, show_nports, NULL);
+static DEVICE_ATTR(reset, S_IWUSR, NULL, store_reset);
+static DEVICE_ATTR(serial, S_IRUGO, show_serial, NULL);
+static DEVICE_ATTR(status, S_IRUGO, show_status, NULL);
+static DEVICE_ATTR(status_str, S_IRUGO, show_status_str, NULL);
+static DEVICE_ATTR(boardversion, S_IRUGO, show_boardversion, NULL);
+static DEVICE_ATTR(unit, S_IRUGO, show_unit, NULL);
+static DEVICE_ATTR(rx_pol_inv, S_IWUSR, NULL, store_rx_pol_inv);
+static DEVICE_ATTR(led_override, S_IWUSR, NULL, store_led_override);
+static DEVICE_ATTR(logged_errors, S_IRUGO, show_logged_errs, NULL);
+static DEVICE_ATTR(localbus_info, S_IRUGO, show_localbus_info, NULL);
+static DEVICE_ATTR(jint_max_packets, S_IWUSR | S_IRUGO,
+		   show_jint_max_packets, store_jint_max_packets);
+static DEVICE_ATTR(jint_idle_ticks, S_IWUSR | S_IRUGO,
+		   show_jint_idle_ticks, store_jint_idle_ticks);
+static DEVICE_ATTR(tempsense, S_IWUSR | S_IRUGO,
+		   show_tempsense, store_tempsense);
+
+static struct attribute *dev_attributes[] = {
+	&dev_attr_guid.attr,
+	&dev_attr_lmc.attr,
+	&dev_attr_lid.attr,
+	&dev_attr_link_state.attr,
+	&dev_attr_mlid.attr,
+	&dev_attr_mtu.attr,
+	&dev_attr_nguid.attr,
+	&dev_attr_nports.attr,
+	&dev_attr_serial.attr,
+	&dev_attr_status.attr,
+	&dev_attr_status_str.attr,
+	&dev_attr_boardversion.attr,
+	&dev_attr_unit.attr,
+	&dev_attr_enabled.attr,
+	&dev_attr_rx_pol_inv.attr,
+	&dev_attr_led_override.attr,
+	&dev_attr_logged_errors.attr,
+	&dev_attr_tempsense.attr,
+	&dev_attr_localbus_info.attr,
+	NULL
+};
+
+static struct attribute_group dev_attr_group = {
+	.attrs = dev_attributes
+};
+
+static DEVICE_ATTR(hrtbt_enable, S_IWUSR | S_IRUGO, show_hrtbt_enb,
+		   store_hrtbt_enb);
+static DEVICE_ATTR(link_width_enable, S_IWUSR | S_IRUGO, show_lwid_enb,
+		   store_lwid_enb);
+static DEVICE_ATTR(link_width, S_IRUGO, show_lwid, NULL);
+static DEVICE_ATTR(link_speed_enable, S_IWUSR | S_IRUGO, show_spd_enb,
+		   store_spd_enb);
+static DEVICE_ATTR(link_speed, S_IRUGO, show_spd, NULL);
+static DEVICE_ATTR(rx_pol_inv_enable, S_IWUSR | S_IRUGO, show_rx_polinv_enb,
+		   store_rx_polinv_enb);
+static DEVICE_ATTR(rx_lane_rev_enable, S_IWUSR | S_IRUGO, show_lanerev_enb,
+		   store_lanerev_enb);
+
+static struct attribute *dev_ibcfg_attributes[] = {
+	&dev_attr_hrtbt_enable.attr,
+	&dev_attr_link_width_enable.attr,
+	&dev_attr_link_width.attr,
+	&dev_attr_link_speed_enable.attr,
+	&dev_attr_link_speed.attr,
+	&dev_attr_rx_pol_inv_enable.attr,
+	&dev_attr_rx_lane_rev_enable.attr,
+	NULL
+};
+
+static struct attribute_group dev_ibcfg_attr_group = {
+	.attrs = dev_ibcfg_attributes
+};
+
+/**
+ * ipath_expose_reset - create a device reset file
+ * @dev: the device structure
+ *
+ * Only expose a file that lets us reset the device after someone
+ * enters diag mode.  A device reset is quite likely to crash the
+ * machine entirely, so we don't want to normally make it
+ * available.
+ *
+ * Called with ipath_mutex held.
+ */
+int ipath_expose_reset(struct device *dev)
+{
+	static int exposed;
+	int ret;
+
+	if (!exposed) {
+		ret = device_create_file(dev, &dev_attr_reset);
+		exposed = 1;
+	}
+	else
+		ret = 0;
+
+	return ret;
+}
+
+int ipath_device_create_group(struct device *dev, struct ipath_devdata *dd)
+{
+	int ret;
+
+	ret = sysfs_create_group(&dev->kobj, &dev_attr_group);
+	if (ret)
+		goto bail;
+
+	ret = sysfs_create_group(&dev->kobj, &dev_counter_attr_group);
+	if (ret)
+		goto bail_attrs;
+
+	if (dd->ipath_flags & IPATH_HAS_MULT_IB_SPEED) {
+		ret = device_create_file(dev, &dev_attr_jint_idle_ticks);
+		if (ret)
+			goto bail_counter;
+		ret = device_create_file(dev, &dev_attr_jint_max_packets);
+		if (ret)
+			goto bail_idle;
+
+		ret = sysfs_create_group(&dev->kobj, &dev_ibcfg_attr_group);
+		if (ret)
+			goto bail_max;
+	}
+
+	return 0;
+
+bail_max:
+	device_remove_file(dev, &dev_attr_jint_max_packets);
+bail_idle:
+	device_remove_file(dev, &dev_attr_jint_idle_ticks);
+bail_counter:
+	sysfs_remove_group(&dev->kobj, &dev_counter_attr_group);
+bail_attrs:
+	sysfs_remove_group(&dev->kobj, &dev_attr_group);
+bail:
+	return ret;
+}
+
+void ipath_device_remove_group(struct device *dev, struct ipath_devdata *dd)
+{
+	sysfs_remove_group(&dev->kobj, &dev_counter_attr_group);
+
+	if (dd->ipath_flags & IPATH_HAS_MULT_IB_SPEED) {
+		sysfs_remove_group(&dev->kobj, &dev_ibcfg_attr_group);
+		device_remove_file(dev, &dev_attr_jint_idle_ticks);
+		device_remove_file(dev, &dev_attr_jint_max_packets);
+	}
+
+	sysfs_remove_group(&dev->kobj, &dev_attr_group);
+
+	device_remove_file(dev, &dev_attr_reset);
+}
diff --git a/drivers/infiniband/hw/ipath/ipath_uc.c b/drivers/infiniband/hw/ipath/ipath_uc.c
new file mode 100644
index 0000000..22e6099
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_uc.c
@@ -0,0 +1,547 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "ipath_verbs.h"
+#include "ipath_kernel.h"
+
+/* cut down ridiculously long IB macro names */
+#define OP(x) IB_OPCODE_UC_##x
+
+/**
+ * ipath_make_uc_req - construct a request packet (SEND, RDMA write)
+ * @qp: a pointer to the QP
+ *
+ * Return 1 if constructed; otherwise, return 0.
+ */
+int ipath_make_uc_req(struct ipath_qp *qp)
+{
+	struct ipath_other_headers *ohdr;
+	struct ipath_swqe *wqe;
+	unsigned long flags;
+	u32 hwords;
+	u32 bth0;
+	u32 len;
+	u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu);
+	int ret = 0;
+
+	spin_lock_irqsave(&qp->s_lock, flags);
+
+	if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK)) {
+		if (!(ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND))
+			goto bail;
+		/* We are in the error state, flush the work request. */
+		if (qp->s_last == qp->s_head)
+			goto bail;
+		/* If DMAs are in progress, we can't flush immediately. */
+		if (atomic_read(&qp->s_dma_busy)) {
+			qp->s_flags |= IPATH_S_WAIT_DMA;
+			goto bail;
+		}
+		wqe = get_swqe_ptr(qp, qp->s_last);
+		ipath_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR);
+		goto done;
+	}
+
+	ohdr = &qp->s_hdr.u.oth;
+	if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
+		ohdr = &qp->s_hdr.u.l.oth;
+
+	/* header size in 32-bit words LRH+BTH = (8+12)/4. */
+	hwords = 5;
+	bth0 = 1 << 22; /* Set M bit */
+
+	/* Get the next send request. */
+	wqe = get_swqe_ptr(qp, qp->s_cur);
+	qp->s_wqe = NULL;
+	switch (qp->s_state) {
+	default:
+		if (!(ib_ipath_state_ops[qp->state] &
+		    IPATH_PROCESS_NEXT_SEND_OK))
+			goto bail;
+		/* Check if send work queue is empty. */
+		if (qp->s_cur == qp->s_head)
+			goto bail;
+		/*
+		 * Start a new request.
+		 */
+		qp->s_psn = wqe->psn = qp->s_next_psn;
+		qp->s_sge.sge = wqe->sg_list[0];
+		qp->s_sge.sg_list = wqe->sg_list + 1;
+		qp->s_sge.num_sge = wqe->wr.num_sge;
+		qp->s_len = len = wqe->length;
+		switch (wqe->wr.opcode) {
+		case IB_WR_SEND:
+		case IB_WR_SEND_WITH_IMM:
+			if (len > pmtu) {
+				qp->s_state = OP(SEND_FIRST);
+				len = pmtu;
+				break;
+			}
+			if (wqe->wr.opcode == IB_WR_SEND)
+				qp->s_state = OP(SEND_ONLY);
+			else {
+				qp->s_state =
+					OP(SEND_ONLY_WITH_IMMEDIATE);
+				/* Immediate data comes after the BTH */
+				ohdr->u.imm_data = wqe->wr.ex.imm_data;
+				hwords += 1;
+			}
+			if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+				bth0 |= 1 << 23;
+			qp->s_wqe = wqe;
+			if (++qp->s_cur >= qp->s_size)
+				qp->s_cur = 0;
+			break;
+
+		case IB_WR_RDMA_WRITE:
+		case IB_WR_RDMA_WRITE_WITH_IMM:
+			ohdr->u.rc.reth.vaddr =
+				cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
+			ohdr->u.rc.reth.rkey =
+				cpu_to_be32(wqe->wr.wr.rdma.rkey);
+			ohdr->u.rc.reth.length = cpu_to_be32(len);
+			hwords += sizeof(struct ib_reth) / 4;
+			if (len > pmtu) {
+				qp->s_state = OP(RDMA_WRITE_FIRST);
+				len = pmtu;
+				break;
+			}
+			if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
+				qp->s_state = OP(RDMA_WRITE_ONLY);
+			else {
+				qp->s_state =
+					OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE);
+				/* Immediate data comes after the RETH */
+				ohdr->u.rc.imm_data = wqe->wr.ex.imm_data;
+				hwords += 1;
+				if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+					bth0 |= 1 << 23;
+			}
+			qp->s_wqe = wqe;
+			if (++qp->s_cur >= qp->s_size)
+				qp->s_cur = 0;
+			break;
+
+		default:
+			goto bail;
+		}
+		break;
+
+	case OP(SEND_FIRST):
+		qp->s_state = OP(SEND_MIDDLE);
+		/* FALLTHROUGH */
+	case OP(SEND_MIDDLE):
+		len = qp->s_len;
+		if (len > pmtu) {
+			len = pmtu;
+			break;
+		}
+		if (wqe->wr.opcode == IB_WR_SEND)
+			qp->s_state = OP(SEND_LAST);
+		else {
+			qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);
+			/* Immediate data comes after the BTH */
+			ohdr->u.imm_data = wqe->wr.ex.imm_data;
+			hwords += 1;
+		}
+		if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+			bth0 |= 1 << 23;
+		qp->s_wqe = wqe;
+		if (++qp->s_cur >= qp->s_size)
+			qp->s_cur = 0;
+		break;
+
+	case OP(RDMA_WRITE_FIRST):
+		qp->s_state = OP(RDMA_WRITE_MIDDLE);
+		/* FALLTHROUGH */
+	case OP(RDMA_WRITE_MIDDLE):
+		len = qp->s_len;
+		if (len > pmtu) {
+			len = pmtu;
+			break;
+		}
+		if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
+			qp->s_state = OP(RDMA_WRITE_LAST);
+		else {
+			qp->s_state =
+				OP(RDMA_WRITE_LAST_WITH_IMMEDIATE);
+			/* Immediate data comes after the BTH */
+			ohdr->u.imm_data = wqe->wr.ex.imm_data;
+			hwords += 1;
+			if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+				bth0 |= 1 << 23;
+		}
+		qp->s_wqe = wqe;
+		if (++qp->s_cur >= qp->s_size)
+			qp->s_cur = 0;
+		break;
+	}
+	qp->s_len -= len;
+	qp->s_hdrwords = hwords;
+	qp->s_cur_sge = &qp->s_sge;
+	qp->s_cur_size = len;
+	ipath_make_ruc_header(to_idev(qp->ibqp.device),
+			      qp, ohdr, bth0 | (qp->s_state << 24),
+			      qp->s_next_psn++ & IPATH_PSN_MASK);
+done:
+	ret = 1;
+	goto unlock;
+
+bail:
+	qp->s_flags &= ~IPATH_S_BUSY;
+unlock:
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+	return ret;
+}
+
+/**
+ * ipath_uc_rcv - handle an incoming UC packet
+ * @dev: the device the packet came in on
+ * @hdr: the header of the packet
+ * @has_grh: true if the packet has a GRH
+ * @data: the packet data
+ * @tlen: the length of the packet
+ * @qp: the QP for this packet.
+ *
+ * This is called from ipath_qp_rcv() to process an incoming UC packet
+ * for the given QP.
+ * Called at interrupt level.
+ */
+void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
+		  int has_grh, void *data, u32 tlen, struct ipath_qp *qp)
+{
+	struct ipath_other_headers *ohdr;
+	int opcode;
+	u32 hdrsize;
+	u32 psn;
+	u32 pad;
+	struct ib_wc wc;
+	u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu);
+	struct ib_reth *reth;
+	int header_in_data;
+
+	/* Validate the SLID. See Ch. 9.6.1.5 */
+	if (unlikely(be16_to_cpu(hdr->lrh[3]) != qp->remote_ah_attr.dlid))
+		goto done;
+
+	/* Check for GRH */
+	if (!has_grh) {
+		ohdr = &hdr->u.oth;
+		hdrsize = 8 + 12;	/* LRH + BTH */
+		psn = be32_to_cpu(ohdr->bth[2]);
+		header_in_data = 0;
+	} else {
+		ohdr = &hdr->u.l.oth;
+		hdrsize = 8 + 40 + 12;	/* LRH + GRH + BTH */
+		/*
+		 * The header with GRH is 60 bytes and the
+		 * core driver sets the eager header buffer
+		 * size to 56 bytes so the last 4 bytes of
+		 * the BTH header (PSN) is in the data buffer.
+		 */
+		header_in_data = dev->dd->ipath_rcvhdrentsize == 16;
+		if (header_in_data) {
+			psn = be32_to_cpu(((__be32 *) data)[0]);
+			data += sizeof(__be32);
+		} else
+			psn = be32_to_cpu(ohdr->bth[2]);
+	}
+	/*
+	 * The opcode is in the low byte when its in network order
+	 * (top byte when in host order).
+	 */
+	opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
+
+	memset(&wc, 0, sizeof wc);
+
+	/* Compare the PSN verses the expected PSN. */
+	if (unlikely(ipath_cmp24(psn, qp->r_psn) != 0)) {
+		/*
+		 * Handle a sequence error.
+		 * Silently drop any current message.
+		 */
+		qp->r_psn = psn;
+	inv:
+		qp->r_state = OP(SEND_LAST);
+		switch (opcode) {
+		case OP(SEND_FIRST):
+		case OP(SEND_ONLY):
+		case OP(SEND_ONLY_WITH_IMMEDIATE):
+			goto send_first;
+
+		case OP(RDMA_WRITE_FIRST):
+		case OP(RDMA_WRITE_ONLY):
+		case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):
+			goto rdma_first;
+
+		default:
+			dev->n_pkt_drops++;
+			goto done;
+		}
+	}
+
+	/* Check for opcode sequence errors. */
+	switch (qp->r_state) {
+	case OP(SEND_FIRST):
+	case OP(SEND_MIDDLE):
+		if (opcode == OP(SEND_MIDDLE) ||
+		    opcode == OP(SEND_LAST) ||
+		    opcode == OP(SEND_LAST_WITH_IMMEDIATE))
+			break;
+		goto inv;
+
+	case OP(RDMA_WRITE_FIRST):
+	case OP(RDMA_WRITE_MIDDLE):
+		if (opcode == OP(RDMA_WRITE_MIDDLE) ||
+		    opcode == OP(RDMA_WRITE_LAST) ||
+		    opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
+			break;
+		goto inv;
+
+	default:
+		if (opcode == OP(SEND_FIRST) ||
+		    opcode == OP(SEND_ONLY) ||
+		    opcode == OP(SEND_ONLY_WITH_IMMEDIATE) ||
+		    opcode == OP(RDMA_WRITE_FIRST) ||
+		    opcode == OP(RDMA_WRITE_ONLY) ||
+		    opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE))
+			break;
+		goto inv;
+	}
+
+	/* OK, process the packet. */
+	switch (opcode) {
+	case OP(SEND_FIRST):
+	case OP(SEND_ONLY):
+	case OP(SEND_ONLY_WITH_IMMEDIATE):
+	send_first:
+		if (qp->r_flags & IPATH_R_REUSE_SGE) {
+			qp->r_flags &= ~IPATH_R_REUSE_SGE;
+			qp->r_sge = qp->s_rdma_read_sge;
+		} else if (!ipath_get_rwqe(qp, 0)) {
+			dev->n_pkt_drops++;
+			goto done;
+		}
+		/* Save the WQE so we can reuse it in case of an error. */
+		qp->s_rdma_read_sge = qp->r_sge;
+		qp->r_rcv_len = 0;
+		if (opcode == OP(SEND_ONLY))
+			goto send_last;
+		else if (opcode == OP(SEND_ONLY_WITH_IMMEDIATE))
+			goto send_last_imm;
+		/* FALLTHROUGH */
+	case OP(SEND_MIDDLE):
+		/* Check for invalid length PMTU or posted rwqe len. */
+		if (unlikely(tlen != (hdrsize + pmtu + 4))) {
+			qp->r_flags |= IPATH_R_REUSE_SGE;
+			dev->n_pkt_drops++;
+			goto done;
+		}
+		qp->r_rcv_len += pmtu;
+		if (unlikely(qp->r_rcv_len > qp->r_len)) {
+			qp->r_flags |= IPATH_R_REUSE_SGE;
+			dev->n_pkt_drops++;
+			goto done;
+		}
+		ipath_copy_sge(&qp->r_sge, data, pmtu);
+		break;
+
+	case OP(SEND_LAST_WITH_IMMEDIATE):
+	send_last_imm:
+		if (header_in_data) {
+			wc.ex.imm_data = *(__be32 *) data;
+			data += sizeof(__be32);
+		} else {
+			/* Immediate data comes after BTH */
+			wc.ex.imm_data = ohdr->u.imm_data;
+		}
+		hdrsize += 4;
+		wc.wc_flags = IB_WC_WITH_IMM;
+		/* FALLTHROUGH */
+	case OP(SEND_LAST):
+	send_last:
+		/* Get the number of bytes the message was padded by. */
+		pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+		/* Check for invalid length. */
+		/* XXX LAST len should be >= 1 */
+		if (unlikely(tlen < (hdrsize + pad + 4))) {
+			qp->r_flags |= IPATH_R_REUSE_SGE;
+			dev->n_pkt_drops++;
+			goto done;
+		}
+		/* Don't count the CRC. */
+		tlen -= (hdrsize + pad + 4);
+		wc.byte_len = tlen + qp->r_rcv_len;
+		if (unlikely(wc.byte_len > qp->r_len)) {
+			qp->r_flags |= IPATH_R_REUSE_SGE;
+			dev->n_pkt_drops++;
+			goto done;
+		}
+		wc.opcode = IB_WC_RECV;
+	last_imm:
+		ipath_copy_sge(&qp->r_sge, data, tlen);
+		wc.wr_id = qp->r_wr_id;
+		wc.status = IB_WC_SUCCESS;
+		wc.qp = &qp->ibqp;
+		wc.src_qp = qp->remote_qpn;
+		wc.slid = qp->remote_ah_attr.dlid;
+		wc.sl = qp->remote_ah_attr.sl;
+		/* Signal completion event if the solicited bit is set. */
+		ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
+			       (ohdr->bth[0] &
+				cpu_to_be32(1 << 23)) != 0);
+		break;
+
+	case OP(RDMA_WRITE_FIRST):
+	case OP(RDMA_WRITE_ONLY):
+	case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE): /* consume RWQE */
+	rdma_first:
+		/* RETH comes after BTH */
+		if (!header_in_data)
+			reth = &ohdr->u.rc.reth;
+		else {
+			reth = (struct ib_reth *)data;
+			data += sizeof(*reth);
+		}
+		hdrsize += sizeof(*reth);
+		qp->r_len = be32_to_cpu(reth->length);
+		qp->r_rcv_len = 0;
+		if (qp->r_len != 0) {
+			u32 rkey = be32_to_cpu(reth->rkey);
+			u64 vaddr = be64_to_cpu(reth->vaddr);
+			int ok;
+
+			/* Check rkey */
+			ok = ipath_rkey_ok(qp, &qp->r_sge, qp->r_len,
+					   vaddr, rkey,
+					   IB_ACCESS_REMOTE_WRITE);
+			if (unlikely(!ok)) {
+				dev->n_pkt_drops++;
+				goto done;
+			}
+		} else {
+			qp->r_sge.sg_list = NULL;
+			qp->r_sge.sge.mr = NULL;
+			qp->r_sge.sge.vaddr = NULL;
+			qp->r_sge.sge.length = 0;
+			qp->r_sge.sge.sge_length = 0;
+		}
+		if (unlikely(!(qp->qp_access_flags &
+			       IB_ACCESS_REMOTE_WRITE))) {
+			dev->n_pkt_drops++;
+			goto done;
+		}
+		if (opcode == OP(RDMA_WRITE_ONLY))
+			goto rdma_last;
+		else if (opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE))
+			goto rdma_last_imm;
+		/* FALLTHROUGH */
+	case OP(RDMA_WRITE_MIDDLE):
+		/* Check for invalid length PMTU or posted rwqe len. */
+		if (unlikely(tlen != (hdrsize + pmtu + 4))) {
+			dev->n_pkt_drops++;
+			goto done;
+		}
+		qp->r_rcv_len += pmtu;
+		if (unlikely(qp->r_rcv_len > qp->r_len)) {
+			dev->n_pkt_drops++;
+			goto done;
+		}
+		ipath_copy_sge(&qp->r_sge, data, pmtu);
+		break;
+
+	case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
+	rdma_last_imm:
+		if (header_in_data) {
+			wc.ex.imm_data = *(__be32 *) data;
+			data += sizeof(__be32);
+		} else {
+			/* Immediate data comes after BTH */
+			wc.ex.imm_data = ohdr->u.imm_data;
+		}
+		hdrsize += 4;
+		wc.wc_flags = IB_WC_WITH_IMM;
+
+		/* Get the number of bytes the message was padded by. */
+		pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+		/* Check for invalid length. */
+		/* XXX LAST len should be >= 1 */
+		if (unlikely(tlen < (hdrsize + pad + 4))) {
+			dev->n_pkt_drops++;
+			goto done;
+		}
+		/* Don't count the CRC. */
+		tlen -= (hdrsize + pad + 4);
+		if (unlikely(tlen + qp->r_rcv_len != qp->r_len)) {
+			dev->n_pkt_drops++;
+			goto done;
+		}
+		if (qp->r_flags & IPATH_R_REUSE_SGE)
+			qp->r_flags &= ~IPATH_R_REUSE_SGE;
+		else if (!ipath_get_rwqe(qp, 1)) {
+			dev->n_pkt_drops++;
+			goto done;
+		}
+		wc.byte_len = qp->r_len;
+		wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
+		goto last_imm;
+
+	case OP(RDMA_WRITE_LAST):
+	rdma_last:
+		/* Get the number of bytes the message was padded by. */
+		pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+		/* Check for invalid length. */
+		/* XXX LAST len should be >= 1 */
+		if (unlikely(tlen < (hdrsize + pad + 4))) {
+			dev->n_pkt_drops++;
+			goto done;
+		}
+		/* Don't count the CRC. */
+		tlen -= (hdrsize + pad + 4);
+		if (unlikely(tlen + qp->r_rcv_len != qp->r_len)) {
+			dev->n_pkt_drops++;
+			goto done;
+		}
+		ipath_copy_sge(&qp->r_sge, data, tlen);
+		break;
+
+	default:
+		/* Drop packet for unknown opcodes. */
+		dev->n_pkt_drops++;
+		goto done;
+	}
+	qp->r_psn++;
+	qp->r_state = opcode;
+done:
+	return;
+}
diff --git a/drivers/infiniband/hw/ipath/ipath_ud.c b/drivers/infiniband/hw/ipath/ipath_ud.c
new file mode 100644
index 0000000..e8a2a91
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_ud.c
@@ -0,0 +1,580 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/sched.h>
+#include <rdma/ib_smi.h>
+
+#include "ipath_verbs.h"
+#include "ipath_kernel.h"
+
+/**
+ * ipath_ud_loopback - handle send on loopback QPs
+ * @sqp: the sending QP
+ * @swqe: the send work request
+ *
+ * This is called from ipath_make_ud_req() to forward a WQE addressed
+ * to the same HCA.
+ * Note that the receive interrupt handler may be calling ipath_ud_rcv()
+ * while this is being called.
+ */
+static void ipath_ud_loopback(struct ipath_qp *sqp, struct ipath_swqe *swqe)
+{
+	struct ipath_ibdev *dev = to_idev(sqp->ibqp.device);
+	struct ipath_qp *qp;
+	struct ib_ah_attr *ah_attr;
+	unsigned long flags;
+	struct ipath_rq *rq;
+	struct ipath_srq *srq;
+	struct ipath_sge_state rsge;
+	struct ipath_sge *sge;
+	struct ipath_rwq *wq;
+	struct ipath_rwqe *wqe;
+	void (*handler)(struct ib_event *, void *);
+	struct ib_wc wc;
+	u32 tail;
+	u32 rlen;
+	u32 length;
+
+	qp = ipath_lookup_qpn(&dev->qp_table, swqe->wr.wr.ud.remote_qpn);
+	if (!qp || !(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) {
+		dev->n_pkt_drops++;
+		goto done;
+	}
+
+	/*
+	 * Check that the qkey matches (except for QP0, see 9.6.1.4.1).
+	 * Qkeys with the high order bit set mean use the
+	 * qkey from the QP context instead of the WR (see 10.2.5).
+	 */
+	if (unlikely(qp->ibqp.qp_num &&
+		     ((int) swqe->wr.wr.ud.remote_qkey < 0 ?
+		      sqp->qkey : swqe->wr.wr.ud.remote_qkey) != qp->qkey)) {
+		/* XXX OK to lose a count once in a while. */
+		dev->qkey_violations++;
+		dev->n_pkt_drops++;
+		goto drop;
+	}
+
+	/*
+	 * A GRH is expected to precede the data even if not
+	 * present on the wire.
+	 */
+	length = swqe->length;
+	memset(&wc, 0, sizeof wc);
+	wc.byte_len = length + sizeof(struct ib_grh);
+
+	if (swqe->wr.opcode == IB_WR_SEND_WITH_IMM) {
+		wc.wc_flags = IB_WC_WITH_IMM;
+		wc.ex.imm_data = swqe->wr.ex.imm_data;
+	}
+
+	/*
+	 * This would be a lot simpler if we could call ipath_get_rwqe()
+	 * but that uses state that the receive interrupt handler uses
+	 * so we would need to lock out receive interrupts while doing
+	 * local loopback.
+	 */
+	if (qp->ibqp.srq) {
+		srq = to_isrq(qp->ibqp.srq);
+		handler = srq->ibsrq.event_handler;
+		rq = &srq->rq;
+	} else {
+		srq = NULL;
+		handler = NULL;
+		rq = &qp->r_rq;
+	}
+
+	/*
+	 * Get the next work request entry to find where to put the data.
+	 * Note that it is safe to drop the lock after changing rq->tail
+	 * since ipath_post_receive() won't fill the empty slot.
+	 */
+	spin_lock_irqsave(&rq->lock, flags);
+	wq = rq->wq;
+	tail = wq->tail;
+	/* Validate tail before using it since it is user writable. */
+	if (tail >= rq->size)
+		tail = 0;
+	if (unlikely(tail == wq->head)) {
+		spin_unlock_irqrestore(&rq->lock, flags);
+		dev->n_pkt_drops++;
+		goto drop;
+	}
+	wqe = get_rwqe_ptr(rq, tail);
+	rsge.sg_list = qp->r_ud_sg_list;
+	if (!ipath_init_sge(qp, wqe, &rlen, &rsge)) {
+		spin_unlock_irqrestore(&rq->lock, flags);
+		dev->n_pkt_drops++;
+		goto drop;
+	}
+	/* Silently drop packets which are too big. */
+	if (wc.byte_len > rlen) {
+		spin_unlock_irqrestore(&rq->lock, flags);
+		dev->n_pkt_drops++;
+		goto drop;
+	}
+	if (++tail >= rq->size)
+		tail = 0;
+	wq->tail = tail;
+	wc.wr_id = wqe->wr_id;
+	if (handler) {
+		u32 n;
+
+		/*
+		 * validate head pointer value and compute
+		 * the number of remaining WQEs.
+		 */
+		n = wq->head;
+		if (n >= rq->size)
+			n = 0;
+		if (n < tail)
+			n += rq->size - tail;
+		else
+			n -= tail;
+		if (n < srq->limit) {
+			struct ib_event ev;
+
+			srq->limit = 0;
+			spin_unlock_irqrestore(&rq->lock, flags);
+			ev.device = qp->ibqp.device;
+			ev.element.srq = qp->ibqp.srq;
+			ev.event = IB_EVENT_SRQ_LIMIT_REACHED;
+			handler(&ev, srq->ibsrq.srq_context);
+		} else
+			spin_unlock_irqrestore(&rq->lock, flags);
+	} else
+		spin_unlock_irqrestore(&rq->lock, flags);
+
+	ah_attr = &to_iah(swqe->wr.wr.ud.ah)->attr;
+	if (ah_attr->ah_flags & IB_AH_GRH) {
+		ipath_copy_sge(&rsge, &ah_attr->grh, sizeof(struct ib_grh));
+		wc.wc_flags |= IB_WC_GRH;
+	} else
+		ipath_skip_sge(&rsge, sizeof(struct ib_grh));
+	sge = swqe->sg_list;
+	while (length) {
+		u32 len = sge->length;
+
+		if (len > length)
+			len = length;
+		if (len > sge->sge_length)
+			len = sge->sge_length;
+		BUG_ON(len == 0);
+		ipath_copy_sge(&rsge, sge->vaddr, len);
+		sge->vaddr += len;
+		sge->length -= len;
+		sge->sge_length -= len;
+		if (sge->sge_length == 0) {
+			if (--swqe->wr.num_sge)
+				sge++;
+		} else if (sge->length == 0 && sge->mr != NULL) {
+			if (++sge->n >= IPATH_SEGSZ) {
+				if (++sge->m >= sge->mr->mapsz)
+					break;
+				sge->n = 0;
+			}
+			sge->vaddr =
+				sge->mr->map[sge->m]->segs[sge->n].vaddr;
+			sge->length =
+				sge->mr->map[sge->m]->segs[sge->n].length;
+		}
+		length -= len;
+	}
+	wc.status = IB_WC_SUCCESS;
+	wc.opcode = IB_WC_RECV;
+	wc.qp = &qp->ibqp;
+	wc.src_qp = sqp->ibqp.qp_num;
+	/* XXX do we know which pkey matched? Only needed for GSI. */
+	wc.pkey_index = 0;
+	wc.slid = dev->dd->ipath_lid |
+		(ah_attr->src_path_bits &
+		 ((1 << dev->dd->ipath_lmc) - 1));
+	wc.sl = ah_attr->sl;
+	wc.dlid_path_bits =
+		ah_attr->dlid & ((1 << dev->dd->ipath_lmc) - 1);
+	wc.port_num = 1;
+	/* Signal completion event if the solicited bit is set. */
+	ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
+		       swqe->wr.send_flags & IB_SEND_SOLICITED);
+drop:
+	if (atomic_dec_and_test(&qp->refcount))
+		wake_up(&qp->wait);
+done:;
+}
+
+/**
+ * ipath_make_ud_req - construct a UD request packet
+ * @qp: the QP
+ *
+ * Return 1 if constructed; otherwise, return 0.
+ */
+int ipath_make_ud_req(struct ipath_qp *qp)
+{
+	struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+	struct ipath_other_headers *ohdr;
+	struct ib_ah_attr *ah_attr;
+	struct ipath_swqe *wqe;
+	unsigned long flags;
+	u32 nwords;
+	u32 extra_bytes;
+	u32 bth0;
+	u16 lrh0;
+	u16 lid;
+	int ret = 0;
+	int next_cur;
+
+	spin_lock_irqsave(&qp->s_lock, flags);
+
+	if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_NEXT_SEND_OK)) {
+		if (!(ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND))
+			goto bail;
+		/* We are in the error state, flush the work request. */
+		if (qp->s_last == qp->s_head)
+			goto bail;
+		/* If DMAs are in progress, we can't flush immediately. */
+		if (atomic_read(&qp->s_dma_busy)) {
+			qp->s_flags |= IPATH_S_WAIT_DMA;
+			goto bail;
+		}
+		wqe = get_swqe_ptr(qp, qp->s_last);
+		ipath_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR);
+		goto done;
+	}
+
+	if (qp->s_cur == qp->s_head)
+		goto bail;
+
+	wqe = get_swqe_ptr(qp, qp->s_cur);
+	next_cur = qp->s_cur + 1;
+	if (next_cur >= qp->s_size)
+		next_cur = 0;
+
+	/* Construct the header. */
+	ah_attr = &to_iah(wqe->wr.wr.ud.ah)->attr;
+	if (ah_attr->dlid >= IPATH_MULTICAST_LID_BASE) {
+		if (ah_attr->dlid != IPATH_PERMISSIVE_LID)
+			dev->n_multicast_xmit++;
+		else
+			dev->n_unicast_xmit++;
+	} else {
+		dev->n_unicast_xmit++;
+		lid = ah_attr->dlid & ~((1 << dev->dd->ipath_lmc) - 1);
+		if (unlikely(lid == dev->dd->ipath_lid)) {
+			/*
+			 * If DMAs are in progress, we can't generate
+			 * a completion for the loopback packet since
+			 * it would be out of order.
+			 * XXX Instead of waiting, we could queue a
+			 * zero length descriptor so we get a callback.
+			 */
+			if (atomic_read(&qp->s_dma_busy)) {
+				qp->s_flags |= IPATH_S_WAIT_DMA;
+				goto bail;
+			}
+			qp->s_cur = next_cur;
+			spin_unlock_irqrestore(&qp->s_lock, flags);
+			ipath_ud_loopback(qp, wqe);
+			spin_lock_irqsave(&qp->s_lock, flags);
+			ipath_send_complete(qp, wqe, IB_WC_SUCCESS);
+			goto done;
+		}
+	}
+
+	qp->s_cur = next_cur;
+	extra_bytes = -wqe->length & 3;
+	nwords = (wqe->length + extra_bytes) >> 2;
+
+	/* header size in 32-bit words LRH+BTH+DETH = (8+12+8)/4. */
+	qp->s_hdrwords = 7;
+	qp->s_cur_size = wqe->length;
+	qp->s_cur_sge = &qp->s_sge;
+	qp->s_dmult = ah_attr->static_rate;
+	qp->s_wqe = wqe;
+	qp->s_sge.sge = wqe->sg_list[0];
+	qp->s_sge.sg_list = wqe->sg_list + 1;
+	qp->s_sge.num_sge = wqe->wr.num_sge;
+
+	if (ah_attr->ah_flags & IB_AH_GRH) {
+		/* Header size in 32-bit words. */
+		qp->s_hdrwords += ipath_make_grh(dev, &qp->s_hdr.u.l.grh,
+						 &ah_attr->grh,
+						 qp->s_hdrwords, nwords);
+		lrh0 = IPATH_LRH_GRH;
+		ohdr = &qp->s_hdr.u.l.oth;
+		/*
+		 * Don't worry about sending to locally attached multicast
+		 * QPs.  It is unspecified by the spec. what happens.
+		 */
+	} else {
+		/* Header size in 32-bit words. */
+		lrh0 = IPATH_LRH_BTH;
+		ohdr = &qp->s_hdr.u.oth;
+	}
+	if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) {
+		qp->s_hdrwords++;
+		ohdr->u.ud.imm_data = wqe->wr.ex.imm_data;
+		bth0 = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE << 24;
+	} else
+		bth0 = IB_OPCODE_UD_SEND_ONLY << 24;
+	lrh0 |= ah_attr->sl << 4;
+	if (qp->ibqp.qp_type == IB_QPT_SMI)
+		lrh0 |= 0xF000;	/* Set VL (see ch. 13.5.3.1) */
+	qp->s_hdr.lrh[0] = cpu_to_be16(lrh0);
+	qp->s_hdr.lrh[1] = cpu_to_be16(ah_attr->dlid);	/* DEST LID */
+	qp->s_hdr.lrh[2] = cpu_to_be16(qp->s_hdrwords + nwords +
+					   SIZE_OF_CRC);
+	lid = dev->dd->ipath_lid;
+	if (lid) {
+		lid |= ah_attr->src_path_bits &
+			((1 << dev->dd->ipath_lmc) - 1);
+		qp->s_hdr.lrh[3] = cpu_to_be16(lid);
+	} else
+		qp->s_hdr.lrh[3] = IB_LID_PERMISSIVE;
+	if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+		bth0 |= 1 << 23;
+	bth0 |= extra_bytes << 20;
+	bth0 |= qp->ibqp.qp_type == IB_QPT_SMI ? IPATH_DEFAULT_P_KEY :
+		ipath_get_pkey(dev->dd, qp->s_pkey_index);
+	ohdr->bth[0] = cpu_to_be32(bth0);
+	/*
+	 * Use the multicast QP if the destination LID is a multicast LID.
+	 */
+	ohdr->bth[1] = ah_attr->dlid >= IPATH_MULTICAST_LID_BASE &&
+		ah_attr->dlid != IPATH_PERMISSIVE_LID ?
+		cpu_to_be32(IPATH_MULTICAST_QPN) :
+		cpu_to_be32(wqe->wr.wr.ud.remote_qpn);
+	ohdr->bth[2] = cpu_to_be32(qp->s_next_psn++ & IPATH_PSN_MASK);
+	/*
+	 * Qkeys with the high order bit set mean use the
+	 * qkey from the QP context instead of the WR (see 10.2.5).
+	 */
+	ohdr->u.ud.deth[0] = cpu_to_be32((int)wqe->wr.wr.ud.remote_qkey < 0 ?
+					 qp->qkey : wqe->wr.wr.ud.remote_qkey);
+	ohdr->u.ud.deth[1] = cpu_to_be32(qp->ibqp.qp_num);
+
+done:
+	ret = 1;
+	goto unlock;
+
+bail:
+	qp->s_flags &= ~IPATH_S_BUSY;
+unlock:
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+	return ret;
+}
+
+/**
+ * ipath_ud_rcv - receive an incoming UD packet
+ * @dev: the device the packet came in on
+ * @hdr: the packet header
+ * @has_grh: true if the packet has a GRH
+ * @data: the packet data
+ * @tlen: the packet length
+ * @qp: the QP the packet came on
+ *
+ * This is called from ipath_qp_rcv() to process an incoming UD packet
+ * for the given QP.
+ * Called at interrupt level.
+ */
+void ipath_ud_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
+		  int has_grh, void *data, u32 tlen, struct ipath_qp *qp)
+{
+	struct ipath_other_headers *ohdr;
+	int opcode;
+	u32 hdrsize;
+	u32 pad;
+	struct ib_wc wc;
+	u32 qkey;
+	u32 src_qp;
+	u16 dlid;
+	int header_in_data;
+
+	/* Check for GRH */
+	if (!has_grh) {
+		ohdr = &hdr->u.oth;
+		hdrsize = 8 + 12 + 8;	/* LRH + BTH + DETH */
+		qkey = be32_to_cpu(ohdr->u.ud.deth[0]);
+		src_qp = be32_to_cpu(ohdr->u.ud.deth[1]);
+		header_in_data = 0;
+	} else {
+		ohdr = &hdr->u.l.oth;
+		hdrsize = 8 + 40 + 12 + 8; /* LRH + GRH + BTH + DETH */
+		/*
+		 * The header with GRH is 68 bytes and the core driver sets
+		 * the eager header buffer size to 56 bytes so the last 12
+		 * bytes of the IB header is in the data buffer.
+		 */
+		header_in_data = dev->dd->ipath_rcvhdrentsize == 16;
+		if (header_in_data) {
+			qkey = be32_to_cpu(((__be32 *) data)[1]);
+			src_qp = be32_to_cpu(((__be32 *) data)[2]);
+			data += 12;
+		} else {
+			qkey = be32_to_cpu(ohdr->u.ud.deth[0]);
+			src_qp = be32_to_cpu(ohdr->u.ud.deth[1]);
+		}
+	}
+	src_qp &= IPATH_QPN_MASK;
+
+	/*
+	 * Check that the permissive LID is only used on QP0
+	 * and the QKEY matches (see 9.6.1.4.1 and 9.6.1.5.1).
+	 */
+	if (qp->ibqp.qp_num) {
+		if (unlikely(hdr->lrh[1] == IB_LID_PERMISSIVE ||
+			     hdr->lrh[3] == IB_LID_PERMISSIVE)) {
+			dev->n_pkt_drops++;
+			goto bail;
+		}
+		if (unlikely(qkey != qp->qkey)) {
+			/* XXX OK to lose a count once in a while. */
+			dev->qkey_violations++;
+			dev->n_pkt_drops++;
+			goto bail;
+		}
+	} else if (hdr->lrh[1] == IB_LID_PERMISSIVE ||
+		   hdr->lrh[3] == IB_LID_PERMISSIVE) {
+		struct ib_smp *smp = (struct ib_smp *) data;
+
+		if (smp->mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
+			dev->n_pkt_drops++;
+			goto bail;
+		}
+	}
+
+	/*
+	 * The opcode is in the low byte when its in network order
+	 * (top byte when in host order).
+	 */
+	opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
+	if (qp->ibqp.qp_num > 1 &&
+	    opcode == IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE) {
+		if (header_in_data) {
+			wc.ex.imm_data = *(__be32 *) data;
+			data += sizeof(__be32);
+		} else
+			wc.ex.imm_data = ohdr->u.ud.imm_data;
+		wc.wc_flags = IB_WC_WITH_IMM;
+		hdrsize += sizeof(u32);
+	} else if (opcode == IB_OPCODE_UD_SEND_ONLY) {
+		wc.ex.imm_data = 0;
+		wc.wc_flags = 0;
+	} else {
+		dev->n_pkt_drops++;
+		goto bail;
+	}
+
+	/* Get the number of bytes the message was padded by. */
+	pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+	if (unlikely(tlen < (hdrsize + pad + 4))) {
+		/* Drop incomplete packets. */
+		dev->n_pkt_drops++;
+		goto bail;
+	}
+	tlen -= hdrsize + pad + 4;
+
+	/* Drop invalid MAD packets (see 13.5.3.1). */
+	if (unlikely((qp->ibqp.qp_num == 0 &&
+		      (tlen != 256 ||
+		       (be16_to_cpu(hdr->lrh[0]) >> 12) != 15)) ||
+		     (qp->ibqp.qp_num == 1 &&
+		      (tlen != 256 ||
+		       (be16_to_cpu(hdr->lrh[0]) >> 12) == 15)))) {
+		dev->n_pkt_drops++;
+		goto bail;
+	}
+
+	/*
+	 * A GRH is expected to precede the data even if not
+	 * present on the wire.
+	 */
+	wc.byte_len = tlen + sizeof(struct ib_grh);
+
+	/*
+	 * Get the next work request entry to find where to put the data.
+	 */
+	if (qp->r_flags & IPATH_R_REUSE_SGE)
+		qp->r_flags &= ~IPATH_R_REUSE_SGE;
+	else if (!ipath_get_rwqe(qp, 0)) {
+		/*
+		 * Count VL15 packets dropped due to no receive buffer.
+		 * Otherwise, count them as buffer overruns since usually,
+		 * the HW will be able to receive packets even if there are
+		 * no QPs with posted receive buffers.
+		 */
+		if (qp->ibqp.qp_num == 0)
+			dev->n_vl15_dropped++;
+		else
+			dev->rcv_errors++;
+		goto bail;
+	}
+	/* Silently drop packets which are too big. */
+	if (wc.byte_len > qp->r_len) {
+		qp->r_flags |= IPATH_R_REUSE_SGE;
+		dev->n_pkt_drops++;
+		goto bail;
+	}
+	if (has_grh) {
+		ipath_copy_sge(&qp->r_sge, &hdr->u.l.grh,
+			       sizeof(struct ib_grh));
+		wc.wc_flags |= IB_WC_GRH;
+	} else
+		ipath_skip_sge(&qp->r_sge, sizeof(struct ib_grh));
+	ipath_copy_sge(&qp->r_sge, data,
+		       wc.byte_len - sizeof(struct ib_grh));
+	if (!test_and_clear_bit(IPATH_R_WRID_VALID, &qp->r_aflags))
+		goto bail;
+	wc.wr_id = qp->r_wr_id;
+	wc.status = IB_WC_SUCCESS;
+	wc.opcode = IB_WC_RECV;
+	wc.vendor_err = 0;
+	wc.qp = &qp->ibqp;
+	wc.src_qp = src_qp;
+	/* XXX do we know which pkey matched? Only needed for GSI. */
+	wc.pkey_index = 0;
+	wc.slid = be16_to_cpu(hdr->lrh[3]);
+	wc.sl = (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF;
+	dlid = be16_to_cpu(hdr->lrh[1]);
+	/*
+	 * Save the LMC lower bits if the destination LID is a unicast LID.
+	 */
+	wc.dlid_path_bits = dlid >= IPATH_MULTICAST_LID_BASE ? 0 :
+		dlid & ((1 << dev->dd->ipath_lmc) - 1);
+	wc.port_num = 1;
+	/* Signal completion event if the solicited bit is set. */
+	ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
+		       (ohdr->bth[0] &
+			cpu_to_be32(1 << 23)) != 0);
+
+bail:;
+}
diff --git a/drivers/infiniband/hw/ipath/ipath_user_pages.c b/drivers/infiniband/hw/ipath/ipath_user_pages.c
new file mode 100644
index 0000000..1da1252
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_user_pages.c
@@ -0,0 +1,229 @@
+/*
+ * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mm.h>
+#include <linux/device.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+
+#include "ipath_kernel.h"
+
+static void __ipath_release_user_pages(struct page **p, size_t num_pages,
+				   int dirty)
+{
+	size_t i;
+
+	for (i = 0; i < num_pages; i++) {
+		ipath_cdbg(MM, "%lu/%lu put_page %p\n", (unsigned long) i,
+			   (unsigned long) num_pages, p[i]);
+		if (dirty)
+			set_page_dirty_lock(p[i]);
+		put_page(p[i]);
+	}
+}
+
+/* call with current->mm->mmap_sem held */
+static int __ipath_get_user_pages(unsigned long start_page, size_t num_pages,
+				  struct page **p)
+{
+	unsigned long lock_limit;
+	size_t got;
+	int ret;
+
+	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
+	if (num_pages > lock_limit) {
+		ret = -ENOMEM;
+		goto bail;
+	}
+
+	ipath_cdbg(VERBOSE, "pin %lx pages from vaddr %lx\n",
+		   (unsigned long) num_pages, start_page);
+
+	for (got = 0; got < num_pages; got += ret) {
+		ret = get_user_pages(current, current->mm,
+				     start_page + got * PAGE_SIZE,
+				     num_pages - got, 1, 1,
+				     p + got, NULL);
+		if (ret < 0)
+			goto bail_release;
+	}
+
+	current->mm->pinned_vm += num_pages;
+
+	ret = 0;
+	goto bail;
+
+bail_release:
+	__ipath_release_user_pages(p, got, 0);
+bail:
+	return ret;
+}
+
+/**
+ * ipath_map_page - a safety wrapper around pci_map_page()
+ *
+ * A dma_addr of all 0's is interpreted by the chip as "disabled".
+ * Unfortunately, it can also be a valid dma_addr returned on some
+ * architectures.
+ *
+ * The powerpc iommu assigns dma_addrs in ascending order, so we don't
+ * have to bother with retries or mapping a dummy page to insure we
+ * don't just get the same mapping again.
+ *
+ * I'm sure we won't be so lucky with other iommu's, so FIXME.
+ */
+dma_addr_t ipath_map_page(struct pci_dev *hwdev, struct page *page,
+	unsigned long offset, size_t size, int direction)
+{
+	dma_addr_t phys;
+
+	phys = pci_map_page(hwdev, page, offset, size, direction);
+
+	if (phys == 0) {
+		pci_unmap_page(hwdev, phys, size, direction);
+		phys = pci_map_page(hwdev, page, offset, size, direction);
+		/*
+		 * FIXME: If we get 0 again, we should keep this page,
+		 * map another, then free the 0 page.
+		 */
+	}
+
+	return phys;
+}
+
+/**
+ * ipath_map_single - a safety wrapper around pci_map_single()
+ *
+ * Same idea as ipath_map_page().
+ */
+dma_addr_t ipath_map_single(struct pci_dev *hwdev, void *ptr, size_t size,
+	int direction)
+{
+	dma_addr_t phys;
+
+	phys = pci_map_single(hwdev, ptr, size, direction);
+
+	if (phys == 0) {
+		pci_unmap_single(hwdev, phys, size, direction);
+		phys = pci_map_single(hwdev, ptr, size, direction);
+		/*
+		 * FIXME: If we get 0 again, we should keep this page,
+		 * map another, then free the 0 page.
+		 */
+	}
+
+	return phys;
+}
+
+/**
+ * ipath_get_user_pages - lock user pages into memory
+ * @start_page: the start page
+ * @num_pages: the number of pages
+ * @p: the output page structures
+ *
+ * This function takes a given start page (page aligned user virtual
+ * address) and pins it and the following specified number of pages.  For
+ * now, num_pages is always 1, but that will probably change at some point
+ * (because caller is doing expected sends on a single virtually contiguous
+ * buffer, so we can do all pages at once).
+ */
+int ipath_get_user_pages(unsigned long start_page, size_t num_pages,
+			 struct page **p)
+{
+	int ret;
+
+	down_write(&current->mm->mmap_sem);
+
+	ret = __ipath_get_user_pages(start_page, num_pages, p);
+
+	up_write(&current->mm->mmap_sem);
+
+	return ret;
+}
+
+void ipath_release_user_pages(struct page **p, size_t num_pages)
+{
+	down_write(&current->mm->mmap_sem);
+
+	__ipath_release_user_pages(p, num_pages, 1);
+
+	current->mm->pinned_vm -= num_pages;
+
+	up_write(&current->mm->mmap_sem);
+}
+
+struct ipath_user_pages_work {
+	struct work_struct work;
+	struct mm_struct *mm;
+	unsigned long num_pages;
+};
+
+static void user_pages_account(struct work_struct *_work)
+{
+	struct ipath_user_pages_work *work =
+		container_of(_work, struct ipath_user_pages_work, work);
+
+	down_write(&work->mm->mmap_sem);
+	work->mm->pinned_vm -= work->num_pages;
+	up_write(&work->mm->mmap_sem);
+	mmput(work->mm);
+	kfree(work);
+}
+
+void ipath_release_user_pages_on_close(struct page **p, size_t num_pages)
+{
+	struct ipath_user_pages_work *work;
+	struct mm_struct *mm;
+
+	__ipath_release_user_pages(p, num_pages, 1);
+
+	mm = get_task_mm(current);
+	if (!mm)
+		return;
+
+	work = kmalloc(sizeof(*work), GFP_KERNEL);
+	if (!work)
+		goto bail_mm;
+
+	INIT_WORK(&work->work, user_pages_account);
+	work->mm = mm;
+	work->num_pages = num_pages;
+
+	queue_work(ib_wq, &work->work);
+	return;
+
+bail_mm:
+	mmput(mm);
+	return;
+}
diff --git a/drivers/infiniband/hw/ipath/ipath_user_sdma.c b/drivers/infiniband/hw/ipath/ipath_user_sdma.c
new file mode 100644
index 0000000..cc04b7b
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_user_sdma.c
@@ -0,0 +1,875 @@
+/*
+ * Copyright (c) 2007, 2008 QLogic Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/mm.h>
+#include <linux/types.h>
+#include <linux/device.h>
+#include <linux/dmapool.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/highmem.h>
+#include <linux/io.h>
+#include <linux/uio.h>
+#include <linux/rbtree.h>
+#include <linux/spinlock.h>
+#include <linux/delay.h>
+
+#include "ipath_kernel.h"
+#include "ipath_user_sdma.h"
+
+/* minimum size of header */
+#define IPATH_USER_SDMA_MIN_HEADER_LENGTH	64
+/* expected size of headers (for dma_pool) */
+#define IPATH_USER_SDMA_EXP_HEADER_LENGTH	64
+/* length mask in PBC (lower 11 bits) */
+#define IPATH_PBC_LENGTH_MASK			((1 << 11) - 1)
+
+struct ipath_user_sdma_pkt {
+	u8 naddr;		/* dimension of addr (1..3) ... */
+	u32 counter;		/* sdma pkts queued counter for this entry */
+	u64 added;		/* global descq number of entries */
+
+	struct {
+		u32 offset;			/* offset for kvaddr, addr */
+		u32 length;			/* length in page */
+		u8  put_page;			/* should we put_page? */
+		u8  dma_mapped;			/* is page dma_mapped? */
+		struct page *page;		/* may be NULL (coherent mem) */
+		void *kvaddr;			/* FIXME: only for pio hack */
+		dma_addr_t addr;
+	} addr[4];   /* max pages, any more and we coalesce */
+	struct list_head list;	/* list element */
+};
+
+struct ipath_user_sdma_queue {
+	/*
+	 * pkts sent to dma engine are queued on this
+	 * list head.  the type of the elements of this
+	 * list are struct ipath_user_sdma_pkt...
+	 */
+	struct list_head sent;
+
+	/* headers with expected length are allocated from here... */
+	char header_cache_name[64];
+	struct dma_pool *header_cache;
+
+	/* packets are allocated from the slab cache... */
+	char pkt_slab_name[64];
+	struct kmem_cache *pkt_slab;
+
+	/* as packets go on the queued queue, they are counted... */
+	u32 counter;
+	u32 sent_counter;
+
+	/* dma page table */
+	struct rb_root dma_pages_root;
+
+	/* protect everything above... */
+	struct mutex lock;
+};
+
+struct ipath_user_sdma_queue *
+ipath_user_sdma_queue_create(struct device *dev, int unit, int port, int sport)
+{
+	struct ipath_user_sdma_queue *pq =
+		kmalloc(sizeof(struct ipath_user_sdma_queue), GFP_KERNEL);
+
+	if (!pq)
+		goto done;
+
+	pq->counter = 0;
+	pq->sent_counter = 0;
+	INIT_LIST_HEAD(&pq->sent);
+
+	mutex_init(&pq->lock);
+
+	snprintf(pq->pkt_slab_name, sizeof(pq->pkt_slab_name),
+		 "ipath-user-sdma-pkts-%u-%02u.%02u", unit, port, sport);
+	pq->pkt_slab = kmem_cache_create(pq->pkt_slab_name,
+					 sizeof(struct ipath_user_sdma_pkt),
+					 0, 0, NULL);
+
+	if (!pq->pkt_slab)
+		goto err_kfree;
+
+	snprintf(pq->header_cache_name, sizeof(pq->header_cache_name),
+		 "ipath-user-sdma-headers-%u-%02u.%02u", unit, port, sport);
+	pq->header_cache = dma_pool_create(pq->header_cache_name,
+					   dev,
+					   IPATH_USER_SDMA_EXP_HEADER_LENGTH,
+					   4, 0);
+	if (!pq->header_cache)
+		goto err_slab;
+
+	pq->dma_pages_root = RB_ROOT;
+
+	goto done;
+
+err_slab:
+	kmem_cache_destroy(pq->pkt_slab);
+err_kfree:
+	kfree(pq);
+	pq = NULL;
+
+done:
+	return pq;
+}
+
+static void ipath_user_sdma_init_frag(struct ipath_user_sdma_pkt *pkt,
+				      int i, size_t offset, size_t len,
+				      int put_page, int dma_mapped,
+				      struct page *page,
+				      void *kvaddr, dma_addr_t dma_addr)
+{
+	pkt->addr[i].offset = offset;
+	pkt->addr[i].length = len;
+	pkt->addr[i].put_page = put_page;
+	pkt->addr[i].dma_mapped = dma_mapped;
+	pkt->addr[i].page = page;
+	pkt->addr[i].kvaddr = kvaddr;
+	pkt->addr[i].addr = dma_addr;
+}
+
+static void ipath_user_sdma_init_header(struct ipath_user_sdma_pkt *pkt,
+					u32 counter, size_t offset,
+					size_t len, int dma_mapped,
+					struct page *page,
+					void *kvaddr, dma_addr_t dma_addr)
+{
+	pkt->naddr = 1;
+	pkt->counter = counter;
+	ipath_user_sdma_init_frag(pkt, 0, offset, len, 0, dma_mapped, page,
+				  kvaddr, dma_addr);
+}
+
+/* we've too many pages in the iovec, coalesce to a single page */
+static int ipath_user_sdma_coalesce(const struct ipath_devdata *dd,
+				    struct ipath_user_sdma_pkt *pkt,
+				    const struct iovec *iov,
+				    unsigned long niov) {
+	int ret = 0;
+	struct page *page = alloc_page(GFP_KERNEL);
+	void *mpage_save;
+	char *mpage;
+	int i;
+	int len = 0;
+	dma_addr_t dma_addr;
+
+	if (!page) {
+		ret = -ENOMEM;
+		goto done;
+	}
+
+	mpage = kmap(page);
+	mpage_save = mpage;
+	for (i = 0; i < niov; i++) {
+		int cfur;
+
+		cfur = copy_from_user(mpage,
+				      iov[i].iov_base, iov[i].iov_len);
+		if (cfur) {
+			ret = -EFAULT;
+			goto free_unmap;
+		}
+
+		mpage += iov[i].iov_len;
+		len += iov[i].iov_len;
+	}
+
+	dma_addr = dma_map_page(&dd->pcidev->dev, page, 0, len,
+				DMA_TO_DEVICE);
+	if (dma_mapping_error(&dd->pcidev->dev, dma_addr)) {
+		ret = -ENOMEM;
+		goto free_unmap;
+	}
+
+	ipath_user_sdma_init_frag(pkt, 1, 0, len, 0, 1, page, mpage_save,
+				  dma_addr);
+	pkt->naddr = 2;
+
+	goto done;
+
+free_unmap:
+	kunmap(page);
+	__free_page(page);
+done:
+	return ret;
+}
+
+/* how many pages in this iovec element? */
+static int ipath_user_sdma_num_pages(const struct iovec *iov)
+{
+	const unsigned long addr  = (unsigned long) iov->iov_base;
+	const unsigned long  len  = iov->iov_len;
+	const unsigned long spage = addr & PAGE_MASK;
+	const unsigned long epage = (addr + len - 1) & PAGE_MASK;
+
+	return 1 + ((epage - spage) >> PAGE_SHIFT);
+}
+
+/* truncate length to page boundary */
+static int ipath_user_sdma_page_length(unsigned long addr, unsigned long len)
+{
+	const unsigned long offset = addr & ~PAGE_MASK;
+
+	return ((offset + len) > PAGE_SIZE) ? (PAGE_SIZE - offset) : len;
+}
+
+static void ipath_user_sdma_free_pkt_frag(struct device *dev,
+					  struct ipath_user_sdma_queue *pq,
+					  struct ipath_user_sdma_pkt *pkt,
+					  int frag)
+{
+	const int i = frag;
+
+	if (pkt->addr[i].page) {
+		if (pkt->addr[i].dma_mapped)
+			dma_unmap_page(dev,
+				       pkt->addr[i].addr,
+				       pkt->addr[i].length,
+				       DMA_TO_DEVICE);
+
+		if (pkt->addr[i].kvaddr)
+			kunmap(pkt->addr[i].page);
+
+		if (pkt->addr[i].put_page)
+			put_page(pkt->addr[i].page);
+		else
+			__free_page(pkt->addr[i].page);
+	} else if (pkt->addr[i].kvaddr)
+		/* free coherent mem from cache... */
+		dma_pool_free(pq->header_cache,
+			      pkt->addr[i].kvaddr, pkt->addr[i].addr);
+}
+
+/* return number of pages pinned... */
+static int ipath_user_sdma_pin_pages(const struct ipath_devdata *dd,
+				     struct ipath_user_sdma_pkt *pkt,
+				     unsigned long addr, int tlen, int npages)
+{
+	struct page *pages[2];
+	int j;
+	int ret;
+
+	ret = get_user_pages_fast(addr, npages, 0, pages);
+	if (ret != npages) {
+		int i;
+
+		for (i = 0; i < ret; i++)
+			put_page(pages[i]);
+
+		ret = -ENOMEM;
+		goto done;
+	}
+
+	for (j = 0; j < npages; j++) {
+		/* map the pages... */
+		const int flen =
+			ipath_user_sdma_page_length(addr, tlen);
+		dma_addr_t dma_addr =
+			dma_map_page(&dd->pcidev->dev,
+				     pages[j], 0, flen, DMA_TO_DEVICE);
+		unsigned long fofs = addr & ~PAGE_MASK;
+
+		if (dma_mapping_error(&dd->pcidev->dev, dma_addr)) {
+			ret = -ENOMEM;
+			goto done;
+		}
+
+		ipath_user_sdma_init_frag(pkt, pkt->naddr, fofs, flen, 1, 1,
+					  pages[j], kmap(pages[j]),
+					  dma_addr);
+
+		pkt->naddr++;
+		addr += flen;
+		tlen -= flen;
+	}
+
+done:
+	return ret;
+}
+
+static int ipath_user_sdma_pin_pkt(const struct ipath_devdata *dd,
+				   struct ipath_user_sdma_queue *pq,
+				   struct ipath_user_sdma_pkt *pkt,
+				   const struct iovec *iov,
+				   unsigned long niov)
+{
+	int ret = 0;
+	unsigned long idx;
+
+	for (idx = 0; idx < niov; idx++) {
+		const int npages = ipath_user_sdma_num_pages(iov + idx);
+		const unsigned long addr = (unsigned long) iov[idx].iov_base;
+
+		ret = ipath_user_sdma_pin_pages(dd, pkt,
+						addr, iov[idx].iov_len,
+						npages);
+		if (ret < 0)
+			goto free_pkt;
+	}
+
+	goto done;
+
+free_pkt:
+	for (idx = 0; idx < pkt->naddr; idx++)
+		ipath_user_sdma_free_pkt_frag(&dd->pcidev->dev, pq, pkt, idx);
+
+done:
+	return ret;
+}
+
+static int ipath_user_sdma_init_payload(const struct ipath_devdata *dd,
+					struct ipath_user_sdma_queue *pq,
+					struct ipath_user_sdma_pkt *pkt,
+					const struct iovec *iov,
+					unsigned long niov, int npages)
+{
+	int ret = 0;
+
+	if (npages >= ARRAY_SIZE(pkt->addr))
+		ret = ipath_user_sdma_coalesce(dd, pkt, iov, niov);
+	else
+		ret = ipath_user_sdma_pin_pkt(dd, pq, pkt, iov, niov);
+
+	return ret;
+}
+
+/* free a packet list -- return counter value of last packet */
+static void ipath_user_sdma_free_pkt_list(struct device *dev,
+					  struct ipath_user_sdma_queue *pq,
+					  struct list_head *list)
+{
+	struct ipath_user_sdma_pkt *pkt, *pkt_next;
+
+	list_for_each_entry_safe(pkt, pkt_next, list, list) {
+		int i;
+
+		for (i = 0; i < pkt->naddr; i++)
+			ipath_user_sdma_free_pkt_frag(dev, pq, pkt, i);
+
+		kmem_cache_free(pq->pkt_slab, pkt);
+	}
+}
+
+/*
+ * copy headers, coalesce etc -- pq->lock must be held
+ *
+ * we queue all the packets to list, returning the
+ * number of bytes total.  list must be empty initially,
+ * as, if there is an error we clean it...
+ */
+static int ipath_user_sdma_queue_pkts(const struct ipath_devdata *dd,
+				      struct ipath_user_sdma_queue *pq,
+				      struct list_head *list,
+				      const struct iovec *iov,
+				      unsigned long niov,
+				      int maxpkts)
+{
+	unsigned long idx = 0;
+	int ret = 0;
+	int npkts = 0;
+	struct page *page = NULL;
+	__le32 *pbc;
+	dma_addr_t dma_addr;
+	struct ipath_user_sdma_pkt *pkt = NULL;
+	size_t len;
+	size_t nw;
+	u32 counter = pq->counter;
+	int dma_mapped = 0;
+
+	while (idx < niov && npkts < maxpkts) {
+		const unsigned long addr = (unsigned long) iov[idx].iov_base;
+		const unsigned long idx_save = idx;
+		unsigned pktnw;
+		unsigned pktnwc;
+		int nfrags = 0;
+		int npages = 0;
+		int cfur;
+
+		dma_mapped = 0;
+		len = iov[idx].iov_len;
+		nw = len >> 2;
+		page = NULL;
+
+		pkt = kmem_cache_alloc(pq->pkt_slab, GFP_KERNEL);
+		if (!pkt) {
+			ret = -ENOMEM;
+			goto free_list;
+		}
+
+		if (len < IPATH_USER_SDMA_MIN_HEADER_LENGTH ||
+		    len > PAGE_SIZE || len & 3 || addr & 3) {
+			ret = -EINVAL;
+			goto free_pkt;
+		}
+
+		if (len == IPATH_USER_SDMA_EXP_HEADER_LENGTH)
+			pbc = dma_pool_alloc(pq->header_cache, GFP_KERNEL,
+					     &dma_addr);
+		else
+			pbc = NULL;
+
+		if (!pbc) {
+			page = alloc_page(GFP_KERNEL);
+			if (!page) {
+				ret = -ENOMEM;
+				goto free_pkt;
+			}
+			pbc = kmap(page);
+		}
+
+		cfur = copy_from_user(pbc, iov[idx].iov_base, len);
+		if (cfur) {
+			ret = -EFAULT;
+			goto free_pbc;
+		}
+
+		/*
+		 * this assignment is a bit strange.  it's because the
+		 * the pbc counts the number of 32 bit words in the full
+		 * packet _except_ the first word of the pbc itself...
+		 */
+		pktnwc = nw - 1;
+
+		/*
+		 * pktnw computation yields the number of 32 bit words
+		 * that the caller has indicated in the PBC.  note that
+		 * this is one less than the total number of words that
+		 * goes to the send DMA engine as the first 32 bit word
+		 * of the PBC itself is not counted.  Armed with this count,
+		 * we can verify that the packet is consistent with the
+		 * iovec lengths.
+		 */
+		pktnw = le32_to_cpu(*pbc) & IPATH_PBC_LENGTH_MASK;
+		if (pktnw < pktnwc || pktnw > pktnwc + (PAGE_SIZE >> 2)) {
+			ret = -EINVAL;
+			goto free_pbc;
+		}
+
+
+		idx++;
+		while (pktnwc < pktnw && idx < niov) {
+			const size_t slen = iov[idx].iov_len;
+			const unsigned long faddr =
+				(unsigned long) iov[idx].iov_base;
+
+			if (slen & 3 || faddr & 3 || !slen ||
+			    slen > PAGE_SIZE) {
+				ret = -EINVAL;
+				goto free_pbc;
+			}
+
+			npages++;
+			if ((faddr & PAGE_MASK) !=
+			    ((faddr + slen - 1) & PAGE_MASK))
+				npages++;
+
+			pktnwc += slen >> 2;
+			idx++;
+			nfrags++;
+		}
+
+		if (pktnwc != pktnw) {
+			ret = -EINVAL;
+			goto free_pbc;
+		}
+
+		if (page) {
+			dma_addr = dma_map_page(&dd->pcidev->dev,
+						page, 0, len, DMA_TO_DEVICE);
+			if (dma_mapping_error(&dd->pcidev->dev, dma_addr)) {
+				ret = -ENOMEM;
+				goto free_pbc;
+			}
+
+			dma_mapped = 1;
+		}
+
+		ipath_user_sdma_init_header(pkt, counter, 0, len, dma_mapped,
+					    page, pbc, dma_addr);
+
+		if (nfrags) {
+			ret = ipath_user_sdma_init_payload(dd, pq, pkt,
+							   iov + idx_save + 1,
+							   nfrags, npages);
+			if (ret < 0)
+				goto free_pbc_dma;
+		}
+
+		counter++;
+		npkts++;
+
+		list_add_tail(&pkt->list, list);
+	}
+
+	ret = idx;
+	goto done;
+
+free_pbc_dma:
+	if (dma_mapped)
+		dma_unmap_page(&dd->pcidev->dev, dma_addr, len, DMA_TO_DEVICE);
+free_pbc:
+	if (page) {
+		kunmap(page);
+		__free_page(page);
+	} else
+		dma_pool_free(pq->header_cache, pbc, dma_addr);
+free_pkt:
+	kmem_cache_free(pq->pkt_slab, pkt);
+free_list:
+	ipath_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, list);
+done:
+	return ret;
+}
+
+static void ipath_user_sdma_set_complete_counter(struct ipath_user_sdma_queue *pq,
+						 u32 c)
+{
+	pq->sent_counter = c;
+}
+
+/* try to clean out queue -- needs pq->lock */
+static int ipath_user_sdma_queue_clean(const struct ipath_devdata *dd,
+				       struct ipath_user_sdma_queue *pq)
+{
+	struct list_head free_list;
+	struct ipath_user_sdma_pkt *pkt;
+	struct ipath_user_sdma_pkt *pkt_prev;
+	int ret = 0;
+
+	INIT_LIST_HEAD(&free_list);
+
+	list_for_each_entry_safe(pkt, pkt_prev, &pq->sent, list) {
+		s64 descd = dd->ipath_sdma_descq_removed - pkt->added;
+
+		if (descd < 0)
+			break;
+
+		list_move_tail(&pkt->list, &free_list);
+
+		/* one more packet cleaned */
+		ret++;
+	}
+
+	if (!list_empty(&free_list)) {
+		u32 counter;
+
+		pkt = list_entry(free_list.prev,
+				 struct ipath_user_sdma_pkt, list);
+		counter = pkt->counter;
+
+		ipath_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, &free_list);
+		ipath_user_sdma_set_complete_counter(pq, counter);
+	}
+
+	return ret;
+}
+
+void ipath_user_sdma_queue_destroy(struct ipath_user_sdma_queue *pq)
+{
+	if (!pq)
+		return;
+
+	kmem_cache_destroy(pq->pkt_slab);
+	dma_pool_destroy(pq->header_cache);
+	kfree(pq);
+}
+
+/* clean descriptor queue, returns > 0 if some elements cleaned */
+static int ipath_user_sdma_hwqueue_clean(struct ipath_devdata *dd)
+{
+	int ret;
+	unsigned long flags;
+
+	spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
+	ret = ipath_sdma_make_progress(dd);
+	spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+
+	return ret;
+}
+
+/* we're in close, drain packets so that we can cleanup successfully... */
+void ipath_user_sdma_queue_drain(struct ipath_devdata *dd,
+				 struct ipath_user_sdma_queue *pq)
+{
+	int i;
+
+	if (!pq)
+		return;
+
+	for (i = 0; i < 100; i++) {
+		mutex_lock(&pq->lock);
+		if (list_empty(&pq->sent)) {
+			mutex_unlock(&pq->lock);
+			break;
+		}
+		ipath_user_sdma_hwqueue_clean(dd);
+		ipath_user_sdma_queue_clean(dd, pq);
+		mutex_unlock(&pq->lock);
+		msleep(10);
+	}
+
+	if (!list_empty(&pq->sent)) {
+		struct list_head free_list;
+
+		printk(KERN_INFO "drain: lists not empty: forcing!\n");
+		INIT_LIST_HEAD(&free_list);
+		mutex_lock(&pq->lock);
+		list_splice_init(&pq->sent, &free_list);
+		ipath_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, &free_list);
+		mutex_unlock(&pq->lock);
+	}
+}
+
+static inline __le64 ipath_sdma_make_desc0(struct ipath_devdata *dd,
+					   u64 addr, u64 dwlen, u64 dwoffset)
+{
+	return cpu_to_le64(/* SDmaPhyAddr[31:0] */
+			   ((addr & 0xfffffffcULL) << 32) |
+			   /* SDmaGeneration[1:0] */
+			   ((dd->ipath_sdma_generation & 3ULL) << 30) |
+			   /* SDmaDwordCount[10:0] */
+			   ((dwlen & 0x7ffULL) << 16) |
+			   /* SDmaBufOffset[12:2] */
+			   (dwoffset & 0x7ffULL));
+}
+
+static inline __le64 ipath_sdma_make_first_desc0(__le64 descq)
+{
+	return descq | cpu_to_le64(1ULL << 12);
+}
+
+static inline __le64 ipath_sdma_make_last_desc0(__le64 descq)
+{
+					      /* last */  /* dma head */
+	return descq | cpu_to_le64(1ULL << 11 | 1ULL << 13);
+}
+
+static inline __le64 ipath_sdma_make_desc1(u64 addr)
+{
+	/* SDmaPhyAddr[47:32] */
+	return cpu_to_le64(addr >> 32);
+}
+
+static void ipath_user_sdma_send_frag(struct ipath_devdata *dd,
+				      struct ipath_user_sdma_pkt *pkt, int idx,
+				      unsigned ofs, u16 tail)
+{
+	const u64 addr = (u64) pkt->addr[idx].addr +
+		(u64) pkt->addr[idx].offset;
+	const u64 dwlen = (u64) pkt->addr[idx].length / 4;
+	__le64 *descqp;
+	__le64 descq0;
+
+	descqp = &dd->ipath_sdma_descq[tail].qw[0];
+
+	descq0 = ipath_sdma_make_desc0(dd, addr, dwlen, ofs);
+	if (idx == 0)
+		descq0 = ipath_sdma_make_first_desc0(descq0);
+	if (idx == pkt->naddr - 1)
+		descq0 = ipath_sdma_make_last_desc0(descq0);
+
+	descqp[0] = descq0;
+	descqp[1] = ipath_sdma_make_desc1(addr);
+}
+
+/* pq->lock must be held, get packets on the wire... */
+static int ipath_user_sdma_push_pkts(struct ipath_devdata *dd,
+				     struct ipath_user_sdma_queue *pq,
+				     struct list_head *pktlist)
+{
+	int ret = 0;
+	unsigned long flags;
+	u16 tail;
+
+	if (list_empty(pktlist))
+		return 0;
+
+	if (unlikely(!(dd->ipath_flags & IPATH_LINKACTIVE)))
+		return -ECOMM;
+
+	spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
+
+	if (unlikely(dd->ipath_sdma_status & IPATH_SDMA_ABORT_MASK)) {
+		ret = -ECOMM;
+		goto unlock;
+	}
+
+	tail = dd->ipath_sdma_descq_tail;
+	while (!list_empty(pktlist)) {
+		struct ipath_user_sdma_pkt *pkt =
+			list_entry(pktlist->next, struct ipath_user_sdma_pkt,
+				   list);
+		int i;
+		unsigned ofs = 0;
+		u16 dtail = tail;
+
+		if (pkt->naddr > ipath_sdma_descq_freecnt(dd))
+			goto unlock_check_tail;
+
+		for (i = 0; i < pkt->naddr; i++) {
+			ipath_user_sdma_send_frag(dd, pkt, i, ofs, tail);
+			ofs += pkt->addr[i].length >> 2;
+
+			if (++tail == dd->ipath_sdma_descq_cnt) {
+				tail = 0;
+				++dd->ipath_sdma_generation;
+			}
+		}
+
+		if ((ofs<<2) > dd->ipath_ibmaxlen) {
+			ipath_dbg("packet size %X > ibmax %X, fail\n",
+				ofs<<2, dd->ipath_ibmaxlen);
+			ret = -EMSGSIZE;
+			goto unlock;
+		}
+
+		/*
+		 * if the packet is >= 2KB mtu equivalent, we have to use
+		 * the large buffers, and have to mark each descriptor as
+		 * part of a large buffer packet.
+		 */
+		if (ofs >= IPATH_SMALLBUF_DWORDS) {
+			for (i = 0; i < pkt->naddr; i++) {
+				dd->ipath_sdma_descq[dtail].qw[0] |=
+					cpu_to_le64(1ULL << 14);
+				if (++dtail == dd->ipath_sdma_descq_cnt)
+					dtail = 0;
+			}
+		}
+
+		dd->ipath_sdma_descq_added += pkt->naddr;
+		pkt->added = dd->ipath_sdma_descq_added;
+		list_move_tail(&pkt->list, &pq->sent);
+		ret++;
+	}
+
+unlock_check_tail:
+	/* advance the tail on the chip if necessary */
+	if (dd->ipath_sdma_descq_tail != tail) {
+		wmb();
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmatail, tail);
+		dd->ipath_sdma_descq_tail = tail;
+	}
+
+unlock:
+	spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+
+	return ret;
+}
+
+int ipath_user_sdma_writev(struct ipath_devdata *dd,
+			   struct ipath_user_sdma_queue *pq,
+			   const struct iovec *iov,
+			   unsigned long dim)
+{
+	int ret = 0;
+	struct list_head list;
+	int npkts = 0;
+
+	INIT_LIST_HEAD(&list);
+
+	mutex_lock(&pq->lock);
+
+	if (dd->ipath_sdma_descq_added != dd->ipath_sdma_descq_removed) {
+		ipath_user_sdma_hwqueue_clean(dd);
+		ipath_user_sdma_queue_clean(dd, pq);
+	}
+
+	while (dim) {
+		const int mxp = 8;
+
+		ret = ipath_user_sdma_queue_pkts(dd, pq, &list, iov, dim, mxp);
+		if (ret <= 0)
+			goto done_unlock;
+		else {
+			dim -= ret;
+			iov += ret;
+		}
+
+		/* force packets onto the sdma hw queue... */
+		if (!list_empty(&list)) {
+			/*
+			 * lazily clean hw queue.  the 4 is a guess of about
+			 * how many sdma descriptors a packet will take (it
+			 * doesn't have to be perfect).
+			 */
+			if (ipath_sdma_descq_freecnt(dd) < ret * 4) {
+				ipath_user_sdma_hwqueue_clean(dd);
+				ipath_user_sdma_queue_clean(dd, pq);
+			}
+
+			ret = ipath_user_sdma_push_pkts(dd, pq, &list);
+			if (ret < 0)
+				goto done_unlock;
+			else {
+				npkts += ret;
+				pq->counter += ret;
+
+				if (!list_empty(&list))
+					goto done_unlock;
+			}
+		}
+	}
+
+done_unlock:
+	if (!list_empty(&list))
+		ipath_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, &list);
+	mutex_unlock(&pq->lock);
+
+	return (ret < 0) ? ret : npkts;
+}
+
+int ipath_user_sdma_make_progress(struct ipath_devdata *dd,
+				  struct ipath_user_sdma_queue *pq)
+{
+	int ret = 0;
+
+	mutex_lock(&pq->lock);
+	ipath_user_sdma_hwqueue_clean(dd);
+	ret = ipath_user_sdma_queue_clean(dd, pq);
+	mutex_unlock(&pq->lock);
+
+	return ret;
+}
+
+u32 ipath_user_sdma_complete_counter(const struct ipath_user_sdma_queue *pq)
+{
+	return pq->sent_counter;
+}
+
+u32 ipath_user_sdma_inflight_counter(struct ipath_user_sdma_queue *pq)
+{
+	return pq->counter;
+}
+
diff --git a/drivers/infiniband/hw/ipath/ipath_user_sdma.h b/drivers/infiniband/hw/ipath/ipath_user_sdma.h
new file mode 100644
index 0000000..fc76316
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_user_sdma.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2007, 2008 QLogic Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/device.h>
+
+struct ipath_user_sdma_queue;
+
+struct ipath_user_sdma_queue *
+ipath_user_sdma_queue_create(struct device *dev, int unit, int port, int sport);
+void ipath_user_sdma_queue_destroy(struct ipath_user_sdma_queue *pq);
+
+int ipath_user_sdma_writev(struct ipath_devdata *dd,
+			   struct ipath_user_sdma_queue *pq,
+			   const struct iovec *iov,
+			   unsigned long dim);
+
+int ipath_user_sdma_make_progress(struct ipath_devdata *dd,
+				  struct ipath_user_sdma_queue *pq);
+
+void ipath_user_sdma_queue_drain(struct ipath_devdata *dd,
+				 struct ipath_user_sdma_queue *pq);
+
+u32 ipath_user_sdma_complete_counter(const struct ipath_user_sdma_queue *pq);
+u32 ipath_user_sdma_inflight_counter(struct ipath_user_sdma_queue *pq);
diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.c b/drivers/infiniband/hw/ipath/ipath_verbs.c
new file mode 100644
index 0000000..44ea939
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_verbs.c
@@ -0,0 +1,2342 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <rdma/ib_mad.h>
+#include <rdma/ib_user_verbs.h>
+#include <linux/io.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/utsname.h>
+#include <linux/rculist.h>
+
+#include "ipath_kernel.h"
+#include "ipath_verbs.h"
+#include "ipath_common.h"
+
+static unsigned int ib_ipath_qp_table_size = 251;
+module_param_named(qp_table_size, ib_ipath_qp_table_size, uint, S_IRUGO);
+MODULE_PARM_DESC(qp_table_size, "QP table size");
+
+unsigned int ib_ipath_lkey_table_size = 12;
+module_param_named(lkey_table_size, ib_ipath_lkey_table_size, uint,
+		   S_IRUGO);
+MODULE_PARM_DESC(lkey_table_size,
+		 "LKEY table size in bits (2^n, 1 <= n <= 23)");
+
+static unsigned int ib_ipath_max_pds = 0xFFFF;
+module_param_named(max_pds, ib_ipath_max_pds, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(max_pds,
+		 "Maximum number of protection domains to support");
+
+static unsigned int ib_ipath_max_ahs = 0xFFFF;
+module_param_named(max_ahs, ib_ipath_max_ahs, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(max_ahs, "Maximum number of address handles to support");
+
+unsigned int ib_ipath_max_cqes = 0x2FFFF;
+module_param_named(max_cqes, ib_ipath_max_cqes, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(max_cqes,
+		 "Maximum number of completion queue entries to support");
+
+unsigned int ib_ipath_max_cqs = 0x1FFFF;
+module_param_named(max_cqs, ib_ipath_max_cqs, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(max_cqs, "Maximum number of completion queues to support");
+
+unsigned int ib_ipath_max_qp_wrs = 0x3FFF;
+module_param_named(max_qp_wrs, ib_ipath_max_qp_wrs, uint,
+		   S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(max_qp_wrs, "Maximum number of QP WRs to support");
+
+unsigned int ib_ipath_max_qps = 16384;
+module_param_named(max_qps, ib_ipath_max_qps, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(max_qps, "Maximum number of QPs to support");
+
+unsigned int ib_ipath_max_sges = 0x60;
+module_param_named(max_sges, ib_ipath_max_sges, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(max_sges, "Maximum number of SGEs to support");
+
+unsigned int ib_ipath_max_mcast_grps = 16384;
+module_param_named(max_mcast_grps, ib_ipath_max_mcast_grps, uint,
+		   S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(max_mcast_grps,
+		 "Maximum number of multicast groups to support");
+
+unsigned int ib_ipath_max_mcast_qp_attached = 16;
+module_param_named(max_mcast_qp_attached, ib_ipath_max_mcast_qp_attached,
+		   uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(max_mcast_qp_attached,
+		 "Maximum number of attached QPs to support");
+
+unsigned int ib_ipath_max_srqs = 1024;
+module_param_named(max_srqs, ib_ipath_max_srqs, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(max_srqs, "Maximum number of SRQs to support");
+
+unsigned int ib_ipath_max_srq_sges = 128;
+module_param_named(max_srq_sges, ib_ipath_max_srq_sges,
+		   uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(max_srq_sges, "Maximum number of SRQ SGEs to support");
+
+unsigned int ib_ipath_max_srq_wrs = 0x1FFFF;
+module_param_named(max_srq_wrs, ib_ipath_max_srq_wrs,
+		   uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(max_srq_wrs, "Maximum number of SRQ WRs support");
+
+static unsigned int ib_ipath_disable_sma;
+module_param_named(disable_sma, ib_ipath_disable_sma, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(disable_sma, "Disable the SMA");
+
+/*
+ * Note that it is OK to post send work requests in the SQE and ERR
+ * states; ipath_do_send() will process them and generate error
+ * completions as per IB 1.2 C10-96.
+ */
+const int ib_ipath_state_ops[IB_QPS_ERR + 1] = {
+	[IB_QPS_RESET] = 0,
+	[IB_QPS_INIT] = IPATH_POST_RECV_OK,
+	[IB_QPS_RTR] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK,
+	[IB_QPS_RTS] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK |
+	    IPATH_POST_SEND_OK | IPATH_PROCESS_SEND_OK |
+	    IPATH_PROCESS_NEXT_SEND_OK,
+	[IB_QPS_SQD] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK |
+	    IPATH_POST_SEND_OK | IPATH_PROCESS_SEND_OK,
+	[IB_QPS_SQE] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK |
+	    IPATH_POST_SEND_OK | IPATH_FLUSH_SEND,
+	[IB_QPS_ERR] = IPATH_POST_RECV_OK | IPATH_FLUSH_RECV |
+	    IPATH_POST_SEND_OK | IPATH_FLUSH_SEND,
+};
+
+struct ipath_ucontext {
+	struct ib_ucontext ibucontext;
+};
+
+static inline struct ipath_ucontext *to_iucontext(struct ib_ucontext
+						  *ibucontext)
+{
+	return container_of(ibucontext, struct ipath_ucontext, ibucontext);
+}
+
+/*
+ * Translate ib_wr_opcode into ib_wc_opcode.
+ */
+const enum ib_wc_opcode ib_ipath_wc_opcode[] = {
+	[IB_WR_RDMA_WRITE] = IB_WC_RDMA_WRITE,
+	[IB_WR_RDMA_WRITE_WITH_IMM] = IB_WC_RDMA_WRITE,
+	[IB_WR_SEND] = IB_WC_SEND,
+	[IB_WR_SEND_WITH_IMM] = IB_WC_SEND,
+	[IB_WR_RDMA_READ] = IB_WC_RDMA_READ,
+	[IB_WR_ATOMIC_CMP_AND_SWP] = IB_WC_COMP_SWAP,
+	[IB_WR_ATOMIC_FETCH_AND_ADD] = IB_WC_FETCH_ADD
+};
+
+/*
+ * System image GUID.
+ */
+static __be64 sys_image_guid;
+
+/**
+ * ipath_copy_sge - copy data to SGE memory
+ * @ss: the SGE state
+ * @data: the data to copy
+ * @length: the length of the data
+ */
+void ipath_copy_sge(struct ipath_sge_state *ss, void *data, u32 length)
+{
+	struct ipath_sge *sge = &ss->sge;
+
+	while (length) {
+		u32 len = sge->length;
+
+		if (len > length)
+			len = length;
+		if (len > sge->sge_length)
+			len = sge->sge_length;
+		BUG_ON(len == 0);
+		memcpy(sge->vaddr, data, len);
+		sge->vaddr += len;
+		sge->length -= len;
+		sge->sge_length -= len;
+		if (sge->sge_length == 0) {
+			if (--ss->num_sge)
+				*sge = *ss->sg_list++;
+		} else if (sge->length == 0 && sge->mr != NULL) {
+			if (++sge->n >= IPATH_SEGSZ) {
+				if (++sge->m >= sge->mr->mapsz)
+					break;
+				sge->n = 0;
+			}
+			sge->vaddr =
+				sge->mr->map[sge->m]->segs[sge->n].vaddr;
+			sge->length =
+				sge->mr->map[sge->m]->segs[sge->n].length;
+		}
+		data += len;
+		length -= len;
+	}
+}
+
+/**
+ * ipath_skip_sge - skip over SGE memory - XXX almost dup of prev func
+ * @ss: the SGE state
+ * @length: the number of bytes to skip
+ */
+void ipath_skip_sge(struct ipath_sge_state *ss, u32 length)
+{
+	struct ipath_sge *sge = &ss->sge;
+
+	while (length) {
+		u32 len = sge->length;
+
+		if (len > length)
+			len = length;
+		if (len > sge->sge_length)
+			len = sge->sge_length;
+		BUG_ON(len == 0);
+		sge->vaddr += len;
+		sge->length -= len;
+		sge->sge_length -= len;
+		if (sge->sge_length == 0) {
+			if (--ss->num_sge)
+				*sge = *ss->sg_list++;
+		} else if (sge->length == 0 && sge->mr != NULL) {
+			if (++sge->n >= IPATH_SEGSZ) {
+				if (++sge->m >= sge->mr->mapsz)
+					break;
+				sge->n = 0;
+			}
+			sge->vaddr =
+				sge->mr->map[sge->m]->segs[sge->n].vaddr;
+			sge->length =
+				sge->mr->map[sge->m]->segs[sge->n].length;
+		}
+		length -= len;
+	}
+}
+
+/*
+ * Count the number of DMA descriptors needed to send length bytes of data.
+ * Don't modify the ipath_sge_state to get the count.
+ * Return zero if any of the segments is not aligned.
+ */
+static u32 ipath_count_sge(struct ipath_sge_state *ss, u32 length)
+{
+	struct ipath_sge *sg_list = ss->sg_list;
+	struct ipath_sge sge = ss->sge;
+	u8 num_sge = ss->num_sge;
+	u32 ndesc = 1;	/* count the header */
+
+	while (length) {
+		u32 len = sge.length;
+
+		if (len > length)
+			len = length;
+		if (len > sge.sge_length)
+			len = sge.sge_length;
+		BUG_ON(len == 0);
+		if (((long) sge.vaddr & (sizeof(u32) - 1)) ||
+		    (len != length && (len & (sizeof(u32) - 1)))) {
+			ndesc = 0;
+			break;
+		}
+		ndesc++;
+		sge.vaddr += len;
+		sge.length -= len;
+		sge.sge_length -= len;
+		if (sge.sge_length == 0) {
+			if (--num_sge)
+				sge = *sg_list++;
+		} else if (sge.length == 0 && sge.mr != NULL) {
+			if (++sge.n >= IPATH_SEGSZ) {
+				if (++sge.m >= sge.mr->mapsz)
+					break;
+				sge.n = 0;
+			}
+			sge.vaddr =
+				sge.mr->map[sge.m]->segs[sge.n].vaddr;
+			sge.length =
+				sge.mr->map[sge.m]->segs[sge.n].length;
+		}
+		length -= len;
+	}
+	return ndesc;
+}
+
+/*
+ * Copy from the SGEs to the data buffer.
+ */
+static void ipath_copy_from_sge(void *data, struct ipath_sge_state *ss,
+				u32 length)
+{
+	struct ipath_sge *sge = &ss->sge;
+
+	while (length) {
+		u32 len = sge->length;
+
+		if (len > length)
+			len = length;
+		if (len > sge->sge_length)
+			len = sge->sge_length;
+		BUG_ON(len == 0);
+		memcpy(data, sge->vaddr, len);
+		sge->vaddr += len;
+		sge->length -= len;
+		sge->sge_length -= len;
+		if (sge->sge_length == 0) {
+			if (--ss->num_sge)
+				*sge = *ss->sg_list++;
+		} else if (sge->length == 0 && sge->mr != NULL) {
+			if (++sge->n >= IPATH_SEGSZ) {
+				if (++sge->m >= sge->mr->mapsz)
+					break;
+				sge->n = 0;
+			}
+			sge->vaddr =
+				sge->mr->map[sge->m]->segs[sge->n].vaddr;
+			sge->length =
+				sge->mr->map[sge->m]->segs[sge->n].length;
+		}
+		data += len;
+		length -= len;
+	}
+}
+
+/**
+ * ipath_post_one_send - post one RC, UC, or UD send work request
+ * @qp: the QP to post on
+ * @wr: the work request to send
+ */
+static int ipath_post_one_send(struct ipath_qp *qp, struct ib_send_wr *wr)
+{
+	struct ipath_swqe *wqe;
+	u32 next;
+	int i;
+	int j;
+	int acc;
+	int ret;
+	unsigned long flags;
+	struct ipath_devdata *dd = to_idev(qp->ibqp.device)->dd;
+
+	spin_lock_irqsave(&qp->s_lock, flags);
+
+	if (qp->ibqp.qp_type != IB_QPT_SMI &&
+	    !(dd->ipath_flags & IPATH_LINKACTIVE)) {
+		ret = -ENETDOWN;
+		goto bail;
+	}
+
+	/* Check that state is OK to post send. */
+	if (unlikely(!(ib_ipath_state_ops[qp->state] & IPATH_POST_SEND_OK)))
+		goto bail_inval;
+
+	/* IB spec says that num_sge == 0 is OK. */
+	if (wr->num_sge > qp->s_max_sge)
+		goto bail_inval;
+
+	/*
+	 * Don't allow RDMA reads or atomic operations on UC or
+	 * undefined operations.
+	 * Make sure buffer is large enough to hold the result for atomics.
+	 */
+	if (qp->ibqp.qp_type == IB_QPT_UC) {
+		if ((unsigned) wr->opcode >= IB_WR_RDMA_READ)
+			goto bail_inval;
+	} else if (qp->ibqp.qp_type == IB_QPT_UD) {
+		/* Check UD opcode */
+		if (wr->opcode != IB_WR_SEND &&
+		    wr->opcode != IB_WR_SEND_WITH_IMM)
+			goto bail_inval;
+		/* Check UD destination address PD */
+		if (qp->ibqp.pd != wr->wr.ud.ah->pd)
+			goto bail_inval;
+	} else if ((unsigned) wr->opcode > IB_WR_ATOMIC_FETCH_AND_ADD)
+		goto bail_inval;
+	else if (wr->opcode >= IB_WR_ATOMIC_CMP_AND_SWP &&
+		   (wr->num_sge == 0 ||
+		    wr->sg_list[0].length < sizeof(u64) ||
+		    wr->sg_list[0].addr & (sizeof(u64) - 1)))
+		goto bail_inval;
+	else if (wr->opcode >= IB_WR_RDMA_READ && !qp->s_max_rd_atomic)
+		goto bail_inval;
+
+	next = qp->s_head + 1;
+	if (next >= qp->s_size)
+		next = 0;
+	if (next == qp->s_last) {
+		ret = -ENOMEM;
+		goto bail;
+	}
+
+	wqe = get_swqe_ptr(qp, qp->s_head);
+	wqe->wr = *wr;
+	wqe->length = 0;
+	if (wr->num_sge) {
+		acc = wr->opcode >= IB_WR_RDMA_READ ?
+			IB_ACCESS_LOCAL_WRITE : 0;
+		for (i = 0, j = 0; i < wr->num_sge; i++) {
+			u32 length = wr->sg_list[i].length;
+			int ok;
+
+			if (length == 0)
+				continue;
+			ok = ipath_lkey_ok(qp, &wqe->sg_list[j],
+					   &wr->sg_list[i], acc);
+			if (!ok)
+				goto bail_inval;
+			wqe->length += length;
+			j++;
+		}
+		wqe->wr.num_sge = j;
+	}
+	if (qp->ibqp.qp_type == IB_QPT_UC ||
+	    qp->ibqp.qp_type == IB_QPT_RC) {
+		if (wqe->length > 0x80000000U)
+			goto bail_inval;
+	} else if (wqe->length > to_idev(qp->ibqp.device)->dd->ipath_ibmtu)
+		goto bail_inval;
+	wqe->ssn = qp->s_ssn++;
+	qp->s_head = next;
+
+	ret = 0;
+	goto bail;
+
+bail_inval:
+	ret = -EINVAL;
+bail:
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+	return ret;
+}
+
+/**
+ * ipath_post_send - post a send on a QP
+ * @ibqp: the QP to post the send on
+ * @wr: the list of work requests to post
+ * @bad_wr: the first bad WR is put here
+ *
+ * This may be called from interrupt context.
+ */
+static int ipath_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+			   struct ib_send_wr **bad_wr)
+{
+	struct ipath_qp *qp = to_iqp(ibqp);
+	int err = 0;
+
+	for (; wr; wr = wr->next) {
+		err = ipath_post_one_send(qp, wr);
+		if (err) {
+			*bad_wr = wr;
+			goto bail;
+		}
+	}
+
+	/* Try to do the send work in the caller's context. */
+	ipath_do_send((unsigned long) qp);
+
+bail:
+	return err;
+}
+
+/**
+ * ipath_post_receive - post a receive on a QP
+ * @ibqp: the QP to post the receive on
+ * @wr: the WR to post
+ * @bad_wr: the first bad WR is put here
+ *
+ * This may be called from interrupt context.
+ */
+static int ipath_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+			      struct ib_recv_wr **bad_wr)
+{
+	struct ipath_qp *qp = to_iqp(ibqp);
+	struct ipath_rwq *wq = qp->r_rq.wq;
+	unsigned long flags;
+	int ret;
+
+	/* Check that state is OK to post receive. */
+	if (!(ib_ipath_state_ops[qp->state] & IPATH_POST_RECV_OK) || !wq) {
+		*bad_wr = wr;
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	for (; wr; wr = wr->next) {
+		struct ipath_rwqe *wqe;
+		u32 next;
+		int i;
+
+		if ((unsigned) wr->num_sge > qp->r_rq.max_sge) {
+			*bad_wr = wr;
+			ret = -EINVAL;
+			goto bail;
+		}
+
+		spin_lock_irqsave(&qp->r_rq.lock, flags);
+		next = wq->head + 1;
+		if (next >= qp->r_rq.size)
+			next = 0;
+		if (next == wq->tail) {
+			spin_unlock_irqrestore(&qp->r_rq.lock, flags);
+			*bad_wr = wr;
+			ret = -ENOMEM;
+			goto bail;
+		}
+
+		wqe = get_rwqe_ptr(&qp->r_rq, wq->head);
+		wqe->wr_id = wr->wr_id;
+		wqe->num_sge = wr->num_sge;
+		for (i = 0; i < wr->num_sge; i++)
+			wqe->sg_list[i] = wr->sg_list[i];
+		/* Make sure queue entry is written before the head index. */
+		smp_wmb();
+		wq->head = next;
+		spin_unlock_irqrestore(&qp->r_rq.lock, flags);
+	}
+	ret = 0;
+
+bail:
+	return ret;
+}
+
+/**
+ * ipath_qp_rcv - processing an incoming packet on a QP
+ * @dev: the device the packet came on
+ * @hdr: the packet header
+ * @has_grh: true if the packet has a GRH
+ * @data: the packet data
+ * @tlen: the packet length
+ * @qp: the QP the packet came on
+ *
+ * This is called from ipath_ib_rcv() to process an incoming packet
+ * for the given QP.
+ * Called at interrupt level.
+ */
+static void ipath_qp_rcv(struct ipath_ibdev *dev,
+			 struct ipath_ib_header *hdr, int has_grh,
+			 void *data, u32 tlen, struct ipath_qp *qp)
+{
+	/* Check for valid receive state. */
+	if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) {
+		dev->n_pkt_drops++;
+		return;
+	}
+
+	switch (qp->ibqp.qp_type) {
+	case IB_QPT_SMI:
+	case IB_QPT_GSI:
+		if (ib_ipath_disable_sma)
+			break;
+		/* FALLTHROUGH */
+	case IB_QPT_UD:
+		ipath_ud_rcv(dev, hdr, has_grh, data, tlen, qp);
+		break;
+
+	case IB_QPT_RC:
+		ipath_rc_rcv(dev, hdr, has_grh, data, tlen, qp);
+		break;
+
+	case IB_QPT_UC:
+		ipath_uc_rcv(dev, hdr, has_grh, data, tlen, qp);
+		break;
+
+	default:
+		break;
+	}
+}
+
+/**
+ * ipath_ib_rcv - process an incoming packet
+ * @arg: the device pointer
+ * @rhdr: the header of the packet
+ * @data: the packet data
+ * @tlen: the packet length
+ *
+ * This is called from ipath_kreceive() to process an incoming packet at
+ * interrupt level. Tlen is the length of the header + data + CRC in bytes.
+ */
+void ipath_ib_rcv(struct ipath_ibdev *dev, void *rhdr, void *data,
+		  u32 tlen)
+{
+	struct ipath_ib_header *hdr = rhdr;
+	struct ipath_other_headers *ohdr;
+	struct ipath_qp *qp;
+	u32 qp_num;
+	int lnh;
+	u8 opcode;
+	u16 lid;
+
+	if (unlikely(dev == NULL))
+		goto bail;
+
+	if (unlikely(tlen < 24)) {	/* LRH+BTH+CRC */
+		dev->rcv_errors++;
+		goto bail;
+	}
+
+	/* Check for a valid destination LID (see ch. 7.11.1). */
+	lid = be16_to_cpu(hdr->lrh[1]);
+	if (lid < IPATH_MULTICAST_LID_BASE) {
+		lid &= ~((1 << dev->dd->ipath_lmc) - 1);
+		if (unlikely(lid != dev->dd->ipath_lid)) {
+			dev->rcv_errors++;
+			goto bail;
+		}
+	}
+
+	/* Check for GRH */
+	lnh = be16_to_cpu(hdr->lrh[0]) & 3;
+	if (lnh == IPATH_LRH_BTH)
+		ohdr = &hdr->u.oth;
+	else if (lnh == IPATH_LRH_GRH)
+		ohdr = &hdr->u.l.oth;
+	else {
+		dev->rcv_errors++;
+		goto bail;
+	}
+
+	opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0x7f;
+	dev->opstats[opcode].n_bytes += tlen;
+	dev->opstats[opcode].n_packets++;
+
+	/* Get the destination QP number. */
+	qp_num = be32_to_cpu(ohdr->bth[1]) & IPATH_QPN_MASK;
+	if (qp_num == IPATH_MULTICAST_QPN) {
+		struct ipath_mcast *mcast;
+		struct ipath_mcast_qp *p;
+
+		if (lnh != IPATH_LRH_GRH) {
+			dev->n_pkt_drops++;
+			goto bail;
+		}
+		mcast = ipath_mcast_find(&hdr->u.l.grh.dgid);
+		if (mcast == NULL) {
+			dev->n_pkt_drops++;
+			goto bail;
+		}
+		dev->n_multicast_rcv++;
+		list_for_each_entry_rcu(p, &mcast->qp_list, list)
+			ipath_qp_rcv(dev, hdr, 1, data, tlen, p->qp);
+		/*
+		 * Notify ipath_multicast_detach() if it is waiting for us
+		 * to finish.
+		 */
+		if (atomic_dec_return(&mcast->refcount) <= 1)
+			wake_up(&mcast->wait);
+	} else {
+		qp = ipath_lookup_qpn(&dev->qp_table, qp_num);
+		if (qp) {
+			dev->n_unicast_rcv++;
+			ipath_qp_rcv(dev, hdr, lnh == IPATH_LRH_GRH, data,
+				     tlen, qp);
+			/*
+			 * Notify ipath_destroy_qp() if it is waiting
+			 * for us to finish.
+			 */
+			if (atomic_dec_and_test(&qp->refcount))
+				wake_up(&qp->wait);
+		} else
+			dev->n_pkt_drops++;
+	}
+
+bail:;
+}
+
+/**
+ * ipath_ib_timer - verbs timer
+ * @arg: the device pointer
+ *
+ * This is called from ipath_do_rcv_timer() at interrupt level to check for
+ * QPs which need retransmits and to collect performance numbers.
+ */
+static void ipath_ib_timer(struct ipath_ibdev *dev)
+{
+	struct ipath_qp *resend = NULL;
+	struct ipath_qp *rnr = NULL;
+	struct list_head *last;
+	struct ipath_qp *qp;
+	unsigned long flags;
+
+	if (dev == NULL)
+		return;
+
+	spin_lock_irqsave(&dev->pending_lock, flags);
+	/* Start filling the next pending queue. */
+	if (++dev->pending_index >= ARRAY_SIZE(dev->pending))
+		dev->pending_index = 0;
+	/* Save any requests still in the new queue, they have timed out. */
+	last = &dev->pending[dev->pending_index];
+	while (!list_empty(last)) {
+		qp = list_entry(last->next, struct ipath_qp, timerwait);
+		list_del_init(&qp->timerwait);
+		qp->timer_next = resend;
+		resend = qp;
+		atomic_inc(&qp->refcount);
+	}
+	last = &dev->rnrwait;
+	if (!list_empty(last)) {
+		qp = list_entry(last->next, struct ipath_qp, timerwait);
+		if (--qp->s_rnr_timeout == 0) {
+			do {
+				list_del_init(&qp->timerwait);
+				qp->timer_next = rnr;
+				rnr = qp;
+				atomic_inc(&qp->refcount);
+				if (list_empty(last))
+					break;
+				qp = list_entry(last->next, struct ipath_qp,
+						timerwait);
+			} while (qp->s_rnr_timeout == 0);
+		}
+	}
+	/*
+	 * We should only be in the started state if pma_sample_start != 0
+	 */
+	if (dev->pma_sample_status == IB_PMA_SAMPLE_STATUS_STARTED &&
+	    --dev->pma_sample_start == 0) {
+		dev->pma_sample_status = IB_PMA_SAMPLE_STATUS_RUNNING;
+		ipath_snapshot_counters(dev->dd, &dev->ipath_sword,
+					&dev->ipath_rword,
+					&dev->ipath_spkts,
+					&dev->ipath_rpkts,
+					&dev->ipath_xmit_wait);
+	}
+	if (dev->pma_sample_status == IB_PMA_SAMPLE_STATUS_RUNNING) {
+		if (dev->pma_sample_interval == 0) {
+			u64 ta, tb, tc, td, te;
+
+			dev->pma_sample_status = IB_PMA_SAMPLE_STATUS_DONE;
+			ipath_snapshot_counters(dev->dd, &ta, &tb,
+						&tc, &td, &te);
+
+			dev->ipath_sword = ta - dev->ipath_sword;
+			dev->ipath_rword = tb - dev->ipath_rword;
+			dev->ipath_spkts = tc - dev->ipath_spkts;
+			dev->ipath_rpkts = td - dev->ipath_rpkts;
+			dev->ipath_xmit_wait = te - dev->ipath_xmit_wait;
+		}
+		else
+			dev->pma_sample_interval--;
+	}
+	spin_unlock_irqrestore(&dev->pending_lock, flags);
+
+	/* XXX What if timer fires again while this is running? */
+	while (resend != NULL) {
+		qp = resend;
+		resend = qp->timer_next;
+
+		spin_lock_irqsave(&qp->s_lock, flags);
+		if (qp->s_last != qp->s_tail &&
+		    ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK) {
+			dev->n_timeouts++;
+			ipath_restart_rc(qp, qp->s_last_psn + 1);
+		}
+		spin_unlock_irqrestore(&qp->s_lock, flags);
+
+		/* Notify ipath_destroy_qp() if it is waiting. */
+		if (atomic_dec_and_test(&qp->refcount))
+			wake_up(&qp->wait);
+	}
+	while (rnr != NULL) {
+		qp = rnr;
+		rnr = qp->timer_next;
+
+		spin_lock_irqsave(&qp->s_lock, flags);
+		if (ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK)
+			ipath_schedule_send(qp);
+		spin_unlock_irqrestore(&qp->s_lock, flags);
+
+		/* Notify ipath_destroy_qp() if it is waiting. */
+		if (atomic_dec_and_test(&qp->refcount))
+			wake_up(&qp->wait);
+	}
+}
+
+static void update_sge(struct ipath_sge_state *ss, u32 length)
+{
+	struct ipath_sge *sge = &ss->sge;
+
+	sge->vaddr += length;
+	sge->length -= length;
+	sge->sge_length -= length;
+	if (sge->sge_length == 0) {
+		if (--ss->num_sge)
+			*sge = *ss->sg_list++;
+	} else if (sge->length == 0 && sge->mr != NULL) {
+		if (++sge->n >= IPATH_SEGSZ) {
+			if (++sge->m >= sge->mr->mapsz)
+				return;
+			sge->n = 0;
+		}
+		sge->vaddr = sge->mr->map[sge->m]->segs[sge->n].vaddr;
+		sge->length = sge->mr->map[sge->m]->segs[sge->n].length;
+	}
+}
+
+#ifdef __LITTLE_ENDIAN
+static inline u32 get_upper_bits(u32 data, u32 shift)
+{
+	return data >> shift;
+}
+
+static inline u32 set_upper_bits(u32 data, u32 shift)
+{
+	return data << shift;
+}
+
+static inline u32 clear_upper_bytes(u32 data, u32 n, u32 off)
+{
+	data <<= ((sizeof(u32) - n) * BITS_PER_BYTE);
+	data >>= ((sizeof(u32) - n - off) * BITS_PER_BYTE);
+	return data;
+}
+#else
+static inline u32 get_upper_bits(u32 data, u32 shift)
+{
+	return data << shift;
+}
+
+static inline u32 set_upper_bits(u32 data, u32 shift)
+{
+	return data >> shift;
+}
+
+static inline u32 clear_upper_bytes(u32 data, u32 n, u32 off)
+{
+	data >>= ((sizeof(u32) - n) * BITS_PER_BYTE);
+	data <<= ((sizeof(u32) - n - off) * BITS_PER_BYTE);
+	return data;
+}
+#endif
+
+static void copy_io(u32 __iomem *piobuf, struct ipath_sge_state *ss,
+		    u32 length, unsigned flush_wc)
+{
+	u32 extra = 0;
+	u32 data = 0;
+	u32 last;
+
+	while (1) {
+		u32 len = ss->sge.length;
+		u32 off;
+
+		if (len > length)
+			len = length;
+		if (len > ss->sge.sge_length)
+			len = ss->sge.sge_length;
+		BUG_ON(len == 0);
+		/* If the source address is not aligned, try to align it. */
+		off = (unsigned long)ss->sge.vaddr & (sizeof(u32) - 1);
+		if (off) {
+			u32 *addr = (u32 *)((unsigned long)ss->sge.vaddr &
+					    ~(sizeof(u32) - 1));
+			u32 v = get_upper_bits(*addr, off * BITS_PER_BYTE);
+			u32 y;
+
+			y = sizeof(u32) - off;
+			if (len > y)
+				len = y;
+			if (len + extra >= sizeof(u32)) {
+				data |= set_upper_bits(v, extra *
+						       BITS_PER_BYTE);
+				len = sizeof(u32) - extra;
+				if (len == length) {
+					last = data;
+					break;
+				}
+				__raw_writel(data, piobuf);
+				piobuf++;
+				extra = 0;
+				data = 0;
+			} else {
+				/* Clear unused upper bytes */
+				data |= clear_upper_bytes(v, len, extra);
+				if (len == length) {
+					last = data;
+					break;
+				}
+				extra += len;
+			}
+		} else if (extra) {
+			/* Source address is aligned. */
+			u32 *addr = (u32 *) ss->sge.vaddr;
+			int shift = extra * BITS_PER_BYTE;
+			int ushift = 32 - shift;
+			u32 l = len;
+
+			while (l >= sizeof(u32)) {
+				u32 v = *addr;
+
+				data |= set_upper_bits(v, shift);
+				__raw_writel(data, piobuf);
+				data = get_upper_bits(v, ushift);
+				piobuf++;
+				addr++;
+				l -= sizeof(u32);
+			}
+			/*
+			 * We still have 'extra' number of bytes leftover.
+			 */
+			if (l) {
+				u32 v = *addr;
+
+				if (l + extra >= sizeof(u32)) {
+					data |= set_upper_bits(v, shift);
+					len -= l + extra - sizeof(u32);
+					if (len == length) {
+						last = data;
+						break;
+					}
+					__raw_writel(data, piobuf);
+					piobuf++;
+					extra = 0;
+					data = 0;
+				} else {
+					/* Clear unused upper bytes */
+					data |= clear_upper_bytes(v, l,
+								  extra);
+					if (len == length) {
+						last = data;
+						break;
+					}
+					extra += l;
+				}
+			} else if (len == length) {
+				last = data;
+				break;
+			}
+		} else if (len == length) {
+			u32 w;
+
+			/*
+			 * Need to round up for the last dword in the
+			 * packet.
+			 */
+			w = (len + 3) >> 2;
+			__iowrite32_copy(piobuf, ss->sge.vaddr, w - 1);
+			piobuf += w - 1;
+			last = ((u32 *) ss->sge.vaddr)[w - 1];
+			break;
+		} else {
+			u32 w = len >> 2;
+
+			__iowrite32_copy(piobuf, ss->sge.vaddr, w);
+			piobuf += w;
+
+			extra = len & (sizeof(u32) - 1);
+			if (extra) {
+				u32 v = ((u32 *) ss->sge.vaddr)[w];
+
+				/* Clear unused upper bytes */
+				data = clear_upper_bytes(v, extra, 0);
+			}
+		}
+		update_sge(ss, len);
+		length -= len;
+	}
+	/* Update address before sending packet. */
+	update_sge(ss, length);
+	if (flush_wc) {
+		/* must flush early everything before trigger word */
+		ipath_flush_wc();
+		__raw_writel(last, piobuf);
+		/* be sure trigger word is written */
+		ipath_flush_wc();
+	} else
+		__raw_writel(last, piobuf);
+}
+
+/*
+ * Convert IB rate to delay multiplier.
+ */
+unsigned ipath_ib_rate_to_mult(enum ib_rate rate)
+{
+	switch (rate) {
+	case IB_RATE_2_5_GBPS: return 8;
+	case IB_RATE_5_GBPS:   return 4;
+	case IB_RATE_10_GBPS:  return 2;
+	case IB_RATE_20_GBPS:  return 1;
+	default:	       return 0;
+	}
+}
+
+/*
+ * Convert delay multiplier to IB rate
+ */
+static enum ib_rate ipath_mult_to_ib_rate(unsigned mult)
+{
+	switch (mult) {
+	case 8:  return IB_RATE_2_5_GBPS;
+	case 4:  return IB_RATE_5_GBPS;
+	case 2:  return IB_RATE_10_GBPS;
+	case 1:  return IB_RATE_20_GBPS;
+	default: return IB_RATE_PORT_CURRENT;
+	}
+}
+
+static inline struct ipath_verbs_txreq *get_txreq(struct ipath_ibdev *dev)
+{
+	struct ipath_verbs_txreq *tx = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&dev->pending_lock, flags);
+	if (!list_empty(&dev->txreq_free)) {
+		struct list_head *l = dev->txreq_free.next;
+
+		list_del(l);
+		tx = list_entry(l, struct ipath_verbs_txreq, txreq.list);
+	}
+	spin_unlock_irqrestore(&dev->pending_lock, flags);
+	return tx;
+}
+
+static inline void put_txreq(struct ipath_ibdev *dev,
+			     struct ipath_verbs_txreq *tx)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&dev->pending_lock, flags);
+	list_add(&tx->txreq.list, &dev->txreq_free);
+	spin_unlock_irqrestore(&dev->pending_lock, flags);
+}
+
+static void sdma_complete(void *cookie, int status)
+{
+	struct ipath_verbs_txreq *tx = cookie;
+	struct ipath_qp *qp = tx->qp;
+	struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+	unsigned long flags;
+	enum ib_wc_status ibs = status == IPATH_SDMA_TXREQ_S_OK ?
+		IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR;
+
+	if (atomic_dec_and_test(&qp->s_dma_busy)) {
+		spin_lock_irqsave(&qp->s_lock, flags);
+		if (tx->wqe)
+			ipath_send_complete(qp, tx->wqe, ibs);
+		if ((ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND &&
+		     qp->s_last != qp->s_head) ||
+		    (qp->s_flags & IPATH_S_WAIT_DMA))
+			ipath_schedule_send(qp);
+		spin_unlock_irqrestore(&qp->s_lock, flags);
+		wake_up(&qp->wait_dma);
+	} else if (tx->wqe) {
+		spin_lock_irqsave(&qp->s_lock, flags);
+		ipath_send_complete(qp, tx->wqe, ibs);
+		spin_unlock_irqrestore(&qp->s_lock, flags);
+	}
+
+	if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_FREEBUF)
+		kfree(tx->txreq.map_addr);
+	put_txreq(dev, tx);
+
+	if (atomic_dec_and_test(&qp->refcount))
+		wake_up(&qp->wait);
+}
+
+static void decrement_dma_busy(struct ipath_qp *qp)
+{
+	unsigned long flags;
+
+	if (atomic_dec_and_test(&qp->s_dma_busy)) {
+		spin_lock_irqsave(&qp->s_lock, flags);
+		if ((ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND &&
+		     qp->s_last != qp->s_head) ||
+		    (qp->s_flags & IPATH_S_WAIT_DMA))
+			ipath_schedule_send(qp);
+		spin_unlock_irqrestore(&qp->s_lock, flags);
+		wake_up(&qp->wait_dma);
+	}
+}
+
+/*
+ * Compute the number of clock cycles of delay before sending the next packet.
+ * The multipliers reflect the number of clocks for the fastest rate so
+ * one tick at 4xDDR is 8 ticks at 1xSDR.
+ * If the destination port will take longer to receive a packet than
+ * the outgoing link can send it, we need to delay sending the next packet
+ * by the difference in time it takes the receiver to receive and the sender
+ * to send this packet.
+ * Note that this delay is always correct for UC and RC but not always
+ * optimal for UD. For UD, the destination HCA can be different for each
+ * packet, in which case, we could send packets to a different destination
+ * while "waiting" for the delay. The overhead for doing this without
+ * HW support is more than just paying the cost of delaying some packets
+ * unnecessarily.
+ */
+static inline unsigned ipath_pkt_delay(u32 plen, u8 snd_mult, u8 rcv_mult)
+{
+	return (rcv_mult > snd_mult) ?
+		(plen * (rcv_mult - snd_mult) + 1) >> 1 : 0;
+}
+
+static int ipath_verbs_send_dma(struct ipath_qp *qp,
+				struct ipath_ib_header *hdr, u32 hdrwords,
+				struct ipath_sge_state *ss, u32 len,
+				u32 plen, u32 dwords)
+{
+	struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+	struct ipath_devdata *dd = dev->dd;
+	struct ipath_verbs_txreq *tx;
+	u32 *piobuf;
+	u32 control;
+	u32 ndesc;
+	int ret;
+
+	tx = qp->s_tx;
+	if (tx) {
+		qp->s_tx = NULL;
+		/* resend previously constructed packet */
+		atomic_inc(&qp->s_dma_busy);
+		ret = ipath_sdma_verbs_send(dd, tx->ss, tx->len, tx);
+		if (ret) {
+			qp->s_tx = tx;
+			decrement_dma_busy(qp);
+		}
+		goto bail;
+	}
+
+	tx = get_txreq(dev);
+	if (!tx) {
+		ret = -EBUSY;
+		goto bail;
+	}
+
+	/*
+	 * Get the saved delay count we computed for the previous packet
+	 * and save the delay count for this packet to be used next time
+	 * we get here.
+	 */
+	control = qp->s_pkt_delay;
+	qp->s_pkt_delay = ipath_pkt_delay(plen, dd->delay_mult, qp->s_dmult);
+
+	tx->qp = qp;
+	atomic_inc(&qp->refcount);
+	tx->wqe = qp->s_wqe;
+	tx->txreq.callback = sdma_complete;
+	tx->txreq.callback_cookie = tx;
+	tx->txreq.flags = IPATH_SDMA_TXREQ_F_HEADTOHOST |
+		IPATH_SDMA_TXREQ_F_INTREQ | IPATH_SDMA_TXREQ_F_FREEDESC;
+	if (plen + 1 >= IPATH_SMALLBUF_DWORDS)
+		tx->txreq.flags |= IPATH_SDMA_TXREQ_F_USELARGEBUF;
+
+	/* VL15 packets bypass credit check */
+	if ((be16_to_cpu(hdr->lrh[0]) >> 12) == 15) {
+		control |= 1ULL << 31;
+		tx->txreq.flags |= IPATH_SDMA_TXREQ_F_VL15;
+	}
+
+	if (len) {
+		/*
+		 * Don't try to DMA if it takes more descriptors than
+		 * the queue holds.
+		 */
+		ndesc = ipath_count_sge(ss, len);
+		if (ndesc >= dd->ipath_sdma_descq_cnt)
+			ndesc = 0;
+	} else
+		ndesc = 1;
+	if (ndesc) {
+		tx->hdr.pbc[0] = cpu_to_le32(plen);
+		tx->hdr.pbc[1] = cpu_to_le32(control);
+		memcpy(&tx->hdr.hdr, hdr, hdrwords << 2);
+		tx->txreq.sg_count = ndesc;
+		tx->map_len = (hdrwords + 2) << 2;
+		tx->txreq.map_addr = &tx->hdr;
+		atomic_inc(&qp->s_dma_busy);
+		ret = ipath_sdma_verbs_send(dd, ss, dwords, tx);
+		if (ret) {
+			/* save ss and length in dwords */
+			tx->ss = ss;
+			tx->len = dwords;
+			qp->s_tx = tx;
+			decrement_dma_busy(qp);
+		}
+		goto bail;
+	}
+
+	/* Allocate a buffer and copy the header and payload to it. */
+	tx->map_len = (plen + 1) << 2;
+	piobuf = kmalloc(tx->map_len, GFP_ATOMIC);
+	if (unlikely(piobuf == NULL)) {
+		ret = -EBUSY;
+		goto err_tx;
+	}
+	tx->txreq.map_addr = piobuf;
+	tx->txreq.flags |= IPATH_SDMA_TXREQ_F_FREEBUF;
+	tx->txreq.sg_count = 1;
+
+	*piobuf++ = (__force u32) cpu_to_le32(plen);
+	*piobuf++ = (__force u32) cpu_to_le32(control);
+	memcpy(piobuf, hdr, hdrwords << 2);
+	ipath_copy_from_sge(piobuf + hdrwords, ss, len);
+
+	atomic_inc(&qp->s_dma_busy);
+	ret = ipath_sdma_verbs_send(dd, NULL, 0, tx);
+	/*
+	 * If we couldn't queue the DMA request, save the info
+	 * and try again later rather than destroying the
+	 * buffer and undoing the side effects of the copy.
+	 */
+	if (ret) {
+		tx->ss = NULL;
+		tx->len = 0;
+		qp->s_tx = tx;
+		decrement_dma_busy(qp);
+	}
+	dev->n_unaligned++;
+	goto bail;
+
+err_tx:
+	if (atomic_dec_and_test(&qp->refcount))
+		wake_up(&qp->wait);
+	put_txreq(dev, tx);
+bail:
+	return ret;
+}
+
+static int ipath_verbs_send_pio(struct ipath_qp *qp,
+				struct ipath_ib_header *ibhdr, u32 hdrwords,
+				struct ipath_sge_state *ss, u32 len,
+				u32 plen, u32 dwords)
+{
+	struct ipath_devdata *dd = to_idev(qp->ibqp.device)->dd;
+	u32 *hdr = (u32 *) ibhdr;
+	u32 __iomem *piobuf;
+	unsigned flush_wc;
+	u32 control;
+	int ret;
+	unsigned long flags;
+
+	piobuf = ipath_getpiobuf(dd, plen, NULL);
+	if (unlikely(piobuf == NULL)) {
+		ret = -EBUSY;
+		goto bail;
+	}
+
+	/*
+	 * Get the saved delay count we computed for the previous packet
+	 * and save the delay count for this packet to be used next time
+	 * we get here.
+	 */
+	control = qp->s_pkt_delay;
+	qp->s_pkt_delay = ipath_pkt_delay(plen, dd->delay_mult, qp->s_dmult);
+
+	/* VL15 packets bypass credit check */
+	if ((be16_to_cpu(ibhdr->lrh[0]) >> 12) == 15)
+		control |= 1ULL << 31;
+
+	/*
+	 * Write the length to the control qword plus any needed flags.
+	 * We have to flush after the PBC for correctness on some cpus
+	 * or WC buffer can be written out of order.
+	 */
+	writeq(((u64) control << 32) | plen, piobuf);
+	piobuf += 2;
+
+	flush_wc = dd->ipath_flags & IPATH_PIO_FLUSH_WC;
+	if (len == 0) {
+		/*
+		 * If there is just the header portion, must flush before
+		 * writing last word of header for correctness, and after
+		 * the last header word (trigger word).
+		 */
+		if (flush_wc) {
+			ipath_flush_wc();
+			__iowrite32_copy(piobuf, hdr, hdrwords - 1);
+			ipath_flush_wc();
+			__raw_writel(hdr[hdrwords - 1], piobuf + hdrwords - 1);
+			ipath_flush_wc();
+		} else
+			__iowrite32_copy(piobuf, hdr, hdrwords);
+		goto done;
+	}
+
+	if (flush_wc)
+		ipath_flush_wc();
+	__iowrite32_copy(piobuf, hdr, hdrwords);
+	piobuf += hdrwords;
+
+	/* The common case is aligned and contained in one segment. */
+	if (likely(ss->num_sge == 1 && len <= ss->sge.length &&
+		   !((unsigned long)ss->sge.vaddr & (sizeof(u32) - 1)))) {
+		u32 *addr = (u32 *) ss->sge.vaddr;
+
+		/* Update address before sending packet. */
+		update_sge(ss, len);
+		if (flush_wc) {
+			__iowrite32_copy(piobuf, addr, dwords - 1);
+			/* must flush early everything before trigger word */
+			ipath_flush_wc();
+			__raw_writel(addr[dwords - 1], piobuf + dwords - 1);
+			/* be sure trigger word is written */
+			ipath_flush_wc();
+		} else
+			__iowrite32_copy(piobuf, addr, dwords);
+		goto done;
+	}
+	copy_io(piobuf, ss, len, flush_wc);
+done:
+	if (qp->s_wqe) {
+		spin_lock_irqsave(&qp->s_lock, flags);
+		ipath_send_complete(qp, qp->s_wqe, IB_WC_SUCCESS);
+		spin_unlock_irqrestore(&qp->s_lock, flags);
+	}
+	ret = 0;
+bail:
+	return ret;
+}
+
+/**
+ * ipath_verbs_send - send a packet
+ * @qp: the QP to send on
+ * @hdr: the packet header
+ * @hdrwords: the number of 32-bit words in the header
+ * @ss: the SGE to send
+ * @len: the length of the packet in bytes
+ */
+int ipath_verbs_send(struct ipath_qp *qp, struct ipath_ib_header *hdr,
+		     u32 hdrwords, struct ipath_sge_state *ss, u32 len)
+{
+	struct ipath_devdata *dd = to_idev(qp->ibqp.device)->dd;
+	u32 plen;
+	int ret;
+	u32 dwords = (len + 3) >> 2;
+
+	/*
+	 * Calculate the send buffer trigger address.
+	 * The +1 counts for the pbc control dword following the pbc length.
+	 */
+	plen = hdrwords + dwords + 1;
+
+	/*
+	 * VL15 packets (IB_QPT_SMI) will always use PIO, so we
+	 * can defer SDMA restart until link goes ACTIVE without
+	 * worrying about just how we got there.
+	 */
+	if (qp->ibqp.qp_type == IB_QPT_SMI ||
+	    !(dd->ipath_flags & IPATH_HAS_SEND_DMA))
+		ret = ipath_verbs_send_pio(qp, hdr, hdrwords, ss, len,
+					   plen, dwords);
+	else
+		ret = ipath_verbs_send_dma(qp, hdr, hdrwords, ss, len,
+					   plen, dwords);
+
+	return ret;
+}
+
+int ipath_snapshot_counters(struct ipath_devdata *dd, u64 *swords,
+			    u64 *rwords, u64 *spkts, u64 *rpkts,
+			    u64 *xmit_wait)
+{
+	int ret;
+
+	if (!(dd->ipath_flags & IPATH_INITTED)) {
+		/* no hardware, freeze, etc. */
+		ret = -EINVAL;
+		goto bail;
+	}
+	*swords = ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordsendcnt);
+	*rwords = ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordrcvcnt);
+	*spkts = ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktsendcnt);
+	*rpkts = ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktrcvcnt);
+	*xmit_wait = ipath_snap_cntr(dd, dd->ipath_cregs->cr_sendstallcnt);
+
+	ret = 0;
+
+bail:
+	return ret;
+}
+
+/**
+ * ipath_get_counters - get various chip counters
+ * @dd: the infinipath device
+ * @cntrs: counters are placed here
+ *
+ * Return the counters needed by recv_pma_get_portcounters().
+ */
+int ipath_get_counters(struct ipath_devdata *dd,
+		       struct ipath_verbs_counters *cntrs)
+{
+	struct ipath_cregs const *crp = dd->ipath_cregs;
+	int ret;
+
+	if (!(dd->ipath_flags & IPATH_INITTED)) {
+		/* no hardware, freeze, etc. */
+		ret = -EINVAL;
+		goto bail;
+	}
+	cntrs->symbol_error_counter =
+		ipath_snap_cntr(dd, crp->cr_ibsymbolerrcnt);
+	cntrs->link_error_recovery_counter =
+		ipath_snap_cntr(dd, crp->cr_iblinkerrrecovcnt);
+	/*
+	 * The link downed counter counts when the other side downs the
+	 * connection.  We add in the number of times we downed the link
+	 * due to local link integrity errors to compensate.
+	 */
+	cntrs->link_downed_counter =
+		ipath_snap_cntr(dd, crp->cr_iblinkdowncnt);
+	cntrs->port_rcv_errors =
+		ipath_snap_cntr(dd, crp->cr_rxdroppktcnt) +
+		ipath_snap_cntr(dd, crp->cr_rcvovflcnt) +
+		ipath_snap_cntr(dd, crp->cr_portovflcnt) +
+		ipath_snap_cntr(dd, crp->cr_err_rlencnt) +
+		ipath_snap_cntr(dd, crp->cr_invalidrlencnt) +
+		ipath_snap_cntr(dd, crp->cr_errlinkcnt) +
+		ipath_snap_cntr(dd, crp->cr_erricrccnt) +
+		ipath_snap_cntr(dd, crp->cr_errvcrccnt) +
+		ipath_snap_cntr(dd, crp->cr_errlpcrccnt) +
+		ipath_snap_cntr(dd, crp->cr_badformatcnt) +
+		dd->ipath_rxfc_unsupvl_errs;
+	if (crp->cr_rxotherlocalphyerrcnt)
+		cntrs->port_rcv_errors +=
+			ipath_snap_cntr(dd, crp->cr_rxotherlocalphyerrcnt);
+	if (crp->cr_rxvlerrcnt)
+		cntrs->port_rcv_errors +=
+			ipath_snap_cntr(dd, crp->cr_rxvlerrcnt);
+	cntrs->port_rcv_remphys_errors =
+		ipath_snap_cntr(dd, crp->cr_rcvebpcnt);
+	cntrs->port_xmit_discards = ipath_snap_cntr(dd, crp->cr_unsupvlcnt);
+	cntrs->port_xmit_data = ipath_snap_cntr(dd, crp->cr_wordsendcnt);
+	cntrs->port_rcv_data = ipath_snap_cntr(dd, crp->cr_wordrcvcnt);
+	cntrs->port_xmit_packets = ipath_snap_cntr(dd, crp->cr_pktsendcnt);
+	cntrs->port_rcv_packets = ipath_snap_cntr(dd, crp->cr_pktrcvcnt);
+	cntrs->local_link_integrity_errors =
+		crp->cr_locallinkintegrityerrcnt ?
+		ipath_snap_cntr(dd, crp->cr_locallinkintegrityerrcnt) :
+		((dd->ipath_flags & IPATH_GPIO_ERRINTRS) ?
+		 dd->ipath_lli_errs : dd->ipath_lli_errors);
+	cntrs->excessive_buffer_overrun_errors =
+		crp->cr_excessbufferovflcnt ?
+		ipath_snap_cntr(dd, crp->cr_excessbufferovflcnt) :
+		dd->ipath_overrun_thresh_errs;
+	cntrs->vl15_dropped = crp->cr_vl15droppedpktcnt ?
+		ipath_snap_cntr(dd, crp->cr_vl15droppedpktcnt) : 0;
+
+	ret = 0;
+
+bail:
+	return ret;
+}
+
+/**
+ * ipath_ib_piobufavail - callback when a PIO buffer is available
+ * @arg: the device pointer
+ *
+ * This is called from ipath_intr() at interrupt level when a PIO buffer is
+ * available after ipath_verbs_send() returned an error that no buffers were
+ * available.  Return 1 if we consumed all the PIO buffers and we still have
+ * QPs waiting for buffers (for now, just restart the send tasklet and
+ * return zero).
+ */
+int ipath_ib_piobufavail(struct ipath_ibdev *dev)
+{
+	struct list_head *list;
+	struct ipath_qp *qplist;
+	struct ipath_qp *qp;
+	unsigned long flags;
+
+	if (dev == NULL)
+		goto bail;
+
+	list = &dev->piowait;
+	qplist = NULL;
+
+	spin_lock_irqsave(&dev->pending_lock, flags);
+	while (!list_empty(list)) {
+		qp = list_entry(list->next, struct ipath_qp, piowait);
+		list_del_init(&qp->piowait);
+		qp->pio_next = qplist;
+		qplist = qp;
+		atomic_inc(&qp->refcount);
+	}
+	spin_unlock_irqrestore(&dev->pending_lock, flags);
+
+	while (qplist != NULL) {
+		qp = qplist;
+		qplist = qp->pio_next;
+
+		spin_lock_irqsave(&qp->s_lock, flags);
+		if (ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK)
+			ipath_schedule_send(qp);
+		spin_unlock_irqrestore(&qp->s_lock, flags);
+
+		/* Notify ipath_destroy_qp() if it is waiting. */
+		if (atomic_dec_and_test(&qp->refcount))
+			wake_up(&qp->wait);
+	}
+
+bail:
+	return 0;
+}
+
+static int ipath_query_device(struct ib_device *ibdev,
+			      struct ib_device_attr *props)
+{
+	struct ipath_ibdev *dev = to_idev(ibdev);
+
+	memset(props, 0, sizeof(*props));
+
+	props->device_cap_flags = IB_DEVICE_BAD_PKEY_CNTR |
+		IB_DEVICE_BAD_QKEY_CNTR | IB_DEVICE_SHUTDOWN_PORT |
+		IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_RC_RNR_NAK_GEN |
+		IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SRQ_RESIZE;
+	props->page_size_cap = PAGE_SIZE;
+	props->vendor_id =
+		IPATH_SRC_OUI_1 << 16 | IPATH_SRC_OUI_2 << 8 | IPATH_SRC_OUI_3;
+	props->vendor_part_id = dev->dd->ipath_deviceid;
+	props->hw_ver = dev->dd->ipath_pcirev;
+
+	props->sys_image_guid = dev->sys_image_guid;
+
+	props->max_mr_size = ~0ull;
+	props->max_qp = ib_ipath_max_qps;
+	props->max_qp_wr = ib_ipath_max_qp_wrs;
+	props->max_sge = ib_ipath_max_sges;
+	props->max_cq = ib_ipath_max_cqs;
+	props->max_ah = ib_ipath_max_ahs;
+	props->max_cqe = ib_ipath_max_cqes;
+	props->max_mr = dev->lk_table.max;
+	props->max_fmr = dev->lk_table.max;
+	props->max_map_per_fmr = 32767;
+	props->max_pd = ib_ipath_max_pds;
+	props->max_qp_rd_atom = IPATH_MAX_RDMA_ATOMIC;
+	props->max_qp_init_rd_atom = 255;
+	/* props->max_res_rd_atom */
+	props->max_srq = ib_ipath_max_srqs;
+	props->max_srq_wr = ib_ipath_max_srq_wrs;
+	props->max_srq_sge = ib_ipath_max_srq_sges;
+	/* props->local_ca_ack_delay */
+	props->atomic_cap = IB_ATOMIC_GLOB;
+	props->max_pkeys = ipath_get_npkeys(dev->dd);
+	props->max_mcast_grp = ib_ipath_max_mcast_grps;
+	props->max_mcast_qp_attach = ib_ipath_max_mcast_qp_attached;
+	props->max_total_mcast_qp_attach = props->max_mcast_qp_attach *
+		props->max_mcast_grp;
+
+	return 0;
+}
+
+const u8 ipath_cvt_physportstate[32] = {
+	[INFINIPATH_IBCS_LT_STATE_DISABLED] = IB_PHYSPORTSTATE_DISABLED,
+	[INFINIPATH_IBCS_LT_STATE_LINKUP] = IB_PHYSPORTSTATE_LINKUP,
+	[INFINIPATH_IBCS_LT_STATE_POLLACTIVE] = IB_PHYSPORTSTATE_POLL,
+	[INFINIPATH_IBCS_LT_STATE_POLLQUIET] = IB_PHYSPORTSTATE_POLL,
+	[INFINIPATH_IBCS_LT_STATE_SLEEPDELAY] = IB_PHYSPORTSTATE_SLEEP,
+	[INFINIPATH_IBCS_LT_STATE_SLEEPQUIET] = IB_PHYSPORTSTATE_SLEEP,
+	[INFINIPATH_IBCS_LT_STATE_CFGDEBOUNCE] =
+		IB_PHYSPORTSTATE_CFG_TRAIN,
+	[INFINIPATH_IBCS_LT_STATE_CFGRCVFCFG] =
+		IB_PHYSPORTSTATE_CFG_TRAIN,
+	[INFINIPATH_IBCS_LT_STATE_CFGWAITRMT] =
+		IB_PHYSPORTSTATE_CFG_TRAIN,
+	[INFINIPATH_IBCS_LT_STATE_CFGIDLE] = IB_PHYSPORTSTATE_CFG_TRAIN,
+	[INFINIPATH_IBCS_LT_STATE_RECOVERRETRAIN] =
+		IB_PHYSPORTSTATE_LINK_ERR_RECOVER,
+	[INFINIPATH_IBCS_LT_STATE_RECOVERWAITRMT] =
+		IB_PHYSPORTSTATE_LINK_ERR_RECOVER,
+	[INFINIPATH_IBCS_LT_STATE_RECOVERIDLE] =
+		IB_PHYSPORTSTATE_LINK_ERR_RECOVER,
+	[0x10] = IB_PHYSPORTSTATE_CFG_TRAIN,
+	[0x11] = IB_PHYSPORTSTATE_CFG_TRAIN,
+	[0x12] = IB_PHYSPORTSTATE_CFG_TRAIN,
+	[0x13] = IB_PHYSPORTSTATE_CFG_TRAIN,
+	[0x14] = IB_PHYSPORTSTATE_CFG_TRAIN,
+	[0x15] = IB_PHYSPORTSTATE_CFG_TRAIN,
+	[0x16] = IB_PHYSPORTSTATE_CFG_TRAIN,
+	[0x17] = IB_PHYSPORTSTATE_CFG_TRAIN
+};
+
+u32 ipath_get_cr_errpkey(struct ipath_devdata *dd)
+{
+	return ipath_read_creg32(dd, dd->ipath_cregs->cr_errpkey);
+}
+
+static int ipath_query_port(struct ib_device *ibdev,
+			    u8 port, struct ib_port_attr *props)
+{
+	struct ipath_ibdev *dev = to_idev(ibdev);
+	struct ipath_devdata *dd = dev->dd;
+	enum ib_mtu mtu;
+	u16 lid = dd->ipath_lid;
+	u64 ibcstat;
+
+	memset(props, 0, sizeof(*props));
+	props->lid = lid ? lid : be16_to_cpu(IB_LID_PERMISSIVE);
+	props->lmc = dd->ipath_lmc;
+	props->sm_lid = dev->sm_lid;
+	props->sm_sl = dev->sm_sl;
+	ibcstat = dd->ipath_lastibcstat;
+	/* map LinkState to IB portinfo values.  */
+	props->state = ipath_ib_linkstate(dd, ibcstat) + 1;
+
+	/* See phys_state_show() */
+	props->phys_state = /* MEA: assumes shift == 0 */
+		ipath_cvt_physportstate[dd->ipath_lastibcstat &
+		dd->ibcs_lts_mask];
+	props->port_cap_flags = dev->port_cap_flags;
+	props->gid_tbl_len = 1;
+	props->max_msg_sz = 0x80000000;
+	props->pkey_tbl_len = ipath_get_npkeys(dd);
+	props->bad_pkey_cntr = ipath_get_cr_errpkey(dd) -
+		dev->z_pkey_violations;
+	props->qkey_viol_cntr = dev->qkey_violations;
+	props->active_width = dd->ipath_link_width_active;
+	/* See rate_show() */
+	props->active_speed = dd->ipath_link_speed_active;
+	props->max_vl_num = 1;		/* VLCap = VL0 */
+	props->init_type_reply = 0;
+
+	props->max_mtu = ipath_mtu4096 ? IB_MTU_4096 : IB_MTU_2048;
+	switch (dd->ipath_ibmtu) {
+	case 4096:
+		mtu = IB_MTU_4096;
+		break;
+	case 2048:
+		mtu = IB_MTU_2048;
+		break;
+	case 1024:
+		mtu = IB_MTU_1024;
+		break;
+	case 512:
+		mtu = IB_MTU_512;
+		break;
+	case 256:
+		mtu = IB_MTU_256;
+		break;
+	default:
+		mtu = IB_MTU_2048;
+	}
+	props->active_mtu = mtu;
+	props->subnet_timeout = dev->subnet_timeout;
+
+	return 0;
+}
+
+static int ipath_modify_device(struct ib_device *device,
+			       int device_modify_mask,
+			       struct ib_device_modify *device_modify)
+{
+	int ret;
+
+	if (device_modify_mask & ~(IB_DEVICE_MODIFY_SYS_IMAGE_GUID |
+				   IB_DEVICE_MODIFY_NODE_DESC)) {
+		ret = -EOPNOTSUPP;
+		goto bail;
+	}
+
+	if (device_modify_mask & IB_DEVICE_MODIFY_NODE_DESC)
+		memcpy(device->node_desc, device_modify->node_desc, 64);
+
+	if (device_modify_mask & IB_DEVICE_MODIFY_SYS_IMAGE_GUID)
+		to_idev(device)->sys_image_guid =
+			cpu_to_be64(device_modify->sys_image_guid);
+
+	ret = 0;
+
+bail:
+	return ret;
+}
+
+static int ipath_modify_port(struct ib_device *ibdev,
+			     u8 port, int port_modify_mask,
+			     struct ib_port_modify *props)
+{
+	struct ipath_ibdev *dev = to_idev(ibdev);
+
+	dev->port_cap_flags |= props->set_port_cap_mask;
+	dev->port_cap_flags &= ~props->clr_port_cap_mask;
+	if (port_modify_mask & IB_PORT_SHUTDOWN)
+		ipath_set_linkstate(dev->dd, IPATH_IB_LINKDOWN);
+	if (port_modify_mask & IB_PORT_RESET_QKEY_CNTR)
+		dev->qkey_violations = 0;
+	return 0;
+}
+
+static int ipath_query_gid(struct ib_device *ibdev, u8 port,
+			   int index, union ib_gid *gid)
+{
+	struct ipath_ibdev *dev = to_idev(ibdev);
+	int ret;
+
+	if (index >= 1) {
+		ret = -EINVAL;
+		goto bail;
+	}
+	gid->global.subnet_prefix = dev->gid_prefix;
+	gid->global.interface_id = dev->dd->ipath_guid;
+
+	ret = 0;
+
+bail:
+	return ret;
+}
+
+static struct ib_pd *ipath_alloc_pd(struct ib_device *ibdev,
+				    struct ib_ucontext *context,
+				    struct ib_udata *udata)
+{
+	struct ipath_ibdev *dev = to_idev(ibdev);
+	struct ipath_pd *pd;
+	struct ib_pd *ret;
+
+	/*
+	 * This is actually totally arbitrary.	Some correctness tests
+	 * assume there's a maximum number of PDs that can be allocated.
+	 * We don't actually have this limit, but we fail the test if
+	 * we allow allocations of more than we report for this value.
+	 */
+
+	pd = kmalloc(sizeof *pd, GFP_KERNEL);
+	if (!pd) {
+		ret = ERR_PTR(-ENOMEM);
+		goto bail;
+	}
+
+	spin_lock(&dev->n_pds_lock);
+	if (dev->n_pds_allocated == ib_ipath_max_pds) {
+		spin_unlock(&dev->n_pds_lock);
+		kfree(pd);
+		ret = ERR_PTR(-ENOMEM);
+		goto bail;
+	}
+
+	dev->n_pds_allocated++;
+	spin_unlock(&dev->n_pds_lock);
+
+	/* ib_alloc_pd() will initialize pd->ibpd. */
+	pd->user = udata != NULL;
+
+	ret = &pd->ibpd;
+
+bail:
+	return ret;
+}
+
+static int ipath_dealloc_pd(struct ib_pd *ibpd)
+{
+	struct ipath_pd *pd = to_ipd(ibpd);
+	struct ipath_ibdev *dev = to_idev(ibpd->device);
+
+	spin_lock(&dev->n_pds_lock);
+	dev->n_pds_allocated--;
+	spin_unlock(&dev->n_pds_lock);
+
+	kfree(pd);
+
+	return 0;
+}
+
+/**
+ * ipath_create_ah - create an address handle
+ * @pd: the protection domain
+ * @ah_attr: the attributes of the AH
+ *
+ * This may be called from interrupt context.
+ */
+static struct ib_ah *ipath_create_ah(struct ib_pd *pd,
+				     struct ib_ah_attr *ah_attr)
+{
+	struct ipath_ah *ah;
+	struct ib_ah *ret;
+	struct ipath_ibdev *dev = to_idev(pd->device);
+	unsigned long flags;
+
+	/* A multicast address requires a GRH (see ch. 8.4.1). */
+	if (ah_attr->dlid >= IPATH_MULTICAST_LID_BASE &&
+	    ah_attr->dlid != IPATH_PERMISSIVE_LID &&
+	    !(ah_attr->ah_flags & IB_AH_GRH)) {
+		ret = ERR_PTR(-EINVAL);
+		goto bail;
+	}
+
+	if (ah_attr->dlid == 0) {
+		ret = ERR_PTR(-EINVAL);
+		goto bail;
+	}
+
+	if (ah_attr->port_num < 1 ||
+	    ah_attr->port_num > pd->device->phys_port_cnt) {
+		ret = ERR_PTR(-EINVAL);
+		goto bail;
+	}
+
+	ah = kmalloc(sizeof *ah, GFP_ATOMIC);
+	if (!ah) {
+		ret = ERR_PTR(-ENOMEM);
+		goto bail;
+	}
+
+	spin_lock_irqsave(&dev->n_ahs_lock, flags);
+	if (dev->n_ahs_allocated == ib_ipath_max_ahs) {
+		spin_unlock_irqrestore(&dev->n_ahs_lock, flags);
+		kfree(ah);
+		ret = ERR_PTR(-ENOMEM);
+		goto bail;
+	}
+
+	dev->n_ahs_allocated++;
+	spin_unlock_irqrestore(&dev->n_ahs_lock, flags);
+
+	/* ib_create_ah() will initialize ah->ibah. */
+	ah->attr = *ah_attr;
+	ah->attr.static_rate = ipath_ib_rate_to_mult(ah_attr->static_rate);
+
+	ret = &ah->ibah;
+
+bail:
+	return ret;
+}
+
+/**
+ * ipath_destroy_ah - destroy an address handle
+ * @ibah: the AH to destroy
+ *
+ * This may be called from interrupt context.
+ */
+static int ipath_destroy_ah(struct ib_ah *ibah)
+{
+	struct ipath_ibdev *dev = to_idev(ibah->device);
+	struct ipath_ah *ah = to_iah(ibah);
+	unsigned long flags;
+
+	spin_lock_irqsave(&dev->n_ahs_lock, flags);
+	dev->n_ahs_allocated--;
+	spin_unlock_irqrestore(&dev->n_ahs_lock, flags);
+
+	kfree(ah);
+
+	return 0;
+}
+
+static int ipath_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr)
+{
+	struct ipath_ah *ah = to_iah(ibah);
+
+	*ah_attr = ah->attr;
+	ah_attr->static_rate = ipath_mult_to_ib_rate(ah->attr.static_rate);
+
+	return 0;
+}
+
+/**
+ * ipath_get_npkeys - return the size of the PKEY table for port 0
+ * @dd: the infinipath device
+ */
+unsigned ipath_get_npkeys(struct ipath_devdata *dd)
+{
+	return ARRAY_SIZE(dd->ipath_pd[0]->port_pkeys);
+}
+
+/**
+ * ipath_get_pkey - return the indexed PKEY from the port PKEY table
+ * @dd: the infinipath device
+ * @index: the PKEY index
+ */
+unsigned ipath_get_pkey(struct ipath_devdata *dd, unsigned index)
+{
+	unsigned ret;
+
+	/* always a kernel port, no locking needed */
+	if (index >= ARRAY_SIZE(dd->ipath_pd[0]->port_pkeys))
+		ret = 0;
+	else
+		ret = dd->ipath_pd[0]->port_pkeys[index];
+
+	return ret;
+}
+
+static int ipath_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
+			    u16 *pkey)
+{
+	struct ipath_ibdev *dev = to_idev(ibdev);
+	int ret;
+
+	if (index >= ipath_get_npkeys(dev->dd)) {
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	*pkey = ipath_get_pkey(dev->dd, index);
+	ret = 0;
+
+bail:
+	return ret;
+}
+
+/**
+ * ipath_alloc_ucontext - allocate a ucontest
+ * @ibdev: the infiniband device
+ * @udata: not used by the InfiniPath driver
+ */
+
+static struct ib_ucontext *ipath_alloc_ucontext(struct ib_device *ibdev,
+						struct ib_udata *udata)
+{
+	struct ipath_ucontext *context;
+	struct ib_ucontext *ret;
+
+	context = kmalloc(sizeof *context, GFP_KERNEL);
+	if (!context) {
+		ret = ERR_PTR(-ENOMEM);
+		goto bail;
+	}
+
+	ret = &context->ibucontext;
+
+bail:
+	return ret;
+}
+
+static int ipath_dealloc_ucontext(struct ib_ucontext *context)
+{
+	kfree(to_iucontext(context));
+	return 0;
+}
+
+static int ipath_verbs_register_sysfs(struct ib_device *dev);
+
+static void __verbs_timer(unsigned long arg)
+{
+	struct ipath_devdata *dd = (struct ipath_devdata *) arg;
+
+	/* Handle verbs layer timeouts. */
+	ipath_ib_timer(dd->verbs_dev);
+
+	mod_timer(&dd->verbs_timer, jiffies + 1);
+}
+
+static int enable_timer(struct ipath_devdata *dd)
+{
+	/*
+	 * Early chips had a design flaw where the chip and kernel idea
+	 * of the tail register don't always agree, and therefore we won't
+	 * get an interrupt on the next packet received.
+	 * If the board supports per packet receive interrupts, use it.
+	 * Otherwise, the timer function periodically checks for packets
+	 * to cover this case.
+	 * Either way, the timer is needed for verbs layer related
+	 * processing.
+	 */
+	if (dd->ipath_flags & IPATH_GPIO_INTR) {
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_debugportselect,
+				 0x2074076542310ULL);
+		/* Enable GPIO bit 2 interrupt */
+		dd->ipath_gpio_mask |= (u64) (1 << IPATH_GPIO_PORT0_BIT);
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_mask,
+				 dd->ipath_gpio_mask);
+	}
+
+	init_timer(&dd->verbs_timer);
+	dd->verbs_timer.function = __verbs_timer;
+	dd->verbs_timer.data = (unsigned long)dd;
+	dd->verbs_timer.expires = jiffies + 1;
+	add_timer(&dd->verbs_timer);
+
+	return 0;
+}
+
+static int disable_timer(struct ipath_devdata *dd)
+{
+	/* Disable GPIO bit 2 interrupt */
+	if (dd->ipath_flags & IPATH_GPIO_INTR) {
+                /* Disable GPIO bit 2 interrupt */
+		dd->ipath_gpio_mask &= ~((u64) (1 << IPATH_GPIO_PORT0_BIT));
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_mask,
+				 dd->ipath_gpio_mask);
+		/*
+		 * We might want to undo changes to debugportselect,
+		 * but how?
+		 */
+	}
+
+	del_timer_sync(&dd->verbs_timer);
+
+	return 0;
+}
+
+/**
+ * ipath_register_ib_device - register our device with the infiniband core
+ * @dd: the device data structure
+ * Return the allocated ipath_ibdev pointer or NULL on error.
+ */
+int ipath_register_ib_device(struct ipath_devdata *dd)
+{
+	struct ipath_verbs_counters cntrs;
+	struct ipath_ibdev *idev;
+	struct ib_device *dev;
+	struct ipath_verbs_txreq *tx;
+	unsigned i;
+	int ret;
+
+	idev = (struct ipath_ibdev *)ib_alloc_device(sizeof *idev);
+	if (idev == NULL) {
+		ret = -ENOMEM;
+		goto bail;
+	}
+
+	dev = &idev->ibdev;
+
+	if (dd->ipath_sdma_descq_cnt) {
+		tx = kmalloc(dd->ipath_sdma_descq_cnt * sizeof *tx,
+			     GFP_KERNEL);
+		if (tx == NULL) {
+			ret = -ENOMEM;
+			goto err_tx;
+		}
+	} else
+		tx = NULL;
+	idev->txreq_bufs = tx;
+
+	/* Only need to initialize non-zero fields. */
+	spin_lock_init(&idev->n_pds_lock);
+	spin_lock_init(&idev->n_ahs_lock);
+	spin_lock_init(&idev->n_cqs_lock);
+	spin_lock_init(&idev->n_qps_lock);
+	spin_lock_init(&idev->n_srqs_lock);
+	spin_lock_init(&idev->n_mcast_grps_lock);
+
+	spin_lock_init(&idev->qp_table.lock);
+	spin_lock_init(&idev->lk_table.lock);
+	idev->sm_lid = __constant_be16_to_cpu(IB_LID_PERMISSIVE);
+	/* Set the prefix to the default value (see ch. 4.1.1) */
+	idev->gid_prefix = __constant_cpu_to_be64(0xfe80000000000000ULL);
+
+	ret = ipath_init_qp_table(idev, ib_ipath_qp_table_size);
+	if (ret)
+		goto err_qp;
+
+	/*
+	 * The top ib_ipath_lkey_table_size bits are used to index the
+	 * table.  The lower 8 bits can be owned by the user (copied from
+	 * the LKEY).  The remaining bits act as a generation number or tag.
+	 */
+	idev->lk_table.max = 1 << ib_ipath_lkey_table_size;
+	idev->lk_table.table = kzalloc(idev->lk_table.max *
+				       sizeof(*idev->lk_table.table),
+				       GFP_KERNEL);
+	if (idev->lk_table.table == NULL) {
+		ret = -ENOMEM;
+		goto err_lk;
+	}
+	INIT_LIST_HEAD(&idev->pending_mmaps);
+	spin_lock_init(&idev->pending_lock);
+	idev->mmap_offset = PAGE_SIZE;
+	spin_lock_init(&idev->mmap_offset_lock);
+	INIT_LIST_HEAD(&idev->pending[0]);
+	INIT_LIST_HEAD(&idev->pending[1]);
+	INIT_LIST_HEAD(&idev->pending[2]);
+	INIT_LIST_HEAD(&idev->piowait);
+	INIT_LIST_HEAD(&idev->rnrwait);
+	INIT_LIST_HEAD(&idev->txreq_free);
+	idev->pending_index = 0;
+	idev->port_cap_flags =
+		IB_PORT_SYS_IMAGE_GUID_SUP | IB_PORT_CLIENT_REG_SUP;
+	if (dd->ipath_flags & IPATH_HAS_LINK_LATENCY)
+		idev->port_cap_flags |= IB_PORT_LINK_LATENCY_SUP;
+	idev->pma_counter_select[0] = IB_PMA_PORT_XMIT_DATA;
+	idev->pma_counter_select[1] = IB_PMA_PORT_RCV_DATA;
+	idev->pma_counter_select[2] = IB_PMA_PORT_XMIT_PKTS;
+	idev->pma_counter_select[3] = IB_PMA_PORT_RCV_PKTS;
+	idev->pma_counter_select[4] = IB_PMA_PORT_XMIT_WAIT;
+
+	/* Snapshot current HW counters to "clear" them. */
+	ipath_get_counters(dd, &cntrs);
+	idev->z_symbol_error_counter = cntrs.symbol_error_counter;
+	idev->z_link_error_recovery_counter =
+		cntrs.link_error_recovery_counter;
+	idev->z_link_downed_counter = cntrs.link_downed_counter;
+	idev->z_port_rcv_errors = cntrs.port_rcv_errors;
+	idev->z_port_rcv_remphys_errors =
+		cntrs.port_rcv_remphys_errors;
+	idev->z_port_xmit_discards = cntrs.port_xmit_discards;
+	idev->z_port_xmit_data = cntrs.port_xmit_data;
+	idev->z_port_rcv_data = cntrs.port_rcv_data;
+	idev->z_port_xmit_packets = cntrs.port_xmit_packets;
+	idev->z_port_rcv_packets = cntrs.port_rcv_packets;
+	idev->z_local_link_integrity_errors =
+		cntrs.local_link_integrity_errors;
+	idev->z_excessive_buffer_overrun_errors =
+		cntrs.excessive_buffer_overrun_errors;
+	idev->z_vl15_dropped = cntrs.vl15_dropped;
+
+	for (i = 0; i < dd->ipath_sdma_descq_cnt; i++, tx++)
+		list_add(&tx->txreq.list, &idev->txreq_free);
+
+	/*
+	 * The system image GUID is supposed to be the same for all
+	 * IB HCAs in a single system but since there can be other
+	 * device types in the system, we can't be sure this is unique.
+	 */
+	if (!sys_image_guid)
+		sys_image_guid = dd->ipath_guid;
+	idev->sys_image_guid = sys_image_guid;
+	idev->ib_unit = dd->ipath_unit;
+	idev->dd = dd;
+
+	strlcpy(dev->name, "ipath%d", IB_DEVICE_NAME_MAX);
+	dev->owner = THIS_MODULE;
+	dev->node_guid = dd->ipath_guid;
+	dev->uverbs_abi_ver = IPATH_UVERBS_ABI_VERSION;
+	dev->uverbs_cmd_mask =
+		(1ull << IB_USER_VERBS_CMD_GET_CONTEXT)		|
+		(1ull << IB_USER_VERBS_CMD_QUERY_DEVICE)	|
+		(1ull << IB_USER_VERBS_CMD_QUERY_PORT)		|
+		(1ull << IB_USER_VERBS_CMD_ALLOC_PD)		|
+		(1ull << IB_USER_VERBS_CMD_DEALLOC_PD)		|
+		(1ull << IB_USER_VERBS_CMD_CREATE_AH)		|
+		(1ull << IB_USER_VERBS_CMD_DESTROY_AH)		|
+		(1ull << IB_USER_VERBS_CMD_QUERY_AH)		|
+		(1ull << IB_USER_VERBS_CMD_REG_MR)		|
+		(1ull << IB_USER_VERBS_CMD_DEREG_MR)		|
+		(1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
+		(1ull << IB_USER_VERBS_CMD_CREATE_CQ)		|
+		(1ull << IB_USER_VERBS_CMD_RESIZE_CQ)		|
+		(1ull << IB_USER_VERBS_CMD_DESTROY_CQ)		|
+		(1ull << IB_USER_VERBS_CMD_POLL_CQ)		|
+		(1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ)	|
+		(1ull << IB_USER_VERBS_CMD_CREATE_QP)		|
+		(1ull << IB_USER_VERBS_CMD_QUERY_QP)		|
+		(1ull << IB_USER_VERBS_CMD_MODIFY_QP)		|
+		(1ull << IB_USER_VERBS_CMD_DESTROY_QP)		|
+		(1ull << IB_USER_VERBS_CMD_POST_SEND)		|
+		(1ull << IB_USER_VERBS_CMD_POST_RECV)		|
+		(1ull << IB_USER_VERBS_CMD_ATTACH_MCAST)	|
+		(1ull << IB_USER_VERBS_CMD_DETACH_MCAST)	|
+		(1ull << IB_USER_VERBS_CMD_CREATE_SRQ)		|
+		(1ull << IB_USER_VERBS_CMD_MODIFY_SRQ)		|
+		(1ull << IB_USER_VERBS_CMD_QUERY_SRQ)		|
+		(1ull << IB_USER_VERBS_CMD_DESTROY_SRQ)		|
+		(1ull << IB_USER_VERBS_CMD_POST_SRQ_RECV);
+	dev->node_type = RDMA_NODE_IB_CA;
+	dev->phys_port_cnt = 1;
+	dev->num_comp_vectors = 1;
+	dev->dma_device = &dd->pcidev->dev;
+	dev->query_device = ipath_query_device;
+	dev->modify_device = ipath_modify_device;
+	dev->query_port = ipath_query_port;
+	dev->modify_port = ipath_modify_port;
+	dev->query_pkey = ipath_query_pkey;
+	dev->query_gid = ipath_query_gid;
+	dev->alloc_ucontext = ipath_alloc_ucontext;
+	dev->dealloc_ucontext = ipath_dealloc_ucontext;
+	dev->alloc_pd = ipath_alloc_pd;
+	dev->dealloc_pd = ipath_dealloc_pd;
+	dev->create_ah = ipath_create_ah;
+	dev->destroy_ah = ipath_destroy_ah;
+	dev->query_ah = ipath_query_ah;
+	dev->create_srq = ipath_create_srq;
+	dev->modify_srq = ipath_modify_srq;
+	dev->query_srq = ipath_query_srq;
+	dev->destroy_srq = ipath_destroy_srq;
+	dev->create_qp = ipath_create_qp;
+	dev->modify_qp = ipath_modify_qp;
+	dev->query_qp = ipath_query_qp;
+	dev->destroy_qp = ipath_destroy_qp;
+	dev->post_send = ipath_post_send;
+	dev->post_recv = ipath_post_receive;
+	dev->post_srq_recv = ipath_post_srq_receive;
+	dev->create_cq = ipath_create_cq;
+	dev->destroy_cq = ipath_destroy_cq;
+	dev->resize_cq = ipath_resize_cq;
+	dev->poll_cq = ipath_poll_cq;
+	dev->req_notify_cq = ipath_req_notify_cq;
+	dev->get_dma_mr = ipath_get_dma_mr;
+	dev->reg_phys_mr = ipath_reg_phys_mr;
+	dev->reg_user_mr = ipath_reg_user_mr;
+	dev->dereg_mr = ipath_dereg_mr;
+	dev->alloc_fmr = ipath_alloc_fmr;
+	dev->map_phys_fmr = ipath_map_phys_fmr;
+	dev->unmap_fmr = ipath_unmap_fmr;
+	dev->dealloc_fmr = ipath_dealloc_fmr;
+	dev->attach_mcast = ipath_multicast_attach;
+	dev->detach_mcast = ipath_multicast_detach;
+	dev->process_mad = ipath_process_mad;
+	dev->mmap = ipath_mmap;
+	dev->dma_ops = &ipath_dma_mapping_ops;
+
+	snprintf(dev->node_desc, sizeof(dev->node_desc),
+		 IPATH_IDSTR " %s", init_utsname()->nodename);
+
+	ret = ib_register_device(dev, NULL);
+	if (ret)
+		goto err_reg;
+
+	ret = ipath_verbs_register_sysfs(dev);
+	if (ret)
+		goto err_class;
+
+	enable_timer(dd);
+
+	goto bail;
+
+err_class:
+	ib_unregister_device(dev);
+err_reg:
+	kfree(idev->lk_table.table);
+err_lk:
+	kfree(idev->qp_table.table);
+err_qp:
+	kfree(idev->txreq_bufs);
+err_tx:
+	ib_dealloc_device(dev);
+	ipath_dev_err(dd, "cannot register verbs: %d!\n", -ret);
+	idev = NULL;
+
+bail:
+	dd->verbs_dev = idev;
+	return ret;
+}
+
+void ipath_unregister_ib_device(struct ipath_ibdev *dev)
+{
+	struct ib_device *ibdev = &dev->ibdev;
+	u32 qps_inuse;
+
+	ib_unregister_device(ibdev);
+
+	disable_timer(dev->dd);
+
+	if (!list_empty(&dev->pending[0]) ||
+	    !list_empty(&dev->pending[1]) ||
+	    !list_empty(&dev->pending[2]))
+		ipath_dev_err(dev->dd, "pending list not empty!\n");
+	if (!list_empty(&dev->piowait))
+		ipath_dev_err(dev->dd, "piowait list not empty!\n");
+	if (!list_empty(&dev->rnrwait))
+		ipath_dev_err(dev->dd, "rnrwait list not empty!\n");
+	if (!ipath_mcast_tree_empty())
+		ipath_dev_err(dev->dd, "multicast table memory leak!\n");
+	/*
+	 * Note that ipath_unregister_ib_device() can be called before all
+	 * the QPs are destroyed!
+	 */
+	qps_inuse = ipath_free_all_qps(&dev->qp_table);
+	if (qps_inuse)
+		ipath_dev_err(dev->dd, "QP memory leak! %u still in use\n",
+			qps_inuse);
+	kfree(dev->qp_table.table);
+	kfree(dev->lk_table.table);
+	kfree(dev->txreq_bufs);
+	ib_dealloc_device(ibdev);
+}
+
+static ssize_t show_rev(struct device *device, struct device_attribute *attr,
+			char *buf)
+{
+	struct ipath_ibdev *dev =
+		container_of(device, struct ipath_ibdev, ibdev.dev);
+
+	return sprintf(buf, "%x\n", dev->dd->ipath_pcirev);
+}
+
+static ssize_t show_hca(struct device *device, struct device_attribute *attr,
+			char *buf)
+{
+	struct ipath_ibdev *dev =
+		container_of(device, struct ipath_ibdev, ibdev.dev);
+	int ret;
+
+	ret = dev->dd->ipath_f_get_boardname(dev->dd, buf, 128);
+	if (ret < 0)
+		goto bail;
+	strcat(buf, "\n");
+	ret = strlen(buf);
+
+bail:
+	return ret;
+}
+
+static ssize_t show_stats(struct device *device, struct device_attribute *attr,
+			  char *buf)
+{
+	struct ipath_ibdev *dev =
+		container_of(device, struct ipath_ibdev, ibdev.dev);
+	int i;
+	int len;
+
+	len = sprintf(buf,
+		      "RC resends  %d\n"
+		      "RC no QACK  %d\n"
+		      "RC ACKs     %d\n"
+		      "RC SEQ NAKs %d\n"
+		      "RC RDMA seq %d\n"
+		      "RC RNR NAKs %d\n"
+		      "RC OTH NAKs %d\n"
+		      "RC timeouts %d\n"
+		      "RC RDMA dup %d\n"
+		      "piobuf wait %d\n"
+		      "unaligned   %d\n"
+		      "PKT drops   %d\n"
+		      "WQE errs    %d\n",
+		      dev->n_rc_resends, dev->n_rc_qacks, dev->n_rc_acks,
+		      dev->n_seq_naks, dev->n_rdma_seq, dev->n_rnr_naks,
+		      dev->n_other_naks, dev->n_timeouts,
+		      dev->n_rdma_dup_busy, dev->n_piowait, dev->n_unaligned,
+		      dev->n_pkt_drops, dev->n_wqe_errs);
+	for (i = 0; i < ARRAY_SIZE(dev->opstats); i++) {
+		const struct ipath_opcode_stats *si = &dev->opstats[i];
+
+		if (!si->n_packets && !si->n_bytes)
+			continue;
+		len += sprintf(buf + len, "%02x %llu/%llu\n", i,
+			       (unsigned long long) si->n_packets,
+			       (unsigned long long) si->n_bytes);
+	}
+	return len;
+}
+
+static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
+static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL);
+static DEVICE_ATTR(board_id, S_IRUGO, show_hca, NULL);
+static DEVICE_ATTR(stats, S_IRUGO, show_stats, NULL);
+
+static struct device_attribute *ipath_class_attributes[] = {
+	&dev_attr_hw_rev,
+	&dev_attr_hca_type,
+	&dev_attr_board_id,
+	&dev_attr_stats
+};
+
+static int ipath_verbs_register_sysfs(struct ib_device *dev)
+{
+	int i;
+	int ret;
+
+	for (i = 0; i < ARRAY_SIZE(ipath_class_attributes); ++i) {
+		ret = device_create_file(&dev->dev,
+				       ipath_class_attributes[i]);
+		if (ret)
+			goto bail;
+	}
+	return 0;
+bail:
+	for (i = 0; i < ARRAY_SIZE(ipath_class_attributes); ++i)
+		device_remove_file(&dev->dev, ipath_class_attributes[i]);
+	return ret;
+}
diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.h b/drivers/infiniband/hw/ipath/ipath_verbs.h
new file mode 100644
index 0000000..ae6cff4
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_verbs.h
@@ -0,0 +1,936 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef IPATH_VERBS_H
+#define IPATH_VERBS_H
+
+#include <linux/types.h>
+#include <linux/spinlock.h>
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/kref.h>
+#include <rdma/ib_pack.h>
+#include <rdma/ib_user_verbs.h>
+
+#include "ipath_kernel.h"
+
+#define IPATH_MAX_RDMA_ATOMIC	4
+
+#define QPN_MAX                 (1 << 24)
+#define QPNMAP_ENTRIES          (QPN_MAX / PAGE_SIZE / BITS_PER_BYTE)
+
+/*
+ * Increment this value if any changes that break userspace ABI
+ * compatibility are made.
+ */
+#define IPATH_UVERBS_ABI_VERSION       2
+
+/*
+ * Define an ib_cq_notify value that is not valid so we know when CQ
+ * notifications are armed.
+ */
+#define IB_CQ_NONE	(IB_CQ_NEXT_COMP + 1)
+
+/* AETH NAK opcode values */
+#define IB_RNR_NAK			0x20
+#define IB_NAK_PSN_ERROR		0x60
+#define IB_NAK_INVALID_REQUEST		0x61
+#define IB_NAK_REMOTE_ACCESS_ERROR	0x62
+#define IB_NAK_REMOTE_OPERATIONAL_ERROR 0x63
+#define IB_NAK_INVALID_RD_REQUEST	0x64
+
+/* Flags for checking QP state (see ib_ipath_state_ops[]) */
+#define IPATH_POST_SEND_OK		0x01
+#define IPATH_POST_RECV_OK		0x02
+#define IPATH_PROCESS_RECV_OK		0x04
+#define IPATH_PROCESS_SEND_OK		0x08
+#define IPATH_PROCESS_NEXT_SEND_OK	0x10
+#define IPATH_FLUSH_SEND		0x20
+#define IPATH_FLUSH_RECV		0x40
+#define IPATH_PROCESS_OR_FLUSH_SEND \
+	(IPATH_PROCESS_SEND_OK | IPATH_FLUSH_SEND)
+
+/* IB Performance Manager status values */
+#define IB_PMA_SAMPLE_STATUS_DONE	0x00
+#define IB_PMA_SAMPLE_STATUS_STARTED	0x01
+#define IB_PMA_SAMPLE_STATUS_RUNNING	0x02
+
+/* Mandatory IB performance counter select values. */
+#define IB_PMA_PORT_XMIT_DATA	cpu_to_be16(0x0001)
+#define IB_PMA_PORT_RCV_DATA	cpu_to_be16(0x0002)
+#define IB_PMA_PORT_XMIT_PKTS	cpu_to_be16(0x0003)
+#define IB_PMA_PORT_RCV_PKTS	cpu_to_be16(0x0004)
+#define IB_PMA_PORT_XMIT_WAIT	cpu_to_be16(0x0005)
+
+struct ib_reth {
+	__be64 vaddr;
+	__be32 rkey;
+	__be32 length;
+} __attribute__ ((packed));
+
+struct ib_atomic_eth {
+	__be32 vaddr[2];	/* unaligned so access as 2 32-bit words */
+	__be32 rkey;
+	__be64 swap_data;
+	__be64 compare_data;
+} __attribute__ ((packed));
+
+struct ipath_other_headers {
+	__be32 bth[3];
+	union {
+		struct {
+			__be32 deth[2];
+			__be32 imm_data;
+		} ud;
+		struct {
+			struct ib_reth reth;
+			__be32 imm_data;
+		} rc;
+		struct {
+			__be32 aeth;
+			__be32 atomic_ack_eth[2];
+		} at;
+		__be32 imm_data;
+		__be32 aeth;
+		struct ib_atomic_eth atomic_eth;
+	} u;
+} __attribute__ ((packed));
+
+/*
+ * Note that UD packets with a GRH header are 8+40+12+8 = 68 bytes
+ * long (72 w/ imm_data).  Only the first 56 bytes of the IB header
+ * will be in the eager header buffer.  The remaining 12 or 16 bytes
+ * are in the data buffer.
+ */
+struct ipath_ib_header {
+	__be16 lrh[4];
+	union {
+		struct {
+			struct ib_grh grh;
+			struct ipath_other_headers oth;
+		} l;
+		struct ipath_other_headers oth;
+	} u;
+} __attribute__ ((packed));
+
+struct ipath_pio_header {
+	__le32 pbc[2];
+	struct ipath_ib_header hdr;
+} __attribute__ ((packed));
+
+/*
+ * There is one struct ipath_mcast for each multicast GID.
+ * All attached QPs are then stored as a list of
+ * struct ipath_mcast_qp.
+ */
+struct ipath_mcast_qp {
+	struct list_head list;
+	struct ipath_qp *qp;
+};
+
+struct ipath_mcast {
+	struct rb_node rb_node;
+	union ib_gid mgid;
+	struct list_head qp_list;
+	wait_queue_head_t wait;
+	atomic_t refcount;
+	int n_attached;
+};
+
+/* Protection domain */
+struct ipath_pd {
+	struct ib_pd ibpd;
+	int user;		/* non-zero if created from user space */
+};
+
+/* Address Handle */
+struct ipath_ah {
+	struct ib_ah ibah;
+	struct ib_ah_attr attr;
+};
+
+/*
+ * This structure is used by ipath_mmap() to validate an offset
+ * when an mmap() request is made.  The vm_area_struct then uses
+ * this as its vm_private_data.
+ */
+struct ipath_mmap_info {
+	struct list_head pending_mmaps;
+	struct ib_ucontext *context;
+	void *obj;
+	__u64 offset;
+	struct kref ref;
+	unsigned size;
+};
+
+/*
+ * This structure is used to contain the head pointer, tail pointer,
+ * and completion queue entries as a single memory allocation so
+ * it can be mmap'ed into user space.
+ */
+struct ipath_cq_wc {
+	u32 head;		/* index of next entry to fill */
+	u32 tail;		/* index of next ib_poll_cq() entry */
+	union {
+		/* these are actually size ibcq.cqe + 1 */
+		struct ib_uverbs_wc uqueue[0];
+		struct ib_wc kqueue[0];
+	};
+};
+
+/*
+ * The completion queue structure.
+ */
+struct ipath_cq {
+	struct ib_cq ibcq;
+	struct tasklet_struct comptask;
+	spinlock_t lock;
+	u8 notify;
+	u8 triggered;
+	struct ipath_cq_wc *queue;
+	struct ipath_mmap_info *ip;
+};
+
+/*
+ * A segment is a linear region of low physical memory.
+ * XXX Maybe we should use phys addr here and kmap()/kunmap().
+ * Used by the verbs layer.
+ */
+struct ipath_seg {
+	void *vaddr;
+	size_t length;
+};
+
+/* The number of ipath_segs that fit in a page. */
+#define IPATH_SEGSZ     (PAGE_SIZE / sizeof (struct ipath_seg))
+
+struct ipath_segarray {
+	struct ipath_seg segs[IPATH_SEGSZ];
+};
+
+struct ipath_mregion {
+	struct ib_pd *pd;	/* shares refcnt of ibmr.pd */
+	u64 user_base;		/* User's address for this region */
+	u64 iova;		/* IB start address of this region */
+	size_t length;
+	u32 lkey;
+	u32 offset;		/* offset (bytes) to start of region */
+	int access_flags;
+	u32 max_segs;		/* number of ipath_segs in all the arrays */
+	u32 mapsz;		/* size of the map array */
+	struct ipath_segarray *map[0];	/* the segments */
+};
+
+/*
+ * These keep track of the copy progress within a memory region.
+ * Used by the verbs layer.
+ */
+struct ipath_sge {
+	struct ipath_mregion *mr;
+	void *vaddr;		/* kernel virtual address of segment */
+	u32 sge_length;		/* length of the SGE */
+	u32 length;		/* remaining length of the segment */
+	u16 m;			/* current index: mr->map[m] */
+	u16 n;			/* current index: mr->map[m]->segs[n] */
+};
+
+/* Memory region */
+struct ipath_mr {
+	struct ib_mr ibmr;
+	struct ib_umem *umem;
+	struct ipath_mregion mr;	/* must be last */
+};
+
+/*
+ * Send work request queue entry.
+ * The size of the sg_list is determined when the QP is created and stored
+ * in qp->s_max_sge.
+ */
+struct ipath_swqe {
+	struct ib_send_wr wr;	/* don't use wr.sg_list */
+	u32 psn;		/* first packet sequence number */
+	u32 lpsn;		/* last packet sequence number */
+	u32 ssn;		/* send sequence number */
+	u32 length;		/* total length of data in sg_list */
+	struct ipath_sge sg_list[0];
+};
+
+/*
+ * Receive work request queue entry.
+ * The size of the sg_list is determined when the QP (or SRQ) is created
+ * and stored in qp->r_rq.max_sge (or srq->rq.max_sge).
+ */
+struct ipath_rwqe {
+	u64 wr_id;
+	u8 num_sge;
+	struct ib_sge sg_list[0];
+};
+
+/*
+ * This structure is used to contain the head pointer, tail pointer,
+ * and receive work queue entries as a single memory allocation so
+ * it can be mmap'ed into user space.
+ * Note that the wq array elements are variable size so you can't
+ * just index into the array to get the N'th element;
+ * use get_rwqe_ptr() instead.
+ */
+struct ipath_rwq {
+	u32 head;		/* new work requests posted to the head */
+	u32 tail;		/* receives pull requests from here. */
+	struct ipath_rwqe wq[0];
+};
+
+struct ipath_rq {
+	struct ipath_rwq *wq;
+	spinlock_t lock;
+	u32 size;		/* size of RWQE array */
+	u8 max_sge;
+};
+
+struct ipath_srq {
+	struct ib_srq ibsrq;
+	struct ipath_rq rq;
+	struct ipath_mmap_info *ip;
+	/* send signal when number of RWQEs < limit */
+	u32 limit;
+};
+
+struct ipath_sge_state {
+	struct ipath_sge *sg_list;      /* next SGE to be used if any */
+	struct ipath_sge sge;   /* progress state for the current SGE */
+	u8 num_sge;
+	u8 static_rate;
+};
+
+/*
+ * This structure holds the information that the send tasklet needs
+ * to send a RDMA read response or atomic operation.
+ */
+struct ipath_ack_entry {
+	u8 opcode;
+	u8 sent;
+	u32 psn;
+	union {
+		struct ipath_sge_state rdma_sge;
+		u64 atomic_data;
+	};
+};
+
+/*
+ * Variables prefixed with s_ are for the requester (sender).
+ * Variables prefixed with r_ are for the responder (receiver).
+ * Variables prefixed with ack_ are for responder replies.
+ *
+ * Common variables are protected by both r_rq.lock and s_lock in that order
+ * which only happens in modify_qp() or changing the QP 'state'.
+ */
+struct ipath_qp {
+	struct ib_qp ibqp;
+	struct ipath_qp *next;		/* link list for QPN hash table */
+	struct ipath_qp *timer_next;	/* link list for ipath_ib_timer() */
+	struct ipath_qp *pio_next;	/* link for ipath_ib_piobufavail() */
+	struct list_head piowait;	/* link for wait PIO buf */
+	struct list_head timerwait;	/* link for waiting for timeouts */
+	struct ib_ah_attr remote_ah_attr;
+	struct ipath_ib_header s_hdr;	/* next packet header to send */
+	atomic_t refcount;
+	wait_queue_head_t wait;
+	wait_queue_head_t wait_dma;
+	struct tasklet_struct s_task;
+	struct ipath_mmap_info *ip;
+	struct ipath_sge_state *s_cur_sge;
+	struct ipath_verbs_txreq *s_tx;
+	struct ipath_sge_state s_sge;	/* current send request data */
+	struct ipath_ack_entry s_ack_queue[IPATH_MAX_RDMA_ATOMIC + 1];
+	struct ipath_sge_state s_ack_rdma_sge;
+	struct ipath_sge_state s_rdma_read_sge;
+	struct ipath_sge_state r_sge;	/* current receive data */
+	spinlock_t s_lock;
+	atomic_t s_dma_busy;
+	u16 s_pkt_delay;
+	u16 s_hdrwords;		/* size of s_hdr in 32 bit words */
+	u32 s_cur_size;		/* size of send packet in bytes */
+	u32 s_len;		/* total length of s_sge */
+	u32 s_rdma_read_len;	/* total length of s_rdma_read_sge */
+	u32 s_next_psn;		/* PSN for next request */
+	u32 s_last_psn;		/* last response PSN processed */
+	u32 s_psn;		/* current packet sequence number */
+	u32 s_ack_rdma_psn;	/* PSN for sending RDMA read responses */
+	u32 s_ack_psn;		/* PSN for acking sends and RDMA writes */
+	u32 s_rnr_timeout;	/* number of milliseconds for RNR timeout */
+	u32 r_ack_psn;		/* PSN for next ACK or atomic ACK */
+	u64 r_wr_id;		/* ID for current receive WQE */
+	unsigned long r_aflags;
+	u32 r_len;		/* total length of r_sge */
+	u32 r_rcv_len;		/* receive data len processed */
+	u32 r_psn;		/* expected rcv packet sequence number */
+	u32 r_msn;		/* message sequence number */
+	u8 state;		/* QP state */
+	u8 s_state;		/* opcode of last packet sent */
+	u8 s_ack_state;		/* opcode of packet to ACK */
+	u8 s_nak_state;		/* non-zero if NAK is pending */
+	u8 r_state;		/* opcode of last packet received */
+	u8 r_nak_state;		/* non-zero if NAK is pending */
+	u8 r_min_rnr_timer;	/* retry timeout value for RNR NAKs */
+	u8 r_flags;
+	u8 r_max_rd_atomic;	/* max number of RDMA read/atomic to receive */
+	u8 r_head_ack_queue;	/* index into s_ack_queue[] */
+	u8 qp_access_flags;
+	u8 s_max_sge;		/* size of s_wq->sg_list */
+	u8 s_retry_cnt;		/* number of times to retry */
+	u8 s_rnr_retry_cnt;
+	u8 s_retry;		/* requester retry counter */
+	u8 s_rnr_retry;		/* requester RNR retry counter */
+	u8 s_pkey_index;	/* PKEY index to use */
+	u8 s_max_rd_atomic;	/* max number of RDMA read/atomic to send */
+	u8 s_num_rd_atomic;	/* number of RDMA read/atomic pending */
+	u8 s_tail_ack_queue;	/* index into s_ack_queue[] */
+	u8 s_flags;
+	u8 s_dmult;
+	u8 s_draining;
+	u8 timeout;		/* Timeout for this QP */
+	enum ib_mtu path_mtu;
+	u32 remote_qpn;
+	u32 qkey;		/* QKEY for this QP (for UD or RD) */
+	u32 s_size;		/* send work queue size */
+	u32 s_head;		/* new entries added here */
+	u32 s_tail;		/* next entry to process */
+	u32 s_cur;		/* current work queue entry */
+	u32 s_last;		/* last un-ACK'ed entry */
+	u32 s_ssn;		/* SSN of tail entry */
+	u32 s_lsn;		/* limit sequence number (credit) */
+	struct ipath_swqe *s_wq;	/* send work queue */
+	struct ipath_swqe *s_wqe;
+	struct ipath_sge *r_ud_sg_list;
+	struct ipath_rq r_rq;		/* receive work queue */
+	struct ipath_sge r_sg_list[0];	/* verified SGEs */
+};
+
+/*
+ * Atomic bit definitions for r_aflags.
+ */
+#define IPATH_R_WRID_VALID	0
+
+/*
+ * Bit definitions for r_flags.
+ */
+#define IPATH_R_REUSE_SGE	0x01
+#define IPATH_R_RDMAR_SEQ	0x02
+
+/*
+ * Bit definitions for s_flags.
+ *
+ * IPATH_S_FENCE_PENDING - waiting for all prior RDMA read or atomic SWQEs
+ *			   before processing the next SWQE
+ * IPATH_S_RDMAR_PENDING - waiting for any RDMA read or atomic SWQEs
+ *			   before processing the next SWQE
+ * IPATH_S_WAITING - waiting for RNR timeout or send buffer available.
+ * IPATH_S_WAIT_SSN_CREDIT - waiting for RC credits to process next SWQE
+ * IPATH_S_WAIT_DMA - waiting for send DMA queue to drain before generating
+ *		      next send completion entry not via send DMA.
+ */
+#define IPATH_S_SIGNAL_REQ_WR	0x01
+#define IPATH_S_FENCE_PENDING	0x02
+#define IPATH_S_RDMAR_PENDING	0x04
+#define IPATH_S_ACK_PENDING	0x08
+#define IPATH_S_BUSY		0x10
+#define IPATH_S_WAITING		0x20
+#define IPATH_S_WAIT_SSN_CREDIT	0x40
+#define IPATH_S_WAIT_DMA	0x80
+
+#define IPATH_S_ANY_WAIT (IPATH_S_FENCE_PENDING | IPATH_S_RDMAR_PENDING | \
+	IPATH_S_WAITING | IPATH_S_WAIT_SSN_CREDIT | IPATH_S_WAIT_DMA)
+
+#define IPATH_PSN_CREDIT	512
+
+/*
+ * Since struct ipath_swqe is not a fixed size, we can't simply index into
+ * struct ipath_qp.s_wq.  This function does the array index computation.
+ */
+static inline struct ipath_swqe *get_swqe_ptr(struct ipath_qp *qp,
+					      unsigned n)
+{
+	return (struct ipath_swqe *)((char *)qp->s_wq +
+				     (sizeof(struct ipath_swqe) +
+				      qp->s_max_sge *
+				      sizeof(struct ipath_sge)) * n);
+}
+
+/*
+ * Since struct ipath_rwqe is not a fixed size, we can't simply index into
+ * struct ipath_rwq.wq.  This function does the array index computation.
+ */
+static inline struct ipath_rwqe *get_rwqe_ptr(struct ipath_rq *rq,
+					      unsigned n)
+{
+	return (struct ipath_rwqe *)
+		((char *) rq->wq->wq +
+		 (sizeof(struct ipath_rwqe) +
+		  rq->max_sge * sizeof(struct ib_sge)) * n);
+}
+
+/*
+ * QPN-map pages start out as NULL, they get allocated upon
+ * first use and are never deallocated. This way,
+ * large bitmaps are not allocated unless large numbers of QPs are used.
+ */
+struct qpn_map {
+	atomic_t n_free;
+	void *page;
+};
+
+struct ipath_qp_table {
+	spinlock_t lock;
+	u32 last;		/* last QP number allocated */
+	u32 max;		/* size of the hash table */
+	u32 nmaps;		/* size of the map table */
+	struct ipath_qp **table;
+	/* bit map of free numbers */
+	struct qpn_map map[QPNMAP_ENTRIES];
+};
+
+struct ipath_lkey_table {
+	spinlock_t lock;
+	u32 next;		/* next unused index (speeds search) */
+	u32 gen;		/* generation count */
+	u32 max;		/* size of the table */
+	struct ipath_mregion **table;
+};
+
+struct ipath_opcode_stats {
+	u64 n_packets;		/* number of packets */
+	u64 n_bytes;		/* total number of bytes */
+};
+
+struct ipath_ibdev {
+	struct ib_device ibdev;
+	struct ipath_devdata *dd;
+	struct list_head pending_mmaps;
+	spinlock_t mmap_offset_lock;
+	u32 mmap_offset;
+	int ib_unit;		/* This is the device number */
+	u16 sm_lid;		/* in host order */
+	u8 sm_sl;
+	u8 mkeyprot;
+	/* non-zero when timer is set */
+	unsigned long mkey_lease_timeout;
+
+	/* The following fields are really per port. */
+	struct ipath_qp_table qp_table;
+	struct ipath_lkey_table lk_table;
+	struct list_head pending[3];	/* FIFO of QPs waiting for ACKs */
+	struct list_head piowait;	/* list for wait PIO buf */
+	struct list_head txreq_free;
+	void *txreq_bufs;
+	/* list of QPs waiting for RNR timer */
+	struct list_head rnrwait;
+	spinlock_t pending_lock;
+	__be64 sys_image_guid;	/* in network order */
+	__be64 gid_prefix;	/* in network order */
+	__be64 mkey;
+
+	u32 n_pds_allocated;	/* number of PDs allocated for device */
+	spinlock_t n_pds_lock;
+	u32 n_ahs_allocated;	/* number of AHs allocated for device */
+	spinlock_t n_ahs_lock;
+	u32 n_cqs_allocated;	/* number of CQs allocated for device */
+	spinlock_t n_cqs_lock;
+	u32 n_qps_allocated;	/* number of QPs allocated for device */
+	spinlock_t n_qps_lock;
+	u32 n_srqs_allocated;	/* number of SRQs allocated for device */
+	spinlock_t n_srqs_lock;
+	u32 n_mcast_grps_allocated; /* number of mcast groups allocated */
+	spinlock_t n_mcast_grps_lock;
+
+	u64 ipath_sword;	/* total dwords sent (sample result) */
+	u64 ipath_rword;	/* total dwords received (sample result) */
+	u64 ipath_spkts;	/* total packets sent (sample result) */
+	u64 ipath_rpkts;	/* total packets received (sample result) */
+	/* # of ticks no data sent (sample result) */
+	u64 ipath_xmit_wait;
+	u64 rcv_errors;		/* # of packets with SW detected rcv errs */
+	u64 n_unicast_xmit;	/* total unicast packets sent */
+	u64 n_unicast_rcv;	/* total unicast packets received */
+	u64 n_multicast_xmit;	/* total multicast packets sent */
+	u64 n_multicast_rcv;	/* total multicast packets received */
+	u64 z_symbol_error_counter;		/* starting count for PMA */
+	u64 z_link_error_recovery_counter;	/* starting count for PMA */
+	u64 z_link_downed_counter;		/* starting count for PMA */
+	u64 z_port_rcv_errors;			/* starting count for PMA */
+	u64 z_port_rcv_remphys_errors;		/* starting count for PMA */
+	u64 z_port_xmit_discards;		/* starting count for PMA */
+	u64 z_port_xmit_data;			/* starting count for PMA */
+	u64 z_port_rcv_data;			/* starting count for PMA */
+	u64 z_port_xmit_packets;		/* starting count for PMA */
+	u64 z_port_rcv_packets;			/* starting count for PMA */
+	u32 z_pkey_violations;			/* starting count for PMA */
+	u32 z_local_link_integrity_errors;	/* starting count for PMA */
+	u32 z_excessive_buffer_overrun_errors;	/* starting count for PMA */
+	u32 z_vl15_dropped;			/* starting count for PMA */
+	u32 n_rc_resends;
+	u32 n_rc_acks;
+	u32 n_rc_qacks;
+	u32 n_seq_naks;
+	u32 n_rdma_seq;
+	u32 n_rnr_naks;
+	u32 n_other_naks;
+	u32 n_timeouts;
+	u32 n_pkt_drops;
+	u32 n_vl15_dropped;
+	u32 n_wqe_errs;
+	u32 n_rdma_dup_busy;
+	u32 n_piowait;
+	u32 n_unaligned;
+	u32 port_cap_flags;
+	u32 pma_sample_start;
+	u32 pma_sample_interval;
+	__be16 pma_counter_select[5];
+	u16 pma_tag;
+	u16 qkey_violations;
+	u16 mkey_violations;
+	u16 mkey_lease_period;
+	u16 pending_index;	/* which pending queue is active */
+	u8 pma_sample_status;
+	u8 subnet_timeout;
+	u8 vl_high_limit;
+	struct ipath_opcode_stats opstats[128];
+};
+
+struct ipath_verbs_counters {
+	u64 symbol_error_counter;
+	u64 link_error_recovery_counter;
+	u64 link_downed_counter;
+	u64 port_rcv_errors;
+	u64 port_rcv_remphys_errors;
+	u64 port_xmit_discards;
+	u64 port_xmit_data;
+	u64 port_rcv_data;
+	u64 port_xmit_packets;
+	u64 port_rcv_packets;
+	u32 local_link_integrity_errors;
+	u32 excessive_buffer_overrun_errors;
+	u32 vl15_dropped;
+};
+
+struct ipath_verbs_txreq {
+	struct ipath_qp         *qp;
+	struct ipath_swqe       *wqe;
+	u32                      map_len;
+	u32                      len;
+	struct ipath_sge_state  *ss;
+	struct ipath_pio_header  hdr;
+	struct ipath_sdma_txreq  txreq;
+};
+
+static inline struct ipath_mr *to_imr(struct ib_mr *ibmr)
+{
+	return container_of(ibmr, struct ipath_mr, ibmr);
+}
+
+static inline struct ipath_pd *to_ipd(struct ib_pd *ibpd)
+{
+	return container_of(ibpd, struct ipath_pd, ibpd);
+}
+
+static inline struct ipath_ah *to_iah(struct ib_ah *ibah)
+{
+	return container_of(ibah, struct ipath_ah, ibah);
+}
+
+static inline struct ipath_cq *to_icq(struct ib_cq *ibcq)
+{
+	return container_of(ibcq, struct ipath_cq, ibcq);
+}
+
+static inline struct ipath_srq *to_isrq(struct ib_srq *ibsrq)
+{
+	return container_of(ibsrq, struct ipath_srq, ibsrq);
+}
+
+static inline struct ipath_qp *to_iqp(struct ib_qp *ibqp)
+{
+	return container_of(ibqp, struct ipath_qp, ibqp);
+}
+
+static inline struct ipath_ibdev *to_idev(struct ib_device *ibdev)
+{
+	return container_of(ibdev, struct ipath_ibdev, ibdev);
+}
+
+/*
+ * This must be called with s_lock held.
+ */
+static inline void ipath_schedule_send(struct ipath_qp *qp)
+{
+	if (qp->s_flags & IPATH_S_ANY_WAIT)
+		qp->s_flags &= ~IPATH_S_ANY_WAIT;
+	if (!(qp->s_flags & IPATH_S_BUSY))
+		tasklet_hi_schedule(&qp->s_task);
+}
+
+int ipath_process_mad(struct ib_device *ibdev,
+		      int mad_flags,
+		      u8 port_num,
+		      struct ib_wc *in_wc,
+		      struct ib_grh *in_grh,
+		      struct ib_mad *in_mad, struct ib_mad *out_mad);
+
+/*
+ * Compare the lower 24 bits of the two values.
+ * Returns an integer <, ==, or > than zero.
+ */
+static inline int ipath_cmp24(u32 a, u32 b)
+{
+	return (((int) a) - ((int) b)) << 8;
+}
+
+struct ipath_mcast *ipath_mcast_find(union ib_gid *mgid);
+
+int ipath_snapshot_counters(struct ipath_devdata *dd, u64 *swords,
+			    u64 *rwords, u64 *spkts, u64 *rpkts,
+			    u64 *xmit_wait);
+
+int ipath_get_counters(struct ipath_devdata *dd,
+		       struct ipath_verbs_counters *cntrs);
+
+int ipath_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid);
+
+int ipath_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid);
+
+int ipath_mcast_tree_empty(void);
+
+__be32 ipath_compute_aeth(struct ipath_qp *qp);
+
+struct ipath_qp *ipath_lookup_qpn(struct ipath_qp_table *qpt, u32 qpn);
+
+struct ib_qp *ipath_create_qp(struct ib_pd *ibpd,
+			      struct ib_qp_init_attr *init_attr,
+			      struct ib_udata *udata);
+
+int ipath_destroy_qp(struct ib_qp *ibqp);
+
+int ipath_error_qp(struct ipath_qp *qp, enum ib_wc_status err);
+
+int ipath_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+		    int attr_mask, struct ib_udata *udata);
+
+int ipath_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+		   int attr_mask, struct ib_qp_init_attr *init_attr);
+
+unsigned ipath_free_all_qps(struct ipath_qp_table *qpt);
+
+int ipath_init_qp_table(struct ipath_ibdev *idev, int size);
+
+void ipath_get_credit(struct ipath_qp *qp, u32 aeth);
+
+unsigned ipath_ib_rate_to_mult(enum ib_rate rate);
+
+int ipath_verbs_send(struct ipath_qp *qp, struct ipath_ib_header *hdr,
+		     u32 hdrwords, struct ipath_sge_state *ss, u32 len);
+
+void ipath_copy_sge(struct ipath_sge_state *ss, void *data, u32 length);
+
+void ipath_skip_sge(struct ipath_sge_state *ss, u32 length);
+
+void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
+		  int has_grh, void *data, u32 tlen, struct ipath_qp *qp);
+
+void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
+		  int has_grh, void *data, u32 tlen, struct ipath_qp *qp);
+
+void ipath_restart_rc(struct ipath_qp *qp, u32 psn);
+
+void ipath_rc_error(struct ipath_qp *qp, enum ib_wc_status err);
+
+int ipath_post_ud_send(struct ipath_qp *qp, struct ib_send_wr *wr);
+
+void ipath_ud_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
+		  int has_grh, void *data, u32 tlen, struct ipath_qp *qp);
+
+int ipath_alloc_lkey(struct ipath_lkey_table *rkt,
+		     struct ipath_mregion *mr);
+
+void ipath_free_lkey(struct ipath_lkey_table *rkt, u32 lkey);
+
+int ipath_lkey_ok(struct ipath_qp *qp, struct ipath_sge *isge,
+		  struct ib_sge *sge, int acc);
+
+int ipath_rkey_ok(struct ipath_qp *qp, struct ipath_sge_state *ss,
+		  u32 len, u64 vaddr, u32 rkey, int acc);
+
+int ipath_post_srq_receive(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
+			   struct ib_recv_wr **bad_wr);
+
+struct ib_srq *ipath_create_srq(struct ib_pd *ibpd,
+				struct ib_srq_init_attr *srq_init_attr,
+				struct ib_udata *udata);
+
+int ipath_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
+		     enum ib_srq_attr_mask attr_mask,
+		     struct ib_udata *udata);
+
+int ipath_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr);
+
+int ipath_destroy_srq(struct ib_srq *ibsrq);
+
+void ipath_cq_enter(struct ipath_cq *cq, struct ib_wc *entry, int sig);
+
+int ipath_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry);
+
+struct ib_cq *ipath_create_cq(struct ib_device *ibdev, int entries, int comp_vector,
+			      struct ib_ucontext *context,
+			      struct ib_udata *udata);
+
+int ipath_destroy_cq(struct ib_cq *ibcq);
+
+int ipath_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags);
+
+int ipath_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata);
+
+struct ib_mr *ipath_get_dma_mr(struct ib_pd *pd, int acc);
+
+struct ib_mr *ipath_reg_phys_mr(struct ib_pd *pd,
+				struct ib_phys_buf *buffer_list,
+				int num_phys_buf, int acc, u64 *iova_start);
+
+struct ib_mr *ipath_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+				u64 virt_addr, int mr_access_flags,
+				struct ib_udata *udata);
+
+int ipath_dereg_mr(struct ib_mr *ibmr);
+
+struct ib_fmr *ipath_alloc_fmr(struct ib_pd *pd, int mr_access_flags,
+			       struct ib_fmr_attr *fmr_attr);
+
+int ipath_map_phys_fmr(struct ib_fmr *ibfmr, u64 * page_list,
+		       int list_len, u64 iova);
+
+int ipath_unmap_fmr(struct list_head *fmr_list);
+
+int ipath_dealloc_fmr(struct ib_fmr *ibfmr);
+
+void ipath_release_mmap_info(struct kref *ref);
+
+struct ipath_mmap_info *ipath_create_mmap_info(struct ipath_ibdev *dev,
+					       u32 size,
+					       struct ib_ucontext *context,
+					       void *obj);
+
+void ipath_update_mmap_info(struct ipath_ibdev *dev,
+			    struct ipath_mmap_info *ip,
+			    u32 size, void *obj);
+
+int ipath_mmap(struct ib_ucontext *context, struct vm_area_struct *vma);
+
+void ipath_insert_rnr_queue(struct ipath_qp *qp);
+
+int ipath_init_sge(struct ipath_qp *qp, struct ipath_rwqe *wqe,
+		   u32 *lengthp, struct ipath_sge_state *ss);
+
+int ipath_get_rwqe(struct ipath_qp *qp, int wr_id_only);
+
+u32 ipath_make_grh(struct ipath_ibdev *dev, struct ib_grh *hdr,
+		   struct ib_global_route *grh, u32 hwords, u32 nwords);
+
+void ipath_make_ruc_header(struct ipath_ibdev *dev, struct ipath_qp *qp,
+			   struct ipath_other_headers *ohdr,
+			   u32 bth0, u32 bth2);
+
+void ipath_do_send(unsigned long data);
+
+void ipath_send_complete(struct ipath_qp *qp, struct ipath_swqe *wqe,
+			 enum ib_wc_status status);
+
+int ipath_make_rc_req(struct ipath_qp *qp);
+
+int ipath_make_uc_req(struct ipath_qp *qp);
+
+int ipath_make_ud_req(struct ipath_qp *qp);
+
+int ipath_register_ib_device(struct ipath_devdata *);
+
+void ipath_unregister_ib_device(struct ipath_ibdev *);
+
+void ipath_ib_rcv(struct ipath_ibdev *, void *, void *, u32);
+
+int ipath_ib_piobufavail(struct ipath_ibdev *);
+
+unsigned ipath_get_npkeys(struct ipath_devdata *);
+
+u32 ipath_get_cr_errpkey(struct ipath_devdata *);
+
+unsigned ipath_get_pkey(struct ipath_devdata *, unsigned);
+
+extern const enum ib_wc_opcode ib_ipath_wc_opcode[];
+
+/*
+ * Below converts HCA-specific LinkTrainingState to IB PhysPortState
+ * values.
+ */
+extern const u8 ipath_cvt_physportstate[];
+#define IB_PHYSPORTSTATE_SLEEP 1
+#define IB_PHYSPORTSTATE_POLL 2
+#define IB_PHYSPORTSTATE_DISABLED 3
+#define IB_PHYSPORTSTATE_CFG_TRAIN 4
+#define IB_PHYSPORTSTATE_LINKUP 5
+#define IB_PHYSPORTSTATE_LINK_ERR_RECOVER 6
+
+extern const int ib_ipath_state_ops[];
+
+extern unsigned int ib_ipath_lkey_table_size;
+
+extern unsigned int ib_ipath_max_cqes;
+
+extern unsigned int ib_ipath_max_cqs;
+
+extern unsigned int ib_ipath_max_qp_wrs;
+
+extern unsigned int ib_ipath_max_qps;
+
+extern unsigned int ib_ipath_max_sges;
+
+extern unsigned int ib_ipath_max_mcast_grps;
+
+extern unsigned int ib_ipath_max_mcast_qp_attached;
+
+extern unsigned int ib_ipath_max_srqs;
+
+extern unsigned int ib_ipath_max_srq_sges;
+
+extern unsigned int ib_ipath_max_srq_wrs;
+
+extern const u32 ib_ipath_rnr_table[];
+
+extern struct ib_dma_mapping_ops ipath_dma_mapping_ops;
+
+#endif				/* IPATH_VERBS_H */
diff --git a/drivers/infiniband/hw/ipath/ipath_verbs_mcast.c b/drivers/infiniband/hw/ipath/ipath_verbs_mcast.c
new file mode 100644
index 0000000..6216ea9
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_verbs_mcast.c
@@ -0,0 +1,364 @@
+/*
+ * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/rculist.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+
+#include "ipath_verbs.h"
+
+/*
+ * Global table of GID to attached QPs.
+ * The table is global to all ipath devices since a send from one QP/device
+ * needs to be locally routed to any locally attached QPs on the same
+ * or different device.
+ */
+static struct rb_root mcast_tree;
+static DEFINE_SPINLOCK(mcast_lock);
+
+/**
+ * ipath_mcast_qp_alloc - alloc a struct to link a QP to mcast GID struct
+ * @qp: the QP to link
+ */
+static struct ipath_mcast_qp *ipath_mcast_qp_alloc(struct ipath_qp *qp)
+{
+	struct ipath_mcast_qp *mqp;
+
+	mqp = kmalloc(sizeof *mqp, GFP_KERNEL);
+	if (!mqp)
+		goto bail;
+
+	mqp->qp = qp;
+	atomic_inc(&qp->refcount);
+
+bail:
+	return mqp;
+}
+
+static void ipath_mcast_qp_free(struct ipath_mcast_qp *mqp)
+{
+	struct ipath_qp *qp = mqp->qp;
+
+	/* Notify ipath_destroy_qp() if it is waiting. */
+	if (atomic_dec_and_test(&qp->refcount))
+		wake_up(&qp->wait);
+
+	kfree(mqp);
+}
+
+/**
+ * ipath_mcast_alloc - allocate the multicast GID structure
+ * @mgid: the multicast GID
+ *
+ * A list of QPs will be attached to this structure.
+ */
+static struct ipath_mcast *ipath_mcast_alloc(union ib_gid *mgid)
+{
+	struct ipath_mcast *mcast;
+
+	mcast = kmalloc(sizeof *mcast, GFP_KERNEL);
+	if (!mcast)
+		goto bail;
+
+	mcast->mgid = *mgid;
+	INIT_LIST_HEAD(&mcast->qp_list);
+	init_waitqueue_head(&mcast->wait);
+	atomic_set(&mcast->refcount, 0);
+	mcast->n_attached = 0;
+
+bail:
+	return mcast;
+}
+
+static void ipath_mcast_free(struct ipath_mcast *mcast)
+{
+	struct ipath_mcast_qp *p, *tmp;
+
+	list_for_each_entry_safe(p, tmp, &mcast->qp_list, list)
+		ipath_mcast_qp_free(p);
+
+	kfree(mcast);
+}
+
+/**
+ * ipath_mcast_find - search the global table for the given multicast GID
+ * @mgid: the multicast GID to search for
+ *
+ * Returns NULL if not found.
+ *
+ * The caller is responsible for decrementing the reference count if found.
+ */
+struct ipath_mcast *ipath_mcast_find(union ib_gid *mgid)
+{
+	struct rb_node *n;
+	unsigned long flags;
+	struct ipath_mcast *mcast;
+
+	spin_lock_irqsave(&mcast_lock, flags);
+	n = mcast_tree.rb_node;
+	while (n) {
+		int ret;
+
+		mcast = rb_entry(n, struct ipath_mcast, rb_node);
+
+		ret = memcmp(mgid->raw, mcast->mgid.raw,
+			     sizeof(union ib_gid));
+		if (ret < 0)
+			n = n->rb_left;
+		else if (ret > 0)
+			n = n->rb_right;
+		else {
+			atomic_inc(&mcast->refcount);
+			spin_unlock_irqrestore(&mcast_lock, flags);
+			goto bail;
+		}
+	}
+	spin_unlock_irqrestore(&mcast_lock, flags);
+
+	mcast = NULL;
+
+bail:
+	return mcast;
+}
+
+/**
+ * ipath_mcast_add - insert mcast GID into table and attach QP struct
+ * @mcast: the mcast GID table
+ * @mqp: the QP to attach
+ *
+ * Return zero if both were added.  Return EEXIST if the GID was already in
+ * the table but the QP was added.  Return ESRCH if the QP was already
+ * attached and neither structure was added.
+ */
+static int ipath_mcast_add(struct ipath_ibdev *dev,
+			   struct ipath_mcast *mcast,
+			   struct ipath_mcast_qp *mqp)
+{
+	struct rb_node **n = &mcast_tree.rb_node;
+	struct rb_node *pn = NULL;
+	int ret;
+
+	spin_lock_irq(&mcast_lock);
+
+	while (*n) {
+		struct ipath_mcast *tmcast;
+		struct ipath_mcast_qp *p;
+
+		pn = *n;
+		tmcast = rb_entry(pn, struct ipath_mcast, rb_node);
+
+		ret = memcmp(mcast->mgid.raw, tmcast->mgid.raw,
+			     sizeof(union ib_gid));
+		if (ret < 0) {
+			n = &pn->rb_left;
+			continue;
+		}
+		if (ret > 0) {
+			n = &pn->rb_right;
+			continue;
+		}
+
+		/* Search the QP list to see if this is already there. */
+		list_for_each_entry_rcu(p, &tmcast->qp_list, list) {
+			if (p->qp == mqp->qp) {
+				ret = ESRCH;
+				goto bail;
+			}
+		}
+		if (tmcast->n_attached == ib_ipath_max_mcast_qp_attached) {
+			ret = ENOMEM;
+			goto bail;
+		}
+
+		tmcast->n_attached++;
+
+		list_add_tail_rcu(&mqp->list, &tmcast->qp_list);
+		ret = EEXIST;
+		goto bail;
+	}
+
+	spin_lock(&dev->n_mcast_grps_lock);
+	if (dev->n_mcast_grps_allocated == ib_ipath_max_mcast_grps) {
+		spin_unlock(&dev->n_mcast_grps_lock);
+		ret = ENOMEM;
+		goto bail;
+	}
+
+	dev->n_mcast_grps_allocated++;
+	spin_unlock(&dev->n_mcast_grps_lock);
+
+	mcast->n_attached++;
+
+	list_add_tail_rcu(&mqp->list, &mcast->qp_list);
+
+	atomic_inc(&mcast->refcount);
+	rb_link_node(&mcast->rb_node, pn, n);
+	rb_insert_color(&mcast->rb_node, &mcast_tree);
+
+	ret = 0;
+
+bail:
+	spin_unlock_irq(&mcast_lock);
+
+	return ret;
+}
+
+int ipath_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+	struct ipath_qp *qp = to_iqp(ibqp);
+	struct ipath_ibdev *dev = to_idev(ibqp->device);
+	struct ipath_mcast *mcast;
+	struct ipath_mcast_qp *mqp;
+	int ret;
+
+	/*
+	 * Allocate data structures since its better to do this outside of
+	 * spin locks and it will most likely be needed.
+	 */
+	mcast = ipath_mcast_alloc(gid);
+	if (mcast == NULL) {
+		ret = -ENOMEM;
+		goto bail;
+	}
+	mqp = ipath_mcast_qp_alloc(qp);
+	if (mqp == NULL) {
+		ipath_mcast_free(mcast);
+		ret = -ENOMEM;
+		goto bail;
+	}
+	switch (ipath_mcast_add(dev, mcast, mqp)) {
+	case ESRCH:
+		/* Neither was used: can't attach the same QP twice. */
+		ipath_mcast_qp_free(mqp);
+		ipath_mcast_free(mcast);
+		ret = -EINVAL;
+		goto bail;
+	case EEXIST:		/* The mcast wasn't used */
+		ipath_mcast_free(mcast);
+		break;
+	case ENOMEM:
+		/* Exceeded the maximum number of mcast groups. */
+		ipath_mcast_qp_free(mqp);
+		ipath_mcast_free(mcast);
+		ret = -ENOMEM;
+		goto bail;
+	default:
+		break;
+	}
+
+	ret = 0;
+
+bail:
+	return ret;
+}
+
+int ipath_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+	struct ipath_qp *qp = to_iqp(ibqp);
+	struct ipath_ibdev *dev = to_idev(ibqp->device);
+	struct ipath_mcast *mcast = NULL;
+	struct ipath_mcast_qp *p, *tmp;
+	struct rb_node *n;
+	int last = 0;
+	int ret;
+
+	spin_lock_irq(&mcast_lock);
+
+	/* Find the GID in the mcast table. */
+	n = mcast_tree.rb_node;
+	while (1) {
+		if (n == NULL) {
+			spin_unlock_irq(&mcast_lock);
+			ret = -EINVAL;
+			goto bail;
+		}
+
+		mcast = rb_entry(n, struct ipath_mcast, rb_node);
+		ret = memcmp(gid->raw, mcast->mgid.raw,
+			     sizeof(union ib_gid));
+		if (ret < 0)
+			n = n->rb_left;
+		else if (ret > 0)
+			n = n->rb_right;
+		else
+			break;
+	}
+
+	/* Search the QP list. */
+	list_for_each_entry_safe(p, tmp, &mcast->qp_list, list) {
+		if (p->qp != qp)
+			continue;
+		/*
+		 * We found it, so remove it, but don't poison the forward
+		 * link until we are sure there are no list walkers.
+		 */
+		list_del_rcu(&p->list);
+		mcast->n_attached--;
+
+		/* If this was the last attached QP, remove the GID too. */
+		if (list_empty(&mcast->qp_list)) {
+			rb_erase(&mcast->rb_node, &mcast_tree);
+			last = 1;
+		}
+		break;
+	}
+
+	spin_unlock_irq(&mcast_lock);
+
+	if (p) {
+		/*
+		 * Wait for any list walkers to finish before freeing the
+		 * list element.
+		 */
+		wait_event(mcast->wait, atomic_read(&mcast->refcount) <= 1);
+		ipath_mcast_qp_free(p);
+	}
+	if (last) {
+		atomic_dec(&mcast->refcount);
+		wait_event(mcast->wait, !atomic_read(&mcast->refcount));
+		ipath_mcast_free(mcast);
+		spin_lock_irq(&dev->n_mcast_grps_lock);
+		dev->n_mcast_grps_allocated--;
+		spin_unlock_irq(&dev->n_mcast_grps_lock);
+	}
+
+	ret = 0;
+
+bail:
+	return ret;
+}
+
+int ipath_mcast_tree_empty(void)
+{
+	return mcast_tree.rb_node == NULL;
+}
diff --git a/drivers/infiniband/hw/ipath/ipath_wc_ppc64.c b/drivers/infiniband/hw/ipath/ipath_wc_ppc64.c
new file mode 100644
index 0000000..1d7bd82
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_wc_ppc64.c
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * This file is conditionally built on PowerPC only.  Otherwise weak symbol
+ * versions of the functions exported from here are used.
+ */
+
+#include "ipath_kernel.h"
+
+/**
+ * ipath_enable_wc - enable write combining for MMIO writes to the device
+ * @dd: infinipath device
+ *
+ * Nothing to do on PowerPC, so just return without error.
+ */
+int ipath_enable_wc(struct ipath_devdata *dd)
+{
+	return 0;
+}
+
+/**
+ * ipath_unordered_wc - indicate whether write combining is unordered
+ *
+ * Because our performance depends on our ability to do write
+ * combining mmio writes in the most efficient way, we need to
+ * know if we are on a processor that may reorder stores when
+ * write combining.
+ */
+int ipath_unordered_wc(void)
+{
+	return 1;
+}
diff --git a/drivers/infiniband/hw/ipath/ipath_wc_x86_64.c b/drivers/infiniband/hw/ipath/ipath_wc_x86_64.c
new file mode 100644
index 0000000..3428acb
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_wc_x86_64.c
@@ -0,0 +1,184 @@
+/*
+ * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * This file is conditionally built on x86_64 only.  Otherwise weak symbol
+ * versions of the functions exported from here are used.
+ */
+
+#include <linux/pci.h>
+#include <asm/mtrr.h>
+#include <asm/processor.h>
+
+#include "ipath_kernel.h"
+
+/**
+ * ipath_enable_wc - enable write combining for MMIO writes to the device
+ * @dd: infinipath device
+ *
+ * This routine is x86_64-specific; it twiddles the CPU's MTRRs to enable
+ * write combining.
+ */
+int ipath_enable_wc(struct ipath_devdata *dd)
+{
+	int ret = 0;
+	u64 pioaddr, piolen;
+	unsigned bits;
+	const unsigned long addr = pci_resource_start(dd->pcidev, 0);
+	const size_t len = pci_resource_len(dd->pcidev, 0);
+
+	/*
+	 * Set the PIO buffers to be WCCOMB, so we get HT bursts to the
+	 * chip.  Linux (possibly the hardware) requires it to be on a power
+	 * of 2 address matching the length (which has to be a power of 2).
+	 * For rev1, that means the base address, for rev2, it will be just
+	 * the PIO buffers themselves.
+	 * For chips with two sets of buffers, the calculations are
+	 * somewhat more complicated; we need to sum, and the piobufbase
+	 * register has both offsets, 2K in low 32 bits, 4K in high 32 bits.
+	 * The buffers are still packed, so a single range covers both.
+	 */
+	if (dd->ipath_piobcnt2k && dd->ipath_piobcnt4k) { /* 2 sizes */
+		unsigned long pio2kbase, pio4kbase;
+		pio2kbase = dd->ipath_piobufbase & 0xffffffffUL;
+		pio4kbase = (dd->ipath_piobufbase >> 32) & 0xffffffffUL;
+		if (pio2kbase < pio4kbase) { /* all, for now */
+			pioaddr = addr + pio2kbase;
+			piolen = pio4kbase - pio2kbase +
+				dd->ipath_piobcnt4k * dd->ipath_4kalign;
+		} else {
+			pioaddr = addr + pio4kbase;
+			piolen = pio2kbase - pio4kbase +
+				dd->ipath_piobcnt2k * dd->ipath_palign;
+		}
+	} else {  /* single buffer size (2K, currently) */
+		pioaddr = addr + dd->ipath_piobufbase;
+		piolen = dd->ipath_piobcnt2k * dd->ipath_palign +
+			dd->ipath_piobcnt4k * dd->ipath_4kalign;
+	}
+
+	for (bits = 0; !(piolen & (1ULL << bits)); bits++)
+		/* do nothing */ ;
+
+	if (piolen != (1ULL << bits)) {
+		piolen >>= bits;
+		while (piolen >>= 1)
+			bits++;
+		piolen = 1ULL << (bits + 1);
+	}
+	if (pioaddr & (piolen - 1)) {
+		u64 atmp;
+		ipath_dbg("pioaddr %llx not on right boundary for size "
+			  "%llx, fixing\n",
+			  (unsigned long long) pioaddr,
+			  (unsigned long long) piolen);
+		atmp = pioaddr & ~(piolen - 1);
+		if (atmp < addr || (atmp + piolen) > (addr + len)) {
+			ipath_dev_err(dd, "No way to align address/size "
+				      "(%llx/%llx), no WC mtrr\n",
+				      (unsigned long long) atmp,
+				      (unsigned long long) piolen << 1);
+			ret = -ENODEV;
+		} else {
+			ipath_dbg("changing WC base from %llx to %llx, "
+				  "len from %llx to %llx\n",
+				  (unsigned long long) pioaddr,
+				  (unsigned long long) atmp,
+				  (unsigned long long) piolen,
+				  (unsigned long long) piolen << 1);
+			pioaddr = atmp;
+			piolen <<= 1;
+		}
+	}
+
+	if (!ret) {
+		int cookie;
+		ipath_cdbg(VERBOSE, "Setting mtrr for chip to WC "
+			   "(addr %llx, len=0x%llx)\n",
+			   (unsigned long long) pioaddr,
+			   (unsigned long long) piolen);
+		cookie = mtrr_add(pioaddr, piolen, MTRR_TYPE_WRCOMB, 0);
+		if (cookie < 0) {
+			{
+				dev_info(&dd->pcidev->dev,
+					 "mtrr_add()  WC for PIO bufs "
+					 "failed (%d)\n",
+					 cookie);
+				ret = -EINVAL;
+			}
+		} else {
+			ipath_cdbg(VERBOSE, "Set mtrr for chip to WC, "
+				   "cookie is %d\n", cookie);
+			dd->ipath_wc_cookie = cookie;
+			dd->ipath_wc_base = (unsigned long) pioaddr;
+			dd->ipath_wc_len = (unsigned long) piolen;
+		}
+	}
+
+	return ret;
+}
+
+/**
+ * ipath_disable_wc - disable write combining for MMIO writes to the device
+ * @dd: infinipath device
+ */
+void ipath_disable_wc(struct ipath_devdata *dd)
+{
+	if (dd->ipath_wc_cookie) {
+		int r;
+		ipath_cdbg(VERBOSE, "undoing WCCOMB on pio buffers\n");
+		r = mtrr_del(dd->ipath_wc_cookie, dd->ipath_wc_base,
+			     dd->ipath_wc_len);
+		if (r < 0)
+			dev_info(&dd->pcidev->dev,
+				 "mtrr_del(%lx, %lx, %lx) failed: %d\n",
+				 dd->ipath_wc_cookie, dd->ipath_wc_base,
+				 dd->ipath_wc_len, r);
+		dd->ipath_wc_cookie = 0; /* even on failure */
+	}
+}
+
+/**
+ * ipath_unordered_wc - indicate whether write combining is ordered
+ *
+ * Because our performance depends on our ability to do write combining mmio
+ * writes in the most efficient way, we need to know if we are on an Intel
+ * or AMD x86_64 processor.  AMD x86_64 processors flush WC buffers out in
+ * the order completed, and so no special flushing is required to get
+ * correct ordering.  Intel processors, however, will flush write buffers
+ * out in "random" orders, and so explicit ordering is needed at times.
+ */
+int ipath_unordered_wc(void)
+{
+	return boot_cpu_data.x86_vendor != X86_VENDOR_AMD;
+}
diff --git a/drivers/infiniband/hw/mlx4/Kconfig b/drivers/infiniband/hw/mlx4/Kconfig
new file mode 100644
index 0000000..fc01dea
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/Kconfig
@@ -0,0 +1,10 @@
+config MLX4_INFINIBAND
+	tristate "Mellanox ConnectX HCA support"
+	depends on NETDEVICES && ETHERNET && PCI && INET
+	select NET_VENDOR_MELLANOX
+	select MLX4_CORE
+	---help---
+	  This driver provides low-level InfiniBand support for
+	  Mellanox ConnectX PCI Express host channel adapters (HCAs).
+	  This is required to use InfiniBand protocols such as
+	  IP-over-IB or SRP with these devices.
diff --git a/drivers/infiniband/hw/mlx4/Makefile b/drivers/infiniband/hw/mlx4/Makefile
new file mode 100644
index 0000000..f4213b3
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_MLX4_INFINIBAND)	+= mlx4_ib.o
+
+mlx4_ib-y :=	ah.o cq.o doorbell.o mad.o main.o mr.o qp.o srq.o mcg.o cm.o alias_GUID.o sysfs.o
diff --git a/drivers/infiniband/hw/mlx4/ah.c b/drivers/infiniband/hw/mlx4/ah.c
new file mode 100644
index 0000000..2d8c339
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/ah.c
@@ -0,0 +1,177 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <rdma/ib_addr.h>
+#include <rdma/ib_cache.h>
+
+#include <linux/slab.h>
+#include <linux/inet.h>
+#include <linux/string.h>
+
+#include "mlx4_ib.h"
+
+static struct ib_ah *create_ib_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr,
+				  struct mlx4_ib_ah *ah)
+{
+	struct mlx4_dev *dev = to_mdev(pd->device)->dev;
+
+	ah->av.ib.port_pd = cpu_to_be32(to_mpd(pd)->pdn | (ah_attr->port_num << 24));
+	ah->av.ib.g_slid  = ah_attr->src_path_bits;
+	if (ah_attr->ah_flags & IB_AH_GRH) {
+		ah->av.ib.g_slid   |= 0x80;
+		ah->av.ib.gid_index = ah_attr->grh.sgid_index;
+		ah->av.ib.hop_limit = ah_attr->grh.hop_limit;
+		ah->av.ib.sl_tclass_flowlabel |=
+			cpu_to_be32((ah_attr->grh.traffic_class << 20) |
+				    ah_attr->grh.flow_label);
+		memcpy(ah->av.ib.dgid, ah_attr->grh.dgid.raw, 16);
+	}
+
+	ah->av.ib.dlid    = cpu_to_be16(ah_attr->dlid);
+	if (ah_attr->static_rate) {
+		ah->av.ib.stat_rate = ah_attr->static_rate + MLX4_STAT_RATE_OFFSET;
+		while (ah->av.ib.stat_rate > IB_RATE_2_5_GBPS + MLX4_STAT_RATE_OFFSET &&
+		       !(1 << ah->av.ib.stat_rate & dev->caps.stat_rate_support))
+			--ah->av.ib.stat_rate;
+	}
+	ah->av.ib.sl_tclass_flowlabel = cpu_to_be32(ah_attr->sl << 28);
+
+	return &ah->ibah;
+}
+
+static struct ib_ah *create_iboe_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr,
+				    struct mlx4_ib_ah *ah)
+{
+	struct mlx4_ib_dev *ibdev = to_mdev(pd->device);
+	struct mlx4_dev *dev = ibdev->dev;
+	int is_mcast = 0;
+	struct in6_addr in6;
+	u16 vlan_tag;
+
+	memcpy(&in6, ah_attr->grh.dgid.raw, sizeof(in6));
+	if (rdma_is_multicast_addr(&in6)) {
+		is_mcast = 1;
+		rdma_get_mcast_mac(&in6, ah->av.eth.mac);
+	} else {
+		memcpy(ah->av.eth.mac, ah_attr->dmac, ETH_ALEN);
+	}
+	vlan_tag = ah_attr->vlan_id;
+	if (vlan_tag < 0x1000)
+		vlan_tag |= (ah_attr->sl & 7) << 13;
+	ah->av.eth.port_pd = cpu_to_be32(to_mpd(pd)->pdn | (ah_attr->port_num << 24));
+	ah->av.eth.gid_index = ah_attr->grh.sgid_index;
+	ah->av.eth.vlan = cpu_to_be16(vlan_tag);
+	if (ah_attr->static_rate) {
+		ah->av.eth.stat_rate = ah_attr->static_rate + MLX4_STAT_RATE_OFFSET;
+		while (ah->av.eth.stat_rate > IB_RATE_2_5_GBPS + MLX4_STAT_RATE_OFFSET &&
+		       !(1 << ah->av.eth.stat_rate & dev->caps.stat_rate_support))
+			--ah->av.eth.stat_rate;
+	}
+
+	/*
+	 * HW requires multicast LID so we just choose one.
+	 */
+	if (is_mcast)
+		ah->av.ib.dlid = cpu_to_be16(0xc000);
+
+	memcpy(ah->av.eth.dgid, ah_attr->grh.dgid.raw, 16);
+	ah->av.eth.sl_tclass_flowlabel = cpu_to_be32(ah_attr->sl << 29);
+
+	return &ah->ibah;
+}
+
+struct ib_ah *mlx4_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr)
+{
+	struct mlx4_ib_ah *ah;
+	struct ib_ah *ret;
+
+	ah = kzalloc(sizeof *ah, GFP_ATOMIC);
+	if (!ah)
+		return ERR_PTR(-ENOMEM);
+
+	if (rdma_port_get_link_layer(pd->device, ah_attr->port_num) == IB_LINK_LAYER_ETHERNET) {
+		if (!(ah_attr->ah_flags & IB_AH_GRH)) {
+			ret = ERR_PTR(-EINVAL);
+		} else {
+			/*
+			 * TBD: need to handle the case when we get
+			 * called in an atomic context and there we
+			 * might sleep.  We don't expect this
+			 * currently since we're working with link
+			 * local addresses which we can translate
+			 * without going to sleep.
+			 */
+			ret = create_iboe_ah(pd, ah_attr, ah);
+		}
+
+		if (IS_ERR(ret))
+			kfree(ah);
+
+		return ret;
+	} else
+		return create_ib_ah(pd, ah_attr, ah); /* never fails */
+}
+
+int mlx4_ib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr)
+{
+	struct mlx4_ib_ah *ah = to_mah(ibah);
+	enum rdma_link_layer ll;
+
+	memset(ah_attr, 0, sizeof *ah_attr);
+	ah_attr->sl = be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 28;
+	ah_attr->port_num = be32_to_cpu(ah->av.ib.port_pd) >> 24;
+	ll = rdma_port_get_link_layer(ibah->device, ah_attr->port_num);
+	ah_attr->dlid = ll == IB_LINK_LAYER_INFINIBAND ? be16_to_cpu(ah->av.ib.dlid) : 0;
+	if (ah->av.ib.stat_rate)
+		ah_attr->static_rate = ah->av.ib.stat_rate - MLX4_STAT_RATE_OFFSET;
+	ah_attr->src_path_bits = ah->av.ib.g_slid & 0x7F;
+
+	if (mlx4_ib_ah_grh_present(ah)) {
+		ah_attr->ah_flags = IB_AH_GRH;
+
+		ah_attr->grh.traffic_class =
+			be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 20;
+		ah_attr->grh.flow_label =
+			be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) & 0xfffff;
+		ah_attr->grh.hop_limit  = ah->av.ib.hop_limit;
+		ah_attr->grh.sgid_index = ah->av.ib.gid_index;
+		memcpy(ah_attr->grh.dgid.raw, ah->av.ib.dgid, 16);
+	}
+
+	return 0;
+}
+
+int mlx4_ib_destroy_ah(struct ib_ah *ah)
+{
+	kfree(to_mah(ah));
+	return 0;
+}
diff --git a/drivers/infiniband/hw/mlx4/alias_GUID.c b/drivers/infiniband/hw/mlx4/alias_GUID.c
new file mode 100644
index 0000000..0eb141c
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/alias_GUID.c
@@ -0,0 +1,688 @@
+/*
+ * Copyright (c) 2012 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+ /***********************************************************/
+/*This file support the handling of the Alias GUID feature. */
+/***********************************************************/
+#include <rdma/ib_mad.h>
+#include <rdma/ib_smi.h>
+#include <rdma/ib_cache.h>
+#include <rdma/ib_sa.h>
+#include <rdma/ib_pack.h>
+#include <linux/mlx4/cmd.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <rdma/ib_user_verbs.h>
+#include <linux/delay.h>
+#include "mlx4_ib.h"
+
+/*
+The driver keeps the current state of all guids, as they are in the HW.
+Whenever we receive an smp mad GUIDInfo record, the data will be cached.
+*/
+
+struct mlx4_alias_guid_work_context {
+	u8 port;
+	struct mlx4_ib_dev     *dev ;
+	struct ib_sa_query     *sa_query;
+	struct completion	done;
+	int			query_id;
+	struct list_head	list;
+	int			block_num;
+};
+
+struct mlx4_next_alias_guid_work {
+	u8 port;
+	u8 block_num;
+	struct mlx4_sriov_alias_guid_info_rec_det rec_det;
+};
+
+
+void mlx4_ib_update_cache_on_guid_change(struct mlx4_ib_dev *dev, int block_num,
+					 u8 port_num, u8 *p_data)
+{
+	int i;
+	u64 guid_indexes;
+	int slave_id;
+	int port_index = port_num - 1;
+
+	if (!mlx4_is_master(dev->dev))
+		return;
+
+	guid_indexes = be64_to_cpu((__force __be64) dev->sriov.alias_guid.
+				   ports_guid[port_num - 1].
+				   all_rec_per_port[block_num].guid_indexes);
+	pr_debug("port: %d, guid_indexes: 0x%llx\n", port_num, guid_indexes);
+
+	for (i = 0; i < NUM_ALIAS_GUID_IN_REC; i++) {
+		/* The location of the specific index starts from bit number 4
+		 * until bit num 11 */
+		if (test_bit(i + 4, (unsigned long *)&guid_indexes)) {
+			slave_id = (block_num * NUM_ALIAS_GUID_IN_REC) + i ;
+			if (slave_id >= dev->dev->num_slaves) {
+				pr_debug("The last slave: %d\n", slave_id);
+				return;
+			}
+
+			/* cache the guid: */
+			memcpy(&dev->sriov.demux[port_index].guid_cache[slave_id],
+			       &p_data[i * GUID_REC_SIZE],
+			       GUID_REC_SIZE);
+		} else
+			pr_debug("Guid number: %d in block: %d"
+				 " was not updated\n", i, block_num);
+	}
+}
+
+static __be64 get_cached_alias_guid(struct mlx4_ib_dev *dev, int port, int index)
+{
+	if (index >= NUM_ALIAS_GUID_PER_PORT) {
+		pr_err("%s: ERROR: asked for index:%d\n", __func__, index);
+		return (__force __be64) -1;
+	}
+	return *(__be64 *)&dev->sriov.demux[port - 1].guid_cache[index];
+}
+
+
+ib_sa_comp_mask mlx4_ib_get_aguid_comp_mask_from_ix(int index)
+{
+	return IB_SA_COMP_MASK(4 + index);
+}
+
+/*
+ * Whenever new GUID is set/unset (guid table change) create event and
+ * notify the relevant slave (master also should be notified).
+ * If the GUID value is not as we have in the cache the slave will not be
+ * updated; in this case it waits for the smp_snoop or the port management
+ * event to call the function and to update the slave.
+ * block_number - the index of the block (16 blocks available)
+ * port_number - 1 or 2
+ */
+void mlx4_ib_notify_slaves_on_guid_change(struct mlx4_ib_dev *dev,
+					  int block_num, u8 port_num,
+					  u8 *p_data)
+{
+	int i;
+	u64 guid_indexes;
+	int slave_id;
+	enum slave_port_state new_state;
+	enum slave_port_state prev_state;
+	__be64 tmp_cur_ag, form_cache_ag;
+	enum slave_port_gen_event gen_event;
+
+	if (!mlx4_is_master(dev->dev))
+		return;
+
+	guid_indexes = be64_to_cpu((__force __be64) dev->sriov.alias_guid.
+				   ports_guid[port_num - 1].
+				   all_rec_per_port[block_num].guid_indexes);
+	pr_debug("port: %d, guid_indexes: 0x%llx\n", port_num, guid_indexes);
+
+	/*calculate the slaves and notify them*/
+	for (i = 0; i < NUM_ALIAS_GUID_IN_REC; i++) {
+		/* the location of the specific index runs from bits 4..11 */
+		if (!(test_bit(i + 4, (unsigned long *)&guid_indexes)))
+			continue;
+
+		slave_id = (block_num * NUM_ALIAS_GUID_IN_REC) + i ;
+		if (slave_id >= dev->dev->num_vfs + 1)
+			return;
+		tmp_cur_ag = *(__be64 *)&p_data[i * GUID_REC_SIZE];
+		form_cache_ag = get_cached_alias_guid(dev, port_num,
+					(NUM_ALIAS_GUID_IN_REC * block_num) + i);
+		/*
+		 * Check if guid is not the same as in the cache,
+		 * If it is different, wait for the snoop_smp or the port mgmt
+		 * change event to update the slave on its port state change
+		 */
+		if (tmp_cur_ag != form_cache_ag)
+			continue;
+		mlx4_gen_guid_change_eqe(dev->dev, slave_id, port_num);
+
+		/*2 cases: Valid GUID, and Invalid Guid*/
+
+		if (tmp_cur_ag != MLX4_NOT_SET_GUID) { /*valid GUID*/
+			prev_state = mlx4_get_slave_port_state(dev->dev, slave_id, port_num);
+			new_state = set_and_calc_slave_port_state(dev->dev, slave_id, port_num,
+								  MLX4_PORT_STATE_IB_PORT_STATE_EVENT_GID_VALID,
+								  &gen_event);
+			pr_debug("slave: %d, port: %d prev_port_state: %d,"
+				 " new_port_state: %d, gen_event: %d\n",
+				 slave_id, port_num, prev_state, new_state, gen_event);
+			if (gen_event == SLAVE_PORT_GEN_EVENT_UP) {
+				pr_debug("sending PORT_UP event to slave: %d, port: %d\n",
+					 slave_id, port_num);
+				mlx4_gen_port_state_change_eqe(dev->dev, slave_id,
+							       port_num, MLX4_PORT_CHANGE_SUBTYPE_ACTIVE);
+			}
+		} else { /* request to invalidate GUID */
+			set_and_calc_slave_port_state(dev->dev, slave_id, port_num,
+						      MLX4_PORT_STATE_IB_EVENT_GID_INVALID,
+						      &gen_event);
+			pr_debug("sending PORT DOWN event to slave: %d, port: %d\n",
+				 slave_id, port_num);
+			mlx4_gen_port_state_change_eqe(dev->dev, slave_id, port_num,
+						       MLX4_PORT_CHANGE_SUBTYPE_DOWN);
+		}
+	}
+}
+
+static void aliasguid_query_handler(int status,
+				    struct ib_sa_guidinfo_rec *guid_rec,
+				    void *context)
+{
+	struct mlx4_ib_dev *dev;
+	struct mlx4_alias_guid_work_context *cb_ctx = context;
+	u8 port_index ;
+	int i;
+	struct mlx4_sriov_alias_guid_info_rec_det *rec;
+	unsigned long flags, flags1;
+
+	if (!context)
+		return;
+
+	dev = cb_ctx->dev;
+	port_index = cb_ctx->port - 1;
+	rec = &dev->sriov.alias_guid.ports_guid[port_index].
+		all_rec_per_port[cb_ctx->block_num];
+
+	if (status) {
+		rec->status = MLX4_GUID_INFO_STATUS_IDLE;
+		pr_debug("(port: %d) failed: status = %d\n",
+			 cb_ctx->port, status);
+		goto out;
+	}
+
+	if (guid_rec->block_num != cb_ctx->block_num) {
+		pr_err("block num mismatch: %d != %d\n",
+		       cb_ctx->block_num, guid_rec->block_num);
+		goto out;
+	}
+
+	pr_debug("lid/port: %d/%d, block_num: %d\n",
+		 be16_to_cpu(guid_rec->lid), cb_ctx->port,
+		 guid_rec->block_num);
+
+	rec = &dev->sriov.alias_guid.ports_guid[port_index].
+		all_rec_per_port[guid_rec->block_num];
+
+	rec->status = MLX4_GUID_INFO_STATUS_SET;
+	rec->method = MLX4_GUID_INFO_RECORD_SET;
+
+	for (i = 0 ; i < NUM_ALIAS_GUID_IN_REC; i++) {
+		__be64 tmp_cur_ag;
+		tmp_cur_ag = *(__be64 *)&guid_rec->guid_info_list[i * GUID_REC_SIZE];
+		/* check if the SM didn't assign one of the records.
+		 * if it didn't, if it was not sysadmin request:
+		 * ask the SM to give a new GUID, (instead of the driver request).
+		 */
+		if (tmp_cur_ag == MLX4_NOT_SET_GUID) {
+			mlx4_ib_warn(&dev->ib_dev, "%s:Record num %d in "
+				     "block_num: %d was declined by SM, "
+				     "ownership by %d (0 = driver, 1=sysAdmin,"
+				     " 2=None)\n", __func__, i,
+				     guid_rec->block_num, rec->ownership);
+			if (rec->ownership == MLX4_GUID_DRIVER_ASSIGN) {
+				/* if it is driver assign, asks for new GUID from SM*/
+				*(__be64 *)&rec->all_recs[i * GUID_REC_SIZE] =
+					MLX4_NOT_SET_GUID;
+
+				/* Mark the record as not assigned, and let it
+				 * be sent again in the next work sched.*/
+				rec->status = MLX4_GUID_INFO_STATUS_IDLE;
+				rec->guid_indexes |= mlx4_ib_get_aguid_comp_mask_from_ix(i);
+			}
+		} else {
+		       /* properly assigned record. */
+		       /* We save the GUID we just got from the SM in the
+			* admin_guid in order to be persistent, and in the
+			* request from the sm the process will ask for the same GUID */
+			if (rec->ownership == MLX4_GUID_SYSADMIN_ASSIGN &&
+			    tmp_cur_ag != *(__be64 *)&rec->all_recs[i * GUID_REC_SIZE]) {
+				/* the sysadmin assignment failed.*/
+				mlx4_ib_warn(&dev->ib_dev, "%s: Failed to set"
+					     " admin guid after SysAdmin "
+					     "configuration. "
+					     "Record num %d in block_num:%d "
+					     "was declined by SM, "
+					     "new val(0x%llx) was kept\n",
+					      __func__, i,
+					     guid_rec->block_num,
+					     be64_to_cpu(*(__be64 *) &
+							 rec->all_recs[i * GUID_REC_SIZE]));
+			} else {
+				memcpy(&rec->all_recs[i * GUID_REC_SIZE],
+				       &guid_rec->guid_info_list[i * GUID_REC_SIZE],
+				       GUID_REC_SIZE);
+			}
+		}
+	}
+	/*
+	The func is call here to close the cases when the
+	sm doesn't send smp, so in the sa response the driver
+	notifies the slave.
+	*/
+	mlx4_ib_notify_slaves_on_guid_change(dev, guid_rec->block_num,
+					     cb_ctx->port,
+					     guid_rec->guid_info_list);
+out:
+	spin_lock_irqsave(&dev->sriov.going_down_lock, flags);
+	spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1);
+	if (!dev->sriov.is_going_down)
+		queue_delayed_work(dev->sriov.alias_guid.ports_guid[port_index].wq,
+				   &dev->sriov.alias_guid.ports_guid[port_index].
+				   alias_guid_work, 0);
+	if (cb_ctx->sa_query) {
+		list_del(&cb_ctx->list);
+		kfree(cb_ctx);
+	} else
+		complete(&cb_ctx->done);
+	spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags1);
+	spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags);
+}
+
+static void invalidate_guid_record(struct mlx4_ib_dev *dev, u8 port, int index)
+{
+	int i;
+	u64 cur_admin_val;
+	ib_sa_comp_mask comp_mask = 0;
+
+	dev->sriov.alias_guid.ports_guid[port - 1].all_rec_per_port[index].status
+		= MLX4_GUID_INFO_STATUS_IDLE;
+	dev->sriov.alias_guid.ports_guid[port - 1].all_rec_per_port[index].method
+		= MLX4_GUID_INFO_RECORD_SET;
+
+	/* calculate the comp_mask for that record.*/
+	for (i = 0; i < NUM_ALIAS_GUID_IN_REC; i++) {
+		cur_admin_val =
+			*(u64 *)&dev->sriov.alias_guid.ports_guid[port - 1].
+			all_rec_per_port[index].all_recs[GUID_REC_SIZE * i];
+		/*
+		check the admin value: if it's for delete (~00LL) or
+		it is the first guid of the first record (hw guid) or
+		the records is not in ownership of the sysadmin and the sm doesn't
+		need to assign GUIDs, then don't put it up for assignment.
+		*/
+		if (MLX4_GUID_FOR_DELETE_VAL == cur_admin_val ||
+		    (!index && !i) ||
+		    MLX4_GUID_NONE_ASSIGN == dev->sriov.alias_guid.
+		    ports_guid[port - 1].all_rec_per_port[index].ownership)
+			continue;
+		comp_mask |= mlx4_ib_get_aguid_comp_mask_from_ix(i);
+	}
+	dev->sriov.alias_guid.ports_guid[port - 1].
+		all_rec_per_port[index].guid_indexes = comp_mask;
+}
+
+static int set_guid_rec(struct ib_device *ibdev,
+			u8 port, int index,
+			struct mlx4_sriov_alias_guid_info_rec_det *rec_det)
+{
+	int err;
+	struct mlx4_ib_dev *dev = to_mdev(ibdev);
+	struct ib_sa_guidinfo_rec guid_info_rec;
+	ib_sa_comp_mask comp_mask;
+	struct ib_port_attr attr;
+	struct mlx4_alias_guid_work_context *callback_context;
+	unsigned long resched_delay, flags, flags1;
+	struct list_head *head =
+		&dev->sriov.alias_guid.ports_guid[port - 1].cb_list;
+
+	err = __mlx4_ib_query_port(ibdev, port, &attr, 1);
+	if (err) {
+		pr_debug("mlx4_ib_query_port failed (err: %d), port: %d\n",
+			 err, port);
+		return err;
+	}
+	/*check the port was configured by the sm, otherwise no need to send */
+	if (attr.state != IB_PORT_ACTIVE) {
+		pr_debug("port %d not active...rescheduling\n", port);
+		resched_delay = 5 * HZ;
+		err = -EAGAIN;
+		goto new_schedule;
+	}
+
+	callback_context = kmalloc(sizeof *callback_context, GFP_KERNEL);
+	if (!callback_context) {
+		err = -ENOMEM;
+		resched_delay = HZ * 5;
+		goto new_schedule;
+	}
+	callback_context->port = port;
+	callback_context->dev = dev;
+	callback_context->block_num = index;
+
+	memset(&guid_info_rec, 0, sizeof (struct ib_sa_guidinfo_rec));
+
+	guid_info_rec.lid = cpu_to_be16(attr.lid);
+	guid_info_rec.block_num = index;
+
+	memcpy(guid_info_rec.guid_info_list, rec_det->all_recs,
+	       GUID_REC_SIZE * NUM_ALIAS_GUID_IN_REC);
+	comp_mask = IB_SA_GUIDINFO_REC_LID | IB_SA_GUIDINFO_REC_BLOCK_NUM |
+		rec_det->guid_indexes;
+
+	init_completion(&callback_context->done);
+	spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1);
+	list_add_tail(&callback_context->list, head);
+	spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags1);
+
+	callback_context->query_id =
+		ib_sa_guid_info_rec_query(dev->sriov.alias_guid.sa_client,
+					  ibdev, port, &guid_info_rec,
+					  comp_mask, rec_det->method, 1000,
+					  GFP_KERNEL, aliasguid_query_handler,
+					  callback_context,
+					  &callback_context->sa_query);
+	if (callback_context->query_id < 0) {
+		pr_debug("ib_sa_guid_info_rec_query failed, query_id: "
+			 "%d. will reschedule to the next 1 sec.\n",
+			 callback_context->query_id);
+		spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1);
+		list_del(&callback_context->list);
+		kfree(callback_context);
+		spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags1);
+		resched_delay = 1 * HZ;
+		err = -EAGAIN;
+		goto new_schedule;
+	}
+	err = 0;
+	goto out;
+
+new_schedule:
+	spin_lock_irqsave(&dev->sriov.going_down_lock, flags);
+	spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1);
+	invalidate_guid_record(dev, port, index);
+	if (!dev->sriov.is_going_down) {
+		queue_delayed_work(dev->sriov.alias_guid.ports_guid[port - 1].wq,
+				   &dev->sriov.alias_guid.ports_guid[port - 1].alias_guid_work,
+				   resched_delay);
+	}
+	spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags1);
+	spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags);
+
+out:
+	return err;
+}
+
+void mlx4_ib_invalidate_all_guid_record(struct mlx4_ib_dev *dev, int port)
+{
+	int i;
+	unsigned long flags, flags1;
+
+	pr_debug("port %d\n", port);
+
+	spin_lock_irqsave(&dev->sriov.going_down_lock, flags);
+	spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1);
+	for (i = 0; i < NUM_ALIAS_GUID_REC_IN_PORT; i++)
+		invalidate_guid_record(dev, port, i);
+
+	if (mlx4_is_master(dev->dev) && !dev->sriov.is_going_down) {
+		/*
+		make sure no work waits in the queue, if the work is already
+		queued(not on the timer) the cancel will fail. That is not a problem
+		because we just want the work started.
+		*/
+		cancel_delayed_work(&dev->sriov.alias_guid.
+				      ports_guid[port - 1].alias_guid_work);
+		queue_delayed_work(dev->sriov.alias_guid.ports_guid[port - 1].wq,
+				   &dev->sriov.alias_guid.ports_guid[port - 1].alias_guid_work,
+				   0);
+	}
+	spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags1);
+	spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags);
+}
+
+/* The function returns the next record that was
+ * not configured (or failed to be configured) */
+static int get_next_record_to_update(struct mlx4_ib_dev *dev, u8 port,
+				     struct mlx4_next_alias_guid_work *rec)
+{
+	int j;
+	unsigned long flags;
+
+	for (j = 0; j < NUM_ALIAS_GUID_REC_IN_PORT; j++) {
+		spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags);
+		if (dev->sriov.alias_guid.ports_guid[port].all_rec_per_port[j].status ==
+		    MLX4_GUID_INFO_STATUS_IDLE) {
+			memcpy(&rec->rec_det,
+			       &dev->sriov.alias_guid.ports_guid[port].all_rec_per_port[j],
+			       sizeof (struct mlx4_sriov_alias_guid_info_rec_det));
+			rec->port = port;
+			rec->block_num = j;
+			dev->sriov.alias_guid.ports_guid[port].all_rec_per_port[j].status =
+				MLX4_GUID_INFO_STATUS_PENDING;
+			spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags);
+			return 0;
+		}
+		spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags);
+	}
+	return -ENOENT;
+}
+
+static void set_administratively_guid_record(struct mlx4_ib_dev *dev, int port,
+					     int rec_index,
+					     struct mlx4_sriov_alias_guid_info_rec_det *rec_det)
+{
+	dev->sriov.alias_guid.ports_guid[port].all_rec_per_port[rec_index].guid_indexes =
+		rec_det->guid_indexes;
+	memcpy(dev->sriov.alias_guid.ports_guid[port].all_rec_per_port[rec_index].all_recs,
+	       rec_det->all_recs, NUM_ALIAS_GUID_IN_REC * GUID_REC_SIZE);
+	dev->sriov.alias_guid.ports_guid[port].all_rec_per_port[rec_index].status =
+		rec_det->status;
+}
+
+static void set_all_slaves_guids(struct mlx4_ib_dev *dev, int port)
+{
+	int j;
+	struct mlx4_sriov_alias_guid_info_rec_det rec_det ;
+
+	for (j = 0 ; j < NUM_ALIAS_GUID_REC_IN_PORT ; j++) {
+		memset(rec_det.all_recs, 0, NUM_ALIAS_GUID_IN_REC * GUID_REC_SIZE);
+		rec_det.guid_indexes = (!j ? 0 : IB_SA_GUIDINFO_REC_GID0) |
+			IB_SA_GUIDINFO_REC_GID1 | IB_SA_GUIDINFO_REC_GID2 |
+			IB_SA_GUIDINFO_REC_GID3 | IB_SA_GUIDINFO_REC_GID4 |
+			IB_SA_GUIDINFO_REC_GID5 | IB_SA_GUIDINFO_REC_GID6 |
+			IB_SA_GUIDINFO_REC_GID7;
+		rec_det.status = MLX4_GUID_INFO_STATUS_IDLE;
+		set_administratively_guid_record(dev, port, j, &rec_det);
+	}
+}
+
+static void alias_guid_work(struct work_struct *work)
+{
+	struct delayed_work *delay = to_delayed_work(work);
+	int ret = 0;
+	struct mlx4_next_alias_guid_work *rec;
+	struct mlx4_sriov_alias_guid_port_rec_det *sriov_alias_port =
+		container_of(delay, struct mlx4_sriov_alias_guid_port_rec_det,
+			     alias_guid_work);
+	struct mlx4_sriov_alias_guid *sriov_alias_guid = sriov_alias_port->parent;
+	struct mlx4_ib_sriov *ib_sriov = container_of(sriov_alias_guid,
+						struct mlx4_ib_sriov,
+						alias_guid);
+	struct mlx4_ib_dev *dev = container_of(ib_sriov, struct mlx4_ib_dev, sriov);
+
+	rec = kzalloc(sizeof *rec, GFP_KERNEL);
+	if (!rec) {
+		pr_err("alias_guid_work: No Memory\n");
+		return;
+	}
+
+	pr_debug("starting [port: %d]...\n", sriov_alias_port->port + 1);
+	ret = get_next_record_to_update(dev, sriov_alias_port->port, rec);
+	if (ret) {
+		pr_debug("No more records to update.\n");
+		goto out;
+	}
+
+	set_guid_rec(&dev->ib_dev, rec->port + 1, rec->block_num,
+		     &rec->rec_det);
+
+out:
+	kfree(rec);
+}
+
+
+void mlx4_ib_init_alias_guid_work(struct mlx4_ib_dev *dev, int port)
+{
+	unsigned long flags, flags1;
+
+	if (!mlx4_is_master(dev->dev))
+		return;
+	spin_lock_irqsave(&dev->sriov.going_down_lock, flags);
+	spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1);
+	if (!dev->sriov.is_going_down) {
+		queue_delayed_work(dev->sriov.alias_guid.ports_guid[port].wq,
+			   &dev->sriov.alias_guid.ports_guid[port].alias_guid_work, 0);
+	}
+	spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags1);
+	spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags);
+}
+
+void mlx4_ib_destroy_alias_guid_service(struct mlx4_ib_dev *dev)
+{
+	int i;
+	struct mlx4_ib_sriov *sriov = &dev->sriov;
+	struct mlx4_alias_guid_work_context *cb_ctx;
+	struct mlx4_sriov_alias_guid_port_rec_det *det;
+	struct ib_sa_query *sa_query;
+	unsigned long flags;
+
+	for (i = 0 ; i < dev->num_ports; i++) {
+		cancel_delayed_work(&dev->sriov.alias_guid.ports_guid[i].alias_guid_work);
+		det = &sriov->alias_guid.ports_guid[i];
+		spin_lock_irqsave(&sriov->alias_guid.ag_work_lock, flags);
+		while (!list_empty(&det->cb_list)) {
+			cb_ctx = list_entry(det->cb_list.next,
+					    struct mlx4_alias_guid_work_context,
+					    list);
+			sa_query = cb_ctx->sa_query;
+			cb_ctx->sa_query = NULL;
+			list_del(&cb_ctx->list);
+			spin_unlock_irqrestore(&sriov->alias_guid.ag_work_lock, flags);
+			ib_sa_cancel_query(cb_ctx->query_id, sa_query);
+			wait_for_completion(&cb_ctx->done);
+			kfree(cb_ctx);
+			spin_lock_irqsave(&sriov->alias_guid.ag_work_lock, flags);
+		}
+		spin_unlock_irqrestore(&sriov->alias_guid.ag_work_lock, flags);
+	}
+	for (i = 0 ; i < dev->num_ports; i++) {
+		flush_workqueue(dev->sriov.alias_guid.ports_guid[i].wq);
+		destroy_workqueue(dev->sriov.alias_guid.ports_guid[i].wq);
+	}
+	ib_sa_unregister_client(dev->sriov.alias_guid.sa_client);
+	kfree(dev->sriov.alias_guid.sa_client);
+}
+
+int mlx4_ib_init_alias_guid_service(struct mlx4_ib_dev *dev)
+{
+	char alias_wq_name[15];
+	int ret = 0;
+	int i, j, k;
+	union ib_gid gid;
+
+	if (!mlx4_is_master(dev->dev))
+		return 0;
+	dev->sriov.alias_guid.sa_client =
+		kzalloc(sizeof *dev->sriov.alias_guid.sa_client, GFP_KERNEL);
+	if (!dev->sriov.alias_guid.sa_client)
+		return -ENOMEM;
+
+	ib_sa_register_client(dev->sriov.alias_guid.sa_client);
+
+	spin_lock_init(&dev->sriov.alias_guid.ag_work_lock);
+
+	for (i = 1; i <= dev->num_ports; ++i) {
+		if (dev->ib_dev.query_gid(&dev->ib_dev , i, 0, &gid)) {
+			ret = -EFAULT;
+			goto err_unregister;
+		}
+	}
+
+	for (i = 0 ; i < dev->num_ports; i++) {
+		memset(&dev->sriov.alias_guid.ports_guid[i], 0,
+		       sizeof (struct mlx4_sriov_alias_guid_port_rec_det));
+		/*Check if the SM doesn't need to assign the GUIDs*/
+		for (j = 0; j < NUM_ALIAS_GUID_REC_IN_PORT; j++) {
+			if (mlx4_ib_sm_guid_assign) {
+				dev->sriov.alias_guid.ports_guid[i].
+					all_rec_per_port[j].
+					ownership = MLX4_GUID_DRIVER_ASSIGN;
+				continue;
+			}
+			dev->sriov.alias_guid.ports_guid[i].all_rec_per_port[j].
+					ownership = MLX4_GUID_NONE_ASSIGN;
+			/*mark each val as it was deleted,
+			  till the sysAdmin will give it valid val*/
+			for (k = 0; k < NUM_ALIAS_GUID_IN_REC; k++) {
+				*(__be64 *)&dev->sriov.alias_guid.ports_guid[i].
+					all_rec_per_port[j].all_recs[GUID_REC_SIZE * k] =
+						cpu_to_be64(MLX4_GUID_FOR_DELETE_VAL);
+			}
+		}
+		INIT_LIST_HEAD(&dev->sriov.alias_guid.ports_guid[i].cb_list);
+		/*prepare the records, set them to be allocated by sm*/
+		for (j = 0 ; j < NUM_ALIAS_GUID_REC_IN_PORT; j++)
+			invalidate_guid_record(dev, i + 1, j);
+
+		dev->sriov.alias_guid.ports_guid[i].parent = &dev->sriov.alias_guid;
+		dev->sriov.alias_guid.ports_guid[i].port  = i;
+		if (mlx4_ib_sm_guid_assign)
+			set_all_slaves_guids(dev, i);
+
+		snprintf(alias_wq_name, sizeof alias_wq_name, "alias_guid%d", i);
+		dev->sriov.alias_guid.ports_guid[i].wq =
+			create_singlethread_workqueue(alias_wq_name);
+		if (!dev->sriov.alias_guid.ports_guid[i].wq) {
+			ret = -ENOMEM;
+			goto err_thread;
+		}
+		INIT_DELAYED_WORK(&dev->sriov.alias_guid.ports_guid[i].alias_guid_work,
+			  alias_guid_work);
+	}
+	return 0;
+
+err_thread:
+	for (--i; i >= 0; i--) {
+		destroy_workqueue(dev->sriov.alias_guid.ports_guid[i].wq);
+		dev->sriov.alias_guid.ports_guid[i].wq = NULL;
+	}
+
+err_unregister:
+	ib_sa_unregister_client(dev->sriov.alias_guid.sa_client);
+	kfree(dev->sriov.alias_guid.sa_client);
+	dev->sriov.alias_guid.sa_client = NULL;
+	pr_err("init_alias_guid_service: Failed. (ret:%d)\n", ret);
+	return ret;
+}
diff --git a/drivers/infiniband/hw/mlx4/cm.c b/drivers/infiniband/hw/mlx4/cm.c
new file mode 100644
index 0000000..56a593e
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/cm.c
@@ -0,0 +1,478 @@
+/*
+ * Copyright (c) 2012 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <rdma/ib_mad.h>
+
+#include <linux/mlx4/cmd.h>
+#include <linux/rbtree.h>
+#include <linux/idr.h>
+#include <rdma/ib_cm.h>
+
+#include "mlx4_ib.h"
+
+#define CM_CLEANUP_CACHE_TIMEOUT  (5 * HZ)
+
+struct id_map_entry {
+	struct rb_node node;
+
+	u32 sl_cm_id;
+	u32 pv_cm_id;
+	int slave_id;
+	int scheduled_delete;
+	struct mlx4_ib_dev *dev;
+
+	struct list_head list;
+	struct delayed_work timeout;
+};
+
+struct cm_generic_msg {
+	struct ib_mad_hdr hdr;
+
+	__be32 local_comm_id;
+	__be32 remote_comm_id;
+};
+
+struct cm_sidr_generic_msg {
+	struct ib_mad_hdr hdr;
+	__be32 request_id;
+};
+
+struct cm_req_msg {
+	unsigned char unused[0x60];
+	union ib_gid primary_path_sgid;
+};
+
+
+static void set_local_comm_id(struct ib_mad *mad, u32 cm_id)
+{
+	if (mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) {
+		struct cm_sidr_generic_msg *msg =
+			(struct cm_sidr_generic_msg *)mad;
+		msg->request_id = cpu_to_be32(cm_id);
+	} else if (mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) {
+		pr_err("trying to set local_comm_id in SIDR_REP\n");
+		return;
+	} else {
+		struct cm_generic_msg *msg = (struct cm_generic_msg *)mad;
+		msg->local_comm_id = cpu_to_be32(cm_id);
+	}
+}
+
+static u32 get_local_comm_id(struct ib_mad *mad)
+{
+	if (mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) {
+		struct cm_sidr_generic_msg *msg =
+			(struct cm_sidr_generic_msg *)mad;
+		return be32_to_cpu(msg->request_id);
+	} else if (mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) {
+		pr_err("trying to set local_comm_id in SIDR_REP\n");
+		return -1;
+	} else {
+		struct cm_generic_msg *msg = (struct cm_generic_msg *)mad;
+		return be32_to_cpu(msg->local_comm_id);
+	}
+}
+
+static void set_remote_comm_id(struct ib_mad *mad, u32 cm_id)
+{
+	if (mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) {
+		struct cm_sidr_generic_msg *msg =
+			(struct cm_sidr_generic_msg *)mad;
+		msg->request_id = cpu_to_be32(cm_id);
+	} else if (mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) {
+		pr_err("trying to set remote_comm_id in SIDR_REQ\n");
+		return;
+	} else {
+		struct cm_generic_msg *msg = (struct cm_generic_msg *)mad;
+		msg->remote_comm_id = cpu_to_be32(cm_id);
+	}
+}
+
+static u32 get_remote_comm_id(struct ib_mad *mad)
+{
+	if (mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) {
+		struct cm_sidr_generic_msg *msg =
+			(struct cm_sidr_generic_msg *)mad;
+		return be32_to_cpu(msg->request_id);
+	} else if (mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) {
+		pr_err("trying to set remote_comm_id in SIDR_REQ\n");
+		return -1;
+	} else {
+		struct cm_generic_msg *msg = (struct cm_generic_msg *)mad;
+		return be32_to_cpu(msg->remote_comm_id);
+	}
+}
+
+static union ib_gid gid_from_req_msg(struct ib_device *ibdev, struct ib_mad *mad)
+{
+	struct cm_req_msg *msg = (struct cm_req_msg *)mad;
+
+	return msg->primary_path_sgid;
+}
+
+/* Lock should be taken before called */
+static struct id_map_entry *
+id_map_find_by_sl_id(struct ib_device *ibdev, u32 slave_id, u32 sl_cm_id)
+{
+	struct rb_root *sl_id_map = &to_mdev(ibdev)->sriov.sl_id_map;
+	struct rb_node *node = sl_id_map->rb_node;
+
+	while (node) {
+		struct id_map_entry *id_map_entry =
+			rb_entry(node, struct id_map_entry, node);
+
+		if (id_map_entry->sl_cm_id > sl_cm_id)
+			node = node->rb_left;
+		else if (id_map_entry->sl_cm_id < sl_cm_id)
+			node = node->rb_right;
+		else if (id_map_entry->slave_id > slave_id)
+			node = node->rb_left;
+		else if (id_map_entry->slave_id < slave_id)
+			node = node->rb_right;
+		else
+			return id_map_entry;
+	}
+	return NULL;
+}
+
+static void id_map_ent_timeout(struct work_struct *work)
+{
+	struct delayed_work *delay = to_delayed_work(work);
+	struct id_map_entry *ent = container_of(delay, struct id_map_entry, timeout);
+	struct id_map_entry *db_ent, *found_ent;
+	struct mlx4_ib_dev *dev = ent->dev;
+	struct mlx4_ib_sriov *sriov = &dev->sriov;
+	struct rb_root *sl_id_map = &sriov->sl_id_map;
+	int pv_id = (int) ent->pv_cm_id;
+
+	spin_lock(&sriov->id_map_lock);
+	db_ent = (struct id_map_entry *)idr_find(&sriov->pv_id_table, pv_id);
+	if (!db_ent)
+		goto out;
+	found_ent = id_map_find_by_sl_id(&dev->ib_dev, ent->slave_id, ent->sl_cm_id);
+	if (found_ent && found_ent == ent)
+		rb_erase(&found_ent->node, sl_id_map);
+	idr_remove(&sriov->pv_id_table, pv_id);
+
+out:
+	list_del(&ent->list);
+	spin_unlock(&sriov->id_map_lock);
+	kfree(ent);
+}
+
+static void id_map_find_del(struct ib_device *ibdev, int pv_cm_id)
+{
+	struct mlx4_ib_sriov *sriov = &to_mdev(ibdev)->sriov;
+	struct rb_root *sl_id_map = &sriov->sl_id_map;
+	struct id_map_entry *ent, *found_ent;
+
+	spin_lock(&sriov->id_map_lock);
+	ent = (struct id_map_entry *)idr_find(&sriov->pv_id_table, pv_cm_id);
+	if (!ent)
+		goto out;
+	found_ent = id_map_find_by_sl_id(ibdev, ent->slave_id, ent->sl_cm_id);
+	if (found_ent && found_ent == ent)
+		rb_erase(&found_ent->node, sl_id_map);
+	idr_remove(&sriov->pv_id_table, pv_cm_id);
+out:
+	spin_unlock(&sriov->id_map_lock);
+}
+
+static void sl_id_map_add(struct ib_device *ibdev, struct id_map_entry *new)
+{
+	struct rb_root *sl_id_map = &to_mdev(ibdev)->sriov.sl_id_map;
+	struct rb_node **link = &sl_id_map->rb_node, *parent = NULL;
+	struct id_map_entry *ent;
+	int slave_id = new->slave_id;
+	int sl_cm_id = new->sl_cm_id;
+
+	ent = id_map_find_by_sl_id(ibdev, slave_id, sl_cm_id);
+	if (ent) {
+		pr_debug("overriding existing sl_id_map entry (cm_id = %x)\n",
+			 sl_cm_id);
+
+		rb_replace_node(&ent->node, &new->node, sl_id_map);
+		return;
+	}
+
+	/* Go to the bottom of the tree */
+	while (*link) {
+		parent = *link;
+		ent = rb_entry(parent, struct id_map_entry, node);
+
+		if (ent->sl_cm_id > sl_cm_id || (ent->sl_cm_id == sl_cm_id && ent->slave_id > slave_id))
+			link = &(*link)->rb_left;
+		else
+			link = &(*link)->rb_right;
+	}
+
+	rb_link_node(&new->node, parent, link);
+	rb_insert_color(&new->node, sl_id_map);
+}
+
+static struct id_map_entry *
+id_map_alloc(struct ib_device *ibdev, int slave_id, u32 sl_cm_id)
+{
+	int ret;
+	struct id_map_entry *ent;
+	struct mlx4_ib_sriov *sriov = &to_mdev(ibdev)->sriov;
+
+	ent = kmalloc(sizeof (struct id_map_entry), GFP_KERNEL);
+	if (!ent) {
+		mlx4_ib_warn(ibdev, "Couldn't allocate id cache entry - out of memory\n");
+		return ERR_PTR(-ENOMEM);
+	}
+
+	ent->sl_cm_id = sl_cm_id;
+	ent->slave_id = slave_id;
+	ent->scheduled_delete = 0;
+	ent->dev = to_mdev(ibdev);
+	INIT_DELAYED_WORK(&ent->timeout, id_map_ent_timeout);
+
+	idr_preload(GFP_KERNEL);
+	spin_lock(&to_mdev(ibdev)->sriov.id_map_lock);
+
+	ret = idr_alloc_cyclic(&sriov->pv_id_table, ent, 0, 0, GFP_NOWAIT);
+	if (ret >= 0) {
+		ent->pv_cm_id = (u32)ret;
+		sl_id_map_add(ibdev, ent);
+		list_add_tail(&ent->list, &sriov->cm_list);
+	}
+
+	spin_unlock(&sriov->id_map_lock);
+	idr_preload_end();
+
+	if (ret >= 0)
+		return ent;
+
+	/*error flow*/
+	kfree(ent);
+	mlx4_ib_warn(ibdev, "No more space in the idr (err:0x%x)\n", ret);
+	return ERR_PTR(-ENOMEM);
+}
+
+static struct id_map_entry *
+id_map_get(struct ib_device *ibdev, int *pv_cm_id, int sl_cm_id, int slave_id)
+{
+	struct id_map_entry *ent;
+	struct mlx4_ib_sriov *sriov = &to_mdev(ibdev)->sriov;
+
+	spin_lock(&sriov->id_map_lock);
+	if (*pv_cm_id == -1) {
+		ent = id_map_find_by_sl_id(ibdev, sl_cm_id, slave_id);
+		if (ent)
+			*pv_cm_id = (int) ent->pv_cm_id;
+	} else
+		ent = (struct id_map_entry *)idr_find(&sriov->pv_id_table, *pv_cm_id);
+	spin_unlock(&sriov->id_map_lock);
+
+	return ent;
+}
+
+static void schedule_delayed(struct ib_device *ibdev, struct id_map_entry *id)
+{
+	struct mlx4_ib_sriov *sriov = &to_mdev(ibdev)->sriov;
+	unsigned long flags;
+
+	spin_lock(&sriov->id_map_lock);
+	spin_lock_irqsave(&sriov->going_down_lock, flags);
+	/*make sure that there is no schedule inside the scheduled work.*/
+	if (!sriov->is_going_down) {
+		id->scheduled_delete = 1;
+		schedule_delayed_work(&id->timeout, CM_CLEANUP_CACHE_TIMEOUT);
+	}
+	spin_unlock_irqrestore(&sriov->going_down_lock, flags);
+	spin_unlock(&sriov->id_map_lock);
+}
+
+int mlx4_ib_multiplex_cm_handler(struct ib_device *ibdev, int port, int slave_id,
+		struct ib_mad *mad)
+{
+	struct id_map_entry *id;
+	u32 sl_cm_id;
+	int pv_cm_id = -1;
+
+	if (mad->mad_hdr.attr_id == CM_REQ_ATTR_ID ||
+			mad->mad_hdr.attr_id == CM_REP_ATTR_ID ||
+			mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) {
+		sl_cm_id = get_local_comm_id(mad);
+		id = id_map_alloc(ibdev, slave_id, sl_cm_id);
+		if (IS_ERR(id)) {
+			mlx4_ib_warn(ibdev, "%s: id{slave: %d, sl_cm_id: 0x%x} Failed to id_map_alloc\n",
+				__func__, slave_id, sl_cm_id);
+			return PTR_ERR(id);
+		}
+	} else if (mad->mad_hdr.attr_id == CM_REJ_ATTR_ID ||
+		   mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) {
+		return 0;
+	} else {
+		sl_cm_id = get_local_comm_id(mad);
+		id = id_map_get(ibdev, &pv_cm_id, slave_id, sl_cm_id);
+	}
+
+	if (!id) {
+		pr_debug("id{slave: %d, sl_cm_id: 0x%x} is NULL!\n",
+			 slave_id, sl_cm_id);
+		return -EINVAL;
+	}
+
+	set_local_comm_id(mad, id->pv_cm_id);
+
+	if (mad->mad_hdr.attr_id == CM_DREQ_ATTR_ID)
+		schedule_delayed(ibdev, id);
+	else if (mad->mad_hdr.attr_id == CM_DREP_ATTR_ID)
+		id_map_find_del(ibdev, pv_cm_id);
+
+	return 0;
+}
+
+int mlx4_ib_demux_cm_handler(struct ib_device *ibdev, int port, int *slave,
+			     struct ib_mad *mad)
+{
+	u32 pv_cm_id;
+	struct id_map_entry *id;
+
+	if (mad->mad_hdr.attr_id == CM_REQ_ATTR_ID ||
+	    mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) {
+		union ib_gid gid;
+
+		if (!slave)
+			return 0;
+
+		gid = gid_from_req_msg(ibdev, mad);
+		*slave = mlx4_ib_find_real_gid(ibdev, port, gid.global.interface_id);
+		if (*slave < 0) {
+			mlx4_ib_warn(ibdev, "failed matching slave_id by gid (0x%llx)\n",
+					gid.global.interface_id);
+			return -ENOENT;
+		}
+		return 0;
+	}
+
+	pv_cm_id = get_remote_comm_id(mad);
+	id = id_map_get(ibdev, (int *)&pv_cm_id, -1, -1);
+
+	if (!id) {
+		pr_debug("Couldn't find an entry for pv_cm_id 0x%x\n", pv_cm_id);
+		return -ENOENT;
+	}
+
+	if (slave)
+		*slave = id->slave_id;
+	set_remote_comm_id(mad, id->sl_cm_id);
+
+	if (mad->mad_hdr.attr_id == CM_DREQ_ATTR_ID)
+		schedule_delayed(ibdev, id);
+	else if (mad->mad_hdr.attr_id == CM_REJ_ATTR_ID ||
+			mad->mad_hdr.attr_id == CM_DREP_ATTR_ID) {
+		id_map_find_del(ibdev, (int) pv_cm_id);
+	}
+
+	return 0;
+}
+
+void mlx4_ib_cm_paravirt_init(struct mlx4_ib_dev *dev)
+{
+	spin_lock_init(&dev->sriov.id_map_lock);
+	INIT_LIST_HEAD(&dev->sriov.cm_list);
+	dev->sriov.sl_id_map = RB_ROOT;
+	idr_init(&dev->sriov.pv_id_table);
+}
+
+/* slave = -1 ==> all slaves */
+/* TBD -- call paravirt clean for single slave.  Need for slave RESET event */
+void mlx4_ib_cm_paravirt_clean(struct mlx4_ib_dev *dev, int slave)
+{
+	struct mlx4_ib_sriov *sriov = &dev->sriov;
+	struct rb_root *sl_id_map = &sriov->sl_id_map;
+	struct list_head lh;
+	struct rb_node *nd;
+	int need_flush = 1;
+	struct id_map_entry *map, *tmp_map;
+	/* cancel all delayed work queue entries */
+	INIT_LIST_HEAD(&lh);
+	spin_lock(&sriov->id_map_lock);
+	list_for_each_entry_safe(map, tmp_map, &dev->sriov.cm_list, list) {
+		if (slave < 0 || slave == map->slave_id) {
+			if (map->scheduled_delete)
+				need_flush &= !!cancel_delayed_work(&map->timeout);
+		}
+	}
+
+	spin_unlock(&sriov->id_map_lock);
+
+	if (!need_flush)
+		flush_scheduled_work(); /* make sure all timers were flushed */
+
+	/* now, remove all leftover entries from databases*/
+	spin_lock(&sriov->id_map_lock);
+	if (slave < 0) {
+		while (rb_first(sl_id_map)) {
+			struct id_map_entry *ent =
+				rb_entry(rb_first(sl_id_map),
+					 struct id_map_entry, node);
+
+			rb_erase(&ent->node, sl_id_map);
+			idr_remove(&sriov->pv_id_table, (int) ent->pv_cm_id);
+		}
+		list_splice_init(&dev->sriov.cm_list, &lh);
+	} else {
+		/* first, move nodes belonging to slave to db remove list */
+		nd = rb_first(sl_id_map);
+		while (nd) {
+			struct id_map_entry *ent =
+				rb_entry(nd, struct id_map_entry, node);
+			nd = rb_next(nd);
+			if (ent->slave_id == slave)
+				list_move_tail(&ent->list, &lh);
+		}
+		/* remove those nodes from databases */
+		list_for_each_entry_safe(map, tmp_map, &lh, list) {
+			rb_erase(&map->node, sl_id_map);
+			idr_remove(&sriov->pv_id_table, (int) map->pv_cm_id);
+		}
+
+		/* add remaining nodes from cm_list */
+		list_for_each_entry_safe(map, tmp_map, &dev->sriov.cm_list, list) {
+			if (slave == map->slave_id)
+				list_move_tail(&map->list, &lh);
+		}
+	}
+
+	spin_unlock(&sriov->id_map_lock);
+
+	/* free any map entries left behind due to cancel_delayed_work above */
+	list_for_each_entry_safe(map, tmp_map, &lh, list) {
+		list_del(&map->list);
+		kfree(map);
+	}
+}
diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c
new file mode 100644
index 0000000..1066eec
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/cq.c
@@ -0,0 +1,924 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mlx4/cq.h>
+#include <linux/mlx4/qp.h>
+#include <linux/mlx4/srq.h>
+#include <linux/slab.h>
+
+#include "mlx4_ib.h"
+#include "user.h"
+
+static void mlx4_ib_cq_comp(struct mlx4_cq *cq)
+{
+	struct ib_cq *ibcq = &to_mibcq(cq)->ibcq;
+	ibcq->comp_handler(ibcq, ibcq->cq_context);
+}
+
+static void mlx4_ib_cq_event(struct mlx4_cq *cq, enum mlx4_event type)
+{
+	struct ib_event event;
+	struct ib_cq *ibcq;
+
+	if (type != MLX4_EVENT_TYPE_CQ_ERROR) {
+		pr_warn("Unexpected event type %d "
+		       "on CQ %06x\n", type, cq->cqn);
+		return;
+	}
+
+	ibcq = &to_mibcq(cq)->ibcq;
+	if (ibcq->event_handler) {
+		event.device     = ibcq->device;
+		event.event      = IB_EVENT_CQ_ERR;
+		event.element.cq = ibcq;
+		ibcq->event_handler(&event, ibcq->cq_context);
+	}
+}
+
+static void *get_cqe_from_buf(struct mlx4_ib_cq_buf *buf, int n)
+{
+	return mlx4_buf_offset(&buf->buf, n * buf->entry_size);
+}
+
+static void *get_cqe(struct mlx4_ib_cq *cq, int n)
+{
+	return get_cqe_from_buf(&cq->buf, n);
+}
+
+static void *get_sw_cqe(struct mlx4_ib_cq *cq, int n)
+{
+	struct mlx4_cqe *cqe = get_cqe(cq, n & cq->ibcq.cqe);
+	struct mlx4_cqe *tcqe = ((cq->buf.entry_size == 64) ? (cqe + 1) : cqe);
+
+	return (!!(tcqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^
+		!!(n & (cq->ibcq.cqe + 1))) ? NULL : cqe;
+}
+
+static struct mlx4_cqe *next_cqe_sw(struct mlx4_ib_cq *cq)
+{
+	return get_sw_cqe(cq, cq->mcq.cons_index);
+}
+
+int mlx4_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period)
+{
+	struct mlx4_ib_cq *mcq = to_mcq(cq);
+	struct mlx4_ib_dev *dev = to_mdev(cq->device);
+
+	return mlx4_cq_modify(dev->dev, &mcq->mcq, cq_count, cq_period);
+}
+
+static int mlx4_ib_alloc_cq_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq_buf *buf, int nent)
+{
+	int err;
+
+	err = mlx4_buf_alloc(dev->dev, nent * dev->dev->caps.cqe_size,
+			     PAGE_SIZE * 2, &buf->buf, GFP_KERNEL);
+
+	if (err)
+		goto out;
+
+	buf->entry_size = dev->dev->caps.cqe_size;
+	err = mlx4_mtt_init(dev->dev, buf->buf.npages, buf->buf.page_shift,
+				    &buf->mtt);
+	if (err)
+		goto err_buf;
+
+	err = mlx4_buf_write_mtt(dev->dev, &buf->mtt, &buf->buf, GFP_KERNEL);
+	if (err)
+		goto err_mtt;
+
+	return 0;
+
+err_mtt:
+	mlx4_mtt_cleanup(dev->dev, &buf->mtt);
+
+err_buf:
+	mlx4_buf_free(dev->dev, nent * buf->entry_size, &buf->buf);
+
+out:
+	return err;
+}
+
+static void mlx4_ib_free_cq_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq_buf *buf, int cqe)
+{
+	mlx4_buf_free(dev->dev, (cqe + 1) * buf->entry_size, &buf->buf);
+}
+
+static int mlx4_ib_get_cq_umem(struct mlx4_ib_dev *dev, struct ib_ucontext *context,
+			       struct mlx4_ib_cq_buf *buf, struct ib_umem **umem,
+			       u64 buf_addr, int cqe)
+{
+	int err;
+	int cqe_size = dev->dev->caps.cqe_size;
+
+	*umem = ib_umem_get(context, buf_addr, cqe * cqe_size,
+			    IB_ACCESS_LOCAL_WRITE, 1);
+	if (IS_ERR(*umem))
+		return PTR_ERR(*umem);
+
+	err = mlx4_mtt_init(dev->dev, ib_umem_page_count(*umem),
+			    ilog2((*umem)->page_size), &buf->mtt);
+	if (err)
+		goto err_buf;
+
+	err = mlx4_ib_umem_write_mtt(dev, &buf->mtt, *umem);
+	if (err)
+		goto err_mtt;
+
+	return 0;
+
+err_mtt:
+	mlx4_mtt_cleanup(dev->dev, &buf->mtt);
+
+err_buf:
+	ib_umem_release(*umem);
+
+	return err;
+}
+
+struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, int entries, int vector,
+				struct ib_ucontext *context,
+				struct ib_udata *udata)
+{
+	struct mlx4_ib_dev *dev = to_mdev(ibdev);
+	struct mlx4_ib_cq *cq;
+	struct mlx4_uar *uar;
+	int err;
+
+	if (entries < 1 || entries > dev->dev->caps.max_cqes)
+		return ERR_PTR(-EINVAL);
+
+	cq = kmalloc(sizeof *cq, GFP_KERNEL);
+	if (!cq)
+		return ERR_PTR(-ENOMEM);
+
+	entries      = roundup_pow_of_two(entries + 1);
+	cq->ibcq.cqe = entries - 1;
+	mutex_init(&cq->resize_mutex);
+	spin_lock_init(&cq->lock);
+	cq->resize_buf = NULL;
+	cq->resize_umem = NULL;
+
+	if (context) {
+		struct mlx4_ib_create_cq ucmd;
+
+		if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) {
+			err = -EFAULT;
+			goto err_cq;
+		}
+
+		err = mlx4_ib_get_cq_umem(dev, context, &cq->buf, &cq->umem,
+					  ucmd.buf_addr, entries);
+		if (err)
+			goto err_cq;
+
+		err = mlx4_ib_db_map_user(to_mucontext(context), ucmd.db_addr,
+					  &cq->db);
+		if (err)
+			goto err_mtt;
+
+		uar = &to_mucontext(context)->uar;
+	} else {
+		err = mlx4_db_alloc(dev->dev, &cq->db, 1, GFP_KERNEL);
+		if (err)
+			goto err_cq;
+
+		cq->mcq.set_ci_db  = cq->db.db;
+		cq->mcq.arm_db     = cq->db.db + 1;
+		*cq->mcq.set_ci_db = 0;
+		*cq->mcq.arm_db    = 0;
+
+		err = mlx4_ib_alloc_cq_buf(dev, &cq->buf, entries);
+		if (err)
+			goto err_db;
+
+		uar = &dev->priv_uar;
+	}
+
+	if (dev->eq_table)
+		vector = dev->eq_table[vector % ibdev->num_comp_vectors];
+
+	err = mlx4_cq_alloc(dev->dev, entries, &cq->buf.mtt, uar,
+			    cq->db.dma, &cq->mcq, vector, 0, 0);
+	if (err)
+		goto err_dbmap;
+
+	cq->mcq.comp  = mlx4_ib_cq_comp;
+	cq->mcq.event = mlx4_ib_cq_event;
+
+	if (context)
+		if (ib_copy_to_udata(udata, &cq->mcq.cqn, sizeof (__u32))) {
+			err = -EFAULT;
+			goto err_dbmap;
+		}
+
+	return &cq->ibcq;
+
+err_dbmap:
+	if (context)
+		mlx4_ib_db_unmap_user(to_mucontext(context), &cq->db);
+
+err_mtt:
+	mlx4_mtt_cleanup(dev->dev, &cq->buf.mtt);
+
+	if (context)
+		ib_umem_release(cq->umem);
+	else
+		mlx4_ib_free_cq_buf(dev, &cq->buf, cq->ibcq.cqe);
+
+err_db:
+	if (!context)
+		mlx4_db_free(dev->dev, &cq->db);
+
+err_cq:
+	kfree(cq);
+
+	return ERR_PTR(err);
+}
+
+static int mlx4_alloc_resize_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq *cq,
+				  int entries)
+{
+	int err;
+
+	if (cq->resize_buf)
+		return -EBUSY;
+
+	cq->resize_buf = kmalloc(sizeof *cq->resize_buf, GFP_ATOMIC);
+	if (!cq->resize_buf)
+		return -ENOMEM;
+
+	err = mlx4_ib_alloc_cq_buf(dev, &cq->resize_buf->buf, entries);
+	if (err) {
+		kfree(cq->resize_buf);
+		cq->resize_buf = NULL;
+		return err;
+	}
+
+	cq->resize_buf->cqe = entries - 1;
+
+	return 0;
+}
+
+static int mlx4_alloc_resize_umem(struct mlx4_ib_dev *dev, struct mlx4_ib_cq *cq,
+				   int entries, struct ib_udata *udata)
+{
+	struct mlx4_ib_resize_cq ucmd;
+	int err;
+
+	if (cq->resize_umem)
+		return -EBUSY;
+
+	if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd))
+		return -EFAULT;
+
+	cq->resize_buf = kmalloc(sizeof *cq->resize_buf, GFP_ATOMIC);
+	if (!cq->resize_buf)
+		return -ENOMEM;
+
+	err = mlx4_ib_get_cq_umem(dev, cq->umem->context, &cq->resize_buf->buf,
+				  &cq->resize_umem, ucmd.buf_addr, entries);
+	if (err) {
+		kfree(cq->resize_buf);
+		cq->resize_buf = NULL;
+		return err;
+	}
+
+	cq->resize_buf->cqe = entries - 1;
+
+	return 0;
+}
+
+static int mlx4_ib_get_outstanding_cqes(struct mlx4_ib_cq *cq)
+{
+	u32 i;
+
+	i = cq->mcq.cons_index;
+	while (get_sw_cqe(cq, i))
+		++i;
+
+	return i - cq->mcq.cons_index;
+}
+
+static void mlx4_ib_cq_resize_copy_cqes(struct mlx4_ib_cq *cq)
+{
+	struct mlx4_cqe *cqe, *new_cqe;
+	int i;
+	int cqe_size = cq->buf.entry_size;
+	int cqe_inc = cqe_size == 64 ? 1 : 0;
+
+	i = cq->mcq.cons_index;
+	cqe = get_cqe(cq, i & cq->ibcq.cqe);
+	cqe += cqe_inc;
+
+	while ((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) != MLX4_CQE_OPCODE_RESIZE) {
+		new_cqe = get_cqe_from_buf(&cq->resize_buf->buf,
+					   (i + 1) & cq->resize_buf->cqe);
+		memcpy(new_cqe, get_cqe(cq, i & cq->ibcq.cqe), cqe_size);
+		new_cqe += cqe_inc;
+
+		new_cqe->owner_sr_opcode = (cqe->owner_sr_opcode & ~MLX4_CQE_OWNER_MASK) |
+			(((i + 1) & (cq->resize_buf->cqe + 1)) ? MLX4_CQE_OWNER_MASK : 0);
+		cqe = get_cqe(cq, ++i & cq->ibcq.cqe);
+		cqe += cqe_inc;
+	}
+	++cq->mcq.cons_index;
+}
+
+int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata)
+{
+	struct mlx4_ib_dev *dev = to_mdev(ibcq->device);
+	struct mlx4_ib_cq *cq = to_mcq(ibcq);
+	struct mlx4_mtt mtt;
+	int outst_cqe;
+	int err;
+
+	mutex_lock(&cq->resize_mutex);
+
+	if (entries < 1) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	entries = roundup_pow_of_two(entries + 1);
+	if (entries == ibcq->cqe + 1) {
+		err = 0;
+		goto out;
+	}
+
+	if (entries > dev->dev->caps.max_cqes) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	if (ibcq->uobject) {
+		err = mlx4_alloc_resize_umem(dev, cq, entries, udata);
+		if (err)
+			goto out;
+	} else {
+		/* Can't be smaller than the number of outstanding CQEs */
+		outst_cqe = mlx4_ib_get_outstanding_cqes(cq);
+		if (entries < outst_cqe + 1) {
+			err = 0;
+			goto out;
+		}
+
+		err = mlx4_alloc_resize_buf(dev, cq, entries);
+		if (err)
+			goto out;
+	}
+
+	mtt = cq->buf.mtt;
+
+	err = mlx4_cq_resize(dev->dev, &cq->mcq, entries, &cq->resize_buf->buf.mtt);
+	if (err)
+		goto err_buf;
+
+	mlx4_mtt_cleanup(dev->dev, &mtt);
+	if (ibcq->uobject) {
+		cq->buf      = cq->resize_buf->buf;
+		cq->ibcq.cqe = cq->resize_buf->cqe;
+		ib_umem_release(cq->umem);
+		cq->umem     = cq->resize_umem;
+
+		kfree(cq->resize_buf);
+		cq->resize_buf = NULL;
+		cq->resize_umem = NULL;
+	} else {
+		struct mlx4_ib_cq_buf tmp_buf;
+		int tmp_cqe = 0;
+
+		spin_lock_irq(&cq->lock);
+		if (cq->resize_buf) {
+			mlx4_ib_cq_resize_copy_cqes(cq);
+			tmp_buf = cq->buf;
+			tmp_cqe = cq->ibcq.cqe;
+			cq->buf      = cq->resize_buf->buf;
+			cq->ibcq.cqe = cq->resize_buf->cqe;
+
+			kfree(cq->resize_buf);
+			cq->resize_buf = NULL;
+		}
+		spin_unlock_irq(&cq->lock);
+
+		if (tmp_cqe)
+			mlx4_ib_free_cq_buf(dev, &tmp_buf, tmp_cqe);
+	}
+
+	goto out;
+
+err_buf:
+	mlx4_mtt_cleanup(dev->dev, &cq->resize_buf->buf.mtt);
+	if (!ibcq->uobject)
+		mlx4_ib_free_cq_buf(dev, &cq->resize_buf->buf,
+				    cq->resize_buf->cqe);
+
+	kfree(cq->resize_buf);
+	cq->resize_buf = NULL;
+
+	if (cq->resize_umem) {
+		ib_umem_release(cq->resize_umem);
+		cq->resize_umem = NULL;
+	}
+
+out:
+	mutex_unlock(&cq->resize_mutex);
+
+	return err;
+}
+
+int mlx4_ib_destroy_cq(struct ib_cq *cq)
+{
+	struct mlx4_ib_dev *dev = to_mdev(cq->device);
+	struct mlx4_ib_cq *mcq = to_mcq(cq);
+
+	mlx4_cq_free(dev->dev, &mcq->mcq);
+	mlx4_mtt_cleanup(dev->dev, &mcq->buf.mtt);
+
+	if (cq->uobject) {
+		mlx4_ib_db_unmap_user(to_mucontext(cq->uobject->context), &mcq->db);
+		ib_umem_release(mcq->umem);
+	} else {
+		mlx4_ib_free_cq_buf(dev, &mcq->buf, cq->cqe);
+		mlx4_db_free(dev->dev, &mcq->db);
+	}
+
+	kfree(mcq);
+
+	return 0;
+}
+
+static void dump_cqe(void *cqe)
+{
+	__be32 *buf = cqe;
+
+	pr_debug("CQE contents %08x %08x %08x %08x %08x %08x %08x %08x\n",
+	       be32_to_cpu(buf[0]), be32_to_cpu(buf[1]), be32_to_cpu(buf[2]),
+	       be32_to_cpu(buf[3]), be32_to_cpu(buf[4]), be32_to_cpu(buf[5]),
+	       be32_to_cpu(buf[6]), be32_to_cpu(buf[7]));
+}
+
+static void mlx4_ib_handle_error_cqe(struct mlx4_err_cqe *cqe,
+				     struct ib_wc *wc)
+{
+	if (cqe->syndrome == MLX4_CQE_SYNDROME_LOCAL_QP_OP_ERR) {
+		pr_debug("local QP operation err "
+		       "(QPN %06x, WQE index %x, vendor syndrome %02x, "
+		       "opcode = %02x)\n",
+		       be32_to_cpu(cqe->my_qpn), be16_to_cpu(cqe->wqe_index),
+		       cqe->vendor_err_syndrome,
+		       cqe->owner_sr_opcode & ~MLX4_CQE_OWNER_MASK);
+		dump_cqe(cqe);
+	}
+
+	switch (cqe->syndrome) {
+	case MLX4_CQE_SYNDROME_LOCAL_LENGTH_ERR:
+		wc->status = IB_WC_LOC_LEN_ERR;
+		break;
+	case MLX4_CQE_SYNDROME_LOCAL_QP_OP_ERR:
+		wc->status = IB_WC_LOC_QP_OP_ERR;
+		break;
+	case MLX4_CQE_SYNDROME_LOCAL_PROT_ERR:
+		wc->status = IB_WC_LOC_PROT_ERR;
+		break;
+	case MLX4_CQE_SYNDROME_WR_FLUSH_ERR:
+		wc->status = IB_WC_WR_FLUSH_ERR;
+		break;
+	case MLX4_CQE_SYNDROME_MW_BIND_ERR:
+		wc->status = IB_WC_MW_BIND_ERR;
+		break;
+	case MLX4_CQE_SYNDROME_BAD_RESP_ERR:
+		wc->status = IB_WC_BAD_RESP_ERR;
+		break;
+	case MLX4_CQE_SYNDROME_LOCAL_ACCESS_ERR:
+		wc->status = IB_WC_LOC_ACCESS_ERR;
+		break;
+	case MLX4_CQE_SYNDROME_REMOTE_INVAL_REQ_ERR:
+		wc->status = IB_WC_REM_INV_REQ_ERR;
+		break;
+	case MLX4_CQE_SYNDROME_REMOTE_ACCESS_ERR:
+		wc->status = IB_WC_REM_ACCESS_ERR;
+		break;
+	case MLX4_CQE_SYNDROME_REMOTE_OP_ERR:
+		wc->status = IB_WC_REM_OP_ERR;
+		break;
+	case MLX4_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR:
+		wc->status = IB_WC_RETRY_EXC_ERR;
+		break;
+	case MLX4_CQE_SYNDROME_RNR_RETRY_EXC_ERR:
+		wc->status = IB_WC_RNR_RETRY_EXC_ERR;
+		break;
+	case MLX4_CQE_SYNDROME_REMOTE_ABORTED_ERR:
+		wc->status = IB_WC_REM_ABORT_ERR;
+		break;
+	default:
+		wc->status = IB_WC_GENERAL_ERR;
+		break;
+	}
+
+	wc->vendor_err = cqe->vendor_err_syndrome;
+}
+
+static int mlx4_ib_ipoib_csum_ok(__be16 status, __be16 checksum)
+{
+	return ((status & cpu_to_be16(MLX4_CQE_STATUS_IPV4      |
+				      MLX4_CQE_STATUS_IPV4F     |
+				      MLX4_CQE_STATUS_IPV4OPT   |
+				      MLX4_CQE_STATUS_IPV6      |
+				      MLX4_CQE_STATUS_IPOK)) ==
+		cpu_to_be16(MLX4_CQE_STATUS_IPV4        |
+			    MLX4_CQE_STATUS_IPOK))              &&
+		(status & cpu_to_be16(MLX4_CQE_STATUS_UDP       |
+				      MLX4_CQE_STATUS_TCP))     &&
+		checksum == cpu_to_be16(0xffff);
+}
+
+static int use_tunnel_data(struct mlx4_ib_qp *qp, struct mlx4_ib_cq *cq, struct ib_wc *wc,
+			   unsigned tail, struct mlx4_cqe *cqe, int is_eth)
+{
+	struct mlx4_ib_proxy_sqp_hdr *hdr;
+
+	ib_dma_sync_single_for_cpu(qp->ibqp.device,
+				   qp->sqp_proxy_rcv[tail].map,
+				   sizeof (struct mlx4_ib_proxy_sqp_hdr),
+				   DMA_FROM_DEVICE);
+	hdr = (struct mlx4_ib_proxy_sqp_hdr *) (qp->sqp_proxy_rcv[tail].addr);
+	wc->pkey_index	= be16_to_cpu(hdr->tun.pkey_index);
+	wc->src_qp	= be32_to_cpu(hdr->tun.flags_src_qp) & 0xFFFFFF;
+	wc->wc_flags   |= (hdr->tun.g_ml_path & 0x80) ? (IB_WC_GRH) : 0;
+	wc->dlid_path_bits = 0;
+
+	if (is_eth) {
+		wc->vlan_id = be16_to_cpu(hdr->tun.sl_vid);
+		memcpy(&(wc->smac[0]), (char *)&hdr->tun.mac_31_0, 4);
+		memcpy(&(wc->smac[4]), (char *)&hdr->tun.slid_mac_47_32, 2);
+		wc->wc_flags |= (IB_WC_WITH_VLAN | IB_WC_WITH_SMAC);
+	} else {
+		wc->slid        = be16_to_cpu(hdr->tun.slid_mac_47_32);
+		wc->sl          = (u8) (be16_to_cpu(hdr->tun.sl_vid) >> 12);
+	}
+
+	return 0;
+}
+
+static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq,
+			    struct mlx4_ib_qp **cur_qp,
+			    struct ib_wc *wc)
+{
+	struct mlx4_cqe *cqe;
+	struct mlx4_qp *mqp;
+	struct mlx4_ib_wq *wq;
+	struct mlx4_ib_srq *srq;
+	struct mlx4_srq *msrq = NULL;
+	int is_send;
+	int is_error;
+	int is_eth;
+	u32 g_mlpath_rqpn;
+	u16 wqe_ctr;
+	unsigned tail = 0;
+
+repoll:
+	cqe = next_cqe_sw(cq);
+	if (!cqe)
+		return -EAGAIN;
+
+	if (cq->buf.entry_size == 64)
+		cqe++;
+
+	++cq->mcq.cons_index;
+
+	/*
+	 * Make sure we read CQ entry contents after we've checked the
+	 * ownership bit.
+	 */
+	rmb();
+
+	is_send  = cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK;
+	is_error = (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
+		MLX4_CQE_OPCODE_ERROR;
+
+	if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) == MLX4_OPCODE_NOP &&
+		     is_send)) {
+		pr_warn("Completion for NOP opcode detected!\n");
+		return -EINVAL;
+	}
+
+	/* Resize CQ in progress */
+	if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) == MLX4_CQE_OPCODE_RESIZE)) {
+		if (cq->resize_buf) {
+			struct mlx4_ib_dev *dev = to_mdev(cq->ibcq.device);
+
+			mlx4_ib_free_cq_buf(dev, &cq->buf, cq->ibcq.cqe);
+			cq->buf      = cq->resize_buf->buf;
+			cq->ibcq.cqe = cq->resize_buf->cqe;
+
+			kfree(cq->resize_buf);
+			cq->resize_buf = NULL;
+		}
+
+		goto repoll;
+	}
+
+	if (!*cur_qp ||
+	    (be32_to_cpu(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK) != (*cur_qp)->mqp.qpn) {
+		/*
+		 * We do not have to take the QP table lock here,
+		 * because CQs will be locked while QPs are removed
+		 * from the table.
+		 */
+		mqp = __mlx4_qp_lookup(to_mdev(cq->ibcq.device)->dev,
+				       be32_to_cpu(cqe->vlan_my_qpn));
+		if (unlikely(!mqp)) {
+			pr_warn("CQ %06x with entry for unknown QPN %06x\n",
+			       cq->mcq.cqn, be32_to_cpu(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK);
+			return -EINVAL;
+		}
+
+		*cur_qp = to_mibqp(mqp);
+	}
+
+	wc->qp = &(*cur_qp)->ibqp;
+
+	if (wc->qp->qp_type == IB_QPT_XRC_TGT) {
+		u32 srq_num;
+		g_mlpath_rqpn = be32_to_cpu(cqe->g_mlpath_rqpn);
+		srq_num       = g_mlpath_rqpn & 0xffffff;
+		/* SRQ is also in the radix tree */
+		msrq = mlx4_srq_lookup(to_mdev(cq->ibcq.device)->dev,
+				       srq_num);
+		if (unlikely(!msrq)) {
+			pr_warn("CQ %06x with entry for unknown SRQN %06x\n",
+				cq->mcq.cqn, srq_num);
+			return -EINVAL;
+		}
+	}
+
+	if (is_send) {
+		wq = &(*cur_qp)->sq;
+		if (!(*cur_qp)->sq_signal_bits) {
+			wqe_ctr = be16_to_cpu(cqe->wqe_index);
+			wq->tail += (u16) (wqe_ctr - (u16) wq->tail);
+		}
+		wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
+		++wq->tail;
+	} else if ((*cur_qp)->ibqp.srq) {
+		srq = to_msrq((*cur_qp)->ibqp.srq);
+		wqe_ctr = be16_to_cpu(cqe->wqe_index);
+		wc->wr_id = srq->wrid[wqe_ctr];
+		mlx4_ib_free_srq_wqe(srq, wqe_ctr);
+	} else if (msrq) {
+		srq = to_mibsrq(msrq);
+		wqe_ctr = be16_to_cpu(cqe->wqe_index);
+		wc->wr_id = srq->wrid[wqe_ctr];
+		mlx4_ib_free_srq_wqe(srq, wqe_ctr);
+	} else {
+		wq	  = &(*cur_qp)->rq;
+		tail	  = wq->tail & (wq->wqe_cnt - 1);
+		wc->wr_id = wq->wrid[tail];
+		++wq->tail;
+	}
+
+	if (unlikely(is_error)) {
+		mlx4_ib_handle_error_cqe((struct mlx4_err_cqe *) cqe, wc);
+		return 0;
+	}
+
+	wc->status = IB_WC_SUCCESS;
+
+	if (is_send) {
+		wc->wc_flags = 0;
+		switch (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) {
+		case MLX4_OPCODE_RDMA_WRITE_IMM:
+			wc->wc_flags |= IB_WC_WITH_IMM;
+		case MLX4_OPCODE_RDMA_WRITE:
+			wc->opcode    = IB_WC_RDMA_WRITE;
+			break;
+		case MLX4_OPCODE_SEND_IMM:
+			wc->wc_flags |= IB_WC_WITH_IMM;
+		case MLX4_OPCODE_SEND:
+		case MLX4_OPCODE_SEND_INVAL:
+			wc->opcode    = IB_WC_SEND;
+			break;
+		case MLX4_OPCODE_RDMA_READ:
+			wc->opcode    = IB_WC_RDMA_READ;
+			wc->byte_len  = be32_to_cpu(cqe->byte_cnt);
+			break;
+		case MLX4_OPCODE_ATOMIC_CS:
+			wc->opcode    = IB_WC_COMP_SWAP;
+			wc->byte_len  = 8;
+			break;
+		case MLX4_OPCODE_ATOMIC_FA:
+			wc->opcode    = IB_WC_FETCH_ADD;
+			wc->byte_len  = 8;
+			break;
+		case MLX4_OPCODE_MASKED_ATOMIC_CS:
+			wc->opcode    = IB_WC_MASKED_COMP_SWAP;
+			wc->byte_len  = 8;
+			break;
+		case MLX4_OPCODE_MASKED_ATOMIC_FA:
+			wc->opcode    = IB_WC_MASKED_FETCH_ADD;
+			wc->byte_len  = 8;
+			break;
+		case MLX4_OPCODE_BIND_MW:
+			wc->opcode    = IB_WC_BIND_MW;
+			break;
+		case MLX4_OPCODE_LSO:
+			wc->opcode    = IB_WC_LSO;
+			break;
+		case MLX4_OPCODE_FMR:
+			wc->opcode    = IB_WC_FAST_REG_MR;
+			break;
+		case MLX4_OPCODE_LOCAL_INVAL:
+			wc->opcode    = IB_WC_LOCAL_INV;
+			break;
+		}
+	} else {
+		wc->byte_len = be32_to_cpu(cqe->byte_cnt);
+
+		switch (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) {
+		case MLX4_RECV_OPCODE_RDMA_WRITE_IMM:
+			wc->opcode	= IB_WC_RECV_RDMA_WITH_IMM;
+			wc->wc_flags	= IB_WC_WITH_IMM;
+			wc->ex.imm_data = cqe->immed_rss_invalid;
+			break;
+		case MLX4_RECV_OPCODE_SEND_INVAL:
+			wc->opcode	= IB_WC_RECV;
+			wc->wc_flags	= IB_WC_WITH_INVALIDATE;
+			wc->ex.invalidate_rkey = be32_to_cpu(cqe->immed_rss_invalid);
+			break;
+		case MLX4_RECV_OPCODE_SEND:
+			wc->opcode   = IB_WC_RECV;
+			wc->wc_flags = 0;
+			break;
+		case MLX4_RECV_OPCODE_SEND_IMM:
+			wc->opcode	= IB_WC_RECV;
+			wc->wc_flags	= IB_WC_WITH_IMM;
+			wc->ex.imm_data = cqe->immed_rss_invalid;
+			break;
+		}
+
+		is_eth = (rdma_port_get_link_layer(wc->qp->device,
+						  (*cur_qp)->port) ==
+			  IB_LINK_LAYER_ETHERNET);
+		if (mlx4_is_mfunc(to_mdev(cq->ibcq.device)->dev)) {
+			if ((*cur_qp)->mlx4_ib_qp_type &
+			    (MLX4_IB_QPT_PROXY_SMI_OWNER |
+			     MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI))
+				return use_tunnel_data(*cur_qp, cq, wc, tail,
+						       cqe, is_eth);
+		}
+
+		wc->slid	   = be16_to_cpu(cqe->rlid);
+		g_mlpath_rqpn	   = be32_to_cpu(cqe->g_mlpath_rqpn);
+		wc->src_qp	   = g_mlpath_rqpn & 0xffffff;
+		wc->dlid_path_bits = (g_mlpath_rqpn >> 24) & 0x7f;
+		wc->wc_flags	  |= g_mlpath_rqpn & 0x80000000 ? IB_WC_GRH : 0;
+		wc->pkey_index     = be32_to_cpu(cqe->immed_rss_invalid) & 0x7f;
+		wc->wc_flags	  |= mlx4_ib_ipoib_csum_ok(cqe->status,
+					cqe->checksum) ? IB_WC_IP_CSUM_OK : 0;
+		if (is_eth) {
+			wc->sl  = be16_to_cpu(cqe->sl_vid) >> 13;
+			if (be32_to_cpu(cqe->vlan_my_qpn) &
+					MLX4_CQE_VLAN_PRESENT_MASK) {
+				wc->vlan_id = be16_to_cpu(cqe->sl_vid) &
+					MLX4_CQE_VID_MASK;
+			} else {
+				wc->vlan_id = 0xffff;
+			}
+			memcpy(wc->smac, cqe->smac, ETH_ALEN);
+			wc->wc_flags |= (IB_WC_WITH_VLAN | IB_WC_WITH_SMAC);
+		} else {
+			wc->sl  = be16_to_cpu(cqe->sl_vid) >> 12;
+			wc->vlan_id = 0xffff;
+		}
+	}
+
+	return 0;
+}
+
+int mlx4_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc)
+{
+	struct mlx4_ib_cq *cq = to_mcq(ibcq);
+	struct mlx4_ib_qp *cur_qp = NULL;
+	unsigned long flags;
+	int npolled;
+	int err = 0;
+
+	spin_lock_irqsave(&cq->lock, flags);
+
+	for (npolled = 0; npolled < num_entries; ++npolled) {
+		err = mlx4_ib_poll_one(cq, &cur_qp, wc + npolled);
+		if (err)
+			break;
+	}
+
+	mlx4_cq_set_ci(&cq->mcq);
+
+	spin_unlock_irqrestore(&cq->lock, flags);
+
+	if (err == 0 || err == -EAGAIN)
+		return npolled;
+	else
+		return err;
+}
+
+int mlx4_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags)
+{
+	mlx4_cq_arm(&to_mcq(ibcq)->mcq,
+		    (flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED ?
+		    MLX4_CQ_DB_REQ_NOT_SOL : MLX4_CQ_DB_REQ_NOT,
+		    to_mdev(ibcq->device)->uar_map,
+		    MLX4_GET_DOORBELL_LOCK(&to_mdev(ibcq->device)->uar_lock));
+
+	return 0;
+}
+
+void __mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq)
+{
+	u32 prod_index;
+	int nfreed = 0;
+	struct mlx4_cqe *cqe, *dest;
+	u8 owner_bit;
+	int cqe_inc = cq->buf.entry_size == 64 ? 1 : 0;
+
+	/*
+	 * First we need to find the current producer index, so we
+	 * know where to start cleaning from.  It doesn't matter if HW
+	 * adds new entries after this loop -- the QP we're worried
+	 * about is already in RESET, so the new entries won't come
+	 * from our QP and therefore don't need to be checked.
+	 */
+	for (prod_index = cq->mcq.cons_index; get_sw_cqe(cq, prod_index); ++prod_index)
+		if (prod_index == cq->mcq.cons_index + cq->ibcq.cqe)
+			break;
+
+	/*
+	 * Now sweep backwards through the CQ, removing CQ entries
+	 * that match our QP by copying older entries on top of them.
+	 */
+	while ((int) --prod_index - (int) cq->mcq.cons_index >= 0) {
+		cqe = get_cqe(cq, prod_index & cq->ibcq.cqe);
+		cqe += cqe_inc;
+
+		if ((be32_to_cpu(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK) == qpn) {
+			if (srq && !(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK))
+				mlx4_ib_free_srq_wqe(srq, be16_to_cpu(cqe->wqe_index));
+			++nfreed;
+		} else if (nfreed) {
+			dest = get_cqe(cq, (prod_index + nfreed) & cq->ibcq.cqe);
+			dest += cqe_inc;
+
+			owner_bit = dest->owner_sr_opcode & MLX4_CQE_OWNER_MASK;
+			memcpy(dest, cqe, sizeof *cqe);
+			dest->owner_sr_opcode = owner_bit |
+				(dest->owner_sr_opcode & ~MLX4_CQE_OWNER_MASK);
+		}
+	}
+
+	if (nfreed) {
+		cq->mcq.cons_index += nfreed;
+		/*
+		 * Make sure update of buffer contents is done before
+		 * updating consumer index.
+		 */
+		wmb();
+		mlx4_cq_set_ci(&cq->mcq);
+	}
+}
+
+void mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq)
+{
+	spin_lock_irq(&cq->lock);
+	__mlx4_ib_cq_clean(cq, qpn, srq);
+	spin_unlock_irq(&cq->lock);
+}
diff --git a/drivers/infiniband/hw/mlx4/doorbell.c b/drivers/infiniband/hw/mlx4/doorbell.c
new file mode 100644
index 0000000..c517409
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/doorbell.c
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/slab.h>
+
+#include "mlx4_ib.h"
+
+struct mlx4_ib_user_db_page {
+	struct list_head	list;
+	struct ib_umem	       *umem;
+	unsigned long		user_virt;
+	int			refcnt;
+};
+
+int mlx4_ib_db_map_user(struct mlx4_ib_ucontext *context, unsigned long virt,
+			struct mlx4_db *db)
+{
+	struct mlx4_ib_user_db_page *page;
+	int err = 0;
+
+	mutex_lock(&context->db_page_mutex);
+
+	list_for_each_entry(page, &context->db_page_list, list)
+		if (page->user_virt == (virt & PAGE_MASK))
+			goto found;
+
+	page = kmalloc(sizeof *page, GFP_KERNEL);
+	if (!page) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	page->user_virt = (virt & PAGE_MASK);
+	page->refcnt    = 0;
+	page->umem      = ib_umem_get(&context->ibucontext, virt & PAGE_MASK,
+				      PAGE_SIZE, 0, 0);
+	if (IS_ERR(page->umem)) {
+		err = PTR_ERR(page->umem);
+		kfree(page);
+		goto out;
+	}
+
+	list_add(&page->list, &context->db_page_list);
+
+found:
+	db->dma = sg_dma_address(page->umem->sg_head.sgl) + (virt & ~PAGE_MASK);
+	db->u.user_page = page;
+	++page->refcnt;
+
+out:
+	mutex_unlock(&context->db_page_mutex);
+
+	return err;
+}
+
+void mlx4_ib_db_unmap_user(struct mlx4_ib_ucontext *context, struct mlx4_db *db)
+{
+	mutex_lock(&context->db_page_mutex);
+
+	if (!--db->u.user_page->refcnt) {
+		list_del(&db->u.user_page->list);
+		ib_umem_release(db->u.user_page->umem);
+		kfree(db->u.user_page);
+	}
+
+	mutex_unlock(&context->db_page_mutex);
+}
diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c
new file mode 100644
index 0000000..82a7dd8
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/mad.c
@@ -0,0 +1,2163 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <rdma/ib_mad.h>
+#include <rdma/ib_smi.h>
+#include <rdma/ib_sa.h>
+#include <rdma/ib_cache.h>
+
+#include <linux/random.h>
+#include <linux/mlx4/cmd.h>
+#include <linux/gfp.h>
+#include <rdma/ib_pma.h>
+
+#include "mlx4_ib.h"
+
+enum {
+	MLX4_IB_VENDOR_CLASS1 = 0x9,
+	MLX4_IB_VENDOR_CLASS2 = 0xa
+};
+
+#define MLX4_TUN_SEND_WRID_SHIFT 34
+#define MLX4_TUN_QPN_SHIFT 32
+#define MLX4_TUN_WRID_RECV (((u64) 1) << MLX4_TUN_SEND_WRID_SHIFT)
+#define MLX4_TUN_SET_WRID_QPN(a) (((u64) ((a) & 0x3)) << MLX4_TUN_QPN_SHIFT)
+
+#define MLX4_TUN_IS_RECV(a)  (((a) >>  MLX4_TUN_SEND_WRID_SHIFT) & 0x1)
+#define MLX4_TUN_WRID_QPN(a) (((a) >> MLX4_TUN_QPN_SHIFT) & 0x3)
+
+ /* Port mgmt change event handling */
+
+#define GET_BLK_PTR_FROM_EQE(eqe) be32_to_cpu(eqe->event.port_mgmt_change.params.tbl_change_info.block_ptr)
+#define GET_MASK_FROM_EQE(eqe) be32_to_cpu(eqe->event.port_mgmt_change.params.tbl_change_info.tbl_entries_mask)
+#define NUM_IDX_IN_PKEY_TBL_BLK 32
+#define GUID_TBL_ENTRY_SIZE 8	   /* size in bytes */
+#define GUID_TBL_BLK_NUM_ENTRIES 8
+#define GUID_TBL_BLK_SIZE (GUID_TBL_ENTRY_SIZE * GUID_TBL_BLK_NUM_ENTRIES)
+
+struct mlx4_mad_rcv_buf {
+	struct ib_grh grh;
+	u8 payload[256];
+} __packed;
+
+struct mlx4_mad_snd_buf {
+	u8 payload[256];
+} __packed;
+
+struct mlx4_tunnel_mad {
+	struct ib_grh grh;
+	struct mlx4_ib_tunnel_header hdr;
+	struct ib_mad mad;
+} __packed;
+
+struct mlx4_rcv_tunnel_mad {
+	struct mlx4_rcv_tunnel_hdr hdr;
+	struct ib_grh grh;
+	struct ib_mad mad;
+} __packed;
+
+static void handle_client_rereg_event(struct mlx4_ib_dev *dev, u8 port_num);
+static void handle_lid_change_event(struct mlx4_ib_dev *dev, u8 port_num);
+static void __propagate_pkey_ev(struct mlx4_ib_dev *dev, int port_num,
+				int block, u32 change_bitmap);
+
+__be64 mlx4_ib_gen_node_guid(void)
+{
+#define NODE_GUID_HI	((u64) (((u64)IB_OPENIB_OUI) << 40))
+	return cpu_to_be64(NODE_GUID_HI | prandom_u32());
+}
+
+__be64 mlx4_ib_get_new_demux_tid(struct mlx4_ib_demux_ctx *ctx)
+{
+	return cpu_to_be64(atomic_inc_return(&ctx->tid)) |
+		cpu_to_be64(0xff00000000000000LL);
+}
+
+int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int mad_ifc_flags,
+		 int port, struct ib_wc *in_wc, struct ib_grh *in_grh,
+		 void *in_mad, void *response_mad)
+{
+	struct mlx4_cmd_mailbox *inmailbox, *outmailbox;
+	void *inbox;
+	int err;
+	u32 in_modifier = port;
+	u8 op_modifier = 0;
+
+	inmailbox = mlx4_alloc_cmd_mailbox(dev->dev);
+	if (IS_ERR(inmailbox))
+		return PTR_ERR(inmailbox);
+	inbox = inmailbox->buf;
+
+	outmailbox = mlx4_alloc_cmd_mailbox(dev->dev);
+	if (IS_ERR(outmailbox)) {
+		mlx4_free_cmd_mailbox(dev->dev, inmailbox);
+		return PTR_ERR(outmailbox);
+	}
+
+	memcpy(inbox, in_mad, 256);
+
+	/*
+	 * Key check traps can't be generated unless we have in_wc to
+	 * tell us where to send the trap.
+	 */
+	if ((mad_ifc_flags & MLX4_MAD_IFC_IGNORE_MKEY) || !in_wc)
+		op_modifier |= 0x1;
+	if ((mad_ifc_flags & MLX4_MAD_IFC_IGNORE_BKEY) || !in_wc)
+		op_modifier |= 0x2;
+	if (mlx4_is_mfunc(dev->dev) &&
+	    (mad_ifc_flags & MLX4_MAD_IFC_NET_VIEW || in_wc))
+		op_modifier |= 0x8;
+
+	if (in_wc) {
+		struct {
+			__be32		my_qpn;
+			u32		reserved1;
+			__be32		rqpn;
+			u8		sl;
+			u8		g_path;
+			u16		reserved2[2];
+			__be16		pkey;
+			u32		reserved3[11];
+			u8		grh[40];
+		} *ext_info;
+
+		memset(inbox + 256, 0, 256);
+		ext_info = inbox + 256;
+
+		ext_info->my_qpn = cpu_to_be32(in_wc->qp->qp_num);
+		ext_info->rqpn   = cpu_to_be32(in_wc->src_qp);
+		ext_info->sl     = in_wc->sl << 4;
+		ext_info->g_path = in_wc->dlid_path_bits |
+			(in_wc->wc_flags & IB_WC_GRH ? 0x80 : 0);
+		ext_info->pkey   = cpu_to_be16(in_wc->pkey_index);
+
+		if (in_grh)
+			memcpy(ext_info->grh, in_grh, 40);
+
+		op_modifier |= 0x4;
+
+		in_modifier |= in_wc->slid << 16;
+	}
+
+	err = mlx4_cmd_box(dev->dev, inmailbox->dma, outmailbox->dma, in_modifier,
+			   mlx4_is_master(dev->dev) ? (op_modifier & ~0x8) : op_modifier,
+			   MLX4_CMD_MAD_IFC, MLX4_CMD_TIME_CLASS_C,
+			   (op_modifier & 0x8) ? MLX4_CMD_NATIVE : MLX4_CMD_WRAPPED);
+
+	if (!err)
+		memcpy(response_mad, outmailbox->buf, 256);
+
+	mlx4_free_cmd_mailbox(dev->dev, inmailbox);
+	mlx4_free_cmd_mailbox(dev->dev, outmailbox);
+
+	return err;
+}
+
+static void update_sm_ah(struct mlx4_ib_dev *dev, u8 port_num, u16 lid, u8 sl)
+{
+	struct ib_ah *new_ah;
+	struct ib_ah_attr ah_attr;
+	unsigned long flags;
+
+	if (!dev->send_agent[port_num - 1][0])
+		return;
+
+	memset(&ah_attr, 0, sizeof ah_attr);
+	ah_attr.dlid     = lid;
+	ah_attr.sl       = sl;
+	ah_attr.port_num = port_num;
+
+	new_ah = ib_create_ah(dev->send_agent[port_num - 1][0]->qp->pd,
+			      &ah_attr);
+	if (IS_ERR(new_ah))
+		return;
+
+	spin_lock_irqsave(&dev->sm_lock, flags);
+	if (dev->sm_ah[port_num - 1])
+		ib_destroy_ah(dev->sm_ah[port_num - 1]);
+	dev->sm_ah[port_num - 1] = new_ah;
+	spin_unlock_irqrestore(&dev->sm_lock, flags);
+}
+
+/*
+ * Snoop SM MADs for port info, GUID info, and  P_Key table sets, so we can
+ * synthesize LID change, Client-Rereg, GID change, and P_Key change events.
+ */
+static void smp_snoop(struct ib_device *ibdev, u8 port_num, struct ib_mad *mad,
+		      u16 prev_lid)
+{
+	struct ib_port_info *pinfo;
+	u16 lid;
+	__be16 *base;
+	u32 bn, pkey_change_bitmap;
+	int i;
+
+
+	struct mlx4_ib_dev *dev = to_mdev(ibdev);
+	if ((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED ||
+	     mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) &&
+	    mad->mad_hdr.method == IB_MGMT_METHOD_SET)
+		switch (mad->mad_hdr.attr_id) {
+		case IB_SMP_ATTR_PORT_INFO:
+			pinfo = (struct ib_port_info *) ((struct ib_smp *) mad)->data;
+			lid = be16_to_cpu(pinfo->lid);
+
+			update_sm_ah(dev, port_num,
+				     be16_to_cpu(pinfo->sm_lid),
+				     pinfo->neighbormtu_mastersmsl & 0xf);
+
+			if (pinfo->clientrereg_resv_subnetto & 0x80)
+				handle_client_rereg_event(dev, port_num);
+
+			if (prev_lid != lid)
+				handle_lid_change_event(dev, port_num);
+			break;
+
+		case IB_SMP_ATTR_PKEY_TABLE:
+			if (!mlx4_is_mfunc(dev->dev)) {
+				mlx4_ib_dispatch_event(dev, port_num,
+						       IB_EVENT_PKEY_CHANGE);
+				break;
+			}
+
+			/* at this point, we are running in the master.
+			 * Slaves do not receive SMPs.
+			 */
+			bn  = be32_to_cpu(((struct ib_smp *)mad)->attr_mod) & 0xFFFF;
+			base = (__be16 *) &(((struct ib_smp *)mad)->data[0]);
+			pkey_change_bitmap = 0;
+			for (i = 0; i < 32; i++) {
+				pr_debug("PKEY[%d] = x%x\n",
+					 i + bn*32, be16_to_cpu(base[i]));
+				if (be16_to_cpu(base[i]) !=
+				    dev->pkeys.phys_pkey_cache[port_num - 1][i + bn*32]) {
+					pkey_change_bitmap |= (1 << i);
+					dev->pkeys.phys_pkey_cache[port_num - 1][i + bn*32] =
+						be16_to_cpu(base[i]);
+				}
+			}
+			pr_debug("PKEY Change event: port=%d, "
+				 "block=0x%x, change_bitmap=0x%x\n",
+				 port_num, bn, pkey_change_bitmap);
+
+			if (pkey_change_bitmap) {
+				mlx4_ib_dispatch_event(dev, port_num,
+						       IB_EVENT_PKEY_CHANGE);
+				if (!dev->sriov.is_going_down)
+					__propagate_pkey_ev(dev, port_num, bn,
+							    pkey_change_bitmap);
+			}
+			break;
+
+		case IB_SMP_ATTR_GUID_INFO:
+			/* paravirtualized master's guid is guid 0 -- does not change */
+			if (!mlx4_is_master(dev->dev))
+				mlx4_ib_dispatch_event(dev, port_num,
+						       IB_EVENT_GID_CHANGE);
+			/*if master, notify relevant slaves*/
+			if (mlx4_is_master(dev->dev) &&
+			    !dev->sriov.is_going_down) {
+				bn = be32_to_cpu(((struct ib_smp *)mad)->attr_mod);
+				mlx4_ib_update_cache_on_guid_change(dev, bn, port_num,
+								    (u8 *)(&((struct ib_smp *)mad)->data));
+				mlx4_ib_notify_slaves_on_guid_change(dev, bn, port_num,
+								     (u8 *)(&((struct ib_smp *)mad)->data));
+			}
+			break;
+
+		default:
+			break;
+		}
+}
+
+static void __propagate_pkey_ev(struct mlx4_ib_dev *dev, int port_num,
+				int block, u32 change_bitmap)
+{
+	int i, ix, slave, err;
+	int have_event = 0;
+
+	for (slave = 0; slave < dev->dev->caps.sqp_demux; slave++) {
+		if (slave == mlx4_master_func_num(dev->dev))
+			continue;
+		if (!mlx4_is_slave_active(dev->dev, slave))
+			continue;
+
+		have_event = 0;
+		for (i = 0; i < 32; i++) {
+			if (!(change_bitmap & (1 << i)))
+				continue;
+			for (ix = 0;
+			     ix < dev->dev->caps.pkey_table_len[port_num]; ix++) {
+				if (dev->pkeys.virt2phys_pkey[slave][port_num - 1]
+				    [ix] == i + 32 * block) {
+					err = mlx4_gen_pkey_eqe(dev->dev, slave, port_num);
+					pr_debug("propagate_pkey_ev: slave %d,"
+						 " port %d, ix %d (%d)\n",
+						 slave, port_num, ix, err);
+					have_event = 1;
+					break;
+				}
+			}
+			if (have_event)
+				break;
+		}
+	}
+}
+
+static void node_desc_override(struct ib_device *dev,
+			       struct ib_mad *mad)
+{
+	unsigned long flags;
+
+	if ((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED ||
+	     mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) &&
+	    mad->mad_hdr.method == IB_MGMT_METHOD_GET_RESP &&
+	    mad->mad_hdr.attr_id == IB_SMP_ATTR_NODE_DESC) {
+		spin_lock_irqsave(&to_mdev(dev)->sm_lock, flags);
+		memcpy(((struct ib_smp *) mad)->data, dev->node_desc, 64);
+		spin_unlock_irqrestore(&to_mdev(dev)->sm_lock, flags);
+	}
+}
+
+static void forward_trap(struct mlx4_ib_dev *dev, u8 port_num, struct ib_mad *mad)
+{
+	int qpn = mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_SUBN_LID_ROUTED;
+	struct ib_mad_send_buf *send_buf;
+	struct ib_mad_agent *agent = dev->send_agent[port_num - 1][qpn];
+	int ret;
+	unsigned long flags;
+
+	if (agent) {
+		send_buf = ib_create_send_mad(agent, qpn, 0, 0, IB_MGMT_MAD_HDR,
+					      IB_MGMT_MAD_DATA, GFP_ATOMIC);
+		if (IS_ERR(send_buf))
+			return;
+		/*
+		 * We rely here on the fact that MLX QPs don't use the
+		 * address handle after the send is posted (this is
+		 * wrong following the IB spec strictly, but we know
+		 * it's OK for our devices).
+		 */
+		spin_lock_irqsave(&dev->sm_lock, flags);
+		memcpy(send_buf->mad, mad, sizeof *mad);
+		if ((send_buf->ah = dev->sm_ah[port_num - 1]))
+			ret = ib_post_send_mad(send_buf, NULL);
+		else
+			ret = -EINVAL;
+		spin_unlock_irqrestore(&dev->sm_lock, flags);
+
+		if (ret)
+			ib_free_send_mad(send_buf);
+	}
+}
+
+static int mlx4_ib_demux_sa_handler(struct ib_device *ibdev, int port, int slave,
+							     struct ib_sa_mad *sa_mad)
+{
+	int ret = 0;
+
+	/* dispatch to different sa handlers */
+	switch (be16_to_cpu(sa_mad->mad_hdr.attr_id)) {
+	case IB_SA_ATTR_MC_MEMBER_REC:
+		ret = mlx4_ib_mcg_demux_handler(ibdev, port, slave, sa_mad);
+		break;
+	default:
+		break;
+	}
+	return ret;
+}
+
+int mlx4_ib_find_real_gid(struct ib_device *ibdev, u8 port, __be64 guid)
+{
+	struct mlx4_ib_dev *dev = to_mdev(ibdev);
+	int i;
+
+	for (i = 0; i < dev->dev->caps.sqp_demux; i++) {
+		if (dev->sriov.demux[port - 1].guid_cache[i] == guid)
+			return i;
+	}
+	return -1;
+}
+
+
+static int find_slave_port_pkey_ix(struct mlx4_ib_dev *dev, int slave,
+				   u8 port, u16 pkey, u16 *ix)
+{
+	int i, ret;
+	u8 unassigned_pkey_ix, pkey_ix, partial_ix = 0xFF;
+	u16 slot_pkey;
+
+	if (slave == mlx4_master_func_num(dev->dev))
+		return ib_find_cached_pkey(&dev->ib_dev, port, pkey, ix);
+
+	unassigned_pkey_ix = dev->dev->phys_caps.pkey_phys_table_len[port] - 1;
+
+	for (i = 0; i < dev->dev->caps.pkey_table_len[port]; i++) {
+		if (dev->pkeys.virt2phys_pkey[slave][port - 1][i] == unassigned_pkey_ix)
+			continue;
+
+		pkey_ix = dev->pkeys.virt2phys_pkey[slave][port - 1][i];
+
+		ret = ib_get_cached_pkey(&dev->ib_dev, port, pkey_ix, &slot_pkey);
+		if (ret)
+			continue;
+		if ((slot_pkey & 0x7FFF) == (pkey & 0x7FFF)) {
+			if (slot_pkey & 0x8000) {
+				*ix = (u16) pkey_ix;
+				return 0;
+			} else {
+				/* take first partial pkey index found */
+				if (partial_ix == 0xFF)
+					partial_ix = pkey_ix;
+			}
+		}
+	}
+
+	if (partial_ix < 0xFF) {
+		*ix = (u16) partial_ix;
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port,
+			  enum ib_qp_type dest_qpt, struct ib_wc *wc,
+			  struct ib_grh *grh, struct ib_mad *mad)
+{
+	struct ib_sge list;
+	struct ib_send_wr wr, *bad_wr;
+	struct mlx4_ib_demux_pv_ctx *tun_ctx;
+	struct mlx4_ib_demux_pv_qp *tun_qp;
+	struct mlx4_rcv_tunnel_mad *tun_mad;
+	struct ib_ah_attr attr;
+	struct ib_ah *ah;
+	struct ib_qp *src_qp = NULL;
+	unsigned tun_tx_ix = 0;
+	int dqpn;
+	int ret = 0;
+	u16 tun_pkey_ix;
+	u16 cached_pkey;
+	u8 is_eth = dev->dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH;
+
+	if (dest_qpt > IB_QPT_GSI)
+		return -EINVAL;
+
+	tun_ctx = dev->sriov.demux[port-1].tun[slave];
+
+	/* check if proxy qp created */
+	if (!tun_ctx || tun_ctx->state != DEMUX_PV_STATE_ACTIVE)
+		return -EAGAIN;
+
+	if (!dest_qpt)
+		tun_qp = &tun_ctx->qp[0];
+	else
+		tun_qp = &tun_ctx->qp[1];
+
+	/* compute P_Key index to put in tunnel header for slave */
+	if (dest_qpt) {
+		u16 pkey_ix;
+		ret = ib_get_cached_pkey(&dev->ib_dev, port, wc->pkey_index, &cached_pkey);
+		if (ret)
+			return -EINVAL;
+
+		ret = find_slave_port_pkey_ix(dev, slave, port, cached_pkey, &pkey_ix);
+		if (ret)
+			return -EINVAL;
+		tun_pkey_ix = pkey_ix;
+	} else
+		tun_pkey_ix = dev->pkeys.virt2phys_pkey[slave][port - 1][0];
+
+	dqpn = dev->dev->phys_caps.base_proxy_sqpn + 8 * slave + port + (dest_qpt * 2) - 1;
+
+	/* get tunnel tx data buf for slave */
+	src_qp = tun_qp->qp;
+
+	/* create ah. Just need an empty one with the port num for the post send.
+	 * The driver will set the force loopback bit in post_send */
+	memset(&attr, 0, sizeof attr);
+	attr.port_num = port;
+	if (is_eth) {
+		memcpy(&attr.grh.dgid.raw[0], &grh->dgid.raw[0], 16);
+		attr.ah_flags = IB_AH_GRH;
+	}
+	ah = ib_create_ah(tun_ctx->pd, &attr);
+	if (IS_ERR(ah))
+		return -ENOMEM;
+
+	/* allocate tunnel tx buf after pass failure returns */
+	spin_lock(&tun_qp->tx_lock);
+	if (tun_qp->tx_ix_head - tun_qp->tx_ix_tail >=
+	    (MLX4_NUM_TUNNEL_BUFS - 1))
+		ret = -EAGAIN;
+	else
+		tun_tx_ix = (++tun_qp->tx_ix_head) & (MLX4_NUM_TUNNEL_BUFS - 1);
+	spin_unlock(&tun_qp->tx_lock);
+	if (ret)
+		goto out;
+
+	tun_mad = (struct mlx4_rcv_tunnel_mad *) (tun_qp->tx_ring[tun_tx_ix].buf.addr);
+	if (tun_qp->tx_ring[tun_tx_ix].ah)
+		ib_destroy_ah(tun_qp->tx_ring[tun_tx_ix].ah);
+	tun_qp->tx_ring[tun_tx_ix].ah = ah;
+	ib_dma_sync_single_for_cpu(&dev->ib_dev,
+				   tun_qp->tx_ring[tun_tx_ix].buf.map,
+				   sizeof (struct mlx4_rcv_tunnel_mad),
+				   DMA_TO_DEVICE);
+
+	/* copy over to tunnel buffer */
+	if (grh)
+		memcpy(&tun_mad->grh, grh, sizeof *grh);
+	memcpy(&tun_mad->mad, mad, sizeof *mad);
+
+	/* adjust tunnel data */
+	tun_mad->hdr.pkey_index = cpu_to_be16(tun_pkey_ix);
+	tun_mad->hdr.flags_src_qp = cpu_to_be32(wc->src_qp & 0xFFFFFF);
+	tun_mad->hdr.g_ml_path = (grh && (wc->wc_flags & IB_WC_GRH)) ? 0x80 : 0;
+
+	if (is_eth) {
+		u16 vlan = 0;
+		if (mlx4_get_slave_default_vlan(dev->dev, port, slave, &vlan,
+						NULL)) {
+			/* VST mode */
+			if (vlan != wc->vlan_id)
+				/* Packet vlan is not the VST-assigned vlan.
+				 * Drop the packet.
+				 */
+				goto out;
+			 else
+				/* Remove the vlan tag before forwarding
+				 * the packet to the VF.
+				 */
+				vlan = 0xffff;
+		} else {
+			vlan = wc->vlan_id;
+		}
+
+		tun_mad->hdr.sl_vid = cpu_to_be16(vlan);
+		memcpy((char *)&tun_mad->hdr.mac_31_0, &(wc->smac[0]), 4);
+		memcpy((char *)&tun_mad->hdr.slid_mac_47_32, &(wc->smac[4]), 2);
+	} else {
+		tun_mad->hdr.sl_vid = cpu_to_be16(((u16)(wc->sl)) << 12);
+		tun_mad->hdr.slid_mac_47_32 = cpu_to_be16(wc->slid);
+	}
+
+	ib_dma_sync_single_for_device(&dev->ib_dev,
+				      tun_qp->tx_ring[tun_tx_ix].buf.map,
+				      sizeof (struct mlx4_rcv_tunnel_mad),
+				      DMA_TO_DEVICE);
+
+	list.addr = tun_qp->tx_ring[tun_tx_ix].buf.map;
+	list.length = sizeof (struct mlx4_rcv_tunnel_mad);
+	list.lkey = tun_ctx->mr->lkey;
+
+	wr.wr.ud.ah = ah;
+	wr.wr.ud.port_num = port;
+	wr.wr.ud.remote_qkey = IB_QP_SET_QKEY;
+	wr.wr.ud.remote_qpn = dqpn;
+	wr.next = NULL;
+	wr.wr_id = ((u64) tun_tx_ix) | MLX4_TUN_SET_WRID_QPN(dest_qpt);
+	wr.sg_list = &list;
+	wr.num_sge = 1;
+	wr.opcode = IB_WR_SEND;
+	wr.send_flags = IB_SEND_SIGNALED;
+
+	ret = ib_post_send(src_qp, &wr, &bad_wr);
+out:
+	if (ret)
+		ib_destroy_ah(ah);
+	return ret;
+}
+
+static int mlx4_ib_demux_mad(struct ib_device *ibdev, u8 port,
+			struct ib_wc *wc, struct ib_grh *grh,
+			struct ib_mad *mad)
+{
+	struct mlx4_ib_dev *dev = to_mdev(ibdev);
+	int err;
+	int slave;
+	u8 *slave_id;
+	int is_eth = 0;
+
+	if (rdma_port_get_link_layer(ibdev, port) == IB_LINK_LAYER_INFINIBAND)
+		is_eth = 0;
+	else
+		is_eth = 1;
+
+	if (is_eth) {
+		if (!(wc->wc_flags & IB_WC_GRH)) {
+			mlx4_ib_warn(ibdev, "RoCE grh not present.\n");
+			return -EINVAL;
+		}
+		if (mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_CM) {
+			mlx4_ib_warn(ibdev, "RoCE mgmt class is not CM\n");
+			return -EINVAL;
+		}
+		if (mlx4_get_slave_from_roce_gid(dev->dev, port, grh->dgid.raw, &slave)) {
+			mlx4_ib_warn(ibdev, "failed matching grh\n");
+			return -ENOENT;
+		}
+		if (slave >= dev->dev->caps.sqp_demux) {
+			mlx4_ib_warn(ibdev, "slave id: %d is bigger than allowed:%d\n",
+				     slave, dev->dev->caps.sqp_demux);
+			return -ENOENT;
+		}
+
+		if (mlx4_ib_demux_cm_handler(ibdev, port, NULL, mad))
+			return 0;
+
+		err = mlx4_ib_send_to_slave(dev, slave, port, wc->qp->qp_type, wc, grh, mad);
+		if (err)
+			pr_debug("failed sending to slave %d via tunnel qp (%d)\n",
+				 slave, err);
+		return 0;
+	}
+
+	/* Initially assume that this mad is for us */
+	slave = mlx4_master_func_num(dev->dev);
+
+	/* See if the slave id is encoded in a response mad */
+	if (mad->mad_hdr.method & 0x80) {
+		slave_id = (u8 *) &mad->mad_hdr.tid;
+		slave = *slave_id;
+		if (slave != 255) /*255 indicates the dom0*/
+			*slave_id = 0; /* remap tid */
+	}
+
+	/* If a grh is present, we demux according to it */
+	if (wc->wc_flags & IB_WC_GRH) {
+		slave = mlx4_ib_find_real_gid(ibdev, port, grh->dgid.global.interface_id);
+		if (slave < 0) {
+			mlx4_ib_warn(ibdev, "failed matching grh\n");
+			return -ENOENT;
+		}
+	}
+	/* Class-specific handling */
+	switch (mad->mad_hdr.mgmt_class) {
+	case IB_MGMT_CLASS_SUBN_LID_ROUTED:
+	case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE:
+		/* 255 indicates the dom0 */
+		if (slave != 255 && slave != mlx4_master_func_num(dev->dev)) {
+			if (!mlx4_vf_smi_enabled(dev->dev, slave, port))
+				return -EPERM;
+			/* for a VF. drop unsolicited MADs */
+			if (!(mad->mad_hdr.method & IB_MGMT_METHOD_RESP)) {
+				mlx4_ib_warn(ibdev, "demux QP0. rejecting unsolicited mad for slave %d class 0x%x, method 0x%x\n",
+					     slave, mad->mad_hdr.mgmt_class,
+					     mad->mad_hdr.method);
+				return -EINVAL;
+			}
+		}
+		break;
+	case IB_MGMT_CLASS_SUBN_ADM:
+		if (mlx4_ib_demux_sa_handler(ibdev, port, slave,
+					     (struct ib_sa_mad *) mad))
+			return 0;
+		break;
+	case IB_MGMT_CLASS_CM:
+		if (mlx4_ib_demux_cm_handler(ibdev, port, &slave, mad))
+			return 0;
+		break;
+	case IB_MGMT_CLASS_DEVICE_MGMT:
+		if (mad->mad_hdr.method != IB_MGMT_METHOD_GET_RESP)
+			return 0;
+		break;
+	default:
+		/* Drop unsupported classes for slaves in tunnel mode */
+		if (slave != mlx4_master_func_num(dev->dev)) {
+			pr_debug("dropping unsupported ingress mad from class:%d "
+				 "for slave:%d\n", mad->mad_hdr.mgmt_class, slave);
+			return 0;
+		}
+	}
+	/*make sure that no slave==255 was not handled yet.*/
+	if (slave >= dev->dev->caps.sqp_demux) {
+		mlx4_ib_warn(ibdev, "slave id: %d is bigger than allowed:%d\n",
+			     slave, dev->dev->caps.sqp_demux);
+		return -ENOENT;
+	}
+
+	err = mlx4_ib_send_to_slave(dev, slave, port, wc->qp->qp_type, wc, grh, mad);
+	if (err)
+		pr_debug("failed sending to slave %d via tunnel qp (%d)\n",
+			 slave, err);
+	return 0;
+}
+
+static int ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
+			struct ib_wc *in_wc, struct ib_grh *in_grh,
+			struct ib_mad *in_mad, struct ib_mad *out_mad)
+{
+	u16 slid, prev_lid = 0;
+	int err;
+	struct ib_port_attr pattr;
+
+	if (in_wc && in_wc->qp->qp_num) {
+		pr_debug("received MAD: slid:%d sqpn:%d "
+			"dlid_bits:%d dqpn:%d wc_flags:0x%x, cls %x, mtd %x, atr %x\n",
+			in_wc->slid, in_wc->src_qp,
+			in_wc->dlid_path_bits,
+			in_wc->qp->qp_num,
+			in_wc->wc_flags,
+			in_mad->mad_hdr.mgmt_class, in_mad->mad_hdr.method,
+			be16_to_cpu(in_mad->mad_hdr.attr_id));
+		if (in_wc->wc_flags & IB_WC_GRH) {
+			pr_debug("sgid_hi:0x%016llx sgid_lo:0x%016llx\n",
+				 be64_to_cpu(in_grh->sgid.global.subnet_prefix),
+				 be64_to_cpu(in_grh->sgid.global.interface_id));
+			pr_debug("dgid_hi:0x%016llx dgid_lo:0x%016llx\n",
+				 be64_to_cpu(in_grh->dgid.global.subnet_prefix),
+				 be64_to_cpu(in_grh->dgid.global.interface_id));
+		}
+	}
+
+	slid = in_wc ? in_wc->slid : be16_to_cpu(IB_LID_PERMISSIVE);
+
+	if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP && slid == 0) {
+		forward_trap(to_mdev(ibdev), port_num, in_mad);
+		return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+	}
+
+	if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED ||
+	    in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
+		if (in_mad->mad_hdr.method   != IB_MGMT_METHOD_GET &&
+		    in_mad->mad_hdr.method   != IB_MGMT_METHOD_SET &&
+		    in_mad->mad_hdr.method   != IB_MGMT_METHOD_TRAP_REPRESS)
+			return IB_MAD_RESULT_SUCCESS;
+
+		/*
+		 * Don't process SMInfo queries -- the SMA can't handle them.
+		 */
+		if (in_mad->mad_hdr.attr_id == IB_SMP_ATTR_SM_INFO)
+			return IB_MAD_RESULT_SUCCESS;
+	} else if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT ||
+		   in_mad->mad_hdr.mgmt_class == MLX4_IB_VENDOR_CLASS1   ||
+		   in_mad->mad_hdr.mgmt_class == MLX4_IB_VENDOR_CLASS2   ||
+		   in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_CONG_MGMT) {
+		if (in_mad->mad_hdr.method  != IB_MGMT_METHOD_GET &&
+		    in_mad->mad_hdr.method  != IB_MGMT_METHOD_SET)
+			return IB_MAD_RESULT_SUCCESS;
+	} else
+		return IB_MAD_RESULT_SUCCESS;
+
+	if ((in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED ||
+	     in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) &&
+	    in_mad->mad_hdr.method == IB_MGMT_METHOD_SET &&
+	    in_mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO &&
+	    !ib_query_port(ibdev, port_num, &pattr))
+		prev_lid = pattr.lid;
+
+	err = mlx4_MAD_IFC(to_mdev(ibdev),
+			   (mad_flags & IB_MAD_IGNORE_MKEY ? MLX4_MAD_IFC_IGNORE_MKEY : 0) |
+			   (mad_flags & IB_MAD_IGNORE_BKEY ? MLX4_MAD_IFC_IGNORE_BKEY : 0) |
+			   MLX4_MAD_IFC_NET_VIEW,
+			   port_num, in_wc, in_grh, in_mad, out_mad);
+	if (err)
+		return IB_MAD_RESULT_FAILURE;
+
+	if (!out_mad->mad_hdr.status) {
+		if (!(to_mdev(ibdev)->dev->caps.flags & MLX4_DEV_CAP_FLAG_PORT_MNG_CHG_EV))
+			smp_snoop(ibdev, port_num, in_mad, prev_lid);
+		/* slaves get node desc from FW */
+		if (!mlx4_is_slave(to_mdev(ibdev)->dev))
+			node_desc_override(ibdev, out_mad);
+	}
+
+	/* set return bit in status of directed route responses */
+	if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+		out_mad->mad_hdr.status |= cpu_to_be16(1 << 15);
+
+	if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP_REPRESS)
+		/* no response for trap repress */
+		return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+
+	return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
+}
+
+static void edit_counter(struct mlx4_counter *cnt,
+					struct ib_pma_portcounters *pma_cnt)
+{
+	pma_cnt->port_xmit_data = cpu_to_be32((be64_to_cpu(cnt->tx_bytes)>>2));
+	pma_cnt->port_rcv_data  = cpu_to_be32((be64_to_cpu(cnt->rx_bytes)>>2));
+	pma_cnt->port_xmit_packets = cpu_to_be32(be64_to_cpu(cnt->tx_frames));
+	pma_cnt->port_rcv_packets  = cpu_to_be32(be64_to_cpu(cnt->rx_frames));
+}
+
+static int iboe_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
+			struct ib_wc *in_wc, struct ib_grh *in_grh,
+			struct ib_mad *in_mad, struct ib_mad *out_mad)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_ib_dev *dev = to_mdev(ibdev);
+	int err;
+	u32 inmod = dev->counters[port_num - 1] & 0xffff;
+	u8 mode;
+
+	if (in_mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_PERF_MGMT)
+		return -EINVAL;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev->dev);
+	if (IS_ERR(mailbox))
+		return IB_MAD_RESULT_FAILURE;
+
+	err = mlx4_cmd_box(dev->dev, 0, mailbox->dma, inmod, 0,
+			   MLX4_CMD_QUERY_IF_STAT, MLX4_CMD_TIME_CLASS_C,
+			   MLX4_CMD_WRAPPED);
+	if (err)
+		err = IB_MAD_RESULT_FAILURE;
+	else {
+		memset(out_mad->data, 0, sizeof out_mad->data);
+		mode = ((struct mlx4_counter *)mailbox->buf)->counter_mode;
+		switch (mode & 0xf) {
+		case 0:
+			edit_counter(mailbox->buf,
+						(void *)(out_mad->data + 40));
+			err = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
+			break;
+		default:
+			err = IB_MAD_RESULT_FAILURE;
+		}
+	}
+
+	mlx4_free_cmd_mailbox(dev->dev, mailbox);
+
+	return err;
+}
+
+int mlx4_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
+			struct ib_wc *in_wc, struct ib_grh *in_grh,
+			struct ib_mad *in_mad, struct ib_mad *out_mad)
+{
+	switch (rdma_port_get_link_layer(ibdev, port_num)) {
+	case IB_LINK_LAYER_INFINIBAND:
+		return ib_process_mad(ibdev, mad_flags, port_num, in_wc,
+				      in_grh, in_mad, out_mad);
+	case IB_LINK_LAYER_ETHERNET:
+		return iboe_process_mad(ibdev, mad_flags, port_num, in_wc,
+					  in_grh, in_mad, out_mad);
+	default:
+		return -EINVAL;
+	}
+}
+
+static void send_handler(struct ib_mad_agent *agent,
+			 struct ib_mad_send_wc *mad_send_wc)
+{
+	if (mad_send_wc->send_buf->context[0])
+		ib_destroy_ah(mad_send_wc->send_buf->context[0]);
+	ib_free_send_mad(mad_send_wc->send_buf);
+}
+
+int mlx4_ib_mad_init(struct mlx4_ib_dev *dev)
+{
+	struct ib_mad_agent *agent;
+	int p, q;
+	int ret;
+	enum rdma_link_layer ll;
+
+	for (p = 0; p < dev->num_ports; ++p) {
+		ll = rdma_port_get_link_layer(&dev->ib_dev, p + 1);
+		for (q = 0; q <= 1; ++q) {
+			if (ll == IB_LINK_LAYER_INFINIBAND) {
+				agent = ib_register_mad_agent(&dev->ib_dev, p + 1,
+							      q ? IB_QPT_GSI : IB_QPT_SMI,
+							      NULL, 0, send_handler,
+							      NULL, NULL, 0);
+				if (IS_ERR(agent)) {
+					ret = PTR_ERR(agent);
+					goto err;
+				}
+				dev->send_agent[p][q] = agent;
+			} else
+				dev->send_agent[p][q] = NULL;
+		}
+	}
+
+	return 0;
+
+err:
+	for (p = 0; p < dev->num_ports; ++p)
+		for (q = 0; q <= 1; ++q)
+			if (dev->send_agent[p][q])
+				ib_unregister_mad_agent(dev->send_agent[p][q]);
+
+	return ret;
+}
+
+void mlx4_ib_mad_cleanup(struct mlx4_ib_dev *dev)
+{
+	struct ib_mad_agent *agent;
+	int p, q;
+
+	for (p = 0; p < dev->num_ports; ++p) {
+		for (q = 0; q <= 1; ++q) {
+			agent = dev->send_agent[p][q];
+			if (agent) {
+				dev->send_agent[p][q] = NULL;
+				ib_unregister_mad_agent(agent);
+			}
+		}
+
+		if (dev->sm_ah[p])
+			ib_destroy_ah(dev->sm_ah[p]);
+	}
+}
+
+static void handle_lid_change_event(struct mlx4_ib_dev *dev, u8 port_num)
+{
+	mlx4_ib_dispatch_event(dev, port_num, IB_EVENT_LID_CHANGE);
+
+	if (mlx4_is_master(dev->dev) && !dev->sriov.is_going_down)
+		mlx4_gen_slaves_port_mgt_ev(dev->dev, port_num,
+					    MLX4_EQ_PORT_INFO_LID_CHANGE_MASK);
+}
+
+static void handle_client_rereg_event(struct mlx4_ib_dev *dev, u8 port_num)
+{
+	/* re-configure the alias-guid and mcg's */
+	if (mlx4_is_master(dev->dev)) {
+		mlx4_ib_invalidate_all_guid_record(dev, port_num);
+
+		if (!dev->sriov.is_going_down) {
+			mlx4_ib_mcg_port_cleanup(&dev->sriov.demux[port_num - 1], 0);
+			mlx4_gen_slaves_port_mgt_ev(dev->dev, port_num,
+						    MLX4_EQ_PORT_INFO_CLIENT_REREG_MASK);
+		}
+	}
+	mlx4_ib_dispatch_event(dev, port_num, IB_EVENT_CLIENT_REREGISTER);
+}
+
+static void propagate_pkey_ev(struct mlx4_ib_dev *dev, int port_num,
+			      struct mlx4_eqe *eqe)
+{
+	__propagate_pkey_ev(dev, port_num, GET_BLK_PTR_FROM_EQE(eqe),
+			    GET_MASK_FROM_EQE(eqe));
+}
+
+static void handle_slaves_guid_change(struct mlx4_ib_dev *dev, u8 port_num,
+				      u32 guid_tbl_blk_num, u32 change_bitmap)
+{
+	struct ib_smp *in_mad  = NULL;
+	struct ib_smp *out_mad  = NULL;
+	u16 i;
+
+	if (!mlx4_is_mfunc(dev->dev) || !mlx4_is_master(dev->dev))
+		return;
+
+	in_mad  = kmalloc(sizeof *in_mad, GFP_KERNEL);
+	out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+	if (!in_mad || !out_mad) {
+		mlx4_ib_warn(&dev->ib_dev, "failed to allocate memory for guid info mads\n");
+		goto out;
+	}
+
+	guid_tbl_blk_num  *= 4;
+
+	for (i = 0; i < 4; i++) {
+		if (change_bitmap && (!((change_bitmap >> (8 * i)) & 0xff)))
+			continue;
+		memset(in_mad, 0, sizeof *in_mad);
+		memset(out_mad, 0, sizeof *out_mad);
+
+		in_mad->base_version  = 1;
+		in_mad->mgmt_class    = IB_MGMT_CLASS_SUBN_LID_ROUTED;
+		in_mad->class_version = 1;
+		in_mad->method        = IB_MGMT_METHOD_GET;
+		in_mad->attr_id       = IB_SMP_ATTR_GUID_INFO;
+		in_mad->attr_mod      = cpu_to_be32(guid_tbl_blk_num + i);
+
+		if (mlx4_MAD_IFC(dev,
+				 MLX4_MAD_IFC_IGNORE_KEYS | MLX4_MAD_IFC_NET_VIEW,
+				 port_num, NULL, NULL, in_mad, out_mad)) {
+			mlx4_ib_warn(&dev->ib_dev, "Failed in get GUID INFO MAD_IFC\n");
+			goto out;
+		}
+
+		mlx4_ib_update_cache_on_guid_change(dev, guid_tbl_blk_num + i,
+						    port_num,
+						    (u8 *)(&((struct ib_smp *)out_mad)->data));
+		mlx4_ib_notify_slaves_on_guid_change(dev, guid_tbl_blk_num + i,
+						     port_num,
+						     (u8 *)(&((struct ib_smp *)out_mad)->data));
+	}
+
+out:
+	kfree(in_mad);
+	kfree(out_mad);
+	return;
+}
+
+void handle_port_mgmt_change_event(struct work_struct *work)
+{
+	struct ib_event_work *ew = container_of(work, struct ib_event_work, work);
+	struct mlx4_ib_dev *dev = ew->ib_dev;
+	struct mlx4_eqe *eqe = &(ew->ib_eqe);
+	u8 port = eqe->event.port_mgmt_change.port;
+	u32 changed_attr;
+	u32 tbl_block;
+	u32 change_bitmap;
+
+	switch (eqe->subtype) {
+	case MLX4_DEV_PMC_SUBTYPE_PORT_INFO:
+		changed_attr = be32_to_cpu(eqe->event.port_mgmt_change.params.port_info.changed_attr);
+
+		/* Update the SM ah - This should be done before handling
+		   the other changed attributes so that MADs can be sent to the SM */
+		if (changed_attr & MSTR_SM_CHANGE_MASK) {
+			u16 lid = be16_to_cpu(eqe->event.port_mgmt_change.params.port_info.mstr_sm_lid);
+			u8 sl = eqe->event.port_mgmt_change.params.port_info.mstr_sm_sl & 0xf;
+			update_sm_ah(dev, port, lid, sl);
+		}
+
+		/* Check if it is a lid change event */
+		if (changed_attr & MLX4_EQ_PORT_INFO_LID_CHANGE_MASK)
+			handle_lid_change_event(dev, port);
+
+		/* Generate GUID changed event */
+		if (changed_attr & MLX4_EQ_PORT_INFO_GID_PFX_CHANGE_MASK) {
+			mlx4_ib_dispatch_event(dev, port, IB_EVENT_GID_CHANGE);
+			/*if master, notify all slaves*/
+			if (mlx4_is_master(dev->dev))
+				mlx4_gen_slaves_port_mgt_ev(dev->dev, port,
+							    MLX4_EQ_PORT_INFO_GID_PFX_CHANGE_MASK);
+		}
+
+		if (changed_attr & MLX4_EQ_PORT_INFO_CLIENT_REREG_MASK)
+			handle_client_rereg_event(dev, port);
+		break;
+
+	case MLX4_DEV_PMC_SUBTYPE_PKEY_TABLE:
+		mlx4_ib_dispatch_event(dev, port, IB_EVENT_PKEY_CHANGE);
+		if (mlx4_is_master(dev->dev) && !dev->sriov.is_going_down)
+			propagate_pkey_ev(dev, port, eqe);
+		break;
+	case MLX4_DEV_PMC_SUBTYPE_GUID_INFO:
+		/* paravirtualized master's guid is guid 0 -- does not change */
+		if (!mlx4_is_master(dev->dev))
+			mlx4_ib_dispatch_event(dev, port, IB_EVENT_GID_CHANGE);
+		/*if master, notify relevant slaves*/
+		else if (!dev->sriov.is_going_down) {
+			tbl_block = GET_BLK_PTR_FROM_EQE(eqe);
+			change_bitmap = GET_MASK_FROM_EQE(eqe);
+			handle_slaves_guid_change(dev, port, tbl_block, change_bitmap);
+		}
+		break;
+	default:
+		pr_warn("Unsupported subtype 0x%x for "
+			"Port Management Change event\n", eqe->subtype);
+	}
+
+	kfree(ew);
+}
+
+void mlx4_ib_dispatch_event(struct mlx4_ib_dev *dev, u8 port_num,
+			    enum ib_event_type type)
+{
+	struct ib_event event;
+
+	event.device		= &dev->ib_dev;
+	event.element.port_num	= port_num;
+	event.event		= type;
+
+	ib_dispatch_event(&event);
+}
+
+static void mlx4_ib_tunnel_comp_handler(struct ib_cq *cq, void *arg)
+{
+	unsigned long flags;
+	struct mlx4_ib_demux_pv_ctx *ctx = cq->cq_context;
+	struct mlx4_ib_dev *dev = to_mdev(ctx->ib_dev);
+	spin_lock_irqsave(&dev->sriov.going_down_lock, flags);
+	if (!dev->sriov.is_going_down && ctx->state == DEMUX_PV_STATE_ACTIVE)
+		queue_work(ctx->wq, &ctx->work);
+	spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags);
+}
+
+static int mlx4_ib_post_pv_qp_buf(struct mlx4_ib_demux_pv_ctx *ctx,
+				  struct mlx4_ib_demux_pv_qp *tun_qp,
+				  int index)
+{
+	struct ib_sge sg_list;
+	struct ib_recv_wr recv_wr, *bad_recv_wr;
+	int size;
+
+	size = (tun_qp->qp->qp_type == IB_QPT_UD) ?
+		sizeof (struct mlx4_tunnel_mad) : sizeof (struct mlx4_mad_rcv_buf);
+
+	sg_list.addr = tun_qp->ring[index].map;
+	sg_list.length = size;
+	sg_list.lkey = ctx->mr->lkey;
+
+	recv_wr.next = NULL;
+	recv_wr.sg_list = &sg_list;
+	recv_wr.num_sge = 1;
+	recv_wr.wr_id = (u64) index | MLX4_TUN_WRID_RECV |
+		MLX4_TUN_SET_WRID_QPN(tun_qp->proxy_qpt);
+	ib_dma_sync_single_for_device(ctx->ib_dev, tun_qp->ring[index].map,
+				      size, DMA_FROM_DEVICE);
+	return ib_post_recv(tun_qp->qp, &recv_wr, &bad_recv_wr);
+}
+
+static int mlx4_ib_multiplex_sa_handler(struct ib_device *ibdev, int port,
+		int slave, struct ib_sa_mad *sa_mad)
+{
+	int ret = 0;
+
+	/* dispatch to different sa handlers */
+	switch (be16_to_cpu(sa_mad->mad_hdr.attr_id)) {
+	case IB_SA_ATTR_MC_MEMBER_REC:
+		ret = mlx4_ib_mcg_multiplex_handler(ibdev, port, slave, sa_mad);
+		break;
+	default:
+		break;
+	}
+	return ret;
+}
+
+static int is_proxy_qp0(struct mlx4_ib_dev *dev, int qpn, int slave)
+{
+	int proxy_start = dev->dev->phys_caps.base_proxy_sqpn + 8 * slave;
+
+	return (qpn >= proxy_start && qpn <= proxy_start + 1);
+}
+
+
+int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port,
+			 enum ib_qp_type dest_qpt, u16 pkey_index,
+			 u32 remote_qpn, u32 qkey, struct ib_ah_attr *attr,
+			 u8 *s_mac, struct ib_mad *mad)
+{
+	struct ib_sge list;
+	struct ib_send_wr wr, *bad_wr;
+	struct mlx4_ib_demux_pv_ctx *sqp_ctx;
+	struct mlx4_ib_demux_pv_qp *sqp;
+	struct mlx4_mad_snd_buf *sqp_mad;
+	struct ib_ah *ah;
+	struct ib_qp *send_qp = NULL;
+	unsigned wire_tx_ix = 0;
+	int ret = 0;
+	u16 wire_pkey_ix;
+	int src_qpnum;
+	u8 sgid_index;
+
+
+	sqp_ctx = dev->sriov.sqps[port-1];
+
+	/* check if proxy qp created */
+	if (!sqp_ctx || sqp_ctx->state != DEMUX_PV_STATE_ACTIVE)
+		return -EAGAIN;
+
+	if (dest_qpt == IB_QPT_SMI) {
+		src_qpnum = 0;
+		sqp = &sqp_ctx->qp[0];
+		wire_pkey_ix = dev->pkeys.virt2phys_pkey[slave][port - 1][0];
+	} else {
+		src_qpnum = 1;
+		sqp = &sqp_ctx->qp[1];
+		wire_pkey_ix = dev->pkeys.virt2phys_pkey[slave][port - 1][pkey_index];
+	}
+
+	send_qp = sqp->qp;
+
+	/* create ah */
+	sgid_index = attr->grh.sgid_index;
+	attr->grh.sgid_index = 0;
+	ah = ib_create_ah(sqp_ctx->pd, attr);
+	if (IS_ERR(ah))
+		return -ENOMEM;
+	attr->grh.sgid_index = sgid_index;
+	to_mah(ah)->av.ib.gid_index = sgid_index;
+	/* get rid of force-loopback bit */
+	to_mah(ah)->av.ib.port_pd &= cpu_to_be32(0x7FFFFFFF);
+	spin_lock(&sqp->tx_lock);
+	if (sqp->tx_ix_head - sqp->tx_ix_tail >=
+	    (MLX4_NUM_TUNNEL_BUFS - 1))
+		ret = -EAGAIN;
+	else
+		wire_tx_ix = (++sqp->tx_ix_head) & (MLX4_NUM_TUNNEL_BUFS - 1);
+	spin_unlock(&sqp->tx_lock);
+	if (ret)
+		goto out;
+
+	sqp_mad = (struct mlx4_mad_snd_buf *) (sqp->tx_ring[wire_tx_ix].buf.addr);
+	if (sqp->tx_ring[wire_tx_ix].ah)
+		ib_destroy_ah(sqp->tx_ring[wire_tx_ix].ah);
+	sqp->tx_ring[wire_tx_ix].ah = ah;
+	ib_dma_sync_single_for_cpu(&dev->ib_dev,
+				   sqp->tx_ring[wire_tx_ix].buf.map,
+				   sizeof (struct mlx4_mad_snd_buf),
+				   DMA_TO_DEVICE);
+
+	memcpy(&sqp_mad->payload, mad, sizeof *mad);
+
+	ib_dma_sync_single_for_device(&dev->ib_dev,
+				      sqp->tx_ring[wire_tx_ix].buf.map,
+				      sizeof (struct mlx4_mad_snd_buf),
+				      DMA_TO_DEVICE);
+
+	list.addr = sqp->tx_ring[wire_tx_ix].buf.map;
+	list.length = sizeof (struct mlx4_mad_snd_buf);
+	list.lkey = sqp_ctx->mr->lkey;
+
+	wr.wr.ud.ah = ah;
+	wr.wr.ud.port_num = port;
+	wr.wr.ud.pkey_index = wire_pkey_ix;
+	wr.wr.ud.remote_qkey = qkey;
+	wr.wr.ud.remote_qpn = remote_qpn;
+	wr.next = NULL;
+	wr.wr_id = ((u64) wire_tx_ix) | MLX4_TUN_SET_WRID_QPN(src_qpnum);
+	wr.sg_list = &list;
+	wr.num_sge = 1;
+	wr.opcode = IB_WR_SEND;
+	wr.send_flags = IB_SEND_SIGNALED;
+	if (s_mac)
+		memcpy(to_mah(ah)->av.eth.s_mac, s_mac, 6);
+
+
+	ret = ib_post_send(send_qp, &wr, &bad_wr);
+out:
+	if (ret)
+		ib_destroy_ah(ah);
+	return ret;
+}
+
+static int get_slave_base_gid_ix(struct mlx4_ib_dev *dev, int slave, int port)
+{
+	if (rdma_port_get_link_layer(&dev->ib_dev, port) == IB_LINK_LAYER_INFINIBAND)
+		return slave;
+	return mlx4_get_base_gid_ix(dev->dev, slave, port);
+}
+
+static void fill_in_real_sgid_index(struct mlx4_ib_dev *dev, int slave, int port,
+				    struct ib_ah_attr *ah_attr)
+{
+	if (rdma_port_get_link_layer(&dev->ib_dev, port) == IB_LINK_LAYER_INFINIBAND)
+		ah_attr->grh.sgid_index = slave;
+	else
+		ah_attr->grh.sgid_index += get_slave_base_gid_ix(dev, slave, port);
+}
+
+static void mlx4_ib_multiplex_mad(struct mlx4_ib_demux_pv_ctx *ctx, struct ib_wc *wc)
+{
+	struct mlx4_ib_dev *dev = to_mdev(ctx->ib_dev);
+	struct mlx4_ib_demux_pv_qp *tun_qp = &ctx->qp[MLX4_TUN_WRID_QPN(wc->wr_id)];
+	int wr_ix = wc->wr_id & (MLX4_NUM_TUNNEL_BUFS - 1);
+	struct mlx4_tunnel_mad *tunnel = tun_qp->ring[wr_ix].addr;
+	struct mlx4_ib_ah ah;
+	struct ib_ah_attr ah_attr;
+	u8 *slave_id;
+	int slave;
+	int port;
+
+	/* Get slave that sent this packet */
+	if (wc->src_qp < dev->dev->phys_caps.base_proxy_sqpn ||
+	    wc->src_qp >= dev->dev->phys_caps.base_proxy_sqpn + 8 * MLX4_MFUNC_MAX ||
+	    (wc->src_qp & 0x1) != ctx->port - 1 ||
+	    wc->src_qp & 0x4) {
+		mlx4_ib_warn(ctx->ib_dev, "can't multiplex bad sqp:%d\n", wc->src_qp);
+		return;
+	}
+	slave = ((wc->src_qp & ~0x7) - dev->dev->phys_caps.base_proxy_sqpn) / 8;
+	if (slave != ctx->slave) {
+		mlx4_ib_warn(ctx->ib_dev, "can't multiplex bad sqp:%d: "
+			     "belongs to another slave\n", wc->src_qp);
+		return;
+	}
+
+	/* Map transaction ID */
+	ib_dma_sync_single_for_cpu(ctx->ib_dev, tun_qp->ring[wr_ix].map,
+				   sizeof (struct mlx4_tunnel_mad),
+				   DMA_FROM_DEVICE);
+	switch (tunnel->mad.mad_hdr.method) {
+	case IB_MGMT_METHOD_SET:
+	case IB_MGMT_METHOD_GET:
+	case IB_MGMT_METHOD_REPORT:
+	case IB_SA_METHOD_GET_TABLE:
+	case IB_SA_METHOD_DELETE:
+	case IB_SA_METHOD_GET_MULTI:
+	case IB_SA_METHOD_GET_TRACE_TBL:
+		slave_id = (u8 *) &tunnel->mad.mad_hdr.tid;
+		if (*slave_id) {
+			mlx4_ib_warn(ctx->ib_dev, "egress mad has non-null tid msb:%d "
+				     "class:%d slave:%d\n", *slave_id,
+				     tunnel->mad.mad_hdr.mgmt_class, slave);
+			return;
+		} else
+			*slave_id = slave;
+	default:
+		/* nothing */;
+	}
+
+	/* Class-specific handling */
+	switch (tunnel->mad.mad_hdr.mgmt_class) {
+	case IB_MGMT_CLASS_SUBN_LID_ROUTED:
+	case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE:
+		if (slave != mlx4_master_func_num(dev->dev) &&
+		    !mlx4_vf_smi_enabled(dev->dev, slave, ctx->port))
+			return;
+		break;
+	case IB_MGMT_CLASS_SUBN_ADM:
+		if (mlx4_ib_multiplex_sa_handler(ctx->ib_dev, ctx->port, slave,
+			      (struct ib_sa_mad *) &tunnel->mad))
+			return;
+		break;
+	case IB_MGMT_CLASS_CM:
+		if (mlx4_ib_multiplex_cm_handler(ctx->ib_dev, ctx->port, slave,
+			      (struct ib_mad *) &tunnel->mad))
+			return;
+		break;
+	case IB_MGMT_CLASS_DEVICE_MGMT:
+		if (tunnel->mad.mad_hdr.method != IB_MGMT_METHOD_GET &&
+		    tunnel->mad.mad_hdr.method != IB_MGMT_METHOD_SET)
+			return;
+		break;
+	default:
+		/* Drop unsupported classes for slaves in tunnel mode */
+		if (slave != mlx4_master_func_num(dev->dev)) {
+			mlx4_ib_warn(ctx->ib_dev, "dropping unsupported egress mad from class:%d "
+				     "for slave:%d\n", tunnel->mad.mad_hdr.mgmt_class, slave);
+			return;
+		}
+	}
+
+	/* We are using standard ib_core services to send the mad, so generate a
+	 * stadard address handle by decoding the tunnelled mlx4_ah fields */
+	memcpy(&ah.av, &tunnel->hdr.av, sizeof (struct mlx4_av));
+	ah.ibah.device = ctx->ib_dev;
+	mlx4_ib_query_ah(&ah.ibah, &ah_attr);
+	if (ah_attr.ah_flags & IB_AH_GRH)
+		fill_in_real_sgid_index(dev, slave, ctx->port, &ah_attr);
+
+	port = mlx4_slave_convert_port(dev->dev, slave, ah_attr.port_num);
+	if (port < 0)
+		return;
+	ah_attr.port_num = port;
+	memcpy(ah_attr.dmac, tunnel->hdr.mac, 6);
+	ah_attr.vlan_id = be16_to_cpu(tunnel->hdr.vlan);
+	/* if slave have default vlan use it */
+	mlx4_get_slave_default_vlan(dev->dev, ctx->port, slave,
+				    &ah_attr.vlan_id, &ah_attr.sl);
+
+	mlx4_ib_send_to_wire(dev, slave, ctx->port,
+			     is_proxy_qp0(dev, wc->src_qp, slave) ?
+			     IB_QPT_SMI : IB_QPT_GSI,
+			     be16_to_cpu(tunnel->hdr.pkey_index),
+			     be32_to_cpu(tunnel->hdr.remote_qpn),
+			     be32_to_cpu(tunnel->hdr.qkey),
+			     &ah_attr, wc->smac, &tunnel->mad);
+}
+
+static int mlx4_ib_alloc_pv_bufs(struct mlx4_ib_demux_pv_ctx *ctx,
+				 enum ib_qp_type qp_type, int is_tun)
+{
+	int i;
+	struct mlx4_ib_demux_pv_qp *tun_qp;
+	int rx_buf_size, tx_buf_size;
+
+	if (qp_type > IB_QPT_GSI)
+		return -EINVAL;
+
+	tun_qp = &ctx->qp[qp_type];
+
+	tun_qp->ring = kzalloc(sizeof (struct mlx4_ib_buf) * MLX4_NUM_TUNNEL_BUFS,
+			       GFP_KERNEL);
+	if (!tun_qp->ring)
+		return -ENOMEM;
+
+	tun_qp->tx_ring = kcalloc(MLX4_NUM_TUNNEL_BUFS,
+				  sizeof (struct mlx4_ib_tun_tx_buf),
+				  GFP_KERNEL);
+	if (!tun_qp->tx_ring) {
+		kfree(tun_qp->ring);
+		tun_qp->ring = NULL;
+		return -ENOMEM;
+	}
+
+	if (is_tun) {
+		rx_buf_size = sizeof (struct mlx4_tunnel_mad);
+		tx_buf_size = sizeof (struct mlx4_rcv_tunnel_mad);
+	} else {
+		rx_buf_size = sizeof (struct mlx4_mad_rcv_buf);
+		tx_buf_size = sizeof (struct mlx4_mad_snd_buf);
+	}
+
+	for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) {
+		tun_qp->ring[i].addr = kmalloc(rx_buf_size, GFP_KERNEL);
+		if (!tun_qp->ring[i].addr)
+			goto err;
+		tun_qp->ring[i].map = ib_dma_map_single(ctx->ib_dev,
+							tun_qp->ring[i].addr,
+							rx_buf_size,
+							DMA_FROM_DEVICE);
+	}
+
+	for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) {
+		tun_qp->tx_ring[i].buf.addr =
+			kmalloc(tx_buf_size, GFP_KERNEL);
+		if (!tun_qp->tx_ring[i].buf.addr)
+			goto tx_err;
+		tun_qp->tx_ring[i].buf.map =
+			ib_dma_map_single(ctx->ib_dev,
+					  tun_qp->tx_ring[i].buf.addr,
+					  tx_buf_size,
+					  DMA_TO_DEVICE);
+		tun_qp->tx_ring[i].ah = NULL;
+	}
+	spin_lock_init(&tun_qp->tx_lock);
+	tun_qp->tx_ix_head = 0;
+	tun_qp->tx_ix_tail = 0;
+	tun_qp->proxy_qpt = qp_type;
+
+	return 0;
+
+tx_err:
+	while (i > 0) {
+		--i;
+		ib_dma_unmap_single(ctx->ib_dev, tun_qp->tx_ring[i].buf.map,
+				    tx_buf_size, DMA_TO_DEVICE);
+		kfree(tun_qp->tx_ring[i].buf.addr);
+	}
+	kfree(tun_qp->tx_ring);
+	tun_qp->tx_ring = NULL;
+	i = MLX4_NUM_TUNNEL_BUFS;
+err:
+	while (i > 0) {
+		--i;
+		ib_dma_unmap_single(ctx->ib_dev, tun_qp->ring[i].map,
+				    rx_buf_size, DMA_FROM_DEVICE);
+		kfree(tun_qp->ring[i].addr);
+	}
+	kfree(tun_qp->ring);
+	tun_qp->ring = NULL;
+	return -ENOMEM;
+}
+
+static void mlx4_ib_free_pv_qp_bufs(struct mlx4_ib_demux_pv_ctx *ctx,
+				     enum ib_qp_type qp_type, int is_tun)
+{
+	int i;
+	struct mlx4_ib_demux_pv_qp *tun_qp;
+	int rx_buf_size, tx_buf_size;
+
+	if (qp_type > IB_QPT_GSI)
+		return;
+
+	tun_qp = &ctx->qp[qp_type];
+	if (is_tun) {
+		rx_buf_size = sizeof (struct mlx4_tunnel_mad);
+		tx_buf_size = sizeof (struct mlx4_rcv_tunnel_mad);
+	} else {
+		rx_buf_size = sizeof (struct mlx4_mad_rcv_buf);
+		tx_buf_size = sizeof (struct mlx4_mad_snd_buf);
+	}
+
+
+	for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) {
+		ib_dma_unmap_single(ctx->ib_dev, tun_qp->ring[i].map,
+				    rx_buf_size, DMA_FROM_DEVICE);
+		kfree(tun_qp->ring[i].addr);
+	}
+
+	for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) {
+		ib_dma_unmap_single(ctx->ib_dev, tun_qp->tx_ring[i].buf.map,
+				    tx_buf_size, DMA_TO_DEVICE);
+		kfree(tun_qp->tx_ring[i].buf.addr);
+		if (tun_qp->tx_ring[i].ah)
+			ib_destroy_ah(tun_qp->tx_ring[i].ah);
+	}
+	kfree(tun_qp->tx_ring);
+	kfree(tun_qp->ring);
+}
+
+static void mlx4_ib_tunnel_comp_worker(struct work_struct *work)
+{
+	struct mlx4_ib_demux_pv_ctx *ctx;
+	struct mlx4_ib_demux_pv_qp *tun_qp;
+	struct ib_wc wc;
+	int ret;
+	ctx = container_of(work, struct mlx4_ib_demux_pv_ctx, work);
+	ib_req_notify_cq(ctx->cq, IB_CQ_NEXT_COMP);
+
+	while (ib_poll_cq(ctx->cq, 1, &wc) == 1) {
+		tun_qp = &ctx->qp[MLX4_TUN_WRID_QPN(wc.wr_id)];
+		if (wc.status == IB_WC_SUCCESS) {
+			switch (wc.opcode) {
+			case IB_WC_RECV:
+				mlx4_ib_multiplex_mad(ctx, &wc);
+				ret = mlx4_ib_post_pv_qp_buf(ctx, tun_qp,
+							     wc.wr_id &
+							     (MLX4_NUM_TUNNEL_BUFS - 1));
+				if (ret)
+					pr_err("Failed reposting tunnel "
+					       "buf:%lld\n", wc.wr_id);
+				break;
+			case IB_WC_SEND:
+				pr_debug("received tunnel send completion:"
+					 "wrid=0x%llx, status=0x%x\n",
+					 wc.wr_id, wc.status);
+				ib_destroy_ah(tun_qp->tx_ring[wc.wr_id &
+					      (MLX4_NUM_TUNNEL_BUFS - 1)].ah);
+				tun_qp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah
+					= NULL;
+				spin_lock(&tun_qp->tx_lock);
+				tun_qp->tx_ix_tail++;
+				spin_unlock(&tun_qp->tx_lock);
+
+				break;
+			default:
+				break;
+			}
+		} else  {
+			pr_debug("mlx4_ib: completion error in tunnel: %d."
+				 " status = %d, wrid = 0x%llx\n",
+				 ctx->slave, wc.status, wc.wr_id);
+			if (!MLX4_TUN_IS_RECV(wc.wr_id)) {
+				ib_destroy_ah(tun_qp->tx_ring[wc.wr_id &
+					      (MLX4_NUM_TUNNEL_BUFS - 1)].ah);
+				tun_qp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah
+					= NULL;
+				spin_lock(&tun_qp->tx_lock);
+				tun_qp->tx_ix_tail++;
+				spin_unlock(&tun_qp->tx_lock);
+			}
+		}
+	}
+}
+
+static void pv_qp_event_handler(struct ib_event *event, void *qp_context)
+{
+	struct mlx4_ib_demux_pv_ctx *sqp = qp_context;
+
+	/* It's worse than that! He's dead, Jim! */
+	pr_err("Fatal error (%d) on a MAD QP on port %d\n",
+	       event->event, sqp->port);
+}
+
+static int create_pv_sqp(struct mlx4_ib_demux_pv_ctx *ctx,
+			    enum ib_qp_type qp_type, int create_tun)
+{
+	int i, ret;
+	struct mlx4_ib_demux_pv_qp *tun_qp;
+	struct mlx4_ib_qp_tunnel_init_attr qp_init_attr;
+	struct ib_qp_attr attr;
+	int qp_attr_mask_INIT;
+
+	if (qp_type > IB_QPT_GSI)
+		return -EINVAL;
+
+	tun_qp = &ctx->qp[qp_type];
+
+	memset(&qp_init_attr, 0, sizeof qp_init_attr);
+	qp_init_attr.init_attr.send_cq = ctx->cq;
+	qp_init_attr.init_attr.recv_cq = ctx->cq;
+	qp_init_attr.init_attr.sq_sig_type = IB_SIGNAL_ALL_WR;
+	qp_init_attr.init_attr.cap.max_send_wr = MLX4_NUM_TUNNEL_BUFS;
+	qp_init_attr.init_attr.cap.max_recv_wr = MLX4_NUM_TUNNEL_BUFS;
+	qp_init_attr.init_attr.cap.max_send_sge = 1;
+	qp_init_attr.init_attr.cap.max_recv_sge = 1;
+	if (create_tun) {
+		qp_init_attr.init_attr.qp_type = IB_QPT_UD;
+		qp_init_attr.init_attr.create_flags = MLX4_IB_SRIOV_TUNNEL_QP;
+		qp_init_attr.port = ctx->port;
+		qp_init_attr.slave = ctx->slave;
+		qp_init_attr.proxy_qp_type = qp_type;
+		qp_attr_mask_INIT = IB_QP_STATE | IB_QP_PKEY_INDEX |
+			   IB_QP_QKEY | IB_QP_PORT;
+	} else {
+		qp_init_attr.init_attr.qp_type = qp_type;
+		qp_init_attr.init_attr.create_flags = MLX4_IB_SRIOV_SQP;
+		qp_attr_mask_INIT = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_QKEY;
+	}
+	qp_init_attr.init_attr.port_num = ctx->port;
+	qp_init_attr.init_attr.qp_context = ctx;
+	qp_init_attr.init_attr.event_handler = pv_qp_event_handler;
+	tun_qp->qp = ib_create_qp(ctx->pd, &qp_init_attr.init_attr);
+	if (IS_ERR(tun_qp->qp)) {
+		ret = PTR_ERR(tun_qp->qp);
+		tun_qp->qp = NULL;
+		pr_err("Couldn't create %s QP (%d)\n",
+		       create_tun ? "tunnel" : "special", ret);
+		return ret;
+	}
+
+	memset(&attr, 0, sizeof attr);
+	attr.qp_state = IB_QPS_INIT;
+	ret = 0;
+	if (create_tun)
+		ret = find_slave_port_pkey_ix(to_mdev(ctx->ib_dev), ctx->slave,
+					      ctx->port, IB_DEFAULT_PKEY_FULL,
+					      &attr.pkey_index);
+	if (ret || !create_tun)
+		attr.pkey_index =
+			to_mdev(ctx->ib_dev)->pkeys.virt2phys_pkey[ctx->slave][ctx->port - 1][0];
+	attr.qkey = IB_QP1_QKEY;
+	attr.port_num = ctx->port;
+	ret = ib_modify_qp(tun_qp->qp, &attr, qp_attr_mask_INIT);
+	if (ret) {
+		pr_err("Couldn't change %s qp state to INIT (%d)\n",
+		       create_tun ? "tunnel" : "special", ret);
+		goto err_qp;
+	}
+	attr.qp_state = IB_QPS_RTR;
+	ret = ib_modify_qp(tun_qp->qp, &attr, IB_QP_STATE);
+	if (ret) {
+		pr_err("Couldn't change %s qp state to RTR (%d)\n",
+		       create_tun ? "tunnel" : "special", ret);
+		goto err_qp;
+	}
+	attr.qp_state = IB_QPS_RTS;
+	attr.sq_psn = 0;
+	ret = ib_modify_qp(tun_qp->qp, &attr, IB_QP_STATE | IB_QP_SQ_PSN);
+	if (ret) {
+		pr_err("Couldn't change %s qp state to RTS (%d)\n",
+		       create_tun ? "tunnel" : "special", ret);
+		goto err_qp;
+	}
+
+	for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) {
+		ret = mlx4_ib_post_pv_qp_buf(ctx, tun_qp, i);
+		if (ret) {
+			pr_err(" mlx4_ib_post_pv_buf error"
+			       " (err = %d, i = %d)\n", ret, i);
+			goto err_qp;
+		}
+	}
+	return 0;
+
+err_qp:
+	ib_destroy_qp(tun_qp->qp);
+	tun_qp->qp = NULL;
+	return ret;
+}
+
+/*
+ * IB MAD completion callback for real SQPs
+ */
+static void mlx4_ib_sqp_comp_worker(struct work_struct *work)
+{
+	struct mlx4_ib_demux_pv_ctx *ctx;
+	struct mlx4_ib_demux_pv_qp *sqp;
+	struct ib_wc wc;
+	struct ib_grh *grh;
+	struct ib_mad *mad;
+
+	ctx = container_of(work, struct mlx4_ib_demux_pv_ctx, work);
+	ib_req_notify_cq(ctx->cq, IB_CQ_NEXT_COMP);
+
+	while (mlx4_ib_poll_cq(ctx->cq, 1, &wc) == 1) {
+		sqp = &ctx->qp[MLX4_TUN_WRID_QPN(wc.wr_id)];
+		if (wc.status == IB_WC_SUCCESS) {
+			switch (wc.opcode) {
+			case IB_WC_SEND:
+				ib_destroy_ah(sqp->tx_ring[wc.wr_id &
+					      (MLX4_NUM_TUNNEL_BUFS - 1)].ah);
+				sqp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah
+					= NULL;
+				spin_lock(&sqp->tx_lock);
+				sqp->tx_ix_tail++;
+				spin_unlock(&sqp->tx_lock);
+				break;
+			case IB_WC_RECV:
+				mad = (struct ib_mad *) &(((struct mlx4_mad_rcv_buf *)
+						(sqp->ring[wc.wr_id &
+						(MLX4_NUM_TUNNEL_BUFS - 1)].addr))->payload);
+				grh = &(((struct mlx4_mad_rcv_buf *)
+						(sqp->ring[wc.wr_id &
+						(MLX4_NUM_TUNNEL_BUFS - 1)].addr))->grh);
+				mlx4_ib_demux_mad(ctx->ib_dev, ctx->port, &wc, grh, mad);
+				if (mlx4_ib_post_pv_qp_buf(ctx, sqp, wc.wr_id &
+							   (MLX4_NUM_TUNNEL_BUFS - 1)))
+					pr_err("Failed reposting SQP "
+					       "buf:%lld\n", wc.wr_id);
+				break;
+			default:
+				BUG_ON(1);
+				break;
+			}
+		} else  {
+			pr_debug("mlx4_ib: completion error in tunnel: %d."
+				 " status = %d, wrid = 0x%llx\n",
+				 ctx->slave, wc.status, wc.wr_id);
+			if (!MLX4_TUN_IS_RECV(wc.wr_id)) {
+				ib_destroy_ah(sqp->tx_ring[wc.wr_id &
+					      (MLX4_NUM_TUNNEL_BUFS - 1)].ah);
+				sqp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah
+					= NULL;
+				spin_lock(&sqp->tx_lock);
+				sqp->tx_ix_tail++;
+				spin_unlock(&sqp->tx_lock);
+			}
+		}
+	}
+}
+
+static int alloc_pv_object(struct mlx4_ib_dev *dev, int slave, int port,
+			       struct mlx4_ib_demux_pv_ctx **ret_ctx)
+{
+	struct mlx4_ib_demux_pv_ctx *ctx;
+
+	*ret_ctx = NULL;
+	ctx = kzalloc(sizeof (struct mlx4_ib_demux_pv_ctx), GFP_KERNEL);
+	if (!ctx) {
+		pr_err("failed allocating pv resource context "
+		       "for port %d, slave %d\n", port, slave);
+		return -ENOMEM;
+	}
+
+	ctx->ib_dev = &dev->ib_dev;
+	ctx->port = port;
+	ctx->slave = slave;
+	*ret_ctx = ctx;
+	return 0;
+}
+
+static void free_pv_object(struct mlx4_ib_dev *dev, int slave, int port)
+{
+	if (dev->sriov.demux[port - 1].tun[slave]) {
+		kfree(dev->sriov.demux[port - 1].tun[slave]);
+		dev->sriov.demux[port - 1].tun[slave] = NULL;
+	}
+}
+
+static int create_pv_resources(struct ib_device *ibdev, int slave, int port,
+			       int create_tun, struct mlx4_ib_demux_pv_ctx *ctx)
+{
+	int ret, cq_size;
+
+	if (ctx->state != DEMUX_PV_STATE_DOWN)
+		return -EEXIST;
+
+	ctx->state = DEMUX_PV_STATE_STARTING;
+	/* have QP0 only if link layer is IB */
+	if (rdma_port_get_link_layer(ibdev, ctx->port) ==
+	    IB_LINK_LAYER_INFINIBAND)
+		ctx->has_smi = 1;
+
+	if (ctx->has_smi) {
+		ret = mlx4_ib_alloc_pv_bufs(ctx, IB_QPT_SMI, create_tun);
+		if (ret) {
+			pr_err("Failed allocating qp0 tunnel bufs (%d)\n", ret);
+			goto err_out;
+		}
+	}
+
+	ret = mlx4_ib_alloc_pv_bufs(ctx, IB_QPT_GSI, create_tun);
+	if (ret) {
+		pr_err("Failed allocating qp1 tunnel bufs (%d)\n", ret);
+		goto err_out_qp0;
+	}
+
+	cq_size = 2 * MLX4_NUM_TUNNEL_BUFS;
+	if (ctx->has_smi)
+		cq_size *= 2;
+
+	ctx->cq = ib_create_cq(ctx->ib_dev, mlx4_ib_tunnel_comp_handler,
+			       NULL, ctx, cq_size, 0);
+	if (IS_ERR(ctx->cq)) {
+		ret = PTR_ERR(ctx->cq);
+		pr_err("Couldn't create tunnel CQ (%d)\n", ret);
+		goto err_buf;
+	}
+
+	ctx->pd = ib_alloc_pd(ctx->ib_dev);
+	if (IS_ERR(ctx->pd)) {
+		ret = PTR_ERR(ctx->pd);
+		pr_err("Couldn't create tunnel PD (%d)\n", ret);
+		goto err_cq;
+	}
+
+	ctx->mr = ib_get_dma_mr(ctx->pd, IB_ACCESS_LOCAL_WRITE);
+	if (IS_ERR(ctx->mr)) {
+		ret = PTR_ERR(ctx->mr);
+		pr_err("Couldn't get tunnel DMA MR (%d)\n", ret);
+		goto err_pd;
+	}
+
+	if (ctx->has_smi) {
+		ret = create_pv_sqp(ctx, IB_QPT_SMI, create_tun);
+		if (ret) {
+			pr_err("Couldn't create %s QP0 (%d)\n",
+			       create_tun ? "tunnel for" : "",  ret);
+			goto err_mr;
+		}
+	}
+
+	ret = create_pv_sqp(ctx, IB_QPT_GSI, create_tun);
+	if (ret) {
+		pr_err("Couldn't create %s QP1 (%d)\n",
+		       create_tun ? "tunnel for" : "",  ret);
+		goto err_qp0;
+	}
+
+	if (create_tun)
+		INIT_WORK(&ctx->work, mlx4_ib_tunnel_comp_worker);
+	else
+		INIT_WORK(&ctx->work, mlx4_ib_sqp_comp_worker);
+
+	ctx->wq = to_mdev(ibdev)->sriov.demux[port - 1].wq;
+
+	ret = ib_req_notify_cq(ctx->cq, IB_CQ_NEXT_COMP);
+	if (ret) {
+		pr_err("Couldn't arm tunnel cq (%d)\n", ret);
+		goto err_wq;
+	}
+	ctx->state = DEMUX_PV_STATE_ACTIVE;
+	return 0;
+
+err_wq:
+	ctx->wq = NULL;
+	ib_destroy_qp(ctx->qp[1].qp);
+	ctx->qp[1].qp = NULL;
+
+
+err_qp0:
+	if (ctx->has_smi)
+		ib_destroy_qp(ctx->qp[0].qp);
+	ctx->qp[0].qp = NULL;
+
+err_mr:
+	ib_dereg_mr(ctx->mr);
+	ctx->mr = NULL;
+
+err_pd:
+	ib_dealloc_pd(ctx->pd);
+	ctx->pd = NULL;
+
+err_cq:
+	ib_destroy_cq(ctx->cq);
+	ctx->cq = NULL;
+
+err_buf:
+	mlx4_ib_free_pv_qp_bufs(ctx, IB_QPT_GSI, create_tun);
+
+err_out_qp0:
+	if (ctx->has_smi)
+		mlx4_ib_free_pv_qp_bufs(ctx, IB_QPT_SMI, create_tun);
+err_out:
+	ctx->state = DEMUX_PV_STATE_DOWN;
+	return ret;
+}
+
+static void destroy_pv_resources(struct mlx4_ib_dev *dev, int slave, int port,
+				 struct mlx4_ib_demux_pv_ctx *ctx, int flush)
+{
+	if (!ctx)
+		return;
+	if (ctx->state > DEMUX_PV_STATE_DOWN) {
+		ctx->state = DEMUX_PV_STATE_DOWNING;
+		if (flush)
+			flush_workqueue(ctx->wq);
+		if (ctx->has_smi) {
+			ib_destroy_qp(ctx->qp[0].qp);
+			ctx->qp[0].qp = NULL;
+			mlx4_ib_free_pv_qp_bufs(ctx, IB_QPT_SMI, 1);
+		}
+		ib_destroy_qp(ctx->qp[1].qp);
+		ctx->qp[1].qp = NULL;
+		mlx4_ib_free_pv_qp_bufs(ctx, IB_QPT_GSI, 1);
+		ib_dereg_mr(ctx->mr);
+		ctx->mr = NULL;
+		ib_dealloc_pd(ctx->pd);
+		ctx->pd = NULL;
+		ib_destroy_cq(ctx->cq);
+		ctx->cq = NULL;
+		ctx->state = DEMUX_PV_STATE_DOWN;
+	}
+}
+
+static int mlx4_ib_tunnels_update(struct mlx4_ib_dev *dev, int slave,
+				  int port, int do_init)
+{
+	int ret = 0;
+
+	if (!do_init) {
+		clean_vf_mcast(&dev->sriov.demux[port - 1], slave);
+		/* for master, destroy real sqp resources */
+		if (slave == mlx4_master_func_num(dev->dev))
+			destroy_pv_resources(dev, slave, port,
+					     dev->sriov.sqps[port - 1], 1);
+		/* destroy the tunnel qp resources */
+		destroy_pv_resources(dev, slave, port,
+				     dev->sriov.demux[port - 1].tun[slave], 1);
+		return 0;
+	}
+
+	/* create the tunnel qp resources */
+	ret = create_pv_resources(&dev->ib_dev, slave, port, 1,
+				  dev->sriov.demux[port - 1].tun[slave]);
+
+	/* for master, create the real sqp resources */
+	if (!ret && slave == mlx4_master_func_num(dev->dev))
+		ret = create_pv_resources(&dev->ib_dev, slave, port, 0,
+					  dev->sriov.sqps[port - 1]);
+	return ret;
+}
+
+void mlx4_ib_tunnels_update_work(struct work_struct *work)
+{
+	struct mlx4_ib_demux_work *dmxw;
+
+	dmxw = container_of(work, struct mlx4_ib_demux_work, work);
+	mlx4_ib_tunnels_update(dmxw->dev, dmxw->slave, (int) dmxw->port,
+			       dmxw->do_init);
+	kfree(dmxw);
+	return;
+}
+
+static int mlx4_ib_alloc_demux_ctx(struct mlx4_ib_dev *dev,
+				       struct mlx4_ib_demux_ctx *ctx,
+				       int port)
+{
+	char name[12];
+	int ret = 0;
+	int i;
+
+	ctx->tun = kcalloc(dev->dev->caps.sqp_demux,
+			   sizeof (struct mlx4_ib_demux_pv_ctx *), GFP_KERNEL);
+	if (!ctx->tun)
+		return -ENOMEM;
+
+	ctx->dev = dev;
+	ctx->port = port;
+	ctx->ib_dev = &dev->ib_dev;
+
+	for (i = 0;
+	     i < min(dev->dev->caps.sqp_demux, (u16)(dev->dev->num_vfs + 1));
+	     i++) {
+		struct mlx4_active_ports actv_ports =
+			mlx4_get_active_ports(dev->dev, i);
+
+		if (!test_bit(port - 1, actv_ports.ports))
+			continue;
+
+		ret = alloc_pv_object(dev, i, port, &ctx->tun[i]);
+		if (ret) {
+			ret = -ENOMEM;
+			goto err_mcg;
+		}
+	}
+
+	ret = mlx4_ib_mcg_port_init(ctx);
+	if (ret) {
+		pr_err("Failed initializing mcg para-virt (%d)\n", ret);
+		goto err_mcg;
+	}
+
+	snprintf(name, sizeof name, "mlx4_ibt%d", port);
+	ctx->wq = create_singlethread_workqueue(name);
+	if (!ctx->wq) {
+		pr_err("Failed to create tunnelling WQ for port %d\n", port);
+		ret = -ENOMEM;
+		goto err_wq;
+	}
+
+	snprintf(name, sizeof name, "mlx4_ibud%d", port);
+	ctx->ud_wq = create_singlethread_workqueue(name);
+	if (!ctx->ud_wq) {
+		pr_err("Failed to create up/down WQ for port %d\n", port);
+		ret = -ENOMEM;
+		goto err_udwq;
+	}
+
+	return 0;
+
+err_udwq:
+	destroy_workqueue(ctx->wq);
+	ctx->wq = NULL;
+
+err_wq:
+	mlx4_ib_mcg_port_cleanup(ctx, 1);
+err_mcg:
+	for (i = 0; i < dev->dev->caps.sqp_demux; i++)
+		free_pv_object(dev, i, port);
+	kfree(ctx->tun);
+	ctx->tun = NULL;
+	return ret;
+}
+
+static void mlx4_ib_free_sqp_ctx(struct mlx4_ib_demux_pv_ctx *sqp_ctx)
+{
+	if (sqp_ctx->state > DEMUX_PV_STATE_DOWN) {
+		sqp_ctx->state = DEMUX_PV_STATE_DOWNING;
+		flush_workqueue(sqp_ctx->wq);
+		if (sqp_ctx->has_smi) {
+			ib_destroy_qp(sqp_ctx->qp[0].qp);
+			sqp_ctx->qp[0].qp = NULL;
+			mlx4_ib_free_pv_qp_bufs(sqp_ctx, IB_QPT_SMI, 0);
+		}
+		ib_destroy_qp(sqp_ctx->qp[1].qp);
+		sqp_ctx->qp[1].qp = NULL;
+		mlx4_ib_free_pv_qp_bufs(sqp_ctx, IB_QPT_GSI, 0);
+		ib_dereg_mr(sqp_ctx->mr);
+		sqp_ctx->mr = NULL;
+		ib_dealloc_pd(sqp_ctx->pd);
+		sqp_ctx->pd = NULL;
+		ib_destroy_cq(sqp_ctx->cq);
+		sqp_ctx->cq = NULL;
+		sqp_ctx->state = DEMUX_PV_STATE_DOWN;
+	}
+}
+
+static void mlx4_ib_free_demux_ctx(struct mlx4_ib_demux_ctx *ctx)
+{
+	int i;
+	if (ctx) {
+		struct mlx4_ib_dev *dev = to_mdev(ctx->ib_dev);
+		mlx4_ib_mcg_port_cleanup(ctx, 1);
+		for (i = 0; i < dev->dev->caps.sqp_demux; i++) {
+			if (!ctx->tun[i])
+				continue;
+			if (ctx->tun[i]->state > DEMUX_PV_STATE_DOWN)
+				ctx->tun[i]->state = DEMUX_PV_STATE_DOWNING;
+		}
+		flush_workqueue(ctx->wq);
+		for (i = 0; i < dev->dev->caps.sqp_demux; i++) {
+			destroy_pv_resources(dev, i, ctx->port, ctx->tun[i], 0);
+			free_pv_object(dev, i, ctx->port);
+		}
+		kfree(ctx->tun);
+		destroy_workqueue(ctx->ud_wq);
+		destroy_workqueue(ctx->wq);
+	}
+}
+
+static void mlx4_ib_master_tunnels(struct mlx4_ib_dev *dev, int do_init)
+{
+	int i;
+
+	if (!mlx4_is_master(dev->dev))
+		return;
+	/* initialize or tear down tunnel QPs for the master */
+	for (i = 0; i < dev->dev->caps.num_ports; i++)
+		mlx4_ib_tunnels_update(dev, mlx4_master_func_num(dev->dev), i + 1, do_init);
+	return;
+}
+
+int mlx4_ib_init_sriov(struct mlx4_ib_dev *dev)
+{
+	int i = 0;
+	int err;
+
+	if (!mlx4_is_mfunc(dev->dev))
+		return 0;
+
+	dev->sriov.is_going_down = 0;
+	spin_lock_init(&dev->sriov.going_down_lock);
+	mlx4_ib_cm_paravirt_init(dev);
+
+	mlx4_ib_warn(&dev->ib_dev, "multi-function enabled\n");
+
+	if (mlx4_is_slave(dev->dev)) {
+		mlx4_ib_warn(&dev->ib_dev, "operating in qp1 tunnel mode\n");
+		return 0;
+	}
+
+	for (i = 0; i < dev->dev->caps.sqp_demux; i++) {
+		if (i == mlx4_master_func_num(dev->dev))
+			mlx4_put_slave_node_guid(dev->dev, i, dev->ib_dev.node_guid);
+		else
+			mlx4_put_slave_node_guid(dev->dev, i, mlx4_ib_gen_node_guid());
+	}
+
+	err = mlx4_ib_init_alias_guid_service(dev);
+	if (err) {
+		mlx4_ib_warn(&dev->ib_dev, "Failed init alias guid process.\n");
+		goto paravirt_err;
+	}
+	err = mlx4_ib_device_register_sysfs(dev);
+	if (err) {
+		mlx4_ib_warn(&dev->ib_dev, "Failed to register sysfs\n");
+		goto sysfs_err;
+	}
+
+	mlx4_ib_warn(&dev->ib_dev, "initializing demux service for %d qp1 clients\n",
+		     dev->dev->caps.sqp_demux);
+	for (i = 0; i < dev->num_ports; i++) {
+		union ib_gid gid;
+		err = __mlx4_ib_query_gid(&dev->ib_dev, i + 1, 0, &gid, 1);
+		if (err)
+			goto demux_err;
+		dev->sriov.demux[i].guid_cache[0] = gid.global.interface_id;
+		err = alloc_pv_object(dev, mlx4_master_func_num(dev->dev), i + 1,
+				      &dev->sriov.sqps[i]);
+		if (err)
+			goto demux_err;
+		err = mlx4_ib_alloc_demux_ctx(dev, &dev->sriov.demux[i], i + 1);
+		if (err)
+			goto free_pv;
+	}
+	mlx4_ib_master_tunnels(dev, 1);
+	return 0;
+
+free_pv:
+	free_pv_object(dev, mlx4_master_func_num(dev->dev), i + 1);
+demux_err:
+	while (--i >= 0) {
+		free_pv_object(dev, mlx4_master_func_num(dev->dev), i + 1);
+		mlx4_ib_free_demux_ctx(&dev->sriov.demux[i]);
+	}
+	mlx4_ib_device_unregister_sysfs(dev);
+
+sysfs_err:
+	mlx4_ib_destroy_alias_guid_service(dev);
+
+paravirt_err:
+	mlx4_ib_cm_paravirt_clean(dev, -1);
+
+	return err;
+}
+
+void mlx4_ib_close_sriov(struct mlx4_ib_dev *dev)
+{
+	int i;
+	unsigned long flags;
+
+	if (!mlx4_is_mfunc(dev->dev))
+		return;
+
+	spin_lock_irqsave(&dev->sriov.going_down_lock, flags);
+	dev->sriov.is_going_down = 1;
+	spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags);
+	if (mlx4_is_master(dev->dev)) {
+		for (i = 0; i < dev->num_ports; i++) {
+			flush_workqueue(dev->sriov.demux[i].ud_wq);
+			mlx4_ib_free_sqp_ctx(dev->sriov.sqps[i]);
+			kfree(dev->sriov.sqps[i]);
+			dev->sriov.sqps[i] = NULL;
+			mlx4_ib_free_demux_ctx(&dev->sriov.demux[i]);
+		}
+
+		mlx4_ib_cm_paravirt_clean(dev, -1);
+		mlx4_ib_destroy_alias_guid_service(dev);
+		mlx4_ib_device_unregister_sysfs(dev);
+	}
+}
diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
new file mode 100644
index 0000000..8b72cf3
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -0,0 +1,2653 @@
+/*
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/errno.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/if_vlan.h>
+#include <net/ipv6.h>
+#include <net/addrconf.h>
+
+#include <rdma/ib_smi.h>
+#include <rdma/ib_user_verbs.h>
+#include <rdma/ib_addr.h>
+
+#include <linux/mlx4/driver.h>
+#include <linux/mlx4/cmd.h>
+#include <linux/mlx4/qp.h>
+
+#include "mlx4_ib.h"
+#include "user.h"
+
+#define DRV_NAME	MLX4_IB_DRV_NAME
+#define DRV_VERSION	"2.2-1"
+#define DRV_RELDATE	"Feb 2014"
+
+#define MLX4_IB_FLOW_MAX_PRIO 0xFFF
+#define MLX4_IB_FLOW_QPN_MASK 0xFFFFFF
+#define MLX4_IB_CARD_REV_A0   0xA0
+
+MODULE_AUTHOR("Roland Dreier");
+MODULE_DESCRIPTION("Mellanox ConnectX HCA InfiniBand driver");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_VERSION(DRV_VERSION);
+
+int mlx4_ib_sm_guid_assign = 1;
+module_param_named(sm_guid_assign, mlx4_ib_sm_guid_assign, int, 0444);
+MODULE_PARM_DESC(sm_guid_assign, "Enable SM alias_GUID assignment if sm_guid_assign > 0 (Default: 1)");
+
+static const char mlx4_ib_version[] =
+	DRV_NAME ": Mellanox ConnectX InfiniBand driver v"
+	DRV_VERSION " (" DRV_RELDATE ")\n";
+
+struct update_gid_work {
+	struct work_struct	work;
+	union ib_gid		gids[128];
+	struct mlx4_ib_dev     *dev;
+	int			port;
+};
+
+static void do_slave_init(struct mlx4_ib_dev *ibdev, int slave, int do_init);
+
+static struct workqueue_struct *wq;
+
+static void init_query_mad(struct ib_smp *mad)
+{
+	mad->base_version  = 1;
+	mad->mgmt_class    = IB_MGMT_CLASS_SUBN_LID_ROUTED;
+	mad->class_version = 1;
+	mad->method	   = IB_MGMT_METHOD_GET;
+}
+
+static union ib_gid zgid;
+
+static int check_flow_steering_support(struct mlx4_dev *dev)
+{
+	int eth_num_ports = 0;
+	int ib_num_ports = 0;
+
+	int dmfs = dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED;
+
+	if (dmfs) {
+		int i;
+		mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH)
+			eth_num_ports++;
+		mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB)
+			ib_num_ports++;
+		dmfs &= (!ib_num_ports ||
+			 (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_DMFS_IPOIB)) &&
+			(!eth_num_ports ||
+			 (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_FS_EN));
+		if (ib_num_ports && mlx4_is_mfunc(dev)) {
+			pr_warn("Device managed flow steering is unavailable for IB port in multifunction env.\n");
+			dmfs = 0;
+		}
+	}
+	return dmfs;
+}
+
+static int num_ib_ports(struct mlx4_dev *dev)
+{
+	int ib_ports = 0;
+	int i;
+
+	mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB)
+		ib_ports++;
+
+	return ib_ports;
+}
+
+static int mlx4_ib_query_device(struct ib_device *ibdev,
+				struct ib_device_attr *props)
+{
+	struct mlx4_ib_dev *dev = to_mdev(ibdev);
+	struct ib_smp *in_mad  = NULL;
+	struct ib_smp *out_mad = NULL;
+	int err = -ENOMEM;
+	int have_ib_ports;
+
+	in_mad  = kzalloc(sizeof *in_mad, GFP_KERNEL);
+	out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+	if (!in_mad || !out_mad)
+		goto out;
+
+	init_query_mad(in_mad);
+	in_mad->attr_id = IB_SMP_ATTR_NODE_INFO;
+
+	err = mlx4_MAD_IFC(to_mdev(ibdev), MLX4_MAD_IFC_IGNORE_KEYS,
+			   1, NULL, NULL, in_mad, out_mad);
+	if (err)
+		goto out;
+
+	memset(props, 0, sizeof *props);
+
+	have_ib_ports = num_ib_ports(dev->dev);
+
+	props->fw_ver = dev->dev->caps.fw_ver;
+	props->device_cap_flags    = IB_DEVICE_CHANGE_PHY_PORT |
+		IB_DEVICE_PORT_ACTIVE_EVENT		|
+		IB_DEVICE_SYS_IMAGE_GUID		|
+		IB_DEVICE_RC_RNR_NAK_GEN		|
+		IB_DEVICE_BLOCK_MULTICAST_LOOPBACK;
+	if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BAD_PKEY_CNTR)
+		props->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR;
+	if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BAD_QKEY_CNTR)
+		props->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR;
+	if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_APM && have_ib_ports)
+		props->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG;
+	if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_UD_AV_PORT)
+		props->device_cap_flags |= IB_DEVICE_UD_AV_PORT_ENFORCE;
+	if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_IPOIB_CSUM)
+		props->device_cap_flags |= IB_DEVICE_UD_IP_CSUM;
+	if (dev->dev->caps.max_gso_sz &&
+	    (dev->dev->rev_id != MLX4_IB_CARD_REV_A0) &&
+	    (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BLH))
+		props->device_cap_flags |= IB_DEVICE_UD_TSO;
+	if (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_RESERVED_LKEY)
+		props->device_cap_flags |= IB_DEVICE_LOCAL_DMA_LKEY;
+	if ((dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_LOCAL_INV) &&
+	    (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_REMOTE_INV) &&
+	    (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_FAST_REG_WR))
+		props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS;
+	if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC)
+		props->device_cap_flags |= IB_DEVICE_XRC;
+	if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_MEM_WINDOW)
+		props->device_cap_flags |= IB_DEVICE_MEM_WINDOW;
+	if (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_TYPE_2_WIN) {
+		if (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_WIN_TYPE_2B)
+			props->device_cap_flags |= IB_DEVICE_MEM_WINDOW_TYPE_2B;
+		else
+			props->device_cap_flags |= IB_DEVICE_MEM_WINDOW_TYPE_2A;
+	if (dev->steering_support ==  MLX4_STEERING_MODE_DEVICE_MANAGED)
+		props->device_cap_flags |= IB_DEVICE_MANAGED_FLOW_STEERING;
+	}
+
+	props->vendor_id	   = be32_to_cpup((__be32 *) (out_mad->data + 36)) &
+		0xffffff;
+	props->vendor_part_id	   = dev->dev->pdev->device;
+	props->hw_ver		   = be32_to_cpup((__be32 *) (out_mad->data + 32));
+	memcpy(&props->sys_image_guid, out_mad->data +	4, 8);
+
+	props->max_mr_size	   = ~0ull;
+	props->page_size_cap	   = dev->dev->caps.page_size_cap;
+	props->max_qp		   = dev->dev->quotas.qp;
+	props->max_qp_wr	   = dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE;
+	props->max_sge		   = min(dev->dev->caps.max_sq_sg,
+					 dev->dev->caps.max_rq_sg);
+	props->max_cq		   = dev->dev->quotas.cq;
+	props->max_cqe		   = dev->dev->caps.max_cqes;
+	props->max_mr		   = dev->dev->quotas.mpt;
+	props->max_pd		   = dev->dev->caps.num_pds - dev->dev->caps.reserved_pds;
+	props->max_qp_rd_atom	   = dev->dev->caps.max_qp_dest_rdma;
+	props->max_qp_init_rd_atom = dev->dev->caps.max_qp_init_rdma;
+	props->max_res_rd_atom	   = props->max_qp_rd_atom * props->max_qp;
+	props->max_srq		   = dev->dev->quotas.srq;
+	props->max_srq_wr	   = dev->dev->caps.max_srq_wqes - 1;
+	props->max_srq_sge	   = dev->dev->caps.max_srq_sge;
+	props->max_fast_reg_page_list_len = MLX4_MAX_FAST_REG_PAGES;
+	props->local_ca_ack_delay  = dev->dev->caps.local_ca_ack_delay;
+	props->atomic_cap	   = dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_ATOMIC ?
+		IB_ATOMIC_HCA : IB_ATOMIC_NONE;
+	props->masked_atomic_cap   = props->atomic_cap;
+	props->max_pkeys	   = dev->dev->caps.pkey_table_len[1];
+	props->max_mcast_grp	   = dev->dev->caps.num_mgms + dev->dev->caps.num_amgms;
+	props->max_mcast_qp_attach = dev->dev->caps.num_qp_per_mgm;
+	props->max_total_mcast_qp_attach = props->max_mcast_qp_attach *
+					   props->max_mcast_grp;
+	props->max_map_per_fmr = dev->dev->caps.max_fmr_maps;
+
+out:
+	kfree(in_mad);
+	kfree(out_mad);
+
+	return err;
+}
+
+static enum rdma_link_layer
+mlx4_ib_port_link_layer(struct ib_device *device, u8 port_num)
+{
+	struct mlx4_dev *dev = to_mdev(device)->dev;
+
+	return dev->caps.port_mask[port_num] == MLX4_PORT_TYPE_IB ?
+		IB_LINK_LAYER_INFINIBAND : IB_LINK_LAYER_ETHERNET;
+}
+
+static int ib_link_query_port(struct ib_device *ibdev, u8 port,
+			      struct ib_port_attr *props, int netw_view)
+{
+	struct ib_smp *in_mad  = NULL;
+	struct ib_smp *out_mad = NULL;
+	int ext_active_speed;
+	int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS;
+	int err = -ENOMEM;
+
+	in_mad  = kzalloc(sizeof *in_mad, GFP_KERNEL);
+	out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+	if (!in_mad || !out_mad)
+		goto out;
+
+	init_query_mad(in_mad);
+	in_mad->attr_id  = IB_SMP_ATTR_PORT_INFO;
+	in_mad->attr_mod = cpu_to_be32(port);
+
+	if (mlx4_is_mfunc(to_mdev(ibdev)->dev) && netw_view)
+		mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW;
+
+	err = mlx4_MAD_IFC(to_mdev(ibdev), mad_ifc_flags, port, NULL, NULL,
+				in_mad, out_mad);
+	if (err)
+		goto out;
+
+
+	props->lid		= be16_to_cpup((__be16 *) (out_mad->data + 16));
+	props->lmc		= out_mad->data[34] & 0x7;
+	props->sm_lid		= be16_to_cpup((__be16 *) (out_mad->data + 18));
+	props->sm_sl		= out_mad->data[36] & 0xf;
+	props->state		= out_mad->data[32] & 0xf;
+	props->phys_state	= out_mad->data[33] >> 4;
+	props->port_cap_flags	= be32_to_cpup((__be32 *) (out_mad->data + 20));
+	if (netw_view)
+		props->gid_tbl_len = out_mad->data[50];
+	else
+		props->gid_tbl_len = to_mdev(ibdev)->dev->caps.gid_table_len[port];
+	props->max_msg_sz	= to_mdev(ibdev)->dev->caps.max_msg_sz;
+	props->pkey_tbl_len	= to_mdev(ibdev)->dev->caps.pkey_table_len[port];
+	props->bad_pkey_cntr	= be16_to_cpup((__be16 *) (out_mad->data + 46));
+	props->qkey_viol_cntr	= be16_to_cpup((__be16 *) (out_mad->data + 48));
+	props->active_width	= out_mad->data[31] & 0xf;
+	props->active_speed	= out_mad->data[35] >> 4;
+	props->max_mtu		= out_mad->data[41] & 0xf;
+	props->active_mtu	= out_mad->data[36] >> 4;
+	props->subnet_timeout	= out_mad->data[51] & 0x1f;
+	props->max_vl_num	= out_mad->data[37] >> 4;
+	props->init_type_reply	= out_mad->data[41] >> 4;
+
+	/* Check if extended speeds (EDR/FDR/...) are supported */
+	if (props->port_cap_flags & IB_PORT_EXTENDED_SPEEDS_SUP) {
+		ext_active_speed = out_mad->data[62] >> 4;
+
+		switch (ext_active_speed) {
+		case 1:
+			props->active_speed = IB_SPEED_FDR;
+			break;
+		case 2:
+			props->active_speed = IB_SPEED_EDR;
+			break;
+		}
+	}
+
+	/* If reported active speed is QDR, check if is FDR-10 */
+	if (props->active_speed == IB_SPEED_QDR) {
+		init_query_mad(in_mad);
+		in_mad->attr_id = MLX4_ATTR_EXTENDED_PORT_INFO;
+		in_mad->attr_mod = cpu_to_be32(port);
+
+		err = mlx4_MAD_IFC(to_mdev(ibdev), mad_ifc_flags, port,
+				   NULL, NULL, in_mad, out_mad);
+		if (err)
+			goto out;
+
+		/* Checking LinkSpeedActive for FDR-10 */
+		if (out_mad->data[15] & 0x1)
+			props->active_speed = IB_SPEED_FDR10;
+	}
+
+	/* Avoid wrong speed value returned by FW if the IB link is down. */
+	if (props->state == IB_PORT_DOWN)
+		 props->active_speed = IB_SPEED_SDR;
+
+out:
+	kfree(in_mad);
+	kfree(out_mad);
+	return err;
+}
+
+static u8 state_to_phys_state(enum ib_port_state state)
+{
+	return state == IB_PORT_ACTIVE ? 5 : 3;
+}
+
+static int eth_link_query_port(struct ib_device *ibdev, u8 port,
+			       struct ib_port_attr *props, int netw_view)
+{
+
+	struct mlx4_ib_dev *mdev = to_mdev(ibdev);
+	struct mlx4_ib_iboe *iboe = &mdev->iboe;
+	struct net_device *ndev;
+	enum ib_mtu tmp;
+	struct mlx4_cmd_mailbox *mailbox;
+	int err = 0;
+
+	mailbox = mlx4_alloc_cmd_mailbox(mdev->dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	err = mlx4_cmd_box(mdev->dev, 0, mailbox->dma, port, 0,
+			   MLX4_CMD_QUERY_PORT, MLX4_CMD_TIME_CLASS_B,
+			   MLX4_CMD_WRAPPED);
+	if (err)
+		goto out;
+
+	props->active_width	=  (((u8 *)mailbox->buf)[5] == 0x40) ?
+						IB_WIDTH_4X : IB_WIDTH_1X;
+	props->active_speed	= IB_SPEED_QDR;
+	props->port_cap_flags	= IB_PORT_CM_SUP | IB_PORT_IP_BASED_GIDS;
+	props->gid_tbl_len	= mdev->dev->caps.gid_table_len[port];
+	props->max_msg_sz	= mdev->dev->caps.max_msg_sz;
+	props->pkey_tbl_len	= 1;
+	props->max_mtu		= IB_MTU_4096;
+	props->max_vl_num	= 2;
+	props->state		= IB_PORT_DOWN;
+	props->phys_state	= state_to_phys_state(props->state);
+	props->active_mtu	= IB_MTU_256;
+	spin_lock_bh(&iboe->lock);
+	ndev = iboe->netdevs[port - 1];
+	if (!ndev)
+		goto out_unlock;
+
+	tmp = iboe_get_mtu(ndev->mtu);
+	props->active_mtu = tmp ? min(props->max_mtu, tmp) : IB_MTU_256;
+
+	props->state		= (netif_running(ndev) && netif_carrier_ok(ndev)) ?
+					IB_PORT_ACTIVE : IB_PORT_DOWN;
+	props->phys_state	= state_to_phys_state(props->state);
+out_unlock:
+	spin_unlock_bh(&iboe->lock);
+out:
+	mlx4_free_cmd_mailbox(mdev->dev, mailbox);
+	return err;
+}
+
+int __mlx4_ib_query_port(struct ib_device *ibdev, u8 port,
+			 struct ib_port_attr *props, int netw_view)
+{
+	int err;
+
+	memset(props, 0, sizeof *props);
+
+	err = mlx4_ib_port_link_layer(ibdev, port) == IB_LINK_LAYER_INFINIBAND ?
+		ib_link_query_port(ibdev, port, props, netw_view) :
+				eth_link_query_port(ibdev, port, props, netw_view);
+
+	return err;
+}
+
+static int mlx4_ib_query_port(struct ib_device *ibdev, u8 port,
+			      struct ib_port_attr *props)
+{
+	/* returns host view */
+	return __mlx4_ib_query_port(ibdev, port, props, 0);
+}
+
+int __mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index,
+			union ib_gid *gid, int netw_view)
+{
+	struct ib_smp *in_mad  = NULL;
+	struct ib_smp *out_mad = NULL;
+	int err = -ENOMEM;
+	struct mlx4_ib_dev *dev = to_mdev(ibdev);
+	int clear = 0;
+	int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS;
+
+	in_mad  = kzalloc(sizeof *in_mad, GFP_KERNEL);
+	out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+	if (!in_mad || !out_mad)
+		goto out;
+
+	init_query_mad(in_mad);
+	in_mad->attr_id  = IB_SMP_ATTR_PORT_INFO;
+	in_mad->attr_mod = cpu_to_be32(port);
+
+	if (mlx4_is_mfunc(dev->dev) && netw_view)
+		mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW;
+
+	err = mlx4_MAD_IFC(dev, mad_ifc_flags, port, NULL, NULL, in_mad, out_mad);
+	if (err)
+		goto out;
+
+	memcpy(gid->raw, out_mad->data + 8, 8);
+
+	if (mlx4_is_mfunc(dev->dev) && !netw_view) {
+		if (index) {
+			/* For any index > 0, return the null guid */
+			err = 0;
+			clear = 1;
+			goto out;
+		}
+	}
+
+	init_query_mad(in_mad);
+	in_mad->attr_id  = IB_SMP_ATTR_GUID_INFO;
+	in_mad->attr_mod = cpu_to_be32(index / 8);
+
+	err = mlx4_MAD_IFC(dev, mad_ifc_flags, port,
+			   NULL, NULL, in_mad, out_mad);
+	if (err)
+		goto out;
+
+	memcpy(gid->raw + 8, out_mad->data + (index % 8) * 8, 8);
+
+out:
+	if (clear)
+		memset(gid->raw + 8, 0, 8);
+	kfree(in_mad);
+	kfree(out_mad);
+	return err;
+}
+
+static int iboe_query_gid(struct ib_device *ibdev, u8 port, int index,
+			  union ib_gid *gid)
+{
+	struct mlx4_ib_dev *dev = to_mdev(ibdev);
+
+	*gid = dev->iboe.gid_table[port - 1][index];
+
+	return 0;
+}
+
+static int mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index,
+			     union ib_gid *gid)
+{
+	if (rdma_port_get_link_layer(ibdev, port) == IB_LINK_LAYER_INFINIBAND)
+		return __mlx4_ib_query_gid(ibdev, port, index, gid, 0);
+	else
+		return iboe_query_gid(ibdev, port, index, gid);
+}
+
+int __mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
+			 u16 *pkey, int netw_view)
+{
+	struct ib_smp *in_mad  = NULL;
+	struct ib_smp *out_mad = NULL;
+	int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS;
+	int err = -ENOMEM;
+
+	in_mad  = kzalloc(sizeof *in_mad, GFP_KERNEL);
+	out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+	if (!in_mad || !out_mad)
+		goto out;
+
+	init_query_mad(in_mad);
+	in_mad->attr_id  = IB_SMP_ATTR_PKEY_TABLE;
+	in_mad->attr_mod = cpu_to_be32(index / 32);
+
+	if (mlx4_is_mfunc(to_mdev(ibdev)->dev) && netw_view)
+		mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW;
+
+	err = mlx4_MAD_IFC(to_mdev(ibdev), mad_ifc_flags, port, NULL, NULL,
+			   in_mad, out_mad);
+	if (err)
+		goto out;
+
+	*pkey = be16_to_cpu(((__be16 *) out_mad->data)[index % 32]);
+
+out:
+	kfree(in_mad);
+	kfree(out_mad);
+	return err;
+}
+
+static int mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey)
+{
+	return __mlx4_ib_query_pkey(ibdev, port, index, pkey, 0);
+}
+
+static int mlx4_ib_modify_device(struct ib_device *ibdev, int mask,
+				 struct ib_device_modify *props)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	unsigned long flags;
+
+	if (mask & ~IB_DEVICE_MODIFY_NODE_DESC)
+		return -EOPNOTSUPP;
+
+	if (!(mask & IB_DEVICE_MODIFY_NODE_DESC))
+		return 0;
+
+	if (mlx4_is_slave(to_mdev(ibdev)->dev))
+		return -EOPNOTSUPP;
+
+	spin_lock_irqsave(&to_mdev(ibdev)->sm_lock, flags);
+	memcpy(ibdev->node_desc, props->node_desc, 64);
+	spin_unlock_irqrestore(&to_mdev(ibdev)->sm_lock, flags);
+
+	/*
+	 * If possible, pass node desc to FW, so it can generate
+	 * a 144 trap.  If cmd fails, just ignore.
+	 */
+	mailbox = mlx4_alloc_cmd_mailbox(to_mdev(ibdev)->dev);
+	if (IS_ERR(mailbox))
+		return 0;
+
+	memcpy(mailbox->buf, props->node_desc, 64);
+	mlx4_cmd(to_mdev(ibdev)->dev, mailbox->dma, 1, 0,
+		 MLX4_CMD_SET_NODE, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
+
+	mlx4_free_cmd_mailbox(to_mdev(ibdev)->dev, mailbox);
+
+	return 0;
+}
+
+static int mlx4_ib_SET_PORT(struct mlx4_ib_dev *dev, u8 port, int reset_qkey_viols,
+			    u32 cap_mask)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	int err;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev->dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	if (dev->dev->flags & MLX4_FLAG_OLD_PORT_CMDS) {
+		*(u8 *) mailbox->buf	     = !!reset_qkey_viols << 6;
+		((__be32 *) mailbox->buf)[2] = cpu_to_be32(cap_mask);
+	} else {
+		((u8 *) mailbox->buf)[3]     = !!reset_qkey_viols;
+		((__be32 *) mailbox->buf)[1] = cpu_to_be32(cap_mask);
+	}
+
+	err = mlx4_cmd(dev->dev, mailbox->dma, port, 0, MLX4_CMD_SET_PORT,
+		       MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED);
+
+	mlx4_free_cmd_mailbox(dev->dev, mailbox);
+	return err;
+}
+
+static int mlx4_ib_modify_port(struct ib_device *ibdev, u8 port, int mask,
+			       struct ib_port_modify *props)
+{
+	struct mlx4_ib_dev *mdev = to_mdev(ibdev);
+	u8 is_eth = mdev->dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH;
+	struct ib_port_attr attr;
+	u32 cap_mask;
+	int err;
+
+	/* return OK if this is RoCE. CM calls ib_modify_port() regardless
+	 * of whether port link layer is ETH or IB. For ETH ports, qkey
+	 * violations and port capabilities are not meaningful.
+	 */
+	if (is_eth)
+		return 0;
+
+	mutex_lock(&mdev->cap_mask_mutex);
+
+	err = mlx4_ib_query_port(ibdev, port, &attr);
+	if (err)
+		goto out;
+
+	cap_mask = (attr.port_cap_flags | props->set_port_cap_mask) &
+		~props->clr_port_cap_mask;
+
+	err = mlx4_ib_SET_PORT(mdev, port,
+			       !!(mask & IB_PORT_RESET_QKEY_CNTR),
+			       cap_mask);
+
+out:
+	mutex_unlock(&to_mdev(ibdev)->cap_mask_mutex);
+	return err;
+}
+
+static struct ib_ucontext *mlx4_ib_alloc_ucontext(struct ib_device *ibdev,
+						  struct ib_udata *udata)
+{
+	struct mlx4_ib_dev *dev = to_mdev(ibdev);
+	struct mlx4_ib_ucontext *context;
+	struct mlx4_ib_alloc_ucontext_resp_v3 resp_v3;
+	struct mlx4_ib_alloc_ucontext_resp resp;
+	int err;
+
+	if (!dev->ib_active)
+		return ERR_PTR(-EAGAIN);
+
+	if (ibdev->uverbs_abi_ver == MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION) {
+		resp_v3.qp_tab_size      = dev->dev->caps.num_qps;
+		resp_v3.bf_reg_size      = dev->dev->caps.bf_reg_size;
+		resp_v3.bf_regs_per_page = dev->dev->caps.bf_regs_per_page;
+	} else {
+		resp.dev_caps	      = dev->dev->caps.userspace_caps;
+		resp.qp_tab_size      = dev->dev->caps.num_qps;
+		resp.bf_reg_size      = dev->dev->caps.bf_reg_size;
+		resp.bf_regs_per_page = dev->dev->caps.bf_regs_per_page;
+		resp.cqe_size	      = dev->dev->caps.cqe_size;
+	}
+
+	context = kmalloc(sizeof *context, GFP_KERNEL);
+	if (!context)
+		return ERR_PTR(-ENOMEM);
+
+	err = mlx4_uar_alloc(to_mdev(ibdev)->dev, &context->uar);
+	if (err) {
+		kfree(context);
+		return ERR_PTR(err);
+	}
+
+	INIT_LIST_HEAD(&context->db_page_list);
+	mutex_init(&context->db_page_mutex);
+
+	if (ibdev->uverbs_abi_ver == MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION)
+		err = ib_copy_to_udata(udata, &resp_v3, sizeof(resp_v3));
+	else
+		err = ib_copy_to_udata(udata, &resp, sizeof(resp));
+
+	if (err) {
+		mlx4_uar_free(to_mdev(ibdev)->dev, &context->uar);
+		kfree(context);
+		return ERR_PTR(-EFAULT);
+	}
+
+	return &context->ibucontext;
+}
+
+static int mlx4_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
+{
+	struct mlx4_ib_ucontext *context = to_mucontext(ibcontext);
+
+	mlx4_uar_free(to_mdev(ibcontext->device)->dev, &context->uar);
+	kfree(context);
+
+	return 0;
+}
+
+static int mlx4_ib_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
+{
+	struct mlx4_ib_dev *dev = to_mdev(context->device);
+
+	if (vma->vm_end - vma->vm_start != PAGE_SIZE)
+		return -EINVAL;
+
+	if (vma->vm_pgoff == 0) {
+		vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+
+		if (io_remap_pfn_range(vma, vma->vm_start,
+				       to_mucontext(context)->uar.pfn,
+				       PAGE_SIZE, vma->vm_page_prot))
+			return -EAGAIN;
+	} else if (vma->vm_pgoff == 1 && dev->dev->caps.bf_reg_size != 0) {
+		vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
+
+		if (io_remap_pfn_range(vma, vma->vm_start,
+				       to_mucontext(context)->uar.pfn +
+				       dev->dev->caps.num_uars,
+				       PAGE_SIZE, vma->vm_page_prot))
+			return -EAGAIN;
+	} else
+		return -EINVAL;
+
+	return 0;
+}
+
+static struct ib_pd *mlx4_ib_alloc_pd(struct ib_device *ibdev,
+				      struct ib_ucontext *context,
+				      struct ib_udata *udata)
+{
+	struct mlx4_ib_pd *pd;
+	int err;
+
+	pd = kmalloc(sizeof *pd, GFP_KERNEL);
+	if (!pd)
+		return ERR_PTR(-ENOMEM);
+
+	err = mlx4_pd_alloc(to_mdev(ibdev)->dev, &pd->pdn);
+	if (err) {
+		kfree(pd);
+		return ERR_PTR(err);
+	}
+
+	if (context)
+		if (ib_copy_to_udata(udata, &pd->pdn, sizeof (__u32))) {
+			mlx4_pd_free(to_mdev(ibdev)->dev, pd->pdn);
+			kfree(pd);
+			return ERR_PTR(-EFAULT);
+		}
+
+	return &pd->ibpd;
+}
+
+static int mlx4_ib_dealloc_pd(struct ib_pd *pd)
+{
+	mlx4_pd_free(to_mdev(pd->device)->dev, to_mpd(pd)->pdn);
+	kfree(pd);
+
+	return 0;
+}
+
+static struct ib_xrcd *mlx4_ib_alloc_xrcd(struct ib_device *ibdev,
+					  struct ib_ucontext *context,
+					  struct ib_udata *udata)
+{
+	struct mlx4_ib_xrcd *xrcd;
+	int err;
+
+	if (!(to_mdev(ibdev)->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC))
+		return ERR_PTR(-ENOSYS);
+
+	xrcd = kmalloc(sizeof *xrcd, GFP_KERNEL);
+	if (!xrcd)
+		return ERR_PTR(-ENOMEM);
+
+	err = mlx4_xrcd_alloc(to_mdev(ibdev)->dev, &xrcd->xrcdn);
+	if (err)
+		goto err1;
+
+	xrcd->pd = ib_alloc_pd(ibdev);
+	if (IS_ERR(xrcd->pd)) {
+		err = PTR_ERR(xrcd->pd);
+		goto err2;
+	}
+
+	xrcd->cq = ib_create_cq(ibdev, NULL, NULL, xrcd, 1, 0);
+	if (IS_ERR(xrcd->cq)) {
+		err = PTR_ERR(xrcd->cq);
+		goto err3;
+	}
+
+	return &xrcd->ibxrcd;
+
+err3:
+	ib_dealloc_pd(xrcd->pd);
+err2:
+	mlx4_xrcd_free(to_mdev(ibdev)->dev, xrcd->xrcdn);
+err1:
+	kfree(xrcd);
+	return ERR_PTR(err);
+}
+
+static int mlx4_ib_dealloc_xrcd(struct ib_xrcd *xrcd)
+{
+	ib_destroy_cq(to_mxrcd(xrcd)->cq);
+	ib_dealloc_pd(to_mxrcd(xrcd)->pd);
+	mlx4_xrcd_free(to_mdev(xrcd->device)->dev, to_mxrcd(xrcd)->xrcdn);
+	kfree(xrcd);
+
+	return 0;
+}
+
+static int add_gid_entry(struct ib_qp *ibqp, union ib_gid *gid)
+{
+	struct mlx4_ib_qp *mqp = to_mqp(ibqp);
+	struct mlx4_ib_dev *mdev = to_mdev(ibqp->device);
+	struct mlx4_ib_gid_entry *ge;
+
+	ge = kzalloc(sizeof *ge, GFP_KERNEL);
+	if (!ge)
+		return -ENOMEM;
+
+	ge->gid = *gid;
+	if (mlx4_ib_add_mc(mdev, mqp, gid)) {
+		ge->port = mqp->port;
+		ge->added = 1;
+	}
+
+	mutex_lock(&mqp->mutex);
+	list_add_tail(&ge->list, &mqp->gid_list);
+	mutex_unlock(&mqp->mutex);
+
+	return 0;
+}
+
+int mlx4_ib_add_mc(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp,
+		   union ib_gid *gid)
+{
+	struct net_device *ndev;
+	int ret = 0;
+
+	if (!mqp->port)
+		return 0;
+
+	spin_lock_bh(&mdev->iboe.lock);
+	ndev = mdev->iboe.netdevs[mqp->port - 1];
+	if (ndev)
+		dev_hold(ndev);
+	spin_unlock_bh(&mdev->iboe.lock);
+
+	if (ndev) {
+		ret = 1;
+		dev_put(ndev);
+	}
+
+	return ret;
+}
+
+struct mlx4_ib_steering {
+	struct list_head list;
+	u64 reg_id;
+	union ib_gid gid;
+};
+
+static int parse_flow_attr(struct mlx4_dev *dev,
+			   u32 qp_num,
+			   union ib_flow_spec *ib_spec,
+			   struct _rule_hw *mlx4_spec)
+{
+	enum mlx4_net_trans_rule_id type;
+
+	switch (ib_spec->type) {
+	case IB_FLOW_SPEC_ETH:
+		type = MLX4_NET_TRANS_RULE_ID_ETH;
+		memcpy(mlx4_spec->eth.dst_mac, ib_spec->eth.val.dst_mac,
+		       ETH_ALEN);
+		memcpy(mlx4_spec->eth.dst_mac_msk, ib_spec->eth.mask.dst_mac,
+		       ETH_ALEN);
+		mlx4_spec->eth.vlan_tag = ib_spec->eth.val.vlan_tag;
+		mlx4_spec->eth.vlan_tag_msk = ib_spec->eth.mask.vlan_tag;
+		break;
+	case IB_FLOW_SPEC_IB:
+		type = MLX4_NET_TRANS_RULE_ID_IB;
+		mlx4_spec->ib.l3_qpn =
+			cpu_to_be32(qp_num);
+		mlx4_spec->ib.qpn_mask =
+			cpu_to_be32(MLX4_IB_FLOW_QPN_MASK);
+		break;
+
+
+	case IB_FLOW_SPEC_IPV4:
+		type = MLX4_NET_TRANS_RULE_ID_IPV4;
+		mlx4_spec->ipv4.src_ip = ib_spec->ipv4.val.src_ip;
+		mlx4_spec->ipv4.src_ip_msk = ib_spec->ipv4.mask.src_ip;
+		mlx4_spec->ipv4.dst_ip = ib_spec->ipv4.val.dst_ip;
+		mlx4_spec->ipv4.dst_ip_msk = ib_spec->ipv4.mask.dst_ip;
+		break;
+
+	case IB_FLOW_SPEC_TCP:
+	case IB_FLOW_SPEC_UDP:
+		type = ib_spec->type == IB_FLOW_SPEC_TCP ?
+					MLX4_NET_TRANS_RULE_ID_TCP :
+					MLX4_NET_TRANS_RULE_ID_UDP;
+		mlx4_spec->tcp_udp.dst_port = ib_spec->tcp_udp.val.dst_port;
+		mlx4_spec->tcp_udp.dst_port_msk = ib_spec->tcp_udp.mask.dst_port;
+		mlx4_spec->tcp_udp.src_port = ib_spec->tcp_udp.val.src_port;
+		mlx4_spec->tcp_udp.src_port_msk = ib_spec->tcp_udp.mask.src_port;
+		break;
+
+	default:
+		return -EINVAL;
+	}
+	if (mlx4_map_sw_to_hw_steering_id(dev, type) < 0 ||
+	    mlx4_hw_rule_sz(dev, type) < 0)
+		return -EINVAL;
+	mlx4_spec->id = cpu_to_be16(mlx4_map_sw_to_hw_steering_id(dev, type));
+	mlx4_spec->size = mlx4_hw_rule_sz(dev, type) >> 2;
+	return mlx4_hw_rule_sz(dev, type);
+}
+
+struct default_rules {
+	__u32 mandatory_fields[IB_FLOW_SPEC_SUPPORT_LAYERS];
+	__u32 mandatory_not_fields[IB_FLOW_SPEC_SUPPORT_LAYERS];
+	__u32 rules_create_list[IB_FLOW_SPEC_SUPPORT_LAYERS];
+	__u8  link_layer;
+};
+static const struct default_rules default_table[] = {
+	{
+		.mandatory_fields = {IB_FLOW_SPEC_IPV4},
+		.mandatory_not_fields = {IB_FLOW_SPEC_ETH},
+		.rules_create_list = {IB_FLOW_SPEC_IB},
+		.link_layer = IB_LINK_LAYER_INFINIBAND
+	}
+};
+
+static int __mlx4_ib_default_rules_match(struct ib_qp *qp,
+					 struct ib_flow_attr *flow_attr)
+{
+	int i, j, k;
+	void *ib_flow;
+	const struct default_rules *pdefault_rules = default_table;
+	u8 link_layer = rdma_port_get_link_layer(qp->device, flow_attr->port);
+
+	for (i = 0; i < ARRAY_SIZE(default_table); i++, pdefault_rules++) {
+		__u32 field_types[IB_FLOW_SPEC_SUPPORT_LAYERS];
+		memset(&field_types, 0, sizeof(field_types));
+
+		if (link_layer != pdefault_rules->link_layer)
+			continue;
+
+		ib_flow = flow_attr + 1;
+		/* we assume the specs are sorted */
+		for (j = 0, k = 0; k < IB_FLOW_SPEC_SUPPORT_LAYERS &&
+		     j < flow_attr->num_of_specs; k++) {
+			union ib_flow_spec *current_flow =
+				(union ib_flow_spec *)ib_flow;
+
+			/* same layer but different type */
+			if (((current_flow->type & IB_FLOW_SPEC_LAYER_MASK) ==
+			     (pdefault_rules->mandatory_fields[k] &
+			      IB_FLOW_SPEC_LAYER_MASK)) &&
+			    (current_flow->type !=
+			     pdefault_rules->mandatory_fields[k]))
+				goto out;
+
+			/* same layer, try match next one */
+			if (current_flow->type ==
+			    pdefault_rules->mandatory_fields[k]) {
+				j++;
+				ib_flow +=
+					((union ib_flow_spec *)ib_flow)->size;
+			}
+		}
+
+		ib_flow = flow_attr + 1;
+		for (j = 0; j < flow_attr->num_of_specs;
+		     j++, ib_flow += ((union ib_flow_spec *)ib_flow)->size)
+			for (k = 0; k < IB_FLOW_SPEC_SUPPORT_LAYERS; k++)
+				/* same layer and same type */
+				if (((union ib_flow_spec *)ib_flow)->type ==
+				    pdefault_rules->mandatory_not_fields[k])
+					goto out;
+
+		return i;
+	}
+out:
+	return -1;
+}
+
+static int __mlx4_ib_create_default_rules(
+		struct mlx4_ib_dev *mdev,
+		struct ib_qp *qp,
+		const struct default_rules *pdefault_rules,
+		struct _rule_hw *mlx4_spec) {
+	int size = 0;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(pdefault_rules->rules_create_list); i++) {
+		int ret;
+		union ib_flow_spec ib_spec;
+		switch (pdefault_rules->rules_create_list[i]) {
+		case 0:
+			/* no rule */
+			continue;
+		case IB_FLOW_SPEC_IB:
+			ib_spec.type = IB_FLOW_SPEC_IB;
+			ib_spec.size = sizeof(struct ib_flow_spec_ib);
+
+			break;
+		default:
+			/* invalid rule */
+			return -EINVAL;
+		}
+		/* We must put empty rule, qpn is being ignored */
+		ret = parse_flow_attr(mdev->dev, 0, &ib_spec,
+				      mlx4_spec);
+		if (ret < 0) {
+			pr_info("invalid parsing\n");
+			return -EINVAL;
+		}
+
+		mlx4_spec = (void *)mlx4_spec + ret;
+		size += ret;
+	}
+	return size;
+}
+
+static int __mlx4_ib_create_flow(struct ib_qp *qp, struct ib_flow_attr *flow_attr,
+			  int domain,
+			  enum mlx4_net_trans_promisc_mode flow_type,
+			  u64 *reg_id)
+{
+	int ret, i;
+	int size = 0;
+	void *ib_flow;
+	struct mlx4_ib_dev *mdev = to_mdev(qp->device);
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_net_trans_rule_hw_ctrl *ctrl;
+	int default_flow;
+
+	static const u16 __mlx4_domain[] = {
+		[IB_FLOW_DOMAIN_USER] = MLX4_DOMAIN_UVERBS,
+		[IB_FLOW_DOMAIN_ETHTOOL] = MLX4_DOMAIN_ETHTOOL,
+		[IB_FLOW_DOMAIN_RFS] = MLX4_DOMAIN_RFS,
+		[IB_FLOW_DOMAIN_NIC] = MLX4_DOMAIN_NIC,
+	};
+
+	if (flow_attr->priority > MLX4_IB_FLOW_MAX_PRIO) {
+		pr_err("Invalid priority value %d\n", flow_attr->priority);
+		return -EINVAL;
+	}
+
+	if (domain >= IB_FLOW_DOMAIN_NUM) {
+		pr_err("Invalid domain value %d\n", domain);
+		return -EINVAL;
+	}
+
+	if (mlx4_map_sw_to_hw_steering_mode(mdev->dev, flow_type) < 0)
+		return -EINVAL;
+
+	mailbox = mlx4_alloc_cmd_mailbox(mdev->dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	ctrl = mailbox->buf;
+
+	ctrl->prio = cpu_to_be16(__mlx4_domain[domain] |
+				 flow_attr->priority);
+	ctrl->type = mlx4_map_sw_to_hw_steering_mode(mdev->dev, flow_type);
+	ctrl->port = flow_attr->port;
+	ctrl->qpn = cpu_to_be32(qp->qp_num);
+
+	ib_flow = flow_attr + 1;
+	size += sizeof(struct mlx4_net_trans_rule_hw_ctrl);
+	/* Add default flows */
+	default_flow = __mlx4_ib_default_rules_match(qp, flow_attr);
+	if (default_flow >= 0) {
+		ret = __mlx4_ib_create_default_rules(
+				mdev, qp, default_table + default_flow,
+				mailbox->buf + size);
+		if (ret < 0) {
+			mlx4_free_cmd_mailbox(mdev->dev, mailbox);
+			return -EINVAL;
+		}
+		size += ret;
+	}
+	for (i = 0; i < flow_attr->num_of_specs; i++) {
+		ret = parse_flow_attr(mdev->dev, qp->qp_num, ib_flow,
+				      mailbox->buf + size);
+		if (ret < 0) {
+			mlx4_free_cmd_mailbox(mdev->dev, mailbox);
+			return -EINVAL;
+		}
+		ib_flow += ((union ib_flow_spec *) ib_flow)->size;
+		size += ret;
+	}
+
+	ret = mlx4_cmd_imm(mdev->dev, mailbox->dma, reg_id, size >> 2, 0,
+			   MLX4_QP_FLOW_STEERING_ATTACH, MLX4_CMD_TIME_CLASS_A,
+			   MLX4_CMD_NATIVE);
+	if (ret == -ENOMEM)
+		pr_err("mcg table is full. Fail to register network rule.\n");
+	else if (ret == -ENXIO)
+		pr_err("Device managed flow steering is disabled. Fail to register network rule.\n");
+	else if (ret)
+		pr_err("Invalid argumant. Fail to register network rule.\n");
+
+	mlx4_free_cmd_mailbox(mdev->dev, mailbox);
+	return ret;
+}
+
+static int __mlx4_ib_destroy_flow(struct mlx4_dev *dev, u64 reg_id)
+{
+	int err;
+	err = mlx4_cmd(dev, reg_id, 0, 0,
+		       MLX4_QP_FLOW_STEERING_DETACH, MLX4_CMD_TIME_CLASS_A,
+		       MLX4_CMD_NATIVE);
+	if (err)
+		pr_err("Fail to detach network rule. registration id = 0x%llx\n",
+		       reg_id);
+	return err;
+}
+
+static int mlx4_ib_tunnel_steer_add(struct ib_qp *qp, struct ib_flow_attr *flow_attr,
+				    u64 *reg_id)
+{
+	void *ib_flow;
+	union ib_flow_spec *ib_spec;
+	struct mlx4_dev	*dev = to_mdev(qp->device)->dev;
+	int err = 0;
+
+	if (dev->caps.tunnel_offload_mode != MLX4_TUNNEL_OFFLOAD_MODE_VXLAN)
+		return 0; /* do nothing */
+
+	ib_flow = flow_attr + 1;
+	ib_spec = (union ib_flow_spec *)ib_flow;
+
+	if (ib_spec->type !=  IB_FLOW_SPEC_ETH || flow_attr->num_of_specs != 1)
+		return 0; /* do nothing */
+
+	err = mlx4_tunnel_steer_add(to_mdev(qp->device)->dev, ib_spec->eth.val.dst_mac,
+				    flow_attr->port, qp->qp_num,
+				    MLX4_DOMAIN_UVERBS | (flow_attr->priority & 0xff),
+				    reg_id);
+	return err;
+}
+
+static struct ib_flow *mlx4_ib_create_flow(struct ib_qp *qp,
+				    struct ib_flow_attr *flow_attr,
+				    int domain)
+{
+	int err = 0, i = 0;
+	struct mlx4_ib_flow *mflow;
+	enum mlx4_net_trans_promisc_mode type[2];
+
+	memset(type, 0, sizeof(type));
+
+	mflow = kzalloc(sizeof(*mflow), GFP_KERNEL);
+	if (!mflow) {
+		err = -ENOMEM;
+		goto err_free;
+	}
+
+	switch (flow_attr->type) {
+	case IB_FLOW_ATTR_NORMAL:
+		type[0] = MLX4_FS_REGULAR;
+		break;
+
+	case IB_FLOW_ATTR_ALL_DEFAULT:
+		type[0] = MLX4_FS_ALL_DEFAULT;
+		break;
+
+	case IB_FLOW_ATTR_MC_DEFAULT:
+		type[0] = MLX4_FS_MC_DEFAULT;
+		break;
+
+	case IB_FLOW_ATTR_SNIFFER:
+		type[0] = MLX4_FS_UC_SNIFFER;
+		type[1] = MLX4_FS_MC_SNIFFER;
+		break;
+
+	default:
+		err = -EINVAL;
+		goto err_free;
+	}
+
+	while (i < ARRAY_SIZE(type) && type[i]) {
+		err = __mlx4_ib_create_flow(qp, flow_attr, domain, type[i],
+					    &mflow->reg_id[i]);
+		if (err)
+			goto err_create_flow;
+		i++;
+	}
+
+	if (i < ARRAY_SIZE(type) && flow_attr->type == IB_FLOW_ATTR_NORMAL) {
+		err = mlx4_ib_tunnel_steer_add(qp, flow_attr, &mflow->reg_id[i]);
+		if (err)
+			goto err_create_flow;
+		i++;
+	}
+
+	return &mflow->ibflow;
+
+err_create_flow:
+	while (i) {
+		(void)__mlx4_ib_destroy_flow(to_mdev(qp->device)->dev, mflow->reg_id[i]);
+		i--;
+	}
+err_free:
+	kfree(mflow);
+	return ERR_PTR(err);
+}
+
+static int mlx4_ib_destroy_flow(struct ib_flow *flow_id)
+{
+	int err, ret = 0;
+	int i = 0;
+	struct mlx4_ib_dev *mdev = to_mdev(flow_id->qp->device);
+	struct mlx4_ib_flow *mflow = to_mflow(flow_id);
+
+	while (i < ARRAY_SIZE(mflow->reg_id) && mflow->reg_id[i]) {
+		err = __mlx4_ib_destroy_flow(mdev->dev, mflow->reg_id[i]);
+		if (err)
+			ret = err;
+		i++;
+	}
+
+	kfree(mflow);
+	return ret;
+}
+
+static int mlx4_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+	int err;
+	struct mlx4_ib_dev *mdev = to_mdev(ibqp->device);
+	struct mlx4_ib_qp *mqp = to_mqp(ibqp);
+	u64 reg_id;
+	struct mlx4_ib_steering *ib_steering = NULL;
+	enum mlx4_protocol prot = (gid->raw[1] == 0x0e) ?
+		MLX4_PROT_IB_IPV4 : MLX4_PROT_IB_IPV6;
+
+	if (mdev->dev->caps.steering_mode ==
+	    MLX4_STEERING_MODE_DEVICE_MANAGED) {
+		ib_steering = kmalloc(sizeof(*ib_steering), GFP_KERNEL);
+		if (!ib_steering)
+			return -ENOMEM;
+	}
+
+	err = mlx4_multicast_attach(mdev->dev, &mqp->mqp, gid->raw, mqp->port,
+				    !!(mqp->flags &
+				       MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK),
+				    prot, &reg_id);
+	if (err)
+		goto err_malloc;
+
+	err = add_gid_entry(ibqp, gid);
+	if (err)
+		goto err_add;
+
+	if (ib_steering) {
+		memcpy(ib_steering->gid.raw, gid->raw, 16);
+		ib_steering->reg_id = reg_id;
+		mutex_lock(&mqp->mutex);
+		list_add(&ib_steering->list, &mqp->steering_rules);
+		mutex_unlock(&mqp->mutex);
+	}
+	return 0;
+
+err_add:
+	mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw,
+			      prot, reg_id);
+err_malloc:
+	kfree(ib_steering);
+
+	return err;
+}
+
+static struct mlx4_ib_gid_entry *find_gid_entry(struct mlx4_ib_qp *qp, u8 *raw)
+{
+	struct mlx4_ib_gid_entry *ge;
+	struct mlx4_ib_gid_entry *tmp;
+	struct mlx4_ib_gid_entry *ret = NULL;
+
+	list_for_each_entry_safe(ge, tmp, &qp->gid_list, list) {
+		if (!memcmp(raw, ge->gid.raw, 16)) {
+			ret = ge;
+			break;
+		}
+	}
+
+	return ret;
+}
+
+static int mlx4_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+	int err;
+	struct mlx4_ib_dev *mdev = to_mdev(ibqp->device);
+	struct mlx4_ib_qp *mqp = to_mqp(ibqp);
+	struct net_device *ndev;
+	struct mlx4_ib_gid_entry *ge;
+	u64 reg_id = 0;
+	enum mlx4_protocol prot = (gid->raw[1] == 0x0e) ?
+		MLX4_PROT_IB_IPV4 : MLX4_PROT_IB_IPV6;
+
+	if (mdev->dev->caps.steering_mode ==
+	    MLX4_STEERING_MODE_DEVICE_MANAGED) {
+		struct mlx4_ib_steering *ib_steering;
+
+		mutex_lock(&mqp->mutex);
+		list_for_each_entry(ib_steering, &mqp->steering_rules, list) {
+			if (!memcmp(ib_steering->gid.raw, gid->raw, 16)) {
+				list_del(&ib_steering->list);
+				break;
+			}
+		}
+		mutex_unlock(&mqp->mutex);
+		if (&ib_steering->list == &mqp->steering_rules) {
+			pr_err("Couldn't find reg_id for mgid. Steering rule is left attached\n");
+			return -EINVAL;
+		}
+		reg_id = ib_steering->reg_id;
+		kfree(ib_steering);
+	}
+
+	err = mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw,
+				    prot, reg_id);
+	if (err)
+		return err;
+
+	mutex_lock(&mqp->mutex);
+	ge = find_gid_entry(mqp, gid->raw);
+	if (ge) {
+		spin_lock_bh(&mdev->iboe.lock);
+		ndev = ge->added ? mdev->iboe.netdevs[ge->port - 1] : NULL;
+		if (ndev)
+			dev_hold(ndev);
+		spin_unlock_bh(&mdev->iboe.lock);
+		if (ndev)
+			dev_put(ndev);
+		list_del(&ge->list);
+		kfree(ge);
+	} else
+		pr_warn("could not find mgid entry\n");
+
+	mutex_unlock(&mqp->mutex);
+
+	return 0;
+}
+
+static int init_node_data(struct mlx4_ib_dev *dev)
+{
+	struct ib_smp *in_mad  = NULL;
+	struct ib_smp *out_mad = NULL;
+	int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS;
+	int err = -ENOMEM;
+
+	in_mad  = kzalloc(sizeof *in_mad, GFP_KERNEL);
+	out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+	if (!in_mad || !out_mad)
+		goto out;
+
+	init_query_mad(in_mad);
+	in_mad->attr_id = IB_SMP_ATTR_NODE_DESC;
+	if (mlx4_is_master(dev->dev))
+		mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW;
+
+	err = mlx4_MAD_IFC(dev, mad_ifc_flags, 1, NULL, NULL, in_mad, out_mad);
+	if (err)
+		goto out;
+
+	memcpy(dev->ib_dev.node_desc, out_mad->data, 64);
+
+	in_mad->attr_id = IB_SMP_ATTR_NODE_INFO;
+
+	err = mlx4_MAD_IFC(dev, mad_ifc_flags, 1, NULL, NULL, in_mad, out_mad);
+	if (err)
+		goto out;
+
+	dev->dev->rev_id = be32_to_cpup((__be32 *) (out_mad->data + 32));
+	memcpy(&dev->ib_dev.node_guid, out_mad->data + 12, 8);
+
+out:
+	kfree(in_mad);
+	kfree(out_mad);
+	return err;
+}
+
+static ssize_t show_hca(struct device *device, struct device_attribute *attr,
+			char *buf)
+{
+	struct mlx4_ib_dev *dev =
+		container_of(device, struct mlx4_ib_dev, ib_dev.dev);
+	return sprintf(buf, "MT%d\n", dev->dev->pdev->device);
+}
+
+static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr,
+			   char *buf)
+{
+	struct mlx4_ib_dev *dev =
+		container_of(device, struct mlx4_ib_dev, ib_dev.dev);
+	return sprintf(buf, "%d.%d.%d\n", (int) (dev->dev->caps.fw_ver >> 32),
+		       (int) (dev->dev->caps.fw_ver >> 16) & 0xffff,
+		       (int) dev->dev->caps.fw_ver & 0xffff);
+}
+
+static ssize_t show_rev(struct device *device, struct device_attribute *attr,
+			char *buf)
+{
+	struct mlx4_ib_dev *dev =
+		container_of(device, struct mlx4_ib_dev, ib_dev.dev);
+	return sprintf(buf, "%x\n", dev->dev->rev_id);
+}
+
+static ssize_t show_board(struct device *device, struct device_attribute *attr,
+			  char *buf)
+{
+	struct mlx4_ib_dev *dev =
+		container_of(device, struct mlx4_ib_dev, ib_dev.dev);
+	return sprintf(buf, "%.*s\n", MLX4_BOARD_ID_LEN,
+		       dev->dev->board_id);
+}
+
+static DEVICE_ATTR(hw_rev,   S_IRUGO, show_rev,    NULL);
+static DEVICE_ATTR(fw_ver,   S_IRUGO, show_fw_ver, NULL);
+static DEVICE_ATTR(hca_type, S_IRUGO, show_hca,    NULL);
+static DEVICE_ATTR(board_id, S_IRUGO, show_board,  NULL);
+
+static struct device_attribute *mlx4_class_attributes[] = {
+	&dev_attr_hw_rev,
+	&dev_attr_fw_ver,
+	&dev_attr_hca_type,
+	&dev_attr_board_id
+};
+
+static void mlx4_addrconf_ifid_eui48(u8 *eui, u16 vlan_id,
+				     struct net_device *dev)
+{
+	memcpy(eui, dev->dev_addr, 3);
+	memcpy(eui + 5, dev->dev_addr + 3, 3);
+	if (vlan_id < 0x1000) {
+		eui[3] = vlan_id >> 8;
+		eui[4] = vlan_id & 0xff;
+	} else {
+		eui[3] = 0xff;
+		eui[4] = 0xfe;
+	}
+	eui[0] ^= 2;
+}
+
+static void update_gids_task(struct work_struct *work)
+{
+	struct update_gid_work *gw = container_of(work, struct update_gid_work, work);
+	struct mlx4_cmd_mailbox *mailbox;
+	union ib_gid *gids;
+	int err;
+	struct mlx4_dev	*dev = gw->dev->dev;
+
+	if (!gw->dev->ib_active)
+		return;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox)) {
+		pr_warn("update gid table failed %ld\n", PTR_ERR(mailbox));
+		return;
+	}
+
+	gids = mailbox->buf;
+	memcpy(gids, gw->gids, sizeof gw->gids);
+
+	err = mlx4_cmd(dev, mailbox->dma, MLX4_SET_PORT_GID_TABLE << 8 | gw->port,
+		       1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B,
+		       MLX4_CMD_WRAPPED);
+	if (err)
+		pr_warn("set port command failed\n");
+	else
+		mlx4_ib_dispatch_event(gw->dev, gw->port, IB_EVENT_GID_CHANGE);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	kfree(gw);
+}
+
+static void reset_gids_task(struct work_struct *work)
+{
+	struct update_gid_work *gw =
+			container_of(work, struct update_gid_work, work);
+	struct mlx4_cmd_mailbox *mailbox;
+	union ib_gid *gids;
+	int err;
+	struct mlx4_dev	*dev = gw->dev->dev;
+
+	if (!gw->dev->ib_active)
+		return;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox)) {
+		pr_warn("reset gid table failed\n");
+		goto free;
+	}
+
+	gids = mailbox->buf;
+	memcpy(gids, gw->gids, sizeof(gw->gids));
+
+	if (mlx4_ib_port_link_layer(&gw->dev->ib_dev, gw->port) ==
+				    IB_LINK_LAYER_ETHERNET) {
+		err = mlx4_cmd(dev, mailbox->dma,
+			       MLX4_SET_PORT_GID_TABLE << 8 | gw->port,
+			       1, MLX4_CMD_SET_PORT,
+			       MLX4_CMD_TIME_CLASS_B,
+			       MLX4_CMD_WRAPPED);
+		if (err)
+			pr_warn(KERN_WARNING
+				"set port %d command failed\n", gw->port);
+	}
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+free:
+	kfree(gw);
+}
+
+static int update_gid_table(struct mlx4_ib_dev *dev, int port,
+			    union ib_gid *gid, int clear,
+			    int default_gid)
+{
+	struct update_gid_work *work;
+	int i;
+	int need_update = 0;
+	int free = -1;
+	int found = -1;
+	int max_gids;
+
+	if (default_gid) {
+		free = 0;
+	} else {
+		max_gids = dev->dev->caps.gid_table_len[port];
+		for (i = 1; i < max_gids; ++i) {
+			if (!memcmp(&dev->iboe.gid_table[port - 1][i], gid,
+				    sizeof(*gid)))
+				found = i;
+
+			if (clear) {
+				if (found >= 0) {
+					need_update = 1;
+					dev->iboe.gid_table[port - 1][found] =
+						zgid;
+					break;
+				}
+			} else {
+				if (found >= 0)
+					break;
+
+				if (free < 0 &&
+				    !memcmp(&dev->iboe.gid_table[port - 1][i],
+					    &zgid, sizeof(*gid)))
+					free = i;
+			}
+		}
+	}
+
+	if (found == -1 && !clear && free >= 0) {
+		dev->iboe.gid_table[port - 1][free] = *gid;
+		need_update = 1;
+	}
+
+	if (!need_update)
+		return 0;
+
+	work = kzalloc(sizeof(*work), GFP_ATOMIC);
+	if (!work)
+		return -ENOMEM;
+
+	memcpy(work->gids, dev->iboe.gid_table[port - 1], sizeof(work->gids));
+	INIT_WORK(&work->work, update_gids_task);
+	work->port = port;
+	work->dev = dev;
+	queue_work(wq, &work->work);
+
+	return 0;
+}
+
+static void mlx4_make_default_gid(struct  net_device *dev, union ib_gid *gid)
+{
+	gid->global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL);
+	mlx4_addrconf_ifid_eui48(&gid->raw[8], 0xffff, dev);
+}
+
+
+static int reset_gid_table(struct mlx4_ib_dev *dev, u8 port)
+{
+	struct update_gid_work *work;
+
+	work = kzalloc(sizeof(*work), GFP_ATOMIC);
+	if (!work)
+		return -ENOMEM;
+
+	memset(dev->iboe.gid_table[port - 1], 0, sizeof(work->gids));
+	memset(work->gids, 0, sizeof(work->gids));
+	INIT_WORK(&work->work, reset_gids_task);
+	work->dev = dev;
+	work->port = port;
+	queue_work(wq, &work->work);
+	return 0;
+}
+
+static int mlx4_ib_addr_event(int event, struct net_device *event_netdev,
+			      struct mlx4_ib_dev *ibdev, union ib_gid *gid)
+{
+	struct mlx4_ib_iboe *iboe;
+	int port = 0;
+	struct net_device *real_dev = rdma_vlan_dev_real_dev(event_netdev) ?
+				rdma_vlan_dev_real_dev(event_netdev) :
+				event_netdev;
+	union ib_gid default_gid;
+
+	mlx4_make_default_gid(real_dev, &default_gid);
+
+	if (!memcmp(gid, &default_gid, sizeof(*gid)))
+		return 0;
+
+	if (event != NETDEV_DOWN && event != NETDEV_UP)
+		return 0;
+
+	if ((real_dev != event_netdev) &&
+	    (event == NETDEV_DOWN) &&
+	    rdma_link_local_addr((struct in6_addr *)gid))
+		return 0;
+
+	iboe = &ibdev->iboe;
+	spin_lock_bh(&iboe->lock);
+
+	for (port = 1; port <= ibdev->dev->caps.num_ports; ++port)
+		if ((netif_is_bond_master(real_dev) &&
+		     (real_dev == iboe->masters[port - 1])) ||
+		     (!netif_is_bond_master(real_dev) &&
+		     (real_dev == iboe->netdevs[port - 1])))
+			update_gid_table(ibdev, port, gid,
+					 event == NETDEV_DOWN, 0);
+
+	spin_unlock_bh(&iboe->lock);
+	return 0;
+
+}
+
+static u8 mlx4_ib_get_dev_port(struct net_device *dev,
+			       struct mlx4_ib_dev *ibdev)
+{
+	u8 port = 0;
+	struct mlx4_ib_iboe *iboe;
+	struct net_device *real_dev = rdma_vlan_dev_real_dev(dev) ?
+				rdma_vlan_dev_real_dev(dev) : dev;
+
+	iboe = &ibdev->iboe;
+
+	for (port = 1; port <= ibdev->dev->caps.num_ports; ++port)
+		if ((netif_is_bond_master(real_dev) &&
+		     (real_dev == iboe->masters[port - 1])) ||
+		     (!netif_is_bond_master(real_dev) &&
+		     (real_dev == iboe->netdevs[port - 1])))
+			break;
+
+	if ((port == 0) || (port > ibdev->dev->caps.num_ports))
+		return 0;
+	else
+		return port;
+}
+
+static int mlx4_ib_inet_event(struct notifier_block *this, unsigned long event,
+				void *ptr)
+{
+	struct mlx4_ib_dev *ibdev;
+	struct in_ifaddr *ifa = ptr;
+	union ib_gid gid;
+	struct net_device *event_netdev = ifa->ifa_dev->dev;
+
+	ipv6_addr_set_v4mapped(ifa->ifa_address, (struct in6_addr *)&gid);
+
+	ibdev = container_of(this, struct mlx4_ib_dev, iboe.nb_inet);
+
+	mlx4_ib_addr_event(event, event_netdev, ibdev, &gid);
+	return NOTIFY_DONE;
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static int mlx4_ib_inet6_event(struct notifier_block *this, unsigned long event,
+				void *ptr)
+{
+	struct mlx4_ib_dev *ibdev;
+	struct inet6_ifaddr *ifa = ptr;
+	union  ib_gid *gid = (union ib_gid *)&ifa->addr;
+	struct net_device *event_netdev = ifa->idev->dev;
+
+	ibdev = container_of(this, struct mlx4_ib_dev, iboe.nb_inet6);
+
+	mlx4_ib_addr_event(event, event_netdev, ibdev, gid);
+	return NOTIFY_DONE;
+}
+#endif
+
+#define MLX4_IB_INVALID_MAC	((u64)-1)
+static void mlx4_ib_update_qps(struct mlx4_ib_dev *ibdev,
+			       struct net_device *dev,
+			       int port)
+{
+	u64 new_smac = 0;
+	u64 release_mac = MLX4_IB_INVALID_MAC;
+	struct mlx4_ib_qp *qp;
+
+	read_lock(&dev_base_lock);
+	new_smac = mlx4_mac_to_u64(dev->dev_addr);
+	read_unlock(&dev_base_lock);
+
+	atomic64_set(&ibdev->iboe.mac[port - 1], new_smac);
+
+	/* no need for update QP1 and mac registration in non-SRIOV */
+	if (!mlx4_is_mfunc(ibdev->dev))
+		return;
+
+	mutex_lock(&ibdev->qp1_proxy_lock[port - 1]);
+	qp = ibdev->qp1_proxy[port - 1];
+	if (qp) {
+		int new_smac_index;
+		u64 old_smac;
+		struct mlx4_update_qp_params update_params;
+
+		mutex_lock(&qp->mutex);
+		old_smac = qp->pri.smac;
+		if (new_smac == old_smac)
+			goto unlock;
+
+		new_smac_index = mlx4_register_mac(ibdev->dev, port, new_smac);
+
+		if (new_smac_index < 0)
+			goto unlock;
+
+		update_params.smac_index = new_smac_index;
+		if (mlx4_update_qp(ibdev->dev, qp->mqp.qpn, MLX4_UPDATE_QP_SMAC,
+				   &update_params)) {
+			release_mac = new_smac;
+			goto unlock;
+		}
+		/* if old port was zero, no mac was yet registered for this QP */
+		if (qp->pri.smac_port)
+			release_mac = old_smac;
+		qp->pri.smac = new_smac;
+		qp->pri.smac_port = port;
+		qp->pri.smac_index = new_smac_index;
+	}
+
+unlock:
+	if (release_mac != MLX4_IB_INVALID_MAC)
+		mlx4_unregister_mac(ibdev->dev, port, release_mac);
+	if (qp)
+		mutex_unlock(&qp->mutex);
+	mutex_unlock(&ibdev->qp1_proxy_lock[port - 1]);
+}
+
+static void mlx4_ib_get_dev_addr(struct net_device *dev,
+				 struct mlx4_ib_dev *ibdev, u8 port)
+{
+	struct in_device *in_dev;
+#if IS_ENABLED(CONFIG_IPV6)
+	struct inet6_dev *in6_dev;
+	union ib_gid  *pgid;
+	struct inet6_ifaddr *ifp;
+	union ib_gid default_gid;
+#endif
+	union ib_gid gid;
+
+
+	if ((port == 0) || (port > ibdev->dev->caps.num_ports))
+		return;
+
+	/* IPv4 gids */
+	in_dev = in_dev_get(dev);
+	if (in_dev) {
+		for_ifa(in_dev) {
+			/*ifa->ifa_address;*/
+			ipv6_addr_set_v4mapped(ifa->ifa_address,
+					       (struct in6_addr *)&gid);
+			update_gid_table(ibdev, port, &gid, 0, 0);
+		}
+		endfor_ifa(in_dev);
+		in_dev_put(in_dev);
+	}
+#if IS_ENABLED(CONFIG_IPV6)
+	mlx4_make_default_gid(dev, &default_gid);
+	/* IPv6 gids */
+	in6_dev = in6_dev_get(dev);
+	if (in6_dev) {
+		read_lock_bh(&in6_dev->lock);
+		list_for_each_entry(ifp, &in6_dev->addr_list, if_list) {
+			pgid = (union ib_gid *)&ifp->addr;
+			if (!memcmp(pgid, &default_gid, sizeof(*pgid)))
+				continue;
+			update_gid_table(ibdev, port, pgid, 0, 0);
+		}
+		read_unlock_bh(&in6_dev->lock);
+		in6_dev_put(in6_dev);
+	}
+#endif
+}
+
+static void mlx4_ib_set_default_gid(struct mlx4_ib_dev *ibdev,
+				 struct  net_device *dev, u8 port)
+{
+	union ib_gid gid;
+	mlx4_make_default_gid(dev, &gid);
+	update_gid_table(ibdev, port, &gid, 0, 1);
+}
+
+static int mlx4_ib_init_gid_table(struct mlx4_ib_dev *ibdev)
+{
+	struct	net_device *dev;
+	struct mlx4_ib_iboe *iboe = &ibdev->iboe;
+	int i;
+	int err = 0;
+
+	for (i = 1; i <= ibdev->num_ports; ++i) {
+		if (rdma_port_get_link_layer(&ibdev->ib_dev, i) ==
+		    IB_LINK_LAYER_ETHERNET) {
+			err = reset_gid_table(ibdev, i);
+			if (err)
+				goto out;
+		}
+	}
+
+	read_lock(&dev_base_lock);
+	spin_lock_bh(&iboe->lock);
+
+	for_each_netdev(&init_net, dev) {
+		u8 port = mlx4_ib_get_dev_port(dev, ibdev);
+		/* port will be non-zero only for ETH ports */
+		if (port) {
+			mlx4_ib_set_default_gid(ibdev, dev, port);
+			mlx4_ib_get_dev_addr(dev, ibdev, port);
+		}
+	}
+
+	spin_unlock_bh(&iboe->lock);
+	read_unlock(&dev_base_lock);
+out:
+	return err;
+}
+
+static void mlx4_ib_scan_netdevs(struct mlx4_ib_dev *ibdev,
+				 struct net_device *dev,
+				 unsigned long event)
+
+{
+	struct mlx4_ib_iboe *iboe;
+	int update_qps_port = -1;
+	int port;
+
+	iboe = &ibdev->iboe;
+
+	spin_lock_bh(&iboe->lock);
+	mlx4_foreach_ib_transport_port(port, ibdev->dev) {
+		enum ib_port_state	port_state = IB_PORT_NOP;
+		struct net_device *old_master = iboe->masters[port - 1];
+		struct net_device *curr_netdev;
+		struct net_device *curr_master;
+
+		iboe->netdevs[port - 1] =
+			mlx4_get_protocol_dev(ibdev->dev, MLX4_PROT_ETH, port);
+		if (iboe->netdevs[port - 1])
+			mlx4_ib_set_default_gid(ibdev,
+						iboe->netdevs[port - 1], port);
+		curr_netdev = iboe->netdevs[port - 1];
+
+		if (iboe->netdevs[port - 1] &&
+		    netif_is_bond_slave(iboe->netdevs[port - 1])) {
+			iboe->masters[port - 1] = netdev_master_upper_dev_get(
+				iboe->netdevs[port - 1]);
+		} else {
+			iboe->masters[port - 1] = NULL;
+		}
+		curr_master = iboe->masters[port - 1];
+
+		if (dev == iboe->netdevs[port - 1] &&
+		    (event == NETDEV_CHANGEADDR || event == NETDEV_REGISTER ||
+		     event == NETDEV_UP || event == NETDEV_CHANGE))
+			update_qps_port = port;
+
+		if (curr_netdev) {
+			port_state = (netif_running(curr_netdev) && netif_carrier_ok(curr_netdev)) ?
+						IB_PORT_ACTIVE : IB_PORT_DOWN;
+			mlx4_ib_set_default_gid(ibdev, curr_netdev, port);
+			if (curr_master) {
+				/* if using bonding/team and a slave port is down, we
+				 * don't want the bond IP based gids in the table since
+				 * flows that select port by gid may get the down port.
+				*/
+				if (port_state == IB_PORT_DOWN) {
+					reset_gid_table(ibdev, port);
+					mlx4_ib_set_default_gid(ibdev,
+								curr_netdev,
+								port);
+				} else {
+					/* gids from the upper dev (bond/team)
+					 * should appear in port's gid table
+					*/
+					mlx4_ib_get_dev_addr(curr_master,
+							     ibdev, port);
+				}
+			}
+			/* if bonding is used it is possible that we add it to
+			 * masters only after IP address is assigned to the
+			 * net bonding interface.
+			*/
+			if (curr_master && (old_master != curr_master)) {
+				reset_gid_table(ibdev, port);
+				mlx4_ib_set_default_gid(ibdev,
+							curr_netdev, port);
+				mlx4_ib_get_dev_addr(curr_master, ibdev, port);
+			}
+
+			if (!curr_master && (old_master != curr_master)) {
+				reset_gid_table(ibdev, port);
+				mlx4_ib_set_default_gid(ibdev,
+							curr_netdev, port);
+				mlx4_ib_get_dev_addr(curr_netdev, ibdev, port);
+			}
+		} else {
+			reset_gid_table(ibdev, port);
+		}
+	}
+
+	spin_unlock_bh(&iboe->lock);
+
+	if (update_qps_port > 0)
+		mlx4_ib_update_qps(ibdev, dev, update_qps_port);
+}
+
+static int mlx4_ib_netdev_event(struct notifier_block *this,
+				unsigned long event, void *ptr)
+{
+	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+	struct mlx4_ib_dev *ibdev;
+
+	if (!net_eq(dev_net(dev), &init_net))
+		return NOTIFY_DONE;
+
+	ibdev = container_of(this, struct mlx4_ib_dev, iboe.nb);
+	mlx4_ib_scan_netdevs(ibdev, dev, event);
+
+	return NOTIFY_DONE;
+}
+
+static void init_pkeys(struct mlx4_ib_dev *ibdev)
+{
+	int port;
+	int slave;
+	int i;
+
+	if (mlx4_is_master(ibdev->dev)) {
+		for (slave = 0; slave <= ibdev->dev->num_vfs; ++slave) {
+			for (port = 1; port <= ibdev->dev->caps.num_ports; ++port) {
+				for (i = 0;
+				     i < ibdev->dev->phys_caps.pkey_phys_table_len[port];
+				     ++i) {
+					ibdev->pkeys.virt2phys_pkey[slave][port - 1][i] =
+					/* master has the identity virt2phys pkey mapping */
+						(slave == mlx4_master_func_num(ibdev->dev) || !i) ? i :
+							ibdev->dev->phys_caps.pkey_phys_table_len[port] - 1;
+					mlx4_sync_pkey_table(ibdev->dev, slave, port, i,
+							     ibdev->pkeys.virt2phys_pkey[slave][port - 1][i]);
+				}
+			}
+		}
+		/* initialize pkey cache */
+		for (port = 1; port <= ibdev->dev->caps.num_ports; ++port) {
+			for (i = 0;
+			     i < ibdev->dev->phys_caps.pkey_phys_table_len[port];
+			     ++i)
+				ibdev->pkeys.phys_pkey_cache[port-1][i] =
+					(i) ? 0 : 0xFFFF;
+		}
+	}
+}
+
+static void mlx4_ib_alloc_eqs(struct mlx4_dev *dev, struct mlx4_ib_dev *ibdev)
+{
+	char name[80];
+	int eq_per_port = 0;
+	int added_eqs = 0;
+	int total_eqs = 0;
+	int i, j, eq;
+
+	/* Legacy mode or comp_pool is not large enough */
+	if (dev->caps.comp_pool == 0 ||
+	    dev->caps.num_ports > dev->caps.comp_pool)
+		return;
+
+	eq_per_port = rounddown_pow_of_two(dev->caps.comp_pool/
+					dev->caps.num_ports);
+
+	/* Init eq table */
+	added_eqs = 0;
+	mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB)
+		added_eqs += eq_per_port;
+
+	total_eqs = dev->caps.num_comp_vectors + added_eqs;
+
+	ibdev->eq_table = kzalloc(total_eqs * sizeof(int), GFP_KERNEL);
+	if (!ibdev->eq_table)
+		return;
+
+	ibdev->eq_added = added_eqs;
+
+	eq = 0;
+	mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) {
+		for (j = 0; j < eq_per_port; j++) {
+			snprintf(name, sizeof(name), "mlx4-ib-%d-%d@%s",
+				 i, j, dev->pdev->bus->name);
+			/* Set IRQ for specific name (per ring) */
+			if (mlx4_assign_eq(dev, name, NULL,
+					   &ibdev->eq_table[eq])) {
+				/* Use legacy (same as mlx4_en driver) */
+				pr_warn("Can't allocate EQ %d; reverting to legacy\n", eq);
+				ibdev->eq_table[eq] =
+					(eq % dev->caps.num_comp_vectors);
+			}
+			eq++;
+		}
+	}
+
+	/* Fill the reset of the vector with legacy EQ */
+	for (i = 0, eq = added_eqs; i < dev->caps.num_comp_vectors; i++)
+		ibdev->eq_table[eq++] = i;
+
+	/* Advertise the new number of EQs to clients */
+	ibdev->ib_dev.num_comp_vectors = total_eqs;
+}
+
+static void mlx4_ib_free_eqs(struct mlx4_dev *dev, struct mlx4_ib_dev *ibdev)
+{
+	int i;
+
+	/* no additional eqs were added */
+	if (!ibdev->eq_table)
+		return;
+
+	/* Reset the advertised EQ number */
+	ibdev->ib_dev.num_comp_vectors = dev->caps.num_comp_vectors;
+
+	/* Free only the added eqs */
+	for (i = 0; i < ibdev->eq_added; i++) {
+		/* Don't free legacy eqs if used */
+		if (ibdev->eq_table[i] <= dev->caps.num_comp_vectors)
+			continue;
+		mlx4_release_eq(dev, ibdev->eq_table[i]);
+	}
+
+	kfree(ibdev->eq_table);
+}
+
+static void *mlx4_ib_add(struct mlx4_dev *dev)
+{
+	struct mlx4_ib_dev *ibdev;
+	int num_ports = 0;
+	int i, j;
+	int err;
+	struct mlx4_ib_iboe *iboe;
+	int ib_num_ports = 0;
+
+	pr_info_once("%s", mlx4_ib_version);
+
+	num_ports = 0;
+	mlx4_foreach_ib_transport_port(i, dev)
+		num_ports++;
+
+	/* No point in registering a device with no ports... */
+	if (num_ports == 0)
+		return NULL;
+
+	ibdev = (struct mlx4_ib_dev *) ib_alloc_device(sizeof *ibdev);
+	if (!ibdev) {
+		dev_err(&dev->pdev->dev, "Device struct alloc failed\n");
+		return NULL;
+	}
+
+	iboe = &ibdev->iboe;
+
+	if (mlx4_pd_alloc(dev, &ibdev->priv_pdn))
+		goto err_dealloc;
+
+	if (mlx4_uar_alloc(dev, &ibdev->priv_uar))
+		goto err_pd;
+
+	ibdev->uar_map = ioremap((phys_addr_t) ibdev->priv_uar.pfn << PAGE_SHIFT,
+				 PAGE_SIZE);
+	if (!ibdev->uar_map)
+		goto err_uar;
+	MLX4_INIT_DOORBELL_LOCK(&ibdev->uar_lock);
+
+	ibdev->dev = dev;
+
+	strlcpy(ibdev->ib_dev.name, "mlx4_%d", IB_DEVICE_NAME_MAX);
+	ibdev->ib_dev.owner		= THIS_MODULE;
+	ibdev->ib_dev.node_type		= RDMA_NODE_IB_CA;
+	ibdev->ib_dev.local_dma_lkey	= dev->caps.reserved_lkey;
+	ibdev->num_ports		= num_ports;
+	ibdev->ib_dev.phys_port_cnt     = ibdev->num_ports;
+	ibdev->ib_dev.num_comp_vectors	= dev->caps.num_comp_vectors;
+	ibdev->ib_dev.dma_device	= &dev->pdev->dev;
+
+	if (dev->caps.userspace_caps)
+		ibdev->ib_dev.uverbs_abi_ver = MLX4_IB_UVERBS_ABI_VERSION;
+	else
+		ibdev->ib_dev.uverbs_abi_ver = MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION;
+
+	ibdev->ib_dev.uverbs_cmd_mask	=
+		(1ull << IB_USER_VERBS_CMD_GET_CONTEXT)		|
+		(1ull << IB_USER_VERBS_CMD_QUERY_DEVICE)	|
+		(1ull << IB_USER_VERBS_CMD_QUERY_PORT)		|
+		(1ull << IB_USER_VERBS_CMD_ALLOC_PD)		|
+		(1ull << IB_USER_VERBS_CMD_DEALLOC_PD)		|
+		(1ull << IB_USER_VERBS_CMD_REG_MR)		|
+		(1ull << IB_USER_VERBS_CMD_REREG_MR)		|
+		(1ull << IB_USER_VERBS_CMD_DEREG_MR)		|
+		(1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL)	|
+		(1ull << IB_USER_VERBS_CMD_CREATE_CQ)		|
+		(1ull << IB_USER_VERBS_CMD_RESIZE_CQ)		|
+		(1ull << IB_USER_VERBS_CMD_DESTROY_CQ)		|
+		(1ull << IB_USER_VERBS_CMD_CREATE_QP)		|
+		(1ull << IB_USER_VERBS_CMD_MODIFY_QP)		|
+		(1ull << IB_USER_VERBS_CMD_QUERY_QP)		|
+		(1ull << IB_USER_VERBS_CMD_DESTROY_QP)		|
+		(1ull << IB_USER_VERBS_CMD_ATTACH_MCAST)	|
+		(1ull << IB_USER_VERBS_CMD_DETACH_MCAST)	|
+		(1ull << IB_USER_VERBS_CMD_CREATE_SRQ)		|
+		(1ull << IB_USER_VERBS_CMD_MODIFY_SRQ)		|
+		(1ull << IB_USER_VERBS_CMD_QUERY_SRQ)		|
+		(1ull << IB_USER_VERBS_CMD_DESTROY_SRQ)		|
+		(1ull << IB_USER_VERBS_CMD_CREATE_XSRQ)		|
+		(1ull << IB_USER_VERBS_CMD_OPEN_QP);
+
+	ibdev->ib_dev.query_device	= mlx4_ib_query_device;
+	ibdev->ib_dev.query_port	= mlx4_ib_query_port;
+	ibdev->ib_dev.get_link_layer	= mlx4_ib_port_link_layer;
+	ibdev->ib_dev.query_gid		= mlx4_ib_query_gid;
+	ibdev->ib_dev.query_pkey	= mlx4_ib_query_pkey;
+	ibdev->ib_dev.modify_device	= mlx4_ib_modify_device;
+	ibdev->ib_dev.modify_port	= mlx4_ib_modify_port;
+	ibdev->ib_dev.alloc_ucontext	= mlx4_ib_alloc_ucontext;
+	ibdev->ib_dev.dealloc_ucontext	= mlx4_ib_dealloc_ucontext;
+	ibdev->ib_dev.mmap		= mlx4_ib_mmap;
+	ibdev->ib_dev.alloc_pd		= mlx4_ib_alloc_pd;
+	ibdev->ib_dev.dealloc_pd	= mlx4_ib_dealloc_pd;
+	ibdev->ib_dev.create_ah		= mlx4_ib_create_ah;
+	ibdev->ib_dev.query_ah		= mlx4_ib_query_ah;
+	ibdev->ib_dev.destroy_ah	= mlx4_ib_destroy_ah;
+	ibdev->ib_dev.create_srq	= mlx4_ib_create_srq;
+	ibdev->ib_dev.modify_srq	= mlx4_ib_modify_srq;
+	ibdev->ib_dev.query_srq		= mlx4_ib_query_srq;
+	ibdev->ib_dev.destroy_srq	= mlx4_ib_destroy_srq;
+	ibdev->ib_dev.post_srq_recv	= mlx4_ib_post_srq_recv;
+	ibdev->ib_dev.create_qp		= mlx4_ib_create_qp;
+	ibdev->ib_dev.modify_qp		= mlx4_ib_modify_qp;
+	ibdev->ib_dev.query_qp		= mlx4_ib_query_qp;
+	ibdev->ib_dev.destroy_qp	= mlx4_ib_destroy_qp;
+	ibdev->ib_dev.post_send		= mlx4_ib_post_send;
+	ibdev->ib_dev.post_recv		= mlx4_ib_post_recv;
+	ibdev->ib_dev.create_cq		= mlx4_ib_create_cq;
+	ibdev->ib_dev.modify_cq		= mlx4_ib_modify_cq;
+	ibdev->ib_dev.resize_cq		= mlx4_ib_resize_cq;
+	ibdev->ib_dev.destroy_cq	= mlx4_ib_destroy_cq;
+	ibdev->ib_dev.poll_cq		= mlx4_ib_poll_cq;
+	ibdev->ib_dev.req_notify_cq	= mlx4_ib_arm_cq;
+	ibdev->ib_dev.get_dma_mr	= mlx4_ib_get_dma_mr;
+	ibdev->ib_dev.reg_user_mr	= mlx4_ib_reg_user_mr;
+	ibdev->ib_dev.rereg_user_mr	= mlx4_ib_rereg_user_mr;
+	ibdev->ib_dev.dereg_mr		= mlx4_ib_dereg_mr;
+	ibdev->ib_dev.alloc_fast_reg_mr = mlx4_ib_alloc_fast_reg_mr;
+	ibdev->ib_dev.alloc_fast_reg_page_list = mlx4_ib_alloc_fast_reg_page_list;
+	ibdev->ib_dev.free_fast_reg_page_list  = mlx4_ib_free_fast_reg_page_list;
+	ibdev->ib_dev.attach_mcast	= mlx4_ib_mcg_attach;
+	ibdev->ib_dev.detach_mcast	= mlx4_ib_mcg_detach;
+	ibdev->ib_dev.process_mad	= mlx4_ib_process_mad;
+
+	if (!mlx4_is_slave(ibdev->dev)) {
+		ibdev->ib_dev.alloc_fmr		= mlx4_ib_fmr_alloc;
+		ibdev->ib_dev.map_phys_fmr	= mlx4_ib_map_phys_fmr;
+		ibdev->ib_dev.unmap_fmr		= mlx4_ib_unmap_fmr;
+		ibdev->ib_dev.dealloc_fmr	= mlx4_ib_fmr_dealloc;
+	}
+
+	if (dev->caps.flags & MLX4_DEV_CAP_FLAG_MEM_WINDOW ||
+	    dev->caps.bmme_flags & MLX4_BMME_FLAG_TYPE_2_WIN) {
+		ibdev->ib_dev.alloc_mw = mlx4_ib_alloc_mw;
+		ibdev->ib_dev.bind_mw = mlx4_ib_bind_mw;
+		ibdev->ib_dev.dealloc_mw = mlx4_ib_dealloc_mw;
+
+		ibdev->ib_dev.uverbs_cmd_mask |=
+			(1ull << IB_USER_VERBS_CMD_ALLOC_MW) |
+			(1ull << IB_USER_VERBS_CMD_DEALLOC_MW);
+	}
+
+	if (dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) {
+		ibdev->ib_dev.alloc_xrcd = mlx4_ib_alloc_xrcd;
+		ibdev->ib_dev.dealloc_xrcd = mlx4_ib_dealloc_xrcd;
+		ibdev->ib_dev.uverbs_cmd_mask |=
+			(1ull << IB_USER_VERBS_CMD_OPEN_XRCD) |
+			(1ull << IB_USER_VERBS_CMD_CLOSE_XRCD);
+	}
+
+	if (check_flow_steering_support(dev)) {
+		ibdev->steering_support = MLX4_STEERING_MODE_DEVICE_MANAGED;
+		ibdev->ib_dev.create_flow	= mlx4_ib_create_flow;
+		ibdev->ib_dev.destroy_flow	= mlx4_ib_destroy_flow;
+
+		ibdev->ib_dev.uverbs_ex_cmd_mask	|=
+			(1ull << IB_USER_VERBS_EX_CMD_CREATE_FLOW) |
+			(1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW);
+	}
+
+	mlx4_ib_alloc_eqs(dev, ibdev);
+
+	spin_lock_init(&iboe->lock);
+
+	if (init_node_data(ibdev))
+		goto err_map;
+
+	for (i = 0; i < ibdev->num_ports; ++i) {
+		mutex_init(&ibdev->qp1_proxy_lock[i]);
+		if (mlx4_ib_port_link_layer(&ibdev->ib_dev, i + 1) ==
+						IB_LINK_LAYER_ETHERNET) {
+			err = mlx4_counter_alloc(ibdev->dev, &ibdev->counters[i]);
+			if (err)
+				ibdev->counters[i] = -1;
+		} else {
+			ibdev->counters[i] = -1;
+		}
+	}
+
+	mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB)
+		ib_num_ports++;
+
+	spin_lock_init(&ibdev->sm_lock);
+	mutex_init(&ibdev->cap_mask_mutex);
+
+	if (ibdev->steering_support == MLX4_STEERING_MODE_DEVICE_MANAGED &&
+	    ib_num_ports) {
+		ibdev->steer_qpn_count = MLX4_IB_UC_MAX_NUM_QPS;
+		err = mlx4_qp_reserve_range(dev, ibdev->steer_qpn_count,
+					    MLX4_IB_UC_STEER_QPN_ALIGN,
+					    &ibdev->steer_qpn_base);
+		if (err)
+			goto err_counter;
+
+		ibdev->ib_uc_qpns_bitmap =
+			kmalloc(BITS_TO_LONGS(ibdev->steer_qpn_count) *
+				sizeof(long),
+				GFP_KERNEL);
+		if (!ibdev->ib_uc_qpns_bitmap) {
+			dev_err(&dev->pdev->dev, "bit map alloc failed\n");
+			goto err_steer_qp_release;
+		}
+
+		bitmap_zero(ibdev->ib_uc_qpns_bitmap, ibdev->steer_qpn_count);
+
+		err = mlx4_FLOW_STEERING_IB_UC_QP_RANGE(
+				dev, ibdev->steer_qpn_base,
+				ibdev->steer_qpn_base +
+				ibdev->steer_qpn_count - 1);
+		if (err)
+			goto err_steer_free_bitmap;
+	}
+
+	for (j = 1; j <= ibdev->dev->caps.num_ports; j++)
+		atomic64_set(&iboe->mac[j - 1], ibdev->dev->caps.def_mac[j]);
+
+	if (ib_register_device(&ibdev->ib_dev, NULL))
+		goto err_steer_free_bitmap;
+
+	if (mlx4_ib_mad_init(ibdev))
+		goto err_reg;
+
+	if (mlx4_ib_init_sriov(ibdev))
+		goto err_mad;
+
+	if (dev->caps.flags & MLX4_DEV_CAP_FLAG_IBOE) {
+		if (!iboe->nb.notifier_call) {
+			iboe->nb.notifier_call = mlx4_ib_netdev_event;
+			err = register_netdevice_notifier(&iboe->nb);
+			if (err) {
+				iboe->nb.notifier_call = NULL;
+				goto err_notif;
+			}
+		}
+		if (!iboe->nb_inet.notifier_call) {
+			iboe->nb_inet.notifier_call = mlx4_ib_inet_event;
+			err = register_inetaddr_notifier(&iboe->nb_inet);
+			if (err) {
+				iboe->nb_inet.notifier_call = NULL;
+				goto err_notif;
+			}
+		}
+#if IS_ENABLED(CONFIG_IPV6)
+		if (!iboe->nb_inet6.notifier_call) {
+			iboe->nb_inet6.notifier_call = mlx4_ib_inet6_event;
+			err = register_inet6addr_notifier(&iboe->nb_inet6);
+			if (err) {
+				iboe->nb_inet6.notifier_call = NULL;
+				goto err_notif;
+			}
+		}
+#endif
+		if (mlx4_ib_init_gid_table(ibdev))
+			goto err_notif;
+	}
+
+	for (j = 0; j < ARRAY_SIZE(mlx4_class_attributes); ++j) {
+		if (device_create_file(&ibdev->ib_dev.dev,
+				       mlx4_class_attributes[j]))
+			goto err_notif;
+	}
+
+	ibdev->ib_active = true;
+
+	if (mlx4_is_mfunc(ibdev->dev))
+		init_pkeys(ibdev);
+
+	/* create paravirt contexts for any VFs which are active */
+	if (mlx4_is_master(ibdev->dev)) {
+		for (j = 0; j < MLX4_MFUNC_MAX; j++) {
+			if (j == mlx4_master_func_num(ibdev->dev))
+				continue;
+			if (mlx4_is_slave_active(ibdev->dev, j))
+				do_slave_init(ibdev, j, 1);
+		}
+	}
+	return ibdev;
+
+err_notif:
+	if (ibdev->iboe.nb.notifier_call) {
+		if (unregister_netdevice_notifier(&ibdev->iboe.nb))
+			pr_warn("failure unregistering notifier\n");
+		ibdev->iboe.nb.notifier_call = NULL;
+	}
+	if (ibdev->iboe.nb_inet.notifier_call) {
+		if (unregister_inetaddr_notifier(&ibdev->iboe.nb_inet))
+			pr_warn("failure unregistering notifier\n");
+		ibdev->iboe.nb_inet.notifier_call = NULL;
+	}
+#if IS_ENABLED(CONFIG_IPV6)
+	if (ibdev->iboe.nb_inet6.notifier_call) {
+		if (unregister_inet6addr_notifier(&ibdev->iboe.nb_inet6))
+			pr_warn("failure unregistering notifier\n");
+		ibdev->iboe.nb_inet6.notifier_call = NULL;
+	}
+#endif
+	flush_workqueue(wq);
+
+	mlx4_ib_close_sriov(ibdev);
+
+err_mad:
+	mlx4_ib_mad_cleanup(ibdev);
+
+err_reg:
+	ib_unregister_device(&ibdev->ib_dev);
+
+err_steer_free_bitmap:
+	kfree(ibdev->ib_uc_qpns_bitmap);
+
+err_steer_qp_release:
+	if (ibdev->steering_support == MLX4_STEERING_MODE_DEVICE_MANAGED)
+		mlx4_qp_release_range(dev, ibdev->steer_qpn_base,
+				      ibdev->steer_qpn_count);
+err_counter:
+	for (; i; --i)
+		if (ibdev->counters[i - 1] != -1)
+			mlx4_counter_free(ibdev->dev, ibdev->counters[i - 1]);
+
+err_map:
+	iounmap(ibdev->uar_map);
+
+err_uar:
+	mlx4_uar_free(dev, &ibdev->priv_uar);
+
+err_pd:
+	mlx4_pd_free(dev, ibdev->priv_pdn);
+
+err_dealloc:
+	ib_dealloc_device(&ibdev->ib_dev);
+
+	return NULL;
+}
+
+int mlx4_ib_steer_qp_alloc(struct mlx4_ib_dev *dev, int count, int *qpn)
+{
+	int offset;
+
+	WARN_ON(!dev->ib_uc_qpns_bitmap);
+
+	offset = bitmap_find_free_region(dev->ib_uc_qpns_bitmap,
+					 dev->steer_qpn_count,
+					 get_count_order(count));
+	if (offset < 0)
+		return offset;
+
+	*qpn = dev->steer_qpn_base + offset;
+	return 0;
+}
+
+void mlx4_ib_steer_qp_free(struct mlx4_ib_dev *dev, u32 qpn, int count)
+{
+	if (!qpn ||
+	    dev->steering_support != MLX4_STEERING_MODE_DEVICE_MANAGED)
+		return;
+
+	BUG_ON(qpn < dev->steer_qpn_base);
+
+	bitmap_release_region(dev->ib_uc_qpns_bitmap,
+			      qpn - dev->steer_qpn_base,
+			      get_count_order(count));
+}
+
+int mlx4_ib_steer_qp_reg(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp,
+			 int is_attach)
+{
+	int err;
+	size_t flow_size;
+	struct ib_flow_attr *flow = NULL;
+	struct ib_flow_spec_ib *ib_spec;
+
+	if (is_attach) {
+		flow_size = sizeof(struct ib_flow_attr) +
+			    sizeof(struct ib_flow_spec_ib);
+		flow = kzalloc(flow_size, GFP_KERNEL);
+		if (!flow)
+			return -ENOMEM;
+		flow->port = mqp->port;
+		flow->num_of_specs = 1;
+		flow->size = flow_size;
+		ib_spec = (struct ib_flow_spec_ib *)(flow + 1);
+		ib_spec->type = IB_FLOW_SPEC_IB;
+		ib_spec->size = sizeof(struct ib_flow_spec_ib);
+		/* Add an empty rule for IB L2 */
+		memset(&ib_spec->mask, 0, sizeof(ib_spec->mask));
+
+		err = __mlx4_ib_create_flow(&mqp->ibqp, flow,
+					    IB_FLOW_DOMAIN_NIC,
+					    MLX4_FS_REGULAR,
+					    &mqp->reg_id);
+	} else {
+		err = __mlx4_ib_destroy_flow(mdev->dev, mqp->reg_id);
+	}
+	kfree(flow);
+	return err;
+}
+
+static void mlx4_ib_remove(struct mlx4_dev *dev, void *ibdev_ptr)
+{
+	struct mlx4_ib_dev *ibdev = ibdev_ptr;
+	int p;
+
+	ibdev->ib_active = false;
+	flush_workqueue(wq);
+
+	mlx4_ib_close_sriov(ibdev);
+	mlx4_ib_mad_cleanup(ibdev);
+	ib_unregister_device(&ibdev->ib_dev);
+	if (ibdev->iboe.nb.notifier_call) {
+		if (unregister_netdevice_notifier(&ibdev->iboe.nb))
+			pr_warn("failure unregistering notifier\n");
+		ibdev->iboe.nb.notifier_call = NULL;
+	}
+
+	if (ibdev->steering_support == MLX4_STEERING_MODE_DEVICE_MANAGED) {
+		mlx4_qp_release_range(dev, ibdev->steer_qpn_base,
+				      ibdev->steer_qpn_count);
+		kfree(ibdev->ib_uc_qpns_bitmap);
+	}
+
+	if (ibdev->iboe.nb_inet.notifier_call) {
+		if (unregister_inetaddr_notifier(&ibdev->iboe.nb_inet))
+			pr_warn("failure unregistering notifier\n");
+		ibdev->iboe.nb_inet.notifier_call = NULL;
+	}
+#if IS_ENABLED(CONFIG_IPV6)
+	if (ibdev->iboe.nb_inet6.notifier_call) {
+		if (unregister_inet6addr_notifier(&ibdev->iboe.nb_inet6))
+			pr_warn("failure unregistering notifier\n");
+		ibdev->iboe.nb_inet6.notifier_call = NULL;
+	}
+#endif
+
+	iounmap(ibdev->uar_map);
+	for (p = 0; p < ibdev->num_ports; ++p)
+		if (ibdev->counters[p] != -1)
+			mlx4_counter_free(ibdev->dev, ibdev->counters[p]);
+	mlx4_foreach_port(p, dev, MLX4_PORT_TYPE_IB)
+		mlx4_CLOSE_PORT(dev, p);
+
+	mlx4_ib_free_eqs(dev, ibdev);
+
+	mlx4_uar_free(dev, &ibdev->priv_uar);
+	mlx4_pd_free(dev, ibdev->priv_pdn);
+	ib_dealloc_device(&ibdev->ib_dev);
+}
+
+static void do_slave_init(struct mlx4_ib_dev *ibdev, int slave, int do_init)
+{
+	struct mlx4_ib_demux_work **dm = NULL;
+	struct mlx4_dev *dev = ibdev->dev;
+	int i;
+	unsigned long flags;
+	struct mlx4_active_ports actv_ports;
+	unsigned int ports;
+	unsigned int first_port;
+
+	if (!mlx4_is_master(dev))
+		return;
+
+	actv_ports = mlx4_get_active_ports(dev, slave);
+	ports = bitmap_weight(actv_ports.ports, dev->caps.num_ports);
+	first_port = find_first_bit(actv_ports.ports, dev->caps.num_ports);
+
+	dm = kcalloc(ports, sizeof(*dm), GFP_ATOMIC);
+	if (!dm) {
+		pr_err("failed to allocate memory for tunneling qp update\n");
+		goto out;
+	}
+
+	for (i = 0; i < ports; i++) {
+		dm[i] = kmalloc(sizeof (struct mlx4_ib_demux_work), GFP_ATOMIC);
+		if (!dm[i]) {
+			pr_err("failed to allocate memory for tunneling qp update work struct\n");
+			for (i = 0; i < dev->caps.num_ports; i++) {
+				if (dm[i])
+					kfree(dm[i]);
+			}
+			goto out;
+		}
+	}
+	/* initialize or tear down tunnel QPs for the slave */
+	for (i = 0; i < ports; i++) {
+		INIT_WORK(&dm[i]->work, mlx4_ib_tunnels_update_work);
+		dm[i]->port = first_port + i + 1;
+		dm[i]->slave = slave;
+		dm[i]->do_init = do_init;
+		dm[i]->dev = ibdev;
+		spin_lock_irqsave(&ibdev->sriov.going_down_lock, flags);
+		if (!ibdev->sriov.is_going_down)
+			queue_work(ibdev->sriov.demux[i].ud_wq, &dm[i]->work);
+		spin_unlock_irqrestore(&ibdev->sriov.going_down_lock, flags);
+	}
+out:
+	kfree(dm);
+	return;
+}
+
+static void mlx4_ib_event(struct mlx4_dev *dev, void *ibdev_ptr,
+			  enum mlx4_dev_event event, unsigned long param)
+{
+	struct ib_event ibev;
+	struct mlx4_ib_dev *ibdev = to_mdev((struct ib_device *) ibdev_ptr);
+	struct mlx4_eqe *eqe = NULL;
+	struct ib_event_work *ew;
+	int p = 0;
+
+	if (event == MLX4_DEV_EVENT_PORT_MGMT_CHANGE)
+		eqe = (struct mlx4_eqe *)param;
+	else
+		p = (int) param;
+
+	switch (event) {
+	case MLX4_DEV_EVENT_PORT_UP:
+		if (p > ibdev->num_ports)
+			return;
+		if (mlx4_is_master(dev) &&
+		    rdma_port_get_link_layer(&ibdev->ib_dev, p) ==
+			IB_LINK_LAYER_INFINIBAND) {
+			mlx4_ib_invalidate_all_guid_record(ibdev, p);
+		}
+		ibev.event = IB_EVENT_PORT_ACTIVE;
+		break;
+
+	case MLX4_DEV_EVENT_PORT_DOWN:
+		if (p > ibdev->num_ports)
+			return;
+		ibev.event = IB_EVENT_PORT_ERR;
+		break;
+
+	case MLX4_DEV_EVENT_CATASTROPHIC_ERROR:
+		ibdev->ib_active = false;
+		ibev.event = IB_EVENT_DEVICE_FATAL;
+		break;
+
+	case MLX4_DEV_EVENT_PORT_MGMT_CHANGE:
+		ew = kmalloc(sizeof *ew, GFP_ATOMIC);
+		if (!ew) {
+			pr_err("failed to allocate memory for events work\n");
+			break;
+		}
+
+		INIT_WORK(&ew->work, handle_port_mgmt_change_event);
+		memcpy(&ew->ib_eqe, eqe, sizeof *eqe);
+		ew->ib_dev = ibdev;
+		/* need to queue only for port owner, which uses GEN_EQE */
+		if (mlx4_is_master(dev))
+			queue_work(wq, &ew->work);
+		else
+			handle_port_mgmt_change_event(&ew->work);
+		return;
+
+	case MLX4_DEV_EVENT_SLAVE_INIT:
+		/* here, p is the slave id */
+		do_slave_init(ibdev, p, 1);
+		return;
+
+	case MLX4_DEV_EVENT_SLAVE_SHUTDOWN:
+		/* here, p is the slave id */
+		do_slave_init(ibdev, p, 0);
+		return;
+
+	default:
+		return;
+	}
+
+	ibev.device	      = ibdev_ptr;
+	ibev.element.port_num = (u8) p;
+
+	ib_dispatch_event(&ibev);
+}
+
+static struct mlx4_interface mlx4_ib_interface = {
+	.add		= mlx4_ib_add,
+	.remove		= mlx4_ib_remove,
+	.event		= mlx4_ib_event,
+	.protocol	= MLX4_PROT_IB_IPV6
+};
+
+static int __init mlx4_ib_init(void)
+{
+	int err;
+
+	wq = create_singlethread_workqueue("mlx4_ib");
+	if (!wq)
+		return -ENOMEM;
+
+	err = mlx4_ib_mcg_init();
+	if (err)
+		goto clean_wq;
+
+	err = mlx4_register_interface(&mlx4_ib_interface);
+	if (err)
+		goto clean_mcg;
+
+	return 0;
+
+clean_mcg:
+	mlx4_ib_mcg_destroy();
+
+clean_wq:
+	destroy_workqueue(wq);
+	return err;
+}
+
+static void __exit mlx4_ib_cleanup(void)
+{
+	mlx4_unregister_interface(&mlx4_ib_interface);
+	mlx4_ib_mcg_destroy();
+	destroy_workqueue(wq);
+}
+
+module_init(mlx4_ib_init);
+module_exit(mlx4_ib_cleanup);
diff --git a/drivers/infiniband/hw/mlx4/mcg.c b/drivers/infiniband/hw/mlx4/mcg.c
new file mode 100644
index 0000000..ed327e6
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/mcg.c
@@ -0,0 +1,1257 @@
+/*
+ * Copyright (c) 2012 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <rdma/ib_mad.h>
+#include <rdma/ib_smi.h>
+#include <rdma/ib_cache.h>
+#include <rdma/ib_sa.h>
+
+#include <linux/mlx4/cmd.h>
+#include <linux/rbtree.h>
+#include <linux/delay.h>
+
+#include "mlx4_ib.h"
+
+#define MAX_VFS		80
+#define MAX_PEND_REQS_PER_FUNC 4
+#define MAD_TIMEOUT_MS	2000
+
+#define mcg_warn(fmt, arg...)	pr_warn("MCG WARNING: " fmt, ##arg)
+#define mcg_error(fmt, arg...)	pr_err(fmt, ##arg)
+#define mcg_warn_group(group, format, arg...) \
+	pr_warn("%s-%d: %16s (port %d): WARNING: " format, __func__, __LINE__,\
+	(group)->name, group->demux->port, ## arg)
+
+#define mcg_error_group(group, format, arg...) \
+	pr_err("  %16s: " format, (group)->name, ## arg)
+
+
+static union ib_gid mgid0;
+
+static struct workqueue_struct *clean_wq;
+
+enum mcast_state {
+	MCAST_NOT_MEMBER = 0,
+	MCAST_MEMBER,
+};
+
+enum mcast_group_state {
+	MCAST_IDLE,
+	MCAST_JOIN_SENT,
+	MCAST_LEAVE_SENT,
+	MCAST_RESP_READY
+};
+
+struct mcast_member {
+	enum mcast_state state;
+	uint8_t			join_state;
+	int			num_pend_reqs;
+	struct list_head	pending;
+};
+
+struct ib_sa_mcmember_data {
+	union ib_gid	mgid;
+	union ib_gid	port_gid;
+	__be32		qkey;
+	__be16		mlid;
+	u8		mtusel_mtu;
+	u8		tclass;
+	__be16		pkey;
+	u8		ratesel_rate;
+	u8		lifetmsel_lifetm;
+	__be32		sl_flowlabel_hoplimit;
+	u8		scope_join_state;
+	u8		proxy_join;
+	u8		reserved[2];
+};
+
+struct mcast_group {
+	struct ib_sa_mcmember_data rec;
+	struct rb_node		node;
+	struct list_head	mgid0_list;
+	struct mlx4_ib_demux_ctx *demux;
+	struct mcast_member	func[MAX_VFS];
+	struct mutex		lock;
+	struct work_struct	work;
+	struct list_head	pending_list;
+	int			members[3];
+	enum mcast_group_state	state;
+	enum mcast_group_state	prev_state;
+	struct ib_sa_mad	response_sa_mad;
+	__be64			last_req_tid;
+
+	char			name[33]; /* MGID string */
+	struct device_attribute	dentry;
+
+	/* refcount is the reference count for the following:
+	   1. Each queued request
+	   2. Each invocation of the worker thread
+	   3. Membership of the port at the SA
+	*/
+	atomic_t		refcount;
+
+	/* delayed work to clean pending SM request */
+	struct delayed_work	timeout_work;
+	struct list_head	cleanup_list;
+};
+
+struct mcast_req {
+	int			func;
+	struct ib_sa_mad	sa_mad;
+	struct list_head	group_list;
+	struct list_head	func_list;
+	struct mcast_group	*group;
+	int			clean;
+};
+
+
+#define safe_atomic_dec(ref) \
+	do {\
+		if (atomic_dec_and_test(ref)) \
+			mcg_warn_group(group, "did not expect to reach zero\n"); \
+	} while (0)
+
+static const char *get_state_string(enum mcast_group_state state)
+{
+	switch (state) {
+	case MCAST_IDLE:
+		return "MCAST_IDLE";
+	case MCAST_JOIN_SENT:
+		return "MCAST_JOIN_SENT";
+	case MCAST_LEAVE_SENT:
+		return "MCAST_LEAVE_SENT";
+	case MCAST_RESP_READY:
+		return "MCAST_RESP_READY";
+	}
+	return "Invalid State";
+}
+
+static struct mcast_group *mcast_find(struct mlx4_ib_demux_ctx *ctx,
+				      union ib_gid *mgid)
+{
+	struct rb_node *node = ctx->mcg_table.rb_node;
+	struct mcast_group *group;
+	int ret;
+
+	while (node) {
+		group = rb_entry(node, struct mcast_group, node);
+		ret = memcmp(mgid->raw, group->rec.mgid.raw, sizeof *mgid);
+		if (!ret)
+			return group;
+
+		if (ret < 0)
+			node = node->rb_left;
+		else
+			node = node->rb_right;
+	}
+	return NULL;
+}
+
+static struct mcast_group *mcast_insert(struct mlx4_ib_demux_ctx *ctx,
+					struct mcast_group *group)
+{
+	struct rb_node **link = &ctx->mcg_table.rb_node;
+	struct rb_node *parent = NULL;
+	struct mcast_group *cur_group;
+	int ret;
+
+	while (*link) {
+		parent = *link;
+		cur_group = rb_entry(parent, struct mcast_group, node);
+
+		ret = memcmp(group->rec.mgid.raw, cur_group->rec.mgid.raw,
+			     sizeof group->rec.mgid);
+		if (ret < 0)
+			link = &(*link)->rb_left;
+		else if (ret > 0)
+			link = &(*link)->rb_right;
+		else
+			return cur_group;
+	}
+	rb_link_node(&group->node, parent, link);
+	rb_insert_color(&group->node, &ctx->mcg_table);
+	return NULL;
+}
+
+static int send_mad_to_wire(struct mlx4_ib_demux_ctx *ctx, struct ib_mad *mad)
+{
+	struct mlx4_ib_dev *dev = ctx->dev;
+	struct ib_ah_attr	ah_attr;
+
+	spin_lock(&dev->sm_lock);
+	if (!dev->sm_ah[ctx->port - 1]) {
+		/* port is not yet Active, sm_ah not ready */
+		spin_unlock(&dev->sm_lock);
+		return -EAGAIN;
+	}
+	mlx4_ib_query_ah(dev->sm_ah[ctx->port - 1], &ah_attr);
+	spin_unlock(&dev->sm_lock);
+	return mlx4_ib_send_to_wire(dev, mlx4_master_func_num(dev->dev),
+				    ctx->port, IB_QPT_GSI, 0, 1, IB_QP1_QKEY,
+				    &ah_attr, NULL, mad);
+}
+
+static int send_mad_to_slave(int slave, struct mlx4_ib_demux_ctx *ctx,
+			     struct ib_mad *mad)
+{
+	struct mlx4_ib_dev *dev = ctx->dev;
+	struct ib_mad_agent *agent = dev->send_agent[ctx->port - 1][1];
+	struct ib_wc wc;
+	struct ib_ah_attr ah_attr;
+
+	/* Our agent might not yet be registered when mads start to arrive */
+	if (!agent)
+		return -EAGAIN;
+
+	ib_query_ah(dev->sm_ah[ctx->port - 1], &ah_attr);
+
+	if (ib_find_cached_pkey(&dev->ib_dev, ctx->port, IB_DEFAULT_PKEY_FULL, &wc.pkey_index))
+		return -EINVAL;
+	wc.sl = 0;
+	wc.dlid_path_bits = 0;
+	wc.port_num = ctx->port;
+	wc.slid = ah_attr.dlid;  /* opensm lid */
+	wc.src_qp = 1;
+	return mlx4_ib_send_to_slave(dev, slave, ctx->port, IB_QPT_GSI, &wc, NULL, mad);
+}
+
+static int send_join_to_wire(struct mcast_group *group, struct ib_sa_mad *sa_mad)
+{
+	struct ib_sa_mad mad;
+	struct ib_sa_mcmember_data *sa_mad_data = (struct ib_sa_mcmember_data *)&mad.data;
+	int ret;
+
+	/* we rely on a mad request as arrived from a VF */
+	memcpy(&mad, sa_mad, sizeof mad);
+
+	/* fix port GID to be the real one (slave 0) */
+	sa_mad_data->port_gid.global.interface_id = group->demux->guid_cache[0];
+
+	/* assign our own TID */
+	mad.mad_hdr.tid = mlx4_ib_get_new_demux_tid(group->demux);
+	group->last_req_tid = mad.mad_hdr.tid; /* keep it for later validation */
+
+	ret = send_mad_to_wire(group->demux, (struct ib_mad *)&mad);
+	/* set timeout handler */
+	if (!ret) {
+		/* calls mlx4_ib_mcg_timeout_handler */
+		queue_delayed_work(group->demux->mcg_wq, &group->timeout_work,
+				msecs_to_jiffies(MAD_TIMEOUT_MS));
+	}
+
+	return ret;
+}
+
+static int send_leave_to_wire(struct mcast_group *group, u8 join_state)
+{
+	struct ib_sa_mad mad;
+	struct ib_sa_mcmember_data *sa_data = (struct ib_sa_mcmember_data *)&mad.data;
+	int ret;
+
+	memset(&mad, 0, sizeof mad);
+	mad.mad_hdr.base_version = 1;
+	mad.mad_hdr.mgmt_class = IB_MGMT_CLASS_SUBN_ADM;
+	mad.mad_hdr.class_version = 2;
+	mad.mad_hdr.method = IB_SA_METHOD_DELETE;
+	mad.mad_hdr.status = cpu_to_be16(0);
+	mad.mad_hdr.class_specific = cpu_to_be16(0);
+	mad.mad_hdr.tid = mlx4_ib_get_new_demux_tid(group->demux);
+	group->last_req_tid = mad.mad_hdr.tid; /* keep it for later validation */
+	mad.mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_MC_MEMBER_REC);
+	mad.mad_hdr.attr_mod = cpu_to_be32(0);
+	mad.sa_hdr.sm_key = 0x0;
+	mad.sa_hdr.attr_offset = cpu_to_be16(7);
+	mad.sa_hdr.comp_mask = IB_SA_MCMEMBER_REC_MGID |
+		IB_SA_MCMEMBER_REC_PORT_GID | IB_SA_MCMEMBER_REC_JOIN_STATE;
+
+	*sa_data = group->rec;
+	sa_data->scope_join_state = join_state;
+
+	ret = send_mad_to_wire(group->demux, (struct ib_mad *)&mad);
+	if (ret)
+		group->state = MCAST_IDLE;
+
+	/* set timeout handler */
+	if (!ret) {
+		/* calls mlx4_ib_mcg_timeout_handler */
+		queue_delayed_work(group->demux->mcg_wq, &group->timeout_work,
+				msecs_to_jiffies(MAD_TIMEOUT_MS));
+	}
+
+	return ret;
+}
+
+static int send_reply_to_slave(int slave, struct mcast_group *group,
+		struct ib_sa_mad *req_sa_mad, u16 status)
+{
+	struct ib_sa_mad mad;
+	struct ib_sa_mcmember_data *sa_data = (struct ib_sa_mcmember_data *)&mad.data;
+	struct ib_sa_mcmember_data *req_sa_data = (struct ib_sa_mcmember_data *)&req_sa_mad->data;
+	int ret;
+
+	memset(&mad, 0, sizeof mad);
+	mad.mad_hdr.base_version = 1;
+	mad.mad_hdr.mgmt_class = IB_MGMT_CLASS_SUBN_ADM;
+	mad.mad_hdr.class_version = 2;
+	mad.mad_hdr.method = IB_MGMT_METHOD_GET_RESP;
+	mad.mad_hdr.status = cpu_to_be16(status);
+	mad.mad_hdr.class_specific = cpu_to_be16(0);
+	mad.mad_hdr.tid = req_sa_mad->mad_hdr.tid;
+	*(u8 *)&mad.mad_hdr.tid = 0; /* resetting tid to 0 */
+	mad.mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_MC_MEMBER_REC);
+	mad.mad_hdr.attr_mod = cpu_to_be32(0);
+	mad.sa_hdr.sm_key = req_sa_mad->sa_hdr.sm_key;
+	mad.sa_hdr.attr_offset = cpu_to_be16(7);
+	mad.sa_hdr.comp_mask = 0; /* ignored on responses, see IBTA spec */
+
+	*sa_data = group->rec;
+
+	/* reconstruct VF's requested join_state and port_gid */
+	sa_data->scope_join_state &= 0xf0;
+	sa_data->scope_join_state |= (group->func[slave].join_state & 0x0f);
+	memcpy(&sa_data->port_gid, &req_sa_data->port_gid, sizeof req_sa_data->port_gid);
+
+	ret = send_mad_to_slave(slave, group->demux, (struct ib_mad *)&mad);
+	return ret;
+}
+
+static int check_selector(ib_sa_comp_mask comp_mask,
+			  ib_sa_comp_mask selector_mask,
+			  ib_sa_comp_mask value_mask,
+			  u8 src_value, u8 dst_value)
+{
+	int err;
+	u8 selector = dst_value >> 6;
+	dst_value &= 0x3f;
+	src_value &= 0x3f;
+
+	if (!(comp_mask & selector_mask) || !(comp_mask & value_mask))
+		return 0;
+
+	switch (selector) {
+	case IB_SA_GT:
+		err = (src_value <= dst_value);
+		break;
+	case IB_SA_LT:
+		err = (src_value >= dst_value);
+		break;
+	case IB_SA_EQ:
+		err = (src_value != dst_value);
+		break;
+	default:
+		err = 0;
+		break;
+	}
+
+	return err;
+}
+
+static u16 cmp_rec(struct ib_sa_mcmember_data *src,
+		   struct ib_sa_mcmember_data *dst, ib_sa_comp_mask comp_mask)
+{
+	/* src is group record, dst is request record */
+	/* MGID must already match */
+	/* Port_GID we always replace to our Port_GID, so it is a match */
+
+#define MAD_STATUS_REQ_INVALID 0x0200
+	if (comp_mask & IB_SA_MCMEMBER_REC_QKEY && src->qkey != dst->qkey)
+		return MAD_STATUS_REQ_INVALID;
+	if (comp_mask & IB_SA_MCMEMBER_REC_MLID && src->mlid != dst->mlid)
+		return MAD_STATUS_REQ_INVALID;
+	if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_MTU_SELECTOR,
+				 IB_SA_MCMEMBER_REC_MTU,
+				 src->mtusel_mtu, dst->mtusel_mtu))
+		return MAD_STATUS_REQ_INVALID;
+	if (comp_mask & IB_SA_MCMEMBER_REC_TRAFFIC_CLASS &&
+	    src->tclass != dst->tclass)
+		return MAD_STATUS_REQ_INVALID;
+	if (comp_mask & IB_SA_MCMEMBER_REC_PKEY && src->pkey != dst->pkey)
+		return MAD_STATUS_REQ_INVALID;
+	if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_RATE_SELECTOR,
+				 IB_SA_MCMEMBER_REC_RATE,
+				 src->ratesel_rate, dst->ratesel_rate))
+		return MAD_STATUS_REQ_INVALID;
+	if (check_selector(comp_mask,
+				 IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME_SELECTOR,
+				 IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME,
+				 src->lifetmsel_lifetm, dst->lifetmsel_lifetm))
+		return MAD_STATUS_REQ_INVALID;
+	if (comp_mask & IB_SA_MCMEMBER_REC_SL &&
+			(be32_to_cpu(src->sl_flowlabel_hoplimit) & 0xf0000000) !=
+			(be32_to_cpu(dst->sl_flowlabel_hoplimit) & 0xf0000000))
+		return MAD_STATUS_REQ_INVALID;
+	if (comp_mask & IB_SA_MCMEMBER_REC_FLOW_LABEL &&
+			(be32_to_cpu(src->sl_flowlabel_hoplimit) & 0x0fffff00) !=
+			(be32_to_cpu(dst->sl_flowlabel_hoplimit) & 0x0fffff00))
+		return MAD_STATUS_REQ_INVALID;
+	if (comp_mask & IB_SA_MCMEMBER_REC_HOP_LIMIT &&
+			(be32_to_cpu(src->sl_flowlabel_hoplimit) & 0x000000ff) !=
+			(be32_to_cpu(dst->sl_flowlabel_hoplimit) & 0x000000ff))
+		return MAD_STATUS_REQ_INVALID;
+	if (comp_mask & IB_SA_MCMEMBER_REC_SCOPE &&
+			(src->scope_join_state & 0xf0) !=
+			(dst->scope_join_state & 0xf0))
+		return MAD_STATUS_REQ_INVALID;
+
+	/* join_state checked separately, proxy_join ignored */
+
+	return 0;
+}
+
+/* release group, return 1 if this was last release and group is destroyed
+ * timout work is canceled sync */
+static int release_group(struct mcast_group *group, int from_timeout_handler)
+{
+	struct mlx4_ib_demux_ctx *ctx = group->demux;
+	int nzgroup;
+
+	mutex_lock(&ctx->mcg_table_lock);
+	mutex_lock(&group->lock);
+	if (atomic_dec_and_test(&group->refcount)) {
+		if (!from_timeout_handler) {
+			if (group->state != MCAST_IDLE &&
+			    !cancel_delayed_work(&group->timeout_work)) {
+				atomic_inc(&group->refcount);
+				mutex_unlock(&group->lock);
+				mutex_unlock(&ctx->mcg_table_lock);
+				return 0;
+			}
+		}
+
+		nzgroup = memcmp(&group->rec.mgid, &mgid0, sizeof mgid0);
+		if (nzgroup)
+			del_sysfs_port_mcg_attr(ctx->dev, ctx->port, &group->dentry.attr);
+		if (!list_empty(&group->pending_list))
+			mcg_warn_group(group, "releasing a group with non empty pending list\n");
+		if (nzgroup)
+			rb_erase(&group->node, &ctx->mcg_table);
+		list_del_init(&group->mgid0_list);
+		mutex_unlock(&group->lock);
+		mutex_unlock(&ctx->mcg_table_lock);
+		kfree(group);
+		return 1;
+	} else {
+		mutex_unlock(&group->lock);
+		mutex_unlock(&ctx->mcg_table_lock);
+	}
+	return 0;
+}
+
+static void adjust_membership(struct mcast_group *group, u8 join_state, int inc)
+{
+	int i;
+
+	for (i = 0; i < 3; i++, join_state >>= 1)
+		if (join_state & 0x1)
+			group->members[i] += inc;
+}
+
+static u8 get_leave_state(struct mcast_group *group)
+{
+	u8 leave_state = 0;
+	int i;
+
+	for (i = 0; i < 3; i++)
+		if (!group->members[i])
+			leave_state |= (1 << i);
+
+	return leave_state & (group->rec.scope_join_state & 7);
+}
+
+static int join_group(struct mcast_group *group, int slave, u8 join_mask)
+{
+	int ret = 0;
+	u8 join_state;
+
+	/* remove bits that slave is already member of, and adjust */
+	join_state = join_mask & (~group->func[slave].join_state);
+	adjust_membership(group, join_state, 1);
+	group->func[slave].join_state |= join_state;
+	if (group->func[slave].state != MCAST_MEMBER && join_state) {
+		group->func[slave].state = MCAST_MEMBER;
+		ret = 1;
+	}
+	return ret;
+}
+
+static int leave_group(struct mcast_group *group, int slave, u8 leave_state)
+{
+	int ret = 0;
+
+	adjust_membership(group, leave_state, -1);
+	group->func[slave].join_state &= ~leave_state;
+	if (!group->func[slave].join_state) {
+		group->func[slave].state = MCAST_NOT_MEMBER;
+		ret = 1;
+	}
+	return ret;
+}
+
+static int check_leave(struct mcast_group *group, int slave, u8 leave_mask)
+{
+	if (group->func[slave].state != MCAST_MEMBER)
+		return MAD_STATUS_REQ_INVALID;
+
+	/* make sure we're not deleting unset bits */
+	if (~group->func[slave].join_state & leave_mask)
+		return MAD_STATUS_REQ_INVALID;
+
+	if (!leave_mask)
+		return MAD_STATUS_REQ_INVALID;
+
+	return 0;
+}
+
+static void mlx4_ib_mcg_timeout_handler(struct work_struct *work)
+{
+	struct delayed_work *delay = to_delayed_work(work);
+	struct mcast_group *group;
+	struct mcast_req *req = NULL;
+
+	group = container_of(delay, typeof(*group), timeout_work);
+
+	mutex_lock(&group->lock);
+	if (group->state == MCAST_JOIN_SENT) {
+		if (!list_empty(&group->pending_list)) {
+			req = list_first_entry(&group->pending_list, struct mcast_req, group_list);
+			list_del(&req->group_list);
+			list_del(&req->func_list);
+			--group->func[req->func].num_pend_reqs;
+			mutex_unlock(&group->lock);
+			kfree(req);
+			if (memcmp(&group->rec.mgid, &mgid0, sizeof mgid0)) {
+				if (release_group(group, 1))
+					return;
+			} else {
+				kfree(group);
+				return;
+			}
+			mutex_lock(&group->lock);
+		} else
+			mcg_warn_group(group, "DRIVER BUG\n");
+	} else if (group->state == MCAST_LEAVE_SENT) {
+		if (group->rec.scope_join_state & 7)
+			group->rec.scope_join_state &= 0xf8;
+		group->state = MCAST_IDLE;
+		mutex_unlock(&group->lock);
+		if (release_group(group, 1))
+			return;
+		mutex_lock(&group->lock);
+	} else
+		mcg_warn_group(group, "invalid state %s\n", get_state_string(group->state));
+	group->state = MCAST_IDLE;
+	atomic_inc(&group->refcount);
+	if (!queue_work(group->demux->mcg_wq, &group->work))
+		safe_atomic_dec(&group->refcount);
+
+	mutex_unlock(&group->lock);
+}
+
+static int handle_leave_req(struct mcast_group *group, u8 leave_mask,
+			    struct mcast_req *req)
+{
+	u16 status;
+
+	if (req->clean)
+		leave_mask = group->func[req->func].join_state;
+
+	status = check_leave(group, req->func, leave_mask);
+	if (!status)
+		leave_group(group, req->func, leave_mask);
+
+	if (!req->clean)
+		send_reply_to_slave(req->func, group, &req->sa_mad, status);
+	--group->func[req->func].num_pend_reqs;
+	list_del(&req->group_list);
+	list_del(&req->func_list);
+	kfree(req);
+	return 1;
+}
+
+static int handle_join_req(struct mcast_group *group, u8 join_mask,
+			   struct mcast_req *req)
+{
+	u8 group_join_state = group->rec.scope_join_state & 7;
+	int ref = 0;
+	u16 status;
+	struct ib_sa_mcmember_data *sa_data = (struct ib_sa_mcmember_data *)req->sa_mad.data;
+
+	if (join_mask == (group_join_state & join_mask)) {
+		/* port's membership need not change */
+		status = cmp_rec(&group->rec, sa_data, req->sa_mad.sa_hdr.comp_mask);
+		if (!status)
+			join_group(group, req->func, join_mask);
+
+		--group->func[req->func].num_pend_reqs;
+		send_reply_to_slave(req->func, group, &req->sa_mad, status);
+		list_del(&req->group_list);
+		list_del(&req->func_list);
+		kfree(req);
+		++ref;
+	} else {
+		/* port's membership needs to be updated */
+		group->prev_state = group->state;
+		if (send_join_to_wire(group, &req->sa_mad)) {
+			--group->func[req->func].num_pend_reqs;
+			list_del(&req->group_list);
+			list_del(&req->func_list);
+			kfree(req);
+			ref = 1;
+			group->state = group->prev_state;
+		} else
+			group->state = MCAST_JOIN_SENT;
+	}
+
+	return ref;
+}
+
+static void mlx4_ib_mcg_work_handler(struct work_struct *work)
+{
+	struct mcast_group *group;
+	struct mcast_req *req = NULL;
+	struct ib_sa_mcmember_data *sa_data;
+	u8 req_join_state;
+	int rc = 1; /* release_count - this is for the scheduled work */
+	u16 status;
+	u8 method;
+
+	group = container_of(work, typeof(*group), work);
+
+	mutex_lock(&group->lock);
+
+	/* First, let's see if a response from SM is waiting regarding this group.
+	 * If so, we need to update the group's REC. If this is a bad response, we
+	 * may need to send a bad response to a VF waiting for it. If VF is waiting
+	 * and this is a good response, the VF will be answered later in this func. */
+	if (group->state == MCAST_RESP_READY) {
+		/* cancels mlx4_ib_mcg_timeout_handler */
+		cancel_delayed_work(&group->timeout_work);
+		status = be16_to_cpu(group->response_sa_mad.mad_hdr.status);
+		method = group->response_sa_mad.mad_hdr.method;
+		if (group->last_req_tid != group->response_sa_mad.mad_hdr.tid) {
+			mcg_warn_group(group, "Got MAD response to existing MGID but wrong TID, dropping. Resp TID=%llx, group TID=%llx\n",
+				be64_to_cpu(group->response_sa_mad.mad_hdr.tid),
+				be64_to_cpu(group->last_req_tid));
+			group->state = group->prev_state;
+			goto process_requests;
+		}
+		if (status) {
+			if (!list_empty(&group->pending_list))
+				req = list_first_entry(&group->pending_list,
+						struct mcast_req, group_list);
+			if ((method == IB_MGMT_METHOD_GET_RESP)) {
+					if (req) {
+						send_reply_to_slave(req->func, group, &req->sa_mad, status);
+						--group->func[req->func].num_pend_reqs;
+						list_del(&req->group_list);
+						list_del(&req->func_list);
+						kfree(req);
+						++rc;
+					} else
+						mcg_warn_group(group, "no request for failed join\n");
+			} else if (method == IB_SA_METHOD_DELETE_RESP && group->demux->flushing)
+				++rc;
+		} else {
+			u8 resp_join_state;
+			u8 cur_join_state;
+
+			resp_join_state = ((struct ib_sa_mcmember_data *)
+						group->response_sa_mad.data)->scope_join_state & 7;
+			cur_join_state = group->rec.scope_join_state & 7;
+
+			if (method == IB_MGMT_METHOD_GET_RESP) {
+				/* successfull join */
+				if (!cur_join_state && resp_join_state)
+					--rc;
+			} else if (!resp_join_state)
+					++rc;
+			memcpy(&group->rec, group->response_sa_mad.data, sizeof group->rec);
+		}
+		group->state = MCAST_IDLE;
+	}
+
+process_requests:
+	/* We should now go over pending join/leave requests, as long as we are idle. */
+	while (!list_empty(&group->pending_list) && group->state == MCAST_IDLE) {
+		req = list_first_entry(&group->pending_list, struct mcast_req,
+				       group_list);
+		sa_data = (struct ib_sa_mcmember_data *)req->sa_mad.data;
+		req_join_state = sa_data->scope_join_state & 0x7;
+
+		/* For a leave request, we will immediately answer the VF, and
+		 * update our internal counters. The actual leave will be sent
+		 * to SM later, if at all needed. We dequeue the request now. */
+		if (req->sa_mad.mad_hdr.method == IB_SA_METHOD_DELETE)
+			rc += handle_leave_req(group, req_join_state, req);
+		else
+			rc += handle_join_req(group, req_join_state, req);
+	}
+
+	/* Handle leaves */
+	if (group->state == MCAST_IDLE) {
+		req_join_state = get_leave_state(group);
+		if (req_join_state) {
+			group->rec.scope_join_state &= ~req_join_state;
+			group->prev_state = group->state;
+			if (send_leave_to_wire(group, req_join_state)) {
+				group->state = group->prev_state;
+				++rc;
+			} else
+				group->state = MCAST_LEAVE_SENT;
+		}
+	}
+
+	if (!list_empty(&group->pending_list) && group->state == MCAST_IDLE)
+		goto process_requests;
+	mutex_unlock(&group->lock);
+
+	while (rc--)
+		release_group(group, 0);
+}
+
+static struct mcast_group *search_relocate_mgid0_group(struct mlx4_ib_demux_ctx *ctx,
+						       __be64 tid,
+						       union ib_gid *new_mgid)
+{
+	struct mcast_group *group = NULL, *cur_group;
+	struct mcast_req *req;
+	struct list_head *pos;
+	struct list_head *n;
+
+	mutex_lock(&ctx->mcg_table_lock);
+	list_for_each_safe(pos, n, &ctx->mcg_mgid0_list) {
+		group = list_entry(pos, struct mcast_group, mgid0_list);
+		mutex_lock(&group->lock);
+		if (group->last_req_tid == tid) {
+			if (memcmp(new_mgid, &mgid0, sizeof mgid0)) {
+				group->rec.mgid = *new_mgid;
+				sprintf(group->name, "%016llx%016llx",
+						be64_to_cpu(group->rec.mgid.global.subnet_prefix),
+						be64_to_cpu(group->rec.mgid.global.interface_id));
+				list_del_init(&group->mgid0_list);
+				cur_group = mcast_insert(ctx, group);
+				if (cur_group) {
+					/* A race between our code and SM. Silently cleaning the new one */
+					req = list_first_entry(&group->pending_list,
+							       struct mcast_req, group_list);
+					--group->func[req->func].num_pend_reqs;
+					list_del(&req->group_list);
+					list_del(&req->func_list);
+					kfree(req);
+					mutex_unlock(&group->lock);
+					mutex_unlock(&ctx->mcg_table_lock);
+					release_group(group, 0);
+					return NULL;
+				}
+
+				atomic_inc(&group->refcount);
+				add_sysfs_port_mcg_attr(ctx->dev, ctx->port, &group->dentry.attr);
+				mutex_unlock(&group->lock);
+				mutex_unlock(&ctx->mcg_table_lock);
+				return group;
+			} else {
+				struct mcast_req *tmp1, *tmp2;
+
+				list_del(&group->mgid0_list);
+				if (!list_empty(&group->pending_list) && group->state != MCAST_IDLE)
+					cancel_delayed_work_sync(&group->timeout_work);
+
+				list_for_each_entry_safe(tmp1, tmp2, &group->pending_list, group_list) {
+					list_del(&tmp1->group_list);
+					kfree(tmp1);
+				}
+				mutex_unlock(&group->lock);
+				mutex_unlock(&ctx->mcg_table_lock);
+				kfree(group);
+				return NULL;
+			}
+		}
+		mutex_unlock(&group->lock);
+	}
+	mutex_unlock(&ctx->mcg_table_lock);
+
+	return NULL;
+}
+
+static ssize_t sysfs_show_group(struct device *dev,
+		struct device_attribute *attr, char *buf);
+
+static struct mcast_group *acquire_group(struct mlx4_ib_demux_ctx *ctx,
+					 union ib_gid *mgid, int create,
+					 gfp_t gfp_mask)
+{
+	struct mcast_group *group, *cur_group;
+	int is_mgid0;
+	int i;
+
+	is_mgid0 = !memcmp(&mgid0, mgid, sizeof mgid0);
+	if (!is_mgid0) {
+		group = mcast_find(ctx, mgid);
+		if (group)
+			goto found;
+	}
+
+	if (!create)
+		return ERR_PTR(-ENOENT);
+
+	group = kzalloc(sizeof *group, gfp_mask);
+	if (!group)
+		return ERR_PTR(-ENOMEM);
+
+	group->demux = ctx;
+	group->rec.mgid = *mgid;
+	INIT_LIST_HEAD(&group->pending_list);
+	INIT_LIST_HEAD(&group->mgid0_list);
+	for (i = 0; i < MAX_VFS; ++i)
+		INIT_LIST_HEAD(&group->func[i].pending);
+	INIT_WORK(&group->work, mlx4_ib_mcg_work_handler);
+	INIT_DELAYED_WORK(&group->timeout_work, mlx4_ib_mcg_timeout_handler);
+	mutex_init(&group->lock);
+	sprintf(group->name, "%016llx%016llx",
+			be64_to_cpu(group->rec.mgid.global.subnet_prefix),
+			be64_to_cpu(group->rec.mgid.global.interface_id));
+	sysfs_attr_init(&group->dentry.attr);
+	group->dentry.show = sysfs_show_group;
+	group->dentry.store = NULL;
+	group->dentry.attr.name = group->name;
+	group->dentry.attr.mode = 0400;
+	group->state = MCAST_IDLE;
+
+	if (is_mgid0) {
+		list_add(&group->mgid0_list, &ctx->mcg_mgid0_list);
+		goto found;
+	}
+
+	cur_group = mcast_insert(ctx, group);
+	if (cur_group) {
+		mcg_warn("group just showed up %s - confused\n", cur_group->name);
+		kfree(group);
+		return ERR_PTR(-EINVAL);
+	}
+
+	add_sysfs_port_mcg_attr(ctx->dev, ctx->port, &group->dentry.attr);
+
+found:
+	atomic_inc(&group->refcount);
+	return group;
+}
+
+static void queue_req(struct mcast_req *req)
+{
+	struct mcast_group *group = req->group;
+
+	atomic_inc(&group->refcount); /* for the request */
+	atomic_inc(&group->refcount); /* for scheduling the work */
+	list_add_tail(&req->group_list, &group->pending_list);
+	list_add_tail(&req->func_list, &group->func[req->func].pending);
+	/* calls mlx4_ib_mcg_work_handler */
+	if (!queue_work(group->demux->mcg_wq, &group->work))
+		safe_atomic_dec(&group->refcount);
+}
+
+int mlx4_ib_mcg_demux_handler(struct ib_device *ibdev, int port, int slave,
+			      struct ib_sa_mad *mad)
+{
+	struct mlx4_ib_dev *dev = to_mdev(ibdev);
+	struct ib_sa_mcmember_data *rec = (struct ib_sa_mcmember_data *)mad->data;
+	struct mlx4_ib_demux_ctx *ctx = &dev->sriov.demux[port - 1];
+	struct mcast_group *group;
+
+	switch (mad->mad_hdr.method) {
+	case IB_MGMT_METHOD_GET_RESP:
+	case IB_SA_METHOD_DELETE_RESP:
+		mutex_lock(&ctx->mcg_table_lock);
+		group = acquire_group(ctx, &rec->mgid, 0, GFP_KERNEL);
+		mutex_unlock(&ctx->mcg_table_lock);
+		if (IS_ERR(group)) {
+			if (mad->mad_hdr.method == IB_MGMT_METHOD_GET_RESP) {
+				__be64 tid = mad->mad_hdr.tid;
+				*(u8 *)(&tid) = (u8)slave; /* in group we kept the modified TID */
+				group = search_relocate_mgid0_group(ctx, tid, &rec->mgid);
+			} else
+				group = NULL;
+		}
+
+		if (!group)
+			return 1;
+
+		mutex_lock(&group->lock);
+		group->response_sa_mad = *mad;
+		group->prev_state = group->state;
+		group->state = MCAST_RESP_READY;
+		/* calls mlx4_ib_mcg_work_handler */
+		atomic_inc(&group->refcount);
+		if (!queue_work(ctx->mcg_wq, &group->work))
+			safe_atomic_dec(&group->refcount);
+		mutex_unlock(&group->lock);
+		release_group(group, 0);
+		return 1; /* consumed */
+	case IB_MGMT_METHOD_SET:
+	case IB_SA_METHOD_GET_TABLE:
+	case IB_SA_METHOD_GET_TABLE_RESP:
+	case IB_SA_METHOD_DELETE:
+		return 0; /* not consumed, pass-through to guest over tunnel */
+	default:
+		mcg_warn("In demux, port %d: unexpected MCMember method: 0x%x, dropping\n",
+			port, mad->mad_hdr.method);
+		return 1; /* consumed */
+	}
+}
+
+int mlx4_ib_mcg_multiplex_handler(struct ib_device *ibdev, int port,
+				  int slave, struct ib_sa_mad *sa_mad)
+{
+	struct mlx4_ib_dev *dev = to_mdev(ibdev);
+	struct ib_sa_mcmember_data *rec = (struct ib_sa_mcmember_data *)sa_mad->data;
+	struct mlx4_ib_demux_ctx *ctx = &dev->sriov.demux[port - 1];
+	struct mcast_group *group;
+	struct mcast_req *req;
+	int may_create = 0;
+
+	if (ctx->flushing)
+		return -EAGAIN;
+
+	switch (sa_mad->mad_hdr.method) {
+	case IB_MGMT_METHOD_SET:
+		may_create = 1;
+	case IB_SA_METHOD_DELETE:
+		req = kzalloc(sizeof *req, GFP_KERNEL);
+		if (!req)
+			return -ENOMEM;
+
+		req->func = slave;
+		req->sa_mad = *sa_mad;
+
+		mutex_lock(&ctx->mcg_table_lock);
+		group = acquire_group(ctx, &rec->mgid, may_create, GFP_KERNEL);
+		mutex_unlock(&ctx->mcg_table_lock);
+		if (IS_ERR(group)) {
+			kfree(req);
+			return PTR_ERR(group);
+		}
+		mutex_lock(&group->lock);
+		if (group->func[slave].num_pend_reqs > MAX_PEND_REQS_PER_FUNC) {
+			mutex_unlock(&group->lock);
+			mcg_warn_group(group, "Port %d, Func %d has too many pending requests (%d), dropping\n",
+				       port, slave, MAX_PEND_REQS_PER_FUNC);
+			release_group(group, 0);
+			kfree(req);
+			return -ENOMEM;
+		}
+		++group->func[slave].num_pend_reqs;
+		req->group = group;
+		queue_req(req);
+		mutex_unlock(&group->lock);
+		release_group(group, 0);
+		return 1; /* consumed */
+	case IB_SA_METHOD_GET_TABLE:
+	case IB_MGMT_METHOD_GET_RESP:
+	case IB_SA_METHOD_GET_TABLE_RESP:
+	case IB_SA_METHOD_DELETE_RESP:
+		return 0; /* not consumed, pass-through */
+	default:
+		mcg_warn("In multiplex, port %d, func %d: unexpected MCMember method: 0x%x, dropping\n",
+			port, slave, sa_mad->mad_hdr.method);
+		return 1; /* consumed */
+	}
+}
+
+static ssize_t sysfs_show_group(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct mcast_group *group =
+		container_of(attr, struct mcast_group, dentry);
+	struct mcast_req *req = NULL;
+	char pending_str[40];
+	char state_str[40];
+	ssize_t len = 0;
+	int f;
+
+	if (group->state == MCAST_IDLE)
+		sprintf(state_str, "%s", get_state_string(group->state));
+	else
+		sprintf(state_str, "%s(TID=0x%llx)",
+				get_state_string(group->state),
+				be64_to_cpu(group->last_req_tid));
+	if (list_empty(&group->pending_list)) {
+		sprintf(pending_str, "No");
+	} else {
+		req = list_first_entry(&group->pending_list, struct mcast_req, group_list);
+		sprintf(pending_str, "Yes(TID=0x%llx)",
+				be64_to_cpu(req->sa_mad.mad_hdr.tid));
+	}
+	len += sprintf(buf + len, "%1d [%02d,%02d,%02d] %4d %4s %5s     ",
+			group->rec.scope_join_state & 0xf,
+			group->members[2], group->members[1], group->members[0],
+			atomic_read(&group->refcount),
+			pending_str,
+			state_str);
+	for (f = 0; f < MAX_VFS; ++f)
+		if (group->func[f].state == MCAST_MEMBER)
+			len += sprintf(buf + len, "%d[%1x] ",
+					f, group->func[f].join_state);
+
+	len += sprintf(buf + len, "\t\t(%4hx %4x %2x %2x %2x %2x %2x "
+		"%4x %4x %2x %2x)\n",
+		be16_to_cpu(group->rec.pkey),
+		be32_to_cpu(group->rec.qkey),
+		(group->rec.mtusel_mtu & 0xc0) >> 6,
+		group->rec.mtusel_mtu & 0x3f,
+		group->rec.tclass,
+		(group->rec.ratesel_rate & 0xc0) >> 6,
+		group->rec.ratesel_rate & 0x3f,
+		(be32_to_cpu(group->rec.sl_flowlabel_hoplimit) & 0xf0000000) >> 28,
+		(be32_to_cpu(group->rec.sl_flowlabel_hoplimit) & 0x0fffff00) >> 8,
+		be32_to_cpu(group->rec.sl_flowlabel_hoplimit) & 0x000000ff,
+		group->rec.proxy_join);
+
+	return len;
+}
+
+int mlx4_ib_mcg_port_init(struct mlx4_ib_demux_ctx *ctx)
+{
+	char name[20];
+
+	atomic_set(&ctx->tid, 0);
+	sprintf(name, "mlx4_ib_mcg%d", ctx->port);
+	ctx->mcg_wq = create_singlethread_workqueue(name);
+	if (!ctx->mcg_wq)
+		return -ENOMEM;
+
+	mutex_init(&ctx->mcg_table_lock);
+	ctx->mcg_table = RB_ROOT;
+	INIT_LIST_HEAD(&ctx->mcg_mgid0_list);
+	ctx->flushing = 0;
+
+	return 0;
+}
+
+static void force_clean_group(struct mcast_group *group)
+{
+	struct mcast_req *req, *tmp
+		;
+	list_for_each_entry_safe(req, tmp, &group->pending_list, group_list) {
+		list_del(&req->group_list);
+		kfree(req);
+	}
+	del_sysfs_port_mcg_attr(group->demux->dev, group->demux->port, &group->dentry.attr);
+	rb_erase(&group->node, &group->demux->mcg_table);
+	kfree(group);
+}
+
+static void _mlx4_ib_mcg_port_cleanup(struct mlx4_ib_demux_ctx *ctx, int destroy_wq)
+{
+	int i;
+	struct rb_node *p;
+	struct mcast_group *group;
+	unsigned long end;
+	int count;
+
+	for (i = 0; i < MAX_VFS; ++i)
+		clean_vf_mcast(ctx, i);
+
+	end = jiffies + msecs_to_jiffies(MAD_TIMEOUT_MS + 3000);
+	do {
+		count = 0;
+		mutex_lock(&ctx->mcg_table_lock);
+		for (p = rb_first(&ctx->mcg_table); p; p = rb_next(p))
+			++count;
+		mutex_unlock(&ctx->mcg_table_lock);
+		if (!count)
+			break;
+
+		msleep(1);
+	} while (time_after(end, jiffies));
+
+	flush_workqueue(ctx->mcg_wq);
+	if (destroy_wq)
+		destroy_workqueue(ctx->mcg_wq);
+
+	mutex_lock(&ctx->mcg_table_lock);
+	while ((p = rb_first(&ctx->mcg_table)) != NULL) {
+		group = rb_entry(p, struct mcast_group, node);
+		if (atomic_read(&group->refcount))
+			mcg_warn_group(group, "group refcount %d!!! (pointer %p)\n", atomic_read(&group->refcount), group);
+
+		force_clean_group(group);
+	}
+	mutex_unlock(&ctx->mcg_table_lock);
+}
+
+struct clean_work {
+	struct work_struct work;
+	struct mlx4_ib_demux_ctx *ctx;
+	int destroy_wq;
+};
+
+static void mcg_clean_task(struct work_struct *work)
+{
+	struct clean_work *cw = container_of(work, struct clean_work, work);
+
+	_mlx4_ib_mcg_port_cleanup(cw->ctx, cw->destroy_wq);
+	cw->ctx->flushing = 0;
+	kfree(cw);
+}
+
+void mlx4_ib_mcg_port_cleanup(struct mlx4_ib_demux_ctx *ctx, int destroy_wq)
+{
+	struct clean_work *work;
+
+	if (ctx->flushing)
+		return;
+
+	ctx->flushing = 1;
+
+	if (destroy_wq) {
+		_mlx4_ib_mcg_port_cleanup(ctx, destroy_wq);
+		ctx->flushing = 0;
+		return;
+	}
+
+	work = kmalloc(sizeof *work, GFP_KERNEL);
+	if (!work) {
+		ctx->flushing = 0;
+		mcg_warn("failed allocating work for cleanup\n");
+		return;
+	}
+
+	work->ctx = ctx;
+	work->destroy_wq = destroy_wq;
+	INIT_WORK(&work->work, mcg_clean_task);
+	queue_work(clean_wq, &work->work);
+}
+
+static void build_leave_mad(struct mcast_req *req)
+{
+	struct ib_sa_mad *mad = &req->sa_mad;
+
+	mad->mad_hdr.method = IB_SA_METHOD_DELETE;
+}
+
+
+static void clear_pending_reqs(struct mcast_group *group, int vf)
+{
+	struct mcast_req *req, *tmp, *group_first = NULL;
+	int clear;
+	int pend = 0;
+
+	if (!list_empty(&group->pending_list))
+		group_first = list_first_entry(&group->pending_list, struct mcast_req, group_list);
+
+	list_for_each_entry_safe(req, tmp, &group->func[vf].pending, func_list) {
+		clear = 1;
+		if (group_first == req &&
+		    (group->state == MCAST_JOIN_SENT ||
+		     group->state == MCAST_LEAVE_SENT)) {
+			clear = cancel_delayed_work(&group->timeout_work);
+			pend = !clear;
+			group->state = MCAST_IDLE;
+		}
+		if (clear) {
+			--group->func[vf].num_pend_reqs;
+			list_del(&req->group_list);
+			list_del(&req->func_list);
+			kfree(req);
+			atomic_dec(&group->refcount);
+		}
+	}
+
+	if (!pend && (!list_empty(&group->func[vf].pending) || group->func[vf].num_pend_reqs)) {
+		mcg_warn_group(group, "DRIVER BUG: list_empty %d, num_pend_reqs %d\n",
+			       list_empty(&group->func[vf].pending), group->func[vf].num_pend_reqs);
+	}
+}
+
+static int push_deleteing_req(struct mcast_group *group, int slave)
+{
+	struct mcast_req *req;
+	struct mcast_req *pend_req;
+
+	if (!group->func[slave].join_state)
+		return 0;
+
+	req = kzalloc(sizeof *req, GFP_KERNEL);
+	if (!req) {
+		mcg_warn_group(group, "failed allocation - may leave stall groups\n");
+		return -ENOMEM;
+	}
+
+	if (!list_empty(&group->func[slave].pending)) {
+		pend_req = list_entry(group->func[slave].pending.prev, struct mcast_req, group_list);
+		if (pend_req->clean) {
+			kfree(req);
+			return 0;
+		}
+	}
+
+	req->clean = 1;
+	req->func = slave;
+	req->group = group;
+	++group->func[slave].num_pend_reqs;
+	build_leave_mad(req);
+	queue_req(req);
+	return 0;
+}
+
+void clean_vf_mcast(struct mlx4_ib_demux_ctx *ctx, int slave)
+{
+	struct mcast_group *group;
+	struct rb_node *p;
+
+	mutex_lock(&ctx->mcg_table_lock);
+	for (p = rb_first(&ctx->mcg_table); p; p = rb_next(p)) {
+		group = rb_entry(p, struct mcast_group, node);
+		mutex_lock(&group->lock);
+		if (atomic_read(&group->refcount)) {
+			/* clear pending requests of this VF */
+			clear_pending_reqs(group, slave);
+			push_deleteing_req(group, slave);
+		}
+		mutex_unlock(&group->lock);
+	}
+	mutex_unlock(&ctx->mcg_table_lock);
+}
+
+
+int mlx4_ib_mcg_init(void)
+{
+	clean_wq = create_singlethread_workqueue("mlx4_ib_mcg");
+	if (!clean_wq)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void mlx4_ib_mcg_destroy(void)
+{
+	destroy_workqueue(clean_wq);
+}
diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h
new file mode 100644
index 0000000..6eb743f
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
@@ -0,0 +1,797 @@
+/*
+ * Copyright (c) 2006, 2007 Cisco Systems.  All rights reserved.
+ * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX4_IB_H
+#define MLX4_IB_H
+
+#include <linux/compiler.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/idr.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_umem.h>
+#include <rdma/ib_mad.h>
+#include <rdma/ib_sa.h>
+
+#include <linux/mlx4/device.h>
+#include <linux/mlx4/doorbell.h>
+
+#define MLX4_IB_DRV_NAME	"mlx4_ib"
+
+#ifdef pr_fmt
+#undef pr_fmt
+#endif
+#define pr_fmt(fmt)	"<" MLX4_IB_DRV_NAME "> %s: " fmt, __func__
+
+#define mlx4_ib_warn(ibdev, format, arg...) \
+	dev_warn((ibdev)->dma_device, MLX4_IB_DRV_NAME ": " format, ## arg)
+
+enum {
+	MLX4_IB_SQ_MIN_WQE_SHIFT = 6,
+	MLX4_IB_MAX_HEADROOM	 = 2048
+};
+
+#define MLX4_IB_SQ_HEADROOM(shift)	((MLX4_IB_MAX_HEADROOM >> (shift)) + 1)
+#define MLX4_IB_SQ_MAX_SPARE		(MLX4_IB_SQ_HEADROOM(MLX4_IB_SQ_MIN_WQE_SHIFT))
+
+/*module param to indicate if SM assigns the alias_GUID*/
+extern int mlx4_ib_sm_guid_assign;
+
+#define MLX4_IB_UC_STEER_QPN_ALIGN 1
+#define MLX4_IB_UC_MAX_NUM_QPS     256
+struct mlx4_ib_ucontext {
+	struct ib_ucontext	ibucontext;
+	struct mlx4_uar		uar;
+	struct list_head	db_page_list;
+	struct mutex		db_page_mutex;
+};
+
+struct mlx4_ib_pd {
+	struct ib_pd		ibpd;
+	u32			pdn;
+};
+
+struct mlx4_ib_xrcd {
+	struct ib_xrcd		ibxrcd;
+	u32			xrcdn;
+	struct ib_pd	       *pd;
+	struct ib_cq	       *cq;
+};
+
+struct mlx4_ib_cq_buf {
+	struct mlx4_buf		buf;
+	struct mlx4_mtt		mtt;
+	int			entry_size;
+};
+
+struct mlx4_ib_cq_resize {
+	struct mlx4_ib_cq_buf	buf;
+	int			cqe;
+};
+
+struct mlx4_ib_cq {
+	struct ib_cq		ibcq;
+	struct mlx4_cq		mcq;
+	struct mlx4_ib_cq_buf	buf;
+	struct mlx4_ib_cq_resize *resize_buf;
+	struct mlx4_db		db;
+	spinlock_t		lock;
+	struct mutex		resize_mutex;
+	struct ib_umem	       *umem;
+	struct ib_umem	       *resize_umem;
+};
+
+struct mlx4_ib_mr {
+	struct ib_mr		ibmr;
+	struct mlx4_mr		mmr;
+	struct ib_umem	       *umem;
+};
+
+struct mlx4_ib_mw {
+	struct ib_mw		ibmw;
+	struct mlx4_mw		mmw;
+};
+
+struct mlx4_ib_fast_reg_page_list {
+	struct ib_fast_reg_page_list	ibfrpl;
+	__be64			       *mapped_page_list;
+	dma_addr_t			map;
+};
+
+struct mlx4_ib_fmr {
+	struct ib_fmr           ibfmr;
+	struct mlx4_fmr         mfmr;
+};
+
+struct mlx4_ib_flow {
+	struct ib_flow ibflow;
+	/* translating DMFS verbs sniffer rule to FW API requires two reg IDs */
+	u64 reg_id[2];
+};
+
+struct mlx4_ib_wq {
+	u64		       *wrid;
+	spinlock_t		lock;
+	int			wqe_cnt;
+	int			max_post;
+	int			max_gs;
+	int			offset;
+	int			wqe_shift;
+	unsigned		head;
+	unsigned		tail;
+};
+
+enum mlx4_ib_qp_flags {
+	MLX4_IB_QP_LSO = IB_QP_CREATE_IPOIB_UD_LSO,
+	MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK = IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK,
+	MLX4_IB_QP_NETIF = IB_QP_CREATE_NETIF_QP,
+	MLX4_IB_QP_CREATE_USE_GFP_NOIO = IB_QP_CREATE_USE_GFP_NOIO,
+	MLX4_IB_SRIOV_TUNNEL_QP = 1 << 30,
+	MLX4_IB_SRIOV_SQP = 1 << 31,
+};
+
+struct mlx4_ib_gid_entry {
+	struct list_head	list;
+	union ib_gid		gid;
+	int			added;
+	u8			port;
+};
+
+enum mlx4_ib_qp_type {
+	/*
+	 * IB_QPT_SMI and IB_QPT_GSI have to be the first two entries
+	 * here (and in that order) since the MAD layer uses them as
+	 * indices into a 2-entry table.
+	 */
+	MLX4_IB_QPT_SMI = IB_QPT_SMI,
+	MLX4_IB_QPT_GSI = IB_QPT_GSI,
+
+	MLX4_IB_QPT_RC = IB_QPT_RC,
+	MLX4_IB_QPT_UC = IB_QPT_UC,
+	MLX4_IB_QPT_UD = IB_QPT_UD,
+	MLX4_IB_QPT_RAW_IPV6 = IB_QPT_RAW_IPV6,
+	MLX4_IB_QPT_RAW_ETHERTYPE = IB_QPT_RAW_ETHERTYPE,
+	MLX4_IB_QPT_RAW_PACKET = IB_QPT_RAW_PACKET,
+	MLX4_IB_QPT_XRC_INI = IB_QPT_XRC_INI,
+	MLX4_IB_QPT_XRC_TGT = IB_QPT_XRC_TGT,
+
+	MLX4_IB_QPT_PROXY_SMI_OWNER	= 1 << 16,
+	MLX4_IB_QPT_PROXY_SMI		= 1 << 17,
+	MLX4_IB_QPT_PROXY_GSI		= 1 << 18,
+	MLX4_IB_QPT_TUN_SMI_OWNER	= 1 << 19,
+	MLX4_IB_QPT_TUN_SMI		= 1 << 20,
+	MLX4_IB_QPT_TUN_GSI		= 1 << 21,
+};
+
+#define MLX4_IB_QPT_ANY_SRIOV	(MLX4_IB_QPT_PROXY_SMI_OWNER | \
+	MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI | MLX4_IB_QPT_TUN_SMI_OWNER | \
+	MLX4_IB_QPT_TUN_SMI | MLX4_IB_QPT_TUN_GSI)
+
+enum mlx4_ib_mad_ifc_flags {
+	MLX4_MAD_IFC_IGNORE_MKEY	= 1,
+	MLX4_MAD_IFC_IGNORE_BKEY	= 2,
+	MLX4_MAD_IFC_IGNORE_KEYS	= (MLX4_MAD_IFC_IGNORE_MKEY |
+					   MLX4_MAD_IFC_IGNORE_BKEY),
+	MLX4_MAD_IFC_NET_VIEW		= 4,
+};
+
+enum {
+	MLX4_NUM_TUNNEL_BUFS		= 256,
+};
+
+struct mlx4_ib_tunnel_header {
+	struct mlx4_av av;
+	__be32 remote_qpn;
+	__be32 qkey;
+	__be16 vlan;
+	u8 mac[6];
+	__be16 pkey_index;
+	u8 reserved[6];
+};
+
+struct mlx4_ib_buf {
+	void *addr;
+	dma_addr_t map;
+};
+
+struct mlx4_rcv_tunnel_hdr {
+	__be32 flags_src_qp; /* flags[6:5] is defined for VLANs:
+			      * 0x0 - no vlan was in the packet
+			      * 0x01 - C-VLAN was in the packet */
+	u8 g_ml_path; /* gid bit stands for ipv6/4 header in RoCE */
+	u8 reserved;
+	__be16 pkey_index;
+	__be16 sl_vid;
+	__be16 slid_mac_47_32;
+	__be32 mac_31_0;
+};
+
+struct mlx4_ib_proxy_sqp_hdr {
+	struct ib_grh grh;
+	struct mlx4_rcv_tunnel_hdr tun;
+}  __packed;
+
+struct mlx4_roce_smac_vlan_info {
+	u64 smac;
+	int smac_index;
+	int smac_port;
+	u64 candidate_smac;
+	int candidate_smac_index;
+	int candidate_smac_port;
+	u16 vid;
+	int vlan_index;
+	int vlan_port;
+	u16 candidate_vid;
+	int candidate_vlan_index;
+	int candidate_vlan_port;
+	int update_vid;
+};
+
+struct mlx4_ib_qp {
+	struct ib_qp		ibqp;
+	struct mlx4_qp		mqp;
+	struct mlx4_buf		buf;
+
+	struct mlx4_db		db;
+	struct mlx4_ib_wq	rq;
+
+	u32			doorbell_qpn;
+	__be32			sq_signal_bits;
+	unsigned		sq_next_wqe;
+	int			sq_max_wqes_per_wr;
+	int			sq_spare_wqes;
+	struct mlx4_ib_wq	sq;
+
+	enum mlx4_ib_qp_type	mlx4_ib_qp_type;
+	struct ib_umem	       *umem;
+	struct mlx4_mtt		mtt;
+	int			buf_size;
+	struct mutex		mutex;
+	u16			xrcdn;
+	u32			flags;
+	u8			port;
+	u8			alt_port;
+	u8			atomic_rd_en;
+	u8			resp_depth;
+	u8			sq_no_prefetch;
+	u8			state;
+	int			mlx_type;
+	struct list_head	gid_list;
+	struct list_head	steering_rules;
+	struct mlx4_ib_buf	*sqp_proxy_rcv;
+	struct mlx4_roce_smac_vlan_info pri;
+	struct mlx4_roce_smac_vlan_info alt;
+	u64			reg_id;
+};
+
+struct mlx4_ib_srq {
+	struct ib_srq		ibsrq;
+	struct mlx4_srq		msrq;
+	struct mlx4_buf		buf;
+	struct mlx4_db		db;
+	u64		       *wrid;
+	spinlock_t		lock;
+	int			head;
+	int			tail;
+	u16			wqe_ctr;
+	struct ib_umem	       *umem;
+	struct mlx4_mtt		mtt;
+	struct mutex		mutex;
+};
+
+struct mlx4_ib_ah {
+	struct ib_ah		ibah;
+	union mlx4_ext_av       av;
+};
+
+/****************************************/
+/* alias guid support */
+/****************************************/
+#define NUM_PORT_ALIAS_GUID		2
+#define NUM_ALIAS_GUID_IN_REC		8
+#define NUM_ALIAS_GUID_REC_IN_PORT	16
+#define GUID_REC_SIZE			8
+#define NUM_ALIAS_GUID_PER_PORT		128
+#define MLX4_NOT_SET_GUID		(0x00LL)
+#define MLX4_GUID_FOR_DELETE_VAL	(~(0x00LL))
+
+enum mlx4_guid_alias_rec_status {
+	MLX4_GUID_INFO_STATUS_IDLE,
+	MLX4_GUID_INFO_STATUS_SET,
+	MLX4_GUID_INFO_STATUS_PENDING,
+};
+
+enum mlx4_guid_alias_rec_ownership {
+	MLX4_GUID_DRIVER_ASSIGN,
+	MLX4_GUID_SYSADMIN_ASSIGN,
+	MLX4_GUID_NONE_ASSIGN, /*init state of each record*/
+};
+
+enum mlx4_guid_alias_rec_method {
+	MLX4_GUID_INFO_RECORD_SET	= IB_MGMT_METHOD_SET,
+	MLX4_GUID_INFO_RECORD_DELETE	= IB_SA_METHOD_DELETE,
+};
+
+struct mlx4_sriov_alias_guid_info_rec_det {
+	u8 all_recs[GUID_REC_SIZE * NUM_ALIAS_GUID_IN_REC];
+	ib_sa_comp_mask guid_indexes; /*indicates what from the 8 records are valid*/
+	enum mlx4_guid_alias_rec_status status; /*indicates the administraively status of the record.*/
+	u8 method; /*set or delete*/
+	enum mlx4_guid_alias_rec_ownership ownership; /*indicates who assign that alias_guid record*/
+};
+
+struct mlx4_sriov_alias_guid_port_rec_det {
+	struct mlx4_sriov_alias_guid_info_rec_det all_rec_per_port[NUM_ALIAS_GUID_REC_IN_PORT];
+	struct workqueue_struct *wq;
+	struct delayed_work alias_guid_work;
+	u8 port;
+	struct mlx4_sriov_alias_guid *parent;
+	struct list_head cb_list;
+};
+
+struct mlx4_sriov_alias_guid {
+	struct mlx4_sriov_alias_guid_port_rec_det ports_guid[MLX4_MAX_PORTS];
+	spinlock_t ag_work_lock;
+	struct ib_sa_client *sa_client;
+};
+
+struct mlx4_ib_demux_work {
+	struct work_struct	work;
+	struct mlx4_ib_dev     *dev;
+	int			slave;
+	int			do_init;
+	u8			port;
+
+};
+
+struct mlx4_ib_tun_tx_buf {
+	struct mlx4_ib_buf buf;
+	struct ib_ah *ah;
+};
+
+struct mlx4_ib_demux_pv_qp {
+	struct ib_qp *qp;
+	enum ib_qp_type proxy_qpt;
+	struct mlx4_ib_buf *ring;
+	struct mlx4_ib_tun_tx_buf *tx_ring;
+	spinlock_t tx_lock;
+	unsigned tx_ix_head;
+	unsigned tx_ix_tail;
+};
+
+enum mlx4_ib_demux_pv_state {
+	DEMUX_PV_STATE_DOWN,
+	DEMUX_PV_STATE_STARTING,
+	DEMUX_PV_STATE_ACTIVE,
+	DEMUX_PV_STATE_DOWNING,
+};
+
+struct mlx4_ib_demux_pv_ctx {
+	int port;
+	int slave;
+	enum mlx4_ib_demux_pv_state state;
+	int has_smi;
+	struct ib_device *ib_dev;
+	struct ib_cq *cq;
+	struct ib_pd *pd;
+	struct ib_mr *mr;
+	struct work_struct work;
+	struct workqueue_struct *wq;
+	struct mlx4_ib_demux_pv_qp qp[2];
+};
+
+struct mlx4_ib_demux_ctx {
+	struct ib_device *ib_dev;
+	int port;
+	struct workqueue_struct *wq;
+	struct workqueue_struct *ud_wq;
+	spinlock_t ud_lock;
+	__be64 subnet_prefix;
+	__be64 guid_cache[128];
+	struct mlx4_ib_dev *dev;
+	/* the following lock protects both mcg_table and mcg_mgid0_list */
+	struct mutex		mcg_table_lock;
+	struct rb_root		mcg_table;
+	struct list_head	mcg_mgid0_list;
+	struct workqueue_struct	*mcg_wq;
+	struct mlx4_ib_demux_pv_ctx **tun;
+	atomic_t tid;
+	int    flushing; /* flushing the work queue */
+};
+
+struct mlx4_ib_sriov {
+	struct mlx4_ib_demux_ctx demux[MLX4_MAX_PORTS];
+	struct mlx4_ib_demux_pv_ctx *sqps[MLX4_MAX_PORTS];
+	/* when using this spinlock you should use "irq" because
+	 * it may be called from interrupt context.*/
+	spinlock_t going_down_lock;
+	int is_going_down;
+
+	struct mlx4_sriov_alias_guid alias_guid;
+
+	/* CM paravirtualization fields */
+	struct list_head cm_list;
+	spinlock_t id_map_lock;
+	struct rb_root sl_id_map;
+	struct idr pv_id_table;
+};
+
+struct mlx4_ib_iboe {
+	spinlock_t		lock;
+	struct net_device      *netdevs[MLX4_MAX_PORTS];
+	struct net_device      *masters[MLX4_MAX_PORTS];
+	atomic64_t		mac[MLX4_MAX_PORTS];
+	struct notifier_block 	nb;
+	struct notifier_block	nb_inet;
+	struct notifier_block	nb_inet6;
+	union ib_gid		gid_table[MLX4_MAX_PORTS][128];
+};
+
+struct pkey_mgt {
+	u8			virt2phys_pkey[MLX4_MFUNC_MAX][MLX4_MAX_PORTS][MLX4_MAX_PORT_PKEYS];
+	u16			phys_pkey_cache[MLX4_MAX_PORTS][MLX4_MAX_PORT_PKEYS];
+	struct list_head	pkey_port_list[MLX4_MFUNC_MAX];
+	struct kobject	       *device_parent[MLX4_MFUNC_MAX];
+};
+
+struct mlx4_ib_iov_sysfs_attr {
+	void *ctx;
+	struct kobject *kobj;
+	unsigned long data;
+	u32 entry_num;
+	char name[15];
+	struct device_attribute dentry;
+	struct device *dev;
+};
+
+struct mlx4_ib_iov_sysfs_attr_ar {
+	struct mlx4_ib_iov_sysfs_attr dentries[3 * NUM_ALIAS_GUID_PER_PORT + 1];
+};
+
+struct mlx4_ib_iov_port {
+	char name[100];
+	u8 num;
+	struct mlx4_ib_dev *dev;
+	struct list_head list;
+	struct mlx4_ib_iov_sysfs_attr_ar *dentr_ar;
+	struct ib_port_attr attr;
+	struct kobject	*cur_port;
+	struct kobject	*admin_alias_parent;
+	struct kobject	*gids_parent;
+	struct kobject	*pkeys_parent;
+	struct kobject	*mcgs_parent;
+	struct mlx4_ib_iov_sysfs_attr mcg_dentry;
+};
+
+struct mlx4_ib_dev {
+	struct ib_device	ib_dev;
+	struct mlx4_dev	       *dev;
+	int			num_ports;
+	void __iomem	       *uar_map;
+
+	struct mlx4_uar		priv_uar;
+	u32			priv_pdn;
+	MLX4_DECLARE_DOORBELL_LOCK(uar_lock);
+
+	struct ib_mad_agent    *send_agent[MLX4_MAX_PORTS][2];
+	struct ib_ah	       *sm_ah[MLX4_MAX_PORTS];
+	spinlock_t		sm_lock;
+	struct mlx4_ib_sriov	sriov;
+
+	struct mutex		cap_mask_mutex;
+	bool			ib_active;
+	struct mlx4_ib_iboe	iboe;
+	int			counters[MLX4_MAX_PORTS];
+	int		       *eq_table;
+	int			eq_added;
+	struct kobject	       *iov_parent;
+	struct kobject	       *ports_parent;
+	struct kobject	       *dev_ports_parent[MLX4_MFUNC_MAX];
+	struct mlx4_ib_iov_port	iov_ports[MLX4_MAX_PORTS];
+	struct pkey_mgt		pkeys;
+	unsigned long *ib_uc_qpns_bitmap;
+	int steer_qpn_count;
+	int steer_qpn_base;
+	int steering_support;
+	struct mlx4_ib_qp      *qp1_proxy[MLX4_MAX_PORTS];
+	/* lock when destroying qp1_proxy and getting netdev events */
+	struct mutex		qp1_proxy_lock[MLX4_MAX_PORTS];
+};
+
+struct ib_event_work {
+	struct work_struct	work;
+	struct mlx4_ib_dev	*ib_dev;
+	struct mlx4_eqe		ib_eqe;
+};
+
+struct mlx4_ib_qp_tunnel_init_attr {
+	struct ib_qp_init_attr init_attr;
+	int slave;
+	enum ib_qp_type proxy_qp_type;
+	u8 port;
+};
+
+static inline struct mlx4_ib_dev *to_mdev(struct ib_device *ibdev)
+{
+	return container_of(ibdev, struct mlx4_ib_dev, ib_dev);
+}
+
+static inline struct mlx4_ib_ucontext *to_mucontext(struct ib_ucontext *ibucontext)
+{
+	return container_of(ibucontext, struct mlx4_ib_ucontext, ibucontext);
+}
+
+static inline struct mlx4_ib_pd *to_mpd(struct ib_pd *ibpd)
+{
+	return container_of(ibpd, struct mlx4_ib_pd, ibpd);
+}
+
+static inline struct mlx4_ib_xrcd *to_mxrcd(struct ib_xrcd *ibxrcd)
+{
+	return container_of(ibxrcd, struct mlx4_ib_xrcd, ibxrcd);
+}
+
+static inline struct mlx4_ib_cq *to_mcq(struct ib_cq *ibcq)
+{
+	return container_of(ibcq, struct mlx4_ib_cq, ibcq);
+}
+
+static inline struct mlx4_ib_cq *to_mibcq(struct mlx4_cq *mcq)
+{
+	return container_of(mcq, struct mlx4_ib_cq, mcq);
+}
+
+static inline struct mlx4_ib_mr *to_mmr(struct ib_mr *ibmr)
+{
+	return container_of(ibmr, struct mlx4_ib_mr, ibmr);
+}
+
+static inline struct mlx4_ib_mw *to_mmw(struct ib_mw *ibmw)
+{
+	return container_of(ibmw, struct mlx4_ib_mw, ibmw);
+}
+
+static inline struct mlx4_ib_fast_reg_page_list *to_mfrpl(struct ib_fast_reg_page_list *ibfrpl)
+{
+	return container_of(ibfrpl, struct mlx4_ib_fast_reg_page_list, ibfrpl);
+}
+
+static inline struct mlx4_ib_fmr *to_mfmr(struct ib_fmr *ibfmr)
+{
+	return container_of(ibfmr, struct mlx4_ib_fmr, ibfmr);
+}
+
+static inline struct mlx4_ib_flow *to_mflow(struct ib_flow *ibflow)
+{
+	return container_of(ibflow, struct mlx4_ib_flow, ibflow);
+}
+
+static inline struct mlx4_ib_qp *to_mqp(struct ib_qp *ibqp)
+{
+	return container_of(ibqp, struct mlx4_ib_qp, ibqp);
+}
+
+static inline struct mlx4_ib_qp *to_mibqp(struct mlx4_qp *mqp)
+{
+	return container_of(mqp, struct mlx4_ib_qp, mqp);
+}
+
+static inline struct mlx4_ib_srq *to_msrq(struct ib_srq *ibsrq)
+{
+	return container_of(ibsrq, struct mlx4_ib_srq, ibsrq);
+}
+
+static inline struct mlx4_ib_srq *to_mibsrq(struct mlx4_srq *msrq)
+{
+	return container_of(msrq, struct mlx4_ib_srq, msrq);
+}
+
+static inline struct mlx4_ib_ah *to_mah(struct ib_ah *ibah)
+{
+	return container_of(ibah, struct mlx4_ib_ah, ibah);
+}
+
+int mlx4_ib_init_sriov(struct mlx4_ib_dev *dev);
+void mlx4_ib_close_sriov(struct mlx4_ib_dev *dev);
+
+int mlx4_ib_db_map_user(struct mlx4_ib_ucontext *context, unsigned long virt,
+			struct mlx4_db *db);
+void mlx4_ib_db_unmap_user(struct mlx4_ib_ucontext *context, struct mlx4_db *db);
+
+struct ib_mr *mlx4_ib_get_dma_mr(struct ib_pd *pd, int acc);
+int mlx4_ib_umem_write_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt,
+			   struct ib_umem *umem);
+struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+				  u64 virt_addr, int access_flags,
+				  struct ib_udata *udata);
+int mlx4_ib_dereg_mr(struct ib_mr *mr);
+struct ib_mw *mlx4_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type);
+int mlx4_ib_bind_mw(struct ib_qp *qp, struct ib_mw *mw,
+		    struct ib_mw_bind *mw_bind);
+int mlx4_ib_dealloc_mw(struct ib_mw *mw);
+struct ib_mr *mlx4_ib_alloc_fast_reg_mr(struct ib_pd *pd,
+					int max_page_list_len);
+struct ib_fast_reg_page_list *mlx4_ib_alloc_fast_reg_page_list(struct ib_device *ibdev,
+							       int page_list_len);
+void mlx4_ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list);
+
+int mlx4_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period);
+int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata);
+struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, int entries, int vector,
+				struct ib_ucontext *context,
+				struct ib_udata *udata);
+int mlx4_ib_destroy_cq(struct ib_cq *cq);
+int mlx4_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc);
+int mlx4_ib_arm_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags);
+void __mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq);
+void mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq);
+
+struct ib_ah *mlx4_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr);
+int mlx4_ib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr);
+int mlx4_ib_destroy_ah(struct ib_ah *ah);
+
+struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd,
+				  struct ib_srq_init_attr *init_attr,
+				  struct ib_udata *udata);
+int mlx4_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
+		       enum ib_srq_attr_mask attr_mask, struct ib_udata *udata);
+int mlx4_ib_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr);
+int mlx4_ib_destroy_srq(struct ib_srq *srq);
+void mlx4_ib_free_srq_wqe(struct mlx4_ib_srq *srq, int wqe_index);
+int mlx4_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
+			  struct ib_recv_wr **bad_wr);
+
+struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd,
+				struct ib_qp_init_attr *init_attr,
+				struct ib_udata *udata);
+int mlx4_ib_destroy_qp(struct ib_qp *qp);
+int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+		      int attr_mask, struct ib_udata *udata);
+int mlx4_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask,
+		     struct ib_qp_init_attr *qp_init_attr);
+int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+		      struct ib_send_wr **bad_wr);
+int mlx4_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+		      struct ib_recv_wr **bad_wr);
+
+int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int mad_ifc_flags,
+		 int port, struct ib_wc *in_wc, struct ib_grh *in_grh,
+		 void *in_mad, void *response_mad);
+int mlx4_ib_process_mad(struct ib_device *ibdev, int mad_flags,	u8 port_num,
+			struct ib_wc *in_wc, struct ib_grh *in_grh,
+			struct ib_mad *in_mad, struct ib_mad *out_mad);
+int mlx4_ib_mad_init(struct mlx4_ib_dev *dev);
+void mlx4_ib_mad_cleanup(struct mlx4_ib_dev *dev);
+
+struct ib_fmr *mlx4_ib_fmr_alloc(struct ib_pd *pd, int mr_access_flags,
+				  struct ib_fmr_attr *fmr_attr);
+int mlx4_ib_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, int npages,
+			 u64 iova);
+int mlx4_ib_unmap_fmr(struct list_head *fmr_list);
+int mlx4_ib_fmr_dealloc(struct ib_fmr *fmr);
+int __mlx4_ib_query_port(struct ib_device *ibdev, u8 port,
+			 struct ib_port_attr *props, int netw_view);
+int __mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
+			 u16 *pkey, int netw_view);
+
+int __mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index,
+			union ib_gid *gid, int netw_view);
+
+static inline bool mlx4_ib_ah_grh_present(struct mlx4_ib_ah *ah)
+{
+	u8 port = be32_to_cpu(ah->av.ib.port_pd) >> 24 & 3;
+
+	if (rdma_port_get_link_layer(ah->ibah.device, port) == IB_LINK_LAYER_ETHERNET)
+		return true;
+
+	return !!(ah->av.ib.g_slid & 0x80);
+}
+
+int mlx4_ib_mcg_port_init(struct mlx4_ib_demux_ctx *ctx);
+void mlx4_ib_mcg_port_cleanup(struct mlx4_ib_demux_ctx *ctx, int destroy_wq);
+void clean_vf_mcast(struct mlx4_ib_demux_ctx *ctx, int slave);
+int mlx4_ib_mcg_init(void);
+void mlx4_ib_mcg_destroy(void);
+
+int mlx4_ib_find_real_gid(struct ib_device *ibdev, u8 port, __be64 guid);
+
+int mlx4_ib_mcg_multiplex_handler(struct ib_device *ibdev, int port, int slave,
+				  struct ib_sa_mad *sa_mad);
+int mlx4_ib_mcg_demux_handler(struct ib_device *ibdev, int port, int slave,
+			      struct ib_sa_mad *mad);
+
+int mlx4_ib_add_mc(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp,
+		   union ib_gid *gid);
+
+void mlx4_ib_dispatch_event(struct mlx4_ib_dev *dev, u8 port_num,
+			    enum ib_event_type type);
+
+void mlx4_ib_tunnels_update_work(struct work_struct *work);
+
+int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port,
+			  enum ib_qp_type qpt, struct ib_wc *wc,
+			  struct ib_grh *grh, struct ib_mad *mad);
+
+int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port,
+			 enum ib_qp_type dest_qpt, u16 pkey_index, u32 remote_qpn,
+			 u32 qkey, struct ib_ah_attr *attr, u8 *s_mac,
+			 struct ib_mad *mad);
+
+__be64 mlx4_ib_get_new_demux_tid(struct mlx4_ib_demux_ctx *ctx);
+
+int mlx4_ib_demux_cm_handler(struct ib_device *ibdev, int port, int *slave,
+		struct ib_mad *mad);
+
+int mlx4_ib_multiplex_cm_handler(struct ib_device *ibdev, int port, int slave_id,
+		struct ib_mad *mad);
+
+void mlx4_ib_cm_paravirt_init(struct mlx4_ib_dev *dev);
+void mlx4_ib_cm_paravirt_clean(struct mlx4_ib_dev *dev, int slave_id);
+
+/* alias guid support */
+void mlx4_ib_init_alias_guid_work(struct mlx4_ib_dev *dev, int port);
+int mlx4_ib_init_alias_guid_service(struct mlx4_ib_dev *dev);
+void mlx4_ib_destroy_alias_guid_service(struct mlx4_ib_dev *dev);
+void mlx4_ib_invalidate_all_guid_record(struct mlx4_ib_dev *dev, int port);
+
+void mlx4_ib_notify_slaves_on_guid_change(struct mlx4_ib_dev *dev,
+					  int block_num,
+					  u8 port_num, u8 *p_data);
+
+void mlx4_ib_update_cache_on_guid_change(struct mlx4_ib_dev *dev,
+					 int block_num, u8 port_num,
+					 u8 *p_data);
+
+int add_sysfs_port_mcg_attr(struct mlx4_ib_dev *device, int port_num,
+			    struct attribute *attr);
+void del_sysfs_port_mcg_attr(struct mlx4_ib_dev *device, int port_num,
+			     struct attribute *attr);
+ib_sa_comp_mask mlx4_ib_get_aguid_comp_mask_from_ix(int index);
+
+int mlx4_ib_device_register_sysfs(struct mlx4_ib_dev *device) ;
+
+void mlx4_ib_device_unregister_sysfs(struct mlx4_ib_dev *device);
+
+__be64 mlx4_ib_gen_node_guid(void);
+
+int mlx4_ib_steer_qp_alloc(struct mlx4_ib_dev *dev, int count, int *qpn);
+void mlx4_ib_steer_qp_free(struct mlx4_ib_dev *dev, u32 qpn, int count);
+int mlx4_ib_steer_qp_reg(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp,
+			 int is_attach);
+int mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags,
+			  u64 start, u64 length, u64 virt_addr,
+			  int mr_access_flags, struct ib_pd *pd,
+			  struct ib_udata *udata);
+
+#endif /* MLX4_IB_H */
diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c
new file mode 100644
index 0000000..8f9325c
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/mr.c
@@ -0,0 +1,524 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/slab.h>
+
+#include "mlx4_ib.h"
+
+static u32 convert_access(int acc)
+{
+	return (acc & IB_ACCESS_REMOTE_ATOMIC ? MLX4_PERM_ATOMIC       : 0) |
+	       (acc & IB_ACCESS_REMOTE_WRITE  ? MLX4_PERM_REMOTE_WRITE : 0) |
+	       (acc & IB_ACCESS_REMOTE_READ   ? MLX4_PERM_REMOTE_READ  : 0) |
+	       (acc & IB_ACCESS_LOCAL_WRITE   ? MLX4_PERM_LOCAL_WRITE  : 0) |
+	       (acc & IB_ACCESS_MW_BIND	      ? MLX4_PERM_BIND_MW      : 0) |
+	       MLX4_PERM_LOCAL_READ;
+}
+
+static enum mlx4_mw_type to_mlx4_type(enum ib_mw_type type)
+{
+	switch (type) {
+	case IB_MW_TYPE_1:	return MLX4_MW_TYPE_1;
+	case IB_MW_TYPE_2:	return MLX4_MW_TYPE_2;
+	default:		return -1;
+	}
+}
+
+struct ib_mr *mlx4_ib_get_dma_mr(struct ib_pd *pd, int acc)
+{
+	struct mlx4_ib_mr *mr;
+	int err;
+
+	mr = kmalloc(sizeof *mr, GFP_KERNEL);
+	if (!mr)
+		return ERR_PTR(-ENOMEM);
+
+	err = mlx4_mr_alloc(to_mdev(pd->device)->dev, to_mpd(pd)->pdn, 0,
+			    ~0ull, convert_access(acc), 0, 0, &mr->mmr);
+	if (err)
+		goto err_free;
+
+	err = mlx4_mr_enable(to_mdev(pd->device)->dev, &mr->mmr);
+	if (err)
+		goto err_mr;
+
+	mr->ibmr.rkey = mr->ibmr.lkey = mr->mmr.key;
+	mr->umem = NULL;
+
+	return &mr->ibmr;
+
+err_mr:
+	(void) mlx4_mr_free(to_mdev(pd->device)->dev, &mr->mmr);
+
+err_free:
+	kfree(mr);
+
+	return ERR_PTR(err);
+}
+
+int mlx4_ib_umem_write_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt,
+			   struct ib_umem *umem)
+{
+	u64 *pages;
+	int i, k, entry;
+	int n;
+	int len;
+	int err = 0;
+	struct scatterlist *sg;
+
+	pages = (u64 *) __get_free_page(GFP_KERNEL);
+	if (!pages)
+		return -ENOMEM;
+
+	i = n = 0;
+
+	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
+		len = sg_dma_len(sg) >> mtt->page_shift;
+		for (k = 0; k < len; ++k) {
+			pages[i++] = sg_dma_address(sg) +
+				umem->page_size * k;
+			/*
+			 * Be friendly to mlx4_write_mtt() and
+			 * pass it chunks of appropriate size.
+			 */
+			if (i == PAGE_SIZE / sizeof (u64)) {
+				err = mlx4_write_mtt(dev->dev, mtt, n,
+						     i, pages);
+				if (err)
+					goto out;
+				n += i;
+				i = 0;
+			}
+		}
+	}
+
+	if (i)
+		err = mlx4_write_mtt(dev->dev, mtt, n, i, pages);
+
+out:
+	free_page((unsigned long) pages);
+	return err;
+}
+
+struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+				  u64 virt_addr, int access_flags,
+				  struct ib_udata *udata)
+{
+	struct mlx4_ib_dev *dev = to_mdev(pd->device);
+	struct mlx4_ib_mr *mr;
+	int shift;
+	int err;
+	int n;
+
+	mr = kmalloc(sizeof *mr, GFP_KERNEL);
+	if (!mr)
+		return ERR_PTR(-ENOMEM);
+
+	/* Force registering the memory as writable. */
+	/* Used for memory re-registeration. HCA protects the access */
+	mr->umem = ib_umem_get(pd->uobject->context, start, length,
+			       access_flags | IB_ACCESS_LOCAL_WRITE, 0);
+	if (IS_ERR(mr->umem)) {
+		err = PTR_ERR(mr->umem);
+		goto err_free;
+	}
+
+	n = ib_umem_page_count(mr->umem);
+	shift = ilog2(mr->umem->page_size);
+
+	err = mlx4_mr_alloc(dev->dev, to_mpd(pd)->pdn, virt_addr, length,
+			    convert_access(access_flags), n, shift, &mr->mmr);
+	if (err)
+		goto err_umem;
+
+	err = mlx4_ib_umem_write_mtt(dev, &mr->mmr.mtt, mr->umem);
+	if (err)
+		goto err_mr;
+
+	err = mlx4_mr_enable(dev->dev, &mr->mmr);
+	if (err)
+		goto err_mr;
+
+	mr->ibmr.rkey = mr->ibmr.lkey = mr->mmr.key;
+
+	return &mr->ibmr;
+
+err_mr:
+	(void) mlx4_mr_free(to_mdev(pd->device)->dev, &mr->mmr);
+
+err_umem:
+	ib_umem_release(mr->umem);
+
+err_free:
+	kfree(mr);
+
+	return ERR_PTR(err);
+}
+
+int mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags,
+			  u64 start, u64 length, u64 virt_addr,
+			  int mr_access_flags, struct ib_pd *pd,
+			  struct ib_udata *udata)
+{
+	struct mlx4_ib_dev *dev = to_mdev(mr->device);
+	struct mlx4_ib_mr *mmr = to_mmr(mr);
+	struct mlx4_mpt_entry *mpt_entry;
+	struct mlx4_mpt_entry **pmpt_entry = &mpt_entry;
+	int err;
+
+	/* Since we synchronize this call and mlx4_ib_dereg_mr via uverbs,
+	 * we assume that the calls can't run concurrently. Otherwise, a
+	 * race exists.
+	 */
+	err =  mlx4_mr_hw_get_mpt(dev->dev, &mmr->mmr, &pmpt_entry);
+
+	if (err)
+		return err;
+
+	if (flags & IB_MR_REREG_PD) {
+		err = mlx4_mr_hw_change_pd(dev->dev, *pmpt_entry,
+					   to_mpd(pd)->pdn);
+
+		if (err)
+			goto release_mpt_entry;
+	}
+
+	if (flags & IB_MR_REREG_ACCESS) {
+		err = mlx4_mr_hw_change_access(dev->dev, *pmpt_entry,
+					       convert_access(mr_access_flags));
+
+		if (err)
+			goto release_mpt_entry;
+	}
+
+	if (flags & IB_MR_REREG_TRANS) {
+		int shift;
+		int err;
+		int n;
+
+		mlx4_mr_rereg_mem_cleanup(dev->dev, &mmr->mmr);
+		ib_umem_release(mmr->umem);
+		mmr->umem = ib_umem_get(mr->uobject->context, start, length,
+					mr_access_flags |
+					IB_ACCESS_LOCAL_WRITE,
+					0);
+		if (IS_ERR(mmr->umem)) {
+			err = PTR_ERR(mmr->umem);
+			/* Prevent mlx4_ib_dereg_mr from free'ing invalid pointer */
+			mmr->umem = NULL;
+			goto release_mpt_entry;
+		}
+		n = ib_umem_page_count(mmr->umem);
+		shift = ilog2(mmr->umem->page_size);
+
+		err = mlx4_mr_rereg_mem_write(dev->dev, &mmr->mmr,
+					      virt_addr, length, n, shift,
+					      *pmpt_entry);
+		if (err) {
+			ib_umem_release(mmr->umem);
+			goto release_mpt_entry;
+		}
+		mmr->mmr.iova       = virt_addr;
+		mmr->mmr.size       = length;
+
+		err = mlx4_ib_umem_write_mtt(dev, &mmr->mmr.mtt, mmr->umem);
+		if (err) {
+			mlx4_mr_rereg_mem_cleanup(dev->dev, &mmr->mmr);
+			ib_umem_release(mmr->umem);
+			goto release_mpt_entry;
+		}
+	}
+
+	/* If we couldn't transfer the MR to the HCA, just remember to
+	 * return a failure. But dereg_mr will free the resources.
+	 */
+	err = mlx4_mr_hw_write_mpt(dev->dev, &mmr->mmr, pmpt_entry);
+	if (!err && flags & IB_MR_REREG_ACCESS)
+		mmr->mmr.access = mr_access_flags;
+
+release_mpt_entry:
+	mlx4_mr_hw_put_mpt(dev->dev, pmpt_entry);
+
+	return err;
+}
+
+int mlx4_ib_dereg_mr(struct ib_mr *ibmr)
+{
+	struct mlx4_ib_mr *mr = to_mmr(ibmr);
+	int ret;
+
+	ret = mlx4_mr_free(to_mdev(ibmr->device)->dev, &mr->mmr);
+	if (ret)
+		return ret;
+	if (mr->umem)
+		ib_umem_release(mr->umem);
+	kfree(mr);
+
+	return 0;
+}
+
+struct ib_mw *mlx4_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type)
+{
+	struct mlx4_ib_dev *dev = to_mdev(pd->device);
+	struct mlx4_ib_mw *mw;
+	int err;
+
+	mw = kmalloc(sizeof(*mw), GFP_KERNEL);
+	if (!mw)
+		return ERR_PTR(-ENOMEM);
+
+	err = mlx4_mw_alloc(dev->dev, to_mpd(pd)->pdn,
+			    to_mlx4_type(type), &mw->mmw);
+	if (err)
+		goto err_free;
+
+	err = mlx4_mw_enable(dev->dev, &mw->mmw);
+	if (err)
+		goto err_mw;
+
+	mw->ibmw.rkey = mw->mmw.key;
+
+	return &mw->ibmw;
+
+err_mw:
+	mlx4_mw_free(dev->dev, &mw->mmw);
+
+err_free:
+	kfree(mw);
+
+	return ERR_PTR(err);
+}
+
+int mlx4_ib_bind_mw(struct ib_qp *qp, struct ib_mw *mw,
+		    struct ib_mw_bind *mw_bind)
+{
+	struct ib_send_wr  wr;
+	struct ib_send_wr *bad_wr;
+	int ret;
+
+	memset(&wr, 0, sizeof(wr));
+	wr.opcode               = IB_WR_BIND_MW;
+	wr.wr_id                = mw_bind->wr_id;
+	wr.send_flags           = mw_bind->send_flags;
+	wr.wr.bind_mw.mw        = mw;
+	wr.wr.bind_mw.bind_info = mw_bind->bind_info;
+	wr.wr.bind_mw.rkey      = ib_inc_rkey(mw->rkey);
+
+	ret = mlx4_ib_post_send(qp, &wr, &bad_wr);
+	if (!ret)
+		mw->rkey = wr.wr.bind_mw.rkey;
+
+	return ret;
+}
+
+int mlx4_ib_dealloc_mw(struct ib_mw *ibmw)
+{
+	struct mlx4_ib_mw *mw = to_mmw(ibmw);
+
+	mlx4_mw_free(to_mdev(ibmw->device)->dev, &mw->mmw);
+	kfree(mw);
+
+	return 0;
+}
+
+struct ib_mr *mlx4_ib_alloc_fast_reg_mr(struct ib_pd *pd,
+					int max_page_list_len)
+{
+	struct mlx4_ib_dev *dev = to_mdev(pd->device);
+	struct mlx4_ib_mr *mr;
+	int err;
+
+	mr = kmalloc(sizeof *mr, GFP_KERNEL);
+	if (!mr)
+		return ERR_PTR(-ENOMEM);
+
+	err = mlx4_mr_alloc(dev->dev, to_mpd(pd)->pdn, 0, 0, 0,
+			    max_page_list_len, 0, &mr->mmr);
+	if (err)
+		goto err_free;
+
+	err = mlx4_mr_enable(dev->dev, &mr->mmr);
+	if (err)
+		goto err_mr;
+
+	mr->ibmr.rkey = mr->ibmr.lkey = mr->mmr.key;
+	mr->umem = NULL;
+
+	return &mr->ibmr;
+
+err_mr:
+	(void) mlx4_mr_free(dev->dev, &mr->mmr);
+
+err_free:
+	kfree(mr);
+	return ERR_PTR(err);
+}
+
+struct ib_fast_reg_page_list *mlx4_ib_alloc_fast_reg_page_list(struct ib_device *ibdev,
+							       int page_list_len)
+{
+	struct mlx4_ib_dev *dev = to_mdev(ibdev);
+	struct mlx4_ib_fast_reg_page_list *mfrpl;
+	int size = page_list_len * sizeof (u64);
+
+	if (page_list_len > MLX4_MAX_FAST_REG_PAGES)
+		return ERR_PTR(-EINVAL);
+
+	mfrpl = kmalloc(sizeof *mfrpl, GFP_KERNEL);
+	if (!mfrpl)
+		return ERR_PTR(-ENOMEM);
+
+	mfrpl->ibfrpl.page_list = kmalloc(size, GFP_KERNEL);
+	if (!mfrpl->ibfrpl.page_list)
+		goto err_free;
+
+	mfrpl->mapped_page_list = dma_alloc_coherent(&dev->dev->pdev->dev,
+						     size, &mfrpl->map,
+						     GFP_KERNEL);
+	if (!mfrpl->mapped_page_list)
+		goto err_free;
+
+	WARN_ON(mfrpl->map & 0x3f);
+
+	return &mfrpl->ibfrpl;
+
+err_free:
+	kfree(mfrpl->ibfrpl.page_list);
+	kfree(mfrpl);
+	return ERR_PTR(-ENOMEM);
+}
+
+void mlx4_ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list)
+{
+	struct mlx4_ib_dev *dev = to_mdev(page_list->device);
+	struct mlx4_ib_fast_reg_page_list *mfrpl = to_mfrpl(page_list);
+	int size = page_list->max_page_list_len * sizeof (u64);
+
+	dma_free_coherent(&dev->dev->pdev->dev, size, mfrpl->mapped_page_list,
+			  mfrpl->map);
+	kfree(mfrpl->ibfrpl.page_list);
+	kfree(mfrpl);
+}
+
+struct ib_fmr *mlx4_ib_fmr_alloc(struct ib_pd *pd, int acc,
+				 struct ib_fmr_attr *fmr_attr)
+{
+	struct mlx4_ib_dev *dev = to_mdev(pd->device);
+	struct mlx4_ib_fmr *fmr;
+	int err = -ENOMEM;
+
+	fmr = kmalloc(sizeof *fmr, GFP_KERNEL);
+	if (!fmr)
+		return ERR_PTR(-ENOMEM);
+
+	err = mlx4_fmr_alloc(dev->dev, to_mpd(pd)->pdn, convert_access(acc),
+			     fmr_attr->max_pages, fmr_attr->max_maps,
+			     fmr_attr->page_shift, &fmr->mfmr);
+	if (err)
+		goto err_free;
+
+	err = mlx4_fmr_enable(to_mdev(pd->device)->dev, &fmr->mfmr);
+	if (err)
+		goto err_mr;
+
+	fmr->ibfmr.rkey = fmr->ibfmr.lkey = fmr->mfmr.mr.key;
+
+	return &fmr->ibfmr;
+
+err_mr:
+	(void) mlx4_mr_free(to_mdev(pd->device)->dev, &fmr->mfmr.mr);
+
+err_free:
+	kfree(fmr);
+
+	return ERR_PTR(err);
+}
+
+int mlx4_ib_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list,
+		      int npages, u64 iova)
+{
+	struct mlx4_ib_fmr *ifmr = to_mfmr(ibfmr);
+	struct mlx4_ib_dev *dev = to_mdev(ifmr->ibfmr.device);
+
+	return mlx4_map_phys_fmr(dev->dev, &ifmr->mfmr, page_list, npages, iova,
+				 &ifmr->ibfmr.lkey, &ifmr->ibfmr.rkey);
+}
+
+int mlx4_ib_unmap_fmr(struct list_head *fmr_list)
+{
+	struct ib_fmr *ibfmr;
+	int err;
+	struct mlx4_dev *mdev = NULL;
+
+	list_for_each_entry(ibfmr, fmr_list, list) {
+		if (mdev && to_mdev(ibfmr->device)->dev != mdev)
+			return -EINVAL;
+		mdev = to_mdev(ibfmr->device)->dev;
+	}
+
+	if (!mdev)
+		return 0;
+
+	list_for_each_entry(ibfmr, fmr_list, list) {
+		struct mlx4_ib_fmr *ifmr = to_mfmr(ibfmr);
+
+		mlx4_fmr_unmap(mdev, &ifmr->mfmr, &ifmr->ibfmr.lkey, &ifmr->ibfmr.rkey);
+	}
+
+	/*
+	 * Make sure all MPT status updates are visible before issuing
+	 * SYNC_TPT firmware command.
+	 */
+	wmb();
+
+	err = mlx4_SYNC_TPT(mdev);
+	if (err)
+		pr_warn("SYNC_TPT error %d when "
+		       "unmapping FMRs\n", err);
+
+	return 0;
+}
+
+int mlx4_ib_fmr_dealloc(struct ib_fmr *ibfmr)
+{
+	struct mlx4_ib_fmr *ifmr = to_mfmr(ibfmr);
+	struct mlx4_ib_dev *dev = to_mdev(ibfmr->device);
+	int err;
+
+	err = mlx4_fmr_free(dev->dev, &ifmr->mfmr);
+
+	if (!err)
+		kfree(ifmr);
+
+	return err;
+}
diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
new file mode 100644
index 0000000..9c5150c
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -0,0 +1,3151 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/log2.h>
+#include <linux/slab.h>
+#include <linux/netdevice.h>
+
+#include <rdma/ib_cache.h>
+#include <rdma/ib_pack.h>
+#include <rdma/ib_addr.h>
+#include <rdma/ib_mad.h>
+
+#include <linux/mlx4/qp.h>
+
+#include "mlx4_ib.h"
+#include "user.h"
+
+enum {
+	MLX4_IB_ACK_REQ_FREQ	= 8,
+};
+
+enum {
+	MLX4_IB_DEFAULT_SCHED_QUEUE	= 0x83,
+	MLX4_IB_DEFAULT_QP0_SCHED_QUEUE	= 0x3f,
+	MLX4_IB_LINK_TYPE_IB		= 0,
+	MLX4_IB_LINK_TYPE_ETH		= 1
+};
+
+enum {
+	/*
+	 * Largest possible UD header: send with GRH and immediate
+	 * data plus 18 bytes for an Ethernet header with VLAN/802.1Q
+	 * tag.  (LRH would only use 8 bytes, so Ethernet is the
+	 * biggest case)
+	 */
+	MLX4_IB_UD_HEADER_SIZE		= 82,
+	MLX4_IB_LSO_HEADER_SPARE	= 128,
+};
+
+enum {
+	MLX4_IB_IBOE_ETHERTYPE		= 0x8915
+};
+
+struct mlx4_ib_sqp {
+	struct mlx4_ib_qp	qp;
+	int			pkey_index;
+	u32			qkey;
+	u32			send_psn;
+	struct ib_ud_header	ud_header;
+	u8			header_buf[MLX4_IB_UD_HEADER_SIZE];
+};
+
+enum {
+	MLX4_IB_MIN_SQ_STRIDE	= 6,
+	MLX4_IB_CACHE_LINE_SIZE	= 64,
+};
+
+enum {
+	MLX4_RAW_QP_MTU		= 7,
+	MLX4_RAW_QP_MSGMAX	= 31,
+};
+
+#ifndef ETH_ALEN
+#define ETH_ALEN        6
+#endif
+static inline u64 mlx4_mac_to_u64(u8 *addr)
+{
+	u64 mac = 0;
+	int i;
+
+	for (i = 0; i < ETH_ALEN; i++) {
+		mac <<= 8;
+		mac |= addr[i];
+	}
+	return mac;
+}
+
+static const __be32 mlx4_ib_opcode[] = {
+	[IB_WR_SEND]				= cpu_to_be32(MLX4_OPCODE_SEND),
+	[IB_WR_LSO]				= cpu_to_be32(MLX4_OPCODE_LSO),
+	[IB_WR_SEND_WITH_IMM]			= cpu_to_be32(MLX4_OPCODE_SEND_IMM),
+	[IB_WR_RDMA_WRITE]			= cpu_to_be32(MLX4_OPCODE_RDMA_WRITE),
+	[IB_WR_RDMA_WRITE_WITH_IMM]		= cpu_to_be32(MLX4_OPCODE_RDMA_WRITE_IMM),
+	[IB_WR_RDMA_READ]			= cpu_to_be32(MLX4_OPCODE_RDMA_READ),
+	[IB_WR_ATOMIC_CMP_AND_SWP]		= cpu_to_be32(MLX4_OPCODE_ATOMIC_CS),
+	[IB_WR_ATOMIC_FETCH_AND_ADD]		= cpu_to_be32(MLX4_OPCODE_ATOMIC_FA),
+	[IB_WR_SEND_WITH_INV]			= cpu_to_be32(MLX4_OPCODE_SEND_INVAL),
+	[IB_WR_LOCAL_INV]			= cpu_to_be32(MLX4_OPCODE_LOCAL_INVAL),
+	[IB_WR_FAST_REG_MR]			= cpu_to_be32(MLX4_OPCODE_FMR),
+	[IB_WR_MASKED_ATOMIC_CMP_AND_SWP]	= cpu_to_be32(MLX4_OPCODE_MASKED_ATOMIC_CS),
+	[IB_WR_MASKED_ATOMIC_FETCH_AND_ADD]	= cpu_to_be32(MLX4_OPCODE_MASKED_ATOMIC_FA),
+	[IB_WR_BIND_MW]				= cpu_to_be32(MLX4_OPCODE_BIND_MW),
+};
+
+static struct mlx4_ib_sqp *to_msqp(struct mlx4_ib_qp *mqp)
+{
+	return container_of(mqp, struct mlx4_ib_sqp, qp);
+}
+
+static int is_tunnel_qp(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
+{
+	if (!mlx4_is_master(dev->dev))
+		return 0;
+
+	return qp->mqp.qpn >= dev->dev->phys_caps.base_tunnel_sqpn &&
+	       qp->mqp.qpn < dev->dev->phys_caps.base_tunnel_sqpn +
+		8 * MLX4_MFUNC_MAX;
+}
+
+static int is_sqp(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
+{
+	int proxy_sqp = 0;
+	int real_sqp = 0;
+	int i;
+	/* PPF or Native -- real SQP */
+	real_sqp = ((mlx4_is_master(dev->dev) || !mlx4_is_mfunc(dev->dev)) &&
+		    qp->mqp.qpn >= dev->dev->phys_caps.base_sqpn &&
+		    qp->mqp.qpn <= dev->dev->phys_caps.base_sqpn + 3);
+	if (real_sqp)
+		return 1;
+	/* VF or PF -- proxy SQP */
+	if (mlx4_is_mfunc(dev->dev)) {
+		for (i = 0; i < dev->dev->caps.num_ports; i++) {
+			if (qp->mqp.qpn == dev->dev->caps.qp0_proxy[i] ||
+			    qp->mqp.qpn == dev->dev->caps.qp1_proxy[i]) {
+				proxy_sqp = 1;
+				break;
+			}
+		}
+	}
+	return proxy_sqp;
+}
+
+/* used for INIT/CLOSE port logic */
+static int is_qp0(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
+{
+	int proxy_qp0 = 0;
+	int real_qp0 = 0;
+	int i;
+	/* PPF or Native -- real QP0 */
+	real_qp0 = ((mlx4_is_master(dev->dev) || !mlx4_is_mfunc(dev->dev)) &&
+		    qp->mqp.qpn >= dev->dev->phys_caps.base_sqpn &&
+		    qp->mqp.qpn <= dev->dev->phys_caps.base_sqpn + 1);
+	if (real_qp0)
+		return 1;
+	/* VF or PF -- proxy QP0 */
+	if (mlx4_is_mfunc(dev->dev)) {
+		for (i = 0; i < dev->dev->caps.num_ports; i++) {
+			if (qp->mqp.qpn == dev->dev->caps.qp0_proxy[i]) {
+				proxy_qp0 = 1;
+				break;
+			}
+		}
+	}
+	return proxy_qp0;
+}
+
+static void *get_wqe(struct mlx4_ib_qp *qp, int offset)
+{
+	return mlx4_buf_offset(&qp->buf, offset);
+}
+
+static void *get_recv_wqe(struct mlx4_ib_qp *qp, int n)
+{
+	return get_wqe(qp, qp->rq.offset + (n << qp->rq.wqe_shift));
+}
+
+static void *get_send_wqe(struct mlx4_ib_qp *qp, int n)
+{
+	return get_wqe(qp, qp->sq.offset + (n << qp->sq.wqe_shift));
+}
+
+/*
+ * Stamp a SQ WQE so that it is invalid if prefetched by marking the
+ * first four bytes of every 64 byte chunk with
+ *     0x7FFFFFF | (invalid_ownership_value << 31).
+ *
+ * When the max work request size is less than or equal to the WQE
+ * basic block size, as an optimization, we can stamp all WQEs with
+ * 0xffffffff, and skip the very first chunk of each WQE.
+ */
+static void stamp_send_wqe(struct mlx4_ib_qp *qp, int n, int size)
+{
+	__be32 *wqe;
+	int i;
+	int s;
+	int ind;
+	void *buf;
+	__be32 stamp;
+	struct mlx4_wqe_ctrl_seg *ctrl;
+
+	if (qp->sq_max_wqes_per_wr > 1) {
+		s = roundup(size, 1U << qp->sq.wqe_shift);
+		for (i = 0; i < s; i += 64) {
+			ind = (i >> qp->sq.wqe_shift) + n;
+			stamp = ind & qp->sq.wqe_cnt ? cpu_to_be32(0x7fffffff) :
+						       cpu_to_be32(0xffffffff);
+			buf = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
+			wqe = buf + (i & ((1 << qp->sq.wqe_shift) - 1));
+			*wqe = stamp;
+		}
+	} else {
+		ctrl = buf = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1));
+		s = (ctrl->fence_size & 0x3f) << 4;
+		for (i = 64; i < s; i += 64) {
+			wqe = buf + i;
+			*wqe = cpu_to_be32(0xffffffff);
+		}
+	}
+}
+
+static void post_nop_wqe(struct mlx4_ib_qp *qp, int n, int size)
+{
+	struct mlx4_wqe_ctrl_seg *ctrl;
+	struct mlx4_wqe_inline_seg *inl;
+	void *wqe;
+	int s;
+
+	ctrl = wqe = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1));
+	s = sizeof(struct mlx4_wqe_ctrl_seg);
+
+	if (qp->ibqp.qp_type == IB_QPT_UD) {
+		struct mlx4_wqe_datagram_seg *dgram = wqe + sizeof *ctrl;
+		struct mlx4_av *av = (struct mlx4_av *)dgram->av;
+		memset(dgram, 0, sizeof *dgram);
+		av->port_pd = cpu_to_be32((qp->port << 24) | to_mpd(qp->ibqp.pd)->pdn);
+		s += sizeof(struct mlx4_wqe_datagram_seg);
+	}
+
+	/* Pad the remainder of the WQE with an inline data segment. */
+	if (size > s) {
+		inl = wqe + s;
+		inl->byte_count = cpu_to_be32(1 << 31 | (size - s - sizeof *inl));
+	}
+	ctrl->srcrb_flags = 0;
+	ctrl->fence_size = size / 16;
+	/*
+	 * Make sure descriptor is fully written before setting ownership bit
+	 * (because HW can start executing as soon as we do).
+	 */
+	wmb();
+
+	ctrl->owner_opcode = cpu_to_be32(MLX4_OPCODE_NOP | MLX4_WQE_CTRL_NEC) |
+		(n & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0);
+
+	stamp_send_wqe(qp, n + qp->sq_spare_wqes, size);
+}
+
+/* Post NOP WQE to prevent wrap-around in the middle of WR */
+static inline unsigned pad_wraparound(struct mlx4_ib_qp *qp, int ind)
+{
+	unsigned s = qp->sq.wqe_cnt - (ind & (qp->sq.wqe_cnt - 1));
+	if (unlikely(s < qp->sq_max_wqes_per_wr)) {
+		post_nop_wqe(qp, ind, s << qp->sq.wqe_shift);
+		ind += s;
+	}
+	return ind;
+}
+
+static void mlx4_ib_qp_event(struct mlx4_qp *qp, enum mlx4_event type)
+{
+	struct ib_event event;
+	struct ib_qp *ibqp = &to_mibqp(qp)->ibqp;
+
+	if (type == MLX4_EVENT_TYPE_PATH_MIG)
+		to_mibqp(qp)->port = to_mibqp(qp)->alt_port;
+
+	if (ibqp->event_handler) {
+		event.device     = ibqp->device;
+		event.element.qp = ibqp;
+		switch (type) {
+		case MLX4_EVENT_TYPE_PATH_MIG:
+			event.event = IB_EVENT_PATH_MIG;
+			break;
+		case MLX4_EVENT_TYPE_COMM_EST:
+			event.event = IB_EVENT_COMM_EST;
+			break;
+		case MLX4_EVENT_TYPE_SQ_DRAINED:
+			event.event = IB_EVENT_SQ_DRAINED;
+			break;
+		case MLX4_EVENT_TYPE_SRQ_QP_LAST_WQE:
+			event.event = IB_EVENT_QP_LAST_WQE_REACHED;
+			break;
+		case MLX4_EVENT_TYPE_WQ_CATAS_ERROR:
+			event.event = IB_EVENT_QP_FATAL;
+			break;
+		case MLX4_EVENT_TYPE_PATH_MIG_FAILED:
+			event.event = IB_EVENT_PATH_MIG_ERR;
+			break;
+		case MLX4_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
+			event.event = IB_EVENT_QP_REQ_ERR;
+			break;
+		case MLX4_EVENT_TYPE_WQ_ACCESS_ERROR:
+			event.event = IB_EVENT_QP_ACCESS_ERR;
+			break;
+		default:
+			pr_warn("Unexpected event type %d "
+			       "on QP %06x\n", type, qp->qpn);
+			return;
+		}
+
+		ibqp->event_handler(&event, ibqp->qp_context);
+	}
+}
+
+static int send_wqe_overhead(enum mlx4_ib_qp_type type, u32 flags)
+{
+	/*
+	 * UD WQEs must have a datagram segment.
+	 * RC and UC WQEs might have a remote address segment.
+	 * MLX WQEs need two extra inline data segments (for the UD
+	 * header and space for the ICRC).
+	 */
+	switch (type) {
+	case MLX4_IB_QPT_UD:
+		return sizeof (struct mlx4_wqe_ctrl_seg) +
+			sizeof (struct mlx4_wqe_datagram_seg) +
+			((flags & MLX4_IB_QP_LSO) ? MLX4_IB_LSO_HEADER_SPARE : 0);
+	case MLX4_IB_QPT_PROXY_SMI_OWNER:
+	case MLX4_IB_QPT_PROXY_SMI:
+	case MLX4_IB_QPT_PROXY_GSI:
+		return sizeof (struct mlx4_wqe_ctrl_seg) +
+			sizeof (struct mlx4_wqe_datagram_seg) + 64;
+	case MLX4_IB_QPT_TUN_SMI_OWNER:
+	case MLX4_IB_QPT_TUN_GSI:
+		return sizeof (struct mlx4_wqe_ctrl_seg) +
+			sizeof (struct mlx4_wqe_datagram_seg);
+
+	case MLX4_IB_QPT_UC:
+		return sizeof (struct mlx4_wqe_ctrl_seg) +
+			sizeof (struct mlx4_wqe_raddr_seg);
+	case MLX4_IB_QPT_RC:
+		return sizeof (struct mlx4_wqe_ctrl_seg) +
+			sizeof (struct mlx4_wqe_atomic_seg) +
+			sizeof (struct mlx4_wqe_raddr_seg);
+	case MLX4_IB_QPT_SMI:
+	case MLX4_IB_QPT_GSI:
+		return sizeof (struct mlx4_wqe_ctrl_seg) +
+			ALIGN(MLX4_IB_UD_HEADER_SIZE +
+			      DIV_ROUND_UP(MLX4_IB_UD_HEADER_SIZE,
+					   MLX4_INLINE_ALIGN) *
+			      sizeof (struct mlx4_wqe_inline_seg),
+			      sizeof (struct mlx4_wqe_data_seg)) +
+			ALIGN(4 +
+			      sizeof (struct mlx4_wqe_inline_seg),
+			      sizeof (struct mlx4_wqe_data_seg));
+	default:
+		return sizeof (struct mlx4_wqe_ctrl_seg);
+	}
+}
+
+static int set_rq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
+		       int is_user, int has_rq, struct mlx4_ib_qp *qp)
+{
+	/* Sanity check RQ size before proceeding */
+	if (cap->max_recv_wr > dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE ||
+	    cap->max_recv_sge > min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg))
+		return -EINVAL;
+
+	if (!has_rq) {
+		if (cap->max_recv_wr)
+			return -EINVAL;
+
+		qp->rq.wqe_cnt = qp->rq.max_gs = 0;
+	} else {
+		/* HW requires >= 1 RQ entry with >= 1 gather entry */
+		if (is_user && (!cap->max_recv_wr || !cap->max_recv_sge))
+			return -EINVAL;
+
+		qp->rq.wqe_cnt	 = roundup_pow_of_two(max(1U, cap->max_recv_wr));
+		qp->rq.max_gs	 = roundup_pow_of_two(max(1U, cap->max_recv_sge));
+		qp->rq.wqe_shift = ilog2(qp->rq.max_gs * sizeof (struct mlx4_wqe_data_seg));
+	}
+
+	/* leave userspace return values as they were, so as not to break ABI */
+	if (is_user) {
+		cap->max_recv_wr  = qp->rq.max_post = qp->rq.wqe_cnt;
+		cap->max_recv_sge = qp->rq.max_gs;
+	} else {
+		cap->max_recv_wr  = qp->rq.max_post =
+			min(dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE, qp->rq.wqe_cnt);
+		cap->max_recv_sge = min(qp->rq.max_gs,
+					min(dev->dev->caps.max_sq_sg,
+					    dev->dev->caps.max_rq_sg));
+	}
+
+	return 0;
+}
+
+static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
+			      enum mlx4_ib_qp_type type, struct mlx4_ib_qp *qp)
+{
+	int s;
+
+	/* Sanity check SQ size before proceeding */
+	if (cap->max_send_wr  > (dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE) ||
+	    cap->max_send_sge > min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg) ||
+	    cap->max_inline_data + send_wqe_overhead(type, qp->flags) +
+	    sizeof (struct mlx4_wqe_inline_seg) > dev->dev->caps.max_sq_desc_sz)
+		return -EINVAL;
+
+	/*
+	 * For MLX transport we need 2 extra S/G entries:
+	 * one for the header and one for the checksum at the end
+	 */
+	if ((type == MLX4_IB_QPT_SMI || type == MLX4_IB_QPT_GSI ||
+	     type & (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_TUN_SMI_OWNER)) &&
+	    cap->max_send_sge + 2 > dev->dev->caps.max_sq_sg)
+		return -EINVAL;
+
+	s = max(cap->max_send_sge * sizeof (struct mlx4_wqe_data_seg),
+		cap->max_inline_data + sizeof (struct mlx4_wqe_inline_seg)) +
+		send_wqe_overhead(type, qp->flags);
+
+	if (s > dev->dev->caps.max_sq_desc_sz)
+		return -EINVAL;
+
+	/*
+	 * Hermon supports shrinking WQEs, such that a single work
+	 * request can include multiple units of 1 << wqe_shift.  This
+	 * way, work requests can differ in size, and do not have to
+	 * be a power of 2 in size, saving memory and speeding up send
+	 * WR posting.  Unfortunately, if we do this then the
+	 * wqe_index field in CQEs can't be used to look up the WR ID
+	 * anymore, so we do this only if selective signaling is off.
+	 *
+	 * Further, on 32-bit platforms, we can't use vmap() to make
+	 * the QP buffer virtually contiguous.  Thus we have to use
+	 * constant-sized WRs to make sure a WR is always fully within
+	 * a single page-sized chunk.
+	 *
+	 * Finally, we use NOP work requests to pad the end of the
+	 * work queue, to avoid wrap-around in the middle of WR.  We
+	 * set NEC bit to avoid getting completions with error for
+	 * these NOP WRs, but since NEC is only supported starting
+	 * with firmware 2.2.232, we use constant-sized WRs for older
+	 * firmware.
+	 *
+	 * And, since MLX QPs only support SEND, we use constant-sized
+	 * WRs in this case.
+	 *
+	 * We look for the smallest value of wqe_shift such that the
+	 * resulting number of wqes does not exceed device
+	 * capabilities.
+	 *
+	 * We set WQE size to at least 64 bytes, this way stamping
+	 * invalidates each WQE.
+	 */
+	if (dev->dev->caps.fw_ver >= MLX4_FW_VER_WQE_CTRL_NEC &&
+	    qp->sq_signal_bits && BITS_PER_LONG == 64 &&
+	    type != MLX4_IB_QPT_SMI && type != MLX4_IB_QPT_GSI &&
+	    !(type & (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_PROXY_SMI |
+		      MLX4_IB_QPT_PROXY_GSI | MLX4_IB_QPT_TUN_SMI_OWNER)))
+		qp->sq.wqe_shift = ilog2(64);
+	else
+		qp->sq.wqe_shift = ilog2(roundup_pow_of_two(s));
+
+	for (;;) {
+		qp->sq_max_wqes_per_wr = DIV_ROUND_UP(s, 1U << qp->sq.wqe_shift);
+
+		/*
+		 * We need to leave 2 KB + 1 WR of headroom in the SQ to
+		 * allow HW to prefetch.
+		 */
+		qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + qp->sq_max_wqes_per_wr;
+		qp->sq.wqe_cnt = roundup_pow_of_two(cap->max_send_wr *
+						    qp->sq_max_wqes_per_wr +
+						    qp->sq_spare_wqes);
+
+		if (qp->sq.wqe_cnt <= dev->dev->caps.max_wqes)
+			break;
+
+		if (qp->sq_max_wqes_per_wr <= 1)
+			return -EINVAL;
+
+		++qp->sq.wqe_shift;
+	}
+
+	qp->sq.max_gs = (min(dev->dev->caps.max_sq_desc_sz,
+			     (qp->sq_max_wqes_per_wr << qp->sq.wqe_shift)) -
+			 send_wqe_overhead(type, qp->flags)) /
+		sizeof (struct mlx4_wqe_data_seg);
+
+	qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
+		(qp->sq.wqe_cnt << qp->sq.wqe_shift);
+	if (qp->rq.wqe_shift > qp->sq.wqe_shift) {
+		qp->rq.offset = 0;
+		qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift;
+	} else {
+		qp->rq.offset = qp->sq.wqe_cnt << qp->sq.wqe_shift;
+		qp->sq.offset = 0;
+	}
+
+	cap->max_send_wr  = qp->sq.max_post =
+		(qp->sq.wqe_cnt - qp->sq_spare_wqes) / qp->sq_max_wqes_per_wr;
+	cap->max_send_sge = min(qp->sq.max_gs,
+				min(dev->dev->caps.max_sq_sg,
+				    dev->dev->caps.max_rq_sg));
+	/* We don't support inline sends for kernel QPs (yet) */
+	cap->max_inline_data = 0;
+
+	return 0;
+}
+
+static int set_user_sq_size(struct mlx4_ib_dev *dev,
+			    struct mlx4_ib_qp *qp,
+			    struct mlx4_ib_create_qp *ucmd)
+{
+	/* Sanity check SQ size before proceeding */
+	if ((1 << ucmd->log_sq_bb_count) > dev->dev->caps.max_wqes	 ||
+	    ucmd->log_sq_stride >
+		ilog2(roundup_pow_of_two(dev->dev->caps.max_sq_desc_sz)) ||
+	    ucmd->log_sq_stride < MLX4_IB_MIN_SQ_STRIDE)
+		return -EINVAL;
+
+	qp->sq.wqe_cnt   = 1 << ucmd->log_sq_bb_count;
+	qp->sq.wqe_shift = ucmd->log_sq_stride;
+
+	qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
+		(qp->sq.wqe_cnt << qp->sq.wqe_shift);
+
+	return 0;
+}
+
+static int alloc_proxy_bufs(struct ib_device *dev, struct mlx4_ib_qp *qp)
+{
+	int i;
+
+	qp->sqp_proxy_rcv =
+		kmalloc(sizeof (struct mlx4_ib_buf) * qp->rq.wqe_cnt,
+			GFP_KERNEL);
+	if (!qp->sqp_proxy_rcv)
+		return -ENOMEM;
+	for (i = 0; i < qp->rq.wqe_cnt; i++) {
+		qp->sqp_proxy_rcv[i].addr =
+			kmalloc(sizeof (struct mlx4_ib_proxy_sqp_hdr),
+				GFP_KERNEL);
+		if (!qp->sqp_proxy_rcv[i].addr)
+			goto err;
+		qp->sqp_proxy_rcv[i].map =
+			ib_dma_map_single(dev, qp->sqp_proxy_rcv[i].addr,
+					  sizeof (struct mlx4_ib_proxy_sqp_hdr),
+					  DMA_FROM_DEVICE);
+	}
+	return 0;
+
+err:
+	while (i > 0) {
+		--i;
+		ib_dma_unmap_single(dev, qp->sqp_proxy_rcv[i].map,
+				    sizeof (struct mlx4_ib_proxy_sqp_hdr),
+				    DMA_FROM_DEVICE);
+		kfree(qp->sqp_proxy_rcv[i].addr);
+	}
+	kfree(qp->sqp_proxy_rcv);
+	qp->sqp_proxy_rcv = NULL;
+	return -ENOMEM;
+}
+
+static void free_proxy_bufs(struct ib_device *dev, struct mlx4_ib_qp *qp)
+{
+	int i;
+
+	for (i = 0; i < qp->rq.wqe_cnt; i++) {
+		ib_dma_unmap_single(dev, qp->sqp_proxy_rcv[i].map,
+				    sizeof (struct mlx4_ib_proxy_sqp_hdr),
+				    DMA_FROM_DEVICE);
+		kfree(qp->sqp_proxy_rcv[i].addr);
+	}
+	kfree(qp->sqp_proxy_rcv);
+}
+
+static int qp_has_rq(struct ib_qp_init_attr *attr)
+{
+	if (attr->qp_type == IB_QPT_XRC_INI || attr->qp_type == IB_QPT_XRC_TGT)
+		return 0;
+
+	return !attr->srq;
+}
+
+static int qp0_enabled_vf(struct mlx4_dev *dev, int qpn)
+{
+	int i;
+	for (i = 0; i < dev->caps.num_ports; i++) {
+		if (qpn == dev->caps.qp0_proxy[i])
+			return !!dev->caps.qp0_qkey[i];
+	}
+	return 0;
+}
+
+static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
+			    struct ib_qp_init_attr *init_attr,
+			    struct ib_udata *udata, int sqpn, struct mlx4_ib_qp **caller_qp,
+			    gfp_t gfp)
+{
+	int qpn;
+	int err;
+	struct mlx4_ib_sqp *sqp;
+	struct mlx4_ib_qp *qp;
+	enum mlx4_ib_qp_type qp_type = (enum mlx4_ib_qp_type) init_attr->qp_type;
+
+	/* When tunneling special qps, we use a plain UD qp */
+	if (sqpn) {
+		if (mlx4_is_mfunc(dev->dev) &&
+		    (!mlx4_is_master(dev->dev) ||
+		     !(init_attr->create_flags & MLX4_IB_SRIOV_SQP))) {
+			if (init_attr->qp_type == IB_QPT_GSI)
+				qp_type = MLX4_IB_QPT_PROXY_GSI;
+			else {
+				if (mlx4_is_master(dev->dev) ||
+				    qp0_enabled_vf(dev->dev, sqpn))
+					qp_type = MLX4_IB_QPT_PROXY_SMI_OWNER;
+				else
+					qp_type = MLX4_IB_QPT_PROXY_SMI;
+			}
+		}
+		qpn = sqpn;
+		/* add extra sg entry for tunneling */
+		init_attr->cap.max_recv_sge++;
+	} else if (init_attr->create_flags & MLX4_IB_SRIOV_TUNNEL_QP) {
+		struct mlx4_ib_qp_tunnel_init_attr *tnl_init =
+			container_of(init_attr,
+				     struct mlx4_ib_qp_tunnel_init_attr, init_attr);
+		if ((tnl_init->proxy_qp_type != IB_QPT_SMI &&
+		     tnl_init->proxy_qp_type != IB_QPT_GSI)   ||
+		    !mlx4_is_master(dev->dev))
+			return -EINVAL;
+		if (tnl_init->proxy_qp_type == IB_QPT_GSI)
+			qp_type = MLX4_IB_QPT_TUN_GSI;
+		else if (tnl_init->slave == mlx4_master_func_num(dev->dev) ||
+			 mlx4_vf_smi_enabled(dev->dev, tnl_init->slave,
+					     tnl_init->port))
+			qp_type = MLX4_IB_QPT_TUN_SMI_OWNER;
+		else
+			qp_type = MLX4_IB_QPT_TUN_SMI;
+		/* we are definitely in the PPF here, since we are creating
+		 * tunnel QPs. base_tunnel_sqpn is therefore valid. */
+		qpn = dev->dev->phys_caps.base_tunnel_sqpn + 8 * tnl_init->slave
+			+ tnl_init->proxy_qp_type * 2 + tnl_init->port - 1;
+		sqpn = qpn;
+	}
+
+	if (!*caller_qp) {
+		if (qp_type == MLX4_IB_QPT_SMI || qp_type == MLX4_IB_QPT_GSI ||
+		    (qp_type & (MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_SMI_OWNER |
+				MLX4_IB_QPT_PROXY_GSI | MLX4_IB_QPT_TUN_SMI_OWNER))) {
+			sqp = kzalloc(sizeof (struct mlx4_ib_sqp), gfp);
+			if (!sqp)
+				return -ENOMEM;
+			qp = &sqp->qp;
+			qp->pri.vid = 0xFFFF;
+			qp->alt.vid = 0xFFFF;
+		} else {
+			qp = kzalloc(sizeof (struct mlx4_ib_qp), gfp);
+			if (!qp)
+				return -ENOMEM;
+			qp->pri.vid = 0xFFFF;
+			qp->alt.vid = 0xFFFF;
+		}
+	} else
+		qp = *caller_qp;
+
+	qp->mlx4_ib_qp_type = qp_type;
+
+	mutex_init(&qp->mutex);
+	spin_lock_init(&qp->sq.lock);
+	spin_lock_init(&qp->rq.lock);
+	INIT_LIST_HEAD(&qp->gid_list);
+	INIT_LIST_HEAD(&qp->steering_rules);
+
+	qp->state	 = IB_QPS_RESET;
+	if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR)
+		qp->sq_signal_bits = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
+
+	err = set_rq_size(dev, &init_attr->cap, !!pd->uobject, qp_has_rq(init_attr), qp);
+	if (err)
+		goto err;
+
+	if (pd->uobject) {
+		struct mlx4_ib_create_qp ucmd;
+
+		if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) {
+			err = -EFAULT;
+			goto err;
+		}
+
+		qp->sq_no_prefetch = ucmd.sq_no_prefetch;
+
+		err = set_user_sq_size(dev, qp, &ucmd);
+		if (err)
+			goto err;
+
+		qp->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr,
+				       qp->buf_size, 0, 0);
+		if (IS_ERR(qp->umem)) {
+			err = PTR_ERR(qp->umem);
+			goto err;
+		}
+
+		err = mlx4_mtt_init(dev->dev, ib_umem_page_count(qp->umem),
+				    ilog2(qp->umem->page_size), &qp->mtt);
+		if (err)
+			goto err_buf;
+
+		err = mlx4_ib_umem_write_mtt(dev, &qp->mtt, qp->umem);
+		if (err)
+			goto err_mtt;
+
+		if (qp_has_rq(init_attr)) {
+			err = mlx4_ib_db_map_user(to_mucontext(pd->uobject->context),
+						  ucmd.db_addr, &qp->db);
+			if (err)
+				goto err_mtt;
+		}
+	} else {
+		qp->sq_no_prefetch = 0;
+
+		if (init_attr->create_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK)
+			qp->flags |= MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK;
+
+		if (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO)
+			qp->flags |= MLX4_IB_QP_LSO;
+
+		if (init_attr->create_flags & IB_QP_CREATE_NETIF_QP) {
+			if (dev->steering_support ==
+			    MLX4_STEERING_MODE_DEVICE_MANAGED)
+				qp->flags |= MLX4_IB_QP_NETIF;
+			else
+				goto err;
+		}
+
+		err = set_kernel_sq_size(dev, &init_attr->cap, qp_type, qp);
+		if (err)
+			goto err;
+
+		if (qp_has_rq(init_attr)) {
+			err = mlx4_db_alloc(dev->dev, &qp->db, 0, gfp);
+			if (err)
+				goto err;
+
+			*qp->db.db = 0;
+		}
+
+		if (mlx4_buf_alloc(dev->dev, qp->buf_size, PAGE_SIZE * 2, &qp->buf, gfp)) {
+			err = -ENOMEM;
+			goto err_db;
+		}
+
+		err = mlx4_mtt_init(dev->dev, qp->buf.npages, qp->buf.page_shift,
+				    &qp->mtt);
+		if (err)
+			goto err_buf;
+
+		err = mlx4_buf_write_mtt(dev->dev, &qp->mtt, &qp->buf, gfp);
+		if (err)
+			goto err_mtt;
+
+		qp->sq.wrid  = kmalloc(qp->sq.wqe_cnt * sizeof (u64), gfp);
+		qp->rq.wrid  = kmalloc(qp->rq.wqe_cnt * sizeof (u64), gfp);
+		if (!qp->sq.wrid || !qp->rq.wrid) {
+			err = -ENOMEM;
+			goto err_wrid;
+		}
+	}
+
+	if (sqpn) {
+		if (qp->mlx4_ib_qp_type & (MLX4_IB_QPT_PROXY_SMI_OWNER |
+		    MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI)) {
+			if (alloc_proxy_bufs(pd->device, qp)) {
+				err = -ENOMEM;
+				goto err_wrid;
+			}
+		}
+	} else {
+		/* Raw packet QPNs must be aligned to 8 bits. If not, the WQE
+		 * BlueFlame setup flow wrongly causes VLAN insertion. */
+		if (init_attr->qp_type == IB_QPT_RAW_PACKET)
+			err = mlx4_qp_reserve_range(dev->dev, 1, 1 << 8, &qpn);
+		else
+			if (qp->flags & MLX4_IB_QP_NETIF)
+				err = mlx4_ib_steer_qp_alloc(dev, 1, &qpn);
+			else
+				err = mlx4_qp_reserve_range(dev->dev, 1, 1,
+							    &qpn);
+		if (err)
+			goto err_proxy;
+	}
+
+	err = mlx4_qp_alloc(dev->dev, qpn, &qp->mqp, gfp);
+	if (err)
+		goto err_qpn;
+
+	if (init_attr->qp_type == IB_QPT_XRC_TGT)
+		qp->mqp.qpn |= (1 << 23);
+
+	/*
+	 * Hardware wants QPN written in big-endian order (after
+	 * shifting) for send doorbell.  Precompute this value to save
+	 * a little bit when posting sends.
+	 */
+	qp->doorbell_qpn = swab32(qp->mqp.qpn << 8);
+
+	qp->mqp.event = mlx4_ib_qp_event;
+	if (!*caller_qp)
+		*caller_qp = qp;
+	return 0;
+
+err_qpn:
+	if (!sqpn) {
+		if (qp->flags & MLX4_IB_QP_NETIF)
+			mlx4_ib_steer_qp_free(dev, qpn, 1);
+		else
+			mlx4_qp_release_range(dev->dev, qpn, 1);
+	}
+err_proxy:
+	if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_GSI)
+		free_proxy_bufs(pd->device, qp);
+err_wrid:
+	if (pd->uobject) {
+		if (qp_has_rq(init_attr))
+			mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context), &qp->db);
+	} else {
+		kfree(qp->sq.wrid);
+		kfree(qp->rq.wrid);
+	}
+
+err_mtt:
+	mlx4_mtt_cleanup(dev->dev, &qp->mtt);
+
+err_buf:
+	if (pd->uobject)
+		ib_umem_release(qp->umem);
+	else
+		mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf);
+
+err_db:
+	if (!pd->uobject && qp_has_rq(init_attr))
+		mlx4_db_free(dev->dev, &qp->db);
+
+err:
+	if (!*caller_qp)
+		kfree(qp);
+	return err;
+}
+
+static enum mlx4_qp_state to_mlx4_state(enum ib_qp_state state)
+{
+	switch (state) {
+	case IB_QPS_RESET:	return MLX4_QP_STATE_RST;
+	case IB_QPS_INIT:	return MLX4_QP_STATE_INIT;
+	case IB_QPS_RTR:	return MLX4_QP_STATE_RTR;
+	case IB_QPS_RTS:	return MLX4_QP_STATE_RTS;
+	case IB_QPS_SQD:	return MLX4_QP_STATE_SQD;
+	case IB_QPS_SQE:	return MLX4_QP_STATE_SQER;
+	case IB_QPS_ERR:	return MLX4_QP_STATE_ERR;
+	default:		return -1;
+	}
+}
+
+static void mlx4_ib_lock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv_cq)
+	__acquires(&send_cq->lock) __acquires(&recv_cq->lock)
+{
+	if (send_cq == recv_cq) {
+		spin_lock_irq(&send_cq->lock);
+		__acquire(&recv_cq->lock);
+	} else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) {
+		spin_lock_irq(&send_cq->lock);
+		spin_lock_nested(&recv_cq->lock, SINGLE_DEPTH_NESTING);
+	} else {
+		spin_lock_irq(&recv_cq->lock);
+		spin_lock_nested(&send_cq->lock, SINGLE_DEPTH_NESTING);
+	}
+}
+
+static void mlx4_ib_unlock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv_cq)
+	__releases(&send_cq->lock) __releases(&recv_cq->lock)
+{
+	if (send_cq == recv_cq) {
+		__release(&recv_cq->lock);
+		spin_unlock_irq(&send_cq->lock);
+	} else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) {
+		spin_unlock(&recv_cq->lock);
+		spin_unlock_irq(&send_cq->lock);
+	} else {
+		spin_unlock(&send_cq->lock);
+		spin_unlock_irq(&recv_cq->lock);
+	}
+}
+
+static void del_gid_entries(struct mlx4_ib_qp *qp)
+{
+	struct mlx4_ib_gid_entry *ge, *tmp;
+
+	list_for_each_entry_safe(ge, tmp, &qp->gid_list, list) {
+		list_del(&ge->list);
+		kfree(ge);
+	}
+}
+
+static struct mlx4_ib_pd *get_pd(struct mlx4_ib_qp *qp)
+{
+	if (qp->ibqp.qp_type == IB_QPT_XRC_TGT)
+		return to_mpd(to_mxrcd(qp->ibqp.xrcd)->pd);
+	else
+		return to_mpd(qp->ibqp.pd);
+}
+
+static void get_cqs(struct mlx4_ib_qp *qp,
+		    struct mlx4_ib_cq **send_cq, struct mlx4_ib_cq **recv_cq)
+{
+	switch (qp->ibqp.qp_type) {
+	case IB_QPT_XRC_TGT:
+		*send_cq = to_mcq(to_mxrcd(qp->ibqp.xrcd)->cq);
+		*recv_cq = *send_cq;
+		break;
+	case IB_QPT_XRC_INI:
+		*send_cq = to_mcq(qp->ibqp.send_cq);
+		*recv_cq = *send_cq;
+		break;
+	default:
+		*send_cq = to_mcq(qp->ibqp.send_cq);
+		*recv_cq = to_mcq(qp->ibqp.recv_cq);
+		break;
+	}
+}
+
+static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp,
+			      int is_user)
+{
+	struct mlx4_ib_cq *send_cq, *recv_cq;
+
+	if (qp->state != IB_QPS_RESET) {
+		if (mlx4_qp_modify(dev->dev, NULL, to_mlx4_state(qp->state),
+				   MLX4_QP_STATE_RST, NULL, 0, 0, &qp->mqp))
+			pr_warn("modify QP %06x to RESET failed.\n",
+			       qp->mqp.qpn);
+		if (qp->pri.smac || (!qp->pri.smac && qp->pri.smac_port)) {
+			mlx4_unregister_mac(dev->dev, qp->pri.smac_port, qp->pri.smac);
+			qp->pri.smac = 0;
+			qp->pri.smac_port = 0;
+		}
+		if (qp->alt.smac) {
+			mlx4_unregister_mac(dev->dev, qp->alt.smac_port, qp->alt.smac);
+			qp->alt.smac = 0;
+		}
+		if (qp->pri.vid < 0x1000) {
+			mlx4_unregister_vlan(dev->dev, qp->pri.vlan_port, qp->pri.vid);
+			qp->pri.vid = 0xFFFF;
+			qp->pri.candidate_vid = 0xFFFF;
+			qp->pri.update_vid = 0;
+		}
+		if (qp->alt.vid < 0x1000) {
+			mlx4_unregister_vlan(dev->dev, qp->alt.vlan_port, qp->alt.vid);
+			qp->alt.vid = 0xFFFF;
+			qp->alt.candidate_vid = 0xFFFF;
+			qp->alt.update_vid = 0;
+		}
+	}
+
+	get_cqs(qp, &send_cq, &recv_cq);
+
+	mlx4_ib_lock_cqs(send_cq, recv_cq);
+
+	if (!is_user) {
+		__mlx4_ib_cq_clean(recv_cq, qp->mqp.qpn,
+				 qp->ibqp.srq ? to_msrq(qp->ibqp.srq): NULL);
+		if (send_cq != recv_cq)
+			__mlx4_ib_cq_clean(send_cq, qp->mqp.qpn, NULL);
+	}
+
+	mlx4_qp_remove(dev->dev, &qp->mqp);
+
+	mlx4_ib_unlock_cqs(send_cq, recv_cq);
+
+	mlx4_qp_free(dev->dev, &qp->mqp);
+
+	if (!is_sqp(dev, qp) && !is_tunnel_qp(dev, qp)) {
+		if (qp->flags & MLX4_IB_QP_NETIF)
+			mlx4_ib_steer_qp_free(dev, qp->mqp.qpn, 1);
+		else
+			mlx4_qp_release_range(dev->dev, qp->mqp.qpn, 1);
+	}
+
+	mlx4_mtt_cleanup(dev->dev, &qp->mtt);
+
+	if (is_user) {
+		if (qp->rq.wqe_cnt)
+			mlx4_ib_db_unmap_user(to_mucontext(qp->ibqp.uobject->context),
+					      &qp->db);
+		ib_umem_release(qp->umem);
+	} else {
+		kfree(qp->sq.wrid);
+		kfree(qp->rq.wrid);
+		if (qp->mlx4_ib_qp_type & (MLX4_IB_QPT_PROXY_SMI_OWNER |
+		    MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI))
+			free_proxy_bufs(&dev->ib_dev, qp);
+		mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf);
+		if (qp->rq.wqe_cnt)
+			mlx4_db_free(dev->dev, &qp->db);
+	}
+
+	del_gid_entries(qp);
+}
+
+static u32 get_sqp_num(struct mlx4_ib_dev *dev, struct ib_qp_init_attr *attr)
+{
+	/* Native or PPF */
+	if (!mlx4_is_mfunc(dev->dev) ||
+	    (mlx4_is_master(dev->dev) &&
+	     attr->create_flags & MLX4_IB_SRIOV_SQP)) {
+		return  dev->dev->phys_caps.base_sqpn +
+			(attr->qp_type == IB_QPT_SMI ? 0 : 2) +
+			attr->port_num - 1;
+	}
+	/* PF or VF -- creating proxies */
+	if (attr->qp_type == IB_QPT_SMI)
+		return dev->dev->caps.qp0_proxy[attr->port_num - 1];
+	else
+		return dev->dev->caps.qp1_proxy[attr->port_num - 1];
+}
+
+struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd,
+				struct ib_qp_init_attr *init_attr,
+				struct ib_udata *udata)
+{
+	struct mlx4_ib_qp *qp = NULL;
+	int err;
+	u16 xrcdn = 0;
+	gfp_t gfp;
+
+	gfp = (init_attr->create_flags & MLX4_IB_QP_CREATE_USE_GFP_NOIO) ?
+		GFP_NOIO : GFP_KERNEL;
+	/*
+	 * We only support LSO, vendor flag1, and multicast loopback blocking,
+	 * and only for kernel UD QPs.
+	 */
+	if (init_attr->create_flags & ~(MLX4_IB_QP_LSO |
+					MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK |
+					MLX4_IB_SRIOV_TUNNEL_QP |
+					MLX4_IB_SRIOV_SQP |
+					MLX4_IB_QP_NETIF |
+					MLX4_IB_QP_CREATE_USE_GFP_NOIO))
+		return ERR_PTR(-EINVAL);
+
+	if (init_attr->create_flags & IB_QP_CREATE_NETIF_QP) {
+		if (init_attr->qp_type != IB_QPT_UD)
+			return ERR_PTR(-EINVAL);
+	}
+
+	if (init_attr->create_flags &&
+	    (udata ||
+	     ((init_attr->create_flags & ~(MLX4_IB_SRIOV_SQP | MLX4_IB_QP_CREATE_USE_GFP_NOIO)) &&
+	      init_attr->qp_type != IB_QPT_UD) ||
+	     ((init_attr->create_flags & MLX4_IB_SRIOV_SQP) &&
+	      init_attr->qp_type > IB_QPT_GSI)))
+		return ERR_PTR(-EINVAL);
+
+	switch (init_attr->qp_type) {
+	case IB_QPT_XRC_TGT:
+		pd = to_mxrcd(init_attr->xrcd)->pd;
+		xrcdn = to_mxrcd(init_attr->xrcd)->xrcdn;
+		init_attr->send_cq = to_mxrcd(init_attr->xrcd)->cq;
+		/* fall through */
+	case IB_QPT_XRC_INI:
+		if (!(to_mdev(pd->device)->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC))
+			return ERR_PTR(-ENOSYS);
+		init_attr->recv_cq = init_attr->send_cq;
+		/* fall through */
+	case IB_QPT_RC:
+	case IB_QPT_UC:
+	case IB_QPT_RAW_PACKET:
+		qp = kzalloc(sizeof *qp, gfp);
+		if (!qp)
+			return ERR_PTR(-ENOMEM);
+		qp->pri.vid = 0xFFFF;
+		qp->alt.vid = 0xFFFF;
+		/* fall through */
+	case IB_QPT_UD:
+	{
+		err = create_qp_common(to_mdev(pd->device), pd, init_attr,
+				       udata, 0, &qp, gfp);
+		if (err)
+			return ERR_PTR(err);
+
+		qp->ibqp.qp_num = qp->mqp.qpn;
+		qp->xrcdn = xrcdn;
+
+		break;
+	}
+	case IB_QPT_SMI:
+	case IB_QPT_GSI:
+	{
+		/* Userspace is not allowed to create special QPs: */
+		if (udata)
+			return ERR_PTR(-EINVAL);
+
+		err = create_qp_common(to_mdev(pd->device), pd, init_attr, udata,
+				       get_sqp_num(to_mdev(pd->device), init_attr),
+				       &qp, gfp);
+		if (err)
+			return ERR_PTR(err);
+
+		qp->port	= init_attr->port_num;
+		qp->ibqp.qp_num = init_attr->qp_type == IB_QPT_SMI ? 0 : 1;
+
+		break;
+	}
+	default:
+		/* Don't support raw QPs */
+		return ERR_PTR(-EINVAL);
+	}
+
+	return &qp->ibqp;
+}
+
+int mlx4_ib_destroy_qp(struct ib_qp *qp)
+{
+	struct mlx4_ib_dev *dev = to_mdev(qp->device);
+	struct mlx4_ib_qp *mqp = to_mqp(qp);
+	struct mlx4_ib_pd *pd;
+
+	if (is_qp0(dev, mqp))
+		mlx4_CLOSE_PORT(dev->dev, mqp->port);
+
+	if (dev->qp1_proxy[mqp->port - 1] == mqp) {
+		mutex_lock(&dev->qp1_proxy_lock[mqp->port - 1]);
+		dev->qp1_proxy[mqp->port - 1] = NULL;
+		mutex_unlock(&dev->qp1_proxy_lock[mqp->port - 1]);
+	}
+
+	pd = get_pd(mqp);
+	destroy_qp_common(dev, mqp, !!pd->ibpd.uobject);
+
+	if (is_sqp(dev, mqp))
+		kfree(to_msqp(mqp));
+	else
+		kfree(mqp);
+
+	return 0;
+}
+
+static int to_mlx4_st(struct mlx4_ib_dev *dev, enum mlx4_ib_qp_type type)
+{
+	switch (type) {
+	case MLX4_IB_QPT_RC:		return MLX4_QP_ST_RC;
+	case MLX4_IB_QPT_UC:		return MLX4_QP_ST_UC;
+	case MLX4_IB_QPT_UD:		return MLX4_QP_ST_UD;
+	case MLX4_IB_QPT_XRC_INI:
+	case MLX4_IB_QPT_XRC_TGT:	return MLX4_QP_ST_XRC;
+	case MLX4_IB_QPT_SMI:
+	case MLX4_IB_QPT_GSI:
+	case MLX4_IB_QPT_RAW_PACKET:	return MLX4_QP_ST_MLX;
+
+	case MLX4_IB_QPT_PROXY_SMI_OWNER:
+	case MLX4_IB_QPT_TUN_SMI_OWNER:	return (mlx4_is_mfunc(dev->dev) ?
+						MLX4_QP_ST_MLX : -1);
+	case MLX4_IB_QPT_PROXY_SMI:
+	case MLX4_IB_QPT_TUN_SMI:
+	case MLX4_IB_QPT_PROXY_GSI:
+	case MLX4_IB_QPT_TUN_GSI:	return (mlx4_is_mfunc(dev->dev) ?
+						MLX4_QP_ST_UD : -1);
+	default:			return -1;
+	}
+}
+
+static __be32 to_mlx4_access_flags(struct mlx4_ib_qp *qp, const struct ib_qp_attr *attr,
+				   int attr_mask)
+{
+	u8 dest_rd_atomic;
+	u32 access_flags;
+	u32 hw_access_flags = 0;
+
+	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
+		dest_rd_atomic = attr->max_dest_rd_atomic;
+	else
+		dest_rd_atomic = qp->resp_depth;
+
+	if (attr_mask & IB_QP_ACCESS_FLAGS)
+		access_flags = attr->qp_access_flags;
+	else
+		access_flags = qp->atomic_rd_en;
+
+	if (!dest_rd_atomic)
+		access_flags &= IB_ACCESS_REMOTE_WRITE;
+
+	if (access_flags & IB_ACCESS_REMOTE_READ)
+		hw_access_flags |= MLX4_QP_BIT_RRE;
+	if (access_flags & IB_ACCESS_REMOTE_ATOMIC)
+		hw_access_flags |= MLX4_QP_BIT_RAE;
+	if (access_flags & IB_ACCESS_REMOTE_WRITE)
+		hw_access_flags |= MLX4_QP_BIT_RWE;
+
+	return cpu_to_be32(hw_access_flags);
+}
+
+static void store_sqp_attrs(struct mlx4_ib_sqp *sqp, const struct ib_qp_attr *attr,
+			    int attr_mask)
+{
+	if (attr_mask & IB_QP_PKEY_INDEX)
+		sqp->pkey_index = attr->pkey_index;
+	if (attr_mask & IB_QP_QKEY)
+		sqp->qkey = attr->qkey;
+	if (attr_mask & IB_QP_SQ_PSN)
+		sqp->send_psn = attr->sq_psn;
+}
+
+static void mlx4_set_sched(struct mlx4_qp_path *path, u8 port)
+{
+	path->sched_queue = (path->sched_queue & 0xbf) | ((port - 1) << 6);
+}
+
+static int _mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah,
+			  u64 smac, u16 vlan_tag, struct mlx4_qp_path *path,
+			  struct mlx4_roce_smac_vlan_info *smac_info, u8 port)
+{
+	int is_eth = rdma_port_get_link_layer(&dev->ib_dev, port) ==
+		IB_LINK_LAYER_ETHERNET;
+	int vidx;
+	int smac_index;
+	int err;
+
+
+	path->grh_mylmc     = ah->src_path_bits & 0x7f;
+	path->rlid	    = cpu_to_be16(ah->dlid);
+	if (ah->static_rate) {
+		path->static_rate = ah->static_rate + MLX4_STAT_RATE_OFFSET;
+		while (path->static_rate > IB_RATE_2_5_GBPS + MLX4_STAT_RATE_OFFSET &&
+		       !(1 << path->static_rate & dev->dev->caps.stat_rate_support))
+			--path->static_rate;
+	} else
+		path->static_rate = 0;
+
+	if (ah->ah_flags & IB_AH_GRH) {
+		if (ah->grh.sgid_index >= dev->dev->caps.gid_table_len[port]) {
+			pr_err("sgid_index (%u) too large. max is %d\n",
+			       ah->grh.sgid_index, dev->dev->caps.gid_table_len[port] - 1);
+			return -1;
+		}
+
+		path->grh_mylmc |= 1 << 7;
+		path->mgid_index = ah->grh.sgid_index;
+		path->hop_limit  = ah->grh.hop_limit;
+		path->tclass_flowlabel =
+			cpu_to_be32((ah->grh.traffic_class << 20) |
+				    (ah->grh.flow_label));
+		memcpy(path->rgid, ah->grh.dgid.raw, 16);
+	}
+
+	if (is_eth) {
+		if (!(ah->ah_flags & IB_AH_GRH))
+			return -1;
+
+		path->sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE |
+			((port - 1) << 6) | ((ah->sl & 7) << 3);
+
+		path->feup |= MLX4_FEUP_FORCE_ETH_UP;
+		if (vlan_tag < 0x1000) {
+			if (smac_info->vid < 0x1000) {
+				/* both valid vlan ids */
+				if (smac_info->vid != vlan_tag) {
+					/* different VIDs.  unreg old and reg new */
+					err = mlx4_register_vlan(dev->dev, port, vlan_tag, &vidx);
+					if (err)
+						return err;
+					smac_info->candidate_vid = vlan_tag;
+					smac_info->candidate_vlan_index = vidx;
+					smac_info->candidate_vlan_port = port;
+					smac_info->update_vid = 1;
+					path->vlan_index = vidx;
+				} else {
+					path->vlan_index = smac_info->vlan_index;
+				}
+			} else {
+				/* no current vlan tag in qp */
+				err = mlx4_register_vlan(dev->dev, port, vlan_tag, &vidx);
+				if (err)
+					return err;
+				smac_info->candidate_vid = vlan_tag;
+				smac_info->candidate_vlan_index = vidx;
+				smac_info->candidate_vlan_port = port;
+				smac_info->update_vid = 1;
+				path->vlan_index = vidx;
+			}
+			path->feup |= MLX4_FVL_FORCE_ETH_VLAN;
+			path->fl = 1 << 6;
+		} else {
+			/* have current vlan tag. unregister it at modify-qp success */
+			if (smac_info->vid < 0x1000) {
+				smac_info->candidate_vid = 0xFFFF;
+				smac_info->update_vid = 1;
+			}
+		}
+
+		/* get smac_index for RoCE use.
+		 * If no smac was yet assigned, register one.
+		 * If one was already assigned, but the new mac differs,
+		 * unregister the old one and register the new one.
+		*/
+		if ((!smac_info->smac && !smac_info->smac_port) ||
+		    smac_info->smac != smac) {
+			/* register candidate now, unreg if needed, after success */
+			smac_index = mlx4_register_mac(dev->dev, port, smac);
+			if (smac_index >= 0) {
+				smac_info->candidate_smac_index = smac_index;
+				smac_info->candidate_smac = smac;
+				smac_info->candidate_smac_port = port;
+			} else {
+				return -EINVAL;
+			}
+		} else {
+			smac_index = smac_info->smac_index;
+		}
+
+		memcpy(path->dmac, ah->dmac, 6);
+		path->ackto = MLX4_IB_LINK_TYPE_ETH;
+		/* put MAC table smac index for IBoE */
+		path->grh_mylmc = (u8) (smac_index) | 0x80;
+	} else {
+		path->sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE |
+			((port - 1) << 6) | ((ah->sl & 0xf) << 2);
+	}
+
+	return 0;
+}
+
+static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_qp_attr *qp,
+			 enum ib_qp_attr_mask qp_attr_mask,
+			 struct mlx4_ib_qp *mqp,
+			 struct mlx4_qp_path *path, u8 port)
+{
+	return _mlx4_set_path(dev, &qp->ah_attr,
+			      mlx4_mac_to_u64((u8 *)qp->smac),
+			      (qp_attr_mask & IB_QP_VID) ? qp->vlan_id : 0xffff,
+			      path, &mqp->pri, port);
+}
+
+static int mlx4_set_alt_path(struct mlx4_ib_dev *dev,
+			     const struct ib_qp_attr *qp,
+			     enum ib_qp_attr_mask qp_attr_mask,
+			     struct mlx4_ib_qp *mqp,
+			     struct mlx4_qp_path *path, u8 port)
+{
+	return _mlx4_set_path(dev, &qp->alt_ah_attr,
+			      mlx4_mac_to_u64((u8 *)qp->alt_smac),
+			      (qp_attr_mask & IB_QP_ALT_VID) ?
+			      qp->alt_vlan_id : 0xffff,
+			      path, &mqp->alt, port);
+}
+
+static void update_mcg_macs(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
+{
+	struct mlx4_ib_gid_entry *ge, *tmp;
+
+	list_for_each_entry_safe(ge, tmp, &qp->gid_list, list) {
+		if (!ge->added && mlx4_ib_add_mc(dev, qp, &ge->gid)) {
+			ge->added = 1;
+			ge->port = qp->port;
+		}
+	}
+}
+
+static int handle_eth_ud_smac_index(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, u8 *smac,
+				    struct mlx4_qp_context *context)
+{
+	u64 u64_mac;
+	int smac_index;
+
+	u64_mac = atomic64_read(&dev->iboe.mac[qp->port - 1]);
+
+	context->pri_path.sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE | ((qp->port - 1) << 6);
+	if (!qp->pri.smac && !qp->pri.smac_port) {
+		smac_index = mlx4_register_mac(dev->dev, qp->port, u64_mac);
+		if (smac_index >= 0) {
+			qp->pri.candidate_smac_index = smac_index;
+			qp->pri.candidate_smac = u64_mac;
+			qp->pri.candidate_smac_port = qp->port;
+			context->pri_path.grh_mylmc = 0x80 | (u8) smac_index;
+		} else {
+			return -ENOENT;
+		}
+	}
+	return 0;
+}
+
+static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
+			       const struct ib_qp_attr *attr, int attr_mask,
+			       enum ib_qp_state cur_state, enum ib_qp_state new_state)
+{
+	struct mlx4_ib_dev *dev = to_mdev(ibqp->device);
+	struct mlx4_ib_qp *qp = to_mqp(ibqp);
+	struct mlx4_ib_pd *pd;
+	struct mlx4_ib_cq *send_cq, *recv_cq;
+	struct mlx4_qp_context *context;
+	enum mlx4_qp_optpar optpar = 0;
+	int sqd_event;
+	int steer_qp = 0;
+	int err = -EINVAL;
+
+	/* APM is not supported under RoCE */
+	if (attr_mask & IB_QP_ALT_PATH &&
+	    rdma_port_get_link_layer(&dev->ib_dev, qp->port) ==
+	    IB_LINK_LAYER_ETHERNET)
+		return -ENOTSUPP;
+
+	context = kzalloc(sizeof *context, GFP_KERNEL);
+	if (!context)
+		return -ENOMEM;
+
+	context->flags = cpu_to_be32((to_mlx4_state(new_state) << 28) |
+				     (to_mlx4_st(dev, qp->mlx4_ib_qp_type) << 16));
+
+	if (!(attr_mask & IB_QP_PATH_MIG_STATE))
+		context->flags |= cpu_to_be32(MLX4_QP_PM_MIGRATED << 11);
+	else {
+		optpar |= MLX4_QP_OPTPAR_PM_STATE;
+		switch (attr->path_mig_state) {
+		case IB_MIG_MIGRATED:
+			context->flags |= cpu_to_be32(MLX4_QP_PM_MIGRATED << 11);
+			break;
+		case IB_MIG_REARM:
+			context->flags |= cpu_to_be32(MLX4_QP_PM_REARM << 11);
+			break;
+		case IB_MIG_ARMED:
+			context->flags |= cpu_to_be32(MLX4_QP_PM_ARMED << 11);
+			break;
+		}
+	}
+
+	if (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI)
+		context->mtu_msgmax = (IB_MTU_4096 << 5) | 11;
+	else if (ibqp->qp_type == IB_QPT_RAW_PACKET)
+		context->mtu_msgmax = (MLX4_RAW_QP_MTU << 5) | MLX4_RAW_QP_MSGMAX;
+	else if (ibqp->qp_type == IB_QPT_UD) {
+		if (qp->flags & MLX4_IB_QP_LSO)
+			context->mtu_msgmax = (IB_MTU_4096 << 5) |
+					      ilog2(dev->dev->caps.max_gso_sz);
+		else
+			context->mtu_msgmax = (IB_MTU_4096 << 5) | 12;
+	} else if (attr_mask & IB_QP_PATH_MTU) {
+		if (attr->path_mtu < IB_MTU_256 || attr->path_mtu > IB_MTU_4096) {
+			pr_err("path MTU (%u) is invalid\n",
+			       attr->path_mtu);
+			goto out;
+		}
+		context->mtu_msgmax = (attr->path_mtu << 5) |
+			ilog2(dev->dev->caps.max_msg_sz);
+	}
+
+	if (qp->rq.wqe_cnt)
+		context->rq_size_stride = ilog2(qp->rq.wqe_cnt) << 3;
+	context->rq_size_stride |= qp->rq.wqe_shift - 4;
+
+	if (qp->sq.wqe_cnt)
+		context->sq_size_stride = ilog2(qp->sq.wqe_cnt) << 3;
+	context->sq_size_stride |= qp->sq.wqe_shift - 4;
+
+	if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) {
+		context->sq_size_stride |= !!qp->sq_no_prefetch << 7;
+		context->xrcd = cpu_to_be32((u32) qp->xrcdn);
+		if (ibqp->qp_type == IB_QPT_RAW_PACKET)
+			context->param3 |= cpu_to_be32(1 << 30);
+	}
+
+	if (qp->ibqp.uobject)
+		context->usr_page = cpu_to_be32(to_mucontext(ibqp->uobject->context)->uar.index);
+	else
+		context->usr_page = cpu_to_be32(dev->priv_uar.index);
+
+	if (attr_mask & IB_QP_DEST_QPN)
+		context->remote_qpn = cpu_to_be32(attr->dest_qp_num);
+
+	if (attr_mask & IB_QP_PORT) {
+		if (cur_state == IB_QPS_SQD && new_state == IB_QPS_SQD &&
+		    !(attr_mask & IB_QP_AV)) {
+			mlx4_set_sched(&context->pri_path, attr->port_num);
+			optpar |= MLX4_QP_OPTPAR_SCHED_QUEUE;
+		}
+	}
+
+	if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) {
+		if (dev->counters[qp->port - 1] != -1) {
+			context->pri_path.counter_index =
+						dev->counters[qp->port - 1];
+			optpar |= MLX4_QP_OPTPAR_COUNTER_INDEX;
+		} else
+			context->pri_path.counter_index = 0xff;
+
+		if (qp->flags & MLX4_IB_QP_NETIF) {
+			mlx4_ib_steer_qp_reg(dev, qp, 1);
+			steer_qp = 1;
+		}
+	}
+
+	if (attr_mask & IB_QP_PKEY_INDEX) {
+		if (qp->mlx4_ib_qp_type & MLX4_IB_QPT_ANY_SRIOV)
+			context->pri_path.disable_pkey_check = 0x40;
+		context->pri_path.pkey_index = attr->pkey_index;
+		optpar |= MLX4_QP_OPTPAR_PKEY_INDEX;
+	}
+
+	if (attr_mask & IB_QP_AV) {
+		if (mlx4_set_path(dev, attr, attr_mask, qp, &context->pri_path,
+				  attr_mask & IB_QP_PORT ?
+				  attr->port_num : qp->port))
+			goto out;
+
+		optpar |= (MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH |
+			   MLX4_QP_OPTPAR_SCHED_QUEUE);
+	}
+
+	if (attr_mask & IB_QP_TIMEOUT) {
+		context->pri_path.ackto |= attr->timeout << 3;
+		optpar |= MLX4_QP_OPTPAR_ACK_TIMEOUT;
+	}
+
+	if (attr_mask & IB_QP_ALT_PATH) {
+		if (attr->alt_port_num == 0 ||
+		    attr->alt_port_num > dev->dev->caps.num_ports)
+			goto out;
+
+		if (attr->alt_pkey_index >=
+		    dev->dev->caps.pkey_table_len[attr->alt_port_num])
+			goto out;
+
+		if (mlx4_set_alt_path(dev, attr, attr_mask, qp,
+				      &context->alt_path,
+				      attr->alt_port_num))
+			goto out;
+
+		context->alt_path.pkey_index = attr->alt_pkey_index;
+		context->alt_path.ackto = attr->alt_timeout << 3;
+		optpar |= MLX4_QP_OPTPAR_ALT_ADDR_PATH;
+	}
+
+	pd = get_pd(qp);
+	get_cqs(qp, &send_cq, &recv_cq);
+	context->pd       = cpu_to_be32(pd->pdn);
+	context->cqn_send = cpu_to_be32(send_cq->mcq.cqn);
+	context->cqn_recv = cpu_to_be32(recv_cq->mcq.cqn);
+	context->params1  = cpu_to_be32(MLX4_IB_ACK_REQ_FREQ << 28);
+
+	/* Set "fast registration enabled" for all kernel QPs */
+	if (!qp->ibqp.uobject)
+		context->params1 |= cpu_to_be32(1 << 11);
+
+	if (attr_mask & IB_QP_RNR_RETRY) {
+		context->params1 |= cpu_to_be32(attr->rnr_retry << 13);
+		optpar |= MLX4_QP_OPTPAR_RNR_RETRY;
+	}
+
+	if (attr_mask & IB_QP_RETRY_CNT) {
+		context->params1 |= cpu_to_be32(attr->retry_cnt << 16);
+		optpar |= MLX4_QP_OPTPAR_RETRY_COUNT;
+	}
+
+	if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) {
+		if (attr->max_rd_atomic)
+			context->params1 |=
+				cpu_to_be32(fls(attr->max_rd_atomic - 1) << 21);
+		optpar |= MLX4_QP_OPTPAR_SRA_MAX;
+	}
+
+	if (attr_mask & IB_QP_SQ_PSN)
+		context->next_send_psn = cpu_to_be32(attr->sq_psn);
+
+	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) {
+		if (attr->max_dest_rd_atomic)
+			context->params2 |=
+				cpu_to_be32(fls(attr->max_dest_rd_atomic - 1) << 21);
+		optpar |= MLX4_QP_OPTPAR_RRA_MAX;
+	}
+
+	if (attr_mask & (IB_QP_ACCESS_FLAGS | IB_QP_MAX_DEST_RD_ATOMIC)) {
+		context->params2 |= to_mlx4_access_flags(qp, attr, attr_mask);
+		optpar |= MLX4_QP_OPTPAR_RWE | MLX4_QP_OPTPAR_RRE | MLX4_QP_OPTPAR_RAE;
+	}
+
+	if (ibqp->srq)
+		context->params2 |= cpu_to_be32(MLX4_QP_BIT_RIC);
+
+	if (attr_mask & IB_QP_MIN_RNR_TIMER) {
+		context->rnr_nextrecvpsn |= cpu_to_be32(attr->min_rnr_timer << 24);
+		optpar |= MLX4_QP_OPTPAR_RNR_TIMEOUT;
+	}
+	if (attr_mask & IB_QP_RQ_PSN)
+		context->rnr_nextrecvpsn |= cpu_to_be32(attr->rq_psn);
+
+	/* proxy and tunnel qp qkeys will be changed in modify-qp wrappers */
+	if (attr_mask & IB_QP_QKEY) {
+		if (qp->mlx4_ib_qp_type &
+		    (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_TUN_SMI_OWNER))
+			context->qkey = cpu_to_be32(IB_QP_SET_QKEY);
+		else {
+			if (mlx4_is_mfunc(dev->dev) &&
+			    !(qp->mlx4_ib_qp_type & MLX4_IB_QPT_ANY_SRIOV) &&
+			    (attr->qkey & MLX4_RESERVED_QKEY_MASK) ==
+			    MLX4_RESERVED_QKEY_BASE) {
+				pr_err("Cannot use reserved QKEY"
+				       " 0x%x (range 0xffff0000..0xffffffff"
+				       " is reserved)\n", attr->qkey);
+				err = -EINVAL;
+				goto out;
+			}
+			context->qkey = cpu_to_be32(attr->qkey);
+		}
+		optpar |= MLX4_QP_OPTPAR_Q_KEY;
+	}
+
+	if (ibqp->srq)
+		context->srqn = cpu_to_be32(1 << 24 | to_msrq(ibqp->srq)->msrq.srqn);
+
+	if (qp->rq.wqe_cnt && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
+		context->db_rec_addr = cpu_to_be64(qp->db.dma);
+
+	if (cur_state == IB_QPS_INIT &&
+	    new_state == IB_QPS_RTR  &&
+	    (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI ||
+	     ibqp->qp_type == IB_QPT_UD ||
+	     ibqp->qp_type == IB_QPT_RAW_PACKET)) {
+		context->pri_path.sched_queue = (qp->port - 1) << 6;
+		if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_SMI ||
+		    qp->mlx4_ib_qp_type &
+		    (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_TUN_SMI_OWNER)) {
+			context->pri_path.sched_queue |= MLX4_IB_DEFAULT_QP0_SCHED_QUEUE;
+			if (qp->mlx4_ib_qp_type != MLX4_IB_QPT_SMI)
+				context->pri_path.fl = 0x80;
+		} else {
+			if (qp->mlx4_ib_qp_type & MLX4_IB_QPT_ANY_SRIOV)
+				context->pri_path.fl = 0x80;
+			context->pri_path.sched_queue |= MLX4_IB_DEFAULT_SCHED_QUEUE;
+		}
+		if (rdma_port_get_link_layer(&dev->ib_dev, qp->port) ==
+		    IB_LINK_LAYER_ETHERNET) {
+			if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_TUN_GSI ||
+			    qp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI)
+				context->pri_path.feup = 1 << 7; /* don't fsm */
+			/* handle smac_index */
+			if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_UD ||
+			    qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_GSI ||
+			    qp->mlx4_ib_qp_type == MLX4_IB_QPT_TUN_GSI) {
+				err = handle_eth_ud_smac_index(dev, qp, (u8 *)attr->smac, context);
+				if (err)
+					return -EINVAL;
+				if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_GSI)
+					dev->qp1_proxy[qp->port - 1] = qp;
+			}
+		}
+	}
+
+	if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET) {
+		context->pri_path.ackto = (context->pri_path.ackto & 0xf8) |
+					MLX4_IB_LINK_TYPE_ETH;
+		if (dev->dev->caps.tunnel_offload_mode ==  MLX4_TUNNEL_OFFLOAD_MODE_VXLAN) {
+			/* set QP to receive both tunneled & non-tunneled packets */
+			if (!(context->flags & cpu_to_be32(1 << MLX4_RSS_QPC_FLAG_OFFSET)))
+				context->srqn = cpu_to_be32(7 << 28);
+		}
+	}
+
+	if (ibqp->qp_type == IB_QPT_UD && (new_state == IB_QPS_RTR)) {
+		int is_eth = rdma_port_get_link_layer(
+				&dev->ib_dev, qp->port) ==
+				IB_LINK_LAYER_ETHERNET;
+		if (is_eth) {
+			context->pri_path.ackto = MLX4_IB_LINK_TYPE_ETH;
+			optpar |= MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH;
+		}
+	}
+
+
+	if (cur_state == IB_QPS_RTS && new_state == IB_QPS_SQD	&&
+	    attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY && attr->en_sqd_async_notify)
+		sqd_event = 1;
+	else
+		sqd_event = 0;
+
+	if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
+		context->rlkey |= (1 << 4);
+
+	/*
+	 * Before passing a kernel QP to the HW, make sure that the
+	 * ownership bits of the send queue are set and the SQ
+	 * headroom is stamped so that the hardware doesn't start
+	 * processing stale work requests.
+	 */
+	if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) {
+		struct mlx4_wqe_ctrl_seg *ctrl;
+		int i;
+
+		for (i = 0; i < qp->sq.wqe_cnt; ++i) {
+			ctrl = get_send_wqe(qp, i);
+			ctrl->owner_opcode = cpu_to_be32(1 << 31);
+			if (qp->sq_max_wqes_per_wr == 1)
+				ctrl->fence_size = 1 << (qp->sq.wqe_shift - 4);
+
+			stamp_send_wqe(qp, i, 1 << qp->sq.wqe_shift);
+		}
+	}
+
+	err = mlx4_qp_modify(dev->dev, &qp->mtt, to_mlx4_state(cur_state),
+			     to_mlx4_state(new_state), context, optpar,
+			     sqd_event, &qp->mqp);
+	if (err)
+		goto out;
+
+	qp->state = new_state;
+
+	if (attr_mask & IB_QP_ACCESS_FLAGS)
+		qp->atomic_rd_en = attr->qp_access_flags;
+	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
+		qp->resp_depth = attr->max_dest_rd_atomic;
+	if (attr_mask & IB_QP_PORT) {
+		qp->port = attr->port_num;
+		update_mcg_macs(dev, qp);
+	}
+	if (attr_mask & IB_QP_ALT_PATH)
+		qp->alt_port = attr->alt_port_num;
+
+	if (is_sqp(dev, qp))
+		store_sqp_attrs(to_msqp(qp), attr, attr_mask);
+
+	/*
+	 * If we moved QP0 to RTR, bring the IB link up; if we moved
+	 * QP0 to RESET or ERROR, bring the link back down.
+	 */
+	if (is_qp0(dev, qp)) {
+		if (cur_state != IB_QPS_RTR && new_state == IB_QPS_RTR)
+			if (mlx4_INIT_PORT(dev->dev, qp->port))
+				pr_warn("INIT_PORT failed for port %d\n",
+				       qp->port);
+
+		if (cur_state != IB_QPS_RESET && cur_state != IB_QPS_ERR &&
+		    (new_state == IB_QPS_RESET || new_state == IB_QPS_ERR))
+			mlx4_CLOSE_PORT(dev->dev, qp->port);
+	}
+
+	/*
+	 * If we moved a kernel QP to RESET, clean up all old CQ
+	 * entries and reinitialize the QP.
+	 */
+	if (new_state == IB_QPS_RESET) {
+		if (!ibqp->uobject) {
+			mlx4_ib_cq_clean(recv_cq, qp->mqp.qpn,
+					 ibqp->srq ? to_msrq(ibqp->srq) : NULL);
+			if (send_cq != recv_cq)
+				mlx4_ib_cq_clean(send_cq, qp->mqp.qpn, NULL);
+
+			qp->rq.head = 0;
+			qp->rq.tail = 0;
+			qp->sq.head = 0;
+			qp->sq.tail = 0;
+			qp->sq_next_wqe = 0;
+			if (qp->rq.wqe_cnt)
+				*qp->db.db  = 0;
+
+			if (qp->flags & MLX4_IB_QP_NETIF)
+				mlx4_ib_steer_qp_reg(dev, qp, 0);
+		}
+		if (qp->pri.smac || (!qp->pri.smac && qp->pri.smac_port)) {
+			mlx4_unregister_mac(dev->dev, qp->pri.smac_port, qp->pri.smac);
+			qp->pri.smac = 0;
+			qp->pri.smac_port = 0;
+		}
+		if (qp->alt.smac) {
+			mlx4_unregister_mac(dev->dev, qp->alt.smac_port, qp->alt.smac);
+			qp->alt.smac = 0;
+		}
+		if (qp->pri.vid < 0x1000) {
+			mlx4_unregister_vlan(dev->dev, qp->pri.vlan_port, qp->pri.vid);
+			qp->pri.vid = 0xFFFF;
+			qp->pri.candidate_vid = 0xFFFF;
+			qp->pri.update_vid = 0;
+		}
+
+		if (qp->alt.vid < 0x1000) {
+			mlx4_unregister_vlan(dev->dev, qp->alt.vlan_port, qp->alt.vid);
+			qp->alt.vid = 0xFFFF;
+			qp->alt.candidate_vid = 0xFFFF;
+			qp->alt.update_vid = 0;
+		}
+	}
+out:
+	if (err && steer_qp)
+		mlx4_ib_steer_qp_reg(dev, qp, 0);
+	kfree(context);
+	if (qp->pri.candidate_smac ||
+	    (!qp->pri.candidate_smac && qp->pri.candidate_smac_port)) {
+		if (err) {
+			mlx4_unregister_mac(dev->dev, qp->pri.candidate_smac_port, qp->pri.candidate_smac);
+		} else {
+			if (qp->pri.smac || (!qp->pri.smac && qp->pri.smac_port))
+				mlx4_unregister_mac(dev->dev, qp->pri.smac_port, qp->pri.smac);
+			qp->pri.smac = qp->pri.candidate_smac;
+			qp->pri.smac_index = qp->pri.candidate_smac_index;
+			qp->pri.smac_port = qp->pri.candidate_smac_port;
+		}
+		qp->pri.candidate_smac = 0;
+		qp->pri.candidate_smac_index = 0;
+		qp->pri.candidate_smac_port = 0;
+	}
+	if (qp->alt.candidate_smac) {
+		if (err) {
+			mlx4_unregister_mac(dev->dev, qp->alt.candidate_smac_port, qp->alt.candidate_smac);
+		} else {
+			if (qp->alt.smac)
+				mlx4_unregister_mac(dev->dev, qp->alt.smac_port, qp->alt.smac);
+			qp->alt.smac = qp->alt.candidate_smac;
+			qp->alt.smac_index = qp->alt.candidate_smac_index;
+			qp->alt.smac_port = qp->alt.candidate_smac_port;
+		}
+		qp->alt.candidate_smac = 0;
+		qp->alt.candidate_smac_index = 0;
+		qp->alt.candidate_smac_port = 0;
+	}
+
+	if (qp->pri.update_vid) {
+		if (err) {
+			if (qp->pri.candidate_vid < 0x1000)
+				mlx4_unregister_vlan(dev->dev, qp->pri.candidate_vlan_port,
+						     qp->pri.candidate_vid);
+		} else {
+			if (qp->pri.vid < 0x1000)
+				mlx4_unregister_vlan(dev->dev, qp->pri.vlan_port,
+						     qp->pri.vid);
+			qp->pri.vid = qp->pri.candidate_vid;
+			qp->pri.vlan_port = qp->pri.candidate_vlan_port;
+			qp->pri.vlan_index =  qp->pri.candidate_vlan_index;
+		}
+		qp->pri.candidate_vid = 0xFFFF;
+		qp->pri.update_vid = 0;
+	}
+
+	if (qp->alt.update_vid) {
+		if (err) {
+			if (qp->alt.candidate_vid < 0x1000)
+				mlx4_unregister_vlan(dev->dev, qp->alt.candidate_vlan_port,
+						     qp->alt.candidate_vid);
+		} else {
+			if (qp->alt.vid < 0x1000)
+				mlx4_unregister_vlan(dev->dev, qp->alt.vlan_port,
+						     qp->alt.vid);
+			qp->alt.vid = qp->alt.candidate_vid;
+			qp->alt.vlan_port = qp->alt.candidate_vlan_port;
+			qp->alt.vlan_index =  qp->alt.candidate_vlan_index;
+		}
+		qp->alt.candidate_vid = 0xFFFF;
+		qp->alt.update_vid = 0;
+	}
+
+	return err;
+}
+
+int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+		      int attr_mask, struct ib_udata *udata)
+{
+	struct mlx4_ib_dev *dev = to_mdev(ibqp->device);
+	struct mlx4_ib_qp *qp = to_mqp(ibqp);
+	enum ib_qp_state cur_state, new_state;
+	int err = -EINVAL;
+	int ll;
+	mutex_lock(&qp->mutex);
+
+	cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state;
+	new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;
+
+	if (cur_state == new_state && cur_state == IB_QPS_RESET) {
+		ll = IB_LINK_LAYER_UNSPECIFIED;
+	} else {
+		int port = attr_mask & IB_QP_PORT ? attr->port_num : qp->port;
+		ll = rdma_port_get_link_layer(&dev->ib_dev, port);
+	}
+
+	if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type,
+				attr_mask, ll)) {
+		pr_debug("qpn 0x%x: invalid attribute mask specified "
+			 "for transition %d to %d. qp_type %d,"
+			 " attr_mask 0x%x\n",
+			 ibqp->qp_num, cur_state, new_state,
+			 ibqp->qp_type, attr_mask);
+		goto out;
+	}
+
+	if ((attr_mask & IB_QP_PORT) &&
+	    (attr->port_num == 0 || attr->port_num > dev->num_ports)) {
+		pr_debug("qpn 0x%x: invalid port number (%d) specified "
+			 "for transition %d to %d. qp_type %d\n",
+			 ibqp->qp_num, attr->port_num, cur_state,
+			 new_state, ibqp->qp_type);
+		goto out;
+	}
+
+	if ((attr_mask & IB_QP_PORT) && (ibqp->qp_type == IB_QPT_RAW_PACKET) &&
+	    (rdma_port_get_link_layer(&dev->ib_dev, attr->port_num) !=
+	     IB_LINK_LAYER_ETHERNET))
+		goto out;
+
+	if (attr_mask & IB_QP_PKEY_INDEX) {
+		int p = attr_mask & IB_QP_PORT ? attr->port_num : qp->port;
+		if (attr->pkey_index >= dev->dev->caps.pkey_table_len[p]) {
+			pr_debug("qpn 0x%x: invalid pkey index (%d) specified "
+				 "for transition %d to %d. qp_type %d\n",
+				 ibqp->qp_num, attr->pkey_index, cur_state,
+				 new_state, ibqp->qp_type);
+			goto out;
+		}
+	}
+
+	if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC &&
+	    attr->max_rd_atomic > dev->dev->caps.max_qp_init_rdma) {
+		pr_debug("qpn 0x%x: max_rd_atomic (%d) too large. "
+			 "Transition %d to %d. qp_type %d\n",
+			 ibqp->qp_num, attr->max_rd_atomic, cur_state,
+			 new_state, ibqp->qp_type);
+		goto out;
+	}
+
+	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC &&
+	    attr->max_dest_rd_atomic > dev->dev->caps.max_qp_dest_rdma) {
+		pr_debug("qpn 0x%x: max_dest_rd_atomic (%d) too large. "
+			 "Transition %d to %d. qp_type %d\n",
+			 ibqp->qp_num, attr->max_dest_rd_atomic, cur_state,
+			 new_state, ibqp->qp_type);
+		goto out;
+	}
+
+	if (cur_state == new_state && cur_state == IB_QPS_RESET) {
+		err = 0;
+		goto out;
+	}
+
+	err = __mlx4_ib_modify_qp(ibqp, attr, attr_mask, cur_state, new_state);
+
+out:
+	mutex_unlock(&qp->mutex);
+	return err;
+}
+
+static int vf_get_qp0_qkey(struct mlx4_dev *dev, int qpn, u32 *qkey)
+{
+	int i;
+	for (i = 0; i < dev->caps.num_ports; i++) {
+		if (qpn == dev->caps.qp0_proxy[i] ||
+		    qpn == dev->caps.qp0_tunnel[i]) {
+			*qkey = dev->caps.qp0_qkey[i];
+			return 0;
+		}
+	}
+	return -EINVAL;
+}
+
+static int build_sriov_qp0_header(struct mlx4_ib_sqp *sqp,
+				  struct ib_send_wr *wr,
+				  void *wqe, unsigned *mlx_seg_len)
+{
+	struct mlx4_ib_dev *mdev = to_mdev(sqp->qp.ibqp.device);
+	struct ib_device *ib_dev = &mdev->ib_dev;
+	struct mlx4_wqe_mlx_seg *mlx = wqe;
+	struct mlx4_wqe_inline_seg *inl = wqe + sizeof *mlx;
+	struct mlx4_ib_ah *ah = to_mah(wr->wr.ud.ah);
+	u16 pkey;
+	u32 qkey;
+	int send_size;
+	int header_size;
+	int spc;
+	int i;
+
+	if (wr->opcode != IB_WR_SEND)
+		return -EINVAL;
+
+	send_size = 0;
+
+	for (i = 0; i < wr->num_sge; ++i)
+		send_size += wr->sg_list[i].length;
+
+	/* for proxy-qp0 sends, need to add in size of tunnel header */
+	/* for tunnel-qp0 sends, tunnel header is already in s/g list */
+	if (sqp->qp.mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_SMI_OWNER)
+		send_size += sizeof (struct mlx4_ib_tunnel_header);
+
+	ib_ud_header_init(send_size, 1, 0, 0, 0, 0, &sqp->ud_header);
+
+	if (sqp->qp.mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_SMI_OWNER) {
+		sqp->ud_header.lrh.service_level =
+			be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 28;
+		sqp->ud_header.lrh.destination_lid =
+			cpu_to_be16(ah->av.ib.g_slid & 0x7f);
+		sqp->ud_header.lrh.source_lid =
+			cpu_to_be16(ah->av.ib.g_slid & 0x7f);
+	}
+
+	mlx->flags &= cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
+
+	/* force loopback */
+	mlx->flags |= cpu_to_be32(MLX4_WQE_MLX_VL15 | 0x1 | MLX4_WQE_MLX_SLR);
+	mlx->rlid = sqp->ud_header.lrh.destination_lid;
+
+	sqp->ud_header.lrh.virtual_lane    = 0;
+	sqp->ud_header.bth.solicited_event = !!(wr->send_flags & IB_SEND_SOLICITED);
+	ib_get_cached_pkey(ib_dev, sqp->qp.port, 0, &pkey);
+	sqp->ud_header.bth.pkey = cpu_to_be16(pkey);
+	if (sqp->qp.mlx4_ib_qp_type == MLX4_IB_QPT_TUN_SMI_OWNER)
+		sqp->ud_header.bth.destination_qpn = cpu_to_be32(wr->wr.ud.remote_qpn);
+	else
+		sqp->ud_header.bth.destination_qpn =
+			cpu_to_be32(mdev->dev->caps.qp0_tunnel[sqp->qp.port - 1]);
+
+	sqp->ud_header.bth.psn = cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1));
+	if (mlx4_is_master(mdev->dev)) {
+		if (mlx4_get_parav_qkey(mdev->dev, sqp->qp.mqp.qpn, &qkey))
+			return -EINVAL;
+	} else {
+		if (vf_get_qp0_qkey(mdev->dev, sqp->qp.mqp.qpn, &qkey))
+			return -EINVAL;
+	}
+	sqp->ud_header.deth.qkey = cpu_to_be32(qkey);
+	sqp->ud_header.deth.source_qpn = cpu_to_be32(sqp->qp.mqp.qpn);
+
+	sqp->ud_header.bth.opcode        = IB_OPCODE_UD_SEND_ONLY;
+	sqp->ud_header.immediate_present = 0;
+
+	header_size = ib_ud_header_pack(&sqp->ud_header, sqp->header_buf);
+
+	/*
+	 * Inline data segments may not cross a 64 byte boundary.  If
+	 * our UD header is bigger than the space available up to the
+	 * next 64 byte boundary in the WQE, use two inline data
+	 * segments to hold the UD header.
+	 */
+	spc = MLX4_INLINE_ALIGN -
+	      ((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1));
+	if (header_size <= spc) {
+		inl->byte_count = cpu_to_be32(1 << 31 | header_size);
+		memcpy(inl + 1, sqp->header_buf, header_size);
+		i = 1;
+	} else {
+		inl->byte_count = cpu_to_be32(1 << 31 | spc);
+		memcpy(inl + 1, sqp->header_buf, spc);
+
+		inl = (void *) (inl + 1) + spc;
+		memcpy(inl + 1, sqp->header_buf + spc, header_size - spc);
+		/*
+		 * Need a barrier here to make sure all the data is
+		 * visible before the byte_count field is set.
+		 * Otherwise the HCA prefetcher could grab the 64-byte
+		 * chunk with this inline segment and get a valid (!=
+		 * 0xffffffff) byte count but stale data, and end up
+		 * generating a packet with bad headers.
+		 *
+		 * The first inline segment's byte_count field doesn't
+		 * need a barrier, because it comes after a
+		 * control/MLX segment and therefore is at an offset
+		 * of 16 mod 64.
+		 */
+		wmb();
+		inl->byte_count = cpu_to_be32(1 << 31 | (header_size - spc));
+		i = 2;
+	}
+
+	*mlx_seg_len =
+	ALIGN(i * sizeof (struct mlx4_wqe_inline_seg) + header_size, 16);
+	return 0;
+}
+
+static void mlx4_u64_to_smac(u8 *dst_mac, u64 src_mac)
+{
+	int i;
+
+	for (i = ETH_ALEN; i; i--) {
+		dst_mac[i - 1] = src_mac & 0xff;
+		src_mac >>= 8;
+	}
+}
+
+static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr,
+			    void *wqe, unsigned *mlx_seg_len)
+{
+	struct ib_device *ib_dev = sqp->qp.ibqp.device;
+	struct mlx4_wqe_mlx_seg *mlx = wqe;
+	struct mlx4_wqe_ctrl_seg *ctrl = wqe;
+	struct mlx4_wqe_inline_seg *inl = wqe + sizeof *mlx;
+	struct mlx4_ib_ah *ah = to_mah(wr->wr.ud.ah);
+	union ib_gid sgid;
+	u16 pkey;
+	int send_size;
+	int header_size;
+	int spc;
+	int i;
+	int err = 0;
+	u16 vlan = 0xffff;
+	bool is_eth;
+	bool is_vlan = false;
+	bool is_grh;
+
+	send_size = 0;
+	for (i = 0; i < wr->num_sge; ++i)
+		send_size += wr->sg_list[i].length;
+
+	is_eth = rdma_port_get_link_layer(sqp->qp.ibqp.device, sqp->qp.port) == IB_LINK_LAYER_ETHERNET;
+	is_grh = mlx4_ib_ah_grh_present(ah);
+	if (is_eth) {
+		if (mlx4_is_mfunc(to_mdev(ib_dev)->dev)) {
+			/* When multi-function is enabled, the ib_core gid
+			 * indexes don't necessarily match the hw ones, so
+			 * we must use our own cache */
+			err = mlx4_get_roce_gid_from_slave(to_mdev(ib_dev)->dev,
+							   be32_to_cpu(ah->av.ib.port_pd) >> 24,
+							   ah->av.ib.gid_index, &sgid.raw[0]);
+			if (err)
+				return err;
+		} else  {
+			err = ib_get_cached_gid(ib_dev,
+						be32_to_cpu(ah->av.ib.port_pd) >> 24,
+						ah->av.ib.gid_index, &sgid);
+			if (err)
+				return err;
+		}
+
+		if (ah->av.eth.vlan != cpu_to_be16(0xffff)) {
+			vlan = be16_to_cpu(ah->av.eth.vlan) & 0x0fff;
+			is_vlan = 1;
+		}
+	}
+	ib_ud_header_init(send_size, !is_eth, is_eth, is_vlan, is_grh, 0, &sqp->ud_header);
+
+	if (!is_eth) {
+		sqp->ud_header.lrh.service_level =
+			be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 28;
+		sqp->ud_header.lrh.destination_lid = ah->av.ib.dlid;
+		sqp->ud_header.lrh.source_lid = cpu_to_be16(ah->av.ib.g_slid & 0x7f);
+	}
+
+	if (is_grh) {
+		sqp->ud_header.grh.traffic_class =
+			(be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 20) & 0xff;
+		sqp->ud_header.grh.flow_label    =
+			ah->av.ib.sl_tclass_flowlabel & cpu_to_be32(0xfffff);
+		sqp->ud_header.grh.hop_limit     = ah->av.ib.hop_limit;
+		if (is_eth)
+			memcpy(sqp->ud_header.grh.source_gid.raw, sgid.raw, 16);
+		else {
+		if (mlx4_is_mfunc(to_mdev(ib_dev)->dev)) {
+			/* When multi-function is enabled, the ib_core gid
+			 * indexes don't necessarily match the hw ones, so
+			 * we must use our own cache */
+			sqp->ud_header.grh.source_gid.global.subnet_prefix =
+				to_mdev(ib_dev)->sriov.demux[sqp->qp.port - 1].
+						       subnet_prefix;
+			sqp->ud_header.grh.source_gid.global.interface_id =
+				to_mdev(ib_dev)->sriov.demux[sqp->qp.port - 1].
+					       guid_cache[ah->av.ib.gid_index];
+		} else
+			ib_get_cached_gid(ib_dev,
+					  be32_to_cpu(ah->av.ib.port_pd) >> 24,
+					  ah->av.ib.gid_index,
+					  &sqp->ud_header.grh.source_gid);
+		}
+		memcpy(sqp->ud_header.grh.destination_gid.raw,
+		       ah->av.ib.dgid, 16);
+	}
+
+	mlx->flags &= cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
+
+	if (!is_eth) {
+		mlx->flags |= cpu_to_be32((!sqp->qp.ibqp.qp_num ? MLX4_WQE_MLX_VL15 : 0) |
+					  (sqp->ud_header.lrh.destination_lid ==
+					   IB_LID_PERMISSIVE ? MLX4_WQE_MLX_SLR : 0) |
+					  (sqp->ud_header.lrh.service_level << 8));
+		if (ah->av.ib.port_pd & cpu_to_be32(0x80000000))
+			mlx->flags |= cpu_to_be32(0x1); /* force loopback */
+		mlx->rlid = sqp->ud_header.lrh.destination_lid;
+	}
+
+	switch (wr->opcode) {
+	case IB_WR_SEND:
+		sqp->ud_header.bth.opcode	 = IB_OPCODE_UD_SEND_ONLY;
+		sqp->ud_header.immediate_present = 0;
+		break;
+	case IB_WR_SEND_WITH_IMM:
+		sqp->ud_header.bth.opcode	 = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE;
+		sqp->ud_header.immediate_present = 1;
+		sqp->ud_header.immediate_data    = wr->ex.imm_data;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (is_eth) {
+		struct in6_addr in6;
+
+		u16 pcp = (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 29) << 13;
+
+		mlx->sched_prio = cpu_to_be16(pcp);
+
+		memcpy(sqp->ud_header.eth.dmac_h, ah->av.eth.mac, 6);
+		/* FIXME: cache smac value? */
+		memcpy(&ctrl->srcrb_flags16[0], ah->av.eth.mac, 2);
+		memcpy(&ctrl->imm, ah->av.eth.mac + 2, 4);
+		memcpy(&in6, sgid.raw, sizeof(in6));
+
+		if (!mlx4_is_mfunc(to_mdev(ib_dev)->dev)) {
+			u64 mac = atomic64_read(&to_mdev(ib_dev)->iboe.mac[sqp->qp.port - 1]);
+			u8 smac[ETH_ALEN];
+
+			mlx4_u64_to_smac(smac, mac);
+			memcpy(sqp->ud_header.eth.smac_h, smac, ETH_ALEN);
+		} else {
+			/* use the src mac of the tunnel */
+			memcpy(sqp->ud_header.eth.smac_h, ah->av.eth.s_mac, ETH_ALEN);
+		}
+
+		if (!memcmp(sqp->ud_header.eth.smac_h, sqp->ud_header.eth.dmac_h, 6))
+			mlx->flags |= cpu_to_be32(MLX4_WQE_CTRL_FORCE_LOOPBACK);
+		if (!is_vlan) {
+			sqp->ud_header.eth.type = cpu_to_be16(MLX4_IB_IBOE_ETHERTYPE);
+		} else {
+			sqp->ud_header.vlan.type = cpu_to_be16(MLX4_IB_IBOE_ETHERTYPE);
+			sqp->ud_header.vlan.tag = cpu_to_be16(vlan | pcp);
+		}
+	} else {
+		sqp->ud_header.lrh.virtual_lane    = !sqp->qp.ibqp.qp_num ? 15 : 0;
+		if (sqp->ud_header.lrh.destination_lid == IB_LID_PERMISSIVE)
+			sqp->ud_header.lrh.source_lid = IB_LID_PERMISSIVE;
+	}
+	sqp->ud_header.bth.solicited_event = !!(wr->send_flags & IB_SEND_SOLICITED);
+	if (!sqp->qp.ibqp.qp_num)
+		ib_get_cached_pkey(ib_dev, sqp->qp.port, sqp->pkey_index, &pkey);
+	else
+		ib_get_cached_pkey(ib_dev, sqp->qp.port, wr->wr.ud.pkey_index, &pkey);
+	sqp->ud_header.bth.pkey = cpu_to_be16(pkey);
+	sqp->ud_header.bth.destination_qpn = cpu_to_be32(wr->wr.ud.remote_qpn);
+	sqp->ud_header.bth.psn = cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1));
+	sqp->ud_header.deth.qkey = cpu_to_be32(wr->wr.ud.remote_qkey & 0x80000000 ?
+					       sqp->qkey : wr->wr.ud.remote_qkey);
+	sqp->ud_header.deth.source_qpn = cpu_to_be32(sqp->qp.ibqp.qp_num);
+
+	header_size = ib_ud_header_pack(&sqp->ud_header, sqp->header_buf);
+
+	if (0) {
+		pr_err("built UD header of size %d:\n", header_size);
+		for (i = 0; i < header_size / 4; ++i) {
+			if (i % 8 == 0)
+				pr_err("  [%02x] ", i * 4);
+			pr_cont(" %08x",
+				be32_to_cpu(((__be32 *) sqp->header_buf)[i]));
+			if ((i + 1) % 8 == 0)
+				pr_cont("\n");
+		}
+		pr_err("\n");
+	}
+
+	/*
+	 * Inline data segments may not cross a 64 byte boundary.  If
+	 * our UD header is bigger than the space available up to the
+	 * next 64 byte boundary in the WQE, use two inline data
+	 * segments to hold the UD header.
+	 */
+	spc = MLX4_INLINE_ALIGN -
+		((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1));
+	if (header_size <= spc) {
+		inl->byte_count = cpu_to_be32(1 << 31 | header_size);
+		memcpy(inl + 1, sqp->header_buf, header_size);
+		i = 1;
+	} else {
+		inl->byte_count = cpu_to_be32(1 << 31 | spc);
+		memcpy(inl + 1, sqp->header_buf, spc);
+
+		inl = (void *) (inl + 1) + spc;
+		memcpy(inl + 1, sqp->header_buf + spc, header_size - spc);
+		/*
+		 * Need a barrier here to make sure all the data is
+		 * visible before the byte_count field is set.
+		 * Otherwise the HCA prefetcher could grab the 64-byte
+		 * chunk with this inline segment and get a valid (!=
+		 * 0xffffffff) byte count but stale data, and end up
+		 * generating a packet with bad headers.
+		 *
+		 * The first inline segment's byte_count field doesn't
+		 * need a barrier, because it comes after a
+		 * control/MLX segment and therefore is at an offset
+		 * of 16 mod 64.
+		 */
+		wmb();
+		inl->byte_count = cpu_to_be32(1 << 31 | (header_size - spc));
+		i = 2;
+	}
+
+	*mlx_seg_len =
+		ALIGN(i * sizeof (struct mlx4_wqe_inline_seg) + header_size, 16);
+	return 0;
+}
+
+static int mlx4_wq_overflow(struct mlx4_ib_wq *wq, int nreq, struct ib_cq *ib_cq)
+{
+	unsigned cur;
+	struct mlx4_ib_cq *cq;
+
+	cur = wq->head - wq->tail;
+	if (likely(cur + nreq < wq->max_post))
+		return 0;
+
+	cq = to_mcq(ib_cq);
+	spin_lock(&cq->lock);
+	cur = wq->head - wq->tail;
+	spin_unlock(&cq->lock);
+
+	return cur + nreq >= wq->max_post;
+}
+
+static __be32 convert_access(int acc)
+{
+	return (acc & IB_ACCESS_REMOTE_ATOMIC ?
+		cpu_to_be32(MLX4_WQE_FMR_AND_BIND_PERM_ATOMIC)       : 0) |
+	       (acc & IB_ACCESS_REMOTE_WRITE  ?
+		cpu_to_be32(MLX4_WQE_FMR_AND_BIND_PERM_REMOTE_WRITE) : 0) |
+	       (acc & IB_ACCESS_REMOTE_READ   ?
+		cpu_to_be32(MLX4_WQE_FMR_AND_BIND_PERM_REMOTE_READ)  : 0) |
+	       (acc & IB_ACCESS_LOCAL_WRITE   ? cpu_to_be32(MLX4_WQE_FMR_PERM_LOCAL_WRITE)  : 0) |
+		cpu_to_be32(MLX4_WQE_FMR_PERM_LOCAL_READ);
+}
+
+static void set_fmr_seg(struct mlx4_wqe_fmr_seg *fseg, struct ib_send_wr *wr)
+{
+	struct mlx4_ib_fast_reg_page_list *mfrpl = to_mfrpl(wr->wr.fast_reg.page_list);
+	int i;
+
+	for (i = 0; i < wr->wr.fast_reg.page_list_len; ++i)
+		mfrpl->mapped_page_list[i] =
+			cpu_to_be64(wr->wr.fast_reg.page_list->page_list[i] |
+				    MLX4_MTT_FLAG_PRESENT);
+
+	fseg->flags		= convert_access(wr->wr.fast_reg.access_flags);
+	fseg->mem_key		= cpu_to_be32(wr->wr.fast_reg.rkey);
+	fseg->buf_list		= cpu_to_be64(mfrpl->map);
+	fseg->start_addr	= cpu_to_be64(wr->wr.fast_reg.iova_start);
+	fseg->reg_len		= cpu_to_be64(wr->wr.fast_reg.length);
+	fseg->offset		= 0; /* XXX -- is this just for ZBVA? */
+	fseg->page_size		= cpu_to_be32(wr->wr.fast_reg.page_shift);
+	fseg->reserved[0]	= 0;
+	fseg->reserved[1]	= 0;
+}
+
+static void set_bind_seg(struct mlx4_wqe_bind_seg *bseg, struct ib_send_wr *wr)
+{
+	bseg->flags1 =
+		convert_access(wr->wr.bind_mw.bind_info.mw_access_flags) &
+		cpu_to_be32(MLX4_WQE_FMR_AND_BIND_PERM_REMOTE_READ  |
+			    MLX4_WQE_FMR_AND_BIND_PERM_REMOTE_WRITE |
+			    MLX4_WQE_FMR_AND_BIND_PERM_ATOMIC);
+	bseg->flags2 = 0;
+	if (wr->wr.bind_mw.mw->type == IB_MW_TYPE_2)
+		bseg->flags2 |= cpu_to_be32(MLX4_WQE_BIND_TYPE_2);
+	if (wr->wr.bind_mw.bind_info.mw_access_flags & IB_ZERO_BASED)
+		bseg->flags2 |= cpu_to_be32(MLX4_WQE_BIND_ZERO_BASED);
+	bseg->new_rkey = cpu_to_be32(wr->wr.bind_mw.rkey);
+	bseg->lkey = cpu_to_be32(wr->wr.bind_mw.bind_info.mr->lkey);
+	bseg->addr = cpu_to_be64(wr->wr.bind_mw.bind_info.addr);
+	bseg->length = cpu_to_be64(wr->wr.bind_mw.bind_info.length);
+}
+
+static void set_local_inv_seg(struct mlx4_wqe_local_inval_seg *iseg, u32 rkey)
+{
+	memset(iseg, 0, sizeof(*iseg));
+	iseg->mem_key = cpu_to_be32(rkey);
+}
+
+static __always_inline void set_raddr_seg(struct mlx4_wqe_raddr_seg *rseg,
+					  u64 remote_addr, u32 rkey)
+{
+	rseg->raddr    = cpu_to_be64(remote_addr);
+	rseg->rkey     = cpu_to_be32(rkey);
+	rseg->reserved = 0;
+}
+
+static void set_atomic_seg(struct mlx4_wqe_atomic_seg *aseg, struct ib_send_wr *wr)
+{
+	if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
+		aseg->swap_add = cpu_to_be64(wr->wr.atomic.swap);
+		aseg->compare  = cpu_to_be64(wr->wr.atomic.compare_add);
+	} else if (wr->opcode == IB_WR_MASKED_ATOMIC_FETCH_AND_ADD) {
+		aseg->swap_add = cpu_to_be64(wr->wr.atomic.compare_add);
+		aseg->compare  = cpu_to_be64(wr->wr.atomic.compare_add_mask);
+	} else {
+		aseg->swap_add = cpu_to_be64(wr->wr.atomic.compare_add);
+		aseg->compare  = 0;
+	}
+
+}
+
+static void set_masked_atomic_seg(struct mlx4_wqe_masked_atomic_seg *aseg,
+				  struct ib_send_wr *wr)
+{
+	aseg->swap_add		= cpu_to_be64(wr->wr.atomic.swap);
+	aseg->swap_add_mask	= cpu_to_be64(wr->wr.atomic.swap_mask);
+	aseg->compare		= cpu_to_be64(wr->wr.atomic.compare_add);
+	aseg->compare_mask	= cpu_to_be64(wr->wr.atomic.compare_add_mask);
+}
+
+static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg,
+			     struct ib_send_wr *wr)
+{
+	memcpy(dseg->av, &to_mah(wr->wr.ud.ah)->av, sizeof (struct mlx4_av));
+	dseg->dqpn = cpu_to_be32(wr->wr.ud.remote_qpn);
+	dseg->qkey = cpu_to_be32(wr->wr.ud.remote_qkey);
+	dseg->vlan = to_mah(wr->wr.ud.ah)->av.eth.vlan;
+	memcpy(dseg->mac, to_mah(wr->wr.ud.ah)->av.eth.mac, 6);
+}
+
+static void set_tunnel_datagram_seg(struct mlx4_ib_dev *dev,
+				    struct mlx4_wqe_datagram_seg *dseg,
+				    struct ib_send_wr *wr,
+				    enum mlx4_ib_qp_type qpt)
+{
+	union mlx4_ext_av *av = &to_mah(wr->wr.ud.ah)->av;
+	struct mlx4_av sqp_av = {0};
+	int port = *((u8 *) &av->ib.port_pd) & 0x3;
+
+	/* force loopback */
+	sqp_av.port_pd = av->ib.port_pd | cpu_to_be32(0x80000000);
+	sqp_av.g_slid = av->ib.g_slid & 0x7f; /* no GRH */
+	sqp_av.sl_tclass_flowlabel = av->ib.sl_tclass_flowlabel &
+			cpu_to_be32(0xf0000000);
+
+	memcpy(dseg->av, &sqp_av, sizeof (struct mlx4_av));
+	if (qpt == MLX4_IB_QPT_PROXY_GSI)
+		dseg->dqpn = cpu_to_be32(dev->dev->caps.qp1_tunnel[port - 1]);
+	else
+		dseg->dqpn = cpu_to_be32(dev->dev->caps.qp0_tunnel[port - 1]);
+	/* Use QKEY from the QP context, which is set by master */
+	dseg->qkey = cpu_to_be32(IB_QP_SET_QKEY);
+}
+
+static void build_tunnel_header(struct ib_send_wr *wr, void *wqe, unsigned *mlx_seg_len)
+{
+	struct mlx4_wqe_inline_seg *inl = wqe;
+	struct mlx4_ib_tunnel_header hdr;
+	struct mlx4_ib_ah *ah = to_mah(wr->wr.ud.ah);
+	int spc;
+	int i;
+
+	memcpy(&hdr.av, &ah->av, sizeof hdr.av);
+	hdr.remote_qpn = cpu_to_be32(wr->wr.ud.remote_qpn);
+	hdr.pkey_index = cpu_to_be16(wr->wr.ud.pkey_index);
+	hdr.qkey = cpu_to_be32(wr->wr.ud.remote_qkey);
+	memcpy(hdr.mac, ah->av.eth.mac, 6);
+	hdr.vlan = ah->av.eth.vlan;
+
+	spc = MLX4_INLINE_ALIGN -
+		((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1));
+	if (sizeof (hdr) <= spc) {
+		memcpy(inl + 1, &hdr, sizeof (hdr));
+		wmb();
+		inl->byte_count = cpu_to_be32(1 << 31 | sizeof (hdr));
+		i = 1;
+	} else {
+		memcpy(inl + 1, &hdr, spc);
+		wmb();
+		inl->byte_count = cpu_to_be32(1 << 31 | spc);
+
+		inl = (void *) (inl + 1) + spc;
+		memcpy(inl + 1, (void *) &hdr + spc, sizeof (hdr) - spc);
+		wmb();
+		inl->byte_count = cpu_to_be32(1 << 31 | (sizeof (hdr) - spc));
+		i = 2;
+	}
+
+	*mlx_seg_len =
+		ALIGN(i * sizeof (struct mlx4_wqe_inline_seg) + sizeof (hdr), 16);
+}
+
+static void set_mlx_icrc_seg(void *dseg)
+{
+	u32 *t = dseg;
+	struct mlx4_wqe_inline_seg *iseg = dseg;
+
+	t[1] = 0;
+
+	/*
+	 * Need a barrier here before writing the byte_count field to
+	 * make sure that all the data is visible before the
+	 * byte_count field is set.  Otherwise, if the segment begins
+	 * a new cacheline, the HCA prefetcher could grab the 64-byte
+	 * chunk and get a valid (!= * 0xffffffff) byte count but
+	 * stale data, and end up sending the wrong data.
+	 */
+	wmb();
+
+	iseg->byte_count = cpu_to_be32((1 << 31) | 4);
+}
+
+static void set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ib_sge *sg)
+{
+	dseg->lkey       = cpu_to_be32(sg->lkey);
+	dseg->addr       = cpu_to_be64(sg->addr);
+
+	/*
+	 * Need a barrier here before writing the byte_count field to
+	 * make sure that all the data is visible before the
+	 * byte_count field is set.  Otherwise, if the segment begins
+	 * a new cacheline, the HCA prefetcher could grab the 64-byte
+	 * chunk and get a valid (!= * 0xffffffff) byte count but
+	 * stale data, and end up sending the wrong data.
+	 */
+	wmb();
+
+	dseg->byte_count = cpu_to_be32(sg->length);
+}
+
+static void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ib_sge *sg)
+{
+	dseg->byte_count = cpu_to_be32(sg->length);
+	dseg->lkey       = cpu_to_be32(sg->lkey);
+	dseg->addr       = cpu_to_be64(sg->addr);
+}
+
+static int build_lso_seg(struct mlx4_wqe_lso_seg *wqe, struct ib_send_wr *wr,
+			 struct mlx4_ib_qp *qp, unsigned *lso_seg_len,
+			 __be32 *lso_hdr_sz, __be32 *blh)
+{
+	unsigned halign = ALIGN(sizeof *wqe + wr->wr.ud.hlen, 16);
+
+	if (unlikely(halign > MLX4_IB_CACHE_LINE_SIZE))
+		*blh = cpu_to_be32(1 << 6);
+
+	if (unlikely(!(qp->flags & MLX4_IB_QP_LSO) &&
+		     wr->num_sge > qp->sq.max_gs - (halign >> 4)))
+		return -EINVAL;
+
+	memcpy(wqe->header, wr->wr.ud.header, wr->wr.ud.hlen);
+
+	*lso_hdr_sz  = cpu_to_be32((wr->wr.ud.mss - wr->wr.ud.hlen) << 16 |
+				   wr->wr.ud.hlen);
+	*lso_seg_len = halign;
+	return 0;
+}
+
+static __be32 send_ieth(struct ib_send_wr *wr)
+{
+	switch (wr->opcode) {
+	case IB_WR_SEND_WITH_IMM:
+	case IB_WR_RDMA_WRITE_WITH_IMM:
+		return wr->ex.imm_data;
+
+	case IB_WR_SEND_WITH_INV:
+		return cpu_to_be32(wr->ex.invalidate_rkey);
+
+	default:
+		return 0;
+	}
+}
+
+static void add_zero_len_inline(void *wqe)
+{
+	struct mlx4_wqe_inline_seg *inl = wqe;
+	memset(wqe, 0, 16);
+	inl->byte_count = cpu_to_be32(1 << 31);
+}
+
+int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+		      struct ib_send_wr **bad_wr)
+{
+	struct mlx4_ib_qp *qp = to_mqp(ibqp);
+	void *wqe;
+	struct mlx4_wqe_ctrl_seg *ctrl;
+	struct mlx4_wqe_data_seg *dseg;
+	unsigned long flags;
+	int nreq;
+	int err = 0;
+	unsigned ind;
+	int uninitialized_var(stamp);
+	int uninitialized_var(size);
+	unsigned uninitialized_var(seglen);
+	__be32 dummy;
+	__be32 *lso_wqe;
+	__be32 uninitialized_var(lso_hdr_sz);
+	__be32 blh;
+	int i;
+
+	spin_lock_irqsave(&qp->sq.lock, flags);
+
+	ind = qp->sq_next_wqe;
+
+	for (nreq = 0; wr; ++nreq, wr = wr->next) {
+		lso_wqe = &dummy;
+		blh = 0;
+
+		if (mlx4_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) {
+			err = -ENOMEM;
+			*bad_wr = wr;
+			goto out;
+		}
+
+		if (unlikely(wr->num_sge > qp->sq.max_gs)) {
+			err = -EINVAL;
+			*bad_wr = wr;
+			goto out;
+		}
+
+		ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
+		qp->sq.wrid[(qp->sq.head + nreq) & (qp->sq.wqe_cnt - 1)] = wr->wr_id;
+
+		ctrl->srcrb_flags =
+			(wr->send_flags & IB_SEND_SIGNALED ?
+			 cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE) : 0) |
+			(wr->send_flags & IB_SEND_SOLICITED ?
+			 cpu_to_be32(MLX4_WQE_CTRL_SOLICITED) : 0) |
+			((wr->send_flags & IB_SEND_IP_CSUM) ?
+			 cpu_to_be32(MLX4_WQE_CTRL_IP_CSUM |
+				     MLX4_WQE_CTRL_TCP_UDP_CSUM) : 0) |
+			qp->sq_signal_bits;
+
+		ctrl->imm = send_ieth(wr);
+
+		wqe += sizeof *ctrl;
+		size = sizeof *ctrl / 16;
+
+		switch (qp->mlx4_ib_qp_type) {
+		case MLX4_IB_QPT_RC:
+		case MLX4_IB_QPT_UC:
+			switch (wr->opcode) {
+			case IB_WR_ATOMIC_CMP_AND_SWP:
+			case IB_WR_ATOMIC_FETCH_AND_ADD:
+			case IB_WR_MASKED_ATOMIC_FETCH_AND_ADD:
+				set_raddr_seg(wqe, wr->wr.atomic.remote_addr,
+					      wr->wr.atomic.rkey);
+				wqe  += sizeof (struct mlx4_wqe_raddr_seg);
+
+				set_atomic_seg(wqe, wr);
+				wqe  += sizeof (struct mlx4_wqe_atomic_seg);
+
+				size += (sizeof (struct mlx4_wqe_raddr_seg) +
+					 sizeof (struct mlx4_wqe_atomic_seg)) / 16;
+
+				break;
+
+			case IB_WR_MASKED_ATOMIC_CMP_AND_SWP:
+				set_raddr_seg(wqe, wr->wr.atomic.remote_addr,
+					      wr->wr.atomic.rkey);
+				wqe  += sizeof (struct mlx4_wqe_raddr_seg);
+
+				set_masked_atomic_seg(wqe, wr);
+				wqe  += sizeof (struct mlx4_wqe_masked_atomic_seg);
+
+				size += (sizeof (struct mlx4_wqe_raddr_seg) +
+					 sizeof (struct mlx4_wqe_masked_atomic_seg)) / 16;
+
+				break;
+
+			case IB_WR_RDMA_READ:
+			case IB_WR_RDMA_WRITE:
+			case IB_WR_RDMA_WRITE_WITH_IMM:
+				set_raddr_seg(wqe, wr->wr.rdma.remote_addr,
+					      wr->wr.rdma.rkey);
+				wqe  += sizeof (struct mlx4_wqe_raddr_seg);
+				size += sizeof (struct mlx4_wqe_raddr_seg) / 16;
+				break;
+
+			case IB_WR_LOCAL_INV:
+				ctrl->srcrb_flags |=
+					cpu_to_be32(MLX4_WQE_CTRL_STRONG_ORDER);
+				set_local_inv_seg(wqe, wr->ex.invalidate_rkey);
+				wqe  += sizeof (struct mlx4_wqe_local_inval_seg);
+				size += sizeof (struct mlx4_wqe_local_inval_seg) / 16;
+				break;
+
+			case IB_WR_FAST_REG_MR:
+				ctrl->srcrb_flags |=
+					cpu_to_be32(MLX4_WQE_CTRL_STRONG_ORDER);
+				set_fmr_seg(wqe, wr);
+				wqe  += sizeof (struct mlx4_wqe_fmr_seg);
+				size += sizeof (struct mlx4_wqe_fmr_seg) / 16;
+				break;
+
+			case IB_WR_BIND_MW:
+				ctrl->srcrb_flags |=
+					cpu_to_be32(MLX4_WQE_CTRL_STRONG_ORDER);
+				set_bind_seg(wqe, wr);
+				wqe  += sizeof(struct mlx4_wqe_bind_seg);
+				size += sizeof(struct mlx4_wqe_bind_seg) / 16;
+				break;
+			default:
+				/* No extra segments required for sends */
+				break;
+			}
+			break;
+
+		case MLX4_IB_QPT_TUN_SMI_OWNER:
+			err =  build_sriov_qp0_header(to_msqp(qp), wr, ctrl, &seglen);
+			if (unlikely(err)) {
+				*bad_wr = wr;
+				goto out;
+			}
+			wqe  += seglen;
+			size += seglen / 16;
+			break;
+		case MLX4_IB_QPT_TUN_SMI:
+		case MLX4_IB_QPT_TUN_GSI:
+			/* this is a UD qp used in MAD responses to slaves. */
+			set_datagram_seg(wqe, wr);
+			/* set the forced-loopback bit in the data seg av */
+			*(__be32 *) wqe |= cpu_to_be32(0x80000000);
+			wqe  += sizeof (struct mlx4_wqe_datagram_seg);
+			size += sizeof (struct mlx4_wqe_datagram_seg) / 16;
+			break;
+		case MLX4_IB_QPT_UD:
+			set_datagram_seg(wqe, wr);
+			wqe  += sizeof (struct mlx4_wqe_datagram_seg);
+			size += sizeof (struct mlx4_wqe_datagram_seg) / 16;
+
+			if (wr->opcode == IB_WR_LSO) {
+				err = build_lso_seg(wqe, wr, qp, &seglen, &lso_hdr_sz, &blh);
+				if (unlikely(err)) {
+					*bad_wr = wr;
+					goto out;
+				}
+				lso_wqe = (__be32 *) wqe;
+				wqe  += seglen;
+				size += seglen / 16;
+			}
+			break;
+
+		case MLX4_IB_QPT_PROXY_SMI_OWNER:
+			err = build_sriov_qp0_header(to_msqp(qp), wr, ctrl, &seglen);
+			if (unlikely(err)) {
+				*bad_wr = wr;
+				goto out;
+			}
+			wqe  += seglen;
+			size += seglen / 16;
+			/* to start tunnel header on a cache-line boundary */
+			add_zero_len_inline(wqe);
+			wqe += 16;
+			size++;
+			build_tunnel_header(wr, wqe, &seglen);
+			wqe  += seglen;
+			size += seglen / 16;
+			break;
+		case MLX4_IB_QPT_PROXY_SMI:
+		case MLX4_IB_QPT_PROXY_GSI:
+			/* If we are tunneling special qps, this is a UD qp.
+			 * In this case we first add a UD segment targeting
+			 * the tunnel qp, and then add a header with address
+			 * information */
+			set_tunnel_datagram_seg(to_mdev(ibqp->device), wqe, wr,
+						qp->mlx4_ib_qp_type);
+			wqe  += sizeof (struct mlx4_wqe_datagram_seg);
+			size += sizeof (struct mlx4_wqe_datagram_seg) / 16;
+			build_tunnel_header(wr, wqe, &seglen);
+			wqe  += seglen;
+			size += seglen / 16;
+			break;
+
+		case MLX4_IB_QPT_SMI:
+		case MLX4_IB_QPT_GSI:
+			err = build_mlx_header(to_msqp(qp), wr, ctrl, &seglen);
+			if (unlikely(err)) {
+				*bad_wr = wr;
+				goto out;
+			}
+			wqe  += seglen;
+			size += seglen / 16;
+			break;
+
+		default:
+			break;
+		}
+
+		/*
+		 * Write data segments in reverse order, so as to
+		 * overwrite cacheline stamp last within each
+		 * cacheline.  This avoids issues with WQE
+		 * prefetching.
+		 */
+
+		dseg = wqe;
+		dseg += wr->num_sge - 1;
+		size += wr->num_sge * (sizeof (struct mlx4_wqe_data_seg) / 16);
+
+		/* Add one more inline data segment for ICRC for MLX sends */
+		if (unlikely(qp->mlx4_ib_qp_type == MLX4_IB_QPT_SMI ||
+			     qp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI ||
+			     qp->mlx4_ib_qp_type &
+			     (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_TUN_SMI_OWNER))) {
+			set_mlx_icrc_seg(dseg + 1);
+			size += sizeof (struct mlx4_wqe_data_seg) / 16;
+		}
+
+		for (i = wr->num_sge - 1; i >= 0; --i, --dseg)
+			set_data_seg(dseg, wr->sg_list + i);
+
+		/*
+		 * Possibly overwrite stamping in cacheline with LSO
+		 * segment only after making sure all data segments
+		 * are written.
+		 */
+		wmb();
+		*lso_wqe = lso_hdr_sz;
+
+		ctrl->fence_size = (wr->send_flags & IB_SEND_FENCE ?
+				    MLX4_WQE_CTRL_FENCE : 0) | size;
+
+		/*
+		 * Make sure descriptor is fully written before
+		 * setting ownership bit (because HW can start
+		 * executing as soon as we do).
+		 */
+		wmb();
+
+		if (wr->opcode < 0 || wr->opcode >= ARRAY_SIZE(mlx4_ib_opcode)) {
+			*bad_wr = wr;
+			err = -EINVAL;
+			goto out;
+		}
+
+		ctrl->owner_opcode = mlx4_ib_opcode[wr->opcode] |
+			(ind & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0) | blh;
+
+		stamp = ind + qp->sq_spare_wqes;
+		ind += DIV_ROUND_UP(size * 16, 1U << qp->sq.wqe_shift);
+
+		/*
+		 * We can improve latency by not stamping the last
+		 * send queue WQE until after ringing the doorbell, so
+		 * only stamp here if there are still more WQEs to post.
+		 *
+		 * Same optimization applies to padding with NOP wqe
+		 * in case of WQE shrinking (used to prevent wrap-around
+		 * in the middle of WR).
+		 */
+		if (wr->next) {
+			stamp_send_wqe(qp, stamp, size * 16);
+			ind = pad_wraparound(qp, ind);
+		}
+	}
+
+out:
+	if (likely(nreq)) {
+		qp->sq.head += nreq;
+
+		/*
+		 * Make sure that descriptors are written before
+		 * doorbell record.
+		 */
+		wmb();
+
+		writel(qp->doorbell_qpn,
+		       to_mdev(ibqp->device)->uar_map + MLX4_SEND_DOORBELL);
+
+		/*
+		 * Make sure doorbells don't leak out of SQ spinlock
+		 * and reach the HCA out of order.
+		 */
+		mmiowb();
+
+		stamp_send_wqe(qp, stamp, size * 16);
+
+		ind = pad_wraparound(qp, ind);
+		qp->sq_next_wqe = ind;
+	}
+
+	spin_unlock_irqrestore(&qp->sq.lock, flags);
+
+	return err;
+}
+
+int mlx4_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+		      struct ib_recv_wr **bad_wr)
+{
+	struct mlx4_ib_qp *qp = to_mqp(ibqp);
+	struct mlx4_wqe_data_seg *scat;
+	unsigned long flags;
+	int err = 0;
+	int nreq;
+	int ind;
+	int max_gs;
+	int i;
+
+	max_gs = qp->rq.max_gs;
+	spin_lock_irqsave(&qp->rq.lock, flags);
+
+	ind = qp->rq.head & (qp->rq.wqe_cnt - 1);
+
+	for (nreq = 0; wr; ++nreq, wr = wr->next) {
+		if (mlx4_wq_overflow(&qp->rq, nreq, qp->ibqp.recv_cq)) {
+			err = -ENOMEM;
+			*bad_wr = wr;
+			goto out;
+		}
+
+		if (unlikely(wr->num_sge > qp->rq.max_gs)) {
+			err = -EINVAL;
+			*bad_wr = wr;
+			goto out;
+		}
+
+		scat = get_recv_wqe(qp, ind);
+
+		if (qp->mlx4_ib_qp_type & (MLX4_IB_QPT_PROXY_SMI_OWNER |
+		    MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI)) {
+			ib_dma_sync_single_for_device(ibqp->device,
+						      qp->sqp_proxy_rcv[ind].map,
+						      sizeof (struct mlx4_ib_proxy_sqp_hdr),
+						      DMA_FROM_DEVICE);
+			scat->byte_count =
+				cpu_to_be32(sizeof (struct mlx4_ib_proxy_sqp_hdr));
+			/* use dma lkey from upper layer entry */
+			scat->lkey = cpu_to_be32(wr->sg_list->lkey);
+			scat->addr = cpu_to_be64(qp->sqp_proxy_rcv[ind].map);
+			scat++;
+			max_gs--;
+		}
+
+		for (i = 0; i < wr->num_sge; ++i)
+			__set_data_seg(scat + i, wr->sg_list + i);
+
+		if (i < max_gs) {
+			scat[i].byte_count = 0;
+			scat[i].lkey       = cpu_to_be32(MLX4_INVALID_LKEY);
+			scat[i].addr       = 0;
+		}
+
+		qp->rq.wrid[ind] = wr->wr_id;
+
+		ind = (ind + 1) & (qp->rq.wqe_cnt - 1);
+	}
+
+out:
+	if (likely(nreq)) {
+		qp->rq.head += nreq;
+
+		/*
+		 * Make sure that descriptors are written before
+		 * doorbell record.
+		 */
+		wmb();
+
+		*qp->db.db = cpu_to_be32(qp->rq.head & 0xffff);
+	}
+
+	spin_unlock_irqrestore(&qp->rq.lock, flags);
+
+	return err;
+}
+
+static inline enum ib_qp_state to_ib_qp_state(enum mlx4_qp_state mlx4_state)
+{
+	switch (mlx4_state) {
+	case MLX4_QP_STATE_RST:      return IB_QPS_RESET;
+	case MLX4_QP_STATE_INIT:     return IB_QPS_INIT;
+	case MLX4_QP_STATE_RTR:      return IB_QPS_RTR;
+	case MLX4_QP_STATE_RTS:      return IB_QPS_RTS;
+	case MLX4_QP_STATE_SQ_DRAINING:
+	case MLX4_QP_STATE_SQD:      return IB_QPS_SQD;
+	case MLX4_QP_STATE_SQER:     return IB_QPS_SQE;
+	case MLX4_QP_STATE_ERR:      return IB_QPS_ERR;
+	default:		     return -1;
+	}
+}
+
+static inline enum ib_mig_state to_ib_mig_state(int mlx4_mig_state)
+{
+	switch (mlx4_mig_state) {
+	case MLX4_QP_PM_ARMED:		return IB_MIG_ARMED;
+	case MLX4_QP_PM_REARM:		return IB_MIG_REARM;
+	case MLX4_QP_PM_MIGRATED:	return IB_MIG_MIGRATED;
+	default: return -1;
+	}
+}
+
+static int to_ib_qp_access_flags(int mlx4_flags)
+{
+	int ib_flags = 0;
+
+	if (mlx4_flags & MLX4_QP_BIT_RRE)
+		ib_flags |= IB_ACCESS_REMOTE_READ;
+	if (mlx4_flags & MLX4_QP_BIT_RWE)
+		ib_flags |= IB_ACCESS_REMOTE_WRITE;
+	if (mlx4_flags & MLX4_QP_BIT_RAE)
+		ib_flags |= IB_ACCESS_REMOTE_ATOMIC;
+
+	return ib_flags;
+}
+
+static void to_ib_ah_attr(struct mlx4_ib_dev *ibdev, struct ib_ah_attr *ib_ah_attr,
+				struct mlx4_qp_path *path)
+{
+	struct mlx4_dev *dev = ibdev->dev;
+	int is_eth;
+
+	memset(ib_ah_attr, 0, sizeof *ib_ah_attr);
+	ib_ah_attr->port_num	  = path->sched_queue & 0x40 ? 2 : 1;
+
+	if (ib_ah_attr->port_num == 0 || ib_ah_attr->port_num > dev->caps.num_ports)
+		return;
+
+	is_eth = rdma_port_get_link_layer(&ibdev->ib_dev, ib_ah_attr->port_num) ==
+		IB_LINK_LAYER_ETHERNET;
+	if (is_eth)
+		ib_ah_attr->sl = ((path->sched_queue >> 3) & 0x7) |
+		((path->sched_queue & 4) << 1);
+	else
+		ib_ah_attr->sl = (path->sched_queue >> 2) & 0xf;
+
+	ib_ah_attr->dlid	  = be16_to_cpu(path->rlid);
+	ib_ah_attr->src_path_bits = path->grh_mylmc & 0x7f;
+	ib_ah_attr->static_rate   = path->static_rate ? path->static_rate - 5 : 0;
+	ib_ah_attr->ah_flags      = (path->grh_mylmc & (1 << 7)) ? IB_AH_GRH : 0;
+	if (ib_ah_attr->ah_flags) {
+		ib_ah_attr->grh.sgid_index = path->mgid_index;
+		ib_ah_attr->grh.hop_limit  = path->hop_limit;
+		ib_ah_attr->grh.traffic_class =
+			(be32_to_cpu(path->tclass_flowlabel) >> 20) & 0xff;
+		ib_ah_attr->grh.flow_label =
+			be32_to_cpu(path->tclass_flowlabel) & 0xfffff;
+		memcpy(ib_ah_attr->grh.dgid.raw,
+			path->rgid, sizeof ib_ah_attr->grh.dgid.raw);
+	}
+}
+
+int mlx4_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask,
+		     struct ib_qp_init_attr *qp_init_attr)
+{
+	struct mlx4_ib_dev *dev = to_mdev(ibqp->device);
+	struct mlx4_ib_qp *qp = to_mqp(ibqp);
+	struct mlx4_qp_context context;
+	int mlx4_state;
+	int err = 0;
+
+	mutex_lock(&qp->mutex);
+
+	if (qp->state == IB_QPS_RESET) {
+		qp_attr->qp_state = IB_QPS_RESET;
+		goto done;
+	}
+
+	err = mlx4_qp_query(dev->dev, &qp->mqp, &context);
+	if (err) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	mlx4_state = be32_to_cpu(context.flags) >> 28;
+
+	qp->state		     = to_ib_qp_state(mlx4_state);
+	qp_attr->qp_state	     = qp->state;
+	qp_attr->path_mtu	     = context.mtu_msgmax >> 5;
+	qp_attr->path_mig_state	     =
+		to_ib_mig_state((be32_to_cpu(context.flags) >> 11) & 0x3);
+	qp_attr->qkey		     = be32_to_cpu(context.qkey);
+	qp_attr->rq_psn		     = be32_to_cpu(context.rnr_nextrecvpsn) & 0xffffff;
+	qp_attr->sq_psn		     = be32_to_cpu(context.next_send_psn) & 0xffffff;
+	qp_attr->dest_qp_num	     = be32_to_cpu(context.remote_qpn) & 0xffffff;
+	qp_attr->qp_access_flags     =
+		to_ib_qp_access_flags(be32_to_cpu(context.params2));
+
+	if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC) {
+		to_ib_ah_attr(dev, &qp_attr->ah_attr, &context.pri_path);
+		to_ib_ah_attr(dev, &qp_attr->alt_ah_attr, &context.alt_path);
+		qp_attr->alt_pkey_index = context.alt_path.pkey_index & 0x7f;
+		qp_attr->alt_port_num	= qp_attr->alt_ah_attr.port_num;
+	}
+
+	qp_attr->pkey_index = context.pri_path.pkey_index & 0x7f;
+	if (qp_attr->qp_state == IB_QPS_INIT)
+		qp_attr->port_num = qp->port;
+	else
+		qp_attr->port_num = context.pri_path.sched_queue & 0x40 ? 2 : 1;
+
+	/* qp_attr->en_sqd_async_notify is only applicable in modify qp */
+	qp_attr->sq_draining = mlx4_state == MLX4_QP_STATE_SQ_DRAINING;
+
+	qp_attr->max_rd_atomic = 1 << ((be32_to_cpu(context.params1) >> 21) & 0x7);
+
+	qp_attr->max_dest_rd_atomic =
+		1 << ((be32_to_cpu(context.params2) >> 21) & 0x7);
+	qp_attr->min_rnr_timer	    =
+		(be32_to_cpu(context.rnr_nextrecvpsn) >> 24) & 0x1f;
+	qp_attr->timeout	    = context.pri_path.ackto >> 3;
+	qp_attr->retry_cnt	    = (be32_to_cpu(context.params1) >> 16) & 0x7;
+	qp_attr->rnr_retry	    = (be32_to_cpu(context.params1) >> 13) & 0x7;
+	qp_attr->alt_timeout	    = context.alt_path.ackto >> 3;
+
+done:
+	qp_attr->cur_qp_state	     = qp_attr->qp_state;
+	qp_attr->cap.max_recv_wr     = qp->rq.wqe_cnt;
+	qp_attr->cap.max_recv_sge    = qp->rq.max_gs;
+
+	if (!ibqp->uobject) {
+		qp_attr->cap.max_send_wr  = qp->sq.wqe_cnt;
+		qp_attr->cap.max_send_sge = qp->sq.max_gs;
+	} else {
+		qp_attr->cap.max_send_wr  = 0;
+		qp_attr->cap.max_send_sge = 0;
+	}
+
+	/*
+	 * We don't support inline sends for kernel QPs (yet), and we
+	 * don't know what userspace's value should be.
+	 */
+	qp_attr->cap.max_inline_data = 0;
+
+	qp_init_attr->cap	     = qp_attr->cap;
+
+	qp_init_attr->create_flags = 0;
+	if (qp->flags & MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK)
+		qp_init_attr->create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK;
+
+	if (qp->flags & MLX4_IB_QP_LSO)
+		qp_init_attr->create_flags |= IB_QP_CREATE_IPOIB_UD_LSO;
+
+	if (qp->flags & MLX4_IB_QP_NETIF)
+		qp_init_attr->create_flags |= IB_QP_CREATE_NETIF_QP;
+
+	qp_init_attr->sq_sig_type =
+		qp->sq_signal_bits == cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE) ?
+		IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR;
+
+out:
+	mutex_unlock(&qp->mutex);
+	return err;
+}
+
diff --git a/drivers/infiniband/hw/mlx4/srq.c b/drivers/infiniband/hw/mlx4/srq.c
new file mode 100644
index 0000000..62d9285
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/srq.c
@@ -0,0 +1,369 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mlx4/qp.h>
+#include <linux/mlx4/srq.h>
+#include <linux/slab.h>
+
+#include "mlx4_ib.h"
+#include "user.h"
+
+static void *get_wqe(struct mlx4_ib_srq *srq, int n)
+{
+	return mlx4_buf_offset(&srq->buf, n << srq->msrq.wqe_shift);
+}
+
+static void mlx4_ib_srq_event(struct mlx4_srq *srq, enum mlx4_event type)
+{
+	struct ib_event event;
+	struct ib_srq *ibsrq = &to_mibsrq(srq)->ibsrq;
+
+	if (ibsrq->event_handler) {
+		event.device      = ibsrq->device;
+		event.element.srq = ibsrq;
+		switch (type) {
+		case MLX4_EVENT_TYPE_SRQ_LIMIT:
+			event.event = IB_EVENT_SRQ_LIMIT_REACHED;
+			break;
+		case MLX4_EVENT_TYPE_SRQ_CATAS_ERROR:
+			event.event = IB_EVENT_SRQ_ERR;
+			break;
+		default:
+			pr_warn("Unexpected event type %d "
+			       "on SRQ %06x\n", type, srq->srqn);
+			return;
+		}
+
+		ibsrq->event_handler(&event, ibsrq->srq_context);
+	}
+}
+
+struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd,
+				  struct ib_srq_init_attr *init_attr,
+				  struct ib_udata *udata)
+{
+	struct mlx4_ib_dev *dev = to_mdev(pd->device);
+	struct mlx4_ib_srq *srq;
+	struct mlx4_wqe_srq_next_seg *next;
+	struct mlx4_wqe_data_seg *scatter;
+	u32 cqn;
+	u16 xrcdn;
+	int desc_size;
+	int buf_size;
+	int err;
+	int i;
+
+	/* Sanity check SRQ size before proceeding */
+	if (init_attr->attr.max_wr  >= dev->dev->caps.max_srq_wqes ||
+	    init_attr->attr.max_sge >  dev->dev->caps.max_srq_sge)
+		return ERR_PTR(-EINVAL);
+
+	srq = kmalloc(sizeof *srq, GFP_KERNEL);
+	if (!srq)
+		return ERR_PTR(-ENOMEM);
+
+	mutex_init(&srq->mutex);
+	spin_lock_init(&srq->lock);
+	srq->msrq.max    = roundup_pow_of_two(init_attr->attr.max_wr + 1);
+	srq->msrq.max_gs = init_attr->attr.max_sge;
+
+	desc_size = max(32UL,
+			roundup_pow_of_two(sizeof (struct mlx4_wqe_srq_next_seg) +
+					   srq->msrq.max_gs *
+					   sizeof (struct mlx4_wqe_data_seg)));
+	srq->msrq.wqe_shift = ilog2(desc_size);
+
+	buf_size = srq->msrq.max * desc_size;
+
+	if (pd->uobject) {
+		struct mlx4_ib_create_srq ucmd;
+
+		if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) {
+			err = -EFAULT;
+			goto err_srq;
+		}
+
+		srq->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr,
+					buf_size, 0, 0);
+		if (IS_ERR(srq->umem)) {
+			err = PTR_ERR(srq->umem);
+			goto err_srq;
+		}
+
+		err = mlx4_mtt_init(dev->dev, ib_umem_page_count(srq->umem),
+				    ilog2(srq->umem->page_size), &srq->mtt);
+		if (err)
+			goto err_buf;
+
+		err = mlx4_ib_umem_write_mtt(dev, &srq->mtt, srq->umem);
+		if (err)
+			goto err_mtt;
+
+		err = mlx4_ib_db_map_user(to_mucontext(pd->uobject->context),
+					  ucmd.db_addr, &srq->db);
+		if (err)
+			goto err_mtt;
+	} else {
+		err = mlx4_db_alloc(dev->dev, &srq->db, 0, GFP_KERNEL);
+		if (err)
+			goto err_srq;
+
+		*srq->db.db = 0;
+
+		if (mlx4_buf_alloc(dev->dev, buf_size, PAGE_SIZE * 2, &srq->buf,
+				   GFP_KERNEL)) {
+			err = -ENOMEM;
+			goto err_db;
+		}
+
+		srq->head    = 0;
+		srq->tail    = srq->msrq.max - 1;
+		srq->wqe_ctr = 0;
+
+		for (i = 0; i < srq->msrq.max; ++i) {
+			next = get_wqe(srq, i);
+			next->next_wqe_index =
+				cpu_to_be16((i + 1) & (srq->msrq.max - 1));
+
+			for (scatter = (void *) (next + 1);
+			     (void *) scatter < (void *) next + desc_size;
+			     ++scatter)
+				scatter->lkey = cpu_to_be32(MLX4_INVALID_LKEY);
+		}
+
+		err = mlx4_mtt_init(dev->dev, srq->buf.npages, srq->buf.page_shift,
+				    &srq->mtt);
+		if (err)
+			goto err_buf;
+
+		err = mlx4_buf_write_mtt(dev->dev, &srq->mtt, &srq->buf, GFP_KERNEL);
+		if (err)
+			goto err_mtt;
+
+		srq->wrid = kmalloc(srq->msrq.max * sizeof (u64), GFP_KERNEL);
+		if (!srq->wrid) {
+			err = -ENOMEM;
+			goto err_mtt;
+		}
+	}
+
+	cqn = (init_attr->srq_type == IB_SRQT_XRC) ?
+		to_mcq(init_attr->ext.xrc.cq)->mcq.cqn : 0;
+	xrcdn = (init_attr->srq_type == IB_SRQT_XRC) ?
+		to_mxrcd(init_attr->ext.xrc.xrcd)->xrcdn :
+		(u16) dev->dev->caps.reserved_xrcds;
+	err = mlx4_srq_alloc(dev->dev, to_mpd(pd)->pdn, cqn, xrcdn, &srq->mtt,
+			     srq->db.dma, &srq->msrq);
+	if (err)
+		goto err_wrid;
+
+	srq->msrq.event = mlx4_ib_srq_event;
+	srq->ibsrq.ext.xrc.srq_num = srq->msrq.srqn;
+
+	if (pd->uobject)
+		if (ib_copy_to_udata(udata, &srq->msrq.srqn, sizeof (__u32))) {
+			err = -EFAULT;
+			goto err_wrid;
+		}
+
+	init_attr->attr.max_wr = srq->msrq.max - 1;
+
+	return &srq->ibsrq;
+
+err_wrid:
+	if (pd->uobject)
+		mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context), &srq->db);
+	else
+		kfree(srq->wrid);
+
+err_mtt:
+	mlx4_mtt_cleanup(dev->dev, &srq->mtt);
+
+err_buf:
+	if (pd->uobject)
+		ib_umem_release(srq->umem);
+	else
+		mlx4_buf_free(dev->dev, buf_size, &srq->buf);
+
+err_db:
+	if (!pd->uobject)
+		mlx4_db_free(dev->dev, &srq->db);
+
+err_srq:
+	kfree(srq);
+
+	return ERR_PTR(err);
+}
+
+int mlx4_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
+		       enum ib_srq_attr_mask attr_mask, struct ib_udata *udata)
+{
+	struct mlx4_ib_dev *dev = to_mdev(ibsrq->device);
+	struct mlx4_ib_srq *srq = to_msrq(ibsrq);
+	int ret;
+
+	/* We don't support resizing SRQs (yet?) */
+	if (attr_mask & IB_SRQ_MAX_WR)
+		return -EINVAL;
+
+	if (attr_mask & IB_SRQ_LIMIT) {
+		if (attr->srq_limit >= srq->msrq.max)
+			return -EINVAL;
+
+		mutex_lock(&srq->mutex);
+		ret = mlx4_srq_arm(dev->dev, &srq->msrq, attr->srq_limit);
+		mutex_unlock(&srq->mutex);
+
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+int mlx4_ib_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr)
+{
+	struct mlx4_ib_dev *dev = to_mdev(ibsrq->device);
+	struct mlx4_ib_srq *srq = to_msrq(ibsrq);
+	int ret;
+	int limit_watermark;
+
+	ret = mlx4_srq_query(dev->dev, &srq->msrq, &limit_watermark);
+	if (ret)
+		return ret;
+
+	srq_attr->srq_limit = limit_watermark;
+	srq_attr->max_wr    = srq->msrq.max - 1;
+	srq_attr->max_sge   = srq->msrq.max_gs;
+
+	return 0;
+}
+
+int mlx4_ib_destroy_srq(struct ib_srq *srq)
+{
+	struct mlx4_ib_dev *dev = to_mdev(srq->device);
+	struct mlx4_ib_srq *msrq = to_msrq(srq);
+
+	mlx4_srq_free(dev->dev, &msrq->msrq);
+	mlx4_mtt_cleanup(dev->dev, &msrq->mtt);
+
+	if (srq->uobject) {
+		mlx4_ib_db_unmap_user(to_mucontext(srq->uobject->context), &msrq->db);
+		ib_umem_release(msrq->umem);
+	} else {
+		kfree(msrq->wrid);
+		mlx4_buf_free(dev->dev, msrq->msrq.max << msrq->msrq.wqe_shift,
+			      &msrq->buf);
+		mlx4_db_free(dev->dev, &msrq->db);
+	}
+
+	kfree(msrq);
+
+	return 0;
+}
+
+void mlx4_ib_free_srq_wqe(struct mlx4_ib_srq *srq, int wqe_index)
+{
+	struct mlx4_wqe_srq_next_seg *next;
+
+	/* always called with interrupts disabled. */
+	spin_lock(&srq->lock);
+
+	next = get_wqe(srq, srq->tail);
+	next->next_wqe_index = cpu_to_be16(wqe_index);
+	srq->tail = wqe_index;
+
+	spin_unlock(&srq->lock);
+}
+
+int mlx4_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
+			  struct ib_recv_wr **bad_wr)
+{
+	struct mlx4_ib_srq *srq = to_msrq(ibsrq);
+	struct mlx4_wqe_srq_next_seg *next;
+	struct mlx4_wqe_data_seg *scat;
+	unsigned long flags;
+	int err = 0;
+	int nreq;
+	int i;
+
+	spin_lock_irqsave(&srq->lock, flags);
+
+	for (nreq = 0; wr; ++nreq, wr = wr->next) {
+		if (unlikely(wr->num_sge > srq->msrq.max_gs)) {
+			err = -EINVAL;
+			*bad_wr = wr;
+			break;
+		}
+
+		if (unlikely(srq->head == srq->tail)) {
+			err = -ENOMEM;
+			*bad_wr = wr;
+			break;
+		}
+
+		srq->wrid[srq->head] = wr->wr_id;
+
+		next      = get_wqe(srq, srq->head);
+		srq->head = be16_to_cpu(next->next_wqe_index);
+		scat      = (struct mlx4_wqe_data_seg *) (next + 1);
+
+		for (i = 0; i < wr->num_sge; ++i) {
+			scat[i].byte_count = cpu_to_be32(wr->sg_list[i].length);
+			scat[i].lkey       = cpu_to_be32(wr->sg_list[i].lkey);
+			scat[i].addr       = cpu_to_be64(wr->sg_list[i].addr);
+		}
+
+		if (i < srq->msrq.max_gs) {
+			scat[i].byte_count = 0;
+			scat[i].lkey       = cpu_to_be32(MLX4_INVALID_LKEY);
+			scat[i].addr       = 0;
+		}
+	}
+
+	if (likely(nreq)) {
+		srq->wqe_ctr += nreq;
+
+		/*
+		 * Make sure that descriptors are written before
+		 * doorbell record.
+		 */
+		wmb();
+
+		*srq->db.db = cpu_to_be32(srq->wqe_ctr);
+	}
+
+	spin_unlock_irqrestore(&srq->lock, flags);
+
+	return err;
+}
diff --git a/drivers/infiniband/hw/mlx4/sysfs.c b/drivers/infiniband/hw/mlx4/sysfs.c
new file mode 100644
index 0000000..cb4c66e
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/sysfs.c
@@ -0,0 +1,906 @@
+/*
+ * Copyright (c) 2012 Mellanox Technologies.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*#include "core_priv.h"*/
+#include "mlx4_ib.h"
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+
+#include <rdma/ib_mad.h>
+/*show_admin_alias_guid returns the administratively assigned value of that GUID.
+ * Values returned in buf parameter string:
+ *	0			- requests opensm to assign a value.
+ *	ffffffffffffffff	- delete this entry.
+ *	other			- value assigned by administrator.
+ */
+static ssize_t show_admin_alias_guid(struct device *dev,
+			      struct device_attribute *attr, char *buf)
+{
+	int record_num;/*0-15*/
+	int guid_index_in_rec; /*0 - 7*/
+	struct mlx4_ib_iov_sysfs_attr *mlx4_ib_iov_dentry =
+		container_of(attr, struct mlx4_ib_iov_sysfs_attr, dentry);
+	struct mlx4_ib_iov_port *port = mlx4_ib_iov_dentry->ctx;
+	struct mlx4_ib_dev *mdev = port->dev;
+
+	record_num = mlx4_ib_iov_dentry->entry_num / 8 ;
+	guid_index_in_rec = mlx4_ib_iov_dentry->entry_num % 8 ;
+
+	return sprintf(buf, "%llx\n",
+		       be64_to_cpu(*(__be64 *)&mdev->sriov.alias_guid.
+				   ports_guid[port->num - 1].
+				   all_rec_per_port[record_num].
+				   all_recs[8 * guid_index_in_rec]));
+}
+
+/* store_admin_alias_guid stores the (new) administratively assigned value of that GUID.
+ * Values in buf parameter string:
+ *	0			- requests opensm to assign a value.
+ *	0xffffffffffffffff	- delete this entry.
+ *	other			- guid value assigned by the administrator.
+ */
+static ssize_t store_admin_alias_guid(struct device *dev,
+				      struct device_attribute *attr,
+				      const char *buf, size_t count)
+{
+	int record_num;/*0-15*/
+	int guid_index_in_rec; /*0 - 7*/
+	struct mlx4_ib_iov_sysfs_attr *mlx4_ib_iov_dentry =
+		container_of(attr, struct mlx4_ib_iov_sysfs_attr, dentry);
+	struct mlx4_ib_iov_port *port = mlx4_ib_iov_dentry->ctx;
+	struct mlx4_ib_dev *mdev = port->dev;
+	u64 sysadmin_ag_val;
+
+	record_num = mlx4_ib_iov_dentry->entry_num / 8;
+	guid_index_in_rec = mlx4_ib_iov_dentry->entry_num % 8;
+	if (0 == record_num && 0 == guid_index_in_rec) {
+		pr_err("GUID 0 block 0 is RO\n");
+		return count;
+	}
+	sscanf(buf, "%llx", &sysadmin_ag_val);
+	*(__be64 *)&mdev->sriov.alias_guid.ports_guid[port->num - 1].
+		all_rec_per_port[record_num].
+		all_recs[GUID_REC_SIZE * guid_index_in_rec] =
+			cpu_to_be64(sysadmin_ag_val);
+
+	/* Change the state to be pending for update */
+	mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].status
+		= MLX4_GUID_INFO_STATUS_IDLE ;
+
+	mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].method
+		= MLX4_GUID_INFO_RECORD_SET;
+
+	switch (sysadmin_ag_val) {
+	case MLX4_GUID_FOR_DELETE_VAL:
+		mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].method
+			= MLX4_GUID_INFO_RECORD_DELETE;
+		mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].ownership
+			= MLX4_GUID_SYSADMIN_ASSIGN;
+		break;
+	/* The sysadmin requests the SM to re-assign */
+	case MLX4_NOT_SET_GUID:
+		mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].ownership
+			= MLX4_GUID_DRIVER_ASSIGN;
+		break;
+	/* The sysadmin requests a specific value.*/
+	default:
+		mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].ownership
+			= MLX4_GUID_SYSADMIN_ASSIGN;
+		break;
+	}
+
+	/* set the record index */
+	mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].guid_indexes
+		= mlx4_ib_get_aguid_comp_mask_from_ix(guid_index_in_rec);
+
+	mlx4_ib_init_alias_guid_work(mdev, port->num - 1);
+
+	return count;
+}
+
+static ssize_t show_port_gid(struct device *dev,
+			     struct device_attribute *attr,
+			     char *buf)
+{
+	struct mlx4_ib_iov_sysfs_attr *mlx4_ib_iov_dentry =
+		container_of(attr, struct mlx4_ib_iov_sysfs_attr, dentry);
+	struct mlx4_ib_iov_port *port = mlx4_ib_iov_dentry->ctx;
+	struct mlx4_ib_dev *mdev = port->dev;
+	union ib_gid gid;
+	ssize_t ret;
+
+	ret = __mlx4_ib_query_gid(&mdev->ib_dev, port->num,
+				  mlx4_ib_iov_dentry->entry_num, &gid, 1);
+	if (ret)
+		return ret;
+	ret = sprintf(buf, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n",
+		      be16_to_cpu(((__be16 *) gid.raw)[0]),
+		      be16_to_cpu(((__be16 *) gid.raw)[1]),
+		      be16_to_cpu(((__be16 *) gid.raw)[2]),
+		      be16_to_cpu(((__be16 *) gid.raw)[3]),
+		      be16_to_cpu(((__be16 *) gid.raw)[4]),
+		      be16_to_cpu(((__be16 *) gid.raw)[5]),
+		      be16_to_cpu(((__be16 *) gid.raw)[6]),
+		      be16_to_cpu(((__be16 *) gid.raw)[7]));
+	return ret;
+}
+
+static ssize_t show_phys_port_pkey(struct device *dev,
+				   struct device_attribute *attr,
+				   char *buf)
+{
+	struct mlx4_ib_iov_sysfs_attr *mlx4_ib_iov_dentry =
+		container_of(attr, struct mlx4_ib_iov_sysfs_attr, dentry);
+	struct mlx4_ib_iov_port *port = mlx4_ib_iov_dentry->ctx;
+	struct mlx4_ib_dev *mdev = port->dev;
+	u16 pkey;
+	ssize_t ret;
+
+	ret = __mlx4_ib_query_pkey(&mdev->ib_dev, port->num,
+				   mlx4_ib_iov_dentry->entry_num, &pkey, 1);
+	if (ret)
+		return ret;
+
+	return sprintf(buf, "0x%04x\n", pkey);
+}
+
+#define DENTRY_REMOVE(_dentry)						\
+do {									\
+	sysfs_remove_file((_dentry)->kobj, &(_dentry)->dentry.attr);	\
+} while (0);
+
+static int create_sysfs_entry(void *_ctx, struct mlx4_ib_iov_sysfs_attr *_dentry,
+			      char *_name, struct kobject *_kobj,
+			      ssize_t (*show)(struct device *dev,
+					      struct device_attribute *attr,
+					      char *buf),
+			      ssize_t (*store)(struct device *dev,
+					       struct device_attribute *attr,
+					       const char *buf, size_t count)
+			      )
+{
+	int ret = 0;
+	struct mlx4_ib_iov_sysfs_attr *vdentry = _dentry;
+
+	vdentry->ctx = _ctx;
+	vdentry->dentry.show = show;
+	vdentry->dentry.store = store;
+	sysfs_attr_init(&vdentry->dentry.attr);
+	vdentry->dentry.attr.name = vdentry->name;
+	vdentry->dentry.attr.mode = 0;
+	vdentry->kobj = _kobj;
+	snprintf(vdentry->name, 15, "%s", _name);
+
+	if (vdentry->dentry.store)
+		vdentry->dentry.attr.mode |= S_IWUSR;
+
+	if (vdentry->dentry.show)
+		vdentry->dentry.attr.mode |= S_IRUGO;
+
+	ret = sysfs_create_file(vdentry->kobj, &vdentry->dentry.attr);
+	if (ret) {
+		pr_err("failed to create %s\n", vdentry->dentry.attr.name);
+		vdentry->ctx = NULL;
+		return ret;
+	}
+
+	return ret;
+}
+
+int add_sysfs_port_mcg_attr(struct mlx4_ib_dev *device, int port_num,
+		struct attribute *attr)
+{
+	struct mlx4_ib_iov_port *port = &device->iov_ports[port_num - 1];
+	int ret;
+
+	ret = sysfs_create_file(port->mcgs_parent, attr);
+	if (ret)
+		pr_err("failed to create %s\n", attr->name);
+
+	return ret;
+}
+
+void del_sysfs_port_mcg_attr(struct mlx4_ib_dev *device, int port_num,
+		struct attribute *attr)
+{
+	struct mlx4_ib_iov_port *port = &device->iov_ports[port_num - 1];
+
+	sysfs_remove_file(port->mcgs_parent, attr);
+}
+
+static int add_port_entries(struct mlx4_ib_dev *device, int port_num)
+{
+	int i;
+	char buff[10];
+	struct mlx4_ib_iov_port *port = NULL;
+	int ret = 0 ;
+	struct ib_port_attr attr;
+
+	/* get the physical gid and pkey table sizes.*/
+	ret = __mlx4_ib_query_port(&device->ib_dev, port_num, &attr, 1);
+	if (ret)
+		goto err;
+
+	port = &device->iov_ports[port_num - 1];
+	port->dev = device;
+	port->num = port_num;
+	/* Directory structure:
+	 * iov -
+	 *   port num -
+	 *	admin_guids
+	 *	gids (operational)
+	 *	mcg_table
+	 */
+	port->dentr_ar = kzalloc(sizeof (struct mlx4_ib_iov_sysfs_attr_ar),
+				 GFP_KERNEL);
+	if (!port->dentr_ar) {
+		ret = -ENOMEM;
+		goto err;
+	}
+	sprintf(buff, "%d", port_num);
+	port->cur_port = kobject_create_and_add(buff,
+				 kobject_get(device->ports_parent));
+	if (!port->cur_port) {
+		ret = -ENOMEM;
+		goto kobj_create_err;
+	}
+	/* admin GUIDs */
+	port->admin_alias_parent = kobject_create_and_add("admin_guids",
+						  kobject_get(port->cur_port));
+	if (!port->admin_alias_parent) {
+		ret = -ENOMEM;
+		goto err_admin_guids;
+	}
+	for (i = 0 ; i < attr.gid_tbl_len; i++) {
+		sprintf(buff, "%d", i);
+		port->dentr_ar->dentries[i].entry_num = i;
+		ret = create_sysfs_entry(port, &port->dentr_ar->dentries[i],
+					  buff, port->admin_alias_parent,
+					  show_admin_alias_guid, store_admin_alias_guid);
+		if (ret)
+			goto err_admin_alias_parent;
+	}
+
+	/* gids subdirectory (operational gids) */
+	port->gids_parent = kobject_create_and_add("gids",
+						  kobject_get(port->cur_port));
+	if (!port->gids_parent) {
+		ret = -ENOMEM;
+		goto err_gids;
+	}
+
+	for (i = 0 ; i < attr.gid_tbl_len; i++) {
+		sprintf(buff, "%d", i);
+		port->dentr_ar->dentries[attr.gid_tbl_len + i].entry_num = i;
+		ret = create_sysfs_entry(port,
+					 &port->dentr_ar->dentries[attr.gid_tbl_len + i],
+					 buff,
+					 port->gids_parent, show_port_gid, NULL);
+		if (ret)
+			goto err_gids_parent;
+	}
+
+	/* physical port pkey table */
+	port->pkeys_parent =
+		kobject_create_and_add("pkeys", kobject_get(port->cur_port));
+	if (!port->pkeys_parent) {
+		ret = -ENOMEM;
+		goto err_pkeys;
+	}
+
+	for (i = 0 ; i < attr.pkey_tbl_len; i++) {
+		sprintf(buff, "%d", i);
+		port->dentr_ar->dentries[2 * attr.gid_tbl_len + i].entry_num = i;
+		ret = create_sysfs_entry(port,
+					 &port->dentr_ar->dentries[2 * attr.gid_tbl_len + i],
+					 buff, port->pkeys_parent,
+					 show_phys_port_pkey, NULL);
+		if (ret)
+			goto err_pkeys_parent;
+	}
+
+	/* MCGs table */
+	port->mcgs_parent =
+		kobject_create_and_add("mcgs", kobject_get(port->cur_port));
+	if (!port->mcgs_parent) {
+		ret = -ENOMEM;
+		goto err_mcgs;
+	}
+	return 0;
+
+err_mcgs:
+	kobject_put(port->cur_port);
+
+err_pkeys_parent:
+	kobject_put(port->pkeys_parent);
+
+err_pkeys:
+	kobject_put(port->cur_port);
+
+err_gids_parent:
+	kobject_put(port->gids_parent);
+
+err_gids:
+	kobject_put(port->cur_port);
+
+err_admin_alias_parent:
+	kobject_put(port->admin_alias_parent);
+
+err_admin_guids:
+	kobject_put(port->cur_port);
+	kobject_put(port->cur_port); /* once more for create_and_add buff */
+
+kobj_create_err:
+	kobject_put(device->ports_parent);
+	kfree(port->dentr_ar);
+
+err:
+	pr_err("add_port_entries FAILED: for port:%d, error: %d\n",
+	       port_num, ret);
+	return ret;
+}
+
+static void get_name(struct mlx4_ib_dev *dev, char *name, int i, int max)
+{
+	char base_name[9];
+
+	/* pci_name format is: bus:dev:func -> xxxx:yy:zz.n */
+	strlcpy(name, pci_name(dev->dev->pdev), max);
+	strncpy(base_name, name, 8); /*till xxxx:yy:*/
+	base_name[8] = '\0';
+	/* with no ARI only 3 last bits are used so when the fn is higher than 8
+	 * need to add it to the dev num, so count in the last number will be
+	 * modulo 8 */
+	sprintf(name, "%s%.2d.%d", base_name, (i/8), (i%8));
+}
+
+struct mlx4_port {
+	struct kobject         kobj;
+	struct mlx4_ib_dev    *dev;
+	struct attribute_group pkey_group;
+	struct attribute_group gid_group;
+	struct device_attribute	enable_smi_admin;
+	struct device_attribute	smi_enabled;
+	int		       slave;
+	u8                     port_num;
+};
+
+
+static void mlx4_port_release(struct kobject *kobj)
+{
+	struct mlx4_port *p = container_of(kobj, struct mlx4_port, kobj);
+	struct attribute *a;
+	int i;
+
+	for (i = 0; (a = p->pkey_group.attrs[i]); ++i)
+		kfree(a);
+	kfree(p->pkey_group.attrs);
+	for (i = 0; (a = p->gid_group.attrs[i]); ++i)
+		kfree(a);
+	kfree(p->gid_group.attrs);
+	kfree(p);
+}
+
+struct port_attribute {
+	struct attribute attr;
+	ssize_t (*show)(struct mlx4_port *, struct port_attribute *, char *buf);
+	ssize_t (*store)(struct mlx4_port *, struct port_attribute *,
+			 const char *buf, size_t count);
+};
+
+static ssize_t port_attr_show(struct kobject *kobj,
+			      struct attribute *attr, char *buf)
+{
+	struct port_attribute *port_attr =
+		container_of(attr, struct port_attribute, attr);
+	struct mlx4_port *p = container_of(kobj, struct mlx4_port, kobj);
+
+	if (!port_attr->show)
+		return -EIO;
+	return port_attr->show(p, port_attr, buf);
+}
+
+static ssize_t port_attr_store(struct kobject *kobj,
+			       struct attribute *attr,
+			       const char *buf, size_t size)
+{
+	struct port_attribute *port_attr =
+		container_of(attr, struct port_attribute, attr);
+	struct mlx4_port *p = container_of(kobj, struct mlx4_port, kobj);
+
+	if (!port_attr->store)
+		return -EIO;
+	return port_attr->store(p, port_attr, buf, size);
+}
+
+static const struct sysfs_ops port_sysfs_ops = {
+	.show = port_attr_show,
+	.store = port_attr_store,
+};
+
+static struct kobj_type port_type = {
+	.release    = mlx4_port_release,
+	.sysfs_ops  = &port_sysfs_ops,
+};
+
+struct port_table_attribute {
+	struct port_attribute	attr;
+	char			name[8];
+	int			index;
+};
+
+static ssize_t show_port_pkey(struct mlx4_port *p, struct port_attribute *attr,
+			      char *buf)
+{
+	struct port_table_attribute *tab_attr =
+		container_of(attr, struct port_table_attribute, attr);
+	ssize_t ret = -ENODEV;
+
+	if (p->dev->pkeys.virt2phys_pkey[p->slave][p->port_num - 1][tab_attr->index] >=
+	    (p->dev->dev->caps.pkey_table_len[p->port_num]))
+		ret = sprintf(buf, "none\n");
+	else
+		ret = sprintf(buf, "%d\n",
+			      p->dev->pkeys.virt2phys_pkey[p->slave]
+			      [p->port_num - 1][tab_attr->index]);
+	return ret;
+}
+
+static ssize_t store_port_pkey(struct mlx4_port *p, struct port_attribute *attr,
+			       const char *buf, size_t count)
+{
+	struct port_table_attribute *tab_attr =
+		container_of(attr, struct port_table_attribute, attr);
+	int idx;
+	int err;
+
+	/* do not allow remapping Dom0 virtual pkey table */
+	if (p->slave == mlx4_master_func_num(p->dev->dev))
+		return -EINVAL;
+
+	if (!strncasecmp(buf, "no", 2))
+		idx = p->dev->dev->phys_caps.pkey_phys_table_len[p->port_num] - 1;
+	else if (sscanf(buf, "%i", &idx) != 1 ||
+		 idx >= p->dev->dev->caps.pkey_table_len[p->port_num] ||
+		 idx < 0)
+		return -EINVAL;
+
+	p->dev->pkeys.virt2phys_pkey[p->slave][p->port_num - 1]
+				    [tab_attr->index] = idx;
+	mlx4_sync_pkey_table(p->dev->dev, p->slave, p->port_num,
+			     tab_attr->index, idx);
+	err = mlx4_gen_pkey_eqe(p->dev->dev, p->slave, p->port_num);
+	if (err) {
+		pr_err("mlx4_gen_pkey_eqe failed for slave %d,"
+		       " port %d, index %d\n", p->slave, p->port_num, idx);
+		return err;
+	}
+	return count;
+}
+
+static ssize_t show_port_gid_idx(struct mlx4_port *p,
+				 struct port_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%d\n", p->slave);
+}
+
+static struct attribute **
+alloc_group_attrs(ssize_t (*show)(struct mlx4_port *,
+				  struct port_attribute *, char *buf),
+		  ssize_t (*store)(struct mlx4_port *, struct port_attribute *,
+				   const char *buf, size_t count),
+		  int len)
+{
+	struct attribute **tab_attr;
+	struct port_table_attribute *element;
+	int i;
+
+	tab_attr = kcalloc(1 + len, sizeof (struct attribute *), GFP_KERNEL);
+	if (!tab_attr)
+		return NULL;
+
+	for (i = 0; i < len; i++) {
+		element = kzalloc(sizeof (struct port_table_attribute),
+				  GFP_KERNEL);
+		if (!element)
+			goto err;
+		if (snprintf(element->name, sizeof (element->name),
+			     "%d", i) >= sizeof (element->name)) {
+			kfree(element);
+			goto err;
+		}
+		sysfs_attr_init(&element->attr.attr);
+		element->attr.attr.name  = element->name;
+		if (store) {
+			element->attr.attr.mode  = S_IWUSR | S_IRUGO;
+			element->attr.store	 = store;
+		} else
+			element->attr.attr.mode  = S_IRUGO;
+
+		element->attr.show       = show;
+		element->index		 = i;
+		tab_attr[i] = &element->attr.attr;
+	}
+	return tab_attr;
+
+err:
+	while (--i >= 0)
+		kfree(tab_attr[i]);
+	kfree(tab_attr);
+	return NULL;
+}
+
+static ssize_t sysfs_show_smi_enabled(struct device *dev,
+				      struct device_attribute *attr, char *buf)
+{
+	struct mlx4_port *p =
+		container_of(attr, struct mlx4_port, smi_enabled);
+	ssize_t len = 0;
+
+	if (mlx4_vf_smi_enabled(p->dev->dev, p->slave, p->port_num))
+		len = sprintf(buf, "%d\n", 1);
+	else
+		len = sprintf(buf, "%d\n", 0);
+
+	return len;
+}
+
+static ssize_t sysfs_show_enable_smi_admin(struct device *dev,
+					   struct device_attribute *attr,
+					   char *buf)
+{
+	struct mlx4_port *p =
+		container_of(attr, struct mlx4_port, enable_smi_admin);
+	ssize_t len = 0;
+
+	if (mlx4_vf_get_enable_smi_admin(p->dev->dev, p->slave, p->port_num))
+		len = sprintf(buf, "%d\n", 1);
+	else
+		len = sprintf(buf, "%d\n", 0);
+
+	return len;
+}
+
+static ssize_t sysfs_store_enable_smi_admin(struct device *dev,
+					    struct device_attribute *attr,
+					    const char *buf, size_t count)
+{
+	struct mlx4_port *p =
+		container_of(attr, struct mlx4_port, enable_smi_admin);
+	int enable;
+
+	if (sscanf(buf, "%i", &enable) != 1 ||
+	    enable < 0 || enable > 1)
+		return -EINVAL;
+
+	if (mlx4_vf_set_enable_smi_admin(p->dev->dev, p->slave, p->port_num, enable))
+		return -EINVAL;
+	return count;
+}
+
+static int add_vf_smi_entries(struct mlx4_port *p)
+{
+	int is_eth = rdma_port_get_link_layer(&p->dev->ib_dev, p->port_num) ==
+			IB_LINK_LAYER_ETHERNET;
+	int ret;
+
+	/* do not display entries if eth transport, or if master */
+	if (is_eth || p->slave == mlx4_master_func_num(p->dev->dev))
+		return 0;
+
+	sysfs_attr_init(&p->smi_enabled.attr);
+	p->smi_enabled.show = sysfs_show_smi_enabled;
+	p->smi_enabled.store = NULL;
+	p->smi_enabled.attr.name = "smi_enabled";
+	p->smi_enabled.attr.mode = 0444;
+	ret = sysfs_create_file(&p->kobj, &p->smi_enabled.attr);
+	if (ret) {
+		pr_err("failed to create smi_enabled\n");
+		return ret;
+	}
+
+	sysfs_attr_init(&p->enable_smi_admin.attr);
+	p->enable_smi_admin.show = sysfs_show_enable_smi_admin;
+	p->enable_smi_admin.store = sysfs_store_enable_smi_admin;
+	p->enable_smi_admin.attr.name = "enable_smi_admin";
+	p->enable_smi_admin.attr.mode = 0644;
+	ret = sysfs_create_file(&p->kobj, &p->enable_smi_admin.attr);
+	if (ret) {
+		pr_err("failed to create enable_smi_admin\n");
+		sysfs_remove_file(&p->kobj, &p->smi_enabled.attr);
+		return ret;
+	}
+	return 0;
+}
+
+static void remove_vf_smi_entries(struct mlx4_port *p)
+{
+	int is_eth = rdma_port_get_link_layer(&p->dev->ib_dev, p->port_num) ==
+			IB_LINK_LAYER_ETHERNET;
+
+	if (is_eth || p->slave == mlx4_master_func_num(p->dev->dev))
+		return;
+
+	sysfs_remove_file(&p->kobj, &p->smi_enabled.attr);
+	sysfs_remove_file(&p->kobj, &p->enable_smi_admin.attr);
+}
+
+static int add_port(struct mlx4_ib_dev *dev, int port_num, int slave)
+{
+	struct mlx4_port *p;
+	int i;
+	int ret;
+
+	p = kzalloc(sizeof *p, GFP_KERNEL);
+	if (!p)
+		return -ENOMEM;
+
+	p->dev = dev;
+	p->port_num = port_num;
+	p->slave = slave;
+
+	ret = kobject_init_and_add(&p->kobj, &port_type,
+				   kobject_get(dev->dev_ports_parent[slave]),
+				   "%d", port_num);
+	if (ret)
+		goto err_alloc;
+
+	p->pkey_group.name  = "pkey_idx";
+	p->pkey_group.attrs =
+		alloc_group_attrs(show_port_pkey, store_port_pkey,
+				  dev->dev->caps.pkey_table_len[port_num]);
+	if (!p->pkey_group.attrs) {
+		ret = -ENOMEM;
+		goto err_alloc;
+	}
+
+	ret = sysfs_create_group(&p->kobj, &p->pkey_group);
+	if (ret)
+		goto err_free_pkey;
+
+	p->gid_group.name  = "gid_idx";
+	p->gid_group.attrs = alloc_group_attrs(show_port_gid_idx, NULL, 1);
+	if (!p->gid_group.attrs) {
+		ret = -ENOMEM;
+		goto err_free_pkey;
+	}
+
+	ret = sysfs_create_group(&p->kobj, &p->gid_group);
+	if (ret)
+		goto err_free_gid;
+
+	ret = add_vf_smi_entries(p);
+	if (ret)
+		goto err_free_gid;
+
+	list_add_tail(&p->kobj.entry, &dev->pkeys.pkey_port_list[slave]);
+	return 0;
+
+err_free_gid:
+	kfree(p->gid_group.attrs[0]);
+	kfree(p->gid_group.attrs);
+
+err_free_pkey:
+	for (i = 0; i < dev->dev->caps.pkey_table_len[port_num]; ++i)
+		kfree(p->pkey_group.attrs[i]);
+	kfree(p->pkey_group.attrs);
+
+err_alloc:
+	kobject_put(dev->dev_ports_parent[slave]);
+	kfree(p);
+	return ret;
+}
+
+static int register_one_pkey_tree(struct mlx4_ib_dev *dev, int slave)
+{
+	char name[32];
+	int err;
+	int port;
+	struct kobject *p, *t;
+	struct mlx4_port *mport;
+	struct mlx4_active_ports actv_ports;
+
+	get_name(dev, name, slave, sizeof name);
+
+	dev->pkeys.device_parent[slave] =
+		kobject_create_and_add(name, kobject_get(dev->iov_parent));
+
+	if (!dev->pkeys.device_parent[slave]) {
+		err = -ENOMEM;
+		goto fail_dev;
+	}
+
+	INIT_LIST_HEAD(&dev->pkeys.pkey_port_list[slave]);
+
+	dev->dev_ports_parent[slave] =
+		kobject_create_and_add("ports",
+				       kobject_get(dev->pkeys.device_parent[slave]));
+
+	if (!dev->dev_ports_parent[slave]) {
+		err = -ENOMEM;
+		goto err_ports;
+	}
+
+	actv_ports = mlx4_get_active_ports(dev->dev, slave);
+
+	for (port = 1; port <= dev->dev->caps.num_ports; ++port) {
+		if (!test_bit(port - 1, actv_ports.ports))
+			continue;
+		err = add_port(dev, port, slave);
+		if (err)
+			goto err_add;
+	}
+	return 0;
+
+err_add:
+	list_for_each_entry_safe(p, t,
+				 &dev->pkeys.pkey_port_list[slave],
+				 entry) {
+		list_del(&p->entry);
+		mport = container_of(p, struct mlx4_port, kobj);
+		sysfs_remove_group(p, &mport->pkey_group);
+		sysfs_remove_group(p, &mport->gid_group);
+		remove_vf_smi_entries(mport);
+		kobject_put(p);
+	}
+	kobject_put(dev->dev_ports_parent[slave]);
+
+err_ports:
+	kobject_put(dev->pkeys.device_parent[slave]);
+	/* extra put for the device_parent create_and_add */
+	kobject_put(dev->pkeys.device_parent[slave]);
+
+fail_dev:
+	kobject_put(dev->iov_parent);
+	return err;
+}
+
+static int register_pkey_tree(struct mlx4_ib_dev *device)
+{
+	int i;
+
+	if (!mlx4_is_master(device->dev))
+		return 0;
+
+	for (i = 0; i <= device->dev->num_vfs; ++i)
+		register_one_pkey_tree(device, i);
+
+	return 0;
+}
+
+static void unregister_pkey_tree(struct mlx4_ib_dev *device)
+{
+	int slave;
+	struct kobject *p, *t;
+	struct mlx4_port *port;
+
+	if (!mlx4_is_master(device->dev))
+		return;
+
+	for (slave = device->dev->num_vfs; slave >= 0; --slave) {
+		list_for_each_entry_safe(p, t,
+					 &device->pkeys.pkey_port_list[slave],
+					 entry) {
+			list_del(&p->entry);
+			port = container_of(p, struct mlx4_port, kobj);
+			sysfs_remove_group(p, &port->pkey_group);
+			sysfs_remove_group(p, &port->gid_group);
+			remove_vf_smi_entries(port);
+			kobject_put(p);
+			kobject_put(device->dev_ports_parent[slave]);
+		}
+		kobject_put(device->dev_ports_parent[slave]);
+		kobject_put(device->pkeys.device_parent[slave]);
+		kobject_put(device->pkeys.device_parent[slave]);
+		kobject_put(device->iov_parent);
+	}
+}
+
+int mlx4_ib_device_register_sysfs(struct mlx4_ib_dev *dev)
+{
+	int i;
+	int ret = 0;
+
+	if (!mlx4_is_master(dev->dev))
+		return 0;
+
+	dev->iov_parent =
+		kobject_create_and_add("iov",
+				       kobject_get(dev->ib_dev.ports_parent->parent));
+	if (!dev->iov_parent) {
+		ret = -ENOMEM;
+		goto err;
+	}
+	dev->ports_parent =
+		kobject_create_and_add("ports",
+				       kobject_get(dev->iov_parent));
+	if (!dev->ports_parent) {
+		ret = -ENOMEM;
+		goto err_ports;
+	}
+
+	for (i = 1; i <= dev->ib_dev.phys_port_cnt; ++i) {
+		ret = add_port_entries(dev, i);
+		if (ret)
+			goto err_add_entries;
+	}
+
+	ret = register_pkey_tree(dev);
+	if (ret)
+		goto err_add_entries;
+	return 0;
+
+err_add_entries:
+	kobject_put(dev->ports_parent);
+
+err_ports:
+	kobject_put(dev->iov_parent);
+err:
+	kobject_put(dev->ib_dev.ports_parent->parent);
+	pr_err("mlx4_ib_device_register_sysfs error (%d)\n", ret);
+	return ret;
+}
+
+static void unregister_alias_guid_tree(struct mlx4_ib_dev *device)
+{
+	struct mlx4_ib_iov_port *p;
+	int i;
+
+	if (!mlx4_is_master(device->dev))
+		return;
+
+	for (i = 0; i < device->dev->caps.num_ports; i++) {
+		p = &device->iov_ports[i];
+		kobject_put(p->admin_alias_parent);
+		kobject_put(p->gids_parent);
+		kobject_put(p->pkeys_parent);
+		kobject_put(p->mcgs_parent);
+		kobject_put(p->cur_port);
+		kobject_put(p->cur_port);
+		kobject_put(p->cur_port);
+		kobject_put(p->cur_port);
+		kobject_put(p->cur_port);
+		kobject_put(p->dev->ports_parent);
+		kfree(p->dentr_ar);
+	}
+}
+
+void mlx4_ib_device_unregister_sysfs(struct mlx4_ib_dev *device)
+{
+	unregister_alias_guid_tree(device);
+	unregister_pkey_tree(device);
+	kobject_put(device->ports_parent);
+	kobject_put(device->iov_parent);
+	kobject_put(device->iov_parent);
+	kobject_put(device->ib_dev.ports_parent->parent);
+}
diff --git a/drivers/infiniband/hw/mlx4/user.h b/drivers/infiniband/hw/mlx4/user.h
new file mode 100644
index 0000000..07e6769
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/user.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX4_IB_USER_H
+#define MLX4_IB_USER_H
+
+#include <linux/types.h>
+
+/*
+ * Increment this value if any changes that break userspace ABI
+ * compatibility are made.
+ */
+
+#define MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION	3
+#define MLX4_IB_UVERBS_ABI_VERSION		4
+
+/*
+ * Make sure that all structs defined in this file remain laid out so
+ * that they pack the same way on 32-bit and 64-bit architectures (to
+ * avoid incompatibility between 32-bit userspace and 64-bit kernels).
+ * In particular do not use pointer types -- pass pointers in __u64
+ * instead.
+ */
+
+struct mlx4_ib_alloc_ucontext_resp_v3 {
+	__u32	qp_tab_size;
+	__u16	bf_reg_size;
+	__u16	bf_regs_per_page;
+};
+
+struct mlx4_ib_alloc_ucontext_resp {
+	__u32	dev_caps;
+	__u32	qp_tab_size;
+	__u16	bf_reg_size;
+	__u16	bf_regs_per_page;
+	__u32	cqe_size;
+};
+
+struct mlx4_ib_alloc_pd_resp {
+	__u32	pdn;
+	__u32	reserved;
+};
+
+struct mlx4_ib_create_cq {
+	__u64	buf_addr;
+	__u64	db_addr;
+};
+
+struct mlx4_ib_create_cq_resp {
+	__u32	cqn;
+	__u32	reserved;
+};
+
+struct mlx4_ib_resize_cq {
+	__u64	buf_addr;
+};
+
+struct mlx4_ib_create_srq {
+	__u64	buf_addr;
+	__u64	db_addr;
+};
+
+struct mlx4_ib_create_srq_resp {
+	__u32	srqn;
+	__u32	reserved;
+};
+
+struct mlx4_ib_create_qp {
+	__u64	buf_addr;
+	__u64	db_addr;
+	__u8	log_sq_bb_count;
+	__u8	log_sq_stride;
+	__u8	sq_no_prefetch;
+	__u8	reserved[5];
+};
+
+#endif /* MLX4_IB_USER_H */
diff --git a/drivers/infiniband/hw/mlx5/Kconfig b/drivers/infiniband/hw/mlx5/Kconfig
new file mode 100644
index 0000000..10df386
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/Kconfig
@@ -0,0 +1,10 @@
+config MLX5_INFINIBAND
+	tristate "Mellanox Connect-IB HCA support"
+	depends on NETDEVICES && ETHERNET && PCI
+	select NET_VENDOR_MELLANOX
+	select MLX5_CORE
+	---help---
+	  This driver provides low-level InfiniBand support for
+	  Mellanox Connect-IB PCI Express host channel adapters (HCAs).
+	  This is required to use InfiniBand protocols such as
+	  IP-over-IB or SRP with these devices.
diff --git a/drivers/infiniband/hw/mlx5/Makefile b/drivers/infiniband/hw/mlx5/Makefile
new file mode 100644
index 0000000..4ea0135
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_MLX5_INFINIBAND)	+= mlx5_ib.o
+
+mlx5_ib-y :=	main.o cq.o doorbell.o qp.o mem.o srq.o mr.o ah.o mad.o
diff --git a/drivers/infiniband/hw/mlx5/ah.c b/drivers/infiniband/hw/mlx5/ah.c
new file mode 100644
index 0000000..39ab0ca
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/ah.c
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "mlx5_ib.h"
+
+struct ib_ah *create_ib_ah(struct ib_ah_attr *ah_attr,
+			   struct mlx5_ib_ah *ah)
+{
+	if (ah_attr->ah_flags & IB_AH_GRH) {
+		memcpy(ah->av.rgid, &ah_attr->grh.dgid, 16);
+		ah->av.grh_gid_fl = cpu_to_be32(ah_attr->grh.flow_label |
+						(1 << 30) |
+						ah_attr->grh.sgid_index << 20);
+		ah->av.hop_limit = ah_attr->grh.hop_limit;
+		ah->av.tclass = ah_attr->grh.traffic_class;
+	}
+
+	ah->av.rlid = cpu_to_be16(ah_attr->dlid);
+	ah->av.fl_mlid = ah_attr->src_path_bits & 0x7f;
+	ah->av.stat_rate_sl = (ah_attr->static_rate << 4) | (ah_attr->sl & 0xf);
+
+	return &ah->ibah;
+}
+
+struct ib_ah *mlx5_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr)
+{
+	struct mlx5_ib_ah *ah;
+
+	ah = kzalloc(sizeof(*ah), GFP_ATOMIC);
+	if (!ah)
+		return ERR_PTR(-ENOMEM);
+
+	return create_ib_ah(ah_attr, ah); /* never fails */
+}
+
+int mlx5_ib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr)
+{
+	struct mlx5_ib_ah *ah = to_mah(ibah);
+	u32 tmp;
+
+	memset(ah_attr, 0, sizeof(*ah_attr));
+
+	tmp = be32_to_cpu(ah->av.grh_gid_fl);
+	if (tmp & (1 << 30)) {
+		ah_attr->ah_flags = IB_AH_GRH;
+		ah_attr->grh.sgid_index = (tmp >> 20) & 0xff;
+		ah_attr->grh.flow_label = tmp & 0xfffff;
+		memcpy(&ah_attr->grh.dgid, ah->av.rgid, 16);
+		ah_attr->grh.hop_limit = ah->av.hop_limit;
+		ah_attr->grh.traffic_class = ah->av.tclass;
+	}
+	ah_attr->dlid = be16_to_cpu(ah->av.rlid);
+	ah_attr->static_rate = ah->av.stat_rate_sl >> 4;
+	ah_attr->sl = ah->av.stat_rate_sl & 0xf;
+
+	return 0;
+}
+
+int mlx5_ib_destroy_ah(struct ib_ah *ah)
+{
+	kfree(to_mah(ah));
+	return 0;
+}
diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c
new file mode 100644
index 0000000..10cfce5
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/cq.c
@@ -0,0 +1,1187 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/kref.h>
+#include <rdma/ib_umem.h>
+#include <rdma/ib_user_verbs.h>
+#include "mlx5_ib.h"
+#include "user.h"
+
+static void mlx5_ib_cq_comp(struct mlx5_core_cq *cq)
+{
+	struct ib_cq *ibcq = &to_mibcq(cq)->ibcq;
+
+	ibcq->comp_handler(ibcq, ibcq->cq_context);
+}
+
+static void mlx5_ib_cq_event(struct mlx5_core_cq *mcq, enum mlx5_event type)
+{
+	struct mlx5_ib_cq *cq = container_of(mcq, struct mlx5_ib_cq, mcq);
+	struct mlx5_ib_dev *dev = to_mdev(cq->ibcq.device);
+	struct ib_cq *ibcq = &cq->ibcq;
+	struct ib_event event;
+
+	if (type != MLX5_EVENT_TYPE_CQ_ERROR) {
+		mlx5_ib_warn(dev, "Unexpected event type %d on CQ %06x\n",
+			     type, mcq->cqn);
+		return;
+	}
+
+	if (ibcq->event_handler) {
+		event.device     = &dev->ib_dev;
+		event.event      = IB_EVENT_CQ_ERR;
+		event.element.cq = ibcq;
+		ibcq->event_handler(&event, ibcq->cq_context);
+	}
+}
+
+static void *get_cqe_from_buf(struct mlx5_ib_cq_buf *buf, int n, int size)
+{
+	return mlx5_buf_offset(&buf->buf, n * size);
+}
+
+static void *get_cqe(struct mlx5_ib_cq *cq, int n)
+{
+	return get_cqe_from_buf(&cq->buf, n, cq->mcq.cqe_sz);
+}
+
+static u8 sw_ownership_bit(int n, int nent)
+{
+	return (n & nent) ? 1 : 0;
+}
+
+static void *get_sw_cqe(struct mlx5_ib_cq *cq, int n)
+{
+	void *cqe = get_cqe(cq, n & cq->ibcq.cqe);
+	struct mlx5_cqe64 *cqe64;
+
+	cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64;
+
+	if (likely((cqe64->op_own) >> 4 != MLX5_CQE_INVALID) &&
+	    !((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ !!(n & (cq->ibcq.cqe + 1)))) {
+		return cqe;
+	} else {
+		return NULL;
+	}
+}
+
+static void *next_cqe_sw(struct mlx5_ib_cq *cq)
+{
+	return get_sw_cqe(cq, cq->mcq.cons_index);
+}
+
+static enum ib_wc_opcode get_umr_comp(struct mlx5_ib_wq *wq, int idx)
+{
+	switch (wq->wr_data[idx]) {
+	case MLX5_IB_WR_UMR:
+		return 0;
+
+	case IB_WR_LOCAL_INV:
+		return IB_WC_LOCAL_INV;
+
+	case IB_WR_FAST_REG_MR:
+		return IB_WC_FAST_REG_MR;
+
+	default:
+		pr_warn("unknown completion status\n");
+		return 0;
+	}
+}
+
+static void handle_good_req(struct ib_wc *wc, struct mlx5_cqe64 *cqe,
+			    struct mlx5_ib_wq *wq, int idx)
+{
+	wc->wc_flags = 0;
+	switch (be32_to_cpu(cqe->sop_drop_qpn) >> 24) {
+	case MLX5_OPCODE_RDMA_WRITE_IMM:
+		wc->wc_flags |= IB_WC_WITH_IMM;
+	case MLX5_OPCODE_RDMA_WRITE:
+		wc->opcode    = IB_WC_RDMA_WRITE;
+		break;
+	case MLX5_OPCODE_SEND_IMM:
+		wc->wc_flags |= IB_WC_WITH_IMM;
+	case MLX5_OPCODE_SEND:
+	case MLX5_OPCODE_SEND_INVAL:
+		wc->opcode    = IB_WC_SEND;
+		break;
+	case MLX5_OPCODE_RDMA_READ:
+		wc->opcode    = IB_WC_RDMA_READ;
+		wc->byte_len  = be32_to_cpu(cqe->byte_cnt);
+		break;
+	case MLX5_OPCODE_ATOMIC_CS:
+		wc->opcode    = IB_WC_COMP_SWAP;
+		wc->byte_len  = 8;
+		break;
+	case MLX5_OPCODE_ATOMIC_FA:
+		wc->opcode    = IB_WC_FETCH_ADD;
+		wc->byte_len  = 8;
+		break;
+	case MLX5_OPCODE_ATOMIC_MASKED_CS:
+		wc->opcode    = IB_WC_MASKED_COMP_SWAP;
+		wc->byte_len  = 8;
+		break;
+	case MLX5_OPCODE_ATOMIC_MASKED_FA:
+		wc->opcode    = IB_WC_MASKED_FETCH_ADD;
+		wc->byte_len  = 8;
+		break;
+	case MLX5_OPCODE_BIND_MW:
+		wc->opcode    = IB_WC_BIND_MW;
+		break;
+	case MLX5_OPCODE_UMR:
+		wc->opcode = get_umr_comp(wq, idx);
+		break;
+	}
+}
+
+enum {
+	MLX5_GRH_IN_BUFFER = 1,
+	MLX5_GRH_IN_CQE	   = 2,
+};
+
+static void handle_responder(struct ib_wc *wc, struct mlx5_cqe64 *cqe,
+			     struct mlx5_ib_qp *qp)
+{
+	struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.device);
+	struct mlx5_ib_srq *srq;
+	struct mlx5_ib_wq *wq;
+	u16 wqe_ctr;
+	u8 g;
+
+	if (qp->ibqp.srq || qp->ibqp.xrcd) {
+		struct mlx5_core_srq *msrq = NULL;
+
+		if (qp->ibqp.xrcd) {
+			msrq = mlx5_core_get_srq(dev->mdev,
+						 be32_to_cpu(cqe->srqn));
+			srq = to_mibsrq(msrq);
+		} else {
+			srq = to_msrq(qp->ibqp.srq);
+		}
+		if (srq) {
+			wqe_ctr = be16_to_cpu(cqe->wqe_counter);
+			wc->wr_id = srq->wrid[wqe_ctr];
+			mlx5_ib_free_srq_wqe(srq, wqe_ctr);
+			if (msrq && atomic_dec_and_test(&msrq->refcount))
+				complete(&msrq->free);
+		}
+	} else {
+		wq	  = &qp->rq;
+		wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
+		++wq->tail;
+	}
+	wc->byte_len = be32_to_cpu(cqe->byte_cnt);
+
+	switch (cqe->op_own >> 4) {
+	case MLX5_CQE_RESP_WR_IMM:
+		wc->opcode	= IB_WC_RECV_RDMA_WITH_IMM;
+		wc->wc_flags	= IB_WC_WITH_IMM;
+		wc->ex.imm_data = cqe->imm_inval_pkey;
+		break;
+	case MLX5_CQE_RESP_SEND:
+		wc->opcode   = IB_WC_RECV;
+		wc->wc_flags = 0;
+		break;
+	case MLX5_CQE_RESP_SEND_IMM:
+		wc->opcode	= IB_WC_RECV;
+		wc->wc_flags	= IB_WC_WITH_IMM;
+		wc->ex.imm_data = cqe->imm_inval_pkey;
+		break;
+	case MLX5_CQE_RESP_SEND_INV:
+		wc->opcode	= IB_WC_RECV;
+		wc->wc_flags	= IB_WC_WITH_INVALIDATE;
+		wc->ex.invalidate_rkey = be32_to_cpu(cqe->imm_inval_pkey);
+		break;
+	}
+	wc->slid	   = be16_to_cpu(cqe->slid);
+	wc->sl		   = (be32_to_cpu(cqe->flags_rqpn) >> 24) & 0xf;
+	wc->src_qp	   = be32_to_cpu(cqe->flags_rqpn) & 0xffffff;
+	wc->dlid_path_bits = cqe->ml_path;
+	g = (be32_to_cpu(cqe->flags_rqpn) >> 28) & 3;
+	wc->wc_flags |= g ? IB_WC_GRH : 0;
+	wc->pkey_index     = be32_to_cpu(cqe->imm_inval_pkey) & 0xffff;
+}
+
+static void dump_cqe(struct mlx5_ib_dev *dev, struct mlx5_err_cqe *cqe)
+{
+	__be32 *p = (__be32 *)cqe;
+	int i;
+
+	mlx5_ib_warn(dev, "dump error cqe\n");
+	for (i = 0; i < sizeof(*cqe) / 16; i++, p += 4)
+		pr_info("%08x %08x %08x %08x\n", be32_to_cpu(p[0]),
+			be32_to_cpu(p[1]), be32_to_cpu(p[2]),
+			be32_to_cpu(p[3]));
+}
+
+static void mlx5_handle_error_cqe(struct mlx5_ib_dev *dev,
+				  struct mlx5_err_cqe *cqe,
+				  struct ib_wc *wc)
+{
+	int dump = 1;
+
+	switch (cqe->syndrome) {
+	case MLX5_CQE_SYNDROME_LOCAL_LENGTH_ERR:
+		wc->status = IB_WC_LOC_LEN_ERR;
+		break;
+	case MLX5_CQE_SYNDROME_LOCAL_QP_OP_ERR:
+		wc->status = IB_WC_LOC_QP_OP_ERR;
+		break;
+	case MLX5_CQE_SYNDROME_LOCAL_PROT_ERR:
+		wc->status = IB_WC_LOC_PROT_ERR;
+		break;
+	case MLX5_CQE_SYNDROME_WR_FLUSH_ERR:
+		dump = 0;
+		wc->status = IB_WC_WR_FLUSH_ERR;
+		break;
+	case MLX5_CQE_SYNDROME_MW_BIND_ERR:
+		wc->status = IB_WC_MW_BIND_ERR;
+		break;
+	case MLX5_CQE_SYNDROME_BAD_RESP_ERR:
+		wc->status = IB_WC_BAD_RESP_ERR;
+		break;
+	case MLX5_CQE_SYNDROME_LOCAL_ACCESS_ERR:
+		wc->status = IB_WC_LOC_ACCESS_ERR;
+		break;
+	case MLX5_CQE_SYNDROME_REMOTE_INVAL_REQ_ERR:
+		wc->status = IB_WC_REM_INV_REQ_ERR;
+		break;
+	case MLX5_CQE_SYNDROME_REMOTE_ACCESS_ERR:
+		wc->status = IB_WC_REM_ACCESS_ERR;
+		break;
+	case MLX5_CQE_SYNDROME_REMOTE_OP_ERR:
+		wc->status = IB_WC_REM_OP_ERR;
+		break;
+	case MLX5_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR:
+		wc->status = IB_WC_RETRY_EXC_ERR;
+		dump = 0;
+		break;
+	case MLX5_CQE_SYNDROME_RNR_RETRY_EXC_ERR:
+		wc->status = IB_WC_RNR_RETRY_EXC_ERR;
+		dump = 0;
+		break;
+	case MLX5_CQE_SYNDROME_REMOTE_ABORTED_ERR:
+		wc->status = IB_WC_REM_ABORT_ERR;
+		break;
+	default:
+		wc->status = IB_WC_GENERAL_ERR;
+		break;
+	}
+
+	wc->vendor_err = cqe->vendor_err_synd;
+	if (dump)
+		dump_cqe(dev, cqe);
+}
+
+static int is_atomic_response(struct mlx5_ib_qp *qp, uint16_t idx)
+{
+	/* TBD: waiting decision
+	*/
+	return 0;
+}
+
+static void *mlx5_get_atomic_laddr(struct mlx5_ib_qp *qp, uint16_t idx)
+{
+	struct mlx5_wqe_data_seg *dpseg;
+	void *addr;
+
+	dpseg = mlx5_get_send_wqe(qp, idx) + sizeof(struct mlx5_wqe_ctrl_seg) +
+		sizeof(struct mlx5_wqe_raddr_seg) +
+		sizeof(struct mlx5_wqe_atomic_seg);
+	addr = (void *)(unsigned long)be64_to_cpu(dpseg->addr);
+	return addr;
+}
+
+static void handle_atomic(struct mlx5_ib_qp *qp, struct mlx5_cqe64 *cqe64,
+			  uint16_t idx)
+{
+	void *addr;
+	int byte_count;
+	int i;
+
+	if (!is_atomic_response(qp, idx))
+		return;
+
+	byte_count = be32_to_cpu(cqe64->byte_cnt);
+	addr = mlx5_get_atomic_laddr(qp, idx);
+
+	if (byte_count == 4) {
+		*(uint32_t *)addr = be32_to_cpu(*((__be32 *)addr));
+	} else {
+		for (i = 0; i < byte_count; i += 8) {
+			*(uint64_t *)addr = be64_to_cpu(*((__be64 *)addr));
+			addr += 8;
+		}
+	}
+
+	return;
+}
+
+static void handle_atomics(struct mlx5_ib_qp *qp, struct mlx5_cqe64 *cqe64,
+			   u16 tail, u16 head)
+{
+	u16 idx;
+
+	do {
+		idx = tail & (qp->sq.wqe_cnt - 1);
+		handle_atomic(qp, cqe64, idx);
+		if (idx == head)
+			break;
+
+		tail = qp->sq.w_list[idx].next;
+	} while (1);
+	tail = qp->sq.w_list[idx].next;
+	qp->sq.last_poll = tail;
+}
+
+static void free_cq_buf(struct mlx5_ib_dev *dev, struct mlx5_ib_cq_buf *buf)
+{
+	mlx5_buf_free(dev->mdev, &buf->buf);
+}
+
+static void get_sig_err_item(struct mlx5_sig_err_cqe *cqe,
+			     struct ib_sig_err *item)
+{
+	u16 syndrome = be16_to_cpu(cqe->syndrome);
+
+#define GUARD_ERR   (1 << 13)
+#define APPTAG_ERR  (1 << 12)
+#define REFTAG_ERR  (1 << 11)
+
+	if (syndrome & GUARD_ERR) {
+		item->err_type = IB_SIG_BAD_GUARD;
+		item->expected = be32_to_cpu(cqe->expected_trans_sig) >> 16;
+		item->actual = be32_to_cpu(cqe->actual_trans_sig) >> 16;
+	} else
+	if (syndrome & REFTAG_ERR) {
+		item->err_type = IB_SIG_BAD_REFTAG;
+		item->expected = be32_to_cpu(cqe->expected_reftag);
+		item->actual = be32_to_cpu(cqe->actual_reftag);
+	} else
+	if (syndrome & APPTAG_ERR) {
+		item->err_type = IB_SIG_BAD_APPTAG;
+		item->expected = be32_to_cpu(cqe->expected_trans_sig) & 0xffff;
+		item->actual = be32_to_cpu(cqe->actual_trans_sig) & 0xffff;
+	} else {
+		pr_err("Got signature completion error with bad syndrome %04x\n",
+		       syndrome);
+	}
+
+	item->sig_err_offset = be64_to_cpu(cqe->err_offset);
+	item->key = be32_to_cpu(cqe->mkey);
+}
+
+static int mlx5_poll_one(struct mlx5_ib_cq *cq,
+			 struct mlx5_ib_qp **cur_qp,
+			 struct ib_wc *wc)
+{
+	struct mlx5_ib_dev *dev = to_mdev(cq->ibcq.device);
+	struct mlx5_err_cqe *err_cqe;
+	struct mlx5_cqe64 *cqe64;
+	struct mlx5_core_qp *mqp;
+	struct mlx5_ib_wq *wq;
+	struct mlx5_sig_err_cqe *sig_err_cqe;
+	struct mlx5_core_mr *mmr;
+	struct mlx5_ib_mr *mr;
+	uint8_t opcode;
+	uint32_t qpn;
+	u16 wqe_ctr;
+	void *cqe;
+	int idx;
+
+repoll:
+	cqe = next_cqe_sw(cq);
+	if (!cqe)
+		return -EAGAIN;
+
+	cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64;
+
+	++cq->mcq.cons_index;
+
+	/* Make sure we read CQ entry contents after we've checked the
+	 * ownership bit.
+	 */
+	rmb();
+
+	opcode = cqe64->op_own >> 4;
+	if (unlikely(opcode == MLX5_CQE_RESIZE_CQ)) {
+		if (likely(cq->resize_buf)) {
+			free_cq_buf(dev, &cq->buf);
+			cq->buf = *cq->resize_buf;
+			kfree(cq->resize_buf);
+			cq->resize_buf = NULL;
+			goto repoll;
+		} else {
+			mlx5_ib_warn(dev, "unexpected resize cqe\n");
+		}
+	}
+
+	qpn = ntohl(cqe64->sop_drop_qpn) & 0xffffff;
+	if (!*cur_qp || (qpn != (*cur_qp)->ibqp.qp_num)) {
+		/* We do not have to take the QP table lock here,
+		 * because CQs will be locked while QPs are removed
+		 * from the table.
+		 */
+		mqp = __mlx5_qp_lookup(dev->mdev, qpn);
+		if (unlikely(!mqp)) {
+			mlx5_ib_warn(dev, "CQE@CQ %06x for unknown QPN %6x\n",
+				     cq->mcq.cqn, qpn);
+			return -EINVAL;
+		}
+
+		*cur_qp = to_mibqp(mqp);
+	}
+
+	wc->qp  = &(*cur_qp)->ibqp;
+	switch (opcode) {
+	case MLX5_CQE_REQ:
+		wq = &(*cur_qp)->sq;
+		wqe_ctr = be16_to_cpu(cqe64->wqe_counter);
+		idx = wqe_ctr & (wq->wqe_cnt - 1);
+		handle_good_req(wc, cqe64, wq, idx);
+		handle_atomics(*cur_qp, cqe64, wq->last_poll, idx);
+		wc->wr_id = wq->wrid[idx];
+		wq->tail = wq->wqe_head[idx] + 1;
+		wc->status = IB_WC_SUCCESS;
+		break;
+	case MLX5_CQE_RESP_WR_IMM:
+	case MLX5_CQE_RESP_SEND:
+	case MLX5_CQE_RESP_SEND_IMM:
+	case MLX5_CQE_RESP_SEND_INV:
+		handle_responder(wc, cqe64, *cur_qp);
+		wc->status = IB_WC_SUCCESS;
+		break;
+	case MLX5_CQE_RESIZE_CQ:
+		break;
+	case MLX5_CQE_REQ_ERR:
+	case MLX5_CQE_RESP_ERR:
+		err_cqe = (struct mlx5_err_cqe *)cqe64;
+		mlx5_handle_error_cqe(dev, err_cqe, wc);
+		mlx5_ib_dbg(dev, "%s error cqe on cqn 0x%x:\n",
+			    opcode == MLX5_CQE_REQ_ERR ?
+			    "Requestor" : "Responder", cq->mcq.cqn);
+		mlx5_ib_dbg(dev, "syndrome 0x%x, vendor syndrome 0x%x\n",
+			    err_cqe->syndrome, err_cqe->vendor_err_synd);
+		if (opcode == MLX5_CQE_REQ_ERR) {
+			wq = &(*cur_qp)->sq;
+			wqe_ctr = be16_to_cpu(cqe64->wqe_counter);
+			idx = wqe_ctr & (wq->wqe_cnt - 1);
+			wc->wr_id = wq->wrid[idx];
+			wq->tail = wq->wqe_head[idx] + 1;
+		} else {
+			struct mlx5_ib_srq *srq;
+
+			if ((*cur_qp)->ibqp.srq) {
+				srq = to_msrq((*cur_qp)->ibqp.srq);
+				wqe_ctr = be16_to_cpu(cqe64->wqe_counter);
+				wc->wr_id = srq->wrid[wqe_ctr];
+				mlx5_ib_free_srq_wqe(srq, wqe_ctr);
+			} else {
+				wq = &(*cur_qp)->rq;
+				wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
+				++wq->tail;
+			}
+		}
+		break;
+	case MLX5_CQE_SIG_ERR:
+		sig_err_cqe = (struct mlx5_sig_err_cqe *)cqe64;
+
+		read_lock(&dev->mdev->priv.mr_table.lock);
+		mmr = __mlx5_mr_lookup(dev->mdev,
+				       mlx5_base_mkey(be32_to_cpu(sig_err_cqe->mkey)));
+		if (unlikely(!mmr)) {
+			read_unlock(&dev->mdev->priv.mr_table.lock);
+			mlx5_ib_warn(dev, "CQE@CQ %06x for unknown MR %6x\n",
+				     cq->mcq.cqn, be32_to_cpu(sig_err_cqe->mkey));
+			return -EINVAL;
+		}
+
+		mr = to_mibmr(mmr);
+		get_sig_err_item(sig_err_cqe, &mr->sig->err_item);
+		mr->sig->sig_err_exists = true;
+		mr->sig->sigerr_count++;
+
+		mlx5_ib_warn(dev, "CQN: 0x%x Got SIGERR on key: 0x%x err_type %x err_offset %llx expected %x actual %x\n",
+			     cq->mcq.cqn, mr->sig->err_item.key,
+			     mr->sig->err_item.err_type,
+			     mr->sig->err_item.sig_err_offset,
+			     mr->sig->err_item.expected,
+			     mr->sig->err_item.actual);
+
+		read_unlock(&dev->mdev->priv.mr_table.lock);
+		goto repoll;
+	}
+
+	return 0;
+}
+
+int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc)
+{
+	struct mlx5_ib_cq *cq = to_mcq(ibcq);
+	struct mlx5_ib_qp *cur_qp = NULL;
+	unsigned long flags;
+	int npolled;
+	int err = 0;
+
+	spin_lock_irqsave(&cq->lock, flags);
+
+	for (npolled = 0; npolled < num_entries; npolled++) {
+		err = mlx5_poll_one(cq, &cur_qp, wc + npolled);
+		if (err)
+			break;
+	}
+
+	if (npolled)
+		mlx5_cq_set_ci(&cq->mcq);
+
+	spin_unlock_irqrestore(&cq->lock, flags);
+
+	if (err == 0 || err == -EAGAIN)
+		return npolled;
+	else
+		return err;
+}
+
+int mlx5_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags)
+{
+	mlx5_cq_arm(&to_mcq(ibcq)->mcq,
+		    (flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED ?
+		    MLX5_CQ_DB_REQ_NOT_SOL : MLX5_CQ_DB_REQ_NOT,
+		    to_mdev(ibcq->device)->mdev->priv.uuari.uars[0].map,
+		    MLX5_GET_DOORBELL_LOCK(&to_mdev(ibcq->device)->mdev->priv.cq_uar_lock));
+
+	return 0;
+}
+
+static int alloc_cq_buf(struct mlx5_ib_dev *dev, struct mlx5_ib_cq_buf *buf,
+			int nent, int cqe_size)
+{
+	int err;
+
+	err = mlx5_buf_alloc(dev->mdev, nent * cqe_size,
+			     PAGE_SIZE * 2, &buf->buf);
+	if (err)
+		return err;
+
+	buf->cqe_size = cqe_size;
+	buf->nent = nent;
+
+	return 0;
+}
+
+static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata,
+			  struct ib_ucontext *context, struct mlx5_ib_cq *cq,
+			  int entries, struct mlx5_create_cq_mbox_in **cqb,
+			  int *cqe_size, int *index, int *inlen)
+{
+	struct mlx5_ib_create_cq ucmd;
+	size_t ucmdlen;
+	int page_shift;
+	int npages;
+	int ncont;
+	int err;
+
+	ucmdlen =
+		(udata->inlen - sizeof(struct ib_uverbs_cmd_hdr) <
+		 sizeof(ucmd)) ? (sizeof(ucmd) -
+				  sizeof(ucmd.reserved)) : sizeof(ucmd);
+
+	if (ib_copy_from_udata(&ucmd, udata, ucmdlen))
+		return -EFAULT;
+
+	if (ucmdlen == sizeof(ucmd) &&
+	    ucmd.reserved != 0)
+		return -EINVAL;
+
+	if (ucmd.cqe_size != 64 && ucmd.cqe_size != 128)
+		return -EINVAL;
+
+	*cqe_size = ucmd.cqe_size;
+
+	cq->buf.umem = ib_umem_get(context, ucmd.buf_addr,
+				   entries * ucmd.cqe_size,
+				   IB_ACCESS_LOCAL_WRITE, 1);
+	if (IS_ERR(cq->buf.umem)) {
+		err = PTR_ERR(cq->buf.umem);
+		return err;
+	}
+
+	err = mlx5_ib_db_map_user(to_mucontext(context), ucmd.db_addr,
+				  &cq->db);
+	if (err)
+		goto err_umem;
+
+	mlx5_ib_cont_pages(cq->buf.umem, ucmd.buf_addr, &npages, &page_shift,
+			   &ncont, NULL);
+	mlx5_ib_dbg(dev, "addr 0x%llx, size %u, npages %d, page_shift %d, ncont %d\n",
+		    ucmd.buf_addr, entries * ucmd.cqe_size, npages, page_shift, ncont);
+
+	*inlen = sizeof(**cqb) + sizeof(*(*cqb)->pas) * ncont;
+	*cqb = mlx5_vzalloc(*inlen);
+	if (!*cqb) {
+		err = -ENOMEM;
+		goto err_db;
+	}
+	mlx5_ib_populate_pas(dev, cq->buf.umem, page_shift, (*cqb)->pas, 0);
+	(*cqb)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT;
+
+	*index = to_mucontext(context)->uuari.uars[0].index;
+
+	return 0;
+
+err_db:
+	mlx5_ib_db_unmap_user(to_mucontext(context), &cq->db);
+
+err_umem:
+	ib_umem_release(cq->buf.umem);
+	return err;
+}
+
+static void destroy_cq_user(struct mlx5_ib_cq *cq, struct ib_ucontext *context)
+{
+	mlx5_ib_db_unmap_user(to_mucontext(context), &cq->db);
+	ib_umem_release(cq->buf.umem);
+}
+
+static void init_cq_buf(struct mlx5_ib_cq *cq, struct mlx5_ib_cq_buf *buf)
+{
+	int i;
+	void *cqe;
+	struct mlx5_cqe64 *cqe64;
+
+	for (i = 0; i < buf->nent; i++) {
+		cqe = get_cqe_from_buf(buf, i, buf->cqe_size);
+		cqe64 = buf->cqe_size == 64 ? cqe : cqe + 64;
+		cqe64->op_own = MLX5_CQE_INVALID << 4;
+	}
+}
+
+static int create_cq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq,
+			    int entries, int cqe_size,
+			    struct mlx5_create_cq_mbox_in **cqb,
+			    int *index, int *inlen)
+{
+	int err;
+
+	err = mlx5_db_alloc(dev->mdev, &cq->db);
+	if (err)
+		return err;
+
+	cq->mcq.set_ci_db  = cq->db.db;
+	cq->mcq.arm_db     = cq->db.db + 1;
+	*cq->mcq.set_ci_db = 0;
+	*cq->mcq.arm_db    = 0;
+	cq->mcq.cqe_sz = cqe_size;
+
+	err = alloc_cq_buf(dev, &cq->buf, entries, cqe_size);
+	if (err)
+		goto err_db;
+
+	init_cq_buf(cq, &cq->buf);
+
+	*inlen = sizeof(**cqb) + sizeof(*(*cqb)->pas) * cq->buf.buf.npages;
+	*cqb = mlx5_vzalloc(*inlen);
+	if (!*cqb) {
+		err = -ENOMEM;
+		goto err_buf;
+	}
+	mlx5_fill_page_array(&cq->buf.buf, (*cqb)->pas);
+
+	(*cqb)->ctx.log_pg_sz = cq->buf.buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT;
+	*index = dev->mdev->priv.uuari.uars[0].index;
+
+	return 0;
+
+err_buf:
+	free_cq_buf(dev, &cq->buf);
+
+err_db:
+	mlx5_db_free(dev->mdev, &cq->db);
+	return err;
+}
+
+static void destroy_cq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq)
+{
+	free_cq_buf(dev, &cq->buf);
+	mlx5_db_free(dev->mdev, &cq->db);
+}
+
+struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, int entries,
+				int vector, struct ib_ucontext *context,
+				struct ib_udata *udata)
+{
+	struct mlx5_create_cq_mbox_in *cqb = NULL;
+	struct mlx5_ib_dev *dev = to_mdev(ibdev);
+	struct mlx5_ib_cq *cq;
+	int uninitialized_var(index);
+	int uninitialized_var(inlen);
+	int cqe_size;
+	int irqn;
+	int eqn;
+	int err;
+
+	if (entries < 0)
+		return ERR_PTR(-EINVAL);
+
+	entries = roundup_pow_of_two(entries + 1);
+	if (entries > dev->mdev->caps.gen.max_cqes)
+		return ERR_PTR(-EINVAL);
+
+	cq = kzalloc(sizeof(*cq), GFP_KERNEL);
+	if (!cq)
+		return ERR_PTR(-ENOMEM);
+
+	cq->ibcq.cqe = entries - 1;
+	mutex_init(&cq->resize_mutex);
+	spin_lock_init(&cq->lock);
+	cq->resize_buf = NULL;
+	cq->resize_umem = NULL;
+
+	if (context) {
+		err = create_cq_user(dev, udata, context, cq, entries,
+				     &cqb, &cqe_size, &index, &inlen);
+		if (err)
+			goto err_create;
+	} else {
+		/* for now choose 64 bytes till we have a proper interface */
+		cqe_size = 64;
+		err = create_cq_kernel(dev, cq, entries, cqe_size, &cqb,
+				       &index, &inlen);
+		if (err)
+			goto err_create;
+	}
+
+	cq->cqe_size = cqe_size;
+	cqb->ctx.cqe_sz_flags = cqe_sz_to_mlx_sz(cqe_size) << 5;
+	cqb->ctx.log_sz_usr_page = cpu_to_be32((ilog2(entries) << 24) | index);
+	err = mlx5_vector2eqn(dev, vector, &eqn, &irqn);
+	if (err)
+		goto err_cqb;
+
+	cqb->ctx.c_eqn = cpu_to_be16(eqn);
+	cqb->ctx.db_record_addr = cpu_to_be64(cq->db.dma);
+
+	err = mlx5_core_create_cq(dev->mdev, &cq->mcq, cqb, inlen);
+	if (err)
+		goto err_cqb;
+
+	mlx5_ib_dbg(dev, "cqn 0x%x\n", cq->mcq.cqn);
+	cq->mcq.irqn = irqn;
+	cq->mcq.comp  = mlx5_ib_cq_comp;
+	cq->mcq.event = mlx5_ib_cq_event;
+
+	if (context)
+		if (ib_copy_to_udata(udata, &cq->mcq.cqn, sizeof(__u32))) {
+			err = -EFAULT;
+			goto err_cmd;
+		}
+
+
+	mlx5_vfree(cqb);
+	return &cq->ibcq;
+
+err_cmd:
+	mlx5_core_destroy_cq(dev->mdev, &cq->mcq);
+
+err_cqb:
+	mlx5_vfree(cqb);
+	if (context)
+		destroy_cq_user(cq, context);
+	else
+		destroy_cq_kernel(dev, cq);
+
+err_create:
+	kfree(cq);
+
+	return ERR_PTR(err);
+}
+
+
+int mlx5_ib_destroy_cq(struct ib_cq *cq)
+{
+	struct mlx5_ib_dev *dev = to_mdev(cq->device);
+	struct mlx5_ib_cq *mcq = to_mcq(cq);
+	struct ib_ucontext *context = NULL;
+
+	if (cq->uobject)
+		context = cq->uobject->context;
+
+	mlx5_core_destroy_cq(dev->mdev, &mcq->mcq);
+	if (context)
+		destroy_cq_user(mcq, context);
+	else
+		destroy_cq_kernel(dev, mcq);
+
+	kfree(mcq);
+
+	return 0;
+}
+
+static int is_equal_rsn(struct mlx5_cqe64 *cqe64, u32 rsn)
+{
+	return rsn == (ntohl(cqe64->sop_drop_qpn) & 0xffffff);
+}
+
+void __mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 rsn, struct mlx5_ib_srq *srq)
+{
+	struct mlx5_cqe64 *cqe64, *dest64;
+	void *cqe, *dest;
+	u32 prod_index;
+	int nfreed = 0;
+	u8 owner_bit;
+
+	if (!cq)
+		return;
+
+	/* First we need to find the current producer index, so we
+	 * know where to start cleaning from.  It doesn't matter if HW
+	 * adds new entries after this loop -- the QP we're worried
+	 * about is already in RESET, so the new entries won't come
+	 * from our QP and therefore don't need to be checked.
+	 */
+	for (prod_index = cq->mcq.cons_index; get_sw_cqe(cq, prod_index); prod_index++)
+		if (prod_index == cq->mcq.cons_index + cq->ibcq.cqe)
+			break;
+
+	/* Now sweep backwards through the CQ, removing CQ entries
+	 * that match our QP by copying older entries on top of them.
+	 */
+	while ((int) --prod_index - (int) cq->mcq.cons_index >= 0) {
+		cqe = get_cqe(cq, prod_index & cq->ibcq.cqe);
+		cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64;
+		if (is_equal_rsn(cqe64, rsn)) {
+			if (srq && (ntohl(cqe64->srqn) & 0xffffff))
+				mlx5_ib_free_srq_wqe(srq, be16_to_cpu(cqe64->wqe_counter));
+			++nfreed;
+		} else if (nfreed) {
+			dest = get_cqe(cq, (prod_index + nfreed) & cq->ibcq.cqe);
+			dest64 = (cq->mcq.cqe_sz == 64) ? dest : dest + 64;
+			owner_bit = dest64->op_own & MLX5_CQE_OWNER_MASK;
+			memcpy(dest, cqe, cq->mcq.cqe_sz);
+			dest64->op_own = owner_bit |
+				(dest64->op_own & ~MLX5_CQE_OWNER_MASK);
+		}
+	}
+
+	if (nfreed) {
+		cq->mcq.cons_index += nfreed;
+		/* Make sure update of buffer contents is done before
+		 * updating consumer index.
+		 */
+		wmb();
+		mlx5_cq_set_ci(&cq->mcq);
+	}
+}
+
+void mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 qpn, struct mlx5_ib_srq *srq)
+{
+	if (!cq)
+		return;
+
+	spin_lock_irq(&cq->lock);
+	__mlx5_ib_cq_clean(cq, qpn, srq);
+	spin_unlock_irq(&cq->lock);
+}
+
+int mlx5_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period)
+{
+	struct mlx5_modify_cq_mbox_in *in;
+	struct mlx5_ib_dev *dev = to_mdev(cq->device);
+	struct mlx5_ib_cq *mcq = to_mcq(cq);
+	int err;
+	u32 fsel;
+
+	if (!(dev->mdev->caps.gen.flags & MLX5_DEV_CAP_FLAG_CQ_MODER))
+		return -ENOSYS;
+
+	in = kzalloc(sizeof(*in), GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	in->cqn = cpu_to_be32(mcq->mcq.cqn);
+	fsel = (MLX5_CQ_MODIFY_PERIOD | MLX5_CQ_MODIFY_COUNT);
+	in->ctx.cq_period = cpu_to_be16(cq_period);
+	in->ctx.cq_max_count = cpu_to_be16(cq_count);
+	in->field_select = cpu_to_be32(fsel);
+	err = mlx5_core_modify_cq(dev->mdev, &mcq->mcq, in, sizeof(*in));
+	kfree(in);
+
+	if (err)
+		mlx5_ib_warn(dev, "modify cq 0x%x failed\n", mcq->mcq.cqn);
+
+	return err;
+}
+
+static int resize_user(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq,
+		       int entries, struct ib_udata *udata, int *npas,
+		       int *page_shift, int *cqe_size)
+{
+	struct mlx5_ib_resize_cq ucmd;
+	struct ib_umem *umem;
+	int err;
+	int npages;
+	struct ib_ucontext *context = cq->buf.umem->context;
+
+	err = ib_copy_from_udata(&ucmd, udata, sizeof(ucmd));
+	if (err)
+		return err;
+
+	if (ucmd.reserved0 || ucmd.reserved1)
+		return -EINVAL;
+
+	umem = ib_umem_get(context, ucmd.buf_addr, entries * ucmd.cqe_size,
+			   IB_ACCESS_LOCAL_WRITE, 1);
+	if (IS_ERR(umem)) {
+		err = PTR_ERR(umem);
+		return err;
+	}
+
+	mlx5_ib_cont_pages(umem, ucmd.buf_addr, &npages, page_shift,
+			   npas, NULL);
+
+	cq->resize_umem = umem;
+	*cqe_size = ucmd.cqe_size;
+
+	return 0;
+}
+
+static void un_resize_user(struct mlx5_ib_cq *cq)
+{
+	ib_umem_release(cq->resize_umem);
+}
+
+static int resize_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq,
+			 int entries, int cqe_size)
+{
+	int err;
+
+	cq->resize_buf = kzalloc(sizeof(*cq->resize_buf), GFP_KERNEL);
+	if (!cq->resize_buf)
+		return -ENOMEM;
+
+	err = alloc_cq_buf(dev, cq->resize_buf, entries, cqe_size);
+	if (err)
+		goto ex;
+
+	init_cq_buf(cq, cq->resize_buf);
+
+	return 0;
+
+ex:
+	kfree(cq->resize_buf);
+	return err;
+}
+
+static void un_resize_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq)
+{
+	free_cq_buf(dev, cq->resize_buf);
+	cq->resize_buf = NULL;
+}
+
+static int copy_resize_cqes(struct mlx5_ib_cq *cq)
+{
+	struct mlx5_ib_dev *dev = to_mdev(cq->ibcq.device);
+	struct mlx5_cqe64 *scqe64;
+	struct mlx5_cqe64 *dcqe64;
+	void *start_cqe;
+	void *scqe;
+	void *dcqe;
+	int ssize;
+	int dsize;
+	int i;
+	u8 sw_own;
+
+	ssize = cq->buf.cqe_size;
+	dsize = cq->resize_buf->cqe_size;
+	if (ssize != dsize) {
+		mlx5_ib_warn(dev, "resize from different cqe size is not supported\n");
+		return -EINVAL;
+	}
+
+	i = cq->mcq.cons_index;
+	scqe = get_sw_cqe(cq, i);
+	scqe64 = ssize == 64 ? scqe : scqe + 64;
+	start_cqe = scqe;
+	if (!scqe) {
+		mlx5_ib_warn(dev, "expected cqe in sw ownership\n");
+		return -EINVAL;
+	}
+
+	while ((scqe64->op_own >> 4) != MLX5_CQE_RESIZE_CQ) {
+		dcqe = get_cqe_from_buf(cq->resize_buf,
+					(i + 1) & (cq->resize_buf->nent),
+					dsize);
+		dcqe64 = dsize == 64 ? dcqe : dcqe + 64;
+		sw_own = sw_ownership_bit(i + 1, cq->resize_buf->nent);
+		memcpy(dcqe, scqe, dsize);
+		dcqe64->op_own = (dcqe64->op_own & ~MLX5_CQE_OWNER_MASK) | sw_own;
+
+		++i;
+		scqe = get_sw_cqe(cq, i);
+		scqe64 = ssize == 64 ? scqe : scqe + 64;
+		if (!scqe) {
+			mlx5_ib_warn(dev, "expected cqe in sw ownership\n");
+			return -EINVAL;
+		}
+
+		if (scqe == start_cqe) {
+			pr_warn("resize CQ failed to get resize CQE, CQN 0x%x\n",
+				cq->mcq.cqn);
+			return -ENOMEM;
+		}
+	}
+	++cq->mcq.cons_index;
+	return 0;
+}
+
+int mlx5_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata)
+{
+	struct mlx5_ib_dev *dev = to_mdev(ibcq->device);
+	struct mlx5_ib_cq *cq = to_mcq(ibcq);
+	struct mlx5_modify_cq_mbox_in *in;
+	int err;
+	int npas;
+	int page_shift;
+	int inlen;
+	int uninitialized_var(cqe_size);
+	unsigned long flags;
+
+	if (!(dev->mdev->caps.gen.flags & MLX5_DEV_CAP_FLAG_RESIZE_CQ)) {
+		pr_info("Firmware does not support resize CQ\n");
+		return -ENOSYS;
+	}
+
+	if (entries < 1)
+		return -EINVAL;
+
+	entries = roundup_pow_of_two(entries + 1);
+	if (entries > dev->mdev->caps.gen.max_cqes + 1)
+		return -EINVAL;
+
+	if (entries == ibcq->cqe + 1)
+		return 0;
+
+	mutex_lock(&cq->resize_mutex);
+	if (udata) {
+		err = resize_user(dev, cq, entries, udata, &npas, &page_shift,
+				  &cqe_size);
+	} else {
+		cqe_size = 64;
+		err = resize_kernel(dev, cq, entries, cqe_size);
+		if (!err) {
+			npas = cq->resize_buf->buf.npages;
+			page_shift = cq->resize_buf->buf.page_shift;
+		}
+	}
+
+	if (err)
+		goto ex;
+
+	inlen = sizeof(*in) + npas * sizeof(in->pas[0]);
+	in = mlx5_vzalloc(inlen);
+	if (!in) {
+		err = -ENOMEM;
+		goto ex_resize;
+	}
+
+	if (udata)
+		mlx5_ib_populate_pas(dev, cq->resize_umem, page_shift,
+				     in->pas, 0);
+	else
+		mlx5_fill_page_array(&cq->resize_buf->buf, in->pas);
+
+	in->field_select = cpu_to_be32(MLX5_MODIFY_CQ_MASK_LOG_SIZE  |
+				       MLX5_MODIFY_CQ_MASK_PG_OFFSET |
+				       MLX5_MODIFY_CQ_MASK_PG_SIZE);
+	in->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT;
+	in->ctx.cqe_sz_flags = cqe_sz_to_mlx_sz(cqe_size) << 5;
+	in->ctx.page_offset = 0;
+	in->ctx.log_sz_usr_page = cpu_to_be32(ilog2(entries) << 24);
+	in->hdr.opmod = cpu_to_be16(MLX5_CQ_OPMOD_RESIZE);
+	in->cqn = cpu_to_be32(cq->mcq.cqn);
+
+	err = mlx5_core_modify_cq(dev->mdev, &cq->mcq, in, inlen);
+	if (err)
+		goto ex_alloc;
+
+	if (udata) {
+		cq->ibcq.cqe = entries - 1;
+		ib_umem_release(cq->buf.umem);
+		cq->buf.umem = cq->resize_umem;
+		cq->resize_umem = NULL;
+	} else {
+		struct mlx5_ib_cq_buf tbuf;
+		int resized = 0;
+
+		spin_lock_irqsave(&cq->lock, flags);
+		if (cq->resize_buf) {
+			err = copy_resize_cqes(cq);
+			if (!err) {
+				tbuf = cq->buf;
+				cq->buf = *cq->resize_buf;
+				kfree(cq->resize_buf);
+				cq->resize_buf = NULL;
+				resized = 1;
+			}
+		}
+		cq->ibcq.cqe = entries - 1;
+		spin_unlock_irqrestore(&cq->lock, flags);
+		if (resized)
+			free_cq_buf(dev, &tbuf);
+	}
+	mutex_unlock(&cq->resize_mutex);
+
+	mlx5_vfree(in);
+	return 0;
+
+ex_alloc:
+	mlx5_vfree(in);
+
+ex_resize:
+	if (udata)
+		un_resize_user(cq);
+	else
+		un_resize_kernel(dev, cq);
+ex:
+	mutex_unlock(&cq->resize_mutex);
+	return err;
+}
+
+int mlx5_ib_get_cqe_size(struct mlx5_ib_dev *dev, struct ib_cq *ibcq)
+{
+	struct mlx5_ib_cq *cq;
+
+	if (!ibcq)
+		return 128;
+
+	cq = to_mcq(ibcq);
+	return cq->cqe_size;
+}
diff --git a/drivers/infiniband/hw/mlx5/doorbell.c b/drivers/infiniband/hw/mlx5/doorbell.c
new file mode 100644
index 0000000..ece028f
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/doorbell.c
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/kref.h>
+#include <linux/slab.h>
+#include <rdma/ib_umem.h>
+
+#include "mlx5_ib.h"
+
+struct mlx5_ib_user_db_page {
+	struct list_head	list;
+	struct ib_umem	       *umem;
+	unsigned long		user_virt;
+	int			refcnt;
+};
+
+int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context, unsigned long virt,
+			struct mlx5_db *db)
+{
+	struct mlx5_ib_user_db_page *page;
+	int err = 0;
+
+	mutex_lock(&context->db_page_mutex);
+
+	list_for_each_entry(page, &context->db_page_list, list)
+		if (page->user_virt == (virt & PAGE_MASK))
+			goto found;
+
+	page = kmalloc(sizeof(*page), GFP_KERNEL);
+	if (!page) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	page->user_virt = (virt & PAGE_MASK);
+	page->refcnt    = 0;
+	page->umem      = ib_umem_get(&context->ibucontext, virt & PAGE_MASK,
+				      PAGE_SIZE, 0, 0);
+	if (IS_ERR(page->umem)) {
+		err = PTR_ERR(page->umem);
+		kfree(page);
+		goto out;
+	}
+
+	list_add(&page->list, &context->db_page_list);
+
+found:
+	db->dma = sg_dma_address(page->umem->sg_head.sgl) + (virt & ~PAGE_MASK);
+	db->u.user_page = page;
+	++page->refcnt;
+
+out:
+	mutex_unlock(&context->db_page_mutex);
+
+	return err;
+}
+
+void mlx5_ib_db_unmap_user(struct mlx5_ib_ucontext *context, struct mlx5_db *db)
+{
+	mutex_lock(&context->db_page_mutex);
+
+	if (!--db->u.user_page->refcnt) {
+		list_del(&db->u.user_page->list);
+		ib_umem_release(db->u.user_page->umem);
+		kfree(db->u.user_page);
+	}
+
+	mutex_unlock(&context->db_page_mutex);
+}
diff --git a/drivers/infiniband/hw/mlx5/mad.c b/drivers/infiniband/hw/mlx5/mad.c
new file mode 100644
index 0000000..657af9a
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/mad.c
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mlx5/cmd.h>
+#include <rdma/ib_mad.h>
+#include <rdma/ib_smi.h>
+#include "mlx5_ib.h"
+
+enum {
+	MLX5_IB_VENDOR_CLASS1 = 0x9,
+	MLX5_IB_VENDOR_CLASS2 = 0xa
+};
+
+int mlx5_MAD_IFC(struct mlx5_ib_dev *dev, int ignore_mkey, int ignore_bkey,
+		 u8 port, struct ib_wc *in_wc, struct ib_grh *in_grh,
+		 void *in_mad, void *response_mad)
+{
+	u8 op_modifier = 0;
+
+	/* Key check traps can't be generated unless we have in_wc to
+	 * tell us where to send the trap.
+	 */
+	if (ignore_mkey || !in_wc)
+		op_modifier |= 0x1;
+	if (ignore_bkey || !in_wc)
+		op_modifier |= 0x2;
+
+	return mlx5_core_mad_ifc(dev->mdev, in_mad, response_mad, op_modifier, port);
+}
+
+int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
+			struct ib_wc *in_wc, struct ib_grh *in_grh,
+			struct ib_mad *in_mad, struct ib_mad *out_mad)
+{
+	u16 slid;
+	int err;
+
+	slid = in_wc ? in_wc->slid : be16_to_cpu(IB_LID_PERMISSIVE);
+
+	if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP && slid == 0)
+		return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+
+	if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED ||
+	    in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
+		if (in_mad->mad_hdr.method   != IB_MGMT_METHOD_GET &&
+		    in_mad->mad_hdr.method   != IB_MGMT_METHOD_SET &&
+		    in_mad->mad_hdr.method   != IB_MGMT_METHOD_TRAP_REPRESS)
+			return IB_MAD_RESULT_SUCCESS;
+
+		/* Don't process SMInfo queries -- the SMA can't handle them.
+		 */
+		if (in_mad->mad_hdr.attr_id == IB_SMP_ATTR_SM_INFO)
+			return IB_MAD_RESULT_SUCCESS;
+	} else if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT ||
+		   in_mad->mad_hdr.mgmt_class == MLX5_IB_VENDOR_CLASS1   ||
+		   in_mad->mad_hdr.mgmt_class == MLX5_IB_VENDOR_CLASS2   ||
+		   in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_CONG_MGMT) {
+		if (in_mad->mad_hdr.method  != IB_MGMT_METHOD_GET &&
+		    in_mad->mad_hdr.method  != IB_MGMT_METHOD_SET)
+			return IB_MAD_RESULT_SUCCESS;
+	} else {
+		return IB_MAD_RESULT_SUCCESS;
+	}
+
+	err = mlx5_MAD_IFC(to_mdev(ibdev),
+			   mad_flags & IB_MAD_IGNORE_MKEY,
+			   mad_flags & IB_MAD_IGNORE_BKEY,
+			   port_num, in_wc, in_grh, in_mad, out_mad);
+	if (err)
+		return IB_MAD_RESULT_FAILURE;
+
+	/* set return bit in status of directed route responses */
+	if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+		out_mad->mad_hdr.status |= cpu_to_be16(1 << 15);
+
+	if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP_REPRESS)
+		/* no response for trap repress */
+		return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+
+	return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
+}
+
+int mlx5_query_ext_port_caps(struct mlx5_ib_dev *dev, u8 port)
+{
+	struct ib_smp *in_mad  = NULL;
+	struct ib_smp *out_mad = NULL;
+	int err = -ENOMEM;
+	u16 packet_error;
+
+	in_mad  = kzalloc(sizeof(*in_mad), GFP_KERNEL);
+	out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL);
+	if (!in_mad || !out_mad)
+		goto out;
+
+	init_query_mad(in_mad);
+	in_mad->attr_id = MLX5_ATTR_EXTENDED_PORT_INFO;
+	in_mad->attr_mod = cpu_to_be32(port);
+
+	err = mlx5_MAD_IFC(dev, 1, 1, 1, NULL, NULL, in_mad, out_mad);
+
+	packet_error = be16_to_cpu(out_mad->status);
+
+	dev->mdev->caps.gen.ext_port_cap[port - 1] = (!err && !packet_error) ?
+		MLX_EXT_PORT_CAP_FLAG_EXTENDED_PORT_INFO : 0;
+
+out:
+	kfree(in_mad);
+	kfree(out_mad);
+	return err;
+}
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
new file mode 100644
index 0000000..1ba6c42
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -0,0 +1,1455 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <asm-generic/kmap_types.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <linux/slab.h>
+#include <linux/io-mapping.h>
+#include <linux/sched.h>
+#include <rdma/ib_user_verbs.h>
+#include <rdma/ib_smi.h>
+#include <rdma/ib_umem.h>
+#include "user.h"
+#include "mlx5_ib.h"
+
+#define DRIVER_NAME "mlx5_ib"
+#define DRIVER_VERSION "2.2-1"
+#define DRIVER_RELDATE	"Feb 2014"
+
+MODULE_AUTHOR("Eli Cohen <eli@mellanox.com>");
+MODULE_DESCRIPTION("Mellanox Connect-IB HCA IB driver");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_VERSION(DRIVER_VERSION);
+
+static int deprecated_prof_sel = 2;
+module_param_named(prof_sel, deprecated_prof_sel, int, 0444);
+MODULE_PARM_DESC(prof_sel, "profile selector. Deprecated here. Moved to module mlx5_core");
+
+static char mlx5_version[] =
+	DRIVER_NAME ": Mellanox Connect-IB Infiniband driver v"
+	DRIVER_VERSION " (" DRIVER_RELDATE ")\n";
+
+int mlx5_vector2eqn(struct mlx5_ib_dev *dev, int vector, int *eqn, int *irqn)
+{
+	struct mlx5_eq_table *table = &dev->mdev->priv.eq_table;
+	struct mlx5_eq *eq, *n;
+	int err = -ENOENT;
+
+	spin_lock(&table->lock);
+	list_for_each_entry_safe(eq, n, &dev->eqs_list, list) {
+		if (eq->index == vector) {
+			*eqn = eq->eqn;
+			*irqn = eq->irqn;
+			err = 0;
+			break;
+		}
+	}
+	spin_unlock(&table->lock);
+
+	return err;
+}
+
+static int alloc_comp_eqs(struct mlx5_ib_dev *dev)
+{
+	struct mlx5_eq_table *table = &dev->mdev->priv.eq_table;
+	char name[MLX5_MAX_EQ_NAME];
+	struct mlx5_eq *eq, *n;
+	int ncomp_vec;
+	int nent;
+	int err;
+	int i;
+
+	INIT_LIST_HEAD(&dev->eqs_list);
+	ncomp_vec = table->num_comp_vectors;
+	nent = MLX5_COMP_EQ_SIZE;
+	for (i = 0; i < ncomp_vec; i++) {
+		eq = kzalloc(sizeof(*eq), GFP_KERNEL);
+		if (!eq) {
+			err = -ENOMEM;
+			goto clean;
+		}
+
+		snprintf(name, MLX5_MAX_EQ_NAME, "mlx5_comp%d", i);
+		err = mlx5_create_map_eq(dev->mdev, eq,
+					 i + MLX5_EQ_VEC_COMP_BASE, nent, 0,
+					 name, &dev->mdev->priv.uuari.uars[0]);
+		if (err) {
+			kfree(eq);
+			goto clean;
+		}
+		mlx5_ib_dbg(dev, "allocated completion EQN %d\n", eq->eqn);
+		eq->index = i;
+		spin_lock(&table->lock);
+		list_add_tail(&eq->list, &dev->eqs_list);
+		spin_unlock(&table->lock);
+	}
+
+	dev->num_comp_vectors = ncomp_vec;
+	return 0;
+
+clean:
+	spin_lock(&table->lock);
+	list_for_each_entry_safe(eq, n, &dev->eqs_list, list) {
+		list_del(&eq->list);
+		spin_unlock(&table->lock);
+		if (mlx5_destroy_unmap_eq(dev->mdev, eq))
+			mlx5_ib_warn(dev, "failed to destroy EQ 0x%x\n", eq->eqn);
+		kfree(eq);
+		spin_lock(&table->lock);
+	}
+	spin_unlock(&table->lock);
+	return err;
+}
+
+static void free_comp_eqs(struct mlx5_ib_dev *dev)
+{
+	struct mlx5_eq_table *table = &dev->mdev->priv.eq_table;
+	struct mlx5_eq *eq, *n;
+
+	spin_lock(&table->lock);
+	list_for_each_entry_safe(eq, n, &dev->eqs_list, list) {
+		list_del(&eq->list);
+		spin_unlock(&table->lock);
+		if (mlx5_destroy_unmap_eq(dev->mdev, eq))
+			mlx5_ib_warn(dev, "failed to destroy EQ 0x%x\n", eq->eqn);
+		kfree(eq);
+		spin_lock(&table->lock);
+	}
+	spin_unlock(&table->lock);
+}
+
+static int mlx5_ib_query_device(struct ib_device *ibdev,
+				struct ib_device_attr *props)
+{
+	struct mlx5_ib_dev *dev = to_mdev(ibdev);
+	struct ib_smp *in_mad  = NULL;
+	struct ib_smp *out_mad = NULL;
+	struct mlx5_general_caps *gen;
+	int err = -ENOMEM;
+	int max_rq_sg;
+	int max_sq_sg;
+	u64 flags;
+
+	gen = &dev->mdev->caps.gen;
+	in_mad  = kzalloc(sizeof(*in_mad), GFP_KERNEL);
+	out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL);
+	if (!in_mad || !out_mad)
+		goto out;
+
+	init_query_mad(in_mad);
+	in_mad->attr_id = IB_SMP_ATTR_NODE_INFO;
+
+	err = mlx5_MAD_IFC(to_mdev(ibdev), 1, 1, 1, NULL, NULL, in_mad, out_mad);
+	if (err)
+		goto out;
+
+	memset(props, 0, sizeof(*props));
+
+	props->fw_ver = ((u64)fw_rev_maj(dev->mdev) << 32) |
+		(fw_rev_min(dev->mdev) << 16) |
+		fw_rev_sub(dev->mdev);
+	props->device_cap_flags    = IB_DEVICE_CHANGE_PHY_PORT |
+		IB_DEVICE_PORT_ACTIVE_EVENT		|
+		IB_DEVICE_SYS_IMAGE_GUID		|
+		IB_DEVICE_RC_RNR_NAK_GEN;
+	flags = gen->flags;
+	if (flags & MLX5_DEV_CAP_FLAG_BAD_PKEY_CNTR)
+		props->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR;
+	if (flags & MLX5_DEV_CAP_FLAG_BAD_QKEY_CNTR)
+		props->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR;
+	if (flags & MLX5_DEV_CAP_FLAG_APM)
+		props->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG;
+	props->device_cap_flags |= IB_DEVICE_LOCAL_DMA_LKEY;
+	if (flags & MLX5_DEV_CAP_FLAG_XRC)
+		props->device_cap_flags |= IB_DEVICE_XRC;
+	props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS;
+	if (flags & MLX5_DEV_CAP_FLAG_SIG_HAND_OVER) {
+		props->device_cap_flags |= IB_DEVICE_SIGNATURE_HANDOVER;
+		/* At this stage no support for signature handover */
+		props->sig_prot_cap = IB_PROT_T10DIF_TYPE_1 |
+				      IB_PROT_T10DIF_TYPE_2 |
+				      IB_PROT_T10DIF_TYPE_3;
+		props->sig_guard_cap = IB_GUARD_T10DIF_CRC |
+				       IB_GUARD_T10DIF_CSUM;
+	}
+	if (flags & MLX5_DEV_CAP_FLAG_BLOCK_MCAST)
+		props->device_cap_flags |= IB_DEVICE_BLOCK_MULTICAST_LOOPBACK;
+
+	props->vendor_id	   = be32_to_cpup((__be32 *)(out_mad->data + 36)) &
+		0xffffff;
+	props->vendor_part_id	   = be16_to_cpup((__be16 *)(out_mad->data + 30));
+	props->hw_ver		   = be32_to_cpup((__be32 *)(out_mad->data + 32));
+	memcpy(&props->sys_image_guid, out_mad->data +	4, 8);
+
+	props->max_mr_size	   = ~0ull;
+	props->page_size_cap	   = gen->min_page_sz;
+	props->max_qp		   = 1 << gen->log_max_qp;
+	props->max_qp_wr	   = gen->max_wqes;
+	max_rq_sg = gen->max_rq_desc_sz / sizeof(struct mlx5_wqe_data_seg);
+	max_sq_sg = (gen->max_sq_desc_sz - sizeof(struct mlx5_wqe_ctrl_seg)) /
+		sizeof(struct mlx5_wqe_data_seg);
+	props->max_sge = min(max_rq_sg, max_sq_sg);
+	props->max_cq		   = 1 << gen->log_max_cq;
+	props->max_cqe		   = gen->max_cqes - 1;
+	props->max_mr		   = 1 << gen->log_max_mkey;
+	props->max_pd		   = 1 << gen->log_max_pd;
+	props->max_qp_rd_atom	   = 1 << gen->log_max_ra_req_qp;
+	props->max_qp_init_rd_atom = 1 << gen->log_max_ra_res_qp;
+	props->max_srq		   = 1 << gen->log_max_srq;
+	props->max_srq_wr	   = gen->max_srq_wqes - 1;
+	props->local_ca_ack_delay  = gen->local_ca_ack_delay;
+	props->max_res_rd_atom	   = props->max_qp_rd_atom * props->max_qp;
+	props->max_srq_sge	   = max_rq_sg - 1;
+	props->max_fast_reg_page_list_len = (unsigned int)-1;
+	props->local_ca_ack_delay  = gen->local_ca_ack_delay;
+	props->atomic_cap	   = IB_ATOMIC_NONE;
+	props->masked_atomic_cap   = IB_ATOMIC_NONE;
+	props->max_pkeys	   = be16_to_cpup((__be16 *)(out_mad->data + 28));
+	props->max_mcast_grp	   = 1 << gen->log_max_mcg;
+	props->max_mcast_qp_attach = gen->max_qp_mcg;
+	props->max_total_mcast_qp_attach = props->max_mcast_qp_attach *
+					   props->max_mcast_grp;
+	props->max_map_per_fmr = INT_MAX; /* no limit in ConnectIB */
+
+out:
+	kfree(in_mad);
+	kfree(out_mad);
+
+	return err;
+}
+
+int mlx5_ib_query_port(struct ib_device *ibdev, u8 port,
+		       struct ib_port_attr *props)
+{
+	struct mlx5_ib_dev *dev = to_mdev(ibdev);
+	struct ib_smp *in_mad  = NULL;
+	struct ib_smp *out_mad = NULL;
+	struct mlx5_general_caps *gen;
+	int ext_active_speed;
+	int err = -ENOMEM;
+
+	gen = &dev->mdev->caps.gen;
+	if (port < 1 || port > gen->num_ports) {
+		mlx5_ib_warn(dev, "invalid port number %d\n", port);
+		return -EINVAL;
+	}
+
+	in_mad  = kzalloc(sizeof(*in_mad), GFP_KERNEL);
+	out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL);
+	if (!in_mad || !out_mad)
+		goto out;
+
+	memset(props, 0, sizeof(*props));
+
+	init_query_mad(in_mad);
+	in_mad->attr_id  = IB_SMP_ATTR_PORT_INFO;
+	in_mad->attr_mod = cpu_to_be32(port);
+
+	err = mlx5_MAD_IFC(dev, 1, 1, port, NULL, NULL, in_mad, out_mad);
+	if (err) {
+		mlx5_ib_warn(dev, "err %d\n", err);
+		goto out;
+	}
+
+
+	props->lid		= be16_to_cpup((__be16 *)(out_mad->data + 16));
+	props->lmc		= out_mad->data[34] & 0x7;
+	props->sm_lid		= be16_to_cpup((__be16 *)(out_mad->data + 18));
+	props->sm_sl		= out_mad->data[36] & 0xf;
+	props->state		= out_mad->data[32] & 0xf;
+	props->phys_state	= out_mad->data[33] >> 4;
+	props->port_cap_flags	= be32_to_cpup((__be32 *)(out_mad->data + 20));
+	props->gid_tbl_len	= out_mad->data[50];
+	props->max_msg_sz	= 1 << gen->log_max_msg;
+	props->pkey_tbl_len	= gen->port[port - 1].pkey_table_len;
+	props->bad_pkey_cntr	= be16_to_cpup((__be16 *)(out_mad->data + 46));
+	props->qkey_viol_cntr	= be16_to_cpup((__be16 *)(out_mad->data + 48));
+	props->active_width	= out_mad->data[31] & 0xf;
+	props->active_speed	= out_mad->data[35] >> 4;
+	props->max_mtu		= out_mad->data[41] & 0xf;
+	props->active_mtu	= out_mad->data[36] >> 4;
+	props->subnet_timeout	= out_mad->data[51] & 0x1f;
+	props->max_vl_num	= out_mad->data[37] >> 4;
+	props->init_type_reply	= out_mad->data[41] >> 4;
+
+	/* Check if extended speeds (EDR/FDR/...) are supported */
+	if (props->port_cap_flags & IB_PORT_EXTENDED_SPEEDS_SUP) {
+		ext_active_speed = out_mad->data[62] >> 4;
+
+		switch (ext_active_speed) {
+		case 1:
+			props->active_speed = 16; /* FDR */
+			break;
+		case 2:
+			props->active_speed = 32; /* EDR */
+			break;
+		}
+	}
+
+	/* If reported active speed is QDR, check if is FDR-10 */
+	if (props->active_speed == 4) {
+		if (gen->ext_port_cap[port - 1] &
+		    MLX_EXT_PORT_CAP_FLAG_EXTENDED_PORT_INFO) {
+			init_query_mad(in_mad);
+			in_mad->attr_id = MLX5_ATTR_EXTENDED_PORT_INFO;
+			in_mad->attr_mod = cpu_to_be32(port);
+
+			err = mlx5_MAD_IFC(dev, 1, 1, port,
+					   NULL, NULL, in_mad, out_mad);
+			if (err)
+				goto out;
+
+			/* Checking LinkSpeedActive for FDR-10 */
+			if (out_mad->data[15] & 0x1)
+				props->active_speed = 8;
+		}
+	}
+
+out:
+	kfree(in_mad);
+	kfree(out_mad);
+
+	return err;
+}
+
+static int mlx5_ib_query_gid(struct ib_device *ibdev, u8 port, int index,
+			     union ib_gid *gid)
+{
+	struct ib_smp *in_mad  = NULL;
+	struct ib_smp *out_mad = NULL;
+	int err = -ENOMEM;
+
+	in_mad  = kzalloc(sizeof(*in_mad), GFP_KERNEL);
+	out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL);
+	if (!in_mad || !out_mad)
+		goto out;
+
+	init_query_mad(in_mad);
+	in_mad->attr_id  = IB_SMP_ATTR_PORT_INFO;
+	in_mad->attr_mod = cpu_to_be32(port);
+
+	err = mlx5_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, out_mad);
+	if (err)
+		goto out;
+
+	memcpy(gid->raw, out_mad->data + 8, 8);
+
+	init_query_mad(in_mad);
+	in_mad->attr_id  = IB_SMP_ATTR_GUID_INFO;
+	in_mad->attr_mod = cpu_to_be32(index / 8);
+
+	err = mlx5_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, out_mad);
+	if (err)
+		goto out;
+
+	memcpy(gid->raw + 8, out_mad->data + (index % 8) * 8, 8);
+
+out:
+	kfree(in_mad);
+	kfree(out_mad);
+	return err;
+}
+
+static int mlx5_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
+			      u16 *pkey)
+{
+	struct ib_smp *in_mad  = NULL;
+	struct ib_smp *out_mad = NULL;
+	int err = -ENOMEM;
+
+	in_mad  = kzalloc(sizeof(*in_mad), GFP_KERNEL);
+	out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL);
+	if (!in_mad || !out_mad)
+		goto out;
+
+	init_query_mad(in_mad);
+	in_mad->attr_id  = IB_SMP_ATTR_PKEY_TABLE;
+	in_mad->attr_mod = cpu_to_be32(index / 32);
+
+	err = mlx5_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, out_mad);
+	if (err)
+		goto out;
+
+	*pkey = be16_to_cpu(((__be16 *)out_mad->data)[index % 32]);
+
+out:
+	kfree(in_mad);
+	kfree(out_mad);
+	return err;
+}
+
+struct mlx5_reg_node_desc {
+	u8	desc[64];
+};
+
+static int mlx5_ib_modify_device(struct ib_device *ibdev, int mask,
+				 struct ib_device_modify *props)
+{
+	struct mlx5_ib_dev *dev = to_mdev(ibdev);
+	struct mlx5_reg_node_desc in;
+	struct mlx5_reg_node_desc out;
+	int err;
+
+	if (mask & ~IB_DEVICE_MODIFY_NODE_DESC)
+		return -EOPNOTSUPP;
+
+	if (!(mask & IB_DEVICE_MODIFY_NODE_DESC))
+		return 0;
+
+	/*
+	 * If possible, pass node desc to FW, so it can generate
+	 * a 144 trap.  If cmd fails, just ignore.
+	 */
+	memcpy(&in, props->node_desc, 64);
+	err = mlx5_core_access_reg(dev->mdev, &in, sizeof(in), &out,
+				   sizeof(out), MLX5_REG_NODE_DESC, 0, 1);
+	if (err)
+		return err;
+
+	memcpy(ibdev->node_desc, props->node_desc, 64);
+
+	return err;
+}
+
+static int mlx5_ib_modify_port(struct ib_device *ibdev, u8 port, int mask,
+			       struct ib_port_modify *props)
+{
+	struct mlx5_ib_dev *dev = to_mdev(ibdev);
+	struct ib_port_attr attr;
+	u32 tmp;
+	int err;
+
+	mutex_lock(&dev->cap_mask_mutex);
+
+	err = mlx5_ib_query_port(ibdev, port, &attr);
+	if (err)
+		goto out;
+
+	tmp = (attr.port_cap_flags | props->set_port_cap_mask) &
+		~props->clr_port_cap_mask;
+
+	err = mlx5_set_port_caps(dev->mdev, port, tmp);
+
+out:
+	mutex_unlock(&dev->cap_mask_mutex);
+	return err;
+}
+
+static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
+						  struct ib_udata *udata)
+{
+	struct mlx5_ib_dev *dev = to_mdev(ibdev);
+	struct mlx5_ib_alloc_ucontext_req_v2 req;
+	struct mlx5_ib_alloc_ucontext_resp resp;
+	struct mlx5_ib_ucontext *context;
+	struct mlx5_general_caps *gen;
+	struct mlx5_uuar_info *uuari;
+	struct mlx5_uar *uars;
+	int gross_uuars;
+	int num_uars;
+	int ver;
+	int uuarn;
+	int err;
+	int i;
+	size_t reqlen;
+
+	gen = &dev->mdev->caps.gen;
+	if (!dev->ib_active)
+		return ERR_PTR(-EAGAIN);
+
+	memset(&req, 0, sizeof(req));
+	reqlen = udata->inlen - sizeof(struct ib_uverbs_cmd_hdr);
+	if (reqlen == sizeof(struct mlx5_ib_alloc_ucontext_req))
+		ver = 0;
+	else if (reqlen == sizeof(struct mlx5_ib_alloc_ucontext_req_v2))
+		ver = 2;
+	else
+		return ERR_PTR(-EINVAL);
+
+	err = ib_copy_from_udata(&req, udata, reqlen);
+	if (err)
+		return ERR_PTR(err);
+
+	if (req.flags || req.reserved)
+		return ERR_PTR(-EINVAL);
+
+	if (req.total_num_uuars > MLX5_MAX_UUARS)
+		return ERR_PTR(-ENOMEM);
+
+	if (req.total_num_uuars == 0)
+		return ERR_PTR(-EINVAL);
+
+	req.total_num_uuars = ALIGN(req.total_num_uuars,
+				    MLX5_NON_FP_BF_REGS_PER_PAGE);
+	if (req.num_low_latency_uuars > req.total_num_uuars - 1)
+		return ERR_PTR(-EINVAL);
+
+	num_uars = req.total_num_uuars / MLX5_NON_FP_BF_REGS_PER_PAGE;
+	gross_uuars = num_uars * MLX5_BF_REGS_PER_PAGE;
+	resp.qp_tab_size      = 1 << gen->log_max_qp;
+	resp.bf_reg_size      = gen->bf_reg_size;
+	resp.cache_line_size  = L1_CACHE_BYTES;
+	resp.max_sq_desc_sz = gen->max_sq_desc_sz;
+	resp.max_rq_desc_sz = gen->max_rq_desc_sz;
+	resp.max_send_wqebb = gen->max_wqes;
+	resp.max_recv_wr = gen->max_wqes;
+	resp.max_srq_recv_wr = gen->max_srq_wqes;
+
+	context = kzalloc(sizeof(*context), GFP_KERNEL);
+	if (!context)
+		return ERR_PTR(-ENOMEM);
+
+	uuari = &context->uuari;
+	mutex_init(&uuari->lock);
+	uars = kcalloc(num_uars, sizeof(*uars), GFP_KERNEL);
+	if (!uars) {
+		err = -ENOMEM;
+		goto out_ctx;
+	}
+
+	uuari->bitmap = kcalloc(BITS_TO_LONGS(gross_uuars),
+				sizeof(*uuari->bitmap),
+				GFP_KERNEL);
+	if (!uuari->bitmap) {
+		err = -ENOMEM;
+		goto out_uar_ctx;
+	}
+	/*
+	 * clear all fast path uuars
+	 */
+	for (i = 0; i < gross_uuars; i++) {
+		uuarn = i & 3;
+		if (uuarn == 2 || uuarn == 3)
+			set_bit(i, uuari->bitmap);
+	}
+
+	uuari->count = kcalloc(gross_uuars, sizeof(*uuari->count), GFP_KERNEL);
+	if (!uuari->count) {
+		err = -ENOMEM;
+		goto out_bitmap;
+	}
+
+	for (i = 0; i < num_uars; i++) {
+		err = mlx5_cmd_alloc_uar(dev->mdev, &uars[i].index);
+		if (err)
+			goto out_count;
+	}
+
+	INIT_LIST_HEAD(&context->db_page_list);
+	mutex_init(&context->db_page_mutex);
+
+	resp.tot_uuars = req.total_num_uuars;
+	resp.num_ports = gen->num_ports;
+	err = ib_copy_to_udata(udata, &resp,
+			       sizeof(resp) - sizeof(resp.reserved));
+	if (err)
+		goto out_uars;
+
+	uuari->ver = ver;
+	uuari->num_low_latency_uuars = req.num_low_latency_uuars;
+	uuari->uars = uars;
+	uuari->num_uars = num_uars;
+	return &context->ibucontext;
+
+out_uars:
+	for (i--; i >= 0; i--)
+		mlx5_cmd_free_uar(dev->mdev, uars[i].index);
+out_count:
+	kfree(uuari->count);
+
+out_bitmap:
+	kfree(uuari->bitmap);
+
+out_uar_ctx:
+	kfree(uars);
+
+out_ctx:
+	kfree(context);
+	return ERR_PTR(err);
+}
+
+static int mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
+{
+	struct mlx5_ib_ucontext *context = to_mucontext(ibcontext);
+	struct mlx5_ib_dev *dev = to_mdev(ibcontext->device);
+	struct mlx5_uuar_info *uuari = &context->uuari;
+	int i;
+
+	for (i = 0; i < uuari->num_uars; i++) {
+		if (mlx5_cmd_free_uar(dev->mdev, uuari->uars[i].index))
+			mlx5_ib_warn(dev, "failed to free UAR 0x%x\n", uuari->uars[i].index);
+	}
+
+	kfree(uuari->count);
+	kfree(uuari->bitmap);
+	kfree(uuari->uars);
+	kfree(context);
+
+	return 0;
+}
+
+static phys_addr_t uar_index2pfn(struct mlx5_ib_dev *dev, int index)
+{
+	return (pci_resource_start(dev->mdev->pdev, 0) >> PAGE_SHIFT) + index;
+}
+
+static int get_command(unsigned long offset)
+{
+	return (offset >> MLX5_IB_MMAP_CMD_SHIFT) & MLX5_IB_MMAP_CMD_MASK;
+}
+
+static int get_arg(unsigned long offset)
+{
+	return offset & ((1 << MLX5_IB_MMAP_CMD_SHIFT) - 1);
+}
+
+static int get_index(unsigned long offset)
+{
+	return get_arg(offset);
+}
+
+static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma)
+{
+	struct mlx5_ib_ucontext *context = to_mucontext(ibcontext);
+	struct mlx5_ib_dev *dev = to_mdev(ibcontext->device);
+	struct mlx5_uuar_info *uuari = &context->uuari;
+	unsigned long command;
+	unsigned long idx;
+	phys_addr_t pfn;
+
+	command = get_command(vma->vm_pgoff);
+	switch (command) {
+	case MLX5_IB_MMAP_REGULAR_PAGE:
+		if (vma->vm_end - vma->vm_start != PAGE_SIZE)
+			return -EINVAL;
+
+		idx = get_index(vma->vm_pgoff);
+		if (idx >= uuari->num_uars)
+			return -EINVAL;
+
+		pfn = uar_index2pfn(dev, uuari->uars[idx].index);
+		mlx5_ib_dbg(dev, "uar idx 0x%lx, pfn 0x%llx\n", idx,
+			    (unsigned long long)pfn);
+
+		vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
+		if (io_remap_pfn_range(vma, vma->vm_start, pfn,
+				       PAGE_SIZE, vma->vm_page_prot))
+			return -EAGAIN;
+
+		mlx5_ib_dbg(dev, "mapped WC at 0x%lx, PA 0x%llx\n",
+			    vma->vm_start,
+			    (unsigned long long)pfn << PAGE_SHIFT);
+		break;
+
+	case MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES:
+		return -ENOSYS;
+
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int alloc_pa_mkey(struct mlx5_ib_dev *dev, u32 *key, u32 pdn)
+{
+	struct mlx5_create_mkey_mbox_in *in;
+	struct mlx5_mkey_seg *seg;
+	struct mlx5_core_mr mr;
+	int err;
+
+	in = kzalloc(sizeof(*in), GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	seg = &in->seg;
+	seg->flags = MLX5_PERM_LOCAL_READ | MLX5_ACCESS_MODE_PA;
+	seg->flags_pd = cpu_to_be32(pdn | MLX5_MKEY_LEN64);
+	seg->qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
+	seg->start_addr = 0;
+
+	err = mlx5_core_create_mkey(dev->mdev, &mr, in, sizeof(*in),
+				    NULL, NULL, NULL);
+	if (err) {
+		mlx5_ib_warn(dev, "failed to create mkey, %d\n", err);
+		goto err_in;
+	}
+
+	kfree(in);
+	*key = mr.key;
+
+	return 0;
+
+err_in:
+	kfree(in);
+
+	return err;
+}
+
+static void free_pa_mkey(struct mlx5_ib_dev *dev, u32 key)
+{
+	struct mlx5_core_mr mr;
+	int err;
+
+	memset(&mr, 0, sizeof(mr));
+	mr.key = key;
+	err = mlx5_core_destroy_mkey(dev->mdev, &mr);
+	if (err)
+		mlx5_ib_warn(dev, "failed to destroy mkey 0x%x\n", key);
+}
+
+static struct ib_pd *mlx5_ib_alloc_pd(struct ib_device *ibdev,
+				      struct ib_ucontext *context,
+				      struct ib_udata *udata)
+{
+	struct mlx5_ib_alloc_pd_resp resp;
+	struct mlx5_ib_pd *pd;
+	int err;
+
+	pd = kmalloc(sizeof(*pd), GFP_KERNEL);
+	if (!pd)
+		return ERR_PTR(-ENOMEM);
+
+	err = mlx5_core_alloc_pd(to_mdev(ibdev)->mdev, &pd->pdn);
+	if (err) {
+		kfree(pd);
+		return ERR_PTR(err);
+	}
+
+	if (context) {
+		resp.pdn = pd->pdn;
+		if (ib_copy_to_udata(udata, &resp, sizeof(resp))) {
+			mlx5_core_dealloc_pd(to_mdev(ibdev)->mdev, pd->pdn);
+			kfree(pd);
+			return ERR_PTR(-EFAULT);
+		}
+	} else {
+		err = alloc_pa_mkey(to_mdev(ibdev), &pd->pa_lkey, pd->pdn);
+		if (err) {
+			mlx5_core_dealloc_pd(to_mdev(ibdev)->mdev, pd->pdn);
+			kfree(pd);
+			return ERR_PTR(err);
+		}
+	}
+
+	return &pd->ibpd;
+}
+
+static int mlx5_ib_dealloc_pd(struct ib_pd *pd)
+{
+	struct mlx5_ib_dev *mdev = to_mdev(pd->device);
+	struct mlx5_ib_pd *mpd = to_mpd(pd);
+
+	if (!pd->uobject)
+		free_pa_mkey(mdev, mpd->pa_lkey);
+
+	mlx5_core_dealloc_pd(mdev->mdev, mpd->pdn);
+	kfree(mpd);
+
+	return 0;
+}
+
+static int mlx5_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+	struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
+	int err;
+
+	err = mlx5_core_attach_mcg(dev->mdev, gid, ibqp->qp_num);
+	if (err)
+		mlx5_ib_warn(dev, "failed attaching QPN 0x%x, MGID %pI6\n",
+			     ibqp->qp_num, gid->raw);
+
+	return err;
+}
+
+static int mlx5_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+	struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
+	int err;
+
+	err = mlx5_core_detach_mcg(dev->mdev, gid, ibqp->qp_num);
+	if (err)
+		mlx5_ib_warn(dev, "failed detaching QPN 0x%x, MGID %pI6\n",
+			     ibqp->qp_num, gid->raw);
+
+	return err;
+}
+
+static int init_node_data(struct mlx5_ib_dev *dev)
+{
+	struct ib_smp *in_mad  = NULL;
+	struct ib_smp *out_mad = NULL;
+	int err = -ENOMEM;
+
+	in_mad  = kzalloc(sizeof(*in_mad), GFP_KERNEL);
+	out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL);
+	if (!in_mad || !out_mad)
+		goto out;
+
+	init_query_mad(in_mad);
+	in_mad->attr_id = IB_SMP_ATTR_NODE_DESC;
+
+	err = mlx5_MAD_IFC(dev, 1, 1, 1, NULL, NULL, in_mad, out_mad);
+	if (err)
+		goto out;
+
+	memcpy(dev->ib_dev.node_desc, out_mad->data, 64);
+
+	in_mad->attr_id = IB_SMP_ATTR_NODE_INFO;
+
+	err = mlx5_MAD_IFC(dev, 1, 1, 1, NULL, NULL, in_mad, out_mad);
+	if (err)
+		goto out;
+
+	dev->mdev->rev_id = be32_to_cpup((__be32 *)(out_mad->data + 32));
+	memcpy(&dev->ib_dev.node_guid, out_mad->data + 12, 8);
+
+out:
+	kfree(in_mad);
+	kfree(out_mad);
+	return err;
+}
+
+static ssize_t show_fw_pages(struct device *device, struct device_attribute *attr,
+			     char *buf)
+{
+	struct mlx5_ib_dev *dev =
+		container_of(device, struct mlx5_ib_dev, ib_dev.dev);
+
+	return sprintf(buf, "%d\n", dev->mdev->priv.fw_pages);
+}
+
+static ssize_t show_reg_pages(struct device *device,
+			      struct device_attribute *attr, char *buf)
+{
+	struct mlx5_ib_dev *dev =
+		container_of(device, struct mlx5_ib_dev, ib_dev.dev);
+
+	return sprintf(buf, "%d\n", dev->mdev->priv.reg_pages);
+}
+
+static ssize_t show_hca(struct device *device, struct device_attribute *attr,
+			char *buf)
+{
+	struct mlx5_ib_dev *dev =
+		container_of(device, struct mlx5_ib_dev, ib_dev.dev);
+	return sprintf(buf, "MT%d\n", dev->mdev->pdev->device);
+}
+
+static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr,
+			   char *buf)
+{
+	struct mlx5_ib_dev *dev =
+		container_of(device, struct mlx5_ib_dev, ib_dev.dev);
+	return sprintf(buf, "%d.%d.%d\n", fw_rev_maj(dev->mdev),
+		       fw_rev_min(dev->mdev), fw_rev_sub(dev->mdev));
+}
+
+static ssize_t show_rev(struct device *device, struct device_attribute *attr,
+			char *buf)
+{
+	struct mlx5_ib_dev *dev =
+		container_of(device, struct mlx5_ib_dev, ib_dev.dev);
+	return sprintf(buf, "%x\n", dev->mdev->rev_id);
+}
+
+static ssize_t show_board(struct device *device, struct device_attribute *attr,
+			  char *buf)
+{
+	struct mlx5_ib_dev *dev =
+		container_of(device, struct mlx5_ib_dev, ib_dev.dev);
+	return sprintf(buf, "%.*s\n", MLX5_BOARD_ID_LEN,
+		       dev->mdev->board_id);
+}
+
+static DEVICE_ATTR(hw_rev,   S_IRUGO, show_rev,    NULL);
+static DEVICE_ATTR(fw_ver,   S_IRUGO, show_fw_ver, NULL);
+static DEVICE_ATTR(hca_type, S_IRUGO, show_hca,    NULL);
+static DEVICE_ATTR(board_id, S_IRUGO, show_board,  NULL);
+static DEVICE_ATTR(fw_pages, S_IRUGO, show_fw_pages, NULL);
+static DEVICE_ATTR(reg_pages, S_IRUGO, show_reg_pages, NULL);
+
+static struct device_attribute *mlx5_class_attributes[] = {
+	&dev_attr_hw_rev,
+	&dev_attr_fw_ver,
+	&dev_attr_hca_type,
+	&dev_attr_board_id,
+	&dev_attr_fw_pages,
+	&dev_attr_reg_pages,
+};
+
+static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context,
+			  enum mlx5_dev_event event, unsigned long param)
+{
+	struct mlx5_ib_dev *ibdev = (struct mlx5_ib_dev *)context;
+	struct ib_event ibev;
+
+	u8 port = 0;
+
+	switch (event) {
+	case MLX5_DEV_EVENT_SYS_ERROR:
+		ibdev->ib_active = false;
+		ibev.event = IB_EVENT_DEVICE_FATAL;
+		break;
+
+	case MLX5_DEV_EVENT_PORT_UP:
+		ibev.event = IB_EVENT_PORT_ACTIVE;
+		port = (u8)param;
+		break;
+
+	case MLX5_DEV_EVENT_PORT_DOWN:
+		ibev.event = IB_EVENT_PORT_ERR;
+		port = (u8)param;
+		break;
+
+	case MLX5_DEV_EVENT_PORT_INITIALIZED:
+		/* not used by ULPs */
+		return;
+
+	case MLX5_DEV_EVENT_LID_CHANGE:
+		ibev.event = IB_EVENT_LID_CHANGE;
+		port = (u8)param;
+		break;
+
+	case MLX5_DEV_EVENT_PKEY_CHANGE:
+		ibev.event = IB_EVENT_PKEY_CHANGE;
+		port = (u8)param;
+		break;
+
+	case MLX5_DEV_EVENT_GUID_CHANGE:
+		ibev.event = IB_EVENT_GID_CHANGE;
+		port = (u8)param;
+		break;
+
+	case MLX5_DEV_EVENT_CLIENT_REREG:
+		ibev.event = IB_EVENT_CLIENT_REREGISTER;
+		port = (u8)param;
+		break;
+	}
+
+	ibev.device	      = &ibdev->ib_dev;
+	ibev.element.port_num = port;
+
+	if (port < 1 || port > ibdev->num_ports) {
+		mlx5_ib_warn(ibdev, "warning: event on port %d\n", port);
+		return;
+	}
+
+	if (ibdev->ib_active)
+		ib_dispatch_event(&ibev);
+}
+
+static void get_ext_port_caps(struct mlx5_ib_dev *dev)
+{
+	struct mlx5_general_caps *gen;
+	int port;
+
+	gen = &dev->mdev->caps.gen;
+	for (port = 1; port <= gen->num_ports; port++)
+		mlx5_query_ext_port_caps(dev, port);
+}
+
+static int get_port_caps(struct mlx5_ib_dev *dev)
+{
+	struct ib_device_attr *dprops = NULL;
+	struct ib_port_attr *pprops = NULL;
+	struct mlx5_general_caps *gen;
+	int err = 0;
+	int port;
+
+	gen = &dev->mdev->caps.gen;
+	pprops = kmalloc(sizeof(*pprops), GFP_KERNEL);
+	if (!pprops)
+		goto out;
+
+	dprops = kmalloc(sizeof(*dprops), GFP_KERNEL);
+	if (!dprops)
+		goto out;
+
+	err = mlx5_ib_query_device(&dev->ib_dev, dprops);
+	if (err) {
+		mlx5_ib_warn(dev, "query_device failed %d\n", err);
+		goto out;
+	}
+
+	for (port = 1; port <= gen->num_ports; port++) {
+		err = mlx5_ib_query_port(&dev->ib_dev, port, pprops);
+		if (err) {
+			mlx5_ib_warn(dev, "query_port %d failed %d\n", port, err);
+			break;
+		}
+		gen->port[port - 1].pkey_table_len = dprops->max_pkeys;
+		gen->port[port - 1].gid_table_len = pprops->gid_tbl_len;
+		mlx5_ib_dbg(dev, "pkey_table_len %d, gid_table_len %d\n",
+			    dprops->max_pkeys, pprops->gid_tbl_len);
+	}
+
+out:
+	kfree(pprops);
+	kfree(dprops);
+
+	return err;
+}
+
+static void destroy_umrc_res(struct mlx5_ib_dev *dev)
+{
+	int err;
+
+	err = mlx5_mr_cache_cleanup(dev);
+	if (err)
+		mlx5_ib_warn(dev, "mr cache cleanup failed\n");
+
+	mlx5_ib_destroy_qp(dev->umrc.qp);
+	ib_destroy_cq(dev->umrc.cq);
+	ib_dereg_mr(dev->umrc.mr);
+	ib_dealloc_pd(dev->umrc.pd);
+}
+
+enum {
+	MAX_UMR_WR = 128,
+};
+
+static int create_umr_res(struct mlx5_ib_dev *dev)
+{
+	struct ib_qp_init_attr *init_attr = NULL;
+	struct ib_qp_attr *attr = NULL;
+	struct ib_pd *pd;
+	struct ib_cq *cq;
+	struct ib_qp *qp;
+	struct ib_mr *mr;
+	int ret;
+
+	attr = kzalloc(sizeof(*attr), GFP_KERNEL);
+	init_attr = kzalloc(sizeof(*init_attr), GFP_KERNEL);
+	if (!attr || !init_attr) {
+		ret = -ENOMEM;
+		goto error_0;
+	}
+
+	pd = ib_alloc_pd(&dev->ib_dev);
+	if (IS_ERR(pd)) {
+		mlx5_ib_dbg(dev, "Couldn't create PD for sync UMR QP\n");
+		ret = PTR_ERR(pd);
+		goto error_0;
+	}
+
+	mr = ib_get_dma_mr(pd,  IB_ACCESS_LOCAL_WRITE);
+	if (IS_ERR(mr)) {
+		mlx5_ib_dbg(dev, "Couldn't create DMA MR for sync UMR QP\n");
+		ret = PTR_ERR(mr);
+		goto error_1;
+	}
+
+	cq = ib_create_cq(&dev->ib_dev, mlx5_umr_cq_handler, NULL, NULL, 128,
+			  0);
+	if (IS_ERR(cq)) {
+		mlx5_ib_dbg(dev, "Couldn't create CQ for sync UMR QP\n");
+		ret = PTR_ERR(cq);
+		goto error_2;
+	}
+	ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+
+	init_attr->send_cq = cq;
+	init_attr->recv_cq = cq;
+	init_attr->sq_sig_type = IB_SIGNAL_ALL_WR;
+	init_attr->cap.max_send_wr = MAX_UMR_WR;
+	init_attr->cap.max_send_sge = 1;
+	init_attr->qp_type = MLX5_IB_QPT_REG_UMR;
+	init_attr->port_num = 1;
+	qp = mlx5_ib_create_qp(pd, init_attr, NULL);
+	if (IS_ERR(qp)) {
+		mlx5_ib_dbg(dev, "Couldn't create sync UMR QP\n");
+		ret = PTR_ERR(qp);
+		goto error_3;
+	}
+	qp->device     = &dev->ib_dev;
+	qp->real_qp    = qp;
+	qp->uobject    = NULL;
+	qp->qp_type    = MLX5_IB_QPT_REG_UMR;
+
+	attr->qp_state = IB_QPS_INIT;
+	attr->port_num = 1;
+	ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE | IB_QP_PKEY_INDEX |
+				IB_QP_PORT, NULL);
+	if (ret) {
+		mlx5_ib_dbg(dev, "Couldn't modify UMR QP\n");
+		goto error_4;
+	}
+
+	memset(attr, 0, sizeof(*attr));
+	attr->qp_state = IB_QPS_RTR;
+	attr->path_mtu = IB_MTU_256;
+
+	ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE, NULL);
+	if (ret) {
+		mlx5_ib_dbg(dev, "Couldn't modify umr QP to rtr\n");
+		goto error_4;
+	}
+
+	memset(attr, 0, sizeof(*attr));
+	attr->qp_state = IB_QPS_RTS;
+	ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE, NULL);
+	if (ret) {
+		mlx5_ib_dbg(dev, "Couldn't modify umr QP to rts\n");
+		goto error_4;
+	}
+
+	dev->umrc.qp = qp;
+	dev->umrc.cq = cq;
+	dev->umrc.mr = mr;
+	dev->umrc.pd = pd;
+
+	sema_init(&dev->umrc.sem, MAX_UMR_WR);
+	ret = mlx5_mr_cache_init(dev);
+	if (ret) {
+		mlx5_ib_warn(dev, "mr cache init failed %d\n", ret);
+		goto error_4;
+	}
+
+	kfree(attr);
+	kfree(init_attr);
+
+	return 0;
+
+error_4:
+	mlx5_ib_destroy_qp(qp);
+
+error_3:
+	ib_destroy_cq(cq);
+
+error_2:
+	ib_dereg_mr(mr);
+
+error_1:
+	ib_dealloc_pd(pd);
+
+error_0:
+	kfree(attr);
+	kfree(init_attr);
+	return ret;
+}
+
+static int create_dev_resources(struct mlx5_ib_resources *devr)
+{
+	struct ib_srq_init_attr attr;
+	struct mlx5_ib_dev *dev;
+	int ret = 0;
+
+	dev = container_of(devr, struct mlx5_ib_dev, devr);
+
+	devr->p0 = mlx5_ib_alloc_pd(&dev->ib_dev, NULL, NULL);
+	if (IS_ERR(devr->p0)) {
+		ret = PTR_ERR(devr->p0);
+		goto error0;
+	}
+	devr->p0->device  = &dev->ib_dev;
+	devr->p0->uobject = NULL;
+	atomic_set(&devr->p0->usecnt, 0);
+
+	devr->c0 = mlx5_ib_create_cq(&dev->ib_dev, 1, 0, NULL, NULL);
+	if (IS_ERR(devr->c0)) {
+		ret = PTR_ERR(devr->c0);
+		goto error1;
+	}
+	devr->c0->device        = &dev->ib_dev;
+	devr->c0->uobject       = NULL;
+	devr->c0->comp_handler  = NULL;
+	devr->c0->event_handler = NULL;
+	devr->c0->cq_context    = NULL;
+	atomic_set(&devr->c0->usecnt, 0);
+
+	devr->x0 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL, NULL);
+	if (IS_ERR(devr->x0)) {
+		ret = PTR_ERR(devr->x0);
+		goto error2;
+	}
+	devr->x0->device = &dev->ib_dev;
+	devr->x0->inode = NULL;
+	atomic_set(&devr->x0->usecnt, 0);
+	mutex_init(&devr->x0->tgt_qp_mutex);
+	INIT_LIST_HEAD(&devr->x0->tgt_qp_list);
+
+	devr->x1 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL, NULL);
+	if (IS_ERR(devr->x1)) {
+		ret = PTR_ERR(devr->x1);
+		goto error3;
+	}
+	devr->x1->device = &dev->ib_dev;
+	devr->x1->inode = NULL;
+	atomic_set(&devr->x1->usecnt, 0);
+	mutex_init(&devr->x1->tgt_qp_mutex);
+	INIT_LIST_HEAD(&devr->x1->tgt_qp_list);
+
+	memset(&attr, 0, sizeof(attr));
+	attr.attr.max_sge = 1;
+	attr.attr.max_wr = 1;
+	attr.srq_type = IB_SRQT_XRC;
+	attr.ext.xrc.cq = devr->c0;
+	attr.ext.xrc.xrcd = devr->x0;
+
+	devr->s0 = mlx5_ib_create_srq(devr->p0, &attr, NULL);
+	if (IS_ERR(devr->s0)) {
+		ret = PTR_ERR(devr->s0);
+		goto error4;
+	}
+	devr->s0->device	= &dev->ib_dev;
+	devr->s0->pd		= devr->p0;
+	devr->s0->uobject       = NULL;
+	devr->s0->event_handler = NULL;
+	devr->s0->srq_context   = NULL;
+	devr->s0->srq_type      = IB_SRQT_XRC;
+	devr->s0->ext.xrc.xrcd	= devr->x0;
+	devr->s0->ext.xrc.cq	= devr->c0;
+	atomic_inc(&devr->s0->ext.xrc.xrcd->usecnt);
+	atomic_inc(&devr->s0->ext.xrc.cq->usecnt);
+	atomic_inc(&devr->p0->usecnt);
+	atomic_set(&devr->s0->usecnt, 0);
+
+	return 0;
+
+error4:
+	mlx5_ib_dealloc_xrcd(devr->x1);
+error3:
+	mlx5_ib_dealloc_xrcd(devr->x0);
+error2:
+	mlx5_ib_destroy_cq(devr->c0);
+error1:
+	mlx5_ib_dealloc_pd(devr->p0);
+error0:
+	return ret;
+}
+
+static void destroy_dev_resources(struct mlx5_ib_resources *devr)
+{
+	mlx5_ib_destroy_srq(devr->s0);
+	mlx5_ib_dealloc_xrcd(devr->x0);
+	mlx5_ib_dealloc_xrcd(devr->x1);
+	mlx5_ib_destroy_cq(devr->c0);
+	mlx5_ib_dealloc_pd(devr->p0);
+}
+
+static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
+{
+	struct mlx5_ib_dev *dev;
+	int err;
+	int i;
+
+	printk_once(KERN_INFO "%s", mlx5_version);
+
+	dev = (struct mlx5_ib_dev *)ib_alloc_device(sizeof(*dev));
+	if (!dev)
+		return NULL;
+
+	dev->mdev = mdev;
+
+	err = get_port_caps(dev);
+	if (err)
+		goto err_dealloc;
+
+	get_ext_port_caps(dev);
+
+	err = alloc_comp_eqs(dev);
+	if (err)
+		goto err_dealloc;
+
+	MLX5_INIT_DOORBELL_LOCK(&dev->uar_lock);
+
+	strlcpy(dev->ib_dev.name, "mlx5_%d", IB_DEVICE_NAME_MAX);
+	dev->ib_dev.owner		= THIS_MODULE;
+	dev->ib_dev.node_type		= RDMA_NODE_IB_CA;
+	dev->ib_dev.local_dma_lkey	= mdev->caps.gen.reserved_lkey;
+	dev->num_ports		= mdev->caps.gen.num_ports;
+	dev->ib_dev.phys_port_cnt     = dev->num_ports;
+	dev->ib_dev.num_comp_vectors	= dev->num_comp_vectors;
+	dev->ib_dev.dma_device	= &mdev->pdev->dev;
+
+	dev->ib_dev.uverbs_abi_ver	= MLX5_IB_UVERBS_ABI_VERSION;
+	dev->ib_dev.uverbs_cmd_mask	=
+		(1ull << IB_USER_VERBS_CMD_GET_CONTEXT)		|
+		(1ull << IB_USER_VERBS_CMD_QUERY_DEVICE)	|
+		(1ull << IB_USER_VERBS_CMD_QUERY_PORT)		|
+		(1ull << IB_USER_VERBS_CMD_ALLOC_PD)		|
+		(1ull << IB_USER_VERBS_CMD_DEALLOC_PD)		|
+		(1ull << IB_USER_VERBS_CMD_REG_MR)		|
+		(1ull << IB_USER_VERBS_CMD_DEREG_MR)		|
+		(1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL)	|
+		(1ull << IB_USER_VERBS_CMD_CREATE_CQ)		|
+		(1ull << IB_USER_VERBS_CMD_RESIZE_CQ)		|
+		(1ull << IB_USER_VERBS_CMD_DESTROY_CQ)		|
+		(1ull << IB_USER_VERBS_CMD_CREATE_QP)		|
+		(1ull << IB_USER_VERBS_CMD_MODIFY_QP)		|
+		(1ull << IB_USER_VERBS_CMD_QUERY_QP)		|
+		(1ull << IB_USER_VERBS_CMD_DESTROY_QP)		|
+		(1ull << IB_USER_VERBS_CMD_ATTACH_MCAST)	|
+		(1ull << IB_USER_VERBS_CMD_DETACH_MCAST)	|
+		(1ull << IB_USER_VERBS_CMD_CREATE_SRQ)		|
+		(1ull << IB_USER_VERBS_CMD_MODIFY_SRQ)		|
+		(1ull << IB_USER_VERBS_CMD_QUERY_SRQ)		|
+		(1ull << IB_USER_VERBS_CMD_DESTROY_SRQ)		|
+		(1ull << IB_USER_VERBS_CMD_CREATE_XSRQ)		|
+		(1ull << IB_USER_VERBS_CMD_OPEN_QP);
+
+	dev->ib_dev.query_device	= mlx5_ib_query_device;
+	dev->ib_dev.query_port		= mlx5_ib_query_port;
+	dev->ib_dev.query_gid		= mlx5_ib_query_gid;
+	dev->ib_dev.query_pkey		= mlx5_ib_query_pkey;
+	dev->ib_dev.modify_device	= mlx5_ib_modify_device;
+	dev->ib_dev.modify_port		= mlx5_ib_modify_port;
+	dev->ib_dev.alloc_ucontext	= mlx5_ib_alloc_ucontext;
+	dev->ib_dev.dealloc_ucontext	= mlx5_ib_dealloc_ucontext;
+	dev->ib_dev.mmap		= mlx5_ib_mmap;
+	dev->ib_dev.alloc_pd		= mlx5_ib_alloc_pd;
+	dev->ib_dev.dealloc_pd		= mlx5_ib_dealloc_pd;
+	dev->ib_dev.create_ah		= mlx5_ib_create_ah;
+	dev->ib_dev.query_ah		= mlx5_ib_query_ah;
+	dev->ib_dev.destroy_ah		= mlx5_ib_destroy_ah;
+	dev->ib_dev.create_srq		= mlx5_ib_create_srq;
+	dev->ib_dev.modify_srq		= mlx5_ib_modify_srq;
+	dev->ib_dev.query_srq		= mlx5_ib_query_srq;
+	dev->ib_dev.destroy_srq		= mlx5_ib_destroy_srq;
+	dev->ib_dev.post_srq_recv	= mlx5_ib_post_srq_recv;
+	dev->ib_dev.create_qp		= mlx5_ib_create_qp;
+	dev->ib_dev.modify_qp		= mlx5_ib_modify_qp;
+	dev->ib_dev.query_qp		= mlx5_ib_query_qp;
+	dev->ib_dev.destroy_qp		= mlx5_ib_destroy_qp;
+	dev->ib_dev.post_send		= mlx5_ib_post_send;
+	dev->ib_dev.post_recv		= mlx5_ib_post_recv;
+	dev->ib_dev.create_cq		= mlx5_ib_create_cq;
+	dev->ib_dev.modify_cq		= mlx5_ib_modify_cq;
+	dev->ib_dev.resize_cq		= mlx5_ib_resize_cq;
+	dev->ib_dev.destroy_cq		= mlx5_ib_destroy_cq;
+	dev->ib_dev.poll_cq		= mlx5_ib_poll_cq;
+	dev->ib_dev.req_notify_cq	= mlx5_ib_arm_cq;
+	dev->ib_dev.get_dma_mr		= mlx5_ib_get_dma_mr;
+	dev->ib_dev.reg_user_mr		= mlx5_ib_reg_user_mr;
+	dev->ib_dev.dereg_mr		= mlx5_ib_dereg_mr;
+	dev->ib_dev.destroy_mr		= mlx5_ib_destroy_mr;
+	dev->ib_dev.attach_mcast	= mlx5_ib_mcg_attach;
+	dev->ib_dev.detach_mcast	= mlx5_ib_mcg_detach;
+	dev->ib_dev.process_mad		= mlx5_ib_process_mad;
+	dev->ib_dev.create_mr		= mlx5_ib_create_mr;
+	dev->ib_dev.alloc_fast_reg_mr	= mlx5_ib_alloc_fast_reg_mr;
+	dev->ib_dev.alloc_fast_reg_page_list = mlx5_ib_alloc_fast_reg_page_list;
+	dev->ib_dev.free_fast_reg_page_list  = mlx5_ib_free_fast_reg_page_list;
+	dev->ib_dev.check_mr_status	= mlx5_ib_check_mr_status;
+
+	if (mdev->caps.gen.flags & MLX5_DEV_CAP_FLAG_XRC) {
+		dev->ib_dev.alloc_xrcd = mlx5_ib_alloc_xrcd;
+		dev->ib_dev.dealloc_xrcd = mlx5_ib_dealloc_xrcd;
+		dev->ib_dev.uverbs_cmd_mask |=
+			(1ull << IB_USER_VERBS_CMD_OPEN_XRCD) |
+			(1ull << IB_USER_VERBS_CMD_CLOSE_XRCD);
+	}
+
+	err = init_node_data(dev);
+	if (err)
+		goto err_eqs;
+
+	mutex_init(&dev->cap_mask_mutex);
+	spin_lock_init(&dev->mr_lock);
+
+	err = create_dev_resources(&dev->devr);
+	if (err)
+		goto err_eqs;
+
+	err = ib_register_device(&dev->ib_dev, NULL);
+	if (err)
+		goto err_rsrc;
+
+	err = create_umr_res(dev);
+	if (err)
+		goto err_dev;
+
+	for (i = 0; i < ARRAY_SIZE(mlx5_class_attributes); i++) {
+		err = device_create_file(&dev->ib_dev.dev,
+					 mlx5_class_attributes[i]);
+		if (err)
+			goto err_umrc;
+	}
+
+	dev->ib_active = true;
+
+	return dev;
+
+err_umrc:
+	destroy_umrc_res(dev);
+
+err_dev:
+	ib_unregister_device(&dev->ib_dev);
+
+err_rsrc:
+	destroy_dev_resources(&dev->devr);
+
+err_eqs:
+	free_comp_eqs(dev);
+
+err_dealloc:
+	ib_dealloc_device((struct ib_device *)dev);
+
+	return NULL;
+}
+
+static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context)
+{
+	struct mlx5_ib_dev *dev = context;
+	ib_unregister_device(&dev->ib_dev);
+	destroy_umrc_res(dev);
+	destroy_dev_resources(&dev->devr);
+	free_comp_eqs(dev);
+	ib_dealloc_device(&dev->ib_dev);
+}
+
+static struct mlx5_interface mlx5_ib_interface = {
+	.add            = mlx5_ib_add,
+	.remove         = mlx5_ib_remove,
+	.event          = mlx5_ib_event,
+};
+
+static int __init mlx5_ib_init(void)
+{
+	if (deprecated_prof_sel != 2)
+		pr_warn("prof_sel is deprecated for mlx5_ib, set it for mlx5_core\n");
+
+	return mlx5_register_interface(&mlx5_ib_interface);
+}
+
+static void __exit mlx5_ib_cleanup(void)
+{
+	mlx5_unregister_interface(&mlx5_ib_interface);
+}
+
+module_init(mlx5_ib_init);
+module_exit(mlx5_ib_cleanup);
diff --git a/drivers/infiniband/hw/mlx5/mem.c b/drivers/infiniband/hw/mlx5/mem.c
new file mode 100644
index 0000000..dae07ea
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/mem.c
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <rdma/ib_umem.h>
+#include "mlx5_ib.h"
+
+/* @umem: umem object to scan
+ * @addr: ib virtual address requested by the user
+ * @count: number of PAGE_SIZE pages covered by umem
+ * @shift: page shift for the compound pages found in the region
+ * @ncont: number of compund pages
+ * @order: log2 of the number of compound pages
+ */
+void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift,
+			int *ncont, int *order)
+{
+	unsigned long tmp;
+	unsigned long m;
+	int i, k;
+	u64 base = 0;
+	int p = 0;
+	int skip;
+	int mask;
+	u64 len;
+	u64 pfn;
+	struct scatterlist *sg;
+	int entry;
+	unsigned long page_shift = ilog2(umem->page_size);
+
+	addr = addr >> page_shift;
+	tmp = (unsigned long)addr;
+	m = find_first_bit(&tmp, sizeof(tmp));
+	skip = 1 << m;
+	mask = skip - 1;
+	i = 0;
+	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
+		len = sg_dma_len(sg) >> page_shift;
+		pfn = sg_dma_address(sg) >> page_shift;
+		for (k = 0; k < len; k++) {
+			if (!(i & mask)) {
+				tmp = (unsigned long)pfn;
+				m = min(m, find_first_bit(&tmp, sizeof(tmp)));
+				skip = 1 << m;
+				mask = skip - 1;
+				base = pfn;
+				p = 0;
+			} else {
+				if (base + p != pfn) {
+					tmp = (unsigned long)p;
+					m = find_first_bit(&tmp, sizeof(tmp));
+					skip = 1 << m;
+					mask = skip - 1;
+					base = pfn;
+					p = 0;
+				}
+			}
+			p++;
+			i++;
+		}
+	}
+
+	if (i) {
+		m = min_t(unsigned long, ilog2(roundup_pow_of_two(i)), m);
+
+		if (order)
+			*order = ilog2(roundup_pow_of_two(i) >> m);
+
+		*ncont = DIV_ROUND_UP(i, (1 << m));
+	} else {
+		m  = 0;
+
+		if (order)
+			*order = 0;
+
+		*ncont = 0;
+	}
+	*shift = page_shift + m;
+	*count = i;
+}
+
+void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
+			  int page_shift, __be64 *pas, int umr)
+{
+	unsigned long umem_page_shift = ilog2(umem->page_size);
+	int shift = page_shift - umem_page_shift;
+	int mask = (1 << shift) - 1;
+	int i, k;
+	u64 cur = 0;
+	u64 base;
+	int len;
+	struct scatterlist *sg;
+	int entry;
+
+	i = 0;
+	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
+		len = sg_dma_len(sg) >> umem_page_shift;
+		base = sg_dma_address(sg);
+		for (k = 0; k < len; k++) {
+			if (!(i & mask)) {
+				cur = base + (k << umem_page_shift);
+				if (umr)
+					cur |= 3;
+
+				pas[i >> shift] = cpu_to_be64(cur);
+				mlx5_ib_dbg(dev, "pas[%d] 0x%llx\n",
+					    i >> shift, be64_to_cpu(pas[i >> shift]));
+			}  else
+				mlx5_ib_dbg(dev, "=====> 0x%llx\n",
+					    base + (k << umem_page_shift));
+			i++;
+		}
+	}
+}
+
+int mlx5_ib_get_buf_offset(u64 addr, int page_shift, u32 *offset)
+{
+	u64 page_size;
+	u64 page_mask;
+	u64 off_size;
+	u64 off_mask;
+	u64 buf_off;
+
+	page_size = (u64)1 << page_shift;
+	page_mask = page_size - 1;
+	buf_off = addr & page_mask;
+	off_size = page_size >> 6;
+	off_mask = off_size - 1;
+
+	if (buf_off & off_mask)
+		return -EINVAL;
+
+	*offset = buf_off >> ilog2(off_size);
+	return 0;
+}
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
new file mode 100644
index 0000000..386780f
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -0,0 +1,564 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX5_IB_H
+#define MLX5_IB_H
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_smi.h>
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/cq.h>
+#include <linux/mlx5/qp.h>
+#include <linux/mlx5/srq.h>
+#include <linux/types.h>
+
+#define mlx5_ib_dbg(dev, format, arg...)				\
+pr_debug("%s:%s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__,	\
+	 __LINE__, current->pid, ##arg)
+
+#define mlx5_ib_err(dev, format, arg...)				\
+pr_err("%s:%s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__,	\
+	__LINE__, current->pid, ##arg)
+
+#define mlx5_ib_warn(dev, format, arg...)				\
+pr_warn("%s:%s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__,	\
+	__LINE__, current->pid, ##arg)
+
+enum {
+	MLX5_IB_MMAP_CMD_SHIFT	= 8,
+	MLX5_IB_MMAP_CMD_MASK	= 0xff,
+};
+
+enum mlx5_ib_mmap_cmd {
+	MLX5_IB_MMAP_REGULAR_PAGE		= 0,
+	MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES	= 1, /* always last */
+};
+
+enum {
+	MLX5_RES_SCAT_DATA32_CQE	= 0x1,
+	MLX5_RES_SCAT_DATA64_CQE	= 0x2,
+	MLX5_REQ_SCAT_DATA32_CQE	= 0x11,
+	MLX5_REQ_SCAT_DATA64_CQE	= 0x22,
+};
+
+enum mlx5_ib_latency_class {
+	MLX5_IB_LATENCY_CLASS_LOW,
+	MLX5_IB_LATENCY_CLASS_MEDIUM,
+	MLX5_IB_LATENCY_CLASS_HIGH,
+	MLX5_IB_LATENCY_CLASS_FAST_PATH
+};
+
+enum mlx5_ib_mad_ifc_flags {
+	MLX5_MAD_IFC_IGNORE_MKEY	= 1,
+	MLX5_MAD_IFC_IGNORE_BKEY	= 2,
+	MLX5_MAD_IFC_NET_VIEW		= 4,
+};
+
+struct mlx5_ib_ucontext {
+	struct ib_ucontext	ibucontext;
+	struct list_head	db_page_list;
+
+	/* protect doorbell record alloc/free
+	 */
+	struct mutex		db_page_mutex;
+	struct mlx5_uuar_info	uuari;
+};
+
+static inline struct mlx5_ib_ucontext *to_mucontext(struct ib_ucontext *ibucontext)
+{
+	return container_of(ibucontext, struct mlx5_ib_ucontext, ibucontext);
+}
+
+struct mlx5_ib_pd {
+	struct ib_pd		ibpd;
+	u32			pdn;
+	u32			pa_lkey;
+};
+
+/* Use macros here so that don't have to duplicate
+ * enum ib_send_flags and enum ib_qp_type for low-level driver
+ */
+
+#define MLX5_IB_SEND_UMR_UNREG	IB_SEND_RESERVED_START
+#define MLX5_IB_QPT_REG_UMR	IB_QPT_RESERVED1
+#define MLX5_IB_WR_UMR		IB_WR_RESERVED1
+
+struct wr_list {
+	u16	opcode;
+	u16	next;
+};
+
+struct mlx5_ib_wq {
+	u64		       *wrid;
+	u32		       *wr_data;
+	struct wr_list	       *w_list;
+	unsigned	       *wqe_head;
+	u16		        unsig_count;
+
+	/* serialize post to the work queue
+	 */
+	spinlock_t		lock;
+	int			wqe_cnt;
+	int			max_post;
+	int			max_gs;
+	int			offset;
+	int			wqe_shift;
+	unsigned		head;
+	unsigned		tail;
+	u16			cur_post;
+	u16			last_poll;
+	void		       *qend;
+};
+
+enum {
+	MLX5_QP_USER,
+	MLX5_QP_KERNEL,
+	MLX5_QP_EMPTY
+};
+
+struct mlx5_ib_qp {
+	struct ib_qp		ibqp;
+	struct mlx5_core_qp	mqp;
+	struct mlx5_buf		buf;
+
+	struct mlx5_db		db;
+	struct mlx5_ib_wq	rq;
+
+	u32			doorbell_qpn;
+	u8			sq_signal_bits;
+	u8			fm_cache;
+	int			sq_max_wqes_per_wr;
+	int			sq_spare_wqes;
+	struct mlx5_ib_wq	sq;
+
+	struct ib_umem	       *umem;
+	int			buf_size;
+
+	/* serialize qp state modifications
+	 */
+	struct mutex		mutex;
+	u16			xrcdn;
+	u32			flags;
+	u8			port;
+	u8			alt_port;
+	u8			atomic_rd_en;
+	u8			resp_depth;
+	u8			state;
+	int			mlx_type;
+	int			wq_sig;
+	int			scat_cqe;
+	int			max_inline_data;
+	struct mlx5_bf	       *bf;
+	int			has_rq;
+
+	/* only for user space QPs. For kernel
+	 * we have it from the bf object
+	 */
+	int			uuarn;
+
+	int			create_type;
+	u32			pa_lkey;
+
+	/* Store signature errors */
+	bool			signature_en;
+};
+
+struct mlx5_ib_cq_buf {
+	struct mlx5_buf		buf;
+	struct ib_umem		*umem;
+	int			cqe_size;
+	int			nent;
+};
+
+enum mlx5_ib_qp_flags {
+	MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK     = 1 << 0,
+	MLX5_IB_QP_SIGNATURE_HANDLING           = 1 << 1,
+};
+
+struct mlx5_shared_mr_info {
+	int mr_id;
+	struct ib_umem		*umem;
+};
+
+struct mlx5_ib_cq {
+	struct ib_cq		ibcq;
+	struct mlx5_core_cq	mcq;
+	struct mlx5_ib_cq_buf	buf;
+	struct mlx5_db		db;
+
+	/* serialize access to the CQ
+	 */
+	spinlock_t		lock;
+
+	/* protect resize cq
+	 */
+	struct mutex		resize_mutex;
+	struct mlx5_ib_cq_buf  *resize_buf;
+	struct ib_umem	       *resize_umem;
+	int			cqe_size;
+};
+
+struct mlx5_ib_srq {
+	struct ib_srq		ibsrq;
+	struct mlx5_core_srq	msrq;
+	struct mlx5_buf		buf;
+	struct mlx5_db		db;
+	u64		       *wrid;
+	/* protect SRQ hanlding
+	 */
+	spinlock_t		lock;
+	int			head;
+	int			tail;
+	u16			wqe_ctr;
+	struct ib_umem	       *umem;
+	/* serialize arming a SRQ
+	 */
+	struct mutex		mutex;
+	int			wq_sig;
+};
+
+struct mlx5_ib_xrcd {
+	struct ib_xrcd		ibxrcd;
+	u32			xrcdn;
+};
+
+struct mlx5_ib_mr {
+	struct ib_mr		ibmr;
+	struct mlx5_core_mr	mmr;
+	struct ib_umem	       *umem;
+	struct mlx5_shared_mr_info	*smr_info;
+	struct list_head	list;
+	int			order;
+	int			umred;
+	__be64			*pas;
+	dma_addr_t		dma;
+	int			npages;
+	struct mlx5_ib_dev     *dev;
+	struct mlx5_create_mkey_mbox_out out;
+	struct mlx5_core_sig_ctx    *sig;
+};
+
+struct mlx5_ib_fast_reg_page_list {
+	struct ib_fast_reg_page_list	ibfrpl;
+	__be64			       *mapped_page_list;
+	dma_addr_t			map;
+};
+
+struct mlx5_ib_umr_context {
+	enum ib_wc_status	status;
+	struct completion	done;
+};
+
+static inline void mlx5_ib_init_umr_context(struct mlx5_ib_umr_context *context)
+{
+	context->status = -1;
+	init_completion(&context->done);
+}
+
+struct umr_common {
+	struct ib_pd	*pd;
+	struct ib_cq	*cq;
+	struct ib_qp	*qp;
+	struct ib_mr	*mr;
+	/* control access to UMR QP
+	 */
+	struct semaphore	sem;
+};
+
+enum {
+	MLX5_FMR_INVALID,
+	MLX5_FMR_VALID,
+	MLX5_FMR_BUSY,
+};
+
+struct mlx5_ib_fmr {
+	struct ib_fmr			ibfmr;
+	struct mlx5_core_mr		mr;
+	int				access_flags;
+	int				state;
+	/* protect fmr state
+	 */
+	spinlock_t			lock;
+	u64				wrid;
+	struct ib_send_wr		wr[2];
+	u8				page_shift;
+	struct ib_fast_reg_page_list	page_list;
+};
+
+struct mlx5_cache_ent {
+	struct list_head	head;
+	/* sync access to the cahce entry
+	 */
+	spinlock_t		lock;
+
+
+	struct dentry	       *dir;
+	char                    name[4];
+	u32                     order;
+	u32			size;
+	u32                     cur;
+	u32                     miss;
+	u32			limit;
+
+	struct dentry          *fsize;
+	struct dentry          *fcur;
+	struct dentry          *fmiss;
+	struct dentry          *flimit;
+
+	struct mlx5_ib_dev     *dev;
+	struct work_struct	work;
+	struct delayed_work	dwork;
+	int			pending;
+};
+
+struct mlx5_mr_cache {
+	struct workqueue_struct *wq;
+	struct mlx5_cache_ent	ent[MAX_MR_CACHE_ENTRIES];
+	int			stopped;
+	struct dentry		*root;
+	unsigned long		last_add;
+};
+
+struct mlx5_ib_resources {
+	struct ib_cq	*c0;
+	struct ib_xrcd	*x0;
+	struct ib_xrcd	*x1;
+	struct ib_pd	*p0;
+	struct ib_srq	*s0;
+};
+
+struct mlx5_ib_dev {
+	struct ib_device		ib_dev;
+	struct mlx5_core_dev		*mdev;
+	MLX5_DECLARE_DOORBELL_LOCK(uar_lock);
+	struct list_head		eqs_list;
+	int				num_ports;
+	int				num_comp_vectors;
+	/* serialize update of capability mask
+	 */
+	struct mutex			cap_mask_mutex;
+	bool				ib_active;
+	struct umr_common		umrc;
+	/* sync used page count stats
+	 */
+	spinlock_t			mr_lock;
+	struct mlx5_ib_resources	devr;
+	struct mlx5_mr_cache		cache;
+	struct timer_list		delay_timer;
+	int				fill_delay;
+};
+
+static inline struct mlx5_ib_cq *to_mibcq(struct mlx5_core_cq *mcq)
+{
+	return container_of(mcq, struct mlx5_ib_cq, mcq);
+}
+
+static inline struct mlx5_ib_xrcd *to_mxrcd(struct ib_xrcd *ibxrcd)
+{
+	return container_of(ibxrcd, struct mlx5_ib_xrcd, ibxrcd);
+}
+
+static inline struct mlx5_ib_dev *to_mdev(struct ib_device *ibdev)
+{
+	return container_of(ibdev, struct mlx5_ib_dev, ib_dev);
+}
+
+static inline struct mlx5_ib_fmr *to_mfmr(struct ib_fmr *ibfmr)
+{
+	return container_of(ibfmr, struct mlx5_ib_fmr, ibfmr);
+}
+
+static inline struct mlx5_ib_cq *to_mcq(struct ib_cq *ibcq)
+{
+	return container_of(ibcq, struct mlx5_ib_cq, ibcq);
+}
+
+static inline struct mlx5_ib_qp *to_mibqp(struct mlx5_core_qp *mqp)
+{
+	return container_of(mqp, struct mlx5_ib_qp, mqp);
+}
+
+static inline struct mlx5_ib_mr *to_mibmr(struct mlx5_core_mr *mmr)
+{
+	return container_of(mmr, struct mlx5_ib_mr, mmr);
+}
+
+static inline struct mlx5_ib_pd *to_mpd(struct ib_pd *ibpd)
+{
+	return container_of(ibpd, struct mlx5_ib_pd, ibpd);
+}
+
+static inline struct mlx5_ib_srq *to_msrq(struct ib_srq *ibsrq)
+{
+	return container_of(ibsrq, struct mlx5_ib_srq, ibsrq);
+}
+
+static inline struct mlx5_ib_qp *to_mqp(struct ib_qp *ibqp)
+{
+	return container_of(ibqp, struct mlx5_ib_qp, ibqp);
+}
+
+static inline struct mlx5_ib_srq *to_mibsrq(struct mlx5_core_srq *msrq)
+{
+	return container_of(msrq, struct mlx5_ib_srq, msrq);
+}
+
+static inline struct mlx5_ib_mr *to_mmr(struct ib_mr *ibmr)
+{
+	return container_of(ibmr, struct mlx5_ib_mr, ibmr);
+}
+
+static inline struct mlx5_ib_fast_reg_page_list *to_mfrpl(struct ib_fast_reg_page_list *ibfrpl)
+{
+	return container_of(ibfrpl, struct mlx5_ib_fast_reg_page_list, ibfrpl);
+}
+
+struct mlx5_ib_ah {
+	struct ib_ah		ibah;
+	struct mlx5_av		av;
+};
+
+static inline struct mlx5_ib_ah *to_mah(struct ib_ah *ibah)
+{
+	return container_of(ibah, struct mlx5_ib_ah, ibah);
+}
+
+int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context, unsigned long virt,
+			struct mlx5_db *db);
+void mlx5_ib_db_unmap_user(struct mlx5_ib_ucontext *context, struct mlx5_db *db);
+void __mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 qpn, struct mlx5_ib_srq *srq);
+void mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 qpn, struct mlx5_ib_srq *srq);
+void mlx5_ib_free_srq_wqe(struct mlx5_ib_srq *srq, int wqe_index);
+int mlx5_MAD_IFC(struct mlx5_ib_dev *dev, int ignore_mkey, int ignore_bkey,
+		 u8 port, struct ib_wc *in_wc, struct ib_grh *in_grh,
+		 void *in_mad, void *response_mad);
+struct ib_ah *create_ib_ah(struct ib_ah_attr *ah_attr,
+			   struct mlx5_ib_ah *ah);
+struct ib_ah *mlx5_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr);
+int mlx5_ib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr);
+int mlx5_ib_destroy_ah(struct ib_ah *ah);
+struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd,
+				  struct ib_srq_init_attr *init_attr,
+				  struct ib_udata *udata);
+int mlx5_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
+		       enum ib_srq_attr_mask attr_mask, struct ib_udata *udata);
+int mlx5_ib_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr);
+int mlx5_ib_destroy_srq(struct ib_srq *srq);
+int mlx5_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
+			  struct ib_recv_wr **bad_wr);
+struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd,
+				struct ib_qp_init_attr *init_attr,
+				struct ib_udata *udata);
+int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+		      int attr_mask, struct ib_udata *udata);
+int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask,
+		     struct ib_qp_init_attr *qp_init_attr);
+int mlx5_ib_destroy_qp(struct ib_qp *qp);
+int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+		      struct ib_send_wr **bad_wr);
+int mlx5_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+		      struct ib_recv_wr **bad_wr);
+void *mlx5_get_send_wqe(struct mlx5_ib_qp *qp, int n);
+struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, int entries,
+				int vector, struct ib_ucontext *context,
+				struct ib_udata *udata);
+int mlx5_ib_destroy_cq(struct ib_cq *cq);
+int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc);
+int mlx5_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags);
+int mlx5_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period);
+int mlx5_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata);
+struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc);
+struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+				  u64 virt_addr, int access_flags,
+				  struct ib_udata *udata);
+int mlx5_ib_dereg_mr(struct ib_mr *ibmr);
+int mlx5_ib_destroy_mr(struct ib_mr *ibmr);
+struct ib_mr *mlx5_ib_create_mr(struct ib_pd *pd,
+				struct ib_mr_init_attr *mr_init_attr);
+struct ib_mr *mlx5_ib_alloc_fast_reg_mr(struct ib_pd *pd,
+					int max_page_list_len);
+struct ib_fast_reg_page_list *mlx5_ib_alloc_fast_reg_page_list(struct ib_device *ibdev,
+							       int page_list_len);
+void mlx5_ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list);
+struct ib_fmr *mlx5_ib_fmr_alloc(struct ib_pd *pd, int acc,
+				 struct ib_fmr_attr *fmr_attr);
+int mlx5_ib_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list,
+		      int npages, u64 iova);
+int mlx5_ib_unmap_fmr(struct list_head *fmr_list);
+int mlx5_ib_fmr_dealloc(struct ib_fmr *ibfmr);
+int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
+			struct ib_wc *in_wc, struct ib_grh *in_grh,
+			struct ib_mad *in_mad, struct ib_mad *out_mad);
+struct ib_xrcd *mlx5_ib_alloc_xrcd(struct ib_device *ibdev,
+					  struct ib_ucontext *context,
+					  struct ib_udata *udata);
+int mlx5_ib_dealloc_xrcd(struct ib_xrcd *xrcd);
+int mlx5_vector2eqn(struct mlx5_ib_dev *dev, int vector, int *eqn, int *irqn);
+int mlx5_ib_get_buf_offset(u64 addr, int page_shift, u32 *offset);
+int mlx5_query_ext_port_caps(struct mlx5_ib_dev *dev, u8 port);
+int mlx5_ib_query_port(struct ib_device *ibdev, u8 port,
+		       struct ib_port_attr *props);
+int mlx5_ib_init_fmr(struct mlx5_ib_dev *dev);
+void mlx5_ib_cleanup_fmr(struct mlx5_ib_dev *dev);
+void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift,
+			int *ncont, int *order);
+void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
+			  int page_shift, __be64 *pas, int umr);
+void mlx5_ib_copy_pas(u64 *old, u64 *new, int step, int num);
+int mlx5_ib_get_cqe_size(struct mlx5_ib_dev *dev, struct ib_cq *ibcq);
+int mlx5_mr_cache_init(struct mlx5_ib_dev *dev);
+int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev);
+int mlx5_mr_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift);
+void mlx5_umr_cq_handler(struct ib_cq *cq, void *cq_context);
+int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
+			    struct ib_mr_status *mr_status);
+
+static inline void init_query_mad(struct ib_smp *mad)
+{
+	mad->base_version  = 1;
+	mad->mgmt_class    = IB_MGMT_CLASS_SUBN_LID_ROUTED;
+	mad->class_version = 1;
+	mad->method	   = IB_MGMT_METHOD_GET;
+}
+
+static inline u8 convert_access(int acc)
+{
+	return (acc & IB_ACCESS_REMOTE_ATOMIC ? MLX5_PERM_ATOMIC       : 0) |
+	       (acc & IB_ACCESS_REMOTE_WRITE  ? MLX5_PERM_REMOTE_WRITE : 0) |
+	       (acc & IB_ACCESS_REMOTE_READ   ? MLX5_PERM_REMOTE_READ  : 0) |
+	       (acc & IB_ACCESS_LOCAL_WRITE   ? MLX5_PERM_LOCAL_WRITE  : 0) |
+	       MLX5_PERM_LOCAL_READ;
+}
+
+#endif /* MLX5_IB_H */
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
new file mode 100644
index 0000000..8ee7cb4
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -0,0 +1,1250 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include <linux/kref.h>
+#include <linux/random.h>
+#include <linux/debugfs.h>
+#include <linux/export.h>
+#include <linux/delay.h>
+#include <rdma/ib_umem.h>
+#include "mlx5_ib.h"
+
+enum {
+	MAX_PENDING_REG_MR = 8,
+};
+
+enum {
+	MLX5_UMR_ALIGN	= 2048
+};
+
+static __be64 *mr_align(__be64 *ptr, int align)
+{
+	unsigned long mask = align - 1;
+
+	return (__be64 *)(((unsigned long)ptr + mask) & ~mask);
+}
+
+static int order2idx(struct mlx5_ib_dev *dev, int order)
+{
+	struct mlx5_mr_cache *cache = &dev->cache;
+
+	if (order < cache->ent[0].order)
+		return 0;
+	else
+		return order - cache->ent[0].order;
+}
+
+static void reg_mr_callback(int status, void *context)
+{
+	struct mlx5_ib_mr *mr = context;
+	struct mlx5_ib_dev *dev = mr->dev;
+	struct mlx5_mr_cache *cache = &dev->cache;
+	int c = order2idx(dev, mr->order);
+	struct mlx5_cache_ent *ent = &cache->ent[c];
+	u8 key;
+	unsigned long flags;
+	struct mlx5_mr_table *table = &dev->mdev->priv.mr_table;
+	int err;
+
+	spin_lock_irqsave(&ent->lock, flags);
+	ent->pending--;
+	spin_unlock_irqrestore(&ent->lock, flags);
+	if (status) {
+		mlx5_ib_warn(dev, "async reg mr failed. status %d\n", status);
+		kfree(mr);
+		dev->fill_delay = 1;
+		mod_timer(&dev->delay_timer, jiffies + HZ);
+		return;
+	}
+
+	if (mr->out.hdr.status) {
+		mlx5_ib_warn(dev, "failed - status %d, syndorme 0x%x\n",
+			     mr->out.hdr.status,
+			     be32_to_cpu(mr->out.hdr.syndrome));
+		kfree(mr);
+		dev->fill_delay = 1;
+		mod_timer(&dev->delay_timer, jiffies + HZ);
+		return;
+	}
+
+	spin_lock_irqsave(&dev->mdev->priv.mkey_lock, flags);
+	key = dev->mdev->priv.mkey_key++;
+	spin_unlock_irqrestore(&dev->mdev->priv.mkey_lock, flags);
+	mr->mmr.key = mlx5_idx_to_mkey(be32_to_cpu(mr->out.mkey) & 0xffffff) | key;
+
+	cache->last_add = jiffies;
+
+	spin_lock_irqsave(&ent->lock, flags);
+	list_add_tail(&mr->list, &ent->head);
+	ent->cur++;
+	ent->size++;
+	spin_unlock_irqrestore(&ent->lock, flags);
+
+	write_lock_irqsave(&table->lock, flags);
+	err = radix_tree_insert(&table->tree, mlx5_base_mkey(mr->mmr.key),
+				&mr->mmr);
+	if (err)
+		pr_err("Error inserting to mr tree. 0x%x\n", -err);
+	write_unlock_irqrestore(&table->lock, flags);
+}
+
+static int add_keys(struct mlx5_ib_dev *dev, int c, int num)
+{
+	struct mlx5_mr_cache *cache = &dev->cache;
+	struct mlx5_cache_ent *ent = &cache->ent[c];
+	struct mlx5_create_mkey_mbox_in *in;
+	struct mlx5_ib_mr *mr;
+	int npages = 1 << ent->order;
+	int err = 0;
+	int i;
+
+	in = kzalloc(sizeof(*in), GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	for (i = 0; i < num; i++) {
+		if (ent->pending >= MAX_PENDING_REG_MR) {
+			err = -EAGAIN;
+			break;
+		}
+
+		mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+		if (!mr) {
+			err = -ENOMEM;
+			break;
+		}
+		mr->order = ent->order;
+		mr->umred = 1;
+		mr->dev = dev;
+		in->seg.status = 1 << 6;
+		in->seg.xlt_oct_size = cpu_to_be32((npages + 1) / 2);
+		in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
+		in->seg.flags = MLX5_ACCESS_MODE_MTT | MLX5_PERM_UMR_EN;
+		in->seg.log2_page_size = 12;
+
+		spin_lock_irq(&ent->lock);
+		ent->pending++;
+		spin_unlock_irq(&ent->lock);
+		err = mlx5_core_create_mkey(dev->mdev, &mr->mmr, in,
+					    sizeof(*in), reg_mr_callback,
+					    mr, &mr->out);
+		if (err) {
+			mlx5_ib_warn(dev, "create mkey failed %d\n", err);
+			kfree(mr);
+			break;
+		}
+	}
+
+	kfree(in);
+	return err;
+}
+
+static void remove_keys(struct mlx5_ib_dev *dev, int c, int num)
+{
+	struct mlx5_mr_cache *cache = &dev->cache;
+	struct mlx5_cache_ent *ent = &cache->ent[c];
+	struct mlx5_ib_mr *mr;
+	int err;
+	int i;
+
+	for (i = 0; i < num; i++) {
+		spin_lock_irq(&ent->lock);
+		if (list_empty(&ent->head)) {
+			spin_unlock_irq(&ent->lock);
+			return;
+		}
+		mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
+		list_del(&mr->list);
+		ent->cur--;
+		ent->size--;
+		spin_unlock_irq(&ent->lock);
+		err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmr);
+		if (err)
+			mlx5_ib_warn(dev, "failed destroy mkey\n");
+		else
+			kfree(mr);
+	}
+}
+
+static ssize_t size_write(struct file *filp, const char __user *buf,
+			  size_t count, loff_t *pos)
+{
+	struct mlx5_cache_ent *ent = filp->private_data;
+	struct mlx5_ib_dev *dev = ent->dev;
+	char lbuf[20];
+	u32 var;
+	int err;
+	int c;
+
+	if (copy_from_user(lbuf, buf, sizeof(lbuf)))
+		return -EFAULT;
+
+	c = order2idx(dev, ent->order);
+	lbuf[sizeof(lbuf) - 1] = 0;
+
+	if (sscanf(lbuf, "%u", &var) != 1)
+		return -EINVAL;
+
+	if (var < ent->limit)
+		return -EINVAL;
+
+	if (var > ent->size) {
+		do {
+			err = add_keys(dev, c, var - ent->size);
+			if (err && err != -EAGAIN)
+				return err;
+
+			usleep_range(3000, 5000);
+		} while (err);
+	} else if (var < ent->size) {
+		remove_keys(dev, c, ent->size - var);
+	}
+
+	return count;
+}
+
+static ssize_t size_read(struct file *filp, char __user *buf, size_t count,
+			 loff_t *pos)
+{
+	struct mlx5_cache_ent *ent = filp->private_data;
+	char lbuf[20];
+	int err;
+
+	if (*pos)
+		return 0;
+
+	err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->size);
+	if (err < 0)
+		return err;
+
+	if (copy_to_user(buf, lbuf, err))
+		return -EFAULT;
+
+	*pos += err;
+
+	return err;
+}
+
+static const struct file_operations size_fops = {
+	.owner	= THIS_MODULE,
+	.open	= simple_open,
+	.write	= size_write,
+	.read	= size_read,
+};
+
+static ssize_t limit_write(struct file *filp, const char __user *buf,
+			   size_t count, loff_t *pos)
+{
+	struct mlx5_cache_ent *ent = filp->private_data;
+	struct mlx5_ib_dev *dev = ent->dev;
+	char lbuf[20];
+	u32 var;
+	int err;
+	int c;
+
+	if (copy_from_user(lbuf, buf, sizeof(lbuf)))
+		return -EFAULT;
+
+	c = order2idx(dev, ent->order);
+	lbuf[sizeof(lbuf) - 1] = 0;
+
+	if (sscanf(lbuf, "%u", &var) != 1)
+		return -EINVAL;
+
+	if (var > ent->size)
+		return -EINVAL;
+
+	ent->limit = var;
+
+	if (ent->cur < ent->limit) {
+		err = add_keys(dev, c, 2 * ent->limit - ent->cur);
+		if (err)
+			return err;
+	}
+
+	return count;
+}
+
+static ssize_t limit_read(struct file *filp, char __user *buf, size_t count,
+			  loff_t *pos)
+{
+	struct mlx5_cache_ent *ent = filp->private_data;
+	char lbuf[20];
+	int err;
+
+	if (*pos)
+		return 0;
+
+	err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->limit);
+	if (err < 0)
+		return err;
+
+	if (copy_to_user(buf, lbuf, err))
+		return -EFAULT;
+
+	*pos += err;
+
+	return err;
+}
+
+static const struct file_operations limit_fops = {
+	.owner	= THIS_MODULE,
+	.open	= simple_open,
+	.write	= limit_write,
+	.read	= limit_read,
+};
+
+static int someone_adding(struct mlx5_mr_cache *cache)
+{
+	int i;
+
+	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
+		if (cache->ent[i].cur < cache->ent[i].limit)
+			return 1;
+	}
+
+	return 0;
+}
+
+static void __cache_work_func(struct mlx5_cache_ent *ent)
+{
+	struct mlx5_ib_dev *dev = ent->dev;
+	struct mlx5_mr_cache *cache = &dev->cache;
+	int i = order2idx(dev, ent->order);
+	int err;
+
+	if (cache->stopped)
+		return;
+
+	ent = &dev->cache.ent[i];
+	if (ent->cur < 2 * ent->limit && !dev->fill_delay) {
+		err = add_keys(dev, i, 1);
+		if (ent->cur < 2 * ent->limit) {
+			if (err == -EAGAIN) {
+				mlx5_ib_dbg(dev, "returned eagain, order %d\n",
+					    i + 2);
+				queue_delayed_work(cache->wq, &ent->dwork,
+						   msecs_to_jiffies(3));
+			} else if (err) {
+				mlx5_ib_warn(dev, "command failed order %d, err %d\n",
+					     i + 2, err);
+				queue_delayed_work(cache->wq, &ent->dwork,
+						   msecs_to_jiffies(1000));
+			} else {
+				queue_work(cache->wq, &ent->work);
+			}
+		}
+	} else if (ent->cur > 2 * ent->limit) {
+		if (!someone_adding(cache) &&
+		    time_after(jiffies, cache->last_add + 300 * HZ)) {
+			remove_keys(dev, i, 1);
+			if (ent->cur > ent->limit)
+				queue_work(cache->wq, &ent->work);
+		} else {
+			queue_delayed_work(cache->wq, &ent->dwork, 300 * HZ);
+		}
+	}
+}
+
+static void delayed_cache_work_func(struct work_struct *work)
+{
+	struct mlx5_cache_ent *ent;
+
+	ent = container_of(work, struct mlx5_cache_ent, dwork.work);
+	__cache_work_func(ent);
+}
+
+static void cache_work_func(struct work_struct *work)
+{
+	struct mlx5_cache_ent *ent;
+
+	ent = container_of(work, struct mlx5_cache_ent, work);
+	__cache_work_func(ent);
+}
+
+static struct mlx5_ib_mr *alloc_cached_mr(struct mlx5_ib_dev *dev, int order)
+{
+	struct mlx5_mr_cache *cache = &dev->cache;
+	struct mlx5_ib_mr *mr = NULL;
+	struct mlx5_cache_ent *ent;
+	int c;
+	int i;
+
+	c = order2idx(dev, order);
+	if (c < 0 || c >= MAX_MR_CACHE_ENTRIES) {
+		mlx5_ib_warn(dev, "order %d, cache index %d\n", order, c);
+		return NULL;
+	}
+
+	for (i = c; i < MAX_MR_CACHE_ENTRIES; i++) {
+		ent = &cache->ent[i];
+
+		mlx5_ib_dbg(dev, "order %d, cache index %d\n", ent->order, i);
+
+		spin_lock_irq(&ent->lock);
+		if (!list_empty(&ent->head)) {
+			mr = list_first_entry(&ent->head, struct mlx5_ib_mr,
+					      list);
+			list_del(&mr->list);
+			ent->cur--;
+			spin_unlock_irq(&ent->lock);
+			if (ent->cur < ent->limit)
+				queue_work(cache->wq, &ent->work);
+			break;
+		}
+		spin_unlock_irq(&ent->lock);
+
+		queue_work(cache->wq, &ent->work);
+
+		if (mr)
+			break;
+	}
+
+	if (!mr)
+		cache->ent[c].miss++;
+
+	return mr;
+}
+
+static void free_cached_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
+{
+	struct mlx5_mr_cache *cache = &dev->cache;
+	struct mlx5_cache_ent *ent;
+	int shrink = 0;
+	int c;
+
+	c = order2idx(dev, mr->order);
+	if (c < 0 || c >= MAX_MR_CACHE_ENTRIES) {
+		mlx5_ib_warn(dev, "order %d, cache index %d\n", mr->order, c);
+		return;
+	}
+	ent = &cache->ent[c];
+	spin_lock_irq(&ent->lock);
+	list_add_tail(&mr->list, &ent->head);
+	ent->cur++;
+	if (ent->cur > 2 * ent->limit)
+		shrink = 1;
+	spin_unlock_irq(&ent->lock);
+
+	if (shrink)
+		queue_work(cache->wq, &ent->work);
+}
+
+static void clean_keys(struct mlx5_ib_dev *dev, int c)
+{
+	struct mlx5_mr_cache *cache = &dev->cache;
+	struct mlx5_cache_ent *ent = &cache->ent[c];
+	struct mlx5_ib_mr *mr;
+	int err;
+
+	cancel_delayed_work(&ent->dwork);
+	while (1) {
+		spin_lock_irq(&ent->lock);
+		if (list_empty(&ent->head)) {
+			spin_unlock_irq(&ent->lock);
+			return;
+		}
+		mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
+		list_del(&mr->list);
+		ent->cur--;
+		ent->size--;
+		spin_unlock_irq(&ent->lock);
+		err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmr);
+		if (err)
+			mlx5_ib_warn(dev, "failed destroy mkey\n");
+		else
+			kfree(mr);
+	}
+}
+
+static int mlx5_mr_cache_debugfs_init(struct mlx5_ib_dev *dev)
+{
+	struct mlx5_mr_cache *cache = &dev->cache;
+	struct mlx5_cache_ent *ent;
+	int i;
+
+	if (!mlx5_debugfs_root)
+		return 0;
+
+	cache->root = debugfs_create_dir("mr_cache", dev->mdev->priv.dbg_root);
+	if (!cache->root)
+		return -ENOMEM;
+
+	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
+		ent = &cache->ent[i];
+		sprintf(ent->name, "%d", ent->order);
+		ent->dir = debugfs_create_dir(ent->name,  cache->root);
+		if (!ent->dir)
+			return -ENOMEM;
+
+		ent->fsize = debugfs_create_file("size", 0600, ent->dir, ent,
+						 &size_fops);
+		if (!ent->fsize)
+			return -ENOMEM;
+
+		ent->flimit = debugfs_create_file("limit", 0600, ent->dir, ent,
+						  &limit_fops);
+		if (!ent->flimit)
+			return -ENOMEM;
+
+		ent->fcur = debugfs_create_u32("cur", 0400, ent->dir,
+					       &ent->cur);
+		if (!ent->fcur)
+			return -ENOMEM;
+
+		ent->fmiss = debugfs_create_u32("miss", 0600, ent->dir,
+						&ent->miss);
+		if (!ent->fmiss)
+			return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static void mlx5_mr_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
+{
+	if (!mlx5_debugfs_root)
+		return;
+
+	debugfs_remove_recursive(dev->cache.root);
+}
+
+static void delay_time_func(unsigned long ctx)
+{
+	struct mlx5_ib_dev *dev = (struct mlx5_ib_dev *)ctx;
+
+	dev->fill_delay = 0;
+}
+
+int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
+{
+	struct mlx5_mr_cache *cache = &dev->cache;
+	struct mlx5_cache_ent *ent;
+	int limit;
+	int err;
+	int i;
+
+	cache->wq = create_singlethread_workqueue("mkey_cache");
+	if (!cache->wq) {
+		mlx5_ib_warn(dev, "failed to create work queue\n");
+		return -ENOMEM;
+	}
+
+	setup_timer(&dev->delay_timer, delay_time_func, (unsigned long)dev);
+	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
+		INIT_LIST_HEAD(&cache->ent[i].head);
+		spin_lock_init(&cache->ent[i].lock);
+
+		ent = &cache->ent[i];
+		INIT_LIST_HEAD(&ent->head);
+		spin_lock_init(&ent->lock);
+		ent->order = i + 2;
+		ent->dev = dev;
+
+		if (dev->mdev->profile->mask & MLX5_PROF_MASK_MR_CACHE)
+			limit = dev->mdev->profile->mr_cache[i].limit;
+		else
+			limit = 0;
+
+		INIT_WORK(&ent->work, cache_work_func);
+		INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);
+		ent->limit = limit;
+		queue_work(cache->wq, &ent->work);
+	}
+
+	err = mlx5_mr_cache_debugfs_init(dev);
+	if (err)
+		mlx5_ib_warn(dev, "cache debugfs failure\n");
+
+	return 0;
+}
+
+int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev)
+{
+	int i;
+
+	dev->cache.stopped = 1;
+	flush_workqueue(dev->cache.wq);
+
+	mlx5_mr_cache_debugfs_cleanup(dev);
+
+	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++)
+		clean_keys(dev, i);
+
+	destroy_workqueue(dev->cache.wq);
+	del_timer_sync(&dev->delay_timer);
+
+	return 0;
+}
+
+struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc)
+{
+	struct mlx5_ib_dev *dev = to_mdev(pd->device);
+	struct mlx5_core_dev *mdev = dev->mdev;
+	struct mlx5_create_mkey_mbox_in *in;
+	struct mlx5_mkey_seg *seg;
+	struct mlx5_ib_mr *mr;
+	int err;
+
+	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+	if (!mr)
+		return ERR_PTR(-ENOMEM);
+
+	in = kzalloc(sizeof(*in), GFP_KERNEL);
+	if (!in) {
+		err = -ENOMEM;
+		goto err_free;
+	}
+
+	seg = &in->seg;
+	seg->flags = convert_access(acc) | MLX5_ACCESS_MODE_PA;
+	seg->flags_pd = cpu_to_be32(to_mpd(pd)->pdn | MLX5_MKEY_LEN64);
+	seg->qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
+	seg->start_addr = 0;
+
+	err = mlx5_core_create_mkey(mdev, &mr->mmr, in, sizeof(*in), NULL, NULL,
+				    NULL);
+	if (err)
+		goto err_in;
+
+	kfree(in);
+	mr->ibmr.lkey = mr->mmr.key;
+	mr->ibmr.rkey = mr->mmr.key;
+	mr->umem = NULL;
+
+	return &mr->ibmr;
+
+err_in:
+	kfree(in);
+
+err_free:
+	kfree(mr);
+
+	return ERR_PTR(err);
+}
+
+static int get_octo_len(u64 addr, u64 len, int page_size)
+{
+	u64 offset;
+	int npages;
+
+	offset = addr & (page_size - 1);
+	npages = ALIGN(len + offset, page_size) >> ilog2(page_size);
+	return (npages + 1) / 2;
+}
+
+static int use_umr(int order)
+{
+	return order <= 17;
+}
+
+static void prep_umr_reg_wqe(struct ib_pd *pd, struct ib_send_wr *wr,
+			     struct ib_sge *sg, u64 dma, int n, u32 key,
+			     int page_shift, u64 virt_addr, u64 len,
+			     int access_flags)
+{
+	struct mlx5_ib_dev *dev = to_mdev(pd->device);
+	struct ib_mr *mr = dev->umrc.mr;
+
+	sg->addr = dma;
+	sg->length = ALIGN(sizeof(u64) * n, 64);
+	sg->lkey = mr->lkey;
+
+	wr->next = NULL;
+	wr->send_flags = 0;
+	wr->sg_list = sg;
+	if (n)
+		wr->num_sge = 1;
+	else
+		wr->num_sge = 0;
+
+	wr->opcode = MLX5_IB_WR_UMR;
+	wr->wr.fast_reg.page_list_len = n;
+	wr->wr.fast_reg.page_shift = page_shift;
+	wr->wr.fast_reg.rkey = key;
+	wr->wr.fast_reg.iova_start = virt_addr;
+	wr->wr.fast_reg.length = len;
+	wr->wr.fast_reg.access_flags = access_flags;
+	wr->wr.fast_reg.page_list = (struct ib_fast_reg_page_list *)pd;
+}
+
+static void prep_umr_unreg_wqe(struct mlx5_ib_dev *dev,
+			       struct ib_send_wr *wr, u32 key)
+{
+	wr->send_flags = MLX5_IB_SEND_UMR_UNREG;
+	wr->opcode = MLX5_IB_WR_UMR;
+	wr->wr.fast_reg.rkey = key;
+}
+
+void mlx5_umr_cq_handler(struct ib_cq *cq, void *cq_context)
+{
+	struct mlx5_ib_umr_context *context;
+	struct ib_wc wc;
+	int err;
+
+	while (1) {
+		err = ib_poll_cq(cq, 1, &wc);
+		if (err < 0) {
+			pr_warn("poll cq error %d\n", err);
+			return;
+		}
+		if (err == 0)
+			break;
+
+		context = (struct mlx5_ib_umr_context *) (unsigned long) wc.wr_id;
+		context->status = wc.status;
+		complete(&context->done);
+	}
+	ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+}
+
+static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem,
+				  u64 virt_addr, u64 len, int npages,
+				  int page_shift, int order, int access_flags)
+{
+	struct mlx5_ib_dev *dev = to_mdev(pd->device);
+	struct device *ddev = dev->ib_dev.dma_device;
+	struct umr_common *umrc = &dev->umrc;
+	struct mlx5_ib_umr_context umr_context;
+	struct ib_send_wr wr, *bad;
+	struct mlx5_ib_mr *mr;
+	struct ib_sge sg;
+	int size = sizeof(u64) * npages;
+	int err = 0;
+	int i;
+
+	for (i = 0; i < 1; i++) {
+		mr = alloc_cached_mr(dev, order);
+		if (mr)
+			break;
+
+		err = add_keys(dev, order2idx(dev, order), 1);
+		if (err && err != -EAGAIN) {
+			mlx5_ib_warn(dev, "add_keys failed, err %d\n", err);
+			break;
+		}
+	}
+
+	if (!mr)
+		return ERR_PTR(-EAGAIN);
+
+	mr->pas = kmalloc(size + MLX5_UMR_ALIGN - 1, GFP_KERNEL);
+	if (!mr->pas) {
+		err = -ENOMEM;
+		goto free_mr;
+	}
+
+	mlx5_ib_populate_pas(dev, umem, page_shift,
+			     mr_align(mr->pas, MLX5_UMR_ALIGN), 1);
+
+	mr->dma = dma_map_single(ddev, mr_align(mr->pas, MLX5_UMR_ALIGN), size,
+				 DMA_TO_DEVICE);
+	if (dma_mapping_error(ddev, mr->dma)) {
+		err = -ENOMEM;
+		goto free_pas;
+	}
+
+	memset(&wr, 0, sizeof(wr));
+	wr.wr_id = (u64)(unsigned long)&umr_context;
+	prep_umr_reg_wqe(pd, &wr, &sg, mr->dma, npages, mr->mmr.key, page_shift, virt_addr, len, access_flags);
+
+	mlx5_ib_init_umr_context(&umr_context);
+	down(&umrc->sem);
+	err = ib_post_send(umrc->qp, &wr, &bad);
+	if (err) {
+		mlx5_ib_warn(dev, "post send failed, err %d\n", err);
+		goto unmap_dma;
+	} else {
+		wait_for_completion(&umr_context.done);
+		if (umr_context.status != IB_WC_SUCCESS) {
+			mlx5_ib_warn(dev, "reg umr failed\n");
+			err = -EFAULT;
+		}
+	}
+
+	mr->mmr.iova = virt_addr;
+	mr->mmr.size = len;
+	mr->mmr.pd = to_mpd(pd)->pdn;
+
+unmap_dma:
+	up(&umrc->sem);
+	dma_unmap_single(ddev, mr->dma, size, DMA_TO_DEVICE);
+
+free_pas:
+	kfree(mr->pas);
+
+free_mr:
+	if (err) {
+		free_cached_mr(dev, mr);
+		return ERR_PTR(err);
+	}
+
+	return mr;
+}
+
+static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, u64 virt_addr,
+				     u64 length, struct ib_umem *umem,
+				     int npages, int page_shift,
+				     int access_flags)
+{
+	struct mlx5_ib_dev *dev = to_mdev(pd->device);
+	struct mlx5_create_mkey_mbox_in *in;
+	struct mlx5_ib_mr *mr;
+	int inlen;
+	int err;
+
+	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+	if (!mr)
+		return ERR_PTR(-ENOMEM);
+
+	inlen = sizeof(*in) + sizeof(*in->pas) * ((npages + 1) / 2) * 2;
+	in = mlx5_vzalloc(inlen);
+	if (!in) {
+		err = -ENOMEM;
+		goto err_1;
+	}
+	mlx5_ib_populate_pas(dev, umem, page_shift, in->pas, 0);
+
+	in->seg.flags = convert_access(access_flags) |
+		MLX5_ACCESS_MODE_MTT;
+	in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn);
+	in->seg.start_addr = cpu_to_be64(virt_addr);
+	in->seg.len = cpu_to_be64(length);
+	in->seg.bsfs_octo_size = 0;
+	in->seg.xlt_oct_size = cpu_to_be32(get_octo_len(virt_addr, length, 1 << page_shift));
+	in->seg.log2_page_size = page_shift;
+	in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
+	in->xlat_oct_act_size = cpu_to_be32(get_octo_len(virt_addr, length,
+							 1 << page_shift));
+	err = mlx5_core_create_mkey(dev->mdev, &mr->mmr, in, inlen, NULL,
+				    NULL, NULL);
+	if (err) {
+		mlx5_ib_warn(dev, "create mkey failed\n");
+		goto err_2;
+	}
+	mr->umem = umem;
+	mlx5_vfree(in);
+
+	mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmr.key);
+
+	return mr;
+
+err_2:
+	mlx5_vfree(in);
+
+err_1:
+	kfree(mr);
+
+	return ERR_PTR(err);
+}
+
+struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+				  u64 virt_addr, int access_flags,
+				  struct ib_udata *udata)
+{
+	struct mlx5_ib_dev *dev = to_mdev(pd->device);
+	struct mlx5_ib_mr *mr = NULL;
+	struct ib_umem *umem;
+	int page_shift;
+	int npages;
+	int ncont;
+	int order;
+	int err;
+
+	mlx5_ib_dbg(dev, "start 0x%llx, virt_addr 0x%llx, length 0x%llx, access_flags 0x%x\n",
+		    start, virt_addr, length, access_flags);
+	umem = ib_umem_get(pd->uobject->context, start, length, access_flags,
+			   0);
+	if (IS_ERR(umem)) {
+		mlx5_ib_dbg(dev, "umem get failed (%ld)\n", PTR_ERR(umem));
+		return (void *)umem;
+	}
+
+	mlx5_ib_cont_pages(umem, start, &npages, &page_shift, &ncont, &order);
+	if (!npages) {
+		mlx5_ib_warn(dev, "avoid zero region\n");
+		err = -EINVAL;
+		goto error;
+	}
+
+	mlx5_ib_dbg(dev, "npages %d, ncont %d, order %d, page_shift %d\n",
+		    npages, ncont, order, page_shift);
+
+	if (use_umr(order)) {
+		mr = reg_umr(pd, umem, virt_addr, length, ncont, page_shift,
+			     order, access_flags);
+		if (PTR_ERR(mr) == -EAGAIN) {
+			mlx5_ib_dbg(dev, "cache empty for order %d", order);
+			mr = NULL;
+		}
+	}
+
+	if (!mr)
+		mr = reg_create(pd, virt_addr, length, umem, ncont, page_shift,
+				access_flags);
+
+	if (IS_ERR(mr)) {
+		err = PTR_ERR(mr);
+		goto error;
+	}
+
+	mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmr.key);
+
+	mr->umem = umem;
+	mr->npages = npages;
+	spin_lock(&dev->mr_lock);
+	dev->mdev->priv.reg_pages += npages;
+	spin_unlock(&dev->mr_lock);
+	mr->ibmr.lkey = mr->mmr.key;
+	mr->ibmr.rkey = mr->mmr.key;
+
+	return &mr->ibmr;
+
+error:
+	ib_umem_release(umem);
+	return ERR_PTR(err);
+}
+
+static int unreg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
+{
+	struct umr_common *umrc = &dev->umrc;
+	struct mlx5_ib_umr_context umr_context;
+	struct ib_send_wr wr, *bad;
+	int err;
+
+	memset(&wr, 0, sizeof(wr));
+	wr.wr_id = (u64)(unsigned long)&umr_context;
+	prep_umr_unreg_wqe(dev, &wr, mr->mmr.key);
+
+	mlx5_ib_init_umr_context(&umr_context);
+	down(&umrc->sem);
+	err = ib_post_send(umrc->qp, &wr, &bad);
+	if (err) {
+		up(&umrc->sem);
+		mlx5_ib_dbg(dev, "err %d\n", err);
+		goto error;
+	} else {
+		wait_for_completion(&umr_context.done);
+		up(&umrc->sem);
+	}
+	if (umr_context.status != IB_WC_SUCCESS) {
+		mlx5_ib_warn(dev, "unreg umr failed\n");
+		err = -EFAULT;
+		goto error;
+	}
+	return 0;
+
+error:
+	return err;
+}
+
+int mlx5_ib_dereg_mr(struct ib_mr *ibmr)
+{
+	struct mlx5_ib_dev *dev = to_mdev(ibmr->device);
+	struct mlx5_ib_mr *mr = to_mmr(ibmr);
+	struct ib_umem *umem = mr->umem;
+	int npages = mr->npages;
+	int umred = mr->umred;
+	int err;
+
+	if (!umred) {
+		err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmr);
+		if (err) {
+			mlx5_ib_warn(dev, "failed to destroy mkey 0x%x (%d)\n",
+				     mr->mmr.key, err);
+			return err;
+		}
+	} else {
+		err = unreg_umr(dev, mr);
+		if (err) {
+			mlx5_ib_warn(dev, "failed unregister\n");
+			return err;
+		}
+		free_cached_mr(dev, mr);
+	}
+
+	if (umem) {
+		ib_umem_release(umem);
+		spin_lock(&dev->mr_lock);
+		dev->mdev->priv.reg_pages -= npages;
+		spin_unlock(&dev->mr_lock);
+	}
+
+	if (!umred)
+		kfree(mr);
+
+	return 0;
+}
+
+struct ib_mr *mlx5_ib_create_mr(struct ib_pd *pd,
+				struct ib_mr_init_attr *mr_init_attr)
+{
+	struct mlx5_ib_dev *dev = to_mdev(pd->device);
+	struct mlx5_create_mkey_mbox_in *in;
+	struct mlx5_ib_mr *mr;
+	int access_mode, err;
+	int ndescs = roundup(mr_init_attr->max_reg_descriptors, 4);
+
+	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+	if (!mr)
+		return ERR_PTR(-ENOMEM);
+
+	in = kzalloc(sizeof(*in), GFP_KERNEL);
+	if (!in) {
+		err = -ENOMEM;
+		goto err_free;
+	}
+
+	in->seg.status = 1 << 6; /* free */
+	in->seg.xlt_oct_size = cpu_to_be32(ndescs);
+	in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
+	in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn);
+	access_mode = MLX5_ACCESS_MODE_MTT;
+
+	if (mr_init_attr->flags & IB_MR_SIGNATURE_EN) {
+		u32 psv_index[2];
+
+		in->seg.flags_pd = cpu_to_be32(be32_to_cpu(in->seg.flags_pd) |
+							   MLX5_MKEY_BSF_EN);
+		in->seg.bsfs_octo_size = cpu_to_be32(MLX5_MKEY_BSF_OCTO_SIZE);
+		mr->sig = kzalloc(sizeof(*mr->sig), GFP_KERNEL);
+		if (!mr->sig) {
+			err = -ENOMEM;
+			goto err_free_in;
+		}
+
+		/* create mem & wire PSVs */
+		err = mlx5_core_create_psv(dev->mdev, to_mpd(pd)->pdn,
+					   2, psv_index);
+		if (err)
+			goto err_free_sig;
+
+		access_mode = MLX5_ACCESS_MODE_KLM;
+		mr->sig->psv_memory.psv_idx = psv_index[0];
+		mr->sig->psv_wire.psv_idx = psv_index[1];
+
+		mr->sig->sig_status_checked = true;
+		mr->sig->sig_err_exists = false;
+		/* Next UMR, Arm SIGERR */
+		++mr->sig->sigerr_count;
+	}
+
+	in->seg.flags = MLX5_PERM_UMR_EN | access_mode;
+	err = mlx5_core_create_mkey(dev->mdev, &mr->mmr, in, sizeof(*in),
+				    NULL, NULL, NULL);
+	if (err)
+		goto err_destroy_psv;
+
+	mr->ibmr.lkey = mr->mmr.key;
+	mr->ibmr.rkey = mr->mmr.key;
+	mr->umem = NULL;
+	kfree(in);
+
+	return &mr->ibmr;
+
+err_destroy_psv:
+	if (mr->sig) {
+		if (mlx5_core_destroy_psv(dev->mdev,
+					  mr->sig->psv_memory.psv_idx))
+			mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
+				     mr->sig->psv_memory.psv_idx);
+		if (mlx5_core_destroy_psv(dev->mdev,
+					  mr->sig->psv_wire.psv_idx))
+			mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
+				     mr->sig->psv_wire.psv_idx);
+	}
+err_free_sig:
+	kfree(mr->sig);
+err_free_in:
+	kfree(in);
+err_free:
+	kfree(mr);
+	return ERR_PTR(err);
+}
+
+int mlx5_ib_destroy_mr(struct ib_mr *ibmr)
+{
+	struct mlx5_ib_dev *dev = to_mdev(ibmr->device);
+	struct mlx5_ib_mr *mr = to_mmr(ibmr);
+	int err;
+
+	if (mr->sig) {
+		if (mlx5_core_destroy_psv(dev->mdev,
+					  mr->sig->psv_memory.psv_idx))
+			mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
+				     mr->sig->psv_memory.psv_idx);
+		if (mlx5_core_destroy_psv(dev->mdev,
+					  mr->sig->psv_wire.psv_idx))
+			mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
+				     mr->sig->psv_wire.psv_idx);
+		kfree(mr->sig);
+	}
+
+	err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmr);
+	if (err) {
+		mlx5_ib_warn(dev, "failed to destroy mkey 0x%x (%d)\n",
+			     mr->mmr.key, err);
+		return err;
+	}
+
+	kfree(mr);
+
+	return err;
+}
+
+struct ib_mr *mlx5_ib_alloc_fast_reg_mr(struct ib_pd *pd,
+					int max_page_list_len)
+{
+	struct mlx5_ib_dev *dev = to_mdev(pd->device);
+	struct mlx5_create_mkey_mbox_in *in;
+	struct mlx5_ib_mr *mr;
+	int err;
+
+	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+	if (!mr)
+		return ERR_PTR(-ENOMEM);
+
+	in = kzalloc(sizeof(*in), GFP_KERNEL);
+	if (!in) {
+		err = -ENOMEM;
+		goto err_free;
+	}
+
+	in->seg.status = 1 << 6; /* free */
+	in->seg.xlt_oct_size = cpu_to_be32((max_page_list_len + 1) / 2);
+	in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
+	in->seg.flags = MLX5_PERM_UMR_EN | MLX5_ACCESS_MODE_MTT;
+	in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn);
+	/*
+	 * TBD not needed - issue 197292 */
+	in->seg.log2_page_size = PAGE_SHIFT;
+
+	err = mlx5_core_create_mkey(dev->mdev, &mr->mmr, in, sizeof(*in), NULL,
+				    NULL, NULL);
+	kfree(in);
+	if (err)
+		goto err_free;
+
+	mr->ibmr.lkey = mr->mmr.key;
+	mr->ibmr.rkey = mr->mmr.key;
+	mr->umem = NULL;
+
+	return &mr->ibmr;
+
+err_free:
+	kfree(mr);
+	return ERR_PTR(err);
+}
+
+struct ib_fast_reg_page_list *mlx5_ib_alloc_fast_reg_page_list(struct ib_device *ibdev,
+							       int page_list_len)
+{
+	struct mlx5_ib_fast_reg_page_list *mfrpl;
+	int size = page_list_len * sizeof(u64);
+
+	mfrpl = kmalloc(sizeof(*mfrpl), GFP_KERNEL);
+	if (!mfrpl)
+		return ERR_PTR(-ENOMEM);
+
+	mfrpl->ibfrpl.page_list = kmalloc(size, GFP_KERNEL);
+	if (!mfrpl->ibfrpl.page_list)
+		goto err_free;
+
+	mfrpl->mapped_page_list = dma_alloc_coherent(ibdev->dma_device,
+						     size, &mfrpl->map,
+						     GFP_KERNEL);
+	if (!mfrpl->mapped_page_list)
+		goto err_free;
+
+	WARN_ON(mfrpl->map & 0x3f);
+
+	return &mfrpl->ibfrpl;
+
+err_free:
+	kfree(mfrpl->ibfrpl.page_list);
+	kfree(mfrpl);
+	return ERR_PTR(-ENOMEM);
+}
+
+void mlx5_ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list)
+{
+	struct mlx5_ib_fast_reg_page_list *mfrpl = to_mfrpl(page_list);
+	struct mlx5_ib_dev *dev = to_mdev(page_list->device);
+	int size = page_list->max_page_list_len * sizeof(u64);
+
+	dma_free_coherent(&dev->mdev->pdev->dev, size, mfrpl->mapped_page_list,
+			  mfrpl->map);
+	kfree(mfrpl->ibfrpl.page_list);
+	kfree(mfrpl);
+}
+
+int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
+			    struct ib_mr_status *mr_status)
+{
+	struct mlx5_ib_mr *mmr = to_mmr(ibmr);
+	int ret = 0;
+
+	if (check_mask & ~IB_MR_CHECK_SIG_STATUS) {
+		pr_err("Invalid status check mask\n");
+		ret = -EINVAL;
+		goto done;
+	}
+
+	mr_status->fail_status = 0;
+	if (check_mask & IB_MR_CHECK_SIG_STATUS) {
+		if (!mmr->sig) {
+			ret = -EINVAL;
+			pr_err("signature status check requested on a non-signature enabled MR\n");
+			goto done;
+		}
+
+		mmr->sig->sig_status_checked = true;
+		if (!mmr->sig->sig_err_exists)
+			goto done;
+
+		if (ibmr->lkey == mmr->sig->err_item.key)
+			memcpy(&mr_status->sig_err, &mmr->sig->err_item,
+			       sizeof(mr_status->sig_err));
+		else {
+			mr_status->sig_err.err_type = IB_SIG_BAD_GUARD;
+			mr_status->sig_err.sig_err_offset = 0;
+			mr_status->sig_err.key = mmr->sig->err_item.key;
+		}
+
+		mmr->sig->sig_err_exists = false;
+		mr_status->fail_status |= IB_MR_CHECK_SIG_STATUS;
+	}
+
+done:
+	return ret;
+}
diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
new file mode 100644
index 0000000..e261a53
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -0,0 +1,3039 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <rdma/ib_umem.h>
+#include "mlx5_ib.h"
+#include "user.h"
+
+/* not supported currently */
+static int wq_signature;
+
+enum {
+	MLX5_IB_ACK_REQ_FREQ	= 8,
+};
+
+enum {
+	MLX5_IB_DEFAULT_SCHED_QUEUE	= 0x83,
+	MLX5_IB_DEFAULT_QP0_SCHED_QUEUE	= 0x3f,
+	MLX5_IB_LINK_TYPE_IB		= 0,
+	MLX5_IB_LINK_TYPE_ETH		= 1
+};
+
+enum {
+	MLX5_IB_SQ_STRIDE	= 6,
+	MLX5_IB_CACHE_LINE_SIZE	= 64,
+};
+
+static const u32 mlx5_ib_opcode[] = {
+	[IB_WR_SEND]				= MLX5_OPCODE_SEND,
+	[IB_WR_SEND_WITH_IMM]			= MLX5_OPCODE_SEND_IMM,
+	[IB_WR_RDMA_WRITE]			= MLX5_OPCODE_RDMA_WRITE,
+	[IB_WR_RDMA_WRITE_WITH_IMM]		= MLX5_OPCODE_RDMA_WRITE_IMM,
+	[IB_WR_RDMA_READ]			= MLX5_OPCODE_RDMA_READ,
+	[IB_WR_ATOMIC_CMP_AND_SWP]		= MLX5_OPCODE_ATOMIC_CS,
+	[IB_WR_ATOMIC_FETCH_AND_ADD]		= MLX5_OPCODE_ATOMIC_FA,
+	[IB_WR_SEND_WITH_INV]			= MLX5_OPCODE_SEND_INVAL,
+	[IB_WR_LOCAL_INV]			= MLX5_OPCODE_UMR,
+	[IB_WR_FAST_REG_MR]			= MLX5_OPCODE_UMR,
+	[IB_WR_MASKED_ATOMIC_CMP_AND_SWP]	= MLX5_OPCODE_ATOMIC_MASKED_CS,
+	[IB_WR_MASKED_ATOMIC_FETCH_AND_ADD]	= MLX5_OPCODE_ATOMIC_MASKED_FA,
+	[MLX5_IB_WR_UMR]			= MLX5_OPCODE_UMR,
+};
+
+struct umr_wr {
+	u64				virt_addr;
+	struct ib_pd		       *pd;
+	unsigned int			page_shift;
+	unsigned int			npages;
+	u32				length;
+	int				access_flags;
+	u32				mkey;
+};
+
+static int is_qp0(enum ib_qp_type qp_type)
+{
+	return qp_type == IB_QPT_SMI;
+}
+
+static int is_qp1(enum ib_qp_type qp_type)
+{
+	return qp_type == IB_QPT_GSI;
+}
+
+static int is_sqp(enum ib_qp_type qp_type)
+{
+	return is_qp0(qp_type) || is_qp1(qp_type);
+}
+
+static void *get_wqe(struct mlx5_ib_qp *qp, int offset)
+{
+	return mlx5_buf_offset(&qp->buf, offset);
+}
+
+static void *get_recv_wqe(struct mlx5_ib_qp *qp, int n)
+{
+	return get_wqe(qp, qp->rq.offset + (n << qp->rq.wqe_shift));
+}
+
+void *mlx5_get_send_wqe(struct mlx5_ib_qp *qp, int n)
+{
+	return get_wqe(qp, qp->sq.offset + (n << MLX5_IB_SQ_STRIDE));
+}
+
+static void mlx5_ib_qp_event(struct mlx5_core_qp *qp, int type)
+{
+	struct ib_qp *ibqp = &to_mibqp(qp)->ibqp;
+	struct ib_event event;
+
+	if (type == MLX5_EVENT_TYPE_PATH_MIG)
+		to_mibqp(qp)->port = to_mibqp(qp)->alt_port;
+
+	if (ibqp->event_handler) {
+		event.device     = ibqp->device;
+		event.element.qp = ibqp;
+		switch (type) {
+		case MLX5_EVENT_TYPE_PATH_MIG:
+			event.event = IB_EVENT_PATH_MIG;
+			break;
+		case MLX5_EVENT_TYPE_COMM_EST:
+			event.event = IB_EVENT_COMM_EST;
+			break;
+		case MLX5_EVENT_TYPE_SQ_DRAINED:
+			event.event = IB_EVENT_SQ_DRAINED;
+			break;
+		case MLX5_EVENT_TYPE_SRQ_LAST_WQE:
+			event.event = IB_EVENT_QP_LAST_WQE_REACHED;
+			break;
+		case MLX5_EVENT_TYPE_WQ_CATAS_ERROR:
+			event.event = IB_EVENT_QP_FATAL;
+			break;
+		case MLX5_EVENT_TYPE_PATH_MIG_FAILED:
+			event.event = IB_EVENT_PATH_MIG_ERR;
+			break;
+		case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
+			event.event = IB_EVENT_QP_REQ_ERR;
+			break;
+		case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR:
+			event.event = IB_EVENT_QP_ACCESS_ERR;
+			break;
+		default:
+			pr_warn("mlx5_ib: Unexpected event type %d on QP %06x\n", type, qp->qpn);
+			return;
+		}
+
+		ibqp->event_handler(&event, ibqp->qp_context);
+	}
+}
+
+static int set_rq_size(struct mlx5_ib_dev *dev, struct ib_qp_cap *cap,
+		       int has_rq, struct mlx5_ib_qp *qp, struct mlx5_ib_create_qp *ucmd)
+{
+	struct mlx5_general_caps *gen;
+	int wqe_size;
+	int wq_size;
+
+	gen = &dev->mdev->caps.gen;
+	/* Sanity check RQ size before proceeding */
+	if (cap->max_recv_wr  > gen->max_wqes)
+		return -EINVAL;
+
+	if (!has_rq) {
+		qp->rq.max_gs = 0;
+		qp->rq.wqe_cnt = 0;
+		qp->rq.wqe_shift = 0;
+	} else {
+		if (ucmd) {
+			qp->rq.wqe_cnt = ucmd->rq_wqe_count;
+			qp->rq.wqe_shift = ucmd->rq_wqe_shift;
+			qp->rq.max_gs = (1 << qp->rq.wqe_shift) / sizeof(struct mlx5_wqe_data_seg) - qp->wq_sig;
+			qp->rq.max_post = qp->rq.wqe_cnt;
+		} else {
+			wqe_size = qp->wq_sig ? sizeof(struct mlx5_wqe_signature_seg) : 0;
+			wqe_size += cap->max_recv_sge * sizeof(struct mlx5_wqe_data_seg);
+			wqe_size = roundup_pow_of_two(wqe_size);
+			wq_size = roundup_pow_of_two(cap->max_recv_wr) * wqe_size;
+			wq_size = max_t(int, wq_size, MLX5_SEND_WQE_BB);
+			qp->rq.wqe_cnt = wq_size / wqe_size;
+			if (wqe_size > gen->max_rq_desc_sz) {
+				mlx5_ib_dbg(dev, "wqe_size %d, max %d\n",
+					    wqe_size,
+					    gen->max_rq_desc_sz);
+				return -EINVAL;
+			}
+			qp->rq.wqe_shift = ilog2(wqe_size);
+			qp->rq.max_gs = (1 << qp->rq.wqe_shift) / sizeof(struct mlx5_wqe_data_seg) - qp->wq_sig;
+			qp->rq.max_post = qp->rq.wqe_cnt;
+		}
+	}
+
+	return 0;
+}
+
+static int sq_overhead(enum ib_qp_type qp_type)
+{
+	int size = 0;
+
+	switch (qp_type) {
+	case IB_QPT_XRC_INI:
+		size += sizeof(struct mlx5_wqe_xrc_seg);
+		/* fall through */
+	case IB_QPT_RC:
+		size += sizeof(struct mlx5_wqe_ctrl_seg) +
+			sizeof(struct mlx5_wqe_atomic_seg) +
+			sizeof(struct mlx5_wqe_raddr_seg);
+		break;
+
+	case IB_QPT_XRC_TGT:
+		return 0;
+
+	case IB_QPT_UC:
+		size += sizeof(struct mlx5_wqe_ctrl_seg) +
+			sizeof(struct mlx5_wqe_raddr_seg) +
+			sizeof(struct mlx5_wqe_umr_ctrl_seg) +
+			sizeof(struct mlx5_mkey_seg);
+		break;
+
+	case IB_QPT_UD:
+	case IB_QPT_SMI:
+	case IB_QPT_GSI:
+		size += sizeof(struct mlx5_wqe_ctrl_seg) +
+			sizeof(struct mlx5_wqe_datagram_seg);
+		break;
+
+	case MLX5_IB_QPT_REG_UMR:
+		size += sizeof(struct mlx5_wqe_ctrl_seg) +
+			sizeof(struct mlx5_wqe_umr_ctrl_seg) +
+			sizeof(struct mlx5_mkey_seg);
+		break;
+
+	default:
+		return -EINVAL;
+	}
+
+	return size;
+}
+
+static int calc_send_wqe(struct ib_qp_init_attr *attr)
+{
+	int inl_size = 0;
+	int size;
+
+	size = sq_overhead(attr->qp_type);
+	if (size < 0)
+		return size;
+
+	if (attr->cap.max_inline_data) {
+		inl_size = size + sizeof(struct mlx5_wqe_inline_seg) +
+			attr->cap.max_inline_data;
+	}
+
+	size += attr->cap.max_send_sge * sizeof(struct mlx5_wqe_data_seg);
+	if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN &&
+	    ALIGN(max_t(int, inl_size, size), MLX5_SEND_WQE_BB) < MLX5_SIG_WQE_SIZE)
+			return MLX5_SIG_WQE_SIZE;
+	else
+		return ALIGN(max_t(int, inl_size, size), MLX5_SEND_WQE_BB);
+}
+
+static int calc_sq_size(struct mlx5_ib_dev *dev, struct ib_qp_init_attr *attr,
+			struct mlx5_ib_qp *qp)
+{
+	struct mlx5_general_caps *gen;
+	int wqe_size;
+	int wq_size;
+
+	gen = &dev->mdev->caps.gen;
+	if (!attr->cap.max_send_wr)
+		return 0;
+
+	wqe_size = calc_send_wqe(attr);
+	mlx5_ib_dbg(dev, "wqe_size %d\n", wqe_size);
+	if (wqe_size < 0)
+		return wqe_size;
+
+	if (wqe_size > gen->max_sq_desc_sz) {
+		mlx5_ib_dbg(dev, "wqe_size(%d) > max_sq_desc_sz(%d)\n",
+			    wqe_size, gen->max_sq_desc_sz);
+		return -EINVAL;
+	}
+
+	qp->max_inline_data = wqe_size - sq_overhead(attr->qp_type) -
+		sizeof(struct mlx5_wqe_inline_seg);
+	attr->cap.max_inline_data = qp->max_inline_data;
+
+	if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN)
+		qp->signature_en = true;
+
+	wq_size = roundup_pow_of_two(attr->cap.max_send_wr * wqe_size);
+	qp->sq.wqe_cnt = wq_size / MLX5_SEND_WQE_BB;
+	if (qp->sq.wqe_cnt > gen->max_wqes) {
+		mlx5_ib_dbg(dev, "wqe count(%d) exceeds limits(%d)\n",
+			    qp->sq.wqe_cnt, gen->max_wqes);
+		return -ENOMEM;
+	}
+	qp->sq.wqe_shift = ilog2(MLX5_SEND_WQE_BB);
+	qp->sq.max_gs = attr->cap.max_send_sge;
+	qp->sq.max_post = wq_size / wqe_size;
+	attr->cap.max_send_wr = qp->sq.max_post;
+
+	return wq_size;
+}
+
+static int set_user_buf_size(struct mlx5_ib_dev *dev,
+			    struct mlx5_ib_qp *qp,
+			    struct mlx5_ib_create_qp *ucmd)
+{
+	struct mlx5_general_caps *gen;
+	int desc_sz = 1 << qp->sq.wqe_shift;
+
+	gen = &dev->mdev->caps.gen;
+	if (desc_sz > gen->max_sq_desc_sz) {
+		mlx5_ib_warn(dev, "desc_sz %d, max_sq_desc_sz %d\n",
+			     desc_sz, gen->max_sq_desc_sz);
+		return -EINVAL;
+	}
+
+	if (ucmd->sq_wqe_count && ((1 << ilog2(ucmd->sq_wqe_count)) != ucmd->sq_wqe_count)) {
+		mlx5_ib_warn(dev, "sq_wqe_count %d, sq_wqe_count %d\n",
+			     ucmd->sq_wqe_count, ucmd->sq_wqe_count);
+		return -EINVAL;
+	}
+
+	qp->sq.wqe_cnt = ucmd->sq_wqe_count;
+
+	if (qp->sq.wqe_cnt > gen->max_wqes) {
+		mlx5_ib_warn(dev, "wqe_cnt %d, max_wqes %d\n",
+			     qp->sq.wqe_cnt, gen->max_wqes);
+		return -EINVAL;
+	}
+
+	qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
+		(qp->sq.wqe_cnt << 6);
+
+	return 0;
+}
+
+static int qp_has_rq(struct ib_qp_init_attr *attr)
+{
+	if (attr->qp_type == IB_QPT_XRC_INI ||
+	    attr->qp_type == IB_QPT_XRC_TGT || attr->srq ||
+	    attr->qp_type == MLX5_IB_QPT_REG_UMR ||
+	    !attr->cap.max_recv_wr)
+		return 0;
+
+	return 1;
+}
+
+static int first_med_uuar(void)
+{
+	return 1;
+}
+
+static int next_uuar(int n)
+{
+	n++;
+
+	while (((n % 4) & 2))
+		n++;
+
+	return n;
+}
+
+static int num_med_uuar(struct mlx5_uuar_info *uuari)
+{
+	int n;
+
+	n = uuari->num_uars * MLX5_NON_FP_BF_REGS_PER_PAGE -
+		uuari->num_low_latency_uuars - 1;
+
+	return n >= 0 ? n : 0;
+}
+
+static int max_uuari(struct mlx5_uuar_info *uuari)
+{
+	return uuari->num_uars * 4;
+}
+
+static int first_hi_uuar(struct mlx5_uuar_info *uuari)
+{
+	int med;
+	int i;
+	int t;
+
+	med = num_med_uuar(uuari);
+	for (t = 0, i = first_med_uuar();; i = next_uuar(i)) {
+		t++;
+		if (t == med)
+			return next_uuar(i);
+	}
+
+	return 0;
+}
+
+static int alloc_high_class_uuar(struct mlx5_uuar_info *uuari)
+{
+	int i;
+
+	for (i = first_hi_uuar(uuari); i < max_uuari(uuari); i = next_uuar(i)) {
+		if (!test_bit(i, uuari->bitmap)) {
+			set_bit(i, uuari->bitmap);
+			uuari->count[i]++;
+			return i;
+		}
+	}
+
+	return -ENOMEM;
+}
+
+static int alloc_med_class_uuar(struct mlx5_uuar_info *uuari)
+{
+	int minidx = first_med_uuar();
+	int i;
+
+	for (i = first_med_uuar(); i < first_hi_uuar(uuari); i = next_uuar(i)) {
+		if (uuari->count[i] < uuari->count[minidx])
+			minidx = i;
+	}
+
+	uuari->count[minidx]++;
+	return minidx;
+}
+
+static int alloc_uuar(struct mlx5_uuar_info *uuari,
+		      enum mlx5_ib_latency_class lat)
+{
+	int uuarn = -EINVAL;
+
+	mutex_lock(&uuari->lock);
+	switch (lat) {
+	case MLX5_IB_LATENCY_CLASS_LOW:
+		uuarn = 0;
+		uuari->count[uuarn]++;
+		break;
+
+	case MLX5_IB_LATENCY_CLASS_MEDIUM:
+		if (uuari->ver < 2)
+			uuarn = -ENOMEM;
+		else
+			uuarn = alloc_med_class_uuar(uuari);
+		break;
+
+	case MLX5_IB_LATENCY_CLASS_HIGH:
+		if (uuari->ver < 2)
+			uuarn = -ENOMEM;
+		else
+			uuarn = alloc_high_class_uuar(uuari);
+		break;
+
+	case MLX5_IB_LATENCY_CLASS_FAST_PATH:
+		uuarn = 2;
+		break;
+	}
+	mutex_unlock(&uuari->lock);
+
+	return uuarn;
+}
+
+static void free_med_class_uuar(struct mlx5_uuar_info *uuari, int uuarn)
+{
+	clear_bit(uuarn, uuari->bitmap);
+	--uuari->count[uuarn];
+}
+
+static void free_high_class_uuar(struct mlx5_uuar_info *uuari, int uuarn)
+{
+	clear_bit(uuarn, uuari->bitmap);
+	--uuari->count[uuarn];
+}
+
+static void free_uuar(struct mlx5_uuar_info *uuari, int uuarn)
+{
+	int nuuars = uuari->num_uars * MLX5_BF_REGS_PER_PAGE;
+	int high_uuar = nuuars - uuari->num_low_latency_uuars;
+
+	mutex_lock(&uuari->lock);
+	if (uuarn == 0) {
+		--uuari->count[uuarn];
+		goto out;
+	}
+
+	if (uuarn < high_uuar) {
+		free_med_class_uuar(uuari, uuarn);
+		goto out;
+	}
+
+	free_high_class_uuar(uuari, uuarn);
+
+out:
+	mutex_unlock(&uuari->lock);
+}
+
+static enum mlx5_qp_state to_mlx5_state(enum ib_qp_state state)
+{
+	switch (state) {
+	case IB_QPS_RESET:	return MLX5_QP_STATE_RST;
+	case IB_QPS_INIT:	return MLX5_QP_STATE_INIT;
+	case IB_QPS_RTR:	return MLX5_QP_STATE_RTR;
+	case IB_QPS_RTS:	return MLX5_QP_STATE_RTS;
+	case IB_QPS_SQD:	return MLX5_QP_STATE_SQD;
+	case IB_QPS_SQE:	return MLX5_QP_STATE_SQER;
+	case IB_QPS_ERR:	return MLX5_QP_STATE_ERR;
+	default:		return -1;
+	}
+}
+
+static int to_mlx5_st(enum ib_qp_type type)
+{
+	switch (type) {
+	case IB_QPT_RC:			return MLX5_QP_ST_RC;
+	case IB_QPT_UC:			return MLX5_QP_ST_UC;
+	case IB_QPT_UD:			return MLX5_QP_ST_UD;
+	case MLX5_IB_QPT_REG_UMR:	return MLX5_QP_ST_REG_UMR;
+	case IB_QPT_XRC_INI:
+	case IB_QPT_XRC_TGT:		return MLX5_QP_ST_XRC;
+	case IB_QPT_SMI:		return MLX5_QP_ST_QP0;
+	case IB_QPT_GSI:		return MLX5_QP_ST_QP1;
+	case IB_QPT_RAW_IPV6:		return MLX5_QP_ST_RAW_IPV6;
+	case IB_QPT_RAW_ETHERTYPE:	return MLX5_QP_ST_RAW_ETHERTYPE;
+	case IB_QPT_RAW_PACKET:
+	case IB_QPT_MAX:
+	default:		return -EINVAL;
+	}
+}
+
+static int uuarn_to_uar_index(struct mlx5_uuar_info *uuari, int uuarn)
+{
+	return uuari->uars[uuarn / MLX5_BF_REGS_PER_PAGE].index;
+}
+
+static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd,
+			  struct mlx5_ib_qp *qp, struct ib_udata *udata,
+			  struct mlx5_create_qp_mbox_in **in,
+			  struct mlx5_ib_create_qp_resp *resp, int *inlen)
+{
+	struct mlx5_ib_ucontext *context;
+	struct mlx5_ib_create_qp ucmd;
+	int page_shift = 0;
+	int uar_index;
+	int npages;
+	u32 offset = 0;
+	int uuarn;
+	int ncont = 0;
+	int err;
+
+	err = ib_copy_from_udata(&ucmd, udata, sizeof(ucmd));
+	if (err) {
+		mlx5_ib_dbg(dev, "copy failed\n");
+		return err;
+	}
+
+	context = to_mucontext(pd->uobject->context);
+	/*
+	 * TBD: should come from the verbs when we have the API
+	 */
+	uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_HIGH);
+	if (uuarn < 0) {
+		mlx5_ib_dbg(dev, "failed to allocate low latency UUAR\n");
+		mlx5_ib_dbg(dev, "reverting to medium latency\n");
+		uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_MEDIUM);
+		if (uuarn < 0) {
+			mlx5_ib_dbg(dev, "failed to allocate medium latency UUAR\n");
+			mlx5_ib_dbg(dev, "reverting to high latency\n");
+			uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_LOW);
+			if (uuarn < 0) {
+				mlx5_ib_warn(dev, "uuar allocation failed\n");
+				return uuarn;
+			}
+		}
+	}
+
+	uar_index = uuarn_to_uar_index(&context->uuari, uuarn);
+	mlx5_ib_dbg(dev, "uuarn 0x%x, uar_index 0x%x\n", uuarn, uar_index);
+
+	qp->rq.offset = 0;
+	qp->sq.wqe_shift = ilog2(MLX5_SEND_WQE_BB);
+	qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift;
+
+	err = set_user_buf_size(dev, qp, &ucmd);
+	if (err)
+		goto err_uuar;
+
+	if (ucmd.buf_addr && qp->buf_size) {
+		qp->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr,
+				       qp->buf_size, 0, 0);
+		if (IS_ERR(qp->umem)) {
+			mlx5_ib_dbg(dev, "umem_get failed\n");
+			err = PTR_ERR(qp->umem);
+			goto err_uuar;
+		}
+	} else {
+		qp->umem = NULL;
+	}
+
+	if (qp->umem) {
+		mlx5_ib_cont_pages(qp->umem, ucmd.buf_addr, &npages, &page_shift,
+				   &ncont, NULL);
+		err = mlx5_ib_get_buf_offset(ucmd.buf_addr, page_shift, &offset);
+		if (err) {
+			mlx5_ib_warn(dev, "bad offset\n");
+			goto err_umem;
+		}
+		mlx5_ib_dbg(dev, "addr 0x%llx, size %d, npages %d, page_shift %d, ncont %d, offset %d\n",
+			    ucmd.buf_addr, qp->buf_size, npages, page_shift, ncont, offset);
+	}
+
+	*inlen = sizeof(**in) + sizeof(*(*in)->pas) * ncont;
+	*in = mlx5_vzalloc(*inlen);
+	if (!*in) {
+		err = -ENOMEM;
+		goto err_umem;
+	}
+	if (qp->umem)
+		mlx5_ib_populate_pas(dev, qp->umem, page_shift, (*in)->pas, 0);
+	(*in)->ctx.log_pg_sz_remote_qpn =
+		cpu_to_be32((page_shift - MLX5_ADAPTER_PAGE_SHIFT) << 24);
+	(*in)->ctx.params2 = cpu_to_be32(offset << 6);
+
+	(*in)->ctx.qp_counter_set_usr_page = cpu_to_be32(uar_index);
+	resp->uuar_index = uuarn;
+	qp->uuarn = uuarn;
+
+	err = mlx5_ib_db_map_user(context, ucmd.db_addr, &qp->db);
+	if (err) {
+		mlx5_ib_dbg(dev, "map failed\n");
+		goto err_free;
+	}
+
+	err = ib_copy_to_udata(udata, resp, sizeof(*resp));
+	if (err) {
+		mlx5_ib_dbg(dev, "copy failed\n");
+		goto err_unmap;
+	}
+	qp->create_type = MLX5_QP_USER;
+
+	return 0;
+
+err_unmap:
+	mlx5_ib_db_unmap_user(context, &qp->db);
+
+err_free:
+	mlx5_vfree(*in);
+
+err_umem:
+	if (qp->umem)
+		ib_umem_release(qp->umem);
+
+err_uuar:
+	free_uuar(&context->uuari, uuarn);
+	return err;
+}
+
+static void destroy_qp_user(struct ib_pd *pd, struct mlx5_ib_qp *qp)
+{
+	struct mlx5_ib_ucontext *context;
+
+	context = to_mucontext(pd->uobject->context);
+	mlx5_ib_db_unmap_user(context, &qp->db);
+	if (qp->umem)
+		ib_umem_release(qp->umem);
+	free_uuar(&context->uuari, qp->uuarn);
+}
+
+static int create_kernel_qp(struct mlx5_ib_dev *dev,
+			    struct ib_qp_init_attr *init_attr,
+			    struct mlx5_ib_qp *qp,
+			    struct mlx5_create_qp_mbox_in **in, int *inlen)
+{
+	enum mlx5_ib_latency_class lc = MLX5_IB_LATENCY_CLASS_LOW;
+	struct mlx5_uuar_info *uuari;
+	int uar_index;
+	int uuarn;
+	int err;
+
+	uuari = &dev->mdev->priv.uuari;
+	if (init_attr->create_flags & ~(IB_QP_CREATE_SIGNATURE_EN | IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK))
+		return -EINVAL;
+
+	if (init_attr->qp_type == MLX5_IB_QPT_REG_UMR)
+		lc = MLX5_IB_LATENCY_CLASS_FAST_PATH;
+
+	uuarn = alloc_uuar(uuari, lc);
+	if (uuarn < 0) {
+		mlx5_ib_dbg(dev, "\n");
+		return -ENOMEM;
+	}
+
+	qp->bf = &uuari->bfs[uuarn];
+	uar_index = qp->bf->uar->index;
+
+	err = calc_sq_size(dev, init_attr, qp);
+	if (err < 0) {
+		mlx5_ib_dbg(dev, "err %d\n", err);
+		goto err_uuar;
+	}
+
+	qp->rq.offset = 0;
+	qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift;
+	qp->buf_size = err + (qp->rq.wqe_cnt << qp->rq.wqe_shift);
+
+	err = mlx5_buf_alloc(dev->mdev, qp->buf_size, PAGE_SIZE * 2, &qp->buf);
+	if (err) {
+		mlx5_ib_dbg(dev, "err %d\n", err);
+		goto err_uuar;
+	}
+
+	qp->sq.qend = mlx5_get_send_wqe(qp, qp->sq.wqe_cnt);
+	*inlen = sizeof(**in) + sizeof(*(*in)->pas) * qp->buf.npages;
+	*in = mlx5_vzalloc(*inlen);
+	if (!*in) {
+		err = -ENOMEM;
+		goto err_buf;
+	}
+	(*in)->ctx.qp_counter_set_usr_page = cpu_to_be32(uar_index);
+	(*in)->ctx.log_pg_sz_remote_qpn =
+		cpu_to_be32((qp->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT) << 24);
+	/* Set "fast registration enabled" for all kernel QPs */
+	(*in)->ctx.params1 |= cpu_to_be32(1 << 11);
+	(*in)->ctx.sq_crq_size |= cpu_to_be16(1 << 4);
+
+	mlx5_fill_page_array(&qp->buf, (*in)->pas);
+
+	err = mlx5_db_alloc(dev->mdev, &qp->db);
+	if (err) {
+		mlx5_ib_dbg(dev, "err %d\n", err);
+		goto err_free;
+	}
+
+	qp->db.db[0] = 0;
+	qp->db.db[1] = 0;
+
+	qp->sq.wrid = kmalloc(qp->sq.wqe_cnt * sizeof(*qp->sq.wrid), GFP_KERNEL);
+	qp->sq.wr_data = kmalloc(qp->sq.wqe_cnt * sizeof(*qp->sq.wr_data), GFP_KERNEL);
+	qp->rq.wrid = kmalloc(qp->rq.wqe_cnt * sizeof(*qp->rq.wrid), GFP_KERNEL);
+	qp->sq.w_list = kmalloc(qp->sq.wqe_cnt * sizeof(*qp->sq.w_list), GFP_KERNEL);
+	qp->sq.wqe_head = kmalloc(qp->sq.wqe_cnt * sizeof(*qp->sq.wqe_head), GFP_KERNEL);
+
+	if (!qp->sq.wrid || !qp->sq.wr_data || !qp->rq.wrid ||
+	    !qp->sq.w_list || !qp->sq.wqe_head) {
+		err = -ENOMEM;
+		goto err_wrid;
+	}
+	qp->create_type = MLX5_QP_KERNEL;
+
+	return 0;
+
+err_wrid:
+	mlx5_db_free(dev->mdev, &qp->db);
+	kfree(qp->sq.wqe_head);
+	kfree(qp->sq.w_list);
+	kfree(qp->sq.wrid);
+	kfree(qp->sq.wr_data);
+	kfree(qp->rq.wrid);
+
+err_free:
+	mlx5_vfree(*in);
+
+err_buf:
+	mlx5_buf_free(dev->mdev, &qp->buf);
+
+err_uuar:
+	free_uuar(&dev->mdev->priv.uuari, uuarn);
+	return err;
+}
+
+static void destroy_qp_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp)
+{
+	mlx5_db_free(dev->mdev, &qp->db);
+	kfree(qp->sq.wqe_head);
+	kfree(qp->sq.w_list);
+	kfree(qp->sq.wrid);
+	kfree(qp->sq.wr_data);
+	kfree(qp->rq.wrid);
+	mlx5_buf_free(dev->mdev, &qp->buf);
+	free_uuar(&dev->mdev->priv.uuari, qp->bf->uuarn);
+}
+
+static __be32 get_rx_type(struct mlx5_ib_qp *qp, struct ib_qp_init_attr *attr)
+{
+	if (attr->srq || (attr->qp_type == IB_QPT_XRC_TGT) ||
+	    (attr->qp_type == IB_QPT_XRC_INI))
+		return cpu_to_be32(MLX5_SRQ_RQ);
+	else if (!qp->has_rq)
+		return cpu_to_be32(MLX5_ZERO_LEN_RQ);
+	else
+		return cpu_to_be32(MLX5_NON_ZERO_RQ);
+}
+
+static int is_connected(enum ib_qp_type qp_type)
+{
+	if (qp_type == IB_QPT_RC || qp_type == IB_QPT_UC)
+		return 1;
+
+	return 0;
+}
+
+static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd,
+			    struct ib_qp_init_attr *init_attr,
+			    struct ib_udata *udata, struct mlx5_ib_qp *qp)
+{
+	struct mlx5_ib_resources *devr = &dev->devr;
+	struct mlx5_ib_create_qp_resp resp;
+	struct mlx5_create_qp_mbox_in *in;
+	struct mlx5_general_caps *gen;
+	struct mlx5_ib_create_qp ucmd;
+	int inlen = sizeof(*in);
+	int err;
+
+	gen = &dev->mdev->caps.gen;
+	mutex_init(&qp->mutex);
+	spin_lock_init(&qp->sq.lock);
+	spin_lock_init(&qp->rq.lock);
+
+	if (init_attr->create_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) {
+		if (!(gen->flags & MLX5_DEV_CAP_FLAG_BLOCK_MCAST)) {
+			mlx5_ib_dbg(dev, "block multicast loopback isn't supported\n");
+			return -EINVAL;
+		} else {
+			qp->flags |= MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK;
+		}
+	}
+
+	if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR)
+		qp->sq_signal_bits = MLX5_WQE_CTRL_CQ_UPDATE;
+
+	if (pd && pd->uobject) {
+		if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) {
+			mlx5_ib_dbg(dev, "copy failed\n");
+			return -EFAULT;
+		}
+
+		qp->wq_sig = !!(ucmd.flags & MLX5_QP_FLAG_SIGNATURE);
+		qp->scat_cqe = !!(ucmd.flags & MLX5_QP_FLAG_SCATTER_CQE);
+	} else {
+		qp->wq_sig = !!wq_signature;
+	}
+
+	qp->has_rq = qp_has_rq(init_attr);
+	err = set_rq_size(dev, &init_attr->cap, qp->has_rq,
+			  qp, (pd && pd->uobject) ? &ucmd : NULL);
+	if (err) {
+		mlx5_ib_dbg(dev, "err %d\n", err);
+		return err;
+	}
+
+	if (pd) {
+		if (pd->uobject) {
+			mlx5_ib_dbg(dev, "requested sq_wqe_count (%d)\n", ucmd.sq_wqe_count);
+			if (ucmd.rq_wqe_shift != qp->rq.wqe_shift ||
+			    ucmd.rq_wqe_count != qp->rq.wqe_cnt) {
+				mlx5_ib_dbg(dev, "invalid rq params\n");
+				return -EINVAL;
+			}
+			if (ucmd.sq_wqe_count > gen->max_wqes) {
+				mlx5_ib_dbg(dev, "requested sq_wqe_count (%d) > max allowed (%d)\n",
+					    ucmd.sq_wqe_count, gen->max_wqes);
+				return -EINVAL;
+			}
+			err = create_user_qp(dev, pd, qp, udata, &in, &resp, &inlen);
+			if (err)
+				mlx5_ib_dbg(dev, "err %d\n", err);
+		} else {
+			err = create_kernel_qp(dev, init_attr, qp, &in, &inlen);
+			if (err)
+				mlx5_ib_dbg(dev, "err %d\n", err);
+			else
+				qp->pa_lkey = to_mpd(pd)->pa_lkey;
+		}
+
+		if (err)
+			return err;
+	} else {
+		in = mlx5_vzalloc(sizeof(*in));
+		if (!in)
+			return -ENOMEM;
+
+		qp->create_type = MLX5_QP_EMPTY;
+	}
+
+	if (is_sqp(init_attr->qp_type))
+		qp->port = init_attr->port_num;
+
+	in->ctx.flags = cpu_to_be32(to_mlx5_st(init_attr->qp_type) << 16 |
+				    MLX5_QP_PM_MIGRATED << 11);
+
+	if (init_attr->qp_type != MLX5_IB_QPT_REG_UMR)
+		in->ctx.flags_pd = cpu_to_be32(to_mpd(pd ? pd : devr->p0)->pdn);
+	else
+		in->ctx.flags_pd = cpu_to_be32(MLX5_QP_LAT_SENSITIVE);
+
+	if (qp->wq_sig)
+		in->ctx.flags_pd |= cpu_to_be32(MLX5_QP_ENABLE_SIG);
+
+	if (qp->flags & MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK)
+		in->ctx.flags_pd |= cpu_to_be32(MLX5_QP_BLOCK_MCAST);
+
+	if (qp->scat_cqe && is_connected(init_attr->qp_type)) {
+		int rcqe_sz;
+		int scqe_sz;
+
+		rcqe_sz = mlx5_ib_get_cqe_size(dev, init_attr->recv_cq);
+		scqe_sz = mlx5_ib_get_cqe_size(dev, init_attr->send_cq);
+
+		if (rcqe_sz == 128)
+			in->ctx.cs_res = MLX5_RES_SCAT_DATA64_CQE;
+		else
+			in->ctx.cs_res = MLX5_RES_SCAT_DATA32_CQE;
+
+		if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) {
+			if (scqe_sz == 128)
+				in->ctx.cs_req = MLX5_REQ_SCAT_DATA64_CQE;
+			else
+				in->ctx.cs_req = MLX5_REQ_SCAT_DATA32_CQE;
+		}
+	}
+
+	if (qp->rq.wqe_cnt) {
+		in->ctx.rq_size_stride = (qp->rq.wqe_shift - 4);
+		in->ctx.rq_size_stride |= ilog2(qp->rq.wqe_cnt) << 3;
+	}
+
+	in->ctx.rq_type_srqn = get_rx_type(qp, init_attr);
+
+	if (qp->sq.wqe_cnt)
+		in->ctx.sq_crq_size |= cpu_to_be16(ilog2(qp->sq.wqe_cnt) << 11);
+	else
+		in->ctx.sq_crq_size |= cpu_to_be16(0x8000);
+
+	/* Set default resources */
+	switch (init_attr->qp_type) {
+	case IB_QPT_XRC_TGT:
+		in->ctx.cqn_recv = cpu_to_be32(to_mcq(devr->c0)->mcq.cqn);
+		in->ctx.cqn_send = cpu_to_be32(to_mcq(devr->c0)->mcq.cqn);
+		in->ctx.rq_type_srqn |= cpu_to_be32(to_msrq(devr->s0)->msrq.srqn);
+		in->ctx.xrcd = cpu_to_be32(to_mxrcd(init_attr->xrcd)->xrcdn);
+		break;
+	case IB_QPT_XRC_INI:
+		in->ctx.cqn_recv = cpu_to_be32(to_mcq(devr->c0)->mcq.cqn);
+		in->ctx.xrcd = cpu_to_be32(to_mxrcd(devr->x1)->xrcdn);
+		in->ctx.rq_type_srqn |= cpu_to_be32(to_msrq(devr->s0)->msrq.srqn);
+		break;
+	default:
+		if (init_attr->srq) {
+			in->ctx.xrcd = cpu_to_be32(to_mxrcd(devr->x0)->xrcdn);
+			in->ctx.rq_type_srqn |= cpu_to_be32(to_msrq(init_attr->srq)->msrq.srqn);
+		} else {
+			in->ctx.xrcd = cpu_to_be32(to_mxrcd(devr->x1)->xrcdn);
+			in->ctx.rq_type_srqn |= cpu_to_be32(to_msrq(devr->s0)->msrq.srqn);
+		}
+	}
+
+	if (init_attr->send_cq)
+		in->ctx.cqn_send = cpu_to_be32(to_mcq(init_attr->send_cq)->mcq.cqn);
+
+	if (init_attr->recv_cq)
+		in->ctx.cqn_recv = cpu_to_be32(to_mcq(init_attr->recv_cq)->mcq.cqn);
+
+	in->ctx.db_rec_addr = cpu_to_be64(qp->db.dma);
+
+	err = mlx5_core_create_qp(dev->mdev, &qp->mqp, in, inlen);
+	if (err) {
+		mlx5_ib_dbg(dev, "create qp failed\n");
+		goto err_create;
+	}
+
+	mlx5_vfree(in);
+	/* Hardware wants QPN written in big-endian order (after
+	 * shifting) for send doorbell.  Precompute this value to save
+	 * a little bit when posting sends.
+	 */
+	qp->doorbell_qpn = swab32(qp->mqp.qpn << 8);
+
+	qp->mqp.event = mlx5_ib_qp_event;
+
+	return 0;
+
+err_create:
+	if (qp->create_type == MLX5_QP_USER)
+		destroy_qp_user(pd, qp);
+	else if (qp->create_type == MLX5_QP_KERNEL)
+		destroy_qp_kernel(dev, qp);
+
+	mlx5_vfree(in);
+	return err;
+}
+
+static void mlx5_ib_lock_cqs(struct mlx5_ib_cq *send_cq, struct mlx5_ib_cq *recv_cq)
+	__acquires(&send_cq->lock) __acquires(&recv_cq->lock)
+{
+	if (send_cq) {
+		if (recv_cq) {
+			if (send_cq->mcq.cqn < recv_cq->mcq.cqn)  {
+				spin_lock_irq(&send_cq->lock);
+				spin_lock_nested(&recv_cq->lock,
+						 SINGLE_DEPTH_NESTING);
+			} else if (send_cq->mcq.cqn == recv_cq->mcq.cqn) {
+				spin_lock_irq(&send_cq->lock);
+				__acquire(&recv_cq->lock);
+			} else {
+				spin_lock_irq(&recv_cq->lock);
+				spin_lock_nested(&send_cq->lock,
+						 SINGLE_DEPTH_NESTING);
+			}
+		} else {
+			spin_lock_irq(&send_cq->lock);
+		}
+	} else if (recv_cq) {
+		spin_lock_irq(&recv_cq->lock);
+	}
+}
+
+static void mlx5_ib_unlock_cqs(struct mlx5_ib_cq *send_cq, struct mlx5_ib_cq *recv_cq)
+	__releases(&send_cq->lock) __releases(&recv_cq->lock)
+{
+	if (send_cq) {
+		if (recv_cq) {
+			if (send_cq->mcq.cqn < recv_cq->mcq.cqn)  {
+				spin_unlock(&recv_cq->lock);
+				spin_unlock_irq(&send_cq->lock);
+			} else if (send_cq->mcq.cqn == recv_cq->mcq.cqn) {
+				__release(&recv_cq->lock);
+				spin_unlock_irq(&send_cq->lock);
+			} else {
+				spin_unlock(&send_cq->lock);
+				spin_unlock_irq(&recv_cq->lock);
+			}
+		} else {
+			spin_unlock_irq(&send_cq->lock);
+		}
+	} else if (recv_cq) {
+		spin_unlock_irq(&recv_cq->lock);
+	}
+}
+
+static struct mlx5_ib_pd *get_pd(struct mlx5_ib_qp *qp)
+{
+	return to_mpd(qp->ibqp.pd);
+}
+
+static void get_cqs(struct mlx5_ib_qp *qp,
+		    struct mlx5_ib_cq **send_cq, struct mlx5_ib_cq **recv_cq)
+{
+	switch (qp->ibqp.qp_type) {
+	case IB_QPT_XRC_TGT:
+		*send_cq = NULL;
+		*recv_cq = NULL;
+		break;
+	case MLX5_IB_QPT_REG_UMR:
+	case IB_QPT_XRC_INI:
+		*send_cq = to_mcq(qp->ibqp.send_cq);
+		*recv_cq = NULL;
+		break;
+
+	case IB_QPT_SMI:
+	case IB_QPT_GSI:
+	case IB_QPT_RC:
+	case IB_QPT_UC:
+	case IB_QPT_UD:
+	case IB_QPT_RAW_IPV6:
+	case IB_QPT_RAW_ETHERTYPE:
+		*send_cq = to_mcq(qp->ibqp.send_cq);
+		*recv_cq = to_mcq(qp->ibqp.recv_cq);
+		break;
+
+	case IB_QPT_RAW_PACKET:
+	case IB_QPT_MAX:
+	default:
+		*send_cq = NULL;
+		*recv_cq = NULL;
+		break;
+	}
+}
+
+static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp)
+{
+	struct mlx5_ib_cq *send_cq, *recv_cq;
+	struct mlx5_modify_qp_mbox_in *in;
+	int err;
+
+	in = kzalloc(sizeof(*in), GFP_KERNEL);
+	if (!in)
+		return;
+	if (qp->state != IB_QPS_RESET)
+		if (mlx5_core_qp_modify(dev->mdev, to_mlx5_state(qp->state),
+					MLX5_QP_STATE_RST, in, sizeof(*in), &qp->mqp))
+			mlx5_ib_warn(dev, "mlx5_ib: modify QP %06x to RESET failed\n",
+				     qp->mqp.qpn);
+
+	get_cqs(qp, &send_cq, &recv_cq);
+
+	if (qp->create_type == MLX5_QP_KERNEL) {
+		mlx5_ib_lock_cqs(send_cq, recv_cq);
+		__mlx5_ib_cq_clean(recv_cq, qp->mqp.qpn,
+				   qp->ibqp.srq ? to_msrq(qp->ibqp.srq) : NULL);
+		if (send_cq != recv_cq)
+			__mlx5_ib_cq_clean(send_cq, qp->mqp.qpn, NULL);
+		mlx5_ib_unlock_cqs(send_cq, recv_cq);
+	}
+
+	err = mlx5_core_destroy_qp(dev->mdev, &qp->mqp);
+	if (err)
+		mlx5_ib_warn(dev, "failed to destroy QP 0x%x\n", qp->mqp.qpn);
+	kfree(in);
+
+
+	if (qp->create_type == MLX5_QP_KERNEL)
+		destroy_qp_kernel(dev, qp);
+	else if (qp->create_type == MLX5_QP_USER)
+		destroy_qp_user(&get_pd(qp)->ibpd, qp);
+}
+
+static const char *ib_qp_type_str(enum ib_qp_type type)
+{
+	switch (type) {
+	case IB_QPT_SMI:
+		return "IB_QPT_SMI";
+	case IB_QPT_GSI:
+		return "IB_QPT_GSI";
+	case IB_QPT_RC:
+		return "IB_QPT_RC";
+	case IB_QPT_UC:
+		return "IB_QPT_UC";
+	case IB_QPT_UD:
+		return "IB_QPT_UD";
+	case IB_QPT_RAW_IPV6:
+		return "IB_QPT_RAW_IPV6";
+	case IB_QPT_RAW_ETHERTYPE:
+		return "IB_QPT_RAW_ETHERTYPE";
+	case IB_QPT_XRC_INI:
+		return "IB_QPT_XRC_INI";
+	case IB_QPT_XRC_TGT:
+		return "IB_QPT_XRC_TGT";
+	case IB_QPT_RAW_PACKET:
+		return "IB_QPT_RAW_PACKET";
+	case MLX5_IB_QPT_REG_UMR:
+		return "MLX5_IB_QPT_REG_UMR";
+	case IB_QPT_MAX:
+	default:
+		return "Invalid QP type";
+	}
+}
+
+struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd,
+				struct ib_qp_init_attr *init_attr,
+				struct ib_udata *udata)
+{
+	struct mlx5_general_caps *gen;
+	struct mlx5_ib_dev *dev;
+	struct mlx5_ib_qp *qp;
+	u16 xrcdn = 0;
+	int err;
+
+	if (pd) {
+		dev = to_mdev(pd->device);
+	} else {
+		/* being cautious here */
+		if (init_attr->qp_type != IB_QPT_XRC_TGT &&
+		    init_attr->qp_type != MLX5_IB_QPT_REG_UMR) {
+			pr_warn("%s: no PD for transport %s\n", __func__,
+				ib_qp_type_str(init_attr->qp_type));
+			return ERR_PTR(-EINVAL);
+		}
+		dev = to_mdev(to_mxrcd(init_attr->xrcd)->ibxrcd.device);
+	}
+	gen = &dev->mdev->caps.gen;
+
+	switch (init_attr->qp_type) {
+	case IB_QPT_XRC_TGT:
+	case IB_QPT_XRC_INI:
+		if (!(gen->flags & MLX5_DEV_CAP_FLAG_XRC)) {
+			mlx5_ib_dbg(dev, "XRC not supported\n");
+			return ERR_PTR(-ENOSYS);
+		}
+		init_attr->recv_cq = NULL;
+		if (init_attr->qp_type == IB_QPT_XRC_TGT) {
+			xrcdn = to_mxrcd(init_attr->xrcd)->xrcdn;
+			init_attr->send_cq = NULL;
+		}
+
+		/* fall through */
+	case IB_QPT_RC:
+	case IB_QPT_UC:
+	case IB_QPT_UD:
+	case IB_QPT_SMI:
+	case IB_QPT_GSI:
+	case MLX5_IB_QPT_REG_UMR:
+		qp = kzalloc(sizeof(*qp), GFP_KERNEL);
+		if (!qp)
+			return ERR_PTR(-ENOMEM);
+
+		err = create_qp_common(dev, pd, init_attr, udata, qp);
+		if (err) {
+			mlx5_ib_dbg(dev, "create_qp_common failed\n");
+			kfree(qp);
+			return ERR_PTR(err);
+		}
+
+		if (is_qp0(init_attr->qp_type))
+			qp->ibqp.qp_num = 0;
+		else if (is_qp1(init_attr->qp_type))
+			qp->ibqp.qp_num = 1;
+		else
+			qp->ibqp.qp_num = qp->mqp.qpn;
+
+		mlx5_ib_dbg(dev, "ib qpnum 0x%x, mlx qpn 0x%x, rcqn 0x%x, scqn 0x%x\n",
+			    qp->ibqp.qp_num, qp->mqp.qpn, to_mcq(init_attr->recv_cq)->mcq.cqn,
+			    to_mcq(init_attr->send_cq)->mcq.cqn);
+
+		qp->xrcdn = xrcdn;
+
+		break;
+
+	case IB_QPT_RAW_IPV6:
+	case IB_QPT_RAW_ETHERTYPE:
+	case IB_QPT_RAW_PACKET:
+	case IB_QPT_MAX:
+	default:
+		mlx5_ib_dbg(dev, "unsupported qp type %d\n",
+			    init_attr->qp_type);
+		/* Don't support raw QPs */
+		return ERR_PTR(-EINVAL);
+	}
+
+	return &qp->ibqp;
+}
+
+int mlx5_ib_destroy_qp(struct ib_qp *qp)
+{
+	struct mlx5_ib_dev *dev = to_mdev(qp->device);
+	struct mlx5_ib_qp *mqp = to_mqp(qp);
+
+	destroy_qp_common(dev, mqp);
+
+	kfree(mqp);
+
+	return 0;
+}
+
+static __be32 to_mlx5_access_flags(struct mlx5_ib_qp *qp, const struct ib_qp_attr *attr,
+				   int attr_mask)
+{
+	u32 hw_access_flags = 0;
+	u8 dest_rd_atomic;
+	u32 access_flags;
+
+	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
+		dest_rd_atomic = attr->max_dest_rd_atomic;
+	else
+		dest_rd_atomic = qp->resp_depth;
+
+	if (attr_mask & IB_QP_ACCESS_FLAGS)
+		access_flags = attr->qp_access_flags;
+	else
+		access_flags = qp->atomic_rd_en;
+
+	if (!dest_rd_atomic)
+		access_flags &= IB_ACCESS_REMOTE_WRITE;
+
+	if (access_flags & IB_ACCESS_REMOTE_READ)
+		hw_access_flags |= MLX5_QP_BIT_RRE;
+	if (access_flags & IB_ACCESS_REMOTE_ATOMIC)
+		hw_access_flags |= (MLX5_QP_BIT_RAE | MLX5_ATOMIC_MODE_CX);
+	if (access_flags & IB_ACCESS_REMOTE_WRITE)
+		hw_access_flags |= MLX5_QP_BIT_RWE;
+
+	return cpu_to_be32(hw_access_flags);
+}
+
+enum {
+	MLX5_PATH_FLAG_FL	= 1 << 0,
+	MLX5_PATH_FLAG_FREE_AR	= 1 << 1,
+	MLX5_PATH_FLAG_COUNTER	= 1 << 2,
+};
+
+static int ib_rate_to_mlx5(struct mlx5_ib_dev *dev, u8 rate)
+{
+	struct mlx5_general_caps *gen;
+
+	gen = &dev->mdev->caps.gen;
+	if (rate == IB_RATE_PORT_CURRENT) {
+		return 0;
+	} else if (rate < IB_RATE_2_5_GBPS || rate > IB_RATE_300_GBPS) {
+		return -EINVAL;
+	} else {
+		while (rate != IB_RATE_2_5_GBPS &&
+		       !(1 << (rate + MLX5_STAT_RATE_OFFSET) &
+			 gen->stat_rate_support))
+			--rate;
+	}
+
+	return rate + MLX5_STAT_RATE_OFFSET;
+}
+
+static int mlx5_set_path(struct mlx5_ib_dev *dev, const struct ib_ah_attr *ah,
+			 struct mlx5_qp_path *path, u8 port, int attr_mask,
+			 u32 path_flags, const struct ib_qp_attr *attr)
+{
+	struct mlx5_general_caps *gen;
+	int err;
+
+	gen = &dev->mdev->caps.gen;
+	path->fl = (path_flags & MLX5_PATH_FLAG_FL) ? 0x80 : 0;
+	path->free_ar = (path_flags & MLX5_PATH_FLAG_FREE_AR) ? 0x80 : 0;
+
+	if (attr_mask & IB_QP_PKEY_INDEX)
+		path->pkey_index = attr->pkey_index;
+
+	path->grh_mlid	= ah->src_path_bits & 0x7f;
+	path->rlid	= cpu_to_be16(ah->dlid);
+
+	if (ah->ah_flags & IB_AH_GRH) {
+		if (ah->grh.sgid_index >= gen->port[port - 1].gid_table_len) {
+			pr_err(KERN_ERR "sgid_index (%u) too large. max is %d\n",
+			       ah->grh.sgid_index, gen->port[port - 1].gid_table_len);
+			return -EINVAL;
+		}
+		path->grh_mlid |= 1 << 7;
+		path->mgid_index = ah->grh.sgid_index;
+		path->hop_limit  = ah->grh.hop_limit;
+		path->tclass_flowlabel =
+			cpu_to_be32((ah->grh.traffic_class << 20) |
+				    (ah->grh.flow_label));
+		memcpy(path->rgid, ah->grh.dgid.raw, 16);
+	}
+
+	err = ib_rate_to_mlx5(dev, ah->static_rate);
+	if (err < 0)
+		return err;
+	path->static_rate = err;
+	path->port = port;
+
+	if (attr_mask & IB_QP_TIMEOUT)
+		path->ackto_lt = attr->timeout << 3;
+
+	path->sl = ah->sl & 0xf;
+
+	return 0;
+}
+
+static enum mlx5_qp_optpar opt_mask[MLX5_QP_NUM_STATE][MLX5_QP_NUM_STATE][MLX5_QP_ST_MAX] = {
+	[MLX5_QP_STATE_INIT] = {
+		[MLX5_QP_STATE_INIT] = {
+			[MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_RRE		|
+					  MLX5_QP_OPTPAR_RAE		|
+					  MLX5_QP_OPTPAR_RWE		|
+					  MLX5_QP_OPTPAR_PKEY_INDEX	|
+					  MLX5_QP_OPTPAR_PRI_PORT,
+			[MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_RWE		|
+					  MLX5_QP_OPTPAR_PKEY_INDEX	|
+					  MLX5_QP_OPTPAR_PRI_PORT,
+			[MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_PKEY_INDEX	|
+					  MLX5_QP_OPTPAR_Q_KEY		|
+					  MLX5_QP_OPTPAR_PRI_PORT,
+		},
+		[MLX5_QP_STATE_RTR] = {
+			[MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH  |
+					  MLX5_QP_OPTPAR_RRE            |
+					  MLX5_QP_OPTPAR_RAE            |
+					  MLX5_QP_OPTPAR_RWE            |
+					  MLX5_QP_OPTPAR_PKEY_INDEX,
+			[MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH  |
+					  MLX5_QP_OPTPAR_RWE            |
+					  MLX5_QP_OPTPAR_PKEY_INDEX,
+			[MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_PKEY_INDEX     |
+					  MLX5_QP_OPTPAR_Q_KEY,
+			[MLX5_QP_ST_MLX] = MLX5_QP_OPTPAR_PKEY_INDEX	|
+					   MLX5_QP_OPTPAR_Q_KEY,
+			[MLX5_QP_ST_XRC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH |
+					  MLX5_QP_OPTPAR_RRE            |
+					  MLX5_QP_OPTPAR_RAE            |
+					  MLX5_QP_OPTPAR_RWE            |
+					  MLX5_QP_OPTPAR_PKEY_INDEX,
+		},
+	},
+	[MLX5_QP_STATE_RTR] = {
+		[MLX5_QP_STATE_RTS] = {
+			[MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH	|
+					  MLX5_QP_OPTPAR_RRE		|
+					  MLX5_QP_OPTPAR_RAE		|
+					  MLX5_QP_OPTPAR_RWE		|
+					  MLX5_QP_OPTPAR_PM_STATE	|
+					  MLX5_QP_OPTPAR_RNR_TIMEOUT,
+			[MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH	|
+					  MLX5_QP_OPTPAR_RWE		|
+					  MLX5_QP_OPTPAR_PM_STATE,
+			[MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_Q_KEY,
+		},
+	},
+	[MLX5_QP_STATE_RTS] = {
+		[MLX5_QP_STATE_RTS] = {
+			[MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_RRE		|
+					  MLX5_QP_OPTPAR_RAE		|
+					  MLX5_QP_OPTPAR_RWE		|
+					  MLX5_QP_OPTPAR_RNR_TIMEOUT	|
+					  MLX5_QP_OPTPAR_PM_STATE	|
+					  MLX5_QP_OPTPAR_ALT_ADDR_PATH,
+			[MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_RWE		|
+					  MLX5_QP_OPTPAR_PM_STATE	|
+					  MLX5_QP_OPTPAR_ALT_ADDR_PATH,
+			[MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_Q_KEY		|
+					  MLX5_QP_OPTPAR_SRQN		|
+					  MLX5_QP_OPTPAR_CQN_RCV,
+		},
+	},
+	[MLX5_QP_STATE_SQER] = {
+		[MLX5_QP_STATE_RTS] = {
+			[MLX5_QP_ST_UD]	 = MLX5_QP_OPTPAR_Q_KEY,
+			[MLX5_QP_ST_MLX] = MLX5_QP_OPTPAR_Q_KEY,
+			[MLX5_QP_ST_UC]	 = MLX5_QP_OPTPAR_RWE,
+			[MLX5_QP_ST_RC]	 = MLX5_QP_OPTPAR_RNR_TIMEOUT	|
+					   MLX5_QP_OPTPAR_RWE		|
+					   MLX5_QP_OPTPAR_RAE		|
+					   MLX5_QP_OPTPAR_RRE,
+		},
+	},
+};
+
+static int ib_nr_to_mlx5_nr(int ib_mask)
+{
+	switch (ib_mask) {
+	case IB_QP_STATE:
+		return 0;
+	case IB_QP_CUR_STATE:
+		return 0;
+	case IB_QP_EN_SQD_ASYNC_NOTIFY:
+		return 0;
+	case IB_QP_ACCESS_FLAGS:
+		return MLX5_QP_OPTPAR_RWE | MLX5_QP_OPTPAR_RRE |
+			MLX5_QP_OPTPAR_RAE;
+	case IB_QP_PKEY_INDEX:
+		return MLX5_QP_OPTPAR_PKEY_INDEX;
+	case IB_QP_PORT:
+		return MLX5_QP_OPTPAR_PRI_PORT;
+	case IB_QP_QKEY:
+		return MLX5_QP_OPTPAR_Q_KEY;
+	case IB_QP_AV:
+		return MLX5_QP_OPTPAR_PRIMARY_ADDR_PATH |
+			MLX5_QP_OPTPAR_PRI_PORT;
+	case IB_QP_PATH_MTU:
+		return 0;
+	case IB_QP_TIMEOUT:
+		return MLX5_QP_OPTPAR_ACK_TIMEOUT;
+	case IB_QP_RETRY_CNT:
+		return MLX5_QP_OPTPAR_RETRY_COUNT;
+	case IB_QP_RNR_RETRY:
+		return MLX5_QP_OPTPAR_RNR_RETRY;
+	case IB_QP_RQ_PSN:
+		return 0;
+	case IB_QP_MAX_QP_RD_ATOMIC:
+		return MLX5_QP_OPTPAR_SRA_MAX;
+	case IB_QP_ALT_PATH:
+		return MLX5_QP_OPTPAR_ALT_ADDR_PATH;
+	case IB_QP_MIN_RNR_TIMER:
+		return MLX5_QP_OPTPAR_RNR_TIMEOUT;
+	case IB_QP_SQ_PSN:
+		return 0;
+	case IB_QP_MAX_DEST_RD_ATOMIC:
+		return MLX5_QP_OPTPAR_RRA_MAX | MLX5_QP_OPTPAR_RWE |
+			MLX5_QP_OPTPAR_RRE | MLX5_QP_OPTPAR_RAE;
+	case IB_QP_PATH_MIG_STATE:
+		return MLX5_QP_OPTPAR_PM_STATE;
+	case IB_QP_CAP:
+		return 0;
+	case IB_QP_DEST_QPN:
+		return 0;
+	}
+	return 0;
+}
+
+static int ib_mask_to_mlx5_opt(int ib_mask)
+{
+	int result = 0;
+	int i;
+
+	for (i = 0; i < 8 * sizeof(int); i++) {
+		if ((1 << i) & ib_mask)
+			result |= ib_nr_to_mlx5_nr(1 << i);
+	}
+
+	return result;
+}
+
+static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
+			       const struct ib_qp_attr *attr, int attr_mask,
+			       enum ib_qp_state cur_state, enum ib_qp_state new_state)
+{
+	struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
+	struct mlx5_ib_qp *qp = to_mqp(ibqp);
+	struct mlx5_ib_cq *send_cq, *recv_cq;
+	struct mlx5_qp_context *context;
+	struct mlx5_general_caps *gen;
+	struct mlx5_modify_qp_mbox_in *in;
+	struct mlx5_ib_pd *pd;
+	enum mlx5_qp_state mlx5_cur, mlx5_new;
+	enum mlx5_qp_optpar optpar;
+	int sqd_event;
+	int mlx5_st;
+	int err;
+
+	gen = &dev->mdev->caps.gen;
+	in = kzalloc(sizeof(*in), GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	context = &in->ctx;
+	err = to_mlx5_st(ibqp->qp_type);
+	if (err < 0)
+		goto out;
+
+	context->flags = cpu_to_be32(err << 16);
+
+	if (!(attr_mask & IB_QP_PATH_MIG_STATE)) {
+		context->flags |= cpu_to_be32(MLX5_QP_PM_MIGRATED << 11);
+	} else {
+		switch (attr->path_mig_state) {
+		case IB_MIG_MIGRATED:
+			context->flags |= cpu_to_be32(MLX5_QP_PM_MIGRATED << 11);
+			break;
+		case IB_MIG_REARM:
+			context->flags |= cpu_to_be32(MLX5_QP_PM_REARM << 11);
+			break;
+		case IB_MIG_ARMED:
+			context->flags |= cpu_to_be32(MLX5_QP_PM_ARMED << 11);
+			break;
+		}
+	}
+
+	if (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI) {
+		context->mtu_msgmax = (IB_MTU_256 << 5) | 8;
+	} else if (ibqp->qp_type == IB_QPT_UD ||
+		   ibqp->qp_type == MLX5_IB_QPT_REG_UMR) {
+		context->mtu_msgmax = (IB_MTU_4096 << 5) | 12;
+	} else if (attr_mask & IB_QP_PATH_MTU) {
+		if (attr->path_mtu < IB_MTU_256 ||
+		    attr->path_mtu > IB_MTU_4096) {
+			mlx5_ib_warn(dev, "invalid mtu %d\n", attr->path_mtu);
+			err = -EINVAL;
+			goto out;
+		}
+		context->mtu_msgmax = (attr->path_mtu << 5) | gen->log_max_msg;
+	}
+
+	if (attr_mask & IB_QP_DEST_QPN)
+		context->log_pg_sz_remote_qpn = cpu_to_be32(attr->dest_qp_num);
+
+	if (attr_mask & IB_QP_PKEY_INDEX)
+		context->pri_path.pkey_index = attr->pkey_index;
+
+	/* todo implement counter_index functionality */
+
+	if (is_sqp(ibqp->qp_type))
+		context->pri_path.port = qp->port;
+
+	if (attr_mask & IB_QP_PORT)
+		context->pri_path.port = attr->port_num;
+
+	if (attr_mask & IB_QP_AV) {
+		err = mlx5_set_path(dev, &attr->ah_attr, &context->pri_path,
+				    attr_mask & IB_QP_PORT ? attr->port_num : qp->port,
+				    attr_mask, 0, attr);
+		if (err)
+			goto out;
+	}
+
+	if (attr_mask & IB_QP_TIMEOUT)
+		context->pri_path.ackto_lt |= attr->timeout << 3;
+
+	if (attr_mask & IB_QP_ALT_PATH) {
+		err = mlx5_set_path(dev, &attr->alt_ah_attr, &context->alt_path,
+				    attr->alt_port_num, attr_mask, 0, attr);
+		if (err)
+			goto out;
+	}
+
+	pd = get_pd(qp);
+	get_cqs(qp, &send_cq, &recv_cq);
+
+	context->flags_pd = cpu_to_be32(pd ? pd->pdn : to_mpd(dev->devr.p0)->pdn);
+	context->cqn_send = send_cq ? cpu_to_be32(send_cq->mcq.cqn) : 0;
+	context->cqn_recv = recv_cq ? cpu_to_be32(recv_cq->mcq.cqn) : 0;
+	context->params1  = cpu_to_be32(MLX5_IB_ACK_REQ_FREQ << 28);
+
+	if (attr_mask & IB_QP_RNR_RETRY)
+		context->params1 |= cpu_to_be32(attr->rnr_retry << 13);
+
+	if (attr_mask & IB_QP_RETRY_CNT)
+		context->params1 |= cpu_to_be32(attr->retry_cnt << 16);
+
+	if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) {
+		if (attr->max_rd_atomic)
+			context->params1 |=
+				cpu_to_be32(fls(attr->max_rd_atomic - 1) << 21);
+	}
+
+	if (attr_mask & IB_QP_SQ_PSN)
+		context->next_send_psn = cpu_to_be32(attr->sq_psn);
+
+	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) {
+		if (attr->max_dest_rd_atomic)
+			context->params2 |=
+				cpu_to_be32(fls(attr->max_dest_rd_atomic - 1) << 21);
+	}
+
+	if (attr_mask & (IB_QP_ACCESS_FLAGS | IB_QP_MAX_DEST_RD_ATOMIC))
+		context->params2 |= to_mlx5_access_flags(qp, attr, attr_mask);
+
+	if (attr_mask & IB_QP_MIN_RNR_TIMER)
+		context->rnr_nextrecvpsn |= cpu_to_be32(attr->min_rnr_timer << 24);
+
+	if (attr_mask & IB_QP_RQ_PSN)
+		context->rnr_nextrecvpsn |= cpu_to_be32(attr->rq_psn);
+
+	if (attr_mask & IB_QP_QKEY)
+		context->qkey = cpu_to_be32(attr->qkey);
+
+	if (qp->rq.wqe_cnt && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
+		context->db_rec_addr = cpu_to_be64(qp->db.dma);
+
+	if (cur_state == IB_QPS_RTS && new_state == IB_QPS_SQD	&&
+	    attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY && attr->en_sqd_async_notify)
+		sqd_event = 1;
+	else
+		sqd_event = 0;
+
+	if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
+		context->sq_crq_size |= cpu_to_be16(1 << 4);
+
+
+	mlx5_cur = to_mlx5_state(cur_state);
+	mlx5_new = to_mlx5_state(new_state);
+	mlx5_st = to_mlx5_st(ibqp->qp_type);
+	if (mlx5_st < 0)
+		goto out;
+
+	optpar = ib_mask_to_mlx5_opt(attr_mask);
+	optpar &= opt_mask[mlx5_cur][mlx5_new][mlx5_st];
+	in->optparam = cpu_to_be32(optpar);
+	err = mlx5_core_qp_modify(dev->mdev, to_mlx5_state(cur_state),
+				  to_mlx5_state(new_state), in, sqd_event,
+				  &qp->mqp);
+	if (err)
+		goto out;
+
+	qp->state = new_state;
+
+	if (attr_mask & IB_QP_ACCESS_FLAGS)
+		qp->atomic_rd_en = attr->qp_access_flags;
+	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
+		qp->resp_depth = attr->max_dest_rd_atomic;
+	if (attr_mask & IB_QP_PORT)
+		qp->port = attr->port_num;
+	if (attr_mask & IB_QP_ALT_PATH)
+		qp->alt_port = attr->alt_port_num;
+
+	/*
+	 * If we moved a kernel QP to RESET, clean up all old CQ
+	 * entries and reinitialize the QP.
+	 */
+	if (new_state == IB_QPS_RESET && !ibqp->uobject) {
+		mlx5_ib_cq_clean(recv_cq, qp->mqp.qpn,
+				 ibqp->srq ? to_msrq(ibqp->srq) : NULL);
+		if (send_cq != recv_cq)
+			mlx5_ib_cq_clean(send_cq, qp->mqp.qpn, NULL);
+
+		qp->rq.head = 0;
+		qp->rq.tail = 0;
+		qp->sq.head = 0;
+		qp->sq.tail = 0;
+		qp->sq.cur_post = 0;
+		qp->sq.last_poll = 0;
+		qp->db.db[MLX5_RCV_DBR] = 0;
+		qp->db.db[MLX5_SND_DBR] = 0;
+	}
+
+out:
+	kfree(in);
+	return err;
+}
+
+int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+		      int attr_mask, struct ib_udata *udata)
+{
+	struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
+	struct mlx5_ib_qp *qp = to_mqp(ibqp);
+	enum ib_qp_state cur_state, new_state;
+	struct mlx5_general_caps *gen;
+	int err = -EINVAL;
+	int port;
+
+	gen = &dev->mdev->caps.gen;
+	mutex_lock(&qp->mutex);
+
+	cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state;
+	new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;
+
+	if (ibqp->qp_type != MLX5_IB_QPT_REG_UMR &&
+	    !ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask,
+				IB_LINK_LAYER_UNSPECIFIED))
+		goto out;
+
+	if ((attr_mask & IB_QP_PORT) &&
+	    (attr->port_num == 0 || attr->port_num > gen->num_ports))
+		goto out;
+
+	if (attr_mask & IB_QP_PKEY_INDEX) {
+		port = attr_mask & IB_QP_PORT ? attr->port_num : qp->port;
+		if (attr->pkey_index >= gen->port[port - 1].pkey_table_len)
+			goto out;
+	}
+
+	if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC &&
+	    attr->max_rd_atomic > (1 << gen->log_max_ra_res_qp))
+		goto out;
+
+	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC &&
+	    attr->max_dest_rd_atomic > (1 << gen->log_max_ra_req_qp))
+		goto out;
+
+	if (cur_state == new_state && cur_state == IB_QPS_RESET) {
+		err = 0;
+		goto out;
+	}
+
+	err = __mlx5_ib_modify_qp(ibqp, attr, attr_mask, cur_state, new_state);
+
+out:
+	mutex_unlock(&qp->mutex);
+	return err;
+}
+
+static int mlx5_wq_overflow(struct mlx5_ib_wq *wq, int nreq, struct ib_cq *ib_cq)
+{
+	struct mlx5_ib_cq *cq;
+	unsigned cur;
+
+	cur = wq->head - wq->tail;
+	if (likely(cur + nreq < wq->max_post))
+		return 0;
+
+	cq = to_mcq(ib_cq);
+	spin_lock(&cq->lock);
+	cur = wq->head - wq->tail;
+	spin_unlock(&cq->lock);
+
+	return cur + nreq >= wq->max_post;
+}
+
+static __always_inline void set_raddr_seg(struct mlx5_wqe_raddr_seg *rseg,
+					  u64 remote_addr, u32 rkey)
+{
+	rseg->raddr    = cpu_to_be64(remote_addr);
+	rseg->rkey     = cpu_to_be32(rkey);
+	rseg->reserved = 0;
+}
+
+static void set_datagram_seg(struct mlx5_wqe_datagram_seg *dseg,
+			     struct ib_send_wr *wr)
+{
+	memcpy(&dseg->av, &to_mah(wr->wr.ud.ah)->av, sizeof(struct mlx5_av));
+	dseg->av.dqp_dct = cpu_to_be32(wr->wr.ud.remote_qpn | MLX5_EXTENDED_UD_AV);
+	dseg->av.key.qkey.qkey = cpu_to_be32(wr->wr.ud.remote_qkey);
+}
+
+static void set_data_ptr_seg(struct mlx5_wqe_data_seg *dseg, struct ib_sge *sg)
+{
+	dseg->byte_count = cpu_to_be32(sg->length);
+	dseg->lkey       = cpu_to_be32(sg->lkey);
+	dseg->addr       = cpu_to_be64(sg->addr);
+}
+
+static __be16 get_klm_octo(int npages)
+{
+	return cpu_to_be16(ALIGN(npages, 8) / 2);
+}
+
+static __be64 frwr_mkey_mask(void)
+{
+	u64 result;
+
+	result = MLX5_MKEY_MASK_LEN		|
+		MLX5_MKEY_MASK_PAGE_SIZE	|
+		MLX5_MKEY_MASK_START_ADDR	|
+		MLX5_MKEY_MASK_EN_RINVAL	|
+		MLX5_MKEY_MASK_KEY		|
+		MLX5_MKEY_MASK_LR		|
+		MLX5_MKEY_MASK_LW		|
+		MLX5_MKEY_MASK_RR		|
+		MLX5_MKEY_MASK_RW		|
+		MLX5_MKEY_MASK_A		|
+		MLX5_MKEY_MASK_SMALL_FENCE	|
+		MLX5_MKEY_MASK_FREE;
+
+	return cpu_to_be64(result);
+}
+
+static __be64 sig_mkey_mask(void)
+{
+	u64 result;
+
+	result = MLX5_MKEY_MASK_LEN		|
+		MLX5_MKEY_MASK_PAGE_SIZE	|
+		MLX5_MKEY_MASK_START_ADDR	|
+		MLX5_MKEY_MASK_EN_SIGERR	|
+		MLX5_MKEY_MASK_EN_RINVAL	|
+		MLX5_MKEY_MASK_KEY		|
+		MLX5_MKEY_MASK_LR		|
+		MLX5_MKEY_MASK_LW		|
+		MLX5_MKEY_MASK_RR		|
+		MLX5_MKEY_MASK_RW		|
+		MLX5_MKEY_MASK_SMALL_FENCE	|
+		MLX5_MKEY_MASK_FREE		|
+		MLX5_MKEY_MASK_BSF_EN;
+
+	return cpu_to_be64(result);
+}
+
+static void set_frwr_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr,
+				 struct ib_send_wr *wr, int li)
+{
+	memset(umr, 0, sizeof(*umr));
+
+	if (li) {
+		umr->mkey_mask = cpu_to_be64(MLX5_MKEY_MASK_FREE);
+		umr->flags = 1 << 7;
+		return;
+	}
+
+	umr->flags = (1 << 5); /* fail if not free */
+	umr->klm_octowords = get_klm_octo(wr->wr.fast_reg.page_list_len);
+	umr->mkey_mask = frwr_mkey_mask();
+}
+
+static void set_reg_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr,
+				struct ib_send_wr *wr)
+{
+	struct umr_wr *umrwr = (struct umr_wr *)&wr->wr.fast_reg;
+	u64 mask;
+
+	memset(umr, 0, sizeof(*umr));
+
+	if (!(wr->send_flags & MLX5_IB_SEND_UMR_UNREG)) {
+		umr->flags = 1 << 5; /* fail if not free */
+		umr->klm_octowords = get_klm_octo(umrwr->npages);
+		mask =  MLX5_MKEY_MASK_LEN		|
+			MLX5_MKEY_MASK_PAGE_SIZE	|
+			MLX5_MKEY_MASK_START_ADDR	|
+			MLX5_MKEY_MASK_PD		|
+			MLX5_MKEY_MASK_LR		|
+			MLX5_MKEY_MASK_LW		|
+			MLX5_MKEY_MASK_KEY		|
+			MLX5_MKEY_MASK_RR		|
+			MLX5_MKEY_MASK_RW		|
+			MLX5_MKEY_MASK_A		|
+			MLX5_MKEY_MASK_FREE;
+		umr->mkey_mask = cpu_to_be64(mask);
+	} else {
+		umr->flags = 2 << 5; /* fail if free */
+		mask = MLX5_MKEY_MASK_FREE;
+		umr->mkey_mask = cpu_to_be64(mask);
+	}
+
+	if (!wr->num_sge)
+		umr->flags |= (1 << 7); /* inline */
+}
+
+static u8 get_umr_flags(int acc)
+{
+	return (acc & IB_ACCESS_REMOTE_ATOMIC ? MLX5_PERM_ATOMIC       : 0) |
+	       (acc & IB_ACCESS_REMOTE_WRITE  ? MLX5_PERM_REMOTE_WRITE : 0) |
+	       (acc & IB_ACCESS_REMOTE_READ   ? MLX5_PERM_REMOTE_READ  : 0) |
+	       (acc & IB_ACCESS_LOCAL_WRITE   ? MLX5_PERM_LOCAL_WRITE  : 0) |
+		MLX5_PERM_LOCAL_READ | MLX5_PERM_UMR_EN;
+}
+
+static void set_mkey_segment(struct mlx5_mkey_seg *seg, struct ib_send_wr *wr,
+			     int li, int *writ)
+{
+	memset(seg, 0, sizeof(*seg));
+	if (li) {
+		seg->status = 1 << 6;
+		return;
+	}
+
+	seg->flags = get_umr_flags(wr->wr.fast_reg.access_flags) |
+		     MLX5_ACCESS_MODE_MTT;
+	*writ = seg->flags & (MLX5_PERM_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE);
+	seg->qpn_mkey7_0 = cpu_to_be32((wr->wr.fast_reg.rkey & 0xff) | 0xffffff00);
+	seg->flags_pd = cpu_to_be32(MLX5_MKEY_REMOTE_INVAL);
+	seg->start_addr = cpu_to_be64(wr->wr.fast_reg.iova_start);
+	seg->len = cpu_to_be64(wr->wr.fast_reg.length);
+	seg->xlt_oct_size = cpu_to_be32((wr->wr.fast_reg.page_list_len + 1) / 2);
+	seg->log2_page_size = wr->wr.fast_reg.page_shift;
+}
+
+static void set_reg_mkey_segment(struct mlx5_mkey_seg *seg, struct ib_send_wr *wr)
+{
+	memset(seg, 0, sizeof(*seg));
+	if (wr->send_flags & MLX5_IB_SEND_UMR_UNREG) {
+		seg->status = 1 << 6;
+		return;
+	}
+
+	seg->flags = convert_access(wr->wr.fast_reg.access_flags);
+	seg->flags_pd = cpu_to_be32(to_mpd((struct ib_pd *)wr->wr.fast_reg.page_list)->pdn);
+	seg->start_addr = cpu_to_be64(wr->wr.fast_reg.iova_start);
+	seg->len = cpu_to_be64(wr->wr.fast_reg.length);
+	seg->log2_page_size = wr->wr.fast_reg.page_shift;
+	seg->qpn_mkey7_0 = cpu_to_be32(0xffffff00 |
+				       mlx5_mkey_variant(wr->wr.fast_reg.rkey));
+}
+
+static void set_frwr_pages(struct mlx5_wqe_data_seg *dseg,
+			   struct ib_send_wr *wr,
+			   struct mlx5_core_dev *mdev,
+			   struct mlx5_ib_pd *pd,
+			   int writ)
+{
+	struct mlx5_ib_fast_reg_page_list *mfrpl = to_mfrpl(wr->wr.fast_reg.page_list);
+	u64 *page_list = wr->wr.fast_reg.page_list->page_list;
+	u64 perm = MLX5_EN_RD | (writ ? MLX5_EN_WR : 0);
+	int i;
+
+	for (i = 0; i < wr->wr.fast_reg.page_list_len; i++)
+		mfrpl->mapped_page_list[i] = cpu_to_be64(page_list[i] | perm);
+	dseg->addr = cpu_to_be64(mfrpl->map);
+	dseg->byte_count = cpu_to_be32(ALIGN(sizeof(u64) * wr->wr.fast_reg.page_list_len, 64));
+	dseg->lkey = cpu_to_be32(pd->pa_lkey);
+}
+
+static __be32 send_ieth(struct ib_send_wr *wr)
+{
+	switch (wr->opcode) {
+	case IB_WR_SEND_WITH_IMM:
+	case IB_WR_RDMA_WRITE_WITH_IMM:
+		return wr->ex.imm_data;
+
+	case IB_WR_SEND_WITH_INV:
+		return cpu_to_be32(wr->ex.invalidate_rkey);
+
+	default:
+		return 0;
+	}
+}
+
+static u8 calc_sig(void *wqe, int size)
+{
+	u8 *p = wqe;
+	u8 res = 0;
+	int i;
+
+	for (i = 0; i < size; i++)
+		res ^= p[i];
+
+	return ~res;
+}
+
+static u8 wq_sig(void *wqe)
+{
+	return calc_sig(wqe, (*((u8 *)wqe + 8) & 0x3f) << 4);
+}
+
+static int set_data_inl_seg(struct mlx5_ib_qp *qp, struct ib_send_wr *wr,
+			    void *wqe, int *sz)
+{
+	struct mlx5_wqe_inline_seg *seg;
+	void *qend = qp->sq.qend;
+	void *addr;
+	int inl = 0;
+	int copy;
+	int len;
+	int i;
+
+	seg = wqe;
+	wqe += sizeof(*seg);
+	for (i = 0; i < wr->num_sge; i++) {
+		addr = (void *)(unsigned long)(wr->sg_list[i].addr);
+		len  = wr->sg_list[i].length;
+		inl += len;
+
+		if (unlikely(inl > qp->max_inline_data))
+			return -ENOMEM;
+
+		if (unlikely(wqe + len > qend)) {
+			copy = qend - wqe;
+			memcpy(wqe, addr, copy);
+			addr += copy;
+			len -= copy;
+			wqe = mlx5_get_send_wqe(qp, 0);
+		}
+		memcpy(wqe, addr, len);
+		wqe += len;
+	}
+
+	seg->byte_count = cpu_to_be32(inl | MLX5_INLINE_SEG);
+
+	*sz = ALIGN(inl + sizeof(seg->byte_count), 16) / 16;
+
+	return 0;
+}
+
+static u16 prot_field_size(enum ib_signature_type type)
+{
+	switch (type) {
+	case IB_SIG_TYPE_T10_DIF:
+		return MLX5_DIF_SIZE;
+	default:
+		return 0;
+	}
+}
+
+static u8 bs_selector(int block_size)
+{
+	switch (block_size) {
+	case 512:	    return 0x1;
+	case 520:	    return 0x2;
+	case 4096:	    return 0x3;
+	case 4160:	    return 0x4;
+	case 1073741824:    return 0x5;
+	default:	    return 0;
+	}
+}
+
+static void mlx5_fill_inl_bsf(struct ib_sig_domain *domain,
+			      struct mlx5_bsf_inl *inl)
+{
+	/* Valid inline section and allow BSF refresh */
+	inl->vld_refresh = cpu_to_be16(MLX5_BSF_INL_VALID |
+				       MLX5_BSF_REFRESH_DIF);
+	inl->dif_apptag = cpu_to_be16(domain->sig.dif.app_tag);
+	inl->dif_reftag = cpu_to_be32(domain->sig.dif.ref_tag);
+	/* repeating block */
+	inl->rp_inv_seed = MLX5_BSF_REPEAT_BLOCK;
+	inl->sig_type = domain->sig.dif.bg_type == IB_T10DIF_CRC ?
+			MLX5_DIF_CRC : MLX5_DIF_IPCS;
+
+	if (domain->sig.dif.ref_remap)
+		inl->dif_inc_ref_guard_check |= MLX5_BSF_INC_REFTAG;
+
+	if (domain->sig.dif.app_escape) {
+		if (domain->sig.dif.ref_escape)
+			inl->dif_inc_ref_guard_check |= MLX5_BSF_APPREF_ESCAPE;
+		else
+			inl->dif_inc_ref_guard_check |= MLX5_BSF_APPTAG_ESCAPE;
+	}
+
+	inl->dif_app_bitmask_check =
+		cpu_to_be16(domain->sig.dif.apptag_check_mask);
+}
+
+static int mlx5_set_bsf(struct ib_mr *sig_mr,
+			struct ib_sig_attrs *sig_attrs,
+			struct mlx5_bsf *bsf, u32 data_size)
+{
+	struct mlx5_core_sig_ctx *msig = to_mmr(sig_mr)->sig;
+	struct mlx5_bsf_basic *basic = &bsf->basic;
+	struct ib_sig_domain *mem = &sig_attrs->mem;
+	struct ib_sig_domain *wire = &sig_attrs->wire;
+
+	memset(bsf, 0, sizeof(*bsf));
+
+	/* Basic + Extended + Inline */
+	basic->bsf_size_sbs = 1 << 7;
+	/* Input domain check byte mask */
+	basic->check_byte_mask = sig_attrs->check_mask;
+	basic->raw_data_size = cpu_to_be32(data_size);
+
+	/* Memory domain */
+	switch (sig_attrs->mem.sig_type) {
+	case IB_SIG_TYPE_NONE:
+		break;
+	case IB_SIG_TYPE_T10_DIF:
+		basic->mem.bs_selector = bs_selector(mem->sig.dif.pi_interval);
+		basic->m_bfs_psv = cpu_to_be32(msig->psv_memory.psv_idx);
+		mlx5_fill_inl_bsf(mem, &bsf->m_inl);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	/* Wire domain */
+	switch (sig_attrs->wire.sig_type) {
+	case IB_SIG_TYPE_NONE:
+		break;
+	case IB_SIG_TYPE_T10_DIF:
+		if (mem->sig.dif.pi_interval == wire->sig.dif.pi_interval &&
+		    mem->sig_type == wire->sig_type) {
+			/* Same block structure */
+			basic->bsf_size_sbs |= 1 << 4;
+			if (mem->sig.dif.bg_type == wire->sig.dif.bg_type)
+				basic->wire.copy_byte_mask |= MLX5_CPY_GRD_MASK;
+			if (mem->sig.dif.app_tag == wire->sig.dif.app_tag)
+				basic->wire.copy_byte_mask |= MLX5_CPY_APP_MASK;
+			if (mem->sig.dif.ref_tag == wire->sig.dif.ref_tag)
+				basic->wire.copy_byte_mask |= MLX5_CPY_REF_MASK;
+		} else
+			basic->wire.bs_selector = bs_selector(wire->sig.dif.pi_interval);
+
+		basic->w_bfs_psv = cpu_to_be32(msig->psv_wire.psv_idx);
+		mlx5_fill_inl_bsf(wire, &bsf->w_inl);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int set_sig_data_segment(struct ib_send_wr *wr, struct mlx5_ib_qp *qp,
+				void **seg, int *size)
+{
+	struct ib_sig_attrs *sig_attrs = wr->wr.sig_handover.sig_attrs;
+	struct ib_mr *sig_mr = wr->wr.sig_handover.sig_mr;
+	struct mlx5_bsf *bsf;
+	u32 data_len = wr->sg_list->length;
+	u32 data_key = wr->sg_list->lkey;
+	u64 data_va = wr->sg_list->addr;
+	int ret;
+	int wqe_size;
+
+	if (!wr->wr.sig_handover.prot ||
+	    (data_key == wr->wr.sig_handover.prot->lkey &&
+	     data_va == wr->wr.sig_handover.prot->addr &&
+	     data_len == wr->wr.sig_handover.prot->length)) {
+		/**
+		 * Source domain doesn't contain signature information
+		 * or data and protection are interleaved in memory.
+		 * So need construct:
+		 *                  ------------------
+		 *                 |     data_klm     |
+		 *                  ------------------
+		 *                 |       BSF        |
+		 *                  ------------------
+		 **/
+		struct mlx5_klm *data_klm = *seg;
+
+		data_klm->bcount = cpu_to_be32(data_len);
+		data_klm->key = cpu_to_be32(data_key);
+		data_klm->va = cpu_to_be64(data_va);
+		wqe_size = ALIGN(sizeof(*data_klm), 64);
+	} else {
+		/**
+		 * Source domain contains signature information
+		 * So need construct a strided block format:
+		 *               ---------------------------
+		 *              |     stride_block_ctrl     |
+		 *               ---------------------------
+		 *              |          data_klm         |
+		 *               ---------------------------
+		 *              |          prot_klm         |
+		 *               ---------------------------
+		 *              |             BSF           |
+		 *               ---------------------------
+		 **/
+		struct mlx5_stride_block_ctrl_seg *sblock_ctrl;
+		struct mlx5_stride_block_entry *data_sentry;
+		struct mlx5_stride_block_entry *prot_sentry;
+		u32 prot_key = wr->wr.sig_handover.prot->lkey;
+		u64 prot_va = wr->wr.sig_handover.prot->addr;
+		u16 block_size = sig_attrs->mem.sig.dif.pi_interval;
+		int prot_size;
+
+		sblock_ctrl = *seg;
+		data_sentry = (void *)sblock_ctrl + sizeof(*sblock_ctrl);
+		prot_sentry = (void *)data_sentry + sizeof(*data_sentry);
+
+		prot_size = prot_field_size(sig_attrs->mem.sig_type);
+		if (!prot_size) {
+			pr_err("Bad block size given: %u\n", block_size);
+			return -EINVAL;
+		}
+		sblock_ctrl->bcount_per_cycle = cpu_to_be32(block_size +
+							    prot_size);
+		sblock_ctrl->op = cpu_to_be32(MLX5_STRIDE_BLOCK_OP);
+		sblock_ctrl->repeat_count = cpu_to_be32(data_len / block_size);
+		sblock_ctrl->num_entries = cpu_to_be16(2);
+
+		data_sentry->bcount = cpu_to_be16(block_size);
+		data_sentry->key = cpu_to_be32(data_key);
+		data_sentry->va = cpu_to_be64(data_va);
+		data_sentry->stride = cpu_to_be16(block_size);
+
+		prot_sentry->bcount = cpu_to_be16(prot_size);
+		prot_sentry->key = cpu_to_be32(prot_key);
+		prot_sentry->va = cpu_to_be64(prot_va);
+		prot_sentry->stride = cpu_to_be16(prot_size);
+
+		wqe_size = ALIGN(sizeof(*sblock_ctrl) + sizeof(*data_sentry) +
+				 sizeof(*prot_sentry), 64);
+	}
+
+	*seg += wqe_size;
+	*size += wqe_size / 16;
+	if (unlikely((*seg == qp->sq.qend)))
+		*seg = mlx5_get_send_wqe(qp, 0);
+
+	bsf = *seg;
+	ret = mlx5_set_bsf(sig_mr, sig_attrs, bsf, data_len);
+	if (ret)
+		return -EINVAL;
+
+	*seg += sizeof(*bsf);
+	*size += sizeof(*bsf) / 16;
+	if (unlikely((*seg == qp->sq.qend)))
+		*seg = mlx5_get_send_wqe(qp, 0);
+
+	return 0;
+}
+
+static void set_sig_mkey_segment(struct mlx5_mkey_seg *seg,
+				 struct ib_send_wr *wr, u32 nelements,
+				 u32 length, u32 pdn)
+{
+	struct ib_mr *sig_mr = wr->wr.sig_handover.sig_mr;
+	u32 sig_key = sig_mr->rkey;
+	u8 sigerr = to_mmr(sig_mr)->sig->sigerr_count & 1;
+
+	memset(seg, 0, sizeof(*seg));
+
+	seg->flags = get_umr_flags(wr->wr.sig_handover.access_flags) |
+				   MLX5_ACCESS_MODE_KLM;
+	seg->qpn_mkey7_0 = cpu_to_be32((sig_key & 0xff) | 0xffffff00);
+	seg->flags_pd = cpu_to_be32(MLX5_MKEY_REMOTE_INVAL | sigerr << 26 |
+				    MLX5_MKEY_BSF_EN | pdn);
+	seg->len = cpu_to_be64(length);
+	seg->xlt_oct_size = cpu_to_be32(be16_to_cpu(get_klm_octo(nelements)));
+	seg->bsfs_octo_size = cpu_to_be32(MLX5_MKEY_BSF_OCTO_SIZE);
+}
+
+static void set_sig_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr,
+				struct ib_send_wr *wr, u32 nelements)
+{
+	memset(umr, 0, sizeof(*umr));
+
+	umr->flags = MLX5_FLAGS_INLINE | MLX5_FLAGS_CHECK_FREE;
+	umr->klm_octowords = get_klm_octo(nelements);
+	umr->bsf_octowords = cpu_to_be16(MLX5_MKEY_BSF_OCTO_SIZE);
+	umr->mkey_mask = sig_mkey_mask();
+}
+
+
+static int set_sig_umr_wr(struct ib_send_wr *wr, struct mlx5_ib_qp *qp,
+			  void **seg, int *size)
+{
+	struct mlx5_ib_mr *sig_mr = to_mmr(wr->wr.sig_handover.sig_mr);
+	u32 pdn = get_pd(qp)->pdn;
+	u32 klm_oct_size;
+	int region_len, ret;
+
+	if (unlikely(wr->num_sge != 1) ||
+	    unlikely(wr->wr.sig_handover.access_flags &
+		     IB_ACCESS_REMOTE_ATOMIC) ||
+	    unlikely(!sig_mr->sig) || unlikely(!qp->signature_en) ||
+	    unlikely(!sig_mr->sig->sig_status_checked))
+		return -EINVAL;
+
+	/* length of the protected region, data + protection */
+	region_len = wr->sg_list->length;
+	if (wr->wr.sig_handover.prot &&
+	    (wr->wr.sig_handover.prot->lkey != wr->sg_list->lkey  ||
+	     wr->wr.sig_handover.prot->addr != wr->sg_list->addr  ||
+	     wr->wr.sig_handover.prot->length != wr->sg_list->length))
+		region_len += wr->wr.sig_handover.prot->length;
+
+	/**
+	 * KLM octoword size - if protection was provided
+	 * then we use strided block format (3 octowords),
+	 * else we use single KLM (1 octoword)
+	 **/
+	klm_oct_size = wr->wr.sig_handover.prot ? 3 : 1;
+
+	set_sig_umr_segment(*seg, wr, klm_oct_size);
+	*seg += sizeof(struct mlx5_wqe_umr_ctrl_seg);
+	*size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16;
+	if (unlikely((*seg == qp->sq.qend)))
+		*seg = mlx5_get_send_wqe(qp, 0);
+
+	set_sig_mkey_segment(*seg, wr, klm_oct_size, region_len, pdn);
+	*seg += sizeof(struct mlx5_mkey_seg);
+	*size += sizeof(struct mlx5_mkey_seg) / 16;
+	if (unlikely((*seg == qp->sq.qend)))
+		*seg = mlx5_get_send_wqe(qp, 0);
+
+	ret = set_sig_data_segment(wr, qp, seg, size);
+	if (ret)
+		return ret;
+
+	sig_mr->sig->sig_status_checked = false;
+	return 0;
+}
+
+static int set_psv_wr(struct ib_sig_domain *domain,
+		      u32 psv_idx, void **seg, int *size)
+{
+	struct mlx5_seg_set_psv *psv_seg = *seg;
+
+	memset(psv_seg, 0, sizeof(*psv_seg));
+	psv_seg->psv_num = cpu_to_be32(psv_idx);
+	switch (domain->sig_type) {
+	case IB_SIG_TYPE_NONE:
+		break;
+	case IB_SIG_TYPE_T10_DIF:
+		psv_seg->transient_sig = cpu_to_be32(domain->sig.dif.bg << 16 |
+						     domain->sig.dif.app_tag);
+		psv_seg->ref_tag = cpu_to_be32(domain->sig.dif.ref_tag);
+		break;
+	default:
+		pr_err("Bad signature type given.\n");
+		return 1;
+	}
+
+	*seg += sizeof(*psv_seg);
+	*size += sizeof(*psv_seg) / 16;
+
+	return 0;
+}
+
+static int set_frwr_li_wr(void **seg, struct ib_send_wr *wr, int *size,
+			  struct mlx5_core_dev *mdev, struct mlx5_ib_pd *pd, struct mlx5_ib_qp *qp)
+{
+	int writ = 0;
+	int li;
+
+	li = wr->opcode == IB_WR_LOCAL_INV ? 1 : 0;
+	if (unlikely(wr->send_flags & IB_SEND_INLINE))
+		return -EINVAL;
+
+	set_frwr_umr_segment(*seg, wr, li);
+	*seg += sizeof(struct mlx5_wqe_umr_ctrl_seg);
+	*size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16;
+	if (unlikely((*seg == qp->sq.qend)))
+		*seg = mlx5_get_send_wqe(qp, 0);
+	set_mkey_segment(*seg, wr, li, &writ);
+	*seg += sizeof(struct mlx5_mkey_seg);
+	*size += sizeof(struct mlx5_mkey_seg) / 16;
+	if (unlikely((*seg == qp->sq.qend)))
+		*seg = mlx5_get_send_wqe(qp, 0);
+	if (!li) {
+		if (unlikely(wr->wr.fast_reg.page_list_len >
+			     wr->wr.fast_reg.page_list->max_page_list_len))
+			return	-ENOMEM;
+
+		set_frwr_pages(*seg, wr, mdev, pd, writ);
+		*seg += sizeof(struct mlx5_wqe_data_seg);
+		*size += (sizeof(struct mlx5_wqe_data_seg) / 16);
+	}
+	return 0;
+}
+
+static void dump_wqe(struct mlx5_ib_qp *qp, int idx, int size_16)
+{
+	__be32 *p = NULL;
+	int tidx = idx;
+	int i, j;
+
+	pr_debug("dump wqe at %p\n", mlx5_get_send_wqe(qp, tidx));
+	for (i = 0, j = 0; i < size_16 * 4; i += 4, j += 4) {
+		if ((i & 0xf) == 0) {
+			void *buf = mlx5_get_send_wqe(qp, tidx);
+			tidx = (tidx + 1) & (qp->sq.wqe_cnt - 1);
+			p = buf;
+			j = 0;
+		}
+		pr_debug("%08x %08x %08x %08x\n", be32_to_cpu(p[j]),
+			 be32_to_cpu(p[j + 1]), be32_to_cpu(p[j + 2]),
+			 be32_to_cpu(p[j + 3]));
+	}
+}
+
+static void mlx5_bf_copy(u64 __iomem *dst, u64 *src,
+			 unsigned bytecnt, struct mlx5_ib_qp *qp)
+{
+	while (bytecnt > 0) {
+		__iowrite64_copy(dst++, src++, 8);
+		__iowrite64_copy(dst++, src++, 8);
+		__iowrite64_copy(dst++, src++, 8);
+		__iowrite64_copy(dst++, src++, 8);
+		__iowrite64_copy(dst++, src++, 8);
+		__iowrite64_copy(dst++, src++, 8);
+		__iowrite64_copy(dst++, src++, 8);
+		__iowrite64_copy(dst++, src++, 8);
+		bytecnt -= 64;
+		if (unlikely(src == qp->sq.qend))
+			src = mlx5_get_send_wqe(qp, 0);
+	}
+}
+
+static u8 get_fence(u8 fence, struct ib_send_wr *wr)
+{
+	if (unlikely(wr->opcode == IB_WR_LOCAL_INV &&
+		     wr->send_flags & IB_SEND_FENCE))
+		return MLX5_FENCE_MODE_STRONG_ORDERING;
+
+	if (unlikely(fence)) {
+		if (wr->send_flags & IB_SEND_FENCE)
+			return MLX5_FENCE_MODE_SMALL_AND_FENCE;
+		else
+			return fence;
+
+	} else {
+		return 0;
+	}
+}
+
+static int begin_wqe(struct mlx5_ib_qp *qp, void **seg,
+		     struct mlx5_wqe_ctrl_seg **ctrl,
+		     struct ib_send_wr *wr, int *idx,
+		     int *size, int nreq)
+{
+	int err = 0;
+
+	if (unlikely(mlx5_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq))) {
+		err = -ENOMEM;
+		return err;
+	}
+
+	*idx = qp->sq.cur_post & (qp->sq.wqe_cnt - 1);
+	*seg = mlx5_get_send_wqe(qp, *idx);
+	*ctrl = *seg;
+	*(uint32_t *)(*seg + 8) = 0;
+	(*ctrl)->imm = send_ieth(wr);
+	(*ctrl)->fm_ce_se = qp->sq_signal_bits |
+		(wr->send_flags & IB_SEND_SIGNALED ?
+		 MLX5_WQE_CTRL_CQ_UPDATE : 0) |
+		(wr->send_flags & IB_SEND_SOLICITED ?
+		 MLX5_WQE_CTRL_SOLICITED : 0);
+
+	*seg += sizeof(**ctrl);
+	*size = sizeof(**ctrl) / 16;
+
+	return err;
+}
+
+static void finish_wqe(struct mlx5_ib_qp *qp,
+		       struct mlx5_wqe_ctrl_seg *ctrl,
+		       u8 size, unsigned idx, u64 wr_id,
+		       int nreq, u8 fence, u8 next_fence,
+		       u32 mlx5_opcode)
+{
+	u8 opmod = 0;
+
+	ctrl->opmod_idx_opcode = cpu_to_be32(((u32)(qp->sq.cur_post) << 8) |
+					     mlx5_opcode | ((u32)opmod << 24));
+	ctrl->qpn_ds = cpu_to_be32(size | (qp->mqp.qpn << 8));
+	ctrl->fm_ce_se |= fence;
+	qp->fm_cache = next_fence;
+	if (unlikely(qp->wq_sig))
+		ctrl->signature = wq_sig(ctrl);
+
+	qp->sq.wrid[idx] = wr_id;
+	qp->sq.w_list[idx].opcode = mlx5_opcode;
+	qp->sq.wqe_head[idx] = qp->sq.head + nreq;
+	qp->sq.cur_post += DIV_ROUND_UP(size * 16, MLX5_SEND_WQE_BB);
+	qp->sq.w_list[idx].next = qp->sq.cur_post;
+}
+
+
+int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+		      struct ib_send_wr **bad_wr)
+{
+	struct mlx5_wqe_ctrl_seg *ctrl = NULL;  /* compiler warning */
+	struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
+	struct mlx5_core_dev *mdev = dev->mdev;
+	struct mlx5_ib_qp *qp = to_mqp(ibqp);
+	struct mlx5_ib_mr *mr;
+	struct mlx5_wqe_data_seg *dpseg;
+	struct mlx5_wqe_xrc_seg *xrc;
+	struct mlx5_bf *bf = qp->bf;
+	int uninitialized_var(size);
+	void *qend = qp->sq.qend;
+	unsigned long flags;
+	unsigned idx;
+	int err = 0;
+	int inl = 0;
+	int num_sge;
+	void *seg;
+	int nreq;
+	int i;
+	u8 next_fence = 0;
+	u8 fence;
+
+	spin_lock_irqsave(&qp->sq.lock, flags);
+
+	for (nreq = 0; wr; nreq++, wr = wr->next) {
+		if (unlikely(wr->opcode >= ARRAY_SIZE(mlx5_ib_opcode))) {
+			mlx5_ib_warn(dev, "\n");
+			err = -EINVAL;
+			*bad_wr = wr;
+			goto out;
+		}
+
+		fence = qp->fm_cache;
+		num_sge = wr->num_sge;
+		if (unlikely(num_sge > qp->sq.max_gs)) {
+			mlx5_ib_warn(dev, "\n");
+			err = -ENOMEM;
+			*bad_wr = wr;
+			goto out;
+		}
+
+		err = begin_wqe(qp, &seg, &ctrl, wr, &idx, &size, nreq);
+		if (err) {
+			mlx5_ib_warn(dev, "\n");
+			err = -ENOMEM;
+			*bad_wr = wr;
+			goto out;
+		}
+
+		switch (ibqp->qp_type) {
+		case IB_QPT_XRC_INI:
+			xrc = seg;
+			xrc->xrc_srqn = htonl(wr->xrc_remote_srq_num);
+			seg += sizeof(*xrc);
+			size += sizeof(*xrc) / 16;
+			/* fall through */
+		case IB_QPT_RC:
+			switch (wr->opcode) {
+			case IB_WR_RDMA_READ:
+			case IB_WR_RDMA_WRITE:
+			case IB_WR_RDMA_WRITE_WITH_IMM:
+				set_raddr_seg(seg, wr->wr.rdma.remote_addr,
+					      wr->wr.rdma.rkey);
+				seg += sizeof(struct mlx5_wqe_raddr_seg);
+				size += sizeof(struct mlx5_wqe_raddr_seg) / 16;
+				break;
+
+			case IB_WR_ATOMIC_CMP_AND_SWP:
+			case IB_WR_ATOMIC_FETCH_AND_ADD:
+			case IB_WR_MASKED_ATOMIC_CMP_AND_SWP:
+				mlx5_ib_warn(dev, "Atomic operations are not supported yet\n");
+				err = -ENOSYS;
+				*bad_wr = wr;
+				goto out;
+
+			case IB_WR_LOCAL_INV:
+				next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL;
+				qp->sq.wr_data[idx] = IB_WR_LOCAL_INV;
+				ctrl->imm = cpu_to_be32(wr->ex.invalidate_rkey);
+				err = set_frwr_li_wr(&seg, wr, &size, mdev, to_mpd(ibqp->pd), qp);
+				if (err) {
+					mlx5_ib_warn(dev, "\n");
+					*bad_wr = wr;
+					goto out;
+				}
+				num_sge = 0;
+				break;
+
+			case IB_WR_FAST_REG_MR:
+				next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL;
+				qp->sq.wr_data[idx] = IB_WR_FAST_REG_MR;
+				ctrl->imm = cpu_to_be32(wr->wr.fast_reg.rkey);
+				err = set_frwr_li_wr(&seg, wr, &size, mdev, to_mpd(ibqp->pd), qp);
+				if (err) {
+					mlx5_ib_warn(dev, "\n");
+					*bad_wr = wr;
+					goto out;
+				}
+				num_sge = 0;
+				break;
+
+			case IB_WR_REG_SIG_MR:
+				qp->sq.wr_data[idx] = IB_WR_REG_SIG_MR;
+				mr = to_mmr(wr->wr.sig_handover.sig_mr);
+
+				ctrl->imm = cpu_to_be32(mr->ibmr.rkey);
+				err = set_sig_umr_wr(wr, qp, &seg, &size);
+				if (err) {
+					mlx5_ib_warn(dev, "\n");
+					*bad_wr = wr;
+					goto out;
+				}
+
+				finish_wqe(qp, ctrl, size, idx, wr->wr_id,
+					   nreq, get_fence(fence, wr),
+					   next_fence, MLX5_OPCODE_UMR);
+				/*
+				 * SET_PSV WQEs are not signaled and solicited
+				 * on error
+				 */
+				wr->send_flags &= ~IB_SEND_SIGNALED;
+				wr->send_flags |= IB_SEND_SOLICITED;
+				err = begin_wqe(qp, &seg, &ctrl, wr,
+						&idx, &size, nreq);
+				if (err) {
+					mlx5_ib_warn(dev, "\n");
+					err = -ENOMEM;
+					*bad_wr = wr;
+					goto out;
+				}
+
+				err = set_psv_wr(&wr->wr.sig_handover.sig_attrs->mem,
+						 mr->sig->psv_memory.psv_idx, &seg,
+						 &size);
+				if (err) {
+					mlx5_ib_warn(dev, "\n");
+					*bad_wr = wr;
+					goto out;
+				}
+
+				finish_wqe(qp, ctrl, size, idx, wr->wr_id,
+					   nreq, get_fence(fence, wr),
+					   next_fence, MLX5_OPCODE_SET_PSV);
+				err = begin_wqe(qp, &seg, &ctrl, wr,
+						&idx, &size, nreq);
+				if (err) {
+					mlx5_ib_warn(dev, "\n");
+					err = -ENOMEM;
+					*bad_wr = wr;
+					goto out;
+				}
+
+				next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL;
+				err = set_psv_wr(&wr->wr.sig_handover.sig_attrs->wire,
+						 mr->sig->psv_wire.psv_idx, &seg,
+						 &size);
+				if (err) {
+					mlx5_ib_warn(dev, "\n");
+					*bad_wr = wr;
+					goto out;
+				}
+
+				finish_wqe(qp, ctrl, size, idx, wr->wr_id,
+					   nreq, get_fence(fence, wr),
+					   next_fence, MLX5_OPCODE_SET_PSV);
+				num_sge = 0;
+				goto skip_psv;
+
+			default:
+				break;
+			}
+			break;
+
+		case IB_QPT_UC:
+			switch (wr->opcode) {
+			case IB_WR_RDMA_WRITE:
+			case IB_WR_RDMA_WRITE_WITH_IMM:
+				set_raddr_seg(seg, wr->wr.rdma.remote_addr,
+					      wr->wr.rdma.rkey);
+				seg  += sizeof(struct mlx5_wqe_raddr_seg);
+				size += sizeof(struct mlx5_wqe_raddr_seg) / 16;
+				break;
+
+			default:
+				break;
+			}
+			break;
+
+		case IB_QPT_UD:
+		case IB_QPT_SMI:
+		case IB_QPT_GSI:
+			set_datagram_seg(seg, wr);
+			seg += sizeof(struct mlx5_wqe_datagram_seg);
+			size += sizeof(struct mlx5_wqe_datagram_seg) / 16;
+			if (unlikely((seg == qend)))
+				seg = mlx5_get_send_wqe(qp, 0);
+			break;
+
+		case MLX5_IB_QPT_REG_UMR:
+			if (wr->opcode != MLX5_IB_WR_UMR) {
+				err = -EINVAL;
+				mlx5_ib_warn(dev, "bad opcode\n");
+				goto out;
+			}
+			qp->sq.wr_data[idx] = MLX5_IB_WR_UMR;
+			ctrl->imm = cpu_to_be32(wr->wr.fast_reg.rkey);
+			set_reg_umr_segment(seg, wr);
+			seg += sizeof(struct mlx5_wqe_umr_ctrl_seg);
+			size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16;
+			if (unlikely((seg == qend)))
+				seg = mlx5_get_send_wqe(qp, 0);
+			set_reg_mkey_segment(seg, wr);
+			seg += sizeof(struct mlx5_mkey_seg);
+			size += sizeof(struct mlx5_mkey_seg) / 16;
+			if (unlikely((seg == qend)))
+				seg = mlx5_get_send_wqe(qp, 0);
+			break;
+
+		default:
+			break;
+		}
+
+		if (wr->send_flags & IB_SEND_INLINE && num_sge) {
+			int uninitialized_var(sz);
+
+			err = set_data_inl_seg(qp, wr, seg, &sz);
+			if (unlikely(err)) {
+				mlx5_ib_warn(dev, "\n");
+				*bad_wr = wr;
+				goto out;
+			}
+			inl = 1;
+			size += sz;
+		} else {
+			dpseg = seg;
+			for (i = 0; i < num_sge; i++) {
+				if (unlikely(dpseg == qend)) {
+					seg = mlx5_get_send_wqe(qp, 0);
+					dpseg = seg;
+				}
+				if (likely(wr->sg_list[i].length)) {
+					set_data_ptr_seg(dpseg, wr->sg_list + i);
+					size += sizeof(struct mlx5_wqe_data_seg) / 16;
+					dpseg++;
+				}
+			}
+		}
+
+		finish_wqe(qp, ctrl, size, idx, wr->wr_id, nreq,
+			   get_fence(fence, wr), next_fence,
+			   mlx5_ib_opcode[wr->opcode]);
+skip_psv:
+		if (0)
+			dump_wqe(qp, idx, size);
+	}
+
+out:
+	if (likely(nreq)) {
+		qp->sq.head += nreq;
+
+		/* Make sure that descriptors are written before
+		 * updating doorbell record and ringing the doorbell
+		 */
+		wmb();
+
+		qp->db.db[MLX5_SND_DBR] = cpu_to_be32(qp->sq.cur_post);
+
+		/* Make sure doorbell record is visible to the HCA before
+		 * we hit doorbell */
+		wmb();
+
+		if (bf->need_lock)
+			spin_lock(&bf->lock);
+
+		/* TBD enable WC */
+		if (0 && nreq == 1 && bf->uuarn && inl && size > 1 && size <= bf->buf_size / 16) {
+			mlx5_bf_copy(bf->reg + bf->offset, (u64 *)ctrl, ALIGN(size * 16, 64), qp);
+			/* wc_wmb(); */
+		} else {
+			mlx5_write64((__be32 *)ctrl, bf->regreg + bf->offset,
+				     MLX5_GET_DOORBELL_LOCK(&bf->lock32));
+			/* Make sure doorbells don't leak out of SQ spinlock
+			 * and reach the HCA out of order.
+			 */
+			mmiowb();
+		}
+		bf->offset ^= bf->buf_size;
+		if (bf->need_lock)
+			spin_unlock(&bf->lock);
+	}
+
+	spin_unlock_irqrestore(&qp->sq.lock, flags);
+
+	return err;
+}
+
+static void set_sig_seg(struct mlx5_rwqe_sig *sig, int size)
+{
+	sig->signature = calc_sig(sig, size);
+}
+
+int mlx5_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+		      struct ib_recv_wr **bad_wr)
+{
+	struct mlx5_ib_qp *qp = to_mqp(ibqp);
+	struct mlx5_wqe_data_seg *scat;
+	struct mlx5_rwqe_sig *sig;
+	unsigned long flags;
+	int err = 0;
+	int nreq;
+	int ind;
+	int i;
+
+	spin_lock_irqsave(&qp->rq.lock, flags);
+
+	ind = qp->rq.head & (qp->rq.wqe_cnt - 1);
+
+	for (nreq = 0; wr; nreq++, wr = wr->next) {
+		if (mlx5_wq_overflow(&qp->rq, nreq, qp->ibqp.recv_cq)) {
+			err = -ENOMEM;
+			*bad_wr = wr;
+			goto out;
+		}
+
+		if (unlikely(wr->num_sge > qp->rq.max_gs)) {
+			err = -EINVAL;
+			*bad_wr = wr;
+			goto out;
+		}
+
+		scat = get_recv_wqe(qp, ind);
+		if (qp->wq_sig)
+			scat++;
+
+		for (i = 0; i < wr->num_sge; i++)
+			set_data_ptr_seg(scat + i, wr->sg_list + i);
+
+		if (i < qp->rq.max_gs) {
+			scat[i].byte_count = 0;
+			scat[i].lkey       = cpu_to_be32(MLX5_INVALID_LKEY);
+			scat[i].addr       = 0;
+		}
+
+		if (qp->wq_sig) {
+			sig = (struct mlx5_rwqe_sig *)scat;
+			set_sig_seg(sig, (qp->rq.max_gs + 1) << 2);
+		}
+
+		qp->rq.wrid[ind] = wr->wr_id;
+
+		ind = (ind + 1) & (qp->rq.wqe_cnt - 1);
+	}
+
+out:
+	if (likely(nreq)) {
+		qp->rq.head += nreq;
+
+		/* Make sure that descriptors are written before
+		 * doorbell record.
+		 */
+		wmb();
+
+		*qp->db.db = cpu_to_be32(qp->rq.head & 0xffff);
+	}
+
+	spin_unlock_irqrestore(&qp->rq.lock, flags);
+
+	return err;
+}
+
+static inline enum ib_qp_state to_ib_qp_state(enum mlx5_qp_state mlx5_state)
+{
+	switch (mlx5_state) {
+	case MLX5_QP_STATE_RST:      return IB_QPS_RESET;
+	case MLX5_QP_STATE_INIT:     return IB_QPS_INIT;
+	case MLX5_QP_STATE_RTR:      return IB_QPS_RTR;
+	case MLX5_QP_STATE_RTS:      return IB_QPS_RTS;
+	case MLX5_QP_STATE_SQ_DRAINING:
+	case MLX5_QP_STATE_SQD:      return IB_QPS_SQD;
+	case MLX5_QP_STATE_SQER:     return IB_QPS_SQE;
+	case MLX5_QP_STATE_ERR:      return IB_QPS_ERR;
+	default:		     return -1;
+	}
+}
+
+static inline enum ib_mig_state to_ib_mig_state(int mlx5_mig_state)
+{
+	switch (mlx5_mig_state) {
+	case MLX5_QP_PM_ARMED:		return IB_MIG_ARMED;
+	case MLX5_QP_PM_REARM:		return IB_MIG_REARM;
+	case MLX5_QP_PM_MIGRATED:	return IB_MIG_MIGRATED;
+	default: return -1;
+	}
+}
+
+static int to_ib_qp_access_flags(int mlx5_flags)
+{
+	int ib_flags = 0;
+
+	if (mlx5_flags & MLX5_QP_BIT_RRE)
+		ib_flags |= IB_ACCESS_REMOTE_READ;
+	if (mlx5_flags & MLX5_QP_BIT_RWE)
+		ib_flags |= IB_ACCESS_REMOTE_WRITE;
+	if (mlx5_flags & MLX5_QP_BIT_RAE)
+		ib_flags |= IB_ACCESS_REMOTE_ATOMIC;
+
+	return ib_flags;
+}
+
+static void to_ib_ah_attr(struct mlx5_ib_dev *ibdev, struct ib_ah_attr *ib_ah_attr,
+				struct mlx5_qp_path *path)
+{
+	struct mlx5_core_dev *dev = ibdev->mdev;
+
+	memset(ib_ah_attr, 0, sizeof(*ib_ah_attr));
+	ib_ah_attr->port_num	  = path->port;
+
+	if (ib_ah_attr->port_num == 0 ||
+	    ib_ah_attr->port_num > dev->caps.gen.num_ports)
+		return;
+
+	ib_ah_attr->sl = path->sl & 0xf;
+
+	ib_ah_attr->dlid	  = be16_to_cpu(path->rlid);
+	ib_ah_attr->src_path_bits = path->grh_mlid & 0x7f;
+	ib_ah_attr->static_rate   = path->static_rate ? path->static_rate - 5 : 0;
+	ib_ah_attr->ah_flags      = (path->grh_mlid & (1 << 7)) ? IB_AH_GRH : 0;
+	if (ib_ah_attr->ah_flags) {
+		ib_ah_attr->grh.sgid_index = path->mgid_index;
+		ib_ah_attr->grh.hop_limit  = path->hop_limit;
+		ib_ah_attr->grh.traffic_class =
+			(be32_to_cpu(path->tclass_flowlabel) >> 20) & 0xff;
+		ib_ah_attr->grh.flow_label =
+			be32_to_cpu(path->tclass_flowlabel) & 0xfffff;
+		memcpy(ib_ah_attr->grh.dgid.raw,
+		       path->rgid, sizeof(ib_ah_attr->grh.dgid.raw));
+	}
+}
+
+int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask,
+		     struct ib_qp_init_attr *qp_init_attr)
+{
+	struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
+	struct mlx5_ib_qp *qp = to_mqp(ibqp);
+	struct mlx5_query_qp_mbox_out *outb;
+	struct mlx5_qp_context *context;
+	int mlx5_state;
+	int err = 0;
+
+	mutex_lock(&qp->mutex);
+	outb = kzalloc(sizeof(*outb), GFP_KERNEL);
+	if (!outb) {
+		err = -ENOMEM;
+		goto out;
+	}
+	context = &outb->ctx;
+	err = mlx5_core_qp_query(dev->mdev, &qp->mqp, outb, sizeof(*outb));
+	if (err)
+		goto out_free;
+
+	mlx5_state = be32_to_cpu(context->flags) >> 28;
+
+	qp->state		     = to_ib_qp_state(mlx5_state);
+	qp_attr->qp_state	     = qp->state;
+	qp_attr->path_mtu	     = context->mtu_msgmax >> 5;
+	qp_attr->path_mig_state	     =
+		to_ib_mig_state((be32_to_cpu(context->flags) >> 11) & 0x3);
+	qp_attr->qkey		     = be32_to_cpu(context->qkey);
+	qp_attr->rq_psn		     = be32_to_cpu(context->rnr_nextrecvpsn) & 0xffffff;
+	qp_attr->sq_psn		     = be32_to_cpu(context->next_send_psn) & 0xffffff;
+	qp_attr->dest_qp_num	     = be32_to_cpu(context->log_pg_sz_remote_qpn) & 0xffffff;
+	qp_attr->qp_access_flags     =
+		to_ib_qp_access_flags(be32_to_cpu(context->params2));
+
+	if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC) {
+		to_ib_ah_attr(dev, &qp_attr->ah_attr, &context->pri_path);
+		to_ib_ah_attr(dev, &qp_attr->alt_ah_attr, &context->alt_path);
+		qp_attr->alt_pkey_index = context->alt_path.pkey_index & 0x7f;
+		qp_attr->alt_port_num	= qp_attr->alt_ah_attr.port_num;
+	}
+
+	qp_attr->pkey_index = context->pri_path.pkey_index & 0x7f;
+	qp_attr->port_num = context->pri_path.port;
+
+	/* qp_attr->en_sqd_async_notify is only applicable in modify qp */
+	qp_attr->sq_draining = mlx5_state == MLX5_QP_STATE_SQ_DRAINING;
+
+	qp_attr->max_rd_atomic = 1 << ((be32_to_cpu(context->params1) >> 21) & 0x7);
+
+	qp_attr->max_dest_rd_atomic =
+		1 << ((be32_to_cpu(context->params2) >> 21) & 0x7);
+	qp_attr->min_rnr_timer	    =
+		(be32_to_cpu(context->rnr_nextrecvpsn) >> 24) & 0x1f;
+	qp_attr->timeout	    = context->pri_path.ackto_lt >> 3;
+	qp_attr->retry_cnt	    = (be32_to_cpu(context->params1) >> 16) & 0x7;
+	qp_attr->rnr_retry	    = (be32_to_cpu(context->params1) >> 13) & 0x7;
+	qp_attr->alt_timeout	    = context->alt_path.ackto_lt >> 3;
+	qp_attr->cur_qp_state	     = qp_attr->qp_state;
+	qp_attr->cap.max_recv_wr     = qp->rq.wqe_cnt;
+	qp_attr->cap.max_recv_sge    = qp->rq.max_gs;
+
+	if (!ibqp->uobject) {
+		qp_attr->cap.max_send_wr  = qp->sq.wqe_cnt;
+		qp_attr->cap.max_send_sge = qp->sq.max_gs;
+	} else {
+		qp_attr->cap.max_send_wr  = 0;
+		qp_attr->cap.max_send_sge = 0;
+	}
+
+	/* We don't support inline sends for kernel QPs (yet), and we
+	 * don't know what userspace's value should be.
+	 */
+	qp_attr->cap.max_inline_data = 0;
+
+	qp_init_attr->cap	     = qp_attr->cap;
+
+	qp_init_attr->create_flags = 0;
+	if (qp->flags & MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK)
+		qp_init_attr->create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK;
+
+	qp_init_attr->sq_sig_type = qp->sq_signal_bits & MLX5_WQE_CTRL_CQ_UPDATE ?
+		IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR;
+
+out_free:
+	kfree(outb);
+
+out:
+	mutex_unlock(&qp->mutex);
+	return err;
+}
+
+struct ib_xrcd *mlx5_ib_alloc_xrcd(struct ib_device *ibdev,
+					  struct ib_ucontext *context,
+					  struct ib_udata *udata)
+{
+	struct mlx5_ib_dev *dev = to_mdev(ibdev);
+	struct mlx5_general_caps *gen;
+	struct mlx5_ib_xrcd *xrcd;
+	int err;
+
+	gen = &dev->mdev->caps.gen;
+	if (!(gen->flags & MLX5_DEV_CAP_FLAG_XRC))
+		return ERR_PTR(-ENOSYS);
+
+	xrcd = kmalloc(sizeof(*xrcd), GFP_KERNEL);
+	if (!xrcd)
+		return ERR_PTR(-ENOMEM);
+
+	err = mlx5_core_xrcd_alloc(dev->mdev, &xrcd->xrcdn);
+	if (err) {
+		kfree(xrcd);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	return &xrcd->ibxrcd;
+}
+
+int mlx5_ib_dealloc_xrcd(struct ib_xrcd *xrcd)
+{
+	struct mlx5_ib_dev *dev = to_mdev(xrcd->device);
+	u32 xrcdn = to_mxrcd(xrcd)->xrcdn;
+	int err;
+
+	err = mlx5_core_xrcd_dealloc(dev->mdev, xrcdn);
+	if (err) {
+		mlx5_ib_warn(dev, "failed to dealloc xrcdn 0x%x\n", xrcdn);
+		return err;
+	}
+
+	kfree(xrcd);
+
+	return 0;
+}
diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c
new file mode 100644
index 0000000..97cc1ba
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/srq.c
@@ -0,0 +1,487 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/mlx5/qp.h>
+#include <linux/mlx5/srq.h>
+#include <linux/slab.h>
+#include <rdma/ib_umem.h>
+#include <rdma/ib_user_verbs.h>
+
+#include "mlx5_ib.h"
+#include "user.h"
+
+/* not supported currently */
+static int srq_signature;
+
+static void *get_wqe(struct mlx5_ib_srq *srq, int n)
+{
+	return mlx5_buf_offset(&srq->buf, n << srq->msrq.wqe_shift);
+}
+
+static void mlx5_ib_srq_event(struct mlx5_core_srq *srq, enum mlx5_event type)
+{
+	struct ib_event event;
+	struct ib_srq *ibsrq = &to_mibsrq(srq)->ibsrq;
+
+	if (ibsrq->event_handler) {
+		event.device      = ibsrq->device;
+		event.element.srq = ibsrq;
+		switch (type) {
+		case MLX5_EVENT_TYPE_SRQ_RQ_LIMIT:
+			event.event = IB_EVENT_SRQ_LIMIT_REACHED;
+			break;
+		case MLX5_EVENT_TYPE_SRQ_CATAS_ERROR:
+			event.event = IB_EVENT_SRQ_ERR;
+			break;
+		default:
+			pr_warn("mlx5_ib: Unexpected event type %d on SRQ %06x\n",
+				type, srq->srqn);
+			return;
+		}
+
+		ibsrq->event_handler(&event, ibsrq->srq_context);
+	}
+}
+
+static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq,
+			   struct mlx5_create_srq_mbox_in **in,
+			   struct ib_udata *udata, int buf_size, int *inlen)
+{
+	struct mlx5_ib_dev *dev = to_mdev(pd->device);
+	struct mlx5_ib_create_srq ucmd;
+	size_t ucmdlen;
+	int err;
+	int npages;
+	int page_shift;
+	int ncont;
+	u32 offset;
+
+	ucmdlen =
+		(udata->inlen - sizeof(struct ib_uverbs_cmd_hdr) <
+		 sizeof(ucmd)) ? (sizeof(ucmd) -
+				  sizeof(ucmd.reserved)) : sizeof(ucmd);
+
+	if (ib_copy_from_udata(&ucmd, udata, ucmdlen)) {
+		mlx5_ib_dbg(dev, "failed copy udata\n");
+		return -EFAULT;
+	}
+
+	if (ucmdlen == sizeof(ucmd) &&
+	    ucmd.reserved != 0)
+		return -EINVAL;
+
+	srq->wq_sig = !!(ucmd.flags & MLX5_SRQ_FLAG_SIGNATURE);
+
+	srq->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr, buf_size,
+				0, 0);
+	if (IS_ERR(srq->umem)) {
+		mlx5_ib_dbg(dev, "failed umem get, size %d\n", buf_size);
+		err = PTR_ERR(srq->umem);
+		return err;
+	}
+
+	mlx5_ib_cont_pages(srq->umem, ucmd.buf_addr, &npages,
+			   &page_shift, &ncont, NULL);
+	err = mlx5_ib_get_buf_offset(ucmd.buf_addr, page_shift,
+				     &offset);
+	if (err) {
+		mlx5_ib_warn(dev, "bad offset\n");
+		goto err_umem;
+	}
+
+	*inlen = sizeof(**in) + sizeof(*(*in)->pas) * ncont;
+	*in = mlx5_vzalloc(*inlen);
+	if (!(*in)) {
+		err = -ENOMEM;
+		goto err_umem;
+	}
+
+	mlx5_ib_populate_pas(dev, srq->umem, page_shift, (*in)->pas, 0);
+
+	err = mlx5_ib_db_map_user(to_mucontext(pd->uobject->context),
+				  ucmd.db_addr, &srq->db);
+	if (err) {
+		mlx5_ib_dbg(dev, "map doorbell failed\n");
+		goto err_in;
+	}
+
+	(*in)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT;
+	(*in)->ctx.pgoff_cqn = cpu_to_be32(offset << 26);
+
+	return 0;
+
+err_in:
+	mlx5_vfree(*in);
+
+err_umem:
+	ib_umem_release(srq->umem);
+
+	return err;
+}
+
+static int create_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq,
+			     struct mlx5_create_srq_mbox_in **in, int buf_size,
+			     int *inlen)
+{
+	int err;
+	int i;
+	struct mlx5_wqe_srq_next_seg *next;
+	int page_shift;
+	int npages;
+
+	err = mlx5_db_alloc(dev->mdev, &srq->db);
+	if (err) {
+		mlx5_ib_warn(dev, "alloc dbell rec failed\n");
+		return err;
+	}
+
+	*srq->db.db = 0;
+
+	if (mlx5_buf_alloc(dev->mdev, buf_size, PAGE_SIZE * 2, &srq->buf)) {
+		mlx5_ib_dbg(dev, "buf alloc failed\n");
+		err = -ENOMEM;
+		goto err_db;
+	}
+	page_shift = srq->buf.page_shift;
+
+	srq->head    = 0;
+	srq->tail    = srq->msrq.max - 1;
+	srq->wqe_ctr = 0;
+
+	for (i = 0; i < srq->msrq.max; i++) {
+		next = get_wqe(srq, i);
+		next->next_wqe_index =
+			cpu_to_be16((i + 1) & (srq->msrq.max - 1));
+	}
+
+	npages = DIV_ROUND_UP(srq->buf.npages, 1 << (page_shift - PAGE_SHIFT));
+	mlx5_ib_dbg(dev, "buf_size %d, page_shift %d, npages %d, calc npages %d\n",
+		    buf_size, page_shift, srq->buf.npages, npages);
+	*inlen = sizeof(**in) + sizeof(*(*in)->pas) * npages;
+	*in = mlx5_vzalloc(*inlen);
+	if (!*in) {
+		err = -ENOMEM;
+		goto err_buf;
+	}
+	mlx5_fill_page_array(&srq->buf, (*in)->pas);
+
+	srq->wrid = kmalloc(srq->msrq.max * sizeof(u64), GFP_KERNEL);
+	if (!srq->wrid) {
+		mlx5_ib_dbg(dev, "kmalloc failed %lu\n",
+			    (unsigned long)(srq->msrq.max * sizeof(u64)));
+		err = -ENOMEM;
+		goto err_in;
+	}
+	srq->wq_sig = !!srq_signature;
+
+	(*in)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT;
+
+	return 0;
+
+err_in:
+	mlx5_vfree(*in);
+
+err_buf:
+	mlx5_buf_free(dev->mdev, &srq->buf);
+
+err_db:
+	mlx5_db_free(dev->mdev, &srq->db);
+	return err;
+}
+
+static void destroy_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq)
+{
+	mlx5_ib_db_unmap_user(to_mucontext(pd->uobject->context), &srq->db);
+	ib_umem_release(srq->umem);
+}
+
+
+static void destroy_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq)
+{
+	kfree(srq->wrid);
+	mlx5_buf_free(dev->mdev, &srq->buf);
+	mlx5_db_free(dev->mdev, &srq->db);
+}
+
+struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd,
+				  struct ib_srq_init_attr *init_attr,
+				  struct ib_udata *udata)
+{
+	struct mlx5_ib_dev *dev = to_mdev(pd->device);
+	struct mlx5_general_caps *gen;
+	struct mlx5_ib_srq *srq;
+	int desc_size;
+	int buf_size;
+	int err;
+	struct mlx5_create_srq_mbox_in *uninitialized_var(in);
+	int uninitialized_var(inlen);
+	int is_xrc;
+	u32 flgs, xrcdn;
+
+	gen = &dev->mdev->caps.gen;
+	/* Sanity check SRQ size before proceeding */
+	if (init_attr->attr.max_wr >= gen->max_srq_wqes) {
+		mlx5_ib_dbg(dev, "max_wr %d, cap %d\n",
+			    init_attr->attr.max_wr,
+			    gen->max_srq_wqes);
+		return ERR_PTR(-EINVAL);
+	}
+
+	srq = kmalloc(sizeof(*srq), GFP_KERNEL);
+	if (!srq)
+		return ERR_PTR(-ENOMEM);
+
+	mutex_init(&srq->mutex);
+	spin_lock_init(&srq->lock);
+	srq->msrq.max    = roundup_pow_of_two(init_attr->attr.max_wr + 1);
+	srq->msrq.max_gs = init_attr->attr.max_sge;
+
+	desc_size = sizeof(struct mlx5_wqe_srq_next_seg) +
+		    srq->msrq.max_gs * sizeof(struct mlx5_wqe_data_seg);
+	desc_size = roundup_pow_of_two(desc_size);
+	desc_size = max_t(int, 32, desc_size);
+	srq->msrq.max_avail_gather = (desc_size - sizeof(struct mlx5_wqe_srq_next_seg)) /
+		sizeof(struct mlx5_wqe_data_seg);
+	srq->msrq.wqe_shift = ilog2(desc_size);
+	buf_size = srq->msrq.max * desc_size;
+	mlx5_ib_dbg(dev, "desc_size 0x%x, req wr 0x%x, srq size 0x%x, max_gs 0x%x, max_avail_gather 0x%x\n",
+		    desc_size, init_attr->attr.max_wr, srq->msrq.max, srq->msrq.max_gs,
+		    srq->msrq.max_avail_gather);
+
+	if (pd->uobject)
+		err = create_srq_user(pd, srq, &in, udata, buf_size, &inlen);
+	else
+		err = create_srq_kernel(dev, srq, &in, buf_size, &inlen);
+
+	if (err) {
+		mlx5_ib_warn(dev, "create srq %s failed, err %d\n",
+			     pd->uobject ? "user" : "kernel", err);
+		goto err_srq;
+	}
+
+	is_xrc = (init_attr->srq_type == IB_SRQT_XRC);
+	in->ctx.state_log_sz = ilog2(srq->msrq.max);
+	flgs = ((srq->msrq.wqe_shift - 4) | (is_xrc << 5) | (srq->wq_sig << 7)) << 24;
+	xrcdn = 0;
+	if (is_xrc) {
+		xrcdn = to_mxrcd(init_attr->ext.xrc.xrcd)->xrcdn;
+		in->ctx.pgoff_cqn |= cpu_to_be32(to_mcq(init_attr->ext.xrc.cq)->mcq.cqn);
+	} else if (init_attr->srq_type == IB_SRQT_BASIC) {
+		xrcdn = to_mxrcd(dev->devr.x0)->xrcdn;
+		in->ctx.pgoff_cqn |= cpu_to_be32(to_mcq(dev->devr.c0)->mcq.cqn);
+	}
+
+	in->ctx.flags_xrcd = cpu_to_be32((flgs & 0xFF000000) | (xrcdn & 0xFFFFFF));
+
+	in->ctx.pd = cpu_to_be32(to_mpd(pd)->pdn);
+	in->ctx.db_record = cpu_to_be64(srq->db.dma);
+	err = mlx5_core_create_srq(dev->mdev, &srq->msrq, in, inlen);
+	mlx5_vfree(in);
+	if (err) {
+		mlx5_ib_dbg(dev, "create SRQ failed, err %d\n", err);
+		goto err_usr_kern_srq;
+	}
+
+	mlx5_ib_dbg(dev, "create SRQ with srqn 0x%x\n", srq->msrq.srqn);
+
+	srq->msrq.event = mlx5_ib_srq_event;
+	srq->ibsrq.ext.xrc.srq_num = srq->msrq.srqn;
+
+	if (pd->uobject)
+		if (ib_copy_to_udata(udata, &srq->msrq.srqn, sizeof(__u32))) {
+			mlx5_ib_dbg(dev, "copy to user failed\n");
+			err = -EFAULT;
+			goto err_core;
+		}
+
+	init_attr->attr.max_wr = srq->msrq.max - 1;
+
+	return &srq->ibsrq;
+
+err_core:
+	mlx5_core_destroy_srq(dev->mdev, &srq->msrq);
+
+err_usr_kern_srq:
+	if (pd->uobject)
+		destroy_srq_user(pd, srq);
+	else
+		destroy_srq_kernel(dev, srq);
+
+err_srq:
+	kfree(srq);
+
+	return ERR_PTR(err);
+}
+
+int mlx5_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
+		       enum ib_srq_attr_mask attr_mask, struct ib_udata *udata)
+{
+	struct mlx5_ib_dev *dev = to_mdev(ibsrq->device);
+	struct mlx5_ib_srq *srq = to_msrq(ibsrq);
+	int ret;
+
+	/* We don't support resizing SRQs yet */
+	if (attr_mask & IB_SRQ_MAX_WR)
+		return -EINVAL;
+
+	if (attr_mask & IB_SRQ_LIMIT) {
+		if (attr->srq_limit >= srq->msrq.max)
+			return -EINVAL;
+
+		mutex_lock(&srq->mutex);
+		ret = mlx5_core_arm_srq(dev->mdev, &srq->msrq, attr->srq_limit, 1);
+		mutex_unlock(&srq->mutex);
+
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+int mlx5_ib_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr)
+{
+	struct mlx5_ib_dev *dev = to_mdev(ibsrq->device);
+	struct mlx5_ib_srq *srq = to_msrq(ibsrq);
+	int ret;
+	struct mlx5_query_srq_mbox_out *out;
+
+	out = kzalloc(sizeof(*out), GFP_KERNEL);
+	if (!out)
+		return -ENOMEM;
+
+	ret = mlx5_core_query_srq(dev->mdev, &srq->msrq, out);
+	if (ret)
+		goto out_box;
+
+	srq_attr->srq_limit = be16_to_cpu(out->ctx.lwm);
+	srq_attr->max_wr    = srq->msrq.max - 1;
+	srq_attr->max_sge   = srq->msrq.max_gs;
+
+out_box:
+	kfree(out);
+	return ret;
+}
+
+int mlx5_ib_destroy_srq(struct ib_srq *srq)
+{
+	struct mlx5_ib_dev *dev = to_mdev(srq->device);
+	struct mlx5_ib_srq *msrq = to_msrq(srq);
+
+	mlx5_core_destroy_srq(dev->mdev, &msrq->msrq);
+
+	if (srq->uobject) {
+		mlx5_ib_db_unmap_user(to_mucontext(srq->uobject->context), &msrq->db);
+		ib_umem_release(msrq->umem);
+	} else {
+		destroy_srq_kernel(dev, msrq);
+	}
+
+	kfree(srq);
+	return 0;
+}
+
+void mlx5_ib_free_srq_wqe(struct mlx5_ib_srq *srq, int wqe_index)
+{
+	struct mlx5_wqe_srq_next_seg *next;
+
+	/* always called with interrupts disabled. */
+	spin_lock(&srq->lock);
+
+	next = get_wqe(srq, srq->tail);
+	next->next_wqe_index = cpu_to_be16(wqe_index);
+	srq->tail = wqe_index;
+
+	spin_unlock(&srq->lock);
+}
+
+int mlx5_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
+			  struct ib_recv_wr **bad_wr)
+{
+	struct mlx5_ib_srq *srq = to_msrq(ibsrq);
+	struct mlx5_wqe_srq_next_seg *next;
+	struct mlx5_wqe_data_seg *scat;
+	unsigned long flags;
+	int err = 0;
+	int nreq;
+	int i;
+
+	spin_lock_irqsave(&srq->lock, flags);
+
+	for (nreq = 0; wr; nreq++, wr = wr->next) {
+		if (unlikely(wr->num_sge > srq->msrq.max_gs)) {
+			err = -EINVAL;
+			*bad_wr = wr;
+			break;
+		}
+
+		if (unlikely(srq->head == srq->tail)) {
+			err = -ENOMEM;
+			*bad_wr = wr;
+			break;
+		}
+
+		srq->wrid[srq->head] = wr->wr_id;
+
+		next      = get_wqe(srq, srq->head);
+		srq->head = be16_to_cpu(next->next_wqe_index);
+		scat      = (struct mlx5_wqe_data_seg *)(next + 1);
+
+		for (i = 0; i < wr->num_sge; i++) {
+			scat[i].byte_count = cpu_to_be32(wr->sg_list[i].length);
+			scat[i].lkey       = cpu_to_be32(wr->sg_list[i].lkey);
+			scat[i].addr       = cpu_to_be64(wr->sg_list[i].addr);
+		}
+
+		if (i < srq->msrq.max_avail_gather) {
+			scat[i].byte_count = 0;
+			scat[i].lkey       = cpu_to_be32(MLX5_INVALID_LKEY);
+			scat[i].addr       = 0;
+		}
+	}
+
+	if (likely(nreq)) {
+		srq->wqe_ctr += nreq;
+
+		/* Make sure that descriptors are written before
+		 * doorbell record.
+		 */
+		wmb();
+
+		*srq->db.db = cpu_to_be32(srq->wqe_ctr);
+	}
+
+	spin_unlock_irqrestore(&srq->lock, flags);
+
+	return err;
+}
diff --git a/drivers/infiniband/hw/mlx5/user.h b/drivers/infiniband/hw/mlx5/user.h
new file mode 100644
index 0000000..d0ba264
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/user.h
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX5_IB_USER_H
+#define MLX5_IB_USER_H
+
+#include <linux/types.h>
+
+enum {
+	MLX5_QP_FLAG_SIGNATURE		= 1 << 0,
+	MLX5_QP_FLAG_SCATTER_CQE	= 1 << 1,
+};
+
+enum {
+	MLX5_SRQ_FLAG_SIGNATURE		= 1 << 0,
+};
+
+
+/* Increment this value if any changes that break userspace ABI
+ * compatibility are made.
+ */
+#define MLX5_IB_UVERBS_ABI_VERSION	1
+
+/* Make sure that all structs defined in this file remain laid out so
+ * that they pack the same way on 32-bit and 64-bit architectures (to
+ * avoid incompatibility between 32-bit userspace and 64-bit kernels).
+ * In particular do not use pointer types -- pass pointers in __u64
+ * instead.
+ */
+
+struct mlx5_ib_alloc_ucontext_req {
+	__u32	total_num_uuars;
+	__u32	num_low_latency_uuars;
+};
+
+struct mlx5_ib_alloc_ucontext_req_v2 {
+	__u32	total_num_uuars;
+	__u32	num_low_latency_uuars;
+	__u32	flags;
+	__u32	reserved;
+};
+
+struct mlx5_ib_alloc_ucontext_resp {
+	__u32	qp_tab_size;
+	__u32	bf_reg_size;
+	__u32	tot_uuars;
+	__u32	cache_line_size;
+	__u16	max_sq_desc_sz;
+	__u16	max_rq_desc_sz;
+	__u32	max_send_wqebb;
+	__u32	max_recv_wr;
+	__u32	max_srq_recv_wr;
+	__u16	num_ports;
+	__u16	reserved;
+};
+
+struct mlx5_ib_alloc_pd_resp {
+	__u32	pdn;
+};
+
+struct mlx5_ib_create_cq {
+	__u64	buf_addr;
+	__u64	db_addr;
+	__u32	cqe_size;
+	__u32	reserved; /* explicit padding (optional on i386) */
+};
+
+struct mlx5_ib_create_cq_resp {
+	__u32	cqn;
+	__u32	reserved;
+};
+
+struct mlx5_ib_resize_cq {
+	__u64	buf_addr;
+	__u16	cqe_size;
+	__u16	reserved0;
+	__u32	reserved1;
+};
+
+struct mlx5_ib_create_srq {
+	__u64	buf_addr;
+	__u64	db_addr;
+	__u32	flags;
+	__u32	reserved; /* explicit padding (optional on i386) */
+};
+
+struct mlx5_ib_create_srq_resp {
+	__u32	srqn;
+	__u32	reserved;
+};
+
+struct mlx5_ib_create_qp {
+	__u64	buf_addr;
+	__u64	db_addr;
+	__u32	sq_wqe_count;
+	__u32	rq_wqe_count;
+	__u32	rq_wqe_shift;
+	__u32	flags;
+};
+
+struct mlx5_ib_create_qp_resp {
+	__u32	uuar_index;
+};
+#endif /* MLX5_IB_USER_H */
diff --git a/drivers/infiniband/hw/mthca/Kconfig b/drivers/infiniband/hw/mthca/Kconfig
new file mode 100644
index 0000000..da314c3
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/Kconfig
@@ -0,0 +1,17 @@
+config INFINIBAND_MTHCA
+	tristate "Mellanox HCA support"
+	depends on PCI
+	---help---
+	  This is a low-level driver for Mellanox InfiniHost host
+	  channel adapters (HCAs), including the MT23108 PCI-X HCA
+	  ("Tavor") and the MT25208 PCI Express HCA ("Arbel").
+
+config INFINIBAND_MTHCA_DEBUG
+	bool "Verbose debugging output" if EXPERT
+	depends on INFINIBAND_MTHCA
+	default y
+	---help---
+	  This option causes debugging code to be compiled into the
+	  mthca driver.  The output can be turned on via the
+	  debug_level module parameter (which can also be set after
+	  the driver is loaded through sysfs).
diff --git a/drivers/infiniband/hw/mthca/Makefile b/drivers/infiniband/hw/mthca/Makefile
new file mode 100644
index 0000000..e388d95
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/Makefile
@@ -0,0 +1,7 @@
+obj-$(CONFIG_INFINIBAND_MTHCA) += ib_mthca.o
+
+ib_mthca-y :=	mthca_main.o mthca_cmd.o mthca_profile.o mthca_reset.o \
+		mthca_allocator.o mthca_eq.o mthca_pd.o mthca_cq.o \
+		mthca_mr.o mthca_qp.o mthca_av.o mthca_mcg.o mthca_mad.o \
+		mthca_provider.o mthca_memfree.o mthca_uar.o mthca_srq.o \
+		mthca_catas.o
diff --git a/drivers/infiniband/hw/mthca/mthca_allocator.c b/drivers/infiniband/hw/mthca/mthca_allocator.c
new file mode 100644
index 0000000..b4e0cf4
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_allocator.c
@@ -0,0 +1,301 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/bitmap.h>
+
+#include "mthca_dev.h"
+
+/* Trivial bitmap-based allocator */
+u32 mthca_alloc(struct mthca_alloc *alloc)
+{
+	unsigned long flags;
+	u32 obj;
+
+	spin_lock_irqsave(&alloc->lock, flags);
+
+	obj = find_next_zero_bit(alloc->table, alloc->max, alloc->last);
+	if (obj >= alloc->max) {
+		alloc->top = (alloc->top + alloc->max) & alloc->mask;
+		obj = find_first_zero_bit(alloc->table, alloc->max);
+	}
+
+	if (obj < alloc->max) {
+		set_bit(obj, alloc->table);
+		obj |= alloc->top;
+	} else
+		obj = -1;
+
+	spin_unlock_irqrestore(&alloc->lock, flags);
+
+	return obj;
+}
+
+void mthca_free(struct mthca_alloc *alloc, u32 obj)
+{
+	unsigned long flags;
+
+	obj &= alloc->max - 1;
+
+	spin_lock_irqsave(&alloc->lock, flags);
+
+	clear_bit(obj, alloc->table);
+	alloc->last = min(alloc->last, obj);
+	alloc->top = (alloc->top + alloc->max) & alloc->mask;
+
+	spin_unlock_irqrestore(&alloc->lock, flags);
+}
+
+int mthca_alloc_init(struct mthca_alloc *alloc, u32 num, u32 mask,
+		     u32 reserved)
+{
+	int i;
+
+	/* num must be a power of 2 */
+	if (num != 1 << (ffs(num) - 1))
+		return -EINVAL;
+
+	alloc->last = 0;
+	alloc->top  = 0;
+	alloc->max  = num;
+	alloc->mask = mask;
+	spin_lock_init(&alloc->lock);
+	alloc->table = kmalloc(BITS_TO_LONGS(num) * sizeof (long),
+			       GFP_KERNEL);
+	if (!alloc->table)
+		return -ENOMEM;
+
+	bitmap_zero(alloc->table, num);
+	for (i = 0; i < reserved; ++i)
+		set_bit(i, alloc->table);
+
+	return 0;
+}
+
+void mthca_alloc_cleanup(struct mthca_alloc *alloc)
+{
+	kfree(alloc->table);
+}
+
+/*
+ * Array of pointers with lazy allocation of leaf pages.  Callers of
+ * _get, _set and _clear methods must use a lock or otherwise
+ * serialize access to the array.
+ */
+
+#define MTHCA_ARRAY_MASK (PAGE_SIZE / sizeof (void *) - 1)
+
+void *mthca_array_get(struct mthca_array *array, int index)
+{
+	int p = (index * sizeof (void *)) >> PAGE_SHIFT;
+
+	if (array->page_list[p].page)
+		return array->page_list[p].page[index & MTHCA_ARRAY_MASK];
+	else
+		return NULL;
+}
+
+int mthca_array_set(struct mthca_array *array, int index, void *value)
+{
+	int p = (index * sizeof (void *)) >> PAGE_SHIFT;
+
+	/* Allocate with GFP_ATOMIC because we'll be called with locks held. */
+	if (!array->page_list[p].page)
+		array->page_list[p].page = (void **) get_zeroed_page(GFP_ATOMIC);
+
+	if (!array->page_list[p].page)
+		return -ENOMEM;
+
+	array->page_list[p].page[index & MTHCA_ARRAY_MASK] = value;
+	++array->page_list[p].used;
+
+	return 0;
+}
+
+void mthca_array_clear(struct mthca_array *array, int index)
+{
+	int p = (index * sizeof (void *)) >> PAGE_SHIFT;
+
+	if (--array->page_list[p].used == 0) {
+		free_page((unsigned long) array->page_list[p].page);
+		array->page_list[p].page = NULL;
+	} else
+		array->page_list[p].page[index & MTHCA_ARRAY_MASK] = NULL;
+
+	if (array->page_list[p].used < 0)
+		pr_debug("Array %p index %d page %d with ref count %d < 0\n",
+			 array, index, p, array->page_list[p].used);
+}
+
+int mthca_array_init(struct mthca_array *array, int nent)
+{
+	int npage = (nent * sizeof (void *) + PAGE_SIZE - 1) / PAGE_SIZE;
+	int i;
+
+	array->page_list = kmalloc(npage * sizeof *array->page_list, GFP_KERNEL);
+	if (!array->page_list)
+		return -ENOMEM;
+
+	for (i = 0; i < npage; ++i) {
+		array->page_list[i].page = NULL;
+		array->page_list[i].used = 0;
+	}
+
+	return 0;
+}
+
+void mthca_array_cleanup(struct mthca_array *array, int nent)
+{
+	int i;
+
+	for (i = 0; i < (nent * sizeof (void *) + PAGE_SIZE - 1) / PAGE_SIZE; ++i)
+		free_page((unsigned long) array->page_list[i].page);
+
+	kfree(array->page_list);
+}
+
+/*
+ * Handling for queue buffers -- we allocate a bunch of memory and
+ * register it in a memory region at HCA virtual address 0.  If the
+ * requested size is > max_direct, we split the allocation into
+ * multiple pages, so we don't require too much contiguous memory.
+ */
+
+int mthca_buf_alloc(struct mthca_dev *dev, int size, int max_direct,
+		    union mthca_buf *buf, int *is_direct, struct mthca_pd *pd,
+		    int hca_write, struct mthca_mr *mr)
+{
+	int err = -ENOMEM;
+	int npages, shift;
+	u64 *dma_list = NULL;
+	dma_addr_t t;
+	int i;
+
+	if (size <= max_direct) {
+		*is_direct = 1;
+		npages     = 1;
+		shift      = get_order(size) + PAGE_SHIFT;
+
+		buf->direct.buf = dma_alloc_coherent(&dev->pdev->dev,
+						     size, &t, GFP_KERNEL);
+		if (!buf->direct.buf)
+			return -ENOMEM;
+
+		dma_unmap_addr_set(&buf->direct, mapping, t);
+
+		memset(buf->direct.buf, 0, size);
+
+		while (t & ((1 << shift) - 1)) {
+			--shift;
+			npages *= 2;
+		}
+
+		dma_list = kmalloc(npages * sizeof *dma_list, GFP_KERNEL);
+		if (!dma_list)
+			goto err_free;
+
+		for (i = 0; i < npages; ++i)
+			dma_list[i] = t + i * (1 << shift);
+	} else {
+		*is_direct = 0;
+		npages     = (size + PAGE_SIZE - 1) / PAGE_SIZE;
+		shift      = PAGE_SHIFT;
+
+		dma_list = kmalloc(npages * sizeof *dma_list, GFP_KERNEL);
+		if (!dma_list)
+			return -ENOMEM;
+
+		buf->page_list = kmalloc(npages * sizeof *buf->page_list,
+					 GFP_KERNEL);
+		if (!buf->page_list)
+			goto err_out;
+
+		for (i = 0; i < npages; ++i)
+			buf->page_list[i].buf = NULL;
+
+		for (i = 0; i < npages; ++i) {
+			buf->page_list[i].buf =
+				dma_alloc_coherent(&dev->pdev->dev, PAGE_SIZE,
+						   &t, GFP_KERNEL);
+			if (!buf->page_list[i].buf)
+				goto err_free;
+
+			dma_list[i] = t;
+			dma_unmap_addr_set(&buf->page_list[i], mapping, t);
+
+			clear_page(buf->page_list[i].buf);
+		}
+	}
+
+	err = mthca_mr_alloc_phys(dev, pd->pd_num,
+				  dma_list, shift, npages,
+				  0, size,
+				  MTHCA_MPT_FLAG_LOCAL_READ |
+				  (hca_write ? MTHCA_MPT_FLAG_LOCAL_WRITE : 0),
+				  mr);
+	if (err)
+		goto err_free;
+
+	kfree(dma_list);
+
+	return 0;
+
+err_free:
+	mthca_buf_free(dev, size, buf, *is_direct, NULL);
+
+err_out:
+	kfree(dma_list);
+
+	return err;
+}
+
+void mthca_buf_free(struct mthca_dev *dev, int size, union mthca_buf *buf,
+		    int is_direct, struct mthca_mr *mr)
+{
+	int i;
+
+	if (mr)
+		mthca_free_mr(dev, mr);
+
+	if (is_direct)
+		dma_free_coherent(&dev->pdev->dev, size, buf->direct.buf,
+				  dma_unmap_addr(&buf->direct, mapping));
+	else {
+		for (i = 0; i < (size + PAGE_SIZE - 1) / PAGE_SIZE; ++i)
+			dma_free_coherent(&dev->pdev->dev, PAGE_SIZE,
+					  buf->page_list[i].buf,
+					  dma_unmap_addr(&buf->page_list[i],
+							 mapping));
+		kfree(buf->page_list);
+	}
+}
diff --git a/drivers/infiniband/hw/mthca/mthca_av.c b/drivers/infiniband/hw/mthca/mthca_av.c
new file mode 100644
index 0000000..32f6c63
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_av.c
@@ -0,0 +1,374 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/string.h>
+#include <linux/slab.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_cache.h>
+
+#include "mthca_dev.h"
+
+enum {
+      MTHCA_RATE_TAVOR_FULL   = 0,
+      MTHCA_RATE_TAVOR_1X     = 1,
+      MTHCA_RATE_TAVOR_4X     = 2,
+      MTHCA_RATE_TAVOR_1X_DDR = 3
+};
+
+enum {
+      MTHCA_RATE_MEMFREE_FULL    = 0,
+      MTHCA_RATE_MEMFREE_QUARTER = 1,
+      MTHCA_RATE_MEMFREE_EIGHTH  = 2,
+      MTHCA_RATE_MEMFREE_HALF    = 3
+};
+
+struct mthca_av {
+	__be32 port_pd;
+	u8     reserved1;
+	u8     g_slid;
+	__be16 dlid;
+	u8     reserved2;
+	u8     gid_index;
+	u8     msg_sr;
+	u8     hop_limit;
+	__be32 sl_tclass_flowlabel;
+	__be32 dgid[4];
+};
+
+static enum ib_rate memfree_rate_to_ib(u8 mthca_rate, u8 port_rate)
+{
+	switch (mthca_rate) {
+	case MTHCA_RATE_MEMFREE_EIGHTH:
+		return mult_to_ib_rate(port_rate >> 3);
+	case MTHCA_RATE_MEMFREE_QUARTER:
+		return mult_to_ib_rate(port_rate >> 2);
+	case MTHCA_RATE_MEMFREE_HALF:
+		return mult_to_ib_rate(port_rate >> 1);
+	case MTHCA_RATE_MEMFREE_FULL:
+	default:
+		return mult_to_ib_rate(port_rate);
+	}
+}
+
+static enum ib_rate tavor_rate_to_ib(u8 mthca_rate, u8 port_rate)
+{
+	switch (mthca_rate) {
+	case MTHCA_RATE_TAVOR_1X:     return IB_RATE_2_5_GBPS;
+	case MTHCA_RATE_TAVOR_1X_DDR: return IB_RATE_5_GBPS;
+	case MTHCA_RATE_TAVOR_4X:     return IB_RATE_10_GBPS;
+	default:		      return mult_to_ib_rate(port_rate);
+	}
+}
+
+enum ib_rate mthca_rate_to_ib(struct mthca_dev *dev, u8 mthca_rate, u8 port)
+{
+	if (mthca_is_memfree(dev)) {
+		/* Handle old Arbel FW */
+		if (dev->limits.stat_rate_support == 0x3 && mthca_rate)
+			return IB_RATE_2_5_GBPS;
+
+		return memfree_rate_to_ib(mthca_rate, dev->rate[port - 1]);
+	} else
+		return tavor_rate_to_ib(mthca_rate, dev->rate[port - 1]);
+}
+
+static u8 ib_rate_to_memfree(u8 req_rate, u8 cur_rate)
+{
+	if (cur_rate <= req_rate)
+		return 0;
+
+	/*
+	 * Inter-packet delay (IPD) to get from rate X down to a rate
+	 * no more than Y is (X - 1) / Y.
+	 */
+	switch ((cur_rate - 1) / req_rate) {
+	case 0:	 return MTHCA_RATE_MEMFREE_FULL;
+	case 1:	 return MTHCA_RATE_MEMFREE_HALF;
+	case 2:	 /* fall through */
+	case 3:	 return MTHCA_RATE_MEMFREE_QUARTER;
+	default: return MTHCA_RATE_MEMFREE_EIGHTH;
+	}
+}
+
+static u8 ib_rate_to_tavor(u8 static_rate)
+{
+	switch (static_rate) {
+	case IB_RATE_2_5_GBPS: return MTHCA_RATE_TAVOR_1X;
+	case IB_RATE_5_GBPS:   return MTHCA_RATE_TAVOR_1X_DDR;
+	case IB_RATE_10_GBPS:  return MTHCA_RATE_TAVOR_4X;
+	default:	       return MTHCA_RATE_TAVOR_FULL;
+	}
+}
+
+u8 mthca_get_rate(struct mthca_dev *dev, int static_rate, u8 port)
+{
+	u8 rate;
+
+	if (!static_rate || ib_rate_to_mult(static_rate) >= dev->rate[port - 1])
+		return 0;
+
+	if (mthca_is_memfree(dev))
+		rate = ib_rate_to_memfree(ib_rate_to_mult(static_rate),
+					  dev->rate[port - 1]);
+	else
+		rate = ib_rate_to_tavor(static_rate);
+
+	if (!(dev->limits.stat_rate_support & (1 << rate)))
+		rate = 1;
+
+	return rate;
+}
+
+int mthca_create_ah(struct mthca_dev *dev,
+		    struct mthca_pd *pd,
+		    struct ib_ah_attr *ah_attr,
+		    struct mthca_ah *ah)
+{
+	u32 index = -1;
+	struct mthca_av *av = NULL;
+
+	ah->type = MTHCA_AH_PCI_POOL;
+
+	if (mthca_is_memfree(dev)) {
+		ah->av   = kmalloc(sizeof *ah->av, GFP_ATOMIC);
+		if (!ah->av)
+			return -ENOMEM;
+
+		ah->type = MTHCA_AH_KMALLOC;
+		av       = ah->av;
+	} else if (!atomic_read(&pd->sqp_count) &&
+		 !(dev->mthca_flags & MTHCA_FLAG_DDR_HIDDEN)) {
+		index = mthca_alloc(&dev->av_table.alloc);
+
+		/* fall back to allocate in host memory */
+		if (index == -1)
+			goto on_hca_fail;
+
+		av = kmalloc(sizeof *av, GFP_ATOMIC);
+		if (!av)
+			goto on_hca_fail;
+
+		ah->type = MTHCA_AH_ON_HCA;
+		ah->avdma  = dev->av_table.ddr_av_base +
+			index * MTHCA_AV_SIZE;
+	}
+
+on_hca_fail:
+	if (ah->type == MTHCA_AH_PCI_POOL) {
+		ah->av = pci_pool_alloc(dev->av_table.pool,
+					GFP_ATOMIC, &ah->avdma);
+		if (!ah->av)
+			return -ENOMEM;
+
+		av = ah->av;
+	}
+
+	ah->key = pd->ntmr.ibmr.lkey;
+
+	memset(av, 0, MTHCA_AV_SIZE);
+
+	av->port_pd = cpu_to_be32(pd->pd_num | (ah_attr->port_num << 24));
+	av->g_slid  = ah_attr->src_path_bits;
+	av->dlid    = cpu_to_be16(ah_attr->dlid);
+	av->msg_sr  = (3 << 4) | /* 2K message */
+		mthca_get_rate(dev, ah_attr->static_rate, ah_attr->port_num);
+	av->sl_tclass_flowlabel = cpu_to_be32(ah_attr->sl << 28);
+	if (ah_attr->ah_flags & IB_AH_GRH) {
+		av->g_slid |= 0x80;
+		av->gid_index = (ah_attr->port_num - 1) * dev->limits.gid_table_len +
+			ah_attr->grh.sgid_index;
+		av->hop_limit = ah_attr->grh.hop_limit;
+		av->sl_tclass_flowlabel |=
+			cpu_to_be32((ah_attr->grh.traffic_class << 20) |
+				    ah_attr->grh.flow_label);
+		memcpy(av->dgid, ah_attr->grh.dgid.raw, 16);
+	} else {
+		/* Arbel workaround -- low byte of GID must be 2 */
+		av->dgid[3] = cpu_to_be32(2);
+	}
+
+	if (0) {
+		int j;
+
+		mthca_dbg(dev, "Created UDAV at %p/%08lx:\n",
+			  av, (unsigned long) ah->avdma);
+		for (j = 0; j < 8; ++j)
+			printk(KERN_DEBUG "  [%2x] %08x\n",
+			       j * 4, be32_to_cpu(((__be32 *) av)[j]));
+	}
+
+	if (ah->type == MTHCA_AH_ON_HCA) {
+		memcpy_toio(dev->av_table.av_map + index * MTHCA_AV_SIZE,
+			    av, MTHCA_AV_SIZE);
+		kfree(av);
+	}
+
+	return 0;
+}
+
+int mthca_destroy_ah(struct mthca_dev *dev, struct mthca_ah *ah)
+{
+	switch (ah->type) {
+	case MTHCA_AH_ON_HCA:
+		mthca_free(&dev->av_table.alloc,
+			   (ah->avdma - dev->av_table.ddr_av_base) /
+			   MTHCA_AV_SIZE);
+		break;
+
+	case MTHCA_AH_PCI_POOL:
+		pci_pool_free(dev->av_table.pool, ah->av, ah->avdma);
+		break;
+
+	case MTHCA_AH_KMALLOC:
+		kfree(ah->av);
+		break;
+	}
+
+	return 0;
+}
+
+int mthca_ah_grh_present(struct mthca_ah *ah)
+{
+	return !!(ah->av->g_slid & 0x80);
+}
+
+int mthca_read_ah(struct mthca_dev *dev, struct mthca_ah *ah,
+		  struct ib_ud_header *header)
+{
+	if (ah->type == MTHCA_AH_ON_HCA)
+		return -EINVAL;
+
+	header->lrh.service_level   = be32_to_cpu(ah->av->sl_tclass_flowlabel) >> 28;
+	header->lrh.destination_lid = ah->av->dlid;
+	header->lrh.source_lid      = cpu_to_be16(ah->av->g_slid & 0x7f);
+	if (mthca_ah_grh_present(ah)) {
+		header->grh.traffic_class =
+			(be32_to_cpu(ah->av->sl_tclass_flowlabel) >> 20) & 0xff;
+		header->grh.flow_label    =
+			ah->av->sl_tclass_flowlabel & cpu_to_be32(0xfffff);
+		header->grh.hop_limit     = ah->av->hop_limit;
+		ib_get_cached_gid(&dev->ib_dev,
+				  be32_to_cpu(ah->av->port_pd) >> 24,
+				  ah->av->gid_index % dev->limits.gid_table_len,
+				  &header->grh.source_gid);
+		memcpy(header->grh.destination_gid.raw,
+		       ah->av->dgid, 16);
+	}
+
+	return 0;
+}
+
+int mthca_ah_query(struct ib_ah *ibah, struct ib_ah_attr *attr)
+{
+	struct mthca_ah *ah   = to_mah(ibah);
+	struct mthca_dev *dev = to_mdev(ibah->device);
+
+	/* Only implement for MAD and memfree ah for now. */
+	if (ah->type == MTHCA_AH_ON_HCA)
+		return -ENOSYS;
+
+	memset(attr, 0, sizeof *attr);
+	attr->dlid          = be16_to_cpu(ah->av->dlid);
+	attr->sl            = be32_to_cpu(ah->av->sl_tclass_flowlabel) >> 28;
+	attr->port_num      = be32_to_cpu(ah->av->port_pd) >> 24;
+	attr->static_rate   = mthca_rate_to_ib(dev, ah->av->msg_sr & 0x7,
+					       attr->port_num);
+	attr->src_path_bits = ah->av->g_slid & 0x7F;
+	attr->ah_flags      = mthca_ah_grh_present(ah) ? IB_AH_GRH : 0;
+
+	if (attr->ah_flags) {
+		attr->grh.traffic_class =
+			be32_to_cpu(ah->av->sl_tclass_flowlabel) >> 20;
+		attr->grh.flow_label =
+			be32_to_cpu(ah->av->sl_tclass_flowlabel) & 0xfffff;
+		attr->grh.hop_limit  = ah->av->hop_limit;
+		attr->grh.sgid_index = ah->av->gid_index &
+				       (dev->limits.gid_table_len - 1);
+		memcpy(attr->grh.dgid.raw, ah->av->dgid, 16);
+	}
+
+	return 0;
+}
+
+int mthca_init_av_table(struct mthca_dev *dev)
+{
+	int err;
+
+	if (mthca_is_memfree(dev))
+		return 0;
+
+	err = mthca_alloc_init(&dev->av_table.alloc,
+			       dev->av_table.num_ddr_avs,
+			       dev->av_table.num_ddr_avs - 1,
+			       0);
+	if (err)
+		return err;
+
+	dev->av_table.pool = pci_pool_create("mthca_av", dev->pdev,
+					     MTHCA_AV_SIZE,
+					     MTHCA_AV_SIZE, 0);
+	if (!dev->av_table.pool)
+		goto out_free_alloc;
+
+	if (!(dev->mthca_flags & MTHCA_FLAG_DDR_HIDDEN)) {
+		dev->av_table.av_map = ioremap(pci_resource_start(dev->pdev, 4) +
+					       dev->av_table.ddr_av_base -
+					       dev->ddr_start,
+					       dev->av_table.num_ddr_avs *
+					       MTHCA_AV_SIZE);
+		if (!dev->av_table.av_map)
+			goto out_free_pool;
+	} else
+		dev->av_table.av_map = NULL;
+
+	return 0;
+
+ out_free_pool:
+	pci_pool_destroy(dev->av_table.pool);
+
+ out_free_alloc:
+	mthca_alloc_cleanup(&dev->av_table.alloc);
+	return -ENOMEM;
+}
+
+void mthca_cleanup_av_table(struct mthca_dev *dev)
+{
+	if (mthca_is_memfree(dev))
+		return;
+
+	if (dev->av_table.av_map)
+		iounmap(dev->av_table.av_map);
+	pci_pool_destroy(dev->av_table.pool);
+	mthca_alloc_cleanup(&dev->av_table.alloc);
+}
diff --git a/drivers/infiniband/hw/mthca/mthca_catas.c b/drivers/infiniband/hw/mthca/mthca_catas.c
new file mode 100644
index 0000000..712d2a3
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_catas.c
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2005 Cisco Systems.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/jiffies.h>
+#include <linux/module.h>
+#include <linux/timer.h>
+#include <linux/workqueue.h>
+
+#include "mthca_dev.h"
+
+enum {
+	MTHCA_CATAS_POLL_INTERVAL	= 5 * HZ,
+
+	MTHCA_CATAS_TYPE_INTERNAL	= 0,
+	MTHCA_CATAS_TYPE_UPLINK		= 3,
+	MTHCA_CATAS_TYPE_DDR		= 4,
+	MTHCA_CATAS_TYPE_PARITY		= 5,
+};
+
+static DEFINE_SPINLOCK(catas_lock);
+
+static LIST_HEAD(catas_list);
+static struct workqueue_struct *catas_wq;
+static struct work_struct catas_work;
+
+static int catas_reset_disable;
+module_param_named(catas_reset_disable, catas_reset_disable, int, 0644);
+MODULE_PARM_DESC(catas_reset_disable, "disable reset on catastrophic event if nonzero");
+
+static void catas_reset(struct work_struct *work)
+{
+	struct mthca_dev *dev, *tmpdev;
+	LIST_HEAD(tlist);
+	int ret;
+
+	mutex_lock(&mthca_device_mutex);
+
+	spin_lock_irq(&catas_lock);
+	list_splice_init(&catas_list, &tlist);
+	spin_unlock_irq(&catas_lock);
+
+	list_for_each_entry_safe(dev, tmpdev, &tlist, catas_err.list) {
+		struct pci_dev *pdev = dev->pdev;
+		ret = __mthca_restart_one(dev->pdev);
+		/* 'dev' now is not valid */
+		if (ret)
+			printk(KERN_ERR "mthca %s: Reset failed (%d)\n",
+			       pci_name(pdev), ret);
+		else {
+			struct mthca_dev *d = pci_get_drvdata(pdev);
+			mthca_dbg(d, "Reset succeeded\n");
+		}
+	}
+
+	mutex_unlock(&mthca_device_mutex);
+}
+
+static void handle_catas(struct mthca_dev *dev)
+{
+	struct ib_event event;
+	unsigned long flags;
+	const char *type;
+	int i;
+
+	event.device = &dev->ib_dev;
+	event.event  = IB_EVENT_DEVICE_FATAL;
+	event.element.port_num = 0;
+	dev->active = false;
+
+	ib_dispatch_event(&event);
+
+	switch (swab32(readl(dev->catas_err.map)) >> 24) {
+	case MTHCA_CATAS_TYPE_INTERNAL:
+		type = "internal error";
+		break;
+	case MTHCA_CATAS_TYPE_UPLINK:
+		type = "uplink bus error";
+		break;
+	case MTHCA_CATAS_TYPE_DDR:
+		type = "DDR data error";
+		break;
+	case MTHCA_CATAS_TYPE_PARITY:
+		type = "internal parity error";
+		break;
+	default:
+		type = "unknown error";
+		break;
+	}
+
+	mthca_err(dev, "Catastrophic error detected: %s\n", type);
+	for (i = 0; i < dev->catas_err.size; ++i)
+		mthca_err(dev, "  buf[%02x]: %08x\n",
+			  i, swab32(readl(dev->catas_err.map + i)));
+
+	if (catas_reset_disable)
+		return;
+
+	spin_lock_irqsave(&catas_lock, flags);
+	list_add(&dev->catas_err.list, &catas_list);
+	queue_work(catas_wq, &catas_work);
+	spin_unlock_irqrestore(&catas_lock, flags);
+}
+
+static void poll_catas(unsigned long dev_ptr)
+{
+	struct mthca_dev *dev = (struct mthca_dev *) dev_ptr;
+	int i;
+
+	for (i = 0; i < dev->catas_err.size; ++i)
+		if (readl(dev->catas_err.map + i)) {
+			handle_catas(dev);
+			return;
+		}
+
+	mod_timer(&dev->catas_err.timer,
+		  round_jiffies(jiffies + MTHCA_CATAS_POLL_INTERVAL));
+}
+
+void mthca_start_catas_poll(struct mthca_dev *dev)
+{
+	phys_addr_t addr;
+
+	init_timer(&dev->catas_err.timer);
+	dev->catas_err.map  = NULL;
+
+	addr = pci_resource_start(dev->pdev, 0) +
+		((pci_resource_len(dev->pdev, 0) - 1) &
+		 dev->catas_err.addr);
+
+	dev->catas_err.map = ioremap(addr, dev->catas_err.size * 4);
+	if (!dev->catas_err.map) {
+		mthca_warn(dev, "couldn't map catastrophic error region "
+			   "at 0x%llx/0x%x\n", (unsigned long long) addr,
+			   dev->catas_err.size * 4);
+		return;
+	}
+
+	dev->catas_err.timer.data     = (unsigned long) dev;
+	dev->catas_err.timer.function = poll_catas;
+	dev->catas_err.timer.expires  = jiffies + MTHCA_CATAS_POLL_INTERVAL;
+	INIT_LIST_HEAD(&dev->catas_err.list);
+	add_timer(&dev->catas_err.timer);
+}
+
+void mthca_stop_catas_poll(struct mthca_dev *dev)
+{
+	del_timer_sync(&dev->catas_err.timer);
+
+	if (dev->catas_err.map)
+		iounmap(dev->catas_err.map);
+
+	spin_lock_irq(&catas_lock);
+	list_del(&dev->catas_err.list);
+	spin_unlock_irq(&catas_lock);
+}
+
+int __init mthca_catas_init(void)
+{
+	INIT_WORK(&catas_work, catas_reset);
+
+	catas_wq = create_singlethread_workqueue("mthca_catas");
+	if (!catas_wq)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void mthca_catas_cleanup(void)
+{
+	destroy_workqueue(catas_wq);
+}
diff --git a/drivers/infiniband/hw/mthca/mthca_cmd.c b/drivers/infiniband/hw/mthca/mthca_cmd.c
new file mode 100644
index 0000000..9d3e5c1
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_cmd.c
@@ -0,0 +1,1969 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2005, 2006 Cisco Systems.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/completion.h>
+#include <linux/pci.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <asm/io.h>
+#include <rdma/ib_mad.h>
+
+#include "mthca_dev.h"
+#include "mthca_config_reg.h"
+#include "mthca_cmd.h"
+#include "mthca_memfree.h"
+
+#define CMD_POLL_TOKEN 0xffff
+
+enum {
+	HCR_IN_PARAM_OFFSET    = 0x00,
+	HCR_IN_MODIFIER_OFFSET = 0x08,
+	HCR_OUT_PARAM_OFFSET   = 0x0c,
+	HCR_TOKEN_OFFSET       = 0x14,
+	HCR_STATUS_OFFSET      = 0x18,
+
+	HCR_OPMOD_SHIFT        = 12,
+	HCA_E_BIT              = 22,
+	HCR_GO_BIT             = 23
+};
+
+enum {
+	/* initialization and general commands */
+	CMD_SYS_EN          = 0x1,
+	CMD_SYS_DIS         = 0x2,
+	CMD_MAP_FA          = 0xfff,
+	CMD_UNMAP_FA        = 0xffe,
+	CMD_RUN_FW          = 0xff6,
+	CMD_MOD_STAT_CFG    = 0x34,
+	CMD_QUERY_DEV_LIM   = 0x3,
+	CMD_QUERY_FW        = 0x4,
+	CMD_ENABLE_LAM      = 0xff8,
+	CMD_DISABLE_LAM     = 0xff7,
+	CMD_QUERY_DDR       = 0x5,
+	CMD_QUERY_ADAPTER   = 0x6,
+	CMD_INIT_HCA        = 0x7,
+	CMD_CLOSE_HCA       = 0x8,
+	CMD_INIT_IB         = 0x9,
+	CMD_CLOSE_IB        = 0xa,
+	CMD_QUERY_HCA       = 0xb,
+	CMD_SET_IB          = 0xc,
+	CMD_ACCESS_DDR      = 0x2e,
+	CMD_MAP_ICM         = 0xffa,
+	CMD_UNMAP_ICM       = 0xff9,
+	CMD_MAP_ICM_AUX     = 0xffc,
+	CMD_UNMAP_ICM_AUX   = 0xffb,
+	CMD_SET_ICM_SIZE    = 0xffd,
+
+	/* TPT commands */
+	CMD_SW2HW_MPT 	    = 0xd,
+	CMD_QUERY_MPT 	    = 0xe,
+	CMD_HW2SW_MPT 	    = 0xf,
+	CMD_READ_MTT        = 0x10,
+	CMD_WRITE_MTT       = 0x11,
+	CMD_SYNC_TPT        = 0x2f,
+
+	/* EQ commands */
+	CMD_MAP_EQ          = 0x12,
+	CMD_SW2HW_EQ 	    = 0x13,
+	CMD_HW2SW_EQ 	    = 0x14,
+	CMD_QUERY_EQ        = 0x15,
+
+	/* CQ commands */
+	CMD_SW2HW_CQ 	    = 0x16,
+	CMD_HW2SW_CQ 	    = 0x17,
+	CMD_QUERY_CQ 	    = 0x18,
+	CMD_RESIZE_CQ       = 0x2c,
+
+	/* SRQ commands */
+	CMD_SW2HW_SRQ 	    = 0x35,
+	CMD_HW2SW_SRQ 	    = 0x36,
+	CMD_QUERY_SRQ       = 0x37,
+	CMD_ARM_SRQ         = 0x40,
+
+	/* QP/EE commands */
+	CMD_RST2INIT_QPEE   = 0x19,
+	CMD_INIT2RTR_QPEE   = 0x1a,
+	CMD_RTR2RTS_QPEE    = 0x1b,
+	CMD_RTS2RTS_QPEE    = 0x1c,
+	CMD_SQERR2RTS_QPEE  = 0x1d,
+	CMD_2ERR_QPEE       = 0x1e,
+	CMD_RTS2SQD_QPEE    = 0x1f,
+	CMD_SQD2SQD_QPEE    = 0x38,
+	CMD_SQD2RTS_QPEE    = 0x20,
+	CMD_ERR2RST_QPEE    = 0x21,
+	CMD_QUERY_QPEE      = 0x22,
+	CMD_INIT2INIT_QPEE  = 0x2d,
+	CMD_SUSPEND_QPEE    = 0x32,
+	CMD_UNSUSPEND_QPEE  = 0x33,
+	/* special QPs and management commands */
+	CMD_CONF_SPECIAL_QP = 0x23,
+	CMD_MAD_IFC         = 0x24,
+
+	/* multicast commands */
+	CMD_READ_MGM        = 0x25,
+	CMD_WRITE_MGM       = 0x26,
+	CMD_MGID_HASH       = 0x27,
+
+	/* miscellaneous commands */
+	CMD_DIAG_RPRT       = 0x30,
+	CMD_NOP             = 0x31,
+
+	/* debug commands */
+	CMD_QUERY_DEBUG_MSG = 0x2a,
+	CMD_SET_DEBUG_MSG   = 0x2b,
+};
+
+/*
+ * According to Mellanox code, FW may be starved and never complete
+ * commands.  So we can't use strict timeouts described in PRM -- we
+ * just arbitrarily select 60 seconds for now.
+ */
+#if 0
+/*
+ * Round up and add 1 to make sure we get the full wait time (since we
+ * will be starting in the middle of a jiffy)
+ */
+enum {
+	CMD_TIME_CLASS_A = (HZ + 999) / 1000 + 1,
+	CMD_TIME_CLASS_B = (HZ +  99) /  100 + 1,
+	CMD_TIME_CLASS_C = (HZ +   9) /   10 + 1,
+	CMD_TIME_CLASS_D = 60 * HZ
+};
+#else
+enum {
+	CMD_TIME_CLASS_A = 60 * HZ,
+	CMD_TIME_CLASS_B = 60 * HZ,
+	CMD_TIME_CLASS_C = 60 * HZ,
+	CMD_TIME_CLASS_D = 60 * HZ
+};
+#endif
+
+enum {
+	GO_BIT_TIMEOUT = HZ * 10
+};
+
+struct mthca_cmd_context {
+	struct completion done;
+	int               result;
+	int               next;
+	u64               out_param;
+	u16               token;
+	u8                status;
+};
+
+static int fw_cmd_doorbell = 0;
+module_param(fw_cmd_doorbell, int, 0644);
+MODULE_PARM_DESC(fw_cmd_doorbell, "post FW commands through doorbell page if nonzero "
+		 "(and supported by FW)");
+
+static inline int go_bit(struct mthca_dev *dev)
+{
+	return readl(dev->hcr + HCR_STATUS_OFFSET) &
+		swab32(1 << HCR_GO_BIT);
+}
+
+static void mthca_cmd_post_dbell(struct mthca_dev *dev,
+				 u64 in_param,
+				 u64 out_param,
+				 u32 in_modifier,
+				 u8 op_modifier,
+				 u16 op,
+				 u16 token)
+{
+	void __iomem *ptr = dev->cmd.dbell_map;
+	u16 *offs = dev->cmd.dbell_offsets;
+
+	__raw_writel((__force u32) cpu_to_be32(in_param >> 32),           ptr + offs[0]);
+	wmb();
+	__raw_writel((__force u32) cpu_to_be32(in_param & 0xfffffffful),  ptr + offs[1]);
+	wmb();
+	__raw_writel((__force u32) cpu_to_be32(in_modifier),              ptr + offs[2]);
+	wmb();
+	__raw_writel((__force u32) cpu_to_be32(out_param >> 32),          ptr + offs[3]);
+	wmb();
+	__raw_writel((__force u32) cpu_to_be32(out_param & 0xfffffffful), ptr + offs[4]);
+	wmb();
+	__raw_writel((__force u32) cpu_to_be32(token << 16),              ptr + offs[5]);
+	wmb();
+	__raw_writel((__force u32) cpu_to_be32((1 << HCR_GO_BIT)                |
+					       (1 << HCA_E_BIT)                 |
+					       (op_modifier << HCR_OPMOD_SHIFT) |
+						op),			  ptr + offs[6]);
+	wmb();
+	__raw_writel((__force u32) 0,                                     ptr + offs[7]);
+	wmb();
+}
+
+static int mthca_cmd_post_hcr(struct mthca_dev *dev,
+			      u64 in_param,
+			      u64 out_param,
+			      u32 in_modifier,
+			      u8 op_modifier,
+			      u16 op,
+			      u16 token,
+			      int event)
+{
+	if (event) {
+		unsigned long end = jiffies + GO_BIT_TIMEOUT;
+
+		while (go_bit(dev) && time_before(jiffies, end)) {
+			set_current_state(TASK_RUNNING);
+			schedule();
+		}
+	}
+
+	if (go_bit(dev))
+		return -EAGAIN;
+
+	/*
+	 * We use writel (instead of something like memcpy_toio)
+	 * because writes of less than 32 bits to the HCR don't work
+	 * (and some architectures such as ia64 implement memcpy_toio
+	 * in terms of writeb).
+	 */
+	__raw_writel((__force u32) cpu_to_be32(in_param >> 32),           dev->hcr + 0 * 4);
+	__raw_writel((__force u32) cpu_to_be32(in_param & 0xfffffffful),  dev->hcr + 1 * 4);
+	__raw_writel((__force u32) cpu_to_be32(in_modifier),              dev->hcr + 2 * 4);
+	__raw_writel((__force u32) cpu_to_be32(out_param >> 32),          dev->hcr + 3 * 4);
+	__raw_writel((__force u32) cpu_to_be32(out_param & 0xfffffffful), dev->hcr + 4 * 4);
+	__raw_writel((__force u32) cpu_to_be32(token << 16),              dev->hcr + 5 * 4);
+
+	/* __raw_writel may not order writes. */
+	wmb();
+
+	__raw_writel((__force u32) cpu_to_be32((1 << HCR_GO_BIT)                |
+					       (event ? (1 << HCA_E_BIT) : 0)   |
+					       (op_modifier << HCR_OPMOD_SHIFT) |
+					       op),                       dev->hcr + 6 * 4);
+
+	return 0;
+}
+
+static int mthca_cmd_post(struct mthca_dev *dev,
+			  u64 in_param,
+			  u64 out_param,
+			  u32 in_modifier,
+			  u8 op_modifier,
+			  u16 op,
+			  u16 token,
+			  int event)
+{
+	int err = 0;
+
+	mutex_lock(&dev->cmd.hcr_mutex);
+
+	if (event && dev->cmd.flags & MTHCA_CMD_POST_DOORBELLS && fw_cmd_doorbell)
+		mthca_cmd_post_dbell(dev, in_param, out_param, in_modifier,
+					   op_modifier, op, token);
+	else
+		err = mthca_cmd_post_hcr(dev, in_param, out_param, in_modifier,
+					 op_modifier, op, token, event);
+
+	/*
+	 * Make sure that our HCR writes don't get mixed in with
+	 * writes from another CPU starting a FW command.
+	 */
+	mmiowb();
+
+	mutex_unlock(&dev->cmd.hcr_mutex);
+	return err;
+}
+
+
+static int mthca_status_to_errno(u8 status)
+{
+	static const int trans_table[] = {
+		[MTHCA_CMD_STAT_INTERNAL_ERR]   = -EIO,
+		[MTHCA_CMD_STAT_BAD_OP]         = -EPERM,
+		[MTHCA_CMD_STAT_BAD_PARAM]      = -EINVAL,
+		[MTHCA_CMD_STAT_BAD_SYS_STATE]  = -ENXIO,
+		[MTHCA_CMD_STAT_BAD_RESOURCE]   = -EBADF,
+		[MTHCA_CMD_STAT_RESOURCE_BUSY]  = -EBUSY,
+		[MTHCA_CMD_STAT_DDR_MEM_ERR]    = -ENOMEM,
+		[MTHCA_CMD_STAT_EXCEED_LIM]     = -ENOMEM,
+		[MTHCA_CMD_STAT_BAD_RES_STATE]  = -EBADF,
+		[MTHCA_CMD_STAT_BAD_INDEX]      = -EBADF,
+		[MTHCA_CMD_STAT_BAD_NVMEM]      = -EFAULT,
+		[MTHCA_CMD_STAT_BAD_QPEE_STATE] = -EINVAL,
+		[MTHCA_CMD_STAT_BAD_SEG_PARAM]  = -EFAULT,
+		[MTHCA_CMD_STAT_REG_BOUND]      = -EBUSY,
+		[MTHCA_CMD_STAT_LAM_NOT_PRE]    = -EAGAIN,
+		[MTHCA_CMD_STAT_BAD_PKT]        = -EBADMSG,
+		[MTHCA_CMD_STAT_BAD_SIZE]       = -ENOMEM,
+	};
+
+	if (status >= ARRAY_SIZE(trans_table) ||
+			(status != MTHCA_CMD_STAT_OK
+			 && trans_table[status] == 0))
+		return -EINVAL;
+
+	return trans_table[status];
+}
+
+
+static int mthca_cmd_poll(struct mthca_dev *dev,
+			  u64 in_param,
+			  u64 *out_param,
+			  int out_is_imm,
+			  u32 in_modifier,
+			  u8 op_modifier,
+			  u16 op,
+			  unsigned long timeout)
+{
+	int err = 0;
+	unsigned long end;
+	u8 status;
+
+	down(&dev->cmd.poll_sem);
+
+	err = mthca_cmd_post(dev, in_param,
+			     out_param ? *out_param : 0,
+			     in_modifier, op_modifier,
+			     op, CMD_POLL_TOKEN, 0);
+	if (err)
+		goto out;
+
+	end = timeout + jiffies;
+	while (go_bit(dev) && time_before(jiffies, end)) {
+		set_current_state(TASK_RUNNING);
+		schedule();
+	}
+
+	if (go_bit(dev)) {
+		err = -EBUSY;
+		goto out;
+	}
+
+	if (out_is_imm)
+		*out_param =
+			(u64) be32_to_cpu((__force __be32)
+					  __raw_readl(dev->hcr + HCR_OUT_PARAM_OFFSET)) << 32 |
+			(u64) be32_to_cpu((__force __be32)
+					  __raw_readl(dev->hcr + HCR_OUT_PARAM_OFFSET + 4));
+
+	status = be32_to_cpu((__force __be32) __raw_readl(dev->hcr + HCR_STATUS_OFFSET)) >> 24;
+	if (status) {
+		mthca_dbg(dev, "Command %02x completed with status %02x\n",
+			  op, status);
+		err = mthca_status_to_errno(status);
+	}
+
+out:
+	up(&dev->cmd.poll_sem);
+	return err;
+}
+
+void mthca_cmd_event(struct mthca_dev *dev,
+		     u16 token,
+		     u8  status,
+		     u64 out_param)
+{
+	struct mthca_cmd_context *context =
+		&dev->cmd.context[token & dev->cmd.token_mask];
+
+	/* previously timed out command completing at long last */
+	if (token != context->token)
+		return;
+
+	context->result    = 0;
+	context->status    = status;
+	context->out_param = out_param;
+
+	complete(&context->done);
+}
+
+static int mthca_cmd_wait(struct mthca_dev *dev,
+			  u64 in_param,
+			  u64 *out_param,
+			  int out_is_imm,
+			  u32 in_modifier,
+			  u8 op_modifier,
+			  u16 op,
+			  unsigned long timeout)
+{
+	int err = 0;
+	struct mthca_cmd_context *context;
+
+	down(&dev->cmd.event_sem);
+
+	spin_lock(&dev->cmd.context_lock);
+	BUG_ON(dev->cmd.free_head < 0);
+	context = &dev->cmd.context[dev->cmd.free_head];
+	context->token += dev->cmd.token_mask + 1;
+	dev->cmd.free_head = context->next;
+	spin_unlock(&dev->cmd.context_lock);
+
+	init_completion(&context->done);
+
+	err = mthca_cmd_post(dev, in_param,
+			     out_param ? *out_param : 0,
+			     in_modifier, op_modifier,
+			     op, context->token, 1);
+	if (err)
+		goto out;
+
+	if (!wait_for_completion_timeout(&context->done, timeout)) {
+		err = -EBUSY;
+		goto out;
+	}
+
+	err = context->result;
+	if (err)
+		goto out;
+
+	if (context->status) {
+		mthca_dbg(dev, "Command %02x completed with status %02x\n",
+			  op, context->status);
+		err = mthca_status_to_errno(context->status);
+	}
+
+	if (out_is_imm)
+		*out_param = context->out_param;
+
+out:
+	spin_lock(&dev->cmd.context_lock);
+	context->next = dev->cmd.free_head;
+	dev->cmd.free_head = context - dev->cmd.context;
+	spin_unlock(&dev->cmd.context_lock);
+
+	up(&dev->cmd.event_sem);
+	return err;
+}
+
+/* Invoke a command with an output mailbox */
+static int mthca_cmd_box(struct mthca_dev *dev,
+			 u64 in_param,
+			 u64 out_param,
+			 u32 in_modifier,
+			 u8 op_modifier,
+			 u16 op,
+			 unsigned long timeout)
+{
+	if (dev->cmd.flags & MTHCA_CMD_USE_EVENTS)
+		return mthca_cmd_wait(dev, in_param, &out_param, 0,
+				      in_modifier, op_modifier, op,
+				      timeout);
+	else
+		return mthca_cmd_poll(dev, in_param, &out_param, 0,
+				      in_modifier, op_modifier, op,
+				      timeout);
+}
+
+/* Invoke a command with no output parameter */
+static int mthca_cmd(struct mthca_dev *dev,
+		     u64 in_param,
+		     u32 in_modifier,
+		     u8 op_modifier,
+		     u16 op,
+		     unsigned long timeout)
+{
+	return mthca_cmd_box(dev, in_param, 0, in_modifier,
+			     op_modifier, op, timeout);
+}
+
+/*
+ * Invoke a command with an immediate output parameter (and copy the
+ * output into the caller's out_param pointer after the command
+ * executes).
+ */
+static int mthca_cmd_imm(struct mthca_dev *dev,
+			 u64 in_param,
+			 u64 *out_param,
+			 u32 in_modifier,
+			 u8 op_modifier,
+			 u16 op,
+			 unsigned long timeout)
+{
+	if (dev->cmd.flags & MTHCA_CMD_USE_EVENTS)
+		return mthca_cmd_wait(dev, in_param, out_param, 1,
+				      in_modifier, op_modifier, op,
+				      timeout);
+	else
+		return mthca_cmd_poll(dev, in_param, out_param, 1,
+				      in_modifier, op_modifier, op,
+				      timeout);
+}
+
+int mthca_cmd_init(struct mthca_dev *dev)
+{
+	mutex_init(&dev->cmd.hcr_mutex);
+	sema_init(&dev->cmd.poll_sem, 1);
+	dev->cmd.flags = 0;
+
+	dev->hcr = ioremap(pci_resource_start(dev->pdev, 0) + MTHCA_HCR_BASE,
+			   MTHCA_HCR_SIZE);
+	if (!dev->hcr) {
+		mthca_err(dev, "Couldn't map command register.");
+		return -ENOMEM;
+	}
+
+	dev->cmd.pool = pci_pool_create("mthca_cmd", dev->pdev,
+					MTHCA_MAILBOX_SIZE,
+					MTHCA_MAILBOX_SIZE, 0);
+	if (!dev->cmd.pool) {
+		iounmap(dev->hcr);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+void mthca_cmd_cleanup(struct mthca_dev *dev)
+{
+	pci_pool_destroy(dev->cmd.pool);
+	iounmap(dev->hcr);
+	if (dev->cmd.flags & MTHCA_CMD_POST_DOORBELLS)
+		iounmap(dev->cmd.dbell_map);
+}
+
+/*
+ * Switch to using events to issue FW commands (should be called after
+ * event queue to command events has been initialized).
+ */
+int mthca_cmd_use_events(struct mthca_dev *dev)
+{
+	int i;
+
+	dev->cmd.context = kmalloc(dev->cmd.max_cmds *
+				   sizeof (struct mthca_cmd_context),
+				   GFP_KERNEL);
+	if (!dev->cmd.context)
+		return -ENOMEM;
+
+	for (i = 0; i < dev->cmd.max_cmds; ++i) {
+		dev->cmd.context[i].token = i;
+		dev->cmd.context[i].next = i + 1;
+	}
+
+	dev->cmd.context[dev->cmd.max_cmds - 1].next = -1;
+	dev->cmd.free_head = 0;
+
+	sema_init(&dev->cmd.event_sem, dev->cmd.max_cmds);
+	spin_lock_init(&dev->cmd.context_lock);
+
+	for (dev->cmd.token_mask = 1;
+	     dev->cmd.token_mask < dev->cmd.max_cmds;
+	     dev->cmd.token_mask <<= 1)
+		; /* nothing */
+	--dev->cmd.token_mask;
+
+	dev->cmd.flags |= MTHCA_CMD_USE_EVENTS;
+
+	down(&dev->cmd.poll_sem);
+
+	return 0;
+}
+
+/*
+ * Switch back to polling (used when shutting down the device)
+ */
+void mthca_cmd_use_polling(struct mthca_dev *dev)
+{
+	int i;
+
+	dev->cmd.flags &= ~MTHCA_CMD_USE_EVENTS;
+
+	for (i = 0; i < dev->cmd.max_cmds; ++i)
+		down(&dev->cmd.event_sem);
+
+	kfree(dev->cmd.context);
+
+	up(&dev->cmd.poll_sem);
+}
+
+struct mthca_mailbox *mthca_alloc_mailbox(struct mthca_dev *dev,
+					  gfp_t gfp_mask)
+{
+	struct mthca_mailbox *mailbox;
+
+	mailbox = kmalloc(sizeof *mailbox, gfp_mask);
+	if (!mailbox)
+		return ERR_PTR(-ENOMEM);
+
+	mailbox->buf = pci_pool_alloc(dev->cmd.pool, gfp_mask, &mailbox->dma);
+	if (!mailbox->buf) {
+		kfree(mailbox);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	return mailbox;
+}
+
+void mthca_free_mailbox(struct mthca_dev *dev, struct mthca_mailbox *mailbox)
+{
+	if (!mailbox)
+		return;
+
+	pci_pool_free(dev->cmd.pool, mailbox->buf, mailbox->dma);
+	kfree(mailbox);
+}
+
+int mthca_SYS_EN(struct mthca_dev *dev)
+{
+	u64 out;
+	int ret;
+
+	ret = mthca_cmd_imm(dev, 0, &out, 0, 0, CMD_SYS_EN, CMD_TIME_CLASS_D);
+
+	if (ret == -ENOMEM)
+		mthca_warn(dev, "SYS_EN DDR error: syn=%x, sock=%d, "
+			   "sladdr=%d, SPD source=%s\n",
+			   (int) (out >> 6) & 0xf, (int) (out >> 4) & 3,
+			   (int) (out >> 1) & 7, (int) out & 1 ? "NVMEM" : "DIMM");
+
+	return ret;
+}
+
+int mthca_SYS_DIS(struct mthca_dev *dev)
+{
+	return mthca_cmd(dev, 0, 0, 0, CMD_SYS_DIS, CMD_TIME_CLASS_C);
+}
+
+static int mthca_map_cmd(struct mthca_dev *dev, u16 op, struct mthca_icm *icm,
+			 u64 virt)
+{
+	struct mthca_mailbox *mailbox;
+	struct mthca_icm_iter iter;
+	__be64 *pages;
+	int lg;
+	int nent = 0;
+	int i;
+	int err = 0;
+	int ts = 0, tc = 0;
+
+	mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	memset(mailbox->buf, 0, MTHCA_MAILBOX_SIZE);
+	pages = mailbox->buf;
+
+	for (mthca_icm_first(icm, &iter);
+	     !mthca_icm_last(&iter);
+	     mthca_icm_next(&iter)) {
+		/*
+		 * We have to pass pages that are aligned to their
+		 * size, so find the least significant 1 in the
+		 * address or size and use that as our log2 size.
+		 */
+		lg = ffs(mthca_icm_addr(&iter) | mthca_icm_size(&iter)) - 1;
+		if (lg < MTHCA_ICM_PAGE_SHIFT) {
+			mthca_warn(dev, "Got FW area not aligned to %d (%llx/%lx).\n",
+				   MTHCA_ICM_PAGE_SIZE,
+				   (unsigned long long) mthca_icm_addr(&iter),
+				   mthca_icm_size(&iter));
+			err = -EINVAL;
+			goto out;
+		}
+		for (i = 0; i < mthca_icm_size(&iter) >> lg; ++i) {
+			if (virt != -1) {
+				pages[nent * 2] = cpu_to_be64(virt);
+				virt += 1 << lg;
+			}
+
+			pages[nent * 2 + 1] =
+				cpu_to_be64((mthca_icm_addr(&iter) + (i << lg)) |
+					    (lg - MTHCA_ICM_PAGE_SHIFT));
+			ts += 1 << (lg - 10);
+			++tc;
+
+			if (++nent == MTHCA_MAILBOX_SIZE / 16) {
+				err = mthca_cmd(dev, mailbox->dma, nent, 0, op,
+						CMD_TIME_CLASS_B);
+				if (err)
+					goto out;
+				nent = 0;
+			}
+		}
+	}
+
+	if (nent)
+		err = mthca_cmd(dev, mailbox->dma, nent, 0, op,
+				CMD_TIME_CLASS_B);
+
+	switch (op) {
+	case CMD_MAP_FA:
+		mthca_dbg(dev, "Mapped %d chunks/%d KB for FW.\n", tc, ts);
+		break;
+	case CMD_MAP_ICM_AUX:
+		mthca_dbg(dev, "Mapped %d chunks/%d KB for ICM aux.\n", tc, ts);
+		break;
+	case CMD_MAP_ICM:
+		mthca_dbg(dev, "Mapped %d chunks/%d KB at %llx for ICM.\n",
+			  tc, ts, (unsigned long long) virt - (ts << 10));
+		break;
+	}
+
+out:
+	mthca_free_mailbox(dev, mailbox);
+	return err;
+}
+
+int mthca_MAP_FA(struct mthca_dev *dev, struct mthca_icm *icm)
+{
+	return mthca_map_cmd(dev, CMD_MAP_FA, icm, -1);
+}
+
+int mthca_UNMAP_FA(struct mthca_dev *dev)
+{
+	return mthca_cmd(dev, 0, 0, 0, CMD_UNMAP_FA, CMD_TIME_CLASS_B);
+}
+
+int mthca_RUN_FW(struct mthca_dev *dev)
+{
+	return mthca_cmd(dev, 0, 0, 0, CMD_RUN_FW, CMD_TIME_CLASS_A);
+}
+
+static void mthca_setup_cmd_doorbells(struct mthca_dev *dev, u64 base)
+{
+	phys_addr_t addr;
+	u16 max_off = 0;
+	int i;
+
+	for (i = 0; i < 8; ++i)
+		max_off = max(max_off, dev->cmd.dbell_offsets[i]);
+
+	if ((base & PAGE_MASK) != ((base + max_off) & PAGE_MASK)) {
+		mthca_warn(dev, "Firmware doorbell region at 0x%016llx, "
+			   "length 0x%x crosses a page boundary\n",
+			   (unsigned long long) base, max_off);
+		return;
+	}
+
+	addr = pci_resource_start(dev->pdev, 2) +
+		((pci_resource_len(dev->pdev, 2) - 1) & base);
+	dev->cmd.dbell_map = ioremap(addr, max_off + sizeof(u32));
+	if (!dev->cmd.dbell_map)
+		return;
+
+	dev->cmd.flags |= MTHCA_CMD_POST_DOORBELLS;
+	mthca_dbg(dev, "Mapped doorbell page for posting FW commands\n");
+}
+
+int mthca_QUERY_FW(struct mthca_dev *dev)
+{
+	struct mthca_mailbox *mailbox;
+	u32 *outbox;
+	u64 base;
+	u32 tmp;
+	int err = 0;
+	u8 lg;
+	int i;
+
+#define QUERY_FW_OUT_SIZE             0x100
+#define QUERY_FW_VER_OFFSET            0x00
+#define QUERY_FW_MAX_CMD_OFFSET        0x0f
+#define QUERY_FW_ERR_START_OFFSET      0x30
+#define QUERY_FW_ERR_SIZE_OFFSET       0x38
+
+#define QUERY_FW_CMD_DB_EN_OFFSET      0x10
+#define QUERY_FW_CMD_DB_OFFSET         0x50
+#define QUERY_FW_CMD_DB_BASE           0x60
+
+#define QUERY_FW_START_OFFSET          0x20
+#define QUERY_FW_END_OFFSET            0x28
+
+#define QUERY_FW_SIZE_OFFSET           0x00
+#define QUERY_FW_CLR_INT_BASE_OFFSET   0x20
+#define QUERY_FW_EQ_ARM_BASE_OFFSET    0x40
+#define QUERY_FW_EQ_SET_CI_BASE_OFFSET 0x48
+
+	mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	outbox = mailbox->buf;
+
+	err = mthca_cmd_box(dev, 0, mailbox->dma, 0, 0, CMD_QUERY_FW,
+			    CMD_TIME_CLASS_A);
+
+	if (err)
+		goto out;
+
+	MTHCA_GET(dev->fw_ver,   outbox, QUERY_FW_VER_OFFSET);
+	/*
+	 * FW subminor version is at more significant bits than minor
+	 * version, so swap here.
+	 */
+	dev->fw_ver = (dev->fw_ver & 0xffff00000000ull) |
+		((dev->fw_ver & 0xffff0000ull) >> 16) |
+		((dev->fw_ver & 0x0000ffffull) << 16);
+
+	MTHCA_GET(lg, outbox, QUERY_FW_MAX_CMD_OFFSET);
+	dev->cmd.max_cmds = 1 << lg;
+
+	mthca_dbg(dev, "FW version %012llx, max commands %d\n",
+		  (unsigned long long) dev->fw_ver, dev->cmd.max_cmds);
+
+	MTHCA_GET(dev->catas_err.addr, outbox, QUERY_FW_ERR_START_OFFSET);
+	MTHCA_GET(dev->catas_err.size, outbox, QUERY_FW_ERR_SIZE_OFFSET);
+
+	mthca_dbg(dev, "Catastrophic error buffer at 0x%llx, size 0x%x\n",
+		  (unsigned long long) dev->catas_err.addr, dev->catas_err.size);
+
+	MTHCA_GET(tmp, outbox, QUERY_FW_CMD_DB_EN_OFFSET);
+	if (tmp & 0x1) {
+		mthca_dbg(dev, "FW supports commands through doorbells\n");
+
+		MTHCA_GET(base, outbox, QUERY_FW_CMD_DB_BASE);
+		for (i = 0; i < MTHCA_CMD_NUM_DBELL_DWORDS; ++i)
+			MTHCA_GET(dev->cmd.dbell_offsets[i], outbox,
+				  QUERY_FW_CMD_DB_OFFSET + (i << 1));
+
+		mthca_setup_cmd_doorbells(dev, base);
+	}
+
+	if (mthca_is_memfree(dev)) {
+		MTHCA_GET(dev->fw.arbel.fw_pages,       outbox, QUERY_FW_SIZE_OFFSET);
+		MTHCA_GET(dev->fw.arbel.clr_int_base,   outbox, QUERY_FW_CLR_INT_BASE_OFFSET);
+		MTHCA_GET(dev->fw.arbel.eq_arm_base,    outbox, QUERY_FW_EQ_ARM_BASE_OFFSET);
+		MTHCA_GET(dev->fw.arbel.eq_set_ci_base, outbox, QUERY_FW_EQ_SET_CI_BASE_OFFSET);
+		mthca_dbg(dev, "FW size %d KB\n", dev->fw.arbel.fw_pages << 2);
+
+		/*
+		 * Round up number of system pages needed in case
+		 * MTHCA_ICM_PAGE_SIZE < PAGE_SIZE.
+		 */
+		dev->fw.arbel.fw_pages =
+			ALIGN(dev->fw.arbel.fw_pages, PAGE_SIZE / MTHCA_ICM_PAGE_SIZE) >>
+				(PAGE_SHIFT - MTHCA_ICM_PAGE_SHIFT);
+
+		mthca_dbg(dev, "Clear int @ %llx, EQ arm @ %llx, EQ set CI @ %llx\n",
+			  (unsigned long long) dev->fw.arbel.clr_int_base,
+			  (unsigned long long) dev->fw.arbel.eq_arm_base,
+			  (unsigned long long) dev->fw.arbel.eq_set_ci_base);
+	} else {
+		MTHCA_GET(dev->fw.tavor.fw_start, outbox, QUERY_FW_START_OFFSET);
+		MTHCA_GET(dev->fw.tavor.fw_end,   outbox, QUERY_FW_END_OFFSET);
+
+		mthca_dbg(dev, "FW size %d KB (start %llx, end %llx)\n",
+			  (int) ((dev->fw.tavor.fw_end - dev->fw.tavor.fw_start) >> 10),
+			  (unsigned long long) dev->fw.tavor.fw_start,
+			  (unsigned long long) dev->fw.tavor.fw_end);
+	}
+
+out:
+	mthca_free_mailbox(dev, mailbox);
+	return err;
+}
+
+int mthca_ENABLE_LAM(struct mthca_dev *dev)
+{
+	struct mthca_mailbox *mailbox;
+	u8 info;
+	u32 *outbox;
+	int err = 0;
+
+#define ENABLE_LAM_OUT_SIZE         0x100
+#define ENABLE_LAM_START_OFFSET     0x00
+#define ENABLE_LAM_END_OFFSET       0x08
+#define ENABLE_LAM_INFO_OFFSET      0x13
+
+#define ENABLE_LAM_INFO_HIDDEN_FLAG (1 << 4)
+#define ENABLE_LAM_INFO_ECC_MASK    0x3
+
+	mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	outbox = mailbox->buf;
+
+	err = mthca_cmd_box(dev, 0, mailbox->dma, 0, 0, CMD_ENABLE_LAM,
+			    CMD_TIME_CLASS_C);
+
+	if (err)
+		goto out;
+
+	MTHCA_GET(dev->ddr_start, outbox, ENABLE_LAM_START_OFFSET);
+	MTHCA_GET(dev->ddr_end,   outbox, ENABLE_LAM_END_OFFSET);
+	MTHCA_GET(info,           outbox, ENABLE_LAM_INFO_OFFSET);
+
+	if (!!(info & ENABLE_LAM_INFO_HIDDEN_FLAG) !=
+	    !!(dev->mthca_flags & MTHCA_FLAG_DDR_HIDDEN)) {
+		mthca_info(dev, "FW reports that HCA-attached memory "
+			   "is %s hidden; does not match PCI config\n",
+			   (info & ENABLE_LAM_INFO_HIDDEN_FLAG) ?
+			   "" : "not");
+	}
+	if (info & ENABLE_LAM_INFO_HIDDEN_FLAG)
+		mthca_dbg(dev, "HCA-attached memory is hidden.\n");
+
+	mthca_dbg(dev, "HCA memory size %d KB (start %llx, end %llx)\n",
+		  (int) ((dev->ddr_end - dev->ddr_start) >> 10),
+		  (unsigned long long) dev->ddr_start,
+		  (unsigned long long) dev->ddr_end);
+
+out:
+	mthca_free_mailbox(dev, mailbox);
+	return err;
+}
+
+int mthca_DISABLE_LAM(struct mthca_dev *dev)
+{
+	return mthca_cmd(dev, 0, 0, 0, CMD_SYS_DIS, CMD_TIME_CLASS_C);
+}
+
+int mthca_QUERY_DDR(struct mthca_dev *dev)
+{
+	struct mthca_mailbox *mailbox;
+	u8 info;
+	u32 *outbox;
+	int err = 0;
+
+#define QUERY_DDR_OUT_SIZE         0x100
+#define QUERY_DDR_START_OFFSET     0x00
+#define QUERY_DDR_END_OFFSET       0x08
+#define QUERY_DDR_INFO_OFFSET      0x13
+
+#define QUERY_DDR_INFO_HIDDEN_FLAG (1 << 4)
+#define QUERY_DDR_INFO_ECC_MASK    0x3
+
+	mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	outbox = mailbox->buf;
+
+	err = mthca_cmd_box(dev, 0, mailbox->dma, 0, 0, CMD_QUERY_DDR,
+			    CMD_TIME_CLASS_A);
+
+	if (err)
+		goto out;
+
+	MTHCA_GET(dev->ddr_start, outbox, QUERY_DDR_START_OFFSET);
+	MTHCA_GET(dev->ddr_end,   outbox, QUERY_DDR_END_OFFSET);
+	MTHCA_GET(info,           outbox, QUERY_DDR_INFO_OFFSET);
+
+	if (!!(info & QUERY_DDR_INFO_HIDDEN_FLAG) !=
+	    !!(dev->mthca_flags & MTHCA_FLAG_DDR_HIDDEN)) {
+		mthca_info(dev, "FW reports that HCA-attached memory "
+			   "is %s hidden; does not match PCI config\n",
+			   (info & QUERY_DDR_INFO_HIDDEN_FLAG) ?
+			   "" : "not");
+	}
+	if (info & QUERY_DDR_INFO_HIDDEN_FLAG)
+		mthca_dbg(dev, "HCA-attached memory is hidden.\n");
+
+	mthca_dbg(dev, "HCA memory size %d KB (start %llx, end %llx)\n",
+		  (int) ((dev->ddr_end - dev->ddr_start) >> 10),
+		  (unsigned long long) dev->ddr_start,
+		  (unsigned long long) dev->ddr_end);
+
+out:
+	mthca_free_mailbox(dev, mailbox);
+	return err;
+}
+
+int mthca_QUERY_DEV_LIM(struct mthca_dev *dev,
+			struct mthca_dev_lim *dev_lim)
+{
+	struct mthca_mailbox *mailbox;
+	u32 *outbox;
+	u8 field;
+	u16 size;
+	u16 stat_rate;
+	int err;
+
+#define QUERY_DEV_LIM_OUT_SIZE             0x100
+#define QUERY_DEV_LIM_MAX_SRQ_SZ_OFFSET     0x10
+#define QUERY_DEV_LIM_MAX_QP_SZ_OFFSET      0x11
+#define QUERY_DEV_LIM_RSVD_QP_OFFSET        0x12
+#define QUERY_DEV_LIM_MAX_QP_OFFSET         0x13
+#define QUERY_DEV_LIM_RSVD_SRQ_OFFSET       0x14
+#define QUERY_DEV_LIM_MAX_SRQ_OFFSET        0x15
+#define QUERY_DEV_LIM_RSVD_EEC_OFFSET       0x16
+#define QUERY_DEV_LIM_MAX_EEC_OFFSET        0x17
+#define QUERY_DEV_LIM_MAX_CQ_SZ_OFFSET      0x19
+#define QUERY_DEV_LIM_RSVD_CQ_OFFSET        0x1a
+#define QUERY_DEV_LIM_MAX_CQ_OFFSET         0x1b
+#define QUERY_DEV_LIM_MAX_MPT_OFFSET        0x1d
+#define QUERY_DEV_LIM_RSVD_EQ_OFFSET        0x1e
+#define QUERY_DEV_LIM_MAX_EQ_OFFSET         0x1f
+#define QUERY_DEV_LIM_RSVD_MTT_OFFSET       0x20
+#define QUERY_DEV_LIM_MAX_MRW_SZ_OFFSET     0x21
+#define QUERY_DEV_LIM_RSVD_MRW_OFFSET       0x22
+#define QUERY_DEV_LIM_MAX_MTT_SEG_OFFSET    0x23
+#define QUERY_DEV_LIM_MAX_AV_OFFSET         0x27
+#define QUERY_DEV_LIM_MAX_REQ_QP_OFFSET     0x29
+#define QUERY_DEV_LIM_MAX_RES_QP_OFFSET     0x2b
+#define QUERY_DEV_LIM_MAX_RDMA_OFFSET       0x2f
+#define QUERY_DEV_LIM_RSZ_SRQ_OFFSET        0x33
+#define QUERY_DEV_LIM_ACK_DELAY_OFFSET      0x35
+#define QUERY_DEV_LIM_MTU_WIDTH_OFFSET      0x36
+#define QUERY_DEV_LIM_VL_PORT_OFFSET        0x37
+#define QUERY_DEV_LIM_MAX_GID_OFFSET        0x3b
+#define QUERY_DEV_LIM_RATE_SUPPORT_OFFSET   0x3c
+#define QUERY_DEV_LIM_MAX_PKEY_OFFSET       0x3f
+#define QUERY_DEV_LIM_FLAGS_OFFSET          0x44
+#define QUERY_DEV_LIM_RSVD_UAR_OFFSET       0x48
+#define QUERY_DEV_LIM_UAR_SZ_OFFSET         0x49
+#define QUERY_DEV_LIM_PAGE_SZ_OFFSET        0x4b
+#define QUERY_DEV_LIM_MAX_SG_OFFSET         0x51
+#define QUERY_DEV_LIM_MAX_DESC_SZ_OFFSET    0x52
+#define QUERY_DEV_LIM_MAX_SG_RQ_OFFSET      0x55
+#define QUERY_DEV_LIM_MAX_DESC_SZ_RQ_OFFSET 0x56
+#define QUERY_DEV_LIM_MAX_QP_MCG_OFFSET     0x61
+#define QUERY_DEV_LIM_RSVD_MCG_OFFSET       0x62
+#define QUERY_DEV_LIM_MAX_MCG_OFFSET        0x63
+#define QUERY_DEV_LIM_RSVD_PD_OFFSET        0x64
+#define QUERY_DEV_LIM_MAX_PD_OFFSET         0x65
+#define QUERY_DEV_LIM_RSVD_RDD_OFFSET       0x66
+#define QUERY_DEV_LIM_MAX_RDD_OFFSET        0x67
+#define QUERY_DEV_LIM_EEC_ENTRY_SZ_OFFSET   0x80
+#define QUERY_DEV_LIM_QPC_ENTRY_SZ_OFFSET   0x82
+#define QUERY_DEV_LIM_EEEC_ENTRY_SZ_OFFSET  0x84
+#define QUERY_DEV_LIM_EQPC_ENTRY_SZ_OFFSET  0x86
+#define QUERY_DEV_LIM_EQC_ENTRY_SZ_OFFSET   0x88
+#define QUERY_DEV_LIM_CQC_ENTRY_SZ_OFFSET   0x8a
+#define QUERY_DEV_LIM_SRQ_ENTRY_SZ_OFFSET   0x8c
+#define QUERY_DEV_LIM_UAR_ENTRY_SZ_OFFSET   0x8e
+#define QUERY_DEV_LIM_MTT_ENTRY_SZ_OFFSET   0x90
+#define QUERY_DEV_LIM_MPT_ENTRY_SZ_OFFSET   0x92
+#define QUERY_DEV_LIM_PBL_SZ_OFFSET         0x96
+#define QUERY_DEV_LIM_BMME_FLAGS_OFFSET     0x97
+#define QUERY_DEV_LIM_RSVD_LKEY_OFFSET      0x98
+#define QUERY_DEV_LIM_LAMR_OFFSET           0x9f
+#define QUERY_DEV_LIM_MAX_ICM_SZ_OFFSET     0xa0
+
+	mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	outbox = mailbox->buf;
+
+	err = mthca_cmd_box(dev, 0, mailbox->dma, 0, 0, CMD_QUERY_DEV_LIM,
+			    CMD_TIME_CLASS_A);
+
+	if (err)
+		goto out;
+
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_QP_OFFSET);
+	dev_lim->reserved_qps = 1 << (field & 0xf);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_QP_OFFSET);
+	dev_lim->max_qps = 1 << (field & 0x1f);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_SRQ_OFFSET);
+	dev_lim->reserved_srqs = 1 << (field >> 4);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_SRQ_OFFSET);
+	dev_lim->max_srqs = 1 << (field & 0x1f);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_EEC_OFFSET);
+	dev_lim->reserved_eecs = 1 << (field & 0xf);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_EEC_OFFSET);
+	dev_lim->max_eecs = 1 << (field & 0x1f);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_CQ_SZ_OFFSET);
+	dev_lim->max_cq_sz = 1 << field;
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_CQ_OFFSET);
+	dev_lim->reserved_cqs = 1 << (field & 0xf);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_CQ_OFFSET);
+	dev_lim->max_cqs = 1 << (field & 0x1f);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_MPT_OFFSET);
+	dev_lim->max_mpts = 1 << (field & 0x3f);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_EQ_OFFSET);
+	dev_lim->reserved_eqs = 1 << (field & 0xf);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_EQ_OFFSET);
+	dev_lim->max_eqs = 1 << (field & 0x7);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_MTT_OFFSET);
+	if (mthca_is_memfree(dev))
+		dev_lim->reserved_mtts = ALIGN((1 << (field >> 4)) * sizeof(u64),
+					       dev->limits.mtt_seg_size) / dev->limits.mtt_seg_size;
+	else
+		dev_lim->reserved_mtts = 1 << (field >> 4);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_MRW_SZ_OFFSET);
+	dev_lim->max_mrw_sz = 1 << field;
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_MRW_OFFSET);
+	dev_lim->reserved_mrws = 1 << (field & 0xf);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_MTT_SEG_OFFSET);
+	dev_lim->max_mtt_seg = 1 << (field & 0x3f);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_REQ_QP_OFFSET);
+	dev_lim->max_requester_per_qp = 1 << (field & 0x3f);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_RES_QP_OFFSET);
+	dev_lim->max_responder_per_qp = 1 << (field & 0x3f);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_RDMA_OFFSET);
+	dev_lim->max_rdma_global = 1 << (field & 0x3f);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_ACK_DELAY_OFFSET);
+	dev_lim->local_ca_ack_delay = field & 0x1f;
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MTU_WIDTH_OFFSET);
+	dev_lim->max_mtu        = field >> 4;
+	dev_lim->max_port_width = field & 0xf;
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_VL_PORT_OFFSET);
+	dev_lim->max_vl    = field >> 4;
+	dev_lim->num_ports = field & 0xf;
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_GID_OFFSET);
+	dev_lim->max_gids = 1 << (field & 0xf);
+	MTHCA_GET(stat_rate, outbox, QUERY_DEV_LIM_RATE_SUPPORT_OFFSET);
+	dev_lim->stat_rate_support = stat_rate;
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_PKEY_OFFSET);
+	dev_lim->max_pkeys = 1 << (field & 0xf);
+	MTHCA_GET(dev_lim->flags, outbox, QUERY_DEV_LIM_FLAGS_OFFSET);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_UAR_OFFSET);
+	dev_lim->reserved_uars = field >> 4;
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_UAR_SZ_OFFSET);
+	dev_lim->uar_size = 1 << ((field & 0x3f) + 20);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_PAGE_SZ_OFFSET);
+	dev_lim->min_page_sz = 1 << field;
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_SG_OFFSET);
+	dev_lim->max_sg = field;
+
+	MTHCA_GET(size, outbox, QUERY_DEV_LIM_MAX_DESC_SZ_OFFSET);
+	dev_lim->max_desc_sz = size;
+
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_QP_MCG_OFFSET);
+	dev_lim->max_qp_per_mcg = 1 << field;
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_MCG_OFFSET);
+	dev_lim->reserved_mgms = field & 0xf;
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_MCG_OFFSET);
+	dev_lim->max_mcgs = 1 << field;
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_PD_OFFSET);
+	dev_lim->reserved_pds = field >> 4;
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_PD_OFFSET);
+	dev_lim->max_pds = 1 << (field & 0x3f);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_RDD_OFFSET);
+	dev_lim->reserved_rdds = field >> 4;
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_RDD_OFFSET);
+	dev_lim->max_rdds = 1 << (field & 0x3f);
+
+	MTHCA_GET(size, outbox, QUERY_DEV_LIM_EEC_ENTRY_SZ_OFFSET);
+	dev_lim->eec_entry_sz = size;
+	MTHCA_GET(size, outbox, QUERY_DEV_LIM_QPC_ENTRY_SZ_OFFSET);
+	dev_lim->qpc_entry_sz = size;
+	MTHCA_GET(size, outbox, QUERY_DEV_LIM_EEEC_ENTRY_SZ_OFFSET);
+	dev_lim->eeec_entry_sz = size;
+	MTHCA_GET(size, outbox, QUERY_DEV_LIM_EQPC_ENTRY_SZ_OFFSET);
+	dev_lim->eqpc_entry_sz = size;
+	MTHCA_GET(size, outbox, QUERY_DEV_LIM_EQC_ENTRY_SZ_OFFSET);
+	dev_lim->eqc_entry_sz = size;
+	MTHCA_GET(size, outbox, QUERY_DEV_LIM_CQC_ENTRY_SZ_OFFSET);
+	dev_lim->cqc_entry_sz = size;
+	MTHCA_GET(size, outbox, QUERY_DEV_LIM_SRQ_ENTRY_SZ_OFFSET);
+	dev_lim->srq_entry_sz = size;
+	MTHCA_GET(size, outbox, QUERY_DEV_LIM_UAR_ENTRY_SZ_OFFSET);
+	dev_lim->uar_scratch_entry_sz = size;
+
+	if (mthca_is_memfree(dev)) {
+		MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_SRQ_SZ_OFFSET);
+		dev_lim->max_srq_sz = 1 << field;
+		MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_QP_SZ_OFFSET);
+		dev_lim->max_qp_sz = 1 << field;
+		MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSZ_SRQ_OFFSET);
+		dev_lim->hca.arbel.resize_srq = field & 1;
+		MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_SG_RQ_OFFSET);
+		dev_lim->max_sg = min_t(int, field, dev_lim->max_sg);
+		MTHCA_GET(size, outbox, QUERY_DEV_LIM_MAX_DESC_SZ_RQ_OFFSET);
+		dev_lim->max_desc_sz = min_t(int, size, dev_lim->max_desc_sz);
+		MTHCA_GET(size, outbox, QUERY_DEV_LIM_MPT_ENTRY_SZ_OFFSET);
+		dev_lim->mpt_entry_sz = size;
+		MTHCA_GET(field, outbox, QUERY_DEV_LIM_PBL_SZ_OFFSET);
+		dev_lim->hca.arbel.max_pbl_sz = 1 << (field & 0x3f);
+		MTHCA_GET(dev_lim->hca.arbel.bmme_flags, outbox,
+			  QUERY_DEV_LIM_BMME_FLAGS_OFFSET);
+		MTHCA_GET(dev_lim->hca.arbel.reserved_lkey, outbox,
+			  QUERY_DEV_LIM_RSVD_LKEY_OFFSET);
+		MTHCA_GET(field, outbox, QUERY_DEV_LIM_LAMR_OFFSET);
+		dev_lim->hca.arbel.lam_required = field & 1;
+		MTHCA_GET(dev_lim->hca.arbel.max_icm_sz, outbox,
+			  QUERY_DEV_LIM_MAX_ICM_SZ_OFFSET);
+
+		if (dev_lim->hca.arbel.bmme_flags & 1)
+			mthca_dbg(dev, "Base MM extensions: yes "
+				  "(flags %d, max PBL %d, rsvd L_Key %08x)\n",
+				  dev_lim->hca.arbel.bmme_flags,
+				  dev_lim->hca.arbel.max_pbl_sz,
+				  dev_lim->hca.arbel.reserved_lkey);
+		else
+			mthca_dbg(dev, "Base MM extensions: no\n");
+
+		mthca_dbg(dev, "Max ICM size %lld MB\n",
+			  (unsigned long long) dev_lim->hca.arbel.max_icm_sz >> 20);
+	} else {
+		MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_SRQ_SZ_OFFSET);
+		dev_lim->max_srq_sz = (1 << field) - 1;
+		MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_QP_SZ_OFFSET);
+		dev_lim->max_qp_sz = (1 << field) - 1;
+		MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_AV_OFFSET);
+		dev_lim->hca.tavor.max_avs = 1 << (field & 0x3f);
+		dev_lim->mpt_entry_sz = MTHCA_MPT_ENTRY_SIZE;
+	}
+
+	mthca_dbg(dev, "Max QPs: %d, reserved QPs: %d, entry size: %d\n",
+		  dev_lim->max_qps, dev_lim->reserved_qps, dev_lim->qpc_entry_sz);
+	mthca_dbg(dev, "Max SRQs: %d, reserved SRQs: %d, entry size: %d\n",
+		  dev_lim->max_srqs, dev_lim->reserved_srqs, dev_lim->srq_entry_sz);
+	mthca_dbg(dev, "Max CQs: %d, reserved CQs: %d, entry size: %d\n",
+		  dev_lim->max_cqs, dev_lim->reserved_cqs, dev_lim->cqc_entry_sz);
+	mthca_dbg(dev, "Max EQs: %d, reserved EQs: %d, entry size: %d\n",
+		  dev_lim->max_eqs, dev_lim->reserved_eqs, dev_lim->eqc_entry_sz);
+	mthca_dbg(dev, "reserved MPTs: %d, reserved MTTs: %d\n",
+		  dev_lim->reserved_mrws, dev_lim->reserved_mtts);
+	mthca_dbg(dev, "Max PDs: %d, reserved PDs: %d, reserved UARs: %d\n",
+		  dev_lim->max_pds, dev_lim->reserved_pds, dev_lim->reserved_uars);
+	mthca_dbg(dev, "Max QP/MCG: %d, reserved MGMs: %d\n",
+		  dev_lim->max_pds, dev_lim->reserved_mgms);
+	mthca_dbg(dev, "Max CQEs: %d, max WQEs: %d, max SRQ WQEs: %d\n",
+		  dev_lim->max_cq_sz, dev_lim->max_qp_sz, dev_lim->max_srq_sz);
+
+	mthca_dbg(dev, "Flags: %08x\n", dev_lim->flags);
+
+out:
+	mthca_free_mailbox(dev, mailbox);
+	return err;
+}
+
+static void get_board_id(void *vsd, char *board_id)
+{
+	int i;
+
+#define VSD_OFFSET_SIG1		0x00
+#define VSD_OFFSET_SIG2		0xde
+#define VSD_OFFSET_MLX_BOARD_ID	0xd0
+#define VSD_OFFSET_TS_BOARD_ID	0x20
+
+#define VSD_SIGNATURE_TOPSPIN	0x5ad
+
+	memset(board_id, 0, MTHCA_BOARD_ID_LEN);
+
+	if (be16_to_cpup(vsd + VSD_OFFSET_SIG1) == VSD_SIGNATURE_TOPSPIN &&
+	    be16_to_cpup(vsd + VSD_OFFSET_SIG2) == VSD_SIGNATURE_TOPSPIN) {
+		strlcpy(board_id, vsd + VSD_OFFSET_TS_BOARD_ID, MTHCA_BOARD_ID_LEN);
+	} else {
+		/*
+		 * The board ID is a string but the firmware byte
+		 * swaps each 4-byte word before passing it back to
+		 * us.  Therefore we need to swab it before printing.
+		 */
+		for (i = 0; i < 4; ++i)
+			((u32 *) board_id)[i] =
+				swab32(*(u32 *) (vsd + VSD_OFFSET_MLX_BOARD_ID + i * 4));
+	}
+}
+
+int mthca_QUERY_ADAPTER(struct mthca_dev *dev,
+			struct mthca_adapter *adapter)
+{
+	struct mthca_mailbox *mailbox;
+	u32 *outbox;
+	int err;
+
+#define QUERY_ADAPTER_OUT_SIZE             0x100
+#define QUERY_ADAPTER_VENDOR_ID_OFFSET     0x00
+#define QUERY_ADAPTER_DEVICE_ID_OFFSET     0x04
+#define QUERY_ADAPTER_REVISION_ID_OFFSET   0x08
+#define QUERY_ADAPTER_INTA_PIN_OFFSET      0x10
+#define QUERY_ADAPTER_VSD_OFFSET           0x20
+
+	mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	outbox = mailbox->buf;
+
+	err = mthca_cmd_box(dev, 0, mailbox->dma, 0, 0, CMD_QUERY_ADAPTER,
+			    CMD_TIME_CLASS_A);
+
+	if (err)
+		goto out;
+
+	if (!mthca_is_memfree(dev)) {
+		MTHCA_GET(adapter->vendor_id, outbox,
+			  QUERY_ADAPTER_VENDOR_ID_OFFSET);
+		MTHCA_GET(adapter->device_id, outbox,
+			  QUERY_ADAPTER_DEVICE_ID_OFFSET);
+		MTHCA_GET(adapter->revision_id, outbox,
+			  QUERY_ADAPTER_REVISION_ID_OFFSET);
+	}
+	MTHCA_GET(adapter->inta_pin, outbox,    QUERY_ADAPTER_INTA_PIN_OFFSET);
+
+	get_board_id(outbox + QUERY_ADAPTER_VSD_OFFSET / 4,
+		     adapter->board_id);
+
+out:
+	mthca_free_mailbox(dev, mailbox);
+	return err;
+}
+
+int mthca_INIT_HCA(struct mthca_dev *dev,
+		   struct mthca_init_hca_param *param)
+{
+	struct mthca_mailbox *mailbox;
+	__be32 *inbox;
+	int err;
+
+#define INIT_HCA_IN_SIZE             	 0x200
+#define INIT_HCA_FLAGS1_OFFSET           0x00c
+#define INIT_HCA_FLAGS2_OFFSET           0x014
+#define INIT_HCA_QPC_OFFSET          	 0x020
+#define  INIT_HCA_QPC_BASE_OFFSET    	 (INIT_HCA_QPC_OFFSET + 0x10)
+#define  INIT_HCA_LOG_QP_OFFSET      	 (INIT_HCA_QPC_OFFSET + 0x17)
+#define  INIT_HCA_EEC_BASE_OFFSET    	 (INIT_HCA_QPC_OFFSET + 0x20)
+#define  INIT_HCA_LOG_EEC_OFFSET     	 (INIT_HCA_QPC_OFFSET + 0x27)
+#define  INIT_HCA_SRQC_BASE_OFFSET   	 (INIT_HCA_QPC_OFFSET + 0x28)
+#define  INIT_HCA_LOG_SRQ_OFFSET     	 (INIT_HCA_QPC_OFFSET + 0x2f)
+#define  INIT_HCA_CQC_BASE_OFFSET    	 (INIT_HCA_QPC_OFFSET + 0x30)
+#define  INIT_HCA_LOG_CQ_OFFSET      	 (INIT_HCA_QPC_OFFSET + 0x37)
+#define  INIT_HCA_EQPC_BASE_OFFSET   	 (INIT_HCA_QPC_OFFSET + 0x40)
+#define  INIT_HCA_EEEC_BASE_OFFSET   	 (INIT_HCA_QPC_OFFSET + 0x50)
+#define  INIT_HCA_EQC_BASE_OFFSET    	 (INIT_HCA_QPC_OFFSET + 0x60)
+#define  INIT_HCA_LOG_EQ_OFFSET      	 (INIT_HCA_QPC_OFFSET + 0x67)
+#define  INIT_HCA_RDB_BASE_OFFSET    	 (INIT_HCA_QPC_OFFSET + 0x70)
+#define INIT_HCA_UDAV_OFFSET         	 0x0b0
+#define  INIT_HCA_UDAV_LKEY_OFFSET   	 (INIT_HCA_UDAV_OFFSET + 0x0)
+#define  INIT_HCA_UDAV_PD_OFFSET     	 (INIT_HCA_UDAV_OFFSET + 0x4)
+#define INIT_HCA_MCAST_OFFSET        	 0x0c0
+#define  INIT_HCA_MC_BASE_OFFSET         (INIT_HCA_MCAST_OFFSET + 0x00)
+#define  INIT_HCA_LOG_MC_ENTRY_SZ_OFFSET (INIT_HCA_MCAST_OFFSET + 0x12)
+#define  INIT_HCA_MC_HASH_SZ_OFFSET      (INIT_HCA_MCAST_OFFSET + 0x16)
+#define  INIT_HCA_LOG_MC_TABLE_SZ_OFFSET (INIT_HCA_MCAST_OFFSET + 0x1b)
+#define INIT_HCA_TPT_OFFSET              0x0f0
+#define  INIT_HCA_MPT_BASE_OFFSET        (INIT_HCA_TPT_OFFSET + 0x00)
+#define  INIT_HCA_MTT_SEG_SZ_OFFSET      (INIT_HCA_TPT_OFFSET + 0x09)
+#define  INIT_HCA_LOG_MPT_SZ_OFFSET      (INIT_HCA_TPT_OFFSET + 0x0b)
+#define  INIT_HCA_MTT_BASE_OFFSET        (INIT_HCA_TPT_OFFSET + 0x10)
+#define INIT_HCA_UAR_OFFSET              0x120
+#define  INIT_HCA_UAR_BASE_OFFSET        (INIT_HCA_UAR_OFFSET + 0x00)
+#define  INIT_HCA_UARC_SZ_OFFSET         (INIT_HCA_UAR_OFFSET + 0x09)
+#define  INIT_HCA_LOG_UAR_SZ_OFFSET      (INIT_HCA_UAR_OFFSET + 0x0a)
+#define  INIT_HCA_UAR_PAGE_SZ_OFFSET     (INIT_HCA_UAR_OFFSET + 0x0b)
+#define  INIT_HCA_UAR_SCATCH_BASE_OFFSET (INIT_HCA_UAR_OFFSET + 0x10)
+#define  INIT_HCA_UAR_CTX_BASE_OFFSET    (INIT_HCA_UAR_OFFSET + 0x18)
+
+	mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	inbox = mailbox->buf;
+
+	memset(inbox, 0, INIT_HCA_IN_SIZE);
+
+	if (dev->mthca_flags & MTHCA_FLAG_SINAI_OPT)
+		MTHCA_PUT(inbox, 0x1, INIT_HCA_FLAGS1_OFFSET);
+
+#if defined(__LITTLE_ENDIAN)
+	*(inbox + INIT_HCA_FLAGS2_OFFSET / 4) &= ~cpu_to_be32(1 << 1);
+#elif defined(__BIG_ENDIAN)
+	*(inbox + INIT_HCA_FLAGS2_OFFSET / 4) |= cpu_to_be32(1 << 1);
+#else
+#error Host endianness not defined
+#endif
+	/* Check port for UD address vector: */
+	*(inbox + INIT_HCA_FLAGS2_OFFSET / 4) |= cpu_to_be32(1);
+
+	/* Enable IPoIB checksumming if we can: */
+	if (dev->device_cap_flags & IB_DEVICE_UD_IP_CSUM)
+		*(inbox + INIT_HCA_FLAGS2_OFFSET / 4) |= cpu_to_be32(7 << 3);
+
+	/* We leave wqe_quota, responder_exu, etc as 0 (default) */
+
+	/* QPC/EEC/CQC/EQC/RDB attributes */
+
+	MTHCA_PUT(inbox, param->qpc_base,     INIT_HCA_QPC_BASE_OFFSET);
+	MTHCA_PUT(inbox, param->log_num_qps,  INIT_HCA_LOG_QP_OFFSET);
+	MTHCA_PUT(inbox, param->eec_base,     INIT_HCA_EEC_BASE_OFFSET);
+	MTHCA_PUT(inbox, param->log_num_eecs, INIT_HCA_LOG_EEC_OFFSET);
+	MTHCA_PUT(inbox, param->srqc_base,    INIT_HCA_SRQC_BASE_OFFSET);
+	MTHCA_PUT(inbox, param->log_num_srqs, INIT_HCA_LOG_SRQ_OFFSET);
+	MTHCA_PUT(inbox, param->cqc_base,     INIT_HCA_CQC_BASE_OFFSET);
+	MTHCA_PUT(inbox, param->log_num_cqs,  INIT_HCA_LOG_CQ_OFFSET);
+	MTHCA_PUT(inbox, param->eqpc_base,    INIT_HCA_EQPC_BASE_OFFSET);
+	MTHCA_PUT(inbox, param->eeec_base,    INIT_HCA_EEEC_BASE_OFFSET);
+	MTHCA_PUT(inbox, param->eqc_base,     INIT_HCA_EQC_BASE_OFFSET);
+	MTHCA_PUT(inbox, param->log_num_eqs,  INIT_HCA_LOG_EQ_OFFSET);
+	MTHCA_PUT(inbox, param->rdb_base,     INIT_HCA_RDB_BASE_OFFSET);
+
+	/* UD AV attributes */
+
+	/* multicast attributes */
+
+	MTHCA_PUT(inbox, param->mc_base,         INIT_HCA_MC_BASE_OFFSET);
+	MTHCA_PUT(inbox, param->log_mc_entry_sz, INIT_HCA_LOG_MC_ENTRY_SZ_OFFSET);
+	MTHCA_PUT(inbox, param->mc_hash_sz,      INIT_HCA_MC_HASH_SZ_OFFSET);
+	MTHCA_PUT(inbox, param->log_mc_table_sz, INIT_HCA_LOG_MC_TABLE_SZ_OFFSET);
+
+	/* TPT attributes */
+
+	MTHCA_PUT(inbox, param->mpt_base,   INIT_HCA_MPT_BASE_OFFSET);
+	if (!mthca_is_memfree(dev))
+		MTHCA_PUT(inbox, param->mtt_seg_sz, INIT_HCA_MTT_SEG_SZ_OFFSET);
+	MTHCA_PUT(inbox, param->log_mpt_sz, INIT_HCA_LOG_MPT_SZ_OFFSET);
+	MTHCA_PUT(inbox, param->mtt_base,   INIT_HCA_MTT_BASE_OFFSET);
+
+	/* UAR attributes */
+	{
+		u8 uar_page_sz = PAGE_SHIFT - 12;
+		MTHCA_PUT(inbox, uar_page_sz, INIT_HCA_UAR_PAGE_SZ_OFFSET);
+	}
+
+	MTHCA_PUT(inbox, param->uar_scratch_base, INIT_HCA_UAR_SCATCH_BASE_OFFSET);
+
+	if (mthca_is_memfree(dev)) {
+		MTHCA_PUT(inbox, param->log_uarc_sz, INIT_HCA_UARC_SZ_OFFSET);
+		MTHCA_PUT(inbox, param->log_uar_sz,  INIT_HCA_LOG_UAR_SZ_OFFSET);
+		MTHCA_PUT(inbox, param->uarc_base,   INIT_HCA_UAR_CTX_BASE_OFFSET);
+	}
+
+	err = mthca_cmd(dev, mailbox->dma, 0, 0,
+			CMD_INIT_HCA, CMD_TIME_CLASS_D);
+
+	mthca_free_mailbox(dev, mailbox);
+	return err;
+}
+
+int mthca_INIT_IB(struct mthca_dev *dev,
+		  struct mthca_init_ib_param *param,
+		  int port)
+{
+	struct mthca_mailbox *mailbox;
+	u32 *inbox;
+	int err;
+	u32 flags;
+
+#define INIT_IB_IN_SIZE          56
+#define INIT_IB_FLAGS_OFFSET     0x00
+#define INIT_IB_FLAG_SIG         (1 << 18)
+#define INIT_IB_FLAG_NG          (1 << 17)
+#define INIT_IB_FLAG_G0          (1 << 16)
+#define INIT_IB_VL_SHIFT         4
+#define INIT_IB_PORT_WIDTH_SHIFT 8
+#define INIT_IB_MTU_SHIFT        12
+#define INIT_IB_MAX_GID_OFFSET   0x06
+#define INIT_IB_MAX_PKEY_OFFSET  0x0a
+#define INIT_IB_GUID0_OFFSET     0x10
+#define INIT_IB_NODE_GUID_OFFSET 0x18
+#define INIT_IB_SI_GUID_OFFSET   0x20
+
+	mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	inbox = mailbox->buf;
+
+	memset(inbox, 0, INIT_IB_IN_SIZE);
+
+	flags = 0;
+	flags |= param->set_guid0     ? INIT_IB_FLAG_G0  : 0;
+	flags |= param->set_node_guid ? INIT_IB_FLAG_NG  : 0;
+	flags |= param->set_si_guid   ? INIT_IB_FLAG_SIG : 0;
+	flags |= param->vl_cap << INIT_IB_VL_SHIFT;
+	flags |= param->port_width << INIT_IB_PORT_WIDTH_SHIFT;
+	flags |= param->mtu_cap << INIT_IB_MTU_SHIFT;
+	MTHCA_PUT(inbox, flags, INIT_IB_FLAGS_OFFSET);
+
+	MTHCA_PUT(inbox, param->gid_cap,   INIT_IB_MAX_GID_OFFSET);
+	MTHCA_PUT(inbox, param->pkey_cap,  INIT_IB_MAX_PKEY_OFFSET);
+	MTHCA_PUT(inbox, param->guid0,     INIT_IB_GUID0_OFFSET);
+	MTHCA_PUT(inbox, param->node_guid, INIT_IB_NODE_GUID_OFFSET);
+	MTHCA_PUT(inbox, param->si_guid,   INIT_IB_SI_GUID_OFFSET);
+
+	err = mthca_cmd(dev, mailbox->dma, port, 0, CMD_INIT_IB,
+			CMD_TIME_CLASS_A);
+
+	mthca_free_mailbox(dev, mailbox);
+	return err;
+}
+
+int mthca_CLOSE_IB(struct mthca_dev *dev, int port)
+{
+	return mthca_cmd(dev, 0, port, 0, CMD_CLOSE_IB, CMD_TIME_CLASS_A);
+}
+
+int mthca_CLOSE_HCA(struct mthca_dev *dev, int panic)
+{
+	return mthca_cmd(dev, 0, 0, panic, CMD_CLOSE_HCA, CMD_TIME_CLASS_C);
+}
+
+int mthca_SET_IB(struct mthca_dev *dev, struct mthca_set_ib_param *param,
+		 int port)
+{
+	struct mthca_mailbox *mailbox;
+	u32 *inbox;
+	int err;
+	u32 flags = 0;
+
+#define SET_IB_IN_SIZE         0x40
+#define SET_IB_FLAGS_OFFSET    0x00
+#define SET_IB_FLAG_SIG        (1 << 18)
+#define SET_IB_FLAG_RQK        (1 <<  0)
+#define SET_IB_CAP_MASK_OFFSET 0x04
+#define SET_IB_SI_GUID_OFFSET  0x08
+
+	mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	inbox = mailbox->buf;
+
+	memset(inbox, 0, SET_IB_IN_SIZE);
+
+	flags |= param->set_si_guid     ? SET_IB_FLAG_SIG : 0;
+	flags |= param->reset_qkey_viol ? SET_IB_FLAG_RQK : 0;
+	MTHCA_PUT(inbox, flags, SET_IB_FLAGS_OFFSET);
+
+	MTHCA_PUT(inbox, param->cap_mask, SET_IB_CAP_MASK_OFFSET);
+	MTHCA_PUT(inbox, param->si_guid,  SET_IB_SI_GUID_OFFSET);
+
+	err = mthca_cmd(dev, mailbox->dma, port, 0, CMD_SET_IB,
+			CMD_TIME_CLASS_B);
+
+	mthca_free_mailbox(dev, mailbox);
+	return err;
+}
+
+int mthca_MAP_ICM(struct mthca_dev *dev, struct mthca_icm *icm, u64 virt)
+{
+	return mthca_map_cmd(dev, CMD_MAP_ICM, icm, virt);
+}
+
+int mthca_MAP_ICM_page(struct mthca_dev *dev, u64 dma_addr, u64 virt)
+{
+	struct mthca_mailbox *mailbox;
+	__be64 *inbox;
+	int err;
+
+	mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	inbox = mailbox->buf;
+
+	inbox[0] = cpu_to_be64(virt);
+	inbox[1] = cpu_to_be64(dma_addr);
+
+	err = mthca_cmd(dev, mailbox->dma, 1, 0, CMD_MAP_ICM,
+			CMD_TIME_CLASS_B);
+
+	mthca_free_mailbox(dev, mailbox);
+
+	if (!err)
+		mthca_dbg(dev, "Mapped page at %llx to %llx for ICM.\n",
+			  (unsigned long long) dma_addr, (unsigned long long) virt);
+
+	return err;
+}
+
+int mthca_UNMAP_ICM(struct mthca_dev *dev, u64 virt, u32 page_count)
+{
+	mthca_dbg(dev, "Unmapping %d pages at %llx from ICM.\n",
+		  page_count, (unsigned long long) virt);
+
+	return mthca_cmd(dev, virt, page_count, 0,
+			CMD_UNMAP_ICM, CMD_TIME_CLASS_B);
+}
+
+int mthca_MAP_ICM_AUX(struct mthca_dev *dev, struct mthca_icm *icm)
+{
+	return mthca_map_cmd(dev, CMD_MAP_ICM_AUX, icm, -1);
+}
+
+int mthca_UNMAP_ICM_AUX(struct mthca_dev *dev)
+{
+	return mthca_cmd(dev, 0, 0, 0, CMD_UNMAP_ICM_AUX, CMD_TIME_CLASS_B);
+}
+
+int mthca_SET_ICM_SIZE(struct mthca_dev *dev, u64 icm_size, u64 *aux_pages)
+{
+	int ret = mthca_cmd_imm(dev, icm_size, aux_pages, 0,
+			0, CMD_SET_ICM_SIZE, CMD_TIME_CLASS_A);
+
+	if (ret)
+		return ret;
+
+	/*
+	 * Round up number of system pages needed in case
+	 * MTHCA_ICM_PAGE_SIZE < PAGE_SIZE.
+	 */
+	*aux_pages = ALIGN(*aux_pages, PAGE_SIZE / MTHCA_ICM_PAGE_SIZE) >>
+		(PAGE_SHIFT - MTHCA_ICM_PAGE_SHIFT);
+
+	return 0;
+}
+
+int mthca_SW2HW_MPT(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+		    int mpt_index)
+{
+	return mthca_cmd(dev, mailbox->dma, mpt_index, 0, CMD_SW2HW_MPT,
+			 CMD_TIME_CLASS_B);
+}
+
+int mthca_HW2SW_MPT(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+		    int mpt_index)
+{
+	return mthca_cmd_box(dev, 0, mailbox ? mailbox->dma : 0, mpt_index,
+			     !mailbox, CMD_HW2SW_MPT,
+			     CMD_TIME_CLASS_B);
+}
+
+int mthca_WRITE_MTT(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+		    int num_mtt)
+{
+	return mthca_cmd(dev, mailbox->dma, num_mtt, 0, CMD_WRITE_MTT,
+			 CMD_TIME_CLASS_B);
+}
+
+int mthca_SYNC_TPT(struct mthca_dev *dev)
+{
+	return mthca_cmd(dev, 0, 0, 0, CMD_SYNC_TPT, CMD_TIME_CLASS_B);
+}
+
+int mthca_MAP_EQ(struct mthca_dev *dev, u64 event_mask, int unmap,
+		 int eq_num)
+{
+	mthca_dbg(dev, "%s mask %016llx for eqn %d\n",
+		  unmap ? "Clearing" : "Setting",
+		  (unsigned long long) event_mask, eq_num);
+	return mthca_cmd(dev, event_mask, (unmap << 31) | eq_num,
+			 0, CMD_MAP_EQ, CMD_TIME_CLASS_B);
+}
+
+int mthca_SW2HW_EQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+		   int eq_num)
+{
+	return mthca_cmd(dev, mailbox->dma, eq_num, 0, CMD_SW2HW_EQ,
+			 CMD_TIME_CLASS_A);
+}
+
+int mthca_HW2SW_EQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+		   int eq_num)
+{
+	return mthca_cmd_box(dev, 0, mailbox->dma, eq_num, 0,
+			     CMD_HW2SW_EQ,
+			     CMD_TIME_CLASS_A);
+}
+
+int mthca_SW2HW_CQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+		   int cq_num)
+{
+	return mthca_cmd(dev, mailbox->dma, cq_num, 0, CMD_SW2HW_CQ,
+			CMD_TIME_CLASS_A);
+}
+
+int mthca_HW2SW_CQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+		   int cq_num)
+{
+	return mthca_cmd_box(dev, 0, mailbox->dma, cq_num, 0,
+			     CMD_HW2SW_CQ,
+			     CMD_TIME_CLASS_A);
+}
+
+int mthca_RESIZE_CQ(struct mthca_dev *dev, int cq_num, u32 lkey, u8 log_size)
+{
+	struct mthca_mailbox *mailbox;
+	__be32 *inbox;
+	int err;
+
+#define RESIZE_CQ_IN_SIZE		0x40
+#define RESIZE_CQ_LOG_SIZE_OFFSET	0x0c
+#define RESIZE_CQ_LKEY_OFFSET		0x1c
+
+	mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	inbox = mailbox->buf;
+
+	memset(inbox, 0, RESIZE_CQ_IN_SIZE);
+	/*
+	 * Leave start address fields zeroed out -- mthca assumes that
+	 * MRs for CQs always start at virtual address 0.
+	 */
+	MTHCA_PUT(inbox, log_size, RESIZE_CQ_LOG_SIZE_OFFSET);
+	MTHCA_PUT(inbox, lkey,     RESIZE_CQ_LKEY_OFFSET);
+
+	err = mthca_cmd(dev, mailbox->dma, cq_num, 1, CMD_RESIZE_CQ,
+			CMD_TIME_CLASS_B);
+
+	mthca_free_mailbox(dev, mailbox);
+	return err;
+}
+
+int mthca_SW2HW_SRQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+		    int srq_num)
+{
+	return mthca_cmd(dev, mailbox->dma, srq_num, 0, CMD_SW2HW_SRQ,
+			CMD_TIME_CLASS_A);
+}
+
+int mthca_HW2SW_SRQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+		    int srq_num)
+{
+	return mthca_cmd_box(dev, 0, mailbox->dma, srq_num, 0,
+			     CMD_HW2SW_SRQ,
+			     CMD_TIME_CLASS_A);
+}
+
+int mthca_QUERY_SRQ(struct mthca_dev *dev, u32 num,
+		    struct mthca_mailbox *mailbox)
+{
+	return mthca_cmd_box(dev, 0, mailbox->dma, num, 0,
+			     CMD_QUERY_SRQ, CMD_TIME_CLASS_A);
+}
+
+int mthca_ARM_SRQ(struct mthca_dev *dev, int srq_num, int limit)
+{
+	return mthca_cmd(dev, limit, srq_num, 0, CMD_ARM_SRQ,
+			 CMD_TIME_CLASS_B);
+}
+
+int mthca_MODIFY_QP(struct mthca_dev *dev, enum ib_qp_state cur,
+		    enum ib_qp_state next, u32 num, int is_ee,
+		    struct mthca_mailbox *mailbox, u32 optmask)
+{
+	static const u16 op[IB_QPS_ERR + 1][IB_QPS_ERR + 1] = {
+		[IB_QPS_RESET] = {
+			[IB_QPS_RESET]	= CMD_ERR2RST_QPEE,
+			[IB_QPS_ERR]	= CMD_2ERR_QPEE,
+			[IB_QPS_INIT]	= CMD_RST2INIT_QPEE,
+		},
+		[IB_QPS_INIT]  = {
+			[IB_QPS_RESET]	= CMD_ERR2RST_QPEE,
+			[IB_QPS_ERR]	= CMD_2ERR_QPEE,
+			[IB_QPS_INIT]	= CMD_INIT2INIT_QPEE,
+			[IB_QPS_RTR]	= CMD_INIT2RTR_QPEE,
+		},
+		[IB_QPS_RTR]   = {
+			[IB_QPS_RESET]	= CMD_ERR2RST_QPEE,
+			[IB_QPS_ERR]	= CMD_2ERR_QPEE,
+			[IB_QPS_RTS]	= CMD_RTR2RTS_QPEE,
+		},
+		[IB_QPS_RTS]   = {
+			[IB_QPS_RESET]	= CMD_ERR2RST_QPEE,
+			[IB_QPS_ERR]	= CMD_2ERR_QPEE,
+			[IB_QPS_RTS]	= CMD_RTS2RTS_QPEE,
+			[IB_QPS_SQD]	= CMD_RTS2SQD_QPEE,
+		},
+		[IB_QPS_SQD] = {
+			[IB_QPS_RESET]	= CMD_ERR2RST_QPEE,
+			[IB_QPS_ERR]	= CMD_2ERR_QPEE,
+			[IB_QPS_RTS]	= CMD_SQD2RTS_QPEE,
+			[IB_QPS_SQD]	= CMD_SQD2SQD_QPEE,
+		},
+		[IB_QPS_SQE] = {
+			[IB_QPS_RESET]	= CMD_ERR2RST_QPEE,
+			[IB_QPS_ERR]	= CMD_2ERR_QPEE,
+			[IB_QPS_RTS]	= CMD_SQERR2RTS_QPEE,
+		},
+		[IB_QPS_ERR] = {
+			[IB_QPS_RESET]	= CMD_ERR2RST_QPEE,
+			[IB_QPS_ERR]	= CMD_2ERR_QPEE,
+		}
+	};
+
+	u8 op_mod = 0;
+	int my_mailbox = 0;
+	int err;
+
+	if (op[cur][next] == CMD_ERR2RST_QPEE) {
+		op_mod = 3;	/* don't write outbox, any->reset */
+
+		/* For debugging */
+		if (!mailbox) {
+			mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+			if (!IS_ERR(mailbox)) {
+				my_mailbox = 1;
+				op_mod     = 2;	/* write outbox, any->reset */
+			} else
+				mailbox = NULL;
+		}
+
+		err = mthca_cmd_box(dev, 0, mailbox ? mailbox->dma : 0,
+				    (!!is_ee << 24) | num, op_mod,
+				    op[cur][next], CMD_TIME_CLASS_C);
+
+		if (0 && mailbox) {
+			int i;
+			mthca_dbg(dev, "Dumping QP context:\n");
+			printk(" %08x\n", be32_to_cpup(mailbox->buf));
+			for (i = 0; i < 0x100 / 4; ++i) {
+				if (i % 8 == 0)
+					printk("[%02x] ", i * 4);
+				printk(" %08x",
+				       be32_to_cpu(((__be32 *) mailbox->buf)[i + 2]));
+				if ((i + 1) % 8 == 0)
+					printk("\n");
+			}
+		}
+
+		if (my_mailbox)
+			mthca_free_mailbox(dev, mailbox);
+	} else {
+		if (0) {
+			int i;
+			mthca_dbg(dev, "Dumping QP context:\n");
+			printk("  opt param mask: %08x\n", be32_to_cpup(mailbox->buf));
+			for (i = 0; i < 0x100 / 4; ++i) {
+				if (i % 8 == 0)
+					printk("  [%02x] ", i * 4);
+				printk(" %08x",
+				       be32_to_cpu(((__be32 *) mailbox->buf)[i + 2]));
+				if ((i + 1) % 8 == 0)
+					printk("\n");
+			}
+		}
+
+		err = mthca_cmd(dev, mailbox->dma, optmask | (!!is_ee << 24) | num,
+				op_mod, op[cur][next], CMD_TIME_CLASS_C);
+	}
+
+	return err;
+}
+
+int mthca_QUERY_QP(struct mthca_dev *dev, u32 num, int is_ee,
+		   struct mthca_mailbox *mailbox)
+{
+	return mthca_cmd_box(dev, 0, mailbox->dma, (!!is_ee << 24) | num, 0,
+			     CMD_QUERY_QPEE, CMD_TIME_CLASS_A);
+}
+
+int mthca_CONF_SPECIAL_QP(struct mthca_dev *dev, int type, u32 qpn)
+{
+	u8 op_mod;
+
+	switch (type) {
+	case IB_QPT_SMI:
+		op_mod = 0;
+		break;
+	case IB_QPT_GSI:
+		op_mod = 1;
+		break;
+	case IB_QPT_RAW_IPV6:
+		op_mod = 2;
+		break;
+	case IB_QPT_RAW_ETHERTYPE:
+		op_mod = 3;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return mthca_cmd(dev, 0, qpn, op_mod, CMD_CONF_SPECIAL_QP,
+			 CMD_TIME_CLASS_B);
+}
+
+int mthca_MAD_IFC(struct mthca_dev *dev, int ignore_mkey, int ignore_bkey,
+		  int port, struct ib_wc *in_wc, struct ib_grh *in_grh,
+		  void *in_mad, void *response_mad)
+{
+	struct mthca_mailbox *inmailbox, *outmailbox;
+	void *inbox;
+	int err;
+	u32 in_modifier = port;
+	u8 op_modifier = 0;
+
+#define MAD_IFC_BOX_SIZE      0x400
+#define MAD_IFC_MY_QPN_OFFSET 0x100
+#define MAD_IFC_RQPN_OFFSET   0x108
+#define MAD_IFC_SL_OFFSET     0x10c
+#define MAD_IFC_G_PATH_OFFSET 0x10d
+#define MAD_IFC_RLID_OFFSET   0x10e
+#define MAD_IFC_PKEY_OFFSET   0x112
+#define MAD_IFC_GRH_OFFSET    0x140
+
+	inmailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+	if (IS_ERR(inmailbox))
+		return PTR_ERR(inmailbox);
+	inbox = inmailbox->buf;
+
+	outmailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+	if (IS_ERR(outmailbox)) {
+		mthca_free_mailbox(dev, inmailbox);
+		return PTR_ERR(outmailbox);
+	}
+
+	memcpy(inbox, in_mad, 256);
+
+	/*
+	 * Key check traps can't be generated unless we have in_wc to
+	 * tell us where to send the trap.
+	 */
+	if (ignore_mkey || !in_wc)
+		op_modifier |= 0x1;
+	if (ignore_bkey || !in_wc)
+		op_modifier |= 0x2;
+
+	if (in_wc) {
+		u8 val;
+
+		memset(inbox + 256, 0, 256);
+
+		MTHCA_PUT(inbox, in_wc->qp->qp_num, MAD_IFC_MY_QPN_OFFSET);
+		MTHCA_PUT(inbox, in_wc->src_qp,     MAD_IFC_RQPN_OFFSET);
+
+		val = in_wc->sl << 4;
+		MTHCA_PUT(inbox, val,               MAD_IFC_SL_OFFSET);
+
+		val = in_wc->dlid_path_bits |
+			(in_wc->wc_flags & IB_WC_GRH ? 0x80 : 0);
+		MTHCA_PUT(inbox, val,               MAD_IFC_G_PATH_OFFSET);
+
+		MTHCA_PUT(inbox, in_wc->slid,       MAD_IFC_RLID_OFFSET);
+		MTHCA_PUT(inbox, in_wc->pkey_index, MAD_IFC_PKEY_OFFSET);
+
+		if (in_grh)
+			memcpy(inbox + MAD_IFC_GRH_OFFSET, in_grh, 40);
+
+		op_modifier |= 0x4;
+
+		in_modifier |= in_wc->slid << 16;
+	}
+
+	err = mthca_cmd_box(dev, inmailbox->dma, outmailbox->dma,
+			    in_modifier, op_modifier,
+			    CMD_MAD_IFC, CMD_TIME_CLASS_C);
+
+	if (!err)
+		memcpy(response_mad, outmailbox->buf, 256);
+
+	mthca_free_mailbox(dev, inmailbox);
+	mthca_free_mailbox(dev, outmailbox);
+	return err;
+}
+
+int mthca_READ_MGM(struct mthca_dev *dev, int index,
+		   struct mthca_mailbox *mailbox)
+{
+	return mthca_cmd_box(dev, 0, mailbox->dma, index, 0,
+			     CMD_READ_MGM, CMD_TIME_CLASS_A);
+}
+
+int mthca_WRITE_MGM(struct mthca_dev *dev, int index,
+		    struct mthca_mailbox *mailbox)
+{
+	return mthca_cmd(dev, mailbox->dma, index, 0, CMD_WRITE_MGM,
+			 CMD_TIME_CLASS_A);
+}
+
+int mthca_MGID_HASH(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+		    u16 *hash)
+{
+	u64 imm;
+	int err;
+
+	err = mthca_cmd_imm(dev, mailbox->dma, &imm, 0, 0, CMD_MGID_HASH,
+			    CMD_TIME_CLASS_A);
+
+	*hash = imm;
+	return err;
+}
+
+int mthca_NOP(struct mthca_dev *dev)
+{
+	return mthca_cmd(dev, 0, 0x1f, 0, CMD_NOP, msecs_to_jiffies(100));
+}
diff --git a/drivers/infiniband/hw/mthca/mthca_cmd.h b/drivers/infiniband/hw/mthca/mthca_cmd.h
new file mode 100644
index 0000000..f952244
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_cmd.h
@@ -0,0 +1,325 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2006 Cisco Systems.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MTHCA_CMD_H
+#define MTHCA_CMD_H
+
+#include <rdma/ib_verbs.h>
+
+#define MTHCA_MAILBOX_SIZE 4096
+
+enum {
+	/* command completed successfully: */
+	MTHCA_CMD_STAT_OK 	      = 0x00,
+	/* Internal error (such as a bus error) occurred while processing command: */
+	MTHCA_CMD_STAT_INTERNAL_ERR   = 0x01,
+	/* Operation/command not supported or opcode modifier not supported: */
+	MTHCA_CMD_STAT_BAD_OP 	      = 0x02,
+	/* Parameter not supported or parameter out of range: */
+	MTHCA_CMD_STAT_BAD_PARAM      = 0x03,
+	/* System not enabled or bad system state: */
+	MTHCA_CMD_STAT_BAD_SYS_STATE  = 0x04,
+	/* Attempt to access reserved or unallocaterd resource: */
+	MTHCA_CMD_STAT_BAD_RESOURCE   = 0x05,
+	/* Requested resource is currently executing a command, or is otherwise busy: */
+	MTHCA_CMD_STAT_RESOURCE_BUSY  = 0x06,
+	/* memory error: */
+	MTHCA_CMD_STAT_DDR_MEM_ERR    = 0x07,
+	/* Required capability exceeds device limits: */
+	MTHCA_CMD_STAT_EXCEED_LIM     = 0x08,
+	/* Resource is not in the appropriate state or ownership: */
+	MTHCA_CMD_STAT_BAD_RES_STATE  = 0x09,
+	/* Index out of range: */
+	MTHCA_CMD_STAT_BAD_INDEX      = 0x0a,
+	/* FW image corrupted: */
+	MTHCA_CMD_STAT_BAD_NVMEM      = 0x0b,
+	/* Attempt to modify a QP/EE which is not in the presumed state: */
+	MTHCA_CMD_STAT_BAD_QPEE_STATE = 0x10,
+	/* Bad segment parameters (Address/Size): */
+	MTHCA_CMD_STAT_BAD_SEG_PARAM  = 0x20,
+	/* Memory Region has Memory Windows bound to: */
+	MTHCA_CMD_STAT_REG_BOUND      = 0x21,
+	/* HCA local attached memory not present: */
+	MTHCA_CMD_STAT_LAM_NOT_PRE    = 0x22,
+	/* Bad management packet (silently discarded): */
+	MTHCA_CMD_STAT_BAD_PKT 	      = 0x30,
+	/* More outstanding CQEs in CQ than new CQ size: */
+	MTHCA_CMD_STAT_BAD_SIZE       = 0x40
+};
+
+enum {
+	MTHCA_TRANS_INVALID = 0,
+	MTHCA_TRANS_RST2INIT,
+	MTHCA_TRANS_INIT2INIT,
+	MTHCA_TRANS_INIT2RTR,
+	MTHCA_TRANS_RTR2RTS,
+	MTHCA_TRANS_RTS2RTS,
+	MTHCA_TRANS_SQERR2RTS,
+	MTHCA_TRANS_ANY2ERR,
+	MTHCA_TRANS_RTS2SQD,
+	MTHCA_TRANS_SQD2SQD,
+	MTHCA_TRANS_SQD2RTS,
+	MTHCA_TRANS_ANY2RST,
+};
+
+enum {
+	DEV_LIM_FLAG_RC                 = 1 << 0,
+	DEV_LIM_FLAG_UC                 = 1 << 1,
+	DEV_LIM_FLAG_UD                 = 1 << 2,
+	DEV_LIM_FLAG_RD                 = 1 << 3,
+	DEV_LIM_FLAG_RAW_IPV6           = 1 << 4,
+	DEV_LIM_FLAG_RAW_ETHER          = 1 << 5,
+	DEV_LIM_FLAG_SRQ                = 1 << 6,
+	DEV_LIM_FLAG_IPOIB_CSUM		= 1 << 7,
+	DEV_LIM_FLAG_BAD_PKEY_CNTR      = 1 << 8,
+	DEV_LIM_FLAG_BAD_QKEY_CNTR      = 1 << 9,
+	DEV_LIM_FLAG_MW                 = 1 << 16,
+	DEV_LIM_FLAG_AUTO_PATH_MIG      = 1 << 17,
+	DEV_LIM_FLAG_ATOMIC             = 1 << 18,
+	DEV_LIM_FLAG_RAW_MULTI          = 1 << 19,
+	DEV_LIM_FLAG_UD_AV_PORT_ENFORCE = 1 << 20,
+	DEV_LIM_FLAG_UD_MULTI           = 1 << 21,
+};
+
+struct mthca_mailbox {
+	dma_addr_t dma;
+	void      *buf;
+};
+
+struct mthca_dev_lim {
+	int max_srq_sz;
+	int max_qp_sz;
+	int reserved_qps;
+	int max_qps;
+	int reserved_srqs;
+	int max_srqs;
+	int reserved_eecs;
+	int max_eecs;
+	int max_cq_sz;
+	int reserved_cqs;
+	int max_cqs;
+	int max_mpts;
+	int reserved_eqs;
+	int max_eqs;
+	int reserved_mtts;
+	int max_mrw_sz;
+	int reserved_mrws;
+	int max_mtt_seg;
+	int max_requester_per_qp;
+	int max_responder_per_qp;
+	int max_rdma_global;
+	int local_ca_ack_delay;
+	int max_mtu;
+	int max_port_width;
+	int max_vl;
+	int num_ports;
+	int max_gids;
+	u16 stat_rate_support;
+	int max_pkeys;
+	u32 flags;
+	int reserved_uars;
+	int uar_size;
+	int min_page_sz;
+	int max_sg;
+	int max_desc_sz;
+	int max_qp_per_mcg;
+	int reserved_mgms;
+	int max_mcgs;
+	int reserved_pds;
+	int max_pds;
+	int reserved_rdds;
+	int max_rdds;
+	int eec_entry_sz;
+	int qpc_entry_sz;
+	int eeec_entry_sz;
+	int eqpc_entry_sz;
+	int eqc_entry_sz;
+	int cqc_entry_sz;
+	int srq_entry_sz;
+	int uar_scratch_entry_sz;
+	int mpt_entry_sz;
+	union {
+		struct {
+			int max_avs;
+		} tavor;
+		struct {
+			int resize_srq;
+			int max_pbl_sz;
+			u8  bmme_flags;
+			u32 reserved_lkey;
+			int lam_required;
+			u64 max_icm_sz;
+		} arbel;
+	} hca;
+};
+
+struct mthca_adapter {
+	u32  vendor_id;
+	u32  device_id;
+	u32  revision_id;
+	char board_id[MTHCA_BOARD_ID_LEN];
+	u8   inta_pin;
+};
+
+struct mthca_init_hca_param {
+	u64 qpc_base;
+	u64 eec_base;
+	u64 srqc_base;
+	u64 cqc_base;
+	u64 eqpc_base;
+	u64 eeec_base;
+	u64 eqc_base;
+	u64 rdb_base;
+	u64 mc_base;
+	u64 mpt_base;
+	u64 mtt_base;
+	u64 uar_scratch_base;
+	u64 uarc_base;
+	u16 log_mc_entry_sz;
+	u16 mc_hash_sz;
+	u8  log_num_qps;
+	u8  log_num_eecs;
+	u8  log_num_srqs;
+	u8  log_num_cqs;
+	u8  log_num_eqs;
+	u8  log_mc_table_sz;
+	u8  mtt_seg_sz;
+	u8  log_mpt_sz;
+	u8  log_uar_sz;
+	u8  log_uarc_sz;
+};
+
+struct mthca_init_ib_param {
+	int port_width;
+	int vl_cap;
+	int mtu_cap;
+	u16 gid_cap;
+	u16 pkey_cap;
+	int set_guid0;
+	u64 guid0;
+	int set_node_guid;
+	u64 node_guid;
+	int set_si_guid;
+	u64 si_guid;
+};
+
+struct mthca_set_ib_param {
+	int set_si_guid;
+	int reset_qkey_viol;
+	u64 si_guid;
+	u32 cap_mask;
+};
+
+int mthca_cmd_init(struct mthca_dev *dev);
+void mthca_cmd_cleanup(struct mthca_dev *dev);
+int mthca_cmd_use_events(struct mthca_dev *dev);
+void mthca_cmd_use_polling(struct mthca_dev *dev);
+void mthca_cmd_event(struct mthca_dev *dev, u16 token,
+		     u8  status, u64 out_param);
+
+struct mthca_mailbox *mthca_alloc_mailbox(struct mthca_dev *dev,
+					  gfp_t gfp_mask);
+void mthca_free_mailbox(struct mthca_dev *dev, struct mthca_mailbox *mailbox);
+
+int mthca_SYS_EN(struct mthca_dev *dev);
+int mthca_SYS_DIS(struct mthca_dev *dev);
+int mthca_MAP_FA(struct mthca_dev *dev, struct mthca_icm *icm);
+int mthca_UNMAP_FA(struct mthca_dev *dev);
+int mthca_RUN_FW(struct mthca_dev *dev);
+int mthca_QUERY_FW(struct mthca_dev *dev);
+int mthca_ENABLE_LAM(struct mthca_dev *dev);
+int mthca_DISABLE_LAM(struct mthca_dev *dev);
+int mthca_QUERY_DDR(struct mthca_dev *dev);
+int mthca_QUERY_DEV_LIM(struct mthca_dev *dev,
+			struct mthca_dev_lim *dev_lim);
+int mthca_QUERY_ADAPTER(struct mthca_dev *dev,
+			struct mthca_adapter *adapter);
+int mthca_INIT_HCA(struct mthca_dev *dev,
+		   struct mthca_init_hca_param *param);
+int mthca_INIT_IB(struct mthca_dev *dev,
+		  struct mthca_init_ib_param *param,
+		  int port);
+int mthca_CLOSE_IB(struct mthca_dev *dev, int port);
+int mthca_CLOSE_HCA(struct mthca_dev *dev, int panic);
+int mthca_SET_IB(struct mthca_dev *dev, struct mthca_set_ib_param *param,
+		 int port);
+int mthca_MAP_ICM(struct mthca_dev *dev, struct mthca_icm *icm, u64 virt);
+int mthca_MAP_ICM_page(struct mthca_dev *dev, u64 dma_addr, u64 virt);
+int mthca_UNMAP_ICM(struct mthca_dev *dev, u64 virt, u32 page_count);
+int mthca_MAP_ICM_AUX(struct mthca_dev *dev, struct mthca_icm *icm);
+int mthca_UNMAP_ICM_AUX(struct mthca_dev *dev);
+int mthca_SET_ICM_SIZE(struct mthca_dev *dev, u64 icm_size, u64 *aux_pages);
+int mthca_SW2HW_MPT(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+		    int mpt_index);
+int mthca_HW2SW_MPT(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+		    int mpt_index);
+int mthca_WRITE_MTT(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+		    int num_mtt);
+int mthca_SYNC_TPT(struct mthca_dev *dev);
+int mthca_MAP_EQ(struct mthca_dev *dev, u64 event_mask, int unmap,
+		 int eq_num);
+int mthca_SW2HW_EQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+		   int eq_num);
+int mthca_HW2SW_EQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+		   int eq_num);
+int mthca_SW2HW_CQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+		   int cq_num);
+int mthca_HW2SW_CQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+		   int cq_num);
+int mthca_RESIZE_CQ(struct mthca_dev *dev, int cq_num, u32 lkey, u8 log_size);
+int mthca_SW2HW_SRQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+		    int srq_num);
+int mthca_HW2SW_SRQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+		    int srq_num);
+int mthca_QUERY_SRQ(struct mthca_dev *dev, u32 num,
+		    struct mthca_mailbox *mailbox);
+int mthca_ARM_SRQ(struct mthca_dev *dev, int srq_num, int limit);
+int mthca_MODIFY_QP(struct mthca_dev *dev, enum ib_qp_state cur,
+		    enum ib_qp_state next, u32 num, int is_ee,
+		    struct mthca_mailbox *mailbox, u32 optmask);
+int mthca_QUERY_QP(struct mthca_dev *dev, u32 num, int is_ee,
+		   struct mthca_mailbox *mailbox);
+int mthca_CONF_SPECIAL_QP(struct mthca_dev *dev, int type, u32 qpn);
+int mthca_MAD_IFC(struct mthca_dev *dev, int ignore_mkey, int ignore_bkey,
+		  int port, struct ib_wc *in_wc, struct ib_grh *in_grh,
+		  void *in_mad, void *response_mad);
+int mthca_READ_MGM(struct mthca_dev *dev, int index,
+		   struct mthca_mailbox *mailbox);
+int mthca_WRITE_MGM(struct mthca_dev *dev, int index,
+		    struct mthca_mailbox *mailbox);
+int mthca_MGID_HASH(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+		    u16 *hash);
+int mthca_NOP(struct mthca_dev *dev);
+
+#endif /* MTHCA_CMD_H */
diff --git a/drivers/infiniband/hw/mthca/mthca_config_reg.h b/drivers/infiniband/hw/mthca/mthca_config_reg.h
new file mode 100644
index 0000000..155bc66
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_config_reg.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MTHCA_CONFIG_REG_H
+#define MTHCA_CONFIG_REG_H
+
+#define MTHCA_HCR_BASE         0x80680
+#define MTHCA_HCR_SIZE         0x0001c
+#define MTHCA_ECR_BASE         0x80700
+#define MTHCA_ECR_SIZE         0x00008
+#define MTHCA_ECR_CLR_BASE     0x80708
+#define MTHCA_ECR_CLR_SIZE     0x00008
+#define MTHCA_MAP_ECR_SIZE     (MTHCA_ECR_SIZE + MTHCA_ECR_CLR_SIZE)
+#define MTHCA_CLR_INT_BASE     0xf00d8
+#define MTHCA_CLR_INT_SIZE     0x00008
+#define MTHCA_EQ_SET_CI_SIZE   (8 * 32)
+
+#endif /* MTHCA_CONFIG_REG_H */
diff --git a/drivers/infiniband/hw/mthca/mthca_cq.c b/drivers/infiniband/hw/mthca/mthca_cq.c
new file mode 100644
index 0000000..40ba833
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_cq.c
@@ -0,0 +1,984 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005, 2006 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/gfp.h>
+#include <linux/hardirq.h>
+#include <linux/sched.h>
+
+#include <asm/io.h>
+
+#include <rdma/ib_pack.h>
+
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+#include "mthca_memfree.h"
+
+enum {
+	MTHCA_MAX_DIRECT_CQ_SIZE = 4 * PAGE_SIZE
+};
+
+enum {
+	MTHCA_CQ_ENTRY_SIZE = 0x20
+};
+
+enum {
+	MTHCA_ATOMIC_BYTE_LEN = 8
+};
+
+/*
+ * Must be packed because start is 64 bits but only aligned to 32 bits.
+ */
+struct mthca_cq_context {
+	__be32 flags;
+	__be64 start;
+	__be32 logsize_usrpage;
+	__be32 error_eqn;	/* Tavor only */
+	__be32 comp_eqn;
+	__be32 pd;
+	__be32 lkey;
+	__be32 last_notified_index;
+	__be32 solicit_producer_index;
+	__be32 consumer_index;
+	__be32 producer_index;
+	__be32 cqn;
+	__be32 ci_db;		/* Arbel only */
+	__be32 state_db;	/* Arbel only */
+	u32    reserved;
+} __attribute__((packed));
+
+#define MTHCA_CQ_STATUS_OK          ( 0 << 28)
+#define MTHCA_CQ_STATUS_OVERFLOW    ( 9 << 28)
+#define MTHCA_CQ_STATUS_WRITE_FAIL  (10 << 28)
+#define MTHCA_CQ_FLAG_TR            ( 1 << 18)
+#define MTHCA_CQ_FLAG_OI            ( 1 << 17)
+#define MTHCA_CQ_STATE_DISARMED     ( 0 <<  8)
+#define MTHCA_CQ_STATE_ARMED        ( 1 <<  8)
+#define MTHCA_CQ_STATE_ARMED_SOL    ( 4 <<  8)
+#define MTHCA_EQ_STATE_FIRED        (10 <<  8)
+
+enum {
+	MTHCA_ERROR_CQE_OPCODE_MASK = 0xfe
+};
+
+enum {
+	SYNDROME_LOCAL_LENGTH_ERR 	 = 0x01,
+	SYNDROME_LOCAL_QP_OP_ERR  	 = 0x02,
+	SYNDROME_LOCAL_EEC_OP_ERR 	 = 0x03,
+	SYNDROME_LOCAL_PROT_ERR   	 = 0x04,
+	SYNDROME_WR_FLUSH_ERR     	 = 0x05,
+	SYNDROME_MW_BIND_ERR      	 = 0x06,
+	SYNDROME_BAD_RESP_ERR     	 = 0x10,
+	SYNDROME_LOCAL_ACCESS_ERR 	 = 0x11,
+	SYNDROME_REMOTE_INVAL_REQ_ERR 	 = 0x12,
+	SYNDROME_REMOTE_ACCESS_ERR 	 = 0x13,
+	SYNDROME_REMOTE_OP_ERR     	 = 0x14,
+	SYNDROME_RETRY_EXC_ERR 		 = 0x15,
+	SYNDROME_RNR_RETRY_EXC_ERR 	 = 0x16,
+	SYNDROME_LOCAL_RDD_VIOL_ERR 	 = 0x20,
+	SYNDROME_REMOTE_INVAL_RD_REQ_ERR = 0x21,
+	SYNDROME_REMOTE_ABORTED_ERR 	 = 0x22,
+	SYNDROME_INVAL_EECN_ERR 	 = 0x23,
+	SYNDROME_INVAL_EEC_STATE_ERR 	 = 0x24
+};
+
+struct mthca_cqe {
+	__be32 my_qpn;
+	__be32 my_ee;
+	__be32 rqpn;
+	u8     sl_ipok;
+	u8     g_mlpath;
+	__be16 rlid;
+	__be32 imm_etype_pkey_eec;
+	__be32 byte_cnt;
+	__be32 wqe;
+	u8     opcode;
+	u8     is_send;
+	u8     reserved;
+	u8     owner;
+};
+
+struct mthca_err_cqe {
+	__be32 my_qpn;
+	u32    reserved1[3];
+	u8     syndrome;
+	u8     vendor_err;
+	__be16 db_cnt;
+	u32    reserved2;
+	__be32 wqe;
+	u8     opcode;
+	u8     reserved3[2];
+	u8     owner;
+};
+
+#define MTHCA_CQ_ENTRY_OWNER_SW      (0 << 7)
+#define MTHCA_CQ_ENTRY_OWNER_HW      (1 << 7)
+
+#define MTHCA_TAVOR_CQ_DB_INC_CI       (1 << 24)
+#define MTHCA_TAVOR_CQ_DB_REQ_NOT      (2 << 24)
+#define MTHCA_TAVOR_CQ_DB_REQ_NOT_SOL  (3 << 24)
+#define MTHCA_TAVOR_CQ_DB_SET_CI       (4 << 24)
+#define MTHCA_TAVOR_CQ_DB_REQ_NOT_MULT (5 << 24)
+
+#define MTHCA_ARBEL_CQ_DB_REQ_NOT_SOL  (1 << 24)
+#define MTHCA_ARBEL_CQ_DB_REQ_NOT      (2 << 24)
+#define MTHCA_ARBEL_CQ_DB_REQ_NOT_MULT (3 << 24)
+
+static inline struct mthca_cqe *get_cqe_from_buf(struct mthca_cq_buf *buf,
+						 int entry)
+{
+	if (buf->is_direct)
+		return buf->queue.direct.buf + (entry * MTHCA_CQ_ENTRY_SIZE);
+	else
+		return buf->queue.page_list[entry * MTHCA_CQ_ENTRY_SIZE / PAGE_SIZE].buf
+			+ (entry * MTHCA_CQ_ENTRY_SIZE) % PAGE_SIZE;
+}
+
+static inline struct mthca_cqe *get_cqe(struct mthca_cq *cq, int entry)
+{
+	return get_cqe_from_buf(&cq->buf, entry);
+}
+
+static inline struct mthca_cqe *cqe_sw(struct mthca_cqe *cqe)
+{
+	return MTHCA_CQ_ENTRY_OWNER_HW & cqe->owner ? NULL : cqe;
+}
+
+static inline struct mthca_cqe *next_cqe_sw(struct mthca_cq *cq)
+{
+	return cqe_sw(get_cqe(cq, cq->cons_index & cq->ibcq.cqe));
+}
+
+static inline void set_cqe_hw(struct mthca_cqe *cqe)
+{
+	cqe->owner = MTHCA_CQ_ENTRY_OWNER_HW;
+}
+
+static void dump_cqe(struct mthca_dev *dev, void *cqe_ptr)
+{
+	__be32 *cqe = cqe_ptr;
+
+	(void) cqe;	/* avoid warning if mthca_dbg compiled away... */
+	mthca_dbg(dev, "CQE contents %08x %08x %08x %08x %08x %08x %08x %08x\n",
+		  be32_to_cpu(cqe[0]), be32_to_cpu(cqe[1]), be32_to_cpu(cqe[2]),
+		  be32_to_cpu(cqe[3]), be32_to_cpu(cqe[4]), be32_to_cpu(cqe[5]),
+		  be32_to_cpu(cqe[6]), be32_to_cpu(cqe[7]));
+}
+
+/*
+ * incr is ignored in native Arbel (mem-free) mode, so cq->cons_index
+ * should be correct before calling update_cons_index().
+ */
+static inline void update_cons_index(struct mthca_dev *dev, struct mthca_cq *cq,
+				     int incr)
+{
+	if (mthca_is_memfree(dev)) {
+		*cq->set_ci_db = cpu_to_be32(cq->cons_index);
+		wmb();
+	} else {
+		mthca_write64(MTHCA_TAVOR_CQ_DB_INC_CI | cq->cqn, incr - 1,
+			      dev->kar + MTHCA_CQ_DOORBELL,
+			      MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+		/*
+		 * Make sure doorbells don't leak out of CQ spinlock
+		 * and reach the HCA out of order:
+		 */
+		mmiowb();
+	}
+}
+
+void mthca_cq_completion(struct mthca_dev *dev, u32 cqn)
+{
+	struct mthca_cq *cq;
+
+	cq = mthca_array_get(&dev->cq_table.cq, cqn & (dev->limits.num_cqs - 1));
+
+	if (!cq) {
+		mthca_warn(dev, "Completion event for bogus CQ %08x\n", cqn);
+		return;
+	}
+
+	++cq->arm_sn;
+
+	cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context);
+}
+
+void mthca_cq_event(struct mthca_dev *dev, u32 cqn,
+		    enum ib_event_type event_type)
+{
+	struct mthca_cq *cq;
+	struct ib_event event;
+
+	spin_lock(&dev->cq_table.lock);
+
+	cq = mthca_array_get(&dev->cq_table.cq, cqn & (dev->limits.num_cqs - 1));
+	if (cq)
+		++cq->refcount;
+
+	spin_unlock(&dev->cq_table.lock);
+
+	if (!cq) {
+		mthca_warn(dev, "Async event for bogus CQ %08x\n", cqn);
+		return;
+	}
+
+	event.device      = &dev->ib_dev;
+	event.event       = event_type;
+	event.element.cq  = &cq->ibcq;
+	if (cq->ibcq.event_handler)
+		cq->ibcq.event_handler(&event, cq->ibcq.cq_context);
+
+	spin_lock(&dev->cq_table.lock);
+	if (!--cq->refcount)
+		wake_up(&cq->wait);
+	spin_unlock(&dev->cq_table.lock);
+}
+
+static inline int is_recv_cqe(struct mthca_cqe *cqe)
+{
+	if ((cqe->opcode & MTHCA_ERROR_CQE_OPCODE_MASK) ==
+	    MTHCA_ERROR_CQE_OPCODE_MASK)
+		return !(cqe->opcode & 0x01);
+	else
+		return !(cqe->is_send & 0x80);
+}
+
+void mthca_cq_clean(struct mthca_dev *dev, struct mthca_cq *cq, u32 qpn,
+		    struct mthca_srq *srq)
+{
+	struct mthca_cqe *cqe;
+	u32 prod_index;
+	int i, nfreed = 0;
+
+	spin_lock_irq(&cq->lock);
+
+	/*
+	 * First we need to find the current producer index, so we
+	 * know where to start cleaning from.  It doesn't matter if HW
+	 * adds new entries after this loop -- the QP we're worried
+	 * about is already in RESET, so the new entries won't come
+	 * from our QP and therefore don't need to be checked.
+	 */
+	for (prod_index = cq->cons_index;
+	     cqe_sw(get_cqe(cq, prod_index & cq->ibcq.cqe));
+	     ++prod_index)
+		if (prod_index == cq->cons_index + cq->ibcq.cqe)
+			break;
+
+	if (0)
+		mthca_dbg(dev, "Cleaning QPN %06x from CQN %06x; ci %d, pi %d\n",
+			  qpn, cq->cqn, cq->cons_index, prod_index);
+
+	/*
+	 * Now sweep backwards through the CQ, removing CQ entries
+	 * that match our QP by copying older entries on top of them.
+	 */
+	while ((int) --prod_index - (int) cq->cons_index >= 0) {
+		cqe = get_cqe(cq, prod_index & cq->ibcq.cqe);
+		if (cqe->my_qpn == cpu_to_be32(qpn)) {
+			if (srq && is_recv_cqe(cqe))
+				mthca_free_srq_wqe(srq, be32_to_cpu(cqe->wqe));
+			++nfreed;
+		} else if (nfreed)
+			memcpy(get_cqe(cq, (prod_index + nfreed) & cq->ibcq.cqe),
+			       cqe, MTHCA_CQ_ENTRY_SIZE);
+	}
+
+	if (nfreed) {
+		for (i = 0; i < nfreed; ++i)
+			set_cqe_hw(get_cqe(cq, (cq->cons_index + i) & cq->ibcq.cqe));
+		wmb();
+		cq->cons_index += nfreed;
+		update_cons_index(dev, cq, nfreed);
+	}
+
+	spin_unlock_irq(&cq->lock);
+}
+
+void mthca_cq_resize_copy_cqes(struct mthca_cq *cq)
+{
+	int i;
+
+	/*
+	 * In Tavor mode, the hardware keeps the consumer and producer
+	 * indices mod the CQ size.  Since we might be making the CQ
+	 * bigger, we need to deal with the case where the producer
+	 * index wrapped around before the CQ was resized.
+	 */
+	if (!mthca_is_memfree(to_mdev(cq->ibcq.device)) &&
+	    cq->ibcq.cqe < cq->resize_buf->cqe) {
+		cq->cons_index &= cq->ibcq.cqe;
+		if (cqe_sw(get_cqe(cq, cq->ibcq.cqe)))
+			cq->cons_index -= cq->ibcq.cqe + 1;
+	}
+
+	for (i = cq->cons_index; cqe_sw(get_cqe(cq, i & cq->ibcq.cqe)); ++i)
+		memcpy(get_cqe_from_buf(&cq->resize_buf->buf,
+					i & cq->resize_buf->cqe),
+		       get_cqe(cq, i & cq->ibcq.cqe), MTHCA_CQ_ENTRY_SIZE);
+}
+
+int mthca_alloc_cq_buf(struct mthca_dev *dev, struct mthca_cq_buf *buf, int nent)
+{
+	int ret;
+	int i;
+
+	ret = mthca_buf_alloc(dev, nent * MTHCA_CQ_ENTRY_SIZE,
+			      MTHCA_MAX_DIRECT_CQ_SIZE,
+			      &buf->queue, &buf->is_direct,
+			      &dev->driver_pd, 1, &buf->mr);
+	if (ret)
+		return ret;
+
+	for (i = 0; i < nent; ++i)
+		set_cqe_hw(get_cqe_from_buf(buf, i));
+
+	return 0;
+}
+
+void mthca_free_cq_buf(struct mthca_dev *dev, struct mthca_cq_buf *buf, int cqe)
+{
+	mthca_buf_free(dev, (cqe + 1) * MTHCA_CQ_ENTRY_SIZE, &buf->queue,
+		       buf->is_direct, &buf->mr);
+}
+
+static void handle_error_cqe(struct mthca_dev *dev, struct mthca_cq *cq,
+			     struct mthca_qp *qp, int wqe_index, int is_send,
+			     struct mthca_err_cqe *cqe,
+			     struct ib_wc *entry, int *free_cqe)
+{
+	int dbd;
+	__be32 new_wqe;
+
+	if (cqe->syndrome == SYNDROME_LOCAL_QP_OP_ERR) {
+		mthca_dbg(dev, "local QP operation err "
+			  "(QPN %06x, WQE @ %08x, CQN %06x, index %d)\n",
+			  be32_to_cpu(cqe->my_qpn), be32_to_cpu(cqe->wqe),
+			  cq->cqn, cq->cons_index);
+		dump_cqe(dev, cqe);
+	}
+
+	/*
+	 * For completions in error, only work request ID, status, vendor error
+	 * (and freed resource count for RD) have to be set.
+	 */
+	switch (cqe->syndrome) {
+	case SYNDROME_LOCAL_LENGTH_ERR:
+		entry->status = IB_WC_LOC_LEN_ERR;
+		break;
+	case SYNDROME_LOCAL_QP_OP_ERR:
+		entry->status = IB_WC_LOC_QP_OP_ERR;
+		break;
+	case SYNDROME_LOCAL_EEC_OP_ERR:
+		entry->status = IB_WC_LOC_EEC_OP_ERR;
+		break;
+	case SYNDROME_LOCAL_PROT_ERR:
+		entry->status = IB_WC_LOC_PROT_ERR;
+		break;
+	case SYNDROME_WR_FLUSH_ERR:
+		entry->status = IB_WC_WR_FLUSH_ERR;
+		break;
+	case SYNDROME_MW_BIND_ERR:
+		entry->status = IB_WC_MW_BIND_ERR;
+		break;
+	case SYNDROME_BAD_RESP_ERR:
+		entry->status = IB_WC_BAD_RESP_ERR;
+		break;
+	case SYNDROME_LOCAL_ACCESS_ERR:
+		entry->status = IB_WC_LOC_ACCESS_ERR;
+		break;
+	case SYNDROME_REMOTE_INVAL_REQ_ERR:
+		entry->status = IB_WC_REM_INV_REQ_ERR;
+		break;
+	case SYNDROME_REMOTE_ACCESS_ERR:
+		entry->status = IB_WC_REM_ACCESS_ERR;
+		break;
+	case SYNDROME_REMOTE_OP_ERR:
+		entry->status = IB_WC_REM_OP_ERR;
+		break;
+	case SYNDROME_RETRY_EXC_ERR:
+		entry->status = IB_WC_RETRY_EXC_ERR;
+		break;
+	case SYNDROME_RNR_RETRY_EXC_ERR:
+		entry->status = IB_WC_RNR_RETRY_EXC_ERR;
+		break;
+	case SYNDROME_LOCAL_RDD_VIOL_ERR:
+		entry->status = IB_WC_LOC_RDD_VIOL_ERR;
+		break;
+	case SYNDROME_REMOTE_INVAL_RD_REQ_ERR:
+		entry->status = IB_WC_REM_INV_RD_REQ_ERR;
+		break;
+	case SYNDROME_REMOTE_ABORTED_ERR:
+		entry->status = IB_WC_REM_ABORT_ERR;
+		break;
+	case SYNDROME_INVAL_EECN_ERR:
+		entry->status = IB_WC_INV_EECN_ERR;
+		break;
+	case SYNDROME_INVAL_EEC_STATE_ERR:
+		entry->status = IB_WC_INV_EEC_STATE_ERR;
+		break;
+	default:
+		entry->status = IB_WC_GENERAL_ERR;
+		break;
+	}
+
+	entry->vendor_err = cqe->vendor_err;
+
+	/*
+	 * Mem-free HCAs always generate one CQE per WQE, even in the
+	 * error case, so we don't have to check the doorbell count, etc.
+	 */
+	if (mthca_is_memfree(dev))
+		return;
+
+	mthca_free_err_wqe(dev, qp, is_send, wqe_index, &dbd, &new_wqe);
+
+	/*
+	 * If we're at the end of the WQE chain, or we've used up our
+	 * doorbell count, free the CQE.  Otherwise just update it for
+	 * the next poll operation.
+	 */
+	if (!(new_wqe & cpu_to_be32(0x3f)) || (!cqe->db_cnt && dbd))
+		return;
+
+	be16_add_cpu(&cqe->db_cnt, -dbd);
+	cqe->wqe      = new_wqe;
+	cqe->syndrome = SYNDROME_WR_FLUSH_ERR;
+
+	*free_cqe = 0;
+}
+
+static inline int mthca_poll_one(struct mthca_dev *dev,
+				 struct mthca_cq *cq,
+				 struct mthca_qp **cur_qp,
+				 int *freed,
+				 struct ib_wc *entry)
+{
+	struct mthca_wq *wq;
+	struct mthca_cqe *cqe;
+	int wqe_index;
+	int is_error;
+	int is_send;
+	int free_cqe = 1;
+	int err = 0;
+	u16 checksum;
+
+	cqe = next_cqe_sw(cq);
+	if (!cqe)
+		return -EAGAIN;
+
+	/*
+	 * Make sure we read CQ entry contents after we've checked the
+	 * ownership bit.
+	 */
+	rmb();
+
+	if (0) {
+		mthca_dbg(dev, "%x/%d: CQE -> QPN %06x, WQE @ %08x\n",
+			  cq->cqn, cq->cons_index, be32_to_cpu(cqe->my_qpn),
+			  be32_to_cpu(cqe->wqe));
+		dump_cqe(dev, cqe);
+	}
+
+	is_error = (cqe->opcode & MTHCA_ERROR_CQE_OPCODE_MASK) ==
+		MTHCA_ERROR_CQE_OPCODE_MASK;
+	is_send  = is_error ? cqe->opcode & 0x01 : cqe->is_send & 0x80;
+
+	if (!*cur_qp || be32_to_cpu(cqe->my_qpn) != (*cur_qp)->qpn) {
+		/*
+		 * We do not have to take the QP table lock here,
+		 * because CQs will be locked while QPs are removed
+		 * from the table.
+		 */
+		*cur_qp = mthca_array_get(&dev->qp_table.qp,
+					  be32_to_cpu(cqe->my_qpn) &
+					  (dev->limits.num_qps - 1));
+		if (!*cur_qp) {
+			mthca_warn(dev, "CQ entry for unknown QP %06x\n",
+				   be32_to_cpu(cqe->my_qpn) & 0xffffff);
+			err = -EINVAL;
+			goto out;
+		}
+	}
+
+	entry->qp = &(*cur_qp)->ibqp;
+
+	if (is_send) {
+		wq = &(*cur_qp)->sq;
+		wqe_index = ((be32_to_cpu(cqe->wqe) - (*cur_qp)->send_wqe_offset)
+			     >> wq->wqe_shift);
+		entry->wr_id = (*cur_qp)->wrid[wqe_index +
+					       (*cur_qp)->rq.max];
+	} else if ((*cur_qp)->ibqp.srq) {
+		struct mthca_srq *srq = to_msrq((*cur_qp)->ibqp.srq);
+		u32 wqe = be32_to_cpu(cqe->wqe);
+		wq = NULL;
+		wqe_index = wqe >> srq->wqe_shift;
+		entry->wr_id = srq->wrid[wqe_index];
+		mthca_free_srq_wqe(srq, wqe);
+	} else {
+		s32 wqe;
+		wq = &(*cur_qp)->rq;
+		wqe = be32_to_cpu(cqe->wqe);
+		wqe_index = wqe >> wq->wqe_shift;
+		/*
+		 * WQE addr == base - 1 might be reported in receive completion
+		 * with error instead of (rq size - 1) by Sinai FW 1.0.800 and
+		 * Arbel FW 5.1.400.  This bug should be fixed in later FW revs.
+		 */
+		if (unlikely(wqe_index < 0))
+			wqe_index = wq->max - 1;
+		entry->wr_id = (*cur_qp)->wrid[wqe_index];
+	}
+
+	if (wq) {
+		if (wq->last_comp < wqe_index)
+			wq->tail += wqe_index - wq->last_comp;
+		else
+			wq->tail += wqe_index + wq->max - wq->last_comp;
+
+		wq->last_comp = wqe_index;
+	}
+
+	if (is_error) {
+		handle_error_cqe(dev, cq, *cur_qp, wqe_index, is_send,
+				 (struct mthca_err_cqe *) cqe,
+				 entry, &free_cqe);
+		goto out;
+	}
+
+	if (is_send) {
+		entry->wc_flags = 0;
+		switch (cqe->opcode) {
+		case MTHCA_OPCODE_RDMA_WRITE:
+			entry->opcode    = IB_WC_RDMA_WRITE;
+			break;
+		case MTHCA_OPCODE_RDMA_WRITE_IMM:
+			entry->opcode    = IB_WC_RDMA_WRITE;
+			entry->wc_flags |= IB_WC_WITH_IMM;
+			break;
+		case MTHCA_OPCODE_SEND:
+			entry->opcode    = IB_WC_SEND;
+			break;
+		case MTHCA_OPCODE_SEND_IMM:
+			entry->opcode    = IB_WC_SEND;
+			entry->wc_flags |= IB_WC_WITH_IMM;
+			break;
+		case MTHCA_OPCODE_RDMA_READ:
+			entry->opcode    = IB_WC_RDMA_READ;
+			entry->byte_len  = be32_to_cpu(cqe->byte_cnt);
+			break;
+		case MTHCA_OPCODE_ATOMIC_CS:
+			entry->opcode    = IB_WC_COMP_SWAP;
+			entry->byte_len  = MTHCA_ATOMIC_BYTE_LEN;
+			break;
+		case MTHCA_OPCODE_ATOMIC_FA:
+			entry->opcode    = IB_WC_FETCH_ADD;
+			entry->byte_len  = MTHCA_ATOMIC_BYTE_LEN;
+			break;
+		case MTHCA_OPCODE_BIND_MW:
+			entry->opcode    = IB_WC_BIND_MW;
+			break;
+		default:
+			entry->opcode    = MTHCA_OPCODE_INVALID;
+			break;
+		}
+	} else {
+		entry->byte_len = be32_to_cpu(cqe->byte_cnt);
+		switch (cqe->opcode & 0x1f) {
+		case IB_OPCODE_SEND_LAST_WITH_IMMEDIATE:
+		case IB_OPCODE_SEND_ONLY_WITH_IMMEDIATE:
+			entry->wc_flags = IB_WC_WITH_IMM;
+			entry->ex.imm_data = cqe->imm_etype_pkey_eec;
+			entry->opcode = IB_WC_RECV;
+			break;
+		case IB_OPCODE_RDMA_WRITE_LAST_WITH_IMMEDIATE:
+		case IB_OPCODE_RDMA_WRITE_ONLY_WITH_IMMEDIATE:
+			entry->wc_flags = IB_WC_WITH_IMM;
+			entry->ex.imm_data = cqe->imm_etype_pkey_eec;
+			entry->opcode = IB_WC_RECV_RDMA_WITH_IMM;
+			break;
+		default:
+			entry->wc_flags = 0;
+			entry->opcode = IB_WC_RECV;
+			break;
+		}
+		entry->slid 	   = be16_to_cpu(cqe->rlid);
+		entry->sl   	   = cqe->sl_ipok >> 4;
+		entry->src_qp 	   = be32_to_cpu(cqe->rqpn) & 0xffffff;
+		entry->dlid_path_bits = cqe->g_mlpath & 0x7f;
+		entry->pkey_index  = be32_to_cpu(cqe->imm_etype_pkey_eec) >> 16;
+		entry->wc_flags   |= cqe->g_mlpath & 0x80 ? IB_WC_GRH : 0;
+		checksum = (be32_to_cpu(cqe->rqpn) >> 24) |
+				((be32_to_cpu(cqe->my_ee) >> 16) & 0xff00);
+		entry->wc_flags	  |=  (cqe->sl_ipok & 1 && checksum == 0xffff) ?
+							IB_WC_IP_CSUM_OK : 0;
+	}
+
+	entry->status = IB_WC_SUCCESS;
+
+ out:
+	if (likely(free_cqe)) {
+		set_cqe_hw(cqe);
+		++(*freed);
+		++cq->cons_index;
+	}
+
+	return err;
+}
+
+int mthca_poll_cq(struct ib_cq *ibcq, int num_entries,
+		  struct ib_wc *entry)
+{
+	struct mthca_dev *dev = to_mdev(ibcq->device);
+	struct mthca_cq *cq = to_mcq(ibcq);
+	struct mthca_qp *qp = NULL;
+	unsigned long flags;
+	int err = 0;
+	int freed = 0;
+	int npolled;
+
+	spin_lock_irqsave(&cq->lock, flags);
+
+	npolled = 0;
+repoll:
+	while (npolled < num_entries) {
+		err = mthca_poll_one(dev, cq, &qp,
+				     &freed, entry + npolled);
+		if (err)
+			break;
+		++npolled;
+	}
+
+	if (freed) {
+		wmb();
+		update_cons_index(dev, cq, freed);
+	}
+
+	/*
+	 * If a CQ resize is in progress and we discovered that the
+	 * old buffer is empty, then peek in the new buffer, and if
+	 * it's not empty, switch to the new buffer and continue
+	 * polling there.
+	 */
+	if (unlikely(err == -EAGAIN && cq->resize_buf &&
+		     cq->resize_buf->state == CQ_RESIZE_READY)) {
+		/*
+		 * In Tavor mode, the hardware keeps the producer
+		 * index modulo the CQ size.  Since we might be making
+		 * the CQ bigger, we need to mask our consumer index
+		 * using the size of the old CQ buffer before looking
+		 * in the new CQ buffer.
+		 */
+		if (!mthca_is_memfree(dev))
+			cq->cons_index &= cq->ibcq.cqe;
+
+		if (cqe_sw(get_cqe_from_buf(&cq->resize_buf->buf,
+					    cq->cons_index & cq->resize_buf->cqe))) {
+			struct mthca_cq_buf tbuf;
+			int tcqe;
+
+			tbuf         = cq->buf;
+			tcqe         = cq->ibcq.cqe;
+			cq->buf      = cq->resize_buf->buf;
+			cq->ibcq.cqe = cq->resize_buf->cqe;
+
+			cq->resize_buf->buf   = tbuf;
+			cq->resize_buf->cqe   = tcqe;
+			cq->resize_buf->state = CQ_RESIZE_SWAPPED;
+
+			goto repoll;
+		}
+	}
+
+	spin_unlock_irqrestore(&cq->lock, flags);
+
+	return err == 0 || err == -EAGAIN ? npolled : err;
+}
+
+int mthca_tavor_arm_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags)
+{
+	u32 dbhi = ((flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED ?
+		    MTHCA_TAVOR_CQ_DB_REQ_NOT_SOL :
+		    MTHCA_TAVOR_CQ_DB_REQ_NOT) |
+		to_mcq(cq)->cqn;
+
+	mthca_write64(dbhi, 0xffffffff, to_mdev(cq->device)->kar + MTHCA_CQ_DOORBELL,
+		      MTHCA_GET_DOORBELL_LOCK(&to_mdev(cq->device)->doorbell_lock));
+
+	return 0;
+}
+
+int mthca_arbel_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags)
+{
+	struct mthca_cq *cq = to_mcq(ibcq);
+	__be32 db_rec[2];
+	u32 dbhi;
+	u32 sn = cq->arm_sn & 3;
+
+	db_rec[0] = cpu_to_be32(cq->cons_index);
+	db_rec[1] = cpu_to_be32((cq->cqn << 8) | (2 << 5) | (sn << 3) |
+				((flags & IB_CQ_SOLICITED_MASK) ==
+				 IB_CQ_SOLICITED ? 1 : 2));
+
+	mthca_write_db_rec(db_rec, cq->arm_db);
+
+	/*
+	 * Make sure that the doorbell record in host memory is
+	 * written before ringing the doorbell via PCI MMIO.
+	 */
+	wmb();
+
+	dbhi = (sn << 28) |
+		((flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED ?
+		 MTHCA_ARBEL_CQ_DB_REQ_NOT_SOL :
+		 MTHCA_ARBEL_CQ_DB_REQ_NOT) | cq->cqn;
+
+	mthca_write64(dbhi, cq->cons_index,
+		      to_mdev(ibcq->device)->kar + MTHCA_CQ_DOORBELL,
+		      MTHCA_GET_DOORBELL_LOCK(&to_mdev(ibcq->device)->doorbell_lock));
+
+	return 0;
+}
+
+int mthca_init_cq(struct mthca_dev *dev, int nent,
+		  struct mthca_ucontext *ctx, u32 pdn,
+		  struct mthca_cq *cq)
+{
+	struct mthca_mailbox *mailbox;
+	struct mthca_cq_context *cq_context;
+	int err = -ENOMEM;
+
+	cq->ibcq.cqe  = nent - 1;
+	cq->is_kernel = !ctx;
+
+	cq->cqn = mthca_alloc(&dev->cq_table.alloc);
+	if (cq->cqn == -1)
+		return -ENOMEM;
+
+	if (mthca_is_memfree(dev)) {
+		err = mthca_table_get(dev, dev->cq_table.table, cq->cqn);
+		if (err)
+			goto err_out;
+
+		if (cq->is_kernel) {
+			cq->arm_sn = 1;
+
+			err = -ENOMEM;
+
+			cq->set_ci_db_index = mthca_alloc_db(dev, MTHCA_DB_TYPE_CQ_SET_CI,
+							     cq->cqn, &cq->set_ci_db);
+			if (cq->set_ci_db_index < 0)
+				goto err_out_icm;
+
+			cq->arm_db_index = mthca_alloc_db(dev, MTHCA_DB_TYPE_CQ_ARM,
+							  cq->cqn, &cq->arm_db);
+			if (cq->arm_db_index < 0)
+				goto err_out_ci;
+		}
+	}
+
+	mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+	if (IS_ERR(mailbox))
+		goto err_out_arm;
+
+	cq_context = mailbox->buf;
+
+	if (cq->is_kernel) {
+		err = mthca_alloc_cq_buf(dev, &cq->buf, nent);
+		if (err)
+			goto err_out_mailbox;
+	}
+
+	spin_lock_init(&cq->lock);
+	cq->refcount = 1;
+	init_waitqueue_head(&cq->wait);
+	mutex_init(&cq->mutex);
+
+	memset(cq_context, 0, sizeof *cq_context);
+	cq_context->flags           = cpu_to_be32(MTHCA_CQ_STATUS_OK      |
+						  MTHCA_CQ_STATE_DISARMED |
+						  MTHCA_CQ_FLAG_TR);
+	cq_context->logsize_usrpage = cpu_to_be32((ffs(nent) - 1) << 24);
+	if (ctx)
+		cq_context->logsize_usrpage |= cpu_to_be32(ctx->uar.index);
+	else
+		cq_context->logsize_usrpage |= cpu_to_be32(dev->driver_uar.index);
+	cq_context->error_eqn       = cpu_to_be32(dev->eq_table.eq[MTHCA_EQ_ASYNC].eqn);
+	cq_context->comp_eqn        = cpu_to_be32(dev->eq_table.eq[MTHCA_EQ_COMP].eqn);
+	cq_context->pd              = cpu_to_be32(pdn);
+	cq_context->lkey            = cpu_to_be32(cq->buf.mr.ibmr.lkey);
+	cq_context->cqn             = cpu_to_be32(cq->cqn);
+
+	if (mthca_is_memfree(dev)) {
+		cq_context->ci_db    = cpu_to_be32(cq->set_ci_db_index);
+		cq_context->state_db = cpu_to_be32(cq->arm_db_index);
+	}
+
+	err = mthca_SW2HW_CQ(dev, mailbox, cq->cqn);
+	if (err) {
+		mthca_warn(dev, "SW2HW_CQ failed (%d)\n", err);
+		goto err_out_free_mr;
+	}
+
+	spin_lock_irq(&dev->cq_table.lock);
+	if (mthca_array_set(&dev->cq_table.cq,
+			    cq->cqn & (dev->limits.num_cqs - 1),
+			    cq)) {
+		spin_unlock_irq(&dev->cq_table.lock);
+		goto err_out_free_mr;
+	}
+	spin_unlock_irq(&dev->cq_table.lock);
+
+	cq->cons_index = 0;
+
+	mthca_free_mailbox(dev, mailbox);
+
+	return 0;
+
+err_out_free_mr:
+	if (cq->is_kernel)
+		mthca_free_cq_buf(dev, &cq->buf, cq->ibcq.cqe);
+
+err_out_mailbox:
+	mthca_free_mailbox(dev, mailbox);
+
+err_out_arm:
+	if (cq->is_kernel && mthca_is_memfree(dev))
+		mthca_free_db(dev, MTHCA_DB_TYPE_CQ_ARM, cq->arm_db_index);
+
+err_out_ci:
+	if (cq->is_kernel && mthca_is_memfree(dev))
+		mthca_free_db(dev, MTHCA_DB_TYPE_CQ_SET_CI, cq->set_ci_db_index);
+
+err_out_icm:
+	mthca_table_put(dev, dev->cq_table.table, cq->cqn);
+
+err_out:
+	mthca_free(&dev->cq_table.alloc, cq->cqn);
+
+	return err;
+}
+
+static inline int get_cq_refcount(struct mthca_dev *dev, struct mthca_cq *cq)
+{
+	int c;
+
+	spin_lock_irq(&dev->cq_table.lock);
+	c = cq->refcount;
+	spin_unlock_irq(&dev->cq_table.lock);
+
+	return c;
+}
+
+void mthca_free_cq(struct mthca_dev *dev,
+		   struct mthca_cq *cq)
+{
+	struct mthca_mailbox *mailbox;
+	int err;
+
+	mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+	if (IS_ERR(mailbox)) {
+		mthca_warn(dev, "No memory for mailbox to free CQ.\n");
+		return;
+	}
+
+	err = mthca_HW2SW_CQ(dev, mailbox, cq->cqn);
+	if (err)
+		mthca_warn(dev, "HW2SW_CQ failed (%d)\n", err);
+
+	if (0) {
+		__be32 *ctx = mailbox->buf;
+		int j;
+
+		printk(KERN_ERR "context for CQN %x (cons index %x, next sw %d)\n",
+		       cq->cqn, cq->cons_index,
+		       cq->is_kernel ? !!next_cqe_sw(cq) : 0);
+		for (j = 0; j < 16; ++j)
+			printk(KERN_ERR "[%2x] %08x\n", j * 4, be32_to_cpu(ctx[j]));
+	}
+
+	spin_lock_irq(&dev->cq_table.lock);
+	mthca_array_clear(&dev->cq_table.cq,
+			  cq->cqn & (dev->limits.num_cqs - 1));
+	--cq->refcount;
+	spin_unlock_irq(&dev->cq_table.lock);
+
+	if (dev->mthca_flags & MTHCA_FLAG_MSI_X)
+		synchronize_irq(dev->eq_table.eq[MTHCA_EQ_COMP].msi_x_vector);
+	else
+		synchronize_irq(dev->pdev->irq);
+
+	wait_event(cq->wait, !get_cq_refcount(dev, cq));
+
+	if (cq->is_kernel) {
+		mthca_free_cq_buf(dev, &cq->buf, cq->ibcq.cqe);
+		if (mthca_is_memfree(dev)) {
+			mthca_free_db(dev, MTHCA_DB_TYPE_CQ_ARM,    cq->arm_db_index);
+			mthca_free_db(dev, MTHCA_DB_TYPE_CQ_SET_CI, cq->set_ci_db_index);
+		}
+	}
+
+	mthca_table_put(dev, dev->cq_table.table, cq->cqn);
+	mthca_free(&dev->cq_table.alloc, cq->cqn);
+	mthca_free_mailbox(dev, mailbox);
+}
+
+int mthca_init_cq_table(struct mthca_dev *dev)
+{
+	int err;
+
+	spin_lock_init(&dev->cq_table.lock);
+
+	err = mthca_alloc_init(&dev->cq_table.alloc,
+			       dev->limits.num_cqs,
+			       (1 << 24) - 1,
+			       dev->limits.reserved_cqs);
+	if (err)
+		return err;
+
+	err = mthca_array_init(&dev->cq_table.cq,
+			       dev->limits.num_cqs);
+	if (err)
+		mthca_alloc_cleanup(&dev->cq_table.alloc);
+
+	return err;
+}
+
+void mthca_cleanup_cq_table(struct mthca_dev *dev)
+{
+	mthca_array_cleanup(&dev->cq_table.cq, dev->limits.num_cqs);
+	mthca_alloc_cleanup(&dev->cq_table.alloc);
+}
diff --git a/drivers/infiniband/hw/mthca/mthca_dev.h b/drivers/infiniband/hw/mthca/mthca_dev.h
new file mode 100644
index 0000000..7e6a6d6
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_dev.h
@@ -0,0 +1,596 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005, 2006 Cisco Systems.  All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MTHCA_DEV_H
+#define MTHCA_DEV_H
+
+#include <linux/spinlock.h>
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <linux/timer.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/semaphore.h>
+
+#include "mthca_provider.h"
+#include "mthca_doorbell.h"
+
+#define DRV_NAME	"ib_mthca"
+#define PFX		DRV_NAME ": "
+#define DRV_VERSION	"1.0"
+#define DRV_RELDATE	"April 4, 2008"
+
+enum {
+	MTHCA_FLAG_DDR_HIDDEN = 1 << 1,
+	MTHCA_FLAG_SRQ        = 1 << 2,
+	MTHCA_FLAG_MSI_X      = 1 << 3,
+	MTHCA_FLAG_NO_LAM     = 1 << 4,
+	MTHCA_FLAG_FMR        = 1 << 5,
+	MTHCA_FLAG_MEMFREE    = 1 << 6,
+	MTHCA_FLAG_PCIE       = 1 << 7,
+	MTHCA_FLAG_SINAI_OPT  = 1 << 8
+};
+
+enum {
+	MTHCA_MAX_PORTS = 2
+};
+
+enum {
+	MTHCA_BOARD_ID_LEN = 64
+};
+
+enum {
+	MTHCA_EQ_CONTEXT_SIZE =  0x40,
+	MTHCA_CQ_CONTEXT_SIZE =  0x40,
+	MTHCA_QP_CONTEXT_SIZE = 0x200,
+	MTHCA_RDB_ENTRY_SIZE  =  0x20,
+	MTHCA_AV_SIZE         =  0x20,
+	MTHCA_MGM_ENTRY_SIZE  = 0x100,
+
+	/* Arbel FW gives us these, but we need them for Tavor */
+	MTHCA_MPT_ENTRY_SIZE  =  0x40,
+	MTHCA_MTT_SEG_SIZE    =  0x40,
+
+	MTHCA_QP_PER_MGM      = 4 * (MTHCA_MGM_ENTRY_SIZE / 16 - 2)
+};
+
+enum {
+	MTHCA_EQ_CMD,
+	MTHCA_EQ_ASYNC,
+	MTHCA_EQ_COMP,
+	MTHCA_NUM_EQ
+};
+
+enum {
+	MTHCA_OPCODE_NOP            = 0x00,
+	MTHCA_OPCODE_RDMA_WRITE     = 0x08,
+	MTHCA_OPCODE_RDMA_WRITE_IMM = 0x09,
+	MTHCA_OPCODE_SEND           = 0x0a,
+	MTHCA_OPCODE_SEND_IMM       = 0x0b,
+	MTHCA_OPCODE_RDMA_READ      = 0x10,
+	MTHCA_OPCODE_ATOMIC_CS      = 0x11,
+	MTHCA_OPCODE_ATOMIC_FA      = 0x12,
+	MTHCA_OPCODE_BIND_MW        = 0x18,
+	MTHCA_OPCODE_INVALID        = 0xff
+};
+
+enum {
+	MTHCA_CMD_USE_EVENTS         = 1 << 0,
+	MTHCA_CMD_POST_DOORBELLS     = 1 << 1
+};
+
+enum {
+	MTHCA_CMD_NUM_DBELL_DWORDS = 8
+};
+
+struct mthca_cmd {
+	struct pci_pool          *pool;
+	struct mutex              hcr_mutex;
+	struct semaphore 	  poll_sem;
+	struct semaphore 	  event_sem;
+	int              	  max_cmds;
+	spinlock_t                context_lock;
+	int                       free_head;
+	struct mthca_cmd_context *context;
+	u16                       token_mask;
+	u32                       flags;
+	void __iomem             *dbell_map;
+	u16                       dbell_offsets[MTHCA_CMD_NUM_DBELL_DWORDS];
+};
+
+struct mthca_limits {
+	int      num_ports;
+	int      vl_cap;
+	int      mtu_cap;
+	int      gid_table_len;
+	int      pkey_table_len;
+	int      local_ca_ack_delay;
+	int      num_uars;
+	int      max_sg;
+	int      num_qps;
+	int      max_wqes;
+	int	 max_desc_sz;
+	int	 max_qp_init_rdma;
+	int      reserved_qps;
+	int      num_srqs;
+	int      max_srq_wqes;
+	int      max_srq_sge;
+	int      reserved_srqs;
+	int      num_eecs;
+	int      reserved_eecs;
+	int      num_cqs;
+	int      max_cqes;
+	int      reserved_cqs;
+	int      num_eqs;
+	int      reserved_eqs;
+	int      num_mpts;
+	int      num_mtt_segs;
+	int	 mtt_seg_size;
+	int      fmr_reserved_mtts;
+	int      reserved_mtts;
+	int      reserved_mrws;
+	int      reserved_uars;
+	int      num_mgms;
+	int      num_amgms;
+	int      reserved_mcgs;
+	int      num_pds;
+	int      reserved_pds;
+	u32      page_size_cap;
+	u32      flags;
+	u16      stat_rate_support;
+	u8       port_width_cap;
+};
+
+struct mthca_alloc {
+	u32            last;
+	u32            top;
+	u32            max;
+	u32            mask;
+	spinlock_t     lock;
+	unsigned long *table;
+};
+
+struct mthca_array {
+	struct {
+		void    **page;
+		int       used;
+	} *page_list;
+};
+
+struct mthca_uar_table {
+	struct mthca_alloc alloc;
+	u64                uarc_base;
+	int                uarc_size;
+};
+
+struct mthca_pd_table {
+	struct mthca_alloc alloc;
+};
+
+struct mthca_buddy {
+	unsigned long **bits;
+	int	       *num_free;
+	int             max_order;
+	spinlock_t      lock;
+};
+
+struct mthca_mr_table {
+	struct mthca_alloc      mpt_alloc;
+	struct mthca_buddy      mtt_buddy;
+	struct mthca_buddy     *fmr_mtt_buddy;
+	u64                     mtt_base;
+	u64                     mpt_base;
+	struct mthca_icm_table *mtt_table;
+	struct mthca_icm_table *mpt_table;
+	struct {
+		void __iomem   *mpt_base;
+		void __iomem   *mtt_base;
+		struct mthca_buddy mtt_buddy;
+	} tavor_fmr;
+};
+
+struct mthca_eq_table {
+	struct mthca_alloc alloc;
+	void __iomem      *clr_int;
+	u32                clr_mask;
+	u32                arm_mask;
+	struct mthca_eq    eq[MTHCA_NUM_EQ];
+	u64                icm_virt;
+	struct page       *icm_page;
+	dma_addr_t         icm_dma;
+	int                have_irq;
+	u8                 inta_pin;
+};
+
+struct mthca_cq_table {
+	struct mthca_alloc 	alloc;
+	spinlock_t         	lock;
+	struct mthca_array      cq;
+	struct mthca_icm_table *table;
+};
+
+struct mthca_srq_table {
+	struct mthca_alloc 	alloc;
+	spinlock_t         	lock;
+	struct mthca_array      srq;
+	struct mthca_icm_table *table;
+};
+
+struct mthca_qp_table {
+	struct mthca_alloc     	alloc;
+	u32                    	rdb_base;
+	int                    	rdb_shift;
+	int                    	sqp_start;
+	spinlock_t             	lock;
+	struct mthca_array     	qp;
+	struct mthca_icm_table *qp_table;
+	struct mthca_icm_table *eqp_table;
+	struct mthca_icm_table *rdb_table;
+};
+
+struct mthca_av_table {
+	struct pci_pool   *pool;
+	int                num_ddr_avs;
+	u64                ddr_av_base;
+	void __iomem      *av_map;
+	struct mthca_alloc alloc;
+};
+
+struct mthca_mcg_table {
+	struct mutex		mutex;
+	struct mthca_alloc 	alloc;
+	struct mthca_icm_table *table;
+};
+
+struct mthca_catas_err {
+	u64			addr;
+	u32 __iomem	       *map;
+	u32			size;
+	struct timer_list	timer;
+	struct list_head	list;
+};
+
+extern struct mutex mthca_device_mutex;
+
+struct mthca_dev {
+	struct ib_device  ib_dev;
+	struct pci_dev   *pdev;
+
+	int          	 hca_type;
+	unsigned long	 mthca_flags;
+	unsigned long    device_cap_flags;
+
+	u32              rev_id;
+	char             board_id[MTHCA_BOARD_ID_LEN];
+
+	/* firmware info */
+	u64              fw_ver;
+	union {
+		struct {
+			u64 fw_start;
+			u64 fw_end;
+		}        tavor;
+		struct {
+			u64 clr_int_base;
+			u64 eq_arm_base;
+			u64 eq_set_ci_base;
+			struct mthca_icm *fw_icm;
+			struct mthca_icm *aux_icm;
+			u16 fw_pages;
+		}        arbel;
+	}                fw;
+
+	u64              ddr_start;
+	u64              ddr_end;
+
+	MTHCA_DECLARE_DOORBELL_LOCK(doorbell_lock)
+	struct mutex cap_mask_mutex;
+
+	void __iomem    *hcr;
+	void __iomem    *kar;
+	void __iomem    *clr_base;
+	union {
+		struct {
+			void __iomem *ecr_base;
+		} tavor;
+		struct {
+			void __iomem *eq_arm;
+			void __iomem *eq_set_ci_base;
+		} arbel;
+	} eq_regs;
+
+	struct mthca_cmd    cmd;
+	struct mthca_limits limits;
+
+	struct mthca_uar_table uar_table;
+	struct mthca_pd_table  pd_table;
+	struct mthca_mr_table  mr_table;
+	struct mthca_eq_table  eq_table;
+	struct mthca_cq_table  cq_table;
+	struct mthca_srq_table srq_table;
+	struct mthca_qp_table  qp_table;
+	struct mthca_av_table  av_table;
+	struct mthca_mcg_table mcg_table;
+
+	struct mthca_catas_err catas_err;
+
+	struct mthca_uar       driver_uar;
+	struct mthca_db_table *db_tab;
+	struct mthca_pd        driver_pd;
+	struct mthca_mr        driver_mr;
+
+	struct ib_mad_agent  *send_agent[MTHCA_MAX_PORTS][2];
+	struct ib_ah         *sm_ah[MTHCA_MAX_PORTS];
+	spinlock_t            sm_lock;
+	u8                    rate[MTHCA_MAX_PORTS];
+	bool		      active;
+};
+
+#ifdef CONFIG_INFINIBAND_MTHCA_DEBUG
+extern int mthca_debug_level;
+
+#define mthca_dbg(mdev, format, arg...)					\
+	do {								\
+		if (mthca_debug_level)					\
+			dev_printk(KERN_DEBUG, &mdev->pdev->dev, format, ## arg); \
+	} while (0)
+
+#else /* CONFIG_INFINIBAND_MTHCA_DEBUG */
+
+#define mthca_dbg(mdev, format, arg...) do { (void) mdev; } while (0)
+
+#endif /* CONFIG_INFINIBAND_MTHCA_DEBUG */
+
+#define mthca_err(mdev, format, arg...) \
+	dev_err(&mdev->pdev->dev, format, ## arg)
+#define mthca_info(mdev, format, arg...) \
+	dev_info(&mdev->pdev->dev, format, ## arg)
+#define mthca_warn(mdev, format, arg...) \
+	dev_warn(&mdev->pdev->dev, format, ## arg)
+
+extern void __buggy_use_of_MTHCA_GET(void);
+extern void __buggy_use_of_MTHCA_PUT(void);
+
+#define MTHCA_GET(dest, source, offset)                               \
+	do {                                                          \
+		void *__p = (char *) (source) + (offset);             \
+		switch (sizeof (dest)) {                              \
+		case 1: (dest) = *(u8 *) __p;       break;	      \
+		case 2: (dest) = be16_to_cpup(__p); break;	      \
+		case 4: (dest) = be32_to_cpup(__p); break;	      \
+		case 8: (dest) = be64_to_cpup(__p); break;	      \
+		default: __buggy_use_of_MTHCA_GET();		      \
+		}                                                     \
+	} while (0)
+
+#define MTHCA_PUT(dest, source, offset)                               \
+	do {                                                          \
+		void *__d = ((char *) (dest) + (offset));	      \
+		switch (sizeof(source)) {                             \
+		case 1: *(u8 *) __d = (source);                break; \
+		case 2:	*(__be16 *) __d = cpu_to_be16(source); break; \
+		case 4:	*(__be32 *) __d = cpu_to_be32(source); break; \
+		case 8:	*(__be64 *) __d = cpu_to_be64(source); break; \
+		default: __buggy_use_of_MTHCA_PUT();		      \
+		}                                                     \
+	} while (0)
+
+int mthca_reset(struct mthca_dev *mdev);
+
+u32 mthca_alloc(struct mthca_alloc *alloc);
+void mthca_free(struct mthca_alloc *alloc, u32 obj);
+int mthca_alloc_init(struct mthca_alloc *alloc, u32 num, u32 mask,
+		     u32 reserved);
+void mthca_alloc_cleanup(struct mthca_alloc *alloc);
+void *mthca_array_get(struct mthca_array *array, int index);
+int mthca_array_set(struct mthca_array *array, int index, void *value);
+void mthca_array_clear(struct mthca_array *array, int index);
+int mthca_array_init(struct mthca_array *array, int nent);
+void mthca_array_cleanup(struct mthca_array *array, int nent);
+int mthca_buf_alloc(struct mthca_dev *dev, int size, int max_direct,
+		    union mthca_buf *buf, int *is_direct, struct mthca_pd *pd,
+		    int hca_write, struct mthca_mr *mr);
+void mthca_buf_free(struct mthca_dev *dev, int size, union mthca_buf *buf,
+		    int is_direct, struct mthca_mr *mr);
+
+int mthca_init_uar_table(struct mthca_dev *dev);
+int mthca_init_pd_table(struct mthca_dev *dev);
+int mthca_init_mr_table(struct mthca_dev *dev);
+int mthca_init_eq_table(struct mthca_dev *dev);
+int mthca_init_cq_table(struct mthca_dev *dev);
+int mthca_init_srq_table(struct mthca_dev *dev);
+int mthca_init_qp_table(struct mthca_dev *dev);
+int mthca_init_av_table(struct mthca_dev *dev);
+int mthca_init_mcg_table(struct mthca_dev *dev);
+
+void mthca_cleanup_uar_table(struct mthca_dev *dev);
+void mthca_cleanup_pd_table(struct mthca_dev *dev);
+void mthca_cleanup_mr_table(struct mthca_dev *dev);
+void mthca_cleanup_eq_table(struct mthca_dev *dev);
+void mthca_cleanup_cq_table(struct mthca_dev *dev);
+void mthca_cleanup_srq_table(struct mthca_dev *dev);
+void mthca_cleanup_qp_table(struct mthca_dev *dev);
+void mthca_cleanup_av_table(struct mthca_dev *dev);
+void mthca_cleanup_mcg_table(struct mthca_dev *dev);
+
+int mthca_register_device(struct mthca_dev *dev);
+void mthca_unregister_device(struct mthca_dev *dev);
+
+void mthca_start_catas_poll(struct mthca_dev *dev);
+void mthca_stop_catas_poll(struct mthca_dev *dev);
+int __mthca_restart_one(struct pci_dev *pdev);
+int mthca_catas_init(void);
+void mthca_catas_cleanup(void);
+
+int mthca_uar_alloc(struct mthca_dev *dev, struct mthca_uar *uar);
+void mthca_uar_free(struct mthca_dev *dev, struct mthca_uar *uar);
+
+int mthca_pd_alloc(struct mthca_dev *dev, int privileged, struct mthca_pd *pd);
+void mthca_pd_free(struct mthca_dev *dev, struct mthca_pd *pd);
+
+int mthca_write_mtt_size(struct mthca_dev *dev);
+
+struct mthca_mtt *mthca_alloc_mtt(struct mthca_dev *dev, int size);
+void mthca_free_mtt(struct mthca_dev *dev, struct mthca_mtt *mtt);
+int mthca_write_mtt(struct mthca_dev *dev, struct mthca_mtt *mtt,
+		    int start_index, u64 *buffer_list, int list_len);
+int mthca_mr_alloc(struct mthca_dev *dev, u32 pd, int buffer_size_shift,
+		   u64 iova, u64 total_size, u32 access, struct mthca_mr *mr);
+int mthca_mr_alloc_notrans(struct mthca_dev *dev, u32 pd,
+			   u32 access, struct mthca_mr *mr);
+int mthca_mr_alloc_phys(struct mthca_dev *dev, u32 pd,
+			u64 *buffer_list, int buffer_size_shift,
+			int list_len, u64 iova, u64 total_size,
+			u32 access, struct mthca_mr *mr);
+void mthca_free_mr(struct mthca_dev *dev,  struct mthca_mr *mr);
+
+int mthca_fmr_alloc(struct mthca_dev *dev, u32 pd,
+		    u32 access, struct mthca_fmr *fmr);
+int mthca_tavor_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list,
+			     int list_len, u64 iova);
+void mthca_tavor_fmr_unmap(struct mthca_dev *dev, struct mthca_fmr *fmr);
+int mthca_arbel_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list,
+			     int list_len, u64 iova);
+void mthca_arbel_fmr_unmap(struct mthca_dev *dev, struct mthca_fmr *fmr);
+int mthca_free_fmr(struct mthca_dev *dev,  struct mthca_fmr *fmr);
+
+int mthca_map_eq_icm(struct mthca_dev *dev, u64 icm_virt);
+void mthca_unmap_eq_icm(struct mthca_dev *dev);
+
+int mthca_poll_cq(struct ib_cq *ibcq, int num_entries,
+		  struct ib_wc *entry);
+int mthca_tavor_arm_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags);
+int mthca_arbel_arm_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags);
+int mthca_init_cq(struct mthca_dev *dev, int nent,
+		  struct mthca_ucontext *ctx, u32 pdn,
+		  struct mthca_cq *cq);
+void mthca_free_cq(struct mthca_dev *dev,
+		   struct mthca_cq *cq);
+void mthca_cq_completion(struct mthca_dev *dev, u32 cqn);
+void mthca_cq_event(struct mthca_dev *dev, u32 cqn,
+		    enum ib_event_type event_type);
+void mthca_cq_clean(struct mthca_dev *dev, struct mthca_cq *cq, u32 qpn,
+		    struct mthca_srq *srq);
+void mthca_cq_resize_copy_cqes(struct mthca_cq *cq);
+int mthca_alloc_cq_buf(struct mthca_dev *dev, struct mthca_cq_buf *buf, int nent);
+void mthca_free_cq_buf(struct mthca_dev *dev, struct mthca_cq_buf *buf, int cqe);
+
+int mthca_alloc_srq(struct mthca_dev *dev, struct mthca_pd *pd,
+		    struct ib_srq_attr *attr, struct mthca_srq *srq);
+void mthca_free_srq(struct mthca_dev *dev, struct mthca_srq *srq);
+int mthca_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
+		     enum ib_srq_attr_mask attr_mask, struct ib_udata *udata);
+int mthca_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr);
+int mthca_max_srq_sge(struct mthca_dev *dev);
+void mthca_srq_event(struct mthca_dev *dev, u32 srqn,
+		     enum ib_event_type event_type);
+void mthca_free_srq_wqe(struct mthca_srq *srq, u32 wqe_addr);
+int mthca_tavor_post_srq_recv(struct ib_srq *srq, struct ib_recv_wr *wr,
+			      struct ib_recv_wr **bad_wr);
+int mthca_arbel_post_srq_recv(struct ib_srq *srq, struct ib_recv_wr *wr,
+			      struct ib_recv_wr **bad_wr);
+
+void mthca_qp_event(struct mthca_dev *dev, u32 qpn,
+		    enum ib_event_type event_type);
+int mthca_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask,
+		   struct ib_qp_init_attr *qp_init_attr);
+int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask,
+		    struct ib_udata *udata);
+int mthca_tavor_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+			  struct ib_send_wr **bad_wr);
+int mthca_tavor_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+			     struct ib_recv_wr **bad_wr);
+int mthca_arbel_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+			  struct ib_send_wr **bad_wr);
+int mthca_arbel_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+			     struct ib_recv_wr **bad_wr);
+void mthca_free_err_wqe(struct mthca_dev *dev, struct mthca_qp *qp, int is_send,
+			int index, int *dbd, __be32 *new_wqe);
+int mthca_alloc_qp(struct mthca_dev *dev,
+		   struct mthca_pd *pd,
+		   struct mthca_cq *send_cq,
+		   struct mthca_cq *recv_cq,
+		   enum ib_qp_type type,
+		   enum ib_sig_type send_policy,
+		   struct ib_qp_cap *cap,
+		   struct mthca_qp *qp);
+int mthca_alloc_sqp(struct mthca_dev *dev,
+		    struct mthca_pd *pd,
+		    struct mthca_cq *send_cq,
+		    struct mthca_cq *recv_cq,
+		    enum ib_sig_type send_policy,
+		    struct ib_qp_cap *cap,
+		    int qpn,
+		    int port,
+		    struct mthca_sqp *sqp);
+void mthca_free_qp(struct mthca_dev *dev, struct mthca_qp *qp);
+int mthca_create_ah(struct mthca_dev *dev,
+		    struct mthca_pd *pd,
+		    struct ib_ah_attr *ah_attr,
+		    struct mthca_ah *ah);
+int mthca_destroy_ah(struct mthca_dev *dev, struct mthca_ah *ah);
+int mthca_read_ah(struct mthca_dev *dev, struct mthca_ah *ah,
+		  struct ib_ud_header *header);
+int mthca_ah_query(struct ib_ah *ibah, struct ib_ah_attr *attr);
+int mthca_ah_grh_present(struct mthca_ah *ah);
+u8 mthca_get_rate(struct mthca_dev *dev, int static_rate, u8 port);
+enum ib_rate mthca_rate_to_ib(struct mthca_dev *dev, u8 mthca_rate, u8 port);
+
+int mthca_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid);
+int mthca_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid);
+
+int mthca_process_mad(struct ib_device *ibdev,
+		      int mad_flags,
+		      u8 port_num,
+		      struct ib_wc *in_wc,
+		      struct ib_grh *in_grh,
+		      struct ib_mad *in_mad,
+		      struct ib_mad *out_mad);
+int mthca_create_agents(struct mthca_dev *dev);
+void mthca_free_agents(struct mthca_dev *dev);
+
+static inline struct mthca_dev *to_mdev(struct ib_device *ibdev)
+{
+	return container_of(ibdev, struct mthca_dev, ib_dev);
+}
+
+static inline int mthca_is_memfree(struct mthca_dev *dev)
+{
+	return dev->mthca_flags & MTHCA_FLAG_MEMFREE;
+}
+
+#endif /* MTHCA_DEV_H */
diff --git a/drivers/infiniband/hw/mthca/mthca_doorbell.h b/drivers/infiniband/hw/mthca/mthca_doorbell.h
new file mode 100644
index 0000000..14f51ef
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_doorbell.h
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/types.h>
+
+#define MTHCA_RD_DOORBELL      0x00
+#define MTHCA_SEND_DOORBELL    0x10
+#define MTHCA_RECEIVE_DOORBELL 0x18
+#define MTHCA_CQ_DOORBELL      0x20
+#define MTHCA_EQ_DOORBELL      0x28
+
+#if BITS_PER_LONG == 64
+/*
+ * Assume that we can just write a 64-bit doorbell atomically.  s390
+ * actually doesn't have writeq() but S/390 systems don't even have
+ * PCI so we won't worry about it.
+ */
+
+#define MTHCA_DECLARE_DOORBELL_LOCK(name)
+#define MTHCA_INIT_DOORBELL_LOCK(ptr)    do { } while (0)
+#define MTHCA_GET_DOORBELL_LOCK(ptr)      (NULL)
+
+static inline void mthca_write64_raw(__be64 val, void __iomem *dest)
+{
+	__raw_writeq((__force u64) val, dest);
+}
+
+static inline void mthca_write64(u32 hi, u32 lo, void __iomem *dest,
+				 spinlock_t *doorbell_lock)
+{
+	__raw_writeq((__force u64) cpu_to_be64((u64) hi << 32 | lo), dest);
+}
+
+static inline void mthca_write_db_rec(__be32 val[2], __be32 *db)
+{
+	*(u64 *) db = *(u64 *) val;
+}
+
+#else
+
+/*
+ * Just fall back to a spinlock to protect the doorbell if
+ * BITS_PER_LONG is 32 -- there's no portable way to do atomic 64-bit
+ * MMIO writes.
+ */
+
+#define MTHCA_DECLARE_DOORBELL_LOCK(name) spinlock_t name;
+#define MTHCA_INIT_DOORBELL_LOCK(ptr)     spin_lock_init(ptr)
+#define MTHCA_GET_DOORBELL_LOCK(ptr)      (ptr)
+
+static inline void mthca_write64_raw(__be64 val, void __iomem *dest)
+{
+	__raw_writel(((__force u32 *) &val)[0], dest);
+	__raw_writel(((__force u32 *) &val)[1], dest + 4);
+}
+
+static inline void mthca_write64(u32 hi, u32 lo, void __iomem *dest,
+				 spinlock_t *doorbell_lock)
+{
+	unsigned long flags;
+
+	hi = (__force u32) cpu_to_be32(hi);
+	lo = (__force u32) cpu_to_be32(lo);
+
+	spin_lock_irqsave(doorbell_lock, flags);
+	__raw_writel(hi, dest);
+	__raw_writel(lo, dest + 4);
+	spin_unlock_irqrestore(doorbell_lock, flags);
+}
+
+static inline void mthca_write_db_rec(__be32 val[2], __be32 *db)
+{
+	db[0] = val[0];
+	wmb();
+	db[1] = val[1];
+}
+
+#endif
diff --git a/drivers/infiniband/hw/mthca/mthca_eq.c b/drivers/infiniband/hw/mthca/mthca_eq.c
new file mode 100644
index 0000000..6902017
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_eq.c
@@ -0,0 +1,905 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/pci.h>
+#include <linux/slab.h>
+
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+#include "mthca_config_reg.h"
+
+enum {
+	MTHCA_NUM_ASYNC_EQE = 0x80,
+	MTHCA_NUM_CMD_EQE   = 0x80,
+	MTHCA_NUM_SPARE_EQE = 0x80,
+	MTHCA_EQ_ENTRY_SIZE = 0x20
+};
+
+/*
+ * Must be packed because start is 64 bits but only aligned to 32 bits.
+ */
+struct mthca_eq_context {
+	__be32 flags;
+	__be64 start;
+	__be32 logsize_usrpage;
+	__be32 tavor_pd;	/* reserved for Arbel */
+	u8     reserved1[3];
+	u8     intr;
+	__be32 arbel_pd;	/* lost_count for Tavor */
+	__be32 lkey;
+	u32    reserved2[2];
+	__be32 consumer_index;
+	__be32 producer_index;
+	u32    reserved3[4];
+} __attribute__((packed));
+
+#define MTHCA_EQ_STATUS_OK          ( 0 << 28)
+#define MTHCA_EQ_STATUS_OVERFLOW    ( 9 << 28)
+#define MTHCA_EQ_STATUS_WRITE_FAIL  (10 << 28)
+#define MTHCA_EQ_OWNER_SW           ( 0 << 24)
+#define MTHCA_EQ_OWNER_HW           ( 1 << 24)
+#define MTHCA_EQ_FLAG_TR            ( 1 << 18)
+#define MTHCA_EQ_FLAG_OI            ( 1 << 17)
+#define MTHCA_EQ_STATE_ARMED        ( 1 <<  8)
+#define MTHCA_EQ_STATE_FIRED        ( 2 <<  8)
+#define MTHCA_EQ_STATE_ALWAYS_ARMED ( 3 <<  8)
+#define MTHCA_EQ_STATE_ARBEL        ( 8 <<  8)
+
+enum {
+	MTHCA_EVENT_TYPE_COMP       	    = 0x00,
+	MTHCA_EVENT_TYPE_PATH_MIG   	    = 0x01,
+	MTHCA_EVENT_TYPE_COMM_EST   	    = 0x02,
+	MTHCA_EVENT_TYPE_SQ_DRAINED 	    = 0x03,
+	MTHCA_EVENT_TYPE_SRQ_QP_LAST_WQE    = 0x13,
+	MTHCA_EVENT_TYPE_SRQ_LIMIT	    = 0x14,
+	MTHCA_EVENT_TYPE_CQ_ERROR   	    = 0x04,
+	MTHCA_EVENT_TYPE_WQ_CATAS_ERROR     = 0x05,
+	MTHCA_EVENT_TYPE_EEC_CATAS_ERROR    = 0x06,
+	MTHCA_EVENT_TYPE_PATH_MIG_FAILED    = 0x07,
+	MTHCA_EVENT_TYPE_WQ_INVAL_REQ_ERROR = 0x10,
+	MTHCA_EVENT_TYPE_WQ_ACCESS_ERROR    = 0x11,
+	MTHCA_EVENT_TYPE_SRQ_CATAS_ERROR    = 0x12,
+	MTHCA_EVENT_TYPE_LOCAL_CATAS_ERROR  = 0x08,
+	MTHCA_EVENT_TYPE_PORT_CHANGE        = 0x09,
+	MTHCA_EVENT_TYPE_EQ_OVERFLOW        = 0x0f,
+	MTHCA_EVENT_TYPE_ECC_DETECT         = 0x0e,
+	MTHCA_EVENT_TYPE_CMD                = 0x0a
+};
+
+#define MTHCA_ASYNC_EVENT_MASK ((1ULL << MTHCA_EVENT_TYPE_PATH_MIG)           | \
+				(1ULL << MTHCA_EVENT_TYPE_COMM_EST)           | \
+				(1ULL << MTHCA_EVENT_TYPE_SQ_DRAINED)         | \
+				(1ULL << MTHCA_EVENT_TYPE_CQ_ERROR)           | \
+				(1ULL << MTHCA_EVENT_TYPE_WQ_CATAS_ERROR)     | \
+				(1ULL << MTHCA_EVENT_TYPE_EEC_CATAS_ERROR)    | \
+				(1ULL << MTHCA_EVENT_TYPE_PATH_MIG_FAILED)    | \
+				(1ULL << MTHCA_EVENT_TYPE_WQ_INVAL_REQ_ERROR) | \
+				(1ULL << MTHCA_EVENT_TYPE_WQ_ACCESS_ERROR)    | \
+				(1ULL << MTHCA_EVENT_TYPE_LOCAL_CATAS_ERROR)  | \
+				(1ULL << MTHCA_EVENT_TYPE_PORT_CHANGE)        | \
+				(1ULL << MTHCA_EVENT_TYPE_ECC_DETECT))
+#define MTHCA_SRQ_EVENT_MASK   ((1ULL << MTHCA_EVENT_TYPE_SRQ_CATAS_ERROR)    | \
+				(1ULL << MTHCA_EVENT_TYPE_SRQ_QP_LAST_WQE)    | \
+				(1ULL << MTHCA_EVENT_TYPE_SRQ_LIMIT))
+#define MTHCA_CMD_EVENT_MASK    (1ULL << MTHCA_EVENT_TYPE_CMD)
+
+#define MTHCA_EQ_DB_INC_CI     (1 << 24)
+#define MTHCA_EQ_DB_REQ_NOT    (2 << 24)
+#define MTHCA_EQ_DB_DISARM_CQ  (3 << 24)
+#define MTHCA_EQ_DB_SET_CI     (4 << 24)
+#define MTHCA_EQ_DB_ALWAYS_ARM (5 << 24)
+
+struct mthca_eqe {
+	u8 reserved1;
+	u8 type;
+	u8 reserved2;
+	u8 subtype;
+	union {
+		u32 raw[6];
+		struct {
+			__be32 cqn;
+		} __attribute__((packed)) comp;
+		struct {
+			u16    reserved1;
+			__be16 token;
+			u32    reserved2;
+			u8     reserved3[3];
+			u8     status;
+			__be64 out_param;
+		} __attribute__((packed)) cmd;
+		struct {
+			__be32 qpn;
+		} __attribute__((packed)) qp;
+		struct {
+			__be32 srqn;
+		} __attribute__((packed)) srq;
+		struct {
+			__be32 cqn;
+			u32    reserved1;
+			u8     reserved2[3];
+			u8     syndrome;
+		} __attribute__((packed)) cq_err;
+		struct {
+			u32    reserved1[2];
+			__be32 port;
+		} __attribute__((packed)) port_change;
+	} event;
+	u8 reserved3[3];
+	u8 owner;
+} __attribute__((packed));
+
+#define  MTHCA_EQ_ENTRY_OWNER_SW      (0 << 7)
+#define  MTHCA_EQ_ENTRY_OWNER_HW      (1 << 7)
+
+static inline u64 async_mask(struct mthca_dev *dev)
+{
+	return dev->mthca_flags & MTHCA_FLAG_SRQ ?
+		MTHCA_ASYNC_EVENT_MASK | MTHCA_SRQ_EVENT_MASK :
+		MTHCA_ASYNC_EVENT_MASK;
+}
+
+static inline void tavor_set_eq_ci(struct mthca_dev *dev, struct mthca_eq *eq, u32 ci)
+{
+	/*
+	 * This barrier makes sure that all updates to ownership bits
+	 * done by set_eqe_hw() hit memory before the consumer index
+	 * is updated.  set_eq_ci() allows the HCA to possibly write
+	 * more EQ entries, and we want to avoid the exceedingly
+	 * unlikely possibility of the HCA writing an entry and then
+	 * having set_eqe_hw() overwrite the owner field.
+	 */
+	wmb();
+	mthca_write64(MTHCA_EQ_DB_SET_CI | eq->eqn, ci & (eq->nent - 1),
+		      dev->kar + MTHCA_EQ_DOORBELL,
+		      MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+}
+
+static inline void arbel_set_eq_ci(struct mthca_dev *dev, struct mthca_eq *eq, u32 ci)
+{
+	/* See comment in tavor_set_eq_ci() above. */
+	wmb();
+	__raw_writel((__force u32) cpu_to_be32(ci),
+		     dev->eq_regs.arbel.eq_set_ci_base + eq->eqn * 8);
+	/* We still want ordering, just not swabbing, so add a barrier */
+	mb();
+}
+
+static inline void set_eq_ci(struct mthca_dev *dev, struct mthca_eq *eq, u32 ci)
+{
+	if (mthca_is_memfree(dev))
+		arbel_set_eq_ci(dev, eq, ci);
+	else
+		tavor_set_eq_ci(dev, eq, ci);
+}
+
+static inline void tavor_eq_req_not(struct mthca_dev *dev, int eqn)
+{
+	mthca_write64(MTHCA_EQ_DB_REQ_NOT | eqn, 0,
+		      dev->kar + MTHCA_EQ_DOORBELL,
+		      MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+}
+
+static inline void arbel_eq_req_not(struct mthca_dev *dev, u32 eqn_mask)
+{
+	writel(eqn_mask, dev->eq_regs.arbel.eq_arm);
+}
+
+static inline void disarm_cq(struct mthca_dev *dev, int eqn, int cqn)
+{
+	if (!mthca_is_memfree(dev)) {
+		mthca_write64(MTHCA_EQ_DB_DISARM_CQ | eqn, cqn,
+			      dev->kar + MTHCA_EQ_DOORBELL,
+			      MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+	}
+}
+
+static inline struct mthca_eqe *get_eqe(struct mthca_eq *eq, u32 entry)
+{
+	unsigned long off = (entry & (eq->nent - 1)) * MTHCA_EQ_ENTRY_SIZE;
+	return eq->page_list[off / PAGE_SIZE].buf + off % PAGE_SIZE;
+}
+
+static inline struct mthca_eqe *next_eqe_sw(struct mthca_eq *eq)
+{
+	struct mthca_eqe *eqe;
+	eqe = get_eqe(eq, eq->cons_index);
+	return (MTHCA_EQ_ENTRY_OWNER_HW & eqe->owner) ? NULL : eqe;
+}
+
+static inline void set_eqe_hw(struct mthca_eqe *eqe)
+{
+	eqe->owner =  MTHCA_EQ_ENTRY_OWNER_HW;
+}
+
+static void port_change(struct mthca_dev *dev, int port, int active)
+{
+	struct ib_event record;
+
+	mthca_dbg(dev, "Port change to %s for port %d\n",
+		  active ? "active" : "down", port);
+
+	record.device = &dev->ib_dev;
+	record.event  = active ? IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR;
+	record.element.port_num = port;
+
+	ib_dispatch_event(&record);
+}
+
+static int mthca_eq_int(struct mthca_dev *dev, struct mthca_eq *eq)
+{
+	struct mthca_eqe *eqe;
+	int disarm_cqn;
+	int eqes_found = 0;
+	int set_ci = 0;
+
+	while ((eqe = next_eqe_sw(eq))) {
+		/*
+		 * Make sure we read EQ entry contents after we've
+		 * checked the ownership bit.
+		 */
+		rmb();
+
+		switch (eqe->type) {
+		case MTHCA_EVENT_TYPE_COMP:
+			disarm_cqn = be32_to_cpu(eqe->event.comp.cqn) & 0xffffff;
+			disarm_cq(dev, eq->eqn, disarm_cqn);
+			mthca_cq_completion(dev, disarm_cqn);
+			break;
+
+		case MTHCA_EVENT_TYPE_PATH_MIG:
+			mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff,
+				       IB_EVENT_PATH_MIG);
+			break;
+
+		case MTHCA_EVENT_TYPE_COMM_EST:
+			mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff,
+				       IB_EVENT_COMM_EST);
+			break;
+
+		case MTHCA_EVENT_TYPE_SQ_DRAINED:
+			mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff,
+				       IB_EVENT_SQ_DRAINED);
+			break;
+
+		case MTHCA_EVENT_TYPE_SRQ_QP_LAST_WQE:
+			mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff,
+				       IB_EVENT_QP_LAST_WQE_REACHED);
+			break;
+
+		case MTHCA_EVENT_TYPE_SRQ_LIMIT:
+			mthca_srq_event(dev, be32_to_cpu(eqe->event.srq.srqn) & 0xffffff,
+					IB_EVENT_SRQ_LIMIT_REACHED);
+			break;
+
+		case MTHCA_EVENT_TYPE_WQ_CATAS_ERROR:
+			mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff,
+				       IB_EVENT_QP_FATAL);
+			break;
+
+		case MTHCA_EVENT_TYPE_PATH_MIG_FAILED:
+			mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff,
+				       IB_EVENT_PATH_MIG_ERR);
+			break;
+
+		case MTHCA_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
+			mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff,
+				       IB_EVENT_QP_REQ_ERR);
+			break;
+
+		case MTHCA_EVENT_TYPE_WQ_ACCESS_ERROR:
+			mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff,
+				       IB_EVENT_QP_ACCESS_ERR);
+			break;
+
+		case MTHCA_EVENT_TYPE_CMD:
+			mthca_cmd_event(dev,
+					be16_to_cpu(eqe->event.cmd.token),
+					eqe->event.cmd.status,
+					be64_to_cpu(eqe->event.cmd.out_param));
+			break;
+
+		case MTHCA_EVENT_TYPE_PORT_CHANGE:
+			port_change(dev,
+				    (be32_to_cpu(eqe->event.port_change.port) >> 28) & 3,
+				    eqe->subtype == 0x4);
+			break;
+
+		case MTHCA_EVENT_TYPE_CQ_ERROR:
+			mthca_warn(dev, "CQ %s on CQN %06x\n",
+				   eqe->event.cq_err.syndrome == 1 ?
+				   "overrun" : "access violation",
+				   be32_to_cpu(eqe->event.cq_err.cqn) & 0xffffff);
+			mthca_cq_event(dev, be32_to_cpu(eqe->event.cq_err.cqn),
+				       IB_EVENT_CQ_ERR);
+			break;
+
+		case MTHCA_EVENT_TYPE_EQ_OVERFLOW:
+			mthca_warn(dev, "EQ overrun on EQN %d\n", eq->eqn);
+			break;
+
+		case MTHCA_EVENT_TYPE_EEC_CATAS_ERROR:
+		case MTHCA_EVENT_TYPE_SRQ_CATAS_ERROR:
+		case MTHCA_EVENT_TYPE_LOCAL_CATAS_ERROR:
+		case MTHCA_EVENT_TYPE_ECC_DETECT:
+		default:
+			mthca_warn(dev, "Unhandled event %02x(%02x) on EQ %d\n",
+				   eqe->type, eqe->subtype, eq->eqn);
+			break;
+		}
+
+		set_eqe_hw(eqe);
+		++eq->cons_index;
+		eqes_found = 1;
+		++set_ci;
+
+		/*
+		 * The HCA will think the queue has overflowed if we
+		 * don't tell it we've been processing events.  We
+		 * create our EQs with MTHCA_NUM_SPARE_EQE extra
+		 * entries, so we must update our consumer index at
+		 * least that often.
+		 */
+		if (unlikely(set_ci >= MTHCA_NUM_SPARE_EQE)) {
+			/*
+			 * Conditional on hca_type is OK here because
+			 * this is a rare case, not the fast path.
+			 */
+			set_eq_ci(dev, eq, eq->cons_index);
+			set_ci = 0;
+		}
+	}
+
+	/*
+	 * Rely on caller to set consumer index so that we don't have
+	 * to test hca_type in our interrupt handling fast path.
+	 */
+	return eqes_found;
+}
+
+static irqreturn_t mthca_tavor_interrupt(int irq, void *dev_ptr)
+{
+	struct mthca_dev *dev = dev_ptr;
+	u32 ecr;
+	int i;
+
+	if (dev->eq_table.clr_mask)
+		writel(dev->eq_table.clr_mask, dev->eq_table.clr_int);
+
+	ecr = readl(dev->eq_regs.tavor.ecr_base + 4);
+	if (!ecr)
+		return IRQ_NONE;
+
+	writel(ecr, dev->eq_regs.tavor.ecr_base +
+	       MTHCA_ECR_CLR_BASE - MTHCA_ECR_BASE + 4);
+
+	for (i = 0; i < MTHCA_NUM_EQ; ++i)
+		if (ecr & dev->eq_table.eq[i].eqn_mask) {
+			if (mthca_eq_int(dev, &dev->eq_table.eq[i]))
+				tavor_set_eq_ci(dev, &dev->eq_table.eq[i],
+						dev->eq_table.eq[i].cons_index);
+			tavor_eq_req_not(dev, dev->eq_table.eq[i].eqn);
+		}
+
+	return IRQ_HANDLED;
+}
+
+static irqreturn_t mthca_tavor_msi_x_interrupt(int irq, void *eq_ptr)
+{
+	struct mthca_eq  *eq  = eq_ptr;
+	struct mthca_dev *dev = eq->dev;
+
+	mthca_eq_int(dev, eq);
+	tavor_set_eq_ci(dev, eq, eq->cons_index);
+	tavor_eq_req_not(dev, eq->eqn);
+
+	/* MSI-X vectors always belong to us */
+	return IRQ_HANDLED;
+}
+
+static irqreturn_t mthca_arbel_interrupt(int irq, void *dev_ptr)
+{
+	struct mthca_dev *dev = dev_ptr;
+	int work = 0;
+	int i;
+
+	if (dev->eq_table.clr_mask)
+		writel(dev->eq_table.clr_mask, dev->eq_table.clr_int);
+
+	for (i = 0; i < MTHCA_NUM_EQ; ++i)
+		if (mthca_eq_int(dev, &dev->eq_table.eq[i])) {
+			work = 1;
+			arbel_set_eq_ci(dev, &dev->eq_table.eq[i],
+					dev->eq_table.eq[i].cons_index);
+		}
+
+	arbel_eq_req_not(dev, dev->eq_table.arm_mask);
+
+	return IRQ_RETVAL(work);
+}
+
+static irqreturn_t mthca_arbel_msi_x_interrupt(int irq, void *eq_ptr)
+{
+	struct mthca_eq  *eq  = eq_ptr;
+	struct mthca_dev *dev = eq->dev;
+
+	mthca_eq_int(dev, eq);
+	arbel_set_eq_ci(dev, eq, eq->cons_index);
+	arbel_eq_req_not(dev, eq->eqn_mask);
+
+	/* MSI-X vectors always belong to us */
+	return IRQ_HANDLED;
+}
+
+static int mthca_create_eq(struct mthca_dev *dev,
+			   int nent,
+			   u8 intr,
+			   struct mthca_eq *eq)
+{
+	int npages;
+	u64 *dma_list = NULL;
+	dma_addr_t t;
+	struct mthca_mailbox *mailbox;
+	struct mthca_eq_context *eq_context;
+	int err = -ENOMEM;
+	int i;
+
+	eq->dev  = dev;
+	eq->nent = roundup_pow_of_two(max(nent, 2));
+	npages = ALIGN(eq->nent * MTHCA_EQ_ENTRY_SIZE, PAGE_SIZE) / PAGE_SIZE;
+
+	eq->page_list = kmalloc(npages * sizeof *eq->page_list,
+				GFP_KERNEL);
+	if (!eq->page_list)
+		goto err_out;
+
+	for (i = 0; i < npages; ++i)
+		eq->page_list[i].buf = NULL;
+
+	dma_list = kmalloc(npages * sizeof *dma_list, GFP_KERNEL);
+	if (!dma_list)
+		goto err_out_free;
+
+	mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+	if (IS_ERR(mailbox))
+		goto err_out_free;
+	eq_context = mailbox->buf;
+
+	for (i = 0; i < npages; ++i) {
+		eq->page_list[i].buf = dma_alloc_coherent(&dev->pdev->dev,
+							  PAGE_SIZE, &t, GFP_KERNEL);
+		if (!eq->page_list[i].buf)
+			goto err_out_free_pages;
+
+		dma_list[i] = t;
+		dma_unmap_addr_set(&eq->page_list[i], mapping, t);
+
+		clear_page(eq->page_list[i].buf);
+	}
+
+	for (i = 0; i < eq->nent; ++i)
+		set_eqe_hw(get_eqe(eq, i));
+
+	eq->eqn = mthca_alloc(&dev->eq_table.alloc);
+	if (eq->eqn == -1)
+		goto err_out_free_pages;
+
+	err = mthca_mr_alloc_phys(dev, dev->driver_pd.pd_num,
+				  dma_list, PAGE_SHIFT, npages,
+				  0, npages * PAGE_SIZE,
+				  MTHCA_MPT_FLAG_LOCAL_WRITE |
+				  MTHCA_MPT_FLAG_LOCAL_READ,
+				  &eq->mr);
+	if (err)
+		goto err_out_free_eq;
+
+	memset(eq_context, 0, sizeof *eq_context);
+	eq_context->flags           = cpu_to_be32(MTHCA_EQ_STATUS_OK   |
+						  MTHCA_EQ_OWNER_HW    |
+						  MTHCA_EQ_STATE_ARMED |
+						  MTHCA_EQ_FLAG_TR);
+	if (mthca_is_memfree(dev))
+		eq_context->flags  |= cpu_to_be32(MTHCA_EQ_STATE_ARBEL);
+
+	eq_context->logsize_usrpage = cpu_to_be32((ffs(eq->nent) - 1) << 24);
+	if (mthca_is_memfree(dev)) {
+		eq_context->arbel_pd = cpu_to_be32(dev->driver_pd.pd_num);
+	} else {
+		eq_context->logsize_usrpage |= cpu_to_be32(dev->driver_uar.index);
+		eq_context->tavor_pd         = cpu_to_be32(dev->driver_pd.pd_num);
+	}
+	eq_context->intr            = intr;
+	eq_context->lkey            = cpu_to_be32(eq->mr.ibmr.lkey);
+
+	err = mthca_SW2HW_EQ(dev, mailbox, eq->eqn);
+	if (err) {
+		mthca_warn(dev, "SW2HW_EQ returned %d\n", err);
+		goto err_out_free_mr;
+	}
+
+	kfree(dma_list);
+	mthca_free_mailbox(dev, mailbox);
+
+	eq->eqn_mask   = swab32(1 << eq->eqn);
+	eq->cons_index = 0;
+
+	dev->eq_table.arm_mask |= eq->eqn_mask;
+
+	mthca_dbg(dev, "Allocated EQ %d with %d entries\n",
+		  eq->eqn, eq->nent);
+
+	return err;
+
+ err_out_free_mr:
+	mthca_free_mr(dev, &eq->mr);
+
+ err_out_free_eq:
+	mthca_free(&dev->eq_table.alloc, eq->eqn);
+
+ err_out_free_pages:
+	for (i = 0; i < npages; ++i)
+		if (eq->page_list[i].buf)
+			dma_free_coherent(&dev->pdev->dev, PAGE_SIZE,
+					  eq->page_list[i].buf,
+					  dma_unmap_addr(&eq->page_list[i],
+							 mapping));
+
+	mthca_free_mailbox(dev, mailbox);
+
+ err_out_free:
+	kfree(eq->page_list);
+	kfree(dma_list);
+
+ err_out:
+	return err;
+}
+
+static void mthca_free_eq(struct mthca_dev *dev,
+			  struct mthca_eq *eq)
+{
+	struct mthca_mailbox *mailbox;
+	int err;
+	int npages = (eq->nent * MTHCA_EQ_ENTRY_SIZE + PAGE_SIZE - 1) /
+		PAGE_SIZE;
+	int i;
+
+	mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+	if (IS_ERR(mailbox))
+		return;
+
+	err = mthca_HW2SW_EQ(dev, mailbox, eq->eqn);
+	if (err)
+		mthca_warn(dev, "HW2SW_EQ returned %d\n", err);
+
+	dev->eq_table.arm_mask &= ~eq->eqn_mask;
+
+	if (0) {
+		mthca_dbg(dev, "Dumping EQ context %02x:\n", eq->eqn);
+		for (i = 0; i < sizeof (struct mthca_eq_context) / 4; ++i) {
+			if (i % 4 == 0)
+				printk("[%02x] ", i * 4);
+			printk(" %08x", be32_to_cpup(mailbox->buf + i * 4));
+			if ((i + 1) % 4 == 0)
+				printk("\n");
+		}
+	}
+
+	mthca_free_mr(dev, &eq->mr);
+	for (i = 0; i < npages; ++i)
+		pci_free_consistent(dev->pdev, PAGE_SIZE,
+				    eq->page_list[i].buf,
+				    dma_unmap_addr(&eq->page_list[i], mapping));
+
+	kfree(eq->page_list);
+	mthca_free_mailbox(dev, mailbox);
+}
+
+static void mthca_free_irqs(struct mthca_dev *dev)
+{
+	int i;
+
+	if (dev->eq_table.have_irq)
+		free_irq(dev->pdev->irq, dev);
+	for (i = 0; i < MTHCA_NUM_EQ; ++i)
+		if (dev->eq_table.eq[i].have_irq) {
+			free_irq(dev->eq_table.eq[i].msi_x_vector,
+				 dev->eq_table.eq + i);
+			dev->eq_table.eq[i].have_irq = 0;
+		}
+}
+
+static int mthca_map_reg(struct mthca_dev *dev,
+			 unsigned long offset, unsigned long size,
+			 void __iomem **map)
+{
+	phys_addr_t base = pci_resource_start(dev->pdev, 0);
+
+	*map = ioremap(base + offset, size);
+	if (!*map)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static int mthca_map_eq_regs(struct mthca_dev *dev)
+{
+	if (mthca_is_memfree(dev)) {
+		/*
+		 * We assume that the EQ arm and EQ set CI registers
+		 * fall within the first BAR.  We can't trust the
+		 * values firmware gives us, since those addresses are
+		 * valid on the HCA's side of the PCI bus but not
+		 * necessarily the host side.
+		 */
+		if (mthca_map_reg(dev, (pci_resource_len(dev->pdev, 0) - 1) &
+				  dev->fw.arbel.clr_int_base, MTHCA_CLR_INT_SIZE,
+				  &dev->clr_base)) {
+			mthca_err(dev, "Couldn't map interrupt clear register, "
+				  "aborting.\n");
+			return -ENOMEM;
+		}
+
+		/*
+		 * Add 4 because we limit ourselves to EQs 0 ... 31,
+		 * so we only need the low word of the register.
+		 */
+		if (mthca_map_reg(dev, ((pci_resource_len(dev->pdev, 0) - 1) &
+					dev->fw.arbel.eq_arm_base) + 4, 4,
+				  &dev->eq_regs.arbel.eq_arm)) {
+			mthca_err(dev, "Couldn't map EQ arm register, aborting.\n");
+			iounmap(dev->clr_base);
+			return -ENOMEM;
+		}
+
+		if (mthca_map_reg(dev, (pci_resource_len(dev->pdev, 0) - 1) &
+				  dev->fw.arbel.eq_set_ci_base,
+				  MTHCA_EQ_SET_CI_SIZE,
+				  &dev->eq_regs.arbel.eq_set_ci_base)) {
+			mthca_err(dev, "Couldn't map EQ CI register, aborting.\n");
+			iounmap(dev->eq_regs.arbel.eq_arm);
+			iounmap(dev->clr_base);
+			return -ENOMEM;
+		}
+	} else {
+		if (mthca_map_reg(dev, MTHCA_CLR_INT_BASE, MTHCA_CLR_INT_SIZE,
+				  &dev->clr_base)) {
+			mthca_err(dev, "Couldn't map interrupt clear register, "
+				  "aborting.\n");
+			return -ENOMEM;
+		}
+
+		if (mthca_map_reg(dev, MTHCA_ECR_BASE,
+				  MTHCA_ECR_SIZE + MTHCA_ECR_CLR_SIZE,
+				  &dev->eq_regs.tavor.ecr_base)) {
+			mthca_err(dev, "Couldn't map ecr register, "
+				  "aborting.\n");
+			iounmap(dev->clr_base);
+			return -ENOMEM;
+		}
+	}
+
+	return 0;
+
+}
+
+static void mthca_unmap_eq_regs(struct mthca_dev *dev)
+{
+	if (mthca_is_memfree(dev)) {
+		iounmap(dev->eq_regs.arbel.eq_set_ci_base);
+		iounmap(dev->eq_regs.arbel.eq_arm);
+		iounmap(dev->clr_base);
+	} else {
+		iounmap(dev->eq_regs.tavor.ecr_base);
+		iounmap(dev->clr_base);
+	}
+}
+
+int mthca_map_eq_icm(struct mthca_dev *dev, u64 icm_virt)
+{
+	int ret;
+
+	/*
+	 * We assume that mapping one page is enough for the whole EQ
+	 * context table.  This is fine with all current HCAs, because
+	 * we only use 32 EQs and each EQ uses 32 bytes of context
+	 * memory, or 1 KB total.
+	 */
+	dev->eq_table.icm_virt = icm_virt;
+	dev->eq_table.icm_page = alloc_page(GFP_HIGHUSER);
+	if (!dev->eq_table.icm_page)
+		return -ENOMEM;
+	dev->eq_table.icm_dma  = pci_map_page(dev->pdev, dev->eq_table.icm_page, 0,
+					      PAGE_SIZE, PCI_DMA_BIDIRECTIONAL);
+	if (pci_dma_mapping_error(dev->pdev, dev->eq_table.icm_dma)) {
+		__free_page(dev->eq_table.icm_page);
+		return -ENOMEM;
+	}
+
+	ret = mthca_MAP_ICM_page(dev, dev->eq_table.icm_dma, icm_virt);
+	if (ret) {
+		pci_unmap_page(dev->pdev, dev->eq_table.icm_dma, PAGE_SIZE,
+			       PCI_DMA_BIDIRECTIONAL);
+		__free_page(dev->eq_table.icm_page);
+	}
+
+	return ret;
+}
+
+void mthca_unmap_eq_icm(struct mthca_dev *dev)
+{
+	mthca_UNMAP_ICM(dev, dev->eq_table.icm_virt, 1);
+	pci_unmap_page(dev->pdev, dev->eq_table.icm_dma, PAGE_SIZE,
+		       PCI_DMA_BIDIRECTIONAL);
+	__free_page(dev->eq_table.icm_page);
+}
+
+int mthca_init_eq_table(struct mthca_dev *dev)
+{
+	int err;
+	u8 intr;
+	int i;
+
+	err = mthca_alloc_init(&dev->eq_table.alloc,
+			       dev->limits.num_eqs,
+			       dev->limits.num_eqs - 1,
+			       dev->limits.reserved_eqs);
+	if (err)
+		return err;
+
+	err = mthca_map_eq_regs(dev);
+	if (err)
+		goto err_out_free;
+
+	if (dev->mthca_flags & MTHCA_FLAG_MSI_X) {
+		dev->eq_table.clr_mask = 0;
+	} else {
+		dev->eq_table.clr_mask =
+			swab32(1 << (dev->eq_table.inta_pin & 31));
+		dev->eq_table.clr_int  = dev->clr_base +
+			(dev->eq_table.inta_pin < 32 ? 4 : 0);
+	}
+
+	dev->eq_table.arm_mask = 0;
+
+	intr = dev->eq_table.inta_pin;
+
+	err = mthca_create_eq(dev, dev->limits.num_cqs + MTHCA_NUM_SPARE_EQE,
+			      (dev->mthca_flags & MTHCA_FLAG_MSI_X) ? 128 : intr,
+			      &dev->eq_table.eq[MTHCA_EQ_COMP]);
+	if (err)
+		goto err_out_unmap;
+
+	err = mthca_create_eq(dev, MTHCA_NUM_ASYNC_EQE + MTHCA_NUM_SPARE_EQE,
+			      (dev->mthca_flags & MTHCA_FLAG_MSI_X) ? 129 : intr,
+			      &dev->eq_table.eq[MTHCA_EQ_ASYNC]);
+	if (err)
+		goto err_out_comp;
+
+	err = mthca_create_eq(dev, MTHCA_NUM_CMD_EQE + MTHCA_NUM_SPARE_EQE,
+			      (dev->mthca_flags & MTHCA_FLAG_MSI_X) ? 130 : intr,
+			      &dev->eq_table.eq[MTHCA_EQ_CMD]);
+	if (err)
+		goto err_out_async;
+
+	if (dev->mthca_flags & MTHCA_FLAG_MSI_X) {
+		static const char *eq_name[] = {
+			[MTHCA_EQ_COMP]  = DRV_NAME "-comp",
+			[MTHCA_EQ_ASYNC] = DRV_NAME "-async",
+			[MTHCA_EQ_CMD]   = DRV_NAME "-cmd"
+		};
+
+		for (i = 0; i < MTHCA_NUM_EQ; ++i) {
+			snprintf(dev->eq_table.eq[i].irq_name,
+				 IB_DEVICE_NAME_MAX,
+				 "%s@pci:%s", eq_name[i],
+				 pci_name(dev->pdev));
+			err = request_irq(dev->eq_table.eq[i].msi_x_vector,
+					  mthca_is_memfree(dev) ?
+					  mthca_arbel_msi_x_interrupt :
+					  mthca_tavor_msi_x_interrupt,
+					  0, dev->eq_table.eq[i].irq_name,
+					  dev->eq_table.eq + i);
+			if (err)
+				goto err_out_cmd;
+			dev->eq_table.eq[i].have_irq = 1;
+		}
+	} else {
+		snprintf(dev->eq_table.eq[0].irq_name, IB_DEVICE_NAME_MAX,
+			 DRV_NAME "@pci:%s", pci_name(dev->pdev));
+		err = request_irq(dev->pdev->irq,
+				  mthca_is_memfree(dev) ?
+				  mthca_arbel_interrupt :
+				  mthca_tavor_interrupt,
+				  IRQF_SHARED, dev->eq_table.eq[0].irq_name, dev);
+		if (err)
+			goto err_out_cmd;
+		dev->eq_table.have_irq = 1;
+	}
+
+	err = mthca_MAP_EQ(dev, async_mask(dev),
+			   0, dev->eq_table.eq[MTHCA_EQ_ASYNC].eqn);
+	if (err)
+		mthca_warn(dev, "MAP_EQ for async EQ %d failed (%d)\n",
+			   dev->eq_table.eq[MTHCA_EQ_ASYNC].eqn, err);
+
+	err = mthca_MAP_EQ(dev, MTHCA_CMD_EVENT_MASK,
+			   0, dev->eq_table.eq[MTHCA_EQ_CMD].eqn);
+	if (err)
+		mthca_warn(dev, "MAP_EQ for cmd EQ %d failed (%d)\n",
+			   dev->eq_table.eq[MTHCA_EQ_CMD].eqn, err);
+
+	for (i = 0; i < MTHCA_NUM_EQ; ++i)
+		if (mthca_is_memfree(dev))
+			arbel_eq_req_not(dev, dev->eq_table.eq[i].eqn_mask);
+		else
+			tavor_eq_req_not(dev, dev->eq_table.eq[i].eqn);
+
+	return 0;
+
+err_out_cmd:
+	mthca_free_irqs(dev);
+	mthca_free_eq(dev, &dev->eq_table.eq[MTHCA_EQ_CMD]);
+
+err_out_async:
+	mthca_free_eq(dev, &dev->eq_table.eq[MTHCA_EQ_ASYNC]);
+
+err_out_comp:
+	mthca_free_eq(dev, &dev->eq_table.eq[MTHCA_EQ_COMP]);
+
+err_out_unmap:
+	mthca_unmap_eq_regs(dev);
+
+err_out_free:
+	mthca_alloc_cleanup(&dev->eq_table.alloc);
+	return err;
+}
+
+void mthca_cleanup_eq_table(struct mthca_dev *dev)
+{
+	int i;
+
+	mthca_free_irqs(dev);
+
+	mthca_MAP_EQ(dev, async_mask(dev),
+		     1, dev->eq_table.eq[MTHCA_EQ_ASYNC].eqn);
+	mthca_MAP_EQ(dev, MTHCA_CMD_EVENT_MASK,
+		     1, dev->eq_table.eq[MTHCA_EQ_CMD].eqn);
+
+	for (i = 0; i < MTHCA_NUM_EQ; ++i)
+		mthca_free_eq(dev, &dev->eq_table.eq[i]);
+
+	mthca_unmap_eq_regs(dev);
+
+	mthca_alloc_cleanup(&dev->eq_table.alloc);
+}
diff --git a/drivers/infiniband/hw/mthca/mthca_mad.c b/drivers/infiniband/hw/mthca/mthca_mad.c
new file mode 100644
index 0000000..8881fa3
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_mad.c
@@ -0,0 +1,341 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/string.h>
+#include <linux/slab.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_mad.h>
+#include <rdma/ib_smi.h>
+
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+
+enum {
+	MTHCA_VENDOR_CLASS1 = 0x9,
+	MTHCA_VENDOR_CLASS2 = 0xa
+};
+
+static int mthca_update_rate(struct mthca_dev *dev, u8 port_num)
+{
+	struct ib_port_attr *tprops = NULL;
+	int                  ret;
+
+	tprops = kmalloc(sizeof *tprops, GFP_KERNEL);
+	if (!tprops)
+		return -ENOMEM;
+
+	ret = ib_query_port(&dev->ib_dev, port_num, tprops);
+	if (ret) {
+		printk(KERN_WARNING "ib_query_port failed (%d) for %s port %d\n",
+		       ret, dev->ib_dev.name, port_num);
+		goto out;
+	}
+
+	dev->rate[port_num - 1] = tprops->active_speed *
+				  ib_width_enum_to_int(tprops->active_width);
+
+out:
+	kfree(tprops);
+	return ret;
+}
+
+static void update_sm_ah(struct mthca_dev *dev,
+			 u8 port_num, u16 lid, u8 sl)
+{
+	struct ib_ah *new_ah;
+	struct ib_ah_attr ah_attr;
+	unsigned long flags;
+
+	if (!dev->send_agent[port_num - 1][0])
+		return;
+
+	memset(&ah_attr, 0, sizeof ah_attr);
+	ah_attr.dlid     = lid;
+	ah_attr.sl       = sl;
+	ah_attr.port_num = port_num;
+
+	new_ah = ib_create_ah(dev->send_agent[port_num - 1][0]->qp->pd,
+			      &ah_attr);
+	if (IS_ERR(new_ah))
+		return;
+
+	spin_lock_irqsave(&dev->sm_lock, flags);
+	if (dev->sm_ah[port_num - 1])
+		ib_destroy_ah(dev->sm_ah[port_num - 1]);
+	dev->sm_ah[port_num - 1] = new_ah;
+	spin_unlock_irqrestore(&dev->sm_lock, flags);
+}
+
+/*
+ * Snoop SM MADs for port info and P_Key table sets, so we can
+ * synthesize LID change and P_Key change events.
+ */
+static void smp_snoop(struct ib_device *ibdev,
+		      u8 port_num,
+		      struct ib_mad *mad,
+		      u16 prev_lid)
+{
+	struct ib_event event;
+
+	if ((mad->mad_hdr.mgmt_class  == IB_MGMT_CLASS_SUBN_LID_ROUTED ||
+	     mad->mad_hdr.mgmt_class  == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) &&
+	    mad->mad_hdr.method     == IB_MGMT_METHOD_SET) {
+		if (mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO) {
+			struct ib_port_info *pinfo =
+				(struct ib_port_info *) ((struct ib_smp *) mad)->data;
+			u16 lid = be16_to_cpu(pinfo->lid);
+
+			mthca_update_rate(to_mdev(ibdev), port_num);
+			update_sm_ah(to_mdev(ibdev), port_num,
+				     be16_to_cpu(pinfo->sm_lid),
+				     pinfo->neighbormtu_mastersmsl & 0xf);
+
+			event.device           = ibdev;
+			event.element.port_num = port_num;
+
+			if (pinfo->clientrereg_resv_subnetto & 0x80) {
+				event.event    = IB_EVENT_CLIENT_REREGISTER;
+				ib_dispatch_event(&event);
+			}
+
+			if (prev_lid != lid) {
+				event.event    = IB_EVENT_LID_CHANGE;
+				ib_dispatch_event(&event);
+			}
+		}
+
+		if (mad->mad_hdr.attr_id == IB_SMP_ATTR_PKEY_TABLE) {
+			event.device           = ibdev;
+			event.event            = IB_EVENT_PKEY_CHANGE;
+			event.element.port_num = port_num;
+			ib_dispatch_event(&event);
+		}
+	}
+}
+
+static void node_desc_override(struct ib_device *dev,
+			       struct ib_mad *mad)
+{
+	if ((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED ||
+	     mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) &&
+	    mad->mad_hdr.method == IB_MGMT_METHOD_GET_RESP &&
+	    mad->mad_hdr.attr_id == IB_SMP_ATTR_NODE_DESC) {
+		mutex_lock(&to_mdev(dev)->cap_mask_mutex);
+		memcpy(((struct ib_smp *) mad)->data, dev->node_desc, 64);
+		mutex_unlock(&to_mdev(dev)->cap_mask_mutex);
+	}
+}
+
+static void forward_trap(struct mthca_dev *dev,
+			 u8 port_num,
+			 struct ib_mad *mad)
+{
+	int qpn = mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_SUBN_LID_ROUTED;
+	struct ib_mad_send_buf *send_buf;
+	struct ib_mad_agent *agent = dev->send_agent[port_num - 1][qpn];
+	int ret;
+	unsigned long flags;
+
+	if (agent) {
+		send_buf = ib_create_send_mad(agent, qpn, 0, 0, IB_MGMT_MAD_HDR,
+					      IB_MGMT_MAD_DATA, GFP_ATOMIC);
+		if (IS_ERR(send_buf))
+			return;
+		/*
+		 * We rely here on the fact that MLX QPs don't use the
+		 * address handle after the send is posted (this is
+		 * wrong following the IB spec strictly, but we know
+		 * it's OK for our devices).
+		 */
+		spin_lock_irqsave(&dev->sm_lock, flags);
+		memcpy(send_buf->mad, mad, sizeof *mad);
+		if ((send_buf->ah = dev->sm_ah[port_num - 1]))
+			ret = ib_post_send_mad(send_buf, NULL);
+		else
+			ret = -EINVAL;
+		spin_unlock_irqrestore(&dev->sm_lock, flags);
+
+		if (ret)
+			ib_free_send_mad(send_buf);
+	}
+}
+
+int mthca_process_mad(struct ib_device *ibdev,
+		      int mad_flags,
+		      u8 port_num,
+		      struct ib_wc *in_wc,
+		      struct ib_grh *in_grh,
+		      struct ib_mad *in_mad,
+		      struct ib_mad *out_mad)
+{
+	int err;
+	u16 slid = in_wc ? in_wc->slid : be16_to_cpu(IB_LID_PERMISSIVE);
+	u16 prev_lid = 0;
+	struct ib_port_attr pattr;
+
+	/* Forward locally generated traps to the SM */
+	if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP &&
+	    slid == 0) {
+		forward_trap(to_mdev(ibdev), port_num, in_mad);
+		return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+	}
+
+	/*
+	 * Only handle SM gets, sets and trap represses for SM class
+	 *
+	 * Only handle PMA and Mellanox vendor-specific class gets and
+	 * sets for other classes.
+	 */
+	if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED ||
+	    in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
+		if (in_mad->mad_hdr.method   != IB_MGMT_METHOD_GET &&
+		    in_mad->mad_hdr.method   != IB_MGMT_METHOD_SET &&
+		    in_mad->mad_hdr.method   != IB_MGMT_METHOD_TRAP_REPRESS)
+			return IB_MAD_RESULT_SUCCESS;
+
+		/*
+		 * Don't process SMInfo queries or vendor-specific
+		 * MADs -- the SMA can't handle them.
+		 */
+		if (in_mad->mad_hdr.attr_id == IB_SMP_ATTR_SM_INFO ||
+		    ((in_mad->mad_hdr.attr_id & IB_SMP_ATTR_VENDOR_MASK) ==
+		     IB_SMP_ATTR_VENDOR_MASK))
+			return IB_MAD_RESULT_SUCCESS;
+	} else if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT ||
+		   in_mad->mad_hdr.mgmt_class == MTHCA_VENDOR_CLASS1     ||
+		   in_mad->mad_hdr.mgmt_class == MTHCA_VENDOR_CLASS2) {
+		if (in_mad->mad_hdr.method  != IB_MGMT_METHOD_GET &&
+		    in_mad->mad_hdr.method  != IB_MGMT_METHOD_SET)
+			return IB_MAD_RESULT_SUCCESS;
+	} else
+		return IB_MAD_RESULT_SUCCESS;
+	if ((in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED ||
+	     in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) &&
+	    in_mad->mad_hdr.method == IB_MGMT_METHOD_SET &&
+	    in_mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO &&
+	    !ib_query_port(ibdev, port_num, &pattr))
+		prev_lid = pattr.lid;
+
+	err = mthca_MAD_IFC(to_mdev(ibdev),
+			    mad_flags & IB_MAD_IGNORE_MKEY,
+			    mad_flags & IB_MAD_IGNORE_BKEY,
+			    port_num, in_wc, in_grh, in_mad, out_mad);
+	if (err == -EBADMSG)
+		return IB_MAD_RESULT_SUCCESS;
+	else if (err) {
+		mthca_err(to_mdev(ibdev), "MAD_IFC returned %d\n", err);
+		return IB_MAD_RESULT_FAILURE;
+	}
+
+	if (!out_mad->mad_hdr.status) {
+		smp_snoop(ibdev, port_num, in_mad, prev_lid);
+		node_desc_override(ibdev, out_mad);
+	}
+
+	/* set return bit in status of directed route responses */
+	if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+		out_mad->mad_hdr.status |= cpu_to_be16(1 << 15);
+
+	if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP_REPRESS)
+		/* no response for trap repress */
+		return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+
+	return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
+}
+
+static void send_handler(struct ib_mad_agent *agent,
+			 struct ib_mad_send_wc *mad_send_wc)
+{
+	ib_free_send_mad(mad_send_wc->send_buf);
+}
+
+int mthca_create_agents(struct mthca_dev *dev)
+{
+	struct ib_mad_agent *agent;
+	int p, q;
+	int ret;
+
+	spin_lock_init(&dev->sm_lock);
+
+	for (p = 0; p < dev->limits.num_ports; ++p)
+		for (q = 0; q <= 1; ++q) {
+			agent = ib_register_mad_agent(&dev->ib_dev, p + 1,
+						      q ? IB_QPT_GSI : IB_QPT_SMI,
+						      NULL, 0, send_handler,
+						      NULL, NULL, 0);
+			if (IS_ERR(agent)) {
+				ret = PTR_ERR(agent);
+				goto err;
+			}
+			dev->send_agent[p][q] = agent;
+		}
+
+
+	for (p = 1; p <= dev->limits.num_ports; ++p) {
+		ret = mthca_update_rate(dev, p);
+		if (ret) {
+			mthca_err(dev, "Failed to obtain port %d rate."
+				  " aborting.\n", p);
+			goto err;
+		}
+	}
+
+	return 0;
+
+err:
+	for (p = 0; p < dev->limits.num_ports; ++p)
+		for (q = 0; q <= 1; ++q)
+			if (dev->send_agent[p][q])
+				ib_unregister_mad_agent(dev->send_agent[p][q]);
+
+	return ret;
+}
+
+void mthca_free_agents(struct mthca_dev *dev)
+{
+	struct ib_mad_agent *agent;
+	int p, q;
+
+	for (p = 0; p < dev->limits.num_ports; ++p) {
+		for (q = 0; q <= 1; ++q) {
+			agent = dev->send_agent[p][q];
+			dev->send_agent[p][q] = NULL;
+			ib_unregister_mad_agent(agent);
+		}
+
+		if (dev->sm_ah[p])
+			ib_destroy_ah(dev->sm_ah[p]);
+	}
+}
diff --git a/drivers/infiniband/hw/mthca/mthca_main.c b/drivers/infiniband/hw/mthca/mthca_main.c
new file mode 100644
index 0000000..ded76c1
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_main.c
@@ -0,0 +1,1275 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/pci.h>
+#include <linux/interrupt.h>
+#include <linux/gfp.h>
+
+#include "mthca_dev.h"
+#include "mthca_config_reg.h"
+#include "mthca_cmd.h"
+#include "mthca_profile.h"
+#include "mthca_memfree.h"
+#include "mthca_wqe.h"
+
+MODULE_AUTHOR("Roland Dreier");
+MODULE_DESCRIPTION("Mellanox InfiniBand HCA low-level driver");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_VERSION(DRV_VERSION);
+
+#ifdef CONFIG_INFINIBAND_MTHCA_DEBUG
+
+int mthca_debug_level = 0;
+module_param_named(debug_level, mthca_debug_level, int, 0644);
+MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0");
+
+#endif /* CONFIG_INFINIBAND_MTHCA_DEBUG */
+
+#ifdef CONFIG_PCI_MSI
+
+static int msi_x = 1;
+module_param(msi_x, int, 0444);
+MODULE_PARM_DESC(msi_x, "attempt to use MSI-X if nonzero");
+
+#else /* CONFIG_PCI_MSI */
+
+#define msi_x (0)
+
+#endif /* CONFIG_PCI_MSI */
+
+static int tune_pci = 0;
+module_param(tune_pci, int, 0444);
+MODULE_PARM_DESC(tune_pci, "increase PCI burst from the default set by BIOS if nonzero");
+
+DEFINE_MUTEX(mthca_device_mutex);
+
+#define MTHCA_DEFAULT_NUM_QP            (1 << 16)
+#define MTHCA_DEFAULT_RDB_PER_QP        (1 << 2)
+#define MTHCA_DEFAULT_NUM_CQ            (1 << 16)
+#define MTHCA_DEFAULT_NUM_MCG           (1 << 13)
+#define MTHCA_DEFAULT_NUM_MPT           (1 << 17)
+#define MTHCA_DEFAULT_NUM_MTT           (1 << 20)
+#define MTHCA_DEFAULT_NUM_UDAV          (1 << 15)
+#define MTHCA_DEFAULT_NUM_RESERVED_MTTS (1 << 18)
+#define MTHCA_DEFAULT_NUM_UARC_SIZE     (1 << 18)
+
+static struct mthca_profile hca_profile = {
+	.num_qp             = MTHCA_DEFAULT_NUM_QP,
+	.rdb_per_qp         = MTHCA_DEFAULT_RDB_PER_QP,
+	.num_cq             = MTHCA_DEFAULT_NUM_CQ,
+	.num_mcg            = MTHCA_DEFAULT_NUM_MCG,
+	.num_mpt            = MTHCA_DEFAULT_NUM_MPT,
+	.num_mtt            = MTHCA_DEFAULT_NUM_MTT,
+	.num_udav           = MTHCA_DEFAULT_NUM_UDAV,          /* Tavor only */
+	.fmr_reserved_mtts  = MTHCA_DEFAULT_NUM_RESERVED_MTTS, /* Tavor only */
+	.uarc_size          = MTHCA_DEFAULT_NUM_UARC_SIZE,     /* Arbel only */
+};
+
+module_param_named(num_qp, hca_profile.num_qp, int, 0444);
+MODULE_PARM_DESC(num_qp, "maximum number of QPs per HCA");
+
+module_param_named(rdb_per_qp, hca_profile.rdb_per_qp, int, 0444);
+MODULE_PARM_DESC(rdb_per_qp, "number of RDB buffers per QP");
+
+module_param_named(num_cq, hca_profile.num_cq, int, 0444);
+MODULE_PARM_DESC(num_cq, "maximum number of CQs per HCA");
+
+module_param_named(num_mcg, hca_profile.num_mcg, int, 0444);
+MODULE_PARM_DESC(num_mcg, "maximum number of multicast groups per HCA");
+
+module_param_named(num_mpt, hca_profile.num_mpt, int, 0444);
+MODULE_PARM_DESC(num_mpt,
+		"maximum number of memory protection table entries per HCA");
+
+module_param_named(num_mtt, hca_profile.num_mtt, int, 0444);
+MODULE_PARM_DESC(num_mtt,
+		 "maximum number of memory translation table segments per HCA");
+
+module_param_named(num_udav, hca_profile.num_udav, int, 0444);
+MODULE_PARM_DESC(num_udav, "maximum number of UD address vectors per HCA");
+
+module_param_named(fmr_reserved_mtts, hca_profile.fmr_reserved_mtts, int, 0444);
+MODULE_PARM_DESC(fmr_reserved_mtts,
+		 "number of memory translation table segments reserved for FMR");
+
+static int log_mtts_per_seg = ilog2(MTHCA_MTT_SEG_SIZE / 8);
+module_param_named(log_mtts_per_seg, log_mtts_per_seg, int, 0444);
+MODULE_PARM_DESC(log_mtts_per_seg, "Log2 number of MTT entries per segment (1-5)");
+
+static char mthca_version[] =
+	DRV_NAME ": Mellanox InfiniBand HCA driver v"
+	DRV_VERSION " (" DRV_RELDATE ")\n";
+
+static int mthca_tune_pci(struct mthca_dev *mdev)
+{
+	if (!tune_pci)
+		return 0;
+
+	/* First try to max out Read Byte Count */
+	if (pci_find_capability(mdev->pdev, PCI_CAP_ID_PCIX)) {
+		if (pcix_set_mmrbc(mdev->pdev, pcix_get_max_mmrbc(mdev->pdev))) {
+			mthca_err(mdev, "Couldn't set PCI-X max read count, "
+				"aborting.\n");
+			return -ENODEV;
+		}
+	} else if (!(mdev->mthca_flags & MTHCA_FLAG_PCIE))
+		mthca_info(mdev, "No PCI-X capability, not setting RBC.\n");
+
+	if (pci_is_pcie(mdev->pdev)) {
+		if (pcie_set_readrq(mdev->pdev, 4096)) {
+			mthca_err(mdev, "Couldn't write PCI Express read request, "
+				"aborting.\n");
+			return -ENODEV;
+		}
+	} else if (mdev->mthca_flags & MTHCA_FLAG_PCIE)
+		mthca_info(mdev, "No PCI Express capability, "
+			   "not setting Max Read Request Size.\n");
+
+	return 0;
+}
+
+static int mthca_dev_lim(struct mthca_dev *mdev, struct mthca_dev_lim *dev_lim)
+{
+	int err;
+
+	mdev->limits.mtt_seg_size = (1 << log_mtts_per_seg) * 8;
+	err = mthca_QUERY_DEV_LIM(mdev, dev_lim);
+	if (err) {
+		mthca_err(mdev, "QUERY_DEV_LIM command returned %d"
+				", aborting.\n", err);
+		return err;
+	}
+	if (dev_lim->min_page_sz > PAGE_SIZE) {
+		mthca_err(mdev, "HCA minimum page size of %d bigger than "
+			  "kernel PAGE_SIZE of %ld, aborting.\n",
+			  dev_lim->min_page_sz, PAGE_SIZE);
+		return -ENODEV;
+	}
+	if (dev_lim->num_ports > MTHCA_MAX_PORTS) {
+		mthca_err(mdev, "HCA has %d ports, but we only support %d, "
+			  "aborting.\n",
+			  dev_lim->num_ports, MTHCA_MAX_PORTS);
+		return -ENODEV;
+	}
+
+	if (dev_lim->uar_size > pci_resource_len(mdev->pdev, 2)) {
+		mthca_err(mdev, "HCA reported UAR size of 0x%x bigger than "
+			  "PCI resource 2 size of 0x%llx, aborting.\n",
+			  dev_lim->uar_size,
+			  (unsigned long long)pci_resource_len(mdev->pdev, 2));
+		return -ENODEV;
+	}
+
+	mdev->limits.num_ports      	= dev_lim->num_ports;
+	mdev->limits.vl_cap             = dev_lim->max_vl;
+	mdev->limits.mtu_cap            = dev_lim->max_mtu;
+	mdev->limits.gid_table_len  	= dev_lim->max_gids;
+	mdev->limits.pkey_table_len 	= dev_lim->max_pkeys;
+	mdev->limits.local_ca_ack_delay = dev_lim->local_ca_ack_delay;
+	/*
+	 * Need to allow for worst case send WQE overhead and check
+	 * whether max_desc_sz imposes a lower limit than max_sg; UD
+	 * send has the biggest overhead.
+	 */
+	mdev->limits.max_sg		= min_t(int, dev_lim->max_sg,
+					      (dev_lim->max_desc_sz -
+					       sizeof (struct mthca_next_seg) -
+					       (mthca_is_memfree(mdev) ?
+						sizeof (struct mthca_arbel_ud_seg) :
+						sizeof (struct mthca_tavor_ud_seg))) /
+						sizeof (struct mthca_data_seg));
+	mdev->limits.max_wqes           = dev_lim->max_qp_sz;
+	mdev->limits.max_qp_init_rdma   = dev_lim->max_requester_per_qp;
+	mdev->limits.reserved_qps       = dev_lim->reserved_qps;
+	mdev->limits.max_srq_wqes       = dev_lim->max_srq_sz;
+	mdev->limits.reserved_srqs      = dev_lim->reserved_srqs;
+	mdev->limits.reserved_eecs      = dev_lim->reserved_eecs;
+	mdev->limits.max_desc_sz        = dev_lim->max_desc_sz;
+	mdev->limits.max_srq_sge	= mthca_max_srq_sge(mdev);
+	/*
+	 * Subtract 1 from the limit because we need to allocate a
+	 * spare CQE so the HCA HW can tell the difference between an
+	 * empty CQ and a full CQ.
+	 */
+	mdev->limits.max_cqes           = dev_lim->max_cq_sz - 1;
+	mdev->limits.reserved_cqs       = dev_lim->reserved_cqs;
+	mdev->limits.reserved_eqs       = dev_lim->reserved_eqs;
+	mdev->limits.reserved_mtts      = dev_lim->reserved_mtts;
+	mdev->limits.reserved_mrws      = dev_lim->reserved_mrws;
+	mdev->limits.reserved_uars      = dev_lim->reserved_uars;
+	mdev->limits.reserved_pds       = dev_lim->reserved_pds;
+	mdev->limits.port_width_cap     = dev_lim->max_port_width;
+	mdev->limits.page_size_cap      = ~(u32) (dev_lim->min_page_sz - 1);
+	mdev->limits.flags              = dev_lim->flags;
+	/*
+	 * For old FW that doesn't return static rate support, use a
+	 * value of 0x3 (only static rate values of 0 or 1 are handled),
+	 * except on Sinai, where even old FW can handle static rate
+	 * values of 2 and 3.
+	 */
+	if (dev_lim->stat_rate_support)
+		mdev->limits.stat_rate_support = dev_lim->stat_rate_support;
+	else if (mdev->mthca_flags & MTHCA_FLAG_SINAI_OPT)
+		mdev->limits.stat_rate_support = 0xf;
+	else
+		mdev->limits.stat_rate_support = 0x3;
+
+	/* IB_DEVICE_RESIZE_MAX_WR not supported by driver.
+	   May be doable since hardware supports it for SRQ.
+
+	   IB_DEVICE_N_NOTIFY_CQ is supported by hardware but not by driver.
+
+	   IB_DEVICE_SRQ_RESIZE is supported by hardware but SRQ is not
+	   supported by driver. */
+	mdev->device_cap_flags = IB_DEVICE_CHANGE_PHY_PORT |
+		IB_DEVICE_PORT_ACTIVE_EVENT |
+		IB_DEVICE_SYS_IMAGE_GUID |
+		IB_DEVICE_RC_RNR_NAK_GEN;
+
+	if (dev_lim->flags & DEV_LIM_FLAG_BAD_PKEY_CNTR)
+		mdev->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR;
+
+	if (dev_lim->flags & DEV_LIM_FLAG_BAD_QKEY_CNTR)
+		mdev->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR;
+
+	if (dev_lim->flags & DEV_LIM_FLAG_RAW_MULTI)
+		mdev->device_cap_flags |= IB_DEVICE_RAW_MULTI;
+
+	if (dev_lim->flags & DEV_LIM_FLAG_AUTO_PATH_MIG)
+		mdev->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG;
+
+	if (dev_lim->flags & DEV_LIM_FLAG_UD_AV_PORT_ENFORCE)
+		mdev->device_cap_flags |= IB_DEVICE_UD_AV_PORT_ENFORCE;
+
+	if (dev_lim->flags & DEV_LIM_FLAG_SRQ)
+		mdev->mthca_flags |= MTHCA_FLAG_SRQ;
+
+	if (mthca_is_memfree(mdev))
+		if (dev_lim->flags & DEV_LIM_FLAG_IPOIB_CSUM)
+			mdev->device_cap_flags |= IB_DEVICE_UD_IP_CSUM;
+
+	return 0;
+}
+
+static int mthca_init_tavor(struct mthca_dev *mdev)
+{
+	s64 size;
+	int err;
+	struct mthca_dev_lim        dev_lim;
+	struct mthca_profile        profile;
+	struct mthca_init_hca_param init_hca;
+
+	err = mthca_SYS_EN(mdev);
+	if (err) {
+		mthca_err(mdev, "SYS_EN command returned %d, aborting.\n", err);
+		return err;
+	}
+
+	err = mthca_QUERY_FW(mdev);
+	if (err) {
+		mthca_err(mdev, "QUERY_FW command returned %d,"
+				" aborting.\n", err);
+		goto err_disable;
+	}
+	err = mthca_QUERY_DDR(mdev);
+	if (err) {
+		mthca_err(mdev, "QUERY_DDR command returned %d, aborting.\n", err);
+		goto err_disable;
+	}
+
+	err = mthca_dev_lim(mdev, &dev_lim);
+	if (err) {
+		mthca_err(mdev, "QUERY_DEV_LIM command returned %d, aborting.\n", err);
+		goto err_disable;
+	}
+
+	profile = hca_profile;
+	profile.num_uar   = dev_lim.uar_size / PAGE_SIZE;
+	profile.uarc_size = 0;
+	if (mdev->mthca_flags & MTHCA_FLAG_SRQ)
+		profile.num_srq = dev_lim.max_srqs;
+
+	size = mthca_make_profile(mdev, &profile, &dev_lim, &init_hca);
+	if (size < 0) {
+		err = size;
+		goto err_disable;
+	}
+
+	err = mthca_INIT_HCA(mdev, &init_hca);
+	if (err) {
+		mthca_err(mdev, "INIT_HCA command returned %d, aborting.\n", err);
+		goto err_disable;
+	}
+
+	return 0;
+
+err_disable:
+	mthca_SYS_DIS(mdev);
+
+	return err;
+}
+
+static int mthca_load_fw(struct mthca_dev *mdev)
+{
+	int err;
+
+	/* FIXME: use HCA-attached memory for FW if present */
+
+	mdev->fw.arbel.fw_icm =
+		mthca_alloc_icm(mdev, mdev->fw.arbel.fw_pages,
+				GFP_HIGHUSER | __GFP_NOWARN, 0);
+	if (!mdev->fw.arbel.fw_icm) {
+		mthca_err(mdev, "Couldn't allocate FW area, aborting.\n");
+		return -ENOMEM;
+	}
+
+	err = mthca_MAP_FA(mdev, mdev->fw.arbel.fw_icm);
+	if (err) {
+		mthca_err(mdev, "MAP_FA command returned %d, aborting.\n", err);
+		goto err_free;
+	}
+	err = mthca_RUN_FW(mdev);
+	if (err) {
+		mthca_err(mdev, "RUN_FW command returned %d, aborting.\n", err);
+		goto err_unmap_fa;
+	}
+
+	return 0;
+
+err_unmap_fa:
+	mthca_UNMAP_FA(mdev);
+
+err_free:
+	mthca_free_icm(mdev, mdev->fw.arbel.fw_icm, 0);
+	return err;
+}
+
+static int mthca_init_icm(struct mthca_dev *mdev,
+			  struct mthca_dev_lim *dev_lim,
+			  struct mthca_init_hca_param *init_hca,
+			  u64 icm_size)
+{
+	u64 aux_pages;
+	int err;
+
+	err = mthca_SET_ICM_SIZE(mdev, icm_size, &aux_pages);
+	if (err) {
+		mthca_err(mdev, "SET_ICM_SIZE command returned %d, aborting.\n", err);
+		return err;
+	}
+
+	mthca_dbg(mdev, "%lld KB of HCA context requires %lld KB aux memory.\n",
+		  (unsigned long long) icm_size >> 10,
+		  (unsigned long long) aux_pages << 2);
+
+	mdev->fw.arbel.aux_icm = mthca_alloc_icm(mdev, aux_pages,
+						 GFP_HIGHUSER | __GFP_NOWARN, 0);
+	if (!mdev->fw.arbel.aux_icm) {
+		mthca_err(mdev, "Couldn't allocate aux memory, aborting.\n");
+		return -ENOMEM;
+	}
+
+	err = mthca_MAP_ICM_AUX(mdev, mdev->fw.arbel.aux_icm);
+	if (err) {
+		mthca_err(mdev, "MAP_ICM_AUX returned %d, aborting.\n", err);
+		goto err_free_aux;
+	}
+
+	err = mthca_map_eq_icm(mdev, init_hca->eqc_base);
+	if (err) {
+		mthca_err(mdev, "Failed to map EQ context memory, aborting.\n");
+		goto err_unmap_aux;
+	}
+
+	/* CPU writes to non-reserved MTTs, while HCA might DMA to reserved mtts */
+	mdev->limits.reserved_mtts = ALIGN(mdev->limits.reserved_mtts * mdev->limits.mtt_seg_size,
+					   dma_get_cache_alignment()) / mdev->limits.mtt_seg_size;
+
+	mdev->mr_table.mtt_table = mthca_alloc_icm_table(mdev, init_hca->mtt_base,
+							 mdev->limits.mtt_seg_size,
+							 mdev->limits.num_mtt_segs,
+							 mdev->limits.reserved_mtts,
+							 1, 0);
+	if (!mdev->mr_table.mtt_table) {
+		mthca_err(mdev, "Failed to map MTT context memory, aborting.\n");
+		err = -ENOMEM;
+		goto err_unmap_eq;
+	}
+
+	mdev->mr_table.mpt_table = mthca_alloc_icm_table(mdev, init_hca->mpt_base,
+							 dev_lim->mpt_entry_sz,
+							 mdev->limits.num_mpts,
+							 mdev->limits.reserved_mrws,
+							 1, 1);
+	if (!mdev->mr_table.mpt_table) {
+		mthca_err(mdev, "Failed to map MPT context memory, aborting.\n");
+		err = -ENOMEM;
+		goto err_unmap_mtt;
+	}
+
+	mdev->qp_table.qp_table = mthca_alloc_icm_table(mdev, init_hca->qpc_base,
+							dev_lim->qpc_entry_sz,
+							mdev->limits.num_qps,
+							mdev->limits.reserved_qps,
+							0, 0);
+	if (!mdev->qp_table.qp_table) {
+		mthca_err(mdev, "Failed to map QP context memory, aborting.\n");
+		err = -ENOMEM;
+		goto err_unmap_mpt;
+	}
+
+	mdev->qp_table.eqp_table = mthca_alloc_icm_table(mdev, init_hca->eqpc_base,
+							 dev_lim->eqpc_entry_sz,
+							 mdev->limits.num_qps,
+							 mdev->limits.reserved_qps,
+							 0, 0);
+	if (!mdev->qp_table.eqp_table) {
+		mthca_err(mdev, "Failed to map EQP context memory, aborting.\n");
+		err = -ENOMEM;
+		goto err_unmap_qp;
+	}
+
+	mdev->qp_table.rdb_table = mthca_alloc_icm_table(mdev, init_hca->rdb_base,
+							 MTHCA_RDB_ENTRY_SIZE,
+							 mdev->limits.num_qps <<
+							 mdev->qp_table.rdb_shift, 0,
+							 0, 0);
+	if (!mdev->qp_table.rdb_table) {
+		mthca_err(mdev, "Failed to map RDB context memory, aborting\n");
+		err = -ENOMEM;
+		goto err_unmap_eqp;
+	}
+
+       mdev->cq_table.table = mthca_alloc_icm_table(mdev, init_hca->cqc_base,
+						    dev_lim->cqc_entry_sz,
+						    mdev->limits.num_cqs,
+						    mdev->limits.reserved_cqs,
+						    0, 0);
+	if (!mdev->cq_table.table) {
+		mthca_err(mdev, "Failed to map CQ context memory, aborting.\n");
+		err = -ENOMEM;
+		goto err_unmap_rdb;
+	}
+
+	if (mdev->mthca_flags & MTHCA_FLAG_SRQ) {
+		mdev->srq_table.table =
+			mthca_alloc_icm_table(mdev, init_hca->srqc_base,
+					      dev_lim->srq_entry_sz,
+					      mdev->limits.num_srqs,
+					      mdev->limits.reserved_srqs,
+					      0, 0);
+		if (!mdev->srq_table.table) {
+			mthca_err(mdev, "Failed to map SRQ context memory, "
+				  "aborting.\n");
+			err = -ENOMEM;
+			goto err_unmap_cq;
+		}
+	}
+
+	/*
+	 * It's not strictly required, but for simplicity just map the
+	 * whole multicast group table now.  The table isn't very big
+	 * and it's a lot easier than trying to track ref counts.
+	 */
+	mdev->mcg_table.table = mthca_alloc_icm_table(mdev, init_hca->mc_base,
+						      MTHCA_MGM_ENTRY_SIZE,
+						      mdev->limits.num_mgms +
+						      mdev->limits.num_amgms,
+						      mdev->limits.num_mgms +
+						      mdev->limits.num_amgms,
+						      0, 0);
+	if (!mdev->mcg_table.table) {
+		mthca_err(mdev, "Failed to map MCG context memory, aborting.\n");
+		err = -ENOMEM;
+		goto err_unmap_srq;
+	}
+
+	return 0;
+
+err_unmap_srq:
+	if (mdev->mthca_flags & MTHCA_FLAG_SRQ)
+		mthca_free_icm_table(mdev, mdev->srq_table.table);
+
+err_unmap_cq:
+	mthca_free_icm_table(mdev, mdev->cq_table.table);
+
+err_unmap_rdb:
+	mthca_free_icm_table(mdev, mdev->qp_table.rdb_table);
+
+err_unmap_eqp:
+	mthca_free_icm_table(mdev, mdev->qp_table.eqp_table);
+
+err_unmap_qp:
+	mthca_free_icm_table(mdev, mdev->qp_table.qp_table);
+
+err_unmap_mpt:
+	mthca_free_icm_table(mdev, mdev->mr_table.mpt_table);
+
+err_unmap_mtt:
+	mthca_free_icm_table(mdev, mdev->mr_table.mtt_table);
+
+err_unmap_eq:
+	mthca_unmap_eq_icm(mdev);
+
+err_unmap_aux:
+	mthca_UNMAP_ICM_AUX(mdev);
+
+err_free_aux:
+	mthca_free_icm(mdev, mdev->fw.arbel.aux_icm, 0);
+
+	return err;
+}
+
+static void mthca_free_icms(struct mthca_dev *mdev)
+{
+
+	mthca_free_icm_table(mdev, mdev->mcg_table.table);
+	if (mdev->mthca_flags & MTHCA_FLAG_SRQ)
+		mthca_free_icm_table(mdev, mdev->srq_table.table);
+	mthca_free_icm_table(mdev, mdev->cq_table.table);
+	mthca_free_icm_table(mdev, mdev->qp_table.rdb_table);
+	mthca_free_icm_table(mdev, mdev->qp_table.eqp_table);
+	mthca_free_icm_table(mdev, mdev->qp_table.qp_table);
+	mthca_free_icm_table(mdev, mdev->mr_table.mpt_table);
+	mthca_free_icm_table(mdev, mdev->mr_table.mtt_table);
+	mthca_unmap_eq_icm(mdev);
+
+	mthca_UNMAP_ICM_AUX(mdev);
+	mthca_free_icm(mdev, mdev->fw.arbel.aux_icm, 0);
+}
+
+static int mthca_init_arbel(struct mthca_dev *mdev)
+{
+	struct mthca_dev_lim        dev_lim;
+	struct mthca_profile        profile;
+	struct mthca_init_hca_param init_hca;
+	s64 icm_size;
+	int err;
+
+	err = mthca_QUERY_FW(mdev);
+	if (err) {
+		mthca_err(mdev, "QUERY_FW command failed %d, aborting.\n", err);
+		return err;
+	}
+
+	err = mthca_ENABLE_LAM(mdev);
+	if (err == -EAGAIN) {
+		mthca_dbg(mdev, "No HCA-attached memory (running in MemFree mode)\n");
+		mdev->mthca_flags |= MTHCA_FLAG_NO_LAM;
+	} else if (err) {
+		mthca_err(mdev, "ENABLE_LAM returned %d, aborting.\n", err);
+		return err;
+	}
+
+	err = mthca_load_fw(mdev);
+	if (err) {
+		mthca_err(mdev, "Loading FW returned %d, aborting.\n", err);
+		goto err_disable;
+	}
+
+	err = mthca_dev_lim(mdev, &dev_lim);
+	if (err) {
+		mthca_err(mdev, "QUERY_DEV_LIM returned %d, aborting.\n", err);
+		goto err_stop_fw;
+	}
+
+	profile = hca_profile;
+	profile.num_uar  = dev_lim.uar_size / PAGE_SIZE;
+	profile.num_udav = 0;
+	if (mdev->mthca_flags & MTHCA_FLAG_SRQ)
+		profile.num_srq = dev_lim.max_srqs;
+
+	icm_size = mthca_make_profile(mdev, &profile, &dev_lim, &init_hca);
+	if (icm_size < 0) {
+		err = icm_size;
+		goto err_stop_fw;
+	}
+
+	err = mthca_init_icm(mdev, &dev_lim, &init_hca, icm_size);
+	if (err)
+		goto err_stop_fw;
+
+	err = mthca_INIT_HCA(mdev, &init_hca);
+	if (err) {
+		mthca_err(mdev, "INIT_HCA command returned %d, aborting.\n", err);
+		goto err_free_icm;
+	}
+
+	return 0;
+
+err_free_icm:
+	mthca_free_icms(mdev);
+
+err_stop_fw:
+	mthca_UNMAP_FA(mdev);
+	mthca_free_icm(mdev, mdev->fw.arbel.fw_icm, 0);
+
+err_disable:
+	if (!(mdev->mthca_flags & MTHCA_FLAG_NO_LAM))
+		mthca_DISABLE_LAM(mdev);
+
+	return err;
+}
+
+static void mthca_close_hca(struct mthca_dev *mdev)
+{
+	mthca_CLOSE_HCA(mdev, 0);
+
+	if (mthca_is_memfree(mdev)) {
+		mthca_free_icms(mdev);
+
+		mthca_UNMAP_FA(mdev);
+		mthca_free_icm(mdev, mdev->fw.arbel.fw_icm, 0);
+
+		if (!(mdev->mthca_flags & MTHCA_FLAG_NO_LAM))
+			mthca_DISABLE_LAM(mdev);
+	} else
+		mthca_SYS_DIS(mdev);
+}
+
+static int mthca_init_hca(struct mthca_dev *mdev)
+{
+	int err;
+	struct mthca_adapter adapter;
+
+	if (mthca_is_memfree(mdev))
+		err = mthca_init_arbel(mdev);
+	else
+		err = mthca_init_tavor(mdev);
+
+	if (err)
+		return err;
+
+	err = mthca_QUERY_ADAPTER(mdev, &adapter);
+	if (err) {
+		mthca_err(mdev, "QUERY_ADAPTER command returned %d, aborting.\n", err);
+		goto err_close;
+	}
+
+	mdev->eq_table.inta_pin = adapter.inta_pin;
+	if (!mthca_is_memfree(mdev))
+		mdev->rev_id = adapter.revision_id;
+	memcpy(mdev->board_id, adapter.board_id, sizeof mdev->board_id);
+
+	return 0;
+
+err_close:
+	mthca_close_hca(mdev);
+	return err;
+}
+
+static int mthca_setup_hca(struct mthca_dev *dev)
+{
+	int err;
+
+	MTHCA_INIT_DOORBELL_LOCK(&dev->doorbell_lock);
+
+	err = mthca_init_uar_table(dev);
+	if (err) {
+		mthca_err(dev, "Failed to initialize "
+			  "user access region table, aborting.\n");
+		return err;
+	}
+
+	err = mthca_uar_alloc(dev, &dev->driver_uar);
+	if (err) {
+		mthca_err(dev, "Failed to allocate driver access region, "
+			  "aborting.\n");
+		goto err_uar_table_free;
+	}
+
+	dev->kar = ioremap((phys_addr_t) dev->driver_uar.pfn << PAGE_SHIFT, PAGE_SIZE);
+	if (!dev->kar) {
+		mthca_err(dev, "Couldn't map kernel access region, "
+			  "aborting.\n");
+		err = -ENOMEM;
+		goto err_uar_free;
+	}
+
+	err = mthca_init_pd_table(dev);
+	if (err) {
+		mthca_err(dev, "Failed to initialize "
+			  "protection domain table, aborting.\n");
+		goto err_kar_unmap;
+	}
+
+	err = mthca_init_mr_table(dev);
+	if (err) {
+		mthca_err(dev, "Failed to initialize "
+			  "memory region table, aborting.\n");
+		goto err_pd_table_free;
+	}
+
+	err = mthca_pd_alloc(dev, 1, &dev->driver_pd);
+	if (err) {
+		mthca_err(dev, "Failed to create driver PD, "
+			  "aborting.\n");
+		goto err_mr_table_free;
+	}
+
+	err = mthca_init_eq_table(dev);
+	if (err) {
+		mthca_err(dev, "Failed to initialize "
+			  "event queue table, aborting.\n");
+		goto err_pd_free;
+	}
+
+	err = mthca_cmd_use_events(dev);
+	if (err) {
+		mthca_err(dev, "Failed to switch to event-driven "
+			  "firmware commands, aborting.\n");
+		goto err_eq_table_free;
+	}
+
+	err = mthca_NOP(dev);
+	if (err) {
+		if (dev->mthca_flags & MTHCA_FLAG_MSI_X) {
+			mthca_warn(dev, "NOP command failed to generate interrupt "
+				   "(IRQ %d).\n",
+				   dev->eq_table.eq[MTHCA_EQ_CMD].msi_x_vector);
+			mthca_warn(dev, "Trying again with MSI-X disabled.\n");
+		} else {
+			mthca_err(dev, "NOP command failed to generate interrupt "
+				  "(IRQ %d), aborting.\n",
+				  dev->pdev->irq);
+			mthca_err(dev, "BIOS or ACPI interrupt routing problem?\n");
+		}
+
+		goto err_cmd_poll;
+	}
+
+	mthca_dbg(dev, "NOP command IRQ test passed\n");
+
+	err = mthca_init_cq_table(dev);
+	if (err) {
+		mthca_err(dev, "Failed to initialize "
+			  "completion queue table, aborting.\n");
+		goto err_cmd_poll;
+	}
+
+	err = mthca_init_srq_table(dev);
+	if (err) {
+		mthca_err(dev, "Failed to initialize "
+			  "shared receive queue table, aborting.\n");
+		goto err_cq_table_free;
+	}
+
+	err = mthca_init_qp_table(dev);
+	if (err) {
+		mthca_err(dev, "Failed to initialize "
+			  "queue pair table, aborting.\n");
+		goto err_srq_table_free;
+	}
+
+	err = mthca_init_av_table(dev);
+	if (err) {
+		mthca_err(dev, "Failed to initialize "
+			  "address vector table, aborting.\n");
+		goto err_qp_table_free;
+	}
+
+	err = mthca_init_mcg_table(dev);
+	if (err) {
+		mthca_err(dev, "Failed to initialize "
+			  "multicast group table, aborting.\n");
+		goto err_av_table_free;
+	}
+
+	return 0;
+
+err_av_table_free:
+	mthca_cleanup_av_table(dev);
+
+err_qp_table_free:
+	mthca_cleanup_qp_table(dev);
+
+err_srq_table_free:
+	mthca_cleanup_srq_table(dev);
+
+err_cq_table_free:
+	mthca_cleanup_cq_table(dev);
+
+err_cmd_poll:
+	mthca_cmd_use_polling(dev);
+
+err_eq_table_free:
+	mthca_cleanup_eq_table(dev);
+
+err_pd_free:
+	mthca_pd_free(dev, &dev->driver_pd);
+
+err_mr_table_free:
+	mthca_cleanup_mr_table(dev);
+
+err_pd_table_free:
+	mthca_cleanup_pd_table(dev);
+
+err_kar_unmap:
+	iounmap(dev->kar);
+
+err_uar_free:
+	mthca_uar_free(dev, &dev->driver_uar);
+
+err_uar_table_free:
+	mthca_cleanup_uar_table(dev);
+	return err;
+}
+
+static int mthca_enable_msi_x(struct mthca_dev *mdev)
+{
+	struct msix_entry entries[3];
+	int err;
+
+	entries[0].entry = 0;
+	entries[1].entry = 1;
+	entries[2].entry = 2;
+
+	err = pci_enable_msix_exact(mdev->pdev, entries, ARRAY_SIZE(entries));
+	if (err)
+		return err;
+
+	mdev->eq_table.eq[MTHCA_EQ_COMP ].msi_x_vector = entries[0].vector;
+	mdev->eq_table.eq[MTHCA_EQ_ASYNC].msi_x_vector = entries[1].vector;
+	mdev->eq_table.eq[MTHCA_EQ_CMD  ].msi_x_vector = entries[2].vector;
+
+	return 0;
+}
+
+/* Types of supported HCA */
+enum {
+	TAVOR,			/* MT23108                        */
+	ARBEL_COMPAT,		/* MT25208 in Tavor compat mode   */
+	ARBEL_NATIVE,		/* MT25208 with extended features */
+	SINAI			/* MT25204 */
+};
+
+#define MTHCA_FW_VER(major, minor, subminor) \
+	(((u64) (major) << 32) | ((u64) (minor) << 16) | (u64) (subminor))
+
+static struct {
+	u64 latest_fw;
+	u32 flags;
+} mthca_hca_table[] = {
+	[TAVOR]        = { .latest_fw = MTHCA_FW_VER(3, 5, 0),
+			   .flags     = 0 },
+	[ARBEL_COMPAT] = { .latest_fw = MTHCA_FW_VER(4, 8, 200),
+			   .flags     = MTHCA_FLAG_PCIE },
+	[ARBEL_NATIVE] = { .latest_fw = MTHCA_FW_VER(5, 3, 0),
+			   .flags     = MTHCA_FLAG_MEMFREE |
+					MTHCA_FLAG_PCIE },
+	[SINAI]        = { .latest_fw = MTHCA_FW_VER(1, 2, 0),
+			   .flags     = MTHCA_FLAG_MEMFREE |
+					MTHCA_FLAG_PCIE    |
+					MTHCA_FLAG_SINAI_OPT }
+};
+
+static int __mthca_init_one(struct pci_dev *pdev, int hca_type)
+{
+	int ddr_hidden = 0;
+	int err;
+	struct mthca_dev *mdev;
+
+	printk(KERN_INFO PFX "Initializing %s\n",
+	       pci_name(pdev));
+
+	err = pci_enable_device(pdev);
+	if (err) {
+		dev_err(&pdev->dev, "Cannot enable PCI device, "
+			"aborting.\n");
+		return err;
+	}
+
+	/*
+	 * Check for BARs.  We expect 0: 1MB, 2: 8MB, 4: DDR (may not
+	 * be present)
+	 */
+	if (!(pci_resource_flags(pdev, 0) & IORESOURCE_MEM) ||
+	    pci_resource_len(pdev, 0) != 1 << 20) {
+		dev_err(&pdev->dev, "Missing DCS, aborting.\n");
+		err = -ENODEV;
+		goto err_disable_pdev;
+	}
+	if (!(pci_resource_flags(pdev, 2) & IORESOURCE_MEM)) {
+		dev_err(&pdev->dev, "Missing UAR, aborting.\n");
+		err = -ENODEV;
+		goto err_disable_pdev;
+	}
+	if (!(pci_resource_flags(pdev, 4) & IORESOURCE_MEM))
+		ddr_hidden = 1;
+
+	err = pci_request_regions(pdev, DRV_NAME);
+	if (err) {
+		dev_err(&pdev->dev, "Cannot obtain PCI resources, "
+			"aborting.\n");
+		goto err_disable_pdev;
+	}
+
+	pci_set_master(pdev);
+
+	err = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
+	if (err) {
+		dev_warn(&pdev->dev, "Warning: couldn't set 64-bit PCI DMA mask.\n");
+		err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
+		if (err) {
+			dev_err(&pdev->dev, "Can't set PCI DMA mask, aborting.\n");
+			goto err_free_res;
+		}
+	}
+	err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
+	if (err) {
+		dev_warn(&pdev->dev, "Warning: couldn't set 64-bit "
+			 "consistent PCI DMA mask.\n");
+		err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32));
+		if (err) {
+			dev_err(&pdev->dev, "Can't set consistent PCI DMA mask, "
+				"aborting.\n");
+			goto err_free_res;
+		}
+	}
+
+	/* We can handle large RDMA requests, so allow larger segments. */
+	dma_set_max_seg_size(&pdev->dev, 1024 * 1024 * 1024);
+
+	mdev = (struct mthca_dev *) ib_alloc_device(sizeof *mdev);
+	if (!mdev) {
+		dev_err(&pdev->dev, "Device struct alloc failed, "
+			"aborting.\n");
+		err = -ENOMEM;
+		goto err_free_res;
+	}
+
+	mdev->pdev = pdev;
+
+	mdev->mthca_flags = mthca_hca_table[hca_type].flags;
+	if (ddr_hidden)
+		mdev->mthca_flags |= MTHCA_FLAG_DDR_HIDDEN;
+
+	/*
+	 * Now reset the HCA before we touch the PCI capabilities or
+	 * attempt a firmware command, since a boot ROM may have left
+	 * the HCA in an undefined state.
+	 */
+	err = mthca_reset(mdev);
+	if (err) {
+		mthca_err(mdev, "Failed to reset HCA, aborting.\n");
+		goto err_free_dev;
+	}
+
+	if (mthca_cmd_init(mdev)) {
+		mthca_err(mdev, "Failed to init command interface, aborting.\n");
+		goto err_free_dev;
+	}
+
+	err = mthca_tune_pci(mdev);
+	if (err)
+		goto err_cmd;
+
+	err = mthca_init_hca(mdev);
+	if (err)
+		goto err_cmd;
+
+	if (mdev->fw_ver < mthca_hca_table[hca_type].latest_fw) {
+		mthca_warn(mdev, "HCA FW version %d.%d.%03d is old (%d.%d.%03d is current).\n",
+			   (int) (mdev->fw_ver >> 32), (int) (mdev->fw_ver >> 16) & 0xffff,
+			   (int) (mdev->fw_ver & 0xffff),
+			   (int) (mthca_hca_table[hca_type].latest_fw >> 32),
+			   (int) (mthca_hca_table[hca_type].latest_fw >> 16) & 0xffff,
+			   (int) (mthca_hca_table[hca_type].latest_fw & 0xffff));
+		mthca_warn(mdev, "If you have problems, try updating your HCA FW.\n");
+	}
+
+	if (msi_x && !mthca_enable_msi_x(mdev))
+		mdev->mthca_flags |= MTHCA_FLAG_MSI_X;
+
+	err = mthca_setup_hca(mdev);
+	if (err == -EBUSY && (mdev->mthca_flags & MTHCA_FLAG_MSI_X)) {
+		if (mdev->mthca_flags & MTHCA_FLAG_MSI_X)
+			pci_disable_msix(pdev);
+		mdev->mthca_flags &= ~MTHCA_FLAG_MSI_X;
+
+		err = mthca_setup_hca(mdev);
+	}
+
+	if (err)
+		goto err_close;
+
+	err = mthca_register_device(mdev);
+	if (err)
+		goto err_cleanup;
+
+	err = mthca_create_agents(mdev);
+	if (err)
+		goto err_unregister;
+
+	pci_set_drvdata(pdev, mdev);
+	mdev->hca_type = hca_type;
+
+	mdev->active = true;
+
+	return 0;
+
+err_unregister:
+	mthca_unregister_device(mdev);
+
+err_cleanup:
+	mthca_cleanup_mcg_table(mdev);
+	mthca_cleanup_av_table(mdev);
+	mthca_cleanup_qp_table(mdev);
+	mthca_cleanup_srq_table(mdev);
+	mthca_cleanup_cq_table(mdev);
+	mthca_cmd_use_polling(mdev);
+	mthca_cleanup_eq_table(mdev);
+
+	mthca_pd_free(mdev, &mdev->driver_pd);
+
+	mthca_cleanup_mr_table(mdev);
+	mthca_cleanup_pd_table(mdev);
+	mthca_cleanup_uar_table(mdev);
+
+err_close:
+	if (mdev->mthca_flags & MTHCA_FLAG_MSI_X)
+		pci_disable_msix(pdev);
+
+	mthca_close_hca(mdev);
+
+err_cmd:
+	mthca_cmd_cleanup(mdev);
+
+err_free_dev:
+	ib_dealloc_device(&mdev->ib_dev);
+
+err_free_res:
+	pci_release_regions(pdev);
+
+err_disable_pdev:
+	pci_disable_device(pdev);
+	pci_set_drvdata(pdev, NULL);
+	return err;
+}
+
+static void __mthca_remove_one(struct pci_dev *pdev)
+{
+	struct mthca_dev *mdev = pci_get_drvdata(pdev);
+	int p;
+
+	if (mdev) {
+		mthca_free_agents(mdev);
+		mthca_unregister_device(mdev);
+
+		for (p = 1; p <= mdev->limits.num_ports; ++p)
+			mthca_CLOSE_IB(mdev, p);
+
+		mthca_cleanup_mcg_table(mdev);
+		mthca_cleanup_av_table(mdev);
+		mthca_cleanup_qp_table(mdev);
+		mthca_cleanup_srq_table(mdev);
+		mthca_cleanup_cq_table(mdev);
+		mthca_cmd_use_polling(mdev);
+		mthca_cleanup_eq_table(mdev);
+
+		mthca_pd_free(mdev, &mdev->driver_pd);
+
+		mthca_cleanup_mr_table(mdev);
+		mthca_cleanup_pd_table(mdev);
+
+		iounmap(mdev->kar);
+		mthca_uar_free(mdev, &mdev->driver_uar);
+		mthca_cleanup_uar_table(mdev);
+		mthca_close_hca(mdev);
+		mthca_cmd_cleanup(mdev);
+
+		if (mdev->mthca_flags & MTHCA_FLAG_MSI_X)
+			pci_disable_msix(pdev);
+
+		ib_dealloc_device(&mdev->ib_dev);
+		pci_release_regions(pdev);
+		pci_disable_device(pdev);
+		pci_set_drvdata(pdev, NULL);
+	}
+}
+
+int __mthca_restart_one(struct pci_dev *pdev)
+{
+	struct mthca_dev *mdev;
+	int hca_type;
+
+	mdev = pci_get_drvdata(pdev);
+	if (!mdev)
+		return -ENODEV;
+	hca_type = mdev->hca_type;
+	__mthca_remove_one(pdev);
+	return __mthca_init_one(pdev, hca_type);
+}
+
+static int mthca_init_one(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+	int ret;
+
+	mutex_lock(&mthca_device_mutex);
+
+	printk_once(KERN_INFO "%s", mthca_version);
+
+	if (id->driver_data >= ARRAY_SIZE(mthca_hca_table)) {
+		printk(KERN_ERR PFX "%s has invalid driver data %lx\n",
+		       pci_name(pdev), id->driver_data);
+		mutex_unlock(&mthca_device_mutex);
+		return -ENODEV;
+	}
+
+	ret = __mthca_init_one(pdev, id->driver_data);
+
+	mutex_unlock(&mthca_device_mutex);
+
+	return ret;
+}
+
+static void mthca_remove_one(struct pci_dev *pdev)
+{
+	mutex_lock(&mthca_device_mutex);
+	__mthca_remove_one(pdev);
+	mutex_unlock(&mthca_device_mutex);
+}
+
+static struct pci_device_id mthca_pci_table[] = {
+	{ PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_TAVOR),
+	  .driver_data = TAVOR },
+	{ PCI_DEVICE(PCI_VENDOR_ID_TOPSPIN, PCI_DEVICE_ID_MELLANOX_TAVOR),
+	  .driver_data = TAVOR },
+	{ PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_ARBEL_COMPAT),
+	  .driver_data = ARBEL_COMPAT },
+	{ PCI_DEVICE(PCI_VENDOR_ID_TOPSPIN, PCI_DEVICE_ID_MELLANOX_ARBEL_COMPAT),
+	  .driver_data = ARBEL_COMPAT },
+	{ PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_ARBEL),
+	  .driver_data = ARBEL_NATIVE },
+	{ PCI_DEVICE(PCI_VENDOR_ID_TOPSPIN, PCI_DEVICE_ID_MELLANOX_ARBEL),
+	  .driver_data = ARBEL_NATIVE },
+	{ PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_SINAI),
+	  .driver_data = SINAI },
+	{ PCI_DEVICE(PCI_VENDOR_ID_TOPSPIN, PCI_DEVICE_ID_MELLANOX_SINAI),
+	  .driver_data = SINAI },
+	{ PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_SINAI_OLD),
+	  .driver_data = SINAI },
+	{ PCI_DEVICE(PCI_VENDOR_ID_TOPSPIN, PCI_DEVICE_ID_MELLANOX_SINAI_OLD),
+	  .driver_data = SINAI },
+	{ 0, }
+};
+
+MODULE_DEVICE_TABLE(pci, mthca_pci_table);
+
+static struct pci_driver mthca_driver = {
+	.name		= DRV_NAME,
+	.id_table	= mthca_pci_table,
+	.probe		= mthca_init_one,
+	.remove		= mthca_remove_one,
+};
+
+static void __init __mthca_check_profile_val(const char *name, int *pval,
+					     int pval_default)
+{
+	/* value must be positive and power of 2 */
+	int old_pval = *pval;
+
+	if (old_pval <= 0)
+		*pval = pval_default;
+	else
+		*pval = roundup_pow_of_two(old_pval);
+
+	if (old_pval != *pval) {
+		printk(KERN_WARNING PFX "Invalid value %d for %s in module parameter.\n",
+		       old_pval, name);
+		printk(KERN_WARNING PFX "Corrected %s to %d.\n", name, *pval);
+	}
+}
+
+#define mthca_check_profile_val(name, default)				\
+	__mthca_check_profile_val(#name, &hca_profile.name, default)
+
+static void __init mthca_validate_profile(void)
+{
+	mthca_check_profile_val(num_qp,            MTHCA_DEFAULT_NUM_QP);
+	mthca_check_profile_val(rdb_per_qp,        MTHCA_DEFAULT_RDB_PER_QP);
+	mthca_check_profile_val(num_cq,            MTHCA_DEFAULT_NUM_CQ);
+	mthca_check_profile_val(num_mcg, 	   MTHCA_DEFAULT_NUM_MCG);
+	mthca_check_profile_val(num_mpt, 	   MTHCA_DEFAULT_NUM_MPT);
+	mthca_check_profile_val(num_mtt, 	   MTHCA_DEFAULT_NUM_MTT);
+	mthca_check_profile_val(num_udav,          MTHCA_DEFAULT_NUM_UDAV);
+	mthca_check_profile_val(fmr_reserved_mtts, MTHCA_DEFAULT_NUM_RESERVED_MTTS);
+
+	if (hca_profile.fmr_reserved_mtts >= hca_profile.num_mtt) {
+		printk(KERN_WARNING PFX "Invalid fmr_reserved_mtts module parameter %d.\n",
+		       hca_profile.fmr_reserved_mtts);
+		printk(KERN_WARNING PFX "(Must be smaller than num_mtt %d)\n",
+		       hca_profile.num_mtt);
+		hca_profile.fmr_reserved_mtts = hca_profile.num_mtt / 2;
+		printk(KERN_WARNING PFX "Corrected fmr_reserved_mtts to %d.\n",
+		       hca_profile.fmr_reserved_mtts);
+	}
+
+	if ((log_mtts_per_seg < 1) || (log_mtts_per_seg > 5)) {
+		printk(KERN_WARNING PFX "bad log_mtts_per_seg (%d). Using default - %d\n",
+		       log_mtts_per_seg, ilog2(MTHCA_MTT_SEG_SIZE / 8));
+		log_mtts_per_seg = ilog2(MTHCA_MTT_SEG_SIZE / 8);
+	}
+}
+
+static int __init mthca_init(void)
+{
+	int ret;
+
+	mthca_validate_profile();
+
+	ret = mthca_catas_init();
+	if (ret)
+		return ret;
+
+	ret = pci_register_driver(&mthca_driver);
+	if (ret < 0) {
+		mthca_catas_cleanup();
+		return ret;
+	}
+
+	return 0;
+}
+
+static void __exit mthca_cleanup(void)
+{
+	pci_unregister_driver(&mthca_driver);
+	mthca_catas_cleanup();
+}
+
+module_init(mthca_init);
+module_exit(mthca_cleanup);
diff --git a/drivers/infiniband/hw/mthca/mthca_mcg.c b/drivers/infiniband/hw/mthca/mthca_mcg.c
new file mode 100644
index 0000000..6304ae8
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_mcg.c
@@ -0,0 +1,335 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/string.h>
+#include <linux/gfp.h>
+
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+
+struct mthca_mgm {
+	__be32 next_gid_index;
+	u32    reserved[3];
+	u8     gid[16];
+	__be32 qp[MTHCA_QP_PER_MGM];
+};
+
+static const u8 zero_gid[16];	/* automatically initialized to 0 */
+
+/*
+ * Caller must hold MCG table semaphore.  gid and mgm parameters must
+ * be properly aligned for command interface.
+ *
+ *  Returns 0 unless a firmware command error occurs.
+ *
+ * If GID is found in MGM or MGM is empty, *index = *hash, *prev = -1
+ * and *mgm holds MGM entry.
+ *
+ * if GID is found in AMGM, *index = index in AMGM, *prev = index of
+ * previous entry in hash chain and *mgm holds AMGM entry.
+ *
+ * If no AMGM exists for given gid, *index = -1, *prev = index of last
+ * entry in hash chain and *mgm holds end of hash chain.
+ */
+static int find_mgm(struct mthca_dev *dev,
+		    u8 *gid, struct mthca_mailbox *mgm_mailbox,
+		    u16 *hash, int *prev, int *index)
+{
+	struct mthca_mailbox *mailbox;
+	struct mthca_mgm *mgm = mgm_mailbox->buf;
+	u8 *mgid;
+	int err;
+
+	mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+	if (IS_ERR(mailbox))
+		return -ENOMEM;
+	mgid = mailbox->buf;
+
+	memcpy(mgid, gid, 16);
+
+	err = mthca_MGID_HASH(dev, mailbox, hash);
+	if (err) {
+		mthca_err(dev, "MGID_HASH failed (%d)\n", err);
+		goto out;
+	}
+
+	if (0)
+		mthca_dbg(dev, "Hash for %pI6 is %04x\n", gid, *hash);
+
+	*index = *hash;
+	*prev  = -1;
+
+	do {
+		err = mthca_READ_MGM(dev, *index, mgm_mailbox);
+		if (err) {
+			mthca_err(dev, "READ_MGM failed (%d)\n", err);
+			goto out;
+		}
+
+		if (!memcmp(mgm->gid, zero_gid, 16)) {
+			if (*index != *hash) {
+				mthca_err(dev, "Found zero MGID in AMGM.\n");
+				err = -EINVAL;
+			}
+			goto out;
+		}
+
+		if (!memcmp(mgm->gid, gid, 16))
+			goto out;
+
+		*prev = *index;
+		*index = be32_to_cpu(mgm->next_gid_index) >> 6;
+	} while (*index);
+
+	*index = -1;
+
+ out:
+	mthca_free_mailbox(dev, mailbox);
+	return err;
+}
+
+int mthca_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+	struct mthca_dev *dev = to_mdev(ibqp->device);
+	struct mthca_mailbox *mailbox;
+	struct mthca_mgm *mgm;
+	u16 hash;
+	int index, prev;
+	int link = 0;
+	int i;
+	int err;
+
+	mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	mgm = mailbox->buf;
+
+	mutex_lock(&dev->mcg_table.mutex);
+
+	err = find_mgm(dev, gid->raw, mailbox, &hash, &prev, &index);
+	if (err)
+		goto out;
+
+	if (index != -1) {
+		if (!memcmp(mgm->gid, zero_gid, 16))
+			memcpy(mgm->gid, gid->raw, 16);
+	} else {
+		link = 1;
+
+		index = mthca_alloc(&dev->mcg_table.alloc);
+		if (index == -1) {
+			mthca_err(dev, "No AMGM entries left\n");
+			err = -ENOMEM;
+			goto out;
+		}
+
+		err = mthca_READ_MGM(dev, index, mailbox);
+		if (err) {
+			mthca_err(dev, "READ_MGM failed (%d)\n", err);
+			goto out;
+		}
+		memset(mgm, 0, sizeof *mgm);
+		memcpy(mgm->gid, gid->raw, 16);
+	}
+
+	for (i = 0; i < MTHCA_QP_PER_MGM; ++i)
+		if (mgm->qp[i] == cpu_to_be32(ibqp->qp_num | (1 << 31))) {
+			mthca_dbg(dev, "QP %06x already a member of MGM\n",
+				  ibqp->qp_num);
+			err = 0;
+			goto out;
+		} else if (!(mgm->qp[i] & cpu_to_be32(1 << 31))) {
+			mgm->qp[i] = cpu_to_be32(ibqp->qp_num | (1 << 31));
+			break;
+		}
+
+	if (i == MTHCA_QP_PER_MGM) {
+		mthca_err(dev, "MGM at index %x is full.\n", index);
+		err = -ENOMEM;
+		goto out;
+	}
+
+	err = mthca_WRITE_MGM(dev, index, mailbox);
+	if (err) {
+		mthca_err(dev, "WRITE_MGM failed %d\n", err);
+		err = -EINVAL;
+		goto out;
+	}
+
+	if (!link)
+		goto out;
+
+	err = mthca_READ_MGM(dev, prev, mailbox);
+	if (err) {
+		mthca_err(dev, "READ_MGM failed %d\n", err);
+		goto out;
+	}
+
+	mgm->next_gid_index = cpu_to_be32(index << 6);
+
+	err = mthca_WRITE_MGM(dev, prev, mailbox);
+	if (err)
+		mthca_err(dev, "WRITE_MGM returned %d\n", err);
+
+ out:
+	if (err && link && index != -1) {
+		BUG_ON(index < dev->limits.num_mgms);
+		mthca_free(&dev->mcg_table.alloc, index);
+	}
+	mutex_unlock(&dev->mcg_table.mutex);
+
+	mthca_free_mailbox(dev, mailbox);
+	return err;
+}
+
+int mthca_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+	struct mthca_dev *dev = to_mdev(ibqp->device);
+	struct mthca_mailbox *mailbox;
+	struct mthca_mgm *mgm;
+	u16 hash;
+	int prev, index;
+	int i, loc;
+	int err;
+
+	mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	mgm = mailbox->buf;
+
+	mutex_lock(&dev->mcg_table.mutex);
+
+	err = find_mgm(dev, gid->raw, mailbox, &hash, &prev, &index);
+	if (err)
+		goto out;
+
+	if (index == -1) {
+		mthca_err(dev, "MGID %pI6 not found\n", gid->raw);
+		err = -EINVAL;
+		goto out;
+	}
+
+	for (loc = -1, i = 0; i < MTHCA_QP_PER_MGM; ++i) {
+		if (mgm->qp[i] == cpu_to_be32(ibqp->qp_num | (1 << 31)))
+			loc = i;
+		if (!(mgm->qp[i] & cpu_to_be32(1 << 31)))
+			break;
+	}
+
+	if (loc == -1) {
+		mthca_err(dev, "QP %06x not found in MGM\n", ibqp->qp_num);
+		err = -EINVAL;
+		goto out;
+	}
+
+	mgm->qp[loc]   = mgm->qp[i - 1];
+	mgm->qp[i - 1] = 0;
+
+	err = mthca_WRITE_MGM(dev, index, mailbox);
+	if (err) {
+		mthca_err(dev, "WRITE_MGM returned %d\n", err);
+		goto out;
+	}
+
+	if (i != 1)
+		goto out;
+
+	if (prev == -1) {
+		/* Remove entry from MGM */
+		int amgm_index_to_free = be32_to_cpu(mgm->next_gid_index) >> 6;
+		if (amgm_index_to_free) {
+			err = mthca_READ_MGM(dev, amgm_index_to_free,
+					     mailbox);
+			if (err) {
+				mthca_err(dev, "READ_MGM returned %d\n", err);
+				goto out;
+			}
+		} else
+			memset(mgm->gid, 0, 16);
+
+		err = mthca_WRITE_MGM(dev, index, mailbox);
+		if (err) {
+			mthca_err(dev, "WRITE_MGM returned %d\n", err);
+			goto out;
+		}
+		if (amgm_index_to_free) {
+			BUG_ON(amgm_index_to_free < dev->limits.num_mgms);
+			mthca_free(&dev->mcg_table.alloc, amgm_index_to_free);
+		}
+	} else {
+		/* Remove entry from AMGM */
+		int curr_next_index = be32_to_cpu(mgm->next_gid_index) >> 6;
+		err = mthca_READ_MGM(dev, prev, mailbox);
+		if (err) {
+			mthca_err(dev, "READ_MGM returned %d\n", err);
+			goto out;
+		}
+
+		mgm->next_gid_index = cpu_to_be32(curr_next_index << 6);
+
+		err = mthca_WRITE_MGM(dev, prev, mailbox);
+		if (err) {
+			mthca_err(dev, "WRITE_MGM returned %d\n", err);
+			goto out;
+		}
+		BUG_ON(index < dev->limits.num_mgms);
+		mthca_free(&dev->mcg_table.alloc, index);
+	}
+
+ out:
+	mutex_unlock(&dev->mcg_table.mutex);
+
+	mthca_free_mailbox(dev, mailbox);
+	return err;
+}
+
+int mthca_init_mcg_table(struct mthca_dev *dev)
+{
+	int err;
+	int table_size = dev->limits.num_mgms + dev->limits.num_amgms;
+
+	err = mthca_alloc_init(&dev->mcg_table.alloc,
+			       table_size,
+			       table_size - 1,
+			       dev->limits.num_mgms);
+	if (err)
+		return err;
+
+	mutex_init(&dev->mcg_table.mutex);
+
+	return 0;
+}
+
+void mthca_cleanup_mcg_table(struct mthca_dev *dev)
+{
+	mthca_alloc_cleanup(&dev->mcg_table.alloc);
+}
diff --git a/drivers/infiniband/hw/mthca/mthca_memfree.c b/drivers/infiniband/hw/mthca/mthca_memfree.c
new file mode 100644
index 0000000..7d2e42d
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_memfree.c
@@ -0,0 +1,760 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Cisco Systems.  All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mm.h>
+#include <linux/scatterlist.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+
+#include <asm/page.h>
+
+#include "mthca_memfree.h"
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+
+/*
+ * We allocate in as big chunks as we can, up to a maximum of 256 KB
+ * per chunk.
+ */
+enum {
+	MTHCA_ICM_ALLOC_SIZE   = 1 << 18,
+	MTHCA_TABLE_CHUNK_SIZE = 1 << 18
+};
+
+struct mthca_user_db_table {
+	struct mutex mutex;
+	struct {
+		u64                uvirt;
+		struct scatterlist mem;
+		int                refcount;
+	}                page[0];
+};
+
+static void mthca_free_icm_pages(struct mthca_dev *dev, struct mthca_icm_chunk *chunk)
+{
+	int i;
+
+	if (chunk->nsg > 0)
+		pci_unmap_sg(dev->pdev, chunk->mem, chunk->npages,
+			     PCI_DMA_BIDIRECTIONAL);
+
+	for (i = 0; i < chunk->npages; ++i)
+		__free_pages(sg_page(&chunk->mem[i]),
+			     get_order(chunk->mem[i].length));
+}
+
+static void mthca_free_icm_coherent(struct mthca_dev *dev, struct mthca_icm_chunk *chunk)
+{
+	int i;
+
+	for (i = 0; i < chunk->npages; ++i) {
+		dma_free_coherent(&dev->pdev->dev, chunk->mem[i].length,
+				  lowmem_page_address(sg_page(&chunk->mem[i])),
+				  sg_dma_address(&chunk->mem[i]));
+	}
+}
+
+void mthca_free_icm(struct mthca_dev *dev, struct mthca_icm *icm, int coherent)
+{
+	struct mthca_icm_chunk *chunk, *tmp;
+
+	if (!icm)
+		return;
+
+	list_for_each_entry_safe(chunk, tmp, &icm->chunk_list, list) {
+		if (coherent)
+			mthca_free_icm_coherent(dev, chunk);
+		else
+			mthca_free_icm_pages(dev, chunk);
+
+		kfree(chunk);
+	}
+
+	kfree(icm);
+}
+
+static int mthca_alloc_icm_pages(struct scatterlist *mem, int order, gfp_t gfp_mask)
+{
+	struct page *page;
+
+	/*
+	 * Use __GFP_ZERO because buggy firmware assumes ICM pages are
+	 * cleared, and subtle failures are seen if they aren't.
+	 */
+	page = alloc_pages(gfp_mask | __GFP_ZERO, order);
+	if (!page)
+		return -ENOMEM;
+
+	sg_set_page(mem, page, PAGE_SIZE << order, 0);
+	return 0;
+}
+
+static int mthca_alloc_icm_coherent(struct device *dev, struct scatterlist *mem,
+				    int order, gfp_t gfp_mask)
+{
+	void *buf = dma_alloc_coherent(dev, PAGE_SIZE << order, &sg_dma_address(mem),
+				       gfp_mask);
+	if (!buf)
+		return -ENOMEM;
+
+	sg_set_buf(mem, buf, PAGE_SIZE << order);
+	BUG_ON(mem->offset);
+	sg_dma_len(mem) = PAGE_SIZE << order;
+	return 0;
+}
+
+struct mthca_icm *mthca_alloc_icm(struct mthca_dev *dev, int npages,
+				  gfp_t gfp_mask, int coherent)
+{
+	struct mthca_icm *icm;
+	struct mthca_icm_chunk *chunk = NULL;
+	int cur_order;
+	int ret;
+
+	/* We use sg_set_buf for coherent allocs, which assumes low memory */
+	BUG_ON(coherent && (gfp_mask & __GFP_HIGHMEM));
+
+	icm = kmalloc(sizeof *icm, gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN));
+	if (!icm)
+		return icm;
+
+	icm->refcount = 0;
+	INIT_LIST_HEAD(&icm->chunk_list);
+
+	cur_order = get_order(MTHCA_ICM_ALLOC_SIZE);
+
+	while (npages > 0) {
+		if (!chunk) {
+			chunk = kmalloc(sizeof *chunk,
+					gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN));
+			if (!chunk)
+				goto fail;
+
+			sg_init_table(chunk->mem, MTHCA_ICM_CHUNK_LEN);
+			chunk->npages = 0;
+			chunk->nsg    = 0;
+			list_add_tail(&chunk->list, &icm->chunk_list);
+		}
+
+		while (1 << cur_order > npages)
+			--cur_order;
+
+		if (coherent)
+			ret = mthca_alloc_icm_coherent(&dev->pdev->dev,
+						       &chunk->mem[chunk->npages],
+						       cur_order, gfp_mask);
+		else
+			ret = mthca_alloc_icm_pages(&chunk->mem[chunk->npages],
+						    cur_order, gfp_mask);
+
+		if (!ret) {
+			++chunk->npages;
+
+			if (coherent)
+				++chunk->nsg;
+			else if (chunk->npages == MTHCA_ICM_CHUNK_LEN) {
+				chunk->nsg = pci_map_sg(dev->pdev, chunk->mem,
+							chunk->npages,
+							PCI_DMA_BIDIRECTIONAL);
+
+				if (chunk->nsg <= 0)
+					goto fail;
+			}
+
+			if (chunk->npages == MTHCA_ICM_CHUNK_LEN)
+				chunk = NULL;
+
+			npages -= 1 << cur_order;
+		} else {
+			--cur_order;
+			if (cur_order < 0)
+				goto fail;
+		}
+	}
+
+	if (!coherent && chunk) {
+		chunk->nsg = pci_map_sg(dev->pdev, chunk->mem,
+					chunk->npages,
+					PCI_DMA_BIDIRECTIONAL);
+
+		if (chunk->nsg <= 0)
+			goto fail;
+	}
+
+	return icm;
+
+fail:
+	mthca_free_icm(dev, icm, coherent);
+	return NULL;
+}
+
+int mthca_table_get(struct mthca_dev *dev, struct mthca_icm_table *table, int obj)
+{
+	int i = (obj & (table->num_obj - 1)) * table->obj_size / MTHCA_TABLE_CHUNK_SIZE;
+	int ret = 0;
+
+	mutex_lock(&table->mutex);
+
+	if (table->icm[i]) {
+		++table->icm[i]->refcount;
+		goto out;
+	}
+
+	table->icm[i] = mthca_alloc_icm(dev, MTHCA_TABLE_CHUNK_SIZE >> PAGE_SHIFT,
+					(table->lowmem ? GFP_KERNEL : GFP_HIGHUSER) |
+					__GFP_NOWARN, table->coherent);
+	if (!table->icm[i]) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	if (mthca_MAP_ICM(dev, table->icm[i],
+			  table->virt + i * MTHCA_TABLE_CHUNK_SIZE)) {
+		mthca_free_icm(dev, table->icm[i], table->coherent);
+		table->icm[i] = NULL;
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	++table->icm[i]->refcount;
+
+out:
+	mutex_unlock(&table->mutex);
+	return ret;
+}
+
+void mthca_table_put(struct mthca_dev *dev, struct mthca_icm_table *table, int obj)
+{
+	int i;
+
+	if (!mthca_is_memfree(dev))
+		return;
+
+	i = (obj & (table->num_obj - 1)) * table->obj_size / MTHCA_TABLE_CHUNK_SIZE;
+
+	mutex_lock(&table->mutex);
+
+	if (--table->icm[i]->refcount == 0) {
+		mthca_UNMAP_ICM(dev, table->virt + i * MTHCA_TABLE_CHUNK_SIZE,
+				MTHCA_TABLE_CHUNK_SIZE / MTHCA_ICM_PAGE_SIZE);
+		mthca_free_icm(dev, table->icm[i], table->coherent);
+		table->icm[i] = NULL;
+	}
+
+	mutex_unlock(&table->mutex);
+}
+
+void *mthca_table_find(struct mthca_icm_table *table, int obj, dma_addr_t *dma_handle)
+{
+	int idx, offset, dma_offset, i;
+	struct mthca_icm_chunk *chunk;
+	struct mthca_icm *icm;
+	struct page *page = NULL;
+
+	if (!table->lowmem)
+		return NULL;
+
+	mutex_lock(&table->mutex);
+
+	idx = (obj & (table->num_obj - 1)) * table->obj_size;
+	icm = table->icm[idx / MTHCA_TABLE_CHUNK_SIZE];
+	dma_offset = offset = idx % MTHCA_TABLE_CHUNK_SIZE;
+
+	if (!icm)
+		goto out;
+
+	list_for_each_entry(chunk, &icm->chunk_list, list) {
+		for (i = 0; i < chunk->npages; ++i) {
+			if (dma_handle && dma_offset >= 0) {
+				if (sg_dma_len(&chunk->mem[i]) > dma_offset)
+					*dma_handle = sg_dma_address(&chunk->mem[i]) +
+						dma_offset;
+				dma_offset -= sg_dma_len(&chunk->mem[i]);
+			}
+			/* DMA mapping can merge pages but not split them,
+			 * so if we found the page, dma_handle has already
+			 * been assigned to. */
+			if (chunk->mem[i].length > offset) {
+				page = sg_page(&chunk->mem[i]);
+				goto out;
+			}
+			offset -= chunk->mem[i].length;
+		}
+	}
+
+out:
+	mutex_unlock(&table->mutex);
+	return page ? lowmem_page_address(page) + offset : NULL;
+}
+
+int mthca_table_get_range(struct mthca_dev *dev, struct mthca_icm_table *table,
+			  int start, int end)
+{
+	int inc = MTHCA_TABLE_CHUNK_SIZE / table->obj_size;
+	int i, err;
+
+	for (i = start; i <= end; i += inc) {
+		err = mthca_table_get(dev, table, i);
+		if (err)
+			goto fail;
+	}
+
+	return 0;
+
+fail:
+	while (i > start) {
+		i -= inc;
+		mthca_table_put(dev, table, i);
+	}
+
+	return err;
+}
+
+void mthca_table_put_range(struct mthca_dev *dev, struct mthca_icm_table *table,
+			   int start, int end)
+{
+	int i;
+
+	if (!mthca_is_memfree(dev))
+		return;
+
+	for (i = start; i <= end; i += MTHCA_TABLE_CHUNK_SIZE / table->obj_size)
+		mthca_table_put(dev, table, i);
+}
+
+struct mthca_icm_table *mthca_alloc_icm_table(struct mthca_dev *dev,
+					      u64 virt, int obj_size,
+					      int nobj, int reserved,
+					      int use_lowmem, int use_coherent)
+{
+	struct mthca_icm_table *table;
+	int obj_per_chunk;
+	int num_icm;
+	unsigned chunk_size;
+	int i;
+
+	obj_per_chunk = MTHCA_TABLE_CHUNK_SIZE / obj_size;
+	num_icm = DIV_ROUND_UP(nobj, obj_per_chunk);
+
+	table = kmalloc(sizeof *table + num_icm * sizeof *table->icm, GFP_KERNEL);
+	if (!table)
+		return NULL;
+
+	table->virt     = virt;
+	table->num_icm  = num_icm;
+	table->num_obj  = nobj;
+	table->obj_size = obj_size;
+	table->lowmem   = use_lowmem;
+	table->coherent = use_coherent;
+	mutex_init(&table->mutex);
+
+	for (i = 0; i < num_icm; ++i)
+		table->icm[i] = NULL;
+
+	for (i = 0; i * MTHCA_TABLE_CHUNK_SIZE < reserved * obj_size; ++i) {
+		chunk_size = MTHCA_TABLE_CHUNK_SIZE;
+		if ((i + 1) * MTHCA_TABLE_CHUNK_SIZE > nobj * obj_size)
+			chunk_size = nobj * obj_size - i * MTHCA_TABLE_CHUNK_SIZE;
+
+		table->icm[i] = mthca_alloc_icm(dev, chunk_size >> PAGE_SHIFT,
+						(use_lowmem ? GFP_KERNEL : GFP_HIGHUSER) |
+						__GFP_NOWARN, use_coherent);
+		if (!table->icm[i])
+			goto err;
+		if (mthca_MAP_ICM(dev, table->icm[i],
+				  virt + i * MTHCA_TABLE_CHUNK_SIZE)) {
+			mthca_free_icm(dev, table->icm[i], table->coherent);
+			table->icm[i] = NULL;
+			goto err;
+		}
+
+		/*
+		 * Add a reference to this ICM chunk so that it never
+		 * gets freed (since it contains reserved firmware objects).
+		 */
+		++table->icm[i]->refcount;
+	}
+
+	return table;
+
+err:
+	for (i = 0; i < num_icm; ++i)
+		if (table->icm[i]) {
+			mthca_UNMAP_ICM(dev, virt + i * MTHCA_TABLE_CHUNK_SIZE,
+					MTHCA_TABLE_CHUNK_SIZE / MTHCA_ICM_PAGE_SIZE);
+			mthca_free_icm(dev, table->icm[i], table->coherent);
+		}
+
+	kfree(table);
+
+	return NULL;
+}
+
+void mthca_free_icm_table(struct mthca_dev *dev, struct mthca_icm_table *table)
+{
+	int i;
+
+	for (i = 0; i < table->num_icm; ++i)
+		if (table->icm[i]) {
+			mthca_UNMAP_ICM(dev,
+					table->virt + i * MTHCA_TABLE_CHUNK_SIZE,
+					MTHCA_TABLE_CHUNK_SIZE / MTHCA_ICM_PAGE_SIZE);
+			mthca_free_icm(dev, table->icm[i], table->coherent);
+		}
+
+	kfree(table);
+}
+
+static u64 mthca_uarc_virt(struct mthca_dev *dev, struct mthca_uar *uar, int page)
+{
+	return dev->uar_table.uarc_base +
+		uar->index * dev->uar_table.uarc_size +
+		page * MTHCA_ICM_PAGE_SIZE;
+}
+
+int mthca_map_user_db(struct mthca_dev *dev, struct mthca_uar *uar,
+		      struct mthca_user_db_table *db_tab, int index, u64 uaddr)
+{
+	struct page *pages[1];
+	int ret = 0;
+	int i;
+
+	if (!mthca_is_memfree(dev))
+		return 0;
+
+	if (index < 0 || index > dev->uar_table.uarc_size / 8)
+		return -EINVAL;
+
+	mutex_lock(&db_tab->mutex);
+
+	i = index / MTHCA_DB_REC_PER_PAGE;
+
+	if ((db_tab->page[i].refcount >= MTHCA_DB_REC_PER_PAGE)       ||
+	    (db_tab->page[i].uvirt && db_tab->page[i].uvirt != uaddr) ||
+	    (uaddr & 4095)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (db_tab->page[i].refcount) {
+		++db_tab->page[i].refcount;
+		goto out;
+	}
+
+	ret = get_user_pages(current, current->mm, uaddr & PAGE_MASK, 1, 1, 0,
+			     pages, NULL);
+	if (ret < 0)
+		goto out;
+
+	sg_set_page(&db_tab->page[i].mem, pages[0], MTHCA_ICM_PAGE_SIZE,
+			uaddr & ~PAGE_MASK);
+
+	ret = pci_map_sg(dev->pdev, &db_tab->page[i].mem, 1, PCI_DMA_TODEVICE);
+	if (ret < 0) {
+		put_page(pages[0]);
+		goto out;
+	}
+
+	ret = mthca_MAP_ICM_page(dev, sg_dma_address(&db_tab->page[i].mem),
+				 mthca_uarc_virt(dev, uar, i));
+	if (ret) {
+		pci_unmap_sg(dev->pdev, &db_tab->page[i].mem, 1, PCI_DMA_TODEVICE);
+		put_page(sg_page(&db_tab->page[i].mem));
+		goto out;
+	}
+
+	db_tab->page[i].uvirt    = uaddr;
+	db_tab->page[i].refcount = 1;
+
+out:
+	mutex_unlock(&db_tab->mutex);
+	return ret;
+}
+
+void mthca_unmap_user_db(struct mthca_dev *dev, struct mthca_uar *uar,
+			 struct mthca_user_db_table *db_tab, int index)
+{
+	if (!mthca_is_memfree(dev))
+		return;
+
+	/*
+	 * To make our bookkeeping simpler, we don't unmap DB
+	 * pages until we clean up the whole db table.
+	 */
+
+	mutex_lock(&db_tab->mutex);
+
+	--db_tab->page[index / MTHCA_DB_REC_PER_PAGE].refcount;
+
+	mutex_unlock(&db_tab->mutex);
+}
+
+struct mthca_user_db_table *mthca_init_user_db_tab(struct mthca_dev *dev)
+{
+	struct mthca_user_db_table *db_tab;
+	int npages;
+	int i;
+
+	if (!mthca_is_memfree(dev))
+		return NULL;
+
+	npages = dev->uar_table.uarc_size / MTHCA_ICM_PAGE_SIZE;
+	db_tab = kmalloc(sizeof *db_tab + npages * sizeof *db_tab->page, GFP_KERNEL);
+	if (!db_tab)
+		return ERR_PTR(-ENOMEM);
+
+	mutex_init(&db_tab->mutex);
+	for (i = 0; i < npages; ++i) {
+		db_tab->page[i].refcount = 0;
+		db_tab->page[i].uvirt    = 0;
+		sg_init_table(&db_tab->page[i].mem, 1);
+	}
+
+	return db_tab;
+}
+
+void mthca_cleanup_user_db_tab(struct mthca_dev *dev, struct mthca_uar *uar,
+			       struct mthca_user_db_table *db_tab)
+{
+	int i;
+
+	if (!mthca_is_memfree(dev))
+		return;
+
+	for (i = 0; i < dev->uar_table.uarc_size / MTHCA_ICM_PAGE_SIZE; ++i) {
+		if (db_tab->page[i].uvirt) {
+			mthca_UNMAP_ICM(dev, mthca_uarc_virt(dev, uar, i), 1);
+			pci_unmap_sg(dev->pdev, &db_tab->page[i].mem, 1, PCI_DMA_TODEVICE);
+			put_page(sg_page(&db_tab->page[i].mem));
+		}
+	}
+
+	kfree(db_tab);
+}
+
+int mthca_alloc_db(struct mthca_dev *dev, enum mthca_db_type type,
+		   u32 qn, __be32 **db)
+{
+	int group;
+	int start, end, dir;
+	int i, j;
+	struct mthca_db_page *page;
+	int ret = 0;
+
+	mutex_lock(&dev->db_tab->mutex);
+
+	switch (type) {
+	case MTHCA_DB_TYPE_CQ_ARM:
+	case MTHCA_DB_TYPE_SQ:
+		group = 0;
+		start = 0;
+		end   = dev->db_tab->max_group1;
+		dir   = 1;
+		break;
+
+	case MTHCA_DB_TYPE_CQ_SET_CI:
+	case MTHCA_DB_TYPE_RQ:
+	case MTHCA_DB_TYPE_SRQ:
+		group = 1;
+		start = dev->db_tab->npages - 1;
+		end   = dev->db_tab->min_group2;
+		dir   = -1;
+		break;
+
+	default:
+		ret = -EINVAL;
+		goto out;
+	}
+
+	for (i = start; i != end; i += dir)
+		if (dev->db_tab->page[i].db_rec &&
+		    !bitmap_full(dev->db_tab->page[i].used,
+				 MTHCA_DB_REC_PER_PAGE)) {
+			page = dev->db_tab->page + i;
+			goto found;
+		}
+
+	for (i = start; i != end; i += dir)
+		if (!dev->db_tab->page[i].db_rec) {
+			page = dev->db_tab->page + i;
+			goto alloc;
+		}
+
+	if (dev->db_tab->max_group1 >= dev->db_tab->min_group2 - 1) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	if (group == 0)
+		++dev->db_tab->max_group1;
+	else
+		--dev->db_tab->min_group2;
+
+	page = dev->db_tab->page + end;
+
+alloc:
+	page->db_rec = dma_alloc_coherent(&dev->pdev->dev, MTHCA_ICM_PAGE_SIZE,
+					  &page->mapping, GFP_KERNEL);
+	if (!page->db_rec) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	memset(page->db_rec, 0, MTHCA_ICM_PAGE_SIZE);
+
+	ret = mthca_MAP_ICM_page(dev, page->mapping,
+				 mthca_uarc_virt(dev, &dev->driver_uar, i));
+	if (ret) {
+		dma_free_coherent(&dev->pdev->dev, MTHCA_ICM_PAGE_SIZE,
+				  page->db_rec, page->mapping);
+		goto out;
+	}
+
+	bitmap_zero(page->used, MTHCA_DB_REC_PER_PAGE);
+
+found:
+	j = find_first_zero_bit(page->used, MTHCA_DB_REC_PER_PAGE);
+	set_bit(j, page->used);
+
+	if (group == 1)
+		j = MTHCA_DB_REC_PER_PAGE - 1 - j;
+
+	ret = i * MTHCA_DB_REC_PER_PAGE + j;
+
+	page->db_rec[j] = cpu_to_be64((qn << 8) | (type << 5));
+
+	*db = (__be32 *) &page->db_rec[j];
+
+out:
+	mutex_unlock(&dev->db_tab->mutex);
+
+	return ret;
+}
+
+void mthca_free_db(struct mthca_dev *dev, int type, int db_index)
+{
+	int i, j;
+	struct mthca_db_page *page;
+
+	i = db_index / MTHCA_DB_REC_PER_PAGE;
+	j = db_index % MTHCA_DB_REC_PER_PAGE;
+
+	page = dev->db_tab->page + i;
+
+	mutex_lock(&dev->db_tab->mutex);
+
+	page->db_rec[j] = 0;
+	if (i >= dev->db_tab->min_group2)
+		j = MTHCA_DB_REC_PER_PAGE - 1 - j;
+	clear_bit(j, page->used);
+
+	if (bitmap_empty(page->used, MTHCA_DB_REC_PER_PAGE) &&
+	    i >= dev->db_tab->max_group1 - 1) {
+		mthca_UNMAP_ICM(dev, mthca_uarc_virt(dev, &dev->driver_uar, i), 1);
+
+		dma_free_coherent(&dev->pdev->dev, MTHCA_ICM_PAGE_SIZE,
+				  page->db_rec, page->mapping);
+		page->db_rec = NULL;
+
+		if (i == dev->db_tab->max_group1) {
+			--dev->db_tab->max_group1;
+			/* XXX may be able to unmap more pages now */
+		}
+		if (i == dev->db_tab->min_group2)
+			++dev->db_tab->min_group2;
+	}
+
+	mutex_unlock(&dev->db_tab->mutex);
+}
+
+int mthca_init_db_tab(struct mthca_dev *dev)
+{
+	int i;
+
+	if (!mthca_is_memfree(dev))
+		return 0;
+
+	dev->db_tab = kmalloc(sizeof *dev->db_tab, GFP_KERNEL);
+	if (!dev->db_tab)
+		return -ENOMEM;
+
+	mutex_init(&dev->db_tab->mutex);
+
+	dev->db_tab->npages     = dev->uar_table.uarc_size / MTHCA_ICM_PAGE_SIZE;
+	dev->db_tab->max_group1 = 0;
+	dev->db_tab->min_group2 = dev->db_tab->npages - 1;
+
+	dev->db_tab->page = kmalloc(dev->db_tab->npages *
+				    sizeof *dev->db_tab->page,
+				    GFP_KERNEL);
+	if (!dev->db_tab->page) {
+		kfree(dev->db_tab);
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < dev->db_tab->npages; ++i)
+		dev->db_tab->page[i].db_rec = NULL;
+
+	return 0;
+}
+
+void mthca_cleanup_db_tab(struct mthca_dev *dev)
+{
+	int i;
+
+	if (!mthca_is_memfree(dev))
+		return;
+
+	/*
+	 * Because we don't always free our UARC pages when they
+	 * become empty to make mthca_free_db() simpler we need to
+	 * make a sweep through the doorbell pages and free any
+	 * leftover pages now.
+	 */
+	for (i = 0; i < dev->db_tab->npages; ++i) {
+		if (!dev->db_tab->page[i].db_rec)
+			continue;
+
+		if (!bitmap_empty(dev->db_tab->page[i].used, MTHCA_DB_REC_PER_PAGE))
+			mthca_warn(dev, "Kernel UARC page %d not empty\n", i);
+
+		mthca_UNMAP_ICM(dev, mthca_uarc_virt(dev, &dev->driver_uar, i), 1);
+
+		dma_free_coherent(&dev->pdev->dev, MTHCA_ICM_PAGE_SIZE,
+				  dev->db_tab->page[i].db_rec,
+				  dev->db_tab->page[i].mapping);
+	}
+
+	kfree(dev->db_tab->page);
+	kfree(dev->db_tab);
+}
diff --git a/drivers/infiniband/hw/mthca/mthca_memfree.h b/drivers/infiniband/hw/mthca/mthca_memfree.h
new file mode 100644
index 0000000..da9b8f9
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_memfree.h
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Cisco Systems.  All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MTHCA_MEMFREE_H
+#define MTHCA_MEMFREE_H
+
+#include <linux/list.h>
+#include <linux/mutex.h>
+
+#define MTHCA_ICM_CHUNK_LEN \
+	((256 - sizeof (struct list_head) - 2 * sizeof (int)) /		\
+	 (sizeof (struct scatterlist)))
+
+enum {
+	MTHCA_ICM_PAGE_SHIFT	= 12,
+	MTHCA_ICM_PAGE_SIZE	= 1 << MTHCA_ICM_PAGE_SHIFT,
+	MTHCA_DB_REC_PER_PAGE	= MTHCA_ICM_PAGE_SIZE / 8
+};
+
+struct mthca_icm_chunk {
+	struct list_head   list;
+	int                npages;
+	int                nsg;
+	struct scatterlist mem[MTHCA_ICM_CHUNK_LEN];
+};
+
+struct mthca_icm {
+	struct list_head chunk_list;
+	int              refcount;
+};
+
+struct mthca_icm_table {
+	u64               virt;
+	int               num_icm;
+	int               num_obj;
+	int               obj_size;
+	int               lowmem;
+	int               coherent;
+	struct mutex      mutex;
+	struct mthca_icm *icm[0];
+};
+
+struct mthca_icm_iter {
+	struct mthca_icm       *icm;
+	struct mthca_icm_chunk *chunk;
+	int                     page_idx;
+};
+
+struct mthca_dev;
+
+struct mthca_icm *mthca_alloc_icm(struct mthca_dev *dev, int npages,
+				  gfp_t gfp_mask, int coherent);
+void mthca_free_icm(struct mthca_dev *dev, struct mthca_icm *icm, int coherent);
+
+struct mthca_icm_table *mthca_alloc_icm_table(struct mthca_dev *dev,
+					      u64 virt, int obj_size,
+					      int nobj, int reserved,
+					      int use_lowmem, int use_coherent);
+void mthca_free_icm_table(struct mthca_dev *dev, struct mthca_icm_table *table);
+int mthca_table_get(struct mthca_dev *dev, struct mthca_icm_table *table, int obj);
+void mthca_table_put(struct mthca_dev *dev, struct mthca_icm_table *table, int obj);
+void *mthca_table_find(struct mthca_icm_table *table, int obj, dma_addr_t *dma_handle);
+int mthca_table_get_range(struct mthca_dev *dev, struct mthca_icm_table *table,
+			  int start, int end);
+void mthca_table_put_range(struct mthca_dev *dev, struct mthca_icm_table *table,
+			   int start, int end);
+
+static inline void mthca_icm_first(struct mthca_icm *icm,
+				   struct mthca_icm_iter *iter)
+{
+	iter->icm      = icm;
+	iter->chunk    = list_empty(&icm->chunk_list) ?
+		NULL : list_entry(icm->chunk_list.next,
+				  struct mthca_icm_chunk, list);
+	iter->page_idx = 0;
+}
+
+static inline int mthca_icm_last(struct mthca_icm_iter *iter)
+{
+	return !iter->chunk;
+}
+
+static inline void mthca_icm_next(struct mthca_icm_iter *iter)
+{
+	if (++iter->page_idx >= iter->chunk->nsg) {
+		if (iter->chunk->list.next == &iter->icm->chunk_list) {
+			iter->chunk = NULL;
+			return;
+		}
+
+		iter->chunk = list_entry(iter->chunk->list.next,
+					 struct mthca_icm_chunk, list);
+		iter->page_idx = 0;
+	}
+}
+
+static inline dma_addr_t mthca_icm_addr(struct mthca_icm_iter *iter)
+{
+	return sg_dma_address(&iter->chunk->mem[iter->page_idx]);
+}
+
+static inline unsigned long mthca_icm_size(struct mthca_icm_iter *iter)
+{
+	return sg_dma_len(&iter->chunk->mem[iter->page_idx]);
+}
+
+struct mthca_db_page {
+	DECLARE_BITMAP(used, MTHCA_DB_REC_PER_PAGE);
+	__be64    *db_rec;
+	dma_addr_t mapping;
+};
+
+struct mthca_db_table {
+	int 	       	      npages;
+	int 	       	      max_group1;
+	int 	       	      min_group2;
+	struct mthca_db_page *page;
+	struct mutex          mutex;
+};
+
+enum mthca_db_type {
+	MTHCA_DB_TYPE_INVALID   = 0x0,
+	MTHCA_DB_TYPE_CQ_SET_CI = 0x1,
+	MTHCA_DB_TYPE_CQ_ARM    = 0x2,
+	MTHCA_DB_TYPE_SQ        = 0x3,
+	MTHCA_DB_TYPE_RQ        = 0x4,
+	MTHCA_DB_TYPE_SRQ       = 0x5,
+	MTHCA_DB_TYPE_GROUP_SEP = 0x7
+};
+
+struct mthca_user_db_table;
+struct mthca_uar;
+
+int mthca_map_user_db(struct mthca_dev *dev, struct mthca_uar *uar,
+		      struct mthca_user_db_table *db_tab, int index, u64 uaddr);
+void mthca_unmap_user_db(struct mthca_dev *dev, struct mthca_uar *uar,
+			 struct mthca_user_db_table *db_tab, int index);
+struct mthca_user_db_table *mthca_init_user_db_tab(struct mthca_dev *dev);
+void mthca_cleanup_user_db_tab(struct mthca_dev *dev, struct mthca_uar *uar,
+			       struct mthca_user_db_table *db_tab);
+
+int mthca_init_db_tab(struct mthca_dev *dev);
+void mthca_cleanup_db_tab(struct mthca_dev *dev);
+int mthca_alloc_db(struct mthca_dev *dev, enum mthca_db_type type,
+		   u32 qn, __be32 **db);
+void mthca_free_db(struct mthca_dev *dev, int type, int db_index);
+
+#endif /* MTHCA_MEMFREE_H */
diff --git a/drivers/infiniband/hw/mthca/mthca_mr.c b/drivers/infiniband/hw/mthca/mthca_mr.c
new file mode 100644
index 0000000..ed9a989
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_mr.c
@@ -0,0 +1,965 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/slab.h>
+#include <linux/errno.h>
+
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+#include "mthca_memfree.h"
+
+struct mthca_mtt {
+	struct mthca_buddy *buddy;
+	int                 order;
+	u32                 first_seg;
+};
+
+/*
+ * Must be packed because mtt_seg is 64 bits but only aligned to 32 bits.
+ */
+struct mthca_mpt_entry {
+	__be32 flags;
+	__be32 page_size;
+	__be32 key;
+	__be32 pd;
+	__be64 start;
+	__be64 length;
+	__be32 lkey;
+	__be32 window_count;
+	__be32 window_count_limit;
+	__be64 mtt_seg;
+	__be32 mtt_sz;		/* Arbel only */
+	u32    reserved[2];
+} __attribute__((packed));
+
+#define MTHCA_MPT_FLAG_SW_OWNS       (0xfUL << 28)
+#define MTHCA_MPT_FLAG_MIO           (1 << 17)
+#define MTHCA_MPT_FLAG_BIND_ENABLE   (1 << 15)
+#define MTHCA_MPT_FLAG_PHYSICAL      (1 <<  9)
+#define MTHCA_MPT_FLAG_REGION        (1 <<  8)
+
+#define MTHCA_MTT_FLAG_PRESENT       1
+
+#define MTHCA_MPT_STATUS_SW 0xF0
+#define MTHCA_MPT_STATUS_HW 0x00
+
+#define SINAI_FMR_KEY_INC 0x1000000
+
+/*
+ * Buddy allocator for MTT segments (currently not very efficient
+ * since it doesn't keep a free list and just searches linearly
+ * through the bitmaps)
+ */
+
+static u32 mthca_buddy_alloc(struct mthca_buddy *buddy, int order)
+{
+	int o;
+	int m;
+	u32 seg;
+
+	spin_lock(&buddy->lock);
+
+	for (o = order; o <= buddy->max_order; ++o)
+		if (buddy->num_free[o]) {
+			m = 1 << (buddy->max_order - o);
+			seg = find_first_bit(buddy->bits[o], m);
+			if (seg < m)
+				goto found;
+		}
+
+	spin_unlock(&buddy->lock);
+	return -1;
+
+ found:
+	clear_bit(seg, buddy->bits[o]);
+	--buddy->num_free[o];
+
+	while (o > order) {
+		--o;
+		seg <<= 1;
+		set_bit(seg ^ 1, buddy->bits[o]);
+		++buddy->num_free[o];
+	}
+
+	spin_unlock(&buddy->lock);
+
+	seg <<= order;
+
+	return seg;
+}
+
+static void mthca_buddy_free(struct mthca_buddy *buddy, u32 seg, int order)
+{
+	seg >>= order;
+
+	spin_lock(&buddy->lock);
+
+	while (test_bit(seg ^ 1, buddy->bits[order])) {
+		clear_bit(seg ^ 1, buddy->bits[order]);
+		--buddy->num_free[order];
+		seg >>= 1;
+		++order;
+	}
+
+	set_bit(seg, buddy->bits[order]);
+	++buddy->num_free[order];
+
+	spin_unlock(&buddy->lock);
+}
+
+static int mthca_buddy_init(struct mthca_buddy *buddy, int max_order)
+{
+	int i, s;
+
+	buddy->max_order = max_order;
+	spin_lock_init(&buddy->lock);
+
+	buddy->bits = kzalloc((buddy->max_order + 1) * sizeof (long *),
+			      GFP_KERNEL);
+	buddy->num_free = kcalloc((buddy->max_order + 1), sizeof *buddy->num_free,
+				  GFP_KERNEL);
+	if (!buddy->bits || !buddy->num_free)
+		goto err_out;
+
+	for (i = 0; i <= buddy->max_order; ++i) {
+		s = BITS_TO_LONGS(1 << (buddy->max_order - i));
+		buddy->bits[i] = kmalloc(s * sizeof (long), GFP_KERNEL);
+		if (!buddy->bits[i])
+			goto err_out_free;
+		bitmap_zero(buddy->bits[i],
+			    1 << (buddy->max_order - i));
+	}
+
+	set_bit(0, buddy->bits[buddy->max_order]);
+	buddy->num_free[buddy->max_order] = 1;
+
+	return 0;
+
+err_out_free:
+	for (i = 0; i <= buddy->max_order; ++i)
+		kfree(buddy->bits[i]);
+
+err_out:
+	kfree(buddy->bits);
+	kfree(buddy->num_free);
+
+	return -ENOMEM;
+}
+
+static void mthca_buddy_cleanup(struct mthca_buddy *buddy)
+{
+	int i;
+
+	for (i = 0; i <= buddy->max_order; ++i)
+		kfree(buddy->bits[i]);
+
+	kfree(buddy->bits);
+	kfree(buddy->num_free);
+}
+
+static u32 mthca_alloc_mtt_range(struct mthca_dev *dev, int order,
+				 struct mthca_buddy *buddy)
+{
+	u32 seg = mthca_buddy_alloc(buddy, order);
+
+	if (seg == -1)
+		return -1;
+
+	if (mthca_is_memfree(dev))
+		if (mthca_table_get_range(dev, dev->mr_table.mtt_table, seg,
+					  seg + (1 << order) - 1)) {
+			mthca_buddy_free(buddy, seg, order);
+			seg = -1;
+		}
+
+	return seg;
+}
+
+static struct mthca_mtt *__mthca_alloc_mtt(struct mthca_dev *dev, int size,
+					   struct mthca_buddy *buddy)
+{
+	struct mthca_mtt *mtt;
+	int i;
+
+	if (size <= 0)
+		return ERR_PTR(-EINVAL);
+
+	mtt = kmalloc(sizeof *mtt, GFP_KERNEL);
+	if (!mtt)
+		return ERR_PTR(-ENOMEM);
+
+	mtt->buddy = buddy;
+	mtt->order = 0;
+	for (i = dev->limits.mtt_seg_size / 8; i < size; i <<= 1)
+		++mtt->order;
+
+	mtt->first_seg = mthca_alloc_mtt_range(dev, mtt->order, buddy);
+	if (mtt->first_seg == -1) {
+		kfree(mtt);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	return mtt;
+}
+
+struct mthca_mtt *mthca_alloc_mtt(struct mthca_dev *dev, int size)
+{
+	return __mthca_alloc_mtt(dev, size, &dev->mr_table.mtt_buddy);
+}
+
+void mthca_free_mtt(struct mthca_dev *dev, struct mthca_mtt *mtt)
+{
+	if (!mtt)
+		return;
+
+	mthca_buddy_free(mtt->buddy, mtt->first_seg, mtt->order);
+
+	mthca_table_put_range(dev, dev->mr_table.mtt_table,
+			      mtt->first_seg,
+			      mtt->first_seg + (1 << mtt->order) - 1);
+
+	kfree(mtt);
+}
+
+static int __mthca_write_mtt(struct mthca_dev *dev, struct mthca_mtt *mtt,
+			     int start_index, u64 *buffer_list, int list_len)
+{
+	struct mthca_mailbox *mailbox;
+	__be64 *mtt_entry;
+	int err = 0;
+	int i;
+
+	mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	mtt_entry = mailbox->buf;
+
+	while (list_len > 0) {
+		mtt_entry[0] = cpu_to_be64(dev->mr_table.mtt_base +
+					   mtt->first_seg * dev->limits.mtt_seg_size +
+					   start_index * 8);
+		mtt_entry[1] = 0;
+		for (i = 0; i < list_len && i < MTHCA_MAILBOX_SIZE / 8 - 2; ++i)
+			mtt_entry[i + 2] = cpu_to_be64(buffer_list[i] |
+						       MTHCA_MTT_FLAG_PRESENT);
+
+		/*
+		 * If we have an odd number of entries to write, add
+		 * one more dummy entry for firmware efficiency.
+		 */
+		if (i & 1)
+			mtt_entry[i + 2] = 0;
+
+		err = mthca_WRITE_MTT(dev, mailbox, (i + 1) & ~1);
+		if (err) {
+			mthca_warn(dev, "WRITE_MTT failed (%d)\n", err);
+			goto out;
+		}
+
+		list_len    -= i;
+		start_index += i;
+		buffer_list += i;
+	}
+
+out:
+	mthca_free_mailbox(dev, mailbox);
+	return err;
+}
+
+int mthca_write_mtt_size(struct mthca_dev *dev)
+{
+	if (dev->mr_table.fmr_mtt_buddy != &dev->mr_table.mtt_buddy ||
+	    !(dev->mthca_flags & MTHCA_FLAG_FMR))
+		/*
+		 * Be friendly to WRITE_MTT command
+		 * and leave two empty slots for the
+		 * index and reserved fields of the
+		 * mailbox.
+		 */
+		return PAGE_SIZE / sizeof (u64) - 2;
+
+	/* For Arbel, all MTTs must fit in the same page. */
+	return mthca_is_memfree(dev) ? (PAGE_SIZE / sizeof (u64)) : 0x7ffffff;
+}
+
+static void mthca_tavor_write_mtt_seg(struct mthca_dev *dev,
+				      struct mthca_mtt *mtt, int start_index,
+				      u64 *buffer_list, int list_len)
+{
+	u64 __iomem *mtts;
+	int i;
+
+	mtts = dev->mr_table.tavor_fmr.mtt_base + mtt->first_seg * dev->limits.mtt_seg_size +
+		start_index * sizeof (u64);
+	for (i = 0; i < list_len; ++i)
+		mthca_write64_raw(cpu_to_be64(buffer_list[i] | MTHCA_MTT_FLAG_PRESENT),
+				  mtts + i);
+}
+
+static void mthca_arbel_write_mtt_seg(struct mthca_dev *dev,
+				      struct mthca_mtt *mtt, int start_index,
+				      u64 *buffer_list, int list_len)
+{
+	__be64 *mtts;
+	dma_addr_t dma_handle;
+	int i;
+	int s = start_index * sizeof (u64);
+
+	/* For Arbel, all MTTs must fit in the same page. */
+	BUG_ON(s / PAGE_SIZE != (s + list_len * sizeof(u64) - 1) / PAGE_SIZE);
+	/* Require full segments */
+	BUG_ON(s % dev->limits.mtt_seg_size);
+
+	mtts = mthca_table_find(dev->mr_table.mtt_table, mtt->first_seg +
+				s / dev->limits.mtt_seg_size, &dma_handle);
+
+	BUG_ON(!mtts);
+
+	dma_sync_single_for_cpu(&dev->pdev->dev, dma_handle,
+				list_len * sizeof (u64), DMA_TO_DEVICE);
+
+	for (i = 0; i < list_len; ++i)
+		mtts[i] = cpu_to_be64(buffer_list[i] | MTHCA_MTT_FLAG_PRESENT);
+
+	dma_sync_single_for_device(&dev->pdev->dev, dma_handle,
+				   list_len * sizeof (u64), DMA_TO_DEVICE);
+}
+
+int mthca_write_mtt(struct mthca_dev *dev, struct mthca_mtt *mtt,
+		    int start_index, u64 *buffer_list, int list_len)
+{
+	int size = mthca_write_mtt_size(dev);
+	int chunk;
+
+	if (dev->mr_table.fmr_mtt_buddy != &dev->mr_table.mtt_buddy ||
+	    !(dev->mthca_flags & MTHCA_FLAG_FMR))
+		return __mthca_write_mtt(dev, mtt, start_index, buffer_list, list_len);
+
+	while (list_len > 0) {
+		chunk = min(size, list_len);
+		if (mthca_is_memfree(dev))
+			mthca_arbel_write_mtt_seg(dev, mtt, start_index,
+						  buffer_list, chunk);
+		else
+			mthca_tavor_write_mtt_seg(dev, mtt, start_index,
+						  buffer_list, chunk);
+
+		list_len    -= chunk;
+		start_index += chunk;
+		buffer_list += chunk;
+	}
+
+	return 0;
+}
+
+static inline u32 tavor_hw_index_to_key(u32 ind)
+{
+	return ind;
+}
+
+static inline u32 tavor_key_to_hw_index(u32 key)
+{
+	return key;
+}
+
+static inline u32 arbel_hw_index_to_key(u32 ind)
+{
+	return (ind >> 24) | (ind << 8);
+}
+
+static inline u32 arbel_key_to_hw_index(u32 key)
+{
+	return (key << 24) | (key >> 8);
+}
+
+static inline u32 hw_index_to_key(struct mthca_dev *dev, u32 ind)
+{
+	if (mthca_is_memfree(dev))
+		return arbel_hw_index_to_key(ind);
+	else
+		return tavor_hw_index_to_key(ind);
+}
+
+static inline u32 key_to_hw_index(struct mthca_dev *dev, u32 key)
+{
+	if (mthca_is_memfree(dev))
+		return arbel_key_to_hw_index(key);
+	else
+		return tavor_key_to_hw_index(key);
+}
+
+static inline u32 adjust_key(struct mthca_dev *dev, u32 key)
+{
+	if (dev->mthca_flags & MTHCA_FLAG_SINAI_OPT)
+		return ((key << 20) & 0x800000) | (key & 0x7fffff);
+	else
+		return key;
+}
+
+int mthca_mr_alloc(struct mthca_dev *dev, u32 pd, int buffer_size_shift,
+		   u64 iova, u64 total_size, u32 access, struct mthca_mr *mr)
+{
+	struct mthca_mailbox *mailbox;
+	struct mthca_mpt_entry *mpt_entry;
+	u32 key;
+	int i;
+	int err;
+
+	WARN_ON(buffer_size_shift >= 32);
+
+	key = mthca_alloc(&dev->mr_table.mpt_alloc);
+	if (key == -1)
+		return -ENOMEM;
+	key = adjust_key(dev, key);
+	mr->ibmr.rkey = mr->ibmr.lkey = hw_index_to_key(dev, key);
+
+	if (mthca_is_memfree(dev)) {
+		err = mthca_table_get(dev, dev->mr_table.mpt_table, key);
+		if (err)
+			goto err_out_mpt_free;
+	}
+
+	mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+	if (IS_ERR(mailbox)) {
+		err = PTR_ERR(mailbox);
+		goto err_out_table;
+	}
+	mpt_entry = mailbox->buf;
+
+	mpt_entry->flags = cpu_to_be32(MTHCA_MPT_FLAG_SW_OWNS     |
+				       MTHCA_MPT_FLAG_MIO         |
+				       MTHCA_MPT_FLAG_REGION      |
+				       access);
+	if (!mr->mtt)
+		mpt_entry->flags |= cpu_to_be32(MTHCA_MPT_FLAG_PHYSICAL);
+
+	mpt_entry->page_size = cpu_to_be32(buffer_size_shift - 12);
+	mpt_entry->key       = cpu_to_be32(key);
+	mpt_entry->pd        = cpu_to_be32(pd);
+	mpt_entry->start     = cpu_to_be64(iova);
+	mpt_entry->length    = cpu_to_be64(total_size);
+
+	memset(&mpt_entry->lkey, 0,
+	       sizeof *mpt_entry - offsetof(struct mthca_mpt_entry, lkey));
+
+	if (mr->mtt)
+		mpt_entry->mtt_seg =
+			cpu_to_be64(dev->mr_table.mtt_base +
+				    mr->mtt->first_seg * dev->limits.mtt_seg_size);
+
+	if (0) {
+		mthca_dbg(dev, "Dumping MPT entry %08x:\n", mr->ibmr.lkey);
+		for (i = 0; i < sizeof (struct mthca_mpt_entry) / 4; ++i) {
+			if (i % 4 == 0)
+				printk("[%02x] ", i * 4);
+			printk(" %08x", be32_to_cpu(((__be32 *) mpt_entry)[i]));
+			if ((i + 1) % 4 == 0)
+				printk("\n");
+		}
+	}
+
+	err = mthca_SW2HW_MPT(dev, mailbox,
+			      key & (dev->limits.num_mpts - 1));
+	if (err) {
+		mthca_warn(dev, "SW2HW_MPT failed (%d)\n", err);
+		goto err_out_mailbox;
+	}
+
+	mthca_free_mailbox(dev, mailbox);
+	return err;
+
+err_out_mailbox:
+	mthca_free_mailbox(dev, mailbox);
+
+err_out_table:
+	mthca_table_put(dev, dev->mr_table.mpt_table, key);
+
+err_out_mpt_free:
+	mthca_free(&dev->mr_table.mpt_alloc, key);
+	return err;
+}
+
+int mthca_mr_alloc_notrans(struct mthca_dev *dev, u32 pd,
+			   u32 access, struct mthca_mr *mr)
+{
+	mr->mtt = NULL;
+	return mthca_mr_alloc(dev, pd, 12, 0, ~0ULL, access, mr);
+}
+
+int mthca_mr_alloc_phys(struct mthca_dev *dev, u32 pd,
+			u64 *buffer_list, int buffer_size_shift,
+			int list_len, u64 iova, u64 total_size,
+			u32 access, struct mthca_mr *mr)
+{
+	int err;
+
+	mr->mtt = mthca_alloc_mtt(dev, list_len);
+	if (IS_ERR(mr->mtt))
+		return PTR_ERR(mr->mtt);
+
+	err = mthca_write_mtt(dev, mr->mtt, 0, buffer_list, list_len);
+	if (err) {
+		mthca_free_mtt(dev, mr->mtt);
+		return err;
+	}
+
+	err = mthca_mr_alloc(dev, pd, buffer_size_shift, iova,
+			     total_size, access, mr);
+	if (err)
+		mthca_free_mtt(dev, mr->mtt);
+
+	return err;
+}
+
+/* Free mr or fmr */
+static void mthca_free_region(struct mthca_dev *dev, u32 lkey)
+{
+	mthca_table_put(dev, dev->mr_table.mpt_table,
+			key_to_hw_index(dev, lkey));
+
+	mthca_free(&dev->mr_table.mpt_alloc, key_to_hw_index(dev, lkey));
+}
+
+void mthca_free_mr(struct mthca_dev *dev, struct mthca_mr *mr)
+{
+	int err;
+
+	err = mthca_HW2SW_MPT(dev, NULL,
+			      key_to_hw_index(dev, mr->ibmr.lkey) &
+			      (dev->limits.num_mpts - 1));
+	if (err)
+		mthca_warn(dev, "HW2SW_MPT failed (%d)\n", err);
+
+	mthca_free_region(dev, mr->ibmr.lkey);
+	mthca_free_mtt(dev, mr->mtt);
+}
+
+int mthca_fmr_alloc(struct mthca_dev *dev, u32 pd,
+		    u32 access, struct mthca_fmr *mr)
+{
+	struct mthca_mpt_entry *mpt_entry;
+	struct mthca_mailbox *mailbox;
+	u64 mtt_seg;
+	u32 key, idx;
+	int list_len = mr->attr.max_pages;
+	int err = -ENOMEM;
+	int i;
+
+	if (mr->attr.page_shift < 12 || mr->attr.page_shift >= 32)
+		return -EINVAL;
+
+	/* For Arbel, all MTTs must fit in the same page. */
+	if (mthca_is_memfree(dev) &&
+	    mr->attr.max_pages * sizeof *mr->mem.arbel.mtts > PAGE_SIZE)
+		return -EINVAL;
+
+	mr->maps = 0;
+
+	key = mthca_alloc(&dev->mr_table.mpt_alloc);
+	if (key == -1)
+		return -ENOMEM;
+	key = adjust_key(dev, key);
+
+	idx = key & (dev->limits.num_mpts - 1);
+	mr->ibmr.rkey = mr->ibmr.lkey = hw_index_to_key(dev, key);
+
+	if (mthca_is_memfree(dev)) {
+		err = mthca_table_get(dev, dev->mr_table.mpt_table, key);
+		if (err)
+			goto err_out_mpt_free;
+
+		mr->mem.arbel.mpt = mthca_table_find(dev->mr_table.mpt_table, key, NULL);
+		BUG_ON(!mr->mem.arbel.mpt);
+	} else
+		mr->mem.tavor.mpt = dev->mr_table.tavor_fmr.mpt_base +
+			sizeof *(mr->mem.tavor.mpt) * idx;
+
+	mr->mtt = __mthca_alloc_mtt(dev, list_len, dev->mr_table.fmr_mtt_buddy);
+	if (IS_ERR(mr->mtt)) {
+		err = PTR_ERR(mr->mtt);
+		goto err_out_table;
+	}
+
+	mtt_seg = mr->mtt->first_seg * dev->limits.mtt_seg_size;
+
+	if (mthca_is_memfree(dev)) {
+		mr->mem.arbel.mtts = mthca_table_find(dev->mr_table.mtt_table,
+						      mr->mtt->first_seg,
+						      &mr->mem.arbel.dma_handle);
+		BUG_ON(!mr->mem.arbel.mtts);
+	} else
+		mr->mem.tavor.mtts = dev->mr_table.tavor_fmr.mtt_base + mtt_seg;
+
+	mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+	if (IS_ERR(mailbox)) {
+		err = PTR_ERR(mailbox);
+		goto err_out_free_mtt;
+	}
+
+	mpt_entry = mailbox->buf;
+
+	mpt_entry->flags = cpu_to_be32(MTHCA_MPT_FLAG_SW_OWNS     |
+				       MTHCA_MPT_FLAG_MIO         |
+				       MTHCA_MPT_FLAG_REGION      |
+				       access);
+
+	mpt_entry->page_size = cpu_to_be32(mr->attr.page_shift - 12);
+	mpt_entry->key       = cpu_to_be32(key);
+	mpt_entry->pd        = cpu_to_be32(pd);
+	memset(&mpt_entry->start, 0,
+	       sizeof *mpt_entry - offsetof(struct mthca_mpt_entry, start));
+	mpt_entry->mtt_seg   = cpu_to_be64(dev->mr_table.mtt_base + mtt_seg);
+
+	if (0) {
+		mthca_dbg(dev, "Dumping MPT entry %08x:\n", mr->ibmr.lkey);
+		for (i = 0; i < sizeof (struct mthca_mpt_entry) / 4; ++i) {
+			if (i % 4 == 0)
+				printk("[%02x] ", i * 4);
+			printk(" %08x", be32_to_cpu(((__be32 *) mpt_entry)[i]));
+			if ((i + 1) % 4 == 0)
+				printk("\n");
+		}
+	}
+
+	err = mthca_SW2HW_MPT(dev, mailbox,
+			      key & (dev->limits.num_mpts - 1));
+	if (err) {
+		mthca_warn(dev, "SW2HW_MPT failed (%d)\n", err);
+		goto err_out_mailbox_free;
+	}
+
+	mthca_free_mailbox(dev, mailbox);
+	return 0;
+
+err_out_mailbox_free:
+	mthca_free_mailbox(dev, mailbox);
+
+err_out_free_mtt:
+	mthca_free_mtt(dev, mr->mtt);
+
+err_out_table:
+	mthca_table_put(dev, dev->mr_table.mpt_table, key);
+
+err_out_mpt_free:
+	mthca_free(&dev->mr_table.mpt_alloc, key);
+	return err;
+}
+
+int mthca_free_fmr(struct mthca_dev *dev, struct mthca_fmr *fmr)
+{
+	if (fmr->maps)
+		return -EBUSY;
+
+	mthca_free_region(dev, fmr->ibmr.lkey);
+	mthca_free_mtt(dev, fmr->mtt);
+
+	return 0;
+}
+
+static inline int mthca_check_fmr(struct mthca_fmr *fmr, u64 *page_list,
+				  int list_len, u64 iova)
+{
+	int i, page_mask;
+
+	if (list_len > fmr->attr.max_pages)
+		return -EINVAL;
+
+	page_mask = (1 << fmr->attr.page_shift) - 1;
+
+	/* We are getting page lists, so va must be page aligned. */
+	if (iova & page_mask)
+		return -EINVAL;
+
+	/* Trust the user not to pass misaligned data in page_list */
+	if (0)
+		for (i = 0; i < list_len; ++i) {
+			if (page_list[i] & ~page_mask)
+				return -EINVAL;
+		}
+
+	if (fmr->maps >= fmr->attr.max_maps)
+		return -EINVAL;
+
+	return 0;
+}
+
+
+int mthca_tavor_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list,
+			     int list_len, u64 iova)
+{
+	struct mthca_fmr *fmr = to_mfmr(ibfmr);
+	struct mthca_dev *dev = to_mdev(ibfmr->device);
+	struct mthca_mpt_entry mpt_entry;
+	u32 key;
+	int i, err;
+
+	err = mthca_check_fmr(fmr, page_list, list_len, iova);
+	if (err)
+		return err;
+
+	++fmr->maps;
+
+	key = tavor_key_to_hw_index(fmr->ibmr.lkey);
+	key += dev->limits.num_mpts;
+	fmr->ibmr.lkey = fmr->ibmr.rkey = tavor_hw_index_to_key(key);
+
+	writeb(MTHCA_MPT_STATUS_SW, fmr->mem.tavor.mpt);
+
+	for (i = 0; i < list_len; ++i) {
+		__be64 mtt_entry = cpu_to_be64(page_list[i] |
+					       MTHCA_MTT_FLAG_PRESENT);
+		mthca_write64_raw(mtt_entry, fmr->mem.tavor.mtts + i);
+	}
+
+	mpt_entry.lkey   = cpu_to_be32(key);
+	mpt_entry.length = cpu_to_be64(list_len * (1ull << fmr->attr.page_shift));
+	mpt_entry.start  = cpu_to_be64(iova);
+
+	__raw_writel((__force u32) mpt_entry.lkey, &fmr->mem.tavor.mpt->key);
+	memcpy_toio(&fmr->mem.tavor.mpt->start, &mpt_entry.start,
+		    offsetof(struct mthca_mpt_entry, window_count) -
+		    offsetof(struct mthca_mpt_entry, start));
+
+	writeb(MTHCA_MPT_STATUS_HW, fmr->mem.tavor.mpt);
+
+	return 0;
+}
+
+int mthca_arbel_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list,
+			     int list_len, u64 iova)
+{
+	struct mthca_fmr *fmr = to_mfmr(ibfmr);
+	struct mthca_dev *dev = to_mdev(ibfmr->device);
+	u32 key;
+	int i, err;
+
+	err = mthca_check_fmr(fmr, page_list, list_len, iova);
+	if (err)
+		return err;
+
+	++fmr->maps;
+
+	key = arbel_key_to_hw_index(fmr->ibmr.lkey);
+	if (dev->mthca_flags & MTHCA_FLAG_SINAI_OPT)
+		key += SINAI_FMR_KEY_INC;
+	else
+		key += dev->limits.num_mpts;
+	fmr->ibmr.lkey = fmr->ibmr.rkey = arbel_hw_index_to_key(key);
+
+	*(u8 *) fmr->mem.arbel.mpt = MTHCA_MPT_STATUS_SW;
+
+	wmb();
+
+	dma_sync_single_for_cpu(&dev->pdev->dev, fmr->mem.arbel.dma_handle,
+				list_len * sizeof(u64), DMA_TO_DEVICE);
+
+	for (i = 0; i < list_len; ++i)
+		fmr->mem.arbel.mtts[i] = cpu_to_be64(page_list[i] |
+						     MTHCA_MTT_FLAG_PRESENT);
+
+	dma_sync_single_for_device(&dev->pdev->dev, fmr->mem.arbel.dma_handle,
+				   list_len * sizeof(u64), DMA_TO_DEVICE);
+
+	fmr->mem.arbel.mpt->key    = cpu_to_be32(key);
+	fmr->mem.arbel.mpt->lkey   = cpu_to_be32(key);
+	fmr->mem.arbel.mpt->length = cpu_to_be64(list_len * (1ull << fmr->attr.page_shift));
+	fmr->mem.arbel.mpt->start  = cpu_to_be64(iova);
+
+	wmb();
+
+	*(u8 *) fmr->mem.arbel.mpt = MTHCA_MPT_STATUS_HW;
+
+	wmb();
+
+	return 0;
+}
+
+void mthca_tavor_fmr_unmap(struct mthca_dev *dev, struct mthca_fmr *fmr)
+{
+	if (!fmr->maps)
+		return;
+
+	fmr->maps = 0;
+
+	writeb(MTHCA_MPT_STATUS_SW, fmr->mem.tavor.mpt);
+}
+
+void mthca_arbel_fmr_unmap(struct mthca_dev *dev, struct mthca_fmr *fmr)
+{
+	if (!fmr->maps)
+		return;
+
+	fmr->maps = 0;
+
+	*(u8 *) fmr->mem.arbel.mpt = MTHCA_MPT_STATUS_SW;
+}
+
+int mthca_init_mr_table(struct mthca_dev *dev)
+{
+	phys_addr_t addr;
+	int mpts, mtts, err, i;
+
+	err = mthca_alloc_init(&dev->mr_table.mpt_alloc,
+			       dev->limits.num_mpts,
+			       ~0, dev->limits.reserved_mrws);
+	if (err)
+		return err;
+
+	if (!mthca_is_memfree(dev) &&
+	    (dev->mthca_flags & MTHCA_FLAG_DDR_HIDDEN))
+		dev->limits.fmr_reserved_mtts = 0;
+	else
+		dev->mthca_flags |= MTHCA_FLAG_FMR;
+
+	if (dev->mthca_flags & MTHCA_FLAG_SINAI_OPT)
+		mthca_dbg(dev, "Memory key throughput optimization activated.\n");
+
+	err = mthca_buddy_init(&dev->mr_table.mtt_buddy,
+			       fls(dev->limits.num_mtt_segs - 1));
+
+	if (err)
+		goto err_mtt_buddy;
+
+	dev->mr_table.tavor_fmr.mpt_base = NULL;
+	dev->mr_table.tavor_fmr.mtt_base = NULL;
+
+	if (dev->limits.fmr_reserved_mtts) {
+		i = fls(dev->limits.fmr_reserved_mtts - 1);
+
+		if (i >= 31) {
+			mthca_warn(dev, "Unable to reserve 2^31 FMR MTTs.\n");
+			err = -EINVAL;
+			goto err_fmr_mpt;
+		}
+		mpts = mtts = 1 << i;
+	} else {
+		mtts = dev->limits.num_mtt_segs;
+		mpts = dev->limits.num_mpts;
+	}
+
+	if (!mthca_is_memfree(dev) &&
+	    (dev->mthca_flags & MTHCA_FLAG_FMR)) {
+
+		addr = pci_resource_start(dev->pdev, 4) +
+			((pci_resource_len(dev->pdev, 4) - 1) &
+			 dev->mr_table.mpt_base);
+
+		dev->mr_table.tavor_fmr.mpt_base =
+			ioremap(addr, mpts * sizeof(struct mthca_mpt_entry));
+
+		if (!dev->mr_table.tavor_fmr.mpt_base) {
+			mthca_warn(dev, "MPT ioremap for FMR failed.\n");
+			err = -ENOMEM;
+			goto err_fmr_mpt;
+		}
+
+		addr = pci_resource_start(dev->pdev, 4) +
+			((pci_resource_len(dev->pdev, 4) - 1) &
+			 dev->mr_table.mtt_base);
+
+		dev->mr_table.tavor_fmr.mtt_base =
+			ioremap(addr, mtts * dev->limits.mtt_seg_size);
+		if (!dev->mr_table.tavor_fmr.mtt_base) {
+			mthca_warn(dev, "MTT ioremap for FMR failed.\n");
+			err = -ENOMEM;
+			goto err_fmr_mtt;
+		}
+	}
+
+	if (dev->limits.fmr_reserved_mtts) {
+		err = mthca_buddy_init(&dev->mr_table.tavor_fmr.mtt_buddy, fls(mtts - 1));
+		if (err)
+			goto err_fmr_mtt_buddy;
+
+		/* Prevent regular MRs from using FMR keys */
+		err = mthca_buddy_alloc(&dev->mr_table.mtt_buddy, fls(mtts - 1));
+		if (err)
+			goto err_reserve_fmr;
+
+		dev->mr_table.fmr_mtt_buddy =
+			&dev->mr_table.tavor_fmr.mtt_buddy;
+	} else
+		dev->mr_table.fmr_mtt_buddy = &dev->mr_table.mtt_buddy;
+
+	/* FMR table is always the first, take reserved MTTs out of there */
+	if (dev->limits.reserved_mtts) {
+		i = fls(dev->limits.reserved_mtts - 1);
+
+		if (mthca_alloc_mtt_range(dev, i,
+					  dev->mr_table.fmr_mtt_buddy) == -1) {
+			mthca_warn(dev, "MTT table of order %d is too small.\n",
+				  dev->mr_table.fmr_mtt_buddy->max_order);
+			err = -ENOMEM;
+			goto err_reserve_mtts;
+		}
+	}
+
+	return 0;
+
+err_reserve_mtts:
+err_reserve_fmr:
+	if (dev->limits.fmr_reserved_mtts)
+		mthca_buddy_cleanup(&dev->mr_table.tavor_fmr.mtt_buddy);
+
+err_fmr_mtt_buddy:
+	if (dev->mr_table.tavor_fmr.mtt_base)
+		iounmap(dev->mr_table.tavor_fmr.mtt_base);
+
+err_fmr_mtt:
+	if (dev->mr_table.tavor_fmr.mpt_base)
+		iounmap(dev->mr_table.tavor_fmr.mpt_base);
+
+err_fmr_mpt:
+	mthca_buddy_cleanup(&dev->mr_table.mtt_buddy);
+
+err_mtt_buddy:
+	mthca_alloc_cleanup(&dev->mr_table.mpt_alloc);
+
+	return err;
+}
+
+void mthca_cleanup_mr_table(struct mthca_dev *dev)
+{
+	/* XXX check if any MRs are still allocated? */
+	if (dev->limits.fmr_reserved_mtts)
+		mthca_buddy_cleanup(&dev->mr_table.tavor_fmr.mtt_buddy);
+
+	mthca_buddy_cleanup(&dev->mr_table.mtt_buddy);
+
+	if (dev->mr_table.tavor_fmr.mtt_base)
+		iounmap(dev->mr_table.tavor_fmr.mtt_base);
+	if (dev->mr_table.tavor_fmr.mpt_base)
+		iounmap(dev->mr_table.tavor_fmr.mpt_base);
+
+	mthca_alloc_cleanup(&dev->mr_table.mpt_alloc);
+}
diff --git a/drivers/infiniband/hw/mthca/mthca_pd.c b/drivers/infiniband/hw/mthca/mthca_pd.c
new file mode 100644
index 0000000..266f14e
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_pd.c
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Cisco Systems.  All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+
+#include "mthca_dev.h"
+
+int mthca_pd_alloc(struct mthca_dev *dev, int privileged, struct mthca_pd *pd)
+{
+	int err = 0;
+
+	pd->privileged = privileged;
+
+	atomic_set(&pd->sqp_count, 0);
+	pd->pd_num = mthca_alloc(&dev->pd_table.alloc);
+	if (pd->pd_num == -1)
+		return -ENOMEM;
+
+	if (privileged) {
+		err = mthca_mr_alloc_notrans(dev, pd->pd_num,
+					     MTHCA_MPT_FLAG_LOCAL_READ |
+					     MTHCA_MPT_FLAG_LOCAL_WRITE,
+					     &pd->ntmr);
+		if (err)
+			mthca_free(&dev->pd_table.alloc, pd->pd_num);
+	}
+
+	return err;
+}
+
+void mthca_pd_free(struct mthca_dev *dev, struct mthca_pd *pd)
+{
+	if (pd->privileged)
+		mthca_free_mr(dev, &pd->ntmr);
+	mthca_free(&dev->pd_table.alloc, pd->pd_num);
+}
+
+int mthca_init_pd_table(struct mthca_dev *dev)
+{
+	return mthca_alloc_init(&dev->pd_table.alloc,
+				dev->limits.num_pds,
+				(1 << 24) - 1,
+				dev->limits.reserved_pds);
+}
+
+void mthca_cleanup_pd_table(struct mthca_dev *dev)
+{
+	/* XXX check if any PDs are still allocated? */
+	mthca_alloc_cleanup(&dev->pd_table.alloc);
+}
diff --git a/drivers/infiniband/hw/mthca/mthca_profile.c b/drivers/infiniband/hw/mthca/mthca_profile.c
new file mode 100644
index 0000000..8edb28a
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_profile.c
@@ -0,0 +1,285 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+
+#include "mthca_profile.h"
+
+enum {
+	MTHCA_RES_QP,
+	MTHCA_RES_EEC,
+	MTHCA_RES_SRQ,
+	MTHCA_RES_CQ,
+	MTHCA_RES_EQP,
+	MTHCA_RES_EEEC,
+	MTHCA_RES_EQ,
+	MTHCA_RES_RDB,
+	MTHCA_RES_MCG,
+	MTHCA_RES_MPT,
+	MTHCA_RES_MTT,
+	MTHCA_RES_UAR,
+	MTHCA_RES_UDAV,
+	MTHCA_RES_UARC,
+	MTHCA_RES_NUM
+};
+
+enum {
+	MTHCA_NUM_EQS = 32,
+	MTHCA_NUM_PDS = 1 << 15
+};
+
+s64 mthca_make_profile(struct mthca_dev *dev,
+		       struct mthca_profile *request,
+		       struct mthca_dev_lim *dev_lim,
+		       struct mthca_init_hca_param *init_hca)
+{
+	struct mthca_resource {
+		u64 size;
+		u64 start;
+		int type;
+		int num;
+		int log_num;
+	};
+
+	u64 mem_base, mem_avail;
+	s64 total_size = 0;
+	struct mthca_resource *profile;
+	struct mthca_resource tmp;
+	int i, j;
+
+	profile = kzalloc(MTHCA_RES_NUM * sizeof *profile, GFP_KERNEL);
+	if (!profile)
+		return -ENOMEM;
+
+	profile[MTHCA_RES_QP].size   = dev_lim->qpc_entry_sz;
+	profile[MTHCA_RES_EEC].size  = dev_lim->eec_entry_sz;
+	profile[MTHCA_RES_SRQ].size  = dev_lim->srq_entry_sz;
+	profile[MTHCA_RES_CQ].size   = dev_lim->cqc_entry_sz;
+	profile[MTHCA_RES_EQP].size  = dev_lim->eqpc_entry_sz;
+	profile[MTHCA_RES_EEEC].size = dev_lim->eeec_entry_sz;
+	profile[MTHCA_RES_EQ].size   = dev_lim->eqc_entry_sz;
+	profile[MTHCA_RES_RDB].size  = MTHCA_RDB_ENTRY_SIZE;
+	profile[MTHCA_RES_MCG].size  = MTHCA_MGM_ENTRY_SIZE;
+	profile[MTHCA_RES_MPT].size  = dev_lim->mpt_entry_sz;
+	profile[MTHCA_RES_MTT].size  = dev->limits.mtt_seg_size;
+	profile[MTHCA_RES_UAR].size  = dev_lim->uar_scratch_entry_sz;
+	profile[MTHCA_RES_UDAV].size = MTHCA_AV_SIZE;
+	profile[MTHCA_RES_UARC].size = request->uarc_size;
+
+	profile[MTHCA_RES_QP].num    = request->num_qp;
+	profile[MTHCA_RES_SRQ].num   = request->num_srq;
+	profile[MTHCA_RES_EQP].num   = request->num_qp;
+	profile[MTHCA_RES_RDB].num   = request->num_qp * request->rdb_per_qp;
+	profile[MTHCA_RES_CQ].num    = request->num_cq;
+	profile[MTHCA_RES_EQ].num    = MTHCA_NUM_EQS;
+	profile[MTHCA_RES_MCG].num   = request->num_mcg;
+	profile[MTHCA_RES_MPT].num   = request->num_mpt;
+	profile[MTHCA_RES_MTT].num   = request->num_mtt;
+	profile[MTHCA_RES_UAR].num   = request->num_uar;
+	profile[MTHCA_RES_UARC].num  = request->num_uar;
+	profile[MTHCA_RES_UDAV].num  = request->num_udav;
+
+	for (i = 0; i < MTHCA_RES_NUM; ++i) {
+		profile[i].type     = i;
+		profile[i].log_num  = max(ffs(profile[i].num) - 1, 0);
+		profile[i].size    *= profile[i].num;
+		if (mthca_is_memfree(dev))
+			profile[i].size = max(profile[i].size, (u64) PAGE_SIZE);
+	}
+
+	if (mthca_is_memfree(dev)) {
+		mem_base  = 0;
+		mem_avail = dev_lim->hca.arbel.max_icm_sz;
+	} else {
+		mem_base  = dev->ddr_start;
+		mem_avail = dev->fw.tavor.fw_start - dev->ddr_start;
+	}
+
+	/*
+	 * Sort the resources in decreasing order of size.  Since they
+	 * all have sizes that are powers of 2, we'll be able to keep
+	 * resources aligned to their size and pack them without gaps
+	 * using the sorted order.
+	 */
+	for (i = MTHCA_RES_NUM; i > 0; --i)
+		for (j = 1; j < i; ++j) {
+			if (profile[j].size > profile[j - 1].size) {
+				tmp            = profile[j];
+				profile[j]     = profile[j - 1];
+				profile[j - 1] = tmp;
+			}
+		}
+
+	for (i = 0; i < MTHCA_RES_NUM; ++i) {
+		if (profile[i].size) {
+			profile[i].start = mem_base + total_size;
+			total_size      += profile[i].size;
+		}
+		if (total_size > mem_avail) {
+			mthca_err(dev, "Profile requires 0x%llx bytes; "
+				  "won't fit in 0x%llx bytes of context memory.\n",
+				  (unsigned long long) total_size,
+				  (unsigned long long) mem_avail);
+			kfree(profile);
+			return -ENOMEM;
+		}
+
+		if (profile[i].size)
+			mthca_dbg(dev, "profile[%2d]--%2d/%2d @ 0x%16llx "
+				  "(size 0x%8llx)\n",
+				  i, profile[i].type, profile[i].log_num,
+				  (unsigned long long) profile[i].start,
+				  (unsigned long long) profile[i].size);
+	}
+
+	if (mthca_is_memfree(dev))
+		mthca_dbg(dev, "HCA context memory: reserving %d KB\n",
+			  (int) (total_size >> 10));
+	else
+		mthca_dbg(dev, "HCA memory: allocated %d KB/%d KB (%d KB free)\n",
+			  (int) (total_size >> 10), (int) (mem_avail >> 10),
+			  (int) ((mem_avail - total_size) >> 10));
+
+	for (i = 0; i < MTHCA_RES_NUM; ++i) {
+		switch (profile[i].type) {
+		case MTHCA_RES_QP:
+			dev->limits.num_qps   = profile[i].num;
+			init_hca->qpc_base    = profile[i].start;
+			init_hca->log_num_qps = profile[i].log_num;
+			break;
+		case MTHCA_RES_EEC:
+			dev->limits.num_eecs   = profile[i].num;
+			init_hca->eec_base     = profile[i].start;
+			init_hca->log_num_eecs = profile[i].log_num;
+			break;
+		case MTHCA_RES_SRQ:
+			dev->limits.num_srqs   = profile[i].num;
+			init_hca->srqc_base    = profile[i].start;
+			init_hca->log_num_srqs = profile[i].log_num;
+			break;
+		case MTHCA_RES_CQ:
+			dev->limits.num_cqs   = profile[i].num;
+			init_hca->cqc_base    = profile[i].start;
+			init_hca->log_num_cqs = profile[i].log_num;
+			break;
+		case MTHCA_RES_EQP:
+			init_hca->eqpc_base = profile[i].start;
+			break;
+		case MTHCA_RES_EEEC:
+			init_hca->eeec_base = profile[i].start;
+			break;
+		case MTHCA_RES_EQ:
+			dev->limits.num_eqs   = profile[i].num;
+			init_hca->eqc_base    = profile[i].start;
+			init_hca->log_num_eqs = profile[i].log_num;
+			break;
+		case MTHCA_RES_RDB:
+			for (dev->qp_table.rdb_shift = 0;
+			     request->num_qp << dev->qp_table.rdb_shift < profile[i].num;
+			     ++dev->qp_table.rdb_shift)
+				; /* nothing */
+			dev->qp_table.rdb_base    = (u32) profile[i].start;
+			init_hca->rdb_base        = profile[i].start;
+			break;
+		case MTHCA_RES_MCG:
+			dev->limits.num_mgms      = profile[i].num >> 1;
+			dev->limits.num_amgms     = profile[i].num >> 1;
+			init_hca->mc_base         = profile[i].start;
+			init_hca->log_mc_entry_sz = ffs(MTHCA_MGM_ENTRY_SIZE) - 1;
+			init_hca->log_mc_table_sz = profile[i].log_num;
+			init_hca->mc_hash_sz      = 1 << (profile[i].log_num - 1);
+			break;
+		case MTHCA_RES_MPT:
+			dev->limits.num_mpts   = profile[i].num;
+			dev->mr_table.mpt_base = profile[i].start;
+			init_hca->mpt_base     = profile[i].start;
+			init_hca->log_mpt_sz   = profile[i].log_num;
+			break;
+		case MTHCA_RES_MTT:
+			dev->limits.num_mtt_segs = profile[i].num;
+			dev->mr_table.mtt_base   = profile[i].start;
+			init_hca->mtt_base       = profile[i].start;
+			init_hca->mtt_seg_sz     = ffs(dev->limits.mtt_seg_size) - 7;
+			break;
+		case MTHCA_RES_UAR:
+			dev->limits.num_uars       = profile[i].num;
+			init_hca->uar_scratch_base = profile[i].start;
+			break;
+		case MTHCA_RES_UDAV:
+			dev->av_table.ddr_av_base = profile[i].start;
+			dev->av_table.num_ddr_avs = profile[i].num;
+			break;
+		case MTHCA_RES_UARC:
+			dev->uar_table.uarc_size = request->uarc_size;
+			dev->uar_table.uarc_base = profile[i].start;
+			init_hca->uarc_base   	 = profile[i].start;
+			init_hca->log_uarc_sz 	 = ffs(request->uarc_size) - 13;
+			init_hca->log_uar_sz  	 = ffs(request->num_uar) - 1;
+			break;
+		default:
+			break;
+		}
+	}
+
+	/*
+	 * PDs don't take any HCA memory, but we assign them as part
+	 * of the HCA profile anyway.
+	 */
+	dev->limits.num_pds = MTHCA_NUM_PDS;
+
+	if (dev->mthca_flags & MTHCA_FLAG_SINAI_OPT &&
+	    init_hca->log_mpt_sz > 23) {
+		mthca_warn(dev, "MPT table too large (requested size 2^%d >= 2^24)\n",
+			   init_hca->log_mpt_sz);
+		mthca_warn(dev, "Disabling memory key throughput optimization.\n");
+		dev->mthca_flags &= ~MTHCA_FLAG_SINAI_OPT;
+	}
+
+	/*
+	 * For Tavor, FMRs use ioremapped PCI memory. For 32 bit
+	 * systems it may use too much vmalloc space to map all MTT
+	 * memory, so we reserve some MTTs for FMR access, taking them
+	 * out of the MR pool. They don't use additional memory, but
+	 * we assign them as part of the HCA profile anyway.
+	 */
+	if (mthca_is_memfree(dev) || BITS_PER_LONG == 64)
+		dev->limits.fmr_reserved_mtts = 0;
+	else
+		dev->limits.fmr_reserved_mtts = request->fmr_reserved_mtts;
+
+	kfree(profile);
+	return total_size;
+}
diff --git a/drivers/infiniband/hw/mthca/mthca_profile.h b/drivers/infiniband/hw/mthca/mthca_profile.h
new file mode 100644
index 0000000..62b009c
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_profile.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MTHCA_PROFILE_H
+#define MTHCA_PROFILE_H
+
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+
+struct mthca_profile {
+	int num_qp;
+	int rdb_per_qp;
+	int num_srq;
+	int num_cq;
+	int num_mcg;
+	int num_mpt;
+	int num_mtt;
+	int num_udav;
+	int num_uar;
+	int uarc_size;
+	int fmr_reserved_mtts;
+};
+
+s64 mthca_make_profile(struct mthca_dev *mdev,
+		       struct mthca_profile *request,
+		       struct mthca_dev_lim *dev_lim,
+		       struct mthca_init_hca_param *init_hca);
+
+#endif /* MTHCA_PROFILE_H */
diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c
new file mode 100644
index 0000000..415f8e1
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_provider.c
@@ -0,0 +1,1375 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005, 2006 Cisco Systems.  All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <rdma/ib_smi.h>
+#include <rdma/ib_umem.h>
+#include <rdma/ib_user_verbs.h>
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/stat.h>
+#include <linux/mm.h>
+#include <linux/export.h>
+
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+#include "mthca_user.h"
+#include "mthca_memfree.h"
+
+static void init_query_mad(struct ib_smp *mad)
+{
+	mad->base_version  = 1;
+	mad->mgmt_class    = IB_MGMT_CLASS_SUBN_LID_ROUTED;
+	mad->class_version = 1;
+	mad->method    	   = IB_MGMT_METHOD_GET;
+}
+
+static int mthca_query_device(struct ib_device *ibdev,
+			      struct ib_device_attr *props)
+{
+	struct ib_smp *in_mad  = NULL;
+	struct ib_smp *out_mad = NULL;
+	int err = -ENOMEM;
+	struct mthca_dev *mdev = to_mdev(ibdev);
+
+	in_mad  = kzalloc(sizeof *in_mad, GFP_KERNEL);
+	out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+	if (!in_mad || !out_mad)
+		goto out;
+
+	memset(props, 0, sizeof *props);
+
+	props->fw_ver              = mdev->fw_ver;
+
+	init_query_mad(in_mad);
+	in_mad->attr_id = IB_SMP_ATTR_NODE_INFO;
+
+	err = mthca_MAD_IFC(mdev, 1, 1,
+			    1, NULL, NULL, in_mad, out_mad);
+	if (err)
+		goto out;
+
+	props->device_cap_flags    = mdev->device_cap_flags;
+	props->vendor_id           = be32_to_cpup((__be32 *) (out_mad->data + 36)) &
+		0xffffff;
+	props->vendor_part_id      = be16_to_cpup((__be16 *) (out_mad->data + 30));
+	props->hw_ver              = be32_to_cpup((__be32 *) (out_mad->data + 32));
+	memcpy(&props->sys_image_guid, out_mad->data +  4, 8);
+
+	props->max_mr_size         = ~0ull;
+	props->page_size_cap       = mdev->limits.page_size_cap;
+	props->max_qp              = mdev->limits.num_qps - mdev->limits.reserved_qps;
+	props->max_qp_wr           = mdev->limits.max_wqes;
+	props->max_sge             = mdev->limits.max_sg;
+	props->max_cq              = mdev->limits.num_cqs - mdev->limits.reserved_cqs;
+	props->max_cqe             = mdev->limits.max_cqes;
+	props->max_mr              = mdev->limits.num_mpts - mdev->limits.reserved_mrws;
+	props->max_pd              = mdev->limits.num_pds - mdev->limits.reserved_pds;
+	props->max_qp_rd_atom      = 1 << mdev->qp_table.rdb_shift;
+	props->max_qp_init_rd_atom = mdev->limits.max_qp_init_rdma;
+	props->max_res_rd_atom     = props->max_qp_rd_atom * props->max_qp;
+	props->max_srq             = mdev->limits.num_srqs - mdev->limits.reserved_srqs;
+	props->max_srq_wr          = mdev->limits.max_srq_wqes;
+	props->max_srq_sge         = mdev->limits.max_srq_sge;
+	props->local_ca_ack_delay  = mdev->limits.local_ca_ack_delay;
+	props->atomic_cap          = mdev->limits.flags & DEV_LIM_FLAG_ATOMIC ?
+					IB_ATOMIC_HCA : IB_ATOMIC_NONE;
+	props->max_pkeys           = mdev->limits.pkey_table_len;
+	props->max_mcast_grp       = mdev->limits.num_mgms + mdev->limits.num_amgms;
+	props->max_mcast_qp_attach = MTHCA_QP_PER_MGM;
+	props->max_total_mcast_qp_attach = props->max_mcast_qp_attach *
+					   props->max_mcast_grp;
+	/*
+	 * If Sinai memory key optimization is being used, then only
+	 * the 8-bit key portion will change.  For other HCAs, the
+	 * unused index bits will also be used for FMR remapping.
+	 */
+	if (mdev->mthca_flags & MTHCA_FLAG_SINAI_OPT)
+		props->max_map_per_fmr = 255;
+	else
+		props->max_map_per_fmr =
+			(1 << (32 - ilog2(mdev->limits.num_mpts))) - 1;
+
+	err = 0;
+ out:
+	kfree(in_mad);
+	kfree(out_mad);
+	return err;
+}
+
+static int mthca_query_port(struct ib_device *ibdev,
+			    u8 port, struct ib_port_attr *props)
+{
+	struct ib_smp *in_mad  = NULL;
+	struct ib_smp *out_mad = NULL;
+	int err = -ENOMEM;
+
+	in_mad  = kzalloc(sizeof *in_mad, GFP_KERNEL);
+	out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+	if (!in_mad || !out_mad)
+		goto out;
+
+	memset(props, 0, sizeof *props);
+
+	init_query_mad(in_mad);
+	in_mad->attr_id  = IB_SMP_ATTR_PORT_INFO;
+	in_mad->attr_mod = cpu_to_be32(port);
+
+	err = mthca_MAD_IFC(to_mdev(ibdev), 1, 1,
+			    port, NULL, NULL, in_mad, out_mad);
+	if (err)
+		goto out;
+
+	props->lid               = be16_to_cpup((__be16 *) (out_mad->data + 16));
+	props->lmc               = out_mad->data[34] & 0x7;
+	props->sm_lid            = be16_to_cpup((__be16 *) (out_mad->data + 18));
+	props->sm_sl             = out_mad->data[36] & 0xf;
+	props->state             = out_mad->data[32] & 0xf;
+	props->phys_state        = out_mad->data[33] >> 4;
+	props->port_cap_flags    = be32_to_cpup((__be32 *) (out_mad->data + 20));
+	props->gid_tbl_len       = to_mdev(ibdev)->limits.gid_table_len;
+	props->max_msg_sz        = 0x80000000;
+	props->pkey_tbl_len      = to_mdev(ibdev)->limits.pkey_table_len;
+	props->bad_pkey_cntr     = be16_to_cpup((__be16 *) (out_mad->data + 46));
+	props->qkey_viol_cntr    = be16_to_cpup((__be16 *) (out_mad->data + 48));
+	props->active_width      = out_mad->data[31] & 0xf;
+	props->active_speed      = out_mad->data[35] >> 4;
+	props->max_mtu           = out_mad->data[41] & 0xf;
+	props->active_mtu        = out_mad->data[36] >> 4;
+	props->subnet_timeout    = out_mad->data[51] & 0x1f;
+	props->max_vl_num        = out_mad->data[37] >> 4;
+	props->init_type_reply   = out_mad->data[41] >> 4;
+
+ out:
+	kfree(in_mad);
+	kfree(out_mad);
+	return err;
+}
+
+static int mthca_modify_device(struct ib_device *ibdev,
+			       int mask,
+			       struct ib_device_modify *props)
+{
+	if (mask & ~IB_DEVICE_MODIFY_NODE_DESC)
+		return -EOPNOTSUPP;
+
+	if (mask & IB_DEVICE_MODIFY_NODE_DESC) {
+		if (mutex_lock_interruptible(&to_mdev(ibdev)->cap_mask_mutex))
+			return -ERESTARTSYS;
+		memcpy(ibdev->node_desc, props->node_desc, 64);
+		mutex_unlock(&to_mdev(ibdev)->cap_mask_mutex);
+	}
+
+	return 0;
+}
+
+static int mthca_modify_port(struct ib_device *ibdev,
+			     u8 port, int port_modify_mask,
+			     struct ib_port_modify *props)
+{
+	struct mthca_set_ib_param set_ib;
+	struct ib_port_attr attr;
+	int err;
+
+	if (mutex_lock_interruptible(&to_mdev(ibdev)->cap_mask_mutex))
+		return -ERESTARTSYS;
+
+	err = mthca_query_port(ibdev, port, &attr);
+	if (err)
+		goto out;
+
+	set_ib.set_si_guid     = 0;
+	set_ib.reset_qkey_viol = !!(port_modify_mask & IB_PORT_RESET_QKEY_CNTR);
+
+	set_ib.cap_mask = (attr.port_cap_flags | props->set_port_cap_mask) &
+		~props->clr_port_cap_mask;
+
+	err = mthca_SET_IB(to_mdev(ibdev), &set_ib, port);
+	if (err)
+		goto out;
+out:
+	mutex_unlock(&to_mdev(ibdev)->cap_mask_mutex);
+	return err;
+}
+
+static int mthca_query_pkey(struct ib_device *ibdev,
+			    u8 port, u16 index, u16 *pkey)
+{
+	struct ib_smp *in_mad  = NULL;
+	struct ib_smp *out_mad = NULL;
+	int err = -ENOMEM;
+
+	in_mad  = kzalloc(sizeof *in_mad, GFP_KERNEL);
+	out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+	if (!in_mad || !out_mad)
+		goto out;
+
+	init_query_mad(in_mad);
+	in_mad->attr_id  = IB_SMP_ATTR_PKEY_TABLE;
+	in_mad->attr_mod = cpu_to_be32(index / 32);
+
+	err = mthca_MAD_IFC(to_mdev(ibdev), 1, 1,
+			    port, NULL, NULL, in_mad, out_mad);
+	if (err)
+		goto out;
+
+	*pkey = be16_to_cpu(((__be16 *) out_mad->data)[index % 32]);
+
+ out:
+	kfree(in_mad);
+	kfree(out_mad);
+	return err;
+}
+
+static int mthca_query_gid(struct ib_device *ibdev, u8 port,
+			   int index, union ib_gid *gid)
+{
+	struct ib_smp *in_mad  = NULL;
+	struct ib_smp *out_mad = NULL;
+	int err = -ENOMEM;
+
+	in_mad  = kzalloc(sizeof *in_mad, GFP_KERNEL);
+	out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+	if (!in_mad || !out_mad)
+		goto out;
+
+	init_query_mad(in_mad);
+	in_mad->attr_id  = IB_SMP_ATTR_PORT_INFO;
+	in_mad->attr_mod = cpu_to_be32(port);
+
+	err = mthca_MAD_IFC(to_mdev(ibdev), 1, 1,
+			    port, NULL, NULL, in_mad, out_mad);
+	if (err)
+		goto out;
+
+	memcpy(gid->raw, out_mad->data + 8, 8);
+
+	init_query_mad(in_mad);
+	in_mad->attr_id  = IB_SMP_ATTR_GUID_INFO;
+	in_mad->attr_mod = cpu_to_be32(index / 8);
+
+	err = mthca_MAD_IFC(to_mdev(ibdev), 1, 1,
+			    port, NULL, NULL, in_mad, out_mad);
+	if (err)
+		goto out;
+
+	memcpy(gid->raw + 8, out_mad->data + (index % 8) * 8, 8);
+
+ out:
+	kfree(in_mad);
+	kfree(out_mad);
+	return err;
+}
+
+static struct ib_ucontext *mthca_alloc_ucontext(struct ib_device *ibdev,
+						struct ib_udata *udata)
+{
+	struct mthca_alloc_ucontext_resp uresp;
+	struct mthca_ucontext           *context;
+	int                              err;
+
+	if (!(to_mdev(ibdev)->active))
+		return ERR_PTR(-EAGAIN);
+
+	memset(&uresp, 0, sizeof uresp);
+
+	uresp.qp_tab_size = to_mdev(ibdev)->limits.num_qps;
+	if (mthca_is_memfree(to_mdev(ibdev)))
+		uresp.uarc_size = to_mdev(ibdev)->uar_table.uarc_size;
+	else
+		uresp.uarc_size = 0;
+
+	context = kmalloc(sizeof *context, GFP_KERNEL);
+	if (!context)
+		return ERR_PTR(-ENOMEM);
+
+	err = mthca_uar_alloc(to_mdev(ibdev), &context->uar);
+	if (err) {
+		kfree(context);
+		return ERR_PTR(err);
+	}
+
+	context->db_tab = mthca_init_user_db_tab(to_mdev(ibdev));
+	if (IS_ERR(context->db_tab)) {
+		err = PTR_ERR(context->db_tab);
+		mthca_uar_free(to_mdev(ibdev), &context->uar);
+		kfree(context);
+		return ERR_PTR(err);
+	}
+
+	if (ib_copy_to_udata(udata, &uresp, sizeof uresp)) {
+		mthca_cleanup_user_db_tab(to_mdev(ibdev), &context->uar, context->db_tab);
+		mthca_uar_free(to_mdev(ibdev), &context->uar);
+		kfree(context);
+		return ERR_PTR(-EFAULT);
+	}
+
+	context->reg_mr_warned = 0;
+
+	return &context->ibucontext;
+}
+
+static int mthca_dealloc_ucontext(struct ib_ucontext *context)
+{
+	mthca_cleanup_user_db_tab(to_mdev(context->device), &to_mucontext(context)->uar,
+				  to_mucontext(context)->db_tab);
+	mthca_uar_free(to_mdev(context->device), &to_mucontext(context)->uar);
+	kfree(to_mucontext(context));
+
+	return 0;
+}
+
+static int mthca_mmap_uar(struct ib_ucontext *context,
+			  struct vm_area_struct *vma)
+{
+	if (vma->vm_end - vma->vm_start != PAGE_SIZE)
+		return -EINVAL;
+
+	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+
+	if (io_remap_pfn_range(vma, vma->vm_start,
+			       to_mucontext(context)->uar.pfn,
+			       PAGE_SIZE, vma->vm_page_prot))
+		return -EAGAIN;
+
+	return 0;
+}
+
+static struct ib_pd *mthca_alloc_pd(struct ib_device *ibdev,
+				    struct ib_ucontext *context,
+				    struct ib_udata *udata)
+{
+	struct mthca_pd *pd;
+	int err;
+
+	pd = kmalloc(sizeof *pd, GFP_KERNEL);
+	if (!pd)
+		return ERR_PTR(-ENOMEM);
+
+	err = mthca_pd_alloc(to_mdev(ibdev), !context, pd);
+	if (err) {
+		kfree(pd);
+		return ERR_PTR(err);
+	}
+
+	if (context) {
+		if (ib_copy_to_udata(udata, &pd->pd_num, sizeof (__u32))) {
+			mthca_pd_free(to_mdev(ibdev), pd);
+			kfree(pd);
+			return ERR_PTR(-EFAULT);
+		}
+	}
+
+	return &pd->ibpd;
+}
+
+static int mthca_dealloc_pd(struct ib_pd *pd)
+{
+	mthca_pd_free(to_mdev(pd->device), to_mpd(pd));
+	kfree(pd);
+
+	return 0;
+}
+
+static struct ib_ah *mthca_ah_create(struct ib_pd *pd,
+				     struct ib_ah_attr *ah_attr)
+{
+	int err;
+	struct mthca_ah *ah;
+
+	ah = kmalloc(sizeof *ah, GFP_ATOMIC);
+	if (!ah)
+		return ERR_PTR(-ENOMEM);
+
+	err = mthca_create_ah(to_mdev(pd->device), to_mpd(pd), ah_attr, ah);
+	if (err) {
+		kfree(ah);
+		return ERR_PTR(err);
+	}
+
+	return &ah->ibah;
+}
+
+static int mthca_ah_destroy(struct ib_ah *ah)
+{
+	mthca_destroy_ah(to_mdev(ah->device), to_mah(ah));
+	kfree(ah);
+
+	return 0;
+}
+
+static struct ib_srq *mthca_create_srq(struct ib_pd *pd,
+				       struct ib_srq_init_attr *init_attr,
+				       struct ib_udata *udata)
+{
+	struct mthca_create_srq ucmd;
+	struct mthca_ucontext *context = NULL;
+	struct mthca_srq *srq;
+	int err;
+
+	if (init_attr->srq_type != IB_SRQT_BASIC)
+		return ERR_PTR(-ENOSYS);
+
+	srq = kmalloc(sizeof *srq, GFP_KERNEL);
+	if (!srq)
+		return ERR_PTR(-ENOMEM);
+
+	if (pd->uobject) {
+		context = to_mucontext(pd->uobject->context);
+
+		if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) {
+			err = -EFAULT;
+			goto err_free;
+		}
+
+		err = mthca_map_user_db(to_mdev(pd->device), &context->uar,
+					context->db_tab, ucmd.db_index,
+					ucmd.db_page);
+
+		if (err)
+			goto err_free;
+
+		srq->mr.ibmr.lkey = ucmd.lkey;
+		srq->db_index     = ucmd.db_index;
+	}
+
+	err = mthca_alloc_srq(to_mdev(pd->device), to_mpd(pd),
+			      &init_attr->attr, srq);
+
+	if (err && pd->uobject)
+		mthca_unmap_user_db(to_mdev(pd->device), &context->uar,
+				    context->db_tab, ucmd.db_index);
+
+	if (err)
+		goto err_free;
+
+	if (context && ib_copy_to_udata(udata, &srq->srqn, sizeof (__u32))) {
+		mthca_free_srq(to_mdev(pd->device), srq);
+		err = -EFAULT;
+		goto err_free;
+	}
+
+	return &srq->ibsrq;
+
+err_free:
+	kfree(srq);
+
+	return ERR_PTR(err);
+}
+
+static int mthca_destroy_srq(struct ib_srq *srq)
+{
+	struct mthca_ucontext *context;
+
+	if (srq->uobject) {
+		context = to_mucontext(srq->uobject->context);
+
+		mthca_unmap_user_db(to_mdev(srq->device), &context->uar,
+				    context->db_tab, to_msrq(srq)->db_index);
+	}
+
+	mthca_free_srq(to_mdev(srq->device), to_msrq(srq));
+	kfree(srq);
+
+	return 0;
+}
+
+static struct ib_qp *mthca_create_qp(struct ib_pd *pd,
+				     struct ib_qp_init_attr *init_attr,
+				     struct ib_udata *udata)
+{
+	struct mthca_create_qp ucmd;
+	struct mthca_qp *qp;
+	int err;
+
+	if (init_attr->create_flags)
+		return ERR_PTR(-EINVAL);
+
+	switch (init_attr->qp_type) {
+	case IB_QPT_RC:
+	case IB_QPT_UC:
+	case IB_QPT_UD:
+	{
+		struct mthca_ucontext *context;
+
+		qp = kmalloc(sizeof *qp, GFP_KERNEL);
+		if (!qp)
+			return ERR_PTR(-ENOMEM);
+
+		if (pd->uobject) {
+			context = to_mucontext(pd->uobject->context);
+
+			if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) {
+				kfree(qp);
+				return ERR_PTR(-EFAULT);
+			}
+
+			err = mthca_map_user_db(to_mdev(pd->device), &context->uar,
+						context->db_tab,
+						ucmd.sq_db_index, ucmd.sq_db_page);
+			if (err) {
+				kfree(qp);
+				return ERR_PTR(err);
+			}
+
+			err = mthca_map_user_db(to_mdev(pd->device), &context->uar,
+						context->db_tab,
+						ucmd.rq_db_index, ucmd.rq_db_page);
+			if (err) {
+				mthca_unmap_user_db(to_mdev(pd->device),
+						    &context->uar,
+						    context->db_tab,
+						    ucmd.sq_db_index);
+				kfree(qp);
+				return ERR_PTR(err);
+			}
+
+			qp->mr.ibmr.lkey = ucmd.lkey;
+			qp->sq.db_index  = ucmd.sq_db_index;
+			qp->rq.db_index  = ucmd.rq_db_index;
+		}
+
+		err = mthca_alloc_qp(to_mdev(pd->device), to_mpd(pd),
+				     to_mcq(init_attr->send_cq),
+				     to_mcq(init_attr->recv_cq),
+				     init_attr->qp_type, init_attr->sq_sig_type,
+				     &init_attr->cap, qp);
+
+		if (err && pd->uobject) {
+			context = to_mucontext(pd->uobject->context);
+
+			mthca_unmap_user_db(to_mdev(pd->device),
+					    &context->uar,
+					    context->db_tab,
+					    ucmd.sq_db_index);
+			mthca_unmap_user_db(to_mdev(pd->device),
+					    &context->uar,
+					    context->db_tab,
+					    ucmd.rq_db_index);
+		}
+
+		qp->ibqp.qp_num = qp->qpn;
+		break;
+	}
+	case IB_QPT_SMI:
+	case IB_QPT_GSI:
+	{
+		/* Don't allow userspace to create special QPs */
+		if (pd->uobject)
+			return ERR_PTR(-EINVAL);
+
+		qp = kmalloc(sizeof (struct mthca_sqp), GFP_KERNEL);
+		if (!qp)
+			return ERR_PTR(-ENOMEM);
+
+		qp->ibqp.qp_num = init_attr->qp_type == IB_QPT_SMI ? 0 : 1;
+
+		err = mthca_alloc_sqp(to_mdev(pd->device), to_mpd(pd),
+				      to_mcq(init_attr->send_cq),
+				      to_mcq(init_attr->recv_cq),
+				      init_attr->sq_sig_type, &init_attr->cap,
+				      qp->ibqp.qp_num, init_attr->port_num,
+				      to_msqp(qp));
+		break;
+	}
+	default:
+		/* Don't support raw QPs */
+		return ERR_PTR(-ENOSYS);
+	}
+
+	if (err) {
+		kfree(qp);
+		return ERR_PTR(err);
+	}
+
+	init_attr->cap.max_send_wr     = qp->sq.max;
+	init_attr->cap.max_recv_wr     = qp->rq.max;
+	init_attr->cap.max_send_sge    = qp->sq.max_gs;
+	init_attr->cap.max_recv_sge    = qp->rq.max_gs;
+	init_attr->cap.max_inline_data = qp->max_inline_data;
+
+	return &qp->ibqp;
+}
+
+static int mthca_destroy_qp(struct ib_qp *qp)
+{
+	if (qp->uobject) {
+		mthca_unmap_user_db(to_mdev(qp->device),
+				    &to_mucontext(qp->uobject->context)->uar,
+				    to_mucontext(qp->uobject->context)->db_tab,
+				    to_mqp(qp)->sq.db_index);
+		mthca_unmap_user_db(to_mdev(qp->device),
+				    &to_mucontext(qp->uobject->context)->uar,
+				    to_mucontext(qp->uobject->context)->db_tab,
+				    to_mqp(qp)->rq.db_index);
+	}
+	mthca_free_qp(to_mdev(qp->device), to_mqp(qp));
+	kfree(qp);
+	return 0;
+}
+
+static struct ib_cq *mthca_create_cq(struct ib_device *ibdev, int entries,
+				     int comp_vector,
+				     struct ib_ucontext *context,
+				     struct ib_udata *udata)
+{
+	struct mthca_create_cq ucmd;
+	struct mthca_cq *cq;
+	int nent;
+	int err;
+
+	if (entries < 1 || entries > to_mdev(ibdev)->limits.max_cqes)
+		return ERR_PTR(-EINVAL);
+
+	if (context) {
+		if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd))
+			return ERR_PTR(-EFAULT);
+
+		err = mthca_map_user_db(to_mdev(ibdev), &to_mucontext(context)->uar,
+					to_mucontext(context)->db_tab,
+					ucmd.set_db_index, ucmd.set_db_page);
+		if (err)
+			return ERR_PTR(err);
+
+		err = mthca_map_user_db(to_mdev(ibdev), &to_mucontext(context)->uar,
+					to_mucontext(context)->db_tab,
+					ucmd.arm_db_index, ucmd.arm_db_page);
+		if (err)
+			goto err_unmap_set;
+	}
+
+	cq = kmalloc(sizeof *cq, GFP_KERNEL);
+	if (!cq) {
+		err = -ENOMEM;
+		goto err_unmap_arm;
+	}
+
+	if (context) {
+		cq->buf.mr.ibmr.lkey = ucmd.lkey;
+		cq->set_ci_db_index  = ucmd.set_db_index;
+		cq->arm_db_index     = ucmd.arm_db_index;
+	}
+
+	for (nent = 1; nent <= entries; nent <<= 1)
+		; /* nothing */
+
+	err = mthca_init_cq(to_mdev(ibdev), nent,
+			    context ? to_mucontext(context) : NULL,
+			    context ? ucmd.pdn : to_mdev(ibdev)->driver_pd.pd_num,
+			    cq);
+	if (err)
+		goto err_free;
+
+	if (context && ib_copy_to_udata(udata, &cq->cqn, sizeof (__u32))) {
+		mthca_free_cq(to_mdev(ibdev), cq);
+		err = -EFAULT;
+		goto err_free;
+	}
+
+	cq->resize_buf = NULL;
+
+	return &cq->ibcq;
+
+err_free:
+	kfree(cq);
+
+err_unmap_arm:
+	if (context)
+		mthca_unmap_user_db(to_mdev(ibdev), &to_mucontext(context)->uar,
+				    to_mucontext(context)->db_tab, ucmd.arm_db_index);
+
+err_unmap_set:
+	if (context)
+		mthca_unmap_user_db(to_mdev(ibdev), &to_mucontext(context)->uar,
+				    to_mucontext(context)->db_tab, ucmd.set_db_index);
+
+	return ERR_PTR(err);
+}
+
+static int mthca_alloc_resize_buf(struct mthca_dev *dev, struct mthca_cq *cq,
+				  int entries)
+{
+	int ret;
+
+	spin_lock_irq(&cq->lock);
+	if (cq->resize_buf) {
+		ret = -EBUSY;
+		goto unlock;
+	}
+
+	cq->resize_buf = kmalloc(sizeof *cq->resize_buf, GFP_ATOMIC);
+	if (!cq->resize_buf) {
+		ret = -ENOMEM;
+		goto unlock;
+	}
+
+	cq->resize_buf->state = CQ_RESIZE_ALLOC;
+
+	ret = 0;
+
+unlock:
+	spin_unlock_irq(&cq->lock);
+
+	if (ret)
+		return ret;
+
+	ret = mthca_alloc_cq_buf(dev, &cq->resize_buf->buf, entries);
+	if (ret) {
+		spin_lock_irq(&cq->lock);
+		kfree(cq->resize_buf);
+		cq->resize_buf = NULL;
+		spin_unlock_irq(&cq->lock);
+		return ret;
+	}
+
+	cq->resize_buf->cqe = entries - 1;
+
+	spin_lock_irq(&cq->lock);
+	cq->resize_buf->state = CQ_RESIZE_READY;
+	spin_unlock_irq(&cq->lock);
+
+	return 0;
+}
+
+static int mthca_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata)
+{
+	struct mthca_dev *dev = to_mdev(ibcq->device);
+	struct mthca_cq *cq = to_mcq(ibcq);
+	struct mthca_resize_cq ucmd;
+	u32 lkey;
+	int ret;
+
+	if (entries < 1 || entries > dev->limits.max_cqes)
+		return -EINVAL;
+
+	mutex_lock(&cq->mutex);
+
+	entries = roundup_pow_of_two(entries + 1);
+	if (entries == ibcq->cqe + 1) {
+		ret = 0;
+		goto out;
+	}
+
+	if (cq->is_kernel) {
+		ret = mthca_alloc_resize_buf(dev, cq, entries);
+		if (ret)
+			goto out;
+		lkey = cq->resize_buf->buf.mr.ibmr.lkey;
+	} else {
+		if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) {
+			ret = -EFAULT;
+			goto out;
+		}
+		lkey = ucmd.lkey;
+	}
+
+	ret = mthca_RESIZE_CQ(dev, cq->cqn, lkey, ilog2(entries));
+
+	if (ret) {
+		if (cq->resize_buf) {
+			mthca_free_cq_buf(dev, &cq->resize_buf->buf,
+					  cq->resize_buf->cqe);
+			kfree(cq->resize_buf);
+			spin_lock_irq(&cq->lock);
+			cq->resize_buf = NULL;
+			spin_unlock_irq(&cq->lock);
+		}
+		goto out;
+	}
+
+	if (cq->is_kernel) {
+		struct mthca_cq_buf tbuf;
+		int tcqe;
+
+		spin_lock_irq(&cq->lock);
+		if (cq->resize_buf->state == CQ_RESIZE_READY) {
+			mthca_cq_resize_copy_cqes(cq);
+			tbuf         = cq->buf;
+			tcqe         = cq->ibcq.cqe;
+			cq->buf      = cq->resize_buf->buf;
+			cq->ibcq.cqe = cq->resize_buf->cqe;
+		} else {
+			tbuf = cq->resize_buf->buf;
+			tcqe = cq->resize_buf->cqe;
+		}
+
+		kfree(cq->resize_buf);
+		cq->resize_buf = NULL;
+		spin_unlock_irq(&cq->lock);
+
+		mthca_free_cq_buf(dev, &tbuf, tcqe);
+	} else
+		ibcq->cqe = entries - 1;
+
+out:
+	mutex_unlock(&cq->mutex);
+
+	return ret;
+}
+
+static int mthca_destroy_cq(struct ib_cq *cq)
+{
+	if (cq->uobject) {
+		mthca_unmap_user_db(to_mdev(cq->device),
+				    &to_mucontext(cq->uobject->context)->uar,
+				    to_mucontext(cq->uobject->context)->db_tab,
+				    to_mcq(cq)->arm_db_index);
+		mthca_unmap_user_db(to_mdev(cq->device),
+				    &to_mucontext(cq->uobject->context)->uar,
+				    to_mucontext(cq->uobject->context)->db_tab,
+				    to_mcq(cq)->set_ci_db_index);
+	}
+	mthca_free_cq(to_mdev(cq->device), to_mcq(cq));
+	kfree(cq);
+
+	return 0;
+}
+
+static inline u32 convert_access(int acc)
+{
+	return (acc & IB_ACCESS_REMOTE_ATOMIC ? MTHCA_MPT_FLAG_ATOMIC       : 0) |
+	       (acc & IB_ACCESS_REMOTE_WRITE  ? MTHCA_MPT_FLAG_REMOTE_WRITE : 0) |
+	       (acc & IB_ACCESS_REMOTE_READ   ? MTHCA_MPT_FLAG_REMOTE_READ  : 0) |
+	       (acc & IB_ACCESS_LOCAL_WRITE   ? MTHCA_MPT_FLAG_LOCAL_WRITE  : 0) |
+	       MTHCA_MPT_FLAG_LOCAL_READ;
+}
+
+static struct ib_mr *mthca_get_dma_mr(struct ib_pd *pd, int acc)
+{
+	struct mthca_mr *mr;
+	int err;
+
+	mr = kmalloc(sizeof *mr, GFP_KERNEL);
+	if (!mr)
+		return ERR_PTR(-ENOMEM);
+
+	err = mthca_mr_alloc_notrans(to_mdev(pd->device),
+				     to_mpd(pd)->pd_num,
+				     convert_access(acc), mr);
+
+	if (err) {
+		kfree(mr);
+		return ERR_PTR(err);
+	}
+
+	mr->umem = NULL;
+
+	return &mr->ibmr;
+}
+
+static struct ib_mr *mthca_reg_phys_mr(struct ib_pd       *pd,
+				       struct ib_phys_buf *buffer_list,
+				       int                 num_phys_buf,
+				       int                 acc,
+				       u64                *iova_start)
+{
+	struct mthca_mr *mr;
+	u64 *page_list;
+	u64 total_size;
+	unsigned long mask;
+	int shift;
+	int npages;
+	int err;
+	int i, j, n;
+
+	mask = buffer_list[0].addr ^ *iova_start;
+	total_size = 0;
+	for (i = 0; i < num_phys_buf; ++i) {
+		if (i != 0)
+			mask |= buffer_list[i].addr;
+		if (i != num_phys_buf - 1)
+			mask |= buffer_list[i].addr + buffer_list[i].size;
+
+		total_size += buffer_list[i].size;
+	}
+
+	if (mask & ~PAGE_MASK)
+		return ERR_PTR(-EINVAL);
+
+	shift = __ffs(mask | 1 << 31);
+
+	buffer_list[0].size += buffer_list[0].addr & ((1ULL << shift) - 1);
+	buffer_list[0].addr &= ~0ull << shift;
+
+	mr = kmalloc(sizeof *mr, GFP_KERNEL);
+	if (!mr)
+		return ERR_PTR(-ENOMEM);
+
+	npages = 0;
+	for (i = 0; i < num_phys_buf; ++i)
+		npages += (buffer_list[i].size + (1ULL << shift) - 1) >> shift;
+
+	if (!npages)
+		return &mr->ibmr;
+
+	page_list = kmalloc(npages * sizeof *page_list, GFP_KERNEL);
+	if (!page_list) {
+		kfree(mr);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	n = 0;
+	for (i = 0; i < num_phys_buf; ++i)
+		for (j = 0;
+		     j < (buffer_list[i].size + (1ULL << shift) - 1) >> shift;
+		     ++j)
+			page_list[n++] = buffer_list[i].addr + ((u64) j << shift);
+
+	mthca_dbg(to_mdev(pd->device), "Registering memory at %llx (iova %llx) "
+		  "in PD %x; shift %d, npages %d.\n",
+		  (unsigned long long) buffer_list[0].addr,
+		  (unsigned long long) *iova_start,
+		  to_mpd(pd)->pd_num,
+		  shift, npages);
+
+	err = mthca_mr_alloc_phys(to_mdev(pd->device),
+				  to_mpd(pd)->pd_num,
+				  page_list, shift, npages,
+				  *iova_start, total_size,
+				  convert_access(acc), mr);
+
+	if (err) {
+		kfree(page_list);
+		kfree(mr);
+		return ERR_PTR(err);
+	}
+
+	kfree(page_list);
+	mr->umem = NULL;
+
+	return &mr->ibmr;
+}
+
+static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+				       u64 virt, int acc, struct ib_udata *udata)
+{
+	struct mthca_dev *dev = to_mdev(pd->device);
+	struct scatterlist *sg;
+	struct mthca_mr *mr;
+	struct mthca_reg_mr ucmd;
+	u64 *pages;
+	int shift, n, len;
+	int i, k, entry;
+	int err = 0;
+	int write_mtt_size;
+
+	if (udata->inlen - sizeof (struct ib_uverbs_cmd_hdr) < sizeof ucmd) {
+		if (!to_mucontext(pd->uobject->context)->reg_mr_warned) {
+			mthca_warn(dev, "Process '%s' did not pass in MR attrs.\n",
+				   current->comm);
+			mthca_warn(dev, "  Update libmthca to fix this.\n");
+		}
+		++to_mucontext(pd->uobject->context)->reg_mr_warned;
+		ucmd.mr_attrs = 0;
+	} else if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd))
+		return ERR_PTR(-EFAULT);
+
+	mr = kmalloc(sizeof *mr, GFP_KERNEL);
+	if (!mr)
+		return ERR_PTR(-ENOMEM);
+
+	mr->umem = ib_umem_get(pd->uobject->context, start, length, acc,
+			       ucmd.mr_attrs & MTHCA_MR_DMASYNC);
+
+	if (IS_ERR(mr->umem)) {
+		err = PTR_ERR(mr->umem);
+		goto err;
+	}
+
+	shift = ffs(mr->umem->page_size) - 1;
+	n = mr->umem->nmap;
+
+	mr->mtt = mthca_alloc_mtt(dev, n);
+	if (IS_ERR(mr->mtt)) {
+		err = PTR_ERR(mr->mtt);
+		goto err_umem;
+	}
+
+	pages = (u64 *) __get_free_page(GFP_KERNEL);
+	if (!pages) {
+		err = -ENOMEM;
+		goto err_mtt;
+	}
+
+	i = n = 0;
+
+	write_mtt_size = min(mthca_write_mtt_size(dev), (int) (PAGE_SIZE / sizeof *pages));
+
+	for_each_sg(mr->umem->sg_head.sgl, sg, mr->umem->nmap, entry) {
+		len = sg_dma_len(sg) >> shift;
+		for (k = 0; k < len; ++k) {
+			pages[i++] = sg_dma_address(sg) +
+				mr->umem->page_size * k;
+			/*
+			 * Be friendly to write_mtt and pass it chunks
+			 * of appropriate size.
+			 */
+			if (i == write_mtt_size) {
+				err = mthca_write_mtt(dev, mr->mtt, n, pages, i);
+				if (err)
+					goto mtt_done;
+				n += i;
+				i = 0;
+			}
+		}
+	}
+
+	if (i)
+		err = mthca_write_mtt(dev, mr->mtt, n, pages, i);
+mtt_done:
+	free_page((unsigned long) pages);
+	if (err)
+		goto err_mtt;
+
+	err = mthca_mr_alloc(dev, to_mpd(pd)->pd_num, shift, virt, length,
+			     convert_access(acc), mr);
+
+	if (err)
+		goto err_mtt;
+
+	return &mr->ibmr;
+
+err_mtt:
+	mthca_free_mtt(dev, mr->mtt);
+
+err_umem:
+	ib_umem_release(mr->umem);
+
+err:
+	kfree(mr);
+	return ERR_PTR(err);
+}
+
+static int mthca_dereg_mr(struct ib_mr *mr)
+{
+	struct mthca_mr *mmr = to_mmr(mr);
+
+	mthca_free_mr(to_mdev(mr->device), mmr);
+	if (mmr->umem)
+		ib_umem_release(mmr->umem);
+	kfree(mmr);
+
+	return 0;
+}
+
+static struct ib_fmr *mthca_alloc_fmr(struct ib_pd *pd, int mr_access_flags,
+				      struct ib_fmr_attr *fmr_attr)
+{
+	struct mthca_fmr *fmr;
+	int err;
+
+	fmr = kmalloc(sizeof *fmr, GFP_KERNEL);
+	if (!fmr)
+		return ERR_PTR(-ENOMEM);
+
+	memcpy(&fmr->attr, fmr_attr, sizeof *fmr_attr);
+	err = mthca_fmr_alloc(to_mdev(pd->device), to_mpd(pd)->pd_num,
+			     convert_access(mr_access_flags), fmr);
+
+	if (err) {
+		kfree(fmr);
+		return ERR_PTR(err);
+	}
+
+	return &fmr->ibmr;
+}
+
+static int mthca_dealloc_fmr(struct ib_fmr *fmr)
+{
+	struct mthca_fmr *mfmr = to_mfmr(fmr);
+	int err;
+
+	err = mthca_free_fmr(to_mdev(fmr->device), mfmr);
+	if (err)
+		return err;
+
+	kfree(mfmr);
+	return 0;
+}
+
+static int mthca_unmap_fmr(struct list_head *fmr_list)
+{
+	struct ib_fmr *fmr;
+	int err;
+	struct mthca_dev *mdev = NULL;
+
+	list_for_each_entry(fmr, fmr_list, list) {
+		if (mdev && to_mdev(fmr->device) != mdev)
+			return -EINVAL;
+		mdev = to_mdev(fmr->device);
+	}
+
+	if (!mdev)
+		return 0;
+
+	if (mthca_is_memfree(mdev)) {
+		list_for_each_entry(fmr, fmr_list, list)
+			mthca_arbel_fmr_unmap(mdev, to_mfmr(fmr));
+
+		wmb();
+	} else
+		list_for_each_entry(fmr, fmr_list, list)
+			mthca_tavor_fmr_unmap(mdev, to_mfmr(fmr));
+
+	err = mthca_SYNC_TPT(mdev);
+	return err;
+}
+
+static ssize_t show_rev(struct device *device, struct device_attribute *attr,
+			char *buf)
+{
+	struct mthca_dev *dev =
+		container_of(device, struct mthca_dev, ib_dev.dev);
+	return sprintf(buf, "%x\n", dev->rev_id);
+}
+
+static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr,
+			   char *buf)
+{
+	struct mthca_dev *dev =
+		container_of(device, struct mthca_dev, ib_dev.dev);
+	return sprintf(buf, "%d.%d.%d\n", (int) (dev->fw_ver >> 32),
+		       (int) (dev->fw_ver >> 16) & 0xffff,
+		       (int) dev->fw_ver & 0xffff);
+}
+
+static ssize_t show_hca(struct device *device, struct device_attribute *attr,
+			char *buf)
+{
+	struct mthca_dev *dev =
+		container_of(device, struct mthca_dev, ib_dev.dev);
+	switch (dev->pdev->device) {
+	case PCI_DEVICE_ID_MELLANOX_TAVOR:
+		return sprintf(buf, "MT23108\n");
+	case PCI_DEVICE_ID_MELLANOX_ARBEL_COMPAT:
+		return sprintf(buf, "MT25208 (MT23108 compat mode)\n");
+	case PCI_DEVICE_ID_MELLANOX_ARBEL:
+		return sprintf(buf, "MT25208\n");
+	case PCI_DEVICE_ID_MELLANOX_SINAI:
+	case PCI_DEVICE_ID_MELLANOX_SINAI_OLD:
+		return sprintf(buf, "MT25204\n");
+	default:
+		return sprintf(buf, "unknown\n");
+	}
+}
+
+static ssize_t show_board(struct device *device, struct device_attribute *attr,
+			  char *buf)
+{
+	struct mthca_dev *dev =
+		container_of(device, struct mthca_dev, ib_dev.dev);
+	return sprintf(buf, "%.*s\n", MTHCA_BOARD_ID_LEN, dev->board_id);
+}
+
+static DEVICE_ATTR(hw_rev,   S_IRUGO, show_rev,    NULL);
+static DEVICE_ATTR(fw_ver,   S_IRUGO, show_fw_ver, NULL);
+static DEVICE_ATTR(hca_type, S_IRUGO, show_hca,    NULL);
+static DEVICE_ATTR(board_id, S_IRUGO, show_board,  NULL);
+
+static struct device_attribute *mthca_dev_attributes[] = {
+	&dev_attr_hw_rev,
+	&dev_attr_fw_ver,
+	&dev_attr_hca_type,
+	&dev_attr_board_id
+};
+
+static int mthca_init_node_data(struct mthca_dev *dev)
+{
+	struct ib_smp *in_mad  = NULL;
+	struct ib_smp *out_mad = NULL;
+	int err = -ENOMEM;
+
+	in_mad  = kzalloc(sizeof *in_mad, GFP_KERNEL);
+	out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+	if (!in_mad || !out_mad)
+		goto out;
+
+	init_query_mad(in_mad);
+	in_mad->attr_id = IB_SMP_ATTR_NODE_DESC;
+
+	err = mthca_MAD_IFC(dev, 1, 1,
+			    1, NULL, NULL, in_mad, out_mad);
+	if (err)
+		goto out;
+
+	memcpy(dev->ib_dev.node_desc, out_mad->data, 64);
+
+	in_mad->attr_id = IB_SMP_ATTR_NODE_INFO;
+
+	err = mthca_MAD_IFC(dev, 1, 1,
+			    1, NULL, NULL, in_mad, out_mad);
+	if (err)
+		goto out;
+
+	if (mthca_is_memfree(dev))
+		dev->rev_id = be32_to_cpup((__be32 *) (out_mad->data + 32));
+	memcpy(&dev->ib_dev.node_guid, out_mad->data + 12, 8);
+
+out:
+	kfree(in_mad);
+	kfree(out_mad);
+	return err;
+}
+
+int mthca_register_device(struct mthca_dev *dev)
+{
+	int ret;
+	int i;
+
+	ret = mthca_init_node_data(dev);
+	if (ret)
+		return ret;
+
+	strlcpy(dev->ib_dev.name, "mthca%d", IB_DEVICE_NAME_MAX);
+	dev->ib_dev.owner                = THIS_MODULE;
+
+	dev->ib_dev.uverbs_abi_ver	 = MTHCA_UVERBS_ABI_VERSION;
+	dev->ib_dev.uverbs_cmd_mask	 =
+		(1ull << IB_USER_VERBS_CMD_GET_CONTEXT)		|
+		(1ull << IB_USER_VERBS_CMD_QUERY_DEVICE)	|
+		(1ull << IB_USER_VERBS_CMD_QUERY_PORT)		|
+		(1ull << IB_USER_VERBS_CMD_ALLOC_PD)		|
+		(1ull << IB_USER_VERBS_CMD_DEALLOC_PD)		|
+		(1ull << IB_USER_VERBS_CMD_REG_MR)		|
+		(1ull << IB_USER_VERBS_CMD_DEREG_MR)		|
+		(1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL)	|
+		(1ull << IB_USER_VERBS_CMD_CREATE_CQ)		|
+		(1ull << IB_USER_VERBS_CMD_RESIZE_CQ)		|
+		(1ull << IB_USER_VERBS_CMD_DESTROY_CQ)		|
+		(1ull << IB_USER_VERBS_CMD_CREATE_QP)		|
+		(1ull << IB_USER_VERBS_CMD_QUERY_QP)		|
+		(1ull << IB_USER_VERBS_CMD_MODIFY_QP)		|
+		(1ull << IB_USER_VERBS_CMD_DESTROY_QP)		|
+		(1ull << IB_USER_VERBS_CMD_ATTACH_MCAST)	|
+		(1ull << IB_USER_VERBS_CMD_DETACH_MCAST);
+	dev->ib_dev.node_type            = RDMA_NODE_IB_CA;
+	dev->ib_dev.phys_port_cnt        = dev->limits.num_ports;
+	dev->ib_dev.num_comp_vectors     = 1;
+	dev->ib_dev.dma_device           = &dev->pdev->dev;
+	dev->ib_dev.query_device         = mthca_query_device;
+	dev->ib_dev.query_port           = mthca_query_port;
+	dev->ib_dev.modify_device        = mthca_modify_device;
+	dev->ib_dev.modify_port          = mthca_modify_port;
+	dev->ib_dev.query_pkey           = mthca_query_pkey;
+	dev->ib_dev.query_gid            = mthca_query_gid;
+	dev->ib_dev.alloc_ucontext       = mthca_alloc_ucontext;
+	dev->ib_dev.dealloc_ucontext     = mthca_dealloc_ucontext;
+	dev->ib_dev.mmap                 = mthca_mmap_uar;
+	dev->ib_dev.alloc_pd             = mthca_alloc_pd;
+	dev->ib_dev.dealloc_pd           = mthca_dealloc_pd;
+	dev->ib_dev.create_ah            = mthca_ah_create;
+	dev->ib_dev.query_ah             = mthca_ah_query;
+	dev->ib_dev.destroy_ah           = mthca_ah_destroy;
+
+	if (dev->mthca_flags & MTHCA_FLAG_SRQ) {
+		dev->ib_dev.create_srq           = mthca_create_srq;
+		dev->ib_dev.modify_srq           = mthca_modify_srq;
+		dev->ib_dev.query_srq            = mthca_query_srq;
+		dev->ib_dev.destroy_srq          = mthca_destroy_srq;
+		dev->ib_dev.uverbs_cmd_mask	|=
+			(1ull << IB_USER_VERBS_CMD_CREATE_SRQ)		|
+			(1ull << IB_USER_VERBS_CMD_MODIFY_SRQ)		|
+			(1ull << IB_USER_VERBS_CMD_QUERY_SRQ)		|
+			(1ull << IB_USER_VERBS_CMD_DESTROY_SRQ);
+
+		if (mthca_is_memfree(dev))
+			dev->ib_dev.post_srq_recv = mthca_arbel_post_srq_recv;
+		else
+			dev->ib_dev.post_srq_recv = mthca_tavor_post_srq_recv;
+	}
+
+	dev->ib_dev.create_qp            = mthca_create_qp;
+	dev->ib_dev.modify_qp            = mthca_modify_qp;
+	dev->ib_dev.query_qp             = mthca_query_qp;
+	dev->ib_dev.destroy_qp           = mthca_destroy_qp;
+	dev->ib_dev.create_cq            = mthca_create_cq;
+	dev->ib_dev.resize_cq            = mthca_resize_cq;
+	dev->ib_dev.destroy_cq           = mthca_destroy_cq;
+	dev->ib_dev.poll_cq              = mthca_poll_cq;
+	dev->ib_dev.get_dma_mr           = mthca_get_dma_mr;
+	dev->ib_dev.reg_phys_mr          = mthca_reg_phys_mr;
+	dev->ib_dev.reg_user_mr          = mthca_reg_user_mr;
+	dev->ib_dev.dereg_mr             = mthca_dereg_mr;
+
+	if (dev->mthca_flags & MTHCA_FLAG_FMR) {
+		dev->ib_dev.alloc_fmr            = mthca_alloc_fmr;
+		dev->ib_dev.unmap_fmr            = mthca_unmap_fmr;
+		dev->ib_dev.dealloc_fmr          = mthca_dealloc_fmr;
+		if (mthca_is_memfree(dev))
+			dev->ib_dev.map_phys_fmr = mthca_arbel_map_phys_fmr;
+		else
+			dev->ib_dev.map_phys_fmr = mthca_tavor_map_phys_fmr;
+	}
+
+	dev->ib_dev.attach_mcast         = mthca_multicast_attach;
+	dev->ib_dev.detach_mcast         = mthca_multicast_detach;
+	dev->ib_dev.process_mad          = mthca_process_mad;
+
+	if (mthca_is_memfree(dev)) {
+		dev->ib_dev.req_notify_cq = mthca_arbel_arm_cq;
+		dev->ib_dev.post_send     = mthca_arbel_post_send;
+		dev->ib_dev.post_recv     = mthca_arbel_post_receive;
+	} else {
+		dev->ib_dev.req_notify_cq = mthca_tavor_arm_cq;
+		dev->ib_dev.post_send     = mthca_tavor_post_send;
+		dev->ib_dev.post_recv     = mthca_tavor_post_receive;
+	}
+
+	mutex_init(&dev->cap_mask_mutex);
+
+	ret = ib_register_device(&dev->ib_dev, NULL);
+	if (ret)
+		return ret;
+
+	for (i = 0; i < ARRAY_SIZE(mthca_dev_attributes); ++i) {
+		ret = device_create_file(&dev->ib_dev.dev,
+					 mthca_dev_attributes[i]);
+		if (ret) {
+			ib_unregister_device(&dev->ib_dev);
+			return ret;
+		}
+	}
+
+	mthca_start_catas_poll(dev);
+
+	return 0;
+}
+
+void mthca_unregister_device(struct mthca_dev *dev)
+{
+	mthca_stop_catas_poll(dev);
+	ib_unregister_device(&dev->ib_dev);
+}
diff --git a/drivers/infiniband/hw/mthca/mthca_provider.h b/drivers/infiniband/hw/mthca/mthca_provider.h
new file mode 100644
index 0000000..596acc4
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_provider.h
@@ -0,0 +1,344 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005, 2006 Cisco Systems.  All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MTHCA_PROVIDER_H
+#define MTHCA_PROVIDER_H
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_pack.h>
+
+#define MTHCA_MPT_FLAG_ATOMIC        (1 << 14)
+#define MTHCA_MPT_FLAG_REMOTE_WRITE  (1 << 13)
+#define MTHCA_MPT_FLAG_REMOTE_READ   (1 << 12)
+#define MTHCA_MPT_FLAG_LOCAL_WRITE   (1 << 11)
+#define MTHCA_MPT_FLAG_LOCAL_READ    (1 << 10)
+
+struct mthca_buf_list {
+	void *buf;
+	DEFINE_DMA_UNMAP_ADDR(mapping);
+};
+
+union mthca_buf {
+	struct mthca_buf_list direct;
+	struct mthca_buf_list *page_list;
+};
+
+struct mthca_uar {
+	unsigned long pfn;
+	int           index;
+};
+
+struct mthca_user_db_table;
+
+struct mthca_ucontext {
+	struct ib_ucontext          ibucontext;
+	struct mthca_uar            uar;
+	struct mthca_user_db_table *db_tab;
+	int			    reg_mr_warned;
+};
+
+struct mthca_mtt;
+
+struct mthca_mr {
+	struct ib_mr      ibmr;
+	struct ib_umem   *umem;
+	struct mthca_mtt *mtt;
+};
+
+struct mthca_fmr {
+	struct ib_fmr      ibmr;
+	struct ib_fmr_attr attr;
+	struct mthca_mtt  *mtt;
+	int                maps;
+	union {
+		struct {
+			struct mthca_mpt_entry __iomem *mpt;
+			u64 __iomem *mtts;
+		} tavor;
+		struct {
+			struct mthca_mpt_entry *mpt;
+			__be64 *mtts;
+			dma_addr_t dma_handle;
+		} arbel;
+	} mem;
+};
+
+struct mthca_pd {
+	struct ib_pd    ibpd;
+	u32             pd_num;
+	atomic_t        sqp_count;
+	struct mthca_mr ntmr;
+	int             privileged;
+};
+
+struct mthca_eq {
+	struct mthca_dev      *dev;
+	int                    eqn;
+	u32                    eqn_mask;
+	u32                    cons_index;
+	u16                    msi_x_vector;
+	u16                    msi_x_entry;
+	int                    have_irq;
+	int                    nent;
+	struct mthca_buf_list *page_list;
+	struct mthca_mr        mr;
+	char		       irq_name[IB_DEVICE_NAME_MAX];
+};
+
+struct mthca_av;
+
+enum mthca_ah_type {
+	MTHCA_AH_ON_HCA,
+	MTHCA_AH_PCI_POOL,
+	MTHCA_AH_KMALLOC
+};
+
+struct mthca_ah {
+	struct ib_ah       ibah;
+	enum mthca_ah_type type;
+	u32                key;
+	struct mthca_av   *av;
+	dma_addr_t         avdma;
+};
+
+/*
+ * Quick description of our CQ/QP locking scheme:
+ *
+ * We have one global lock that protects dev->cq/qp_table.  Each
+ * struct mthca_cq/qp also has its own lock.  An individual qp lock
+ * may be taken inside of an individual cq lock.  Both cqs attached to
+ * a qp may be locked, with the cq with the lower cqn locked first.
+ * No other nesting should be done.
+ *
+ * Each struct mthca_cq/qp also has an ref count, protected by the
+ * corresponding table lock.  The pointer from the cq/qp_table to the
+ * struct counts as one reference.  This reference also is good for
+ * access through the consumer API, so modifying the CQ/QP etc doesn't
+ * need to take another reference.  Access to a QP because of a
+ * completion being polled does not need a reference either.
+ *
+ * Finally, each struct mthca_cq/qp has a wait_queue_head_t for the
+ * destroy function to sleep on.
+ *
+ * This means that access from the consumer API requires nothing but
+ * taking the struct's lock.
+ *
+ * Access because of a completion event should go as follows:
+ * - lock cq/qp_table and look up struct
+ * - increment ref count in struct
+ * - drop cq/qp_table lock
+ * - lock struct, do your thing, and unlock struct
+ * - decrement ref count; if zero, wake up waiters
+ *
+ * To destroy a CQ/QP, we can do the following:
+ * - lock cq/qp_table
+ * - remove pointer and decrement ref count
+ * - unlock cq/qp_table lock
+ * - wait_event until ref count is zero
+ *
+ * It is the consumer's responsibilty to make sure that no QP
+ * operations (WQE posting or state modification) are pending when a
+ * QP is destroyed.  Also, the consumer must make sure that calls to
+ * qp_modify are serialized.  Similarly, the consumer is responsible
+ * for ensuring that no CQ resize operations are pending when a CQ
+ * is destroyed.
+ *
+ * Possible optimizations (wait for profile data to see if/where we
+ * have locks bouncing between CPUs):
+ * - split cq/qp table lock into n separate (cache-aligned) locks,
+ *   indexed (say) by the page in the table
+ * - split QP struct lock into three (one for common info, one for the
+ *   send queue and one for the receive queue)
+ */
+
+struct mthca_cq_buf {
+	union mthca_buf		queue;
+	struct mthca_mr		mr;
+	int			is_direct;
+};
+
+struct mthca_cq_resize {
+	struct mthca_cq_buf	buf;
+	int			cqe;
+	enum {
+		CQ_RESIZE_ALLOC,
+		CQ_RESIZE_READY,
+		CQ_RESIZE_SWAPPED
+	}			state;
+};
+
+struct mthca_cq {
+	struct ib_cq		ibcq;
+	spinlock_t		lock;
+	int			refcount;
+	int			cqn;
+	u32			cons_index;
+	struct mthca_cq_buf	buf;
+	struct mthca_cq_resize *resize_buf;
+	int			is_kernel;
+
+	/* Next fields are Arbel only */
+	int			set_ci_db_index;
+	__be32		       *set_ci_db;
+	int			arm_db_index;
+	__be32		       *arm_db;
+	int			arm_sn;
+
+	wait_queue_head_t	wait;
+	struct mutex		mutex;
+};
+
+struct mthca_srq {
+	struct ib_srq		ibsrq;
+	spinlock_t		lock;
+	int			refcount;
+	int			srqn;
+	int			max;
+	int			max_gs;
+	int			wqe_shift;
+	int			first_free;
+	int			last_free;
+	u16			counter;  /* Arbel only */
+	int			db_index; /* Arbel only */
+	__be32		       *db;       /* Arbel only */
+	void		       *last;
+
+	int			is_direct;
+	u64		       *wrid;
+	union mthca_buf		queue;
+	struct mthca_mr		mr;
+
+	wait_queue_head_t	wait;
+	struct mutex		mutex;
+};
+
+struct mthca_wq {
+	spinlock_t lock;
+	int        max;
+	unsigned   next_ind;
+	unsigned   last_comp;
+	unsigned   head;
+	unsigned   tail;
+	void      *last;
+	int        max_gs;
+	int        wqe_shift;
+
+	int        db_index;	/* Arbel only */
+	__be32    *db;
+};
+
+struct mthca_qp {
+	struct ib_qp           ibqp;
+	int                    refcount;
+	u32                    qpn;
+	int                    is_direct;
+	u8                     port; /* for SQP and memfree use only */
+	u8                     alt_port; /* for memfree use only */
+	u8                     transport;
+	u8                     state;
+	u8                     atomic_rd_en;
+	u8                     resp_depth;
+
+	struct mthca_mr        mr;
+
+	struct mthca_wq        rq;
+	struct mthca_wq        sq;
+	enum ib_sig_type       sq_policy;
+	int                    send_wqe_offset;
+	int                    max_inline_data;
+
+	u64                   *wrid;
+	union mthca_buf	       queue;
+
+	wait_queue_head_t      wait;
+	struct mutex	       mutex;
+};
+
+struct mthca_sqp {
+	struct mthca_qp qp;
+	int             pkey_index;
+	u32             qkey;
+	u32             send_psn;
+	struct ib_ud_header ud_header;
+	int             header_buf_size;
+	void           *header_buf;
+	dma_addr_t      header_dma;
+};
+
+static inline struct mthca_ucontext *to_mucontext(struct ib_ucontext *ibucontext)
+{
+	return container_of(ibucontext, struct mthca_ucontext, ibucontext);
+}
+
+static inline struct mthca_fmr *to_mfmr(struct ib_fmr *ibmr)
+{
+	return container_of(ibmr, struct mthca_fmr, ibmr);
+}
+
+static inline struct mthca_mr *to_mmr(struct ib_mr *ibmr)
+{
+	return container_of(ibmr, struct mthca_mr, ibmr);
+}
+
+static inline struct mthca_pd *to_mpd(struct ib_pd *ibpd)
+{
+	return container_of(ibpd, struct mthca_pd, ibpd);
+}
+
+static inline struct mthca_ah *to_mah(struct ib_ah *ibah)
+{
+	return container_of(ibah, struct mthca_ah, ibah);
+}
+
+static inline struct mthca_cq *to_mcq(struct ib_cq *ibcq)
+{
+	return container_of(ibcq, struct mthca_cq, ibcq);
+}
+
+static inline struct mthca_srq *to_msrq(struct ib_srq *ibsrq)
+{
+	return container_of(ibsrq, struct mthca_srq, ibsrq);
+}
+
+static inline struct mthca_qp *to_mqp(struct ib_qp *ibqp)
+{
+	return container_of(ibqp, struct mthca_qp, ibqp);
+}
+
+static inline struct mthca_sqp *to_msqp(struct mthca_qp *qp)
+{
+	return container_of(qp, struct mthca_sqp, qp);
+}
+
+#endif /* MTHCA_PROVIDER_H */
diff --git a/drivers/infiniband/hw/mthca/mthca_qp.c b/drivers/infiniband/hw/mthca/mthca_qp.c
new file mode 100644
index 0000000..e354b2f
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_qp.c
@@ -0,0 +1,2311 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Cisco Systems. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+
+#include <asm/io.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_cache.h>
+#include <rdma/ib_pack.h>
+
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+#include "mthca_memfree.h"
+#include "mthca_wqe.h"
+
+enum {
+	MTHCA_MAX_DIRECT_QP_SIZE = 4 * PAGE_SIZE,
+	MTHCA_ACK_REQ_FREQ       = 10,
+	MTHCA_FLIGHT_LIMIT       = 9,
+	MTHCA_UD_HEADER_SIZE     = 72, /* largest UD header possible */
+	MTHCA_INLINE_HEADER_SIZE = 4,  /* data segment overhead for inline */
+	MTHCA_INLINE_CHUNK_SIZE  = 16  /* inline data segment chunk */
+};
+
+enum {
+	MTHCA_QP_STATE_RST  = 0,
+	MTHCA_QP_STATE_INIT = 1,
+	MTHCA_QP_STATE_RTR  = 2,
+	MTHCA_QP_STATE_RTS  = 3,
+	MTHCA_QP_STATE_SQE  = 4,
+	MTHCA_QP_STATE_SQD  = 5,
+	MTHCA_QP_STATE_ERR  = 6,
+	MTHCA_QP_STATE_DRAINING = 7
+};
+
+enum {
+	MTHCA_QP_ST_RC 	= 0x0,
+	MTHCA_QP_ST_UC 	= 0x1,
+	MTHCA_QP_ST_RD 	= 0x2,
+	MTHCA_QP_ST_UD 	= 0x3,
+	MTHCA_QP_ST_MLX = 0x7
+};
+
+enum {
+	MTHCA_QP_PM_MIGRATED = 0x3,
+	MTHCA_QP_PM_ARMED    = 0x0,
+	MTHCA_QP_PM_REARM    = 0x1
+};
+
+enum {
+	/* qp_context flags */
+	MTHCA_QP_BIT_DE  = 1 <<  8,
+	/* params1 */
+	MTHCA_QP_BIT_SRE = 1 << 15,
+	MTHCA_QP_BIT_SWE = 1 << 14,
+	MTHCA_QP_BIT_SAE = 1 << 13,
+	MTHCA_QP_BIT_SIC = 1 <<  4,
+	MTHCA_QP_BIT_SSC = 1 <<  3,
+	/* params2 */
+	MTHCA_QP_BIT_RRE = 1 << 15,
+	MTHCA_QP_BIT_RWE = 1 << 14,
+	MTHCA_QP_BIT_RAE = 1 << 13,
+	MTHCA_QP_BIT_RIC = 1 <<  4,
+	MTHCA_QP_BIT_RSC = 1 <<  3
+};
+
+enum {
+	MTHCA_SEND_DOORBELL_FENCE = 1 << 5
+};
+
+struct mthca_qp_path {
+	__be32 port_pkey;
+	u8     rnr_retry;
+	u8     g_mylmc;
+	__be16 rlid;
+	u8     ackto;
+	u8     mgid_index;
+	u8     static_rate;
+	u8     hop_limit;
+	__be32 sl_tclass_flowlabel;
+	u8     rgid[16];
+} __attribute__((packed));
+
+struct mthca_qp_context {
+	__be32 flags;
+	__be32 tavor_sched_queue; /* Reserved on Arbel */
+	u8     mtu_msgmax;
+	u8     rq_size_stride;	/* Reserved on Tavor */
+	u8     sq_size_stride;	/* Reserved on Tavor */
+	u8     rlkey_arbel_sched_queue;	/* Reserved on Tavor */
+	__be32 usr_page;
+	__be32 local_qpn;
+	__be32 remote_qpn;
+	u32    reserved1[2];
+	struct mthca_qp_path pri_path;
+	struct mthca_qp_path alt_path;
+	__be32 rdd;
+	__be32 pd;
+	__be32 wqe_base;
+	__be32 wqe_lkey;
+	__be32 params1;
+	__be32 reserved2;
+	__be32 next_send_psn;
+	__be32 cqn_snd;
+	__be32 snd_wqe_base_l;	/* Next send WQE on Tavor */
+	__be32 snd_db_index;	/* (debugging only entries) */
+	__be32 last_acked_psn;
+	__be32 ssn;
+	__be32 params2;
+	__be32 rnr_nextrecvpsn;
+	__be32 ra_buff_indx;
+	__be32 cqn_rcv;
+	__be32 rcv_wqe_base_l;	/* Next recv WQE on Tavor */
+	__be32 rcv_db_index;	/* (debugging only entries) */
+	__be32 qkey;
+	__be32 srqn;
+	__be32 rmsn;
+	__be16 rq_wqe_counter;	/* reserved on Tavor */
+	__be16 sq_wqe_counter;	/* reserved on Tavor */
+	u32    reserved3[18];
+} __attribute__((packed));
+
+struct mthca_qp_param {
+	__be32 opt_param_mask;
+	u32    reserved1;
+	struct mthca_qp_context context;
+	u32    reserved2[62];
+} __attribute__((packed));
+
+enum {
+	MTHCA_QP_OPTPAR_ALT_ADDR_PATH     = 1 << 0,
+	MTHCA_QP_OPTPAR_RRE               = 1 << 1,
+	MTHCA_QP_OPTPAR_RAE               = 1 << 2,
+	MTHCA_QP_OPTPAR_RWE               = 1 << 3,
+	MTHCA_QP_OPTPAR_PKEY_INDEX        = 1 << 4,
+	MTHCA_QP_OPTPAR_Q_KEY             = 1 << 5,
+	MTHCA_QP_OPTPAR_RNR_TIMEOUT       = 1 << 6,
+	MTHCA_QP_OPTPAR_PRIMARY_ADDR_PATH = 1 << 7,
+	MTHCA_QP_OPTPAR_SRA_MAX           = 1 << 8,
+	MTHCA_QP_OPTPAR_RRA_MAX           = 1 << 9,
+	MTHCA_QP_OPTPAR_PM_STATE          = 1 << 10,
+	MTHCA_QP_OPTPAR_PORT_NUM          = 1 << 11,
+	MTHCA_QP_OPTPAR_RETRY_COUNT       = 1 << 12,
+	MTHCA_QP_OPTPAR_ALT_RNR_RETRY     = 1 << 13,
+	MTHCA_QP_OPTPAR_ACK_TIMEOUT       = 1 << 14,
+	MTHCA_QP_OPTPAR_RNR_RETRY         = 1 << 15,
+	MTHCA_QP_OPTPAR_SCHED_QUEUE       = 1 << 16
+};
+
+static const u8 mthca_opcode[] = {
+	[IB_WR_SEND]                 = MTHCA_OPCODE_SEND,
+	[IB_WR_SEND_WITH_IMM]        = MTHCA_OPCODE_SEND_IMM,
+	[IB_WR_RDMA_WRITE]           = MTHCA_OPCODE_RDMA_WRITE,
+	[IB_WR_RDMA_WRITE_WITH_IMM]  = MTHCA_OPCODE_RDMA_WRITE_IMM,
+	[IB_WR_RDMA_READ]            = MTHCA_OPCODE_RDMA_READ,
+	[IB_WR_ATOMIC_CMP_AND_SWP]   = MTHCA_OPCODE_ATOMIC_CS,
+	[IB_WR_ATOMIC_FETCH_AND_ADD] = MTHCA_OPCODE_ATOMIC_FA,
+};
+
+static int is_sqp(struct mthca_dev *dev, struct mthca_qp *qp)
+{
+	return qp->qpn >= dev->qp_table.sqp_start &&
+		qp->qpn <= dev->qp_table.sqp_start + 3;
+}
+
+static int is_qp0(struct mthca_dev *dev, struct mthca_qp *qp)
+{
+	return qp->qpn >= dev->qp_table.sqp_start &&
+		qp->qpn <= dev->qp_table.sqp_start + 1;
+}
+
+static void *get_recv_wqe(struct mthca_qp *qp, int n)
+{
+	if (qp->is_direct)
+		return qp->queue.direct.buf + (n << qp->rq.wqe_shift);
+	else
+		return qp->queue.page_list[(n << qp->rq.wqe_shift) >> PAGE_SHIFT].buf +
+			((n << qp->rq.wqe_shift) & (PAGE_SIZE - 1));
+}
+
+static void *get_send_wqe(struct mthca_qp *qp, int n)
+{
+	if (qp->is_direct)
+		return qp->queue.direct.buf + qp->send_wqe_offset +
+			(n << qp->sq.wqe_shift);
+	else
+		return qp->queue.page_list[(qp->send_wqe_offset +
+					    (n << qp->sq.wqe_shift)) >>
+					   PAGE_SHIFT].buf +
+			((qp->send_wqe_offset + (n << qp->sq.wqe_shift)) &
+			 (PAGE_SIZE - 1));
+}
+
+static void mthca_wq_reset(struct mthca_wq *wq)
+{
+	wq->next_ind  = 0;
+	wq->last_comp = wq->max - 1;
+	wq->head      = 0;
+	wq->tail      = 0;
+}
+
+void mthca_qp_event(struct mthca_dev *dev, u32 qpn,
+		    enum ib_event_type event_type)
+{
+	struct mthca_qp *qp;
+	struct ib_event event;
+
+	spin_lock(&dev->qp_table.lock);
+	qp = mthca_array_get(&dev->qp_table.qp, qpn & (dev->limits.num_qps - 1));
+	if (qp)
+		++qp->refcount;
+	spin_unlock(&dev->qp_table.lock);
+
+	if (!qp) {
+		mthca_warn(dev, "Async event %d for bogus QP %08x\n",
+			   event_type, qpn);
+		return;
+	}
+
+	if (event_type == IB_EVENT_PATH_MIG)
+		qp->port = qp->alt_port;
+
+	event.device      = &dev->ib_dev;
+	event.event       = event_type;
+	event.element.qp  = &qp->ibqp;
+	if (qp->ibqp.event_handler)
+		qp->ibqp.event_handler(&event, qp->ibqp.qp_context);
+
+	spin_lock(&dev->qp_table.lock);
+	if (!--qp->refcount)
+		wake_up(&qp->wait);
+	spin_unlock(&dev->qp_table.lock);
+}
+
+static int to_mthca_state(enum ib_qp_state ib_state)
+{
+	switch (ib_state) {
+	case IB_QPS_RESET: return MTHCA_QP_STATE_RST;
+	case IB_QPS_INIT:  return MTHCA_QP_STATE_INIT;
+	case IB_QPS_RTR:   return MTHCA_QP_STATE_RTR;
+	case IB_QPS_RTS:   return MTHCA_QP_STATE_RTS;
+	case IB_QPS_SQD:   return MTHCA_QP_STATE_SQD;
+	case IB_QPS_SQE:   return MTHCA_QP_STATE_SQE;
+	case IB_QPS_ERR:   return MTHCA_QP_STATE_ERR;
+	default:                return -1;
+	}
+}
+
+enum { RC, UC, UD, RD, RDEE, MLX, NUM_TRANS };
+
+static int to_mthca_st(int transport)
+{
+	switch (transport) {
+	case RC:  return MTHCA_QP_ST_RC;
+	case UC:  return MTHCA_QP_ST_UC;
+	case UD:  return MTHCA_QP_ST_UD;
+	case RD:  return MTHCA_QP_ST_RD;
+	case MLX: return MTHCA_QP_ST_MLX;
+	default:  return -1;
+	}
+}
+
+static void store_attrs(struct mthca_sqp *sqp, const struct ib_qp_attr *attr,
+			int attr_mask)
+{
+	if (attr_mask & IB_QP_PKEY_INDEX)
+		sqp->pkey_index = attr->pkey_index;
+	if (attr_mask & IB_QP_QKEY)
+		sqp->qkey = attr->qkey;
+	if (attr_mask & IB_QP_SQ_PSN)
+		sqp->send_psn = attr->sq_psn;
+}
+
+static void init_port(struct mthca_dev *dev, int port)
+{
+	int err;
+	struct mthca_init_ib_param param;
+
+	memset(&param, 0, sizeof param);
+
+	param.port_width = dev->limits.port_width_cap;
+	param.vl_cap     = dev->limits.vl_cap;
+	param.mtu_cap    = dev->limits.mtu_cap;
+	param.gid_cap    = dev->limits.gid_table_len;
+	param.pkey_cap   = dev->limits.pkey_table_len;
+
+	err = mthca_INIT_IB(dev, &param, port);
+	if (err)
+		mthca_warn(dev, "INIT_IB failed, return code %d.\n", err);
+}
+
+static __be32 get_hw_access_flags(struct mthca_qp *qp, const struct ib_qp_attr *attr,
+				  int attr_mask)
+{
+	u8 dest_rd_atomic;
+	u32 access_flags;
+	u32 hw_access_flags = 0;
+
+	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
+		dest_rd_atomic = attr->max_dest_rd_atomic;
+	else
+		dest_rd_atomic = qp->resp_depth;
+
+	if (attr_mask & IB_QP_ACCESS_FLAGS)
+		access_flags = attr->qp_access_flags;
+	else
+		access_flags = qp->atomic_rd_en;
+
+	if (!dest_rd_atomic)
+		access_flags &= IB_ACCESS_REMOTE_WRITE;
+
+	if (access_flags & IB_ACCESS_REMOTE_READ)
+		hw_access_flags |= MTHCA_QP_BIT_RRE;
+	if (access_flags & IB_ACCESS_REMOTE_ATOMIC)
+		hw_access_flags |= MTHCA_QP_BIT_RAE;
+	if (access_flags & IB_ACCESS_REMOTE_WRITE)
+		hw_access_flags |= MTHCA_QP_BIT_RWE;
+
+	return cpu_to_be32(hw_access_flags);
+}
+
+static inline enum ib_qp_state to_ib_qp_state(int mthca_state)
+{
+	switch (mthca_state) {
+	case MTHCA_QP_STATE_RST:      return IB_QPS_RESET;
+	case MTHCA_QP_STATE_INIT:     return IB_QPS_INIT;
+	case MTHCA_QP_STATE_RTR:      return IB_QPS_RTR;
+	case MTHCA_QP_STATE_RTS:      return IB_QPS_RTS;
+	case MTHCA_QP_STATE_DRAINING:
+	case MTHCA_QP_STATE_SQD:      return IB_QPS_SQD;
+	case MTHCA_QP_STATE_SQE:      return IB_QPS_SQE;
+	case MTHCA_QP_STATE_ERR:      return IB_QPS_ERR;
+	default:                      return -1;
+	}
+}
+
+static inline enum ib_mig_state to_ib_mig_state(int mthca_mig_state)
+{
+	switch (mthca_mig_state) {
+	case 0:  return IB_MIG_ARMED;
+	case 1:  return IB_MIG_REARM;
+	case 3:  return IB_MIG_MIGRATED;
+	default: return -1;
+	}
+}
+
+static int to_ib_qp_access_flags(int mthca_flags)
+{
+	int ib_flags = 0;
+
+	if (mthca_flags & MTHCA_QP_BIT_RRE)
+		ib_flags |= IB_ACCESS_REMOTE_READ;
+	if (mthca_flags & MTHCA_QP_BIT_RWE)
+		ib_flags |= IB_ACCESS_REMOTE_WRITE;
+	if (mthca_flags & MTHCA_QP_BIT_RAE)
+		ib_flags |= IB_ACCESS_REMOTE_ATOMIC;
+
+	return ib_flags;
+}
+
+static void to_ib_ah_attr(struct mthca_dev *dev, struct ib_ah_attr *ib_ah_attr,
+				struct mthca_qp_path *path)
+{
+	memset(ib_ah_attr, 0, sizeof *ib_ah_attr);
+	ib_ah_attr->port_num 	  = (be32_to_cpu(path->port_pkey) >> 24) & 0x3;
+
+	if (ib_ah_attr->port_num == 0 || ib_ah_attr->port_num > dev->limits.num_ports)
+		return;
+
+	ib_ah_attr->dlid     	  = be16_to_cpu(path->rlid);
+	ib_ah_attr->sl       	  = be32_to_cpu(path->sl_tclass_flowlabel) >> 28;
+	ib_ah_attr->src_path_bits = path->g_mylmc & 0x7f;
+	ib_ah_attr->static_rate   = mthca_rate_to_ib(dev,
+						     path->static_rate & 0xf,
+						     ib_ah_attr->port_num);
+	ib_ah_attr->ah_flags      = (path->g_mylmc & (1 << 7)) ? IB_AH_GRH : 0;
+	if (ib_ah_attr->ah_flags) {
+		ib_ah_attr->grh.sgid_index = path->mgid_index & (dev->limits.gid_table_len - 1);
+		ib_ah_attr->grh.hop_limit  = path->hop_limit;
+		ib_ah_attr->grh.traffic_class =
+			(be32_to_cpu(path->sl_tclass_flowlabel) >> 20) & 0xff;
+		ib_ah_attr->grh.flow_label =
+			be32_to_cpu(path->sl_tclass_flowlabel) & 0xfffff;
+		memcpy(ib_ah_attr->grh.dgid.raw,
+			path->rgid, sizeof ib_ah_attr->grh.dgid.raw);
+	}
+}
+
+int mthca_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask,
+		   struct ib_qp_init_attr *qp_init_attr)
+{
+	struct mthca_dev *dev = to_mdev(ibqp->device);
+	struct mthca_qp *qp = to_mqp(ibqp);
+	int err = 0;
+	struct mthca_mailbox *mailbox = NULL;
+	struct mthca_qp_param *qp_param;
+	struct mthca_qp_context *context;
+	int mthca_state;
+
+	mutex_lock(&qp->mutex);
+
+	if (qp->state == IB_QPS_RESET) {
+		qp_attr->qp_state = IB_QPS_RESET;
+		goto done;
+	}
+
+	mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+	if (IS_ERR(mailbox)) {
+		err = PTR_ERR(mailbox);
+		goto out;
+	}
+
+	err = mthca_QUERY_QP(dev, qp->qpn, 0, mailbox);
+	if (err) {
+		mthca_warn(dev, "QUERY_QP failed (%d)\n", err);
+		goto out_mailbox;
+	}
+
+	qp_param    = mailbox->buf;
+	context     = &qp_param->context;
+	mthca_state = be32_to_cpu(context->flags) >> 28;
+
+	qp->state		     = to_ib_qp_state(mthca_state);
+	qp_attr->qp_state	     = qp->state;
+	qp_attr->path_mtu 	     = context->mtu_msgmax >> 5;
+	qp_attr->path_mig_state      =
+		to_ib_mig_state((be32_to_cpu(context->flags) >> 11) & 0x3);
+	qp_attr->qkey 		     = be32_to_cpu(context->qkey);
+	qp_attr->rq_psn 	     = be32_to_cpu(context->rnr_nextrecvpsn) & 0xffffff;
+	qp_attr->sq_psn 	     = be32_to_cpu(context->next_send_psn) & 0xffffff;
+	qp_attr->dest_qp_num 	     = be32_to_cpu(context->remote_qpn) & 0xffffff;
+	qp_attr->qp_access_flags     =
+		to_ib_qp_access_flags(be32_to_cpu(context->params2));
+
+	if (qp->transport == RC || qp->transport == UC) {
+		to_ib_ah_attr(dev, &qp_attr->ah_attr, &context->pri_path);
+		to_ib_ah_attr(dev, &qp_attr->alt_ah_attr, &context->alt_path);
+		qp_attr->alt_pkey_index =
+			be32_to_cpu(context->alt_path.port_pkey) & 0x7f;
+		qp_attr->alt_port_num 	= qp_attr->alt_ah_attr.port_num;
+	}
+
+	qp_attr->pkey_index = be32_to_cpu(context->pri_path.port_pkey) & 0x7f;
+	qp_attr->port_num   =
+		(be32_to_cpu(context->pri_path.port_pkey) >> 24) & 0x3;
+
+	/* qp_attr->en_sqd_async_notify is only applicable in modify qp */
+	qp_attr->sq_draining = mthca_state == MTHCA_QP_STATE_DRAINING;
+
+	qp_attr->max_rd_atomic = 1 << ((be32_to_cpu(context->params1) >> 21) & 0x7);
+
+	qp_attr->max_dest_rd_atomic =
+		1 << ((be32_to_cpu(context->params2) >> 21) & 0x7);
+	qp_attr->min_rnr_timer 	    =
+		(be32_to_cpu(context->rnr_nextrecvpsn) >> 24) & 0x1f;
+	qp_attr->timeout 	    = context->pri_path.ackto >> 3;
+	qp_attr->retry_cnt 	    = (be32_to_cpu(context->params1) >> 16) & 0x7;
+	qp_attr->rnr_retry 	    = context->pri_path.rnr_retry >> 5;
+	qp_attr->alt_timeout 	    = context->alt_path.ackto >> 3;
+
+done:
+	qp_attr->cur_qp_state	     = qp_attr->qp_state;
+	qp_attr->cap.max_send_wr     = qp->sq.max;
+	qp_attr->cap.max_recv_wr     = qp->rq.max;
+	qp_attr->cap.max_send_sge    = qp->sq.max_gs;
+	qp_attr->cap.max_recv_sge    = qp->rq.max_gs;
+	qp_attr->cap.max_inline_data = qp->max_inline_data;
+
+	qp_init_attr->cap	     = qp_attr->cap;
+	qp_init_attr->sq_sig_type    = qp->sq_policy;
+
+out_mailbox:
+	mthca_free_mailbox(dev, mailbox);
+
+out:
+	mutex_unlock(&qp->mutex);
+	return err;
+}
+
+static int mthca_path_set(struct mthca_dev *dev, const struct ib_ah_attr *ah,
+			  struct mthca_qp_path *path, u8 port)
+{
+	path->g_mylmc     = ah->src_path_bits & 0x7f;
+	path->rlid        = cpu_to_be16(ah->dlid);
+	path->static_rate = mthca_get_rate(dev, ah->static_rate, port);
+
+	if (ah->ah_flags & IB_AH_GRH) {
+		if (ah->grh.sgid_index >= dev->limits.gid_table_len) {
+			mthca_dbg(dev, "sgid_index (%u) too large. max is %d\n",
+				  ah->grh.sgid_index, dev->limits.gid_table_len-1);
+			return -1;
+		}
+
+		path->g_mylmc   |= 1 << 7;
+		path->mgid_index = ah->grh.sgid_index;
+		path->hop_limit  = ah->grh.hop_limit;
+		path->sl_tclass_flowlabel =
+			cpu_to_be32((ah->sl << 28)                |
+				    (ah->grh.traffic_class << 20) |
+				    (ah->grh.flow_label));
+		memcpy(path->rgid, ah->grh.dgid.raw, 16);
+	} else
+		path->sl_tclass_flowlabel = cpu_to_be32(ah->sl << 28);
+
+	return 0;
+}
+
+static int __mthca_modify_qp(struct ib_qp *ibqp,
+			     const struct ib_qp_attr *attr, int attr_mask,
+			     enum ib_qp_state cur_state, enum ib_qp_state new_state)
+{
+	struct mthca_dev *dev = to_mdev(ibqp->device);
+	struct mthca_qp *qp = to_mqp(ibqp);
+	struct mthca_mailbox *mailbox;
+	struct mthca_qp_param *qp_param;
+	struct mthca_qp_context *qp_context;
+	u32 sqd_event = 0;
+	int err = -EINVAL;
+
+	mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+	if (IS_ERR(mailbox)) {
+		err = PTR_ERR(mailbox);
+		goto out;
+	}
+	qp_param = mailbox->buf;
+	qp_context = &qp_param->context;
+	memset(qp_param, 0, sizeof *qp_param);
+
+	qp_context->flags      = cpu_to_be32((to_mthca_state(new_state) << 28) |
+					     (to_mthca_st(qp->transport) << 16));
+	qp_context->flags     |= cpu_to_be32(MTHCA_QP_BIT_DE);
+	if (!(attr_mask & IB_QP_PATH_MIG_STATE))
+		qp_context->flags |= cpu_to_be32(MTHCA_QP_PM_MIGRATED << 11);
+	else {
+		qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_PM_STATE);
+		switch (attr->path_mig_state) {
+		case IB_MIG_MIGRATED:
+			qp_context->flags |= cpu_to_be32(MTHCA_QP_PM_MIGRATED << 11);
+			break;
+		case IB_MIG_REARM:
+			qp_context->flags |= cpu_to_be32(MTHCA_QP_PM_REARM << 11);
+			break;
+		case IB_MIG_ARMED:
+			qp_context->flags |= cpu_to_be32(MTHCA_QP_PM_ARMED << 11);
+			break;
+		}
+	}
+
+	/* leave tavor_sched_queue as 0 */
+
+	if (qp->transport == MLX || qp->transport == UD)
+		qp_context->mtu_msgmax = (IB_MTU_2048 << 5) | 11;
+	else if (attr_mask & IB_QP_PATH_MTU) {
+		if (attr->path_mtu < IB_MTU_256 || attr->path_mtu > IB_MTU_2048) {
+			mthca_dbg(dev, "path MTU (%u) is invalid\n",
+				  attr->path_mtu);
+			goto out_mailbox;
+		}
+		qp_context->mtu_msgmax = (attr->path_mtu << 5) | 31;
+	}
+
+	if (mthca_is_memfree(dev)) {
+		if (qp->rq.max)
+			qp_context->rq_size_stride = ilog2(qp->rq.max) << 3;
+		qp_context->rq_size_stride |= qp->rq.wqe_shift - 4;
+
+		if (qp->sq.max)
+			qp_context->sq_size_stride = ilog2(qp->sq.max) << 3;
+		qp_context->sq_size_stride |= qp->sq.wqe_shift - 4;
+	}
+
+	/* leave arbel_sched_queue as 0 */
+
+	if (qp->ibqp.uobject)
+		qp_context->usr_page =
+			cpu_to_be32(to_mucontext(qp->ibqp.uobject->context)->uar.index);
+	else
+		qp_context->usr_page = cpu_to_be32(dev->driver_uar.index);
+	qp_context->local_qpn  = cpu_to_be32(qp->qpn);
+	if (attr_mask & IB_QP_DEST_QPN) {
+		qp_context->remote_qpn = cpu_to_be32(attr->dest_qp_num);
+	}
+
+	if (qp->transport == MLX)
+		qp_context->pri_path.port_pkey |=
+			cpu_to_be32(qp->port << 24);
+	else {
+		if (attr_mask & IB_QP_PORT) {
+			qp_context->pri_path.port_pkey |=
+				cpu_to_be32(attr->port_num << 24);
+			qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_PORT_NUM);
+		}
+	}
+
+	if (attr_mask & IB_QP_PKEY_INDEX) {
+		qp_context->pri_path.port_pkey |=
+			cpu_to_be32(attr->pkey_index);
+		qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_PKEY_INDEX);
+	}
+
+	if (attr_mask & IB_QP_RNR_RETRY) {
+		qp_context->alt_path.rnr_retry = qp_context->pri_path.rnr_retry =
+			attr->rnr_retry << 5;
+		qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RNR_RETRY |
+							MTHCA_QP_OPTPAR_ALT_RNR_RETRY);
+	}
+
+	if (attr_mask & IB_QP_AV) {
+		if (mthca_path_set(dev, &attr->ah_attr, &qp_context->pri_path,
+				   attr_mask & IB_QP_PORT ? attr->port_num : qp->port))
+			goto out_mailbox;
+
+		qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_PRIMARY_ADDR_PATH);
+	}
+
+	if (ibqp->qp_type == IB_QPT_RC &&
+	    cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) {
+		u8 sched_queue = ibqp->uobject ? 0x2 : 0x1;
+
+		if (mthca_is_memfree(dev))
+			qp_context->rlkey_arbel_sched_queue |= sched_queue;
+		else
+			qp_context->tavor_sched_queue |= cpu_to_be32(sched_queue);
+
+		qp_param->opt_param_mask |=
+			cpu_to_be32(MTHCA_QP_OPTPAR_SCHED_QUEUE);
+	}
+
+	if (attr_mask & IB_QP_TIMEOUT) {
+		qp_context->pri_path.ackto = attr->timeout << 3;
+		qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_ACK_TIMEOUT);
+	}
+
+	if (attr_mask & IB_QP_ALT_PATH) {
+		if (attr->alt_pkey_index >= dev->limits.pkey_table_len) {
+			mthca_dbg(dev, "Alternate P_Key index (%u) too large. max is %d\n",
+				  attr->alt_pkey_index, dev->limits.pkey_table_len-1);
+			goto out_mailbox;
+		}
+
+		if (attr->alt_port_num == 0 || attr->alt_port_num > dev->limits.num_ports) {
+			mthca_dbg(dev, "Alternate port number (%u) is invalid\n",
+				attr->alt_port_num);
+			goto out_mailbox;
+		}
+
+		if (mthca_path_set(dev, &attr->alt_ah_attr, &qp_context->alt_path,
+				   attr->alt_ah_attr.port_num))
+			goto out_mailbox;
+
+		qp_context->alt_path.port_pkey |= cpu_to_be32(attr->alt_pkey_index |
+							      attr->alt_port_num << 24);
+		qp_context->alt_path.ackto = attr->alt_timeout << 3;
+		qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_ALT_ADDR_PATH);
+	}
+
+	/* leave rdd as 0 */
+	qp_context->pd         = cpu_to_be32(to_mpd(ibqp->pd)->pd_num);
+	/* leave wqe_base as 0 (we always create an MR based at 0 for WQs) */
+	qp_context->wqe_lkey   = cpu_to_be32(qp->mr.ibmr.lkey);
+	qp_context->params1    = cpu_to_be32((MTHCA_ACK_REQ_FREQ << 28) |
+					     (MTHCA_FLIGHT_LIMIT << 24) |
+					     MTHCA_QP_BIT_SWE);
+	if (qp->sq_policy == IB_SIGNAL_ALL_WR)
+		qp_context->params1 |= cpu_to_be32(MTHCA_QP_BIT_SSC);
+	if (attr_mask & IB_QP_RETRY_CNT) {
+		qp_context->params1 |= cpu_to_be32(attr->retry_cnt << 16);
+		qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RETRY_COUNT);
+	}
+
+	if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) {
+		if (attr->max_rd_atomic) {
+			qp_context->params1 |=
+				cpu_to_be32(MTHCA_QP_BIT_SRE |
+					    MTHCA_QP_BIT_SAE);
+			qp_context->params1 |=
+				cpu_to_be32(fls(attr->max_rd_atomic - 1) << 21);
+		}
+		qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_SRA_MAX);
+	}
+
+	if (attr_mask & IB_QP_SQ_PSN)
+		qp_context->next_send_psn = cpu_to_be32(attr->sq_psn);
+	qp_context->cqn_snd = cpu_to_be32(to_mcq(ibqp->send_cq)->cqn);
+
+	if (mthca_is_memfree(dev)) {
+		qp_context->snd_wqe_base_l = cpu_to_be32(qp->send_wqe_offset);
+		qp_context->snd_db_index   = cpu_to_be32(qp->sq.db_index);
+	}
+
+	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) {
+		if (attr->max_dest_rd_atomic)
+			qp_context->params2 |=
+				cpu_to_be32(fls(attr->max_dest_rd_atomic - 1) << 21);
+
+		qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RRA_MAX);
+	}
+
+	if (attr_mask & (IB_QP_ACCESS_FLAGS | IB_QP_MAX_DEST_RD_ATOMIC)) {
+		qp_context->params2      |= get_hw_access_flags(qp, attr, attr_mask);
+		qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RWE |
+							MTHCA_QP_OPTPAR_RRE |
+							MTHCA_QP_OPTPAR_RAE);
+	}
+
+	qp_context->params2 |= cpu_to_be32(MTHCA_QP_BIT_RSC);
+
+	if (ibqp->srq)
+		qp_context->params2 |= cpu_to_be32(MTHCA_QP_BIT_RIC);
+
+	if (attr_mask & IB_QP_MIN_RNR_TIMER) {
+		qp_context->rnr_nextrecvpsn |= cpu_to_be32(attr->min_rnr_timer << 24);
+		qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RNR_TIMEOUT);
+	}
+	if (attr_mask & IB_QP_RQ_PSN)
+		qp_context->rnr_nextrecvpsn |= cpu_to_be32(attr->rq_psn);
+
+	qp_context->ra_buff_indx =
+		cpu_to_be32(dev->qp_table.rdb_base +
+			    ((qp->qpn & (dev->limits.num_qps - 1)) * MTHCA_RDB_ENTRY_SIZE <<
+			     dev->qp_table.rdb_shift));
+
+	qp_context->cqn_rcv = cpu_to_be32(to_mcq(ibqp->recv_cq)->cqn);
+
+	if (mthca_is_memfree(dev))
+		qp_context->rcv_db_index   = cpu_to_be32(qp->rq.db_index);
+
+	if (attr_mask & IB_QP_QKEY) {
+		qp_context->qkey = cpu_to_be32(attr->qkey);
+		qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_Q_KEY);
+	}
+
+	if (ibqp->srq)
+		qp_context->srqn = cpu_to_be32(1 << 24 |
+					       to_msrq(ibqp->srq)->srqn);
+
+	if (cur_state == IB_QPS_RTS && new_state == IB_QPS_SQD	&&
+	    attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY		&&
+	    attr->en_sqd_async_notify)
+		sqd_event = 1 << 31;
+
+	err = mthca_MODIFY_QP(dev, cur_state, new_state, qp->qpn, 0,
+			      mailbox, sqd_event);
+	if (err) {
+		mthca_warn(dev, "modify QP %d->%d returned %d.\n",
+			   cur_state, new_state, err);
+		goto out_mailbox;
+	}
+
+	qp->state = new_state;
+	if (attr_mask & IB_QP_ACCESS_FLAGS)
+		qp->atomic_rd_en = attr->qp_access_flags;
+	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
+		qp->resp_depth = attr->max_dest_rd_atomic;
+	if (attr_mask & IB_QP_PORT)
+		qp->port = attr->port_num;
+	if (attr_mask & IB_QP_ALT_PATH)
+		qp->alt_port = attr->alt_port_num;
+
+	if (is_sqp(dev, qp))
+		store_attrs(to_msqp(qp), attr, attr_mask);
+
+	/*
+	 * If we moved QP0 to RTR, bring the IB link up; if we moved
+	 * QP0 to RESET or ERROR, bring the link back down.
+	 */
+	if (is_qp0(dev, qp)) {
+		if (cur_state != IB_QPS_RTR &&
+		    new_state == IB_QPS_RTR)
+			init_port(dev, qp->port);
+
+		if (cur_state != IB_QPS_RESET &&
+		    cur_state != IB_QPS_ERR &&
+		    (new_state == IB_QPS_RESET ||
+		     new_state == IB_QPS_ERR))
+			mthca_CLOSE_IB(dev, qp->port);
+	}
+
+	/*
+	 * If we moved a kernel QP to RESET, clean up all old CQ
+	 * entries and reinitialize the QP.
+	 */
+	if (new_state == IB_QPS_RESET && !qp->ibqp.uobject) {
+		mthca_cq_clean(dev, to_mcq(qp->ibqp.recv_cq), qp->qpn,
+			       qp->ibqp.srq ? to_msrq(qp->ibqp.srq) : NULL);
+		if (qp->ibqp.send_cq != qp->ibqp.recv_cq)
+			mthca_cq_clean(dev, to_mcq(qp->ibqp.send_cq), qp->qpn, NULL);
+
+		mthca_wq_reset(&qp->sq);
+		qp->sq.last = get_send_wqe(qp, qp->sq.max - 1);
+
+		mthca_wq_reset(&qp->rq);
+		qp->rq.last = get_recv_wqe(qp, qp->rq.max - 1);
+
+		if (mthca_is_memfree(dev)) {
+			*qp->sq.db = 0;
+			*qp->rq.db = 0;
+		}
+	}
+
+out_mailbox:
+	mthca_free_mailbox(dev, mailbox);
+out:
+	return err;
+}
+
+int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask,
+		    struct ib_udata *udata)
+{
+	struct mthca_dev *dev = to_mdev(ibqp->device);
+	struct mthca_qp *qp = to_mqp(ibqp);
+	enum ib_qp_state cur_state, new_state;
+	int err = -EINVAL;
+
+	mutex_lock(&qp->mutex);
+	if (attr_mask & IB_QP_CUR_STATE) {
+		cur_state = attr->cur_qp_state;
+	} else {
+		spin_lock_irq(&qp->sq.lock);
+		spin_lock(&qp->rq.lock);
+		cur_state = qp->state;
+		spin_unlock(&qp->rq.lock);
+		spin_unlock_irq(&qp->sq.lock);
+	}
+
+	new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;
+
+	if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask,
+				IB_LINK_LAYER_UNSPECIFIED)) {
+		mthca_dbg(dev, "Bad QP transition (transport %d) "
+			  "%d->%d with attr 0x%08x\n",
+			  qp->transport, cur_state, new_state,
+			  attr_mask);
+		goto out;
+	}
+
+	if ((attr_mask & IB_QP_PKEY_INDEX) &&
+	     attr->pkey_index >= dev->limits.pkey_table_len) {
+		mthca_dbg(dev, "P_Key index (%u) too large. max is %d\n",
+			  attr->pkey_index, dev->limits.pkey_table_len-1);
+		goto out;
+	}
+
+	if ((attr_mask & IB_QP_PORT) &&
+	    (attr->port_num == 0 || attr->port_num > dev->limits.num_ports)) {
+		mthca_dbg(dev, "Port number (%u) is invalid\n", attr->port_num);
+		goto out;
+	}
+
+	if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC &&
+	    attr->max_rd_atomic > dev->limits.max_qp_init_rdma) {
+		mthca_dbg(dev, "Max rdma_atomic as initiator %u too large (max is %d)\n",
+			  attr->max_rd_atomic, dev->limits.max_qp_init_rdma);
+		goto out;
+	}
+
+	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC &&
+	    attr->max_dest_rd_atomic > 1 << dev->qp_table.rdb_shift) {
+		mthca_dbg(dev, "Max rdma_atomic as responder %u too large (max %d)\n",
+			  attr->max_dest_rd_atomic, 1 << dev->qp_table.rdb_shift);
+		goto out;
+	}
+
+	if (cur_state == new_state && cur_state == IB_QPS_RESET) {
+		err = 0;
+		goto out;
+	}
+
+	err = __mthca_modify_qp(ibqp, attr, attr_mask, cur_state, new_state);
+
+out:
+	mutex_unlock(&qp->mutex);
+	return err;
+}
+
+static int mthca_max_data_size(struct mthca_dev *dev, struct mthca_qp *qp, int desc_sz)
+{
+	/*
+	 * Calculate the maximum size of WQE s/g segments, excluding
+	 * the next segment and other non-data segments.
+	 */
+	int max_data_size = desc_sz - sizeof (struct mthca_next_seg);
+
+	switch (qp->transport) {
+	case MLX:
+		max_data_size -= 2 * sizeof (struct mthca_data_seg);
+		break;
+
+	case UD:
+		if (mthca_is_memfree(dev))
+			max_data_size -= sizeof (struct mthca_arbel_ud_seg);
+		else
+			max_data_size -= sizeof (struct mthca_tavor_ud_seg);
+		break;
+
+	default:
+		max_data_size -= sizeof (struct mthca_raddr_seg);
+		break;
+	}
+
+	return max_data_size;
+}
+
+static inline int mthca_max_inline_data(struct mthca_pd *pd, int max_data_size)
+{
+	/* We don't support inline data for kernel QPs (yet). */
+	return pd->ibpd.uobject ? max_data_size - MTHCA_INLINE_HEADER_SIZE : 0;
+}
+
+static void mthca_adjust_qp_caps(struct mthca_dev *dev,
+				 struct mthca_pd *pd,
+				 struct mthca_qp *qp)
+{
+	int max_data_size = mthca_max_data_size(dev, qp,
+						min(dev->limits.max_desc_sz,
+						    1 << qp->sq.wqe_shift));
+
+	qp->max_inline_data = mthca_max_inline_data(pd, max_data_size);
+
+	qp->sq.max_gs = min_t(int, dev->limits.max_sg,
+			      max_data_size / sizeof (struct mthca_data_seg));
+	qp->rq.max_gs = min_t(int, dev->limits.max_sg,
+			       (min(dev->limits.max_desc_sz, 1 << qp->rq.wqe_shift) -
+				sizeof (struct mthca_next_seg)) /
+			       sizeof (struct mthca_data_seg));
+}
+
+/*
+ * Allocate and register buffer for WQEs.  qp->rq.max, sq.max,
+ * rq.max_gs and sq.max_gs must all be assigned.
+ * mthca_alloc_wqe_buf will calculate rq.wqe_shift and
+ * sq.wqe_shift (as well as send_wqe_offset, is_direct, and
+ * queue)
+ */
+static int mthca_alloc_wqe_buf(struct mthca_dev *dev,
+			       struct mthca_pd *pd,
+			       struct mthca_qp *qp)
+{
+	int size;
+	int err = -ENOMEM;
+
+	size = sizeof (struct mthca_next_seg) +
+		qp->rq.max_gs * sizeof (struct mthca_data_seg);
+
+	if (size > dev->limits.max_desc_sz)
+		return -EINVAL;
+
+	for (qp->rq.wqe_shift = 6; 1 << qp->rq.wqe_shift < size;
+	     qp->rq.wqe_shift++)
+		; /* nothing */
+
+	size = qp->sq.max_gs * sizeof (struct mthca_data_seg);
+	switch (qp->transport) {
+	case MLX:
+		size += 2 * sizeof (struct mthca_data_seg);
+		break;
+
+	case UD:
+		size += mthca_is_memfree(dev) ?
+			sizeof (struct mthca_arbel_ud_seg) :
+			sizeof (struct mthca_tavor_ud_seg);
+		break;
+
+	case UC:
+		size += sizeof (struct mthca_raddr_seg);
+		break;
+
+	case RC:
+		size += sizeof (struct mthca_raddr_seg);
+		/*
+		 * An atomic op will require an atomic segment, a
+		 * remote address segment and one scatter entry.
+		 */
+		size = max_t(int, size,
+			     sizeof (struct mthca_atomic_seg) +
+			     sizeof (struct mthca_raddr_seg) +
+			     sizeof (struct mthca_data_seg));
+		break;
+
+	default:
+		break;
+	}
+
+	/* Make sure that we have enough space for a bind request */
+	size = max_t(int, size, sizeof (struct mthca_bind_seg));
+
+	size += sizeof (struct mthca_next_seg);
+
+	if (size > dev->limits.max_desc_sz)
+		return -EINVAL;
+
+	for (qp->sq.wqe_shift = 6; 1 << qp->sq.wqe_shift < size;
+	     qp->sq.wqe_shift++)
+		; /* nothing */
+
+	qp->send_wqe_offset = ALIGN(qp->rq.max << qp->rq.wqe_shift,
+				    1 << qp->sq.wqe_shift);
+
+	/*
+	 * If this is a userspace QP, we don't actually have to
+	 * allocate anything.  All we need is to calculate the WQE
+	 * sizes and the send_wqe_offset, so we're done now.
+	 */
+	if (pd->ibpd.uobject)
+		return 0;
+
+	size = PAGE_ALIGN(qp->send_wqe_offset +
+			  (qp->sq.max << qp->sq.wqe_shift));
+
+	qp->wrid = kmalloc((qp->rq.max + qp->sq.max) * sizeof (u64),
+			   GFP_KERNEL);
+	if (!qp->wrid)
+		goto err_out;
+
+	err = mthca_buf_alloc(dev, size, MTHCA_MAX_DIRECT_QP_SIZE,
+			      &qp->queue, &qp->is_direct, pd, 0, &qp->mr);
+	if (err)
+		goto err_out;
+
+	return 0;
+
+err_out:
+	kfree(qp->wrid);
+	return err;
+}
+
+static void mthca_free_wqe_buf(struct mthca_dev *dev,
+			       struct mthca_qp *qp)
+{
+	mthca_buf_free(dev, PAGE_ALIGN(qp->send_wqe_offset +
+				       (qp->sq.max << qp->sq.wqe_shift)),
+		       &qp->queue, qp->is_direct, &qp->mr);
+	kfree(qp->wrid);
+}
+
+static int mthca_map_memfree(struct mthca_dev *dev,
+			     struct mthca_qp *qp)
+{
+	int ret;
+
+	if (mthca_is_memfree(dev)) {
+		ret = mthca_table_get(dev, dev->qp_table.qp_table, qp->qpn);
+		if (ret)
+			return ret;
+
+		ret = mthca_table_get(dev, dev->qp_table.eqp_table, qp->qpn);
+		if (ret)
+			goto err_qpc;
+
+		ret = mthca_table_get(dev, dev->qp_table.rdb_table,
+				      qp->qpn << dev->qp_table.rdb_shift);
+		if (ret)
+			goto err_eqpc;
+
+	}
+
+	return 0;
+
+err_eqpc:
+	mthca_table_put(dev, dev->qp_table.eqp_table, qp->qpn);
+
+err_qpc:
+	mthca_table_put(dev, dev->qp_table.qp_table, qp->qpn);
+
+	return ret;
+}
+
+static void mthca_unmap_memfree(struct mthca_dev *dev,
+				struct mthca_qp *qp)
+{
+	mthca_table_put(dev, dev->qp_table.rdb_table,
+			qp->qpn << dev->qp_table.rdb_shift);
+	mthca_table_put(dev, dev->qp_table.eqp_table, qp->qpn);
+	mthca_table_put(dev, dev->qp_table.qp_table, qp->qpn);
+}
+
+static int mthca_alloc_memfree(struct mthca_dev *dev,
+			       struct mthca_qp *qp)
+{
+	if (mthca_is_memfree(dev)) {
+		qp->rq.db_index = mthca_alloc_db(dev, MTHCA_DB_TYPE_RQ,
+						 qp->qpn, &qp->rq.db);
+		if (qp->rq.db_index < 0)
+			return -ENOMEM;
+
+		qp->sq.db_index = mthca_alloc_db(dev, MTHCA_DB_TYPE_SQ,
+						 qp->qpn, &qp->sq.db);
+		if (qp->sq.db_index < 0) {
+			mthca_free_db(dev, MTHCA_DB_TYPE_RQ, qp->rq.db_index);
+			return -ENOMEM;
+		}
+	}
+
+	return 0;
+}
+
+static void mthca_free_memfree(struct mthca_dev *dev,
+			       struct mthca_qp *qp)
+{
+	if (mthca_is_memfree(dev)) {
+		mthca_free_db(dev, MTHCA_DB_TYPE_SQ, qp->sq.db_index);
+		mthca_free_db(dev, MTHCA_DB_TYPE_RQ, qp->rq.db_index);
+	}
+}
+
+static int mthca_alloc_qp_common(struct mthca_dev *dev,
+				 struct mthca_pd *pd,
+				 struct mthca_cq *send_cq,
+				 struct mthca_cq *recv_cq,
+				 enum ib_sig_type send_policy,
+				 struct mthca_qp *qp)
+{
+	int ret;
+	int i;
+	struct mthca_next_seg *next;
+
+	qp->refcount = 1;
+	init_waitqueue_head(&qp->wait);
+	mutex_init(&qp->mutex);
+	qp->state    	 = IB_QPS_RESET;
+	qp->atomic_rd_en = 0;
+	qp->resp_depth   = 0;
+	qp->sq_policy    = send_policy;
+	mthca_wq_reset(&qp->sq);
+	mthca_wq_reset(&qp->rq);
+
+	spin_lock_init(&qp->sq.lock);
+	spin_lock_init(&qp->rq.lock);
+
+	ret = mthca_map_memfree(dev, qp);
+	if (ret)
+		return ret;
+
+	ret = mthca_alloc_wqe_buf(dev, pd, qp);
+	if (ret) {
+		mthca_unmap_memfree(dev, qp);
+		return ret;
+	}
+
+	mthca_adjust_qp_caps(dev, pd, qp);
+
+	/*
+	 * If this is a userspace QP, we're done now.  The doorbells
+	 * will be allocated and buffers will be initialized in
+	 * userspace.
+	 */
+	if (pd->ibpd.uobject)
+		return 0;
+
+	ret = mthca_alloc_memfree(dev, qp);
+	if (ret) {
+		mthca_free_wqe_buf(dev, qp);
+		mthca_unmap_memfree(dev, qp);
+		return ret;
+	}
+
+	if (mthca_is_memfree(dev)) {
+		struct mthca_data_seg *scatter;
+		int size = (sizeof (struct mthca_next_seg) +
+			    qp->rq.max_gs * sizeof (struct mthca_data_seg)) / 16;
+
+		for (i = 0; i < qp->rq.max; ++i) {
+			next = get_recv_wqe(qp, i);
+			next->nda_op = cpu_to_be32(((i + 1) & (qp->rq.max - 1)) <<
+						   qp->rq.wqe_shift);
+			next->ee_nds = cpu_to_be32(size);
+
+			for (scatter = (void *) (next + 1);
+			     (void *) scatter < (void *) next + (1 << qp->rq.wqe_shift);
+			     ++scatter)
+				scatter->lkey = cpu_to_be32(MTHCA_INVAL_LKEY);
+		}
+
+		for (i = 0; i < qp->sq.max; ++i) {
+			next = get_send_wqe(qp, i);
+			next->nda_op = cpu_to_be32((((i + 1) & (qp->sq.max - 1)) <<
+						    qp->sq.wqe_shift) +
+						   qp->send_wqe_offset);
+		}
+	} else {
+		for (i = 0; i < qp->rq.max; ++i) {
+			next = get_recv_wqe(qp, i);
+			next->nda_op = htonl((((i + 1) % qp->rq.max) <<
+					      qp->rq.wqe_shift) | 1);
+		}
+
+	}
+
+	qp->sq.last = get_send_wqe(qp, qp->sq.max - 1);
+	qp->rq.last = get_recv_wqe(qp, qp->rq.max - 1);
+
+	return 0;
+}
+
+static int mthca_set_qp_size(struct mthca_dev *dev, struct ib_qp_cap *cap,
+			     struct mthca_pd *pd, struct mthca_qp *qp)
+{
+	int max_data_size = mthca_max_data_size(dev, qp, dev->limits.max_desc_sz);
+
+	/* Sanity check QP size before proceeding */
+	if (cap->max_send_wr  	 > dev->limits.max_wqes ||
+	    cap->max_recv_wr  	 > dev->limits.max_wqes ||
+	    cap->max_send_sge 	 > dev->limits.max_sg   ||
+	    cap->max_recv_sge 	 > dev->limits.max_sg   ||
+	    cap->max_inline_data > mthca_max_inline_data(pd, max_data_size))
+		return -EINVAL;
+
+	/*
+	 * For MLX transport we need 2 extra send gather entries:
+	 * one for the header and one for the checksum at the end
+	 */
+	if (qp->transport == MLX && cap->max_send_sge + 2 > dev->limits.max_sg)
+		return -EINVAL;
+
+	if (mthca_is_memfree(dev)) {
+		qp->rq.max = cap->max_recv_wr ?
+			roundup_pow_of_two(cap->max_recv_wr) : 0;
+		qp->sq.max = cap->max_send_wr ?
+			roundup_pow_of_two(cap->max_send_wr) : 0;
+	} else {
+		qp->rq.max = cap->max_recv_wr;
+		qp->sq.max = cap->max_send_wr;
+	}
+
+	qp->rq.max_gs = cap->max_recv_sge;
+	qp->sq.max_gs = max_t(int, cap->max_send_sge,
+			      ALIGN(cap->max_inline_data + MTHCA_INLINE_HEADER_SIZE,
+				    MTHCA_INLINE_CHUNK_SIZE) /
+			      sizeof (struct mthca_data_seg));
+
+	return 0;
+}
+
+int mthca_alloc_qp(struct mthca_dev *dev,
+		   struct mthca_pd *pd,
+		   struct mthca_cq *send_cq,
+		   struct mthca_cq *recv_cq,
+		   enum ib_qp_type type,
+		   enum ib_sig_type send_policy,
+		   struct ib_qp_cap *cap,
+		   struct mthca_qp *qp)
+{
+	int err;
+
+	switch (type) {
+	case IB_QPT_RC: qp->transport = RC; break;
+	case IB_QPT_UC: qp->transport = UC; break;
+	case IB_QPT_UD: qp->transport = UD; break;
+	default: return -EINVAL;
+	}
+
+	err = mthca_set_qp_size(dev, cap, pd, qp);
+	if (err)
+		return err;
+
+	qp->qpn = mthca_alloc(&dev->qp_table.alloc);
+	if (qp->qpn == -1)
+		return -ENOMEM;
+
+	/* initialize port to zero for error-catching. */
+	qp->port = 0;
+
+	err = mthca_alloc_qp_common(dev, pd, send_cq, recv_cq,
+				    send_policy, qp);
+	if (err) {
+		mthca_free(&dev->qp_table.alloc, qp->qpn);
+		return err;
+	}
+
+	spin_lock_irq(&dev->qp_table.lock);
+	mthca_array_set(&dev->qp_table.qp,
+			qp->qpn & (dev->limits.num_qps - 1), qp);
+	spin_unlock_irq(&dev->qp_table.lock);
+
+	return 0;
+}
+
+static void mthca_lock_cqs(struct mthca_cq *send_cq, struct mthca_cq *recv_cq)
+	__acquires(&send_cq->lock) __acquires(&recv_cq->lock)
+{
+	if (send_cq == recv_cq) {
+		spin_lock_irq(&send_cq->lock);
+		__acquire(&recv_cq->lock);
+	} else if (send_cq->cqn < recv_cq->cqn) {
+		spin_lock_irq(&send_cq->lock);
+		spin_lock_nested(&recv_cq->lock, SINGLE_DEPTH_NESTING);
+	} else {
+		spin_lock_irq(&recv_cq->lock);
+		spin_lock_nested(&send_cq->lock, SINGLE_DEPTH_NESTING);
+	}
+}
+
+static void mthca_unlock_cqs(struct mthca_cq *send_cq, struct mthca_cq *recv_cq)
+	__releases(&send_cq->lock) __releases(&recv_cq->lock)
+{
+	if (send_cq == recv_cq) {
+		__release(&recv_cq->lock);
+		spin_unlock_irq(&send_cq->lock);
+	} else if (send_cq->cqn < recv_cq->cqn) {
+		spin_unlock(&recv_cq->lock);
+		spin_unlock_irq(&send_cq->lock);
+	} else {
+		spin_unlock(&send_cq->lock);
+		spin_unlock_irq(&recv_cq->lock);
+	}
+}
+
+int mthca_alloc_sqp(struct mthca_dev *dev,
+		    struct mthca_pd *pd,
+		    struct mthca_cq *send_cq,
+		    struct mthca_cq *recv_cq,
+		    enum ib_sig_type send_policy,
+		    struct ib_qp_cap *cap,
+		    int qpn,
+		    int port,
+		    struct mthca_sqp *sqp)
+{
+	u32 mqpn = qpn * 2 + dev->qp_table.sqp_start + port - 1;
+	int err;
+
+	sqp->qp.transport = MLX;
+	err = mthca_set_qp_size(dev, cap, pd, &sqp->qp);
+	if (err)
+		return err;
+
+	sqp->header_buf_size = sqp->qp.sq.max * MTHCA_UD_HEADER_SIZE;
+	sqp->header_buf = dma_alloc_coherent(&dev->pdev->dev, sqp->header_buf_size,
+					     &sqp->header_dma, GFP_KERNEL);
+	if (!sqp->header_buf)
+		return -ENOMEM;
+
+	spin_lock_irq(&dev->qp_table.lock);
+	if (mthca_array_get(&dev->qp_table.qp, mqpn))
+		err = -EBUSY;
+	else
+		mthca_array_set(&dev->qp_table.qp, mqpn, sqp);
+	spin_unlock_irq(&dev->qp_table.lock);
+
+	if (err)
+		goto err_out;
+
+	sqp->qp.port      = port;
+	sqp->qp.qpn       = mqpn;
+	sqp->qp.transport = MLX;
+
+	err = mthca_alloc_qp_common(dev, pd, send_cq, recv_cq,
+				    send_policy, &sqp->qp);
+	if (err)
+		goto err_out_free;
+
+	atomic_inc(&pd->sqp_count);
+
+	return 0;
+
+ err_out_free:
+	/*
+	 * Lock CQs here, so that CQ polling code can do QP lookup
+	 * without taking a lock.
+	 */
+	mthca_lock_cqs(send_cq, recv_cq);
+
+	spin_lock(&dev->qp_table.lock);
+	mthca_array_clear(&dev->qp_table.qp, mqpn);
+	spin_unlock(&dev->qp_table.lock);
+
+	mthca_unlock_cqs(send_cq, recv_cq);
+
+ err_out:
+	dma_free_coherent(&dev->pdev->dev, sqp->header_buf_size,
+			  sqp->header_buf, sqp->header_dma);
+
+	return err;
+}
+
+static inline int get_qp_refcount(struct mthca_dev *dev, struct mthca_qp *qp)
+{
+	int c;
+
+	spin_lock_irq(&dev->qp_table.lock);
+	c = qp->refcount;
+	spin_unlock_irq(&dev->qp_table.lock);
+
+	return c;
+}
+
+void mthca_free_qp(struct mthca_dev *dev,
+		   struct mthca_qp *qp)
+{
+	struct mthca_cq *send_cq;
+	struct mthca_cq *recv_cq;
+
+	send_cq = to_mcq(qp->ibqp.send_cq);
+	recv_cq = to_mcq(qp->ibqp.recv_cq);
+
+	/*
+	 * Lock CQs here, so that CQ polling code can do QP lookup
+	 * without taking a lock.
+	 */
+	mthca_lock_cqs(send_cq, recv_cq);
+
+	spin_lock(&dev->qp_table.lock);
+	mthca_array_clear(&dev->qp_table.qp,
+			  qp->qpn & (dev->limits.num_qps - 1));
+	--qp->refcount;
+	spin_unlock(&dev->qp_table.lock);
+
+	mthca_unlock_cqs(send_cq, recv_cq);
+
+	wait_event(qp->wait, !get_qp_refcount(dev, qp));
+
+	if (qp->state != IB_QPS_RESET)
+		mthca_MODIFY_QP(dev, qp->state, IB_QPS_RESET, qp->qpn, 0,
+				NULL, 0);
+
+	/*
+	 * If this is a userspace QP, the buffers, MR, CQs and so on
+	 * will be cleaned up in userspace, so all we have to do is
+	 * unref the mem-free tables and free the QPN in our table.
+	 */
+	if (!qp->ibqp.uobject) {
+		mthca_cq_clean(dev, recv_cq, qp->qpn,
+			       qp->ibqp.srq ? to_msrq(qp->ibqp.srq) : NULL);
+		if (send_cq != recv_cq)
+			mthca_cq_clean(dev, send_cq, qp->qpn, NULL);
+
+		mthca_free_memfree(dev, qp);
+		mthca_free_wqe_buf(dev, qp);
+	}
+
+	mthca_unmap_memfree(dev, qp);
+
+	if (is_sqp(dev, qp)) {
+		atomic_dec(&(to_mpd(qp->ibqp.pd)->sqp_count));
+		dma_free_coherent(&dev->pdev->dev,
+				  to_msqp(qp)->header_buf_size,
+				  to_msqp(qp)->header_buf,
+				  to_msqp(qp)->header_dma);
+	} else
+		mthca_free(&dev->qp_table.alloc, qp->qpn);
+}
+
+/* Create UD header for an MLX send and build a data segment for it */
+static int build_mlx_header(struct mthca_dev *dev, struct mthca_sqp *sqp,
+			    int ind, struct ib_send_wr *wr,
+			    struct mthca_mlx_seg *mlx,
+			    struct mthca_data_seg *data)
+{
+	int header_size;
+	int err;
+	u16 pkey;
+
+	ib_ud_header_init(256, /* assume a MAD */ 1, 0, 0,
+			  mthca_ah_grh_present(to_mah(wr->wr.ud.ah)), 0,
+			  &sqp->ud_header);
+
+	err = mthca_read_ah(dev, to_mah(wr->wr.ud.ah), &sqp->ud_header);
+	if (err)
+		return err;
+	mlx->flags &= ~cpu_to_be32(MTHCA_NEXT_SOLICIT | 1);
+	mlx->flags |= cpu_to_be32((!sqp->qp.ibqp.qp_num ? MTHCA_MLX_VL15 : 0) |
+				  (sqp->ud_header.lrh.destination_lid ==
+				   IB_LID_PERMISSIVE ? MTHCA_MLX_SLR : 0) |
+				  (sqp->ud_header.lrh.service_level << 8));
+	mlx->rlid = sqp->ud_header.lrh.destination_lid;
+	mlx->vcrc = 0;
+
+	switch (wr->opcode) {
+	case IB_WR_SEND:
+		sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY;
+		sqp->ud_header.immediate_present = 0;
+		break;
+	case IB_WR_SEND_WITH_IMM:
+		sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE;
+		sqp->ud_header.immediate_present = 1;
+		sqp->ud_header.immediate_data = wr->ex.imm_data;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	sqp->ud_header.lrh.virtual_lane    = !sqp->qp.ibqp.qp_num ? 15 : 0;
+	if (sqp->ud_header.lrh.destination_lid == IB_LID_PERMISSIVE)
+		sqp->ud_header.lrh.source_lid = IB_LID_PERMISSIVE;
+	sqp->ud_header.bth.solicited_event = !!(wr->send_flags & IB_SEND_SOLICITED);
+	if (!sqp->qp.ibqp.qp_num)
+		ib_get_cached_pkey(&dev->ib_dev, sqp->qp.port,
+				   sqp->pkey_index, &pkey);
+	else
+		ib_get_cached_pkey(&dev->ib_dev, sqp->qp.port,
+				   wr->wr.ud.pkey_index, &pkey);
+	sqp->ud_header.bth.pkey = cpu_to_be16(pkey);
+	sqp->ud_header.bth.destination_qpn = cpu_to_be32(wr->wr.ud.remote_qpn);
+	sqp->ud_header.bth.psn = cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1));
+	sqp->ud_header.deth.qkey = cpu_to_be32(wr->wr.ud.remote_qkey & 0x80000000 ?
+					       sqp->qkey : wr->wr.ud.remote_qkey);
+	sqp->ud_header.deth.source_qpn = cpu_to_be32(sqp->qp.ibqp.qp_num);
+
+	header_size = ib_ud_header_pack(&sqp->ud_header,
+					sqp->header_buf +
+					ind * MTHCA_UD_HEADER_SIZE);
+
+	data->byte_count = cpu_to_be32(header_size);
+	data->lkey       = cpu_to_be32(to_mpd(sqp->qp.ibqp.pd)->ntmr.ibmr.lkey);
+	data->addr       = cpu_to_be64(sqp->header_dma +
+				       ind * MTHCA_UD_HEADER_SIZE);
+
+	return 0;
+}
+
+static inline int mthca_wq_overflow(struct mthca_wq *wq, int nreq,
+				    struct ib_cq *ib_cq)
+{
+	unsigned cur;
+	struct mthca_cq *cq;
+
+	cur = wq->head - wq->tail;
+	if (likely(cur + nreq < wq->max))
+		return 0;
+
+	cq = to_mcq(ib_cq);
+	spin_lock(&cq->lock);
+	cur = wq->head - wq->tail;
+	spin_unlock(&cq->lock);
+
+	return cur + nreq >= wq->max;
+}
+
+static __always_inline void set_raddr_seg(struct mthca_raddr_seg *rseg,
+					  u64 remote_addr, u32 rkey)
+{
+	rseg->raddr    = cpu_to_be64(remote_addr);
+	rseg->rkey     = cpu_to_be32(rkey);
+	rseg->reserved = 0;
+}
+
+static __always_inline void set_atomic_seg(struct mthca_atomic_seg *aseg,
+					   struct ib_send_wr *wr)
+{
+	if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
+		aseg->swap_add = cpu_to_be64(wr->wr.atomic.swap);
+		aseg->compare  = cpu_to_be64(wr->wr.atomic.compare_add);
+	} else {
+		aseg->swap_add = cpu_to_be64(wr->wr.atomic.compare_add);
+		aseg->compare  = 0;
+	}
+
+}
+
+static void set_tavor_ud_seg(struct mthca_tavor_ud_seg *useg,
+			     struct ib_send_wr *wr)
+{
+	useg->lkey    = cpu_to_be32(to_mah(wr->wr.ud.ah)->key);
+	useg->av_addr =	cpu_to_be64(to_mah(wr->wr.ud.ah)->avdma);
+	useg->dqpn    =	cpu_to_be32(wr->wr.ud.remote_qpn);
+	useg->qkey    =	cpu_to_be32(wr->wr.ud.remote_qkey);
+
+}
+
+static void set_arbel_ud_seg(struct mthca_arbel_ud_seg *useg,
+			     struct ib_send_wr *wr)
+{
+	memcpy(useg->av, to_mah(wr->wr.ud.ah)->av, MTHCA_AV_SIZE);
+	useg->dqpn = cpu_to_be32(wr->wr.ud.remote_qpn);
+	useg->qkey = cpu_to_be32(wr->wr.ud.remote_qkey);
+}
+
+int mthca_tavor_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+			  struct ib_send_wr **bad_wr)
+{
+	struct mthca_dev *dev = to_mdev(ibqp->device);
+	struct mthca_qp *qp = to_mqp(ibqp);
+	void *wqe;
+	void *prev_wqe;
+	unsigned long flags;
+	int err = 0;
+	int nreq;
+	int i;
+	int size;
+	/*
+	 * f0 and size0 are only used if nreq != 0, and they will
+	 * always be initialized the first time through the main loop
+	 * before nreq is incremented.  So nreq cannot become non-zero
+	 * without initializing f0 and size0, and they are in fact
+	 * never used uninitialized.
+	 */
+	int uninitialized_var(size0);
+	u32 uninitialized_var(f0);
+	int ind;
+	u8 op0 = 0;
+
+	spin_lock_irqsave(&qp->sq.lock, flags);
+
+	/* XXX check that state is OK to post send */
+
+	ind = qp->sq.next_ind;
+
+	for (nreq = 0; wr; ++nreq, wr = wr->next) {
+		if (mthca_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) {
+			mthca_err(dev, "SQ %06x full (%u head, %u tail,"
+					" %d max, %d nreq)\n", qp->qpn,
+					qp->sq.head, qp->sq.tail,
+					qp->sq.max, nreq);
+			err = -ENOMEM;
+			*bad_wr = wr;
+			goto out;
+		}
+
+		wqe = get_send_wqe(qp, ind);
+		prev_wqe = qp->sq.last;
+		qp->sq.last = wqe;
+
+		((struct mthca_next_seg *) wqe)->nda_op = 0;
+		((struct mthca_next_seg *) wqe)->ee_nds = 0;
+		((struct mthca_next_seg *) wqe)->flags =
+			((wr->send_flags & IB_SEND_SIGNALED) ?
+			 cpu_to_be32(MTHCA_NEXT_CQ_UPDATE) : 0) |
+			((wr->send_flags & IB_SEND_SOLICITED) ?
+			 cpu_to_be32(MTHCA_NEXT_SOLICIT) : 0)   |
+			cpu_to_be32(1);
+		if (wr->opcode == IB_WR_SEND_WITH_IMM ||
+		    wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM)
+			((struct mthca_next_seg *) wqe)->imm = wr->ex.imm_data;
+
+		wqe += sizeof (struct mthca_next_seg);
+		size = sizeof (struct mthca_next_seg) / 16;
+
+		switch (qp->transport) {
+		case RC:
+			switch (wr->opcode) {
+			case IB_WR_ATOMIC_CMP_AND_SWP:
+			case IB_WR_ATOMIC_FETCH_AND_ADD:
+				set_raddr_seg(wqe, wr->wr.atomic.remote_addr,
+					      wr->wr.atomic.rkey);
+				wqe += sizeof (struct mthca_raddr_seg);
+
+				set_atomic_seg(wqe, wr);
+				wqe += sizeof (struct mthca_atomic_seg);
+				size += (sizeof (struct mthca_raddr_seg) +
+					 sizeof (struct mthca_atomic_seg)) / 16;
+				break;
+
+			case IB_WR_RDMA_WRITE:
+			case IB_WR_RDMA_WRITE_WITH_IMM:
+			case IB_WR_RDMA_READ:
+				set_raddr_seg(wqe, wr->wr.rdma.remote_addr,
+					      wr->wr.rdma.rkey);
+				wqe  += sizeof (struct mthca_raddr_seg);
+				size += sizeof (struct mthca_raddr_seg) / 16;
+				break;
+
+			default:
+				/* No extra segments required for sends */
+				break;
+			}
+
+			break;
+
+		case UC:
+			switch (wr->opcode) {
+			case IB_WR_RDMA_WRITE:
+			case IB_WR_RDMA_WRITE_WITH_IMM:
+				set_raddr_seg(wqe, wr->wr.rdma.remote_addr,
+					      wr->wr.rdma.rkey);
+				wqe  += sizeof (struct mthca_raddr_seg);
+				size += sizeof (struct mthca_raddr_seg) / 16;
+				break;
+
+			default:
+				/* No extra segments required for sends */
+				break;
+			}
+
+			break;
+
+		case UD:
+			set_tavor_ud_seg(wqe, wr);
+			wqe  += sizeof (struct mthca_tavor_ud_seg);
+			size += sizeof (struct mthca_tavor_ud_seg) / 16;
+			break;
+
+		case MLX:
+			err = build_mlx_header(dev, to_msqp(qp), ind, wr,
+					       wqe - sizeof (struct mthca_next_seg),
+					       wqe);
+			if (err) {
+				*bad_wr = wr;
+				goto out;
+			}
+			wqe += sizeof (struct mthca_data_seg);
+			size += sizeof (struct mthca_data_seg) / 16;
+			break;
+		}
+
+		if (wr->num_sge > qp->sq.max_gs) {
+			mthca_err(dev, "too many gathers\n");
+			err = -EINVAL;
+			*bad_wr = wr;
+			goto out;
+		}
+
+		for (i = 0; i < wr->num_sge; ++i) {
+			mthca_set_data_seg(wqe, wr->sg_list + i);
+			wqe  += sizeof (struct mthca_data_seg);
+			size += sizeof (struct mthca_data_seg) / 16;
+		}
+
+		/* Add one more inline data segment for ICRC */
+		if (qp->transport == MLX) {
+			((struct mthca_data_seg *) wqe)->byte_count =
+				cpu_to_be32((1 << 31) | 4);
+			((u32 *) wqe)[1] = 0;
+			wqe += sizeof (struct mthca_data_seg);
+			size += sizeof (struct mthca_data_seg) / 16;
+		}
+
+		qp->wrid[ind + qp->rq.max] = wr->wr_id;
+
+		if (wr->opcode >= ARRAY_SIZE(mthca_opcode)) {
+			mthca_err(dev, "opcode invalid\n");
+			err = -EINVAL;
+			*bad_wr = wr;
+			goto out;
+		}
+
+		((struct mthca_next_seg *) prev_wqe)->nda_op =
+			cpu_to_be32(((ind << qp->sq.wqe_shift) +
+				     qp->send_wqe_offset) |
+				    mthca_opcode[wr->opcode]);
+		wmb();
+		((struct mthca_next_seg *) prev_wqe)->ee_nds =
+			cpu_to_be32((nreq ? 0 : MTHCA_NEXT_DBD) | size |
+				    ((wr->send_flags & IB_SEND_FENCE) ?
+				    MTHCA_NEXT_FENCE : 0));
+
+		if (!nreq) {
+			size0 = size;
+			op0   = mthca_opcode[wr->opcode];
+			f0    = wr->send_flags & IB_SEND_FENCE ?
+				MTHCA_SEND_DOORBELL_FENCE : 0;
+		}
+
+		++ind;
+		if (unlikely(ind >= qp->sq.max))
+			ind -= qp->sq.max;
+	}
+
+out:
+	if (likely(nreq)) {
+		wmb();
+
+		mthca_write64(((qp->sq.next_ind << qp->sq.wqe_shift) +
+			       qp->send_wqe_offset) | f0 | op0,
+			      (qp->qpn << 8) | size0,
+			      dev->kar + MTHCA_SEND_DOORBELL,
+			      MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+		/*
+		 * Make sure doorbells don't leak out of SQ spinlock
+		 * and reach the HCA out of order:
+		 */
+		mmiowb();
+	}
+
+	qp->sq.next_ind = ind;
+	qp->sq.head    += nreq;
+
+	spin_unlock_irqrestore(&qp->sq.lock, flags);
+	return err;
+}
+
+int mthca_tavor_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+			     struct ib_recv_wr **bad_wr)
+{
+	struct mthca_dev *dev = to_mdev(ibqp->device);
+	struct mthca_qp *qp = to_mqp(ibqp);
+	unsigned long flags;
+	int err = 0;
+	int nreq;
+	int i;
+	int size;
+	/*
+	 * size0 is only used if nreq != 0, and it will always be
+	 * initialized the first time through the main loop before
+	 * nreq is incremented.  So nreq cannot become non-zero
+	 * without initializing size0, and it is in fact never used
+	 * uninitialized.
+	 */
+	int uninitialized_var(size0);
+	int ind;
+	void *wqe;
+	void *prev_wqe;
+
+	spin_lock_irqsave(&qp->rq.lock, flags);
+
+	/* XXX check that state is OK to post receive */
+
+	ind = qp->rq.next_ind;
+
+	for (nreq = 0; wr; wr = wr->next) {
+		if (mthca_wq_overflow(&qp->rq, nreq, qp->ibqp.recv_cq)) {
+			mthca_err(dev, "RQ %06x full (%u head, %u tail,"
+					" %d max, %d nreq)\n", qp->qpn,
+					qp->rq.head, qp->rq.tail,
+					qp->rq.max, nreq);
+			err = -ENOMEM;
+			*bad_wr = wr;
+			goto out;
+		}
+
+		wqe = get_recv_wqe(qp, ind);
+		prev_wqe = qp->rq.last;
+		qp->rq.last = wqe;
+
+		((struct mthca_next_seg *) wqe)->ee_nds =
+			cpu_to_be32(MTHCA_NEXT_DBD);
+		((struct mthca_next_seg *) wqe)->flags = 0;
+
+		wqe += sizeof (struct mthca_next_seg);
+		size = sizeof (struct mthca_next_seg) / 16;
+
+		if (unlikely(wr->num_sge > qp->rq.max_gs)) {
+			err = -EINVAL;
+			*bad_wr = wr;
+			goto out;
+		}
+
+		for (i = 0; i < wr->num_sge; ++i) {
+			mthca_set_data_seg(wqe, wr->sg_list + i);
+			wqe  += sizeof (struct mthca_data_seg);
+			size += sizeof (struct mthca_data_seg) / 16;
+		}
+
+		qp->wrid[ind] = wr->wr_id;
+
+		((struct mthca_next_seg *) prev_wqe)->ee_nds =
+			cpu_to_be32(MTHCA_NEXT_DBD | size);
+
+		if (!nreq)
+			size0 = size;
+
+		++ind;
+		if (unlikely(ind >= qp->rq.max))
+			ind -= qp->rq.max;
+
+		++nreq;
+		if (unlikely(nreq == MTHCA_TAVOR_MAX_WQES_PER_RECV_DB)) {
+			nreq = 0;
+
+			wmb();
+
+			mthca_write64((qp->rq.next_ind << qp->rq.wqe_shift) | size0,
+				      qp->qpn << 8, dev->kar + MTHCA_RECEIVE_DOORBELL,
+				      MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+
+			qp->rq.next_ind = ind;
+			qp->rq.head += MTHCA_TAVOR_MAX_WQES_PER_RECV_DB;
+		}
+	}
+
+out:
+	if (likely(nreq)) {
+		wmb();
+
+		mthca_write64((qp->rq.next_ind << qp->rq.wqe_shift) | size0,
+			      qp->qpn << 8 | nreq, dev->kar + MTHCA_RECEIVE_DOORBELL,
+			      MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+	}
+
+	qp->rq.next_ind = ind;
+	qp->rq.head    += nreq;
+
+	/*
+	 * Make sure doorbells don't leak out of RQ spinlock and reach
+	 * the HCA out of order:
+	 */
+	mmiowb();
+
+	spin_unlock_irqrestore(&qp->rq.lock, flags);
+	return err;
+}
+
+int mthca_arbel_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+			  struct ib_send_wr **bad_wr)
+{
+	struct mthca_dev *dev = to_mdev(ibqp->device);
+	struct mthca_qp *qp = to_mqp(ibqp);
+	u32 dbhi;
+	void *wqe;
+	void *prev_wqe;
+	unsigned long flags;
+	int err = 0;
+	int nreq;
+	int i;
+	int size;
+	/*
+	 * f0 and size0 are only used if nreq != 0, and they will
+	 * always be initialized the first time through the main loop
+	 * before nreq is incremented.  So nreq cannot become non-zero
+	 * without initializing f0 and size0, and they are in fact
+	 * never used uninitialized.
+	 */
+	int uninitialized_var(size0);
+	u32 uninitialized_var(f0);
+	int ind;
+	u8 op0 = 0;
+
+	spin_lock_irqsave(&qp->sq.lock, flags);
+
+	/* XXX check that state is OK to post send */
+
+	ind = qp->sq.head & (qp->sq.max - 1);
+
+	for (nreq = 0; wr; ++nreq, wr = wr->next) {
+		if (unlikely(nreq == MTHCA_ARBEL_MAX_WQES_PER_SEND_DB)) {
+			nreq = 0;
+
+			dbhi = (MTHCA_ARBEL_MAX_WQES_PER_SEND_DB << 24) |
+				((qp->sq.head & 0xffff) << 8) | f0 | op0;
+
+			qp->sq.head += MTHCA_ARBEL_MAX_WQES_PER_SEND_DB;
+
+			/*
+			 * Make sure that descriptors are written before
+			 * doorbell record.
+			 */
+			wmb();
+			*qp->sq.db = cpu_to_be32(qp->sq.head & 0xffff);
+
+			/*
+			 * Make sure doorbell record is written before we
+			 * write MMIO send doorbell.
+			 */
+			wmb();
+
+			mthca_write64(dbhi, (qp->qpn << 8) | size0,
+				      dev->kar + MTHCA_SEND_DOORBELL,
+				      MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+		}
+
+		if (mthca_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) {
+			mthca_err(dev, "SQ %06x full (%u head, %u tail,"
+					" %d max, %d nreq)\n", qp->qpn,
+					qp->sq.head, qp->sq.tail,
+					qp->sq.max, nreq);
+			err = -ENOMEM;
+			*bad_wr = wr;
+			goto out;
+		}
+
+		wqe = get_send_wqe(qp, ind);
+		prev_wqe = qp->sq.last;
+		qp->sq.last = wqe;
+
+		((struct mthca_next_seg *) wqe)->flags =
+			((wr->send_flags & IB_SEND_SIGNALED) ?
+			 cpu_to_be32(MTHCA_NEXT_CQ_UPDATE) : 0) |
+			((wr->send_flags & IB_SEND_SOLICITED) ?
+			 cpu_to_be32(MTHCA_NEXT_SOLICIT) : 0)   |
+			((wr->send_flags & IB_SEND_IP_CSUM) ?
+			 cpu_to_be32(MTHCA_NEXT_IP_CSUM | MTHCA_NEXT_TCP_UDP_CSUM) : 0) |
+			cpu_to_be32(1);
+		if (wr->opcode == IB_WR_SEND_WITH_IMM ||
+		    wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM)
+			((struct mthca_next_seg *) wqe)->imm = wr->ex.imm_data;
+
+		wqe += sizeof (struct mthca_next_seg);
+		size = sizeof (struct mthca_next_seg) / 16;
+
+		switch (qp->transport) {
+		case RC:
+			switch (wr->opcode) {
+			case IB_WR_ATOMIC_CMP_AND_SWP:
+			case IB_WR_ATOMIC_FETCH_AND_ADD:
+				set_raddr_seg(wqe, wr->wr.atomic.remote_addr,
+					      wr->wr.atomic.rkey);
+				wqe += sizeof (struct mthca_raddr_seg);
+
+				set_atomic_seg(wqe, wr);
+				wqe  += sizeof (struct mthca_atomic_seg);
+				size += (sizeof (struct mthca_raddr_seg) +
+					 sizeof (struct mthca_atomic_seg)) / 16;
+				break;
+
+			case IB_WR_RDMA_READ:
+			case IB_WR_RDMA_WRITE:
+			case IB_WR_RDMA_WRITE_WITH_IMM:
+				set_raddr_seg(wqe, wr->wr.rdma.remote_addr,
+					      wr->wr.rdma.rkey);
+				wqe  += sizeof (struct mthca_raddr_seg);
+				size += sizeof (struct mthca_raddr_seg) / 16;
+				break;
+
+			default:
+				/* No extra segments required for sends */
+				break;
+			}
+
+			break;
+
+		case UC:
+			switch (wr->opcode) {
+			case IB_WR_RDMA_WRITE:
+			case IB_WR_RDMA_WRITE_WITH_IMM:
+				set_raddr_seg(wqe, wr->wr.rdma.remote_addr,
+					      wr->wr.rdma.rkey);
+				wqe  += sizeof (struct mthca_raddr_seg);
+				size += sizeof (struct mthca_raddr_seg) / 16;
+				break;
+
+			default:
+				/* No extra segments required for sends */
+				break;
+			}
+
+			break;
+
+		case UD:
+			set_arbel_ud_seg(wqe, wr);
+			wqe  += sizeof (struct mthca_arbel_ud_seg);
+			size += sizeof (struct mthca_arbel_ud_seg) / 16;
+			break;
+
+		case MLX:
+			err = build_mlx_header(dev, to_msqp(qp), ind, wr,
+					       wqe - sizeof (struct mthca_next_seg),
+					       wqe);
+			if (err) {
+				*bad_wr = wr;
+				goto out;
+			}
+			wqe += sizeof (struct mthca_data_seg);
+			size += sizeof (struct mthca_data_seg) / 16;
+			break;
+		}
+
+		if (wr->num_sge > qp->sq.max_gs) {
+			mthca_err(dev, "too many gathers\n");
+			err = -EINVAL;
+			*bad_wr = wr;
+			goto out;
+		}
+
+		for (i = 0; i < wr->num_sge; ++i) {
+			mthca_set_data_seg(wqe, wr->sg_list + i);
+			wqe  += sizeof (struct mthca_data_seg);
+			size += sizeof (struct mthca_data_seg) / 16;
+		}
+
+		/* Add one more inline data segment for ICRC */
+		if (qp->transport == MLX) {
+			((struct mthca_data_seg *) wqe)->byte_count =
+				cpu_to_be32((1 << 31) | 4);
+			((u32 *) wqe)[1] = 0;
+			wqe += sizeof (struct mthca_data_seg);
+			size += sizeof (struct mthca_data_seg) / 16;
+		}
+
+		qp->wrid[ind + qp->rq.max] = wr->wr_id;
+
+		if (wr->opcode >= ARRAY_SIZE(mthca_opcode)) {
+			mthca_err(dev, "opcode invalid\n");
+			err = -EINVAL;
+			*bad_wr = wr;
+			goto out;
+		}
+
+		((struct mthca_next_seg *) prev_wqe)->nda_op =
+			cpu_to_be32(((ind << qp->sq.wqe_shift) +
+				     qp->send_wqe_offset) |
+				    mthca_opcode[wr->opcode]);
+		wmb();
+		((struct mthca_next_seg *) prev_wqe)->ee_nds =
+			cpu_to_be32(MTHCA_NEXT_DBD | size |
+				    ((wr->send_flags & IB_SEND_FENCE) ?
+				     MTHCA_NEXT_FENCE : 0));
+
+		if (!nreq) {
+			size0 = size;
+			op0   = mthca_opcode[wr->opcode];
+			f0    = wr->send_flags & IB_SEND_FENCE ?
+				MTHCA_SEND_DOORBELL_FENCE : 0;
+		}
+
+		++ind;
+		if (unlikely(ind >= qp->sq.max))
+			ind -= qp->sq.max;
+	}
+
+out:
+	if (likely(nreq)) {
+		dbhi = (nreq << 24) | ((qp->sq.head & 0xffff) << 8) | f0 | op0;
+
+		qp->sq.head += nreq;
+
+		/*
+		 * Make sure that descriptors are written before
+		 * doorbell record.
+		 */
+		wmb();
+		*qp->sq.db = cpu_to_be32(qp->sq.head & 0xffff);
+
+		/*
+		 * Make sure doorbell record is written before we
+		 * write MMIO send doorbell.
+		 */
+		wmb();
+
+		mthca_write64(dbhi, (qp->qpn << 8) | size0, dev->kar + MTHCA_SEND_DOORBELL,
+			      MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+	}
+
+	/*
+	 * Make sure doorbells don't leak out of SQ spinlock and reach
+	 * the HCA out of order:
+	 */
+	mmiowb();
+
+	spin_unlock_irqrestore(&qp->sq.lock, flags);
+	return err;
+}
+
+int mthca_arbel_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+			     struct ib_recv_wr **bad_wr)
+{
+	struct mthca_dev *dev = to_mdev(ibqp->device);
+	struct mthca_qp *qp = to_mqp(ibqp);
+	unsigned long flags;
+	int err = 0;
+	int nreq;
+	int ind;
+	int i;
+	void *wqe;
+
+	spin_lock_irqsave(&qp->rq.lock, flags);
+
+	/* XXX check that state is OK to post receive */
+
+	ind = qp->rq.head & (qp->rq.max - 1);
+
+	for (nreq = 0; wr; ++nreq, wr = wr->next) {
+		if (mthca_wq_overflow(&qp->rq, nreq, qp->ibqp.recv_cq)) {
+			mthca_err(dev, "RQ %06x full (%u head, %u tail,"
+					" %d max, %d nreq)\n", qp->qpn,
+					qp->rq.head, qp->rq.tail,
+					qp->rq.max, nreq);
+			err = -ENOMEM;
+			*bad_wr = wr;
+			goto out;
+		}
+
+		wqe = get_recv_wqe(qp, ind);
+
+		((struct mthca_next_seg *) wqe)->flags = 0;
+
+		wqe += sizeof (struct mthca_next_seg);
+
+		if (unlikely(wr->num_sge > qp->rq.max_gs)) {
+			err = -EINVAL;
+			*bad_wr = wr;
+			goto out;
+		}
+
+		for (i = 0; i < wr->num_sge; ++i) {
+			mthca_set_data_seg(wqe, wr->sg_list + i);
+			wqe += sizeof (struct mthca_data_seg);
+		}
+
+		if (i < qp->rq.max_gs)
+			mthca_set_data_seg_inval(wqe);
+
+		qp->wrid[ind] = wr->wr_id;
+
+		++ind;
+		if (unlikely(ind >= qp->rq.max))
+			ind -= qp->rq.max;
+	}
+out:
+	if (likely(nreq)) {
+		qp->rq.head += nreq;
+
+		/*
+		 * Make sure that descriptors are written before
+		 * doorbell record.
+		 */
+		wmb();
+		*qp->rq.db = cpu_to_be32(qp->rq.head & 0xffff);
+	}
+
+	spin_unlock_irqrestore(&qp->rq.lock, flags);
+	return err;
+}
+
+void mthca_free_err_wqe(struct mthca_dev *dev, struct mthca_qp *qp, int is_send,
+			int index, int *dbd, __be32 *new_wqe)
+{
+	struct mthca_next_seg *next;
+
+	/*
+	 * For SRQs, all receive WQEs generate a CQE, so we're always
+	 * at the end of the doorbell chain.
+	 */
+	if (qp->ibqp.srq && !is_send) {
+		*new_wqe = 0;
+		return;
+	}
+
+	if (is_send)
+		next = get_send_wqe(qp, index);
+	else
+		next = get_recv_wqe(qp, index);
+
+	*dbd = !!(next->ee_nds & cpu_to_be32(MTHCA_NEXT_DBD));
+	if (next->ee_nds & cpu_to_be32(0x3f))
+		*new_wqe = (next->nda_op & cpu_to_be32(~0x3f)) |
+			(next->ee_nds & cpu_to_be32(0x3f));
+	else
+		*new_wqe = 0;
+}
+
+int mthca_init_qp_table(struct mthca_dev *dev)
+{
+	int err;
+	int i;
+
+	spin_lock_init(&dev->qp_table.lock);
+
+	/*
+	 * We reserve 2 extra QPs per port for the special QPs.  The
+	 * special QP for port 1 has to be even, so round up.
+	 */
+	dev->qp_table.sqp_start = (dev->limits.reserved_qps + 1) & ~1UL;
+	err = mthca_alloc_init(&dev->qp_table.alloc,
+			       dev->limits.num_qps,
+			       (1 << 24) - 1,
+			       dev->qp_table.sqp_start +
+			       MTHCA_MAX_PORTS * 2);
+	if (err)
+		return err;
+
+	err = mthca_array_init(&dev->qp_table.qp,
+			       dev->limits.num_qps);
+	if (err) {
+		mthca_alloc_cleanup(&dev->qp_table.alloc);
+		return err;
+	}
+
+	for (i = 0; i < 2; ++i) {
+		err = mthca_CONF_SPECIAL_QP(dev, i ? IB_QPT_GSI : IB_QPT_SMI,
+				    dev->qp_table.sqp_start + i * 2);
+		if (err) {
+			mthca_warn(dev, "CONF_SPECIAL_QP returned "
+				   "%d, aborting.\n", err);
+			goto err_out;
+		}
+	}
+	return 0;
+
+ err_out:
+	for (i = 0; i < 2; ++i)
+		mthca_CONF_SPECIAL_QP(dev, i, 0);
+
+	mthca_array_cleanup(&dev->qp_table.qp, dev->limits.num_qps);
+	mthca_alloc_cleanup(&dev->qp_table.alloc);
+
+	return err;
+}
+
+void mthca_cleanup_qp_table(struct mthca_dev *dev)
+{
+	int i;
+
+	for (i = 0; i < 2; ++i)
+		mthca_CONF_SPECIAL_QP(dev, i, 0);
+
+	mthca_array_cleanup(&dev->qp_table.qp, dev->limits.num_qps);
+	mthca_alloc_cleanup(&dev->qp_table.alloc);
+}
diff --git a/drivers/infiniband/hw/mthca/mthca_reset.c b/drivers/infiniband/hw/mthca/mthca_reset.c
new file mode 100644
index 0000000..74c6a94
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_reset.c
@@ -0,0 +1,288 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/pci.h>
+#include <linux/delay.h>
+#include <linux/slab.h>
+
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+
+int mthca_reset(struct mthca_dev *mdev)
+{
+	int i;
+	int err = 0;
+	u32 *hca_header    = NULL;
+	u32 *bridge_header = NULL;
+	struct pci_dev *bridge = NULL;
+	int bridge_pcix_cap = 0;
+	int hca_pcie_cap = 0;
+	int hca_pcix_cap = 0;
+
+	u16 devctl;
+	u16 linkctl;
+
+#define MTHCA_RESET_OFFSET 0xf0010
+#define MTHCA_RESET_VALUE  swab32(1)
+
+	/*
+	 * Reset the chip.  This is somewhat ugly because we have to
+	 * save off the PCI header before reset and then restore it
+	 * after the chip reboots.  We skip config space offsets 22
+	 * and 23 since those have a special meaning.
+	 *
+	 * To make matters worse, for Tavor (PCI-X HCA) we have to
+	 * find the associated bridge device and save off its PCI
+	 * header as well.
+	 */
+
+	if (!(mdev->mthca_flags & MTHCA_FLAG_PCIE)) {
+		/* Look for the bridge -- its device ID will be 2 more
+		   than HCA's device ID. */
+		while ((bridge = pci_get_device(mdev->pdev->vendor,
+						mdev->pdev->device + 2,
+						bridge)) != NULL) {
+			if (bridge->hdr_type    == PCI_HEADER_TYPE_BRIDGE &&
+			    bridge->subordinate == mdev->pdev->bus) {
+				mthca_dbg(mdev, "Found bridge: %s\n",
+					  pci_name(bridge));
+				break;
+			}
+		}
+
+		if (!bridge) {
+			/*
+			 * Didn't find a bridge for a Tavor device --
+			 * assume we're in no-bridge mode and hope for
+			 * the best.
+			 */
+			mthca_warn(mdev, "No bridge found for %s\n",
+				  pci_name(mdev->pdev));
+		}
+
+	}
+
+	/* For Arbel do we need to save off the full 4K PCI Express header?? */
+	hca_header = kmalloc(256, GFP_KERNEL);
+	if (!hca_header) {
+		err = -ENOMEM;
+		mthca_err(mdev, "Couldn't allocate memory to save HCA "
+			  "PCI header, aborting.\n");
+		goto out;
+	}
+
+	for (i = 0; i < 64; ++i) {
+		if (i == 22 || i == 23)
+			continue;
+		if (pci_read_config_dword(mdev->pdev, i * 4, hca_header + i)) {
+			err = -ENODEV;
+			mthca_err(mdev, "Couldn't save HCA "
+				  "PCI header, aborting.\n");
+			goto out;
+		}
+	}
+
+	hca_pcix_cap = pci_find_capability(mdev->pdev, PCI_CAP_ID_PCIX);
+	hca_pcie_cap = pci_pcie_cap(mdev->pdev);
+
+	if (bridge) {
+		bridge_header = kmalloc(256, GFP_KERNEL);
+		if (!bridge_header) {
+			err = -ENOMEM;
+			mthca_err(mdev, "Couldn't allocate memory to save HCA "
+				  "bridge PCI header, aborting.\n");
+			goto out;
+		}
+
+		for (i = 0; i < 64; ++i) {
+			if (i == 22 || i == 23)
+				continue;
+			if (pci_read_config_dword(bridge, i * 4, bridge_header + i)) {
+				err = -ENODEV;
+				mthca_err(mdev, "Couldn't save HCA bridge "
+					  "PCI header, aborting.\n");
+				goto out;
+			}
+		}
+		bridge_pcix_cap = pci_find_capability(bridge, PCI_CAP_ID_PCIX);
+		if (!bridge_pcix_cap) {
+				err = -ENODEV;
+				mthca_err(mdev, "Couldn't locate HCA bridge "
+					  "PCI-X capability, aborting.\n");
+				goto out;
+		}
+	}
+
+	/* actually hit reset */
+	{
+		void __iomem *reset = ioremap(pci_resource_start(mdev->pdev, 0) +
+					      MTHCA_RESET_OFFSET, 4);
+
+		if (!reset) {
+			err = -ENOMEM;
+			mthca_err(mdev, "Couldn't map HCA reset register, "
+				  "aborting.\n");
+			goto out;
+		}
+
+		writel(MTHCA_RESET_VALUE, reset);
+		iounmap(reset);
+	}
+
+	/* Docs say to wait one second before accessing device */
+	msleep(1000);
+
+	/* Now wait for PCI device to start responding again */
+	{
+		u32 v;
+		int c = 0;
+
+		for (c = 0; c < 100; ++c) {
+			if (pci_read_config_dword(bridge ? bridge : mdev->pdev, 0, &v)) {
+				err = -ENODEV;
+				mthca_err(mdev, "Couldn't access HCA after reset, "
+					  "aborting.\n");
+				goto out;
+			}
+
+			if (v != 0xffffffff)
+				goto good;
+
+			msleep(100);
+		}
+
+		err = -ENODEV;
+		mthca_err(mdev, "PCI device did not come back after reset, "
+			  "aborting.\n");
+		goto out;
+	}
+
+good:
+	/* Now restore the PCI headers */
+	if (bridge) {
+		if (pci_write_config_dword(bridge, bridge_pcix_cap + 0x8,
+				 bridge_header[(bridge_pcix_cap + 0x8) / 4])) {
+			err = -ENODEV;
+			mthca_err(mdev, "Couldn't restore HCA bridge Upstream "
+				  "split transaction control, aborting.\n");
+			goto out;
+		}
+		if (pci_write_config_dword(bridge, bridge_pcix_cap + 0xc,
+				 bridge_header[(bridge_pcix_cap + 0xc) / 4])) {
+			err = -ENODEV;
+			mthca_err(mdev, "Couldn't restore HCA bridge Downstream "
+				  "split transaction control, aborting.\n");
+			goto out;
+		}
+		/*
+		 * Bridge control register is at 0x3e, so we'll
+		 * naturally restore it last in this loop.
+		 */
+		for (i = 0; i < 16; ++i) {
+			if (i * 4 == PCI_COMMAND)
+				continue;
+
+			if (pci_write_config_dword(bridge, i * 4, bridge_header[i])) {
+				err = -ENODEV;
+				mthca_err(mdev, "Couldn't restore HCA bridge reg %x, "
+					  "aborting.\n", i);
+				goto out;
+			}
+		}
+
+		if (pci_write_config_dword(bridge, PCI_COMMAND,
+					   bridge_header[PCI_COMMAND / 4])) {
+			err = -ENODEV;
+			mthca_err(mdev, "Couldn't restore HCA bridge COMMAND, "
+				  "aborting.\n");
+			goto out;
+		}
+	}
+
+	if (hca_pcix_cap) {
+		if (pci_write_config_dword(mdev->pdev, hca_pcix_cap,
+				 hca_header[hca_pcix_cap / 4])) {
+			err = -ENODEV;
+			mthca_err(mdev, "Couldn't restore HCA PCI-X "
+				  "command register, aborting.\n");
+			goto out;
+		}
+	}
+
+	if (hca_pcie_cap) {
+		devctl = hca_header[(hca_pcie_cap + PCI_EXP_DEVCTL) / 4];
+		if (pcie_capability_write_word(mdev->pdev, PCI_EXP_DEVCTL,
+					       devctl)) {
+			err = -ENODEV;
+			mthca_err(mdev, "Couldn't restore HCA PCI Express "
+				  "Device Control register, aborting.\n");
+			goto out;
+		}
+		linkctl = hca_header[(hca_pcie_cap + PCI_EXP_LNKCTL) / 4];
+		if (pcie_capability_write_word(mdev->pdev, PCI_EXP_LNKCTL,
+					       linkctl)) {
+			err = -ENODEV;
+			mthca_err(mdev, "Couldn't restore HCA PCI Express "
+				  "Link control register, aborting.\n");
+			goto out;
+		}
+	}
+
+	for (i = 0; i < 16; ++i) {
+		if (i * 4 == PCI_COMMAND)
+			continue;
+
+		if (pci_write_config_dword(mdev->pdev, i * 4, hca_header[i])) {
+			err = -ENODEV;
+			mthca_err(mdev, "Couldn't restore HCA reg %x, "
+				  "aborting.\n", i);
+			goto out;
+		}
+	}
+
+	if (pci_write_config_dword(mdev->pdev, PCI_COMMAND,
+				   hca_header[PCI_COMMAND / 4])) {
+		err = -ENODEV;
+		mthca_err(mdev, "Couldn't restore HCA COMMAND, "
+			  "aborting.\n");
+		goto out;
+	}
+
+out:
+	if (bridge)
+		pci_dev_put(bridge);
+	kfree(bridge_header);
+	kfree(hca_header);
+
+	return err;
+}
diff --git a/drivers/infiniband/hw/mthca/mthca_srq.c b/drivers/infiniband/hw/mthca/mthca_srq.c
new file mode 100644
index 0000000..d22f970
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_srq.c
@@ -0,0 +1,696 @@
+/*
+ * Copyright (c) 2005 Cisco Systems. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/sched.h>
+
+#include <asm/io.h>
+
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+#include "mthca_memfree.h"
+#include "mthca_wqe.h"
+
+enum {
+	MTHCA_MAX_DIRECT_SRQ_SIZE = 4 * PAGE_SIZE
+};
+
+struct mthca_tavor_srq_context {
+	__be64 wqe_base_ds;	/* low 6 bits is descriptor size */
+	__be32 state_pd;
+	__be32 lkey;
+	__be32 uar;
+	__be16 limit_watermark;
+	__be16 wqe_cnt;
+	u32    reserved[2];
+};
+
+struct mthca_arbel_srq_context {
+	__be32 state_logsize_srqn;
+	__be32 lkey;
+	__be32 db_index;
+	__be32 logstride_usrpage;
+	__be64 wqe_base;
+	__be32 eq_pd;
+	__be16 limit_watermark;
+	__be16 wqe_cnt;
+	u16    reserved1;
+	__be16 wqe_counter;
+	u32    reserved2[3];
+};
+
+static void *get_wqe(struct mthca_srq *srq, int n)
+{
+	if (srq->is_direct)
+		return srq->queue.direct.buf + (n << srq->wqe_shift);
+	else
+		return srq->queue.page_list[(n << srq->wqe_shift) >> PAGE_SHIFT].buf +
+			((n << srq->wqe_shift) & (PAGE_SIZE - 1));
+}
+
+/*
+ * Return a pointer to the location within a WQE that we're using as a
+ * link when the WQE is in the free list.  We use the imm field
+ * because in the Tavor case, posting a WQE may overwrite the next
+ * segment of the previous WQE, but a receive WQE will never touch the
+ * imm field.  This avoids corrupting our free list if the previous
+ * WQE has already completed and been put on the free list when we
+ * post the next WQE.
+ */
+static inline int *wqe_to_link(void *wqe)
+{
+	return (int *) (wqe + offsetof(struct mthca_next_seg, imm));
+}
+
+static void mthca_tavor_init_srq_context(struct mthca_dev *dev,
+					 struct mthca_pd *pd,
+					 struct mthca_srq *srq,
+					 struct mthca_tavor_srq_context *context)
+{
+	memset(context, 0, sizeof *context);
+
+	context->wqe_base_ds = cpu_to_be64(1 << (srq->wqe_shift - 4));
+	context->state_pd    = cpu_to_be32(pd->pd_num);
+	context->lkey        = cpu_to_be32(srq->mr.ibmr.lkey);
+
+	if (pd->ibpd.uobject)
+		context->uar =
+			cpu_to_be32(to_mucontext(pd->ibpd.uobject->context)->uar.index);
+	else
+		context->uar = cpu_to_be32(dev->driver_uar.index);
+}
+
+static void mthca_arbel_init_srq_context(struct mthca_dev *dev,
+					 struct mthca_pd *pd,
+					 struct mthca_srq *srq,
+					 struct mthca_arbel_srq_context *context)
+{
+	int logsize, max;
+
+	memset(context, 0, sizeof *context);
+
+	/*
+	 * Put max in a temporary variable to work around gcc bug
+	 * triggered by ilog2() on sparc64.
+	 */
+	max = srq->max;
+	logsize = ilog2(max);
+	context->state_logsize_srqn = cpu_to_be32(logsize << 24 | srq->srqn);
+	context->lkey = cpu_to_be32(srq->mr.ibmr.lkey);
+	context->db_index = cpu_to_be32(srq->db_index);
+	context->logstride_usrpage = cpu_to_be32((srq->wqe_shift - 4) << 29);
+	if (pd->ibpd.uobject)
+		context->logstride_usrpage |=
+			cpu_to_be32(to_mucontext(pd->ibpd.uobject->context)->uar.index);
+	else
+		context->logstride_usrpage |= cpu_to_be32(dev->driver_uar.index);
+	context->eq_pd = cpu_to_be32(MTHCA_EQ_ASYNC << 24 | pd->pd_num);
+}
+
+static void mthca_free_srq_buf(struct mthca_dev *dev, struct mthca_srq *srq)
+{
+	mthca_buf_free(dev, srq->max << srq->wqe_shift, &srq->queue,
+		       srq->is_direct, &srq->mr);
+	kfree(srq->wrid);
+}
+
+static int mthca_alloc_srq_buf(struct mthca_dev *dev, struct mthca_pd *pd,
+			       struct mthca_srq *srq)
+{
+	struct mthca_data_seg *scatter;
+	void *wqe;
+	int err;
+	int i;
+
+	if (pd->ibpd.uobject)
+		return 0;
+
+	srq->wrid = kmalloc(srq->max * sizeof (u64), GFP_KERNEL);
+	if (!srq->wrid)
+		return -ENOMEM;
+
+	err = mthca_buf_alloc(dev, srq->max << srq->wqe_shift,
+			      MTHCA_MAX_DIRECT_SRQ_SIZE,
+			      &srq->queue, &srq->is_direct, pd, 1, &srq->mr);
+	if (err) {
+		kfree(srq->wrid);
+		return err;
+	}
+
+	/*
+	 * Now initialize the SRQ buffer so that all of the WQEs are
+	 * linked into the list of free WQEs.  In addition, set the
+	 * scatter list L_Keys to the sentry value of 0x100.
+	 */
+	for (i = 0; i < srq->max; ++i) {
+		struct mthca_next_seg *next;
+
+		next = wqe = get_wqe(srq, i);
+
+		if (i < srq->max - 1) {
+			*wqe_to_link(wqe) = i + 1;
+			next->nda_op = htonl(((i + 1) << srq->wqe_shift) | 1);
+		} else {
+			*wqe_to_link(wqe) = -1;
+			next->nda_op = 0;
+		}
+
+		for (scatter = wqe + sizeof (struct mthca_next_seg);
+		     (void *) scatter < wqe + (1 << srq->wqe_shift);
+		     ++scatter)
+			scatter->lkey = cpu_to_be32(MTHCA_INVAL_LKEY);
+	}
+
+	srq->last = get_wqe(srq, srq->max - 1);
+
+	return 0;
+}
+
+int mthca_alloc_srq(struct mthca_dev *dev, struct mthca_pd *pd,
+		    struct ib_srq_attr *attr, struct mthca_srq *srq)
+{
+	struct mthca_mailbox *mailbox;
+	int ds;
+	int err;
+
+	/* Sanity check SRQ size before proceeding */
+	if (attr->max_wr  > dev->limits.max_srq_wqes ||
+	    attr->max_sge > dev->limits.max_srq_sge)
+		return -EINVAL;
+
+	srq->max      = attr->max_wr;
+	srq->max_gs   = attr->max_sge;
+	srq->counter  = 0;
+
+	if (mthca_is_memfree(dev))
+		srq->max = roundup_pow_of_two(srq->max + 1);
+	else
+		srq->max = srq->max + 1;
+
+	ds = max(64UL,
+		 roundup_pow_of_two(sizeof (struct mthca_next_seg) +
+				    srq->max_gs * sizeof (struct mthca_data_seg)));
+
+	if (!mthca_is_memfree(dev) && (ds > dev->limits.max_desc_sz))
+		return -EINVAL;
+
+	srq->wqe_shift = ilog2(ds);
+
+	srq->srqn = mthca_alloc(&dev->srq_table.alloc);
+	if (srq->srqn == -1)
+		return -ENOMEM;
+
+	if (mthca_is_memfree(dev)) {
+		err = mthca_table_get(dev, dev->srq_table.table, srq->srqn);
+		if (err)
+			goto err_out;
+
+		if (!pd->ibpd.uobject) {
+			srq->db_index = mthca_alloc_db(dev, MTHCA_DB_TYPE_SRQ,
+						       srq->srqn, &srq->db);
+			if (srq->db_index < 0) {
+				err = -ENOMEM;
+				goto err_out_icm;
+			}
+		}
+	}
+
+	mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+	if (IS_ERR(mailbox)) {
+		err = PTR_ERR(mailbox);
+		goto err_out_db;
+	}
+
+	err = mthca_alloc_srq_buf(dev, pd, srq);
+	if (err)
+		goto err_out_mailbox;
+
+	spin_lock_init(&srq->lock);
+	srq->refcount = 1;
+	init_waitqueue_head(&srq->wait);
+	mutex_init(&srq->mutex);
+
+	if (mthca_is_memfree(dev))
+		mthca_arbel_init_srq_context(dev, pd, srq, mailbox->buf);
+	else
+		mthca_tavor_init_srq_context(dev, pd, srq, mailbox->buf);
+
+	err = mthca_SW2HW_SRQ(dev, mailbox, srq->srqn);
+
+	if (err) {
+		mthca_warn(dev, "SW2HW_SRQ failed (%d)\n", err);
+		goto err_out_free_buf;
+	}
+
+	spin_lock_irq(&dev->srq_table.lock);
+	if (mthca_array_set(&dev->srq_table.srq,
+			    srq->srqn & (dev->limits.num_srqs - 1),
+			    srq)) {
+		spin_unlock_irq(&dev->srq_table.lock);
+		goto err_out_free_srq;
+	}
+	spin_unlock_irq(&dev->srq_table.lock);
+
+	mthca_free_mailbox(dev, mailbox);
+
+	srq->first_free = 0;
+	srq->last_free  = srq->max - 1;
+
+	attr->max_wr    = srq->max - 1;
+	attr->max_sge   = srq->max_gs;
+
+	return 0;
+
+err_out_free_srq:
+	err = mthca_HW2SW_SRQ(dev, mailbox, srq->srqn);
+	if (err)
+		mthca_warn(dev, "HW2SW_SRQ failed (%d)\n", err);
+
+err_out_free_buf:
+	if (!pd->ibpd.uobject)
+		mthca_free_srq_buf(dev, srq);
+
+err_out_mailbox:
+	mthca_free_mailbox(dev, mailbox);
+
+err_out_db:
+	if (!pd->ibpd.uobject && mthca_is_memfree(dev))
+		mthca_free_db(dev, MTHCA_DB_TYPE_SRQ, srq->db_index);
+
+err_out_icm:
+	mthca_table_put(dev, dev->srq_table.table, srq->srqn);
+
+err_out:
+	mthca_free(&dev->srq_table.alloc, srq->srqn);
+
+	return err;
+}
+
+static inline int get_srq_refcount(struct mthca_dev *dev, struct mthca_srq *srq)
+{
+	int c;
+
+	spin_lock_irq(&dev->srq_table.lock);
+	c = srq->refcount;
+	spin_unlock_irq(&dev->srq_table.lock);
+
+	return c;
+}
+
+void mthca_free_srq(struct mthca_dev *dev, struct mthca_srq *srq)
+{
+	struct mthca_mailbox *mailbox;
+	int err;
+
+	mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+	if (IS_ERR(mailbox)) {
+		mthca_warn(dev, "No memory for mailbox to free SRQ.\n");
+		return;
+	}
+
+	err = mthca_HW2SW_SRQ(dev, mailbox, srq->srqn);
+	if (err)
+		mthca_warn(dev, "HW2SW_SRQ failed (%d)\n", err);
+
+	spin_lock_irq(&dev->srq_table.lock);
+	mthca_array_clear(&dev->srq_table.srq,
+			  srq->srqn & (dev->limits.num_srqs - 1));
+	--srq->refcount;
+	spin_unlock_irq(&dev->srq_table.lock);
+
+	wait_event(srq->wait, !get_srq_refcount(dev, srq));
+
+	if (!srq->ibsrq.uobject) {
+		mthca_free_srq_buf(dev, srq);
+		if (mthca_is_memfree(dev))
+			mthca_free_db(dev, MTHCA_DB_TYPE_SRQ, srq->db_index);
+	}
+
+	mthca_table_put(dev, dev->srq_table.table, srq->srqn);
+	mthca_free(&dev->srq_table.alloc, srq->srqn);
+	mthca_free_mailbox(dev, mailbox);
+}
+
+int mthca_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
+		     enum ib_srq_attr_mask attr_mask, struct ib_udata *udata)
+{
+	struct mthca_dev *dev = to_mdev(ibsrq->device);
+	struct mthca_srq *srq = to_msrq(ibsrq);
+	int ret = 0;
+
+	/* We don't support resizing SRQs (yet?) */
+	if (attr_mask & IB_SRQ_MAX_WR)
+		return -EINVAL;
+
+	if (attr_mask & IB_SRQ_LIMIT) {
+		u32 max_wr = mthca_is_memfree(dev) ? srq->max - 1 : srq->max;
+		if (attr->srq_limit > max_wr)
+			return -EINVAL;
+
+		mutex_lock(&srq->mutex);
+		ret = mthca_ARM_SRQ(dev, srq->srqn, attr->srq_limit);
+		mutex_unlock(&srq->mutex);
+	}
+
+	return ret;
+}
+
+int mthca_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr)
+{
+	struct mthca_dev *dev = to_mdev(ibsrq->device);
+	struct mthca_srq *srq = to_msrq(ibsrq);
+	struct mthca_mailbox *mailbox;
+	struct mthca_arbel_srq_context *arbel_ctx;
+	struct mthca_tavor_srq_context *tavor_ctx;
+	int err;
+
+	mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	err = mthca_QUERY_SRQ(dev, srq->srqn, mailbox);
+	if (err)
+		goto out;
+
+	if (mthca_is_memfree(dev)) {
+		arbel_ctx = mailbox->buf;
+		srq_attr->srq_limit = be16_to_cpu(arbel_ctx->limit_watermark);
+	} else {
+		tavor_ctx = mailbox->buf;
+		srq_attr->srq_limit = be16_to_cpu(tavor_ctx->limit_watermark);
+	}
+
+	srq_attr->max_wr  = srq->max - 1;
+	srq_attr->max_sge = srq->max_gs;
+
+out:
+	mthca_free_mailbox(dev, mailbox);
+
+	return err;
+}
+
+void mthca_srq_event(struct mthca_dev *dev, u32 srqn,
+		     enum ib_event_type event_type)
+{
+	struct mthca_srq *srq;
+	struct ib_event event;
+
+	spin_lock(&dev->srq_table.lock);
+	srq = mthca_array_get(&dev->srq_table.srq, srqn & (dev->limits.num_srqs - 1));
+	if (srq)
+		++srq->refcount;
+	spin_unlock(&dev->srq_table.lock);
+
+	if (!srq) {
+		mthca_warn(dev, "Async event for bogus SRQ %08x\n", srqn);
+		return;
+	}
+
+	if (!srq->ibsrq.event_handler)
+		goto out;
+
+	event.device      = &dev->ib_dev;
+	event.event       = event_type;
+	event.element.srq = &srq->ibsrq;
+	srq->ibsrq.event_handler(&event, srq->ibsrq.srq_context);
+
+out:
+	spin_lock(&dev->srq_table.lock);
+	if (!--srq->refcount)
+		wake_up(&srq->wait);
+	spin_unlock(&dev->srq_table.lock);
+}
+
+/*
+ * This function must be called with IRQs disabled.
+ */
+void mthca_free_srq_wqe(struct mthca_srq *srq, u32 wqe_addr)
+{
+	int ind;
+	struct mthca_next_seg *last_free;
+
+	ind = wqe_addr >> srq->wqe_shift;
+
+	spin_lock(&srq->lock);
+
+	last_free = get_wqe(srq, srq->last_free);
+	*wqe_to_link(last_free) = ind;
+	last_free->nda_op = htonl((ind << srq->wqe_shift) | 1);
+	*wqe_to_link(get_wqe(srq, ind)) = -1;
+	srq->last_free = ind;
+
+	spin_unlock(&srq->lock);
+}
+
+int mthca_tavor_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
+			      struct ib_recv_wr **bad_wr)
+{
+	struct mthca_dev *dev = to_mdev(ibsrq->device);
+	struct mthca_srq *srq = to_msrq(ibsrq);
+	unsigned long flags;
+	int err = 0;
+	int first_ind;
+	int ind;
+	int next_ind;
+	int nreq;
+	int i;
+	void *wqe;
+	void *prev_wqe;
+
+	spin_lock_irqsave(&srq->lock, flags);
+
+	first_ind = srq->first_free;
+
+	for (nreq = 0; wr; wr = wr->next) {
+		ind       = srq->first_free;
+		wqe       = get_wqe(srq, ind);
+		next_ind  = *wqe_to_link(wqe);
+
+		if (unlikely(next_ind < 0)) {
+			mthca_err(dev, "SRQ %06x full\n", srq->srqn);
+			err = -ENOMEM;
+			*bad_wr = wr;
+			break;
+		}
+
+		prev_wqe  = srq->last;
+		srq->last = wqe;
+
+		((struct mthca_next_seg *) wqe)->ee_nds = 0;
+		/* flags field will always remain 0 */
+
+		wqe += sizeof (struct mthca_next_seg);
+
+		if (unlikely(wr->num_sge > srq->max_gs)) {
+			err = -EINVAL;
+			*bad_wr = wr;
+			srq->last = prev_wqe;
+			break;
+		}
+
+		for (i = 0; i < wr->num_sge; ++i) {
+			mthca_set_data_seg(wqe, wr->sg_list + i);
+			wqe += sizeof (struct mthca_data_seg);
+		}
+
+		if (i < srq->max_gs)
+			mthca_set_data_seg_inval(wqe);
+
+		((struct mthca_next_seg *) prev_wqe)->ee_nds =
+			cpu_to_be32(MTHCA_NEXT_DBD);
+
+		srq->wrid[ind]  = wr->wr_id;
+		srq->first_free = next_ind;
+
+		++nreq;
+		if (unlikely(nreq == MTHCA_TAVOR_MAX_WQES_PER_RECV_DB)) {
+			nreq = 0;
+
+			/*
+			 * Make sure that descriptors are written
+			 * before doorbell is rung.
+			 */
+			wmb();
+
+			mthca_write64(first_ind << srq->wqe_shift, srq->srqn << 8,
+				      dev->kar + MTHCA_RECEIVE_DOORBELL,
+				      MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+
+			first_ind = srq->first_free;
+		}
+	}
+
+	if (likely(nreq)) {
+		/*
+		 * Make sure that descriptors are written before
+		 * doorbell is rung.
+		 */
+		wmb();
+
+		mthca_write64(first_ind << srq->wqe_shift, (srq->srqn << 8) | nreq,
+			      dev->kar + MTHCA_RECEIVE_DOORBELL,
+			      MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+	}
+
+	/*
+	 * Make sure doorbells don't leak out of SRQ spinlock and
+	 * reach the HCA out of order:
+	 */
+	mmiowb();
+
+	spin_unlock_irqrestore(&srq->lock, flags);
+	return err;
+}
+
+int mthca_arbel_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
+			      struct ib_recv_wr **bad_wr)
+{
+	struct mthca_dev *dev = to_mdev(ibsrq->device);
+	struct mthca_srq *srq = to_msrq(ibsrq);
+	unsigned long flags;
+	int err = 0;
+	int ind;
+	int next_ind;
+	int nreq;
+	int i;
+	void *wqe;
+
+	spin_lock_irqsave(&srq->lock, flags);
+
+	for (nreq = 0; wr; ++nreq, wr = wr->next) {
+		ind       = srq->first_free;
+		wqe       = get_wqe(srq, ind);
+		next_ind  = *wqe_to_link(wqe);
+
+		if (unlikely(next_ind < 0)) {
+			mthca_err(dev, "SRQ %06x full\n", srq->srqn);
+			err = -ENOMEM;
+			*bad_wr = wr;
+			break;
+		}
+
+		((struct mthca_next_seg *) wqe)->ee_nds = 0;
+		/* flags field will always remain 0 */
+
+		wqe += sizeof (struct mthca_next_seg);
+
+		if (unlikely(wr->num_sge > srq->max_gs)) {
+			err = -EINVAL;
+			*bad_wr = wr;
+			break;
+		}
+
+		for (i = 0; i < wr->num_sge; ++i) {
+			mthca_set_data_seg(wqe, wr->sg_list + i);
+			wqe += sizeof (struct mthca_data_seg);
+		}
+
+		if (i < srq->max_gs)
+			mthca_set_data_seg_inval(wqe);
+
+		srq->wrid[ind]  = wr->wr_id;
+		srq->first_free = next_ind;
+	}
+
+	if (likely(nreq)) {
+		srq->counter += nreq;
+
+		/*
+		 * Make sure that descriptors are written before
+		 * we write doorbell record.
+		 */
+		wmb();
+		*srq->db = cpu_to_be32(srq->counter);
+	}
+
+	spin_unlock_irqrestore(&srq->lock, flags);
+	return err;
+}
+
+int mthca_max_srq_sge(struct mthca_dev *dev)
+{
+	if (mthca_is_memfree(dev))
+		return dev->limits.max_sg;
+
+	/*
+	 * SRQ allocations are based on powers of 2 for Tavor,
+	 * (although they only need to be multiples of 16 bytes).
+	 *
+	 * Therefore, we need to base the max number of sg entries on
+	 * the largest power of 2 descriptor size that is <= to the
+	 * actual max WQE descriptor size, rather than return the
+	 * max_sg value given by the firmware (which is based on WQE
+	 * sizes as multiples of 16, not powers of 2).
+	 *
+	 * If SRQ implementation is changed for Tavor to be based on
+	 * multiples of 16, the calculation below can be deleted and
+	 * the FW max_sg value returned.
+	 */
+	return min_t(int, dev->limits.max_sg,
+		     ((1 << (fls(dev->limits.max_desc_sz) - 1)) -
+		      sizeof (struct mthca_next_seg)) /
+		     sizeof (struct mthca_data_seg));
+}
+
+int mthca_init_srq_table(struct mthca_dev *dev)
+{
+	int err;
+
+	if (!(dev->mthca_flags & MTHCA_FLAG_SRQ))
+		return 0;
+
+	spin_lock_init(&dev->srq_table.lock);
+
+	err = mthca_alloc_init(&dev->srq_table.alloc,
+			       dev->limits.num_srqs,
+			       dev->limits.num_srqs - 1,
+			       dev->limits.reserved_srqs);
+	if (err)
+		return err;
+
+	err = mthca_array_init(&dev->srq_table.srq,
+			       dev->limits.num_srqs);
+	if (err)
+		mthca_alloc_cleanup(&dev->srq_table.alloc);
+
+	return err;
+}
+
+void mthca_cleanup_srq_table(struct mthca_dev *dev)
+{
+	if (!(dev->mthca_flags & MTHCA_FLAG_SRQ))
+		return;
+
+	mthca_array_cleanup(&dev->srq_table.srq, dev->limits.num_srqs);
+	mthca_alloc_cleanup(&dev->srq_table.alloc);
+}
diff --git a/drivers/infiniband/hw/mthca/mthca_uar.c b/drivers/infiniband/hw/mthca/mthca_uar.c
new file mode 100644
index 0000000..ca5900c
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_uar.c
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2005 Topspin Communications.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <asm/page.h>		/* PAGE_SHIFT */
+
+#include "mthca_dev.h"
+#include "mthca_memfree.h"
+
+int mthca_uar_alloc(struct mthca_dev *dev, struct mthca_uar *uar)
+{
+	uar->index = mthca_alloc(&dev->uar_table.alloc);
+	if (uar->index == -1)
+		return -ENOMEM;
+
+	uar->pfn = (pci_resource_start(dev->pdev, 2) >> PAGE_SHIFT) + uar->index;
+
+	return 0;
+}
+
+void mthca_uar_free(struct mthca_dev *dev, struct mthca_uar *uar)
+{
+	mthca_free(&dev->uar_table.alloc, uar->index);
+}
+
+int mthca_init_uar_table(struct mthca_dev *dev)
+{
+	int ret;
+
+	ret = mthca_alloc_init(&dev->uar_table.alloc,
+			       dev->limits.num_uars,
+			       dev->limits.num_uars - 1,
+			       dev->limits.reserved_uars + 1);
+	if (ret)
+		return ret;
+
+	ret = mthca_init_db_tab(dev);
+	if (ret)
+		mthca_alloc_cleanup(&dev->uar_table.alloc);
+
+	return ret;
+}
+
+void mthca_cleanup_uar_table(struct mthca_dev *dev)
+{
+	mthca_cleanup_db_tab(dev);
+
+	/* XXX check if any UARs are still allocated? */
+	mthca_alloc_cleanup(&dev->uar_table.alloc);
+}
diff --git a/drivers/infiniband/hw/mthca/mthca_user.h b/drivers/infiniband/hw/mthca/mthca_user.h
new file mode 100644
index 0000000..5fe56e8
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_user.h
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005, 2006 Cisco Systems.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MTHCA_USER_H
+#define MTHCA_USER_H
+
+#include <linux/types.h>
+
+/*
+ * Increment this value if any changes that break userspace ABI
+ * compatibility are made.
+ */
+#define MTHCA_UVERBS_ABI_VERSION	1
+
+/*
+ * Make sure that all structs defined in this file remain laid out so
+ * that they pack the same way on 32-bit and 64-bit architectures (to
+ * avoid incompatibility between 32-bit userspace and 64-bit kernels).
+ * In particular do not use pointer types -- pass pointers in __u64
+ * instead.
+ */
+
+struct mthca_alloc_ucontext_resp {
+	__u32 qp_tab_size;
+	__u32 uarc_size;
+};
+
+struct mthca_alloc_pd_resp {
+	__u32 pdn;
+	__u32 reserved;
+};
+
+struct mthca_reg_mr {
+/*
+ * Mark the memory region with a DMA attribute that causes
+ * in-flight DMA to be flushed when the region is written to:
+ */
+#define MTHCA_MR_DMASYNC	0x1
+	__u32 mr_attrs;
+	__u32 reserved;
+};
+
+struct mthca_create_cq {
+	__u32 lkey;
+	__u32 pdn;
+	__u64 arm_db_page;
+	__u64 set_db_page;
+	__u32 arm_db_index;
+	__u32 set_db_index;
+};
+
+struct mthca_create_cq_resp {
+	__u32 cqn;
+	__u32 reserved;
+};
+
+struct mthca_resize_cq {
+	__u32 lkey;
+	__u32 reserved;
+};
+
+struct mthca_create_srq {
+	__u32 lkey;
+	__u32 db_index;
+	__u64 db_page;
+};
+
+struct mthca_create_srq_resp {
+	__u32 srqn;
+	__u32 reserved;
+};
+
+struct mthca_create_qp {
+	__u32 lkey;
+	__u32 reserved;
+	__u64 sq_db_page;
+	__u64 rq_db_page;
+	__u32 sq_db_index;
+	__u32 rq_db_index;
+};
+
+#endif /* MTHCA_USER_H */
diff --git a/drivers/infiniband/hw/mthca/mthca_wqe.h b/drivers/infiniband/hw/mthca/mthca_wqe.h
new file mode 100644
index 0000000..341a5ae
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_wqe.h
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2005 Cisco Systems. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MTHCA_WQE_H
+#define MTHCA_WQE_H
+
+#include <linux/types.h>
+
+enum {
+	MTHCA_NEXT_DBD		= 1 << 7,
+	MTHCA_NEXT_FENCE	= 1 << 6,
+	MTHCA_NEXT_CQ_UPDATE	= 1 << 3,
+	MTHCA_NEXT_EVENT_GEN	= 1 << 2,
+	MTHCA_NEXT_SOLICIT	= 1 << 1,
+	MTHCA_NEXT_IP_CSUM	= 1 << 4,
+	MTHCA_NEXT_TCP_UDP_CSUM = 1 << 5,
+
+	MTHCA_MLX_VL15		= 1 << 17,
+	MTHCA_MLX_SLR		= 1 << 16
+};
+
+enum {
+	MTHCA_INVAL_LKEY			= 0x100,
+	MTHCA_TAVOR_MAX_WQES_PER_RECV_DB	= 256,
+	MTHCA_ARBEL_MAX_WQES_PER_SEND_DB	= 255
+};
+
+struct mthca_next_seg {
+	__be32 nda_op;		/* [31:6] next WQE [4:0] next opcode */
+	__be32 ee_nds;		/* [31:8] next EE  [7] DBD [6] F [5:0] next WQE size */
+	__be32 flags;		/* [3] CQ [2] Event [1] Solicit */
+	__be32 imm;		/* immediate data */
+};
+
+struct mthca_tavor_ud_seg {
+	u32    reserved1;
+	__be32 lkey;
+	__be64 av_addr;
+	u32    reserved2[4];
+	__be32 dqpn;
+	__be32 qkey;
+	u32    reserved3[2];
+};
+
+struct mthca_arbel_ud_seg {
+	__be32 av[8];
+	__be32 dqpn;
+	__be32 qkey;
+	u32    reserved[2];
+};
+
+struct mthca_bind_seg {
+	__be32 flags;		/* [31] Atomic [30] rem write [29] rem read */
+	u32    reserved;
+	__be32 new_rkey;
+	__be32 lkey;
+	__be64 addr;
+	__be64 length;
+};
+
+struct mthca_raddr_seg {
+	__be64 raddr;
+	__be32 rkey;
+	u32    reserved;
+};
+
+struct mthca_atomic_seg {
+	__be64 swap_add;
+	__be64 compare;
+};
+
+struct mthca_data_seg {
+	__be32 byte_count;
+	__be32 lkey;
+	__be64 addr;
+};
+
+struct mthca_mlx_seg {
+	__be32 nda_op;
+	__be32 nds;
+	__be32 flags;		/* [17] VL15 [16] SLR [14:12] static rate
+				   [11:8] SL [3] C [2] E */
+	__be16 rlid;
+	__be16 vcrc;
+};
+
+static __always_inline void mthca_set_data_seg(struct mthca_data_seg *dseg,
+					       struct ib_sge *sg)
+{
+	dseg->byte_count = cpu_to_be32(sg->length);
+	dseg->lkey       = cpu_to_be32(sg->lkey);
+	dseg->addr       = cpu_to_be64(sg->addr);
+}
+
+static __always_inline void mthca_set_data_seg_inval(struct mthca_data_seg *dseg)
+{
+	dseg->byte_count = 0;
+	dseg->lkey       = cpu_to_be32(MTHCA_INVAL_LKEY);
+	dseg->addr       = 0;
+}
+
+#endif /* MTHCA_WQE_H */
diff --git a/drivers/infiniband/hw/nes/Kconfig b/drivers/infiniband/hw/nes/Kconfig
new file mode 100644
index 0000000..846dc97
--- /dev/null
+++ b/drivers/infiniband/hw/nes/Kconfig
@@ -0,0 +1,16 @@
+config INFINIBAND_NES
+	tristate "NetEffect RNIC Driver"
+	depends on PCI && INET && INFINIBAND
+	select LIBCRC32C
+	select INET_LRO
+	---help---
+	  This is the RDMA Network Interface Card (RNIC) driver for
+	  NetEffect Ethernet Cluster Server Adapters.
+
+config INFINIBAND_NES_DEBUG
+	bool "Verbose debugging output"
+	depends on INFINIBAND_NES
+	default n
+	---help---
+	  This option enables debug messages from the NetEffect RNIC
+	  driver.  Select this if you are diagnosing a problem.
diff --git a/drivers/infiniband/hw/nes/Makefile b/drivers/infiniband/hw/nes/Makefile
new file mode 100644
index 0000000..97820c2
--- /dev/null
+++ b/drivers/infiniband/hw/nes/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_INFINIBAND_NES) += iw_nes.o
+
+iw_nes-objs := nes.o nes_hw.o nes_nic.o nes_utils.o nes_verbs.o nes_cm.o nes_mgt.o
diff --git a/drivers/infiniband/hw/nes/nes.c b/drivers/infiniband/hw/nes/nes.c
new file mode 100644
index 0000000..3b2a6dc
--- /dev/null
+++ b/drivers/infiniband/hw/nes/nes.c
@@ -0,0 +1,1269 @@
+/*
+ * Copyright (c) 2006 - 2011 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/ethtool.h>
+#include <linux/mii.h>
+#include <linux/if_vlan.h>
+#include <linux/crc32.h>
+#include <linux/in.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/if_arp.h>
+#include <linux/highmem.h>
+#include <linux/slab.h>
+#include <asm/io.h>
+#include <asm/irq.h>
+#include <asm/byteorder.h>
+#include <rdma/ib_smi.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_pack.h>
+#include <rdma/iw_cm.h>
+
+#include "nes.h"
+
+#include <net/netevent.h>
+#include <net/neighbour.h>
+#include <linux/route.h>
+#include <net/ip_fib.h>
+
+MODULE_AUTHOR("NetEffect");
+MODULE_DESCRIPTION("NetEffect RNIC Low-level iWARP Driver");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_VERSION(DRV_VERSION);
+
+int max_mtu = 9000;
+int interrupt_mod_interval = 0;
+
+/* Interoperability */
+int mpa_version = 1;
+module_param(mpa_version, int, 0644);
+MODULE_PARM_DESC(mpa_version, "MPA version to be used int MPA Req/Resp (0 or 1)");
+
+/* Interoperability */
+int disable_mpa_crc = 0;
+module_param(disable_mpa_crc, int, 0644);
+MODULE_PARM_DESC(disable_mpa_crc, "Disable checking of MPA CRC");
+
+unsigned int nes_drv_opt = NES_DRV_OPT_DISABLE_INT_MOD | NES_DRV_OPT_ENABLE_PAU;
+module_param(nes_drv_opt, int, 0644);
+MODULE_PARM_DESC(nes_drv_opt, "Driver option parameters");
+
+unsigned int nes_debug_level = 0;
+module_param_named(debug_level, nes_debug_level, uint, 0644);
+MODULE_PARM_DESC(debug_level, "Enable debug output level");
+
+unsigned int wqm_quanta = 0x10000;
+module_param(wqm_quanta, int, 0644);
+MODULE_PARM_DESC(wqm_quanta, "WQM quanta");
+
+static bool limit_maxrdreqsz;
+module_param(limit_maxrdreqsz, bool, 0644);
+MODULE_PARM_DESC(limit_maxrdreqsz, "Limit max read request size to 256 Bytes");
+
+LIST_HEAD(nes_adapter_list);
+static LIST_HEAD(nes_dev_list);
+
+atomic_t qps_destroyed;
+
+static unsigned int ee_flsh_adapter;
+static unsigned int sysfs_nonidx_addr;
+static unsigned int sysfs_idx_addr;
+
+static struct pci_device_id nes_pci_table[] = {
+	{ PCI_VDEVICE(NETEFFECT, PCI_DEVICE_ID_NETEFFECT_NE020), },
+	{ PCI_VDEVICE(NETEFFECT, PCI_DEVICE_ID_NETEFFECT_NE020_KR), },
+	{0}
+};
+
+MODULE_DEVICE_TABLE(pci, nes_pci_table);
+
+/* registered nes netlink callbacks */
+static struct ibnl_client_cbs nes_nl_cb_table[] = {
+	[RDMA_NL_IWPM_REG_PID] = {.dump = iwpm_register_pid_cb},
+	[RDMA_NL_IWPM_ADD_MAPPING] = {.dump = iwpm_add_mapping_cb},
+	[RDMA_NL_IWPM_QUERY_MAPPING] = {.dump = iwpm_add_and_query_mapping_cb},
+	[RDMA_NL_IWPM_HANDLE_ERR] = {.dump = iwpm_mapping_error_cb},
+	[RDMA_NL_IWPM_MAPINFO] = {.dump = iwpm_mapping_info_cb},
+	[RDMA_NL_IWPM_MAPINFO_NUM] = {.dump = iwpm_ack_mapping_info_cb}
+};
+
+static int nes_inetaddr_event(struct notifier_block *, unsigned long, void *);
+static int nes_net_event(struct notifier_block *, unsigned long, void *);
+static int nes_notifiers_registered;
+
+
+static struct notifier_block nes_inetaddr_notifier = {
+	.notifier_call = nes_inetaddr_event
+};
+
+static struct notifier_block nes_net_notifier = {
+	.notifier_call = nes_net_event
+};
+
+/**
+ * nes_inetaddr_event
+ */
+static int nes_inetaddr_event(struct notifier_block *notifier,
+		unsigned long event, void *ptr)
+{
+	struct in_ifaddr *ifa = ptr;
+	struct net_device *event_netdev = ifa->ifa_dev->dev;
+	struct nes_device *nesdev;
+	struct net_device *netdev;
+	struct net_device *upper_dev;
+	struct nes_vnic *nesvnic;
+	unsigned int is_bonded;
+
+	nes_debug(NES_DBG_NETDEV, "nes_inetaddr_event: ip address %pI4, netmask %pI4.\n",
+		  &ifa->ifa_address, &ifa->ifa_mask);
+	list_for_each_entry(nesdev, &nes_dev_list, list) {
+		nes_debug(NES_DBG_NETDEV, "Nesdev list entry = 0x%p. (%s)\n",
+				nesdev, nesdev->netdev[0]->name);
+		netdev = nesdev->netdev[0];
+		nesvnic = netdev_priv(netdev);
+		upper_dev = netdev_master_upper_dev_get(netdev);
+		is_bonded = netif_is_bond_slave(netdev) &&
+			    (upper_dev == event_netdev);
+		if ((netdev == event_netdev) || is_bonded) {
+			if (nesvnic->rdma_enabled == 0) {
+				nes_debug(NES_DBG_NETDEV, "Returning without processing event for %s since"
+						" RDMA is not enabled.\n",
+						netdev->name);
+				return NOTIFY_OK;
+			}
+			/* we have ifa->ifa_address/mask here if we need it */
+			switch (event) {
+				case NETDEV_DOWN:
+					nes_debug(NES_DBG_NETDEV, "event:DOWN\n");
+					nes_write_indexed(nesdev,
+							NES_IDX_DST_IP_ADDR+(0x10*PCI_FUNC(nesdev->pcidev->devfn)), 0);
+
+					nes_manage_arp_cache(netdev, netdev->dev_addr,
+							ntohl(nesvnic->local_ipaddr), NES_ARP_DELETE);
+					nesvnic->local_ipaddr = 0;
+					if (is_bonded)
+						continue;
+					else
+						return NOTIFY_OK;
+					break;
+				case NETDEV_UP:
+					nes_debug(NES_DBG_NETDEV, "event:UP\n");
+
+					if (nesvnic->local_ipaddr != 0) {
+						nes_debug(NES_DBG_NETDEV, "Interface already has local_ipaddr\n");
+						return NOTIFY_OK;
+					}
+					/* fall through */
+				case NETDEV_CHANGEADDR:
+					/* Add the address to the IP table */
+					if (upper_dev)
+						nesvnic->local_ipaddr =
+							((struct in_device *)upper_dev->ip_ptr)->ifa_list->ifa_address;
+					else
+						nesvnic->local_ipaddr = ifa->ifa_address;
+
+					nes_write_indexed(nesdev,
+							NES_IDX_DST_IP_ADDR+(0x10*PCI_FUNC(nesdev->pcidev->devfn)),
+							ntohl(nesvnic->local_ipaddr));
+					nes_manage_arp_cache(netdev, netdev->dev_addr,
+							ntohl(nesvnic->local_ipaddr), NES_ARP_ADD);
+					if (is_bonded)
+						continue;
+					else
+						return NOTIFY_OK;
+					break;
+				default:
+					break;
+			}
+		}
+	}
+
+	return NOTIFY_DONE;
+}
+
+
+/**
+ * nes_net_event
+ */
+static int nes_net_event(struct notifier_block *notifier,
+		unsigned long event, void *ptr)
+{
+	struct neighbour *neigh = ptr;
+	struct nes_device *nesdev;
+	struct net_device *netdev;
+	struct nes_vnic *nesvnic;
+
+	switch (event) {
+		case NETEVENT_NEIGH_UPDATE:
+			list_for_each_entry(nesdev, &nes_dev_list, list) {
+				/* nes_debug(NES_DBG_NETDEV, "Nesdev list entry = 0x%p.\n", nesdev); */
+				netdev = nesdev->netdev[0];
+				nesvnic = netdev_priv(netdev);
+				if (netdev == neigh->dev) {
+					if (nesvnic->rdma_enabled == 0) {
+						nes_debug(NES_DBG_NETDEV, "Skipping device %s since no RDMA\n",
+								netdev->name);
+					} else {
+						if (neigh->nud_state & NUD_VALID) {
+							nes_manage_arp_cache(neigh->dev, neigh->ha,
+									ntohl(*(__be32 *)neigh->primary_key), NES_ARP_ADD);
+						} else {
+							nes_manage_arp_cache(neigh->dev, neigh->ha,
+									ntohl(*(__be32 *)neigh->primary_key), NES_ARP_DELETE);
+						}
+					}
+					return NOTIFY_OK;
+				}
+			}
+			break;
+		default:
+			nes_debug(NES_DBG_NETDEV, "NETEVENT_ %lu undefined\n", event);
+			break;
+	}
+
+	return NOTIFY_DONE;
+}
+
+
+/**
+ * nes_add_ref
+ */
+void nes_add_ref(struct ib_qp *ibqp)
+{
+	struct nes_qp *nesqp;
+
+	nesqp = to_nesqp(ibqp);
+	nes_debug(NES_DBG_QP, "Bumping refcount for QP%u.  Pre-inc value = %u\n",
+			ibqp->qp_num, atomic_read(&nesqp->refcount));
+	atomic_inc(&nesqp->refcount);
+}
+
+static void nes_cqp_rem_ref_callback(struct nes_device *nesdev, struct nes_cqp_request *cqp_request)
+{
+	unsigned long flags;
+	struct nes_qp *nesqp = cqp_request->cqp_callback_pointer;
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+
+	atomic_inc(&qps_destroyed);
+
+	/* Free the control structures */
+
+	if (nesqp->pbl_vbase) {
+		pci_free_consistent(nesdev->pcidev, nesqp->qp_mem_size,
+				nesqp->hwqp.q2_vbase, nesqp->hwqp.q2_pbase);
+		spin_lock_irqsave(&nesadapter->pbl_lock, flags);
+		nesadapter->free_256pbl++;
+		spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+		pci_free_consistent(nesdev->pcidev, 256, nesqp->pbl_vbase, nesqp->pbl_pbase);
+		nesqp->pbl_vbase = NULL;
+
+	} else {
+		pci_free_consistent(nesdev->pcidev, nesqp->qp_mem_size,
+				nesqp->hwqp.sq_vbase, nesqp->hwqp.sq_pbase);
+	}
+	nes_free_resource(nesadapter, nesadapter->allocated_qps, nesqp->hwqp.qp_id);
+
+	nesadapter->qp_table[nesqp->hwqp.qp_id-NES_FIRST_QPN] = NULL;
+	kfree(nesqp->allocated_buffer);
+
+}
+
+/**
+ * nes_rem_ref
+ */
+void nes_rem_ref(struct ib_qp *ibqp)
+{
+	u64 u64temp;
+	struct nes_qp *nesqp;
+	struct nes_vnic *nesvnic = to_nesvnic(ibqp->device);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	struct nes_hw_cqp_wqe *cqp_wqe;
+	struct nes_cqp_request *cqp_request;
+	u32 opcode;
+
+	nesqp = to_nesqp(ibqp);
+
+	if (atomic_read(&nesqp->refcount) == 0) {
+		printk(KERN_INFO PFX "%s: Reference count already 0 for QP%d, last aeq = 0x%04X.\n",
+				__func__, ibqp->qp_num, nesqp->last_aeq);
+		BUG();
+	}
+
+	if (atomic_dec_and_test(&nesqp->refcount)) {
+		if (nesqp->pau_mode)
+			nes_destroy_pau_qp(nesdev, nesqp);
+
+		/* Destroy the QP */
+		cqp_request = nes_get_cqp_request(nesdev);
+		if (cqp_request == NULL) {
+			nes_debug(NES_DBG_QP, "Failed to get a cqp_request.\n");
+			return;
+		}
+		cqp_request->waiting = 0;
+		cqp_request->callback = 1;
+		cqp_request->cqp_callback = nes_cqp_rem_ref_callback;
+		cqp_request->cqp_callback_pointer = nesqp;
+		cqp_wqe = &cqp_request->cqp_wqe;
+
+		nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+		opcode = NES_CQP_DESTROY_QP | NES_CQP_QP_TYPE_IWARP;
+
+		if (nesqp->hte_added) {
+			opcode  |= NES_CQP_QP_DEL_HTE;
+			nesqp->hte_added = 0;
+		}
+		set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, opcode);
+		set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, nesqp->hwqp.qp_id);
+		u64temp = (u64)nesqp->nesqp_context_pbase;
+		set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, u64temp);
+		nes_post_cqp_request(nesdev, cqp_request);
+	}
+}
+
+
+/**
+ * nes_get_qp
+ */
+struct ib_qp *nes_get_qp(struct ib_device *device, int qpn)
+{
+	struct nes_vnic *nesvnic = to_nesvnic(device);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+
+	if ((qpn < NES_FIRST_QPN) || (qpn >= (NES_FIRST_QPN + nesadapter->max_qp)))
+		return NULL;
+
+	return &nesadapter->qp_table[qpn - NES_FIRST_QPN]->ibqp;
+}
+
+
+/**
+ * nes_print_macaddr
+ */
+static void nes_print_macaddr(struct net_device *netdev)
+{
+	nes_debug(NES_DBG_INIT, "%s: %pM, IRQ %u\n",
+		  netdev->name, netdev->dev_addr, netdev->irq);
+}
+
+/**
+ * nes_interrupt - handle interrupts
+ */
+static irqreturn_t nes_interrupt(int irq, void *dev_id)
+{
+	struct nes_device *nesdev = (struct nes_device *)dev_id;
+	int handled = 0;
+	u32 int_mask;
+	u32 int_req;
+	u32 int_stat;
+	u32 intf_int_stat;
+	u32 timer_stat;
+
+	if (nesdev->msi_enabled) {
+		/* No need to read the interrupt pending register if msi is enabled */
+		handled = 1;
+	} else {
+		if (unlikely(nesdev->nesadapter->hw_rev == NE020_REV)) {
+			/* Master interrupt enable provides synchronization for kicking off bottom half
+			  when interrupt sharing is going on */
+			int_mask = nes_read32(nesdev->regs + NES_INT_MASK);
+			if (int_mask & 0x80000000) {
+				/* Check interrupt status to see if this might be ours */
+				int_stat = nes_read32(nesdev->regs + NES_INT_STAT);
+				int_req = nesdev->int_req;
+				if (int_stat&int_req) {
+					/* if interesting CEQ or AEQ is pending, claim the interrupt */
+					if ((int_stat&int_req) & (~(NES_INT_TIMER|NES_INT_INTF))) {
+						handled = 1;
+					} else {
+						if (((int_stat & int_req) & NES_INT_TIMER) == NES_INT_TIMER) {
+							/* Timer might be running but might be for another function */
+							timer_stat = nes_read32(nesdev->regs + NES_TIMER_STAT);
+							if ((timer_stat & nesdev->timer_int_req) != 0) {
+								handled = 1;
+							}
+						}
+						if ((((int_stat & int_req) & NES_INT_INTF) == NES_INT_INTF) &&
+								(handled == 0)) {
+							intf_int_stat = nes_read32(nesdev->regs+NES_INTF_INT_STAT);
+							if ((intf_int_stat & nesdev->intf_int_req) != 0) {
+								handled = 1;
+							}
+						}
+					}
+					if (handled) {
+						nes_write32(nesdev->regs+NES_INT_MASK, int_mask & (~0x80000000));
+						int_mask = nes_read32(nesdev->regs+NES_INT_MASK);
+						/* Save off the status to save an additional read */
+						nesdev->int_stat = int_stat;
+						nesdev->napi_isr_ran = 1;
+					}
+				}
+			}
+		} else {
+			handled = nes_read32(nesdev->regs+NES_INT_PENDING);
+		}
+	}
+
+	if (handled) {
+
+		if (nes_napi_isr(nesdev) == 0) {
+			tasklet_schedule(&nesdev->dpc_tasklet);
+
+		}
+		return IRQ_HANDLED;
+	} else {
+		return IRQ_NONE;
+	}
+}
+
+
+/**
+ * nes_probe - Device initialization
+ */
+static int nes_probe(struct pci_dev *pcidev, const struct pci_device_id *ent)
+{
+	struct net_device *netdev = NULL;
+	struct nes_device *nesdev = NULL;
+	int ret = 0;
+	void __iomem *mmio_regs = NULL;
+	u8 hw_rev;
+
+	assert(pcidev != NULL);
+	assert(ent != NULL);
+
+	printk(KERN_INFO PFX "NetEffect RNIC driver v%s loading. (%s)\n",
+			DRV_VERSION, pci_name(pcidev));
+
+	ret = pci_enable_device(pcidev);
+	if (ret) {
+		printk(KERN_ERR PFX "Unable to enable PCI device. (%s)\n", pci_name(pcidev));
+		goto bail0;
+	}
+
+	nes_debug(NES_DBG_INIT, "BAR0 (@0x%08lX) size = 0x%lX bytes\n",
+			(long unsigned int)pci_resource_start(pcidev, BAR_0),
+			(long unsigned int)pci_resource_len(pcidev, BAR_0));
+	nes_debug(NES_DBG_INIT, "BAR1 (@0x%08lX) size = 0x%lX bytes\n",
+			(long unsigned int)pci_resource_start(pcidev, BAR_1),
+			(long unsigned int)pci_resource_len(pcidev, BAR_1));
+
+	/* Make sure PCI base addr are MMIO */
+	if (!(pci_resource_flags(pcidev, BAR_0) & IORESOURCE_MEM) ||
+			!(pci_resource_flags(pcidev, BAR_1) & IORESOURCE_MEM)) {
+		printk(KERN_ERR PFX "PCI regions not an MMIO resource\n");
+		ret = -ENODEV;
+		goto bail1;
+	}
+
+	/* Reserve PCI I/O and memory resources */
+	ret = pci_request_regions(pcidev, DRV_NAME);
+	if (ret) {
+		printk(KERN_ERR PFX "Unable to request regions. (%s)\n", pci_name(pcidev));
+		goto bail1;
+	}
+
+	if ((sizeof(dma_addr_t) > 4)) {
+		ret = pci_set_dma_mask(pcidev, DMA_BIT_MASK(64));
+		if (ret < 0) {
+			printk(KERN_ERR PFX "64b DMA mask configuration failed\n");
+			goto bail2;
+		}
+		ret = pci_set_consistent_dma_mask(pcidev, DMA_BIT_MASK(64));
+		if (ret) {
+			printk(KERN_ERR PFX "64b DMA consistent mask configuration failed\n");
+			goto bail2;
+		}
+	} else {
+		ret = pci_set_dma_mask(pcidev, DMA_BIT_MASK(32));
+		if (ret < 0) {
+			printk(KERN_ERR PFX "32b DMA mask configuration failed\n");
+			goto bail2;
+		}
+		ret = pci_set_consistent_dma_mask(pcidev, DMA_BIT_MASK(32));
+		if (ret) {
+			printk(KERN_ERR PFX "32b DMA consistent mask configuration failed\n");
+			goto bail2;
+		}
+	}
+
+	pci_set_master(pcidev);
+
+	/* Allocate hardware structure */
+	nesdev = kzalloc(sizeof(struct nes_device), GFP_KERNEL);
+	if (!nesdev) {
+		printk(KERN_ERR PFX "%s: Unable to alloc hardware struct\n", pci_name(pcidev));
+		ret = -ENOMEM;
+		goto bail2;
+	}
+
+	nes_debug(NES_DBG_INIT, "Allocated nes device at %p\n", nesdev);
+	nesdev->pcidev = pcidev;
+	pci_set_drvdata(pcidev, nesdev);
+
+	pci_read_config_byte(pcidev, 0x0008, &hw_rev);
+	nes_debug(NES_DBG_INIT, "hw_rev=%u\n", hw_rev);
+
+	spin_lock_init(&nesdev->indexed_regs_lock);
+
+	/* Remap the PCI registers in adapter BAR0 to kernel VA space */
+	mmio_regs = ioremap_nocache(pci_resource_start(pcidev, BAR_0),
+				    pci_resource_len(pcidev, BAR_0));
+	if (mmio_regs == NULL) {
+		printk(KERN_ERR PFX "Unable to remap BAR0\n");
+		ret = -EIO;
+		goto bail3;
+	}
+	nesdev->regs = mmio_regs;
+	nesdev->index_reg = 0x50 + (PCI_FUNC(pcidev->devfn)*8) + mmio_regs;
+
+	/* Ensure interrupts are disabled */
+	nes_write32(nesdev->regs+NES_INT_MASK, 0x7fffffff);
+
+	if (nes_drv_opt & NES_DRV_OPT_ENABLE_MSI) {
+		if (!pci_enable_msi(nesdev->pcidev)) {
+			nesdev->msi_enabled = 1;
+			nes_debug(NES_DBG_INIT, "MSI is enabled for device %s\n",
+					pci_name(pcidev));
+		} else {
+			nes_debug(NES_DBG_INIT, "MSI is disabled by linux for device %s\n",
+					pci_name(pcidev));
+		}
+	} else {
+		nes_debug(NES_DBG_INIT, "MSI not requested due to driver options for device %s\n",
+				pci_name(pcidev));
+	}
+
+	nesdev->csr_start = pci_resource_start(nesdev->pcidev, BAR_0);
+	nesdev->doorbell_region = pci_resource_start(nesdev->pcidev, BAR_1);
+
+	/* Init the adapter */
+	nesdev->nesadapter = nes_init_adapter(nesdev, hw_rev);
+	if (!nesdev->nesadapter) {
+		printk(KERN_ERR PFX "Unable to initialize adapter.\n");
+		ret = -ENOMEM;
+		goto bail5;
+	}
+	nesdev->nesadapter->et_rx_coalesce_usecs_irq = interrupt_mod_interval;
+	nesdev->nesadapter->wqm_quanta = wqm_quanta;
+
+	/* nesdev->base_doorbell_index =
+			nesdev->nesadapter->pd_config_base[PCI_FUNC(nesdev->pcidev->devfn)]; */
+	nesdev->base_doorbell_index = 1;
+	nesdev->doorbell_start = nesdev->nesadapter->doorbell_start;
+	if (nesdev->nesadapter->phy_type[0] == NES_PHY_TYPE_PUMA_1G) {
+		switch (PCI_FUNC(nesdev->pcidev->devfn) %
+			nesdev->nesadapter->port_count) {
+		case 1:
+			nesdev->mac_index = 2;
+			break;
+		case 2:
+			nesdev->mac_index = 1;
+			break;
+		case 3:
+			nesdev->mac_index = 3;
+			break;
+		case 0:
+		default:
+			nesdev->mac_index = 0;
+		}
+	} else {
+		nesdev->mac_index = PCI_FUNC(nesdev->pcidev->devfn) %
+						nesdev->nesadapter->port_count;
+	}
+
+	if ((limit_maxrdreqsz ||
+	     ((nesdev->nesadapter->phy_type[0] == NES_PHY_TYPE_GLADIUS) &&
+	      (hw_rev == NE020_REV1))) &&
+	    (pcie_get_readrq(pcidev) > 256)) {
+		if (pcie_set_readrq(pcidev, 256))
+			printk(KERN_ERR PFX "Unable to set max read request"
+				" to 256 bytes\n");
+		else
+			nes_debug(NES_DBG_INIT, "Max read request size set"
+				" to 256 bytes\n");
+	}
+
+	tasklet_init(&nesdev->dpc_tasklet, nes_dpc, (unsigned long)nesdev);
+
+	/* bring up the Control QP */
+	if (nes_init_cqp(nesdev)) {
+		ret = -ENODEV;
+		goto bail6;
+	}
+
+	/* Arm the CCQ */
+	nes_write32(nesdev->regs+NES_CQE_ALLOC, NES_CQE_ALLOC_NOTIFY_NEXT |
+			PCI_FUNC(nesdev->pcidev->devfn));
+	nes_read32(nesdev->regs+NES_CQE_ALLOC);
+
+	/* Enable the interrupts */
+	nesdev->int_req = (0x101 << PCI_FUNC(nesdev->pcidev->devfn)) |
+			(1 << (PCI_FUNC(nesdev->pcidev->devfn)+16));
+	if (PCI_FUNC(nesdev->pcidev->devfn) < 4) {
+		nesdev->int_req |= (1 << (PCI_FUNC(nesdev->mac_index)+24));
+	}
+
+	/* TODO: This really should be the first driver to load, not function 0 */
+	if (PCI_FUNC(nesdev->pcidev->devfn) == 0) {
+		/* pick up PCI and critical errors if the first driver to load */
+		nesdev->intf_int_req = NES_INTF_INT_PCIERR | NES_INTF_INT_CRITERR;
+		nesdev->int_req |= NES_INT_INTF;
+	} else {
+		nesdev->intf_int_req = 0;
+	}
+	nesdev->intf_int_req |= (1 << (PCI_FUNC(nesdev->pcidev->devfn)+16));
+	nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_MASKS0, 0);
+	nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_MASKS1, 0);
+	nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_MASKS2, 0x00001265);
+	nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_MASKS4, 0x18021804);
+
+	nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_MASKS3, 0x17801790);
+
+	/* deal with both periodic and one_shot */
+	nesdev->timer_int_req = 0x101 << PCI_FUNC(nesdev->pcidev->devfn);
+	nesdev->nesadapter->timer_int_req |= nesdev->timer_int_req;
+	nes_debug(NES_DBG_INIT, "setting int_req for function %u, nesdev = 0x%04X, adapter = 0x%04X\n",
+			PCI_FUNC(nesdev->pcidev->devfn),
+			nesdev->timer_int_req, nesdev->nesadapter->timer_int_req);
+
+	nes_write32(nesdev->regs+NES_INTF_INT_MASK, ~(nesdev->intf_int_req));
+
+	list_add_tail(&nesdev->list, &nes_dev_list);
+
+	/* Request an interrupt line for the driver */
+	ret = request_irq(pcidev->irq, nes_interrupt, IRQF_SHARED, DRV_NAME, nesdev);
+	if (ret) {
+		printk(KERN_ERR PFX "%s: requested IRQ %u is busy\n",
+				pci_name(pcidev), pcidev->irq);
+		goto bail65;
+	}
+
+	nes_write32(nesdev->regs+NES_INT_MASK, ~nesdev->int_req);
+
+	if (nes_notifiers_registered == 0) {
+		register_inetaddr_notifier(&nes_inetaddr_notifier);
+		register_netevent_notifier(&nes_net_notifier);
+	}
+	nes_notifiers_registered++;
+
+	if (ibnl_add_client(RDMA_NL_NES, RDMA_NL_IWPM_NUM_OPS, nes_nl_cb_table))
+		printk(KERN_ERR PFX "%s[%u]: Failed to add netlink callback\n",
+			__func__, __LINE__);
+
+	ret = iwpm_init(RDMA_NL_NES);
+	if (ret) {
+		printk(KERN_ERR PFX "%s: port mapper initialization failed\n",
+				pci_name(pcidev));
+		goto bail7;
+	}
+
+	INIT_DELAYED_WORK(&nesdev->work, nes_recheck_link_status);
+
+	/* Initialize network devices */
+	netdev = nes_netdev_init(nesdev, mmio_regs);
+	if (netdev == NULL) {
+		ret = -ENOMEM;
+		goto bail7;
+	}
+
+	/* Register network device */
+	ret = register_netdev(netdev);
+	if (ret) {
+		printk(KERN_ERR PFX "Unable to register netdev, ret = %d\n", ret);
+		nes_netdev_destroy(netdev);
+		goto bail7;
+	}
+
+	nes_print_macaddr(netdev);
+
+	nesdev->netdev_count++;
+	nesdev->nesadapter->netdev_count++;
+
+	printk(KERN_INFO PFX "%s: NetEffect RNIC driver successfully loaded.\n",
+			pci_name(pcidev));
+	return 0;
+
+	bail7:
+	printk(KERN_ERR PFX "bail7\n");
+	while (nesdev->netdev_count > 0) {
+		nesdev->netdev_count--;
+		nesdev->nesadapter->netdev_count--;
+
+		unregister_netdev(nesdev->netdev[nesdev->netdev_count]);
+		nes_netdev_destroy(nesdev->netdev[nesdev->netdev_count]);
+	}
+
+	nes_debug(NES_DBG_INIT, "netdev_count=%d, nesadapter->netdev_count=%d\n",
+			nesdev->netdev_count, nesdev->nesadapter->netdev_count);
+	ibnl_remove_client(RDMA_NL_NES);
+
+	nes_notifiers_registered--;
+	if (nes_notifiers_registered == 0) {
+		unregister_netevent_notifier(&nes_net_notifier);
+		unregister_inetaddr_notifier(&nes_inetaddr_notifier);
+	}
+
+	list_del(&nesdev->list);
+	nes_destroy_cqp(nesdev);
+
+	bail65:
+	printk(KERN_ERR PFX "bail65\n");
+	free_irq(pcidev->irq, nesdev);
+	if (nesdev->msi_enabled) {
+		pci_disable_msi(pcidev);
+	}
+	bail6:
+	printk(KERN_ERR PFX "bail6\n");
+	tasklet_kill(&nesdev->dpc_tasklet);
+	/* Deallocate the Adapter Structure */
+	nes_destroy_adapter(nesdev->nesadapter);
+
+	bail5:
+	printk(KERN_ERR PFX "bail5\n");
+	iounmap(nesdev->regs);
+
+	bail3:
+	printk(KERN_ERR PFX "bail3\n");
+	kfree(nesdev);
+
+	bail2:
+	pci_release_regions(pcidev);
+
+	bail1:
+	pci_disable_device(pcidev);
+
+	bail0:
+	return ret;
+}
+
+
+/**
+ * nes_remove - unload from kernel
+ */
+static void nes_remove(struct pci_dev *pcidev)
+{
+	struct nes_device *nesdev = pci_get_drvdata(pcidev);
+	struct net_device *netdev;
+	int netdev_index = 0;
+	unsigned long flags;
+
+		if (nesdev->netdev_count) {
+			netdev = nesdev->netdev[netdev_index];
+			if (netdev) {
+				netif_stop_queue(netdev);
+				unregister_netdev(netdev);
+				nes_netdev_destroy(netdev);
+
+				nesdev->netdev[netdev_index] = NULL;
+				nesdev->netdev_count--;
+				nesdev->nesadapter->netdev_count--;
+			}
+		}
+	ibnl_remove_client(RDMA_NL_NES);
+	iwpm_exit(RDMA_NL_NES);
+
+	nes_notifiers_registered--;
+	if (nes_notifiers_registered == 0) {
+		unregister_netevent_notifier(&nes_net_notifier);
+		unregister_inetaddr_notifier(&nes_inetaddr_notifier);
+	}
+
+	list_del(&nesdev->list);
+	nes_destroy_cqp(nesdev);
+
+	free_irq(pcidev->irq, nesdev);
+	tasklet_kill(&nesdev->dpc_tasklet);
+
+	spin_lock_irqsave(&nesdev->nesadapter->phy_lock, flags);
+	if (nesdev->link_recheck) {
+		spin_unlock_irqrestore(&nesdev->nesadapter->phy_lock, flags);
+		cancel_delayed_work_sync(&nesdev->work);
+	} else {
+		spin_unlock_irqrestore(&nesdev->nesadapter->phy_lock, flags);
+	}
+
+	/* Deallocate the Adapter Structure */
+	nes_destroy_adapter(nesdev->nesadapter);
+
+	if (nesdev->msi_enabled) {
+		pci_disable_msi(pcidev);
+	}
+
+	iounmap(nesdev->regs);
+	kfree(nesdev);
+
+	/* nes_debug(NES_DBG_SHUTDOWN, "calling pci_release_regions.\n"); */
+	pci_release_regions(pcidev);
+	pci_disable_device(pcidev);
+	pci_set_drvdata(pcidev, NULL);
+}
+
+
+static struct pci_driver nes_pci_driver = {
+	.name = DRV_NAME,
+	.id_table = nes_pci_table,
+	.probe = nes_probe,
+	.remove = nes_remove,
+};
+
+static ssize_t nes_show_adapter(struct device_driver *ddp, char *buf)
+{
+	unsigned int  devfn = 0xffffffff;
+	unsigned char bus_number = 0xff;
+	unsigned int  i = 0;
+	struct nes_device *nesdev;
+
+	list_for_each_entry(nesdev, &nes_dev_list, list) {
+		if (i == ee_flsh_adapter) {
+			devfn = nesdev->pcidev->devfn;
+			bus_number = nesdev->pcidev->bus->number;
+			break;
+		}
+		i++;
+	}
+
+	return snprintf(buf, PAGE_SIZE, "%x:%x\n", bus_number, devfn);
+}
+
+static ssize_t nes_store_adapter(struct device_driver *ddp,
+	const char *buf, size_t count)
+{
+	char *p = (char *)buf;
+
+	ee_flsh_adapter = simple_strtoul(p, &p, 10);
+	return strnlen(buf, count);
+}
+
+static ssize_t nes_show_ee_cmd(struct device_driver *ddp, char *buf)
+{
+	u32 eeprom_cmd = 0xdead;
+	u32 i = 0;
+	struct nes_device *nesdev;
+
+	list_for_each_entry(nesdev, &nes_dev_list, list) {
+		if (i == ee_flsh_adapter) {
+			eeprom_cmd = nes_read32(nesdev->regs + NES_EEPROM_COMMAND);
+			break;
+		}
+		i++;
+	}
+	return snprintf(buf, PAGE_SIZE, "0x%x\n", eeprom_cmd);
+}
+
+static ssize_t nes_store_ee_cmd(struct device_driver *ddp,
+	const char *buf, size_t count)
+{
+	char *p = (char *)buf;
+	u32 val;
+	u32 i = 0;
+	struct nes_device *nesdev;
+
+	if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X') {
+		val = simple_strtoul(p, &p, 16);
+		list_for_each_entry(nesdev, &nes_dev_list, list) {
+			if (i == ee_flsh_adapter) {
+				nes_write32(nesdev->regs + NES_EEPROM_COMMAND, val);
+				break;
+			}
+			i++;
+		}
+	}
+	return strnlen(buf, count);
+}
+
+static ssize_t nes_show_ee_data(struct device_driver *ddp, char *buf)
+{
+	u32 eeprom_data = 0xdead;
+	u32 i = 0;
+	struct nes_device *nesdev;
+
+	list_for_each_entry(nesdev, &nes_dev_list, list) {
+		if (i == ee_flsh_adapter) {
+			eeprom_data = nes_read32(nesdev->regs + NES_EEPROM_DATA);
+			break;
+		}
+		i++;
+	}
+
+	return  snprintf(buf, PAGE_SIZE, "0x%x\n", eeprom_data);
+}
+
+static ssize_t nes_store_ee_data(struct device_driver *ddp,
+	const char *buf, size_t count)
+{
+	char *p = (char *)buf;
+	u32 val;
+	u32 i = 0;
+	struct nes_device *nesdev;
+
+	if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X') {
+		val = simple_strtoul(p, &p, 16);
+		list_for_each_entry(nesdev, &nes_dev_list, list) {
+			if (i == ee_flsh_adapter) {
+				nes_write32(nesdev->regs + NES_EEPROM_DATA, val);
+				break;
+			}
+			i++;
+		}
+	}
+	return strnlen(buf, count);
+}
+
+static ssize_t nes_show_flash_cmd(struct device_driver *ddp, char *buf)
+{
+	u32 flash_cmd = 0xdead;
+	u32 i = 0;
+	struct nes_device *nesdev;
+
+	list_for_each_entry(nesdev, &nes_dev_list, list) {
+		if (i == ee_flsh_adapter) {
+			flash_cmd = nes_read32(nesdev->regs + NES_FLASH_COMMAND);
+			break;
+		}
+		i++;
+	}
+
+	return  snprintf(buf, PAGE_SIZE, "0x%x\n", flash_cmd);
+}
+
+static ssize_t nes_store_flash_cmd(struct device_driver *ddp,
+	const char *buf, size_t count)
+{
+	char *p = (char *)buf;
+	u32 val;
+	u32 i = 0;
+	struct nes_device *nesdev;
+
+	if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X') {
+		val = simple_strtoul(p, &p, 16);
+		list_for_each_entry(nesdev, &nes_dev_list, list) {
+			if (i == ee_flsh_adapter) {
+				nes_write32(nesdev->regs + NES_FLASH_COMMAND, val);
+				break;
+			}
+			i++;
+		}
+	}
+	return strnlen(buf, count);
+}
+
+static ssize_t nes_show_flash_data(struct device_driver *ddp, char *buf)
+{
+	u32 flash_data = 0xdead;
+	u32 i = 0;
+	struct nes_device *nesdev;
+
+	list_for_each_entry(nesdev, &nes_dev_list, list) {
+		if (i == ee_flsh_adapter) {
+			flash_data = nes_read32(nesdev->regs + NES_FLASH_DATA);
+			break;
+		}
+		i++;
+	}
+
+	return  snprintf(buf, PAGE_SIZE, "0x%x\n", flash_data);
+}
+
+static ssize_t nes_store_flash_data(struct device_driver *ddp,
+	const char *buf, size_t count)
+{
+	char *p = (char *)buf;
+	u32 val;
+	u32 i = 0;
+	struct nes_device *nesdev;
+
+	if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X') {
+		val = simple_strtoul(p, &p, 16);
+		list_for_each_entry(nesdev, &nes_dev_list, list) {
+			if (i == ee_flsh_adapter) {
+				nes_write32(nesdev->regs + NES_FLASH_DATA, val);
+				break;
+			}
+			i++;
+		}
+	}
+	return strnlen(buf, count);
+}
+
+static ssize_t nes_show_nonidx_addr(struct device_driver *ddp, char *buf)
+{
+	return  snprintf(buf, PAGE_SIZE, "0x%x\n", sysfs_nonidx_addr);
+}
+
+static ssize_t nes_store_nonidx_addr(struct device_driver *ddp,
+	const char *buf, size_t count)
+{
+	char *p = (char *)buf;
+
+	if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X')
+		sysfs_nonidx_addr = simple_strtoul(p, &p, 16);
+
+	return strnlen(buf, count);
+}
+
+static ssize_t nes_show_nonidx_data(struct device_driver *ddp, char *buf)
+{
+	u32 nonidx_data = 0xdead;
+	u32 i = 0;
+	struct nes_device *nesdev;
+
+	list_for_each_entry(nesdev, &nes_dev_list, list) {
+		if (i == ee_flsh_adapter) {
+			nonidx_data = nes_read32(nesdev->regs + sysfs_nonidx_addr);
+			break;
+		}
+		i++;
+	}
+
+	return  snprintf(buf, PAGE_SIZE, "0x%x\n", nonidx_data);
+}
+
+static ssize_t nes_store_nonidx_data(struct device_driver *ddp,
+	const char *buf, size_t count)
+{
+	char *p = (char *)buf;
+	u32 val;
+	u32 i = 0;
+	struct nes_device *nesdev;
+
+	if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X') {
+		val = simple_strtoul(p, &p, 16);
+		list_for_each_entry(nesdev, &nes_dev_list, list) {
+			if (i == ee_flsh_adapter) {
+				nes_write32(nesdev->regs + sysfs_nonidx_addr, val);
+				break;
+			}
+			i++;
+		}
+	}
+	return strnlen(buf, count);
+}
+
+static ssize_t nes_show_idx_addr(struct device_driver *ddp, char *buf)
+{
+	return  snprintf(buf, PAGE_SIZE, "0x%x\n", sysfs_idx_addr);
+}
+
+static ssize_t nes_store_idx_addr(struct device_driver *ddp,
+	const char *buf, size_t count)
+{
+	char *p = (char *)buf;
+
+	if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X')
+		sysfs_idx_addr = simple_strtoul(p, &p, 16);
+
+	return strnlen(buf, count);
+}
+
+static ssize_t nes_show_idx_data(struct device_driver *ddp, char *buf)
+{
+	u32 idx_data = 0xdead;
+	u32 i = 0;
+	struct nes_device *nesdev;
+
+	list_for_each_entry(nesdev, &nes_dev_list, list) {
+		if (i == ee_flsh_adapter) {
+			idx_data = nes_read_indexed(nesdev, sysfs_idx_addr);
+			break;
+		}
+		i++;
+	}
+
+	return  snprintf(buf, PAGE_SIZE, "0x%x\n", idx_data);
+}
+
+static ssize_t nes_store_idx_data(struct device_driver *ddp,
+	const char *buf, size_t count)
+{
+	char *p = (char *)buf;
+	u32 val;
+	u32 i = 0;
+	struct nes_device *nesdev;
+
+	if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X') {
+		val = simple_strtoul(p, &p, 16);
+		list_for_each_entry(nesdev, &nes_dev_list, list) {
+			if (i == ee_flsh_adapter) {
+				nes_write_indexed(nesdev, sysfs_idx_addr, val);
+				break;
+			}
+			i++;
+		}
+	}
+	return strnlen(buf, count);
+}
+
+
+/**
+ * nes_show_wqm_quanta
+ */
+static ssize_t nes_show_wqm_quanta(struct device_driver *ddp, char *buf)
+{
+	u32 wqm_quanta_value = 0xdead;
+	u32 i = 0;
+	struct nes_device *nesdev;
+
+	list_for_each_entry(nesdev, &nes_dev_list, list) {
+		if (i == ee_flsh_adapter) {
+			wqm_quanta_value = nesdev->nesadapter->wqm_quanta;
+			break;
+		}
+		i++;
+	}
+
+	return  snprintf(buf, PAGE_SIZE, "0x%X\n", wqm_quanta_value);
+}
+
+
+/**
+ * nes_store_wqm_quanta
+ */
+static ssize_t nes_store_wqm_quanta(struct device_driver *ddp,
+					const char *buf, size_t count)
+{
+	unsigned long wqm_quanta_value;
+	u32 wqm_config1;
+	u32 i = 0;
+	struct nes_device *nesdev;
+
+	if (kstrtoul(buf, 0, &wqm_quanta_value) < 0)
+		return -EINVAL;
+
+	list_for_each_entry(nesdev, &nes_dev_list, list) {
+		if (i == ee_flsh_adapter) {
+			nesdev->nesadapter->wqm_quanta = wqm_quanta_value;
+			wqm_config1 = nes_read_indexed(nesdev,
+						NES_IDX_WQM_CONFIG1);
+			nes_write_indexed(nesdev, NES_IDX_WQM_CONFIG1,
+					((wqm_quanta_value << 1) |
+					(wqm_config1 & 0x00000001)));
+			break;
+		}
+		i++;
+	}
+	return strnlen(buf, count);
+}
+
+static DRIVER_ATTR(adapter, S_IRUSR | S_IWUSR,
+		   nes_show_adapter, nes_store_adapter);
+static DRIVER_ATTR(eeprom_cmd, S_IRUSR | S_IWUSR,
+		   nes_show_ee_cmd, nes_store_ee_cmd);
+static DRIVER_ATTR(eeprom_data, S_IRUSR | S_IWUSR,
+		   nes_show_ee_data, nes_store_ee_data);
+static DRIVER_ATTR(flash_cmd, S_IRUSR | S_IWUSR,
+		   nes_show_flash_cmd, nes_store_flash_cmd);
+static DRIVER_ATTR(flash_data, S_IRUSR | S_IWUSR,
+		   nes_show_flash_data, nes_store_flash_data);
+static DRIVER_ATTR(nonidx_addr, S_IRUSR | S_IWUSR,
+		   nes_show_nonidx_addr, nes_store_nonidx_addr);
+static DRIVER_ATTR(nonidx_data, S_IRUSR | S_IWUSR,
+		   nes_show_nonidx_data, nes_store_nonidx_data);
+static DRIVER_ATTR(idx_addr, S_IRUSR | S_IWUSR,
+		   nes_show_idx_addr, nes_store_idx_addr);
+static DRIVER_ATTR(idx_data, S_IRUSR | S_IWUSR,
+		   nes_show_idx_data, nes_store_idx_data);
+static DRIVER_ATTR(wqm_quanta, S_IRUSR | S_IWUSR,
+		   nes_show_wqm_quanta, nes_store_wqm_quanta);
+
+static int nes_create_driver_sysfs(struct pci_driver *drv)
+{
+	int error;
+	error  = driver_create_file(&drv->driver, &driver_attr_adapter);
+	error |= driver_create_file(&drv->driver, &driver_attr_eeprom_cmd);
+	error |= driver_create_file(&drv->driver, &driver_attr_eeprom_data);
+	error |= driver_create_file(&drv->driver, &driver_attr_flash_cmd);
+	error |= driver_create_file(&drv->driver, &driver_attr_flash_data);
+	error |= driver_create_file(&drv->driver, &driver_attr_nonidx_addr);
+	error |= driver_create_file(&drv->driver, &driver_attr_nonidx_data);
+	error |= driver_create_file(&drv->driver, &driver_attr_idx_addr);
+	error |= driver_create_file(&drv->driver, &driver_attr_idx_data);
+	error |= driver_create_file(&drv->driver, &driver_attr_wqm_quanta);
+	return error;
+}
+
+static void nes_remove_driver_sysfs(struct pci_driver *drv)
+{
+	driver_remove_file(&drv->driver, &driver_attr_adapter);
+	driver_remove_file(&drv->driver, &driver_attr_eeprom_cmd);
+	driver_remove_file(&drv->driver, &driver_attr_eeprom_data);
+	driver_remove_file(&drv->driver, &driver_attr_flash_cmd);
+	driver_remove_file(&drv->driver, &driver_attr_flash_data);
+	driver_remove_file(&drv->driver, &driver_attr_nonidx_addr);
+	driver_remove_file(&drv->driver, &driver_attr_nonidx_data);
+	driver_remove_file(&drv->driver, &driver_attr_idx_addr);
+	driver_remove_file(&drv->driver, &driver_attr_idx_data);
+	driver_remove_file(&drv->driver, &driver_attr_wqm_quanta);
+}
+
+/**
+ * nes_init_module - module initialization entry point
+ */
+static int __init nes_init_module(void)
+{
+	int retval;
+	int retval1;
+
+	retval = nes_cm_start();
+	if (retval) {
+		printk(KERN_ERR PFX "Unable to start NetEffect iWARP CM.\n");
+		return retval;
+	}
+	retval = pci_register_driver(&nes_pci_driver);
+	if (retval >= 0) {
+		retval1 = nes_create_driver_sysfs(&nes_pci_driver);
+		if (retval1 < 0)
+			printk(KERN_ERR PFX "Unable to create NetEffect sys files.\n");
+	}
+	return retval;
+}
+
+
+/**
+ * nes_exit_module - module unload entry point
+ */
+static void __exit nes_exit_module(void)
+{
+	nes_cm_stop();
+	nes_remove_driver_sysfs(&nes_pci_driver);
+
+	pci_unregister_driver(&nes_pci_driver);
+}
+
+
+module_init(nes_init_module);
+module_exit(nes_exit_module);
diff --git a/drivers/infiniband/hw/nes/nes.h b/drivers/infiniband/hw/nes/nes.h
new file mode 100644
index 0000000..bd9d132
--- /dev/null
+++ b/drivers/infiniband/hw/nes/nes.h
@@ -0,0 +1,582 @@
+/*
+ * Copyright (c) 2006 - 2011 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __NES_H
+#define __NES_H
+
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/spinlock.h>
+#include <linux/kernel.h>
+#include <linux/delay.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <linux/workqueue.h>
+#include <linux/slab.h>
+#include <asm/io.h>
+#include <linux/crc32c.h>
+
+#include <rdma/ib_smi.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_pack.h>
+#include <rdma/rdma_cm.h>
+#include <rdma/iw_cm.h>
+#include <rdma/rdma_netlink.h>
+#include <rdma/iw_portmap.h>
+
+#define NES_SEND_FIRST_WRITE
+
+#define QUEUE_DISCONNECTS
+
+#define DRV_NAME    "iw_nes"
+#define DRV_VERSION "1.5.0.1"
+#define PFX         DRV_NAME ": "
+
+/*
+ * NetEffect PCI vendor id and NE010 PCI device id.
+ */
+#ifndef PCI_VENDOR_ID_NETEFFECT	/* not in pci.ids yet */
+#define PCI_VENDOR_ID_NETEFFECT          0x1678
+#define PCI_DEVICE_ID_NETEFFECT_NE020    0x0100
+#define PCI_DEVICE_ID_NETEFFECT_NE020_KR 0x0110
+#endif
+
+#define NE020_REV   4
+#define NE020_REV1  5
+
+#define BAR_0       0
+#define BAR_1       2
+
+#define RX_BUF_SIZE             (1536 + 8)
+#define NES_REG0_SIZE           (4 * 1024)
+#define NES_TX_TIMEOUT          (6*HZ)
+#define NES_FIRST_QPN           64
+#define NES_SW_CONTEXT_ALIGN    1024
+
+#define NES_NIC_MAX_NICS        16
+#define NES_MAX_ARP_TABLE_SIZE  4096
+
+#define NES_NIC_CEQ_SIZE        8
+/* NICs will be on a separate CQ */
+#define NES_CCEQ_SIZE ((nesadapter->max_cq / nesadapter->port_count) - 32)
+
+#define NES_MAX_PORT_COUNT 4
+
+#define MAX_DPC_ITERATIONS               128
+
+#define NES_DRV_OPT_ENABLE_MPA_VER_0     0x00000001
+#define NES_DRV_OPT_DISABLE_MPA_CRC      0x00000002
+#define NES_DRV_OPT_DISABLE_FIRST_WRITE  0x00000004
+#define NES_DRV_OPT_DISABLE_INTF         0x00000008
+#define NES_DRV_OPT_ENABLE_MSI           0x00000010
+#define NES_DRV_OPT_DUAL_LOGICAL_PORT    0x00000020
+#define NES_DRV_OPT_SUPRESS_OPTION_BC    0x00000040
+#define NES_DRV_OPT_NO_INLINE_DATA       0x00000080
+#define NES_DRV_OPT_DISABLE_INT_MOD      0x00000100
+#define NES_DRV_OPT_DISABLE_VIRT_WQ      0x00000200
+#define NES_DRV_OPT_ENABLE_PAU           0x00000400
+
+#define NES_AEQ_EVENT_TIMEOUT         2500
+#define NES_DISCONNECT_EVENT_TIMEOUT  2000
+
+/* debug levels */
+/* must match userspace */
+#define NES_DBG_HW          0x00000001
+#define NES_DBG_INIT        0x00000002
+#define NES_DBG_ISR         0x00000004
+#define NES_DBG_PHY         0x00000008
+#define NES_DBG_NETDEV      0x00000010
+#define NES_DBG_CM          0x00000020
+#define NES_DBG_CM1         0x00000040
+#define NES_DBG_NIC_RX      0x00000080
+#define NES_DBG_NIC_TX      0x00000100
+#define NES_DBG_CQP         0x00000200
+#define NES_DBG_MMAP        0x00000400
+#define NES_DBG_MR          0x00000800
+#define NES_DBG_PD          0x00001000
+#define NES_DBG_CQ          0x00002000
+#define NES_DBG_QP          0x00004000
+#define NES_DBG_MOD_QP      0x00008000
+#define NES_DBG_AEQ         0x00010000
+#define NES_DBG_IW_RX       0x00020000
+#define NES_DBG_IW_TX       0x00040000
+#define NES_DBG_SHUTDOWN    0x00080000
+#define NES_DBG_PAU         0x00100000
+#define NES_DBG_NLMSG       0x00200000
+#define NES_DBG_RSVD1       0x10000000
+#define NES_DBG_RSVD2       0x20000000
+#define NES_DBG_RSVD3       0x40000000
+#define NES_DBG_RSVD4       0x80000000
+#define NES_DBG_ALL         0xffffffff
+
+#ifdef CONFIG_INFINIBAND_NES_DEBUG
+#define nes_debug(level, fmt, args...) \
+do { \
+	if (level & nes_debug_level) \
+		printk(KERN_ERR PFX "%s[%u]: " fmt, __func__, __LINE__, ##args); \
+} while (0)
+
+#define assert(expr) \
+do { \
+	if (!(expr)) { \
+		printk(KERN_ERR PFX "Assertion failed! %s, %s, %s, line %d\n", \
+			   #expr, __FILE__, __func__, __LINE__); \
+	} \
+} while (0)
+
+#define NES_EVENT_TIMEOUT   1200000
+#else
+#define nes_debug(level, fmt, args...)
+#define assert(expr)          do {} while (0)
+
+#define NES_EVENT_TIMEOUT   100000
+#endif
+
+#include "nes_hw.h"
+#include "nes_verbs.h"
+#include "nes_context.h"
+#include "nes_user.h"
+#include "nes_cm.h"
+#include "nes_mgt.h"
+
+extern int max_mtu;
+#define max_frame_len (max_mtu+ETH_HLEN)
+extern int interrupt_mod_interval;
+extern int nes_if_count;
+extern int mpa_version;
+extern int disable_mpa_crc;
+extern unsigned int nes_drv_opt;
+extern unsigned int nes_debug_level;
+extern unsigned int wqm_quanta;
+extern struct list_head nes_adapter_list;
+
+extern atomic_t cm_connects;
+extern atomic_t cm_accepts;
+extern atomic_t cm_disconnects;
+extern atomic_t cm_closes;
+extern atomic_t cm_connecteds;
+extern atomic_t cm_connect_reqs;
+extern atomic_t cm_rejects;
+extern atomic_t mod_qp_timouts;
+extern atomic_t qps_created;
+extern atomic_t qps_destroyed;
+extern atomic_t sw_qps_destroyed;
+extern u32 mh_detected;
+extern u32 mh_pauses_sent;
+extern u32 cm_packets_sent;
+extern u32 cm_packets_bounced;
+extern u32 cm_packets_created;
+extern u32 cm_packets_received;
+extern u32 cm_packets_dropped;
+extern u32 cm_packets_retrans;
+extern atomic_t cm_listens_created;
+extern atomic_t cm_listens_destroyed;
+extern u32 cm_backlog_drops;
+extern atomic_t cm_loopbacks;
+extern atomic_t cm_nodes_created;
+extern atomic_t cm_nodes_destroyed;
+extern atomic_t cm_accel_dropped_pkts;
+extern atomic_t cm_resets_recvd;
+extern atomic_t pau_qps_created;
+extern atomic_t pau_qps_destroyed;
+
+extern u32 int_mod_timer_init;
+extern u32 int_mod_cq_depth_256;
+extern u32 int_mod_cq_depth_128;
+extern u32 int_mod_cq_depth_32;
+extern u32 int_mod_cq_depth_24;
+extern u32 int_mod_cq_depth_16;
+extern u32 int_mod_cq_depth_4;
+extern u32 int_mod_cq_depth_1;
+
+struct nes_device {
+	struct nes_adapter	   *nesadapter;
+	void __iomem           *regs;
+	void __iomem           *index_reg;
+	struct pci_dev         *pcidev;
+	struct net_device      *netdev[NES_NIC_MAX_NICS];
+	u64                    link_status_interrupts;
+	struct tasklet_struct  dpc_tasklet;
+	spinlock_t             indexed_regs_lock;
+	unsigned long          csr_start;
+	unsigned long          doorbell_region;
+	unsigned long          doorbell_start;
+	unsigned long          mac_tx_errors;
+	unsigned long          mac_pause_frames_sent;
+	unsigned long          mac_pause_frames_received;
+	unsigned long          mac_rx_errors;
+	unsigned long          mac_rx_crc_errors;
+	unsigned long          mac_rx_symbol_err_frames;
+	unsigned long          mac_rx_jabber_frames;
+	unsigned long          mac_rx_oversized_frames;
+	unsigned long          mac_rx_short_frames;
+	unsigned long          port_rx_discards;
+	unsigned long          port_tx_discards;
+	unsigned int           mac_index;
+	unsigned int           nes_stack_start;
+
+	/* Control Structures */
+	void                   *cqp_vbase;
+	dma_addr_t             cqp_pbase;
+	u32                    cqp_mem_size;
+	u8                     ceq_index;
+	u8                     nic_ceq_index;
+	struct nes_hw_cqp      cqp;
+	struct nes_hw_cq       ccq;
+	struct list_head       cqp_avail_reqs;
+	struct list_head       cqp_pending_reqs;
+	struct nes_cqp_request *nes_cqp_requests;
+
+	u32                    int_req;
+	u32                    int_stat;
+	u32                    timer_int_req;
+	u32                    timer_only_int_count;
+	u32                    intf_int_req;
+	u32                    last_mac_tx_pauses;
+	u32                    last_used_chunks_tx;
+	struct list_head       list;
+
+	u16                    base_doorbell_index;
+	u16                    currcq_count;
+	u16                    deepcq_count;
+	u8                     iw_status;
+	u8                     msi_enabled;
+	u8                     netdev_count;
+	u8                     napi_isr_ran;
+	u8                     disable_rx_flow_control;
+	u8                     disable_tx_flow_control;
+
+	struct delayed_work    work;
+	u8                     link_recheck;
+};
+
+/* Receive skb private area - must fit in skb->cb area */
+struct nes_rskb_cb {
+	u64                    busaddr;
+	u32                    maplen;
+	u32                    seqnum;
+	u8                     *data_start;
+	struct nes_qp          *nesqp;
+};
+
+static inline __le32 get_crc_value(struct nes_v4_quad *nes_quad)
+{
+	u32 crc_value;
+	crc_value = crc32c(~0, (void *)nes_quad, sizeof (struct nes_v4_quad));
+
+	/*
+	 * With commit ef19454b ("[LIB] crc32c: Keep intermediate crc
+	 * state in cpu order"), behavior of crc32c changes on
+	 * big-endian platforms.  Our algorithm expects the previous
+	 * behavior; otherwise we have RDMA connection establishment
+	 * issue on big-endian.
+	 */
+	return cpu_to_le32(crc_value);
+}
+
+static inline void
+set_wqe_64bit_value(__le32 *wqe_words, u32 index, u64 value)
+{
+	wqe_words[index]     = cpu_to_le32((u32) value);
+	wqe_words[index + 1] = cpu_to_le32(upper_32_bits(value));
+}
+
+static inline void
+set_wqe_32bit_value(__le32 *wqe_words, u32 index, u32 value)
+{
+	wqe_words[index] = cpu_to_le32(value);
+}
+
+static inline void
+nes_fill_init_cqp_wqe(struct nes_hw_cqp_wqe *cqp_wqe, struct nes_device *nesdev)
+{
+	cqp_wqe->wqe_words[NES_CQP_WQE_COMP_CTX_LOW_IDX]       = 0;
+	cqp_wqe->wqe_words[NES_CQP_WQE_COMP_CTX_HIGH_IDX]      = 0;
+	cqp_wqe->wqe_words[NES_CQP_WQE_COMP_SCRATCH_LOW_IDX]   = 0;
+	cqp_wqe->wqe_words[NES_CQP_WQE_COMP_SCRATCH_HIGH_IDX]  = 0;
+	cqp_wqe->wqe_words[NES_CQP_STAG_WQE_PBL_BLK_COUNT_IDX] = 0;
+	cqp_wqe->wqe_words[NES_CQP_STAG_WQE_PBL_LEN_IDX]       = 0;
+	cqp_wqe->wqe_words[NES_CQP_STAG_WQE_LEN_LOW_IDX]       = 0;
+	cqp_wqe->wqe_words[NES_CQP_STAG_WQE_PA_LOW_IDX]        = 0;
+	cqp_wqe->wqe_words[NES_CQP_STAG_WQE_PA_HIGH_IDX]       = 0;
+}
+
+static inline void
+nes_fill_init_qp_wqe(struct nes_hw_qp_wqe *wqe, struct nes_qp *nesqp, u32 head)
+{
+	u32 value;
+	value = ((u32)((unsigned long) nesqp)) | head;
+	set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_COMP_CTX_HIGH_IDX,
+			(u32)(upper_32_bits((unsigned long)(nesqp))));
+	set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_COMP_CTX_LOW_IDX, value);
+}
+
+/* Read from memory-mapped device */
+static inline u32 nes_read_indexed(struct nes_device *nesdev, u32 reg_index)
+{
+	unsigned long flags;
+	void __iomem *addr = nesdev->index_reg;
+	u32 value;
+
+	spin_lock_irqsave(&nesdev->indexed_regs_lock, flags);
+
+	writel(reg_index, addr);
+	value = readl((void __iomem *)addr + 4);
+
+	spin_unlock_irqrestore(&nesdev->indexed_regs_lock, flags);
+	return value;
+}
+
+static inline u32 nes_read32(const void __iomem *addr)
+{
+	return readl(addr);
+}
+
+static inline u16 nes_read16(const void __iomem *addr)
+{
+	return readw(addr);
+}
+
+static inline u8 nes_read8(const void __iomem *addr)
+{
+	return readb(addr);
+}
+
+/* Write to memory-mapped device */
+static inline void nes_write_indexed(struct nes_device *nesdev, u32 reg_index, u32 val)
+{
+	unsigned long flags;
+	void __iomem *addr = nesdev->index_reg;
+
+	spin_lock_irqsave(&nesdev->indexed_regs_lock, flags);
+
+	writel(reg_index, addr);
+	writel(val, (void __iomem *)addr + 4);
+
+	spin_unlock_irqrestore(&nesdev->indexed_regs_lock, flags);
+}
+
+static inline void nes_write32(void __iomem *addr, u32 val)
+{
+	writel(val, addr);
+}
+
+static inline void nes_write16(void __iomem *addr, u16 val)
+{
+	writew(val, addr);
+}
+
+static inline void nes_write8(void __iomem *addr, u8 val)
+{
+	writeb(val, addr);
+}
+
+enum nes_resource {
+	NES_RESOURCE_MW = 1,
+	NES_RESOURCE_FAST_MR,
+	NES_RESOURCE_PHYS_MR,
+	NES_RESOURCE_USER_MR,
+	NES_RESOURCE_PD,
+	NES_RESOURCE_QP,
+	NES_RESOURCE_CQ,
+	NES_RESOURCE_ARP
+};
+
+static inline int nes_alloc_resource(struct nes_adapter *nesadapter,
+		unsigned long *resource_array, u32 max_resources,
+		u32 *req_resource_num, u32 *next, enum nes_resource resource_type)
+{
+	unsigned long flags;
+	u32 resource_num;
+
+	spin_lock_irqsave(&nesadapter->resource_lock, flags);
+
+	resource_num = find_next_zero_bit(resource_array, max_resources, *next);
+	if (resource_num >= max_resources) {
+		resource_num = find_first_zero_bit(resource_array, max_resources);
+		if (resource_num >= max_resources) {
+			printk(KERN_ERR PFX "%s: No available resources [type=%u].\n", __func__, resource_type);
+			spin_unlock_irqrestore(&nesadapter->resource_lock, flags);
+			return -EMFILE;
+		}
+	}
+	set_bit(resource_num, resource_array);
+	*next = resource_num+1;
+	if (*next == max_resources) {
+		*next = 0;
+	}
+	spin_unlock_irqrestore(&nesadapter->resource_lock, flags);
+	*req_resource_num = resource_num;
+
+	return 0;
+}
+
+static inline int nes_is_resource_allocated(struct nes_adapter *nesadapter,
+		unsigned long *resource_array, u32 resource_num)
+{
+	unsigned long flags;
+	int bit_is_set;
+
+	spin_lock_irqsave(&nesadapter->resource_lock, flags);
+
+	bit_is_set = test_bit(resource_num, resource_array);
+	nes_debug(NES_DBG_HW, "resource_num %u is%s allocated.\n",
+			resource_num, (bit_is_set ? "": " not"));
+	spin_unlock_irqrestore(&nesadapter->resource_lock, flags);
+
+	return bit_is_set;
+}
+
+static inline void nes_free_resource(struct nes_adapter *nesadapter,
+		unsigned long *resource_array, u32 resource_num)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&nesadapter->resource_lock, flags);
+	clear_bit(resource_num, resource_array);
+	spin_unlock_irqrestore(&nesadapter->resource_lock, flags);
+}
+
+static inline struct nes_vnic *to_nesvnic(struct ib_device *ibdev)
+{
+	return container_of(ibdev, struct nes_ib_device, ibdev)->nesvnic;
+}
+
+static inline struct nes_pd *to_nespd(struct ib_pd *ibpd)
+{
+	return container_of(ibpd, struct nes_pd, ibpd);
+}
+
+static inline struct nes_ucontext *to_nesucontext(struct ib_ucontext *ibucontext)
+{
+	return container_of(ibucontext, struct nes_ucontext, ibucontext);
+}
+
+static inline struct nes_mr *to_nesmr(struct ib_mr *ibmr)
+{
+	return container_of(ibmr, struct nes_mr, ibmr);
+}
+
+static inline struct nes_mr *to_nesmr_from_ibfmr(struct ib_fmr *ibfmr)
+{
+	return container_of(ibfmr, struct nes_mr, ibfmr);
+}
+
+static inline struct nes_mr *to_nesmw(struct ib_mw *ibmw)
+{
+	return container_of(ibmw, struct nes_mr, ibmw);
+}
+
+static inline struct nes_fmr *to_nesfmr(struct nes_mr *nesmr)
+{
+	return container_of(nesmr, struct nes_fmr, nesmr);
+}
+
+static inline struct nes_cq *to_nescq(struct ib_cq *ibcq)
+{
+	return container_of(ibcq, struct nes_cq, ibcq);
+}
+
+static inline struct nes_qp *to_nesqp(struct ib_qp *ibqp)
+{
+	return container_of(ibqp, struct nes_qp, ibqp);
+}
+
+
+
+/* nes.c */
+void nes_add_ref(struct ib_qp *);
+void nes_rem_ref(struct ib_qp *);
+struct ib_qp *nes_get_qp(struct ib_device *, int);
+
+
+/* nes_hw.c */
+struct nes_adapter *nes_init_adapter(struct nes_device *, u8);
+void  nes_nic_init_timer_defaults(struct nes_device *, u8);
+void nes_destroy_adapter(struct nes_adapter *);
+int nes_init_cqp(struct nes_device *);
+int nes_init_phy(struct nes_device *);
+int nes_init_nic_qp(struct nes_device *, struct net_device *);
+void nes_destroy_nic_qp(struct nes_vnic *);
+int nes_napi_isr(struct nes_device *);
+void nes_dpc(unsigned long);
+void nes_nic_ce_handler(struct nes_device *, struct nes_hw_nic_cq *);
+void nes_iwarp_ce_handler(struct nes_device *, struct nes_hw_cq *);
+int nes_destroy_cqp(struct nes_device *);
+int nes_nic_cm_xmit(struct sk_buff *, struct net_device *);
+void nes_recheck_link_status(struct work_struct *work);
+void nes_terminate_timeout(unsigned long context);
+
+/* nes_nic.c */
+struct net_device *nes_netdev_init(struct nes_device *, void __iomem *);
+void nes_netdev_destroy(struct net_device *);
+int nes_nic_cm_xmit(struct sk_buff *, struct net_device *);
+
+/* nes_cm.c */
+void *nes_cm_create(struct net_device *);
+int nes_cm_recv(struct sk_buff *, struct net_device *);
+void nes_update_arp(unsigned char *, u32, u32, u16, u16);
+void nes_manage_arp_cache(struct net_device *, unsigned char *, u32, u32);
+void nes_sock_release(struct nes_qp *, unsigned long *);
+void flush_wqes(struct nes_device *nesdev, struct nes_qp *, u32, u32);
+int nes_manage_apbvt(struct nes_vnic *, u32, u32, u32);
+int nes_cm_disconn(struct nes_qp *);
+void nes_cm_disconn_worker(void *);
+
+/* nes_verbs.c */
+int nes_hw_modify_qp(struct nes_device *, struct nes_qp *, u32, u32, u32);
+int nes_modify_qp(struct ib_qp *, struct ib_qp_attr *, int, struct ib_udata *);
+struct nes_ib_device *nes_init_ofa_device(struct net_device *);
+void  nes_port_ibevent(struct nes_vnic *nesvnic);
+void nes_destroy_ofa_device(struct nes_ib_device *);
+int nes_register_ofa_device(struct nes_ib_device *);
+
+/* nes_util.c */
+int nes_read_eeprom_values(struct nes_device *, struct nes_adapter *);
+void nes_write_1G_phy_reg(struct nes_device *, u8, u8, u16);
+void nes_read_1G_phy_reg(struct nes_device *, u8, u8, u16 *);
+void nes_write_10G_phy_reg(struct nes_device *, u16, u8, u16, u16);
+void nes_read_10G_phy_reg(struct nes_device *, u8, u8, u16);
+struct nes_cqp_request *nes_get_cqp_request(struct nes_device *);
+void nes_free_cqp_request(struct nes_device *nesdev,
+			  struct nes_cqp_request *cqp_request);
+void nes_put_cqp_request(struct nes_device *nesdev,
+			 struct nes_cqp_request *cqp_request);
+void nes_post_cqp_request(struct nes_device *, struct nes_cqp_request *);
+int nes_arp_table(struct nes_device *, u32, u8 *, u32);
+void nes_mh_fix(unsigned long);
+void nes_clc(unsigned long);
+void nes_dump_mem(unsigned int, void *, int);
+u32 nes_crc32(u32, u32, u32, u32, u8 *, u32, u32, u32);
+
+#endif	/* __NES_H */
diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c
new file mode 100644
index 0000000..6f09a72
--- /dev/null
+++ b/drivers/infiniband/hw/nes/nes_cm.c
@@ -0,0 +1,4153 @@
+/*
+ * Copyright (c) 2006 - 2014 Intel Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+
+#define TCPOPT_TIMESTAMP 8
+
+#include <linux/atomic.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/init.h>
+#include <linux/if_arp.h>
+#include <linux/if_vlan.h>
+#include <linux/notifier.h>
+#include <linux/net.h>
+#include <linux/types.h>
+#include <linux/timer.h>
+#include <linux/time.h>
+#include <linux/delay.h>
+#include <linux/etherdevice.h>
+#include <linux/netdevice.h>
+#include <linux/random.h>
+#include <linux/list.h>
+#include <linux/threads.h>
+#include <linux/highmem.h>
+#include <linux/slab.h>
+#include <net/arp.h>
+#include <net/neighbour.h>
+#include <net/route.h>
+#include <net/ip_fib.h>
+#include <net/tcp.h>
+#include <linux/fcntl.h>
+
+#include "nes.h"
+
+u32 cm_packets_sent;
+u32 cm_packets_bounced;
+u32 cm_packets_dropped;
+u32 cm_packets_retrans;
+u32 cm_packets_created;
+u32 cm_packets_received;
+atomic_t cm_listens_created;
+atomic_t cm_listens_destroyed;
+u32 cm_backlog_drops;
+atomic_t cm_loopbacks;
+atomic_t cm_nodes_created;
+atomic_t cm_nodes_destroyed;
+atomic_t cm_accel_dropped_pkts;
+atomic_t cm_resets_recvd;
+
+static inline int mini_cm_accelerated(struct nes_cm_core *, struct nes_cm_node *);
+static struct nes_cm_listener *mini_cm_listen(struct nes_cm_core *, struct nes_vnic *, struct nes_cm_info *);
+static int mini_cm_del_listen(struct nes_cm_core *, struct nes_cm_listener *);
+static struct nes_cm_node *mini_cm_connect(struct nes_cm_core *, struct nes_vnic *, u16, void *, struct nes_cm_info *);
+static int mini_cm_close(struct nes_cm_core *, struct nes_cm_node *);
+static int mini_cm_accept(struct nes_cm_core *, struct nes_cm_node *);
+static int mini_cm_reject(struct nes_cm_core *, struct nes_cm_node *);
+static int mini_cm_recv_pkt(struct nes_cm_core *, struct nes_vnic *, struct sk_buff *);
+static int mini_cm_dealloc_core(struct nes_cm_core *);
+static int mini_cm_get(struct nes_cm_core *);
+static int mini_cm_set(struct nes_cm_core *, u32, u32);
+
+static void form_cm_frame(struct sk_buff *, struct nes_cm_node *, void *, u32, void *, u32, u8);
+static int add_ref_cm_node(struct nes_cm_node *);
+static int rem_ref_cm_node(struct nes_cm_core *, struct nes_cm_node *);
+
+static int nes_cm_disconn_true(struct nes_qp *);
+static int nes_cm_post_event(struct nes_cm_event *event);
+static int nes_disconnect(struct nes_qp *nesqp, int abrupt);
+static void nes_disconnect_worker(struct work_struct *work);
+
+static int send_mpa_request(struct nes_cm_node *, struct sk_buff *);
+static int send_mpa_reject(struct nes_cm_node *);
+static int send_syn(struct nes_cm_node *, u32, struct sk_buff *);
+static int send_reset(struct nes_cm_node *, struct sk_buff *);
+static int send_ack(struct nes_cm_node *cm_node, struct sk_buff *skb);
+static int send_fin(struct nes_cm_node *cm_node, struct sk_buff *skb);
+static void process_packet(struct nes_cm_node *, struct sk_buff *, struct nes_cm_core *);
+
+static void active_open_err(struct nes_cm_node *, struct sk_buff *, int);
+static void passive_open_err(struct nes_cm_node *, struct sk_buff *, int);
+static void cleanup_retrans_entry(struct nes_cm_node *);
+static void handle_rcv_mpa(struct nes_cm_node *, struct sk_buff *);
+static void free_retrans_entry(struct nes_cm_node *cm_node);
+static int handle_tcp_options(struct nes_cm_node *cm_node, struct tcphdr *tcph, struct sk_buff *skb, int optionsize, int passive);
+
+/* CM event handler functions */
+static void cm_event_connected(struct nes_cm_event *);
+static void cm_event_connect_error(struct nes_cm_event *);
+static void cm_event_reset(struct nes_cm_event *);
+static void cm_event_mpa_req(struct nes_cm_event *);
+static void cm_event_mpa_reject(struct nes_cm_event *);
+static void handle_recv_entry(struct nes_cm_node *cm_node, u32 rem_node);
+
+/* MPA build functions */
+static int cm_build_mpa_frame(struct nes_cm_node *, u8 **, u16 *, u8 *, u8);
+static void build_mpa_v2(struct nes_cm_node *, void *, u8);
+static void build_mpa_v1(struct nes_cm_node *, void *, u8);
+static void build_rdma0_msg(struct nes_cm_node *, struct nes_qp **);
+
+static void print_core(struct nes_cm_core *core);
+static void record_ird_ord(struct nes_cm_node *, u16, u16);
+
+/* External CM API Interface */
+/* instance of function pointers for client API */
+/* set address of this instance to cm_core->cm_ops at cm_core alloc */
+static struct nes_cm_ops nes_cm_api = {
+	mini_cm_accelerated,
+	mini_cm_listen,
+	mini_cm_del_listen,
+	mini_cm_connect,
+	mini_cm_close,
+	mini_cm_accept,
+	mini_cm_reject,
+	mini_cm_recv_pkt,
+	mini_cm_dealloc_core,
+	mini_cm_get,
+	mini_cm_set
+};
+
+static struct nes_cm_core *g_cm_core;
+
+atomic_t cm_connects;
+atomic_t cm_accepts;
+atomic_t cm_disconnects;
+atomic_t cm_closes;
+atomic_t cm_connecteds;
+atomic_t cm_connect_reqs;
+atomic_t cm_rejects;
+
+int nes_add_ref_cm_node(struct nes_cm_node *cm_node)
+{
+	return add_ref_cm_node(cm_node);
+}
+
+int nes_rem_ref_cm_node(struct nes_cm_node *cm_node)
+{
+	return rem_ref_cm_node(cm_node->cm_core, cm_node);
+}
+/**
+ * create_event
+ */
+static struct nes_cm_event *create_event(struct nes_cm_node *	cm_node,
+					 enum nes_cm_event_type type)
+{
+	struct nes_cm_event *event;
+
+	if (!cm_node->cm_id)
+		return NULL;
+
+	/* allocate an empty event */
+	event = kzalloc(sizeof(*event), GFP_ATOMIC);
+
+	if (!event)
+		return NULL;
+
+	event->type = type;
+	event->cm_node = cm_node;
+	event->cm_info.rem_addr = cm_node->rem_addr;
+	event->cm_info.loc_addr = cm_node->loc_addr;
+	event->cm_info.rem_port = cm_node->rem_port;
+	event->cm_info.loc_port = cm_node->loc_port;
+	event->cm_info.cm_id = cm_node->cm_id;
+
+	nes_debug(NES_DBG_CM, "cm_node=%p Created event=%p, type=%u, "
+		  "dst_addr=%08x[%x], src_addr=%08x[%x]\n",
+		  cm_node, event, type, event->cm_info.loc_addr,
+		  event->cm_info.loc_port, event->cm_info.rem_addr,
+		  event->cm_info.rem_port);
+
+	nes_cm_post_event(event);
+	return event;
+}
+
+
+/**
+ * send_mpa_request
+ */
+static int send_mpa_request(struct nes_cm_node *cm_node, struct sk_buff *skb)
+{
+	u8 start_addr = 0;
+	u8 *start_ptr = &start_addr;
+	u8 **start_buff = &start_ptr;
+	u16 buff_len = 0;
+
+	if (!skb) {
+		nes_debug(NES_DBG_CM, "skb set to NULL\n");
+		return -1;
+	}
+
+	/* send an MPA Request frame */
+	cm_build_mpa_frame(cm_node, start_buff, &buff_len, NULL, MPA_KEY_REQUEST);
+	form_cm_frame(skb, cm_node, NULL, 0, *start_buff, buff_len, SET_ACK);
+
+	return schedule_nes_timer(cm_node, skb, NES_TIMER_TYPE_SEND, 1, 0);
+}
+
+
+
+static int send_mpa_reject(struct nes_cm_node *cm_node)
+{
+	struct sk_buff *skb = NULL;
+	u8 start_addr = 0;
+	u8 *start_ptr = &start_addr;
+	u8 **start_buff = &start_ptr;
+	u16 buff_len = 0;
+	struct ietf_mpa_v1 *mpa_frame;
+
+	skb = dev_alloc_skb(MAX_CM_BUFFER);
+	if (!skb) {
+		nes_debug(NES_DBG_CM, "Failed to get a Free pkt\n");
+		return -ENOMEM;
+	}
+
+	/* send an MPA reject frame */
+	cm_build_mpa_frame(cm_node, start_buff, &buff_len, NULL, MPA_KEY_REPLY);
+	mpa_frame = (struct ietf_mpa_v1 *)*start_buff;
+	mpa_frame->flags |= IETF_MPA_FLAGS_REJECT;
+	form_cm_frame(skb, cm_node, NULL, 0, *start_buff, buff_len, SET_ACK | SET_FIN);
+
+	cm_node->state = NES_CM_STATE_FIN_WAIT1;
+	return schedule_nes_timer(cm_node, skb, NES_TIMER_TYPE_SEND, 1, 0);
+}
+
+
+/**
+ * recv_mpa - process a received TCP pkt, we are expecting an
+ * IETF MPA frame
+ */
+static int parse_mpa(struct nes_cm_node *cm_node, u8 *buffer, u32 *type,
+		     u32 len)
+{
+	struct ietf_mpa_v1 *mpa_frame;
+	struct ietf_mpa_v2 *mpa_v2_frame;
+	struct ietf_rtr_msg *rtr_msg;
+	int mpa_hdr_len;
+	int priv_data_len;
+
+	*type = NES_MPA_REQUEST_ACCEPT;
+
+	/* assume req frame is in tcp data payload */
+	if (len < sizeof(struct ietf_mpa_v1)) {
+		nes_debug(NES_DBG_CM, "The received ietf buffer was too small (%x)\n", len);
+		return -EINVAL;
+	}
+
+	/* points to the beginning of the frame, which could be MPA V1 or V2 */
+	mpa_frame = (struct ietf_mpa_v1 *)buffer;
+	mpa_hdr_len = sizeof(struct ietf_mpa_v1);
+	priv_data_len = ntohs(mpa_frame->priv_data_len);
+
+	/* make sure mpa private data len is less than 512 bytes */
+	if (priv_data_len > IETF_MAX_PRIV_DATA_LEN) {
+		nes_debug(NES_DBG_CM, "The received Length of Private"
+			  " Data field exceeds 512 octets\n");
+		return -EINVAL;
+	}
+	/*
+	 * make sure MPA receiver interoperate with the
+	 * received MPA version and MPA key information
+	 *
+	 */
+	if (mpa_frame->rev != IETF_MPA_V1 && mpa_frame->rev != IETF_MPA_V2) {
+		nes_debug(NES_DBG_CM, "The received mpa version"
+			  " is not supported\n");
+		return -EINVAL;
+	}
+	/*
+	* backwards compatibility only
+	*/
+	if (mpa_frame->rev > cm_node->mpa_frame_rev) {
+		nes_debug(NES_DBG_CM, "The received mpa version"
+			" can not be interoperated\n");
+		return -EINVAL;
+	} else {
+		cm_node->mpa_frame_rev = mpa_frame->rev;
+	}
+
+	if (cm_node->state != NES_CM_STATE_MPAREQ_SENT) {
+		if (memcmp(mpa_frame->key, IEFT_MPA_KEY_REQ, IETF_MPA_KEY_SIZE)) {
+			nes_debug(NES_DBG_CM, "Unexpected MPA Key received \n");
+			return -EINVAL;
+		}
+	} else {
+		if (memcmp(mpa_frame->key, IEFT_MPA_KEY_REP, IETF_MPA_KEY_SIZE)) {
+			nes_debug(NES_DBG_CM, "Unexpected MPA Key received \n");
+			return -EINVAL;
+		}
+	}
+
+	if (priv_data_len + mpa_hdr_len != len) {
+		nes_debug(NES_DBG_CM, "The received ietf buffer was not right"
+			" complete (%x + %x != %x)\n",
+			priv_data_len, mpa_hdr_len, len);
+		return -EINVAL;
+	}
+	/* make sure it does not exceed the max size */
+	if (len > MAX_CM_BUFFER) {
+		nes_debug(NES_DBG_CM, "The received ietf buffer was too large"
+			" (%x + %x != %x)\n",
+			priv_data_len, mpa_hdr_len, len);
+		return -EINVAL;
+	}
+
+	cm_node->mpa_frame_size = priv_data_len;
+
+	switch (mpa_frame->rev) {
+	case IETF_MPA_V2: {
+		u16 ird_size;
+		u16 ord_size;
+		u16 rtr_ctrl_ird;
+		u16 rtr_ctrl_ord;
+
+		mpa_v2_frame = (struct ietf_mpa_v2 *)buffer;
+		mpa_hdr_len += IETF_RTR_MSG_SIZE;
+		cm_node->mpa_frame_size -= IETF_RTR_MSG_SIZE;
+		rtr_msg = &mpa_v2_frame->rtr_msg;
+
+		/* parse rtr message */
+		rtr_ctrl_ird = ntohs(rtr_msg->ctrl_ird);
+		rtr_ctrl_ord = ntohs(rtr_msg->ctrl_ord);
+		ird_size = rtr_ctrl_ird & IETF_NO_IRD_ORD;
+		ord_size = rtr_ctrl_ord & IETF_NO_IRD_ORD;
+
+		if (!(rtr_ctrl_ird & IETF_PEER_TO_PEER)) {
+			/* send reset */
+			return -EINVAL;
+		}
+		if (ird_size == IETF_NO_IRD_ORD || ord_size == IETF_NO_IRD_ORD)
+			cm_node->mpav2_ird_ord = IETF_NO_IRD_ORD;
+
+		if (cm_node->mpav2_ird_ord != IETF_NO_IRD_ORD) {
+			/* responder */
+			if (cm_node->state != NES_CM_STATE_MPAREQ_SENT) {
+				/* we are still negotiating */
+				if (ord_size > NES_MAX_IRD) {
+					cm_node->ird_size = NES_MAX_IRD;
+				} else {
+					cm_node->ird_size = ord_size;
+					if (ord_size == 0 &&
+					(rtr_ctrl_ord & IETF_RDMA0_READ)) {
+						cm_node->ird_size = 1;
+						nes_debug(NES_DBG_CM,
+						"%s: Remote peer doesn't support RDMA0_READ (ord=%u)\n",
+							__func__, ord_size);
+					}
+				}
+				if (ird_size > NES_MAX_ORD)
+					cm_node->ord_size = NES_MAX_ORD;
+				else
+					cm_node->ord_size = ird_size;
+			} else { /* initiator */
+				if (ord_size > NES_MAX_IRD) {
+					nes_debug(NES_DBG_CM,
+					"%s: Unable to support the requested (ord =%u)\n",
+							__func__, ord_size);
+					return -EINVAL;
+				}
+				cm_node->ird_size = ord_size;
+
+				if (ird_size > NES_MAX_ORD) {
+					cm_node->ord_size = NES_MAX_ORD;
+				} else {
+					if (ird_size == 0 &&
+					(rtr_ctrl_ord & IETF_RDMA0_READ)) {
+						nes_debug(NES_DBG_CM,
+						"%s: Remote peer doesn't support RDMA0_READ (ird=%u)\n",
+							__func__, ird_size);
+						return -EINVAL;
+					} else {
+						cm_node->ord_size = ird_size;
+					}
+				}
+			}
+		}
+
+		if (rtr_ctrl_ord & IETF_RDMA0_READ) {
+			cm_node->send_rdma0_op = SEND_RDMA_READ_ZERO;
+
+		} else if (rtr_ctrl_ord & IETF_RDMA0_WRITE) {
+			cm_node->send_rdma0_op = SEND_RDMA_WRITE_ZERO;
+		} else {        /* Not supported RDMA0 operation */
+			return -EINVAL;
+		}
+		break;
+	}
+	case IETF_MPA_V1:
+	default:
+		break;
+	}
+
+	/* copy entire MPA frame to our cm_node's frame */
+	memcpy(cm_node->mpa_frame_buf, buffer + mpa_hdr_len, cm_node->mpa_frame_size);
+
+	if (mpa_frame->flags & IETF_MPA_FLAGS_REJECT)
+		*type = NES_MPA_REQUEST_REJECT;
+	return 0;
+}
+
+
+/**
+ * form_cm_frame - get a free packet and build empty frame Use
+ * node info to build.
+ */
+static void form_cm_frame(struct sk_buff *skb,
+			  struct nes_cm_node *cm_node, void *options, u32 optionsize,
+			  void *data, u32 datasize, u8 flags)
+{
+	struct tcphdr *tcph;
+	struct iphdr *iph;
+	struct ethhdr *ethh;
+	u8 *buf;
+	u16 packetsize = sizeof(*iph);
+
+	packetsize += sizeof(*tcph);
+	packetsize += optionsize + datasize;
+
+	skb_trim(skb, 0);
+	memset(skb->data, 0x00, ETH_HLEN + sizeof(*iph) + sizeof(*tcph));
+
+	buf = skb_put(skb, packetsize + ETH_HLEN);
+
+	ethh = (struct ethhdr *)buf;
+	buf += ETH_HLEN;
+
+	iph = (struct iphdr *)buf;
+	buf += sizeof(*iph);
+	tcph = (struct tcphdr *)buf;
+	skb_reset_mac_header(skb);
+	skb_set_network_header(skb, ETH_HLEN);
+	skb_set_transport_header(skb, ETH_HLEN + sizeof(*iph));
+	buf += sizeof(*tcph);
+
+	skb->ip_summed = CHECKSUM_PARTIAL;
+	if (!(cm_node->netdev->features & NETIF_F_IP_CSUM))
+		skb->ip_summed = CHECKSUM_NONE;
+	skb->protocol = htons(0x800);
+	skb->data_len = 0;
+	skb->mac_len = ETH_HLEN;
+
+	memcpy(ethh->h_dest, cm_node->rem_mac, ETH_ALEN);
+	memcpy(ethh->h_source, cm_node->loc_mac, ETH_ALEN);
+	ethh->h_proto = htons(0x0800);
+
+	iph->version = IPVERSION;
+	iph->ihl = 5;           /* 5 * 4Byte words, IP headr len */
+	iph->tos = 0;
+	iph->tot_len = htons(packetsize);
+	iph->id = htons(++cm_node->tcp_cntxt.loc_id);
+
+	iph->frag_off = htons(0x4000);
+	iph->ttl = 0x40;
+	iph->protocol = 0x06;   /* IPPROTO_TCP */
+
+	iph->saddr = htonl(cm_node->mapped_loc_addr);
+	iph->daddr = htonl(cm_node->mapped_rem_addr);
+
+	tcph->source = htons(cm_node->mapped_loc_port);
+	tcph->dest = htons(cm_node->mapped_rem_port);
+	tcph->seq = htonl(cm_node->tcp_cntxt.loc_seq_num);
+
+	if (flags & SET_ACK) {
+		cm_node->tcp_cntxt.loc_ack_num = cm_node->tcp_cntxt.rcv_nxt;
+		tcph->ack_seq = htonl(cm_node->tcp_cntxt.loc_ack_num);
+		tcph->ack = 1;
+	} else {
+		tcph->ack_seq = 0;
+	}
+
+	if (flags & SET_SYN) {
+		cm_node->tcp_cntxt.loc_seq_num++;
+		tcph->syn = 1;
+	} else {
+		cm_node->tcp_cntxt.loc_seq_num += datasize;
+	}
+
+	if (flags & SET_FIN) {
+		cm_node->tcp_cntxt.loc_seq_num++;
+		tcph->fin = 1;
+	}
+
+	if (flags & SET_RST)
+		tcph->rst = 1;
+
+	tcph->doff = (u16)((sizeof(*tcph) + optionsize + 3) >> 2);
+	tcph->window = htons(cm_node->tcp_cntxt.rcv_wnd);
+	tcph->urg_ptr = 0;
+	if (optionsize)
+		memcpy(buf, options, optionsize);
+	buf += optionsize;
+	if (datasize)
+		memcpy(buf, data, datasize);
+
+	skb_shinfo(skb)->nr_frags = 0;
+	cm_packets_created++;
+}
+
+/*
+ * nes_create_sockaddr - Record ip addr and tcp port in a sockaddr struct
+ */
+static void nes_create_sockaddr(__be32 ip_addr, __be16 port,
+				struct sockaddr_storage *addr)
+{
+	struct sockaddr_in *nes_sockaddr = (struct sockaddr_in *)addr;
+	nes_sockaddr->sin_family = AF_INET;
+	memcpy(&nes_sockaddr->sin_addr.s_addr, &ip_addr, sizeof(__be32));
+	nes_sockaddr->sin_port = port;
+}
+
+/*
+ * nes_create_mapinfo - Create a mapinfo object in the port mapper data base
+ */
+static int nes_create_mapinfo(struct nes_cm_info *cm_info)
+{
+	struct sockaddr_storage local_sockaddr;
+	struct sockaddr_storage mapped_sockaddr;
+
+	nes_create_sockaddr(htonl(cm_info->loc_addr), htons(cm_info->loc_port),
+				&local_sockaddr);
+	nes_create_sockaddr(htonl(cm_info->mapped_loc_addr),
+			htons(cm_info->mapped_loc_port), &mapped_sockaddr);
+
+	return iwpm_create_mapinfo(&local_sockaddr,
+				&mapped_sockaddr, RDMA_NL_NES);
+}
+
+/*
+ * nes_remove_mapinfo - Remove a mapinfo object from the port mapper data base
+ *                      and send a remove mapping op message to
+ *                      the userspace port mapper
+ */
+static int nes_remove_mapinfo(u32 loc_addr, u16 loc_port,
+			u32 mapped_loc_addr, u16 mapped_loc_port)
+{
+	struct sockaddr_storage local_sockaddr;
+	struct sockaddr_storage mapped_sockaddr;
+
+	nes_create_sockaddr(htonl(loc_addr), htons(loc_port), &local_sockaddr);
+	nes_create_sockaddr(htonl(mapped_loc_addr), htons(mapped_loc_port),
+				&mapped_sockaddr);
+
+	iwpm_remove_mapinfo(&local_sockaddr, &mapped_sockaddr);
+	return iwpm_remove_mapping(&local_sockaddr, RDMA_NL_NES);
+}
+
+/*
+ * nes_form_pm_msg - Form a port mapper message with mapping info
+ */
+static void nes_form_pm_msg(struct nes_cm_info *cm_info,
+				struct iwpm_sa_data *pm_msg)
+{
+	nes_create_sockaddr(htonl(cm_info->loc_addr), htons(cm_info->loc_port),
+				&pm_msg->loc_addr);
+	nes_create_sockaddr(htonl(cm_info->rem_addr), htons(cm_info->rem_port),
+				&pm_msg->rem_addr);
+}
+
+/*
+ * nes_form_reg_msg - Form a port mapper message with dev info
+ */
+static void nes_form_reg_msg(struct nes_vnic *nesvnic,
+			struct iwpm_dev_data *pm_msg)
+{
+	memcpy(pm_msg->dev_name, nesvnic->nesibdev->ibdev.name,
+				IWPM_DEVNAME_SIZE);
+	memcpy(pm_msg->if_name, nesvnic->netdev->name, IWPM_IFNAME_SIZE);
+}
+
+/*
+ * nes_record_pm_msg - Save the received mapping info
+ */
+static void nes_record_pm_msg(struct nes_cm_info *cm_info,
+			struct iwpm_sa_data *pm_msg)
+{
+	struct sockaddr_in *mapped_loc_addr =
+			(struct sockaddr_in *)&pm_msg->mapped_loc_addr;
+	struct sockaddr_in *mapped_rem_addr =
+			(struct sockaddr_in *)&pm_msg->mapped_rem_addr;
+
+	if (mapped_loc_addr->sin_family == AF_INET) {
+		cm_info->mapped_loc_addr =
+			ntohl(mapped_loc_addr->sin_addr.s_addr);
+		cm_info->mapped_loc_port = ntohs(mapped_loc_addr->sin_port);
+	}
+	if (mapped_rem_addr->sin_family == AF_INET) {
+		cm_info->mapped_rem_addr =
+			ntohl(mapped_rem_addr->sin_addr.s_addr);
+		cm_info->mapped_rem_port = ntohs(mapped_rem_addr->sin_port);
+	}
+}
+
+/**
+ * print_core - dump a cm core
+ */
+static void print_core(struct nes_cm_core *core)
+{
+	nes_debug(NES_DBG_CM, "---------------------------------------------\n");
+	nes_debug(NES_DBG_CM, "CM Core  -- (core = %p )\n", core);
+	if (!core)
+		return;
+	nes_debug(NES_DBG_CM, "---------------------------------------------\n");
+
+	nes_debug(NES_DBG_CM, "State         : %u \n", core->state);
+
+	nes_debug(NES_DBG_CM, "Listen Nodes  : %u \n", atomic_read(&core->listen_node_cnt));
+	nes_debug(NES_DBG_CM, "Active Nodes  : %u \n", atomic_read(&core->node_cnt));
+
+	nes_debug(NES_DBG_CM, "core          : %p \n", core);
+
+	nes_debug(NES_DBG_CM, "-------------- end core ---------------\n");
+}
+
+static void record_ird_ord(struct nes_cm_node *cm_node,
+					u16 conn_ird, u16 conn_ord)
+{
+	if (conn_ird > NES_MAX_IRD)
+		conn_ird = NES_MAX_IRD;
+
+	if (conn_ord > NES_MAX_ORD)
+		conn_ord = NES_MAX_ORD;
+
+	cm_node->ird_size = conn_ird;
+	cm_node->ord_size = conn_ord;
+}
+
+/**
+ * cm_build_mpa_frame - build a MPA V1 frame or MPA V2 frame
+ */
+static int cm_build_mpa_frame(struct nes_cm_node *cm_node, u8 **start_buff,
+			      u16 *buff_len, u8 *pci_mem, u8 mpa_key)
+{
+	int ret = 0;
+
+	*start_buff = (pci_mem) ? pci_mem : &cm_node->mpa_frame_buf[0];
+
+	switch (cm_node->mpa_frame_rev) {
+	case IETF_MPA_V1:
+		*start_buff = (u8 *)*start_buff + sizeof(struct ietf_rtr_msg);
+		*buff_len = sizeof(struct ietf_mpa_v1) + cm_node->mpa_frame_size;
+		build_mpa_v1(cm_node, *start_buff, mpa_key);
+		break;
+	case IETF_MPA_V2:
+		*buff_len = sizeof(struct ietf_mpa_v2) + cm_node->mpa_frame_size;
+		build_mpa_v2(cm_node, *start_buff, mpa_key);
+		break;
+	default:
+		ret = -EINVAL;
+	}
+	return ret;
+}
+
+/**
+ * build_mpa_v2 - build a MPA V2 frame
+ */
+static void build_mpa_v2(struct nes_cm_node *cm_node,
+			 void *start_addr, u8 mpa_key)
+{
+	struct ietf_mpa_v2 *mpa_frame = (struct ietf_mpa_v2 *)start_addr;
+	struct ietf_rtr_msg *rtr_msg = &mpa_frame->rtr_msg;
+	u16 ctrl_ird;
+	u16 ctrl_ord;
+
+	/* initialize the upper 5 bytes of the frame */
+	build_mpa_v1(cm_node, start_addr, mpa_key);
+	mpa_frame->flags |= IETF_MPA_V2_FLAG; /* set a bit to indicate MPA V2 */
+	mpa_frame->priv_data_len += htons(IETF_RTR_MSG_SIZE);
+
+	/* initialize RTR msg */
+	if (cm_node->mpav2_ird_ord == IETF_NO_IRD_ORD) {
+		ctrl_ird = IETF_NO_IRD_ORD;
+		ctrl_ord = IETF_NO_IRD_ORD;
+	} else {
+		ctrl_ird = cm_node->ird_size & IETF_NO_IRD_ORD;
+		ctrl_ord = cm_node->ord_size & IETF_NO_IRD_ORD;
+	}
+	ctrl_ird |= IETF_PEER_TO_PEER;
+	ctrl_ird |= IETF_FLPDU_ZERO_LEN;
+
+	switch (mpa_key) {
+	case MPA_KEY_REQUEST:
+		ctrl_ord |= IETF_RDMA0_WRITE;
+		ctrl_ord |= IETF_RDMA0_READ;
+		break;
+	case MPA_KEY_REPLY:
+		switch (cm_node->send_rdma0_op) {
+		case SEND_RDMA_WRITE_ZERO:
+			ctrl_ord |= IETF_RDMA0_WRITE;
+			break;
+		case SEND_RDMA_READ_ZERO:
+			ctrl_ord |= IETF_RDMA0_READ;
+			break;
+		}
+	}
+	rtr_msg->ctrl_ird = htons(ctrl_ird);
+	rtr_msg->ctrl_ord = htons(ctrl_ord);
+}
+
+/**
+ * build_mpa_v1 - build a MPA V1 frame
+ */
+static void build_mpa_v1(struct nes_cm_node *cm_node, void *start_addr, u8 mpa_key)
+{
+	struct ietf_mpa_v1 *mpa_frame = (struct ietf_mpa_v1 *)start_addr;
+
+	switch (mpa_key) {
+	case MPA_KEY_REQUEST:
+		memcpy(mpa_frame->key, IEFT_MPA_KEY_REQ, IETF_MPA_KEY_SIZE);
+		break;
+	case MPA_KEY_REPLY:
+		memcpy(mpa_frame->key, IEFT_MPA_KEY_REP, IETF_MPA_KEY_SIZE);
+		break;
+	}
+	mpa_frame->flags = IETF_MPA_FLAGS_CRC;
+	mpa_frame->rev = cm_node->mpa_frame_rev;
+	mpa_frame->priv_data_len = htons(cm_node->mpa_frame_size);
+}
+
+static void build_rdma0_msg(struct nes_cm_node *cm_node, struct nes_qp **nesqp_addr)
+{
+	u64 u64temp;
+	struct nes_qp *nesqp = *nesqp_addr;
+	struct nes_hw_qp_wqe *wqe = &nesqp->hwqp.sq_vbase[0];
+
+	u64temp = (unsigned long)nesqp->nesuqp_addr;
+	u64temp |= NES_SW_CONTEXT_ALIGN >> 1;
+	set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_COMP_CTX_LOW_IDX, u64temp);
+
+	wqe->wqe_words[NES_IWARP_SQ_WQE_FRAG0_LOW_IDX] = 0;
+	wqe->wqe_words[NES_IWARP_SQ_WQE_FRAG0_HIGH_IDX] = 0;
+
+	switch (cm_node->send_rdma0_op) {
+	case SEND_RDMA_WRITE_ZERO:
+		nes_debug(NES_DBG_CM, "Sending first write.\n");
+		wqe->wqe_words[NES_IWARP_SQ_WQE_MISC_IDX] =
+			cpu_to_le32(NES_IWARP_SQ_OP_RDMAW);
+		wqe->wqe_words[NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX] = 0;
+		wqe->wqe_words[NES_IWARP_SQ_WQE_LENGTH0_IDX] = 0;
+		wqe->wqe_words[NES_IWARP_SQ_WQE_STAG0_IDX] = 0;
+		break;
+
+	case SEND_RDMA_READ_ZERO:
+	default:
+		if (cm_node->send_rdma0_op != SEND_RDMA_READ_ZERO)
+			WARN(1, "Unsupported RDMA0 len operation=%u\n",
+			     cm_node->send_rdma0_op);
+		nes_debug(NES_DBG_CM, "Sending first rdma operation.\n");
+		wqe->wqe_words[NES_IWARP_SQ_WQE_MISC_IDX] =
+			cpu_to_le32(NES_IWARP_SQ_OP_RDMAR);
+		wqe->wqe_words[NES_IWARP_SQ_WQE_RDMA_TO_LOW_IDX] = 1;
+		wqe->wqe_words[NES_IWARP_SQ_WQE_RDMA_TO_HIGH_IDX] = 0;
+		wqe->wqe_words[NES_IWARP_SQ_WQE_RDMA_LENGTH_IDX] = 0;
+		wqe->wqe_words[NES_IWARP_SQ_WQE_RDMA_STAG_IDX] = 1;
+		wqe->wqe_words[NES_IWARP_SQ_WQE_STAG0_IDX] = 1;
+		break;
+	}
+
+	if (nesqp->sq_kmapped) {
+		nesqp->sq_kmapped = 0;
+		kunmap(nesqp->page);
+	}
+
+	/*use the reserved spot on the WQ for the extra first WQE*/
+	nesqp->nesqp_context->ird_ord_sizes &= cpu_to_le32(~(NES_QPCONTEXT_ORDIRD_LSMM_PRESENT |
+							     NES_QPCONTEXT_ORDIRD_WRPDU |
+							     NES_QPCONTEXT_ORDIRD_ALSMM));
+	nesqp->skip_lsmm = 1;
+	nesqp->hwqp.sq_tail = 0;
+}
+
+/**
+ * schedule_nes_timer
+ * note - cm_node needs to be protected before calling this. Encase in:
+ *			rem_ref_cm_node(cm_core, cm_node);add_ref_cm_node(cm_node);
+ */
+int schedule_nes_timer(struct nes_cm_node *cm_node, struct sk_buff *skb,
+		       enum nes_timer_type type, int send_retrans,
+		       int close_when_complete)
+{
+	unsigned long flags;
+	struct nes_cm_core *cm_core = cm_node->cm_core;
+	struct nes_timer_entry *new_send;
+	int ret = 0;
+
+	new_send = kzalloc(sizeof(*new_send), GFP_ATOMIC);
+	if (!new_send)
+		return -ENOMEM;
+
+	/* new_send->timetosend = currenttime */
+	new_send->retrycount = NES_DEFAULT_RETRYS;
+	new_send->retranscount = NES_DEFAULT_RETRANS;
+	new_send->skb = skb;
+	new_send->timetosend = jiffies;
+	new_send->type = type;
+	new_send->netdev = cm_node->netdev;
+	new_send->send_retrans = send_retrans;
+	new_send->close_when_complete = close_when_complete;
+
+	if (type == NES_TIMER_TYPE_CLOSE) {
+		new_send->timetosend += (HZ / 10);
+		if (cm_node->recv_entry) {
+			kfree(new_send);
+			WARN_ON(1);
+			return -EINVAL;
+		}
+		cm_node->recv_entry = new_send;
+	}
+
+	if (type == NES_TIMER_TYPE_SEND) {
+		new_send->seq_num = ntohl(tcp_hdr(skb)->seq);
+		atomic_inc(&new_send->skb->users);
+		spin_lock_irqsave(&cm_node->retrans_list_lock, flags);
+		cm_node->send_entry = new_send;
+		add_ref_cm_node(cm_node);
+		spin_unlock_irqrestore(&cm_node->retrans_list_lock, flags);
+		new_send->timetosend = jiffies + NES_RETRY_TIMEOUT;
+
+		ret = nes_nic_cm_xmit(new_send->skb, cm_node->netdev);
+		if (ret != NETDEV_TX_OK) {
+			nes_debug(NES_DBG_CM, "Error sending packet %p "
+				  "(jiffies = %lu)\n", new_send, jiffies);
+			new_send->timetosend = jiffies;
+			ret = NETDEV_TX_OK;
+		} else {
+			cm_packets_sent++;
+			if (!send_retrans) {
+				cleanup_retrans_entry(cm_node);
+				if (close_when_complete)
+					rem_ref_cm_node(cm_core, cm_node);
+				return ret;
+			}
+		}
+	}
+
+	if (!timer_pending(&cm_core->tcp_timer))
+		mod_timer(&cm_core->tcp_timer, new_send->timetosend);
+
+	return ret;
+}
+
+static void nes_retrans_expired(struct nes_cm_node *cm_node)
+{
+	struct iw_cm_id *cm_id = cm_node->cm_id;
+	enum nes_cm_node_state state = cm_node->state;
+	cm_node->state = NES_CM_STATE_CLOSED;
+
+	switch (state) {
+	case NES_CM_STATE_SYN_RCVD:
+	case NES_CM_STATE_CLOSING:
+		rem_ref_cm_node(cm_node->cm_core, cm_node);
+		break;
+	case NES_CM_STATE_LAST_ACK:
+	case NES_CM_STATE_FIN_WAIT1:
+		if (cm_node->cm_id)
+			cm_id->rem_ref(cm_id);
+		send_reset(cm_node, NULL);
+		break;
+	default:
+		add_ref_cm_node(cm_node);
+		send_reset(cm_node, NULL);
+		create_event(cm_node, NES_CM_EVENT_ABORTED);
+	}
+}
+
+static void handle_recv_entry(struct nes_cm_node *cm_node, u32 rem_node)
+{
+	struct nes_timer_entry *recv_entry = cm_node->recv_entry;
+	struct iw_cm_id *cm_id = cm_node->cm_id;
+	struct nes_qp *nesqp;
+	unsigned long qplockflags;
+
+	if (!recv_entry)
+		return;
+	nesqp = (struct nes_qp *)recv_entry->skb;
+	if (nesqp) {
+		spin_lock_irqsave(&nesqp->lock, qplockflags);
+		if (nesqp->cm_id) {
+			nes_debug(NES_DBG_CM, "QP%u: cm_id = %p, "
+				  "refcount = %d: HIT A "
+				  "NES_TIMER_TYPE_CLOSE with something "
+				  "to do!!!\n", nesqp->hwqp.qp_id, cm_id,
+				  atomic_read(&nesqp->refcount));
+			nesqp->hw_tcp_state = NES_AEQE_TCP_STATE_CLOSED;
+			nesqp->last_aeq = NES_AEQE_AEID_RESET_SENT;
+			nesqp->ibqp_state = IB_QPS_ERR;
+			spin_unlock_irqrestore(&nesqp->lock, qplockflags);
+			nes_cm_disconn(nesqp);
+		} else {
+			spin_unlock_irqrestore(&nesqp->lock, qplockflags);
+			nes_debug(NES_DBG_CM, "QP%u: cm_id = %p, "
+				  "refcount = %d: HIT A "
+				  "NES_TIMER_TYPE_CLOSE with nothing "
+				  "to do!!!\n", nesqp->hwqp.qp_id, cm_id,
+				  atomic_read(&nesqp->refcount));
+		}
+	} else if (rem_node) {
+		/* TIME_WAIT state */
+		rem_ref_cm_node(cm_node->cm_core, cm_node);
+	}
+	if (cm_node->cm_id)
+		cm_id->rem_ref(cm_id);
+	kfree(recv_entry);
+	cm_node->recv_entry = NULL;
+}
+
+/**
+ * nes_cm_timer_tick
+ */
+static void nes_cm_timer_tick(unsigned long pass)
+{
+	unsigned long flags;
+	unsigned long nexttimeout = jiffies + NES_LONG_TIME;
+	struct nes_cm_node *cm_node;
+	struct nes_timer_entry *send_entry, *recv_entry;
+	struct list_head *list_core_temp;
+	struct list_head *list_node;
+	struct nes_cm_core *cm_core = g_cm_core;
+	u32 settimer = 0;
+	unsigned long timetosend;
+	int ret = NETDEV_TX_OK;
+
+	struct list_head timer_list;
+
+	INIT_LIST_HEAD(&timer_list);
+	spin_lock_irqsave(&cm_core->ht_lock, flags);
+
+	list_for_each_safe(list_node, list_core_temp,
+			   &cm_core->connected_nodes) {
+		cm_node = container_of(list_node, struct nes_cm_node, list);
+		if ((cm_node->recv_entry) || (cm_node->send_entry)) {
+			add_ref_cm_node(cm_node);
+			list_add(&cm_node->timer_entry, &timer_list);
+		}
+	}
+	spin_unlock_irqrestore(&cm_core->ht_lock, flags);
+
+	list_for_each_safe(list_node, list_core_temp, &timer_list) {
+		cm_node = container_of(list_node, struct nes_cm_node,
+				       timer_entry);
+		recv_entry = cm_node->recv_entry;
+
+		if (recv_entry) {
+			if (time_after(recv_entry->timetosend, jiffies)) {
+				if (nexttimeout > recv_entry->timetosend ||
+				    !settimer) {
+					nexttimeout = recv_entry->timetosend;
+					settimer = 1;
+				}
+			} else {
+				handle_recv_entry(cm_node, 1);
+			}
+		}
+
+		spin_lock_irqsave(&cm_node->retrans_list_lock, flags);
+		do {
+			send_entry = cm_node->send_entry;
+			if (!send_entry)
+				break;
+			if (time_after(send_entry->timetosend, jiffies)) {
+				if (cm_node->state != NES_CM_STATE_TSA) {
+					if ((nexttimeout >
+					     send_entry->timetosend) ||
+					    !settimer) {
+						nexttimeout =
+							send_entry->timetosend;
+						settimer = 1;
+					}
+				} else {
+					free_retrans_entry(cm_node);
+				}
+				break;
+			}
+
+			if ((cm_node->state == NES_CM_STATE_TSA) ||
+			    (cm_node->state == NES_CM_STATE_CLOSED)) {
+				free_retrans_entry(cm_node);
+				break;
+			}
+
+			if (!send_entry->retranscount ||
+			    !send_entry->retrycount) {
+				cm_packets_dropped++;
+				free_retrans_entry(cm_node);
+
+				spin_unlock_irqrestore(
+					&cm_node->retrans_list_lock, flags);
+				nes_retrans_expired(cm_node);
+				cm_node->state = NES_CM_STATE_CLOSED;
+				spin_lock_irqsave(&cm_node->retrans_list_lock,
+						  flags);
+				break;
+			}
+			atomic_inc(&send_entry->skb->users);
+			cm_packets_retrans++;
+			nes_debug(NES_DBG_CM, "Retransmitting send_entry %p "
+				  "for node %p, jiffies = %lu, time to send = "
+				  "%lu, retranscount = %u, send_entry->seq_num = "
+				  "0x%08X, cm_node->tcp_cntxt.rem_ack_num = "
+				  "0x%08X\n", send_entry, cm_node, jiffies,
+				  send_entry->timetosend,
+				  send_entry->retranscount,
+				  send_entry->seq_num,
+				  cm_node->tcp_cntxt.rem_ack_num);
+
+			spin_unlock_irqrestore(&cm_node->retrans_list_lock,
+					       flags);
+			ret = nes_nic_cm_xmit(send_entry->skb, cm_node->netdev);
+			spin_lock_irqsave(&cm_node->retrans_list_lock, flags);
+			if (ret != NETDEV_TX_OK) {
+				nes_debug(NES_DBG_CM, "rexmit failed for "
+					  "node=%p\n", cm_node);
+				cm_packets_bounced++;
+				send_entry->retrycount--;
+				nexttimeout = jiffies + NES_SHORT_TIME;
+				settimer = 1;
+				break;
+			} else {
+				cm_packets_sent++;
+			}
+			nes_debug(NES_DBG_CM, "Packet Sent: retrans count = "
+				  "%u, retry count = %u.\n",
+				  send_entry->retranscount,
+				  send_entry->retrycount);
+			if (send_entry->send_retrans) {
+				send_entry->retranscount--;
+				timetosend = (NES_RETRY_TIMEOUT <<
+					      (NES_DEFAULT_RETRANS - send_entry->retranscount));
+
+				send_entry->timetosend = jiffies +
+							 min(timetosend, NES_MAX_TIMEOUT);
+				if (nexttimeout > send_entry->timetosend ||
+				    !settimer) {
+					nexttimeout = send_entry->timetosend;
+					settimer = 1;
+				}
+			} else {
+				int close_when_complete;
+				close_when_complete =
+					send_entry->close_when_complete;
+				nes_debug(NES_DBG_CM, "cm_node=%p state=%d\n",
+					  cm_node, cm_node->state);
+				free_retrans_entry(cm_node);
+				if (close_when_complete)
+					rem_ref_cm_node(cm_node->cm_core,
+							cm_node);
+			}
+		} while (0);
+
+		spin_unlock_irqrestore(&cm_node->retrans_list_lock, flags);
+		rem_ref_cm_node(cm_node->cm_core, cm_node);
+	}
+
+	if (settimer) {
+		if (!timer_pending(&cm_core->tcp_timer))
+			mod_timer(&cm_core->tcp_timer, nexttimeout);
+	}
+}
+
+
+/**
+ * send_syn
+ */
+static int send_syn(struct nes_cm_node *cm_node, u32 sendack,
+		    struct sk_buff *skb)
+{
+	int ret;
+	int flags = SET_SYN;
+	char optionsbuffer[sizeof(struct option_mss) +
+			   sizeof(struct option_windowscale) + sizeof(struct option_base) +
+			   TCP_OPTIONS_PADDING];
+
+	int optionssize = 0;
+	/* Sending MSS option */
+	union all_known_options *options;
+
+	if (!cm_node)
+		return -EINVAL;
+
+	options = (union all_known_options *)&optionsbuffer[optionssize];
+	options->as_mss.optionnum = OPTION_NUMBER_MSS;
+	options->as_mss.length = sizeof(struct option_mss);
+	options->as_mss.mss = htons(cm_node->tcp_cntxt.mss);
+	optionssize += sizeof(struct option_mss);
+
+	options = (union all_known_options *)&optionsbuffer[optionssize];
+	options->as_windowscale.optionnum = OPTION_NUMBER_WINDOW_SCALE;
+	options->as_windowscale.length = sizeof(struct option_windowscale);
+	options->as_windowscale.shiftcount = cm_node->tcp_cntxt.rcv_wscale;
+	optionssize += sizeof(struct option_windowscale);
+
+	if (sendack && !(NES_DRV_OPT_SUPRESS_OPTION_BC & nes_drv_opt)) {
+		options = (union all_known_options *)&optionsbuffer[optionssize];
+		options->as_base.optionnum = OPTION_NUMBER_WRITE0;
+		options->as_base.length = sizeof(struct option_base);
+		optionssize += sizeof(struct option_base);
+		/* we need the size to be a multiple of 4 */
+		options = (union all_known_options *)&optionsbuffer[optionssize];
+		options->as_end = 1;
+		optionssize += 1;
+		options = (union all_known_options *)&optionsbuffer[optionssize];
+		options->as_end = 1;
+		optionssize += 1;
+	}
+
+	options = (union all_known_options *)&optionsbuffer[optionssize];
+	options->as_end = OPTION_NUMBER_END;
+	optionssize += 1;
+
+	if (!skb)
+		skb = dev_alloc_skb(MAX_CM_BUFFER);
+	if (!skb) {
+		nes_debug(NES_DBG_CM, "Failed to get a Free pkt\n");
+		return -1;
+	}
+
+	if (sendack)
+		flags |= SET_ACK;
+
+	form_cm_frame(skb, cm_node, optionsbuffer, optionssize, NULL, 0, flags);
+	ret = schedule_nes_timer(cm_node, skb, NES_TIMER_TYPE_SEND, 1, 0);
+
+	return ret;
+}
+
+
+/**
+ * send_reset
+ */
+static int send_reset(struct nes_cm_node *cm_node, struct sk_buff *skb)
+{
+	int ret;
+	int flags = SET_RST | SET_ACK;
+
+	if (!skb)
+		skb = dev_alloc_skb(MAX_CM_BUFFER);
+	if (!skb) {
+		nes_debug(NES_DBG_CM, "Failed to get a Free pkt\n");
+		return -ENOMEM;
+	}
+
+	form_cm_frame(skb, cm_node, NULL, 0, NULL, 0, flags);
+	ret = schedule_nes_timer(cm_node, skb, NES_TIMER_TYPE_SEND, 0, 1);
+
+	return ret;
+}
+
+
+/**
+ * send_ack
+ */
+static int send_ack(struct nes_cm_node *cm_node, struct sk_buff *skb)
+{
+	int ret;
+
+	if (!skb)
+		skb = dev_alloc_skb(MAX_CM_BUFFER);
+
+	if (!skb) {
+		nes_debug(NES_DBG_CM, "Failed to get a Free pkt\n");
+		return -1;
+	}
+
+	form_cm_frame(skb, cm_node, NULL, 0, NULL, 0, SET_ACK);
+	ret = schedule_nes_timer(cm_node, skb, NES_TIMER_TYPE_SEND, 0, 0);
+
+	return ret;
+}
+
+
+/**
+ * send_fin
+ */
+static int send_fin(struct nes_cm_node *cm_node, struct sk_buff *skb)
+{
+	int ret;
+
+	/* if we didn't get a frame get one */
+	if (!skb)
+		skb = dev_alloc_skb(MAX_CM_BUFFER);
+
+	if (!skb) {
+		nes_debug(NES_DBG_CM, "Failed to get a Free pkt\n");
+		return -1;
+	}
+
+	form_cm_frame(skb, cm_node, NULL, 0, NULL, 0, SET_ACK | SET_FIN);
+	ret = schedule_nes_timer(cm_node, skb, NES_TIMER_TYPE_SEND, 1, 0);
+
+	return ret;
+}
+
+
+/**
+ * find_node - find a cm node that matches the reference cm node
+ */
+static struct nes_cm_node *find_node(struct nes_cm_core *cm_core,
+				     u16 rem_port, nes_addr_t rem_addr, u16 loc_port, nes_addr_t loc_addr)
+{
+	unsigned long flags;
+	struct list_head *hte;
+	struct nes_cm_node *cm_node;
+
+	/* get a handle on the hte */
+	hte = &cm_core->connected_nodes;
+
+	/* walk list and find cm_node associated with this session ID */
+	spin_lock_irqsave(&cm_core->ht_lock, flags);
+	list_for_each_entry(cm_node, hte, list) {
+		/* compare quad, return node handle if a match */
+		nes_debug(NES_DBG_CM, "finding node %x:%x =? %x:%x ^ %x:%x =? %x:%x\n",
+			  cm_node->loc_addr, cm_node->loc_port,
+			  loc_addr, loc_port,
+			  cm_node->rem_addr, cm_node->rem_port,
+			  rem_addr, rem_port);
+		if ((cm_node->mapped_loc_addr == loc_addr) &&
+			(cm_node->mapped_loc_port == loc_port) &&
+			(cm_node->mapped_rem_addr == rem_addr) &&
+			(cm_node->mapped_rem_port == rem_port)) {
+
+			add_ref_cm_node(cm_node);
+			spin_unlock_irqrestore(&cm_core->ht_lock, flags);
+			return cm_node;
+		}
+	}
+	spin_unlock_irqrestore(&cm_core->ht_lock, flags);
+
+	/* no owner node */
+	return NULL;
+}
+
+
+/**
+ * find_listener - find a cm node listening on this addr-port pair
+ */
+static struct nes_cm_listener *find_listener(struct nes_cm_core *cm_core,
+					nes_addr_t dst_addr, u16 dst_port,
+					enum nes_cm_listener_state listener_state, int local)
+{
+	unsigned long flags;
+	struct nes_cm_listener *listen_node;
+	nes_addr_t listen_addr;
+	u16 listen_port;
+
+	/* walk list and find cm_node associated with this session ID */
+	spin_lock_irqsave(&cm_core->listen_list_lock, flags);
+	list_for_each_entry(listen_node, &cm_core->listen_list.list, list) {
+		if (local) {
+			listen_addr = listen_node->loc_addr;
+			listen_port = listen_node->loc_port;
+		} else {
+			listen_addr = listen_node->mapped_loc_addr;
+			listen_port = listen_node->mapped_loc_port;
+		}
+		/* compare node pair, return node handle if a match */
+		if (((listen_addr == dst_addr) ||
+		     listen_addr == 0x00000000) &&
+		    (listen_port == dst_port) &&
+		    (listener_state & listen_node->listener_state)) {
+			atomic_inc(&listen_node->ref_count);
+			spin_unlock_irqrestore(&cm_core->listen_list_lock, flags);
+			return listen_node;
+		}
+	}
+	spin_unlock_irqrestore(&cm_core->listen_list_lock, flags);
+
+	/* no listener */
+	return NULL;
+}
+
+/**
+ * add_hte_node - add a cm node to the hash table
+ */
+static int add_hte_node(struct nes_cm_core *cm_core, struct nes_cm_node *cm_node)
+{
+	unsigned long flags;
+	struct list_head *hte;
+
+	if (!cm_node || !cm_core)
+		return -EINVAL;
+
+	nes_debug(NES_DBG_CM, "Adding Node %p to Active Connection HT\n",
+		  cm_node);
+
+	spin_lock_irqsave(&cm_core->ht_lock, flags);
+
+	/* get a handle on the hash table element (list head for this slot) */
+	hte = &cm_core->connected_nodes;
+	list_add_tail(&cm_node->list, hte);
+	atomic_inc(&cm_core->ht_node_cnt);
+
+	spin_unlock_irqrestore(&cm_core->ht_lock, flags);
+
+	return 0;
+}
+
+
+/**
+ * mini_cm_dec_refcnt_listen
+ */
+static int mini_cm_dec_refcnt_listen(struct nes_cm_core *cm_core,
+				     struct nes_cm_listener *listener, int free_hanging_nodes)
+{
+	int ret = -EINVAL;
+	int err = 0;
+	unsigned long flags;
+	struct list_head *list_pos = NULL;
+	struct list_head *list_temp = NULL;
+	struct nes_cm_node *cm_node = NULL;
+	struct list_head reset_list;
+
+	nes_debug(NES_DBG_CM, "attempting listener= %p free_nodes= %d, "
+		  "refcnt=%d\n", listener, free_hanging_nodes,
+		  atomic_read(&listener->ref_count));
+	/* free non-accelerated child nodes for this listener */
+	INIT_LIST_HEAD(&reset_list);
+	if (free_hanging_nodes) {
+		spin_lock_irqsave(&cm_core->ht_lock, flags);
+		list_for_each_safe(list_pos, list_temp,
+				   &g_cm_core->connected_nodes) {
+			cm_node = container_of(list_pos, struct nes_cm_node,
+					       list);
+			if ((cm_node->listener == listener) &&
+			    (!cm_node->accelerated)) {
+				add_ref_cm_node(cm_node);
+				list_add(&cm_node->reset_entry, &reset_list);
+			}
+		}
+		spin_unlock_irqrestore(&cm_core->ht_lock, flags);
+	}
+
+	list_for_each_safe(list_pos, list_temp, &reset_list) {
+		cm_node = container_of(list_pos, struct nes_cm_node,
+				       reset_entry);
+		{
+			struct nes_cm_node *loopback = cm_node->loopbackpartner;
+			enum nes_cm_node_state old_state;
+			if (NES_CM_STATE_FIN_WAIT1 <= cm_node->state) {
+				rem_ref_cm_node(cm_node->cm_core, cm_node);
+			} else {
+				if (!loopback) {
+					cleanup_retrans_entry(cm_node);
+					err = send_reset(cm_node, NULL);
+					if (err) {
+						cm_node->state =
+							NES_CM_STATE_CLOSED;
+						WARN_ON(1);
+					} else {
+						old_state = cm_node->state;
+						cm_node->state = NES_CM_STATE_LISTENER_DESTROYED;
+						if (old_state != NES_CM_STATE_MPAREQ_RCVD)
+							rem_ref_cm_node(
+								cm_node->cm_core,
+								cm_node);
+					}
+				} else {
+					struct nes_cm_event event;
+
+					event.cm_node = loopback;
+					event.cm_info.rem_addr =
+							loopback->rem_addr;
+					event.cm_info.loc_addr =
+							loopback->loc_addr;
+					event.cm_info.rem_port =
+							loopback->rem_port;
+					event.cm_info.loc_port =
+							 loopback->loc_port;
+					event.cm_info.cm_id = loopback->cm_id;
+					add_ref_cm_node(loopback);
+					loopback->state = NES_CM_STATE_CLOSED;
+					cm_event_connect_error(&event);
+					cm_node->state = NES_CM_STATE_LISTENER_DESTROYED;
+
+					rem_ref_cm_node(cm_node->cm_core,
+							 cm_node);
+
+				}
+			}
+		}
+	}
+
+	spin_lock_irqsave(&cm_core->listen_list_lock, flags);
+	if (!atomic_dec_return(&listener->ref_count)) {
+		list_del(&listener->list);
+
+		/* decrement our listen node count */
+		atomic_dec(&cm_core->listen_node_cnt);
+
+		spin_unlock_irqrestore(&cm_core->listen_list_lock, flags);
+
+		if (listener->nesvnic) {
+			nes_manage_apbvt(listener->nesvnic,
+				listener->mapped_loc_port,
+				PCI_FUNC(listener->nesvnic->nesdev->pcidev->devfn),
+				NES_MANAGE_APBVT_DEL);
+
+			nes_remove_mapinfo(listener->loc_addr,
+					listener->loc_port,
+					listener->mapped_loc_addr,
+					listener->mapped_loc_port);
+			nes_debug(NES_DBG_NLMSG,
+					"Delete APBVT mapped_loc_port = %04X\n",
+					listener->mapped_loc_port);
+		}
+
+		nes_debug(NES_DBG_CM, "destroying listener (%p)\n", listener);
+
+		kfree(listener);
+		listener = NULL;
+		ret = 0;
+		atomic_inc(&cm_listens_destroyed);
+	} else {
+		spin_unlock_irqrestore(&cm_core->listen_list_lock, flags);
+	}
+	if (listener) {
+		if (atomic_read(&listener->pend_accepts_cnt) > 0)
+			nes_debug(NES_DBG_CM, "destroying listener (%p)"
+				  " with non-zero pending accepts=%u\n",
+				  listener, atomic_read(&listener->pend_accepts_cnt));
+	}
+
+	return ret;
+}
+
+
+/**
+ * mini_cm_del_listen
+ */
+static int mini_cm_del_listen(struct nes_cm_core *cm_core,
+			      struct nes_cm_listener *listener)
+{
+	listener->listener_state = NES_CM_LISTENER_PASSIVE_STATE;
+	listener->cm_id = NULL; /* going to be destroyed pretty soon */
+	return mini_cm_dec_refcnt_listen(cm_core, listener, 1);
+}
+
+
+/**
+ * mini_cm_accelerated
+ */
+static inline int mini_cm_accelerated(struct nes_cm_core *cm_core,
+				      struct nes_cm_node *cm_node)
+{
+	cm_node->accelerated = 1;
+
+	if (cm_node->accept_pend) {
+		BUG_ON(!cm_node->listener);
+		atomic_dec(&cm_node->listener->pend_accepts_cnt);
+		cm_node->accept_pend = 0;
+		BUG_ON(atomic_read(&cm_node->listener->pend_accepts_cnt) < 0);
+	}
+
+	if (!timer_pending(&cm_core->tcp_timer))
+		mod_timer(&cm_core->tcp_timer, (jiffies + NES_SHORT_TIME));
+
+	return 0;
+}
+
+
+/**
+ * nes_addr_resolve_neigh
+ */
+static int nes_addr_resolve_neigh(struct nes_vnic *nesvnic, u32 dst_ip, int arpindex)
+{
+	struct rtable *rt;
+	struct neighbour *neigh;
+	int rc = arpindex;
+	struct net_device *netdev;
+	struct nes_adapter *nesadapter = nesvnic->nesdev->nesadapter;
+
+	rt = ip_route_output(&init_net, htonl(dst_ip), 0, 0, 0);
+	if (IS_ERR(rt)) {
+		printk(KERN_ERR "%s: ip_route_output_key failed for 0x%08X\n",
+		       __func__, dst_ip);
+		return rc;
+	}
+
+	if (netif_is_bond_slave(nesvnic->netdev))
+		netdev = netdev_master_upper_dev_get(nesvnic->netdev);
+	else
+		netdev = nesvnic->netdev;
+
+	neigh = neigh_lookup(&arp_tbl, &rt->rt_gateway, netdev);
+
+	rcu_read_lock();
+	if (neigh) {
+		if (neigh->nud_state & NUD_VALID) {
+			nes_debug(NES_DBG_CM, "Neighbor MAC address for 0x%08X"
+				  " is %pM, Gateway is 0x%08X \n", dst_ip,
+				  neigh->ha, ntohl(rt->rt_gateway));
+
+			if (arpindex >= 0) {
+				if (ether_addr_equal(nesadapter->arp_table[arpindex].mac_addr, neigh->ha)) {
+					/* Mac address same as in nes_arp_table */
+					goto out;
+				}
+
+				nes_manage_arp_cache(nesvnic->netdev,
+						     nesadapter->arp_table[arpindex].mac_addr,
+						     dst_ip, NES_ARP_DELETE);
+			}
+
+			nes_manage_arp_cache(nesvnic->netdev, neigh->ha,
+					     dst_ip, NES_ARP_ADD);
+			rc = nes_arp_table(nesvnic->nesdev, dst_ip, NULL,
+					   NES_ARP_RESOLVE);
+		} else {
+			neigh_event_send(neigh, NULL);
+		}
+	}
+out:
+	rcu_read_unlock();
+
+	if (neigh)
+		neigh_release(neigh);
+
+	ip_rt_put(rt);
+	return rc;
+}
+
+/**
+ * make_cm_node - create a new instance of a cm node
+ */
+static struct nes_cm_node *make_cm_node(struct nes_cm_core *cm_core,
+					struct nes_vnic *nesvnic, struct nes_cm_info *cm_info,
+					struct nes_cm_listener *listener)
+{
+	struct nes_cm_node *cm_node;
+	struct timespec ts;
+	int oldarpindex = 0;
+	int arpindex = 0;
+	struct nes_device *nesdev;
+	struct nes_adapter *nesadapter;
+
+	/* create an hte and cm_node for this instance */
+	cm_node = kzalloc(sizeof(*cm_node), GFP_ATOMIC);
+	if (!cm_node)
+		return NULL;
+
+	/* set our node specific transport info */
+	cm_node->loc_addr = cm_info->loc_addr;
+	cm_node->rem_addr = cm_info->rem_addr;
+	cm_node->loc_port = cm_info->loc_port;
+	cm_node->rem_port = cm_info->rem_port;
+
+	cm_node->mapped_loc_addr = cm_info->mapped_loc_addr;
+	cm_node->mapped_rem_addr = cm_info->mapped_rem_addr;
+	cm_node->mapped_loc_port = cm_info->mapped_loc_port;
+	cm_node->mapped_rem_port = cm_info->mapped_rem_port;
+
+	cm_node->mpa_frame_rev = mpa_version;
+	cm_node->send_rdma0_op = SEND_RDMA_READ_ZERO;
+	cm_node->mpav2_ird_ord = 0;
+	cm_node->ird_size = 0;
+	cm_node->ord_size = 0;
+
+	nes_debug(NES_DBG_CM, "Make node addresses : loc = %pI4:%x, rem = %pI4:%x\n",
+		  &cm_node->loc_addr, cm_node->loc_port,
+		  &cm_node->rem_addr, cm_node->rem_port);
+	cm_node->listener = listener;
+	cm_node->netdev = nesvnic->netdev;
+	cm_node->cm_id = cm_info->cm_id;
+	memcpy(cm_node->loc_mac, nesvnic->netdev->dev_addr, ETH_ALEN);
+
+	nes_debug(NES_DBG_CM, "listener=%p, cm_id=%p\n", cm_node->listener,
+		  cm_node->cm_id);
+
+	spin_lock_init(&cm_node->retrans_list_lock);
+
+	cm_node->loopbackpartner = NULL;
+	atomic_set(&cm_node->ref_count, 1);
+	/* associate our parent CM core */
+	cm_node->cm_core = cm_core;
+	cm_node->tcp_cntxt.loc_id = NES_CM_DEF_LOCAL_ID;
+	cm_node->tcp_cntxt.rcv_wscale = NES_CM_DEFAULT_RCV_WND_SCALE;
+	cm_node->tcp_cntxt.rcv_wnd = NES_CM_DEFAULT_RCV_WND_SCALED >>
+				     NES_CM_DEFAULT_RCV_WND_SCALE;
+	ts = current_kernel_time();
+	cm_node->tcp_cntxt.loc_seq_num = htonl(ts.tv_nsec);
+	cm_node->tcp_cntxt.mss = nesvnic->max_frame_size - sizeof(struct iphdr) -
+				 sizeof(struct tcphdr) - ETH_HLEN - VLAN_HLEN;
+	cm_node->tcp_cntxt.rcv_nxt = 0;
+	/* get a unique session ID , add thread_id to an upcounter to handle race */
+	atomic_inc(&cm_core->node_cnt);
+	cm_node->conn_type = cm_info->conn_type;
+	cm_node->apbvt_set = 0;
+	cm_node->accept_pend = 0;
+
+	cm_node->nesvnic = nesvnic;
+	/* get some device handles, for arp lookup */
+	nesdev = nesvnic->nesdev;
+	nesadapter = nesdev->nesadapter;
+
+	cm_node->loopbackpartner = NULL;
+
+	/* get the mac addr for the remote node */
+	oldarpindex = nes_arp_table(nesdev, cm_node->mapped_rem_addr,
+				NULL, NES_ARP_RESOLVE);
+	arpindex = nes_addr_resolve_neigh(nesvnic,
+				cm_node->mapped_rem_addr, oldarpindex);
+	if (arpindex < 0) {
+		kfree(cm_node);
+		return NULL;
+	}
+
+	/* copy the mac addr to node context */
+	memcpy(cm_node->rem_mac, nesadapter->arp_table[arpindex].mac_addr, ETH_ALEN);
+	nes_debug(NES_DBG_CM, "Remote mac addr from arp table: %pM\n",
+		  cm_node->rem_mac);
+
+	add_hte_node(cm_core, cm_node);
+	atomic_inc(&cm_nodes_created);
+
+	return cm_node;
+}
+
+
+/**
+ * add_ref_cm_node - destroy an instance of a cm node
+ */
+static int add_ref_cm_node(struct nes_cm_node *cm_node)
+{
+	atomic_inc(&cm_node->ref_count);
+	return 0;
+}
+
+
+/**
+ * rem_ref_cm_node - destroy an instance of a cm node
+ */
+static int rem_ref_cm_node(struct nes_cm_core *cm_core,
+			   struct nes_cm_node *cm_node)
+{
+	unsigned long flags;
+	struct nes_qp *nesqp;
+
+	if (!cm_node)
+		return -EINVAL;
+
+	spin_lock_irqsave(&cm_node->cm_core->ht_lock, flags);
+	if (atomic_dec_return(&cm_node->ref_count)) {
+		spin_unlock_irqrestore(&cm_node->cm_core->ht_lock, flags);
+		return 0;
+	}
+	list_del(&cm_node->list);
+	atomic_dec(&cm_core->ht_node_cnt);
+	spin_unlock_irqrestore(&cm_node->cm_core->ht_lock, flags);
+
+	/* if the node is destroyed before connection was accelerated */
+	if (!cm_node->accelerated && cm_node->accept_pend) {
+		BUG_ON(!cm_node->listener);
+		atomic_dec(&cm_node->listener->pend_accepts_cnt);
+		BUG_ON(atomic_read(&cm_node->listener->pend_accepts_cnt) < 0);
+	}
+	WARN_ON(cm_node->send_entry);
+	if (cm_node->recv_entry)
+		handle_recv_entry(cm_node, 0);
+	if (cm_node->listener) {
+		mini_cm_dec_refcnt_listen(cm_core, cm_node->listener, 0);
+	} else {
+		if (cm_node->apbvt_set && cm_node->nesvnic) {
+			nes_manage_apbvt(cm_node->nesvnic, cm_node->mapped_loc_port,
+					 PCI_FUNC(cm_node->nesvnic->nesdev->pcidev->devfn),
+					 NES_MANAGE_APBVT_DEL);
+		}
+		nes_debug(NES_DBG_NLMSG, "Delete APBVT mapped_loc_port = %04X\n",
+					cm_node->mapped_loc_port);
+		nes_remove_mapinfo(cm_node->loc_addr, cm_node->loc_port,
+			cm_node->mapped_loc_addr, cm_node->mapped_loc_port);
+	}
+
+	atomic_dec(&cm_core->node_cnt);
+	atomic_inc(&cm_nodes_destroyed);
+	nesqp = cm_node->nesqp;
+	if (nesqp) {
+		nesqp->cm_node = NULL;
+		nes_rem_ref(&nesqp->ibqp);
+		cm_node->nesqp = NULL;
+	}
+
+	kfree(cm_node);
+	return 0;
+}
+
+/**
+ * process_options
+ */
+static int process_options(struct nes_cm_node *cm_node, u8 *optionsloc,
+			   u32 optionsize, u32 syn_packet)
+{
+	u32 tmp;
+	u32 offset = 0;
+	union all_known_options *all_options;
+	char got_mss_option = 0;
+
+	while (offset < optionsize) {
+		all_options = (union all_known_options *)(optionsloc + offset);
+		switch (all_options->as_base.optionnum) {
+		case OPTION_NUMBER_END:
+			offset = optionsize;
+			break;
+		case OPTION_NUMBER_NONE:
+			offset += 1;
+			continue;
+		case OPTION_NUMBER_MSS:
+			nes_debug(NES_DBG_CM, "%s: MSS Length: %d Offset: %d "
+				  "Size: %d\n", __func__,
+				  all_options->as_mss.length, offset, optionsize);
+			got_mss_option = 1;
+			if (all_options->as_mss.length != 4) {
+				return 1;
+			} else {
+				tmp = ntohs(all_options->as_mss.mss);
+				if (tmp > 0 && tmp <
+				    cm_node->tcp_cntxt.mss)
+					cm_node->tcp_cntxt.mss = tmp;
+			}
+			break;
+		case OPTION_NUMBER_WINDOW_SCALE:
+			cm_node->tcp_cntxt.snd_wscale =
+				all_options->as_windowscale.shiftcount;
+			break;
+		default:
+			nes_debug(NES_DBG_CM, "TCP Option not understood: %x\n",
+				  all_options->as_base.optionnum);
+			break;
+		}
+		offset += all_options->as_base.length;
+	}
+	if ((!got_mss_option) && (syn_packet))
+		cm_node->tcp_cntxt.mss = NES_CM_DEFAULT_MSS;
+	return 0;
+}
+
+static void drop_packet(struct sk_buff *skb)
+{
+	atomic_inc(&cm_accel_dropped_pkts);
+	dev_kfree_skb_any(skb);
+}
+
+static void handle_fin_pkt(struct nes_cm_node *cm_node)
+{
+	nes_debug(NES_DBG_CM, "Received FIN, cm_node = %p, state = %u. "
+		  "refcnt=%d\n", cm_node, cm_node->state,
+		  atomic_read(&cm_node->ref_count));
+	switch (cm_node->state) {
+	case NES_CM_STATE_SYN_RCVD:
+	case NES_CM_STATE_SYN_SENT:
+	case NES_CM_STATE_ESTABLISHED:
+	case NES_CM_STATE_MPAREJ_RCVD:
+		cm_node->tcp_cntxt.rcv_nxt++;
+		cleanup_retrans_entry(cm_node);
+		cm_node->state = NES_CM_STATE_LAST_ACK;
+		send_fin(cm_node, NULL);
+		break;
+	case NES_CM_STATE_MPAREQ_SENT:
+		create_event(cm_node, NES_CM_EVENT_ABORTED);
+		cm_node->tcp_cntxt.rcv_nxt++;
+		cleanup_retrans_entry(cm_node);
+		cm_node->state = NES_CM_STATE_CLOSED;
+		add_ref_cm_node(cm_node);
+		send_reset(cm_node, NULL);
+		break;
+	case NES_CM_STATE_FIN_WAIT1:
+		cm_node->tcp_cntxt.rcv_nxt++;
+		cleanup_retrans_entry(cm_node);
+		cm_node->state = NES_CM_STATE_CLOSING;
+		send_ack(cm_node, NULL);
+		/* Wait for ACK as this is simultaneous close..
+		* After we receive ACK, do not send anything..
+		* Just rm the node.. Done.. */
+		break;
+	case NES_CM_STATE_FIN_WAIT2:
+		cm_node->tcp_cntxt.rcv_nxt++;
+		cleanup_retrans_entry(cm_node);
+		cm_node->state = NES_CM_STATE_TIME_WAIT;
+		send_ack(cm_node, NULL);
+		schedule_nes_timer(cm_node, NULL,  NES_TIMER_TYPE_CLOSE, 1, 0);
+		break;
+	case NES_CM_STATE_TIME_WAIT:
+		cm_node->tcp_cntxt.rcv_nxt++;
+		cleanup_retrans_entry(cm_node);
+		cm_node->state = NES_CM_STATE_CLOSED;
+		rem_ref_cm_node(cm_node->cm_core, cm_node);
+		break;
+	case NES_CM_STATE_TSA:
+	default:
+		nes_debug(NES_DBG_CM, "Error Rcvd FIN for node-%p state = %d\n",
+			cm_node, cm_node->state);
+		break;
+	}
+}
+
+
+static void handle_rst_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb,
+	struct tcphdr *tcph)
+{
+
+	int	reset = 0;	/* whether to send reset in case of err.. */
+	atomic_inc(&cm_resets_recvd);
+	nes_debug(NES_DBG_CM, "Received Reset, cm_node = %p, state = %u."
+			" refcnt=%d\n", cm_node, cm_node->state,
+			atomic_read(&cm_node->ref_count));
+	cleanup_retrans_entry(cm_node);
+	switch (cm_node->state) {
+	case NES_CM_STATE_SYN_SENT:
+	case NES_CM_STATE_MPAREQ_SENT:
+		nes_debug(NES_DBG_CM, "%s[%u] create abort for cm_node=%p "
+			"listener=%p state=%d\n", __func__, __LINE__, cm_node,
+			cm_node->listener, cm_node->state);
+		switch (cm_node->mpa_frame_rev) {
+		case IETF_MPA_V2:
+			cm_node->mpa_frame_rev = IETF_MPA_V1;
+			/* send a syn and goto syn sent state */
+			cm_node->state = NES_CM_STATE_SYN_SENT;
+			if (send_syn(cm_node, 0, NULL)) {
+				active_open_err(cm_node, skb, reset);
+			}
+			break;
+		case IETF_MPA_V1:
+		default:
+			active_open_err(cm_node, skb, reset);
+			break;
+		}
+		break;
+	case NES_CM_STATE_MPAREQ_RCVD:
+		atomic_inc(&cm_node->passive_state);
+		dev_kfree_skb_any(skb);
+		break;
+	case NES_CM_STATE_ESTABLISHED:
+	case NES_CM_STATE_SYN_RCVD:
+	case NES_CM_STATE_LISTENING:
+		nes_debug(NES_DBG_CM, "Bad state %s[%u]\n", __func__, __LINE__);
+		passive_open_err(cm_node, skb, reset);
+		break;
+	case NES_CM_STATE_TSA:
+		active_open_err(cm_node, skb, reset);
+		break;
+	case NES_CM_STATE_CLOSED:
+		drop_packet(skb);
+		break;
+	case NES_CM_STATE_FIN_WAIT2:
+	case NES_CM_STATE_FIN_WAIT1:
+	case NES_CM_STATE_LAST_ACK:
+		cm_node->cm_id->rem_ref(cm_node->cm_id);
+	case NES_CM_STATE_TIME_WAIT:
+		cm_node->state = NES_CM_STATE_CLOSED;
+		rem_ref_cm_node(cm_node->cm_core, cm_node);
+		drop_packet(skb);
+		break;
+	default:
+		drop_packet(skb);
+		break;
+	}
+}
+
+
+static void handle_rcv_mpa(struct nes_cm_node *cm_node, struct sk_buff *skb)
+{
+	int ret = 0;
+	int datasize = skb->len;
+	u8 *dataloc = skb->data;
+
+	enum nes_cm_event_type type = NES_CM_EVENT_UNKNOWN;
+	u32 res_type;
+
+	ret = parse_mpa(cm_node, dataloc, &res_type, datasize);
+	if (ret) {
+		nes_debug(NES_DBG_CM, "didn't like MPA Request\n");
+		if (cm_node->state == NES_CM_STATE_MPAREQ_SENT) {
+			nes_debug(NES_DBG_CM, "%s[%u] create abort for "
+				  "cm_node=%p listener=%p state=%d\n", __func__,
+				  __LINE__, cm_node, cm_node->listener,
+				  cm_node->state);
+			active_open_err(cm_node, skb, 1);
+		} else {
+			passive_open_err(cm_node, skb, 1);
+		}
+		return;
+	}
+
+	switch (cm_node->state) {
+	case NES_CM_STATE_ESTABLISHED:
+		if (res_type == NES_MPA_REQUEST_REJECT)
+			/*BIG problem as we are receiving the MPA.. So should
+			 * not be REJECT.. This is Passive Open.. We can
+			 * only receive it Reject for Active Open...*/
+			WARN_ON(1);
+		cm_node->state = NES_CM_STATE_MPAREQ_RCVD;
+		type = NES_CM_EVENT_MPA_REQ;
+		atomic_set(&cm_node->passive_state,
+			   NES_PASSIVE_STATE_INDICATED);
+		break;
+	case NES_CM_STATE_MPAREQ_SENT:
+		cleanup_retrans_entry(cm_node);
+		if (res_type == NES_MPA_REQUEST_REJECT) {
+			type = NES_CM_EVENT_MPA_REJECT;
+			cm_node->state = NES_CM_STATE_MPAREJ_RCVD;
+		} else {
+			type = NES_CM_EVENT_CONNECTED;
+			cm_node->state = NES_CM_STATE_TSA;
+		}
+
+		break;
+	default:
+		WARN_ON(1);
+		break;
+	}
+	dev_kfree_skb_any(skb);
+	create_event(cm_node, type);
+}
+
+static void indicate_pkt_err(struct nes_cm_node *cm_node, struct sk_buff *skb)
+{
+	switch (cm_node->state) {
+	case NES_CM_STATE_SYN_SENT:
+	case NES_CM_STATE_MPAREQ_SENT:
+		nes_debug(NES_DBG_CM, "%s[%u] create abort for cm_node=%p "
+			  "listener=%p state=%d\n", __func__, __LINE__, cm_node,
+			  cm_node->listener, cm_node->state);
+		active_open_err(cm_node, skb, 1);
+		break;
+	case NES_CM_STATE_ESTABLISHED:
+	case NES_CM_STATE_SYN_RCVD:
+		passive_open_err(cm_node, skb, 1);
+		break;
+	case NES_CM_STATE_TSA:
+	default:
+		drop_packet(skb);
+	}
+}
+
+static int check_syn(struct nes_cm_node *cm_node, struct tcphdr *tcph,
+		     struct sk_buff *skb)
+{
+	int err;
+
+	err = ((ntohl(tcph->ack_seq) == cm_node->tcp_cntxt.loc_seq_num)) ? 0 : 1;
+	if (err)
+		active_open_err(cm_node, skb, 1);
+
+	return err;
+}
+
+static int check_seq(struct nes_cm_node *cm_node, struct tcphdr *tcph,
+		     struct sk_buff *skb)
+{
+	int err = 0;
+	u32 seq;
+	u32 ack_seq;
+	u32 loc_seq_num = cm_node->tcp_cntxt.loc_seq_num;
+	u32 rcv_nxt = cm_node->tcp_cntxt.rcv_nxt;
+	u32 rcv_wnd;
+
+	seq = ntohl(tcph->seq);
+	ack_seq = ntohl(tcph->ack_seq);
+	rcv_wnd = cm_node->tcp_cntxt.rcv_wnd;
+	if (ack_seq != loc_seq_num)
+		err = 1;
+	else if (!between(seq, rcv_nxt, (rcv_nxt + rcv_wnd)))
+		err = 1;
+	if (err) {
+		nes_debug(NES_DBG_CM, "%s[%u] create abort for cm_node=%p "
+			  "listener=%p state=%d\n", __func__, __LINE__, cm_node,
+			  cm_node->listener, cm_node->state);
+		indicate_pkt_err(cm_node, skb);
+		nes_debug(NES_DBG_CM, "seq ERROR cm_node =%p seq=0x%08X "
+			  "rcv_nxt=0x%08X rcv_wnd=0x%x\n", cm_node, seq, rcv_nxt,
+			  rcv_wnd);
+	}
+	return err;
+}
+
+/*
+ * handle_syn_pkt() is for Passive node. The syn packet is received when a node
+ * is created with a listener or it may comein as rexmitted packet which in
+ * that case will be just dropped.
+ */
+static void handle_syn_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb,
+			   struct tcphdr *tcph)
+{
+	int ret;
+	u32 inc_sequence;
+	int optionsize;
+
+	optionsize = (tcph->doff << 2) - sizeof(struct tcphdr);
+	skb_trim(skb, 0);
+	inc_sequence = ntohl(tcph->seq);
+
+	switch (cm_node->state) {
+	case NES_CM_STATE_SYN_SENT:
+	case NES_CM_STATE_MPAREQ_SENT:
+		/* Rcvd syn on active open connection*/
+		active_open_err(cm_node, skb, 1);
+		break;
+	case NES_CM_STATE_LISTENING:
+		/* Passive OPEN */
+		if (atomic_read(&cm_node->listener->pend_accepts_cnt) >
+		    cm_node->listener->backlog) {
+			nes_debug(NES_DBG_CM, "drop syn due to backlog "
+				  "pressure \n");
+			cm_backlog_drops++;
+			passive_open_err(cm_node, skb, 0);
+			break;
+		}
+		ret = handle_tcp_options(cm_node, tcph, skb, optionsize,
+					 1);
+		if (ret) {
+			passive_open_err(cm_node, skb, 0);
+			/* drop pkt */
+			break;
+		}
+		cm_node->tcp_cntxt.rcv_nxt = inc_sequence + 1;
+		BUG_ON(cm_node->send_entry);
+		cm_node->accept_pend = 1;
+		atomic_inc(&cm_node->listener->pend_accepts_cnt);
+
+		cm_node->state = NES_CM_STATE_SYN_RCVD;
+		send_syn(cm_node, 1, skb);
+		break;
+	case NES_CM_STATE_CLOSED:
+		cleanup_retrans_entry(cm_node);
+		add_ref_cm_node(cm_node);
+		send_reset(cm_node, skb);
+		break;
+	case NES_CM_STATE_TSA:
+	case NES_CM_STATE_ESTABLISHED:
+	case NES_CM_STATE_FIN_WAIT1:
+	case NES_CM_STATE_FIN_WAIT2:
+	case NES_CM_STATE_MPAREQ_RCVD:
+	case NES_CM_STATE_LAST_ACK:
+	case NES_CM_STATE_CLOSING:
+	case NES_CM_STATE_UNKNOWN:
+	default:
+		drop_packet(skb);
+		break;
+	}
+}
+
+static void handle_synack_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb,
+			      struct tcphdr *tcph)
+{
+	int ret;
+	u32 inc_sequence;
+	int optionsize;
+
+	optionsize = (tcph->doff << 2) - sizeof(struct tcphdr);
+	skb_trim(skb, 0);
+	inc_sequence = ntohl(tcph->seq);
+	switch (cm_node->state) {
+	case NES_CM_STATE_SYN_SENT:
+		cleanup_retrans_entry(cm_node);
+		/* active open */
+		if (check_syn(cm_node, tcph, skb))
+			return;
+		cm_node->tcp_cntxt.rem_ack_num = ntohl(tcph->ack_seq);
+		/* setup options */
+		ret = handle_tcp_options(cm_node, tcph, skb, optionsize, 0);
+		if (ret) {
+			nes_debug(NES_DBG_CM, "cm_node=%p tcp_options failed\n",
+				  cm_node);
+			break;
+		}
+		cleanup_retrans_entry(cm_node);
+		cm_node->tcp_cntxt.rcv_nxt = inc_sequence + 1;
+		send_mpa_request(cm_node, skb);
+		cm_node->state = NES_CM_STATE_MPAREQ_SENT;
+		break;
+	case NES_CM_STATE_MPAREQ_RCVD:
+		/* passive open, so should not be here */
+		passive_open_err(cm_node, skb, 1);
+		break;
+	case NES_CM_STATE_LISTENING:
+		cm_node->tcp_cntxt.loc_seq_num = ntohl(tcph->ack_seq);
+		cleanup_retrans_entry(cm_node);
+		cm_node->state = NES_CM_STATE_CLOSED;
+		send_reset(cm_node, skb);
+		break;
+	case NES_CM_STATE_CLOSED:
+		cm_node->tcp_cntxt.loc_seq_num = ntohl(tcph->ack_seq);
+		cleanup_retrans_entry(cm_node);
+		add_ref_cm_node(cm_node);
+		send_reset(cm_node, skb);
+		break;
+	case NES_CM_STATE_ESTABLISHED:
+	case NES_CM_STATE_FIN_WAIT1:
+	case NES_CM_STATE_FIN_WAIT2:
+	case NES_CM_STATE_LAST_ACK:
+	case NES_CM_STATE_TSA:
+	case NES_CM_STATE_CLOSING:
+	case NES_CM_STATE_UNKNOWN:
+	case NES_CM_STATE_MPAREQ_SENT:
+	default:
+		drop_packet(skb);
+		break;
+	}
+}
+
+static int handle_ack_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb,
+			  struct tcphdr *tcph)
+{
+	int datasize = 0;
+	u32 inc_sequence;
+	int ret = 0;
+	int optionsize;
+
+	optionsize = (tcph->doff << 2) - sizeof(struct tcphdr);
+
+	if (check_seq(cm_node, tcph, skb))
+		return -EINVAL;
+
+	skb_pull(skb, tcph->doff << 2);
+	inc_sequence = ntohl(tcph->seq);
+	datasize = skb->len;
+	switch (cm_node->state) {
+	case NES_CM_STATE_SYN_RCVD:
+		/* Passive OPEN */
+		cleanup_retrans_entry(cm_node);
+		ret = handle_tcp_options(cm_node, tcph, skb, optionsize, 1);
+		if (ret)
+			break;
+		cm_node->tcp_cntxt.rem_ack_num = ntohl(tcph->ack_seq);
+		cm_node->state = NES_CM_STATE_ESTABLISHED;
+		if (datasize) {
+			cm_node->tcp_cntxt.rcv_nxt = inc_sequence + datasize;
+			handle_rcv_mpa(cm_node, skb);
+		} else { /* rcvd ACK only */
+			dev_kfree_skb_any(skb);
+		}
+		break;
+	case NES_CM_STATE_ESTABLISHED:
+		/* Passive OPEN */
+		cleanup_retrans_entry(cm_node);
+		if (datasize) {
+			cm_node->tcp_cntxt.rcv_nxt = inc_sequence + datasize;
+			handle_rcv_mpa(cm_node, skb);
+		} else {
+			drop_packet(skb);
+		}
+		break;
+	case NES_CM_STATE_MPAREQ_SENT:
+		cm_node->tcp_cntxt.rem_ack_num = ntohl(tcph->ack_seq);
+		if (datasize) {
+			cm_node->tcp_cntxt.rcv_nxt = inc_sequence + datasize;
+			handle_rcv_mpa(cm_node, skb);
+		} else { /* Could be just an ack pkt.. */
+			dev_kfree_skb_any(skb);
+		}
+		break;
+	case NES_CM_STATE_LISTENING:
+		cleanup_retrans_entry(cm_node);
+		cm_node->state = NES_CM_STATE_CLOSED;
+		send_reset(cm_node, skb);
+		break;
+	case NES_CM_STATE_CLOSED:
+		cleanup_retrans_entry(cm_node);
+		add_ref_cm_node(cm_node);
+		send_reset(cm_node, skb);
+		break;
+	case NES_CM_STATE_LAST_ACK:
+	case NES_CM_STATE_CLOSING:
+		cleanup_retrans_entry(cm_node);
+		cm_node->state = NES_CM_STATE_CLOSED;
+		cm_node->cm_id->rem_ref(cm_node->cm_id);
+		rem_ref_cm_node(cm_node->cm_core, cm_node);
+		drop_packet(skb);
+		break;
+	case NES_CM_STATE_FIN_WAIT1:
+		cleanup_retrans_entry(cm_node);
+		drop_packet(skb);
+		cm_node->state = NES_CM_STATE_FIN_WAIT2;
+		break;
+	case NES_CM_STATE_SYN_SENT:
+	case NES_CM_STATE_FIN_WAIT2:
+	case NES_CM_STATE_TSA:
+	case NES_CM_STATE_MPAREQ_RCVD:
+	case NES_CM_STATE_UNKNOWN:
+	default:
+		cleanup_retrans_entry(cm_node);
+		drop_packet(skb);
+		break;
+	}
+	return ret;
+}
+
+
+
+static int handle_tcp_options(struct nes_cm_node *cm_node, struct tcphdr *tcph,
+			      struct sk_buff *skb, int optionsize, int passive)
+{
+	u8 *optionsloc = (u8 *)&tcph[1];
+
+	if (optionsize) {
+		if (process_options(cm_node, optionsloc, optionsize,
+				    (u32)tcph->syn)) {
+			nes_debug(NES_DBG_CM, "%s: Node %p, Sending RESET\n",
+				  __func__, cm_node);
+			if (passive)
+				passive_open_err(cm_node, skb, 1);
+			else
+				active_open_err(cm_node, skb, 1);
+			return 1;
+		}
+	}
+
+	cm_node->tcp_cntxt.snd_wnd = ntohs(tcph->window) <<
+				     cm_node->tcp_cntxt.snd_wscale;
+
+	if (cm_node->tcp_cntxt.snd_wnd > cm_node->tcp_cntxt.max_snd_wnd)
+		cm_node->tcp_cntxt.max_snd_wnd = cm_node->tcp_cntxt.snd_wnd;
+	return 0;
+}
+
+/*
+ * active_open_err() will send reset() if flag set..
+ * It will also send ABORT event.
+ */
+static void active_open_err(struct nes_cm_node *cm_node, struct sk_buff *skb,
+			    int reset)
+{
+	cleanup_retrans_entry(cm_node);
+	if (reset) {
+		nes_debug(NES_DBG_CM, "ERROR active err called for cm_node=%p, "
+			  "state=%d\n", cm_node, cm_node->state);
+		add_ref_cm_node(cm_node);
+		send_reset(cm_node, skb);
+	} else {
+		dev_kfree_skb_any(skb);
+	}
+
+	cm_node->state = NES_CM_STATE_CLOSED;
+	create_event(cm_node, NES_CM_EVENT_ABORTED);
+}
+
+/*
+ * passive_open_err() will either do a reset() or will free up the skb and
+ * remove the cm_node.
+ */
+static void passive_open_err(struct nes_cm_node *cm_node, struct sk_buff *skb,
+			     int reset)
+{
+	cleanup_retrans_entry(cm_node);
+	cm_node->state = NES_CM_STATE_CLOSED;
+	if (reset) {
+		nes_debug(NES_DBG_CM, "passive_open_err sending RST for "
+			  "cm_node=%p state =%d\n", cm_node, cm_node->state);
+		send_reset(cm_node, skb);
+	} else {
+		dev_kfree_skb_any(skb);
+		rem_ref_cm_node(cm_node->cm_core, cm_node);
+	}
+}
+
+/*
+ * free_retrans_entry() routines assumes that the retrans_list_lock has
+ * been acquired before calling.
+ */
+static void free_retrans_entry(struct nes_cm_node *cm_node)
+{
+	struct nes_timer_entry *send_entry;
+
+	send_entry = cm_node->send_entry;
+	if (send_entry) {
+		cm_node->send_entry = NULL;
+		dev_kfree_skb_any(send_entry->skb);
+		kfree(send_entry);
+		rem_ref_cm_node(cm_node->cm_core, cm_node);
+	}
+}
+
+static void cleanup_retrans_entry(struct nes_cm_node *cm_node)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&cm_node->retrans_list_lock, flags);
+	free_retrans_entry(cm_node);
+	spin_unlock_irqrestore(&cm_node->retrans_list_lock, flags);
+}
+
+/**
+ * process_packet
+ * Returns skb if to be freed, else it will return NULL if already used..
+ */
+static void process_packet(struct nes_cm_node *cm_node, struct sk_buff *skb,
+			   struct nes_cm_core *cm_core)
+{
+	enum nes_tcpip_pkt_type pkt_type = NES_PKT_TYPE_UNKNOWN;
+	struct tcphdr *tcph = tcp_hdr(skb);
+	u32 fin_set = 0;
+	int ret = 0;
+
+	skb_pull(skb, ip_hdr(skb)->ihl << 2);
+
+	nes_debug(NES_DBG_CM, "process_packet: cm_node=%p state =%d syn=%d "
+		  "ack=%d rst=%d fin=%d\n", cm_node, cm_node->state, tcph->syn,
+		  tcph->ack, tcph->rst, tcph->fin);
+
+	if (tcph->rst) {
+		pkt_type = NES_PKT_TYPE_RST;
+	} else if (tcph->syn) {
+		pkt_type = NES_PKT_TYPE_SYN;
+		if (tcph->ack)
+			pkt_type = NES_PKT_TYPE_SYNACK;
+	} else if (tcph->ack) {
+		pkt_type = NES_PKT_TYPE_ACK;
+	}
+	if (tcph->fin)
+		fin_set = 1;
+
+	switch (pkt_type) {
+	case NES_PKT_TYPE_SYN:
+		handle_syn_pkt(cm_node, skb, tcph);
+		break;
+	case NES_PKT_TYPE_SYNACK:
+		handle_synack_pkt(cm_node, skb, tcph);
+		break;
+	case NES_PKT_TYPE_ACK:
+		ret = handle_ack_pkt(cm_node, skb, tcph);
+		if (fin_set && !ret)
+			handle_fin_pkt(cm_node);
+		break;
+	case NES_PKT_TYPE_RST:
+		handle_rst_pkt(cm_node, skb, tcph);
+		break;
+	default:
+		if ((fin_set) && (!check_seq(cm_node, tcph, skb)))
+			handle_fin_pkt(cm_node);
+		drop_packet(skb);
+		break;
+	}
+}
+
+/**
+ * mini_cm_listen - create a listen node with params
+ */
+static struct nes_cm_listener *mini_cm_listen(struct nes_cm_core *cm_core,
+			struct nes_vnic *nesvnic, struct nes_cm_info *cm_info)
+{
+	struct nes_cm_listener *listener;
+	struct iwpm_dev_data pm_reg_msg;
+	struct iwpm_sa_data pm_msg;
+	unsigned long flags;
+	int iwpm_err = 0;
+
+	nes_debug(NES_DBG_CM, "Search for 0x%08x : 0x%04x\n",
+		  cm_info->loc_addr, cm_info->loc_port);
+
+	/* cannot have multiple matching listeners */
+	listener = find_listener(cm_core, cm_info->loc_addr, cm_info->loc_port,
+				NES_CM_LISTENER_EITHER_STATE, 1);
+
+	if (listener && listener->listener_state == NES_CM_LISTENER_ACTIVE_STATE) {
+		/* find automatically incs ref count ??? */
+		atomic_dec(&listener->ref_count);
+		nes_debug(NES_DBG_CM, "Not creating listener since it already exists\n");
+		return NULL;
+	}
+
+	if (!listener) {
+		nes_form_reg_msg(nesvnic, &pm_reg_msg);
+		iwpm_err = iwpm_register_pid(&pm_reg_msg, RDMA_NL_NES);
+		if (iwpm_err) {
+			nes_debug(NES_DBG_NLMSG,
+			"Port Mapper reg pid fail (err = %d).\n", iwpm_err);
+		}
+		if (iwpm_valid_pid() && !iwpm_err) {
+			nes_form_pm_msg(cm_info, &pm_msg);
+			iwpm_err = iwpm_add_mapping(&pm_msg, RDMA_NL_NES);
+			if (iwpm_err)
+				nes_debug(NES_DBG_NLMSG,
+				"Port Mapper query fail (err = %d).\n", iwpm_err);
+			else
+				nes_record_pm_msg(cm_info, &pm_msg);
+		}
+
+		/* create a CM listen node (1/2 node to compare incoming traffic to) */
+		listener = kzalloc(sizeof(*listener), GFP_ATOMIC);
+		if (!listener) {
+			nes_debug(NES_DBG_CM, "Not creating listener memory allocation failed\n");
+			return NULL;
+		}
+
+		listener->loc_addr = cm_info->loc_addr;
+		listener->loc_port = cm_info->loc_port;
+		listener->mapped_loc_addr = cm_info->mapped_loc_addr;
+		listener->mapped_loc_port = cm_info->mapped_loc_port;
+		listener->reused_node = 0;
+
+		atomic_set(&listener->ref_count, 1);
+	}
+	/* pasive case */
+	/* find already inc'ed the ref count */
+	else {
+		listener->reused_node = 1;
+	}
+
+	listener->cm_id = cm_info->cm_id;
+	atomic_set(&listener->pend_accepts_cnt, 0);
+	listener->cm_core = cm_core;
+	listener->nesvnic = nesvnic;
+	atomic_inc(&cm_core->node_cnt);
+
+	listener->conn_type = cm_info->conn_type;
+	listener->backlog = cm_info->backlog;
+	listener->listener_state = NES_CM_LISTENER_ACTIVE_STATE;
+
+	if (!listener->reused_node) {
+		spin_lock_irqsave(&cm_core->listen_list_lock, flags);
+		list_add(&listener->list, &cm_core->listen_list.list);
+		spin_unlock_irqrestore(&cm_core->listen_list_lock, flags);
+		atomic_inc(&cm_core->listen_node_cnt);
+	}
+
+	nes_debug(NES_DBG_CM, "Api - listen(): addr=0x%08X, port=0x%04x,"
+		  " listener = %p, backlog = %d, cm_id = %p.\n",
+		  cm_info->loc_addr, cm_info->loc_port,
+		  listener, listener->backlog, listener->cm_id);
+
+	return listener;
+}
+
+
+/**
+ * mini_cm_connect - make a connection node with params
+ */
+static struct nes_cm_node *mini_cm_connect(struct nes_cm_core *cm_core,
+					   struct nes_vnic *nesvnic, u16 private_data_len,
+					   void *private_data, struct nes_cm_info *cm_info)
+{
+	int ret = 0;
+	struct nes_cm_node *cm_node;
+	struct nes_cm_listener *loopbackremotelistener;
+	struct nes_cm_node *loopbackremotenode;
+	struct nes_cm_info loopback_cm_info;
+	u8 *start_buff;
+
+	/* create a CM connection node */
+	cm_node = make_cm_node(cm_core, nesvnic, cm_info, NULL);
+	if (!cm_node)
+		return NULL;
+
+	/* set our node side to client (active) side */
+	cm_node->tcp_cntxt.client = 1;
+	cm_node->tcp_cntxt.rcv_wscale = NES_CM_DEFAULT_RCV_WND_SCALE;
+
+	if (cm_info->loc_addr == cm_info->rem_addr) {
+		loopbackremotelistener = find_listener(cm_core,
+			cm_node->mapped_loc_addr, cm_node->mapped_rem_port,
+			NES_CM_LISTENER_ACTIVE_STATE, 0);
+		if (loopbackremotelistener == NULL) {
+			create_event(cm_node, NES_CM_EVENT_ABORTED);
+		} else {
+			loopback_cm_info = *cm_info;
+			loopback_cm_info.loc_port = cm_info->rem_port;
+			loopback_cm_info.rem_port = cm_info->loc_port;
+			loopback_cm_info.mapped_loc_port =
+				cm_info->mapped_rem_port;
+			loopback_cm_info.mapped_rem_port =
+				cm_info->mapped_loc_port;
+			loopback_cm_info.cm_id = loopbackremotelistener->cm_id;
+			loopbackremotenode = make_cm_node(cm_core, nesvnic,
+							  &loopback_cm_info, loopbackremotelistener);
+			if (!loopbackremotenode) {
+				rem_ref_cm_node(cm_node->cm_core, cm_node);
+				return NULL;
+			}
+			atomic_inc(&cm_loopbacks);
+			loopbackremotenode->loopbackpartner = cm_node;
+			loopbackremotenode->tcp_cntxt.rcv_wscale =
+				NES_CM_DEFAULT_RCV_WND_SCALE;
+			cm_node->loopbackpartner = loopbackremotenode;
+			memcpy(loopbackremotenode->mpa_frame_buf, private_data,
+			       private_data_len);
+			loopbackremotenode->mpa_frame_size = private_data_len;
+
+			/* we are done handling this state. */
+			/* set node to a TSA state */
+			cm_node->state = NES_CM_STATE_TSA;
+			cm_node->tcp_cntxt.rcv_nxt =
+				loopbackremotenode->tcp_cntxt.loc_seq_num;
+			loopbackremotenode->tcp_cntxt.rcv_nxt =
+				cm_node->tcp_cntxt.loc_seq_num;
+			cm_node->tcp_cntxt.max_snd_wnd =
+				loopbackremotenode->tcp_cntxt.rcv_wnd;
+			loopbackremotenode->tcp_cntxt.max_snd_wnd =
+				cm_node->tcp_cntxt.rcv_wnd;
+			cm_node->tcp_cntxt.snd_wnd =
+				loopbackremotenode->tcp_cntxt.rcv_wnd;
+			loopbackremotenode->tcp_cntxt.snd_wnd =
+				cm_node->tcp_cntxt.rcv_wnd;
+			cm_node->tcp_cntxt.snd_wscale =
+				loopbackremotenode->tcp_cntxt.rcv_wscale;
+			loopbackremotenode->tcp_cntxt.snd_wscale =
+				cm_node->tcp_cntxt.rcv_wscale;
+			loopbackremotenode->state = NES_CM_STATE_MPAREQ_RCVD;
+			create_event(loopbackremotenode, NES_CM_EVENT_MPA_REQ);
+		}
+		return cm_node;
+	}
+
+	start_buff = &cm_node->mpa_frame_buf[0] + sizeof(struct ietf_mpa_v2);
+	cm_node->mpa_frame_size = private_data_len;
+
+	memcpy(start_buff, private_data, private_data_len);
+
+	/* send a syn and goto syn sent state */
+	cm_node->state = NES_CM_STATE_SYN_SENT;
+	ret = send_syn(cm_node, 0, NULL);
+
+	if (ret) {
+		/* error in sending the syn free up the cm_node struct */
+		nes_debug(NES_DBG_CM, "Api - connect() FAILED: dest "
+			  "addr=0x%08X, port=0x%04x, cm_node=%p, cm_id = %p.\n",
+			  cm_node->rem_addr, cm_node->rem_port, cm_node,
+			  cm_node->cm_id);
+		rem_ref_cm_node(cm_node->cm_core, cm_node);
+		cm_node = NULL;
+	}
+
+	if (cm_node) {
+		nes_debug(NES_DBG_CM, "Api - connect(): dest addr=0x%08X,"
+			  "port=0x%04x, cm_node=%p, cm_id = %p.\n",
+			  cm_node->rem_addr, cm_node->rem_port, cm_node,
+			  cm_node->cm_id);
+	}
+
+	return cm_node;
+}
+
+
+/**
+ * mini_cm_accept - accept a connection
+ * This function is never called
+ */
+static int mini_cm_accept(struct nes_cm_core *cm_core, struct nes_cm_node *cm_node)
+{
+	return 0;
+}
+
+
+/**
+ * mini_cm_reject - reject and teardown a connection
+ */
+static int mini_cm_reject(struct nes_cm_core *cm_core, struct nes_cm_node *cm_node)
+{
+	int ret = 0;
+	int err = 0;
+	int passive_state;
+	struct nes_cm_event event;
+	struct iw_cm_id *cm_id = cm_node->cm_id;
+	struct nes_cm_node *loopback = cm_node->loopbackpartner;
+
+	nes_debug(NES_DBG_CM, "%s cm_node=%p type=%d state=%d\n",
+		  __func__, cm_node, cm_node->tcp_cntxt.client, cm_node->state);
+
+	if (cm_node->tcp_cntxt.client)
+		return ret;
+	cleanup_retrans_entry(cm_node);
+
+	if (!loopback) {
+		passive_state = atomic_add_return(1, &cm_node->passive_state);
+		if (passive_state == NES_SEND_RESET_EVENT) {
+			cm_node->state = NES_CM_STATE_CLOSED;
+			rem_ref_cm_node(cm_core, cm_node);
+		} else {
+			if (cm_node->state == NES_CM_STATE_LISTENER_DESTROYED) {
+				rem_ref_cm_node(cm_core, cm_node);
+			} else {
+				ret = send_mpa_reject(cm_node);
+				if (ret) {
+					cm_node->state = NES_CM_STATE_CLOSED;
+					err = send_reset(cm_node, NULL);
+					if (err)
+						WARN_ON(1);
+				} else {
+					cm_id->add_ref(cm_id);
+				}
+			}
+		}
+	} else {
+		cm_node->cm_id = NULL;
+		if (cm_node->state == NES_CM_STATE_LISTENER_DESTROYED) {
+			rem_ref_cm_node(cm_core, cm_node);
+			rem_ref_cm_node(cm_core, loopback);
+		} else {
+			event.cm_node = loopback;
+			event.cm_info.rem_addr = loopback->rem_addr;
+			event.cm_info.loc_addr = loopback->loc_addr;
+			event.cm_info.rem_port = loopback->rem_port;
+			event.cm_info.loc_port = loopback->loc_port;
+			event.cm_info.cm_id = loopback->cm_id;
+			cm_event_mpa_reject(&event);
+			rem_ref_cm_node(cm_core, cm_node);
+			loopback->state = NES_CM_STATE_CLOSING;
+
+			cm_id = loopback->cm_id;
+			rem_ref_cm_node(cm_core, loopback);
+			cm_id->rem_ref(cm_id);
+		}
+	}
+
+	return ret;
+}
+
+
+/**
+ * mini_cm_close
+ */
+static int mini_cm_close(struct nes_cm_core *cm_core, struct nes_cm_node *cm_node)
+{
+	int ret = 0;
+
+	if (!cm_core || !cm_node)
+		return -EINVAL;
+
+	switch (cm_node->state) {
+	case NES_CM_STATE_SYN_RCVD:
+	case NES_CM_STATE_SYN_SENT:
+	case NES_CM_STATE_ONE_SIDE_ESTABLISHED:
+	case NES_CM_STATE_ESTABLISHED:
+	case NES_CM_STATE_ACCEPTING:
+	case NES_CM_STATE_MPAREQ_SENT:
+	case NES_CM_STATE_MPAREQ_RCVD:
+		cleanup_retrans_entry(cm_node);
+		send_reset(cm_node, NULL);
+		break;
+	case NES_CM_STATE_CLOSE_WAIT:
+		cm_node->state = NES_CM_STATE_LAST_ACK;
+		send_fin(cm_node, NULL);
+		break;
+	case NES_CM_STATE_FIN_WAIT1:
+	case NES_CM_STATE_FIN_WAIT2:
+	case NES_CM_STATE_LAST_ACK:
+	case NES_CM_STATE_TIME_WAIT:
+	case NES_CM_STATE_CLOSING:
+		ret = -1;
+		break;
+	case NES_CM_STATE_LISTENING:
+		cleanup_retrans_entry(cm_node);
+		send_reset(cm_node, NULL);
+		break;
+	case NES_CM_STATE_MPAREJ_RCVD:
+	case NES_CM_STATE_UNKNOWN:
+	case NES_CM_STATE_INITED:
+	case NES_CM_STATE_CLOSED:
+	case NES_CM_STATE_LISTENER_DESTROYED:
+		ret = rem_ref_cm_node(cm_core, cm_node);
+		break;
+	case NES_CM_STATE_TSA:
+		if (cm_node->send_entry)
+			printk(KERN_ERR "ERROR Close got called from STATE_TSA "
+			       "send_entry=%p\n", cm_node->send_entry);
+		ret = rem_ref_cm_node(cm_core, cm_node);
+		break;
+	}
+	return ret;
+}
+
+
+/**
+ * recv_pkt - recv an ETHERNET packet, and process it through CM
+ * node state machine
+ */
+static int mini_cm_recv_pkt(struct nes_cm_core *cm_core,
+			    struct nes_vnic *nesvnic, struct sk_buff *skb)
+{
+	struct nes_cm_node *cm_node = NULL;
+	struct nes_cm_listener *listener = NULL;
+	struct iphdr *iph;
+	struct tcphdr *tcph;
+	struct nes_cm_info nfo;
+	int skb_handled = 1;
+	__be32 tmp_daddr, tmp_saddr;
+
+	if (!skb)
+		return 0;
+	if (skb->len < sizeof(struct iphdr) + sizeof(struct tcphdr))
+		return 0;
+
+	iph = (struct iphdr *)skb->data;
+	tcph = (struct tcphdr *)(skb->data + sizeof(struct iphdr));
+
+	nfo.loc_addr = ntohl(iph->daddr);
+	nfo.loc_port = ntohs(tcph->dest);
+	nfo.rem_addr = ntohl(iph->saddr);
+	nfo.rem_port = ntohs(tcph->source);
+
+	/* If port mapper is available these should be mapped address info */
+	nfo.mapped_loc_addr = ntohl(iph->daddr);
+	nfo.mapped_loc_port = ntohs(tcph->dest);
+	nfo.mapped_rem_addr = ntohl(iph->saddr);
+	nfo.mapped_rem_port = ntohs(tcph->source);
+
+	tmp_daddr = cpu_to_be32(iph->daddr);
+	tmp_saddr = cpu_to_be32(iph->saddr);
+
+	nes_debug(NES_DBG_CM, "Received packet: dest=%pI4:0x%04X src=%pI4:0x%04X\n",
+		  &tmp_daddr, tcph->dest, &tmp_saddr, tcph->source);
+
+	do {
+		cm_node = find_node(cm_core,
+				    nfo.mapped_rem_port, nfo.mapped_rem_addr,
+				    nfo.mapped_loc_port, nfo.mapped_loc_addr);
+
+		if (!cm_node) {
+			/* Only type of packet accepted are for */
+			/* the PASSIVE open (syn only) */
+			if ((!tcph->syn) || (tcph->ack)) {
+				skb_handled = 0;
+				break;
+			}
+			listener = find_listener(cm_core, nfo.mapped_loc_addr,
+					nfo.mapped_loc_port,
+					NES_CM_LISTENER_ACTIVE_STATE, 0);
+			if (!listener) {
+				nfo.cm_id = NULL;
+				nfo.conn_type = 0;
+				nes_debug(NES_DBG_CM, "Unable to find listener for the pkt\n");
+				skb_handled = 0;
+				break;
+			}
+			nfo.cm_id = listener->cm_id;
+			nfo.conn_type = listener->conn_type;
+			cm_node = make_cm_node(cm_core, nesvnic, &nfo,
+					       listener);
+			if (!cm_node) {
+				nes_debug(NES_DBG_CM, "Unable to allocate "
+					  "node\n");
+				cm_packets_dropped++;
+				atomic_dec(&listener->ref_count);
+				dev_kfree_skb_any(skb);
+				break;
+			}
+			if (!tcph->rst && !tcph->fin) {
+				cm_node->state = NES_CM_STATE_LISTENING;
+			} else {
+				cm_packets_dropped++;
+				rem_ref_cm_node(cm_core, cm_node);
+				dev_kfree_skb_any(skb);
+				break;
+			}
+			add_ref_cm_node(cm_node);
+		} else if (cm_node->state == NES_CM_STATE_TSA) {
+			if (cm_node->nesqp->pau_mode)
+				nes_queue_mgt_skbs(skb, nesvnic, cm_node->nesqp);
+			else {
+				rem_ref_cm_node(cm_core, cm_node);
+				atomic_inc(&cm_accel_dropped_pkts);
+				dev_kfree_skb_any(skb);
+			}
+			break;
+		}
+		skb_reset_network_header(skb);
+		skb_set_transport_header(skb, sizeof(*tcph));
+		skb->len = ntohs(iph->tot_len);
+		process_packet(cm_node, skb, cm_core);
+		rem_ref_cm_node(cm_core, cm_node);
+	} while (0);
+	return skb_handled;
+}
+
+
+/**
+ * nes_cm_alloc_core - allocate a top level instance of a cm core
+ */
+static struct nes_cm_core *nes_cm_alloc_core(void)
+{
+	struct nes_cm_core *cm_core;
+
+	/* setup the CM core */
+	/* alloc top level core control structure */
+	cm_core = kzalloc(sizeof(*cm_core), GFP_KERNEL);
+	if (!cm_core)
+		return NULL;
+
+	INIT_LIST_HEAD(&cm_core->connected_nodes);
+	init_timer(&cm_core->tcp_timer);
+	cm_core->tcp_timer.function = nes_cm_timer_tick;
+
+	cm_core->mtu = NES_CM_DEFAULT_MTU;
+	cm_core->state = NES_CM_STATE_INITED;
+	cm_core->free_tx_pkt_max = NES_CM_DEFAULT_FREE_PKTS;
+
+	atomic_set(&cm_core->events_posted, 0);
+
+	cm_core->api = &nes_cm_api;
+
+	spin_lock_init(&cm_core->ht_lock);
+	spin_lock_init(&cm_core->listen_list_lock);
+
+	INIT_LIST_HEAD(&cm_core->listen_list.list);
+
+	nes_debug(NES_DBG_CM, "Init CM Core completed -- cm_core=%p\n", cm_core);
+
+	nes_debug(NES_DBG_CM, "Enable QUEUE EVENTS\n");
+	cm_core->event_wq = create_singlethread_workqueue("nesewq");
+	cm_core->post_event = nes_cm_post_event;
+	nes_debug(NES_DBG_CM, "Enable QUEUE DISCONNECTS\n");
+	cm_core->disconn_wq = create_singlethread_workqueue("nesdwq");
+
+	print_core(cm_core);
+	return cm_core;
+}
+
+
+/**
+ * mini_cm_dealloc_core - deallocate a top level instance of a cm core
+ */
+static int mini_cm_dealloc_core(struct nes_cm_core *cm_core)
+{
+	nes_debug(NES_DBG_CM, "De-Alloc CM Core (%p)\n", cm_core);
+
+	if (!cm_core)
+		return -EINVAL;
+
+	barrier();
+
+	if (timer_pending(&cm_core->tcp_timer))
+		del_timer(&cm_core->tcp_timer);
+
+	destroy_workqueue(cm_core->event_wq);
+	destroy_workqueue(cm_core->disconn_wq);
+	nes_debug(NES_DBG_CM, "\n");
+	kfree(cm_core);
+
+	return 0;
+}
+
+
+/**
+ * mini_cm_get
+ */
+static int mini_cm_get(struct nes_cm_core *cm_core)
+{
+	return cm_core->state;
+}
+
+
+/**
+ * mini_cm_set
+ */
+static int mini_cm_set(struct nes_cm_core *cm_core, u32 type, u32 value)
+{
+	int ret = 0;
+
+	switch (type) {
+	case NES_CM_SET_PKT_SIZE:
+		cm_core->mtu = value;
+		break;
+	case NES_CM_SET_FREE_PKT_Q_SIZE:
+		cm_core->free_tx_pkt_max = value;
+		break;
+	default:
+		/* unknown set option */
+		ret = -EINVAL;
+	}
+
+	return ret;
+}
+
+
+/**
+ * nes_cm_init_tsa_conn setup HW; MPA frames must be
+ * successfully exchanged when this is called
+ */
+static int nes_cm_init_tsa_conn(struct nes_qp *nesqp, struct nes_cm_node *cm_node)
+{
+	int ret = 0;
+
+	if (!nesqp)
+		return -EINVAL;
+
+	nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_IPV4 |
+						  NES_QPCONTEXT_MISC_NO_NAGLE | NES_QPCONTEXT_MISC_DO_NOT_FRAG |
+						  NES_QPCONTEXT_MISC_DROS);
+
+	if (cm_node->tcp_cntxt.snd_wscale || cm_node->tcp_cntxt.rcv_wscale)
+		nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_WSCALE);
+
+	nesqp->nesqp_context->misc2 |= cpu_to_le32(64 << NES_QPCONTEXT_MISC2_TTL_SHIFT);
+
+	nesqp->nesqp_context->mss |= cpu_to_le32(((u32)cm_node->tcp_cntxt.mss) << 16);
+
+	nesqp->nesqp_context->tcp_state_flow_label |= cpu_to_le32(
+		(u32)NES_QPCONTEXT_TCPSTATE_EST << NES_QPCONTEXT_TCPFLOW_TCP_STATE_SHIFT);
+
+	nesqp->nesqp_context->pd_index_wscale |= cpu_to_le32(
+		(cm_node->tcp_cntxt.snd_wscale << NES_QPCONTEXT_PDWSCALE_SND_WSCALE_SHIFT) &
+		NES_QPCONTEXT_PDWSCALE_SND_WSCALE_MASK);
+
+	nesqp->nesqp_context->pd_index_wscale |= cpu_to_le32(
+		(cm_node->tcp_cntxt.rcv_wscale << NES_QPCONTEXT_PDWSCALE_RCV_WSCALE_SHIFT) &
+		NES_QPCONTEXT_PDWSCALE_RCV_WSCALE_MASK);
+
+	nesqp->nesqp_context->keepalive = cpu_to_le32(0x80);
+	nesqp->nesqp_context->ts_recent = 0;
+	nesqp->nesqp_context->ts_age = 0;
+	nesqp->nesqp_context->snd_nxt = cpu_to_le32(cm_node->tcp_cntxt.loc_seq_num);
+	nesqp->nesqp_context->snd_wnd = cpu_to_le32(cm_node->tcp_cntxt.snd_wnd);
+	nesqp->nesqp_context->rcv_nxt = cpu_to_le32(cm_node->tcp_cntxt.rcv_nxt);
+	nesqp->nesqp_context->rcv_wnd = cpu_to_le32(cm_node->tcp_cntxt.rcv_wnd <<
+						    cm_node->tcp_cntxt.rcv_wscale);
+	nesqp->nesqp_context->snd_max = cpu_to_le32(cm_node->tcp_cntxt.loc_seq_num);
+	nesqp->nesqp_context->snd_una = cpu_to_le32(cm_node->tcp_cntxt.loc_seq_num);
+	nesqp->nesqp_context->srtt = 0;
+	nesqp->nesqp_context->rttvar = cpu_to_le32(0x6);
+	nesqp->nesqp_context->ssthresh = cpu_to_le32(0x3FFFC000);
+	nesqp->nesqp_context->cwnd = cpu_to_le32(2 * cm_node->tcp_cntxt.mss);
+	nesqp->nesqp_context->snd_wl1 = cpu_to_le32(cm_node->tcp_cntxt.rcv_nxt);
+	nesqp->nesqp_context->snd_wl2 = cpu_to_le32(cm_node->tcp_cntxt.loc_seq_num);
+	nesqp->nesqp_context->max_snd_wnd = cpu_to_le32(cm_node->tcp_cntxt.max_snd_wnd);
+
+	nes_debug(NES_DBG_CM, "QP%u: rcv_nxt = 0x%08X, snd_nxt = 0x%08X,"
+		  " Setting MSS to %u, PDWscale = 0x%08X, rcv_wnd = %u, context misc = 0x%08X.\n",
+		  nesqp->hwqp.qp_id, le32_to_cpu(nesqp->nesqp_context->rcv_nxt),
+		  le32_to_cpu(nesqp->nesqp_context->snd_nxt),
+		  cm_node->tcp_cntxt.mss, le32_to_cpu(nesqp->nesqp_context->pd_index_wscale),
+		  le32_to_cpu(nesqp->nesqp_context->rcv_wnd),
+		  le32_to_cpu(nesqp->nesqp_context->misc));
+	nes_debug(NES_DBG_CM, "  snd_wnd  = 0x%08X.\n", le32_to_cpu(nesqp->nesqp_context->snd_wnd));
+	nes_debug(NES_DBG_CM, "  snd_cwnd = 0x%08X.\n", le32_to_cpu(nesqp->nesqp_context->cwnd));
+	nes_debug(NES_DBG_CM, "  max_swnd = 0x%08X.\n", le32_to_cpu(nesqp->nesqp_context->max_snd_wnd));
+
+	nes_debug(NES_DBG_CM, "Change cm_node state to TSA\n");
+	cm_node->state = NES_CM_STATE_TSA;
+
+	return ret;
+}
+
+
+/**
+ * nes_cm_disconn
+ */
+int nes_cm_disconn(struct nes_qp *nesqp)
+{
+	struct disconn_work *work;
+
+	work = kzalloc(sizeof *work, GFP_ATOMIC);
+	if (!work)
+		return -ENOMEM;  /* Timer will clean up */
+
+	nes_add_ref(&nesqp->ibqp);
+	work->nesqp = nesqp;
+	INIT_WORK(&work->work, nes_disconnect_worker);
+	queue_work(g_cm_core->disconn_wq, &work->work);
+	return 0;
+}
+
+
+/**
+ * nes_disconnect_worker
+ */
+static void nes_disconnect_worker(struct work_struct *work)
+{
+	struct disconn_work *dwork = container_of(work, struct disconn_work, work);
+	struct nes_qp *nesqp = dwork->nesqp;
+
+	kfree(dwork);
+	nes_debug(NES_DBG_CM, "processing AEQE id 0x%04X for QP%u.\n",
+		  nesqp->last_aeq, nesqp->hwqp.qp_id);
+	nes_cm_disconn_true(nesqp);
+	nes_rem_ref(&nesqp->ibqp);
+}
+
+
+/**
+ * nes_cm_disconn_true
+ */
+static int nes_cm_disconn_true(struct nes_qp *nesqp)
+{
+	unsigned long flags;
+	int ret = 0;
+	struct iw_cm_id *cm_id;
+	struct iw_cm_event cm_event;
+	struct nes_vnic *nesvnic;
+	u16 last_ae;
+	u8 original_hw_tcp_state;
+	u8 original_ibqp_state;
+	int disconn_status = 0;
+	int issue_disconn = 0;
+	int issue_close = 0;
+	int issue_flush = 0;
+	u32 flush_q = NES_CQP_FLUSH_RQ;
+	struct ib_event ibevent;
+
+	if (!nesqp) {
+		nes_debug(NES_DBG_CM, "disconnect_worker nesqp is NULL\n");
+		return -1;
+	}
+
+	spin_lock_irqsave(&nesqp->lock, flags);
+	cm_id = nesqp->cm_id;
+	/* make sure we havent already closed this connection */
+	if (!cm_id) {
+		nes_debug(NES_DBG_CM, "QP%u disconnect_worker cmid is NULL\n",
+			  nesqp->hwqp.qp_id);
+		spin_unlock_irqrestore(&nesqp->lock, flags);
+		return -1;
+	}
+
+	nesvnic = to_nesvnic(nesqp->ibqp.device);
+	nes_debug(NES_DBG_CM, "Disconnecting QP%u\n", nesqp->hwqp.qp_id);
+
+	original_hw_tcp_state = nesqp->hw_tcp_state;
+	original_ibqp_state = nesqp->ibqp_state;
+	last_ae = nesqp->last_aeq;
+
+	if (nesqp->term_flags) {
+		issue_disconn = 1;
+		issue_close = 1;
+		nesqp->cm_id = NULL;
+		del_timer(&nesqp->terminate_timer);
+		if (nesqp->flush_issued == 0) {
+			nesqp->flush_issued = 1;
+			issue_flush = 1;
+		}
+	} else if ((original_hw_tcp_state == NES_AEQE_TCP_STATE_CLOSE_WAIT) ||
+			((original_ibqp_state == IB_QPS_RTS) &&
+			(last_ae == NES_AEQE_AEID_LLP_CONNECTION_RESET))) {
+		issue_disconn = 1;
+		if (last_ae == NES_AEQE_AEID_LLP_CONNECTION_RESET)
+			disconn_status = -ECONNRESET;
+	}
+
+	if (((original_hw_tcp_state == NES_AEQE_TCP_STATE_CLOSED) ||
+		 (original_hw_tcp_state == NES_AEQE_TCP_STATE_TIME_WAIT) ||
+		 (last_ae == NES_AEQE_AEID_RDMAP_ROE_BAD_LLP_CLOSE) ||
+		 (last_ae == NES_AEQE_AEID_LLP_CONNECTION_RESET))) {
+		issue_close = 1;
+		nesqp->cm_id = NULL;
+		if (nesqp->flush_issued == 0) {
+			nesqp->flush_issued = 1;
+			issue_flush = 1;
+		}
+	}
+
+	spin_unlock_irqrestore(&nesqp->lock, flags);
+
+	if ((issue_flush) && (nesqp->destroyed == 0)) {
+		/* Flush the queue(s) */
+		if (nesqp->hw_iwarp_state >= NES_AEQE_IWARP_STATE_TERMINATE)
+			flush_q |= NES_CQP_FLUSH_SQ;
+		flush_wqes(nesvnic->nesdev, nesqp, flush_q, 1);
+
+		if (nesqp->term_flags) {
+			ibevent.device = nesqp->ibqp.device;
+			ibevent.event = nesqp->terminate_eventtype;
+			ibevent.element.qp = &nesqp->ibqp;
+			if (nesqp->ibqp.event_handler)
+				nesqp->ibqp.event_handler(&ibevent, nesqp->ibqp.qp_context);
+		}
+	}
+
+	if ((cm_id) && (cm_id->event_handler)) {
+		if (issue_disconn) {
+			atomic_inc(&cm_disconnects);
+			cm_event.event = IW_CM_EVENT_DISCONNECT;
+			cm_event.status = disconn_status;
+			cm_event.local_addr = cm_id->local_addr;
+			cm_event.remote_addr = cm_id->remote_addr;
+			cm_event.private_data = NULL;
+			cm_event.private_data_len = 0;
+
+			nes_debug(NES_DBG_CM, "Generating a CM Disconnect Event"
+				  " for  QP%u, SQ Head = %u, SQ Tail = %u. "
+				  "cm_id = %p, refcount = %u.\n",
+				  nesqp->hwqp.qp_id, nesqp->hwqp.sq_head,
+				  nesqp->hwqp.sq_tail, cm_id,
+				  atomic_read(&nesqp->refcount));
+
+			ret = cm_id->event_handler(cm_id, &cm_event);
+			if (ret)
+				nes_debug(NES_DBG_CM, "OFA CM event_handler "
+					  "returned, ret=%d\n", ret);
+		}
+
+		if (issue_close) {
+			atomic_inc(&cm_closes);
+			nes_disconnect(nesqp, 1);
+
+			cm_id->provider_data = nesqp;
+			/* Send up the close complete event */
+			cm_event.event = IW_CM_EVENT_CLOSE;
+			cm_event.status = 0;
+			cm_event.provider_data = cm_id->provider_data;
+			cm_event.local_addr = cm_id->local_addr;
+			cm_event.remote_addr = cm_id->remote_addr;
+			cm_event.private_data = NULL;
+			cm_event.private_data_len = 0;
+
+			ret = cm_id->event_handler(cm_id, &cm_event);
+			if (ret)
+				nes_debug(NES_DBG_CM, "OFA CM event_handler returned, ret=%d\n", ret);
+
+			cm_id->rem_ref(cm_id);
+		}
+	}
+
+	return 0;
+}
+
+
+/**
+ * nes_disconnect
+ */
+static int nes_disconnect(struct nes_qp *nesqp, int abrupt)
+{
+	int ret = 0;
+	struct nes_vnic *nesvnic;
+	struct nes_device *nesdev;
+	struct nes_ib_device *nesibdev;
+
+	nesvnic = to_nesvnic(nesqp->ibqp.device);
+	if (!nesvnic)
+		return -EINVAL;
+
+	nesdev = nesvnic->nesdev;
+	nesibdev = nesvnic->nesibdev;
+
+	nes_debug(NES_DBG_CM, "netdev refcnt = %u.\n",
+			netdev_refcnt_read(nesvnic->netdev));
+
+	if (nesqp->active_conn) {
+
+		/* indicate this connection is NOT active */
+		nesqp->active_conn = 0;
+	} else {
+		/* Need to free the Last Streaming Mode Message */
+		if (nesqp->ietf_frame) {
+			if (nesqp->lsmm_mr)
+				nesibdev->ibdev.dereg_mr(nesqp->lsmm_mr);
+			pci_free_consistent(nesdev->pcidev,
+					    nesqp->private_data_len + nesqp->ietf_frame_size,
+					    nesqp->ietf_frame, nesqp->ietf_frame_pbase);
+		}
+	}
+
+	/* close the CM node down if it is still active */
+	if (nesqp->cm_node) {
+		nes_debug(NES_DBG_CM, "Call close API\n");
+
+		g_cm_core->api->close(g_cm_core, nesqp->cm_node);
+	}
+
+	return ret;
+}
+
+
+/**
+ * nes_accept
+ */
+int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
+{
+	u64 u64temp;
+	struct ib_qp *ibqp;
+	struct nes_qp *nesqp;
+	struct nes_vnic *nesvnic;
+	struct nes_device *nesdev;
+	struct nes_cm_node *cm_node;
+	struct nes_adapter *adapter;
+	struct ib_qp_attr attr;
+	struct iw_cm_event cm_event;
+	struct nes_hw_qp_wqe *wqe;
+	struct nes_v4_quad nes_quad;
+	u32 crc_value;
+	int ret;
+	int passive_state;
+	struct nes_ib_device *nesibdev;
+	struct ib_mr *ibmr = NULL;
+	struct ib_phys_buf ibphysbuf;
+	struct nes_pd *nespd;
+	u64 tagged_offset;
+	u8 mpa_frame_offset = 0;
+	struct ietf_mpa_v2 *mpa_v2_frame;
+	u8 start_addr = 0;
+	u8 *start_ptr = &start_addr;
+	u8 **start_buff = &start_ptr;
+	u16 buff_len = 0;
+	struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_id->local_addr;
+	struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_id->remote_addr;
+
+	ibqp = nes_get_qp(cm_id->device, conn_param->qpn);
+	if (!ibqp)
+		return -EINVAL;
+
+	/* get all our handles */
+	nesqp = to_nesqp(ibqp);
+	nesvnic = to_nesvnic(nesqp->ibqp.device);
+	nesdev = nesvnic->nesdev;
+	adapter = nesdev->nesadapter;
+
+	cm_node = (struct nes_cm_node *)cm_id->provider_data;
+	nes_debug(NES_DBG_CM, "nes_accept: cm_node= %p nesvnic=%p, netdev=%p,"
+		"%s\n", cm_node, nesvnic, nesvnic->netdev,
+		nesvnic->netdev->name);
+
+	if (NES_CM_STATE_LISTENER_DESTROYED == cm_node->state) {
+		if (cm_node->loopbackpartner)
+			rem_ref_cm_node(cm_node->cm_core, cm_node->loopbackpartner);
+		rem_ref_cm_node(cm_node->cm_core, cm_node);
+		return -EINVAL;
+	}
+
+	passive_state = atomic_add_return(1, &cm_node->passive_state);
+	if (passive_state == NES_SEND_RESET_EVENT) {
+		rem_ref_cm_node(cm_node->cm_core, cm_node);
+		return -ECONNRESET;
+	}
+	/* associate the node with the QP */
+	nesqp->cm_node = (void *)cm_node;
+	cm_node->nesqp = nesqp;
+
+
+	nes_debug(NES_DBG_CM, "QP%u, cm_node=%p, jiffies = %lu listener = %p\n",
+		nesqp->hwqp.qp_id, cm_node, jiffies, cm_node->listener);
+	atomic_inc(&cm_accepts);
+
+	nes_debug(NES_DBG_CM, "netdev refcnt = %u.\n",
+			netdev_refcnt_read(nesvnic->netdev));
+
+	nesqp->ietf_frame_size = sizeof(struct ietf_mpa_v2);
+	/* allocate the ietf frame and space for private data */
+	nesqp->ietf_frame = pci_alloc_consistent(nesdev->pcidev,
+						 nesqp->ietf_frame_size + conn_param->private_data_len,
+						 &nesqp->ietf_frame_pbase);
+
+	if (!nesqp->ietf_frame) {
+		nes_debug(NES_DBG_CM, "Unable to allocate memory for private data\n");
+		return -ENOMEM;
+	}
+	mpa_v2_frame = (struct ietf_mpa_v2 *)nesqp->ietf_frame;
+
+	if (cm_node->mpa_frame_rev == IETF_MPA_V1)
+		mpa_frame_offset = 4;
+
+	if (cm_node->mpa_frame_rev == IETF_MPA_V1 ||
+			cm_node->mpav2_ird_ord == IETF_NO_IRD_ORD) {
+		record_ird_ord(cm_node, (u16)conn_param->ird, (u16)conn_param->ord);
+	}
+
+	memcpy(mpa_v2_frame->priv_data, conn_param->private_data,
+	       conn_param->private_data_len);
+
+	cm_build_mpa_frame(cm_node, start_buff, &buff_len, nesqp->ietf_frame, MPA_KEY_REPLY);
+	nesqp->private_data_len = conn_param->private_data_len;
+
+	/* setup our first outgoing iWarp send WQE (the IETF frame response) */
+	wqe = &nesqp->hwqp.sq_vbase[0];
+
+	if (raddr->sin_addr.s_addr != laddr->sin_addr.s_addr) {
+		u64temp = (unsigned long)nesqp;
+		nesibdev = nesvnic->nesibdev;
+		nespd = nesqp->nespd;
+		ibphysbuf.addr = nesqp->ietf_frame_pbase + mpa_frame_offset;
+		ibphysbuf.size = buff_len;
+		tagged_offset = (u64)(unsigned long)*start_buff;
+		ibmr = nesibdev->ibdev.reg_phys_mr((struct ib_pd *)nespd,
+						   &ibphysbuf, 1,
+						   IB_ACCESS_LOCAL_WRITE,
+						   &tagged_offset);
+		if (!ibmr) {
+			nes_debug(NES_DBG_CM, "Unable to register memory region"
+				  "for lSMM for cm_node = %p \n",
+				  cm_node);
+			pci_free_consistent(nesdev->pcidev,
+					    nesqp->private_data_len + nesqp->ietf_frame_size,
+					    nesqp->ietf_frame, nesqp->ietf_frame_pbase);
+			return -ENOMEM;
+		}
+
+		ibmr->pd = &nespd->ibpd;
+		ibmr->device = nespd->ibpd.device;
+		nesqp->lsmm_mr = ibmr;
+
+		u64temp |= NES_SW_CONTEXT_ALIGN >> 1;
+		set_wqe_64bit_value(wqe->wqe_words,
+				    NES_IWARP_SQ_WQE_COMP_CTX_LOW_IDX,
+				    u64temp);
+		wqe->wqe_words[NES_IWARP_SQ_WQE_MISC_IDX] =
+			cpu_to_le32(NES_IWARP_SQ_WQE_STREAMING |
+				    NES_IWARP_SQ_WQE_WRPDU);
+		wqe->wqe_words[NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX] =
+			cpu_to_le32(buff_len);
+		set_wqe_64bit_value(wqe->wqe_words,
+				    NES_IWARP_SQ_WQE_FRAG0_LOW_IDX,
+				    (u64)(unsigned long)(*start_buff));
+		wqe->wqe_words[NES_IWARP_SQ_WQE_LENGTH0_IDX] =
+			cpu_to_le32(buff_len);
+		wqe->wqe_words[NES_IWARP_SQ_WQE_STAG0_IDX] = ibmr->lkey;
+		if (nesqp->sq_kmapped) {
+			nesqp->sq_kmapped = 0;
+			kunmap(nesqp->page);
+		}
+
+		nesqp->nesqp_context->ird_ord_sizes |=
+			cpu_to_le32(NES_QPCONTEXT_ORDIRD_LSMM_PRESENT |
+				    NES_QPCONTEXT_ORDIRD_WRPDU);
+	} else {
+		nesqp->nesqp_context->ird_ord_sizes |=
+			cpu_to_le32(NES_QPCONTEXT_ORDIRD_WRPDU);
+	}
+	nesqp->skip_lsmm = 1;
+
+	/* Cache the cm_id in the qp */
+	nesqp->cm_id = cm_id;
+	cm_node->cm_id = cm_id;
+
+	/*  nesqp->cm_node = (void *)cm_id->provider_data; */
+	cm_id->provider_data = nesqp;
+	nesqp->active_conn = 0;
+
+	if (cm_node->state == NES_CM_STATE_TSA)
+		nes_debug(NES_DBG_CM, "Already state = TSA for cm_node=%p\n",
+			  cm_node);
+
+	nes_cm_init_tsa_conn(nesqp, cm_node);
+
+	nesqp->nesqp_context->tcpPorts[0] =
+				cpu_to_le16(cm_node->mapped_loc_port);
+	nesqp->nesqp_context->tcpPorts[1] =
+				cpu_to_le16(cm_node->mapped_rem_port);
+
+	nesqp->nesqp_context->ip0 = cpu_to_le32(cm_node->mapped_rem_addr);
+
+	nesqp->nesqp_context->misc2 |= cpu_to_le32(
+		(u32)PCI_FUNC(nesdev->pcidev->devfn) <<
+		NES_QPCONTEXT_MISC2_SRC_IP_SHIFT);
+
+	nesqp->nesqp_context->arp_index_vlan |=
+		cpu_to_le32(nes_arp_table(nesdev,
+					  le32_to_cpu(nesqp->nesqp_context->ip0), NULL,
+					  NES_ARP_RESOLVE) << 16);
+
+	nesqp->nesqp_context->ts_val_delta = cpu_to_le32(
+		jiffies - nes_read_indexed(nesdev, NES_IDX_TCP_NOW));
+
+	nesqp->nesqp_context->ird_index = cpu_to_le32(nesqp->hwqp.qp_id);
+
+	nesqp->nesqp_context->ird_ord_sizes |= cpu_to_le32(
+		((u32)1 << NES_QPCONTEXT_ORDIRD_IWARP_MODE_SHIFT));
+	nesqp->nesqp_context->ird_ord_sizes |=
+		cpu_to_le32((u32)cm_node->ord_size);
+
+	memset(&nes_quad, 0, sizeof(nes_quad));
+	nes_quad.DstIpAdrIndex =
+		cpu_to_le32((u32)PCI_FUNC(nesdev->pcidev->devfn) << 24);
+	nes_quad.SrcIpadr = htonl(cm_node->mapped_rem_addr);
+	nes_quad.TcpPorts[0] = htons(cm_node->mapped_rem_port);
+	nes_quad.TcpPorts[1] = htons(cm_node->mapped_loc_port);
+
+	/* Produce hash key */
+	crc_value = get_crc_value(&nes_quad);
+	nesqp->hte_index = cpu_to_be32(crc_value ^ 0xffffffff);
+	nes_debug(NES_DBG_CM, "HTE Index = 0x%08X, CRC = 0x%08X\n",
+		  nesqp->hte_index, nesqp->hte_index & adapter->hte_index_mask);
+
+	nesqp->hte_index &= adapter->hte_index_mask;
+	nesqp->nesqp_context->hte_index = cpu_to_le32(nesqp->hte_index);
+
+	cm_node->cm_core->api->accelerated(cm_node->cm_core, cm_node);
+
+	nes_debug(NES_DBG_CM, "QP%u, Destination IP = 0x%08X:0x%04X, local = "
+		  "0x%08X:0x%04X, rcv_nxt=0x%08X, snd_nxt=0x%08X, mpa + "
+		  "private data length=%u.\n", nesqp->hwqp.qp_id,
+		  ntohl(raddr->sin_addr.s_addr), ntohs(raddr->sin_port),
+		  ntohl(laddr->sin_addr.s_addr), ntohs(laddr->sin_port),
+		  le32_to_cpu(nesqp->nesqp_context->rcv_nxt),
+		  le32_to_cpu(nesqp->nesqp_context->snd_nxt),
+		  buff_len);
+
+	/* notify OF layer that accept event was successful */
+	cm_id->add_ref(cm_id);
+	nes_add_ref(&nesqp->ibqp);
+
+	cm_event.event = IW_CM_EVENT_ESTABLISHED;
+	cm_event.status = 0;
+	cm_event.provider_data = (void *)nesqp;
+	cm_event.local_addr = cm_id->local_addr;
+	cm_event.remote_addr = cm_id->remote_addr;
+	cm_event.private_data = NULL;
+	cm_event.private_data_len = 0;
+	cm_event.ird = cm_node->ird_size;
+	cm_event.ord = cm_node->ord_size;
+
+	ret = cm_id->event_handler(cm_id, &cm_event);
+	attr.qp_state = IB_QPS_RTS;
+	nes_modify_qp(&nesqp->ibqp, &attr, IB_QP_STATE, NULL);
+	if (cm_node->loopbackpartner) {
+		cm_node->loopbackpartner->mpa_frame_size =
+			nesqp->private_data_len;
+		/* copy entire MPA frame to our cm_node's frame */
+		memcpy(cm_node->loopbackpartner->mpa_frame_buf,
+		       conn_param->private_data, conn_param->private_data_len);
+		create_event(cm_node->loopbackpartner, NES_CM_EVENT_CONNECTED);
+	}
+	if (ret)
+		printk(KERN_ERR "%s[%u] OFA CM event_handler returned, "
+		       "ret=%d\n", __func__, __LINE__, ret);
+
+	return 0;
+}
+
+
+/**
+ * nes_reject
+ */
+int nes_reject(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len)
+{
+	struct nes_cm_node *cm_node;
+	struct nes_cm_node *loopback;
+	struct nes_cm_core *cm_core;
+	u8 *start_buff;
+
+	atomic_inc(&cm_rejects);
+	cm_node = (struct nes_cm_node *)cm_id->provider_data;
+	loopback = cm_node->loopbackpartner;
+	cm_core = cm_node->cm_core;
+	cm_node->cm_id = cm_id;
+
+	if (pdata_len + sizeof(struct ietf_mpa_v2) > MAX_CM_BUFFER)
+		return -EINVAL;
+
+	if (loopback) {
+		memcpy(&loopback->mpa_frame.priv_data, pdata, pdata_len);
+		loopback->mpa_frame.priv_data_len = pdata_len;
+		loopback->mpa_frame_size = pdata_len;
+	} else {
+		start_buff = &cm_node->mpa_frame_buf[0] + sizeof(struct ietf_mpa_v2);
+		cm_node->mpa_frame_size = pdata_len;
+		memcpy(start_buff, pdata, pdata_len);
+	}
+	return cm_core->api->reject(cm_core, cm_node);
+}
+
+
+/**
+ * nes_connect
+ * setup and launch cm connect node
+ */
+int nes_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
+{
+	struct ib_qp *ibqp;
+	struct nes_qp *nesqp;
+	struct nes_vnic *nesvnic;
+	struct nes_device *nesdev;
+	struct nes_cm_node *cm_node;
+	struct nes_cm_info cm_info;
+	int apbvt_set = 0;
+	struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_id->local_addr;
+	struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_id->remote_addr;
+	struct iwpm_dev_data pm_reg_msg;
+	struct iwpm_sa_data pm_msg;
+	int iwpm_err = 0;
+
+	if (cm_id->remote_addr.ss_family != AF_INET)
+		return -ENOSYS;
+	ibqp = nes_get_qp(cm_id->device, conn_param->qpn);
+	if (!ibqp)
+		return -EINVAL;
+	nesqp = to_nesqp(ibqp);
+	if (!nesqp)
+		return -EINVAL;
+	nesvnic = to_nesvnic(nesqp->ibqp.device);
+	if (!nesvnic)
+		return -EINVAL;
+	nesdev = nesvnic->nesdev;
+	if (!nesdev)
+		return -EINVAL;
+
+	if (!laddr->sin_port || !raddr->sin_port)
+		return -EINVAL;
+
+	nes_debug(NES_DBG_CM, "QP%u, current IP = 0x%08X, Destination IP = "
+		  "0x%08X:0x%04X, local = 0x%08X:0x%04X.\n", nesqp->hwqp.qp_id,
+		  ntohl(nesvnic->local_ipaddr), ntohl(raddr->sin_addr.s_addr),
+		  ntohs(raddr->sin_port), ntohl(laddr->sin_addr.s_addr),
+		  ntohs(laddr->sin_port));
+
+	atomic_inc(&cm_connects);
+	nesqp->active_conn = 1;
+
+	/* cache the cm_id in the qp */
+	nesqp->cm_id = cm_id;
+	cm_id->provider_data = nesqp;
+	nesqp->private_data_len = conn_param->private_data_len;
+
+	nes_debug(NES_DBG_CM, "requested ord = 0x%08X.\n", (u32)conn_param->ord);
+	nes_debug(NES_DBG_CM, "mpa private data len =%u\n",
+		  conn_param->private_data_len);
+
+	/* set up the connection params for the node */
+	cm_info.loc_addr = ntohl(laddr->sin_addr.s_addr);
+	cm_info.loc_port = ntohs(laddr->sin_port);
+	cm_info.rem_addr = ntohl(raddr->sin_addr.s_addr);
+	cm_info.rem_port = ntohs(raddr->sin_port);
+	cm_info.cm_id = cm_id;
+	cm_info.conn_type = NES_CM_IWARP_CONN_TYPE;
+
+	/* No port mapper available, go with the specified peer information */
+	cm_info.mapped_loc_addr = cm_info.loc_addr;
+	cm_info.mapped_loc_port = cm_info.loc_port;
+	cm_info.mapped_rem_addr = cm_info.rem_addr;
+	cm_info.mapped_rem_port = cm_info.rem_port;
+
+	nes_form_reg_msg(nesvnic, &pm_reg_msg);
+	iwpm_err = iwpm_register_pid(&pm_reg_msg, RDMA_NL_NES);
+	if (iwpm_err) {
+		nes_debug(NES_DBG_NLMSG,
+			"Port Mapper reg pid fail (err = %d).\n", iwpm_err);
+	}
+	if (iwpm_valid_pid() && !iwpm_err) {
+		nes_form_pm_msg(&cm_info, &pm_msg);
+		iwpm_err = iwpm_add_and_query_mapping(&pm_msg, RDMA_NL_NES);
+		if (iwpm_err)
+			nes_debug(NES_DBG_NLMSG,
+			"Port Mapper query fail (err = %d).\n", iwpm_err);
+		else
+			nes_record_pm_msg(&cm_info, &pm_msg);
+	}
+
+	if (laddr->sin_addr.s_addr != raddr->sin_addr.s_addr) {
+		nes_manage_apbvt(nesvnic, cm_info.mapped_loc_port,
+			PCI_FUNC(nesdev->pcidev->devfn), NES_MANAGE_APBVT_ADD);
+		apbvt_set = 1;
+	}
+
+	if (nes_create_mapinfo(&cm_info))
+		return -ENOMEM;
+
+	cm_id->add_ref(cm_id);
+
+	/* create a connect CM node connection */
+	cm_node = g_cm_core->api->connect(g_cm_core, nesvnic,
+					  conn_param->private_data_len, (void *)conn_param->private_data,
+					  &cm_info);
+	if (!cm_node) {
+		if (apbvt_set)
+			nes_manage_apbvt(nesvnic, cm_info.mapped_loc_port,
+					 PCI_FUNC(nesdev->pcidev->devfn),
+					 NES_MANAGE_APBVT_DEL);
+
+		nes_debug(NES_DBG_NLMSG, "Delete mapped_loc_port = %04X\n",
+				cm_info.mapped_loc_port);
+		nes_remove_mapinfo(cm_info.loc_addr, cm_info.loc_port,
+			cm_info.mapped_loc_addr, cm_info.mapped_loc_port);
+		cm_id->rem_ref(cm_id);
+		return -ENOMEM;
+	}
+
+	record_ird_ord(cm_node, (u16)conn_param->ird, (u16)conn_param->ord);
+	if (cm_node->send_rdma0_op == SEND_RDMA_READ_ZERO &&
+				cm_node->ord_size == 0)
+		cm_node->ord_size = 1;
+
+	cm_node->apbvt_set = apbvt_set;
+	nesqp->cm_node = cm_node;
+	cm_node->nesqp = nesqp;
+	nes_add_ref(&nesqp->ibqp);
+
+	return 0;
+}
+
+
+/**
+ * nes_create_listen
+ */
+int nes_create_listen(struct iw_cm_id *cm_id, int backlog)
+{
+	struct nes_vnic *nesvnic;
+	struct nes_cm_listener *cm_node;
+	struct nes_cm_info cm_info;
+	int err;
+	struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_id->local_addr;
+
+	nes_debug(NES_DBG_CM, "cm_id = %p, local port = 0x%04X.\n",
+		  cm_id, ntohs(laddr->sin_port));
+
+	if (cm_id->local_addr.ss_family != AF_INET)
+		return -ENOSYS;
+	nesvnic = to_nesvnic(cm_id->device);
+	if (!nesvnic)
+		return -EINVAL;
+
+	nes_debug(NES_DBG_CM, "nesvnic=%p, netdev=%p, %s\n",
+			nesvnic, nesvnic->netdev, nesvnic->netdev->name);
+
+	nes_debug(NES_DBG_CM, "nesvnic->local_ipaddr=0x%08x, sin_addr.s_addr=0x%08x\n",
+			nesvnic->local_ipaddr, laddr->sin_addr.s_addr);
+
+	/* setup listen params in our api call struct */
+	cm_info.loc_addr = ntohl(nesvnic->local_ipaddr);
+	cm_info.loc_port = ntohs(laddr->sin_port);
+	cm_info.backlog = backlog;
+	cm_info.cm_id = cm_id;
+
+	cm_info.conn_type = NES_CM_IWARP_CONN_TYPE;
+
+	/* No port mapper available, go with the specified info */
+	cm_info.mapped_loc_addr = cm_info.loc_addr;
+	cm_info.mapped_loc_port = cm_info.loc_port;
+
+	cm_node = g_cm_core->api->listen(g_cm_core, nesvnic, &cm_info);
+	if (!cm_node) {
+		printk(KERN_ERR "%s[%u] Error returned from listen API call\n",
+		       __func__, __LINE__);
+		return -ENOMEM;
+	}
+
+	cm_id->provider_data = cm_node;
+
+	if (!cm_node->reused_node) {
+		if (nes_create_mapinfo(&cm_info))
+			return -ENOMEM;
+
+		err = nes_manage_apbvt(nesvnic, cm_node->mapped_loc_port,
+				       PCI_FUNC(nesvnic->nesdev->pcidev->devfn),
+				       NES_MANAGE_APBVT_ADD);
+		if (err) {
+			printk(KERN_ERR "nes_manage_apbvt call returned %d.\n",
+			       err);
+			g_cm_core->api->stop_listener(g_cm_core, (void *)cm_node);
+			return err;
+		}
+		atomic_inc(&cm_listens_created);
+	}
+
+	cm_id->add_ref(cm_id);
+	cm_id->provider_data = (void *)cm_node;
+
+
+	return 0;
+}
+
+
+/**
+ * nes_destroy_listen
+ */
+int nes_destroy_listen(struct iw_cm_id *cm_id)
+{
+	if (cm_id->provider_data)
+		g_cm_core->api->stop_listener(g_cm_core, cm_id->provider_data);
+	else
+		nes_debug(NES_DBG_CM, "cm_id->provider_data was NULL\n");
+
+	cm_id->rem_ref(cm_id);
+
+	return 0;
+}
+
+
+/**
+ * nes_cm_recv
+ */
+int nes_cm_recv(struct sk_buff *skb, struct net_device *netdevice)
+{
+	int rc = 0;
+
+	cm_packets_received++;
+	if ((g_cm_core) && (g_cm_core->api))
+		rc = g_cm_core->api->recv_pkt(g_cm_core, netdev_priv(netdevice), skb);
+	else
+		nes_debug(NES_DBG_CM, "Unable to process packet for CM,"
+			  " cm is not setup properly.\n");
+
+	return rc;
+}
+
+
+/**
+ * nes_cm_start
+ * Start and init a cm core module
+ */
+int nes_cm_start(void)
+{
+	nes_debug(NES_DBG_CM, "\n");
+	/* create the primary CM core, pass this handle to subsequent core inits */
+	g_cm_core = nes_cm_alloc_core();
+	if (g_cm_core)
+		return 0;
+	else
+		return -ENOMEM;
+}
+
+
+/**
+ * nes_cm_stop
+ * stop and dealloc all cm core instances
+ */
+int nes_cm_stop(void)
+{
+	g_cm_core->api->destroy_cm_core(g_cm_core);
+	return 0;
+}
+
+
+/**
+ * cm_event_connected
+ * handle a connected event, setup QPs and HW
+ */
+static void cm_event_connected(struct nes_cm_event *event)
+{
+	struct nes_qp *nesqp;
+	struct nes_vnic *nesvnic;
+	struct nes_device *nesdev;
+	struct nes_cm_node *cm_node;
+	struct nes_adapter *nesadapter;
+	struct ib_qp_attr attr;
+	struct iw_cm_id *cm_id;
+	struct iw_cm_event cm_event;
+	struct nes_v4_quad nes_quad;
+	u32 crc_value;
+	int ret;
+	struct sockaddr_in *laddr;
+	struct sockaddr_in *raddr;
+	struct sockaddr_in *cm_event_laddr;
+
+	/* get all our handles */
+	cm_node = event->cm_node;
+	cm_id = cm_node->cm_id;
+	nes_debug(NES_DBG_CM, "cm_event_connected - %p - cm_id = %p\n", cm_node, cm_id);
+	nesqp = (struct nes_qp *)cm_id->provider_data;
+	nesvnic = to_nesvnic(nesqp->ibqp.device);
+	nesdev = nesvnic->nesdev;
+	nesadapter = nesdev->nesadapter;
+	laddr = (struct sockaddr_in *)&cm_id->local_addr;
+	raddr = (struct sockaddr_in *)&cm_id->remote_addr;
+	cm_event_laddr = (struct sockaddr_in *)&cm_event.local_addr;
+
+	if (nesqp->destroyed)
+		return;
+	atomic_inc(&cm_connecteds);
+	nes_debug(NES_DBG_CM, "QP%u attempting to connect to  0x%08X:0x%04X on"
+		  " local port 0x%04X. jiffies = %lu.\n",
+		  nesqp->hwqp.qp_id, ntohl(raddr->sin_addr.s_addr),
+		  ntohs(raddr->sin_port), ntohs(laddr->sin_port), jiffies);
+
+	nes_cm_init_tsa_conn(nesqp, cm_node);
+
+	/* set the QP tsa context */
+	nesqp->nesqp_context->tcpPorts[0] =
+			cpu_to_le16(cm_node->mapped_loc_port);
+	nesqp->nesqp_context->tcpPorts[1] =
+			cpu_to_le16(cm_node->mapped_rem_port);
+	nesqp->nesqp_context->ip0 = cpu_to_le32(cm_node->mapped_rem_addr);
+
+	nesqp->nesqp_context->misc2 |= cpu_to_le32(
+			(u32)PCI_FUNC(nesdev->pcidev->devfn) <<
+			NES_QPCONTEXT_MISC2_SRC_IP_SHIFT);
+	nesqp->nesqp_context->arp_index_vlan |= cpu_to_le32(
+			nes_arp_table(nesdev,
+			le32_to_cpu(nesqp->nesqp_context->ip0),
+			NULL, NES_ARP_RESOLVE) << 16);
+	nesqp->nesqp_context->ts_val_delta = cpu_to_le32(
+			jiffies - nes_read_indexed(nesdev, NES_IDX_TCP_NOW));
+	nesqp->nesqp_context->ird_index = cpu_to_le32(nesqp->hwqp.qp_id);
+	nesqp->nesqp_context->ird_ord_sizes |=
+			cpu_to_le32((u32)1 <<
+			NES_QPCONTEXT_ORDIRD_IWARP_MODE_SHIFT);
+	nesqp->nesqp_context->ird_ord_sizes |=
+			cpu_to_le32((u32)cm_node->ord_size);
+
+	/* Adjust tail for not having a LSMM */
+	/*nesqp->hwqp.sq_tail = 1;*/
+
+	build_rdma0_msg(cm_node, &nesqp);
+
+	nes_write32(nesdev->regs + NES_WQE_ALLOC,
+		    (1 << 24) | 0x00800000 | nesqp->hwqp.qp_id);
+
+	memset(&nes_quad, 0, sizeof(nes_quad));
+
+	nes_quad.DstIpAdrIndex =
+		cpu_to_le32((u32)PCI_FUNC(nesdev->pcidev->devfn) << 24);
+	nes_quad.SrcIpadr = htonl(cm_node->mapped_rem_addr);
+	nes_quad.TcpPorts[0] = htons(cm_node->mapped_rem_port);
+	nes_quad.TcpPorts[1] = htons(cm_node->mapped_loc_port);
+
+	/* Produce hash key */
+	crc_value = get_crc_value(&nes_quad);
+	nesqp->hte_index = cpu_to_be32(crc_value ^ 0xffffffff);
+	nes_debug(NES_DBG_CM, "HTE Index = 0x%08X, After CRC = 0x%08X\n",
+		  nesqp->hte_index, nesqp->hte_index & nesadapter->hte_index_mask);
+
+	nesqp->hte_index &= nesadapter->hte_index_mask;
+	nesqp->nesqp_context->hte_index = cpu_to_le32(nesqp->hte_index);
+
+	nesqp->ietf_frame = &cm_node->mpa_frame;
+	nesqp->private_data_len = (u8)cm_node->mpa_frame_size;
+	cm_node->cm_core->api->accelerated(cm_node->cm_core, cm_node);
+
+	/* notify OF layer we successfully created the requested connection */
+	cm_event.event = IW_CM_EVENT_CONNECT_REPLY;
+	cm_event.status = 0;
+	cm_event.provider_data = cm_id->provider_data;
+	cm_event_laddr->sin_family = AF_INET;
+	cm_event_laddr->sin_port = laddr->sin_port;
+	cm_event.remote_addr = cm_id->remote_addr;
+
+	cm_event.private_data = (void *)event->cm_node->mpa_frame_buf;
+	cm_event.private_data_len = (u8)event->cm_node->mpa_frame_size;
+	cm_event.ird = cm_node->ird_size;
+	cm_event.ord = cm_node->ord_size;
+
+	cm_event_laddr->sin_addr.s_addr = htonl(event->cm_info.rem_addr);
+	ret = cm_id->event_handler(cm_id, &cm_event);
+	nes_debug(NES_DBG_CM, "OFA CM event_handler returned, ret=%d\n", ret);
+
+	if (ret)
+		printk(KERN_ERR "%s[%u] OFA CM event_handler returned, "
+		       "ret=%d\n", __func__, __LINE__, ret);
+	attr.qp_state = IB_QPS_RTS;
+	nes_modify_qp(&nesqp->ibqp, &attr, IB_QP_STATE, NULL);
+
+	nes_debug(NES_DBG_CM, "Exiting connect thread for QP%u. jiffies = "
+		  "%lu\n", nesqp->hwqp.qp_id, jiffies);
+
+	return;
+}
+
+
+/**
+ * cm_event_connect_error
+ */
+static void cm_event_connect_error(struct nes_cm_event *event)
+{
+	struct nes_qp *nesqp;
+	struct iw_cm_id *cm_id;
+	struct iw_cm_event cm_event;
+	/* struct nes_cm_info cm_info; */
+	int ret;
+
+	if (!event->cm_node)
+		return;
+
+	cm_id = event->cm_node->cm_id;
+	if (!cm_id)
+		return;
+
+	nes_debug(NES_DBG_CM, "cm_node=%p, cm_id=%p\n", event->cm_node, cm_id);
+	nesqp = cm_id->provider_data;
+
+	if (!nesqp)
+		return;
+
+	/* notify OF layer about this connection error event */
+	/* cm_id->rem_ref(cm_id); */
+	nesqp->cm_id = NULL;
+	cm_id->provider_data = NULL;
+	cm_event.event = IW_CM_EVENT_CONNECT_REPLY;
+	cm_event.status = -ECONNRESET;
+	cm_event.provider_data = cm_id->provider_data;
+	cm_event.local_addr = cm_id->local_addr;
+	cm_event.remote_addr = cm_id->remote_addr;
+	cm_event.private_data = NULL;
+	cm_event.private_data_len = 0;
+
+#ifdef CONFIG_INFINIBAND_NES_DEBUG
+	{
+		struct sockaddr_in *cm_event_laddr = (struct sockaddr_in *)
+						     &cm_event.local_addr;
+		struct sockaddr_in *cm_event_raddr = (struct sockaddr_in *)
+						     &cm_event.remote_addr;
+		nes_debug(NES_DBG_CM, "call CM_EVENT REJECTED, local_addr=%08x, remote_addr=%08x\n",
+			  cm_event_laddr->sin_addr.s_addr, cm_event_raddr->sin_addr.s_addr);
+	}
+#endif
+
+	ret = cm_id->event_handler(cm_id, &cm_event);
+	nes_debug(NES_DBG_CM, "OFA CM event_handler returned, ret=%d\n", ret);
+	if (ret)
+		printk(KERN_ERR "%s[%u] OFA CM event_handler returned, "
+		       "ret=%d\n", __func__, __LINE__, ret);
+	cm_id->rem_ref(cm_id);
+
+	rem_ref_cm_node(event->cm_node->cm_core, event->cm_node);
+	return;
+}
+
+
+/**
+ * cm_event_reset
+ */
+static void cm_event_reset(struct nes_cm_event *event)
+{
+	struct nes_qp *nesqp;
+	struct iw_cm_id *cm_id;
+	struct iw_cm_event cm_event;
+	/* struct nes_cm_info cm_info; */
+	int ret;
+
+	if (!event->cm_node)
+		return;
+
+	if (!event->cm_node->cm_id)
+		return;
+
+	cm_id = event->cm_node->cm_id;
+
+	nes_debug(NES_DBG_CM, "%p - cm_id = %p\n", event->cm_node, cm_id);
+	nesqp = cm_id->provider_data;
+	if (!nesqp)
+		return;
+
+	nesqp->cm_id = NULL;
+	/* cm_id->provider_data = NULL; */
+	cm_event.event = IW_CM_EVENT_DISCONNECT;
+	cm_event.status = -ECONNRESET;
+	cm_event.provider_data = cm_id->provider_data;
+	cm_event.local_addr = cm_id->local_addr;
+	cm_event.remote_addr = cm_id->remote_addr;
+	cm_event.private_data = NULL;
+	cm_event.private_data_len = 0;
+
+	cm_id->add_ref(cm_id);
+	ret = cm_id->event_handler(cm_id, &cm_event);
+	atomic_inc(&cm_closes);
+	cm_event.event = IW_CM_EVENT_CLOSE;
+	cm_event.status = 0;
+	cm_event.provider_data = cm_id->provider_data;
+	cm_event.local_addr = cm_id->local_addr;
+	cm_event.remote_addr = cm_id->remote_addr;
+	cm_event.private_data = NULL;
+	cm_event.private_data_len = 0;
+	nes_debug(NES_DBG_CM, "NODE %p Generating CLOSE\n", event->cm_node);
+	ret = cm_id->event_handler(cm_id, &cm_event);
+
+	nes_debug(NES_DBG_CM, "OFA CM event_handler returned, ret=%d\n", ret);
+
+
+	/* notify OF layer about this connection error event */
+	cm_id->rem_ref(cm_id);
+
+	return;
+}
+
+
+/**
+ * cm_event_mpa_req
+ */
+static void cm_event_mpa_req(struct nes_cm_event *event)
+{
+	struct iw_cm_id *cm_id;
+	struct iw_cm_event cm_event;
+	int ret;
+	struct nes_cm_node *cm_node;
+	struct sockaddr_in *cm_event_laddr = (struct sockaddr_in *)
+					     &cm_event.local_addr;
+	struct sockaddr_in *cm_event_raddr = (struct sockaddr_in *)
+					     &cm_event.remote_addr;
+
+	cm_node = event->cm_node;
+	if (!cm_node)
+		return;
+	cm_id = cm_node->cm_id;
+
+	atomic_inc(&cm_connect_reqs);
+	nes_debug(NES_DBG_CM, "cm_node = %p - cm_id = %p, jiffies = %lu\n",
+		  cm_node, cm_id, jiffies);
+
+	cm_event.event = IW_CM_EVENT_CONNECT_REQUEST;
+	cm_event.status = 0;
+	cm_event.provider_data = (void *)cm_node;
+
+	cm_event_laddr->sin_family = AF_INET;
+	cm_event_laddr->sin_port = htons(event->cm_info.loc_port);
+	cm_event_laddr->sin_addr.s_addr = htonl(event->cm_info.loc_addr);
+
+	cm_event_raddr->sin_family = AF_INET;
+	cm_event_raddr->sin_port = htons(event->cm_info.rem_port);
+	cm_event_raddr->sin_addr.s_addr = htonl(event->cm_info.rem_addr);
+	cm_event.private_data = cm_node->mpa_frame_buf;
+	cm_event.private_data_len = (u8)cm_node->mpa_frame_size;
+	if (cm_node->mpa_frame_rev == IETF_MPA_V1) {
+		cm_event.ird = NES_MAX_IRD;
+		cm_event.ord = NES_MAX_ORD;
+	} else {
+	cm_event.ird = cm_node->ird_size;
+	cm_event.ord = cm_node->ord_size;
+	}
+
+	ret = cm_id->event_handler(cm_id, &cm_event);
+	if (ret)
+		printk(KERN_ERR "%s[%u] OFA CM event_handler returned, ret=%d\n",
+		       __func__, __LINE__, ret);
+	return;
+}
+
+
+static void cm_event_mpa_reject(struct nes_cm_event *event)
+{
+	struct iw_cm_id *cm_id;
+	struct iw_cm_event cm_event;
+	struct nes_cm_node *cm_node;
+	int ret;
+	struct sockaddr_in *cm_event_laddr = (struct sockaddr_in *)
+					     &cm_event.local_addr;
+	struct sockaddr_in *cm_event_raddr = (struct sockaddr_in *)
+					     &cm_event.remote_addr;
+
+	cm_node = event->cm_node;
+	if (!cm_node)
+		return;
+	cm_id = cm_node->cm_id;
+
+	atomic_inc(&cm_connect_reqs);
+	nes_debug(NES_DBG_CM, "cm_node = %p - cm_id = %p, jiffies = %lu\n",
+		  cm_node, cm_id, jiffies);
+
+	cm_event.event = IW_CM_EVENT_CONNECT_REPLY;
+	cm_event.status = -ECONNREFUSED;
+	cm_event.provider_data = cm_id->provider_data;
+
+	cm_event_laddr->sin_family = AF_INET;
+	cm_event_laddr->sin_port = htons(event->cm_info.loc_port);
+	cm_event_laddr->sin_addr.s_addr = htonl(event->cm_info.loc_addr);
+
+	cm_event_raddr->sin_family = AF_INET;
+	cm_event_raddr->sin_port = htons(event->cm_info.rem_port);
+	cm_event_raddr->sin_addr.s_addr = htonl(event->cm_info.rem_addr);
+
+	cm_event.private_data = cm_node->mpa_frame_buf;
+	cm_event.private_data_len = (u8)cm_node->mpa_frame_size;
+
+	nes_debug(NES_DBG_CM, "call CM_EVENT_MPA_REJECTED, local_addr=%08x, "
+		  "remove_addr=%08x\n",
+		  cm_event_laddr->sin_addr.s_addr,
+		  cm_event_raddr->sin_addr.s_addr);
+
+	ret = cm_id->event_handler(cm_id, &cm_event);
+	if (ret)
+		printk(KERN_ERR "%s[%u] OFA CM event_handler returned, ret=%d\n",
+		       __func__, __LINE__, ret);
+
+	return;
+}
+
+
+static void nes_cm_event_handler(struct work_struct *);
+
+/**
+ * nes_cm_post_event
+ * post an event to the cm event handler
+ */
+static int nes_cm_post_event(struct nes_cm_event *event)
+{
+	atomic_inc(&event->cm_node->cm_core->events_posted);
+	add_ref_cm_node(event->cm_node);
+	event->cm_info.cm_id->add_ref(event->cm_info.cm_id);
+	INIT_WORK(&event->event_work, nes_cm_event_handler);
+	nes_debug(NES_DBG_CM, "cm_node=%p queue_work, event=%p\n",
+		  event->cm_node, event);
+
+	queue_work(event->cm_node->cm_core->event_wq, &event->event_work);
+
+	nes_debug(NES_DBG_CM, "Exit\n");
+	return 0;
+}
+
+
+/**
+ * nes_cm_event_handler
+ * worker function to handle cm events
+ * will free instance of nes_cm_event
+ */
+static void nes_cm_event_handler(struct work_struct *work)
+{
+	struct nes_cm_event *event = container_of(work, struct nes_cm_event,
+						  event_work);
+	struct nes_cm_core *cm_core;
+
+	if ((!event) || (!event->cm_node) || (!event->cm_node->cm_core))
+		return;
+
+	cm_core = event->cm_node->cm_core;
+	nes_debug(NES_DBG_CM, "event=%p, event->type=%u, events posted=%u\n",
+		  event, event->type, atomic_read(&cm_core->events_posted));
+
+	switch (event->type) {
+	case NES_CM_EVENT_MPA_REQ:
+		cm_event_mpa_req(event);
+		nes_debug(NES_DBG_CM, "cm_node=%p CM Event: MPA REQUEST\n",
+			  event->cm_node);
+		break;
+	case NES_CM_EVENT_RESET:
+		nes_debug(NES_DBG_CM, "cm_node = %p CM Event: RESET\n",
+			  event->cm_node);
+		cm_event_reset(event);
+		break;
+	case NES_CM_EVENT_CONNECTED:
+		if ((!event->cm_node->cm_id) ||
+		    (event->cm_node->state != NES_CM_STATE_TSA))
+			break;
+		cm_event_connected(event);
+		nes_debug(NES_DBG_CM, "CM Event: CONNECTED\n");
+		break;
+	case NES_CM_EVENT_MPA_REJECT:
+		if ((!event->cm_node->cm_id) ||
+		    (event->cm_node->state == NES_CM_STATE_TSA))
+			break;
+		cm_event_mpa_reject(event);
+		nes_debug(NES_DBG_CM, "CM Event: REJECT\n");
+		break;
+
+	case NES_CM_EVENT_ABORTED:
+		if ((!event->cm_node->cm_id) ||
+		    (event->cm_node->state == NES_CM_STATE_TSA))
+			break;
+		cm_event_connect_error(event);
+		nes_debug(NES_DBG_CM, "CM Event: ABORTED\n");
+		break;
+	case NES_CM_EVENT_DROPPED_PKT:
+		nes_debug(NES_DBG_CM, "CM Event: DROPPED PKT\n");
+		break;
+	default:
+		nes_debug(NES_DBG_CM, "CM Event: UNKNOWN EVENT TYPE\n");
+		break;
+	}
+
+	atomic_dec(&cm_core->events_posted);
+	event->cm_info.cm_id->rem_ref(event->cm_info.cm_id);
+	rem_ref_cm_node(cm_core, event->cm_node);
+	kfree(event);
+
+	return;
+}
diff --git a/drivers/infiniband/hw/nes/nes_cm.h b/drivers/infiniband/hw/nes/nes_cm.h
new file mode 100644
index 0000000..f522cf6
--- /dev/null
+++ b/drivers/infiniband/hw/nes/nes_cm.h
@@ -0,0 +1,476 @@
+/*
+ * Copyright (c) 2006 - 2014 Intel Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef NES_CM_H
+#define NES_CM_H
+
+#define QUEUE_EVENTS
+
+#define NES_MANAGE_APBVT_DEL 0
+#define NES_MANAGE_APBVT_ADD 1
+
+#define NES_MPA_REQUEST_ACCEPT  1
+#define NES_MPA_REQUEST_REJECT  2
+
+/* IETF MPA -- defines, enums, structs */
+#define IEFT_MPA_KEY_REQ  "MPA ID Req Frame"
+#define IEFT_MPA_KEY_REP  "MPA ID Rep Frame"
+#define IETF_MPA_KEY_SIZE 16
+#define IETF_MPA_VERSION  1
+#define IETF_MAX_PRIV_DATA_LEN 512
+#define IETF_MPA_FRAME_SIZE    20
+#define IETF_RTR_MSG_SIZE      4
+#define IETF_MPA_V2_FLAG       0x10
+
+/* IETF RTR MSG Fields               */
+#define IETF_PEER_TO_PEER       0x8000
+#define IETF_FLPDU_ZERO_LEN     0x4000
+#define IETF_RDMA0_WRITE        0x8000
+#define IETF_RDMA0_READ         0x4000
+#define IETF_NO_IRD_ORD         0x3FFF
+#define NES_MAX_IRD		 0x40
+#define NES_MAX_ORD		 0x7F
+
+enum ietf_mpa_flags {
+	IETF_MPA_FLAGS_MARKERS = 0x80,	/* receive Markers */
+	IETF_MPA_FLAGS_CRC     = 0x40,	/* receive Markers */
+	IETF_MPA_FLAGS_REJECT  = 0x20,	/* Reject */
+};
+
+struct ietf_mpa_v1 {
+	u8 key[IETF_MPA_KEY_SIZE];
+	u8 flags;
+	u8 rev;
+	__be16 priv_data_len;
+	u8 priv_data[0];
+};
+
+#define ietf_mpa_req_resp_frame ietf_mpa_frame
+
+struct ietf_rtr_msg {
+	__be16 ctrl_ird;
+	__be16 ctrl_ord;
+};
+
+struct ietf_mpa_v2 {
+	u8 key[IETF_MPA_KEY_SIZE];
+	u8 flags;
+	u8 rev;
+	 __be16 priv_data_len;
+	struct ietf_rtr_msg rtr_msg;
+	u8 priv_data[0];
+};
+
+struct nes_v4_quad {
+	u32 rsvd0;
+	__le32 DstIpAdrIndex;	/* Only most significant 5 bits are valid */
+	__be32 SrcIpadr;
+	__be16 TcpPorts[2];		/* src is low, dest is high */
+};
+
+struct nes_cm_node;
+enum nes_timer_type {
+	NES_TIMER_TYPE_SEND,
+	NES_TIMER_TYPE_RECV,
+	NES_TIMER_NODE_CLEANUP,
+	NES_TIMER_TYPE_CLOSE,
+};
+
+#define NES_PASSIVE_STATE_INDICATED	0
+#define NES_DO_NOT_SEND_RESET_EVENT	1
+#define NES_SEND_RESET_EVENT		2
+
+#define MAX_NES_IFS 4
+
+#define SET_ACK 1
+#define SET_SYN 2
+#define SET_FIN 4
+#define SET_RST 8
+
+#define TCP_OPTIONS_PADDING	3
+
+struct option_base {
+	u8 optionnum;
+	u8 length;
+};
+
+enum option_numbers {
+	OPTION_NUMBER_END,
+	OPTION_NUMBER_NONE,
+	OPTION_NUMBER_MSS,
+	OPTION_NUMBER_WINDOW_SCALE,
+	OPTION_NUMBER_SACK_PERM,
+	OPTION_NUMBER_SACK,
+	OPTION_NUMBER_WRITE0 = 0xbc
+};
+
+struct option_mss {
+	u8 optionnum;
+	u8 length;
+	__be16 mss;
+};
+
+struct option_windowscale {
+	u8 optionnum;
+	u8 length;
+	u8 shiftcount;
+};
+
+union all_known_options {
+	char as_end;
+	struct option_base as_base;
+	struct option_mss as_mss;
+	struct option_windowscale as_windowscale;
+};
+
+struct nes_timer_entry {
+	struct list_head list;
+	unsigned long timetosend;	/* jiffies */
+	struct sk_buff *skb;
+	u32 type;
+	u32 retrycount;
+	u32 retranscount;
+	u32 context;
+	u32 seq_num;
+	u32 send_retrans;
+	int close_when_complete;
+	struct net_device *netdev;
+};
+
+#define NES_DEFAULT_RETRYS  64
+#define NES_DEFAULT_RETRANS 8
+#ifdef CONFIG_INFINIBAND_NES_DEBUG
+#define NES_RETRY_TIMEOUT   (1000*HZ/1000)
+#else
+#define NES_RETRY_TIMEOUT   (3000*HZ/1000)
+#endif
+#define NES_SHORT_TIME      (10)
+#define NES_LONG_TIME       (2000*HZ/1000)
+#define NES_MAX_TIMEOUT     ((unsigned long) (12*HZ))
+
+#define NES_CM_HASHTABLE_SIZE         1024
+#define NES_CM_TCP_TIMER_INTERVAL     3000
+#define NES_CM_DEFAULT_MTU            1540
+#define NES_CM_DEFAULT_FRAME_CNT      10
+#define NES_CM_THREAD_STACK_SIZE      256
+#define NES_CM_DEFAULT_RCV_WND        64240	// before we know that window scaling is allowed
+#define NES_CM_DEFAULT_RCV_WND_SCALED 256960  // after we know that window scaling is allowed
+#define NES_CM_DEFAULT_RCV_WND_SCALE  2
+#define NES_CM_DEFAULT_FREE_PKTS      0x000A
+#define NES_CM_FREE_PKT_LO_WATERMARK  2
+
+#define NES_CM_DEFAULT_MSS   536
+
+#define NES_CM_DEF_SEQ       0x159bf75f
+#define NES_CM_DEF_LOCAL_ID  0x3b47
+
+#define NES_CM_DEF_SEQ2      0x18ed5740
+#define NES_CM_DEF_LOCAL_ID2 0xb807
+#define	MAX_CM_BUFFER	(IETF_MPA_FRAME_SIZE + IETF_RTR_MSG_SIZE + IETF_MAX_PRIV_DATA_LEN)
+
+typedef u32 nes_addr_t;
+
+#define nes_cm_tsa_context nes_qp_context
+
+struct nes_qp;
+
+/* cm node transition states */
+enum nes_cm_node_state {
+	NES_CM_STATE_UNKNOWN,
+	NES_CM_STATE_INITED,
+	NES_CM_STATE_LISTENING,
+	NES_CM_STATE_SYN_RCVD,
+	NES_CM_STATE_SYN_SENT,
+	NES_CM_STATE_ONE_SIDE_ESTABLISHED,
+	NES_CM_STATE_ESTABLISHED,
+	NES_CM_STATE_ACCEPTING,
+	NES_CM_STATE_MPAREQ_SENT,
+	NES_CM_STATE_MPAREQ_RCVD,
+	NES_CM_STATE_MPAREJ_RCVD,
+	NES_CM_STATE_TSA,
+	NES_CM_STATE_FIN_WAIT1,
+	NES_CM_STATE_FIN_WAIT2,
+	NES_CM_STATE_CLOSE_WAIT,
+	NES_CM_STATE_TIME_WAIT,
+	NES_CM_STATE_LAST_ACK,
+	NES_CM_STATE_CLOSING,
+	NES_CM_STATE_LISTENER_DESTROYED,
+	NES_CM_STATE_CLOSED
+};
+
+enum mpa_frame_version {
+	IETF_MPA_V1 = 1,
+	IETF_MPA_V2 = 2
+};
+
+enum mpa_frame_key {
+	MPA_KEY_REQUEST,
+	MPA_KEY_REPLY
+};
+
+enum send_rdma0 {
+	SEND_RDMA_READ_ZERO = 1,
+	SEND_RDMA_WRITE_ZERO = 2
+};
+
+enum nes_tcpip_pkt_type {
+	NES_PKT_TYPE_UNKNOWN,
+	NES_PKT_TYPE_SYN,
+	NES_PKT_TYPE_SYNACK,
+	NES_PKT_TYPE_ACK,
+	NES_PKT_TYPE_FIN,
+	NES_PKT_TYPE_RST
+};
+
+
+/* type of nes connection */
+enum nes_cm_conn_type {
+	NES_CM_IWARP_CONN_TYPE,
+};
+
+/* CM context params */
+struct nes_cm_tcp_context {
+	u8  client;
+
+	u32 loc_seq_num;
+	u32 loc_ack_num;
+	u32 rem_ack_num;
+	u32 rcv_nxt;
+
+	u32 loc_id;
+	u32 rem_id;
+
+	u32 snd_wnd;
+	u32 max_snd_wnd;
+
+	u32 rcv_wnd;
+	u32 mss;
+	u8  snd_wscale;
+	u8  rcv_wscale;
+
+	struct nes_cm_tsa_context tsa_cntxt;
+	struct timeval            sent_ts;
+};
+
+
+enum nes_cm_listener_state {
+	NES_CM_LISTENER_PASSIVE_STATE = 1,
+	NES_CM_LISTENER_ACTIVE_STATE = 2,
+	NES_CM_LISTENER_EITHER_STATE = 3
+};
+
+struct nes_cm_listener {
+	struct list_head           list;
+	struct nes_cm_core         *cm_core;
+	u8                         loc_mac[ETH_ALEN];
+	nes_addr_t                 loc_addr, mapped_loc_addr;
+	u16                        loc_port, mapped_loc_port;
+	struct iw_cm_id            *cm_id;
+	enum nes_cm_conn_type      conn_type;
+	atomic_t                   ref_count;
+	struct nes_vnic            *nesvnic;
+	atomic_t                   pend_accepts_cnt;
+	int                        backlog;
+	enum nes_cm_listener_state listener_state;
+	u32                        reused_node;
+};
+
+/* per connection node and node state information */
+struct nes_cm_node {
+	nes_addr_t                loc_addr, rem_addr;
+	nes_addr_t                mapped_loc_addr, mapped_rem_addr;
+	u16                       loc_port, rem_port;
+	u16                       mapped_loc_port, mapped_rem_port;
+
+	u8                        loc_mac[ETH_ALEN];
+	u8                        rem_mac[ETH_ALEN];
+
+	enum nes_cm_node_state    state;
+	struct nes_cm_tcp_context tcp_cntxt;
+	struct nes_cm_core        *cm_core;
+	struct sk_buff_head       resend_list;
+	atomic_t                  ref_count;
+	struct net_device         *netdev;
+
+	struct nes_cm_node        *loopbackpartner;
+
+	struct nes_timer_entry	  *send_entry;
+	struct nes_timer_entry    *recv_entry;
+	spinlock_t                retrans_list_lock;
+	enum send_rdma0           send_rdma0_op;
+
+	union {
+		struct ietf_mpa_v1 mpa_frame;
+		struct ietf_mpa_v2 mpa_v2_frame;
+		u8                 mpa_frame_buf[MAX_CM_BUFFER];
+	};
+	enum mpa_frame_version    mpa_frame_rev;
+	u16			  ird_size;
+	u16                       ord_size;
+	u16			  mpav2_ird_ord;
+
+	u16                       mpa_frame_size;
+	struct iw_cm_id           *cm_id;
+	struct list_head          list;
+	int                       accelerated;
+	struct nes_cm_listener    *listener;
+	enum nes_cm_conn_type     conn_type;
+	struct nes_vnic           *nesvnic;
+	int                       apbvt_set;
+	int                       accept_pend;
+	struct list_head	timer_entry;
+	struct list_head	reset_entry;
+	struct nes_qp		*nesqp;
+	atomic_t 		passive_state;
+};
+
+/* structure for client or CM to fill when making CM api calls. */
+/*	- only need to set relevant data, based on op. */
+struct nes_cm_info {
+	union {
+		struct iw_cm_id   *cm_id;
+		struct net_device *netdev;
+	};
+
+	u16 loc_port;
+	u16 rem_port;
+	nes_addr_t loc_addr;
+	nes_addr_t rem_addr;
+	u16 mapped_loc_port;
+	u16 mapped_rem_port;
+	nes_addr_t mapped_loc_addr;
+	nes_addr_t mapped_rem_addr;
+
+	enum nes_cm_conn_type  conn_type;
+	int backlog;
+};
+
+/* CM event codes */
+enum  nes_cm_event_type {
+	NES_CM_EVENT_UNKNOWN,
+	NES_CM_EVENT_ESTABLISHED,
+	NES_CM_EVENT_MPA_REQ,
+	NES_CM_EVENT_MPA_CONNECT,
+	NES_CM_EVENT_MPA_ACCEPT,
+	NES_CM_EVENT_MPA_REJECT,
+	NES_CM_EVENT_MPA_ESTABLISHED,
+	NES_CM_EVENT_CONNECTED,
+	NES_CM_EVENT_CLOSED,
+	NES_CM_EVENT_RESET,
+	NES_CM_EVENT_DROPPED_PKT,
+	NES_CM_EVENT_CLOSE_IMMED,
+	NES_CM_EVENT_CLOSE_HARD,
+	NES_CM_EVENT_CLOSE_CLEAN,
+	NES_CM_EVENT_ABORTED,
+	NES_CM_EVENT_SEND_FIRST
+};
+
+/* event to post to CM event handler */
+struct nes_cm_event {
+	enum nes_cm_event_type type;
+
+	struct nes_cm_info cm_info;
+	struct work_struct event_work;
+	struct nes_cm_node *cm_node;
+};
+
+struct nes_cm_core {
+	enum nes_cm_node_state  state;
+
+	atomic_t                listen_node_cnt;
+	struct nes_cm_node      listen_list;
+	spinlock_t              listen_list_lock;
+
+	u32                     mtu;
+	u32                     free_tx_pkt_max;
+	u32                     rx_pkt_posted;
+	atomic_t                ht_node_cnt;
+	struct list_head        connected_nodes;
+	/* struct list_head hashtable[NES_CM_HASHTABLE_SIZE]; */
+	spinlock_t              ht_lock;
+
+	struct timer_list       tcp_timer;
+
+	struct nes_cm_ops       *api;
+
+	int (*post_event)(struct nes_cm_event *event);
+	atomic_t                events_posted;
+	struct workqueue_struct *event_wq;
+	struct workqueue_struct *disconn_wq;
+
+	atomic_t                node_cnt;
+	u64                     aborted_connects;
+	u32                     options;
+
+	struct nes_cm_node      *current_listen_node;
+};
+
+
+#define NES_CM_SET_PKT_SIZE        (1 << 1)
+#define NES_CM_SET_FREE_PKT_Q_SIZE (1 << 2)
+
+/* CM ops/API for client interface */
+struct nes_cm_ops {
+	int (*accelerated)(struct nes_cm_core *, struct nes_cm_node *);
+	struct nes_cm_listener * (*listen)(struct nes_cm_core *, struct nes_vnic *,
+			struct nes_cm_info *);
+	int (*stop_listener)(struct nes_cm_core *, struct nes_cm_listener *);
+	struct nes_cm_node * (*connect)(struct nes_cm_core *,
+			struct nes_vnic *, u16, void *,
+			struct nes_cm_info *);
+	int (*close)(struct nes_cm_core *, struct nes_cm_node *);
+	int (*accept)(struct nes_cm_core *, struct nes_cm_node *);
+	int (*reject)(struct nes_cm_core *, struct nes_cm_node *);
+	int (*recv_pkt)(struct nes_cm_core *, struct nes_vnic *,
+			struct sk_buff *);
+	int (*destroy_cm_core)(struct nes_cm_core *);
+	int (*get)(struct nes_cm_core *);
+	int (*set)(struct nes_cm_core *, u32, u32);
+};
+
+int schedule_nes_timer(struct nes_cm_node *, struct sk_buff *,
+		enum nes_timer_type, int, int);
+
+int nes_accept(struct iw_cm_id *, struct iw_cm_conn_param *);
+int nes_reject(struct iw_cm_id *, const void *, u8);
+int nes_connect(struct iw_cm_id *, struct iw_cm_conn_param *);
+int nes_create_listen(struct iw_cm_id *, int);
+int nes_destroy_listen(struct iw_cm_id *);
+
+int nes_cm_recv(struct sk_buff *, struct net_device *);
+int nes_cm_start(void);
+int nes_cm_stop(void);
+int nes_add_ref_cm_node(struct nes_cm_node *cm_node);
+int nes_rem_ref_cm_node(struct nes_cm_node *cm_node);
+
+#endif			/* NES_CM_H */
diff --git a/drivers/infiniband/hw/nes/nes_context.h b/drivers/infiniband/hw/nes/nes_context.h
new file mode 100644
index 0000000..a69eef1
--- /dev/null
+++ b/drivers/infiniband/hw/nes/nes_context.h
@@ -0,0 +1,193 @@
+/*
+ * Copyright (c) 2006 - 2011 Intel Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef NES_CONTEXT_H
+#define NES_CONTEXT_H
+
+struct nes_qp_context {
+	__le32   misc;
+	__le32   cqs;
+	__le32   sq_addr_low;
+	__le32   sq_addr_high;
+	__le32   rq_addr_low;
+	__le32   rq_addr_high;
+	__le32   misc2;
+	__le16   tcpPorts[2];
+	__le32   ip0;
+	__le32   ip1;
+	__le32   ip2;
+	__le32   ip3;
+	__le32   mss;
+	__le32   arp_index_vlan;
+	__le32   tcp_state_flow_label;
+	__le32   pd_index_wscale;
+	__le32   keepalive;
+	u32   ts_recent;
+	u32   ts_age;
+	__le32   snd_nxt;
+	__le32   snd_wnd;
+	__le32   rcv_nxt;
+	__le32   rcv_wnd;
+	__le32   snd_max;
+	__le32   snd_una;
+	u32   srtt;
+	__le32   rttvar;
+	__le32   ssthresh;
+	__le32   cwnd;
+	__le32   snd_wl1;
+	__le32   snd_wl2;
+	__le32   max_snd_wnd;
+	__le32   ts_val_delta;
+	u32   retransmit;
+	u32   probe_cnt;
+	u32   hte_index;
+	__le32   q2_addr_low;
+	__le32   q2_addr_high;
+	__le32   ird_index;
+	u32   Rsvd3;
+	__le32   ird_ord_sizes;
+	u32   mrkr_offset;
+	__le32   aeq_token_low;
+	__le32   aeq_token_high;
+};
+
+/* QP Context Misc Field */
+
+#define NES_QPCONTEXT_MISC_IWARP_VER_MASK    0x00000003
+#define NES_QPCONTEXT_MISC_IWARP_VER_SHIFT   0
+#define NES_QPCONTEXT_MISC_EFB_SIZE_MASK     0x000000C0
+#define NES_QPCONTEXT_MISC_EFB_SIZE_SHIFT    6
+#define NES_QPCONTEXT_MISC_RQ_SIZE_MASK      0x00000300
+#define NES_QPCONTEXT_MISC_RQ_SIZE_SHIFT     8
+#define NES_QPCONTEXT_MISC_SQ_SIZE_MASK      0x00000c00
+#define NES_QPCONTEXT_MISC_SQ_SIZE_SHIFT     10
+#define NES_QPCONTEXT_MISC_PCI_FCN_MASK      0x00007000
+#define NES_QPCONTEXT_MISC_PCI_FCN_SHIFT     12
+#define NES_QPCONTEXT_MISC_DUP_ACKS_MASK     0x00070000
+#define NES_QPCONTEXT_MISC_DUP_ACKS_SHIFT    16
+
+enum nes_qp_context_misc_bits {
+	NES_QPCONTEXT_MISC_RX_WQE_SIZE         = 0x00000004,
+	NES_QPCONTEXT_MISC_IPV4                = 0x00000008,
+	NES_QPCONTEXT_MISC_DO_NOT_FRAG         = 0x00000010,
+	NES_QPCONTEXT_MISC_INSERT_VLAN         = 0x00000020,
+	NES_QPCONTEXT_MISC_DROS                = 0x00008000,
+	NES_QPCONTEXT_MISC_WSCALE              = 0x00080000,
+	NES_QPCONTEXT_MISC_KEEPALIVE           = 0x00100000,
+	NES_QPCONTEXT_MISC_TIMESTAMP           = 0x00200000,
+	NES_QPCONTEXT_MISC_SACK                = 0x00400000,
+	NES_QPCONTEXT_MISC_RDMA_WRITE_EN       = 0x00800000,
+	NES_QPCONTEXT_MISC_RDMA_READ_EN        = 0x01000000,
+	NES_QPCONTEXT_MISC_WBIND_EN            = 0x10000000,
+	NES_QPCONTEXT_MISC_FAST_REGISTER_EN    = 0x20000000,
+	NES_QPCONTEXT_MISC_PRIV_EN             = 0x40000000,
+	NES_QPCONTEXT_MISC_NO_NAGLE            = 0x80000000
+};
+
+enum nes_qp_acc_wq_sizes {
+	HCONTEXT_TSA_WQ_SIZE_4 = 0,
+	HCONTEXT_TSA_WQ_SIZE_32 = 1,
+	HCONTEXT_TSA_WQ_SIZE_128 = 2,
+	HCONTEXT_TSA_WQ_SIZE_512 = 3
+};
+
+/* QP Context Misc2 Fields */
+#define NES_QPCONTEXT_MISC2_TTL_MASK            0x000000ff
+#define NES_QPCONTEXT_MISC2_TTL_SHIFT           0
+#define NES_QPCONTEXT_MISC2_HOP_LIMIT_MASK      0x000000ff
+#define NES_QPCONTEXT_MISC2_HOP_LIMIT_SHIFT     0
+#define NES_QPCONTEXT_MISC2_LIMIT_MASK          0x00000300
+#define NES_QPCONTEXT_MISC2_LIMIT_SHIFT         8
+#define NES_QPCONTEXT_MISC2_NIC_INDEX_MASK      0x0000fc00
+#define NES_QPCONTEXT_MISC2_NIC_INDEX_SHIFT     10
+#define NES_QPCONTEXT_MISC2_SRC_IP_MASK         0x001f0000
+#define NES_QPCONTEXT_MISC2_SRC_IP_SHIFT        16
+#define NES_QPCONTEXT_MISC2_TOS_MASK            0xff000000
+#define NES_QPCONTEXT_MISC2_TOS_SHIFT           24
+#define NES_QPCONTEXT_MISC2_TRAFFIC_CLASS_MASK  0xff000000
+#define NES_QPCONTEXT_MISC2_TRAFFIC_CLASS_SHIFT 24
+
+/* QP Context Tcp State/Flow Label Fields */
+#define NES_QPCONTEXT_TCPFLOW_FLOW_LABEL_MASK   0x000fffff
+#define NES_QPCONTEXT_TCPFLOW_FLOW_LABEL_SHIFT  0
+#define NES_QPCONTEXT_TCPFLOW_TCP_STATE_MASK    0xf0000000
+#define NES_QPCONTEXT_TCPFLOW_TCP_STATE_SHIFT   28
+
+enum nes_qp_tcp_state {
+	NES_QPCONTEXT_TCPSTATE_CLOSED = 1,
+	NES_QPCONTEXT_TCPSTATE_EST = 5,
+	NES_QPCONTEXT_TCPSTATE_TIME_WAIT = 11,
+};
+
+/* QP Context PD Index/wscale Fields */
+#define NES_QPCONTEXT_PDWSCALE_RCV_WSCALE_MASK  0x0000000f
+#define NES_QPCONTEXT_PDWSCALE_RCV_WSCALE_SHIFT 0
+#define NES_QPCONTEXT_PDWSCALE_SND_WSCALE_MASK  0x00000f00
+#define NES_QPCONTEXT_PDWSCALE_SND_WSCALE_SHIFT 8
+#define NES_QPCONTEXT_PDWSCALE_PDINDEX_MASK     0xffff0000
+#define NES_QPCONTEXT_PDWSCALE_PDINDEX_SHIFT    16
+
+/* QP Context Keepalive Fields */
+#define NES_QPCONTEXT_KEEPALIVE_DELTA_MASK      0x0000ffff
+#define NES_QPCONTEXT_KEEPALIVE_DELTA_SHIFT     0
+#define NES_QPCONTEXT_KEEPALIVE_PROBE_CNT_MASK  0x00ff0000
+#define NES_QPCONTEXT_KEEPALIVE_PROBE_CNT_SHIFT 16
+#define NES_QPCONTEXT_KEEPALIVE_INTV_MASK       0xff000000
+#define NES_QPCONTEXT_KEEPALIVE_INTV_SHIFT      24
+
+/* QP Context ORD/IRD Fields */
+#define NES_QPCONTEXT_ORDIRD_ORDSIZE_MASK       0x0000007f
+#define NES_QPCONTEXT_ORDIRD_ORDSIZE_SHIFT      0
+#define NES_QPCONTEXT_ORDIRD_IRDSIZE_MASK       0x00030000
+#define NES_QPCONTEXT_ORDIRD_IRDSIZE_SHIFT      16
+#define NES_QPCONTEXT_ORDIRD_IWARP_MODE_MASK    0x30000000
+#define NES_QPCONTEXT_ORDIRD_IWARP_MODE_SHIFT   28
+
+enum nes_ord_ird_bits {
+	NES_QPCONTEXT_ORDIRD_WRPDU                   = 0x02000000,
+	NES_QPCONTEXT_ORDIRD_LSMM_PRESENT            = 0x04000000,
+	NES_QPCONTEXT_ORDIRD_ALSMM                   = 0x08000000,
+	NES_QPCONTEXT_ORDIRD_AAH                     = 0x40000000,
+	NES_QPCONTEXT_ORDIRD_RNMC                    = 0x80000000
+};
+
+enum nes_iwarp_qp_state {
+	NES_QPCONTEXT_IWARP_STATE_NONEXIST  = 0,
+	NES_QPCONTEXT_IWARP_STATE_IDLE      = 1,
+	NES_QPCONTEXT_IWARP_STATE_RTS       = 2,
+	NES_QPCONTEXT_IWARP_STATE_CLOSING   = 3,
+	NES_QPCONTEXT_IWARP_STATE_TERMINATE = 5,
+	NES_QPCONTEXT_IWARP_STATE_ERROR     = 6
+};
+
+
+#endif		/* NES_CONTEXT_H */
diff --git a/drivers/infiniband/hw/nes/nes_hw.c b/drivers/infiniband/hw/nes/nes_hw.c
new file mode 100644
index 0000000..02120d3
--- /dev/null
+++ b/drivers/infiniband/hw/nes/nes_hw.c
@@ -0,0 +1,3937 @@
+/*
+ * Copyright (c) 2006 - 2011 Intel Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/if_vlan.h>
+#include <linux/inet_lro.h>
+#include <linux/slab.h>
+
+#include "nes.h"
+
+static unsigned int nes_lro_max_aggr = NES_LRO_MAX_AGGR;
+module_param(nes_lro_max_aggr, uint, 0444);
+MODULE_PARM_DESC(nes_lro_max_aggr, "NIC LRO max packet aggregation");
+
+static int wide_ppm_offset;
+module_param(wide_ppm_offset, int, 0644);
+MODULE_PARM_DESC(wide_ppm_offset, "Increase CX4 interface clock ppm offset, 0=100ppm (default), 1=300ppm");
+
+static u32 crit_err_count;
+u32 int_mod_timer_init;
+u32 int_mod_cq_depth_256;
+u32 int_mod_cq_depth_128;
+u32 int_mod_cq_depth_32;
+u32 int_mod_cq_depth_24;
+u32 int_mod_cq_depth_16;
+u32 int_mod_cq_depth_4;
+u32 int_mod_cq_depth_1;
+static const u8 nes_max_critical_error_count = 100;
+#include "nes_cm.h"
+
+static void nes_cqp_ce_handler(struct nes_device *nesdev, struct nes_hw_cq *cq);
+static void nes_init_csr_ne020(struct nes_device *nesdev, u8 hw_rev, u8 port_count);
+static int nes_init_serdes(struct nes_device *nesdev, u8 hw_rev, u8 port_count,
+				struct nes_adapter *nesadapter, u8  OneG_Mode);
+static void nes_nic_napi_ce_handler(struct nes_device *nesdev, struct nes_hw_nic_cq *cq);
+static void nes_process_aeq(struct nes_device *nesdev, struct nes_hw_aeq *aeq);
+static void nes_process_ceq(struct nes_device *nesdev, struct nes_hw_ceq *ceq);
+static void nes_process_iwarp_aeqe(struct nes_device *nesdev,
+				   struct nes_hw_aeqe *aeqe);
+static void process_critical_error(struct nes_device *nesdev);
+static void nes_process_mac_intr(struct nes_device *nesdev, u32 mac_number);
+static unsigned int nes_reset_adapter_ne020(struct nes_device *nesdev, u8 *OneG_Mode);
+static void nes_terminate_start_timer(struct nes_qp *nesqp);
+
+#ifdef CONFIG_INFINIBAND_NES_DEBUG
+static unsigned char *nes_iwarp_state_str[] = {
+	"Non-Existent",
+	"Idle",
+	"RTS",
+	"Closing",
+	"RSVD1",
+	"Terminate",
+	"Error",
+	"RSVD2",
+};
+
+static unsigned char *nes_tcp_state_str[] = {
+	"Non-Existent",
+	"Closed",
+	"Listen",
+	"SYN Sent",
+	"SYN Rcvd",
+	"Established",
+	"Close Wait",
+	"FIN Wait 1",
+	"Closing",
+	"Last Ack",
+	"FIN Wait 2",
+	"Time Wait",
+	"RSVD1",
+	"RSVD2",
+	"RSVD3",
+	"RSVD4",
+};
+#endif
+
+static inline void print_ip(struct nes_cm_node *cm_node)
+{
+	unsigned char *rem_addr;
+	if (cm_node) {
+		rem_addr = (unsigned char *)&cm_node->rem_addr;
+		printk(KERN_ERR PFX "Remote IP addr: %pI4\n", rem_addr);
+	}
+}
+
+/**
+ * nes_nic_init_timer_defaults
+ */
+void  nes_nic_init_timer_defaults(struct nes_device *nesdev, u8 jumbomode)
+{
+	unsigned long flags;
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	struct nes_hw_tune_timer *shared_timer = &nesadapter->tune_timer;
+
+	spin_lock_irqsave(&nesadapter->periodic_timer_lock, flags);
+
+	shared_timer->timer_in_use_min = NES_NIC_FAST_TIMER_LOW;
+	shared_timer->timer_in_use_max = NES_NIC_FAST_TIMER_HIGH;
+	if (jumbomode) {
+		shared_timer->threshold_low    = DEFAULT_JUMBO_NES_QL_LOW;
+		shared_timer->threshold_target = DEFAULT_JUMBO_NES_QL_TARGET;
+		shared_timer->threshold_high   = DEFAULT_JUMBO_NES_QL_HIGH;
+	} else {
+		shared_timer->threshold_low    = DEFAULT_NES_QL_LOW;
+		shared_timer->threshold_target = DEFAULT_NES_QL_TARGET;
+		shared_timer->threshold_high   = DEFAULT_NES_QL_HIGH;
+	}
+
+	/* todo use netdev->mtu to set thresholds */
+	spin_unlock_irqrestore(&nesadapter->periodic_timer_lock, flags);
+}
+
+
+/**
+ * nes_nic_init_timer
+ */
+static void  nes_nic_init_timer(struct nes_device *nesdev)
+{
+	unsigned long flags;
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	struct nes_hw_tune_timer *shared_timer = &nesadapter->tune_timer;
+
+	spin_lock_irqsave(&nesadapter->periodic_timer_lock, flags);
+
+	if (shared_timer->timer_in_use_old == 0) {
+		nesdev->deepcq_count = 0;
+		shared_timer->timer_direction_upward = 0;
+		shared_timer->timer_direction_downward = 0;
+		shared_timer->timer_in_use = NES_NIC_FAST_TIMER;
+		shared_timer->timer_in_use_old = 0;
+
+	}
+	if (shared_timer->timer_in_use != shared_timer->timer_in_use_old) {
+		shared_timer->timer_in_use_old = shared_timer->timer_in_use;
+		nes_write32(nesdev->regs+NES_PERIODIC_CONTROL,
+			0x80000000 | ((u32)(shared_timer->timer_in_use*8)));
+	}
+	/* todo use netdev->mtu to set thresholds */
+	spin_unlock_irqrestore(&nesadapter->periodic_timer_lock, flags);
+}
+
+
+/**
+ * nes_nic_tune_timer
+ */
+static void nes_nic_tune_timer(struct nes_device *nesdev)
+{
+	unsigned long flags;
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	struct nes_hw_tune_timer *shared_timer = &nesadapter->tune_timer;
+	u16 cq_count = nesdev->currcq_count;
+
+	spin_lock_irqsave(&nesadapter->periodic_timer_lock, flags);
+
+	if (shared_timer->cq_count_old <= cq_count)
+		shared_timer->cq_direction_downward = 0;
+	else
+		shared_timer->cq_direction_downward++;
+	shared_timer->cq_count_old = cq_count;
+	if (shared_timer->cq_direction_downward > NES_NIC_CQ_DOWNWARD_TREND) {
+		if (cq_count <= shared_timer->threshold_low &&
+		    shared_timer->threshold_low > 4) {
+			shared_timer->threshold_low = shared_timer->threshold_low/2;
+			shared_timer->cq_direction_downward=0;
+			nesdev->currcq_count = 0;
+			spin_unlock_irqrestore(&nesadapter->periodic_timer_lock, flags);
+			return;
+		}
+	}
+
+	if (cq_count > 1) {
+		nesdev->deepcq_count += cq_count;
+		if (cq_count <= shared_timer->threshold_low) {       /* increase timer gently */
+			shared_timer->timer_direction_upward++;
+			shared_timer->timer_direction_downward = 0;
+		} else if (cq_count <= shared_timer->threshold_target) { /* balanced */
+			shared_timer->timer_direction_upward = 0;
+			shared_timer->timer_direction_downward = 0;
+		} else if (cq_count <= shared_timer->threshold_high) {  /* decrease timer gently */
+			shared_timer->timer_direction_downward++;
+			shared_timer->timer_direction_upward = 0;
+		} else if (cq_count <= (shared_timer->threshold_high) * 2) {
+			shared_timer->timer_in_use -= 2;
+			shared_timer->timer_direction_upward = 0;
+			shared_timer->timer_direction_downward++;
+		} else {
+			shared_timer->timer_in_use -= 4;
+			shared_timer->timer_direction_upward = 0;
+			shared_timer->timer_direction_downward++;
+		}
+
+		if (shared_timer->timer_direction_upward > 3 ) {  /* using history */
+			shared_timer->timer_in_use += 3;
+			shared_timer->timer_direction_upward = 0;
+			shared_timer->timer_direction_downward = 0;
+		}
+		if (shared_timer->timer_direction_downward > 5) { /* using history */
+			shared_timer->timer_in_use -= 4 ;
+			shared_timer->timer_direction_downward = 0;
+			shared_timer->timer_direction_upward = 0;
+		}
+	}
+
+	/* boundary checking */
+	if (shared_timer->timer_in_use > shared_timer->threshold_high)
+		shared_timer->timer_in_use = shared_timer->threshold_high;
+	else if (shared_timer->timer_in_use < shared_timer->threshold_low)
+		shared_timer->timer_in_use = shared_timer->threshold_low;
+
+	nesdev->currcq_count = 0;
+
+	spin_unlock_irqrestore(&nesadapter->periodic_timer_lock, flags);
+}
+
+
+/**
+ * nes_init_adapter - initialize adapter
+ */
+struct nes_adapter *nes_init_adapter(struct nes_device *nesdev, u8 hw_rev) {
+	struct nes_adapter *nesadapter = NULL;
+	unsigned long num_pds;
+	u32 u32temp;
+	u32 port_count;
+	u16 max_rq_wrs;
+	u16 max_sq_wrs;
+	u32 max_mr;
+	u32 max_256pbl;
+	u32 max_4kpbl;
+	u32 max_qp;
+	u32 max_irrq;
+	u32 max_cq;
+	u32 hte_index_mask;
+	u32 adapter_size;
+	u32 arp_table_size;
+	u16 vendor_id;
+	u16 device_id;
+	u8  OneG_Mode;
+	u8  func_index;
+
+	/* search the list of existing adapters */
+	list_for_each_entry(nesadapter, &nes_adapter_list, list) {
+		nes_debug(NES_DBG_INIT, "Searching Adapter list for PCI devfn = 0x%X,"
+				" adapter PCI slot/bus = %u/%u, pci devices PCI slot/bus = %u/%u, .\n",
+				nesdev->pcidev->devfn,
+				PCI_SLOT(nesadapter->devfn),
+				nesadapter->bus_number,
+				PCI_SLOT(nesdev->pcidev->devfn),
+				nesdev->pcidev->bus->number );
+		if ((PCI_SLOT(nesadapter->devfn) == PCI_SLOT(nesdev->pcidev->devfn)) &&
+				(nesadapter->bus_number == nesdev->pcidev->bus->number)) {
+			nesadapter->ref_count++;
+			return nesadapter;
+		}
+	}
+
+	/* no adapter found */
+	num_pds = pci_resource_len(nesdev->pcidev, BAR_1) >> PAGE_SHIFT;
+	if ((hw_rev != NE020_REV) && (hw_rev != NE020_REV1)) {
+		nes_debug(NES_DBG_INIT, "NE020 driver detected unknown hardware revision 0x%x\n",
+				hw_rev);
+		return NULL;
+	}
+
+	nes_debug(NES_DBG_INIT, "Determine Soft Reset, QP_control=0x%x, CPU0=0x%x, CPU1=0x%x, CPU2=0x%x\n",
+			nes_read_indexed(nesdev, NES_IDX_QP_CONTROL + PCI_FUNC(nesdev->pcidev->devfn) * 8),
+			nes_read_indexed(nesdev, NES_IDX_INT_CPU_STATUS),
+			nes_read_indexed(nesdev, NES_IDX_INT_CPU_STATUS + 4),
+			nes_read_indexed(nesdev, NES_IDX_INT_CPU_STATUS + 8));
+
+	nes_debug(NES_DBG_INIT, "Reset and init NE020\n");
+
+
+	if ((port_count = nes_reset_adapter_ne020(nesdev, &OneG_Mode)) == 0)
+		return NULL;
+
+	max_qp = nes_read_indexed(nesdev, NES_IDX_QP_CTX_SIZE);
+	nes_debug(NES_DBG_INIT, "QP_CTX_SIZE=%u\n", max_qp);
+
+	u32temp = nes_read_indexed(nesdev, NES_IDX_QUAD_HASH_TABLE_SIZE);
+	if (max_qp > ((u32)1 << (u32temp & 0x001f))) {
+		nes_debug(NES_DBG_INIT, "Reducing Max QPs to %u due to hash table size = 0x%08X\n",
+				max_qp, u32temp);
+		max_qp = (u32)1 << (u32temp & 0x001f);
+	}
+
+	hte_index_mask = ((u32)1 << ((u32temp & 0x001f)+1))-1;
+	nes_debug(NES_DBG_INIT, "Max QP = %u, hte_index_mask = 0x%08X.\n",
+			max_qp, hte_index_mask);
+
+	u32temp = nes_read_indexed(nesdev, NES_IDX_IRRQ_COUNT);
+
+	max_irrq = 1 << (u32temp & 0x001f);
+
+	if (max_qp > max_irrq) {
+		max_qp = max_irrq;
+		nes_debug(NES_DBG_INIT, "Reducing Max QPs to %u due to Available Q1s.\n",
+				max_qp);
+	}
+
+	/* there should be no reason to allocate more pds than qps */
+	if (num_pds > max_qp)
+		num_pds = max_qp;
+
+	u32temp = nes_read_indexed(nesdev, NES_IDX_MRT_SIZE);
+	max_mr = (u32)8192 << (u32temp & 0x7);
+
+	u32temp = nes_read_indexed(nesdev, NES_IDX_PBL_REGION_SIZE);
+	max_256pbl = (u32)1 << (u32temp & 0x0000001f);
+	max_4kpbl = (u32)1 << ((u32temp >> 16) & 0x0000001f);
+	max_cq = nes_read_indexed(nesdev, NES_IDX_CQ_CTX_SIZE);
+
+	u32temp = nes_read_indexed(nesdev, NES_IDX_ARP_CACHE_SIZE);
+	arp_table_size = 1 << u32temp;
+
+	adapter_size = (sizeof(struct nes_adapter) +
+			(sizeof(unsigned long)-1)) & (~(sizeof(unsigned long)-1));
+	adapter_size += sizeof(unsigned long) * BITS_TO_LONGS(max_qp);
+	adapter_size += sizeof(unsigned long) * BITS_TO_LONGS(max_mr);
+	adapter_size += sizeof(unsigned long) * BITS_TO_LONGS(max_cq);
+	adapter_size += sizeof(unsigned long) * BITS_TO_LONGS(num_pds);
+	adapter_size += sizeof(unsigned long) * BITS_TO_LONGS(arp_table_size);
+	adapter_size += sizeof(struct nes_qp **) * max_qp;
+
+	/* allocate a new adapter struct */
+	nesadapter = kzalloc(adapter_size, GFP_KERNEL);
+	if (nesadapter == NULL) {
+		return NULL;
+	}
+
+	nes_debug(NES_DBG_INIT, "Allocating new nesadapter @ %p, size = %u (actual size = %u).\n",
+			nesadapter, (u32)sizeof(struct nes_adapter), adapter_size);
+
+	if (nes_read_eeprom_values(nesdev, nesadapter)) {
+		printk(KERN_ERR PFX "Unable to read EEPROM data.\n");
+		kfree(nesadapter);
+		return NULL;
+	}
+
+	nesadapter->vendor_id = (((u32) nesadapter->mac_addr_high) << 8) |
+				(nesadapter->mac_addr_low >> 24);
+
+	pci_bus_read_config_word(nesdev->pcidev->bus, nesdev->pcidev->devfn,
+				 PCI_DEVICE_ID, &device_id);
+	nesadapter->vendor_part_id = device_id;
+
+	if (nes_init_serdes(nesdev, hw_rev, port_count, nesadapter,
+							OneG_Mode)) {
+		kfree(nesadapter);
+		return NULL;
+	}
+	nes_init_csr_ne020(nesdev, hw_rev, port_count);
+
+	memset(nesadapter->pft_mcast_map, 255,
+	       sizeof nesadapter->pft_mcast_map);
+
+	/* populate the new nesadapter */
+	nesadapter->devfn = nesdev->pcidev->devfn;
+	nesadapter->bus_number = nesdev->pcidev->bus->number;
+	nesadapter->ref_count = 1;
+	nesadapter->timer_int_req = 0xffff0000;
+	nesadapter->OneG_Mode = OneG_Mode;
+	nesadapter->doorbell_start = nesdev->doorbell_region;
+
+	/* nesadapter->tick_delta = clk_divisor; */
+	nesadapter->hw_rev = hw_rev;
+	nesadapter->port_count = port_count;
+
+	nesadapter->max_qp = max_qp;
+	nesadapter->hte_index_mask = hte_index_mask;
+	nesadapter->max_irrq = max_irrq;
+	nesadapter->max_mr = max_mr;
+	nesadapter->max_256pbl = max_256pbl - 1;
+	nesadapter->max_4kpbl = max_4kpbl - 1;
+	nesadapter->max_cq = max_cq;
+	nesadapter->free_256pbl = max_256pbl - 1;
+	nesadapter->free_4kpbl = max_4kpbl - 1;
+	nesadapter->max_pd = num_pds;
+	nesadapter->arp_table_size = arp_table_size;
+
+	nesadapter->et_pkt_rate_low = NES_TIMER_ENABLE_LIMIT;
+	if (nes_drv_opt & NES_DRV_OPT_DISABLE_INT_MOD) {
+		nesadapter->et_use_adaptive_rx_coalesce = 0;
+		nesadapter->timer_int_limit = NES_TIMER_INT_LIMIT;
+		nesadapter->et_rx_coalesce_usecs_irq = interrupt_mod_interval;
+	} else {
+		nesadapter->et_use_adaptive_rx_coalesce = 1;
+		nesadapter->timer_int_limit = NES_TIMER_INT_LIMIT_DYNAMIC;
+		nesadapter->et_rx_coalesce_usecs_irq = 0;
+		printk(PFX "%s: Using Adaptive Interrupt Moderation\n", __func__);
+	}
+	/* Setup and enable the periodic timer */
+	if (nesadapter->et_rx_coalesce_usecs_irq)
+		nes_write32(nesdev->regs+NES_PERIODIC_CONTROL, 0x80000000 |
+				((u32)(nesadapter->et_rx_coalesce_usecs_irq * 8)));
+	else
+		nes_write32(nesdev->regs+NES_PERIODIC_CONTROL, 0x00000000);
+
+	nesadapter->base_pd = 1;
+
+	nesadapter->device_cap_flags = IB_DEVICE_LOCAL_DMA_LKEY |
+				       IB_DEVICE_MEM_WINDOW |
+				       IB_DEVICE_MEM_MGT_EXTENSIONS;
+
+	nesadapter->allocated_qps = (unsigned long *)&(((unsigned char *)nesadapter)
+			[(sizeof(struct nes_adapter)+(sizeof(unsigned long)-1))&(~(sizeof(unsigned long)-1))]);
+	nesadapter->allocated_cqs = &nesadapter->allocated_qps[BITS_TO_LONGS(max_qp)];
+	nesadapter->allocated_mrs = &nesadapter->allocated_cqs[BITS_TO_LONGS(max_cq)];
+	nesadapter->allocated_pds = &nesadapter->allocated_mrs[BITS_TO_LONGS(max_mr)];
+	nesadapter->allocated_arps = &nesadapter->allocated_pds[BITS_TO_LONGS(num_pds)];
+	nesadapter->qp_table = (struct nes_qp **)(&nesadapter->allocated_arps[BITS_TO_LONGS(arp_table_size)]);
+
+
+	/* mark the usual suspect QPs, MR and CQs as in use */
+	for (u32temp = 0; u32temp < NES_FIRST_QPN; u32temp++) {
+		set_bit(u32temp, nesadapter->allocated_qps);
+		set_bit(u32temp, nesadapter->allocated_cqs);
+	}
+	set_bit(0, nesadapter->allocated_mrs);
+
+	for (u32temp = 0; u32temp < 20; u32temp++)
+		set_bit(u32temp, nesadapter->allocated_pds);
+	u32temp = nes_read_indexed(nesdev, NES_IDX_QP_MAX_CFG_SIZES);
+
+	max_rq_wrs = ((u32temp >> 8) & 3);
+	switch (max_rq_wrs) {
+		case 0:
+			max_rq_wrs = 4;
+			break;
+		case 1:
+			max_rq_wrs = 16;
+			break;
+		case 2:
+			max_rq_wrs = 32;
+			break;
+		case 3:
+			max_rq_wrs = 512;
+			break;
+	}
+
+	max_sq_wrs = (u32temp & 3);
+	switch (max_sq_wrs) {
+		case 0:
+			max_sq_wrs = 4;
+			break;
+		case 1:
+			max_sq_wrs = 16;
+			break;
+		case 2:
+			max_sq_wrs = 32;
+			break;
+		case 3:
+			max_sq_wrs = 512;
+			break;
+	}
+	nesadapter->max_qp_wr = min(max_rq_wrs, max_sq_wrs);
+	nesadapter->max_irrq_wr = (u32temp >> 16) & 3;
+
+	nesadapter->max_sge = 4;
+	nesadapter->max_cqe = 32766;
+
+	if (nes_read_eeprom_values(nesdev, nesadapter)) {
+		printk(KERN_ERR PFX "Unable to read EEPROM data.\n");
+		kfree(nesadapter);
+		return NULL;
+	}
+
+	u32temp = nes_read_indexed(nesdev, NES_IDX_TCP_TIMER_CONFIG);
+	nes_write_indexed(nesdev, NES_IDX_TCP_TIMER_CONFIG,
+			(u32temp & 0xff000000) | (nesadapter->tcp_timer_core_clk_divisor & 0x00ffffff));
+
+	/* setup port configuration */
+	if (nesadapter->port_count == 1) {
+		nesadapter->log_port = 0x00000000;
+		if (nes_drv_opt & NES_DRV_OPT_DUAL_LOGICAL_PORT)
+			nes_write_indexed(nesdev, NES_IDX_TX_POOL_SIZE, 0x00000002);
+		else
+			nes_write_indexed(nesdev, NES_IDX_TX_POOL_SIZE, 0x00000003);
+	} else {
+		if (nesadapter->phy_type[0] == NES_PHY_TYPE_PUMA_1G) {
+			nesadapter->log_port = 0x000000D8;
+		} else {
+			if (nesadapter->port_count == 2)
+				nesadapter->log_port = 0x00000044;
+			else
+				nesadapter->log_port = 0x000000e4;
+		}
+		nes_write_indexed(nesdev, NES_IDX_TX_POOL_SIZE, 0x00000003);
+	}
+
+	nes_write_indexed(nesdev, NES_IDX_NIC_LOGPORT_TO_PHYPORT,
+						nesadapter->log_port);
+	nes_debug(NES_DBG_INIT, "Probe time, LOG2PHY=%u\n",
+			nes_read_indexed(nesdev, NES_IDX_NIC_LOGPORT_TO_PHYPORT));
+
+	spin_lock_init(&nesadapter->resource_lock);
+	spin_lock_init(&nesadapter->phy_lock);
+	spin_lock_init(&nesadapter->pbl_lock);
+	spin_lock_init(&nesadapter->periodic_timer_lock);
+
+	INIT_LIST_HEAD(&nesadapter->nesvnic_list[0]);
+	INIT_LIST_HEAD(&nesadapter->nesvnic_list[1]);
+	INIT_LIST_HEAD(&nesadapter->nesvnic_list[2]);
+	INIT_LIST_HEAD(&nesadapter->nesvnic_list[3]);
+
+	if ((!nesadapter->OneG_Mode) && (nesadapter->port_count == 2)) {
+		u32 pcs_control_status0, pcs_control_status1;
+		u32 reset_value;
+		u32 i = 0;
+		u32 int_cnt = 0;
+		u32 ext_cnt = 0;
+		unsigned long flags;
+		u32 j = 0;
+
+		pcs_control_status0 = nes_read_indexed(nesdev,
+			NES_IDX_PHY_PCS_CONTROL_STATUS0);
+		pcs_control_status1 = nes_read_indexed(nesdev,
+			NES_IDX_PHY_PCS_CONTROL_STATUS0 + 0x200);
+
+		for (i = 0; i < NES_MAX_LINK_CHECK; i++) {
+			pcs_control_status0 = nes_read_indexed(nesdev,
+					NES_IDX_PHY_PCS_CONTROL_STATUS0);
+			pcs_control_status1 = nes_read_indexed(nesdev,
+					NES_IDX_PHY_PCS_CONTROL_STATUS0 + 0x200);
+			if ((0x0F000100 == (pcs_control_status0 & 0x0F000100))
+			    || (0x0F000100 == (pcs_control_status1 & 0x0F000100)))
+				int_cnt++;
+			msleep(1);
+		}
+		if (int_cnt > 1) {
+			spin_lock_irqsave(&nesadapter->phy_lock, flags);
+			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, 0x0000F0C8);
+			mh_detected++;
+			reset_value = nes_read32(nesdev->regs+NES_SOFTWARE_RESET);
+			reset_value |= 0x0000003d;
+			nes_write32(nesdev->regs+NES_SOFTWARE_RESET, reset_value);
+
+			while (((nes_read32(nesdev->regs+NES_SOFTWARE_RESET)
+				& 0x00000040) != 0x00000040) && (j++ < 5000));
+			spin_unlock_irqrestore(&nesadapter->phy_lock, flags);
+
+			pcs_control_status0 = nes_read_indexed(nesdev,
+					NES_IDX_PHY_PCS_CONTROL_STATUS0);
+			pcs_control_status1 = nes_read_indexed(nesdev,
+					NES_IDX_PHY_PCS_CONTROL_STATUS0 + 0x200);
+
+			for (i = 0; i < NES_MAX_LINK_CHECK; i++) {
+				pcs_control_status0 = nes_read_indexed(nesdev,
+					NES_IDX_PHY_PCS_CONTROL_STATUS0);
+				pcs_control_status1 = nes_read_indexed(nesdev,
+					NES_IDX_PHY_PCS_CONTROL_STATUS0 + 0x200);
+				if ((0x0F000100 == (pcs_control_status0 & 0x0F000100))
+					|| (0x0F000100 == (pcs_control_status1 & 0x0F000100))) {
+					if (++ext_cnt > int_cnt) {
+						spin_lock_irqsave(&nesadapter->phy_lock, flags);
+						nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1,
+								0x0000F088);
+						mh_detected++;
+						reset_value = nes_read32(nesdev->regs+NES_SOFTWARE_RESET);
+						reset_value |= 0x0000003d;
+						nes_write32(nesdev->regs+NES_SOFTWARE_RESET, reset_value);
+
+						while (((nes_read32(nesdev->regs+NES_SOFTWARE_RESET)
+							& 0x00000040) != 0x00000040) && (j++ < 5000));
+						spin_unlock_irqrestore(&nesadapter->phy_lock, flags);
+						break;
+					}
+				}
+				msleep(1);
+			}
+		}
+	}
+
+	if (nesadapter->hw_rev == NE020_REV) {
+		init_timer(&nesadapter->mh_timer);
+		nesadapter->mh_timer.function = nes_mh_fix;
+		nesadapter->mh_timer.expires = jiffies + (HZ/5);  /* 1 second */
+		nesadapter->mh_timer.data = (unsigned long)nesdev;
+		add_timer(&nesadapter->mh_timer);
+	} else {
+		nes_write32(nesdev->regs+NES_INTF_INT_STAT, 0x0f000000);
+	}
+
+	init_timer(&nesadapter->lc_timer);
+	nesadapter->lc_timer.function = nes_clc;
+	nesadapter->lc_timer.expires = jiffies + 3600 * HZ;  /* 1 hour */
+	nesadapter->lc_timer.data = (unsigned long)nesdev;
+	add_timer(&nesadapter->lc_timer);
+
+	list_add_tail(&nesadapter->list, &nes_adapter_list);
+
+	for (func_index = 0; func_index < 8; func_index++) {
+		pci_bus_read_config_word(nesdev->pcidev->bus,
+					PCI_DEVFN(PCI_SLOT(nesdev->pcidev->devfn),
+					func_index), 0, &vendor_id);
+		if (vendor_id == 0xffff)
+			break;
+	}
+	nes_debug(NES_DBG_INIT, "%s %d functions found for %s.\n", __func__,
+		func_index, pci_name(nesdev->pcidev));
+	nesadapter->adapter_fcn_count = func_index;
+
+	return nesadapter;
+}
+
+
+/**
+ * nes_reset_adapter_ne020
+ */
+static unsigned int nes_reset_adapter_ne020(struct nes_device *nesdev, u8 *OneG_Mode)
+{
+	u32 port_count;
+	u32 u32temp;
+	u32 i;
+
+	u32temp = nes_read32(nesdev->regs+NES_SOFTWARE_RESET);
+	port_count = ((u32temp & 0x00000300) >> 8) + 1;
+	/* TODO: assuming that both SERDES are set the same for now */
+	*OneG_Mode = (u32temp & 0x00003c00) ? 0 : 1;
+	nes_debug(NES_DBG_INIT, "Initial Software Reset = 0x%08X, port_count=%u\n",
+			u32temp, port_count);
+	if (*OneG_Mode)
+		nes_debug(NES_DBG_INIT, "Running in 1G mode.\n");
+	u32temp &= 0xff00ffc0;
+	switch (port_count) {
+		case 1:
+			u32temp |= 0x00ee0000;
+			break;
+		case 2:
+			u32temp |= 0x00cc0000;
+			break;
+		case 4:
+			u32temp |= 0x00000000;
+			break;
+		default:
+			return 0;
+			break;
+	}
+
+	/* check and do full reset if needed */
+	if (nes_read_indexed(nesdev, NES_IDX_QP_CONTROL+(PCI_FUNC(nesdev->pcidev->devfn)*8))) {
+		nes_debug(NES_DBG_INIT, "Issuing Full Soft reset = 0x%08X\n", u32temp | 0xd);
+		nes_write32(nesdev->regs+NES_SOFTWARE_RESET, u32temp | 0xd);
+
+		i = 0;
+		while (((nes_read32(nesdev->regs+NES_SOFTWARE_RESET) & 0x00000040) == 0) && i++ < 10000)
+			mdelay(1);
+		if (i > 10000) {
+			nes_debug(NES_DBG_INIT, "Did not see full soft reset done.\n");
+			return 0;
+		}
+
+		i = 0;
+		while ((nes_read_indexed(nesdev, NES_IDX_INT_CPU_STATUS) != 0x80) && i++ < 10000)
+			mdelay(1);
+		if (i > 10000) {
+			printk(KERN_ERR PFX "Internal CPU not ready, status = %02X\n",
+			       nes_read_indexed(nesdev, NES_IDX_INT_CPU_STATUS));
+			return 0;
+		}
+	}
+
+	/* port reset */
+	switch (port_count) {
+		case 1:
+			u32temp |= 0x00ee0010;
+			break;
+		case 2:
+			u32temp |= 0x00cc0030;
+			break;
+		case 4:
+			u32temp |= 0x00000030;
+			break;
+	}
+
+	nes_debug(NES_DBG_INIT, "Issuing Port Soft reset = 0x%08X\n", u32temp | 0xd);
+	nes_write32(nesdev->regs+NES_SOFTWARE_RESET, u32temp | 0xd);
+
+	i = 0;
+	while (((nes_read32(nesdev->regs+NES_SOFTWARE_RESET) & 0x00000040) == 0) && i++ < 10000)
+		mdelay(1);
+	if (i > 10000) {
+		nes_debug(NES_DBG_INIT, "Did not see port soft reset done.\n");
+		return 0;
+	}
+
+	/* serdes 0 */
+	i = 0;
+	while (((u32temp = (nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_STATUS0)
+			& 0x0000000f)) != 0x0000000f) && i++ < 5000)
+		mdelay(1);
+	if (i > 5000) {
+		nes_debug(NES_DBG_INIT, "Serdes 0 not ready, status=%x\n", u32temp);
+		return 0;
+	}
+
+	/* serdes 1 */
+	if (port_count > 1) {
+		i = 0;
+		while (((u32temp = (nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_STATUS1)
+				& 0x0000000f)) != 0x0000000f) && i++ < 5000)
+			mdelay(1);
+		if (i > 5000) {
+			nes_debug(NES_DBG_INIT, "Serdes 1 not ready, status=%x\n", u32temp);
+			return 0;
+		}
+	}
+
+	return port_count;
+}
+
+
+/**
+ * nes_init_serdes
+ */
+static int nes_init_serdes(struct nes_device *nesdev, u8 hw_rev, u8 port_count,
+				struct nes_adapter *nesadapter, u8  OneG_Mode)
+{
+	int i;
+	u32 u32temp;
+	u32 sds;
+
+	if (hw_rev != NE020_REV) {
+		/* init serdes 0 */
+		switch (nesadapter->phy_type[0]) {
+		case NES_PHY_TYPE_CX4:
+			if (wide_ppm_offset)
+				nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000FFFAA);
+			else
+				nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000000FF);
+			break;
+		case NES_PHY_TYPE_KR:
+			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000000FF);
+			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP0, 0x00000000);
+			break;
+		case NES_PHY_TYPE_PUMA_1G:
+			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000000FF);
+			sds = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0);
+			sds |= 0x00000100;
+			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0, sds);
+			break;
+		default:
+			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000000FF);
+			break;
+		}
+
+		if (!OneG_Mode)
+			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_HIGHZ_LANE_MODE0, 0x11110000);
+
+		if (port_count < 2)
+			return 0;
+
+		/* init serdes 1 */
+		if (!(OneG_Mode && (nesadapter->phy_type[1] != NES_PHY_TYPE_PUMA_1G)))
+			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL1, 0x000000FF);
+
+		switch (nesadapter->phy_type[1]) {
+		case NES_PHY_TYPE_ARGUS:
+		case NES_PHY_TYPE_SFP_D:
+			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP0, 0x00000000);
+			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP1, 0x00000000);
+			break;
+		case NES_PHY_TYPE_CX4:
+			if (wide_ppm_offset)
+				nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL1, 0x000FFFAA);
+			break;
+		case NES_PHY_TYPE_KR:
+			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP1, 0x00000000);
+			break;
+		case NES_PHY_TYPE_PUMA_1G:
+			sds = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1);
+			sds |= 0x000000100;
+			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, sds);
+		}
+		if (!OneG_Mode) {
+			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_HIGHZ_LANE_MODE1, 0x11110000);
+			sds = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1);
+			sds &= 0xFFFFFFBF;
+			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, sds);
+		}
+	} else {
+		/* init serdes 0 */
+		nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0, 0x00000008);
+		i = 0;
+		while (((u32temp = (nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_STATUS0)
+				& 0x0000000f)) != 0x0000000f) && i++ < 5000)
+			mdelay(1);
+		if (i > 5000) {
+			nes_debug(NES_DBG_PHY, "Init: serdes 0 not ready, status=%x\n", u32temp);
+			return 1;
+		}
+		nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP0, 0x000bdef7);
+		nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_DRIVE0, 0x9ce73000);
+		nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_MODE0, 0x0ff00000);
+		nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_SIGDET0, 0x00000000);
+		nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_BYPASS0, 0x00000000);
+		nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_LOOPBACK_CONTROL0, 0x00000000);
+		if (OneG_Mode)
+			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_EQ_CONTROL0, 0xf0182222);
+		else
+			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_EQ_CONTROL0, 0xf0042222);
+
+		nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000000ff);
+		if (port_count > 1) {
+			/* init serdes 1 */
+			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, 0x00000048);
+			i = 0;
+			while (((u32temp = (nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_STATUS1)
+				& 0x0000000f)) != 0x0000000f) && (i++ < 5000))
+				mdelay(1);
+			if (i > 5000) {
+				printk("%s: Init: serdes 1 not ready, status=%x\n", __func__, u32temp);
+				/* return 1; */
+			}
+			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP1, 0x000bdef7);
+			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_DRIVE1, 0x9ce73000);
+			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_MODE1, 0x0ff00000);
+			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_SIGDET1, 0x00000000);
+			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_BYPASS1, 0x00000000);
+			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_LOOPBACK_CONTROL1, 0x00000000);
+			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_EQ_CONTROL1, 0xf0002222);
+			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL1, 0x000000ff);
+		}
+	}
+	return 0;
+}
+
+
+/**
+ * nes_init_csr_ne020
+ * Initialize registers for ne020 hardware
+ */
+static void nes_init_csr_ne020(struct nes_device *nesdev, u8 hw_rev, u8 port_count)
+{
+	u32 u32temp;
+
+	nes_debug(NES_DBG_INIT, "port_count=%d\n", port_count);
+
+	nes_write_indexed(nesdev, 0x000001E4, 0x00000007);
+	/* nes_write_indexed(nesdev, 0x000001E8, 0x000208C4); */
+	nes_write_indexed(nesdev, 0x000001E8, 0x00020874);
+	nes_write_indexed(nesdev, 0x000001D8, 0x00048002);
+	/* nes_write_indexed(nesdev, 0x000001D8, 0x0004B002); */
+	nes_write_indexed(nesdev, 0x000001FC, 0x00050005);
+	nes_write_indexed(nesdev, 0x00000600, 0x55555555);
+	nes_write_indexed(nesdev, 0x00000604, 0x55555555);
+
+	/* TODO: move these MAC register settings to NIC bringup */
+	nes_write_indexed(nesdev, 0x00002000, 0x00000001);
+	nes_write_indexed(nesdev, 0x00002004, 0x00000001);
+	nes_write_indexed(nesdev, 0x00002008, 0x0000FFFF);
+	nes_write_indexed(nesdev, 0x0000200C, 0x00000001);
+	nes_write_indexed(nesdev, 0x00002010, 0x000003c1);
+	nes_write_indexed(nesdev, 0x0000201C, 0x75345678);
+	if (port_count > 1) {
+		nes_write_indexed(nesdev, 0x00002200, 0x00000001);
+		nes_write_indexed(nesdev, 0x00002204, 0x00000001);
+		nes_write_indexed(nesdev, 0x00002208, 0x0000FFFF);
+		nes_write_indexed(nesdev, 0x0000220C, 0x00000001);
+		nes_write_indexed(nesdev, 0x00002210, 0x000003c1);
+		nes_write_indexed(nesdev, 0x0000221C, 0x75345678);
+		nes_write_indexed(nesdev, 0x00000908, 0x20000001);
+	}
+	if (port_count > 2) {
+		nes_write_indexed(nesdev, 0x00002400, 0x00000001);
+		nes_write_indexed(nesdev, 0x00002404, 0x00000001);
+		nes_write_indexed(nesdev, 0x00002408, 0x0000FFFF);
+		nes_write_indexed(nesdev, 0x0000240C, 0x00000001);
+		nes_write_indexed(nesdev, 0x00002410, 0x000003c1);
+		nes_write_indexed(nesdev, 0x0000241C, 0x75345678);
+		nes_write_indexed(nesdev, 0x00000910, 0x20000001);
+
+		nes_write_indexed(nesdev, 0x00002600, 0x00000001);
+		nes_write_indexed(nesdev, 0x00002604, 0x00000001);
+		nes_write_indexed(nesdev, 0x00002608, 0x0000FFFF);
+		nes_write_indexed(nesdev, 0x0000260C, 0x00000001);
+		nes_write_indexed(nesdev, 0x00002610, 0x000003c1);
+		nes_write_indexed(nesdev, 0x0000261C, 0x75345678);
+		nes_write_indexed(nesdev, 0x00000918, 0x20000001);
+	}
+
+	nes_write_indexed(nesdev, 0x00005000, 0x00018000);
+	/* nes_write_indexed(nesdev, 0x00005000, 0x00010000); */
+	nes_write_indexed(nesdev, NES_IDX_WQM_CONFIG1, (wqm_quanta << 1) |
+							 0x00000001);
+	nes_write_indexed(nesdev, 0x00005008, 0x1F1F1F1F);
+	nes_write_indexed(nesdev, 0x00005010, 0x1F1F1F1F);
+	nes_write_indexed(nesdev, 0x00005018, 0x1F1F1F1F);
+	nes_write_indexed(nesdev, 0x00005020, 0x1F1F1F1F);
+	nes_write_indexed(nesdev, 0x00006090, 0xFFFFFFFF);
+
+	/* TODO: move this to code, get from EEPROM */
+	nes_write_indexed(nesdev, 0x00000900, 0x20000001);
+	nes_write_indexed(nesdev, 0x000060C0, 0x0000028e);
+	nes_write_indexed(nesdev, 0x000060C8, 0x00000020);
+
+	nes_write_indexed(nesdev, 0x000001EC, 0x7b2625a0);
+	/* nes_write_indexed(nesdev, 0x000001EC, 0x5f2625a0); */
+
+	if (hw_rev != NE020_REV) {
+		u32temp = nes_read_indexed(nesdev, 0x000008e8);
+		u32temp |= 0x80000000;
+		nes_write_indexed(nesdev, 0x000008e8, u32temp);
+		u32temp = nes_read_indexed(nesdev, 0x000021f8);
+		u32temp &= 0x7fffffff;
+		u32temp |= 0x7fff0010;
+		nes_write_indexed(nesdev, 0x000021f8, u32temp);
+		if (port_count > 1) {
+			u32temp = nes_read_indexed(nesdev, 0x000023f8);
+			u32temp &= 0x7fffffff;
+			u32temp |= 0x7fff0010;
+			nes_write_indexed(nesdev, 0x000023f8, u32temp);
+		}
+	}
+}
+
+
+/**
+ * nes_destroy_adapter - destroy the adapter structure
+ */
+void nes_destroy_adapter(struct nes_adapter *nesadapter)
+{
+	struct nes_adapter *tmp_adapter;
+
+	list_for_each_entry(tmp_adapter, &nes_adapter_list, list) {
+		nes_debug(NES_DBG_SHUTDOWN, "Nes Adapter list entry = 0x%p.\n",
+				tmp_adapter);
+	}
+
+	nesadapter->ref_count--;
+	if (!nesadapter->ref_count) {
+		if (nesadapter->hw_rev == NE020_REV) {
+			del_timer(&nesadapter->mh_timer);
+		}
+		del_timer(&nesadapter->lc_timer);
+
+		list_del(&nesadapter->list);
+		kfree(nesadapter);
+	}
+}
+
+
+/**
+ * nes_init_cqp
+ */
+int nes_init_cqp(struct nes_device *nesdev)
+{
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	struct nes_hw_cqp_qp_context *cqp_qp_context;
+	struct nes_hw_cqp_wqe *cqp_wqe;
+	struct nes_hw_ceq *ceq;
+	struct nes_hw_ceq *nic_ceq;
+	struct nes_hw_aeq *aeq;
+	void *vmem;
+	dma_addr_t pmem;
+	u32 count=0;
+	u32 cqp_head;
+	u64 u64temp;
+	u32 u32temp;
+
+	/* allocate CQP memory */
+	/* Need to add max_cq to the aeq size once cq overflow checking is added back */
+	/* SQ is 512 byte aligned, others are 256 byte aligned */
+	nesdev->cqp_mem_size = 512 +
+			(sizeof(struct nes_hw_cqp_wqe) * NES_CQP_SQ_SIZE) +
+			(sizeof(struct nes_hw_cqe) * NES_CCQ_SIZE) +
+			max(((u32)sizeof(struct nes_hw_ceqe) * NES_CCEQ_SIZE), (u32)256) +
+			max(((u32)sizeof(struct nes_hw_ceqe) * NES_NIC_CEQ_SIZE), (u32)256) +
+			(sizeof(struct nes_hw_aeqe) * nesadapter->max_qp) +
+			sizeof(struct nes_hw_cqp_qp_context);
+
+	nesdev->cqp_vbase = pci_zalloc_consistent(nesdev->pcidev,
+						  nesdev->cqp_mem_size,
+						  &nesdev->cqp_pbase);
+	if (!nesdev->cqp_vbase) {
+		nes_debug(NES_DBG_INIT, "Unable to allocate memory for host descriptor rings\n");
+		return -ENOMEM;
+	}
+
+	/* Allocate a twice the number of CQP requests as the SQ size */
+	nesdev->nes_cqp_requests = kzalloc(sizeof(struct nes_cqp_request) *
+			2 * NES_CQP_SQ_SIZE, GFP_KERNEL);
+	if (nesdev->nes_cqp_requests == NULL) {
+		nes_debug(NES_DBG_INIT, "Unable to allocate memory CQP request entries.\n");
+		pci_free_consistent(nesdev->pcidev, nesdev->cqp_mem_size, nesdev->cqp.sq_vbase,
+				nesdev->cqp.sq_pbase);
+		return -ENOMEM;
+	}
+
+	nes_debug(NES_DBG_INIT, "Allocated CQP structures at %p (phys = %016lX), size = %u.\n",
+			nesdev->cqp_vbase, (unsigned long)nesdev->cqp_pbase, nesdev->cqp_mem_size);
+
+	spin_lock_init(&nesdev->cqp.lock);
+	init_waitqueue_head(&nesdev->cqp.waitq);
+
+	/* Setup Various Structures */
+	vmem = (void *)(((unsigned long)nesdev->cqp_vbase + (512 - 1)) &
+			~(unsigned long)(512 - 1));
+	pmem = (dma_addr_t)(((unsigned long long)nesdev->cqp_pbase + (512 - 1)) &
+			~(unsigned long long)(512 - 1));
+
+	nesdev->cqp.sq_vbase = vmem;
+	nesdev->cqp.sq_pbase = pmem;
+	nesdev->cqp.sq_size = NES_CQP_SQ_SIZE;
+	nesdev->cqp.sq_head = 0;
+	nesdev->cqp.sq_tail = 0;
+	nesdev->cqp.qp_id = PCI_FUNC(nesdev->pcidev->devfn);
+
+	vmem += (sizeof(struct nes_hw_cqp_wqe) * nesdev->cqp.sq_size);
+	pmem += (sizeof(struct nes_hw_cqp_wqe) * nesdev->cqp.sq_size);
+
+	nesdev->ccq.cq_vbase = vmem;
+	nesdev->ccq.cq_pbase = pmem;
+	nesdev->ccq.cq_size = NES_CCQ_SIZE;
+	nesdev->ccq.cq_head = 0;
+	nesdev->ccq.ce_handler = nes_cqp_ce_handler;
+	nesdev->ccq.cq_number = PCI_FUNC(nesdev->pcidev->devfn);
+
+	vmem += (sizeof(struct nes_hw_cqe) * nesdev->ccq.cq_size);
+	pmem += (sizeof(struct nes_hw_cqe) * nesdev->ccq.cq_size);
+
+	nesdev->ceq_index = PCI_FUNC(nesdev->pcidev->devfn);
+	ceq = &nesadapter->ceq[nesdev->ceq_index];
+	ceq->ceq_vbase = vmem;
+	ceq->ceq_pbase = pmem;
+	ceq->ceq_size = NES_CCEQ_SIZE;
+	ceq->ceq_head = 0;
+
+	vmem += max(((u32)sizeof(struct nes_hw_ceqe) * ceq->ceq_size), (u32)256);
+	pmem += max(((u32)sizeof(struct nes_hw_ceqe) * ceq->ceq_size), (u32)256);
+
+	nesdev->nic_ceq_index = PCI_FUNC(nesdev->pcidev->devfn) + 8;
+	nic_ceq = &nesadapter->ceq[nesdev->nic_ceq_index];
+	nic_ceq->ceq_vbase = vmem;
+	nic_ceq->ceq_pbase = pmem;
+	nic_ceq->ceq_size = NES_NIC_CEQ_SIZE;
+	nic_ceq->ceq_head = 0;
+
+	vmem += max(((u32)sizeof(struct nes_hw_ceqe) * nic_ceq->ceq_size), (u32)256);
+	pmem += max(((u32)sizeof(struct nes_hw_ceqe) * nic_ceq->ceq_size), (u32)256);
+
+	aeq = &nesadapter->aeq[PCI_FUNC(nesdev->pcidev->devfn)];
+	aeq->aeq_vbase = vmem;
+	aeq->aeq_pbase = pmem;
+	aeq->aeq_size = nesadapter->max_qp;
+	aeq->aeq_head = 0;
+
+	/* Setup QP Context */
+	vmem += (sizeof(struct nes_hw_aeqe) * aeq->aeq_size);
+	pmem += (sizeof(struct nes_hw_aeqe) * aeq->aeq_size);
+
+	cqp_qp_context = vmem;
+	cqp_qp_context->context_words[0] =
+			cpu_to_le32((PCI_FUNC(nesdev->pcidev->devfn) << 12) + (2 << 10));
+	cqp_qp_context->context_words[1] = 0;
+	cqp_qp_context->context_words[2] = cpu_to_le32((u32)nesdev->cqp.sq_pbase);
+	cqp_qp_context->context_words[3] = cpu_to_le32(((u64)nesdev->cqp.sq_pbase) >> 32);
+
+
+	/* Write the address to Create CQP */
+	if ((sizeof(dma_addr_t) > 4)) {
+		nes_write_indexed(nesdev,
+				NES_IDX_CREATE_CQP_HIGH + (PCI_FUNC(nesdev->pcidev->devfn) * 8),
+				((u64)pmem) >> 32);
+	} else {
+		nes_write_indexed(nesdev,
+				NES_IDX_CREATE_CQP_HIGH + (PCI_FUNC(nesdev->pcidev->devfn) * 8), 0);
+	}
+	nes_write_indexed(nesdev,
+			NES_IDX_CREATE_CQP_LOW + (PCI_FUNC(nesdev->pcidev->devfn) * 8),
+			(u32)pmem);
+
+	INIT_LIST_HEAD(&nesdev->cqp_avail_reqs);
+	INIT_LIST_HEAD(&nesdev->cqp_pending_reqs);
+
+	for (count = 0; count < 2*NES_CQP_SQ_SIZE; count++) {
+		init_waitqueue_head(&nesdev->nes_cqp_requests[count].waitq);
+		list_add_tail(&nesdev->nes_cqp_requests[count].list, &nesdev->cqp_avail_reqs);
+	}
+
+	/* Write Create CCQ WQE */
+	cqp_head = nesdev->cqp.sq_head++;
+	cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
+	nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX,
+			(NES_CQP_CREATE_CQ | NES_CQP_CQ_CEQ_VALID |
+			NES_CQP_CQ_CHK_OVERFLOW | ((u32)nesdev->ccq.cq_size << 16)));
+	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX,
+			    (nesdev->ccq.cq_number |
+			     ((u32)nesdev->ceq_index << 16)));
+	u64temp = (u64)nesdev->ccq.cq_pbase;
+	set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_PBL_LOW_IDX, u64temp);
+	cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] = 0;
+	u64temp = (unsigned long)&nesdev->ccq;
+	cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_LOW_IDX] =
+			cpu_to_le32((u32)(u64temp >> 1));
+	cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] =
+			cpu_to_le32(((u32)((u64temp) >> 33)) & 0x7FFFFFFF);
+	cqp_wqe->wqe_words[NES_CQP_CQ_WQE_DOORBELL_INDEX_HIGH_IDX] = 0;
+
+	/* Write Create CEQ WQE */
+	cqp_head = nesdev->cqp.sq_head++;
+	cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
+	nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX,
+			    (NES_CQP_CREATE_CEQ + ((u32)nesdev->ceq_index << 8)));
+	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_CEQ_WQE_ELEMENT_COUNT_IDX, ceq->ceq_size);
+	u64temp = (u64)ceq->ceq_pbase;
+	set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_PBL_LOW_IDX, u64temp);
+
+	/* Write Create AEQ WQE */
+	cqp_head = nesdev->cqp.sq_head++;
+	cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
+	nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX,
+			(NES_CQP_CREATE_AEQ + ((u32)PCI_FUNC(nesdev->pcidev->devfn) << 8)));
+	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_AEQ_WQE_ELEMENT_COUNT_IDX, aeq->aeq_size);
+	u64temp = (u64)aeq->aeq_pbase;
+	set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_PBL_LOW_IDX, u64temp);
+
+	/* Write Create NIC CEQ WQE */
+	cqp_head = nesdev->cqp.sq_head++;
+	cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
+	nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX,
+			(NES_CQP_CREATE_CEQ + ((u32)nesdev->nic_ceq_index << 8)));
+	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_CEQ_WQE_ELEMENT_COUNT_IDX, nic_ceq->ceq_size);
+	u64temp = (u64)nic_ceq->ceq_pbase;
+	set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_PBL_LOW_IDX, u64temp);
+
+	/* Poll until CCQP done */
+	count = 0;
+	do {
+		if (count++ > 1000) {
+			printk(KERN_ERR PFX "Error creating CQP\n");
+			pci_free_consistent(nesdev->pcidev, nesdev->cqp_mem_size,
+					nesdev->cqp_vbase, nesdev->cqp_pbase);
+			return -1;
+		}
+		udelay(10);
+	} while (!(nes_read_indexed(nesdev,
+			NES_IDX_QP_CONTROL + (PCI_FUNC(nesdev->pcidev->devfn) * 8)) & (1 << 8)));
+
+	nes_debug(NES_DBG_INIT, "CQP Status = 0x%08X\n", nes_read_indexed(nesdev,
+			NES_IDX_QP_CONTROL+(PCI_FUNC(nesdev->pcidev->devfn)*8)));
+
+	u32temp = 0x04800000;
+	nes_write32(nesdev->regs+NES_WQE_ALLOC, u32temp | nesdev->cqp.qp_id);
+
+	/* wait for the CCQ, CEQ, and AEQ to get created */
+	count = 0;
+	do {
+		if (count++ > 1000) {
+			printk(KERN_ERR PFX "Error creating CCQ, CEQ, and AEQ\n");
+			pci_free_consistent(nesdev->pcidev, nesdev->cqp_mem_size,
+					nesdev->cqp_vbase, nesdev->cqp_pbase);
+			return -1;
+		}
+		udelay(10);
+	} while (((nes_read_indexed(nesdev,
+			NES_IDX_QP_CONTROL+(PCI_FUNC(nesdev->pcidev->devfn)*8)) & (15<<8)) != (15<<8)));
+
+	/* dump the QP status value */
+	nes_debug(NES_DBG_INIT, "QP Status = 0x%08X\n", nes_read_indexed(nesdev,
+			NES_IDX_QP_CONTROL+(PCI_FUNC(nesdev->pcidev->devfn)*8)));
+
+	nesdev->cqp.sq_tail++;
+
+	return 0;
+}
+
+
+/**
+ * nes_destroy_cqp
+ */
+int nes_destroy_cqp(struct nes_device *nesdev)
+{
+	struct nes_hw_cqp_wqe *cqp_wqe;
+	u32 count = 0;
+	u32 cqp_head;
+	unsigned long flags;
+
+	do {
+		if (count++ > 1000)
+			break;
+		udelay(10);
+	} while (!(nesdev->cqp.sq_head == nesdev->cqp.sq_tail));
+
+	/* Reset CCQ */
+	nes_write32(nesdev->regs+NES_CQE_ALLOC, NES_CQE_ALLOC_RESET |
+			nesdev->ccq.cq_number);
+
+	/* Disable device interrupts */
+	nes_write32(nesdev->regs+NES_INT_MASK, 0x7fffffff);
+
+	spin_lock_irqsave(&nesdev->cqp.lock, flags);
+
+	/* Destroy the AEQ */
+	cqp_head = nesdev->cqp.sq_head++;
+	nesdev->cqp.sq_head &= nesdev->cqp.sq_size-1;
+	cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
+	cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(NES_CQP_DESTROY_AEQ |
+			((u32)PCI_FUNC(nesdev->pcidev->devfn) << 8));
+	cqp_wqe->wqe_words[NES_CQP_WQE_COMP_CTX_HIGH_IDX] = 0;
+
+	/* Destroy the NIC CEQ */
+	cqp_head = nesdev->cqp.sq_head++;
+	nesdev->cqp.sq_head &= nesdev->cqp.sq_size-1;
+	cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
+	cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(NES_CQP_DESTROY_CEQ |
+			((u32)nesdev->nic_ceq_index << 8));
+
+	/* Destroy the CEQ */
+	cqp_head = nesdev->cqp.sq_head++;
+	nesdev->cqp.sq_head &= nesdev->cqp.sq_size-1;
+	cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
+	cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(NES_CQP_DESTROY_CEQ |
+			(nesdev->ceq_index << 8));
+
+	/* Destroy the CCQ */
+	cqp_head = nesdev->cqp.sq_head++;
+	nesdev->cqp.sq_head &= nesdev->cqp.sq_size-1;
+	cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
+	cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(NES_CQP_DESTROY_CQ);
+	cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32(nesdev->ccq.cq_number |
+			((u32)nesdev->ceq_index << 16));
+
+	/* Destroy CQP */
+	cqp_head = nesdev->cqp.sq_head++;
+	nesdev->cqp.sq_head &= nesdev->cqp.sq_size-1;
+	cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
+	cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(NES_CQP_DESTROY_QP |
+			NES_CQP_QP_TYPE_CQP);
+	cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32(nesdev->cqp.qp_id);
+
+	barrier();
+	/* Ring doorbell (5 WQEs) */
+	nes_write32(nesdev->regs+NES_WQE_ALLOC, 0x05800000 | nesdev->cqp.qp_id);
+
+	spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
+
+	/* wait for the CCQ, CEQ, and AEQ to get destroyed */
+	count = 0;
+	do {
+		if (count++ > 1000) {
+			printk(KERN_ERR PFX "Function%d: Error destroying CCQ, CEQ, and AEQ\n",
+					PCI_FUNC(nesdev->pcidev->devfn));
+			break;
+		}
+		udelay(10);
+	} while (((nes_read_indexed(nesdev,
+			NES_IDX_QP_CONTROL + (PCI_FUNC(nesdev->pcidev->devfn)*8)) & (15 << 8)) != 0));
+
+	/* dump the QP status value */
+	nes_debug(NES_DBG_SHUTDOWN, "Function%d: QP Status = 0x%08X\n",
+			PCI_FUNC(nesdev->pcidev->devfn),
+			nes_read_indexed(nesdev,
+			NES_IDX_QP_CONTROL+(PCI_FUNC(nesdev->pcidev->devfn)*8)));
+
+	kfree(nesdev->nes_cqp_requests);
+
+	/* Free the control structures */
+	pci_free_consistent(nesdev->pcidev, nesdev->cqp_mem_size, nesdev->cqp.sq_vbase,
+			nesdev->cqp.sq_pbase);
+
+	return 0;
+}
+
+
+/**
+ * nes_init_1g_phy
+ */
+static int nes_init_1g_phy(struct nes_device *nesdev, u8 phy_type, u8 phy_index)
+{
+	u32 counter = 0;
+	u16 phy_data;
+	int ret = 0;
+
+	nes_read_1G_phy_reg(nesdev, 1, phy_index, &phy_data);
+	nes_write_1G_phy_reg(nesdev, 23, phy_index, 0xb000);
+
+	/* Reset the PHY */
+	nes_write_1G_phy_reg(nesdev, 0, phy_index, 0x8000);
+	udelay(100);
+	counter = 0;
+	do {
+		nes_read_1G_phy_reg(nesdev, 0, phy_index, &phy_data);
+		if (counter++ > 100) {
+			ret = -1;
+			break;
+		}
+	} while (phy_data & 0x8000);
+
+	/* Setting no phy loopback */
+	phy_data &= 0xbfff;
+	phy_data |= 0x1140;
+	nes_write_1G_phy_reg(nesdev, 0, phy_index,  phy_data);
+	nes_read_1G_phy_reg(nesdev, 0, phy_index, &phy_data);
+	nes_read_1G_phy_reg(nesdev, 0x17, phy_index, &phy_data);
+	nes_read_1G_phy_reg(nesdev, 0x1e, phy_index, &phy_data);
+
+	/* Setting the interrupt mask */
+	nes_read_1G_phy_reg(nesdev, 0x19, phy_index, &phy_data);
+	nes_write_1G_phy_reg(nesdev, 0x19, phy_index, 0xffee);
+	nes_read_1G_phy_reg(nesdev, 0x19, phy_index, &phy_data);
+
+	/* turning on flow control */
+	nes_read_1G_phy_reg(nesdev, 4, phy_index, &phy_data);
+	nes_write_1G_phy_reg(nesdev, 4, phy_index, (phy_data & ~(0x03E0)) | 0xc00);
+	nes_read_1G_phy_reg(nesdev, 4, phy_index, &phy_data);
+
+	/* Clear Half duplex */
+	nes_read_1G_phy_reg(nesdev, 9, phy_index, &phy_data);
+	nes_write_1G_phy_reg(nesdev, 9, phy_index, phy_data & ~(0x0100));
+	nes_read_1G_phy_reg(nesdev, 9, phy_index, &phy_data);
+
+	nes_read_1G_phy_reg(nesdev, 0, phy_index, &phy_data);
+	nes_write_1G_phy_reg(nesdev, 0, phy_index, phy_data | 0x0300);
+
+	return ret;
+}
+
+
+/**
+ * nes_init_2025_phy
+ */
+static int nes_init_2025_phy(struct nes_device *nesdev, u8 phy_type, u8 phy_index)
+{
+	u32 temp_phy_data = 0;
+	u32 temp_phy_data2 = 0;
+	u32 counter = 0;
+	u32 sds;
+	u32 mac_index = nesdev->mac_index;
+	int ret = 0;
+	unsigned int first_attempt = 1;
+
+	/* Check firmware heartbeat */
+	nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7ee);
+	temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
+	udelay(1500);
+	nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7ee);
+	temp_phy_data2 = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
+
+	if (temp_phy_data != temp_phy_data2) {
+		nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7fd);
+		temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
+		if ((temp_phy_data & 0xff) > 0x20)
+			return 0;
+		printk(PFX "Reinitialize external PHY\n");
+	}
+
+	/* no heartbeat, configure the PHY */
+	nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0x0000, 0x8000);
+	nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc300, 0x0000);
+	nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc316, 0x000A);
+	nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc318, 0x0052);
+
+	switch (phy_type) {
+	case NES_PHY_TYPE_ARGUS:
+		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc316, 0x000A);
+		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc318, 0x0052);
+		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc302, 0x000C);
+		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc319, 0x0008);
+		nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0027, 0x0001);
+		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc31a, 0x0098);
+		nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0026, 0x0E00);
+
+		/* setup LEDs */
+		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd006, 0x0007);
+		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd007, 0x000A);
+		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd008, 0x0009);
+		break;
+
+	case NES_PHY_TYPE_SFP_D:
+		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc316, 0x000A);
+		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc318, 0x0052);
+		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc302, 0x0004);
+		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc319, 0x0038);
+		nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0027, 0x0013);
+		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc31a, 0x0098);
+		nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0026, 0x0E00);
+
+		/* setup LEDs */
+		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd006, 0x0007);
+		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd007, 0x000A);
+		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd008, 0x0009);
+		break;
+
+	case NES_PHY_TYPE_KR:
+		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc316, 0x000A);
+		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc318, 0x0052);
+		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc302, 0x000C);
+		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc319, 0x0010);
+		nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0027, 0x0013);
+		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc31a, 0x0080);
+		nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0026, 0x0E00);
+
+		/* setup LEDs */
+		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd006, 0x000B);
+		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd007, 0x0003);
+		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd008, 0x0004);
+
+		nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0022, 0x406D);
+		nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0023, 0x0020);
+		break;
+	}
+
+	nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0028, 0xA528);
+
+	/* Bring PHY out of reset */
+	nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc300, 0x0002);
+
+	/* Check for heartbeat */
+	counter = 0;
+	mdelay(690);
+	nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7ee);
+	temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
+	do {
+		if (counter++ > 150) {
+			printk(PFX "No PHY heartbeat\n");
+			break;
+		}
+		mdelay(1);
+		nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7ee);
+		temp_phy_data2 = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
+	} while ((temp_phy_data2 == temp_phy_data));
+
+	/* wait for tracking */
+	counter = 0;
+	do {
+		nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7fd);
+		temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
+		if (counter++ > 300) {
+			if (((temp_phy_data & 0xff) == 0x0) && first_attempt) {
+				first_attempt = 0;
+				counter = 0;
+				/* reset AMCC PHY and try again */
+				nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0xe854, 0x00c0);
+				nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0xe854, 0x0040);
+				continue;
+			} else {
+				ret = 1;
+				break;
+			}
+		}
+		mdelay(10);
+	} while ((temp_phy_data & 0xff) < 0x30);
+
+	/* setup signal integrity */
+	nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd003, 0x0000);
+	nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xF00D, 0x00FE);
+	nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xF00E, 0x0032);
+	if (phy_type == NES_PHY_TYPE_KR) {
+		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xF00F, 0x000C);
+	} else {
+		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xF00F, 0x0002);
+		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc314, 0x0063);
+	}
+
+	/* reset serdes */
+	sds = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0 + mac_index * 0x200);
+	sds |= 0x1;
+	nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0 + mac_index * 0x200, sds);
+	sds &= 0xfffffffe;
+	nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0 + mac_index * 0x200, sds);
+
+	counter = 0;
+	while (((nes_read32(nesdev->regs + NES_SOFTWARE_RESET) & 0x00000040) != 0x00000040)
+			&& (counter++ < 5000))
+		;
+
+	return ret;
+}
+
+
+/**
+ * nes_init_phy
+ */
+int nes_init_phy(struct nes_device *nesdev)
+{
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	u32 mac_index = nesdev->mac_index;
+	u32 tx_config = 0;
+	unsigned long flags;
+	u8  phy_type = nesadapter->phy_type[mac_index];
+	u8  phy_index = nesadapter->phy_index[mac_index];
+	int ret = 0;
+
+	tx_config = nes_read_indexed(nesdev, NES_IDX_MAC_TX_CONFIG);
+	if (phy_type == NES_PHY_TYPE_1G) {
+		/* setup 1G MDIO operation */
+		tx_config &= 0xFFFFFFE3;
+		tx_config |= 0x04;
+	} else {
+		/* setup 10G MDIO operation */
+		tx_config &= 0xFFFFFFE3;
+		tx_config |= 0x1D;
+	}
+	nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONFIG, tx_config);
+
+	spin_lock_irqsave(&nesdev->nesadapter->phy_lock, flags);
+
+	switch (phy_type) {
+	case NES_PHY_TYPE_1G:
+		ret = nes_init_1g_phy(nesdev, phy_type, phy_index);
+		break;
+	case NES_PHY_TYPE_ARGUS:
+	case NES_PHY_TYPE_SFP_D:
+	case NES_PHY_TYPE_KR:
+		ret = nes_init_2025_phy(nesdev, phy_type, phy_index);
+		break;
+	}
+
+	spin_unlock_irqrestore(&nesdev->nesadapter->phy_lock, flags);
+
+	return ret;
+}
+
+
+/**
+ * nes_replenish_nic_rq
+ */
+static void nes_replenish_nic_rq(struct nes_vnic *nesvnic)
+{
+	unsigned long flags;
+	dma_addr_t bus_address;
+	struct sk_buff *skb;
+	struct nes_hw_nic_rq_wqe *nic_rqe;
+	struct nes_hw_nic *nesnic;
+	struct nes_device *nesdev;
+	struct nes_rskb_cb *cb;
+	u32 rx_wqes_posted = 0;
+
+	nesnic = &nesvnic->nic;
+	nesdev = nesvnic->nesdev;
+	spin_lock_irqsave(&nesnic->rq_lock, flags);
+	if (nesnic->replenishing_rq !=0) {
+		if (((nesnic->rq_size-1) == atomic_read(&nesvnic->rx_skbs_needed)) &&
+				(atomic_read(&nesvnic->rx_skb_timer_running) == 0)) {
+			atomic_set(&nesvnic->rx_skb_timer_running, 1);
+			spin_unlock_irqrestore(&nesnic->rq_lock, flags);
+			nesvnic->rq_wqes_timer.expires = jiffies + (HZ/2);	/* 1/2 second */
+			add_timer(&nesvnic->rq_wqes_timer);
+		} else
+		spin_unlock_irqrestore(&nesnic->rq_lock, flags);
+		return;
+	}
+	nesnic->replenishing_rq = 1;
+	spin_unlock_irqrestore(&nesnic->rq_lock, flags);
+	do {
+		skb = dev_alloc_skb(nesvnic->max_frame_size);
+		if (skb) {
+			skb->dev = nesvnic->netdev;
+
+			bus_address = pci_map_single(nesdev->pcidev,
+					skb->data, nesvnic->max_frame_size, PCI_DMA_FROMDEVICE);
+			cb = (struct nes_rskb_cb *)&skb->cb[0];
+			cb->busaddr = bus_address;
+			cb->maplen = nesvnic->max_frame_size;
+
+			nic_rqe = &nesnic->rq_vbase[nesvnic->nic.rq_head];
+			nic_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_1_0_IDX] =
+					cpu_to_le32(nesvnic->max_frame_size);
+			nic_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_3_2_IDX] = 0;
+			nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_LOW_IDX] =
+					cpu_to_le32((u32)bus_address);
+			nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_HIGH_IDX] =
+					cpu_to_le32((u32)((u64)bus_address >> 32));
+			nesnic->rx_skb[nesnic->rq_head] = skb;
+			nesnic->rq_head++;
+			nesnic->rq_head &= nesnic->rq_size - 1;
+			atomic_dec(&nesvnic->rx_skbs_needed);
+			barrier();
+			if (++rx_wqes_posted == 255) {
+				nes_write32(nesdev->regs+NES_WQE_ALLOC, (rx_wqes_posted << 24) | nesnic->qp_id);
+				rx_wqes_posted = 0;
+			}
+		} else {
+			spin_lock_irqsave(&nesnic->rq_lock, flags);
+			if (((nesnic->rq_size-1) == atomic_read(&nesvnic->rx_skbs_needed)) &&
+					(atomic_read(&nesvnic->rx_skb_timer_running) == 0)) {
+				atomic_set(&nesvnic->rx_skb_timer_running, 1);
+				spin_unlock_irqrestore(&nesnic->rq_lock, flags);
+				nesvnic->rq_wqes_timer.expires = jiffies + (HZ/2);	/* 1/2 second */
+				add_timer(&nesvnic->rq_wqes_timer);
+			} else
+				spin_unlock_irqrestore(&nesnic->rq_lock, flags);
+			break;
+		}
+	} while (atomic_read(&nesvnic->rx_skbs_needed));
+	barrier();
+	if (rx_wqes_posted)
+		nes_write32(nesdev->regs+NES_WQE_ALLOC, (rx_wqes_posted << 24) | nesnic->qp_id);
+	nesnic->replenishing_rq = 0;
+}
+
+
+/**
+ * nes_rq_wqes_timeout
+ */
+static void nes_rq_wqes_timeout(unsigned long parm)
+{
+	struct nes_vnic *nesvnic = (struct nes_vnic *)parm;
+	printk("%s: Timer fired.\n", __func__);
+	atomic_set(&nesvnic->rx_skb_timer_running, 0);
+	if (atomic_read(&nesvnic->rx_skbs_needed))
+		nes_replenish_nic_rq(nesvnic);
+}
+
+
+static int nes_lro_get_skb_hdr(struct sk_buff *skb, void **iphdr,
+			       void **tcph, u64 *hdr_flags, void *priv)
+{
+	unsigned int ip_len;
+	struct iphdr *iph;
+	skb_reset_network_header(skb);
+	iph = ip_hdr(skb);
+	if (iph->protocol != IPPROTO_TCP)
+		return -1;
+	ip_len = ip_hdrlen(skb);
+	skb_set_transport_header(skb, ip_len);
+	*tcph = tcp_hdr(skb);
+
+	*hdr_flags = LRO_IPV4 | LRO_TCP;
+	*iphdr = iph;
+	return 0;
+}
+
+
+/**
+ * nes_init_nic_qp
+ */
+int nes_init_nic_qp(struct nes_device *nesdev, struct net_device *netdev)
+{
+	struct nes_hw_cqp_wqe *cqp_wqe;
+	struct nes_hw_nic_sq_wqe *nic_sqe;
+	struct nes_hw_nic_qp_context *nic_context;
+	struct sk_buff *skb;
+	struct nes_hw_nic_rq_wqe *nic_rqe;
+	struct nes_vnic *nesvnic = netdev_priv(netdev);
+	unsigned long flags;
+	void *vmem;
+	dma_addr_t pmem;
+	u64 u64temp;
+	int ret;
+	u32 cqp_head;
+	u32 counter;
+	u32 wqe_count;
+	struct nes_rskb_cb *cb;
+	u8 jumbomode=0;
+
+	/* Allocate fragment, SQ, RQ, and CQ; Reuse CEQ based on the PCI function */
+	nesvnic->nic_mem_size = 256 +
+			(NES_NIC_WQ_SIZE * sizeof(struct nes_first_frag)) +
+			(NES_NIC_WQ_SIZE * sizeof(struct nes_hw_nic_sq_wqe)) +
+			(NES_NIC_WQ_SIZE * sizeof(struct nes_hw_nic_rq_wqe)) +
+			(NES_NIC_WQ_SIZE * 2 * sizeof(struct nes_hw_nic_cqe)) +
+			sizeof(struct nes_hw_nic_qp_context);
+
+	nesvnic->nic_vbase = pci_zalloc_consistent(nesdev->pcidev,
+						   nesvnic->nic_mem_size,
+						   &nesvnic->nic_pbase);
+	if (!nesvnic->nic_vbase) {
+		nes_debug(NES_DBG_INIT, "Unable to allocate memory for NIC host descriptor rings\n");
+		return -ENOMEM;
+	}
+	nes_debug(NES_DBG_INIT, "Allocated NIC QP structures at %p (phys = %016lX), size = %u.\n",
+			nesvnic->nic_vbase, (unsigned long)nesvnic->nic_pbase, nesvnic->nic_mem_size);
+
+	vmem = (void *)(((unsigned long)nesvnic->nic_vbase + (256 - 1)) &
+			~(unsigned long)(256 - 1));
+	pmem = (dma_addr_t)(((unsigned long long)nesvnic->nic_pbase + (256 - 1)) &
+			~(unsigned long long)(256 - 1));
+
+	/* Setup the first Fragment buffers */
+	nesvnic->nic.first_frag_vbase = vmem;
+
+	for (counter = 0; counter < NES_NIC_WQ_SIZE; counter++) {
+		nesvnic->nic.frag_paddr[counter] = pmem;
+		pmem += sizeof(struct nes_first_frag);
+	}
+
+	/* setup the SQ */
+	vmem += (NES_NIC_WQ_SIZE * sizeof(struct nes_first_frag));
+
+	nesvnic->nic.sq_vbase = (void *)vmem;
+	nesvnic->nic.sq_pbase = pmem;
+	nesvnic->nic.sq_head = 0;
+	nesvnic->nic.sq_tail = 0;
+	nesvnic->nic.sq_size = NES_NIC_WQ_SIZE;
+	for (counter = 0; counter < NES_NIC_WQ_SIZE; counter++) {
+		nic_sqe = &nesvnic->nic.sq_vbase[counter];
+		nic_sqe->wqe_words[NES_NIC_SQ_WQE_MISC_IDX] =
+				cpu_to_le32(NES_NIC_SQ_WQE_DISABLE_CHKSUM |
+				NES_NIC_SQ_WQE_COMPLETION);
+		nic_sqe->wqe_words[NES_NIC_SQ_WQE_LENGTH_0_TAG_IDX] =
+				cpu_to_le32((u32)NES_FIRST_FRAG_SIZE << 16);
+		nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_LOW_IDX] =
+				cpu_to_le32((u32)nesvnic->nic.frag_paddr[counter]);
+		nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_HIGH_IDX] =
+				cpu_to_le32((u32)((u64)nesvnic->nic.frag_paddr[counter] >> 32));
+	}
+
+	nesvnic->get_cqp_request = nes_get_cqp_request;
+	nesvnic->post_cqp_request = nes_post_cqp_request;
+	nesvnic->mcrq_mcast_filter = NULL;
+
+	spin_lock_init(&nesvnic->nic.rq_lock);
+
+	/* setup the RQ */
+	vmem += (NES_NIC_WQ_SIZE * sizeof(struct nes_hw_nic_sq_wqe));
+	pmem += (NES_NIC_WQ_SIZE * sizeof(struct nes_hw_nic_sq_wqe));
+
+
+	nesvnic->nic.rq_vbase = vmem;
+	nesvnic->nic.rq_pbase = pmem;
+	nesvnic->nic.rq_head = 0;
+	nesvnic->nic.rq_tail = 0;
+	nesvnic->nic.rq_size = NES_NIC_WQ_SIZE;
+
+	/* setup the CQ */
+	vmem += (NES_NIC_WQ_SIZE * sizeof(struct nes_hw_nic_rq_wqe));
+	pmem += (NES_NIC_WQ_SIZE * sizeof(struct nes_hw_nic_rq_wqe));
+
+	if (nesdev->nesadapter->netdev_count > 2)
+		nesvnic->mcrq_qp_id = nesvnic->nic_index + 32;
+	else
+		nesvnic->mcrq_qp_id = nesvnic->nic.qp_id + 4;
+
+	nesvnic->nic_cq.cq_vbase = vmem;
+	nesvnic->nic_cq.cq_pbase = pmem;
+	nesvnic->nic_cq.cq_head = 0;
+	nesvnic->nic_cq.cq_size = NES_NIC_WQ_SIZE * 2;
+
+	nesvnic->nic_cq.ce_handler = nes_nic_napi_ce_handler;
+
+	/* Send CreateCQ request to CQP */
+	spin_lock_irqsave(&nesdev->cqp.lock, flags);
+	cqp_head = nesdev->cqp.sq_head;
+
+	cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
+	nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+
+	cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(
+			NES_CQP_CREATE_CQ | NES_CQP_CQ_CEQ_VALID |
+			((u32)nesvnic->nic_cq.cq_size << 16));
+	cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32(
+			nesvnic->nic_cq.cq_number | ((u32)nesdev->nic_ceq_index << 16));
+	u64temp = (u64)nesvnic->nic_cq.cq_pbase;
+	set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_PBL_LOW_IDX, u64temp);
+	cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] =  0;
+	u64temp = (unsigned long)&nesvnic->nic_cq;
+	cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_LOW_IDX] =  cpu_to_le32((u32)(u64temp >> 1));
+	cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] =
+			cpu_to_le32(((u32)((u64temp) >> 33)) & 0x7FFFFFFF);
+	cqp_wqe->wqe_words[NES_CQP_CQ_WQE_DOORBELL_INDEX_HIGH_IDX] = 0;
+	if (++cqp_head >= nesdev->cqp.sq_size)
+		cqp_head = 0;
+	cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
+	nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+
+	/* Send CreateQP request to CQP */
+	nic_context = (void *)(&nesvnic->nic_cq.cq_vbase[nesvnic->nic_cq.cq_size]);
+	nic_context->context_words[NES_NIC_CTX_MISC_IDX] =
+			cpu_to_le32((u32)NES_NIC_CTX_SIZE |
+			((u32)PCI_FUNC(nesdev->pcidev->devfn) << 12));
+	nes_debug(NES_DBG_INIT, "RX_WINDOW_BUFFER_PAGE_TABLE_SIZE = 0x%08X, RX_WINDOW_BUFFER_SIZE = 0x%08X\n",
+			nes_read_indexed(nesdev, NES_IDX_RX_WINDOW_BUFFER_PAGE_TABLE_SIZE),
+			nes_read_indexed(nesdev, NES_IDX_RX_WINDOW_BUFFER_SIZE));
+	if (nes_read_indexed(nesdev, NES_IDX_RX_WINDOW_BUFFER_SIZE) != 0) {
+		nic_context->context_words[NES_NIC_CTX_MISC_IDX] |= cpu_to_le32(NES_NIC_BACK_STORE);
+	}
+
+	u64temp = (u64)nesvnic->nic.sq_pbase;
+	nic_context->context_words[NES_NIC_CTX_SQ_LOW_IDX]  = cpu_to_le32((u32)u64temp);
+	nic_context->context_words[NES_NIC_CTX_SQ_HIGH_IDX] = cpu_to_le32((u32)(u64temp >> 32));
+	u64temp = (u64)nesvnic->nic.rq_pbase;
+	nic_context->context_words[NES_NIC_CTX_RQ_LOW_IDX]  = cpu_to_le32((u32)u64temp);
+	nic_context->context_words[NES_NIC_CTX_RQ_HIGH_IDX] = cpu_to_le32((u32)(u64temp >> 32));
+
+	cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(NES_CQP_CREATE_QP |
+			NES_CQP_QP_TYPE_NIC);
+	cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32(nesvnic->nic.qp_id);
+	u64temp = (u64)nesvnic->nic_cq.cq_pbase +
+			(nesvnic->nic_cq.cq_size * sizeof(struct nes_hw_nic_cqe));
+	set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, u64temp);
+
+	if (++cqp_head >= nesdev->cqp.sq_size)
+		cqp_head = 0;
+	nesdev->cqp.sq_head = cqp_head;
+
+	barrier();
+
+	/* Ring doorbell (2 WQEs) */
+	nes_write32(nesdev->regs+NES_WQE_ALLOC, 0x02800000 | nesdev->cqp.qp_id);
+
+	spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
+	nes_debug(NES_DBG_INIT, "Waiting for create NIC QP%u to complete.\n",
+			nesvnic->nic.qp_id);
+
+	ret = wait_event_timeout(nesdev->cqp.waitq, (nesdev->cqp.sq_tail == cqp_head),
+			NES_EVENT_TIMEOUT);
+	nes_debug(NES_DBG_INIT, "Create NIC QP%u completed, wait_event_timeout ret = %u.\n",
+			nesvnic->nic.qp_id, ret);
+	if (!ret) {
+		nes_debug(NES_DBG_INIT, "NIC QP%u create timeout expired\n", nesvnic->nic.qp_id);
+		pci_free_consistent(nesdev->pcidev, nesvnic->nic_mem_size, nesvnic->nic_vbase,
+				nesvnic->nic_pbase);
+		return -EIO;
+	}
+
+	/* Populate the RQ */
+	for (counter = 0; counter < (NES_NIC_WQ_SIZE - 1); counter++) {
+		skb = dev_alloc_skb(nesvnic->max_frame_size);
+		if (!skb) {
+			nes_debug(NES_DBG_INIT, "%s: out of memory for receive skb\n", netdev->name);
+
+			nes_destroy_nic_qp(nesvnic);
+			return -ENOMEM;
+		}
+
+		skb->dev = netdev;
+
+		pmem = pci_map_single(nesdev->pcidev, skb->data,
+				nesvnic->max_frame_size, PCI_DMA_FROMDEVICE);
+		cb = (struct nes_rskb_cb *)&skb->cb[0];
+		cb->busaddr = pmem;
+		cb->maplen = nesvnic->max_frame_size;
+
+		nic_rqe = &nesvnic->nic.rq_vbase[counter];
+		nic_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_1_0_IDX] = cpu_to_le32(nesvnic->max_frame_size);
+		nic_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_3_2_IDX] = 0;
+		nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_LOW_IDX]  = cpu_to_le32((u32)pmem);
+		nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_HIGH_IDX] = cpu_to_le32((u32)((u64)pmem >> 32));
+		nesvnic->nic.rx_skb[counter] = skb;
+	}
+
+	wqe_count = NES_NIC_WQ_SIZE - 1;
+	nesvnic->nic.rq_head = wqe_count;
+	barrier();
+	do {
+		counter = min(wqe_count, ((u32)255));
+		wqe_count -= counter;
+		nes_write32(nesdev->regs+NES_WQE_ALLOC, (counter << 24) | nesvnic->nic.qp_id);
+	} while (wqe_count);
+	init_timer(&nesvnic->rq_wqes_timer);
+	nesvnic->rq_wqes_timer.function = nes_rq_wqes_timeout;
+	nesvnic->rq_wqes_timer.data = (unsigned long)nesvnic;
+	nes_debug(NES_DBG_INIT, "NAPI support Enabled\n");
+	if (nesdev->nesadapter->et_use_adaptive_rx_coalesce)
+	{
+		nes_nic_init_timer(nesdev);
+		if (netdev->mtu > 1500)
+			jumbomode = 1;
+		nes_nic_init_timer_defaults(nesdev, jumbomode);
+	}
+	if ((nesdev->nesadapter->allow_unaligned_fpdus) &&
+		(nes_init_mgt_qp(nesdev, netdev, nesvnic))) {
+			nes_debug(NES_DBG_INIT, "%s: Out of memory for pau nic\n", netdev->name);
+			nes_destroy_nic_qp(nesvnic);
+		return -ENOMEM;
+	}
+
+	nesvnic->lro_mgr.max_aggr       = nes_lro_max_aggr;
+	nesvnic->lro_mgr.max_desc       = NES_MAX_LRO_DESCRIPTORS;
+	nesvnic->lro_mgr.lro_arr        = nesvnic->lro_desc;
+	nesvnic->lro_mgr.get_skb_header = nes_lro_get_skb_hdr;
+	nesvnic->lro_mgr.features       = LRO_F_NAPI | LRO_F_EXTRACT_VLAN_ID;
+	nesvnic->lro_mgr.dev            = netdev;
+	nesvnic->lro_mgr.ip_summed      = CHECKSUM_UNNECESSARY;
+	nesvnic->lro_mgr.ip_summed_aggr = CHECKSUM_UNNECESSARY;
+	return 0;
+}
+
+
+/**
+ * nes_destroy_nic_qp
+ */
+void nes_destroy_nic_qp(struct nes_vnic *nesvnic)
+{
+	u64 u64temp;
+	dma_addr_t bus_address;
+	struct nes_device *nesdev = nesvnic->nesdev;
+	struct nes_hw_cqp_wqe *cqp_wqe;
+	struct nes_hw_nic_sq_wqe *nic_sqe;
+	__le16 *wqe_fragment_length;
+	u16  wqe_fragment_index;
+	u32 cqp_head;
+	u32 wqm_cfg0;
+	unsigned long flags;
+	struct sk_buff *rx_skb;
+	struct nes_rskb_cb *cb;
+	int ret;
+
+	if (nesdev->nesadapter->allow_unaligned_fpdus)
+		nes_destroy_mgt(nesvnic);
+
+	/* clear wqe stall before destroying NIC QP */
+	wqm_cfg0 = nes_read_indexed(nesdev, NES_IDX_WQM_CONFIG0);
+	nes_write_indexed(nesdev, NES_IDX_WQM_CONFIG0, wqm_cfg0 & 0xFFFF7FFF);
+
+	/* Free remaining NIC receive buffers */
+	while (nesvnic->nic.rq_head != nesvnic->nic.rq_tail) {
+		rx_skb = nesvnic->nic.rx_skb[nesvnic->nic.rq_tail];
+		cb = (struct nes_rskb_cb *)&rx_skb->cb[0];
+		pci_unmap_single(nesdev->pcidev, cb->busaddr, cb->maplen,
+			PCI_DMA_FROMDEVICE);
+
+		dev_kfree_skb(nesvnic->nic.rx_skb[nesvnic->nic.rq_tail++]);
+		nesvnic->nic.rq_tail &= (nesvnic->nic.rq_size - 1);
+	}
+
+	/* Free remaining NIC transmit buffers */
+	while (nesvnic->nic.sq_head != nesvnic->nic.sq_tail) {
+		nic_sqe = &nesvnic->nic.sq_vbase[nesvnic->nic.sq_tail];
+		wqe_fragment_index = 1;
+		wqe_fragment_length = (__le16 *)
+			&nic_sqe->wqe_words[NES_NIC_SQ_WQE_LENGTH_0_TAG_IDX];
+		/* bump past the vlan tag */
+		wqe_fragment_length++;
+		if (le16_to_cpu(wqe_fragment_length[wqe_fragment_index]) != 0) {
+			u64temp = (u64)le32_to_cpu(
+				nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_LOW_IDX+
+				wqe_fragment_index*2]);
+			u64temp += ((u64)le32_to_cpu(
+				nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_HIGH_IDX
+				+ wqe_fragment_index*2]))<<32;
+			bus_address = (dma_addr_t)u64temp;
+			if (test_and_clear_bit(nesvnic->nic.sq_tail,
+					nesvnic->nic.first_frag_overflow)) {
+				pci_unmap_single(nesdev->pcidev,
+						bus_address,
+						le16_to_cpu(wqe_fragment_length[
+							wqe_fragment_index++]),
+						PCI_DMA_TODEVICE);
+			}
+			for (; wqe_fragment_index < 5; wqe_fragment_index++) {
+				if (wqe_fragment_length[wqe_fragment_index]) {
+					u64temp = le32_to_cpu(
+						nic_sqe->wqe_words[
+						NES_NIC_SQ_WQE_FRAG0_LOW_IDX+
+						wqe_fragment_index*2]);
+					u64temp += ((u64)le32_to_cpu(
+						nic_sqe->wqe_words[
+						NES_NIC_SQ_WQE_FRAG0_HIGH_IDX+
+						wqe_fragment_index*2]))<<32;
+					bus_address = (dma_addr_t)u64temp;
+					pci_unmap_page(nesdev->pcidev,
+							bus_address,
+							le16_to_cpu(
+							wqe_fragment_length[
+							wqe_fragment_index]),
+							PCI_DMA_TODEVICE);
+				} else
+					break;
+			}
+		}
+		if (nesvnic->nic.tx_skb[nesvnic->nic.sq_tail])
+			dev_kfree_skb(
+				nesvnic->nic.tx_skb[nesvnic->nic.sq_tail]);
+
+		nesvnic->nic.sq_tail = (nesvnic->nic.sq_tail + 1)
+					& (nesvnic->nic.sq_size - 1);
+	}
+
+	spin_lock_irqsave(&nesdev->cqp.lock, flags);
+
+	/* Destroy NIC QP */
+	cqp_head = nesdev->cqp.sq_head;
+	cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
+	nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+
+	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX,
+		(NES_CQP_DESTROY_QP | NES_CQP_QP_TYPE_NIC));
+	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX,
+		nesvnic->nic.qp_id);
+
+	if (++cqp_head >= nesdev->cqp.sq_size)
+		cqp_head = 0;
+
+	cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
+
+	/* Destroy NIC CQ */
+	nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX,
+		(NES_CQP_DESTROY_CQ | ((u32)nesvnic->nic_cq.cq_size << 16)));
+	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX,
+		(nesvnic->nic_cq.cq_number | ((u32)nesdev->nic_ceq_index << 16)));
+
+	if (++cqp_head >= nesdev->cqp.sq_size)
+		cqp_head = 0;
+
+	nesdev->cqp.sq_head = cqp_head;
+	barrier();
+
+	/* Ring doorbell (2 WQEs) */
+	nes_write32(nesdev->regs+NES_WQE_ALLOC, 0x02800000 | nesdev->cqp.qp_id);
+
+	spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
+	nes_debug(NES_DBG_SHUTDOWN, "Waiting for CQP, cqp_head=%u, cqp.sq_head=%u,"
+			" cqp.sq_tail=%u, cqp.sq_size=%u\n",
+			cqp_head, nesdev->cqp.sq_head,
+			nesdev->cqp.sq_tail, nesdev->cqp.sq_size);
+
+	ret = wait_event_timeout(nesdev->cqp.waitq, (nesdev->cqp.sq_tail == cqp_head),
+			NES_EVENT_TIMEOUT);
+
+	nes_debug(NES_DBG_SHUTDOWN, "Destroy NIC QP returned, wait_event_timeout ret = %u, cqp_head=%u,"
+			" cqp.sq_head=%u, cqp.sq_tail=%u\n",
+			ret, cqp_head, nesdev->cqp.sq_head, nesdev->cqp.sq_tail);
+	if (!ret) {
+		nes_debug(NES_DBG_SHUTDOWN, "NIC QP%u destroy timeout expired\n",
+				nesvnic->nic.qp_id);
+	}
+
+	pci_free_consistent(nesdev->pcidev, nesvnic->nic_mem_size, nesvnic->nic_vbase,
+			nesvnic->nic_pbase);
+
+	/* restore old wqm_cfg0 value */
+	nes_write_indexed(nesdev, NES_IDX_WQM_CONFIG0, wqm_cfg0);
+}
+
+/**
+ * nes_napi_isr
+ */
+int nes_napi_isr(struct nes_device *nesdev)
+{
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	u32 int_stat;
+
+	if (nesdev->napi_isr_ran) {
+		/* interrupt status has already been read in ISR */
+		int_stat = nesdev->int_stat;
+	} else {
+		int_stat = nes_read32(nesdev->regs + NES_INT_STAT);
+		nesdev->int_stat = int_stat;
+		nesdev->napi_isr_ran = 1;
+	}
+
+	int_stat &= nesdev->int_req;
+	/* iff NIC, process here, else wait for DPC */
+	if ((int_stat) && ((int_stat & 0x0000ff00) == int_stat)) {
+		nesdev->napi_isr_ran = 0;
+		nes_write32(nesdev->regs + NES_INT_STAT,
+			(int_stat &
+			~(NES_INT_INTF | NES_INT_TIMER | NES_INT_MAC0 | NES_INT_MAC1 | NES_INT_MAC2 | NES_INT_MAC3)));
+
+		/* Process the CEQs */
+		nes_process_ceq(nesdev, &nesdev->nesadapter->ceq[nesdev->nic_ceq_index]);
+
+		if (unlikely((((nesadapter->et_rx_coalesce_usecs_irq) &&
+					(!nesadapter->et_use_adaptive_rx_coalesce)) ||
+					((nesadapter->et_use_adaptive_rx_coalesce) &&
+					 (nesdev->deepcq_count > nesadapter->et_pkt_rate_low))))) {
+			if ((nesdev->int_req & NES_INT_TIMER) == 0) {
+				/* Enable Periodic timer interrupts */
+				nesdev->int_req |= NES_INT_TIMER;
+				/* ack any pending periodic timer interrupts so we don't get an immediate interrupt */
+				/* TODO: need to also ack other unused periodic timer values, get from nesadapter */
+				nes_write32(nesdev->regs+NES_TIMER_STAT,
+						nesdev->timer_int_req  | ~(nesdev->nesadapter->timer_int_req));
+				nes_write32(nesdev->regs+NES_INTF_INT_MASK,
+						~(nesdev->intf_int_req | NES_INTF_PERIODIC_TIMER));
+			}
+
+			if (unlikely(nesadapter->et_use_adaptive_rx_coalesce))
+			{
+				nes_nic_init_timer(nesdev);
+			}
+			/* Enable interrupts, except CEQs */
+			nes_write32(nesdev->regs+NES_INT_MASK, 0x0000ffff | (~nesdev->int_req));
+		} else {
+			/* Enable interrupts, make sure timer is off */
+			nesdev->int_req &= ~NES_INT_TIMER;
+			nes_write32(nesdev->regs+NES_INTF_INT_MASK, ~(nesdev->intf_int_req));
+			nes_write32(nesdev->regs+NES_INT_MASK, ~nesdev->int_req);
+		}
+		nesdev->deepcq_count = 0;
+		return 1;
+	} else {
+		return 0;
+	}
+}
+
+static void process_critical_error(struct nes_device *nesdev)
+{
+	u32 debug_error;
+	u32 nes_idx_debug_error_masks0 = 0;
+	u16 error_module = 0;
+
+	debug_error = nes_read_indexed(nesdev, NES_IDX_DEBUG_ERROR_CONTROL_STATUS);
+	printk(KERN_ERR PFX "Critical Error reported by device!!! 0x%02X\n",
+			(u16)debug_error);
+	nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_CONTROL_STATUS,
+			0x01010000 | (debug_error & 0x0000ffff));
+	if (crit_err_count++ > 10)
+		nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_MASKS1, 1 << 0x17);
+	error_module = (u16) (debug_error & 0x1F00) >> 8;
+	if (++nesdev->nesadapter->crit_error_count[error_module-1] >=
+			nes_max_critical_error_count) {
+		printk(KERN_ERR PFX "Masking off critical error for module "
+			"0x%02X\n", (u16)error_module);
+		nes_idx_debug_error_masks0 = nes_read_indexed(nesdev,
+			NES_IDX_DEBUG_ERROR_MASKS0);
+		nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_MASKS0,
+			nes_idx_debug_error_masks0 | (1 << error_module));
+	}
+}
+/**
+ * nes_dpc
+ */
+void nes_dpc(unsigned long param)
+{
+	struct nes_device *nesdev = (struct nes_device *)param;
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	u32 counter;
+	u32 loop_counter = 0;
+	u32 int_status_bit;
+	u32 int_stat;
+	u32 timer_stat;
+	u32 temp_int_stat;
+	u32 intf_int_stat;
+	u32 processed_intf_int = 0;
+	u16 processed_timer_int = 0;
+	u16 completion_ints = 0;
+	u16 timer_ints = 0;
+
+	/* nes_debug(NES_DBG_ISR, "\n"); */
+
+	do {
+		timer_stat = 0;
+		if (nesdev->napi_isr_ran) {
+			nesdev->napi_isr_ran = 0;
+			int_stat = nesdev->int_stat;
+		} else
+			int_stat = nes_read32(nesdev->regs+NES_INT_STAT);
+		if (processed_intf_int != 0)
+			int_stat &= nesdev->int_req & ~NES_INT_INTF;
+		else
+			int_stat &= nesdev->int_req;
+		if (processed_timer_int == 0) {
+			processed_timer_int = 1;
+			if (int_stat & NES_INT_TIMER) {
+				timer_stat = nes_read32(nesdev->regs + NES_TIMER_STAT);
+				if ((timer_stat & nesdev->timer_int_req) == 0) {
+					int_stat &= ~NES_INT_TIMER;
+				}
+			}
+		} else {
+			int_stat &= ~NES_INT_TIMER;
+		}
+
+		if (int_stat) {
+			if (int_stat & ~(NES_INT_INTF | NES_INT_TIMER | NES_INT_MAC0|
+					NES_INT_MAC1|NES_INT_MAC2 | NES_INT_MAC3)) {
+				/* Ack the interrupts */
+				nes_write32(nesdev->regs+NES_INT_STAT,
+					(int_stat & ~(NES_INT_INTF | NES_INT_TIMER | NES_INT_MAC0|
+					NES_INT_MAC1 | NES_INT_MAC2 | NES_INT_MAC3)));
+			}
+
+			temp_int_stat = int_stat;
+			for (counter = 0, int_status_bit = 1; counter < 16; counter++) {
+				if (int_stat & int_status_bit) {
+					nes_process_ceq(nesdev, &nesadapter->ceq[counter]);
+					temp_int_stat &= ~int_status_bit;
+					completion_ints = 1;
+				}
+				if (!(temp_int_stat & 0x0000ffff))
+					break;
+				int_status_bit <<= 1;
+			}
+
+			/* Process the AEQ for this pci function */
+			int_status_bit = 1 << (16 + PCI_FUNC(nesdev->pcidev->devfn));
+			if (int_stat & int_status_bit) {
+				nes_process_aeq(nesdev, &nesadapter->aeq[PCI_FUNC(nesdev->pcidev->devfn)]);
+			}
+
+			/* Process the MAC interrupt for this pci function */
+			int_status_bit = 1 << (24 + nesdev->mac_index);
+			if (int_stat & int_status_bit) {
+				nes_process_mac_intr(nesdev, nesdev->mac_index);
+			}
+
+			if (int_stat & NES_INT_TIMER) {
+				if (timer_stat & nesdev->timer_int_req) {
+					nes_write32(nesdev->regs + NES_TIMER_STAT,
+							(timer_stat & nesdev->timer_int_req) |
+							~(nesdev->nesadapter->timer_int_req));
+					timer_ints = 1;
+				}
+			}
+
+			if (int_stat & NES_INT_INTF) {
+				processed_intf_int = 1;
+				intf_int_stat = nes_read32(nesdev->regs+NES_INTF_INT_STAT);
+				intf_int_stat &= nesdev->intf_int_req;
+				if (NES_INTF_INT_CRITERR & intf_int_stat) {
+					process_critical_error(nesdev);
+				}
+				if (NES_INTF_INT_PCIERR & intf_int_stat) {
+					printk(KERN_ERR PFX "PCI Error reported by device!!!\n");
+					BUG();
+				}
+				if (NES_INTF_INT_AEQ_OFLOW & intf_int_stat) {
+					printk(KERN_ERR PFX "AEQ Overflow reported by device!!!\n");
+					BUG();
+				}
+				nes_write32(nesdev->regs+NES_INTF_INT_STAT, intf_int_stat);
+			}
+
+			if (int_stat & NES_INT_TSW) {
+			}
+		}
+		/* Don't use the interface interrupt bit stay in loop */
+		int_stat &= ~NES_INT_INTF | NES_INT_TIMER | NES_INT_MAC0 |
+				NES_INT_MAC1 | NES_INT_MAC2 | NES_INT_MAC3;
+	} while ((int_stat != 0) && (loop_counter++ < MAX_DPC_ITERATIONS));
+
+	if (timer_ints == 1) {
+		if ((nesadapter->et_rx_coalesce_usecs_irq) || (nesadapter->et_use_adaptive_rx_coalesce)) {
+			if (completion_ints == 0) {
+				nesdev->timer_only_int_count++;
+				if (nesdev->timer_only_int_count>=nesadapter->timer_int_limit) {
+					nesdev->timer_only_int_count = 0;
+					nesdev->int_req &= ~NES_INT_TIMER;
+					nes_write32(nesdev->regs + NES_INTF_INT_MASK, ~(nesdev->intf_int_req));
+					nes_write32(nesdev->regs + NES_INT_MASK, ~nesdev->int_req);
+				} else {
+					nes_write32(nesdev->regs+NES_INT_MASK, 0x0000ffff | (~nesdev->int_req));
+				}
+			} else {
+				if (unlikely(nesadapter->et_use_adaptive_rx_coalesce))
+				{
+					nes_nic_init_timer(nesdev);
+				}
+				nesdev->timer_only_int_count = 0;
+				nes_write32(nesdev->regs+NES_INT_MASK, 0x0000ffff | (~nesdev->int_req));
+			}
+		} else {
+			nesdev->timer_only_int_count = 0;
+			nesdev->int_req &= ~NES_INT_TIMER;
+			nes_write32(nesdev->regs+NES_INTF_INT_MASK, ~(nesdev->intf_int_req));
+			nes_write32(nesdev->regs+NES_TIMER_STAT,
+					nesdev->timer_int_req | ~(nesdev->nesadapter->timer_int_req));
+			nes_write32(nesdev->regs+NES_INT_MASK, ~nesdev->int_req);
+		}
+	} else {
+		if ( (completion_ints == 1) &&
+			 (((nesadapter->et_rx_coalesce_usecs_irq) &&
+			   (!nesadapter->et_use_adaptive_rx_coalesce)) ||
+			  ((nesdev->deepcq_count > nesadapter->et_pkt_rate_low) &&
+			   (nesadapter->et_use_adaptive_rx_coalesce) )) ) {
+			/* nes_debug(NES_DBG_ISR, "Enabling periodic timer interrupt.\n" ); */
+			nesdev->timer_only_int_count = 0;
+			nesdev->int_req |= NES_INT_TIMER;
+			nes_write32(nesdev->regs+NES_TIMER_STAT,
+					nesdev->timer_int_req | ~(nesdev->nesadapter->timer_int_req));
+			nes_write32(nesdev->regs+NES_INTF_INT_MASK,
+					~(nesdev->intf_int_req | NES_INTF_PERIODIC_TIMER));
+			nes_write32(nesdev->regs+NES_INT_MASK, 0x0000ffff | (~nesdev->int_req));
+		} else {
+			nes_write32(nesdev->regs+NES_INT_MASK, ~nesdev->int_req);
+		}
+	}
+	nesdev->deepcq_count = 0;
+}
+
+
+/**
+ * nes_process_ceq
+ */
+static void nes_process_ceq(struct nes_device *nesdev, struct nes_hw_ceq *ceq)
+{
+	u64 u64temp;
+	struct nes_hw_cq *cq;
+	u32 head;
+	u32 ceq_size;
+
+	/* nes_debug(NES_DBG_CQ, "\n"); */
+	head = ceq->ceq_head;
+	ceq_size = ceq->ceq_size;
+
+	do {
+		if (le32_to_cpu(ceq->ceq_vbase[head].ceqe_words[NES_CEQE_CQ_CTX_HIGH_IDX]) &
+				NES_CEQE_VALID) {
+			u64temp = (((u64)(le32_to_cpu(ceq->ceq_vbase[head].ceqe_words[NES_CEQE_CQ_CTX_HIGH_IDX]))) << 32) |
+						((u64)(le32_to_cpu(ceq->ceq_vbase[head].ceqe_words[NES_CEQE_CQ_CTX_LOW_IDX])));
+			u64temp <<= 1;
+			cq = *((struct nes_hw_cq **)&u64temp);
+			/* nes_debug(NES_DBG_CQ, "pCQ = %p\n", cq); */
+			barrier();
+			ceq->ceq_vbase[head].ceqe_words[NES_CEQE_CQ_CTX_HIGH_IDX] = 0;
+
+			/* call the event handler */
+			cq->ce_handler(nesdev, cq);
+
+			if (++head >= ceq_size)
+				head = 0;
+		} else {
+			break;
+		}
+
+	} while (1);
+
+	ceq->ceq_head = head;
+}
+
+
+/**
+ * nes_process_aeq
+ */
+static void nes_process_aeq(struct nes_device *nesdev, struct nes_hw_aeq *aeq)
+{
+	/* u64 u64temp; */
+	u32 head;
+	u32 aeq_size;
+	u32 aeqe_misc;
+	u32 aeqe_cq_id;
+	struct nes_hw_aeqe volatile *aeqe;
+
+	head = aeq->aeq_head;
+	aeq_size = aeq->aeq_size;
+
+	do {
+		aeqe = &aeq->aeq_vbase[head];
+		if ((le32_to_cpu(aeqe->aeqe_words[NES_AEQE_MISC_IDX]) & NES_AEQE_VALID) == 0)
+			break;
+		aeqe_misc  = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_MISC_IDX]);
+		aeqe_cq_id = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]);
+		if (aeqe_misc & (NES_AEQE_QP|NES_AEQE_CQ)) {
+			if (aeqe_cq_id >= NES_FIRST_QPN) {
+				/* dealing with an accelerated QP related AE */
+				/*
+				 * u64temp = (((u64)(le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_CTXT_HIGH_IDX]))) << 32) |
+				 *	     ((u64)(le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_CTXT_LOW_IDX])));
+				 */
+				nes_process_iwarp_aeqe(nesdev, (struct nes_hw_aeqe *)aeqe);
+			} else {
+				/* TODO: dealing with a CQP related AE */
+				nes_debug(NES_DBG_AEQ, "Processing CQP related AE, misc = 0x%04X\n",
+						(u16)(aeqe_misc >> 16));
+			}
+		}
+
+		aeqe->aeqe_words[NES_AEQE_MISC_IDX] = 0;
+
+		if (++head >= aeq_size)
+			head = 0;
+
+		nes_write32(nesdev->regs + NES_AEQ_ALLOC, 1 << 16);
+	}
+	while (1);
+	aeq->aeq_head = head;
+}
+
+static void nes_reset_link(struct nes_device *nesdev, u32 mac_index)
+{
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	u32 reset_value;
+	u32 i=0;
+	u32 u32temp;
+
+	if (nesadapter->hw_rev == NE020_REV) {
+		return;
+	}
+	mh_detected++;
+
+	reset_value = nes_read32(nesdev->regs+NES_SOFTWARE_RESET);
+
+	if ((mac_index == 0) || ((mac_index == 1) && (nesadapter->OneG_Mode)))
+		reset_value |= 0x0000001d;
+	else
+		reset_value |= 0x0000002d;
+
+	if (4 <= (nesadapter->link_interrupt_count[mac_index] / ((u16)NES_MAX_LINK_INTERRUPTS))) {
+		if ((!nesadapter->OneG_Mode) && (nesadapter->port_count == 2)) {
+			nesadapter->link_interrupt_count[0] = 0;
+			nesadapter->link_interrupt_count[1] = 0;
+			u32temp = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1);
+			if (0x00000040 & u32temp)
+				nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, 0x0000F088);
+			else
+				nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, 0x0000F0C8);
+
+			reset_value |= 0x0000003d;
+		}
+		nesadapter->link_interrupt_count[mac_index] = 0;
+	}
+
+	nes_write32(nesdev->regs+NES_SOFTWARE_RESET, reset_value);
+
+	while (((nes_read32(nesdev->regs+NES_SOFTWARE_RESET)
+			& 0x00000040) != 0x00000040) && (i++ < 5000));
+
+	if (0x0000003d == (reset_value & 0x0000003d)) {
+		u32 pcs_control_status0, pcs_control_status1;
+
+		for (i = 0; i < 10; i++) {
+			pcs_control_status0 = nes_read_indexed(nesdev, NES_IDX_PHY_PCS_CONTROL_STATUS0);
+			pcs_control_status1 = nes_read_indexed(nesdev, NES_IDX_PHY_PCS_CONTROL_STATUS0 + 0x200);
+			if (((0x0F000000 == (pcs_control_status0 & 0x0F000000))
+			     && (pcs_control_status0 & 0x00100000))
+			    || ((0x0F000000 == (pcs_control_status1 & 0x0F000000))
+				&& (pcs_control_status1 & 0x00100000)))
+				continue;
+			else
+				break;
+		}
+		if (10 == i) {
+			u32temp = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1);
+			if (0x00000040 & u32temp)
+				nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, 0x0000F088);
+			else
+				nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, 0x0000F0C8);
+
+			nes_write32(nesdev->regs+NES_SOFTWARE_RESET, reset_value);
+
+			while (((nes_read32(nesdev->regs + NES_SOFTWARE_RESET)
+				 & 0x00000040) != 0x00000040) && (i++ < 5000));
+		}
+	}
+}
+
+/**
+ * nes_process_mac_intr
+ */
+static void nes_process_mac_intr(struct nes_device *nesdev, u32 mac_number)
+{
+	unsigned long flags;
+	u32 pcs_control_status;
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	struct nes_vnic *nesvnic;
+	u32 mac_status;
+	u32 mac_index = nesdev->mac_index;
+	u32 u32temp;
+	u16 phy_data;
+	u16 temp_phy_data;
+	u32 pcs_val  = 0x0f0f0000;
+	u32 pcs_mask = 0x0f1f0000;
+	u32 cdr_ctrl;
+
+	spin_lock_irqsave(&nesadapter->phy_lock, flags);
+	if (nesadapter->mac_sw_state[mac_number] != NES_MAC_SW_IDLE) {
+		spin_unlock_irqrestore(&nesadapter->phy_lock, flags);
+		return;
+	}
+	nesadapter->mac_sw_state[mac_number] = NES_MAC_SW_INTERRUPT;
+
+	/* ack the MAC interrupt */
+	mac_status = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS + (mac_index * 0x200));
+	/* Clear the interrupt */
+	nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS + (mac_index * 0x200), mac_status);
+
+	nes_debug(NES_DBG_PHY, "MAC%u interrupt status = 0x%X.\n", mac_number, mac_status);
+
+	if (mac_status & (NES_MAC_INT_LINK_STAT_CHG | NES_MAC_INT_XGMII_EXT)) {
+		nesdev->link_status_interrupts++;
+		if (0 == (++nesadapter->link_interrupt_count[mac_index] % ((u16)NES_MAX_LINK_INTERRUPTS)))
+			nes_reset_link(nesdev, mac_index);
+
+		/* read the PHY interrupt status register */
+		if ((nesadapter->OneG_Mode) &&
+		(nesadapter->phy_type[mac_index] != NES_PHY_TYPE_PUMA_1G)) {
+			do {
+				nes_read_1G_phy_reg(nesdev, 0x1a,
+						nesadapter->phy_index[mac_index], &phy_data);
+				nes_debug(NES_DBG_PHY, "Phy%d data from register 0x1a = 0x%X.\n",
+						nesadapter->phy_index[mac_index], phy_data);
+			} while (phy_data&0x8000);
+
+			temp_phy_data = 0;
+			do {
+				nes_read_1G_phy_reg(nesdev, 0x11,
+						nesadapter->phy_index[mac_index], &phy_data);
+				nes_debug(NES_DBG_PHY, "Phy%d data from register 0x11 = 0x%X.\n",
+						nesadapter->phy_index[mac_index], phy_data);
+				if (temp_phy_data == phy_data)
+					break;
+				temp_phy_data = phy_data;
+			} while (1);
+
+			nes_read_1G_phy_reg(nesdev, 0x1e,
+					nesadapter->phy_index[mac_index], &phy_data);
+			nes_debug(NES_DBG_PHY, "Phy%d data from register 0x1e = 0x%X.\n",
+					nesadapter->phy_index[mac_index], phy_data);
+
+			nes_read_1G_phy_reg(nesdev, 1,
+					nesadapter->phy_index[mac_index], &phy_data);
+			nes_debug(NES_DBG_PHY, "1G phy%u data from register 1 = 0x%X\n",
+					nesadapter->phy_index[mac_index], phy_data);
+
+			if (temp_phy_data & 0x1000) {
+				nes_debug(NES_DBG_PHY, "The Link is up according to the PHY\n");
+				phy_data = 4;
+			} else {
+				nes_debug(NES_DBG_PHY, "The Link is down according to the PHY\n");
+			}
+		}
+		nes_debug(NES_DBG_PHY, "Eth SERDES Common Status: 0=0x%08X, 1=0x%08X\n",
+				nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_STATUS0),
+				nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_STATUS0+0x200));
+
+		if (nesadapter->phy_type[mac_index] == NES_PHY_TYPE_PUMA_1G) {
+			switch (mac_index) {
+			case 1:
+			case 3:
+				pcs_control_status = nes_read_indexed(nesdev,
+						NES_IDX_PHY_PCS_CONTROL_STATUS0 + 0x200);
+				break;
+			default:
+				pcs_control_status = nes_read_indexed(nesdev,
+						NES_IDX_PHY_PCS_CONTROL_STATUS0);
+				break;
+			}
+		} else {
+			pcs_control_status = nes_read_indexed(nesdev,
+					NES_IDX_PHY_PCS_CONTROL_STATUS0 + ((mac_index & 1) * 0x200));
+			pcs_control_status = nes_read_indexed(nesdev,
+					NES_IDX_PHY_PCS_CONTROL_STATUS0 + ((mac_index & 1) * 0x200));
+		}
+
+		nes_debug(NES_DBG_PHY, "PCS PHY Control/Status%u: 0x%08X\n",
+				mac_index, pcs_control_status);
+		if ((nesadapter->OneG_Mode) &&
+				(nesadapter->phy_type[mac_index] != NES_PHY_TYPE_PUMA_1G)) {
+			u32temp = 0x01010000;
+			if (nesadapter->port_count > 2) {
+				u32temp |= 0x02020000;
+			}
+			if ((pcs_control_status & u32temp)!= u32temp) {
+				phy_data = 0;
+				nes_debug(NES_DBG_PHY, "PCS says the link is down\n");
+			}
+		} else {
+			switch (nesadapter->phy_type[mac_index]) {
+			case NES_PHY_TYPE_ARGUS:
+			case NES_PHY_TYPE_SFP_D:
+			case NES_PHY_TYPE_KR:
+				/* clear the alarms */
+				nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 4, 0x0008);
+				nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 4, 0xc001);
+				nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 4, 0xc002);
+				nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 4, 0xc005);
+				nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 4, 0xc006);
+				nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 1, 0x9003);
+				nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 1, 0x9004);
+				nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 1, 0x9005);
+				/* check link status */
+				nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 1, 0x9003);
+				temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
+
+				nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 3, 0x0021);
+				nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
+				nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 3, 0x0021);
+				phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
+
+				phy_data = (!temp_phy_data && (phy_data == 0x8000)) ? 0x4 : 0x0;
+
+				nes_debug(NES_DBG_PHY, "%s: Phy data = 0x%04X, link was %s.\n",
+					__func__, phy_data, nesadapter->mac_link_down[mac_index] ? "DOWN" : "UP");
+				break;
+
+			case NES_PHY_TYPE_PUMA_1G:
+				if (mac_index < 2)
+					pcs_val = pcs_mask = 0x01010000;
+				else
+					pcs_val = pcs_mask = 0x02020000;
+				/* fall through */
+			default:
+				phy_data = (pcs_val == (pcs_control_status & pcs_mask)) ? 0x4 : 0x0;
+				break;
+			}
+		}
+
+		if (phy_data & 0x0004) {
+			if (wide_ppm_offset &&
+			    (nesadapter->phy_type[mac_index] == NES_PHY_TYPE_CX4) &&
+			    (nesadapter->hw_rev != NE020_REV)) {
+				cdr_ctrl = nes_read_indexed(nesdev,
+							    NES_IDX_ETH_SERDES_CDR_CONTROL0 +
+							    mac_index * 0x200);
+				nes_write_indexed(nesdev,
+						  NES_IDX_ETH_SERDES_CDR_CONTROL0 +
+						  mac_index * 0x200,
+						  cdr_ctrl | 0x000F0000);
+			}
+			nesadapter->mac_link_down[mac_index] = 0;
+			list_for_each_entry(nesvnic, &nesadapter->nesvnic_list[mac_index], list) {
+				nes_debug(NES_DBG_PHY, "The Link is UP!!.  linkup was %d\n",
+						nesvnic->linkup);
+				if (nesvnic->linkup == 0) {
+					printk(PFX "The Link is now up for port %s, netdev %p.\n",
+							nesvnic->netdev->name, nesvnic->netdev);
+					if (netif_queue_stopped(nesvnic->netdev))
+						netif_start_queue(nesvnic->netdev);
+					nesvnic->linkup = 1;
+					netif_carrier_on(nesvnic->netdev);
+
+					spin_lock(&nesvnic->port_ibevent_lock);
+					if (nesvnic->of_device_registered) {
+						if (nesdev->iw_status == 0) {
+							nesdev->iw_status = 1;
+							nes_port_ibevent(nesvnic);
+						}
+					}
+					spin_unlock(&nesvnic->port_ibevent_lock);
+				}
+			}
+		} else {
+			if (wide_ppm_offset &&
+			    (nesadapter->phy_type[mac_index] == NES_PHY_TYPE_CX4) &&
+			    (nesadapter->hw_rev != NE020_REV)) {
+				cdr_ctrl = nes_read_indexed(nesdev,
+							    NES_IDX_ETH_SERDES_CDR_CONTROL0 +
+							    mac_index * 0x200);
+				nes_write_indexed(nesdev,
+						  NES_IDX_ETH_SERDES_CDR_CONTROL0 +
+						  mac_index * 0x200,
+						  cdr_ctrl & 0xFFF0FFFF);
+			}
+			nesadapter->mac_link_down[mac_index] = 1;
+			list_for_each_entry(nesvnic, &nesadapter->nesvnic_list[mac_index], list) {
+				nes_debug(NES_DBG_PHY, "The Link is Down!!. linkup was %d\n",
+						nesvnic->linkup);
+				if (nesvnic->linkup == 1) {
+					printk(PFX "The Link is now down for port %s, netdev %p.\n",
+							nesvnic->netdev->name, nesvnic->netdev);
+					if (!(netif_queue_stopped(nesvnic->netdev)))
+						netif_stop_queue(nesvnic->netdev);
+					nesvnic->linkup = 0;
+					netif_carrier_off(nesvnic->netdev);
+
+					spin_lock(&nesvnic->port_ibevent_lock);
+					if (nesvnic->of_device_registered) {
+						if (nesdev->iw_status == 1) {
+							nesdev->iw_status = 0;
+							nes_port_ibevent(nesvnic);
+						}
+					}
+					spin_unlock(&nesvnic->port_ibevent_lock);
+				}
+			}
+		}
+		if (nesadapter->phy_type[mac_index] == NES_PHY_TYPE_SFP_D) {
+			nesdev->link_recheck = 1;
+			mod_delayed_work(system_wq, &nesdev->work,
+					 NES_LINK_RECHECK_DELAY);
+		}
+	}
+
+	spin_unlock_irqrestore(&nesadapter->phy_lock, flags);
+
+	nesadapter->mac_sw_state[mac_number] = NES_MAC_SW_IDLE;
+}
+
+void nes_recheck_link_status(struct work_struct *work)
+{
+	unsigned long flags;
+	struct nes_device *nesdev = container_of(work, struct nes_device, work.work);
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	struct nes_vnic *nesvnic;
+	u32 mac_index = nesdev->mac_index;
+	u16 phy_data;
+	u16 temp_phy_data;
+
+	spin_lock_irqsave(&nesadapter->phy_lock, flags);
+
+	/* check link status */
+	nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 1, 0x9003);
+	temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
+
+	nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 3, 0x0021);
+	nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
+	nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 3, 0x0021);
+	phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
+
+	phy_data = (!temp_phy_data && (phy_data == 0x8000)) ? 0x4 : 0x0;
+
+	nes_debug(NES_DBG_PHY, "%s: Phy data = 0x%04X, link was %s.\n",
+		__func__, phy_data,
+		nesadapter->mac_link_down[mac_index] ? "DOWN" : "UP");
+
+	if (phy_data & 0x0004) {
+		nesadapter->mac_link_down[mac_index] = 0;
+		list_for_each_entry(nesvnic, &nesadapter->nesvnic_list[mac_index], list) {
+			if (nesvnic->linkup == 0) {
+				printk(PFX "The Link is now up for port %s, netdev %p.\n",
+						nesvnic->netdev->name, nesvnic->netdev);
+				if (netif_queue_stopped(nesvnic->netdev))
+					netif_start_queue(nesvnic->netdev);
+				nesvnic->linkup = 1;
+				netif_carrier_on(nesvnic->netdev);
+
+				spin_lock(&nesvnic->port_ibevent_lock);
+				if (nesvnic->of_device_registered) {
+					if (nesdev->iw_status == 0) {
+						nesdev->iw_status = 1;
+						nes_port_ibevent(nesvnic);
+					}
+				}
+				spin_unlock(&nesvnic->port_ibevent_lock);
+			}
+		}
+
+	} else {
+		nesadapter->mac_link_down[mac_index] = 1;
+		list_for_each_entry(nesvnic, &nesadapter->nesvnic_list[mac_index], list) {
+			if (nesvnic->linkup == 1) {
+				printk(PFX "The Link is now down for port %s, netdev %p.\n",
+						nesvnic->netdev->name, nesvnic->netdev);
+				if (!(netif_queue_stopped(nesvnic->netdev)))
+					netif_stop_queue(nesvnic->netdev);
+				nesvnic->linkup = 0;
+				netif_carrier_off(nesvnic->netdev);
+
+				spin_lock(&nesvnic->port_ibevent_lock);
+				if (nesvnic->of_device_registered) {
+					if (nesdev->iw_status == 1) {
+						nesdev->iw_status = 0;
+						nes_port_ibevent(nesvnic);
+					}
+				}
+				spin_unlock(&nesvnic->port_ibevent_lock);
+			}
+		}
+	}
+	if (nesdev->link_recheck++ < NES_LINK_RECHECK_MAX)
+		schedule_delayed_work(&nesdev->work, NES_LINK_RECHECK_DELAY);
+	else
+		nesdev->link_recheck = 0;
+
+	spin_unlock_irqrestore(&nesadapter->phy_lock, flags);
+}
+
+
+static void nes_nic_napi_ce_handler(struct nes_device *nesdev, struct nes_hw_nic_cq *cq)
+{
+	struct nes_vnic *nesvnic = container_of(cq, struct nes_vnic, nic_cq);
+
+	napi_schedule(&nesvnic->napi);
+}
+
+
+/* The MAX_RQES_TO_PROCESS defines how many max read requests to complete before
+* getting out of nic_ce_handler
+*/
+#define	MAX_RQES_TO_PROCESS	384
+
+/**
+ * nes_nic_ce_handler
+ */
+void nes_nic_ce_handler(struct nes_device *nesdev, struct nes_hw_nic_cq *cq)
+{
+	u64 u64temp;
+	dma_addr_t bus_address;
+	struct nes_hw_nic *nesnic;
+	struct nes_vnic *nesvnic = container_of(cq, struct nes_vnic, nic_cq);
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	struct nes_hw_nic_rq_wqe *nic_rqe;
+	struct nes_hw_nic_sq_wqe *nic_sqe;
+	struct sk_buff *skb;
+	struct sk_buff *rx_skb;
+	struct nes_rskb_cb *cb;
+	__le16 *wqe_fragment_length;
+	u32 head;
+	u32 cq_size;
+	u32 rx_pkt_size;
+	u32 cqe_count=0;
+	u32 cqe_errv;
+	u32 cqe_misc;
+	u16 wqe_fragment_index = 1;	/* first fragment (0) is used by copy buffer */
+	u16 vlan_tag;
+	u16 pkt_type;
+	u16 rqes_processed = 0;
+	u8 sq_cqes = 0;
+	u8 nes_use_lro = 0;
+
+	head = cq->cq_head;
+	cq_size = cq->cq_size;
+	cq->cqes_pending = 1;
+	if (nesvnic->netdev->features & NETIF_F_LRO)
+		nes_use_lro = 1;
+	do {
+		if (le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_NIC_CQE_MISC_IDX]) &
+				NES_NIC_CQE_VALID) {
+			nesnic = &nesvnic->nic;
+			cqe_misc = le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_NIC_CQE_MISC_IDX]);
+			if (cqe_misc & NES_NIC_CQE_SQ) {
+				sq_cqes++;
+				wqe_fragment_index = 1;
+				nic_sqe = &nesnic->sq_vbase[nesnic->sq_tail];
+				skb = nesnic->tx_skb[nesnic->sq_tail];
+				wqe_fragment_length = (__le16 *)&nic_sqe->wqe_words[NES_NIC_SQ_WQE_LENGTH_0_TAG_IDX];
+				/* bump past the vlan tag */
+				wqe_fragment_length++;
+				if (le16_to_cpu(wqe_fragment_length[wqe_fragment_index]) != 0) {
+					u64temp = (u64) le32_to_cpu(nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_LOW_IDX +
+							wqe_fragment_index * 2]);
+					u64temp += ((u64)le32_to_cpu(nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_HIGH_IDX +
+							wqe_fragment_index * 2])) << 32;
+					bus_address = (dma_addr_t)u64temp;
+					if (test_and_clear_bit(nesnic->sq_tail, nesnic->first_frag_overflow)) {
+						pci_unmap_single(nesdev->pcidev,
+								bus_address,
+								le16_to_cpu(wqe_fragment_length[wqe_fragment_index++]),
+								PCI_DMA_TODEVICE);
+					}
+					for (; wqe_fragment_index < 5; wqe_fragment_index++) {
+						if (wqe_fragment_length[wqe_fragment_index]) {
+							u64temp = le32_to_cpu(nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_LOW_IDX +
+										wqe_fragment_index * 2]);
+							u64temp += ((u64)le32_to_cpu(nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_HIGH_IDX
+										+ wqe_fragment_index * 2])) <<32;
+							bus_address = (dma_addr_t)u64temp;
+							pci_unmap_page(nesdev->pcidev,
+									bus_address,
+									le16_to_cpu(wqe_fragment_length[wqe_fragment_index]),
+									PCI_DMA_TODEVICE);
+						} else
+							break;
+					}
+				}
+				if (skb)
+					dev_kfree_skb_any(skb);
+				nesnic->sq_tail++;
+				nesnic->sq_tail &= nesnic->sq_size-1;
+				if (sq_cqes > 128) {
+					barrier();
+					/* restart the queue if it had been stopped */
+					if (netif_queue_stopped(nesvnic->netdev))
+						netif_wake_queue(nesvnic->netdev);
+					sq_cqes = 0;
+				}
+			} else {
+				rqes_processed ++;
+
+				cq->rx_cqes_completed++;
+				cq->rx_pkts_indicated++;
+				rx_pkt_size = cqe_misc & 0x0000ffff;
+				nic_rqe = &nesnic->rq_vbase[nesnic->rq_tail];
+				/* Get the skb */
+				rx_skb = nesnic->rx_skb[nesnic->rq_tail];
+				nic_rqe = &nesnic->rq_vbase[nesvnic->nic.rq_tail];
+				bus_address = (dma_addr_t)le32_to_cpu(nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_LOW_IDX]);
+				bus_address += ((u64)le32_to_cpu(nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_HIGH_IDX])) << 32;
+				pci_unmap_single(nesdev->pcidev, bus_address,
+						nesvnic->max_frame_size, PCI_DMA_FROMDEVICE);
+				cb = (struct nes_rskb_cb *)&rx_skb->cb[0];
+				cb->busaddr = 0;
+				/* rx_skb->tail = rx_skb->data + rx_pkt_size; */
+				/* rx_skb->len = rx_pkt_size; */
+				rx_skb->len = 0;  /* TODO: see if this is necessary */
+				skb_put(rx_skb, rx_pkt_size);
+				rx_skb->protocol = eth_type_trans(rx_skb, nesvnic->netdev);
+				nesnic->rq_tail++;
+				nesnic->rq_tail &= nesnic->rq_size - 1;
+
+				atomic_inc(&nesvnic->rx_skbs_needed);
+				if (atomic_read(&nesvnic->rx_skbs_needed) > (nesvnic->nic.rq_size>>1)) {
+					nes_write32(nesdev->regs+NES_CQE_ALLOC,
+							cq->cq_number | (cqe_count << 16));
+					/* nesadapter->tune_timer.cq_count += cqe_count; */
+					nesdev->currcq_count += cqe_count;
+					cqe_count = 0;
+					nes_replenish_nic_rq(nesvnic);
+				}
+				pkt_type = (u16)(le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_NIC_CQE_TAG_PKT_TYPE_IDX]));
+				cqe_errv = (cqe_misc & NES_NIC_CQE_ERRV_MASK) >> NES_NIC_CQE_ERRV_SHIFT;
+				rx_skb->ip_summed = CHECKSUM_NONE;
+
+				if ((NES_PKT_TYPE_TCPV4_BITS == (pkt_type & NES_PKT_TYPE_TCPV4_MASK)) ||
+						(NES_PKT_TYPE_UDPV4_BITS == (pkt_type & NES_PKT_TYPE_UDPV4_MASK))) {
+					if ((cqe_errv &
+							(NES_NIC_ERRV_BITS_IPV4_CSUM_ERR | NES_NIC_ERRV_BITS_TCPUDP_CSUM_ERR |
+							NES_NIC_ERRV_BITS_IPH_ERR | NES_NIC_ERRV_BITS_WQE_OVERRUN)) == 0) {
+						if (nesvnic->netdev->features & NETIF_F_RXCSUM)
+							rx_skb->ip_summed = CHECKSUM_UNNECESSARY;
+					} else
+						nes_debug(NES_DBG_CQ, "%s: unsuccessfully checksummed TCP or UDP packet."
+								" errv = 0x%X, pkt_type = 0x%X.\n",
+								nesvnic->netdev->name, cqe_errv, pkt_type);
+
+				} else if ((pkt_type & NES_PKT_TYPE_IPV4_MASK) == NES_PKT_TYPE_IPV4_BITS) {
+					if ((cqe_errv &
+							(NES_NIC_ERRV_BITS_IPV4_CSUM_ERR | NES_NIC_ERRV_BITS_IPH_ERR |
+							NES_NIC_ERRV_BITS_WQE_OVERRUN)) == 0) {
+						if (nesvnic->netdev->features & NETIF_F_RXCSUM) {
+							rx_skb->ip_summed = CHECKSUM_UNNECESSARY;
+							/* nes_debug(NES_DBG_CQ, "%s: Reporting successfully checksummed IPv4 packet.\n",
+								  nesvnic->netdev->name); */
+						}
+					} else
+						nes_debug(NES_DBG_CQ, "%s: unsuccessfully checksummed TCP or UDP packet."
+								" errv = 0x%X, pkt_type = 0x%X.\n",
+								nesvnic->netdev->name, cqe_errv, pkt_type);
+					}
+				/* nes_debug(NES_DBG_CQ, "pkt_type=%x, APBVT_MASK=%x\n",
+							pkt_type, (pkt_type & NES_PKT_TYPE_APBVT_MASK)); */
+
+				if ((pkt_type & NES_PKT_TYPE_APBVT_MASK) == NES_PKT_TYPE_APBVT_BITS) {
+					if (nes_cm_recv(rx_skb, nesvnic->netdev))
+						rx_skb = NULL;
+				}
+				if (rx_skb == NULL)
+					goto skip_rx_indicate0;
+
+
+				if (cqe_misc & NES_NIC_CQE_TAG_VALID) {
+					vlan_tag = (u16)(le32_to_cpu(
+							cq->cq_vbase[head].cqe_words[NES_NIC_CQE_TAG_PKT_TYPE_IDX])
+							>> 16);
+					nes_debug(NES_DBG_CQ, "%s: Reporting stripped VLAN packet. Tag = 0x%04X\n",
+							nesvnic->netdev->name, vlan_tag);
+
+					__vlan_hwaccel_put_tag(rx_skb, htons(ETH_P_8021Q), vlan_tag);
+				}
+				if (nes_use_lro)
+					lro_receive_skb(&nesvnic->lro_mgr, rx_skb, NULL);
+				else
+					netif_receive_skb(rx_skb);
+
+skip_rx_indicate0:
+				;
+				/* nesvnic->netstats.rx_packets++; */
+				/* nesvnic->netstats.rx_bytes += rx_pkt_size; */
+			}
+
+			cq->cq_vbase[head].cqe_words[NES_NIC_CQE_MISC_IDX] = 0;
+			/* Accounting... */
+			cqe_count++;
+			if (++head >= cq_size)
+				head = 0;
+			if (cqe_count == 255) {
+				/* Replenish Nic CQ */
+				nes_write32(nesdev->regs+NES_CQE_ALLOC,
+						cq->cq_number | (cqe_count << 16));
+				/* nesdev->nesadapter->tune_timer.cq_count += cqe_count; */
+				nesdev->currcq_count += cqe_count;
+				cqe_count = 0;
+			}
+
+			if (cq->rx_cqes_completed >= nesvnic->budget)
+				break;
+		} else {
+			cq->cqes_pending = 0;
+			break;
+		}
+
+	} while (1);
+
+	if (nes_use_lro)
+		lro_flush_all(&nesvnic->lro_mgr);
+	if (sq_cqes) {
+		barrier();
+		/* restart the queue if it had been stopped */
+		if (netif_queue_stopped(nesvnic->netdev))
+			netif_wake_queue(nesvnic->netdev);
+	}
+	cq->cq_head = head;
+	/* nes_debug(NES_DBG_CQ, "CQ%u Processed = %u cqes, new head = %u.\n",
+			cq->cq_number, cqe_count, cq->cq_head); */
+	cq->cqe_allocs_pending = cqe_count;
+	if (unlikely(nesadapter->et_use_adaptive_rx_coalesce))
+	{
+		/* nesdev->nesadapter->tune_timer.cq_count += cqe_count; */
+		nesdev->currcq_count += cqe_count;
+		nes_nic_tune_timer(nesdev);
+	}
+	if (atomic_read(&nesvnic->rx_skbs_needed))
+		nes_replenish_nic_rq(nesvnic);
+}
+
+
+
+/**
+ * nes_cqp_ce_handler
+ */
+static void nes_cqp_ce_handler(struct nes_device *nesdev, struct nes_hw_cq *cq)
+{
+	u64 u64temp;
+	unsigned long flags;
+	struct nes_hw_cqp *cqp = NULL;
+	struct nes_cqp_request *cqp_request;
+	struct nes_hw_cqp_wqe *cqp_wqe;
+	u32 head;
+	u32 cq_size;
+	u32 cqe_count=0;
+	u32 error_code;
+	u32 opcode;
+	u32 ctx_index;
+	/* u32 counter; */
+
+	head = cq->cq_head;
+	cq_size = cq->cq_size;
+
+	do {
+		/* process the CQE */
+		/* nes_debug(NES_DBG_CQP, "head=%u cqe_words=%08X\n", head,
+			  le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX])); */
+
+		opcode = le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX]);
+		if (opcode & NES_CQE_VALID) {
+			cqp = &nesdev->cqp;
+
+			error_code = le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_CQE_ERROR_CODE_IDX]);
+			if (error_code) {
+				nes_debug(NES_DBG_CQP, "Bad Completion code for opcode 0x%02X from CQP,"
+						" Major/Minor codes = 0x%04X:%04X.\n",
+						le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX])&0x3f,
+						(u16)(error_code >> 16),
+						(u16)error_code);
+			}
+
+			u64temp = (((u64)(le32_to_cpu(cq->cq_vbase[head].
+					cqe_words[NES_CQE_COMP_COMP_CTX_HIGH_IDX]))) << 32) |
+					((u64)(le32_to_cpu(cq->cq_vbase[head].
+					cqe_words[NES_CQE_COMP_COMP_CTX_LOW_IDX])));
+
+			cqp_request = (struct nes_cqp_request *)(unsigned long)u64temp;
+			if (cqp_request) {
+				if (cqp_request->waiting) {
+					/* nes_debug(NES_DBG_CQP, "%s: Waking up requestor\n"); */
+					cqp_request->major_code = (u16)(error_code >> 16);
+					cqp_request->minor_code = (u16)error_code;
+					barrier();
+					cqp_request->request_done = 1;
+					wake_up(&cqp_request->waitq);
+					nes_put_cqp_request(nesdev, cqp_request);
+				} else {
+					if (cqp_request->callback)
+						cqp_request->cqp_callback(nesdev, cqp_request);
+					nes_free_cqp_request(nesdev, cqp_request);
+				}
+			} else {
+				wake_up(&nesdev->cqp.waitq);
+			}
+
+			cq->cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX] = 0;
+			nes_write32(nesdev->regs + NES_CQE_ALLOC, cq->cq_number | (1 << 16));
+			if (++cqp->sq_tail >= cqp->sq_size)
+				cqp->sq_tail = 0;
+
+			/* Accounting... */
+			cqe_count++;
+			if (++head >= cq_size)
+				head = 0;
+		} else {
+			break;
+		}
+	} while (1);
+	cq->cq_head = head;
+
+	spin_lock_irqsave(&nesdev->cqp.lock, flags);
+	while ((!list_empty(&nesdev->cqp_pending_reqs)) &&
+			((((nesdev->cqp.sq_tail+nesdev->cqp.sq_size)-nesdev->cqp.sq_head) &
+			(nesdev->cqp.sq_size - 1)) != 1)) {
+		cqp_request = list_entry(nesdev->cqp_pending_reqs.next,
+				struct nes_cqp_request, list);
+		list_del_init(&cqp_request->list);
+		head = nesdev->cqp.sq_head++;
+		nesdev->cqp.sq_head &= nesdev->cqp.sq_size-1;
+		cqp_wqe = &nesdev->cqp.sq_vbase[head];
+		memcpy(cqp_wqe, &cqp_request->cqp_wqe, sizeof(*cqp_wqe));
+		barrier();
+
+		opcode = cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX];
+		if ((opcode & NES_CQP_OPCODE_MASK) == NES_CQP_DOWNLOAD_SEGMENT)
+			ctx_index = NES_CQP_WQE_DL_COMP_CTX_LOW_IDX;
+		else
+			ctx_index = NES_CQP_WQE_COMP_CTX_LOW_IDX;
+		cqp_wqe->wqe_words[ctx_index] =
+			cpu_to_le32((u32)((unsigned long)cqp_request));
+		cqp_wqe->wqe_words[ctx_index + 1] =
+			cpu_to_le32((u32)(upper_32_bits((unsigned long)cqp_request)));
+		nes_debug(NES_DBG_CQP, "CQP request %p (opcode 0x%02X) put on CQPs SQ wqe%u.\n",
+				cqp_request, le32_to_cpu(cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX])&0x3f, head);
+		/* Ring doorbell (1 WQEs) */
+		barrier();
+		nes_write32(nesdev->regs+NES_WQE_ALLOC, 0x01800000 | nesdev->cqp.qp_id);
+	}
+	spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
+
+	/* Arm the CCQ */
+	nes_write32(nesdev->regs+NES_CQE_ALLOC, NES_CQE_ALLOC_NOTIFY_NEXT |
+			cq->cq_number);
+	nes_read32(nesdev->regs+NES_CQE_ALLOC);
+}
+
+static u8 *locate_mpa(u8 *pkt, u32 aeq_info)
+{
+	if (aeq_info & NES_AEQE_Q2_DATA_ETHERNET) {
+		/* skip over ethernet header */
+		pkt += ETH_HLEN;
+
+		/* Skip over IP and TCP headers */
+		pkt += 4 * (pkt[0] & 0x0f);
+		pkt += 4 * ((pkt[12] >> 4) & 0x0f);
+	}
+	return pkt;
+}
+
+/* Determine if incoming error pkt is rdma layer */
+static u32 iwarp_opcode(struct nes_qp *nesqp, u32 aeq_info)
+{
+	u8 *pkt;
+	u16 *mpa;
+	u32 opcode = 0xffffffff;
+
+	if (aeq_info & NES_AEQE_Q2_DATA_WRITTEN) {
+		pkt = nesqp->hwqp.q2_vbase + BAD_FRAME_OFFSET;
+		mpa = (u16 *)locate_mpa(pkt, aeq_info);
+		opcode = be16_to_cpu(mpa[1]) & 0xf;
+	}
+
+	return opcode;
+}
+
+/* Build iWARP terminate header */
+static int nes_bld_terminate_hdr(struct nes_qp *nesqp, u16 async_event_id, u32 aeq_info)
+{
+	u8 *pkt = nesqp->hwqp.q2_vbase + BAD_FRAME_OFFSET;
+	u16 ddp_seg_len;
+	int copy_len = 0;
+	u8 is_tagged = 0;
+	u8 flush_code = 0;
+	struct nes_terminate_hdr *termhdr;
+
+	termhdr = (struct nes_terminate_hdr *)nesqp->hwqp.q2_vbase;
+	memset(termhdr, 0, 64);
+
+	if (aeq_info & NES_AEQE_Q2_DATA_WRITTEN) {
+
+		/* Use data from offending packet to fill in ddp & rdma hdrs */
+		pkt = locate_mpa(pkt, aeq_info);
+		ddp_seg_len = be16_to_cpu(*(u16 *)pkt);
+		if (ddp_seg_len) {
+			copy_len = 2;
+			termhdr->hdrct = DDP_LEN_FLAG;
+			if (pkt[2] & 0x80) {
+				is_tagged = 1;
+				if (ddp_seg_len >= TERM_DDP_LEN_TAGGED) {
+					copy_len += TERM_DDP_LEN_TAGGED;
+					termhdr->hdrct |= DDP_HDR_FLAG;
+				}
+			} else {
+				if (ddp_seg_len >= TERM_DDP_LEN_UNTAGGED) {
+					copy_len += TERM_DDP_LEN_UNTAGGED;
+					termhdr->hdrct |= DDP_HDR_FLAG;
+				}
+
+				if (ddp_seg_len >= (TERM_DDP_LEN_UNTAGGED + TERM_RDMA_LEN)) {
+					if ((pkt[3] & RDMA_OPCODE_MASK) == RDMA_READ_REQ_OPCODE) {
+						copy_len += TERM_RDMA_LEN;
+						termhdr->hdrct |= RDMA_HDR_FLAG;
+					}
+				}
+			}
+		}
+	}
+
+	switch (async_event_id) {
+	case NES_AEQE_AEID_AMP_UNALLOCATED_STAG:
+		switch (iwarp_opcode(nesqp, aeq_info)) {
+		case IWARP_OPCODE_WRITE:
+			flush_code = IB_WC_LOC_PROT_ERR;
+			termhdr->layer_etype = (LAYER_DDP << 4) | DDP_TAGGED_BUFFER;
+			termhdr->error_code = DDP_TAGGED_INV_STAG;
+			break;
+		default:
+			flush_code = IB_WC_REM_ACCESS_ERR;
+			termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT;
+			termhdr->error_code = RDMAP_INV_STAG;
+		}
+		break;
+	case NES_AEQE_AEID_AMP_INVALID_STAG:
+		flush_code = IB_WC_REM_ACCESS_ERR;
+		termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT;
+		termhdr->error_code = RDMAP_INV_STAG;
+		break;
+	case NES_AEQE_AEID_AMP_BAD_QP:
+		flush_code = IB_WC_LOC_QP_OP_ERR;
+		termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER;
+		termhdr->error_code = DDP_UNTAGGED_INV_QN;
+		break;
+	case NES_AEQE_AEID_AMP_BAD_STAG_KEY:
+	case NES_AEQE_AEID_AMP_BAD_STAG_INDEX:
+		switch (iwarp_opcode(nesqp, aeq_info)) {
+		case IWARP_OPCODE_SEND_INV:
+		case IWARP_OPCODE_SEND_SE_INV:
+			flush_code = IB_WC_REM_OP_ERR;
+			termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_OP;
+			termhdr->error_code = RDMAP_CANT_INV_STAG;
+			break;
+		default:
+			flush_code = IB_WC_REM_ACCESS_ERR;
+			termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT;
+			termhdr->error_code = RDMAP_INV_STAG;
+		}
+		break;
+	case NES_AEQE_AEID_AMP_BOUNDS_VIOLATION:
+		if (aeq_info & (NES_AEQE_Q2_DATA_ETHERNET | NES_AEQE_Q2_DATA_MPA)) {
+			flush_code = IB_WC_LOC_PROT_ERR;
+			termhdr->layer_etype = (LAYER_DDP << 4) | DDP_TAGGED_BUFFER;
+			termhdr->error_code = DDP_TAGGED_BOUNDS;
+		} else {
+			flush_code = IB_WC_REM_ACCESS_ERR;
+			termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT;
+			termhdr->error_code = RDMAP_INV_BOUNDS;
+		}
+		break;
+	case NES_AEQE_AEID_AMP_RIGHTS_VIOLATION:
+	case NES_AEQE_AEID_AMP_INVALIDATE_NO_REMOTE_ACCESS_RIGHTS:
+	case NES_AEQE_AEID_PRIV_OPERATION_DENIED:
+		flush_code = IB_WC_REM_ACCESS_ERR;
+		termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT;
+		termhdr->error_code = RDMAP_ACCESS;
+		break;
+	case NES_AEQE_AEID_AMP_TO_WRAP:
+		flush_code = IB_WC_REM_ACCESS_ERR;
+		termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT;
+		termhdr->error_code = RDMAP_TO_WRAP;
+		break;
+	case NES_AEQE_AEID_AMP_BAD_PD:
+		switch (iwarp_opcode(nesqp, aeq_info)) {
+		case IWARP_OPCODE_WRITE:
+			flush_code = IB_WC_LOC_PROT_ERR;
+			termhdr->layer_etype = (LAYER_DDP << 4) | DDP_TAGGED_BUFFER;
+			termhdr->error_code = DDP_TAGGED_UNASSOC_STAG;
+			break;
+		case IWARP_OPCODE_SEND_INV:
+		case IWARP_OPCODE_SEND_SE_INV:
+			flush_code = IB_WC_REM_ACCESS_ERR;
+			termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT;
+			termhdr->error_code = RDMAP_CANT_INV_STAG;
+			break;
+		default:
+			flush_code = IB_WC_REM_ACCESS_ERR;
+			termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT;
+			termhdr->error_code = RDMAP_UNASSOC_STAG;
+		}
+		break;
+	case NES_AEQE_AEID_LLP_RECEIVED_MARKER_AND_LENGTH_FIELDS_DONT_MATCH:
+		flush_code = IB_WC_LOC_LEN_ERR;
+		termhdr->layer_etype = (LAYER_MPA << 4) | DDP_LLP;
+		termhdr->error_code = MPA_MARKER;
+		break;
+	case NES_AEQE_AEID_LLP_RECEIVED_MPA_CRC_ERROR:
+		flush_code = IB_WC_GENERAL_ERR;
+		termhdr->layer_etype = (LAYER_MPA << 4) | DDP_LLP;
+		termhdr->error_code = MPA_CRC;
+		break;
+	case NES_AEQE_AEID_LLP_SEGMENT_TOO_LARGE:
+	case NES_AEQE_AEID_LLP_SEGMENT_TOO_SMALL:
+		flush_code = IB_WC_LOC_LEN_ERR;
+		termhdr->layer_etype = (LAYER_DDP << 4) | DDP_CATASTROPHIC;
+		termhdr->error_code = DDP_CATASTROPHIC_LOCAL;
+		break;
+	case NES_AEQE_AEID_DDP_LCE_LOCAL_CATASTROPHIC:
+	case NES_AEQE_AEID_DDP_NO_L_BIT:
+		flush_code = IB_WC_FATAL_ERR;
+		termhdr->layer_etype = (LAYER_DDP << 4) | DDP_CATASTROPHIC;
+		termhdr->error_code = DDP_CATASTROPHIC_LOCAL;
+		break;
+	case NES_AEQE_AEID_DDP_INVALID_MSN_GAP_IN_MSN:
+	case NES_AEQE_AEID_DDP_INVALID_MSN_RANGE_IS_NOT_VALID:
+		flush_code = IB_WC_GENERAL_ERR;
+		termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER;
+		termhdr->error_code = DDP_UNTAGGED_INV_MSN_RANGE;
+		break;
+	case NES_AEQE_AEID_DDP_UBE_DDP_MESSAGE_TOO_LONG_FOR_AVAILABLE_BUFFER:
+		flush_code = IB_WC_LOC_LEN_ERR;
+		termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER;
+		termhdr->error_code = DDP_UNTAGGED_INV_TOO_LONG;
+		break;
+	case NES_AEQE_AEID_DDP_UBE_INVALID_DDP_VERSION:
+		flush_code = IB_WC_GENERAL_ERR;
+		if (is_tagged) {
+			termhdr->layer_etype = (LAYER_DDP << 4) | DDP_TAGGED_BUFFER;
+			termhdr->error_code = DDP_TAGGED_INV_DDP_VER;
+		} else {
+			termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER;
+			termhdr->error_code = DDP_UNTAGGED_INV_DDP_VER;
+		}
+		break;
+	case NES_AEQE_AEID_DDP_UBE_INVALID_MO:
+		flush_code = IB_WC_GENERAL_ERR;
+		termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER;
+		termhdr->error_code = DDP_UNTAGGED_INV_MO;
+		break;
+	case NES_AEQE_AEID_DDP_UBE_INVALID_MSN_NO_BUFFER_AVAILABLE:
+		flush_code = IB_WC_REM_OP_ERR;
+		termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER;
+		termhdr->error_code = DDP_UNTAGGED_INV_MSN_NO_BUF;
+		break;
+	case NES_AEQE_AEID_DDP_UBE_INVALID_QN:
+		flush_code = IB_WC_GENERAL_ERR;
+		termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER;
+		termhdr->error_code = DDP_UNTAGGED_INV_QN;
+		break;
+	case NES_AEQE_AEID_RDMAP_ROE_INVALID_RDMAP_VERSION:
+		flush_code = IB_WC_GENERAL_ERR;
+		termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_OP;
+		termhdr->error_code = RDMAP_INV_RDMAP_VER;
+		break;
+	case NES_AEQE_AEID_RDMAP_ROE_UNEXPECTED_OPCODE:
+		flush_code = IB_WC_LOC_QP_OP_ERR;
+		termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_OP;
+		termhdr->error_code = RDMAP_UNEXPECTED_OP;
+		break;
+	default:
+		flush_code = IB_WC_FATAL_ERR;
+		termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_OP;
+		termhdr->error_code = RDMAP_UNSPECIFIED;
+		break;
+	}
+
+	if (copy_len)
+		memcpy(termhdr + 1, pkt, copy_len);
+
+	if ((flush_code) && ((NES_AEQE_INBOUND_RDMA & aeq_info) == 0)) {
+		if (aeq_info & NES_AEQE_SQ)
+			nesqp->term_sq_flush_code = flush_code;
+		else
+			nesqp->term_rq_flush_code = flush_code;
+	}
+
+	return sizeof(struct nes_terminate_hdr) + copy_len;
+}
+
+static void nes_terminate_connection(struct nes_device *nesdev, struct nes_qp *nesqp,
+		 struct nes_hw_aeqe *aeqe, enum ib_event_type eventtype)
+{
+	u64 context;
+	unsigned long flags;
+	u32 aeq_info;
+	u16 async_event_id;
+	u8 tcp_state;
+	u8 iwarp_state;
+	u32 termlen = 0;
+	u32 mod_qp_flags = NES_CQP_QP_IWARP_STATE_TERMINATE |
+			   NES_CQP_QP_TERM_DONT_SEND_FIN;
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+
+	if (nesqp->term_flags & NES_TERM_SENT)
+		return; /* Sanity check */
+
+	aeq_info = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_MISC_IDX]);
+	tcp_state = (aeq_info & NES_AEQE_TCP_STATE_MASK) >> NES_AEQE_TCP_STATE_SHIFT;
+	iwarp_state = (aeq_info & NES_AEQE_IWARP_STATE_MASK) >> NES_AEQE_IWARP_STATE_SHIFT;
+	async_event_id = (u16)aeq_info;
+
+	context = (unsigned long)nesadapter->qp_table[le32_to_cpu(
+		aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]) - NES_FIRST_QPN];
+	if (!context) {
+		WARN_ON(!context);
+		return;
+	}
+
+	nesqp = (struct nes_qp *)(unsigned long)context;
+	spin_lock_irqsave(&nesqp->lock, flags);
+	nesqp->hw_iwarp_state = iwarp_state;
+	nesqp->hw_tcp_state = tcp_state;
+	nesqp->last_aeq = async_event_id;
+	nesqp->terminate_eventtype = eventtype;
+	spin_unlock_irqrestore(&nesqp->lock, flags);
+
+	if (nesadapter->send_term_ok)
+		termlen = nes_bld_terminate_hdr(nesqp, async_event_id, aeq_info);
+	else
+		mod_qp_flags |= NES_CQP_QP_TERM_DONT_SEND_TERM_MSG;
+
+	if (!nesdev->iw_status)  {
+		nesqp->term_flags = NES_TERM_DONE;
+		nes_hw_modify_qp(nesdev, nesqp, NES_CQP_QP_IWARP_STATE_ERROR, 0, 0);
+		nes_cm_disconn(nesqp);
+	} else {
+		nes_terminate_start_timer(nesqp);
+		nesqp->term_flags |= NES_TERM_SENT;
+		nes_hw_modify_qp(nesdev, nesqp, mod_qp_flags, termlen, 0);
+	}
+}
+
+static void nes_terminate_send_fin(struct nes_device *nesdev,
+			  struct nes_qp *nesqp, struct nes_hw_aeqe *aeqe)
+{
+	u32 aeq_info;
+	u16 async_event_id;
+	u8 tcp_state;
+	u8 iwarp_state;
+	unsigned long flags;
+
+	aeq_info = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_MISC_IDX]);
+	tcp_state = (aeq_info & NES_AEQE_TCP_STATE_MASK) >> NES_AEQE_TCP_STATE_SHIFT;
+	iwarp_state = (aeq_info & NES_AEQE_IWARP_STATE_MASK) >> NES_AEQE_IWARP_STATE_SHIFT;
+	async_event_id = (u16)aeq_info;
+
+	spin_lock_irqsave(&nesqp->lock, flags);
+	nesqp->hw_iwarp_state = iwarp_state;
+	nesqp->hw_tcp_state = tcp_state;
+	nesqp->last_aeq = async_event_id;
+	spin_unlock_irqrestore(&nesqp->lock, flags);
+
+	/* Send the fin only */
+	nes_hw_modify_qp(nesdev, nesqp, NES_CQP_QP_IWARP_STATE_TERMINATE |
+		NES_CQP_QP_TERM_DONT_SEND_TERM_MSG, 0, 0);
+}
+
+/* Cleanup after a terminate sent or received */
+static void nes_terminate_done(struct nes_qp *nesqp, int timeout_occurred)
+{
+	u32 next_iwarp_state = NES_CQP_QP_IWARP_STATE_ERROR;
+	unsigned long flags;
+	struct nes_vnic *nesvnic = to_nesvnic(nesqp->ibqp.device);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	u8 first_time = 0;
+
+	spin_lock_irqsave(&nesqp->lock, flags);
+	if (nesqp->hte_added) {
+		nesqp->hte_added = 0;
+		next_iwarp_state |= NES_CQP_QP_DEL_HTE;
+	}
+
+	first_time = (nesqp->term_flags & NES_TERM_DONE) == 0;
+	nesqp->term_flags |= NES_TERM_DONE;
+	spin_unlock_irqrestore(&nesqp->lock, flags);
+
+	/* Make sure we go through this only once */
+	if (first_time) {
+		if (timeout_occurred == 0)
+			del_timer(&nesqp->terminate_timer);
+		else
+			next_iwarp_state |= NES_CQP_QP_RESET;
+
+		nes_hw_modify_qp(nesdev, nesqp, next_iwarp_state, 0, 0);
+		nes_cm_disconn(nesqp);
+	}
+}
+
+static void nes_terminate_received(struct nes_device *nesdev,
+				struct nes_qp *nesqp, struct nes_hw_aeqe *aeqe)
+{
+	u32 aeq_info;
+	u8 *pkt;
+	u32 *mpa;
+	u8 ddp_ctl;
+	u8 rdma_ctl;
+	u16 aeq_id = 0;
+
+	aeq_info = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_MISC_IDX]);
+	if (aeq_info & NES_AEQE_Q2_DATA_WRITTEN) {
+		/* Terminate is not a performance path so the silicon */
+		/* did not validate the frame - do it now */
+		pkt = nesqp->hwqp.q2_vbase + BAD_FRAME_OFFSET;
+		mpa = (u32 *)locate_mpa(pkt, aeq_info);
+		ddp_ctl = (be32_to_cpu(mpa[0]) >> 8) & 0xff;
+		rdma_ctl = be32_to_cpu(mpa[0]) & 0xff;
+		if ((ddp_ctl & 0xc0) != 0x40)
+			aeq_id = NES_AEQE_AEID_DDP_LCE_LOCAL_CATASTROPHIC;
+		else if ((ddp_ctl & 0x03) != 1)
+			aeq_id = NES_AEQE_AEID_DDP_UBE_INVALID_DDP_VERSION;
+		else if (be32_to_cpu(mpa[2]) != 2)
+			aeq_id = NES_AEQE_AEID_DDP_UBE_INVALID_QN;
+		else if (be32_to_cpu(mpa[3]) != 1)
+			aeq_id = NES_AEQE_AEID_DDP_INVALID_MSN_GAP_IN_MSN;
+		else if (be32_to_cpu(mpa[4]) != 0)
+			aeq_id = NES_AEQE_AEID_DDP_UBE_INVALID_MO;
+		else if ((rdma_ctl & 0xc0) != 0x40)
+			aeq_id = NES_AEQE_AEID_RDMAP_ROE_INVALID_RDMAP_VERSION;
+
+		if (aeq_id) {
+			/* Bad terminate recvd - send back a terminate */
+			aeq_info = (aeq_info & 0xffff0000) | aeq_id;
+			aeqe->aeqe_words[NES_AEQE_MISC_IDX] = cpu_to_le32(aeq_info);
+			nes_terminate_connection(nesdev, nesqp, aeqe, IB_EVENT_QP_FATAL);
+			return;
+		}
+	}
+
+	nesqp->term_flags |= NES_TERM_RCVD;
+	nesqp->terminate_eventtype = IB_EVENT_QP_FATAL;
+	nes_terminate_start_timer(nesqp);
+	nes_terminate_send_fin(nesdev, nesqp, aeqe);
+}
+
+/* Timeout routine in case terminate fails to complete */
+void nes_terminate_timeout(unsigned long context)
+{
+	struct nes_qp *nesqp = (struct nes_qp *)(unsigned long)context;
+
+	nes_terminate_done(nesqp, 1);
+}
+
+/* Set a timer in case hw cannot complete the terminate sequence */
+static void nes_terminate_start_timer(struct nes_qp *nesqp)
+{
+	mod_timer(&nesqp->terminate_timer, (jiffies + HZ));
+}
+
+/**
+ * nes_process_iwarp_aeqe
+ */
+static void nes_process_iwarp_aeqe(struct nes_device *nesdev,
+				   struct nes_hw_aeqe *aeqe)
+{
+	u64 context;
+	unsigned long flags;
+	struct nes_qp *nesqp;
+	struct nes_hw_cq *hw_cq;
+	struct nes_cq *nescq;
+	int resource_allocated;
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	u32 aeq_info;
+	u32 next_iwarp_state = 0;
+	u32 aeqe_cq_id;
+	u16 async_event_id;
+	u8 tcp_state;
+	u8 iwarp_state;
+	struct ib_event ibevent;
+
+	nes_debug(NES_DBG_AEQ, "\n");
+	aeq_info = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_MISC_IDX]);
+	if ((NES_AEQE_INBOUND_RDMA & aeq_info) || (!(NES_AEQE_QP & aeq_info))) {
+		context  = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_CTXT_LOW_IDX]);
+		context += ((u64)le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_CTXT_HIGH_IDX])) << 32;
+	} else {
+		context = (unsigned long)nesadapter->qp_table[le32_to_cpu(
+						aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]) - NES_FIRST_QPN];
+		BUG_ON(!context);
+	}
+
+	/* context is nesqp unless async_event_id == CQ ERROR */
+	nesqp = (struct nes_qp *)(unsigned long)context;
+	async_event_id = (u16)aeq_info;
+	tcp_state = (aeq_info & NES_AEQE_TCP_STATE_MASK) >> NES_AEQE_TCP_STATE_SHIFT;
+	iwarp_state = (aeq_info & NES_AEQE_IWARP_STATE_MASK) >> NES_AEQE_IWARP_STATE_SHIFT;
+	nes_debug(NES_DBG_AEQ, "aeid = 0x%04X, qp-cq id = %d, aeqe = %p,"
+			" Tcp state = %s, iWARP state = %s\n",
+			async_event_id,
+			le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]), aeqe,
+			nes_tcp_state_str[tcp_state], nes_iwarp_state_str[iwarp_state]);
+
+	aeqe_cq_id = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]);
+	if (aeq_info & NES_AEQE_QP) {
+		if (!nes_is_resource_allocated(nesadapter,
+				nesadapter->allocated_qps,
+				aeqe_cq_id))
+			return;
+	}
+
+	switch (async_event_id) {
+		case NES_AEQE_AEID_LLP_FIN_RECEIVED:
+			if (nesqp->term_flags)
+				return; /* Ignore it, wait for close complete */
+
+			if (atomic_inc_return(&nesqp->close_timer_started) == 1) {
+				if ((tcp_state == NES_AEQE_TCP_STATE_CLOSE_WAIT) &&
+					(nesqp->ibqp_state == IB_QPS_RTS)) {
+					spin_lock_irqsave(&nesqp->lock, flags);
+					nesqp->hw_iwarp_state = iwarp_state;
+					nesqp->hw_tcp_state = tcp_state;
+					nesqp->last_aeq = async_event_id;
+					next_iwarp_state = NES_CQP_QP_IWARP_STATE_CLOSING;
+					nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_CLOSING;
+					spin_unlock_irqrestore(&nesqp->lock, flags);
+					nes_hw_modify_qp(nesdev, nesqp, next_iwarp_state, 0, 0);
+					nes_cm_disconn(nesqp);
+				}
+				nesqp->cm_id->add_ref(nesqp->cm_id);
+				schedule_nes_timer(nesqp->cm_node, (struct sk_buff *)nesqp,
+						NES_TIMER_TYPE_CLOSE, 1, 0);
+				nes_debug(NES_DBG_AEQ, "QP%u Not decrementing QP refcount (%d),"
+						" need ae to finish up, original_last_aeq = 0x%04X."
+						" last_aeq = 0x%04X, scheduling timer. TCP state = %d\n",
+						nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount),
+						async_event_id, nesqp->last_aeq, tcp_state);
+			}
+			break;
+		case NES_AEQE_AEID_LLP_CLOSE_COMPLETE:
+			spin_lock_irqsave(&nesqp->lock, flags);
+			nesqp->hw_iwarp_state = iwarp_state;
+			nesqp->hw_tcp_state = tcp_state;
+			nesqp->last_aeq = async_event_id;
+			spin_unlock_irqrestore(&nesqp->lock, flags);
+			nes_cm_disconn(nesqp);
+			break;
+
+		case NES_AEQE_AEID_RESET_SENT:
+			tcp_state = NES_AEQE_TCP_STATE_CLOSED;
+			spin_lock_irqsave(&nesqp->lock, flags);
+			nesqp->hw_iwarp_state = iwarp_state;
+			nesqp->hw_tcp_state = tcp_state;
+			nesqp->last_aeq = async_event_id;
+			nesqp->hte_added = 0;
+			spin_unlock_irqrestore(&nesqp->lock, flags);
+			next_iwarp_state = NES_CQP_QP_IWARP_STATE_ERROR | NES_CQP_QP_DEL_HTE;
+			nes_hw_modify_qp(nesdev, nesqp, next_iwarp_state, 0, 0);
+			nes_cm_disconn(nesqp);
+			break;
+
+		case NES_AEQE_AEID_LLP_CONNECTION_RESET:
+			if (atomic_read(&nesqp->close_timer_started))
+				return;
+			spin_lock_irqsave(&nesqp->lock, flags);
+			nesqp->hw_iwarp_state = iwarp_state;
+			nesqp->hw_tcp_state = tcp_state;
+			nesqp->last_aeq = async_event_id;
+			spin_unlock_irqrestore(&nesqp->lock, flags);
+			nes_cm_disconn(nesqp);
+			break;
+
+		case NES_AEQE_AEID_TERMINATE_SENT:
+			nes_terminate_send_fin(nesdev, nesqp, aeqe);
+			break;
+
+		case NES_AEQE_AEID_LLP_TERMINATE_RECEIVED:
+			nes_terminate_received(nesdev, nesqp, aeqe);
+			break;
+
+		case NES_AEQE_AEID_AMP_BAD_STAG_KEY:
+		case NES_AEQE_AEID_AMP_BAD_STAG_INDEX:
+		case NES_AEQE_AEID_AMP_UNALLOCATED_STAG:
+		case NES_AEQE_AEID_AMP_INVALID_STAG:
+		case NES_AEQE_AEID_AMP_RIGHTS_VIOLATION:
+		case NES_AEQE_AEID_AMP_INVALIDATE_NO_REMOTE_ACCESS_RIGHTS:
+		case NES_AEQE_AEID_PRIV_OPERATION_DENIED:
+		case NES_AEQE_AEID_DDP_UBE_DDP_MESSAGE_TOO_LONG_FOR_AVAILABLE_BUFFER:
+		case NES_AEQE_AEID_AMP_BOUNDS_VIOLATION:
+		case NES_AEQE_AEID_AMP_TO_WRAP:
+			printk(KERN_ERR PFX "QP[%u] async_event_id=0x%04X IB_EVENT_QP_ACCESS_ERR\n",
+					nesqp->hwqp.qp_id, async_event_id);
+			nes_terminate_connection(nesdev, nesqp, aeqe, IB_EVENT_QP_ACCESS_ERR);
+			break;
+
+		case NES_AEQE_AEID_LLP_SEGMENT_TOO_LARGE:
+		case NES_AEQE_AEID_LLP_SEGMENT_TOO_SMALL:
+		case NES_AEQE_AEID_DDP_UBE_INVALID_MO:
+		case NES_AEQE_AEID_DDP_UBE_INVALID_QN:
+			if (iwarp_opcode(nesqp, aeq_info) > IWARP_OPCODE_TERM) {
+				aeq_info &= 0xffff0000;
+				aeq_info |= NES_AEQE_AEID_RDMAP_ROE_UNEXPECTED_OPCODE;
+				aeqe->aeqe_words[NES_AEQE_MISC_IDX] = cpu_to_le32(aeq_info);
+			}
+
+		case NES_AEQE_AEID_RDMAP_ROE_BAD_LLP_CLOSE:
+		case NES_AEQE_AEID_LLP_TOO_MANY_RETRIES:
+		case NES_AEQE_AEID_DDP_UBE_INVALID_MSN_NO_BUFFER_AVAILABLE:
+		case NES_AEQE_AEID_LLP_RECEIVED_MPA_CRC_ERROR:
+		case NES_AEQE_AEID_AMP_BAD_QP:
+		case NES_AEQE_AEID_LLP_RECEIVED_MARKER_AND_LENGTH_FIELDS_DONT_MATCH:
+		case NES_AEQE_AEID_DDP_LCE_LOCAL_CATASTROPHIC:
+		case NES_AEQE_AEID_DDP_NO_L_BIT:
+		case NES_AEQE_AEID_DDP_INVALID_MSN_GAP_IN_MSN:
+		case NES_AEQE_AEID_DDP_INVALID_MSN_RANGE_IS_NOT_VALID:
+		case NES_AEQE_AEID_DDP_UBE_INVALID_DDP_VERSION:
+		case NES_AEQE_AEID_RDMAP_ROE_INVALID_RDMAP_VERSION:
+		case NES_AEQE_AEID_RDMAP_ROE_UNEXPECTED_OPCODE:
+		case NES_AEQE_AEID_AMP_BAD_PD:
+		case NES_AEQE_AEID_AMP_FASTREG_SHARED:
+		case NES_AEQE_AEID_AMP_FASTREG_VALID_STAG:
+		case NES_AEQE_AEID_AMP_FASTREG_MW_STAG:
+		case NES_AEQE_AEID_AMP_FASTREG_INVALID_RIGHTS:
+		case NES_AEQE_AEID_AMP_FASTREG_PBL_TABLE_OVERFLOW:
+		case NES_AEQE_AEID_AMP_FASTREG_INVALID_LENGTH:
+		case NES_AEQE_AEID_AMP_INVALIDATE_SHARED:
+		case NES_AEQE_AEID_AMP_INVALIDATE_MR_WITH_BOUND_WINDOWS:
+		case NES_AEQE_AEID_AMP_MWBIND_VALID_STAG:
+		case NES_AEQE_AEID_AMP_MWBIND_OF_MR_STAG:
+		case NES_AEQE_AEID_AMP_MWBIND_TO_ZERO_BASED_STAG:
+		case NES_AEQE_AEID_AMP_MWBIND_TO_MW_STAG:
+		case NES_AEQE_AEID_AMP_MWBIND_INVALID_RIGHTS:
+		case NES_AEQE_AEID_AMP_MWBIND_INVALID_BOUNDS:
+		case NES_AEQE_AEID_AMP_MWBIND_TO_INVALID_PARENT:
+		case NES_AEQE_AEID_AMP_MWBIND_BIND_DISABLED:
+		case NES_AEQE_AEID_BAD_CLOSE:
+		case NES_AEQE_AEID_RDMA_READ_WHILE_ORD_ZERO:
+		case NES_AEQE_AEID_STAG_ZERO_INVALID:
+		case NES_AEQE_AEID_ROE_INVALID_RDMA_READ_REQUEST:
+		case NES_AEQE_AEID_ROE_INVALID_RDMA_WRITE_OR_READ_RESP:
+			printk(KERN_ERR PFX "QP[%u] async_event_id=0x%04X IB_EVENT_QP_FATAL\n",
+					nesqp->hwqp.qp_id, async_event_id);
+			print_ip(nesqp->cm_node);
+			if (!atomic_read(&nesqp->close_timer_started))
+				nes_terminate_connection(nesdev, nesqp, aeqe, IB_EVENT_QP_FATAL);
+			break;
+
+		case NES_AEQE_AEID_CQ_OPERATION_ERROR:
+			context <<= 1;
+			nes_debug(NES_DBG_AEQ, "Processing an NES_AEQE_AEID_CQ_OPERATION_ERROR event on CQ%u, %p\n",
+					le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]), (void *)(unsigned long)context);
+			resource_allocated = nes_is_resource_allocated(nesadapter, nesadapter->allocated_cqs,
+					le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]));
+			if (resource_allocated) {
+				printk(KERN_ERR PFX "%s: Processing an NES_AEQE_AEID_CQ_OPERATION_ERROR event on CQ%u\n",
+						__func__, le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]));
+				hw_cq = (struct nes_hw_cq *)(unsigned long)context;
+				if (hw_cq) {
+					nescq = container_of(hw_cq, struct nes_cq, hw_cq);
+					if (nescq->ibcq.event_handler) {
+						ibevent.device = nescq->ibcq.device;
+						ibevent.event = IB_EVENT_CQ_ERR;
+						ibevent.element.cq = &nescq->ibcq;
+						nescq->ibcq.event_handler(&ibevent, nescq->ibcq.cq_context);
+					}
+				}
+			}
+			break;
+
+		default:
+			nes_debug(NES_DBG_AEQ, "Processing an iWARP related AE for QP, misc = 0x%04X\n",
+					async_event_id);
+			break;
+	}
+
+}
+
+/**
+ * nes_iwarp_ce_handler
+ */
+void nes_iwarp_ce_handler(struct nes_device *nesdev, struct nes_hw_cq *hw_cq)
+{
+	struct nes_cq *nescq = container_of(hw_cq, struct nes_cq, hw_cq);
+
+	/* nes_debug(NES_DBG_CQ, "Processing completion event for iWARP CQ%u.\n",
+			nescq->hw_cq.cq_number); */
+	nes_write32(nesdev->regs+NES_CQ_ACK, nescq->hw_cq.cq_number);
+
+	if (nescq->ibcq.comp_handler)
+		nescq->ibcq.comp_handler(&nescq->ibcq, nescq->ibcq.cq_context);
+
+	return;
+}
+
+
+/**
+ * nes_manage_apbvt()
+ */
+int nes_manage_apbvt(struct nes_vnic *nesvnic, u32 accel_local_port,
+		u32 nic_index, u32 add_port)
+{
+	struct nes_device *nesdev = nesvnic->nesdev;
+	struct nes_hw_cqp_wqe *cqp_wqe;
+	struct nes_cqp_request *cqp_request;
+	int ret = 0;
+	u16 major_code;
+
+	/* Send manage APBVT request to CQP */
+	cqp_request = nes_get_cqp_request(nesdev);
+	if (cqp_request == NULL) {
+		nes_debug(NES_DBG_QP, "Failed to get a cqp_request.\n");
+		return -ENOMEM;
+	}
+	cqp_request->waiting = 1;
+	cqp_wqe = &cqp_request->cqp_wqe;
+
+	nes_debug(NES_DBG_QP, "%s APBV for local port=%u(0x%04x), nic_index=%u\n",
+			(add_port == NES_MANAGE_APBVT_ADD) ? "ADD" : "DEL",
+			accel_local_port, accel_local_port, nic_index);
+
+	nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, (NES_CQP_MANAGE_APBVT |
+			((add_port == NES_MANAGE_APBVT_ADD) ? NES_CQP_APBVT_ADD : 0)));
+	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX,
+			((nic_index << NES_CQP_APBVT_NIC_SHIFT) | accel_local_port));
+
+	nes_debug(NES_DBG_QP, "Waiting for CQP completion for APBVT.\n");
+
+	atomic_set(&cqp_request->refcount, 2);
+	nes_post_cqp_request(nesdev, cqp_request);
+
+	if (add_port == NES_MANAGE_APBVT_ADD)
+		ret = wait_event_timeout(cqp_request->waitq, (cqp_request->request_done != 0),
+				NES_EVENT_TIMEOUT);
+	nes_debug(NES_DBG_QP, "Completed, ret=%u,  CQP Major:Minor codes = 0x%04X:0x%04X\n",
+			ret, cqp_request->major_code, cqp_request->minor_code);
+	major_code = cqp_request->major_code;
+
+	nes_put_cqp_request(nesdev, cqp_request);
+
+	if (!ret)
+		return -ETIME;
+	else if (major_code)
+		return -EIO;
+	else
+		return 0;
+}
+
+
+/**
+ * nes_manage_arp_cache
+ */
+void nes_manage_arp_cache(struct net_device *netdev, unsigned char *mac_addr,
+		u32 ip_addr, u32 action)
+{
+	struct nes_hw_cqp_wqe *cqp_wqe;
+	struct nes_vnic *nesvnic = netdev_priv(netdev);
+	struct nes_device *nesdev;
+	struct nes_cqp_request *cqp_request;
+	int arp_index;
+
+	nesdev = nesvnic->nesdev;
+	arp_index = nes_arp_table(nesdev, ip_addr, mac_addr, action);
+	if (arp_index == -1) {
+		return;
+	}
+
+	/* update the ARP entry */
+	cqp_request = nes_get_cqp_request(nesdev);
+	if (cqp_request == NULL) {
+		nes_debug(NES_DBG_NETDEV, "Failed to get a cqp_request.\n");
+		return;
+	}
+	cqp_request->waiting = 0;
+	cqp_wqe = &cqp_request->cqp_wqe;
+	nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+
+	cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(
+			NES_CQP_MANAGE_ARP_CACHE | NES_CQP_ARP_PERM);
+	cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] |= cpu_to_le32(
+			(u32)PCI_FUNC(nesdev->pcidev->devfn) << NES_CQP_ARP_AEQ_INDEX_SHIFT);
+	cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32(arp_index);
+
+	if (action == NES_ARP_ADD) {
+		cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] |= cpu_to_le32(NES_CQP_ARP_VALID);
+		cqp_wqe->wqe_words[NES_CQP_ARP_WQE_MAC_ADDR_LOW_IDX] = cpu_to_le32(
+				(((u32)mac_addr[2]) << 24) | (((u32)mac_addr[3]) << 16) |
+				(((u32)mac_addr[4]) << 8)  | (u32)mac_addr[5]);
+		cqp_wqe->wqe_words[NES_CQP_ARP_WQE_MAC_HIGH_IDX] = cpu_to_le32(
+				(((u32)mac_addr[0]) << 16) | (u32)mac_addr[1]);
+	} else {
+		cqp_wqe->wqe_words[NES_CQP_ARP_WQE_MAC_ADDR_LOW_IDX] = 0;
+		cqp_wqe->wqe_words[NES_CQP_ARP_WQE_MAC_HIGH_IDX] = 0;
+	}
+
+	nes_debug(NES_DBG_NETDEV, "Not waiting for CQP, cqp.sq_head=%u, cqp.sq_tail=%u\n",
+			nesdev->cqp.sq_head, nesdev->cqp.sq_tail);
+
+	atomic_set(&cqp_request->refcount, 1);
+	nes_post_cqp_request(nesdev, cqp_request);
+}
+
+
+/**
+ * flush_wqes
+ */
+void flush_wqes(struct nes_device *nesdev, struct nes_qp *nesqp,
+		u32 which_wq, u32 wait_completion)
+{
+	struct nes_cqp_request *cqp_request;
+	struct nes_hw_cqp_wqe *cqp_wqe;
+	u32 sq_code = (NES_IWARP_CQE_MAJOR_FLUSH << 16) | NES_IWARP_CQE_MINOR_FLUSH;
+	u32 rq_code = (NES_IWARP_CQE_MAJOR_FLUSH << 16) | NES_IWARP_CQE_MINOR_FLUSH;
+	int ret;
+
+	cqp_request = nes_get_cqp_request(nesdev);
+	if (cqp_request == NULL) {
+		nes_debug(NES_DBG_QP, "Failed to get a cqp_request.\n");
+		return;
+	}
+	if (wait_completion) {
+		cqp_request->waiting = 1;
+		atomic_set(&cqp_request->refcount, 2);
+	} else {
+		cqp_request->waiting = 0;
+	}
+	cqp_wqe = &cqp_request->cqp_wqe;
+	nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+
+	/* If wqe in error was identified, set code to be put into cqe */
+	if ((nesqp->term_sq_flush_code) && (which_wq & NES_CQP_FLUSH_SQ)) {
+		which_wq |= NES_CQP_FLUSH_MAJ_MIN;
+		sq_code = (CQE_MAJOR_DRV << 16) | nesqp->term_sq_flush_code;
+		nesqp->term_sq_flush_code = 0;
+	}
+
+	if ((nesqp->term_rq_flush_code) && (which_wq & NES_CQP_FLUSH_RQ)) {
+		which_wq |= NES_CQP_FLUSH_MAJ_MIN;
+		rq_code = (CQE_MAJOR_DRV << 16) | nesqp->term_rq_flush_code;
+		nesqp->term_rq_flush_code = 0;
+	}
+
+	if (which_wq & NES_CQP_FLUSH_MAJ_MIN) {
+		cqp_wqe->wqe_words[NES_CQP_QP_WQE_FLUSH_SQ_CODE] = cpu_to_le32(sq_code);
+		cqp_wqe->wqe_words[NES_CQP_QP_WQE_FLUSH_RQ_CODE] = cpu_to_le32(rq_code);
+	}
+
+	cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] =
+			cpu_to_le32(NES_CQP_FLUSH_WQES | which_wq);
+	cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32(nesqp->hwqp.qp_id);
+
+	nes_post_cqp_request(nesdev, cqp_request);
+
+	if (wait_completion) {
+		/* Wait for CQP */
+		ret = wait_event_timeout(cqp_request->waitq, (cqp_request->request_done != 0),
+				NES_EVENT_TIMEOUT);
+		nes_debug(NES_DBG_QP, "Flush SQ QP WQEs completed, ret=%u,"
+				" CQP Major:Minor codes = 0x%04X:0x%04X\n",
+				ret, cqp_request->major_code, cqp_request->minor_code);
+		nes_put_cqp_request(nesdev, cqp_request);
+	}
+}
diff --git a/drivers/infiniband/hw/nes/nes_hw.h b/drivers/infiniband/hw/nes/nes_hw.h
new file mode 100644
index 0000000..d748e4b
--- /dev/null
+++ b/drivers/infiniband/hw/nes/nes_hw.h
@@ -0,0 +1,1392 @@
+/*
+* Copyright (c) 2006 - 2011 Intel Corporation.  All rights reserved.
+*
+* This software is available to you under a choice of one of two
+* licenses.  You may choose to be licensed under the terms of the GNU
+* General Public License (GPL) Version 2, available from the file
+* COPYING in the main directory of this source tree, or the
+* OpenIB.org BSD license below:
+*
+*     Redistribution and use in source and binary forms, with or
+*     without modification, are permitted provided that the following
+*     conditions are met:
+*
+*      - Redistributions of source code must retain the above
+*        copyright notice, this list of conditions and the following
+*        disclaimer.
+*
+*      - Redistributions in binary form must reproduce the above
+*        copyright notice, this list of conditions and the following
+*        disclaimer in the documentation and/or other materials
+*        provided with the distribution.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+
+#ifndef __NES_HW_H
+#define __NES_HW_H
+
+#include <linux/inet_lro.h>
+
+#define NES_PHY_TYPE_CX4       1
+#define NES_PHY_TYPE_1G        2
+#define NES_PHY_TYPE_ARGUS     4
+#define NES_PHY_TYPE_PUMA_1G   5
+#define NES_PHY_TYPE_PUMA_10G  6
+#define NES_PHY_TYPE_GLADIUS   7
+#define NES_PHY_TYPE_SFP_D     8
+#define NES_PHY_TYPE_KR	       9
+
+#define NES_MULTICAST_PF_MAX 8
+#define NES_A0 3
+
+#define NES_ENABLE_PAU 0x07000001
+#define NES_DISABLE_PAU 0x07000000
+#define NES_PAU_COUNTER 10
+#define NES_CQP_OPCODE_MASK 0x3f
+
+enum pci_regs {
+	NES_INT_STAT = 0x0000,
+	NES_INT_MASK = 0x0004,
+	NES_INT_PENDING = 0x0008,
+	NES_INTF_INT_STAT = 0x000C,
+	NES_INTF_INT_MASK = 0x0010,
+	NES_TIMER_STAT = 0x0014,
+	NES_PERIODIC_CONTROL = 0x0018,
+	NES_ONE_SHOT_CONTROL = 0x001C,
+	NES_EEPROM_COMMAND = 0x0020,
+	NES_EEPROM_DATA = 0x0024,
+	NES_FLASH_COMMAND = 0x0028,
+	NES_FLASH_DATA  = 0x002C,
+	NES_SOFTWARE_RESET = 0x0030,
+	NES_CQ_ACK = 0x0034,
+	NES_WQE_ALLOC = 0x0040,
+	NES_CQE_ALLOC = 0x0044,
+	NES_AEQ_ALLOC = 0x0048
+};
+
+enum indexed_regs {
+	NES_IDX_CREATE_CQP_LOW = 0x0000,
+	NES_IDX_CREATE_CQP_HIGH = 0x0004,
+	NES_IDX_QP_CONTROL = 0x0040,
+	NES_IDX_FLM_CONTROL = 0x0080,
+	NES_IDX_INT_CPU_STATUS = 0x00a0,
+	NES_IDX_GPR_TRIGGER = 0x00bc,
+	NES_IDX_GPIO_CONTROL = 0x00f0,
+	NES_IDX_GPIO_DATA = 0x00f4,
+	NES_IDX_GPR2 = 0x010c,
+	NES_IDX_TCP_CONFIG0 = 0x01e4,
+	NES_IDX_TCP_TIMER_CONFIG = 0x01ec,
+	NES_IDX_TCP_NOW = 0x01f0,
+	NES_IDX_QP_MAX_CFG_SIZES = 0x0200,
+	NES_IDX_QP_CTX_SIZE = 0x0218,
+	NES_IDX_TCP_TIMER_SIZE0 = 0x0238,
+	NES_IDX_TCP_TIMER_SIZE1 = 0x0240,
+	NES_IDX_ARP_CACHE_SIZE = 0x0258,
+	NES_IDX_CQ_CTX_SIZE = 0x0260,
+	NES_IDX_MRT_SIZE = 0x0278,
+	NES_IDX_PBL_REGION_SIZE = 0x0280,
+	NES_IDX_IRRQ_COUNT = 0x02b0,
+	NES_IDX_RX_WINDOW_BUFFER_PAGE_TABLE_SIZE = 0x02f0,
+	NES_IDX_RX_WINDOW_BUFFER_SIZE = 0x0300,
+	NES_IDX_DST_IP_ADDR = 0x0400,
+	NES_IDX_PCIX_DIAG = 0x08e8,
+	NES_IDX_MPP_DEBUG = 0x0a00,
+	NES_IDX_PORT_RX_DISCARDS = 0x0a30,
+	NES_IDX_PORT_TX_DISCARDS = 0x0a34,
+	NES_IDX_MPP_LB_DEBUG = 0x0b00,
+	NES_IDX_DENALI_CTL_22 = 0x1058,
+	NES_IDX_MAC_TX_CONTROL = 0x2000,
+	NES_IDX_MAC_TX_CONFIG = 0x2004,
+	NES_IDX_MAC_TX_PAUSE_QUANTA = 0x2008,
+	NES_IDX_MAC_RX_CONTROL = 0x200c,
+	NES_IDX_MAC_RX_CONFIG = 0x2010,
+	NES_IDX_MAC_EXACT_MATCH_BOTTOM = 0x201c,
+	NES_IDX_MAC_MDIO_CONTROL = 0x2084,
+	NES_IDX_MAC_TX_OCTETS_LOW = 0x2100,
+	NES_IDX_MAC_TX_OCTETS_HIGH = 0x2104,
+	NES_IDX_MAC_TX_FRAMES_LOW = 0x2108,
+	NES_IDX_MAC_TX_FRAMES_HIGH = 0x210c,
+	NES_IDX_MAC_TX_PAUSE_FRAMES = 0x2118,
+	NES_IDX_MAC_TX_ERRORS = 0x2138,
+	NES_IDX_MAC_RX_OCTETS_LOW = 0x213c,
+	NES_IDX_MAC_RX_OCTETS_HIGH = 0x2140,
+	NES_IDX_MAC_RX_FRAMES_LOW = 0x2144,
+	NES_IDX_MAC_RX_FRAMES_HIGH = 0x2148,
+	NES_IDX_MAC_RX_BC_FRAMES_LOW = 0x214c,
+	NES_IDX_MAC_RX_MC_FRAMES_HIGH = 0x2150,
+	NES_IDX_MAC_RX_PAUSE_FRAMES = 0x2154,
+	NES_IDX_MAC_RX_SHORT_FRAMES = 0x2174,
+	NES_IDX_MAC_RX_OVERSIZED_FRAMES = 0x2178,
+	NES_IDX_MAC_RX_JABBER_FRAMES = 0x217c,
+	NES_IDX_MAC_RX_CRC_ERR_FRAMES = 0x2180,
+	NES_IDX_MAC_RX_LENGTH_ERR_FRAMES = 0x2184,
+	NES_IDX_MAC_RX_SYMBOL_ERR_FRAMES = 0x2188,
+	NES_IDX_MAC_INT_STATUS = 0x21f0,
+	NES_IDX_MAC_INT_MASK = 0x21f4,
+	NES_IDX_PHY_PCS_CONTROL_STATUS0 = 0x2800,
+	NES_IDX_PHY_PCS_CONTROL_STATUS1 = 0x2a00,
+	NES_IDX_ETH_SERDES_COMMON_CONTROL0 = 0x2808,
+	NES_IDX_ETH_SERDES_COMMON_CONTROL1 = 0x2a08,
+	NES_IDX_ETH_SERDES_COMMON_STATUS0 = 0x280c,
+	NES_IDX_ETH_SERDES_COMMON_STATUS1 = 0x2a0c,
+	NES_IDX_ETH_SERDES_TX_EMP0 = 0x2810,
+	NES_IDX_ETH_SERDES_TX_EMP1 = 0x2a10,
+	NES_IDX_ETH_SERDES_TX_DRIVE0 = 0x2814,
+	NES_IDX_ETH_SERDES_TX_DRIVE1 = 0x2a14,
+	NES_IDX_ETH_SERDES_RX_MODE0 = 0x2818,
+	NES_IDX_ETH_SERDES_RX_MODE1 = 0x2a18,
+	NES_IDX_ETH_SERDES_RX_SIGDET0 = 0x281c,
+	NES_IDX_ETH_SERDES_RX_SIGDET1 = 0x2a1c,
+	NES_IDX_ETH_SERDES_BYPASS0 = 0x2820,
+	NES_IDX_ETH_SERDES_BYPASS1 = 0x2a20,
+	NES_IDX_ETH_SERDES_LOOPBACK_CONTROL0 = 0x2824,
+	NES_IDX_ETH_SERDES_LOOPBACK_CONTROL1 = 0x2a24,
+	NES_IDX_ETH_SERDES_RX_EQ_CONTROL0 = 0x2828,
+	NES_IDX_ETH_SERDES_RX_EQ_CONTROL1 = 0x2a28,
+	NES_IDX_ETH_SERDES_RX_EQ_STATUS0 = 0x282c,
+	NES_IDX_ETH_SERDES_RX_EQ_STATUS1 = 0x2a2c,
+	NES_IDX_ETH_SERDES_CDR_RESET0 = 0x2830,
+	NES_IDX_ETH_SERDES_CDR_RESET1 = 0x2a30,
+	NES_IDX_ETH_SERDES_CDR_CONTROL0 = 0x2834,
+	NES_IDX_ETH_SERDES_CDR_CONTROL1 = 0x2a34,
+	NES_IDX_ETH_SERDES_TX_HIGHZ_LANE_MODE0 = 0x2838,
+	NES_IDX_ETH_SERDES_TX_HIGHZ_LANE_MODE1 = 0x2a38,
+	NES_IDX_ENDNODE0_NSTAT_RX_DISCARD = 0x3080,
+	NES_IDX_ENDNODE0_NSTAT_RX_OCTETS_LO = 0x3000,
+	NES_IDX_ENDNODE0_NSTAT_RX_OCTETS_HI = 0x3004,
+	NES_IDX_ENDNODE0_NSTAT_RX_FRAMES_LO = 0x3008,
+	NES_IDX_ENDNODE0_NSTAT_RX_FRAMES_HI = 0x300c,
+	NES_IDX_ENDNODE0_NSTAT_TX_OCTETS_LO = 0x7000,
+	NES_IDX_ENDNODE0_NSTAT_TX_OCTETS_HI = 0x7004,
+	NES_IDX_ENDNODE0_NSTAT_TX_FRAMES_LO = 0x7008,
+	NES_IDX_ENDNODE0_NSTAT_TX_FRAMES_HI = 0x700c,
+	NES_IDX_WQM_CONFIG0 = 0x5000,
+	NES_IDX_WQM_CONFIG1 = 0x5004,
+	NES_IDX_CM_CONFIG = 0x5100,
+	NES_IDX_NIC_LOGPORT_TO_PHYPORT = 0x6000,
+	NES_IDX_NIC_PHYPORT_TO_USW = 0x6008,
+	NES_IDX_NIC_ACTIVE = 0x6010,
+	NES_IDX_NIC_UNICAST_ALL = 0x6018,
+	NES_IDX_NIC_MULTICAST_ALL = 0x6020,
+	NES_IDX_NIC_MULTICAST_ENABLE = 0x6028,
+	NES_IDX_NIC_BROADCAST_ON = 0x6030,
+	NES_IDX_USED_CHUNKS_TX = 0x60b0,
+	NES_IDX_TX_POOL_SIZE = 0x60b8,
+	NES_IDX_QUAD_HASH_TABLE_SIZE = 0x6148,
+	NES_IDX_PERFECT_FILTER_LOW = 0x6200,
+	NES_IDX_PERFECT_FILTER_HIGH = 0x6204,
+	NES_IDX_IPV4_TCP_REXMITS = 0x7080,
+	NES_IDX_DEBUG_ERROR_CONTROL_STATUS = 0x913c,
+	NES_IDX_DEBUG_ERROR_MASKS0 = 0x9140,
+	NES_IDX_DEBUG_ERROR_MASKS1 = 0x9144,
+	NES_IDX_DEBUG_ERROR_MASKS2 = 0x9148,
+	NES_IDX_DEBUG_ERROR_MASKS3 = 0x914c,
+	NES_IDX_DEBUG_ERROR_MASKS4 = 0x9150,
+	NES_IDX_DEBUG_ERROR_MASKS5 = 0x9154,
+};
+
+#define NES_IDX_MAC_TX_CONFIG_ENABLE_PAUSE   1
+#define NES_IDX_MPP_DEBUG_PORT_DISABLE_PAUSE (1 << 17)
+
+enum nes_cqp_opcodes {
+	NES_CQP_CREATE_QP = 0x00,
+	NES_CQP_MODIFY_QP = 0x01,
+	NES_CQP_DESTROY_QP = 0x02,
+	NES_CQP_CREATE_CQ = 0x03,
+	NES_CQP_MODIFY_CQ = 0x04,
+	NES_CQP_DESTROY_CQ = 0x05,
+	NES_CQP_ALLOCATE_STAG = 0x09,
+	NES_CQP_REGISTER_STAG = 0x0a,
+	NES_CQP_QUERY_STAG = 0x0b,
+	NES_CQP_REGISTER_SHARED_STAG = 0x0c,
+	NES_CQP_DEALLOCATE_STAG = 0x0d,
+	NES_CQP_MANAGE_ARP_CACHE = 0x0f,
+	NES_CQP_DOWNLOAD_SEGMENT = 0x10,
+	NES_CQP_SUSPEND_QPS = 0x11,
+	NES_CQP_UPLOAD_CONTEXT = 0x13,
+	NES_CQP_CREATE_CEQ = 0x16,
+	NES_CQP_DESTROY_CEQ = 0x18,
+	NES_CQP_CREATE_AEQ = 0x19,
+	NES_CQP_DESTROY_AEQ = 0x1b,
+	NES_CQP_LMI_ACCESS = 0x20,
+	NES_CQP_FLUSH_WQES = 0x22,
+	NES_CQP_MANAGE_APBVT = 0x23,
+	NES_CQP_MANAGE_QUAD_HASH = 0x25
+};
+
+enum nes_cqp_wqe_word_idx {
+	NES_CQP_WQE_OPCODE_IDX = 0,
+	NES_CQP_WQE_ID_IDX = 1,
+	NES_CQP_WQE_COMP_CTX_LOW_IDX = 2,
+	NES_CQP_WQE_COMP_CTX_HIGH_IDX = 3,
+	NES_CQP_WQE_COMP_SCRATCH_LOW_IDX = 4,
+	NES_CQP_WQE_COMP_SCRATCH_HIGH_IDX = 5,
+};
+
+enum nes_cqp_wqe_word_download_idx { /* format differs from other cqp ops */
+	NES_CQP_WQE_DL_OPCODE_IDX = 0,
+	NES_CQP_WQE_DL_COMP_CTX_LOW_IDX = 1,
+	NES_CQP_WQE_DL_COMP_CTX_HIGH_IDX = 2,
+	NES_CQP_WQE_DL_LENGTH_0_TOTAL_IDX = 3
+	/* For index values 4-15 use NES_NIC_SQ_WQE_ values */
+};
+
+enum nes_cqp_cq_wqeword_idx {
+	NES_CQP_CQ_WQE_PBL_LOW_IDX = 6,
+	NES_CQP_CQ_WQE_PBL_HIGH_IDX = 7,
+	NES_CQP_CQ_WQE_CQ_CONTEXT_LOW_IDX = 8,
+	NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX = 9,
+	NES_CQP_CQ_WQE_DOORBELL_INDEX_HIGH_IDX = 10,
+};
+
+enum nes_cqp_stag_wqeword_idx {
+	NES_CQP_STAG_WQE_PBL_BLK_COUNT_IDX = 1,
+	NES_CQP_STAG_WQE_LEN_HIGH_PD_IDX = 6,
+	NES_CQP_STAG_WQE_LEN_LOW_IDX = 7,
+	NES_CQP_STAG_WQE_STAG_IDX = 8,
+	NES_CQP_STAG_WQE_VA_LOW_IDX = 10,
+	NES_CQP_STAG_WQE_VA_HIGH_IDX = 11,
+	NES_CQP_STAG_WQE_PA_LOW_IDX = 12,
+	NES_CQP_STAG_WQE_PA_HIGH_IDX = 13,
+	NES_CQP_STAG_WQE_PBL_LEN_IDX = 14
+};
+
+#define NES_CQP_OP_LOGICAL_PORT_SHIFT 26
+#define NES_CQP_OP_IWARP_STATE_SHIFT 28
+#define NES_CQP_OP_TERMLEN_SHIFT     28
+
+enum nes_cqp_qp_bits {
+	NES_CQP_QP_ARP_VALID = (1<<8),
+	NES_CQP_QP_WINBUF_VALID = (1<<9),
+	NES_CQP_QP_CONTEXT_VALID = (1<<10),
+	NES_CQP_QP_ORD_VALID = (1<<11),
+	NES_CQP_QP_WINBUF_DATAIND_EN = (1<<12),
+	NES_CQP_QP_VIRT_WQS = (1<<13),
+	NES_CQP_QP_DEL_HTE = (1<<14),
+	NES_CQP_QP_CQS_VALID = (1<<15),
+	NES_CQP_QP_TYPE_TSA = 0,
+	NES_CQP_QP_TYPE_IWARP = (1<<16),
+	NES_CQP_QP_TYPE_CQP = (4<<16),
+	NES_CQP_QP_TYPE_NIC = (5<<16),
+	NES_CQP_QP_MSS_CHG = (1<<20),
+	NES_CQP_QP_STATIC_RESOURCES = (1<<21),
+	NES_CQP_QP_IGNORE_MW_BOUND = (1<<22),
+	NES_CQP_QP_VWQ_USE_LMI = (1<<23),
+	NES_CQP_QP_IWARP_STATE_IDLE = (1<<NES_CQP_OP_IWARP_STATE_SHIFT),
+	NES_CQP_QP_IWARP_STATE_RTS = (2<<NES_CQP_OP_IWARP_STATE_SHIFT),
+	NES_CQP_QP_IWARP_STATE_CLOSING = (3<<NES_CQP_OP_IWARP_STATE_SHIFT),
+	NES_CQP_QP_IWARP_STATE_TERMINATE = (5<<NES_CQP_OP_IWARP_STATE_SHIFT),
+	NES_CQP_QP_IWARP_STATE_ERROR = (6<<NES_CQP_OP_IWARP_STATE_SHIFT),
+	NES_CQP_QP_IWARP_STATE_MASK = (7<<NES_CQP_OP_IWARP_STATE_SHIFT),
+	NES_CQP_QP_TERM_DONT_SEND_FIN = (1<<24),
+	NES_CQP_QP_TERM_DONT_SEND_TERM_MSG = (1<<25),
+	NES_CQP_QP_RESET = (1<<31),
+};
+
+enum nes_cqp_qp_wqe_word_idx {
+	NES_CQP_QP_WQE_CONTEXT_LOW_IDX = 6,
+	NES_CQP_QP_WQE_CONTEXT_HIGH_IDX = 7,
+	NES_CQP_QP_WQE_FLUSH_SQ_CODE = 8,
+	NES_CQP_QP_WQE_FLUSH_RQ_CODE = 9,
+	NES_CQP_QP_WQE_NEW_MSS_IDX = 15,
+};
+
+enum nes_nic_ctx_bits {
+	NES_NIC_CTX_RQ_SIZE_32 = (3<<8),
+	NES_NIC_CTX_RQ_SIZE_512 = (3<<8),
+	NES_NIC_CTX_SQ_SIZE_32 = (1<<10),
+	NES_NIC_CTX_SQ_SIZE_512 = (3<<10),
+};
+
+enum nes_nic_qp_ctx_word_idx {
+	NES_NIC_CTX_MISC_IDX = 0,
+	NES_NIC_CTX_SQ_LOW_IDX = 2,
+	NES_NIC_CTX_SQ_HIGH_IDX = 3,
+	NES_NIC_CTX_RQ_LOW_IDX = 4,
+	NES_NIC_CTX_RQ_HIGH_IDX = 5,
+};
+
+enum nes_cqp_cq_bits {
+	NES_CQP_CQ_CEQE_MASK = (1<<9),
+	NES_CQP_CQ_CEQ_VALID = (1<<10),
+	NES_CQP_CQ_RESIZE = (1<<11),
+	NES_CQP_CQ_CHK_OVERFLOW = (1<<12),
+	NES_CQP_CQ_4KB_CHUNK = (1<<14),
+	NES_CQP_CQ_VIRT = (1<<15),
+};
+
+enum nes_cqp_stag_bits {
+	NES_CQP_STAG_VA_TO = (1<<9),
+	NES_CQP_STAG_DEALLOC_PBLS = (1<<10),
+	NES_CQP_STAG_PBL_BLK_SIZE = (1<<11),
+	NES_CQP_STAG_MR = (1<<13),
+	NES_CQP_STAG_RIGHTS_LOCAL_READ = (1<<16),
+	NES_CQP_STAG_RIGHTS_LOCAL_WRITE = (1<<17),
+	NES_CQP_STAG_RIGHTS_REMOTE_READ = (1<<18),
+	NES_CQP_STAG_RIGHTS_REMOTE_WRITE = (1<<19),
+	NES_CQP_STAG_RIGHTS_WINDOW_BIND = (1<<20),
+	NES_CQP_STAG_REM_ACC_EN = (1<<21),
+	NES_CQP_STAG_LEAVE_PENDING = (1<<31),
+};
+
+enum nes_cqp_ceq_wqeword_idx {
+	NES_CQP_CEQ_WQE_ELEMENT_COUNT_IDX = 1,
+	NES_CQP_CEQ_WQE_PBL_LOW_IDX = 6,
+	NES_CQP_CEQ_WQE_PBL_HIGH_IDX = 7,
+};
+
+enum nes_cqp_ceq_bits {
+	NES_CQP_CEQ_4KB_CHUNK = (1<<14),
+	NES_CQP_CEQ_VIRT = (1<<15),
+};
+
+enum nes_cqp_aeq_wqeword_idx {
+	NES_CQP_AEQ_WQE_ELEMENT_COUNT_IDX = 1,
+	NES_CQP_AEQ_WQE_PBL_LOW_IDX = 6,
+	NES_CQP_AEQ_WQE_PBL_HIGH_IDX = 7,
+};
+
+enum nes_cqp_aeq_bits {
+	NES_CQP_AEQ_4KB_CHUNK = (1<<14),
+	NES_CQP_AEQ_VIRT = (1<<15),
+};
+
+enum nes_cqp_lmi_wqeword_idx {
+	NES_CQP_LMI_WQE_LMI_OFFSET_IDX = 1,
+	NES_CQP_LMI_WQE_FRAG_LOW_IDX = 8,
+	NES_CQP_LMI_WQE_FRAG_HIGH_IDX = 9,
+	NES_CQP_LMI_WQE_FRAG_LEN_IDX = 10,
+};
+
+enum nes_cqp_arp_wqeword_idx {
+	NES_CQP_ARP_WQE_MAC_ADDR_LOW_IDX = 6,
+	NES_CQP_ARP_WQE_MAC_HIGH_IDX = 7,
+	NES_CQP_ARP_WQE_REACHABILITY_MAX_IDX = 1,
+};
+
+enum nes_cqp_upload_wqeword_idx {
+	NES_CQP_UPLOAD_WQE_CTXT_LOW_IDX = 6,
+	NES_CQP_UPLOAD_WQE_CTXT_HIGH_IDX = 7,
+	NES_CQP_UPLOAD_WQE_HTE_IDX = 8,
+};
+
+enum nes_cqp_arp_bits {
+	NES_CQP_ARP_VALID = (1<<8),
+	NES_CQP_ARP_PERM = (1<<9),
+};
+
+enum nes_cqp_flush_bits {
+	NES_CQP_FLUSH_SQ = (1<<30),
+	NES_CQP_FLUSH_RQ = (1<<31),
+	NES_CQP_FLUSH_MAJ_MIN = (1<<28),
+};
+
+enum nes_cqe_opcode_bits {
+	NES_CQE_STAG_VALID = (1<<6),
+	NES_CQE_ERROR = (1<<7),
+	NES_CQE_SQ = (1<<8),
+	NES_CQE_SE = (1<<9),
+	NES_CQE_PSH = (1<<29),
+	NES_CQE_FIN = (1<<30),
+	NES_CQE_VALID = (1<<31),
+};
+
+
+enum nes_cqe_word_idx {
+	NES_CQE_PAYLOAD_LENGTH_IDX = 0,
+	NES_CQE_COMP_COMP_CTX_LOW_IDX = 2,
+	NES_CQE_COMP_COMP_CTX_HIGH_IDX = 3,
+	NES_CQE_INV_STAG_IDX = 4,
+	NES_CQE_QP_ID_IDX = 5,
+	NES_CQE_ERROR_CODE_IDX = 6,
+	NES_CQE_OPCODE_IDX = 7,
+};
+
+enum nes_ceqe_word_idx {
+	NES_CEQE_CQ_CTX_LOW_IDX = 0,
+	NES_CEQE_CQ_CTX_HIGH_IDX = 1,
+};
+
+enum nes_ceqe_status_bit {
+	NES_CEQE_VALID = (1<<31),
+};
+
+enum nes_int_bits {
+	NES_INT_CEQ0 = (1<<0),
+	NES_INT_CEQ1 = (1<<1),
+	NES_INT_CEQ2 = (1<<2),
+	NES_INT_CEQ3 = (1<<3),
+	NES_INT_CEQ4 = (1<<4),
+	NES_INT_CEQ5 = (1<<5),
+	NES_INT_CEQ6 = (1<<6),
+	NES_INT_CEQ7 = (1<<7),
+	NES_INT_CEQ8 = (1<<8),
+	NES_INT_CEQ9 = (1<<9),
+	NES_INT_CEQ10 = (1<<10),
+	NES_INT_CEQ11 = (1<<11),
+	NES_INT_CEQ12 = (1<<12),
+	NES_INT_CEQ13 = (1<<13),
+	NES_INT_CEQ14 = (1<<14),
+	NES_INT_CEQ15 = (1<<15),
+	NES_INT_AEQ0 = (1<<16),
+	NES_INT_AEQ1 = (1<<17),
+	NES_INT_AEQ2 = (1<<18),
+	NES_INT_AEQ3 = (1<<19),
+	NES_INT_AEQ4 = (1<<20),
+	NES_INT_AEQ5 = (1<<21),
+	NES_INT_AEQ6 = (1<<22),
+	NES_INT_AEQ7 = (1<<23),
+	NES_INT_MAC0 = (1<<24),
+	NES_INT_MAC1 = (1<<25),
+	NES_INT_MAC2 = (1<<26),
+	NES_INT_MAC3 = (1<<27),
+	NES_INT_TSW = (1<<28),
+	NES_INT_TIMER = (1<<29),
+	NES_INT_INTF = (1<<30),
+};
+
+enum nes_intf_int_bits {
+	NES_INTF_INT_PCIERR = (1<<0),
+	NES_INTF_PERIODIC_TIMER = (1<<2),
+	NES_INTF_ONE_SHOT_TIMER = (1<<3),
+	NES_INTF_INT_CRITERR = (1<<14),
+	NES_INTF_INT_AEQ0_OFLOW = (1<<16),
+	NES_INTF_INT_AEQ1_OFLOW = (1<<17),
+	NES_INTF_INT_AEQ2_OFLOW = (1<<18),
+	NES_INTF_INT_AEQ3_OFLOW = (1<<19),
+	NES_INTF_INT_AEQ4_OFLOW = (1<<20),
+	NES_INTF_INT_AEQ5_OFLOW = (1<<21),
+	NES_INTF_INT_AEQ6_OFLOW = (1<<22),
+	NES_INTF_INT_AEQ7_OFLOW = (1<<23),
+	NES_INTF_INT_AEQ_OFLOW = (0xff<<16),
+};
+
+enum nes_mac_int_bits {
+	NES_MAC_INT_LINK_STAT_CHG = (1<<1),
+	NES_MAC_INT_XGMII_EXT = (1<<2),
+	NES_MAC_INT_TX_UNDERFLOW = (1<<6),
+	NES_MAC_INT_TX_ERROR = (1<<7),
+};
+
+enum nes_cqe_allocate_bits {
+	NES_CQE_ALLOC_INC_SELECT = (1<<28),
+	NES_CQE_ALLOC_NOTIFY_NEXT = (1<<29),
+	NES_CQE_ALLOC_NOTIFY_SE = (1<<30),
+	NES_CQE_ALLOC_RESET = (1<<31),
+};
+
+enum nes_nic_rq_wqe_word_idx {
+	NES_NIC_RQ_WQE_LENGTH_1_0_IDX = 0,
+	NES_NIC_RQ_WQE_LENGTH_3_2_IDX = 1,
+	NES_NIC_RQ_WQE_FRAG0_LOW_IDX = 2,
+	NES_NIC_RQ_WQE_FRAG0_HIGH_IDX = 3,
+	NES_NIC_RQ_WQE_FRAG1_LOW_IDX = 4,
+	NES_NIC_RQ_WQE_FRAG1_HIGH_IDX = 5,
+	NES_NIC_RQ_WQE_FRAG2_LOW_IDX = 6,
+	NES_NIC_RQ_WQE_FRAG2_HIGH_IDX = 7,
+	NES_NIC_RQ_WQE_FRAG3_LOW_IDX = 8,
+	NES_NIC_RQ_WQE_FRAG3_HIGH_IDX = 9,
+};
+
+enum nes_nic_sq_wqe_word_idx {
+	NES_NIC_SQ_WQE_MISC_IDX = 0,
+	NES_NIC_SQ_WQE_TOTAL_LENGTH_IDX = 1,
+	NES_NIC_SQ_WQE_LSO_INFO_IDX = 2,
+	NES_NIC_SQ_WQE_LENGTH_0_TAG_IDX = 3,
+	NES_NIC_SQ_WQE_LENGTH_2_1_IDX = 4,
+	NES_NIC_SQ_WQE_LENGTH_4_3_IDX = 5,
+	NES_NIC_SQ_WQE_FRAG0_LOW_IDX = 6,
+	NES_NIC_SQ_WQE_FRAG0_HIGH_IDX = 7,
+	NES_NIC_SQ_WQE_FRAG1_LOW_IDX = 8,
+	NES_NIC_SQ_WQE_FRAG1_HIGH_IDX = 9,
+	NES_NIC_SQ_WQE_FRAG2_LOW_IDX = 10,
+	NES_NIC_SQ_WQE_FRAG2_HIGH_IDX = 11,
+	NES_NIC_SQ_WQE_FRAG3_LOW_IDX = 12,
+	NES_NIC_SQ_WQE_FRAG3_HIGH_IDX = 13,
+	NES_NIC_SQ_WQE_FRAG4_LOW_IDX = 14,
+	NES_NIC_SQ_WQE_FRAG4_HIGH_IDX = 15,
+};
+
+enum nes_iwarp_sq_wqe_word_idx {
+	NES_IWARP_SQ_WQE_MISC_IDX = 0,
+	NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX = 1,
+	NES_IWARP_SQ_WQE_COMP_CTX_LOW_IDX = 2,
+	NES_IWARP_SQ_WQE_COMP_CTX_HIGH_IDX = 3,
+	NES_IWARP_SQ_WQE_COMP_SCRATCH_LOW_IDX = 4,
+	NES_IWARP_SQ_WQE_COMP_SCRATCH_HIGH_IDX = 5,
+	NES_IWARP_SQ_WQE_INV_STAG_LOW_IDX = 7,
+	NES_IWARP_SQ_WQE_RDMA_TO_LOW_IDX = 8,
+	NES_IWARP_SQ_WQE_RDMA_TO_HIGH_IDX = 9,
+	NES_IWARP_SQ_WQE_RDMA_LENGTH_IDX = 10,
+	NES_IWARP_SQ_WQE_RDMA_STAG_IDX = 11,
+	NES_IWARP_SQ_WQE_IMM_DATA_START_IDX = 12,
+	NES_IWARP_SQ_WQE_FRAG0_LOW_IDX = 16,
+	NES_IWARP_SQ_WQE_FRAG0_HIGH_IDX = 17,
+	NES_IWARP_SQ_WQE_LENGTH0_IDX = 18,
+	NES_IWARP_SQ_WQE_STAG0_IDX = 19,
+	NES_IWARP_SQ_WQE_FRAG1_LOW_IDX = 20,
+	NES_IWARP_SQ_WQE_FRAG1_HIGH_IDX = 21,
+	NES_IWARP_SQ_WQE_LENGTH1_IDX = 22,
+	NES_IWARP_SQ_WQE_STAG1_IDX = 23,
+	NES_IWARP_SQ_WQE_FRAG2_LOW_IDX = 24,
+	NES_IWARP_SQ_WQE_FRAG2_HIGH_IDX = 25,
+	NES_IWARP_SQ_WQE_LENGTH2_IDX = 26,
+	NES_IWARP_SQ_WQE_STAG2_IDX = 27,
+	NES_IWARP_SQ_WQE_FRAG3_LOW_IDX = 28,
+	NES_IWARP_SQ_WQE_FRAG3_HIGH_IDX = 29,
+	NES_IWARP_SQ_WQE_LENGTH3_IDX = 30,
+	NES_IWARP_SQ_WQE_STAG3_IDX = 31,
+};
+
+enum nes_iwarp_sq_bind_wqe_word_idx {
+	NES_IWARP_SQ_BIND_WQE_MR_IDX = 6,
+	NES_IWARP_SQ_BIND_WQE_MW_IDX = 7,
+	NES_IWARP_SQ_BIND_WQE_LENGTH_LOW_IDX = 8,
+	NES_IWARP_SQ_BIND_WQE_LENGTH_HIGH_IDX = 9,
+	NES_IWARP_SQ_BIND_WQE_VA_FBO_LOW_IDX = 10,
+	NES_IWARP_SQ_BIND_WQE_VA_FBO_HIGH_IDX = 11,
+};
+
+enum nes_iwarp_sq_fmr_wqe_word_idx {
+	NES_IWARP_SQ_FMR_WQE_MR_STAG_IDX = 7,
+	NES_IWARP_SQ_FMR_WQE_LENGTH_LOW_IDX = 8,
+	NES_IWARP_SQ_FMR_WQE_LENGTH_HIGH_IDX = 9,
+	NES_IWARP_SQ_FMR_WQE_VA_FBO_LOW_IDX = 10,
+	NES_IWARP_SQ_FMR_WQE_VA_FBO_HIGH_IDX = 11,
+	NES_IWARP_SQ_FMR_WQE_PBL_ADDR_LOW_IDX = 12,
+	NES_IWARP_SQ_FMR_WQE_PBL_ADDR_HIGH_IDX = 13,
+	NES_IWARP_SQ_FMR_WQE_PBL_LENGTH_IDX = 14,
+};
+
+enum nes_iwarp_sq_fmr_opcodes {
+	NES_IWARP_SQ_FMR_WQE_ZERO_BASED			= (1<<6),
+	NES_IWARP_SQ_FMR_WQE_PAGE_SIZE_4K		= (0<<7),
+	NES_IWARP_SQ_FMR_WQE_PAGE_SIZE_2M		= (1<<7),
+	NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_LOCAL_READ	= (1<<16),
+	NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_LOCAL_WRITE 	= (1<<17),
+	NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_REMOTE_READ 	= (1<<18),
+	NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_REMOTE_WRITE = (1<<19),
+	NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_WINDOW_BIND 	= (1<<20),
+};
+
+#define NES_IWARP_SQ_FMR_WQE_MR_LENGTH_HIGH_MASK	0xFF;
+
+enum nes_iwarp_sq_locinv_wqe_word_idx {
+	NES_IWARP_SQ_LOCINV_WQE_INV_STAG_IDX = 6,
+};
+
+enum nes_iwarp_rq_wqe_word_idx {
+	NES_IWARP_RQ_WQE_TOTAL_PAYLOAD_IDX = 1,
+	NES_IWARP_RQ_WQE_COMP_CTX_LOW_IDX = 2,
+	NES_IWARP_RQ_WQE_COMP_CTX_HIGH_IDX = 3,
+	NES_IWARP_RQ_WQE_COMP_SCRATCH_LOW_IDX = 4,
+	NES_IWARP_RQ_WQE_COMP_SCRATCH_HIGH_IDX = 5,
+	NES_IWARP_RQ_WQE_FRAG0_LOW_IDX = 8,
+	NES_IWARP_RQ_WQE_FRAG0_HIGH_IDX = 9,
+	NES_IWARP_RQ_WQE_LENGTH0_IDX = 10,
+	NES_IWARP_RQ_WQE_STAG0_IDX = 11,
+	NES_IWARP_RQ_WQE_FRAG1_LOW_IDX = 12,
+	NES_IWARP_RQ_WQE_FRAG1_HIGH_IDX = 13,
+	NES_IWARP_RQ_WQE_LENGTH1_IDX = 14,
+	NES_IWARP_RQ_WQE_STAG1_IDX = 15,
+	NES_IWARP_RQ_WQE_FRAG2_LOW_IDX = 16,
+	NES_IWARP_RQ_WQE_FRAG2_HIGH_IDX = 17,
+	NES_IWARP_RQ_WQE_LENGTH2_IDX = 18,
+	NES_IWARP_RQ_WQE_STAG2_IDX = 19,
+	NES_IWARP_RQ_WQE_FRAG3_LOW_IDX = 20,
+	NES_IWARP_RQ_WQE_FRAG3_HIGH_IDX = 21,
+	NES_IWARP_RQ_WQE_LENGTH3_IDX = 22,
+	NES_IWARP_RQ_WQE_STAG3_IDX = 23,
+};
+
+enum nes_nic_sq_wqe_bits {
+	NES_NIC_SQ_WQE_PHDR_CS_READY =  (1<<21),
+	NES_NIC_SQ_WQE_LSO_ENABLE = (1<<22),
+	NES_NIC_SQ_WQE_TAGVALUE_ENABLE = (1<<23),
+	NES_NIC_SQ_WQE_DISABLE_CHKSUM = (1<<30),
+	NES_NIC_SQ_WQE_COMPLETION = (1<<31),
+};
+
+enum nes_nic_cqe_word_idx {
+	NES_NIC_CQE_ACCQP_ID_IDX = 0,
+	NES_NIC_CQE_HASH_RCVNXT = 1,
+	NES_NIC_CQE_TAG_PKT_TYPE_IDX = 2,
+	NES_NIC_CQE_MISC_IDX = 3,
+};
+
+#define NES_PKT_TYPE_APBVT_BITS 0xC112
+#define NES_PKT_TYPE_APBVT_MASK 0xff3e
+
+#define NES_PKT_TYPE_PVALID_BITS 0x10000000
+#define NES_PKT_TYPE_PVALID_MASK 0x30000000
+
+#define NES_PKT_TYPE_TCPV4_BITS 0x0110
+#define NES_PKT_TYPE_TCPV4_MASK 0x3f30
+
+#define NES_PKT_TYPE_UDPV4_BITS 0x0210
+#define NES_PKT_TYPE_UDPV4_MASK 0x3f30
+
+#define NES_PKT_TYPE_IPV4_BITS  0x0010
+#define NES_PKT_TYPE_IPV4_MASK  0x3f30
+
+#define NES_PKT_TYPE_OTHER_BITS 0x0000
+#define NES_PKT_TYPE_OTHER_MASK 0x0030
+
+#define NES_NIC_CQE_ERRV_SHIFT 16
+enum nes_nic_ev_bits {
+	NES_NIC_ERRV_BITS_MODE = (1<<0),
+	NES_NIC_ERRV_BITS_IPV4_CSUM_ERR = (1<<1),
+	NES_NIC_ERRV_BITS_TCPUDP_CSUM_ERR = (1<<2),
+	NES_NIC_ERRV_BITS_WQE_OVERRUN = (1<<3),
+	NES_NIC_ERRV_BITS_IPH_ERR = (1<<4),
+};
+
+enum nes_nic_cqe_bits {
+	NES_NIC_CQE_ERRV_MASK = (0xff<<NES_NIC_CQE_ERRV_SHIFT),
+	NES_NIC_CQE_SQ = (1<<24),
+	NES_NIC_CQE_ACCQP_PORT = (1<<28),
+	NES_NIC_CQE_ACCQP_VALID = (1<<29),
+	NES_NIC_CQE_TAG_VALID = (1<<30),
+	NES_NIC_CQE_VALID = (1<<31),
+};
+
+enum nes_aeqe_word_idx {
+	NES_AEQE_COMP_CTXT_LOW_IDX = 0,
+	NES_AEQE_COMP_CTXT_HIGH_IDX = 1,
+	NES_AEQE_COMP_QP_CQ_ID_IDX = 2,
+	NES_AEQE_MISC_IDX = 3,
+};
+
+enum nes_aeqe_bits {
+	NES_AEQE_QP = (1<<16),
+	NES_AEQE_CQ = (1<<17),
+	NES_AEQE_SQ = (1<<18),
+	NES_AEQE_INBOUND_RDMA = (1<<19),
+	NES_AEQE_IWARP_STATE_MASK = (7<<20),
+	NES_AEQE_TCP_STATE_MASK = (0xf<<24),
+	NES_AEQE_Q2_DATA_WRITTEN = (0x3<<28),
+	NES_AEQE_VALID = (1<<31),
+};
+
+#define NES_AEQE_IWARP_STATE_SHIFT	20
+#define NES_AEQE_TCP_STATE_SHIFT	24
+#define NES_AEQE_Q2_DATA_ETHERNET       (1<<28)
+#define NES_AEQE_Q2_DATA_MPA            (1<<29)
+
+enum nes_aeqe_iwarp_state {
+	NES_AEQE_IWARP_STATE_NON_EXISTANT = 0,
+	NES_AEQE_IWARP_STATE_IDLE = 1,
+	NES_AEQE_IWARP_STATE_RTS = 2,
+	NES_AEQE_IWARP_STATE_CLOSING = 3,
+	NES_AEQE_IWARP_STATE_TERMINATE = 5,
+	NES_AEQE_IWARP_STATE_ERROR = 6
+};
+
+enum nes_aeqe_tcp_state {
+	NES_AEQE_TCP_STATE_NON_EXISTANT = 0,
+	NES_AEQE_TCP_STATE_CLOSED = 1,
+	NES_AEQE_TCP_STATE_LISTEN = 2,
+	NES_AEQE_TCP_STATE_SYN_SENT = 3,
+	NES_AEQE_TCP_STATE_SYN_RCVD = 4,
+	NES_AEQE_TCP_STATE_ESTABLISHED = 5,
+	NES_AEQE_TCP_STATE_CLOSE_WAIT = 6,
+	NES_AEQE_TCP_STATE_FIN_WAIT_1 = 7,
+	NES_AEQE_TCP_STATE_CLOSING = 8,
+	NES_AEQE_TCP_STATE_LAST_ACK = 9,
+	NES_AEQE_TCP_STATE_FIN_WAIT_2 = 10,
+	NES_AEQE_TCP_STATE_TIME_WAIT = 11
+};
+
+enum nes_aeqe_aeid {
+	NES_AEQE_AEID_AMP_UNALLOCATED_STAG                            = 0x0102,
+	NES_AEQE_AEID_AMP_INVALID_STAG                                = 0x0103,
+	NES_AEQE_AEID_AMP_BAD_QP                                      = 0x0104,
+	NES_AEQE_AEID_AMP_BAD_PD                                      = 0x0105,
+	NES_AEQE_AEID_AMP_BAD_STAG_KEY                                = 0x0106,
+	NES_AEQE_AEID_AMP_BAD_STAG_INDEX                              = 0x0107,
+	NES_AEQE_AEID_AMP_BOUNDS_VIOLATION                            = 0x0108,
+	NES_AEQE_AEID_AMP_RIGHTS_VIOLATION                            = 0x0109,
+	NES_AEQE_AEID_AMP_TO_WRAP                                     = 0x010a,
+	NES_AEQE_AEID_AMP_FASTREG_SHARED                              = 0x010b,
+	NES_AEQE_AEID_AMP_FASTREG_VALID_STAG                          = 0x010c,
+	NES_AEQE_AEID_AMP_FASTREG_MW_STAG                             = 0x010d,
+	NES_AEQE_AEID_AMP_FASTREG_INVALID_RIGHTS                      = 0x010e,
+	NES_AEQE_AEID_AMP_FASTREG_PBL_TABLE_OVERFLOW                  = 0x010f,
+	NES_AEQE_AEID_AMP_FASTREG_INVALID_LENGTH                      = 0x0110,
+	NES_AEQE_AEID_AMP_INVALIDATE_SHARED                           = 0x0111,
+	NES_AEQE_AEID_AMP_INVALIDATE_NO_REMOTE_ACCESS_RIGHTS          = 0x0112,
+	NES_AEQE_AEID_AMP_INVALIDATE_MR_WITH_BOUND_WINDOWS            = 0x0113,
+	NES_AEQE_AEID_AMP_MWBIND_VALID_STAG                           = 0x0114,
+	NES_AEQE_AEID_AMP_MWBIND_OF_MR_STAG                           = 0x0115,
+	NES_AEQE_AEID_AMP_MWBIND_TO_ZERO_BASED_STAG                   = 0x0116,
+	NES_AEQE_AEID_AMP_MWBIND_TO_MW_STAG                           = 0x0117,
+	NES_AEQE_AEID_AMP_MWBIND_INVALID_RIGHTS                       = 0x0118,
+	NES_AEQE_AEID_AMP_MWBIND_INVALID_BOUNDS                       = 0x0119,
+	NES_AEQE_AEID_AMP_MWBIND_TO_INVALID_PARENT                    = 0x011a,
+	NES_AEQE_AEID_AMP_MWBIND_BIND_DISABLED                        = 0x011b,
+	NES_AEQE_AEID_BAD_CLOSE                                       = 0x0201,
+	NES_AEQE_AEID_RDMAP_ROE_BAD_LLP_CLOSE                         = 0x0202,
+	NES_AEQE_AEID_CQ_OPERATION_ERROR                              = 0x0203,
+	NES_AEQE_AEID_PRIV_OPERATION_DENIED                           = 0x0204,
+	NES_AEQE_AEID_RDMA_READ_WHILE_ORD_ZERO                        = 0x0205,
+	NES_AEQE_AEID_STAG_ZERO_INVALID                               = 0x0206,
+	NES_AEQE_AEID_DDP_INVALID_MSN_GAP_IN_MSN                      = 0x0301,
+	NES_AEQE_AEID_DDP_INVALID_MSN_RANGE_IS_NOT_VALID              = 0x0302,
+	NES_AEQE_AEID_DDP_UBE_DDP_MESSAGE_TOO_LONG_FOR_AVAILABLE_BUFFER = 0x0303,
+	NES_AEQE_AEID_DDP_UBE_INVALID_DDP_VERSION                     = 0x0304,
+	NES_AEQE_AEID_DDP_UBE_INVALID_MO                              = 0x0305,
+	NES_AEQE_AEID_DDP_UBE_INVALID_MSN_NO_BUFFER_AVAILABLE         = 0x0306,
+	NES_AEQE_AEID_DDP_UBE_INVALID_QN                              = 0x0307,
+	NES_AEQE_AEID_DDP_NO_L_BIT                                    = 0x0308,
+	NES_AEQE_AEID_RDMAP_ROE_INVALID_RDMAP_VERSION                 = 0x0311,
+	NES_AEQE_AEID_RDMAP_ROE_UNEXPECTED_OPCODE                     = 0x0312,
+	NES_AEQE_AEID_ROE_INVALID_RDMA_READ_REQUEST                   = 0x0313,
+	NES_AEQE_AEID_ROE_INVALID_RDMA_WRITE_OR_READ_RESP             = 0x0314,
+	NES_AEQE_AEID_INVALID_ARP_ENTRY                               = 0x0401,
+	NES_AEQE_AEID_INVALID_TCP_OPTION_RCVD                         = 0x0402,
+	NES_AEQE_AEID_STALE_ARP_ENTRY                                 = 0x0403,
+	NES_AEQE_AEID_LLP_CLOSE_COMPLETE                              = 0x0501,
+	NES_AEQE_AEID_LLP_CONNECTION_RESET                            = 0x0502,
+	NES_AEQE_AEID_LLP_FIN_RECEIVED                                = 0x0503,
+	NES_AEQE_AEID_LLP_RECEIVED_MARKER_AND_LENGTH_FIELDS_DONT_MATCH =  0x0504,
+	NES_AEQE_AEID_LLP_RECEIVED_MPA_CRC_ERROR                      = 0x0505,
+	NES_AEQE_AEID_LLP_SEGMENT_TOO_LARGE                           = 0x0506,
+	NES_AEQE_AEID_LLP_SEGMENT_TOO_SMALL                           = 0x0507,
+	NES_AEQE_AEID_LLP_SYN_RECEIVED                                = 0x0508,
+	NES_AEQE_AEID_LLP_TERMINATE_RECEIVED                          = 0x0509,
+	NES_AEQE_AEID_LLP_TOO_MANY_RETRIES                            = 0x050a,
+	NES_AEQE_AEID_LLP_TOO_MANY_KEEPALIVE_RETRIES                  = 0x050b,
+	NES_AEQE_AEID_RESET_SENT                                      = 0x0601,
+	NES_AEQE_AEID_TERMINATE_SENT                                  = 0x0602,
+	NES_AEQE_AEID_DDP_LCE_LOCAL_CATASTROPHIC                      = 0x0700
+};
+
+enum nes_iwarp_sq_opcodes {
+	NES_IWARP_SQ_WQE_WRPDU = (1<<15),
+	NES_IWARP_SQ_WQE_PSH = (1<<21),
+	NES_IWARP_SQ_WQE_STREAMING = (1<<23),
+	NES_IWARP_SQ_WQE_IMM_DATA = (1<<28),
+	NES_IWARP_SQ_WQE_READ_FENCE = (1<<29),
+	NES_IWARP_SQ_WQE_LOCAL_FENCE = (1<<30),
+	NES_IWARP_SQ_WQE_SIGNALED_COMPL = (1<<31),
+};
+
+enum nes_iwarp_sq_wqe_bits {
+	NES_IWARP_SQ_OP_RDMAW = 0,
+	NES_IWARP_SQ_OP_RDMAR = 1,
+	NES_IWARP_SQ_OP_SEND = 3,
+	NES_IWARP_SQ_OP_SENDINV = 4,
+	NES_IWARP_SQ_OP_SENDSE = 5,
+	NES_IWARP_SQ_OP_SENDSEINV = 6,
+	NES_IWARP_SQ_OP_BIND = 8,
+	NES_IWARP_SQ_OP_FAST_REG = 9,
+	NES_IWARP_SQ_OP_LOCINV = 10,
+	NES_IWARP_SQ_OP_RDMAR_LOCINV = 11,
+	NES_IWARP_SQ_OP_NOP = 12,
+};
+
+enum nes_iwarp_cqe_major_code {
+	NES_IWARP_CQE_MAJOR_FLUSH = 1,
+	NES_IWARP_CQE_MAJOR_DRV = 0x8000
+};
+
+enum nes_iwarp_cqe_minor_code {
+	NES_IWARP_CQE_MINOR_FLUSH = 1
+};
+
+#define NES_EEPROM_READ_REQUEST (1<<16)
+#define NES_MAC_ADDR_VALID      (1<<20)
+
+/*
+ * NES index registers init values.
+ */
+struct nes_init_values {
+	u32 index;
+	u32 data;
+	u8  wrt;
+};
+
+/*
+ * NES registers in BAR0.
+ */
+struct nes_pci_regs {
+	u32 int_status;
+	u32 int_mask;
+	u32 int_pending;
+	u32 intf_int_status;
+	u32 intf_int_mask;
+	u32 other_regs[59];	 /* pad out to 256 bytes for now */
+};
+
+#define NES_CQP_SQ_SIZE    128
+#define NES_CCQ_SIZE       128
+#define NES_NIC_WQ_SIZE    512
+#define NES_NIC_CTX_SIZE   ((NES_NIC_CTX_RQ_SIZE_512) | (NES_NIC_CTX_SQ_SIZE_512))
+#define NES_NIC_BACK_STORE 0x00038000
+
+struct nes_device;
+
+struct nes_hw_nic_qp_context {
+	__le32 context_words[6];
+};
+
+struct nes_hw_nic_sq_wqe {
+	__le32 wqe_words[16];
+};
+
+struct nes_hw_nic_rq_wqe {
+	__le32 wqe_words[16];
+};
+
+struct nes_hw_nic_cqe {
+	__le32 cqe_words[4];
+};
+
+struct nes_hw_cqp_qp_context {
+	__le32 context_words[4];
+};
+
+struct nes_hw_cqp_wqe {
+	__le32 wqe_words[16];
+};
+
+struct nes_hw_qp_wqe {
+	__le32 wqe_words[32];
+};
+
+struct nes_hw_cqe {
+	__le32 cqe_words[8];
+};
+
+struct nes_hw_ceqe {
+	__le32 ceqe_words[2];
+};
+
+struct nes_hw_aeqe {
+	__le32 aeqe_words[4];
+};
+
+struct nes_cqp_request {
+	union {
+		u64 cqp_callback_context;
+		void *cqp_callback_pointer;
+	};
+	wait_queue_head_t     waitq;
+	struct nes_hw_cqp_wqe cqp_wqe;
+	struct list_head      list;
+	atomic_t              refcount;
+	void (*cqp_callback)(struct nes_device *nesdev, struct nes_cqp_request *cqp_request);
+	u16                   major_code;
+	u16                   minor_code;
+	u8                    waiting;
+	u8                    request_done;
+	u8                    dynamic;
+	u8                    callback;
+};
+
+struct nes_hw_cqp {
+	struct nes_hw_cqp_wqe *sq_vbase;
+	dma_addr_t            sq_pbase;
+	spinlock_t            lock;
+	wait_queue_head_t     waitq;
+	u16                   qp_id;
+	u16                   sq_head;
+	u16                   sq_tail;
+	u16                   sq_size;
+};
+
+#define NES_FIRST_FRAG_SIZE 128
+struct nes_first_frag {
+	u8 buffer[NES_FIRST_FRAG_SIZE];
+};
+
+struct nes_hw_nic {
+	struct nes_first_frag    *first_frag_vbase;	/* virtual address of first frags */
+	struct nes_hw_nic_sq_wqe *sq_vbase;			/* virtual address of sq */
+	struct nes_hw_nic_rq_wqe *rq_vbase;			/* virtual address of rq */
+	struct sk_buff           *tx_skb[NES_NIC_WQ_SIZE];
+	struct sk_buff           *rx_skb[NES_NIC_WQ_SIZE];
+	dma_addr_t frag_paddr[NES_NIC_WQ_SIZE];
+	unsigned long first_frag_overflow[BITS_TO_LONGS(NES_NIC_WQ_SIZE)];
+	dma_addr_t sq_pbase;			/* PCI memory for host rings */
+	dma_addr_t rq_pbase;			/* PCI memory for host rings */
+
+	u16 qp_id;
+	u16 sq_head;
+	u16 sq_tail;
+	u16 sq_size;
+	u16 rq_head;
+	u16 rq_tail;
+	u16 rq_size;
+	u8 replenishing_rq;
+	u8 reserved;
+
+	spinlock_t rq_lock;
+};
+
+struct nes_hw_nic_cq {
+	struct nes_hw_nic_cqe volatile *cq_vbase;	/* PCI memory for host rings */
+	void (*ce_handler)(struct nes_device *nesdev, struct nes_hw_nic_cq *cq);
+	dma_addr_t cq_pbase;	/* PCI memory for host rings */
+	int rx_cqes_completed;
+	int cqe_allocs_pending;
+	int rx_pkts_indicated;
+	u16 cq_head;
+	u16 cq_size;
+	u16 cq_number;
+	u8  cqes_pending;
+};
+
+struct nes_hw_qp {
+	struct nes_hw_qp_wqe *sq_vbase;		/* PCI memory for host rings */
+	struct nes_hw_qp_wqe *rq_vbase;		/* PCI memory for host rings */
+	void                 *q2_vbase;			/* PCI memory for host rings */
+	dma_addr_t sq_pbase;	/* PCI memory for host rings */
+	dma_addr_t rq_pbase;	/* PCI memory for host rings */
+	dma_addr_t q2_pbase;	/* PCI memory for host rings */
+	u32 qp_id;
+	u16 sq_head;
+	u16 sq_tail;
+	u16 sq_size;
+	u16 rq_head;
+	u16 rq_tail;
+	u16 rq_size;
+	u8  rq_encoded_size;
+	u8  sq_encoded_size;
+};
+
+struct nes_hw_cq {
+	struct nes_hw_cqe *cq_vbase;	/* PCI memory for host rings */
+	void (*ce_handler)(struct nes_device *nesdev, struct nes_hw_cq *cq);
+	dma_addr_t cq_pbase;	/* PCI memory for host rings */
+	u16 cq_head;
+	u16 cq_size;
+	u16 cq_number;
+};
+
+struct nes_hw_ceq {
+	struct nes_hw_ceqe volatile *ceq_vbase;	/* PCI memory for host rings */
+	dma_addr_t ceq_pbase;	/* PCI memory for host rings */
+	u16 ceq_head;
+	u16 ceq_size;
+};
+
+struct nes_hw_aeq {
+	struct nes_hw_aeqe volatile *aeq_vbase;	/* PCI memory for host rings */
+	dma_addr_t aeq_pbase;	/* PCI memory for host rings */
+	u16 aeq_head;
+	u16 aeq_size;
+};
+
+struct nic_qp_map {
+	u8 qpid;
+	u8 nic_index;
+	u8 logical_port;
+	u8 is_hnic;
+};
+
+#define	NES_CQP_ARP_AEQ_INDEX_MASK  0x000f0000
+#define	NES_CQP_ARP_AEQ_INDEX_SHIFT 16
+
+#define NES_CQP_APBVT_ADD			0x00008000
+#define NES_CQP_APBVT_NIC_SHIFT		16
+
+#define NES_ARP_ADD     1
+#define NES_ARP_DELETE  2
+#define NES_ARP_RESOLVE 3
+
+#define NES_MAC_SW_IDLE      0
+#define NES_MAC_SW_INTERRUPT 1
+#define NES_MAC_SW_MH        2
+
+struct nes_arp_entry {
+	u32 ip_addr;
+	u8  mac_addr[ETH_ALEN];
+};
+
+#define NES_NIC_FAST_TIMER          96
+#define NES_NIC_FAST_TIMER_LOW      40
+#define NES_NIC_FAST_TIMER_HIGH     1000
+#define DEFAULT_NES_QL_HIGH         256
+#define DEFAULT_NES_QL_LOW          16
+#define DEFAULT_NES_QL_TARGET       64
+#define DEFAULT_JUMBO_NES_QL_LOW    12
+#define DEFAULT_JUMBO_NES_QL_TARGET 40
+#define DEFAULT_JUMBO_NES_QL_HIGH   128
+#define NES_NIC_CQ_DOWNWARD_TREND   16
+#define NES_PFT_SIZE		    48
+
+#define NES_MGT_WQ_COUNT 32
+#define NES_MGT_CTX_SIZE ((NES_NIC_CTX_RQ_SIZE_32) | (NES_NIC_CTX_SQ_SIZE_32))
+#define NES_MGT_QP_OFFSET 36
+#define NES_MGT_QP_COUNT 4
+
+struct nes_hw_tune_timer {
+    /* u16 cq_count; */
+    u16 threshold_low;
+    u16 threshold_target;
+    u16 threshold_high;
+    u16 timer_in_use;
+    u16 timer_in_use_old;
+    u16 timer_in_use_min;
+    u16 timer_in_use_max;
+    u8  timer_direction_upward;
+    u8  timer_direction_downward;
+    u16 cq_count_old;
+    u8  cq_direction_downward;
+};
+
+#define NES_TIMER_INT_LIMIT         2
+#define NES_TIMER_INT_LIMIT_DYNAMIC 10
+#define NES_TIMER_ENABLE_LIMIT      4
+#define NES_MAX_LINK_INTERRUPTS     128
+#define NES_MAX_LINK_CHECK          200
+#define NES_MAX_LRO_DESCRIPTORS     32
+#define NES_LRO_MAX_AGGR            64
+
+struct nes_adapter {
+	u64              fw_ver;
+	unsigned long    *allocated_qps;
+	unsigned long    *allocated_cqs;
+	unsigned long    *allocated_mrs;
+	unsigned long    *allocated_pds;
+	unsigned long    *allocated_arps;
+	struct nes_qp    **qp_table;
+	struct workqueue_struct *work_q;
+
+	struct list_head list;
+	struct list_head active_listeners;
+	/* list of the netdev's associated with each logical port */
+	struct list_head nesvnic_list[4];
+
+	struct timer_list  mh_timer;
+	struct timer_list  lc_timer;
+	struct work_struct work;
+	spinlock_t         resource_lock;
+	spinlock_t         phy_lock;
+	spinlock_t         pbl_lock;
+	spinlock_t         periodic_timer_lock;
+
+	struct nes_arp_entry arp_table[NES_MAX_ARP_TABLE_SIZE];
+
+	/* Adapter CEQ and AEQs */
+	struct nes_hw_ceq ceq[16];
+	struct nes_hw_aeq aeq[8];
+
+	struct nes_hw_tune_timer tune_timer;
+
+	unsigned long doorbell_start;
+
+	u32 hw_rev;
+	u32 vendor_id;
+	u32 vendor_part_id;
+	u32 device_cap_flags;
+	u32 tick_delta;
+	u32 timer_int_req;
+	u32 arp_table_size;
+	u32 next_arp_index;
+
+	u32 max_mr;
+	u32 max_256pbl;
+	u32 max_4kpbl;
+	u32 free_256pbl;
+	u32 free_4kpbl;
+	u32 max_mr_size;
+	u32 max_qp;
+	u32 next_qp;
+	u32 max_irrq;
+	u32 max_qp_wr;
+	u32 max_sge;
+	u32 max_cq;
+	u32 next_cq;
+	u32 max_cqe;
+	u32 max_pd;
+	u32 base_pd;
+	u32 next_pd;
+	u32 hte_index_mask;
+
+	/* EEPROM information */
+	u32 rx_pool_size;
+	u32 tx_pool_size;
+	u32 rx_threshold;
+	u32 tcp_timer_core_clk_divisor;
+	u32 iwarp_config;
+	u32 cm_config;
+	u32 sws_timer_config;
+	u32 tcp_config1;
+	u32 wqm_wat;
+	u32 core_clock;
+	u32 firmware_version;
+	u32 eeprom_version;
+
+	u32 nic_rx_eth_route_err;
+
+	u32 et_rx_coalesce_usecs;
+	u32 et_rx_max_coalesced_frames;
+	u32 et_rx_coalesce_usecs_irq;
+	u32 et_rx_max_coalesced_frames_irq;
+	u32 et_pkt_rate_low;
+	u32 et_rx_coalesce_usecs_low;
+	u32 et_rx_max_coalesced_frames_low;
+	u32 et_pkt_rate_high;
+	u32 et_rx_coalesce_usecs_high;
+	u32 et_rx_max_coalesced_frames_high;
+	u32 et_rate_sample_interval;
+	u32 timer_int_limit;
+	u32 wqm_quanta;
+	u8 allow_unaligned_fpdus;
+
+	/* Adapter base MAC address */
+	u32 mac_addr_low;
+	u16 mac_addr_high;
+
+	u16 firmware_eeprom_offset;
+	u16 software_eeprom_offset;
+
+	u16 max_irrq_wr;
+
+	/* pd config for each port */
+	u16 pd_config_size[4];
+	u16 pd_config_base[4];
+
+	u16 link_interrupt_count[4];
+	u8 crit_error_count[32];
+
+	/* the phy index for each port */
+	u8  phy_index[4];
+	u8  mac_sw_state[4];
+	u8  mac_link_down[4];
+	u8  phy_type[4];
+	u8  log_port;
+
+	/* PCI information */
+	unsigned int  devfn;
+	unsigned char bus_number;
+	unsigned char OneG_Mode;
+
+	unsigned char ref_count;
+	u8            netdev_count;
+	u8            netdev_max;	/* from host nic address count in EEPROM */
+	u8            port_count;
+	u8            virtwq;
+	u8            send_term_ok;
+	u8            et_use_adaptive_rx_coalesce;
+	u8            adapter_fcn_count;
+	u8 pft_mcast_map[NES_PFT_SIZE];
+};
+
+struct nes_pbl {
+	u64              *pbl_vbase;
+	dma_addr_t       pbl_pbase;
+	struct page      *page;
+	unsigned long    user_base;
+	u32              pbl_size;
+	struct list_head list;
+	/* TODO: need to add list for two level tables */
+};
+
+#define NES_4K_PBL_CHUNK_SIZE	4096
+
+struct nes_fast_mr_wqe_pbl {
+	u64		*kva;
+	dma_addr_t	paddr;
+};
+
+struct nes_ib_fast_reg_page_list {
+	struct ib_fast_reg_page_list	ibfrpl;
+	struct nes_fast_mr_wqe_pbl 	nes_wqe_pbl;
+	u64 				pbl;
+};
+
+struct nes_listener {
+	struct work_struct      work;
+	struct workqueue_struct *wq;
+	struct nes_vnic         *nesvnic;
+	struct iw_cm_id         *cm_id;
+	struct list_head        list;
+	unsigned long           socket;
+	u8                      accept_failed;
+};
+
+struct nes_ib_device;
+
+#define NES_EVENT_DELAY msecs_to_jiffies(100)
+
+struct nes_vnic {
+	struct nes_ib_device *nesibdev;
+	u64 sq_full;
+	u64 tso_requests;
+	u64 segmented_tso_requests;
+	u64 linearized_skbs;
+	u64 tx_sw_dropped;
+	u64 endnode_nstat_rx_discard;
+	u64 endnode_nstat_rx_octets;
+	u64 endnode_nstat_rx_frames;
+	u64 endnode_nstat_tx_octets;
+	u64 endnode_nstat_tx_frames;
+	u64 endnode_ipv4_tcp_retransmits;
+	/* void *mem; */
+	struct nes_device *nesdev;
+	struct net_device *netdev;
+	atomic_t          rx_skbs_needed;
+	atomic_t          rx_skb_timer_running;
+	int               budget;
+	u32               msg_enable;
+	/* u32 tx_avail; */
+	__be32            local_ipaddr;
+	struct napi_struct   napi;
+	spinlock_t           tx_lock;	/* could use netdev tx lock? */
+	struct timer_list    rq_wqes_timer;
+	u32                  nic_mem_size;
+	void                 *nic_vbase;
+	dma_addr_t           nic_pbase;
+	struct nes_hw_nic    nic;
+	struct nes_hw_nic_cq nic_cq;
+	u32    mcrq_qp_id;
+	struct nes_ucontext *mcrq_ucontext;
+	struct nes_cqp_request* (*get_cqp_request)(struct nes_device *nesdev);
+	void (*post_cqp_request)(struct nes_device*, struct nes_cqp_request *);
+	int (*mcrq_mcast_filter)( struct nes_vnic* nesvnic, __u8* dmi_addr );
+	struct net_device_stats netstats;
+	/* used to put the netdev on the adapters logical port list */
+	struct list_head list;
+	u16 max_frame_size;
+	u8  netdev_open;
+	u8  linkup;
+	u8  logical_port;
+	u8  netdev_index;  /* might not be needed, indexes nesdev->netdev */
+	u8  perfect_filter_index;
+	u8  nic_index;
+	u8  qp_nic_index[4];
+	u8  next_qp_nic_index;
+	u8  of_device_registered;
+	u8  rdma_enabled;
+	u32 lro_max_aggr;
+	struct net_lro_mgr lro_mgr;
+	struct net_lro_desc lro_desc[NES_MAX_LRO_DESCRIPTORS];
+	struct timer_list event_timer;
+	enum ib_event_type delayed_event;
+	enum ib_event_type last_dispatched_event;
+	spinlock_t port_ibevent_lock;
+	u32 mgt_mem_size;
+	void *mgt_vbase;
+	dma_addr_t mgt_pbase;
+	struct nes_vnic_mgt *mgtvnic[NES_MGT_QP_COUNT];
+	struct task_struct *mgt_thread;
+	wait_queue_head_t mgt_wait_queue;
+	struct sk_buff_head mgt_skb_list;
+
+};
+
+struct nes_ib_device {
+	struct ib_device ibdev;
+	struct nes_vnic *nesvnic;
+
+	/* Virtual RNIC Limits */
+	u32 max_mr;
+	u32 max_qp;
+	u32 max_cq;
+	u32 max_pd;
+	u32 num_mr;
+	u32 num_qp;
+	u32 num_cq;
+	u32 num_pd;
+};
+
+enum nes_hdrct_flags {
+	DDP_LEN_FLAG                    = 0x80,
+	DDP_HDR_FLAG                    = 0x40,
+	RDMA_HDR_FLAG                   = 0x20
+};
+
+enum nes_term_layers {
+	LAYER_RDMA			= 0,
+	LAYER_DDP			= 1,
+	LAYER_MPA			= 2
+};
+
+enum nes_term_error_types {
+	RDMAP_CATASTROPHIC		= 0,
+	RDMAP_REMOTE_PROT		= 1,
+	RDMAP_REMOTE_OP			= 2,
+	DDP_CATASTROPHIC		= 0,
+	DDP_TAGGED_BUFFER		= 1,
+	DDP_UNTAGGED_BUFFER		= 2,
+	DDP_LLP				= 3
+};
+
+enum nes_term_rdma_errors {
+	RDMAP_INV_STAG			= 0x00,
+	RDMAP_INV_BOUNDS		= 0x01,
+	RDMAP_ACCESS			= 0x02,
+	RDMAP_UNASSOC_STAG		= 0x03,
+	RDMAP_TO_WRAP			= 0x04,
+	RDMAP_INV_RDMAP_VER		= 0x05,
+	RDMAP_UNEXPECTED_OP		= 0x06,
+	RDMAP_CATASTROPHIC_LOCAL	= 0x07,
+	RDMAP_CATASTROPHIC_GLOBAL	= 0x08,
+	RDMAP_CANT_INV_STAG		= 0x09,
+	RDMAP_UNSPECIFIED		= 0xff
+};
+
+enum nes_term_ddp_errors {
+	DDP_CATASTROPHIC_LOCAL		= 0x00,
+	DDP_TAGGED_INV_STAG		= 0x00,
+	DDP_TAGGED_BOUNDS		= 0x01,
+	DDP_TAGGED_UNASSOC_STAG		= 0x02,
+	DDP_TAGGED_TO_WRAP		= 0x03,
+	DDP_TAGGED_INV_DDP_VER		= 0x04,
+	DDP_UNTAGGED_INV_QN		= 0x01,
+	DDP_UNTAGGED_INV_MSN_NO_BUF	= 0x02,
+	DDP_UNTAGGED_INV_MSN_RANGE	= 0x03,
+	DDP_UNTAGGED_INV_MO		= 0x04,
+	DDP_UNTAGGED_INV_TOO_LONG	= 0x05,
+	DDP_UNTAGGED_INV_DDP_VER	= 0x06
+};
+
+enum nes_term_mpa_errors {
+	MPA_CLOSED			= 0x01,
+	MPA_CRC				= 0x02,
+	MPA_MARKER			= 0x03,
+	MPA_REQ_RSP			= 0x04,
+};
+
+struct nes_terminate_hdr {
+	u8 layer_etype;
+	u8 error_code;
+	u8 hdrct;
+	u8 rsvd;
+};
+
+/* Used to determine how to fill in terminate error codes */
+#define IWARP_OPCODE_WRITE		0
+#define IWARP_OPCODE_READREQ		1
+#define IWARP_OPCODE_READRSP		2
+#define IWARP_OPCODE_SEND		3
+#define IWARP_OPCODE_SEND_INV		4
+#define IWARP_OPCODE_SEND_SE		5
+#define IWARP_OPCODE_SEND_SE_INV	6
+#define IWARP_OPCODE_TERM		7
+
+/* These values are used only during terminate processing */
+#define TERM_DDP_LEN_TAGGED	14
+#define TERM_DDP_LEN_UNTAGGED	18
+#define TERM_RDMA_LEN		28
+#define RDMA_OPCODE_MASK	0x0f
+#define RDMA_READ_REQ_OPCODE	1
+#define BAD_FRAME_OFFSET	64
+#define CQE_MAJOR_DRV		0x8000
+
+/* Used for link status recheck after interrupt processing */
+#define NES_LINK_RECHECK_DELAY	msecs_to_jiffies(50)
+#define NES_LINK_RECHECK_MAX	60
+
+#endif		/* __NES_HW_H */
diff --git a/drivers/infiniband/hw/nes/nes_mgt.c b/drivers/infiniband/hw/nes/nes_mgt.c
new file mode 100644
index 0000000..4166452
--- /dev/null
+++ b/drivers/infiniband/hw/nes/nes_mgt.c
@@ -0,0 +1,1160 @@
+/*
+ * Copyright (c) 2006 - 2011 Intel-NE, Inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/skbuff.h>
+#include <linux/etherdevice.h>
+#include <linux/kthread.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <net/tcp.h>
+#include "nes.h"
+#include "nes_mgt.h"
+
+atomic_t pau_qps_created;
+atomic_t pau_qps_destroyed;
+
+static void nes_replenish_mgt_rq(struct nes_vnic_mgt *mgtvnic)
+{
+	unsigned long flags;
+	dma_addr_t bus_address;
+	struct sk_buff *skb;
+	struct nes_hw_nic_rq_wqe *nic_rqe;
+	struct nes_hw_mgt *nesmgt;
+	struct nes_device *nesdev;
+	struct nes_rskb_cb *cb;
+	u32 rx_wqes_posted = 0;
+
+	nesmgt = &mgtvnic->mgt;
+	nesdev = mgtvnic->nesvnic->nesdev;
+	spin_lock_irqsave(&nesmgt->rq_lock, flags);
+	if (nesmgt->replenishing_rq != 0) {
+		if (((nesmgt->rq_size - 1) == atomic_read(&mgtvnic->rx_skbs_needed)) &&
+		    (atomic_read(&mgtvnic->rx_skb_timer_running) == 0)) {
+			atomic_set(&mgtvnic->rx_skb_timer_running, 1);
+			spin_unlock_irqrestore(&nesmgt->rq_lock, flags);
+			mgtvnic->rq_wqes_timer.expires = jiffies + (HZ / 2);      /* 1/2 second */
+			add_timer(&mgtvnic->rq_wqes_timer);
+		} else {
+			spin_unlock_irqrestore(&nesmgt->rq_lock, flags);
+		}
+		return;
+	}
+	nesmgt->replenishing_rq = 1;
+	spin_unlock_irqrestore(&nesmgt->rq_lock, flags);
+	do {
+		skb = dev_alloc_skb(mgtvnic->nesvnic->max_frame_size);
+		if (skb) {
+			skb->dev = mgtvnic->nesvnic->netdev;
+
+			bus_address = pci_map_single(nesdev->pcidev,
+						     skb->data, mgtvnic->nesvnic->max_frame_size, PCI_DMA_FROMDEVICE);
+			cb = (struct nes_rskb_cb *)&skb->cb[0];
+			cb->busaddr = bus_address;
+			cb->maplen = mgtvnic->nesvnic->max_frame_size;
+
+			nic_rqe = &nesmgt->rq_vbase[mgtvnic->mgt.rq_head];
+			nic_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_1_0_IDX] =
+				cpu_to_le32(mgtvnic->nesvnic->max_frame_size);
+			nic_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_3_2_IDX] = 0;
+			nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_LOW_IDX] =
+				cpu_to_le32((u32)bus_address);
+			nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_HIGH_IDX] =
+				cpu_to_le32((u32)((u64)bus_address >> 32));
+			nesmgt->rx_skb[nesmgt->rq_head] = skb;
+			nesmgt->rq_head++;
+			nesmgt->rq_head &= nesmgt->rq_size - 1;
+			atomic_dec(&mgtvnic->rx_skbs_needed);
+			barrier();
+			if (++rx_wqes_posted == 255) {
+				nes_write32(nesdev->regs + NES_WQE_ALLOC, (rx_wqes_posted << 24) | nesmgt->qp_id);
+				rx_wqes_posted = 0;
+			}
+		} else {
+			spin_lock_irqsave(&nesmgt->rq_lock, flags);
+			if (((nesmgt->rq_size - 1) == atomic_read(&mgtvnic->rx_skbs_needed)) &&
+			    (atomic_read(&mgtvnic->rx_skb_timer_running) == 0)) {
+				atomic_set(&mgtvnic->rx_skb_timer_running, 1);
+				spin_unlock_irqrestore(&nesmgt->rq_lock, flags);
+				mgtvnic->rq_wqes_timer.expires = jiffies + (HZ / 2);      /* 1/2 second */
+				add_timer(&mgtvnic->rq_wqes_timer);
+			} else {
+				spin_unlock_irqrestore(&nesmgt->rq_lock, flags);
+			}
+			break;
+		}
+	} while (atomic_read(&mgtvnic->rx_skbs_needed));
+	barrier();
+	if (rx_wqes_posted)
+		nes_write32(nesdev->regs + NES_WQE_ALLOC, (rx_wqes_posted << 24) | nesmgt->qp_id);
+	nesmgt->replenishing_rq = 0;
+}
+
+/**
+ * nes_mgt_rq_wqes_timeout
+ */
+static void nes_mgt_rq_wqes_timeout(unsigned long parm)
+{
+	struct nes_vnic_mgt *mgtvnic = (struct nes_vnic_mgt *)parm;
+
+	atomic_set(&mgtvnic->rx_skb_timer_running, 0);
+	if (atomic_read(&mgtvnic->rx_skbs_needed))
+		nes_replenish_mgt_rq(mgtvnic);
+}
+
+/**
+ * nes_mgt_free_skb - unmap and free skb
+ */
+static void nes_mgt_free_skb(struct nes_device *nesdev, struct sk_buff *skb, u32 dir)
+{
+	struct nes_rskb_cb *cb;
+
+	cb = (struct nes_rskb_cb *)&skb->cb[0];
+	pci_unmap_single(nesdev->pcidev, cb->busaddr, cb->maplen, dir);
+	cb->busaddr = 0;
+	dev_kfree_skb_any(skb);
+}
+
+/**
+ * nes_download_callback - handle download completions
+ */
+static void nes_download_callback(struct nes_device *nesdev, struct nes_cqp_request *cqp_request)
+{
+	struct pau_fpdu_info *fpdu_info = cqp_request->cqp_callback_pointer;
+	struct nes_qp *nesqp = fpdu_info->nesqp;
+	struct sk_buff *skb;
+	int i;
+
+	for (i = 0; i < fpdu_info->frag_cnt; i++) {
+		skb = fpdu_info->frags[i].skb;
+		if (fpdu_info->frags[i].cmplt) {
+			nes_mgt_free_skb(nesdev, skb, PCI_DMA_TODEVICE);
+			nes_rem_ref_cm_node(nesqp->cm_node);
+		}
+	}
+
+	if (fpdu_info->hdr_vbase)
+		pci_free_consistent(nesdev->pcidev, fpdu_info->hdr_len,
+				    fpdu_info->hdr_vbase, fpdu_info->hdr_pbase);
+	kfree(fpdu_info);
+}
+
+/**
+ * nes_get_seq - Get the seq, ack_seq and window from the packet
+ */
+static u32 nes_get_seq(struct sk_buff *skb, u32 *ack, u16 *wnd, u32 *fin_rcvd, u32 *rst_rcvd)
+{
+	struct nes_rskb_cb *cb = (struct nes_rskb_cb *)&skb->cb[0];
+	struct iphdr *iph = (struct iphdr *)(cb->data_start + ETH_HLEN);
+	struct tcphdr *tcph = (struct tcphdr *)(((char *)iph) + (4 * iph->ihl));
+
+	*ack = be32_to_cpu(tcph->ack_seq);
+	*wnd = be16_to_cpu(tcph->window);
+	*fin_rcvd = tcph->fin;
+	*rst_rcvd = tcph->rst;
+	return be32_to_cpu(tcph->seq);
+}
+
+/**
+ * nes_get_next_skb - Get the next skb based on where current skb is in the queue
+ */
+static struct sk_buff *nes_get_next_skb(struct nes_device *nesdev, struct nes_qp *nesqp,
+					struct sk_buff *skb, u32 nextseq, u32 *ack,
+					u16 *wnd, u32 *fin_rcvd, u32 *rst_rcvd)
+{
+	u32 seq;
+	bool processacks;
+	struct sk_buff *old_skb;
+
+	if (skb) {
+		/* Continue processing fpdu */
+		if (skb->next == (struct sk_buff *)&nesqp->pau_list)
+			goto out;
+		skb = skb->next;
+		processacks = false;
+	} else {
+		/* Starting a new one */
+		if (skb_queue_empty(&nesqp->pau_list))
+			goto out;
+		skb = skb_peek(&nesqp->pau_list);
+		processacks = true;
+	}
+
+	while (1) {
+		if (skb_queue_empty(&nesqp->pau_list))
+			goto out;
+
+		seq = nes_get_seq(skb, ack, wnd, fin_rcvd, rst_rcvd);
+		if (seq == nextseq) {
+			if (skb->len || processacks)
+				break;
+		} else if (after(seq, nextseq)) {
+			goto out;
+		}
+
+		old_skb = skb;
+		skb = skb->next;
+		skb_unlink(old_skb, &nesqp->pau_list);
+		nes_mgt_free_skb(nesdev, old_skb, PCI_DMA_TODEVICE);
+		nes_rem_ref_cm_node(nesqp->cm_node);
+		if (skb == (struct sk_buff *)&nesqp->pau_list)
+			goto out;
+	}
+	return skb;
+
+out:
+	return NULL;
+}
+
+/**
+ * get_fpdu_info - Find the next complete fpdu and return its fragments.
+ */
+static int get_fpdu_info(struct nes_device *nesdev, struct nes_qp *nesqp,
+			 struct pau_fpdu_info **pau_fpdu_info)
+{
+	struct sk_buff *skb;
+	struct iphdr *iph;
+	struct tcphdr *tcph;
+	struct nes_rskb_cb *cb;
+	struct pau_fpdu_info *fpdu_info = NULL;
+	struct pau_fpdu_frag frags[MAX_FPDU_FRAGS];
+	u32 fpdu_len = 0;
+	u32 tmp_len;
+	int frag_cnt = 0;
+	u32 tot_len;
+	u32 frag_tot;
+	u32 ack;
+	u32 fin_rcvd;
+	u32 rst_rcvd;
+	u16 wnd;
+	int i;
+	int rc = 0;
+
+	*pau_fpdu_info = NULL;
+
+	skb = nes_get_next_skb(nesdev, nesqp, NULL, nesqp->pau_rcv_nxt, &ack, &wnd, &fin_rcvd, &rst_rcvd);
+	if (!skb)
+		goto out;
+
+	cb = (struct nes_rskb_cb *)&skb->cb[0];
+	if (skb->len) {
+		fpdu_len = be16_to_cpu(*(__be16 *) skb->data) + MPA_FRAMING;
+		fpdu_len = (fpdu_len + 3) & 0xfffffffc;
+		tmp_len = fpdu_len;
+
+		/* See if we have all of the fpdu */
+		frag_tot = 0;
+		memset(&frags, 0, sizeof frags);
+		for (i = 0; i < MAX_FPDU_FRAGS; i++) {
+			frags[i].physaddr = cb->busaddr;
+			frags[i].physaddr += skb->data - cb->data_start;
+			frags[i].frag_len = min(tmp_len, skb->len);
+			frags[i].skb = skb;
+			frags[i].cmplt = (skb->len == frags[i].frag_len);
+			frag_tot += frags[i].frag_len;
+			frag_cnt++;
+
+			tmp_len -= frags[i].frag_len;
+			if (tmp_len == 0)
+				break;
+
+			skb = nes_get_next_skb(nesdev, nesqp, skb,
+					       nesqp->pau_rcv_nxt + frag_tot, &ack, &wnd, &fin_rcvd, &rst_rcvd);
+			if (!skb)
+				goto out;
+			if (rst_rcvd) {
+				/* rst received in the middle of fpdu */
+				for (; i >= 0; i--) {
+					skb_unlink(frags[i].skb, &nesqp->pau_list);
+					nes_mgt_free_skb(nesdev, frags[i].skb, PCI_DMA_TODEVICE);
+				}
+				cb = (struct nes_rskb_cb *)&skb->cb[0];
+				frags[0].physaddr = cb->busaddr;
+				frags[0].physaddr += skb->data - cb->data_start;
+				frags[0].frag_len = skb->len;
+				frags[0].skb = skb;
+				frags[0].cmplt = true;
+				frag_cnt = 1;
+				break;
+			}
+
+			cb = (struct nes_rskb_cb *)&skb->cb[0];
+		}
+	} else {
+		/* no data */
+		frags[0].physaddr = cb->busaddr;
+		frags[0].frag_len = 0;
+		frags[0].skb = skb;
+		frags[0].cmplt = true;
+		frag_cnt = 1;
+	}
+
+	/* Found one */
+	fpdu_info = kzalloc(sizeof(*fpdu_info), GFP_ATOMIC);
+	if (fpdu_info == NULL) {
+		nes_debug(NES_DBG_PAU, "Failed to alloc a fpdu_info.\n");
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	fpdu_info->cqp_request = nes_get_cqp_request(nesdev);
+	if (fpdu_info->cqp_request == NULL) {
+		nes_debug(NES_DBG_PAU, "Failed to get a cqp_request.\n");
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	cb = (struct nes_rskb_cb *)&frags[0].skb->cb[0];
+	iph = (struct iphdr *)(cb->data_start + ETH_HLEN);
+	tcph = (struct tcphdr *)(((char *)iph) + (4 * iph->ihl));
+	fpdu_info->hdr_len = (((unsigned char *)tcph) + 4 * (tcph->doff)) - cb->data_start;
+	fpdu_info->data_len = fpdu_len;
+	tot_len = fpdu_info->hdr_len + fpdu_len - ETH_HLEN;
+
+	if (frags[0].cmplt) {
+		fpdu_info->hdr_pbase = cb->busaddr;
+		fpdu_info->hdr_vbase = NULL;
+	} else {
+		fpdu_info->hdr_vbase = pci_alloc_consistent(nesdev->pcidev,
+							    fpdu_info->hdr_len, &fpdu_info->hdr_pbase);
+		if (!fpdu_info->hdr_vbase) {
+			nes_debug(NES_DBG_PAU, "Unable to allocate memory for pau first frag\n");
+			rc = -ENOMEM;
+			goto out;
+		}
+
+		/* Copy hdrs, adjusting len and seqnum */
+		memcpy(fpdu_info->hdr_vbase, cb->data_start, fpdu_info->hdr_len);
+		iph = (struct iphdr *)(fpdu_info->hdr_vbase + ETH_HLEN);
+		tcph = (struct tcphdr *)(((char *)iph) + (4 * iph->ihl));
+	}
+
+	iph->tot_len = cpu_to_be16(tot_len);
+	iph->saddr = cpu_to_be32(0x7f000001);
+
+	tcph->seq = cpu_to_be32(nesqp->pau_rcv_nxt);
+	tcph->ack_seq = cpu_to_be32(ack);
+	tcph->window = cpu_to_be16(wnd);
+
+	nesqp->pau_rcv_nxt += fpdu_len + fin_rcvd;
+
+	memcpy(fpdu_info->frags, frags, sizeof(fpdu_info->frags));
+	fpdu_info->frag_cnt = frag_cnt;
+	fpdu_info->nesqp = nesqp;
+	*pau_fpdu_info = fpdu_info;
+
+	/* Update skb's for next pass */
+	for (i = 0; i < frag_cnt; i++) {
+		cb = (struct nes_rskb_cb *)&frags[i].skb->cb[0];
+		skb_pull(frags[i].skb, frags[i].frag_len);
+
+		if (frags[i].skb->len == 0) {
+			/* Pull skb off the list - it will be freed in the callback */
+			if (!skb_queue_empty(&nesqp->pau_list))
+				skb_unlink(frags[i].skb, &nesqp->pau_list);
+		} else {
+			/* Last skb still has data so update the seq */
+			iph = (struct iphdr *)(cb->data_start + ETH_HLEN);
+			tcph = (struct tcphdr *)(((char *)iph) + (4 * iph->ihl));
+			tcph->seq = cpu_to_be32(nesqp->pau_rcv_nxt);
+		}
+	}
+
+out:
+	if (rc) {
+		if (fpdu_info) {
+			if (fpdu_info->cqp_request)
+				nes_put_cqp_request(nesdev, fpdu_info->cqp_request);
+			kfree(fpdu_info);
+		}
+	}
+	return rc;
+}
+
+/**
+ * forward_fpdu - send complete fpdus, one at a time
+ */
+static int forward_fpdus(struct nes_vnic *nesvnic, struct nes_qp *nesqp)
+{
+	struct nes_device *nesdev = nesvnic->nesdev;
+	struct pau_fpdu_info *fpdu_info;
+	struct nes_hw_cqp_wqe *cqp_wqe;
+	struct nes_cqp_request *cqp_request;
+	unsigned long flags;
+	u64 u64tmp;
+	u32 u32tmp;
+	int rc;
+
+	while (1) {
+		spin_lock_irqsave(&nesqp->pau_lock, flags);
+		rc = get_fpdu_info(nesdev, nesqp, &fpdu_info);
+		if (rc || (fpdu_info == NULL)) {
+			spin_unlock_irqrestore(&nesqp->pau_lock, flags);
+			return rc;
+		}
+
+		cqp_request = fpdu_info->cqp_request;
+		cqp_wqe = &cqp_request->cqp_wqe;
+		nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+		set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_DL_OPCODE_IDX,
+				    NES_CQP_DOWNLOAD_SEGMENT |
+				    (((u32)nesvnic->logical_port) << NES_CQP_OP_LOGICAL_PORT_SHIFT));
+
+		u32tmp = fpdu_info->hdr_len << 16;
+		u32tmp |= fpdu_info->hdr_len + (u32)fpdu_info->data_len;
+		set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_DL_LENGTH_0_TOTAL_IDX,
+				    u32tmp);
+
+		u32tmp = (fpdu_info->frags[1].frag_len << 16) | fpdu_info->frags[0].frag_len;
+		set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_LENGTH_2_1_IDX,
+				    u32tmp);
+
+		u32tmp = (fpdu_info->frags[3].frag_len << 16) | fpdu_info->frags[2].frag_len;
+		set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_LENGTH_4_3_IDX,
+				    u32tmp);
+
+		u64tmp = (u64)fpdu_info->hdr_pbase;
+		set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG0_LOW_IDX,
+				    lower_32_bits(u64tmp));
+		set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG0_HIGH_IDX,
+				    upper_32_bits(u64tmp));
+
+		set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG1_LOW_IDX,
+				    lower_32_bits(fpdu_info->frags[0].physaddr));
+		set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG1_HIGH_IDX,
+				    upper_32_bits(fpdu_info->frags[0].physaddr));
+
+		set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG2_LOW_IDX,
+				    lower_32_bits(fpdu_info->frags[1].physaddr));
+		set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG2_HIGH_IDX,
+				    upper_32_bits(fpdu_info->frags[1].physaddr));
+
+		set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG3_LOW_IDX,
+				    lower_32_bits(fpdu_info->frags[2].physaddr));
+		set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG3_HIGH_IDX,
+				    upper_32_bits(fpdu_info->frags[2].physaddr));
+
+		set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG4_LOW_IDX,
+				    lower_32_bits(fpdu_info->frags[3].physaddr));
+		set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG4_HIGH_IDX,
+				    upper_32_bits(fpdu_info->frags[3].physaddr));
+
+		cqp_request->cqp_callback_pointer = fpdu_info;
+		cqp_request->callback = 1;
+		cqp_request->cqp_callback = nes_download_callback;
+
+		atomic_set(&cqp_request->refcount, 1);
+		nes_post_cqp_request(nesdev, cqp_request);
+		spin_unlock_irqrestore(&nesqp->pau_lock, flags);
+	}
+
+	return 0;
+}
+
+static void process_fpdus(struct nes_vnic *nesvnic, struct nes_qp *nesqp)
+{
+	int again = 1;
+	unsigned long flags;
+
+	do {
+		/* Ignore rc - if it failed, tcp retries will cause it to try again */
+		forward_fpdus(nesvnic, nesqp);
+
+		spin_lock_irqsave(&nesqp->pau_lock, flags);
+		if (nesqp->pau_pending) {
+			nesqp->pau_pending = 0;
+		} else {
+			nesqp->pau_busy = 0;
+			again = 0;
+		}
+
+		spin_unlock_irqrestore(&nesqp->pau_lock, flags);
+	} while (again);
+}
+
+/**
+ * queue_fpdus - Handle fpdu's that hw passed up to sw
+ */
+static void queue_fpdus(struct sk_buff *skb, struct nes_vnic *nesvnic, struct nes_qp *nesqp)
+{
+	struct sk_buff *tmpskb;
+	struct nes_rskb_cb *cb;
+	struct iphdr *iph;
+	struct tcphdr *tcph;
+	unsigned char *tcph_end;
+	u32 rcv_nxt;
+	u32 rcv_wnd;
+	u32 seqnum;
+	u32 len;
+	bool process_it = false;
+	unsigned long flags;
+
+	/* Move data ptr to after tcp header */
+	iph = (struct iphdr *)skb->data;
+	tcph = (struct tcphdr *)(((char *)iph) + (4 * iph->ihl));
+	seqnum = be32_to_cpu(tcph->seq);
+	tcph_end = (((char *)tcph) + (4 * tcph->doff));
+
+	len = be16_to_cpu(iph->tot_len);
+	if (skb->len > len)
+		skb_trim(skb, len);
+	skb_pull(skb, tcph_end - skb->data);
+
+	/* Initialize tracking values */
+	cb = (struct nes_rskb_cb *)&skb->cb[0];
+	cb->seqnum = seqnum;
+
+	/* Make sure data is in the receive window */
+	rcv_nxt = nesqp->pau_rcv_nxt;
+	rcv_wnd = le32_to_cpu(nesqp->nesqp_context->rcv_wnd);
+	if (!between(seqnum, rcv_nxt, (rcv_nxt + rcv_wnd))) {
+		nes_mgt_free_skb(nesvnic->nesdev, skb, PCI_DMA_TODEVICE);
+		nes_rem_ref_cm_node(nesqp->cm_node);
+		return;
+	}
+
+	spin_lock_irqsave(&nesqp->pau_lock, flags);
+
+	if (nesqp->pau_busy)
+		nesqp->pau_pending = 1;
+	else
+		nesqp->pau_busy = 1;
+
+	/* Queue skb by sequence number */
+	if (skb_queue_len(&nesqp->pau_list) == 0) {
+		skb_queue_head(&nesqp->pau_list, skb);
+	} else {
+		tmpskb = nesqp->pau_list.next;
+		while (tmpskb != (struct sk_buff *)&nesqp->pau_list) {
+			cb = (struct nes_rskb_cb *)&tmpskb->cb[0];
+			if (before(seqnum, cb->seqnum))
+				break;
+			tmpskb = tmpskb->next;
+		}
+		skb_insert(tmpskb, skb, &nesqp->pau_list);
+	}
+	if (nesqp->pau_state == PAU_READY)
+		process_it = true;
+	spin_unlock_irqrestore(&nesqp->pau_lock, flags);
+
+	if (process_it)
+		process_fpdus(nesvnic, nesqp);
+
+	return;
+}
+
+/**
+ * mgt_thread - Handle mgt skbs in a safe context
+ */
+static int mgt_thread(void *context)
+{
+	struct nes_vnic *nesvnic = context;
+	struct sk_buff *skb;
+	struct nes_rskb_cb *cb;
+
+	while (!kthread_should_stop()) {
+		wait_event_interruptible(nesvnic->mgt_wait_queue,
+					 skb_queue_len(&nesvnic->mgt_skb_list) || kthread_should_stop());
+		while ((skb_queue_len(&nesvnic->mgt_skb_list)) && !kthread_should_stop()) {
+			skb = skb_dequeue(&nesvnic->mgt_skb_list);
+			cb = (struct nes_rskb_cb *)&skb->cb[0];
+			cb->data_start = skb->data - ETH_HLEN;
+			cb->busaddr = pci_map_single(nesvnic->nesdev->pcidev, cb->data_start,
+						     nesvnic->max_frame_size, PCI_DMA_TODEVICE);
+			queue_fpdus(skb, nesvnic, cb->nesqp);
+		}
+	}
+
+	/* Closing down so delete any entries on the queue */
+	while (skb_queue_len(&nesvnic->mgt_skb_list)) {
+		skb = skb_dequeue(&nesvnic->mgt_skb_list);
+		cb = (struct nes_rskb_cb *)&skb->cb[0];
+		nes_rem_ref_cm_node(cb->nesqp->cm_node);
+		dev_kfree_skb_any(skb);
+	}
+	return 0;
+}
+
+/**
+ * nes_queue_skbs - Queue skb so it can be handled in a thread context
+ */
+void nes_queue_mgt_skbs(struct sk_buff *skb, struct nes_vnic *nesvnic, struct nes_qp *nesqp)
+{
+	struct nes_rskb_cb *cb;
+
+	cb = (struct nes_rskb_cb *)&skb->cb[0];
+	cb->nesqp = nesqp;
+	skb_queue_tail(&nesvnic->mgt_skb_list, skb);
+	wake_up_interruptible(&nesvnic->mgt_wait_queue);
+}
+
+void nes_destroy_pau_qp(struct nes_device *nesdev, struct nes_qp *nesqp)
+{
+	struct sk_buff *skb;
+	unsigned long flags;
+	atomic_inc(&pau_qps_destroyed);
+
+	/* Free packets that have not yet been forwarded */
+	/* Lock is acquired by skb_dequeue when removing the skb */
+	spin_lock_irqsave(&nesqp->pau_lock, flags);
+	while (skb_queue_len(&nesqp->pau_list)) {
+		skb = skb_dequeue(&nesqp->pau_list);
+		nes_mgt_free_skb(nesdev, skb, PCI_DMA_TODEVICE);
+		nes_rem_ref_cm_node(nesqp->cm_node);
+	}
+	spin_unlock_irqrestore(&nesqp->pau_lock, flags);
+}
+
+static void nes_chg_qh_handler(struct nes_device *nesdev, struct nes_cqp_request *cqp_request)
+{
+	struct pau_qh_chg *qh_chg = cqp_request->cqp_callback_pointer;
+	struct nes_cqp_request *new_request;
+	struct nes_hw_cqp_wqe *cqp_wqe;
+	struct nes_adapter *nesadapter;
+	struct nes_qp *nesqp;
+	struct nes_v4_quad nes_quad;
+	u32 crc_value;
+	u64 u64temp;
+
+	nesadapter = nesdev->nesadapter;
+	nesqp = qh_chg->nesqp;
+
+	/* Should we handle the bad completion */
+	if (cqp_request->major_code)
+		WARN(1, PFX "Invalid cqp_request major_code=0x%x\n",
+		       cqp_request->major_code);
+
+	switch (nesqp->pau_state) {
+	case PAU_DEL_QH:
+		/* Old hash code deleted, now set the new one */
+		nesqp->pau_state = PAU_ADD_LB_QH;
+		new_request = nes_get_cqp_request(nesdev);
+		if (new_request == NULL) {
+			nes_debug(NES_DBG_PAU, "Failed to get a new_request.\n");
+			WARN_ON(1);
+			return;
+		}
+
+		memset(&nes_quad, 0, sizeof(nes_quad));
+		nes_quad.DstIpAdrIndex =
+			cpu_to_le32((u32)PCI_FUNC(nesdev->pcidev->devfn) << 24);
+		nes_quad.SrcIpadr = cpu_to_be32(0x7f000001);
+		nes_quad.TcpPorts[0] = swab16(nesqp->nesqp_context->tcpPorts[1]);
+		nes_quad.TcpPorts[1] = swab16(nesqp->nesqp_context->tcpPorts[0]);
+
+		/* Produce hash key */
+		crc_value = get_crc_value(&nes_quad);
+		nesqp->hte_index = cpu_to_be32(crc_value ^ 0xffffffff);
+		nes_debug(NES_DBG_PAU, "new HTE Index = 0x%08X, CRC = 0x%08X\n",
+			  nesqp->hte_index, nesqp->hte_index & nesadapter->hte_index_mask);
+
+		nesqp->hte_index &= nesadapter->hte_index_mask;
+		nesqp->nesqp_context->hte_index = cpu_to_le32(nesqp->hte_index);
+		nesqp->nesqp_context->ip0 = cpu_to_le32(0x7f000001);
+		nesqp->nesqp_context->rcv_nxt = cpu_to_le32(nesqp->pau_rcv_nxt);
+
+		cqp_wqe = &new_request->cqp_wqe;
+		nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+		set_wqe_32bit_value(cqp_wqe->wqe_words,
+				    NES_CQP_WQE_OPCODE_IDX, NES_CQP_MANAGE_QUAD_HASH |
+				    NES_CQP_QP_TYPE_IWARP | NES_CQP_QP_CONTEXT_VALID | NES_CQP_QP_IWARP_STATE_RTS);
+		set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, nesqp->hwqp.qp_id);
+		u64temp = (u64)nesqp->nesqp_context_pbase;
+		set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, u64temp);
+
+		nes_debug(NES_DBG_PAU, "Waiting for CQP completion for adding the quad hash.\n");
+
+		new_request->cqp_callback_pointer = qh_chg;
+		new_request->callback = 1;
+		new_request->cqp_callback = nes_chg_qh_handler;
+		atomic_set(&new_request->refcount, 1);
+		nes_post_cqp_request(nesdev, new_request);
+		break;
+
+	case PAU_ADD_LB_QH:
+		/* Start processing the queued fpdu's */
+		nesqp->pau_state = PAU_READY;
+		process_fpdus(qh_chg->nesvnic, qh_chg->nesqp);
+		kfree(qh_chg);
+		break;
+	}
+}
+
+/**
+ * nes_change_quad_hash
+ */
+static int nes_change_quad_hash(struct nes_device *nesdev,
+				struct nes_vnic *nesvnic, struct nes_qp *nesqp)
+{
+	struct nes_cqp_request *cqp_request = NULL;
+	struct pau_qh_chg *qh_chg = NULL;
+	u64 u64temp;
+	struct nes_hw_cqp_wqe *cqp_wqe;
+	int ret = 0;
+
+	cqp_request = nes_get_cqp_request(nesdev);
+	if (cqp_request == NULL) {
+		nes_debug(NES_DBG_PAU, "Failed to get a cqp_request.\n");
+		ret = -ENOMEM;
+		goto chg_qh_err;
+	}
+
+	qh_chg = kmalloc(sizeof *qh_chg, GFP_ATOMIC);
+	if (qh_chg == NULL) {
+		nes_debug(NES_DBG_PAU, "Failed to get a cqp_request.\n");
+		ret = -ENOMEM;
+		goto chg_qh_err;
+	}
+	qh_chg->nesdev = nesdev;
+	qh_chg->nesvnic = nesvnic;
+	qh_chg->nesqp = nesqp;
+	nesqp->pau_state = PAU_DEL_QH;
+
+	cqp_wqe = &cqp_request->cqp_wqe;
+	nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+	set_wqe_32bit_value(cqp_wqe->wqe_words,
+			    NES_CQP_WQE_OPCODE_IDX, NES_CQP_MANAGE_QUAD_HASH | NES_CQP_QP_DEL_HTE |
+			    NES_CQP_QP_TYPE_IWARP | NES_CQP_QP_CONTEXT_VALID | NES_CQP_QP_IWARP_STATE_RTS);
+	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, nesqp->hwqp.qp_id);
+	u64temp = (u64)nesqp->nesqp_context_pbase;
+	set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, u64temp);
+
+	nes_debug(NES_DBG_PAU, "Waiting for CQP completion for deleting the quad hash.\n");
+
+	cqp_request->cqp_callback_pointer = qh_chg;
+	cqp_request->callback = 1;
+	cqp_request->cqp_callback = nes_chg_qh_handler;
+	atomic_set(&cqp_request->refcount, 1);
+	nes_post_cqp_request(nesdev, cqp_request);
+
+	return ret;
+
+chg_qh_err:
+	kfree(qh_chg);
+	if (cqp_request)
+		nes_put_cqp_request(nesdev, cqp_request);
+	return ret;
+}
+
+/**
+ * nes_mgt_ce_handler
+ * This management code deals with any packed and unaligned (pau) fpdu's
+ * that the hardware cannot handle.
+ */
+static void nes_mgt_ce_handler(struct nes_device *nesdev, struct nes_hw_nic_cq *cq)
+{
+	struct nes_vnic_mgt *mgtvnic = container_of(cq, struct nes_vnic_mgt, mgt_cq);
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	u32 head;
+	u32 cq_size;
+	u32 cqe_count = 0;
+	u32 cqe_misc;
+	u32 qp_id = 0;
+	u32 skbs_needed;
+	unsigned long context;
+	struct nes_qp *nesqp;
+	struct sk_buff *rx_skb;
+	struct nes_rskb_cb *cb;
+
+	head = cq->cq_head;
+	cq_size = cq->cq_size;
+
+	while (1) {
+		cqe_misc = le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_NIC_CQE_MISC_IDX]);
+		if (!(cqe_misc & NES_NIC_CQE_VALID))
+			break;
+
+		nesqp = NULL;
+		if (cqe_misc & NES_NIC_CQE_ACCQP_VALID) {
+			qp_id = le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_NIC_CQE_ACCQP_ID_IDX]);
+			qp_id &= 0x001fffff;
+			if (qp_id < nesadapter->max_qp) {
+				context = (unsigned long)nesadapter->qp_table[qp_id - NES_FIRST_QPN];
+				nesqp = (struct nes_qp *)context;
+			}
+		}
+
+		if (nesqp) {
+			if (nesqp->pau_mode == false) {
+				nesqp->pau_mode = true; /* First time for this qp */
+				nesqp->pau_rcv_nxt = le32_to_cpu(
+					cq->cq_vbase[head].cqe_words[NES_NIC_CQE_HASH_RCVNXT]);
+				skb_queue_head_init(&nesqp->pau_list);
+				spin_lock_init(&nesqp->pau_lock);
+				atomic_inc(&pau_qps_created);
+				nes_change_quad_hash(nesdev, mgtvnic->nesvnic, nesqp);
+			}
+
+			rx_skb = mgtvnic->mgt.rx_skb[mgtvnic->mgt.rq_tail];
+			rx_skb->len = 0;
+			skb_put(rx_skb, cqe_misc & 0x0000ffff);
+			rx_skb->protocol = eth_type_trans(rx_skb, mgtvnic->nesvnic->netdev);
+			cb = (struct nes_rskb_cb *)&rx_skb->cb[0];
+			pci_unmap_single(nesdev->pcidev, cb->busaddr, cb->maplen, PCI_DMA_FROMDEVICE);
+			cb->busaddr = 0;
+			mgtvnic->mgt.rq_tail++;
+			mgtvnic->mgt.rq_tail &= mgtvnic->mgt.rq_size - 1;
+
+			nes_add_ref_cm_node(nesqp->cm_node);
+			nes_queue_mgt_skbs(rx_skb, mgtvnic->nesvnic, nesqp);
+		} else {
+			printk(KERN_ERR PFX "Invalid QP %d for packed/unaligned handling\n", qp_id);
+		}
+
+		cq->cq_vbase[head].cqe_words[NES_NIC_CQE_MISC_IDX] = 0;
+		cqe_count++;
+		if (++head >= cq_size)
+			head = 0;
+
+		if (cqe_count == 255) {
+			/* Replenish mgt CQ */
+			nes_write32(nesdev->regs + NES_CQE_ALLOC, cq->cq_number | (cqe_count << 16));
+			nesdev->currcq_count += cqe_count;
+			cqe_count = 0;
+		}
+
+		skbs_needed = atomic_inc_return(&mgtvnic->rx_skbs_needed);
+		if (skbs_needed > (mgtvnic->mgt.rq_size >> 1))
+			nes_replenish_mgt_rq(mgtvnic);
+	}
+
+	cq->cq_head = head;
+	nes_write32(nesdev->regs + NES_CQE_ALLOC, NES_CQE_ALLOC_NOTIFY_NEXT |
+		    cq->cq_number | (cqe_count << 16));
+	nes_read32(nesdev->regs + NES_CQE_ALLOC);
+	nesdev->currcq_count += cqe_count;
+}
+
+/**
+ * nes_init_mgt_qp
+ */
+int nes_init_mgt_qp(struct nes_device *nesdev, struct net_device *netdev, struct nes_vnic *nesvnic)
+{
+	struct nes_vnic_mgt *mgtvnic;
+	u32 counter;
+	void *vmem;
+	dma_addr_t pmem;
+	struct nes_hw_cqp_wqe *cqp_wqe;
+	u32 cqp_head;
+	unsigned long flags;
+	struct nes_hw_nic_qp_context *mgt_context;
+	u64 u64temp;
+	struct nes_hw_nic_rq_wqe *mgt_rqe;
+	struct sk_buff *skb;
+	u32 wqe_count;
+	struct nes_rskb_cb *cb;
+	u32 mgt_mem_size;
+	void *mgt_vbase;
+	dma_addr_t mgt_pbase;
+	int i;
+	int ret;
+
+	/* Allocate space the all mgt QPs once */
+	mgtvnic = kzalloc(NES_MGT_QP_COUNT * sizeof(struct nes_vnic_mgt), GFP_KERNEL);
+	if (mgtvnic == NULL) {
+		nes_debug(NES_DBG_INIT, "Unable to allocate memory for mgt structure\n");
+		return -ENOMEM;
+	}
+
+	/* Allocate fragment, RQ, and CQ; Reuse CEQ based on the PCI function */
+	/* We are not sending from this NIC so sq is not allocated */
+	mgt_mem_size = 256 +
+		       (NES_MGT_WQ_COUNT * sizeof(struct nes_hw_nic_rq_wqe)) +
+		       (NES_MGT_WQ_COUNT * sizeof(struct nes_hw_nic_cqe)) +
+		       sizeof(struct nes_hw_nic_qp_context);
+	mgt_mem_size = (mgt_mem_size + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
+	mgt_vbase = pci_alloc_consistent(nesdev->pcidev, NES_MGT_QP_COUNT * mgt_mem_size, &mgt_pbase);
+	if (!mgt_vbase) {
+		kfree(mgtvnic);
+		nes_debug(NES_DBG_INIT, "Unable to allocate memory for mgt host descriptor rings\n");
+		return -ENOMEM;
+	}
+
+	nesvnic->mgt_mem_size = NES_MGT_QP_COUNT * mgt_mem_size;
+	nesvnic->mgt_vbase = mgt_vbase;
+	nesvnic->mgt_pbase = mgt_pbase;
+
+	skb_queue_head_init(&nesvnic->mgt_skb_list);
+	init_waitqueue_head(&nesvnic->mgt_wait_queue);
+	nesvnic->mgt_thread = kthread_run(mgt_thread, nesvnic, "nes_mgt_thread");
+
+	for (i = 0; i < NES_MGT_QP_COUNT; i++) {
+		mgtvnic->nesvnic = nesvnic;
+		mgtvnic->mgt.qp_id = nesdev->mac_index + NES_MGT_QP_OFFSET + i;
+		memset(mgt_vbase, 0, mgt_mem_size);
+		nes_debug(NES_DBG_INIT, "Allocated mgt QP structures at %p (phys = %016lX), size = %u.\n",
+			  mgt_vbase, (unsigned long)mgt_pbase, mgt_mem_size);
+
+		vmem = (void *)(((unsigned long)mgt_vbase + (256 - 1)) &
+				~(unsigned long)(256 - 1));
+		pmem = (dma_addr_t)(((unsigned long long)mgt_pbase + (256 - 1)) &
+				    ~(unsigned long long)(256 - 1));
+
+		spin_lock_init(&mgtvnic->mgt.rq_lock);
+
+		/* setup the RQ */
+		mgtvnic->mgt.rq_vbase = vmem;
+		mgtvnic->mgt.rq_pbase = pmem;
+		mgtvnic->mgt.rq_head = 0;
+		mgtvnic->mgt.rq_tail = 0;
+		mgtvnic->mgt.rq_size = NES_MGT_WQ_COUNT;
+
+		/* setup the CQ */
+		vmem += (NES_MGT_WQ_COUNT * sizeof(struct nes_hw_nic_rq_wqe));
+		pmem += (NES_MGT_WQ_COUNT * sizeof(struct nes_hw_nic_rq_wqe));
+
+		mgtvnic->mgt_cq.cq_number = mgtvnic->mgt.qp_id;
+		mgtvnic->mgt_cq.cq_vbase = vmem;
+		mgtvnic->mgt_cq.cq_pbase = pmem;
+		mgtvnic->mgt_cq.cq_head = 0;
+		mgtvnic->mgt_cq.cq_size = NES_MGT_WQ_COUNT;
+
+		mgtvnic->mgt_cq.ce_handler = nes_mgt_ce_handler;
+
+		/* Send CreateCQ request to CQP */
+		spin_lock_irqsave(&nesdev->cqp.lock, flags);
+		cqp_head = nesdev->cqp.sq_head;
+
+		cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
+		nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+
+		cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(
+			NES_CQP_CREATE_CQ | NES_CQP_CQ_CEQ_VALID |
+			((u32)mgtvnic->mgt_cq.cq_size << 16));
+		cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32(
+			mgtvnic->mgt_cq.cq_number | ((u32)nesdev->ceq_index << 16));
+		u64temp = (u64)mgtvnic->mgt_cq.cq_pbase;
+		set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_PBL_LOW_IDX, u64temp);
+		cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] = 0;
+		u64temp = (unsigned long)&mgtvnic->mgt_cq;
+		cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_LOW_IDX] = cpu_to_le32((u32)(u64temp >> 1));
+		cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] =
+			cpu_to_le32(((u32)((u64temp) >> 33)) & 0x7FFFFFFF);
+		cqp_wqe->wqe_words[NES_CQP_CQ_WQE_DOORBELL_INDEX_HIGH_IDX] = 0;
+
+		if (++cqp_head >= nesdev->cqp.sq_size)
+			cqp_head = 0;
+		cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
+		nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+
+		/* Send CreateQP request to CQP */
+		mgt_context = (void *)(&mgtvnic->mgt_cq.cq_vbase[mgtvnic->mgt_cq.cq_size]);
+		mgt_context->context_words[NES_NIC_CTX_MISC_IDX] =
+			cpu_to_le32((u32)NES_MGT_CTX_SIZE |
+				    ((u32)PCI_FUNC(nesdev->pcidev->devfn) << 12));
+		nes_debug(NES_DBG_INIT, "RX_WINDOW_BUFFER_PAGE_TABLE_SIZE = 0x%08X, RX_WINDOW_BUFFER_SIZE = 0x%08X\n",
+			  nes_read_indexed(nesdev, NES_IDX_RX_WINDOW_BUFFER_PAGE_TABLE_SIZE),
+			  nes_read_indexed(nesdev, NES_IDX_RX_WINDOW_BUFFER_SIZE));
+		if (nes_read_indexed(nesdev, NES_IDX_RX_WINDOW_BUFFER_SIZE) != 0)
+			mgt_context->context_words[NES_NIC_CTX_MISC_IDX] |= cpu_to_le32(NES_NIC_BACK_STORE);
+
+		u64temp = (u64)mgtvnic->mgt.rq_pbase;
+		mgt_context->context_words[NES_NIC_CTX_SQ_LOW_IDX] = cpu_to_le32((u32)u64temp);
+		mgt_context->context_words[NES_NIC_CTX_SQ_HIGH_IDX] = cpu_to_le32((u32)(u64temp >> 32));
+		u64temp = (u64)mgtvnic->mgt.rq_pbase;
+		mgt_context->context_words[NES_NIC_CTX_RQ_LOW_IDX] = cpu_to_le32((u32)u64temp);
+		mgt_context->context_words[NES_NIC_CTX_RQ_HIGH_IDX] = cpu_to_le32((u32)(u64temp >> 32));
+
+		cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(NES_CQP_CREATE_QP |
+									 NES_CQP_QP_TYPE_NIC);
+		cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32(mgtvnic->mgt.qp_id);
+		u64temp = (u64)mgtvnic->mgt_cq.cq_pbase +
+			  (mgtvnic->mgt_cq.cq_size * sizeof(struct nes_hw_nic_cqe));
+		set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, u64temp);
+
+		if (++cqp_head >= nesdev->cqp.sq_size)
+			cqp_head = 0;
+		nesdev->cqp.sq_head = cqp_head;
+
+		barrier();
+
+		/* Ring doorbell (2 WQEs) */
+		nes_write32(nesdev->regs + NES_WQE_ALLOC, 0x02800000 | nesdev->cqp.qp_id);
+
+		spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
+		nes_debug(NES_DBG_INIT, "Waiting for create MGT QP%u to complete.\n",
+			  mgtvnic->mgt.qp_id);
+
+		ret = wait_event_timeout(nesdev->cqp.waitq, (nesdev->cqp.sq_tail == cqp_head),
+					 NES_EVENT_TIMEOUT);
+		nes_debug(NES_DBG_INIT, "Create MGT QP%u completed, wait_event_timeout ret = %u.\n",
+			  mgtvnic->mgt.qp_id, ret);
+		if (!ret) {
+			nes_debug(NES_DBG_INIT, "MGT QP%u create timeout expired\n", mgtvnic->mgt.qp_id);
+			if (i == 0) {
+				pci_free_consistent(nesdev->pcidev, nesvnic->mgt_mem_size, nesvnic->mgt_vbase,
+						    nesvnic->mgt_pbase);
+				kfree(mgtvnic);
+			} else {
+				nes_destroy_mgt(nesvnic);
+			}
+			return -EIO;
+		}
+
+		/* Populate the RQ */
+		for (counter = 0; counter < (NES_MGT_WQ_COUNT - 1); counter++) {
+			skb = dev_alloc_skb(nesvnic->max_frame_size);
+			if (!skb) {
+				nes_debug(NES_DBG_INIT, "%s: out of memory for receive skb\n", netdev->name);
+				return -ENOMEM;
+			}
+
+			skb->dev = netdev;
+
+			pmem = pci_map_single(nesdev->pcidev, skb->data,
+					      nesvnic->max_frame_size, PCI_DMA_FROMDEVICE);
+			cb = (struct nes_rskb_cb *)&skb->cb[0];
+			cb->busaddr = pmem;
+			cb->maplen = nesvnic->max_frame_size;
+
+			mgt_rqe = &mgtvnic->mgt.rq_vbase[counter];
+			mgt_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_1_0_IDX] = cpu_to_le32((u32)nesvnic->max_frame_size);
+			mgt_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_3_2_IDX] = 0;
+			mgt_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_LOW_IDX] = cpu_to_le32((u32)pmem);
+			mgt_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_HIGH_IDX] = cpu_to_le32((u32)((u64)pmem >> 32));
+			mgtvnic->mgt.rx_skb[counter] = skb;
+		}
+
+		init_timer(&mgtvnic->rq_wqes_timer);
+		mgtvnic->rq_wqes_timer.function = nes_mgt_rq_wqes_timeout;
+		mgtvnic->rq_wqes_timer.data = (unsigned long)mgtvnic;
+
+		wqe_count = NES_MGT_WQ_COUNT - 1;
+		mgtvnic->mgt.rq_head = wqe_count;
+		barrier();
+		do {
+			counter = min(wqe_count, ((u32)255));
+			wqe_count -= counter;
+			nes_write32(nesdev->regs + NES_WQE_ALLOC, (counter << 24) | mgtvnic->mgt.qp_id);
+		} while (wqe_count);
+
+		nes_write32(nesdev->regs + NES_CQE_ALLOC, NES_CQE_ALLOC_NOTIFY_NEXT |
+			    mgtvnic->mgt_cq.cq_number);
+		nes_read32(nesdev->regs + NES_CQE_ALLOC);
+
+		mgt_vbase += mgt_mem_size;
+		mgt_pbase += mgt_mem_size;
+		nesvnic->mgtvnic[i] = mgtvnic++;
+	}
+	return 0;
+}
+
+
+void nes_destroy_mgt(struct nes_vnic *nesvnic)
+{
+	struct nes_device *nesdev = nesvnic->nesdev;
+	struct nes_vnic_mgt *mgtvnic;
+	struct nes_vnic_mgt *first_mgtvnic;
+	unsigned long flags;
+	struct nes_hw_cqp_wqe *cqp_wqe;
+	u32 cqp_head;
+	struct sk_buff *rx_skb;
+	int i;
+	int ret;
+
+	kthread_stop(nesvnic->mgt_thread);
+
+	/* Free remaining NIC receive buffers */
+	first_mgtvnic = nesvnic->mgtvnic[0];
+	for (i = 0; i < NES_MGT_QP_COUNT; i++) {
+		mgtvnic = nesvnic->mgtvnic[i];
+		if (mgtvnic == NULL)
+			continue;
+
+		while (mgtvnic->mgt.rq_head != mgtvnic->mgt.rq_tail) {
+			rx_skb = mgtvnic->mgt.rx_skb[mgtvnic->mgt.rq_tail];
+			nes_mgt_free_skb(nesdev, rx_skb, PCI_DMA_FROMDEVICE);
+			mgtvnic->mgt.rq_tail++;
+			mgtvnic->mgt.rq_tail &= (mgtvnic->mgt.rq_size - 1);
+		}
+
+		spin_lock_irqsave(&nesdev->cqp.lock, flags);
+
+		/* Destroy NIC QP */
+		cqp_head = nesdev->cqp.sq_head;
+		cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
+		nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+
+		set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX,
+				    (NES_CQP_DESTROY_QP | NES_CQP_QP_TYPE_NIC));
+		set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX,
+				    mgtvnic->mgt.qp_id);
+
+		if (++cqp_head >= nesdev->cqp.sq_size)
+			cqp_head = 0;
+
+		cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
+
+		/* Destroy NIC CQ */
+		nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+		set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX,
+				    (NES_CQP_DESTROY_CQ | ((u32)mgtvnic->mgt_cq.cq_size << 16)));
+		set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX,
+				    (mgtvnic->mgt_cq.cq_number | ((u32)nesdev->ceq_index << 16)));
+
+		if (++cqp_head >= nesdev->cqp.sq_size)
+			cqp_head = 0;
+
+		nesdev->cqp.sq_head = cqp_head;
+		barrier();
+
+		/* Ring doorbell (2 WQEs) */
+		nes_write32(nesdev->regs + NES_WQE_ALLOC, 0x02800000 | nesdev->cqp.qp_id);
+
+		spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
+		nes_debug(NES_DBG_SHUTDOWN, "Waiting for CQP, cqp_head=%u, cqp.sq_head=%u,"
+			  " cqp.sq_tail=%u, cqp.sq_size=%u\n",
+			  cqp_head, nesdev->cqp.sq_head,
+			  nesdev->cqp.sq_tail, nesdev->cqp.sq_size);
+
+		ret = wait_event_timeout(nesdev->cqp.waitq, (nesdev->cqp.sq_tail == cqp_head),
+					 NES_EVENT_TIMEOUT);
+
+		nes_debug(NES_DBG_SHUTDOWN, "Destroy MGT QP returned, wait_event_timeout ret = %u, cqp_head=%u,"
+			  " cqp.sq_head=%u, cqp.sq_tail=%u\n",
+			  ret, cqp_head, nesdev->cqp.sq_head, nesdev->cqp.sq_tail);
+		if (!ret)
+			nes_debug(NES_DBG_SHUTDOWN, "MGT QP%u destroy timeout expired\n",
+				  mgtvnic->mgt.qp_id);
+
+		nesvnic->mgtvnic[i] = NULL;
+	}
+
+	if (nesvnic->mgt_vbase) {
+		pci_free_consistent(nesdev->pcidev, nesvnic->mgt_mem_size, nesvnic->mgt_vbase,
+				    nesvnic->mgt_pbase);
+		nesvnic->mgt_vbase = NULL;
+		nesvnic->mgt_pbase = 0;
+	}
+
+	kfree(first_mgtvnic);
+}
diff --git a/drivers/infiniband/hw/nes/nes_mgt.h b/drivers/infiniband/hw/nes/nes_mgt.h
new file mode 100644
index 0000000..4f7f701
--- /dev/null
+++ b/drivers/infiniband/hw/nes/nes_mgt.h
@@ -0,0 +1,97 @@
+/*
+* Copyright (c) 2006 - 2011 Intel-NE, Inc.  All rights reserved.
+*
+* This software is available to you under a choice of one of two
+* licenses.  You may choose to be licensed under the terms of the GNU
+* General Public License (GPL) Version 2, available from the file
+* COPYING in the main directory of this source tree, or the
+* OpenIB.org BSD license below:
+*
+*     Redistribution and use in source and binary forms, with or
+*     without modification, are permitted provided that the following
+*     conditions are met:
+*
+*      - Redistributions of source code must retain the above
+*        copyright notice, this list of conditions and the following
+*        disclaimer.
+*
+*      - Redistributions in binary form must reproduce the above
+*        copyright notice, this list of conditions and the following
+*        disclaimer in the documentation and/or other materials
+*        provided with the distribution.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+
+#ifndef __NES_MGT_H
+#define __NES_MGT_H
+
+#define MPA_FRAMING 6	/* length is 2 bytes, crc is 4 bytes */
+
+int nes_init_mgt_qp(struct nes_device *nesdev, struct net_device *netdev, struct nes_vnic *nesvnic);
+void nes_queue_mgt_skbs(struct sk_buff *skb, struct nes_vnic *nesvnic, struct nes_qp *nesqp);
+void nes_destroy_mgt(struct nes_vnic *nesvnic);
+void nes_destroy_pau_qp(struct nes_device *nesdev, struct nes_qp *nesqp);
+
+struct nes_hw_mgt {
+	struct nes_hw_nic_rq_wqe *rq_vbase;	/* virtual address of rq */
+	dma_addr_t rq_pbase;			/* PCI memory for host rings */
+	struct sk_buff *rx_skb[NES_NIC_WQ_SIZE];
+	u16 qp_id;
+	u16 sq_head;
+	u16 rq_head;
+	u16 rq_tail;
+	u16 rq_size;
+	u8 replenishing_rq;
+	u8 reserved;
+	spinlock_t rq_lock;
+};
+
+struct nes_vnic_mgt {
+	struct nes_vnic        *nesvnic;
+	struct nes_hw_mgt      mgt;
+	struct nes_hw_nic_cq   mgt_cq;
+	atomic_t               rx_skbs_needed;
+	struct timer_list      rq_wqes_timer;
+	atomic_t               rx_skb_timer_running;
+};
+
+#define MAX_FPDU_FRAGS 4
+struct pau_fpdu_frag {
+	struct sk_buff         *skb;
+	u64                    physaddr;
+	u32                    frag_len;
+	bool                   cmplt;
+};
+
+struct pau_fpdu_info {
+	struct nes_qp          *nesqp;
+	struct nes_cqp_request *cqp_request;
+	void                   *hdr_vbase;
+	dma_addr_t             hdr_pbase;
+	int                    hdr_len;
+	u16                    data_len;
+	u16                    frag_cnt;
+	struct pau_fpdu_frag   frags[MAX_FPDU_FRAGS];
+};
+
+enum pau_qh_state {
+	PAU_DEL_QH,
+	PAU_ADD_LB_QH,
+	PAU_READY
+};
+
+struct pau_qh_chg {
+	struct nes_device *nesdev;
+	struct nes_vnic *nesvnic;
+	struct nes_qp *nesqp;
+};
+
+#endif          /* __NES_MGT_H */
diff --git a/drivers/infiniband/hw/nes/nes_nic.c b/drivers/infiniband/hw/nes/nes_nic.c
new file mode 100644
index 0000000..49eb511
--- /dev/null
+++ b/drivers/infiniband/hw/nes/nes_nic.c
@@ -0,0 +1,1882 @@
+/*
+ * Copyright (c) 2006 - 2011 Intel Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/if_arp.h>
+#include <linux/if_vlan.h>
+#include <linux/ethtool.h>
+#include <linux/slab.h>
+#include <net/tcp.h>
+
+#include <net/inet_common.h>
+#include <linux/inet.h>
+
+#include "nes.h"
+
+static struct nic_qp_map nic_qp_mapping_0[] = {
+	{16,0,0,1},{24,4,0,0},{28,8,0,0},{32,12,0,0},
+	{20,2,2,1},{26,6,2,0},{30,10,2,0},{34,14,2,0},
+	{18,1,1,1},{25,5,1,0},{29,9,1,0},{33,13,1,0},
+	{22,3,3,1},{27,7,3,0},{31,11,3,0},{35,15,3,0}
+};
+
+static struct nic_qp_map nic_qp_mapping_1[] = {
+	{18,1,1,1},{25,5,1,0},{29,9,1,0},{33,13,1,0},
+	{22,3,3,1},{27,7,3,0},{31,11,3,0},{35,15,3,0}
+};
+
+static struct nic_qp_map nic_qp_mapping_2[] = {
+	{20,2,2,1},{26,6,2,0},{30,10,2,0},{34,14,2,0}
+};
+
+static struct nic_qp_map nic_qp_mapping_3[] = {
+	{22,3,3,1},{27,7,3,0},{31,11,3,0},{35,15,3,0}
+};
+
+static struct nic_qp_map nic_qp_mapping_4[] = {
+	{28,8,0,0},{32,12,0,0}
+};
+
+static struct nic_qp_map nic_qp_mapping_5[] = {
+	{29,9,1,0},{33,13,1,0}
+};
+
+static struct nic_qp_map nic_qp_mapping_6[] = {
+	{30,10,2,0},{34,14,2,0}
+};
+
+static struct nic_qp_map nic_qp_mapping_7[] = {
+	{31,11,3,0},{35,15,3,0}
+};
+
+static struct nic_qp_map *nic_qp_mapping_per_function[] = {
+	nic_qp_mapping_0, nic_qp_mapping_1, nic_qp_mapping_2, nic_qp_mapping_3,
+	nic_qp_mapping_4, nic_qp_mapping_5, nic_qp_mapping_6, nic_qp_mapping_7
+};
+
+static const u32 default_msg = NETIF_MSG_DRV | NETIF_MSG_PROBE | NETIF_MSG_LINK
+		| NETIF_MSG_IFUP | NETIF_MSG_IFDOWN;
+static int debug = -1;
+static int nics_per_function = 1;
+
+/**
+ * nes_netdev_poll
+ */
+static int nes_netdev_poll(struct napi_struct *napi, int budget)
+{
+	struct nes_vnic *nesvnic = container_of(napi, struct nes_vnic, napi);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	struct nes_hw_nic_cq *nescq = &nesvnic->nic_cq;
+
+	nesvnic->budget = budget;
+	nescq->cqes_pending = 0;
+	nescq->rx_cqes_completed = 0;
+	nescq->cqe_allocs_pending = 0;
+	nescq->rx_pkts_indicated = 0;
+
+	nes_nic_ce_handler(nesdev, nescq);
+
+	if (nescq->cqes_pending == 0) {
+		napi_complete(napi);
+		/* clear out completed cqes and arm */
+		nes_write32(nesdev->regs+NES_CQE_ALLOC, NES_CQE_ALLOC_NOTIFY_NEXT |
+				nescq->cq_number | (nescq->cqe_allocs_pending << 16));
+		nes_read32(nesdev->regs+NES_CQE_ALLOC);
+	} else {
+		/* clear out completed cqes but don't arm */
+		nes_write32(nesdev->regs+NES_CQE_ALLOC,
+				nescq->cq_number | (nescq->cqe_allocs_pending << 16));
+		nes_debug(NES_DBG_NETDEV, "%s: exiting with work pending\n",
+				nesvnic->netdev->name);
+	}
+	return nescq->rx_pkts_indicated;
+}
+
+
+/**
+ * nes_netdev_open - Activate the network interface; ifconfig
+ * ethx up.
+ */
+static int nes_netdev_open(struct net_device *netdev)
+{
+	u32 macaddr_low;
+	u16 macaddr_high;
+	struct nes_vnic *nesvnic = netdev_priv(netdev);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	int ret;
+	int i;
+	struct nes_vnic *first_nesvnic = NULL;
+	u32 nic_active_bit;
+	u32 nic_active;
+	struct list_head *list_pos, *list_temp;
+	unsigned long flags;
+
+	assert(nesdev != NULL);
+
+	if (nesvnic->netdev_open == 1)
+		return 0;
+
+	if (netif_msg_ifup(nesvnic))
+		printk(KERN_INFO PFX "%s: enabling interface\n", netdev->name);
+
+	ret = nes_init_nic_qp(nesdev, netdev);
+	if (ret) {
+		return ret;
+	}
+
+	netif_carrier_off(netdev);
+	netif_stop_queue(netdev);
+
+	if ((!nesvnic->of_device_registered) && (nesvnic->rdma_enabled)) {
+		nesvnic->nesibdev = nes_init_ofa_device(netdev);
+		if (nesvnic->nesibdev == NULL) {
+			printk(KERN_ERR PFX "%s: nesvnic->nesibdev alloc failed", netdev->name);
+		} else {
+			nesvnic->nesibdev->nesvnic = nesvnic;
+			ret = nes_register_ofa_device(nesvnic->nesibdev);
+			if (ret) {
+				printk(KERN_ERR PFX "%s: Unable to register RDMA device, ret = %d\n",
+						netdev->name, ret);
+			}
+		}
+	}
+	/* Set packet filters */
+	nic_active_bit = 1 << nesvnic->nic_index;
+	nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_ACTIVE);
+	nic_active |= nic_active_bit;
+	nes_write_indexed(nesdev, NES_IDX_NIC_ACTIVE, nic_active);
+	nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_MULTICAST_ENABLE);
+	nic_active |= nic_active_bit;
+	nes_write_indexed(nesdev, NES_IDX_NIC_MULTICAST_ENABLE, nic_active);
+	nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_BROADCAST_ON);
+	nic_active |= nic_active_bit;
+	nes_write_indexed(nesdev, NES_IDX_NIC_BROADCAST_ON, nic_active);
+
+	macaddr_high  = ((u16)netdev->dev_addr[0]) << 8;
+	macaddr_high += (u16)netdev->dev_addr[1];
+
+	macaddr_low   = ((u32)netdev->dev_addr[2]) << 24;
+	macaddr_low  += ((u32)netdev->dev_addr[3]) << 16;
+	macaddr_low  += ((u32)netdev->dev_addr[4]) << 8;
+	macaddr_low  += (u32)netdev->dev_addr[5];
+
+	/* Program the various MAC regs */
+	for (i = 0; i < NES_MAX_PORT_COUNT; i++) {
+		if (nesvnic->qp_nic_index[i] == 0xf) {
+			break;
+		}
+		nes_debug(NES_DBG_NETDEV, "i=%d, perfect filter table index= %d, PERF FILTER LOW"
+				" (Addr:%08X) = %08X, HIGH = %08X.\n",
+				i, nesvnic->qp_nic_index[i],
+				NES_IDX_PERFECT_FILTER_LOW+
+					(nesvnic->qp_nic_index[i] * 8),
+				macaddr_low,
+				(u32)macaddr_high | NES_MAC_ADDR_VALID |
+				((((u32)nesvnic->nic_index) << 16)));
+		nes_write_indexed(nesdev,
+				NES_IDX_PERFECT_FILTER_LOW + (nesvnic->qp_nic_index[i] * 8),
+				macaddr_low);
+		nes_write_indexed(nesdev,
+				NES_IDX_PERFECT_FILTER_HIGH + (nesvnic->qp_nic_index[i] * 8),
+				(u32)macaddr_high | NES_MAC_ADDR_VALID |
+				((((u32)nesvnic->nic_index) << 16)));
+	}
+
+
+	nes_write32(nesdev->regs+NES_CQE_ALLOC, NES_CQE_ALLOC_NOTIFY_NEXT |
+			nesvnic->nic_cq.cq_number);
+	nes_read32(nesdev->regs+NES_CQE_ALLOC);
+	list_for_each_safe(list_pos, list_temp, &nesdev->nesadapter->nesvnic_list[nesdev->mac_index]) {
+		first_nesvnic = container_of(list_pos, struct nes_vnic, list);
+		if (first_nesvnic->netdev_open == 1)
+			break;
+	}
+	if (first_nesvnic->netdev_open == 0) {
+		nes_debug(NES_DBG_INIT, "Setting up MAC interrupt mask.\n");
+		nes_write_indexed(nesdev, NES_IDX_MAC_INT_MASK + (0x200 * nesdev->mac_index),
+				~(NES_MAC_INT_LINK_STAT_CHG | NES_MAC_INT_XGMII_EXT |
+				NES_MAC_INT_TX_UNDERFLOW | NES_MAC_INT_TX_ERROR));
+		first_nesvnic = nesvnic;
+	}
+
+	if (first_nesvnic->linkup) {
+		/* Enable network packets */
+		nesvnic->linkup = 1;
+		netif_start_queue(netdev);
+		netif_carrier_on(netdev);
+	}
+
+	spin_lock_irqsave(&nesdev->nesadapter->phy_lock, flags);
+	if (nesdev->nesadapter->phy_type[nesdev->mac_index] == NES_PHY_TYPE_SFP_D) {
+		nesdev->link_recheck = 1;
+		mod_delayed_work(system_wq, &nesdev->work,
+				 NES_LINK_RECHECK_DELAY);
+	}
+	spin_unlock_irqrestore(&nesdev->nesadapter->phy_lock, flags);
+
+	spin_lock_irqsave(&nesvnic->port_ibevent_lock, flags);
+	if (nesvnic->of_device_registered) {
+		nesdev->nesadapter->send_term_ok = 1;
+		if (nesvnic->linkup == 1) {
+			if (nesdev->iw_status == 0) {
+				nesdev->iw_status = 1;
+				nes_port_ibevent(nesvnic);
+			}
+		} else {
+			nesdev->iw_status = 0;
+		}
+	}
+	spin_unlock_irqrestore(&nesvnic->port_ibevent_lock, flags);
+
+	napi_enable(&nesvnic->napi);
+	nesvnic->netdev_open = 1;
+
+	return 0;
+}
+
+
+/**
+ * nes_netdev_stop
+ */
+static int nes_netdev_stop(struct net_device *netdev)
+{
+	struct nes_vnic *nesvnic = netdev_priv(netdev);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	u32 nic_active_mask;
+	u32 nic_active;
+	struct nes_vnic *first_nesvnic = NULL;
+	struct list_head *list_pos, *list_temp;
+	unsigned long flags;
+
+	nes_debug(NES_DBG_SHUTDOWN, "nesvnic=%p, nesdev=%p, netdev=%p %s\n",
+			nesvnic, nesdev, netdev, netdev->name);
+	if (nesvnic->netdev_open == 0)
+		return 0;
+
+	if (netif_msg_ifdown(nesvnic))
+		printk(KERN_INFO PFX "%s: disabling interface\n", netdev->name);
+	netif_carrier_off(netdev);
+
+	/* Disable network packets */
+	napi_disable(&nesvnic->napi);
+	netif_stop_queue(netdev);
+	list_for_each_safe(list_pos, list_temp, &nesdev->nesadapter->nesvnic_list[nesdev->mac_index]) {
+		first_nesvnic = container_of(list_pos, struct nes_vnic, list);
+		if ((first_nesvnic->netdev_open == 1) && (first_nesvnic != nesvnic))
+			break;
+	}
+
+	if ((first_nesvnic->netdev_open == 1) && (first_nesvnic != nesvnic)  &&
+		(PCI_FUNC(first_nesvnic->nesdev->pcidev->devfn) !=
+		PCI_FUNC(nesvnic->nesdev->pcidev->devfn))) {
+			nes_write_indexed(nesdev, NES_IDX_MAC_INT_MASK+
+				(0x200*nesdev->mac_index), 0xffffffff);
+			nes_write_indexed(first_nesvnic->nesdev,
+				NES_IDX_MAC_INT_MASK+
+				(0x200*first_nesvnic->nesdev->mac_index),
+			~(NES_MAC_INT_LINK_STAT_CHG | NES_MAC_INT_XGMII_EXT |
+			NES_MAC_INT_TX_UNDERFLOW | NES_MAC_INT_TX_ERROR));
+	} else {
+		nes_write_indexed(nesdev, NES_IDX_MAC_INT_MASK+(0x200*nesdev->mac_index), 0xffffffff);
+	}
+
+	nic_active_mask = ~((u32)(1 << nesvnic->nic_index));
+	nes_write_indexed(nesdev, NES_IDX_PERFECT_FILTER_HIGH+
+			(nesvnic->perfect_filter_index*8), 0);
+	nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_ACTIVE);
+	nic_active &= nic_active_mask;
+	nes_write_indexed(nesdev, NES_IDX_NIC_ACTIVE, nic_active);
+	nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL);
+	nic_active &= nic_active_mask;
+	nes_write_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL, nic_active);
+	nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_MULTICAST_ENABLE);
+	nic_active &= nic_active_mask;
+	nes_write_indexed(nesdev, NES_IDX_NIC_MULTICAST_ENABLE, nic_active);
+	nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL);
+	nic_active &= nic_active_mask;
+	nes_write_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL, nic_active);
+	nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_BROADCAST_ON);
+	nic_active &= nic_active_mask;
+	nes_write_indexed(nesdev, NES_IDX_NIC_BROADCAST_ON, nic_active);
+
+	spin_lock_irqsave(&nesvnic->port_ibevent_lock, flags);
+	if (nesvnic->of_device_registered) {
+		nesdev->nesadapter->send_term_ok = 0;
+		nesdev->iw_status = 0;
+		if (nesvnic->linkup == 1)
+			nes_port_ibevent(nesvnic);
+	}
+	del_timer_sync(&nesvnic->event_timer);
+	nesvnic->event_timer.function = NULL;
+	spin_unlock_irqrestore(&nesvnic->port_ibevent_lock, flags);
+
+	nes_destroy_nic_qp(nesvnic);
+
+	nesvnic->netdev_open = 0;
+
+	return 0;
+}
+
+
+/**
+ * nes_nic_send
+ */
+static int nes_nic_send(struct sk_buff *skb, struct net_device *netdev)
+{
+	struct nes_vnic *nesvnic = netdev_priv(netdev);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	struct nes_hw_nic *nesnic = &nesvnic->nic;
+	struct nes_hw_nic_sq_wqe *nic_sqe;
+	struct tcphdr *tcph;
+	__le16 *wqe_fragment_length;
+	u32 wqe_misc;
+	u16 wqe_fragment_index = 1;	/* first fragment (0) is used by copy buffer */
+	u16 skb_fragment_index;
+	dma_addr_t bus_address;
+
+	nic_sqe = &nesnic->sq_vbase[nesnic->sq_head];
+	wqe_fragment_length = (__le16 *)&nic_sqe->wqe_words[NES_NIC_SQ_WQE_LENGTH_0_TAG_IDX];
+
+	/* setup the VLAN tag if present */
+	if (vlan_tx_tag_present(skb)) {
+		nes_debug(NES_DBG_NIC_TX, "%s: VLAN packet to send... VLAN = %08X\n",
+				netdev->name, vlan_tx_tag_get(skb));
+		wqe_misc = NES_NIC_SQ_WQE_TAGVALUE_ENABLE;
+		wqe_fragment_length[0] = (__force __le16) vlan_tx_tag_get(skb);
+	} else
+		wqe_misc = 0;
+
+	/* bump past the vlan tag */
+	wqe_fragment_length++;
+	/*	wqe_fragment_address = (u64 *)&nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_LOW_IDX]; */
+	wqe_misc |= NES_NIC_SQ_WQE_COMPLETION;
+
+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
+		if (skb_is_gso(skb)) {
+			tcph = tcp_hdr(skb);
+			/* nes_debug(NES_DBG_NIC_TX, "%s: TSO request... is_gso = %u seg size = %u\n",
+					netdev->name, skb_is_gso(skb), skb_shinfo(skb)->gso_size); */
+			wqe_misc |= NES_NIC_SQ_WQE_LSO_ENABLE | (u16)skb_shinfo(skb)->gso_size;
+			set_wqe_32bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_LSO_INFO_IDX,
+					((u32)tcph->doff) |
+					(((u32)(((unsigned char *)tcph) - skb->data)) << 4));
+		}
+	} else {	/* CHECKSUM_HW */
+		wqe_misc |= NES_NIC_SQ_WQE_DISABLE_CHKSUM;
+	}
+
+	set_wqe_32bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_TOTAL_LENGTH_IDX,
+				skb->len);
+	memcpy(&nesnic->first_frag_vbase[nesnic->sq_head].buffer,
+			skb->data, min(((unsigned int)NES_FIRST_FRAG_SIZE), skb_headlen(skb)));
+	wqe_fragment_length[0] = cpu_to_le16(min(((unsigned int)NES_FIRST_FRAG_SIZE),
+			skb_headlen(skb)));
+	wqe_fragment_length[1] = 0;
+	if (skb_headlen(skb) > NES_FIRST_FRAG_SIZE) {
+		if ((skb_shinfo(skb)->nr_frags + 1) > 4) {
+			nes_debug(NES_DBG_NIC_TX, "%s: Packet with %u fragments not sent, skb_headlen=%u\n",
+					netdev->name, skb_shinfo(skb)->nr_frags + 2, skb_headlen(skb));
+			kfree_skb(skb);
+			nesvnic->tx_sw_dropped++;
+			return NETDEV_TX_LOCKED;
+		}
+		set_bit(nesnic->sq_head, nesnic->first_frag_overflow);
+		bus_address = pci_map_single(nesdev->pcidev, skb->data + NES_FIRST_FRAG_SIZE,
+				skb_headlen(skb) - NES_FIRST_FRAG_SIZE, PCI_DMA_TODEVICE);
+		wqe_fragment_length[wqe_fragment_index++] =
+				cpu_to_le16(skb_headlen(skb) - NES_FIRST_FRAG_SIZE);
+		wqe_fragment_length[wqe_fragment_index] = 0;
+		set_wqe_64bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_FRAG1_LOW_IDX,
+				((u64)(bus_address)));
+		nesnic->tx_skb[nesnic->sq_head] = skb;
+	}
+
+	if (skb_headlen(skb) == skb->len) {
+		if (skb_headlen(skb) <= NES_FIRST_FRAG_SIZE) {
+			nic_sqe->wqe_words[NES_NIC_SQ_WQE_LENGTH_2_1_IDX] = 0;
+			nesnic->tx_skb[nesnic->sq_head] = skb;
+		}
+	} else {
+		/* Deal with Fragments */
+		nesnic->tx_skb[nesnic->sq_head] = skb;
+		for (skb_fragment_index = 0; skb_fragment_index < skb_shinfo(skb)->nr_frags;
+				skb_fragment_index++) {
+			skb_frag_t *frag =
+				&skb_shinfo(skb)->frags[skb_fragment_index];
+			bus_address = skb_frag_dma_map(&nesdev->pcidev->dev,
+						       frag, 0, skb_frag_size(frag),
+						       DMA_TO_DEVICE);
+			wqe_fragment_length[wqe_fragment_index] =
+					cpu_to_le16(skb_frag_size(&skb_shinfo(skb)->frags[skb_fragment_index]));
+			set_wqe_64bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_FRAG0_LOW_IDX+(2*wqe_fragment_index),
+				bus_address);
+			wqe_fragment_index++;
+			if (wqe_fragment_index < 5)
+				wqe_fragment_length[wqe_fragment_index] = 0;
+		}
+	}
+
+	set_wqe_32bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_MISC_IDX, wqe_misc);
+	nesnic->sq_head++;
+	nesnic->sq_head &= nesnic->sq_size - 1;
+
+	return NETDEV_TX_OK;
+}
+
+
+/**
+ * nes_netdev_start_xmit
+ */
+static int nes_netdev_start_xmit(struct sk_buff *skb, struct net_device *netdev)
+{
+	struct nes_vnic *nesvnic = netdev_priv(netdev);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	struct nes_hw_nic *nesnic = &nesvnic->nic;
+	struct nes_hw_nic_sq_wqe *nic_sqe;
+	struct tcphdr *tcph;
+	/* struct udphdr *udph; */
+#define NES_MAX_TSO_FRAGS MAX_SKB_FRAGS
+	/* 64K segment plus overflow on each side */
+	dma_addr_t tso_bus_address[NES_MAX_TSO_FRAGS];
+	dma_addr_t bus_address;
+	u32 tso_frag_index;
+	u32 tso_frag_count;
+	u32 tso_wqe_length;
+	u32 curr_tcp_seq;
+	u32 wqe_count=1;
+	u32 send_rc;
+	struct iphdr *iph;
+	__le16 *wqe_fragment_length;
+	u32 nr_frags;
+	u32 original_first_length;
+	/* u64 *wqe_fragment_address; */
+	/* first fragment (0) is used by copy buffer */
+	u16 wqe_fragment_index=1;
+	u16 hoffset;
+	u16 nhoffset;
+	u16 wqes_needed;
+	u16 wqes_available;
+	u32 wqe_misc;
+
+	/*
+	 * nes_debug(NES_DBG_NIC_TX, "%s Request to tx NIC packet length %u, headlen %u,"
+	 *		" (%u frags), tso_size=%u\n",
+	 *		netdev->name, skb->len, skb_headlen(skb),
+	 *		skb_shinfo(skb)->nr_frags, skb_is_gso(skb));
+	 */
+
+	if (!netif_carrier_ok(netdev))
+		return NETDEV_TX_OK;
+
+	if (netif_queue_stopped(netdev))
+		return NETDEV_TX_BUSY;
+
+	/* Check if SQ is full */
+	if ((((nesnic->sq_tail+(nesnic->sq_size*2))-nesnic->sq_head) & (nesnic->sq_size - 1)) == 1) {
+		if (!netif_queue_stopped(netdev)) {
+			netif_stop_queue(netdev);
+			barrier();
+			if ((((((volatile u16)nesnic->sq_tail)+(nesnic->sq_size*2))-nesnic->sq_head) & (nesnic->sq_size - 1)) != 1) {
+				netif_start_queue(netdev);
+				goto sq_no_longer_full;
+			}
+		}
+		nesvnic->sq_full++;
+		return NETDEV_TX_BUSY;
+	}
+
+sq_no_longer_full:
+	nr_frags = skb_shinfo(skb)->nr_frags;
+	if (skb_headlen(skb) > NES_FIRST_FRAG_SIZE) {
+		nr_frags++;
+	}
+	/* Check if too many fragments */
+	if (unlikely((nr_frags > 4))) {
+		if (skb_is_gso(skb)) {
+			nesvnic->segmented_tso_requests++;
+			nesvnic->tso_requests++;
+			/* Basically 4 fragments available per WQE with extended fragments */
+			wqes_needed = nr_frags >> 2;
+			wqes_needed += (nr_frags&3)?1:0;
+			wqes_available = (((nesnic->sq_tail+nesnic->sq_size)-nesnic->sq_head) - 1) &
+					(nesnic->sq_size - 1);
+
+			if (unlikely(wqes_needed > wqes_available)) {
+				if (!netif_queue_stopped(netdev)) {
+					netif_stop_queue(netdev);
+					barrier();
+					wqes_available = (((((volatile u16)nesnic->sq_tail)+nesnic->sq_size)-nesnic->sq_head) - 1) &
+						(nesnic->sq_size - 1);
+					if (wqes_needed <= wqes_available) {
+						netif_start_queue(netdev);
+						goto tso_sq_no_longer_full;
+					}
+				}
+				nesvnic->sq_full++;
+				nes_debug(NES_DBG_NIC_TX, "%s: HNIC SQ full- TSO request has too many frags!\n",
+						netdev->name);
+				return NETDEV_TX_BUSY;
+			}
+tso_sq_no_longer_full:
+			/* Map all the buffers */
+			for (tso_frag_count=0; tso_frag_count < skb_shinfo(skb)->nr_frags;
+					tso_frag_count++) {
+				skb_frag_t *frag =
+					&skb_shinfo(skb)->frags[tso_frag_count];
+				tso_bus_address[tso_frag_count] =
+					skb_frag_dma_map(&nesdev->pcidev->dev,
+							 frag, 0, skb_frag_size(frag),
+							 DMA_TO_DEVICE);
+			}
+
+			tso_frag_index = 0;
+			curr_tcp_seq = ntohl(tcp_hdr(skb)->seq);
+			hoffset = skb_transport_header(skb) - skb->data;
+			nhoffset = skb_network_header(skb) - skb->data;
+			original_first_length = hoffset + ((((struct tcphdr *)skb_transport_header(skb))->doff)<<2);
+
+			for (wqe_count=0; wqe_count<((u32)wqes_needed); wqe_count++) {
+				tso_wqe_length = 0;
+				nic_sqe = &nesnic->sq_vbase[nesnic->sq_head];
+				wqe_fragment_length =
+						(__le16 *)&nic_sqe->wqe_words[NES_NIC_SQ_WQE_LENGTH_0_TAG_IDX];
+				/* setup the VLAN tag if present */
+				if (vlan_tx_tag_present(skb)) {
+					nes_debug(NES_DBG_NIC_TX, "%s: VLAN packet to send... VLAN = %08X\n",
+							netdev->name, vlan_tx_tag_get(skb) );
+					wqe_misc = NES_NIC_SQ_WQE_TAGVALUE_ENABLE;
+					wqe_fragment_length[0] = (__force __le16) vlan_tx_tag_get(skb);
+				} else
+					wqe_misc = 0;
+
+				/* bump past the vlan tag */
+				wqe_fragment_length++;
+
+				/* Assumes header totally fits in allocated buffer and is in first fragment */
+				if (original_first_length > NES_FIRST_FRAG_SIZE) {
+					nes_debug(NES_DBG_NIC_TX, "ERROR: SKB header too big, headlen=%u, FIRST_FRAG_SIZE=%u\n",
+							original_first_length, NES_FIRST_FRAG_SIZE);
+					nes_debug(NES_DBG_NIC_TX, "%s Request to tx NIC packet length %u, headlen %u,"
+							" (%u frags), is_gso = %u tso_size=%u\n",
+							netdev->name,
+							skb->len, skb_headlen(skb),
+							skb_shinfo(skb)->nr_frags, skb_is_gso(skb), skb_shinfo(skb)->gso_size);
+				}
+				memcpy(&nesnic->first_frag_vbase[nesnic->sq_head].buffer,
+						skb->data, min(((unsigned int)NES_FIRST_FRAG_SIZE),
+						original_first_length));
+				iph = (struct iphdr *)
+				(&nesnic->first_frag_vbase[nesnic->sq_head].buffer[nhoffset]);
+				tcph = (struct tcphdr *)
+				(&nesnic->first_frag_vbase[nesnic->sq_head].buffer[hoffset]);
+				if ((wqe_count+1)!=(u32)wqes_needed) {
+					tcph->fin = 0;
+					tcph->psh = 0;
+					tcph->rst = 0;
+					tcph->urg = 0;
+				}
+				if (wqe_count) {
+					tcph->syn = 0;
+				}
+				tcph->seq = htonl(curr_tcp_seq);
+				wqe_fragment_length[0] = cpu_to_le16(min(((unsigned int)NES_FIRST_FRAG_SIZE),
+						original_first_length));
+
+				wqe_fragment_index = 1;
+				if ((wqe_count==0) && (skb_headlen(skb) > original_first_length)) {
+					set_bit(nesnic->sq_head, nesnic->first_frag_overflow);
+					bus_address = pci_map_single(nesdev->pcidev, skb->data + original_first_length,
+							skb_headlen(skb) - original_first_length, PCI_DMA_TODEVICE);
+					wqe_fragment_length[wqe_fragment_index++] =
+						cpu_to_le16(skb_headlen(skb) - original_first_length);
+					wqe_fragment_length[wqe_fragment_index] = 0;
+					set_wqe_64bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_FRAG1_LOW_IDX,
+									bus_address);
+					tso_wqe_length += skb_headlen(skb) -
+							original_first_length;
+				}
+				while (wqe_fragment_index < 5) {
+					wqe_fragment_length[wqe_fragment_index] =
+							cpu_to_le16(skb_frag_size(&skb_shinfo(skb)->frags[tso_frag_index]));
+					set_wqe_64bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_FRAG0_LOW_IDX+(2*wqe_fragment_index),
+						(u64)tso_bus_address[tso_frag_index]);
+					wqe_fragment_index++;
+					tso_wqe_length += skb_frag_size(&skb_shinfo(skb)->frags[tso_frag_index++]);
+					if (wqe_fragment_index < 5)
+						wqe_fragment_length[wqe_fragment_index] = 0;
+					if (tso_frag_index == tso_frag_count)
+						break;
+				}
+				if ((wqe_count+1) == (u32)wqes_needed) {
+					nesnic->tx_skb[nesnic->sq_head] = skb;
+				} else {
+					nesnic->tx_skb[nesnic->sq_head] = NULL;
+				}
+				wqe_misc |= NES_NIC_SQ_WQE_COMPLETION | (u16)skb_shinfo(skb)->gso_size;
+				if ((tso_wqe_length + original_first_length) > skb_shinfo(skb)->gso_size) {
+					wqe_misc |= NES_NIC_SQ_WQE_LSO_ENABLE;
+				} else {
+					iph->tot_len = htons(tso_wqe_length + original_first_length - nhoffset);
+				}
+
+				set_wqe_32bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_MISC_IDX,
+						 wqe_misc);
+				set_wqe_32bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_LSO_INFO_IDX,
+						((u32)tcph->doff) | (((u32)hoffset) << 4));
+
+				set_wqe_32bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_TOTAL_LENGTH_IDX,
+						tso_wqe_length + original_first_length);
+				curr_tcp_seq += tso_wqe_length;
+				nesnic->sq_head++;
+				nesnic->sq_head &= nesnic->sq_size-1;
+			}
+		} else {
+			nesvnic->linearized_skbs++;
+			hoffset = skb_transport_header(skb) - skb->data;
+			nhoffset = skb_network_header(skb) - skb->data;
+			skb_linearize(skb);
+			skb_set_transport_header(skb, hoffset);
+			skb_set_network_header(skb, nhoffset);
+			send_rc = nes_nic_send(skb, netdev);
+			if (send_rc != NETDEV_TX_OK)
+				return NETDEV_TX_OK;
+		}
+	} else {
+		send_rc = nes_nic_send(skb, netdev);
+		if (send_rc != NETDEV_TX_OK)
+			return NETDEV_TX_OK;
+	}
+
+	barrier();
+
+	if (wqe_count)
+		nes_write32(nesdev->regs+NES_WQE_ALLOC,
+				(wqe_count << 24) | (1 << 23) | nesvnic->nic.qp_id);
+
+	netdev->trans_start = jiffies;
+
+	return NETDEV_TX_OK;
+}
+
+
+/**
+ * nes_netdev_get_stats
+ */
+static struct net_device_stats *nes_netdev_get_stats(struct net_device *netdev)
+{
+	struct nes_vnic *nesvnic = netdev_priv(netdev);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	u64 u64temp;
+	u32 u32temp;
+
+	u32temp = nes_read_indexed(nesdev,
+			NES_IDX_ENDNODE0_NSTAT_RX_DISCARD + (nesvnic->nic_index*0x200));
+	nesvnic->netstats.rx_dropped += u32temp;
+	nesvnic->endnode_nstat_rx_discard += u32temp;
+
+	u64temp = (u64)nes_read_indexed(nesdev,
+			NES_IDX_ENDNODE0_NSTAT_RX_OCTETS_LO + (nesvnic->nic_index*0x200));
+	u64temp += ((u64)nes_read_indexed(nesdev,
+			NES_IDX_ENDNODE0_NSTAT_RX_OCTETS_HI + (nesvnic->nic_index*0x200))) << 32;
+
+	nesvnic->endnode_nstat_rx_octets += u64temp;
+	nesvnic->netstats.rx_bytes += u64temp;
+
+	u64temp = (u64)nes_read_indexed(nesdev,
+			NES_IDX_ENDNODE0_NSTAT_RX_FRAMES_LO + (nesvnic->nic_index*0x200));
+	u64temp += ((u64)nes_read_indexed(nesdev,
+			NES_IDX_ENDNODE0_NSTAT_RX_FRAMES_HI + (nesvnic->nic_index*0x200))) << 32;
+
+	nesvnic->endnode_nstat_rx_frames += u64temp;
+	nesvnic->netstats.rx_packets += u64temp;
+
+	u64temp = (u64)nes_read_indexed(nesdev,
+			NES_IDX_ENDNODE0_NSTAT_TX_OCTETS_LO + (nesvnic->nic_index*0x200));
+	u64temp += ((u64)nes_read_indexed(nesdev,
+			NES_IDX_ENDNODE0_NSTAT_TX_OCTETS_HI + (nesvnic->nic_index*0x200))) << 32;
+
+	nesvnic->endnode_nstat_tx_octets += u64temp;
+	nesvnic->netstats.tx_bytes += u64temp;
+
+	u64temp = (u64)nes_read_indexed(nesdev,
+			NES_IDX_ENDNODE0_NSTAT_TX_FRAMES_LO + (nesvnic->nic_index*0x200));
+	u64temp += ((u64)nes_read_indexed(nesdev,
+			NES_IDX_ENDNODE0_NSTAT_TX_FRAMES_HI + (nesvnic->nic_index*0x200))) << 32;
+
+	nesvnic->endnode_nstat_tx_frames += u64temp;
+	nesvnic->netstats.tx_packets += u64temp;
+
+	u32temp = nes_read_indexed(nesdev,
+			NES_IDX_MAC_RX_SHORT_FRAMES + (nesvnic->nesdev->mac_index*0x200));
+	nesvnic->netstats.rx_dropped += u32temp;
+	nesvnic->nesdev->mac_rx_errors += u32temp;
+	nesvnic->nesdev->mac_rx_short_frames += u32temp;
+
+	u32temp = nes_read_indexed(nesdev,
+			NES_IDX_MAC_RX_OVERSIZED_FRAMES + (nesvnic->nesdev->mac_index*0x200));
+	nesvnic->netstats.rx_dropped += u32temp;
+	nesvnic->nesdev->mac_rx_errors += u32temp;
+	nesvnic->nesdev->mac_rx_oversized_frames += u32temp;
+
+	u32temp = nes_read_indexed(nesdev,
+			NES_IDX_MAC_RX_JABBER_FRAMES + (nesvnic->nesdev->mac_index*0x200));
+	nesvnic->netstats.rx_dropped += u32temp;
+	nesvnic->nesdev->mac_rx_errors += u32temp;
+	nesvnic->nesdev->mac_rx_jabber_frames += u32temp;
+
+	u32temp = nes_read_indexed(nesdev,
+			NES_IDX_MAC_RX_SYMBOL_ERR_FRAMES + (nesvnic->nesdev->mac_index*0x200));
+	nesvnic->netstats.rx_dropped += u32temp;
+	nesvnic->nesdev->mac_rx_errors += u32temp;
+	nesvnic->nesdev->mac_rx_symbol_err_frames += u32temp;
+
+	u32temp = nes_read_indexed(nesdev,
+			NES_IDX_MAC_RX_LENGTH_ERR_FRAMES + (nesvnic->nesdev->mac_index*0x200));
+	nesvnic->netstats.rx_length_errors += u32temp;
+	nesvnic->nesdev->mac_rx_errors += u32temp;
+
+	u32temp = nes_read_indexed(nesdev,
+			NES_IDX_MAC_RX_CRC_ERR_FRAMES + (nesvnic->nesdev->mac_index*0x200));
+	nesvnic->nesdev->mac_rx_errors += u32temp;
+	nesvnic->nesdev->mac_rx_crc_errors += u32temp;
+	nesvnic->netstats.rx_crc_errors += u32temp;
+
+	u32temp = nes_read_indexed(nesdev,
+			NES_IDX_MAC_TX_ERRORS + (nesvnic->nesdev->mac_index*0x200));
+	nesvnic->nesdev->mac_tx_errors += u32temp;
+	nesvnic->netstats.tx_errors += u32temp;
+
+	return &nesvnic->netstats;
+}
+
+
+/**
+ * nes_netdev_tx_timeout
+ */
+static void nes_netdev_tx_timeout(struct net_device *netdev)
+{
+	struct nes_vnic *nesvnic = netdev_priv(netdev);
+
+	if (netif_msg_timer(nesvnic))
+		nes_debug(NES_DBG_NIC_TX, "%s: tx timeout\n", netdev->name);
+}
+
+
+/**
+ * nes_netdev_set_mac_address
+ */
+static int nes_netdev_set_mac_address(struct net_device *netdev, void *p)
+{
+	struct nes_vnic *nesvnic = netdev_priv(netdev);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	struct sockaddr *mac_addr = p;
+	int i;
+	u32 macaddr_low;
+	u16 macaddr_high;
+
+	if (!is_valid_ether_addr(mac_addr->sa_data))
+		return -EADDRNOTAVAIL;
+
+	memcpy(netdev->dev_addr, mac_addr->sa_data, netdev->addr_len);
+	printk(PFX "%s: Address length = %d, Address = %pM\n",
+	       __func__, netdev->addr_len, mac_addr->sa_data);
+	macaddr_high  = ((u16)netdev->dev_addr[0]) << 8;
+	macaddr_high += (u16)netdev->dev_addr[1];
+	macaddr_low   = ((u32)netdev->dev_addr[2]) << 24;
+	macaddr_low  += ((u32)netdev->dev_addr[3]) << 16;
+	macaddr_low  += ((u32)netdev->dev_addr[4]) << 8;
+	macaddr_low  += (u32)netdev->dev_addr[5];
+
+	for (i = 0; i < NES_MAX_PORT_COUNT; i++) {
+		if (nesvnic->qp_nic_index[i] == 0xf) {
+			break;
+		}
+		nes_write_indexed(nesdev,
+				NES_IDX_PERFECT_FILTER_LOW + (nesvnic->qp_nic_index[i] * 8),
+				macaddr_low);
+		nes_write_indexed(nesdev,
+				NES_IDX_PERFECT_FILTER_HIGH + (nesvnic->qp_nic_index[i] * 8),
+				(u32)macaddr_high | NES_MAC_ADDR_VALID |
+				((((u32)nesvnic->nic_index) << 16)));
+	}
+	return 0;
+}
+
+
+static void set_allmulti(struct nes_device *nesdev, u32 nic_active_bit)
+{
+	u32 nic_active;
+
+	nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL);
+	nic_active |= nic_active_bit;
+	nes_write_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL, nic_active);
+	nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL);
+	nic_active &= ~nic_active_bit;
+	nes_write_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL, nic_active);
+}
+
+#define get_addr(addrs, index) ((addrs) + (index) * ETH_ALEN)
+
+/**
+ * nes_netdev_set_multicast_list
+ */
+static void nes_netdev_set_multicast_list(struct net_device *netdev)
+{
+	struct nes_vnic *nesvnic = netdev_priv(netdev);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	struct nes_adapter *nesadapter = nesvnic->nesdev->nesadapter;
+	u32 nic_active_bit;
+	u32 nic_active;
+	u32 perfect_filter_register_address;
+	u32 macaddr_low;
+	u16 macaddr_high;
+	u8 mc_all_on = 0;
+	u8 mc_index;
+	int mc_nic_index = -1;
+	u8 pft_entries_preallocated = max(nesadapter->adapter_fcn_count *
+					nics_per_function, 4);
+	u8 max_pft_entries_avaiable = NES_PFT_SIZE - pft_entries_preallocated;
+	unsigned long flags;
+	int mc_count = netdev_mc_count(netdev);
+
+	spin_lock_irqsave(&nesadapter->resource_lock, flags);
+	nic_active_bit = 1 << nesvnic->nic_index;
+
+	if (netdev->flags & IFF_PROMISC) {
+		nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL);
+		nic_active |= nic_active_bit;
+		nes_write_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL, nic_active);
+		nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL);
+		nic_active |= nic_active_bit;
+		nes_write_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL, nic_active);
+		mc_all_on = 1;
+	} else if ((netdev->flags & IFF_ALLMULTI) ||
+			   (nesvnic->nic_index > 3)) {
+		set_allmulti(nesdev, nic_active_bit);
+		mc_all_on = 1;
+	} else {
+		nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL);
+		nic_active &= ~nic_active_bit;
+		nes_write_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL, nic_active);
+		nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL);
+		nic_active &= ~nic_active_bit;
+		nes_write_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL, nic_active);
+	}
+
+	nes_debug(NES_DBG_NIC_RX, "Number of MC entries = %d, Promiscuous = %d, All Multicast = %d.\n",
+		  mc_count, !!(netdev->flags & IFF_PROMISC),
+		  !!(netdev->flags & IFF_ALLMULTI));
+	if (!mc_all_on) {
+		char *addrs;
+		int i;
+		struct netdev_hw_addr *ha;
+
+		addrs = kmalloc(ETH_ALEN * mc_count, GFP_ATOMIC);
+		if (!addrs) {
+			set_allmulti(nesdev, nic_active_bit);
+			goto unlock;
+		}
+		i = 0;
+		netdev_for_each_mc_addr(ha, netdev)
+			memcpy(get_addr(addrs, i++), ha->addr, ETH_ALEN);
+
+		perfect_filter_register_address = NES_IDX_PERFECT_FILTER_LOW +
+						pft_entries_preallocated * 0x8;
+		for (i = 0, mc_index = 0; mc_index < max_pft_entries_avaiable;
+		     mc_index++) {
+			while (i < mc_count && nesvnic->mcrq_mcast_filter &&
+			((mc_nic_index = nesvnic->mcrq_mcast_filter(nesvnic,
+					get_addr(addrs, i++))) == 0));
+			if (mc_nic_index < 0)
+				mc_nic_index = nesvnic->nic_index;
+			while (nesadapter->pft_mcast_map[mc_index] < 16 &&
+				nesadapter->pft_mcast_map[mc_index] !=
+					nesvnic->nic_index &&
+					mc_index < max_pft_entries_avaiable) {
+						nes_debug(NES_DBG_NIC_RX,
+					"mc_index=%d skipping nic_index=%d, "
+					"used for=%d \n", mc_index,
+					nesvnic->nic_index,
+					nesadapter->pft_mcast_map[mc_index]);
+				mc_index++;
+			}
+			if (mc_index >= max_pft_entries_avaiable)
+				break;
+			if (i < mc_count) {
+				char *addr = get_addr(addrs, i++);
+
+				nes_debug(NES_DBG_NIC_RX, "Assigning MC Address %pM to register 0x%04X nic_idx=%d\n",
+					  addr,
+					  perfect_filter_register_address+(mc_index * 8),
+					  mc_nic_index);
+				macaddr_high  = ((u8) addr[0]) << 8;
+				macaddr_high += (u8) addr[1];
+				macaddr_low   = ((u8) addr[2]) << 24;
+				macaddr_low  += ((u8) addr[3]) << 16;
+				macaddr_low  += ((u8) addr[4]) << 8;
+				macaddr_low  += (u8) addr[5];
+
+				nes_write_indexed(nesdev,
+						perfect_filter_register_address+(mc_index * 8),
+						macaddr_low);
+				nes_write_indexed(nesdev,
+						perfect_filter_register_address+4+(mc_index * 8),
+						(u32)macaddr_high | NES_MAC_ADDR_VALID |
+						((((u32)(1<<mc_nic_index)) << 16)));
+				nesadapter->pft_mcast_map[mc_index] =
+							nesvnic->nic_index;
+			} else {
+				nes_debug(NES_DBG_NIC_RX, "Clearing MC Address at register 0x%04X\n",
+						  perfect_filter_register_address+(mc_index * 8));
+				nes_write_indexed(nesdev,
+						perfect_filter_register_address+4+(mc_index * 8),
+						0);
+				nesadapter->pft_mcast_map[mc_index] = 255;
+			}
+		}
+		kfree(addrs);
+		/* PFT is not large enough */
+		if (i < mc_count)
+			set_allmulti(nesdev, nic_active_bit);
+	}
+
+unlock:
+	spin_unlock_irqrestore(&nesadapter->resource_lock, flags);
+}
+
+
+/**
+ * nes_netdev_change_mtu
+ */
+static int nes_netdev_change_mtu(struct net_device *netdev, int new_mtu)
+{
+	struct nes_vnic	*nesvnic = netdev_priv(netdev);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	int ret = 0;
+	u8 jumbomode = 0;
+	u32 nic_active;
+	u32 nic_active_bit;
+	u32 uc_all_active;
+	u32 mc_all_active;
+
+	if ((new_mtu < ETH_ZLEN) || (new_mtu > max_mtu))
+		return -EINVAL;
+
+	netdev->mtu = new_mtu;
+	nesvnic->max_frame_size	= new_mtu + VLAN_ETH_HLEN;
+
+	if (netdev->mtu	> 1500)	{
+		jumbomode=1;
+	}
+	nes_nic_init_timer_defaults(nesdev, jumbomode);
+
+	if (netif_running(netdev)) {
+		nic_active_bit = 1 << nesvnic->nic_index;
+		mc_all_active = nes_read_indexed(nesdev,
+				NES_IDX_NIC_MULTICAST_ALL) & nic_active_bit;
+		uc_all_active = nes_read_indexed(nesdev,
+				NES_IDX_NIC_UNICAST_ALL)  & nic_active_bit;
+
+		nes_netdev_stop(netdev);
+		nes_netdev_open(netdev);
+
+		nic_active = nes_read_indexed(nesdev,
+					NES_IDX_NIC_MULTICAST_ALL);
+		nic_active |= mc_all_active;
+		nes_write_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL,
+							nic_active);
+
+		nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL);
+		nic_active |= uc_all_active;
+		nes_write_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL, nic_active);
+	}
+
+	return ret;
+}
+
+
+static const char nes_ethtool_stringset[][ETH_GSTRING_LEN] = {
+	"Link Change Interrupts",
+	"Linearized SKBs",
+	"T/GSO Requests",
+	"Pause Frames Sent",
+	"Pause Frames Received",
+	"Internal Routing Errors",
+	"SQ SW Dropped SKBs",
+	"SQ Full",
+	"Segmented TSO Requests",
+	"Rx Symbol Errors",
+	"Rx Jabber Errors",
+	"Rx Oversized Frames",
+	"Rx Short Frames",
+	"Rx Length Errors",
+	"Rx CRC Errors",
+	"Rx Port Discard",
+	"Endnode Rx Discards",
+	"Endnode Rx Octets",
+	"Endnode Rx Frames",
+	"Endnode Tx Octets",
+	"Endnode Tx Frames",
+	"Tx Errors",
+	"mh detected",
+	"mh pauses",
+	"Retransmission Count",
+	"CM Connects",
+	"CM Accepts",
+	"Disconnects",
+	"Connected Events",
+	"Connect Requests",
+	"CM Rejects",
+	"ModifyQP Timeouts",
+	"CreateQPs",
+	"SW DestroyQPs",
+	"DestroyQPs",
+	"CM Closes",
+	"CM Packets Sent",
+	"CM Packets Bounced",
+	"CM Packets Created",
+	"CM Packets Rcvd",
+	"CM Packets Dropped",
+	"CM Packets Retrans",
+	"CM Listens Created",
+	"CM Listens Destroyed",
+	"CM Backlog Drops",
+	"CM Loopbacks",
+	"CM Nodes Created",
+	"CM Nodes Destroyed",
+	"CM Accel Drops",
+	"CM Resets Received",
+	"Free 4Kpbls",
+	"Free 256pbls",
+	"Timer Inits",
+	"LRO aggregated",
+	"LRO flushed",
+	"LRO no_desc",
+	"PAU CreateQPs",
+	"PAU DestroyQPs",
+};
+#define NES_ETHTOOL_STAT_COUNT  ARRAY_SIZE(nes_ethtool_stringset)
+
+
+/**
+ * nes_netdev_get_sset_count
+ */
+static int nes_netdev_get_sset_count(struct net_device *netdev, int stringset)
+{
+	if (stringset == ETH_SS_STATS)
+		return NES_ETHTOOL_STAT_COUNT;
+	else
+		return -EINVAL;
+}
+
+
+/**
+ * nes_netdev_get_strings
+ */
+static void nes_netdev_get_strings(struct net_device *netdev, u32 stringset,
+		u8 *ethtool_strings)
+{
+	if (stringset == ETH_SS_STATS)
+		memcpy(ethtool_strings,
+				&nes_ethtool_stringset,
+				sizeof(nes_ethtool_stringset));
+}
+
+
+/**
+ * nes_netdev_get_ethtool_stats
+ */
+
+static void nes_netdev_get_ethtool_stats(struct net_device *netdev,
+		struct ethtool_stats *target_ethtool_stats, u64 *target_stat_values)
+{
+	u64 u64temp;
+	struct nes_vnic *nesvnic = netdev_priv(netdev);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	u32 nic_count;
+	u32 u32temp;
+	u32 index = 0;
+
+	target_ethtool_stats->n_stats = NES_ETHTOOL_STAT_COUNT;
+	target_stat_values[index] = nesvnic->nesdev->link_status_interrupts;
+	target_stat_values[++index] = nesvnic->linearized_skbs;
+	target_stat_values[++index] = nesvnic->tso_requests;
+
+	u32temp = nes_read_indexed(nesdev,
+			NES_IDX_MAC_TX_PAUSE_FRAMES + (nesvnic->nesdev->mac_index*0x200));
+	nesvnic->nesdev->mac_pause_frames_sent += u32temp;
+	target_stat_values[++index] = nesvnic->nesdev->mac_pause_frames_sent;
+
+	u32temp = nes_read_indexed(nesdev,
+			NES_IDX_MAC_RX_PAUSE_FRAMES + (nesvnic->nesdev->mac_index*0x200));
+	nesvnic->nesdev->mac_pause_frames_received += u32temp;
+
+	u32temp = nes_read_indexed(nesdev,
+			NES_IDX_PORT_RX_DISCARDS + (nesvnic->nesdev->mac_index*0x40));
+	nesvnic->nesdev->port_rx_discards += u32temp;
+	nesvnic->netstats.rx_dropped += u32temp;
+
+	u32temp = nes_read_indexed(nesdev,
+			NES_IDX_PORT_TX_DISCARDS + (nesvnic->nesdev->mac_index*0x40));
+	nesvnic->nesdev->port_tx_discards += u32temp;
+	nesvnic->netstats.tx_dropped += u32temp;
+
+	u32temp = nes_read_indexed(nesdev,
+			NES_IDX_MAC_RX_SHORT_FRAMES + (nesvnic->nesdev->mac_index*0x200));
+	nesvnic->netstats.rx_dropped += u32temp;
+	nesvnic->nesdev->mac_rx_errors += u32temp;
+	nesvnic->nesdev->mac_rx_short_frames += u32temp;
+
+	u32temp = nes_read_indexed(nesdev,
+			NES_IDX_MAC_RX_OVERSIZED_FRAMES + (nesvnic->nesdev->mac_index*0x200));
+	nesvnic->netstats.rx_dropped += u32temp;
+	nesvnic->nesdev->mac_rx_errors += u32temp;
+	nesvnic->nesdev->mac_rx_oversized_frames += u32temp;
+
+	u32temp = nes_read_indexed(nesdev,
+			NES_IDX_MAC_RX_JABBER_FRAMES + (nesvnic->nesdev->mac_index*0x200));
+	nesvnic->netstats.rx_dropped += u32temp;
+	nesvnic->nesdev->mac_rx_errors += u32temp;
+	nesvnic->nesdev->mac_rx_jabber_frames += u32temp;
+
+	u32temp = nes_read_indexed(nesdev,
+			NES_IDX_MAC_RX_SYMBOL_ERR_FRAMES + (nesvnic->nesdev->mac_index*0x200));
+	nesvnic->netstats.rx_dropped += u32temp;
+	nesvnic->nesdev->mac_rx_errors += u32temp;
+	nesvnic->nesdev->mac_rx_symbol_err_frames += u32temp;
+
+	u32temp = nes_read_indexed(nesdev,
+			NES_IDX_MAC_RX_LENGTH_ERR_FRAMES + (nesvnic->nesdev->mac_index*0x200));
+	nesvnic->netstats.rx_length_errors += u32temp;
+	nesvnic->nesdev->mac_rx_errors += u32temp;
+
+	u32temp = nes_read_indexed(nesdev,
+			NES_IDX_MAC_RX_CRC_ERR_FRAMES + (nesvnic->nesdev->mac_index*0x200));
+	nesvnic->nesdev->mac_rx_errors += u32temp;
+	nesvnic->nesdev->mac_rx_crc_errors += u32temp;
+	nesvnic->netstats.rx_crc_errors += u32temp;
+
+	u32temp = nes_read_indexed(nesdev,
+			NES_IDX_MAC_TX_ERRORS + (nesvnic->nesdev->mac_index*0x200));
+	nesvnic->nesdev->mac_tx_errors += u32temp;
+	nesvnic->netstats.tx_errors += u32temp;
+
+	for (nic_count = 0; nic_count < NES_MAX_PORT_COUNT; nic_count++) {
+		if (nesvnic->qp_nic_index[nic_count] == 0xf)
+			break;
+
+		u32temp = nes_read_indexed(nesdev,
+				NES_IDX_ENDNODE0_NSTAT_RX_DISCARD +
+				(nesvnic->qp_nic_index[nic_count]*0x200));
+		nesvnic->netstats.rx_dropped += u32temp;
+		nesvnic->endnode_nstat_rx_discard += u32temp;
+
+		u64temp = (u64)nes_read_indexed(nesdev,
+				NES_IDX_ENDNODE0_NSTAT_RX_OCTETS_LO +
+				(nesvnic->qp_nic_index[nic_count]*0x200));
+		u64temp += ((u64)nes_read_indexed(nesdev,
+				NES_IDX_ENDNODE0_NSTAT_RX_OCTETS_HI +
+				(nesvnic->qp_nic_index[nic_count]*0x200))) << 32;
+
+		nesvnic->endnode_nstat_rx_octets += u64temp;
+		nesvnic->netstats.rx_bytes += u64temp;
+
+		u64temp = (u64)nes_read_indexed(nesdev,
+				NES_IDX_ENDNODE0_NSTAT_RX_FRAMES_LO +
+				(nesvnic->qp_nic_index[nic_count]*0x200));
+		u64temp += ((u64)nes_read_indexed(nesdev,
+				NES_IDX_ENDNODE0_NSTAT_RX_FRAMES_HI +
+				(nesvnic->qp_nic_index[nic_count]*0x200))) << 32;
+
+		nesvnic->endnode_nstat_rx_frames += u64temp;
+		nesvnic->netstats.rx_packets += u64temp;
+
+		u64temp = (u64)nes_read_indexed(nesdev,
+				NES_IDX_ENDNODE0_NSTAT_TX_OCTETS_LO +
+				(nesvnic->qp_nic_index[nic_count]*0x200));
+		u64temp += ((u64)nes_read_indexed(nesdev,
+				NES_IDX_ENDNODE0_NSTAT_TX_OCTETS_HI +
+				(nesvnic->qp_nic_index[nic_count]*0x200))) << 32;
+
+		nesvnic->endnode_nstat_tx_octets += u64temp;
+		nesvnic->netstats.tx_bytes += u64temp;
+
+		u64temp = (u64)nes_read_indexed(nesdev,
+				NES_IDX_ENDNODE0_NSTAT_TX_FRAMES_LO +
+				(nesvnic->qp_nic_index[nic_count]*0x200));
+		u64temp += ((u64)nes_read_indexed(nesdev,
+				NES_IDX_ENDNODE0_NSTAT_TX_FRAMES_HI +
+				(nesvnic->qp_nic_index[nic_count]*0x200))) << 32;
+
+		nesvnic->endnode_nstat_tx_frames += u64temp;
+		nesvnic->netstats.tx_packets += u64temp;
+
+		u32temp = nes_read_indexed(nesdev,
+				NES_IDX_IPV4_TCP_REXMITS + (nesvnic->qp_nic_index[nic_count]*0x200));
+		nesvnic->endnode_ipv4_tcp_retransmits += u32temp;
+	}
+
+	target_stat_values[++index] = nesvnic->nesdev->mac_pause_frames_received;
+	target_stat_values[++index] = nesdev->nesadapter->nic_rx_eth_route_err;
+	target_stat_values[++index] = nesvnic->tx_sw_dropped;
+	target_stat_values[++index] = nesvnic->sq_full;
+	target_stat_values[++index] = nesvnic->segmented_tso_requests;
+	target_stat_values[++index] = nesvnic->nesdev->mac_rx_symbol_err_frames;
+	target_stat_values[++index] = nesvnic->nesdev->mac_rx_jabber_frames;
+	target_stat_values[++index] = nesvnic->nesdev->mac_rx_oversized_frames;
+	target_stat_values[++index] = nesvnic->nesdev->mac_rx_short_frames;
+	target_stat_values[++index] = nesvnic->netstats.rx_length_errors;
+	target_stat_values[++index] = nesvnic->nesdev->mac_rx_crc_errors;
+	target_stat_values[++index] = nesvnic->nesdev->port_rx_discards;
+	target_stat_values[++index] = nesvnic->endnode_nstat_rx_discard;
+	target_stat_values[++index] = nesvnic->endnode_nstat_rx_octets;
+	target_stat_values[++index] = nesvnic->endnode_nstat_rx_frames;
+	target_stat_values[++index] = nesvnic->endnode_nstat_tx_octets;
+	target_stat_values[++index] = nesvnic->endnode_nstat_tx_frames;
+	target_stat_values[++index] = nesvnic->nesdev->mac_tx_errors;
+	target_stat_values[++index] = mh_detected;
+	target_stat_values[++index] = mh_pauses_sent;
+	target_stat_values[++index] = nesvnic->endnode_ipv4_tcp_retransmits;
+	target_stat_values[++index] = atomic_read(&cm_connects);
+	target_stat_values[++index] = atomic_read(&cm_accepts);
+	target_stat_values[++index] = atomic_read(&cm_disconnects);
+	target_stat_values[++index] = atomic_read(&cm_connecteds);
+	target_stat_values[++index] = atomic_read(&cm_connect_reqs);
+	target_stat_values[++index] = atomic_read(&cm_rejects);
+	target_stat_values[++index] = atomic_read(&mod_qp_timouts);
+	target_stat_values[++index] = atomic_read(&qps_created);
+	target_stat_values[++index] = atomic_read(&sw_qps_destroyed);
+	target_stat_values[++index] = atomic_read(&qps_destroyed);
+	target_stat_values[++index] = atomic_read(&cm_closes);
+	target_stat_values[++index] = cm_packets_sent;
+	target_stat_values[++index] = cm_packets_bounced;
+	target_stat_values[++index] = cm_packets_created;
+	target_stat_values[++index] = cm_packets_received;
+	target_stat_values[++index] = cm_packets_dropped;
+	target_stat_values[++index] = cm_packets_retrans;
+	target_stat_values[++index] = atomic_read(&cm_listens_created);
+	target_stat_values[++index] = atomic_read(&cm_listens_destroyed);
+	target_stat_values[++index] = cm_backlog_drops;
+	target_stat_values[++index] = atomic_read(&cm_loopbacks);
+	target_stat_values[++index] = atomic_read(&cm_nodes_created);
+	target_stat_values[++index] = atomic_read(&cm_nodes_destroyed);
+	target_stat_values[++index] = atomic_read(&cm_accel_dropped_pkts);
+	target_stat_values[++index] = atomic_read(&cm_resets_recvd);
+	target_stat_values[++index] = nesadapter->free_4kpbl;
+	target_stat_values[++index] = nesadapter->free_256pbl;
+	target_stat_values[++index] = int_mod_timer_init;
+	target_stat_values[++index] = nesvnic->lro_mgr.stats.aggregated;
+	target_stat_values[++index] = nesvnic->lro_mgr.stats.flushed;
+	target_stat_values[++index] = nesvnic->lro_mgr.stats.no_desc;
+	target_stat_values[++index] = atomic_read(&pau_qps_created);
+	target_stat_values[++index] = atomic_read(&pau_qps_destroyed);
+}
+
+/**
+ * nes_netdev_get_drvinfo
+ */
+static void nes_netdev_get_drvinfo(struct net_device *netdev,
+		struct ethtool_drvinfo *drvinfo)
+{
+	struct nes_vnic *nesvnic = netdev_priv(netdev);
+	struct nes_adapter *nesadapter = nesvnic->nesdev->nesadapter;
+
+	strlcpy(drvinfo->driver, DRV_NAME, sizeof(drvinfo->driver));
+	strlcpy(drvinfo->bus_info, pci_name(nesvnic->nesdev->pcidev),
+		sizeof(drvinfo->bus_info));
+	snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version),
+		 "%u.%u", nesadapter->firmware_version >> 16,
+		 nesadapter->firmware_version & 0x000000ff);
+	strlcpy(drvinfo->version, DRV_VERSION, sizeof(drvinfo->version));
+	drvinfo->testinfo_len = 0;
+	drvinfo->eedump_len = 0;
+	drvinfo->regdump_len = 0;
+}
+
+
+/**
+ * nes_netdev_set_coalesce
+ */
+static int nes_netdev_set_coalesce(struct net_device *netdev,
+		struct ethtool_coalesce	*et_coalesce)
+{
+	struct nes_vnic	*nesvnic = netdev_priv(netdev);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	struct nes_hw_tune_timer *shared_timer = &nesadapter->tune_timer;
+	unsigned long flags;
+
+	spin_lock_irqsave(&nesadapter->periodic_timer_lock, flags);
+	if (et_coalesce->rx_max_coalesced_frames_low) {
+		shared_timer->threshold_low = et_coalesce->rx_max_coalesced_frames_low;
+	}
+	if (et_coalesce->rx_max_coalesced_frames_irq) {
+		shared_timer->threshold_target = et_coalesce->rx_max_coalesced_frames_irq;
+	}
+	if (et_coalesce->rx_max_coalesced_frames_high) {
+		shared_timer->threshold_high = et_coalesce->rx_max_coalesced_frames_high;
+	}
+	if (et_coalesce->rx_coalesce_usecs_low) {
+		shared_timer->timer_in_use_min = et_coalesce->rx_coalesce_usecs_low;
+	}
+	if (et_coalesce->rx_coalesce_usecs_high) {
+		shared_timer->timer_in_use_max = et_coalesce->rx_coalesce_usecs_high;
+	}
+	spin_unlock_irqrestore(&nesadapter->periodic_timer_lock, flags);
+
+	/* using this to drive total interrupt moderation */
+	nesadapter->et_rx_coalesce_usecs_irq = et_coalesce->rx_coalesce_usecs_irq;
+	if (et_coalesce->use_adaptive_rx_coalesce) {
+		nesadapter->et_use_adaptive_rx_coalesce	= 1;
+		nesadapter->timer_int_limit = NES_TIMER_INT_LIMIT_DYNAMIC;
+		nesadapter->et_rx_coalesce_usecs_irq = 0;
+		if (et_coalesce->pkt_rate_low) {
+			nesadapter->et_pkt_rate_low = et_coalesce->pkt_rate_low;
+		}
+	} else {
+		nesadapter->et_use_adaptive_rx_coalesce	= 0;
+		nesadapter->timer_int_limit = NES_TIMER_INT_LIMIT;
+		if (nesadapter->et_rx_coalesce_usecs_irq) {
+			nes_write32(nesdev->regs+NES_PERIODIC_CONTROL,
+					0x80000000 | ((u32)(nesadapter->et_rx_coalesce_usecs_irq*8)));
+		}
+	}
+	return 0;
+}
+
+
+/**
+ * nes_netdev_get_coalesce
+ */
+static int nes_netdev_get_coalesce(struct net_device *netdev,
+		struct ethtool_coalesce	*et_coalesce)
+{
+	struct nes_vnic	*nesvnic = netdev_priv(netdev);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	struct ethtool_coalesce	temp_et_coalesce;
+	struct nes_hw_tune_timer *shared_timer = &nesadapter->tune_timer;
+	unsigned long flags;
+
+	memset(&temp_et_coalesce, 0, sizeof(temp_et_coalesce));
+	temp_et_coalesce.rx_coalesce_usecs_irq    = nesadapter->et_rx_coalesce_usecs_irq;
+	temp_et_coalesce.use_adaptive_rx_coalesce = nesadapter->et_use_adaptive_rx_coalesce;
+	temp_et_coalesce.rate_sample_interval     = nesadapter->et_rate_sample_interval;
+	temp_et_coalesce.pkt_rate_low =	nesadapter->et_pkt_rate_low;
+	spin_lock_irqsave(&nesadapter->periodic_timer_lock,	flags);
+	temp_et_coalesce.rx_max_coalesced_frames_low  = shared_timer->threshold_low;
+	temp_et_coalesce.rx_max_coalesced_frames_irq  = shared_timer->threshold_target;
+	temp_et_coalesce.rx_max_coalesced_frames_high = shared_timer->threshold_high;
+	temp_et_coalesce.rx_coalesce_usecs_low  = shared_timer->timer_in_use_min;
+	temp_et_coalesce.rx_coalesce_usecs_high = shared_timer->timer_in_use_max;
+	if (nesadapter->et_use_adaptive_rx_coalesce) {
+		temp_et_coalesce.rx_coalesce_usecs_irq = shared_timer->timer_in_use;
+	}
+	spin_unlock_irqrestore(&nesadapter->periodic_timer_lock, flags);
+	memcpy(et_coalesce, &temp_et_coalesce, sizeof(*et_coalesce));
+	return 0;
+}
+
+
+/**
+ * nes_netdev_get_pauseparam
+ */
+static void nes_netdev_get_pauseparam(struct net_device *netdev,
+		struct ethtool_pauseparam *et_pauseparam)
+{
+	struct nes_vnic *nesvnic = netdev_priv(netdev);
+
+	et_pauseparam->autoneg = 0;
+	et_pauseparam->rx_pause = (nesvnic->nesdev->disable_rx_flow_control == 0) ? 1:0;
+	et_pauseparam->tx_pause = (nesvnic->nesdev->disable_tx_flow_control == 0) ? 1:0;
+}
+
+
+/**
+ * nes_netdev_set_pauseparam
+ */
+static int nes_netdev_set_pauseparam(struct net_device *netdev,
+		struct ethtool_pauseparam *et_pauseparam)
+{
+	struct nes_vnic *nesvnic = netdev_priv(netdev);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	u32 u32temp;
+
+	if (et_pauseparam->autoneg) {
+		/* TODO: should return unsupported */
+		return 0;
+	}
+	if ((et_pauseparam->tx_pause == 1) && (nesdev->disable_tx_flow_control == 1)) {
+		u32temp = nes_read_indexed(nesdev,
+				NES_IDX_MAC_TX_CONFIG + (nesdev->mac_index*0x200));
+		u32temp |= NES_IDX_MAC_TX_CONFIG_ENABLE_PAUSE;
+		nes_write_indexed(nesdev,
+				NES_IDX_MAC_TX_CONFIG + (nesdev->mac_index*0x200), u32temp);
+		nesdev->disable_tx_flow_control = 0;
+	} else if ((et_pauseparam->tx_pause == 0) && (nesdev->disable_tx_flow_control == 0)) {
+		u32temp = nes_read_indexed(nesdev,
+				NES_IDX_MAC_TX_CONFIG + (nesdev->mac_index*0x200));
+		u32temp &= ~NES_IDX_MAC_TX_CONFIG_ENABLE_PAUSE;
+		nes_write_indexed(nesdev,
+				NES_IDX_MAC_TX_CONFIG + (nesdev->mac_index*0x200), u32temp);
+		nesdev->disable_tx_flow_control = 1;
+	}
+	if ((et_pauseparam->rx_pause == 1) && (nesdev->disable_rx_flow_control == 1)) {
+		u32temp = nes_read_indexed(nesdev,
+				NES_IDX_MPP_DEBUG + (nesdev->mac_index*0x40));
+		u32temp &= ~NES_IDX_MPP_DEBUG_PORT_DISABLE_PAUSE;
+		nes_write_indexed(nesdev,
+				NES_IDX_MPP_DEBUG + (nesdev->mac_index*0x40), u32temp);
+		nesdev->disable_rx_flow_control = 0;
+	} else if ((et_pauseparam->rx_pause == 0) && (nesdev->disable_rx_flow_control == 0)) {
+		u32temp = nes_read_indexed(nesdev,
+				NES_IDX_MPP_DEBUG + (nesdev->mac_index*0x40));
+		u32temp |= NES_IDX_MPP_DEBUG_PORT_DISABLE_PAUSE;
+		nes_write_indexed(nesdev,
+				NES_IDX_MPP_DEBUG + (nesdev->mac_index*0x40), u32temp);
+		nesdev->disable_rx_flow_control = 1;
+	}
+
+	return 0;
+}
+
+
+/**
+ * nes_netdev_get_settings
+ */
+static int nes_netdev_get_settings(struct net_device *netdev, struct ethtool_cmd *et_cmd)
+{
+	struct nes_vnic *nesvnic = netdev_priv(netdev);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	u32 mac_index = nesdev->mac_index;
+	u8 phy_type = nesadapter->phy_type[mac_index];
+	u8 phy_index = nesadapter->phy_index[mac_index];
+	u16 phy_data;
+
+	et_cmd->duplex = DUPLEX_FULL;
+	et_cmd->port   = PORT_MII;
+	et_cmd->maxtxpkt = 511;
+	et_cmd->maxrxpkt = 511;
+
+	if (nesadapter->OneG_Mode) {
+		ethtool_cmd_speed_set(et_cmd, SPEED_1000);
+		if (phy_type == NES_PHY_TYPE_PUMA_1G) {
+			et_cmd->supported   = SUPPORTED_1000baseT_Full;
+			et_cmd->advertising = ADVERTISED_1000baseT_Full;
+			et_cmd->autoneg     = AUTONEG_DISABLE;
+			et_cmd->transceiver = XCVR_INTERNAL;
+			et_cmd->phy_address = mac_index;
+		} else {
+			unsigned long flags;
+			et_cmd->supported   = SUPPORTED_1000baseT_Full
+					    | SUPPORTED_Autoneg;
+			et_cmd->advertising = ADVERTISED_1000baseT_Full
+					    | ADVERTISED_Autoneg;
+			spin_lock_irqsave(&nesadapter->phy_lock, flags);
+			nes_read_1G_phy_reg(nesdev, 0, phy_index, &phy_data);
+			spin_unlock_irqrestore(&nesadapter->phy_lock, flags);
+			if (phy_data & 0x1000)
+				et_cmd->autoneg = AUTONEG_ENABLE;
+			else
+				et_cmd->autoneg = AUTONEG_DISABLE;
+			et_cmd->transceiver = XCVR_EXTERNAL;
+			et_cmd->phy_address = phy_index;
+		}
+		return 0;
+	}
+	if ((phy_type == NES_PHY_TYPE_ARGUS) ||
+	    (phy_type == NES_PHY_TYPE_SFP_D) ||
+	    (phy_type == NES_PHY_TYPE_KR)) {
+		et_cmd->transceiver = XCVR_EXTERNAL;
+		et_cmd->port        = PORT_FIBRE;
+		et_cmd->supported   = SUPPORTED_FIBRE;
+		et_cmd->advertising = ADVERTISED_FIBRE;
+		et_cmd->phy_address = phy_index;
+	} else {
+		et_cmd->transceiver = XCVR_INTERNAL;
+		et_cmd->supported   = SUPPORTED_10000baseT_Full;
+		et_cmd->advertising = ADVERTISED_10000baseT_Full;
+		et_cmd->phy_address = mac_index;
+	}
+	ethtool_cmd_speed_set(et_cmd, SPEED_10000);
+	et_cmd->autoneg = AUTONEG_DISABLE;
+	return 0;
+}
+
+
+/**
+ * nes_netdev_set_settings
+ */
+static int nes_netdev_set_settings(struct net_device *netdev, struct ethtool_cmd *et_cmd)
+{
+	struct nes_vnic *nesvnic = netdev_priv(netdev);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+
+	if ((nesadapter->OneG_Mode) &&
+	    (nesadapter->phy_type[nesdev->mac_index] != NES_PHY_TYPE_PUMA_1G)) {
+		unsigned long flags;
+		u16 phy_data;
+		u8 phy_index = nesadapter->phy_index[nesdev->mac_index];
+
+		spin_lock_irqsave(&nesadapter->phy_lock, flags);
+		nes_read_1G_phy_reg(nesdev, 0, phy_index, &phy_data);
+		if (et_cmd->autoneg) {
+			/* Turn on Full duplex, Autoneg, and restart autonegotiation */
+			phy_data |= 0x1300;
+		} else {
+			/* Turn off autoneg */
+			phy_data &= ~0x1000;
+		}
+		nes_write_1G_phy_reg(nesdev, 0, phy_index, phy_data);
+		spin_unlock_irqrestore(&nesadapter->phy_lock, flags);
+	}
+
+	return 0;
+}
+
+
+static const struct ethtool_ops nes_ethtool_ops = {
+	.get_link = ethtool_op_get_link,
+	.get_settings = nes_netdev_get_settings,
+	.set_settings = nes_netdev_set_settings,
+	.get_strings = nes_netdev_get_strings,
+	.get_sset_count = nes_netdev_get_sset_count,
+	.get_ethtool_stats = nes_netdev_get_ethtool_stats,
+	.get_drvinfo = nes_netdev_get_drvinfo,
+	.get_coalesce = nes_netdev_get_coalesce,
+	.set_coalesce = nes_netdev_set_coalesce,
+	.get_pauseparam = nes_netdev_get_pauseparam,
+	.set_pauseparam = nes_netdev_set_pauseparam,
+};
+
+static void nes_vlan_mode(struct net_device *netdev, struct nes_device *nesdev, netdev_features_t features)
+{
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	u32 u32temp;
+	unsigned long flags;
+
+	spin_lock_irqsave(&nesadapter->phy_lock, flags);
+
+	nes_debug(NES_DBG_NETDEV, "%s: %s\n", __func__, netdev->name);
+
+	/* Enable/Disable VLAN Stripping */
+	u32temp = nes_read_indexed(nesdev, NES_IDX_PCIX_DIAG);
+	if (features & NETIF_F_HW_VLAN_CTAG_RX)
+		u32temp &= 0xfdffffff;
+	else
+		u32temp	|= 0x02000000;
+
+	nes_write_indexed(nesdev, NES_IDX_PCIX_DIAG, u32temp);
+	spin_unlock_irqrestore(&nesadapter->phy_lock, flags);
+}
+
+static netdev_features_t nes_fix_features(struct net_device *netdev, netdev_features_t features)
+{
+	/*
+	 * Since there is no support for separate rx/tx vlan accel
+	 * enable/disable make sure tx flag is always in same state as rx.
+	 */
+	if (features & NETIF_F_HW_VLAN_CTAG_RX)
+		features |= NETIF_F_HW_VLAN_CTAG_TX;
+	else
+		features &= ~NETIF_F_HW_VLAN_CTAG_TX;
+
+	return features;
+}
+
+static int nes_set_features(struct net_device *netdev, netdev_features_t features)
+{
+	struct nes_vnic *nesvnic = netdev_priv(netdev);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	u32 changed = netdev->features ^ features;
+
+	if (changed & NETIF_F_HW_VLAN_CTAG_RX)
+		nes_vlan_mode(netdev, nesdev, features);
+
+	return 0;
+}
+
+static const struct net_device_ops nes_netdev_ops = {
+	.ndo_open		= nes_netdev_open,
+	.ndo_stop		= nes_netdev_stop,
+	.ndo_start_xmit		= nes_netdev_start_xmit,
+	.ndo_get_stats		= nes_netdev_get_stats,
+	.ndo_tx_timeout		= nes_netdev_tx_timeout,
+	.ndo_set_mac_address	= nes_netdev_set_mac_address,
+	.ndo_set_rx_mode	= nes_netdev_set_multicast_list,
+	.ndo_change_mtu		= nes_netdev_change_mtu,
+	.ndo_validate_addr	= eth_validate_addr,
+	.ndo_fix_features	= nes_fix_features,
+	.ndo_set_features	= nes_set_features,
+};
+
+/**
+ * nes_netdev_init - initialize network device
+ */
+struct net_device *nes_netdev_init(struct nes_device *nesdev,
+		void __iomem *mmio_addr)
+{
+	u64 u64temp;
+	struct nes_vnic *nesvnic;
+	struct net_device *netdev;
+	struct nic_qp_map *curr_qp_map;
+	u8 phy_type = nesdev->nesadapter->phy_type[nesdev->mac_index];
+
+	netdev = alloc_etherdev(sizeof(struct nes_vnic));
+	if (!netdev) {
+		printk(KERN_ERR PFX "nesvnic etherdev alloc failed");
+		return NULL;
+	}
+	nesvnic = netdev_priv(netdev);
+
+	nes_debug(NES_DBG_INIT, "netdev = %p, %s\n", netdev, netdev->name);
+
+	SET_NETDEV_DEV(netdev, &nesdev->pcidev->dev);
+
+	netdev->watchdog_timeo = NES_TX_TIMEOUT;
+	netdev->irq = nesdev->pcidev->irq;
+	netdev->mtu = ETH_DATA_LEN;
+	netdev->hard_header_len = ETH_HLEN;
+	netdev->addr_len = ETH_ALEN;
+	netdev->type = ARPHRD_ETHER;
+	netdev->netdev_ops = &nes_netdev_ops;
+	netdev->ethtool_ops = &nes_ethtool_ops;
+	netif_napi_add(netdev, &nesvnic->napi, nes_netdev_poll, 128);
+	nes_debug(NES_DBG_INIT, "Enabling VLAN Insert/Delete.\n");
+
+	/* Fill in the port structure */
+	nesvnic->netdev = netdev;
+	nesvnic->nesdev = nesdev;
+	nesvnic->msg_enable = netif_msg_init(debug, default_msg);
+	nesvnic->netdev_index = nesdev->netdev_count;
+	nesvnic->perfect_filter_index = nesdev->nesadapter->netdev_count;
+	nesvnic->max_frame_size = netdev->mtu + netdev->hard_header_len + VLAN_HLEN;
+
+	curr_qp_map = nic_qp_mapping_per_function[PCI_FUNC(nesdev->pcidev->devfn)];
+	nesvnic->nic.qp_id = curr_qp_map[nesdev->netdev_count].qpid;
+	nesvnic->nic_index = curr_qp_map[nesdev->netdev_count].nic_index;
+	nesvnic->logical_port = curr_qp_map[nesdev->netdev_count].logical_port;
+
+	/* Setup the burned in MAC address */
+	u64temp = (u64)nesdev->nesadapter->mac_addr_low;
+	u64temp += ((u64)nesdev->nesadapter->mac_addr_high) << 32;
+	u64temp += nesvnic->nic_index;
+	netdev->dev_addr[0] = (u8)(u64temp>>40);
+	netdev->dev_addr[1] = (u8)(u64temp>>32);
+	netdev->dev_addr[2] = (u8)(u64temp>>24);
+	netdev->dev_addr[3] = (u8)(u64temp>>16);
+	netdev->dev_addr[4] = (u8)(u64temp>>8);
+	netdev->dev_addr[5] = (u8)u64temp;
+
+	netdev->hw_features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_RXCSUM | NETIF_F_HW_VLAN_CTAG_RX;
+	if ((nesvnic->logical_port < 2) || (nesdev->nesadapter->hw_rev != NE020_REV))
+		netdev->hw_features |= NETIF_F_TSO;
+
+	netdev->features = netdev->hw_features | NETIF_F_HIGHDMA | NETIF_F_HW_VLAN_CTAG_TX;
+	netdev->hw_features |= NETIF_F_LRO;
+
+	nes_debug(NES_DBG_INIT, "nesvnic = %p, reported features = 0x%lX, QPid = %d,"
+			" nic_index = %d, logical_port = %d, mac_index = %d.\n",
+			nesvnic, (unsigned long)netdev->features, nesvnic->nic.qp_id,
+			nesvnic->nic_index, nesvnic->logical_port,  nesdev->mac_index);
+
+	if (nesvnic->nesdev->nesadapter->port_count == 1 &&
+		nesvnic->nesdev->nesadapter->adapter_fcn_count == 1) {
+
+		nesvnic->qp_nic_index[0] = nesvnic->nic_index;
+		nesvnic->qp_nic_index[1] = nesvnic->nic_index + 1;
+		if (nes_drv_opt & NES_DRV_OPT_DUAL_LOGICAL_PORT) {
+			nesvnic->qp_nic_index[2] = 0xf;
+			nesvnic->qp_nic_index[3] = 0xf;
+		} else {
+			nesvnic->qp_nic_index[2] = nesvnic->nic_index + 2;
+			nesvnic->qp_nic_index[3] = nesvnic->nic_index + 3;
+		}
+	} else {
+		if (nesvnic->nesdev->nesadapter->port_count == 2 ||
+			(nesvnic->nesdev->nesadapter->port_count == 1 &&
+			nesvnic->nesdev->nesadapter->adapter_fcn_count == 2)) {
+				nesvnic->qp_nic_index[0] = nesvnic->nic_index;
+				nesvnic->qp_nic_index[1] = nesvnic->nic_index
+									+ 2;
+				nesvnic->qp_nic_index[2] = 0xf;
+				nesvnic->qp_nic_index[3] = 0xf;
+		} else {
+			nesvnic->qp_nic_index[0] = nesvnic->nic_index;
+			nesvnic->qp_nic_index[1] = 0xf;
+			nesvnic->qp_nic_index[2] = 0xf;
+			nesvnic->qp_nic_index[3] = 0xf;
+		}
+	}
+	nesvnic->next_qp_nic_index = 0;
+
+	if (nesdev->netdev_count == 0) {
+		nesvnic->rdma_enabled = 1;
+	} else {
+		nesvnic->rdma_enabled = 0;
+	}
+	nesvnic->nic_cq.cq_number = nesvnic->nic.qp_id;
+	init_timer(&nesvnic->event_timer);
+	nesvnic->event_timer.function = NULL;
+	spin_lock_init(&nesvnic->tx_lock);
+	spin_lock_init(&nesvnic->port_ibevent_lock);
+	nesdev->netdev[nesdev->netdev_count] = netdev;
+
+	nes_debug(NES_DBG_INIT, "Adding nesvnic (%p) to the adapters nesvnic_list for MAC%d.\n",
+			nesvnic, nesdev->mac_index);
+	list_add_tail(&nesvnic->list, &nesdev->nesadapter->nesvnic_list[nesdev->mac_index]);
+
+	if ((nesdev->netdev_count == 0) &&
+	    ((PCI_FUNC(nesdev->pcidev->devfn) == nesdev->mac_index) ||
+	     ((phy_type == NES_PHY_TYPE_PUMA_1G) &&
+	      (((PCI_FUNC(nesdev->pcidev->devfn) == 1) && (nesdev->mac_index == 2)) ||
+	       ((PCI_FUNC(nesdev->pcidev->devfn) == 2) && (nesdev->mac_index == 1)))))) {
+		u32 u32temp;
+		u32 link_mask = 0;
+		u32 link_val = 0;
+		u16 temp_phy_data;
+		u16 phy_data = 0;
+		unsigned long flags;
+
+		u32temp = nes_read_indexed(nesdev, NES_IDX_PHY_PCS_CONTROL_STATUS0 +
+				(0x200 * (nesdev->mac_index & 1)));
+		if (phy_type != NES_PHY_TYPE_PUMA_1G) {
+			u32temp |= 0x00200000;
+			nes_write_indexed(nesdev, NES_IDX_PHY_PCS_CONTROL_STATUS0 +
+				(0x200 * (nesdev->mac_index & 1)), u32temp);
+		}
+
+		/* Check and set linkup here.  This is for back to back */
+		/* configuration where second port won't get link interrupt */
+		switch (phy_type) {
+		case NES_PHY_TYPE_PUMA_1G:
+			if (nesdev->mac_index < 2) {
+				link_mask = 0x01010000;
+				link_val = 0x01010000;
+			} else {
+				link_mask = 0x02020000;
+				link_val = 0x02020000;
+			}
+			break;
+		case NES_PHY_TYPE_SFP_D:
+			spin_lock_irqsave(&nesdev->nesadapter->phy_lock, flags);
+			nes_read_10G_phy_reg(nesdev,
+					     nesdev->nesadapter->phy_index[nesdev->mac_index],
+					     1, 0x9003);
+			temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
+			nes_read_10G_phy_reg(nesdev,
+					     nesdev->nesadapter->phy_index[nesdev->mac_index],
+					     3, 0x0021);
+			nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
+			nes_read_10G_phy_reg(nesdev,
+					     nesdev->nesadapter->phy_index[nesdev->mac_index],
+					     3, 0x0021);
+			phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
+			spin_unlock_irqrestore(&nesdev->nesadapter->phy_lock, flags);
+			phy_data = (!temp_phy_data && (phy_data == 0x8000)) ? 0x4 : 0x0;
+			break;
+		default:
+			link_mask = 0x0f1f0000;
+			link_val = 0x0f0f0000;
+			break;
+		}
+
+		u32temp = nes_read_indexed(nesdev,
+					   NES_IDX_PHY_PCS_CONTROL_STATUS0 +
+					   (0x200 * (nesdev->mac_index & 1)));
+
+		if (phy_type == NES_PHY_TYPE_SFP_D) {
+			if (phy_data & 0x0004)
+				nesvnic->linkup = 1;
+		} else {
+			if ((u32temp & link_mask) == link_val)
+				nesvnic->linkup = 1;
+		}
+
+		/* clear the MAC interrupt status, assumes direct logical to physical mapping */
+		u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS + (0x200 * nesdev->mac_index));
+		nes_debug(NES_DBG_INIT, "Phy interrupt status = 0x%X.\n", u32temp);
+		nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS + (0x200 * nesdev->mac_index), u32temp);
+
+		nes_init_phy(nesdev);
+	}
+
+	nes_vlan_mode(netdev, nesdev, netdev->features);
+
+	return netdev;
+}
+
+
+/**
+ * nes_netdev_destroy - destroy network device structure
+ */
+void nes_netdev_destroy(struct net_device *netdev)
+{
+	struct nes_vnic *nesvnic = netdev_priv(netdev);
+
+	/* make sure 'stop' method is called by Linux stack */
+	/* nes_netdev_stop(netdev); */
+
+	list_del(&nesvnic->list);
+
+	if (nesvnic->of_device_registered) {
+		nes_destroy_ofa_device(nesvnic->nesibdev);
+	}
+
+	free_netdev(netdev);
+}
+
+
+/**
+ * nes_nic_cm_xmit -- CM calls this to send out pkts
+ */
+int nes_nic_cm_xmit(struct sk_buff *skb, struct net_device *netdev)
+{
+	int ret;
+
+	skb->dev = netdev;
+	ret = dev_queue_xmit(skb);
+	if (ret) {
+		nes_debug(NES_DBG_CM, "Bad return code from dev_queue_xmit %d\n", ret);
+	}
+
+	return ret;
+}
diff --git a/drivers/infiniband/hw/nes/nes_user.h b/drivers/infiniband/hw/nes/nes_user.h
new file mode 100644
index 0000000..529c421
--- /dev/null
+++ b/drivers/infiniband/hw/nes/nes_user.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2006 - 2011 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Cisco Systems.  All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef NES_USER_H
+#define NES_USER_H
+
+#include <linux/types.h>
+
+#define NES_ABI_USERSPACE_VER 2
+#define NES_ABI_KERNEL_VER    2
+
+/*
+ * Make sure that all structs defined in this file remain laid out so
+ * that they pack the same way on 32-bit and 64-bit architectures (to
+ * avoid incompatibility between 32-bit userspace and 64-bit kernels).
+ * In particular do not use pointer types -- pass pointers in __u64
+ * instead.
+ */
+
+struct nes_alloc_ucontext_req {
+	__u32 reserved32;
+	__u8  userspace_ver;
+	__u8  reserved8[3];
+};
+
+struct nes_alloc_ucontext_resp {
+	__u32 max_pds; /* maximum pds allowed for this user process */
+	__u32 max_qps; /* maximum qps allowed for this user process */
+	__u32 wq_size; /* size of the WQs (sq+rq) allocated to the mmaped area */
+	__u8  virtwq;  /* flag to indicate if virtual WQ are to be used or not */
+	__u8  kernel_ver;
+	__u8  reserved[2];
+};
+
+struct nes_alloc_pd_resp {
+	__u32 pd_id;
+	__u32 mmap_db_index;
+};
+
+struct nes_create_cq_req {
+	__u64 user_cq_buffer;
+	__u32 mcrqf;
+	__u8 reserved[4];
+};
+
+struct nes_create_qp_req {
+	__u64 user_wqe_buffers;
+	__u64 user_qp_buffer;
+};
+
+enum iwnes_memreg_type {
+	IWNES_MEMREG_TYPE_MEM = 0x0000,
+	IWNES_MEMREG_TYPE_QP = 0x0001,
+	IWNES_MEMREG_TYPE_CQ = 0x0002,
+	IWNES_MEMREG_TYPE_MW = 0x0003,
+	IWNES_MEMREG_TYPE_FMR = 0x0004,
+	IWNES_MEMREG_TYPE_FMEM = 0x0005,
+};
+
+struct nes_mem_reg_req {
+	__u32 reg_type;	/* indicates if id is memory, QP or CQ */
+	__u32 reserved;
+};
+
+struct nes_create_cq_resp {
+	__u32 cq_id;
+	__u32 cq_size;
+	__u32 mmap_db_index;
+	__u32 reserved;
+};
+
+struct nes_create_qp_resp {
+	__u32 qp_id;
+	__u32 actual_sq_size;
+	__u32 actual_rq_size;
+	__u32 mmap_sq_db_index;
+	__u32 mmap_rq_db_index;
+	__u32 nes_drv_opt;
+};
+
+#endif				/* NES_USER_H */
diff --git a/drivers/infiniband/hw/nes/nes_utils.c b/drivers/infiniband/hw/nes/nes_utils.c
new file mode 100644
index 0000000..2042c0f
--- /dev/null
+++ b/drivers/infiniband/hw/nes/nes_utils.c
@@ -0,0 +1,972 @@
+/*
+ * Copyright (c) 2006 - 2011 Intel Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/ethtool.h>
+#include <linux/mii.h>
+#include <linux/if_vlan.h>
+#include <linux/slab.h>
+#include <linux/crc32.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/init.h>
+
+#include <asm/io.h>
+#include <asm/irq.h>
+#include <asm/byteorder.h>
+
+#include "nes.h"
+
+static u16 nes_read16_eeprom(void __iomem *addr, u16 offset);
+
+u32 mh_detected;
+u32 mh_pauses_sent;
+
+static u32 nes_set_pau(struct nes_device *nesdev)
+{
+	u32 ret = 0;
+	u32 counter;
+
+	nes_write_indexed(nesdev, NES_IDX_GPR2, NES_ENABLE_PAU);
+	nes_write_indexed(nesdev, NES_IDX_GPR_TRIGGER, 1);
+
+	for (counter = 0; counter < NES_PAU_COUNTER; counter++) {
+		udelay(30);
+		if (!nes_read_indexed(nesdev, NES_IDX_GPR2)) {
+			printk(KERN_INFO PFX "PAU is supported.\n");
+			break;
+		}
+		nes_write_indexed(nesdev, NES_IDX_GPR_TRIGGER, 1);
+	}
+	if (counter == NES_PAU_COUNTER) {
+		printk(KERN_INFO PFX "PAU is not supported.\n");
+		return -EPERM;
+	}
+	return ret;
+}
+
+/**
+ * nes_read_eeprom_values -
+ */
+int nes_read_eeprom_values(struct nes_device *nesdev, struct nes_adapter *nesadapter)
+{
+	u32 mac_addr_low;
+	u16 mac_addr_high;
+	u16 eeprom_data;
+	u16 eeprom_offset;
+	u16 next_section_address;
+	u16 sw_section_ver;
+	u8  major_ver = 0;
+	u8  minor_ver = 0;
+
+	/* TODO: deal with EEPROM endian issues */
+	if (nesadapter->firmware_eeprom_offset == 0) {
+		/* Read the EEPROM Parameters */
+		eeprom_data = nes_read16_eeprom(nesdev->regs, 0);
+		nes_debug(NES_DBG_HW, "EEPROM Offset 0  = 0x%04X\n", eeprom_data);
+		eeprom_offset = 2 + (((eeprom_data & 0x007f) << 3) <<
+				((eeprom_data & 0x0080) >> 7));
+		nes_debug(NES_DBG_HW, "Firmware Offset = 0x%04X\n", eeprom_offset);
+		nesadapter->firmware_eeprom_offset = eeprom_offset;
+		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 4);
+		if (eeprom_data != 0x5746) {
+			nes_debug(NES_DBG_HW, "Not a valid Firmware Image = 0x%04X\n", eeprom_data);
+			return -1;
+		}
+
+		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 2);
+		nes_debug(NES_DBG_HW, "EEPROM Offset %u  = 0x%04X\n",
+				eeprom_offset + 2, eeprom_data);
+		eeprom_offset += ((eeprom_data & 0x00ff) << 3) << ((eeprom_data & 0x0100) >> 8);
+		nes_debug(NES_DBG_HW, "Software Offset = 0x%04X\n", eeprom_offset);
+		nesadapter->software_eeprom_offset = eeprom_offset;
+		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 4);
+		if (eeprom_data != 0x5753) {
+			printk("Not a valid Software Image = 0x%04X\n", eeprom_data);
+			return -1;
+		}
+		sw_section_ver = nes_read16_eeprom(nesdev->regs, nesadapter->software_eeprom_offset  + 6);
+		nes_debug(NES_DBG_HW, "Software section version number = 0x%04X\n",
+				sw_section_ver);
+
+		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 2);
+		nes_debug(NES_DBG_HW, "EEPROM Offset %u (next section)  = 0x%04X\n",
+				eeprom_offset + 2, eeprom_data);
+		next_section_address = eeprom_offset + (((eeprom_data & 0x00ff) << 3) <<
+				((eeprom_data & 0x0100) >> 8));
+		eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 4);
+		if (eeprom_data != 0x414d) {
+			nes_debug(NES_DBG_HW, "EEPROM Changed offset should be 0x414d but was 0x%04X\n",
+					eeprom_data);
+			goto no_fw_rev;
+		}
+		eeprom_offset = next_section_address;
+
+		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 2);
+		nes_debug(NES_DBG_HW, "EEPROM Offset %u (next section)  = 0x%04X\n",
+				eeprom_offset + 2, eeprom_data);
+		next_section_address = eeprom_offset + (((eeprom_data & 0x00ff) << 3) <<
+				((eeprom_data & 0x0100) >> 8));
+		eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 4);
+		if (eeprom_data != 0x4f52) {
+			nes_debug(NES_DBG_HW, "EEPROM Changed offset should be 0x4f52 but was 0x%04X\n",
+					eeprom_data);
+			goto no_fw_rev;
+		}
+		eeprom_offset = next_section_address;
+
+		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 2);
+		nes_debug(NES_DBG_HW, "EEPROM Offset %u (next section)  = 0x%04X\n",
+				eeprom_offset + 2, eeprom_data);
+		next_section_address = eeprom_offset + ((eeprom_data & 0x00ff) << 3);
+		eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 4);
+		if (eeprom_data != 0x5746) {
+			nes_debug(NES_DBG_HW, "EEPROM Changed offset should be 0x5746 but was 0x%04X\n",
+					eeprom_data);
+			goto no_fw_rev;
+		}
+		eeprom_offset = next_section_address;
+
+		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 2);
+		nes_debug(NES_DBG_HW, "EEPROM Offset %u (next section)  = 0x%04X\n",
+				eeprom_offset + 2, eeprom_data);
+		next_section_address = eeprom_offset + ((eeprom_data & 0x00ff) << 3);
+		eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 4);
+		if (eeprom_data != 0x5753) {
+			nes_debug(NES_DBG_HW, "EEPROM Changed offset should be 0x5753 but was 0x%04X\n",
+					eeprom_data);
+			goto no_fw_rev;
+		}
+		eeprom_offset = next_section_address;
+
+		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 2);
+		nes_debug(NES_DBG_HW, "EEPROM Offset %u (next section)  = 0x%04X\n",
+				eeprom_offset + 2, eeprom_data);
+		next_section_address = eeprom_offset + ((eeprom_data & 0x00ff) << 3);
+		eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 4);
+		if (eeprom_data != 0x414d) {
+			nes_debug(NES_DBG_HW, "EEPROM Changed offset should be 0x414d but was 0x%04X\n",
+					eeprom_data);
+			goto no_fw_rev;
+		}
+		eeprom_offset = next_section_address;
+
+		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 2);
+		nes_debug(NES_DBG_HW, "EEPROM Offset %u (next section)  = 0x%04X\n",
+				eeprom_offset + 2, eeprom_data);
+		next_section_address = eeprom_offset + ((eeprom_data & 0x00ff) << 3);
+		eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 4);
+		if (eeprom_data != 0x464e) {
+			nes_debug(NES_DBG_HW, "EEPROM Changed offset should be 0x464e but was 0x%04X\n",
+					eeprom_data);
+			goto no_fw_rev;
+		}
+		eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 8);
+		printk(PFX "Firmware version %u.%u\n", (u8)(eeprom_data>>8), (u8)eeprom_data);
+		major_ver = (u8)(eeprom_data >> 8);
+		minor_ver = (u8)(eeprom_data);
+
+		if (nes_drv_opt & NES_DRV_OPT_DISABLE_VIRT_WQ) {
+			nes_debug(NES_DBG_HW, "Virtual WQs have been disabled\n");
+		} else if (((major_ver == 2) && (minor_ver > 21)) || ((major_ver > 2) && (major_ver != 255))) {
+			nesadapter->virtwq = 1;
+		}
+		if (((major_ver == 3) && (minor_ver >= 16)) || (major_ver > 3))
+			nesadapter->send_term_ok = 1;
+
+		if (nes_drv_opt & NES_DRV_OPT_ENABLE_PAU) {
+			if (!nes_set_pau(nesdev))
+				nesadapter->allow_unaligned_fpdus = 1;
+		}
+
+		nesadapter->firmware_version = (((u32)(u8)(eeprom_data>>8))  <<  16) +
+				(u32)((u8)eeprom_data);
+
+		eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 10);
+		printk(PFX "EEPROM version %u.%u\n", (u8)(eeprom_data>>8), (u8)eeprom_data);
+		nesadapter->eeprom_version = (((u32)(u8)(eeprom_data>>8)) << 16) +
+				(u32)((u8)eeprom_data);
+
+no_fw_rev:
+		/* eeprom is valid */
+		eeprom_offset = nesadapter->software_eeprom_offset;
+		eeprom_offset += 8;
+		nesadapter->netdev_max = (u8)nes_read16_eeprom(nesdev->regs, eeprom_offset);
+		eeprom_offset += 2;
+		mac_addr_high = nes_read16_eeprom(nesdev->regs, eeprom_offset);
+		eeprom_offset += 2;
+		mac_addr_low = (u32)nes_read16_eeprom(nesdev->regs, eeprom_offset);
+		eeprom_offset += 2;
+		mac_addr_low <<= 16;
+		mac_addr_low += (u32)nes_read16_eeprom(nesdev->regs, eeprom_offset);
+		nes_debug(NES_DBG_HW, "Base MAC Address = 0x%04X%08X\n",
+				mac_addr_high, mac_addr_low);
+		nes_debug(NES_DBG_HW, "MAC Address count = %u\n", nesadapter->netdev_max);
+
+		nesadapter->mac_addr_low = mac_addr_low;
+		nesadapter->mac_addr_high = mac_addr_high;
+
+		/* Read the Phy Type array */
+		eeprom_offset += 10;
+		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
+		nesadapter->phy_type[0] = (u8)(eeprom_data >> 8);
+		nesadapter->phy_type[1] = (u8)eeprom_data;
+
+		/* Read the port array */
+		eeprom_offset += 2;
+		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
+		nesadapter->phy_type[2] = (u8)(eeprom_data >> 8);
+		nesadapter->phy_type[3] = (u8)eeprom_data;
+		/* port_count is set by soft reset reg */
+		nes_debug(NES_DBG_HW, "port_count = %u, port 0 -> %u, port 1 -> %u,"
+				" port 2 -> %u, port 3 -> %u\n",
+				nesadapter->port_count,
+				nesadapter->phy_type[0], nesadapter->phy_type[1],
+				nesadapter->phy_type[2], nesadapter->phy_type[3]);
+
+		/* Read PD config array */
+		eeprom_offset += 10;
+		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
+		nesadapter->pd_config_size[0] = eeprom_data;
+		eeprom_offset += 2;
+		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
+		nesadapter->pd_config_base[0] = eeprom_data;
+		nes_debug(NES_DBG_HW, "PD0 config, size=0x%04x, base=0x%04x\n",
+				nesadapter->pd_config_size[0], nesadapter->pd_config_base[0]);
+
+		eeprom_offset += 2;
+		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
+		nesadapter->pd_config_size[1] = eeprom_data;
+		eeprom_offset += 2;
+		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
+		nesadapter->pd_config_base[1] = eeprom_data;
+		nes_debug(NES_DBG_HW, "PD1 config, size=0x%04x, base=0x%04x\n",
+				nesadapter->pd_config_size[1], nesadapter->pd_config_base[1]);
+
+		eeprom_offset += 2;
+		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
+		nesadapter->pd_config_size[2] = eeprom_data;
+		eeprom_offset += 2;
+		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
+		nesadapter->pd_config_base[2] = eeprom_data;
+		nes_debug(NES_DBG_HW, "PD2 config, size=0x%04x, base=0x%04x\n",
+				nesadapter->pd_config_size[2], nesadapter->pd_config_base[2]);
+
+		eeprom_offset += 2;
+		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
+		nesadapter->pd_config_size[3] = eeprom_data;
+		eeprom_offset += 2;
+		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
+		nesadapter->pd_config_base[3] = eeprom_data;
+		nes_debug(NES_DBG_HW, "PD3 config, size=0x%04x, base=0x%04x\n",
+				nesadapter->pd_config_size[3], nesadapter->pd_config_base[3]);
+
+		/* Read Rx Pool Size */
+		eeprom_offset += 22;   /* 46 */
+		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
+		eeprom_offset += 2;
+		nesadapter->rx_pool_size = (((u32)eeprom_data) << 16) +
+				nes_read16_eeprom(nesdev->regs, eeprom_offset);
+		nes_debug(NES_DBG_HW, "rx_pool_size = 0x%08X\n", nesadapter->rx_pool_size);
+
+		eeprom_offset += 2;
+		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
+		eeprom_offset += 2;
+		nesadapter->tx_pool_size = (((u32)eeprom_data) << 16) +
+				nes_read16_eeprom(nesdev->regs, eeprom_offset);
+		nes_debug(NES_DBG_HW, "tx_pool_size = 0x%08X\n", nesadapter->tx_pool_size);
+
+		eeprom_offset += 2;
+		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
+		eeprom_offset += 2;
+		nesadapter->rx_threshold = (((u32)eeprom_data) << 16) +
+				nes_read16_eeprom(nesdev->regs, eeprom_offset);
+		nes_debug(NES_DBG_HW, "rx_threshold = 0x%08X\n", nesadapter->rx_threshold);
+
+		eeprom_offset += 2;
+		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
+		eeprom_offset += 2;
+		nesadapter->tcp_timer_core_clk_divisor = (((u32)eeprom_data) << 16) +
+				nes_read16_eeprom(nesdev->regs, eeprom_offset);
+		nes_debug(NES_DBG_HW, "tcp_timer_core_clk_divisor = 0x%08X\n",
+				nesadapter->tcp_timer_core_clk_divisor);
+
+		eeprom_offset += 2;
+		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
+		eeprom_offset += 2;
+		nesadapter->iwarp_config = (((u32)eeprom_data) << 16) +
+				nes_read16_eeprom(nesdev->regs, eeprom_offset);
+		nes_debug(NES_DBG_HW, "iwarp_config = 0x%08X\n", nesadapter->iwarp_config);
+
+		eeprom_offset += 2;
+		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
+		eeprom_offset += 2;
+		nesadapter->cm_config = (((u32)eeprom_data) << 16) +
+				nes_read16_eeprom(nesdev->regs, eeprom_offset);
+		nes_debug(NES_DBG_HW, "cm_config = 0x%08X\n", nesadapter->cm_config);
+
+		eeprom_offset += 2;
+		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
+		eeprom_offset += 2;
+		nesadapter->sws_timer_config = (((u32)eeprom_data) << 16) +
+				nes_read16_eeprom(nesdev->regs, eeprom_offset);
+		nes_debug(NES_DBG_HW, "sws_timer_config = 0x%08X\n", nesadapter->sws_timer_config);
+
+		eeprom_offset += 2;
+		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
+		eeprom_offset += 2;
+		nesadapter->tcp_config1 = (((u32)eeprom_data) << 16) +
+				nes_read16_eeprom(nesdev->regs, eeprom_offset);
+		nes_debug(NES_DBG_HW, "tcp_config1 = 0x%08X\n", nesadapter->tcp_config1);
+
+		eeprom_offset += 2;
+		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
+		eeprom_offset += 2;
+		nesadapter->wqm_wat = (((u32)eeprom_data) << 16) +
+				nes_read16_eeprom(nesdev->regs, eeprom_offset);
+		nes_debug(NES_DBG_HW, "wqm_wat = 0x%08X\n", nesadapter->wqm_wat);
+
+		eeprom_offset += 2;
+		eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
+		eeprom_offset += 2;
+		nesadapter->core_clock = (((u32)eeprom_data) << 16) +
+				nes_read16_eeprom(nesdev->regs, eeprom_offset);
+		nes_debug(NES_DBG_HW, "core_clock = 0x%08X\n", nesadapter->core_clock);
+
+		if ((sw_section_ver) && (nesadapter->hw_rev != NE020_REV)) {
+			eeprom_offset += 2;
+			eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
+			nesadapter->phy_index[0] = (eeprom_data & 0xff00)>>8;
+			nesadapter->phy_index[1] = eeprom_data & 0x00ff;
+			eeprom_offset += 2;
+			eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
+			nesadapter->phy_index[2] = (eeprom_data & 0xff00)>>8;
+			nesadapter->phy_index[3] = eeprom_data & 0x00ff;
+		} else {
+			nesadapter->phy_index[0] = 4;
+			nesadapter->phy_index[1] = 5;
+			nesadapter->phy_index[2] = 6;
+			nesadapter->phy_index[3] = 7;
+		}
+		nes_debug(NES_DBG_HW, "Phy address map = 0 > %u,  1 > %u, 2 > %u, 3 > %u\n",
+			   nesadapter->phy_index[0],nesadapter->phy_index[1],
+			   nesadapter->phy_index[2],nesadapter->phy_index[3]);
+	}
+
+	return 0;
+}
+
+
+/**
+ * nes_read16_eeprom
+ */
+static u16 nes_read16_eeprom(void __iomem *addr, u16 offset)
+{
+	writel(NES_EEPROM_READ_REQUEST + (offset >> 1),
+			(void __iomem *)addr + NES_EEPROM_COMMAND);
+
+	do {
+	} while (readl((void __iomem *)addr + NES_EEPROM_COMMAND) &
+			NES_EEPROM_READ_REQUEST);
+
+	return readw((void __iomem *)addr + NES_EEPROM_DATA);
+}
+
+
+/**
+ * nes_write_1G_phy_reg
+ */
+void nes_write_1G_phy_reg(struct nes_device *nesdev, u8 phy_reg, u8 phy_addr, u16 data)
+{
+	u32 u32temp;
+	u32 counter;
+
+	nes_write_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL,
+			0x50020000 | data | ((u32)phy_reg << 18) | ((u32)phy_addr << 23));
+	for (counter = 0; counter < 100 ; counter++) {
+		udelay(30);
+		u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS);
+		if (u32temp & 1) {
+			/* nes_debug(NES_DBG_PHY, "Phy interrupt status = 0x%X.\n", u32temp); */
+			nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS, 1);
+			break;
+		}
+	}
+	if (!(u32temp & 1))
+		nes_debug(NES_DBG_PHY, "Phy is not responding. interrupt status = 0x%X.\n",
+				u32temp);
+}
+
+
+/**
+ * nes_read_1G_phy_reg
+ * This routine only issues the read, the data must be read
+ * separately.
+ */
+void nes_read_1G_phy_reg(struct nes_device *nesdev, u8 phy_reg, u8 phy_addr, u16 *data)
+{
+	u32 u32temp;
+	u32 counter;
+
+	/* nes_debug(NES_DBG_PHY, "phy addr = %d, mac_index = %d\n",
+			phy_addr, nesdev->mac_index); */
+
+	nes_write_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL,
+			0x60020000 | ((u32)phy_reg << 18) | ((u32)phy_addr << 23));
+	for (counter = 0; counter < 100 ; counter++) {
+		udelay(30);
+		u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS);
+		if (u32temp & 1) {
+			/* nes_debug(NES_DBG_PHY, "Phy interrupt status = 0x%X.\n", u32temp); */
+			nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS, 1);
+			break;
+		}
+	}
+	if (!(u32temp & 1)) {
+		nes_debug(NES_DBG_PHY, "Phy is not responding. interrupt status = 0x%X.\n",
+				u32temp);
+		*data = 0xffff;
+	} else {
+		*data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
+	}
+}
+
+
+/**
+ * nes_write_10G_phy_reg
+ */
+void nes_write_10G_phy_reg(struct nes_device *nesdev, u16 phy_addr, u8 dev_addr, u16 phy_reg,
+		u16 data)
+{
+	u32 port_addr;
+	u32 u32temp;
+	u32 counter;
+
+	port_addr = phy_addr;
+
+	/* set address */
+	nes_write_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL,
+			0x00020000 | (u32)phy_reg | (((u32)dev_addr) << 18) | (((u32)port_addr) << 23));
+	for (counter = 0; counter < 100 ; counter++) {
+		udelay(30);
+		u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS);
+		if (u32temp & 1) {
+			nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS, 1);
+			break;
+		}
+	}
+	if (!(u32temp & 1))
+		nes_debug(NES_DBG_PHY, "Phy is not responding. interrupt status = 0x%X.\n",
+				u32temp);
+
+	/* set data */
+	nes_write_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL,
+			0x10020000 | (u32)data | (((u32)dev_addr) << 18) | (((u32)port_addr) << 23));
+	for (counter = 0; counter < 100 ; counter++) {
+		udelay(30);
+		u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS);
+		if (u32temp & 1) {
+			nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS, 1);
+			break;
+		}
+	}
+	if (!(u32temp & 1))
+		nes_debug(NES_DBG_PHY, "Phy is not responding. interrupt status = 0x%X.\n",
+				u32temp);
+}
+
+
+/**
+ * nes_read_10G_phy_reg
+ * This routine only issues the read, the data must be read
+ * separately.
+ */
+void nes_read_10G_phy_reg(struct nes_device *nesdev, u8 phy_addr, u8 dev_addr, u16 phy_reg)
+{
+	u32 port_addr;
+	u32 u32temp;
+	u32 counter;
+
+	port_addr = phy_addr;
+
+	/* set address */
+	nes_write_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL,
+			0x00020000 | (u32)phy_reg | (((u32)dev_addr) << 18) | (((u32)port_addr) << 23));
+	for (counter = 0; counter < 100 ; counter++) {
+		udelay(30);
+		u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS);
+		if (u32temp & 1) {
+			nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS, 1);
+			break;
+		}
+	}
+	if (!(u32temp & 1))
+		nes_debug(NES_DBG_PHY, "Phy is not responding. interrupt status = 0x%X.\n",
+				u32temp);
+
+	/* issue read */
+	nes_write_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL,
+			0x30020000 | (((u32)dev_addr) << 18) | (((u32)port_addr) << 23));
+	for (counter = 0; counter < 100 ; counter++) {
+		udelay(30);
+		u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS);
+		if (u32temp & 1) {
+			nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS, 1);
+			break;
+		}
+	}
+	if (!(u32temp & 1))
+		nes_debug(NES_DBG_PHY, "Phy is not responding. interrupt status = 0x%X.\n",
+				u32temp);
+}
+
+
+/**
+ * nes_get_cqp_request
+ */
+struct nes_cqp_request *nes_get_cqp_request(struct nes_device *nesdev)
+{
+	unsigned long flags;
+	struct nes_cqp_request *cqp_request = NULL;
+
+	if (!list_empty(&nesdev->cqp_avail_reqs)) {
+		spin_lock_irqsave(&nesdev->cqp.lock, flags);
+		if (!list_empty(&nesdev->cqp_avail_reqs)) {
+			cqp_request = list_entry(nesdev->cqp_avail_reqs.next,
+				struct nes_cqp_request, list);
+			list_del_init(&cqp_request->list);
+		}
+		spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
+	}
+	if (cqp_request == NULL) {
+		cqp_request = kzalloc(sizeof(struct nes_cqp_request), GFP_ATOMIC);
+		if (cqp_request) {
+			cqp_request->dynamic = 1;
+			INIT_LIST_HEAD(&cqp_request->list);
+		}
+	}
+
+	if (cqp_request) {
+		init_waitqueue_head(&cqp_request->waitq);
+		cqp_request->waiting = 0;
+		cqp_request->request_done = 0;
+		cqp_request->callback = 0;
+		init_waitqueue_head(&cqp_request->waitq);
+		nes_debug(NES_DBG_CQP, "Got cqp request %p from the available list \n",
+				cqp_request);
+	} else
+		printk(KERN_ERR PFX "%s: Could not allocated a CQP request.\n",
+			   __func__);
+
+	return cqp_request;
+}
+
+void nes_free_cqp_request(struct nes_device *nesdev,
+			  struct nes_cqp_request *cqp_request)
+{
+	unsigned long flags;
+
+	nes_debug(NES_DBG_CQP, "CQP request %p (opcode 0x%02X) freed.\n",
+		  cqp_request,
+		  le32_to_cpu(cqp_request->cqp_wqe.wqe_words[NES_CQP_WQE_OPCODE_IDX]) & 0x3f);
+
+	if (cqp_request->dynamic) {
+		kfree(cqp_request);
+	} else {
+		spin_lock_irqsave(&nesdev->cqp.lock, flags);
+		list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
+		spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
+	}
+}
+
+void nes_put_cqp_request(struct nes_device *nesdev,
+			 struct nes_cqp_request *cqp_request)
+{
+	if (atomic_dec_and_test(&cqp_request->refcount))
+		nes_free_cqp_request(nesdev, cqp_request);
+}
+
+
+/**
+ * nes_post_cqp_request
+ */
+void nes_post_cqp_request(struct nes_device *nesdev,
+			  struct nes_cqp_request *cqp_request)
+{
+	struct nes_hw_cqp_wqe *cqp_wqe;
+	unsigned long flags;
+	u32 cqp_head;
+	u64 u64temp;
+	u32 opcode;
+	int ctx_index = NES_CQP_WQE_COMP_CTX_LOW_IDX;
+
+	spin_lock_irqsave(&nesdev->cqp.lock, flags);
+
+	if (((((nesdev->cqp.sq_tail+(nesdev->cqp.sq_size*2))-nesdev->cqp.sq_head) &
+			(nesdev->cqp.sq_size - 1)) != 1)
+			&& (list_empty(&nesdev->cqp_pending_reqs))) {
+		cqp_head = nesdev->cqp.sq_head++;
+		nesdev->cqp.sq_head &= nesdev->cqp.sq_size-1;
+		cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
+		memcpy(cqp_wqe, &cqp_request->cqp_wqe, sizeof(*cqp_wqe));
+		opcode = le32_to_cpu(cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX]);
+		if ((opcode & NES_CQP_OPCODE_MASK) == NES_CQP_DOWNLOAD_SEGMENT)
+			ctx_index = NES_CQP_WQE_DL_COMP_CTX_LOW_IDX;
+		barrier();
+		u64temp = (unsigned long)cqp_request;
+		set_wqe_64bit_value(cqp_wqe->wqe_words, ctx_index, u64temp);
+		nes_debug(NES_DBG_CQP, "CQP request (opcode 0x%02X), line 1 = 0x%08X put on CQPs SQ,"
+			" request = %p, cqp_head = %u, cqp_tail = %u, cqp_size = %u,"
+			" waiting = %d, refcount = %d.\n",
+			opcode & NES_CQP_OPCODE_MASK,
+			le32_to_cpu(cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX]), cqp_request,
+			nesdev->cqp.sq_head, nesdev->cqp.sq_tail, nesdev->cqp.sq_size,
+			cqp_request->waiting, atomic_read(&cqp_request->refcount));
+
+		barrier();
+
+		/* Ring doorbell (1 WQEs) */
+		nes_write32(nesdev->regs+NES_WQE_ALLOC, 0x01800000 | nesdev->cqp.qp_id);
+
+		barrier();
+	} else {
+		nes_debug(NES_DBG_CQP, "CQP request %p (opcode 0x%02X), line 1 = 0x%08X"
+				" put on the pending queue.\n",
+				cqp_request,
+				le32_to_cpu(cqp_request->cqp_wqe.wqe_words[NES_CQP_WQE_OPCODE_IDX])&0x3f,
+				le32_to_cpu(cqp_request->cqp_wqe.wqe_words[NES_CQP_WQE_ID_IDX]));
+		list_add_tail(&cqp_request->list, &nesdev->cqp_pending_reqs);
+	}
+
+	spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
+
+	return;
+}
+
+/**
+ * nes_arp_table
+ */
+int nes_arp_table(struct nes_device *nesdev, u32 ip_addr, u8 *mac_addr, u32 action)
+{
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	int arp_index;
+	int err = 0;
+	__be32 tmp_addr;
+
+	for (arp_index = 0; (u32) arp_index < nesadapter->arp_table_size; arp_index++) {
+		if (nesadapter->arp_table[arp_index].ip_addr == ip_addr)
+			break;
+	}
+
+	if (action == NES_ARP_ADD) {
+		if (arp_index != nesadapter->arp_table_size) {
+			return -1;
+		}
+
+		arp_index = 0;
+		err = nes_alloc_resource(nesadapter, nesadapter->allocated_arps,
+				nesadapter->arp_table_size, (u32 *)&arp_index, &nesadapter->next_arp_index, NES_RESOURCE_ARP);
+		if (err) {
+			nes_debug(NES_DBG_NETDEV, "nes_alloc_resource returned error = %u\n", err);
+			return err;
+		}
+		nes_debug(NES_DBG_NETDEV, "ADD, arp_index=%d\n", arp_index);
+
+		nesadapter->arp_table[arp_index].ip_addr = ip_addr;
+		memcpy(nesadapter->arp_table[arp_index].mac_addr, mac_addr, ETH_ALEN);
+		return arp_index;
+	}
+
+	/* DELETE or RESOLVE */
+	if (arp_index == nesadapter->arp_table_size) {
+		tmp_addr = cpu_to_be32(ip_addr);
+		nes_debug(NES_DBG_NETDEV, "MAC for %pI4 not in ARP table - cannot %s\n",
+			  &tmp_addr, action == NES_ARP_RESOLVE ? "resolve" : "delete");
+		return -1;
+	}
+
+	if (action == NES_ARP_RESOLVE) {
+		nes_debug(NES_DBG_NETDEV, "RESOLVE, arp_index=%d\n", arp_index);
+		return arp_index;
+	}
+
+	if (action == NES_ARP_DELETE) {
+		nes_debug(NES_DBG_NETDEV, "DELETE, arp_index=%d\n", arp_index);
+		nesadapter->arp_table[arp_index].ip_addr = 0;
+		memset(nesadapter->arp_table[arp_index].mac_addr, 0x00, ETH_ALEN);
+		nes_free_resource(nesadapter, nesadapter->allocated_arps, arp_index);
+		return arp_index;
+	}
+
+	return -1;
+}
+
+
+/**
+ * nes_mh_fix
+ */
+void nes_mh_fix(unsigned long parm)
+{
+	unsigned long flags;
+	struct nes_device *nesdev = (struct nes_device *)parm;
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	struct nes_vnic *nesvnic;
+	u32 used_chunks_tx;
+	u32 temp_used_chunks_tx;
+	u32 temp_last_used_chunks_tx;
+	u32 used_chunks_mask;
+	u32 mac_tx_frames_low;
+	u32 mac_tx_frames_high;
+	u32 mac_tx_pauses;
+	u32 serdes_status;
+	u32 reset_value;
+	u32 tx_control;
+	u32 tx_config;
+	u32 tx_pause_quanta;
+	u32 rx_control;
+	u32 rx_config;
+	u32 mac_exact_match;
+	u32 mpp_debug;
+	u32 i=0;
+	u32 chunks_tx_progress = 0;
+
+	spin_lock_irqsave(&nesadapter->phy_lock, flags);
+	if ((nesadapter->mac_sw_state[0] != NES_MAC_SW_IDLE) || (nesadapter->mac_link_down[0])) {
+		spin_unlock_irqrestore(&nesadapter->phy_lock, flags);
+		goto no_mh_work;
+	}
+	nesadapter->mac_sw_state[0] = NES_MAC_SW_MH;
+	spin_unlock_irqrestore(&nesadapter->phy_lock, flags);
+	do {
+		mac_tx_frames_low = nes_read_indexed(nesdev, NES_IDX_MAC_TX_FRAMES_LOW);
+		mac_tx_frames_high = nes_read_indexed(nesdev, NES_IDX_MAC_TX_FRAMES_HIGH);
+		mac_tx_pauses = nes_read_indexed(nesdev, NES_IDX_MAC_TX_PAUSE_FRAMES);
+		used_chunks_tx = nes_read_indexed(nesdev, NES_IDX_USED_CHUNKS_TX);
+		nesdev->mac_pause_frames_sent += mac_tx_pauses;
+		used_chunks_mask = 0;
+		temp_used_chunks_tx = used_chunks_tx;
+		temp_last_used_chunks_tx = nesdev->last_used_chunks_tx;
+
+		if (nesdev->netdev[0]) {
+			nesvnic = netdev_priv(nesdev->netdev[0]);
+		} else {
+			break;
+		}
+
+		for (i=0; i<4; i++) {
+			used_chunks_mask <<= 8;
+			if (nesvnic->qp_nic_index[i] != 0xff) {
+				used_chunks_mask |= 0xff;
+				if ((temp_used_chunks_tx&0xff)<(temp_last_used_chunks_tx&0xff)) {
+					chunks_tx_progress = 1;
+				}
+			}
+			temp_used_chunks_tx >>= 8;
+			temp_last_used_chunks_tx >>= 8;
+		}
+		if ((mac_tx_frames_low) || (mac_tx_frames_high) ||
+			(!(used_chunks_tx&used_chunks_mask)) ||
+			(!(nesdev->last_used_chunks_tx&used_chunks_mask)) ||
+			(chunks_tx_progress) ) {
+			nesdev->last_used_chunks_tx = used_chunks_tx;
+			break;
+		}
+		nesdev->last_used_chunks_tx = used_chunks_tx;
+		barrier();
+
+		nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONTROL, 0x00000005);
+		mh_pauses_sent++;
+		mac_tx_pauses = nes_read_indexed(nesdev, NES_IDX_MAC_TX_PAUSE_FRAMES);
+		if (mac_tx_pauses) {
+			nesdev->mac_pause_frames_sent += mac_tx_pauses;
+			break;
+		}
+
+		tx_control = nes_read_indexed(nesdev, NES_IDX_MAC_TX_CONTROL);
+		tx_config = nes_read_indexed(nesdev, NES_IDX_MAC_TX_CONFIG);
+		tx_pause_quanta = nes_read_indexed(nesdev, NES_IDX_MAC_TX_PAUSE_QUANTA);
+		rx_control = nes_read_indexed(nesdev, NES_IDX_MAC_RX_CONTROL);
+		rx_config = nes_read_indexed(nesdev, NES_IDX_MAC_RX_CONFIG);
+		mac_exact_match = nes_read_indexed(nesdev, NES_IDX_MAC_EXACT_MATCH_BOTTOM);
+		mpp_debug = nes_read_indexed(nesdev, NES_IDX_MPP_DEBUG);
+
+		/* one last ditch effort to avoid a false positive */
+		mac_tx_pauses = nes_read_indexed(nesdev, NES_IDX_MAC_TX_PAUSE_FRAMES);
+		if (mac_tx_pauses) {
+			nesdev->last_mac_tx_pauses = nesdev->mac_pause_frames_sent;
+			nes_debug(NES_DBG_HW, "failsafe caught slow outbound pause\n");
+			break;
+		}
+		mh_detected++;
+
+		nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONTROL, 0x00000000);
+		nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONFIG, 0x00000000);
+		reset_value = nes_read32(nesdev->regs+NES_SOFTWARE_RESET);
+
+		nes_write32(nesdev->regs+NES_SOFTWARE_RESET, reset_value | 0x0000001d);
+
+		while (((nes_read32(nesdev->regs+NES_SOFTWARE_RESET)
+				& 0x00000040) != 0x00000040) && (i++ < 5000)) {
+			/* mdelay(1); */
+		}
+
+		nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0, 0x00000008);
+		serdes_status = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_STATUS0);
+
+		nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP0, 0x000bdef7);
+		nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_DRIVE0, 0x9ce73000);
+		nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_MODE0, 0x0ff00000);
+		nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_SIGDET0, 0x00000000);
+		nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_BYPASS0, 0x00000000);
+		nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_LOOPBACK_CONTROL0, 0x00000000);
+		if (nesadapter->OneG_Mode) {
+			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_EQ_CONTROL0, 0xf0182222);
+		} else {
+			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_EQ_CONTROL0, 0xf0042222);
+		}
+		serdes_status = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_RX_EQ_STATUS0);
+		nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000000ff);
+
+		nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONTROL, tx_control);
+		nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONFIG, tx_config);
+		nes_write_indexed(nesdev, NES_IDX_MAC_TX_PAUSE_QUANTA, tx_pause_quanta);
+		nes_write_indexed(nesdev, NES_IDX_MAC_RX_CONTROL, rx_control);
+		nes_write_indexed(nesdev, NES_IDX_MAC_RX_CONFIG, rx_config);
+		nes_write_indexed(nesdev, NES_IDX_MAC_EXACT_MATCH_BOTTOM, mac_exact_match);
+		nes_write_indexed(nesdev, NES_IDX_MPP_DEBUG, mpp_debug);
+
+	} while (0);
+
+	nesadapter->mac_sw_state[0] = NES_MAC_SW_IDLE;
+no_mh_work:
+	nesdev->nesadapter->mh_timer.expires = jiffies + (HZ/5);
+	add_timer(&nesdev->nesadapter->mh_timer);
+}
+
+/**
+ * nes_clc
+ */
+void nes_clc(unsigned long parm)
+{
+	unsigned long flags;
+	struct nes_device *nesdev = (struct nes_device *)parm;
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+
+	spin_lock_irqsave(&nesadapter->phy_lock, flags);
+    nesadapter->link_interrupt_count[0] = 0;
+    nesadapter->link_interrupt_count[1] = 0;
+    nesadapter->link_interrupt_count[2] = 0;
+    nesadapter->link_interrupt_count[3] = 0;
+	spin_unlock_irqrestore(&nesadapter->phy_lock, flags);
+
+	nesadapter->lc_timer.expires = jiffies + 3600 * HZ;  /* 1 hour */
+	add_timer(&nesadapter->lc_timer);
+}
+
+
+/**
+ * nes_dump_mem
+ */
+void nes_dump_mem(unsigned int dump_debug_level, void *addr, int length)
+{
+	char  xlate[] = {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
+		'a', 'b', 'c', 'd', 'e', 'f'};
+	char  *ptr;
+	char  hex_buf[80];
+	char  ascii_buf[20];
+	int   num_char;
+	int   num_ascii;
+	int   num_hex;
+
+	if (!(nes_debug_level & dump_debug_level)) {
+		return;
+	}
+
+	ptr = addr;
+	if (length > 0x100) {
+		nes_debug(dump_debug_level, "Length truncated from %x to %x\n", length, 0x100);
+		length = 0x100;
+	}
+	nes_debug(dump_debug_level, "Address=0x%p, length=0x%x (%d)\n", ptr, length, length);
+
+	memset(ascii_buf, 0, 20);
+	memset(hex_buf, 0, 80);
+
+	num_ascii = 0;
+	num_hex = 0;
+	for (num_char = 0; num_char < length; num_char++) {
+		if (num_ascii == 8) {
+			ascii_buf[num_ascii++] = ' ';
+			hex_buf[num_hex++] = '-';
+			hex_buf[num_hex++] = ' ';
+		}
+
+		if (*ptr < 0x20 || *ptr > 0x7e)
+			ascii_buf[num_ascii++] = '.';
+		else
+			ascii_buf[num_ascii++] = *ptr;
+		hex_buf[num_hex++] = xlate[((*ptr & 0xf0) >> 4)];
+		hex_buf[num_hex++] = xlate[*ptr & 0x0f];
+		hex_buf[num_hex++] = ' ';
+		ptr++;
+
+		if (num_ascii >= 17) {
+			/* output line and reset */
+			nes_debug(dump_debug_level, "   %s |  %s\n", hex_buf, ascii_buf);
+			memset(ascii_buf, 0, 20);
+			memset(hex_buf, 0, 80);
+			num_ascii = 0;
+			num_hex = 0;
+		}
+	}
+
+	/* output the rest */
+	if (num_ascii) {
+		while (num_ascii < 17) {
+			if (num_ascii == 8) {
+				hex_buf[num_hex++] = ' ';
+				hex_buf[num_hex++] = ' ';
+			}
+			hex_buf[num_hex++] = ' ';
+			hex_buf[num_hex++] = ' ';
+			hex_buf[num_hex++] = ' ';
+			num_ascii++;
+		}
+
+		nes_debug(dump_debug_level, "   %s |  %s\n", hex_buf, ascii_buf);
+	}
+}
diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c
new file mode 100644
index 0000000..fef067c
--- /dev/null
+++ b/drivers/infiniband/hw/nes/nes_verbs.c
@@ -0,0 +1,4054 @@
+/*
+ * Copyright (c) 2006 - 2011 Intel Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/random.h>
+#include <linux/highmem.h>
+#include <linux/slab.h>
+#include <asm/byteorder.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/iw_cm.h>
+#include <rdma/ib_user_verbs.h>
+
+#include "nes.h"
+
+#include <rdma/ib_umem.h>
+
+atomic_t mod_qp_timouts;
+atomic_t qps_created;
+atomic_t sw_qps_destroyed;
+
+static void nes_unregister_ofa_device(struct nes_ib_device *nesibdev);
+
+/**
+ * nes_alloc_mw
+ */
+static struct ib_mw *nes_alloc_mw(struct ib_pd *ibpd, enum ib_mw_type type)
+{
+	struct nes_pd *nespd = to_nespd(ibpd);
+	struct nes_vnic *nesvnic = to_nesvnic(ibpd->device);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	struct nes_cqp_request *cqp_request;
+	struct nes_mr *nesmr;
+	struct ib_mw *ibmw;
+	struct nes_hw_cqp_wqe *cqp_wqe;
+	int ret;
+	u32 stag;
+	u32 stag_index = 0;
+	u32 next_stag_index = 0;
+	u32 driver_key = 0;
+	u8 stag_key = 0;
+
+	if (type != IB_MW_TYPE_1)
+		return ERR_PTR(-EINVAL);
+
+	get_random_bytes(&next_stag_index, sizeof(next_stag_index));
+	stag_key = (u8)next_stag_index;
+
+	driver_key = 0;
+
+	next_stag_index >>= 8;
+	next_stag_index %= nesadapter->max_mr;
+
+	ret = nes_alloc_resource(nesadapter, nesadapter->allocated_mrs,
+			nesadapter->max_mr, &stag_index, &next_stag_index, NES_RESOURCE_MW);
+	if (ret) {
+		return ERR_PTR(ret);
+	}
+
+	nesmr = kzalloc(sizeof(*nesmr), GFP_KERNEL);
+	if (!nesmr) {
+		nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	stag = stag_index << 8;
+	stag |= driver_key;
+	stag += (u32)stag_key;
+
+	nes_debug(NES_DBG_MR, "Registering STag 0x%08X, index = 0x%08X\n",
+			stag, stag_index);
+
+	/* Register the region with the adapter */
+	cqp_request = nes_get_cqp_request(nesdev);
+	if (cqp_request == NULL) {
+		kfree(nesmr);
+		nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	cqp_request->waiting = 1;
+	cqp_wqe = &cqp_request->cqp_wqe;
+
+	cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] =
+			cpu_to_le32( NES_CQP_ALLOCATE_STAG | NES_CQP_STAG_RIGHTS_REMOTE_READ |
+			NES_CQP_STAG_RIGHTS_REMOTE_WRITE | NES_CQP_STAG_VA_TO |
+			NES_CQP_STAG_REM_ACC_EN);
+
+	nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_LEN_HIGH_PD_IDX, (nespd->pd_id & 0x00007fff));
+	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_STAG_IDX, stag);
+
+	atomic_set(&cqp_request->refcount, 2);
+	nes_post_cqp_request(nesdev, cqp_request);
+
+	/* Wait for CQP */
+	ret = wait_event_timeout(cqp_request->waitq, (cqp_request->request_done != 0),
+			NES_EVENT_TIMEOUT);
+	nes_debug(NES_DBG_MR, "Register STag 0x%08X completed, wait_event_timeout ret = %u,"
+			" CQP Major:Minor codes = 0x%04X:0x%04X.\n",
+			stag, ret, cqp_request->major_code, cqp_request->minor_code);
+	if ((!ret) || (cqp_request->major_code)) {
+		nes_put_cqp_request(nesdev, cqp_request);
+		kfree(nesmr);
+		nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
+		if (!ret) {
+			return ERR_PTR(-ETIME);
+		} else {
+			return ERR_PTR(-ENOMEM);
+		}
+	}
+	nes_put_cqp_request(nesdev, cqp_request);
+
+	nesmr->ibmw.rkey = stag;
+	nesmr->mode = IWNES_MEMREG_TYPE_MW;
+	ibmw = &nesmr->ibmw;
+	nesmr->pbl_4k = 0;
+	nesmr->pbls_used = 0;
+
+	return ibmw;
+}
+
+
+/**
+ * nes_dealloc_mw
+ */
+static int nes_dealloc_mw(struct ib_mw *ibmw)
+{
+	struct nes_mr *nesmr = to_nesmw(ibmw);
+	struct nes_vnic *nesvnic = to_nesvnic(ibmw->device);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	struct nes_hw_cqp_wqe *cqp_wqe;
+	struct nes_cqp_request *cqp_request;
+	int err = 0;
+	int ret;
+
+	/* Deallocate the window with the adapter */
+	cqp_request = nes_get_cqp_request(nesdev);
+	if (cqp_request == NULL) {
+		nes_debug(NES_DBG_MR, "Failed to get a cqp_request.\n");
+		return -ENOMEM;
+	}
+	cqp_request->waiting = 1;
+	cqp_wqe = &cqp_request->cqp_wqe;
+	nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, NES_CQP_DEALLOCATE_STAG);
+	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_STAG_IDX, ibmw->rkey);
+
+	atomic_set(&cqp_request->refcount, 2);
+	nes_post_cqp_request(nesdev, cqp_request);
+
+	/* Wait for CQP */
+	nes_debug(NES_DBG_MR, "Waiting for deallocate STag 0x%08X to complete.\n",
+			ibmw->rkey);
+	ret = wait_event_timeout(cqp_request->waitq, (0 != cqp_request->request_done),
+			NES_EVENT_TIMEOUT);
+	nes_debug(NES_DBG_MR, "Deallocate STag completed, wait_event_timeout ret = %u,"
+			" CQP Major:Minor codes = 0x%04X:0x%04X.\n",
+			ret, cqp_request->major_code, cqp_request->minor_code);
+	if (!ret)
+		err = -ETIME;
+	else if (cqp_request->major_code)
+		err = -EIO;
+
+	nes_put_cqp_request(nesdev, cqp_request);
+
+	nes_free_resource(nesadapter, nesadapter->allocated_mrs,
+			(ibmw->rkey & 0x0fffff00) >> 8);
+	kfree(nesmr);
+
+	return err;
+}
+
+
+/**
+ * nes_bind_mw
+ */
+static int nes_bind_mw(struct ib_qp *ibqp, struct ib_mw *ibmw,
+		struct ib_mw_bind *ibmw_bind)
+{
+	u64 u64temp;
+	struct nes_vnic *nesvnic = to_nesvnic(ibqp->device);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	/* struct nes_mr *nesmr = to_nesmw(ibmw); */
+	struct nes_qp *nesqp = to_nesqp(ibqp);
+	struct nes_hw_qp_wqe *wqe;
+	unsigned long flags = 0;
+	u32 head;
+	u32 wqe_misc = 0;
+	u32 qsize;
+
+	if (nesqp->ibqp_state > IB_QPS_RTS)
+		return -EINVAL;
+
+	spin_lock_irqsave(&nesqp->lock, flags);
+
+	head = nesqp->hwqp.sq_head;
+	qsize = nesqp->hwqp.sq_tail;
+
+	/* Check for SQ overflow */
+	if (((head + (2 * qsize) - nesqp->hwqp.sq_tail) % qsize) == (qsize - 1)) {
+		spin_unlock_irqrestore(&nesqp->lock, flags);
+		return -ENOMEM;
+	}
+
+	wqe = &nesqp->hwqp.sq_vbase[head];
+	/* nes_debug(NES_DBG_MR, "processing sq wqe at %p, head = %u.\n", wqe, head); */
+	nes_fill_init_qp_wqe(wqe, nesqp, head);
+	u64temp = ibmw_bind->wr_id;
+	set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_COMP_SCRATCH_LOW_IDX, u64temp);
+	wqe_misc = NES_IWARP_SQ_OP_BIND;
+
+	wqe_misc |= NES_IWARP_SQ_WQE_LOCAL_FENCE;
+
+	if (ibmw_bind->send_flags & IB_SEND_SIGNALED)
+		wqe_misc |= NES_IWARP_SQ_WQE_SIGNALED_COMPL;
+
+	if (ibmw_bind->bind_info.mw_access_flags & IB_ACCESS_REMOTE_WRITE)
+		wqe_misc |= NES_CQP_STAG_RIGHTS_REMOTE_WRITE;
+	if (ibmw_bind->bind_info.mw_access_flags & IB_ACCESS_REMOTE_READ)
+		wqe_misc |= NES_CQP_STAG_RIGHTS_REMOTE_READ;
+
+	set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_MISC_IDX, wqe_misc);
+	set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_BIND_WQE_MR_IDX,
+			    ibmw_bind->bind_info.mr->lkey);
+	set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_BIND_WQE_MW_IDX, ibmw->rkey);
+	set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_BIND_WQE_LENGTH_LOW_IDX,
+			ibmw_bind->bind_info.length);
+	wqe->wqe_words[NES_IWARP_SQ_BIND_WQE_LENGTH_HIGH_IDX] = 0;
+	u64temp = (u64)ibmw_bind->bind_info.addr;
+	set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_BIND_WQE_VA_FBO_LOW_IDX, u64temp);
+
+	head++;
+	if (head >= qsize)
+		head = 0;
+
+	nesqp->hwqp.sq_head = head;
+	barrier();
+
+	nes_write32(nesdev->regs+NES_WQE_ALLOC,
+			(1 << 24) | 0x00800000 | nesqp->hwqp.qp_id);
+
+	spin_unlock_irqrestore(&nesqp->lock, flags);
+
+	return 0;
+}
+
+
+/*
+ * nes_alloc_fast_mr
+ */
+static int alloc_fast_reg_mr(struct nes_device *nesdev, struct nes_pd *nespd,
+			     u32 stag, u32 page_count)
+{
+	struct nes_hw_cqp_wqe *cqp_wqe;
+	struct nes_cqp_request *cqp_request;
+	unsigned long flags;
+	int ret;
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	u32 opcode = 0;
+	u16 major_code;
+	u64 region_length = page_count * PAGE_SIZE;
+
+
+	cqp_request = nes_get_cqp_request(nesdev);
+	if (cqp_request == NULL) {
+		nes_debug(NES_DBG_MR, "Failed to get a cqp_request.\n");
+		return -ENOMEM;
+	}
+	nes_debug(NES_DBG_MR, "alloc_fast_reg_mr: page_count = %d, "
+			      "region_length = %llu\n",
+			      page_count, region_length);
+	cqp_request->waiting = 1;
+	cqp_wqe = &cqp_request->cqp_wqe;
+
+	spin_lock_irqsave(&nesadapter->pbl_lock, flags);
+	if (nesadapter->free_4kpbl > 0) {
+		nesadapter->free_4kpbl--;
+		spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+	} else {
+		/* No 4kpbl's available: */
+		spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+		nes_debug(NES_DBG_MR, "Out of Pbls\n");
+		nes_free_cqp_request(nesdev, cqp_request);
+		return -ENOMEM;
+	}
+
+	opcode = NES_CQP_ALLOCATE_STAG | NES_CQP_STAG_MR |
+		 NES_CQP_STAG_PBL_BLK_SIZE | NES_CQP_STAG_VA_TO |
+		 NES_CQP_STAG_REM_ACC_EN;
+	/*
+	 * The current OFED API does not support the zero based TO option.
+	 * If added then need to changed the NES_CQP_STAG_VA* option.  Also,
+	 * the API does not support that ability to have the MR set for local
+	 * access only when created and not allow the SQ op to override. Given
+	 * this the remote enable must be set here.
+	 */
+
+	nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, opcode);
+	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PBL_BLK_COUNT_IDX, 1);
+
+	cqp_wqe->wqe_words[NES_CQP_STAG_WQE_LEN_HIGH_PD_IDX] =
+			cpu_to_le32((u32)(region_length >> 8) & 0xff000000);
+	cqp_wqe->wqe_words[NES_CQP_STAG_WQE_LEN_HIGH_PD_IDX] |=
+			cpu_to_le32(nespd->pd_id & 0x00007fff);
+
+	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_STAG_IDX, stag);
+	set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_VA_LOW_IDX, 0);
+	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_LEN_LOW_IDX, 0);
+	set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PA_LOW_IDX, 0);
+	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PBL_LEN_IDX, (page_count * 8));
+	cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] |= cpu_to_le32(NES_CQP_STAG_PBL_BLK_SIZE);
+	barrier();
+
+	atomic_set(&cqp_request->refcount, 2);
+	nes_post_cqp_request(nesdev, cqp_request);
+
+	/* Wait for CQP */
+	ret = wait_event_timeout(cqp_request->waitq,
+				 (0 != cqp_request->request_done),
+				 NES_EVENT_TIMEOUT);
+
+	nes_debug(NES_DBG_MR, "Allocate STag 0x%08X completed, "
+		  "wait_event_timeout ret = %u, CQP Major:Minor codes = "
+		  "0x%04X:0x%04X.\n", stag, ret, cqp_request->major_code,
+		  cqp_request->minor_code);
+	major_code = cqp_request->major_code;
+	nes_put_cqp_request(nesdev, cqp_request);
+
+	if (!ret || major_code) {
+		spin_lock_irqsave(&nesadapter->pbl_lock, flags);
+		nesadapter->free_4kpbl++;
+		spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+	}
+
+	if (!ret)
+		return -ETIME;
+	else if (major_code)
+		return -EIO;
+	return 0;
+}
+
+/*
+ * nes_alloc_fast_reg_mr
+ */
+static struct ib_mr *nes_alloc_fast_reg_mr(struct ib_pd *ibpd, int max_page_list_len)
+{
+	struct nes_pd *nespd = to_nespd(ibpd);
+	struct nes_vnic *nesvnic = to_nesvnic(ibpd->device);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+
+	u32 next_stag_index;
+	u8 stag_key = 0;
+	u32 driver_key = 0;
+	int err = 0;
+	u32 stag_index = 0;
+	struct nes_mr *nesmr;
+	u32 stag;
+	int ret;
+	struct ib_mr *ibmr;
+/*
+ * Note:  Set to always use a fixed length single page entry PBL.  This is to allow
+ *	 for the fast_reg_mr operation to always know the size of the PBL.
+ */
+	if (max_page_list_len > (NES_4K_PBL_CHUNK_SIZE / sizeof(u64)))
+		return ERR_PTR(-E2BIG);
+
+	get_random_bytes(&next_stag_index, sizeof(next_stag_index));
+	stag_key = (u8)next_stag_index;
+	next_stag_index >>= 8;
+	next_stag_index %= nesadapter->max_mr;
+
+	err = nes_alloc_resource(nesadapter, nesadapter->allocated_mrs,
+				 nesadapter->max_mr, &stag_index,
+				 &next_stag_index, NES_RESOURCE_FAST_MR);
+	if (err)
+		return ERR_PTR(err);
+
+	nesmr = kzalloc(sizeof(*nesmr), GFP_KERNEL);
+	if (!nesmr) {
+		nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	stag = stag_index << 8;
+	stag |= driver_key;
+	stag += (u32)stag_key;
+
+	nes_debug(NES_DBG_MR, "Allocating STag 0x%08X index = 0x%08X\n",
+		  stag, stag_index);
+
+	ret = alloc_fast_reg_mr(nesdev, nespd, stag, max_page_list_len);
+
+	if (ret == 0) {
+		nesmr->ibmr.rkey = stag;
+		nesmr->ibmr.lkey = stag;
+		nesmr->mode = IWNES_MEMREG_TYPE_FMEM;
+		ibmr = &nesmr->ibmr;
+	} else {
+		kfree(nesmr);
+		nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
+		ibmr = ERR_PTR(-ENOMEM);
+	}
+	return ibmr;
+}
+
+/*
+ * nes_alloc_fast_reg_page_list
+ */
+static struct ib_fast_reg_page_list *nes_alloc_fast_reg_page_list(
+							struct ib_device *ibdev,
+							int page_list_len)
+{
+	struct nes_vnic *nesvnic = to_nesvnic(ibdev);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	struct ib_fast_reg_page_list *pifrpl;
+	struct nes_ib_fast_reg_page_list *pnesfrpl;
+
+	if (page_list_len > (NES_4K_PBL_CHUNK_SIZE / sizeof(u64)))
+		return ERR_PTR(-E2BIG);
+	/*
+	 * Allocate the ib_fast_reg_page_list structure, the
+	 * nes_fast_bpl structure, and the PLB table.
+	 */
+	pnesfrpl = kmalloc(sizeof(struct nes_ib_fast_reg_page_list) +
+			   page_list_len * sizeof(u64), GFP_KERNEL);
+
+	if (!pnesfrpl)
+		return ERR_PTR(-ENOMEM);
+
+	pifrpl = &pnesfrpl->ibfrpl;
+	pifrpl->page_list = &pnesfrpl->pbl;
+	pifrpl->max_page_list_len = page_list_len;
+	/*
+	 * Allocate the WQE PBL
+	 */
+	pnesfrpl->nes_wqe_pbl.kva = pci_alloc_consistent(nesdev->pcidev,
+							 page_list_len * sizeof(u64),
+							 &pnesfrpl->nes_wqe_pbl.paddr);
+
+	if (!pnesfrpl->nes_wqe_pbl.kva) {
+		kfree(pnesfrpl);
+		return ERR_PTR(-ENOMEM);
+	}
+	nes_debug(NES_DBG_MR, "nes_alloc_fast_reg_pbl: nes_frpl = %p, "
+		  "ibfrpl = %p, ibfrpl.page_list = %p, pbl.kva = %p, "
+		  "pbl.paddr = %llx\n", pnesfrpl, &pnesfrpl->ibfrpl,
+		  pnesfrpl->ibfrpl.page_list, pnesfrpl->nes_wqe_pbl.kva,
+		  (unsigned long long) pnesfrpl->nes_wqe_pbl.paddr);
+
+	return pifrpl;
+}
+
+/*
+ * nes_free_fast_reg_page_list
+ */
+static void nes_free_fast_reg_page_list(struct ib_fast_reg_page_list *pifrpl)
+{
+	struct nes_vnic *nesvnic = to_nesvnic(pifrpl->device);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	struct nes_ib_fast_reg_page_list *pnesfrpl;
+
+	pnesfrpl = container_of(pifrpl, struct nes_ib_fast_reg_page_list, ibfrpl);
+	/*
+	 * Free the WQE PBL.
+	 */
+	pci_free_consistent(nesdev->pcidev,
+			    pifrpl->max_page_list_len * sizeof(u64),
+			    pnesfrpl->nes_wqe_pbl.kva,
+			    pnesfrpl->nes_wqe_pbl.paddr);
+	/*
+	 * Free the PBL structure
+	 */
+	kfree(pnesfrpl);
+}
+
+/**
+ * nes_query_device
+ */
+static int nes_query_device(struct ib_device *ibdev, struct ib_device_attr *props)
+{
+	struct nes_vnic *nesvnic = to_nesvnic(ibdev);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	struct nes_ib_device *nesibdev = nesvnic->nesibdev;
+
+	memset(props, 0, sizeof(*props));
+	memcpy(&props->sys_image_guid, nesvnic->netdev->dev_addr, 6);
+
+	props->fw_ver = nesdev->nesadapter->firmware_version;
+	props->device_cap_flags = nesdev->nesadapter->device_cap_flags;
+	props->vendor_id = nesdev->nesadapter->vendor_id;
+	props->vendor_part_id = nesdev->nesadapter->vendor_part_id;
+	props->hw_ver = nesdev->nesadapter->hw_rev;
+	props->max_mr_size = 0x80000000;
+	props->max_qp = nesibdev->max_qp;
+	props->max_qp_wr = nesdev->nesadapter->max_qp_wr - 2;
+	props->max_sge = nesdev->nesadapter->max_sge;
+	props->max_cq = nesibdev->max_cq;
+	props->max_cqe = nesdev->nesadapter->max_cqe;
+	props->max_mr = nesibdev->max_mr;
+	props->max_mw = nesibdev->max_mr;
+	props->max_pd = nesibdev->max_pd;
+	props->max_sge_rd = 1;
+	switch (nesdev->nesadapter->max_irrq_wr) {
+		case 0:
+			props->max_qp_rd_atom = 2;
+			break;
+		case 1:
+			props->max_qp_rd_atom = 8;
+			break;
+		case 2:
+			props->max_qp_rd_atom = 32;
+			break;
+		case 3:
+			props->max_qp_rd_atom = 64;
+			break;
+		default:
+			props->max_qp_rd_atom = 0;
+	}
+	props->max_qp_init_rd_atom = props->max_qp_rd_atom;
+	props->atomic_cap = IB_ATOMIC_NONE;
+	props->max_map_per_fmr = 1;
+
+	return 0;
+}
+
+
+/**
+ * nes_query_port
+ */
+static int nes_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props)
+{
+	struct nes_vnic *nesvnic = to_nesvnic(ibdev);
+	struct net_device *netdev = nesvnic->netdev;
+
+	memset(props, 0, sizeof(*props));
+
+	props->max_mtu = IB_MTU_4096;
+
+	if (netdev->mtu  >= 4096)
+		props->active_mtu = IB_MTU_4096;
+	else if (netdev->mtu  >= 2048)
+		props->active_mtu = IB_MTU_2048;
+	else if (netdev->mtu  >= 1024)
+		props->active_mtu = IB_MTU_1024;
+	else if (netdev->mtu  >= 512)
+		props->active_mtu = IB_MTU_512;
+	else
+		props->active_mtu = IB_MTU_256;
+
+	props->lid = 1;
+	props->lmc = 0;
+	props->sm_lid = 0;
+	props->sm_sl = 0;
+	if (netif_queue_stopped(netdev))
+		props->state = IB_PORT_DOWN;
+	else if (nesvnic->linkup)
+		props->state = IB_PORT_ACTIVE;
+	else
+		props->state = IB_PORT_DOWN;
+	props->phys_state = 0;
+	props->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_REINIT_SUP |
+			IB_PORT_VENDOR_CLASS_SUP | IB_PORT_BOOT_MGMT_SUP;
+	props->gid_tbl_len = 1;
+	props->pkey_tbl_len = 1;
+	props->qkey_viol_cntr = 0;
+	props->active_width = IB_WIDTH_4X;
+	props->active_speed = IB_SPEED_SDR;
+	props->max_msg_sz = 0x80000000;
+
+	return 0;
+}
+
+
+/**
+ * nes_query_pkey
+ */
+static int nes_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey)
+{
+	*pkey = 0;
+	return 0;
+}
+
+
+/**
+ * nes_query_gid
+ */
+static int nes_query_gid(struct ib_device *ibdev, u8 port,
+		int index, union ib_gid *gid)
+{
+	struct nes_vnic *nesvnic = to_nesvnic(ibdev);
+
+	memset(&(gid->raw[0]), 0, sizeof(gid->raw));
+	memcpy(&(gid->raw[0]), nesvnic->netdev->dev_addr, 6);
+
+	return 0;
+}
+
+
+/**
+ * nes_alloc_ucontext - Allocate the user context data structure. This keeps track
+ * of all objects associated with a particular user-mode client.
+ */
+static struct ib_ucontext *nes_alloc_ucontext(struct ib_device *ibdev,
+		struct ib_udata *udata)
+{
+	struct nes_vnic *nesvnic = to_nesvnic(ibdev);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	struct nes_alloc_ucontext_req req;
+	struct nes_alloc_ucontext_resp uresp;
+	struct nes_ucontext *nes_ucontext;
+	struct nes_ib_device *nesibdev = nesvnic->nesibdev;
+
+
+	if (ib_copy_from_udata(&req, udata, sizeof(struct nes_alloc_ucontext_req))) {
+		printk(KERN_ERR PFX "Invalid structure size on allocate user context.\n");
+		return ERR_PTR(-EINVAL);
+	}
+
+	if (req.userspace_ver != NES_ABI_USERSPACE_VER) {
+		printk(KERN_ERR PFX "Invalid userspace driver version detected. Detected version %d, should be %d\n",
+			req.userspace_ver, NES_ABI_USERSPACE_VER);
+		return ERR_PTR(-EINVAL);
+	}
+
+
+	memset(&uresp, 0, sizeof uresp);
+
+	uresp.max_qps = nesibdev->max_qp;
+	uresp.max_pds = nesibdev->max_pd;
+	uresp.wq_size = nesdev->nesadapter->max_qp_wr * 2;
+	uresp.virtwq = nesadapter->virtwq;
+	uresp.kernel_ver = NES_ABI_KERNEL_VER;
+
+	nes_ucontext = kzalloc(sizeof *nes_ucontext, GFP_KERNEL);
+	if (!nes_ucontext)
+		return ERR_PTR(-ENOMEM);
+
+	nes_ucontext->nesdev = nesdev;
+	nes_ucontext->mmap_wq_offset = uresp.max_pds;
+	nes_ucontext->mmap_cq_offset = nes_ucontext->mmap_wq_offset +
+			((sizeof(struct nes_hw_qp_wqe) * uresp.max_qps * 2) + PAGE_SIZE-1) /
+			PAGE_SIZE;
+
+
+	if (ib_copy_to_udata(udata, &uresp, sizeof uresp)) {
+		kfree(nes_ucontext);
+		return ERR_PTR(-EFAULT);
+	}
+
+	INIT_LIST_HEAD(&nes_ucontext->cq_reg_mem_list);
+	INIT_LIST_HEAD(&nes_ucontext->qp_reg_mem_list);
+	atomic_set(&nes_ucontext->usecnt, 1);
+	return &nes_ucontext->ibucontext;
+}
+
+
+/**
+ * nes_dealloc_ucontext
+ */
+static int nes_dealloc_ucontext(struct ib_ucontext *context)
+{
+	/* struct nes_vnic *nesvnic = to_nesvnic(context->device); */
+	/* struct nes_device *nesdev = nesvnic->nesdev; */
+	struct nes_ucontext *nes_ucontext = to_nesucontext(context);
+
+	if (!atomic_dec_and_test(&nes_ucontext->usecnt))
+	  return 0;
+	kfree(nes_ucontext);
+	return 0;
+}
+
+
+/**
+ * nes_mmap
+ */
+static int nes_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
+{
+	unsigned long index;
+	struct nes_vnic *nesvnic = to_nesvnic(context->device);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	/* struct nes_adapter *nesadapter = nesdev->nesadapter; */
+	struct nes_ucontext *nes_ucontext;
+	struct nes_qp *nesqp;
+
+	nes_ucontext = to_nesucontext(context);
+
+
+	if (vma->vm_pgoff >= nes_ucontext->mmap_wq_offset) {
+		index = (vma->vm_pgoff - nes_ucontext->mmap_wq_offset) * PAGE_SIZE;
+		index /= ((sizeof(struct nes_hw_qp_wqe) * nesdev->nesadapter->max_qp_wr * 2) +
+				PAGE_SIZE-1) & (~(PAGE_SIZE-1));
+		if (!test_bit(index, nes_ucontext->allocated_wqs)) {
+			nes_debug(NES_DBG_MMAP, "wq %lu not allocated\n", index);
+			return -EFAULT;
+		}
+		nesqp = nes_ucontext->mmap_nesqp[index];
+		if (nesqp == NULL) {
+			nes_debug(NES_DBG_MMAP, "wq %lu has a NULL QP base.\n", index);
+			return -EFAULT;
+		}
+		if (remap_pfn_range(vma, vma->vm_start,
+				virt_to_phys(nesqp->hwqp.sq_vbase) >> PAGE_SHIFT,
+				vma->vm_end - vma->vm_start,
+				vma->vm_page_prot)) {
+			nes_debug(NES_DBG_MMAP, "remap_pfn_range failed.\n");
+			return -EAGAIN;
+		}
+		vma->vm_private_data = nesqp;
+		return 0;
+	} else {
+		index = vma->vm_pgoff;
+		if (!test_bit(index, nes_ucontext->allocated_doorbells))
+			return -EFAULT;
+
+		vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+		if (io_remap_pfn_range(vma, vma->vm_start,
+				(nesdev->doorbell_start +
+				((nes_ucontext->mmap_db_index[index] - nesdev->base_doorbell_index) * 4096))
+				>> PAGE_SHIFT, PAGE_SIZE, vma->vm_page_prot))
+			return -EAGAIN;
+		vma->vm_private_data = nes_ucontext;
+		return 0;
+	}
+
+	return -ENOSYS;
+}
+
+
+/**
+ * nes_alloc_pd
+ */
+static struct ib_pd *nes_alloc_pd(struct ib_device *ibdev,
+		struct ib_ucontext *context, struct ib_udata *udata)
+{
+	struct nes_pd *nespd;
+	struct nes_vnic *nesvnic = to_nesvnic(ibdev);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	struct nes_ucontext *nesucontext;
+	struct nes_alloc_pd_resp uresp;
+	u32 pd_num = 0;
+	int err;
+
+	nes_debug(NES_DBG_PD, "nesvnic=%p, netdev=%p %s, ibdev=%p, context=%p, netdev refcnt=%u\n",
+			nesvnic, nesdev->netdev[0], nesdev->netdev[0]->name, ibdev, context,
+			netdev_refcnt_read(nesvnic->netdev));
+
+	err = nes_alloc_resource(nesadapter, nesadapter->allocated_pds,
+			nesadapter->max_pd, &pd_num, &nesadapter->next_pd, NES_RESOURCE_PD);
+	if (err) {
+		return ERR_PTR(err);
+	}
+
+	nespd = kzalloc(sizeof (struct nes_pd), GFP_KERNEL);
+	if (!nespd) {
+		nes_free_resource(nesadapter, nesadapter->allocated_pds, pd_num);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	nes_debug(NES_DBG_PD, "Allocating PD (%p) for ib device %s\n",
+			nespd, nesvnic->nesibdev->ibdev.name);
+
+	nespd->pd_id = (pd_num << (PAGE_SHIFT-12)) + nesadapter->base_pd;
+
+	if (context) {
+		nesucontext = to_nesucontext(context);
+		nespd->mmap_db_index = find_next_zero_bit(nesucontext->allocated_doorbells,
+				NES_MAX_USER_DB_REGIONS, nesucontext->first_free_db);
+		nes_debug(NES_DBG_PD, "find_first_zero_biton doorbells returned %u, mapping pd_id %u.\n",
+				nespd->mmap_db_index, nespd->pd_id);
+		if (nespd->mmap_db_index >= NES_MAX_USER_DB_REGIONS) {
+			nes_debug(NES_DBG_PD, "mmap_db_index > MAX\n");
+			nes_free_resource(nesadapter, nesadapter->allocated_pds, pd_num);
+			kfree(nespd);
+			return ERR_PTR(-ENOMEM);
+		}
+
+		uresp.pd_id = nespd->pd_id;
+		uresp.mmap_db_index = nespd->mmap_db_index;
+		if (ib_copy_to_udata(udata, &uresp, sizeof (struct nes_alloc_pd_resp))) {
+			nes_free_resource(nesadapter, nesadapter->allocated_pds, pd_num);
+			kfree(nespd);
+			return ERR_PTR(-EFAULT);
+		}
+
+		set_bit(nespd->mmap_db_index, nesucontext->allocated_doorbells);
+		nesucontext->mmap_db_index[nespd->mmap_db_index] = nespd->pd_id;
+		nesucontext->first_free_db = nespd->mmap_db_index + 1;
+	}
+
+	nes_debug(NES_DBG_PD, "PD%u structure located @%p.\n", nespd->pd_id, nespd);
+	return &nespd->ibpd;
+}
+
+
+/**
+ * nes_dealloc_pd
+ */
+static int nes_dealloc_pd(struct ib_pd *ibpd)
+{
+	struct nes_ucontext *nesucontext;
+	struct nes_pd *nespd = to_nespd(ibpd);
+	struct nes_vnic *nesvnic = to_nesvnic(ibpd->device);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+
+	if ((ibpd->uobject) && (ibpd->uobject->context)) {
+		nesucontext = to_nesucontext(ibpd->uobject->context);
+		nes_debug(NES_DBG_PD, "Clearing bit %u from allocated doorbells\n",
+				nespd->mmap_db_index);
+		clear_bit(nespd->mmap_db_index, nesucontext->allocated_doorbells);
+		nesucontext->mmap_db_index[nespd->mmap_db_index] = 0;
+		if (nesucontext->first_free_db > nespd->mmap_db_index) {
+			nesucontext->first_free_db = nespd->mmap_db_index;
+		}
+	}
+
+	nes_debug(NES_DBG_PD, "Deallocating PD%u structure located @%p.\n",
+			nespd->pd_id, nespd);
+	nes_free_resource(nesadapter, nesadapter->allocated_pds,
+			(nespd->pd_id-nesadapter->base_pd)>>(PAGE_SHIFT-12));
+	kfree(nespd);
+
+	return 0;
+}
+
+
+/**
+ * nes_create_ah
+ */
+static struct ib_ah *nes_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr)
+{
+	return ERR_PTR(-ENOSYS);
+}
+
+
+/**
+ * nes_destroy_ah
+ */
+static int nes_destroy_ah(struct ib_ah *ah)
+{
+	return -ENOSYS;
+}
+
+
+/**
+ * nes_get_encoded_size
+ */
+static inline u8 nes_get_encoded_size(int *size)
+{
+	u8 encoded_size = 0;
+	if (*size <= 32) {
+		*size = 32;
+		encoded_size = 1;
+	} else if (*size <= 128) {
+		*size = 128;
+		encoded_size = 2;
+	} else if (*size <= 512) {
+		*size = 512;
+		encoded_size = 3;
+	}
+	return (encoded_size);
+}
+
+
+
+/**
+ * nes_setup_virt_qp
+ */
+static int nes_setup_virt_qp(struct nes_qp *nesqp, struct nes_pbl *nespbl,
+		struct nes_vnic *nesvnic, int sq_size, int rq_size)
+{
+	unsigned long flags;
+	void *mem;
+	__le64 *pbl = NULL;
+	__le64 *tpbl;
+	__le64 *pblbuffer;
+	struct nes_device *nesdev = nesvnic->nesdev;
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	u32 pbl_entries;
+	u8 rq_pbl_entries;
+	u8 sq_pbl_entries;
+
+	pbl_entries = nespbl->pbl_size >> 3;
+	nes_debug(NES_DBG_QP, "Userspace PBL, pbl_size=%u, pbl_entries = %d pbl_vbase=%p, pbl_pbase=%lx\n",
+			nespbl->pbl_size, pbl_entries,
+			(void *)nespbl->pbl_vbase,
+			(unsigned long) nespbl->pbl_pbase);
+	pbl = (__le64 *) nespbl->pbl_vbase; /* points to first pbl entry */
+	/* now lets set the sq_vbase as well as rq_vbase addrs we will assign */
+	/* the first pbl to be fro the rq_vbase... */
+	rq_pbl_entries = (rq_size * sizeof(struct nes_hw_qp_wqe)) >> 12;
+	sq_pbl_entries = (sq_size * sizeof(struct nes_hw_qp_wqe)) >> 12;
+	nesqp->hwqp.sq_pbase = (le32_to_cpu(((__le32 *)pbl)[0])) | ((u64)((le32_to_cpu(((__le32 *)pbl)[1]))) << 32);
+	if (!nespbl->page) {
+		nes_debug(NES_DBG_QP, "QP nespbl->page is NULL \n");
+		kfree(nespbl);
+		return -ENOMEM;
+	}
+
+	nesqp->hwqp.sq_vbase = kmap(nespbl->page);
+	nesqp->page = nespbl->page;
+	if (!nesqp->hwqp.sq_vbase) {
+		nes_debug(NES_DBG_QP, "QP sq_vbase kmap failed\n");
+		kfree(nespbl);
+		return -ENOMEM;
+	}
+
+	/* Now to get to sq.. we need to calculate how many */
+	/* PBL entries were used by the rq.. */
+	pbl += sq_pbl_entries;
+	nesqp->hwqp.rq_pbase = (le32_to_cpu(((__le32 *)pbl)[0])) | ((u64)((le32_to_cpu(((__le32 *)pbl)[1]))) << 32);
+	/* nesqp->hwqp.rq_vbase = bus_to_virt(*pbl); */
+	/*nesqp->hwqp.rq_vbase = phys_to_virt(*pbl); */
+
+	nes_debug(NES_DBG_QP, "QP sq_vbase= %p sq_pbase=%lx rq_vbase=%p rq_pbase=%lx\n",
+		  nesqp->hwqp.sq_vbase, (unsigned long) nesqp->hwqp.sq_pbase,
+		  nesqp->hwqp.rq_vbase, (unsigned long) nesqp->hwqp.rq_pbase);
+	spin_lock_irqsave(&nesadapter->pbl_lock, flags);
+	if (!nesadapter->free_256pbl) {
+		pci_free_consistent(nesdev->pcidev, nespbl->pbl_size, nespbl->pbl_vbase,
+				nespbl->pbl_pbase);
+		spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+		kunmap(nesqp->page);
+		kfree(nespbl);
+		return -ENOMEM;
+	}
+	nesadapter->free_256pbl--;
+	spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+
+	nesqp->pbl_vbase = pci_alloc_consistent(nesdev->pcidev, 256, &nesqp->pbl_pbase);
+	pblbuffer = nesqp->pbl_vbase;
+	if (!nesqp->pbl_vbase) {
+		/* memory allocated during nes_reg_user_mr() */
+		pci_free_consistent(nesdev->pcidev, nespbl->pbl_size, nespbl->pbl_vbase,
+				    nespbl->pbl_pbase);
+		kfree(nespbl);
+		spin_lock_irqsave(&nesadapter->pbl_lock, flags);
+		nesadapter->free_256pbl++;
+		spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+		kunmap(nesqp->page);
+		return -ENOMEM;
+	}
+	memset(nesqp->pbl_vbase, 0, 256);
+	/* fill in the page address in the pbl buffer.. */
+	tpbl = pblbuffer + 16;
+	pbl = (__le64 *)nespbl->pbl_vbase;
+	while (sq_pbl_entries--)
+		*tpbl++ = *pbl++;
+	tpbl = pblbuffer;
+	while (rq_pbl_entries--)
+		*tpbl++ = *pbl++;
+
+	/* done with memory allocated during nes_reg_user_mr() */
+	pci_free_consistent(nesdev->pcidev, nespbl->pbl_size, nespbl->pbl_vbase,
+			    nespbl->pbl_pbase);
+	kfree(nespbl);
+
+	nesqp->qp_mem_size =
+			max((u32)sizeof(struct nes_qp_context), ((u32)256)) + 256;     /* this is Q2 */
+	/* Round up to a multiple of a page */
+	nesqp->qp_mem_size += PAGE_SIZE - 1;
+	nesqp->qp_mem_size &= ~(PAGE_SIZE - 1);
+
+	mem = pci_alloc_consistent(nesdev->pcidev, nesqp->qp_mem_size,
+			&nesqp->hwqp.q2_pbase);
+
+	if (!mem) {
+		pci_free_consistent(nesdev->pcidev, 256, nesqp->pbl_vbase, nesqp->pbl_pbase);
+		nesqp->pbl_vbase = NULL;
+		spin_lock_irqsave(&nesadapter->pbl_lock, flags);
+		nesadapter->free_256pbl++;
+		spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+		kunmap(nesqp->page);
+		return -ENOMEM;
+	}
+	nesqp->sq_kmapped = 1;
+	nesqp->hwqp.q2_vbase = mem;
+	mem += 256;
+	memset(nesqp->hwqp.q2_vbase, 0, 256);
+	nesqp->nesqp_context = mem;
+	memset(nesqp->nesqp_context, 0, sizeof(*nesqp->nesqp_context));
+	nesqp->nesqp_context_pbase = nesqp->hwqp.q2_pbase + 256;
+
+	return 0;
+}
+
+
+/**
+ * nes_setup_mmap_qp
+ */
+static int nes_setup_mmap_qp(struct nes_qp *nesqp, struct nes_vnic *nesvnic,
+		int sq_size, int rq_size)
+{
+	void *mem;
+	struct nes_device *nesdev = nesvnic->nesdev;
+
+	nesqp->qp_mem_size = (sizeof(struct nes_hw_qp_wqe) * sq_size) +
+			(sizeof(struct nes_hw_qp_wqe) * rq_size) +
+			max((u32)sizeof(struct nes_qp_context), ((u32)256)) +
+			256; /* this is Q2 */
+	/* Round up to a multiple of a page */
+	nesqp->qp_mem_size += PAGE_SIZE - 1;
+	nesqp->qp_mem_size &= ~(PAGE_SIZE - 1);
+
+	mem = pci_alloc_consistent(nesdev->pcidev, nesqp->qp_mem_size,
+			&nesqp->hwqp.sq_pbase);
+	if (!mem)
+		return -ENOMEM;
+	nes_debug(NES_DBG_QP, "PCI consistent memory for "
+			"host descriptor rings located @ %p (pa = 0x%08lX.) size = %u.\n",
+			mem, (unsigned long)nesqp->hwqp.sq_pbase, nesqp->qp_mem_size);
+
+	memset(mem, 0, nesqp->qp_mem_size);
+
+	nesqp->hwqp.sq_vbase = mem;
+	mem += sizeof(struct nes_hw_qp_wqe) * sq_size;
+
+	nesqp->hwqp.rq_vbase = mem;
+	nesqp->hwqp.rq_pbase = nesqp->hwqp.sq_pbase +
+			sizeof(struct nes_hw_qp_wqe) * sq_size;
+	mem += sizeof(struct nes_hw_qp_wqe) * rq_size;
+
+	nesqp->hwqp.q2_vbase = mem;
+	nesqp->hwqp.q2_pbase = nesqp->hwqp.rq_pbase +
+			sizeof(struct nes_hw_qp_wqe) * rq_size;
+	mem += 256;
+	memset(nesqp->hwqp.q2_vbase, 0, 256);
+
+	nesqp->nesqp_context = mem;
+	nesqp->nesqp_context_pbase = nesqp->hwqp.q2_pbase + 256;
+	memset(nesqp->nesqp_context, 0, sizeof(*nesqp->nesqp_context));
+	return 0;
+}
+
+
+/**
+ * nes_free_qp_mem() is to free up the qp's pci_alloc_consistent() memory.
+ */
+static inline void nes_free_qp_mem(struct nes_device *nesdev,
+		struct nes_qp *nesqp, int virt_wqs)
+{
+	unsigned long flags;
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	if (!virt_wqs) {
+		pci_free_consistent(nesdev->pcidev, nesqp->qp_mem_size,
+				nesqp->hwqp.sq_vbase, nesqp->hwqp.sq_pbase);
+	}else {
+		spin_lock_irqsave(&nesadapter->pbl_lock, flags);
+		nesadapter->free_256pbl++;
+		spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+		pci_free_consistent(nesdev->pcidev, nesqp->qp_mem_size, nesqp->hwqp.q2_vbase, nesqp->hwqp.q2_pbase);
+		pci_free_consistent(nesdev->pcidev, 256, nesqp->pbl_vbase, nesqp->pbl_pbase );
+		nesqp->pbl_vbase = NULL;
+		if (nesqp->sq_kmapped) {
+			nesqp->sq_kmapped = 0;
+			kunmap(nesqp->page);
+		}
+	}
+}
+
+
+/**
+ * nes_create_qp
+ */
+static struct ib_qp *nes_create_qp(struct ib_pd *ibpd,
+		struct ib_qp_init_attr *init_attr, struct ib_udata *udata)
+{
+	u64 u64temp= 0;
+	u64 u64nesqp = 0;
+	struct nes_pd *nespd = to_nespd(ibpd);
+	struct nes_vnic *nesvnic = to_nesvnic(ibpd->device);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	struct nes_qp *nesqp;
+	struct nes_cq *nescq;
+	struct nes_ucontext *nes_ucontext;
+	struct nes_hw_cqp_wqe *cqp_wqe;
+	struct nes_cqp_request *cqp_request;
+	struct nes_create_qp_req req;
+	struct nes_create_qp_resp uresp;
+	struct nes_pbl  *nespbl = NULL;
+	u32 qp_num = 0;
+	u32 opcode = 0;
+	/* u32 counter = 0; */
+	void *mem;
+	unsigned long flags;
+	int ret;
+	int err;
+	int virt_wqs = 0;
+	int sq_size;
+	int rq_size;
+	u8 sq_encoded_size;
+	u8 rq_encoded_size;
+	/* int counter; */
+
+	if (init_attr->create_flags)
+		return ERR_PTR(-EINVAL);
+
+	atomic_inc(&qps_created);
+	switch (init_attr->qp_type) {
+		case IB_QPT_RC:
+			if (nes_drv_opt & NES_DRV_OPT_NO_INLINE_DATA) {
+				init_attr->cap.max_inline_data = 0;
+			} else {
+				init_attr->cap.max_inline_data = 64;
+			}
+			sq_size = init_attr->cap.max_send_wr;
+			rq_size = init_attr->cap.max_recv_wr;
+
+			/* check if the encoded sizes are OK or not... */
+			sq_encoded_size = nes_get_encoded_size(&sq_size);
+			rq_encoded_size = nes_get_encoded_size(&rq_size);
+
+			if ((!sq_encoded_size) || (!rq_encoded_size)) {
+				nes_debug(NES_DBG_QP, "ERROR bad rq (%u) or sq (%u) size\n",
+						rq_size, sq_size);
+				return ERR_PTR(-EINVAL);
+			}
+
+			init_attr->cap.max_send_wr = sq_size -2;
+			init_attr->cap.max_recv_wr = rq_size -1;
+			nes_debug(NES_DBG_QP, "RQ size=%u, SQ Size=%u\n", rq_size, sq_size);
+
+			ret = nes_alloc_resource(nesadapter, nesadapter->allocated_qps,
+					nesadapter->max_qp, &qp_num, &nesadapter->next_qp, NES_RESOURCE_QP);
+			if (ret) {
+				return ERR_PTR(ret);
+			}
+
+			/* Need 512 (actually now 1024) byte alignment on this structure */
+			mem = kzalloc(sizeof(*nesqp)+NES_SW_CONTEXT_ALIGN-1, GFP_KERNEL);
+			if (!mem) {
+				nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num);
+				nes_debug(NES_DBG_QP, "Unable to allocate QP\n");
+				return ERR_PTR(-ENOMEM);
+			}
+			u64nesqp = (unsigned long)mem;
+			u64nesqp += ((u64)NES_SW_CONTEXT_ALIGN) - 1;
+			u64temp = ((u64)NES_SW_CONTEXT_ALIGN) - 1;
+			u64nesqp &= ~u64temp;
+			nesqp = (struct nes_qp *)(unsigned long)u64nesqp;
+			/* nes_debug(NES_DBG_QP, "nesqp=%p, allocated buffer=%p.  Rounded to closest %u\n",
+					nesqp, mem, NES_SW_CONTEXT_ALIGN); */
+			nesqp->allocated_buffer = mem;
+
+			if (udata) {
+				if (ib_copy_from_udata(&req, udata, sizeof(struct nes_create_qp_req))) {
+					nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num);
+					kfree(nesqp->allocated_buffer);
+					nes_debug(NES_DBG_QP, "ib_copy_from_udata() Failed \n");
+					return ERR_PTR(-EFAULT);
+				}
+				if (req.user_wqe_buffers) {
+					virt_wqs = 1;
+				}
+				if (req.user_qp_buffer)
+					nesqp->nesuqp_addr = req.user_qp_buffer;
+				if ((ibpd->uobject) && (ibpd->uobject->context)) {
+					nesqp->user_mode = 1;
+					nes_ucontext = to_nesucontext(ibpd->uobject->context);
+					if (virt_wqs) {
+						err = 1;
+						list_for_each_entry(nespbl, &nes_ucontext->qp_reg_mem_list, list) {
+							if (nespbl->user_base == (unsigned long )req.user_wqe_buffers) {
+								list_del(&nespbl->list);
+								err = 0;
+								nes_debug(NES_DBG_QP, "Found PBL for virtual QP. nespbl=%p. user_base=0x%lx\n",
+									  nespbl, nespbl->user_base);
+								break;
+							}
+						}
+						if (err) {
+							nes_debug(NES_DBG_QP, "Didn't Find PBL for virtual QP. address = %llx.\n",
+								  (long long unsigned int)req.user_wqe_buffers);
+							nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num);
+							kfree(nesqp->allocated_buffer);
+							return ERR_PTR(-EFAULT);
+						}
+					}
+
+					nes_ucontext = to_nesucontext(ibpd->uobject->context);
+					nesqp->mmap_sq_db_index =
+						find_next_zero_bit(nes_ucontext->allocated_wqs,
+								   NES_MAX_USER_WQ_REGIONS, nes_ucontext->first_free_wq);
+					/* nes_debug(NES_DBG_QP, "find_first_zero_biton wqs returned %u\n",
+							nespd->mmap_db_index); */
+					if (nesqp->mmap_sq_db_index >= NES_MAX_USER_WQ_REGIONS) {
+						nes_debug(NES_DBG_QP,
+							  "db index > max user regions, failing create QP\n");
+						nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num);
+						if (virt_wqs) {
+							pci_free_consistent(nesdev->pcidev, nespbl->pbl_size, nespbl->pbl_vbase,
+									    nespbl->pbl_pbase);
+							kfree(nespbl);
+						}
+						kfree(nesqp->allocated_buffer);
+						return ERR_PTR(-ENOMEM);
+					}
+					set_bit(nesqp->mmap_sq_db_index, nes_ucontext->allocated_wqs);
+					nes_ucontext->mmap_nesqp[nesqp->mmap_sq_db_index] = nesqp;
+					nes_ucontext->first_free_wq = nesqp->mmap_sq_db_index + 1;
+				} else {
+					nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num);
+					kfree(nesqp->allocated_buffer);
+					return ERR_PTR(-EFAULT);
+				}
+			}
+			err = (!virt_wqs) ? nes_setup_mmap_qp(nesqp, nesvnic, sq_size, rq_size) :
+					nes_setup_virt_qp(nesqp, nespbl, nesvnic, sq_size, rq_size);
+			if (err) {
+				nes_debug(NES_DBG_QP,
+					  "error geting qp mem code = %d\n", err);
+				nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num);
+				kfree(nesqp->allocated_buffer);
+				return ERR_PTR(-ENOMEM);
+			}
+
+			nesqp->hwqp.sq_size = sq_size;
+			nesqp->hwqp.sq_encoded_size = sq_encoded_size;
+			nesqp->hwqp.sq_head = 1;
+			nesqp->hwqp.rq_size = rq_size;
+			nesqp->hwqp.rq_encoded_size = rq_encoded_size;
+			/* nes_debug(NES_DBG_QP, "nesqp->nesqp_context_pbase = %p\n",
+					(void *)nesqp->nesqp_context_pbase);
+			*/
+			nesqp->hwqp.qp_id = qp_num;
+			nesqp->ibqp.qp_num = nesqp->hwqp.qp_id;
+			nesqp->nespd = nespd;
+
+			nescq = to_nescq(init_attr->send_cq);
+			nesqp->nesscq = nescq;
+			nescq = to_nescq(init_attr->recv_cq);
+			nesqp->nesrcq = nescq;
+
+			nesqp->nesqp_context->misc |= cpu_to_le32((u32)PCI_FUNC(nesdev->pcidev->devfn) <<
+					NES_QPCONTEXT_MISC_PCI_FCN_SHIFT);
+			nesqp->nesqp_context->misc |= cpu_to_le32((u32)nesqp->hwqp.rq_encoded_size <<
+					NES_QPCONTEXT_MISC_RQ_SIZE_SHIFT);
+			nesqp->nesqp_context->misc |= cpu_to_le32((u32)nesqp->hwqp.sq_encoded_size <<
+					NES_QPCONTEXT_MISC_SQ_SIZE_SHIFT);
+			if (!udata) {
+				nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_PRIV_EN);
+				nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_FAST_REGISTER_EN);
+			}
+			nesqp->nesqp_context->cqs = cpu_to_le32(nesqp->nesscq->hw_cq.cq_number +
+					((u32)nesqp->nesrcq->hw_cq.cq_number << 16));
+			u64temp = (u64)nesqp->hwqp.sq_pbase;
+			nesqp->nesqp_context->sq_addr_low = cpu_to_le32((u32)u64temp);
+			nesqp->nesqp_context->sq_addr_high = cpu_to_le32((u32)(u64temp >> 32));
+
+
+			if (!virt_wqs) {
+				u64temp = (u64)nesqp->hwqp.sq_pbase;
+				nesqp->nesqp_context->sq_addr_low = cpu_to_le32((u32)u64temp);
+				nesqp->nesqp_context->sq_addr_high = cpu_to_le32((u32)(u64temp >> 32));
+				u64temp = (u64)nesqp->hwqp.rq_pbase;
+				nesqp->nesqp_context->rq_addr_low = cpu_to_le32((u32)u64temp);
+				nesqp->nesqp_context->rq_addr_high = cpu_to_le32((u32)(u64temp >> 32));
+			} else {
+				u64temp = (u64)nesqp->pbl_pbase;
+				nesqp->nesqp_context->rq_addr_low = cpu_to_le32((u32)u64temp);
+				nesqp->nesqp_context->rq_addr_high = cpu_to_le32((u32)(u64temp >> 32));
+			}
+
+			/* nes_debug(NES_DBG_QP, "next_qp_nic_index=%u, using nic_index=%d\n",
+					nesvnic->next_qp_nic_index,
+					nesvnic->qp_nic_index[nesvnic->next_qp_nic_index]); */
+			spin_lock_irqsave(&nesdev->cqp.lock, flags);
+			nesqp->nesqp_context->misc2 |= cpu_to_le32(
+					(u32)nesvnic->qp_nic_index[nesvnic->next_qp_nic_index] <<
+					NES_QPCONTEXT_MISC2_NIC_INDEX_SHIFT);
+			nesvnic->next_qp_nic_index++;
+			if ((nesvnic->next_qp_nic_index > 3) ||
+					(nesvnic->qp_nic_index[nesvnic->next_qp_nic_index] == 0xf)) {
+				nesvnic->next_qp_nic_index = 0;
+			}
+			spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
+
+			nesqp->nesqp_context->pd_index_wscale |= cpu_to_le32((u32)nesqp->nespd->pd_id << 16);
+			u64temp = (u64)nesqp->hwqp.q2_pbase;
+			nesqp->nesqp_context->q2_addr_low = cpu_to_le32((u32)u64temp);
+			nesqp->nesqp_context->q2_addr_high = cpu_to_le32((u32)(u64temp >> 32));
+			nesqp->nesqp_context->aeq_token_low =  cpu_to_le32((u32)((unsigned long)(nesqp)));
+			nesqp->nesqp_context->aeq_token_high =  cpu_to_le32((u32)(upper_32_bits((unsigned long)(nesqp))));
+			nesqp->nesqp_context->ird_ord_sizes = cpu_to_le32(NES_QPCONTEXT_ORDIRD_ALSMM |
+					NES_QPCONTEXT_ORDIRD_AAH |
+					((((u32)nesadapter->max_irrq_wr) <<
+					NES_QPCONTEXT_ORDIRD_IRDSIZE_SHIFT) & NES_QPCONTEXT_ORDIRD_IRDSIZE_MASK));
+			if (disable_mpa_crc) {
+				nes_debug(NES_DBG_QP, "Disabling MPA crc checking due to module option.\n");
+				nesqp->nesqp_context->ird_ord_sizes |= cpu_to_le32(NES_QPCONTEXT_ORDIRD_RNMC);
+			}
+
+
+			/* Create the QP */
+			cqp_request = nes_get_cqp_request(nesdev);
+			if (cqp_request == NULL) {
+				nes_debug(NES_DBG_QP, "Failed to get a cqp_request\n");
+				nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num);
+				nes_free_qp_mem(nesdev, nesqp,virt_wqs);
+				kfree(nesqp->allocated_buffer);
+				return ERR_PTR(-ENOMEM);
+			}
+			cqp_request->waiting = 1;
+			cqp_wqe = &cqp_request->cqp_wqe;
+
+			if (!virt_wqs) {
+				opcode = NES_CQP_CREATE_QP | NES_CQP_QP_TYPE_IWARP |
+					NES_CQP_QP_IWARP_STATE_IDLE;
+			} else {
+				opcode = NES_CQP_CREATE_QP | NES_CQP_QP_TYPE_IWARP | NES_CQP_QP_VIRT_WQS |
+					NES_CQP_QP_IWARP_STATE_IDLE;
+			}
+			opcode |= NES_CQP_QP_CQS_VALID;
+			nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+			set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, opcode);
+			set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, nesqp->hwqp.qp_id);
+
+			u64temp = (u64)nesqp->nesqp_context_pbase;
+			set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, u64temp);
+
+			atomic_set(&cqp_request->refcount, 2);
+			nes_post_cqp_request(nesdev, cqp_request);
+
+			/* Wait for CQP */
+			nes_debug(NES_DBG_QP, "Waiting for create iWARP QP%u to complete.\n",
+					nesqp->hwqp.qp_id);
+			ret = wait_event_timeout(cqp_request->waitq,
+					(cqp_request->request_done != 0), NES_EVENT_TIMEOUT);
+			nes_debug(NES_DBG_QP, "Create iwarp QP%u completed, wait_event_timeout ret=%u,"
+					" nesdev->cqp_head = %u, nesdev->cqp.sq_tail = %u,"
+					" CQP Major:Minor codes = 0x%04X:0x%04X.\n",
+					nesqp->hwqp.qp_id, ret, nesdev->cqp.sq_head, nesdev->cqp.sq_tail,
+					cqp_request->major_code, cqp_request->minor_code);
+			if ((!ret) || (cqp_request->major_code)) {
+				nes_put_cqp_request(nesdev, cqp_request);
+				nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num);
+				nes_free_qp_mem(nesdev, nesqp,virt_wqs);
+				kfree(nesqp->allocated_buffer);
+				if (!ret) {
+					return ERR_PTR(-ETIME);
+				} else {
+					return ERR_PTR(-EIO);
+				}
+			}
+
+			nes_put_cqp_request(nesdev, cqp_request);
+
+			if (ibpd->uobject) {
+				uresp.mmap_sq_db_index = nesqp->mmap_sq_db_index;
+				uresp.mmap_rq_db_index = 0;
+				uresp.actual_sq_size = sq_size;
+				uresp.actual_rq_size = rq_size;
+				uresp.qp_id = nesqp->hwqp.qp_id;
+				uresp.nes_drv_opt = nes_drv_opt;
+				if (ib_copy_to_udata(udata, &uresp, sizeof uresp)) {
+					nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num);
+					nes_free_qp_mem(nesdev, nesqp,virt_wqs);
+					kfree(nesqp->allocated_buffer);
+					return ERR_PTR(-EFAULT);
+				}
+			}
+
+			nes_debug(NES_DBG_QP, "QP%u structure located @%p.Size = %u.\n",
+					nesqp->hwqp.qp_id, nesqp, (u32)sizeof(*nesqp));
+			spin_lock_init(&nesqp->lock);
+			nes_add_ref(&nesqp->ibqp);
+			break;
+		default:
+			nes_debug(NES_DBG_QP, "Invalid QP type: %d\n", init_attr->qp_type);
+			return ERR_PTR(-EINVAL);
+	}
+
+	nesqp->sig_all = (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR);
+	init_timer(&nesqp->terminate_timer);
+	nesqp->terminate_timer.function = nes_terminate_timeout;
+	nesqp->terminate_timer.data = (unsigned long)nesqp;
+
+	/* update the QP table */
+	nesdev->nesadapter->qp_table[nesqp->hwqp.qp_id-NES_FIRST_QPN] = nesqp;
+	nes_debug(NES_DBG_QP, "netdev refcnt=%u\n",
+			netdev_refcnt_read(nesvnic->netdev));
+
+	return &nesqp->ibqp;
+}
+
+/**
+ * nes_clean_cq
+ */
+static void nes_clean_cq(struct nes_qp *nesqp, struct nes_cq *nescq)
+{
+	u32 cq_head;
+	u32 lo;
+	u32 hi;
+	u64 u64temp;
+	unsigned long flags = 0;
+
+	spin_lock_irqsave(&nescq->lock, flags);
+
+	cq_head = nescq->hw_cq.cq_head;
+	while (le32_to_cpu(nescq->hw_cq.cq_vbase[cq_head].cqe_words[NES_CQE_OPCODE_IDX]) & NES_CQE_VALID) {
+		rmb();
+		lo = le32_to_cpu(nescq->hw_cq.cq_vbase[cq_head].cqe_words[NES_CQE_COMP_COMP_CTX_LOW_IDX]);
+		hi = le32_to_cpu(nescq->hw_cq.cq_vbase[cq_head].cqe_words[NES_CQE_COMP_COMP_CTX_HIGH_IDX]);
+		u64temp = (((u64)hi) << 32) | ((u64)lo);
+		u64temp &= ~(NES_SW_CONTEXT_ALIGN-1);
+		if (u64temp == (u64)(unsigned long)nesqp) {
+			/* Zero the context value so cqe will be ignored */
+			nescq->hw_cq.cq_vbase[cq_head].cqe_words[NES_CQE_COMP_COMP_CTX_LOW_IDX] = 0;
+			nescq->hw_cq.cq_vbase[cq_head].cqe_words[NES_CQE_COMP_COMP_CTX_HIGH_IDX] = 0;
+		}
+
+		if (++cq_head >= nescq->hw_cq.cq_size)
+			cq_head = 0;
+	}
+
+	spin_unlock_irqrestore(&nescq->lock, flags);
+}
+
+
+/**
+ * nes_destroy_qp
+ */
+static int nes_destroy_qp(struct ib_qp *ibqp)
+{
+	struct nes_qp *nesqp = to_nesqp(ibqp);
+	struct nes_ucontext *nes_ucontext;
+	struct ib_qp_attr attr;
+	struct iw_cm_id *cm_id;
+	struct iw_cm_event cm_event;
+	int ret = 0;
+
+	atomic_inc(&sw_qps_destroyed);
+	nesqp->destroyed = 1;
+
+	/* Blow away the connection if it exists. */
+	if (nesqp->ibqp_state >= IB_QPS_INIT && nesqp->ibqp_state <= IB_QPS_RTS) {
+		/* if (nesqp->ibqp_state == IB_QPS_RTS) { */
+		attr.qp_state = IB_QPS_ERR;
+		nes_modify_qp(&nesqp->ibqp, &attr, IB_QP_STATE, NULL);
+	}
+
+	if (((nesqp->ibqp_state == IB_QPS_INIT) ||
+			(nesqp->ibqp_state == IB_QPS_RTR)) && (nesqp->cm_id)) {
+		cm_id = nesqp->cm_id;
+		cm_event.event = IW_CM_EVENT_CONNECT_REPLY;
+		cm_event.status = -ETIMEDOUT;
+		cm_event.local_addr = cm_id->local_addr;
+		cm_event.remote_addr = cm_id->remote_addr;
+		cm_event.private_data = NULL;
+		cm_event.private_data_len = 0;
+
+		nes_debug(NES_DBG_QP, "Generating a CM Timeout Event for "
+				"QP%u. cm_id = %p, refcount = %u. \n",
+				nesqp->hwqp.qp_id, cm_id, atomic_read(&nesqp->refcount));
+
+		cm_id->rem_ref(cm_id);
+		ret = cm_id->event_handler(cm_id, &cm_event);
+		if (ret)
+			nes_debug(NES_DBG_QP, "OFA CM event_handler returned, ret=%d\n", ret);
+	}
+
+	if (nesqp->user_mode) {
+		if ((ibqp->uobject)&&(ibqp->uobject->context)) {
+			nes_ucontext = to_nesucontext(ibqp->uobject->context);
+			clear_bit(nesqp->mmap_sq_db_index, nes_ucontext->allocated_wqs);
+			nes_ucontext->mmap_nesqp[nesqp->mmap_sq_db_index] = NULL;
+			if (nes_ucontext->first_free_wq > nesqp->mmap_sq_db_index) {
+				nes_ucontext->first_free_wq = nesqp->mmap_sq_db_index;
+			}
+		}
+		if (nesqp->pbl_pbase && nesqp->sq_kmapped) {
+			nesqp->sq_kmapped = 0;
+			kunmap(nesqp->page);
+		}
+	} else {
+		/* Clean any pending completions from the cq(s) */
+		if (nesqp->nesscq)
+			nes_clean_cq(nesqp, nesqp->nesscq);
+
+		if ((nesqp->nesrcq) && (nesqp->nesrcq != nesqp->nesscq))
+			nes_clean_cq(nesqp, nesqp->nesrcq);
+	}
+	nes_rem_ref(&nesqp->ibqp);
+	return 0;
+}
+
+
+/**
+ * nes_create_cq
+ */
+static struct ib_cq *nes_create_cq(struct ib_device *ibdev, int entries,
+		int comp_vector,
+		struct ib_ucontext *context, struct ib_udata *udata)
+{
+	u64 u64temp;
+	struct nes_vnic *nesvnic = to_nesvnic(ibdev);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	struct nes_cq *nescq;
+	struct nes_ucontext *nes_ucontext = NULL;
+	struct nes_cqp_request *cqp_request;
+	void *mem = NULL;
+	struct nes_hw_cqp_wqe *cqp_wqe;
+	struct nes_pbl *nespbl = NULL;
+	struct nes_create_cq_req req;
+	struct nes_create_cq_resp resp;
+	u32 cq_num = 0;
+	u32 opcode = 0;
+	u32 pbl_entries = 1;
+	int err;
+	unsigned long flags;
+	int ret;
+
+	if (entries > nesadapter->max_cqe)
+		return ERR_PTR(-EINVAL);
+
+	err = nes_alloc_resource(nesadapter, nesadapter->allocated_cqs,
+			nesadapter->max_cq, &cq_num, &nesadapter->next_cq, NES_RESOURCE_CQ);
+	if (err) {
+		return ERR_PTR(err);
+	}
+
+	nescq = kzalloc(sizeof(struct nes_cq), GFP_KERNEL);
+	if (!nescq) {
+		nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num);
+		nes_debug(NES_DBG_CQ, "Unable to allocate nes_cq struct\n");
+		return ERR_PTR(-ENOMEM);
+	}
+
+	nescq->hw_cq.cq_size = max(entries + 1, 5);
+	nescq->hw_cq.cq_number = cq_num;
+	nescq->ibcq.cqe = nescq->hw_cq.cq_size - 1;
+
+
+	if (context) {
+		nes_ucontext = to_nesucontext(context);
+		if (ib_copy_from_udata(&req, udata, sizeof (struct nes_create_cq_req))) {
+			nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num);
+			kfree(nescq);
+			return ERR_PTR(-EFAULT);
+		}
+		nesvnic->mcrq_ucontext = nes_ucontext;
+		nes_ucontext->mcrqf = req.mcrqf;
+		if (nes_ucontext->mcrqf) {
+			if (nes_ucontext->mcrqf & 0x80000000)
+				nescq->hw_cq.cq_number = nesvnic->nic.qp_id + 28 + 2 * ((nes_ucontext->mcrqf & 0xf) - 1);
+			else if (nes_ucontext->mcrqf & 0x40000000)
+				nescq->hw_cq.cq_number = nes_ucontext->mcrqf & 0xffff;
+			else
+				nescq->hw_cq.cq_number = nesvnic->mcrq_qp_id + nes_ucontext->mcrqf-1;
+			nescq->mcrqf = nes_ucontext->mcrqf;
+			nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num);
+		}
+		nes_debug(NES_DBG_CQ, "CQ Virtual Address = %08lX, size = %u.\n",
+				(unsigned long)req.user_cq_buffer, entries);
+		err = 1;
+		list_for_each_entry(nespbl, &nes_ucontext->cq_reg_mem_list, list) {
+			if (nespbl->user_base == (unsigned long )req.user_cq_buffer) {
+				list_del(&nespbl->list);
+				err = 0;
+				nes_debug(NES_DBG_CQ, "Found PBL for virtual CQ. nespbl=%p.\n",
+						nespbl);
+				break;
+			}
+		}
+		if (err) {
+			nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num);
+			kfree(nescq);
+			return ERR_PTR(-EFAULT);
+		}
+
+		pbl_entries = nespbl->pbl_size >> 3;
+		nescq->cq_mem_size = 0;
+	} else {
+		nescq->cq_mem_size = nescq->hw_cq.cq_size * sizeof(struct nes_hw_cqe);
+		nes_debug(NES_DBG_CQ, "Attempting to allocate pci memory (%u entries, %u bytes) for CQ%u.\n",
+				entries, nescq->cq_mem_size, nescq->hw_cq.cq_number);
+
+		/* allocate the physical buffer space */
+		mem = pci_zalloc_consistent(nesdev->pcidev, nescq->cq_mem_size,
+					    &nescq->hw_cq.cq_pbase);
+		if (!mem) {
+			printk(KERN_ERR PFX "Unable to allocate pci memory for cq\n");
+			nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num);
+			kfree(nescq);
+			return ERR_PTR(-ENOMEM);
+		}
+
+		nescq->hw_cq.cq_vbase = mem;
+		nescq->hw_cq.cq_head = 0;
+		nes_debug(NES_DBG_CQ, "CQ%u virtual address @ %p, phys = 0x%08X\n",
+				nescq->hw_cq.cq_number, nescq->hw_cq.cq_vbase,
+				(u32)nescq->hw_cq.cq_pbase);
+	}
+
+	nescq->hw_cq.ce_handler = nes_iwarp_ce_handler;
+	spin_lock_init(&nescq->lock);
+
+	/* send CreateCQ request to CQP */
+	cqp_request = nes_get_cqp_request(nesdev);
+	if (cqp_request == NULL) {
+		nes_debug(NES_DBG_CQ, "Failed to get a cqp_request.\n");
+		if (!context)
+			pci_free_consistent(nesdev->pcidev, nescq->cq_mem_size, mem,
+					nescq->hw_cq.cq_pbase);
+		else {
+			pci_free_consistent(nesdev->pcidev, nespbl->pbl_size,
+					    nespbl->pbl_vbase, nespbl->pbl_pbase);
+			kfree(nespbl);
+		}
+
+		nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num);
+		kfree(nescq);
+		return ERR_PTR(-ENOMEM);
+	}
+	cqp_request->waiting = 1;
+	cqp_wqe = &cqp_request->cqp_wqe;
+
+	opcode = NES_CQP_CREATE_CQ | NES_CQP_CQ_CEQ_VALID |
+			NES_CQP_CQ_CHK_OVERFLOW |
+			NES_CQP_CQ_CEQE_MASK | ((u32)nescq->hw_cq.cq_size << 16);
+
+	spin_lock_irqsave(&nesadapter->pbl_lock, flags);
+
+	if (pbl_entries != 1) {
+		if (pbl_entries > 32) {
+			/* use 4k pbl */
+			nes_debug(NES_DBG_CQ, "pbl_entries=%u, use a 4k PBL\n", pbl_entries);
+			if (nesadapter->free_4kpbl == 0) {
+				spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+				nes_free_cqp_request(nesdev, cqp_request);
+				if (!context)
+					pci_free_consistent(nesdev->pcidev, nescq->cq_mem_size, mem,
+							nescq->hw_cq.cq_pbase);
+				else {
+					pci_free_consistent(nesdev->pcidev, nespbl->pbl_size,
+							    nespbl->pbl_vbase, nespbl->pbl_pbase);
+					kfree(nespbl);
+				}
+				nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num);
+				kfree(nescq);
+				return ERR_PTR(-ENOMEM);
+			} else {
+				opcode |= (NES_CQP_CQ_VIRT | NES_CQP_CQ_4KB_CHUNK);
+				nescq->virtual_cq = 2;
+				nesadapter->free_4kpbl--;
+			}
+		} else {
+			/* use 256 byte pbl */
+			nes_debug(NES_DBG_CQ, "pbl_entries=%u, use a 256 byte PBL\n", pbl_entries);
+			if (nesadapter->free_256pbl == 0) {
+				spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+				nes_free_cqp_request(nesdev, cqp_request);
+				if (!context)
+					pci_free_consistent(nesdev->pcidev, nescq->cq_mem_size, mem,
+							nescq->hw_cq.cq_pbase);
+				else {
+					pci_free_consistent(nesdev->pcidev, nespbl->pbl_size,
+							    nespbl->pbl_vbase, nespbl->pbl_pbase);
+					kfree(nespbl);
+				}
+				nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num);
+				kfree(nescq);
+				return ERR_PTR(-ENOMEM);
+			} else {
+				opcode |= NES_CQP_CQ_VIRT;
+				nescq->virtual_cq = 1;
+				nesadapter->free_256pbl--;
+			}
+		}
+	}
+
+	spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+
+	nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, opcode);
+	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX,
+			(nescq->hw_cq.cq_number | ((u32)nesdev->ceq_index << 16)));
+
+	if (context) {
+		if (pbl_entries != 1)
+			u64temp = (u64)nespbl->pbl_pbase;
+		else
+			u64temp	= le64_to_cpu(nespbl->pbl_vbase[0]);
+		set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_DOORBELL_INDEX_HIGH_IDX,
+				nes_ucontext->mmap_db_index[0]);
+	} else {
+		u64temp = (u64)nescq->hw_cq.cq_pbase;
+		cqp_wqe->wqe_words[NES_CQP_CQ_WQE_DOORBELL_INDEX_HIGH_IDX] = 0;
+	}
+	set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_PBL_LOW_IDX, u64temp);
+	cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] = 0;
+	u64temp = (u64)(unsigned long)&nescq->hw_cq;
+	cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_LOW_IDX] =
+			cpu_to_le32((u32)(u64temp >> 1));
+	cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] =
+			cpu_to_le32(((u32)((u64temp) >> 33)) & 0x7FFFFFFF);
+
+	atomic_set(&cqp_request->refcount, 2);
+	nes_post_cqp_request(nesdev, cqp_request);
+
+	/* Wait for CQP */
+	nes_debug(NES_DBG_CQ, "Waiting for create iWARP CQ%u to complete.\n",
+			nescq->hw_cq.cq_number);
+	ret = wait_event_timeout(cqp_request->waitq, (0 != cqp_request->request_done),
+			NES_EVENT_TIMEOUT * 2);
+	nes_debug(NES_DBG_CQ, "Create iWARP CQ%u completed, wait_event_timeout ret = %d.\n",
+			nescq->hw_cq.cq_number, ret);
+	if ((!ret) || (cqp_request->major_code)) {
+		nes_put_cqp_request(nesdev, cqp_request);
+		if (!context)
+			pci_free_consistent(nesdev->pcidev, nescq->cq_mem_size, mem,
+					nescq->hw_cq.cq_pbase);
+		else {
+			pci_free_consistent(nesdev->pcidev, nespbl->pbl_size,
+					    nespbl->pbl_vbase, nespbl->pbl_pbase);
+			kfree(nespbl);
+		}
+		nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num);
+		kfree(nescq);
+		return ERR_PTR(-EIO);
+	}
+	nes_put_cqp_request(nesdev, cqp_request);
+
+	if (context) {
+		/* free the nespbl */
+		pci_free_consistent(nesdev->pcidev, nespbl->pbl_size, nespbl->pbl_vbase,
+				nespbl->pbl_pbase);
+		kfree(nespbl);
+		resp.cq_id = nescq->hw_cq.cq_number;
+		resp.cq_size = nescq->hw_cq.cq_size;
+		resp.mmap_db_index = 0;
+		if (ib_copy_to_udata(udata, &resp, sizeof resp - sizeof resp.reserved)) {
+			nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num);
+			kfree(nescq);
+			return ERR_PTR(-EFAULT);
+		}
+	}
+
+	return &nescq->ibcq;
+}
+
+
+/**
+ * nes_destroy_cq
+ */
+static int nes_destroy_cq(struct ib_cq *ib_cq)
+{
+	struct nes_cq *nescq;
+	struct nes_device *nesdev;
+	struct nes_vnic *nesvnic;
+	struct nes_adapter *nesadapter;
+	struct nes_hw_cqp_wqe *cqp_wqe;
+	struct nes_cqp_request *cqp_request;
+	unsigned long flags;
+	u32 opcode = 0;
+	int ret;
+
+	if (ib_cq == NULL)
+		return 0;
+
+	nescq = to_nescq(ib_cq);
+	nesvnic = to_nesvnic(ib_cq->device);
+	nesdev = nesvnic->nesdev;
+	nesadapter = nesdev->nesadapter;
+
+	nes_debug(NES_DBG_CQ, "Destroy CQ%u\n", nescq->hw_cq.cq_number);
+
+	/* Send DestroyCQ request to CQP */
+	cqp_request = nes_get_cqp_request(nesdev);
+	if (cqp_request == NULL) {
+		nes_debug(NES_DBG_CQ, "Failed to get a cqp_request.\n");
+		return -ENOMEM;
+	}
+	cqp_request->waiting = 1;
+	cqp_wqe = &cqp_request->cqp_wqe;
+	opcode = NES_CQP_DESTROY_CQ | (nescq->hw_cq.cq_size << 16);
+	spin_lock_irqsave(&nesadapter->pbl_lock, flags);
+	if (nescq->virtual_cq == 1) {
+		nesadapter->free_256pbl++;
+		if (nesadapter->free_256pbl > nesadapter->max_256pbl) {
+			printk(KERN_ERR PFX "%s: free 256B PBLs(%u) has exceeded the max(%u)\n",
+					__func__, nesadapter->free_256pbl, nesadapter->max_256pbl);
+		}
+	} else if (nescq->virtual_cq == 2) {
+		nesadapter->free_4kpbl++;
+		if (nesadapter->free_4kpbl > nesadapter->max_4kpbl) {
+			printk(KERN_ERR PFX "%s: free 4K PBLs(%u) has exceeded the max(%u)\n",
+					__func__, nesadapter->free_4kpbl, nesadapter->max_4kpbl);
+		}
+		opcode |= NES_CQP_CQ_4KB_CHUNK;
+	}
+
+	spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+
+	nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, opcode);
+	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX,
+		(nescq->hw_cq.cq_number | ((u32)PCI_FUNC(nesdev->pcidev->devfn) << 16)));
+	if (!nescq->mcrqf)
+		nes_free_resource(nesadapter, nesadapter->allocated_cqs, nescq->hw_cq.cq_number);
+
+	atomic_set(&cqp_request->refcount, 2);
+	nes_post_cqp_request(nesdev, cqp_request);
+
+	/* Wait for CQP */
+	nes_debug(NES_DBG_CQ, "Waiting for destroy iWARP CQ%u to complete.\n",
+			nescq->hw_cq.cq_number);
+	ret = wait_event_timeout(cqp_request->waitq, (0 != cqp_request->request_done),
+			NES_EVENT_TIMEOUT);
+	nes_debug(NES_DBG_CQ, "Destroy iWARP CQ%u completed, wait_event_timeout ret = %u,"
+			" CQP Major:Minor codes = 0x%04X:0x%04X.\n",
+			nescq->hw_cq.cq_number, ret, cqp_request->major_code,
+			cqp_request->minor_code);
+	if (!ret) {
+		nes_debug(NES_DBG_CQ, "iWARP CQ%u destroy timeout expired\n",
+					nescq->hw_cq.cq_number);
+		ret = -ETIME;
+	} else if (cqp_request->major_code) {
+		nes_debug(NES_DBG_CQ, "iWARP CQ%u destroy failed\n",
+					nescq->hw_cq.cq_number);
+		ret = -EIO;
+	} else {
+		ret = 0;
+	}
+	nes_put_cqp_request(nesdev, cqp_request);
+
+	if (nescq->cq_mem_size)
+		pci_free_consistent(nesdev->pcidev, nescq->cq_mem_size,
+				    nescq->hw_cq.cq_vbase, nescq->hw_cq.cq_pbase);
+	kfree(nescq);
+
+	return ret;
+}
+
+/**
+ * root_256
+ */
+static u32 root_256(struct nes_device *nesdev,
+		    struct nes_root_vpbl *root_vpbl,
+		    struct nes_root_vpbl *new_root,
+		    u16 pbl_count_4k)
+{
+	u64 leaf_pbl;
+	int i, j, k;
+
+	if (pbl_count_4k == 1) {
+		new_root->pbl_vbase = pci_alloc_consistent(nesdev->pcidev,
+						512, &new_root->pbl_pbase);
+
+		if (new_root->pbl_vbase == NULL)
+			return 0;
+
+		leaf_pbl = (u64)root_vpbl->pbl_pbase;
+		for (i = 0; i < 16; i++) {
+			new_root->pbl_vbase[i].pa_low =
+				cpu_to_le32((u32)leaf_pbl);
+			new_root->pbl_vbase[i].pa_high =
+				cpu_to_le32((u32)((((u64)leaf_pbl) >> 32)));
+			leaf_pbl += 256;
+		}
+	} else {
+		for (i = 3; i >= 0; i--) {
+			j = i * 16;
+			root_vpbl->pbl_vbase[j] = root_vpbl->pbl_vbase[i];
+			leaf_pbl = le32_to_cpu(root_vpbl->pbl_vbase[j].pa_low) +
+			    (((u64)le32_to_cpu(root_vpbl->pbl_vbase[j].pa_high))
+				<< 32);
+			for (k = 1; k < 16; k++) {
+				leaf_pbl += 256;
+				root_vpbl->pbl_vbase[j + k].pa_low =
+						cpu_to_le32((u32)leaf_pbl);
+				root_vpbl->pbl_vbase[j + k].pa_high =
+				    cpu_to_le32((u32)((((u64)leaf_pbl) >> 32)));
+			}
+		}
+	}
+
+	return 1;
+}
+
+
+/**
+ * nes_reg_mr
+ */
+static int nes_reg_mr(struct nes_device *nesdev, struct nes_pd *nespd,
+		u32 stag, u64 region_length, struct nes_root_vpbl *root_vpbl,
+		dma_addr_t single_buffer, u16 pbl_count_4k,
+		u16 residual_page_count_4k, int acc, u64 *iova_start,
+		u16 *actual_pbl_cnt, u8 *used_4k_pbls)
+{
+	struct nes_hw_cqp_wqe *cqp_wqe;
+	struct nes_cqp_request *cqp_request;
+	unsigned long flags;
+	int ret;
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	uint pg_cnt = 0;
+	u16 pbl_count_256 = 0;
+	u16 pbl_count = 0;
+	u8  use_256_pbls = 0;
+	u8  use_4k_pbls = 0;
+	u16 use_two_level = (pbl_count_4k > 1) ? 1 : 0;
+	struct nes_root_vpbl new_root = { 0, NULL, NULL };
+	u32 opcode = 0;
+	u16 major_code;
+
+	/* Register the region with the adapter */
+	cqp_request = nes_get_cqp_request(nesdev);
+	if (cqp_request == NULL) {
+		nes_debug(NES_DBG_MR, "Failed to get a cqp_request.\n");
+		return -ENOMEM;
+	}
+	cqp_request->waiting = 1;
+	cqp_wqe = &cqp_request->cqp_wqe;
+
+	if (pbl_count_4k) {
+		spin_lock_irqsave(&nesadapter->pbl_lock, flags);
+
+		pg_cnt = ((pbl_count_4k - 1) * 512) + residual_page_count_4k;
+		pbl_count_256 = (pg_cnt + 31) / 32;
+		if (pg_cnt <= 32) {
+			if (pbl_count_256 <= nesadapter->free_256pbl)
+				use_256_pbls = 1;
+			else if (pbl_count_4k <= nesadapter->free_4kpbl)
+				use_4k_pbls = 1;
+		} else if (pg_cnt <= 2048) {
+			if (((pbl_count_4k + use_two_level) <= nesadapter->free_4kpbl) &&
+			    (nesadapter->free_4kpbl > (nesadapter->max_4kpbl >> 1))) {
+				use_4k_pbls = 1;
+			} else if ((pbl_count_256 + 1) <= nesadapter->free_256pbl) {
+				use_256_pbls = 1;
+				use_two_level = 1;
+			} else if ((pbl_count_4k + use_two_level) <= nesadapter->free_4kpbl) {
+				use_4k_pbls = 1;
+			}
+		} else {
+			if ((pbl_count_4k + 1) <= nesadapter->free_4kpbl)
+				use_4k_pbls = 1;
+		}
+
+		if (use_256_pbls) {
+			pbl_count = pbl_count_256;
+			nesadapter->free_256pbl -= pbl_count + use_two_level;
+		} else if (use_4k_pbls) {
+			pbl_count =  pbl_count_4k;
+			nesadapter->free_4kpbl -= pbl_count + use_two_level;
+		} else {
+			spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+			nes_debug(NES_DBG_MR, "Out of Pbls\n");
+			nes_free_cqp_request(nesdev, cqp_request);
+			return -ENOMEM;
+		}
+
+		spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+	}
+
+	if (use_256_pbls && use_two_level) {
+		if (root_256(nesdev, root_vpbl, &new_root, pbl_count_4k) == 1) {
+			if (new_root.pbl_pbase != 0)
+				root_vpbl = &new_root;
+		} else {
+			spin_lock_irqsave(&nesadapter->pbl_lock, flags);
+			nesadapter->free_256pbl += pbl_count_256 + use_two_level;
+			use_256_pbls = 0;
+
+			if (pbl_count_4k == 1)
+				use_two_level = 0;
+			pbl_count = pbl_count_4k;
+
+			if ((pbl_count_4k + use_two_level) <= nesadapter->free_4kpbl) {
+				nesadapter->free_4kpbl -= pbl_count + use_two_level;
+				use_4k_pbls = 1;
+			}
+			spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+
+			if (use_4k_pbls == 0)
+				return -ENOMEM;
+		}
+	}
+
+	opcode = NES_CQP_REGISTER_STAG | NES_CQP_STAG_RIGHTS_LOCAL_READ |
+					NES_CQP_STAG_VA_TO | NES_CQP_STAG_MR;
+	if (acc & IB_ACCESS_LOCAL_WRITE)
+		opcode |= NES_CQP_STAG_RIGHTS_LOCAL_WRITE;
+	if (acc & IB_ACCESS_REMOTE_WRITE)
+		opcode |= NES_CQP_STAG_RIGHTS_REMOTE_WRITE | NES_CQP_STAG_REM_ACC_EN;
+	if (acc & IB_ACCESS_REMOTE_READ)
+		opcode |= NES_CQP_STAG_RIGHTS_REMOTE_READ | NES_CQP_STAG_REM_ACC_EN;
+	if (acc & IB_ACCESS_MW_BIND)
+		opcode |= NES_CQP_STAG_RIGHTS_WINDOW_BIND | NES_CQP_STAG_REM_ACC_EN;
+
+	nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, opcode);
+	set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_VA_LOW_IDX, *iova_start);
+	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_LEN_LOW_IDX, region_length);
+
+	cqp_wqe->wqe_words[NES_CQP_STAG_WQE_LEN_HIGH_PD_IDX] =
+			cpu_to_le32((u32)(region_length >> 8) & 0xff000000);
+	cqp_wqe->wqe_words[NES_CQP_STAG_WQE_LEN_HIGH_PD_IDX] |=
+			cpu_to_le32(nespd->pd_id & 0x00007fff);
+	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_STAG_IDX, stag);
+
+	if (pbl_count == 0) {
+		set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PA_LOW_IDX, single_buffer);
+	} else {
+		set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PA_LOW_IDX, root_vpbl->pbl_pbase);
+		set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PBL_BLK_COUNT_IDX, pbl_count);
+		set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PBL_LEN_IDX, (pg_cnt * 8));
+
+		if (use_4k_pbls)
+			cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] |= cpu_to_le32(NES_CQP_STAG_PBL_BLK_SIZE);
+	}
+	barrier();
+
+	atomic_set(&cqp_request->refcount, 2);
+	nes_post_cqp_request(nesdev, cqp_request);
+
+	/* Wait for CQP */
+	ret = wait_event_timeout(cqp_request->waitq, (0 != cqp_request->request_done),
+			NES_EVENT_TIMEOUT);
+	nes_debug(NES_DBG_MR, "Register STag 0x%08X completed, wait_event_timeout ret = %u,"
+			" CQP Major:Minor codes = 0x%04X:0x%04X.\n",
+			stag, ret, cqp_request->major_code, cqp_request->minor_code);
+	major_code = cqp_request->major_code;
+	nes_put_cqp_request(nesdev, cqp_request);
+
+	if ((!ret || major_code) && pbl_count != 0) {
+		spin_lock_irqsave(&nesadapter->pbl_lock, flags);
+		if (use_256_pbls)
+			nesadapter->free_256pbl += pbl_count + use_two_level;
+		else if (use_4k_pbls)
+			nesadapter->free_4kpbl += pbl_count + use_two_level;
+		spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+	}
+	if (new_root.pbl_pbase)
+		pci_free_consistent(nesdev->pcidev, 512, new_root.pbl_vbase,
+				    new_root.pbl_pbase);
+
+	if (!ret)
+		return -ETIME;
+	else if (major_code)
+		return -EIO;
+
+	*actual_pbl_cnt = pbl_count + use_two_level;
+	*used_4k_pbls = use_4k_pbls;
+	return 0;
+}
+
+
+/**
+ * nes_reg_phys_mr
+ */
+static struct ib_mr *nes_reg_phys_mr(struct ib_pd *ib_pd,
+		struct ib_phys_buf *buffer_list, int num_phys_buf, int acc,
+		u64 * iova_start)
+{
+	u64 region_length;
+	struct nes_pd *nespd = to_nespd(ib_pd);
+	struct nes_vnic *nesvnic = to_nesvnic(ib_pd->device);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	struct nes_mr *nesmr;
+	struct ib_mr *ibmr;
+	struct nes_vpbl vpbl;
+	struct nes_root_vpbl root_vpbl;
+	u32 stag;
+	u32 i;
+	unsigned long mask;
+	u32 stag_index = 0;
+	u32 next_stag_index = 0;
+	u32 driver_key = 0;
+	u32 root_pbl_index = 0;
+	u32 cur_pbl_index = 0;
+	int err = 0;
+	int ret = 0;
+	u16 pbl_count = 0;
+	u8 single_page = 1;
+	u8 stag_key = 0;
+
+	region_length = 0;
+	vpbl.pbl_vbase = NULL;
+	root_vpbl.pbl_vbase = NULL;
+	root_vpbl.pbl_pbase = 0;
+
+	get_random_bytes(&next_stag_index, sizeof(next_stag_index));
+	stag_key = (u8)next_stag_index;
+
+	driver_key = 0;
+
+	next_stag_index >>= 8;
+	next_stag_index %= nesadapter->max_mr;
+	if (num_phys_buf > (1024*512)) {
+		return ERR_PTR(-E2BIG);
+	}
+
+	if ((buffer_list[0].addr ^ *iova_start) & ~PAGE_MASK)
+		return ERR_PTR(-EINVAL);
+
+	err = nes_alloc_resource(nesadapter, nesadapter->allocated_mrs, nesadapter->max_mr,
+			&stag_index, &next_stag_index, NES_RESOURCE_PHYS_MR);
+	if (err) {
+		return ERR_PTR(err);
+	}
+
+	nesmr = kzalloc(sizeof(*nesmr), GFP_KERNEL);
+	if (!nesmr) {
+		nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	for (i = 0; i < num_phys_buf; i++) {
+
+		if ((i & 0x01FF) == 0) {
+			if (root_pbl_index == 1) {
+				/* Allocate the root PBL */
+				root_vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev, 8192,
+						&root_vpbl.pbl_pbase);
+				nes_debug(NES_DBG_MR, "Allocating root PBL, va = %p, pa = 0x%08X\n",
+						root_vpbl.pbl_vbase, (unsigned int)root_vpbl.pbl_pbase);
+				if (!root_vpbl.pbl_vbase) {
+					pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase,
+							vpbl.pbl_pbase);
+					nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
+					kfree(nesmr);
+					return ERR_PTR(-ENOMEM);
+				}
+				root_vpbl.leaf_vpbl = kzalloc(sizeof(*root_vpbl.leaf_vpbl)*1024, GFP_KERNEL);
+				if (!root_vpbl.leaf_vpbl) {
+					pci_free_consistent(nesdev->pcidev, 8192, root_vpbl.pbl_vbase,
+							root_vpbl.pbl_pbase);
+					pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase,
+							vpbl.pbl_pbase);
+					nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
+					kfree(nesmr);
+					return ERR_PTR(-ENOMEM);
+				}
+				root_vpbl.pbl_vbase[0].pa_low = cpu_to_le32((u32)vpbl.pbl_pbase);
+				root_vpbl.pbl_vbase[0].pa_high =
+						cpu_to_le32((u32)((((u64)vpbl.pbl_pbase) >> 32)));
+				root_vpbl.leaf_vpbl[0] = vpbl;
+			}
+			/* Allocate a 4K buffer for the PBL */
+			vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev, 4096,
+					&vpbl.pbl_pbase);
+			nes_debug(NES_DBG_MR, "Allocating leaf PBL, va = %p, pa = 0x%016lX\n",
+					vpbl.pbl_vbase, (unsigned long)vpbl.pbl_pbase);
+			if (!vpbl.pbl_vbase) {
+				nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
+				ibmr = ERR_PTR(-ENOMEM);
+				kfree(nesmr);
+				goto reg_phys_err;
+			}
+			/* Fill in the root table */
+			if (1 <= root_pbl_index) {
+				root_vpbl.pbl_vbase[root_pbl_index].pa_low =
+						cpu_to_le32((u32)vpbl.pbl_pbase);
+				root_vpbl.pbl_vbase[root_pbl_index].pa_high =
+						cpu_to_le32((u32)((((u64)vpbl.pbl_pbase) >> 32)));
+				root_vpbl.leaf_vpbl[root_pbl_index] = vpbl;
+			}
+			root_pbl_index++;
+			cur_pbl_index = 0;
+		}
+
+		mask = !buffer_list[i].size;
+		if (i != 0)
+			mask |= buffer_list[i].addr;
+		if (i != num_phys_buf - 1)
+			mask |= buffer_list[i].addr + buffer_list[i].size;
+
+		if (mask & ~PAGE_MASK) {
+			nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
+			nes_debug(NES_DBG_MR, "Invalid buffer addr or size\n");
+			ibmr = ERR_PTR(-EINVAL);
+			kfree(nesmr);
+			goto reg_phys_err;
+		}
+
+		region_length += buffer_list[i].size;
+		if ((i != 0) && (single_page)) {
+			if ((buffer_list[i-1].addr+PAGE_SIZE) != buffer_list[i].addr)
+				single_page = 0;
+		}
+		vpbl.pbl_vbase[cur_pbl_index].pa_low = cpu_to_le32((u32)buffer_list[i].addr & PAGE_MASK);
+		vpbl.pbl_vbase[cur_pbl_index++].pa_high =
+				cpu_to_le32((u32)((((u64)buffer_list[i].addr) >> 32)));
+	}
+
+	stag = stag_index << 8;
+	stag |= driver_key;
+	stag += (u32)stag_key;
+
+	nes_debug(NES_DBG_MR, "Registering STag 0x%08X, VA = 0x%016lX,"
+			" length = 0x%016lX, index = 0x%08X\n",
+			stag, (unsigned long)*iova_start, (unsigned long)region_length, stag_index);
+
+	/* Make the leaf PBL the root if only one PBL */
+	if (root_pbl_index == 1) {
+		root_vpbl.pbl_pbase = vpbl.pbl_pbase;
+	}
+
+	if (single_page) {
+		pbl_count = 0;
+	} else {
+		pbl_count = root_pbl_index;
+	}
+	ret = nes_reg_mr(nesdev, nespd, stag, region_length, &root_vpbl,
+			buffer_list[0].addr, pbl_count, (u16)cur_pbl_index, acc, iova_start,
+			&nesmr->pbls_used, &nesmr->pbl_4k);
+
+	if (ret == 0) {
+		nesmr->ibmr.rkey = stag;
+		nesmr->ibmr.lkey = stag;
+		nesmr->mode = IWNES_MEMREG_TYPE_MEM;
+		ibmr = &nesmr->ibmr;
+	} else {
+		kfree(nesmr);
+		ibmr = ERR_PTR(-ENOMEM);
+	}
+
+	reg_phys_err:
+	/* free the resources */
+	if (root_pbl_index == 1) {
+		/* single PBL case */
+		pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase, vpbl.pbl_pbase);
+	} else {
+		for (i=0; i<root_pbl_index; i++) {
+			pci_free_consistent(nesdev->pcidev, 4096, root_vpbl.leaf_vpbl[i].pbl_vbase,
+					root_vpbl.leaf_vpbl[i].pbl_pbase);
+		}
+		kfree(root_vpbl.leaf_vpbl);
+		pci_free_consistent(nesdev->pcidev, 8192, root_vpbl.pbl_vbase,
+				root_vpbl.pbl_pbase);
+	}
+
+	return ibmr;
+}
+
+
+/**
+ * nes_get_dma_mr
+ */
+static struct ib_mr *nes_get_dma_mr(struct ib_pd *pd, int acc)
+{
+	struct ib_phys_buf bl;
+	u64 kva = 0;
+
+	nes_debug(NES_DBG_MR, "\n");
+
+	bl.size = (u64)0xffffffffffULL;
+	bl.addr = 0;
+	return nes_reg_phys_mr(pd, &bl, 1, acc, &kva);
+}
+
+
+/**
+ * nes_reg_user_mr
+ */
+static struct ib_mr *nes_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+		u64 virt, int acc, struct ib_udata *udata)
+{
+	u64 iova_start;
+	__le64 *pbl;
+	u64 region_length;
+	dma_addr_t last_dma_addr = 0;
+	dma_addr_t first_dma_addr = 0;
+	struct nes_pd *nespd = to_nespd(pd);
+	struct nes_vnic *nesvnic = to_nesvnic(pd->device);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	struct ib_mr *ibmr = ERR_PTR(-EINVAL);
+	struct scatterlist *sg;
+	struct nes_ucontext *nes_ucontext;
+	struct nes_pbl *nespbl;
+	struct nes_mr *nesmr;
+	struct ib_umem *region;
+	struct nes_mem_reg_req req;
+	struct nes_vpbl vpbl;
+	struct nes_root_vpbl root_vpbl;
+	int entry, page_index;
+	int page_count = 0;
+	int err, pbl_depth = 0;
+	int chunk_pages;
+	int ret;
+	u32 stag;
+	u32 stag_index = 0;
+	u32 next_stag_index;
+	u32 driver_key;
+	u32 root_pbl_index = 0;
+	u32 cur_pbl_index = 0;
+	u32 skip_pages;
+	u16 pbl_count;
+	u8 single_page = 1;
+	u8 stag_key;
+	int first_page = 1;
+
+	region = ib_umem_get(pd->uobject->context, start, length, acc, 0);
+	if (IS_ERR(region)) {
+		return (struct ib_mr *)region;
+	}
+
+	nes_debug(NES_DBG_MR, "User base = 0x%lX, Virt base = 0x%lX, length = %u,"
+			" offset = %u, page size = %u.\n",
+			(unsigned long int)start, (unsigned long int)virt, (u32)length,
+			region->offset, region->page_size);
+
+	skip_pages = ((u32)region->offset) >> 12;
+
+	if (ib_copy_from_udata(&req, udata, sizeof(req))) {
+		ib_umem_release(region);
+		return ERR_PTR(-EFAULT);
+	}
+	nes_debug(NES_DBG_MR, "Memory Registration type = %08X.\n", req.reg_type);
+
+	switch (req.reg_type) {
+		case IWNES_MEMREG_TYPE_MEM:
+			pbl_depth = 0;
+			region_length = 0;
+			vpbl.pbl_vbase = NULL;
+			root_vpbl.pbl_vbase = NULL;
+			root_vpbl.pbl_pbase = 0;
+
+			get_random_bytes(&next_stag_index, sizeof(next_stag_index));
+			stag_key = (u8)next_stag_index;
+
+			driver_key = next_stag_index & 0x70000000;
+
+			next_stag_index >>= 8;
+			next_stag_index %= nesadapter->max_mr;
+
+			err = nes_alloc_resource(nesadapter, nesadapter->allocated_mrs,
+					nesadapter->max_mr, &stag_index, &next_stag_index, NES_RESOURCE_USER_MR);
+			if (err) {
+				ib_umem_release(region);
+				return ERR_PTR(err);
+			}
+
+			nesmr = kzalloc(sizeof(*nesmr), GFP_KERNEL);
+			if (!nesmr) {
+				ib_umem_release(region);
+				nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
+				return ERR_PTR(-ENOMEM);
+			}
+			nesmr->region = region;
+
+			for_each_sg(region->sg_head.sgl, sg, region->nmap, entry) {
+				if (sg_dma_address(sg) & ~PAGE_MASK) {
+					ib_umem_release(region);
+					nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
+					nes_debug(NES_DBG_MR, "Unaligned Memory Buffer: 0x%x\n",
+						  (unsigned int) sg_dma_address(sg));
+					ibmr = ERR_PTR(-EINVAL);
+					kfree(nesmr);
+					goto reg_user_mr_err;
+				}
+
+				if (!sg_dma_len(sg)) {
+					ib_umem_release(region);
+					nes_free_resource(nesadapter, nesadapter->allocated_mrs,
+							  stag_index);
+					nes_debug(NES_DBG_MR, "Invalid Buffer Size\n");
+					ibmr = ERR_PTR(-EINVAL);
+					kfree(nesmr);
+					goto reg_user_mr_err;
+				}
+
+				region_length += sg_dma_len(sg);
+				chunk_pages = sg_dma_len(sg) >> 12;
+				region_length -= skip_pages << 12;
+				for (page_index = skip_pages; page_index < chunk_pages; page_index++) {
+					skip_pages = 0;
+					if ((page_count != 0) && (page_count<<12)-(region->offset&(4096-1)) >= region->length)
+						goto enough_pages;
+					if ((page_count&0x01FF) == 0) {
+						if (page_count >= 1024 * 512) {
+							ib_umem_release(region);
+							nes_free_resource(nesadapter,
+									  nesadapter->allocated_mrs, stag_index);
+							kfree(nesmr);
+							ibmr = ERR_PTR(-E2BIG);
+							goto reg_user_mr_err;
+						}
+						if (root_pbl_index == 1) {
+							root_vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev,
+									8192, &root_vpbl.pbl_pbase);
+							nes_debug(NES_DBG_MR, "Allocating root PBL, va = %p, pa = 0x%08X\n",
+								  root_vpbl.pbl_vbase, (unsigned int)root_vpbl.pbl_pbase);
+							if (!root_vpbl.pbl_vbase) {
+								ib_umem_release(region);
+								pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase,
+										    vpbl.pbl_pbase);
+								nes_free_resource(nesadapter, nesadapter->allocated_mrs,
+										  stag_index);
+								kfree(nesmr);
+								ibmr = ERR_PTR(-ENOMEM);
+								goto reg_user_mr_err;
+							}
+							root_vpbl.leaf_vpbl = kzalloc(sizeof(*root_vpbl.leaf_vpbl)*1024,
+									GFP_KERNEL);
+							if (!root_vpbl.leaf_vpbl) {
+								ib_umem_release(region);
+								pci_free_consistent(nesdev->pcidev, 8192, root_vpbl.pbl_vbase,
+										    root_vpbl.pbl_pbase);
+								pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase,
+										    vpbl.pbl_pbase);
+								nes_free_resource(nesadapter, nesadapter->allocated_mrs,
+										  stag_index);
+								kfree(nesmr);
+								ibmr = ERR_PTR(-ENOMEM);
+								goto reg_user_mr_err;
+							}
+							root_vpbl.pbl_vbase[0].pa_low =
+									cpu_to_le32((u32)vpbl.pbl_pbase);
+							root_vpbl.pbl_vbase[0].pa_high =
+									cpu_to_le32((u32)((((u64)vpbl.pbl_pbase) >> 32)));
+							root_vpbl.leaf_vpbl[0] = vpbl;
+						}
+						vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev, 4096,
+								&vpbl.pbl_pbase);
+						nes_debug(NES_DBG_MR, "Allocating leaf PBL, va = %p, pa = 0x%08X\n",
+							  vpbl.pbl_vbase, (unsigned int)vpbl.pbl_pbase);
+						if (!vpbl.pbl_vbase) {
+							ib_umem_release(region);
+							nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
+							ibmr = ERR_PTR(-ENOMEM);
+							kfree(nesmr);
+							goto reg_user_mr_err;
+						}
+						if (1 <= root_pbl_index) {
+							root_vpbl.pbl_vbase[root_pbl_index].pa_low =
+									cpu_to_le32((u32)vpbl.pbl_pbase);
+							root_vpbl.pbl_vbase[root_pbl_index].pa_high =
+									cpu_to_le32((u32)((((u64)vpbl.pbl_pbase)>>32)));
+							root_vpbl.leaf_vpbl[root_pbl_index] = vpbl;
+						}
+						root_pbl_index++;
+						cur_pbl_index = 0;
+					}
+					if (single_page) {
+						if (page_count != 0) {
+							if ((last_dma_addr+4096) !=
+									(sg_dma_address(sg)+
+									(page_index*4096)))
+								single_page = 0;
+							last_dma_addr = sg_dma_address(sg)+
+									(page_index*4096);
+						} else {
+							first_dma_addr = sg_dma_address(sg)+
+									(page_index*4096);
+							last_dma_addr = first_dma_addr;
+						}
+					}
+
+					vpbl.pbl_vbase[cur_pbl_index].pa_low =
+							cpu_to_le32((u32)(sg_dma_address(sg)+
+							(page_index*4096)));
+					vpbl.pbl_vbase[cur_pbl_index].pa_high =
+							cpu_to_le32((u32)((((u64)(sg_dma_address(sg)+
+							(page_index*4096))) >> 32)));
+					cur_pbl_index++;
+					page_count++;
+				}
+			}
+
+			enough_pages:
+			nes_debug(NES_DBG_MR, "calculating stag, stag_index=0x%08x, driver_key=0x%08x,"
+					" stag_key=0x%08x\n",
+					stag_index, driver_key, stag_key);
+			stag = stag_index << 8;
+			stag |= driver_key;
+			stag += (u32)stag_key;
+
+			iova_start = virt;
+			/* Make the leaf PBL the root if only one PBL */
+			if (root_pbl_index == 1) {
+				root_vpbl.pbl_pbase = vpbl.pbl_pbase;
+			}
+
+			if (single_page) {
+				pbl_count = 0;
+			} else {
+				pbl_count = root_pbl_index;
+				first_dma_addr = 0;
+			}
+			nes_debug(NES_DBG_MR, "Registering STag 0x%08X, VA = 0x%08X, length = 0x%08X,"
+					" index = 0x%08X, region->length=0x%08llx, pbl_count = %u\n",
+					stag, (unsigned int)iova_start,
+					(unsigned int)region_length, stag_index,
+					(unsigned long long)region->length, pbl_count);
+			ret = nes_reg_mr(nesdev, nespd, stag, region->length, &root_vpbl,
+					 first_dma_addr, pbl_count, (u16)cur_pbl_index, acc,
+					 &iova_start, &nesmr->pbls_used, &nesmr->pbl_4k);
+
+			nes_debug(NES_DBG_MR, "ret=%d\n", ret);
+
+			if (ret == 0) {
+				nesmr->ibmr.rkey = stag;
+				nesmr->ibmr.lkey = stag;
+				nesmr->mode = IWNES_MEMREG_TYPE_MEM;
+				ibmr = &nesmr->ibmr;
+			} else {
+				ib_umem_release(region);
+				kfree(nesmr);
+				ibmr = ERR_PTR(-ENOMEM);
+			}
+
+			reg_user_mr_err:
+			/* free the resources */
+			if (root_pbl_index == 1) {
+				pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase,
+						vpbl.pbl_pbase);
+			} else {
+				for (page_index=0; page_index<root_pbl_index; page_index++) {
+					pci_free_consistent(nesdev->pcidev, 4096,
+							root_vpbl.leaf_vpbl[page_index].pbl_vbase,
+							root_vpbl.leaf_vpbl[page_index].pbl_pbase);
+				}
+				kfree(root_vpbl.leaf_vpbl);
+				pci_free_consistent(nesdev->pcidev, 8192, root_vpbl.pbl_vbase,
+						root_vpbl.pbl_pbase);
+			}
+
+			nes_debug(NES_DBG_MR, "Leaving, ibmr=%p", ibmr);
+
+			return ibmr;
+		case IWNES_MEMREG_TYPE_QP:
+		case IWNES_MEMREG_TYPE_CQ:
+			if (!region->length) {
+				nes_debug(NES_DBG_MR, "Unable to register zero length region for CQ\n");
+				ib_umem_release(region);
+				return ERR_PTR(-EINVAL);
+			}
+			nespbl = kzalloc(sizeof(*nespbl), GFP_KERNEL);
+			if (!nespbl) {
+				nes_debug(NES_DBG_MR, "Unable to allocate PBL\n");
+				ib_umem_release(region);
+				return ERR_PTR(-ENOMEM);
+			}
+			nesmr = kzalloc(sizeof(*nesmr), GFP_KERNEL);
+			if (!nesmr) {
+				ib_umem_release(region);
+				kfree(nespbl);
+				nes_debug(NES_DBG_MR, "Unable to allocate nesmr\n");
+				return ERR_PTR(-ENOMEM);
+			}
+			nesmr->region = region;
+			nes_ucontext = to_nesucontext(pd->uobject->context);
+			pbl_depth = region->length >> 12;
+			pbl_depth += (region->length & (4096-1)) ? 1 : 0;
+			nespbl->pbl_size = pbl_depth*sizeof(u64);
+			if (req.reg_type == IWNES_MEMREG_TYPE_QP) {
+				nes_debug(NES_DBG_MR, "Attempting to allocate QP PBL memory");
+			} else {
+				nes_debug(NES_DBG_MR, "Attempting to allocate CP PBL memory");
+			}
+
+			nes_debug(NES_DBG_MR, " %u bytes, %u entries.\n",
+					nespbl->pbl_size, pbl_depth);
+			pbl = pci_alloc_consistent(nesdev->pcidev, nespbl->pbl_size,
+					&nespbl->pbl_pbase);
+			if (!pbl) {
+				ib_umem_release(region);
+				kfree(nesmr);
+				kfree(nespbl);
+				nes_debug(NES_DBG_MR, "Unable to allocate PBL memory\n");
+				return ERR_PTR(-ENOMEM);
+			}
+
+			nespbl->pbl_vbase = (u64 *)pbl;
+			nespbl->user_base = start;
+			nes_debug(NES_DBG_MR, "Allocated PBL memory, %u bytes, pbl_pbase=%lx,"
+					" pbl_vbase=%p user_base=0x%lx\n",
+				  nespbl->pbl_size, (unsigned long) nespbl->pbl_pbase,
+				  (void *) nespbl->pbl_vbase, nespbl->user_base);
+
+			for_each_sg(region->sg_head.sgl, sg, region->nmap, entry) {
+				chunk_pages = sg_dma_len(sg) >> 12;
+				chunk_pages += (sg_dma_len(sg) & (4096-1)) ? 1 : 0;
+				if (first_page) {
+					nespbl->page = sg_page(sg);
+					first_page = 0;
+				}
+
+				for (page_index = 0; page_index < chunk_pages; page_index++) {
+					((__le32 *)pbl)[0] = cpu_to_le32((u32)
+							(sg_dma_address(sg)+
+							(page_index*4096)));
+					((__le32 *)pbl)[1] = cpu_to_le32(((u64)
+							(sg_dma_address(sg)+
+							(page_index*4096)))>>32);
+					nes_debug(NES_DBG_MR, "pbl=%p, *pbl=0x%016llx, 0x%08x%08x\n", pbl,
+						  (unsigned long long)*pbl,
+						  le32_to_cpu(((__le32 *)pbl)[1]), le32_to_cpu(((__le32 *)pbl)[0]));
+					pbl++;
+				}
+			}
+
+			if (req.reg_type == IWNES_MEMREG_TYPE_QP) {
+				list_add_tail(&nespbl->list, &nes_ucontext->qp_reg_mem_list);
+			} else {
+				list_add_tail(&nespbl->list, &nes_ucontext->cq_reg_mem_list);
+			}
+			nesmr->ibmr.rkey = -1;
+			nesmr->ibmr.lkey = -1;
+			nesmr->mode = req.reg_type;
+			return &nesmr->ibmr;
+	}
+
+	ib_umem_release(region);
+	return ERR_PTR(-ENOSYS);
+}
+
+
+/**
+ * nes_dereg_mr
+ */
+static int nes_dereg_mr(struct ib_mr *ib_mr)
+{
+	struct nes_mr *nesmr = to_nesmr(ib_mr);
+	struct nes_vnic *nesvnic = to_nesvnic(ib_mr->device);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	struct nes_hw_cqp_wqe *cqp_wqe;
+	struct nes_cqp_request *cqp_request;
+	unsigned long flags;
+	int ret;
+	u16 major_code;
+	u16 minor_code;
+
+	if (nesmr->region) {
+		ib_umem_release(nesmr->region);
+	}
+	if (nesmr->mode != IWNES_MEMREG_TYPE_MEM) {
+		kfree(nesmr);
+		return 0;
+	}
+
+	/* Deallocate the region with the adapter */
+
+	cqp_request = nes_get_cqp_request(nesdev);
+	if (cqp_request == NULL) {
+		nes_debug(NES_DBG_MR, "Failed to get a cqp_request.\n");
+		return -ENOMEM;
+	}
+	cqp_request->waiting = 1;
+	cqp_wqe = &cqp_request->cqp_wqe;
+
+	nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX,
+			NES_CQP_DEALLOCATE_STAG | NES_CQP_STAG_VA_TO |
+			NES_CQP_STAG_DEALLOC_PBLS | NES_CQP_STAG_MR);
+	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_STAG_IDX, ib_mr->rkey);
+
+	atomic_set(&cqp_request->refcount, 2);
+	nes_post_cqp_request(nesdev, cqp_request);
+
+	/* Wait for CQP */
+	nes_debug(NES_DBG_MR, "Waiting for deallocate STag 0x%08X completed\n", ib_mr->rkey);
+	ret = wait_event_timeout(cqp_request->waitq, (cqp_request->request_done != 0),
+			NES_EVENT_TIMEOUT);
+	nes_debug(NES_DBG_MR, "Deallocate STag 0x%08X completed, wait_event_timeout ret = %u,"
+			" CQP Major:Minor codes = 0x%04X:0x%04X\n",
+			ib_mr->rkey, ret, cqp_request->major_code, cqp_request->minor_code);
+
+	major_code = cqp_request->major_code;
+	minor_code = cqp_request->minor_code;
+
+	nes_put_cqp_request(nesdev, cqp_request);
+
+	if (!ret) {
+		nes_debug(NES_DBG_MR, "Timeout waiting to destroy STag,"
+				" ib_mr=%p, rkey = 0x%08X\n",
+				ib_mr, ib_mr->rkey);
+		return -ETIME;
+	} else if (major_code) {
+		nes_debug(NES_DBG_MR, "Error (0x%04X:0x%04X) while attempting"
+				" to destroy STag, ib_mr=%p, rkey = 0x%08X\n",
+				major_code, minor_code, ib_mr, ib_mr->rkey);
+		return -EIO;
+	}
+
+	if (nesmr->pbls_used != 0) {
+		spin_lock_irqsave(&nesadapter->pbl_lock, flags);
+		if (nesmr->pbl_4k) {
+			nesadapter->free_4kpbl += nesmr->pbls_used;
+			if (nesadapter->free_4kpbl > nesadapter->max_4kpbl)
+				printk(KERN_ERR PFX "free 4KB PBLs(%u) has "
+					"exceeded the max(%u)\n",
+					nesadapter->free_4kpbl,
+					nesadapter->max_4kpbl);
+		} else {
+			nesadapter->free_256pbl += nesmr->pbls_used;
+			if (nesadapter->free_256pbl > nesadapter->max_256pbl)
+				printk(KERN_ERR PFX "free 256B PBLs(%u) has "
+					"exceeded the max(%u)\n",
+					nesadapter->free_256pbl,
+					nesadapter->max_256pbl);
+		}
+		spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+	}
+	nes_free_resource(nesadapter, nesadapter->allocated_mrs,
+			(ib_mr->rkey & 0x0fffff00) >> 8);
+
+	kfree(nesmr);
+
+	return 0;
+}
+
+
+/**
+ * show_rev
+ */
+static ssize_t show_rev(struct device *dev, struct device_attribute *attr,
+			char *buf)
+{
+	struct nes_ib_device *nesibdev =
+			container_of(dev, struct nes_ib_device, ibdev.dev);
+	struct nes_vnic *nesvnic = nesibdev->nesvnic;
+
+	nes_debug(NES_DBG_INIT, "\n");
+	return sprintf(buf, "%x\n", nesvnic->nesdev->nesadapter->hw_rev);
+}
+
+
+/**
+ * show_fw_ver
+ */
+static ssize_t show_fw_ver(struct device *dev, struct device_attribute *attr,
+			   char *buf)
+{
+	struct nes_ib_device *nesibdev =
+			container_of(dev, struct nes_ib_device, ibdev.dev);
+	struct nes_vnic *nesvnic = nesibdev->nesvnic;
+
+	nes_debug(NES_DBG_INIT, "\n");
+	return sprintf(buf, "%u.%u\n",
+		(nesvnic->nesdev->nesadapter->firmware_version >> 16),
+		(nesvnic->nesdev->nesadapter->firmware_version & 0x000000ff));
+}
+
+
+/**
+ * show_hca
+ */
+static ssize_t show_hca(struct device *dev, struct device_attribute *attr,
+		        char *buf)
+{
+	nes_debug(NES_DBG_INIT, "\n");
+	return sprintf(buf, "NES020\n");
+}
+
+
+/**
+ * show_board
+ */
+static ssize_t show_board(struct device *dev, struct device_attribute *attr,
+			  char *buf)
+{
+	nes_debug(NES_DBG_INIT, "\n");
+	return sprintf(buf, "%.*s\n", 32, "NES020 Board ID");
+}
+
+
+static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
+static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL);
+static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL);
+static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL);
+
+static struct device_attribute *nes_dev_attributes[] = {
+	&dev_attr_hw_rev,
+	&dev_attr_fw_ver,
+	&dev_attr_hca_type,
+	&dev_attr_board_id
+};
+
+
+/**
+ * nes_query_qp
+ */
+static int nes_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+		int attr_mask, struct ib_qp_init_attr *init_attr)
+{
+	struct nes_qp *nesqp = to_nesqp(ibqp);
+
+	nes_debug(NES_DBG_QP, "\n");
+
+	attr->qp_access_flags = 0;
+	attr->cap.max_send_wr = nesqp->hwqp.sq_size;
+	attr->cap.max_recv_wr = nesqp->hwqp.rq_size;
+	attr->cap.max_recv_sge = 1;
+	if (nes_drv_opt & NES_DRV_OPT_NO_INLINE_DATA)
+		attr->cap.max_inline_data = 0;
+	else
+		attr->cap.max_inline_data = 64;
+
+	init_attr->event_handler = nesqp->ibqp.event_handler;
+	init_attr->qp_context = nesqp->ibqp.qp_context;
+	init_attr->send_cq = nesqp->ibqp.send_cq;
+	init_attr->recv_cq = nesqp->ibqp.recv_cq;
+	init_attr->srq = nesqp->ibqp.srq;
+	init_attr->cap = attr->cap;
+
+	return 0;
+}
+
+
+/**
+ * nes_hw_modify_qp
+ */
+int nes_hw_modify_qp(struct nes_device *nesdev, struct nes_qp *nesqp,
+		u32 next_iwarp_state, u32 termlen, u32 wait_completion)
+{
+	struct nes_hw_cqp_wqe *cqp_wqe;
+	/* struct iw_cm_id *cm_id = nesqp->cm_id; */
+	/* struct iw_cm_event cm_event; */
+	struct nes_cqp_request *cqp_request;
+	int ret;
+	u16 major_code;
+
+	nes_debug(NES_DBG_MOD_QP, "QP%u, refcount=%d\n",
+			nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount));
+
+	cqp_request = nes_get_cqp_request(nesdev);
+	if (cqp_request == NULL) {
+		nes_debug(NES_DBG_MOD_QP, "Failed to get a cqp_request.\n");
+		return -ENOMEM;
+	}
+	if (wait_completion) {
+		cqp_request->waiting = 1;
+	} else {
+		cqp_request->waiting = 0;
+	}
+	cqp_wqe = &cqp_request->cqp_wqe;
+
+	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX,
+			NES_CQP_MODIFY_QP | NES_CQP_QP_TYPE_IWARP | next_iwarp_state);
+	nes_debug(NES_DBG_MOD_QP, "using next_iwarp_state=%08x, wqe_words=%08x\n",
+			next_iwarp_state, le32_to_cpu(cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX]));
+	nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+	set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, nesqp->hwqp.qp_id);
+	set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, (u64)nesqp->nesqp_context_pbase);
+
+	/* If sending a terminate message, fill in the length (in words) */
+	if (((next_iwarp_state & NES_CQP_QP_IWARP_STATE_MASK) == NES_CQP_QP_IWARP_STATE_TERMINATE) &&
+	    !(next_iwarp_state & NES_CQP_QP_TERM_DONT_SEND_TERM_MSG)) {
+		termlen = ((termlen + 3) >> 2) << NES_CQP_OP_TERMLEN_SHIFT;
+		set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_NEW_MSS_IDX, termlen);
+	}
+
+	atomic_set(&cqp_request->refcount, 2);
+	nes_post_cqp_request(nesdev, cqp_request);
+
+	/* Wait for CQP */
+	if (wait_completion) {
+		/* nes_debug(NES_DBG_MOD_QP, "Waiting for modify iWARP QP%u to complete.\n",
+				nesqp->hwqp.qp_id); */
+		ret = wait_event_timeout(cqp_request->waitq, (cqp_request->request_done != 0),
+				NES_EVENT_TIMEOUT);
+		nes_debug(NES_DBG_MOD_QP, "Modify iwarp QP%u completed, wait_event_timeout ret=%u, "
+				"CQP Major:Minor codes = 0x%04X:0x%04X.\n",
+				nesqp->hwqp.qp_id, ret, cqp_request->major_code, cqp_request->minor_code);
+		major_code = cqp_request->major_code;
+		if (major_code) {
+			nes_debug(NES_DBG_MOD_QP, "Modify iwarp QP%u failed"
+					"CQP Major:Minor codes = 0x%04X:0x%04X, intended next state = 0x%08X.\n",
+					nesqp->hwqp.qp_id, cqp_request->major_code,
+					cqp_request->minor_code, next_iwarp_state);
+		}
+
+		nes_put_cqp_request(nesdev, cqp_request);
+
+		if (!ret)
+			return -ETIME;
+		else if (major_code)
+			return -EIO;
+		else
+			return 0;
+	} else {
+		return 0;
+	}
+}
+
+
+/**
+ * nes_modify_qp
+ */
+int nes_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+		int attr_mask, struct ib_udata *udata)
+{
+	struct nes_qp *nesqp = to_nesqp(ibqp);
+	struct nes_vnic *nesvnic = to_nesvnic(ibqp->device);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	/* u32 cqp_head; */
+	/* u32 counter; */
+	u32 next_iwarp_state = 0;
+	int err;
+	unsigned long qplockflags;
+	int ret;
+	u16 original_last_aeq;
+	u8 issue_modify_qp = 0;
+	u8 dont_wait = 0;
+
+	nes_debug(NES_DBG_MOD_QP, "QP%u: QP State=%u, cur QP State=%u,"
+			" iwarp_state=0x%X, refcount=%d\n",
+			nesqp->hwqp.qp_id, attr->qp_state, nesqp->ibqp_state,
+			nesqp->iwarp_state, atomic_read(&nesqp->refcount));
+
+	spin_lock_irqsave(&nesqp->lock, qplockflags);
+
+	nes_debug(NES_DBG_MOD_QP, "QP%u: hw_iwarp_state=0x%X, hw_tcp_state=0x%X,"
+			" QP Access Flags=0x%X, attr_mask = 0x%0x\n",
+			nesqp->hwqp.qp_id, nesqp->hw_iwarp_state,
+			nesqp->hw_tcp_state, attr->qp_access_flags, attr_mask);
+
+	if (attr_mask & IB_QP_STATE) {
+		switch (attr->qp_state) {
+			case IB_QPS_INIT:
+				nes_debug(NES_DBG_MOD_QP, "QP%u: new state = init\n",
+						nesqp->hwqp.qp_id);
+				if (nesqp->iwarp_state > (u32)NES_CQP_QP_IWARP_STATE_IDLE) {
+					spin_unlock_irqrestore(&nesqp->lock, qplockflags);
+					return -EINVAL;
+				}
+				next_iwarp_state = NES_CQP_QP_IWARP_STATE_IDLE;
+				issue_modify_qp = 1;
+				break;
+			case IB_QPS_RTR:
+				nes_debug(NES_DBG_MOD_QP, "QP%u: new state = rtr\n",
+						nesqp->hwqp.qp_id);
+				if (nesqp->iwarp_state>(u32)NES_CQP_QP_IWARP_STATE_IDLE) {
+					spin_unlock_irqrestore(&nesqp->lock, qplockflags);
+					return -EINVAL;
+				}
+				next_iwarp_state = NES_CQP_QP_IWARP_STATE_IDLE;
+				issue_modify_qp = 1;
+				break;
+			case IB_QPS_RTS:
+				nes_debug(NES_DBG_MOD_QP, "QP%u: new state = rts\n",
+						nesqp->hwqp.qp_id);
+				if (nesqp->iwarp_state>(u32)NES_CQP_QP_IWARP_STATE_RTS) {
+					spin_unlock_irqrestore(&nesqp->lock, qplockflags);
+					return -EINVAL;
+				}
+				if (nesqp->cm_id == NULL) {
+					nes_debug(NES_DBG_MOD_QP, "QP%u: Failing attempt to move QP to RTS without a CM_ID. \n",
+							nesqp->hwqp.qp_id );
+					spin_unlock_irqrestore(&nesqp->lock, qplockflags);
+					return -EINVAL;
+				}
+				next_iwarp_state = NES_CQP_QP_IWARP_STATE_RTS;
+				if (nesqp->iwarp_state != NES_CQP_QP_IWARP_STATE_RTS)
+					next_iwarp_state |= NES_CQP_QP_CONTEXT_VALID |
+							NES_CQP_QP_ARP_VALID | NES_CQP_QP_ORD_VALID;
+				issue_modify_qp = 1;
+				nesqp->hw_tcp_state = NES_AEQE_TCP_STATE_ESTABLISHED;
+				nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_RTS;
+				nesqp->hte_added = 1;
+				break;
+			case IB_QPS_SQD:
+				issue_modify_qp = 1;
+				nes_debug(NES_DBG_MOD_QP, "QP%u: new state=closing. SQ head=%u, SQ tail=%u\n",
+						nesqp->hwqp.qp_id, nesqp->hwqp.sq_head, nesqp->hwqp.sq_tail);
+				if (nesqp->iwarp_state == (u32)NES_CQP_QP_IWARP_STATE_CLOSING) {
+					spin_unlock_irqrestore(&nesqp->lock, qplockflags);
+					return 0;
+				} else {
+					if (nesqp->iwarp_state > (u32)NES_CQP_QP_IWARP_STATE_CLOSING) {
+						nes_debug(NES_DBG_MOD_QP, "QP%u: State change to closing"
+								" ignored due to current iWARP state\n",
+								nesqp->hwqp.qp_id);
+						spin_unlock_irqrestore(&nesqp->lock, qplockflags);
+						return -EINVAL;
+					}
+					if (nesqp->hw_iwarp_state != NES_AEQE_IWARP_STATE_RTS) {
+						nes_debug(NES_DBG_MOD_QP, "QP%u: State change to closing"
+								" already done based on hw state.\n",
+								nesqp->hwqp.qp_id);
+						issue_modify_qp = 0;
+					}
+					switch (nesqp->hw_iwarp_state) {
+						case NES_AEQE_IWARP_STATE_CLOSING:
+							next_iwarp_state = NES_CQP_QP_IWARP_STATE_CLOSING;
+							break;
+						case NES_AEQE_IWARP_STATE_TERMINATE:
+							next_iwarp_state = NES_CQP_QP_IWARP_STATE_TERMINATE;
+							break;
+						case NES_AEQE_IWARP_STATE_ERROR:
+							next_iwarp_state = NES_CQP_QP_IWARP_STATE_ERROR;
+							break;
+						default:
+							next_iwarp_state = NES_CQP_QP_IWARP_STATE_CLOSING;
+							nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_CLOSING;
+							break;
+					}
+				}
+				break;
+			case IB_QPS_SQE:
+				nes_debug(NES_DBG_MOD_QP, "QP%u: new state = terminate\n",
+						nesqp->hwqp.qp_id);
+				if (nesqp->iwarp_state>=(u32)NES_CQP_QP_IWARP_STATE_TERMINATE) {
+					spin_unlock_irqrestore(&nesqp->lock, qplockflags);
+					return -EINVAL;
+				}
+				/* next_iwarp_state = (NES_CQP_QP_IWARP_STATE_TERMINATE | 0x02000000); */
+				next_iwarp_state = NES_CQP_QP_IWARP_STATE_TERMINATE;
+				nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_TERMINATE;
+				issue_modify_qp = 1;
+				break;
+			case IB_QPS_ERR:
+			case IB_QPS_RESET:
+				if (nesqp->iwarp_state == (u32)NES_CQP_QP_IWARP_STATE_ERROR) {
+					spin_unlock_irqrestore(&nesqp->lock, qplockflags);
+					return -EINVAL;
+				}
+				nes_debug(NES_DBG_MOD_QP, "QP%u: new state = error\n",
+						nesqp->hwqp.qp_id);
+				if (nesqp->term_flags)
+					del_timer(&nesqp->terminate_timer);
+
+				next_iwarp_state = NES_CQP_QP_IWARP_STATE_ERROR;
+				/* next_iwarp_state = (NES_CQP_QP_IWARP_STATE_TERMINATE | 0x02000000); */
+					if (nesqp->hte_added) {
+						nes_debug(NES_DBG_MOD_QP, "set CQP_QP_DEL_HTE\n");
+						next_iwarp_state |= NES_CQP_QP_DEL_HTE;
+						nesqp->hte_added = 0;
+					}
+				if ((nesqp->hw_tcp_state > NES_AEQE_TCP_STATE_CLOSED) &&
+						(nesdev->iw_status) &&
+						(nesqp->hw_tcp_state != NES_AEQE_TCP_STATE_TIME_WAIT)) {
+					next_iwarp_state |= NES_CQP_QP_RESET;
+				} else {
+					nes_debug(NES_DBG_MOD_QP, "QP%u NOT setting NES_CQP_QP_RESET since TCP state = %u\n",
+							nesqp->hwqp.qp_id, nesqp->hw_tcp_state);
+					dont_wait = 1;
+				}
+				issue_modify_qp = 1;
+				nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_ERROR;
+				break;
+			default:
+				spin_unlock_irqrestore(&nesqp->lock, qplockflags);
+				return -EINVAL;
+				break;
+		}
+
+		nesqp->ibqp_state = attr->qp_state;
+		nesqp->iwarp_state = next_iwarp_state & NES_CQP_QP_IWARP_STATE_MASK;
+		nes_debug(NES_DBG_MOD_QP, "Change nesqp->iwarp_state=%08x\n",
+				nesqp->iwarp_state);
+	}
+
+	if (attr_mask & IB_QP_ACCESS_FLAGS) {
+		if (attr->qp_access_flags & IB_ACCESS_LOCAL_WRITE) {
+			nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_RDMA_WRITE_EN |
+					NES_QPCONTEXT_MISC_RDMA_READ_EN);
+			issue_modify_qp = 1;
+		}
+		if (attr->qp_access_flags & IB_ACCESS_REMOTE_WRITE) {
+			nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_RDMA_WRITE_EN);
+			issue_modify_qp = 1;
+		}
+		if (attr->qp_access_flags & IB_ACCESS_REMOTE_READ) {
+			nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_RDMA_READ_EN);
+			issue_modify_qp = 1;
+		}
+		if (attr->qp_access_flags & IB_ACCESS_MW_BIND) {
+			nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_WBIND_EN);
+			issue_modify_qp = 1;
+		}
+
+		if (nesqp->user_mode) {
+			nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_RDMA_WRITE_EN |
+					NES_QPCONTEXT_MISC_RDMA_READ_EN);
+			issue_modify_qp = 1;
+		}
+	}
+
+	original_last_aeq = nesqp->last_aeq;
+	spin_unlock_irqrestore(&nesqp->lock, qplockflags);
+
+	nes_debug(NES_DBG_MOD_QP, "issue_modify_qp=%u\n", issue_modify_qp);
+
+	ret = 0;
+
+
+	if (issue_modify_qp) {
+		nes_debug(NES_DBG_MOD_QP, "call nes_hw_modify_qp\n");
+		ret = nes_hw_modify_qp(nesdev, nesqp, next_iwarp_state, 0, 1);
+		if (ret)
+			nes_debug(NES_DBG_MOD_QP, "nes_hw_modify_qp (next_iwarp_state = 0x%08X)"
+					" failed for QP%u.\n",
+					next_iwarp_state, nesqp->hwqp.qp_id);
+
+	}
+
+	if ((issue_modify_qp) && (nesqp->ibqp_state > IB_QPS_RTS)) {
+		nes_debug(NES_DBG_MOD_QP, "QP%u Issued ModifyQP refcount (%d),"
+				" original_last_aeq = 0x%04X. last_aeq = 0x%04X.\n",
+				nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount),
+				original_last_aeq, nesqp->last_aeq);
+		if (!ret || original_last_aeq != NES_AEQE_AEID_RDMAP_ROE_BAD_LLP_CLOSE) {
+			if (dont_wait) {
+				if (nesqp->cm_id && nesqp->hw_tcp_state != 0) {
+					nes_debug(NES_DBG_MOD_QP, "QP%u Queuing fake disconnect for QP refcount (%d),"
+							" original_last_aeq = 0x%04X. last_aeq = 0x%04X.\n",
+							nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount),
+							original_last_aeq, nesqp->last_aeq);
+					/* this one is for the cm_disconnect thread */
+					spin_lock_irqsave(&nesqp->lock, qplockflags);
+					nesqp->hw_tcp_state = NES_AEQE_TCP_STATE_CLOSED;
+					nesqp->last_aeq = NES_AEQE_AEID_RESET_SENT;
+					spin_unlock_irqrestore(&nesqp->lock, qplockflags);
+					nes_cm_disconn(nesqp);
+				} else {
+					nes_debug(NES_DBG_MOD_QP, "QP%u No fake disconnect, QP refcount=%d\n",
+							nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount));
+				}
+			} else {
+				spin_lock_irqsave(&nesqp->lock, qplockflags);
+				if (nesqp->cm_id) {
+					/* These two are for the timer thread */
+					if (atomic_inc_return(&nesqp->close_timer_started) == 1) {
+						nesqp->cm_id->add_ref(nesqp->cm_id);
+						nes_debug(NES_DBG_MOD_QP, "QP%u Not decrementing QP refcount (%d),"
+								" need ae to finish up, original_last_aeq = 0x%04X."
+								" last_aeq = 0x%04X, scheduling timer.\n",
+								nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount),
+								original_last_aeq, nesqp->last_aeq);
+						schedule_nes_timer(nesqp->cm_node, (struct sk_buff *) nesqp, NES_TIMER_TYPE_CLOSE, 1, 0);
+					}
+					spin_unlock_irqrestore(&nesqp->lock, qplockflags);
+				} else {
+					spin_unlock_irqrestore(&nesqp->lock, qplockflags);
+					nes_debug(NES_DBG_MOD_QP, "QP%u Not decrementing QP refcount (%d),"
+							" need ae to finish up, original_last_aeq = 0x%04X."
+							" last_aeq = 0x%04X.\n",
+							nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount),
+							original_last_aeq, nesqp->last_aeq);
+				}
+			}
+		} else {
+			nes_debug(NES_DBG_MOD_QP, "QP%u Decrementing QP refcount (%d), No ae to finish up,"
+					" original_last_aeq = 0x%04X. last_aeq = 0x%04X.\n",
+					nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount),
+					original_last_aeq, nesqp->last_aeq);
+		}
+	} else {
+		nes_debug(NES_DBG_MOD_QP, "QP%u Decrementing QP refcount (%d), No ae to finish up,"
+				" original_last_aeq = 0x%04X. last_aeq = 0x%04X.\n",
+				nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount),
+				original_last_aeq, nesqp->last_aeq);
+	}
+
+	err = 0;
+
+	nes_debug(NES_DBG_MOD_QP, "QP%u Leaving, refcount=%d\n",
+			nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount));
+
+	return err;
+}
+
+
+/**
+ * nes_muticast_attach
+ */
+static int nes_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+	nes_debug(NES_DBG_INIT, "\n");
+	return -ENOSYS;
+}
+
+
+/**
+ * nes_multicast_detach
+ */
+static int nes_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+	nes_debug(NES_DBG_INIT, "\n");
+	return -ENOSYS;
+}
+
+
+/**
+ * nes_process_mad
+ */
+static int nes_process_mad(struct ib_device *ibdev, int mad_flags,
+		u8 port_num, struct ib_wc *in_wc, struct ib_grh *in_grh,
+		struct ib_mad *in_mad, struct ib_mad *out_mad)
+{
+	nes_debug(NES_DBG_INIT, "\n");
+	return -ENOSYS;
+}
+
+static inline void
+fill_wqe_sg_send(struct nes_hw_qp_wqe *wqe, struct ib_send_wr *ib_wr, u32 uselkey)
+{
+	int sge_index;
+	int total_payload_length = 0;
+	for (sge_index = 0; sge_index < ib_wr->num_sge; sge_index++) {
+		set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_FRAG0_LOW_IDX+(sge_index*4),
+			ib_wr->sg_list[sge_index].addr);
+		set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_LENGTH0_IDX + (sge_index*4),
+			ib_wr->sg_list[sge_index].length);
+		if (uselkey)
+			set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_STAG0_IDX + (sge_index*4),
+						(ib_wr->sg_list[sge_index].lkey));
+		else
+			set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_STAG0_IDX + (sge_index*4), 0);
+
+		total_payload_length += ib_wr->sg_list[sge_index].length;
+	}
+	nes_debug(NES_DBG_IW_TX, "UC UC UC, sending total_payload_length=%u \n",
+			total_payload_length);
+	set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX,
+				total_payload_length);
+}
+
+/**
+ * nes_post_send
+ */
+static int nes_post_send(struct ib_qp *ibqp, struct ib_send_wr *ib_wr,
+		struct ib_send_wr **bad_wr)
+{
+	u64 u64temp;
+	unsigned long flags = 0;
+	struct nes_vnic *nesvnic = to_nesvnic(ibqp->device);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	struct nes_qp *nesqp = to_nesqp(ibqp);
+	struct nes_hw_qp_wqe *wqe;
+	int err = 0;
+	u32 qsize = nesqp->hwqp.sq_size;
+	u32 head;
+	u32 wqe_misc = 0;
+	u32 wqe_count = 0;
+	u32 counter;
+
+	if (nesqp->ibqp_state > IB_QPS_RTS) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	spin_lock_irqsave(&nesqp->lock, flags);
+
+	head = nesqp->hwqp.sq_head;
+
+	while (ib_wr) {
+		/* Check for QP error */
+		if (nesqp->term_flags) {
+			err = -EINVAL;
+			break;
+		}
+
+		/* Check for SQ overflow */
+		if (((head + (2 * qsize) - nesqp->hwqp.sq_tail) % qsize) == (qsize - 1)) {
+			err = -ENOMEM;
+			break;
+		}
+
+		wqe = &nesqp->hwqp.sq_vbase[head];
+		/* nes_debug(NES_DBG_IW_TX, "processing sq wqe for QP%u at %p, head = %u.\n",
+				nesqp->hwqp.qp_id, wqe, head); */
+		nes_fill_init_qp_wqe(wqe, nesqp, head);
+		u64temp = (u64)(ib_wr->wr_id);
+		set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_COMP_SCRATCH_LOW_IDX,
+					u64temp);
+		switch (ib_wr->opcode) {
+		case IB_WR_SEND:
+		case IB_WR_SEND_WITH_INV:
+			if (IB_WR_SEND == ib_wr->opcode) {
+				if (ib_wr->send_flags & IB_SEND_SOLICITED)
+					wqe_misc = NES_IWARP_SQ_OP_SENDSE;
+				else
+					wqe_misc = NES_IWARP_SQ_OP_SEND;
+			} else {
+				if (ib_wr->send_flags & IB_SEND_SOLICITED)
+					wqe_misc = NES_IWARP_SQ_OP_SENDSEINV;
+				else
+					wqe_misc = NES_IWARP_SQ_OP_SENDINV;
+
+				set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_INV_STAG_LOW_IDX,
+						    ib_wr->ex.invalidate_rkey);
+			}
+
+			if (ib_wr->num_sge > nesdev->nesadapter->max_sge) {
+				err = -EINVAL;
+				break;
+			}
+
+			if (ib_wr->send_flags & IB_SEND_FENCE)
+				wqe_misc |= NES_IWARP_SQ_WQE_LOCAL_FENCE;
+
+			if ((ib_wr->send_flags & IB_SEND_INLINE) &&
+			    ((nes_drv_opt & NES_DRV_OPT_NO_INLINE_DATA) == 0) &&
+			     (ib_wr->sg_list[0].length <= 64)) {
+				memcpy(&wqe->wqe_words[NES_IWARP_SQ_WQE_IMM_DATA_START_IDX],
+				       (void *)(unsigned long)ib_wr->sg_list[0].addr, ib_wr->sg_list[0].length);
+				set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX,
+						    ib_wr->sg_list[0].length);
+				wqe_misc |= NES_IWARP_SQ_WQE_IMM_DATA;
+			} else {
+				fill_wqe_sg_send(wqe, ib_wr, 1);
+			}
+
+			break;
+		case IB_WR_RDMA_WRITE:
+			wqe_misc = NES_IWARP_SQ_OP_RDMAW;
+			if (ib_wr->num_sge > nesdev->nesadapter->max_sge) {
+				nes_debug(NES_DBG_IW_TX, "Exceeded max sge, ib_wr=%u, max=%u\n",
+					  ib_wr->num_sge, nesdev->nesadapter->max_sge);
+				err = -EINVAL;
+				break;
+			}
+
+			if (ib_wr->send_flags & IB_SEND_FENCE)
+				wqe_misc |= NES_IWARP_SQ_WQE_LOCAL_FENCE;
+
+			set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_RDMA_STAG_IDX,
+					    ib_wr->wr.rdma.rkey);
+			set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_RDMA_TO_LOW_IDX,
+					    ib_wr->wr.rdma.remote_addr);
+
+			if ((ib_wr->send_flags & IB_SEND_INLINE) &&
+			    ((nes_drv_opt & NES_DRV_OPT_NO_INLINE_DATA) == 0) &&
+			     (ib_wr->sg_list[0].length <= 64)) {
+				memcpy(&wqe->wqe_words[NES_IWARP_SQ_WQE_IMM_DATA_START_IDX],
+				       (void *)(unsigned long)ib_wr->sg_list[0].addr, ib_wr->sg_list[0].length);
+				set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX,
+						    ib_wr->sg_list[0].length);
+				wqe_misc |= NES_IWARP_SQ_WQE_IMM_DATA;
+			} else {
+				fill_wqe_sg_send(wqe, ib_wr, 1);
+			}
+
+			wqe->wqe_words[NES_IWARP_SQ_WQE_RDMA_LENGTH_IDX] =
+				wqe->wqe_words[NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX];
+			break;
+		case IB_WR_RDMA_READ:
+		case IB_WR_RDMA_READ_WITH_INV:
+			/* iWARP only supports 1 sge for RDMA reads */
+			if (ib_wr->num_sge > 1) {
+				nes_debug(NES_DBG_IW_TX, "Exceeded max sge, ib_wr=%u, max=1\n",
+					  ib_wr->num_sge);
+				err = -EINVAL;
+				break;
+			}
+			if (ib_wr->opcode == IB_WR_RDMA_READ) {
+				wqe_misc = NES_IWARP_SQ_OP_RDMAR;
+			} else {
+				wqe_misc = NES_IWARP_SQ_OP_RDMAR_LOCINV;
+				set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_INV_STAG_LOW_IDX,
+						    ib_wr->ex.invalidate_rkey);
+			}
+
+			set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_RDMA_TO_LOW_IDX,
+					    ib_wr->wr.rdma.remote_addr);
+			set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_RDMA_STAG_IDX,
+					    ib_wr->wr.rdma.rkey);
+			set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_RDMA_LENGTH_IDX,
+					    ib_wr->sg_list->length);
+			set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_FRAG0_LOW_IDX,
+					    ib_wr->sg_list->addr);
+			set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_STAG0_IDX,
+					    ib_wr->sg_list->lkey);
+			break;
+		case IB_WR_LOCAL_INV:
+			wqe_misc = NES_IWARP_SQ_OP_LOCINV;
+			set_wqe_32bit_value(wqe->wqe_words,
+					    NES_IWARP_SQ_LOCINV_WQE_INV_STAG_IDX,
+					    ib_wr->ex.invalidate_rkey);
+			break;
+		case IB_WR_FAST_REG_MR:
+		{
+			int i;
+			int flags = ib_wr->wr.fast_reg.access_flags;
+			struct nes_ib_fast_reg_page_list *pnesfrpl =
+				container_of(ib_wr->wr.fast_reg.page_list,
+					     struct nes_ib_fast_reg_page_list,
+					     ibfrpl);
+			u64 *src_page_list = pnesfrpl->ibfrpl.page_list;
+			u64 *dst_page_list = pnesfrpl->nes_wqe_pbl.kva;
+
+			if (ib_wr->wr.fast_reg.page_list_len >
+			    (NES_4K_PBL_CHUNK_SIZE / sizeof(u64))) {
+				nes_debug(NES_DBG_IW_TX, "SQ_FMR: bad page_list_len\n");
+				err = -EINVAL;
+				break;
+			}
+			wqe_misc = NES_IWARP_SQ_OP_FAST_REG;
+			set_wqe_64bit_value(wqe->wqe_words,
+					    NES_IWARP_SQ_FMR_WQE_VA_FBO_LOW_IDX,
+					    ib_wr->wr.fast_reg.iova_start);
+			set_wqe_32bit_value(wqe->wqe_words,
+					    NES_IWARP_SQ_FMR_WQE_LENGTH_LOW_IDX,
+					    ib_wr->wr.fast_reg.length);
+			set_wqe_32bit_value(wqe->wqe_words,
+					    NES_IWARP_SQ_FMR_WQE_LENGTH_HIGH_IDX, 0);
+			set_wqe_32bit_value(wqe->wqe_words,
+					    NES_IWARP_SQ_FMR_WQE_MR_STAG_IDX,
+					    ib_wr->wr.fast_reg.rkey);
+			/* Set page size: */
+			if (ib_wr->wr.fast_reg.page_shift == 12) {
+				wqe_misc |= NES_IWARP_SQ_FMR_WQE_PAGE_SIZE_4K;
+			} else if (ib_wr->wr.fast_reg.page_shift == 21) {
+				wqe_misc |= NES_IWARP_SQ_FMR_WQE_PAGE_SIZE_2M;
+			} else {
+				nes_debug(NES_DBG_IW_TX, "Invalid page shift,"
+					  " ib_wr=%u, max=1\n", ib_wr->num_sge);
+				err = -EINVAL;
+				break;
+			}
+			/* Set access_flags */
+			wqe_misc |= NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_LOCAL_READ;
+			if (flags & IB_ACCESS_LOCAL_WRITE)
+				wqe_misc |= NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_LOCAL_WRITE;
+
+			if (flags & IB_ACCESS_REMOTE_WRITE)
+				wqe_misc |= NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_REMOTE_WRITE;
+
+			if (flags & IB_ACCESS_REMOTE_READ)
+				wqe_misc |= NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_REMOTE_READ;
+
+			if (flags & IB_ACCESS_MW_BIND)
+				wqe_misc |= NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_WINDOW_BIND;
+
+			/* Fill in PBL info: */
+			if (ib_wr->wr.fast_reg.page_list_len >
+			    pnesfrpl->ibfrpl.max_page_list_len) {
+				nes_debug(NES_DBG_IW_TX, "Invalid page list length,"
+					  " ib_wr=%p, value=%u, max=%u\n",
+					  ib_wr, ib_wr->wr.fast_reg.page_list_len,
+					  pnesfrpl->ibfrpl.max_page_list_len);
+				err = -EINVAL;
+				break;
+			}
+
+			set_wqe_64bit_value(wqe->wqe_words,
+					    NES_IWARP_SQ_FMR_WQE_PBL_ADDR_LOW_IDX,
+					    pnesfrpl->nes_wqe_pbl.paddr);
+
+			set_wqe_32bit_value(wqe->wqe_words,
+					    NES_IWARP_SQ_FMR_WQE_PBL_LENGTH_IDX,
+					    ib_wr->wr.fast_reg.page_list_len * 8);
+
+			for (i = 0; i < ib_wr->wr.fast_reg.page_list_len; i++)
+				dst_page_list[i] = cpu_to_le64(src_page_list[i]);
+
+			nes_debug(NES_DBG_IW_TX, "SQ_FMR: iova_start: %llx, "
+				  "length: %d, rkey: %0x, pgl_paddr: %llx, "
+				  "page_list_len: %u, wqe_misc: %x\n",
+				  (unsigned long long) ib_wr->wr.fast_reg.iova_start,
+				  ib_wr->wr.fast_reg.length,
+				  ib_wr->wr.fast_reg.rkey,
+				  (unsigned long long) pnesfrpl->nes_wqe_pbl.paddr,
+				  ib_wr->wr.fast_reg.page_list_len,
+				  wqe_misc);
+			break;
+		}
+		default:
+			/* error */
+			err = -EINVAL;
+			break;
+		}
+
+		if (err)
+			break;
+
+		if ((ib_wr->send_flags & IB_SEND_SIGNALED) || nesqp->sig_all)
+			wqe_misc |= NES_IWARP_SQ_WQE_SIGNALED_COMPL;
+
+		wqe->wqe_words[NES_IWARP_SQ_WQE_MISC_IDX] = cpu_to_le32(wqe_misc);
+
+		ib_wr = ib_wr->next;
+		head++;
+		wqe_count++;
+		if (head >= qsize)
+			head = 0;
+
+	}
+
+	nesqp->hwqp.sq_head = head;
+	barrier();
+	while (wqe_count) {
+		counter = min(wqe_count, ((u32)255));
+		wqe_count -= counter;
+		nes_write32(nesdev->regs + NES_WQE_ALLOC,
+				(counter << 24) | 0x00800000 | nesqp->hwqp.qp_id);
+	}
+
+	spin_unlock_irqrestore(&nesqp->lock, flags);
+
+out:
+	if (err)
+		*bad_wr = ib_wr;
+	return err;
+}
+
+
+/**
+ * nes_post_recv
+ */
+static int nes_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *ib_wr,
+		struct ib_recv_wr **bad_wr)
+{
+	u64 u64temp;
+	unsigned long flags = 0;
+	struct nes_vnic *nesvnic = to_nesvnic(ibqp->device);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	struct nes_qp *nesqp = to_nesqp(ibqp);
+	struct nes_hw_qp_wqe *wqe;
+	int err = 0;
+	int sge_index;
+	u32 qsize = nesqp->hwqp.rq_size;
+	u32 head;
+	u32 wqe_count = 0;
+	u32 counter;
+	u32 total_payload_length;
+
+	if (nesqp->ibqp_state > IB_QPS_RTS) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	spin_lock_irqsave(&nesqp->lock, flags);
+
+	head = nesqp->hwqp.rq_head;
+
+	while (ib_wr) {
+		/* Check for QP error */
+		if (nesqp->term_flags) {
+			err = -EINVAL;
+			break;
+		}
+
+		if (ib_wr->num_sge > nesdev->nesadapter->max_sge) {
+			err = -EINVAL;
+			break;
+		}
+		/* Check for RQ overflow */
+		if (((head + (2 * qsize) - nesqp->hwqp.rq_tail) % qsize) == (qsize - 1)) {
+			err = -ENOMEM;
+			break;
+		}
+
+		nes_debug(NES_DBG_IW_RX, "ibwr sge count = %u.\n", ib_wr->num_sge);
+		wqe = &nesqp->hwqp.rq_vbase[head];
+
+		/* nes_debug(NES_DBG_IW_RX, "QP%u:processing rq wqe at %p, head = %u.\n",
+				nesqp->hwqp.qp_id, wqe, head); */
+		nes_fill_init_qp_wqe(wqe, nesqp, head);
+		u64temp = (u64)(ib_wr->wr_id);
+		set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_COMP_SCRATCH_LOW_IDX,
+					u64temp);
+		total_payload_length = 0;
+		for (sge_index=0; sge_index < ib_wr->num_sge; sge_index++) {
+			set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_RQ_WQE_FRAG0_LOW_IDX+(sge_index*4),
+					ib_wr->sg_list[sge_index].addr);
+			set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_RQ_WQE_LENGTH0_IDX+(sge_index*4),
+					ib_wr->sg_list[sge_index].length);
+			set_wqe_32bit_value(wqe->wqe_words,NES_IWARP_RQ_WQE_STAG0_IDX+(sge_index*4),
+					ib_wr->sg_list[sge_index].lkey);
+
+			total_payload_length += ib_wr->sg_list[sge_index].length;
+		}
+		set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_RQ_WQE_TOTAL_PAYLOAD_IDX,
+					total_payload_length);
+
+		ib_wr = ib_wr->next;
+		head++;
+		wqe_count++;
+		if (head >= qsize)
+			head = 0;
+	}
+
+	nesqp->hwqp.rq_head = head;
+	barrier();
+	while (wqe_count) {
+		counter = min(wqe_count, ((u32)255));
+		wqe_count -= counter;
+		nes_write32(nesdev->regs+NES_WQE_ALLOC, (counter<<24) | nesqp->hwqp.qp_id);
+	}
+
+	spin_unlock_irqrestore(&nesqp->lock, flags);
+
+out:
+	if (err)
+		*bad_wr = ib_wr;
+	return err;
+}
+
+
+/**
+ * nes_poll_cq
+ */
+static int nes_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry)
+{
+	u64 u64temp;
+	u64 wrid;
+	unsigned long flags = 0;
+	struct nes_vnic *nesvnic = to_nesvnic(ibcq->device);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	struct nes_cq *nescq = to_nescq(ibcq);
+	struct nes_qp *nesqp;
+	struct nes_hw_cqe cqe;
+	u32 head;
+	u32 wq_tail = 0;
+	u32 cq_size;
+	u32 cqe_count = 0;
+	u32 wqe_index;
+	u32 u32temp;
+	u32 move_cq_head = 1;
+	u32 err_code;
+
+	nes_debug(NES_DBG_CQ, "\n");
+
+	spin_lock_irqsave(&nescq->lock, flags);
+
+	head = nescq->hw_cq.cq_head;
+	cq_size = nescq->hw_cq.cq_size;
+
+	while (cqe_count < num_entries) {
+		if ((le32_to_cpu(nescq->hw_cq.cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX]) &
+				NES_CQE_VALID) == 0)
+			break;
+
+		/*
+		 * Make sure we read CQ entry contents *after*
+		 * we've checked the valid bit.
+		 */
+		rmb();
+
+		cqe = nescq->hw_cq.cq_vbase[head];
+		u32temp = le32_to_cpu(cqe.cqe_words[NES_CQE_COMP_COMP_CTX_LOW_IDX]);
+		wqe_index = u32temp & (nesdev->nesadapter->max_qp_wr - 1);
+		u32temp &= ~(NES_SW_CONTEXT_ALIGN-1);
+		/* parse CQE, get completion context from WQE (either rq or sq) */
+		u64temp = (((u64)(le32_to_cpu(cqe.cqe_words[NES_CQE_COMP_COMP_CTX_HIGH_IDX])))<<32) |
+				((u64)u32temp);
+
+		if (u64temp) {
+			nesqp = (struct nes_qp *)(unsigned long)u64temp;
+			memset(entry, 0, sizeof *entry);
+			if (cqe.cqe_words[NES_CQE_ERROR_CODE_IDX] == 0) {
+				entry->status = IB_WC_SUCCESS;
+			} else {
+				err_code = le32_to_cpu(cqe.cqe_words[NES_CQE_ERROR_CODE_IDX]);
+				if (NES_IWARP_CQE_MAJOR_DRV == (err_code >> 16)) {
+					entry->status = err_code & 0x0000ffff;
+
+					/* The rest of the cqe's will be marked as flushed */
+					nescq->hw_cq.cq_vbase[head].cqe_words[NES_CQE_ERROR_CODE_IDX] =
+						cpu_to_le32((NES_IWARP_CQE_MAJOR_FLUSH << 16) |
+							    NES_IWARP_CQE_MINOR_FLUSH);
+				} else
+					entry->status = IB_WC_WR_FLUSH_ERR;
+			}
+
+			entry->qp = &nesqp->ibqp;
+			entry->src_qp = nesqp->hwqp.qp_id;
+
+			if (le32_to_cpu(cqe.cqe_words[NES_CQE_OPCODE_IDX]) & NES_CQE_SQ) {
+				if (nesqp->skip_lsmm) {
+					nesqp->skip_lsmm = 0;
+					nesqp->hwqp.sq_tail++;
+				}
+
+				/* Working on a SQ Completion*/
+				wrid = (((u64)(cpu_to_le32((u32)nesqp->hwqp.sq_vbase[wqe_index].
+						wqe_words[NES_IWARP_SQ_WQE_COMP_SCRATCH_HIGH_IDX]))) << 32) |
+						((u64)(cpu_to_le32((u32)nesqp->hwqp.sq_vbase[wqe_index].
+						wqe_words[NES_IWARP_SQ_WQE_COMP_SCRATCH_LOW_IDX])));
+				entry->byte_len = le32_to_cpu(nesqp->hwqp.sq_vbase[wqe_index].
+						wqe_words[NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX]);
+
+				switch (le32_to_cpu(nesqp->hwqp.sq_vbase[wqe_index].
+						wqe_words[NES_IWARP_SQ_WQE_MISC_IDX]) & 0x3f) {
+					case NES_IWARP_SQ_OP_RDMAW:
+						nes_debug(NES_DBG_CQ, "Operation = RDMA WRITE.\n");
+						entry->opcode = IB_WC_RDMA_WRITE;
+						break;
+					case NES_IWARP_SQ_OP_RDMAR:
+						nes_debug(NES_DBG_CQ, "Operation = RDMA READ.\n");
+						entry->opcode = IB_WC_RDMA_READ;
+						entry->byte_len = le32_to_cpu(nesqp->hwqp.sq_vbase[wqe_index].
+								wqe_words[NES_IWARP_SQ_WQE_RDMA_LENGTH_IDX]);
+						break;
+					case NES_IWARP_SQ_OP_SENDINV:
+					case NES_IWARP_SQ_OP_SENDSEINV:
+					case NES_IWARP_SQ_OP_SEND:
+					case NES_IWARP_SQ_OP_SENDSE:
+						nes_debug(NES_DBG_CQ, "Operation = Send.\n");
+						entry->opcode = IB_WC_SEND;
+						break;
+					case NES_IWARP_SQ_OP_LOCINV:
+						entry->opcode = IB_WC_LOCAL_INV;
+						break;
+					case NES_IWARP_SQ_OP_FAST_REG:
+						entry->opcode = IB_WC_FAST_REG_MR;
+						break;
+				}
+
+				nesqp->hwqp.sq_tail = (wqe_index+1)&(nesqp->hwqp.sq_size - 1);
+				if ((entry->status != IB_WC_SUCCESS) && (nesqp->hwqp.sq_tail != nesqp->hwqp.sq_head)) {
+					move_cq_head = 0;
+					wq_tail = nesqp->hwqp.sq_tail;
+				}
+			} else {
+				/* Working on a RQ Completion*/
+				entry->byte_len = le32_to_cpu(cqe.cqe_words[NES_CQE_PAYLOAD_LENGTH_IDX]);
+				wrid = ((u64)(le32_to_cpu(nesqp->hwqp.rq_vbase[wqe_index].wqe_words[NES_IWARP_RQ_WQE_COMP_SCRATCH_LOW_IDX]))) |
+					((u64)(le32_to_cpu(nesqp->hwqp.rq_vbase[wqe_index].wqe_words[NES_IWARP_RQ_WQE_COMP_SCRATCH_HIGH_IDX]))<<32);
+					entry->opcode = IB_WC_RECV;
+
+				nesqp->hwqp.rq_tail = (wqe_index+1)&(nesqp->hwqp.rq_size - 1);
+				if ((entry->status != IB_WC_SUCCESS) && (nesqp->hwqp.rq_tail != nesqp->hwqp.rq_head)) {
+					move_cq_head = 0;
+					wq_tail = nesqp->hwqp.rq_tail;
+				}
+			}
+
+			entry->wr_id = wrid;
+			entry++;
+			cqe_count++;
+		}
+
+		if (move_cq_head) {
+			nescq->hw_cq.cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX] = 0;
+			if (++head >= cq_size)
+				head = 0;
+			nescq->polled_completions++;
+
+			if ((nescq->polled_completions > (cq_size / 2)) ||
+					(nescq->polled_completions == 255)) {
+				nes_debug(NES_DBG_CQ, "CQ%u Issuing CQE Allocate since more than half of cqes"
+					" are pending %u of %u.\n",
+					nescq->hw_cq.cq_number, nescq->polled_completions, cq_size);
+				nes_write32(nesdev->regs+NES_CQE_ALLOC,
+					nescq->hw_cq.cq_number | (nescq->polled_completions << 16));
+				nescq->polled_completions = 0;
+			}
+		} else {
+			/* Update the wqe index and set status to flush */
+			wqe_index = le32_to_cpu(cqe.cqe_words[NES_CQE_COMP_COMP_CTX_LOW_IDX]);
+			wqe_index = (wqe_index & (~(nesdev->nesadapter->max_qp_wr - 1))) | wq_tail;
+			nescq->hw_cq.cq_vbase[head].cqe_words[NES_CQE_COMP_COMP_CTX_LOW_IDX] =
+				cpu_to_le32(wqe_index);
+			move_cq_head = 1; /* ready for next pass */
+		}
+	}
+
+	if (nescq->polled_completions) {
+		nes_write32(nesdev->regs+NES_CQE_ALLOC,
+				nescq->hw_cq.cq_number | (nescq->polled_completions << 16));
+		nescq->polled_completions = 0;
+	}
+
+	nescq->hw_cq.cq_head = head;
+	nes_debug(NES_DBG_CQ, "Reporting %u completions for CQ%u.\n",
+			cqe_count, nescq->hw_cq.cq_number);
+
+	spin_unlock_irqrestore(&nescq->lock, flags);
+
+	return cqe_count;
+}
+
+
+/**
+ * nes_req_notify_cq
+ */
+static int nes_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags)
+		{
+	struct nes_vnic *nesvnic = to_nesvnic(ibcq->device);
+	struct nes_device *nesdev = nesvnic->nesdev;
+	struct nes_cq *nescq = to_nescq(ibcq);
+	u32 cq_arm;
+
+	nes_debug(NES_DBG_CQ, "Requesting notification for CQ%u.\n",
+			nescq->hw_cq.cq_number);
+
+	cq_arm = nescq->hw_cq.cq_number;
+	if ((notify_flags & IB_CQ_SOLICITED_MASK) == IB_CQ_NEXT_COMP)
+		cq_arm |= NES_CQE_ALLOC_NOTIFY_NEXT;
+	else if ((notify_flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED)
+		cq_arm |= NES_CQE_ALLOC_NOTIFY_SE;
+	else
+		return -EINVAL;
+
+	nes_write32(nesdev->regs+NES_CQE_ALLOC, cq_arm);
+	nes_read32(nesdev->regs+NES_CQE_ALLOC);
+
+	return 0;
+}
+
+
+/**
+ * nes_init_ofa_device
+ */
+struct nes_ib_device *nes_init_ofa_device(struct net_device *netdev)
+{
+	struct nes_ib_device *nesibdev;
+	struct nes_vnic *nesvnic = netdev_priv(netdev);
+	struct nes_device *nesdev = nesvnic->nesdev;
+
+	nesibdev = (struct nes_ib_device *)ib_alloc_device(sizeof(struct nes_ib_device));
+	if (nesibdev == NULL) {
+		return NULL;
+	}
+	strlcpy(nesibdev->ibdev.name, "nes%d", IB_DEVICE_NAME_MAX);
+	nesibdev->ibdev.owner = THIS_MODULE;
+
+	nesibdev->ibdev.node_type = RDMA_NODE_RNIC;
+	memset(&nesibdev->ibdev.node_guid, 0, sizeof(nesibdev->ibdev.node_guid));
+	memcpy(&nesibdev->ibdev.node_guid, netdev->dev_addr, 6);
+
+	nesibdev->ibdev.uverbs_cmd_mask =
+			(1ull << IB_USER_VERBS_CMD_GET_CONTEXT) |
+			(1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) |
+			(1ull << IB_USER_VERBS_CMD_QUERY_PORT) |
+			(1ull << IB_USER_VERBS_CMD_ALLOC_PD) |
+			(1ull << IB_USER_VERBS_CMD_DEALLOC_PD) |
+			(1ull << IB_USER_VERBS_CMD_REG_MR) |
+			(1ull << IB_USER_VERBS_CMD_DEREG_MR) |
+			(1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
+			(1ull << IB_USER_VERBS_CMD_CREATE_CQ) |
+			(1ull << IB_USER_VERBS_CMD_DESTROY_CQ) |
+			(1ull << IB_USER_VERBS_CMD_CREATE_AH) |
+			(1ull << IB_USER_VERBS_CMD_DESTROY_AH) |
+			(1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) |
+			(1ull << IB_USER_VERBS_CMD_CREATE_QP) |
+			(1ull << IB_USER_VERBS_CMD_MODIFY_QP) |
+			(1ull << IB_USER_VERBS_CMD_POLL_CQ) |
+			(1ull << IB_USER_VERBS_CMD_DESTROY_QP) |
+			(1ull << IB_USER_VERBS_CMD_ALLOC_MW) |
+			(1ull << IB_USER_VERBS_CMD_BIND_MW) |
+			(1ull << IB_USER_VERBS_CMD_DEALLOC_MW) |
+			(1ull << IB_USER_VERBS_CMD_POST_RECV) |
+			(1ull << IB_USER_VERBS_CMD_POST_SEND);
+
+	nesibdev->ibdev.phys_port_cnt = 1;
+	nesibdev->ibdev.num_comp_vectors = 1;
+	nesibdev->ibdev.dma_device = &nesdev->pcidev->dev;
+	nesibdev->ibdev.dev.parent = &nesdev->pcidev->dev;
+	nesibdev->ibdev.query_device = nes_query_device;
+	nesibdev->ibdev.query_port = nes_query_port;
+	nesibdev->ibdev.query_pkey = nes_query_pkey;
+	nesibdev->ibdev.query_gid = nes_query_gid;
+	nesibdev->ibdev.alloc_ucontext = nes_alloc_ucontext;
+	nesibdev->ibdev.dealloc_ucontext = nes_dealloc_ucontext;
+	nesibdev->ibdev.mmap = nes_mmap;
+	nesibdev->ibdev.alloc_pd = nes_alloc_pd;
+	nesibdev->ibdev.dealloc_pd = nes_dealloc_pd;
+	nesibdev->ibdev.create_ah = nes_create_ah;
+	nesibdev->ibdev.destroy_ah = nes_destroy_ah;
+	nesibdev->ibdev.create_qp = nes_create_qp;
+	nesibdev->ibdev.modify_qp = nes_modify_qp;
+	nesibdev->ibdev.query_qp = nes_query_qp;
+	nesibdev->ibdev.destroy_qp = nes_destroy_qp;
+	nesibdev->ibdev.create_cq = nes_create_cq;
+	nesibdev->ibdev.destroy_cq = nes_destroy_cq;
+	nesibdev->ibdev.poll_cq = nes_poll_cq;
+	nesibdev->ibdev.get_dma_mr = nes_get_dma_mr;
+	nesibdev->ibdev.reg_phys_mr = nes_reg_phys_mr;
+	nesibdev->ibdev.reg_user_mr = nes_reg_user_mr;
+	nesibdev->ibdev.dereg_mr = nes_dereg_mr;
+	nesibdev->ibdev.alloc_mw = nes_alloc_mw;
+	nesibdev->ibdev.dealloc_mw = nes_dealloc_mw;
+	nesibdev->ibdev.bind_mw = nes_bind_mw;
+
+	nesibdev->ibdev.alloc_fast_reg_mr = nes_alloc_fast_reg_mr;
+	nesibdev->ibdev.alloc_fast_reg_page_list = nes_alloc_fast_reg_page_list;
+	nesibdev->ibdev.free_fast_reg_page_list = nes_free_fast_reg_page_list;
+
+	nesibdev->ibdev.attach_mcast = nes_multicast_attach;
+	nesibdev->ibdev.detach_mcast = nes_multicast_detach;
+	nesibdev->ibdev.process_mad = nes_process_mad;
+
+	nesibdev->ibdev.req_notify_cq = nes_req_notify_cq;
+	nesibdev->ibdev.post_send = nes_post_send;
+	nesibdev->ibdev.post_recv = nes_post_recv;
+
+	nesibdev->ibdev.iwcm = kzalloc(sizeof(*nesibdev->ibdev.iwcm), GFP_KERNEL);
+	if (nesibdev->ibdev.iwcm == NULL) {
+		ib_dealloc_device(&nesibdev->ibdev);
+		return NULL;
+	}
+	nesibdev->ibdev.iwcm->add_ref = nes_add_ref;
+	nesibdev->ibdev.iwcm->rem_ref = nes_rem_ref;
+	nesibdev->ibdev.iwcm->get_qp = nes_get_qp;
+	nesibdev->ibdev.iwcm->connect = nes_connect;
+	nesibdev->ibdev.iwcm->accept = nes_accept;
+	nesibdev->ibdev.iwcm->reject = nes_reject;
+	nesibdev->ibdev.iwcm->create_listen = nes_create_listen;
+	nesibdev->ibdev.iwcm->destroy_listen = nes_destroy_listen;
+
+	return nesibdev;
+}
+
+
+/**
+ * nes_handle_delayed_event
+ */
+static void nes_handle_delayed_event(unsigned long data)
+{
+	struct nes_vnic *nesvnic = (void *) data;
+
+	if (nesvnic->delayed_event != nesvnic->last_dispatched_event) {
+		struct ib_event event;
+
+		event.device = &nesvnic->nesibdev->ibdev;
+		if (!event.device)
+			goto stop_timer;
+		event.event = nesvnic->delayed_event;
+		event.element.port_num = nesvnic->logical_port + 1;
+		ib_dispatch_event(&event);
+	}
+
+stop_timer:
+	nesvnic->event_timer.function = NULL;
+}
+
+
+void  nes_port_ibevent(struct nes_vnic *nesvnic)
+{
+	struct nes_ib_device *nesibdev = nesvnic->nesibdev;
+	struct nes_device *nesdev = nesvnic->nesdev;
+	struct ib_event event;
+	event.device = &nesibdev->ibdev;
+	event.element.port_num = nesvnic->logical_port + 1;
+	event.event = nesdev->iw_status ? IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR;
+
+	if (!nesvnic->event_timer.function) {
+		ib_dispatch_event(&event);
+		nesvnic->last_dispatched_event = event.event;
+		nesvnic->event_timer.function = nes_handle_delayed_event;
+		nesvnic->event_timer.data = (unsigned long) nesvnic;
+		nesvnic->event_timer.expires = jiffies + NES_EVENT_DELAY;
+		add_timer(&nesvnic->event_timer);
+	} else {
+		mod_timer(&nesvnic->event_timer, jiffies + NES_EVENT_DELAY);
+	}
+	nesvnic->delayed_event = event.event;
+}
+
+
+/**
+ * nes_destroy_ofa_device
+ */
+void nes_destroy_ofa_device(struct nes_ib_device *nesibdev)
+{
+	if (nesibdev == NULL)
+		return;
+
+	nes_unregister_ofa_device(nesibdev);
+
+	kfree(nesibdev->ibdev.iwcm);
+	ib_dealloc_device(&nesibdev->ibdev);
+}
+
+
+/**
+ * nes_register_ofa_device
+ */
+int nes_register_ofa_device(struct nes_ib_device *nesibdev)
+{
+	struct nes_vnic *nesvnic = nesibdev->nesvnic;
+	struct nes_device *nesdev = nesvnic->nesdev;
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	int i, ret;
+
+	ret = ib_register_device(&nesvnic->nesibdev->ibdev, NULL);
+	if (ret) {
+		return ret;
+	}
+
+	/* Get the resources allocated to this device */
+	nesibdev->max_cq = (nesadapter->max_cq-NES_FIRST_QPN) / nesadapter->port_count;
+	nesibdev->max_mr = nesadapter->max_mr / nesadapter->port_count;
+	nesibdev->max_qp = (nesadapter->max_qp-NES_FIRST_QPN) / nesadapter->port_count;
+	nesibdev->max_pd = nesadapter->max_pd / nesadapter->port_count;
+
+	for (i = 0; i < ARRAY_SIZE(nes_dev_attributes); ++i) {
+		ret = device_create_file(&nesibdev->ibdev.dev, nes_dev_attributes[i]);
+		if (ret) {
+			while (i > 0) {
+				i--;
+				device_remove_file(&nesibdev->ibdev.dev,
+						   nes_dev_attributes[i]);
+			}
+			ib_unregister_device(&nesibdev->ibdev);
+			return ret;
+		}
+	}
+
+	nesvnic->of_device_registered = 1;
+
+	return 0;
+}
+
+
+/**
+ * nes_unregister_ofa_device
+ */
+static void nes_unregister_ofa_device(struct nes_ib_device *nesibdev)
+{
+	struct nes_vnic *nesvnic = nesibdev->nesvnic;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(nes_dev_attributes); ++i) {
+		device_remove_file(&nesibdev->ibdev.dev, nes_dev_attributes[i]);
+	}
+
+	if (nesvnic->of_device_registered) {
+		ib_unregister_device(&nesibdev->ibdev);
+	}
+
+	nesvnic->of_device_registered = 0;
+}
diff --git a/drivers/infiniband/hw/nes/nes_verbs.h b/drivers/infiniband/hw/nes/nes_verbs.h
new file mode 100644
index 0000000..309b31c
--- /dev/null
+++ b/drivers/infiniband/hw/nes/nes_verbs.h
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2006 - 2011 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef NES_VERBS_H
+#define NES_VERBS_H
+
+struct nes_device;
+
+#define NES_MAX_USER_DB_REGIONS  4096
+#define NES_MAX_USER_WQ_REGIONS  4096
+
+#define NES_TERM_SENT            0x01
+#define NES_TERM_RCVD            0x02
+#define NES_TERM_DONE            0x04
+
+struct nes_ucontext {
+	struct ib_ucontext ibucontext;
+	struct nes_device  *nesdev;
+	unsigned long      mmap_wq_offset;
+	unsigned long      mmap_cq_offset; /* to be removed */
+	int                index;		/* rnic index (minor) */
+	unsigned long      allocated_doorbells[BITS_TO_LONGS(NES_MAX_USER_DB_REGIONS)];
+	u16                mmap_db_index[NES_MAX_USER_DB_REGIONS];
+	u16                first_free_db;
+	unsigned long      allocated_wqs[BITS_TO_LONGS(NES_MAX_USER_WQ_REGIONS)];
+	struct nes_qp      *mmap_nesqp[NES_MAX_USER_WQ_REGIONS];
+	u16                first_free_wq;
+	struct list_head   cq_reg_mem_list;
+	struct list_head   qp_reg_mem_list;
+	u32                mcrqf;
+	atomic_t	   usecnt;
+};
+
+struct nes_pd {
+	struct ib_pd ibpd;
+	u16          pd_id;
+	atomic_t     sqp_count;
+	u16          mmap_db_index;
+};
+
+struct nes_mr {
+	union {
+		struct ib_mr  ibmr;
+		struct ib_mw  ibmw;
+		struct ib_fmr ibfmr;
+	};
+	struct ib_umem    *region;
+	u16               pbls_used;
+	u8                mode;
+	u8                pbl_4k;
+};
+
+struct nes_hw_pb {
+	__le32 pa_low;
+	__le32 pa_high;
+};
+
+struct nes_vpbl {
+	dma_addr_t       pbl_pbase;
+	struct nes_hw_pb *pbl_vbase;
+};
+
+struct nes_root_vpbl {
+	dma_addr_t       pbl_pbase;
+	struct nes_hw_pb *pbl_vbase;
+	struct nes_vpbl  *leaf_vpbl;
+};
+
+struct nes_fmr {
+	struct nes_mr        nesmr;
+	u32                  leaf_pbl_cnt;
+	struct nes_root_vpbl root_vpbl;
+	struct ib_qp         *ib_qp;
+	int                  access_rights;
+	struct ib_fmr_attr   attr;
+};
+
+struct nes_av;
+
+struct nes_cq {
+	struct ib_cq     ibcq;
+	struct nes_hw_cq hw_cq;
+	u32              polled_completions;
+	u32              cq_mem_size;
+	spinlock_t       lock;
+	u8               virtual_cq;
+	u8               pad[3];
+	u32		 mcrqf;
+};
+
+struct nes_wq {
+	spinlock_t lock;
+};
+
+struct disconn_work {
+	struct work_struct    work;
+	struct nes_qp         *nesqp;
+};
+
+struct iw_cm_id;
+struct ietf_mpa_frame;
+
+struct nes_qp {
+	struct ib_qp          ibqp;
+	void                  *allocated_buffer;
+	struct iw_cm_id       *cm_id;
+	struct nes_cq         *nesscq;
+	struct nes_cq         *nesrcq;
+	struct nes_pd         *nespd;
+	void *cm_node; /* handle of the node this QP is associated with */
+	void                  *ietf_frame;
+	u8                    ietf_frame_size;
+	dma_addr_t            ietf_frame_pbase;
+	struct ib_mr          *lsmm_mr;
+	struct nes_hw_qp      hwqp;
+	struct work_struct    work;
+	enum ib_qp_state      ibqp_state;
+	u32                   iwarp_state;
+	u32                   hte_index;
+	u32                   last_aeq;
+	u32                   qp_mem_size;
+	atomic_t              refcount;
+	atomic_t              close_timer_started;
+	u32                   mmap_sq_db_index;
+	u32                   mmap_rq_db_index;
+	spinlock_t            lock;
+	spinlock_t            pau_lock;
+	struct nes_qp_context *nesqp_context;
+	dma_addr_t            nesqp_context_pbase;
+	void	              *pbl_vbase;
+	dma_addr_t            pbl_pbase;
+	struct page           *page;
+	struct timer_list     terminate_timer;
+	enum ib_event_type    terminate_eventtype;
+	struct sk_buff_head   pau_list;
+	u32                   pau_rcv_nxt;
+	u16                   active_conn:1;
+	u16                   skip_lsmm:1;
+	u16                   user_mode:1;
+	u16                   hte_added:1;
+	u16                   flush_issued:1;
+	u16                   destroyed:1;
+	u16                   sig_all:1;
+	u16                   pau_mode:1;
+	u16                   rsvd:8;
+	u16                   private_data_len;
+	u16                   term_sq_flush_code;
+	u16                   term_rq_flush_code;
+	u8                    hw_iwarp_state;
+	u8                    hw_tcp_state;
+	u8                    term_flags;
+	u8                    sq_kmapped;
+	u8                    pau_busy;
+	u8                    pau_pending;
+	u8                    pau_state;
+	__u64                 nesuqp_addr;
+};
+#endif			/* NES_VERBS_H */
diff --git a/drivers/infiniband/hw/ocrdma/Kconfig b/drivers/infiniband/hw/ocrdma/Kconfig
new file mode 100644
index 0000000..c0cddc0
--- /dev/null
+++ b/drivers/infiniband/hw/ocrdma/Kconfig
@@ -0,0 +1,8 @@
+config INFINIBAND_OCRDMA
+	tristate "Emulex One Connect HCA support"
+	depends on ETHERNET && NETDEVICES && PCI && INET && (IPV6 || IPV6=n)
+	select NET_VENDOR_EMULEX
+	select BE2NET
+	---help---
+	  This driver provides low-level InfiniBand over Ethernet
+	  support for Emulex One Connect host channel adapters (HCAs).
diff --git a/drivers/infiniband/hw/ocrdma/Makefile b/drivers/infiniband/hw/ocrdma/Makefile
new file mode 100644
index 0000000..d1bfd4f
--- /dev/null
+++ b/drivers/infiniband/hw/ocrdma/Makefile
@@ -0,0 +1,5 @@
+ccflags-y := -Idrivers/net/ethernet/emulex/benet
+
+obj-$(CONFIG_INFINIBAND_OCRDMA)	+= ocrdma.o
+
+ocrdma-y :=	ocrdma_main.o ocrdma_verbs.o ocrdma_hw.o ocrdma_ah.o ocrdma_stats.o
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma.h b/drivers/infiniband/hw/ocrdma/ocrdma.h
new file mode 100644
index 0000000..b43456a
--- /dev/null
+++ b/drivers/infiniband/hw/ocrdma/ocrdma.h
@@ -0,0 +1,543 @@
+/*******************************************************************
+ * This file is part of the Emulex RoCE Device Driver for          *
+ * RoCE (RDMA over Converged Ethernet) adapters.                   *
+ * Copyright (C) 2008-2012 Emulex. All rights reserved.            *
+ * EMULEX and SLI are trademarks of Emulex.                        *
+ * www.emulex.com                                                  *
+ *                                                                 *
+ * This program is free software; you can redistribute it and/or   *
+ * modify it under the terms of version 2 of the GNU General       *
+ * Public License as published by the Free Software Foundation.    *
+ * This program is distributed in the hope that it will be useful. *
+ * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND          *
+ * WARRANTIES, INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY,  *
+ * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE      *
+ * DISCLAIMED, EXCEPT TO THE EXTENT THAT SUCH DISCLAIMERS ARE HELD *
+ * TO BE LEGALLY INVALID.  See the GNU General Public License for  *
+ * more details, a copy of which can be found in the file COPYING  *
+ * included with this package.                                     *
+ *
+ * Contact Information:
+ * linux-drivers@emulex.com
+ *
+ * Emulex
+ * 3333 Susan Street
+ * Costa Mesa, CA 92626
+ *******************************************************************/
+
+#ifndef __OCRDMA_H__
+#define __OCRDMA_H__
+
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/pci.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_user_verbs.h>
+#include <rdma/ib_addr.h>
+
+#include <be_roce.h>
+#include "ocrdma_sli.h"
+
+#define OCRDMA_ROCE_DRV_VERSION "10.2.287.0u"
+
+#define OCRDMA_ROCE_DRV_DESC "Emulex OneConnect RoCE Driver"
+#define OCRDMA_NODE_DESC "Emulex OneConnect RoCE HCA"
+
+#define OC_NAME_SH	OCRDMA_NODE_DESC "(Skyhawk)"
+#define OC_NAME_UNKNOWN OCRDMA_NODE_DESC "(Unknown)"
+
+#define OC_SKH_DEVICE_PF 0x720
+#define OC_SKH_DEVICE_VF 0x728
+#define OCRDMA_MAX_AH 512
+
+#define OCRDMA_UVERBS(CMD_NAME) (1ull << IB_USER_VERBS_CMD_##CMD_NAME)
+
+#define convert_to_64bit(lo, hi) ((u64)hi << 32 | (u64)lo)
+
+struct ocrdma_dev_attr {
+	u8 fw_ver[32];
+	u32 vendor_id;
+	u32 device_id;
+	u16 max_pd;
+	u16 max_cq;
+	u16 max_cqe;
+	u16 max_qp;
+	u16 max_wqe;
+	u16 max_rqe;
+	u16 max_srq;
+	u32 max_inline_data;
+	int max_send_sge;
+	int max_recv_sge;
+	int max_srq_sge;
+	int max_rdma_sge;
+	int max_mr;
+	u64 max_mr_size;
+	u32 max_num_mr_pbl;
+	int max_mw;
+	int max_fmr;
+	int max_map_per_fmr;
+	int max_pages_per_frmr;
+	u16 max_ord_per_qp;
+	u16 max_ird_per_qp;
+
+	int device_cap_flags;
+	u8 cq_overflow_detect;
+	u8 srq_supported;
+
+	u32 wqe_size;
+	u32 rqe_size;
+	u32 ird_page_size;
+	u8 local_ca_ack_delay;
+	u8 ird;
+	u8 num_ird_pages;
+};
+
+struct ocrdma_dma_mem {
+	void *va;
+	dma_addr_t pa;
+	u32 size;
+};
+
+struct ocrdma_pbl {
+	void *va;
+	dma_addr_t pa;
+};
+
+struct ocrdma_queue_info {
+	void *va;
+	dma_addr_t dma;
+	u32 size;
+	u16 len;
+	u16 entry_size;		/* Size of an element in the queue */
+	u16 id;			/* qid, where to ring the doorbell. */
+	u16 head, tail;
+	bool created;
+};
+
+struct ocrdma_eq {
+	struct ocrdma_queue_info q;
+	u32 vector;
+	int cq_cnt;
+	struct ocrdma_dev *dev;
+	char irq_name[32];
+};
+
+struct ocrdma_mq {
+	struct ocrdma_queue_info sq;
+	struct ocrdma_queue_info cq;
+	bool rearm_cq;
+};
+
+struct mqe_ctx {
+	struct mutex lock; /* for serializing mailbox commands on MQ */
+	wait_queue_head_t cmd_wait;
+	u32 tag;
+	u16 cqe_status;
+	u16 ext_status;
+	bool cmd_done;
+	bool fw_error_state;
+};
+
+struct ocrdma_hw_mr {
+	u32 lkey;
+	u8 fr_mr;
+	u8 remote_atomic;
+	u8 remote_rd;
+	u8 remote_wr;
+	u8 local_rd;
+	u8 local_wr;
+	u8 mw_bind;
+	u8 rsvd;
+	u64 len;
+	struct ocrdma_pbl *pbl_table;
+	u32 num_pbls;
+	u32 num_pbes;
+	u32 pbl_size;
+	u32 pbe_size;
+	u64 fbo;
+	u64 va;
+};
+
+struct ocrdma_mr {
+	struct ib_mr ibmr;
+	struct ib_umem *umem;
+	struct ocrdma_hw_mr hwmr;
+};
+
+struct ocrdma_stats {
+	u8 type;
+	struct ocrdma_dev *dev;
+};
+
+struct stats_mem {
+	struct ocrdma_mqe mqe;
+	void *va;
+	dma_addr_t pa;
+	u32 size;
+	char *debugfs_mem;
+};
+
+struct phy_info {
+	u16 auto_speeds_supported;
+	u16 fixed_speeds_supported;
+	u16 phy_type;
+	u16 interface_type;
+};
+
+struct ocrdma_dev {
+	struct ib_device ibdev;
+	struct ocrdma_dev_attr attr;
+
+	struct mutex dev_lock; /* provides syncronise access to device data */
+	spinlock_t flush_q_lock ____cacheline_aligned;
+
+	struct ocrdma_cq **cq_tbl;
+	struct ocrdma_qp **qp_tbl;
+
+	struct ocrdma_eq *eq_tbl;
+	int eq_cnt;
+	u16 base_eqid;
+	u16 max_eq;
+
+	union ib_gid *sgid_tbl;
+	/* provided synchronization to sgid table for
+	 * updating gid entries triggered by notifier.
+	 */
+	spinlock_t sgid_lock;
+
+	int gsi_qp_created;
+	struct ocrdma_cq *gsi_sqcq;
+	struct ocrdma_cq *gsi_rqcq;
+
+	struct {
+		struct ocrdma_av *va;
+		dma_addr_t pa;
+		u32 size;
+		u32 num_ah;
+		/* provide synchronization for av
+		 * entry allocations.
+		 */
+		spinlock_t lock;
+		u32 ahid;
+		struct ocrdma_pbl pbl;
+	} av_tbl;
+
+	void *mbx_cmd;
+	struct ocrdma_mq mq;
+	struct mqe_ctx mqe_ctx;
+
+	struct be_dev_info nic_info;
+	struct phy_info phy;
+	char model_number[32];
+	u32 hba_port_num;
+
+	struct list_head entry;
+	struct rcu_head rcu;
+	int id;
+	u64 *stag_arr;
+	u8 sl; /* service level */
+	bool pfc_state;
+	atomic_t update_sl;
+	u16 pvid;
+	u32 asic_id;
+
+	ulong last_stats_time;
+	struct mutex stats_lock; /* provide synch for debugfs operations */
+	struct stats_mem stats_mem;
+	struct ocrdma_stats rsrc_stats;
+	struct ocrdma_stats rx_stats;
+	struct ocrdma_stats wqe_stats;
+	struct ocrdma_stats tx_stats;
+	struct ocrdma_stats db_err_stats;
+	struct ocrdma_stats tx_qp_err_stats;
+	struct ocrdma_stats rx_qp_err_stats;
+	struct ocrdma_stats tx_dbg_stats;
+	struct ocrdma_stats rx_dbg_stats;
+	struct dentry *dir;
+};
+
+struct ocrdma_cq {
+	struct ib_cq ibcq;
+	struct ocrdma_cqe *va;
+	u32 phase;
+	u32 getp;	/* pointer to pending wrs to
+			 * return to stack, wrap arounds
+			 * at max_hw_cqe
+			 */
+	u32 max_hw_cqe;
+	bool phase_change;
+	bool deferred_arm, deferred_sol;
+	bool first_arm;
+
+	spinlock_t cq_lock ____cacheline_aligned; /* provide synchronization
+						   * to cq polling
+						   */
+	/* syncronizes cq completion handler invoked from multiple context */
+	spinlock_t comp_handler_lock ____cacheline_aligned;
+	u16 id;
+	u16 eqn;
+
+	struct ocrdma_ucontext *ucontext;
+	dma_addr_t pa;
+	u32 len;
+	u32 cqe_cnt;
+
+	/* head of all qp's sq and rq for which cqes need to be flushed
+	 * by the software.
+	 */
+	struct list_head sq_head, rq_head;
+};
+
+struct ocrdma_pd {
+	struct ib_pd ibpd;
+	struct ocrdma_ucontext *uctx;
+	u32 id;
+	int num_dpp_qp;
+	u32 dpp_page;
+	bool dpp_enabled;
+};
+
+struct ocrdma_ah {
+	struct ib_ah ibah;
+	struct ocrdma_av *av;
+	u16 sgid_index;
+	u32 id;
+};
+
+struct ocrdma_qp_hwq_info {
+	u8 *va;			/* virtual address */
+	u32 max_sges;
+	u32 head, tail;
+	u32 entry_size;
+	u32 max_cnt;
+	u32 max_wqe_idx;
+	u16 dbid;		/* qid, where to ring the doorbell. */
+	u32 len;
+	dma_addr_t pa;
+};
+
+struct ocrdma_srq {
+	struct ib_srq ibsrq;
+	u8 __iomem *db;
+	struct ocrdma_qp_hwq_info rq;
+	u64 *rqe_wr_id_tbl;
+	u32 *idx_bit_fields;
+	u32 bit_fields_len;
+
+	/* provide synchronization to multiple context(s) posting rqe */
+	spinlock_t q_lock ____cacheline_aligned;
+
+	struct ocrdma_pd *pd;
+	u32 id;
+};
+
+struct ocrdma_qp {
+	struct ib_qp ibqp;
+	struct ocrdma_dev *dev;
+
+	u8 __iomem *sq_db;
+	struct ocrdma_qp_hwq_info sq;
+	struct {
+		uint64_t wrid;
+		uint16_t dpp_wqe_idx;
+		uint16_t dpp_wqe;
+		uint8_t  signaled;
+		uint8_t  rsvd[3];
+	} *wqe_wr_id_tbl;
+	u32 max_inline_data;
+
+	/* provide synchronization to multiple context(s) posting wqe, rqe */
+	spinlock_t q_lock ____cacheline_aligned;
+	struct ocrdma_cq *sq_cq;
+	/* list maintained per CQ to flush SQ errors */
+	struct list_head sq_entry;
+
+	u8 __iomem *rq_db;
+	struct ocrdma_qp_hwq_info rq;
+	u64 *rqe_wr_id_tbl;
+	struct ocrdma_cq *rq_cq;
+	struct ocrdma_srq *srq;
+	/* list maintained per CQ to flush RQ errors */
+	struct list_head rq_entry;
+
+	enum ocrdma_qp_state state;	/*  QP state */
+	int cap_flags;
+	u32 max_ord, max_ird;
+
+	u32 id;
+	struct ocrdma_pd *pd;
+
+	enum ib_qp_type qp_type;
+
+	int sgid_idx;
+	u32 qkey;
+	bool dpp_enabled;
+	u8 *ird_q_va;
+	bool signaled;
+};
+
+struct ocrdma_ucontext {
+	struct ib_ucontext ibucontext;
+
+	struct list_head mm_head;
+	struct mutex mm_list_lock; /* protects list entries of mm type */
+	struct ocrdma_pd *cntxt_pd;
+	int pd_in_use;
+
+	struct {
+		u32 *va;
+		dma_addr_t pa;
+		u32 len;
+	} ah_tbl;
+};
+
+struct ocrdma_mm {
+	struct {
+		u64 phy_addr;
+		unsigned long len;
+	} key;
+	struct list_head entry;
+};
+
+static inline struct ocrdma_dev *get_ocrdma_dev(struct ib_device *ibdev)
+{
+	return container_of(ibdev, struct ocrdma_dev, ibdev);
+}
+
+static inline struct ocrdma_ucontext *get_ocrdma_ucontext(struct ib_ucontext
+							  *ibucontext)
+{
+	return container_of(ibucontext, struct ocrdma_ucontext, ibucontext);
+}
+
+static inline struct ocrdma_pd *get_ocrdma_pd(struct ib_pd *ibpd)
+{
+	return container_of(ibpd, struct ocrdma_pd, ibpd);
+}
+
+static inline struct ocrdma_cq *get_ocrdma_cq(struct ib_cq *ibcq)
+{
+	return container_of(ibcq, struct ocrdma_cq, ibcq);
+}
+
+static inline struct ocrdma_qp *get_ocrdma_qp(struct ib_qp *ibqp)
+{
+	return container_of(ibqp, struct ocrdma_qp, ibqp);
+}
+
+static inline struct ocrdma_mr *get_ocrdma_mr(struct ib_mr *ibmr)
+{
+	return container_of(ibmr, struct ocrdma_mr, ibmr);
+}
+
+static inline struct ocrdma_ah *get_ocrdma_ah(struct ib_ah *ibah)
+{
+	return container_of(ibah, struct ocrdma_ah, ibah);
+}
+
+static inline struct ocrdma_srq *get_ocrdma_srq(struct ib_srq *ibsrq)
+{
+	return container_of(ibsrq, struct ocrdma_srq, ibsrq);
+}
+
+static inline int is_cqe_valid(struct ocrdma_cq *cq, struct ocrdma_cqe *cqe)
+{
+	int cqe_valid;
+	cqe_valid = le32_to_cpu(cqe->flags_status_srcqpn) & OCRDMA_CQE_VALID;
+	return (cqe_valid == cq->phase);
+}
+
+static inline int is_cqe_for_sq(struct ocrdma_cqe *cqe)
+{
+	return (le32_to_cpu(cqe->flags_status_srcqpn) &
+		OCRDMA_CQE_QTYPE) ? 0 : 1;
+}
+
+static inline int is_cqe_invalidated(struct ocrdma_cqe *cqe)
+{
+	return (le32_to_cpu(cqe->flags_status_srcqpn) &
+		OCRDMA_CQE_INVALIDATE) ? 1 : 0;
+}
+
+static inline int is_cqe_imm(struct ocrdma_cqe *cqe)
+{
+	return (le32_to_cpu(cqe->flags_status_srcqpn) &
+		OCRDMA_CQE_IMM) ? 1 : 0;
+}
+
+static inline int is_cqe_wr_imm(struct ocrdma_cqe *cqe)
+{
+	return (le32_to_cpu(cqe->flags_status_srcqpn) &
+		OCRDMA_CQE_WRITE_IMM) ? 1 : 0;
+}
+
+static inline int ocrdma_resolve_dmac(struct ocrdma_dev *dev,
+		struct ib_ah_attr *ah_attr, u8 *mac_addr)
+{
+	struct in6_addr in6;
+
+	memcpy(&in6, ah_attr->grh.dgid.raw, sizeof(in6));
+	if (rdma_is_multicast_addr(&in6))
+		rdma_get_mcast_mac(&in6, mac_addr);
+	else
+		memcpy(mac_addr, ah_attr->dmac, ETH_ALEN);
+	return 0;
+}
+
+static inline char *hca_name(struct ocrdma_dev *dev)
+{
+	switch (dev->nic_info.pdev->device) {
+	case OC_SKH_DEVICE_PF:
+	case OC_SKH_DEVICE_VF:
+		return OC_NAME_SH;
+	default:
+		return OC_NAME_UNKNOWN;
+	}
+}
+
+static inline int ocrdma_get_eq_table_index(struct ocrdma_dev *dev,
+		int eqid)
+{
+	int indx;
+
+	for (indx = 0; indx < dev->eq_cnt; indx++) {
+		if (dev->eq_tbl[indx].q.id == eqid)
+			return indx;
+	}
+
+	return -EINVAL;
+}
+
+static inline u8 ocrdma_get_asic_type(struct ocrdma_dev *dev)
+{
+	if (dev->nic_info.dev_family == 0xF && !dev->asic_id) {
+		pci_read_config_dword(
+			dev->nic_info.pdev,
+			OCRDMA_SLI_ASIC_ID_OFFSET, &dev->asic_id);
+	}
+
+	return (dev->asic_id & OCRDMA_SLI_ASIC_GEN_NUM_MASK) >>
+				OCRDMA_SLI_ASIC_GEN_NUM_SHIFT;
+}
+
+static inline u8 ocrdma_get_pfc_prio(u8 *pfc, u8 prio)
+{
+	return *(pfc + prio);
+}
+
+static inline u8 ocrdma_get_app_prio(u8 *app_prio, u8 prio)
+{
+	return *(app_prio + prio);
+}
+
+static inline u8 ocrdma_is_enabled_and_synced(u32 state)
+{	/* May also be used to interpret TC-state, QCN-state
+	 * Appl-state and Logical-link-state in future.
+	 */
+	return (state & OCRDMA_STATE_FLAG_ENABLED) &&
+		(state & OCRDMA_STATE_FLAG_SYNC);
+}
+
+#endif
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_abi.h b/drivers/infiniband/hw/ocrdma/ocrdma_abi.h
new file mode 100644
index 0000000..1554cca
--- /dev/null
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_abi.h
@@ -0,0 +1,134 @@
+/*******************************************************************
+ * This file is part of the Emulex RoCE Device Driver for          *
+ * RoCE (RDMA over Converged Ethernet) adapters.                   *
+ * Copyright (C) 2008-2012 Emulex. All rights reserved.            *
+ * EMULEX and SLI are trademarks of Emulex.                        *
+ * www.emulex.com                                                  *
+ *                                                                 *
+ * This program is free software; you can redistribute it and/or   *
+ * modify it under the terms of version 2 of the GNU General       *
+ * Public License as published by the Free Software Foundation.    *
+ * This program is distributed in the hope that it will be useful. *
+ * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND          *
+ * WARRANTIES, INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY,  *
+ * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE      *
+ * DISCLAIMED, EXCEPT TO THE EXTENT THAT SUCH DISCLAIMERS ARE HELD *
+ * TO BE LEGALLY INVALID.  See the GNU General Public License for  *
+ * more details, a copy of which can be found in the file COPYING  *
+ * included with this package.                                     *
+ *
+ * Contact Information:
+ * linux-drivers@emulex.com
+ *
+ * Emulex
+ * 3333 Susan Street
+ * Costa Mesa, CA 92626
+ *******************************************************************/
+
+#ifndef __OCRDMA_ABI_H__
+#define __OCRDMA_ABI_H__
+
+#define OCRDMA_ABI_VERSION 2
+#define OCRDMA_BE_ROCE_ABI_VERSION 1
+/* user kernel communication data structures. */
+
+struct ocrdma_alloc_ucontext_resp {
+	u32 dev_id;
+	u32 wqe_size;
+	u32 max_inline_data;
+	u32 dpp_wqe_size;
+	u64 ah_tbl_page;
+	u32 ah_tbl_len;
+	u32 rqe_size;
+	u8 fw_ver[32];
+	/* for future use/new features in progress */
+	u64 rsvd1;
+	u64 rsvd2;
+};
+
+struct ocrdma_alloc_pd_ureq {
+	u64 rsvd1;
+};
+
+struct ocrdma_alloc_pd_uresp {
+	u32 id;
+	u32 dpp_enabled;
+	u32 dpp_page_addr_hi;
+	u32 dpp_page_addr_lo;
+	u64 rsvd1;
+};
+
+struct ocrdma_create_cq_ureq {
+	u32 dpp_cq;
+	u32 rsvd; /* pad */
+};
+
+#define MAX_CQ_PAGES 8
+struct ocrdma_create_cq_uresp {
+	u32 cq_id;
+	u32 page_size;
+	u32 num_pages;
+	u32 max_hw_cqe;
+	u64 page_addr[MAX_CQ_PAGES];
+	u64 db_page_addr;
+	u32 db_page_size;
+	u32 phase_change;
+	/* for future use/new features in progress */
+	u64 rsvd1;
+	u64 rsvd2;
+};
+
+#define MAX_QP_PAGES 8
+#define MAX_UD_AV_PAGES 8
+
+struct ocrdma_create_qp_ureq {
+	u8 enable_dpp_cq;
+	u8 rsvd;
+	u16 dpp_cq_id;
+	u32 rsvd1;	/* pad */
+};
+
+struct ocrdma_create_qp_uresp {
+	u16 qp_id;
+	u16 sq_dbid;
+	u16 rq_dbid;
+	u16 resv0;	/* pad */
+	u32 sq_page_size;
+	u32 rq_page_size;
+	u32 num_sq_pages;
+	u32 num_rq_pages;
+	u64 sq_page_addr[MAX_QP_PAGES];
+	u64 rq_page_addr[MAX_QP_PAGES];
+	u64 db_page_addr;
+	u32 db_page_size;
+	u32 dpp_credit;
+	u32 dpp_offset;
+	u32 num_wqe_allocated;
+	u32 num_rqe_allocated;
+	u32 db_sq_offset;
+	u32 db_rq_offset;
+	u32 db_shift;
+	u64 rsvd[11];
+} __packed;
+
+struct ocrdma_create_srq_uresp {
+	u16 rq_dbid;
+	u16 resv0;	/* pad */
+	u32 resv1;
+
+	u32 rq_page_size;
+	u32 num_rq_pages;
+
+	u64 rq_page_addr[MAX_QP_PAGES];
+	u64 db_page_addr;
+
+	u32 db_page_size;
+	u32 num_rqe_allocated;
+	u32 db_rq_offset;
+	u32 db_shift;
+
+	u64 rsvd2;
+	u64 rsvd3;
+};
+
+#endif				/* __OCRDMA_ABI_H__ */
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_ah.c b/drivers/infiniband/hw/ocrdma/ocrdma_ah.c
new file mode 100644
index 0000000..ac02ce4
--- /dev/null
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_ah.c
@@ -0,0 +1,198 @@
+/*******************************************************************
+ * This file is part of the Emulex RoCE Device Driver for          *
+ * RoCE (RDMA over Converged Ethernet) adapters.                   *
+ * Copyright (C) 2008-2012 Emulex. All rights reserved.            *
+ * EMULEX and SLI are trademarks of Emulex.                        *
+ * www.emulex.com                                                  *
+ *                                                                 *
+ * This program is free software; you can redistribute it and/or   *
+ * modify it under the terms of version 2 of the GNU General       *
+ * Public License as published by the Free Software Foundation.    *
+ * This program is distributed in the hope that it will be useful. *
+ * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND          *
+ * WARRANTIES, INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY,  *
+ * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE      *
+ * DISCLAIMED, EXCEPT TO THE EXTENT THAT SUCH DISCLAIMERS ARE HELD *
+ * TO BE LEGALLY INVALID.  See the GNU General Public License for  *
+ * more details, a copy of which can be found in the file COPYING  *
+ * included with this package.                                     *
+ *
+ * Contact Information:
+ * linux-drivers@emulex.com
+ *
+ * Emulex
+ * 3333 Susan Street
+ * Costa Mesa, CA 92626
+ *******************************************************************/
+
+#include <net/neighbour.h>
+#include <net/netevent.h>
+
+#include <rdma/ib_addr.h>
+
+#include "ocrdma.h"
+#include "ocrdma_verbs.h"
+#include "ocrdma_ah.h"
+#include "ocrdma_hw.h"
+
+#define OCRDMA_VID_PCP_SHIFT	0xD
+
+static inline int set_av_attr(struct ocrdma_dev *dev, struct ocrdma_ah *ah,
+			struct ib_ah_attr *attr, union ib_gid *sgid, int pdid)
+{
+	int status = 0;
+	u16 vlan_tag; bool vlan_enabled = false;
+	struct ocrdma_eth_vlan eth;
+	struct ocrdma_grh grh;
+	int eth_sz;
+
+	memset(&eth, 0, sizeof(eth));
+	memset(&grh, 0, sizeof(grh));
+
+	/* VLAN */
+	vlan_tag = attr->vlan_id;
+	if (!vlan_tag || (vlan_tag > 0xFFF))
+		vlan_tag = dev->pvid;
+	if (vlan_tag && (vlan_tag < 0x1000)) {
+		eth.eth_type = cpu_to_be16(0x8100);
+		eth.roce_eth_type = cpu_to_be16(OCRDMA_ROCE_ETH_TYPE);
+		vlan_tag |= (dev->sl & 0x07) << OCRDMA_VID_PCP_SHIFT;
+		eth.vlan_tag = cpu_to_be16(vlan_tag);
+		eth_sz = sizeof(struct ocrdma_eth_vlan);
+		vlan_enabled = true;
+	} else {
+		eth.eth_type = cpu_to_be16(OCRDMA_ROCE_ETH_TYPE);
+		eth_sz = sizeof(struct ocrdma_eth_basic);
+	}
+	/* MAC */
+	memcpy(&eth.smac[0], &dev->nic_info.mac_addr[0], ETH_ALEN);
+	status = ocrdma_resolve_dmac(dev, attr, &eth.dmac[0]);
+	if (status)
+		return status;
+	ah->sgid_index = attr->grh.sgid_index;
+	memcpy(&grh.sgid[0], sgid->raw, sizeof(union ib_gid));
+	memcpy(&grh.dgid[0], attr->grh.dgid.raw, sizeof(attr->grh.dgid.raw));
+
+	grh.tclass_flow = cpu_to_be32((6 << 28) |
+			(attr->grh.traffic_class << 24) |
+			attr->grh.flow_label);
+	/* 0x1b is next header value in GRH */
+	grh.pdid_hoplimit = cpu_to_be32((pdid << 16) |
+			(0x1b << 8) | attr->grh.hop_limit);
+	/* Eth HDR */
+	memcpy(&ah->av->eth_hdr, &eth, eth_sz);
+	memcpy((u8 *)ah->av + eth_sz, &grh, sizeof(struct ocrdma_grh));
+	if (vlan_enabled)
+		ah->av->valid |= OCRDMA_AV_VLAN_VALID;
+	ah->av->valid = cpu_to_le32(ah->av->valid);
+	return status;
+}
+
+struct ib_ah *ocrdma_create_ah(struct ib_pd *ibpd, struct ib_ah_attr *attr)
+{
+	u32 *ahid_addr;
+	int status;
+	struct ocrdma_ah *ah;
+	struct ocrdma_pd *pd = get_ocrdma_pd(ibpd);
+	struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device);
+	union ib_gid sgid;
+	u8 zmac[ETH_ALEN];
+
+	if (!(attr->ah_flags & IB_AH_GRH))
+		return ERR_PTR(-EINVAL);
+
+	if (atomic_cmpxchg(&dev->update_sl, 1, 0))
+		ocrdma_init_service_level(dev);
+	ah = kzalloc(sizeof(*ah), GFP_ATOMIC);
+	if (!ah)
+		return ERR_PTR(-ENOMEM);
+
+	status = ocrdma_alloc_av(dev, ah);
+	if (status)
+		goto av_err;
+
+	status = ocrdma_query_gid(&dev->ibdev, 1, attr->grh.sgid_index, &sgid);
+	if (status) {
+		pr_err("%s(): Failed to query sgid, status = %d\n",
+		      __func__, status);
+		goto av_conf_err;
+	}
+
+	memset(&zmac, 0, ETH_ALEN);
+	if (pd->uctx &&
+	    memcmp(attr->dmac, &zmac, ETH_ALEN)) {
+		status = rdma_addr_find_dmac_by_grh(&sgid, &attr->grh.dgid,
+                                        attr->dmac, &attr->vlan_id);
+		if (status) {
+			pr_err("%s(): Failed to resolve dmac from gid." 
+				"status = %d\n", __func__, status);
+			goto av_conf_err;
+		}
+	}
+
+	status = set_av_attr(dev, ah, attr, &sgid, pd->id);
+	if (status)
+		goto av_conf_err;
+
+	/* if pd is for the user process, pass the ah_id to user space */
+	if ((pd->uctx) && (pd->uctx->ah_tbl.va)) {
+		ahid_addr = pd->uctx->ah_tbl.va + attr->dlid;
+		*ahid_addr = ah->id;
+	}
+	return &ah->ibah;
+
+av_conf_err:
+	ocrdma_free_av(dev, ah);
+av_err:
+	kfree(ah);
+	return ERR_PTR(status);
+}
+
+int ocrdma_destroy_ah(struct ib_ah *ibah)
+{
+	struct ocrdma_ah *ah = get_ocrdma_ah(ibah);
+	struct ocrdma_dev *dev = get_ocrdma_dev(ibah->device);
+
+	ocrdma_free_av(dev, ah);
+	kfree(ah);
+	return 0;
+}
+
+int ocrdma_query_ah(struct ib_ah *ibah, struct ib_ah_attr *attr)
+{
+	struct ocrdma_ah *ah = get_ocrdma_ah(ibah);
+	struct ocrdma_av *av = ah->av;
+	struct ocrdma_grh *grh;
+	attr->ah_flags |= IB_AH_GRH;
+	if (ah->av->valid & OCRDMA_AV_VALID) {
+		grh = (struct ocrdma_grh *)((u8 *)ah->av +
+				sizeof(struct ocrdma_eth_vlan));
+		attr->sl = be16_to_cpu(av->eth_hdr.vlan_tag) >> 13;
+	} else {
+		grh = (struct ocrdma_grh *)((u8 *)ah->av +
+					sizeof(struct ocrdma_eth_basic));
+		attr->sl = 0;
+	}
+	memcpy(&attr->grh.dgid.raw[0], &grh->dgid[0], sizeof(grh->dgid));
+	attr->grh.sgid_index = ah->sgid_index;
+	attr->grh.hop_limit = be32_to_cpu(grh->pdid_hoplimit) & 0xff;
+	attr->grh.traffic_class = be32_to_cpu(grh->tclass_flow) >> 24;
+	attr->grh.flow_label = be32_to_cpu(grh->tclass_flow) & 0x00ffffffff;
+	return 0;
+}
+
+int ocrdma_modify_ah(struct ib_ah *ibah, struct ib_ah_attr *attr)
+{
+	/* modify_ah is unsupported */
+	return -ENOSYS;
+}
+
+int ocrdma_process_mad(struct ib_device *ibdev,
+		       int process_mad_flags,
+		       u8 port_num,
+		       struct ib_wc *in_wc,
+		       struct ib_grh *in_grh,
+		       struct ib_mad *in_mad, struct ib_mad *out_mad)
+{
+	return IB_MAD_RESULT_SUCCESS;
+}
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_ah.h b/drivers/infiniband/hw/ocrdma/ocrdma_ah.h
new file mode 100644
index 0000000..8ac49e7
--- /dev/null
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_ah.h
@@ -0,0 +1,42 @@
+/*******************************************************************
+ * This file is part of the Emulex RoCE Device Driver for          *
+ * RoCE (RDMA over Converged Ethernet) adapters.                   *
+ * Copyright (C) 2008-2012 Emulex. All rights reserved.            *
+ * EMULEX and SLI are trademarks of Emulex.                        *
+ * www.emulex.com                                                  *
+ *                                                                 *
+ * This program is free software; you can redistribute it and/or   *
+ * modify it under the terms of version 2 of the GNU General       *
+ * Public License as published by the Free Software Foundation.    *
+ * This program is distributed in the hope that it will be useful. *
+ * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND          *
+ * WARRANTIES, INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY,  *
+ * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE      *
+ * DISCLAIMED, EXCEPT TO THE EXTENT THAT SUCH DISCLAIMERS ARE HELD *
+ * TO BE LEGALLY INVALID.  See the GNU General Public License for  *
+ * more details, a copy of which can be found in the file COPYING  *
+ * included with this package.                                     *
+ *
+ * Contact Information:
+ * linux-drivers@emulex.com
+ *
+ * Emulex
+ * 3333 Susan Street
+ * Costa Mesa, CA 92626
+ *******************************************************************/
+
+#ifndef __OCRDMA_AH_H__
+#define __OCRDMA_AH_H__
+
+struct ib_ah *ocrdma_create_ah(struct ib_pd *, struct ib_ah_attr *);
+int ocrdma_destroy_ah(struct ib_ah *);
+int ocrdma_query_ah(struct ib_ah *, struct ib_ah_attr *);
+int ocrdma_modify_ah(struct ib_ah *, struct ib_ah_attr *);
+
+int ocrdma_process_mad(struct ib_device *,
+		       int process_mad_flags,
+		       u8 port_num,
+		       struct ib_wc *in_wc,
+		       struct ib_grh *in_grh,
+		       struct ib_mad *in_mad, struct ib_mad *out_mad);
+#endif				/* __OCRDMA_AH_H__ */
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c
new file mode 100644
index 0000000..638bff1
--- /dev/null
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c
@@ -0,0 +1,2925 @@
+/*******************************************************************
+ * This file is part of the Emulex RoCE Device Driver for          *
+ * RoCE (RDMA over Converged Ethernet) CNA Adapters.              *
+ * Copyright (C) 2008-2012 Emulex. All rights reserved.            *
+ * EMULEX and SLI are trademarks of Emulex.                        *
+ * www.emulex.com                                                  *
+ *                                                                 *
+ * This program is free software; you can redistribute it and/or   *
+ * modify it under the terms of version 2 of the GNU General       *
+ * Public License as published by the Free Software Foundation.    *
+ * This program is distributed in the hope that it will be useful. *
+ * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND          *
+ * WARRANTIES, INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY,  *
+ * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE      *
+ * DISCLAIMED, EXCEPT TO THE EXTENT THAT SUCH DISCLAIMERS ARE HELD *
+ * TO BE LEGALLY INVALID.  See the GNU General Public License for  *
+ * more details, a copy of which can be found in the file COPYING  *
+ * included with this package.                                     *
+ *
+ * Contact Information:
+ * linux-drivers@emulex.com
+ *
+ * Emulex
+ * 3333 Susan Street
+ * Costa Mesa, CA 92626
+ *******************************************************************/
+
+#include <linux/sched.h>
+#include <linux/interrupt.h>
+#include <linux/log2.h>
+#include <linux/dma-mapping.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_user_verbs.h>
+
+#include "ocrdma.h"
+#include "ocrdma_hw.h"
+#include "ocrdma_verbs.h"
+#include "ocrdma_ah.h"
+
+enum mbx_status {
+	OCRDMA_MBX_STATUS_FAILED		= 1,
+	OCRDMA_MBX_STATUS_ILLEGAL_FIELD		= 3,
+	OCRDMA_MBX_STATUS_OOR			= 100,
+	OCRDMA_MBX_STATUS_INVALID_PD		= 101,
+	OCRDMA_MBX_STATUS_PD_INUSE		= 102,
+	OCRDMA_MBX_STATUS_INVALID_CQ		= 103,
+	OCRDMA_MBX_STATUS_INVALID_QP		= 104,
+	OCRDMA_MBX_STATUS_INVALID_LKEY		= 105,
+	OCRDMA_MBX_STATUS_ORD_EXCEEDS		= 106,
+	OCRDMA_MBX_STATUS_IRD_EXCEEDS		= 107,
+	OCRDMA_MBX_STATUS_SENDQ_WQE_EXCEEDS	= 108,
+	OCRDMA_MBX_STATUS_RECVQ_RQE_EXCEEDS	= 109,
+	OCRDMA_MBX_STATUS_SGE_SEND_EXCEEDS	= 110,
+	OCRDMA_MBX_STATUS_SGE_WRITE_EXCEEDS	= 111,
+	OCRDMA_MBX_STATUS_SGE_RECV_EXCEEDS	= 112,
+	OCRDMA_MBX_STATUS_INVALID_STATE_CHANGE	= 113,
+	OCRDMA_MBX_STATUS_MW_BOUND		= 114,
+	OCRDMA_MBX_STATUS_INVALID_VA		= 115,
+	OCRDMA_MBX_STATUS_INVALID_LENGTH	= 116,
+	OCRDMA_MBX_STATUS_INVALID_FBO		= 117,
+	OCRDMA_MBX_STATUS_INVALID_ACC_RIGHTS	= 118,
+	OCRDMA_MBX_STATUS_INVALID_PBE_SIZE	= 119,
+	OCRDMA_MBX_STATUS_INVALID_PBL_ENTRY	= 120,
+	OCRDMA_MBX_STATUS_INVALID_PBL_SHIFT	= 121,
+	OCRDMA_MBX_STATUS_INVALID_SRQ_ID	= 129,
+	OCRDMA_MBX_STATUS_SRQ_ERROR		= 133,
+	OCRDMA_MBX_STATUS_RQE_EXCEEDS		= 134,
+	OCRDMA_MBX_STATUS_MTU_EXCEEDS		= 135,
+	OCRDMA_MBX_STATUS_MAX_QP_EXCEEDS	= 136,
+	OCRDMA_MBX_STATUS_SRQ_LIMIT_EXCEEDS	= 137,
+	OCRDMA_MBX_STATUS_SRQ_SIZE_UNDERUNS	= 138,
+	OCRDMA_MBX_STATUS_QP_BOUND		= 130,
+	OCRDMA_MBX_STATUS_INVALID_CHANGE	= 139,
+	OCRDMA_MBX_STATUS_ATOMIC_OPS_UNSUP	= 140,
+	OCRDMA_MBX_STATUS_INVALID_RNR_NAK_TIMER	= 141,
+	OCRDMA_MBX_STATUS_MW_STILL_BOUND	= 142,
+	OCRDMA_MBX_STATUS_PKEY_INDEX_INVALID	= 143,
+	OCRDMA_MBX_STATUS_PKEY_INDEX_EXCEEDS	= 144
+};
+
+enum additional_status {
+	OCRDMA_MBX_ADDI_STATUS_INSUFFICIENT_RESOURCES = 22
+};
+
+enum cqe_status {
+	OCRDMA_MBX_CQE_STATUS_INSUFFICIENT_PRIVILEDGES	= 1,
+	OCRDMA_MBX_CQE_STATUS_INVALID_PARAMETER		= 2,
+	OCRDMA_MBX_CQE_STATUS_INSUFFICIENT_RESOURCES	= 3,
+	OCRDMA_MBX_CQE_STATUS_QUEUE_FLUSHING		= 4,
+	OCRDMA_MBX_CQE_STATUS_DMA_FAILED		= 5
+};
+
+static inline void *ocrdma_get_eqe(struct ocrdma_eq *eq)
+{
+	return eq->q.va + (eq->q.tail * sizeof(struct ocrdma_eqe));
+}
+
+static inline void ocrdma_eq_inc_tail(struct ocrdma_eq *eq)
+{
+	eq->q.tail = (eq->q.tail + 1) & (OCRDMA_EQ_LEN - 1);
+}
+
+static inline void *ocrdma_get_mcqe(struct ocrdma_dev *dev)
+{
+	struct ocrdma_mcqe *cqe = (struct ocrdma_mcqe *)
+	    (dev->mq.cq.va + (dev->mq.cq.tail * sizeof(struct ocrdma_mcqe)));
+
+	if (!(le32_to_cpu(cqe->valid_ae_cmpl_cons) & OCRDMA_MCQE_VALID_MASK))
+		return NULL;
+	return cqe;
+}
+
+static inline void ocrdma_mcq_inc_tail(struct ocrdma_dev *dev)
+{
+	dev->mq.cq.tail = (dev->mq.cq.tail + 1) & (OCRDMA_MQ_CQ_LEN - 1);
+}
+
+static inline struct ocrdma_mqe *ocrdma_get_mqe(struct ocrdma_dev *dev)
+{
+	return dev->mq.sq.va + (dev->mq.sq.head * sizeof(struct ocrdma_mqe));
+}
+
+static inline void ocrdma_mq_inc_head(struct ocrdma_dev *dev)
+{
+	dev->mq.sq.head = (dev->mq.sq.head + 1) & (OCRDMA_MQ_LEN - 1);
+}
+
+static inline void *ocrdma_get_mqe_rsp(struct ocrdma_dev *dev)
+{
+	return dev->mq.sq.va + (dev->mqe_ctx.tag * sizeof(struct ocrdma_mqe));
+}
+
+enum ib_qp_state get_ibqp_state(enum ocrdma_qp_state qps)
+{
+	switch (qps) {
+	case OCRDMA_QPS_RST:
+		return IB_QPS_RESET;
+	case OCRDMA_QPS_INIT:
+		return IB_QPS_INIT;
+	case OCRDMA_QPS_RTR:
+		return IB_QPS_RTR;
+	case OCRDMA_QPS_RTS:
+		return IB_QPS_RTS;
+	case OCRDMA_QPS_SQD:
+	case OCRDMA_QPS_SQ_DRAINING:
+		return IB_QPS_SQD;
+	case OCRDMA_QPS_SQE:
+		return IB_QPS_SQE;
+	case OCRDMA_QPS_ERR:
+		return IB_QPS_ERR;
+	}
+	return IB_QPS_ERR;
+}
+
+static enum ocrdma_qp_state get_ocrdma_qp_state(enum ib_qp_state qps)
+{
+	switch (qps) {
+	case IB_QPS_RESET:
+		return OCRDMA_QPS_RST;
+	case IB_QPS_INIT:
+		return OCRDMA_QPS_INIT;
+	case IB_QPS_RTR:
+		return OCRDMA_QPS_RTR;
+	case IB_QPS_RTS:
+		return OCRDMA_QPS_RTS;
+	case IB_QPS_SQD:
+		return OCRDMA_QPS_SQD;
+	case IB_QPS_SQE:
+		return OCRDMA_QPS_SQE;
+	case IB_QPS_ERR:
+		return OCRDMA_QPS_ERR;
+	}
+	return OCRDMA_QPS_ERR;
+}
+
+static int ocrdma_get_mbx_errno(u32 status)
+{
+	int err_num;
+	u8 mbox_status = (status & OCRDMA_MBX_RSP_STATUS_MASK) >>
+					OCRDMA_MBX_RSP_STATUS_SHIFT;
+	u8 add_status = (status & OCRDMA_MBX_RSP_ASTATUS_MASK) >>
+					OCRDMA_MBX_RSP_ASTATUS_SHIFT;
+
+	switch (mbox_status) {
+	case OCRDMA_MBX_STATUS_OOR:
+	case OCRDMA_MBX_STATUS_MAX_QP_EXCEEDS:
+		err_num = -EAGAIN;
+		break;
+
+	case OCRDMA_MBX_STATUS_INVALID_PD:
+	case OCRDMA_MBX_STATUS_INVALID_CQ:
+	case OCRDMA_MBX_STATUS_INVALID_SRQ_ID:
+	case OCRDMA_MBX_STATUS_INVALID_QP:
+	case OCRDMA_MBX_STATUS_INVALID_CHANGE:
+	case OCRDMA_MBX_STATUS_MTU_EXCEEDS:
+	case OCRDMA_MBX_STATUS_INVALID_RNR_NAK_TIMER:
+	case OCRDMA_MBX_STATUS_PKEY_INDEX_INVALID:
+	case OCRDMA_MBX_STATUS_PKEY_INDEX_EXCEEDS:
+	case OCRDMA_MBX_STATUS_ILLEGAL_FIELD:
+	case OCRDMA_MBX_STATUS_INVALID_PBL_ENTRY:
+	case OCRDMA_MBX_STATUS_INVALID_LKEY:
+	case OCRDMA_MBX_STATUS_INVALID_VA:
+	case OCRDMA_MBX_STATUS_INVALID_LENGTH:
+	case OCRDMA_MBX_STATUS_INVALID_FBO:
+	case OCRDMA_MBX_STATUS_INVALID_ACC_RIGHTS:
+	case OCRDMA_MBX_STATUS_INVALID_PBE_SIZE:
+	case OCRDMA_MBX_STATUS_ATOMIC_OPS_UNSUP:
+	case OCRDMA_MBX_STATUS_SRQ_ERROR:
+	case OCRDMA_MBX_STATUS_SRQ_SIZE_UNDERUNS:
+		err_num = -EINVAL;
+		break;
+
+	case OCRDMA_MBX_STATUS_PD_INUSE:
+	case OCRDMA_MBX_STATUS_QP_BOUND:
+	case OCRDMA_MBX_STATUS_MW_STILL_BOUND:
+	case OCRDMA_MBX_STATUS_MW_BOUND:
+		err_num = -EBUSY;
+		break;
+
+	case OCRDMA_MBX_STATUS_RECVQ_RQE_EXCEEDS:
+	case OCRDMA_MBX_STATUS_SGE_RECV_EXCEEDS:
+	case OCRDMA_MBX_STATUS_RQE_EXCEEDS:
+	case OCRDMA_MBX_STATUS_SRQ_LIMIT_EXCEEDS:
+	case OCRDMA_MBX_STATUS_ORD_EXCEEDS:
+	case OCRDMA_MBX_STATUS_IRD_EXCEEDS:
+	case OCRDMA_MBX_STATUS_SENDQ_WQE_EXCEEDS:
+	case OCRDMA_MBX_STATUS_SGE_SEND_EXCEEDS:
+	case OCRDMA_MBX_STATUS_SGE_WRITE_EXCEEDS:
+		err_num = -ENOBUFS;
+		break;
+
+	case OCRDMA_MBX_STATUS_FAILED:
+		switch (add_status) {
+		case OCRDMA_MBX_ADDI_STATUS_INSUFFICIENT_RESOURCES:
+			err_num = -EAGAIN;
+			break;
+		}
+	default:
+		err_num = -EFAULT;
+	}
+	return err_num;
+}
+
+char *port_speed_string(struct ocrdma_dev *dev)
+{
+	char *str = "";
+	u16 speeds_supported;
+
+	speeds_supported = dev->phy.fixed_speeds_supported |
+				dev->phy.auto_speeds_supported;
+	if (speeds_supported & OCRDMA_PHY_SPEED_40GBPS)
+		str = "40Gbps ";
+	else if (speeds_supported & OCRDMA_PHY_SPEED_10GBPS)
+		str = "10Gbps ";
+	else if (speeds_supported & OCRDMA_PHY_SPEED_1GBPS)
+		str = "1Gbps ";
+
+	return str;
+}
+
+static int ocrdma_get_mbx_cqe_errno(u16 cqe_status)
+{
+	int err_num = -EINVAL;
+
+	switch (cqe_status) {
+	case OCRDMA_MBX_CQE_STATUS_INSUFFICIENT_PRIVILEDGES:
+		err_num = -EPERM;
+		break;
+	case OCRDMA_MBX_CQE_STATUS_INVALID_PARAMETER:
+		err_num = -EINVAL;
+		break;
+	case OCRDMA_MBX_CQE_STATUS_INSUFFICIENT_RESOURCES:
+	case OCRDMA_MBX_CQE_STATUS_QUEUE_FLUSHING:
+		err_num = -EINVAL;
+		break;
+	case OCRDMA_MBX_CQE_STATUS_DMA_FAILED:
+	default:
+		err_num = -EINVAL;
+		break;
+	}
+	return err_num;
+}
+
+void ocrdma_ring_cq_db(struct ocrdma_dev *dev, u16 cq_id, bool armed,
+		       bool solicited, u16 cqe_popped)
+{
+	u32 val = cq_id & OCRDMA_DB_CQ_RING_ID_MASK;
+
+	val |= ((cq_id & OCRDMA_DB_CQ_RING_ID_EXT_MASK) <<
+	     OCRDMA_DB_CQ_RING_ID_EXT_MASK_SHIFT);
+
+	if (armed)
+		val |= (1 << OCRDMA_DB_CQ_REARM_SHIFT);
+	if (solicited)
+		val |= (1 << OCRDMA_DB_CQ_SOLICIT_SHIFT);
+	val |= (cqe_popped << OCRDMA_DB_CQ_NUM_POPPED_SHIFT);
+	iowrite32(val, dev->nic_info.db + OCRDMA_DB_CQ_OFFSET);
+}
+
+static void ocrdma_ring_mq_db(struct ocrdma_dev *dev)
+{
+	u32 val = 0;
+
+	val |= dev->mq.sq.id & OCRDMA_MQ_ID_MASK;
+	val |= 1 << OCRDMA_MQ_NUM_MQE_SHIFT;
+	iowrite32(val, dev->nic_info.db + OCRDMA_DB_MQ_OFFSET);
+}
+
+static void ocrdma_ring_eq_db(struct ocrdma_dev *dev, u16 eq_id,
+			      bool arm, bool clear_int, u16 num_eqe)
+{
+	u32 val = 0;
+
+	val |= eq_id & OCRDMA_EQ_ID_MASK;
+	val |= ((eq_id & OCRDMA_EQ_ID_EXT_MASK) << OCRDMA_EQ_ID_EXT_MASK_SHIFT);
+	if (arm)
+		val |= (1 << OCRDMA_REARM_SHIFT);
+	if (clear_int)
+		val |= (1 << OCRDMA_EQ_CLR_SHIFT);
+	val |= (1 << OCRDMA_EQ_TYPE_SHIFT);
+	val |= (num_eqe << OCRDMA_NUM_EQE_SHIFT);
+	iowrite32(val, dev->nic_info.db + OCRDMA_DB_EQ_OFFSET);
+}
+
+static void ocrdma_init_mch(struct ocrdma_mbx_hdr *cmd_hdr,
+			    u8 opcode, u8 subsys, u32 cmd_len)
+{
+	cmd_hdr->subsys_op = (opcode | (subsys << OCRDMA_MCH_SUBSYS_SHIFT));
+	cmd_hdr->timeout = 20; /* seconds */
+	cmd_hdr->cmd_len = cmd_len - sizeof(struct ocrdma_mbx_hdr);
+}
+
+static void *ocrdma_init_emb_mqe(u8 opcode, u32 cmd_len)
+{
+	struct ocrdma_mqe *mqe;
+
+	mqe = kzalloc(sizeof(struct ocrdma_mqe), GFP_KERNEL);
+	if (!mqe)
+		return NULL;
+	mqe->hdr.spcl_sge_cnt_emb |=
+		(OCRDMA_MQE_EMBEDDED << OCRDMA_MQE_HDR_EMB_SHIFT) &
+					OCRDMA_MQE_HDR_EMB_MASK;
+	mqe->hdr.pyld_len = cmd_len - sizeof(struct ocrdma_mqe_hdr);
+
+	ocrdma_init_mch(&mqe->u.emb_req.mch, opcode, OCRDMA_SUBSYS_ROCE,
+			mqe->hdr.pyld_len);
+	return mqe;
+}
+
+static void ocrdma_free_q(struct ocrdma_dev *dev, struct ocrdma_queue_info *q)
+{
+	dma_free_coherent(&dev->nic_info.pdev->dev, q->size, q->va, q->dma);
+}
+
+static int ocrdma_alloc_q(struct ocrdma_dev *dev,
+			  struct ocrdma_queue_info *q, u16 len, u16 entry_size)
+{
+	memset(q, 0, sizeof(*q));
+	q->len = len;
+	q->entry_size = entry_size;
+	q->size = len * entry_size;
+	q->va = dma_alloc_coherent(&dev->nic_info.pdev->dev, q->size,
+				   &q->dma, GFP_KERNEL);
+	if (!q->va)
+		return -ENOMEM;
+	memset(q->va, 0, q->size);
+	return 0;
+}
+
+static void ocrdma_build_q_pages(struct ocrdma_pa *q_pa, int cnt,
+					dma_addr_t host_pa, int hw_page_size)
+{
+	int i;
+
+	for (i = 0; i < cnt; i++) {
+		q_pa[i].lo = (u32) (host_pa & 0xffffffff);
+		q_pa[i].hi = (u32) upper_32_bits(host_pa);
+		host_pa += hw_page_size;
+	}
+}
+
+static int ocrdma_mbx_delete_q(struct ocrdma_dev *dev,
+			       struct ocrdma_queue_info *q, int queue_type)
+{
+	u8 opcode = 0;
+	int status;
+	struct ocrdma_delete_q_req *cmd = dev->mbx_cmd;
+
+	switch (queue_type) {
+	case QTYPE_MCCQ:
+		opcode = OCRDMA_CMD_DELETE_MQ;
+		break;
+	case QTYPE_CQ:
+		opcode = OCRDMA_CMD_DELETE_CQ;
+		break;
+	case QTYPE_EQ:
+		opcode = OCRDMA_CMD_DELETE_EQ;
+		break;
+	default:
+		BUG();
+	}
+	memset(cmd, 0, sizeof(*cmd));
+	ocrdma_init_mch(&cmd->req, opcode, OCRDMA_SUBSYS_COMMON, sizeof(*cmd));
+	cmd->id = q->id;
+
+	status = be_roce_mcc_cmd(dev->nic_info.netdev,
+				 cmd, sizeof(*cmd), NULL, NULL);
+	if (!status)
+		q->created = false;
+	return status;
+}
+
+static int ocrdma_mbx_create_eq(struct ocrdma_dev *dev, struct ocrdma_eq *eq)
+{
+	int status;
+	struct ocrdma_create_eq_req *cmd = dev->mbx_cmd;
+	struct ocrdma_create_eq_rsp *rsp = dev->mbx_cmd;
+
+	memset(cmd, 0, sizeof(*cmd));
+	ocrdma_init_mch(&cmd->req, OCRDMA_CMD_CREATE_EQ, OCRDMA_SUBSYS_COMMON,
+			sizeof(*cmd));
+
+	cmd->req.rsvd_version = 2;
+	cmd->num_pages = 4;
+	cmd->valid = OCRDMA_CREATE_EQ_VALID;
+	cmd->cnt = 4 << OCRDMA_CREATE_EQ_CNT_SHIFT;
+
+	ocrdma_build_q_pages(&cmd->pa[0], cmd->num_pages, eq->q.dma,
+			     PAGE_SIZE_4K);
+	status = be_roce_mcc_cmd(dev->nic_info.netdev, cmd, sizeof(*cmd), NULL,
+				 NULL);
+	if (!status) {
+		eq->q.id = rsp->vector_eqid & 0xffff;
+		eq->vector = (rsp->vector_eqid >> 16) & 0xffff;
+		eq->q.created = true;
+	}
+	return status;
+}
+
+static int ocrdma_create_eq(struct ocrdma_dev *dev,
+			    struct ocrdma_eq *eq, u16 q_len)
+{
+	int status;
+
+	status = ocrdma_alloc_q(dev, &eq->q, OCRDMA_EQ_LEN,
+				sizeof(struct ocrdma_eqe));
+	if (status)
+		return status;
+
+	status = ocrdma_mbx_create_eq(dev, eq);
+	if (status)
+		goto mbx_err;
+	eq->dev = dev;
+	ocrdma_ring_eq_db(dev, eq->q.id, true, true, 0);
+
+	return 0;
+mbx_err:
+	ocrdma_free_q(dev, &eq->q);
+	return status;
+}
+
+int ocrdma_get_irq(struct ocrdma_dev *dev, struct ocrdma_eq *eq)
+{
+	int irq;
+
+	if (dev->nic_info.intr_mode == BE_INTERRUPT_MODE_INTX)
+		irq = dev->nic_info.pdev->irq;
+	else
+		irq = dev->nic_info.msix.vector_list[eq->vector];
+	return irq;
+}
+
+static void _ocrdma_destroy_eq(struct ocrdma_dev *dev, struct ocrdma_eq *eq)
+{
+	if (eq->q.created) {
+		ocrdma_mbx_delete_q(dev, &eq->q, QTYPE_EQ);
+		ocrdma_free_q(dev, &eq->q);
+	}
+}
+
+static void ocrdma_destroy_eq(struct ocrdma_dev *dev, struct ocrdma_eq *eq)
+{
+	int irq;
+
+	/* disarm EQ so that interrupts are not generated
+	 * during freeing and EQ delete is in progress.
+	 */
+	ocrdma_ring_eq_db(dev, eq->q.id, false, false, 0);
+
+	irq = ocrdma_get_irq(dev, eq);
+	free_irq(irq, eq);
+	_ocrdma_destroy_eq(dev, eq);
+}
+
+static void ocrdma_destroy_eqs(struct ocrdma_dev *dev)
+{
+	int i;
+
+	for (i = 0; i < dev->eq_cnt; i++)
+		ocrdma_destroy_eq(dev, &dev->eq_tbl[i]);
+}
+
+static int ocrdma_mbx_mq_cq_create(struct ocrdma_dev *dev,
+				   struct ocrdma_queue_info *cq,
+				   struct ocrdma_queue_info *eq)
+{
+	struct ocrdma_create_cq_cmd *cmd = dev->mbx_cmd;
+	struct ocrdma_create_cq_cmd_rsp *rsp = dev->mbx_cmd;
+	int status;
+
+	memset(cmd, 0, sizeof(*cmd));
+	ocrdma_init_mch(&cmd->req, OCRDMA_CMD_CREATE_CQ,
+			OCRDMA_SUBSYS_COMMON, sizeof(*cmd));
+
+	cmd->req.rsvd_version = OCRDMA_CREATE_CQ_VER2;
+	cmd->pgsz_pgcnt = (cq->size / OCRDMA_MIN_Q_PAGE_SIZE) <<
+		OCRDMA_CREATE_CQ_PAGE_SIZE_SHIFT;
+	cmd->pgsz_pgcnt |= PAGES_4K_SPANNED(cq->va, cq->size);
+
+	cmd->ev_cnt_flags = OCRDMA_CREATE_CQ_DEF_FLAGS;
+	cmd->eqn = eq->id;
+	cmd->pdid_cqecnt = cq->size / sizeof(struct ocrdma_mcqe);
+
+	ocrdma_build_q_pages(&cmd->pa[0], cq->size / OCRDMA_MIN_Q_PAGE_SIZE,
+			     cq->dma, PAGE_SIZE_4K);
+	status = be_roce_mcc_cmd(dev->nic_info.netdev,
+				 cmd, sizeof(*cmd), NULL, NULL);
+	if (!status) {
+		cq->id = (u16) (rsp->cq_id & OCRDMA_CREATE_CQ_RSP_CQ_ID_MASK);
+		cq->created = true;
+	}
+	return status;
+}
+
+static u32 ocrdma_encoded_q_len(int q_len)
+{
+	u32 len_encoded = fls(q_len);	/* log2(len) + 1 */
+
+	if (len_encoded == 16)
+		len_encoded = 0;
+	return len_encoded;
+}
+
+static int ocrdma_mbx_create_mq(struct ocrdma_dev *dev,
+				struct ocrdma_queue_info *mq,
+				struct ocrdma_queue_info *cq)
+{
+	int num_pages, status;
+	struct ocrdma_create_mq_req *cmd = dev->mbx_cmd;
+	struct ocrdma_create_mq_rsp *rsp = dev->mbx_cmd;
+	struct ocrdma_pa *pa;
+
+	memset(cmd, 0, sizeof(*cmd));
+	num_pages = PAGES_4K_SPANNED(mq->va, mq->size);
+
+	ocrdma_init_mch(&cmd->req, OCRDMA_CMD_CREATE_MQ_EXT,
+			OCRDMA_SUBSYS_COMMON, sizeof(*cmd));
+	cmd->req.rsvd_version = 1;
+	cmd->cqid_pages = num_pages;
+	cmd->cqid_pages |= (cq->id << OCRDMA_CREATE_MQ_CQ_ID_SHIFT);
+	cmd->async_cqid_valid = OCRDMA_CREATE_MQ_ASYNC_CQ_VALID;
+
+	cmd->async_event_bitmap = BIT(OCRDMA_ASYNC_GRP5_EVE_CODE);
+	cmd->async_event_bitmap |= BIT(OCRDMA_ASYNC_RDMA_EVE_CODE);
+
+	cmd->async_cqid_ringsize = cq->id;
+	cmd->async_cqid_ringsize |= (ocrdma_encoded_q_len(mq->len) <<
+				OCRDMA_CREATE_MQ_RING_SIZE_SHIFT);
+	cmd->valid = OCRDMA_CREATE_MQ_VALID;
+	pa = &cmd->pa[0];
+
+	ocrdma_build_q_pages(pa, num_pages, mq->dma, PAGE_SIZE_4K);
+	status = be_roce_mcc_cmd(dev->nic_info.netdev,
+				 cmd, sizeof(*cmd), NULL, NULL);
+	if (!status) {
+		mq->id = rsp->id;
+		mq->created = true;
+	}
+	return status;
+}
+
+static int ocrdma_create_mq(struct ocrdma_dev *dev)
+{
+	int status;
+
+	/* Alloc completion queue for Mailbox queue */
+	status = ocrdma_alloc_q(dev, &dev->mq.cq, OCRDMA_MQ_CQ_LEN,
+				sizeof(struct ocrdma_mcqe));
+	if (status)
+		goto alloc_err;
+
+	dev->eq_tbl[0].cq_cnt++;
+	status = ocrdma_mbx_mq_cq_create(dev, &dev->mq.cq, &dev->eq_tbl[0].q);
+	if (status)
+		goto mbx_cq_free;
+
+	memset(&dev->mqe_ctx, 0, sizeof(dev->mqe_ctx));
+	init_waitqueue_head(&dev->mqe_ctx.cmd_wait);
+	mutex_init(&dev->mqe_ctx.lock);
+
+	/* Alloc Mailbox queue */
+	status = ocrdma_alloc_q(dev, &dev->mq.sq, OCRDMA_MQ_LEN,
+				sizeof(struct ocrdma_mqe));
+	if (status)
+		goto mbx_cq_destroy;
+	status = ocrdma_mbx_create_mq(dev, &dev->mq.sq, &dev->mq.cq);
+	if (status)
+		goto mbx_q_free;
+	ocrdma_ring_cq_db(dev, dev->mq.cq.id, true, false, 0);
+	return 0;
+
+mbx_q_free:
+	ocrdma_free_q(dev, &dev->mq.sq);
+mbx_cq_destroy:
+	ocrdma_mbx_delete_q(dev, &dev->mq.cq, QTYPE_CQ);
+mbx_cq_free:
+	ocrdma_free_q(dev, &dev->mq.cq);
+alloc_err:
+	return status;
+}
+
+static void ocrdma_destroy_mq(struct ocrdma_dev *dev)
+{
+	struct ocrdma_queue_info *mbxq, *cq;
+
+	/* mqe_ctx lock synchronizes with any other pending cmds. */
+	mutex_lock(&dev->mqe_ctx.lock);
+	mbxq = &dev->mq.sq;
+	if (mbxq->created) {
+		ocrdma_mbx_delete_q(dev, mbxq, QTYPE_MCCQ);
+		ocrdma_free_q(dev, mbxq);
+	}
+	mutex_unlock(&dev->mqe_ctx.lock);
+
+	cq = &dev->mq.cq;
+	if (cq->created) {
+		ocrdma_mbx_delete_q(dev, cq, QTYPE_CQ);
+		ocrdma_free_q(dev, cq);
+	}
+}
+
+static void ocrdma_process_qpcat_error(struct ocrdma_dev *dev,
+				       struct ocrdma_qp *qp)
+{
+	enum ib_qp_state new_ib_qps = IB_QPS_ERR;
+	enum ib_qp_state old_ib_qps;
+
+	if (qp == NULL)
+		BUG();
+	ocrdma_qp_state_change(qp, new_ib_qps, &old_ib_qps);
+}
+
+static void ocrdma_dispatch_ibevent(struct ocrdma_dev *dev,
+				    struct ocrdma_ae_mcqe *cqe)
+{
+	struct ocrdma_qp *qp = NULL;
+	struct ocrdma_cq *cq = NULL;
+	struct ib_event ib_evt;
+	int cq_event = 0;
+	int qp_event = 1;
+	int srq_event = 0;
+	int dev_event = 0;
+	int type = (cqe->valid_ae_event & OCRDMA_AE_MCQE_EVENT_TYPE_MASK) >>
+	    OCRDMA_AE_MCQE_EVENT_TYPE_SHIFT;
+
+	if (cqe->qpvalid_qpid & OCRDMA_AE_MCQE_QPVALID)
+		qp = dev->qp_tbl[cqe->qpvalid_qpid & OCRDMA_AE_MCQE_QPID_MASK];
+	if (cqe->cqvalid_cqid & OCRDMA_AE_MCQE_CQVALID)
+		cq = dev->cq_tbl[cqe->cqvalid_cqid & OCRDMA_AE_MCQE_CQID_MASK];
+
+	memset(&ib_evt, 0, sizeof(ib_evt));
+
+	ib_evt.device = &dev->ibdev;
+
+	switch (type) {
+	case OCRDMA_CQ_ERROR:
+		ib_evt.element.cq = &cq->ibcq;
+		ib_evt.event = IB_EVENT_CQ_ERR;
+		cq_event = 1;
+		qp_event = 0;
+		break;
+	case OCRDMA_CQ_OVERRUN_ERROR:
+		ib_evt.element.cq = &cq->ibcq;
+		ib_evt.event = IB_EVENT_CQ_ERR;
+		cq_event = 1;
+		qp_event = 0;
+		break;
+	case OCRDMA_CQ_QPCAT_ERROR:
+		ib_evt.element.qp = &qp->ibqp;
+		ib_evt.event = IB_EVENT_QP_FATAL;
+		ocrdma_process_qpcat_error(dev, qp);
+		break;
+	case OCRDMA_QP_ACCESS_ERROR:
+		ib_evt.element.qp = &qp->ibqp;
+		ib_evt.event = IB_EVENT_QP_ACCESS_ERR;
+		break;
+	case OCRDMA_QP_COMM_EST_EVENT:
+		ib_evt.element.qp = &qp->ibqp;
+		ib_evt.event = IB_EVENT_COMM_EST;
+		break;
+	case OCRDMA_SQ_DRAINED_EVENT:
+		ib_evt.element.qp = &qp->ibqp;
+		ib_evt.event = IB_EVENT_SQ_DRAINED;
+		break;
+	case OCRDMA_DEVICE_FATAL_EVENT:
+		ib_evt.element.port_num = 1;
+		ib_evt.event = IB_EVENT_DEVICE_FATAL;
+		qp_event = 0;
+		dev_event = 1;
+		break;
+	case OCRDMA_SRQCAT_ERROR:
+		ib_evt.element.srq = &qp->srq->ibsrq;
+		ib_evt.event = IB_EVENT_SRQ_ERR;
+		srq_event = 1;
+		qp_event = 0;
+		break;
+	case OCRDMA_SRQ_LIMIT_EVENT:
+		ib_evt.element.srq = &qp->srq->ibsrq;
+		ib_evt.event = IB_EVENT_SRQ_LIMIT_REACHED;
+		srq_event = 1;
+		qp_event = 0;
+		break;
+	case OCRDMA_QP_LAST_WQE_EVENT:
+		ib_evt.element.qp = &qp->ibqp;
+		ib_evt.event = IB_EVENT_QP_LAST_WQE_REACHED;
+		break;
+	default:
+		cq_event = 0;
+		qp_event = 0;
+		srq_event = 0;
+		dev_event = 0;
+		pr_err("%s() unknown type=0x%x\n", __func__, type);
+		break;
+	}
+
+	if (qp_event) {
+		if (qp->ibqp.event_handler)
+			qp->ibqp.event_handler(&ib_evt, qp->ibqp.qp_context);
+	} else if (cq_event) {
+		if (cq->ibcq.event_handler)
+			cq->ibcq.event_handler(&ib_evt, cq->ibcq.cq_context);
+	} else if (srq_event) {
+		if (qp->srq->ibsrq.event_handler)
+			qp->srq->ibsrq.event_handler(&ib_evt,
+						     qp->srq->ibsrq.
+						     srq_context);
+	} else if (dev_event) {
+		pr_err("%s: Fatal event received\n", dev->ibdev.name);
+		ib_dispatch_event(&ib_evt);
+	}
+
+}
+
+static void ocrdma_process_grp5_aync(struct ocrdma_dev *dev,
+					struct ocrdma_ae_mcqe *cqe)
+{
+	struct ocrdma_ae_pvid_mcqe *evt;
+	int type = (cqe->valid_ae_event & OCRDMA_AE_MCQE_EVENT_TYPE_MASK) >>
+			OCRDMA_AE_MCQE_EVENT_TYPE_SHIFT;
+
+	switch (type) {
+	case OCRDMA_ASYNC_EVENT_PVID_STATE:
+		evt = (struct ocrdma_ae_pvid_mcqe *)cqe;
+		if ((evt->tag_enabled & OCRDMA_AE_PVID_MCQE_ENABLED_MASK) >>
+			OCRDMA_AE_PVID_MCQE_ENABLED_SHIFT)
+			dev->pvid = ((evt->tag_enabled &
+					OCRDMA_AE_PVID_MCQE_TAG_MASK) >>
+					OCRDMA_AE_PVID_MCQE_TAG_SHIFT);
+		break;
+
+	case OCRDMA_ASYNC_EVENT_COS_VALUE:
+		atomic_set(&dev->update_sl, 1);
+		break;
+	default:
+		/* Not interested evts. */
+		break;
+	}
+}
+
+static void ocrdma_process_acqe(struct ocrdma_dev *dev, void *ae_cqe)
+{
+	/* async CQE processing */
+	struct ocrdma_ae_mcqe *cqe = ae_cqe;
+	u32 evt_code = (cqe->valid_ae_event & OCRDMA_AE_MCQE_EVENT_CODE_MASK) >>
+			OCRDMA_AE_MCQE_EVENT_CODE_SHIFT;
+
+	if (evt_code == OCRDMA_ASYNC_RDMA_EVE_CODE)
+		ocrdma_dispatch_ibevent(dev, cqe);
+	else if (evt_code == OCRDMA_ASYNC_GRP5_EVE_CODE)
+		ocrdma_process_grp5_aync(dev, cqe);
+	else
+		pr_err("%s(%d) invalid evt code=0x%x\n", __func__,
+		       dev->id, evt_code);
+}
+
+static void ocrdma_process_mcqe(struct ocrdma_dev *dev, struct ocrdma_mcqe *cqe)
+{
+	if (dev->mqe_ctx.tag == cqe->tag_lo && dev->mqe_ctx.cmd_done == false) {
+		dev->mqe_ctx.cqe_status = (cqe->status &
+		     OCRDMA_MCQE_STATUS_MASK) >> OCRDMA_MCQE_STATUS_SHIFT;
+		dev->mqe_ctx.ext_status =
+		    (cqe->status & OCRDMA_MCQE_ESTATUS_MASK)
+		    >> OCRDMA_MCQE_ESTATUS_SHIFT;
+		dev->mqe_ctx.cmd_done = true;
+		wake_up(&dev->mqe_ctx.cmd_wait);
+	} else
+		pr_err("%s() cqe for invalid tag0x%x.expected=0x%x\n",
+		       __func__, cqe->tag_lo, dev->mqe_ctx.tag);
+}
+
+static int ocrdma_mq_cq_handler(struct ocrdma_dev *dev, u16 cq_id)
+{
+	u16 cqe_popped = 0;
+	struct ocrdma_mcqe *cqe;
+
+	while (1) {
+		cqe = ocrdma_get_mcqe(dev);
+		if (cqe == NULL)
+			break;
+		ocrdma_le32_to_cpu(cqe, sizeof(*cqe));
+		cqe_popped += 1;
+		if (cqe->valid_ae_cmpl_cons & OCRDMA_MCQE_AE_MASK)
+			ocrdma_process_acqe(dev, cqe);
+		else if (cqe->valid_ae_cmpl_cons & OCRDMA_MCQE_CMPL_MASK)
+			ocrdma_process_mcqe(dev, cqe);
+		memset(cqe, 0, sizeof(struct ocrdma_mcqe));
+		ocrdma_mcq_inc_tail(dev);
+	}
+	ocrdma_ring_cq_db(dev, dev->mq.cq.id, true, false, cqe_popped);
+	return 0;
+}
+
+static void ocrdma_qp_buddy_cq_handler(struct ocrdma_dev *dev,
+				       struct ocrdma_cq *cq)
+{
+	unsigned long flags;
+	struct ocrdma_qp *qp;
+	bool buddy_cq_found = false;
+	/* Go through list of QPs in error state which are using this CQ
+	 * and invoke its callback handler to trigger CQE processing for
+	 * error/flushed CQE. It is rare to find more than few entries in
+	 * this list as most consumers stops after getting error CQE.
+	 * List is traversed only once when a matching buddy cq found for a QP.
+	 */
+	spin_lock_irqsave(&dev->flush_q_lock, flags);
+	list_for_each_entry(qp, &cq->sq_head, sq_entry) {
+		if (qp->srq)
+			continue;
+		/* if wq and rq share the same cq, than comp_handler
+		 * is already invoked.
+		 */
+		if (qp->sq_cq == qp->rq_cq)
+			continue;
+		/* if completion came on sq, rq's cq is buddy cq.
+		 * if completion came on rq, sq's cq is buddy cq.
+		 */
+		if (qp->sq_cq == cq)
+			cq = qp->rq_cq;
+		else
+			cq = qp->sq_cq;
+		buddy_cq_found = true;
+		break;
+	}
+	spin_unlock_irqrestore(&dev->flush_q_lock, flags);
+	if (buddy_cq_found == false)
+		return;
+	if (cq->ibcq.comp_handler) {
+		spin_lock_irqsave(&cq->comp_handler_lock, flags);
+		(*cq->ibcq.comp_handler) (&cq->ibcq, cq->ibcq.cq_context);
+		spin_unlock_irqrestore(&cq->comp_handler_lock, flags);
+	}
+}
+
+static void ocrdma_qp_cq_handler(struct ocrdma_dev *dev, u16 cq_idx)
+{
+	unsigned long flags;
+	struct ocrdma_cq *cq;
+
+	if (cq_idx >= OCRDMA_MAX_CQ)
+		BUG();
+
+	cq = dev->cq_tbl[cq_idx];
+	if (cq == NULL)
+		return;
+
+	if (cq->ibcq.comp_handler) {
+		spin_lock_irqsave(&cq->comp_handler_lock, flags);
+		(*cq->ibcq.comp_handler) (&cq->ibcq, cq->ibcq.cq_context);
+		spin_unlock_irqrestore(&cq->comp_handler_lock, flags);
+	}
+	ocrdma_qp_buddy_cq_handler(dev, cq);
+}
+
+static void ocrdma_cq_handler(struct ocrdma_dev *dev, u16 cq_id)
+{
+	/* process the MQ-CQE. */
+	if (cq_id == dev->mq.cq.id)
+		ocrdma_mq_cq_handler(dev, cq_id);
+	else
+		ocrdma_qp_cq_handler(dev, cq_id);
+}
+
+static irqreturn_t ocrdma_irq_handler(int irq, void *handle)
+{
+	struct ocrdma_eq *eq = handle;
+	struct ocrdma_dev *dev = eq->dev;
+	struct ocrdma_eqe eqe;
+	struct ocrdma_eqe *ptr;
+	u16 cq_id;
+	int budget = eq->cq_cnt;
+
+	do {
+		ptr = ocrdma_get_eqe(eq);
+		eqe = *ptr;
+		ocrdma_le32_to_cpu(&eqe, sizeof(eqe));
+		if ((eqe.id_valid & OCRDMA_EQE_VALID_MASK) == 0)
+			break;
+
+		ptr->id_valid = 0;
+		/* ring eq doorbell as soon as its consumed. */
+		ocrdma_ring_eq_db(dev, eq->q.id, false, true, 1);
+		/* check whether its CQE or not. */
+		if ((eqe.id_valid & OCRDMA_EQE_FOR_CQE_MASK) == 0) {
+			cq_id = eqe.id_valid >> OCRDMA_EQE_RESOURCE_ID_SHIFT;
+			ocrdma_cq_handler(dev, cq_id);
+		}
+		ocrdma_eq_inc_tail(eq);
+
+		/* There can be a stale EQE after the last bound CQ is
+		 * destroyed. EQE valid and budget == 0 implies this.
+		 */
+		if (budget)
+			budget--;
+
+	} while (budget);
+
+	ocrdma_ring_eq_db(dev, eq->q.id, true, true, 0);
+	return IRQ_HANDLED;
+}
+
+static void ocrdma_post_mqe(struct ocrdma_dev *dev, struct ocrdma_mqe *cmd)
+{
+	struct ocrdma_mqe *mqe;
+
+	dev->mqe_ctx.tag = dev->mq.sq.head;
+	dev->mqe_ctx.cmd_done = false;
+	mqe = ocrdma_get_mqe(dev);
+	cmd->hdr.tag_lo = dev->mq.sq.head;
+	ocrdma_copy_cpu_to_le32(mqe, cmd, sizeof(*mqe));
+	/* make sure descriptor is written before ringing doorbell */
+	wmb();
+	ocrdma_mq_inc_head(dev);
+	ocrdma_ring_mq_db(dev);
+}
+
+static int ocrdma_wait_mqe_cmpl(struct ocrdma_dev *dev)
+{
+	long status;
+	/* 30 sec timeout */
+	status = wait_event_timeout(dev->mqe_ctx.cmd_wait,
+				    (dev->mqe_ctx.cmd_done != false),
+				    msecs_to_jiffies(30000));
+	if (status)
+		return 0;
+	else {
+		dev->mqe_ctx.fw_error_state = true;
+		pr_err("%s(%d) mailbox timeout: fw not responding\n",
+		       __func__, dev->id);
+		return -1;
+	}
+}
+
+/* issue a mailbox command on the MQ */
+static int ocrdma_mbx_cmd(struct ocrdma_dev *dev, struct ocrdma_mqe *mqe)
+{
+	int status = 0;
+	u16 cqe_status, ext_status;
+	struct ocrdma_mqe *rsp_mqe;
+	struct ocrdma_mbx_rsp *rsp = NULL;
+
+	mutex_lock(&dev->mqe_ctx.lock);
+	if (dev->mqe_ctx.fw_error_state)
+		goto mbx_err;
+	ocrdma_post_mqe(dev, mqe);
+	status = ocrdma_wait_mqe_cmpl(dev);
+	if (status)
+		goto mbx_err;
+	cqe_status = dev->mqe_ctx.cqe_status;
+	ext_status = dev->mqe_ctx.ext_status;
+	rsp_mqe = ocrdma_get_mqe_rsp(dev);
+	ocrdma_copy_le32_to_cpu(mqe, rsp_mqe, (sizeof(*mqe)));
+	if ((mqe->hdr.spcl_sge_cnt_emb & OCRDMA_MQE_HDR_EMB_MASK) >>
+				OCRDMA_MQE_HDR_EMB_SHIFT)
+		rsp = &mqe->u.rsp;
+
+	if (cqe_status || ext_status) {
+		pr_err("%s() cqe_status=0x%x, ext_status=0x%x,",
+		       __func__, cqe_status, ext_status);
+		if (rsp) {
+			/* This is for embedded cmds. */
+			pr_err("opcode=0x%x, subsystem=0x%x\n",
+			       (rsp->subsys_op & OCRDMA_MBX_RSP_OPCODE_MASK) >>
+				OCRDMA_MBX_RSP_OPCODE_SHIFT,
+				(rsp->subsys_op & OCRDMA_MBX_RSP_SUBSYS_MASK) >>
+				OCRDMA_MBX_RSP_SUBSYS_SHIFT);
+		}
+		status = ocrdma_get_mbx_cqe_errno(cqe_status);
+		goto mbx_err;
+	}
+	/* For non embedded, rsp errors are handled in ocrdma_nonemb_mbx_cmd */
+	if (rsp && (mqe->u.rsp.status & OCRDMA_MBX_RSP_STATUS_MASK))
+		status = ocrdma_get_mbx_errno(mqe->u.rsp.status);
+mbx_err:
+	mutex_unlock(&dev->mqe_ctx.lock);
+	return status;
+}
+
+static int ocrdma_nonemb_mbx_cmd(struct ocrdma_dev *dev, struct ocrdma_mqe *mqe,
+				 void *payload_va)
+{
+	int status = 0;
+	struct ocrdma_mbx_rsp *rsp = payload_va;
+
+	if ((mqe->hdr.spcl_sge_cnt_emb & OCRDMA_MQE_HDR_EMB_MASK) >>
+				OCRDMA_MQE_HDR_EMB_SHIFT)
+		BUG();
+
+	status = ocrdma_mbx_cmd(dev, mqe);
+	if (!status)
+		/* For non embedded, only CQE failures are handled in
+		 * ocrdma_mbx_cmd. We need to check for RSP errors.
+		 */
+		if (rsp->status & OCRDMA_MBX_RSP_STATUS_MASK)
+			status = ocrdma_get_mbx_errno(rsp->status);
+
+	if (status)
+		pr_err("opcode=0x%x, subsystem=0x%x\n",
+		       (rsp->subsys_op & OCRDMA_MBX_RSP_OPCODE_MASK) >>
+			OCRDMA_MBX_RSP_OPCODE_SHIFT,
+			(rsp->subsys_op & OCRDMA_MBX_RSP_SUBSYS_MASK) >>
+			OCRDMA_MBX_RSP_SUBSYS_SHIFT);
+	return status;
+}
+
+static void ocrdma_get_attr(struct ocrdma_dev *dev,
+			      struct ocrdma_dev_attr *attr,
+			      struct ocrdma_mbx_query_config *rsp)
+{
+	attr->max_pd =
+	    (rsp->max_pd_ca_ack_delay & OCRDMA_MBX_QUERY_CFG_MAX_PD_MASK) >>
+	    OCRDMA_MBX_QUERY_CFG_MAX_PD_SHIFT;
+	attr->max_qp =
+	    (rsp->qp_srq_cq_ird_ord & OCRDMA_MBX_QUERY_CFG_MAX_QP_MASK) >>
+	    OCRDMA_MBX_QUERY_CFG_MAX_QP_SHIFT;
+	attr->max_srq =
+		(rsp->max_srq_rpir_qps & OCRDMA_MBX_QUERY_CFG_MAX_SRQ_MASK) >>
+		OCRDMA_MBX_QUERY_CFG_MAX_SRQ_OFFSET;
+	attr->max_send_sge = ((rsp->max_write_send_sge &
+			       OCRDMA_MBX_QUERY_CFG_MAX_SEND_SGE_MASK) >>
+			      OCRDMA_MBX_QUERY_CFG_MAX_SEND_SGE_SHIFT);
+	attr->max_recv_sge = (rsp->max_write_send_sge &
+			      OCRDMA_MBX_QUERY_CFG_MAX_SEND_SGE_MASK) >>
+	    OCRDMA_MBX_QUERY_CFG_MAX_SEND_SGE_SHIFT;
+	attr->max_srq_sge = (rsp->max_srq_rqe_sge &
+			      OCRDMA_MBX_QUERY_CFG_MAX_SRQ_SGE_MASK) >>
+	    OCRDMA_MBX_QUERY_CFG_MAX_SRQ_SGE_OFFSET;
+	attr->max_rdma_sge = (rsp->max_write_send_sge &
+			      OCRDMA_MBX_QUERY_CFG_MAX_WRITE_SGE_MASK) >>
+	    OCRDMA_MBX_QUERY_CFG_MAX_WRITE_SGE_SHIFT;
+	attr->max_ord_per_qp = (rsp->max_ird_ord_per_qp &
+				OCRDMA_MBX_QUERY_CFG_MAX_ORD_PER_QP_MASK) >>
+	    OCRDMA_MBX_QUERY_CFG_MAX_ORD_PER_QP_SHIFT;
+	attr->max_ird_per_qp = (rsp->max_ird_ord_per_qp &
+				OCRDMA_MBX_QUERY_CFG_MAX_IRD_PER_QP_MASK) >>
+	    OCRDMA_MBX_QUERY_CFG_MAX_IRD_PER_QP_SHIFT;
+	attr->cq_overflow_detect = (rsp->qp_srq_cq_ird_ord &
+				    OCRDMA_MBX_QUERY_CFG_CQ_OVERFLOW_MASK) >>
+	    OCRDMA_MBX_QUERY_CFG_CQ_OVERFLOW_SHIFT;
+	attr->srq_supported = (rsp->qp_srq_cq_ird_ord &
+			       OCRDMA_MBX_QUERY_CFG_SRQ_SUPPORTED_MASK) >>
+	    OCRDMA_MBX_QUERY_CFG_SRQ_SUPPORTED_SHIFT;
+	attr->local_ca_ack_delay = (rsp->max_pd_ca_ack_delay &
+				    OCRDMA_MBX_QUERY_CFG_CA_ACK_DELAY_MASK) >>
+	    OCRDMA_MBX_QUERY_CFG_CA_ACK_DELAY_SHIFT;
+	attr->max_mw = rsp->max_mw;
+	attr->max_mr = rsp->max_mr;
+	attr->max_mr_size = ((u64)rsp->max_mr_size_hi << 32) |
+			      rsp->max_mr_size_lo;
+	attr->max_fmr = 0;
+	attr->max_pages_per_frmr = rsp->max_pages_per_frmr;
+	attr->max_num_mr_pbl = rsp->max_num_mr_pbl;
+	attr->max_cqe = rsp->max_cq_cqes_per_cq &
+			OCRDMA_MBX_QUERY_CFG_MAX_CQES_PER_CQ_MASK;
+	attr->max_cq = (rsp->max_cq_cqes_per_cq &
+			OCRDMA_MBX_QUERY_CFG_MAX_CQ_MASK) >>
+			OCRDMA_MBX_QUERY_CFG_MAX_CQ_OFFSET;
+	attr->wqe_size = ((rsp->wqe_rqe_stride_max_dpp_cqs &
+		OCRDMA_MBX_QUERY_CFG_MAX_WQE_SIZE_MASK) >>
+		OCRDMA_MBX_QUERY_CFG_MAX_WQE_SIZE_OFFSET) *
+		OCRDMA_WQE_STRIDE;
+	attr->rqe_size = ((rsp->wqe_rqe_stride_max_dpp_cqs &
+		OCRDMA_MBX_QUERY_CFG_MAX_RQE_SIZE_MASK) >>
+		OCRDMA_MBX_QUERY_CFG_MAX_RQE_SIZE_OFFSET) *
+		OCRDMA_WQE_STRIDE;
+	attr->max_inline_data =
+	    attr->wqe_size - (sizeof(struct ocrdma_hdr_wqe) +
+			      sizeof(struct ocrdma_sge));
+	if (ocrdma_get_asic_type(dev) == OCRDMA_ASIC_GEN_SKH_R) {
+		attr->ird = 1;
+		attr->ird_page_size = OCRDMA_MIN_Q_PAGE_SIZE;
+		attr->num_ird_pages = MAX_OCRDMA_IRD_PAGES;
+	}
+	dev->attr.max_wqe = rsp->max_wqes_rqes_per_q >>
+		 OCRDMA_MBX_QUERY_CFG_MAX_WQES_PER_WQ_OFFSET;
+	dev->attr.max_rqe = rsp->max_wqes_rqes_per_q &
+		OCRDMA_MBX_QUERY_CFG_MAX_RQES_PER_RQ_MASK;
+}
+
+static int ocrdma_check_fw_config(struct ocrdma_dev *dev,
+				   struct ocrdma_fw_conf_rsp *conf)
+{
+	u32 fn_mode;
+
+	fn_mode = conf->fn_mode & OCRDMA_FN_MODE_RDMA;
+	if (fn_mode != OCRDMA_FN_MODE_RDMA)
+		return -EINVAL;
+	dev->base_eqid = conf->base_eqid;
+	dev->max_eq = conf->max_eq;
+	return 0;
+}
+
+/* can be issued only during init time. */
+static int ocrdma_mbx_query_fw_ver(struct ocrdma_dev *dev)
+{
+	int status = -ENOMEM;
+	struct ocrdma_mqe *cmd;
+	struct ocrdma_fw_ver_rsp *rsp;
+
+	cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_GET_FW_VER, sizeof(*cmd));
+	if (!cmd)
+		return -ENOMEM;
+	ocrdma_init_mch((struct ocrdma_mbx_hdr *)&cmd->u.cmd[0],
+			OCRDMA_CMD_GET_FW_VER,
+			OCRDMA_SUBSYS_COMMON, sizeof(*cmd));
+
+	status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd);
+	if (status)
+		goto mbx_err;
+	rsp = (struct ocrdma_fw_ver_rsp *)cmd;
+	memset(&dev->attr.fw_ver[0], 0, sizeof(dev->attr.fw_ver));
+	memcpy(&dev->attr.fw_ver[0], &rsp->running_ver[0],
+	       sizeof(rsp->running_ver));
+	ocrdma_le32_to_cpu(dev->attr.fw_ver, sizeof(rsp->running_ver));
+mbx_err:
+	kfree(cmd);
+	return status;
+}
+
+/* can be issued only during init time. */
+static int ocrdma_mbx_query_fw_config(struct ocrdma_dev *dev)
+{
+	int status = -ENOMEM;
+	struct ocrdma_mqe *cmd;
+	struct ocrdma_fw_conf_rsp *rsp;
+
+	cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_GET_FW_CONFIG, sizeof(*cmd));
+	if (!cmd)
+		return -ENOMEM;
+	ocrdma_init_mch((struct ocrdma_mbx_hdr *)&cmd->u.cmd[0],
+			OCRDMA_CMD_GET_FW_CONFIG,
+			OCRDMA_SUBSYS_COMMON, sizeof(*cmd));
+	status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd);
+	if (status)
+		goto mbx_err;
+	rsp = (struct ocrdma_fw_conf_rsp *)cmd;
+	status = ocrdma_check_fw_config(dev, rsp);
+mbx_err:
+	kfree(cmd);
+	return status;
+}
+
+int ocrdma_mbx_rdma_stats(struct ocrdma_dev *dev, bool reset)
+{
+	struct ocrdma_rdma_stats_req *req = dev->stats_mem.va;
+	struct ocrdma_mqe *mqe = &dev->stats_mem.mqe;
+	struct ocrdma_rdma_stats_resp *old_stats;
+	int status;
+
+	old_stats = kmalloc(sizeof(*old_stats), GFP_KERNEL);
+	if (old_stats == NULL)
+		return -ENOMEM;
+
+	memset(mqe, 0, sizeof(*mqe));
+	mqe->hdr.pyld_len = dev->stats_mem.size;
+	mqe->hdr.spcl_sge_cnt_emb |=
+			(1 << OCRDMA_MQE_HDR_SGE_CNT_SHIFT) &
+				OCRDMA_MQE_HDR_SGE_CNT_MASK;
+	mqe->u.nonemb_req.sge[0].pa_lo = (u32) (dev->stats_mem.pa & 0xffffffff);
+	mqe->u.nonemb_req.sge[0].pa_hi = (u32) upper_32_bits(dev->stats_mem.pa);
+	mqe->u.nonemb_req.sge[0].len = dev->stats_mem.size;
+
+	/* Cache the old stats */
+	memcpy(old_stats, req, sizeof(struct ocrdma_rdma_stats_resp));
+	memset(req, 0, dev->stats_mem.size);
+
+	ocrdma_init_mch((struct ocrdma_mbx_hdr *)req,
+			OCRDMA_CMD_GET_RDMA_STATS,
+			OCRDMA_SUBSYS_ROCE,
+			dev->stats_mem.size);
+	if (reset)
+		req->reset_stats = reset;
+
+	status = ocrdma_nonemb_mbx_cmd(dev, mqe, dev->stats_mem.va);
+	if (status)
+		/* Copy from cache, if mbox fails */
+		memcpy(req, old_stats, sizeof(struct ocrdma_rdma_stats_resp));
+	else
+		ocrdma_le32_to_cpu(req, dev->stats_mem.size);
+
+	kfree(old_stats);
+	return status;
+}
+
+static int ocrdma_mbx_get_ctrl_attribs(struct ocrdma_dev *dev)
+{
+	int status = -ENOMEM;
+	struct ocrdma_dma_mem dma;
+	struct ocrdma_mqe *mqe;
+	struct ocrdma_get_ctrl_attribs_rsp *ctrl_attr_rsp;
+	struct mgmt_hba_attribs *hba_attribs;
+
+	mqe = kzalloc(sizeof(struct ocrdma_mqe), GFP_KERNEL);
+	if (!mqe)
+		return status;
+
+	dma.size = sizeof(struct ocrdma_get_ctrl_attribs_rsp);
+	dma.va	 = dma_alloc_coherent(&dev->nic_info.pdev->dev,
+					dma.size, &dma.pa, GFP_KERNEL);
+	if (!dma.va)
+		goto free_mqe;
+
+	mqe->hdr.pyld_len = dma.size;
+	mqe->hdr.spcl_sge_cnt_emb |=
+			(1 << OCRDMA_MQE_HDR_SGE_CNT_SHIFT) &
+			OCRDMA_MQE_HDR_SGE_CNT_MASK;
+	mqe->u.nonemb_req.sge[0].pa_lo = (u32) (dma.pa & 0xffffffff);
+	mqe->u.nonemb_req.sge[0].pa_hi = (u32) upper_32_bits(dma.pa);
+	mqe->u.nonemb_req.sge[0].len = dma.size;
+
+	memset(dma.va, 0, dma.size);
+	ocrdma_init_mch((struct ocrdma_mbx_hdr *)dma.va,
+			OCRDMA_CMD_GET_CTRL_ATTRIBUTES,
+			OCRDMA_SUBSYS_COMMON,
+			dma.size);
+
+	status = ocrdma_nonemb_mbx_cmd(dev, mqe, dma.va);
+	if (!status) {
+		ctrl_attr_rsp = (struct ocrdma_get_ctrl_attribs_rsp *)dma.va;
+		hba_attribs = &ctrl_attr_rsp->ctrl_attribs.hba_attribs;
+
+		dev->hba_port_num = (hba_attribs->ptpnum_maxdoms_hbast_cv &
+					OCRDMA_HBA_ATTRB_PTNUM_MASK)
+					>> OCRDMA_HBA_ATTRB_PTNUM_SHIFT;
+		strncpy(dev->model_number,
+			hba_attribs->controller_model_number, 31);
+	}
+	dma_free_coherent(&dev->nic_info.pdev->dev, dma.size, dma.va, dma.pa);
+free_mqe:
+	kfree(mqe);
+	return status;
+}
+
+static int ocrdma_mbx_query_dev(struct ocrdma_dev *dev)
+{
+	int status = -ENOMEM;
+	struct ocrdma_mbx_query_config *rsp;
+	struct ocrdma_mqe *cmd;
+
+	cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_QUERY_CONFIG, sizeof(*cmd));
+	if (!cmd)
+		return status;
+	status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd);
+	if (status)
+		goto mbx_err;
+	rsp = (struct ocrdma_mbx_query_config *)cmd;
+	ocrdma_get_attr(dev, &dev->attr, rsp);
+mbx_err:
+	kfree(cmd);
+	return status;
+}
+
+int ocrdma_mbx_get_link_speed(struct ocrdma_dev *dev, u8 *lnk_speed)
+{
+	int status = -ENOMEM;
+	struct ocrdma_get_link_speed_rsp *rsp;
+	struct ocrdma_mqe *cmd;
+
+	cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_QUERY_NTWK_LINK_CONFIG_V1,
+				  sizeof(*cmd));
+	if (!cmd)
+		return status;
+	ocrdma_init_mch((struct ocrdma_mbx_hdr *)&cmd->u.cmd[0],
+			OCRDMA_CMD_QUERY_NTWK_LINK_CONFIG_V1,
+			OCRDMA_SUBSYS_COMMON, sizeof(*cmd));
+
+	((struct ocrdma_mbx_hdr *)cmd->u.cmd)->rsvd_version = 0x1;
+
+	status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd);
+	if (status)
+		goto mbx_err;
+
+	rsp = (struct ocrdma_get_link_speed_rsp *)cmd;
+	*lnk_speed = (rsp->pflt_pps_ld_pnum & OCRDMA_PHY_PS_MASK)
+			>> OCRDMA_PHY_PS_SHIFT;
+
+mbx_err:
+	kfree(cmd);
+	return status;
+}
+
+static int ocrdma_mbx_get_phy_info(struct ocrdma_dev *dev)
+{
+	int status = -ENOMEM;
+	struct ocrdma_mqe *cmd;
+	struct ocrdma_get_phy_info_rsp *rsp;
+
+	cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_PHY_DETAILS, sizeof(*cmd));
+	if (!cmd)
+		return status;
+
+	ocrdma_init_mch((struct ocrdma_mbx_hdr *)&cmd->u.cmd[0],
+			OCRDMA_CMD_PHY_DETAILS, OCRDMA_SUBSYS_COMMON,
+			sizeof(*cmd));
+
+	status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd);
+	if (status)
+		goto mbx_err;
+
+	rsp = (struct ocrdma_get_phy_info_rsp *)cmd;
+	dev->phy.phy_type =
+			(rsp->ityp_ptyp & OCRDMA_PHY_TYPE_MASK);
+	dev->phy.interface_type =
+			(rsp->ityp_ptyp & OCRDMA_IF_TYPE_MASK)
+				>> OCRDMA_IF_TYPE_SHIFT;
+	dev->phy.auto_speeds_supported  =
+			(rsp->fspeed_aspeed & OCRDMA_ASPEED_SUPP_MASK);
+	dev->phy.fixed_speeds_supported =
+			(rsp->fspeed_aspeed & OCRDMA_FSPEED_SUPP_MASK)
+				>> OCRDMA_FSPEED_SUPP_SHIFT;
+mbx_err:
+	kfree(cmd);
+	return status;
+}
+
+int ocrdma_mbx_alloc_pd(struct ocrdma_dev *dev, struct ocrdma_pd *pd)
+{
+	int status = -ENOMEM;
+	struct ocrdma_alloc_pd *cmd;
+	struct ocrdma_alloc_pd_rsp *rsp;
+
+	cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_ALLOC_PD, sizeof(*cmd));
+	if (!cmd)
+		return status;
+	if (pd->dpp_enabled)
+		cmd->enable_dpp_rsvd |= OCRDMA_ALLOC_PD_ENABLE_DPP;
+	status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd);
+	if (status)
+		goto mbx_err;
+	rsp = (struct ocrdma_alloc_pd_rsp *)cmd;
+	pd->id = rsp->dpp_page_pdid & OCRDMA_ALLOC_PD_RSP_PDID_MASK;
+	if (rsp->dpp_page_pdid & OCRDMA_ALLOC_PD_RSP_DPP) {
+		pd->dpp_enabled = true;
+		pd->dpp_page = rsp->dpp_page_pdid >>
+				OCRDMA_ALLOC_PD_RSP_DPP_PAGE_SHIFT;
+	} else {
+		pd->dpp_enabled = false;
+		pd->num_dpp_qp = 0;
+	}
+mbx_err:
+	kfree(cmd);
+	return status;
+}
+
+int ocrdma_mbx_dealloc_pd(struct ocrdma_dev *dev, struct ocrdma_pd *pd)
+{
+	int status = -ENOMEM;
+	struct ocrdma_dealloc_pd *cmd;
+
+	cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_DEALLOC_PD, sizeof(*cmd));
+	if (!cmd)
+		return status;
+	cmd->id = pd->id;
+	status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd);
+	kfree(cmd);
+	return status;
+}
+
+static int ocrdma_build_q_conf(u32 *num_entries, int entry_size,
+			       int *num_pages, int *page_size)
+{
+	int i;
+	int mem_size;
+
+	*num_entries = roundup_pow_of_two(*num_entries);
+	mem_size = *num_entries * entry_size;
+	/* find the possible lowest possible multiplier */
+	for (i = 0; i < OCRDMA_MAX_Q_PAGE_SIZE_CNT; i++) {
+		if (mem_size <= (OCRDMA_Q_PAGE_BASE_SIZE << i))
+			break;
+	}
+	if (i >= OCRDMA_MAX_Q_PAGE_SIZE_CNT)
+		return -EINVAL;
+	mem_size = roundup(mem_size,
+		       ((OCRDMA_Q_PAGE_BASE_SIZE << i) / OCRDMA_MAX_Q_PAGES));
+	*num_pages =
+	    mem_size / ((OCRDMA_Q_PAGE_BASE_SIZE << i) / OCRDMA_MAX_Q_PAGES);
+	*page_size = ((OCRDMA_Q_PAGE_BASE_SIZE << i) / OCRDMA_MAX_Q_PAGES);
+	*num_entries = mem_size / entry_size;
+	return 0;
+}
+
+static int ocrdma_mbx_create_ah_tbl(struct ocrdma_dev *dev)
+{
+	int i;
+	int status = 0;
+	int max_ah;
+	struct ocrdma_create_ah_tbl *cmd;
+	struct ocrdma_create_ah_tbl_rsp *rsp;
+	struct pci_dev *pdev = dev->nic_info.pdev;
+	dma_addr_t pa;
+	struct ocrdma_pbe *pbes;
+
+	cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_CREATE_AH_TBL, sizeof(*cmd));
+	if (!cmd)
+		return status;
+
+	max_ah = OCRDMA_MAX_AH;
+	dev->av_tbl.size = sizeof(struct ocrdma_av) * max_ah;
+
+	/* number of PBEs in PBL */
+	cmd->ah_conf = (OCRDMA_AH_TBL_PAGES <<
+				OCRDMA_CREATE_AH_NUM_PAGES_SHIFT) &
+				OCRDMA_CREATE_AH_NUM_PAGES_MASK;
+
+	/* page size */
+	for (i = 0; i < OCRDMA_MAX_Q_PAGE_SIZE_CNT; i++) {
+		if (PAGE_SIZE == (OCRDMA_MIN_Q_PAGE_SIZE << i))
+			break;
+	}
+	cmd->ah_conf |= (i << OCRDMA_CREATE_AH_PAGE_SIZE_SHIFT) &
+				OCRDMA_CREATE_AH_PAGE_SIZE_MASK;
+
+	/* ah_entry size */
+	cmd->ah_conf |= (sizeof(struct ocrdma_av) <<
+				OCRDMA_CREATE_AH_ENTRY_SIZE_SHIFT) &
+				OCRDMA_CREATE_AH_ENTRY_SIZE_MASK;
+
+	dev->av_tbl.pbl.va = dma_alloc_coherent(&pdev->dev, PAGE_SIZE,
+						&dev->av_tbl.pbl.pa,
+						GFP_KERNEL);
+	if (dev->av_tbl.pbl.va == NULL)
+		goto mem_err;
+
+	dev->av_tbl.va = dma_alloc_coherent(&pdev->dev, dev->av_tbl.size,
+					    &pa, GFP_KERNEL);
+	if (dev->av_tbl.va == NULL)
+		goto mem_err_ah;
+	dev->av_tbl.pa = pa;
+	dev->av_tbl.num_ah = max_ah;
+	memset(dev->av_tbl.va, 0, dev->av_tbl.size);
+
+	pbes = (struct ocrdma_pbe *)dev->av_tbl.pbl.va;
+	for (i = 0; i < dev->av_tbl.size / OCRDMA_MIN_Q_PAGE_SIZE; i++) {
+		pbes[i].pa_lo = (u32)cpu_to_le32(pa & 0xffffffff);
+		pbes[i].pa_hi = (u32)cpu_to_le32(upper_32_bits(pa));
+		pa += PAGE_SIZE;
+	}
+	cmd->tbl_addr[0].lo = (u32)(dev->av_tbl.pbl.pa & 0xFFFFFFFF);
+	cmd->tbl_addr[0].hi = (u32)upper_32_bits(dev->av_tbl.pbl.pa);
+	status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd);
+	if (status)
+		goto mbx_err;
+	rsp = (struct ocrdma_create_ah_tbl_rsp *)cmd;
+	dev->av_tbl.ahid = rsp->ahid & 0xFFFF;
+	kfree(cmd);
+	return 0;
+
+mbx_err:
+	dma_free_coherent(&pdev->dev, dev->av_tbl.size, dev->av_tbl.va,
+			  dev->av_tbl.pa);
+	dev->av_tbl.va = NULL;
+mem_err_ah:
+	dma_free_coherent(&pdev->dev, PAGE_SIZE, dev->av_tbl.pbl.va,
+			  dev->av_tbl.pbl.pa);
+	dev->av_tbl.pbl.va = NULL;
+	dev->av_tbl.size = 0;
+mem_err:
+	kfree(cmd);
+	return status;
+}
+
+static void ocrdma_mbx_delete_ah_tbl(struct ocrdma_dev *dev)
+{
+	struct ocrdma_delete_ah_tbl *cmd;
+	struct pci_dev *pdev = dev->nic_info.pdev;
+
+	if (dev->av_tbl.va == NULL)
+		return;
+
+	cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_DELETE_AH_TBL, sizeof(*cmd));
+	if (!cmd)
+		return;
+	cmd->ahid = dev->av_tbl.ahid;
+
+	ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd);
+	dma_free_coherent(&pdev->dev, dev->av_tbl.size, dev->av_tbl.va,
+			  dev->av_tbl.pa);
+	dev->av_tbl.va = NULL;
+	dma_free_coherent(&pdev->dev, PAGE_SIZE, dev->av_tbl.pbl.va,
+			  dev->av_tbl.pbl.pa);
+	kfree(cmd);
+}
+
+/* Multiple CQs uses the EQ. This routine returns least used
+ * EQ to associate with CQ. This will distributes the interrupt
+ * processing and CPU load to associated EQ, vector and so to that CPU.
+ */
+static u16 ocrdma_bind_eq(struct ocrdma_dev *dev)
+{
+	int i, selected_eq = 0, cq_cnt = 0;
+	u16 eq_id;
+
+	mutex_lock(&dev->dev_lock);
+	cq_cnt = dev->eq_tbl[0].cq_cnt;
+	eq_id = dev->eq_tbl[0].q.id;
+	/* find the EQ which is has the least number of
+	 * CQs associated with it.
+	 */
+	for (i = 0; i < dev->eq_cnt; i++) {
+		if (dev->eq_tbl[i].cq_cnt < cq_cnt) {
+			cq_cnt = dev->eq_tbl[i].cq_cnt;
+			eq_id = dev->eq_tbl[i].q.id;
+			selected_eq = i;
+		}
+	}
+	dev->eq_tbl[selected_eq].cq_cnt += 1;
+	mutex_unlock(&dev->dev_lock);
+	return eq_id;
+}
+
+static void ocrdma_unbind_eq(struct ocrdma_dev *dev, u16 eq_id)
+{
+	int i;
+
+	mutex_lock(&dev->dev_lock);
+	i = ocrdma_get_eq_table_index(dev, eq_id);
+	if (i == -EINVAL)
+		BUG();
+	dev->eq_tbl[i].cq_cnt -= 1;
+	mutex_unlock(&dev->dev_lock);
+}
+
+int ocrdma_mbx_create_cq(struct ocrdma_dev *dev, struct ocrdma_cq *cq,
+			 int entries, int dpp_cq, u16 pd_id)
+{
+	int status = -ENOMEM; int max_hw_cqe;
+	struct pci_dev *pdev = dev->nic_info.pdev;
+	struct ocrdma_create_cq *cmd;
+	struct ocrdma_create_cq_rsp *rsp;
+	u32 hw_pages, cqe_size, page_size, cqe_count;
+
+	if (entries > dev->attr.max_cqe) {
+		pr_err("%s(%d) max_cqe=0x%x, requester_cqe=0x%x\n",
+		       __func__, dev->id, dev->attr.max_cqe, entries);
+		return -EINVAL;
+	}
+	if (dpp_cq && (ocrdma_get_asic_type(dev) != OCRDMA_ASIC_GEN_SKH_R))
+		return -EINVAL;
+
+	if (dpp_cq) {
+		cq->max_hw_cqe = 1;
+		max_hw_cqe = 1;
+		cqe_size = OCRDMA_DPP_CQE_SIZE;
+		hw_pages = 1;
+	} else {
+		cq->max_hw_cqe = dev->attr.max_cqe;
+		max_hw_cqe = dev->attr.max_cqe;
+		cqe_size = sizeof(struct ocrdma_cqe);
+		hw_pages = OCRDMA_CREATE_CQ_MAX_PAGES;
+	}
+
+	cq->len = roundup(max_hw_cqe * cqe_size, OCRDMA_MIN_Q_PAGE_SIZE);
+
+	cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_CREATE_CQ, sizeof(*cmd));
+	if (!cmd)
+		return -ENOMEM;
+	ocrdma_init_mch(&cmd->cmd.req, OCRDMA_CMD_CREATE_CQ,
+			OCRDMA_SUBSYS_COMMON, sizeof(*cmd));
+	cq->va = dma_alloc_coherent(&pdev->dev, cq->len, &cq->pa, GFP_KERNEL);
+	if (!cq->va) {
+		status = -ENOMEM;
+		goto mem_err;
+	}
+	memset(cq->va, 0, cq->len);
+	page_size = cq->len / hw_pages;
+	cmd->cmd.pgsz_pgcnt = (page_size / OCRDMA_MIN_Q_PAGE_SIZE) <<
+					OCRDMA_CREATE_CQ_PAGE_SIZE_SHIFT;
+	cmd->cmd.pgsz_pgcnt |= hw_pages;
+	cmd->cmd.ev_cnt_flags = OCRDMA_CREATE_CQ_DEF_FLAGS;
+
+	cq->eqn = ocrdma_bind_eq(dev);
+	cmd->cmd.req.rsvd_version = OCRDMA_CREATE_CQ_VER3;
+	cqe_count = cq->len / cqe_size;
+	cq->cqe_cnt = cqe_count;
+	if (cqe_count > 1024) {
+		/* Set cnt to 3 to indicate more than 1024 cq entries */
+		cmd->cmd.ev_cnt_flags |= (0x3 << OCRDMA_CREATE_CQ_CNT_SHIFT);
+	} else {
+		u8 count = 0;
+		switch (cqe_count) {
+		case 256:
+			count = 0;
+			break;
+		case 512:
+			count = 1;
+			break;
+		case 1024:
+			count = 2;
+			break;
+		default:
+			goto mbx_err;
+		}
+		cmd->cmd.ev_cnt_flags |= (count << OCRDMA_CREATE_CQ_CNT_SHIFT);
+	}
+	/* shared eq between all the consumer cqs. */
+	cmd->cmd.eqn = cq->eqn;
+	if (ocrdma_get_asic_type(dev) == OCRDMA_ASIC_GEN_SKH_R) {
+		if (dpp_cq)
+			cmd->cmd.pgsz_pgcnt |= OCRDMA_CREATE_CQ_DPP <<
+				OCRDMA_CREATE_CQ_TYPE_SHIFT;
+		cq->phase_change = false;
+		cmd->cmd.pdid_cqecnt = (cq->len / cqe_size);
+	} else {
+		cmd->cmd.pdid_cqecnt = (cq->len / cqe_size) - 1;
+		cmd->cmd.ev_cnt_flags |= OCRDMA_CREATE_CQ_FLAGS_AUTO_VALID;
+		cq->phase_change = true;
+	}
+
+	/* pd_id valid only for v3 */
+	cmd->cmd.pdid_cqecnt |= (pd_id <<
+		OCRDMA_CREATE_CQ_CMD_PDID_SHIFT);
+	ocrdma_build_q_pages(&cmd->cmd.pa[0], hw_pages, cq->pa, page_size);
+	status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd);
+	if (status)
+		goto mbx_err;
+
+	rsp = (struct ocrdma_create_cq_rsp *)cmd;
+	cq->id = (u16) (rsp->rsp.cq_id & OCRDMA_CREATE_CQ_RSP_CQ_ID_MASK);
+	kfree(cmd);
+	return 0;
+mbx_err:
+	ocrdma_unbind_eq(dev, cq->eqn);
+	dma_free_coherent(&pdev->dev, cq->len, cq->va, cq->pa);
+mem_err:
+	kfree(cmd);
+	return status;
+}
+
+int ocrdma_mbx_destroy_cq(struct ocrdma_dev *dev, struct ocrdma_cq *cq)
+{
+	int status = -ENOMEM;
+	struct ocrdma_destroy_cq *cmd;
+
+	cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_DELETE_CQ, sizeof(*cmd));
+	if (!cmd)
+		return status;
+	ocrdma_init_mch(&cmd->req, OCRDMA_CMD_DELETE_CQ,
+			OCRDMA_SUBSYS_COMMON, sizeof(*cmd));
+
+	cmd->bypass_flush_qid |=
+	    (cq->id << OCRDMA_DESTROY_CQ_QID_SHIFT) &
+	    OCRDMA_DESTROY_CQ_QID_MASK;
+
+	status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd);
+	ocrdma_unbind_eq(dev, cq->eqn);
+	dma_free_coherent(&dev->nic_info.pdev->dev, cq->len, cq->va, cq->pa);
+	kfree(cmd);
+	return status;
+}
+
+int ocrdma_mbx_alloc_lkey(struct ocrdma_dev *dev, struct ocrdma_hw_mr *hwmr,
+			  u32 pdid, int addr_check)
+{
+	int status = -ENOMEM;
+	struct ocrdma_alloc_lkey *cmd;
+	struct ocrdma_alloc_lkey_rsp *rsp;
+
+	cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_ALLOC_LKEY, sizeof(*cmd));
+	if (!cmd)
+		return status;
+	cmd->pdid = pdid;
+	cmd->pbl_sz_flags |= addr_check;
+	cmd->pbl_sz_flags |= (hwmr->fr_mr << OCRDMA_ALLOC_LKEY_FMR_SHIFT);
+	cmd->pbl_sz_flags |=
+	    (hwmr->remote_wr << OCRDMA_ALLOC_LKEY_REMOTE_WR_SHIFT);
+	cmd->pbl_sz_flags |=
+	    (hwmr->remote_rd << OCRDMA_ALLOC_LKEY_REMOTE_RD_SHIFT);
+	cmd->pbl_sz_flags |=
+	    (hwmr->local_wr << OCRDMA_ALLOC_LKEY_LOCAL_WR_SHIFT);
+	cmd->pbl_sz_flags |=
+	    (hwmr->remote_atomic << OCRDMA_ALLOC_LKEY_REMOTE_ATOMIC_SHIFT);
+	cmd->pbl_sz_flags |=
+	    (hwmr->num_pbls << OCRDMA_ALLOC_LKEY_PBL_SIZE_SHIFT);
+
+	status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd);
+	if (status)
+		goto mbx_err;
+	rsp = (struct ocrdma_alloc_lkey_rsp *)cmd;
+	hwmr->lkey = rsp->lrkey;
+mbx_err:
+	kfree(cmd);
+	return status;
+}
+
+int ocrdma_mbx_dealloc_lkey(struct ocrdma_dev *dev, int fr_mr, u32 lkey)
+{
+	int status = -ENOMEM;
+	struct ocrdma_dealloc_lkey *cmd;
+
+	cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_DEALLOC_LKEY, sizeof(*cmd));
+	if (!cmd)
+		return -ENOMEM;
+	cmd->lkey = lkey;
+	cmd->rsvd_frmr = fr_mr ? 1 : 0;
+	status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd);
+	if (status)
+		goto mbx_err;
+mbx_err:
+	kfree(cmd);
+	return status;
+}
+
+static int ocrdma_mbx_reg_mr(struct ocrdma_dev *dev, struct ocrdma_hw_mr *hwmr,
+			     u32 pdid, u32 pbl_cnt, u32 pbe_size, u32 last)
+{
+	int status = -ENOMEM;
+	int i;
+	struct ocrdma_reg_nsmr *cmd;
+	struct ocrdma_reg_nsmr_rsp *rsp;
+
+	cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_REGISTER_NSMR, sizeof(*cmd));
+	if (!cmd)
+		return -ENOMEM;
+	cmd->num_pbl_pdid =
+	    pdid | (hwmr->num_pbls << OCRDMA_REG_NSMR_NUM_PBL_SHIFT);
+	cmd->fr_mr = hwmr->fr_mr;
+
+	cmd->flags_hpage_pbe_sz |= (hwmr->remote_wr <<
+				    OCRDMA_REG_NSMR_REMOTE_WR_SHIFT);
+	cmd->flags_hpage_pbe_sz |= (hwmr->remote_rd <<
+				    OCRDMA_REG_NSMR_REMOTE_RD_SHIFT);
+	cmd->flags_hpage_pbe_sz |= (hwmr->local_wr <<
+				    OCRDMA_REG_NSMR_LOCAL_WR_SHIFT);
+	cmd->flags_hpage_pbe_sz |= (hwmr->remote_atomic <<
+				    OCRDMA_REG_NSMR_REMOTE_ATOMIC_SHIFT);
+	cmd->flags_hpage_pbe_sz |= (hwmr->mw_bind <<
+				    OCRDMA_REG_NSMR_BIND_MEMWIN_SHIFT);
+	cmd->flags_hpage_pbe_sz |= (last << OCRDMA_REG_NSMR_LAST_SHIFT);
+
+	cmd->flags_hpage_pbe_sz |= (hwmr->pbe_size / OCRDMA_MIN_HPAGE_SIZE);
+	cmd->flags_hpage_pbe_sz |= (hwmr->pbl_size / OCRDMA_MIN_HPAGE_SIZE) <<
+					OCRDMA_REG_NSMR_HPAGE_SIZE_SHIFT;
+	cmd->totlen_low = hwmr->len;
+	cmd->totlen_high = upper_32_bits(hwmr->len);
+	cmd->fbo_low = (u32) (hwmr->fbo & 0xffffffff);
+	cmd->fbo_high = (u32) upper_32_bits(hwmr->fbo);
+	cmd->va_loaddr = (u32) hwmr->va;
+	cmd->va_hiaddr = (u32) upper_32_bits(hwmr->va);
+
+	for (i = 0; i < pbl_cnt; i++) {
+		cmd->pbl[i].lo = (u32) (hwmr->pbl_table[i].pa & 0xffffffff);
+		cmd->pbl[i].hi = upper_32_bits(hwmr->pbl_table[i].pa);
+	}
+	status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd);
+	if (status)
+		goto mbx_err;
+	rsp = (struct ocrdma_reg_nsmr_rsp *)cmd;
+	hwmr->lkey = rsp->lrkey;
+mbx_err:
+	kfree(cmd);
+	return status;
+}
+
+static int ocrdma_mbx_reg_mr_cont(struct ocrdma_dev *dev,
+				  struct ocrdma_hw_mr *hwmr, u32 pbl_cnt,
+				  u32 pbl_offset, u32 last)
+{
+	int status = -ENOMEM;
+	int i;
+	struct ocrdma_reg_nsmr_cont *cmd;
+
+	cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_REGISTER_NSMR_CONT, sizeof(*cmd));
+	if (!cmd)
+		return -ENOMEM;
+	cmd->lrkey = hwmr->lkey;
+	cmd->num_pbl_offset = (pbl_cnt << OCRDMA_REG_NSMR_CONT_NUM_PBL_SHIFT) |
+	    (pbl_offset & OCRDMA_REG_NSMR_CONT_PBL_SHIFT_MASK);
+	cmd->last = last << OCRDMA_REG_NSMR_CONT_LAST_SHIFT;
+
+	for (i = 0; i < pbl_cnt; i++) {
+		cmd->pbl[i].lo =
+		    (u32) (hwmr->pbl_table[i + pbl_offset].pa & 0xffffffff);
+		cmd->pbl[i].hi =
+		    upper_32_bits(hwmr->pbl_table[i + pbl_offset].pa);
+	}
+	status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd);
+	if (status)
+		goto mbx_err;
+mbx_err:
+	kfree(cmd);
+	return status;
+}
+
+int ocrdma_reg_mr(struct ocrdma_dev *dev,
+		  struct ocrdma_hw_mr *hwmr, u32 pdid, int acc)
+{
+	int status;
+	u32 last = 0;
+	u32 cur_pbl_cnt, pbl_offset;
+	u32 pending_pbl_cnt = hwmr->num_pbls;
+
+	pbl_offset = 0;
+	cur_pbl_cnt = min(pending_pbl_cnt, MAX_OCRDMA_NSMR_PBL);
+	if (cur_pbl_cnt == pending_pbl_cnt)
+		last = 1;
+
+	status = ocrdma_mbx_reg_mr(dev, hwmr, pdid,
+				   cur_pbl_cnt, hwmr->pbe_size, last);
+	if (status) {
+		pr_err("%s() status=%d\n", __func__, status);
+		return status;
+	}
+	/* if there is no more pbls to register then exit. */
+	if (last)
+		return 0;
+
+	while (!last) {
+		pbl_offset += cur_pbl_cnt;
+		pending_pbl_cnt -= cur_pbl_cnt;
+		cur_pbl_cnt = min(pending_pbl_cnt, MAX_OCRDMA_NSMR_PBL);
+		/* if we reach the end of the pbls, then need to set the last
+		 * bit, indicating no more pbls to register for this memory key.
+		 */
+		if (cur_pbl_cnt == pending_pbl_cnt)
+			last = 1;
+
+		status = ocrdma_mbx_reg_mr_cont(dev, hwmr, cur_pbl_cnt,
+						pbl_offset, last);
+		if (status)
+			break;
+	}
+	if (status)
+		pr_err("%s() err. status=%d\n", __func__, status);
+
+	return status;
+}
+
+bool ocrdma_is_qp_in_sq_flushlist(struct ocrdma_cq *cq, struct ocrdma_qp *qp)
+{
+	struct ocrdma_qp *tmp;
+	bool found = false;
+	list_for_each_entry(tmp, &cq->sq_head, sq_entry) {
+		if (qp == tmp) {
+			found = true;
+			break;
+		}
+	}
+	return found;
+}
+
+bool ocrdma_is_qp_in_rq_flushlist(struct ocrdma_cq *cq, struct ocrdma_qp *qp)
+{
+	struct ocrdma_qp *tmp;
+	bool found = false;
+	list_for_each_entry(tmp, &cq->rq_head, rq_entry) {
+		if (qp == tmp) {
+			found = true;
+			break;
+		}
+	}
+	return found;
+}
+
+void ocrdma_flush_qp(struct ocrdma_qp *qp)
+{
+	bool found;
+	unsigned long flags;
+
+	spin_lock_irqsave(&qp->dev->flush_q_lock, flags);
+	found = ocrdma_is_qp_in_sq_flushlist(qp->sq_cq, qp);
+	if (!found)
+		list_add_tail(&qp->sq_entry, &qp->sq_cq->sq_head);
+	if (!qp->srq) {
+		found = ocrdma_is_qp_in_rq_flushlist(qp->rq_cq, qp);
+		if (!found)
+			list_add_tail(&qp->rq_entry, &qp->rq_cq->rq_head);
+	}
+	spin_unlock_irqrestore(&qp->dev->flush_q_lock, flags);
+}
+
+static void ocrdma_init_hwq_ptr(struct ocrdma_qp *qp)
+{
+	qp->sq.head = 0;
+	qp->sq.tail = 0;
+	qp->rq.head = 0;
+	qp->rq.tail = 0;
+}
+
+int ocrdma_qp_state_change(struct ocrdma_qp *qp, enum ib_qp_state new_ib_state,
+			   enum ib_qp_state *old_ib_state)
+{
+	unsigned long flags;
+	int status = 0;
+	enum ocrdma_qp_state new_state;
+	new_state = get_ocrdma_qp_state(new_ib_state);
+
+	/* sync with wqe and rqe posting */
+	spin_lock_irqsave(&qp->q_lock, flags);
+
+	if (old_ib_state)
+		*old_ib_state = get_ibqp_state(qp->state);
+	if (new_state == qp->state) {
+		spin_unlock_irqrestore(&qp->q_lock, flags);
+		return 1;
+	}
+
+
+	if (new_state == OCRDMA_QPS_INIT) {
+		ocrdma_init_hwq_ptr(qp);
+		ocrdma_del_flush_qp(qp);
+	} else if (new_state == OCRDMA_QPS_ERR) {
+		ocrdma_flush_qp(qp);
+	}
+
+	qp->state = new_state;
+
+	spin_unlock_irqrestore(&qp->q_lock, flags);
+	return status;
+}
+
+static u32 ocrdma_set_create_qp_mbx_access_flags(struct ocrdma_qp *qp)
+{
+	u32 flags = 0;
+	if (qp->cap_flags & OCRDMA_QP_INB_RD)
+		flags |= OCRDMA_CREATE_QP_REQ_INB_RDEN_MASK;
+	if (qp->cap_flags & OCRDMA_QP_INB_WR)
+		flags |= OCRDMA_CREATE_QP_REQ_INB_WREN_MASK;
+	if (qp->cap_flags & OCRDMA_QP_MW_BIND)
+		flags |= OCRDMA_CREATE_QP_REQ_BIND_MEMWIN_MASK;
+	if (qp->cap_flags & OCRDMA_QP_LKEY0)
+		flags |= OCRDMA_CREATE_QP_REQ_ZERO_LKEYEN_MASK;
+	if (qp->cap_flags & OCRDMA_QP_FAST_REG)
+		flags |= OCRDMA_CREATE_QP_REQ_FMR_EN_MASK;
+	return flags;
+}
+
+static int ocrdma_set_create_qp_sq_cmd(struct ocrdma_create_qp_req *cmd,
+					struct ib_qp_init_attr *attrs,
+					struct ocrdma_qp *qp)
+{
+	int status;
+	u32 len, hw_pages, hw_page_size;
+	dma_addr_t pa;
+	struct ocrdma_dev *dev = qp->dev;
+	struct pci_dev *pdev = dev->nic_info.pdev;
+	u32 max_wqe_allocated;
+	u32 max_sges = attrs->cap.max_send_sge;
+
+	/* QP1 may exceed 127 */
+	max_wqe_allocated = min_t(u32, attrs->cap.max_send_wr + 1,
+				dev->attr.max_wqe);
+
+	status = ocrdma_build_q_conf(&max_wqe_allocated,
+		dev->attr.wqe_size, &hw_pages, &hw_page_size);
+	if (status) {
+		pr_err("%s() req. max_send_wr=0x%x\n", __func__,
+		       max_wqe_allocated);
+		return -EINVAL;
+	}
+	qp->sq.max_cnt = max_wqe_allocated;
+	len = (hw_pages * hw_page_size);
+
+	qp->sq.va = dma_alloc_coherent(&pdev->dev, len, &pa, GFP_KERNEL);
+	if (!qp->sq.va)
+		return -EINVAL;
+	memset(qp->sq.va, 0, len);
+	qp->sq.len = len;
+	qp->sq.pa = pa;
+	qp->sq.entry_size = dev->attr.wqe_size;
+	ocrdma_build_q_pages(&cmd->wq_addr[0], hw_pages, pa, hw_page_size);
+
+	cmd->type_pgsz_pdn |= (ilog2(hw_page_size / OCRDMA_MIN_Q_PAGE_SIZE)
+				<< OCRDMA_CREATE_QP_REQ_SQ_PAGE_SIZE_SHIFT);
+	cmd->num_wq_rq_pages |= (hw_pages <<
+				 OCRDMA_CREATE_QP_REQ_NUM_WQ_PAGES_SHIFT) &
+	    OCRDMA_CREATE_QP_REQ_NUM_WQ_PAGES_MASK;
+	cmd->max_sge_send_write |= (max_sges <<
+				    OCRDMA_CREATE_QP_REQ_MAX_SGE_SEND_SHIFT) &
+	    OCRDMA_CREATE_QP_REQ_MAX_SGE_SEND_MASK;
+	cmd->max_sge_send_write |= (max_sges <<
+				    OCRDMA_CREATE_QP_REQ_MAX_SGE_WRITE_SHIFT) &
+					OCRDMA_CREATE_QP_REQ_MAX_SGE_WRITE_MASK;
+	cmd->max_wqe_rqe |= (ilog2(qp->sq.max_cnt) <<
+			     OCRDMA_CREATE_QP_REQ_MAX_WQE_SHIFT) &
+				OCRDMA_CREATE_QP_REQ_MAX_WQE_MASK;
+	cmd->wqe_rqe_size |= (dev->attr.wqe_size <<
+			      OCRDMA_CREATE_QP_REQ_WQE_SIZE_SHIFT) &
+				OCRDMA_CREATE_QP_REQ_WQE_SIZE_MASK;
+	return 0;
+}
+
+static int ocrdma_set_create_qp_rq_cmd(struct ocrdma_create_qp_req *cmd,
+					struct ib_qp_init_attr *attrs,
+					struct ocrdma_qp *qp)
+{
+	int status;
+	u32 len, hw_pages, hw_page_size;
+	dma_addr_t pa = 0;
+	struct ocrdma_dev *dev = qp->dev;
+	struct pci_dev *pdev = dev->nic_info.pdev;
+	u32 max_rqe_allocated = attrs->cap.max_recv_wr + 1;
+
+	status = ocrdma_build_q_conf(&max_rqe_allocated, dev->attr.rqe_size,
+				     &hw_pages, &hw_page_size);
+	if (status) {
+		pr_err("%s() req. max_recv_wr=0x%x\n", __func__,
+		       attrs->cap.max_recv_wr + 1);
+		return status;
+	}
+	qp->rq.max_cnt = max_rqe_allocated;
+	len = (hw_pages * hw_page_size);
+
+	qp->rq.va = dma_alloc_coherent(&pdev->dev, len, &pa, GFP_KERNEL);
+	if (!qp->rq.va)
+		return -ENOMEM;
+	memset(qp->rq.va, 0, len);
+	qp->rq.pa = pa;
+	qp->rq.len = len;
+	qp->rq.entry_size = dev->attr.rqe_size;
+
+	ocrdma_build_q_pages(&cmd->rq_addr[0], hw_pages, pa, hw_page_size);
+	cmd->type_pgsz_pdn |= (ilog2(hw_page_size / OCRDMA_MIN_Q_PAGE_SIZE) <<
+		OCRDMA_CREATE_QP_REQ_RQ_PAGE_SIZE_SHIFT);
+	cmd->num_wq_rq_pages |=
+	    (hw_pages << OCRDMA_CREATE_QP_REQ_NUM_RQ_PAGES_SHIFT) &
+	    OCRDMA_CREATE_QP_REQ_NUM_RQ_PAGES_MASK;
+	cmd->max_sge_recv_flags |= (attrs->cap.max_recv_sge <<
+				OCRDMA_CREATE_QP_REQ_MAX_SGE_RECV_SHIFT) &
+				OCRDMA_CREATE_QP_REQ_MAX_SGE_RECV_MASK;
+	cmd->max_wqe_rqe |= (ilog2(qp->rq.max_cnt) <<
+				OCRDMA_CREATE_QP_REQ_MAX_RQE_SHIFT) &
+				OCRDMA_CREATE_QP_REQ_MAX_RQE_MASK;
+	cmd->wqe_rqe_size |= (dev->attr.rqe_size <<
+			OCRDMA_CREATE_QP_REQ_RQE_SIZE_SHIFT) &
+			OCRDMA_CREATE_QP_REQ_RQE_SIZE_MASK;
+	return 0;
+}
+
+static void ocrdma_set_create_qp_dpp_cmd(struct ocrdma_create_qp_req *cmd,
+					 struct ocrdma_pd *pd,
+					 struct ocrdma_qp *qp,
+					 u8 enable_dpp_cq, u16 dpp_cq_id)
+{
+	pd->num_dpp_qp--;
+	qp->dpp_enabled = true;
+	cmd->max_sge_recv_flags |= OCRDMA_CREATE_QP_REQ_ENABLE_DPP_MASK;
+	if (!enable_dpp_cq)
+		return;
+	cmd->max_sge_recv_flags |= OCRDMA_CREATE_QP_REQ_ENABLE_DPP_MASK;
+	cmd->dpp_credits_cqid = dpp_cq_id;
+	cmd->dpp_credits_cqid |= OCRDMA_CREATE_QP_REQ_DPP_CREDIT_LIMIT <<
+					OCRDMA_CREATE_QP_REQ_DPP_CREDIT_SHIFT;
+}
+
+static int ocrdma_set_create_qp_ird_cmd(struct ocrdma_create_qp_req *cmd,
+					struct ocrdma_qp *qp)
+{
+	struct ocrdma_dev *dev = qp->dev;
+	struct pci_dev *pdev = dev->nic_info.pdev;
+	dma_addr_t pa = 0;
+	int ird_page_size = dev->attr.ird_page_size;
+	int ird_q_len = dev->attr.num_ird_pages * ird_page_size;
+	struct ocrdma_hdr_wqe *rqe;
+	int i  = 0;
+
+	if (dev->attr.ird == 0)
+		return 0;
+
+	qp->ird_q_va = dma_alloc_coherent(&pdev->dev, ird_q_len,
+					&pa, GFP_KERNEL);
+	if (!qp->ird_q_va)
+		return -ENOMEM;
+	memset(qp->ird_q_va, 0, ird_q_len);
+	ocrdma_build_q_pages(&cmd->ird_addr[0], dev->attr.num_ird_pages,
+			     pa, ird_page_size);
+	for (; i < ird_q_len / dev->attr.rqe_size; i++) {
+		rqe = (struct ocrdma_hdr_wqe *)(qp->ird_q_va +
+			(i * dev->attr.rqe_size));
+		rqe->cw = 0;
+		rqe->cw |= 2;
+		rqe->cw |= (OCRDMA_TYPE_LKEY << OCRDMA_WQE_TYPE_SHIFT);
+		rqe->cw |= (8 << OCRDMA_WQE_SIZE_SHIFT);
+		rqe->cw |= (8 << OCRDMA_WQE_NXT_WQE_SIZE_SHIFT);
+	}
+	return 0;
+}
+
+static void ocrdma_get_create_qp_rsp(struct ocrdma_create_qp_rsp *rsp,
+				     struct ocrdma_qp *qp,
+				     struct ib_qp_init_attr *attrs,
+				     u16 *dpp_offset, u16 *dpp_credit_lmt)
+{
+	u32 max_wqe_allocated, max_rqe_allocated;
+	qp->id = rsp->qp_id & OCRDMA_CREATE_QP_RSP_QP_ID_MASK;
+	qp->rq.dbid = rsp->sq_rq_id & OCRDMA_CREATE_QP_RSP_RQ_ID_MASK;
+	qp->sq.dbid = rsp->sq_rq_id >> OCRDMA_CREATE_QP_RSP_SQ_ID_SHIFT;
+	qp->max_ird = rsp->max_ord_ird & OCRDMA_CREATE_QP_RSP_MAX_IRD_MASK;
+	qp->max_ord = (rsp->max_ord_ird >> OCRDMA_CREATE_QP_RSP_MAX_ORD_SHIFT);
+	qp->dpp_enabled = false;
+	if (rsp->dpp_response & OCRDMA_CREATE_QP_RSP_DPP_ENABLED_MASK) {
+		qp->dpp_enabled = true;
+		*dpp_credit_lmt = (rsp->dpp_response &
+				OCRDMA_CREATE_QP_RSP_DPP_CREDITS_MASK) >>
+				OCRDMA_CREATE_QP_RSP_DPP_CREDITS_SHIFT;
+		*dpp_offset = (rsp->dpp_response &
+				OCRDMA_CREATE_QP_RSP_DPP_PAGE_OFFSET_MASK) >>
+				OCRDMA_CREATE_QP_RSP_DPP_PAGE_OFFSET_SHIFT;
+	}
+	max_wqe_allocated =
+		rsp->max_wqe_rqe >> OCRDMA_CREATE_QP_RSP_MAX_WQE_SHIFT;
+	max_wqe_allocated = 1 << max_wqe_allocated;
+	max_rqe_allocated = 1 << ((u16)rsp->max_wqe_rqe);
+
+	qp->sq.max_cnt = max_wqe_allocated;
+	qp->sq.max_wqe_idx = max_wqe_allocated - 1;
+
+	if (!attrs->srq) {
+		qp->rq.max_cnt = max_rqe_allocated;
+		qp->rq.max_wqe_idx = max_rqe_allocated - 1;
+	}
+}
+
+int ocrdma_mbx_create_qp(struct ocrdma_qp *qp, struct ib_qp_init_attr *attrs,
+			 u8 enable_dpp_cq, u16 dpp_cq_id, u16 *dpp_offset,
+			 u16 *dpp_credit_lmt)
+{
+	int status = -ENOMEM;
+	u32 flags = 0;
+	struct ocrdma_dev *dev = qp->dev;
+	struct ocrdma_pd *pd = qp->pd;
+	struct pci_dev *pdev = dev->nic_info.pdev;
+	struct ocrdma_cq *cq;
+	struct ocrdma_create_qp_req *cmd;
+	struct ocrdma_create_qp_rsp *rsp;
+	int qptype;
+
+	switch (attrs->qp_type) {
+	case IB_QPT_GSI:
+		qptype = OCRDMA_QPT_GSI;
+		break;
+	case IB_QPT_RC:
+		qptype = OCRDMA_QPT_RC;
+		break;
+	case IB_QPT_UD:
+		qptype = OCRDMA_QPT_UD;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_CREATE_QP, sizeof(*cmd));
+	if (!cmd)
+		return status;
+	cmd->type_pgsz_pdn |= (qptype << OCRDMA_CREATE_QP_REQ_QPT_SHIFT) &
+						OCRDMA_CREATE_QP_REQ_QPT_MASK;
+	status = ocrdma_set_create_qp_sq_cmd(cmd, attrs, qp);
+	if (status)
+		goto sq_err;
+
+	if (attrs->srq) {
+		struct ocrdma_srq *srq = get_ocrdma_srq(attrs->srq);
+		cmd->max_sge_recv_flags |= OCRDMA_CREATE_QP_REQ_USE_SRQ_MASK;
+		cmd->rq_addr[0].lo = srq->id;
+		qp->srq = srq;
+	} else {
+		status = ocrdma_set_create_qp_rq_cmd(cmd, attrs, qp);
+		if (status)
+			goto rq_err;
+	}
+
+	status = ocrdma_set_create_qp_ird_cmd(cmd, qp);
+	if (status)
+		goto mbx_err;
+
+	cmd->type_pgsz_pdn |= (pd->id << OCRDMA_CREATE_QP_REQ_PD_ID_SHIFT) &
+				OCRDMA_CREATE_QP_REQ_PD_ID_MASK;
+
+	flags = ocrdma_set_create_qp_mbx_access_flags(qp);
+
+	cmd->max_sge_recv_flags |= flags;
+	cmd->max_ord_ird |= (dev->attr.max_ord_per_qp <<
+			     OCRDMA_CREATE_QP_REQ_MAX_ORD_SHIFT) &
+				OCRDMA_CREATE_QP_REQ_MAX_ORD_MASK;
+	cmd->max_ord_ird |= (dev->attr.max_ird_per_qp <<
+			     OCRDMA_CREATE_QP_REQ_MAX_IRD_SHIFT) &
+				OCRDMA_CREATE_QP_REQ_MAX_IRD_MASK;
+	cq = get_ocrdma_cq(attrs->send_cq);
+	cmd->wq_rq_cqid |= (cq->id << OCRDMA_CREATE_QP_REQ_WQ_CQID_SHIFT) &
+				OCRDMA_CREATE_QP_REQ_WQ_CQID_MASK;
+	qp->sq_cq = cq;
+	cq = get_ocrdma_cq(attrs->recv_cq);
+	cmd->wq_rq_cqid |= (cq->id << OCRDMA_CREATE_QP_REQ_RQ_CQID_SHIFT) &
+				OCRDMA_CREATE_QP_REQ_RQ_CQID_MASK;
+	qp->rq_cq = cq;
+
+	if (pd->dpp_enabled && attrs->cap.max_inline_data && pd->num_dpp_qp &&
+	    (attrs->cap.max_inline_data <= dev->attr.max_inline_data)) {
+		ocrdma_set_create_qp_dpp_cmd(cmd, pd, qp, enable_dpp_cq,
+					     dpp_cq_id);
+	}
+
+	status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd);
+	if (status)
+		goto mbx_err;
+	rsp = (struct ocrdma_create_qp_rsp *)cmd;
+	ocrdma_get_create_qp_rsp(rsp, qp, attrs, dpp_offset, dpp_credit_lmt);
+	qp->state = OCRDMA_QPS_RST;
+	kfree(cmd);
+	return 0;
+mbx_err:
+	if (qp->rq.va)
+		dma_free_coherent(&pdev->dev, qp->rq.len, qp->rq.va, qp->rq.pa);
+rq_err:
+	pr_err("%s(%d) rq_err\n", __func__, dev->id);
+	dma_free_coherent(&pdev->dev, qp->sq.len, qp->sq.va, qp->sq.pa);
+sq_err:
+	pr_err("%s(%d) sq_err\n", __func__, dev->id);
+	kfree(cmd);
+	return status;
+}
+
+int ocrdma_mbx_query_qp(struct ocrdma_dev *dev, struct ocrdma_qp *qp,
+			struct ocrdma_qp_params *param)
+{
+	int status = -ENOMEM;
+	struct ocrdma_query_qp *cmd;
+	struct ocrdma_query_qp_rsp *rsp;
+
+	cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_QUERY_QP, sizeof(*cmd));
+	if (!cmd)
+		return status;
+	cmd->qp_id = qp->id;
+	status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd);
+	if (status)
+		goto mbx_err;
+	rsp = (struct ocrdma_query_qp_rsp *)cmd;
+	memcpy(param, &rsp->params, sizeof(struct ocrdma_qp_params));
+mbx_err:
+	kfree(cmd);
+	return status;
+}
+
+static int ocrdma_set_av_params(struct ocrdma_qp *qp,
+				struct ocrdma_modify_qp *cmd,
+				struct ib_qp_attr *attrs,
+				int attr_mask)
+{
+	int status;
+	struct ib_ah_attr *ah_attr = &attrs->ah_attr;
+	union ib_gid sgid, zgid;
+	u32 vlan_id;
+	u8 mac_addr[6];
+
+	if ((ah_attr->ah_flags & IB_AH_GRH) == 0)
+		return -EINVAL;
+	if (atomic_cmpxchg(&qp->dev->update_sl, 1, 0))
+		ocrdma_init_service_level(qp->dev);
+	cmd->params.tclass_sq_psn |=
+	    (ah_attr->grh.traffic_class << OCRDMA_QP_PARAMS_TCLASS_SHIFT);
+	cmd->params.rnt_rc_sl_fl |=
+	    (ah_attr->grh.flow_label & OCRDMA_QP_PARAMS_FLOW_LABEL_MASK);
+	cmd->params.rnt_rc_sl_fl |= (ah_attr->sl << OCRDMA_QP_PARAMS_SL_SHIFT);
+	cmd->params.hop_lmt_rq_psn |=
+	    (ah_attr->grh.hop_limit << OCRDMA_QP_PARAMS_HOP_LMT_SHIFT);
+	cmd->flags |= OCRDMA_QP_PARA_FLOW_LBL_VALID;
+	memcpy(&cmd->params.dgid[0], &ah_attr->grh.dgid.raw[0],
+	       sizeof(cmd->params.dgid));
+	status = ocrdma_query_gid(&qp->dev->ibdev, 1,
+			ah_attr->grh.sgid_index, &sgid);
+	if (status)
+		return status;
+
+	memset(&zgid, 0, sizeof(zgid));
+	if (!memcmp(&sgid, &zgid, sizeof(zgid)))
+		return -EINVAL;
+
+	qp->sgid_idx = ah_attr->grh.sgid_index;
+	memcpy(&cmd->params.sgid[0], &sgid.raw[0], sizeof(cmd->params.sgid));
+	ocrdma_resolve_dmac(qp->dev, ah_attr, &mac_addr[0]);
+	cmd->params.dmac_b0_to_b3 = mac_addr[0] | (mac_addr[1] << 8) |
+				(mac_addr[2] << 16) | (mac_addr[3] << 24);
+	/* convert them to LE format. */
+	ocrdma_cpu_to_le32(&cmd->params.dgid[0], sizeof(cmd->params.dgid));
+	ocrdma_cpu_to_le32(&cmd->params.sgid[0], sizeof(cmd->params.sgid));
+	cmd->params.vlan_dmac_b4_to_b5 = mac_addr[4] | (mac_addr[5] << 8);
+	if (attr_mask & IB_QP_VID) {
+		vlan_id = attrs->vlan_id;
+		cmd->params.vlan_dmac_b4_to_b5 |=
+		    vlan_id << OCRDMA_QP_PARAMS_VLAN_SHIFT;
+		cmd->flags |= OCRDMA_QP_PARA_VLAN_EN_VALID;
+		cmd->params.rnt_rc_sl_fl |=
+			(qp->dev->sl & 0x07) << OCRDMA_QP_PARAMS_SL_SHIFT;
+	}
+	return 0;
+}
+
+static int ocrdma_set_qp_params(struct ocrdma_qp *qp,
+				struct ocrdma_modify_qp *cmd,
+				struct ib_qp_attr *attrs, int attr_mask)
+{
+	int status = 0;
+
+	if (attr_mask & IB_QP_PKEY_INDEX) {
+		cmd->params.path_mtu_pkey_indx |= (attrs->pkey_index &
+					    OCRDMA_QP_PARAMS_PKEY_INDEX_MASK);
+		cmd->flags |= OCRDMA_QP_PARA_PKEY_VALID;
+	}
+	if (attr_mask & IB_QP_QKEY) {
+		qp->qkey = attrs->qkey;
+		cmd->params.qkey = attrs->qkey;
+		cmd->flags |= OCRDMA_QP_PARA_QKEY_VALID;
+	}
+	if (attr_mask & IB_QP_AV) {
+		status = ocrdma_set_av_params(qp, cmd, attrs, attr_mask);
+		if (status)
+			return status;
+	} else if (qp->qp_type == IB_QPT_GSI || qp->qp_type == IB_QPT_UD) {
+		/* set the default mac address for UD, GSI QPs */
+		cmd->params.dmac_b0_to_b3 = qp->dev->nic_info.mac_addr[0] |
+			(qp->dev->nic_info.mac_addr[1] << 8) |
+			(qp->dev->nic_info.mac_addr[2] << 16) |
+			(qp->dev->nic_info.mac_addr[3] << 24);
+		cmd->params.vlan_dmac_b4_to_b5 = qp->dev->nic_info.mac_addr[4] |
+					(qp->dev->nic_info.mac_addr[5] << 8);
+	}
+	if ((attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY) &&
+	    attrs->en_sqd_async_notify) {
+		cmd->params.max_sge_recv_flags |=
+			OCRDMA_QP_PARAMS_FLAGS_SQD_ASYNC;
+		cmd->flags |= OCRDMA_QP_PARA_DST_QPN_VALID;
+	}
+	if (attr_mask & IB_QP_DEST_QPN) {
+		cmd->params.ack_to_rnr_rtc_dest_qpn |= (attrs->dest_qp_num &
+				OCRDMA_QP_PARAMS_DEST_QPN_MASK);
+		cmd->flags |= OCRDMA_QP_PARA_DST_QPN_VALID;
+	}
+	if (attr_mask & IB_QP_PATH_MTU) {
+		if (attrs->path_mtu < IB_MTU_256 ||
+		    attrs->path_mtu > IB_MTU_4096) {
+			status = -EINVAL;
+			goto pmtu_err;
+		}
+		cmd->params.path_mtu_pkey_indx |=
+		    (ib_mtu_enum_to_int(attrs->path_mtu) <<
+		     OCRDMA_QP_PARAMS_PATH_MTU_SHIFT) &
+		    OCRDMA_QP_PARAMS_PATH_MTU_MASK;
+		cmd->flags |= OCRDMA_QP_PARA_PMTU_VALID;
+	}
+	if (attr_mask & IB_QP_TIMEOUT) {
+		cmd->params.ack_to_rnr_rtc_dest_qpn |= attrs->timeout <<
+		    OCRDMA_QP_PARAMS_ACK_TIMEOUT_SHIFT;
+		cmd->flags |= OCRDMA_QP_PARA_ACK_TO_VALID;
+	}
+	if (attr_mask & IB_QP_RETRY_CNT) {
+		cmd->params.rnt_rc_sl_fl |= (attrs->retry_cnt <<
+				      OCRDMA_QP_PARAMS_RETRY_CNT_SHIFT) &
+		    OCRDMA_QP_PARAMS_RETRY_CNT_MASK;
+		cmd->flags |= OCRDMA_QP_PARA_RETRY_CNT_VALID;
+	}
+	if (attr_mask & IB_QP_MIN_RNR_TIMER) {
+		cmd->params.rnt_rc_sl_fl |= (attrs->min_rnr_timer <<
+				      OCRDMA_QP_PARAMS_RNR_NAK_TIMER_SHIFT) &
+		    OCRDMA_QP_PARAMS_RNR_NAK_TIMER_MASK;
+		cmd->flags |= OCRDMA_QP_PARA_RNT_VALID;
+	}
+	if (attr_mask & IB_QP_RNR_RETRY) {
+		cmd->params.ack_to_rnr_rtc_dest_qpn |= (attrs->rnr_retry <<
+			OCRDMA_QP_PARAMS_RNR_RETRY_CNT_SHIFT)
+			& OCRDMA_QP_PARAMS_RNR_RETRY_CNT_MASK;
+		cmd->flags |= OCRDMA_QP_PARA_RRC_VALID;
+	}
+	if (attr_mask & IB_QP_SQ_PSN) {
+		cmd->params.tclass_sq_psn |= (attrs->sq_psn & 0x00ffffff);
+		cmd->flags |= OCRDMA_QP_PARA_SQPSN_VALID;
+	}
+	if (attr_mask & IB_QP_RQ_PSN) {
+		cmd->params.hop_lmt_rq_psn |= (attrs->rq_psn & 0x00ffffff);
+		cmd->flags |= OCRDMA_QP_PARA_RQPSN_VALID;
+	}
+	if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) {
+		if (attrs->max_rd_atomic > qp->dev->attr.max_ord_per_qp) {
+			status = -EINVAL;
+			goto pmtu_err;
+		}
+		qp->max_ord = attrs->max_rd_atomic;
+		cmd->flags |= OCRDMA_QP_PARA_MAX_ORD_VALID;
+	}
+	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) {
+		if (attrs->max_dest_rd_atomic > qp->dev->attr.max_ird_per_qp) {
+			status = -EINVAL;
+			goto pmtu_err;
+		}
+		qp->max_ird = attrs->max_dest_rd_atomic;
+		cmd->flags |= OCRDMA_QP_PARA_MAX_IRD_VALID;
+	}
+	cmd->params.max_ord_ird = (qp->max_ord <<
+				OCRDMA_QP_PARAMS_MAX_ORD_SHIFT) |
+				(qp->max_ird & OCRDMA_QP_PARAMS_MAX_IRD_MASK);
+pmtu_err:
+	return status;
+}
+
+int ocrdma_mbx_modify_qp(struct ocrdma_dev *dev, struct ocrdma_qp *qp,
+			 struct ib_qp_attr *attrs, int attr_mask)
+{
+	int status = -ENOMEM;
+	struct ocrdma_modify_qp *cmd;
+
+	cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_MODIFY_QP, sizeof(*cmd));
+	if (!cmd)
+		return status;
+
+	cmd->params.id = qp->id;
+	cmd->flags = 0;
+	if (attr_mask & IB_QP_STATE) {
+		cmd->params.max_sge_recv_flags |=
+		    (get_ocrdma_qp_state(attrs->qp_state) <<
+		     OCRDMA_QP_PARAMS_STATE_SHIFT) &
+		    OCRDMA_QP_PARAMS_STATE_MASK;
+		cmd->flags |= OCRDMA_QP_PARA_QPS_VALID;
+	} else {
+		cmd->params.max_sge_recv_flags |=
+		    (qp->state << OCRDMA_QP_PARAMS_STATE_SHIFT) &
+		    OCRDMA_QP_PARAMS_STATE_MASK;
+	}
+
+	status = ocrdma_set_qp_params(qp, cmd, attrs, attr_mask);
+	if (status)
+		goto mbx_err;
+	status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd);
+	if (status)
+		goto mbx_err;
+
+mbx_err:
+	kfree(cmd);
+	return status;
+}
+
+int ocrdma_mbx_destroy_qp(struct ocrdma_dev *dev, struct ocrdma_qp *qp)
+{
+	int status = -ENOMEM;
+	struct ocrdma_destroy_qp *cmd;
+	struct pci_dev *pdev = dev->nic_info.pdev;
+
+	cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_DELETE_QP, sizeof(*cmd));
+	if (!cmd)
+		return status;
+	cmd->qp_id = qp->id;
+	status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd);
+	if (status)
+		goto mbx_err;
+
+mbx_err:
+	kfree(cmd);
+	if (qp->sq.va)
+		dma_free_coherent(&pdev->dev, qp->sq.len, qp->sq.va, qp->sq.pa);
+	if (!qp->srq && qp->rq.va)
+		dma_free_coherent(&pdev->dev, qp->rq.len, qp->rq.va, qp->rq.pa);
+	if (qp->dpp_enabled)
+		qp->pd->num_dpp_qp++;
+	return status;
+}
+
+int ocrdma_mbx_create_srq(struct ocrdma_dev *dev, struct ocrdma_srq *srq,
+			  struct ib_srq_init_attr *srq_attr,
+			  struct ocrdma_pd *pd)
+{
+	int status = -ENOMEM;
+	int hw_pages, hw_page_size;
+	int len;
+	struct ocrdma_create_srq_rsp *rsp;
+	struct ocrdma_create_srq *cmd;
+	dma_addr_t pa;
+	struct pci_dev *pdev = dev->nic_info.pdev;
+	u32 max_rqe_allocated;
+
+	cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_CREATE_SRQ, sizeof(*cmd));
+	if (!cmd)
+		return status;
+
+	cmd->pgsz_pdid = pd->id & OCRDMA_CREATE_SRQ_PD_ID_MASK;
+	max_rqe_allocated = srq_attr->attr.max_wr + 1;
+	status = ocrdma_build_q_conf(&max_rqe_allocated,
+				dev->attr.rqe_size,
+				&hw_pages, &hw_page_size);
+	if (status) {
+		pr_err("%s() req. max_wr=0x%x\n", __func__,
+		       srq_attr->attr.max_wr);
+		status = -EINVAL;
+		goto ret;
+	}
+	len = hw_pages * hw_page_size;
+	srq->rq.va = dma_alloc_coherent(&pdev->dev, len, &pa, GFP_KERNEL);
+	if (!srq->rq.va) {
+		status = -ENOMEM;
+		goto ret;
+	}
+	ocrdma_build_q_pages(&cmd->rq_addr[0], hw_pages, pa, hw_page_size);
+
+	srq->rq.entry_size = dev->attr.rqe_size;
+	srq->rq.pa = pa;
+	srq->rq.len = len;
+	srq->rq.max_cnt = max_rqe_allocated;
+
+	cmd->max_sge_rqe = ilog2(max_rqe_allocated);
+	cmd->max_sge_rqe |= srq_attr->attr.max_sge <<
+				OCRDMA_CREATE_SRQ_MAX_SGE_RECV_SHIFT;
+
+	cmd->pgsz_pdid |= (ilog2(hw_page_size / OCRDMA_MIN_Q_PAGE_SIZE)
+		<< OCRDMA_CREATE_SRQ_PG_SZ_SHIFT);
+	cmd->pages_rqe_sz |= (dev->attr.rqe_size
+		<< OCRDMA_CREATE_SRQ_RQE_SIZE_SHIFT)
+		& OCRDMA_CREATE_SRQ_RQE_SIZE_MASK;
+	cmd->pages_rqe_sz |= hw_pages << OCRDMA_CREATE_SRQ_NUM_RQ_PAGES_SHIFT;
+
+	status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd);
+	if (status)
+		goto mbx_err;
+	rsp = (struct ocrdma_create_srq_rsp *)cmd;
+	srq->id = rsp->id;
+	srq->rq.dbid = rsp->id;
+	max_rqe_allocated = ((rsp->max_sge_rqe_allocated &
+		OCRDMA_CREATE_SRQ_RSP_MAX_RQE_ALLOCATED_MASK) >>
+		OCRDMA_CREATE_SRQ_RSP_MAX_RQE_ALLOCATED_SHIFT);
+	max_rqe_allocated = (1 << max_rqe_allocated);
+	srq->rq.max_cnt = max_rqe_allocated;
+	srq->rq.max_wqe_idx = max_rqe_allocated - 1;
+	srq->rq.max_sges = (rsp->max_sge_rqe_allocated &
+		OCRDMA_CREATE_SRQ_RSP_MAX_SGE_RECV_ALLOCATED_MASK) >>
+		OCRDMA_CREATE_SRQ_RSP_MAX_SGE_RECV_ALLOCATED_SHIFT;
+	goto ret;
+mbx_err:
+	dma_free_coherent(&pdev->dev, srq->rq.len, srq->rq.va, pa);
+ret:
+	kfree(cmd);
+	return status;
+}
+
+int ocrdma_mbx_modify_srq(struct ocrdma_srq *srq, struct ib_srq_attr *srq_attr)
+{
+	int status = -ENOMEM;
+	struct ocrdma_modify_srq *cmd;
+	struct ocrdma_pd *pd = srq->pd;
+	struct ocrdma_dev *dev = get_ocrdma_dev(pd->ibpd.device);
+
+	cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_MODIFY_SRQ, sizeof(*cmd));
+	if (!cmd)
+		return status;
+	cmd->id = srq->id;
+	cmd->limit_max_rqe |= srq_attr->srq_limit <<
+	    OCRDMA_MODIFY_SRQ_LIMIT_SHIFT;
+	status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd);
+	kfree(cmd);
+	return status;
+}
+
+int ocrdma_mbx_query_srq(struct ocrdma_srq *srq, struct ib_srq_attr *srq_attr)
+{
+	int status = -ENOMEM;
+	struct ocrdma_query_srq *cmd;
+	struct ocrdma_dev *dev = get_ocrdma_dev(srq->ibsrq.device);
+
+	cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_QUERY_SRQ, sizeof(*cmd));
+	if (!cmd)
+		return status;
+	cmd->id = srq->rq.dbid;
+	status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd);
+	if (status == 0) {
+		struct ocrdma_query_srq_rsp *rsp =
+		    (struct ocrdma_query_srq_rsp *)cmd;
+		srq_attr->max_sge =
+		    rsp->srq_lmt_max_sge &
+		    OCRDMA_QUERY_SRQ_RSP_MAX_SGE_RECV_MASK;
+		srq_attr->max_wr =
+		    rsp->max_rqe_pdid >> OCRDMA_QUERY_SRQ_RSP_MAX_RQE_SHIFT;
+		srq_attr->srq_limit = rsp->srq_lmt_max_sge >>
+		    OCRDMA_QUERY_SRQ_RSP_SRQ_LIMIT_SHIFT;
+	}
+	kfree(cmd);
+	return status;
+}
+
+int ocrdma_mbx_destroy_srq(struct ocrdma_dev *dev, struct ocrdma_srq *srq)
+{
+	int status = -ENOMEM;
+	struct ocrdma_destroy_srq *cmd;
+	struct pci_dev *pdev = dev->nic_info.pdev;
+	cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_DELETE_SRQ, sizeof(*cmd));
+	if (!cmd)
+		return status;
+	cmd->id = srq->id;
+	status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd);
+	if (srq->rq.va)
+		dma_free_coherent(&pdev->dev, srq->rq.len,
+				  srq->rq.va, srq->rq.pa);
+	kfree(cmd);
+	return status;
+}
+
+static int ocrdma_mbx_get_dcbx_config(struct ocrdma_dev *dev, u32 ptype,
+				      struct ocrdma_dcbx_cfg *dcbxcfg)
+{
+	int status = 0;
+	dma_addr_t pa;
+	struct ocrdma_mqe cmd;
+
+	struct ocrdma_get_dcbx_cfg_req *req = NULL;
+	struct ocrdma_get_dcbx_cfg_rsp *rsp = NULL;
+	struct pci_dev *pdev = dev->nic_info.pdev;
+	struct ocrdma_mqe_sge *mqe_sge = cmd.u.nonemb_req.sge;
+
+	memset(&cmd, 0, sizeof(struct ocrdma_mqe));
+	cmd.hdr.pyld_len = max_t (u32, sizeof(struct ocrdma_get_dcbx_cfg_rsp),
+					sizeof(struct ocrdma_get_dcbx_cfg_req));
+	req = dma_alloc_coherent(&pdev->dev, cmd.hdr.pyld_len, &pa, GFP_KERNEL);
+	if (!req) {
+		status = -ENOMEM;
+		goto mem_err;
+	}
+
+	cmd.hdr.spcl_sge_cnt_emb |= (1 << OCRDMA_MQE_HDR_SGE_CNT_SHIFT) &
+					OCRDMA_MQE_HDR_SGE_CNT_MASK;
+	mqe_sge->pa_lo = (u32) (pa & 0xFFFFFFFFUL);
+	mqe_sge->pa_hi = (u32) upper_32_bits(pa);
+	mqe_sge->len = cmd.hdr.pyld_len;
+
+	memset(req, 0, sizeof(struct ocrdma_get_dcbx_cfg_req));
+	ocrdma_init_mch(&req->hdr, OCRDMA_CMD_GET_DCBX_CONFIG,
+			OCRDMA_SUBSYS_DCBX, cmd.hdr.pyld_len);
+	req->param_type = ptype;
+
+	status = ocrdma_mbx_cmd(dev, &cmd);
+	if (status)
+		goto mbx_err;
+
+	rsp = (struct ocrdma_get_dcbx_cfg_rsp *)req;
+	ocrdma_le32_to_cpu(rsp, sizeof(struct ocrdma_get_dcbx_cfg_rsp));
+	memcpy(dcbxcfg, &rsp->cfg, sizeof(struct ocrdma_dcbx_cfg));
+
+mbx_err:
+	dma_free_coherent(&pdev->dev, cmd.hdr.pyld_len, req, pa);
+mem_err:
+	return status;
+}
+
+#define OCRDMA_MAX_SERVICE_LEVEL_INDEX	0x08
+#define OCRDMA_DEFAULT_SERVICE_LEVEL	0x05
+
+static int ocrdma_parse_dcbxcfg_rsp(struct ocrdma_dev *dev, int ptype,
+				    struct ocrdma_dcbx_cfg *dcbxcfg,
+				    u8 *srvc_lvl)
+{
+	int status = -EINVAL, indx, slindx;
+	int ventry_cnt;
+	struct ocrdma_app_parameter *app_param;
+	u8 valid, proto_sel;
+	u8 app_prio, pfc_prio;
+	u16 proto;
+
+	if (!(dcbxcfg->tcv_aev_opv_st & OCRDMA_DCBX_STATE_MASK)) {
+		pr_info("%s ocrdma%d DCBX is disabled\n",
+			dev_name(&dev->nic_info.pdev->dev), dev->id);
+		goto out;
+	}
+
+	if (!ocrdma_is_enabled_and_synced(dcbxcfg->pfc_state)) {
+		pr_info("%s ocrdma%d priority flow control(%s) is %s%s\n",
+			dev_name(&dev->nic_info.pdev->dev), dev->id,
+			(ptype > 0 ? "operational" : "admin"),
+			(dcbxcfg->pfc_state & OCRDMA_STATE_FLAG_ENABLED) ?
+			"enabled" : "disabled",
+			(dcbxcfg->pfc_state & OCRDMA_STATE_FLAG_SYNC) ?
+			"" : ", not sync'ed");
+		goto out;
+	} else {
+		pr_info("%s ocrdma%d priority flow control is enabled and sync'ed\n",
+			dev_name(&dev->nic_info.pdev->dev), dev->id);
+	}
+
+	ventry_cnt = (dcbxcfg->tcv_aev_opv_st >>
+				OCRDMA_DCBX_APP_ENTRY_SHIFT)
+				& OCRDMA_DCBX_STATE_MASK;
+
+	for (indx = 0; indx < ventry_cnt; indx++) {
+		app_param = &dcbxcfg->app_param[indx];
+		valid = (app_param->valid_proto_app >>
+				OCRDMA_APP_PARAM_VALID_SHIFT)
+				& OCRDMA_APP_PARAM_VALID_MASK;
+		proto_sel = (app_param->valid_proto_app
+				>>  OCRDMA_APP_PARAM_PROTO_SEL_SHIFT)
+				& OCRDMA_APP_PARAM_PROTO_SEL_MASK;
+		proto = app_param->valid_proto_app &
+				OCRDMA_APP_PARAM_APP_PROTO_MASK;
+
+		if (
+			valid && proto == OCRDMA_APP_PROTO_ROCE &&
+			proto_sel == OCRDMA_PROTO_SELECT_L2) {
+			for (slindx = 0; slindx <
+				OCRDMA_MAX_SERVICE_LEVEL_INDEX; slindx++) {
+				app_prio = ocrdma_get_app_prio(
+						(u8 *)app_param->app_prio,
+						slindx);
+				pfc_prio = ocrdma_get_pfc_prio(
+						(u8 *)dcbxcfg->pfc_prio,
+						slindx);
+
+				if (app_prio && pfc_prio) {
+					*srvc_lvl = slindx;
+					status = 0;
+					goto out;
+				}
+			}
+			if (slindx == OCRDMA_MAX_SERVICE_LEVEL_INDEX) {
+				pr_info("%s ocrdma%d application priority not set for 0x%x protocol\n",
+					dev_name(&dev->nic_info.pdev->dev),
+					dev->id, proto);
+			}
+		}
+	}
+
+out:
+	return status;
+}
+
+void ocrdma_init_service_level(struct ocrdma_dev *dev)
+{
+	int status = 0, indx;
+	struct ocrdma_dcbx_cfg dcbxcfg;
+	u8 srvc_lvl = OCRDMA_DEFAULT_SERVICE_LEVEL;
+	int ptype = OCRDMA_PARAMETER_TYPE_OPER;
+
+	for (indx = 0; indx < 2; indx++) {
+		status = ocrdma_mbx_get_dcbx_config(dev, ptype, &dcbxcfg);
+		if (status) {
+			pr_err("%s(): status=%d\n", __func__, status);
+			ptype = OCRDMA_PARAMETER_TYPE_ADMIN;
+			continue;
+		}
+
+		status = ocrdma_parse_dcbxcfg_rsp(dev, ptype,
+						  &dcbxcfg, &srvc_lvl);
+		if (status) {
+			ptype = OCRDMA_PARAMETER_TYPE_ADMIN;
+			continue;
+		}
+
+		break;
+	}
+
+	if (status)
+		pr_info("%s ocrdma%d service level default\n",
+			dev_name(&dev->nic_info.pdev->dev), dev->id);
+	else
+		pr_info("%s ocrdma%d service level %d\n",
+			dev_name(&dev->nic_info.pdev->dev), dev->id,
+			srvc_lvl);
+
+	dev->pfc_state = ocrdma_is_enabled_and_synced(dcbxcfg.pfc_state);
+	dev->sl = srvc_lvl;
+}
+
+int ocrdma_alloc_av(struct ocrdma_dev *dev, struct ocrdma_ah *ah)
+{
+	int i;
+	int status = -EINVAL;
+	struct ocrdma_av *av;
+	unsigned long flags;
+
+	av = dev->av_tbl.va;
+	spin_lock_irqsave(&dev->av_tbl.lock, flags);
+	for (i = 0; i < dev->av_tbl.num_ah; i++) {
+		if (av->valid == 0) {
+			av->valid = OCRDMA_AV_VALID;
+			ah->av = av;
+			ah->id = i;
+			status = 0;
+			break;
+		}
+		av++;
+	}
+	if (i == dev->av_tbl.num_ah)
+		status = -EAGAIN;
+	spin_unlock_irqrestore(&dev->av_tbl.lock, flags);
+	return status;
+}
+
+int ocrdma_free_av(struct ocrdma_dev *dev, struct ocrdma_ah *ah)
+{
+	unsigned long flags;
+	spin_lock_irqsave(&dev->av_tbl.lock, flags);
+	ah->av->valid = 0;
+	spin_unlock_irqrestore(&dev->av_tbl.lock, flags);
+	return 0;
+}
+
+static int ocrdma_create_eqs(struct ocrdma_dev *dev)
+{
+	int num_eq, i, status = 0;
+	int irq;
+	unsigned long flags = 0;
+
+	num_eq = dev->nic_info.msix.num_vectors -
+			dev->nic_info.msix.start_vector;
+	if (dev->nic_info.intr_mode == BE_INTERRUPT_MODE_INTX) {
+		num_eq = 1;
+		flags = IRQF_SHARED;
+	} else {
+		num_eq = min_t(u32, num_eq, num_online_cpus());
+	}
+
+	if (!num_eq)
+		return -EINVAL;
+
+	dev->eq_tbl = kzalloc(sizeof(struct ocrdma_eq) * num_eq, GFP_KERNEL);
+	if (!dev->eq_tbl)
+		return -ENOMEM;
+
+	for (i = 0; i < num_eq; i++) {
+		status = ocrdma_create_eq(dev, &dev->eq_tbl[i],
+					OCRDMA_EQ_LEN);
+		if (status) {
+			status = -EINVAL;
+			break;
+		}
+		sprintf(dev->eq_tbl[i].irq_name, "ocrdma%d-%d",
+			dev->id, i);
+		irq = ocrdma_get_irq(dev, &dev->eq_tbl[i]);
+		status = request_irq(irq, ocrdma_irq_handler, flags,
+				     dev->eq_tbl[i].irq_name,
+				     &dev->eq_tbl[i]);
+		if (status)
+			goto done;
+		dev->eq_cnt += 1;
+	}
+	/* one eq is sufficient for data path to work */
+	return 0;
+done:
+	ocrdma_destroy_eqs(dev);
+	return status;
+}
+
+int ocrdma_init_hw(struct ocrdma_dev *dev)
+{
+	int status;
+
+	/* create the eqs  */
+	status = ocrdma_create_eqs(dev);
+	if (status)
+		goto qpeq_err;
+	status = ocrdma_create_mq(dev);
+	if (status)
+		goto mq_err;
+	status = ocrdma_mbx_query_fw_config(dev);
+	if (status)
+		goto conf_err;
+	status = ocrdma_mbx_query_dev(dev);
+	if (status)
+		goto conf_err;
+	status = ocrdma_mbx_query_fw_ver(dev);
+	if (status)
+		goto conf_err;
+	status = ocrdma_mbx_create_ah_tbl(dev);
+	if (status)
+		goto conf_err;
+	status = ocrdma_mbx_get_phy_info(dev);
+	if (status)
+		goto info_attrb_err;
+	status = ocrdma_mbx_get_ctrl_attribs(dev);
+	if (status)
+		goto info_attrb_err;
+
+	return 0;
+
+info_attrb_err:
+	ocrdma_mbx_delete_ah_tbl(dev);
+conf_err:
+	ocrdma_destroy_mq(dev);
+mq_err:
+	ocrdma_destroy_eqs(dev);
+qpeq_err:
+	pr_err("%s() status=%d\n", __func__, status);
+	return status;
+}
+
+void ocrdma_cleanup_hw(struct ocrdma_dev *dev)
+{
+	ocrdma_mbx_delete_ah_tbl(dev);
+
+	/* cleanup the eqs */
+	ocrdma_destroy_eqs(dev);
+
+	/* cleanup the control path */
+	ocrdma_destroy_mq(dev);
+}
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_hw.h b/drivers/infiniband/hw/ocrdma/ocrdma_hw.h
new file mode 100644
index 0000000..6eed8f1
--- /dev/null
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_hw.h
@@ -0,0 +1,140 @@
+/*******************************************************************
+ * This file is part of the Emulex RoCE Device Driver for          *
+ * RoCE (RDMA over Converged Ethernet) CNA Adapters.              *
+ * Copyright (C) 2008-2012 Emulex. All rights reserved.            *
+ * EMULEX and SLI are trademarks of Emulex.                        *
+ * www.emulex.com                                                  *
+ *                                                                 *
+ * This program is free software; you can redistribute it and/or   *
+ * modify it under the terms of version 2 of the GNU General       *
+ * Public License as published by the Free Software Foundation.    *
+ * This program is distributed in the hope that it will be useful. *
+ * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND          *
+ * WARRANTIES, INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY,  *
+ * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE      *
+ * DISCLAIMED, EXCEPT TO THE EXTENT THAT SUCH DISCLAIMERS ARE HELD *
+ * TO BE LEGALLY INVALID.  See the GNU General Public License for  *
+ * more details, a copy of which can be found in the file COPYING  *
+ * included with this package.                                     *
+ *
+ * Contact Information:
+ * linux-drivers@emulex.com
+ *
+ * Emulex
+ * 3333 Susan Street
+ * Costa Mesa, CA 92626
+ *******************************************************************/
+
+#ifndef __OCRDMA_HW_H__
+#define __OCRDMA_HW_H__
+
+#include "ocrdma_sli.h"
+
+static inline void ocrdma_cpu_to_le32(void *dst, u32 len)
+{
+#ifdef __BIG_ENDIAN
+	int i = 0;
+	u32 *src_ptr = dst;
+	u32 *dst_ptr = dst;
+	for (; i < (len / 4); i++)
+		*(dst_ptr + i) = cpu_to_le32p(src_ptr + i);
+#endif
+}
+
+static inline void ocrdma_le32_to_cpu(void *dst, u32 len)
+{
+#ifdef __BIG_ENDIAN
+	int i = 0;
+	u32 *src_ptr = dst;
+	u32 *dst_ptr = dst;
+	for (; i < (len / sizeof(u32)); i++)
+		*(dst_ptr + i) = le32_to_cpu(*(src_ptr + i));
+#endif
+}
+
+static inline void ocrdma_copy_cpu_to_le32(void *dst, void *src, u32 len)
+{
+#ifdef __BIG_ENDIAN
+	int i = 0;
+	u32 *src_ptr = src;
+	u32 *dst_ptr = dst;
+	for (; i < (len / sizeof(u32)); i++)
+		*(dst_ptr + i) = cpu_to_le32p(src_ptr + i);
+#else
+	memcpy(dst, src, len);
+#endif
+}
+
+static inline void ocrdma_copy_le32_to_cpu(void *dst, void *src, u32 len)
+{
+#ifdef __BIG_ENDIAN
+	int i = 0;
+	u32 *src_ptr = src;
+	u32 *dst_ptr = dst;
+	for (; i < len / sizeof(u32); i++)
+		*(dst_ptr + i) = le32_to_cpu(*(src_ptr + i));
+#else
+	memcpy(dst, src, len);
+#endif
+}
+
+static inline u64 ocrdma_get_db_addr(struct ocrdma_dev *dev, u32 pdid)
+{
+	return dev->nic_info.unmapped_db + (pdid * dev->nic_info.db_page_size);
+}
+
+int ocrdma_init_hw(struct ocrdma_dev *);
+void ocrdma_cleanup_hw(struct ocrdma_dev *);
+
+enum ib_qp_state get_ibqp_state(enum ocrdma_qp_state qps);
+void ocrdma_ring_cq_db(struct ocrdma_dev *, u16 cq_id, bool armed,
+		       bool solicited, u16 cqe_popped);
+
+/* verbs specific mailbox commands */
+int ocrdma_mbx_get_link_speed(struct ocrdma_dev *dev, u8 *lnk_speed);
+int ocrdma_query_config(struct ocrdma_dev *,
+			struct ocrdma_mbx_query_config *config);
+
+int ocrdma_mbx_alloc_pd(struct ocrdma_dev *, struct ocrdma_pd *);
+int ocrdma_mbx_dealloc_pd(struct ocrdma_dev *, struct ocrdma_pd *);
+
+int ocrdma_mbx_alloc_lkey(struct ocrdma_dev *, struct ocrdma_hw_mr *hwmr,
+			  u32 pd_id, int addr_check);
+int ocrdma_mbx_dealloc_lkey(struct ocrdma_dev *, int fmr, u32 lkey);
+
+int ocrdma_reg_mr(struct ocrdma_dev *, struct ocrdma_hw_mr *hwmr,
+			u32 pd_id, int acc);
+int ocrdma_mbx_create_cq(struct ocrdma_dev *, struct ocrdma_cq *,
+				int entries, int dpp_cq, u16 pd_id);
+int ocrdma_mbx_destroy_cq(struct ocrdma_dev *, struct ocrdma_cq *);
+
+int ocrdma_mbx_create_qp(struct ocrdma_qp *, struct ib_qp_init_attr *attrs,
+			 u8 enable_dpp_cq, u16 dpp_cq_id, u16 *dpp_offset,
+			 u16 *dpp_credit_lmt);
+int ocrdma_mbx_modify_qp(struct ocrdma_dev *, struct ocrdma_qp *,
+			 struct ib_qp_attr *attrs, int attr_mask);
+int ocrdma_mbx_query_qp(struct ocrdma_dev *, struct ocrdma_qp *,
+			struct ocrdma_qp_params *param);
+int ocrdma_mbx_destroy_qp(struct ocrdma_dev *, struct ocrdma_qp *);
+int ocrdma_mbx_create_srq(struct ocrdma_dev *, struct ocrdma_srq *,
+			  struct ib_srq_init_attr *,
+			  struct ocrdma_pd *);
+int ocrdma_mbx_modify_srq(struct ocrdma_srq *, struct ib_srq_attr *);
+int ocrdma_mbx_query_srq(struct ocrdma_srq *, struct ib_srq_attr *);
+int ocrdma_mbx_destroy_srq(struct ocrdma_dev *, struct ocrdma_srq *);
+
+int ocrdma_alloc_av(struct ocrdma_dev *, struct ocrdma_ah *);
+int ocrdma_free_av(struct ocrdma_dev *, struct ocrdma_ah *);
+
+int ocrdma_qp_state_change(struct ocrdma_qp *, enum ib_qp_state new_state,
+			    enum ib_qp_state *old_ib_state);
+bool ocrdma_is_qp_in_sq_flushlist(struct ocrdma_cq *, struct ocrdma_qp *);
+bool ocrdma_is_qp_in_rq_flushlist(struct ocrdma_cq *, struct ocrdma_qp *);
+void ocrdma_flush_qp(struct ocrdma_qp *);
+int ocrdma_get_irq(struct ocrdma_dev *dev, struct ocrdma_eq *eq);
+
+int ocrdma_mbx_rdma_stats(struct ocrdma_dev *, bool reset);
+char *port_speed_string(struct ocrdma_dev *dev);
+void ocrdma_init_service_level(struct ocrdma_dev *);
+
+#endif				/* __OCRDMA_HW_H__ */
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c
new file mode 100644
index 0000000..b0b2257
--- /dev/null
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c
@@ -0,0 +1,676 @@
+/*******************************************************************
+ * This file is part of the Emulex RoCE Device Driver for          *
+ * RoCE (RDMA over Converged Ethernet) adapters.                   *
+ * Copyright (C) 2008-2012 Emulex. All rights reserved.            *
+ * EMULEX and SLI are trademarks of Emulex.                        *
+ * www.emulex.com                                                  *
+ *                                                                 *
+ * This program is free software; you can redistribute it and/or   *
+ * modify it under the terms of version 2 of the GNU General       *
+ * Public License as published by the Free Software Foundation.    *
+ * This program is distributed in the hope that it will be useful. *
+ * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND          *
+ * WARRANTIES, INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY,  *
+ * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE      *
+ * DISCLAIMED, EXCEPT TO THE EXTENT THAT SUCH DISCLAIMERS ARE HELD *
+ * TO BE LEGALLY INVALID.  See the GNU General Public License for  *
+ * more details, a copy of which can be found in the file COPYING  *
+ * included with this package.                                     *
+ *
+ * Contact Information:
+ * linux-drivers@emulex.com
+ *
+ * Emulex
+ * 3333 Susan Street
+ * Costa Mesa, CA 92626
+ *******************************************************************/
+
+#include <linux/module.h>
+#include <linux/idr.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_user_verbs.h>
+#include <rdma/ib_addr.h>
+
+#include <linux/netdevice.h>
+#include <net/addrconf.h>
+
+#include "ocrdma.h"
+#include "ocrdma_verbs.h"
+#include "ocrdma_ah.h"
+#include "be_roce.h"
+#include "ocrdma_hw.h"
+#include "ocrdma_stats.h"
+#include "ocrdma_abi.h"
+
+MODULE_VERSION(OCRDMA_ROCE_DRV_VERSION);
+MODULE_DESCRIPTION(OCRDMA_ROCE_DRV_DESC " " OCRDMA_ROCE_DRV_VERSION);
+MODULE_AUTHOR("Emulex Corporation");
+MODULE_LICENSE("GPL");
+
+static LIST_HEAD(ocrdma_dev_list);
+static DEFINE_SPINLOCK(ocrdma_devlist_lock);
+static DEFINE_IDR(ocrdma_dev_id);
+
+static union ib_gid ocrdma_zero_sgid;
+
+void ocrdma_get_guid(struct ocrdma_dev *dev, u8 *guid)
+{
+	u8 mac_addr[6];
+
+	memcpy(&mac_addr[0], &dev->nic_info.mac_addr[0], ETH_ALEN);
+	guid[0] = mac_addr[0] ^ 2;
+	guid[1] = mac_addr[1];
+	guid[2] = mac_addr[2];
+	guid[3] = 0xff;
+	guid[4] = 0xfe;
+	guid[5] = mac_addr[3];
+	guid[6] = mac_addr[4];
+	guid[7] = mac_addr[5];
+}
+
+static bool ocrdma_add_sgid(struct ocrdma_dev *dev, union ib_gid *new_sgid)
+{
+	int i;
+	unsigned long flags;
+
+	memset(&ocrdma_zero_sgid, 0, sizeof(union ib_gid));
+
+
+	spin_lock_irqsave(&dev->sgid_lock, flags);
+	for (i = 0; i < OCRDMA_MAX_SGID; i++) {
+		if (!memcmp(&dev->sgid_tbl[i], &ocrdma_zero_sgid,
+			    sizeof(union ib_gid))) {
+			/* found free entry */
+			memcpy(&dev->sgid_tbl[i], new_sgid,
+			       sizeof(union ib_gid));
+			spin_unlock_irqrestore(&dev->sgid_lock, flags);
+			return true;
+		} else if (!memcmp(&dev->sgid_tbl[i], new_sgid,
+				   sizeof(union ib_gid))) {
+			/* entry already present, no addition is required. */
+			spin_unlock_irqrestore(&dev->sgid_lock, flags);
+			return false;
+		}
+	}
+	spin_unlock_irqrestore(&dev->sgid_lock, flags);
+	return false;
+}
+
+static bool ocrdma_del_sgid(struct ocrdma_dev *dev, union ib_gid *sgid)
+{
+	int found = false;
+	int i;
+	unsigned long flags;
+
+
+	spin_lock_irqsave(&dev->sgid_lock, flags);
+	/* first is default sgid, which cannot be deleted. */
+	for (i = 1; i < OCRDMA_MAX_SGID; i++) {
+		if (!memcmp(&dev->sgid_tbl[i], sgid, sizeof(union ib_gid))) {
+			/* found matching entry */
+			memset(&dev->sgid_tbl[i], 0, sizeof(union ib_gid));
+			found = true;
+			break;
+		}
+	}
+	spin_unlock_irqrestore(&dev->sgid_lock, flags);
+	return found;
+}
+
+static int ocrdma_addr_event(unsigned long event, struct net_device *netdev,
+			     union ib_gid *gid)
+{
+	struct ib_event gid_event;
+	struct ocrdma_dev *dev;
+	bool found = false;
+	bool updated = false;
+	bool is_vlan = false;
+
+	is_vlan = netdev->priv_flags & IFF_802_1Q_VLAN;
+	if (is_vlan)
+		netdev = rdma_vlan_dev_real_dev(netdev);
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(dev, &ocrdma_dev_list, entry) {
+		if (dev->nic_info.netdev == netdev) {
+			found = true;
+			break;
+		}
+	}
+	rcu_read_unlock();
+
+	if (!found)
+		return NOTIFY_DONE;
+
+	mutex_lock(&dev->dev_lock);
+	switch (event) {
+	case NETDEV_UP:
+		updated = ocrdma_add_sgid(dev, gid);
+		break;
+	case NETDEV_DOWN:
+		updated = ocrdma_del_sgid(dev, gid);
+		break;
+	default:
+		break;
+	}
+	if (updated) {
+		/* GID table updated, notify the consumers about it */
+		gid_event.device = &dev->ibdev;
+		gid_event.element.port_num = 1;
+		gid_event.event = IB_EVENT_GID_CHANGE;
+		ib_dispatch_event(&gid_event);
+	}
+	mutex_unlock(&dev->dev_lock);
+	return NOTIFY_OK;
+}
+
+static int ocrdma_inetaddr_event(struct notifier_block *notifier,
+				  unsigned long event, void *ptr)
+{
+	struct in_ifaddr *ifa = ptr;
+	union ib_gid gid;
+	struct net_device *netdev = ifa->ifa_dev->dev;
+
+	ipv6_addr_set_v4mapped(ifa->ifa_address, (struct in6_addr *)&gid);
+	return ocrdma_addr_event(event, netdev, &gid);
+}
+
+static struct notifier_block ocrdma_inetaddr_notifier = {
+	.notifier_call = ocrdma_inetaddr_event
+};
+
+#if IS_ENABLED(CONFIG_IPV6)
+
+static int ocrdma_inet6addr_event(struct notifier_block *notifier,
+				  unsigned long event, void *ptr)
+{
+	struct inet6_ifaddr *ifa = (struct inet6_ifaddr *)ptr;
+	union  ib_gid *gid = (union ib_gid *)&ifa->addr;
+	struct net_device *netdev = ifa->idev->dev;
+	return ocrdma_addr_event(event, netdev, gid);
+}
+
+static struct notifier_block ocrdma_inet6addr_notifier = {
+	.notifier_call = ocrdma_inet6addr_event
+};
+
+#endif /* IPV6 and VLAN */
+
+static enum rdma_link_layer ocrdma_link_layer(struct ib_device *device,
+					      u8 port_num)
+{
+	return IB_LINK_LAYER_ETHERNET;
+}
+
+static int ocrdma_register_device(struct ocrdma_dev *dev)
+{
+	strlcpy(dev->ibdev.name, "ocrdma%d", IB_DEVICE_NAME_MAX);
+	ocrdma_get_guid(dev, (u8 *)&dev->ibdev.node_guid);
+	memcpy(dev->ibdev.node_desc, OCRDMA_NODE_DESC,
+	       sizeof(OCRDMA_NODE_DESC));
+	dev->ibdev.owner = THIS_MODULE;
+	dev->ibdev.uverbs_abi_ver = OCRDMA_ABI_VERSION;
+	dev->ibdev.uverbs_cmd_mask =
+	    OCRDMA_UVERBS(GET_CONTEXT) |
+	    OCRDMA_UVERBS(QUERY_DEVICE) |
+	    OCRDMA_UVERBS(QUERY_PORT) |
+	    OCRDMA_UVERBS(ALLOC_PD) |
+	    OCRDMA_UVERBS(DEALLOC_PD) |
+	    OCRDMA_UVERBS(REG_MR) |
+	    OCRDMA_UVERBS(DEREG_MR) |
+	    OCRDMA_UVERBS(CREATE_COMP_CHANNEL) |
+	    OCRDMA_UVERBS(CREATE_CQ) |
+	    OCRDMA_UVERBS(RESIZE_CQ) |
+	    OCRDMA_UVERBS(DESTROY_CQ) |
+	    OCRDMA_UVERBS(REQ_NOTIFY_CQ) |
+	    OCRDMA_UVERBS(CREATE_QP) |
+	    OCRDMA_UVERBS(MODIFY_QP) |
+	    OCRDMA_UVERBS(QUERY_QP) |
+	    OCRDMA_UVERBS(DESTROY_QP) |
+	    OCRDMA_UVERBS(POLL_CQ) |
+	    OCRDMA_UVERBS(POST_SEND) |
+	    OCRDMA_UVERBS(POST_RECV);
+
+	dev->ibdev.uverbs_cmd_mask |=
+	    OCRDMA_UVERBS(CREATE_AH) |
+	     OCRDMA_UVERBS(MODIFY_AH) |
+	     OCRDMA_UVERBS(QUERY_AH) |
+	     OCRDMA_UVERBS(DESTROY_AH);
+
+	dev->ibdev.node_type = RDMA_NODE_IB_CA;
+	dev->ibdev.phys_port_cnt = 1;
+	dev->ibdev.num_comp_vectors = 1;
+
+	/* mandatory verbs. */
+	dev->ibdev.query_device = ocrdma_query_device;
+	dev->ibdev.query_port = ocrdma_query_port;
+	dev->ibdev.modify_port = ocrdma_modify_port;
+	dev->ibdev.query_gid = ocrdma_query_gid;
+	dev->ibdev.get_link_layer = ocrdma_link_layer;
+	dev->ibdev.alloc_pd = ocrdma_alloc_pd;
+	dev->ibdev.dealloc_pd = ocrdma_dealloc_pd;
+
+	dev->ibdev.create_cq = ocrdma_create_cq;
+	dev->ibdev.destroy_cq = ocrdma_destroy_cq;
+	dev->ibdev.resize_cq = ocrdma_resize_cq;
+
+	dev->ibdev.create_qp = ocrdma_create_qp;
+	dev->ibdev.modify_qp = ocrdma_modify_qp;
+	dev->ibdev.query_qp = ocrdma_query_qp;
+	dev->ibdev.destroy_qp = ocrdma_destroy_qp;
+
+	dev->ibdev.query_pkey = ocrdma_query_pkey;
+	dev->ibdev.create_ah = ocrdma_create_ah;
+	dev->ibdev.destroy_ah = ocrdma_destroy_ah;
+	dev->ibdev.query_ah = ocrdma_query_ah;
+	dev->ibdev.modify_ah = ocrdma_modify_ah;
+
+	dev->ibdev.poll_cq = ocrdma_poll_cq;
+	dev->ibdev.post_send = ocrdma_post_send;
+	dev->ibdev.post_recv = ocrdma_post_recv;
+	dev->ibdev.req_notify_cq = ocrdma_arm_cq;
+
+	dev->ibdev.get_dma_mr = ocrdma_get_dma_mr;
+	dev->ibdev.reg_phys_mr = ocrdma_reg_kernel_mr;
+	dev->ibdev.dereg_mr = ocrdma_dereg_mr;
+	dev->ibdev.reg_user_mr = ocrdma_reg_user_mr;
+
+	dev->ibdev.alloc_fast_reg_mr = ocrdma_alloc_frmr;
+	dev->ibdev.alloc_fast_reg_page_list = ocrdma_alloc_frmr_page_list;
+	dev->ibdev.free_fast_reg_page_list = ocrdma_free_frmr_page_list;
+
+	/* mandatory to support user space verbs consumer. */
+	dev->ibdev.alloc_ucontext = ocrdma_alloc_ucontext;
+	dev->ibdev.dealloc_ucontext = ocrdma_dealloc_ucontext;
+	dev->ibdev.mmap = ocrdma_mmap;
+	dev->ibdev.dma_device = &dev->nic_info.pdev->dev;
+
+	dev->ibdev.process_mad = ocrdma_process_mad;
+
+	if (ocrdma_get_asic_type(dev) == OCRDMA_ASIC_GEN_SKH_R) {
+		dev->ibdev.uverbs_cmd_mask |=
+		     OCRDMA_UVERBS(CREATE_SRQ) |
+		     OCRDMA_UVERBS(MODIFY_SRQ) |
+		     OCRDMA_UVERBS(QUERY_SRQ) |
+		     OCRDMA_UVERBS(DESTROY_SRQ) |
+		     OCRDMA_UVERBS(POST_SRQ_RECV);
+
+		dev->ibdev.create_srq = ocrdma_create_srq;
+		dev->ibdev.modify_srq = ocrdma_modify_srq;
+		dev->ibdev.query_srq = ocrdma_query_srq;
+		dev->ibdev.destroy_srq = ocrdma_destroy_srq;
+		dev->ibdev.post_srq_recv = ocrdma_post_srq_recv;
+	}
+	return ib_register_device(&dev->ibdev, NULL);
+}
+
+static int ocrdma_alloc_resources(struct ocrdma_dev *dev)
+{
+	mutex_init(&dev->dev_lock);
+	dev->sgid_tbl = kzalloc(sizeof(union ib_gid) *
+				OCRDMA_MAX_SGID, GFP_KERNEL);
+	if (!dev->sgid_tbl)
+		goto alloc_err;
+	spin_lock_init(&dev->sgid_lock);
+
+	dev->cq_tbl = kzalloc(sizeof(struct ocrdma_cq *) *
+			      OCRDMA_MAX_CQ, GFP_KERNEL);
+	if (!dev->cq_tbl)
+		goto alloc_err;
+
+	if (dev->attr.max_qp) {
+		dev->qp_tbl = kzalloc(sizeof(struct ocrdma_qp *) *
+				      OCRDMA_MAX_QP, GFP_KERNEL);
+		if (!dev->qp_tbl)
+			goto alloc_err;
+	}
+
+	dev->stag_arr = kzalloc(sizeof(u64) * OCRDMA_MAX_STAG, GFP_KERNEL);
+	if (dev->stag_arr == NULL)
+		goto alloc_err;
+
+	spin_lock_init(&dev->av_tbl.lock);
+	spin_lock_init(&dev->flush_q_lock);
+	return 0;
+alloc_err:
+	pr_err("%s(%d) error.\n", __func__, dev->id);
+	return -ENOMEM;
+}
+
+static void ocrdma_free_resources(struct ocrdma_dev *dev)
+{
+	kfree(dev->stag_arr);
+	kfree(dev->qp_tbl);
+	kfree(dev->cq_tbl);
+	kfree(dev->sgid_tbl);
+}
+
+/* OCRDMA sysfs interface */
+static ssize_t show_rev(struct device *device, struct device_attribute *attr,
+			char *buf)
+{
+	struct ocrdma_dev *dev = dev_get_drvdata(device);
+
+	return scnprintf(buf, PAGE_SIZE, "0x%x\n", dev->nic_info.pdev->vendor);
+}
+
+static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr,
+			char *buf)
+{
+	struct ocrdma_dev *dev = dev_get_drvdata(device);
+
+	return scnprintf(buf, PAGE_SIZE, "%s\n", &dev->attr.fw_ver[0]);
+}
+
+static ssize_t show_hca_type(struct device *device,
+			     struct device_attribute *attr, char *buf)
+{
+	struct ocrdma_dev *dev = dev_get_drvdata(device);
+
+	return scnprintf(buf, PAGE_SIZE, "%s\n", &dev->model_number[0]);
+}
+
+static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
+static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL);
+static DEVICE_ATTR(hca_type, S_IRUGO, show_hca_type, NULL);
+
+static struct device_attribute *ocrdma_attributes[] = {
+	&dev_attr_hw_rev,
+	&dev_attr_fw_ver,
+	&dev_attr_hca_type
+};
+
+static void ocrdma_remove_sysfiles(struct ocrdma_dev *dev)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(ocrdma_attributes); i++)
+		device_remove_file(&dev->ibdev.dev, ocrdma_attributes[i]);
+}
+
+static void ocrdma_add_default_sgid(struct ocrdma_dev *dev)
+{
+	/* GID Index 0 - Invariant manufacturer-assigned EUI-64 */
+	union ib_gid *sgid = &dev->sgid_tbl[0];
+
+	sgid->global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL);
+	ocrdma_get_guid(dev, &sgid->raw[8]);
+}
+
+static void ocrdma_init_ipv4_gids(struct ocrdma_dev *dev,
+				  struct net_device *net)
+{
+	struct in_device *in_dev;
+	union ib_gid gid;
+	in_dev = in_dev_get(net);
+	if (in_dev) {
+		for_ifa(in_dev) {
+			ipv6_addr_set_v4mapped(ifa->ifa_address,
+					       (struct in6_addr *)&gid);
+			ocrdma_add_sgid(dev, &gid);
+		}
+		endfor_ifa(in_dev);
+		in_dev_put(in_dev);
+	}
+}
+
+static void ocrdma_init_ipv6_gids(struct ocrdma_dev *dev,
+				  struct net_device *net)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+	struct inet6_dev *in6_dev;
+	union ib_gid  *pgid;
+	struct inet6_ifaddr *ifp;
+	in6_dev = in6_dev_get(net);
+	if (in6_dev) {
+		read_lock_bh(&in6_dev->lock);
+		list_for_each_entry(ifp, &in6_dev->addr_list, if_list) {
+			pgid = (union ib_gid *)&ifp->addr;
+			ocrdma_add_sgid(dev, pgid);
+		}
+		read_unlock_bh(&in6_dev->lock);
+		in6_dev_put(in6_dev);
+	}
+#endif
+}
+
+static void ocrdma_init_gid_table(struct ocrdma_dev *dev)
+{
+	struct  net_device *net_dev;
+
+	for_each_netdev(&init_net, net_dev) {
+		struct net_device *real_dev = rdma_vlan_dev_real_dev(net_dev) ?
+				rdma_vlan_dev_real_dev(net_dev) : net_dev;
+
+		if (real_dev == dev->nic_info.netdev) {
+			ocrdma_add_default_sgid(dev);
+			ocrdma_init_ipv4_gids(dev, net_dev);
+			ocrdma_init_ipv6_gids(dev, net_dev);
+		}
+	}
+}
+
+static struct ocrdma_dev *ocrdma_add(struct be_dev_info *dev_info)
+{
+	int status = 0, i;
+	struct ocrdma_dev *dev;
+
+	dev = (struct ocrdma_dev *)ib_alloc_device(sizeof(struct ocrdma_dev));
+	if (!dev) {
+		pr_err("Unable to allocate ib device\n");
+		return NULL;
+	}
+	dev->mbx_cmd = kzalloc(sizeof(struct ocrdma_mqe_emb_cmd), GFP_KERNEL);
+	if (!dev->mbx_cmd)
+		goto idr_err;
+
+	memcpy(&dev->nic_info, dev_info, sizeof(*dev_info));
+	dev->id = idr_alloc(&ocrdma_dev_id, NULL, 0, 0, GFP_KERNEL);
+	if (dev->id < 0)
+		goto idr_err;
+
+	status = ocrdma_init_hw(dev);
+	if (status)
+		goto init_err;
+
+	status = ocrdma_alloc_resources(dev);
+	if (status)
+		goto alloc_err;
+
+	ocrdma_init_service_level(dev);
+	ocrdma_init_gid_table(dev);
+	status = ocrdma_register_device(dev);
+	if (status)
+		goto alloc_err;
+
+	for (i = 0; i < ARRAY_SIZE(ocrdma_attributes); i++)
+		if (device_create_file(&dev->ibdev.dev, ocrdma_attributes[i]))
+			goto sysfs_err;
+	spin_lock(&ocrdma_devlist_lock);
+	list_add_tail_rcu(&dev->entry, &ocrdma_dev_list);
+	spin_unlock(&ocrdma_devlist_lock);
+	/* Init stats */
+	ocrdma_add_port_stats(dev);
+
+	pr_info("%s %s: %s \"%s\" port %d\n",
+		dev_name(&dev->nic_info.pdev->dev), hca_name(dev),
+		port_speed_string(dev), dev->model_number,
+		dev->hba_port_num);
+	pr_info("%s ocrdma%d driver loaded successfully\n",
+		dev_name(&dev->nic_info.pdev->dev), dev->id);
+	return dev;
+
+sysfs_err:
+	ocrdma_remove_sysfiles(dev);
+alloc_err:
+	ocrdma_free_resources(dev);
+	ocrdma_cleanup_hw(dev);
+init_err:
+	idr_remove(&ocrdma_dev_id, dev->id);
+idr_err:
+	kfree(dev->mbx_cmd);
+	ib_dealloc_device(&dev->ibdev);
+	pr_err("%s() leaving. ret=%d\n", __func__, status);
+	return NULL;
+}
+
+static void ocrdma_remove_free(struct rcu_head *rcu)
+{
+	struct ocrdma_dev *dev = container_of(rcu, struct ocrdma_dev, rcu);
+
+	idr_remove(&ocrdma_dev_id, dev->id);
+	kfree(dev->mbx_cmd);
+	ib_dealloc_device(&dev->ibdev);
+}
+
+static void ocrdma_remove(struct ocrdma_dev *dev)
+{
+	/* first unregister with stack to stop all the active traffic
+	 * of the registered clients.
+	 */
+	ocrdma_rem_port_stats(dev);
+	ocrdma_remove_sysfiles(dev);
+
+	ib_unregister_device(&dev->ibdev);
+
+	spin_lock(&ocrdma_devlist_lock);
+	list_del_rcu(&dev->entry);
+	spin_unlock(&ocrdma_devlist_lock);
+
+	ocrdma_free_resources(dev);
+	ocrdma_cleanup_hw(dev);
+
+	call_rcu(&dev->rcu, ocrdma_remove_free);
+}
+
+static int ocrdma_open(struct ocrdma_dev *dev)
+{
+	struct ib_event port_event;
+
+	port_event.event = IB_EVENT_PORT_ACTIVE;
+	port_event.element.port_num = 1;
+	port_event.device = &dev->ibdev;
+	ib_dispatch_event(&port_event);
+	return 0;
+}
+
+static int ocrdma_close(struct ocrdma_dev *dev)
+{
+	int i;
+	struct ocrdma_qp *qp, **cur_qp;
+	struct ib_event err_event;
+	struct ib_qp_attr attrs;
+	int attr_mask = IB_QP_STATE;
+
+	attrs.qp_state = IB_QPS_ERR;
+	mutex_lock(&dev->dev_lock);
+	if (dev->qp_tbl) {
+		cur_qp = dev->qp_tbl;
+		for (i = 0; i < OCRDMA_MAX_QP; i++) {
+			qp = cur_qp[i];
+			if (qp && qp->ibqp.qp_type != IB_QPT_GSI) {
+				/* change the QP state to ERROR */
+				_ocrdma_modify_qp(&qp->ibqp, &attrs, attr_mask);
+
+				err_event.event = IB_EVENT_QP_FATAL;
+				err_event.element.qp = &qp->ibqp;
+				err_event.device = &dev->ibdev;
+				ib_dispatch_event(&err_event);
+			}
+		}
+	}
+	mutex_unlock(&dev->dev_lock);
+
+	err_event.event = IB_EVENT_PORT_ERR;
+	err_event.element.port_num = 1;
+	err_event.device = &dev->ibdev;
+	ib_dispatch_event(&err_event);
+	return 0;
+}
+
+static void ocrdma_shutdown(struct ocrdma_dev *dev)
+{
+	ocrdma_close(dev);
+	ocrdma_remove(dev);
+}
+
+/* event handling via NIC driver ensures that all the NIC specific
+ * initialization done before RoCE driver notifies
+ * event to stack.
+ */
+static void ocrdma_event_handler(struct ocrdma_dev *dev, u32 event)
+{
+	switch (event) {
+	case BE_DEV_UP:
+		ocrdma_open(dev);
+		break;
+	case BE_DEV_DOWN:
+		ocrdma_close(dev);
+		break;
+	case BE_DEV_SHUTDOWN:
+		ocrdma_shutdown(dev);
+		break;
+	}
+}
+
+static struct ocrdma_driver ocrdma_drv = {
+	.name			= "ocrdma_driver",
+	.add			= ocrdma_add,
+	.remove			= ocrdma_remove,
+	.state_change_handler	= ocrdma_event_handler,
+	.be_abi_version		= OCRDMA_BE_ROCE_ABI_VERSION,
+};
+
+static void ocrdma_unregister_inet6addr_notifier(void)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+	unregister_inet6addr_notifier(&ocrdma_inet6addr_notifier);
+#endif
+}
+
+static void ocrdma_unregister_inetaddr_notifier(void)
+{
+	unregister_inetaddr_notifier(&ocrdma_inetaddr_notifier);
+}
+
+static int __init ocrdma_init_module(void)
+{
+	int status;
+
+	ocrdma_init_debugfs();
+
+	status = register_inetaddr_notifier(&ocrdma_inetaddr_notifier);
+	if (status)
+		return status;
+
+#if IS_ENABLED(CONFIG_IPV6)
+	status = register_inet6addr_notifier(&ocrdma_inet6addr_notifier);
+	if (status)
+		goto err_notifier6;
+#endif
+
+	status = be_roce_register_driver(&ocrdma_drv);
+	if (status)
+		goto err_be_reg;
+
+	return 0;
+
+err_be_reg:
+#if IS_ENABLED(CONFIG_IPV6)
+	ocrdma_unregister_inet6addr_notifier();
+err_notifier6:
+#endif
+	ocrdma_unregister_inetaddr_notifier();
+	return status;
+}
+
+static void __exit ocrdma_exit_module(void)
+{
+	be_roce_unregister_driver(&ocrdma_drv);
+	ocrdma_unregister_inet6addr_notifier();
+	ocrdma_unregister_inetaddr_notifier();
+	ocrdma_rem_debugfs();
+}
+
+module_init(ocrdma_init_module);
+module_exit(ocrdma_exit_module);
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_sli.h b/drivers/infiniband/hw/ocrdma/ocrdma_sli.h
new file mode 100644
index 0000000..4e03648
--- /dev/null
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_sli.h
@@ -0,0 +1,2102 @@
+/*******************************************************************
+ * This file is part of the Emulex RoCE Device Driver for          *
+ * RoCE (RDMA over Converged Ethernet) adapters.                   *
+ * Copyright (C) 2008-2012 Emulex. All rights reserved.            *
+ * EMULEX and SLI are trademarks of Emulex.                        *
+ * www.emulex.com                                                  *
+ *                                                                 *
+ * This program is free software; you can redistribute it and/or   *
+ * modify it under the terms of version 2 of the GNU General       *
+ * Public License as published by the Free Software Foundation.    *
+ * This program is distributed in the hope that it will be useful. *
+ * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND          *
+ * WARRANTIES, INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY,  *
+ * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE      *
+ * DISCLAIMED, EXCEPT TO THE EXTENT THAT SUCH DISCLAIMERS ARE HELD *
+ * TO BE LEGALLY INVALID.  See the GNU General Public License for  *
+ * more details, a copy of which can be found in the file COPYING  *
+ * included with this package.                                     *
+ *
+ * Contact Information:
+ * linux-drivers@emulex.com
+ *
+ * Emulex
+ * 3333 Susan Street
+ * Costa Mesa, CA 92626
+ *******************************************************************/
+
+#ifndef __OCRDMA_SLI_H__
+#define __OCRDMA_SLI_H__
+
+enum {
+	OCRDMA_ASIC_GEN_SKH_R = 0x04,
+	OCRDMA_ASIC_GEN_LANCER = 0x0B
+};
+
+enum {
+	OCRDMA_ASIC_REV_A0 = 0x00,
+	OCRDMA_ASIC_REV_B0 = 0x10,
+	OCRDMA_ASIC_REV_C0 = 0x20
+};
+
+#define OCRDMA_SUBSYS_ROCE 10
+enum {
+	OCRDMA_CMD_QUERY_CONFIG = 1,
+	OCRDMA_CMD_ALLOC_PD = 2,
+	OCRDMA_CMD_DEALLOC_PD = 3,
+
+	OCRDMA_CMD_CREATE_AH_TBL = 4,
+	OCRDMA_CMD_DELETE_AH_TBL = 5,
+
+	OCRDMA_CMD_CREATE_QP = 6,
+	OCRDMA_CMD_QUERY_QP = 7,
+	OCRDMA_CMD_MODIFY_QP = 8 ,
+	OCRDMA_CMD_DELETE_QP = 9,
+
+	OCRDMA_CMD_RSVD1 = 10,
+	OCRDMA_CMD_ALLOC_LKEY = 11,
+	OCRDMA_CMD_DEALLOC_LKEY = 12,
+	OCRDMA_CMD_REGISTER_NSMR = 13,
+	OCRDMA_CMD_REREGISTER_NSMR = 14,
+	OCRDMA_CMD_REGISTER_NSMR_CONT = 15,
+	OCRDMA_CMD_QUERY_NSMR = 16,
+	OCRDMA_CMD_ALLOC_MW = 17,
+	OCRDMA_CMD_QUERY_MW = 18,
+
+	OCRDMA_CMD_CREATE_SRQ = 19,
+	OCRDMA_CMD_QUERY_SRQ = 20,
+	OCRDMA_CMD_MODIFY_SRQ = 21,
+	OCRDMA_CMD_DELETE_SRQ = 22,
+
+	OCRDMA_CMD_ATTACH_MCAST = 23,
+	OCRDMA_CMD_DETACH_MCAST = 24,
+
+	OCRDMA_CMD_CREATE_RBQ = 25,
+	OCRDMA_CMD_DESTROY_RBQ = 26,
+
+	OCRDMA_CMD_GET_RDMA_STATS = 27,
+
+	OCRDMA_CMD_MAX
+};
+
+#define OCRDMA_SUBSYS_COMMON 1
+enum {
+	OCRDMA_CMD_QUERY_NTWK_LINK_CONFIG_V1 = 5,
+	OCRDMA_CMD_CREATE_CQ		= 12,
+	OCRDMA_CMD_CREATE_EQ		= 13,
+	OCRDMA_CMD_CREATE_MQ		= 21,
+	OCRDMA_CMD_GET_CTRL_ATTRIBUTES  = 32,
+	OCRDMA_CMD_GET_FW_VER		= 35,
+	OCRDMA_CMD_DELETE_MQ		= 53,
+	OCRDMA_CMD_DELETE_CQ		= 54,
+	OCRDMA_CMD_DELETE_EQ		= 55,
+	OCRDMA_CMD_GET_FW_CONFIG	= 58,
+	OCRDMA_CMD_CREATE_MQ_EXT	= 90,
+	OCRDMA_CMD_PHY_DETAILS		= 102
+};
+
+enum {
+	QTYPE_EQ	= 1,
+	QTYPE_CQ	= 2,
+	QTYPE_MCCQ	= 3
+};
+
+#define OCRDMA_MAX_SGID		8
+
+#define OCRDMA_MAX_QP    2048
+#define OCRDMA_MAX_CQ    2048
+#define OCRDMA_MAX_STAG 16384
+
+enum {
+	OCRDMA_DB_RQ_OFFSET		= 0xE0,
+	OCRDMA_DB_GEN2_RQ_OFFSET        = 0x100,
+	OCRDMA_DB_SQ_OFFSET		= 0x60,
+	OCRDMA_DB_GEN2_SQ_OFFSET	= 0x1C0,
+	OCRDMA_DB_SRQ_OFFSET		= OCRDMA_DB_RQ_OFFSET,
+	OCRDMA_DB_GEN2_SRQ_OFFSET	= OCRDMA_DB_GEN2_RQ_OFFSET,
+	OCRDMA_DB_CQ_OFFSET		= 0x120,
+	OCRDMA_DB_EQ_OFFSET		= OCRDMA_DB_CQ_OFFSET,
+	OCRDMA_DB_MQ_OFFSET		= 0x140,
+
+	OCRDMA_DB_SQ_SHIFT		= 16,
+	OCRDMA_DB_RQ_SHIFT		= 24
+};
+
+#define OCRDMA_DB_CQ_RING_ID_MASK       0x3FF	/* bits 0 - 9 */
+#define OCRDMA_DB_CQ_RING_ID_EXT_MASK  0x0C00	/* bits 10-11 of qid at 12-11 */
+/* qid #2 msbits at 12-11 */
+#define OCRDMA_DB_CQ_RING_ID_EXT_MASK_SHIFT  0x1
+#define OCRDMA_DB_CQ_NUM_POPPED_SHIFT	16	/* bits 16 - 28 */
+/* Rearm bit */
+#define OCRDMA_DB_CQ_REARM_SHIFT	29	/* bit 29 */
+/* solicited bit */
+#define OCRDMA_DB_CQ_SOLICIT_SHIFT	31	/* bit 31 */
+
+#define OCRDMA_EQ_ID_MASK		0x1FF	/* bits 0 - 8 */
+#define OCRDMA_EQ_ID_EXT_MASK		0x3e00	/* bits 9-13 */
+#define OCRDMA_EQ_ID_EXT_MASK_SHIFT	2	/* qid bits 9-13 at 11-15 */
+
+/* Clear the interrupt for this eq */
+#define OCRDMA_EQ_CLR_SHIFT		9	/* bit 9 */
+/* Must be 1 */
+#define OCRDMA_EQ_TYPE_SHIFT		10	/* bit 10 */
+/* Number of event entries processed */
+#define OCRDMA_NUM_EQE_SHIFT		16	/* bits 16 - 28 */
+/* Rearm bit */
+#define OCRDMA_REARM_SHIFT		29	/* bit 29 */
+
+#define OCRDMA_MQ_ID_MASK		0x7FF	/* bits 0 - 10 */
+/* Number of entries posted */
+#define OCRDMA_MQ_NUM_MQE_SHIFT	16	/* bits 16 - 29 */
+
+#define OCRDMA_MIN_HPAGE_SIZE	4096
+
+#define OCRDMA_MIN_Q_PAGE_SIZE	4096
+#define OCRDMA_MAX_Q_PAGES	8
+
+#define OCRDMA_SLI_ASIC_ID_OFFSET	0x9C
+#define OCRDMA_SLI_ASIC_REV_MASK	0x000000FF
+#define OCRDMA_SLI_ASIC_GEN_NUM_MASK	0x0000FF00
+#define OCRDMA_SLI_ASIC_GEN_NUM_SHIFT	0x08
+/*
+# 0: 4K Bytes
+# 1: 8K Bytes
+# 2: 16K Bytes
+# 3: 32K Bytes
+# 4: 64K Bytes
+# 5: 128K Bytes
+# 6: 256K Bytes
+# 7: 512K Bytes
+*/
+#define OCRDMA_MAX_Q_PAGE_SIZE_CNT	8
+#define OCRDMA_Q_PAGE_BASE_SIZE (OCRDMA_MIN_Q_PAGE_SIZE * OCRDMA_MAX_Q_PAGES)
+
+#define MAX_OCRDMA_QP_PAGES		8
+#define OCRDMA_MAX_WQE_MEM_SIZE (MAX_OCRDMA_QP_PAGES * OCRDMA_MIN_HQ_PAGE_SIZE)
+
+#define OCRDMA_CREATE_CQ_MAX_PAGES	4
+#define OCRDMA_DPP_CQE_SIZE		4
+
+#define OCRDMA_GEN2_MAX_CQE 1024
+#define OCRDMA_GEN2_CQ_PAGE_SIZE 4096
+#define OCRDMA_GEN2_WQE_SIZE 256
+#define OCRDMA_MAX_CQE  4095
+#define OCRDMA_CQ_PAGE_SIZE 16384
+#define OCRDMA_WQE_SIZE 128
+#define OCRDMA_WQE_STRIDE 8
+#define OCRDMA_WQE_ALIGN_BYTES 16
+
+#define MAX_OCRDMA_SRQ_PAGES MAX_OCRDMA_QP_PAGES
+
+enum {
+	OCRDMA_MCH_OPCODE_SHIFT	= 0,
+	OCRDMA_MCH_OPCODE_MASK	= 0xFF,
+	OCRDMA_MCH_SUBSYS_SHIFT	= 8,
+	OCRDMA_MCH_SUBSYS_MASK	= 0xFF00
+};
+
+/* mailbox cmd header */
+struct ocrdma_mbx_hdr {
+	u32 subsys_op;
+	u32 timeout;		/* in seconds */
+	u32 cmd_len;
+	u32 rsvd_version;
+};
+
+enum {
+	OCRDMA_MBX_RSP_OPCODE_SHIFT	= 0,
+	OCRDMA_MBX_RSP_OPCODE_MASK	= 0xFF,
+	OCRDMA_MBX_RSP_SUBSYS_SHIFT	= 8,
+	OCRDMA_MBX_RSP_SUBSYS_MASK	= 0xFF << OCRDMA_MBX_RSP_SUBSYS_SHIFT,
+
+	OCRDMA_MBX_RSP_STATUS_SHIFT	= 0,
+	OCRDMA_MBX_RSP_STATUS_MASK	= 0xFF,
+	OCRDMA_MBX_RSP_ASTATUS_SHIFT	= 8,
+	OCRDMA_MBX_RSP_ASTATUS_MASK	= 0xFF << OCRDMA_MBX_RSP_ASTATUS_SHIFT
+};
+
+/* mailbox cmd response */
+struct ocrdma_mbx_rsp {
+	u32 subsys_op;
+	u32 status;
+	u32 rsp_len;
+	u32 add_rsp_len;
+};
+
+enum {
+	OCRDMA_MQE_EMBEDDED	= 1,
+	OCRDMA_MQE_NONEMBEDDED	= 0
+};
+
+struct ocrdma_mqe_sge {
+	u32 pa_lo;
+	u32 pa_hi;
+	u32 len;
+};
+
+enum {
+	OCRDMA_MQE_HDR_EMB_SHIFT	= 0,
+	OCRDMA_MQE_HDR_EMB_MASK		= BIT(0),
+	OCRDMA_MQE_HDR_SGE_CNT_SHIFT	= 3,
+	OCRDMA_MQE_HDR_SGE_CNT_MASK	= 0x1F << OCRDMA_MQE_HDR_SGE_CNT_SHIFT,
+	OCRDMA_MQE_HDR_SPECIAL_SHIFT	= 24,
+	OCRDMA_MQE_HDR_SPECIAL_MASK	= 0xFF << OCRDMA_MQE_HDR_SPECIAL_SHIFT
+};
+
+struct ocrdma_mqe_hdr {
+	u32 spcl_sge_cnt_emb;
+	u32 pyld_len;
+	u32 tag_lo;
+	u32 tag_hi;
+	u32 rsvd3;
+};
+
+struct ocrdma_mqe_emb_cmd {
+	struct ocrdma_mbx_hdr mch;
+	u8 pyld[220];
+};
+
+struct ocrdma_mqe {
+	struct ocrdma_mqe_hdr hdr;
+	union {
+		struct ocrdma_mqe_emb_cmd emb_req;
+		struct {
+			struct ocrdma_mqe_sge sge[19];
+		} nonemb_req;
+		u8 cmd[236];
+		struct ocrdma_mbx_rsp rsp;
+	} u;
+};
+
+#define OCRDMA_EQ_LEN       4096
+#define OCRDMA_MQ_CQ_LEN    256
+#define OCRDMA_MQ_LEN       128
+
+#define PAGE_SHIFT_4K		12
+#define PAGE_SIZE_4K		(1 << PAGE_SHIFT_4K)
+
+/* Returns number of pages spanned by the data starting at the given addr */
+#define PAGES_4K_SPANNED(_address, size) \
+	((u32)((((size_t)(_address) & (PAGE_SIZE_4K - 1)) +	\
+			(size) + (PAGE_SIZE_4K - 1)) >> PAGE_SHIFT_4K))
+
+struct ocrdma_delete_q_req {
+	struct ocrdma_mbx_hdr req;
+	u32 id;
+};
+
+struct ocrdma_pa {
+	u32 lo;
+	u32 hi;
+};
+
+#define MAX_OCRDMA_EQ_PAGES	8
+struct ocrdma_create_eq_req {
+	struct ocrdma_mbx_hdr req;
+	u32 num_pages;
+	u32 valid;
+	u32 cnt;
+	u32 delay;
+	u32 rsvd;
+	struct ocrdma_pa pa[MAX_OCRDMA_EQ_PAGES];
+};
+
+enum {
+	OCRDMA_CREATE_EQ_VALID	= BIT(29),
+	OCRDMA_CREATE_EQ_CNT_SHIFT	= 26,
+	OCRDMA_CREATE_CQ_DELAY_SHIFT	= 13,
+};
+
+struct ocrdma_create_eq_rsp {
+	struct ocrdma_mbx_rsp rsp;
+	u32 vector_eqid;
+};
+
+#define OCRDMA_EQ_MINOR_OTHER	0x1
+
+enum {
+	OCRDMA_MCQE_STATUS_SHIFT	= 0,
+	OCRDMA_MCQE_STATUS_MASK		= 0xFFFF,
+	OCRDMA_MCQE_ESTATUS_SHIFT	= 16,
+	OCRDMA_MCQE_ESTATUS_MASK	= 0xFFFF << OCRDMA_MCQE_ESTATUS_SHIFT,
+	OCRDMA_MCQE_CONS_SHIFT		= 27,
+	OCRDMA_MCQE_CONS_MASK		= BIT(27),
+	OCRDMA_MCQE_CMPL_SHIFT		= 28,
+	OCRDMA_MCQE_CMPL_MASK		= BIT(28),
+	OCRDMA_MCQE_AE_SHIFT		= 30,
+	OCRDMA_MCQE_AE_MASK		= BIT(30),
+	OCRDMA_MCQE_VALID_SHIFT		= 31,
+	OCRDMA_MCQE_VALID_MASK		= BIT(31)
+};
+
+struct ocrdma_mcqe {
+	u32 status;
+	u32 tag_lo;
+	u32 tag_hi;
+	u32 valid_ae_cmpl_cons;
+};
+
+enum {
+	OCRDMA_AE_MCQE_QPVALID		= BIT(31),
+	OCRDMA_AE_MCQE_QPID_MASK	= 0xFFFF,
+
+	OCRDMA_AE_MCQE_CQVALID		= BIT(31),
+	OCRDMA_AE_MCQE_CQID_MASK	= 0xFFFF,
+	OCRDMA_AE_MCQE_VALID		= BIT(31),
+	OCRDMA_AE_MCQE_AE		= BIT(30),
+	OCRDMA_AE_MCQE_EVENT_TYPE_SHIFT	= 16,
+	OCRDMA_AE_MCQE_EVENT_TYPE_MASK	=
+					0xFF << OCRDMA_AE_MCQE_EVENT_TYPE_SHIFT,
+	OCRDMA_AE_MCQE_EVENT_CODE_SHIFT	= 8,
+	OCRDMA_AE_MCQE_EVENT_CODE_MASK	=
+					0xFF << OCRDMA_AE_MCQE_EVENT_CODE_SHIFT
+};
+struct ocrdma_ae_mcqe {
+	u32 qpvalid_qpid;
+	u32 cqvalid_cqid;
+	u32 evt_tag;
+	u32 valid_ae_event;
+};
+
+enum {
+	OCRDMA_AE_PVID_MCQE_ENABLED_SHIFT = 0,
+	OCRDMA_AE_PVID_MCQE_ENABLED_MASK  = 0xFF,
+	OCRDMA_AE_PVID_MCQE_TAG_SHIFT = 16,
+	OCRDMA_AE_PVID_MCQE_TAG_MASK = 0xFFFF << OCRDMA_AE_PVID_MCQE_TAG_SHIFT
+};
+
+struct ocrdma_ae_pvid_mcqe {
+	u32 tag_enabled;
+	u32 event_tag;
+	u32 rsvd1;
+	u32 rsvd2;
+};
+
+enum {
+	OCRDMA_AE_MPA_MCQE_REQ_ID_SHIFT		= 16,
+	OCRDMA_AE_MPA_MCQE_REQ_ID_MASK		= 0xFFFF <<
+					OCRDMA_AE_MPA_MCQE_REQ_ID_SHIFT,
+
+	OCRDMA_AE_MPA_MCQE_EVENT_CODE_SHIFT	= 8,
+	OCRDMA_AE_MPA_MCQE_EVENT_CODE_MASK	= 0xFF <<
+					OCRDMA_AE_MPA_MCQE_EVENT_CODE_SHIFT,
+	OCRDMA_AE_MPA_MCQE_EVENT_TYPE_SHIFT	= 16,
+	OCRDMA_AE_MPA_MCQE_EVENT_TYPE_MASK	= 0xFF <<
+					OCRDMA_AE_MPA_MCQE_EVENT_TYPE_SHIFT,
+	OCRDMA_AE_MPA_MCQE_EVENT_AE_SHIFT	= 30,
+	OCRDMA_AE_MPA_MCQE_EVENT_AE_MASK	= BIT(30),
+	OCRDMA_AE_MPA_MCQE_EVENT_VALID_SHIFT	= 31,
+	OCRDMA_AE_MPA_MCQE_EVENT_VALID_MASK	= BIT(31)
+};
+
+struct ocrdma_ae_mpa_mcqe {
+	u32 req_id;
+	u32 w1;
+	u32 w2;
+	u32 valid_ae_event;
+};
+
+enum {
+	OCRDMA_AE_QP_MCQE_NEW_QP_STATE_SHIFT	= 0,
+	OCRDMA_AE_QP_MCQE_NEW_QP_STATE_MASK	= 0xFFFF,
+	OCRDMA_AE_QP_MCQE_QP_ID_SHIFT		= 16,
+	OCRDMA_AE_QP_MCQE_QP_ID_MASK		= 0xFFFF <<
+						OCRDMA_AE_QP_MCQE_QP_ID_SHIFT,
+
+	OCRDMA_AE_QP_MCQE_EVENT_CODE_SHIFT	= 8,
+	OCRDMA_AE_QP_MCQE_EVENT_CODE_MASK	= 0xFF <<
+				OCRDMA_AE_QP_MCQE_EVENT_CODE_SHIFT,
+	OCRDMA_AE_QP_MCQE_EVENT_TYPE_SHIFT	= 16,
+	OCRDMA_AE_QP_MCQE_EVENT_TYPE_MASK	= 0xFF <<
+				OCRDMA_AE_QP_MCQE_EVENT_TYPE_SHIFT,
+	OCRDMA_AE_QP_MCQE_EVENT_AE_SHIFT	= 30,
+	OCRDMA_AE_QP_MCQE_EVENT_AE_MASK		= BIT(30),
+	OCRDMA_AE_QP_MCQE_EVENT_VALID_SHIFT	= 31,
+	OCRDMA_AE_QP_MCQE_EVENT_VALID_MASK	= BIT(31)
+};
+
+struct ocrdma_ae_qp_mcqe {
+	u32 qp_id_state;
+	u32 w1;
+	u32 w2;
+	u32 valid_ae_event;
+};
+
+#define OCRDMA_ASYNC_RDMA_EVE_CODE 0x14
+#define OCRDMA_ASYNC_GRP5_EVE_CODE 0x5
+
+enum ocrdma_async_grp5_events {
+	OCRDMA_ASYNC_EVENT_QOS_VALUE	= 0x01,
+	OCRDMA_ASYNC_EVENT_COS_VALUE	= 0x02,
+	OCRDMA_ASYNC_EVENT_PVID_STATE	= 0x03
+};
+
+enum OCRDMA_ASYNC_EVENT_TYPE {
+	OCRDMA_CQ_ERROR			= 0x00,
+	OCRDMA_CQ_OVERRUN_ERROR		= 0x01,
+	OCRDMA_CQ_QPCAT_ERROR		= 0x02,
+	OCRDMA_QP_ACCESS_ERROR		= 0x03,
+	OCRDMA_QP_COMM_EST_EVENT	= 0x04,
+	OCRDMA_SQ_DRAINED_EVENT		= 0x05,
+	OCRDMA_DEVICE_FATAL_EVENT	= 0x08,
+	OCRDMA_SRQCAT_ERROR		= 0x0E,
+	OCRDMA_SRQ_LIMIT_EVENT		= 0x0F,
+	OCRDMA_QP_LAST_WQE_EVENT	= 0x10
+};
+
+/* mailbox command request and responses */
+enum {
+	OCRDMA_MBX_QUERY_CFG_CQ_OVERFLOW_SHIFT		= 2,
+	OCRDMA_MBX_QUERY_CFG_CQ_OVERFLOW_MASK		= BIT(2),
+	OCRDMA_MBX_QUERY_CFG_SRQ_SUPPORTED_SHIFT	= 3,
+	OCRDMA_MBX_QUERY_CFG_SRQ_SUPPORTED_MASK		= BIT(3),
+	OCRDMA_MBX_QUERY_CFG_MAX_QP_SHIFT		= 8,
+	OCRDMA_MBX_QUERY_CFG_MAX_QP_MASK		= 0xFFFFFF <<
+				OCRDMA_MBX_QUERY_CFG_MAX_QP_SHIFT,
+
+	OCRDMA_MBX_QUERY_CFG_MAX_PD_SHIFT		= 16,
+	OCRDMA_MBX_QUERY_CFG_MAX_PD_MASK		= 0xFFFF <<
+					OCRDMA_MBX_QUERY_CFG_MAX_PD_SHIFT,
+	OCRDMA_MBX_QUERY_CFG_CA_ACK_DELAY_SHIFT		= 8,
+	OCRDMA_MBX_QUERY_CFG_CA_ACK_DELAY_MASK		= 0xFF <<
+				OCRDMA_MBX_QUERY_CFG_CA_ACK_DELAY_SHIFT,
+
+	OCRDMA_MBX_QUERY_CFG_MAX_SEND_SGE_SHIFT		= 0,
+	OCRDMA_MBX_QUERY_CFG_MAX_SEND_SGE_MASK		= 0xFFFF,
+	OCRDMA_MBX_QUERY_CFG_MAX_WRITE_SGE_SHIFT	= 16,
+	OCRDMA_MBX_QUERY_CFG_MAX_WRITE_SGE_MASK		= 0xFFFF <<
+				OCRDMA_MBX_QUERY_CFG_MAX_WRITE_SGE_SHIFT,
+
+	OCRDMA_MBX_QUERY_CFG_MAX_ORD_PER_QP_SHIFT	= 0,
+	OCRDMA_MBX_QUERY_CFG_MAX_ORD_PER_QP_MASK	= 0xFFFF,
+	OCRDMA_MBX_QUERY_CFG_MAX_IRD_PER_QP_SHIFT	= 16,
+	OCRDMA_MBX_QUERY_CFG_MAX_IRD_PER_QP_MASK	= 0xFFFF <<
+				OCRDMA_MBX_QUERY_CFG_MAX_IRD_PER_QP_SHIFT,
+
+	OCRDMA_MBX_QUERY_CFG_MAX_WQE_SIZE_OFFSET	= 24,
+	OCRDMA_MBX_QUERY_CFG_MAX_WQE_SIZE_MASK		= 0xFF <<
+				OCRDMA_MBX_QUERY_CFG_MAX_WQE_SIZE_OFFSET,
+	OCRDMA_MBX_QUERY_CFG_MAX_RQE_SIZE_OFFSET	= 16,
+	OCRDMA_MBX_QUERY_CFG_MAX_RQE_SIZE_MASK		= 0xFF <<
+				OCRDMA_MBX_QUERY_CFG_MAX_RQE_SIZE_OFFSET,
+	OCRDMA_MBX_QUERY_CFG_MAX_DPP_CQES_OFFSET	= 0,
+	OCRDMA_MBX_QUERY_CFG_MAX_DPP_CQES_MASK		= 0xFFFF <<
+				OCRDMA_MBX_QUERY_CFG_MAX_DPP_CQES_OFFSET,
+
+	OCRDMA_MBX_QUERY_CFG_MAX_SRQ_OFFSET		= 16,
+	OCRDMA_MBX_QUERY_CFG_MAX_SRQ_MASK		= 0xFFFF <<
+				OCRDMA_MBX_QUERY_CFG_MAX_SRQ_OFFSET,
+	OCRDMA_MBX_QUERY_CFG_MAX_RPIR_QPS_OFFSET	= 0,
+	OCRDMA_MBX_QUERY_CFG_MAX_RPIR_QPS_MASK		= 0xFFFF <<
+				OCRDMA_MBX_QUERY_CFG_MAX_RPIR_QPS_OFFSET,
+
+	OCRDMA_MBX_QUERY_CFG_MAX_DPP_PDS_OFFSET		= 16,
+	OCRDMA_MBX_QUERY_CFG_MAX_DPP_PDS_MASK		= 0xFFFF <<
+				OCRDMA_MBX_QUERY_CFG_MAX_DPP_PDS_OFFSET,
+	OCRDMA_MBX_QUERY_CFG_MAX_DPP_CREDITS_OFFSET	= 0,
+	OCRDMA_MBX_QUERY_CFG_MAX_DPP_CREDITS_MASK	= 0xFFFF <<
+				OCRDMA_MBX_QUERY_CFG_MAX_DPP_CREDITS_OFFSET,
+
+	OCRDMA_MBX_QUERY_CFG_MAX_DPP_QPS_OFFSET		= 0,
+	OCRDMA_MBX_QUERY_CFG_MAX_DPP_QPS_MASK		= 0xFFFF <<
+				OCRDMA_MBX_QUERY_CFG_MAX_DPP_QPS_OFFSET,
+
+	OCRDMA_MBX_QUERY_CFG_MAX_WQES_PER_WQ_OFFSET	= 16,
+	OCRDMA_MBX_QUERY_CFG_MAX_WQES_PER_WQ_MASK	= 0xFFFF <<
+				OCRDMA_MBX_QUERY_CFG_MAX_WQES_PER_WQ_OFFSET,
+	OCRDMA_MBX_QUERY_CFG_MAX_RQES_PER_RQ_OFFSET	= 0,
+	OCRDMA_MBX_QUERY_CFG_MAX_RQES_PER_RQ_MASK	= 0xFFFF <<
+				OCRDMA_MBX_QUERY_CFG_MAX_RQES_PER_RQ_OFFSET,
+
+	OCRDMA_MBX_QUERY_CFG_MAX_CQ_OFFSET		= 16,
+	OCRDMA_MBX_QUERY_CFG_MAX_CQ_MASK		= 0xFFFF <<
+				OCRDMA_MBX_QUERY_CFG_MAX_CQ_OFFSET,
+	OCRDMA_MBX_QUERY_CFG_MAX_CQES_PER_CQ_OFFSET	= 0,
+	OCRDMA_MBX_QUERY_CFG_MAX_CQES_PER_CQ_MASK	= 0xFFFF <<
+				OCRDMA_MBX_QUERY_CFG_MAX_CQES_PER_CQ_OFFSET,
+
+	OCRDMA_MBX_QUERY_CFG_MAX_SRQ_RQE_OFFSET		= 16,
+	OCRDMA_MBX_QUERY_CFG_MAX_SRQ_RQE_MASK		= 0xFFFF <<
+				OCRDMA_MBX_QUERY_CFG_MAX_SRQ_RQE_OFFSET,
+	OCRDMA_MBX_QUERY_CFG_MAX_SRQ_SGE_OFFSET		= 0,
+	OCRDMA_MBX_QUERY_CFG_MAX_SRQ_SGE_MASK		= 0xFFFF <<
+				OCRDMA_MBX_QUERY_CFG_MAX_SRQ_SGE_OFFSET,
+};
+
+struct ocrdma_mbx_query_config {
+	struct ocrdma_mqe_hdr hdr;
+	struct ocrdma_mbx_rsp rsp;
+	u32 qp_srq_cq_ird_ord;
+	u32 max_pd_ca_ack_delay;
+	u32 max_write_send_sge;
+	u32 max_ird_ord_per_qp;
+	u32 max_shared_ird_ord;
+	u32 max_mr;
+	u32 max_mr_size_hi;
+	u32 max_mr_size_lo;
+	u32 max_num_mr_pbl;
+	u32 max_mw;
+	u32 max_fmr;
+	u32 max_pages_per_frmr;
+	u32 max_mcast_group;
+	u32 max_mcast_qp_attach;
+	u32 max_total_mcast_qp_attach;
+	u32 wqe_rqe_stride_max_dpp_cqs;
+	u32 max_srq_rpir_qps;
+	u32 max_dpp_pds_credits;
+	u32 max_dpp_credits_pds_per_pd;
+	u32 max_wqes_rqes_per_q;
+	u32 max_cq_cqes_per_cq;
+	u32 max_srq_rqe_sge;
+};
+
+struct ocrdma_fw_ver_rsp {
+	struct ocrdma_mqe_hdr hdr;
+	struct ocrdma_mbx_rsp rsp;
+
+	u8 running_ver[32];
+};
+
+struct ocrdma_fw_conf_rsp {
+	struct ocrdma_mqe_hdr hdr;
+	struct ocrdma_mbx_rsp rsp;
+
+	u32 config_num;
+	u32 asic_revision;
+	u32 phy_port;
+	u32 fn_mode;
+	struct {
+		u32 mode;
+		u32 nic_wqid_base;
+		u32 nic_wq_tot;
+		u32 prot_wqid_base;
+		u32 prot_wq_tot;
+		u32 prot_rqid_base;
+		u32 prot_rqid_tot;
+		u32 rsvd[6];
+	} ulp[2];
+	u32 fn_capabilities;
+	u32 rsvd1;
+	u32 rsvd2;
+	u32 base_eqid;
+	u32 max_eq;
+
+};
+
+enum {
+	OCRDMA_FN_MODE_RDMA	= 0x4
+};
+
+enum {
+	OCRDMA_IF_TYPE_MASK		= 0xFFFF0000,
+	OCRDMA_IF_TYPE_SHIFT		= 0x10,
+	OCRDMA_PHY_TYPE_MASK		= 0x0000FFFF,
+	OCRDMA_FUTURE_DETAILS_MASK	= 0xFFFF0000,
+	OCRDMA_FUTURE_DETAILS_SHIFT	= 0x10,
+	OCRDMA_EX_PHY_DETAILS_MASK	= 0x0000FFFF,
+	OCRDMA_FSPEED_SUPP_MASK		= 0xFFFF0000,
+	OCRDMA_FSPEED_SUPP_SHIFT	= 0x10,
+	OCRDMA_ASPEED_SUPP_MASK		= 0x0000FFFF
+};
+
+struct ocrdma_get_phy_info_rsp {
+	struct ocrdma_mqe_hdr hdr;
+	struct ocrdma_mbx_rsp rsp;
+
+	u32 ityp_ptyp;
+	u32 misc_params;
+	u32 ftrdtl_exphydtl;
+	u32 fspeed_aspeed;
+	u32 future_use[2];
+};
+
+enum {
+	OCRDMA_PHY_SPEED_ZERO = 0x0,
+	OCRDMA_PHY_SPEED_10MBPS = 0x1,
+	OCRDMA_PHY_SPEED_100MBPS = 0x2,
+	OCRDMA_PHY_SPEED_1GBPS = 0x4,
+	OCRDMA_PHY_SPEED_10GBPS = 0x8,
+	OCRDMA_PHY_SPEED_40GBPS = 0x20
+};
+
+enum {
+	OCRDMA_PORT_NUM_MASK	= 0x3F,
+	OCRDMA_PT_MASK		= 0xC0,
+	OCRDMA_PT_SHIFT		= 0x6,
+	OCRDMA_LINK_DUP_MASK	= 0x0000FF00,
+	OCRDMA_LINK_DUP_SHIFT	= 0x8,
+	OCRDMA_PHY_PS_MASK	= 0x00FF0000,
+	OCRDMA_PHY_PS_SHIFT	= 0x10,
+	OCRDMA_PHY_PFLT_MASK	= 0xFF000000,
+	OCRDMA_PHY_PFLT_SHIFT	= 0x18,
+	OCRDMA_QOS_LNKSP_MASK	= 0xFFFF0000,
+	OCRDMA_QOS_LNKSP_SHIFT	= 0x10,
+	OCRDMA_LLST_MASK	= 0xFF,
+	OCRDMA_PLFC_MASK	= 0x00000400,
+	OCRDMA_PLFC_SHIFT	= 0x8,
+	OCRDMA_PLRFC_MASK	= 0x00000200,
+	OCRDMA_PLRFC_SHIFT	= 0x8,
+	OCRDMA_PLTFC_MASK	= 0x00000100,
+	OCRDMA_PLTFC_SHIFT	= 0x8
+};
+
+struct ocrdma_get_link_speed_rsp {
+	struct ocrdma_mqe_hdr hdr;
+	struct ocrdma_mbx_rsp rsp;
+
+	u32 pflt_pps_ld_pnum;
+	u32 qos_lsp;
+	u32 res_lls;
+};
+
+enum {
+	OCRDMA_PHYS_LINK_SPEED_ZERO = 0x0,
+	OCRDMA_PHYS_LINK_SPEED_10MBPS = 0x1,
+	OCRDMA_PHYS_LINK_SPEED_100MBPS = 0x2,
+	OCRDMA_PHYS_LINK_SPEED_1GBPS = 0x3,
+	OCRDMA_PHYS_LINK_SPEED_10GBPS = 0x4,
+	OCRDMA_PHYS_LINK_SPEED_20GBPS = 0x5,
+	OCRDMA_PHYS_LINK_SPEED_25GBPS = 0x6,
+	OCRDMA_PHYS_LINK_SPEED_40GBPS = 0x7,
+	OCRDMA_PHYS_LINK_SPEED_100GBPS = 0x8
+};
+
+enum {
+	OCRDMA_CREATE_CQ_VER2			= 2,
+	OCRDMA_CREATE_CQ_VER3			= 3,
+
+	OCRDMA_CREATE_CQ_PAGE_CNT_MASK		= 0xFFFF,
+	OCRDMA_CREATE_CQ_PAGE_SIZE_SHIFT	= 16,
+	OCRDMA_CREATE_CQ_PAGE_SIZE_MASK		= 0xFF,
+
+	OCRDMA_CREATE_CQ_COALESCWM_SHIFT	= 12,
+	OCRDMA_CREATE_CQ_COALESCWM_MASK		= BIT(13) | BIT(12),
+	OCRDMA_CREATE_CQ_FLAGS_NODELAY		= BIT(14),
+	OCRDMA_CREATE_CQ_FLAGS_AUTO_VALID	= BIT(15),
+
+	OCRDMA_CREATE_CQ_EQ_ID_MASK		= 0xFFFF,
+	OCRDMA_CREATE_CQ_CQE_COUNT_MASK		= 0xFFFF
+};
+
+enum {
+	OCRDMA_CREATE_CQ_VER0			= 0,
+	OCRDMA_CREATE_CQ_DPP			= 1,
+	OCRDMA_CREATE_CQ_TYPE_SHIFT		= 24,
+	OCRDMA_CREATE_CQ_EQID_SHIFT		= 22,
+
+	OCRDMA_CREATE_CQ_CNT_SHIFT		= 27,
+	OCRDMA_CREATE_CQ_FLAGS_VALID		= BIT(29),
+	OCRDMA_CREATE_CQ_FLAGS_EVENTABLE	= BIT(31),
+	OCRDMA_CREATE_CQ_DEF_FLAGS		= OCRDMA_CREATE_CQ_FLAGS_VALID |
+					OCRDMA_CREATE_CQ_FLAGS_EVENTABLE |
+					OCRDMA_CREATE_CQ_FLAGS_NODELAY
+};
+
+struct ocrdma_create_cq_cmd {
+	struct ocrdma_mbx_hdr req;
+	u32 pgsz_pgcnt;
+	u32 ev_cnt_flags;
+	u32 eqn;
+	u32 pdid_cqecnt;
+	u32 rsvd6;
+	struct ocrdma_pa pa[OCRDMA_CREATE_CQ_MAX_PAGES];
+};
+
+struct ocrdma_create_cq {
+	struct ocrdma_mqe_hdr hdr;
+	struct ocrdma_create_cq_cmd cmd;
+};
+
+enum {
+	OCRDMA_CREATE_CQ_CMD_PDID_SHIFT	= 0x10
+};
+
+enum {
+	OCRDMA_CREATE_CQ_RSP_CQ_ID_MASK	= 0xFFFF
+};
+
+struct ocrdma_create_cq_cmd_rsp {
+	struct ocrdma_mbx_rsp rsp;
+	u32 cq_id;
+};
+
+struct ocrdma_create_cq_rsp {
+	struct ocrdma_mqe_hdr hdr;
+	struct ocrdma_create_cq_cmd_rsp rsp;
+};
+
+enum {
+	OCRDMA_CREATE_MQ_V0_CQ_ID_SHIFT		= 22,
+	OCRDMA_CREATE_MQ_CQ_ID_SHIFT		= 16,
+	OCRDMA_CREATE_MQ_RING_SIZE_SHIFT	= 16,
+	OCRDMA_CREATE_MQ_VALID			= BIT(31),
+	OCRDMA_CREATE_MQ_ASYNC_CQ_VALID		= BIT(0)
+};
+
+struct ocrdma_create_mq_req {
+	struct ocrdma_mbx_hdr req;
+	u32 cqid_pages;
+	u32 async_event_bitmap;
+	u32 async_cqid_ringsize;
+	u32 valid;
+	u32 async_cqid_valid;
+	u32 rsvd;
+	struct ocrdma_pa pa[8];
+};
+
+struct ocrdma_create_mq_rsp {
+	struct ocrdma_mbx_rsp rsp;
+	u32 id;
+};
+
+enum {
+	OCRDMA_DESTROY_CQ_QID_SHIFT			= 0,
+	OCRDMA_DESTROY_CQ_QID_MASK			= 0xFFFF,
+	OCRDMA_DESTROY_CQ_QID_BYPASS_FLUSH_SHIFT	= 16,
+	OCRDMA_DESTROY_CQ_QID_BYPASS_FLUSH_MASK		= 0xFFFF <<
+				OCRDMA_DESTROY_CQ_QID_BYPASS_FLUSH_SHIFT
+};
+
+struct ocrdma_destroy_cq {
+	struct ocrdma_mqe_hdr hdr;
+	struct ocrdma_mbx_hdr req;
+
+	u32 bypass_flush_qid;
+};
+
+struct ocrdma_destroy_cq_rsp {
+	struct ocrdma_mqe_hdr hdr;
+	struct ocrdma_mbx_rsp rsp;
+};
+
+enum {
+	OCRDMA_QPT_GSI	= 1,
+	OCRDMA_QPT_RC	= 2,
+	OCRDMA_QPT_UD	= 4,
+};
+
+enum {
+	OCRDMA_CREATE_QP_REQ_PD_ID_SHIFT	= 0,
+	OCRDMA_CREATE_QP_REQ_PD_ID_MASK		= 0xFFFF,
+	OCRDMA_CREATE_QP_REQ_SQ_PAGE_SIZE_SHIFT	= 16,
+	OCRDMA_CREATE_QP_REQ_RQ_PAGE_SIZE_SHIFT	= 19,
+	OCRDMA_CREATE_QP_REQ_QPT_SHIFT		= 29,
+	OCRDMA_CREATE_QP_REQ_QPT_MASK		= BIT(31) | BIT(30) | BIT(29),
+
+	OCRDMA_CREATE_QP_REQ_MAX_RQE_SHIFT	= 0,
+	OCRDMA_CREATE_QP_REQ_MAX_RQE_MASK	= 0xFFFF,
+	OCRDMA_CREATE_QP_REQ_MAX_WQE_SHIFT	= 16,
+	OCRDMA_CREATE_QP_REQ_MAX_WQE_MASK	= 0xFFFF <<
+					OCRDMA_CREATE_QP_REQ_MAX_WQE_SHIFT,
+
+	OCRDMA_CREATE_QP_REQ_MAX_SGE_WRITE_SHIFT	= 0,
+	OCRDMA_CREATE_QP_REQ_MAX_SGE_WRITE_MASK		= 0xFFFF,
+	OCRDMA_CREATE_QP_REQ_MAX_SGE_SEND_SHIFT		= 16,
+	OCRDMA_CREATE_QP_REQ_MAX_SGE_SEND_MASK		= 0xFFFF <<
+					OCRDMA_CREATE_QP_REQ_MAX_SGE_SEND_SHIFT,
+
+	OCRDMA_CREATE_QP_REQ_FMR_EN_SHIFT		= 0,
+	OCRDMA_CREATE_QP_REQ_FMR_EN_MASK		= BIT(0),
+	OCRDMA_CREATE_QP_REQ_ZERO_LKEYEN_SHIFT		= 1,
+	OCRDMA_CREATE_QP_REQ_ZERO_LKEYEN_MASK		= BIT(1),
+	OCRDMA_CREATE_QP_REQ_BIND_MEMWIN_SHIFT		= 2,
+	OCRDMA_CREATE_QP_REQ_BIND_MEMWIN_MASK		= BIT(2),
+	OCRDMA_CREATE_QP_REQ_INB_WREN_SHIFT		= 3,
+	OCRDMA_CREATE_QP_REQ_INB_WREN_MASK		= BIT(3),
+	OCRDMA_CREATE_QP_REQ_INB_RDEN_SHIFT		= 4,
+	OCRDMA_CREATE_QP_REQ_INB_RDEN_MASK		= BIT(4),
+	OCRDMA_CREATE_QP_REQ_USE_SRQ_SHIFT		= 5,
+	OCRDMA_CREATE_QP_REQ_USE_SRQ_MASK		= BIT(5),
+	OCRDMA_CREATE_QP_REQ_ENABLE_RPIR_SHIFT		= 6,
+	OCRDMA_CREATE_QP_REQ_ENABLE_RPIR_MASK		= BIT(6),
+	OCRDMA_CREATE_QP_REQ_ENABLE_DPP_SHIFT		= 7,
+	OCRDMA_CREATE_QP_REQ_ENABLE_DPP_MASK		= BIT(7),
+	OCRDMA_CREATE_QP_REQ_ENABLE_DPP_CQ_SHIFT	= 8,
+	OCRDMA_CREATE_QP_REQ_ENABLE_DPP_CQ_MASK		= BIT(8),
+	OCRDMA_CREATE_QP_REQ_MAX_SGE_RECV_SHIFT		= 16,
+	OCRDMA_CREATE_QP_REQ_MAX_SGE_RECV_MASK		= 0xFFFF <<
+				OCRDMA_CREATE_QP_REQ_MAX_SGE_RECV_SHIFT,
+
+	OCRDMA_CREATE_QP_REQ_MAX_IRD_SHIFT		= 0,
+	OCRDMA_CREATE_QP_REQ_MAX_IRD_MASK		= 0xFFFF,
+	OCRDMA_CREATE_QP_REQ_MAX_ORD_SHIFT		= 16,
+	OCRDMA_CREATE_QP_REQ_MAX_ORD_MASK		= 0xFFFF <<
+				OCRDMA_CREATE_QP_REQ_MAX_ORD_SHIFT,
+
+	OCRDMA_CREATE_QP_REQ_NUM_RQ_PAGES_SHIFT		= 0,
+	OCRDMA_CREATE_QP_REQ_NUM_RQ_PAGES_MASK		= 0xFFFF,
+	OCRDMA_CREATE_QP_REQ_NUM_WQ_PAGES_SHIFT		= 16,
+	OCRDMA_CREATE_QP_REQ_NUM_WQ_PAGES_MASK		= 0xFFFF <<
+				OCRDMA_CREATE_QP_REQ_NUM_WQ_PAGES_SHIFT,
+
+	OCRDMA_CREATE_QP_REQ_RQE_SIZE_SHIFT		= 0,
+	OCRDMA_CREATE_QP_REQ_RQE_SIZE_MASK		= 0xFFFF,
+	OCRDMA_CREATE_QP_REQ_WQE_SIZE_SHIFT		= 16,
+	OCRDMA_CREATE_QP_REQ_WQE_SIZE_MASK		= 0xFFFF <<
+				OCRDMA_CREATE_QP_REQ_WQE_SIZE_SHIFT,
+
+	OCRDMA_CREATE_QP_REQ_RQ_CQID_SHIFT		= 0,
+	OCRDMA_CREATE_QP_REQ_RQ_CQID_MASK		= 0xFFFF,
+	OCRDMA_CREATE_QP_REQ_WQ_CQID_SHIFT		= 16,
+	OCRDMA_CREATE_QP_REQ_WQ_CQID_MASK		= 0xFFFF <<
+				OCRDMA_CREATE_QP_REQ_WQ_CQID_SHIFT,
+
+	OCRDMA_CREATE_QP_REQ_DPP_CQPID_SHIFT		= 0,
+	OCRDMA_CREATE_QP_REQ_DPP_CQPID_MASK		= 0xFFFF,
+	OCRDMA_CREATE_QP_REQ_DPP_CREDIT_SHIFT		= 16,
+	OCRDMA_CREATE_QP_REQ_DPP_CREDIT_MASK		= 0xFFFF <<
+				OCRDMA_CREATE_QP_REQ_DPP_CREDIT_SHIFT
+};
+
+enum {
+	OCRDMA_CREATE_QP_REQ_DPP_CREDIT_LIMIT	= 16,
+	OCRDMA_CREATE_QP_RSP_DPP_PAGE_SHIFT	= 1
+};
+
+#define MAX_OCRDMA_IRD_PAGES 4
+
+enum ocrdma_qp_flags {
+	OCRDMA_QP_MW_BIND	= 1,
+	OCRDMA_QP_LKEY0		= (1 << 1),
+	OCRDMA_QP_FAST_REG	= (1 << 2),
+	OCRDMA_QP_INB_RD	= (1 << 6),
+	OCRDMA_QP_INB_WR	= (1 << 7),
+};
+
+enum ocrdma_qp_state {
+	OCRDMA_QPS_RST		= 0,
+	OCRDMA_QPS_INIT		= 1,
+	OCRDMA_QPS_RTR		= 2,
+	OCRDMA_QPS_RTS		= 3,
+	OCRDMA_QPS_SQE		= 4,
+	OCRDMA_QPS_SQ_DRAINING	= 5,
+	OCRDMA_QPS_ERR		= 6,
+	OCRDMA_QPS_SQD		= 7
+};
+
+struct ocrdma_create_qp_req {
+	struct ocrdma_mqe_hdr hdr;
+	struct ocrdma_mbx_hdr req;
+
+	u32 type_pgsz_pdn;
+	u32 max_wqe_rqe;
+	u32 max_sge_send_write;
+	u32 max_sge_recv_flags;
+	u32 max_ord_ird;
+	u32 num_wq_rq_pages;
+	u32 wqe_rqe_size;
+	u32 wq_rq_cqid;
+	struct ocrdma_pa wq_addr[MAX_OCRDMA_QP_PAGES];
+	struct ocrdma_pa rq_addr[MAX_OCRDMA_QP_PAGES];
+	u32 dpp_credits_cqid;
+	u32 rpir_lkey;
+	struct ocrdma_pa ird_addr[MAX_OCRDMA_IRD_PAGES];
+};
+
+enum {
+	OCRDMA_CREATE_QP_RSP_QP_ID_SHIFT		= 0,
+	OCRDMA_CREATE_QP_RSP_QP_ID_MASK			= 0xFFFF,
+
+	OCRDMA_CREATE_QP_RSP_MAX_RQE_SHIFT		= 0,
+	OCRDMA_CREATE_QP_RSP_MAX_RQE_MASK		= 0xFFFF,
+	OCRDMA_CREATE_QP_RSP_MAX_WQE_SHIFT		= 16,
+	OCRDMA_CREATE_QP_RSP_MAX_WQE_MASK		= 0xFFFF <<
+				OCRDMA_CREATE_QP_RSP_MAX_WQE_SHIFT,
+
+	OCRDMA_CREATE_QP_RSP_MAX_SGE_WRITE_SHIFT	= 0,
+	OCRDMA_CREATE_QP_RSP_MAX_SGE_WRITE_MASK		= 0xFFFF,
+	OCRDMA_CREATE_QP_RSP_MAX_SGE_SEND_SHIFT		= 16,
+	OCRDMA_CREATE_QP_RSP_MAX_SGE_SEND_MASK		= 0xFFFF <<
+				OCRDMA_CREATE_QP_RSP_MAX_SGE_SEND_SHIFT,
+
+	OCRDMA_CREATE_QP_RSP_MAX_SGE_RECV_SHIFT		= 16,
+	OCRDMA_CREATE_QP_RSP_MAX_SGE_RECV_MASK		= 0xFFFF <<
+				OCRDMA_CREATE_QP_RSP_MAX_SGE_RECV_SHIFT,
+
+	OCRDMA_CREATE_QP_RSP_MAX_IRD_SHIFT		= 0,
+	OCRDMA_CREATE_QP_RSP_MAX_IRD_MASK		= 0xFFFF,
+	OCRDMA_CREATE_QP_RSP_MAX_ORD_SHIFT		= 16,
+	OCRDMA_CREATE_QP_RSP_MAX_ORD_MASK		= 0xFFFF <<
+				OCRDMA_CREATE_QP_RSP_MAX_ORD_SHIFT,
+
+	OCRDMA_CREATE_QP_RSP_RQ_ID_SHIFT		= 0,
+	OCRDMA_CREATE_QP_RSP_RQ_ID_MASK			= 0xFFFF,
+	OCRDMA_CREATE_QP_RSP_SQ_ID_SHIFT		= 16,
+	OCRDMA_CREATE_QP_RSP_SQ_ID_MASK			= 0xFFFF <<
+				OCRDMA_CREATE_QP_RSP_SQ_ID_SHIFT,
+
+	OCRDMA_CREATE_QP_RSP_DPP_ENABLED_MASK		= BIT(0),
+	OCRDMA_CREATE_QP_RSP_DPP_PAGE_OFFSET_SHIFT	= 1,
+	OCRDMA_CREATE_QP_RSP_DPP_PAGE_OFFSET_MASK	= 0x7FFF <<
+				OCRDMA_CREATE_QP_RSP_DPP_PAGE_OFFSET_SHIFT,
+	OCRDMA_CREATE_QP_RSP_DPP_CREDITS_SHIFT		= 16,
+	OCRDMA_CREATE_QP_RSP_DPP_CREDITS_MASK		= 0xFFFF <<
+				OCRDMA_CREATE_QP_RSP_DPP_CREDITS_SHIFT,
+};
+
+struct ocrdma_create_qp_rsp {
+	struct ocrdma_mqe_hdr hdr;
+	struct ocrdma_mbx_rsp rsp;
+
+	u32 qp_id;
+	u32 max_wqe_rqe;
+	u32 max_sge_send_write;
+	u32 max_sge_recv;
+	u32 max_ord_ird;
+	u32 sq_rq_id;
+	u32 dpp_response;
+};
+
+struct ocrdma_destroy_qp {
+	struct ocrdma_mqe_hdr hdr;
+	struct ocrdma_mbx_hdr req;
+	u32 qp_id;
+};
+
+struct ocrdma_destroy_qp_rsp {
+	struct ocrdma_mqe_hdr hdr;
+	struct ocrdma_mbx_rsp rsp;
+};
+
+enum {
+	OCRDMA_MODIFY_QP_ID_SHIFT	= 0,
+	OCRDMA_MODIFY_QP_ID_MASK	= 0xFFFF,
+
+	OCRDMA_QP_PARA_QPS_VALID	= BIT(0),
+	OCRDMA_QP_PARA_SQD_ASYNC_VALID	= BIT(1),
+	OCRDMA_QP_PARA_PKEY_VALID	= BIT(2),
+	OCRDMA_QP_PARA_QKEY_VALID	= BIT(3),
+	OCRDMA_QP_PARA_PMTU_VALID	= BIT(4),
+	OCRDMA_QP_PARA_ACK_TO_VALID	= BIT(5),
+	OCRDMA_QP_PARA_RETRY_CNT_VALID	= BIT(6),
+	OCRDMA_QP_PARA_RRC_VALID	= BIT(7),
+	OCRDMA_QP_PARA_RQPSN_VALID	= BIT(8),
+	OCRDMA_QP_PARA_MAX_IRD_VALID	= BIT(9),
+	OCRDMA_QP_PARA_MAX_ORD_VALID	= BIT(10),
+	OCRDMA_QP_PARA_RNT_VALID	= BIT(11),
+	OCRDMA_QP_PARA_SQPSN_VALID	= BIT(12),
+	OCRDMA_QP_PARA_DST_QPN_VALID	= BIT(13),
+	OCRDMA_QP_PARA_MAX_WQE_VALID	= BIT(14),
+	OCRDMA_QP_PARA_MAX_RQE_VALID	= BIT(15),
+	OCRDMA_QP_PARA_SGE_SEND_VALID	= BIT(16),
+	OCRDMA_QP_PARA_SGE_RECV_VALID	= BIT(17),
+	OCRDMA_QP_PARA_SGE_WR_VALID	= BIT(18),
+	OCRDMA_QP_PARA_INB_RDEN_VALID	= BIT(19),
+	OCRDMA_QP_PARA_INB_WREN_VALID	= BIT(20),
+	OCRDMA_QP_PARA_FLOW_LBL_VALID	= BIT(21),
+	OCRDMA_QP_PARA_BIND_EN_VALID	= BIT(22),
+	OCRDMA_QP_PARA_ZLKEY_EN_VALID	= BIT(23),
+	OCRDMA_QP_PARA_FMR_EN_VALID	= BIT(24),
+	OCRDMA_QP_PARA_INBAT_EN_VALID	= BIT(25),
+	OCRDMA_QP_PARA_VLAN_EN_VALID	= BIT(26),
+
+	OCRDMA_MODIFY_QP_FLAGS_RD	= BIT(0),
+	OCRDMA_MODIFY_QP_FLAGS_WR	= BIT(1),
+	OCRDMA_MODIFY_QP_FLAGS_SEND	= BIT(2),
+	OCRDMA_MODIFY_QP_FLAGS_ATOMIC	= BIT(3)
+};
+
+enum {
+	OCRDMA_QP_PARAMS_SRQ_ID_SHIFT		= 0,
+	OCRDMA_QP_PARAMS_SRQ_ID_MASK		= 0xFFFF,
+
+	OCRDMA_QP_PARAMS_MAX_RQE_SHIFT		= 0,
+	OCRDMA_QP_PARAMS_MAX_RQE_MASK		= 0xFFFF,
+	OCRDMA_QP_PARAMS_MAX_WQE_SHIFT		= 16,
+	OCRDMA_QP_PARAMS_MAX_WQE_MASK		= 0xFFFF <<
+	    OCRDMA_QP_PARAMS_MAX_WQE_SHIFT,
+
+	OCRDMA_QP_PARAMS_MAX_SGE_WRITE_SHIFT	= 0,
+	OCRDMA_QP_PARAMS_MAX_SGE_WRITE_MASK	= 0xFFFF,
+	OCRDMA_QP_PARAMS_MAX_SGE_SEND_SHIFT	= 16,
+	OCRDMA_QP_PARAMS_MAX_SGE_SEND_MASK	= 0xFFFF <<
+					OCRDMA_QP_PARAMS_MAX_SGE_SEND_SHIFT,
+
+	OCRDMA_QP_PARAMS_FLAGS_FMR_EN		= BIT(0),
+	OCRDMA_QP_PARAMS_FLAGS_LKEY_0_EN	= BIT(1),
+	OCRDMA_QP_PARAMS_FLAGS_BIND_MW_EN	= BIT(2),
+	OCRDMA_QP_PARAMS_FLAGS_INBWR_EN		= BIT(3),
+	OCRDMA_QP_PARAMS_FLAGS_INBRD_EN		= BIT(4),
+	OCRDMA_QP_PARAMS_STATE_SHIFT		= 5,
+	OCRDMA_QP_PARAMS_STATE_MASK		= BIT(5) | BIT(6) | BIT(7),
+	OCRDMA_QP_PARAMS_FLAGS_SQD_ASYNC	= BIT(8),
+	OCRDMA_QP_PARAMS_FLAGS_INB_ATEN		= BIT(9),
+	OCRDMA_QP_PARAMS_MAX_SGE_RECV_SHIFT	= 16,
+	OCRDMA_QP_PARAMS_MAX_SGE_RECV_MASK	= 0xFFFF <<
+					OCRDMA_QP_PARAMS_MAX_SGE_RECV_SHIFT,
+
+	OCRDMA_QP_PARAMS_MAX_IRD_SHIFT		= 0,
+	OCRDMA_QP_PARAMS_MAX_IRD_MASK		= 0xFFFF,
+	OCRDMA_QP_PARAMS_MAX_ORD_SHIFT		= 16,
+	OCRDMA_QP_PARAMS_MAX_ORD_MASK		= 0xFFFF <<
+					OCRDMA_QP_PARAMS_MAX_ORD_SHIFT,
+
+	OCRDMA_QP_PARAMS_RQ_CQID_SHIFT		= 0,
+	OCRDMA_QP_PARAMS_RQ_CQID_MASK		= 0xFFFF,
+	OCRDMA_QP_PARAMS_WQ_CQID_SHIFT		= 16,
+	OCRDMA_QP_PARAMS_WQ_CQID_MASK		= 0xFFFF <<
+					OCRDMA_QP_PARAMS_WQ_CQID_SHIFT,
+
+	OCRDMA_QP_PARAMS_RQ_PSN_SHIFT		= 0,
+	OCRDMA_QP_PARAMS_RQ_PSN_MASK		= 0xFFFFFF,
+	OCRDMA_QP_PARAMS_HOP_LMT_SHIFT		= 24,
+	OCRDMA_QP_PARAMS_HOP_LMT_MASK		= 0xFF <<
+					OCRDMA_QP_PARAMS_HOP_LMT_SHIFT,
+
+	OCRDMA_QP_PARAMS_SQ_PSN_SHIFT		= 0,
+	OCRDMA_QP_PARAMS_SQ_PSN_MASK		= 0xFFFFFF,
+	OCRDMA_QP_PARAMS_TCLASS_SHIFT		= 24,
+	OCRDMA_QP_PARAMS_TCLASS_MASK		= 0xFF <<
+					OCRDMA_QP_PARAMS_TCLASS_SHIFT,
+
+	OCRDMA_QP_PARAMS_DEST_QPN_SHIFT		= 0,
+	OCRDMA_QP_PARAMS_DEST_QPN_MASK		= 0xFFFFFF,
+	OCRDMA_QP_PARAMS_RNR_RETRY_CNT_SHIFT	= 24,
+	OCRDMA_QP_PARAMS_RNR_RETRY_CNT_MASK	= 0x7 <<
+					OCRDMA_QP_PARAMS_RNR_RETRY_CNT_SHIFT,
+	OCRDMA_QP_PARAMS_ACK_TIMEOUT_SHIFT	= 27,
+	OCRDMA_QP_PARAMS_ACK_TIMEOUT_MASK	= 0x1F <<
+					OCRDMA_QP_PARAMS_ACK_TIMEOUT_SHIFT,
+
+	OCRDMA_QP_PARAMS_PKEY_IDNEX_SHIFT	= 0,
+	OCRDMA_QP_PARAMS_PKEY_INDEX_MASK	= 0xFFFF,
+	OCRDMA_QP_PARAMS_PATH_MTU_SHIFT		= 18,
+	OCRDMA_QP_PARAMS_PATH_MTU_MASK		= 0x3FFF <<
+					OCRDMA_QP_PARAMS_PATH_MTU_SHIFT,
+
+	OCRDMA_QP_PARAMS_FLOW_LABEL_SHIFT	= 0,
+	OCRDMA_QP_PARAMS_FLOW_LABEL_MASK	= 0xFFFFF,
+	OCRDMA_QP_PARAMS_SL_SHIFT		= 20,
+	OCRDMA_QP_PARAMS_SL_MASK		= 0xF <<
+					OCRDMA_QP_PARAMS_SL_SHIFT,
+	OCRDMA_QP_PARAMS_RETRY_CNT_SHIFT	= 24,
+	OCRDMA_QP_PARAMS_RETRY_CNT_MASK		= 0x7 <<
+					OCRDMA_QP_PARAMS_RETRY_CNT_SHIFT,
+	OCRDMA_QP_PARAMS_RNR_NAK_TIMER_SHIFT	= 27,
+	OCRDMA_QP_PARAMS_RNR_NAK_TIMER_MASK	= 0x1F <<
+					OCRDMA_QP_PARAMS_RNR_NAK_TIMER_SHIFT,
+
+	OCRDMA_QP_PARAMS_DMAC_B4_TO_B5_SHIFT	= 0,
+	OCRDMA_QP_PARAMS_DMAC_B4_TO_B5_MASK	= 0xFFFF,
+	OCRDMA_QP_PARAMS_VLAN_SHIFT		= 16,
+	OCRDMA_QP_PARAMS_VLAN_MASK		= 0xFFFF <<
+					OCRDMA_QP_PARAMS_VLAN_SHIFT
+};
+
+struct ocrdma_qp_params {
+	u32 id;
+	u32 max_wqe_rqe;
+	u32 max_sge_send_write;
+	u32 max_sge_recv_flags;
+	u32 max_ord_ird;
+	u32 wq_rq_cqid;
+	u32 hop_lmt_rq_psn;
+	u32 tclass_sq_psn;
+	u32 ack_to_rnr_rtc_dest_qpn;
+	u32 path_mtu_pkey_indx;
+	u32 rnt_rc_sl_fl;
+	u8 sgid[16];
+	u8 dgid[16];
+	u32 dmac_b0_to_b3;
+	u32 vlan_dmac_b4_to_b5;
+	u32 qkey;
+};
+
+
+struct ocrdma_modify_qp {
+	struct ocrdma_mqe_hdr hdr;
+	struct ocrdma_mbx_hdr req;
+
+	struct ocrdma_qp_params params;
+	u32 flags;
+	u32 rdma_flags;
+	u32 num_outstanding_atomic_rd;
+};
+
+enum {
+	OCRDMA_MODIFY_QP_RSP_MAX_RQE_SHIFT	= 0,
+	OCRDMA_MODIFY_QP_RSP_MAX_RQE_MASK	= 0xFFFF,
+	OCRDMA_MODIFY_QP_RSP_MAX_WQE_SHIFT	= 16,
+	OCRDMA_MODIFY_QP_RSP_MAX_WQE_MASK	= 0xFFFF <<
+					OCRDMA_MODIFY_QP_RSP_MAX_WQE_SHIFT,
+
+	OCRDMA_MODIFY_QP_RSP_MAX_IRD_SHIFT	= 0,
+	OCRDMA_MODIFY_QP_RSP_MAX_IRD_MASK	= 0xFFFF,
+	OCRDMA_MODIFY_QP_RSP_MAX_ORD_SHIFT	= 16,
+	OCRDMA_MODIFY_QP_RSP_MAX_ORD_MASK	= 0xFFFF <<
+					OCRDMA_MODIFY_QP_RSP_MAX_ORD_SHIFT
+};
+
+struct ocrdma_modify_qp_rsp {
+	struct ocrdma_mqe_hdr hdr;
+	struct ocrdma_mbx_rsp rsp;
+
+	u32 max_wqe_rqe;
+	u32 max_ord_ird;
+};
+
+struct ocrdma_query_qp {
+	struct ocrdma_mqe_hdr hdr;
+	struct ocrdma_mbx_hdr req;
+
+#define OCRDMA_QUERY_UP_QP_ID_SHIFT	0
+#define OCRDMA_QUERY_UP_QP_ID_MASK	0xFFFFFF
+	u32 qp_id;
+};
+
+struct ocrdma_query_qp_rsp {
+	struct ocrdma_mqe_hdr hdr;
+	struct ocrdma_mbx_rsp rsp;
+	struct ocrdma_qp_params params;
+};
+
+enum {
+	OCRDMA_CREATE_SRQ_PD_ID_SHIFT		= 0,
+	OCRDMA_CREATE_SRQ_PD_ID_MASK		= 0xFFFF,
+	OCRDMA_CREATE_SRQ_PG_SZ_SHIFT		= 16,
+	OCRDMA_CREATE_SRQ_PG_SZ_MASK		= 0x3 <<
+					OCRDMA_CREATE_SRQ_PG_SZ_SHIFT,
+
+	OCRDMA_CREATE_SRQ_MAX_RQE_SHIFT		= 0,
+	OCRDMA_CREATE_SRQ_MAX_SGE_RECV_SHIFT	= 16,
+	OCRDMA_CREATE_SRQ_MAX_SGE_RECV_MASK	= 0xFFFF <<
+					OCRDMA_CREATE_SRQ_MAX_SGE_RECV_SHIFT,
+
+	OCRDMA_CREATE_SRQ_RQE_SIZE_SHIFT	= 0,
+	OCRDMA_CREATE_SRQ_RQE_SIZE_MASK		= 0xFFFF,
+	OCRDMA_CREATE_SRQ_NUM_RQ_PAGES_SHIFT	= 16,
+	OCRDMA_CREATE_SRQ_NUM_RQ_PAGES_MASK	= 0xFFFF <<
+					OCRDMA_CREATE_SRQ_NUM_RQ_PAGES_SHIFT
+};
+
+struct ocrdma_create_srq {
+	struct ocrdma_mqe_hdr hdr;
+	struct ocrdma_mbx_hdr req;
+
+	u32 pgsz_pdid;
+	u32 max_sge_rqe;
+	u32 pages_rqe_sz;
+	struct ocrdma_pa rq_addr[MAX_OCRDMA_SRQ_PAGES];
+};
+
+enum {
+	OCRDMA_CREATE_SRQ_RSP_SRQ_ID_SHIFT			= 0,
+	OCRDMA_CREATE_SRQ_RSP_SRQ_ID_MASK			= 0xFFFFFF,
+
+	OCRDMA_CREATE_SRQ_RSP_MAX_RQE_ALLOCATED_SHIFT		= 0,
+	OCRDMA_CREATE_SRQ_RSP_MAX_RQE_ALLOCATED_MASK		= 0xFFFF,
+	OCRDMA_CREATE_SRQ_RSP_MAX_SGE_RECV_ALLOCATED_SHIFT	= 16,
+	OCRDMA_CREATE_SRQ_RSP_MAX_SGE_RECV_ALLOCATED_MASK	= 0xFFFF <<
+			OCRDMA_CREATE_SRQ_RSP_MAX_SGE_RECV_ALLOCATED_SHIFT
+};
+
+struct ocrdma_create_srq_rsp {
+	struct ocrdma_mqe_hdr hdr;
+	struct ocrdma_mbx_rsp rsp;
+
+	u32 id;
+	u32 max_sge_rqe_allocated;
+};
+
+enum {
+	OCRDMA_MODIFY_SRQ_ID_SHIFT	= 0,
+	OCRDMA_MODIFY_SRQ_ID_MASK	= 0xFFFFFF,
+
+	OCRDMA_MODIFY_SRQ_MAX_RQE_SHIFT	= 0,
+	OCRDMA_MODIFY_SRQ_MAX_RQE_MASK	= 0xFFFF,
+	OCRDMA_MODIFY_SRQ_LIMIT_SHIFT	= 16,
+	OCRDMA_MODIFY_SRQ__LIMIT_MASK	= 0xFFFF <<
+					OCRDMA_MODIFY_SRQ_LIMIT_SHIFT
+};
+
+struct ocrdma_modify_srq {
+	struct ocrdma_mqe_hdr hdr;
+	struct ocrdma_mbx_rsp rep;
+
+	u32 id;
+	u32 limit_max_rqe;
+};
+
+enum {
+	OCRDMA_QUERY_SRQ_ID_SHIFT	= 0,
+	OCRDMA_QUERY_SRQ_ID_MASK	= 0xFFFFFF
+};
+
+struct ocrdma_query_srq {
+	struct ocrdma_mqe_hdr hdr;
+	struct ocrdma_mbx_rsp req;
+
+	u32 id;
+};
+
+enum {
+	OCRDMA_QUERY_SRQ_RSP_PD_ID_SHIFT	= 0,
+	OCRDMA_QUERY_SRQ_RSP_PD_ID_MASK		= 0xFFFF,
+	OCRDMA_QUERY_SRQ_RSP_MAX_RQE_SHIFT	= 16,
+	OCRDMA_QUERY_SRQ_RSP_MAX_RQE_MASK	= 0xFFFF <<
+					OCRDMA_QUERY_SRQ_RSP_MAX_RQE_SHIFT,
+
+	OCRDMA_QUERY_SRQ_RSP_MAX_SGE_RECV_SHIFT	= 0,
+	OCRDMA_QUERY_SRQ_RSP_MAX_SGE_RECV_MASK	= 0xFFFF,
+	OCRDMA_QUERY_SRQ_RSP_SRQ_LIMIT_SHIFT	= 16,
+	OCRDMA_QUERY_SRQ_RSP_SRQ_LIMIT_MASK	= 0xFFFF <<
+					OCRDMA_QUERY_SRQ_RSP_SRQ_LIMIT_SHIFT
+};
+
+struct ocrdma_query_srq_rsp {
+	struct ocrdma_mqe_hdr hdr;
+	struct ocrdma_mbx_rsp req;
+
+	u32 max_rqe_pdid;
+	u32 srq_lmt_max_sge;
+};
+
+enum {
+	OCRDMA_DESTROY_SRQ_ID_SHIFT	= 0,
+	OCRDMA_DESTROY_SRQ_ID_MASK	= 0xFFFFFF
+};
+
+struct ocrdma_destroy_srq {
+	struct ocrdma_mqe_hdr hdr;
+	struct ocrdma_mbx_rsp req;
+
+	u32 id;
+};
+
+enum {
+	OCRDMA_ALLOC_PD_ENABLE_DPP	= BIT(16),
+	OCRDMA_DPP_PAGE_SIZE		= 4096
+};
+
+struct ocrdma_alloc_pd {
+	struct ocrdma_mqe_hdr hdr;
+	struct ocrdma_mbx_hdr req;
+	u32 enable_dpp_rsvd;
+};
+
+enum {
+	OCRDMA_ALLOC_PD_RSP_DPP			= BIT(16),
+	OCRDMA_ALLOC_PD_RSP_DPP_PAGE_SHIFT	= 20,
+	OCRDMA_ALLOC_PD_RSP_PDID_MASK		= 0xFFFF,
+};
+
+struct ocrdma_alloc_pd_rsp {
+	struct ocrdma_mqe_hdr hdr;
+	struct ocrdma_mbx_rsp rsp;
+	u32 dpp_page_pdid;
+};
+
+struct ocrdma_dealloc_pd {
+	struct ocrdma_mqe_hdr hdr;
+	struct ocrdma_mbx_hdr req;
+	u32 id;
+};
+
+struct ocrdma_dealloc_pd_rsp {
+	struct ocrdma_mqe_hdr hdr;
+	struct ocrdma_mbx_rsp rsp;
+};
+
+enum {
+	OCRDMA_ADDR_CHECK_ENABLE	= 1,
+	OCRDMA_ADDR_CHECK_DISABLE	= 0
+};
+
+enum {
+	OCRDMA_ALLOC_LKEY_PD_ID_SHIFT		= 0,
+	OCRDMA_ALLOC_LKEY_PD_ID_MASK		= 0xFFFF,
+
+	OCRDMA_ALLOC_LKEY_ADDR_CHECK_SHIFT	= 0,
+	OCRDMA_ALLOC_LKEY_ADDR_CHECK_MASK	= BIT(0),
+	OCRDMA_ALLOC_LKEY_FMR_SHIFT		= 1,
+	OCRDMA_ALLOC_LKEY_FMR_MASK		= BIT(1),
+	OCRDMA_ALLOC_LKEY_REMOTE_INV_SHIFT	= 2,
+	OCRDMA_ALLOC_LKEY_REMOTE_INV_MASK	= BIT(2),
+	OCRDMA_ALLOC_LKEY_REMOTE_WR_SHIFT	= 3,
+	OCRDMA_ALLOC_LKEY_REMOTE_WR_MASK	= BIT(3),
+	OCRDMA_ALLOC_LKEY_REMOTE_RD_SHIFT	= 4,
+	OCRDMA_ALLOC_LKEY_REMOTE_RD_MASK	= BIT(4),
+	OCRDMA_ALLOC_LKEY_LOCAL_WR_SHIFT	= 5,
+	OCRDMA_ALLOC_LKEY_LOCAL_WR_MASK		= BIT(5),
+	OCRDMA_ALLOC_LKEY_REMOTE_ATOMIC_MASK	= BIT(6),
+	OCRDMA_ALLOC_LKEY_REMOTE_ATOMIC_SHIFT	= 6,
+	OCRDMA_ALLOC_LKEY_PBL_SIZE_SHIFT	= 16,
+	OCRDMA_ALLOC_LKEY_PBL_SIZE_MASK		= 0xFFFF <<
+						OCRDMA_ALLOC_LKEY_PBL_SIZE_SHIFT
+};
+
+struct ocrdma_alloc_lkey {
+	struct ocrdma_mqe_hdr hdr;
+	struct ocrdma_mbx_hdr req;
+
+	u32 pdid;
+	u32 pbl_sz_flags;
+};
+
+struct ocrdma_alloc_lkey_rsp {
+	struct ocrdma_mqe_hdr hdr;
+	struct ocrdma_mbx_rsp rsp;
+
+	u32 lrkey;
+	u32 num_pbl_rsvd;
+};
+
+struct ocrdma_dealloc_lkey {
+	struct ocrdma_mqe_hdr hdr;
+	struct ocrdma_mbx_hdr req;
+
+	u32 lkey;
+	u32 rsvd_frmr;
+};
+
+struct ocrdma_dealloc_lkey_rsp {
+	struct ocrdma_mqe_hdr hdr;
+	struct ocrdma_mbx_rsp rsp;
+};
+
+#define MAX_OCRDMA_NSMR_PBL    (u32)22
+#define MAX_OCRDMA_PBL_SIZE     65536
+#define MAX_OCRDMA_PBL_PER_LKEY	32767
+
+enum {
+	OCRDMA_REG_NSMR_LRKEY_INDEX_SHIFT	= 0,
+	OCRDMA_REG_NSMR_LRKEY_INDEX_MASK	= 0xFFFFFF,
+	OCRDMA_REG_NSMR_LRKEY_SHIFT		= 24,
+	OCRDMA_REG_NSMR_LRKEY_MASK		= 0xFF <<
+					OCRDMA_REG_NSMR_LRKEY_SHIFT,
+
+	OCRDMA_REG_NSMR_PD_ID_SHIFT		= 0,
+	OCRDMA_REG_NSMR_PD_ID_MASK		= 0xFFFF,
+	OCRDMA_REG_NSMR_NUM_PBL_SHIFT		= 16,
+	OCRDMA_REG_NSMR_NUM_PBL_MASK		= 0xFFFF <<
+					OCRDMA_REG_NSMR_NUM_PBL_SHIFT,
+
+	OCRDMA_REG_NSMR_PBE_SIZE_SHIFT		= 0,
+	OCRDMA_REG_NSMR_PBE_SIZE_MASK		= 0xFFFF,
+	OCRDMA_REG_NSMR_HPAGE_SIZE_SHIFT	= 16,
+	OCRDMA_REG_NSMR_HPAGE_SIZE_MASK		= 0xFF <<
+					OCRDMA_REG_NSMR_HPAGE_SIZE_SHIFT,
+	OCRDMA_REG_NSMR_BIND_MEMWIN_SHIFT	= 24,
+	OCRDMA_REG_NSMR_BIND_MEMWIN_MASK	= BIT(24),
+	OCRDMA_REG_NSMR_ZB_SHIFT		= 25,
+	OCRDMA_REG_NSMR_ZB_SHIFT_MASK		= BIT(25),
+	OCRDMA_REG_NSMR_REMOTE_INV_SHIFT	= 26,
+	OCRDMA_REG_NSMR_REMOTE_INV_MASK		= BIT(26),
+	OCRDMA_REG_NSMR_REMOTE_WR_SHIFT		= 27,
+	OCRDMA_REG_NSMR_REMOTE_WR_MASK		= BIT(27),
+	OCRDMA_REG_NSMR_REMOTE_RD_SHIFT		= 28,
+	OCRDMA_REG_NSMR_REMOTE_RD_MASK		= BIT(28),
+	OCRDMA_REG_NSMR_LOCAL_WR_SHIFT		= 29,
+	OCRDMA_REG_NSMR_LOCAL_WR_MASK		= BIT(29),
+	OCRDMA_REG_NSMR_REMOTE_ATOMIC_SHIFT	= 30,
+	OCRDMA_REG_NSMR_REMOTE_ATOMIC_MASK	= BIT(30),
+	OCRDMA_REG_NSMR_LAST_SHIFT		= 31,
+	OCRDMA_REG_NSMR_LAST_MASK		= BIT(31)
+};
+
+struct ocrdma_reg_nsmr {
+	struct ocrdma_mqe_hdr hdr;
+	struct ocrdma_mbx_hdr cmd;
+
+	u32 fr_mr;
+	u32 num_pbl_pdid;
+	u32 flags_hpage_pbe_sz;
+	u32 totlen_low;
+	u32 totlen_high;
+	u32 fbo_low;
+	u32 fbo_high;
+	u32 va_loaddr;
+	u32 va_hiaddr;
+	struct ocrdma_pa pbl[MAX_OCRDMA_NSMR_PBL];
+};
+
+enum {
+	OCRDMA_REG_NSMR_CONT_PBL_SHIFT		= 0,
+	OCRDMA_REG_NSMR_CONT_PBL_SHIFT_MASK	= 0xFFFF,
+	OCRDMA_REG_NSMR_CONT_NUM_PBL_SHIFT	= 16,
+	OCRDMA_REG_NSMR_CONT_NUM_PBL_MASK	= 0xFFFF <<
+					OCRDMA_REG_NSMR_CONT_NUM_PBL_SHIFT,
+
+	OCRDMA_REG_NSMR_CONT_LAST_SHIFT		= 31,
+	OCRDMA_REG_NSMR_CONT_LAST_MASK		= BIT(31)
+};
+
+struct ocrdma_reg_nsmr_cont {
+	struct ocrdma_mqe_hdr hdr;
+	struct ocrdma_mbx_hdr cmd;
+
+	u32 lrkey;
+	u32 num_pbl_offset;
+	u32 last;
+
+	struct ocrdma_pa pbl[MAX_OCRDMA_NSMR_PBL];
+};
+
+struct ocrdma_pbe {
+	u32 pa_hi;
+	u32 pa_lo;
+};
+
+enum {
+	OCRDMA_REG_NSMR_RSP_NUM_PBL_SHIFT	= 16,
+	OCRDMA_REG_NSMR_RSP_NUM_PBL_MASK	= 0xFFFF0000
+};
+struct ocrdma_reg_nsmr_rsp {
+	struct ocrdma_mqe_hdr hdr;
+	struct ocrdma_mbx_rsp rsp;
+
+	u32 lrkey;
+	u32 num_pbl;
+};
+
+enum {
+	OCRDMA_REG_NSMR_CONT_RSP_LRKEY_INDEX_SHIFT	= 0,
+	OCRDMA_REG_NSMR_CONT_RSP_LRKEY_INDEX_MASK	= 0xFFFFFF,
+	OCRDMA_REG_NSMR_CONT_RSP_LRKEY_SHIFT		= 24,
+	OCRDMA_REG_NSMR_CONT_RSP_LRKEY_MASK		= 0xFF <<
+					OCRDMA_REG_NSMR_CONT_RSP_LRKEY_SHIFT,
+
+	OCRDMA_REG_NSMR_CONT_RSP_NUM_PBL_SHIFT		= 16,
+	OCRDMA_REG_NSMR_CONT_RSP_NUM_PBL_MASK		= 0xFFFF <<
+					OCRDMA_REG_NSMR_CONT_RSP_NUM_PBL_SHIFT
+};
+
+struct ocrdma_reg_nsmr_cont_rsp {
+	struct ocrdma_mqe_hdr hdr;
+	struct ocrdma_mbx_rsp rsp;
+
+	u32 lrkey_key_index;
+	u32 num_pbl;
+};
+
+enum {
+	OCRDMA_ALLOC_MW_PD_ID_SHIFT	= 0,
+	OCRDMA_ALLOC_MW_PD_ID_MASK	= 0xFFFF
+};
+
+struct ocrdma_alloc_mw {
+	struct ocrdma_mqe_hdr hdr;
+	struct ocrdma_mbx_hdr req;
+
+	u32 pdid;
+};
+
+enum {
+	OCRDMA_ALLOC_MW_RSP_LRKEY_INDEX_SHIFT	= 0,
+	OCRDMA_ALLOC_MW_RSP_LRKEY_INDEX_MASK	= 0xFFFFFF
+};
+
+struct ocrdma_alloc_mw_rsp {
+	struct ocrdma_mqe_hdr hdr;
+	struct ocrdma_mbx_rsp rsp;
+
+	u32 lrkey_index;
+};
+
+struct ocrdma_attach_mcast {
+	struct ocrdma_mqe_hdr hdr;
+	struct ocrdma_mbx_hdr req;
+	u32 qp_id;
+	u8 mgid[16];
+	u32 mac_b0_to_b3;
+	u32 vlan_mac_b4_to_b5;
+};
+
+struct ocrdma_attach_mcast_rsp {
+	struct ocrdma_mqe_hdr hdr;
+	struct ocrdma_mbx_rsp rsp;
+};
+
+struct ocrdma_detach_mcast {
+	struct ocrdma_mqe_hdr hdr;
+	struct ocrdma_mbx_hdr req;
+	u32 qp_id;
+	u8 mgid[16];
+	u32 mac_b0_to_b3;
+	u32 vlan_mac_b4_to_b5;
+};
+
+struct ocrdma_detach_mcast_rsp {
+	struct ocrdma_mqe_hdr hdr;
+	struct ocrdma_mbx_rsp rsp;
+};
+
+enum {
+	OCRDMA_CREATE_AH_NUM_PAGES_SHIFT	= 19,
+	OCRDMA_CREATE_AH_NUM_PAGES_MASK		= 0xF <<
+					OCRDMA_CREATE_AH_NUM_PAGES_SHIFT,
+
+	OCRDMA_CREATE_AH_PAGE_SIZE_SHIFT	= 16,
+	OCRDMA_CREATE_AH_PAGE_SIZE_MASK		= 0x7 <<
+					OCRDMA_CREATE_AH_PAGE_SIZE_SHIFT,
+
+	OCRDMA_CREATE_AH_ENTRY_SIZE_SHIFT	= 23,
+	OCRDMA_CREATE_AH_ENTRY_SIZE_MASK	= 0x1FF <<
+					OCRDMA_CREATE_AH_ENTRY_SIZE_SHIFT,
+};
+
+#define OCRDMA_AH_TBL_PAGES 8
+
+struct ocrdma_create_ah_tbl {
+	struct ocrdma_mqe_hdr hdr;
+	struct ocrdma_mbx_hdr req;
+
+	u32 ah_conf;
+	struct ocrdma_pa tbl_addr[8];
+};
+
+struct ocrdma_create_ah_tbl_rsp {
+	struct ocrdma_mqe_hdr hdr;
+	struct ocrdma_mbx_rsp rsp;
+	u32 ahid;
+};
+
+struct ocrdma_delete_ah_tbl {
+	struct ocrdma_mqe_hdr hdr;
+	struct ocrdma_mbx_hdr req;
+	u32 ahid;
+};
+
+struct ocrdma_delete_ah_tbl_rsp {
+	struct ocrdma_mqe_hdr hdr;
+	struct ocrdma_mbx_rsp rsp;
+};
+
+enum {
+	OCRDMA_EQE_VALID_SHIFT		= 0,
+	OCRDMA_EQE_VALID_MASK		= BIT(0),
+	OCRDMA_EQE_FOR_CQE_MASK		= 0xFFFE,
+	OCRDMA_EQE_RESOURCE_ID_SHIFT	= 16,
+	OCRDMA_EQE_RESOURCE_ID_MASK	= 0xFFFF <<
+				OCRDMA_EQE_RESOURCE_ID_SHIFT,
+};
+
+struct ocrdma_eqe {
+	u32 id_valid;
+};
+
+enum OCRDMA_CQE_STATUS {
+	OCRDMA_CQE_SUCCESS = 0,
+	OCRDMA_CQE_LOC_LEN_ERR,
+	OCRDMA_CQE_LOC_QP_OP_ERR,
+	OCRDMA_CQE_LOC_EEC_OP_ERR,
+	OCRDMA_CQE_LOC_PROT_ERR,
+	OCRDMA_CQE_WR_FLUSH_ERR,
+	OCRDMA_CQE_MW_BIND_ERR,
+	OCRDMA_CQE_BAD_RESP_ERR,
+	OCRDMA_CQE_LOC_ACCESS_ERR,
+	OCRDMA_CQE_REM_INV_REQ_ERR,
+	OCRDMA_CQE_REM_ACCESS_ERR,
+	OCRDMA_CQE_REM_OP_ERR,
+	OCRDMA_CQE_RETRY_EXC_ERR,
+	OCRDMA_CQE_RNR_RETRY_EXC_ERR,
+	OCRDMA_CQE_LOC_RDD_VIOL_ERR,
+	OCRDMA_CQE_REM_INV_RD_REQ_ERR,
+	OCRDMA_CQE_REM_ABORT_ERR,
+	OCRDMA_CQE_INV_EECN_ERR,
+	OCRDMA_CQE_INV_EEC_STATE_ERR,
+	OCRDMA_CQE_FATAL_ERR,
+	OCRDMA_CQE_RESP_TIMEOUT_ERR,
+	OCRDMA_CQE_GENERAL_ERR
+};
+
+enum {
+	/* w0 */
+	OCRDMA_CQE_WQEIDX_SHIFT		= 0,
+	OCRDMA_CQE_WQEIDX_MASK		= 0xFFFF,
+
+	/* w1 */
+	OCRDMA_CQE_UD_XFER_LEN_SHIFT	= 16,
+	OCRDMA_CQE_PKEY_SHIFT		= 0,
+	OCRDMA_CQE_PKEY_MASK		= 0xFFFF,
+
+	/* w2 */
+	OCRDMA_CQE_QPN_SHIFT		= 0,
+	OCRDMA_CQE_QPN_MASK		= 0x0000FFFF,
+
+	OCRDMA_CQE_BUFTAG_SHIFT		= 16,
+	OCRDMA_CQE_BUFTAG_MASK		= 0xFFFF << OCRDMA_CQE_BUFTAG_SHIFT,
+
+	/* w3 */
+	OCRDMA_CQE_UD_STATUS_SHIFT	= 24,
+	OCRDMA_CQE_UD_STATUS_MASK	= 0x7 << OCRDMA_CQE_UD_STATUS_SHIFT,
+	OCRDMA_CQE_STATUS_SHIFT		= 16,
+	OCRDMA_CQE_STATUS_MASK		= 0xFF << OCRDMA_CQE_STATUS_SHIFT,
+	OCRDMA_CQE_VALID		= BIT(31),
+	OCRDMA_CQE_INVALIDATE		= BIT(30),
+	OCRDMA_CQE_QTYPE		= BIT(29),
+	OCRDMA_CQE_IMM			= BIT(28),
+	OCRDMA_CQE_WRITE_IMM		= BIT(27),
+	OCRDMA_CQE_QTYPE_SQ		= 0,
+	OCRDMA_CQE_QTYPE_RQ		= 1,
+	OCRDMA_CQE_SRCQP_MASK		= 0xFFFFFF
+};
+
+struct ocrdma_cqe {
+	union {
+		/* w0 to w2 */
+		struct {
+			u32 wqeidx;
+			u32 bytes_xfered;
+			u32 qpn;
+		} wq;
+		struct {
+			u32 lkey_immdt;
+			u32 rxlen;
+			u32 buftag_qpn;
+		} rq;
+		struct {
+			u32 lkey_immdt;
+			u32 rxlen_pkey;
+			u32 buftag_qpn;
+		} ud;
+		struct {
+			u32 word_0;
+			u32 word_1;
+			u32 qpn;
+		} cmn;
+	};
+	u32 flags_status_srcqpn;	/* w3 */
+};
+
+struct ocrdma_sge {
+	u32 addr_hi;
+	u32 addr_lo;
+	u32 lrkey;
+	u32 len;
+};
+
+enum {
+	OCRDMA_FLAG_SIG		= 0x1,
+	OCRDMA_FLAG_INV		= 0x2,
+	OCRDMA_FLAG_FENCE_L	= 0x4,
+	OCRDMA_FLAG_FENCE_R	= 0x8,
+	OCRDMA_FLAG_SOLICIT	= 0x10,
+	OCRDMA_FLAG_IMM		= 0x20,
+
+	/* Stag flags */
+	OCRDMA_LKEY_FLAG_LOCAL_WR	= 0x1,
+	OCRDMA_LKEY_FLAG_REMOTE_RD	= 0x2,
+	OCRDMA_LKEY_FLAG_REMOTE_WR	= 0x4,
+	OCRDMA_LKEY_FLAG_VATO		= 0x8,
+};
+
+enum OCRDMA_WQE_OPCODE {
+	OCRDMA_WRITE		= 0x06,
+	OCRDMA_READ		= 0x0C,
+	OCRDMA_RESV0		= 0x02,
+	OCRDMA_SEND		= 0x00,
+	OCRDMA_CMP_SWP		= 0x14,
+	OCRDMA_BIND_MW		= 0x10,
+	OCRDMA_FR_MR            = 0x11,
+	OCRDMA_RESV1		= 0x0A,
+	OCRDMA_LKEY_INV		= 0x15,
+	OCRDMA_FETCH_ADD	= 0x13,
+	OCRDMA_POST_RQ		= 0x12
+};
+
+enum {
+	OCRDMA_TYPE_INLINE	= 0x0,
+	OCRDMA_TYPE_LKEY	= 0x1,
+};
+
+enum {
+	OCRDMA_WQE_OPCODE_SHIFT		= 0,
+	OCRDMA_WQE_OPCODE_MASK		= 0x0000001F,
+	OCRDMA_WQE_FLAGS_SHIFT		= 5,
+	OCRDMA_WQE_TYPE_SHIFT		= 16,
+	OCRDMA_WQE_TYPE_MASK		= 0x00030000,
+	OCRDMA_WQE_SIZE_SHIFT		= 18,
+	OCRDMA_WQE_SIZE_MASK		= 0xFF,
+	OCRDMA_WQE_NXT_WQE_SIZE_SHIFT	= 25,
+
+	OCRDMA_WQE_LKEY_FLAGS_SHIFT	= 0,
+	OCRDMA_WQE_LKEY_FLAGS_MASK	= 0xF
+};
+
+/* header WQE for all the SQ and RQ operations */
+struct ocrdma_hdr_wqe {
+	u32 cw;
+	union {
+		u32 rsvd_tag;
+		u32 rsvd_lkey_flags;
+	};
+	union {
+		u32 immdt;
+		u32 lkey;
+	};
+	u32 total_len;
+};
+
+struct ocrdma_ewqe_ud_hdr {
+	u32 rsvd_dest_qpn;
+	u32 qkey;
+	u32 rsvd_ahid;
+	u32 rsvd;
+};
+
+/* extended wqe followed by hdr_wqe for Fast Memory register */
+struct ocrdma_ewqe_fr {
+	u32 va_hi;
+	u32 va_lo;
+	u32 fbo_hi;
+	u32 fbo_lo;
+	u32 size_sge;
+	u32 num_sges;
+	u32 rsvd;
+	u32 rsvd2;
+};
+
+struct ocrdma_eth_basic {
+	u8 dmac[6];
+	u8 smac[6];
+	__be16 eth_type;
+} __packed;
+
+struct ocrdma_eth_vlan {
+	u8 dmac[6];
+	u8 smac[6];
+	__be16 eth_type;
+	__be16 vlan_tag;
+#define OCRDMA_ROCE_ETH_TYPE 0x8915
+	__be16 roce_eth_type;
+} __packed;
+
+struct ocrdma_grh {
+	__be32	tclass_flow;
+	__be32	pdid_hoplimit;
+	u8	sgid[16];
+	u8	dgid[16];
+	u16	rsvd;
+} __packed;
+
+#define OCRDMA_AV_VALID		BIT(7)
+#define OCRDMA_AV_VLAN_VALID	BIT(1)
+
+struct ocrdma_av {
+	struct ocrdma_eth_vlan eth_hdr;
+	struct ocrdma_grh grh;
+	u32 valid;
+} __packed;
+
+struct ocrdma_rsrc_stats {
+	u32 dpp_pds;
+	u32 non_dpp_pds;
+	u32 rc_dpp_qps;
+	u32 uc_dpp_qps;
+	u32 ud_dpp_qps;
+	u32 rc_non_dpp_qps;
+	u32 rsvd;
+	u32 uc_non_dpp_qps;
+	u32 ud_non_dpp_qps;
+	u32 rsvd1;
+	u32 srqs;
+	u32 rbqs;
+	u32 r64K_nsmr;
+	u32 r64K_to_2M_nsmr;
+	u32 r2M_to_44M_nsmr;
+	u32 r44M_to_1G_nsmr;
+	u32 r1G_to_4G_nsmr;
+	u32 nsmr_count_4G_to_32G;
+	u32 r32G_to_64G_nsmr;
+	u32 r64G_to_128G_nsmr;
+	u32 r128G_to_higher_nsmr;
+	u32 embedded_nsmr;
+	u32 frmr;
+	u32 prefetch_qps;
+	u32 ondemand_qps;
+	u32 phy_mr;
+	u32 mw;
+	u32 rsvd2[7];
+};
+
+struct ocrdma_db_err_stats {
+	u32 sq_doorbell_errors;
+	u32 cq_doorbell_errors;
+	u32 rq_srq_doorbell_errors;
+	u32 cq_overflow_errors;
+	u32 rsvd[4];
+};
+
+struct ocrdma_wqe_stats {
+	u32 large_send_rc_wqes_lo;
+	u32 large_send_rc_wqes_hi;
+	u32 large_write_rc_wqes_lo;
+	u32 large_write_rc_wqes_hi;
+	u32 rsvd[4];
+	u32 read_wqes_lo;
+	u32 read_wqes_hi;
+	u32 frmr_wqes_lo;
+	u32 frmr_wqes_hi;
+	u32 mw_bind_wqes_lo;
+	u32 mw_bind_wqes_hi;
+	u32 invalidate_wqes_lo;
+	u32 invalidate_wqes_hi;
+	u32 rsvd1[2];
+	u32 dpp_wqe_drops;
+	u32 rsvd2[5];
+};
+
+struct ocrdma_tx_stats {
+	u32 send_pkts_lo;
+	u32 send_pkts_hi;
+	u32 write_pkts_lo;
+	u32 write_pkts_hi;
+	u32 read_pkts_lo;
+	u32 read_pkts_hi;
+	u32 read_rsp_pkts_lo;
+	u32 read_rsp_pkts_hi;
+	u32 ack_pkts_lo;
+	u32 ack_pkts_hi;
+	u32 send_bytes_lo;
+	u32 send_bytes_hi;
+	u32 write_bytes_lo;
+	u32 write_bytes_hi;
+	u32 read_req_bytes_lo;
+	u32 read_req_bytes_hi;
+	u32 read_rsp_bytes_lo;
+	u32 read_rsp_bytes_hi;
+	u32 ack_timeouts;
+	u32 rsvd[5];
+};
+
+
+struct ocrdma_tx_qp_err_stats {
+	u32 local_length_errors;
+	u32 local_protection_errors;
+	u32 local_qp_operation_errors;
+	u32 retry_count_exceeded_errors;
+	u32 rnr_retry_count_exceeded_errors;
+	u32 rsvd[3];
+};
+
+struct ocrdma_rx_stats {
+	u32 roce_frame_bytes_lo;
+	u32 roce_frame_bytes_hi;
+	u32 roce_frame_icrc_drops;
+	u32 roce_frame_payload_len_drops;
+	u32 ud_drops;
+	u32 qp1_drops;
+	u32 psn_error_request_packets;
+	u32 psn_error_resp_packets;
+	u32 rnr_nak_timeouts;
+	u32 rnr_nak_receives;
+	u32 roce_frame_rxmt_drops;
+	u32 nak_count_psn_sequence_errors;
+	u32 rc_drop_count_lookup_errors;
+	u32 rq_rnr_naks;
+	u32 srq_rnr_naks;
+	u32 roce_frames_lo;
+	u32 roce_frames_hi;
+	u32 rsvd;
+};
+
+struct ocrdma_rx_qp_err_stats {
+	u32 nak_invalid_requst_errors;
+	u32 nak_remote_operation_errors;
+	u32 nak_count_remote_access_errors;
+	u32 local_length_errors;
+	u32 local_protection_errors;
+	u32 local_qp_operation_errors;
+	u32 rsvd[2];
+};
+
+struct ocrdma_tx_dbg_stats {
+	u32 data[100];
+};
+
+struct ocrdma_rx_dbg_stats {
+	u32 data[200];
+};
+
+struct ocrdma_rdma_stats_req {
+	struct ocrdma_mbx_hdr hdr;
+	u8 reset_stats;
+	u8 rsvd[3];
+} __packed;
+
+struct ocrdma_rdma_stats_resp {
+	struct ocrdma_mbx_hdr hdr;
+	struct ocrdma_rsrc_stats act_rsrc_stats;
+	struct ocrdma_rsrc_stats th_rsrc_stats;
+	struct ocrdma_db_err_stats	db_err_stats;
+	struct ocrdma_wqe_stats		wqe_stats;
+	struct ocrdma_tx_stats		tx_stats;
+	struct ocrdma_tx_qp_err_stats	tx_qp_err_stats;
+	struct ocrdma_rx_stats		rx_stats;
+	struct ocrdma_rx_qp_err_stats	rx_qp_err_stats;
+	struct ocrdma_tx_dbg_stats	tx_dbg_stats;
+	struct ocrdma_rx_dbg_stats	rx_dbg_stats;
+} __packed;
+
+enum {
+	OCRDMA_HBA_ATTRB_EPROM_VER_LO_MASK	= 0xFF,
+	OCRDMA_HBA_ATTRB_EPROM_VER_HI_MASK	= 0xFF00,
+	OCRDMA_HBA_ATTRB_EPROM_VER_HI_SHIFT	= 0x08,
+	OCRDMA_HBA_ATTRB_CDBLEN_MASK		= 0xFFFF,
+	OCRDMA_HBA_ATTRB_ASIC_REV_MASK		= 0xFF0000,
+	OCRDMA_HBA_ATTRB_ASIC_REV_SHIFT		= 0x10,
+	OCRDMA_HBA_ATTRB_GUID0_MASK		= 0xFF000000,
+	OCRDMA_HBA_ATTRB_GUID0_SHIFT		= 0x18,
+	OCRDMA_HBA_ATTRB_GUID13_MASK		= 0xFF,
+	OCRDMA_HBA_ATTRB_GUID14_MASK		= 0xFF00,
+	OCRDMA_HBA_ATTRB_GUID14_SHIFT		= 0x08,
+	OCRDMA_HBA_ATTRB_GUID15_MASK		= 0xFF0000,
+	OCRDMA_HBA_ATTRB_GUID15_SHIFT		= 0x10,
+	OCRDMA_HBA_ATTRB_PCNT_MASK		= 0xFF000000,
+	OCRDMA_HBA_ATTRB_PCNT_SHIFT		= 0x18,
+	OCRDMA_HBA_ATTRB_LDTOUT_MASK		= 0xFFFF,
+	OCRDMA_HBA_ATTRB_ISCSI_VER_MASK		= 0xFF0000,
+	OCRDMA_HBA_ATTRB_ISCSI_VER_SHIFT	= 0x10,
+	OCRDMA_HBA_ATTRB_MFUNC_DEV_MASK		= 0xFF000000,
+	OCRDMA_HBA_ATTRB_MFUNC_DEV_SHIFT	= 0x18,
+	OCRDMA_HBA_ATTRB_CV_MASK		= 0xFF,
+	OCRDMA_HBA_ATTRB_HBA_ST_MASK		= 0xFF00,
+	OCRDMA_HBA_ATTRB_HBA_ST_SHIFT		= 0x08,
+	OCRDMA_HBA_ATTRB_MAX_DOMS_MASK		= 0xFF0000,
+	OCRDMA_HBA_ATTRB_MAX_DOMS_SHIFT		= 0x10,
+	OCRDMA_HBA_ATTRB_PTNUM_MASK		= 0x3F000000,
+	OCRDMA_HBA_ATTRB_PTNUM_SHIFT		= 0x18,
+	OCRDMA_HBA_ATTRB_PT_MASK		= 0xC0000000,
+	OCRDMA_HBA_ATTRB_PT_SHIFT		= 0x1E,
+	OCRDMA_HBA_ATTRB_ISCSI_FET_MASK		= 0xFF,
+	OCRDMA_HBA_ATTRB_ASIC_GEN_MASK		= 0xFF00,
+	OCRDMA_HBA_ATTRB_ASIC_GEN_SHIFT		= 0x08,
+	OCRDMA_HBA_ATTRB_PCI_VID_MASK		= 0xFFFF,
+	OCRDMA_HBA_ATTRB_PCI_DID_MASK		= 0xFFFF0000,
+	OCRDMA_HBA_ATTRB_PCI_DID_SHIFT		= 0x10,
+	OCRDMA_HBA_ATTRB_PCI_SVID_MASK		= 0xFFFF,
+	OCRDMA_HBA_ATTRB_PCI_SSID_MASK		= 0xFFFF0000,
+	OCRDMA_HBA_ATTRB_PCI_SSID_SHIFT		= 0x10,
+	OCRDMA_HBA_ATTRB_PCI_BUSNUM_MASK	= 0xFF,
+	OCRDMA_HBA_ATTRB_PCI_DEVNUM_MASK	= 0xFF00,
+	OCRDMA_HBA_ATTRB_PCI_DEVNUM_SHIFT	= 0x08,
+	OCRDMA_HBA_ATTRB_PCI_FUNCNUM_MASK	= 0xFF0000,
+	OCRDMA_HBA_ATTRB_PCI_FUNCNUM_SHIFT	= 0x10,
+	OCRDMA_HBA_ATTRB_IF_TYPE_MASK		= 0xFF000000,
+	OCRDMA_HBA_ATTRB_IF_TYPE_SHIFT		= 0x18,
+	OCRDMA_HBA_ATTRB_NETFIL_MASK		=0xFF
+};
+
+struct mgmt_hba_attribs {
+	u8 flashrom_version_string[32];
+	u8 manufacturer_name[32];
+	u32 supported_modes;
+	u32 rsvd_eprom_verhi_verlo;
+	u32 mbx_ds_ver;
+	u32 epfw_ds_ver;
+	u8 ncsi_ver_string[12];
+	u32 default_extended_timeout;
+	u8 controller_model_number[32];
+	u8 controller_description[64];
+	u8 controller_serial_number[32];
+	u8 ip_version_string[32];
+	u8 firmware_version_string[32];
+	u8 bios_version_string[32];
+	u8 redboot_version_string[32];
+	u8 driver_version_string[32];
+	u8 fw_on_flash_version_string[32];
+	u32 functionalities_supported;
+	u32 guid0_asicrev_cdblen;
+	u8 generational_guid[12];
+	u32 portcnt_guid15;
+	u32 mfuncdev_iscsi_ldtout;
+	u32 ptpnum_maxdoms_hbast_cv;
+	u32 firmware_post_status;
+	u32 hba_mtu[8];
+	u32 res_asicgen_iscsi_feaures;
+	u32 rsvd1[3];
+};
+
+struct mgmt_controller_attrib {
+	struct mgmt_hba_attribs hba_attribs;
+	u32 pci_did_vid;
+	u32 pci_ssid_svid;
+	u32 ityp_fnum_devnum_bnum;
+	u32 uid_hi;
+	u32 uid_lo;
+	u32 res_nnetfil;
+	u32 rsvd0[4];
+};
+
+struct ocrdma_get_ctrl_attribs_rsp {
+	struct ocrdma_mbx_hdr hdr;
+	struct mgmt_controller_attrib ctrl_attribs;
+};
+
+#define OCRDMA_SUBSYS_DCBX 0x10
+
+enum OCRDMA_DCBX_OPCODE {
+	OCRDMA_CMD_GET_DCBX_CONFIG = 0x01
+};
+
+enum OCRDMA_DCBX_PARAM_TYPE {
+	OCRDMA_PARAMETER_TYPE_ADMIN	= 0x00,
+	OCRDMA_PARAMETER_TYPE_OPER	= 0x01,
+	OCRDMA_PARAMETER_TYPE_PEER	= 0x02
+};
+
+enum OCRDMA_DCBX_APP_PROTO {
+	OCRDMA_APP_PROTO_ROCE	= 0x8915
+};
+
+enum OCRDMA_DCBX_PROTO {
+	OCRDMA_PROTO_SELECT_L2	= 0x00,
+	OCRDMA_PROTO_SELECT_L4	= 0x01
+};
+
+enum OCRDMA_DCBX_APP_PARAM {
+	OCRDMA_APP_PARAM_APP_PROTO_MASK = 0xFFFF,
+	OCRDMA_APP_PARAM_PROTO_SEL_MASK = 0xFF,
+	OCRDMA_APP_PARAM_PROTO_SEL_SHIFT = 0x10,
+	OCRDMA_APP_PARAM_VALID_MASK	= 0xFF,
+	OCRDMA_APP_PARAM_VALID_SHIFT	= 0x18
+};
+
+enum OCRDMA_DCBX_STATE_FLAGS {
+	OCRDMA_STATE_FLAG_ENABLED	= 0x01,
+	OCRDMA_STATE_FLAG_ADDVERTISED	= 0x02,
+	OCRDMA_STATE_FLAG_WILLING	= 0x04,
+	OCRDMA_STATE_FLAG_SYNC		= 0x08,
+	OCRDMA_STATE_FLAG_UNSUPPORTED	= 0x40000000,
+	OCRDMA_STATE_FLAG_NEG_FAILD	= 0x80000000
+};
+
+enum OCRDMA_TCV_AEV_OPV_ST {
+	OCRDMA_DCBX_TC_SUPPORT_MASK	= 0xFF,
+	OCRDMA_DCBX_TC_SUPPORT_SHIFT	= 0x18,
+	OCRDMA_DCBX_APP_ENTRY_SHIFT	= 0x10,
+	OCRDMA_DCBX_OP_PARAM_SHIFT	= 0x08,
+	OCRDMA_DCBX_STATE_MASK		= 0xFF
+};
+
+struct ocrdma_app_parameter {
+	u32 valid_proto_app;
+	u32 oui;
+	u32 app_prio[2];
+};
+
+struct ocrdma_dcbx_cfg {
+	u32 tcv_aev_opv_st;
+	u32 tc_state;
+	u32 pfc_state;
+	u32 qcn_state;
+	u32 appl_state;
+	u32 ll_state;
+	u32 tc_bw[2];
+	u32 tc_prio[8];
+	u32 pfc_prio[2];
+	struct ocrdma_app_parameter app_param[15];
+};
+
+struct ocrdma_get_dcbx_cfg_req {
+	struct ocrdma_mbx_hdr hdr;
+	u32 param_type;
+} __packed;
+
+struct ocrdma_get_dcbx_cfg_rsp {
+	struct ocrdma_mbx_rsp hdr;
+	struct ocrdma_dcbx_cfg cfg;
+} __packed;
+
+#endif				/* __OCRDMA_SLI_H__ */
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_stats.c b/drivers/infiniband/hw/ocrdma/ocrdma_stats.c
new file mode 100644
index 0000000..41a9aec
--- /dev/null
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_stats.c
@@ -0,0 +1,616 @@
+/*******************************************************************
+ * This file is part of the Emulex RoCE Device Driver for          *
+ * RoCE (RDMA over Converged Ethernet) adapters.                   *
+ * Copyright (C) 2008-2014 Emulex. All rights reserved.            *
+ * EMULEX and SLI are trademarks of Emulex.                        *
+ * www.emulex.com                                                  *
+ *                                                                 *
+ * This program is free software; you can redistribute it and/or   *
+ * modify it under the terms of version 2 of the GNU General       *
+ * Public License as published by the Free Software Foundation.    *
+ * This program is distributed in the hope that it will be useful. *
+ * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND          *
+ * WARRANTIES, INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY,  *
+ * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE      *
+ * DISCLAIMED, EXCEPT TO THE EXTENT THAT SUCH DISCLAIMERS ARE HELD *
+ * TO BE LEGALLY INVALID.  See the GNU General Public License for  *
+ * more details, a copy of which can be found in the file COPYING  *
+ * included with this package.                                     *
+ *
+ * Contact Information:
+ * linux-drivers@emulex.com
+ *
+ * Emulex
+ * 3333 Susan Street
+ * Costa Mesa, CA 92626
+ *******************************************************************/
+
+#include <rdma/ib_addr.h>
+#include "ocrdma_stats.h"
+
+static struct dentry *ocrdma_dbgfs_dir;
+
+static int ocrdma_add_stat(char *start, char *pcur,
+				char *name, u64 count)
+{
+	char buff[128] = {0};
+	int cpy_len = 0;
+
+	snprintf(buff, 128, "%s: %llu\n", name, count);
+	cpy_len = strlen(buff);
+
+	if (pcur + cpy_len > start + OCRDMA_MAX_DBGFS_MEM) {
+		pr_err("%s: No space in stats buff\n", __func__);
+		return 0;
+	}
+
+	memcpy(pcur, buff, cpy_len);
+	return cpy_len;
+}
+
+static bool ocrdma_alloc_stats_mem(struct ocrdma_dev *dev)
+{
+	struct stats_mem *mem = &dev->stats_mem;
+
+	/* Alloc mbox command mem*/
+	mem->size = max_t(u32, sizeof(struct ocrdma_rdma_stats_req),
+			sizeof(struct ocrdma_rdma_stats_resp));
+
+	mem->va   = dma_alloc_coherent(&dev->nic_info.pdev->dev, mem->size,
+					 &mem->pa, GFP_KERNEL);
+	if (!mem->va) {
+		pr_err("%s: stats mbox allocation failed\n", __func__);
+		return false;
+	}
+
+	memset(mem->va, 0, mem->size);
+
+	/* Alloc debugfs mem */
+	mem->debugfs_mem = kzalloc(OCRDMA_MAX_DBGFS_MEM, GFP_KERNEL);
+	if (!mem->debugfs_mem) {
+		pr_err("%s: stats debugfs mem allocation failed\n", __func__);
+		return false;
+	}
+
+	return true;
+}
+
+static void ocrdma_release_stats_mem(struct ocrdma_dev *dev)
+{
+	struct stats_mem *mem = &dev->stats_mem;
+
+	if (mem->va)
+		dma_free_coherent(&dev->nic_info.pdev->dev, mem->size,
+				  mem->va, mem->pa);
+	kfree(mem->debugfs_mem);
+}
+
+static char *ocrdma_resource_stats(struct ocrdma_dev *dev)
+{
+	char *stats = dev->stats_mem.debugfs_mem, *pcur;
+	struct ocrdma_rdma_stats_resp *rdma_stats =
+			(struct ocrdma_rdma_stats_resp *)dev->stats_mem.va;
+	struct ocrdma_rsrc_stats *rsrc_stats = &rdma_stats->act_rsrc_stats;
+
+	memset(stats, 0, (OCRDMA_MAX_DBGFS_MEM));
+
+	pcur = stats;
+	pcur += ocrdma_add_stat(stats, pcur, "active_dpp_pds",
+				(u64)rsrc_stats->dpp_pds);
+	pcur += ocrdma_add_stat(stats, pcur, "active_non_dpp_pds",
+				(u64)rsrc_stats->non_dpp_pds);
+	pcur += ocrdma_add_stat(stats, pcur, "active_rc_dpp_qps",
+				(u64)rsrc_stats->rc_dpp_qps);
+	pcur += ocrdma_add_stat(stats, pcur, "active_uc_dpp_qps",
+				(u64)rsrc_stats->uc_dpp_qps);
+	pcur += ocrdma_add_stat(stats, pcur, "active_ud_dpp_qps",
+				(u64)rsrc_stats->ud_dpp_qps);
+	pcur += ocrdma_add_stat(stats, pcur, "active_rc_non_dpp_qps",
+				(u64)rsrc_stats->rc_non_dpp_qps);
+	pcur += ocrdma_add_stat(stats, pcur, "active_uc_non_dpp_qps",
+				(u64)rsrc_stats->uc_non_dpp_qps);
+	pcur += ocrdma_add_stat(stats, pcur, "active_ud_non_dpp_qps",
+				(u64)rsrc_stats->ud_non_dpp_qps);
+	pcur += ocrdma_add_stat(stats, pcur, "active_srqs",
+				(u64)rsrc_stats->srqs);
+	pcur += ocrdma_add_stat(stats, pcur, "active_rbqs",
+				(u64)rsrc_stats->rbqs);
+	pcur += ocrdma_add_stat(stats, pcur, "active_64K_nsmr",
+				(u64)rsrc_stats->r64K_nsmr);
+	pcur += ocrdma_add_stat(stats, pcur, "active_64K_to_2M_nsmr",
+				(u64)rsrc_stats->r64K_to_2M_nsmr);
+	pcur += ocrdma_add_stat(stats, pcur, "active_2M_to_44M_nsmr",
+				(u64)rsrc_stats->r2M_to_44M_nsmr);
+	pcur += ocrdma_add_stat(stats, pcur, "active_44M_to_1G_nsmr",
+				(u64)rsrc_stats->r44M_to_1G_nsmr);
+	pcur += ocrdma_add_stat(stats, pcur, "active_1G_to_4G_nsmr",
+				(u64)rsrc_stats->r1G_to_4G_nsmr);
+	pcur += ocrdma_add_stat(stats, pcur, "active_nsmr_count_4G_to_32G",
+				(u64)rsrc_stats->nsmr_count_4G_to_32G);
+	pcur += ocrdma_add_stat(stats, pcur, "active_32G_to_64G_nsmr",
+				(u64)rsrc_stats->r32G_to_64G_nsmr);
+	pcur += ocrdma_add_stat(stats, pcur, "active_64G_to_128G_nsmr",
+				(u64)rsrc_stats->r64G_to_128G_nsmr);
+	pcur += ocrdma_add_stat(stats, pcur, "active_128G_to_higher_nsmr",
+				(u64)rsrc_stats->r128G_to_higher_nsmr);
+	pcur += ocrdma_add_stat(stats, pcur, "active_embedded_nsmr",
+				(u64)rsrc_stats->embedded_nsmr);
+	pcur += ocrdma_add_stat(stats, pcur, "active_frmr",
+				(u64)rsrc_stats->frmr);
+	pcur += ocrdma_add_stat(stats, pcur, "active_prefetch_qps",
+				(u64)rsrc_stats->prefetch_qps);
+	pcur += ocrdma_add_stat(stats, pcur, "active_ondemand_qps",
+				(u64)rsrc_stats->ondemand_qps);
+	pcur += ocrdma_add_stat(stats, pcur, "active_phy_mr",
+				(u64)rsrc_stats->phy_mr);
+	pcur += ocrdma_add_stat(stats, pcur, "active_mw",
+				(u64)rsrc_stats->mw);
+
+	/* Print the threshold stats */
+	rsrc_stats = &rdma_stats->th_rsrc_stats;
+
+	pcur += ocrdma_add_stat(stats, pcur, "threshold_dpp_pds",
+				(u64)rsrc_stats->dpp_pds);
+	pcur += ocrdma_add_stat(stats, pcur, "threshold_non_dpp_pds",
+				(u64)rsrc_stats->non_dpp_pds);
+	pcur += ocrdma_add_stat(stats, pcur, "threshold_rc_dpp_qps",
+				(u64)rsrc_stats->rc_dpp_qps);
+	pcur += ocrdma_add_stat(stats, pcur, "threshold_uc_dpp_qps",
+				(u64)rsrc_stats->uc_dpp_qps);
+	pcur += ocrdma_add_stat(stats, pcur, "threshold_ud_dpp_qps",
+				(u64)rsrc_stats->ud_dpp_qps);
+	pcur += ocrdma_add_stat(stats, pcur, "threshold_rc_non_dpp_qps",
+				(u64)rsrc_stats->rc_non_dpp_qps);
+	pcur += ocrdma_add_stat(stats, pcur, "threshold_uc_non_dpp_qps",
+				(u64)rsrc_stats->uc_non_dpp_qps);
+	pcur += ocrdma_add_stat(stats, pcur, "threshold_ud_non_dpp_qps",
+				(u64)rsrc_stats->ud_non_dpp_qps);
+	pcur += ocrdma_add_stat(stats, pcur, "threshold_srqs",
+				(u64)rsrc_stats->srqs);
+	pcur += ocrdma_add_stat(stats, pcur, "threshold_rbqs",
+				(u64)rsrc_stats->rbqs);
+	pcur += ocrdma_add_stat(stats, pcur, "threshold_64K_nsmr",
+				(u64)rsrc_stats->r64K_nsmr);
+	pcur += ocrdma_add_stat(stats, pcur, "threshold_64K_to_2M_nsmr",
+				(u64)rsrc_stats->r64K_to_2M_nsmr);
+	pcur += ocrdma_add_stat(stats, pcur, "threshold_2M_to_44M_nsmr",
+				(u64)rsrc_stats->r2M_to_44M_nsmr);
+	pcur += ocrdma_add_stat(stats, pcur, "threshold_44M_to_1G_nsmr",
+				(u64)rsrc_stats->r44M_to_1G_nsmr);
+	pcur += ocrdma_add_stat(stats, pcur, "threshold_1G_to_4G_nsmr",
+				(u64)rsrc_stats->r1G_to_4G_nsmr);
+	pcur += ocrdma_add_stat(stats, pcur, "threshold_nsmr_count_4G_to_32G",
+				(u64)rsrc_stats->nsmr_count_4G_to_32G);
+	pcur += ocrdma_add_stat(stats, pcur, "threshold_32G_to_64G_nsmr",
+				(u64)rsrc_stats->r32G_to_64G_nsmr);
+	pcur += ocrdma_add_stat(stats, pcur, "threshold_64G_to_128G_nsmr",
+				(u64)rsrc_stats->r64G_to_128G_nsmr);
+	pcur += ocrdma_add_stat(stats, pcur, "threshold_128G_to_higher_nsmr",
+				(u64)rsrc_stats->r128G_to_higher_nsmr);
+	pcur += ocrdma_add_stat(stats, pcur, "threshold_embedded_nsmr",
+				(u64)rsrc_stats->embedded_nsmr);
+	pcur += ocrdma_add_stat(stats, pcur, "threshold_frmr",
+				(u64)rsrc_stats->frmr);
+	pcur += ocrdma_add_stat(stats, pcur, "threshold_prefetch_qps",
+				(u64)rsrc_stats->prefetch_qps);
+	pcur += ocrdma_add_stat(stats, pcur, "threshold_ondemand_qps",
+				(u64)rsrc_stats->ondemand_qps);
+	pcur += ocrdma_add_stat(stats, pcur, "threshold_phy_mr",
+				(u64)rsrc_stats->phy_mr);
+	pcur += ocrdma_add_stat(stats, pcur, "threshold_mw",
+				(u64)rsrc_stats->mw);
+	return stats;
+}
+
+static char *ocrdma_rx_stats(struct ocrdma_dev *dev)
+{
+	char *stats = dev->stats_mem.debugfs_mem, *pcur;
+	struct ocrdma_rdma_stats_resp *rdma_stats =
+		(struct ocrdma_rdma_stats_resp *)dev->stats_mem.va;
+	struct ocrdma_rx_stats *rx_stats = &rdma_stats->rx_stats;
+
+	memset(stats, 0, (OCRDMA_MAX_DBGFS_MEM));
+
+	pcur = stats;
+	pcur += ocrdma_add_stat
+		(stats, pcur, "roce_frame_bytes",
+		 convert_to_64bit(rx_stats->roce_frame_bytes_lo,
+		 rx_stats->roce_frame_bytes_hi));
+	pcur += ocrdma_add_stat(stats, pcur, "roce_frame_icrc_drops",
+				(u64)rx_stats->roce_frame_icrc_drops);
+	pcur += ocrdma_add_stat(stats, pcur, "roce_frame_payload_len_drops",
+				(u64)rx_stats->roce_frame_payload_len_drops);
+	pcur += ocrdma_add_stat(stats, pcur, "ud_drops",
+				(u64)rx_stats->ud_drops);
+	pcur += ocrdma_add_stat(stats, pcur, "qp1_drops",
+				(u64)rx_stats->qp1_drops);
+	pcur += ocrdma_add_stat(stats, pcur, "psn_error_request_packets",
+				(u64)rx_stats->psn_error_request_packets);
+	pcur += ocrdma_add_stat(stats, pcur, "psn_error_resp_packets",
+				(u64)rx_stats->psn_error_resp_packets);
+	pcur += ocrdma_add_stat(stats, pcur, "rnr_nak_timeouts",
+				(u64)rx_stats->rnr_nak_timeouts);
+	pcur += ocrdma_add_stat(stats, pcur, "rnr_nak_receives",
+				(u64)rx_stats->rnr_nak_receives);
+	pcur += ocrdma_add_stat(stats, pcur, "roce_frame_rxmt_drops",
+				(u64)rx_stats->roce_frame_rxmt_drops);
+	pcur += ocrdma_add_stat(stats, pcur, "nak_count_psn_sequence_errors",
+				(u64)rx_stats->nak_count_psn_sequence_errors);
+	pcur += ocrdma_add_stat(stats, pcur, "rc_drop_count_lookup_errors",
+				(u64)rx_stats->rc_drop_count_lookup_errors);
+	pcur += ocrdma_add_stat(stats, pcur, "rq_rnr_naks",
+				(u64)rx_stats->rq_rnr_naks);
+	pcur += ocrdma_add_stat(stats, pcur, "srq_rnr_naks",
+				(u64)rx_stats->srq_rnr_naks);
+	pcur += ocrdma_add_stat(stats, pcur, "roce_frames",
+				convert_to_64bit(rx_stats->roce_frames_lo,
+						 rx_stats->roce_frames_hi));
+
+	return stats;
+}
+
+static char *ocrdma_tx_stats(struct ocrdma_dev *dev)
+{
+	char *stats = dev->stats_mem.debugfs_mem, *pcur;
+	struct ocrdma_rdma_stats_resp *rdma_stats =
+		(struct ocrdma_rdma_stats_resp *)dev->stats_mem.va;
+	struct ocrdma_tx_stats *tx_stats = &rdma_stats->tx_stats;
+
+	memset(stats, 0, (OCRDMA_MAX_DBGFS_MEM));
+
+	pcur = stats;
+	pcur += ocrdma_add_stat(stats, pcur, "send_pkts",
+				convert_to_64bit(tx_stats->send_pkts_lo,
+						 tx_stats->send_pkts_hi));
+	pcur += ocrdma_add_stat(stats, pcur, "write_pkts",
+				convert_to_64bit(tx_stats->write_pkts_lo,
+						 tx_stats->write_pkts_hi));
+	pcur += ocrdma_add_stat(stats, pcur, "read_pkts",
+				convert_to_64bit(tx_stats->read_pkts_lo,
+						 tx_stats->read_pkts_hi));
+	pcur += ocrdma_add_stat(stats, pcur, "read_rsp_pkts",
+				convert_to_64bit(tx_stats->read_rsp_pkts_lo,
+						 tx_stats->read_rsp_pkts_hi));
+	pcur += ocrdma_add_stat(stats, pcur, "ack_pkts",
+				convert_to_64bit(tx_stats->ack_pkts_lo,
+						 tx_stats->ack_pkts_hi));
+	pcur += ocrdma_add_stat(stats, pcur, "send_bytes",
+				convert_to_64bit(tx_stats->send_bytes_lo,
+						 tx_stats->send_bytes_hi));
+	pcur += ocrdma_add_stat(stats, pcur, "write_bytes",
+				convert_to_64bit(tx_stats->write_bytes_lo,
+						 tx_stats->write_bytes_hi));
+	pcur += ocrdma_add_stat(stats, pcur, "read_req_bytes",
+				convert_to_64bit(tx_stats->read_req_bytes_lo,
+						 tx_stats->read_req_bytes_hi));
+	pcur += ocrdma_add_stat(stats, pcur, "read_rsp_bytes",
+				convert_to_64bit(tx_stats->read_rsp_bytes_lo,
+						 tx_stats->read_rsp_bytes_hi));
+	pcur += ocrdma_add_stat(stats, pcur, "ack_timeouts",
+				(u64)tx_stats->ack_timeouts);
+
+	return stats;
+}
+
+static char *ocrdma_wqe_stats(struct ocrdma_dev *dev)
+{
+	char *stats = dev->stats_mem.debugfs_mem, *pcur;
+	struct ocrdma_rdma_stats_resp *rdma_stats =
+		(struct ocrdma_rdma_stats_resp *)dev->stats_mem.va;
+	struct ocrdma_wqe_stats	*wqe_stats = &rdma_stats->wqe_stats;
+
+	memset(stats, 0, (OCRDMA_MAX_DBGFS_MEM));
+
+	pcur = stats;
+	pcur += ocrdma_add_stat(stats, pcur, "large_send_rc_wqes",
+		convert_to_64bit(wqe_stats->large_send_rc_wqes_lo,
+				 wqe_stats->large_send_rc_wqes_hi));
+	pcur += ocrdma_add_stat(stats, pcur, "large_write_rc_wqes",
+		convert_to_64bit(wqe_stats->large_write_rc_wqes_lo,
+				 wqe_stats->large_write_rc_wqes_hi));
+	pcur += ocrdma_add_stat(stats, pcur, "read_wqes",
+				convert_to_64bit(wqe_stats->read_wqes_lo,
+						 wqe_stats->read_wqes_hi));
+	pcur += ocrdma_add_stat(stats, pcur, "frmr_wqes",
+				convert_to_64bit(wqe_stats->frmr_wqes_lo,
+						 wqe_stats->frmr_wqes_hi));
+	pcur += ocrdma_add_stat(stats, pcur, "mw_bind_wqes",
+				convert_to_64bit(wqe_stats->mw_bind_wqes_lo,
+						 wqe_stats->mw_bind_wqes_hi));
+	pcur += ocrdma_add_stat(stats, pcur, "invalidate_wqes",
+		convert_to_64bit(wqe_stats->invalidate_wqes_lo,
+				 wqe_stats->invalidate_wqes_hi));
+	pcur += ocrdma_add_stat(stats, pcur, "dpp_wqe_drops",
+				(u64)wqe_stats->dpp_wqe_drops);
+	return stats;
+}
+
+static char *ocrdma_db_errstats(struct ocrdma_dev *dev)
+{
+	char *stats = dev->stats_mem.debugfs_mem, *pcur;
+	struct ocrdma_rdma_stats_resp *rdma_stats =
+		(struct ocrdma_rdma_stats_resp *)dev->stats_mem.va;
+	struct ocrdma_db_err_stats *db_err_stats = &rdma_stats->db_err_stats;
+
+	memset(stats, 0, (OCRDMA_MAX_DBGFS_MEM));
+
+	pcur = stats;
+	pcur += ocrdma_add_stat(stats, pcur, "sq_doorbell_errors",
+				(u64)db_err_stats->sq_doorbell_errors);
+	pcur += ocrdma_add_stat(stats, pcur, "cq_doorbell_errors",
+				(u64)db_err_stats->cq_doorbell_errors);
+	pcur += ocrdma_add_stat(stats, pcur, "rq_srq_doorbell_errors",
+				(u64)db_err_stats->rq_srq_doorbell_errors);
+	pcur += ocrdma_add_stat(stats, pcur, "cq_overflow_errors",
+				(u64)db_err_stats->cq_overflow_errors);
+	return stats;
+}
+
+static char *ocrdma_rxqp_errstats(struct ocrdma_dev *dev)
+{
+	char *stats = dev->stats_mem.debugfs_mem, *pcur;
+	struct ocrdma_rdma_stats_resp *rdma_stats =
+		(struct ocrdma_rdma_stats_resp *)dev->stats_mem.va;
+	struct ocrdma_rx_qp_err_stats *rx_qp_err_stats =
+		 &rdma_stats->rx_qp_err_stats;
+
+	memset(stats, 0, (OCRDMA_MAX_DBGFS_MEM));
+
+	pcur = stats;
+	pcur += ocrdma_add_stat(stats, pcur, "nak_invalid_requst_errors",
+			(u64)rx_qp_err_stats->nak_invalid_requst_errors);
+	pcur += ocrdma_add_stat(stats, pcur, "nak_remote_operation_errors",
+			(u64)rx_qp_err_stats->nak_remote_operation_errors);
+	pcur += ocrdma_add_stat(stats, pcur, "nak_count_remote_access_errors",
+			(u64)rx_qp_err_stats->nak_count_remote_access_errors);
+	pcur += ocrdma_add_stat(stats, pcur, "local_length_errors",
+			(u64)rx_qp_err_stats->local_length_errors);
+	pcur += ocrdma_add_stat(stats, pcur, "local_protection_errors",
+			(u64)rx_qp_err_stats->local_protection_errors);
+	pcur += ocrdma_add_stat(stats, pcur, "local_qp_operation_errors",
+			(u64)rx_qp_err_stats->local_qp_operation_errors);
+	return stats;
+}
+
+static char *ocrdma_txqp_errstats(struct ocrdma_dev *dev)
+{
+	char *stats = dev->stats_mem.debugfs_mem, *pcur;
+	struct ocrdma_rdma_stats_resp *rdma_stats =
+		(struct ocrdma_rdma_stats_resp *)dev->stats_mem.va;
+	struct ocrdma_tx_qp_err_stats *tx_qp_err_stats =
+		&rdma_stats->tx_qp_err_stats;
+
+	memset(stats, 0, (OCRDMA_MAX_DBGFS_MEM));
+
+	pcur = stats;
+	pcur += ocrdma_add_stat(stats, pcur, "local_length_errors",
+			(u64)tx_qp_err_stats->local_length_errors);
+	pcur += ocrdma_add_stat(stats, pcur, "local_protection_errors",
+			(u64)tx_qp_err_stats->local_protection_errors);
+	pcur += ocrdma_add_stat(stats, pcur, "local_qp_operation_errors",
+			(u64)tx_qp_err_stats->local_qp_operation_errors);
+	pcur += ocrdma_add_stat(stats, pcur, "retry_count_exceeded_errors",
+			(u64)tx_qp_err_stats->retry_count_exceeded_errors);
+	pcur += ocrdma_add_stat(stats, pcur, "rnr_retry_count_exceeded_errors",
+			(u64)tx_qp_err_stats->rnr_retry_count_exceeded_errors);
+	return stats;
+}
+
+static char *ocrdma_tx_dbg_stats(struct ocrdma_dev *dev)
+{
+	int i;
+	char *pstats = dev->stats_mem.debugfs_mem;
+	struct ocrdma_rdma_stats_resp *rdma_stats =
+		(struct ocrdma_rdma_stats_resp *)dev->stats_mem.va;
+	struct ocrdma_tx_dbg_stats *tx_dbg_stats =
+		&rdma_stats->tx_dbg_stats;
+
+	memset(pstats, 0, (OCRDMA_MAX_DBGFS_MEM));
+
+	for (i = 0; i < 100; i++)
+		pstats += snprintf(pstats, 80, "DW[%d] = 0x%x\n", i,
+				 tx_dbg_stats->data[i]);
+
+	return dev->stats_mem.debugfs_mem;
+}
+
+static char *ocrdma_rx_dbg_stats(struct ocrdma_dev *dev)
+{
+	int i;
+	char *pstats = dev->stats_mem.debugfs_mem;
+	struct ocrdma_rdma_stats_resp *rdma_stats =
+		(struct ocrdma_rdma_stats_resp *)dev->stats_mem.va;
+	struct ocrdma_rx_dbg_stats *rx_dbg_stats =
+		&rdma_stats->rx_dbg_stats;
+
+	memset(pstats, 0, (OCRDMA_MAX_DBGFS_MEM));
+
+	for (i = 0; i < 200; i++)
+		pstats += snprintf(pstats, 80, "DW[%d] = 0x%x\n", i,
+				 rx_dbg_stats->data[i]);
+
+	return dev->stats_mem.debugfs_mem;
+}
+
+static void ocrdma_update_stats(struct ocrdma_dev *dev)
+{
+	ulong now = jiffies, secs;
+	int status = 0;
+
+	secs = jiffies_to_msecs(now - dev->last_stats_time) / 1000U;
+	if (secs) {
+		/* update */
+		status = ocrdma_mbx_rdma_stats(dev, false);
+		if (status)
+			pr_err("%s: stats mbox failed with status = %d\n",
+			       __func__, status);
+		dev->last_stats_time = jiffies;
+	}
+}
+
+static ssize_t ocrdma_dbgfs_ops_read(struct file *filp, char __user *buffer,
+					size_t usr_buf_len, loff_t *ppos)
+{
+	struct ocrdma_stats *pstats = filp->private_data;
+	struct ocrdma_dev *dev = pstats->dev;
+	ssize_t status = 0;
+	char *data = NULL;
+
+	/* No partial reads */
+	if (*ppos != 0)
+		return 0;
+
+	mutex_lock(&dev->stats_lock);
+
+	ocrdma_update_stats(dev);
+
+	switch (pstats->type) {
+	case OCRDMA_RSRC_STATS:
+		data = ocrdma_resource_stats(dev);
+		break;
+	case OCRDMA_RXSTATS:
+		data = ocrdma_rx_stats(dev);
+		break;
+	case OCRDMA_WQESTATS:
+		data = ocrdma_wqe_stats(dev);
+		break;
+	case OCRDMA_TXSTATS:
+		data = ocrdma_tx_stats(dev);
+		break;
+	case OCRDMA_DB_ERRSTATS:
+		data = ocrdma_db_errstats(dev);
+		break;
+	case OCRDMA_RXQP_ERRSTATS:
+		data = ocrdma_rxqp_errstats(dev);
+		break;
+	case OCRDMA_TXQP_ERRSTATS:
+		data = ocrdma_txqp_errstats(dev);
+		break;
+	case OCRDMA_TX_DBG_STATS:
+		data = ocrdma_tx_dbg_stats(dev);
+		break;
+	case OCRDMA_RX_DBG_STATS:
+		data = ocrdma_rx_dbg_stats(dev);
+		break;
+
+	default:
+		status = -EFAULT;
+		goto exit;
+	}
+
+	if (usr_buf_len < strlen(data)) {
+		status = -ENOSPC;
+		goto exit;
+	}
+
+	status = simple_read_from_buffer(buffer, usr_buf_len, ppos, data,
+					 strlen(data));
+exit:
+	mutex_unlock(&dev->stats_lock);
+	return status;
+}
+
+static const struct file_operations ocrdma_dbg_ops = {
+	.owner = THIS_MODULE,
+	.open = simple_open,
+	.read = ocrdma_dbgfs_ops_read,
+};
+
+void ocrdma_add_port_stats(struct ocrdma_dev *dev)
+{
+	if (!ocrdma_dbgfs_dir)
+		return;
+
+	/* Create post stats base dir */
+	dev->dir = debugfs_create_dir(dev->ibdev.name, ocrdma_dbgfs_dir);
+	if (!dev->dir)
+		goto err;
+
+	dev->rsrc_stats.type = OCRDMA_RSRC_STATS;
+	dev->rsrc_stats.dev = dev;
+	if (!debugfs_create_file("resource_stats", S_IRUSR, dev->dir,
+				 &dev->rsrc_stats, &ocrdma_dbg_ops))
+		goto err;
+
+	dev->rx_stats.type = OCRDMA_RXSTATS;
+	dev->rx_stats.dev = dev;
+	if (!debugfs_create_file("rx_stats", S_IRUSR, dev->dir,
+				 &dev->rx_stats, &ocrdma_dbg_ops))
+		goto err;
+
+	dev->wqe_stats.type = OCRDMA_WQESTATS;
+	dev->wqe_stats.dev = dev;
+	if (!debugfs_create_file("wqe_stats", S_IRUSR, dev->dir,
+				 &dev->wqe_stats, &ocrdma_dbg_ops))
+		goto err;
+
+	dev->tx_stats.type = OCRDMA_TXSTATS;
+	dev->tx_stats.dev = dev;
+	if (!debugfs_create_file("tx_stats", S_IRUSR, dev->dir,
+				 &dev->tx_stats, &ocrdma_dbg_ops))
+		goto err;
+
+	dev->db_err_stats.type = OCRDMA_DB_ERRSTATS;
+	dev->db_err_stats.dev = dev;
+	if (!debugfs_create_file("db_err_stats", S_IRUSR, dev->dir,
+				 &dev->db_err_stats, &ocrdma_dbg_ops))
+		goto err;
+
+
+	dev->tx_qp_err_stats.type = OCRDMA_TXQP_ERRSTATS;
+	dev->tx_qp_err_stats.dev = dev;
+	if (!debugfs_create_file("tx_qp_err_stats", S_IRUSR, dev->dir,
+				 &dev->tx_qp_err_stats, &ocrdma_dbg_ops))
+		goto err;
+
+	dev->rx_qp_err_stats.type = OCRDMA_RXQP_ERRSTATS;
+	dev->rx_qp_err_stats.dev = dev;
+	if (!debugfs_create_file("rx_qp_err_stats", S_IRUSR, dev->dir,
+				 &dev->rx_qp_err_stats, &ocrdma_dbg_ops))
+		goto err;
+
+
+	dev->tx_dbg_stats.type = OCRDMA_TX_DBG_STATS;
+	dev->tx_dbg_stats.dev = dev;
+	if (!debugfs_create_file("tx_dbg_stats", S_IRUSR, dev->dir,
+				 &dev->tx_dbg_stats, &ocrdma_dbg_ops))
+		goto err;
+
+	dev->rx_dbg_stats.type = OCRDMA_RX_DBG_STATS;
+	dev->rx_dbg_stats.dev = dev;
+	if (!debugfs_create_file("rx_dbg_stats", S_IRUSR, dev->dir,
+				 &dev->rx_dbg_stats, &ocrdma_dbg_ops))
+		goto err;
+
+	/* Now create dma_mem for stats mbx command */
+	if (!ocrdma_alloc_stats_mem(dev))
+		goto err;
+
+	mutex_init(&dev->stats_lock);
+
+	return;
+err:
+	ocrdma_release_stats_mem(dev);
+	debugfs_remove_recursive(dev->dir);
+	dev->dir = NULL;
+}
+
+void ocrdma_rem_port_stats(struct ocrdma_dev *dev)
+{
+	if (!dev->dir)
+		return;
+	mutex_destroy(&dev->stats_lock);
+	ocrdma_release_stats_mem(dev);
+	debugfs_remove(dev->dir);
+}
+
+void ocrdma_init_debugfs(void)
+{
+	/* Create base dir in debugfs root dir */
+	ocrdma_dbgfs_dir = debugfs_create_dir("ocrdma", NULL);
+}
+
+void ocrdma_rem_debugfs(void)
+{
+	debugfs_remove_recursive(ocrdma_dbgfs_dir);
+}
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_stats.h b/drivers/infiniband/hw/ocrdma/ocrdma_stats.h
new file mode 100644
index 0000000..5f5e20c
--- /dev/null
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_stats.h
@@ -0,0 +1,54 @@
+/*******************************************************************
+ * This file is part of the Emulex RoCE Device Driver for          *
+ * RoCE (RDMA over Converged Ethernet) adapters.                   *
+ * Copyright (C) 2008-2014 Emulex. All rights reserved.            *
+ * EMULEX and SLI are trademarks of Emulex.                        *
+ * www.emulex.com                                                  *
+ *                                                                 *
+ * This program is free software; you can redistribute it and/or   *
+ * modify it under the terms of version 2 of the GNU General       *
+ * Public License as published by the Free Software Foundation.    *
+ * This program is distributed in the hope that it will be useful. *
+ * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND          *
+ * WARRANTIES, INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY,  *
+ * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE      *
+ * DISCLAIMED, EXCEPT TO THE EXTENT THAT SUCH DISCLAIMERS ARE HELD *
+ * TO BE LEGALLY INVALID.  See the GNU General Public License for  *
+ * more details, a copy of which can be found in the file COPYING  *
+ * included with this package.                                     *
+ *
+ * Contact Information:
+ * linux-drivers@emulex.com
+ *
+ * Emulex
+ * 3333 Susan Street
+ * Costa Mesa, CA 92626
+ *******************************************************************/
+
+#ifndef __OCRDMA_STATS_H__
+#define __OCRDMA_STATS_H__
+
+#include <linux/debugfs.h>
+#include "ocrdma.h"
+#include "ocrdma_hw.h"
+
+#define OCRDMA_MAX_DBGFS_MEM 4096
+
+enum OCRDMA_STATS_TYPE {
+	OCRDMA_RSRC_STATS,
+	OCRDMA_RXSTATS,
+	OCRDMA_WQESTATS,
+	OCRDMA_TXSTATS,
+	OCRDMA_DB_ERRSTATS,
+	OCRDMA_RXQP_ERRSTATS,
+	OCRDMA_TXQP_ERRSTATS,
+	OCRDMA_TX_DBG_STATS,
+	OCRDMA_RX_DBG_STATS
+};
+
+void ocrdma_rem_debugfs(void);
+void ocrdma_init_debugfs(void);
+void ocrdma_rem_port_stats(struct ocrdma_dev *dev);
+void ocrdma_add_port_stats(struct ocrdma_dev *dev);
+
+#endif	/* __OCRDMA_STATS_H__ */
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
new file mode 100644
index 0000000..4c68305
--- /dev/null
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
@@ -0,0 +1,3080 @@
+/*******************************************************************
+ * This file is part of the Emulex RoCE Device Driver for          *
+ * RoCE (RDMA over Converged Ethernet) adapters.                   *
+ * Copyright (C) 2008-2012 Emulex. All rights reserved.            *
+ * EMULEX and SLI are trademarks of Emulex.                        *
+ * www.emulex.com                                                  *
+ *                                                                 *
+ * This program is free software; you can redistribute it and/or   *
+ * modify it under the terms of version 2 of the GNU General       *
+ * Public License as published by the Free Software Foundation.    *
+ * This program is distributed in the hope that it will be useful. *
+ * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND          *
+ * WARRANTIES, INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY,  *
+ * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE      *
+ * DISCLAIMED, EXCEPT TO THE EXTENT THAT SUCH DISCLAIMERS ARE HELD *
+ * TO BE LEGALLY INVALID.  See the GNU General Public License for  *
+ * more details, a copy of which can be found in the file COPYING  *
+ * included with this package.                                     *
+ *
+ * Contact Information:
+ * linux-drivers@emulex.com
+ *
+ * Emulex
+ * 3333 Susan Street
+ * Costa Mesa, CA 92626
+ *******************************************************************/
+
+#include <linux/dma-mapping.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_user_verbs.h>
+#include <rdma/iw_cm.h>
+#include <rdma/ib_umem.h>
+#include <rdma/ib_addr.h>
+
+#include "ocrdma.h"
+#include "ocrdma_hw.h"
+#include "ocrdma_verbs.h"
+#include "ocrdma_abi.h"
+
+int ocrdma_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey)
+{
+	if (index > 1)
+		return -EINVAL;
+
+	*pkey = 0xffff;
+	return 0;
+}
+
+int ocrdma_query_gid(struct ib_device *ibdev, u8 port,
+		     int index, union ib_gid *sgid)
+{
+	struct ocrdma_dev *dev;
+
+	dev = get_ocrdma_dev(ibdev);
+	memset(sgid, 0, sizeof(*sgid));
+	if (index > OCRDMA_MAX_SGID)
+		return -EINVAL;
+
+	memcpy(sgid, &dev->sgid_tbl[index], sizeof(*sgid));
+
+	return 0;
+}
+
+int ocrdma_query_device(struct ib_device *ibdev, struct ib_device_attr *attr)
+{
+	struct ocrdma_dev *dev = get_ocrdma_dev(ibdev);
+
+	memset(attr, 0, sizeof *attr);
+	memcpy(&attr->fw_ver, &dev->attr.fw_ver[0],
+	       min(sizeof(dev->attr.fw_ver), sizeof(attr->fw_ver)));
+	ocrdma_get_guid(dev, (u8 *)&attr->sys_image_guid);
+	attr->max_mr_size = dev->attr.max_mr_size;
+	attr->page_size_cap = 0xffff000;
+	attr->vendor_id = dev->nic_info.pdev->vendor;
+	attr->vendor_part_id = dev->nic_info.pdev->device;
+	attr->hw_ver = dev->asic_id;
+	attr->max_qp = dev->attr.max_qp;
+	attr->max_ah = OCRDMA_MAX_AH;
+	attr->max_qp_wr = dev->attr.max_wqe;
+
+	attr->device_cap_flags = IB_DEVICE_CURR_QP_STATE_MOD |
+					IB_DEVICE_RC_RNR_NAK_GEN |
+					IB_DEVICE_SHUTDOWN_PORT |
+					IB_DEVICE_SYS_IMAGE_GUID |
+					IB_DEVICE_LOCAL_DMA_LKEY |
+					IB_DEVICE_MEM_MGT_EXTENSIONS;
+	attr->max_sge = min(dev->attr.max_send_sge, dev->attr.max_srq_sge);
+	attr->max_sge_rd = 0;
+	attr->max_cq = dev->attr.max_cq;
+	attr->max_cqe = dev->attr.max_cqe;
+	attr->max_mr = dev->attr.max_mr;
+	attr->max_mw = dev->attr.max_mw;
+	attr->max_pd = dev->attr.max_pd;
+	attr->atomic_cap = 0;
+	attr->max_fmr = 0;
+	attr->max_map_per_fmr = 0;
+	attr->max_qp_rd_atom =
+	    min(dev->attr.max_ord_per_qp, dev->attr.max_ird_per_qp);
+	attr->max_qp_init_rd_atom = dev->attr.max_ord_per_qp;
+	attr->max_srq = dev->attr.max_srq;
+	attr->max_srq_sge = dev->attr.max_srq_sge;
+	attr->max_srq_wr = dev->attr.max_rqe;
+	attr->local_ca_ack_delay = dev->attr.local_ca_ack_delay;
+	attr->max_fast_reg_page_list_len = dev->attr.max_pages_per_frmr;
+	attr->max_pkeys = 1;
+	return 0;
+}
+
+static inline void get_link_speed_and_width(struct ocrdma_dev *dev,
+					    u8 *ib_speed, u8 *ib_width)
+{
+	int status;
+	u8 speed;
+
+	status = ocrdma_mbx_get_link_speed(dev, &speed);
+	if (status)
+		speed = OCRDMA_PHYS_LINK_SPEED_ZERO;
+
+	switch (speed) {
+	case OCRDMA_PHYS_LINK_SPEED_1GBPS:
+		*ib_speed = IB_SPEED_SDR;
+		*ib_width = IB_WIDTH_1X;
+		break;
+
+	case OCRDMA_PHYS_LINK_SPEED_10GBPS:
+		*ib_speed = IB_SPEED_QDR;
+		*ib_width = IB_WIDTH_1X;
+		break;
+
+	case OCRDMA_PHYS_LINK_SPEED_20GBPS:
+		*ib_speed = IB_SPEED_DDR;
+		*ib_width = IB_WIDTH_4X;
+		break;
+
+	case OCRDMA_PHYS_LINK_SPEED_40GBPS:
+		*ib_speed = IB_SPEED_QDR;
+		*ib_width = IB_WIDTH_4X;
+		break;
+
+	default:
+		/* Unsupported */
+		*ib_speed = IB_SPEED_SDR;
+		*ib_width = IB_WIDTH_1X;
+	}
+}
+
+int ocrdma_query_port(struct ib_device *ibdev,
+		      u8 port, struct ib_port_attr *props)
+{
+	enum ib_port_state port_state;
+	struct ocrdma_dev *dev;
+	struct net_device *netdev;
+
+	dev = get_ocrdma_dev(ibdev);
+	if (port > 1) {
+		pr_err("%s(%d) invalid_port=0x%x\n", __func__,
+		       dev->id, port);
+		return -EINVAL;
+	}
+	netdev = dev->nic_info.netdev;
+	if (netif_running(netdev) && netif_oper_up(netdev)) {
+		port_state = IB_PORT_ACTIVE;
+		props->phys_state = 5;
+	} else {
+		port_state = IB_PORT_DOWN;
+		props->phys_state = 3;
+	}
+	props->max_mtu = IB_MTU_4096;
+	props->active_mtu = iboe_get_mtu(netdev->mtu);
+	props->lid = 0;
+	props->lmc = 0;
+	props->sm_lid = 0;
+	props->sm_sl = 0;
+	props->state = port_state;
+	props->port_cap_flags =
+	    IB_PORT_CM_SUP |
+	    IB_PORT_REINIT_SUP |
+	    IB_PORT_DEVICE_MGMT_SUP | IB_PORT_VENDOR_CLASS_SUP | IB_PORT_IP_BASED_GIDS;
+	props->gid_tbl_len = OCRDMA_MAX_SGID;
+	props->pkey_tbl_len = 1;
+	props->bad_pkey_cntr = 0;
+	props->qkey_viol_cntr = 0;
+	get_link_speed_and_width(dev, &props->active_speed,
+				 &props->active_width);
+	props->max_msg_sz = 0x80000000;
+	props->max_vl_num = 4;
+	return 0;
+}
+
+int ocrdma_modify_port(struct ib_device *ibdev, u8 port, int mask,
+		       struct ib_port_modify *props)
+{
+	struct ocrdma_dev *dev;
+
+	dev = get_ocrdma_dev(ibdev);
+	if (port > 1) {
+		pr_err("%s(%d) invalid_port=0x%x\n", __func__, dev->id, port);
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static int ocrdma_add_mmap(struct ocrdma_ucontext *uctx, u64 phy_addr,
+			   unsigned long len)
+{
+	struct ocrdma_mm *mm;
+
+	mm = kzalloc(sizeof(*mm), GFP_KERNEL);
+	if (mm == NULL)
+		return -ENOMEM;
+	mm->key.phy_addr = phy_addr;
+	mm->key.len = len;
+	INIT_LIST_HEAD(&mm->entry);
+
+	mutex_lock(&uctx->mm_list_lock);
+	list_add_tail(&mm->entry, &uctx->mm_head);
+	mutex_unlock(&uctx->mm_list_lock);
+	return 0;
+}
+
+static void ocrdma_del_mmap(struct ocrdma_ucontext *uctx, u64 phy_addr,
+			    unsigned long len)
+{
+	struct ocrdma_mm *mm, *tmp;
+
+	mutex_lock(&uctx->mm_list_lock);
+	list_for_each_entry_safe(mm, tmp, &uctx->mm_head, entry) {
+		if (len != mm->key.len && phy_addr != mm->key.phy_addr)
+			continue;
+
+		list_del(&mm->entry);
+		kfree(mm);
+		break;
+	}
+	mutex_unlock(&uctx->mm_list_lock);
+}
+
+static bool ocrdma_search_mmap(struct ocrdma_ucontext *uctx, u64 phy_addr,
+			      unsigned long len)
+{
+	bool found = false;
+	struct ocrdma_mm *mm;
+
+	mutex_lock(&uctx->mm_list_lock);
+	list_for_each_entry(mm, &uctx->mm_head, entry) {
+		if (len != mm->key.len && phy_addr != mm->key.phy_addr)
+			continue;
+
+		found = true;
+		break;
+	}
+	mutex_unlock(&uctx->mm_list_lock);
+	return found;
+}
+
+static struct ocrdma_pd *_ocrdma_alloc_pd(struct ocrdma_dev *dev,
+					  struct ocrdma_ucontext *uctx,
+					  struct ib_udata *udata)
+{
+	struct ocrdma_pd *pd = NULL;
+	int status = 0;
+
+	pd = kzalloc(sizeof(*pd), GFP_KERNEL);
+	if (!pd)
+		return ERR_PTR(-ENOMEM);
+
+	if (udata && uctx) {
+		pd->dpp_enabled =
+			ocrdma_get_asic_type(dev) == OCRDMA_ASIC_GEN_SKH_R;
+		pd->num_dpp_qp =
+			pd->dpp_enabled ? (dev->nic_info.db_page_size /
+					   dev->attr.wqe_size) : 0;
+	}
+
+retry:
+	status = ocrdma_mbx_alloc_pd(dev, pd);
+	if (status) {
+		if (pd->dpp_enabled) {
+			pd->dpp_enabled = false;
+			pd->num_dpp_qp = 0;
+			goto retry;
+		} else {
+			kfree(pd);
+			return ERR_PTR(status);
+		}
+	}
+
+	return pd;
+}
+
+static inline int is_ucontext_pd(struct ocrdma_ucontext *uctx,
+				 struct ocrdma_pd *pd)
+{
+	return (uctx->cntxt_pd == pd ? true : false);
+}
+
+static int _ocrdma_dealloc_pd(struct ocrdma_dev *dev,
+			      struct ocrdma_pd *pd)
+{
+	int status = 0;
+
+	status = ocrdma_mbx_dealloc_pd(dev, pd);
+	kfree(pd);
+	return status;
+}
+
+static int ocrdma_alloc_ucontext_pd(struct ocrdma_dev *dev,
+				    struct ocrdma_ucontext *uctx,
+				    struct ib_udata *udata)
+{
+	int status = 0;
+
+	uctx->cntxt_pd = _ocrdma_alloc_pd(dev, uctx, udata);
+	if (IS_ERR(uctx->cntxt_pd)) {
+		status = PTR_ERR(uctx->cntxt_pd);
+		uctx->cntxt_pd = NULL;
+		goto err;
+	}
+
+	uctx->cntxt_pd->uctx = uctx;
+	uctx->cntxt_pd->ibpd.device = &dev->ibdev;
+err:
+	return status;
+}
+
+static int ocrdma_dealloc_ucontext_pd(struct ocrdma_ucontext *uctx)
+{
+	int status = 0;
+	struct ocrdma_pd *pd = uctx->cntxt_pd;
+	struct ocrdma_dev *dev = get_ocrdma_dev(pd->ibpd.device);
+
+	if (uctx->pd_in_use) {
+		pr_err("%s(%d) Freeing in use pdid=0x%x.\n",
+		       __func__, dev->id, pd->id);
+	}
+	uctx->cntxt_pd = NULL;
+	status = _ocrdma_dealloc_pd(dev, pd);
+	return status;
+}
+
+static struct ocrdma_pd *ocrdma_get_ucontext_pd(struct ocrdma_ucontext *uctx)
+{
+	struct ocrdma_pd *pd = NULL;
+
+	mutex_lock(&uctx->mm_list_lock);
+	if (!uctx->pd_in_use) {
+		uctx->pd_in_use = true;
+		pd = uctx->cntxt_pd;
+	}
+	mutex_unlock(&uctx->mm_list_lock);
+
+	return pd;
+}
+
+static void ocrdma_release_ucontext_pd(struct ocrdma_ucontext *uctx)
+{
+	mutex_lock(&uctx->mm_list_lock);
+	uctx->pd_in_use = false;
+	mutex_unlock(&uctx->mm_list_lock);
+}
+
+struct ib_ucontext *ocrdma_alloc_ucontext(struct ib_device *ibdev,
+					  struct ib_udata *udata)
+{
+	int status;
+	struct ocrdma_ucontext *ctx;
+	struct ocrdma_alloc_ucontext_resp resp;
+	struct ocrdma_dev *dev = get_ocrdma_dev(ibdev);
+	struct pci_dev *pdev = dev->nic_info.pdev;
+	u32 map_len = roundup(sizeof(u32) * 2048, PAGE_SIZE);
+
+	if (!udata)
+		return ERR_PTR(-EFAULT);
+	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+	if (!ctx)
+		return ERR_PTR(-ENOMEM);
+	INIT_LIST_HEAD(&ctx->mm_head);
+	mutex_init(&ctx->mm_list_lock);
+
+	ctx->ah_tbl.va = dma_alloc_coherent(&pdev->dev, map_len,
+					    &ctx->ah_tbl.pa, GFP_KERNEL);
+	if (!ctx->ah_tbl.va) {
+		kfree(ctx);
+		return ERR_PTR(-ENOMEM);
+	}
+	memset(ctx->ah_tbl.va, 0, map_len);
+	ctx->ah_tbl.len = map_len;
+
+	memset(&resp, 0, sizeof(resp));
+	resp.ah_tbl_len = ctx->ah_tbl.len;
+	resp.ah_tbl_page = virt_to_phys(ctx->ah_tbl.va);
+
+	status = ocrdma_add_mmap(ctx, resp.ah_tbl_page, resp.ah_tbl_len);
+	if (status)
+		goto map_err;
+
+	status = ocrdma_alloc_ucontext_pd(dev, ctx, udata);
+	if (status)
+		goto pd_err;
+
+	resp.dev_id = dev->id;
+	resp.max_inline_data = dev->attr.max_inline_data;
+	resp.wqe_size = dev->attr.wqe_size;
+	resp.rqe_size = dev->attr.rqe_size;
+	resp.dpp_wqe_size = dev->attr.wqe_size;
+
+	memcpy(resp.fw_ver, dev->attr.fw_ver, sizeof(resp.fw_ver));
+	status = ib_copy_to_udata(udata, &resp, sizeof(resp));
+	if (status)
+		goto cpy_err;
+	return &ctx->ibucontext;
+
+cpy_err:
+pd_err:
+	ocrdma_del_mmap(ctx, ctx->ah_tbl.pa, ctx->ah_tbl.len);
+map_err:
+	dma_free_coherent(&pdev->dev, ctx->ah_tbl.len, ctx->ah_tbl.va,
+			  ctx->ah_tbl.pa);
+	kfree(ctx);
+	return ERR_PTR(status);
+}
+
+int ocrdma_dealloc_ucontext(struct ib_ucontext *ibctx)
+{
+	int status = 0;
+	struct ocrdma_mm *mm, *tmp;
+	struct ocrdma_ucontext *uctx = get_ocrdma_ucontext(ibctx);
+	struct ocrdma_dev *dev = get_ocrdma_dev(ibctx->device);
+	struct pci_dev *pdev = dev->nic_info.pdev;
+
+	status = ocrdma_dealloc_ucontext_pd(uctx);
+
+	ocrdma_del_mmap(uctx, uctx->ah_tbl.pa, uctx->ah_tbl.len);
+	dma_free_coherent(&pdev->dev, uctx->ah_tbl.len, uctx->ah_tbl.va,
+			  uctx->ah_tbl.pa);
+
+	list_for_each_entry_safe(mm, tmp, &uctx->mm_head, entry) {
+		list_del(&mm->entry);
+		kfree(mm);
+	}
+	kfree(uctx);
+	return status;
+}
+
+int ocrdma_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
+{
+	struct ocrdma_ucontext *ucontext = get_ocrdma_ucontext(context);
+	struct ocrdma_dev *dev = get_ocrdma_dev(context->device);
+	unsigned long vm_page = vma->vm_pgoff << PAGE_SHIFT;
+	u64 unmapped_db = (u64) dev->nic_info.unmapped_db;
+	unsigned long len = (vma->vm_end - vma->vm_start);
+	int status = 0;
+	bool found;
+
+	if (vma->vm_start & (PAGE_SIZE - 1))
+		return -EINVAL;
+	found = ocrdma_search_mmap(ucontext, vma->vm_pgoff << PAGE_SHIFT, len);
+	if (!found)
+		return -EINVAL;
+
+	if ((vm_page >= unmapped_db) && (vm_page <= (unmapped_db +
+		dev->nic_info.db_total_size)) &&
+		(len <=	dev->nic_info.db_page_size)) {
+		if (vma->vm_flags & VM_READ)
+			return -EPERM;
+
+		vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+		status = io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
+					    len, vma->vm_page_prot);
+	} else if (dev->nic_info.dpp_unmapped_len &&
+		(vm_page >= (u64) dev->nic_info.dpp_unmapped_addr) &&
+		(vm_page <= (u64) (dev->nic_info.dpp_unmapped_addr +
+			dev->nic_info.dpp_unmapped_len)) &&
+		(len <= dev->nic_info.dpp_unmapped_len)) {
+		if (vma->vm_flags & VM_READ)
+			return -EPERM;
+
+		vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
+		status = io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
+					    len, vma->vm_page_prot);
+	} else {
+		status = remap_pfn_range(vma, vma->vm_start,
+					 vma->vm_pgoff, len, vma->vm_page_prot);
+	}
+	return status;
+}
+
+static int ocrdma_copy_pd_uresp(struct ocrdma_dev *dev, struct ocrdma_pd *pd,
+				struct ib_ucontext *ib_ctx,
+				struct ib_udata *udata)
+{
+	int status;
+	u64 db_page_addr;
+	u64 dpp_page_addr = 0;
+	u32 db_page_size;
+	struct ocrdma_alloc_pd_uresp rsp;
+	struct ocrdma_ucontext *uctx = get_ocrdma_ucontext(ib_ctx);
+
+	memset(&rsp, 0, sizeof(rsp));
+	rsp.id = pd->id;
+	rsp.dpp_enabled = pd->dpp_enabled;
+	db_page_addr = ocrdma_get_db_addr(dev, pd->id);
+	db_page_size = dev->nic_info.db_page_size;
+
+	status = ocrdma_add_mmap(uctx, db_page_addr, db_page_size);
+	if (status)
+		return status;
+
+	if (pd->dpp_enabled) {
+		dpp_page_addr = dev->nic_info.dpp_unmapped_addr +
+				(pd->id * PAGE_SIZE);
+		status = ocrdma_add_mmap(uctx, dpp_page_addr,
+				 PAGE_SIZE);
+		if (status)
+			goto dpp_map_err;
+		rsp.dpp_page_addr_hi = upper_32_bits(dpp_page_addr);
+		rsp.dpp_page_addr_lo = dpp_page_addr;
+	}
+
+	status = ib_copy_to_udata(udata, &rsp, sizeof(rsp));
+	if (status)
+		goto ucopy_err;
+
+	pd->uctx = uctx;
+	return 0;
+
+ucopy_err:
+	if (pd->dpp_enabled)
+		ocrdma_del_mmap(pd->uctx, dpp_page_addr, PAGE_SIZE);
+dpp_map_err:
+	ocrdma_del_mmap(pd->uctx, db_page_addr, db_page_size);
+	return status;
+}
+
+struct ib_pd *ocrdma_alloc_pd(struct ib_device *ibdev,
+			      struct ib_ucontext *context,
+			      struct ib_udata *udata)
+{
+	struct ocrdma_dev *dev = get_ocrdma_dev(ibdev);
+	struct ocrdma_pd *pd;
+	struct ocrdma_ucontext *uctx = NULL;
+	int status;
+	u8 is_uctx_pd = false;
+
+	if (udata && context) {
+		uctx = get_ocrdma_ucontext(context);
+		pd = ocrdma_get_ucontext_pd(uctx);
+		if (pd) {
+			is_uctx_pd = true;
+			goto pd_mapping;
+		}
+	}
+
+	pd = _ocrdma_alloc_pd(dev, uctx, udata);
+	if (IS_ERR(pd)) {
+		status = PTR_ERR(pd);
+		goto exit;
+	}
+
+pd_mapping:
+	if (udata && context) {
+		status = ocrdma_copy_pd_uresp(dev, pd, context, udata);
+		if (status)
+			goto err;
+	}
+	return &pd->ibpd;
+
+err:
+	if (is_uctx_pd) {
+		ocrdma_release_ucontext_pd(uctx);
+	} else {
+		status = ocrdma_mbx_dealloc_pd(dev, pd);
+		kfree(pd);
+	}
+exit:
+	return ERR_PTR(status);
+}
+
+int ocrdma_dealloc_pd(struct ib_pd *ibpd)
+{
+	struct ocrdma_pd *pd = get_ocrdma_pd(ibpd);
+	struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device);
+	struct ocrdma_ucontext *uctx = NULL;
+	int status = 0;
+	u64 usr_db;
+
+	uctx = pd->uctx;
+	if (uctx) {
+		u64 dpp_db = dev->nic_info.dpp_unmapped_addr +
+			(pd->id * PAGE_SIZE);
+		if (pd->dpp_enabled)
+			ocrdma_del_mmap(pd->uctx, dpp_db, PAGE_SIZE);
+		usr_db = ocrdma_get_db_addr(dev, pd->id);
+		ocrdma_del_mmap(pd->uctx, usr_db, dev->nic_info.db_page_size);
+
+		if (is_ucontext_pd(uctx, pd)) {
+			ocrdma_release_ucontext_pd(uctx);
+			return status;
+		}
+	}
+	status = _ocrdma_dealloc_pd(dev, pd);
+	return status;
+}
+
+static int ocrdma_alloc_lkey(struct ocrdma_dev *dev, struct ocrdma_mr *mr,
+			    u32 pdid, int acc, u32 num_pbls, u32 addr_check)
+{
+	int status;
+
+	mr->hwmr.fr_mr = 0;
+	mr->hwmr.local_rd = 1;
+	mr->hwmr.remote_rd = (acc & IB_ACCESS_REMOTE_READ) ? 1 : 0;
+	mr->hwmr.remote_wr = (acc & IB_ACCESS_REMOTE_WRITE) ? 1 : 0;
+	mr->hwmr.local_wr = (acc & IB_ACCESS_LOCAL_WRITE) ? 1 : 0;
+	mr->hwmr.mw_bind = (acc & IB_ACCESS_MW_BIND) ? 1 : 0;
+	mr->hwmr.remote_atomic = (acc & IB_ACCESS_REMOTE_ATOMIC) ? 1 : 0;
+	mr->hwmr.num_pbls = num_pbls;
+
+	status = ocrdma_mbx_alloc_lkey(dev, &mr->hwmr, pdid, addr_check);
+	if (status)
+		return status;
+
+	mr->ibmr.lkey = mr->hwmr.lkey;
+	if (mr->hwmr.remote_wr || mr->hwmr.remote_rd)
+		mr->ibmr.rkey = mr->hwmr.lkey;
+	return 0;
+}
+
+struct ib_mr *ocrdma_get_dma_mr(struct ib_pd *ibpd, int acc)
+{
+	int status;
+	struct ocrdma_mr *mr;
+	struct ocrdma_pd *pd = get_ocrdma_pd(ibpd);
+	struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device);
+
+	if (acc & IB_ACCESS_REMOTE_WRITE && !(acc & IB_ACCESS_LOCAL_WRITE)) {
+		pr_err("%s err, invalid access rights\n", __func__);
+		return ERR_PTR(-EINVAL);
+	}
+
+	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+	if (!mr)
+		return ERR_PTR(-ENOMEM);
+
+	status = ocrdma_alloc_lkey(dev, mr, pd->id, acc, 0,
+				   OCRDMA_ADDR_CHECK_DISABLE);
+	if (status) {
+		kfree(mr);
+		return ERR_PTR(status);
+	}
+
+	return &mr->ibmr;
+}
+
+static void ocrdma_free_mr_pbl_tbl(struct ocrdma_dev *dev,
+				   struct ocrdma_hw_mr *mr)
+{
+	struct pci_dev *pdev = dev->nic_info.pdev;
+	int i = 0;
+
+	if (mr->pbl_table) {
+		for (i = 0; i < mr->num_pbls; i++) {
+			if (!mr->pbl_table[i].va)
+				continue;
+			dma_free_coherent(&pdev->dev, mr->pbl_size,
+					  mr->pbl_table[i].va,
+					  mr->pbl_table[i].pa);
+		}
+		kfree(mr->pbl_table);
+		mr->pbl_table = NULL;
+	}
+}
+
+static int ocrdma_get_pbl_info(struct ocrdma_dev *dev, struct ocrdma_mr *mr,
+			      u32 num_pbes)
+{
+	u32 num_pbls = 0;
+	u32 idx = 0;
+	int status = 0;
+	u32 pbl_size;
+
+	do {
+		pbl_size = OCRDMA_MIN_HPAGE_SIZE * (1 << idx);
+		if (pbl_size > MAX_OCRDMA_PBL_SIZE) {
+			status = -EFAULT;
+			break;
+		}
+		num_pbls = roundup(num_pbes, (pbl_size / sizeof(u64)));
+		num_pbls = num_pbls / (pbl_size / sizeof(u64));
+		idx++;
+	} while (num_pbls >= dev->attr.max_num_mr_pbl);
+
+	mr->hwmr.num_pbes = num_pbes;
+	mr->hwmr.num_pbls = num_pbls;
+	mr->hwmr.pbl_size = pbl_size;
+	return status;
+}
+
+static int ocrdma_build_pbl_tbl(struct ocrdma_dev *dev, struct ocrdma_hw_mr *mr)
+{
+	int status = 0;
+	int i;
+	u32 dma_len = mr->pbl_size;
+	struct pci_dev *pdev = dev->nic_info.pdev;
+	void *va;
+	dma_addr_t pa;
+
+	mr->pbl_table = kzalloc(sizeof(struct ocrdma_pbl) *
+				mr->num_pbls, GFP_KERNEL);
+
+	if (!mr->pbl_table)
+		return -ENOMEM;
+
+	for (i = 0; i < mr->num_pbls; i++) {
+		va = dma_alloc_coherent(&pdev->dev, dma_len, &pa, GFP_KERNEL);
+		if (!va) {
+			ocrdma_free_mr_pbl_tbl(dev, mr);
+			status = -ENOMEM;
+			break;
+		}
+		memset(va, 0, dma_len);
+		mr->pbl_table[i].va = va;
+		mr->pbl_table[i].pa = pa;
+	}
+	return status;
+}
+
+static void build_user_pbes(struct ocrdma_dev *dev, struct ocrdma_mr *mr,
+			    u32 num_pbes)
+{
+	struct ocrdma_pbe *pbe;
+	struct scatterlist *sg;
+	struct ocrdma_pbl *pbl_tbl = mr->hwmr.pbl_table;
+	struct ib_umem *umem = mr->umem;
+	int shift, pg_cnt, pages, pbe_cnt, entry, total_num_pbes = 0;
+
+	if (!mr->hwmr.num_pbes)
+		return;
+
+	pbe = (struct ocrdma_pbe *)pbl_tbl->va;
+	pbe_cnt = 0;
+
+	shift = ilog2(umem->page_size);
+
+	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
+		pages = sg_dma_len(sg) >> shift;
+		for (pg_cnt = 0; pg_cnt < pages; pg_cnt++) {
+			/* store the page address in pbe */
+			pbe->pa_lo =
+			    cpu_to_le32(sg_dma_address
+					(sg) +
+					(umem->page_size * pg_cnt));
+			pbe->pa_hi =
+			    cpu_to_le32(upper_32_bits
+					((sg_dma_address
+					  (sg) +
+					  umem->page_size * pg_cnt)));
+			pbe_cnt += 1;
+			total_num_pbes += 1;
+			pbe++;
+
+			/* if done building pbes, issue the mbx cmd. */
+			if (total_num_pbes == num_pbes)
+				return;
+
+			/* if the given pbl is full storing the pbes,
+			 * move to next pbl.
+			 */
+			if (pbe_cnt ==
+				(mr->hwmr.pbl_size / sizeof(u64))) {
+				pbl_tbl++;
+				pbe = (struct ocrdma_pbe *)pbl_tbl->va;
+				pbe_cnt = 0;
+			}
+
+		}
+	}
+}
+
+struct ib_mr *ocrdma_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 len,
+				 u64 usr_addr, int acc, struct ib_udata *udata)
+{
+	int status = -ENOMEM;
+	struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device);
+	struct ocrdma_mr *mr;
+	struct ocrdma_pd *pd;
+	u32 num_pbes;
+
+	pd = get_ocrdma_pd(ibpd);
+
+	if (acc & IB_ACCESS_REMOTE_WRITE && !(acc & IB_ACCESS_LOCAL_WRITE))
+		return ERR_PTR(-EINVAL);
+
+	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+	if (!mr)
+		return ERR_PTR(status);
+	mr->umem = ib_umem_get(ibpd->uobject->context, start, len, acc, 0);
+	if (IS_ERR(mr->umem)) {
+		status = -EFAULT;
+		goto umem_err;
+	}
+	num_pbes = ib_umem_page_count(mr->umem);
+	status = ocrdma_get_pbl_info(dev, mr, num_pbes);
+	if (status)
+		goto umem_err;
+
+	mr->hwmr.pbe_size = mr->umem->page_size;
+	mr->hwmr.fbo = mr->umem->offset;
+	mr->hwmr.va = usr_addr;
+	mr->hwmr.len = len;
+	mr->hwmr.remote_wr = (acc & IB_ACCESS_REMOTE_WRITE) ? 1 : 0;
+	mr->hwmr.remote_rd = (acc & IB_ACCESS_REMOTE_READ) ? 1 : 0;
+	mr->hwmr.local_wr = (acc & IB_ACCESS_LOCAL_WRITE) ? 1 : 0;
+	mr->hwmr.local_rd = 1;
+	mr->hwmr.remote_atomic = (acc & IB_ACCESS_REMOTE_ATOMIC) ? 1 : 0;
+	status = ocrdma_build_pbl_tbl(dev, &mr->hwmr);
+	if (status)
+		goto umem_err;
+	build_user_pbes(dev, mr, num_pbes);
+	status = ocrdma_reg_mr(dev, &mr->hwmr, pd->id, acc);
+	if (status)
+		goto mbx_err;
+	mr->ibmr.lkey = mr->hwmr.lkey;
+	if (mr->hwmr.remote_wr || mr->hwmr.remote_rd)
+		mr->ibmr.rkey = mr->hwmr.lkey;
+
+	return &mr->ibmr;
+
+mbx_err:
+	ocrdma_free_mr_pbl_tbl(dev, &mr->hwmr);
+umem_err:
+	kfree(mr);
+	return ERR_PTR(status);
+}
+
+int ocrdma_dereg_mr(struct ib_mr *ib_mr)
+{
+	struct ocrdma_mr *mr = get_ocrdma_mr(ib_mr);
+	struct ocrdma_dev *dev = get_ocrdma_dev(ib_mr->device);
+	int status;
+
+	status = ocrdma_mbx_dealloc_lkey(dev, mr->hwmr.fr_mr, mr->hwmr.lkey);
+
+	ocrdma_free_mr_pbl_tbl(dev, &mr->hwmr);
+
+	/* it could be user registered memory. */
+	if (mr->umem)
+		ib_umem_release(mr->umem);
+	kfree(mr);
+
+	/* Don't stop cleanup, in case FW is unresponsive */
+	if (dev->mqe_ctx.fw_error_state) {
+		status = 0;
+		pr_err("%s(%d) fw not responding.\n",
+		       __func__, dev->id);
+	}
+	return status;
+}
+
+static int ocrdma_copy_cq_uresp(struct ocrdma_dev *dev, struct ocrdma_cq *cq,
+				struct ib_udata *udata,
+				struct ib_ucontext *ib_ctx)
+{
+	int status;
+	struct ocrdma_ucontext *uctx = get_ocrdma_ucontext(ib_ctx);
+	struct ocrdma_create_cq_uresp uresp;
+
+	memset(&uresp, 0, sizeof(uresp));
+	uresp.cq_id = cq->id;
+	uresp.page_size = PAGE_ALIGN(cq->len);
+	uresp.num_pages = 1;
+	uresp.max_hw_cqe = cq->max_hw_cqe;
+	uresp.page_addr[0] = virt_to_phys(cq->va);
+	uresp.db_page_addr =  ocrdma_get_db_addr(dev, uctx->cntxt_pd->id);
+	uresp.db_page_size = dev->nic_info.db_page_size;
+	uresp.phase_change = cq->phase_change ? 1 : 0;
+	status = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
+	if (status) {
+		pr_err("%s(%d) copy error cqid=0x%x.\n",
+		       __func__, dev->id, cq->id);
+		goto err;
+	}
+	status = ocrdma_add_mmap(uctx, uresp.db_page_addr, uresp.db_page_size);
+	if (status)
+		goto err;
+	status = ocrdma_add_mmap(uctx, uresp.page_addr[0], uresp.page_size);
+	if (status) {
+		ocrdma_del_mmap(uctx, uresp.db_page_addr, uresp.db_page_size);
+		goto err;
+	}
+	cq->ucontext = uctx;
+err:
+	return status;
+}
+
+struct ib_cq *ocrdma_create_cq(struct ib_device *ibdev, int entries, int vector,
+			       struct ib_ucontext *ib_ctx,
+			       struct ib_udata *udata)
+{
+	struct ocrdma_cq *cq;
+	struct ocrdma_dev *dev = get_ocrdma_dev(ibdev);
+	struct ocrdma_ucontext *uctx = NULL;
+	u16 pd_id = 0;
+	int status;
+	struct ocrdma_create_cq_ureq ureq;
+
+	if (udata) {
+		if (ib_copy_from_udata(&ureq, udata, sizeof(ureq)))
+			return ERR_PTR(-EFAULT);
+	} else
+		ureq.dpp_cq = 0;
+	cq = kzalloc(sizeof(*cq), GFP_KERNEL);
+	if (!cq)
+		return ERR_PTR(-ENOMEM);
+
+	spin_lock_init(&cq->cq_lock);
+	spin_lock_init(&cq->comp_handler_lock);
+	INIT_LIST_HEAD(&cq->sq_head);
+	INIT_LIST_HEAD(&cq->rq_head);
+	cq->first_arm = true;
+
+	if (ib_ctx) {
+		uctx = get_ocrdma_ucontext(ib_ctx);
+		pd_id = uctx->cntxt_pd->id;
+	}
+
+	status = ocrdma_mbx_create_cq(dev, cq, entries, ureq.dpp_cq, pd_id);
+	if (status) {
+		kfree(cq);
+		return ERR_PTR(status);
+	}
+	if (ib_ctx) {
+		status = ocrdma_copy_cq_uresp(dev, cq, udata, ib_ctx);
+		if (status)
+			goto ctx_err;
+	}
+	cq->phase = OCRDMA_CQE_VALID;
+	dev->cq_tbl[cq->id] = cq;
+	return &cq->ibcq;
+
+ctx_err:
+	ocrdma_mbx_destroy_cq(dev, cq);
+	kfree(cq);
+	return ERR_PTR(status);
+}
+
+int ocrdma_resize_cq(struct ib_cq *ibcq, int new_cnt,
+		     struct ib_udata *udata)
+{
+	int status = 0;
+	struct ocrdma_cq *cq = get_ocrdma_cq(ibcq);
+
+	if (new_cnt < 1 || new_cnt > cq->max_hw_cqe) {
+		status = -EINVAL;
+		return status;
+	}
+	ibcq->cqe = new_cnt;
+	return status;
+}
+
+static void ocrdma_flush_cq(struct ocrdma_cq *cq)
+{
+	int cqe_cnt;
+	int valid_count = 0;
+	unsigned long flags;
+
+	struct ocrdma_dev *dev = get_ocrdma_dev(cq->ibcq.device);
+	struct ocrdma_cqe *cqe = NULL;
+
+	cqe = cq->va;
+	cqe_cnt = cq->cqe_cnt;
+
+	/* Last irq might have scheduled a polling thread
+	 * sync-up with it before hard flushing.
+	 */
+	spin_lock_irqsave(&cq->cq_lock, flags);
+	while (cqe_cnt) {
+		if (is_cqe_valid(cq, cqe))
+			valid_count++;
+		cqe++;
+		cqe_cnt--;
+	}
+	ocrdma_ring_cq_db(dev, cq->id, false, false, valid_count);
+	spin_unlock_irqrestore(&cq->cq_lock, flags);
+}
+
+int ocrdma_destroy_cq(struct ib_cq *ibcq)
+{
+	int status;
+	struct ocrdma_cq *cq = get_ocrdma_cq(ibcq);
+	struct ocrdma_eq *eq = NULL;
+	struct ocrdma_dev *dev = get_ocrdma_dev(ibcq->device);
+	int pdid = 0;
+	u32 irq, indx;
+
+	dev->cq_tbl[cq->id] = NULL;
+	indx = ocrdma_get_eq_table_index(dev, cq->eqn);
+	if (indx == -EINVAL)
+		BUG();
+
+	eq = &dev->eq_tbl[indx];
+	irq = ocrdma_get_irq(dev, eq);
+	synchronize_irq(irq);
+	ocrdma_flush_cq(cq);
+
+	status = ocrdma_mbx_destroy_cq(dev, cq);
+	if (cq->ucontext) {
+		pdid = cq->ucontext->cntxt_pd->id;
+		ocrdma_del_mmap(cq->ucontext, (u64) cq->pa,
+				PAGE_ALIGN(cq->len));
+		ocrdma_del_mmap(cq->ucontext,
+				ocrdma_get_db_addr(dev, pdid),
+				dev->nic_info.db_page_size);
+	}
+
+	kfree(cq);
+	return status;
+}
+
+static int ocrdma_add_qpn_map(struct ocrdma_dev *dev, struct ocrdma_qp *qp)
+{
+	int status = -EINVAL;
+
+	if (qp->id < OCRDMA_MAX_QP && dev->qp_tbl[qp->id] == NULL) {
+		dev->qp_tbl[qp->id] = qp;
+		status = 0;
+	}
+	return status;
+}
+
+static void ocrdma_del_qpn_map(struct ocrdma_dev *dev, struct ocrdma_qp *qp)
+{
+	dev->qp_tbl[qp->id] = NULL;
+}
+
+static int ocrdma_check_qp_params(struct ib_pd *ibpd, struct ocrdma_dev *dev,
+				  struct ib_qp_init_attr *attrs)
+{
+	if ((attrs->qp_type != IB_QPT_GSI) &&
+	    (attrs->qp_type != IB_QPT_RC) &&
+	    (attrs->qp_type != IB_QPT_UC) &&
+	    (attrs->qp_type != IB_QPT_UD)) {
+		pr_err("%s(%d) unsupported qp type=0x%x requested\n",
+		       __func__, dev->id, attrs->qp_type);
+		return -EINVAL;
+	}
+	/* Skip the check for QP1 to support CM size of 128 */
+	if ((attrs->qp_type != IB_QPT_GSI) &&
+	    (attrs->cap.max_send_wr > dev->attr.max_wqe)) {
+		pr_err("%s(%d) unsupported send_wr=0x%x requested\n",
+		       __func__, dev->id, attrs->cap.max_send_wr);
+		pr_err("%s(%d) supported send_wr=0x%x\n",
+		       __func__, dev->id, dev->attr.max_wqe);
+		return -EINVAL;
+	}
+	if (!attrs->srq && (attrs->cap.max_recv_wr > dev->attr.max_rqe)) {
+		pr_err("%s(%d) unsupported recv_wr=0x%x requested\n",
+		       __func__, dev->id, attrs->cap.max_recv_wr);
+		pr_err("%s(%d) supported recv_wr=0x%x\n",
+		       __func__, dev->id, dev->attr.max_rqe);
+		return -EINVAL;
+	}
+	if (attrs->cap.max_inline_data > dev->attr.max_inline_data) {
+		pr_err("%s(%d) unsupported inline data size=0x%x requested\n",
+		       __func__, dev->id, attrs->cap.max_inline_data);
+		pr_err("%s(%d) supported inline data size=0x%x\n",
+		       __func__, dev->id, dev->attr.max_inline_data);
+		return -EINVAL;
+	}
+	if (attrs->cap.max_send_sge > dev->attr.max_send_sge) {
+		pr_err("%s(%d) unsupported send_sge=0x%x requested\n",
+		       __func__, dev->id, attrs->cap.max_send_sge);
+		pr_err("%s(%d) supported send_sge=0x%x\n",
+		       __func__, dev->id, dev->attr.max_send_sge);
+		return -EINVAL;
+	}
+	if (attrs->cap.max_recv_sge > dev->attr.max_recv_sge) {
+		pr_err("%s(%d) unsupported recv_sge=0x%x requested\n",
+		       __func__, dev->id, attrs->cap.max_recv_sge);
+		pr_err("%s(%d) supported recv_sge=0x%x\n",
+		       __func__, dev->id, dev->attr.max_recv_sge);
+		return -EINVAL;
+	}
+	/* unprivileged user space cannot create special QP */
+	if (ibpd->uobject && attrs->qp_type == IB_QPT_GSI) {
+		pr_err
+		    ("%s(%d) Userspace can't create special QPs of type=0x%x\n",
+		     __func__, dev->id, attrs->qp_type);
+		return -EINVAL;
+	}
+	/* allow creating only one GSI type of QP */
+	if (attrs->qp_type == IB_QPT_GSI && dev->gsi_qp_created) {
+		pr_err("%s(%d) GSI special QPs already created.\n",
+		       __func__, dev->id);
+		return -EINVAL;
+	}
+	/* verify consumer QPs are not trying to use GSI QP's CQ */
+	if ((attrs->qp_type != IB_QPT_GSI) && (dev->gsi_qp_created)) {
+		if ((dev->gsi_sqcq == get_ocrdma_cq(attrs->send_cq)) ||
+			(dev->gsi_rqcq == get_ocrdma_cq(attrs->recv_cq))) {
+			pr_err("%s(%d) Consumer QP cannot use GSI CQs.\n",
+				__func__, dev->id);
+			return -EINVAL;
+		}
+	}
+	return 0;
+}
+
+static int ocrdma_copy_qp_uresp(struct ocrdma_qp *qp,
+				struct ib_udata *udata, int dpp_offset,
+				int dpp_credit_lmt, int srq)
+{
+	int status = 0;
+	u64 usr_db;
+	struct ocrdma_create_qp_uresp uresp;
+	struct ocrdma_dev *dev = qp->dev;
+	struct ocrdma_pd *pd = qp->pd;
+
+	memset(&uresp, 0, sizeof(uresp));
+	usr_db = dev->nic_info.unmapped_db +
+			(pd->id * dev->nic_info.db_page_size);
+	uresp.qp_id = qp->id;
+	uresp.sq_dbid = qp->sq.dbid;
+	uresp.num_sq_pages = 1;
+	uresp.sq_page_size = PAGE_ALIGN(qp->sq.len);
+	uresp.sq_page_addr[0] = virt_to_phys(qp->sq.va);
+	uresp.num_wqe_allocated = qp->sq.max_cnt;
+	if (!srq) {
+		uresp.rq_dbid = qp->rq.dbid;
+		uresp.num_rq_pages = 1;
+		uresp.rq_page_size = PAGE_ALIGN(qp->rq.len);
+		uresp.rq_page_addr[0] = virt_to_phys(qp->rq.va);
+		uresp.num_rqe_allocated = qp->rq.max_cnt;
+	}
+	uresp.db_page_addr = usr_db;
+	uresp.db_page_size = dev->nic_info.db_page_size;
+	uresp.db_sq_offset = OCRDMA_DB_GEN2_SQ_OFFSET;
+	uresp.db_rq_offset = OCRDMA_DB_GEN2_RQ_OFFSET;
+	uresp.db_shift = OCRDMA_DB_RQ_SHIFT;
+
+	if (qp->dpp_enabled) {
+		uresp.dpp_credit = dpp_credit_lmt;
+		uresp.dpp_offset = dpp_offset;
+	}
+	status = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
+	if (status) {
+		pr_err("%s(%d) user copy error.\n", __func__, dev->id);
+		goto err;
+	}
+	status = ocrdma_add_mmap(pd->uctx, uresp.sq_page_addr[0],
+				 uresp.sq_page_size);
+	if (status)
+		goto err;
+
+	if (!srq) {
+		status = ocrdma_add_mmap(pd->uctx, uresp.rq_page_addr[0],
+					 uresp.rq_page_size);
+		if (status)
+			goto rq_map_err;
+	}
+	return status;
+rq_map_err:
+	ocrdma_del_mmap(pd->uctx, uresp.sq_page_addr[0], uresp.sq_page_size);
+err:
+	return status;
+}
+
+static void ocrdma_set_qp_db(struct ocrdma_dev *dev, struct ocrdma_qp *qp,
+			     struct ocrdma_pd *pd)
+{
+	if (ocrdma_get_asic_type(dev) == OCRDMA_ASIC_GEN_SKH_R) {
+		qp->sq_db = dev->nic_info.db +
+			(pd->id * dev->nic_info.db_page_size) +
+			OCRDMA_DB_GEN2_SQ_OFFSET;
+		qp->rq_db = dev->nic_info.db +
+			(pd->id * dev->nic_info.db_page_size) +
+			OCRDMA_DB_GEN2_RQ_OFFSET;
+	} else {
+		qp->sq_db = dev->nic_info.db +
+			(pd->id * dev->nic_info.db_page_size) +
+			OCRDMA_DB_SQ_OFFSET;
+		qp->rq_db = dev->nic_info.db +
+			(pd->id * dev->nic_info.db_page_size) +
+			OCRDMA_DB_RQ_OFFSET;
+	}
+}
+
+static int ocrdma_alloc_wr_id_tbl(struct ocrdma_qp *qp)
+{
+	qp->wqe_wr_id_tbl =
+	    kzalloc(sizeof(*(qp->wqe_wr_id_tbl)) * qp->sq.max_cnt,
+		    GFP_KERNEL);
+	if (qp->wqe_wr_id_tbl == NULL)
+		return -ENOMEM;
+	qp->rqe_wr_id_tbl =
+	    kzalloc(sizeof(u64) * qp->rq.max_cnt, GFP_KERNEL);
+	if (qp->rqe_wr_id_tbl == NULL)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static void ocrdma_set_qp_init_params(struct ocrdma_qp *qp,
+				      struct ocrdma_pd *pd,
+				      struct ib_qp_init_attr *attrs)
+{
+	qp->pd = pd;
+	spin_lock_init(&qp->q_lock);
+	INIT_LIST_HEAD(&qp->sq_entry);
+	INIT_LIST_HEAD(&qp->rq_entry);
+
+	qp->qp_type = attrs->qp_type;
+	qp->cap_flags = OCRDMA_QP_INB_RD | OCRDMA_QP_INB_WR;
+	qp->max_inline_data = attrs->cap.max_inline_data;
+	qp->sq.max_sges = attrs->cap.max_send_sge;
+	qp->rq.max_sges = attrs->cap.max_recv_sge;
+	qp->state = OCRDMA_QPS_RST;
+	qp->signaled = (attrs->sq_sig_type == IB_SIGNAL_ALL_WR) ? true : false;
+}
+
+static void ocrdma_store_gsi_qp_cq(struct ocrdma_dev *dev,
+				   struct ib_qp_init_attr *attrs)
+{
+	if (attrs->qp_type == IB_QPT_GSI) {
+		dev->gsi_qp_created = 1;
+		dev->gsi_sqcq = get_ocrdma_cq(attrs->send_cq);
+		dev->gsi_rqcq = get_ocrdma_cq(attrs->recv_cq);
+	}
+}
+
+struct ib_qp *ocrdma_create_qp(struct ib_pd *ibpd,
+			       struct ib_qp_init_attr *attrs,
+			       struct ib_udata *udata)
+{
+	int status;
+	struct ocrdma_pd *pd = get_ocrdma_pd(ibpd);
+	struct ocrdma_qp *qp;
+	struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device);
+	struct ocrdma_create_qp_ureq ureq;
+	u16 dpp_credit_lmt, dpp_offset;
+
+	status = ocrdma_check_qp_params(ibpd, dev, attrs);
+	if (status)
+		goto gen_err;
+
+	memset(&ureq, 0, sizeof(ureq));
+	if (udata) {
+		if (ib_copy_from_udata(&ureq, udata, sizeof(ureq)))
+			return ERR_PTR(-EFAULT);
+	}
+	qp = kzalloc(sizeof(*qp), GFP_KERNEL);
+	if (!qp) {
+		status = -ENOMEM;
+		goto gen_err;
+	}
+	qp->dev = dev;
+	ocrdma_set_qp_init_params(qp, pd, attrs);
+	if (udata == NULL)
+		qp->cap_flags |= (OCRDMA_QP_MW_BIND | OCRDMA_QP_LKEY0 |
+					OCRDMA_QP_FAST_REG);
+
+	mutex_lock(&dev->dev_lock);
+	status = ocrdma_mbx_create_qp(qp, attrs, ureq.enable_dpp_cq,
+					ureq.dpp_cq_id,
+					&dpp_offset, &dpp_credit_lmt);
+	if (status)
+		goto mbx_err;
+
+	/* user space QP's wr_id table are managed in library */
+	if (udata == NULL) {
+		status = ocrdma_alloc_wr_id_tbl(qp);
+		if (status)
+			goto map_err;
+	}
+
+	status = ocrdma_add_qpn_map(dev, qp);
+	if (status)
+		goto map_err;
+	ocrdma_set_qp_db(dev, qp, pd);
+	if (udata) {
+		status = ocrdma_copy_qp_uresp(qp, udata, dpp_offset,
+					      dpp_credit_lmt,
+					      (attrs->srq != NULL));
+		if (status)
+			goto cpy_err;
+	}
+	ocrdma_store_gsi_qp_cq(dev, attrs);
+	qp->ibqp.qp_num = qp->id;
+	mutex_unlock(&dev->dev_lock);
+	return &qp->ibqp;
+
+cpy_err:
+	ocrdma_del_qpn_map(dev, qp);
+map_err:
+	ocrdma_mbx_destroy_qp(dev, qp);
+mbx_err:
+	mutex_unlock(&dev->dev_lock);
+	kfree(qp->wqe_wr_id_tbl);
+	kfree(qp->rqe_wr_id_tbl);
+	kfree(qp);
+	pr_err("%s(%d) error=%d\n", __func__, dev->id, status);
+gen_err:
+	return ERR_PTR(status);
+}
+
+int _ocrdma_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+		      int attr_mask)
+{
+	int status = 0;
+	struct ocrdma_qp *qp;
+	struct ocrdma_dev *dev;
+	enum ib_qp_state old_qps;
+
+	qp = get_ocrdma_qp(ibqp);
+	dev = qp->dev;
+	if (attr_mask & IB_QP_STATE)
+		status = ocrdma_qp_state_change(qp, attr->qp_state, &old_qps);
+	/* if new and previous states are same hw doesn't need to
+	 * know about it.
+	 */
+	if (status < 0)
+		return status;
+	status = ocrdma_mbx_modify_qp(dev, qp, attr, attr_mask);
+
+	return status;
+}
+
+int ocrdma_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+		     int attr_mask, struct ib_udata *udata)
+{
+	unsigned long flags;
+	int status = -EINVAL;
+	struct ocrdma_qp *qp;
+	struct ocrdma_dev *dev;
+	enum ib_qp_state old_qps, new_qps;
+
+	qp = get_ocrdma_qp(ibqp);
+	dev = qp->dev;
+
+	/* syncronize with multiple context trying to change, retrive qps */
+	mutex_lock(&dev->dev_lock);
+	/* syncronize with wqe, rqe posting and cqe processing contexts */
+	spin_lock_irqsave(&qp->q_lock, flags);
+	old_qps = get_ibqp_state(qp->state);
+	if (attr_mask & IB_QP_STATE)
+		new_qps = attr->qp_state;
+	else
+		new_qps = old_qps;
+	spin_unlock_irqrestore(&qp->q_lock, flags);
+
+	if (!ib_modify_qp_is_ok(old_qps, new_qps, ibqp->qp_type, attr_mask,
+				IB_LINK_LAYER_ETHERNET)) {
+		pr_err("%s(%d) invalid attribute mask=0x%x specified for\n"
+		       "qpn=0x%x of type=0x%x old_qps=0x%x, new_qps=0x%x\n",
+		       __func__, dev->id, attr_mask, qp->id, ibqp->qp_type,
+		       old_qps, new_qps);
+		goto param_err;
+	}
+
+	status = _ocrdma_modify_qp(ibqp, attr, attr_mask);
+	if (status > 0)
+		status = 0;
+param_err:
+	mutex_unlock(&dev->dev_lock);
+	return status;
+}
+
+static enum ib_mtu ocrdma_mtu_int_to_enum(u16 mtu)
+{
+	switch (mtu) {
+	case 256:
+		return IB_MTU_256;
+	case 512:
+		return IB_MTU_512;
+	case 1024:
+		return IB_MTU_1024;
+	case 2048:
+		return IB_MTU_2048;
+	case 4096:
+		return IB_MTU_4096;
+	default:
+		return IB_MTU_1024;
+	}
+}
+
+static int ocrdma_to_ib_qp_acc_flags(int qp_cap_flags)
+{
+	int ib_qp_acc_flags = 0;
+
+	if (qp_cap_flags & OCRDMA_QP_INB_WR)
+		ib_qp_acc_flags |= IB_ACCESS_REMOTE_WRITE;
+	if (qp_cap_flags & OCRDMA_QP_INB_RD)
+		ib_qp_acc_flags |= IB_ACCESS_LOCAL_WRITE;
+	return ib_qp_acc_flags;
+}
+
+int ocrdma_query_qp(struct ib_qp *ibqp,
+		    struct ib_qp_attr *qp_attr,
+		    int attr_mask, struct ib_qp_init_attr *qp_init_attr)
+{
+	int status;
+	u32 qp_state;
+	struct ocrdma_qp_params params;
+	struct ocrdma_qp *qp = get_ocrdma_qp(ibqp);
+	struct ocrdma_dev *dev = qp->dev;
+
+	memset(&params, 0, sizeof(params));
+	mutex_lock(&dev->dev_lock);
+	status = ocrdma_mbx_query_qp(dev, qp, &params);
+	mutex_unlock(&dev->dev_lock);
+	if (status)
+		goto mbx_err;
+	qp_attr->qp_state = get_ibqp_state(IB_QPS_INIT);
+	qp_attr->cur_qp_state = get_ibqp_state(IB_QPS_INIT);
+	qp_attr->path_mtu =
+		ocrdma_mtu_int_to_enum(params.path_mtu_pkey_indx &
+				OCRDMA_QP_PARAMS_PATH_MTU_MASK) >>
+				OCRDMA_QP_PARAMS_PATH_MTU_SHIFT;
+	qp_attr->path_mig_state = IB_MIG_MIGRATED;
+	qp_attr->rq_psn = params.hop_lmt_rq_psn & OCRDMA_QP_PARAMS_RQ_PSN_MASK;
+	qp_attr->sq_psn = params.tclass_sq_psn & OCRDMA_QP_PARAMS_SQ_PSN_MASK;
+	qp_attr->dest_qp_num =
+	    params.ack_to_rnr_rtc_dest_qpn & OCRDMA_QP_PARAMS_DEST_QPN_MASK;
+
+	qp_attr->qp_access_flags = ocrdma_to_ib_qp_acc_flags(qp->cap_flags);
+	qp_attr->cap.max_send_wr = qp->sq.max_cnt - 1;
+	qp_attr->cap.max_recv_wr = qp->rq.max_cnt - 1;
+	qp_attr->cap.max_send_sge = qp->sq.max_sges;
+	qp_attr->cap.max_recv_sge = qp->rq.max_sges;
+	qp_attr->cap.max_inline_data = qp->max_inline_data;
+	qp_init_attr->cap = qp_attr->cap;
+	memcpy(&qp_attr->ah_attr.grh.dgid, &params.dgid[0],
+	       sizeof(params.dgid));
+	qp_attr->ah_attr.grh.flow_label = params.rnt_rc_sl_fl &
+	    OCRDMA_QP_PARAMS_FLOW_LABEL_MASK;
+	qp_attr->ah_attr.grh.sgid_index = qp->sgid_idx;
+	qp_attr->ah_attr.grh.hop_limit = (params.hop_lmt_rq_psn &
+					  OCRDMA_QP_PARAMS_HOP_LMT_MASK) >>
+						OCRDMA_QP_PARAMS_HOP_LMT_SHIFT;
+	qp_attr->ah_attr.grh.traffic_class = (params.tclass_sq_psn &
+					      OCRDMA_QP_PARAMS_TCLASS_MASK) >>
+						OCRDMA_QP_PARAMS_TCLASS_SHIFT;
+
+	qp_attr->ah_attr.ah_flags = IB_AH_GRH;
+	qp_attr->ah_attr.port_num = 1;
+	qp_attr->ah_attr.sl = (params.rnt_rc_sl_fl &
+			       OCRDMA_QP_PARAMS_SL_MASK) >>
+				OCRDMA_QP_PARAMS_SL_SHIFT;
+	qp_attr->timeout = (params.ack_to_rnr_rtc_dest_qpn &
+			    OCRDMA_QP_PARAMS_ACK_TIMEOUT_MASK) >>
+				OCRDMA_QP_PARAMS_ACK_TIMEOUT_SHIFT;
+	qp_attr->rnr_retry = (params.ack_to_rnr_rtc_dest_qpn &
+			      OCRDMA_QP_PARAMS_RNR_RETRY_CNT_MASK) >>
+				OCRDMA_QP_PARAMS_RNR_RETRY_CNT_SHIFT;
+	qp_attr->retry_cnt =
+	    (params.rnt_rc_sl_fl & OCRDMA_QP_PARAMS_RETRY_CNT_MASK) >>
+		OCRDMA_QP_PARAMS_RETRY_CNT_SHIFT;
+	qp_attr->min_rnr_timer = 0;
+	qp_attr->pkey_index = 0;
+	qp_attr->port_num = 1;
+	qp_attr->ah_attr.src_path_bits = 0;
+	qp_attr->ah_attr.static_rate = 0;
+	qp_attr->alt_pkey_index = 0;
+	qp_attr->alt_port_num = 0;
+	qp_attr->alt_timeout = 0;
+	memset(&qp_attr->alt_ah_attr, 0, sizeof(qp_attr->alt_ah_attr));
+	qp_state = (params.max_sge_recv_flags & OCRDMA_QP_PARAMS_STATE_MASK) >>
+		    OCRDMA_QP_PARAMS_STATE_SHIFT;
+	qp_attr->sq_draining = (qp_state == OCRDMA_QPS_SQ_DRAINING) ? 1 : 0;
+	qp_attr->max_dest_rd_atomic =
+	    params.max_ord_ird >> OCRDMA_QP_PARAMS_MAX_ORD_SHIFT;
+	qp_attr->max_rd_atomic =
+	    params.max_ord_ird & OCRDMA_QP_PARAMS_MAX_IRD_MASK;
+	qp_attr->en_sqd_async_notify = (params.max_sge_recv_flags &
+				OCRDMA_QP_PARAMS_FLAGS_SQD_ASYNC) ? 1 : 0;
+mbx_err:
+	return status;
+}
+
+static void ocrdma_srq_toggle_bit(struct ocrdma_srq *srq, int idx)
+{
+	int i = idx / 32;
+	unsigned int mask = (1 << (idx % 32));
+
+	if (srq->idx_bit_fields[i] & mask)
+		srq->idx_bit_fields[i] &= ~mask;
+	else
+		srq->idx_bit_fields[i] |= mask;
+}
+
+static int ocrdma_hwq_free_cnt(struct ocrdma_qp_hwq_info *q)
+{
+	return ((q->max_wqe_idx - q->head) + q->tail) % q->max_cnt;
+}
+
+static int is_hw_sq_empty(struct ocrdma_qp *qp)
+{
+	return (qp->sq.tail == qp->sq.head);
+}
+
+static int is_hw_rq_empty(struct ocrdma_qp *qp)
+{
+	return (qp->rq.tail == qp->rq.head);
+}
+
+static void *ocrdma_hwq_head(struct ocrdma_qp_hwq_info *q)
+{
+	return q->va + (q->head * q->entry_size);
+}
+
+static void *ocrdma_hwq_head_from_idx(struct ocrdma_qp_hwq_info *q,
+				      u32 idx)
+{
+	return q->va + (idx * q->entry_size);
+}
+
+static void ocrdma_hwq_inc_head(struct ocrdma_qp_hwq_info *q)
+{
+	q->head = (q->head + 1) & q->max_wqe_idx;
+}
+
+static void ocrdma_hwq_inc_tail(struct ocrdma_qp_hwq_info *q)
+{
+	q->tail = (q->tail + 1) & q->max_wqe_idx;
+}
+
+/* discard the cqe for a given QP */
+static void ocrdma_discard_cqes(struct ocrdma_qp *qp, struct ocrdma_cq *cq)
+{
+	unsigned long cq_flags;
+	unsigned long flags;
+	int discard_cnt = 0;
+	u32 cur_getp, stop_getp;
+	struct ocrdma_cqe *cqe;
+	u32 qpn = 0, wqe_idx = 0;
+
+	spin_lock_irqsave(&cq->cq_lock, cq_flags);
+
+	/* traverse through the CQEs in the hw CQ,
+	 * find the matching CQE for a given qp,
+	 * mark the matching one discarded by clearing qpn.
+	 * ring the doorbell in the poll_cq() as
+	 * we don't complete out of order cqe.
+	 */
+
+	cur_getp = cq->getp;
+	/* find upto when do we reap the cq. */
+	stop_getp = cur_getp;
+	do {
+		if (is_hw_sq_empty(qp) && (!qp->srq && is_hw_rq_empty(qp)))
+			break;
+
+		cqe = cq->va + cur_getp;
+		/* if (a) done reaping whole hw cq, or
+		 *    (b) qp_xq becomes empty.
+		 * then exit
+		 */
+		qpn = cqe->cmn.qpn & OCRDMA_CQE_QPN_MASK;
+		/* if previously discarded cqe found, skip that too. */
+		/* check for matching qp */
+		if (qpn == 0 || qpn != qp->id)
+			goto skip_cqe;
+
+		if (is_cqe_for_sq(cqe)) {
+			ocrdma_hwq_inc_tail(&qp->sq);
+		} else {
+			if (qp->srq) {
+				wqe_idx = (le32_to_cpu(cqe->rq.buftag_qpn) >>
+					OCRDMA_CQE_BUFTAG_SHIFT) &
+					qp->srq->rq.max_wqe_idx;
+				if (wqe_idx < 1)
+					BUG();
+				spin_lock_irqsave(&qp->srq->q_lock, flags);
+				ocrdma_hwq_inc_tail(&qp->srq->rq);
+				ocrdma_srq_toggle_bit(qp->srq, wqe_idx - 1);
+				spin_unlock_irqrestore(&qp->srq->q_lock, flags);
+
+			} else {
+				ocrdma_hwq_inc_tail(&qp->rq);
+			}
+		}
+		/* mark cqe discarded so that it is not picked up later
+		 * in the poll_cq().
+		 */
+		discard_cnt += 1;
+		cqe->cmn.qpn = 0;
+skip_cqe:
+		cur_getp = (cur_getp + 1) % cq->max_hw_cqe;
+	} while (cur_getp != stop_getp);
+	spin_unlock_irqrestore(&cq->cq_lock, cq_flags);
+}
+
+void ocrdma_del_flush_qp(struct ocrdma_qp *qp)
+{
+	int found = false;
+	unsigned long flags;
+	struct ocrdma_dev *dev = qp->dev;
+	/* sync with any active CQ poll */
+
+	spin_lock_irqsave(&dev->flush_q_lock, flags);
+	found = ocrdma_is_qp_in_sq_flushlist(qp->sq_cq, qp);
+	if (found)
+		list_del(&qp->sq_entry);
+	if (!qp->srq) {
+		found = ocrdma_is_qp_in_rq_flushlist(qp->rq_cq, qp);
+		if (found)
+			list_del(&qp->rq_entry);
+	}
+	spin_unlock_irqrestore(&dev->flush_q_lock, flags);
+}
+
+int ocrdma_destroy_qp(struct ib_qp *ibqp)
+{
+	int status;
+	struct ocrdma_pd *pd;
+	struct ocrdma_qp *qp;
+	struct ocrdma_dev *dev;
+	struct ib_qp_attr attrs;
+	int attr_mask = IB_QP_STATE;
+	unsigned long flags;
+
+	qp = get_ocrdma_qp(ibqp);
+	dev = qp->dev;
+
+	attrs.qp_state = IB_QPS_ERR;
+	pd = qp->pd;
+
+	/* change the QP state to ERROR */
+	_ocrdma_modify_qp(ibqp, &attrs, attr_mask);
+
+	/* ensure that CQEs for newly created QP (whose id may be same with
+	 * one which just getting destroyed are same), dont get
+	 * discarded until the old CQEs are discarded.
+	 */
+	mutex_lock(&dev->dev_lock);
+	status = ocrdma_mbx_destroy_qp(dev, qp);
+
+	/*
+	 * acquire CQ lock while destroy is in progress, in order to
+	 * protect against proessing in-flight CQEs for this QP.
+	 */
+	spin_lock_irqsave(&qp->sq_cq->cq_lock, flags);
+	if (qp->rq_cq && (qp->rq_cq != qp->sq_cq))
+		spin_lock(&qp->rq_cq->cq_lock);
+
+	ocrdma_del_qpn_map(dev, qp);
+
+	if (qp->rq_cq && (qp->rq_cq != qp->sq_cq))
+		spin_unlock(&qp->rq_cq->cq_lock);
+	spin_unlock_irqrestore(&qp->sq_cq->cq_lock, flags);
+
+	if (!pd->uctx) {
+		ocrdma_discard_cqes(qp, qp->sq_cq);
+		ocrdma_discard_cqes(qp, qp->rq_cq);
+	}
+	mutex_unlock(&dev->dev_lock);
+
+	if (pd->uctx) {
+		ocrdma_del_mmap(pd->uctx, (u64) qp->sq.pa,
+				PAGE_ALIGN(qp->sq.len));
+		if (!qp->srq)
+			ocrdma_del_mmap(pd->uctx, (u64) qp->rq.pa,
+					PAGE_ALIGN(qp->rq.len));
+	}
+
+	ocrdma_del_flush_qp(qp);
+
+	kfree(qp->wqe_wr_id_tbl);
+	kfree(qp->rqe_wr_id_tbl);
+	kfree(qp);
+	return status;
+}
+
+static int ocrdma_copy_srq_uresp(struct ocrdma_dev *dev, struct ocrdma_srq *srq,
+				struct ib_udata *udata)
+{
+	int status;
+	struct ocrdma_create_srq_uresp uresp;
+
+	memset(&uresp, 0, sizeof(uresp));
+	uresp.rq_dbid = srq->rq.dbid;
+	uresp.num_rq_pages = 1;
+	uresp.rq_page_addr[0] = virt_to_phys(srq->rq.va);
+	uresp.rq_page_size = srq->rq.len;
+	uresp.db_page_addr = dev->nic_info.unmapped_db +
+	    (srq->pd->id * dev->nic_info.db_page_size);
+	uresp.db_page_size = dev->nic_info.db_page_size;
+	uresp.num_rqe_allocated = srq->rq.max_cnt;
+	if (ocrdma_get_asic_type(dev) == OCRDMA_ASIC_GEN_SKH_R) {
+		uresp.db_rq_offset = OCRDMA_DB_GEN2_RQ_OFFSET;
+		uresp.db_shift = 24;
+	} else {
+		uresp.db_rq_offset = OCRDMA_DB_RQ_OFFSET;
+		uresp.db_shift = 16;
+	}
+
+	status = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
+	if (status)
+		return status;
+	status = ocrdma_add_mmap(srq->pd->uctx, uresp.rq_page_addr[0],
+				 uresp.rq_page_size);
+	if (status)
+		return status;
+	return status;
+}
+
+struct ib_srq *ocrdma_create_srq(struct ib_pd *ibpd,
+				 struct ib_srq_init_attr *init_attr,
+				 struct ib_udata *udata)
+{
+	int status = -ENOMEM;
+	struct ocrdma_pd *pd = get_ocrdma_pd(ibpd);
+	struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device);
+	struct ocrdma_srq *srq;
+
+	if (init_attr->attr.max_sge > dev->attr.max_recv_sge)
+		return ERR_PTR(-EINVAL);
+	if (init_attr->attr.max_wr > dev->attr.max_rqe)
+		return ERR_PTR(-EINVAL);
+
+	srq = kzalloc(sizeof(*srq), GFP_KERNEL);
+	if (!srq)
+		return ERR_PTR(status);
+
+	spin_lock_init(&srq->q_lock);
+	srq->pd = pd;
+	srq->db = dev->nic_info.db + (pd->id * dev->nic_info.db_page_size);
+	status = ocrdma_mbx_create_srq(dev, srq, init_attr, pd);
+	if (status)
+		goto err;
+
+	if (udata == NULL) {
+		srq->rqe_wr_id_tbl = kzalloc(sizeof(u64) * srq->rq.max_cnt,
+			    GFP_KERNEL);
+		if (srq->rqe_wr_id_tbl == NULL)
+			goto arm_err;
+
+		srq->bit_fields_len = (srq->rq.max_cnt / 32) +
+		    (srq->rq.max_cnt % 32 ? 1 : 0);
+		srq->idx_bit_fields =
+		    kmalloc(srq->bit_fields_len * sizeof(u32), GFP_KERNEL);
+		if (srq->idx_bit_fields == NULL)
+			goto arm_err;
+		memset(srq->idx_bit_fields, 0xff,
+		       srq->bit_fields_len * sizeof(u32));
+	}
+
+	if (init_attr->attr.srq_limit) {
+		status = ocrdma_mbx_modify_srq(srq, &init_attr->attr);
+		if (status)
+			goto arm_err;
+	}
+
+	if (udata) {
+		status = ocrdma_copy_srq_uresp(dev, srq, udata);
+		if (status)
+			goto arm_err;
+	}
+
+	return &srq->ibsrq;
+
+arm_err:
+	ocrdma_mbx_destroy_srq(dev, srq);
+err:
+	kfree(srq->rqe_wr_id_tbl);
+	kfree(srq->idx_bit_fields);
+	kfree(srq);
+	return ERR_PTR(status);
+}
+
+int ocrdma_modify_srq(struct ib_srq *ibsrq,
+		      struct ib_srq_attr *srq_attr,
+		      enum ib_srq_attr_mask srq_attr_mask,
+		      struct ib_udata *udata)
+{
+	int status = 0;
+	struct ocrdma_srq *srq;
+
+	srq = get_ocrdma_srq(ibsrq);
+	if (srq_attr_mask & IB_SRQ_MAX_WR)
+		status = -EINVAL;
+	else
+		status = ocrdma_mbx_modify_srq(srq, srq_attr);
+	return status;
+}
+
+int ocrdma_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr)
+{
+	int status;
+	struct ocrdma_srq *srq;
+
+	srq = get_ocrdma_srq(ibsrq);
+	status = ocrdma_mbx_query_srq(srq, srq_attr);
+	return status;
+}
+
+int ocrdma_destroy_srq(struct ib_srq *ibsrq)
+{
+	int status;
+	struct ocrdma_srq *srq;
+	struct ocrdma_dev *dev = get_ocrdma_dev(ibsrq->device);
+
+	srq = get_ocrdma_srq(ibsrq);
+
+	status = ocrdma_mbx_destroy_srq(dev, srq);
+
+	if (srq->pd->uctx)
+		ocrdma_del_mmap(srq->pd->uctx, (u64) srq->rq.pa,
+				PAGE_ALIGN(srq->rq.len));
+
+	kfree(srq->idx_bit_fields);
+	kfree(srq->rqe_wr_id_tbl);
+	kfree(srq);
+	return status;
+}
+
+/* unprivileged verbs and their support functions. */
+static void ocrdma_build_ud_hdr(struct ocrdma_qp *qp,
+				struct ocrdma_hdr_wqe *hdr,
+				struct ib_send_wr *wr)
+{
+	struct ocrdma_ewqe_ud_hdr *ud_hdr =
+		(struct ocrdma_ewqe_ud_hdr *)(hdr + 1);
+	struct ocrdma_ah *ah = get_ocrdma_ah(wr->wr.ud.ah);
+
+	ud_hdr->rsvd_dest_qpn = wr->wr.ud.remote_qpn;
+	if (qp->qp_type == IB_QPT_GSI)
+		ud_hdr->qkey = qp->qkey;
+	else
+		ud_hdr->qkey = wr->wr.ud.remote_qkey;
+	ud_hdr->rsvd_ahid = ah->id;
+}
+
+static void ocrdma_build_sges(struct ocrdma_hdr_wqe *hdr,
+			      struct ocrdma_sge *sge, int num_sge,
+			      struct ib_sge *sg_list)
+{
+	int i;
+
+	for (i = 0; i < num_sge; i++) {
+		sge[i].lrkey = sg_list[i].lkey;
+		sge[i].addr_lo = sg_list[i].addr;
+		sge[i].addr_hi = upper_32_bits(sg_list[i].addr);
+		sge[i].len = sg_list[i].length;
+		hdr->total_len += sg_list[i].length;
+	}
+	if (num_sge == 0)
+		memset(sge, 0, sizeof(*sge));
+}
+
+static inline uint32_t ocrdma_sglist_len(struct ib_sge *sg_list, int num_sge)
+{
+	uint32_t total_len = 0, i;
+
+	for (i = 0; i < num_sge; i++)
+		total_len += sg_list[i].length;
+	return total_len;
+}
+
+
+static int ocrdma_build_inline_sges(struct ocrdma_qp *qp,
+				    struct ocrdma_hdr_wqe *hdr,
+				    struct ocrdma_sge *sge,
+				    struct ib_send_wr *wr, u32 wqe_size)
+{
+	int i;
+	char *dpp_addr;
+
+	if (wr->send_flags & IB_SEND_INLINE && qp->qp_type != IB_QPT_UD) {
+		hdr->total_len = ocrdma_sglist_len(wr->sg_list, wr->num_sge);
+		if (unlikely(hdr->total_len > qp->max_inline_data)) {
+			pr_err("%s() supported_len=0x%x,\n"
+			       " unsupported len req=0x%x\n", __func__,
+				qp->max_inline_data, hdr->total_len);
+			return -EINVAL;
+		}
+		dpp_addr = (char *)sge;
+		for (i = 0; i < wr->num_sge; i++) {
+			memcpy(dpp_addr,
+			       (void *)(unsigned long)wr->sg_list[i].addr,
+			       wr->sg_list[i].length);
+			dpp_addr += wr->sg_list[i].length;
+		}
+
+		wqe_size += roundup(hdr->total_len, OCRDMA_WQE_ALIGN_BYTES);
+		if (0 == hdr->total_len)
+			wqe_size += sizeof(struct ocrdma_sge);
+		hdr->cw |= (OCRDMA_TYPE_INLINE << OCRDMA_WQE_TYPE_SHIFT);
+	} else {
+		ocrdma_build_sges(hdr, sge, wr->num_sge, wr->sg_list);
+		if (wr->num_sge)
+			wqe_size += (wr->num_sge * sizeof(struct ocrdma_sge));
+		else
+			wqe_size += sizeof(struct ocrdma_sge);
+		hdr->cw |= (OCRDMA_TYPE_LKEY << OCRDMA_WQE_TYPE_SHIFT);
+	}
+	hdr->cw |= ((wqe_size / OCRDMA_WQE_STRIDE) << OCRDMA_WQE_SIZE_SHIFT);
+	return 0;
+}
+
+static int ocrdma_build_send(struct ocrdma_qp *qp, struct ocrdma_hdr_wqe *hdr,
+			     struct ib_send_wr *wr)
+{
+	int status;
+	struct ocrdma_sge *sge;
+	u32 wqe_size = sizeof(*hdr);
+
+	if (qp->qp_type == IB_QPT_UD || qp->qp_type == IB_QPT_GSI) {
+		ocrdma_build_ud_hdr(qp, hdr, wr);
+		sge = (struct ocrdma_sge *)(hdr + 2);
+		wqe_size += sizeof(struct ocrdma_ewqe_ud_hdr);
+	} else {
+		sge = (struct ocrdma_sge *)(hdr + 1);
+	}
+
+	status = ocrdma_build_inline_sges(qp, hdr, sge, wr, wqe_size);
+	return status;
+}
+
+static int ocrdma_build_write(struct ocrdma_qp *qp, struct ocrdma_hdr_wqe *hdr,
+			      struct ib_send_wr *wr)
+{
+	int status;
+	struct ocrdma_sge *ext_rw = (struct ocrdma_sge *)(hdr + 1);
+	struct ocrdma_sge *sge = ext_rw + 1;
+	u32 wqe_size = sizeof(*hdr) + sizeof(*ext_rw);
+
+	status = ocrdma_build_inline_sges(qp, hdr, sge, wr, wqe_size);
+	if (status)
+		return status;
+	ext_rw->addr_lo = wr->wr.rdma.remote_addr;
+	ext_rw->addr_hi = upper_32_bits(wr->wr.rdma.remote_addr);
+	ext_rw->lrkey = wr->wr.rdma.rkey;
+	ext_rw->len = hdr->total_len;
+	return 0;
+}
+
+static void ocrdma_build_read(struct ocrdma_qp *qp, struct ocrdma_hdr_wqe *hdr,
+			      struct ib_send_wr *wr)
+{
+	struct ocrdma_sge *ext_rw = (struct ocrdma_sge *)(hdr + 1);
+	struct ocrdma_sge *sge = ext_rw + 1;
+	u32 wqe_size = ((wr->num_sge + 1) * sizeof(struct ocrdma_sge)) +
+	    sizeof(struct ocrdma_hdr_wqe);
+
+	ocrdma_build_sges(hdr, sge, wr->num_sge, wr->sg_list);
+	hdr->cw |= ((wqe_size / OCRDMA_WQE_STRIDE) << OCRDMA_WQE_SIZE_SHIFT);
+	hdr->cw |= (OCRDMA_READ << OCRDMA_WQE_OPCODE_SHIFT);
+	hdr->cw |= (OCRDMA_TYPE_LKEY << OCRDMA_WQE_TYPE_SHIFT);
+
+	ext_rw->addr_lo = wr->wr.rdma.remote_addr;
+	ext_rw->addr_hi = upper_32_bits(wr->wr.rdma.remote_addr);
+	ext_rw->lrkey = wr->wr.rdma.rkey;
+	ext_rw->len = hdr->total_len;
+}
+
+static void build_frmr_pbes(struct ib_send_wr *wr, struct ocrdma_pbl *pbl_tbl,
+			    struct ocrdma_hw_mr *hwmr)
+{
+	int i;
+	u64 buf_addr = 0;
+	int num_pbes;
+	struct ocrdma_pbe *pbe;
+
+	pbe = (struct ocrdma_pbe *)pbl_tbl->va;
+	num_pbes = 0;
+
+	/* go through the OS phy regions & fill hw pbe entries into pbls. */
+	for (i = 0; i < wr->wr.fast_reg.page_list_len; i++) {
+		/* number of pbes can be more for one OS buf, when
+		 * buffers are of different sizes.
+		 * split the ib_buf to one or more pbes.
+		 */
+		buf_addr = wr->wr.fast_reg.page_list->page_list[i];
+		pbe->pa_lo = cpu_to_le32((u32) (buf_addr & PAGE_MASK));
+		pbe->pa_hi = cpu_to_le32((u32) upper_32_bits(buf_addr));
+		num_pbes += 1;
+		pbe++;
+
+		/* if the pbl is full storing the pbes,
+		 * move to next pbl.
+		*/
+		if (num_pbes == (hwmr->pbl_size/sizeof(u64))) {
+			pbl_tbl++;
+			pbe = (struct ocrdma_pbe *)pbl_tbl->va;
+		}
+	}
+	return;
+}
+
+static int get_encoded_page_size(int pg_sz)
+{
+	/* Max size is 256M 4096 << 16 */
+	int i = 0;
+	for (; i < 17; i++)
+		if (pg_sz == (4096 << i))
+			break;
+	return i;
+}
+
+
+static int ocrdma_build_fr(struct ocrdma_qp *qp, struct ocrdma_hdr_wqe *hdr,
+			   struct ib_send_wr *wr)
+{
+	u64 fbo;
+	struct ocrdma_ewqe_fr *fast_reg = (struct ocrdma_ewqe_fr *)(hdr + 1);
+	struct ocrdma_mr *mr;
+	u32 wqe_size = sizeof(*fast_reg) + sizeof(*hdr);
+
+	wqe_size = roundup(wqe_size, OCRDMA_WQE_ALIGN_BYTES);
+
+	if (wr->wr.fast_reg.page_list_len > qp->dev->attr.max_pages_per_frmr)
+		return -EINVAL;
+
+	hdr->cw |= (OCRDMA_FR_MR << OCRDMA_WQE_OPCODE_SHIFT);
+	hdr->cw |= ((wqe_size / OCRDMA_WQE_STRIDE) << OCRDMA_WQE_SIZE_SHIFT);
+
+	if (wr->wr.fast_reg.page_list_len == 0)
+		BUG();
+	if (wr->wr.fast_reg.access_flags & IB_ACCESS_LOCAL_WRITE)
+		hdr->rsvd_lkey_flags |= OCRDMA_LKEY_FLAG_LOCAL_WR;
+	if (wr->wr.fast_reg.access_flags & IB_ACCESS_REMOTE_WRITE)
+		hdr->rsvd_lkey_flags |= OCRDMA_LKEY_FLAG_REMOTE_WR;
+	if (wr->wr.fast_reg.access_flags & IB_ACCESS_REMOTE_READ)
+		hdr->rsvd_lkey_flags |= OCRDMA_LKEY_FLAG_REMOTE_RD;
+	hdr->lkey = wr->wr.fast_reg.rkey;
+	hdr->total_len = wr->wr.fast_reg.length;
+
+	fbo = wr->wr.fast_reg.iova_start -
+	    (wr->wr.fast_reg.page_list->page_list[0] & PAGE_MASK);
+
+	fast_reg->va_hi = upper_32_bits(wr->wr.fast_reg.iova_start);
+	fast_reg->va_lo = (u32) (wr->wr.fast_reg.iova_start & 0xffffffff);
+	fast_reg->fbo_hi = upper_32_bits(fbo);
+	fast_reg->fbo_lo = (u32) fbo & 0xffffffff;
+	fast_reg->num_sges = wr->wr.fast_reg.page_list_len;
+	fast_reg->size_sge =
+		get_encoded_page_size(1 << wr->wr.fast_reg.page_shift);
+	mr = (struct ocrdma_mr *) (unsigned long)
+		qp->dev->stag_arr[(hdr->lkey >> 8) & (OCRDMA_MAX_STAG - 1)];
+	build_frmr_pbes(wr, mr->hwmr.pbl_table, &mr->hwmr);
+	return 0;
+}
+
+static void ocrdma_ring_sq_db(struct ocrdma_qp *qp)
+{
+	u32 val = qp->sq.dbid | (1 << OCRDMA_DB_SQ_SHIFT);
+
+	iowrite32(val, qp->sq_db);
+}
+
+int ocrdma_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+		     struct ib_send_wr **bad_wr)
+{
+	int status = 0;
+	struct ocrdma_qp *qp = get_ocrdma_qp(ibqp);
+	struct ocrdma_hdr_wqe *hdr;
+	unsigned long flags;
+
+	spin_lock_irqsave(&qp->q_lock, flags);
+	if (qp->state != OCRDMA_QPS_RTS && qp->state != OCRDMA_QPS_SQD) {
+		spin_unlock_irqrestore(&qp->q_lock, flags);
+		*bad_wr = wr;
+		return -EINVAL;
+	}
+
+	while (wr) {
+		if (qp->qp_type == IB_QPT_UD &&
+		    (wr->opcode != IB_WR_SEND &&
+		     wr->opcode != IB_WR_SEND_WITH_IMM)) {
+			*bad_wr = wr;
+			status = -EINVAL;
+			break;
+		}
+		if (ocrdma_hwq_free_cnt(&qp->sq) == 0 ||
+		    wr->num_sge > qp->sq.max_sges) {
+			*bad_wr = wr;
+			status = -ENOMEM;
+			break;
+		}
+		hdr = ocrdma_hwq_head(&qp->sq);
+		hdr->cw = 0;
+		if (wr->send_flags & IB_SEND_SIGNALED || qp->signaled)
+			hdr->cw |= (OCRDMA_FLAG_SIG << OCRDMA_WQE_FLAGS_SHIFT);
+		if (wr->send_flags & IB_SEND_FENCE)
+			hdr->cw |=
+			    (OCRDMA_FLAG_FENCE_L << OCRDMA_WQE_FLAGS_SHIFT);
+		if (wr->send_flags & IB_SEND_SOLICITED)
+			hdr->cw |=
+			    (OCRDMA_FLAG_SOLICIT << OCRDMA_WQE_FLAGS_SHIFT);
+		hdr->total_len = 0;
+		switch (wr->opcode) {
+		case IB_WR_SEND_WITH_IMM:
+			hdr->cw |= (OCRDMA_FLAG_IMM << OCRDMA_WQE_FLAGS_SHIFT);
+			hdr->immdt = ntohl(wr->ex.imm_data);
+		case IB_WR_SEND:
+			hdr->cw |= (OCRDMA_SEND << OCRDMA_WQE_OPCODE_SHIFT);
+			ocrdma_build_send(qp, hdr, wr);
+			break;
+		case IB_WR_SEND_WITH_INV:
+			hdr->cw |= (OCRDMA_FLAG_INV << OCRDMA_WQE_FLAGS_SHIFT);
+			hdr->cw |= (OCRDMA_SEND << OCRDMA_WQE_OPCODE_SHIFT);
+			hdr->lkey = wr->ex.invalidate_rkey;
+			status = ocrdma_build_send(qp, hdr, wr);
+			break;
+		case IB_WR_RDMA_WRITE_WITH_IMM:
+			hdr->cw |= (OCRDMA_FLAG_IMM << OCRDMA_WQE_FLAGS_SHIFT);
+			hdr->immdt = ntohl(wr->ex.imm_data);
+		case IB_WR_RDMA_WRITE:
+			hdr->cw |= (OCRDMA_WRITE << OCRDMA_WQE_OPCODE_SHIFT);
+			status = ocrdma_build_write(qp, hdr, wr);
+			break;
+		case IB_WR_RDMA_READ_WITH_INV:
+			hdr->cw |= (OCRDMA_FLAG_INV << OCRDMA_WQE_FLAGS_SHIFT);
+		case IB_WR_RDMA_READ:
+			ocrdma_build_read(qp, hdr, wr);
+			break;
+		case IB_WR_LOCAL_INV:
+			hdr->cw |=
+			    (OCRDMA_LKEY_INV << OCRDMA_WQE_OPCODE_SHIFT);
+			hdr->cw |= ((sizeof(struct ocrdma_hdr_wqe) +
+					sizeof(struct ocrdma_sge)) /
+				OCRDMA_WQE_STRIDE) << OCRDMA_WQE_SIZE_SHIFT;
+			hdr->lkey = wr->ex.invalidate_rkey;
+			break;
+		case IB_WR_FAST_REG_MR:
+			status = ocrdma_build_fr(qp, hdr, wr);
+			break;
+		default:
+			status = -EINVAL;
+			break;
+		}
+		if (status) {
+			*bad_wr = wr;
+			break;
+		}
+		if (wr->send_flags & IB_SEND_SIGNALED || qp->signaled)
+			qp->wqe_wr_id_tbl[qp->sq.head].signaled = 1;
+		else
+			qp->wqe_wr_id_tbl[qp->sq.head].signaled = 0;
+		qp->wqe_wr_id_tbl[qp->sq.head].wrid = wr->wr_id;
+		ocrdma_cpu_to_le32(hdr, ((hdr->cw >> OCRDMA_WQE_SIZE_SHIFT) &
+				   OCRDMA_WQE_SIZE_MASK) * OCRDMA_WQE_STRIDE);
+		/* make sure wqe is written before adapter can access it */
+		wmb();
+		/* inform hw to start processing it */
+		ocrdma_ring_sq_db(qp);
+
+		/* update pointer, counter for next wr */
+		ocrdma_hwq_inc_head(&qp->sq);
+		wr = wr->next;
+	}
+	spin_unlock_irqrestore(&qp->q_lock, flags);
+	return status;
+}
+
+static void ocrdma_ring_rq_db(struct ocrdma_qp *qp)
+{
+	u32 val = qp->rq.dbid | (1 << OCRDMA_DB_RQ_SHIFT);
+
+	iowrite32(val, qp->rq_db);
+}
+
+static void ocrdma_build_rqe(struct ocrdma_hdr_wqe *rqe, struct ib_recv_wr *wr,
+			     u16 tag)
+{
+	u32 wqe_size = 0;
+	struct ocrdma_sge *sge;
+	if (wr->num_sge)
+		wqe_size = (wr->num_sge * sizeof(*sge)) + sizeof(*rqe);
+	else
+		wqe_size = sizeof(*sge) + sizeof(*rqe);
+
+	rqe->cw = ((wqe_size / OCRDMA_WQE_STRIDE) <<
+				OCRDMA_WQE_SIZE_SHIFT);
+	rqe->cw |= (OCRDMA_FLAG_SIG << OCRDMA_WQE_FLAGS_SHIFT);
+	rqe->cw |= (OCRDMA_TYPE_LKEY << OCRDMA_WQE_TYPE_SHIFT);
+	rqe->total_len = 0;
+	rqe->rsvd_tag = tag;
+	sge = (struct ocrdma_sge *)(rqe + 1);
+	ocrdma_build_sges(rqe, sge, wr->num_sge, wr->sg_list);
+	ocrdma_cpu_to_le32(rqe, wqe_size);
+}
+
+int ocrdma_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+		     struct ib_recv_wr **bad_wr)
+{
+	int status = 0;
+	unsigned long flags;
+	struct ocrdma_qp *qp = get_ocrdma_qp(ibqp);
+	struct ocrdma_hdr_wqe *rqe;
+
+	spin_lock_irqsave(&qp->q_lock, flags);
+	if (qp->state == OCRDMA_QPS_RST || qp->state == OCRDMA_QPS_ERR) {
+		spin_unlock_irqrestore(&qp->q_lock, flags);
+		*bad_wr = wr;
+		return -EINVAL;
+	}
+	while (wr) {
+		if (ocrdma_hwq_free_cnt(&qp->rq) == 0 ||
+		    wr->num_sge > qp->rq.max_sges) {
+			*bad_wr = wr;
+			status = -ENOMEM;
+			break;
+		}
+		rqe = ocrdma_hwq_head(&qp->rq);
+		ocrdma_build_rqe(rqe, wr, 0);
+
+		qp->rqe_wr_id_tbl[qp->rq.head] = wr->wr_id;
+		/* make sure rqe is written before adapter can access it */
+		wmb();
+
+		/* inform hw to start processing it */
+		ocrdma_ring_rq_db(qp);
+
+		/* update pointer, counter for next wr */
+		ocrdma_hwq_inc_head(&qp->rq);
+		wr = wr->next;
+	}
+	spin_unlock_irqrestore(&qp->q_lock, flags);
+	return status;
+}
+
+/* cqe for srq's rqe can potentially arrive out of order.
+ * index gives the entry in the shadow table where to store
+ * the wr_id. tag/index is returned in cqe to reference back
+ * for a given rqe.
+ */
+static int ocrdma_srq_get_idx(struct ocrdma_srq *srq)
+{
+	int row = 0;
+	int indx = 0;
+
+	for (row = 0; row < srq->bit_fields_len; row++) {
+		if (srq->idx_bit_fields[row]) {
+			indx = ffs(srq->idx_bit_fields[row]);
+			indx = (row * 32) + (indx - 1);
+			if (indx >= srq->rq.max_cnt)
+				BUG();
+			ocrdma_srq_toggle_bit(srq, indx);
+			break;
+		}
+	}
+
+	if (row == srq->bit_fields_len)
+		BUG();
+	return indx + 1; /* Use from index 1 */
+}
+
+static void ocrdma_ring_srq_db(struct ocrdma_srq *srq)
+{
+	u32 val = srq->rq.dbid | (1 << 16);
+
+	iowrite32(val, srq->db + OCRDMA_DB_GEN2_SRQ_OFFSET);
+}
+
+int ocrdma_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
+			 struct ib_recv_wr **bad_wr)
+{
+	int status = 0;
+	unsigned long flags;
+	struct ocrdma_srq *srq;
+	struct ocrdma_hdr_wqe *rqe;
+	u16 tag;
+
+	srq = get_ocrdma_srq(ibsrq);
+
+	spin_lock_irqsave(&srq->q_lock, flags);
+	while (wr) {
+		if (ocrdma_hwq_free_cnt(&srq->rq) == 0 ||
+		    wr->num_sge > srq->rq.max_sges) {
+			status = -ENOMEM;
+			*bad_wr = wr;
+			break;
+		}
+		tag = ocrdma_srq_get_idx(srq);
+		rqe = ocrdma_hwq_head(&srq->rq);
+		ocrdma_build_rqe(rqe, wr, tag);
+
+		srq->rqe_wr_id_tbl[tag] = wr->wr_id;
+		/* make sure rqe is written before adapter can perform DMA */
+		wmb();
+		/* inform hw to start processing it */
+		ocrdma_ring_srq_db(srq);
+		/* update pointer, counter for next wr */
+		ocrdma_hwq_inc_head(&srq->rq);
+		wr = wr->next;
+	}
+	spin_unlock_irqrestore(&srq->q_lock, flags);
+	return status;
+}
+
+static enum ib_wc_status ocrdma_to_ibwc_err(u16 status)
+{
+	enum ib_wc_status ibwc_status;
+
+	switch (status) {
+	case OCRDMA_CQE_GENERAL_ERR:
+		ibwc_status = IB_WC_GENERAL_ERR;
+		break;
+	case OCRDMA_CQE_LOC_LEN_ERR:
+		ibwc_status = IB_WC_LOC_LEN_ERR;
+		break;
+	case OCRDMA_CQE_LOC_QP_OP_ERR:
+		ibwc_status = IB_WC_LOC_QP_OP_ERR;
+		break;
+	case OCRDMA_CQE_LOC_EEC_OP_ERR:
+		ibwc_status = IB_WC_LOC_EEC_OP_ERR;
+		break;
+	case OCRDMA_CQE_LOC_PROT_ERR:
+		ibwc_status = IB_WC_LOC_PROT_ERR;
+		break;
+	case OCRDMA_CQE_WR_FLUSH_ERR:
+		ibwc_status = IB_WC_WR_FLUSH_ERR;
+		break;
+	case OCRDMA_CQE_MW_BIND_ERR:
+		ibwc_status = IB_WC_MW_BIND_ERR;
+		break;
+	case OCRDMA_CQE_BAD_RESP_ERR:
+		ibwc_status = IB_WC_BAD_RESP_ERR;
+		break;
+	case OCRDMA_CQE_LOC_ACCESS_ERR:
+		ibwc_status = IB_WC_LOC_ACCESS_ERR;
+		break;
+	case OCRDMA_CQE_REM_INV_REQ_ERR:
+		ibwc_status = IB_WC_REM_INV_REQ_ERR;
+		break;
+	case OCRDMA_CQE_REM_ACCESS_ERR:
+		ibwc_status = IB_WC_REM_ACCESS_ERR;
+		break;
+	case OCRDMA_CQE_REM_OP_ERR:
+		ibwc_status = IB_WC_REM_OP_ERR;
+		break;
+	case OCRDMA_CQE_RETRY_EXC_ERR:
+		ibwc_status = IB_WC_RETRY_EXC_ERR;
+		break;
+	case OCRDMA_CQE_RNR_RETRY_EXC_ERR:
+		ibwc_status = IB_WC_RNR_RETRY_EXC_ERR;
+		break;
+	case OCRDMA_CQE_LOC_RDD_VIOL_ERR:
+		ibwc_status = IB_WC_LOC_RDD_VIOL_ERR;
+		break;
+	case OCRDMA_CQE_REM_INV_RD_REQ_ERR:
+		ibwc_status = IB_WC_REM_INV_RD_REQ_ERR;
+		break;
+	case OCRDMA_CQE_REM_ABORT_ERR:
+		ibwc_status = IB_WC_REM_ABORT_ERR;
+		break;
+	case OCRDMA_CQE_INV_EECN_ERR:
+		ibwc_status = IB_WC_INV_EECN_ERR;
+		break;
+	case OCRDMA_CQE_INV_EEC_STATE_ERR:
+		ibwc_status = IB_WC_INV_EEC_STATE_ERR;
+		break;
+	case OCRDMA_CQE_FATAL_ERR:
+		ibwc_status = IB_WC_FATAL_ERR;
+		break;
+	case OCRDMA_CQE_RESP_TIMEOUT_ERR:
+		ibwc_status = IB_WC_RESP_TIMEOUT_ERR;
+		break;
+	default:
+		ibwc_status = IB_WC_GENERAL_ERR;
+		break;
+	}
+	return ibwc_status;
+}
+
+static void ocrdma_update_wc(struct ocrdma_qp *qp, struct ib_wc *ibwc,
+		      u32 wqe_idx)
+{
+	struct ocrdma_hdr_wqe *hdr;
+	struct ocrdma_sge *rw;
+	int opcode;
+
+	hdr = ocrdma_hwq_head_from_idx(&qp->sq, wqe_idx);
+
+	ibwc->wr_id = qp->wqe_wr_id_tbl[wqe_idx].wrid;
+	/* Undo the hdr->cw swap */
+	opcode = le32_to_cpu(hdr->cw) & OCRDMA_WQE_OPCODE_MASK;
+	switch (opcode) {
+	case OCRDMA_WRITE:
+		ibwc->opcode = IB_WC_RDMA_WRITE;
+		break;
+	case OCRDMA_READ:
+		rw = (struct ocrdma_sge *)(hdr + 1);
+		ibwc->opcode = IB_WC_RDMA_READ;
+		ibwc->byte_len = rw->len;
+		break;
+	case OCRDMA_SEND:
+		ibwc->opcode = IB_WC_SEND;
+		break;
+	case OCRDMA_FR_MR:
+		ibwc->opcode = IB_WC_FAST_REG_MR;
+		break;
+	case OCRDMA_LKEY_INV:
+		ibwc->opcode = IB_WC_LOCAL_INV;
+		break;
+	default:
+		ibwc->status = IB_WC_GENERAL_ERR;
+		pr_err("%s() invalid opcode received = 0x%x\n",
+		       __func__, hdr->cw & OCRDMA_WQE_OPCODE_MASK);
+		break;
+	}
+}
+
+static void ocrdma_set_cqe_status_flushed(struct ocrdma_qp *qp,
+						struct ocrdma_cqe *cqe)
+{
+	if (is_cqe_for_sq(cqe)) {
+		cqe->flags_status_srcqpn = cpu_to_le32(le32_to_cpu(
+				cqe->flags_status_srcqpn) &
+					~OCRDMA_CQE_STATUS_MASK);
+		cqe->flags_status_srcqpn = cpu_to_le32(le32_to_cpu(
+				cqe->flags_status_srcqpn) |
+				(OCRDMA_CQE_WR_FLUSH_ERR <<
+					OCRDMA_CQE_STATUS_SHIFT));
+	} else {
+		if (qp->qp_type == IB_QPT_UD || qp->qp_type == IB_QPT_GSI) {
+			cqe->flags_status_srcqpn = cpu_to_le32(le32_to_cpu(
+					cqe->flags_status_srcqpn) &
+						~OCRDMA_CQE_UD_STATUS_MASK);
+			cqe->flags_status_srcqpn = cpu_to_le32(le32_to_cpu(
+					cqe->flags_status_srcqpn) |
+					(OCRDMA_CQE_WR_FLUSH_ERR <<
+						OCRDMA_CQE_UD_STATUS_SHIFT));
+		} else {
+			cqe->flags_status_srcqpn = cpu_to_le32(le32_to_cpu(
+					cqe->flags_status_srcqpn) &
+						~OCRDMA_CQE_STATUS_MASK);
+			cqe->flags_status_srcqpn = cpu_to_le32(le32_to_cpu(
+					cqe->flags_status_srcqpn) |
+					(OCRDMA_CQE_WR_FLUSH_ERR <<
+						OCRDMA_CQE_STATUS_SHIFT));
+		}
+	}
+}
+
+static bool ocrdma_update_err_cqe(struct ib_wc *ibwc, struct ocrdma_cqe *cqe,
+				  struct ocrdma_qp *qp, int status)
+{
+	bool expand = false;
+
+	ibwc->byte_len = 0;
+	ibwc->qp = &qp->ibqp;
+	ibwc->status = ocrdma_to_ibwc_err(status);
+
+	ocrdma_flush_qp(qp);
+	ocrdma_qp_state_change(qp, IB_QPS_ERR, NULL);
+
+	/* if wqe/rqe pending for which cqe needs to be returned,
+	 * trigger inflating it.
+	 */
+	if (!is_hw_rq_empty(qp) || !is_hw_sq_empty(qp)) {
+		expand = true;
+		ocrdma_set_cqe_status_flushed(qp, cqe);
+	}
+	return expand;
+}
+
+static int ocrdma_update_err_rcqe(struct ib_wc *ibwc, struct ocrdma_cqe *cqe,
+				  struct ocrdma_qp *qp, int status)
+{
+	ibwc->opcode = IB_WC_RECV;
+	ibwc->wr_id = qp->rqe_wr_id_tbl[qp->rq.tail];
+	ocrdma_hwq_inc_tail(&qp->rq);
+
+	return ocrdma_update_err_cqe(ibwc, cqe, qp, status);
+}
+
+static int ocrdma_update_err_scqe(struct ib_wc *ibwc, struct ocrdma_cqe *cqe,
+				  struct ocrdma_qp *qp, int status)
+{
+	ocrdma_update_wc(qp, ibwc, qp->sq.tail);
+	ocrdma_hwq_inc_tail(&qp->sq);
+
+	return ocrdma_update_err_cqe(ibwc, cqe, qp, status);
+}
+
+
+static bool ocrdma_poll_err_scqe(struct ocrdma_qp *qp,
+				 struct ocrdma_cqe *cqe, struct ib_wc *ibwc,
+				 bool *polled, bool *stop)
+{
+	bool expand;
+	int status = (le32_to_cpu(cqe->flags_status_srcqpn) &
+		OCRDMA_CQE_STATUS_MASK) >> OCRDMA_CQE_STATUS_SHIFT;
+
+	/* when hw sq is empty, but rq is not empty, so we continue
+	 * to keep the cqe in order to get the cq event again.
+	 */
+	if (is_hw_sq_empty(qp) && !is_hw_rq_empty(qp)) {
+		/* when cq for rq and sq is same, it is safe to return
+		 * flush cqe for RQEs.
+		 */
+		if (!qp->srq && (qp->sq_cq == qp->rq_cq)) {
+			*polled = true;
+			status = OCRDMA_CQE_WR_FLUSH_ERR;
+			expand = ocrdma_update_err_rcqe(ibwc, cqe, qp, status);
+		} else {
+			/* stop processing further cqe as this cqe is used for
+			 * triggering cq event on buddy cq of RQ.
+			 * When QP is destroyed, this cqe will be removed
+			 * from the cq's hardware q.
+			 */
+			*polled = false;
+			*stop = true;
+			expand = false;
+		}
+	} else if (is_hw_sq_empty(qp)) {
+		/* Do nothing */
+		expand = false;
+		*polled = false;
+		*stop = false;
+	} else {
+		*polled = true;
+		expand = ocrdma_update_err_scqe(ibwc, cqe, qp, status);
+	}
+	return expand;
+}
+
+static bool ocrdma_poll_success_scqe(struct ocrdma_qp *qp,
+				     struct ocrdma_cqe *cqe,
+				     struct ib_wc *ibwc, bool *polled)
+{
+	bool expand = false;
+	int tail = qp->sq.tail;
+	u32 wqe_idx;
+
+	if (!qp->wqe_wr_id_tbl[tail].signaled) {
+		*polled = false;    /* WC cannot be consumed yet */
+	} else {
+		ibwc->status = IB_WC_SUCCESS;
+		ibwc->wc_flags = 0;
+		ibwc->qp = &qp->ibqp;
+		ocrdma_update_wc(qp, ibwc, tail);
+		*polled = true;
+	}
+	wqe_idx = (le32_to_cpu(cqe->wq.wqeidx) &
+			OCRDMA_CQE_WQEIDX_MASK) & qp->sq.max_wqe_idx;
+	if (tail != wqe_idx)
+		expand = true; /* Coalesced CQE can't be consumed yet */
+
+	ocrdma_hwq_inc_tail(&qp->sq);
+	return expand;
+}
+
+static bool ocrdma_poll_scqe(struct ocrdma_qp *qp, struct ocrdma_cqe *cqe,
+			     struct ib_wc *ibwc, bool *polled, bool *stop)
+{
+	int status;
+	bool expand;
+
+	status = (le32_to_cpu(cqe->flags_status_srcqpn) &
+		OCRDMA_CQE_STATUS_MASK) >> OCRDMA_CQE_STATUS_SHIFT;
+
+	if (status == OCRDMA_CQE_SUCCESS)
+		expand = ocrdma_poll_success_scqe(qp, cqe, ibwc, polled);
+	else
+		expand = ocrdma_poll_err_scqe(qp, cqe, ibwc, polled, stop);
+	return expand;
+}
+
+static int ocrdma_update_ud_rcqe(struct ib_wc *ibwc, struct ocrdma_cqe *cqe)
+{
+	int status;
+
+	status = (le32_to_cpu(cqe->flags_status_srcqpn) &
+		OCRDMA_CQE_UD_STATUS_MASK) >> OCRDMA_CQE_UD_STATUS_SHIFT;
+	ibwc->src_qp = le32_to_cpu(cqe->flags_status_srcqpn) &
+						OCRDMA_CQE_SRCQP_MASK;
+	ibwc->pkey_index = le32_to_cpu(cqe->ud.rxlen_pkey) &
+						OCRDMA_CQE_PKEY_MASK;
+	ibwc->wc_flags = IB_WC_GRH;
+	ibwc->byte_len = (le32_to_cpu(cqe->ud.rxlen_pkey) >>
+					OCRDMA_CQE_UD_XFER_LEN_SHIFT);
+	return status;
+}
+
+static void ocrdma_update_free_srq_cqe(struct ib_wc *ibwc,
+				       struct ocrdma_cqe *cqe,
+				       struct ocrdma_qp *qp)
+{
+	unsigned long flags;
+	struct ocrdma_srq *srq;
+	u32 wqe_idx;
+
+	srq = get_ocrdma_srq(qp->ibqp.srq);
+	wqe_idx = (le32_to_cpu(cqe->rq.buftag_qpn) >>
+		OCRDMA_CQE_BUFTAG_SHIFT) & srq->rq.max_wqe_idx;
+	if (wqe_idx < 1)
+		BUG();
+
+	ibwc->wr_id = srq->rqe_wr_id_tbl[wqe_idx];
+	spin_lock_irqsave(&srq->q_lock, flags);
+	ocrdma_srq_toggle_bit(srq, wqe_idx - 1);
+	spin_unlock_irqrestore(&srq->q_lock, flags);
+	ocrdma_hwq_inc_tail(&srq->rq);
+}
+
+static bool ocrdma_poll_err_rcqe(struct ocrdma_qp *qp, struct ocrdma_cqe *cqe,
+				struct ib_wc *ibwc, bool *polled, bool *stop,
+				int status)
+{
+	bool expand;
+
+	/* when hw_rq is empty, but wq is not empty, so continue
+	 * to keep the cqe to get the cq event again.
+	 */
+	if (is_hw_rq_empty(qp) && !is_hw_sq_empty(qp)) {
+		if (!qp->srq && (qp->sq_cq == qp->rq_cq)) {
+			*polled = true;
+			status = OCRDMA_CQE_WR_FLUSH_ERR;
+			expand = ocrdma_update_err_scqe(ibwc, cqe, qp, status);
+		} else {
+			*polled = false;
+			*stop = true;
+			expand = false;
+		}
+	} else if (is_hw_rq_empty(qp)) {
+		/* Do nothing */
+		expand = false;
+		*polled = false;
+		*stop = false;
+	} else {
+		*polled = true;
+		expand = ocrdma_update_err_rcqe(ibwc, cqe, qp, status);
+	}
+	return expand;
+}
+
+static void ocrdma_poll_success_rcqe(struct ocrdma_qp *qp,
+				     struct ocrdma_cqe *cqe, struct ib_wc *ibwc)
+{
+	ibwc->opcode = IB_WC_RECV;
+	ibwc->qp = &qp->ibqp;
+	ibwc->status = IB_WC_SUCCESS;
+
+	if (qp->qp_type == IB_QPT_UD || qp->qp_type == IB_QPT_GSI)
+		ocrdma_update_ud_rcqe(ibwc, cqe);
+	else
+		ibwc->byte_len = le32_to_cpu(cqe->rq.rxlen);
+
+	if (is_cqe_imm(cqe)) {
+		ibwc->ex.imm_data = htonl(le32_to_cpu(cqe->rq.lkey_immdt));
+		ibwc->wc_flags |= IB_WC_WITH_IMM;
+	} else if (is_cqe_wr_imm(cqe)) {
+		ibwc->opcode = IB_WC_RECV_RDMA_WITH_IMM;
+		ibwc->ex.imm_data = htonl(le32_to_cpu(cqe->rq.lkey_immdt));
+		ibwc->wc_flags |= IB_WC_WITH_IMM;
+	} else if (is_cqe_invalidated(cqe)) {
+		ibwc->ex.invalidate_rkey = le32_to_cpu(cqe->rq.lkey_immdt);
+		ibwc->wc_flags |= IB_WC_WITH_INVALIDATE;
+	}
+	if (qp->ibqp.srq) {
+		ocrdma_update_free_srq_cqe(ibwc, cqe, qp);
+	} else {
+		ibwc->wr_id = qp->rqe_wr_id_tbl[qp->rq.tail];
+		ocrdma_hwq_inc_tail(&qp->rq);
+	}
+}
+
+static bool ocrdma_poll_rcqe(struct ocrdma_qp *qp, struct ocrdma_cqe *cqe,
+			     struct ib_wc *ibwc, bool *polled, bool *stop)
+{
+	int status;
+	bool expand = false;
+
+	ibwc->wc_flags = 0;
+	if (qp->qp_type == IB_QPT_UD || qp->qp_type == IB_QPT_GSI) {
+		status = (le32_to_cpu(cqe->flags_status_srcqpn) &
+					OCRDMA_CQE_UD_STATUS_MASK) >>
+					OCRDMA_CQE_UD_STATUS_SHIFT;
+	} else {
+		status = (le32_to_cpu(cqe->flags_status_srcqpn) &
+			     OCRDMA_CQE_STATUS_MASK) >> OCRDMA_CQE_STATUS_SHIFT;
+	}
+
+	if (status == OCRDMA_CQE_SUCCESS) {
+		*polled = true;
+		ocrdma_poll_success_rcqe(qp, cqe, ibwc);
+	} else {
+		expand = ocrdma_poll_err_rcqe(qp, cqe, ibwc, polled, stop,
+					      status);
+	}
+	return expand;
+}
+
+static void ocrdma_change_cq_phase(struct ocrdma_cq *cq, struct ocrdma_cqe *cqe,
+				   u16 cur_getp)
+{
+	if (cq->phase_change) {
+		if (cur_getp == 0)
+			cq->phase = (~cq->phase & OCRDMA_CQE_VALID);
+	} else {
+		/* clear valid bit */
+		cqe->flags_status_srcqpn = 0;
+	}
+}
+
+static int ocrdma_poll_hwcq(struct ocrdma_cq *cq, int num_entries,
+			    struct ib_wc *ibwc)
+{
+	u16 qpn = 0;
+	int i = 0;
+	bool expand = false;
+	int polled_hw_cqes = 0;
+	struct ocrdma_qp *qp = NULL;
+	struct ocrdma_dev *dev = get_ocrdma_dev(cq->ibcq.device);
+	struct ocrdma_cqe *cqe;
+	u16 cur_getp; bool polled = false; bool stop = false;
+
+	cur_getp = cq->getp;
+	while (num_entries) {
+		cqe = cq->va + cur_getp;
+		/* check whether valid cqe or not */
+		if (!is_cqe_valid(cq, cqe))
+			break;
+		qpn = (le32_to_cpu(cqe->cmn.qpn) & OCRDMA_CQE_QPN_MASK);
+		/* ignore discarded cqe */
+		if (qpn == 0)
+			goto skip_cqe;
+		qp = dev->qp_tbl[qpn];
+		BUG_ON(qp == NULL);
+
+		if (is_cqe_for_sq(cqe)) {
+			expand = ocrdma_poll_scqe(qp, cqe, ibwc, &polled,
+						  &stop);
+		} else {
+			expand = ocrdma_poll_rcqe(qp, cqe, ibwc, &polled,
+						  &stop);
+		}
+		if (expand)
+			goto expand_cqe;
+		if (stop)
+			goto stop_cqe;
+		/* clear qpn to avoid duplicate processing by discard_cqe() */
+		cqe->cmn.qpn = 0;
+skip_cqe:
+		polled_hw_cqes += 1;
+		cur_getp = (cur_getp + 1) % cq->max_hw_cqe;
+		ocrdma_change_cq_phase(cq, cqe, cur_getp);
+expand_cqe:
+		if (polled) {
+			num_entries -= 1;
+			i += 1;
+			ibwc = ibwc + 1;
+			polled = false;
+		}
+	}
+stop_cqe:
+	cq->getp = cur_getp;
+	if (cq->deferred_arm) {
+		ocrdma_ring_cq_db(dev, cq->id, true, cq->deferred_sol,
+				  polled_hw_cqes);
+		cq->deferred_arm = false;
+		cq->deferred_sol = false;
+	} else {
+		/* We need to pop the CQE. No need to arm */
+		ocrdma_ring_cq_db(dev, cq->id, false, cq->deferred_sol,
+				  polled_hw_cqes);
+		cq->deferred_sol = false;
+	}
+
+	return i;
+}
+
+/* insert error cqe if the QP's SQ or RQ's CQ matches the CQ under poll. */
+static int ocrdma_add_err_cqe(struct ocrdma_cq *cq, int num_entries,
+			      struct ocrdma_qp *qp, struct ib_wc *ibwc)
+{
+	int err_cqes = 0;
+
+	while (num_entries) {
+		if (is_hw_sq_empty(qp) && is_hw_rq_empty(qp))
+			break;
+		if (!is_hw_sq_empty(qp) && qp->sq_cq == cq) {
+			ocrdma_update_wc(qp, ibwc, qp->sq.tail);
+			ocrdma_hwq_inc_tail(&qp->sq);
+		} else if (!is_hw_rq_empty(qp) && qp->rq_cq == cq) {
+			ibwc->wr_id = qp->rqe_wr_id_tbl[qp->rq.tail];
+			ocrdma_hwq_inc_tail(&qp->rq);
+		} else {
+			return err_cqes;
+		}
+		ibwc->byte_len = 0;
+		ibwc->status = IB_WC_WR_FLUSH_ERR;
+		ibwc = ibwc + 1;
+		err_cqes += 1;
+		num_entries -= 1;
+	}
+	return err_cqes;
+}
+
+int ocrdma_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc)
+{
+	int cqes_to_poll = num_entries;
+	struct ocrdma_cq *cq = get_ocrdma_cq(ibcq);
+	struct ocrdma_dev *dev = get_ocrdma_dev(ibcq->device);
+	int num_os_cqe = 0, err_cqes = 0;
+	struct ocrdma_qp *qp;
+	unsigned long flags;
+
+	/* poll cqes from adapter CQ */
+	spin_lock_irqsave(&cq->cq_lock, flags);
+	num_os_cqe = ocrdma_poll_hwcq(cq, cqes_to_poll, wc);
+	spin_unlock_irqrestore(&cq->cq_lock, flags);
+	cqes_to_poll -= num_os_cqe;
+
+	if (cqes_to_poll) {
+		wc = wc + num_os_cqe;
+		/* adapter returns single error cqe when qp moves to
+		 * error state. So insert error cqes with wc_status as
+		 * FLUSHED for pending WQEs and RQEs of QP's SQ and RQ
+		 * respectively which uses this CQ.
+		 */
+		spin_lock_irqsave(&dev->flush_q_lock, flags);
+		list_for_each_entry(qp, &cq->sq_head, sq_entry) {
+			if (cqes_to_poll == 0)
+				break;
+			err_cqes = ocrdma_add_err_cqe(cq, cqes_to_poll, qp, wc);
+			cqes_to_poll -= err_cqes;
+			num_os_cqe += err_cqes;
+			wc = wc + err_cqes;
+		}
+		spin_unlock_irqrestore(&dev->flush_q_lock, flags);
+	}
+	return num_os_cqe;
+}
+
+int ocrdma_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags cq_flags)
+{
+	struct ocrdma_cq *cq = get_ocrdma_cq(ibcq);
+	struct ocrdma_dev *dev = get_ocrdma_dev(ibcq->device);
+	u16 cq_id;
+	unsigned long flags;
+	bool arm_needed = false, sol_needed = false;
+
+	cq_id = cq->id;
+
+	spin_lock_irqsave(&cq->cq_lock, flags);
+	if (cq_flags & IB_CQ_NEXT_COMP || cq_flags & IB_CQ_SOLICITED)
+		arm_needed = true;
+	if (cq_flags & IB_CQ_SOLICITED)
+		sol_needed = true;
+
+	if (cq->first_arm) {
+		ocrdma_ring_cq_db(dev, cq_id, arm_needed, sol_needed, 0);
+		cq->first_arm = false;
+	}
+
+	cq->deferred_arm = true;
+	cq->deferred_sol = sol_needed;
+	spin_unlock_irqrestore(&cq->cq_lock, flags);
+
+	return 0;
+}
+
+struct ib_mr *ocrdma_alloc_frmr(struct ib_pd *ibpd, int max_page_list_len)
+{
+	int status;
+	struct ocrdma_mr *mr;
+	struct ocrdma_pd *pd = get_ocrdma_pd(ibpd);
+	struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device);
+
+	if (max_page_list_len > dev->attr.max_pages_per_frmr)
+		return ERR_PTR(-EINVAL);
+
+	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+	if (!mr)
+		return ERR_PTR(-ENOMEM);
+
+	status = ocrdma_get_pbl_info(dev, mr, max_page_list_len);
+	if (status)
+		goto pbl_err;
+	mr->hwmr.fr_mr = 1;
+	mr->hwmr.remote_rd = 0;
+	mr->hwmr.remote_wr = 0;
+	mr->hwmr.local_rd = 0;
+	mr->hwmr.local_wr = 0;
+	mr->hwmr.mw_bind = 0;
+	status = ocrdma_build_pbl_tbl(dev, &mr->hwmr);
+	if (status)
+		goto pbl_err;
+	status = ocrdma_reg_mr(dev, &mr->hwmr, pd->id, 0);
+	if (status)
+		goto mbx_err;
+	mr->ibmr.rkey = mr->hwmr.lkey;
+	mr->ibmr.lkey = mr->hwmr.lkey;
+	dev->stag_arr[(mr->hwmr.lkey >> 8) & (OCRDMA_MAX_STAG - 1)] =
+		(unsigned long) mr;
+	return &mr->ibmr;
+mbx_err:
+	ocrdma_free_mr_pbl_tbl(dev, &mr->hwmr);
+pbl_err:
+	kfree(mr);
+	return ERR_PTR(-ENOMEM);
+}
+
+struct ib_fast_reg_page_list *ocrdma_alloc_frmr_page_list(struct ib_device
+							  *ibdev,
+							  int page_list_len)
+{
+	struct ib_fast_reg_page_list *frmr_list;
+	int size;
+
+	size = sizeof(*frmr_list) + (page_list_len * sizeof(u64));
+	frmr_list = kzalloc(size, GFP_KERNEL);
+	if (!frmr_list)
+		return ERR_PTR(-ENOMEM);
+	frmr_list->page_list = (u64 *)(frmr_list + 1);
+	return frmr_list;
+}
+
+void ocrdma_free_frmr_page_list(struct ib_fast_reg_page_list *page_list)
+{
+	kfree(page_list);
+}
+
+#define MAX_KERNEL_PBE_SIZE 65536
+static inline int count_kernel_pbes(struct ib_phys_buf *buf_list,
+				    int buf_cnt, u32 *pbe_size)
+{
+	u64 total_size = 0;
+	u64 buf_size = 0;
+	int i;
+	*pbe_size = roundup(buf_list[0].size, PAGE_SIZE);
+	*pbe_size = roundup_pow_of_two(*pbe_size);
+
+	/* find the smallest PBE size that we can have */
+	for (i = 0; i < buf_cnt; i++) {
+		/* first addr may not be page aligned, so ignore checking */
+		if ((i != 0) && ((buf_list[i].addr & ~PAGE_MASK) ||
+				 (buf_list[i].size & ~PAGE_MASK))) {
+			return 0;
+		}
+
+		/* if configured PBE size is greater then the chosen one,
+		 * reduce the PBE size.
+		 */
+		buf_size = roundup(buf_list[i].size, PAGE_SIZE);
+		/* pbe_size has to be even multiple of 4K 1,2,4,8...*/
+		buf_size = roundup_pow_of_two(buf_size);
+		if (*pbe_size > buf_size)
+			*pbe_size = buf_size;
+
+		total_size += buf_size;
+	}
+	*pbe_size = *pbe_size > MAX_KERNEL_PBE_SIZE ?
+	    (MAX_KERNEL_PBE_SIZE) : (*pbe_size);
+
+	/* num_pbes = total_size / (*pbe_size);  this is implemented below. */
+
+	return total_size >> ilog2(*pbe_size);
+}
+
+static void build_kernel_pbes(struct ib_phys_buf *buf_list, int ib_buf_cnt,
+			      u32 pbe_size, struct ocrdma_pbl *pbl_tbl,
+			      struct ocrdma_hw_mr *hwmr)
+{
+	int i;
+	int idx;
+	int pbes_per_buf = 0;
+	u64 buf_addr = 0;
+	int num_pbes;
+	struct ocrdma_pbe *pbe;
+	int total_num_pbes = 0;
+
+	if (!hwmr->num_pbes)
+		return;
+
+	pbe = (struct ocrdma_pbe *)pbl_tbl->va;
+	num_pbes = 0;
+
+	/* go through the OS phy regions & fill hw pbe entries into pbls. */
+	for (i = 0; i < ib_buf_cnt; i++) {
+		buf_addr = buf_list[i].addr;
+		pbes_per_buf =
+		    roundup_pow_of_two(roundup(buf_list[i].size, PAGE_SIZE)) /
+		    pbe_size;
+		hwmr->len += buf_list[i].size;
+		/* number of pbes can be more for one OS buf, when
+		 * buffers are of different sizes.
+		 * split the ib_buf to one or more pbes.
+		 */
+		for (idx = 0; idx < pbes_per_buf; idx++) {
+			/* we program always page aligned addresses,
+			 * first unaligned address is taken care by fbo.
+			 */
+			if (i == 0) {
+				/* for non zero fbo, assign the
+				 * start of the page.
+				 */
+				pbe->pa_lo =
+				    cpu_to_le32((u32) (buf_addr & PAGE_MASK));
+				pbe->pa_hi =
+				    cpu_to_le32((u32) upper_32_bits(buf_addr));
+			} else {
+				pbe->pa_lo =
+				    cpu_to_le32((u32) (buf_addr & 0xffffffff));
+				pbe->pa_hi =
+				    cpu_to_le32((u32) upper_32_bits(buf_addr));
+			}
+			buf_addr += pbe_size;
+			num_pbes += 1;
+			total_num_pbes += 1;
+			pbe++;
+
+			if (total_num_pbes == hwmr->num_pbes)
+				goto mr_tbl_done;
+			/* if the pbl is full storing the pbes,
+			 * move to next pbl.
+			 */
+			if (num_pbes == (hwmr->pbl_size/sizeof(u64))) {
+				pbl_tbl++;
+				pbe = (struct ocrdma_pbe *)pbl_tbl->va;
+				num_pbes = 0;
+			}
+		}
+	}
+mr_tbl_done:
+	return;
+}
+
+struct ib_mr *ocrdma_reg_kernel_mr(struct ib_pd *ibpd,
+				   struct ib_phys_buf *buf_list,
+				   int buf_cnt, int acc, u64 *iova_start)
+{
+	int status = -ENOMEM;
+	struct ocrdma_mr *mr;
+	struct ocrdma_pd *pd = get_ocrdma_pd(ibpd);
+	struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device);
+	u32 num_pbes;
+	u32 pbe_size = 0;
+
+	if ((acc & IB_ACCESS_REMOTE_WRITE) && !(acc & IB_ACCESS_LOCAL_WRITE))
+		return ERR_PTR(-EINVAL);
+
+	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+	if (!mr)
+		return ERR_PTR(status);
+
+	num_pbes = count_kernel_pbes(buf_list, buf_cnt, &pbe_size);
+	if (num_pbes == 0) {
+		status = -EINVAL;
+		goto pbl_err;
+	}
+	status = ocrdma_get_pbl_info(dev, mr, num_pbes);
+	if (status)
+		goto pbl_err;
+
+	mr->hwmr.pbe_size = pbe_size;
+	mr->hwmr.fbo = *iova_start - (buf_list[0].addr & PAGE_MASK);
+	mr->hwmr.va = *iova_start;
+	mr->hwmr.local_rd = 1;
+	mr->hwmr.remote_wr = (acc & IB_ACCESS_REMOTE_WRITE) ? 1 : 0;
+	mr->hwmr.remote_rd = (acc & IB_ACCESS_REMOTE_READ) ? 1 : 0;
+	mr->hwmr.local_wr = (acc & IB_ACCESS_LOCAL_WRITE) ? 1 : 0;
+	mr->hwmr.remote_atomic = (acc & IB_ACCESS_REMOTE_ATOMIC) ? 1 : 0;
+	mr->hwmr.mw_bind = (acc & IB_ACCESS_MW_BIND) ? 1 : 0;
+
+	status = ocrdma_build_pbl_tbl(dev, &mr->hwmr);
+	if (status)
+		goto pbl_err;
+	build_kernel_pbes(buf_list, buf_cnt, pbe_size, mr->hwmr.pbl_table,
+			  &mr->hwmr);
+	status = ocrdma_reg_mr(dev, &mr->hwmr, pd->id, acc);
+	if (status)
+		goto mbx_err;
+
+	mr->ibmr.lkey = mr->hwmr.lkey;
+	if (mr->hwmr.remote_wr || mr->hwmr.remote_rd)
+		mr->ibmr.rkey = mr->hwmr.lkey;
+	return &mr->ibmr;
+
+mbx_err:
+	ocrdma_free_mr_pbl_tbl(dev, &mr->hwmr);
+pbl_err:
+	kfree(mr);
+	return ERR_PTR(status);
+}
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h
new file mode 100644
index 0000000..b8f7853
--- /dev/null
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h
@@ -0,0 +1,99 @@
+/*******************************************************************
+ * This file is part of the Emulex RoCE Device Driver for          *
+ * RoCE (RDMA over Converged Ethernet) adapters.                   *
+ * Copyright (C) 2008-2012 Emulex. All rights reserved.            *
+ * EMULEX and SLI are trademarks of Emulex.                        *
+ * www.emulex.com                                                  *
+ *                                                                 *
+ * This program is free software; you can redistribute it and/or   *
+ * modify it under the terms of version 2 of the GNU General       *
+ * Public License as published by the Free Software Foundation.    *
+ * This program is distributed in the hope that it will be useful. *
+ * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND          *
+ * WARRANTIES, INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY,  *
+ * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE      *
+ * DISCLAIMED, EXCEPT TO THE EXTENT THAT SUCH DISCLAIMERS ARE HELD *
+ * TO BE LEGALLY INVALID.  See the GNU General Public License for  *
+ * more details, a copy of which can be found in the file COPYING  *
+ * included with this package.                                     *
+ *
+ * Contact Information:
+ * linux-drivers@emulex.com
+ *
+ * Emulex
+ * 3333 Susan Street
+ * Costa Mesa, CA 92626
+ *******************************************************************/
+
+#ifndef __OCRDMA_VERBS_H__
+#define __OCRDMA_VERBS_H__
+
+int ocrdma_post_send(struct ib_qp *, struct ib_send_wr *,
+		     struct ib_send_wr **bad_wr);
+int ocrdma_post_recv(struct ib_qp *, struct ib_recv_wr *,
+		     struct ib_recv_wr **bad_wr);
+
+int ocrdma_poll_cq(struct ib_cq *, int num_entries, struct ib_wc *wc);
+int ocrdma_arm_cq(struct ib_cq *, enum ib_cq_notify_flags flags);
+
+int ocrdma_query_device(struct ib_device *, struct ib_device_attr *props);
+int ocrdma_query_port(struct ib_device *, u8 port, struct ib_port_attr *props);
+int ocrdma_modify_port(struct ib_device *, u8 port, int mask,
+		       struct ib_port_modify *props);
+
+void ocrdma_get_guid(struct ocrdma_dev *, u8 *guid);
+int ocrdma_query_gid(struct ib_device *, u8 port,
+		     int index, union ib_gid *gid);
+int ocrdma_query_pkey(struct ib_device *, u8 port, u16 index, u16 *pkey);
+
+struct ib_ucontext *ocrdma_alloc_ucontext(struct ib_device *,
+					  struct ib_udata *);
+int ocrdma_dealloc_ucontext(struct ib_ucontext *);
+
+int ocrdma_mmap(struct ib_ucontext *, struct vm_area_struct *vma);
+
+struct ib_pd *ocrdma_alloc_pd(struct ib_device *,
+			      struct ib_ucontext *, struct ib_udata *);
+int ocrdma_dealloc_pd(struct ib_pd *pd);
+
+struct ib_cq *ocrdma_create_cq(struct ib_device *, int entries, int vector,
+			       struct ib_ucontext *, struct ib_udata *);
+int ocrdma_resize_cq(struct ib_cq *, int cqe, struct ib_udata *);
+int ocrdma_destroy_cq(struct ib_cq *);
+
+struct ib_qp *ocrdma_create_qp(struct ib_pd *,
+			       struct ib_qp_init_attr *attrs,
+			       struct ib_udata *);
+int _ocrdma_modify_qp(struct ib_qp *, struct ib_qp_attr *attr,
+		      int attr_mask);
+int ocrdma_modify_qp(struct ib_qp *, struct ib_qp_attr *attr,
+		     int attr_mask, struct ib_udata *udata);
+int ocrdma_query_qp(struct ib_qp *,
+		    struct ib_qp_attr *qp_attr,
+		    int qp_attr_mask, struct ib_qp_init_attr *);
+int ocrdma_destroy_qp(struct ib_qp *);
+void ocrdma_del_flush_qp(struct ocrdma_qp *qp);
+
+struct ib_srq *ocrdma_create_srq(struct ib_pd *, struct ib_srq_init_attr *,
+				 struct ib_udata *);
+int ocrdma_modify_srq(struct ib_srq *, struct ib_srq_attr *,
+		      enum ib_srq_attr_mask, struct ib_udata *);
+int ocrdma_query_srq(struct ib_srq *, struct ib_srq_attr *);
+int ocrdma_destroy_srq(struct ib_srq *);
+int ocrdma_post_srq_recv(struct ib_srq *, struct ib_recv_wr *,
+			 struct ib_recv_wr **bad_recv_wr);
+
+int ocrdma_dereg_mr(struct ib_mr *);
+struct ib_mr *ocrdma_get_dma_mr(struct ib_pd *, int acc);
+struct ib_mr *ocrdma_reg_kernel_mr(struct ib_pd *,
+				   struct ib_phys_buf *buffer_list,
+				   int num_phys_buf, int acc, u64 *iova_start);
+struct ib_mr *ocrdma_reg_user_mr(struct ib_pd *, u64 start, u64 length,
+				 u64 virt, int acc, struct ib_udata *);
+struct ib_mr *ocrdma_alloc_frmr(struct ib_pd *pd, int max_page_list_len);
+struct ib_fast_reg_page_list *ocrdma_alloc_frmr_page_list(struct ib_device
+							*ibdev,
+							int page_list_len);
+void ocrdma_free_frmr_page_list(struct ib_fast_reg_page_list *page_list);
+
+#endif				/* __OCRDMA_VERBS_H__ */
diff --git a/drivers/infiniband/hw/qib/Kconfig b/drivers/infiniband/hw/qib/Kconfig
new file mode 100644
index 0000000..495be09
--- /dev/null
+++ b/drivers/infiniband/hw/qib/Kconfig
@@ -0,0 +1,15 @@
+config INFINIBAND_QIB
+	tristate "Intel PCIe HCA support"
+	depends on 64BIT
+	---help---
+	This is a low-level driver for Intel PCIe QLE InfiniBand host
+	channel adapters.  This driver does not support the Intel
+	HyperTransport card (model QHT7140).
+
+config INFINIBAND_QIB_DCA
+	bool "QIB DCA support"
+	depends on INFINIBAND_QIB && DCA && SMP && !(INFINIBAND_QIB=y && DCA=m)
+	default y
+	---help---
+	Setting this enables DCA support on some Intel chip sets
+	with the iba7322 HCA.
diff --git a/drivers/infiniband/hw/qib/Makefile b/drivers/infiniband/hw/qib/Makefile
new file mode 100644
index 0000000..57f8103
--- /dev/null
+++ b/drivers/infiniband/hw/qib/Makefile
@@ -0,0 +1,16 @@
+obj-$(CONFIG_INFINIBAND_QIB) += ib_qib.o
+
+ib_qib-y := qib_cq.o qib_diag.o qib_dma.o qib_driver.o qib_eeprom.o \
+	qib_file_ops.o qib_fs.o qib_init.o qib_intr.o qib_keys.o \
+	qib_mad.o qib_mmap.o qib_mr.o qib_pcie.o qib_pio_copy.o \
+	qib_qp.o qib_qsfp.o qib_rc.o qib_ruc.o qib_sdma.o qib_srq.o \
+	qib_sysfs.o qib_twsi.o qib_tx.o qib_uc.o qib_ud.o \
+	qib_user_pages.o qib_user_sdma.o qib_verbs_mcast.o qib_iba7220.o \
+	qib_sd7220.o qib_iba7322.o qib_verbs.o
+
+# 6120 has no fallback if no MSI interrupts, others can do INTx
+ib_qib-$(CONFIG_PCI_MSI) += qib_iba6120.o
+
+ib_qib-$(CONFIG_X86_64) += qib_wc_x86_64.o
+ib_qib-$(CONFIG_PPC64) += qib_wc_ppc64.o
+ib_qib-$(CONFIG_DEBUG_FS) += qib_debugfs.o
diff --git a/drivers/infiniband/hw/qib/qib.h b/drivers/infiniband/hw/qib/qib.h
new file mode 100644
index 0000000..c00ae09
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib.h
@@ -0,0 +1,1548 @@
+#ifndef _QIB_KERNEL_H
+#define _QIB_KERNEL_H
+/*
+ * Copyright (c) 2012, 2013 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * This header file is the base header file for qlogic_ib kernel code
+ * qib_user.h serves a similar purpose for user code.
+ */
+
+#include <linux/interrupt.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/scatterlist.h>
+#include <linux/slab.h>
+#include <linux/io.h>
+#include <linux/fs.h>
+#include <linux/completion.h>
+#include <linux/kref.h>
+#include <linux/sched.h>
+#include <linux/kthread.h>
+
+#include "qib_common.h"
+#include "qib_verbs.h"
+
+/* only s/w major version of QLogic_IB we can handle */
+#define QIB_CHIP_VERS_MAJ 2U
+
+/* don't care about this except printing */
+#define QIB_CHIP_VERS_MIN 0U
+
+/* The Organization Unique Identifier (Mfg code), and its position in GUID */
+#define QIB_OUI 0x001175
+#define QIB_OUI_LSB 40
+
+/*
+ * per driver stats, either not device nor port-specific, or
+ * summed over all of the devices and ports.
+ * They are described by name via ipathfs filesystem, so layout
+ * and number of elements can change without breaking compatibility.
+ * If members are added or deleted qib_statnames[] in qib_fs.c must
+ * change to match.
+ */
+struct qlogic_ib_stats {
+	__u64 sps_ints; /* number of interrupts handled */
+	__u64 sps_errints; /* number of error interrupts */
+	__u64 sps_txerrs; /* tx-related packet errors */
+	__u64 sps_rcverrs; /* non-crc rcv packet errors */
+	__u64 sps_hwerrs; /* hardware errors reported (parity, etc.) */
+	__u64 sps_nopiobufs; /* no pio bufs avail from kernel */
+	__u64 sps_ctxts; /* number of contexts currently open */
+	__u64 sps_lenerrs; /* number of kernel packets where RHF != LRH len */
+	__u64 sps_buffull;
+	__u64 sps_hdrfull;
+};
+
+extern struct qlogic_ib_stats qib_stats;
+extern const struct pci_error_handlers qib_pci_err_handler;
+
+#define QIB_CHIP_SWVERSION QIB_CHIP_VERS_MAJ
+/*
+ * First-cut critierion for "device is active" is
+ * two thousand dwords combined Tx, Rx traffic per
+ * 5-second interval. SMA packets are 64 dwords,
+ * and occur "a few per second", presumably each way.
+ */
+#define QIB_TRAFFIC_ACTIVE_THRESHOLD (2000)
+
+/*
+ * Struct used to indicate which errors are logged in each of the
+ * error-counters that are logged to EEPROM. A counter is incremented
+ * _once_ (saturating at 255) for each event with any bits set in
+ * the error or hwerror register masks below.
+ */
+#define QIB_EEP_LOG_CNT (4)
+struct qib_eep_log_mask {
+	u64 errs_to_log;
+	u64 hwerrs_to_log;
+};
+
+/*
+ * Below contains all data related to a single context (formerly called port).
+ */
+
+#ifdef CONFIG_DEBUG_FS
+struct qib_opcode_stats_perctx;
+#endif
+
+struct qib_ctxtdata {
+	void **rcvegrbuf;
+	dma_addr_t *rcvegrbuf_phys;
+	/* rcvhdrq base, needs mmap before useful */
+	void *rcvhdrq;
+	/* kernel virtual address where hdrqtail is updated */
+	void *rcvhdrtail_kvaddr;
+	/*
+	 * temp buffer for expected send setup, allocated at open, instead
+	 * of each setup call
+	 */
+	void *tid_pg_list;
+	/*
+	 * Shared page for kernel to signal user processes that send buffers
+	 * need disarming.  The process should call QIB_CMD_DISARM_BUFS
+	 * or QIB_CMD_ACK_EVENT with IPATH_EVENT_DISARM_BUFS set.
+	 */
+	unsigned long *user_event_mask;
+	/* when waiting for rcv or pioavail */
+	wait_queue_head_t wait;
+	/*
+	 * rcvegr bufs base, physical, must fit
+	 * in 44 bits so 32 bit programs mmap64 44 bit works)
+	 */
+	dma_addr_t rcvegr_phys;
+	/* mmap of hdrq, must fit in 44 bits */
+	dma_addr_t rcvhdrq_phys;
+	dma_addr_t rcvhdrqtailaddr_phys;
+
+	/*
+	 * number of opens (including slave sub-contexts) on this instance
+	 * (ignoring forks, dup, etc. for now)
+	 */
+	int cnt;
+	/*
+	 * how much space to leave at start of eager TID entries for
+	 * protocol use, on each TID
+	 */
+	/* instead of calculating it */
+	unsigned ctxt;
+	/* local node of context */
+	int node_id;
+	/* non-zero if ctxt is being shared. */
+	u16 subctxt_cnt;
+	/* non-zero if ctxt is being shared. */
+	u16 subctxt_id;
+	/* number of eager TID entries. */
+	u16 rcvegrcnt;
+	/* index of first eager TID entry. */
+	u16 rcvegr_tid_base;
+	/* number of pio bufs for this ctxt (all procs, if shared) */
+	u32 piocnt;
+	/* first pio buffer for this ctxt */
+	u32 pio_base;
+	/* chip offset of PIO buffers for this ctxt */
+	u32 piobufs;
+	/* how many alloc_pages() chunks in rcvegrbuf_pages */
+	u32 rcvegrbuf_chunks;
+	/* how many egrbufs per chunk */
+	u16 rcvegrbufs_perchunk;
+	/* ilog2 of above */
+	u16 rcvegrbufs_perchunk_shift;
+	/* order for rcvegrbuf_pages */
+	size_t rcvegrbuf_size;
+	/* rcvhdrq size (for freeing) */
+	size_t rcvhdrq_size;
+	/* per-context flags for fileops/intr communication */
+	unsigned long flag;
+	/* next expected TID to check when looking for free */
+	u32 tidcursor;
+	/* WAIT_RCV that timed out, no interrupt */
+	u32 rcvwait_to;
+	/* WAIT_PIO that timed out, no interrupt */
+	u32 piowait_to;
+	/* WAIT_RCV already happened, no wait */
+	u32 rcvnowait;
+	/* WAIT_PIO already happened, no wait */
+	u32 pionowait;
+	/* total number of polled urgent packets */
+	u32 urgent;
+	/* saved total number of polled urgent packets for poll edge trigger */
+	u32 urgent_poll;
+	/* pid of process using this ctxt */
+	pid_t pid;
+	pid_t subpid[QLOGIC_IB_MAX_SUBCTXT];
+	/* same size as task_struct .comm[], command that opened context */
+	char comm[16];
+	/* pkeys set by this use of this ctxt */
+	u16 pkeys[4];
+	/* so file ops can get at unit */
+	struct qib_devdata *dd;
+	/* so funcs that need physical port can get it easily */
+	struct qib_pportdata *ppd;
+	/* A page of memory for rcvhdrhead, rcvegrhead, rcvegrtail * N */
+	void *subctxt_uregbase;
+	/* An array of pages for the eager receive buffers * N */
+	void *subctxt_rcvegrbuf;
+	/* An array of pages for the eager header queue entries * N */
+	void *subctxt_rcvhdr_base;
+	/* The version of the library which opened this ctxt */
+	u32 userversion;
+	/* Bitmask of active slaves */
+	u32 active_slaves;
+	/* Type of packets or conditions we want to poll for */
+	u16 poll_type;
+	/* receive packet sequence counter */
+	u8 seq_cnt;
+	u8 redirect_seq_cnt;
+	/* ctxt rcvhdrq head offset */
+	u32 head;
+	/* lookaside fields */
+	struct qib_qp *lookaside_qp;
+	u32 lookaside_qpn;
+	/* QPs waiting for context processing */
+	struct list_head qp_wait_list;
+#ifdef CONFIG_DEBUG_FS
+	/* verbs stats per CTX */
+	struct qib_opcode_stats_perctx *opstats;
+#endif
+};
+
+struct qib_sge_state;
+
+struct qib_sdma_txreq {
+	int                 flags;
+	int                 sg_count;
+	dma_addr_t          addr;
+	void              (*callback)(struct qib_sdma_txreq *, int);
+	u16                 start_idx;  /* sdma private */
+	u16                 next_descq_idx;  /* sdma private */
+	struct list_head    list;       /* sdma private */
+};
+
+struct qib_sdma_desc {
+	__le64 qw[2];
+};
+
+struct qib_verbs_txreq {
+	struct qib_sdma_txreq   txreq;
+	struct qib_qp           *qp;
+	struct qib_swqe         *wqe;
+	u32                     dwords;
+	u16                     hdr_dwords;
+	u16                     hdr_inx;
+	struct qib_pio_header	*align_buf;
+	struct qib_mregion	*mr;
+	struct qib_sge_state    *ss;
+};
+
+#define QIB_SDMA_TXREQ_F_USELARGEBUF  0x1
+#define QIB_SDMA_TXREQ_F_HEADTOHOST   0x2
+#define QIB_SDMA_TXREQ_F_INTREQ       0x4
+#define QIB_SDMA_TXREQ_F_FREEBUF      0x8
+#define QIB_SDMA_TXREQ_F_FREEDESC     0x10
+
+#define QIB_SDMA_TXREQ_S_OK        0
+#define QIB_SDMA_TXREQ_S_SENDERROR 1
+#define QIB_SDMA_TXREQ_S_ABORTED   2
+#define QIB_SDMA_TXREQ_S_SHUTDOWN  3
+
+/*
+ * Get/Set IB link-level config parameters for f_get/set_ib_cfg()
+ * Mostly for MADs that set or query link parameters, also ipath
+ * config interfaces
+ */
+#define QIB_IB_CFG_LIDLMC 0 /* LID (LS16b) and Mask (MS16b) */
+#define QIB_IB_CFG_LWID_ENB 2 /* allowed Link-width */
+#define QIB_IB_CFG_LWID 3 /* currently active Link-width */
+#define QIB_IB_CFG_SPD_ENB 4 /* allowed Link speeds */
+#define QIB_IB_CFG_SPD 5 /* current Link spd */
+#define QIB_IB_CFG_RXPOL_ENB 6 /* Auto-RX-polarity enable */
+#define QIB_IB_CFG_LREV_ENB 7 /* Auto-Lane-reversal enable */
+#define QIB_IB_CFG_LINKLATENCY 8 /* Link Latency (IB1.2 only) */
+#define QIB_IB_CFG_HRTBT 9 /* IB heartbeat off/enable/auto; DDR/QDR only */
+#define QIB_IB_CFG_OP_VLS 10 /* operational VLs */
+#define QIB_IB_CFG_VL_HIGH_CAP 11 /* num of VL high priority weights */
+#define QIB_IB_CFG_VL_LOW_CAP 12 /* num of VL low priority weights */
+#define QIB_IB_CFG_OVERRUN_THRESH 13 /* IB overrun threshold */
+#define QIB_IB_CFG_PHYERR_THRESH 14 /* IB PHY error threshold */
+#define QIB_IB_CFG_LINKDEFAULT 15 /* IB link default (sleep/poll) */
+#define QIB_IB_CFG_PKEYS 16 /* update partition keys */
+#define QIB_IB_CFG_MTU 17 /* update MTU in IBC */
+#define QIB_IB_CFG_LSTATE 18 /* update linkcmd and linkinitcmd in IBC */
+#define QIB_IB_CFG_VL_HIGH_LIMIT 19
+#define QIB_IB_CFG_PMA_TICKS 20 /* PMA sample tick resolution */
+#define QIB_IB_CFG_PORT 21 /* switch port we are connected to */
+
+/*
+ * for CFG_LSTATE: LINKCMD in upper 16 bits, LINKINITCMD in lower 16
+ * IB_LINKINITCMD_POLL and SLEEP are also used as set/get values for
+ * QIB_IB_CFG_LINKDEFAULT cmd
+ */
+#define   IB_LINKCMD_DOWN   (0 << 16)
+#define   IB_LINKCMD_ARMED  (1 << 16)
+#define   IB_LINKCMD_ACTIVE (2 << 16)
+#define   IB_LINKINITCMD_NOP     0
+#define   IB_LINKINITCMD_POLL    1
+#define   IB_LINKINITCMD_SLEEP   2
+#define   IB_LINKINITCMD_DISABLE 3
+
+/*
+ * valid states passed to qib_set_linkstate() user call
+ */
+#define QIB_IB_LINKDOWN         0
+#define QIB_IB_LINKARM          1
+#define QIB_IB_LINKACTIVE       2
+#define QIB_IB_LINKDOWN_ONLY    3
+#define QIB_IB_LINKDOWN_SLEEP   4
+#define QIB_IB_LINKDOWN_DISABLE 5
+
+/*
+ * These 7 values (SDR, DDR, and QDR may be ORed for auto-speed
+ * negotiation) are used for the 3rd argument to path_f_set_ib_cfg
+ * with cmd QIB_IB_CFG_SPD_ENB, by direct calls or via sysfs.  They
+ * are also the the possible values for qib_link_speed_enabled and active
+ * The values were chosen to match values used within the IB spec.
+ */
+#define QIB_IB_SDR 1
+#define QIB_IB_DDR 2
+#define QIB_IB_QDR 4
+
+#define QIB_DEFAULT_MTU 4096
+
+/* max number of IB ports supported per HCA */
+#define QIB_MAX_IB_PORTS 2
+
+/*
+ * Possible IB config parameters for f_get/set_ib_table()
+ */
+#define QIB_IB_TBL_VL_HIGH_ARB 1 /* Get/set VL high priority weights */
+#define QIB_IB_TBL_VL_LOW_ARB 2 /* Get/set VL low priority weights */
+
+/*
+ * Possible "operations" for f_rcvctrl(ppd, op, ctxt)
+ * these are bits so they can be combined, e.g.
+ * QIB_RCVCTRL_INTRAVAIL_ENB | QIB_RCVCTRL_CTXT_ENB
+ */
+#define QIB_RCVCTRL_TAILUPD_ENB 0x01
+#define QIB_RCVCTRL_TAILUPD_DIS 0x02
+#define QIB_RCVCTRL_CTXT_ENB 0x04
+#define QIB_RCVCTRL_CTXT_DIS 0x08
+#define QIB_RCVCTRL_INTRAVAIL_ENB 0x10
+#define QIB_RCVCTRL_INTRAVAIL_DIS 0x20
+#define QIB_RCVCTRL_PKEY_ENB 0x40  /* Note, default is enabled */
+#define QIB_RCVCTRL_PKEY_DIS 0x80
+#define QIB_RCVCTRL_BP_ENB 0x0100
+#define QIB_RCVCTRL_BP_DIS 0x0200
+#define QIB_RCVCTRL_TIDFLOW_ENB 0x0400
+#define QIB_RCVCTRL_TIDFLOW_DIS 0x0800
+
+/*
+ * Possible "operations" for f_sendctrl(ppd, op, var)
+ * these are bits so they can be combined, e.g.
+ * QIB_SENDCTRL_BUFAVAIL_ENB | QIB_SENDCTRL_ENB
+ * Some operations (e.g. DISARM, ABORT) are known to
+ * be "one-shot", so do not modify shadow.
+ */
+#define QIB_SENDCTRL_DISARM       (0x1000)
+#define QIB_SENDCTRL_DISARM_BUF(bufn) ((bufn) | QIB_SENDCTRL_DISARM)
+	/* available (0x2000) */
+#define QIB_SENDCTRL_AVAIL_DIS    (0x4000)
+#define QIB_SENDCTRL_AVAIL_ENB    (0x8000)
+#define QIB_SENDCTRL_AVAIL_BLIP  (0x10000)
+#define QIB_SENDCTRL_SEND_DIS    (0x20000)
+#define QIB_SENDCTRL_SEND_ENB    (0x40000)
+#define QIB_SENDCTRL_FLUSH       (0x80000)
+#define QIB_SENDCTRL_CLEAR      (0x100000)
+#define QIB_SENDCTRL_DISARM_ALL (0x200000)
+
+/*
+ * These are the generic indices for requesting per-port
+ * counter values via the f_portcntr function.  They
+ * are always returned as 64 bit values, although most
+ * are 32 bit counters.
+ */
+/* send-related counters */
+#define QIBPORTCNTR_PKTSEND         0U
+#define QIBPORTCNTR_WORDSEND        1U
+#define QIBPORTCNTR_PSXMITDATA      2U
+#define QIBPORTCNTR_PSXMITPKTS      3U
+#define QIBPORTCNTR_PSXMITWAIT      4U
+#define QIBPORTCNTR_SENDSTALL       5U
+/* receive-related counters */
+#define QIBPORTCNTR_PKTRCV          6U
+#define QIBPORTCNTR_PSRCVDATA       7U
+#define QIBPORTCNTR_PSRCVPKTS       8U
+#define QIBPORTCNTR_RCVEBP          9U
+#define QIBPORTCNTR_RCVOVFL         10U
+#define QIBPORTCNTR_WORDRCV         11U
+/* IB link related error counters */
+#define QIBPORTCNTR_RXLOCALPHYERR   12U
+#define QIBPORTCNTR_RXVLERR         13U
+#define QIBPORTCNTR_ERRICRC         14U
+#define QIBPORTCNTR_ERRVCRC         15U
+#define QIBPORTCNTR_ERRLPCRC        16U
+#define QIBPORTCNTR_BADFORMAT       17U
+#define QIBPORTCNTR_ERR_RLEN        18U
+#define QIBPORTCNTR_IBSYMBOLERR     19U
+#define QIBPORTCNTR_INVALIDRLEN     20U
+#define QIBPORTCNTR_UNSUPVL         21U
+#define QIBPORTCNTR_EXCESSBUFOVFL   22U
+#define QIBPORTCNTR_ERRLINK         23U
+#define QIBPORTCNTR_IBLINKDOWN      24U
+#define QIBPORTCNTR_IBLINKERRRECOV  25U
+#define QIBPORTCNTR_LLI             26U
+/* other error counters */
+#define QIBPORTCNTR_RXDROPPKT       27U
+#define QIBPORTCNTR_VL15PKTDROP     28U
+#define QIBPORTCNTR_ERRPKEY         29U
+#define QIBPORTCNTR_KHDROVFL        30U
+/* sampling counters (these are actually control registers) */
+#define QIBPORTCNTR_PSINTERVAL      31U
+#define QIBPORTCNTR_PSSTART         32U
+#define QIBPORTCNTR_PSSTAT          33U
+
+/* how often we check for packet activity for "power on hours (in seconds) */
+#define ACTIVITY_TIMER 5
+
+#define MAX_NAME_SIZE 64
+
+#ifdef CONFIG_INFINIBAND_QIB_DCA
+struct qib_irq_notify;
+#endif
+
+struct qib_msix_entry {
+	struct msix_entry msix;
+	void *arg;
+#ifdef CONFIG_INFINIBAND_QIB_DCA
+	int dca;
+	int rcv;
+	struct qib_irq_notify *notifier;
+#endif
+	char name[MAX_NAME_SIZE];
+	cpumask_var_t mask;
+};
+
+/* Below is an opaque struct. Each chip (device) can maintain
+ * private data needed for its operation, but not germane to the
+ * rest of the driver.  For convenience, we define another that
+ * is chip-specific, per-port
+ */
+struct qib_chip_specific;
+struct qib_chipport_specific;
+
+enum qib_sdma_states {
+	qib_sdma_state_s00_hw_down,
+	qib_sdma_state_s10_hw_start_up_wait,
+	qib_sdma_state_s20_idle,
+	qib_sdma_state_s30_sw_clean_up_wait,
+	qib_sdma_state_s40_hw_clean_up_wait,
+	qib_sdma_state_s50_hw_halt_wait,
+	qib_sdma_state_s99_running,
+};
+
+enum qib_sdma_events {
+	qib_sdma_event_e00_go_hw_down,
+	qib_sdma_event_e10_go_hw_start,
+	qib_sdma_event_e20_hw_started,
+	qib_sdma_event_e30_go_running,
+	qib_sdma_event_e40_sw_cleaned,
+	qib_sdma_event_e50_hw_cleaned,
+	qib_sdma_event_e60_hw_halted,
+	qib_sdma_event_e70_go_idle,
+	qib_sdma_event_e7220_err_halted,
+	qib_sdma_event_e7322_err_halted,
+	qib_sdma_event_e90_timer_tick,
+};
+
+extern char *qib_sdma_state_names[];
+extern char *qib_sdma_event_names[];
+
+struct sdma_set_state_action {
+	unsigned op_enable:1;
+	unsigned op_intenable:1;
+	unsigned op_halt:1;
+	unsigned op_drain:1;
+	unsigned go_s99_running_tofalse:1;
+	unsigned go_s99_running_totrue:1;
+};
+
+struct qib_sdma_state {
+	struct kref          kref;
+	struct completion    comp;
+	enum qib_sdma_states current_state;
+	struct sdma_set_state_action *set_state_action;
+	unsigned             current_op;
+	unsigned             go_s99_running;
+	unsigned             first_sendbuf;
+	unsigned             last_sendbuf; /* really last +1 */
+	/* debugging/devel */
+	enum qib_sdma_states previous_state;
+	unsigned             previous_op;
+	enum qib_sdma_events last_event;
+};
+
+struct xmit_wait {
+	struct timer_list timer;
+	u64 counter;
+	u8 flags;
+	struct cache {
+		u64 psxmitdata;
+		u64 psrcvdata;
+		u64 psxmitpkts;
+		u64 psrcvpkts;
+		u64 psxmitwait;
+	} counter_cache;
+};
+
+/*
+ * The structure below encapsulates data relevant to a physical IB Port.
+ * Current chips support only one such port, but the separation
+ * clarifies things a bit. Note that to conform to IB conventions,
+ * port-numbers are one-based. The first or only port is port1.
+ */
+struct qib_pportdata {
+	struct qib_ibport ibport_data;
+
+	struct qib_devdata *dd;
+	struct qib_chippport_specific *cpspec; /* chip-specific per-port */
+	struct kobject pport_kobj;
+	struct kobject pport_cc_kobj;
+	struct kobject sl2vl_kobj;
+	struct kobject diagc_kobj;
+
+	/* GUID for this interface, in network order */
+	__be64 guid;
+
+	/* QIB_POLL, etc. link-state specific flags, per port */
+	u32 lflags;
+	/* qib_lflags driver is waiting for */
+	u32 state_wanted;
+	spinlock_t lflags_lock;
+
+	/* ref count for each pkey */
+	atomic_t pkeyrefs[4];
+
+	/*
+	 * this address is mapped readonly into user processes so they can
+	 * get status cheaply, whenever they want.  One qword of status per port
+	 */
+	u64 *statusp;
+
+	/* SendDMA related entries */
+
+	/* read mostly */
+	struct qib_sdma_desc *sdma_descq;
+	struct workqueue_struct *qib_wq;
+	struct qib_sdma_state sdma_state;
+	dma_addr_t       sdma_descq_phys;
+	volatile __le64 *sdma_head_dma; /* DMA'ed by chip */
+	dma_addr_t       sdma_head_phys;
+	u16                   sdma_descq_cnt;
+
+	/* read/write using lock */
+	spinlock_t            sdma_lock ____cacheline_aligned_in_smp;
+	struct list_head      sdma_activelist;
+	struct list_head      sdma_userpending;
+	u64                   sdma_descq_added;
+	u64                   sdma_descq_removed;
+	u16                   sdma_descq_tail;
+	u16                   sdma_descq_head;
+	u8                    sdma_generation;
+	u8                    sdma_intrequest;
+
+	struct tasklet_struct sdma_sw_clean_up_task
+		____cacheline_aligned_in_smp;
+
+	wait_queue_head_t state_wait; /* for state_wanted */
+
+	/* HoL blocking for SMP replies */
+	unsigned          hol_state;
+	struct timer_list hol_timer;
+
+	/*
+	 * Shadow copies of registers; size indicates read access size.
+	 * Most of them are readonly, but some are write-only register,
+	 * where we manipulate the bits in the shadow copy, and then write
+	 * the shadow copy to qlogic_ib.
+	 *
+	 * We deliberately make most of these 32 bits, since they have
+	 * restricted range.  For any that we read, we won't to generate 32
+	 * bit accesses, since Opteron will generate 2 separate 32 bit HT
+	 * transactions for a 64 bit read, and we want to avoid unnecessary
+	 * bus transactions.
+	 */
+
+	/* This is the 64 bit group */
+	/* last ibcstatus.  opaque outside chip-specific code */
+	u64 lastibcstat;
+
+	/* these are the "32 bit" regs */
+
+	/*
+	 * the following two are 32-bit bitmasks, but {test,clear,set}_bit
+	 * all expect bit fields to be "unsigned long"
+	 */
+	unsigned long p_rcvctrl; /* shadow per-port rcvctrl */
+	unsigned long p_sendctrl; /* shadow per-port sendctrl */
+
+	u32 ibmtu; /* The MTU programmed for this unit */
+	/*
+	 * Current max size IB packet (in bytes) including IB headers, that
+	 * we can send. Changes when ibmtu changes.
+	 */
+	u32 ibmaxlen;
+	/*
+	 * ibmaxlen at init time, limited by chip and by receive buffer
+	 * size.  Not changed after init.
+	 */
+	u32 init_ibmaxlen;
+	/* LID programmed for this instance */
+	u16 lid;
+	/* list of pkeys programmed; 0 if not set */
+	u16 pkeys[4];
+	/* LID mask control */
+	u8 lmc;
+	u8 link_width_supported;
+	u8 link_speed_supported;
+	u8 link_width_enabled;
+	u8 link_speed_enabled;
+	u8 link_width_active;
+	u8 link_speed_active;
+	u8 vls_supported;
+	u8 vls_operational;
+	/* Rx Polarity inversion (compensate for ~tx on partner) */
+	u8 rx_pol_inv;
+
+	u8 hw_pidx;     /* physical port index */
+	u8 port;        /* IB port number and index into dd->pports - 1 */
+
+	u8 delay_mult;
+
+	/* used to override LED behavior */
+	u8 led_override;  /* Substituted for normal value, if non-zero */
+	u16 led_override_timeoff; /* delta to next timer event */
+	u8 led_override_vals[2]; /* Alternates per blink-frame */
+	u8 led_override_phase; /* Just counts, LSB picks from vals[] */
+	atomic_t led_override_timer_active;
+	/* Used to flash LEDs in override mode */
+	struct timer_list led_override_timer;
+	struct xmit_wait cong_stats;
+	struct timer_list symerr_clear_timer;
+
+	/* Synchronize access between driver writes and sysfs reads */
+	spinlock_t cc_shadow_lock
+		____cacheline_aligned_in_smp;
+
+	/* Shadow copy of the congestion control table */
+	struct cc_table_shadow *ccti_entries_shadow;
+
+	/* Shadow copy of the congestion control entries */
+	struct ib_cc_congestion_setting_attr_shadow *congestion_entries_shadow;
+
+	/* List of congestion control table entries */
+	struct ib_cc_table_entry_shadow *ccti_entries;
+
+	/* 16 congestion entries with each entry corresponding to a SL */
+	struct ib_cc_congestion_entry_shadow *congestion_entries;
+
+	/* Maximum number of congestion control entries that the agent expects
+	 * the manager to send.
+	 */
+	u16 cc_supported_table_entries;
+
+	/* Total number of congestion control table entries */
+	u16 total_cct_entry;
+
+	/* Bit map identifying service level */
+	u16 cc_sl_control_map;
+
+	/* maximum congestion control table index */
+	u16 ccti_limit;
+
+	/* CA's max number of 64 entry units in the congestion control table */
+	u8 cc_max_table_entries;
+};
+
+/* Observers. Not to be taken lightly, possibly not to ship. */
+/*
+ * If a diag read or write is to (bottom <= offset <= top),
+ * the "hoook" is called, allowing, e.g. shadows to be
+ * updated in sync with the driver. struct diag_observer
+ * is the "visible" part.
+ */
+struct diag_observer;
+
+typedef int (*diag_hook) (struct qib_devdata *dd,
+	const struct diag_observer *op,
+	u32 offs, u64 *data, u64 mask, int only_32);
+
+struct diag_observer {
+	diag_hook hook;
+	u32 bottom;
+	u32 top;
+};
+
+extern int qib_register_observer(struct qib_devdata *dd,
+	const struct diag_observer *op);
+
+/* Only declared here, not defined. Private to diags */
+struct diag_observer_list_elt;
+
+/* device data struct now contains only "general per-device" info.
+ * fields related to a physical IB port are in a qib_pportdata struct,
+ * described above) while fields only used by a particular chip-type are in
+ * a qib_chipdata struct, whose contents are opaque to this file.
+ */
+struct qib_devdata {
+	struct qib_ibdev verbs_dev;     /* must be first */
+	struct list_head list;
+	/* pointers to related structs for this device */
+	/* pci access data structure */
+	struct pci_dev *pcidev;
+	struct cdev *user_cdev;
+	struct cdev *diag_cdev;
+	struct device *user_device;
+	struct device *diag_device;
+
+	/* mem-mapped pointer to base of chip regs */
+	u64 __iomem *kregbase;
+	/* end of mem-mapped chip space excluding sendbuf and user regs */
+	u64 __iomem *kregend;
+	/* physical address of chip for io_remap, etc. */
+	resource_size_t physaddr;
+	/* qib_cfgctxts pointers */
+	struct qib_ctxtdata **rcd; /* Receive Context Data */
+
+	/* qib_pportdata, points to array of (physical) port-specific
+	 * data structs, indexed by pidx (0..n-1)
+	 */
+	struct qib_pportdata *pport;
+	struct qib_chip_specific *cspec; /* chip-specific */
+
+	/* kvirt address of 1st 2k pio buffer */
+	void __iomem *pio2kbase;
+	/* kvirt address of 1st 4k pio buffer */
+	void __iomem *pio4kbase;
+	/* mem-mapped pointer to base of PIO buffers (if using WC PAT) */
+	void __iomem *piobase;
+	/* mem-mapped pointer to base of user chip regs (if using WC PAT) */
+	u64 __iomem *userbase;
+	void __iomem *piovl15base; /* base of VL15 buffers, if not WC */
+	/*
+	 * points to area where PIOavail registers will be DMA'ed.
+	 * Has to be on a page of it's own, because the page will be
+	 * mapped into user program space.  This copy is *ONLY* ever
+	 * written by DMA, not by the driver!  Need a copy per device
+	 * when we get to multiple devices
+	 */
+	volatile __le64 *pioavailregs_dma; /* DMA'ed by chip */
+	/* physical address where updates occur */
+	dma_addr_t pioavailregs_phys;
+
+	/* device-specific implementations of functions needed by
+	 * common code. Contrary to previous consensus, we can't
+	 * really just point to a device-specific table, because we
+	 * may need to "bend", e.g. *_f_put_tid
+	 */
+	/* fallback to alternate interrupt type if possible */
+	int (*f_intr_fallback)(struct qib_devdata *);
+	/* hard reset chip */
+	int (*f_reset)(struct qib_devdata *);
+	void (*f_quiet_serdes)(struct qib_pportdata *);
+	int (*f_bringup_serdes)(struct qib_pportdata *);
+	int (*f_early_init)(struct qib_devdata *);
+	void (*f_clear_tids)(struct qib_devdata *, struct qib_ctxtdata *);
+	void (*f_put_tid)(struct qib_devdata *, u64 __iomem*,
+				u32, unsigned long);
+	void (*f_cleanup)(struct qib_devdata *);
+	void (*f_setextled)(struct qib_pportdata *, u32);
+	/* fill out chip-specific fields */
+	int (*f_get_base_info)(struct qib_ctxtdata *, struct qib_base_info *);
+	/* free irq */
+	void (*f_free_irq)(struct qib_devdata *);
+	struct qib_message_header *(*f_get_msgheader)
+					(struct qib_devdata *, __le32 *);
+	void (*f_config_ctxts)(struct qib_devdata *);
+	int (*f_get_ib_cfg)(struct qib_pportdata *, int);
+	int (*f_set_ib_cfg)(struct qib_pportdata *, int, u32);
+	int (*f_set_ib_loopback)(struct qib_pportdata *, const char *);
+	int (*f_get_ib_table)(struct qib_pportdata *, int, void *);
+	int (*f_set_ib_table)(struct qib_pportdata *, int, void *);
+	u32 (*f_iblink_state)(u64);
+	u8 (*f_ibphys_portstate)(u64);
+	void (*f_xgxs_reset)(struct qib_pportdata *);
+	/* per chip actions needed for IB Link up/down changes */
+	int (*f_ib_updown)(struct qib_pportdata *, int, u64);
+	u32 __iomem *(*f_getsendbuf)(struct qib_pportdata *, u64, u32 *);
+	/* Read/modify/write of GPIO pins (potentially chip-specific */
+	int (*f_gpio_mod)(struct qib_devdata *dd, u32 out, u32 dir,
+		u32 mask);
+	/* Enable writes to config EEPROM (if supported) */
+	int (*f_eeprom_wen)(struct qib_devdata *dd, int wen);
+	/*
+	 * modify rcvctrl shadow[s] and write to appropriate chip-regs.
+	 * see above QIB_RCVCTRL_xxx_ENB/DIS for operations.
+	 * (ctxt == -1) means "all contexts", only meaningful for
+	 * clearing. Could remove if chip_spec shutdown properly done.
+	 */
+	void (*f_rcvctrl)(struct qib_pportdata *, unsigned int op,
+		int ctxt);
+	/* Read/modify/write sendctrl appropriately for op and port. */
+	void (*f_sendctrl)(struct qib_pportdata *, u32 op);
+	void (*f_set_intr_state)(struct qib_devdata *, u32);
+	void (*f_set_armlaunch)(struct qib_devdata *, u32);
+	void (*f_wantpiobuf_intr)(struct qib_devdata *, u32);
+	int (*f_late_initreg)(struct qib_devdata *);
+	int (*f_init_sdma_regs)(struct qib_pportdata *);
+	u16 (*f_sdma_gethead)(struct qib_pportdata *);
+	int (*f_sdma_busy)(struct qib_pportdata *);
+	void (*f_sdma_update_tail)(struct qib_pportdata *, u16);
+	void (*f_sdma_set_desc_cnt)(struct qib_pportdata *, unsigned);
+	void (*f_sdma_sendctrl)(struct qib_pportdata *, unsigned);
+	void (*f_sdma_hw_clean_up)(struct qib_pportdata *);
+	void (*f_sdma_hw_start_up)(struct qib_pportdata *);
+	void (*f_sdma_init_early)(struct qib_pportdata *);
+	void (*f_set_cntr_sample)(struct qib_pportdata *, u32, u32);
+	void (*f_update_usrhead)(struct qib_ctxtdata *, u64, u32, u32, u32);
+	u32 (*f_hdrqempty)(struct qib_ctxtdata *);
+	u64 (*f_portcntr)(struct qib_pportdata *, u32);
+	u32 (*f_read_cntrs)(struct qib_devdata *, loff_t, char **,
+		u64 **);
+	u32 (*f_read_portcntrs)(struct qib_devdata *, loff_t, u32,
+		char **, u64 **);
+	u32 (*f_setpbc_control)(struct qib_pportdata *, u32, u8, u8);
+	void (*f_initvl15_bufs)(struct qib_devdata *);
+	void (*f_init_ctxt)(struct qib_ctxtdata *);
+	void (*f_txchk_change)(struct qib_devdata *, u32, u32, u32,
+		struct qib_ctxtdata *);
+	void (*f_writescratch)(struct qib_devdata *, u32);
+	int (*f_tempsense_rd)(struct qib_devdata *, int regnum);
+#ifdef CONFIG_INFINIBAND_QIB_DCA
+	int (*f_notify_dca)(struct qib_devdata *, unsigned long event);
+#endif
+
+	char *boardname; /* human readable board info */
+
+	/* template for writing TIDs  */
+	u64 tidtemplate;
+	/* value to write to free TIDs */
+	u64 tidinvalid;
+
+	/* number of registers used for pioavail */
+	u32 pioavregs;
+	/* device (not port) flags, basically device capabilities */
+	u32 flags;
+	/* last buffer for user use */
+	u32 lastctxt_piobuf;
+
+	/* reset value */
+	u64 z_int_counter;
+	/* percpu intcounter */
+	u64 __percpu *int_counter;
+
+	/* pio bufs allocated per ctxt */
+	u32 pbufsctxt;
+	/* if remainder on bufs/ctxt, ctxts < extrabuf get 1 extra */
+	u32 ctxts_extrabuf;
+	/*
+	 * number of ctxts configured as max; zero is set to number chip
+	 * supports, less gives more pio bufs/ctxt, etc.
+	 */
+	u32 cfgctxts;
+	/*
+	 * number of ctxts available for PSM open
+	 */
+	u32 freectxts;
+
+	/*
+	 * hint that we should update pioavailshadow before
+	 * looking for a PIO buffer
+	 */
+	u32 upd_pio_shadow;
+
+	/* internal debugging stats */
+	u32 maxpkts_call;
+	u32 avgpkts_call;
+	u64 nopiobufs;
+
+	/* PCI Vendor ID (here for NodeInfo) */
+	u16 vendorid;
+	/* PCI Device ID (here for NodeInfo) */
+	u16 deviceid;
+	/* for write combining settings */
+	unsigned long wc_cookie;
+	unsigned long wc_base;
+	unsigned long wc_len;
+
+	/* shadow copy of struct page *'s for exp tid pages */
+	struct page **pageshadow;
+	/* shadow copy of dma handles for exp tid pages */
+	dma_addr_t *physshadow;
+	u64 __iomem *egrtidbase;
+	spinlock_t sendctrl_lock; /* protect changes to sendctrl shadow */
+	/* around rcd and (user ctxts) ctxt_cnt use (intr vs free) */
+	spinlock_t uctxt_lock; /* rcd and user context changes */
+	/*
+	 * per unit status, see also portdata statusp
+	 * mapped readonly into user processes so they can get unit and
+	 * IB link status cheaply
+	 */
+	u64 *devstatusp;
+	char *freezemsg; /* freeze msg if hw error put chip in freeze */
+	u32 freezelen; /* max length of freezemsg */
+	/* timer used to prevent stats overflow, error throttling, etc. */
+	struct timer_list stats_timer;
+
+	/* timer to verify interrupts work, and fallback if possible */
+	struct timer_list intrchk_timer;
+	unsigned long ureg_align; /* user register alignment */
+
+	/*
+	 * Protects pioavailshadow, pioavailkernel, pio_need_disarm, and
+	 * pio_writing.
+	 */
+	spinlock_t pioavail_lock;
+	/*
+	 * index of last buffer to optimize search for next
+	 */
+	u32 last_pio;
+	/*
+	 * min kernel pio buffer to optimize search
+	 */
+	u32 min_kernel_pio;
+	/*
+	 * Shadow copies of registers; size indicates read access size.
+	 * Most of them are readonly, but some are write-only register,
+	 * where we manipulate the bits in the shadow copy, and then write
+	 * the shadow copy to qlogic_ib.
+	 *
+	 * We deliberately make most of these 32 bits, since they have
+	 * restricted range.  For any that we read, we won't to generate 32
+	 * bit accesses, since Opteron will generate 2 separate 32 bit HT
+	 * transactions for a 64 bit read, and we want to avoid unnecessary
+	 * bus transactions.
+	 */
+
+	/* This is the 64 bit group */
+
+	unsigned long pioavailshadow[6];
+	/* bitmap of send buffers available for the kernel to use with PIO. */
+	unsigned long pioavailkernel[6];
+	/* bitmap of send buffers which need to be disarmed. */
+	unsigned long pio_need_disarm[3];
+	/* bitmap of send buffers which are being written to. */
+	unsigned long pio_writing[3];
+	/* kr_revision shadow */
+	u64 revision;
+	/* Base GUID for device (from eeprom, network order) */
+	__be64 base_guid;
+
+	/*
+	 * kr_sendpiobufbase value (chip offset of pio buffers), and the
+	 * base of the 2KB buffer s(user processes only use 2K)
+	 */
+	u64 piobufbase;
+	u32 pio2k_bufbase;
+
+	/* these are the "32 bit" regs */
+
+	/* number of GUIDs in the flash for this interface */
+	u32 nguid;
+	/*
+	 * the following two are 32-bit bitmasks, but {test,clear,set}_bit
+	 * all expect bit fields to be "unsigned long"
+	 */
+	unsigned long rcvctrl; /* shadow per device rcvctrl */
+	unsigned long sendctrl; /* shadow per device sendctrl */
+
+	/* value we put in kr_rcvhdrcnt */
+	u32 rcvhdrcnt;
+	/* value we put in kr_rcvhdrsize */
+	u32 rcvhdrsize;
+	/* value we put in kr_rcvhdrentsize */
+	u32 rcvhdrentsize;
+	/* kr_ctxtcnt value */
+	u32 ctxtcnt;
+	/* kr_pagealign value */
+	u32 palign;
+	/* number of "2KB" PIO buffers */
+	u32 piobcnt2k;
+	/* size in bytes of "2KB" PIO buffers */
+	u32 piosize2k;
+	/* max usable size in dwords of a "2KB" PIO buffer before going "4KB" */
+	u32 piosize2kmax_dwords;
+	/* number of "4KB" PIO buffers */
+	u32 piobcnt4k;
+	/* size in bytes of "4KB" PIO buffers */
+	u32 piosize4k;
+	/* kr_rcvegrbase value */
+	u32 rcvegrbase;
+	/* kr_rcvtidbase value */
+	u32 rcvtidbase;
+	/* kr_rcvtidcnt value */
+	u32 rcvtidcnt;
+	/* kr_userregbase */
+	u32 uregbase;
+	/* shadow the control register contents */
+	u32 control;
+
+	/* chip address space used by 4k pio buffers */
+	u32 align4k;
+	/* size of each rcvegrbuffer */
+	u16 rcvegrbufsize;
+	/* log2 of above */
+	u16 rcvegrbufsize_shift;
+	/* localbus width (1, 2,4,8,16,32) from config space  */
+	u32 lbus_width;
+	/* localbus speed in MHz */
+	u32 lbus_speed;
+	int unit; /* unit # of this chip */
+
+	/* start of CHIP_SPEC move to chipspec, but need code changes */
+	/* low and high portions of MSI capability/vector */
+	u32 msi_lo;
+	/* saved after PCIe init for restore after reset */
+	u32 msi_hi;
+	/* MSI data (vector) saved for restore */
+	u16 msi_data;
+	/* so we can rewrite it after a chip reset */
+	u32 pcibar0;
+	/* so we can rewrite it after a chip reset */
+	u32 pcibar1;
+	u64 rhdrhead_intr_off;
+
+	/*
+	 * ASCII serial number, from flash, large enough for original
+	 * all digit strings, and longer QLogic serial number format
+	 */
+	u8 serial[16];
+	/* human readable board version */
+	u8 boardversion[96];
+	u8 lbus_info[32]; /* human readable localbus info */
+	/* chip major rev, from qib_revision */
+	u8 majrev;
+	/* chip minor rev, from qib_revision */
+	u8 minrev;
+
+	/* Misc small ints */
+	/* Number of physical ports available */
+	u8 num_pports;
+	/* Lowest context number which can be used by user processes */
+	u8 first_user_ctxt;
+	u8 n_krcv_queues;
+	u8 qpn_mask;
+	u8 skip_kctxt_mask;
+
+	u16 rhf_offset; /* offset of RHF within receive header entry */
+
+	/*
+	 * GPIO pins for twsi-connected devices, and device code for eeprom
+	 */
+	u8 gpio_sda_num;
+	u8 gpio_scl_num;
+	u8 twsi_eeprom_dev;
+	u8 board_atten;
+
+	/* Support (including locks) for EEPROM logging of errors and time */
+	/* control access to actual counters, timer */
+	spinlock_t eep_st_lock;
+	/* control high-level access to EEPROM */
+	struct mutex eep_lock;
+	uint64_t traffic_wds;
+	/* active time is kept in seconds, but logged in hours */
+	atomic_t active_time;
+	/* Below are nominal shadow of EEPROM, new since last EEPROM update */
+	uint8_t eep_st_errs[QIB_EEP_LOG_CNT];
+	uint8_t eep_st_new_errs[QIB_EEP_LOG_CNT];
+	uint16_t eep_hrs;
+	/*
+	 * masks for which bits of errs, hwerrs that cause
+	 * each of the counters to increment.
+	 */
+	struct qib_eep_log_mask eep_st_masks[QIB_EEP_LOG_CNT];
+	struct qib_diag_client *diag_client;
+	spinlock_t qib_diag_trans_lock; /* protect diag observer ops */
+	struct diag_observer_list_elt *diag_observer_list;
+
+	u8 psxmitwait_supported;
+	/* cycle length of PS* counters in HW (in picoseconds) */
+	u16 psxmitwait_check_rate;
+	/* high volume overflow errors defered to tasklet */
+	struct tasklet_struct error_tasklet;
+	/* per device cq worker */
+	struct kthread_worker *worker;
+
+	int assigned_node_id; /* NUMA node closest to HCA */
+};
+
+/* hol_state values */
+#define QIB_HOL_UP       0
+#define QIB_HOL_INIT     1
+
+#define QIB_SDMA_SENDCTRL_OP_ENABLE    (1U << 0)
+#define QIB_SDMA_SENDCTRL_OP_INTENABLE (1U << 1)
+#define QIB_SDMA_SENDCTRL_OP_HALT      (1U << 2)
+#define QIB_SDMA_SENDCTRL_OP_CLEANUP   (1U << 3)
+#define QIB_SDMA_SENDCTRL_OP_DRAIN     (1U << 4)
+
+/* operation types for f_txchk_change() */
+#define TXCHK_CHG_TYPE_DIS1  3
+#define TXCHK_CHG_TYPE_ENAB1 2
+#define TXCHK_CHG_TYPE_KERN  1
+#define TXCHK_CHG_TYPE_USER  0
+
+#define QIB_CHASE_TIME msecs_to_jiffies(145)
+#define QIB_CHASE_DIS_TIME msecs_to_jiffies(160)
+
+/* Private data for file operations */
+struct qib_filedata {
+	struct qib_ctxtdata *rcd;
+	unsigned subctxt;
+	unsigned tidcursor;
+	struct qib_user_sdma_queue *pq;
+	int rec_cpu_num; /* for cpu affinity; -1 if none */
+};
+
+extern struct list_head qib_dev_list;
+extern spinlock_t qib_devs_lock;
+extern struct qib_devdata *qib_lookup(int unit);
+extern u32 qib_cpulist_count;
+extern unsigned long *qib_cpulist;
+
+extern unsigned qib_wc_pat;
+extern unsigned qib_cc_table_size;
+int qib_init(struct qib_devdata *, int);
+int init_chip_wc_pat(struct qib_devdata *dd, u32);
+int qib_enable_wc(struct qib_devdata *dd);
+void qib_disable_wc(struct qib_devdata *dd);
+int qib_count_units(int *npresentp, int *nupp);
+int qib_count_active_units(void);
+
+int qib_cdev_init(int minor, const char *name,
+		  const struct file_operations *fops,
+		  struct cdev **cdevp, struct device **devp);
+void qib_cdev_cleanup(struct cdev **cdevp, struct device **devp);
+int qib_dev_init(void);
+void qib_dev_cleanup(void);
+
+int qib_diag_add(struct qib_devdata *);
+void qib_diag_remove(struct qib_devdata *);
+void qib_handle_e_ibstatuschanged(struct qib_pportdata *, u64);
+void qib_sdma_update_tail(struct qib_pportdata *, u16); /* hold sdma_lock */
+
+int qib_decode_err(struct qib_devdata *dd, char *buf, size_t blen, u64 err);
+void qib_bad_intrstatus(struct qib_devdata *);
+void qib_handle_urcv(struct qib_devdata *, u64);
+
+/* clean up any per-chip chip-specific stuff */
+void qib_chip_cleanup(struct qib_devdata *);
+/* clean up any chip type-specific stuff */
+void qib_chip_done(void);
+
+/* check to see if we have to force ordering for write combining */
+int qib_unordered_wc(void);
+void qib_pio_copy(void __iomem *to, const void *from, size_t count);
+
+void qib_disarm_piobufs(struct qib_devdata *, unsigned, unsigned);
+int qib_disarm_piobufs_ifneeded(struct qib_ctxtdata *);
+void qib_disarm_piobufs_set(struct qib_devdata *, unsigned long *, unsigned);
+void qib_cancel_sends(struct qib_pportdata *);
+
+int qib_create_rcvhdrq(struct qib_devdata *, struct qib_ctxtdata *);
+int qib_setup_eagerbufs(struct qib_ctxtdata *);
+void qib_set_ctxtcnt(struct qib_devdata *);
+int qib_create_ctxts(struct qib_devdata *dd);
+struct qib_ctxtdata *qib_create_ctxtdata(struct qib_pportdata *, u32, int);
+int qib_init_pportdata(struct qib_pportdata *, struct qib_devdata *, u8, u8);
+void qib_free_ctxtdata(struct qib_devdata *, struct qib_ctxtdata *);
+
+u32 qib_kreceive(struct qib_ctxtdata *, u32 *, u32 *);
+int qib_reset_device(int);
+int qib_wait_linkstate(struct qib_pportdata *, u32, int);
+int qib_set_linkstate(struct qib_pportdata *, u8);
+int qib_set_mtu(struct qib_pportdata *, u16);
+int qib_set_lid(struct qib_pportdata *, u32, u8);
+void qib_hol_down(struct qib_pportdata *);
+void qib_hol_init(struct qib_pportdata *);
+void qib_hol_up(struct qib_pportdata *);
+void qib_hol_event(unsigned long);
+void qib_disable_after_error(struct qib_devdata *);
+int qib_set_uevent_bits(struct qib_pportdata *, const int);
+
+/* for use in system calls, where we want to know device type, etc. */
+#define ctxt_fp(fp) \
+	(((struct qib_filedata *)(fp)->private_data)->rcd)
+#define subctxt_fp(fp) \
+	(((struct qib_filedata *)(fp)->private_data)->subctxt)
+#define tidcursor_fp(fp) \
+	(((struct qib_filedata *)(fp)->private_data)->tidcursor)
+#define user_sdma_queue_fp(fp) \
+	(((struct qib_filedata *)(fp)->private_data)->pq)
+
+static inline struct qib_devdata *dd_from_ppd(struct qib_pportdata *ppd)
+{
+	return ppd->dd;
+}
+
+static inline struct qib_devdata *dd_from_dev(struct qib_ibdev *dev)
+{
+	return container_of(dev, struct qib_devdata, verbs_dev);
+}
+
+static inline struct qib_devdata *dd_from_ibdev(struct ib_device *ibdev)
+{
+	return dd_from_dev(to_idev(ibdev));
+}
+
+static inline struct qib_pportdata *ppd_from_ibp(struct qib_ibport *ibp)
+{
+	return container_of(ibp, struct qib_pportdata, ibport_data);
+}
+
+static inline struct qib_ibport *to_iport(struct ib_device *ibdev, u8 port)
+{
+	struct qib_devdata *dd = dd_from_ibdev(ibdev);
+	unsigned pidx = port - 1; /* IB number port from 1, hdw from 0 */
+
+	WARN_ON(pidx >= dd->num_pports);
+	return &dd->pport[pidx].ibport_data;
+}
+
+/*
+ * values for dd->flags (_device_ related flags) and
+ */
+#define QIB_HAS_LINK_LATENCY  0x1 /* supports link latency (IB 1.2) */
+#define QIB_INITTED           0x2 /* chip and driver up and initted */
+#define QIB_DOING_RESET       0x4  /* in the middle of doing chip reset */
+#define QIB_PRESENT           0x8  /* chip accesses can be done */
+#define QIB_PIO_FLUSH_WC      0x10 /* Needs Write combining flush for PIO */
+#define QIB_HAS_THRESH_UPDATE 0x40
+#define QIB_HAS_SDMA_TIMEOUT  0x80
+#define QIB_USE_SPCL_TRIG     0x100 /* SpecialTrigger launch enabled */
+#define QIB_NODMA_RTAIL       0x200 /* rcvhdrtail register DMA enabled */
+#define QIB_HAS_INTX          0x800 /* Supports INTx interrupts */
+#define QIB_HAS_SEND_DMA      0x1000 /* Supports Send DMA */
+#define QIB_HAS_VLSUPP        0x2000 /* Supports multiple VLs; PBC different */
+#define QIB_HAS_HDRSUPP       0x4000 /* Supports header suppression */
+#define QIB_BADINTR           0x8000 /* severe interrupt problems */
+#define QIB_DCA_ENABLED       0x10000 /* Direct Cache Access enabled */
+#define QIB_HAS_QSFP          0x20000 /* device (card instance) has QSFP */
+
+/*
+ * values for ppd->lflags (_ib_port_ related flags)
+ */
+#define QIBL_LINKV             0x1 /* IB link state valid */
+#define QIBL_LINKDOWN          0x8 /* IB link is down */
+#define QIBL_LINKINIT          0x10 /* IB link level is up */
+#define QIBL_LINKARMED         0x20 /* IB link is ARMED */
+#define QIBL_LINKACTIVE        0x40 /* IB link is ACTIVE */
+/* leave a gap for more IB-link state */
+#define QIBL_IB_AUTONEG_INPROG 0x1000 /* non-IBTA DDR/QDR neg active */
+#define QIBL_IB_AUTONEG_FAILED 0x2000 /* non-IBTA DDR/QDR neg failed */
+#define QIBL_IB_LINK_DISABLED  0x4000 /* Linkdown-disable forced,
+				       * Do not try to bring up */
+#define QIBL_IB_FORCE_NOTIFY   0x8000 /* force notify on next ib change */
+
+/* IB dword length mask in PBC (lower 11 bits); same for all chips */
+#define QIB_PBC_LENGTH_MASK                     ((1 << 11) - 1)
+
+
+/* ctxt_flag bit offsets */
+		/* waiting for a packet to arrive */
+#define QIB_CTXT_WAITING_RCV   2
+		/* master has not finished initializing */
+#define QIB_CTXT_MASTER_UNINIT 4
+		/* waiting for an urgent packet to arrive */
+#define QIB_CTXT_WAITING_URG 5
+
+/* free up any allocated data at closes */
+void qib_free_data(struct qib_ctxtdata *dd);
+void qib_chg_pioavailkernel(struct qib_devdata *, unsigned, unsigned,
+			    u32, struct qib_ctxtdata *);
+struct qib_devdata *qib_init_iba7322_funcs(struct pci_dev *,
+					   const struct pci_device_id *);
+struct qib_devdata *qib_init_iba7220_funcs(struct pci_dev *,
+					   const struct pci_device_id *);
+struct qib_devdata *qib_init_iba6120_funcs(struct pci_dev *,
+					   const struct pci_device_id *);
+void qib_free_devdata(struct qib_devdata *);
+struct qib_devdata *qib_alloc_devdata(struct pci_dev *pdev, size_t extra);
+
+#define QIB_TWSI_NO_DEV 0xFF
+/* Below qib_twsi_ functions must be called with eep_lock held */
+int qib_twsi_reset(struct qib_devdata *dd);
+int qib_twsi_blk_rd(struct qib_devdata *dd, int dev, int addr, void *buffer,
+		    int len);
+int qib_twsi_blk_wr(struct qib_devdata *dd, int dev, int addr,
+		    const void *buffer, int len);
+void qib_get_eeprom_info(struct qib_devdata *);
+int qib_update_eeprom_log(struct qib_devdata *dd);
+void qib_inc_eeprom_err(struct qib_devdata *dd, u32 eidx, u32 incr);
+void qib_dump_lookup_output_queue(struct qib_devdata *);
+void qib_force_pio_avail_update(struct qib_devdata *);
+void qib_clear_symerror_on_linkup(unsigned long opaque);
+
+/*
+ * Set LED override, only the two LSBs have "public" meaning, but
+ * any non-zero value substitutes them for the Link and LinkTrain
+ * LED states.
+ */
+#define QIB_LED_PHYS 1 /* Physical (linktraining) GREEN LED */
+#define QIB_LED_LOG 2  /* Logical (link) YELLOW LED */
+void qib_set_led_override(struct qib_pportdata *ppd, unsigned int val);
+
+/* send dma routines */
+int qib_setup_sdma(struct qib_pportdata *);
+void qib_teardown_sdma(struct qib_pportdata *);
+void __qib_sdma_intr(struct qib_pportdata *);
+void qib_sdma_intr(struct qib_pportdata *);
+void qib_user_sdma_send_desc(struct qib_pportdata *dd,
+			struct list_head *pktlist);
+int qib_sdma_verbs_send(struct qib_pportdata *, struct qib_sge_state *,
+			u32, struct qib_verbs_txreq *);
+/* ppd->sdma_lock should be locked before calling this. */
+int qib_sdma_make_progress(struct qib_pportdata *dd);
+
+static inline int qib_sdma_empty(const struct qib_pportdata *ppd)
+{
+	return ppd->sdma_descq_added == ppd->sdma_descq_removed;
+}
+
+/* must be called under qib_sdma_lock */
+static inline u16 qib_sdma_descq_freecnt(const struct qib_pportdata *ppd)
+{
+	return ppd->sdma_descq_cnt -
+		(ppd->sdma_descq_added - ppd->sdma_descq_removed) - 1;
+}
+
+static inline int __qib_sdma_running(struct qib_pportdata *ppd)
+{
+	return ppd->sdma_state.current_state == qib_sdma_state_s99_running;
+}
+int qib_sdma_running(struct qib_pportdata *);
+void dump_sdma_state(struct qib_pportdata *ppd);
+void __qib_sdma_process_event(struct qib_pportdata *, enum qib_sdma_events);
+void qib_sdma_process_event(struct qib_pportdata *, enum qib_sdma_events);
+
+/*
+ * number of words used for protocol header if not set by qib_userinit();
+ */
+#define QIB_DFLT_RCVHDRSIZE 9
+
+/*
+ * We need to be able to handle an IB header of at least 24 dwords.
+ * We need the rcvhdrq large enough to handle largest IB header, but
+ * still have room for a 2KB MTU standard IB packet.
+ * Additionally, some processor/memory controller combinations
+ * benefit quite strongly from having the DMA'ed data be cacheline
+ * aligned and a cacheline multiple, so we set the size to 32 dwords
+ * (2 64-byte primary cachelines for pretty much all processors of
+ * interest).  The alignment hurts nothing, other than using somewhat
+ * more memory.
+ */
+#define QIB_RCVHDR_ENTSIZE 32
+
+int qib_get_user_pages(unsigned long, size_t, struct page **);
+void qib_release_user_pages(struct page **, size_t);
+int qib_eeprom_read(struct qib_devdata *, u8, void *, int);
+int qib_eeprom_write(struct qib_devdata *, u8, const void *, int);
+u32 __iomem *qib_getsendbuf_range(struct qib_devdata *, u32 *, u32, u32);
+void qib_sendbuf_done(struct qib_devdata *, unsigned);
+
+static inline void qib_clear_rcvhdrtail(const struct qib_ctxtdata *rcd)
+{
+	*((u64 *) rcd->rcvhdrtail_kvaddr) = 0ULL;
+}
+
+static inline u32 qib_get_rcvhdrtail(const struct qib_ctxtdata *rcd)
+{
+	/*
+	 * volatile because it's a DMA target from the chip, routine is
+	 * inlined, and don't want register caching or reordering.
+	 */
+	return (u32) le64_to_cpu(
+		*((volatile __le64 *)rcd->rcvhdrtail_kvaddr)); /* DMA'ed */
+}
+
+static inline u32 qib_get_hdrqtail(const struct qib_ctxtdata *rcd)
+{
+	const struct qib_devdata *dd = rcd->dd;
+	u32 hdrqtail;
+
+	if (dd->flags & QIB_NODMA_RTAIL) {
+		__le32 *rhf_addr;
+		u32 seq;
+
+		rhf_addr = (__le32 *) rcd->rcvhdrq +
+			rcd->head + dd->rhf_offset;
+		seq = qib_hdrget_seq(rhf_addr);
+		hdrqtail = rcd->head;
+		if (seq == rcd->seq_cnt)
+			hdrqtail++;
+	} else
+		hdrqtail = qib_get_rcvhdrtail(rcd);
+
+	return hdrqtail;
+}
+
+/*
+ * sysfs interface.
+ */
+
+extern const char ib_qib_version[];
+
+int qib_device_create(struct qib_devdata *);
+void qib_device_remove(struct qib_devdata *);
+
+int qib_create_port_files(struct ib_device *ibdev, u8 port_num,
+			  struct kobject *kobj);
+int qib_verbs_register_sysfs(struct qib_devdata *);
+void qib_verbs_unregister_sysfs(struct qib_devdata *);
+/* Hook for sysfs read of QSFP */
+extern int qib_qsfp_dump(struct qib_pportdata *ppd, char *buf, int len);
+
+int __init qib_init_qibfs(void);
+int __exit qib_exit_qibfs(void);
+
+int qibfs_add(struct qib_devdata *);
+int qibfs_remove(struct qib_devdata *);
+
+int qib_pcie_init(struct pci_dev *, const struct pci_device_id *);
+int qib_pcie_ddinit(struct qib_devdata *, struct pci_dev *,
+		    const struct pci_device_id *);
+void qib_pcie_ddcleanup(struct qib_devdata *);
+int qib_pcie_params(struct qib_devdata *, u32, u32 *, struct qib_msix_entry *);
+int qib_reinit_intr(struct qib_devdata *);
+void qib_enable_intx(struct pci_dev *);
+void qib_nomsi(struct qib_devdata *);
+void qib_nomsix(struct qib_devdata *);
+void qib_pcie_getcmd(struct qib_devdata *, u16 *, u8 *, u8 *);
+void qib_pcie_reenable(struct qib_devdata *, u16, u8, u8);
+/* interrupts for device */
+u64 qib_int_counter(struct qib_devdata *);
+/* interrupt for all devices */
+u64 qib_sps_ints(void);
+
+/*
+ * dma_addr wrappers - all 0's invalid for hw
+ */
+dma_addr_t qib_map_page(struct pci_dev *, struct page *, unsigned long,
+			  size_t, int);
+const char *qib_get_unit_name(int unit);
+
+/*
+ * Flush write combining store buffers (if present) and perform a write
+ * barrier.
+ */
+#if defined(CONFIG_X86_64)
+#define qib_flush_wc() asm volatile("sfence" : : : "memory")
+#else
+#define qib_flush_wc() wmb() /* no reorder around wc flush */
+#endif
+
+/* global module parameter variables */
+extern unsigned qib_ibmtu;
+extern ushort qib_cfgctxts;
+extern ushort qib_num_cfg_vls;
+extern ushort qib_mini_init; /* If set, do few (ideally 0) writes to chip */
+extern unsigned qib_n_krcv_queues;
+extern unsigned qib_sdma_fetch_arb;
+extern unsigned qib_compat_ddr_negotiate;
+extern int qib_special_trigger;
+extern unsigned qib_numa_aware;
+
+extern struct mutex qib_mutex;
+
+/* Number of seconds before our card status check...  */
+#define STATUS_TIMEOUT 60
+
+#define QIB_DRV_NAME            "ib_qib"
+#define QIB_USER_MINOR_BASE     0
+#define QIB_TRACE_MINOR         127
+#define QIB_DIAGPKT_MINOR       128
+#define QIB_DIAG_MINOR_BASE     129
+#define QIB_NMINORS             255
+
+#define PCI_VENDOR_ID_PATHSCALE 0x1fc1
+#define PCI_VENDOR_ID_QLOGIC 0x1077
+#define PCI_DEVICE_ID_QLOGIC_IB_6120 0x10
+#define PCI_DEVICE_ID_QLOGIC_IB_7220 0x7220
+#define PCI_DEVICE_ID_QLOGIC_IB_7322 0x7322
+
+/*
+ * qib_early_err is used (only!) to print early errors before devdata is
+ * allocated, or when dd->pcidev may not be valid, and at the tail end of
+ * cleanup when devdata may have been freed, etc.  qib_dev_porterr is
+ * the same as qib_dev_err, but is used when the message really needs
+ * the IB port# to be definitive as to what's happening..
+ * All of these go to the trace log, and the trace log entry is done
+ * first to avoid possible serial port delays from printk.
+ */
+#define qib_early_err(dev, fmt, ...) \
+	dev_err(dev, fmt, ##__VA_ARGS__)
+
+#define qib_dev_err(dd, fmt, ...) \
+	dev_err(&(dd)->pcidev->dev, "%s: " fmt, \
+		qib_get_unit_name((dd)->unit), ##__VA_ARGS__)
+
+#define qib_dev_warn(dd, fmt, ...) \
+	dev_warn(&(dd)->pcidev->dev, "%s: " fmt, \
+		qib_get_unit_name((dd)->unit), ##__VA_ARGS__)
+
+#define qib_dev_porterr(dd, port, fmt, ...) \
+	dev_err(&(dd)->pcidev->dev, "%s: IB%u:%u " fmt, \
+		qib_get_unit_name((dd)->unit), (dd)->unit, (port), \
+		##__VA_ARGS__)
+
+#define qib_devinfo(pcidev, fmt, ...) \
+	dev_info(&(pcidev)->dev, fmt, ##__VA_ARGS__)
+
+/*
+ * this is used for formatting hw error messages...
+ */
+struct qib_hwerror_msgs {
+	u64 mask;
+	const char *msg;
+	size_t sz;
+};
+
+#define QLOGIC_IB_HWE_MSG(a, b) { .mask = a, .msg = b }
+
+/* in qib_intr.c... */
+void qib_format_hwerrors(u64 hwerrs,
+			 const struct qib_hwerror_msgs *hwerrmsgs,
+			 size_t nhwerrmsgs, char *msg, size_t lmsg);
+#endif                          /* _QIB_KERNEL_H */
diff --git a/drivers/infiniband/hw/qib/qib_6120_regs.h b/drivers/infiniband/hw/qib/qib_6120_regs.h
new file mode 100644
index 0000000..e16cb6f
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_6120_regs.h
@@ -0,0 +1,977 @@
+/*
+ * Copyright (c) 2008, 2009, 2010 QLogic Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/* This file is mechanically generated from RTL. Any hand-edits will be lost! */
+
+#define QIB_6120_Revision_OFFS 0x0
+#define QIB_6120_Revision_R_Simulator_LSB 0x3F
+#define QIB_6120_Revision_R_Simulator_RMASK 0x1
+#define QIB_6120_Revision_Reserved_LSB 0x28
+#define QIB_6120_Revision_Reserved_RMASK 0x7FFFFF
+#define QIB_6120_Revision_BoardID_LSB 0x20
+#define QIB_6120_Revision_BoardID_RMASK 0xFF
+#define QIB_6120_Revision_R_SW_LSB 0x18
+#define QIB_6120_Revision_R_SW_RMASK 0xFF
+#define QIB_6120_Revision_R_Arch_LSB 0x10
+#define QIB_6120_Revision_R_Arch_RMASK 0xFF
+#define QIB_6120_Revision_R_ChipRevMajor_LSB 0x8
+#define QIB_6120_Revision_R_ChipRevMajor_RMASK 0xFF
+#define QIB_6120_Revision_R_ChipRevMinor_LSB 0x0
+#define QIB_6120_Revision_R_ChipRevMinor_RMASK 0xFF
+
+#define QIB_6120_Control_OFFS 0x8
+#define QIB_6120_Control_TxLatency_LSB 0x4
+#define QIB_6120_Control_TxLatency_RMASK 0x1
+#define QIB_6120_Control_PCIERetryBufDiagEn_LSB 0x3
+#define QIB_6120_Control_PCIERetryBufDiagEn_RMASK 0x1
+#define QIB_6120_Control_LinkEn_LSB 0x2
+#define QIB_6120_Control_LinkEn_RMASK 0x1
+#define QIB_6120_Control_FreezeMode_LSB 0x1
+#define QIB_6120_Control_FreezeMode_RMASK 0x1
+#define QIB_6120_Control_SyncReset_LSB 0x0
+#define QIB_6120_Control_SyncReset_RMASK 0x1
+
+#define QIB_6120_PageAlign_OFFS 0x10
+
+#define QIB_6120_PortCnt_OFFS 0x18
+
+#define QIB_6120_SendRegBase_OFFS 0x30
+
+#define QIB_6120_UserRegBase_OFFS 0x38
+
+#define QIB_6120_CntrRegBase_OFFS 0x40
+
+#define QIB_6120_Scratch_OFFS 0x48
+#define QIB_6120_Scratch_TopHalf_LSB 0x20
+#define QIB_6120_Scratch_TopHalf_RMASK 0xFFFFFFFF
+#define QIB_6120_Scratch_BottomHalf_LSB 0x0
+#define QIB_6120_Scratch_BottomHalf_RMASK 0xFFFFFFFF
+
+#define QIB_6120_IntBlocked_OFFS 0x60
+#define QIB_6120_IntBlocked_ErrorIntBlocked_LSB 0x1F
+#define QIB_6120_IntBlocked_ErrorIntBlocked_RMASK 0x1
+#define QIB_6120_IntBlocked_PioSetIntBlocked_LSB 0x1E
+#define QIB_6120_IntBlocked_PioSetIntBlocked_RMASK 0x1
+#define QIB_6120_IntBlocked_PioBufAvailIntBlocked_LSB 0x1D
+#define QIB_6120_IntBlocked_PioBufAvailIntBlocked_RMASK 0x1
+#define QIB_6120_IntBlocked_assertGPIOIntBlocked_LSB 0x1C
+#define QIB_6120_IntBlocked_assertGPIOIntBlocked_RMASK 0x1
+#define QIB_6120_IntBlocked_Reserved_LSB 0xF
+#define QIB_6120_IntBlocked_Reserved_RMASK 0x1FFF
+#define QIB_6120_IntBlocked_RcvAvail4IntBlocked_LSB 0x10
+#define QIB_6120_IntBlocked_RcvAvail4IntBlocked_RMASK 0x1
+#define QIB_6120_IntBlocked_RcvAvail3IntBlocked_LSB 0xF
+#define QIB_6120_IntBlocked_RcvAvail3IntBlocked_RMASK 0x1
+#define QIB_6120_IntBlocked_RcvAvail2IntBlocked_LSB 0xE
+#define QIB_6120_IntBlocked_RcvAvail2IntBlocked_RMASK 0x1
+#define QIB_6120_IntBlocked_RcvAvail1IntBlocked_LSB 0xD
+#define QIB_6120_IntBlocked_RcvAvail1IntBlocked_RMASK 0x1
+#define QIB_6120_IntBlocked_RcvAvail0IntBlocked_LSB 0xC
+#define QIB_6120_IntBlocked_RcvAvail0IntBlocked_RMASK 0x1
+#define QIB_6120_IntBlocked_Reserved1_LSB 0x5
+#define QIB_6120_IntBlocked_Reserved1_RMASK 0x7F
+#define QIB_6120_IntBlocked_RcvUrg4IntBlocked_LSB 0x4
+#define QIB_6120_IntBlocked_RcvUrg4IntBlocked_RMASK 0x1
+#define QIB_6120_IntBlocked_RcvUrg3IntBlocked_LSB 0x3
+#define QIB_6120_IntBlocked_RcvUrg3IntBlocked_RMASK 0x1
+#define QIB_6120_IntBlocked_RcvUrg2IntBlocked_LSB 0x2
+#define QIB_6120_IntBlocked_RcvUrg2IntBlocked_RMASK 0x1
+#define QIB_6120_IntBlocked_RcvUrg1IntBlocked_LSB 0x1
+#define QIB_6120_IntBlocked_RcvUrg1IntBlocked_RMASK 0x1
+#define QIB_6120_IntBlocked_RcvUrg0IntBlocked_LSB 0x0
+#define QIB_6120_IntBlocked_RcvUrg0IntBlocked_RMASK 0x1
+
+#define QIB_6120_IntMask_OFFS 0x68
+#define QIB_6120_IntMask_ErrorIntMask_LSB 0x1F
+#define QIB_6120_IntMask_ErrorIntMask_RMASK 0x1
+#define QIB_6120_IntMask_PioSetIntMask_LSB 0x1E
+#define QIB_6120_IntMask_PioSetIntMask_RMASK 0x1
+#define QIB_6120_IntMask_PioBufAvailIntMask_LSB 0x1D
+#define QIB_6120_IntMask_PioBufAvailIntMask_RMASK 0x1
+#define QIB_6120_IntMask_assertGPIOIntMask_LSB 0x1C
+#define QIB_6120_IntMask_assertGPIOIntMask_RMASK 0x1
+#define QIB_6120_IntMask_Reserved_LSB 0x11
+#define QIB_6120_IntMask_Reserved_RMASK 0x7FF
+#define QIB_6120_IntMask_RcvAvail4IntMask_LSB 0x10
+#define QIB_6120_IntMask_RcvAvail4IntMask_RMASK 0x1
+#define QIB_6120_IntMask_RcvAvail3IntMask_LSB 0xF
+#define QIB_6120_IntMask_RcvAvail3IntMask_RMASK 0x1
+#define QIB_6120_IntMask_RcvAvail2IntMask_LSB 0xE
+#define QIB_6120_IntMask_RcvAvail2IntMask_RMASK 0x1
+#define QIB_6120_IntMask_RcvAvail1IntMask_LSB 0xD
+#define QIB_6120_IntMask_RcvAvail1IntMask_RMASK 0x1
+#define QIB_6120_IntMask_RcvAvail0IntMask_LSB 0xC
+#define QIB_6120_IntMask_RcvAvail0IntMask_RMASK 0x1
+#define QIB_6120_IntMask_Reserved1_LSB 0x5
+#define QIB_6120_IntMask_Reserved1_RMASK 0x7F
+#define QIB_6120_IntMask_RcvUrg4IntMask_LSB 0x4
+#define QIB_6120_IntMask_RcvUrg4IntMask_RMASK 0x1
+#define QIB_6120_IntMask_RcvUrg3IntMask_LSB 0x3
+#define QIB_6120_IntMask_RcvUrg3IntMask_RMASK 0x1
+#define QIB_6120_IntMask_RcvUrg2IntMask_LSB 0x2
+#define QIB_6120_IntMask_RcvUrg2IntMask_RMASK 0x1
+#define QIB_6120_IntMask_RcvUrg1IntMask_LSB 0x1
+#define QIB_6120_IntMask_RcvUrg1IntMask_RMASK 0x1
+#define QIB_6120_IntMask_RcvUrg0IntMask_LSB 0x0
+#define QIB_6120_IntMask_RcvUrg0IntMask_RMASK 0x1
+
+#define QIB_6120_IntStatus_OFFS 0x70
+#define QIB_6120_IntStatus_Error_LSB 0x1F
+#define QIB_6120_IntStatus_Error_RMASK 0x1
+#define QIB_6120_IntStatus_PioSent_LSB 0x1E
+#define QIB_6120_IntStatus_PioSent_RMASK 0x1
+#define QIB_6120_IntStatus_PioBufAvail_LSB 0x1D
+#define QIB_6120_IntStatus_PioBufAvail_RMASK 0x1
+#define QIB_6120_IntStatus_assertGPIO_LSB 0x1C
+#define QIB_6120_IntStatus_assertGPIO_RMASK 0x1
+#define QIB_6120_IntStatus_Reserved_LSB 0xF
+#define QIB_6120_IntStatus_Reserved_RMASK 0x1FFF
+#define QIB_6120_IntStatus_RcvAvail4_LSB 0x10
+#define QIB_6120_IntStatus_RcvAvail4_RMASK 0x1
+#define QIB_6120_IntStatus_RcvAvail3_LSB 0xF
+#define QIB_6120_IntStatus_RcvAvail3_RMASK 0x1
+#define QIB_6120_IntStatus_RcvAvail2_LSB 0xE
+#define QIB_6120_IntStatus_RcvAvail2_RMASK 0x1
+#define QIB_6120_IntStatus_RcvAvail1_LSB 0xD
+#define QIB_6120_IntStatus_RcvAvail1_RMASK 0x1
+#define QIB_6120_IntStatus_RcvAvail0_LSB 0xC
+#define QIB_6120_IntStatus_RcvAvail0_RMASK 0x1
+#define QIB_6120_IntStatus_Reserved1_LSB 0x5
+#define QIB_6120_IntStatus_Reserved1_RMASK 0x7F
+#define QIB_6120_IntStatus_RcvUrg4_LSB 0x4
+#define QIB_6120_IntStatus_RcvUrg4_RMASK 0x1
+#define QIB_6120_IntStatus_RcvUrg3_LSB 0x3
+#define QIB_6120_IntStatus_RcvUrg3_RMASK 0x1
+#define QIB_6120_IntStatus_RcvUrg2_LSB 0x2
+#define QIB_6120_IntStatus_RcvUrg2_RMASK 0x1
+#define QIB_6120_IntStatus_RcvUrg1_LSB 0x1
+#define QIB_6120_IntStatus_RcvUrg1_RMASK 0x1
+#define QIB_6120_IntStatus_RcvUrg0_LSB 0x0
+#define QIB_6120_IntStatus_RcvUrg0_RMASK 0x1
+
+#define QIB_6120_IntClear_OFFS 0x78
+#define QIB_6120_IntClear_ErrorIntClear_LSB 0x1F
+#define QIB_6120_IntClear_ErrorIntClear_RMASK 0x1
+#define QIB_6120_IntClear_PioSetIntClear_LSB 0x1E
+#define QIB_6120_IntClear_PioSetIntClear_RMASK 0x1
+#define QIB_6120_IntClear_PioBufAvailIntClear_LSB 0x1D
+#define QIB_6120_IntClear_PioBufAvailIntClear_RMASK 0x1
+#define QIB_6120_IntClear_assertGPIOIntClear_LSB 0x1C
+#define QIB_6120_IntClear_assertGPIOIntClear_RMASK 0x1
+#define QIB_6120_IntClear_Reserved_LSB 0xF
+#define QIB_6120_IntClear_Reserved_RMASK 0x1FFF
+#define QIB_6120_IntClear_RcvAvail4IntClear_LSB 0x10
+#define QIB_6120_IntClear_RcvAvail4IntClear_RMASK 0x1
+#define QIB_6120_IntClear_RcvAvail3IntClear_LSB 0xF
+#define QIB_6120_IntClear_RcvAvail3IntClear_RMASK 0x1
+#define QIB_6120_IntClear_RcvAvail2IntClear_LSB 0xE
+#define QIB_6120_IntClear_RcvAvail2IntClear_RMASK 0x1
+#define QIB_6120_IntClear_RcvAvail1IntClear_LSB 0xD
+#define QIB_6120_IntClear_RcvAvail1IntClear_RMASK 0x1
+#define QIB_6120_IntClear_RcvAvail0IntClear_LSB 0xC
+#define QIB_6120_IntClear_RcvAvail0IntClear_RMASK 0x1
+#define QIB_6120_IntClear_Reserved1_LSB 0x5
+#define QIB_6120_IntClear_Reserved1_RMASK 0x7F
+#define QIB_6120_IntClear_RcvUrg4IntClear_LSB 0x4
+#define QIB_6120_IntClear_RcvUrg4IntClear_RMASK 0x1
+#define QIB_6120_IntClear_RcvUrg3IntClear_LSB 0x3
+#define QIB_6120_IntClear_RcvUrg3IntClear_RMASK 0x1
+#define QIB_6120_IntClear_RcvUrg2IntClear_LSB 0x2
+#define QIB_6120_IntClear_RcvUrg2IntClear_RMASK 0x1
+#define QIB_6120_IntClear_RcvUrg1IntClear_LSB 0x1
+#define QIB_6120_IntClear_RcvUrg1IntClear_RMASK 0x1
+#define QIB_6120_IntClear_RcvUrg0IntClear_LSB 0x0
+#define QIB_6120_IntClear_RcvUrg0IntClear_RMASK 0x1
+
+#define QIB_6120_ErrMask_OFFS 0x80
+#define QIB_6120_ErrMask_Reserved_LSB 0x34
+#define QIB_6120_ErrMask_Reserved_RMASK 0xFFF
+#define QIB_6120_ErrMask_HardwareErrMask_LSB 0x33
+#define QIB_6120_ErrMask_HardwareErrMask_RMASK 0x1
+#define QIB_6120_ErrMask_ResetNegatedMask_LSB 0x32
+#define QIB_6120_ErrMask_ResetNegatedMask_RMASK 0x1
+#define QIB_6120_ErrMask_InvalidAddrErrMask_LSB 0x31
+#define QIB_6120_ErrMask_InvalidAddrErrMask_RMASK 0x1
+#define QIB_6120_ErrMask_IBStatusChangedMask_LSB 0x30
+#define QIB_6120_ErrMask_IBStatusChangedMask_RMASK 0x1
+#define QIB_6120_ErrMask_Reserved1_LSB 0x26
+#define QIB_6120_ErrMask_Reserved1_RMASK 0x3FF
+#define QIB_6120_ErrMask_SendUnsupportedVLErrMask_LSB 0x25
+#define QIB_6120_ErrMask_SendUnsupportedVLErrMask_RMASK 0x1
+#define QIB_6120_ErrMask_SendUnexpectedPktNumErrMask_LSB 0x24
+#define QIB_6120_ErrMask_SendUnexpectedPktNumErrMask_RMASK 0x1
+#define QIB_6120_ErrMask_SendPioArmLaunchErrMask_LSB 0x23
+#define QIB_6120_ErrMask_SendPioArmLaunchErrMask_RMASK 0x1
+#define QIB_6120_ErrMask_SendDroppedDataPktErrMask_LSB 0x22
+#define QIB_6120_ErrMask_SendDroppedDataPktErrMask_RMASK 0x1
+#define QIB_6120_ErrMask_SendDroppedSmpPktErrMask_LSB 0x21
+#define QIB_6120_ErrMask_SendDroppedSmpPktErrMask_RMASK 0x1
+#define QIB_6120_ErrMask_SendPktLenErrMask_LSB 0x20
+#define QIB_6120_ErrMask_SendPktLenErrMask_RMASK 0x1
+#define QIB_6120_ErrMask_SendUnderRunErrMask_LSB 0x1F
+#define QIB_6120_ErrMask_SendUnderRunErrMask_RMASK 0x1
+#define QIB_6120_ErrMask_SendMaxPktLenErrMask_LSB 0x1E
+#define QIB_6120_ErrMask_SendMaxPktLenErrMask_RMASK 0x1
+#define QIB_6120_ErrMask_SendMinPktLenErrMask_LSB 0x1D
+#define QIB_6120_ErrMask_SendMinPktLenErrMask_RMASK 0x1
+#define QIB_6120_ErrMask_Reserved2_LSB 0x12
+#define QIB_6120_ErrMask_Reserved2_RMASK 0x7FF
+#define QIB_6120_ErrMask_RcvIBLostLinkErrMask_LSB 0x11
+#define QIB_6120_ErrMask_RcvIBLostLinkErrMask_RMASK 0x1
+#define QIB_6120_ErrMask_RcvHdrErrMask_LSB 0x10
+#define QIB_6120_ErrMask_RcvHdrErrMask_RMASK 0x1
+#define QIB_6120_ErrMask_RcvHdrLenErrMask_LSB 0xF
+#define QIB_6120_ErrMask_RcvHdrLenErrMask_RMASK 0x1
+#define QIB_6120_ErrMask_RcvBadTidErrMask_LSB 0xE
+#define QIB_6120_ErrMask_RcvBadTidErrMask_RMASK 0x1
+#define QIB_6120_ErrMask_RcvHdrFullErrMask_LSB 0xD
+#define QIB_6120_ErrMask_RcvHdrFullErrMask_RMASK 0x1
+#define QIB_6120_ErrMask_RcvEgrFullErrMask_LSB 0xC
+#define QIB_6120_ErrMask_RcvEgrFullErrMask_RMASK 0x1
+#define QIB_6120_ErrMask_RcvBadVersionErrMask_LSB 0xB
+#define QIB_6120_ErrMask_RcvBadVersionErrMask_RMASK 0x1
+#define QIB_6120_ErrMask_RcvIBFlowErrMask_LSB 0xA
+#define QIB_6120_ErrMask_RcvIBFlowErrMask_RMASK 0x1
+#define QIB_6120_ErrMask_RcvEBPErrMask_LSB 0x9
+#define QIB_6120_ErrMask_RcvEBPErrMask_RMASK 0x1
+#define QIB_6120_ErrMask_RcvUnsupportedVLErrMask_LSB 0x8
+#define QIB_6120_ErrMask_RcvUnsupportedVLErrMask_RMASK 0x1
+#define QIB_6120_ErrMask_RcvUnexpectedCharErrMask_LSB 0x7
+#define QIB_6120_ErrMask_RcvUnexpectedCharErrMask_RMASK 0x1
+#define QIB_6120_ErrMask_RcvShortPktLenErrMask_LSB 0x6
+#define QIB_6120_ErrMask_RcvShortPktLenErrMask_RMASK 0x1
+#define QIB_6120_ErrMask_RcvLongPktLenErrMask_LSB 0x5
+#define QIB_6120_ErrMask_RcvLongPktLenErrMask_RMASK 0x1
+#define QIB_6120_ErrMask_RcvMaxPktLenErrMask_LSB 0x4
+#define QIB_6120_ErrMask_RcvMaxPktLenErrMask_RMASK 0x1
+#define QIB_6120_ErrMask_RcvMinPktLenErrMask_LSB 0x3
+#define QIB_6120_ErrMask_RcvMinPktLenErrMask_RMASK 0x1
+#define QIB_6120_ErrMask_RcvICRCErrMask_LSB 0x2
+#define QIB_6120_ErrMask_RcvICRCErrMask_RMASK 0x1
+#define QIB_6120_ErrMask_RcvVCRCErrMask_LSB 0x1
+#define QIB_6120_ErrMask_RcvVCRCErrMask_RMASK 0x1
+#define QIB_6120_ErrMask_RcvFormatErrMask_LSB 0x0
+#define QIB_6120_ErrMask_RcvFormatErrMask_RMASK 0x1
+
+#define QIB_6120_ErrStatus_OFFS 0x88
+#define QIB_6120_ErrStatus_Reserved_LSB 0x34
+#define QIB_6120_ErrStatus_Reserved_RMASK 0xFFF
+#define QIB_6120_ErrStatus_HardwareErr_LSB 0x33
+#define QIB_6120_ErrStatus_HardwareErr_RMASK 0x1
+#define QIB_6120_ErrStatus_ResetNegated_LSB 0x32
+#define QIB_6120_ErrStatus_ResetNegated_RMASK 0x1
+#define QIB_6120_ErrStatus_InvalidAddrErr_LSB 0x31
+#define QIB_6120_ErrStatus_InvalidAddrErr_RMASK 0x1
+#define QIB_6120_ErrStatus_IBStatusChanged_LSB 0x30
+#define QIB_6120_ErrStatus_IBStatusChanged_RMASK 0x1
+#define QIB_6120_ErrStatus_Reserved1_LSB 0x26
+#define QIB_6120_ErrStatus_Reserved1_RMASK 0x3FF
+#define QIB_6120_ErrStatus_SendUnsupportedVLErr_LSB 0x25
+#define QIB_6120_ErrStatus_SendUnsupportedVLErr_RMASK 0x1
+#define QIB_6120_ErrStatus_SendUnexpectedPktNumErr_LSB 0x24
+#define QIB_6120_ErrStatus_SendUnexpectedPktNumErr_RMASK 0x1
+#define QIB_6120_ErrStatus_SendPioArmLaunchErr_LSB 0x23
+#define QIB_6120_ErrStatus_SendPioArmLaunchErr_RMASK 0x1
+#define QIB_6120_ErrStatus_SendDroppedDataPktErr_LSB 0x22
+#define QIB_6120_ErrStatus_SendDroppedDataPktErr_RMASK 0x1
+#define QIB_6120_ErrStatus_SendDroppedSmpPktErr_LSB 0x21
+#define QIB_6120_ErrStatus_SendDroppedSmpPktErr_RMASK 0x1
+#define QIB_6120_ErrStatus_SendPktLenErr_LSB 0x20
+#define QIB_6120_ErrStatus_SendPktLenErr_RMASK 0x1
+#define QIB_6120_ErrStatus_SendUnderRunErr_LSB 0x1F
+#define QIB_6120_ErrStatus_SendUnderRunErr_RMASK 0x1
+#define QIB_6120_ErrStatus_SendMaxPktLenErr_LSB 0x1E
+#define QIB_6120_ErrStatus_SendMaxPktLenErr_RMASK 0x1
+#define QIB_6120_ErrStatus_SendMinPktLenErr_LSB 0x1D
+#define QIB_6120_ErrStatus_SendMinPktLenErr_RMASK 0x1
+#define QIB_6120_ErrStatus_Reserved2_LSB 0x12
+#define QIB_6120_ErrStatus_Reserved2_RMASK 0x7FF
+#define QIB_6120_ErrStatus_RcvIBLostLinkErr_LSB 0x11
+#define QIB_6120_ErrStatus_RcvIBLostLinkErr_RMASK 0x1
+#define QIB_6120_ErrStatus_RcvHdrErr_LSB 0x10
+#define QIB_6120_ErrStatus_RcvHdrErr_RMASK 0x1
+#define QIB_6120_ErrStatus_RcvHdrLenErr_LSB 0xF
+#define QIB_6120_ErrStatus_RcvHdrLenErr_RMASK 0x1
+#define QIB_6120_ErrStatus_RcvBadTidErr_LSB 0xE
+#define QIB_6120_ErrStatus_RcvBadTidErr_RMASK 0x1
+#define QIB_6120_ErrStatus_RcvHdrFullErr_LSB 0xD
+#define QIB_6120_ErrStatus_RcvHdrFullErr_RMASK 0x1
+#define QIB_6120_ErrStatus_RcvEgrFullErr_LSB 0xC
+#define QIB_6120_ErrStatus_RcvEgrFullErr_RMASK 0x1
+#define QIB_6120_ErrStatus_RcvBadVersionErr_LSB 0xB
+#define QIB_6120_ErrStatus_RcvBadVersionErr_RMASK 0x1
+#define QIB_6120_ErrStatus_RcvIBFlowErr_LSB 0xA
+#define QIB_6120_ErrStatus_RcvIBFlowErr_RMASK 0x1
+#define QIB_6120_ErrStatus_RcvEBPErr_LSB 0x9
+#define QIB_6120_ErrStatus_RcvEBPErr_RMASK 0x1
+#define QIB_6120_ErrStatus_RcvUnsupportedVLErr_LSB 0x8
+#define QIB_6120_ErrStatus_RcvUnsupportedVLErr_RMASK 0x1
+#define QIB_6120_ErrStatus_RcvUnexpectedCharErr_LSB 0x7
+#define QIB_6120_ErrStatus_RcvUnexpectedCharErr_RMASK 0x1
+#define QIB_6120_ErrStatus_RcvShortPktLenErr_LSB 0x6
+#define QIB_6120_ErrStatus_RcvShortPktLenErr_RMASK 0x1
+#define QIB_6120_ErrStatus_RcvLongPktLenErr_LSB 0x5
+#define QIB_6120_ErrStatus_RcvLongPktLenErr_RMASK 0x1
+#define QIB_6120_ErrStatus_RcvMaxPktLenErr_LSB 0x4
+#define QIB_6120_ErrStatus_RcvMaxPktLenErr_RMASK 0x1
+#define QIB_6120_ErrStatus_RcvMinPktLenErr_LSB 0x3
+#define QIB_6120_ErrStatus_RcvMinPktLenErr_RMASK 0x1
+#define QIB_6120_ErrStatus_RcvICRCErr_LSB 0x2
+#define QIB_6120_ErrStatus_RcvICRCErr_RMASK 0x1
+#define QIB_6120_ErrStatus_RcvVCRCErr_LSB 0x1
+#define QIB_6120_ErrStatus_RcvVCRCErr_RMASK 0x1
+#define QIB_6120_ErrStatus_RcvFormatErr_LSB 0x0
+#define QIB_6120_ErrStatus_RcvFormatErr_RMASK 0x1
+
+#define QIB_6120_ErrClear_OFFS 0x90
+#define QIB_6120_ErrClear_Reserved_LSB 0x34
+#define QIB_6120_ErrClear_Reserved_RMASK 0xFFF
+#define QIB_6120_ErrClear_HardwareErrClear_LSB 0x33
+#define QIB_6120_ErrClear_HardwareErrClear_RMASK 0x1
+#define QIB_6120_ErrClear_ResetNegatedClear_LSB 0x32
+#define QIB_6120_ErrClear_ResetNegatedClear_RMASK 0x1
+#define QIB_6120_ErrClear_InvalidAddrErrClear_LSB 0x31
+#define QIB_6120_ErrClear_InvalidAddrErrClear_RMASK 0x1
+#define QIB_6120_ErrClear_IBStatusChangedClear_LSB 0x30
+#define QIB_6120_ErrClear_IBStatusChangedClear_RMASK 0x1
+#define QIB_6120_ErrClear_Reserved1_LSB 0x26
+#define QIB_6120_ErrClear_Reserved1_RMASK 0x3FF
+#define QIB_6120_ErrClear_SendUnsupportedVLErrClear_LSB 0x25
+#define QIB_6120_ErrClear_SendUnsupportedVLErrClear_RMASK 0x1
+#define QIB_6120_ErrClear_SendUnexpectedPktNumErrClear_LSB 0x24
+#define QIB_6120_ErrClear_SendUnexpectedPktNumErrClear_RMASK 0x1
+#define QIB_6120_ErrClear_SendPioArmLaunchErrClear_LSB 0x23
+#define QIB_6120_ErrClear_SendPioArmLaunchErrClear_RMASK 0x1
+#define QIB_6120_ErrClear_SendDroppedDataPktErrClear_LSB 0x22
+#define QIB_6120_ErrClear_SendDroppedDataPktErrClear_RMASK 0x1
+#define QIB_6120_ErrClear_SendDroppedSmpPktErrClear_LSB 0x21
+#define QIB_6120_ErrClear_SendDroppedSmpPktErrClear_RMASK 0x1
+#define QIB_6120_ErrClear_SendPktLenErrClear_LSB 0x20
+#define QIB_6120_ErrClear_SendPktLenErrClear_RMASK 0x1
+#define QIB_6120_ErrClear_SendUnderRunErrClear_LSB 0x1F
+#define QIB_6120_ErrClear_SendUnderRunErrClear_RMASK 0x1
+#define QIB_6120_ErrClear_SendMaxPktLenErrClear_LSB 0x1E
+#define QIB_6120_ErrClear_SendMaxPktLenErrClear_RMASK 0x1
+#define QIB_6120_ErrClear_SendMinPktLenErrClear_LSB 0x1D
+#define QIB_6120_ErrClear_SendMinPktLenErrClear_RMASK 0x1
+#define QIB_6120_ErrClear_Reserved2_LSB 0x12
+#define QIB_6120_ErrClear_Reserved2_RMASK 0x7FF
+#define QIB_6120_ErrClear_RcvIBLostLinkErrClear_LSB 0x11
+#define QIB_6120_ErrClear_RcvIBLostLinkErrClear_RMASK 0x1
+#define QIB_6120_ErrClear_RcvHdrErrClear_LSB 0x10
+#define QIB_6120_ErrClear_RcvHdrErrClear_RMASK 0x1
+#define QIB_6120_ErrClear_RcvHdrLenErrClear_LSB 0xF
+#define QIB_6120_ErrClear_RcvHdrLenErrClear_RMASK 0x1
+#define QIB_6120_ErrClear_RcvBadTidErrClear_LSB 0xE
+#define QIB_6120_ErrClear_RcvBadTidErrClear_RMASK 0x1
+#define QIB_6120_ErrClear_RcvHdrFullErrClear_LSB 0xD
+#define QIB_6120_ErrClear_RcvHdrFullErrClear_RMASK 0x1
+#define QIB_6120_ErrClear_RcvEgrFullErrClear_LSB 0xC
+#define QIB_6120_ErrClear_RcvEgrFullErrClear_RMASK 0x1
+#define QIB_6120_ErrClear_RcvBadVersionErrClear_LSB 0xB
+#define QIB_6120_ErrClear_RcvBadVersionErrClear_RMASK 0x1
+#define QIB_6120_ErrClear_RcvIBFlowErrClear_LSB 0xA
+#define QIB_6120_ErrClear_RcvIBFlowErrClear_RMASK 0x1
+#define QIB_6120_ErrClear_RcvEBPErrClear_LSB 0x9
+#define QIB_6120_ErrClear_RcvEBPErrClear_RMASK 0x1
+#define QIB_6120_ErrClear_RcvUnsupportedVLErrClear_LSB 0x8
+#define QIB_6120_ErrClear_RcvUnsupportedVLErrClear_RMASK 0x1
+#define QIB_6120_ErrClear_RcvUnexpectedCharErrClear_LSB 0x7
+#define QIB_6120_ErrClear_RcvUnexpectedCharErrClear_RMASK 0x1
+#define QIB_6120_ErrClear_RcvShortPktLenErrClear_LSB 0x6
+#define QIB_6120_ErrClear_RcvShortPktLenErrClear_RMASK 0x1
+#define QIB_6120_ErrClear_RcvLongPktLenErrClear_LSB 0x5
+#define QIB_6120_ErrClear_RcvLongPktLenErrClear_RMASK 0x1
+#define QIB_6120_ErrClear_RcvMaxPktLenErrClear_LSB 0x4
+#define QIB_6120_ErrClear_RcvMaxPktLenErrClear_RMASK 0x1
+#define QIB_6120_ErrClear_RcvMinPktLenErrClear_LSB 0x3
+#define QIB_6120_ErrClear_RcvMinPktLenErrClear_RMASK 0x1
+#define QIB_6120_ErrClear_RcvICRCErrClear_LSB 0x2
+#define QIB_6120_ErrClear_RcvICRCErrClear_RMASK 0x1
+#define QIB_6120_ErrClear_RcvVCRCErrClear_LSB 0x1
+#define QIB_6120_ErrClear_RcvVCRCErrClear_RMASK 0x1
+#define QIB_6120_ErrClear_RcvFormatErrClear_LSB 0x0
+#define QIB_6120_ErrClear_RcvFormatErrClear_RMASK 0x1
+
+#define QIB_6120_HwErrMask_OFFS 0x98
+#define QIB_6120_HwErrMask_IBCBusFromSPCParityErrMask_LSB 0x3F
+#define QIB_6120_HwErrMask_IBCBusFromSPCParityErrMask_RMASK 0x1
+#define QIB_6120_HwErrMask_IBCBusToSPCParityErrMask_LSB 0x3E
+#define QIB_6120_HwErrMask_IBCBusToSPCParityErrMask_RMASK 0x1
+#define QIB_6120_HwErrMask_Reserved_LSB 0x3D
+#define QIB_6120_HwErrMask_Reserved_RMASK 0x1
+#define QIB_6120_HwErrMask_IBSerdesPClkNotDetectMask_LSB 0x3C
+#define QIB_6120_HwErrMask_IBSerdesPClkNotDetectMask_RMASK 0x1
+#define QIB_6120_HwErrMask_PCIESerdesQ0PClkNotDetectMask_LSB 0x3B
+#define QIB_6120_HwErrMask_PCIESerdesQ0PClkNotDetectMask_RMASK 0x1
+#define QIB_6120_HwErrMask_PCIESerdesQ1PClkNotDetectMask_LSB 0x3A
+#define QIB_6120_HwErrMask_PCIESerdesQ1PClkNotDetectMask_RMASK 0x1
+#define QIB_6120_HwErrMask_Reserved1_LSB 0x39
+#define QIB_6120_HwErrMask_Reserved1_RMASK 0x1
+#define QIB_6120_HwErrMask_IBPLLrfSlipMask_LSB 0x38
+#define QIB_6120_HwErrMask_IBPLLrfSlipMask_RMASK 0x1
+#define QIB_6120_HwErrMask_IBPLLfbSlipMask_LSB 0x37
+#define QIB_6120_HwErrMask_IBPLLfbSlipMask_RMASK 0x1
+#define QIB_6120_HwErrMask_PowerOnBISTFailedMask_LSB 0x36
+#define QIB_6120_HwErrMask_PowerOnBISTFailedMask_RMASK 0x1
+#define QIB_6120_HwErrMask_Reserved2_LSB 0x33
+#define QIB_6120_HwErrMask_Reserved2_RMASK 0x7
+#define QIB_6120_HwErrMask_RXEMemParityErrMask_LSB 0x2C
+#define QIB_6120_HwErrMask_RXEMemParityErrMask_RMASK 0x7F
+#define QIB_6120_HwErrMask_TXEMemParityErrMask_LSB 0x28
+#define QIB_6120_HwErrMask_TXEMemParityErrMask_RMASK 0xF
+#define QIB_6120_HwErrMask_Reserved3_LSB 0x22
+#define QIB_6120_HwErrMask_Reserved3_RMASK 0x3F
+#define QIB_6120_HwErrMask_PCIeBusParityErrMask_LSB 0x1F
+#define QIB_6120_HwErrMask_PCIeBusParityErrMask_RMASK 0x7
+#define QIB_6120_HwErrMask_PcieCplTimeoutMask_LSB 0x1E
+#define QIB_6120_HwErrMask_PcieCplTimeoutMask_RMASK 0x1
+#define QIB_6120_HwErrMask_PoisonedTLPMask_LSB 0x1D
+#define QIB_6120_HwErrMask_PoisonedTLPMask_RMASK 0x1
+#define QIB_6120_HwErrMask_Reserved4_LSB 0x6
+#define QIB_6120_HwErrMask_Reserved4_RMASK 0x7FFFFF
+#define QIB_6120_HwErrMask_PCIeMemParityErrMask_LSB 0x0
+#define QIB_6120_HwErrMask_PCIeMemParityErrMask_RMASK 0x3F
+
+#define QIB_6120_HwErrStatus_OFFS 0xA0
+#define QIB_6120_HwErrStatus_IBCBusFromSPCParityErr_LSB 0x3F
+#define QIB_6120_HwErrStatus_IBCBusFromSPCParityErr_RMASK 0x1
+#define QIB_6120_HwErrStatus_IBCBusToSPCParityErr_LSB 0x3E
+#define QIB_6120_HwErrStatus_IBCBusToSPCParityErr_RMASK 0x1
+#define QIB_6120_HwErrStatus_Reserved_LSB 0x3D
+#define QIB_6120_HwErrStatus_Reserved_RMASK 0x1
+#define QIB_6120_HwErrStatus_IBSerdesPClkNotDetect_LSB 0x3C
+#define QIB_6120_HwErrStatus_IBSerdesPClkNotDetect_RMASK 0x1
+#define QIB_6120_HwErrStatus_PCIESerdesQ0PClkNotDetect_LSB 0x3B
+#define QIB_6120_HwErrStatus_PCIESerdesQ0PClkNotDetect_RMASK 0x1
+#define QIB_6120_HwErrStatus_PCIESerdesQ1PClkNotDetect_LSB 0x3A
+#define QIB_6120_HwErrStatus_PCIESerdesQ1PClkNotDetect_RMASK 0x1
+#define QIB_6120_HwErrStatus_Reserved1_LSB 0x39
+#define QIB_6120_HwErrStatus_Reserved1_RMASK 0x1
+#define QIB_6120_HwErrStatus_IBPLLrfSlip_LSB 0x38
+#define QIB_6120_HwErrStatus_IBPLLrfSlip_RMASK 0x1
+#define QIB_6120_HwErrStatus_IBPLLfbSlip_LSB 0x37
+#define QIB_6120_HwErrStatus_IBPLLfbSlip_RMASK 0x1
+#define QIB_6120_HwErrStatus_PowerOnBISTFailed_LSB 0x36
+#define QIB_6120_HwErrStatus_PowerOnBISTFailed_RMASK 0x1
+#define QIB_6120_HwErrStatus_Reserved2_LSB 0x33
+#define QIB_6120_HwErrStatus_Reserved2_RMASK 0x7
+#define QIB_6120_HwErrStatus_RXEMemParity_LSB 0x2C
+#define QIB_6120_HwErrStatus_RXEMemParity_RMASK 0x7F
+#define QIB_6120_HwErrStatus_TXEMemParity_LSB 0x28
+#define QIB_6120_HwErrStatus_TXEMemParity_RMASK 0xF
+#define QIB_6120_HwErrStatus_Reserved3_LSB 0x22
+#define QIB_6120_HwErrStatus_Reserved3_RMASK 0x3F
+#define QIB_6120_HwErrStatus_PCIeBusParity_LSB 0x1F
+#define QIB_6120_HwErrStatus_PCIeBusParity_RMASK 0x7
+#define QIB_6120_HwErrStatus_PcieCplTimeout_LSB 0x1E
+#define QIB_6120_HwErrStatus_PcieCplTimeout_RMASK 0x1
+#define QIB_6120_HwErrStatus_PoisenedTLP_LSB 0x1D
+#define QIB_6120_HwErrStatus_PoisenedTLP_RMASK 0x1
+#define QIB_6120_HwErrStatus_Reserved4_LSB 0x6
+#define QIB_6120_HwErrStatus_Reserved4_RMASK 0x7FFFFF
+#define QIB_6120_HwErrStatus_PCIeMemParity_LSB 0x0
+#define QIB_6120_HwErrStatus_PCIeMemParity_RMASK 0x3F
+
+#define QIB_6120_HwErrClear_OFFS 0xA8
+#define QIB_6120_HwErrClear_IBCBusFromSPCParityErrClear_LSB 0x3F
+#define QIB_6120_HwErrClear_IBCBusFromSPCParityErrClear_RMASK 0x1
+#define QIB_6120_HwErrClear_IBCBusToSPCparityErrClear_LSB 0x3E
+#define QIB_6120_HwErrClear_IBCBusToSPCparityErrClear_RMASK 0x1
+#define QIB_6120_HwErrClear_Reserved_LSB 0x3D
+#define QIB_6120_HwErrClear_Reserved_RMASK 0x1
+#define QIB_6120_HwErrClear_IBSerdesPClkNotDetectClear_LSB 0x3C
+#define QIB_6120_HwErrClear_IBSerdesPClkNotDetectClear_RMASK 0x1
+#define QIB_6120_HwErrClear_PCIESerdesQ0PClkNotDetectClear_LSB 0x3B
+#define QIB_6120_HwErrClear_PCIESerdesQ0PClkNotDetectClear_RMASK 0x1
+#define QIB_6120_HwErrClear_PCIESerdesQ1PClkNotDetectClear_LSB 0x3A
+#define QIB_6120_HwErrClear_PCIESerdesQ1PClkNotDetectClear_RMASK 0x1
+#define QIB_6120_HwErrClear_Reserved1_LSB 0x39
+#define QIB_6120_HwErrClear_Reserved1_RMASK 0x1
+#define QIB_6120_HwErrClear_IBPLLrfSlipClear_LSB 0x38
+#define QIB_6120_HwErrClear_IBPLLrfSlipClear_RMASK 0x1
+#define QIB_6120_HwErrClear_IBPLLfbSlipClear_LSB 0x37
+#define QIB_6120_HwErrClear_IBPLLfbSlipClear_RMASK 0x1
+#define QIB_6120_HwErrClear_PowerOnBISTFailedClear_LSB 0x36
+#define QIB_6120_HwErrClear_PowerOnBISTFailedClear_RMASK 0x1
+#define QIB_6120_HwErrClear_Reserved2_LSB 0x33
+#define QIB_6120_HwErrClear_Reserved2_RMASK 0x7
+#define QIB_6120_HwErrClear_RXEMemParityClear_LSB 0x2C
+#define QIB_6120_HwErrClear_RXEMemParityClear_RMASK 0x7F
+#define QIB_6120_HwErrClear_TXEMemParityClear_LSB 0x28
+#define QIB_6120_HwErrClear_TXEMemParityClear_RMASK 0xF
+#define QIB_6120_HwErrClear_Reserved3_LSB 0x22
+#define QIB_6120_HwErrClear_Reserved3_RMASK 0x3F
+#define QIB_6120_HwErrClear_PCIeBusParityClr_LSB 0x1F
+#define QIB_6120_HwErrClear_PCIeBusParityClr_RMASK 0x7
+#define QIB_6120_HwErrClear_PcieCplTimeoutClear_LSB 0x1E
+#define QIB_6120_HwErrClear_PcieCplTimeoutClear_RMASK 0x1
+#define QIB_6120_HwErrClear_PoisonedTLPClear_LSB 0x1D
+#define QIB_6120_HwErrClear_PoisonedTLPClear_RMASK 0x1
+#define QIB_6120_HwErrClear_Reserved4_LSB 0x6
+#define QIB_6120_HwErrClear_Reserved4_RMASK 0x7FFFFF
+#define QIB_6120_HwErrClear_PCIeMemParityClr_LSB 0x0
+#define QIB_6120_HwErrClear_PCIeMemParityClr_RMASK 0x3F
+
+#define QIB_6120_HwDiagCtrl_OFFS 0xB0
+#define QIB_6120_HwDiagCtrl_ForceIBCBusFromSPCParityErr_LSB 0x3F
+#define QIB_6120_HwDiagCtrl_ForceIBCBusFromSPCParityErr_RMASK 0x1
+#define QIB_6120_HwDiagCtrl_ForceIBCBusToSPCParityErr_LSB 0x3E
+#define QIB_6120_HwDiagCtrl_ForceIBCBusToSPCParityErr_RMASK 0x1
+#define QIB_6120_HwDiagCtrl_CounterWrEnable_LSB 0x3D
+#define QIB_6120_HwDiagCtrl_CounterWrEnable_RMASK 0x1
+#define QIB_6120_HwDiagCtrl_CounterDisable_LSB 0x3C
+#define QIB_6120_HwDiagCtrl_CounterDisable_RMASK 0x1
+#define QIB_6120_HwDiagCtrl_Reserved_LSB 0x33
+#define QIB_6120_HwDiagCtrl_Reserved_RMASK 0x1FF
+#define QIB_6120_HwDiagCtrl_ForceRxMemParityErr_LSB 0x2C
+#define QIB_6120_HwDiagCtrl_ForceRxMemParityErr_RMASK 0x7F
+#define QIB_6120_HwDiagCtrl_ForceTxMemparityErr_LSB 0x28
+#define QIB_6120_HwDiagCtrl_ForceTxMemparityErr_RMASK 0xF
+#define QIB_6120_HwDiagCtrl_Reserved1_LSB 0x23
+#define QIB_6120_HwDiagCtrl_Reserved1_RMASK 0x1F
+#define QIB_6120_HwDiagCtrl_forcePCIeBusParity_LSB 0x1F
+#define QIB_6120_HwDiagCtrl_forcePCIeBusParity_RMASK 0xF
+#define QIB_6120_HwDiagCtrl_Reserved2_LSB 0x6
+#define QIB_6120_HwDiagCtrl_Reserved2_RMASK 0x1FFFFFF
+#define QIB_6120_HwDiagCtrl_forcePCIeMemParity_LSB 0x0
+#define QIB_6120_HwDiagCtrl_forcePCIeMemParity_RMASK 0x3F
+
+#define QIB_6120_IBCStatus_OFFS 0xC0
+#define QIB_6120_IBCStatus_TxCreditOk_LSB 0x1F
+#define QIB_6120_IBCStatus_TxCreditOk_RMASK 0x1
+#define QIB_6120_IBCStatus_TxReady_LSB 0x1E
+#define QIB_6120_IBCStatus_TxReady_RMASK 0x1
+#define QIB_6120_IBCStatus_Reserved_LSB 0x7
+#define QIB_6120_IBCStatus_Reserved_RMASK 0x7FFFFF
+#define QIB_6120_IBCStatus_LinkState_LSB 0x4
+#define QIB_6120_IBCStatus_LinkState_RMASK 0x7
+#define QIB_6120_IBCStatus_LinkTrainingState_LSB 0x0
+#define QIB_6120_IBCStatus_LinkTrainingState_RMASK 0xF
+
+#define QIB_6120_IBCCtrl_OFFS 0xC8
+#define QIB_6120_IBCCtrl_Loopback_LSB 0x3F
+#define QIB_6120_IBCCtrl_Loopback_RMASK 0x1
+#define QIB_6120_IBCCtrl_LinkDownDefaultState_LSB 0x3E
+#define QIB_6120_IBCCtrl_LinkDownDefaultState_RMASK 0x1
+#define QIB_6120_IBCCtrl_Reserved_LSB 0x2B
+#define QIB_6120_IBCCtrl_Reserved_RMASK 0x7FFFF
+#define QIB_6120_IBCCtrl_CreditScale_LSB 0x28
+#define QIB_6120_IBCCtrl_CreditScale_RMASK 0x7
+#define QIB_6120_IBCCtrl_OverrunThreshold_LSB 0x24
+#define QIB_6120_IBCCtrl_OverrunThreshold_RMASK 0xF
+#define QIB_6120_IBCCtrl_PhyerrThreshold_LSB 0x20
+#define QIB_6120_IBCCtrl_PhyerrThreshold_RMASK 0xF
+#define QIB_6120_IBCCtrl_Reserved1_LSB 0x1F
+#define QIB_6120_IBCCtrl_Reserved1_RMASK 0x1
+#define QIB_6120_IBCCtrl_MaxPktLen_LSB 0x14
+#define QIB_6120_IBCCtrl_MaxPktLen_RMASK 0x7FF
+#define QIB_6120_IBCCtrl_LinkCmd_LSB 0x12
+#define QIB_6120_IBCCtrl_LinkCmd_RMASK 0x3
+#define QIB_6120_IBCCtrl_LinkInitCmd_LSB 0x10
+#define QIB_6120_IBCCtrl_LinkInitCmd_RMASK 0x3
+#define QIB_6120_IBCCtrl_FlowCtrlWaterMark_LSB 0x8
+#define QIB_6120_IBCCtrl_FlowCtrlWaterMark_RMASK 0xFF
+#define QIB_6120_IBCCtrl_FlowCtrlPeriod_LSB 0x0
+#define QIB_6120_IBCCtrl_FlowCtrlPeriod_RMASK 0xFF
+
+#define QIB_6120_EXTStatus_OFFS 0xD0
+#define QIB_6120_EXTStatus_GPIOIn_LSB 0x30
+#define QIB_6120_EXTStatus_GPIOIn_RMASK 0xFFFF
+#define QIB_6120_EXTStatus_Reserved_LSB 0x20
+#define QIB_6120_EXTStatus_Reserved_RMASK 0xFFFF
+#define QIB_6120_EXTStatus_Reserved1_LSB 0x10
+#define QIB_6120_EXTStatus_Reserved1_RMASK 0xFFFF
+#define QIB_6120_EXTStatus_MemBISTFoundErr_LSB 0xF
+#define QIB_6120_EXTStatus_MemBISTFoundErr_RMASK 0x1
+#define QIB_6120_EXTStatus_MemBISTEndTest_LSB 0xE
+#define QIB_6120_EXTStatus_MemBISTEndTest_RMASK 0x1
+#define QIB_6120_EXTStatus_Reserved2_LSB 0x0
+#define QIB_6120_EXTStatus_Reserved2_RMASK 0x3FFF
+
+#define QIB_6120_EXTCtrl_OFFS 0xD8
+#define QIB_6120_EXTCtrl_GPIOOe_LSB 0x30
+#define QIB_6120_EXTCtrl_GPIOOe_RMASK 0xFFFF
+#define QIB_6120_EXTCtrl_GPIOInvert_LSB 0x20
+#define QIB_6120_EXTCtrl_GPIOInvert_RMASK 0xFFFF
+#define QIB_6120_EXTCtrl_Reserved_LSB 0x4
+#define QIB_6120_EXTCtrl_Reserved_RMASK 0xFFFFFFF
+#define QIB_6120_EXTCtrl_LEDPriPortGreenOn_LSB 0x3
+#define QIB_6120_EXTCtrl_LEDPriPortGreenOn_RMASK 0x1
+#define QIB_6120_EXTCtrl_LEDPriPortYellowOn_LSB 0x2
+#define QIB_6120_EXTCtrl_LEDPriPortYellowOn_RMASK 0x1
+#define QIB_6120_EXTCtrl_LEDGblOkGreenOn_LSB 0x1
+#define QIB_6120_EXTCtrl_LEDGblOkGreenOn_RMASK 0x1
+#define QIB_6120_EXTCtrl_LEDGblErrRedOff_LSB 0x0
+#define QIB_6120_EXTCtrl_LEDGblErrRedOff_RMASK 0x1
+
+#define QIB_6120_GPIOOut_OFFS 0xE0
+
+#define QIB_6120_GPIOMask_OFFS 0xE8
+
+#define QIB_6120_GPIOStatus_OFFS 0xF0
+
+#define QIB_6120_GPIOClear_OFFS 0xF8
+
+#define QIB_6120_RcvCtrl_OFFS 0x100
+#define QIB_6120_RcvCtrl_TailUpd_LSB 0x1F
+#define QIB_6120_RcvCtrl_TailUpd_RMASK 0x1
+#define QIB_6120_RcvCtrl_RcvPartitionKeyDisable_LSB 0x1E
+#define QIB_6120_RcvCtrl_RcvPartitionKeyDisable_RMASK 0x1
+#define QIB_6120_RcvCtrl_Reserved_LSB 0x15
+#define QIB_6120_RcvCtrl_Reserved_RMASK 0x1FF
+#define QIB_6120_RcvCtrl_IntrAvail_LSB 0x10
+#define QIB_6120_RcvCtrl_IntrAvail_RMASK 0x1F
+#define QIB_6120_RcvCtrl_Reserved1_LSB 0x9
+#define QIB_6120_RcvCtrl_Reserved1_RMASK 0x7F
+#define QIB_6120_RcvCtrl_Reserved2_LSB 0x5
+#define QIB_6120_RcvCtrl_Reserved2_RMASK 0xF
+#define QIB_6120_RcvCtrl_PortEnable_LSB 0x0
+#define QIB_6120_RcvCtrl_PortEnable_RMASK 0x1F
+
+#define QIB_6120_RcvBTHQP_OFFS 0x108
+#define QIB_6120_RcvBTHQP_BTHQP_Mask_LSB 0x1E
+#define QIB_6120_RcvBTHQP_BTHQP_Mask_RMASK 0x3
+#define QIB_6120_RcvBTHQP_Reserved_LSB 0x18
+#define QIB_6120_RcvBTHQP_Reserved_RMASK 0x3F
+#define QIB_6120_RcvBTHQP_RcvBTHQP_LSB 0x0
+#define QIB_6120_RcvBTHQP_RcvBTHQP_RMASK 0xFFFFFF
+
+#define QIB_6120_RcvHdrSize_OFFS 0x110
+
+#define QIB_6120_RcvHdrCnt_OFFS 0x118
+
+#define QIB_6120_RcvHdrEntSize_OFFS 0x120
+
+#define QIB_6120_RcvTIDBase_OFFS 0x128
+
+#define QIB_6120_RcvTIDCnt_OFFS 0x130
+
+#define QIB_6120_RcvEgrBase_OFFS 0x138
+
+#define QIB_6120_RcvEgrCnt_OFFS 0x140
+
+#define QIB_6120_RcvBufBase_OFFS 0x148
+
+#define QIB_6120_RcvBufSize_OFFS 0x150
+
+#define QIB_6120_RxIntMemBase_OFFS 0x158
+
+#define QIB_6120_RxIntMemSize_OFFS 0x160
+
+#define QIB_6120_RcvPartitionKey_OFFS 0x168
+
+#define QIB_6120_RcvPktLEDCnt_OFFS 0x178
+#define QIB_6120_RcvPktLEDCnt_ONperiod_LSB 0x20
+#define QIB_6120_RcvPktLEDCnt_ONperiod_RMASK 0xFFFFFFFF
+#define QIB_6120_RcvPktLEDCnt_OFFperiod_LSB 0x0
+#define QIB_6120_RcvPktLEDCnt_OFFperiod_RMASK 0xFFFFFFFF
+
+#define QIB_6120_SendCtrl_OFFS 0x1C0
+#define QIB_6120_SendCtrl_Disarm_LSB 0x1F
+#define QIB_6120_SendCtrl_Disarm_RMASK 0x1
+#define QIB_6120_SendCtrl_Reserved_LSB 0x17
+#define QIB_6120_SendCtrl_Reserved_RMASK 0xFF
+#define QIB_6120_SendCtrl_DisarmPIOBuf_LSB 0x10
+#define QIB_6120_SendCtrl_DisarmPIOBuf_RMASK 0x7F
+#define QIB_6120_SendCtrl_Reserved1_LSB 0x4
+#define QIB_6120_SendCtrl_Reserved1_RMASK 0xFFF
+#define QIB_6120_SendCtrl_PIOEnable_LSB 0x3
+#define QIB_6120_SendCtrl_PIOEnable_RMASK 0x1
+#define QIB_6120_SendCtrl_PIOBufAvailUpd_LSB 0x2
+#define QIB_6120_SendCtrl_PIOBufAvailUpd_RMASK 0x1
+#define QIB_6120_SendCtrl_PIOIntBufAvail_LSB 0x1
+#define QIB_6120_SendCtrl_PIOIntBufAvail_RMASK 0x1
+#define QIB_6120_SendCtrl_Abort_LSB 0x0
+#define QIB_6120_SendCtrl_Abort_RMASK 0x1
+
+#define QIB_6120_SendPIOBufBase_OFFS 0x1C8
+#define QIB_6120_SendPIOBufBase_Reserved_LSB 0x35
+#define QIB_6120_SendPIOBufBase_Reserved_RMASK 0x7FF
+#define QIB_6120_SendPIOBufBase_BaseAddr_LargePIO_LSB 0x20
+#define QIB_6120_SendPIOBufBase_BaseAddr_LargePIO_RMASK 0x1FFFFF
+#define QIB_6120_SendPIOBufBase_Reserved1_LSB 0x15
+#define QIB_6120_SendPIOBufBase_Reserved1_RMASK 0x7FF
+#define QIB_6120_SendPIOBufBase_BaseAddr_SmallPIO_LSB 0x0
+#define QIB_6120_SendPIOBufBase_BaseAddr_SmallPIO_RMASK 0x1FFFFF
+
+#define QIB_6120_SendPIOSize_OFFS 0x1D0
+#define QIB_6120_SendPIOSize_Reserved_LSB 0x2D
+#define QIB_6120_SendPIOSize_Reserved_RMASK 0xFFFFF
+#define QIB_6120_SendPIOSize_Size_LargePIO_LSB 0x20
+#define QIB_6120_SendPIOSize_Size_LargePIO_RMASK 0x1FFF
+#define QIB_6120_SendPIOSize_Reserved1_LSB 0xC
+#define QIB_6120_SendPIOSize_Reserved1_RMASK 0xFFFFF
+#define QIB_6120_SendPIOSize_Size_SmallPIO_LSB 0x0
+#define QIB_6120_SendPIOSize_Size_SmallPIO_RMASK 0xFFF
+
+#define QIB_6120_SendPIOBufCnt_OFFS 0x1D8
+#define QIB_6120_SendPIOBufCnt_Reserved_LSB 0x24
+#define QIB_6120_SendPIOBufCnt_Reserved_RMASK 0xFFFFFFF
+#define QIB_6120_SendPIOBufCnt_Num_LargePIO_LSB 0x20
+#define QIB_6120_SendPIOBufCnt_Num_LargePIO_RMASK 0xF
+#define QIB_6120_SendPIOBufCnt_Reserved1_LSB 0x9
+#define QIB_6120_SendPIOBufCnt_Reserved1_RMASK 0x7FFFFF
+#define QIB_6120_SendPIOBufCnt_Num_SmallPIO_LSB 0x0
+#define QIB_6120_SendPIOBufCnt_Num_SmallPIO_RMASK 0x1FF
+
+#define QIB_6120_SendPIOAvailAddr_OFFS 0x1E0
+#define QIB_6120_SendPIOAvailAddr_SendPIOAvailAddr_LSB 0x6
+#define QIB_6120_SendPIOAvailAddr_SendPIOAvailAddr_RMASK 0x3FFFFFFFF
+#define QIB_6120_SendPIOAvailAddr_Reserved_LSB 0x0
+#define QIB_6120_SendPIOAvailAddr_Reserved_RMASK 0x3F
+
+#define QIB_6120_SendBufErr0_OFFS 0x240
+#define QIB_6120_SendBufErr0_SendBufErrPIO_63_0_LSB 0x0
+#define QIB_6120_SendBufErr0_SendBufErrPIO_63_0_RMASK 0x0
+
+#define QIB_6120_RcvHdrAddr0_OFFS 0x280
+#define QIB_6120_RcvHdrAddr0_RcvHdrAddr0_LSB 0x2
+#define QIB_6120_RcvHdrAddr0_RcvHdrAddr0_RMASK 0x3FFFFFFFFF
+#define QIB_6120_RcvHdrAddr0_Reserved_LSB 0x0
+#define QIB_6120_RcvHdrAddr0_Reserved_RMASK 0x3
+
+#define QIB_6120_RcvHdrTailAddr0_OFFS 0x300
+#define QIB_6120_RcvHdrTailAddr0_RcvHdrTailAddr0_LSB 0x2
+#define QIB_6120_RcvHdrTailAddr0_RcvHdrTailAddr0_RMASK 0x3FFFFFFFFF
+#define QIB_6120_RcvHdrTailAddr0_Reserved_LSB 0x0
+#define QIB_6120_RcvHdrTailAddr0_Reserved_RMASK 0x3
+
+#define QIB_6120_SerdesCfg0_OFFS 0x3C0
+#define QIB_6120_SerdesCfg0_DisableIBTxIdleDetect_LSB 0x3F
+#define QIB_6120_SerdesCfg0_DisableIBTxIdleDetect_RMASK 0x1
+#define QIB_6120_SerdesCfg0_Reserved_LSB 0x38
+#define QIB_6120_SerdesCfg0_Reserved_RMASK 0x7F
+#define QIB_6120_SerdesCfg0_RxEqCtl_LSB 0x36
+#define QIB_6120_SerdesCfg0_RxEqCtl_RMASK 0x3
+#define QIB_6120_SerdesCfg0_TxTermAdj_LSB 0x34
+#define QIB_6120_SerdesCfg0_TxTermAdj_RMASK 0x3
+#define QIB_6120_SerdesCfg0_RxTermAdj_LSB 0x32
+#define QIB_6120_SerdesCfg0_RxTermAdj_RMASK 0x3
+#define QIB_6120_SerdesCfg0_TermAdj1_LSB 0x31
+#define QIB_6120_SerdesCfg0_TermAdj1_RMASK 0x1
+#define QIB_6120_SerdesCfg0_TermAdj0_LSB 0x30
+#define QIB_6120_SerdesCfg0_TermAdj0_RMASK 0x1
+#define QIB_6120_SerdesCfg0_LPBKA_LSB 0x2F
+#define QIB_6120_SerdesCfg0_LPBKA_RMASK 0x1
+#define QIB_6120_SerdesCfg0_LPBKB_LSB 0x2E
+#define QIB_6120_SerdesCfg0_LPBKB_RMASK 0x1
+#define QIB_6120_SerdesCfg0_LPBKC_LSB 0x2D
+#define QIB_6120_SerdesCfg0_LPBKC_RMASK 0x1
+#define QIB_6120_SerdesCfg0_LPBKD_LSB 0x2C
+#define QIB_6120_SerdesCfg0_LPBKD_RMASK 0x1
+#define QIB_6120_SerdesCfg0_PW_LSB 0x2B
+#define QIB_6120_SerdesCfg0_PW_RMASK 0x1
+#define QIB_6120_SerdesCfg0_RefSel_LSB 0x29
+#define QIB_6120_SerdesCfg0_RefSel_RMASK 0x3
+#define QIB_6120_SerdesCfg0_ParReset_LSB 0x28
+#define QIB_6120_SerdesCfg0_ParReset_RMASK 0x1
+#define QIB_6120_SerdesCfg0_ParLPBK_LSB 0x27
+#define QIB_6120_SerdesCfg0_ParLPBK_RMASK 0x1
+#define QIB_6120_SerdesCfg0_OffsetEn_LSB 0x26
+#define QIB_6120_SerdesCfg0_OffsetEn_RMASK 0x1
+#define QIB_6120_SerdesCfg0_Offset_LSB 0x1E
+#define QIB_6120_SerdesCfg0_Offset_RMASK 0xFF
+#define QIB_6120_SerdesCfg0_L2PwrDn_LSB 0x1D
+#define QIB_6120_SerdesCfg0_L2PwrDn_RMASK 0x1
+#define QIB_6120_SerdesCfg0_ResetPLL_LSB 0x1C
+#define QIB_6120_SerdesCfg0_ResetPLL_RMASK 0x1
+#define QIB_6120_SerdesCfg0_RxTermEnX_LSB 0x18
+#define QIB_6120_SerdesCfg0_RxTermEnX_RMASK 0xF
+#define QIB_6120_SerdesCfg0_BeaconTxEnX_LSB 0x14
+#define QIB_6120_SerdesCfg0_BeaconTxEnX_RMASK 0xF
+#define QIB_6120_SerdesCfg0_RxDetEnX_LSB 0x10
+#define QIB_6120_SerdesCfg0_RxDetEnX_RMASK 0xF
+#define QIB_6120_SerdesCfg0_TxIdeEnX_LSB 0xC
+#define QIB_6120_SerdesCfg0_TxIdeEnX_RMASK 0xF
+#define QIB_6120_SerdesCfg0_RxIdleEnX_LSB 0x8
+#define QIB_6120_SerdesCfg0_RxIdleEnX_RMASK 0xF
+#define QIB_6120_SerdesCfg0_L1PwrDnA_LSB 0x7
+#define QIB_6120_SerdesCfg0_L1PwrDnA_RMASK 0x1
+#define QIB_6120_SerdesCfg0_L1PwrDnB_LSB 0x6
+#define QIB_6120_SerdesCfg0_L1PwrDnB_RMASK 0x1
+#define QIB_6120_SerdesCfg0_L1PwrDnC_LSB 0x5
+#define QIB_6120_SerdesCfg0_L1PwrDnC_RMASK 0x1
+#define QIB_6120_SerdesCfg0_L1PwrDnD_LSB 0x4
+#define QIB_6120_SerdesCfg0_L1PwrDnD_RMASK 0x1
+#define QIB_6120_SerdesCfg0_ResetA_LSB 0x3
+#define QIB_6120_SerdesCfg0_ResetA_RMASK 0x1
+#define QIB_6120_SerdesCfg0_ResetB_LSB 0x2
+#define QIB_6120_SerdesCfg0_ResetB_RMASK 0x1
+#define QIB_6120_SerdesCfg0_ResetC_LSB 0x1
+#define QIB_6120_SerdesCfg0_ResetC_RMASK 0x1
+#define QIB_6120_SerdesCfg0_ResetD_LSB 0x0
+#define QIB_6120_SerdesCfg0_ResetD_RMASK 0x1
+
+#define QIB_6120_SerdesStat_OFFS 0x3D0
+#define QIB_6120_SerdesStat_Reserved_LSB 0xC
+#define QIB_6120_SerdesStat_Reserved_RMASK 0xFFFFFFFFFFFFF
+#define QIB_6120_SerdesStat_BeaconDetA_LSB 0xB
+#define QIB_6120_SerdesStat_BeaconDetA_RMASK 0x1
+#define QIB_6120_SerdesStat_BeaconDetB_LSB 0xA
+#define QIB_6120_SerdesStat_BeaconDetB_RMASK 0x1
+#define QIB_6120_SerdesStat_BeaconDetC_LSB 0x9
+#define QIB_6120_SerdesStat_BeaconDetC_RMASK 0x1
+#define QIB_6120_SerdesStat_BeaconDetD_LSB 0x8
+#define QIB_6120_SerdesStat_BeaconDetD_RMASK 0x1
+#define QIB_6120_SerdesStat_RxDetA_LSB 0x7
+#define QIB_6120_SerdesStat_RxDetA_RMASK 0x1
+#define QIB_6120_SerdesStat_RxDetB_LSB 0x6
+#define QIB_6120_SerdesStat_RxDetB_RMASK 0x1
+#define QIB_6120_SerdesStat_RxDetC_LSB 0x5
+#define QIB_6120_SerdesStat_RxDetC_RMASK 0x1
+#define QIB_6120_SerdesStat_RxDetD_LSB 0x4
+#define QIB_6120_SerdesStat_RxDetD_RMASK 0x1
+#define QIB_6120_SerdesStat_TxIdleDetA_LSB 0x3
+#define QIB_6120_SerdesStat_TxIdleDetA_RMASK 0x1
+#define QIB_6120_SerdesStat_TxIdleDetB_LSB 0x2
+#define QIB_6120_SerdesStat_TxIdleDetB_RMASK 0x1
+#define QIB_6120_SerdesStat_TxIdleDetC_LSB 0x1
+#define QIB_6120_SerdesStat_TxIdleDetC_RMASK 0x1
+#define QIB_6120_SerdesStat_TxIdleDetD_LSB 0x0
+#define QIB_6120_SerdesStat_TxIdleDetD_RMASK 0x1
+
+#define QIB_6120_XGXSCfg_OFFS 0x3D8
+#define QIB_6120_XGXSCfg_ArmLaunchErrorDisable_LSB 0x3F
+#define QIB_6120_XGXSCfg_ArmLaunchErrorDisable_RMASK 0x1
+#define QIB_6120_XGXSCfg_Reserved_LSB 0x17
+#define QIB_6120_XGXSCfg_Reserved_RMASK 0xFFFFFFFFFF
+#define QIB_6120_XGXSCfg_polarity_inv_LSB 0x13
+#define QIB_6120_XGXSCfg_polarity_inv_RMASK 0xF
+#define QIB_6120_XGXSCfg_link_sync_mask_LSB 0x9
+#define QIB_6120_XGXSCfg_link_sync_mask_RMASK 0x3FF
+#define QIB_6120_XGXSCfg_port_addr_LSB 0x4
+#define QIB_6120_XGXSCfg_port_addr_RMASK 0x1F
+#define QIB_6120_XGXSCfg_mdd_30_LSB 0x3
+#define QIB_6120_XGXSCfg_mdd_30_RMASK 0x1
+#define QIB_6120_XGXSCfg_xcv_resetn_LSB 0x2
+#define QIB_6120_XGXSCfg_xcv_resetn_RMASK 0x1
+#define QIB_6120_XGXSCfg_Reserved1_LSB 0x1
+#define QIB_6120_XGXSCfg_Reserved1_RMASK 0x1
+#define QIB_6120_XGXSCfg_tx_rx_resetn_LSB 0x0
+#define QIB_6120_XGXSCfg_tx_rx_resetn_RMASK 0x1
+
+#define QIB_6120_LBIntCnt_OFFS 0x12000
+
+#define QIB_6120_LBFlowStallCnt_OFFS 0x12008
+
+#define QIB_6120_TxUnsupVLErrCnt_OFFS 0x12018
+
+#define QIB_6120_TxDataPktCnt_OFFS 0x12020
+
+#define QIB_6120_TxFlowPktCnt_OFFS 0x12028
+
+#define QIB_6120_TxDwordCnt_OFFS 0x12030
+
+#define QIB_6120_TxLenErrCnt_OFFS 0x12038
+
+#define QIB_6120_TxMaxMinLenErrCnt_OFFS 0x12040
+
+#define QIB_6120_TxUnderrunCnt_OFFS 0x12048
+
+#define QIB_6120_TxFlowStallCnt_OFFS 0x12050
+
+#define QIB_6120_TxDroppedPktCnt_OFFS 0x12058
+
+#define QIB_6120_RxDroppedPktCnt_OFFS 0x12060
+
+#define QIB_6120_RxDataPktCnt_OFFS 0x12068
+
+#define QIB_6120_RxFlowPktCnt_OFFS 0x12070
+
+#define QIB_6120_RxDwordCnt_OFFS 0x12078
+
+#define QIB_6120_RxLenErrCnt_OFFS 0x12080
+
+#define QIB_6120_RxMaxMinLenErrCnt_OFFS 0x12088
+
+#define QIB_6120_RxICRCErrCnt_OFFS 0x12090
+
+#define QIB_6120_RxVCRCErrCnt_OFFS 0x12098
+
+#define QIB_6120_RxFlowCtrlErrCnt_OFFS 0x120A0
+
+#define QIB_6120_RxBadFormatCnt_OFFS 0x120A8
+
+#define QIB_6120_RxLinkProblemCnt_OFFS 0x120B0
+
+#define QIB_6120_RxEBPCnt_OFFS 0x120B8
+
+#define QIB_6120_RxLPCRCErrCnt_OFFS 0x120C0
+
+#define QIB_6120_RxBufOvflCnt_OFFS 0x120C8
+
+#define QIB_6120_RxTIDFullErrCnt_OFFS 0x120D0
+
+#define QIB_6120_RxTIDValidErrCnt_OFFS 0x120D8
+
+#define QIB_6120_RxPKeyMismatchCnt_OFFS 0x120E0
+
+#define QIB_6120_RxP0HdrEgrOvflCnt_OFFS 0x120E8
+
+#define QIB_6120_IBStatusChangeCnt_OFFS 0x12140
+
+#define QIB_6120_IBLinkErrRecoveryCnt_OFFS 0x12148
+
+#define QIB_6120_IBLinkDownedCnt_OFFS 0x12150
+
+#define QIB_6120_IBSymbolErrCnt_OFFS 0x12158
+
+#define QIB_6120_PcieRetryBufDiagQwordCnt_OFFS 0x12170
+
+#define QIB_6120_RcvEgrArray0_OFFS 0x14000
+
+#define QIB_6120_RcvTIDArray0_OFFS 0x54000
+
+#define QIB_6120_PIOLaunchFIFO_OFFS 0x64000
+
+#define QIB_6120_SendPIOpbcCache_OFFS 0x64800
+
+#define QIB_6120_RcvBuf1_OFFS 0x72000
+
+#define QIB_6120_RcvBuf2_OFFS 0x75000
+
+#define QIB_6120_RcvFlags_OFFS 0x77000
+
+#define QIB_6120_RcvLookupBuf1_OFFS 0x79000
+
+#define QIB_6120_RcvDMABuf_OFFS 0x7B000
+
+#define QIB_6120_MiscRXEIntMem_OFFS 0x7C000
+
+#define QIB_6120_PCIERcvBuf_OFFS 0x80000
+
+#define QIB_6120_PCIERetryBuf_OFFS 0x82000
+
+#define QIB_6120_PCIERcvBufRdToWrAddr_OFFS 0x84000
+
+#define QIB_6120_PIOBuf0_MA_OFFS 0x100000
diff --git a/drivers/infiniband/hw/qib/qib_7220.h b/drivers/infiniband/hw/qib/qib_7220.h
new file mode 100644
index 0000000..a5356cb
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_7220.h
@@ -0,0 +1,149 @@
+#ifndef _QIB_7220_H
+#define _QIB_7220_H
+/*
+ * Copyright (c) 2007, 2009, 2010 QLogic Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/* grab register-defs auto-generated by HW */
+#include "qib_7220_regs.h"
+
+/* The number of eager receive TIDs for context zero. */
+#define IBA7220_KRCVEGRCNT      2048U
+
+#define IB_7220_LT_STATE_CFGRCVFCFG      0x09
+#define IB_7220_LT_STATE_CFGWAITRMT      0x0a
+#define IB_7220_LT_STATE_TXREVLANES      0x0d
+#define IB_7220_LT_STATE_CFGENH          0x10
+
+struct qib_chip_specific {
+	u64 __iomem *cregbase;
+	u64 *cntrs;
+	u64 *portcntrs;
+	spinlock_t sdepb_lock; /* serdes EPB bus */
+	spinlock_t rcvmod_lock; /* protect rcvctrl shadow changes */
+	spinlock_t gpio_lock; /* RMW of shadows/regs for ExtCtrl and GPIO */
+	u64 hwerrmask;
+	u64 errormask;
+	u64 gpio_out; /* shadow of kr_gpio_out, for rmw ops */
+	u64 gpio_mask; /* shadow the gpio mask register */
+	u64 extctrl; /* shadow the gpio output enable, etc... */
+	u32 ncntrs;
+	u32 nportcntrs;
+	u32 cntrnamelen;
+	u32 portcntrnamelen;
+	u32 numctxts;
+	u32 rcvegrcnt;
+	u32 autoneg_tries;
+	u32 serdes_first_init_done;
+	u32 sdmabufcnt;
+	u32 lastbuf_for_pio;
+	u32 updthresh; /* current AvailUpdThld */
+	u32 updthresh_dflt; /* default AvailUpdThld */
+	int irq;
+	u8 presets_needed;
+	u8 relock_timer_active;
+	char emsgbuf[128];
+	char sdmamsgbuf[192];
+	char bitsmsgbuf[64];
+	struct timer_list relock_timer;
+	unsigned int relock_interval; /* in jiffies */
+};
+
+struct qib_chippport_specific {
+	struct qib_pportdata pportdata;
+	wait_queue_head_t autoneg_wait;
+	struct delayed_work autoneg_work;
+	struct timer_list chase_timer;
+	/*
+	 * these 5 fields are used to establish deltas for IB symbol
+	 * errors and linkrecovery errors.  They can be reported on
+	 * some chips during link negotiation prior to INIT, and with
+	 * DDR when faking DDR negotiations with non-IBTA switches.
+	 * The chip counters are adjusted at driver unload if there is
+	 * a non-zero delta.
+	 */
+	u64 ibdeltainprog;
+	u64 ibsymdelta;
+	u64 ibsymsnap;
+	u64 iblnkerrdelta;
+	u64 iblnkerrsnap;
+	u64 ibcctrl; /* kr_ibcctrl shadow */
+	u64 ibcddrctrl; /* kr_ibcddrctrl shadow */
+	unsigned long chase_end;
+	u32 last_delay_mult;
+};
+
+/*
+ * This header file provides the declarations and common definitions
+ * for (mostly) manipulation of the SerDes blocks within the IBA7220.
+ * the functions declared should only be called from within other
+ * 7220-related files such as qib_iba7220.c or qib_sd7220.c.
+ */
+int qib_sd7220_presets(struct qib_devdata *dd);
+int qib_sd7220_init(struct qib_devdata *dd);
+void qib_sd7220_clr_ibpar(struct qib_devdata *);
+/*
+ * Below used for sdnum parameter, selecting one of the two sections
+ * used for PCIe, or the single SerDes used for IB, which is the
+ * only one currently used
+ */
+#define IB_7220_SERDES 2
+
+static inline u32 qib_read_kreg32(const struct qib_devdata *dd,
+				  const u16 regno)
+{
+	if (!dd->kregbase || !(dd->flags & QIB_PRESENT))
+		return -1;
+	return readl((u32 __iomem *)&dd->kregbase[regno]);
+}
+
+static inline u64 qib_read_kreg64(const struct qib_devdata *dd,
+				  const u16 regno)
+{
+	if (!dd->kregbase || !(dd->flags & QIB_PRESENT))
+		return -1;
+
+	return readq(&dd->kregbase[regno]);
+}
+
+static inline void qib_write_kreg(const struct qib_devdata *dd,
+				  const u16 regno, u64 value)
+{
+	if (dd->kregbase)
+		writeq(value, &dd->kregbase[regno]);
+}
+
+void set_7220_relock_poll(struct qib_devdata *, int);
+void shutdown_7220_relock_poll(struct qib_devdata *);
+void toggle_7220_rclkrls(struct qib_devdata *);
+
+
+#endif /* _QIB_7220_H */
diff --git a/drivers/infiniband/hw/qib/qib_7220_regs.h b/drivers/infiniband/hw/qib/qib_7220_regs.h
new file mode 100644
index 0000000..0da5bb7
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_7220_regs.h
@@ -0,0 +1,1496 @@
+/*
+ * Copyright (c) 2008, 2009, 2010 QLogic Corporation. All rights reserved.
+ *
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+/* This file is mechanically generated from RTL. Any hand-edits will be lost! */
+
+#define QIB_7220_Revision_OFFS 0x0
+#define QIB_7220_Revision_R_Simulator_LSB 0x3F
+#define QIB_7220_Revision_R_Simulator_RMASK 0x1
+#define QIB_7220_Revision_R_Emulation_LSB 0x3E
+#define QIB_7220_Revision_R_Emulation_RMASK 0x1
+#define QIB_7220_Revision_R_Emulation_Revcode_LSB 0x28
+#define QIB_7220_Revision_R_Emulation_Revcode_RMASK 0x3FFFFF
+#define QIB_7220_Revision_BoardID_LSB 0x20
+#define QIB_7220_Revision_BoardID_RMASK 0xFF
+#define QIB_7220_Revision_R_SW_LSB 0x18
+#define QIB_7220_Revision_R_SW_RMASK 0xFF
+#define QIB_7220_Revision_R_Arch_LSB 0x10
+#define QIB_7220_Revision_R_Arch_RMASK 0xFF
+#define QIB_7220_Revision_R_ChipRevMajor_LSB 0x8
+#define QIB_7220_Revision_R_ChipRevMajor_RMASK 0xFF
+#define QIB_7220_Revision_R_ChipRevMinor_LSB 0x0
+#define QIB_7220_Revision_R_ChipRevMinor_RMASK 0xFF
+
+#define QIB_7220_Control_OFFS 0x8
+#define QIB_7220_Control_SyncResetExceptPcieIRAMRST_LSB 0x7
+#define QIB_7220_Control_SyncResetExceptPcieIRAMRST_RMASK 0x1
+#define QIB_7220_Control_PCIECplQDiagEn_LSB 0x6
+#define QIB_7220_Control_PCIECplQDiagEn_RMASK 0x1
+#define QIB_7220_Control_Reserved_LSB 0x5
+#define QIB_7220_Control_Reserved_RMASK 0x1
+#define QIB_7220_Control_TxLatency_LSB 0x4
+#define QIB_7220_Control_TxLatency_RMASK 0x1
+#define QIB_7220_Control_PCIERetryBufDiagEn_LSB 0x3
+#define QIB_7220_Control_PCIERetryBufDiagEn_RMASK 0x1
+#define QIB_7220_Control_LinkEn_LSB 0x2
+#define QIB_7220_Control_LinkEn_RMASK 0x1
+#define QIB_7220_Control_FreezeMode_LSB 0x1
+#define QIB_7220_Control_FreezeMode_RMASK 0x1
+#define QIB_7220_Control_SyncReset_LSB 0x0
+#define QIB_7220_Control_SyncReset_RMASK 0x1
+
+#define QIB_7220_PageAlign_OFFS 0x10
+
+#define QIB_7220_PortCnt_OFFS 0x18
+
+#define QIB_7220_SendRegBase_OFFS 0x30
+
+#define QIB_7220_UserRegBase_OFFS 0x38
+
+#define QIB_7220_CntrRegBase_OFFS 0x40
+
+#define QIB_7220_Scratch_OFFS 0x48
+
+#define QIB_7220_IntMask_OFFS 0x68
+#define QIB_7220_IntMask_SDmaIntMask_LSB 0x3F
+#define QIB_7220_IntMask_SDmaIntMask_RMASK 0x1
+#define QIB_7220_IntMask_SDmaDisabledMasked_LSB 0x3E
+#define QIB_7220_IntMask_SDmaDisabledMasked_RMASK 0x1
+#define QIB_7220_IntMask_Reserved_LSB 0x31
+#define QIB_7220_IntMask_Reserved_RMASK 0x1FFF
+#define QIB_7220_IntMask_RcvUrg16IntMask_LSB 0x30
+#define QIB_7220_IntMask_RcvUrg16IntMask_RMASK 0x1
+#define QIB_7220_IntMask_RcvUrg15IntMask_LSB 0x2F
+#define QIB_7220_IntMask_RcvUrg15IntMask_RMASK 0x1
+#define QIB_7220_IntMask_RcvUrg14IntMask_LSB 0x2E
+#define QIB_7220_IntMask_RcvUrg14IntMask_RMASK 0x1
+#define QIB_7220_IntMask_RcvUrg13IntMask_LSB 0x2D
+#define QIB_7220_IntMask_RcvUrg13IntMask_RMASK 0x1
+#define QIB_7220_IntMask_RcvUrg12IntMask_LSB 0x2C
+#define QIB_7220_IntMask_RcvUrg12IntMask_RMASK 0x1
+#define QIB_7220_IntMask_RcvUrg11IntMask_LSB 0x2B
+#define QIB_7220_IntMask_RcvUrg11IntMask_RMASK 0x1
+#define QIB_7220_IntMask_RcvUrg10IntMask_LSB 0x2A
+#define QIB_7220_IntMask_RcvUrg10IntMask_RMASK 0x1
+#define QIB_7220_IntMask_RcvUrg9IntMask_LSB 0x29
+#define QIB_7220_IntMask_RcvUrg9IntMask_RMASK 0x1
+#define QIB_7220_IntMask_RcvUrg8IntMask_LSB 0x28
+#define QIB_7220_IntMask_RcvUrg8IntMask_RMASK 0x1
+#define QIB_7220_IntMask_RcvUrg7IntMask_LSB 0x27
+#define QIB_7220_IntMask_RcvUrg7IntMask_RMASK 0x1
+#define QIB_7220_IntMask_RcvUrg6IntMask_LSB 0x26
+#define QIB_7220_IntMask_RcvUrg6IntMask_RMASK 0x1
+#define QIB_7220_IntMask_RcvUrg5IntMask_LSB 0x25
+#define QIB_7220_IntMask_RcvUrg5IntMask_RMASK 0x1
+#define QIB_7220_IntMask_RcvUrg4IntMask_LSB 0x24
+#define QIB_7220_IntMask_RcvUrg4IntMask_RMASK 0x1
+#define QIB_7220_IntMask_RcvUrg3IntMask_LSB 0x23
+#define QIB_7220_IntMask_RcvUrg3IntMask_RMASK 0x1
+#define QIB_7220_IntMask_RcvUrg2IntMask_LSB 0x22
+#define QIB_7220_IntMask_RcvUrg2IntMask_RMASK 0x1
+#define QIB_7220_IntMask_RcvUrg1IntMask_LSB 0x21
+#define QIB_7220_IntMask_RcvUrg1IntMask_RMASK 0x1
+#define QIB_7220_IntMask_RcvUrg0IntMask_LSB 0x20
+#define QIB_7220_IntMask_RcvUrg0IntMask_RMASK 0x1
+#define QIB_7220_IntMask_ErrorIntMask_LSB 0x1F
+#define QIB_7220_IntMask_ErrorIntMask_RMASK 0x1
+#define QIB_7220_IntMask_PioSetIntMask_LSB 0x1E
+#define QIB_7220_IntMask_PioSetIntMask_RMASK 0x1
+#define QIB_7220_IntMask_PioBufAvailIntMask_LSB 0x1D
+#define QIB_7220_IntMask_PioBufAvailIntMask_RMASK 0x1
+#define QIB_7220_IntMask_assertGPIOIntMask_LSB 0x1C
+#define QIB_7220_IntMask_assertGPIOIntMask_RMASK 0x1
+#define QIB_7220_IntMask_IBSerdesTrimDoneIntMask_LSB 0x1B
+#define QIB_7220_IntMask_IBSerdesTrimDoneIntMask_RMASK 0x1
+#define QIB_7220_IntMask_JIntMask_LSB 0x1A
+#define QIB_7220_IntMask_JIntMask_RMASK 0x1
+#define QIB_7220_IntMask_Reserved1_LSB 0x11
+#define QIB_7220_IntMask_Reserved1_RMASK 0x1FF
+#define QIB_7220_IntMask_RcvAvail16IntMask_LSB 0x10
+#define QIB_7220_IntMask_RcvAvail16IntMask_RMASK 0x1
+#define QIB_7220_IntMask_RcvAvail15IntMask_LSB 0xF
+#define QIB_7220_IntMask_RcvAvail15IntMask_RMASK 0x1
+#define QIB_7220_IntMask_RcvAvail14IntMask_LSB 0xE
+#define QIB_7220_IntMask_RcvAvail14IntMask_RMASK 0x1
+#define QIB_7220_IntMask_RcvAvail13IntMask_LSB 0xD
+#define QIB_7220_IntMask_RcvAvail13IntMask_RMASK 0x1
+#define QIB_7220_IntMask_RcvAvail12IntMask_LSB 0xC
+#define QIB_7220_IntMask_RcvAvail12IntMask_RMASK 0x1
+#define QIB_7220_IntMask_RcvAvail11IntMask_LSB 0xB
+#define QIB_7220_IntMask_RcvAvail11IntMask_RMASK 0x1
+#define QIB_7220_IntMask_RcvAvail10IntMask_LSB 0xA
+#define QIB_7220_IntMask_RcvAvail10IntMask_RMASK 0x1
+#define QIB_7220_IntMask_RcvAvail9IntMask_LSB 0x9
+#define QIB_7220_IntMask_RcvAvail9IntMask_RMASK 0x1
+#define QIB_7220_IntMask_RcvAvail8IntMask_LSB 0x8
+#define QIB_7220_IntMask_RcvAvail8IntMask_RMASK 0x1
+#define QIB_7220_IntMask_RcvAvail7IntMask_LSB 0x7
+#define QIB_7220_IntMask_RcvAvail7IntMask_RMASK 0x1
+#define QIB_7220_IntMask_RcvAvail6IntMask_LSB 0x6
+#define QIB_7220_IntMask_RcvAvail6IntMask_RMASK 0x1
+#define QIB_7220_IntMask_RcvAvail5IntMask_LSB 0x5
+#define QIB_7220_IntMask_RcvAvail5IntMask_RMASK 0x1
+#define QIB_7220_IntMask_RcvAvail4IntMask_LSB 0x4
+#define QIB_7220_IntMask_RcvAvail4IntMask_RMASK 0x1
+#define QIB_7220_IntMask_RcvAvail3IntMask_LSB 0x3
+#define QIB_7220_IntMask_RcvAvail3IntMask_RMASK 0x1
+#define QIB_7220_IntMask_RcvAvail2IntMask_LSB 0x2
+#define QIB_7220_IntMask_RcvAvail2IntMask_RMASK 0x1
+#define QIB_7220_IntMask_RcvAvail1IntMask_LSB 0x1
+#define QIB_7220_IntMask_RcvAvail1IntMask_RMASK 0x1
+#define QIB_7220_IntMask_RcvAvail0IntMask_LSB 0x0
+#define QIB_7220_IntMask_RcvAvail0IntMask_RMASK 0x1
+
+#define QIB_7220_IntStatus_OFFS 0x70
+#define QIB_7220_IntStatus_SDmaInt_LSB 0x3F
+#define QIB_7220_IntStatus_SDmaInt_RMASK 0x1
+#define QIB_7220_IntStatus_SDmaDisabled_LSB 0x3E
+#define QIB_7220_IntStatus_SDmaDisabled_RMASK 0x1
+#define QIB_7220_IntStatus_Reserved_LSB 0x31
+#define QIB_7220_IntStatus_Reserved_RMASK 0x1FFF
+#define QIB_7220_IntStatus_RcvUrg16_LSB 0x30
+#define QIB_7220_IntStatus_RcvUrg16_RMASK 0x1
+#define QIB_7220_IntStatus_RcvUrg15_LSB 0x2F
+#define QIB_7220_IntStatus_RcvUrg15_RMASK 0x1
+#define QIB_7220_IntStatus_RcvUrg14_LSB 0x2E
+#define QIB_7220_IntStatus_RcvUrg14_RMASK 0x1
+#define QIB_7220_IntStatus_RcvUrg13_LSB 0x2D
+#define QIB_7220_IntStatus_RcvUrg13_RMASK 0x1
+#define QIB_7220_IntStatus_RcvUrg12_LSB 0x2C
+#define QIB_7220_IntStatus_RcvUrg12_RMASK 0x1
+#define QIB_7220_IntStatus_RcvUrg11_LSB 0x2B
+#define QIB_7220_IntStatus_RcvUrg11_RMASK 0x1
+#define QIB_7220_IntStatus_RcvUrg10_LSB 0x2A
+#define QIB_7220_IntStatus_RcvUrg10_RMASK 0x1
+#define QIB_7220_IntStatus_RcvUrg9_LSB 0x29
+#define QIB_7220_IntStatus_RcvUrg9_RMASK 0x1
+#define QIB_7220_IntStatus_RcvUrg8_LSB 0x28
+#define QIB_7220_IntStatus_RcvUrg8_RMASK 0x1
+#define QIB_7220_IntStatus_RcvUrg7_LSB 0x27
+#define QIB_7220_IntStatus_RcvUrg7_RMASK 0x1
+#define QIB_7220_IntStatus_RcvUrg6_LSB 0x26
+#define QIB_7220_IntStatus_RcvUrg6_RMASK 0x1
+#define QIB_7220_IntStatus_RcvUrg5_LSB 0x25
+#define QIB_7220_IntStatus_RcvUrg5_RMASK 0x1
+#define QIB_7220_IntStatus_RcvUrg4_LSB 0x24
+#define QIB_7220_IntStatus_RcvUrg4_RMASK 0x1
+#define QIB_7220_IntStatus_RcvUrg3_LSB 0x23
+#define QIB_7220_IntStatus_RcvUrg3_RMASK 0x1
+#define QIB_7220_IntStatus_RcvUrg2_LSB 0x22
+#define QIB_7220_IntStatus_RcvUrg2_RMASK 0x1
+#define QIB_7220_IntStatus_RcvUrg1_LSB 0x21
+#define QIB_7220_IntStatus_RcvUrg1_RMASK 0x1
+#define QIB_7220_IntStatus_RcvUrg0_LSB 0x20
+#define QIB_7220_IntStatus_RcvUrg0_RMASK 0x1
+#define QIB_7220_IntStatus_Error_LSB 0x1F
+#define QIB_7220_IntStatus_Error_RMASK 0x1
+#define QIB_7220_IntStatus_PioSent_LSB 0x1E
+#define QIB_7220_IntStatus_PioSent_RMASK 0x1
+#define QIB_7220_IntStatus_PioBufAvail_LSB 0x1D
+#define QIB_7220_IntStatus_PioBufAvail_RMASK 0x1
+#define QIB_7220_IntStatus_assertGPIO_LSB 0x1C
+#define QIB_7220_IntStatus_assertGPIO_RMASK 0x1
+#define QIB_7220_IntStatus_IBSerdesTrimDone_LSB 0x1B
+#define QIB_7220_IntStatus_IBSerdesTrimDone_RMASK 0x1
+#define QIB_7220_IntStatus_JInt_LSB 0x1A
+#define QIB_7220_IntStatus_JInt_RMASK 0x1
+#define QIB_7220_IntStatus_Reserved1_LSB 0x11
+#define QIB_7220_IntStatus_Reserved1_RMASK 0x1FF
+#define QIB_7220_IntStatus_RcvAvail16_LSB 0x10
+#define QIB_7220_IntStatus_RcvAvail16_RMASK 0x1
+#define QIB_7220_IntStatus_RcvAvail15_LSB 0xF
+#define QIB_7220_IntStatus_RcvAvail15_RMASK 0x1
+#define QIB_7220_IntStatus_RcvAvail14_LSB 0xE
+#define QIB_7220_IntStatus_RcvAvail14_RMASK 0x1
+#define QIB_7220_IntStatus_RcvAvail13_LSB 0xD
+#define QIB_7220_IntStatus_RcvAvail13_RMASK 0x1
+#define QIB_7220_IntStatus_RcvAvail12_LSB 0xC
+#define QIB_7220_IntStatus_RcvAvail12_RMASK 0x1
+#define QIB_7220_IntStatus_RcvAvail11_LSB 0xB
+#define QIB_7220_IntStatus_RcvAvail11_RMASK 0x1
+#define QIB_7220_IntStatus_RcvAvail10_LSB 0xA
+#define QIB_7220_IntStatus_RcvAvail10_RMASK 0x1
+#define QIB_7220_IntStatus_RcvAvail9_LSB 0x9
+#define QIB_7220_IntStatus_RcvAvail9_RMASK 0x1
+#define QIB_7220_IntStatus_RcvAvail8_LSB 0x8
+#define QIB_7220_IntStatus_RcvAvail8_RMASK 0x1
+#define QIB_7220_IntStatus_RcvAvail7_LSB 0x7
+#define QIB_7220_IntStatus_RcvAvail7_RMASK 0x1
+#define QIB_7220_IntStatus_RcvAvail6_LSB 0x6
+#define QIB_7220_IntStatus_RcvAvail6_RMASK 0x1
+#define QIB_7220_IntStatus_RcvAvail5_LSB 0x5
+#define QIB_7220_IntStatus_RcvAvail5_RMASK 0x1
+#define QIB_7220_IntStatus_RcvAvail4_LSB 0x4
+#define QIB_7220_IntStatus_RcvAvail4_RMASK 0x1
+#define QIB_7220_IntStatus_RcvAvail3_LSB 0x3
+#define QIB_7220_IntStatus_RcvAvail3_RMASK 0x1
+#define QIB_7220_IntStatus_RcvAvail2_LSB 0x2
+#define QIB_7220_IntStatus_RcvAvail2_RMASK 0x1
+#define QIB_7220_IntStatus_RcvAvail1_LSB 0x1
+#define QIB_7220_IntStatus_RcvAvail1_RMASK 0x1
+#define QIB_7220_IntStatus_RcvAvail0_LSB 0x0
+#define QIB_7220_IntStatus_RcvAvail0_RMASK 0x1
+
+#define QIB_7220_IntClear_OFFS 0x78
+#define QIB_7220_IntClear_SDmaIntClear_LSB 0x3F
+#define QIB_7220_IntClear_SDmaIntClear_RMASK 0x1
+#define QIB_7220_IntClear_SDmaDisabledClear_LSB 0x3E
+#define QIB_7220_IntClear_SDmaDisabledClear_RMASK 0x1
+#define QIB_7220_IntClear_Reserved_LSB 0x31
+#define QIB_7220_IntClear_Reserved_RMASK 0x1FFF
+#define QIB_7220_IntClear_RcvUrg16IntClear_LSB 0x30
+#define QIB_7220_IntClear_RcvUrg16IntClear_RMASK 0x1
+#define QIB_7220_IntClear_RcvUrg15IntClear_LSB 0x2F
+#define QIB_7220_IntClear_RcvUrg15IntClear_RMASK 0x1
+#define QIB_7220_IntClear_RcvUrg14IntClear_LSB 0x2E
+#define QIB_7220_IntClear_RcvUrg14IntClear_RMASK 0x1
+#define QIB_7220_IntClear_RcvUrg13IntClear_LSB 0x2D
+#define QIB_7220_IntClear_RcvUrg13IntClear_RMASK 0x1
+#define QIB_7220_IntClear_RcvUrg12IntClear_LSB 0x2C
+#define QIB_7220_IntClear_RcvUrg12IntClear_RMASK 0x1
+#define QIB_7220_IntClear_RcvUrg11IntClear_LSB 0x2B
+#define QIB_7220_IntClear_RcvUrg11IntClear_RMASK 0x1
+#define QIB_7220_IntClear_RcvUrg10IntClear_LSB 0x2A
+#define QIB_7220_IntClear_RcvUrg10IntClear_RMASK 0x1
+#define QIB_7220_IntClear_RcvUrg9IntClear_LSB 0x29
+#define QIB_7220_IntClear_RcvUrg9IntClear_RMASK 0x1
+#define QIB_7220_IntClear_RcvUrg8IntClear_LSB 0x28
+#define QIB_7220_IntClear_RcvUrg8IntClear_RMASK 0x1
+#define QIB_7220_IntClear_RcvUrg7IntClear_LSB 0x27
+#define QIB_7220_IntClear_RcvUrg7IntClear_RMASK 0x1
+#define QIB_7220_IntClear_RcvUrg6IntClear_LSB 0x26
+#define QIB_7220_IntClear_RcvUrg6IntClear_RMASK 0x1
+#define QIB_7220_IntClear_RcvUrg5IntClear_LSB 0x25
+#define QIB_7220_IntClear_RcvUrg5IntClear_RMASK 0x1
+#define QIB_7220_IntClear_RcvUrg4IntClear_LSB 0x24
+#define QIB_7220_IntClear_RcvUrg4IntClear_RMASK 0x1
+#define QIB_7220_IntClear_RcvUrg3IntClear_LSB 0x23
+#define QIB_7220_IntClear_RcvUrg3IntClear_RMASK 0x1
+#define QIB_7220_IntClear_RcvUrg2IntClear_LSB 0x22
+#define QIB_7220_IntClear_RcvUrg2IntClear_RMASK 0x1
+#define QIB_7220_IntClear_RcvUrg1IntClear_LSB 0x21
+#define QIB_7220_IntClear_RcvUrg1IntClear_RMASK 0x1
+#define QIB_7220_IntClear_RcvUrg0IntClear_LSB 0x20
+#define QIB_7220_IntClear_RcvUrg0IntClear_RMASK 0x1
+#define QIB_7220_IntClear_ErrorIntClear_LSB 0x1F
+#define QIB_7220_IntClear_ErrorIntClear_RMASK 0x1
+#define QIB_7220_IntClear_PioSetIntClear_LSB 0x1E
+#define QIB_7220_IntClear_PioSetIntClear_RMASK 0x1
+#define QIB_7220_IntClear_PioBufAvailIntClear_LSB 0x1D
+#define QIB_7220_IntClear_PioBufAvailIntClear_RMASK 0x1
+#define QIB_7220_IntClear_assertGPIOIntClear_LSB 0x1C
+#define QIB_7220_IntClear_assertGPIOIntClear_RMASK 0x1
+#define QIB_7220_IntClear_IBSerdesTrimDoneClear_LSB 0x1B
+#define QIB_7220_IntClear_IBSerdesTrimDoneClear_RMASK 0x1
+#define QIB_7220_IntClear_JIntClear_LSB 0x1A
+#define QIB_7220_IntClear_JIntClear_RMASK 0x1
+#define QIB_7220_IntClear_Reserved1_LSB 0x11
+#define QIB_7220_IntClear_Reserved1_RMASK 0x1FF
+#define QIB_7220_IntClear_RcvAvail16IntClear_LSB 0x10
+#define QIB_7220_IntClear_RcvAvail16IntClear_RMASK 0x1
+#define QIB_7220_IntClear_RcvAvail15IntClear_LSB 0xF
+#define QIB_7220_IntClear_RcvAvail15IntClear_RMASK 0x1
+#define QIB_7220_IntClear_RcvAvail14IntClear_LSB 0xE
+#define QIB_7220_IntClear_RcvAvail14IntClear_RMASK 0x1
+#define QIB_7220_IntClear_RcvAvail13IntClear_LSB 0xD
+#define QIB_7220_IntClear_RcvAvail13IntClear_RMASK 0x1
+#define QIB_7220_IntClear_RcvAvail12IntClear_LSB 0xC
+#define QIB_7220_IntClear_RcvAvail12IntClear_RMASK 0x1
+#define QIB_7220_IntClear_RcvAvail11IntClear_LSB 0xB
+#define QIB_7220_IntClear_RcvAvail11IntClear_RMASK 0x1
+#define QIB_7220_IntClear_RcvAvail10IntClear_LSB 0xA
+#define QIB_7220_IntClear_RcvAvail10IntClear_RMASK 0x1
+#define QIB_7220_IntClear_RcvAvail9IntClear_LSB 0x9
+#define QIB_7220_IntClear_RcvAvail9IntClear_RMASK 0x1
+#define QIB_7220_IntClear_RcvAvail8IntClear_LSB 0x8
+#define QIB_7220_IntClear_RcvAvail8IntClear_RMASK 0x1
+#define QIB_7220_IntClear_RcvAvail7IntClear_LSB 0x7
+#define QIB_7220_IntClear_RcvAvail7IntClear_RMASK 0x1
+#define QIB_7220_IntClear_RcvAvail6IntClear_LSB 0x6
+#define QIB_7220_IntClear_RcvAvail6IntClear_RMASK 0x1
+#define QIB_7220_IntClear_RcvAvail5IntClear_LSB 0x5
+#define QIB_7220_IntClear_RcvAvail5IntClear_RMASK 0x1
+#define QIB_7220_IntClear_RcvAvail4IntClear_LSB 0x4
+#define QIB_7220_IntClear_RcvAvail4IntClear_RMASK 0x1
+#define QIB_7220_IntClear_RcvAvail3IntClear_LSB 0x3
+#define QIB_7220_IntClear_RcvAvail3IntClear_RMASK 0x1
+#define QIB_7220_IntClear_RcvAvail2IntClear_LSB 0x2
+#define QIB_7220_IntClear_RcvAvail2IntClear_RMASK 0x1
+#define QIB_7220_IntClear_RcvAvail1IntClear_LSB 0x1
+#define QIB_7220_IntClear_RcvAvail1IntClear_RMASK 0x1
+#define QIB_7220_IntClear_RcvAvail0IntClear_LSB 0x0
+#define QIB_7220_IntClear_RcvAvail0IntClear_RMASK 0x1
+
+#define QIB_7220_ErrMask_OFFS 0x80
+#define QIB_7220_ErrMask_Reserved_LSB 0x36
+#define QIB_7220_ErrMask_Reserved_RMASK 0x3FF
+#define QIB_7220_ErrMask_InvalidEEPCmdMask_LSB 0x35
+#define QIB_7220_ErrMask_InvalidEEPCmdMask_RMASK 0x1
+#define QIB_7220_ErrMask_SDmaDescAddrMisalignErrMask_LSB 0x34
+#define QIB_7220_ErrMask_SDmaDescAddrMisalignErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_HardwareErrMask_LSB 0x33
+#define QIB_7220_ErrMask_HardwareErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_ResetNegatedMask_LSB 0x32
+#define QIB_7220_ErrMask_ResetNegatedMask_RMASK 0x1
+#define QIB_7220_ErrMask_InvalidAddrErrMask_LSB 0x31
+#define QIB_7220_ErrMask_InvalidAddrErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_IBStatusChangedMask_LSB 0x30
+#define QIB_7220_ErrMask_IBStatusChangedMask_RMASK 0x1
+#define QIB_7220_ErrMask_SDmaUnexpDataErrMask_LSB 0x2F
+#define QIB_7220_ErrMask_SDmaUnexpDataErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_SDmaMissingDwErrMask_LSB 0x2E
+#define QIB_7220_ErrMask_SDmaMissingDwErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_SDmaDwEnErrMask_LSB 0x2D
+#define QIB_7220_ErrMask_SDmaDwEnErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_SDmaRpyTagErrMask_LSB 0x2C
+#define QIB_7220_ErrMask_SDmaRpyTagErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_SDma1stDescErrMask_LSB 0x2B
+#define QIB_7220_ErrMask_SDma1stDescErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_SDmaBaseErrMask_LSB 0x2A
+#define QIB_7220_ErrMask_SDmaBaseErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_SDmaTailOutOfBoundErrMask_LSB 0x29
+#define QIB_7220_ErrMask_SDmaTailOutOfBoundErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_SDmaOutOfBoundErrMask_LSB 0x28
+#define QIB_7220_ErrMask_SDmaOutOfBoundErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_SDmaGenMismatchErrMask_LSB 0x27
+#define QIB_7220_ErrMask_SDmaGenMismatchErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_SendBufMisuseErrMask_LSB 0x26
+#define QIB_7220_ErrMask_SendBufMisuseErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_SendUnsupportedVLErrMask_LSB 0x25
+#define QIB_7220_ErrMask_SendUnsupportedVLErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_SendUnexpectedPktNumErrMask_LSB 0x24
+#define QIB_7220_ErrMask_SendUnexpectedPktNumErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_SendPioArmLaunchErrMask_LSB 0x23
+#define QIB_7220_ErrMask_SendPioArmLaunchErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_SendDroppedDataPktErrMask_LSB 0x22
+#define QIB_7220_ErrMask_SendDroppedDataPktErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_SendDroppedSmpPktErrMask_LSB 0x21
+#define QIB_7220_ErrMask_SendDroppedSmpPktErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_SendPktLenErrMask_LSB 0x20
+#define QIB_7220_ErrMask_SendPktLenErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_SendUnderRunErrMask_LSB 0x1F
+#define QIB_7220_ErrMask_SendUnderRunErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_SendMaxPktLenErrMask_LSB 0x1E
+#define QIB_7220_ErrMask_SendMaxPktLenErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_SendMinPktLenErrMask_LSB 0x1D
+#define QIB_7220_ErrMask_SendMinPktLenErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_SDmaDisabledErrMask_LSB 0x1C
+#define QIB_7220_ErrMask_SDmaDisabledErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_SendSpecialTriggerErrMask_LSB 0x1B
+#define QIB_7220_ErrMask_SendSpecialTriggerErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_Reserved1_LSB 0x12
+#define QIB_7220_ErrMask_Reserved1_RMASK 0x1FF
+#define QIB_7220_ErrMask_RcvIBLostLinkErrMask_LSB 0x11
+#define QIB_7220_ErrMask_RcvIBLostLinkErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_RcvHdrErrMask_LSB 0x10
+#define QIB_7220_ErrMask_RcvHdrErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_RcvHdrLenErrMask_LSB 0xF
+#define QIB_7220_ErrMask_RcvHdrLenErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_RcvBadTidErrMask_LSB 0xE
+#define QIB_7220_ErrMask_RcvBadTidErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_RcvHdrFullErrMask_LSB 0xD
+#define QIB_7220_ErrMask_RcvHdrFullErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_RcvEgrFullErrMask_LSB 0xC
+#define QIB_7220_ErrMask_RcvEgrFullErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_RcvBadVersionErrMask_LSB 0xB
+#define QIB_7220_ErrMask_RcvBadVersionErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_RcvIBFlowErrMask_LSB 0xA
+#define QIB_7220_ErrMask_RcvIBFlowErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_RcvEBPErrMask_LSB 0x9
+#define QIB_7220_ErrMask_RcvEBPErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_RcvUnsupportedVLErrMask_LSB 0x8
+#define QIB_7220_ErrMask_RcvUnsupportedVLErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_RcvUnexpectedCharErrMask_LSB 0x7
+#define QIB_7220_ErrMask_RcvUnexpectedCharErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_RcvShortPktLenErrMask_LSB 0x6
+#define QIB_7220_ErrMask_RcvShortPktLenErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_RcvLongPktLenErrMask_LSB 0x5
+#define QIB_7220_ErrMask_RcvLongPktLenErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_RcvMaxPktLenErrMask_LSB 0x4
+#define QIB_7220_ErrMask_RcvMaxPktLenErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_RcvMinPktLenErrMask_LSB 0x3
+#define QIB_7220_ErrMask_RcvMinPktLenErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_RcvICRCErrMask_LSB 0x2
+#define QIB_7220_ErrMask_RcvICRCErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_RcvVCRCErrMask_LSB 0x1
+#define QIB_7220_ErrMask_RcvVCRCErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_RcvFormatErrMask_LSB 0x0
+#define QIB_7220_ErrMask_RcvFormatErrMask_RMASK 0x1
+
+#define QIB_7220_ErrStatus_OFFS 0x88
+#define QIB_7220_ErrStatus_Reserved_LSB 0x36
+#define QIB_7220_ErrStatus_Reserved_RMASK 0x3FF
+#define QIB_7220_ErrStatus_InvalidEEPCmdErr_LSB 0x35
+#define QIB_7220_ErrStatus_InvalidEEPCmdErr_RMASK 0x1
+#define QIB_7220_ErrStatus_SDmaDescAddrMisalignErr_LSB 0x34
+#define QIB_7220_ErrStatus_SDmaDescAddrMisalignErr_RMASK 0x1
+#define QIB_7220_ErrStatus_HardwareErr_LSB 0x33
+#define QIB_7220_ErrStatus_HardwareErr_RMASK 0x1
+#define QIB_7220_ErrStatus_ResetNegated_LSB 0x32
+#define QIB_7220_ErrStatus_ResetNegated_RMASK 0x1
+#define QIB_7220_ErrStatus_InvalidAddrErr_LSB 0x31
+#define QIB_7220_ErrStatus_InvalidAddrErr_RMASK 0x1
+#define QIB_7220_ErrStatus_IBStatusChanged_LSB 0x30
+#define QIB_7220_ErrStatus_IBStatusChanged_RMASK 0x1
+#define QIB_7220_ErrStatus_SDmaUnexpDataErr_LSB 0x2F
+#define QIB_7220_ErrStatus_SDmaUnexpDataErr_RMASK 0x1
+#define QIB_7220_ErrStatus_SDmaMissingDwErr_LSB 0x2E
+#define QIB_7220_ErrStatus_SDmaMissingDwErr_RMASK 0x1
+#define QIB_7220_ErrStatus_SDmaDwEnErr_LSB 0x2D
+#define QIB_7220_ErrStatus_SDmaDwEnErr_RMASK 0x1
+#define QIB_7220_ErrStatus_SDmaRpyTagErr_LSB 0x2C
+#define QIB_7220_ErrStatus_SDmaRpyTagErr_RMASK 0x1
+#define QIB_7220_ErrStatus_SDma1stDescErr_LSB 0x2B
+#define QIB_7220_ErrStatus_SDma1stDescErr_RMASK 0x1
+#define QIB_7220_ErrStatus_SDmaBaseErr_LSB 0x2A
+#define QIB_7220_ErrStatus_SDmaBaseErr_RMASK 0x1
+#define QIB_7220_ErrStatus_SDmaTailOutOfBoundErr_LSB 0x29
+#define QIB_7220_ErrStatus_SDmaTailOutOfBoundErr_RMASK 0x1
+#define QIB_7220_ErrStatus_SDmaOutOfBoundErr_LSB 0x28
+#define QIB_7220_ErrStatus_SDmaOutOfBoundErr_RMASK 0x1
+#define QIB_7220_ErrStatus_SDmaGenMismatchErr_LSB 0x27
+#define QIB_7220_ErrStatus_SDmaGenMismatchErr_RMASK 0x1
+#define QIB_7220_ErrStatus_SendBufMisuseErr_LSB 0x26
+#define QIB_7220_ErrStatus_SendBufMisuseErr_RMASK 0x1
+#define QIB_7220_ErrStatus_SendUnsupportedVLErr_LSB 0x25
+#define QIB_7220_ErrStatus_SendUnsupportedVLErr_RMASK 0x1
+#define QIB_7220_ErrStatus_SendUnexpectedPktNumErr_LSB 0x24
+#define QIB_7220_ErrStatus_SendUnexpectedPktNumErr_RMASK 0x1
+#define QIB_7220_ErrStatus_SendPioArmLaunchErr_LSB 0x23
+#define QIB_7220_ErrStatus_SendPioArmLaunchErr_RMASK 0x1
+#define QIB_7220_ErrStatus_SendDroppedDataPktErr_LSB 0x22
+#define QIB_7220_ErrStatus_SendDroppedDataPktErr_RMASK 0x1
+#define QIB_7220_ErrStatus_SendDroppedSmpPktErr_LSB 0x21
+#define QIB_7220_ErrStatus_SendDroppedSmpPktErr_RMASK 0x1
+#define QIB_7220_ErrStatus_SendPktLenErr_LSB 0x20
+#define QIB_7220_ErrStatus_SendPktLenErr_RMASK 0x1
+#define QIB_7220_ErrStatus_SendUnderRunErr_LSB 0x1F
+#define QIB_7220_ErrStatus_SendUnderRunErr_RMASK 0x1
+#define QIB_7220_ErrStatus_SendMaxPktLenErr_LSB 0x1E
+#define QIB_7220_ErrStatus_SendMaxPktLenErr_RMASK 0x1
+#define QIB_7220_ErrStatus_SendMinPktLenErr_LSB 0x1D
+#define QIB_7220_ErrStatus_SendMinPktLenErr_RMASK 0x1
+#define QIB_7220_ErrStatus_SDmaDisabledErr_LSB 0x1C
+#define QIB_7220_ErrStatus_SDmaDisabledErr_RMASK 0x1
+#define QIB_7220_ErrStatus_SendSpecialTriggerErr_LSB 0x1B
+#define QIB_7220_ErrStatus_SendSpecialTriggerErr_RMASK 0x1
+#define QIB_7220_ErrStatus_Reserved1_LSB 0x12
+#define QIB_7220_ErrStatus_Reserved1_RMASK 0x1FF
+#define QIB_7220_ErrStatus_RcvIBLostLinkErr_LSB 0x11
+#define QIB_7220_ErrStatus_RcvIBLostLinkErr_RMASK 0x1
+#define QIB_7220_ErrStatus_RcvHdrErr_LSB 0x10
+#define QIB_7220_ErrStatus_RcvHdrErr_RMASK 0x1
+#define QIB_7220_ErrStatus_RcvHdrLenErr_LSB 0xF
+#define QIB_7220_ErrStatus_RcvHdrLenErr_RMASK 0x1
+#define QIB_7220_ErrStatus_RcvBadTidErr_LSB 0xE
+#define QIB_7220_ErrStatus_RcvBadTidErr_RMASK 0x1
+#define QIB_7220_ErrStatus_RcvHdrFullErr_LSB 0xD
+#define QIB_7220_ErrStatus_RcvHdrFullErr_RMASK 0x1
+#define QIB_7220_ErrStatus_RcvEgrFullErr_LSB 0xC
+#define QIB_7220_ErrStatus_RcvEgrFullErr_RMASK 0x1
+#define QIB_7220_ErrStatus_RcvBadVersionErr_LSB 0xB
+#define QIB_7220_ErrStatus_RcvBadVersionErr_RMASK 0x1
+#define QIB_7220_ErrStatus_RcvIBFlowErr_LSB 0xA
+#define QIB_7220_ErrStatus_RcvIBFlowErr_RMASK 0x1
+#define QIB_7220_ErrStatus_RcvEBPErr_LSB 0x9
+#define QIB_7220_ErrStatus_RcvEBPErr_RMASK 0x1
+#define QIB_7220_ErrStatus_RcvUnsupportedVLErr_LSB 0x8
+#define QIB_7220_ErrStatus_RcvUnsupportedVLErr_RMASK 0x1
+#define QIB_7220_ErrStatus_RcvUnexpectedCharErr_LSB 0x7
+#define QIB_7220_ErrStatus_RcvUnexpectedCharErr_RMASK 0x1
+#define QIB_7220_ErrStatus_RcvShortPktLenErr_LSB 0x6
+#define QIB_7220_ErrStatus_RcvShortPktLenErr_RMASK 0x1
+#define QIB_7220_ErrStatus_RcvLongPktLenErr_LSB 0x5
+#define QIB_7220_ErrStatus_RcvLongPktLenErr_RMASK 0x1
+#define QIB_7220_ErrStatus_RcvMaxPktLenErr_LSB 0x4
+#define QIB_7220_ErrStatus_RcvMaxPktLenErr_RMASK 0x1
+#define QIB_7220_ErrStatus_RcvMinPktLenErr_LSB 0x3
+#define QIB_7220_ErrStatus_RcvMinPktLenErr_RMASK 0x1
+#define QIB_7220_ErrStatus_RcvICRCErr_LSB 0x2
+#define QIB_7220_ErrStatus_RcvICRCErr_RMASK 0x1
+#define QIB_7220_ErrStatus_RcvVCRCErr_LSB 0x1
+#define QIB_7220_ErrStatus_RcvVCRCErr_RMASK 0x1
+#define QIB_7220_ErrStatus_RcvFormatErr_LSB 0x0
+#define QIB_7220_ErrStatus_RcvFormatErr_RMASK 0x1
+
+#define QIB_7220_ErrClear_OFFS 0x90
+#define QIB_7220_ErrClear_Reserved_LSB 0x36
+#define QIB_7220_ErrClear_Reserved_RMASK 0x3FF
+#define QIB_7220_ErrClear_InvalidEEPCmdErrClear_LSB 0x35
+#define QIB_7220_ErrClear_InvalidEEPCmdErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_SDmaDescAddrMisalignErrClear_LSB 0x34
+#define QIB_7220_ErrClear_SDmaDescAddrMisalignErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_HardwareErrClear_LSB 0x33
+#define QIB_7220_ErrClear_HardwareErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_ResetNegatedClear_LSB 0x32
+#define QIB_7220_ErrClear_ResetNegatedClear_RMASK 0x1
+#define QIB_7220_ErrClear_InvalidAddrErrClear_LSB 0x31
+#define QIB_7220_ErrClear_InvalidAddrErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_IBStatusChangedClear_LSB 0x30
+#define QIB_7220_ErrClear_IBStatusChangedClear_RMASK 0x1
+#define QIB_7220_ErrClear_SDmaUnexpDataErrClear_LSB 0x2F
+#define QIB_7220_ErrClear_SDmaUnexpDataErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_SDmaMissingDwErrClear_LSB 0x2E
+#define QIB_7220_ErrClear_SDmaMissingDwErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_SDmaDwEnErrClear_LSB 0x2D
+#define QIB_7220_ErrClear_SDmaDwEnErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_SDmaRpyTagErrClear_LSB 0x2C
+#define QIB_7220_ErrClear_SDmaRpyTagErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_SDma1stDescErrClear_LSB 0x2B
+#define QIB_7220_ErrClear_SDma1stDescErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_SDmaBaseErrClear_LSB 0x2A
+#define QIB_7220_ErrClear_SDmaBaseErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_SDmaTailOutOfBoundErrClear_LSB 0x29
+#define QIB_7220_ErrClear_SDmaTailOutOfBoundErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_SDmaOutOfBoundErrClear_LSB 0x28
+#define QIB_7220_ErrClear_SDmaOutOfBoundErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_SDmaGenMismatchErrClear_LSB 0x27
+#define QIB_7220_ErrClear_SDmaGenMismatchErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_SendBufMisuseErrClear_LSB 0x26
+#define QIB_7220_ErrClear_SendBufMisuseErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_SendUnsupportedVLErrClear_LSB 0x25
+#define QIB_7220_ErrClear_SendUnsupportedVLErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_SendUnexpectedPktNumErrClear_LSB 0x24
+#define QIB_7220_ErrClear_SendUnexpectedPktNumErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_SendPioArmLaunchErrClear_LSB 0x23
+#define QIB_7220_ErrClear_SendPioArmLaunchErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_SendDroppedDataPktErrClear_LSB 0x22
+#define QIB_7220_ErrClear_SendDroppedDataPktErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_SendDroppedSmpPktErrClear_LSB 0x21
+#define QIB_7220_ErrClear_SendDroppedSmpPktErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_SendPktLenErrClear_LSB 0x20
+#define QIB_7220_ErrClear_SendPktLenErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_SendUnderRunErrClear_LSB 0x1F
+#define QIB_7220_ErrClear_SendUnderRunErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_SendMaxPktLenErrClear_LSB 0x1E
+#define QIB_7220_ErrClear_SendMaxPktLenErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_SendMinPktLenErrClear_LSB 0x1D
+#define QIB_7220_ErrClear_SendMinPktLenErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_SDmaDisabledErrClear_LSB 0x1C
+#define QIB_7220_ErrClear_SDmaDisabledErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_SendSpecialTriggerErrClear_LSB 0x1B
+#define QIB_7220_ErrClear_SendSpecialTriggerErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_Reserved1_LSB 0x12
+#define QIB_7220_ErrClear_Reserved1_RMASK 0x1FF
+#define QIB_7220_ErrClear_RcvIBLostLinkErrClear_LSB 0x11
+#define QIB_7220_ErrClear_RcvIBLostLinkErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_RcvHdrErrClear_LSB 0x10
+#define QIB_7220_ErrClear_RcvHdrErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_RcvHdrLenErrClear_LSB 0xF
+#define QIB_7220_ErrClear_RcvHdrLenErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_RcvBadTidErrClear_LSB 0xE
+#define QIB_7220_ErrClear_RcvBadTidErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_RcvHdrFullErrClear_LSB 0xD
+#define QIB_7220_ErrClear_RcvHdrFullErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_RcvEgrFullErrClear_LSB 0xC
+#define QIB_7220_ErrClear_RcvEgrFullErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_RcvBadVersionErrClear_LSB 0xB
+#define QIB_7220_ErrClear_RcvBadVersionErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_RcvIBFlowErrClear_LSB 0xA
+#define QIB_7220_ErrClear_RcvIBFlowErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_RcvEBPErrClear_LSB 0x9
+#define QIB_7220_ErrClear_RcvEBPErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_RcvUnsupportedVLErrClear_LSB 0x8
+#define QIB_7220_ErrClear_RcvUnsupportedVLErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_RcvUnexpectedCharErrClear_LSB 0x7
+#define QIB_7220_ErrClear_RcvUnexpectedCharErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_RcvShortPktLenErrClear_LSB 0x6
+#define QIB_7220_ErrClear_RcvShortPktLenErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_RcvLongPktLenErrClear_LSB 0x5
+#define QIB_7220_ErrClear_RcvLongPktLenErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_RcvMaxPktLenErrClear_LSB 0x4
+#define QIB_7220_ErrClear_RcvMaxPktLenErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_RcvMinPktLenErrClear_LSB 0x3
+#define QIB_7220_ErrClear_RcvMinPktLenErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_RcvICRCErrClear_LSB 0x2
+#define QIB_7220_ErrClear_RcvICRCErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_RcvVCRCErrClear_LSB 0x1
+#define QIB_7220_ErrClear_RcvVCRCErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_RcvFormatErrClear_LSB 0x0
+#define QIB_7220_ErrClear_RcvFormatErrClear_RMASK 0x1
+
+#define QIB_7220_HwErrMask_OFFS 0x98
+#define QIB_7220_HwErrMask_IBCBusFromSPCParityErrMask_LSB 0x3F
+#define QIB_7220_HwErrMask_IBCBusFromSPCParityErrMask_RMASK 0x1
+#define QIB_7220_HwErrMask_IBCBusToSPCParityErrMask_LSB 0x3E
+#define QIB_7220_HwErrMask_IBCBusToSPCParityErrMask_RMASK 0x1
+#define QIB_7220_HwErrMask_Clk_uC_PLLNotLockedMask_LSB 0x3D
+#define QIB_7220_HwErrMask_Clk_uC_PLLNotLockedMask_RMASK 0x1
+#define QIB_7220_HwErrMask_IBSerdesPClkNotDetectMask_LSB 0x3C
+#define QIB_7220_HwErrMask_IBSerdesPClkNotDetectMask_RMASK 0x1
+#define QIB_7220_HwErrMask_PCIESerdesQ3PClkNotDetectMask_LSB 0x3B
+#define QIB_7220_HwErrMask_PCIESerdesQ3PClkNotDetectMask_RMASK 0x1
+#define QIB_7220_HwErrMask_PCIESerdesQ2PClkNotDetectMask_LSB 0x3A
+#define QIB_7220_HwErrMask_PCIESerdesQ2PClkNotDetectMask_RMASK 0x1
+#define QIB_7220_HwErrMask_PCIESerdesQ1PClkNotDetectMask_LSB 0x39
+#define QIB_7220_HwErrMask_PCIESerdesQ1PClkNotDetectMask_RMASK 0x1
+#define QIB_7220_HwErrMask_PCIESerdesQ0PClkNotDetectMask_LSB 0x38
+#define QIB_7220_HwErrMask_PCIESerdesQ0PClkNotDetectMask_RMASK 0x1
+#define QIB_7220_HwErrMask_Reserved_LSB 0x37
+#define QIB_7220_HwErrMask_Reserved_RMASK 0x1
+#define QIB_7220_HwErrMask_PowerOnBISTFailedMask_LSB 0x36
+#define QIB_7220_HwErrMask_PowerOnBISTFailedMask_RMASK 0x1
+#define QIB_7220_HwErrMask_Reserved1_LSB 0x33
+#define QIB_7220_HwErrMask_Reserved1_RMASK 0x7
+#define QIB_7220_HwErrMask_RXEMemParityErrMask_LSB 0x2C
+#define QIB_7220_HwErrMask_RXEMemParityErrMask_RMASK 0x7F
+#define QIB_7220_HwErrMask_TXEMemParityErrMask_LSB 0x28
+#define QIB_7220_HwErrMask_TXEMemParityErrMask_RMASK 0xF
+#define QIB_7220_HwErrMask_DDSRXEQMemoryParityErrMask_LSB 0x27
+#define QIB_7220_HwErrMask_DDSRXEQMemoryParityErrMask_RMASK 0x1
+#define QIB_7220_HwErrMask_IB_uC_MemoryParityErrMask_LSB 0x26
+#define QIB_7220_HwErrMask_IB_uC_MemoryParityErrMask_RMASK 0x1
+#define QIB_7220_HwErrMask_PCIEOct1_uC_MemoryParityErrMask_LSB 0x25
+#define QIB_7220_HwErrMask_PCIEOct1_uC_MemoryParityErrMask_RMASK 0x1
+#define QIB_7220_HwErrMask_PCIEOct0_uC_MemoryParityErrMask_LSB 0x24
+#define QIB_7220_HwErrMask_PCIEOct0_uC_MemoryParityErrMask_RMASK 0x1
+#define QIB_7220_HwErrMask_Reserved2_LSB 0x22
+#define QIB_7220_HwErrMask_Reserved2_RMASK 0x3
+#define QIB_7220_HwErrMask_PCIeBusParityErrMask_LSB 0x1F
+#define QIB_7220_HwErrMask_PCIeBusParityErrMask_RMASK 0x7
+#define QIB_7220_HwErrMask_PcieCplTimeoutMask_LSB 0x1E
+#define QIB_7220_HwErrMask_PcieCplTimeoutMask_RMASK 0x1
+#define QIB_7220_HwErrMask_PoisonedTLPMask_LSB 0x1D
+#define QIB_7220_HwErrMask_PoisonedTLPMask_RMASK 0x1
+#define QIB_7220_HwErrMask_SDmaMemReadErrMask_LSB 0x1C
+#define QIB_7220_HwErrMask_SDmaMemReadErrMask_RMASK 0x1
+#define QIB_7220_HwErrMask_Reserved3_LSB 0x8
+#define QIB_7220_HwErrMask_Reserved3_RMASK 0xFFFFF
+#define QIB_7220_HwErrMask_PCIeMemParityErrMask_LSB 0x0
+#define QIB_7220_HwErrMask_PCIeMemParityErrMask_RMASK 0xFF
+
+#define QIB_7220_HwErrStatus_OFFS 0xA0
+#define QIB_7220_HwErrStatus_IBCBusFromSPCParityErr_LSB 0x3F
+#define QIB_7220_HwErrStatus_IBCBusFromSPCParityErr_RMASK 0x1
+#define QIB_7220_HwErrStatus_IBCBusToSPCParityErr_LSB 0x3E
+#define QIB_7220_HwErrStatus_IBCBusToSPCParityErr_RMASK 0x1
+#define QIB_7220_HwErrStatus_Clk_uC_PLLNotLocked_LSB 0x3D
+#define QIB_7220_HwErrStatus_Clk_uC_PLLNotLocked_RMASK 0x1
+#define QIB_7220_HwErrStatus_IBSerdesPClkNotDetect_LSB 0x3C
+#define QIB_7220_HwErrStatus_IBSerdesPClkNotDetect_RMASK 0x1
+#define QIB_7220_HwErrStatus_PCIESerdesQ3PClkNotDetect_LSB 0x3B
+#define QIB_7220_HwErrStatus_PCIESerdesQ3PClkNotDetect_RMASK 0x1
+#define QIB_7220_HwErrStatus_PCIESerdesQ2PClkNotDetect_LSB 0x3A
+#define QIB_7220_HwErrStatus_PCIESerdesQ2PClkNotDetect_RMASK 0x1
+#define QIB_7220_HwErrStatus_PCIESerdesQ1PClkNotDetect_LSB 0x39
+#define QIB_7220_HwErrStatus_PCIESerdesQ1PClkNotDetect_RMASK 0x1
+#define QIB_7220_HwErrStatus_PCIESerdesQ0PClkNotDetect_LSB 0x38
+#define QIB_7220_HwErrStatus_PCIESerdesQ0PClkNotDetect_RMASK 0x1
+#define QIB_7220_HwErrStatus_Reserved_LSB 0x37
+#define QIB_7220_HwErrStatus_Reserved_RMASK 0x1
+#define QIB_7220_HwErrStatus_PowerOnBISTFailed_LSB 0x36
+#define QIB_7220_HwErrStatus_PowerOnBISTFailed_RMASK 0x1
+#define QIB_7220_HwErrStatus_Reserved1_LSB 0x33
+#define QIB_7220_HwErrStatus_Reserved1_RMASK 0x7
+#define QIB_7220_HwErrStatus_RXEMemParity_LSB 0x2C
+#define QIB_7220_HwErrStatus_RXEMemParity_RMASK 0x7F
+#define QIB_7220_HwErrStatus_TXEMemParity_LSB 0x28
+#define QIB_7220_HwErrStatus_TXEMemParity_RMASK 0xF
+#define QIB_7220_HwErrStatus_DDSRXEQMemoryParityErr_LSB 0x27
+#define QIB_7220_HwErrStatus_DDSRXEQMemoryParityErr_RMASK 0x1
+#define QIB_7220_HwErrStatus_IB_uC_MemoryParityErr_LSB 0x26
+#define QIB_7220_HwErrStatus_IB_uC_MemoryParityErr_RMASK 0x1
+#define QIB_7220_HwErrStatus_PCIE_uC_Oct1MemoryParityErr_LSB 0x25
+#define QIB_7220_HwErrStatus_PCIE_uC_Oct1MemoryParityErr_RMASK 0x1
+#define QIB_7220_HwErrStatus_PCIE_uC_Oct0MemoryParityErr_LSB 0x24
+#define QIB_7220_HwErrStatus_PCIE_uC_Oct0MemoryParityErr_RMASK 0x1
+#define QIB_7220_HwErrStatus_Reserved2_LSB 0x22
+#define QIB_7220_HwErrStatus_Reserved2_RMASK 0x3
+#define QIB_7220_HwErrStatus_PCIeBusParity_LSB 0x1F
+#define QIB_7220_HwErrStatus_PCIeBusParity_RMASK 0x7
+#define QIB_7220_HwErrStatus_PcieCplTimeout_LSB 0x1E
+#define QIB_7220_HwErrStatus_PcieCplTimeout_RMASK 0x1
+#define QIB_7220_HwErrStatus_PoisenedTLP_LSB 0x1D
+#define QIB_7220_HwErrStatus_PoisenedTLP_RMASK 0x1
+#define QIB_7220_HwErrStatus_SDmaMemReadErr_LSB 0x1C
+#define QIB_7220_HwErrStatus_SDmaMemReadErr_RMASK 0x1
+#define QIB_7220_HwErrStatus_Reserved3_LSB 0x8
+#define QIB_7220_HwErrStatus_Reserved3_RMASK 0xFFFFF
+#define QIB_7220_HwErrStatus_PCIeMemParity_LSB 0x0
+#define QIB_7220_HwErrStatus_PCIeMemParity_RMASK 0xFF
+
+#define QIB_7220_HwErrClear_OFFS 0xA8
+#define QIB_7220_HwErrClear_IBCBusFromSPCParityErrClear_LSB 0x3F
+#define QIB_7220_HwErrClear_IBCBusFromSPCParityErrClear_RMASK 0x1
+#define QIB_7220_HwErrClear_IBCBusToSPCparityErrClear_LSB 0x3E
+#define QIB_7220_HwErrClear_IBCBusToSPCparityErrClear_RMASK 0x1
+#define QIB_7220_HwErrClear_Clk_uC_PLLNotLockedClear_LSB 0x3D
+#define QIB_7220_HwErrClear_Clk_uC_PLLNotLockedClear_RMASK 0x1
+#define QIB_7220_HwErrClear_IBSerdesPClkNotDetectClear_LSB 0x3C
+#define QIB_7220_HwErrClear_IBSerdesPClkNotDetectClear_RMASK 0x1
+#define QIB_7220_HwErrClear_PCIESerdesQ3PClkNotDetectClear_LSB 0x3B
+#define QIB_7220_HwErrClear_PCIESerdesQ3PClkNotDetectClear_RMASK 0x1
+#define QIB_7220_HwErrClear_PCIESerdesQ2PClkNotDetectClear_LSB 0x3A
+#define QIB_7220_HwErrClear_PCIESerdesQ2PClkNotDetectClear_RMASK 0x1
+#define QIB_7220_HwErrClear_PCIESerdesQ1PClkNotDetectClear_LSB 0x39
+#define QIB_7220_HwErrClear_PCIESerdesQ1PClkNotDetectClear_RMASK 0x1
+#define QIB_7220_HwErrClear_PCIESerdesQ0PClkNotDetectClear_LSB 0x38
+#define QIB_7220_HwErrClear_PCIESerdesQ0PClkNotDetectClear_RMASK 0x1
+#define QIB_7220_HwErrClear_Reserved_LSB 0x37
+#define QIB_7220_HwErrClear_Reserved_RMASK 0x1
+#define QIB_7220_HwErrClear_PowerOnBISTFailedClear_LSB 0x36
+#define QIB_7220_HwErrClear_PowerOnBISTFailedClear_RMASK 0x1
+#define QIB_7220_HwErrClear_Reserved1_LSB 0x33
+#define QIB_7220_HwErrClear_Reserved1_RMASK 0x7
+#define QIB_7220_HwErrClear_RXEMemParityClear_LSB 0x2C
+#define QIB_7220_HwErrClear_RXEMemParityClear_RMASK 0x7F
+#define QIB_7220_HwErrClear_TXEMemParityClear_LSB 0x28
+#define QIB_7220_HwErrClear_TXEMemParityClear_RMASK 0xF
+#define QIB_7220_HwErrClear_DDSRXEQMemoryParityErrClear_LSB 0x27
+#define QIB_7220_HwErrClear_DDSRXEQMemoryParityErrClear_RMASK 0x1
+#define QIB_7220_HwErrClear_IB_uC_MemoryParityErrClear_LSB 0x26
+#define QIB_7220_HwErrClear_IB_uC_MemoryParityErrClear_RMASK 0x1
+#define QIB_7220_HwErrClear_PCIE_uC_Oct1MemoryParityErrClear_LSB 0x25
+#define QIB_7220_HwErrClear_PCIE_uC_Oct1MemoryParityErrClear_RMASK 0x1
+#define QIB_7220_HwErrClear_PCIE_uC_Oct0MemoryParityErrClear_LSB 0x24
+#define QIB_7220_HwErrClear_PCIE_uC_Oct0MemoryParityErrClear_RMASK 0x1
+#define QIB_7220_HwErrClear_Reserved2_LSB 0x22
+#define QIB_7220_HwErrClear_Reserved2_RMASK 0x3
+#define QIB_7220_HwErrClear_PCIeBusParityClr_LSB 0x1F
+#define QIB_7220_HwErrClear_PCIeBusParityClr_RMASK 0x7
+#define QIB_7220_HwErrClear_PcieCplTimeoutClear_LSB 0x1E
+#define QIB_7220_HwErrClear_PcieCplTimeoutClear_RMASK 0x1
+#define QIB_7220_HwErrClear_PoisonedTLPClear_LSB 0x1D
+#define QIB_7220_HwErrClear_PoisonedTLPClear_RMASK 0x1
+#define QIB_7220_HwErrClear_SDmaMemReadErrClear_LSB 0x1C
+#define QIB_7220_HwErrClear_SDmaMemReadErrClear_RMASK 0x1
+#define QIB_7220_HwErrClear_Reserved3_LSB 0x8
+#define QIB_7220_HwErrClear_Reserved3_RMASK 0xFFFFF
+#define QIB_7220_HwErrClear_PCIeMemParityClr_LSB 0x0
+#define QIB_7220_HwErrClear_PCIeMemParityClr_RMASK 0xFF
+
+#define QIB_7220_HwDiagCtrl_OFFS 0xB0
+#define QIB_7220_HwDiagCtrl_ForceIBCBusFromSPCParityErr_LSB 0x3F
+#define QIB_7220_HwDiagCtrl_ForceIBCBusFromSPCParityErr_RMASK 0x1
+#define QIB_7220_HwDiagCtrl_ForceIBCBusToSPCParityErr_LSB 0x3E
+#define QIB_7220_HwDiagCtrl_ForceIBCBusToSPCParityErr_RMASK 0x1
+#define QIB_7220_HwDiagCtrl_CounterWrEnable_LSB 0x3D
+#define QIB_7220_HwDiagCtrl_CounterWrEnable_RMASK 0x1
+#define QIB_7220_HwDiagCtrl_CounterDisable_LSB 0x3C
+#define QIB_7220_HwDiagCtrl_CounterDisable_RMASK 0x1
+#define QIB_7220_HwDiagCtrl_Reserved_LSB 0x33
+#define QIB_7220_HwDiagCtrl_Reserved_RMASK 0x1FF
+#define QIB_7220_HwDiagCtrl_ForceRxMemParityErr_LSB 0x2C
+#define QIB_7220_HwDiagCtrl_ForceRxMemParityErr_RMASK 0x7F
+#define QIB_7220_HwDiagCtrl_ForceTxMemparityErr_LSB 0x28
+#define QIB_7220_HwDiagCtrl_ForceTxMemparityErr_RMASK 0xF
+#define QIB_7220_HwDiagCtrl_ForceDDSRXEQMemoryParityErr_LSB 0x27
+#define QIB_7220_HwDiagCtrl_ForceDDSRXEQMemoryParityErr_RMASK 0x1
+#define QIB_7220_HwDiagCtrl_ForceIB_uC_MemoryParityErr_LSB 0x26
+#define QIB_7220_HwDiagCtrl_ForceIB_uC_MemoryParityErr_RMASK 0x1
+#define QIB_7220_HwDiagCtrl_ForcePCIE_uC_Oct1MemoryParityErr_LSB 0x25
+#define QIB_7220_HwDiagCtrl_ForcePCIE_uC_Oct1MemoryParityErr_RMASK 0x1
+#define QIB_7220_HwDiagCtrl_ForcePCIE_uC_Oct0MemoryParityErr_LSB 0x24
+#define QIB_7220_HwDiagCtrl_ForcePCIE_uC_Oct0MemoryParityErr_RMASK 0x1
+#define QIB_7220_HwDiagCtrl_Reserved1_LSB 0x23
+#define QIB_7220_HwDiagCtrl_Reserved1_RMASK 0x1
+#define QIB_7220_HwDiagCtrl_forcePCIeBusParity_LSB 0x1F
+#define QIB_7220_HwDiagCtrl_forcePCIeBusParity_RMASK 0xF
+#define QIB_7220_HwDiagCtrl_Reserved2_LSB 0x8
+#define QIB_7220_HwDiagCtrl_Reserved2_RMASK 0x7FFFFF
+#define QIB_7220_HwDiagCtrl_forcePCIeMemParity_LSB 0x0
+#define QIB_7220_HwDiagCtrl_forcePCIeMemParity_RMASK 0xFF
+
+#define QIB_7220_REG_0000B8_OFFS 0xB8
+
+#define QIB_7220_IBCStatus_OFFS 0xC0
+#define QIB_7220_IBCStatus_TxCreditOk_LSB 0x1F
+#define QIB_7220_IBCStatus_TxCreditOk_RMASK 0x1
+#define QIB_7220_IBCStatus_TxReady_LSB 0x1E
+#define QIB_7220_IBCStatus_TxReady_RMASK 0x1
+#define QIB_7220_IBCStatus_Reserved_LSB 0xE
+#define QIB_7220_IBCStatus_Reserved_RMASK 0xFFFF
+#define QIB_7220_IBCStatus_IBTxLaneReversed_LSB 0xD
+#define QIB_7220_IBCStatus_IBTxLaneReversed_RMASK 0x1
+#define QIB_7220_IBCStatus_IBRxLaneReversed_LSB 0xC
+#define QIB_7220_IBCStatus_IBRxLaneReversed_RMASK 0x1
+#define QIB_7220_IBCStatus_IB_SERDES_TRIM_DONE_LSB 0xB
+#define QIB_7220_IBCStatus_IB_SERDES_TRIM_DONE_RMASK 0x1
+#define QIB_7220_IBCStatus_DDS_RXEQ_FAIL_LSB 0xA
+#define QIB_7220_IBCStatus_DDS_RXEQ_FAIL_RMASK 0x1
+#define QIB_7220_IBCStatus_LinkWidthActive_LSB 0x9
+#define QIB_7220_IBCStatus_LinkWidthActive_RMASK 0x1
+#define QIB_7220_IBCStatus_LinkSpeedActive_LSB 0x8
+#define QIB_7220_IBCStatus_LinkSpeedActive_RMASK 0x1
+#define QIB_7220_IBCStatus_LinkState_LSB 0x5
+#define QIB_7220_IBCStatus_LinkState_RMASK 0x7
+#define QIB_7220_IBCStatus_LinkTrainingState_LSB 0x0
+#define QIB_7220_IBCStatus_LinkTrainingState_RMASK 0x1F
+
+#define QIB_7220_IBCCtrl_OFFS 0xC8
+#define QIB_7220_IBCCtrl_Loopback_LSB 0x3F
+#define QIB_7220_IBCCtrl_Loopback_RMASK 0x1
+#define QIB_7220_IBCCtrl_LinkDownDefaultState_LSB 0x3E
+#define QIB_7220_IBCCtrl_LinkDownDefaultState_RMASK 0x1
+#define QIB_7220_IBCCtrl_Reserved_LSB 0x2B
+#define QIB_7220_IBCCtrl_Reserved_RMASK 0x7FFFF
+#define QIB_7220_IBCCtrl_CreditScale_LSB 0x28
+#define QIB_7220_IBCCtrl_CreditScale_RMASK 0x7
+#define QIB_7220_IBCCtrl_OverrunThreshold_LSB 0x24
+#define QIB_7220_IBCCtrl_OverrunThreshold_RMASK 0xF
+#define QIB_7220_IBCCtrl_PhyerrThreshold_LSB 0x20
+#define QIB_7220_IBCCtrl_PhyerrThreshold_RMASK 0xF
+#define QIB_7220_IBCCtrl_MaxPktLen_LSB 0x15
+#define QIB_7220_IBCCtrl_MaxPktLen_RMASK 0x7FF
+#define QIB_7220_IBCCtrl_LinkCmd_LSB 0x13
+#define QIB_7220_IBCCtrl_LinkCmd_RMASK 0x3
+#define QIB_7220_IBCCtrl_LinkInitCmd_LSB 0x10
+#define QIB_7220_IBCCtrl_LinkInitCmd_RMASK 0x7
+#define QIB_7220_IBCCtrl_FlowCtrlWaterMark_LSB 0x8
+#define QIB_7220_IBCCtrl_FlowCtrlWaterMark_RMASK 0xFF
+#define QIB_7220_IBCCtrl_FlowCtrlPeriod_LSB 0x0
+#define QIB_7220_IBCCtrl_FlowCtrlPeriod_RMASK 0xFF
+
+#define QIB_7220_EXTStatus_OFFS 0xD0
+#define QIB_7220_EXTStatus_GPIOIn_LSB 0x30
+#define QIB_7220_EXTStatus_GPIOIn_RMASK 0xFFFF
+#define QIB_7220_EXTStatus_Reserved_LSB 0x20
+#define QIB_7220_EXTStatus_Reserved_RMASK 0xFFFF
+#define QIB_7220_EXTStatus_Reserved1_LSB 0x10
+#define QIB_7220_EXTStatus_Reserved1_RMASK 0xFFFF
+#define QIB_7220_EXTStatus_MemBISTDisabled_LSB 0xF
+#define QIB_7220_EXTStatus_MemBISTDisabled_RMASK 0x1
+#define QIB_7220_EXTStatus_MemBISTEndTest_LSB 0xE
+#define QIB_7220_EXTStatus_MemBISTEndTest_RMASK 0x1
+#define QIB_7220_EXTStatus_Reserved2_LSB 0x0
+#define QIB_7220_EXTStatus_Reserved2_RMASK 0x3FFF
+
+#define QIB_7220_EXTCtrl_OFFS 0xD8
+#define QIB_7220_EXTCtrl_GPIOOe_LSB 0x30
+#define QIB_7220_EXTCtrl_GPIOOe_RMASK 0xFFFF
+#define QIB_7220_EXTCtrl_GPIOInvert_LSB 0x20
+#define QIB_7220_EXTCtrl_GPIOInvert_RMASK 0xFFFF
+#define QIB_7220_EXTCtrl_Reserved_LSB 0x4
+#define QIB_7220_EXTCtrl_Reserved_RMASK 0xFFFFFFF
+#define QIB_7220_EXTCtrl_LEDPriPortGreenOn_LSB 0x3
+#define QIB_7220_EXTCtrl_LEDPriPortGreenOn_RMASK 0x1
+#define QIB_7220_EXTCtrl_LEDPriPortYellowOn_LSB 0x2
+#define QIB_7220_EXTCtrl_LEDPriPortYellowOn_RMASK 0x1
+#define QIB_7220_EXTCtrl_LEDGblOkGreenOn_LSB 0x1
+#define QIB_7220_EXTCtrl_LEDGblOkGreenOn_RMASK 0x1
+#define QIB_7220_EXTCtrl_LEDGblErrRedOff_LSB 0x0
+#define QIB_7220_EXTCtrl_LEDGblErrRedOff_RMASK 0x1
+
+#define QIB_7220_GPIOOut_OFFS 0xE0
+
+#define QIB_7220_GPIOMask_OFFS 0xE8
+
+#define QIB_7220_GPIOStatus_OFFS 0xF0
+
+#define QIB_7220_GPIOClear_OFFS 0xF8
+
+#define QIB_7220_RcvCtrl_OFFS 0x100
+#define QIB_7220_RcvCtrl_Reserved_LSB 0x27
+#define QIB_7220_RcvCtrl_Reserved_RMASK 0x1FFFFFF
+#define QIB_7220_RcvCtrl_RcvQPMapEnable_LSB 0x26
+#define QIB_7220_RcvCtrl_RcvQPMapEnable_RMASK 0x1
+#define QIB_7220_RcvCtrl_PortCfg_LSB 0x24
+#define QIB_7220_RcvCtrl_PortCfg_RMASK 0x3
+#define QIB_7220_RcvCtrl_TailUpd_LSB 0x23
+#define QIB_7220_RcvCtrl_TailUpd_RMASK 0x1
+#define QIB_7220_RcvCtrl_RcvPartitionKeyDisable_LSB 0x22
+#define QIB_7220_RcvCtrl_RcvPartitionKeyDisable_RMASK 0x1
+#define QIB_7220_RcvCtrl_IntrAvail_LSB 0x11
+#define QIB_7220_RcvCtrl_IntrAvail_RMASK 0x1FFFF
+#define QIB_7220_RcvCtrl_PortEnable_LSB 0x0
+#define QIB_7220_RcvCtrl_PortEnable_RMASK 0x1FFFF
+
+#define QIB_7220_RcvBTHQP_OFFS 0x108
+#define QIB_7220_RcvBTHQP_Reserved_LSB 0x18
+#define QIB_7220_RcvBTHQP_Reserved_RMASK 0xFF
+#define QIB_7220_RcvBTHQP_RcvBTHQP_LSB 0x0
+#define QIB_7220_RcvBTHQP_RcvBTHQP_RMASK 0xFFFFFF
+
+#define QIB_7220_RcvHdrSize_OFFS 0x110
+
+#define QIB_7220_RcvHdrCnt_OFFS 0x118
+
+#define QIB_7220_RcvHdrEntSize_OFFS 0x120
+
+#define QIB_7220_RcvTIDBase_OFFS 0x128
+
+#define QIB_7220_RcvTIDCnt_OFFS 0x130
+
+#define QIB_7220_RcvEgrBase_OFFS 0x138
+
+#define QIB_7220_RcvEgrCnt_OFFS 0x140
+
+#define QIB_7220_RcvBufBase_OFFS 0x148
+
+#define QIB_7220_RcvBufSize_OFFS 0x150
+
+#define QIB_7220_RxIntMemBase_OFFS 0x158
+
+#define QIB_7220_RxIntMemSize_OFFS 0x160
+
+#define QIB_7220_RcvPartitionKey_OFFS 0x168
+
+#define QIB_7220_RcvQPMulticastPort_OFFS 0x170
+#define QIB_7220_RcvQPMulticastPort_Reserved_LSB 0x5
+#define QIB_7220_RcvQPMulticastPort_Reserved_RMASK 0x7FFFFFFFFFFFFFF
+#define QIB_7220_RcvQPMulticastPort_RcvQpMcPort_LSB 0x0
+#define QIB_7220_RcvQPMulticastPort_RcvQpMcPort_RMASK 0x1F
+
+#define QIB_7220_RcvPktLEDCnt_OFFS 0x178
+#define QIB_7220_RcvPktLEDCnt_ONperiod_LSB 0x20
+#define QIB_7220_RcvPktLEDCnt_ONperiod_RMASK 0xFFFFFFFF
+#define QIB_7220_RcvPktLEDCnt_OFFperiod_LSB 0x0
+#define QIB_7220_RcvPktLEDCnt_OFFperiod_RMASK 0xFFFFFFFF
+
+#define QIB_7220_IBCDDRCtrl_OFFS 0x180
+#define QIB_7220_IBCDDRCtrl_IB_DLID_MASK_LSB 0x30
+#define QIB_7220_IBCDDRCtrl_IB_DLID_MASK_RMASK 0xFFFF
+#define QIB_7220_IBCDDRCtrl_IB_DLID_LSB 0x20
+#define QIB_7220_IBCDDRCtrl_IB_DLID_RMASK 0xFFFF
+#define QIB_7220_IBCDDRCtrl_Reserved_LSB 0x1B
+#define QIB_7220_IBCDDRCtrl_Reserved_RMASK 0x1F
+#define QIB_7220_IBCDDRCtrl_HRTBT_REQ_LSB 0x1A
+#define QIB_7220_IBCDDRCtrl_HRTBT_REQ_RMASK 0x1
+#define QIB_7220_IBCDDRCtrl_HRTBT_PORT_LSB 0x12
+#define QIB_7220_IBCDDRCtrl_HRTBT_PORT_RMASK 0xFF
+#define QIB_7220_IBCDDRCtrl_HRTBT_AUTO_LSB 0x11
+#define QIB_7220_IBCDDRCtrl_HRTBT_AUTO_RMASK 0x1
+#define QIB_7220_IBCDDRCtrl_HRTBT_ENB_LSB 0x10
+#define QIB_7220_IBCDDRCtrl_HRTBT_ENB_RMASK 0x1
+#define QIB_7220_IBCDDRCtrl_SD_DDS_LSB 0xC
+#define QIB_7220_IBCDDRCtrl_SD_DDS_RMASK 0xF
+#define QIB_7220_IBCDDRCtrl_SD_DDSV_LSB 0xB
+#define QIB_7220_IBCDDRCtrl_SD_DDSV_RMASK 0x1
+#define QIB_7220_IBCDDRCtrl_SD_ADD_ENB_LSB 0xA
+#define QIB_7220_IBCDDRCtrl_SD_ADD_ENB_RMASK 0x1
+#define QIB_7220_IBCDDRCtrl_SD_RX_EQUAL_ENABLE_LSB 0x9
+#define QIB_7220_IBCDDRCtrl_SD_RX_EQUAL_ENABLE_RMASK 0x1
+#define QIB_7220_IBCDDRCtrl_IB_LANE_REV_SUPPORTED_LSB 0x8
+#define QIB_7220_IBCDDRCtrl_IB_LANE_REV_SUPPORTED_RMASK 0x1
+#define QIB_7220_IBCDDRCtrl_IB_POLARITY_REV_SUPP_LSB 0x7
+#define QIB_7220_IBCDDRCtrl_IB_POLARITY_REV_SUPP_RMASK 0x1
+#define QIB_7220_IBCDDRCtrl_IB_NUM_CHANNELS_LSB 0x5
+#define QIB_7220_IBCDDRCtrl_IB_NUM_CHANNELS_RMASK 0x3
+#define QIB_7220_IBCDDRCtrl_SD_SPEED_QDR_LSB 0x4
+#define QIB_7220_IBCDDRCtrl_SD_SPEED_QDR_RMASK 0x1
+#define QIB_7220_IBCDDRCtrl_SD_SPEED_DDR_LSB 0x3
+#define QIB_7220_IBCDDRCtrl_SD_SPEED_DDR_RMASK 0x1
+#define QIB_7220_IBCDDRCtrl_SD_SPEED_SDR_LSB 0x2
+#define QIB_7220_IBCDDRCtrl_SD_SPEED_SDR_RMASK 0x1
+#define QIB_7220_IBCDDRCtrl_SD_SPEED_LSB 0x1
+#define QIB_7220_IBCDDRCtrl_SD_SPEED_RMASK 0x1
+#define QIB_7220_IBCDDRCtrl_IB_ENHANCED_MODE_LSB 0x0
+#define QIB_7220_IBCDDRCtrl_IB_ENHANCED_MODE_RMASK 0x1
+
+#define QIB_7220_HRTBT_GUID_OFFS 0x188
+
+#define QIB_7220_IBCDDRCtrl2_OFFS 0x1A0
+#define QIB_7220_IBCDDRCtrl2_IB_BACK_PORCH_LSB 0x5
+#define QIB_7220_IBCDDRCtrl2_IB_BACK_PORCH_RMASK 0x1F
+#define QIB_7220_IBCDDRCtrl2_IB_FRONT_PORCH_LSB 0x0
+#define QIB_7220_IBCDDRCtrl2_IB_FRONT_PORCH_RMASK 0x1F
+
+#define QIB_7220_IBCDDRStatus_OFFS 0x1A8
+#define QIB_7220_IBCDDRStatus_heartbeat_timed_out_LSB 0x24
+#define QIB_7220_IBCDDRStatus_heartbeat_timed_out_RMASK 0x1
+#define QIB_7220_IBCDDRStatus_heartbeat_crosstalk_LSB 0x20
+#define QIB_7220_IBCDDRStatus_heartbeat_crosstalk_RMASK 0xF
+#define QIB_7220_IBCDDRStatus_RxEqLocalDevice_LSB 0x1E
+#define QIB_7220_IBCDDRStatus_RxEqLocalDevice_RMASK 0x3
+#define QIB_7220_IBCDDRStatus_ReqDDSLocalFromRmt_LSB 0x1A
+#define QIB_7220_IBCDDRStatus_ReqDDSLocalFromRmt_RMASK 0xF
+#define QIB_7220_IBCDDRStatus_LinkRoundTripLatency_LSB 0x0
+#define QIB_7220_IBCDDRStatus_LinkRoundTripLatency_RMASK 0x3FFFFFF
+
+#define QIB_7220_JIntReload_OFFS 0x1B0
+#define QIB_7220_JIntReload_J_limit_reload_LSB 0x10
+#define QIB_7220_JIntReload_J_limit_reload_RMASK 0xFFFF
+#define QIB_7220_JIntReload_J_reload_LSB 0x0
+#define QIB_7220_JIntReload_J_reload_RMASK 0xFFFF
+
+#define QIB_7220_IBNCModeCtrl_OFFS 0x1B8
+#define QIB_7220_IBNCModeCtrl_Reserved_LSB 0x1A
+#define QIB_7220_IBNCModeCtrl_Reserved_RMASK 0x3FFFFFFFFF
+#define QIB_7220_IBNCModeCtrl_TSMCode_TS2_LSB 0x11
+#define QIB_7220_IBNCModeCtrl_TSMCode_TS2_RMASK 0x1FF
+#define QIB_7220_IBNCModeCtrl_TSMCode_TS1_LSB 0x8
+#define QIB_7220_IBNCModeCtrl_TSMCode_TS1_RMASK 0x1FF
+#define QIB_7220_IBNCModeCtrl_Reserved1_LSB 0x3
+#define QIB_7220_IBNCModeCtrl_Reserved1_RMASK 0x1F
+#define QIB_7220_IBNCModeCtrl_TSMEnable_ignore_TSM_on_rx_LSB 0x2
+#define QIB_7220_IBNCModeCtrl_TSMEnable_ignore_TSM_on_rx_RMASK 0x1
+#define QIB_7220_IBNCModeCtrl_TSMEnable_send_TS2_LSB 0x1
+#define QIB_7220_IBNCModeCtrl_TSMEnable_send_TS2_RMASK 0x1
+#define QIB_7220_IBNCModeCtrl_TSMEnable_send_TS1_LSB 0x0
+#define QIB_7220_IBNCModeCtrl_TSMEnable_send_TS1_RMASK 0x1
+
+#define QIB_7220_SendCtrl_OFFS 0x1C0
+#define QIB_7220_SendCtrl_Disarm_LSB 0x1F
+#define QIB_7220_SendCtrl_Disarm_RMASK 0x1
+#define QIB_7220_SendCtrl_Reserved_LSB 0x1D
+#define QIB_7220_SendCtrl_Reserved_RMASK 0x3
+#define QIB_7220_SendCtrl_AvailUpdThld_LSB 0x18
+#define QIB_7220_SendCtrl_AvailUpdThld_RMASK 0x1F
+#define QIB_7220_SendCtrl_DisarmPIOBuf_LSB 0x10
+#define QIB_7220_SendCtrl_DisarmPIOBuf_RMASK 0xFF
+#define QIB_7220_SendCtrl_Reserved1_LSB 0xD
+#define QIB_7220_SendCtrl_Reserved1_RMASK 0x7
+#define QIB_7220_SendCtrl_SDmaHalt_LSB 0xC
+#define QIB_7220_SendCtrl_SDmaHalt_RMASK 0x1
+#define QIB_7220_SendCtrl_SDmaEnable_LSB 0xB
+#define QIB_7220_SendCtrl_SDmaEnable_RMASK 0x1
+#define QIB_7220_SendCtrl_SDmaSingleDescriptor_LSB 0xA
+#define QIB_7220_SendCtrl_SDmaSingleDescriptor_RMASK 0x1
+#define QIB_7220_SendCtrl_SDmaIntEnable_LSB 0x9
+#define QIB_7220_SendCtrl_SDmaIntEnable_RMASK 0x1
+#define QIB_7220_SendCtrl_Reserved2_LSB 0x5
+#define QIB_7220_SendCtrl_Reserved2_RMASK 0xF
+#define QIB_7220_SendCtrl_SSpecialTriggerEn_LSB 0x4
+#define QIB_7220_SendCtrl_SSpecialTriggerEn_RMASK 0x1
+#define QIB_7220_SendCtrl_SPioEnable_LSB 0x3
+#define QIB_7220_SendCtrl_SPioEnable_RMASK 0x1
+#define QIB_7220_SendCtrl_SendBufAvailUpd_LSB 0x2
+#define QIB_7220_SendCtrl_SendBufAvailUpd_RMASK 0x1
+#define QIB_7220_SendCtrl_SendIntBufAvail_LSB 0x1
+#define QIB_7220_SendCtrl_SendIntBufAvail_RMASK 0x1
+#define QIB_7220_SendCtrl_Abort_LSB 0x0
+#define QIB_7220_SendCtrl_Abort_RMASK 0x1
+
+#define QIB_7220_SendBufBase_OFFS 0x1C8
+#define QIB_7220_SendBufBase_Reserved_LSB 0x35
+#define QIB_7220_SendBufBase_Reserved_RMASK 0x7FF
+#define QIB_7220_SendBufBase_BaseAddr_LargePIO_LSB 0x20
+#define QIB_7220_SendBufBase_BaseAddr_LargePIO_RMASK 0x1FFFFF
+#define QIB_7220_SendBufBase_Reserved1_LSB 0x15
+#define QIB_7220_SendBufBase_Reserved1_RMASK 0x7FF
+#define QIB_7220_SendBufBase_BaseAddr_SmallPIO_LSB 0x0
+#define QIB_7220_SendBufBase_BaseAddr_SmallPIO_RMASK 0x1FFFFF
+
+#define QIB_7220_SendBufSize_OFFS 0x1D0
+#define QIB_7220_SendBufSize_Reserved_LSB 0x2D
+#define QIB_7220_SendBufSize_Reserved_RMASK 0xFFFFF
+#define QIB_7220_SendBufSize_Size_LargePIO_LSB 0x20
+#define QIB_7220_SendBufSize_Size_LargePIO_RMASK 0x1FFF
+#define QIB_7220_SendBufSize_Reserved1_LSB 0xC
+#define QIB_7220_SendBufSize_Reserved1_RMASK 0xFFFFF
+#define QIB_7220_SendBufSize_Size_SmallPIO_LSB 0x0
+#define QIB_7220_SendBufSize_Size_SmallPIO_RMASK 0xFFF
+
+#define QIB_7220_SendBufCnt_OFFS 0x1D8
+#define QIB_7220_SendBufCnt_Reserved_LSB 0x24
+#define QIB_7220_SendBufCnt_Reserved_RMASK 0xFFFFFFF
+#define QIB_7220_SendBufCnt_Num_LargeBuffers_LSB 0x20
+#define QIB_7220_SendBufCnt_Num_LargeBuffers_RMASK 0xF
+#define QIB_7220_SendBufCnt_Reserved1_LSB 0x9
+#define QIB_7220_SendBufCnt_Reserved1_RMASK 0x7FFFFF
+#define QIB_7220_SendBufCnt_Num_SmallBuffers_LSB 0x0
+#define QIB_7220_SendBufCnt_Num_SmallBuffers_RMASK 0x1FF
+
+#define QIB_7220_SendBufAvailAddr_OFFS 0x1E0
+#define QIB_7220_SendBufAvailAddr_SendBufAvailAddr_LSB 0x6
+#define QIB_7220_SendBufAvailAddr_SendBufAvailAddr_RMASK 0x3FFFFFFFF
+#define QIB_7220_SendBufAvailAddr_Reserved_LSB 0x0
+#define QIB_7220_SendBufAvailAddr_Reserved_RMASK 0x3F
+
+#define QIB_7220_TxIntMemBase_OFFS 0x1E8
+
+#define QIB_7220_TxIntMemSize_OFFS 0x1F0
+
+#define QIB_7220_SendDmaBase_OFFS 0x1F8
+#define QIB_7220_SendDmaBase_Reserved_LSB 0x30
+#define QIB_7220_SendDmaBase_Reserved_RMASK 0xFFFF
+#define QIB_7220_SendDmaBase_SendDmaBase_LSB 0x0
+#define QIB_7220_SendDmaBase_SendDmaBase_RMASK 0xFFFFFFFFFFFF
+
+#define QIB_7220_SendDmaLenGen_OFFS 0x200
+#define QIB_7220_SendDmaLenGen_Reserved_LSB 0x13
+#define QIB_7220_SendDmaLenGen_Reserved_RMASK 0x1FFFFFFFFFFF
+#define QIB_7220_SendDmaLenGen_Generation_LSB 0x10
+#define QIB_7220_SendDmaLenGen_Generation_MSB 0x12
+#define QIB_7220_SendDmaLenGen_Generation_RMASK 0x7
+#define QIB_7220_SendDmaLenGen_Length_LSB 0x0
+#define QIB_7220_SendDmaLenGen_Length_RMASK 0xFFFF
+
+#define QIB_7220_SendDmaTail_OFFS 0x208
+#define QIB_7220_SendDmaTail_Reserved_LSB 0x10
+#define QIB_7220_SendDmaTail_Reserved_RMASK 0xFFFFFFFFFFFF
+#define QIB_7220_SendDmaTail_SendDmaTail_LSB 0x0
+#define QIB_7220_SendDmaTail_SendDmaTail_RMASK 0xFFFF
+
+#define QIB_7220_SendDmaHead_OFFS 0x210
+#define QIB_7220_SendDmaHead_Reserved_LSB 0x30
+#define QIB_7220_SendDmaHead_Reserved_RMASK 0xFFFF
+#define QIB_7220_SendDmaHead_InternalSendDmaHead_LSB 0x20
+#define QIB_7220_SendDmaHead_InternalSendDmaHead_RMASK 0xFFFF
+#define QIB_7220_SendDmaHead_Reserved1_LSB 0x10
+#define QIB_7220_SendDmaHead_Reserved1_RMASK 0xFFFF
+#define QIB_7220_SendDmaHead_SendDmaHead_LSB 0x0
+#define QIB_7220_SendDmaHead_SendDmaHead_RMASK 0xFFFF
+
+#define QIB_7220_SendDmaHeadAddr_OFFS 0x218
+#define QIB_7220_SendDmaHeadAddr_Reserved_LSB 0x30
+#define QIB_7220_SendDmaHeadAddr_Reserved_RMASK 0xFFFF
+#define QIB_7220_SendDmaHeadAddr_SendDmaHeadAddr_LSB 0x0
+#define QIB_7220_SendDmaHeadAddr_SendDmaHeadAddr_RMASK 0xFFFFFFFFFFFF
+
+#define QIB_7220_SendDmaBufMask0_OFFS 0x220
+#define QIB_7220_SendDmaBufMask0_BufMask_63_0_LSB 0x0
+#define QIB_7220_SendDmaBufMask0_BufMask_63_0_RMASK 0x0
+
+#define QIB_7220_SendDmaStatus_OFFS 0x238
+#define QIB_7220_SendDmaStatus_ScoreBoardDrainInProg_LSB 0x3F
+#define QIB_7220_SendDmaStatus_ScoreBoardDrainInProg_RMASK 0x1
+#define QIB_7220_SendDmaStatus_AbortInProg_LSB 0x3E
+#define QIB_7220_SendDmaStatus_AbortInProg_RMASK 0x1
+#define QIB_7220_SendDmaStatus_InternalSDmaEnable_LSB 0x3D
+#define QIB_7220_SendDmaStatus_InternalSDmaEnable_RMASK 0x1
+#define QIB_7220_SendDmaStatus_ScbDescIndex_13_0_LSB 0x2F
+#define QIB_7220_SendDmaStatus_ScbDescIndex_13_0_RMASK 0x3FFF
+#define QIB_7220_SendDmaStatus_RpyLowAddr_6_0_LSB 0x28
+#define QIB_7220_SendDmaStatus_RpyLowAddr_6_0_RMASK 0x7F
+#define QIB_7220_SendDmaStatus_RpyTag_7_0_LSB 0x20
+#define QIB_7220_SendDmaStatus_RpyTag_7_0_RMASK 0xFF
+#define QIB_7220_SendDmaStatus_ScbFull_LSB 0x1F
+#define QIB_7220_SendDmaStatus_ScbFull_RMASK 0x1
+#define QIB_7220_SendDmaStatus_ScbEmpty_LSB 0x1E
+#define QIB_7220_SendDmaStatus_ScbEmpty_RMASK 0x1
+#define QIB_7220_SendDmaStatus_ScbEntryValid_LSB 0x1D
+#define QIB_7220_SendDmaStatus_ScbEntryValid_RMASK 0x1
+#define QIB_7220_SendDmaStatus_ScbFetchDescFlag_LSB 0x1C
+#define QIB_7220_SendDmaStatus_ScbFetchDescFlag_RMASK 0x1
+#define QIB_7220_SendDmaStatus_SplFifoReadyToGo_LSB 0x1B
+#define QIB_7220_SendDmaStatus_SplFifoReadyToGo_RMASK 0x1
+#define QIB_7220_SendDmaStatus_SplFifoDisarmed_LSB 0x1A
+#define QIB_7220_SendDmaStatus_SplFifoDisarmed_RMASK 0x1
+#define QIB_7220_SendDmaStatus_SplFifoEmpty_LSB 0x19
+#define QIB_7220_SendDmaStatus_SplFifoEmpty_RMASK 0x1
+#define QIB_7220_SendDmaStatus_SplFifoFull_LSB 0x18
+#define QIB_7220_SendDmaStatus_SplFifoFull_RMASK 0x1
+#define QIB_7220_SendDmaStatus_SplFifoBufNum_LSB 0x10
+#define QIB_7220_SendDmaStatus_SplFifoBufNum_RMASK 0xFF
+#define QIB_7220_SendDmaStatus_SplFifoDescIndex_LSB 0x0
+#define QIB_7220_SendDmaStatus_SplFifoDescIndex_RMASK 0xFFFF
+
+#define QIB_7220_SendBufErr0_OFFS 0x240
+#define QIB_7220_SendBufErr0_SendBufErr_63_0_LSB 0x0
+#define QIB_7220_SendBufErr0_SendBufErr_63_0_RMASK 0x0
+
+#define QIB_7220_RcvHdrAddr0_OFFS 0x270
+#define QIB_7220_RcvHdrAddr0_RcvHdrAddr0_LSB 0x2
+#define QIB_7220_RcvHdrAddr0_RcvHdrAddr0_RMASK 0x3FFFFFFFFF
+#define QIB_7220_RcvHdrAddr0_Reserved_LSB 0x0
+#define QIB_7220_RcvHdrAddr0_Reserved_RMASK 0x3
+
+#define QIB_7220_RcvHdrTailAddr0_OFFS 0x300
+#define QIB_7220_RcvHdrTailAddr0_RcvHdrTailAddr0_LSB 0x2
+#define QIB_7220_RcvHdrTailAddr0_RcvHdrTailAddr0_RMASK 0x3FFFFFFFFF
+#define QIB_7220_RcvHdrTailAddr0_Reserved_LSB 0x0
+#define QIB_7220_RcvHdrTailAddr0_Reserved_RMASK 0x3
+
+#define QIB_7220_ibsd_epb_access_ctrl_OFFS 0x3C0
+#define QIB_7220_ibsd_epb_access_ctrl_sw_ib_epb_req_granted_LSB 0x8
+#define QIB_7220_ibsd_epb_access_ctrl_sw_ib_epb_req_granted_RMASK 0x1
+#define QIB_7220_ibsd_epb_access_ctrl_Reserved_LSB 0x1
+#define QIB_7220_ibsd_epb_access_ctrl_Reserved_RMASK 0x7F
+#define QIB_7220_ibsd_epb_access_ctrl_sw_ib_epb_req_LSB 0x0
+#define QIB_7220_ibsd_epb_access_ctrl_sw_ib_epb_req_RMASK 0x1
+
+#define QIB_7220_ibsd_epb_transaction_reg_OFFS 0x3C8
+#define QIB_7220_ibsd_epb_transaction_reg_ib_epb_rdy_LSB 0x1F
+#define QIB_7220_ibsd_epb_transaction_reg_ib_epb_rdy_RMASK 0x1
+#define QIB_7220_ibsd_epb_transaction_reg_ib_epb_req_error_LSB 0x1E
+#define QIB_7220_ibsd_epb_transaction_reg_ib_epb_req_error_RMASK 0x1
+#define QIB_7220_ibsd_epb_transaction_reg_Reserved_LSB 0x1D
+#define QIB_7220_ibsd_epb_transaction_reg_Reserved_RMASK 0x1
+#define QIB_7220_ibsd_epb_transaction_reg_mem_data_parity_LSB 0x1C
+#define QIB_7220_ibsd_epb_transaction_reg_mem_data_parity_RMASK 0x1
+#define QIB_7220_ibsd_epb_transaction_reg_Reserved1_LSB 0x1B
+#define QIB_7220_ibsd_epb_transaction_reg_Reserved1_RMASK 0x1
+#define QIB_7220_ibsd_epb_transaction_reg_ib_epb_cs_LSB 0x19
+#define QIB_7220_ibsd_epb_transaction_reg_ib_epb_cs_RMASK 0x3
+#define QIB_7220_ibsd_epb_transaction_reg_ib_epb_read_write_LSB 0x18
+#define QIB_7220_ibsd_epb_transaction_reg_ib_epb_read_write_RMASK 0x1
+#define QIB_7220_ibsd_epb_transaction_reg_Reserved2_LSB 0x17
+#define QIB_7220_ibsd_epb_transaction_reg_Reserved2_RMASK 0x1
+#define QIB_7220_ibsd_epb_transaction_reg_ib_epb_address_LSB 0x8
+#define QIB_7220_ibsd_epb_transaction_reg_ib_epb_address_RMASK 0x7FFF
+#define QIB_7220_ibsd_epb_transaction_reg_ib_epb_data_LSB 0x0
+#define QIB_7220_ibsd_epb_transaction_reg_ib_epb_data_RMASK 0xFF
+
+#define QIB_7220_XGXSCfg_OFFS 0x3D8
+#define QIB_7220_XGXSCfg_sel_link_down_for_fctrl_lane_sync_reset_LSB 0x3F
+#define QIB_7220_XGXSCfg_sel_link_down_for_fctrl_lane_sync_reset_RMASK 0x1
+#define QIB_7220_XGXSCfg_Reserved_LSB 0x13
+#define QIB_7220_XGXSCfg_Reserved_RMASK 0xFFFFFFFFFFF
+#define QIB_7220_XGXSCfg_link_sync_mask_LSB 0x9
+#define QIB_7220_XGXSCfg_link_sync_mask_RMASK 0x3FF
+#define QIB_7220_XGXSCfg_Reserved1_LSB 0x3
+#define QIB_7220_XGXSCfg_Reserved1_RMASK 0x3F
+#define QIB_7220_XGXSCfg_xcv_reset_LSB 0x2
+#define QIB_7220_XGXSCfg_xcv_reset_RMASK 0x1
+#define QIB_7220_XGXSCfg_Reserved2_LSB 0x1
+#define QIB_7220_XGXSCfg_Reserved2_RMASK 0x1
+#define QIB_7220_XGXSCfg_tx_rx_reset_LSB 0x0
+#define QIB_7220_XGXSCfg_tx_rx_reset_RMASK 0x1
+
+#define QIB_7220_IBSerDesCtrl_OFFS 0x3E0
+#define QIB_7220_IBSerDesCtrl_Reserved_LSB 0x2D
+#define QIB_7220_IBSerDesCtrl_Reserved_RMASK 0x7FFFF
+#define QIB_7220_IBSerDesCtrl_INT_uC_LSB 0x2C
+#define QIB_7220_IBSerDesCtrl_INT_uC_RMASK 0x1
+#define QIB_7220_IBSerDesCtrl_CKSEL_uC_LSB 0x2A
+#define QIB_7220_IBSerDesCtrl_CKSEL_uC_RMASK 0x3
+#define QIB_7220_IBSerDesCtrl_PLLN_LSB 0x28
+#define QIB_7220_IBSerDesCtrl_PLLN_RMASK 0x3
+#define QIB_7220_IBSerDesCtrl_PLLM_LSB 0x25
+#define QIB_7220_IBSerDesCtrl_PLLM_RMASK 0x7
+#define QIB_7220_IBSerDesCtrl_TXOBPD_LSB 0x24
+#define QIB_7220_IBSerDesCtrl_TXOBPD_RMASK 0x1
+#define QIB_7220_IBSerDesCtrl_TWC_LSB 0x23
+#define QIB_7220_IBSerDesCtrl_TWC_RMASK 0x1
+#define QIB_7220_IBSerDesCtrl_RXIDLE_LSB 0x22
+#define QIB_7220_IBSerDesCtrl_RXIDLE_RMASK 0x1
+#define QIB_7220_IBSerDesCtrl_RXINV_LSB 0x21
+#define QIB_7220_IBSerDesCtrl_RXINV_RMASK 0x1
+#define QIB_7220_IBSerDesCtrl_TXINV_LSB 0x20
+#define QIB_7220_IBSerDesCtrl_TXINV_RMASK 0x1
+#define QIB_7220_IBSerDesCtrl_Reserved1_LSB 0x12
+#define QIB_7220_IBSerDesCtrl_Reserved1_RMASK 0x3FFF
+#define QIB_7220_IBSerDesCtrl_NumSerDesRegsToWrForRXEQ_LSB 0xD
+#define QIB_7220_IBSerDesCtrl_NumSerDesRegsToWrForRXEQ_RMASK 0x1F
+#define QIB_7220_IBSerDesCtrl_NumSerDesRegsToWrForDDS_LSB 0x8
+#define QIB_7220_IBSerDesCtrl_NumSerDesRegsToWrForDDS_RMASK 0x1F
+#define QIB_7220_IBSerDesCtrl_Reserved2_LSB 0x1
+#define QIB_7220_IBSerDesCtrl_Reserved2_RMASK 0x7F
+#define QIB_7220_IBSerDesCtrl_ResetIB_uC_Core_LSB 0x0
+#define QIB_7220_IBSerDesCtrl_ResetIB_uC_Core_RMASK 0x1
+
+#define QIB_7220_pciesd_epb_access_ctrl_OFFS 0x400
+#define QIB_7220_pciesd_epb_access_ctrl_sw_pcie_epb_req_granted_LSB 0x8
+#define QIB_7220_pciesd_epb_access_ctrl_sw_pcie_epb_req_granted_RMASK 0x1
+#define QIB_7220_pciesd_epb_access_ctrl_Reserved_LSB 0x3
+#define QIB_7220_pciesd_epb_access_ctrl_Reserved_RMASK 0x1F
+#define QIB_7220_pciesd_epb_access_ctrl_sw_pcieepb_star_en_LSB 0x1
+#define QIB_7220_pciesd_epb_access_ctrl_sw_pcieepb_star_en_RMASK 0x3
+#define QIB_7220_pciesd_epb_access_ctrl_sw_pcie_epb_req_LSB 0x0
+#define QIB_7220_pciesd_epb_access_ctrl_sw_pcie_epb_req_RMASK 0x1
+
+#define QIB_7220_pciesd_epb_transaction_reg_OFFS 0x408
+#define QIB_7220_pciesd_epb_transaction_reg_pcie_epb_rdy_LSB 0x1F
+#define QIB_7220_pciesd_epb_transaction_reg_pcie_epb_rdy_RMASK 0x1
+#define QIB_7220_pciesd_epb_transaction_reg_pcie_epb_req_error_LSB 0x1E
+#define QIB_7220_pciesd_epb_transaction_reg_pcie_epb_req_error_RMASK 0x1
+#define QIB_7220_pciesd_epb_transaction_reg_Reserved_LSB 0x1D
+#define QIB_7220_pciesd_epb_transaction_reg_Reserved_RMASK 0x1
+#define QIB_7220_pciesd_epb_transaction_reg_mem_data_parity_LSB 0x1C
+#define QIB_7220_pciesd_epb_transaction_reg_mem_data_parity_RMASK 0x1
+#define QIB_7220_pciesd_epb_transaction_reg_pcie_epb_cs_LSB 0x19
+#define QIB_7220_pciesd_epb_transaction_reg_pcie_epb_cs_RMASK 0x7
+#define QIB_7220_pciesd_epb_transaction_reg_pcie_epb_read_write_LSB 0x18
+#define QIB_7220_pciesd_epb_transaction_reg_pcie_epb_read_write_RMASK 0x1
+#define QIB_7220_pciesd_epb_transaction_reg_Reserved1_LSB 0x17
+#define QIB_7220_pciesd_epb_transaction_reg_Reserved1_RMASK 0x1
+#define QIB_7220_pciesd_epb_transaction_reg_pcie_epb_address_LSB 0x8
+#define QIB_7220_pciesd_epb_transaction_reg_pcie_epb_address_RMASK 0x7FFF
+#define QIB_7220_pciesd_epb_transaction_reg_pcie_epb_data_LSB 0x0
+#define QIB_7220_pciesd_epb_transaction_reg_pcie_epb_data_RMASK 0xFF
+
+#define QIB_7220_SerDes_DDSRXEQ0_OFFS 0x500
+#define QIB_7220_SerDes_DDSRXEQ0_reg_addr_LSB 0x4
+#define QIB_7220_SerDes_DDSRXEQ0_reg_addr_RMASK 0x3F
+#define QIB_7220_SerDes_DDSRXEQ0_element_num_LSB 0x0
+#define QIB_7220_SerDes_DDSRXEQ0_element_num_RMASK 0xF
+
+#define QIB_7220_LBIntCnt_OFFS 0x13000
+
+#define QIB_7220_LBFlowStallCnt_OFFS 0x13008
+
+#define QIB_7220_TxSDmaDescCnt_OFFS 0x13010
+
+#define QIB_7220_TxUnsupVLErrCnt_OFFS 0x13018
+
+#define QIB_7220_TxDataPktCnt_OFFS 0x13020
+
+#define QIB_7220_TxFlowPktCnt_OFFS 0x13028
+
+#define QIB_7220_TxDwordCnt_OFFS 0x13030
+
+#define QIB_7220_TxLenErrCnt_OFFS 0x13038
+
+#define QIB_7220_TxMaxMinLenErrCnt_OFFS 0x13040
+
+#define QIB_7220_TxUnderrunCnt_OFFS 0x13048
+
+#define QIB_7220_TxFlowStallCnt_OFFS 0x13050
+
+#define QIB_7220_TxDroppedPktCnt_OFFS 0x13058
+
+#define QIB_7220_RxDroppedPktCnt_OFFS 0x13060
+
+#define QIB_7220_RxDataPktCnt_OFFS 0x13068
+
+#define QIB_7220_RxFlowPktCnt_OFFS 0x13070
+
+#define QIB_7220_RxDwordCnt_OFFS 0x13078
+
+#define QIB_7220_RxLenErrCnt_OFFS 0x13080
+
+#define QIB_7220_RxMaxMinLenErrCnt_OFFS 0x13088
+
+#define QIB_7220_RxICRCErrCnt_OFFS 0x13090
+
+#define QIB_7220_RxVCRCErrCnt_OFFS 0x13098
+
+#define QIB_7220_RxFlowCtrlViolCnt_OFFS 0x130A0
+
+#define QIB_7220_RxVersionErrCnt_OFFS 0x130A8
+
+#define QIB_7220_RxLinkMalformCnt_OFFS 0x130B0
+
+#define QIB_7220_RxEBPCnt_OFFS 0x130B8
+
+#define QIB_7220_RxLPCRCErrCnt_OFFS 0x130C0
+
+#define QIB_7220_RxBufOvflCnt_OFFS 0x130C8
+
+#define QIB_7220_RxTIDFullErrCnt_OFFS 0x130D0
+
+#define QIB_7220_RxTIDValidErrCnt_OFFS 0x130D8
+
+#define QIB_7220_RxPKeyMismatchCnt_OFFS 0x130E0
+
+#define QIB_7220_RxP0HdrEgrOvflCnt_OFFS 0x130E8
+
+#define QIB_7220_IBStatusChangeCnt_OFFS 0x13170
+
+#define QIB_7220_IBLinkErrRecoveryCnt_OFFS 0x13178
+
+#define QIB_7220_IBLinkDownedCnt_OFFS 0x13180
+
+#define QIB_7220_IBSymbolErrCnt_OFFS 0x13188
+
+#define QIB_7220_RxVL15DroppedPktCnt_OFFS 0x13190
+
+#define QIB_7220_RxOtherLocalPhyErrCnt_OFFS 0x13198
+
+#define QIB_7220_PcieRetryBufDiagQwordCnt_OFFS 0x131A0
+
+#define QIB_7220_ExcessBufferOvflCnt_OFFS 0x131A8
+
+#define QIB_7220_LocalLinkIntegrityErrCnt_OFFS 0x131B0
+
+#define QIB_7220_RxVlErrCnt_OFFS 0x131B8
+
+#define QIB_7220_RxDlidFltrCnt_OFFS 0x131C0
+
+#define QIB_7220_CNT_0131C8_OFFS 0x131C8
+
+#define QIB_7220_PSStat_OFFS 0x13200
+
+#define QIB_7220_PSStart_OFFS 0x13208
+
+#define QIB_7220_PSInterval_OFFS 0x13210
+
+#define QIB_7220_PSRcvDataCount_OFFS 0x13218
+
+#define QIB_7220_PSRcvPktsCount_OFFS 0x13220
+
+#define QIB_7220_PSXmitDataCount_OFFS 0x13228
+
+#define QIB_7220_PSXmitPktsCount_OFFS 0x13230
+
+#define QIB_7220_PSXmitWaitCount_OFFS 0x13238
+
+#define QIB_7220_CNT_013240_OFFS 0x13240
+
+#define QIB_7220_RcvEgrArray_OFFS 0x14000
+
+#define QIB_7220_MEM_038000_OFFS 0x38000
+
+#define QIB_7220_RcvTIDArray0_OFFS 0x53000
+
+#define QIB_7220_PIOLaunchFIFO_OFFS 0x64000
+
+#define QIB_7220_MEM_064480_OFFS 0x64480
+
+#define QIB_7220_SendPIOpbcCache_OFFS 0x64800
+
+#define QIB_7220_MEM_064C80_OFFS 0x64C80
+
+#define QIB_7220_PreLaunchFIFO_OFFS 0x65000
+
+#define QIB_7220_MEM_065080_OFFS 0x65080
+
+#define QIB_7220_ScoreBoard_OFFS 0x65400
+
+#define QIB_7220_MEM_065440_OFFS 0x65440
+
+#define QIB_7220_DescriptorFIFO_OFFS 0x65800
+
+#define QIB_7220_MEM_065880_OFFS 0x65880
+
+#define QIB_7220_RcvBuf1_OFFS 0x72000
+
+#define QIB_7220_MEM_074800_OFFS 0x74800
+
+#define QIB_7220_RcvBuf2_OFFS 0x75000
+
+#define QIB_7220_MEM_076400_OFFS 0x76400
+
+#define QIB_7220_RcvFlags_OFFS 0x77000
+
+#define QIB_7220_MEM_078400_OFFS 0x78400
+
+#define QIB_7220_RcvLookupBuf1_OFFS 0x79000
+
+#define QIB_7220_MEM_07A400_OFFS 0x7A400
+
+#define QIB_7220_RcvDMADatBuf_OFFS 0x7B000
+
+#define QIB_7220_RcvDMAHdrBuf_OFFS 0x7B800
+
+#define QIB_7220_MiscRXEIntMem_OFFS 0x7C000
+
+#define QIB_7220_MEM_07D400_OFFS 0x7D400
+
+#define QIB_7220_PCIERcvBuf_OFFS 0x80000
+
+#define QIB_7220_PCIERetryBuf_OFFS 0x84000
+
+#define QIB_7220_PCIERcvBufRdToWrAddr_OFFS 0x88000
+
+#define QIB_7220_PCIECplBuf_OFFS 0x90000
+
+#define QIB_7220_IBSerDesMappTable_OFFS 0x94000
+
+#define QIB_7220_MEM_095000_OFFS 0x95000
+
+#define QIB_7220_SendBuf0_MA_OFFS 0x100000
+
+#define QIB_7220_MEM_1A0000_OFFS 0x1A0000
diff --git a/drivers/infiniband/hw/qib/qib_7322_regs.h b/drivers/infiniband/hw/qib/qib_7322_regs.h
new file mode 100644
index 0000000..32dc81f
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_7322_regs.h
@@ -0,0 +1,3163 @@
+/*
+ * Copyright (c) 2008, 2009, 2010 QLogic Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/* This file is mechanically generated from RTL. Any hand-edits will be lost! */
+
+#define QIB_7322_Revision_OFFS 0x0
+#define QIB_7322_Revision_DEF 0x0000000002010601
+#define QIB_7322_Revision_R_Simulator_LSB 0x3F
+#define QIB_7322_Revision_R_Simulator_MSB 0x3F
+#define QIB_7322_Revision_R_Simulator_RMASK 0x1
+#define QIB_7322_Revision_R_Emulation_LSB 0x3E
+#define QIB_7322_Revision_R_Emulation_MSB 0x3E
+#define QIB_7322_Revision_R_Emulation_RMASK 0x1
+#define QIB_7322_Revision_R_Emulation_Revcode_LSB 0x28
+#define QIB_7322_Revision_R_Emulation_Revcode_MSB 0x3D
+#define QIB_7322_Revision_R_Emulation_Revcode_RMASK 0x3FFFFF
+#define QIB_7322_Revision_BoardID_LSB 0x20
+#define QIB_7322_Revision_BoardID_MSB 0x27
+#define QIB_7322_Revision_BoardID_RMASK 0xFF
+#define QIB_7322_Revision_R_SW_LSB 0x18
+#define QIB_7322_Revision_R_SW_MSB 0x1F
+#define QIB_7322_Revision_R_SW_RMASK 0xFF
+#define QIB_7322_Revision_R_Arch_LSB 0x10
+#define QIB_7322_Revision_R_Arch_MSB 0x17
+#define QIB_7322_Revision_R_Arch_RMASK 0xFF
+#define QIB_7322_Revision_R_ChipRevMajor_LSB 0x8
+#define QIB_7322_Revision_R_ChipRevMajor_MSB 0xF
+#define QIB_7322_Revision_R_ChipRevMajor_RMASK 0xFF
+#define QIB_7322_Revision_R_ChipRevMinor_LSB 0x0
+#define QIB_7322_Revision_R_ChipRevMinor_MSB 0x7
+#define QIB_7322_Revision_R_ChipRevMinor_RMASK 0xFF
+
+#define QIB_7322_Control_OFFS 0x8
+#define QIB_7322_Control_DEF 0x0000000000000000
+#define QIB_7322_Control_PCIECplQDiagEn_LSB 0x6
+#define QIB_7322_Control_PCIECplQDiagEn_MSB 0x6
+#define QIB_7322_Control_PCIECplQDiagEn_RMASK 0x1
+#define QIB_7322_Control_PCIEPostQDiagEn_LSB 0x5
+#define QIB_7322_Control_PCIEPostQDiagEn_MSB 0x5
+#define QIB_7322_Control_PCIEPostQDiagEn_RMASK 0x1
+#define QIB_7322_Control_SDmaDescFetchPriorityEn_LSB 0x4
+#define QIB_7322_Control_SDmaDescFetchPriorityEn_MSB 0x4
+#define QIB_7322_Control_SDmaDescFetchPriorityEn_RMASK 0x1
+#define QIB_7322_Control_PCIERetryBufDiagEn_LSB 0x3
+#define QIB_7322_Control_PCIERetryBufDiagEn_MSB 0x3
+#define QIB_7322_Control_PCIERetryBufDiagEn_RMASK 0x1
+#define QIB_7322_Control_FreezeMode_LSB 0x1
+#define QIB_7322_Control_FreezeMode_MSB 0x1
+#define QIB_7322_Control_FreezeMode_RMASK 0x1
+#define QIB_7322_Control_SyncReset_LSB 0x0
+#define QIB_7322_Control_SyncReset_MSB 0x0
+#define QIB_7322_Control_SyncReset_RMASK 0x1
+
+#define QIB_7322_PageAlign_OFFS 0x10
+#define QIB_7322_PageAlign_DEF 0x0000000000001000
+
+#define QIB_7322_ContextCnt_OFFS 0x18
+#define QIB_7322_ContextCnt_DEF 0x0000000000000012
+
+#define QIB_7322_Scratch_OFFS 0x20
+#define QIB_7322_Scratch_DEF 0x0000000000000000
+
+#define QIB_7322_CntrRegBase_OFFS 0x28
+#define QIB_7322_CntrRegBase_DEF 0x0000000000011000
+
+#define QIB_7322_SendRegBase_OFFS 0x30
+#define QIB_7322_SendRegBase_DEF 0x0000000000003000
+
+#define QIB_7322_UserRegBase_OFFS 0x38
+#define QIB_7322_UserRegBase_DEF 0x0000000000200000
+
+#define QIB_7322_IntMask_OFFS 0x68
+#define QIB_7322_IntMask_DEF 0x0000000000000000
+#define QIB_7322_IntMask_SDmaIntMask_1_LSB 0x3F
+#define QIB_7322_IntMask_SDmaIntMask_1_MSB 0x3F
+#define QIB_7322_IntMask_SDmaIntMask_1_RMASK 0x1
+#define QIB_7322_IntMask_SDmaIntMask_0_LSB 0x3E
+#define QIB_7322_IntMask_SDmaIntMask_0_MSB 0x3E
+#define QIB_7322_IntMask_SDmaIntMask_0_RMASK 0x1
+#define QIB_7322_IntMask_SDmaProgressIntMask_1_LSB 0x3D
+#define QIB_7322_IntMask_SDmaProgressIntMask_1_MSB 0x3D
+#define QIB_7322_IntMask_SDmaProgressIntMask_1_RMASK 0x1
+#define QIB_7322_IntMask_SDmaProgressIntMask_0_LSB 0x3C
+#define QIB_7322_IntMask_SDmaProgressIntMask_0_MSB 0x3C
+#define QIB_7322_IntMask_SDmaProgressIntMask_0_RMASK 0x1
+#define QIB_7322_IntMask_SDmaIdleIntMask_1_LSB 0x3B
+#define QIB_7322_IntMask_SDmaIdleIntMask_1_MSB 0x3B
+#define QIB_7322_IntMask_SDmaIdleIntMask_1_RMASK 0x1
+#define QIB_7322_IntMask_SDmaIdleIntMask_0_LSB 0x3A
+#define QIB_7322_IntMask_SDmaIdleIntMask_0_MSB 0x3A
+#define QIB_7322_IntMask_SDmaIdleIntMask_0_RMASK 0x1
+#define QIB_7322_IntMask_SDmaCleanupDoneMask_1_LSB 0x39
+#define QIB_7322_IntMask_SDmaCleanupDoneMask_1_MSB 0x39
+#define QIB_7322_IntMask_SDmaCleanupDoneMask_1_RMASK 0x1
+#define QIB_7322_IntMask_SDmaCleanupDoneMask_0_LSB 0x38
+#define QIB_7322_IntMask_SDmaCleanupDoneMask_0_MSB 0x38
+#define QIB_7322_IntMask_SDmaCleanupDoneMask_0_RMASK 0x1
+#define QIB_7322_IntMask_RcvUrg17IntMask_LSB 0x31
+#define QIB_7322_IntMask_RcvUrg17IntMask_MSB 0x31
+#define QIB_7322_IntMask_RcvUrg17IntMask_RMASK 0x1
+#define QIB_7322_IntMask_RcvUrg16IntMask_LSB 0x30
+#define QIB_7322_IntMask_RcvUrg16IntMask_MSB 0x30
+#define QIB_7322_IntMask_RcvUrg16IntMask_RMASK 0x1
+#define QIB_7322_IntMask_RcvUrg15IntMask_LSB 0x2F
+#define QIB_7322_IntMask_RcvUrg15IntMask_MSB 0x2F
+#define QIB_7322_IntMask_RcvUrg15IntMask_RMASK 0x1
+#define QIB_7322_IntMask_RcvUrg14IntMask_LSB 0x2E
+#define QIB_7322_IntMask_RcvUrg14IntMask_MSB 0x2E
+#define QIB_7322_IntMask_RcvUrg14IntMask_RMASK 0x1
+#define QIB_7322_IntMask_RcvUrg13IntMask_LSB 0x2D
+#define QIB_7322_IntMask_RcvUrg13IntMask_MSB 0x2D
+#define QIB_7322_IntMask_RcvUrg13IntMask_RMASK 0x1
+#define QIB_7322_IntMask_RcvUrg12IntMask_LSB 0x2C
+#define QIB_7322_IntMask_RcvUrg12IntMask_MSB 0x2C
+#define QIB_7322_IntMask_RcvUrg12IntMask_RMASK 0x1
+#define QIB_7322_IntMask_RcvUrg11IntMask_LSB 0x2B
+#define QIB_7322_IntMask_RcvUrg11IntMask_MSB 0x2B
+#define QIB_7322_IntMask_RcvUrg11IntMask_RMASK 0x1
+#define QIB_7322_IntMask_RcvUrg10IntMask_LSB 0x2A
+#define QIB_7322_IntMask_RcvUrg10IntMask_MSB 0x2A
+#define QIB_7322_IntMask_RcvUrg10IntMask_RMASK 0x1
+#define QIB_7322_IntMask_RcvUrg9IntMask_LSB 0x29
+#define QIB_7322_IntMask_RcvUrg9IntMask_MSB 0x29
+#define QIB_7322_IntMask_RcvUrg9IntMask_RMASK 0x1
+#define QIB_7322_IntMask_RcvUrg8IntMask_LSB 0x28
+#define QIB_7322_IntMask_RcvUrg8IntMask_MSB 0x28
+#define QIB_7322_IntMask_RcvUrg8IntMask_RMASK 0x1
+#define QIB_7322_IntMask_RcvUrg7IntMask_LSB 0x27
+#define QIB_7322_IntMask_RcvUrg7IntMask_MSB 0x27
+#define QIB_7322_IntMask_RcvUrg7IntMask_RMASK 0x1
+#define QIB_7322_IntMask_RcvUrg6IntMask_LSB 0x26
+#define QIB_7322_IntMask_RcvUrg6IntMask_MSB 0x26
+#define QIB_7322_IntMask_RcvUrg6IntMask_RMASK 0x1
+#define QIB_7322_IntMask_RcvUrg5IntMask_LSB 0x25
+#define QIB_7322_IntMask_RcvUrg5IntMask_MSB 0x25
+#define QIB_7322_IntMask_RcvUrg5IntMask_RMASK 0x1
+#define QIB_7322_IntMask_RcvUrg4IntMask_LSB 0x24
+#define QIB_7322_IntMask_RcvUrg4IntMask_MSB 0x24
+#define QIB_7322_IntMask_RcvUrg4IntMask_RMASK 0x1
+#define QIB_7322_IntMask_RcvUrg3IntMask_LSB 0x23
+#define QIB_7322_IntMask_RcvUrg3IntMask_MSB 0x23
+#define QIB_7322_IntMask_RcvUrg3IntMask_RMASK 0x1
+#define QIB_7322_IntMask_RcvUrg2IntMask_LSB 0x22
+#define QIB_7322_IntMask_RcvUrg2IntMask_MSB 0x22
+#define QIB_7322_IntMask_RcvUrg2IntMask_RMASK 0x1
+#define QIB_7322_IntMask_RcvUrg1IntMask_LSB 0x21
+#define QIB_7322_IntMask_RcvUrg1IntMask_MSB 0x21
+#define QIB_7322_IntMask_RcvUrg1IntMask_RMASK 0x1
+#define QIB_7322_IntMask_RcvUrg0IntMask_LSB 0x20
+#define QIB_7322_IntMask_RcvUrg0IntMask_MSB 0x20
+#define QIB_7322_IntMask_RcvUrg0IntMask_RMASK 0x1
+#define QIB_7322_IntMask_ErrIntMask_1_LSB 0x1F
+#define QIB_7322_IntMask_ErrIntMask_1_MSB 0x1F
+#define QIB_7322_IntMask_ErrIntMask_1_RMASK 0x1
+#define QIB_7322_IntMask_ErrIntMask_0_LSB 0x1E
+#define QIB_7322_IntMask_ErrIntMask_0_MSB 0x1E
+#define QIB_7322_IntMask_ErrIntMask_0_RMASK 0x1
+#define QIB_7322_IntMask_ErrIntMask_LSB 0x1D
+#define QIB_7322_IntMask_ErrIntMask_MSB 0x1D
+#define QIB_7322_IntMask_ErrIntMask_RMASK 0x1
+#define QIB_7322_IntMask_AssertGPIOIntMask_LSB 0x1C
+#define QIB_7322_IntMask_AssertGPIOIntMask_MSB 0x1C
+#define QIB_7322_IntMask_AssertGPIOIntMask_RMASK 0x1
+#define QIB_7322_IntMask_SendDoneIntMask_1_LSB 0x19
+#define QIB_7322_IntMask_SendDoneIntMask_1_MSB 0x19
+#define QIB_7322_IntMask_SendDoneIntMask_1_RMASK 0x1
+#define QIB_7322_IntMask_SendDoneIntMask_0_LSB 0x18
+#define QIB_7322_IntMask_SendDoneIntMask_0_MSB 0x18
+#define QIB_7322_IntMask_SendDoneIntMask_0_RMASK 0x1
+#define QIB_7322_IntMask_SendBufAvailIntMask_LSB 0x17
+#define QIB_7322_IntMask_SendBufAvailIntMask_MSB 0x17
+#define QIB_7322_IntMask_SendBufAvailIntMask_RMASK 0x1
+#define QIB_7322_IntMask_RcvAvail17IntMask_LSB 0x11
+#define QIB_7322_IntMask_RcvAvail17IntMask_MSB 0x11
+#define QIB_7322_IntMask_RcvAvail17IntMask_RMASK 0x1
+#define QIB_7322_IntMask_RcvAvail16IntMask_LSB 0x10
+#define QIB_7322_IntMask_RcvAvail16IntMask_MSB 0x10
+#define QIB_7322_IntMask_RcvAvail16IntMask_RMASK 0x1
+#define QIB_7322_IntMask_RcvAvail15IntMask_LSB 0xF
+#define QIB_7322_IntMask_RcvAvail15IntMask_MSB 0xF
+#define QIB_7322_IntMask_RcvAvail15IntMask_RMASK 0x1
+#define QIB_7322_IntMask_RcvAvail14IntMask_LSB 0xE
+#define QIB_7322_IntMask_RcvAvail14IntMask_MSB 0xE
+#define QIB_7322_IntMask_RcvAvail14IntMask_RMASK 0x1
+#define QIB_7322_IntMask_RcvAvail13IntMask_LSB 0xD
+#define QIB_7322_IntMask_RcvAvail13IntMask_MSB 0xD
+#define QIB_7322_IntMask_RcvAvail13IntMask_RMASK 0x1
+#define QIB_7322_IntMask_RcvAvail12IntMask_LSB 0xC
+#define QIB_7322_IntMask_RcvAvail12IntMask_MSB 0xC
+#define QIB_7322_IntMask_RcvAvail12IntMask_RMASK 0x1
+#define QIB_7322_IntMask_RcvAvail11IntMask_LSB 0xB
+#define QIB_7322_IntMask_RcvAvail11IntMask_MSB 0xB
+#define QIB_7322_IntMask_RcvAvail11IntMask_RMASK 0x1
+#define QIB_7322_IntMask_RcvAvail10IntMask_LSB 0xA
+#define QIB_7322_IntMask_RcvAvail10IntMask_MSB 0xA
+#define QIB_7322_IntMask_RcvAvail10IntMask_RMASK 0x1
+#define QIB_7322_IntMask_RcvAvail9IntMask_LSB 0x9
+#define QIB_7322_IntMask_RcvAvail9IntMask_MSB 0x9
+#define QIB_7322_IntMask_RcvAvail9IntMask_RMASK 0x1
+#define QIB_7322_IntMask_RcvAvail8IntMask_LSB 0x8
+#define QIB_7322_IntMask_RcvAvail8IntMask_MSB 0x8
+#define QIB_7322_IntMask_RcvAvail8IntMask_RMASK 0x1
+#define QIB_7322_IntMask_RcvAvail7IntMask_LSB 0x7
+#define QIB_7322_IntMask_RcvAvail7IntMask_MSB 0x7
+#define QIB_7322_IntMask_RcvAvail7IntMask_RMASK 0x1
+#define QIB_7322_IntMask_RcvAvail6IntMask_LSB 0x6
+#define QIB_7322_IntMask_RcvAvail6IntMask_MSB 0x6
+#define QIB_7322_IntMask_RcvAvail6IntMask_RMASK 0x1
+#define QIB_7322_IntMask_RcvAvail5IntMask_LSB 0x5
+#define QIB_7322_IntMask_RcvAvail5IntMask_MSB 0x5
+#define QIB_7322_IntMask_RcvAvail5IntMask_RMASK 0x1
+#define QIB_7322_IntMask_RcvAvail4IntMask_LSB 0x4
+#define QIB_7322_IntMask_RcvAvail4IntMask_MSB 0x4
+#define QIB_7322_IntMask_RcvAvail4IntMask_RMASK 0x1
+#define QIB_7322_IntMask_RcvAvail3IntMask_LSB 0x3
+#define QIB_7322_IntMask_RcvAvail3IntMask_MSB 0x3
+#define QIB_7322_IntMask_RcvAvail3IntMask_RMASK 0x1
+#define QIB_7322_IntMask_RcvAvail2IntMask_LSB 0x2
+#define QIB_7322_IntMask_RcvAvail2IntMask_MSB 0x2
+#define QIB_7322_IntMask_RcvAvail2IntMask_RMASK 0x1
+#define QIB_7322_IntMask_RcvAvail1IntMask_LSB 0x1
+#define QIB_7322_IntMask_RcvAvail1IntMask_MSB 0x1
+#define QIB_7322_IntMask_RcvAvail1IntMask_RMASK 0x1
+#define QIB_7322_IntMask_RcvAvail0IntMask_LSB 0x0
+#define QIB_7322_IntMask_RcvAvail0IntMask_MSB 0x0
+#define QIB_7322_IntMask_RcvAvail0IntMask_RMASK 0x1
+
+#define QIB_7322_IntStatus_OFFS 0x70
+#define QIB_7322_IntStatus_DEF 0x0000000000000000
+#define QIB_7322_IntStatus_SDmaInt_1_LSB 0x3F
+#define QIB_7322_IntStatus_SDmaInt_1_MSB 0x3F
+#define QIB_7322_IntStatus_SDmaInt_1_RMASK 0x1
+#define QIB_7322_IntStatus_SDmaInt_0_LSB 0x3E
+#define QIB_7322_IntStatus_SDmaInt_0_MSB 0x3E
+#define QIB_7322_IntStatus_SDmaInt_0_RMASK 0x1
+#define QIB_7322_IntStatus_SDmaProgressInt_1_LSB 0x3D
+#define QIB_7322_IntStatus_SDmaProgressInt_1_MSB 0x3D
+#define QIB_7322_IntStatus_SDmaProgressInt_1_RMASK 0x1
+#define QIB_7322_IntStatus_SDmaProgressInt_0_LSB 0x3C
+#define QIB_7322_IntStatus_SDmaProgressInt_0_MSB 0x3C
+#define QIB_7322_IntStatus_SDmaProgressInt_0_RMASK 0x1
+#define QIB_7322_IntStatus_SDmaIdleInt_1_LSB 0x3B
+#define QIB_7322_IntStatus_SDmaIdleInt_1_MSB 0x3B
+#define QIB_7322_IntStatus_SDmaIdleInt_1_RMASK 0x1
+#define QIB_7322_IntStatus_SDmaIdleInt_0_LSB 0x3A
+#define QIB_7322_IntStatus_SDmaIdleInt_0_MSB 0x3A
+#define QIB_7322_IntStatus_SDmaIdleInt_0_RMASK 0x1
+#define QIB_7322_IntStatus_SDmaCleanupDone_1_LSB 0x39
+#define QIB_7322_IntStatus_SDmaCleanupDone_1_MSB 0x39
+#define QIB_7322_IntStatus_SDmaCleanupDone_1_RMASK 0x1
+#define QIB_7322_IntStatus_SDmaCleanupDone_0_LSB 0x38
+#define QIB_7322_IntStatus_SDmaCleanupDone_0_MSB 0x38
+#define QIB_7322_IntStatus_SDmaCleanupDone_0_RMASK 0x1
+#define QIB_7322_IntStatus_RcvUrg17_LSB 0x31
+#define QIB_7322_IntStatus_RcvUrg17_MSB 0x31
+#define QIB_7322_IntStatus_RcvUrg17_RMASK 0x1
+#define QIB_7322_IntStatus_RcvUrg16_LSB 0x30
+#define QIB_7322_IntStatus_RcvUrg16_MSB 0x30
+#define QIB_7322_IntStatus_RcvUrg16_RMASK 0x1
+#define QIB_7322_IntStatus_RcvUrg15_LSB 0x2F
+#define QIB_7322_IntStatus_RcvUrg15_MSB 0x2F
+#define QIB_7322_IntStatus_RcvUrg15_RMASK 0x1
+#define QIB_7322_IntStatus_RcvUrg14_LSB 0x2E
+#define QIB_7322_IntStatus_RcvUrg14_MSB 0x2E
+#define QIB_7322_IntStatus_RcvUrg14_RMASK 0x1
+#define QIB_7322_IntStatus_RcvUrg13_LSB 0x2D
+#define QIB_7322_IntStatus_RcvUrg13_MSB 0x2D
+#define QIB_7322_IntStatus_RcvUrg13_RMASK 0x1
+#define QIB_7322_IntStatus_RcvUrg12_LSB 0x2C
+#define QIB_7322_IntStatus_RcvUrg12_MSB 0x2C
+#define QIB_7322_IntStatus_RcvUrg12_RMASK 0x1
+#define QIB_7322_IntStatus_RcvUrg11_LSB 0x2B
+#define QIB_7322_IntStatus_RcvUrg11_MSB 0x2B
+#define QIB_7322_IntStatus_RcvUrg11_RMASK 0x1
+#define QIB_7322_IntStatus_RcvUrg10_LSB 0x2A
+#define QIB_7322_IntStatus_RcvUrg10_MSB 0x2A
+#define QIB_7322_IntStatus_RcvUrg10_RMASK 0x1
+#define QIB_7322_IntStatus_RcvUrg9_LSB 0x29
+#define QIB_7322_IntStatus_RcvUrg9_MSB 0x29
+#define QIB_7322_IntStatus_RcvUrg9_RMASK 0x1
+#define QIB_7322_IntStatus_RcvUrg8_LSB 0x28
+#define QIB_7322_IntStatus_RcvUrg8_MSB 0x28
+#define QIB_7322_IntStatus_RcvUrg8_RMASK 0x1
+#define QIB_7322_IntStatus_RcvUrg7_LSB 0x27
+#define QIB_7322_IntStatus_RcvUrg7_MSB 0x27
+#define QIB_7322_IntStatus_RcvUrg7_RMASK 0x1
+#define QIB_7322_IntStatus_RcvUrg6_LSB 0x26
+#define QIB_7322_IntStatus_RcvUrg6_MSB 0x26
+#define QIB_7322_IntStatus_RcvUrg6_RMASK 0x1
+#define QIB_7322_IntStatus_RcvUrg5_LSB 0x25
+#define QIB_7322_IntStatus_RcvUrg5_MSB 0x25
+#define QIB_7322_IntStatus_RcvUrg5_RMASK 0x1
+#define QIB_7322_IntStatus_RcvUrg4_LSB 0x24
+#define QIB_7322_IntStatus_RcvUrg4_MSB 0x24
+#define QIB_7322_IntStatus_RcvUrg4_RMASK 0x1
+#define QIB_7322_IntStatus_RcvUrg3_LSB 0x23
+#define QIB_7322_IntStatus_RcvUrg3_MSB 0x23
+#define QIB_7322_IntStatus_RcvUrg3_RMASK 0x1
+#define QIB_7322_IntStatus_RcvUrg2_LSB 0x22
+#define QIB_7322_IntStatus_RcvUrg2_MSB 0x22
+#define QIB_7322_IntStatus_RcvUrg2_RMASK 0x1
+#define QIB_7322_IntStatus_RcvUrg1_LSB 0x21
+#define QIB_7322_IntStatus_RcvUrg1_MSB 0x21
+#define QIB_7322_IntStatus_RcvUrg1_RMASK 0x1
+#define QIB_7322_IntStatus_RcvUrg0_LSB 0x20
+#define QIB_7322_IntStatus_RcvUrg0_MSB 0x20
+#define QIB_7322_IntStatus_RcvUrg0_RMASK 0x1
+#define QIB_7322_IntStatus_Err_1_LSB 0x1F
+#define QIB_7322_IntStatus_Err_1_MSB 0x1F
+#define QIB_7322_IntStatus_Err_1_RMASK 0x1
+#define QIB_7322_IntStatus_Err_0_LSB 0x1E
+#define QIB_7322_IntStatus_Err_0_MSB 0x1E
+#define QIB_7322_IntStatus_Err_0_RMASK 0x1
+#define QIB_7322_IntStatus_Err_LSB 0x1D
+#define QIB_7322_IntStatus_Err_MSB 0x1D
+#define QIB_7322_IntStatus_Err_RMASK 0x1
+#define QIB_7322_IntStatus_AssertGPIO_LSB 0x1C
+#define QIB_7322_IntStatus_AssertGPIO_MSB 0x1C
+#define QIB_7322_IntStatus_AssertGPIO_RMASK 0x1
+#define QIB_7322_IntStatus_SendDone_1_LSB 0x19
+#define QIB_7322_IntStatus_SendDone_1_MSB 0x19
+#define QIB_7322_IntStatus_SendDone_1_RMASK 0x1
+#define QIB_7322_IntStatus_SendDone_0_LSB 0x18
+#define QIB_7322_IntStatus_SendDone_0_MSB 0x18
+#define QIB_7322_IntStatus_SendDone_0_RMASK 0x1
+#define QIB_7322_IntStatus_SendBufAvail_LSB 0x17
+#define QIB_7322_IntStatus_SendBufAvail_MSB 0x17
+#define QIB_7322_IntStatus_SendBufAvail_RMASK 0x1
+#define QIB_7322_IntStatus_RcvAvail17_LSB 0x11
+#define QIB_7322_IntStatus_RcvAvail17_MSB 0x11
+#define QIB_7322_IntStatus_RcvAvail17_RMASK 0x1
+#define QIB_7322_IntStatus_RcvAvail16_LSB 0x10
+#define QIB_7322_IntStatus_RcvAvail16_MSB 0x10
+#define QIB_7322_IntStatus_RcvAvail16_RMASK 0x1
+#define QIB_7322_IntStatus_RcvAvail15_LSB 0xF
+#define QIB_7322_IntStatus_RcvAvail15_MSB 0xF
+#define QIB_7322_IntStatus_RcvAvail15_RMASK 0x1
+#define QIB_7322_IntStatus_RcvAvail14_LSB 0xE
+#define QIB_7322_IntStatus_RcvAvail14_MSB 0xE
+#define QIB_7322_IntStatus_RcvAvail14_RMASK 0x1
+#define QIB_7322_IntStatus_RcvAvail13_LSB 0xD
+#define QIB_7322_IntStatus_RcvAvail13_MSB 0xD
+#define QIB_7322_IntStatus_RcvAvail13_RMASK 0x1
+#define QIB_7322_IntStatus_RcvAvail12_LSB 0xC
+#define QIB_7322_IntStatus_RcvAvail12_MSB 0xC
+#define QIB_7322_IntStatus_RcvAvail12_RMASK 0x1
+#define QIB_7322_IntStatus_RcvAvail11_LSB 0xB
+#define QIB_7322_IntStatus_RcvAvail11_MSB 0xB
+#define QIB_7322_IntStatus_RcvAvail11_RMASK 0x1
+#define QIB_7322_IntStatus_RcvAvail10_LSB 0xA
+#define QIB_7322_IntStatus_RcvAvail10_MSB 0xA
+#define QIB_7322_IntStatus_RcvAvail10_RMASK 0x1
+#define QIB_7322_IntStatus_RcvAvail9_LSB 0x9
+#define QIB_7322_IntStatus_RcvAvail9_MSB 0x9
+#define QIB_7322_IntStatus_RcvAvail9_RMASK 0x1
+#define QIB_7322_IntStatus_RcvAvail8_LSB 0x8
+#define QIB_7322_IntStatus_RcvAvail8_MSB 0x8
+#define QIB_7322_IntStatus_RcvAvail8_RMASK 0x1
+#define QIB_7322_IntStatus_RcvAvail7_LSB 0x7
+#define QIB_7322_IntStatus_RcvAvail7_MSB 0x7
+#define QIB_7322_IntStatus_RcvAvail7_RMASK 0x1
+#define QIB_7322_IntStatus_RcvAvail6_LSB 0x6
+#define QIB_7322_IntStatus_RcvAvail6_MSB 0x6
+#define QIB_7322_IntStatus_RcvAvail6_RMASK 0x1
+#define QIB_7322_IntStatus_RcvAvail5_LSB 0x5
+#define QIB_7322_IntStatus_RcvAvail5_MSB 0x5
+#define QIB_7322_IntStatus_RcvAvail5_RMASK 0x1
+#define QIB_7322_IntStatus_RcvAvail4_LSB 0x4
+#define QIB_7322_IntStatus_RcvAvail4_MSB 0x4
+#define QIB_7322_IntStatus_RcvAvail4_RMASK 0x1
+#define QIB_7322_IntStatus_RcvAvail3_LSB 0x3
+#define QIB_7322_IntStatus_RcvAvail3_MSB 0x3
+#define QIB_7322_IntStatus_RcvAvail3_RMASK 0x1
+#define QIB_7322_IntStatus_RcvAvail2_LSB 0x2
+#define QIB_7322_IntStatus_RcvAvail2_MSB 0x2
+#define QIB_7322_IntStatus_RcvAvail2_RMASK 0x1
+#define QIB_7322_IntStatus_RcvAvail1_LSB 0x1
+#define QIB_7322_IntStatus_RcvAvail1_MSB 0x1
+#define QIB_7322_IntStatus_RcvAvail1_RMASK 0x1
+#define QIB_7322_IntStatus_RcvAvail0_LSB 0x0
+#define QIB_7322_IntStatus_RcvAvail0_MSB 0x0
+#define QIB_7322_IntStatus_RcvAvail0_RMASK 0x1
+
+#define QIB_7322_IntClear_OFFS 0x78
+#define QIB_7322_IntClear_DEF 0x0000000000000000
+#define QIB_7322_IntClear_SDmaIntClear_1_LSB 0x3F
+#define QIB_7322_IntClear_SDmaIntClear_1_MSB 0x3F
+#define QIB_7322_IntClear_SDmaIntClear_1_RMASK 0x1
+#define QIB_7322_IntClear_SDmaIntClear_0_LSB 0x3E
+#define QIB_7322_IntClear_SDmaIntClear_0_MSB 0x3E
+#define QIB_7322_IntClear_SDmaIntClear_0_RMASK 0x1
+#define QIB_7322_IntClear_SDmaProgressIntClear_1_LSB 0x3D
+#define QIB_7322_IntClear_SDmaProgressIntClear_1_MSB 0x3D
+#define QIB_7322_IntClear_SDmaProgressIntClear_1_RMASK 0x1
+#define QIB_7322_IntClear_SDmaProgressIntClear_0_LSB 0x3C
+#define QIB_7322_IntClear_SDmaProgressIntClear_0_MSB 0x3C
+#define QIB_7322_IntClear_SDmaProgressIntClear_0_RMASK 0x1
+#define QIB_7322_IntClear_SDmaIdleIntClear_1_LSB 0x3B
+#define QIB_7322_IntClear_SDmaIdleIntClear_1_MSB 0x3B
+#define QIB_7322_IntClear_SDmaIdleIntClear_1_RMASK 0x1
+#define QIB_7322_IntClear_SDmaIdleIntClear_0_LSB 0x3A
+#define QIB_7322_IntClear_SDmaIdleIntClear_0_MSB 0x3A
+#define QIB_7322_IntClear_SDmaIdleIntClear_0_RMASK 0x1
+#define QIB_7322_IntClear_SDmaCleanupDoneClear_1_LSB 0x39
+#define QIB_7322_IntClear_SDmaCleanupDoneClear_1_MSB 0x39
+#define QIB_7322_IntClear_SDmaCleanupDoneClear_1_RMASK 0x1
+#define QIB_7322_IntClear_SDmaCleanupDoneClear_0_LSB 0x38
+#define QIB_7322_IntClear_SDmaCleanupDoneClear_0_MSB 0x38
+#define QIB_7322_IntClear_SDmaCleanupDoneClear_0_RMASK 0x1
+#define QIB_7322_IntClear_RcvUrg17IntClear_LSB 0x31
+#define QIB_7322_IntClear_RcvUrg17IntClear_MSB 0x31
+#define QIB_7322_IntClear_RcvUrg17IntClear_RMASK 0x1
+#define QIB_7322_IntClear_RcvUrg16IntClear_LSB 0x30
+#define QIB_7322_IntClear_RcvUrg16IntClear_MSB 0x30
+#define QIB_7322_IntClear_RcvUrg16IntClear_RMASK 0x1
+#define QIB_7322_IntClear_RcvUrg15IntClear_LSB 0x2F
+#define QIB_7322_IntClear_RcvUrg15IntClear_MSB 0x2F
+#define QIB_7322_IntClear_RcvUrg15IntClear_RMASK 0x1
+#define QIB_7322_IntClear_RcvUrg14IntClear_LSB 0x2E
+#define QIB_7322_IntClear_RcvUrg14IntClear_MSB 0x2E
+#define QIB_7322_IntClear_RcvUrg14IntClear_RMASK 0x1
+#define QIB_7322_IntClear_RcvUrg13IntClear_LSB 0x2D
+#define QIB_7322_IntClear_RcvUrg13IntClear_MSB 0x2D
+#define QIB_7322_IntClear_RcvUrg13IntClear_RMASK 0x1
+#define QIB_7322_IntClear_RcvUrg12IntClear_LSB 0x2C
+#define QIB_7322_IntClear_RcvUrg12IntClear_MSB 0x2C
+#define QIB_7322_IntClear_RcvUrg12IntClear_RMASK 0x1
+#define QIB_7322_IntClear_RcvUrg11IntClear_LSB 0x2B
+#define QIB_7322_IntClear_RcvUrg11IntClear_MSB 0x2B
+#define QIB_7322_IntClear_RcvUrg11IntClear_RMASK 0x1
+#define QIB_7322_IntClear_RcvUrg10IntClear_LSB 0x2A
+#define QIB_7322_IntClear_RcvUrg10IntClear_MSB 0x2A
+#define QIB_7322_IntClear_RcvUrg10IntClear_RMASK 0x1
+#define QIB_7322_IntClear_RcvUrg9IntClear_LSB 0x29
+#define QIB_7322_IntClear_RcvUrg9IntClear_MSB 0x29
+#define QIB_7322_IntClear_RcvUrg9IntClear_RMASK 0x1
+#define QIB_7322_IntClear_RcvUrg8IntClear_LSB 0x28
+#define QIB_7322_IntClear_RcvUrg8IntClear_MSB 0x28
+#define QIB_7322_IntClear_RcvUrg8IntClear_RMASK 0x1
+#define QIB_7322_IntClear_RcvUrg7IntClear_LSB 0x27
+#define QIB_7322_IntClear_RcvUrg7IntClear_MSB 0x27
+#define QIB_7322_IntClear_RcvUrg7IntClear_RMASK 0x1
+#define QIB_7322_IntClear_RcvUrg6IntClear_LSB 0x26
+#define QIB_7322_IntClear_RcvUrg6IntClear_MSB 0x26
+#define QIB_7322_IntClear_RcvUrg6IntClear_RMASK 0x1
+#define QIB_7322_IntClear_RcvUrg5IntClear_LSB 0x25
+#define QIB_7322_IntClear_RcvUrg5IntClear_MSB 0x25
+#define QIB_7322_IntClear_RcvUrg5IntClear_RMASK 0x1
+#define QIB_7322_IntClear_RcvUrg4IntClear_LSB 0x24
+#define QIB_7322_IntClear_RcvUrg4IntClear_MSB 0x24
+#define QIB_7322_IntClear_RcvUrg4IntClear_RMASK 0x1
+#define QIB_7322_IntClear_RcvUrg3IntClear_LSB 0x23
+#define QIB_7322_IntClear_RcvUrg3IntClear_MSB 0x23
+#define QIB_7322_IntClear_RcvUrg3IntClear_RMASK 0x1
+#define QIB_7322_IntClear_RcvUrg2IntClear_LSB 0x22
+#define QIB_7322_IntClear_RcvUrg2IntClear_MSB 0x22
+#define QIB_7322_IntClear_RcvUrg2IntClear_RMASK 0x1
+#define QIB_7322_IntClear_RcvUrg1IntClear_LSB 0x21
+#define QIB_7322_IntClear_RcvUrg1IntClear_MSB 0x21
+#define QIB_7322_IntClear_RcvUrg1IntClear_RMASK 0x1
+#define QIB_7322_IntClear_RcvUrg0IntClear_LSB 0x20
+#define QIB_7322_IntClear_RcvUrg0IntClear_MSB 0x20
+#define QIB_7322_IntClear_RcvUrg0IntClear_RMASK 0x1
+#define QIB_7322_IntClear_ErrIntClear_1_LSB 0x1F
+#define QIB_7322_IntClear_ErrIntClear_1_MSB 0x1F
+#define QIB_7322_IntClear_ErrIntClear_1_RMASK 0x1
+#define QIB_7322_IntClear_ErrIntClear_0_LSB 0x1E
+#define QIB_7322_IntClear_ErrIntClear_0_MSB 0x1E
+#define QIB_7322_IntClear_ErrIntClear_0_RMASK 0x1
+#define QIB_7322_IntClear_ErrIntClear_LSB 0x1D
+#define QIB_7322_IntClear_ErrIntClear_MSB 0x1D
+#define QIB_7322_IntClear_ErrIntClear_RMASK 0x1
+#define QIB_7322_IntClear_AssertGPIOIntClear_LSB 0x1C
+#define QIB_7322_IntClear_AssertGPIOIntClear_MSB 0x1C
+#define QIB_7322_IntClear_AssertGPIOIntClear_RMASK 0x1
+#define QIB_7322_IntClear_SendDoneIntClear_1_LSB 0x19
+#define QIB_7322_IntClear_SendDoneIntClear_1_MSB 0x19
+#define QIB_7322_IntClear_SendDoneIntClear_1_RMASK 0x1
+#define QIB_7322_IntClear_SendDoneIntClear_0_LSB 0x18
+#define QIB_7322_IntClear_SendDoneIntClear_0_MSB 0x18
+#define QIB_7322_IntClear_SendDoneIntClear_0_RMASK 0x1
+#define QIB_7322_IntClear_SendBufAvailIntClear_LSB 0x17
+#define QIB_7322_IntClear_SendBufAvailIntClear_MSB 0x17
+#define QIB_7322_IntClear_SendBufAvailIntClear_RMASK 0x1
+#define QIB_7322_IntClear_RcvAvail17IntClear_LSB 0x11
+#define QIB_7322_IntClear_RcvAvail17IntClear_MSB 0x11
+#define QIB_7322_IntClear_RcvAvail17IntClear_RMASK 0x1
+#define QIB_7322_IntClear_RcvAvail16IntClear_LSB 0x10
+#define QIB_7322_IntClear_RcvAvail16IntClear_MSB 0x10
+#define QIB_7322_IntClear_RcvAvail16IntClear_RMASK 0x1
+#define QIB_7322_IntClear_RcvAvail15IntClear_LSB 0xF
+#define QIB_7322_IntClear_RcvAvail15IntClear_MSB 0xF
+#define QIB_7322_IntClear_RcvAvail15IntClear_RMASK 0x1
+#define QIB_7322_IntClear_RcvAvail14IntClear_LSB 0xE
+#define QIB_7322_IntClear_RcvAvail14IntClear_MSB 0xE
+#define QIB_7322_IntClear_RcvAvail14IntClear_RMASK 0x1
+#define QIB_7322_IntClear_RcvAvail13IntClear_LSB 0xD
+#define QIB_7322_IntClear_RcvAvail13IntClear_MSB 0xD
+#define QIB_7322_IntClear_RcvAvail13IntClear_RMASK 0x1
+#define QIB_7322_IntClear_RcvAvail12IntClear_LSB 0xC
+#define QIB_7322_IntClear_RcvAvail12IntClear_MSB 0xC
+#define QIB_7322_IntClear_RcvAvail12IntClear_RMASK 0x1
+#define QIB_7322_IntClear_RcvAvail11IntClear_LSB 0xB
+#define QIB_7322_IntClear_RcvAvail11IntClear_MSB 0xB
+#define QIB_7322_IntClear_RcvAvail11IntClear_RMASK 0x1
+#define QIB_7322_IntClear_RcvAvail10IntClear_LSB 0xA
+#define QIB_7322_IntClear_RcvAvail10IntClear_MSB 0xA
+#define QIB_7322_IntClear_RcvAvail10IntClear_RMASK 0x1
+#define QIB_7322_IntClear_RcvAvail9IntClear_LSB 0x9
+#define QIB_7322_IntClear_RcvAvail9IntClear_MSB 0x9
+#define QIB_7322_IntClear_RcvAvail9IntClear_RMASK 0x1
+#define QIB_7322_IntClear_RcvAvail8IntClear_LSB 0x8
+#define QIB_7322_IntClear_RcvAvail8IntClear_MSB 0x8
+#define QIB_7322_IntClear_RcvAvail8IntClear_RMASK 0x1
+#define QIB_7322_IntClear_RcvAvail7IntClear_LSB 0x7
+#define QIB_7322_IntClear_RcvAvail7IntClear_MSB 0x7
+#define QIB_7322_IntClear_RcvAvail7IntClear_RMASK 0x1
+#define QIB_7322_IntClear_RcvAvail6IntClear_LSB 0x6
+#define QIB_7322_IntClear_RcvAvail6IntClear_MSB 0x6
+#define QIB_7322_IntClear_RcvAvail6IntClear_RMASK 0x1
+#define QIB_7322_IntClear_RcvAvail5IntClear_LSB 0x5
+#define QIB_7322_IntClear_RcvAvail5IntClear_MSB 0x5
+#define QIB_7322_IntClear_RcvAvail5IntClear_RMASK 0x1
+#define QIB_7322_IntClear_RcvAvail4IntClear_LSB 0x4
+#define QIB_7322_IntClear_RcvAvail4IntClear_MSB 0x4
+#define QIB_7322_IntClear_RcvAvail4IntClear_RMASK 0x1
+#define QIB_7322_IntClear_RcvAvail3IntClear_LSB 0x3
+#define QIB_7322_IntClear_RcvAvail3IntClear_MSB 0x3
+#define QIB_7322_IntClear_RcvAvail3IntClear_RMASK 0x1
+#define QIB_7322_IntClear_RcvAvail2IntClear_LSB 0x2
+#define QIB_7322_IntClear_RcvAvail2IntClear_MSB 0x2
+#define QIB_7322_IntClear_RcvAvail2IntClear_RMASK 0x1
+#define QIB_7322_IntClear_RcvAvail1IntClear_LSB 0x1
+#define QIB_7322_IntClear_RcvAvail1IntClear_MSB 0x1
+#define QIB_7322_IntClear_RcvAvail1IntClear_RMASK 0x1
+#define QIB_7322_IntClear_RcvAvail0IntClear_LSB 0x0
+#define QIB_7322_IntClear_RcvAvail0IntClear_MSB 0x0
+#define QIB_7322_IntClear_RcvAvail0IntClear_RMASK 0x1
+
+#define QIB_7322_ErrMask_OFFS 0x80
+#define QIB_7322_ErrMask_DEF 0x0000000000000000
+#define QIB_7322_ErrMask_ResetNegatedMask_LSB 0x3F
+#define QIB_7322_ErrMask_ResetNegatedMask_MSB 0x3F
+#define QIB_7322_ErrMask_ResetNegatedMask_RMASK 0x1
+#define QIB_7322_ErrMask_HardwareErrMask_LSB 0x3E
+#define QIB_7322_ErrMask_HardwareErrMask_MSB 0x3E
+#define QIB_7322_ErrMask_HardwareErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_InvalidAddrErrMask_LSB 0x3D
+#define QIB_7322_ErrMask_InvalidAddrErrMask_MSB 0x3D
+#define QIB_7322_ErrMask_InvalidAddrErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_SDmaVL15ErrMask_LSB 0x38
+#define QIB_7322_ErrMask_SDmaVL15ErrMask_MSB 0x38
+#define QIB_7322_ErrMask_SDmaVL15ErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_SBufVL15MisUseErrMask_LSB 0x37
+#define QIB_7322_ErrMask_SBufVL15MisUseErrMask_MSB 0x37
+#define QIB_7322_ErrMask_SBufVL15MisUseErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_InvalidEEPCmdMask_LSB 0x35
+#define QIB_7322_ErrMask_InvalidEEPCmdMask_MSB 0x35
+#define QIB_7322_ErrMask_InvalidEEPCmdMask_RMASK 0x1
+#define QIB_7322_ErrMask_RcvContextShareErrMask_LSB 0x34
+#define QIB_7322_ErrMask_RcvContextShareErrMask_MSB 0x34
+#define QIB_7322_ErrMask_RcvContextShareErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_SendVLMismatchErrMask_LSB 0x24
+#define QIB_7322_ErrMask_SendVLMismatchErrMask_MSB 0x24
+#define QIB_7322_ErrMask_SendVLMismatchErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_SendArmLaunchErrMask_LSB 0x23
+#define QIB_7322_ErrMask_SendArmLaunchErrMask_MSB 0x23
+#define QIB_7322_ErrMask_SendArmLaunchErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_SendSpecialTriggerErrMask_LSB 0x1B
+#define QIB_7322_ErrMask_SendSpecialTriggerErrMask_MSB 0x1B
+#define QIB_7322_ErrMask_SendSpecialTriggerErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_SDmaWrongPortErrMask_LSB 0x1A
+#define QIB_7322_ErrMask_SDmaWrongPortErrMask_MSB 0x1A
+#define QIB_7322_ErrMask_SDmaWrongPortErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_SDmaBufMaskDuplicateErrMask_LSB 0x19
+#define QIB_7322_ErrMask_SDmaBufMaskDuplicateErrMask_MSB 0x19
+#define QIB_7322_ErrMask_SDmaBufMaskDuplicateErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_RcvHdrFullErrMask_LSB 0xD
+#define QIB_7322_ErrMask_RcvHdrFullErrMask_MSB 0xD
+#define QIB_7322_ErrMask_RcvHdrFullErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_RcvEgrFullErrMask_LSB 0xC
+#define QIB_7322_ErrMask_RcvEgrFullErrMask_MSB 0xC
+#define QIB_7322_ErrMask_RcvEgrFullErrMask_RMASK 0x1
+
+#define QIB_7322_ErrStatus_OFFS 0x88
+#define QIB_7322_ErrStatus_DEF 0x0000000000000000
+#define QIB_7322_ErrStatus_ResetNegated_LSB 0x3F
+#define QIB_7322_ErrStatus_ResetNegated_MSB 0x3F
+#define QIB_7322_ErrStatus_ResetNegated_RMASK 0x1
+#define QIB_7322_ErrStatus_HardwareErr_LSB 0x3E
+#define QIB_7322_ErrStatus_HardwareErr_MSB 0x3E
+#define QIB_7322_ErrStatus_HardwareErr_RMASK 0x1
+#define QIB_7322_ErrStatus_InvalidAddrErr_LSB 0x3D
+#define QIB_7322_ErrStatus_InvalidAddrErr_MSB 0x3D
+#define QIB_7322_ErrStatus_InvalidAddrErr_RMASK 0x1
+#define QIB_7322_ErrStatus_SDmaVL15Err_LSB 0x38
+#define QIB_7322_ErrStatus_SDmaVL15Err_MSB 0x38
+#define QIB_7322_ErrStatus_SDmaVL15Err_RMASK 0x1
+#define QIB_7322_ErrStatus_SBufVL15MisUseErr_LSB 0x37
+#define QIB_7322_ErrStatus_SBufVL15MisUseErr_MSB 0x37
+#define QIB_7322_ErrStatus_SBufVL15MisUseErr_RMASK 0x1
+#define QIB_7322_ErrStatus_InvalidEEPCmdErr_LSB 0x35
+#define QIB_7322_ErrStatus_InvalidEEPCmdErr_MSB 0x35
+#define QIB_7322_ErrStatus_InvalidEEPCmdErr_RMASK 0x1
+#define QIB_7322_ErrStatus_RcvContextShareErr_LSB 0x34
+#define QIB_7322_ErrStatus_RcvContextShareErr_MSB 0x34
+#define QIB_7322_ErrStatus_RcvContextShareErr_RMASK 0x1
+#define QIB_7322_ErrStatus_SendVLMismatchErr_LSB 0x24
+#define QIB_7322_ErrStatus_SendVLMismatchErr_MSB 0x24
+#define QIB_7322_ErrStatus_SendVLMismatchErr_RMASK 0x1
+#define QIB_7322_ErrStatus_SendArmLaunchErr_LSB 0x23
+#define QIB_7322_ErrStatus_SendArmLaunchErr_MSB 0x23
+#define QIB_7322_ErrStatus_SendArmLaunchErr_RMASK 0x1
+#define QIB_7322_ErrStatus_SendSpecialTriggerErr_LSB 0x1B
+#define QIB_7322_ErrStatus_SendSpecialTriggerErr_MSB 0x1B
+#define QIB_7322_ErrStatus_SendSpecialTriggerErr_RMASK 0x1
+#define QIB_7322_ErrStatus_SDmaWrongPortErr_LSB 0x1A
+#define QIB_7322_ErrStatus_SDmaWrongPortErr_MSB 0x1A
+#define QIB_7322_ErrStatus_SDmaWrongPortErr_RMASK 0x1
+#define QIB_7322_ErrStatus_SDmaBufMaskDuplicateErr_LSB 0x19
+#define QIB_7322_ErrStatus_SDmaBufMaskDuplicateErr_MSB 0x19
+#define QIB_7322_ErrStatus_SDmaBufMaskDuplicateErr_RMASK 0x1
+#define QIB_7322_ErrStatus_RcvHdrFullErr_LSB 0xD
+#define QIB_7322_ErrStatus_RcvHdrFullErr_MSB 0xD
+#define QIB_7322_ErrStatus_RcvHdrFullErr_RMASK 0x1
+#define QIB_7322_ErrStatus_RcvEgrFullErr_LSB 0xC
+#define QIB_7322_ErrStatus_RcvEgrFullErr_MSB 0xC
+#define QIB_7322_ErrStatus_RcvEgrFullErr_RMASK 0x1
+
+#define QIB_7322_ErrClear_OFFS 0x90
+#define QIB_7322_ErrClear_DEF 0x0000000000000000
+#define QIB_7322_ErrClear_ResetNegatedClear_LSB 0x3F
+#define QIB_7322_ErrClear_ResetNegatedClear_MSB 0x3F
+#define QIB_7322_ErrClear_ResetNegatedClear_RMASK 0x1
+#define QIB_7322_ErrClear_HardwareErrClear_LSB 0x3E
+#define QIB_7322_ErrClear_HardwareErrClear_MSB 0x3E
+#define QIB_7322_ErrClear_HardwareErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_InvalidAddrErrClear_LSB 0x3D
+#define QIB_7322_ErrClear_InvalidAddrErrClear_MSB 0x3D
+#define QIB_7322_ErrClear_InvalidAddrErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_SDmaVL15ErrClear_LSB 0x38
+#define QIB_7322_ErrClear_SDmaVL15ErrClear_MSB 0x38
+#define QIB_7322_ErrClear_SDmaVL15ErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_SBufVL15MisUseErrClear_LSB 0x37
+#define QIB_7322_ErrClear_SBufVL15MisUseErrClear_MSB 0x37
+#define QIB_7322_ErrClear_SBufVL15MisUseErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_InvalidEEPCmdErrClear_LSB 0x35
+#define QIB_7322_ErrClear_InvalidEEPCmdErrClear_MSB 0x35
+#define QIB_7322_ErrClear_InvalidEEPCmdErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_RcvContextShareErrClear_LSB 0x34
+#define QIB_7322_ErrClear_RcvContextShareErrClear_MSB 0x34
+#define QIB_7322_ErrClear_RcvContextShareErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_SendVLMismatchErrMask_LSB 0x24
+#define QIB_7322_ErrClear_SendVLMismatchErrMask_MSB 0x24
+#define QIB_7322_ErrClear_SendVLMismatchErrMask_RMASK 0x1
+#define QIB_7322_ErrClear_SendArmLaunchErrClear_LSB 0x23
+#define QIB_7322_ErrClear_SendArmLaunchErrClear_MSB 0x23
+#define QIB_7322_ErrClear_SendArmLaunchErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_SendSpecialTriggerErrClear_LSB 0x1B
+#define QIB_7322_ErrClear_SendSpecialTriggerErrClear_MSB 0x1B
+#define QIB_7322_ErrClear_SendSpecialTriggerErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_SDmaWrongPortErrClear_LSB 0x1A
+#define QIB_7322_ErrClear_SDmaWrongPortErrClear_MSB 0x1A
+#define QIB_7322_ErrClear_SDmaWrongPortErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_SDmaBufMaskDuplicateErrClear_LSB 0x19
+#define QIB_7322_ErrClear_SDmaBufMaskDuplicateErrClear_MSB 0x19
+#define QIB_7322_ErrClear_SDmaBufMaskDuplicateErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_RcvHdrFullErrClear_LSB 0xD
+#define QIB_7322_ErrClear_RcvHdrFullErrClear_MSB 0xD
+#define QIB_7322_ErrClear_RcvHdrFullErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_RcvEgrFullErrClear_LSB 0xC
+#define QIB_7322_ErrClear_RcvEgrFullErrClear_MSB 0xC
+#define QIB_7322_ErrClear_RcvEgrFullErrClear_RMASK 0x1
+
+#define QIB_7322_HwErrMask_OFFS 0x98
+#define QIB_7322_HwErrMask_DEF 0x0000000000000000
+#define QIB_7322_HwErrMask_IBSerdesPClkNotDetectMask_1_LSB 0x3F
+#define QIB_7322_HwErrMask_IBSerdesPClkNotDetectMask_1_MSB 0x3F
+#define QIB_7322_HwErrMask_IBSerdesPClkNotDetectMask_1_RMASK 0x1
+#define QIB_7322_HwErrMask_IBSerdesPClkNotDetectMask_0_LSB 0x3E
+#define QIB_7322_HwErrMask_IBSerdesPClkNotDetectMask_0_MSB 0x3E
+#define QIB_7322_HwErrMask_IBSerdesPClkNotDetectMask_0_RMASK 0x1
+#define QIB_7322_HwErrMask_PCIESerdesPClkNotDetectMask_LSB 0x37
+#define QIB_7322_HwErrMask_PCIESerdesPClkNotDetectMask_MSB 0x37
+#define QIB_7322_HwErrMask_PCIESerdesPClkNotDetectMask_RMASK 0x1
+#define QIB_7322_HwErrMask_PowerOnBISTFailedMask_LSB 0x36
+#define QIB_7322_HwErrMask_PowerOnBISTFailedMask_MSB 0x36
+#define QIB_7322_HwErrMask_PowerOnBISTFailedMask_RMASK 0x1
+#define QIB_7322_HwErrMask_TempsenseTholdReachedMask_LSB 0x35
+#define QIB_7322_HwErrMask_TempsenseTholdReachedMask_MSB 0x35
+#define QIB_7322_HwErrMask_TempsenseTholdReachedMask_RMASK 0x1
+#define QIB_7322_HwErrMask_MemoryErrMask_LSB 0x30
+#define QIB_7322_HwErrMask_MemoryErrMask_MSB 0x30
+#define QIB_7322_HwErrMask_MemoryErrMask_RMASK 0x1
+#define QIB_7322_HwErrMask_pcie_phy_txParityErr_LSB 0x22
+#define QIB_7322_HwErrMask_pcie_phy_txParityErr_MSB 0x22
+#define QIB_7322_HwErrMask_pcie_phy_txParityErr_RMASK 0x1
+#define QIB_7322_HwErrMask_PCIeBusParityErrMask_LSB 0x1F
+#define QIB_7322_HwErrMask_PCIeBusParityErrMask_MSB 0x21
+#define QIB_7322_HwErrMask_PCIeBusParityErrMask_RMASK 0x7
+#define QIB_7322_HwErrMask_PcieCplTimeoutMask_LSB 0x1E
+#define QIB_7322_HwErrMask_PcieCplTimeoutMask_MSB 0x1E
+#define QIB_7322_HwErrMask_PcieCplTimeoutMask_RMASK 0x1
+#define QIB_7322_HwErrMask_PciePoisonedTLPMask_LSB 0x1D
+#define QIB_7322_HwErrMask_PciePoisonedTLPMask_MSB 0x1D
+#define QIB_7322_HwErrMask_PciePoisonedTLPMask_RMASK 0x1
+#define QIB_7322_HwErrMask_SDmaMemReadErrMask_1_LSB 0x1C
+#define QIB_7322_HwErrMask_SDmaMemReadErrMask_1_MSB 0x1C
+#define QIB_7322_HwErrMask_SDmaMemReadErrMask_1_RMASK 0x1
+#define QIB_7322_HwErrMask_SDmaMemReadErrMask_0_LSB 0x1B
+#define QIB_7322_HwErrMask_SDmaMemReadErrMask_0_MSB 0x1B
+#define QIB_7322_HwErrMask_SDmaMemReadErrMask_0_RMASK 0x1
+#define QIB_7322_HwErrMask_IBCBusFromSPCParityErrMask_1_LSB 0xF
+#define QIB_7322_HwErrMask_IBCBusFromSPCParityErrMask_1_MSB 0xF
+#define QIB_7322_HwErrMask_IBCBusFromSPCParityErrMask_1_RMASK 0x1
+#define QIB_7322_HwErrMask_IBCBusToSPCParityErrMask_1_LSB 0xE
+#define QIB_7322_HwErrMask_IBCBusToSPCParityErrMask_1_MSB 0xE
+#define QIB_7322_HwErrMask_IBCBusToSPCParityErrMask_1_RMASK 0x1
+#define QIB_7322_HwErrMask_IBCBusFromSPCParityErrMask_0_LSB 0xD
+#define QIB_7322_HwErrMask_IBCBusFromSPCParityErrMask_0_MSB 0xD
+#define QIB_7322_HwErrMask_IBCBusFromSPCParityErrMask_0_RMASK 0x1
+#define QIB_7322_HwErrMask_statusValidNoEopMask_LSB 0xC
+#define QIB_7322_HwErrMask_statusValidNoEopMask_MSB 0xC
+#define QIB_7322_HwErrMask_statusValidNoEopMask_RMASK 0x1
+#define QIB_7322_HwErrMask_LATriggeredMask_LSB 0xB
+#define QIB_7322_HwErrMask_LATriggeredMask_MSB 0xB
+#define QIB_7322_HwErrMask_LATriggeredMask_RMASK 0x1
+
+#define QIB_7322_HwErrStatus_OFFS 0xA0
+#define QIB_7322_HwErrStatus_DEF 0x0000000000000000
+#define QIB_7322_HwErrStatus_IBSerdesPClkNotDetect_1_LSB 0x3F
+#define QIB_7322_HwErrStatus_IBSerdesPClkNotDetect_1_MSB 0x3F
+#define QIB_7322_HwErrStatus_IBSerdesPClkNotDetect_1_RMASK 0x1
+#define QIB_7322_HwErrStatus_IBSerdesPClkNotDetect_0_LSB 0x3E
+#define QIB_7322_HwErrStatus_IBSerdesPClkNotDetect_0_MSB 0x3E
+#define QIB_7322_HwErrStatus_IBSerdesPClkNotDetect_0_RMASK 0x1
+#define QIB_7322_HwErrStatus_PCIESerdesPClkNotDetect_LSB 0x37
+#define QIB_7322_HwErrStatus_PCIESerdesPClkNotDetect_MSB 0x37
+#define QIB_7322_HwErrStatus_PCIESerdesPClkNotDetect_RMASK 0x1
+#define QIB_7322_HwErrStatus_PowerOnBISTFailed_LSB 0x36
+#define QIB_7322_HwErrStatus_PowerOnBISTFailed_MSB 0x36
+#define QIB_7322_HwErrStatus_PowerOnBISTFailed_RMASK 0x1
+#define QIB_7322_HwErrStatus_TempsenseTholdReached_LSB 0x35
+#define QIB_7322_HwErrStatus_TempsenseTholdReached_MSB 0x35
+#define QIB_7322_HwErrStatus_TempsenseTholdReached_RMASK 0x1
+#define QIB_7322_HwErrStatus_MemoryErr_LSB 0x30
+#define QIB_7322_HwErrStatus_MemoryErr_MSB 0x30
+#define QIB_7322_HwErrStatus_MemoryErr_RMASK 0x1
+#define QIB_7322_HwErrStatus_pcie_phy_txParityErr_LSB 0x22
+#define QIB_7322_HwErrStatus_pcie_phy_txParityErr_MSB 0x22
+#define QIB_7322_HwErrStatus_pcie_phy_txParityErr_RMASK 0x1
+#define QIB_7322_HwErrStatus_PCIeBusParity_LSB 0x1F
+#define QIB_7322_HwErrStatus_PCIeBusParity_MSB 0x21
+#define QIB_7322_HwErrStatus_PCIeBusParity_RMASK 0x7
+#define QIB_7322_HwErrStatus_PcieCplTimeout_LSB 0x1E
+#define QIB_7322_HwErrStatus_PcieCplTimeout_MSB 0x1E
+#define QIB_7322_HwErrStatus_PcieCplTimeout_RMASK 0x1
+#define QIB_7322_HwErrStatus_PciePoisonedTLP_LSB 0x1D
+#define QIB_7322_HwErrStatus_PciePoisonedTLP_MSB 0x1D
+#define QIB_7322_HwErrStatus_PciePoisonedTLP_RMASK 0x1
+#define QIB_7322_HwErrStatus_SDmaMemReadErr_1_LSB 0x1C
+#define QIB_7322_HwErrStatus_SDmaMemReadErr_1_MSB 0x1C
+#define QIB_7322_HwErrStatus_SDmaMemReadErr_1_RMASK 0x1
+#define QIB_7322_HwErrStatus_SDmaMemReadErr_0_LSB 0x1B
+#define QIB_7322_HwErrStatus_SDmaMemReadErr_0_MSB 0x1B
+#define QIB_7322_HwErrStatus_SDmaMemReadErr_0_RMASK 0x1
+#define QIB_7322_HwErrStatus_IBCBusFromSPCParityErr_1_LSB 0xF
+#define QIB_7322_HwErrStatus_IBCBusFromSPCParityErr_1_MSB 0xF
+#define QIB_7322_HwErrStatus_IBCBusFromSPCParityErr_1_RMASK 0x1
+#define QIB_7322_HwErrStatus_IBCBusToSPCParityErr_1_LSB 0xE
+#define QIB_7322_HwErrStatus_IBCBusToSPCParityErr_1_MSB 0xE
+#define QIB_7322_HwErrStatus_IBCBusToSPCParityErr_1_RMASK 0x1
+#define QIB_7322_HwErrStatus_IBCBusFromSPCParityErr_0_LSB 0xD
+#define QIB_7322_HwErrStatus_IBCBusFromSPCParityErr_0_MSB 0xD
+#define QIB_7322_HwErrStatus_IBCBusFromSPCParityErr_0_RMASK 0x1
+#define QIB_7322_HwErrStatus_statusValidNoEop_LSB 0xC
+#define QIB_7322_HwErrStatus_statusValidNoEop_MSB 0xC
+#define QIB_7322_HwErrStatus_statusValidNoEop_RMASK 0x1
+#define QIB_7322_HwErrStatus_LATriggered_LSB 0xB
+#define QIB_7322_HwErrStatus_LATriggered_MSB 0xB
+#define QIB_7322_HwErrStatus_LATriggered_RMASK 0x1
+
+#define QIB_7322_HwErrClear_OFFS 0xA8
+#define QIB_7322_HwErrClear_DEF 0x0000000000000000
+#define QIB_7322_HwErrClear_IBSerdesPClkNotDetectClear_1_LSB 0x3F
+#define QIB_7322_HwErrClear_IBSerdesPClkNotDetectClear_1_MSB 0x3F
+#define QIB_7322_HwErrClear_IBSerdesPClkNotDetectClear_1_RMASK 0x1
+#define QIB_7322_HwErrClear_IBSerdesPClkNotDetectClear_0_LSB 0x3E
+#define QIB_7322_HwErrClear_IBSerdesPClkNotDetectClear_0_MSB 0x3E
+#define QIB_7322_HwErrClear_IBSerdesPClkNotDetectClear_0_RMASK 0x1
+#define QIB_7322_HwErrClear_PCIESerdesPClkNotDetectClear_LSB 0x37
+#define QIB_7322_HwErrClear_PCIESerdesPClkNotDetectClear_MSB 0x37
+#define QIB_7322_HwErrClear_PCIESerdesPClkNotDetectClear_RMASK 0x1
+#define QIB_7322_HwErrClear_PowerOnBISTFailedClear_LSB 0x36
+#define QIB_7322_HwErrClear_PowerOnBISTFailedClear_MSB 0x36
+#define QIB_7322_HwErrClear_PowerOnBISTFailedClear_RMASK 0x1
+#define QIB_7322_HwErrClear_TempsenseTholdReachedClear_LSB 0x35
+#define QIB_7322_HwErrClear_TempsenseTholdReachedClear_MSB 0x35
+#define QIB_7322_HwErrClear_TempsenseTholdReachedClear_RMASK 0x1
+#define QIB_7322_HwErrClear_MemoryErrClear_LSB 0x30
+#define QIB_7322_HwErrClear_MemoryErrClear_MSB 0x30
+#define QIB_7322_HwErrClear_MemoryErrClear_RMASK 0x1
+#define QIB_7322_HwErrClear_pcie_phy_txParityErr_LSB 0x22
+#define QIB_7322_HwErrClear_pcie_phy_txParityErr_MSB 0x22
+#define QIB_7322_HwErrClear_pcie_phy_txParityErr_RMASK 0x1
+#define QIB_7322_HwErrClear_PCIeBusParityClear_LSB 0x1F
+#define QIB_7322_HwErrClear_PCIeBusParityClear_MSB 0x21
+#define QIB_7322_HwErrClear_PCIeBusParityClear_RMASK 0x7
+#define QIB_7322_HwErrClear_PcieCplTimeoutClear_LSB 0x1E
+#define QIB_7322_HwErrClear_PcieCplTimeoutClear_MSB 0x1E
+#define QIB_7322_HwErrClear_PcieCplTimeoutClear_RMASK 0x1
+#define QIB_7322_HwErrClear_PciePoisonedTLPClear_LSB 0x1D
+#define QIB_7322_HwErrClear_PciePoisonedTLPClear_MSB 0x1D
+#define QIB_7322_HwErrClear_PciePoisonedTLPClear_RMASK 0x1
+#define QIB_7322_HwErrClear_SDmaMemReadErrClear_1_LSB 0x1C
+#define QIB_7322_HwErrClear_SDmaMemReadErrClear_1_MSB 0x1C
+#define QIB_7322_HwErrClear_SDmaMemReadErrClear_1_RMASK 0x1
+#define QIB_7322_HwErrClear_SDmaMemReadErrClear_0_LSB 0x1B
+#define QIB_7322_HwErrClear_SDmaMemReadErrClear_0_MSB 0x1B
+#define QIB_7322_HwErrClear_SDmaMemReadErrClear_0_RMASK 0x1
+#define QIB_7322_HwErrClear_IBCBusFromSPCParityErrClear_1_LSB 0xF
+#define QIB_7322_HwErrClear_IBCBusFromSPCParityErrClear_1_MSB 0xF
+#define QIB_7322_HwErrClear_IBCBusFromSPCParityErrClear_1_RMASK 0x1
+#define QIB_7322_HwErrClear_IBCBusToSPCParityErrClear_1_LSB 0xE
+#define QIB_7322_HwErrClear_IBCBusToSPCParityErrClear_1_MSB 0xE
+#define QIB_7322_HwErrClear_IBCBusToSPCParityErrClear_1_RMASK 0x1
+#define QIB_7322_HwErrClear_IBCBusFromSPCParityErrClear_0_LSB 0xD
+#define QIB_7322_HwErrClear_IBCBusFromSPCParityErrClear_0_MSB 0xD
+#define QIB_7322_HwErrClear_IBCBusFromSPCParityErrClear_0_RMASK 0x1
+#define QIB_7322_HwErrClear_statusValidNoEopClear_LSB 0xC
+#define QIB_7322_HwErrClear_statusValidNoEopClear_MSB 0xC
+#define QIB_7322_HwErrClear_statusValidNoEopClear_RMASK 0x1
+#define QIB_7322_HwErrClear_LATriggeredClear_LSB 0xB
+#define QIB_7322_HwErrClear_LATriggeredClear_MSB 0xB
+#define QIB_7322_HwErrClear_LATriggeredClear_RMASK 0x1
+
+#define QIB_7322_HwDiagCtrl_OFFS 0xB0
+#define QIB_7322_HwDiagCtrl_DEF 0x0000000000000000
+#define QIB_7322_HwDiagCtrl_Diagnostic_LSB 0x3F
+#define QIB_7322_HwDiagCtrl_Diagnostic_MSB 0x3F
+#define QIB_7322_HwDiagCtrl_Diagnostic_RMASK 0x1
+#define QIB_7322_HwDiagCtrl_CounterWrEnable_LSB 0x3D
+#define QIB_7322_HwDiagCtrl_CounterWrEnable_MSB 0x3D
+#define QIB_7322_HwDiagCtrl_CounterWrEnable_RMASK 0x1
+#define QIB_7322_HwDiagCtrl_CounterDisable_LSB 0x3C
+#define QIB_7322_HwDiagCtrl_CounterDisable_MSB 0x3C
+#define QIB_7322_HwDiagCtrl_CounterDisable_RMASK 0x1
+#define QIB_7322_HwDiagCtrl_forcePCIeBusParity_LSB 0x1F
+#define QIB_7322_HwDiagCtrl_forcePCIeBusParity_MSB 0x22
+#define QIB_7322_HwDiagCtrl_forcePCIeBusParity_RMASK 0xF
+#define QIB_7322_HwDiagCtrl_ForceIBCBusFromSPCParityErr_1_LSB 0xF
+#define QIB_7322_HwDiagCtrl_ForceIBCBusFromSPCParityErr_1_MSB 0xF
+#define QIB_7322_HwDiagCtrl_ForceIBCBusFromSPCParityErr_1_RMASK 0x1
+#define QIB_7322_HwDiagCtrl_ForceIBCBusToSPCParityErr_1_LSB 0xE
+#define QIB_7322_HwDiagCtrl_ForceIBCBusToSPCParityErr_1_MSB 0xE
+#define QIB_7322_HwDiagCtrl_ForceIBCBusToSPCParityErr_1_RMASK 0x1
+#define QIB_7322_HwDiagCtrl_ForceIBCBusFromSPCParityErr_0_LSB 0xD
+#define QIB_7322_HwDiagCtrl_ForceIBCBusFromSPCParityErr_0_MSB 0xD
+#define QIB_7322_HwDiagCtrl_ForceIBCBusFromSPCParityErr_0_RMASK 0x1
+#define QIB_7322_HwDiagCtrl_ForceIBCBusToSPCParityErr_0_LSB 0xC
+#define QIB_7322_HwDiagCtrl_ForceIBCBusToSPCParityErr_0_MSB 0xC
+#define QIB_7322_HwDiagCtrl_ForceIBCBusToSPCParityErr_0_RMASK 0x1
+
+#define QIB_7322_EXTStatus_OFFS 0xC0
+#define QIB_7322_EXTStatus_DEF 0x000000000000X000
+#define QIB_7322_EXTStatus_GPIOIn_LSB 0x30
+#define QIB_7322_EXTStatus_GPIOIn_MSB 0x3F
+#define QIB_7322_EXTStatus_GPIOIn_RMASK 0xFFFF
+#define QIB_7322_EXTStatus_MemBISTDisabled_LSB 0xF
+#define QIB_7322_EXTStatus_MemBISTDisabled_MSB 0xF
+#define QIB_7322_EXTStatus_MemBISTDisabled_RMASK 0x1
+#define QIB_7322_EXTStatus_MemBISTEndTest_LSB 0xE
+#define QIB_7322_EXTStatus_MemBISTEndTest_MSB 0xE
+#define QIB_7322_EXTStatus_MemBISTEndTest_RMASK 0x1
+
+#define QIB_7322_EXTCtrl_OFFS 0xC8
+#define QIB_7322_EXTCtrl_DEF 0x0000000000000000
+#define QIB_7322_EXTCtrl_GPIOOe_LSB 0x30
+#define QIB_7322_EXTCtrl_GPIOOe_MSB 0x3F
+#define QIB_7322_EXTCtrl_GPIOOe_RMASK 0xFFFF
+#define QIB_7322_EXTCtrl_GPIOInvert_LSB 0x20
+#define QIB_7322_EXTCtrl_GPIOInvert_MSB 0x2F
+#define QIB_7322_EXTCtrl_GPIOInvert_RMASK 0xFFFF
+#define QIB_7322_EXTCtrl_LEDPort1GreenOn_LSB 0x3
+#define QIB_7322_EXTCtrl_LEDPort1GreenOn_MSB 0x3
+#define QIB_7322_EXTCtrl_LEDPort1GreenOn_RMASK 0x1
+#define QIB_7322_EXTCtrl_LEDPort1YellowOn_LSB 0x2
+#define QIB_7322_EXTCtrl_LEDPort1YellowOn_MSB 0x2
+#define QIB_7322_EXTCtrl_LEDPort1YellowOn_RMASK 0x1
+#define QIB_7322_EXTCtrl_LEDPort0GreenOn_LSB 0x1
+#define QIB_7322_EXTCtrl_LEDPort0GreenOn_MSB 0x1
+#define QIB_7322_EXTCtrl_LEDPort0GreenOn_RMASK 0x1
+#define QIB_7322_EXTCtrl_LEDPort0YellowOn_LSB 0x0
+#define QIB_7322_EXTCtrl_LEDPort0YellowOn_MSB 0x0
+#define QIB_7322_EXTCtrl_LEDPort0YellowOn_RMASK 0x1
+
+#define QIB_7322_GPIOOut_OFFS 0xE0
+#define QIB_7322_GPIOOut_DEF 0x0000000000000000
+
+#define QIB_7322_GPIOMask_OFFS 0xE8
+#define QIB_7322_GPIOMask_DEF 0x0000000000000000
+
+#define QIB_7322_GPIOStatus_OFFS 0xF0
+#define QIB_7322_GPIOStatus_DEF 0x0000000000000000
+
+#define QIB_7322_GPIOClear_OFFS 0xF8
+#define QIB_7322_GPIOClear_DEF 0x0000000000000000
+
+#define QIB_7322_RcvCtrl_OFFS 0x100
+#define QIB_7322_RcvCtrl_DEF 0x0000000000000000
+#define QIB_7322_RcvCtrl_TidReDirect_LSB 0x30
+#define QIB_7322_RcvCtrl_TidReDirect_MSB 0x3F
+#define QIB_7322_RcvCtrl_TidReDirect_RMASK 0xFFFF
+#define QIB_7322_RcvCtrl_TailUpd_LSB 0x2F
+#define QIB_7322_RcvCtrl_TailUpd_MSB 0x2F
+#define QIB_7322_RcvCtrl_TailUpd_RMASK 0x1
+#define QIB_7322_RcvCtrl_XrcTypeCode_LSB 0x2C
+#define QIB_7322_RcvCtrl_XrcTypeCode_MSB 0x2E
+#define QIB_7322_RcvCtrl_XrcTypeCode_RMASK 0x7
+#define QIB_7322_RcvCtrl_TidFlowEnable_LSB 0x2B
+#define QIB_7322_RcvCtrl_TidFlowEnable_MSB 0x2B
+#define QIB_7322_RcvCtrl_TidFlowEnable_RMASK 0x1
+#define QIB_7322_RcvCtrl_ContextCfg_LSB 0x29
+#define QIB_7322_RcvCtrl_ContextCfg_MSB 0x2A
+#define QIB_7322_RcvCtrl_ContextCfg_RMASK 0x3
+#define QIB_7322_RcvCtrl_IntrAvail_LSB 0x14
+#define QIB_7322_RcvCtrl_IntrAvail_MSB 0x25
+#define QIB_7322_RcvCtrl_IntrAvail_RMASK 0x3FFFF
+#define QIB_7322_RcvCtrl_dontDropRHQFull_LSB 0x0
+#define QIB_7322_RcvCtrl_dontDropRHQFull_MSB 0x11
+#define QIB_7322_RcvCtrl_dontDropRHQFull_RMASK 0x3FFFF
+
+#define QIB_7322_RcvHdrSize_OFFS 0x110
+#define QIB_7322_RcvHdrSize_DEF 0x0000000000000000
+
+#define QIB_7322_RcvHdrCnt_OFFS 0x118
+#define QIB_7322_RcvHdrCnt_DEF 0x0000000000000000
+
+#define QIB_7322_RcvHdrEntSize_OFFS 0x120
+#define QIB_7322_RcvHdrEntSize_DEF 0x0000000000000000
+
+#define QIB_7322_RcvTIDBase_OFFS 0x128
+#define QIB_7322_RcvTIDBase_DEF 0x0000000000050000
+
+#define QIB_7322_RcvTIDCnt_OFFS 0x130
+#define QIB_7322_RcvTIDCnt_DEF 0x0000000000000200
+
+#define QIB_7322_RcvEgrBase_OFFS 0x138
+#define QIB_7322_RcvEgrBase_DEF 0x0000000000014000
+
+#define QIB_7322_RcvEgrCnt_OFFS 0x140
+#define QIB_7322_RcvEgrCnt_DEF 0x0000000000001000
+
+#define QIB_7322_RcvBufBase_OFFS 0x148
+#define QIB_7322_RcvBufBase_DEF 0x0000000000080000
+
+#define QIB_7322_RcvBufSize_OFFS 0x150
+#define QIB_7322_RcvBufSize_DEF 0x0000000000005000
+
+#define QIB_7322_RxIntMemBase_OFFS 0x158
+#define QIB_7322_RxIntMemBase_DEF 0x0000000000077000
+
+#define QIB_7322_RxIntMemSize_OFFS 0x160
+#define QIB_7322_RxIntMemSize_DEF 0x0000000000007000
+
+#define QIB_7322_feature_mask_OFFS 0x190
+#define QIB_7322_feature_mask_DEF 0x00000000000000XX
+
+#define QIB_7322_active_feature_mask_OFFS 0x198
+#define QIB_7322_active_feature_mask_DEF 0x00000000000000XX
+#define QIB_7322_active_feature_mask_Port1_QDR_Enabled_LSB 0x5
+#define QIB_7322_active_feature_mask_Port1_QDR_Enabled_MSB 0x5
+#define QIB_7322_active_feature_mask_Port1_QDR_Enabled_RMASK 0x1
+#define QIB_7322_active_feature_mask_Port1_DDR_Enabled_LSB 0x4
+#define QIB_7322_active_feature_mask_Port1_DDR_Enabled_MSB 0x4
+#define QIB_7322_active_feature_mask_Port1_DDR_Enabled_RMASK 0x1
+#define QIB_7322_active_feature_mask_Port1_SDR_Enabled_LSB 0x3
+#define QIB_7322_active_feature_mask_Port1_SDR_Enabled_MSB 0x3
+#define QIB_7322_active_feature_mask_Port1_SDR_Enabled_RMASK 0x1
+#define QIB_7322_active_feature_mask_Port0_QDR_Enabled_LSB 0x2
+#define QIB_7322_active_feature_mask_Port0_QDR_Enabled_MSB 0x2
+#define QIB_7322_active_feature_mask_Port0_QDR_Enabled_RMASK 0x1
+#define QIB_7322_active_feature_mask_Port0_DDR_Enabled_LSB 0x1
+#define QIB_7322_active_feature_mask_Port0_DDR_Enabled_MSB 0x1
+#define QIB_7322_active_feature_mask_Port0_DDR_Enabled_RMASK 0x1
+#define QIB_7322_active_feature_mask_Port0_SDR_Enabled_LSB 0x0
+#define QIB_7322_active_feature_mask_Port0_SDR_Enabled_MSB 0x0
+#define QIB_7322_active_feature_mask_Port0_SDR_Enabled_RMASK 0x1
+
+#define QIB_7322_SendCtrl_OFFS 0x1C0
+#define QIB_7322_SendCtrl_DEF 0x0000000000000000
+#define QIB_7322_SendCtrl_Disarm_LSB 0x1F
+#define QIB_7322_SendCtrl_Disarm_MSB 0x1F
+#define QIB_7322_SendCtrl_Disarm_RMASK 0x1
+#define QIB_7322_SendCtrl_SendBufAvailPad64Byte_LSB 0x1D
+#define QIB_7322_SendCtrl_SendBufAvailPad64Byte_MSB 0x1D
+#define QIB_7322_SendCtrl_SendBufAvailPad64Byte_RMASK 0x1
+#define QIB_7322_SendCtrl_AvailUpdThld_LSB 0x18
+#define QIB_7322_SendCtrl_AvailUpdThld_MSB 0x1C
+#define QIB_7322_SendCtrl_AvailUpdThld_RMASK 0x1F
+#define QIB_7322_SendCtrl_DisarmSendBuf_LSB 0x10
+#define QIB_7322_SendCtrl_DisarmSendBuf_MSB 0x17
+#define QIB_7322_SendCtrl_DisarmSendBuf_RMASK 0xFF
+#define QIB_7322_SendCtrl_SpecialTriggerEn_LSB 0x4
+#define QIB_7322_SendCtrl_SpecialTriggerEn_MSB 0x4
+#define QIB_7322_SendCtrl_SpecialTriggerEn_RMASK 0x1
+#define QIB_7322_SendCtrl_SendBufAvailUpd_LSB 0x2
+#define QIB_7322_SendCtrl_SendBufAvailUpd_MSB 0x2
+#define QIB_7322_SendCtrl_SendBufAvailUpd_RMASK 0x1
+#define QIB_7322_SendCtrl_SendIntBufAvail_LSB 0x1
+#define QIB_7322_SendCtrl_SendIntBufAvail_MSB 0x1
+#define QIB_7322_SendCtrl_SendIntBufAvail_RMASK 0x1
+
+#define QIB_7322_SendBufBase_OFFS 0x1C8
+#define QIB_7322_SendBufBase_DEF 0x0018000000100000
+#define QIB_7322_SendBufBase_BaseAddr_LargePIO_LSB 0x20
+#define QIB_7322_SendBufBase_BaseAddr_LargePIO_MSB 0x34
+#define QIB_7322_SendBufBase_BaseAddr_LargePIO_RMASK 0x1FFFFF
+#define QIB_7322_SendBufBase_BaseAddr_SmallPIO_LSB 0x0
+#define QIB_7322_SendBufBase_BaseAddr_SmallPIO_MSB 0x14
+#define QIB_7322_SendBufBase_BaseAddr_SmallPIO_RMASK 0x1FFFFF
+
+#define QIB_7322_SendBufSize_OFFS 0x1D0
+#define QIB_7322_SendBufSize_DEF 0x0000108000000880
+#define QIB_7322_SendBufSize_Size_LargePIO_LSB 0x20
+#define QIB_7322_SendBufSize_Size_LargePIO_MSB 0x2C
+#define QIB_7322_SendBufSize_Size_LargePIO_RMASK 0x1FFF
+#define QIB_7322_SendBufSize_Size_SmallPIO_LSB 0x0
+#define QIB_7322_SendBufSize_Size_SmallPIO_MSB 0xB
+#define QIB_7322_SendBufSize_Size_SmallPIO_RMASK 0xFFF
+
+#define QIB_7322_SendBufCnt_OFFS 0x1D8
+#define QIB_7322_SendBufCnt_DEF 0x0000002000000080
+#define QIB_7322_SendBufCnt_Num_LargeBuffers_LSB 0x20
+#define QIB_7322_SendBufCnt_Num_LargeBuffers_MSB 0x25
+#define QIB_7322_SendBufCnt_Num_LargeBuffers_RMASK 0x3F
+#define QIB_7322_SendBufCnt_Num_SmallBuffers_LSB 0x0
+#define QIB_7322_SendBufCnt_Num_SmallBuffers_MSB 0x8
+#define QIB_7322_SendBufCnt_Num_SmallBuffers_RMASK 0x1FF
+
+#define QIB_7322_SendBufAvailAddr_OFFS 0x1E0
+#define QIB_7322_SendBufAvailAddr_DEF 0x0000000000000000
+#define QIB_7322_SendBufAvailAddr_SendBufAvailAddr_LSB 0x6
+#define QIB_7322_SendBufAvailAddr_SendBufAvailAddr_MSB 0x27
+#define QIB_7322_SendBufAvailAddr_SendBufAvailAddr_RMASK 0x3FFFFFFFF
+
+#define QIB_7322_SendBufErr0_OFFS 0x240
+#define QIB_7322_SendBufErr0_DEF 0x0000000000000000
+#define QIB_7322_SendBufErr0_SendBufErr_63_0_LSB 0x0
+#define QIB_7322_SendBufErr0_SendBufErr_63_0_MSB 0x3F
+#define QIB_7322_SendBufErr0_SendBufErr_63_0_RMASK 0x0
+
+#define QIB_7322_AvailUpdCount_OFFS 0x268
+#define QIB_7322_AvailUpdCount_DEF 0x0000000000000000
+#define QIB_7322_AvailUpdCount_AvailUpdCount_LSB 0x0
+#define QIB_7322_AvailUpdCount_AvailUpdCount_MSB 0x4
+#define QIB_7322_AvailUpdCount_AvailUpdCount_RMASK 0x1F
+
+#define QIB_7322_RcvHdrAddr0_OFFS 0x280
+#define QIB_7322_RcvHdrAddr0_DEF 0x0000000000000000
+#define QIB_7322_RcvHdrAddr0_RcvHdrAddr_LSB 0x2
+#define QIB_7322_RcvHdrAddr0_RcvHdrAddr_MSB 0x27
+#define QIB_7322_RcvHdrAddr0_RcvHdrAddr_RMASK 0x3FFFFFFFFF
+
+#define QIB_7322_RcvHdrTailAddr0_OFFS 0x340
+#define QIB_7322_RcvHdrTailAddr0_DEF 0x0000000000000000
+#define QIB_7322_RcvHdrTailAddr0_RcvHdrTailAddr_LSB 0x2
+#define QIB_7322_RcvHdrTailAddr0_RcvHdrTailAddr_MSB 0x27
+#define QIB_7322_RcvHdrTailAddr0_RcvHdrTailAddr_RMASK 0x3FFFFFFFFF
+
+#define QIB_7322_ahb_access_ctrl_OFFS 0x460
+#define QIB_7322_ahb_access_ctrl_DEF 0x0000000000000000
+#define QIB_7322_ahb_access_ctrl_sw_sel_ahb_trgt_LSB 0x1
+#define QIB_7322_ahb_access_ctrl_sw_sel_ahb_trgt_MSB 0x2
+#define QIB_7322_ahb_access_ctrl_sw_sel_ahb_trgt_RMASK 0x3
+#define QIB_7322_ahb_access_ctrl_sw_ahb_sel_LSB 0x0
+#define QIB_7322_ahb_access_ctrl_sw_ahb_sel_MSB 0x0
+#define QIB_7322_ahb_access_ctrl_sw_ahb_sel_RMASK 0x1
+
+#define QIB_7322_ahb_transaction_reg_OFFS 0x468
+#define QIB_7322_ahb_transaction_reg_DEF 0x0000000080000000
+#define QIB_7322_ahb_transaction_reg_ahb_data_LSB 0x20
+#define QIB_7322_ahb_transaction_reg_ahb_data_MSB 0x3F
+#define QIB_7322_ahb_transaction_reg_ahb_data_RMASK 0xFFFFFFFF
+#define QIB_7322_ahb_transaction_reg_ahb_rdy_LSB 0x1F
+#define QIB_7322_ahb_transaction_reg_ahb_rdy_MSB 0x1F
+#define QIB_7322_ahb_transaction_reg_ahb_rdy_RMASK 0x1
+#define QIB_7322_ahb_transaction_reg_ahb_req_err_LSB 0x1E
+#define QIB_7322_ahb_transaction_reg_ahb_req_err_MSB 0x1E
+#define QIB_7322_ahb_transaction_reg_ahb_req_err_RMASK 0x1
+#define QIB_7322_ahb_transaction_reg_write_not_read_LSB 0x1B
+#define QIB_7322_ahb_transaction_reg_write_not_read_MSB 0x1B
+#define QIB_7322_ahb_transaction_reg_write_not_read_RMASK 0x1
+#define QIB_7322_ahb_transaction_reg_ahb_address_LSB 0x10
+#define QIB_7322_ahb_transaction_reg_ahb_address_MSB 0x1A
+#define QIB_7322_ahb_transaction_reg_ahb_address_RMASK 0x7FF
+
+#define QIB_7322_SPC_JTAG_ACCESS_REG_OFFS 0x470
+#define QIB_7322_SPC_JTAG_ACCESS_REG_DEF 0x0000000000000001
+#define QIB_7322_SPC_JTAG_ACCESS_REG_SPC_JTAG_ACCESS_EN_LSB 0xA
+#define QIB_7322_SPC_JTAG_ACCESS_REG_SPC_JTAG_ACCESS_EN_MSB 0xA
+#define QIB_7322_SPC_JTAG_ACCESS_REG_SPC_JTAG_ACCESS_EN_RMASK 0x1
+#define QIB_7322_SPC_JTAG_ACCESS_REG_bist_en_LSB 0x5
+#define QIB_7322_SPC_JTAG_ACCESS_REG_bist_en_MSB 0x9
+#define QIB_7322_SPC_JTAG_ACCESS_REG_bist_en_RMASK 0x1F
+#define QIB_7322_SPC_JTAG_ACCESS_REG_opcode_LSB 0x3
+#define QIB_7322_SPC_JTAG_ACCESS_REG_opcode_MSB 0x4
+#define QIB_7322_SPC_JTAG_ACCESS_REG_opcode_RMASK 0x3
+#define QIB_7322_SPC_JTAG_ACCESS_REG_tdi_LSB 0x2
+#define QIB_7322_SPC_JTAG_ACCESS_REG_tdi_MSB 0x2
+#define QIB_7322_SPC_JTAG_ACCESS_REG_tdi_RMASK 0x1
+#define QIB_7322_SPC_JTAG_ACCESS_REG_tdo_LSB 0x1
+#define QIB_7322_SPC_JTAG_ACCESS_REG_tdo_MSB 0x1
+#define QIB_7322_SPC_JTAG_ACCESS_REG_tdo_RMASK 0x1
+#define QIB_7322_SPC_JTAG_ACCESS_REG_rdy_LSB 0x0
+#define QIB_7322_SPC_JTAG_ACCESS_REG_rdy_MSB 0x0
+#define QIB_7322_SPC_JTAG_ACCESS_REG_rdy_RMASK 0x1
+
+#define QIB_7322_SendCheckMask0_OFFS 0x4C0
+#define QIB_7322_SendCheckMask0_DEF 0x0000000000000000
+#define QIB_7322_SendCheckMask0_SendCheckMask_63_32_LSB 0x0
+#define QIB_7322_SendCheckMask0_SendCheckMask_63_32_MSB 0x3F
+#define QIB_7322_SendCheckMask0_SendCheckMask_63_32_RMASK 0x0
+
+#define QIB_7322_SendGRHCheckMask0_OFFS 0x4E0
+#define QIB_7322_SendGRHCheckMask0_DEF 0x0000000000000000
+#define QIB_7322_SendGRHCheckMask0_SendGRHCheckMask_63_32_LSB 0x0
+#define QIB_7322_SendGRHCheckMask0_SendGRHCheckMask_63_32_MSB 0x3F
+#define QIB_7322_SendGRHCheckMask0_SendGRHCheckMask_63_32_RMASK 0x0
+
+#define QIB_7322_SendIBPacketMask0_OFFS 0x500
+#define QIB_7322_SendIBPacketMask0_DEF 0x0000000000000000
+#define QIB_7322_SendIBPacketMask0_SendIBPacketMask_63_32_LSB 0x0
+#define QIB_7322_SendIBPacketMask0_SendIBPacketMask_63_32_MSB 0x3F
+#define QIB_7322_SendIBPacketMask0_SendIBPacketMask_63_32_RMASK 0x0
+
+#define QIB_7322_IntRedirect0_OFFS 0x540
+#define QIB_7322_IntRedirect0_DEF 0x0000000000000000
+#define QIB_7322_IntRedirect0_vec11_LSB 0x37
+#define QIB_7322_IntRedirect0_vec11_MSB 0x3B
+#define QIB_7322_IntRedirect0_vec11_RMASK 0x1F
+#define QIB_7322_IntRedirect0_vec10_LSB 0x32
+#define QIB_7322_IntRedirect0_vec10_MSB 0x36
+#define QIB_7322_IntRedirect0_vec10_RMASK 0x1F
+#define QIB_7322_IntRedirect0_vec9_LSB 0x2D
+#define QIB_7322_IntRedirect0_vec9_MSB 0x31
+#define QIB_7322_IntRedirect0_vec9_RMASK 0x1F
+#define QIB_7322_IntRedirect0_vec8_LSB 0x28
+#define QIB_7322_IntRedirect0_vec8_MSB 0x2C
+#define QIB_7322_IntRedirect0_vec8_RMASK 0x1F
+#define QIB_7322_IntRedirect0_vec7_LSB 0x23
+#define QIB_7322_IntRedirect0_vec7_MSB 0x27
+#define QIB_7322_IntRedirect0_vec7_RMASK 0x1F
+#define QIB_7322_IntRedirect0_vec6_LSB 0x1E
+#define QIB_7322_IntRedirect0_vec6_MSB 0x22
+#define QIB_7322_IntRedirect0_vec6_RMASK 0x1F
+#define QIB_7322_IntRedirect0_vec5_LSB 0x19
+#define QIB_7322_IntRedirect0_vec5_MSB 0x1D
+#define QIB_7322_IntRedirect0_vec5_RMASK 0x1F
+#define QIB_7322_IntRedirect0_vec4_LSB 0x14
+#define QIB_7322_IntRedirect0_vec4_MSB 0x18
+#define QIB_7322_IntRedirect0_vec4_RMASK 0x1F
+#define QIB_7322_IntRedirect0_vec3_LSB 0xF
+#define QIB_7322_IntRedirect0_vec3_MSB 0x13
+#define QIB_7322_IntRedirect0_vec3_RMASK 0x1F
+#define QIB_7322_IntRedirect0_vec2_LSB 0xA
+#define QIB_7322_IntRedirect0_vec2_MSB 0xE
+#define QIB_7322_IntRedirect0_vec2_RMASK 0x1F
+#define QIB_7322_IntRedirect0_vec1_LSB 0x5
+#define QIB_7322_IntRedirect0_vec1_MSB 0x9
+#define QIB_7322_IntRedirect0_vec1_RMASK 0x1F
+#define QIB_7322_IntRedirect0_vec0_LSB 0x0
+#define QIB_7322_IntRedirect0_vec0_MSB 0x4
+#define QIB_7322_IntRedirect0_vec0_RMASK 0x1F
+
+#define QIB_7322_Int_Granted_OFFS 0x570
+#define QIB_7322_Int_Granted_DEF 0x0000000000000000
+
+#define QIB_7322_vec_clr_without_int_OFFS 0x578
+#define QIB_7322_vec_clr_without_int_DEF 0x0000000000000000
+
+#define QIB_7322_DCACtrlA_OFFS 0x580
+#define QIB_7322_DCACtrlA_DEF 0x0000000000000000
+#define QIB_7322_DCACtrlA_SendDMAHead1DCAEnable_LSB 0x4
+#define QIB_7322_DCACtrlA_SendDMAHead1DCAEnable_MSB 0x4
+#define QIB_7322_DCACtrlA_SendDMAHead1DCAEnable_RMASK 0x1
+#define QIB_7322_DCACtrlA_SendDMAHead0DCAEnable_LSB 0x3
+#define QIB_7322_DCACtrlA_SendDMAHead0DCAEnable_MSB 0x3
+#define QIB_7322_DCACtrlA_SendDMAHead0DCAEnable_RMASK 0x1
+#define QIB_7322_DCACtrlA_RcvTailUpdDCAEnable_LSB 0x2
+#define QIB_7322_DCACtrlA_RcvTailUpdDCAEnable_MSB 0x2
+#define QIB_7322_DCACtrlA_RcvTailUpdDCAEnable_RMASK 0x1
+#define QIB_7322_DCACtrlA_EagerDCAEnable_LSB 0x1
+#define QIB_7322_DCACtrlA_EagerDCAEnable_MSB 0x1
+#define QIB_7322_DCACtrlA_EagerDCAEnable_RMASK 0x1
+#define QIB_7322_DCACtrlA_RcvHdrqDCAEnable_LSB 0x0
+#define QIB_7322_DCACtrlA_RcvHdrqDCAEnable_MSB 0x0
+#define QIB_7322_DCACtrlA_RcvHdrqDCAEnable_RMASK 0x1
+
+#define QIB_7322_DCACtrlB_OFFS 0x588
+#define QIB_7322_DCACtrlB_DEF 0x0000000000000000
+#define QIB_7322_DCACtrlB_RcvHdrq3DCAXfrCnt_LSB 0x36
+#define QIB_7322_DCACtrlB_RcvHdrq3DCAXfrCnt_MSB 0x3B
+#define QIB_7322_DCACtrlB_RcvHdrq3DCAXfrCnt_RMASK 0x3F
+#define QIB_7322_DCACtrlB_RcvHdrq3DCAOPH_LSB 0x2E
+#define QIB_7322_DCACtrlB_RcvHdrq3DCAOPH_MSB 0x35
+#define QIB_7322_DCACtrlB_RcvHdrq3DCAOPH_RMASK 0xFF
+#define QIB_7322_DCACtrlB_RcvHdrq2DCAXfrCnt_LSB 0x28
+#define QIB_7322_DCACtrlB_RcvHdrq2DCAXfrCnt_MSB 0x2D
+#define QIB_7322_DCACtrlB_RcvHdrq2DCAXfrCnt_RMASK 0x3F
+#define QIB_7322_DCACtrlB_RcvHdrq2DCAOPH_LSB 0x20
+#define QIB_7322_DCACtrlB_RcvHdrq2DCAOPH_MSB 0x27
+#define QIB_7322_DCACtrlB_RcvHdrq2DCAOPH_RMASK 0xFF
+#define QIB_7322_DCACtrlB_RcvHdrq1DCAXfrCnt_LSB 0x16
+#define QIB_7322_DCACtrlB_RcvHdrq1DCAXfrCnt_MSB 0x1B
+#define QIB_7322_DCACtrlB_RcvHdrq1DCAXfrCnt_RMASK 0x3F
+#define QIB_7322_DCACtrlB_RcvHdrq1DCAOPH_LSB 0xE
+#define QIB_7322_DCACtrlB_RcvHdrq1DCAOPH_MSB 0x15
+#define QIB_7322_DCACtrlB_RcvHdrq1DCAOPH_RMASK 0xFF
+#define QIB_7322_DCACtrlB_RcvHdrq0DCAXfrCnt_LSB 0x8
+#define QIB_7322_DCACtrlB_RcvHdrq0DCAXfrCnt_MSB 0xD
+#define QIB_7322_DCACtrlB_RcvHdrq0DCAXfrCnt_RMASK 0x3F
+#define QIB_7322_DCACtrlB_RcvHdrq0DCAOPH_LSB 0x0
+#define QIB_7322_DCACtrlB_RcvHdrq0DCAOPH_MSB 0x7
+#define QIB_7322_DCACtrlB_RcvHdrq0DCAOPH_RMASK 0xFF
+
+#define QIB_7322_DCACtrlC_OFFS 0x590
+#define QIB_7322_DCACtrlC_DEF 0x0000000000000000
+#define QIB_7322_DCACtrlC_RcvHdrq7DCAXfrCnt_LSB 0x36
+#define QIB_7322_DCACtrlC_RcvHdrq7DCAXfrCnt_MSB 0x3B
+#define QIB_7322_DCACtrlC_RcvHdrq7DCAXfrCnt_RMASK 0x3F
+#define QIB_7322_DCACtrlC_RcvHdrq7DCAOPH_LSB 0x2E
+#define QIB_7322_DCACtrlC_RcvHdrq7DCAOPH_MSB 0x35
+#define QIB_7322_DCACtrlC_RcvHdrq7DCAOPH_RMASK 0xFF
+#define QIB_7322_DCACtrlC_RcvHdrq6DCAXfrCnt_LSB 0x28
+#define QIB_7322_DCACtrlC_RcvHdrq6DCAXfrCnt_MSB 0x2D
+#define QIB_7322_DCACtrlC_RcvHdrq6DCAXfrCnt_RMASK 0x3F
+#define QIB_7322_DCACtrlC_RcvHdrq6DCAOPH_LSB 0x20
+#define QIB_7322_DCACtrlC_RcvHdrq6DCAOPH_MSB 0x27
+#define QIB_7322_DCACtrlC_RcvHdrq6DCAOPH_RMASK 0xFF
+#define QIB_7322_DCACtrlC_RcvHdrq5DCAXfrCnt_LSB 0x16
+#define QIB_7322_DCACtrlC_RcvHdrq5DCAXfrCnt_MSB 0x1B
+#define QIB_7322_DCACtrlC_RcvHdrq5DCAXfrCnt_RMASK 0x3F
+#define QIB_7322_DCACtrlC_RcvHdrq5DCAOPH_LSB 0xE
+#define QIB_7322_DCACtrlC_RcvHdrq5DCAOPH_MSB 0x15
+#define QIB_7322_DCACtrlC_RcvHdrq5DCAOPH_RMASK 0xFF
+#define QIB_7322_DCACtrlC_RcvHdrq4DCAXfrCnt_LSB 0x8
+#define QIB_7322_DCACtrlC_RcvHdrq4DCAXfrCnt_MSB 0xD
+#define QIB_7322_DCACtrlC_RcvHdrq4DCAXfrCnt_RMASK 0x3F
+#define QIB_7322_DCACtrlC_RcvHdrq4DCAOPH_LSB 0x0
+#define QIB_7322_DCACtrlC_RcvHdrq4DCAOPH_MSB 0x7
+#define QIB_7322_DCACtrlC_RcvHdrq4DCAOPH_RMASK 0xFF
+
+#define QIB_7322_DCACtrlD_OFFS 0x598
+#define QIB_7322_DCACtrlD_DEF 0x0000000000000000
+#define QIB_7322_DCACtrlD_RcvHdrq11DCAXfrCnt_LSB 0x36
+#define QIB_7322_DCACtrlD_RcvHdrq11DCAXfrCnt_MSB 0x3B
+#define QIB_7322_DCACtrlD_RcvHdrq11DCAXfrCnt_RMASK 0x3F
+#define QIB_7322_DCACtrlD_RcvHdrq11DCAOPH_LSB 0x2E
+#define QIB_7322_DCACtrlD_RcvHdrq11DCAOPH_MSB 0x35
+#define QIB_7322_DCACtrlD_RcvHdrq11DCAOPH_RMASK 0xFF
+#define QIB_7322_DCACtrlD_RcvHdrq10DCAXfrCnt_LSB 0x28
+#define QIB_7322_DCACtrlD_RcvHdrq10DCAXfrCnt_MSB 0x2D
+#define QIB_7322_DCACtrlD_RcvHdrq10DCAXfrCnt_RMASK 0x3F
+#define QIB_7322_DCACtrlD_RcvHdrq10DCAOPH_LSB 0x20
+#define QIB_7322_DCACtrlD_RcvHdrq10DCAOPH_MSB 0x27
+#define QIB_7322_DCACtrlD_RcvHdrq10DCAOPH_RMASK 0xFF
+#define QIB_7322_DCACtrlD_RcvHdrq9DCAXfrCnt_LSB 0x16
+#define QIB_7322_DCACtrlD_RcvHdrq9DCAXfrCnt_MSB 0x1B
+#define QIB_7322_DCACtrlD_RcvHdrq9DCAXfrCnt_RMASK 0x3F
+#define QIB_7322_DCACtrlD_RcvHdrq9DCAOPH_LSB 0xE
+#define QIB_7322_DCACtrlD_RcvHdrq9DCAOPH_MSB 0x15
+#define QIB_7322_DCACtrlD_RcvHdrq9DCAOPH_RMASK 0xFF
+#define QIB_7322_DCACtrlD_RcvHdrq8DCAXfrCnt_LSB 0x8
+#define QIB_7322_DCACtrlD_RcvHdrq8DCAXfrCnt_MSB 0xD
+#define QIB_7322_DCACtrlD_RcvHdrq8DCAXfrCnt_RMASK 0x3F
+#define QIB_7322_DCACtrlD_RcvHdrq8DCAOPH_LSB 0x0
+#define QIB_7322_DCACtrlD_RcvHdrq8DCAOPH_MSB 0x7
+#define QIB_7322_DCACtrlD_RcvHdrq8DCAOPH_RMASK 0xFF
+
+#define QIB_7322_DCACtrlE_OFFS 0x5A0
+#define QIB_7322_DCACtrlE_DEF 0x0000000000000000
+#define QIB_7322_DCACtrlE_RcvHdrq15DCAXfrCnt_LSB 0x36
+#define QIB_7322_DCACtrlE_RcvHdrq15DCAXfrCnt_MSB 0x3B
+#define QIB_7322_DCACtrlE_RcvHdrq15DCAXfrCnt_RMASK 0x3F
+#define QIB_7322_DCACtrlE_RcvHdrq15DCAOPH_LSB 0x2E
+#define QIB_7322_DCACtrlE_RcvHdrq15DCAOPH_MSB 0x35
+#define QIB_7322_DCACtrlE_RcvHdrq15DCAOPH_RMASK 0xFF
+#define QIB_7322_DCACtrlE_RcvHdrq14DCAXfrCnt_LSB 0x28
+#define QIB_7322_DCACtrlE_RcvHdrq14DCAXfrCnt_MSB 0x2D
+#define QIB_7322_DCACtrlE_RcvHdrq14DCAXfrCnt_RMASK 0x3F
+#define QIB_7322_DCACtrlE_RcvHdrq14DCAOPH_LSB 0x20
+#define QIB_7322_DCACtrlE_RcvHdrq14DCAOPH_MSB 0x27
+#define QIB_7322_DCACtrlE_RcvHdrq14DCAOPH_RMASK 0xFF
+#define QIB_7322_DCACtrlE_RcvHdrq13DCAXfrCnt_LSB 0x16
+#define QIB_7322_DCACtrlE_RcvHdrq13DCAXfrCnt_MSB 0x1B
+#define QIB_7322_DCACtrlE_RcvHdrq13DCAXfrCnt_RMASK 0x3F
+#define QIB_7322_DCACtrlE_RcvHdrq13DCAOPH_LSB 0xE
+#define QIB_7322_DCACtrlE_RcvHdrq13DCAOPH_MSB 0x15
+#define QIB_7322_DCACtrlE_RcvHdrq13DCAOPH_RMASK 0xFF
+#define QIB_7322_DCACtrlE_RcvHdrq12DCAXfrCnt_LSB 0x8
+#define QIB_7322_DCACtrlE_RcvHdrq12DCAXfrCnt_MSB 0xD
+#define QIB_7322_DCACtrlE_RcvHdrq12DCAXfrCnt_RMASK 0x3F
+#define QIB_7322_DCACtrlE_RcvHdrq12DCAOPH_LSB 0x0
+#define QIB_7322_DCACtrlE_RcvHdrq12DCAOPH_MSB 0x7
+#define QIB_7322_DCACtrlE_RcvHdrq12DCAOPH_RMASK 0xFF
+
+#define QIB_7322_DCACtrlF_OFFS 0x5A8
+#define QIB_7322_DCACtrlF_DEF 0x0000000000000000
+#define QIB_7322_DCACtrlF_SendDma1DCAOPH_LSB 0x28
+#define QIB_7322_DCACtrlF_SendDma1DCAOPH_MSB 0x2F
+#define QIB_7322_DCACtrlF_SendDma1DCAOPH_RMASK 0xFF
+#define QIB_7322_DCACtrlF_SendDma0DCAOPH_LSB 0x20
+#define QIB_7322_DCACtrlF_SendDma0DCAOPH_MSB 0x27
+#define QIB_7322_DCACtrlF_SendDma0DCAOPH_RMASK 0xFF
+#define QIB_7322_DCACtrlF_RcvHdrq17DCAXfrCnt_LSB 0x16
+#define QIB_7322_DCACtrlF_RcvHdrq17DCAXfrCnt_MSB 0x1B
+#define QIB_7322_DCACtrlF_RcvHdrq17DCAXfrCnt_RMASK 0x3F
+#define QIB_7322_DCACtrlF_RcvHdrq17DCAOPH_LSB 0xE
+#define QIB_7322_DCACtrlF_RcvHdrq17DCAOPH_MSB 0x15
+#define QIB_7322_DCACtrlF_RcvHdrq17DCAOPH_RMASK 0xFF
+#define QIB_7322_DCACtrlF_RcvHdrq16DCAXfrCnt_LSB 0x8
+#define QIB_7322_DCACtrlF_RcvHdrq16DCAXfrCnt_MSB 0xD
+#define QIB_7322_DCACtrlF_RcvHdrq16DCAXfrCnt_RMASK 0x3F
+#define QIB_7322_DCACtrlF_RcvHdrq16DCAOPH_LSB 0x0
+#define QIB_7322_DCACtrlF_RcvHdrq16DCAOPH_MSB 0x7
+#define QIB_7322_DCACtrlF_RcvHdrq16DCAOPH_RMASK 0xFF
+
+#define QIB_7322_RcvAvailTimeOut0_OFFS 0xC00
+#define QIB_7322_RcvAvailTimeOut0_DEF 0x0000000000000000
+#define QIB_7322_RcvAvailTimeOut0_RcvAvailTOCount_LSB 0x10
+#define QIB_7322_RcvAvailTimeOut0_RcvAvailTOCount_MSB 0x1F
+#define QIB_7322_RcvAvailTimeOut0_RcvAvailTOCount_RMASK 0xFFFF
+#define QIB_7322_RcvAvailTimeOut0_RcvAvailTOReload_LSB 0x0
+#define QIB_7322_RcvAvailTimeOut0_RcvAvailTOReload_MSB 0xF
+#define QIB_7322_RcvAvailTimeOut0_RcvAvailTOReload_RMASK 0xFFFF
+
+#define QIB_7322_CntrRegBase_0_OFFS 0x1028
+#define QIB_7322_CntrRegBase_0_DEF 0x0000000000012000
+
+#define QIB_7322_ErrMask_0_OFFS 0x1080
+#define QIB_7322_ErrMask_0_DEF 0x0000000000000000
+#define QIB_7322_ErrMask_0_IBStatusChangedMask_LSB 0x3A
+#define QIB_7322_ErrMask_0_IBStatusChangedMask_MSB 0x3A
+#define QIB_7322_ErrMask_0_IBStatusChangedMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_SHeadersErrMask_LSB 0x39
+#define QIB_7322_ErrMask_0_SHeadersErrMask_MSB 0x39
+#define QIB_7322_ErrMask_0_SHeadersErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_VL15BufMisuseErrMask_LSB 0x36
+#define QIB_7322_ErrMask_0_VL15BufMisuseErrMask_MSB 0x36
+#define QIB_7322_ErrMask_0_VL15BufMisuseErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_SDmaHaltErrMask_LSB 0x31
+#define QIB_7322_ErrMask_0_SDmaHaltErrMask_MSB 0x31
+#define QIB_7322_ErrMask_0_SDmaHaltErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_SDmaDescAddrMisalignErrMask_LSB 0x30
+#define QIB_7322_ErrMask_0_SDmaDescAddrMisalignErrMask_MSB 0x30
+#define QIB_7322_ErrMask_0_SDmaDescAddrMisalignErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_SDmaUnexpDataErrMask_LSB 0x2F
+#define QIB_7322_ErrMask_0_SDmaUnexpDataErrMask_MSB 0x2F
+#define QIB_7322_ErrMask_0_SDmaUnexpDataErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_SDmaMissingDwErrMask_LSB 0x2E
+#define QIB_7322_ErrMask_0_SDmaMissingDwErrMask_MSB 0x2E
+#define QIB_7322_ErrMask_0_SDmaMissingDwErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_SDmaDwEnErrMask_LSB 0x2D
+#define QIB_7322_ErrMask_0_SDmaDwEnErrMask_MSB 0x2D
+#define QIB_7322_ErrMask_0_SDmaDwEnErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_SDmaRpyTagErrMask_LSB 0x2C
+#define QIB_7322_ErrMask_0_SDmaRpyTagErrMask_MSB 0x2C
+#define QIB_7322_ErrMask_0_SDmaRpyTagErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_SDma1stDescErrMask_LSB 0x2B
+#define QIB_7322_ErrMask_0_SDma1stDescErrMask_MSB 0x2B
+#define QIB_7322_ErrMask_0_SDma1stDescErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_SDmaBaseErrMask_LSB 0x2A
+#define QIB_7322_ErrMask_0_SDmaBaseErrMask_MSB 0x2A
+#define QIB_7322_ErrMask_0_SDmaBaseErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_SDmaTailOutOfBoundErrMask_LSB 0x29
+#define QIB_7322_ErrMask_0_SDmaTailOutOfBoundErrMask_MSB 0x29
+#define QIB_7322_ErrMask_0_SDmaTailOutOfBoundErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_SDmaOutOfBoundErrMask_LSB 0x28
+#define QIB_7322_ErrMask_0_SDmaOutOfBoundErrMask_MSB 0x28
+#define QIB_7322_ErrMask_0_SDmaOutOfBoundErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_SDmaGenMismatchErrMask_LSB 0x27
+#define QIB_7322_ErrMask_0_SDmaGenMismatchErrMask_MSB 0x27
+#define QIB_7322_ErrMask_0_SDmaGenMismatchErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_SendBufMisuseErrMask_LSB 0x26
+#define QIB_7322_ErrMask_0_SendBufMisuseErrMask_MSB 0x26
+#define QIB_7322_ErrMask_0_SendBufMisuseErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_SendUnsupportedVLErrMask_LSB 0x25
+#define QIB_7322_ErrMask_0_SendUnsupportedVLErrMask_MSB 0x25
+#define QIB_7322_ErrMask_0_SendUnsupportedVLErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_SendUnexpectedPktNumErrMask_LSB 0x24
+#define QIB_7322_ErrMask_0_SendUnexpectedPktNumErrMask_MSB 0x24
+#define QIB_7322_ErrMask_0_SendUnexpectedPktNumErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_SendDroppedDataPktErrMask_LSB 0x22
+#define QIB_7322_ErrMask_0_SendDroppedDataPktErrMask_MSB 0x22
+#define QIB_7322_ErrMask_0_SendDroppedDataPktErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_SendDroppedSmpPktErrMask_LSB 0x21
+#define QIB_7322_ErrMask_0_SendDroppedSmpPktErrMask_MSB 0x21
+#define QIB_7322_ErrMask_0_SendDroppedSmpPktErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_SendPktLenErrMask_LSB 0x20
+#define QIB_7322_ErrMask_0_SendPktLenErrMask_MSB 0x20
+#define QIB_7322_ErrMask_0_SendPktLenErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_SendUnderRunErrMask_LSB 0x1F
+#define QIB_7322_ErrMask_0_SendUnderRunErrMask_MSB 0x1F
+#define QIB_7322_ErrMask_0_SendUnderRunErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_SendMaxPktLenErrMask_LSB 0x1E
+#define QIB_7322_ErrMask_0_SendMaxPktLenErrMask_MSB 0x1E
+#define QIB_7322_ErrMask_0_SendMaxPktLenErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_SendMinPktLenErrMask_LSB 0x1D
+#define QIB_7322_ErrMask_0_SendMinPktLenErrMask_MSB 0x1D
+#define QIB_7322_ErrMask_0_SendMinPktLenErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_RcvIBLostLinkErrMask_LSB 0x11
+#define QIB_7322_ErrMask_0_RcvIBLostLinkErrMask_MSB 0x11
+#define QIB_7322_ErrMask_0_RcvIBLostLinkErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_RcvHdrErrMask_LSB 0x10
+#define QIB_7322_ErrMask_0_RcvHdrErrMask_MSB 0x10
+#define QIB_7322_ErrMask_0_RcvHdrErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_RcvHdrLenErrMask_LSB 0xF
+#define QIB_7322_ErrMask_0_RcvHdrLenErrMask_MSB 0xF
+#define QIB_7322_ErrMask_0_RcvHdrLenErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_RcvBadTidErrMask_LSB 0xE
+#define QIB_7322_ErrMask_0_RcvBadTidErrMask_MSB 0xE
+#define QIB_7322_ErrMask_0_RcvBadTidErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_RcvBadVersionErrMask_LSB 0xB
+#define QIB_7322_ErrMask_0_RcvBadVersionErrMask_MSB 0xB
+#define QIB_7322_ErrMask_0_RcvBadVersionErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_RcvIBFlowErrMask_LSB 0xA
+#define QIB_7322_ErrMask_0_RcvIBFlowErrMask_MSB 0xA
+#define QIB_7322_ErrMask_0_RcvIBFlowErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_RcvEBPErrMask_LSB 0x9
+#define QIB_7322_ErrMask_0_RcvEBPErrMask_MSB 0x9
+#define QIB_7322_ErrMask_0_RcvEBPErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_RcvUnsupportedVLErrMask_LSB 0x8
+#define QIB_7322_ErrMask_0_RcvUnsupportedVLErrMask_MSB 0x8
+#define QIB_7322_ErrMask_0_RcvUnsupportedVLErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_RcvUnexpectedCharErrMask_LSB 0x7
+#define QIB_7322_ErrMask_0_RcvUnexpectedCharErrMask_MSB 0x7
+#define QIB_7322_ErrMask_0_RcvUnexpectedCharErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_RcvShortPktLenErrMask_LSB 0x6
+#define QIB_7322_ErrMask_0_RcvShortPktLenErrMask_MSB 0x6
+#define QIB_7322_ErrMask_0_RcvShortPktLenErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_RcvLongPktLenErrMask_LSB 0x5
+#define QIB_7322_ErrMask_0_RcvLongPktLenErrMask_MSB 0x5
+#define QIB_7322_ErrMask_0_RcvLongPktLenErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_RcvMaxPktLenErrMask_LSB 0x4
+#define QIB_7322_ErrMask_0_RcvMaxPktLenErrMask_MSB 0x4
+#define QIB_7322_ErrMask_0_RcvMaxPktLenErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_RcvMinPktLenErrMask_LSB 0x3
+#define QIB_7322_ErrMask_0_RcvMinPktLenErrMask_MSB 0x3
+#define QIB_7322_ErrMask_0_RcvMinPktLenErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_RcvICRCErrMask_LSB 0x2
+#define QIB_7322_ErrMask_0_RcvICRCErrMask_MSB 0x2
+#define QIB_7322_ErrMask_0_RcvICRCErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_RcvVCRCErrMask_LSB 0x1
+#define QIB_7322_ErrMask_0_RcvVCRCErrMask_MSB 0x1
+#define QIB_7322_ErrMask_0_RcvVCRCErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_RcvFormatErrMask_LSB 0x0
+#define QIB_7322_ErrMask_0_RcvFormatErrMask_MSB 0x0
+#define QIB_7322_ErrMask_0_RcvFormatErrMask_RMASK 0x1
+
+#define QIB_7322_ErrStatus_0_OFFS 0x1088
+#define QIB_7322_ErrStatus_0_DEF 0x0000000000000000
+#define QIB_7322_ErrStatus_0_IBStatusChanged_LSB 0x3A
+#define QIB_7322_ErrStatus_0_IBStatusChanged_MSB 0x3A
+#define QIB_7322_ErrStatus_0_IBStatusChanged_RMASK 0x1
+#define QIB_7322_ErrStatus_0_SHeadersErr_LSB 0x39
+#define QIB_7322_ErrStatus_0_SHeadersErr_MSB 0x39
+#define QIB_7322_ErrStatus_0_SHeadersErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_VL15BufMisuseErr_LSB 0x36
+#define QIB_7322_ErrStatus_0_VL15BufMisuseErr_MSB 0x36
+#define QIB_7322_ErrStatus_0_VL15BufMisuseErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_SDmaHaltErr_LSB 0x31
+#define QIB_7322_ErrStatus_0_SDmaHaltErr_MSB 0x31
+#define QIB_7322_ErrStatus_0_SDmaHaltErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_SDmaDescAddrMisalignErr_LSB 0x30
+#define QIB_7322_ErrStatus_0_SDmaDescAddrMisalignErr_MSB 0x30
+#define QIB_7322_ErrStatus_0_SDmaDescAddrMisalignErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_SDmaUnexpDataErr_LSB 0x2F
+#define QIB_7322_ErrStatus_0_SDmaUnexpDataErr_MSB 0x2F
+#define QIB_7322_ErrStatus_0_SDmaUnexpDataErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_SDmaMissingDwErr_LSB 0x2E
+#define QIB_7322_ErrStatus_0_SDmaMissingDwErr_MSB 0x2E
+#define QIB_7322_ErrStatus_0_SDmaMissingDwErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_SDmaDwEnErr_LSB 0x2D
+#define QIB_7322_ErrStatus_0_SDmaDwEnErr_MSB 0x2D
+#define QIB_7322_ErrStatus_0_SDmaDwEnErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_SDmaRpyTagErr_LSB 0x2C
+#define QIB_7322_ErrStatus_0_SDmaRpyTagErr_MSB 0x2C
+#define QIB_7322_ErrStatus_0_SDmaRpyTagErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_SDma1stDescErr_LSB 0x2B
+#define QIB_7322_ErrStatus_0_SDma1stDescErr_MSB 0x2B
+#define QIB_7322_ErrStatus_0_SDma1stDescErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_SDmaBaseErr_LSB 0x2A
+#define QIB_7322_ErrStatus_0_SDmaBaseErr_MSB 0x2A
+#define QIB_7322_ErrStatus_0_SDmaBaseErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_SDmaTailOutOfBoundErr_LSB 0x29
+#define QIB_7322_ErrStatus_0_SDmaTailOutOfBoundErr_MSB 0x29
+#define QIB_7322_ErrStatus_0_SDmaTailOutOfBoundErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_SDmaOutOfBoundErr_LSB 0x28
+#define QIB_7322_ErrStatus_0_SDmaOutOfBoundErr_MSB 0x28
+#define QIB_7322_ErrStatus_0_SDmaOutOfBoundErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_SDmaGenMismatchErr_LSB 0x27
+#define QIB_7322_ErrStatus_0_SDmaGenMismatchErr_MSB 0x27
+#define QIB_7322_ErrStatus_0_SDmaGenMismatchErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_SendBufMisuseErr_LSB 0x26
+#define QIB_7322_ErrStatus_0_SendBufMisuseErr_MSB 0x26
+#define QIB_7322_ErrStatus_0_SendBufMisuseErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_SendUnsupportedVLErr_LSB 0x25
+#define QIB_7322_ErrStatus_0_SendUnsupportedVLErr_MSB 0x25
+#define QIB_7322_ErrStatus_0_SendUnsupportedVLErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_SendUnexpectedPktNumErr_LSB 0x24
+#define QIB_7322_ErrStatus_0_SendUnexpectedPktNumErr_MSB 0x24
+#define QIB_7322_ErrStatus_0_SendUnexpectedPktNumErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_SendDroppedDataPktErr_LSB 0x22
+#define QIB_7322_ErrStatus_0_SendDroppedDataPktErr_MSB 0x22
+#define QIB_7322_ErrStatus_0_SendDroppedDataPktErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_SendDroppedSmpPktErr_LSB 0x21
+#define QIB_7322_ErrStatus_0_SendDroppedSmpPktErr_MSB 0x21
+#define QIB_7322_ErrStatus_0_SendDroppedSmpPktErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_SendPktLenErr_LSB 0x20
+#define QIB_7322_ErrStatus_0_SendPktLenErr_MSB 0x20
+#define QIB_7322_ErrStatus_0_SendPktLenErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_SendUnderRunErr_LSB 0x1F
+#define QIB_7322_ErrStatus_0_SendUnderRunErr_MSB 0x1F
+#define QIB_7322_ErrStatus_0_SendUnderRunErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_SendMaxPktLenErr_LSB 0x1E
+#define QIB_7322_ErrStatus_0_SendMaxPktLenErr_MSB 0x1E
+#define QIB_7322_ErrStatus_0_SendMaxPktLenErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_SendMinPktLenErr_LSB 0x1D
+#define QIB_7322_ErrStatus_0_SendMinPktLenErr_MSB 0x1D
+#define QIB_7322_ErrStatus_0_SendMinPktLenErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_RcvIBLostLinkErr_LSB 0x11
+#define QIB_7322_ErrStatus_0_RcvIBLostLinkErr_MSB 0x11
+#define QIB_7322_ErrStatus_0_RcvIBLostLinkErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_RcvHdrErr_LSB 0x10
+#define QIB_7322_ErrStatus_0_RcvHdrErr_MSB 0x10
+#define QIB_7322_ErrStatus_0_RcvHdrErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_RcvHdrLenErr_LSB 0xF
+#define QIB_7322_ErrStatus_0_RcvHdrLenErr_MSB 0xF
+#define QIB_7322_ErrStatus_0_RcvHdrLenErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_RcvBadTidErr_LSB 0xE
+#define QIB_7322_ErrStatus_0_RcvBadTidErr_MSB 0xE
+#define QIB_7322_ErrStatus_0_RcvBadTidErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_RcvBadVersionErr_LSB 0xB
+#define QIB_7322_ErrStatus_0_RcvBadVersionErr_MSB 0xB
+#define QIB_7322_ErrStatus_0_RcvBadVersionErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_RcvIBFlowErr_LSB 0xA
+#define QIB_7322_ErrStatus_0_RcvIBFlowErr_MSB 0xA
+#define QIB_7322_ErrStatus_0_RcvIBFlowErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_RcvEBPErr_LSB 0x9
+#define QIB_7322_ErrStatus_0_RcvEBPErr_MSB 0x9
+#define QIB_7322_ErrStatus_0_RcvEBPErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_RcvUnsupportedVLErr_LSB 0x8
+#define QIB_7322_ErrStatus_0_RcvUnsupportedVLErr_MSB 0x8
+#define QIB_7322_ErrStatus_0_RcvUnsupportedVLErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_RcvUnexpectedCharErr_LSB 0x7
+#define QIB_7322_ErrStatus_0_RcvUnexpectedCharErr_MSB 0x7
+#define QIB_7322_ErrStatus_0_RcvUnexpectedCharErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_RcvShortPktLenErr_LSB 0x6
+#define QIB_7322_ErrStatus_0_RcvShortPktLenErr_MSB 0x6
+#define QIB_7322_ErrStatus_0_RcvShortPktLenErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_RcvLongPktLenErr_LSB 0x5
+#define QIB_7322_ErrStatus_0_RcvLongPktLenErr_MSB 0x5
+#define QIB_7322_ErrStatus_0_RcvLongPktLenErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_RcvMaxPktLenErr_LSB 0x4
+#define QIB_7322_ErrStatus_0_RcvMaxPktLenErr_MSB 0x4
+#define QIB_7322_ErrStatus_0_RcvMaxPktLenErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_RcvMinPktLenErr_LSB 0x3
+#define QIB_7322_ErrStatus_0_RcvMinPktLenErr_MSB 0x3
+#define QIB_7322_ErrStatus_0_RcvMinPktLenErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_RcvICRCErr_LSB 0x2
+#define QIB_7322_ErrStatus_0_RcvICRCErr_MSB 0x2
+#define QIB_7322_ErrStatus_0_RcvICRCErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_RcvVCRCErr_LSB 0x1
+#define QIB_7322_ErrStatus_0_RcvVCRCErr_MSB 0x1
+#define QIB_7322_ErrStatus_0_RcvVCRCErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_RcvFormatErr_LSB 0x0
+#define QIB_7322_ErrStatus_0_RcvFormatErr_MSB 0x0
+#define QIB_7322_ErrStatus_0_RcvFormatErr_RMASK 0x1
+
+#define QIB_7322_ErrClear_0_OFFS 0x1090
+#define QIB_7322_ErrClear_0_DEF 0x0000000000000000
+#define QIB_7322_ErrClear_0_IBStatusChangedClear_LSB 0x3A
+#define QIB_7322_ErrClear_0_IBStatusChangedClear_MSB 0x3A
+#define QIB_7322_ErrClear_0_IBStatusChangedClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_SHeadersErrClear_LSB 0x39
+#define QIB_7322_ErrClear_0_SHeadersErrClear_MSB 0x39
+#define QIB_7322_ErrClear_0_SHeadersErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_VL15BufMisuseErrClear_LSB 0x36
+#define QIB_7322_ErrClear_0_VL15BufMisuseErrClear_MSB 0x36
+#define QIB_7322_ErrClear_0_VL15BufMisuseErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_SDmaHaltErrClear_LSB 0x31
+#define QIB_7322_ErrClear_0_SDmaHaltErrClear_MSB 0x31
+#define QIB_7322_ErrClear_0_SDmaHaltErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_SDmaDescAddrMisalignErrClear_LSB 0x30
+#define QIB_7322_ErrClear_0_SDmaDescAddrMisalignErrClear_MSB 0x30
+#define QIB_7322_ErrClear_0_SDmaDescAddrMisalignErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_SDmaUnexpDataErrClear_LSB 0x2F
+#define QIB_7322_ErrClear_0_SDmaUnexpDataErrClear_MSB 0x2F
+#define QIB_7322_ErrClear_0_SDmaUnexpDataErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_SDmaMissingDwErrClear_LSB 0x2E
+#define QIB_7322_ErrClear_0_SDmaMissingDwErrClear_MSB 0x2E
+#define QIB_7322_ErrClear_0_SDmaMissingDwErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_SDmaDwEnErrClear_LSB 0x2D
+#define QIB_7322_ErrClear_0_SDmaDwEnErrClear_MSB 0x2D
+#define QIB_7322_ErrClear_0_SDmaDwEnErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_SDmaRpyTagErrClear_LSB 0x2C
+#define QIB_7322_ErrClear_0_SDmaRpyTagErrClear_MSB 0x2C
+#define QIB_7322_ErrClear_0_SDmaRpyTagErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_SDma1stDescErrClear_LSB 0x2B
+#define QIB_7322_ErrClear_0_SDma1stDescErrClear_MSB 0x2B
+#define QIB_7322_ErrClear_0_SDma1stDescErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_SDmaBaseErrClear_LSB 0x2A
+#define QIB_7322_ErrClear_0_SDmaBaseErrClear_MSB 0x2A
+#define QIB_7322_ErrClear_0_SDmaBaseErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_SDmaTailOutOfBoundErrClear_LSB 0x29
+#define QIB_7322_ErrClear_0_SDmaTailOutOfBoundErrClear_MSB 0x29
+#define QIB_7322_ErrClear_0_SDmaTailOutOfBoundErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_SDmaOutOfBoundErrClear_LSB 0x28
+#define QIB_7322_ErrClear_0_SDmaOutOfBoundErrClear_MSB 0x28
+#define QIB_7322_ErrClear_0_SDmaOutOfBoundErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_SDmaGenMismatchErrClear_LSB 0x27
+#define QIB_7322_ErrClear_0_SDmaGenMismatchErrClear_MSB 0x27
+#define QIB_7322_ErrClear_0_SDmaGenMismatchErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_SendBufMisuseErrClear_LSB 0x26
+#define QIB_7322_ErrClear_0_SendBufMisuseErrClear_MSB 0x26
+#define QIB_7322_ErrClear_0_SendBufMisuseErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_SendUnsupportedVLErrClear_LSB 0x25
+#define QIB_7322_ErrClear_0_SendUnsupportedVLErrClear_MSB 0x25
+#define QIB_7322_ErrClear_0_SendUnsupportedVLErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_SendUnexpectedPktNumErrClear_LSB 0x24
+#define QIB_7322_ErrClear_0_SendUnexpectedPktNumErrClear_MSB 0x24
+#define QIB_7322_ErrClear_0_SendUnexpectedPktNumErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_SendDroppedDataPktErrClear_LSB 0x22
+#define QIB_7322_ErrClear_0_SendDroppedDataPktErrClear_MSB 0x22
+#define QIB_7322_ErrClear_0_SendDroppedDataPktErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_SendDroppedSmpPktErrClear_LSB 0x21
+#define QIB_7322_ErrClear_0_SendDroppedSmpPktErrClear_MSB 0x21
+#define QIB_7322_ErrClear_0_SendDroppedSmpPktErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_SendPktLenErrClear_LSB 0x20
+#define QIB_7322_ErrClear_0_SendPktLenErrClear_MSB 0x20
+#define QIB_7322_ErrClear_0_SendPktLenErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_SendUnderRunErrClear_LSB 0x1F
+#define QIB_7322_ErrClear_0_SendUnderRunErrClear_MSB 0x1F
+#define QIB_7322_ErrClear_0_SendUnderRunErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_SendMaxPktLenErrClear_LSB 0x1E
+#define QIB_7322_ErrClear_0_SendMaxPktLenErrClear_MSB 0x1E
+#define QIB_7322_ErrClear_0_SendMaxPktLenErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_SendMinPktLenErrClear_LSB 0x1D
+#define QIB_7322_ErrClear_0_SendMinPktLenErrClear_MSB 0x1D
+#define QIB_7322_ErrClear_0_SendMinPktLenErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_RcvIBLostLinkErrClear_LSB 0x11
+#define QIB_7322_ErrClear_0_RcvIBLostLinkErrClear_MSB 0x11
+#define QIB_7322_ErrClear_0_RcvIBLostLinkErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_RcvHdrErrClear_LSB 0x10
+#define QIB_7322_ErrClear_0_RcvHdrErrClear_MSB 0x10
+#define QIB_7322_ErrClear_0_RcvHdrErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_RcvHdrLenErrClear_LSB 0xF
+#define QIB_7322_ErrClear_0_RcvHdrLenErrClear_MSB 0xF
+#define QIB_7322_ErrClear_0_RcvHdrLenErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_RcvBadTidErrClear_LSB 0xE
+#define QIB_7322_ErrClear_0_RcvBadTidErrClear_MSB 0xE
+#define QIB_7322_ErrClear_0_RcvBadTidErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_RcvBadVersionErrClear_LSB 0xB
+#define QIB_7322_ErrClear_0_RcvBadVersionErrClear_MSB 0xB
+#define QIB_7322_ErrClear_0_RcvBadVersionErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_RcvIBFlowErrClear_LSB 0xA
+#define QIB_7322_ErrClear_0_RcvIBFlowErrClear_MSB 0xA
+#define QIB_7322_ErrClear_0_RcvIBFlowErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_RcvEBPErrClear_LSB 0x9
+#define QIB_7322_ErrClear_0_RcvEBPErrClear_MSB 0x9
+#define QIB_7322_ErrClear_0_RcvEBPErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_RcvUnsupportedVLErrClear_LSB 0x8
+#define QIB_7322_ErrClear_0_RcvUnsupportedVLErrClear_MSB 0x8
+#define QIB_7322_ErrClear_0_RcvUnsupportedVLErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_RcvUnexpectedCharErrClear_LSB 0x7
+#define QIB_7322_ErrClear_0_RcvUnexpectedCharErrClear_MSB 0x7
+#define QIB_7322_ErrClear_0_RcvUnexpectedCharErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_RcvShortPktLenErrClear_LSB 0x6
+#define QIB_7322_ErrClear_0_RcvShortPktLenErrClear_MSB 0x6
+#define QIB_7322_ErrClear_0_RcvShortPktLenErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_RcvLongPktLenErrClear_LSB 0x5
+#define QIB_7322_ErrClear_0_RcvLongPktLenErrClear_MSB 0x5
+#define QIB_7322_ErrClear_0_RcvLongPktLenErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_RcvMaxPktLenErrClear_LSB 0x4
+#define QIB_7322_ErrClear_0_RcvMaxPktLenErrClear_MSB 0x4
+#define QIB_7322_ErrClear_0_RcvMaxPktLenErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_RcvMinPktLenErrClear_LSB 0x3
+#define QIB_7322_ErrClear_0_RcvMinPktLenErrClear_MSB 0x3
+#define QIB_7322_ErrClear_0_RcvMinPktLenErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_RcvICRCErrClear_LSB 0x2
+#define QIB_7322_ErrClear_0_RcvICRCErrClear_MSB 0x2
+#define QIB_7322_ErrClear_0_RcvICRCErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_RcvVCRCErrClear_LSB 0x1
+#define QIB_7322_ErrClear_0_RcvVCRCErrClear_MSB 0x1
+#define QIB_7322_ErrClear_0_RcvVCRCErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_RcvFormatErrClear_LSB 0x0
+#define QIB_7322_ErrClear_0_RcvFormatErrClear_MSB 0x0
+#define QIB_7322_ErrClear_0_RcvFormatErrClear_RMASK 0x1
+
+#define QIB_7322_TXEStatus_0_OFFS 0x10B8
+#define QIB_7322_TXEStatus_0_DEF 0x0000000XC00080FF
+#define QIB_7322_TXEStatus_0_TXE_IBC_Idle_LSB 0x1F
+#define QIB_7322_TXEStatus_0_TXE_IBC_Idle_MSB 0x1F
+#define QIB_7322_TXEStatus_0_TXE_IBC_Idle_RMASK 0x1
+#define QIB_7322_TXEStatus_0_RmFifoEmpty_LSB 0x1E
+#define QIB_7322_TXEStatus_0_RmFifoEmpty_MSB 0x1E
+#define QIB_7322_TXEStatus_0_RmFifoEmpty_RMASK 0x1
+#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL15_LSB 0xF
+#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL15_MSB 0xF
+#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL15_RMASK 0x1
+#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL7_LSB 0x7
+#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL7_MSB 0x7
+#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL7_RMASK 0x1
+#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL6_LSB 0x6
+#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL6_MSB 0x6
+#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL6_RMASK 0x1
+#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL5_LSB 0x5
+#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL5_MSB 0x5
+#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL5_RMASK 0x1
+#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL4_LSB 0x4
+#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL4_MSB 0x4
+#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL4_RMASK 0x1
+#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL3_LSB 0x3
+#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL3_MSB 0x3
+#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL3_RMASK 0x1
+#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL2_LSB 0x2
+#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL2_MSB 0x2
+#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL2_RMASK 0x1
+#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL1_LSB 0x1
+#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL1_MSB 0x1
+#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL1_RMASK 0x1
+#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL0_LSB 0x0
+#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL0_MSB 0x0
+#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL0_RMASK 0x1
+
+#define QIB_7322_RcvCtrl_0_OFFS 0x1100
+#define QIB_7322_RcvCtrl_0_DEF 0x0000000000000000
+#define QIB_7322_RcvCtrl_0_RcvResetCredit_LSB 0x2A
+#define QIB_7322_RcvCtrl_0_RcvResetCredit_MSB 0x2A
+#define QIB_7322_RcvCtrl_0_RcvResetCredit_RMASK 0x1
+#define QIB_7322_RcvCtrl_0_RcvPartitionKeyDisable_LSB 0x29
+#define QIB_7322_RcvCtrl_0_RcvPartitionKeyDisable_MSB 0x29
+#define QIB_7322_RcvCtrl_0_RcvPartitionKeyDisable_RMASK 0x1
+#define QIB_7322_RcvCtrl_0_RcvQPMapEnable_LSB 0x28
+#define QIB_7322_RcvCtrl_0_RcvQPMapEnable_MSB 0x28
+#define QIB_7322_RcvCtrl_0_RcvQPMapEnable_RMASK 0x1
+#define QIB_7322_RcvCtrl_0_RcvIBPortEnable_LSB 0x27
+#define QIB_7322_RcvCtrl_0_RcvIBPortEnable_MSB 0x27
+#define QIB_7322_RcvCtrl_0_RcvIBPortEnable_RMASK 0x1
+#define QIB_7322_RcvCtrl_0_ContextEnableUser_LSB 0x2
+#define QIB_7322_RcvCtrl_0_ContextEnableUser_MSB 0x11
+#define QIB_7322_RcvCtrl_0_ContextEnableUser_RMASK 0xFFFF
+#define QIB_7322_RcvCtrl_0_ContextEnableKernel_LSB 0x0
+#define QIB_7322_RcvCtrl_0_ContextEnableKernel_MSB 0x0
+#define QIB_7322_RcvCtrl_0_ContextEnableKernel_RMASK 0x1
+
+#define QIB_7322_RcvBTHQP_0_OFFS 0x1108
+#define QIB_7322_RcvBTHQP_0_DEF 0x0000000000000000
+#define QIB_7322_RcvBTHQP_0_RcvBTHQP_LSB 0x0
+#define QIB_7322_RcvBTHQP_0_RcvBTHQP_MSB 0x17
+#define QIB_7322_RcvBTHQP_0_RcvBTHQP_RMASK 0xFFFFFF
+
+#define QIB_7322_RcvQPMapTableA_0_OFFS 0x1110
+#define QIB_7322_RcvQPMapTableA_0_DEF 0x0000000000000000
+#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext5_LSB 0x19
+#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext5_MSB 0x1D
+#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext5_RMASK 0x1F
+#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext4_LSB 0x14
+#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext4_MSB 0x18
+#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext4_RMASK 0x1F
+#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext3_LSB 0xF
+#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext3_MSB 0x13
+#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext3_RMASK 0x1F
+#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext2_LSB 0xA
+#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext2_MSB 0xE
+#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext2_RMASK 0x1F
+#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext1_LSB 0x5
+#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext1_MSB 0x9
+#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext1_RMASK 0x1F
+#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext0_LSB 0x0
+#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext0_MSB 0x4
+#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext0_RMASK 0x1F
+
+#define QIB_7322_RcvQPMapTableB_0_OFFS 0x1118
+#define QIB_7322_RcvQPMapTableB_0_DEF 0x0000000000000000
+#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext11_LSB 0x19
+#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext11_MSB 0x1D
+#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext11_RMASK 0x1F
+#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext10_LSB 0x14
+#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext10_MSB 0x18
+#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext10_RMASK 0x1F
+#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext9_LSB 0xF
+#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext9_MSB 0x13
+#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext9_RMASK 0x1F
+#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext8_LSB 0xA
+#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext8_MSB 0xE
+#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext8_RMASK 0x1F
+#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext7_LSB 0x5
+#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext7_MSB 0x9
+#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext7_RMASK 0x1F
+#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext6_LSB 0x0
+#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext6_MSB 0x4
+#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext6_RMASK 0x1F
+
+#define QIB_7322_RcvQPMapTableC_0_OFFS 0x1120
+#define QIB_7322_RcvQPMapTableC_0_DEF 0x0000000000000000
+#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext17_LSB 0x19
+#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext17_MSB 0x1D
+#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext17_RMASK 0x1F
+#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext16_LSB 0x14
+#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext16_MSB 0x18
+#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext16_RMASK 0x1F
+#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext15_LSB 0xF
+#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext15_MSB 0x13
+#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext15_RMASK 0x1F
+#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext14_LSB 0xA
+#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext14_MSB 0xE
+#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext14_RMASK 0x1F
+#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext13_LSB 0x5
+#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext13_MSB 0x9
+#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext13_RMASK 0x1F
+#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext12_LSB 0x0
+#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext12_MSB 0x4
+#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext12_RMASK 0x1F
+
+#define QIB_7322_RcvQPMapTableD_0_OFFS 0x1128
+#define QIB_7322_RcvQPMapTableD_0_DEF 0x0000000000000000
+#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext23_LSB 0x19
+#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext23_MSB 0x1D
+#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext23_RMASK 0x1F
+#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext22_LSB 0x14
+#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext22_MSB 0x18
+#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext22_RMASK 0x1F
+#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext21_LSB 0xF
+#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext21_MSB 0x13
+#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext21_RMASK 0x1F
+#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext20_LSB 0xA
+#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext20_MSB 0xE
+#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext20_RMASK 0x1F
+#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext19_LSB 0x5
+#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext19_MSB 0x9
+#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext19_RMASK 0x1F
+#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext18_LSB 0x0
+#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext18_MSB 0x4
+#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext18_RMASK 0x1F
+
+#define QIB_7322_RcvQPMapTableE_0_OFFS 0x1130
+#define QIB_7322_RcvQPMapTableE_0_DEF 0x0000000000000000
+#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext29_LSB 0x19
+#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext29_MSB 0x1D
+#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext29_RMASK 0x1F
+#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext28_LSB 0x14
+#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext28_MSB 0x18
+#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext28_RMASK 0x1F
+#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext27_LSB 0xF
+#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext27_MSB 0x13
+#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext27_RMASK 0x1F
+#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext26_LSB 0xA
+#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext26_MSB 0xE
+#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext26_RMASK 0x1F
+#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext25_LSB 0x5
+#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext25_MSB 0x9
+#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext25_RMASK 0x1F
+#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext24_LSB 0x0
+#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext24_MSB 0x4
+#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext24_RMASK 0x1F
+
+#define QIB_7322_RcvQPMapTableF_0_OFFS 0x1138
+#define QIB_7322_RcvQPMapTableF_0_DEF 0x0000000000000000
+#define QIB_7322_RcvQPMapTableF_0_RcvQPMapContext31_LSB 0x5
+#define QIB_7322_RcvQPMapTableF_0_RcvQPMapContext31_MSB 0x9
+#define QIB_7322_RcvQPMapTableF_0_RcvQPMapContext31_RMASK 0x1F
+#define QIB_7322_RcvQPMapTableF_0_RcvQPMapContext30_LSB 0x0
+#define QIB_7322_RcvQPMapTableF_0_RcvQPMapContext30_MSB 0x4
+#define QIB_7322_RcvQPMapTableF_0_RcvQPMapContext30_RMASK 0x1F
+
+#define QIB_7322_PSStat_0_OFFS 0x1140
+#define QIB_7322_PSStat_0_DEF 0x0000000000000000
+
+#define QIB_7322_PSStart_0_OFFS 0x1148
+#define QIB_7322_PSStart_0_DEF 0x0000000000000000
+
+#define QIB_7322_PSInterval_0_OFFS 0x1150
+#define QIB_7322_PSInterval_0_DEF 0x0000000000000000
+
+#define QIB_7322_RcvStatus_0_OFFS 0x1160
+#define QIB_7322_RcvStatus_0_DEF 0x0000000000000000
+#define QIB_7322_RcvStatus_0_DmaeqBlockingContext_LSB 0x1
+#define QIB_7322_RcvStatus_0_DmaeqBlockingContext_MSB 0x5
+#define QIB_7322_RcvStatus_0_DmaeqBlockingContext_RMASK 0x1F
+#define QIB_7322_RcvStatus_0_RxPktInProgress_LSB 0x0
+#define QIB_7322_RcvStatus_0_RxPktInProgress_MSB 0x0
+#define QIB_7322_RcvStatus_0_RxPktInProgress_RMASK 0x1
+
+#define QIB_7322_RcvPartitionKey_0_OFFS 0x1168
+#define QIB_7322_RcvPartitionKey_0_DEF 0x0000000000000000
+
+#define QIB_7322_RcvQPMulticastContext_0_OFFS 0x1170
+#define QIB_7322_RcvQPMulticastContext_0_DEF 0x0000000000000000
+#define QIB_7322_RcvQPMulticastContext_0_RcvQpMcContext_LSB 0x0
+#define QIB_7322_RcvQPMulticastContext_0_RcvQpMcContext_MSB 0x4
+#define QIB_7322_RcvQPMulticastContext_0_RcvQpMcContext_RMASK 0x1F
+
+#define QIB_7322_RcvPktLEDCnt_0_OFFS 0x1178
+#define QIB_7322_RcvPktLEDCnt_0_DEF 0x0000000000000000
+#define QIB_7322_RcvPktLEDCnt_0_ONperiod_LSB 0x20
+#define QIB_7322_RcvPktLEDCnt_0_ONperiod_MSB 0x3F
+#define QIB_7322_RcvPktLEDCnt_0_ONperiod_RMASK 0xFFFFFFFF
+#define QIB_7322_RcvPktLEDCnt_0_OFFperiod_LSB 0x0
+#define QIB_7322_RcvPktLEDCnt_0_OFFperiod_MSB 0x1F
+#define QIB_7322_RcvPktLEDCnt_0_OFFperiod_RMASK 0xFFFFFFFF
+
+#define QIB_7322_SendDmaIdleCnt_0_OFFS 0x1180
+#define QIB_7322_SendDmaIdleCnt_0_DEF 0x0000000000000000
+#define QIB_7322_SendDmaIdleCnt_0_SendDmaIdleCnt_LSB 0x0
+#define QIB_7322_SendDmaIdleCnt_0_SendDmaIdleCnt_MSB 0xF
+#define QIB_7322_SendDmaIdleCnt_0_SendDmaIdleCnt_RMASK 0xFFFF
+
+#define QIB_7322_SendDmaReloadCnt_0_OFFS 0x1188
+#define QIB_7322_SendDmaReloadCnt_0_DEF 0x0000000000000000
+#define QIB_7322_SendDmaReloadCnt_0_SendDmaReloadCnt_LSB 0x0
+#define QIB_7322_SendDmaReloadCnt_0_SendDmaReloadCnt_MSB 0xF
+#define QIB_7322_SendDmaReloadCnt_0_SendDmaReloadCnt_RMASK 0xFFFF
+
+#define QIB_7322_SendDmaDescCnt_0_OFFS 0x1190
+#define QIB_7322_SendDmaDescCnt_0_DEF 0x0000000000000000
+#define QIB_7322_SendDmaDescCnt_0_SendDmaDescCnt_LSB 0x0
+#define QIB_7322_SendDmaDescCnt_0_SendDmaDescCnt_MSB 0xF
+#define QIB_7322_SendDmaDescCnt_0_SendDmaDescCnt_RMASK 0xFFFF
+
+#define QIB_7322_SendCtrl_0_OFFS 0x11C0
+#define QIB_7322_SendCtrl_0_DEF 0x0000000000000000
+#define QIB_7322_SendCtrl_0_IBVLArbiterEn_LSB 0xF
+#define QIB_7322_SendCtrl_0_IBVLArbiterEn_MSB 0xF
+#define QIB_7322_SendCtrl_0_IBVLArbiterEn_RMASK 0x1
+#define QIB_7322_SendCtrl_0_TxeDrainRmFifo_LSB 0xE
+#define QIB_7322_SendCtrl_0_TxeDrainRmFifo_MSB 0xE
+#define QIB_7322_SendCtrl_0_TxeDrainRmFifo_RMASK 0x1
+#define QIB_7322_SendCtrl_0_TxeDrainLaFifo_LSB 0xD
+#define QIB_7322_SendCtrl_0_TxeDrainLaFifo_MSB 0xD
+#define QIB_7322_SendCtrl_0_TxeDrainLaFifo_RMASK 0x1
+#define QIB_7322_SendCtrl_0_SDmaHalt_LSB 0xC
+#define QIB_7322_SendCtrl_0_SDmaHalt_MSB 0xC
+#define QIB_7322_SendCtrl_0_SDmaHalt_RMASK 0x1
+#define QIB_7322_SendCtrl_0_SDmaEnable_LSB 0xB
+#define QIB_7322_SendCtrl_0_SDmaEnable_MSB 0xB
+#define QIB_7322_SendCtrl_0_SDmaEnable_RMASK 0x1
+#define QIB_7322_SendCtrl_0_SDmaSingleDescriptor_LSB 0xA
+#define QIB_7322_SendCtrl_0_SDmaSingleDescriptor_MSB 0xA
+#define QIB_7322_SendCtrl_0_SDmaSingleDescriptor_RMASK 0x1
+#define QIB_7322_SendCtrl_0_SDmaIntEnable_LSB 0x9
+#define QIB_7322_SendCtrl_0_SDmaIntEnable_MSB 0x9
+#define QIB_7322_SendCtrl_0_SDmaIntEnable_RMASK 0x1
+#define QIB_7322_SendCtrl_0_SDmaCleanup_LSB 0x8
+#define QIB_7322_SendCtrl_0_SDmaCleanup_MSB 0x8
+#define QIB_7322_SendCtrl_0_SDmaCleanup_RMASK 0x1
+#define QIB_7322_SendCtrl_0_ForceCreditUpToDate_LSB 0x7
+#define QIB_7322_SendCtrl_0_ForceCreditUpToDate_MSB 0x7
+#define QIB_7322_SendCtrl_0_ForceCreditUpToDate_RMASK 0x1
+#define QIB_7322_SendCtrl_0_SendEnable_LSB 0x3
+#define QIB_7322_SendCtrl_0_SendEnable_MSB 0x3
+#define QIB_7322_SendCtrl_0_SendEnable_RMASK 0x1
+#define QIB_7322_SendCtrl_0_TxeBypassIbc_LSB 0x1
+#define QIB_7322_SendCtrl_0_TxeBypassIbc_MSB 0x1
+#define QIB_7322_SendCtrl_0_TxeBypassIbc_RMASK 0x1
+#define QIB_7322_SendCtrl_0_TxeAbortIbc_LSB 0x0
+#define QIB_7322_SendCtrl_0_TxeAbortIbc_MSB 0x0
+#define QIB_7322_SendCtrl_0_TxeAbortIbc_RMASK 0x1
+
+#define QIB_7322_SendDmaBase_0_OFFS 0x11F8
+#define QIB_7322_SendDmaBase_0_DEF 0x0000000000000000
+#define QIB_7322_SendDmaBase_0_SendDmaBase_LSB 0x0
+#define QIB_7322_SendDmaBase_0_SendDmaBase_MSB 0x2F
+#define QIB_7322_SendDmaBase_0_SendDmaBase_RMASK 0xFFFFFFFFFFFF
+
+#define QIB_7322_SendDmaLenGen_0_OFFS 0x1200
+#define QIB_7322_SendDmaLenGen_0_DEF 0x0000000000000000
+#define QIB_7322_SendDmaLenGen_0_Generation_LSB 0x10
+#define QIB_7322_SendDmaLenGen_0_Generation_MSB 0x12
+#define QIB_7322_SendDmaLenGen_0_Generation_RMASK 0x7
+#define QIB_7322_SendDmaLenGen_0_Length_LSB 0x0
+#define QIB_7322_SendDmaLenGen_0_Length_MSB 0xF
+#define QIB_7322_SendDmaLenGen_0_Length_RMASK 0xFFFF
+
+#define QIB_7322_SendDmaTail_0_OFFS 0x1208
+#define QIB_7322_SendDmaTail_0_DEF 0x0000000000000000
+#define QIB_7322_SendDmaTail_0_SendDmaTail_LSB 0x0
+#define QIB_7322_SendDmaTail_0_SendDmaTail_MSB 0xF
+#define QIB_7322_SendDmaTail_0_SendDmaTail_RMASK 0xFFFF
+
+#define QIB_7322_SendDmaHead_0_OFFS 0x1210
+#define QIB_7322_SendDmaHead_0_DEF 0x0000000000000000
+#define QIB_7322_SendDmaHead_0_InternalSendDmaHead_LSB 0x20
+#define QIB_7322_SendDmaHead_0_InternalSendDmaHead_MSB 0x2F
+#define QIB_7322_SendDmaHead_0_InternalSendDmaHead_RMASK 0xFFFF
+#define QIB_7322_SendDmaHead_0_SendDmaHead_LSB 0x0
+#define QIB_7322_SendDmaHead_0_SendDmaHead_MSB 0xF
+#define QIB_7322_SendDmaHead_0_SendDmaHead_RMASK 0xFFFF
+
+#define QIB_7322_SendDmaHeadAddr_0_OFFS 0x1218
+#define QIB_7322_SendDmaHeadAddr_0_DEF 0x0000000000000000
+#define QIB_7322_SendDmaHeadAddr_0_SendDmaHeadAddr_LSB 0x0
+#define QIB_7322_SendDmaHeadAddr_0_SendDmaHeadAddr_MSB 0x2F
+#define QIB_7322_SendDmaHeadAddr_0_SendDmaHeadAddr_RMASK 0xFFFFFFFFFFFF
+
+#define QIB_7322_SendDmaBufMask0_0_OFFS 0x1220
+#define QIB_7322_SendDmaBufMask0_0_DEF 0x0000000000000000
+#define QIB_7322_SendDmaBufMask0_0_BufMask_63_0_LSB 0x0
+#define QIB_7322_SendDmaBufMask0_0_BufMask_63_0_MSB 0x3F
+#define QIB_7322_SendDmaBufMask0_0_BufMask_63_0_RMASK 0x0
+
+#define QIB_7322_SendDmaStatus_0_OFFS 0x1238
+#define QIB_7322_SendDmaStatus_0_DEF 0x0000000042000000
+#define QIB_7322_SendDmaStatus_0_ScoreBoardDrainInProg_LSB 0x3F
+#define QIB_7322_SendDmaStatus_0_ScoreBoardDrainInProg_MSB 0x3F
+#define QIB_7322_SendDmaStatus_0_ScoreBoardDrainInProg_RMASK 0x1
+#define QIB_7322_SendDmaStatus_0_HaltInProg_LSB 0x3E
+#define QIB_7322_SendDmaStatus_0_HaltInProg_MSB 0x3E
+#define QIB_7322_SendDmaStatus_0_HaltInProg_RMASK 0x1
+#define QIB_7322_SendDmaStatus_0_InternalSDmaHalt_LSB 0x3D
+#define QIB_7322_SendDmaStatus_0_InternalSDmaHalt_MSB 0x3D
+#define QIB_7322_SendDmaStatus_0_InternalSDmaHalt_RMASK 0x1
+#define QIB_7322_SendDmaStatus_0_ScbDescIndex_13_0_LSB 0x2F
+#define QIB_7322_SendDmaStatus_0_ScbDescIndex_13_0_MSB 0x3C
+#define QIB_7322_SendDmaStatus_0_ScbDescIndex_13_0_RMASK 0x3FFF
+#define QIB_7322_SendDmaStatus_0_RpyLowAddr_6_0_LSB 0x28
+#define QIB_7322_SendDmaStatus_0_RpyLowAddr_6_0_MSB 0x2E
+#define QIB_7322_SendDmaStatus_0_RpyLowAddr_6_0_RMASK 0x7F
+#define QIB_7322_SendDmaStatus_0_RpyTag_7_0_LSB 0x20
+#define QIB_7322_SendDmaStatus_0_RpyTag_7_0_MSB 0x27
+#define QIB_7322_SendDmaStatus_0_RpyTag_7_0_RMASK 0xFF
+#define QIB_7322_SendDmaStatus_0_ScbFull_LSB 0x1F
+#define QIB_7322_SendDmaStatus_0_ScbFull_MSB 0x1F
+#define QIB_7322_SendDmaStatus_0_ScbFull_RMASK 0x1
+#define QIB_7322_SendDmaStatus_0_ScbEmpty_LSB 0x1E
+#define QIB_7322_SendDmaStatus_0_ScbEmpty_MSB 0x1E
+#define QIB_7322_SendDmaStatus_0_ScbEmpty_RMASK 0x1
+#define QIB_7322_SendDmaStatus_0_ScbEntryValid_LSB 0x1D
+#define QIB_7322_SendDmaStatus_0_ScbEntryValid_MSB 0x1D
+#define QIB_7322_SendDmaStatus_0_ScbEntryValid_RMASK 0x1
+#define QIB_7322_SendDmaStatus_0_ScbFetchDescFlag_LSB 0x1C
+#define QIB_7322_SendDmaStatus_0_ScbFetchDescFlag_MSB 0x1C
+#define QIB_7322_SendDmaStatus_0_ScbFetchDescFlag_RMASK 0x1
+#define QIB_7322_SendDmaStatus_0_SplFifoReadyToGo_LSB 0x1B
+#define QIB_7322_SendDmaStatus_0_SplFifoReadyToGo_MSB 0x1B
+#define QIB_7322_SendDmaStatus_0_SplFifoReadyToGo_RMASK 0x1
+#define QIB_7322_SendDmaStatus_0_SplFifoDisarmed_LSB 0x1A
+#define QIB_7322_SendDmaStatus_0_SplFifoDisarmed_MSB 0x1A
+#define QIB_7322_SendDmaStatus_0_SplFifoDisarmed_RMASK 0x1
+#define QIB_7322_SendDmaStatus_0_SplFifoEmpty_LSB 0x19
+#define QIB_7322_SendDmaStatus_0_SplFifoEmpty_MSB 0x19
+#define QIB_7322_SendDmaStatus_0_SplFifoEmpty_RMASK 0x1
+#define QIB_7322_SendDmaStatus_0_SplFifoFull_LSB 0x18
+#define QIB_7322_SendDmaStatus_0_SplFifoFull_MSB 0x18
+#define QIB_7322_SendDmaStatus_0_SplFifoFull_RMASK 0x1
+#define QIB_7322_SendDmaStatus_0_SplFifoBufNum_LSB 0x10
+#define QIB_7322_SendDmaStatus_0_SplFifoBufNum_MSB 0x17
+#define QIB_7322_SendDmaStatus_0_SplFifoBufNum_RMASK 0xFF
+#define QIB_7322_SendDmaStatus_0_SplFifoDescIndex_LSB 0x0
+#define QIB_7322_SendDmaStatus_0_SplFifoDescIndex_MSB 0xF
+#define QIB_7322_SendDmaStatus_0_SplFifoDescIndex_RMASK 0xFFFF
+
+#define QIB_7322_SendDmaPriorityThld_0_OFFS 0x1258
+#define QIB_7322_SendDmaPriorityThld_0_DEF 0x0000000000000000
+#define QIB_7322_SendDmaPriorityThld_0_PriorityThreshold_LSB 0x0
+#define QIB_7322_SendDmaPriorityThld_0_PriorityThreshold_MSB 0x3
+#define QIB_7322_SendDmaPriorityThld_0_PriorityThreshold_RMASK 0xF
+
+#define QIB_7322_SendHdrErrSymptom_0_OFFS 0x1260
+#define QIB_7322_SendHdrErrSymptom_0_DEF 0x0000000000000000
+#define QIB_7322_SendHdrErrSymptom_0_NonKeyPacket_LSB 0x6
+#define QIB_7322_SendHdrErrSymptom_0_NonKeyPacket_MSB 0x6
+#define QIB_7322_SendHdrErrSymptom_0_NonKeyPacket_RMASK 0x1
+#define QIB_7322_SendHdrErrSymptom_0_GRHFail_LSB 0x5
+#define QIB_7322_SendHdrErrSymptom_0_GRHFail_MSB 0x5
+#define QIB_7322_SendHdrErrSymptom_0_GRHFail_RMASK 0x1
+#define QIB_7322_SendHdrErrSymptom_0_PkeyFail_LSB 0x4
+#define QIB_7322_SendHdrErrSymptom_0_PkeyFail_MSB 0x4
+#define QIB_7322_SendHdrErrSymptom_0_PkeyFail_RMASK 0x1
+#define QIB_7322_SendHdrErrSymptom_0_QPFail_LSB 0x3
+#define QIB_7322_SendHdrErrSymptom_0_QPFail_MSB 0x3
+#define QIB_7322_SendHdrErrSymptom_0_QPFail_RMASK 0x1
+#define QIB_7322_SendHdrErrSymptom_0_SLIDFail_LSB 0x2
+#define QIB_7322_SendHdrErrSymptom_0_SLIDFail_MSB 0x2
+#define QIB_7322_SendHdrErrSymptom_0_SLIDFail_RMASK 0x1
+#define QIB_7322_SendHdrErrSymptom_0_RawIPV6_LSB 0x1
+#define QIB_7322_SendHdrErrSymptom_0_RawIPV6_MSB 0x1
+#define QIB_7322_SendHdrErrSymptom_0_RawIPV6_RMASK 0x1
+#define QIB_7322_SendHdrErrSymptom_0_PacketTooSmall_LSB 0x0
+#define QIB_7322_SendHdrErrSymptom_0_PacketTooSmall_MSB 0x0
+#define QIB_7322_SendHdrErrSymptom_0_PacketTooSmall_RMASK 0x1
+
+#define QIB_7322_RxCreditVL0_0_OFFS 0x1280
+#define QIB_7322_RxCreditVL0_0_DEF 0x0000000000000000
+#define QIB_7322_RxCreditVL0_0_RxBufrConsumedVL_LSB 0x10
+#define QIB_7322_RxCreditVL0_0_RxBufrConsumedVL_MSB 0x1B
+#define QIB_7322_RxCreditVL0_0_RxBufrConsumedVL_RMASK 0xFFF
+#define QIB_7322_RxCreditVL0_0_RxMaxCreditVL_LSB 0x0
+#define QIB_7322_RxCreditVL0_0_RxMaxCreditVL_MSB 0xB
+#define QIB_7322_RxCreditVL0_0_RxMaxCreditVL_RMASK 0xFFF
+
+#define QIB_7322_SendDmaBufUsed0_0_OFFS 0x1480
+#define QIB_7322_SendDmaBufUsed0_0_DEF 0x0000000000000000
+#define QIB_7322_SendDmaBufUsed0_0_BufUsed_63_0_LSB 0x0
+#define QIB_7322_SendDmaBufUsed0_0_BufUsed_63_0_MSB 0x3F
+#define QIB_7322_SendDmaBufUsed0_0_BufUsed_63_0_RMASK 0x0
+
+#define QIB_7322_SendCheckControl_0_OFFS 0x14A8
+#define QIB_7322_SendCheckControl_0_DEF 0x0000000000000000
+#define QIB_7322_SendCheckControl_0_PKey_En_LSB 0x4
+#define QIB_7322_SendCheckControl_0_PKey_En_MSB 0x4
+#define QIB_7322_SendCheckControl_0_PKey_En_RMASK 0x1
+#define QIB_7322_SendCheckControl_0_BTHQP_En_LSB 0x3
+#define QIB_7322_SendCheckControl_0_BTHQP_En_MSB 0x3
+#define QIB_7322_SendCheckControl_0_BTHQP_En_RMASK 0x1
+#define QIB_7322_SendCheckControl_0_SLID_En_LSB 0x2
+#define QIB_7322_SendCheckControl_0_SLID_En_MSB 0x2
+#define QIB_7322_SendCheckControl_0_SLID_En_RMASK 0x1
+#define QIB_7322_SendCheckControl_0_RawIPV6_En_LSB 0x1
+#define QIB_7322_SendCheckControl_0_RawIPV6_En_MSB 0x1
+#define QIB_7322_SendCheckControl_0_RawIPV6_En_RMASK 0x1
+#define QIB_7322_SendCheckControl_0_PacketTooSmall_En_LSB 0x0
+#define QIB_7322_SendCheckControl_0_PacketTooSmall_En_MSB 0x0
+#define QIB_7322_SendCheckControl_0_PacketTooSmall_En_RMASK 0x1
+
+#define QIB_7322_SendIBSLIDMask_0_OFFS 0x14B0
+#define QIB_7322_SendIBSLIDMask_0_DEF 0x0000000000000000
+#define QIB_7322_SendIBSLIDMask_0_SendIBSLIDMask_15_0_LSB 0x0
+#define QIB_7322_SendIBSLIDMask_0_SendIBSLIDMask_15_0_MSB 0xF
+#define QIB_7322_SendIBSLIDMask_0_SendIBSLIDMask_15_0_RMASK 0xFFFF
+
+#define QIB_7322_SendIBSLIDAssign_0_OFFS 0x14B8
+#define QIB_7322_SendIBSLIDAssign_0_DEF 0x0000000000000000
+#define QIB_7322_SendIBSLIDAssign_0_SendIBSLIDAssign_15_0_LSB 0x0
+#define QIB_7322_SendIBSLIDAssign_0_SendIBSLIDAssign_15_0_MSB 0xF
+#define QIB_7322_SendIBSLIDAssign_0_SendIBSLIDAssign_15_0_RMASK 0xFFFF
+
+#define QIB_7322_IBCStatusA_0_OFFS 0x1540
+#define QIB_7322_IBCStatusA_0_DEF 0x0000000000000X02
+#define QIB_7322_IBCStatusA_0_TxCreditOk_VL7_LSB 0x27
+#define QIB_7322_IBCStatusA_0_TxCreditOk_VL7_MSB 0x27
+#define QIB_7322_IBCStatusA_0_TxCreditOk_VL7_RMASK 0x1
+#define QIB_7322_IBCStatusA_0_TxCreditOk_VL6_LSB 0x26
+#define QIB_7322_IBCStatusA_0_TxCreditOk_VL6_MSB 0x26
+#define QIB_7322_IBCStatusA_0_TxCreditOk_VL6_RMASK 0x1
+#define QIB_7322_IBCStatusA_0_TxCreditOk_VL5_LSB 0x25
+#define QIB_7322_IBCStatusA_0_TxCreditOk_VL5_MSB 0x25
+#define QIB_7322_IBCStatusA_0_TxCreditOk_VL5_RMASK 0x1
+#define QIB_7322_IBCStatusA_0_TxCreditOk_VL4_LSB 0x24
+#define QIB_7322_IBCStatusA_0_TxCreditOk_VL4_MSB 0x24
+#define QIB_7322_IBCStatusA_0_TxCreditOk_VL4_RMASK 0x1
+#define QIB_7322_IBCStatusA_0_TxCreditOk_VL3_LSB 0x23
+#define QIB_7322_IBCStatusA_0_TxCreditOk_VL3_MSB 0x23
+#define QIB_7322_IBCStatusA_0_TxCreditOk_VL3_RMASK 0x1
+#define QIB_7322_IBCStatusA_0_TxCreditOk_VL2_LSB 0x22
+#define QIB_7322_IBCStatusA_0_TxCreditOk_VL2_MSB 0x22
+#define QIB_7322_IBCStatusA_0_TxCreditOk_VL2_RMASK 0x1
+#define QIB_7322_IBCStatusA_0_TxCreditOk_VL1_LSB 0x21
+#define QIB_7322_IBCStatusA_0_TxCreditOk_VL1_MSB 0x21
+#define QIB_7322_IBCStatusA_0_TxCreditOk_VL1_RMASK 0x1
+#define QIB_7322_IBCStatusA_0_TxCreditOk_VL0_LSB 0x20
+#define QIB_7322_IBCStatusA_0_TxCreditOk_VL0_MSB 0x20
+#define QIB_7322_IBCStatusA_0_TxCreditOk_VL0_RMASK 0x1
+#define QIB_7322_IBCStatusA_0_TxReady_LSB 0x1E
+#define QIB_7322_IBCStatusA_0_TxReady_MSB 0x1E
+#define QIB_7322_IBCStatusA_0_TxReady_RMASK 0x1
+#define QIB_7322_IBCStatusA_0_LinkSpeedQDR_LSB 0x1D
+#define QIB_7322_IBCStatusA_0_LinkSpeedQDR_MSB 0x1D
+#define QIB_7322_IBCStatusA_0_LinkSpeedQDR_RMASK 0x1
+#define QIB_7322_IBCStatusA_0_ScrambleCapRemote_LSB 0xF
+#define QIB_7322_IBCStatusA_0_ScrambleCapRemote_MSB 0xF
+#define QIB_7322_IBCStatusA_0_ScrambleCapRemote_RMASK 0x1
+#define QIB_7322_IBCStatusA_0_ScrambleEn_LSB 0xE
+#define QIB_7322_IBCStatusA_0_ScrambleEn_MSB 0xE
+#define QIB_7322_IBCStatusA_0_ScrambleEn_RMASK 0x1
+#define QIB_7322_IBCStatusA_0_IBTxLaneReversed_LSB 0xD
+#define QIB_7322_IBCStatusA_0_IBTxLaneReversed_MSB 0xD
+#define QIB_7322_IBCStatusA_0_IBTxLaneReversed_RMASK 0x1
+#define QIB_7322_IBCStatusA_0_IBRxLaneReversed_LSB 0xC
+#define QIB_7322_IBCStatusA_0_IBRxLaneReversed_MSB 0xC
+#define QIB_7322_IBCStatusA_0_IBRxLaneReversed_RMASK 0x1
+#define QIB_7322_IBCStatusA_0_DDS_RXEQ_FAIL_LSB 0xA
+#define QIB_7322_IBCStatusA_0_DDS_RXEQ_FAIL_MSB 0xA
+#define QIB_7322_IBCStatusA_0_DDS_RXEQ_FAIL_RMASK 0x1
+#define QIB_7322_IBCStatusA_0_LinkWidthActive_LSB 0x9
+#define QIB_7322_IBCStatusA_0_LinkWidthActive_MSB 0x9
+#define QIB_7322_IBCStatusA_0_LinkWidthActive_RMASK 0x1
+#define QIB_7322_IBCStatusA_0_LinkSpeedActive_LSB 0x8
+#define QIB_7322_IBCStatusA_0_LinkSpeedActive_MSB 0x8
+#define QIB_7322_IBCStatusA_0_LinkSpeedActive_RMASK 0x1
+#define QIB_7322_IBCStatusA_0_LinkState_LSB 0x5
+#define QIB_7322_IBCStatusA_0_LinkState_MSB 0x7
+#define QIB_7322_IBCStatusA_0_LinkState_RMASK 0x7
+#define QIB_7322_IBCStatusA_0_LinkTrainingState_LSB 0x0
+#define QIB_7322_IBCStatusA_0_LinkTrainingState_MSB 0x4
+#define QIB_7322_IBCStatusA_0_LinkTrainingState_RMASK 0x1F
+
+#define QIB_7322_IBCStatusB_0_OFFS 0x1548
+#define QIB_7322_IBCStatusB_0_DEF 0x00000000XXXXXXXX
+#define QIB_7322_IBCStatusB_0_ibsd_adaptation_timer_debug_LSB 0x27
+#define QIB_7322_IBCStatusB_0_ibsd_adaptation_timer_debug_MSB 0x27
+#define QIB_7322_IBCStatusB_0_ibsd_adaptation_timer_debug_RMASK 0x1
+#define QIB_7322_IBCStatusB_0_ibsd_adaptation_timer_reached_threshold_LSB 0x26
+#define QIB_7322_IBCStatusB_0_ibsd_adaptation_timer_reached_threshold_MSB 0x26
+#define QIB_7322_IBCStatusB_0_ibsd_adaptation_timer_reached_threshold_RMASK 0x1
+#define QIB_7322_IBCStatusB_0_ibsd_adaptation_timer_started_LSB 0x25
+#define QIB_7322_IBCStatusB_0_ibsd_adaptation_timer_started_MSB 0x25
+#define QIB_7322_IBCStatusB_0_ibsd_adaptation_timer_started_RMASK 0x1
+#define QIB_7322_IBCStatusB_0_heartbeat_timed_out_LSB 0x24
+#define QIB_7322_IBCStatusB_0_heartbeat_timed_out_MSB 0x24
+#define QIB_7322_IBCStatusB_0_heartbeat_timed_out_RMASK 0x1
+#define QIB_7322_IBCStatusB_0_heartbeat_crosstalk_LSB 0x20
+#define QIB_7322_IBCStatusB_0_heartbeat_crosstalk_MSB 0x23
+#define QIB_7322_IBCStatusB_0_heartbeat_crosstalk_RMASK 0xF
+#define QIB_7322_IBCStatusB_0_RxEqLocalDevice_LSB 0x1E
+#define QIB_7322_IBCStatusB_0_RxEqLocalDevice_MSB 0x1F
+#define QIB_7322_IBCStatusB_0_RxEqLocalDevice_RMASK 0x3
+#define QIB_7322_IBCStatusB_0_ReqDDSLocalFromRmt_LSB 0x1A
+#define QIB_7322_IBCStatusB_0_ReqDDSLocalFromRmt_MSB 0x1D
+#define QIB_7322_IBCStatusB_0_ReqDDSLocalFromRmt_RMASK 0xF
+#define QIB_7322_IBCStatusB_0_LinkRoundTripLatency_LSB 0x0
+#define QIB_7322_IBCStatusB_0_LinkRoundTripLatency_MSB 0x19
+#define QIB_7322_IBCStatusB_0_LinkRoundTripLatency_RMASK 0x3FFFFFF
+
+#define QIB_7322_IBCCtrlA_0_OFFS 0x1560
+#define QIB_7322_IBCCtrlA_0_DEF 0x0000000000000000
+#define QIB_7322_IBCCtrlA_0_Loopback_LSB 0x3F
+#define QIB_7322_IBCCtrlA_0_Loopback_MSB 0x3F
+#define QIB_7322_IBCCtrlA_0_Loopback_RMASK 0x1
+#define QIB_7322_IBCCtrlA_0_LinkDownDefaultState_LSB 0x3E
+#define QIB_7322_IBCCtrlA_0_LinkDownDefaultState_MSB 0x3E
+#define QIB_7322_IBCCtrlA_0_LinkDownDefaultState_RMASK 0x1
+#define QIB_7322_IBCCtrlA_0_IBLinkEn_LSB 0x3D
+#define QIB_7322_IBCCtrlA_0_IBLinkEn_MSB 0x3D
+#define QIB_7322_IBCCtrlA_0_IBLinkEn_RMASK 0x1
+#define QIB_7322_IBCCtrlA_0_IBStatIntReductionEn_LSB 0x3C
+#define QIB_7322_IBCCtrlA_0_IBStatIntReductionEn_MSB 0x3C
+#define QIB_7322_IBCCtrlA_0_IBStatIntReductionEn_RMASK 0x1
+#define QIB_7322_IBCCtrlA_0_NumVLane_LSB 0x30
+#define QIB_7322_IBCCtrlA_0_NumVLane_MSB 0x32
+#define QIB_7322_IBCCtrlA_0_NumVLane_RMASK 0x7
+#define QIB_7322_IBCCtrlA_0_OverrunThreshold_LSB 0x24
+#define QIB_7322_IBCCtrlA_0_OverrunThreshold_MSB 0x27
+#define QIB_7322_IBCCtrlA_0_OverrunThreshold_RMASK 0xF
+#define QIB_7322_IBCCtrlA_0_PhyerrThreshold_LSB 0x20
+#define QIB_7322_IBCCtrlA_0_PhyerrThreshold_MSB 0x23
+#define QIB_7322_IBCCtrlA_0_PhyerrThreshold_RMASK 0xF
+#define QIB_7322_IBCCtrlA_0_MaxPktLen_LSB 0x15
+#define QIB_7322_IBCCtrlA_0_MaxPktLen_MSB 0x1F
+#define QIB_7322_IBCCtrlA_0_MaxPktLen_RMASK 0x7FF
+#define QIB_7322_IBCCtrlA_0_LinkCmd_LSB 0x13
+#define QIB_7322_IBCCtrlA_0_LinkCmd_MSB 0x14
+#define QIB_7322_IBCCtrlA_0_LinkCmd_RMASK 0x3
+#define QIB_7322_IBCCtrlA_0_LinkInitCmd_LSB 0x10
+#define QIB_7322_IBCCtrlA_0_LinkInitCmd_MSB 0x12
+#define QIB_7322_IBCCtrlA_0_LinkInitCmd_RMASK 0x7
+#define QIB_7322_IBCCtrlA_0_FlowCtrlWaterMark_LSB 0x8
+#define QIB_7322_IBCCtrlA_0_FlowCtrlWaterMark_MSB 0xF
+#define QIB_7322_IBCCtrlA_0_FlowCtrlWaterMark_RMASK 0xFF
+#define QIB_7322_IBCCtrlA_0_FlowCtrlPeriod_LSB 0x0
+#define QIB_7322_IBCCtrlA_0_FlowCtrlPeriod_MSB 0x7
+#define QIB_7322_IBCCtrlA_0_FlowCtrlPeriod_RMASK 0xFF
+
+#define QIB_7322_IBCCtrlB_0_OFFS 0x1568
+#define QIB_7322_IBCCtrlB_0_DEF 0x00000000000305FF
+#define QIB_7322_IBCCtrlB_0_IB_DLID_MASK_LSB 0x30
+#define QIB_7322_IBCCtrlB_0_IB_DLID_MASK_MSB 0x3F
+#define QIB_7322_IBCCtrlB_0_IB_DLID_MASK_RMASK 0xFFFF
+#define QIB_7322_IBCCtrlB_0_IB_DLID_LSB 0x20
+#define QIB_7322_IBCCtrlB_0_IB_DLID_MSB 0x2F
+#define QIB_7322_IBCCtrlB_0_IB_DLID_RMASK 0xFFFF
+#define QIB_7322_IBCCtrlB_0_IB_ENABLE_FILT_DPKT_LSB 0x1B
+#define QIB_7322_IBCCtrlB_0_IB_ENABLE_FILT_DPKT_MSB 0x1B
+#define QIB_7322_IBCCtrlB_0_IB_ENABLE_FILT_DPKT_RMASK 0x1
+#define QIB_7322_IBCCtrlB_0_HRTBT_REQ_LSB 0x1A
+#define QIB_7322_IBCCtrlB_0_HRTBT_REQ_MSB 0x1A
+#define QIB_7322_IBCCtrlB_0_HRTBT_REQ_RMASK 0x1
+#define QIB_7322_IBCCtrlB_0_HRTBT_PORT_LSB 0x12
+#define QIB_7322_IBCCtrlB_0_HRTBT_PORT_MSB 0x19
+#define QIB_7322_IBCCtrlB_0_HRTBT_PORT_RMASK 0xFF
+#define QIB_7322_IBCCtrlB_0_HRTBT_AUTO_LSB 0x11
+#define QIB_7322_IBCCtrlB_0_HRTBT_AUTO_MSB 0x11
+#define QIB_7322_IBCCtrlB_0_HRTBT_AUTO_RMASK 0x1
+#define QIB_7322_IBCCtrlB_0_HRTBT_ENB_LSB 0x10
+#define QIB_7322_IBCCtrlB_0_HRTBT_ENB_MSB 0x10
+#define QIB_7322_IBCCtrlB_0_HRTBT_ENB_RMASK 0x1
+#define QIB_7322_IBCCtrlB_0_SD_DDS_LSB 0xC
+#define QIB_7322_IBCCtrlB_0_SD_DDS_MSB 0xF
+#define QIB_7322_IBCCtrlB_0_SD_DDS_RMASK 0xF
+#define QIB_7322_IBCCtrlB_0_SD_DDSV_LSB 0xB
+#define QIB_7322_IBCCtrlB_0_SD_DDSV_MSB 0xB
+#define QIB_7322_IBCCtrlB_0_SD_DDSV_RMASK 0x1
+#define QIB_7322_IBCCtrlB_0_SD_ADD_ENB_LSB 0xA
+#define QIB_7322_IBCCtrlB_0_SD_ADD_ENB_MSB 0xA
+#define QIB_7322_IBCCtrlB_0_SD_ADD_ENB_RMASK 0x1
+#define QIB_7322_IBCCtrlB_0_SD_RX_EQUAL_ENABLE_LSB 0x9
+#define QIB_7322_IBCCtrlB_0_SD_RX_EQUAL_ENABLE_MSB 0x9
+#define QIB_7322_IBCCtrlB_0_SD_RX_EQUAL_ENABLE_RMASK 0x1
+#define QIB_7322_IBCCtrlB_0_IB_LANE_REV_SUPPORTED_LSB 0x8
+#define QIB_7322_IBCCtrlB_0_IB_LANE_REV_SUPPORTED_MSB 0x8
+#define QIB_7322_IBCCtrlB_0_IB_LANE_REV_SUPPORTED_RMASK 0x1
+#define QIB_7322_IBCCtrlB_0_IB_POLARITY_REV_SUPP_LSB 0x7
+#define QIB_7322_IBCCtrlB_0_IB_POLARITY_REV_SUPP_MSB 0x7
+#define QIB_7322_IBCCtrlB_0_IB_POLARITY_REV_SUPP_RMASK 0x1
+#define QIB_7322_IBCCtrlB_0_IB_NUM_CHANNELS_LSB 0x5
+#define QIB_7322_IBCCtrlB_0_IB_NUM_CHANNELS_MSB 0x6
+#define QIB_7322_IBCCtrlB_0_IB_NUM_CHANNELS_RMASK 0x3
+#define QIB_7322_IBCCtrlB_0_SD_SPEED_QDR_LSB 0x4
+#define QIB_7322_IBCCtrlB_0_SD_SPEED_QDR_MSB 0x4
+#define QIB_7322_IBCCtrlB_0_SD_SPEED_QDR_RMASK 0x1
+#define QIB_7322_IBCCtrlB_0_SD_SPEED_DDR_LSB 0x3
+#define QIB_7322_IBCCtrlB_0_SD_SPEED_DDR_MSB 0x3
+#define QIB_7322_IBCCtrlB_0_SD_SPEED_DDR_RMASK 0x1
+#define QIB_7322_IBCCtrlB_0_SD_SPEED_SDR_LSB 0x2
+#define QIB_7322_IBCCtrlB_0_SD_SPEED_SDR_MSB 0x2
+#define QIB_7322_IBCCtrlB_0_SD_SPEED_SDR_RMASK 0x1
+#define QIB_7322_IBCCtrlB_0_SD_SPEED_LSB 0x1
+#define QIB_7322_IBCCtrlB_0_SD_SPEED_MSB 0x1
+#define QIB_7322_IBCCtrlB_0_SD_SPEED_RMASK 0x1
+#define QIB_7322_IBCCtrlB_0_IB_ENHANCED_MODE_LSB 0x0
+#define QIB_7322_IBCCtrlB_0_IB_ENHANCED_MODE_MSB 0x0
+#define QIB_7322_IBCCtrlB_0_IB_ENHANCED_MODE_RMASK 0x1
+
+#define QIB_7322_IBCCtrlC_0_OFFS 0x1570
+#define QIB_7322_IBCCtrlC_0_DEF 0x0000000000000301
+#define QIB_7322_IBCCtrlC_0_IB_BACK_PORCH_LSB 0x5
+#define QIB_7322_IBCCtrlC_0_IB_BACK_PORCH_MSB 0x9
+#define QIB_7322_IBCCtrlC_0_IB_BACK_PORCH_RMASK 0x1F
+#define QIB_7322_IBCCtrlC_0_IB_FRONT_PORCH_LSB 0x0
+#define QIB_7322_IBCCtrlC_0_IB_FRONT_PORCH_MSB 0x4
+#define QIB_7322_IBCCtrlC_0_IB_FRONT_PORCH_RMASK 0x1F
+
+#define QIB_7322_HRTBT_GUID_0_OFFS 0x1588
+#define QIB_7322_HRTBT_GUID_0_DEF 0x0000000000000000
+
+#define QIB_7322_IB_SDTEST_IF_TX_0_OFFS 0x1590
+#define QIB_7322_IB_SDTEST_IF_TX_0_DEF 0x0000000000000000
+#define QIB_7322_IB_SDTEST_IF_TX_0_TS_TX_RX_CFG_LSB 0x30
+#define QIB_7322_IB_SDTEST_IF_TX_0_TS_TX_RX_CFG_MSB 0x3F
+#define QIB_7322_IB_SDTEST_IF_TX_0_TS_TX_RX_CFG_RMASK 0xFFFF
+#define QIB_7322_IB_SDTEST_IF_TX_0_TS_TX_TX_CFG_LSB 0x20
+#define QIB_7322_IB_SDTEST_IF_TX_0_TS_TX_TX_CFG_MSB 0x2F
+#define QIB_7322_IB_SDTEST_IF_TX_0_TS_TX_TX_CFG_RMASK 0xFFFF
+#define QIB_7322_IB_SDTEST_IF_TX_0_TS_TX_SPEED_LSB 0xD
+#define QIB_7322_IB_SDTEST_IF_TX_0_TS_TX_SPEED_MSB 0xF
+#define QIB_7322_IB_SDTEST_IF_TX_0_TS_TX_SPEED_RMASK 0x7
+#define QIB_7322_IB_SDTEST_IF_TX_0_TS_TX_OPCODE_LSB 0xB
+#define QIB_7322_IB_SDTEST_IF_TX_0_TS_TX_OPCODE_MSB 0xC
+#define QIB_7322_IB_SDTEST_IF_TX_0_TS_TX_OPCODE_RMASK 0x3
+#define QIB_7322_IB_SDTEST_IF_TX_0_CREDIT_CHANGE_LSB 0x4
+#define QIB_7322_IB_SDTEST_IF_TX_0_CREDIT_CHANGE_MSB 0x4
+#define QIB_7322_IB_SDTEST_IF_TX_0_CREDIT_CHANGE_RMASK 0x1
+#define QIB_7322_IB_SDTEST_IF_TX_0_VL_CAP_LSB 0x2
+#define QIB_7322_IB_SDTEST_IF_TX_0_VL_CAP_MSB 0x3
+#define QIB_7322_IB_SDTEST_IF_TX_0_VL_CAP_RMASK 0x3
+#define QIB_7322_IB_SDTEST_IF_TX_0_TS_3_TX_VALID_LSB 0x1
+#define QIB_7322_IB_SDTEST_IF_TX_0_TS_3_TX_VALID_MSB 0x1
+#define QIB_7322_IB_SDTEST_IF_TX_0_TS_3_TX_VALID_RMASK 0x1
+#define QIB_7322_IB_SDTEST_IF_TX_0_TS_T_TX_VALID_LSB 0x0
+#define QIB_7322_IB_SDTEST_IF_TX_0_TS_T_TX_VALID_MSB 0x0
+#define QIB_7322_IB_SDTEST_IF_TX_0_TS_T_TX_VALID_RMASK 0x1
+
+#define QIB_7322_IB_SDTEST_IF_RX_0_OFFS 0x1598
+#define QIB_7322_IB_SDTEST_IF_RX_0_DEF 0x0000000000000000
+#define QIB_7322_IB_SDTEST_IF_RX_0_TS_RX_RX_CFG_LSB 0x30
+#define QIB_7322_IB_SDTEST_IF_RX_0_TS_RX_RX_CFG_MSB 0x3F
+#define QIB_7322_IB_SDTEST_IF_RX_0_TS_RX_RX_CFG_RMASK 0xFFFF
+#define QIB_7322_IB_SDTEST_IF_RX_0_TS_RX_TX_CFG_LSB 0x20
+#define QIB_7322_IB_SDTEST_IF_RX_0_TS_RX_TX_CFG_MSB 0x2F
+#define QIB_7322_IB_SDTEST_IF_RX_0_TS_RX_TX_CFG_RMASK 0xFFFF
+#define QIB_7322_IB_SDTEST_IF_RX_0_TS_RX_B_LSB 0x18
+#define QIB_7322_IB_SDTEST_IF_RX_0_TS_RX_B_MSB 0x1F
+#define QIB_7322_IB_SDTEST_IF_RX_0_TS_RX_B_RMASK 0xFF
+#define QIB_7322_IB_SDTEST_IF_RX_0_TS_RX_A_LSB 0x10
+#define QIB_7322_IB_SDTEST_IF_RX_0_TS_RX_A_MSB 0x17
+#define QIB_7322_IB_SDTEST_IF_RX_0_TS_RX_A_RMASK 0xFF
+#define QIB_7322_IB_SDTEST_IF_RX_0_TS_3_RX_VALID_LSB 0x1
+#define QIB_7322_IB_SDTEST_IF_RX_0_TS_3_RX_VALID_MSB 0x1
+#define QIB_7322_IB_SDTEST_IF_RX_0_TS_3_RX_VALID_RMASK 0x1
+#define QIB_7322_IB_SDTEST_IF_RX_0_TS_T_RX_VALID_LSB 0x0
+#define QIB_7322_IB_SDTEST_IF_RX_0_TS_T_RX_VALID_MSB 0x0
+#define QIB_7322_IB_SDTEST_IF_RX_0_TS_T_RX_VALID_RMASK 0x1
+
+#define QIB_7322_IBNCModeCtrl_0_OFFS 0x15B8
+#define QIB_7322_IBNCModeCtrl_0_DEF 0x0000000000000000
+#define QIB_7322_IBNCModeCtrl_0_ScrambleCapRemoteForce_LSB 0x22
+#define QIB_7322_IBNCModeCtrl_0_ScrambleCapRemoteForce_MSB 0x22
+#define QIB_7322_IBNCModeCtrl_0_ScrambleCapRemoteForce_RMASK 0x1
+#define QIB_7322_IBNCModeCtrl_0_ScrambleCapRemoteMask_LSB 0x21
+#define QIB_7322_IBNCModeCtrl_0_ScrambleCapRemoteMask_MSB 0x21
+#define QIB_7322_IBNCModeCtrl_0_ScrambleCapRemoteMask_RMASK 0x1
+#define QIB_7322_IBNCModeCtrl_0_ScrambleCapLocal_LSB 0x20
+#define QIB_7322_IBNCModeCtrl_0_ScrambleCapLocal_MSB 0x20
+#define QIB_7322_IBNCModeCtrl_0_ScrambleCapLocal_RMASK 0x1
+#define QIB_7322_IBNCModeCtrl_0_TSMCode_TS2_LSB 0x11
+#define QIB_7322_IBNCModeCtrl_0_TSMCode_TS2_MSB 0x19
+#define QIB_7322_IBNCModeCtrl_0_TSMCode_TS2_RMASK 0x1FF
+#define QIB_7322_IBNCModeCtrl_0_TSMCode_TS1_LSB 0x8
+#define QIB_7322_IBNCModeCtrl_0_TSMCode_TS1_MSB 0x10
+#define QIB_7322_IBNCModeCtrl_0_TSMCode_TS1_RMASK 0x1FF
+#define QIB_7322_IBNCModeCtrl_0_TSMEnable_ignore_TSM_on_rx_LSB 0x2
+#define QIB_7322_IBNCModeCtrl_0_TSMEnable_ignore_TSM_on_rx_MSB 0x2
+#define QIB_7322_IBNCModeCtrl_0_TSMEnable_ignore_TSM_on_rx_RMASK 0x1
+#define QIB_7322_IBNCModeCtrl_0_TSMEnable_send_TS2_LSB 0x1
+#define QIB_7322_IBNCModeCtrl_0_TSMEnable_send_TS2_MSB 0x1
+#define QIB_7322_IBNCModeCtrl_0_TSMEnable_send_TS2_RMASK 0x1
+#define QIB_7322_IBNCModeCtrl_0_TSMEnable_send_TS1_LSB 0x0
+#define QIB_7322_IBNCModeCtrl_0_TSMEnable_send_TS1_MSB 0x0
+#define QIB_7322_IBNCModeCtrl_0_TSMEnable_send_TS1_RMASK 0x1
+
+#define QIB_7322_IBSerdesStatus_0_OFFS 0x15D0
+#define QIB_7322_IBSerdesStatus_0_DEF 0x0000000000000000
+
+#define QIB_7322_IBPCSConfig_0_OFFS 0x15D8
+#define QIB_7322_IBPCSConfig_0_DEF 0x0000000000000007
+#define QIB_7322_IBPCSConfig_0_link_sync_mask_LSB 0x9
+#define QIB_7322_IBPCSConfig_0_link_sync_mask_MSB 0x12
+#define QIB_7322_IBPCSConfig_0_link_sync_mask_RMASK 0x3FF
+#define QIB_7322_IBPCSConfig_0_xcv_rreset_LSB 0x2
+#define QIB_7322_IBPCSConfig_0_xcv_rreset_MSB 0x2
+#define QIB_7322_IBPCSConfig_0_xcv_rreset_RMASK 0x1
+#define QIB_7322_IBPCSConfig_0_xcv_treset_LSB 0x1
+#define QIB_7322_IBPCSConfig_0_xcv_treset_MSB 0x1
+#define QIB_7322_IBPCSConfig_0_xcv_treset_RMASK 0x1
+#define QIB_7322_IBPCSConfig_0_tx_rx_reset_LSB 0x0
+#define QIB_7322_IBPCSConfig_0_tx_rx_reset_MSB 0x0
+#define QIB_7322_IBPCSConfig_0_tx_rx_reset_RMASK 0x1
+
+#define QIB_7322_IBSerdesCtrl_0_OFFS 0x15E0
+#define QIB_7322_IBSerdesCtrl_0_DEF 0x0000000000FFA00F
+#define QIB_7322_IBSerdesCtrl_0_DISABLE_RXLATOFF_QDR_LSB 0x1A
+#define QIB_7322_IBSerdesCtrl_0_DISABLE_RXLATOFF_QDR_MSB 0x1A
+#define QIB_7322_IBSerdesCtrl_0_DISABLE_RXLATOFF_QDR_RMASK 0x1
+#define QIB_7322_IBSerdesCtrl_0_DISABLE_RXLATOFF_DDR_LSB 0x19
+#define QIB_7322_IBSerdesCtrl_0_DISABLE_RXLATOFF_DDR_MSB 0x19
+#define QIB_7322_IBSerdesCtrl_0_DISABLE_RXLATOFF_DDR_RMASK 0x1
+#define QIB_7322_IBSerdesCtrl_0_DISABLE_RXLATOFF_SDR_LSB 0x18
+#define QIB_7322_IBSerdesCtrl_0_DISABLE_RXLATOFF_SDR_MSB 0x18
+#define QIB_7322_IBSerdesCtrl_0_DISABLE_RXLATOFF_SDR_RMASK 0x1
+#define QIB_7322_IBSerdesCtrl_0_CHANNEL_RESET_N_LSB 0x14
+#define QIB_7322_IBSerdesCtrl_0_CHANNEL_RESET_N_MSB 0x17
+#define QIB_7322_IBSerdesCtrl_0_CHANNEL_RESET_N_RMASK 0xF
+#define QIB_7322_IBSerdesCtrl_0_CGMODE_LSB 0x10
+#define QIB_7322_IBSerdesCtrl_0_CGMODE_MSB 0x13
+#define QIB_7322_IBSerdesCtrl_0_CGMODE_RMASK 0xF
+#define QIB_7322_IBSerdesCtrl_0_IB_LAT_MODE_LSB 0xF
+#define QIB_7322_IBSerdesCtrl_0_IB_LAT_MODE_MSB 0xF
+#define QIB_7322_IBSerdesCtrl_0_IB_LAT_MODE_RMASK 0x1
+#define QIB_7322_IBSerdesCtrl_0_RXLOSEN_LSB 0xD
+#define QIB_7322_IBSerdesCtrl_0_RXLOSEN_MSB 0xD
+#define QIB_7322_IBSerdesCtrl_0_RXLOSEN_RMASK 0x1
+#define QIB_7322_IBSerdesCtrl_0_LPEN_LSB 0xC
+#define QIB_7322_IBSerdesCtrl_0_LPEN_MSB 0xC
+#define QIB_7322_IBSerdesCtrl_0_LPEN_RMASK 0x1
+#define QIB_7322_IBSerdesCtrl_0_PLLPD_LSB 0xB
+#define QIB_7322_IBSerdesCtrl_0_PLLPD_MSB 0xB
+#define QIB_7322_IBSerdesCtrl_0_PLLPD_RMASK 0x1
+#define QIB_7322_IBSerdesCtrl_0_TXPD_LSB 0xA
+#define QIB_7322_IBSerdesCtrl_0_TXPD_MSB 0xA
+#define QIB_7322_IBSerdesCtrl_0_TXPD_RMASK 0x1
+#define QIB_7322_IBSerdesCtrl_0_RXPD_LSB 0x9
+#define QIB_7322_IBSerdesCtrl_0_RXPD_MSB 0x9
+#define QIB_7322_IBSerdesCtrl_0_RXPD_RMASK 0x1
+#define QIB_7322_IBSerdesCtrl_0_TXIDLE_LSB 0x8
+#define QIB_7322_IBSerdesCtrl_0_TXIDLE_MSB 0x8
+#define QIB_7322_IBSerdesCtrl_0_TXIDLE_RMASK 0x1
+#define QIB_7322_IBSerdesCtrl_0_CMODE_LSB 0x0
+#define QIB_7322_IBSerdesCtrl_0_CMODE_MSB 0x6
+#define QIB_7322_IBSerdesCtrl_0_CMODE_RMASK 0x7F
+
+#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_OFFS 0x1600
+#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_DEF 0x0000000000000000
+#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_tx_override_deemphasis_select_LSB 0x1F
+#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_tx_override_deemphasis_select_MSB 0x1F
+#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_tx_override_deemphasis_select_RMASK 0x1
+#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_reset_tx_deemphasis_override_LSB 0x1E
+#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_reset_tx_deemphasis_override_MSB 0x1E
+#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_reset_tx_deemphasis_override_RMASK 0x1
+#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txampcntl_d2a_LSB 0xE
+#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txampcntl_d2a_MSB 0x11
+#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txampcntl_d2a_RMASK 0xF
+#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txc0_ena_LSB 0x9
+#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txc0_ena_MSB 0xD
+#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txc0_ena_RMASK 0x1F
+#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txcp1_ena_LSB 0x5
+#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txcp1_ena_MSB 0x8
+#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txcp1_ena_RMASK 0xF
+#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txcn1_xtra_emph0_LSB 0x3
+#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txcn1_xtra_emph0_MSB 0x4
+#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txcn1_xtra_emph0_RMASK 0x3
+#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txcn1_ena_LSB 0x0
+#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txcn1_ena_MSB 0x2
+#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txcn1_ena_RMASK 0x7
+
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_OFFS 0x1640
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_DEF 0x0000000000000000
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenagain_sdr_ch3_LSB 0x27
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenagain_sdr_ch3_MSB 0x27
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenagain_sdr_ch3_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenagain_sdr_ch2_LSB 0x26
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenagain_sdr_ch2_MSB 0x26
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenagain_sdr_ch2_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenagain_sdr_ch1_LSB 0x25
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenagain_sdr_ch1_MSB 0x25
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenagain_sdr_ch1_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenagain_sdr_ch0_LSB 0x24
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenagain_sdr_ch0_MSB 0x24
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenagain_sdr_ch0_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenale_sdr_ch3_LSB 0x23
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenale_sdr_ch3_MSB 0x23
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenale_sdr_ch3_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenale_sdr_ch2_LSB 0x22
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenale_sdr_ch2_MSB 0x22
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenale_sdr_ch2_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenale_sdr_ch1_LSB 0x21
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenale_sdr_ch1_MSB 0x21
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenale_sdr_ch1_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenale_sdr_ch0_LSB 0x20
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenale_sdr_ch0_MSB 0x20
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenale_sdr_ch0_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenadfe_sdr_ch3_LSB 0x18
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenadfe_sdr_ch3_MSB 0x1F
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenadfe_sdr_ch3_RMASK 0xFF
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenadfe_sdr_ch2_LSB 0x10
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenadfe_sdr_ch2_MSB 0x17
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenadfe_sdr_ch2_RMASK 0xFF
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenadfe_sdr_ch1_LSB 0x8
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenadfe_sdr_ch1_MSB 0xF
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenadfe_sdr_ch1_RMASK 0xFF
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenadfe_sdr_ch0_LSB 0x0
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenadfe_sdr_ch0_MSB 0x7
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenadfe_sdr_ch0_RMASK 0xFF
+
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_OFFS 0x1648
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_DEF 0x0000000000000000
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenagain_sdr_ch3_LSB 0x27
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenagain_sdr_ch3_MSB 0x27
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenagain_sdr_ch3_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenagain_sdr_ch2_LSB 0x26
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenagain_sdr_ch2_MSB 0x26
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenagain_sdr_ch2_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenagain_sdr_ch1_LSB 0x25
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenagain_sdr_ch1_MSB 0x25
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenagain_sdr_ch1_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenagain_sdr_ch0_LSB 0x24
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenagain_sdr_ch0_MSB 0x24
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenagain_sdr_ch0_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenale_sdr_ch3_LSB 0x23
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenale_sdr_ch3_MSB 0x23
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenale_sdr_ch3_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenale_sdr_ch2_LSB 0x22
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenale_sdr_ch2_MSB 0x22
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenale_sdr_ch2_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenale_sdr_ch1_LSB 0x21
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenale_sdr_ch1_MSB 0x21
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenale_sdr_ch1_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenale_sdr_ch0_LSB 0x20
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenale_sdr_ch0_MSB 0x20
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenale_sdr_ch0_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenadfe_sdr_ch3_LSB 0x18
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenadfe_sdr_ch3_MSB 0x1F
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenadfe_sdr_ch3_RMASK 0xFF
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenadfe_sdr_ch2_LSB 0x10
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenadfe_sdr_ch2_MSB 0x17
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenadfe_sdr_ch2_RMASK 0xFF
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenadfe_sdr_ch1_LSB 0x8
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenadfe_sdr_ch1_MSB 0xF
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenadfe_sdr_ch1_RMASK 0xFF
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenadfe_sdr_ch0_LSB 0x0
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenadfe_sdr_ch0_MSB 0x7
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenadfe_sdr_ch0_RMASK 0xFF
+
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_OFFS 0x1650
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_DEF 0x0000000000000000
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenagain_ddr_ch3_LSB 0x27
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenagain_ddr_ch3_MSB 0x27
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenagain_ddr_ch3_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenagain_ddr_ch2_LSB 0x26
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenagain_ddr_ch2_MSB 0x26
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenagain_ddr_ch2_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenagain_ddr_ch1_LSB 0x25
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenagain_ddr_ch1_MSB 0x25
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenagain_ddr_ch1_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenagain_ddr_ch0_LSB 0x24
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenagain_ddr_ch0_MSB 0x24
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenagain_ddr_ch0_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenale_ddr_ch3_LSB 0x23
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenale_ddr_ch3_MSB 0x23
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenale_ddr_ch3_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenale_ddr_ch2_LSB 0x22
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenale_ddr_ch2_MSB 0x22
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenale_ddr_ch2_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenale_ddr_ch1_LSB 0x21
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenale_ddr_ch1_MSB 0x21
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenale_ddr_ch1_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenale_ddr_ch0_LSB 0x20
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenale_ddr_ch0_MSB 0x20
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenale_ddr_ch0_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenadfe_ddr_ch3_LSB 0x18
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenadfe_ddr_ch3_MSB 0x1F
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenadfe_ddr_ch3_RMASK 0xFF
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenadfe_ddr_ch2_LSB 0x10
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenadfe_ddr_ch2_MSB 0x17
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenadfe_ddr_ch2_RMASK 0xFF
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenadfe_ddr_ch1_LSB 0x8
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenadfe_ddr_ch1_MSB 0xF
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenadfe_ddr_ch1_RMASK 0xFF
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenadfe_ddr_ch0_LSB 0x0
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenadfe_ddr_ch0_MSB 0x7
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenadfe_ddr_ch0_RMASK 0xFF
+
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_OFFS 0x1658
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_DEF 0x0000000000000000
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenagain_ddr_ch3_LSB 0x27
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenagain_ddr_ch3_MSB 0x27
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenagain_ddr_ch3_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenagain_ddr_ch2_LSB 0x26
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenagain_ddr_ch2_MSB 0x26
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenagain_ddr_ch2_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenagain_ddr_ch1_LSB 0x25
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenagain_ddr_ch1_MSB 0x25
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenagain_ddr_ch1_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenagain_ddr_ch0_LSB 0x24
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenagain_ddr_ch0_MSB 0x24
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenagain_ddr_ch0_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenale_ddr_ch3_LSB 0x23
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenale_ddr_ch3_MSB 0x23
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenale_ddr_ch3_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenale_ddr_ch2_LSB 0x22
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenale_ddr_ch2_MSB 0x22
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenale_ddr_ch2_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenale_ddr_ch1_LSB 0x21
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenale_ddr_ch1_MSB 0x21
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenale_ddr_ch1_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenale_ddr_ch0_LSB 0x20
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenale_ddr_ch0_MSB 0x20
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenale_ddr_ch0_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenadfe_ddr_ch3_LSB 0x18
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenadfe_ddr_ch3_MSB 0x1F
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenadfe_ddr_ch3_RMASK 0xFF
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenadfe_ddr_ch2_LSB 0x10
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenadfe_ddr_ch2_MSB 0x17
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenadfe_ddr_ch2_RMASK 0xFF
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenadfe_ddr_ch1_LSB 0x8
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenadfe_ddr_ch1_MSB 0xF
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenadfe_ddr_ch1_RMASK 0xFF
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenadfe_ddr_ch0_LSB 0x0
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenadfe_ddr_ch0_MSB 0x7
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenadfe_ddr_ch0_RMASK 0xFF
+
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_OFFS 0x1660
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_DEF 0x0000000000000000
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenagain_qdr_ch3_LSB 0x27
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenagain_qdr_ch3_MSB 0x27
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenagain_qdr_ch3_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenagain_qdr_ch2_LSB 0x26
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenagain_qdr_ch2_MSB 0x26
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenagain_qdr_ch2_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenagain_qdr_ch1_LSB 0x25
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenagain_qdr_ch1_MSB 0x25
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenagain_qdr_ch1_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenagain_qdr_ch0_LSB 0x24
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenagain_qdr_ch0_MSB 0x24
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenagain_qdr_ch0_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenale_qdr_ch3_LSB 0x23
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenale_qdr_ch3_MSB 0x23
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenale_qdr_ch3_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenale_qdr_ch2_LSB 0x22
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenale_qdr_ch2_MSB 0x22
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenale_qdr_ch2_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenale_qdr_ch1_LSB 0x21
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenale_qdr_ch1_MSB 0x21
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenale_qdr_ch1_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenale_qdr_ch0_LSB 0x20
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenale_qdr_ch0_MSB 0x20
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenale_qdr_ch0_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenadfe_qdr_ch3_LSB 0x18
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenadfe_qdr_ch3_MSB 0x1F
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenadfe_qdr_ch3_RMASK 0xFF
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenadfe_qdr_ch2_LSB 0x10
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenadfe_qdr_ch2_MSB 0x17
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenadfe_qdr_ch2_RMASK 0xFF
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenadfe_qdr_ch1_LSB 0x8
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenadfe_qdr_ch1_MSB 0xF
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenadfe_qdr_ch1_RMASK 0xFF
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenadfe_qdr_ch0_LSB 0x0
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenadfe_qdr_ch0_MSB 0x7
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenadfe_qdr_ch0_RMASK 0xFF
+
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_OFFS 0x1668
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_DEF 0x0000000000000000
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenagain_qdr_ch3_LSB 0x27
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenagain_qdr_ch3_MSB 0x27
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenagain_qdr_ch3_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenagain_qdr_ch2_LSB 0x26
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenagain_qdr_ch2_MSB 0x26
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenagain_qdr_ch2_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenagain_qdr_ch1_LSB 0x25
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenagain_qdr_ch1_MSB 0x25
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenagain_qdr_ch1_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenagain_qdr_ch0_LSB 0x24
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenagain_qdr_ch0_MSB 0x24
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenagain_qdr_ch0_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenale_qdr_ch3_LSB 0x23
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenale_qdr_ch3_MSB 0x23
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenale_qdr_ch3_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenale_qdr_ch2_LSB 0x22
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenale_qdr_ch2_MSB 0x22
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenale_qdr_ch2_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenale_qdr_ch1_LSB 0x21
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenale_qdr_ch1_MSB 0x21
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenale_qdr_ch1_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenale_qdr_ch0_LSB 0x20
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenale_qdr_ch0_MSB 0x20
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenale_qdr_ch0_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenadfe_qdr_ch3_LSB 0x18
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenadfe_qdr_ch3_MSB 0x1F
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenadfe_qdr_ch3_RMASK 0xFF
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenadfe_qdr_ch2_LSB 0x10
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenadfe_qdr_ch2_MSB 0x17
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenadfe_qdr_ch2_RMASK 0xFF
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenadfe_qdr_ch1_LSB 0x8
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenadfe_qdr_ch1_MSB 0xF
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenadfe_qdr_ch1_RMASK 0xFF
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenadfe_qdr_ch0_LSB 0x0
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenadfe_qdr_ch0_MSB 0x7
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenadfe_qdr_ch0_RMASK 0xFF
+
+#define QIB_7322_ADAPT_DISABLE_TIMER_THRESHOLD_0_OFFS 0x1670
+#define QIB_7322_ADAPT_DISABLE_TIMER_THRESHOLD_0_DEF 0x0000000000000000
+
+#define QIB_7322_HighPriorityLimit_0_OFFS 0x1BC0
+#define QIB_7322_HighPriorityLimit_0_DEF 0x0000000000000000
+#define QIB_7322_HighPriorityLimit_0_Limit_LSB 0x0
+#define QIB_7322_HighPriorityLimit_0_Limit_MSB 0x7
+#define QIB_7322_HighPriorityLimit_0_Limit_RMASK 0xFF
+
+#define QIB_7322_LowPriority0_0_OFFS 0x1C00
+#define QIB_7322_LowPriority0_0_DEF 0x0000000000000000
+#define QIB_7322_LowPriority0_0_VirtualLane_LSB 0x10
+#define QIB_7322_LowPriority0_0_VirtualLane_MSB 0x12
+#define QIB_7322_LowPriority0_0_VirtualLane_RMASK 0x7
+#define QIB_7322_LowPriority0_0_Weight_LSB 0x0
+#define QIB_7322_LowPriority0_0_Weight_MSB 0x7
+#define QIB_7322_LowPriority0_0_Weight_RMASK 0xFF
+
+#define QIB_7322_HighPriority0_0_OFFS 0x1E00
+#define QIB_7322_HighPriority0_0_DEF 0x0000000000000000
+#define QIB_7322_HighPriority0_0_VirtualLane_LSB 0x10
+#define QIB_7322_HighPriority0_0_VirtualLane_MSB 0x12
+#define QIB_7322_HighPriority0_0_VirtualLane_RMASK 0x7
+#define QIB_7322_HighPriority0_0_Weight_LSB 0x0
+#define QIB_7322_HighPriority0_0_Weight_MSB 0x7
+#define QIB_7322_HighPriority0_0_Weight_RMASK 0xFF
+
+#define QIB_7322_CntrRegBase_1_OFFS 0x2028
+#define QIB_7322_CntrRegBase_1_DEF 0x0000000000013000
+
+#define QIB_7322_RcvQPMulticastContext_1_OFFS 0x2170
+
+#define QIB_7322_SendCtrl_1_OFFS 0x21C0
+
+#define QIB_7322_SendBufAvail0_OFFS 0x3000
+#define QIB_7322_SendBufAvail0_DEF 0x0000000000000000
+#define QIB_7322_SendBufAvail0_SendBuf_31_0_LSB 0x0
+#define QIB_7322_SendBufAvail0_SendBuf_31_0_MSB 0x3F
+#define QIB_7322_SendBufAvail0_SendBuf_31_0_RMASK 0x0
+
+#define QIB_7322_MsixTable_OFFS 0x8000
+#define QIB_7322_MsixTable_DEF 0x0000000000000000
+
+#define QIB_7322_MsixPba_OFFS 0x9000
+#define QIB_7322_MsixPba_DEF 0x0000000000000000
+
+#define QIB_7322_LAMemory_OFFS 0xA000
+#define QIB_7322_LAMemory_DEF 0x0000000000000000
+
+#define QIB_7322_LBIntCnt_OFFS 0x11000
+#define QIB_7322_LBIntCnt_DEF 0x0000000000000000
+
+#define QIB_7322_LBFlowStallCnt_OFFS 0x11008
+#define QIB_7322_LBFlowStallCnt_DEF 0x0000000000000000
+
+#define QIB_7322_RxTIDFullErrCnt_OFFS 0x110D0
+#define QIB_7322_RxTIDFullErrCnt_DEF 0x0000000000000000
+
+#define QIB_7322_RxTIDValidErrCnt_OFFS 0x110D8
+#define QIB_7322_RxTIDValidErrCnt_DEF 0x0000000000000000
+
+#define QIB_7322_RxP0HdrEgrOvflCnt_OFFS 0x110E8
+#define QIB_7322_RxP0HdrEgrOvflCnt_DEF 0x0000000000000000
+
+#define QIB_7322_PcieRetryBufDiagQwordCnt_OFFS 0x111A0
+#define QIB_7322_PcieRetryBufDiagQwordCnt_DEF 0x0000000000000000
+
+#define QIB_7322_RxTidFlowDropCnt_OFFS 0x111E0
+#define QIB_7322_RxTidFlowDropCnt_DEF 0x0000000000000000
+
+#define QIB_7322_LBIntCnt_0_OFFS 0x12000
+#define QIB_7322_LBIntCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_TxCreditUpToDateTimeOut_0_OFFS 0x12008
+#define QIB_7322_TxCreditUpToDateTimeOut_0_DEF 0x0000000000000000
+
+#define QIB_7322_TxSDmaDescCnt_0_OFFS 0x12010
+#define QIB_7322_TxSDmaDescCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_TxUnsupVLErrCnt_0_OFFS 0x12018
+#define QIB_7322_TxUnsupVLErrCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_TxDataPktCnt_0_OFFS 0x12020
+#define QIB_7322_TxDataPktCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_TxFlowPktCnt_0_OFFS 0x12028
+#define QIB_7322_TxFlowPktCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_TxDwordCnt_0_OFFS 0x12030
+#define QIB_7322_TxDwordCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_TxLenErrCnt_0_OFFS 0x12038
+#define QIB_7322_TxLenErrCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_TxMaxMinLenErrCnt_0_OFFS 0x12040
+#define QIB_7322_TxMaxMinLenErrCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_TxUnderrunCnt_0_OFFS 0x12048
+#define QIB_7322_TxUnderrunCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_TxFlowStallCnt_0_OFFS 0x12050
+#define QIB_7322_TxFlowStallCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_TxDroppedPktCnt_0_OFFS 0x12058
+#define QIB_7322_TxDroppedPktCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_RxDroppedPktCnt_0_OFFS 0x12060
+#define QIB_7322_RxDroppedPktCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_RxDataPktCnt_0_OFFS 0x12068
+#define QIB_7322_RxDataPktCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_RxFlowPktCnt_0_OFFS 0x12070
+#define QIB_7322_RxFlowPktCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_RxDwordCnt_0_OFFS 0x12078
+#define QIB_7322_RxDwordCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_RxLenErrCnt_0_OFFS 0x12080
+#define QIB_7322_RxLenErrCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_RxMaxMinLenErrCnt_0_OFFS 0x12088
+#define QIB_7322_RxMaxMinLenErrCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_RxICRCErrCnt_0_OFFS 0x12090
+#define QIB_7322_RxICRCErrCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_RxVCRCErrCnt_0_OFFS 0x12098
+#define QIB_7322_RxVCRCErrCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_RxFlowCtrlViolCnt_0_OFFS 0x120A0
+#define QIB_7322_RxFlowCtrlViolCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_RxVersionErrCnt_0_OFFS 0x120A8
+#define QIB_7322_RxVersionErrCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_RxLinkMalformCnt_0_OFFS 0x120B0
+#define QIB_7322_RxLinkMalformCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_RxEBPCnt_0_OFFS 0x120B8
+#define QIB_7322_RxEBPCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_RxLPCRCErrCnt_0_OFFS 0x120C0
+#define QIB_7322_RxLPCRCErrCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_RxBufOvflCnt_0_OFFS 0x120C8
+#define QIB_7322_RxBufOvflCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_RxLenTruncateCnt_0_OFFS 0x120D0
+#define QIB_7322_RxLenTruncateCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_RxPKeyMismatchCnt_0_OFFS 0x120E0
+#define QIB_7322_RxPKeyMismatchCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_IBLinkDownedCnt_0_OFFS 0x12180
+#define QIB_7322_IBLinkDownedCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_IBSymbolErrCnt_0_OFFS 0x12188
+#define QIB_7322_IBSymbolErrCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_IBStatusChangeCnt_0_OFFS 0x12190
+#define QIB_7322_IBStatusChangeCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_IBLinkErrRecoveryCnt_0_OFFS 0x12198
+#define QIB_7322_IBLinkErrRecoveryCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_ExcessBufferOvflCnt_0_OFFS 0x121A8
+#define QIB_7322_ExcessBufferOvflCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_LocalLinkIntegrityErrCnt_0_OFFS 0x121B0
+#define QIB_7322_LocalLinkIntegrityErrCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_RxVlErrCnt_0_OFFS 0x121B8
+#define QIB_7322_RxVlErrCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_RxDlidFltrCnt_0_OFFS 0x121C0
+#define QIB_7322_RxDlidFltrCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_RxVL15DroppedPktCnt_0_OFFS 0x121C8
+#define QIB_7322_RxVL15DroppedPktCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_RxOtherLocalPhyErrCnt_0_OFFS 0x121D0
+#define QIB_7322_RxOtherLocalPhyErrCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_RxQPInvalidContextCnt_0_OFFS 0x121D8
+#define QIB_7322_RxQPInvalidContextCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_TxHeadersErrCnt_0_OFFS 0x121F8
+#define QIB_7322_TxHeadersErrCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_PSRcvDataCount_0_OFFS 0x12218
+#define QIB_7322_PSRcvDataCount_0_DEF 0x0000000000000000
+
+#define QIB_7322_PSRcvPktsCount_0_OFFS 0x12220
+#define QIB_7322_PSRcvPktsCount_0_DEF 0x0000000000000000
+
+#define QIB_7322_PSXmitDataCount_0_OFFS 0x12228
+#define QIB_7322_PSXmitDataCount_0_DEF 0x0000000000000000
+
+#define QIB_7322_PSXmitPktsCount_0_OFFS 0x12230
+#define QIB_7322_PSXmitPktsCount_0_DEF 0x0000000000000000
+
+#define QIB_7322_PSXmitWaitCount_0_OFFS 0x12238
+#define QIB_7322_PSXmitWaitCount_0_DEF 0x0000000000000000
+
+#define QIB_7322_LBIntCnt_1_OFFS 0x13000
+#define QIB_7322_LBIntCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_TxCreditUpToDateTimeOut_1_OFFS 0x13008
+#define QIB_7322_TxCreditUpToDateTimeOut_1_DEF 0x0000000000000000
+
+#define QIB_7322_TxSDmaDescCnt_1_OFFS 0x13010
+#define QIB_7322_TxSDmaDescCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_TxUnsupVLErrCnt_1_OFFS 0x13018
+#define QIB_7322_TxUnsupVLErrCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_TxDataPktCnt_1_OFFS 0x13020
+#define QIB_7322_TxDataPktCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_TxFlowPktCnt_1_OFFS 0x13028
+#define QIB_7322_TxFlowPktCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_TxDwordCnt_1_OFFS 0x13030
+#define QIB_7322_TxDwordCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_TxLenErrCnt_1_OFFS 0x13038
+#define QIB_7322_TxLenErrCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_TxMaxMinLenErrCnt_1_OFFS 0x13040
+#define QIB_7322_TxMaxMinLenErrCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_TxUnderrunCnt_1_OFFS 0x13048
+#define QIB_7322_TxUnderrunCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_TxFlowStallCnt_1_OFFS 0x13050
+#define QIB_7322_TxFlowStallCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_TxDroppedPktCnt_1_OFFS 0x13058
+#define QIB_7322_TxDroppedPktCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_RxDroppedPktCnt_1_OFFS 0x13060
+#define QIB_7322_RxDroppedPktCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_RxDataPktCnt_1_OFFS 0x13068
+#define QIB_7322_RxDataPktCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_RxFlowPktCnt_1_OFFS 0x13070
+#define QIB_7322_RxFlowPktCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_RxDwordCnt_1_OFFS 0x13078
+#define QIB_7322_RxDwordCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_RxLenErrCnt_1_OFFS 0x13080
+#define QIB_7322_RxLenErrCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_RxMaxMinLenErrCnt_1_OFFS 0x13088
+#define QIB_7322_RxMaxMinLenErrCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_RxICRCErrCnt_1_OFFS 0x13090
+#define QIB_7322_RxICRCErrCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_RxVCRCErrCnt_1_OFFS 0x13098
+#define QIB_7322_RxVCRCErrCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_RxFlowCtrlViolCnt_1_OFFS 0x130A0
+#define QIB_7322_RxFlowCtrlViolCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_RxVersionErrCnt_1_OFFS 0x130A8
+#define QIB_7322_RxVersionErrCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_RxLinkMalformCnt_1_OFFS 0x130B0
+#define QIB_7322_RxLinkMalformCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_RxEBPCnt_1_OFFS 0x130B8
+#define QIB_7322_RxEBPCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_RxLPCRCErrCnt_1_OFFS 0x130C0
+#define QIB_7322_RxLPCRCErrCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_RxBufOvflCnt_1_OFFS 0x130C8
+#define QIB_7322_RxBufOvflCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_RxLenTruncateCnt_1_OFFS 0x130D0
+#define QIB_7322_RxLenTruncateCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_RxPKeyMismatchCnt_1_OFFS 0x130E0
+#define QIB_7322_RxPKeyMismatchCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_IBLinkDownedCnt_1_OFFS 0x13180
+#define QIB_7322_IBLinkDownedCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_IBSymbolErrCnt_1_OFFS 0x13188
+#define QIB_7322_IBSymbolErrCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_IBStatusChangeCnt_1_OFFS 0x13190
+#define QIB_7322_IBStatusChangeCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_IBLinkErrRecoveryCnt_1_OFFS 0x13198
+#define QIB_7322_IBLinkErrRecoveryCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_ExcessBufferOvflCnt_1_OFFS 0x131A8
+#define QIB_7322_ExcessBufferOvflCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_LocalLinkIntegrityErrCnt_1_OFFS 0x131B0
+#define QIB_7322_LocalLinkIntegrityErrCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_RxVlErrCnt_1_OFFS 0x131B8
+#define QIB_7322_RxVlErrCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_RxDlidFltrCnt_1_OFFS 0x131C0
+#define QIB_7322_RxDlidFltrCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_RxVL15DroppedPktCnt_1_OFFS 0x131C8
+#define QIB_7322_RxVL15DroppedPktCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_RxOtherLocalPhyErrCnt_1_OFFS 0x131D0
+#define QIB_7322_RxOtherLocalPhyErrCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_RxQPInvalidContextCnt_1_OFFS 0x131D8
+#define QIB_7322_RxQPInvalidContextCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_TxHeadersErrCnt_1_OFFS 0x131F8
+#define QIB_7322_TxHeadersErrCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_PSRcvDataCount_1_OFFS 0x13218
+#define QIB_7322_PSRcvDataCount_1_DEF 0x0000000000000000
+
+#define QIB_7322_PSRcvPktsCount_1_OFFS 0x13220
+#define QIB_7322_PSRcvPktsCount_1_DEF 0x0000000000000000
+
+#define QIB_7322_PSXmitDataCount_1_OFFS 0x13228
+#define QIB_7322_PSXmitDataCount_1_DEF 0x0000000000000000
+
+#define QIB_7322_PSXmitPktsCount_1_OFFS 0x13230
+#define QIB_7322_PSXmitPktsCount_1_DEF 0x0000000000000000
+
+#define QIB_7322_PSXmitWaitCount_1_OFFS 0x13238
+#define QIB_7322_PSXmitWaitCount_1_DEF 0x0000000000000000
+
+#define QIB_7322_RcvEgrArray_OFFS 0x14000
+#define QIB_7322_RcvEgrArray_DEF 0x0000000000000000
+#define QIB_7322_RcvEgrArray_RT_BufSize_LSB 0x25
+#define QIB_7322_RcvEgrArray_RT_BufSize_MSB 0x27
+#define QIB_7322_RcvEgrArray_RT_BufSize_RMASK 0x7
+#define QIB_7322_RcvEgrArray_RT_Addr_LSB 0x0
+#define QIB_7322_RcvEgrArray_RT_Addr_MSB 0x24
+#define QIB_7322_RcvEgrArray_RT_Addr_RMASK 0x1FFFFFFFFF
+
+#define QIB_7322_RcvTIDArray0_OFFS 0x50000
+#define QIB_7322_RcvTIDArray0_DEF 0x0000000000000000
+#define QIB_7322_RcvTIDArray0_RT_BufSize_LSB 0x25
+#define QIB_7322_RcvTIDArray0_RT_BufSize_MSB 0x27
+#define QIB_7322_RcvTIDArray0_RT_BufSize_RMASK 0x7
+#define QIB_7322_RcvTIDArray0_RT_Addr_LSB 0x0
+#define QIB_7322_RcvTIDArray0_RT_Addr_MSB 0x24
+#define QIB_7322_RcvTIDArray0_RT_Addr_RMASK 0x1FFFFFFFFF
+
+#define QIB_7322_IBSD_DDS_MAP_TABLE_0_OFFS 0xD0000
+#define QIB_7322_IBSD_DDS_MAP_TABLE_0_DEF 0x0000000000000000
+
+#define QIB_7322_RcvHdrTail0_OFFS 0x200000
+#define QIB_7322_RcvHdrTail0_DEF 0x0000000000000000
+
+#define QIB_7322_RcvHdrHead0_OFFS 0x200008
+#define QIB_7322_RcvHdrHead0_DEF 0x0000000000000000
+#define QIB_7322_RcvHdrHead0_counter_LSB 0x20
+#define QIB_7322_RcvHdrHead0_counter_MSB 0x2F
+#define QIB_7322_RcvHdrHead0_counter_RMASK 0xFFFF
+#define QIB_7322_RcvHdrHead0_RcvHeadPointer_LSB 0x0
+#define QIB_7322_RcvHdrHead0_RcvHeadPointer_MSB 0x1F
+#define QIB_7322_RcvHdrHead0_RcvHeadPointer_RMASK 0xFFFFFFFF
+
+#define QIB_7322_RcvEgrIndexTail0_OFFS 0x200010
+#define QIB_7322_RcvEgrIndexTail0_DEF 0x0000000000000000
+
+#define QIB_7322_RcvEgrIndexHead0_OFFS 0x200018
+#define QIB_7322_RcvEgrIndexHead0_DEF 0x0000000000000000
+
+#define QIB_7322_RcvTIDFlowTable0_OFFS 0x201000
+#define QIB_7322_RcvTIDFlowTable0_DEF 0x0000000000000000
+#define QIB_7322_RcvTIDFlowTable0_GenMismatch_LSB 0x1C
+#define QIB_7322_RcvTIDFlowTable0_GenMismatch_MSB 0x1C
+#define QIB_7322_RcvTIDFlowTable0_GenMismatch_RMASK 0x1
+#define QIB_7322_RcvTIDFlowTable0_SeqMismatch_LSB 0x1B
+#define QIB_7322_RcvTIDFlowTable0_SeqMismatch_MSB 0x1B
+#define QIB_7322_RcvTIDFlowTable0_SeqMismatch_RMASK 0x1
+#define QIB_7322_RcvTIDFlowTable0_KeepOnGenErr_LSB 0x16
+#define QIB_7322_RcvTIDFlowTable0_KeepOnGenErr_MSB 0x16
+#define QIB_7322_RcvTIDFlowTable0_KeepOnGenErr_RMASK 0x1
+#define QIB_7322_RcvTIDFlowTable0_KeepAfterSeqErr_LSB 0x15
+#define QIB_7322_RcvTIDFlowTable0_KeepAfterSeqErr_MSB 0x15
+#define QIB_7322_RcvTIDFlowTable0_KeepAfterSeqErr_RMASK 0x1
+#define QIB_7322_RcvTIDFlowTable0_HdrSuppEnabled_LSB 0x14
+#define QIB_7322_RcvTIDFlowTable0_HdrSuppEnabled_MSB 0x14
+#define QIB_7322_RcvTIDFlowTable0_HdrSuppEnabled_RMASK 0x1
+#define QIB_7322_RcvTIDFlowTable0_FlowValid_LSB 0x13
+#define QIB_7322_RcvTIDFlowTable0_FlowValid_MSB 0x13
+#define QIB_7322_RcvTIDFlowTable0_FlowValid_RMASK 0x1
+#define QIB_7322_RcvTIDFlowTable0_GenVal_LSB 0xB
+#define QIB_7322_RcvTIDFlowTable0_GenVal_MSB 0x12
+#define QIB_7322_RcvTIDFlowTable0_GenVal_RMASK 0xFF
+#define QIB_7322_RcvTIDFlowTable0_SeqNum_LSB 0x0
+#define QIB_7322_RcvTIDFlowTable0_SeqNum_MSB 0xA
+#define QIB_7322_RcvTIDFlowTable0_SeqNum_RMASK 0x7FF
diff --git a/drivers/infiniband/hw/qib/qib_common.h b/drivers/infiniband/hw/qib/qib_common.h
new file mode 100644
index 0000000..5670ace
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_common.h
@@ -0,0 +1,812 @@
+/*
+ * Copyright (c) 2006, 2007, 2008, 2009, 2010 QLogic Corporation.
+ * All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _QIB_COMMON_H
+#define _QIB_COMMON_H
+
+/*
+ * This file contains defines, structures, etc. that are used
+ * to communicate between kernel and user code.
+ */
+
+/* This is the IEEE-assigned OUI for QLogic Inc. QLogic_IB */
+#define QIB_SRC_OUI_1 0x00
+#define QIB_SRC_OUI_2 0x11
+#define QIB_SRC_OUI_3 0x75
+
+/* version of protocol header (known to chip also). In the long run,
+ * we should be able to generate and accept a range of version numbers;
+ * for now we only accept one, and it's compiled in.
+ */
+#define IPS_PROTO_VERSION 2
+
+/*
+ * These are compile time constants that you may want to enable or disable
+ * if you are trying to debug problems with code or performance.
+ * QIB_VERBOSE_TRACING define as 1 if you want additional tracing in
+ * fastpath code
+ * QIB_TRACE_REGWRITES define as 1 if you want register writes to be
+ * traced in faspath code
+ * _QIB_TRACING define as 0 if you want to remove all tracing in a
+ * compilation unit
+ */
+
+/*
+ * The value in the BTH QP field that QLogic_IB uses to differentiate
+ * an qlogic_ib protocol IB packet vs standard IB transport
+ * This it needs to be even (0x656b78), because the LSB is sometimes
+ * used for the MSB of context. The change may cause a problem
+ * interoperating with older software.
+ */
+#define QIB_KD_QP 0x656b78
+
+/*
+ * These are the status bits readable (in ascii form, 64bit value)
+ * from the "status" sysfs file.  For binary compatibility, values
+ * must remain as is; removed states can be reused for different
+ * purposes.
+ */
+#define QIB_STATUS_INITTED       0x1    /* basic initialization done */
+/* Chip has been found and initted */
+#define QIB_STATUS_CHIP_PRESENT 0x20
+/* IB link is at ACTIVE, usable for data traffic */
+#define QIB_STATUS_IB_READY     0x40
+/* link is configured, LID, MTU, etc. have been set */
+#define QIB_STATUS_IB_CONF      0x80
+/* A Fatal hardware error has occurred. */
+#define QIB_STATUS_HWERROR     0x200
+
+/*
+ * The list of usermode accessible registers.  Also see Reg_* later in file.
+ */
+enum qib_ureg {
+	/* (RO)  DMA RcvHdr to be used next. */
+	ur_rcvhdrtail = 0,
+	/* (RW)  RcvHdr entry to be processed next by host. */
+	ur_rcvhdrhead = 1,
+	/* (RO)  Index of next Eager index to use. */
+	ur_rcvegrindextail = 2,
+	/* (RW)  Eager TID to be processed next */
+	ur_rcvegrindexhead = 3,
+	/* For internal use only; max register number. */
+	_QIB_UregMax
+};
+
+/* bit values for spi_runtime_flags */
+#define QIB_RUNTIME_PCIE                0x0002
+#define QIB_RUNTIME_FORCE_WC_ORDER      0x0004
+#define QIB_RUNTIME_RCVHDR_COPY         0x0008
+#define QIB_RUNTIME_MASTER              0x0010
+#define QIB_RUNTIME_RCHK                0x0020
+#define QIB_RUNTIME_NODMA_RTAIL         0x0080
+#define QIB_RUNTIME_SPECIAL_TRIGGER     0x0100
+#define QIB_RUNTIME_SDMA                0x0200
+#define QIB_RUNTIME_FORCE_PIOAVAIL      0x0400
+#define QIB_RUNTIME_PIO_REGSWAPPED      0x0800
+#define QIB_RUNTIME_CTXT_MSB_IN_QP      0x1000
+#define QIB_RUNTIME_CTXT_REDIRECT       0x2000
+#define QIB_RUNTIME_HDRSUPP             0x4000
+
+/*
+ * This structure is returned by qib_userinit() immediately after
+ * open to get implementation-specific info, and info specific to this
+ * instance.
+ *
+ * This struct must have explict pad fields where type sizes
+ * may result in different alignments between 32 and 64 bit
+ * programs, since the 64 bit * bit kernel requires the user code
+ * to have matching offsets
+ */
+struct qib_base_info {
+	/* version of hardware, for feature checking. */
+	__u32 spi_hw_version;
+	/* version of software, for feature checking. */
+	__u32 spi_sw_version;
+	/* QLogic_IB context assigned, goes into sent packets */
+	__u16 spi_ctxt;
+	__u16 spi_subctxt;
+	/*
+	 * IB MTU, packets IB data must be less than this.
+	 * The MTU is in bytes, and will be a multiple of 4 bytes.
+	 */
+	__u32 spi_mtu;
+	/*
+	 * Size of a PIO buffer.  Any given packet's total size must be less
+	 * than this (in words).  Included is the starting control word, so
+	 * if 513 is returned, then total pkt size is 512 words or less.
+	 */
+	__u32 spi_piosize;
+	/* size of the TID cache in qlogic_ib, in entries */
+	__u32 spi_tidcnt;
+	/* size of the TID Eager list in qlogic_ib, in entries */
+	__u32 spi_tidegrcnt;
+	/* size of a single receive header queue entry in words. */
+	__u32 spi_rcvhdrent_size;
+	/*
+	 * Count of receive header queue entries allocated.
+	 * This may be less than the spu_rcvhdrcnt passed in!.
+	 */
+	__u32 spi_rcvhdr_cnt;
+
+	/* per-chip and other runtime features bitmap (QIB_RUNTIME_*) */
+	__u32 spi_runtime_flags;
+
+	/* address where hardware receive header queue is mapped */
+	__u64 spi_rcvhdr_base;
+
+	/* user program. */
+
+	/* base address of eager TID receive buffers used by hardware. */
+	__u64 spi_rcv_egrbufs;
+
+	/* Allocated by initialization code, not by protocol. */
+
+	/*
+	 * Size of each TID buffer in host memory, starting at
+	 * spi_rcv_egrbufs.  The buffers are virtually contiguous.
+	 */
+	__u32 spi_rcv_egrbufsize;
+	/*
+	 * The special QP (queue pair) value that identifies an qlogic_ib
+	 * protocol packet from standard IB packets.  More, probably much
+	 * more, to be added.
+	 */
+	__u32 spi_qpair;
+
+	/*
+	 * User register base for init code, not to be used directly by
+	 * protocol or applications.  Always points to chip registers,
+	 * for normal or shared context.
+	 */
+	__u64 spi_uregbase;
+	/*
+	 * Maximum buffer size in bytes that can be used in a single TID
+	 * entry (assuming the buffer is aligned to this boundary).  This is
+	 * the minimum of what the hardware and software support Guaranteed
+	 * to be a power of 2.
+	 */
+	__u32 spi_tid_maxsize;
+	/*
+	 * alignment of each pio send buffer (byte count
+	 * to add to spi_piobufbase to get to second buffer)
+	 */
+	__u32 spi_pioalign;
+	/*
+	 * The index of the first pio buffer available to this process;
+	 * needed to do lookup in spi_pioavailaddr; not added to
+	 * spi_piobufbase.
+	 */
+	__u32 spi_pioindex;
+	 /* number of buffers mapped for this process */
+	__u32 spi_piocnt;
+
+	/*
+	 * Base address of writeonly pio buffers for this process.
+	 * Each buffer has spi_piosize words, and is aligned on spi_pioalign
+	 * boundaries.  spi_piocnt buffers are mapped from this address
+	 */
+	__u64 spi_piobufbase;
+
+	/*
+	 * Base address of readonly memory copy of the pioavail registers.
+	 * There are 2 bits for each buffer.
+	 */
+	__u64 spi_pioavailaddr;
+
+	/*
+	 * Address where driver updates a copy of the interface and driver
+	 * status (QIB_STATUS_*) as a 64 bit value.  It's followed by a
+	 * link status qword (formerly combined with driver status), then a
+	 * string indicating hardware error, if there was one.
+	 */
+	__u64 spi_status;
+
+	/* number of chip ctxts available to user processes */
+	__u32 spi_nctxts;
+	__u16 spi_unit; /* unit number of chip we are using */
+	__u16 spi_port; /* IB port number we are using */
+	/* num bufs in each contiguous set */
+	__u32 spi_rcv_egrperchunk;
+	/* size in bytes of each contiguous set */
+	__u32 spi_rcv_egrchunksize;
+	/* total size of mmap to cover full rcvegrbuffers */
+	__u32 spi_rcv_egrbuftotlen;
+	__u32 spi_rhf_offset; /* dword offset in hdrqent for rcvhdr flags */
+	/* address of readonly memory copy of the rcvhdrq tail register. */
+	__u64 spi_rcvhdr_tailaddr;
+
+	/*
+	 * shared memory pages for subctxts if ctxt is shared; these cover
+	 * all the processes in the group sharing a single context.
+	 * all have enough space for the num_subcontexts value on this job.
+	 */
+	__u64 spi_subctxt_uregbase;
+	__u64 spi_subctxt_rcvegrbuf;
+	__u64 spi_subctxt_rcvhdr_base;
+
+	/* shared memory page for send buffer disarm status */
+	__u64 spi_sendbuf_status;
+} __attribute__ ((aligned(8)));
+
+/*
+ * This version number is given to the driver by the user code during
+ * initialization in the spu_userversion field of qib_user_info, so
+ * the driver can check for compatibility with user code.
+ *
+ * The major version changes when data structures
+ * change in an incompatible way.  The driver must be the same or higher
+ * for initialization to succeed.  In some cases, a higher version
+ * driver will not interoperate with older software, and initialization
+ * will return an error.
+ */
+#define QIB_USER_SWMAJOR 1
+
+/*
+ * Minor version differences are always compatible
+ * a within a major version, however if user software is larger
+ * than driver software, some new features and/or structure fields
+ * may not be implemented; the user code must deal with this if it
+ * cares, or it must abort after initialization reports the difference.
+ */
+#define QIB_USER_SWMINOR 13
+
+#define QIB_USER_SWVERSION ((QIB_USER_SWMAJOR << 16) | QIB_USER_SWMINOR)
+
+#ifndef QIB_KERN_TYPE
+#define QIB_KERN_TYPE 0
+#endif
+
+/*
+ * Similarly, this is the kernel version going back to the user.  It's
+ * slightly different, in that we want to tell if the driver was built as
+ * part of a QLogic release, or from the driver from openfabrics.org,
+ * kernel.org, or a standard distribution, for support reasons.
+ * The high bit is 0 for non-QLogic and 1 for QLogic-built/supplied.
+ *
+ * It's returned by the driver to the user code during initialization in the
+ * spi_sw_version field of qib_base_info, so the user code can in turn
+ * check for compatibility with the kernel.
+*/
+#define QIB_KERN_SWVERSION ((QIB_KERN_TYPE << 31) | QIB_USER_SWVERSION)
+
+/*
+ * Define the driver version number.  This is something that refers only
+ * to the driver itself, not the software interfaces it supports.
+ */
+#define QIB_DRIVER_VERSION_BASE "1.11"
+
+/* create the final driver version string */
+#ifdef QIB_IDSTR
+#define QIB_DRIVER_VERSION QIB_DRIVER_VERSION_BASE " " QIB_IDSTR
+#else
+#define QIB_DRIVER_VERSION QIB_DRIVER_VERSION_BASE
+#endif
+
+/*
+ * If the unit is specified via open, HCA choice is fixed.  If port is
+ * specified, it's also fixed.  Otherwise we try to spread contexts
+ * across ports and HCAs, using different algorithims.  WITHIN is
+ * the old default, prior to this mechanism.
+ */
+#define QIB_PORT_ALG_ACROSS 0 /* round robin contexts across HCAs, then
+			       * ports; this is the default */
+#define QIB_PORT_ALG_WITHIN 1 /* use all contexts on an HCA (round robin
+			       * active ports within), then next HCA */
+#define QIB_PORT_ALG_COUNT 2 /* number of algorithm choices */
+
+/*
+ * This structure is passed to qib_userinit() to tell the driver where
+ * user code buffers are, sizes, etc.   The offsets and sizes of the
+ * fields must remain unchanged, for binary compatibility.  It can
+ * be extended, if userversion is changed so user code can tell, if needed
+ */
+struct qib_user_info {
+	/*
+	 * version of user software, to detect compatibility issues.
+	 * Should be set to QIB_USER_SWVERSION.
+	 */
+	__u32 spu_userversion;
+
+	__u32 _spu_unused2;
+
+	/* size of struct base_info to write to */
+	__u32 spu_base_info_size;
+
+	__u32 spu_port_alg; /* which QIB_PORT_ALG_*; unused user minor < 11 */
+
+	/*
+	 * If two or more processes wish to share a context, each process
+	 * must set the spu_subctxt_cnt and spu_subctxt_id to the same
+	 * values.  The only restriction on the spu_subctxt_id is that
+	 * it be unique for a given node.
+	 */
+	__u16 spu_subctxt_cnt;
+	__u16 spu_subctxt_id;
+
+	__u32 spu_port; /* IB port requested by user if > 0 */
+
+	/*
+	 * address of struct base_info to write to
+	 */
+	__u64 spu_base_info;
+
+} __attribute__ ((aligned(8)));
+
+/* User commands. */
+
+/* 16 available, was: old set up userspace (for old user code) */
+#define QIB_CMD_CTXT_INFO       17      /* find out what resources we got */
+#define QIB_CMD_RECV_CTRL       18      /* control receipt of packets */
+#define QIB_CMD_TID_UPDATE      19      /* update expected TID entries */
+#define QIB_CMD_TID_FREE        20      /* free expected TID entries */
+#define QIB_CMD_SET_PART_KEY    21      /* add partition key */
+/* 22 available, was: return info on slave processes (for old user code) */
+#define QIB_CMD_ASSIGN_CTXT     23      /* allocate HCA and ctxt */
+#define QIB_CMD_USER_INIT       24      /* set up userspace */
+#define QIB_CMD_UNUSED_1        25
+#define QIB_CMD_UNUSED_2        26
+#define QIB_CMD_PIOAVAILUPD     27      /* force an update of PIOAvail reg */
+#define QIB_CMD_POLL_TYPE       28      /* set the kind of polling we want */
+#define QIB_CMD_ARMLAUNCH_CTRL  29      /* armlaunch detection control */
+/* 30 is unused */
+#define QIB_CMD_SDMA_INFLIGHT   31      /* sdma inflight counter request */
+#define QIB_CMD_SDMA_COMPLETE   32      /* sdma completion counter request */
+/* 33 available, was a testing feature  */
+#define QIB_CMD_DISARM_BUFS     34      /* disarm send buffers w/ errors */
+#define QIB_CMD_ACK_EVENT       35      /* ack & clear bits */
+#define QIB_CMD_CPUS_LIST       36      /* list of cpus allocated, for pinned
+					 * processes: qib_cpus_list */
+
+/*
+ * QIB_CMD_ACK_EVENT obsoletes QIB_CMD_DISARM_BUFS, but we keep it for
+ * compatibility with libraries from previous release.   The ACK_EVENT
+ * will take appropriate driver action (if any, just DISARM for now),
+ * then clear the bits passed in as part of the mask.  These bits are
+ * in the first 64bit word at spi_sendbuf_status, and are passed to
+ * the driver in the event_mask union as well.
+ */
+#define _QIB_EVENT_DISARM_BUFS_BIT	0
+#define _QIB_EVENT_LINKDOWN_BIT		1
+#define _QIB_EVENT_LID_CHANGE_BIT	2
+#define _QIB_EVENT_LMC_CHANGE_BIT	3
+#define _QIB_EVENT_SL2VL_CHANGE_BIT	4
+#define _QIB_MAX_EVENT_BIT _QIB_EVENT_SL2VL_CHANGE_BIT
+
+#define QIB_EVENT_DISARM_BUFS_BIT	(1UL << _QIB_EVENT_DISARM_BUFS_BIT)
+#define QIB_EVENT_LINKDOWN_BIT		(1UL << _QIB_EVENT_LINKDOWN_BIT)
+#define QIB_EVENT_LID_CHANGE_BIT	(1UL << _QIB_EVENT_LID_CHANGE_BIT)
+#define QIB_EVENT_LMC_CHANGE_BIT	(1UL << _QIB_EVENT_LMC_CHANGE_BIT)
+#define QIB_EVENT_SL2VL_CHANGE_BIT	(1UL << _QIB_EVENT_SL2VL_CHANGE_BIT)
+
+
+/*
+ * Poll types
+ */
+#define QIB_POLL_TYPE_ANYRCV     0x0
+#define QIB_POLL_TYPE_URGENT     0x1
+
+struct qib_ctxt_info {
+	__u16 num_active;       /* number of active units */
+	__u16 unit;             /* unit (chip) assigned to caller */
+	__u16 port;             /* IB port assigned to caller (1-based) */
+	__u16 ctxt;             /* ctxt on unit assigned to caller */
+	__u16 subctxt;          /* subctxt on unit assigned to caller */
+	__u16 num_ctxts;        /* number of ctxts available on unit */
+	__u16 num_subctxts;     /* number of subctxts opened on ctxt */
+	__u16 rec_cpu;          /* cpu # for affinity (ffff if none) */
+};
+
+struct qib_tid_info {
+	__u32 tidcnt;
+	/* make structure same size in 32 and 64 bit */
+	__u32 tid__unused;
+	/* virtual address of first page in transfer */
+	__u64 tidvaddr;
+	/* pointer (same size 32/64 bit) to __u16 tid array */
+	__u64 tidlist;
+
+	/*
+	 * pointer (same size 32/64 bit) to bitmap of TIDs used
+	 * for this call; checked for being large enough at open
+	 */
+	__u64 tidmap;
+};
+
+struct qib_cmd {
+	__u32 type;                     /* command type */
+	union {
+		struct qib_tid_info tid_info;
+		struct qib_user_info user_info;
+
+		/*
+		 * address in userspace where we should put the sdma
+		 * inflight counter
+		 */
+		__u64 sdma_inflight;
+		/*
+		 * address in userspace where we should put the sdma
+		 * completion counter
+		 */
+		__u64 sdma_complete;
+		/* address in userspace of struct qib_ctxt_info to
+		   write result to */
+		__u64 ctxt_info;
+		/* enable/disable receipt of packets */
+		__u32 recv_ctrl;
+		/* enable/disable armlaunch errors (non-zero to enable) */
+		__u32 armlaunch_ctrl;
+		/* partition key to set */
+		__u16 part_key;
+		/* user address of __u32 bitmask of active slaves */
+		__u64 slave_mask_addr;
+		/* type of polling we want */
+		__u16 poll_type;
+		/* back pressure enable bit for one particular context */
+		__u8 ctxt_bp;
+		/* qib_user_event_ack(), IPATH_EVENT_* bits */
+		__u64 event_mask;
+	} cmd;
+};
+
+struct qib_iovec {
+	/* Pointer to data, but same size 32 and 64 bit */
+	__u64 iov_base;
+
+	/*
+	 * Length of data; don't need 64 bits, but want
+	 * qib_sendpkt to remain same size as before 32 bit changes, so...
+	 */
+	__u64 iov_len;
+};
+
+/*
+ * Describes a single packet for send.  Each packet can have one or more
+ * buffers, but the total length (exclusive of IB headers) must be less
+ * than the MTU, and if using the PIO method, entire packet length,
+ * including IB headers, must be less than the qib_piosize value (words).
+ * Use of this necessitates including sys/uio.h
+ */
+struct __qib_sendpkt {
+	__u32 sps_flags;        /* flags for packet (TBD) */
+	__u32 sps_cnt;          /* number of entries to use in sps_iov */
+	/* array of iov's describing packet. TEMPORARY */
+	struct qib_iovec sps_iov[4];
+};
+
+/*
+ * Diagnostics can send a packet by "writing" the following
+ * structs to the diag data special file.
+ * This allows a custom
+ * pbc (+ static rate) qword, so that special modes and deliberate
+ * changes to CRCs can be used. The elements were also re-ordered
+ * for better alignment and to avoid padding issues.
+ */
+#define _DIAG_XPKT_VERS 3
+struct qib_diag_xpkt {
+	__u16 version;
+	__u16 unit;
+	__u16 port;
+	__u16 len;
+	__u64 data;
+	__u64 pbc_wd;
+};
+
+/*
+ * Data layout in I2C flash (for GUID, etc.)
+ * All fields are little-endian binary unless otherwise stated
+ */
+#define QIB_FLASH_VERSION 2
+struct qib_flash {
+	/* flash layout version (QIB_FLASH_VERSION) */
+	__u8 if_fversion;
+	/* checksum protecting if_length bytes */
+	__u8 if_csum;
+	/*
+	 * valid length (in use, protected by if_csum), including
+	 * if_fversion and if_csum themselves)
+	 */
+	__u8 if_length;
+	/* the GUID, in network order */
+	__u8 if_guid[8];
+	/* number of GUIDs to use, starting from if_guid */
+	__u8 if_numguid;
+	/* the (last 10 characters of) board serial number, in ASCII */
+	char if_serial[12];
+	/* board mfg date (YYYYMMDD ASCII) */
+	char if_mfgdate[8];
+	/* last board rework/test date (YYYYMMDD ASCII) */
+	char if_testdate[8];
+	/* logging of error counts, TBD */
+	__u8 if_errcntp[4];
+	/* powered on hours, updated at driver unload */
+	__u8 if_powerhour[2];
+	/* ASCII free-form comment field */
+	char if_comment[32];
+	/* Backwards compatible prefix for longer QLogic Serial Numbers */
+	char if_sprefix[4];
+	/* 82 bytes used, min flash size is 128 bytes */
+	__u8 if_future[46];
+};
+
+/*
+ * These are the counters implemented in the chip, and are listed in order.
+ * The InterCaps naming is taken straight from the chip spec.
+ */
+struct qlogic_ib_counters {
+	__u64 LBIntCnt;
+	__u64 LBFlowStallCnt;
+	__u64 TxSDmaDescCnt;    /* was Reserved1 */
+	__u64 TxUnsupVLErrCnt;
+	__u64 TxDataPktCnt;
+	__u64 TxFlowPktCnt;
+	__u64 TxDwordCnt;
+	__u64 TxLenErrCnt;
+	__u64 TxMaxMinLenErrCnt;
+	__u64 TxUnderrunCnt;
+	__u64 TxFlowStallCnt;
+	__u64 TxDroppedPktCnt;
+	__u64 RxDroppedPktCnt;
+	__u64 RxDataPktCnt;
+	__u64 RxFlowPktCnt;
+	__u64 RxDwordCnt;
+	__u64 RxLenErrCnt;
+	__u64 RxMaxMinLenErrCnt;
+	__u64 RxICRCErrCnt;
+	__u64 RxVCRCErrCnt;
+	__u64 RxFlowCtrlErrCnt;
+	__u64 RxBadFormatCnt;
+	__u64 RxLinkProblemCnt;
+	__u64 RxEBPCnt;
+	__u64 RxLPCRCErrCnt;
+	__u64 RxBufOvflCnt;
+	__u64 RxTIDFullErrCnt;
+	__u64 RxTIDValidErrCnt;
+	__u64 RxPKeyMismatchCnt;
+	__u64 RxP0HdrEgrOvflCnt;
+	__u64 RxP1HdrEgrOvflCnt;
+	__u64 RxP2HdrEgrOvflCnt;
+	__u64 RxP3HdrEgrOvflCnt;
+	__u64 RxP4HdrEgrOvflCnt;
+	__u64 RxP5HdrEgrOvflCnt;
+	__u64 RxP6HdrEgrOvflCnt;
+	__u64 RxP7HdrEgrOvflCnt;
+	__u64 RxP8HdrEgrOvflCnt;
+	__u64 RxP9HdrEgrOvflCnt;
+	__u64 RxP10HdrEgrOvflCnt;
+	__u64 RxP11HdrEgrOvflCnt;
+	__u64 RxP12HdrEgrOvflCnt;
+	__u64 RxP13HdrEgrOvflCnt;
+	__u64 RxP14HdrEgrOvflCnt;
+	__u64 RxP15HdrEgrOvflCnt;
+	__u64 RxP16HdrEgrOvflCnt;
+	__u64 IBStatusChangeCnt;
+	__u64 IBLinkErrRecoveryCnt;
+	__u64 IBLinkDownedCnt;
+	__u64 IBSymbolErrCnt;
+	__u64 RxVL15DroppedPktCnt;
+	__u64 RxOtherLocalPhyErrCnt;
+	__u64 PcieRetryBufDiagQwordCnt;
+	__u64 ExcessBufferOvflCnt;
+	__u64 LocalLinkIntegrityErrCnt;
+	__u64 RxVlErrCnt;
+	__u64 RxDlidFltrCnt;
+};
+
+/*
+ * The next set of defines are for packet headers, and chip register
+ * and memory bits that are visible to and/or used by user-mode software.
+ */
+
+/* RcvHdrFlags bits */
+#define QLOGIC_IB_RHF_LENGTH_MASK 0x7FF
+#define QLOGIC_IB_RHF_LENGTH_SHIFT 0
+#define QLOGIC_IB_RHF_RCVTYPE_MASK 0x7
+#define QLOGIC_IB_RHF_RCVTYPE_SHIFT 11
+#define QLOGIC_IB_RHF_EGRINDEX_MASK 0xFFF
+#define QLOGIC_IB_RHF_EGRINDEX_SHIFT 16
+#define QLOGIC_IB_RHF_SEQ_MASK 0xF
+#define QLOGIC_IB_RHF_SEQ_SHIFT 0
+#define QLOGIC_IB_RHF_HDRQ_OFFSET_MASK 0x7FF
+#define QLOGIC_IB_RHF_HDRQ_OFFSET_SHIFT 4
+#define QLOGIC_IB_RHF_H_ICRCERR   0x80000000
+#define QLOGIC_IB_RHF_H_VCRCERR   0x40000000
+#define QLOGIC_IB_RHF_H_PARITYERR 0x20000000
+#define QLOGIC_IB_RHF_H_LENERR    0x10000000
+#define QLOGIC_IB_RHF_H_MTUERR    0x08000000
+#define QLOGIC_IB_RHF_H_IHDRERR   0x04000000
+#define QLOGIC_IB_RHF_H_TIDERR    0x02000000
+#define QLOGIC_IB_RHF_H_MKERR     0x01000000
+#define QLOGIC_IB_RHF_H_IBERR     0x00800000
+#define QLOGIC_IB_RHF_H_ERR_MASK  0xFF800000
+#define QLOGIC_IB_RHF_L_USE_EGR   0x80000000
+#define QLOGIC_IB_RHF_L_SWA       0x00008000
+#define QLOGIC_IB_RHF_L_SWB       0x00004000
+
+/* qlogic_ib header fields */
+#define QLOGIC_IB_I_VERS_MASK 0xF
+#define QLOGIC_IB_I_VERS_SHIFT 28
+#define QLOGIC_IB_I_CTXT_MASK 0xF
+#define QLOGIC_IB_I_CTXT_SHIFT 24
+#define QLOGIC_IB_I_TID_MASK 0x7FF
+#define QLOGIC_IB_I_TID_SHIFT 13
+#define QLOGIC_IB_I_OFFSET_MASK 0x1FFF
+#define QLOGIC_IB_I_OFFSET_SHIFT 0
+
+/* K_PktFlags bits */
+#define QLOGIC_IB_KPF_INTR 0x1
+#define QLOGIC_IB_KPF_SUBCTXT_MASK 0x3
+#define QLOGIC_IB_KPF_SUBCTXT_SHIFT 1
+
+#define QLOGIC_IB_MAX_SUBCTXT   4
+
+/* SendPIO per-buffer control */
+#define QLOGIC_IB_SP_TEST    0x40
+#define QLOGIC_IB_SP_TESTEBP 0x20
+#define QLOGIC_IB_SP_TRIGGER_SHIFT  15
+
+/* SendPIOAvail bits */
+#define QLOGIC_IB_SENDPIOAVAIL_BUSY_SHIFT 1
+#define QLOGIC_IB_SENDPIOAVAIL_CHECK_SHIFT 0
+
+/* qlogic_ib header format */
+struct qib_header {
+	/*
+	 * Version - 4 bits, Context - 4 bits, TID - 10 bits and Offset -
+	 * 14 bits before ECO change ~28 Dec 03.  After that, Vers 4,
+	 * Context 4, TID 11, offset 13.
+	 */
+	__le32 ver_ctxt_tid_offset;
+	__le16 chksum;
+	__le16 pkt_flags;
+};
+
+/*
+ * qlogic_ib user message header format.
+ * This structure contains the first 4 fields common to all protocols
+ * that employ qlogic_ib.
+ */
+struct qib_message_header {
+	__be16 lrh[4];
+	__be32 bth[3];
+	/* fields below this point are in host byte order */
+	struct qib_header iph;
+	/* fields below are simplified, but should match PSM */
+	/* some are accessed by driver when packet spliting is needed */
+	__u8 sub_opcode;
+	__u8 flags;
+	__u16 commidx;
+	__u32 ack_seq_num;
+	__u8 flowid;
+	__u8 hdr_dlen;
+	__u16 mqhdr;
+	__u32 uwords[4];
+};
+
+/* sequence number bits for message */
+union qib_seqnum {
+	struct {
+		__u32 seq:11;
+		__u32 gen:8;
+		__u32 flow:5;
+	};
+	struct {
+		__u32 pkt:16;
+		__u32 msg:8;
+	};
+	__u32 val;
+};
+
+/* qib receiving-dma tid-session-member */
+struct qib_tid_session_member {
+	__u16 tid;
+	__u16 offset;
+	__u16 length;
+};
+
+/* IB - LRH header consts */
+#define QIB_LRH_GRH 0x0003      /* 1. word of IB LRH - next header: GRH */
+#define QIB_LRH_BTH 0x0002      /* 1. word of IB LRH - next header: BTH */
+
+/* misc. */
+#define SIZE_OF_CRC 1
+
+#define QIB_DEFAULT_P_KEY 0xFFFF
+#define QIB_PERMISSIVE_LID 0xFFFF
+#define QIB_AETH_CREDIT_SHIFT 24
+#define QIB_AETH_CREDIT_MASK 0x1F
+#define QIB_AETH_CREDIT_INVAL 0x1F
+#define QIB_PSN_MASK 0xFFFFFF
+#define QIB_MSN_MASK 0xFFFFFF
+#define QIB_QPN_MASK 0xFFFFFF
+#define QIB_MULTICAST_LID_BASE 0xC000
+#define QIB_EAGER_TID_ID QLOGIC_IB_I_TID_MASK
+#define QIB_MULTICAST_QPN 0xFFFFFF
+
+/* Receive Header Queue: receive type (from qlogic_ib) */
+#define RCVHQ_RCV_TYPE_EXPECTED  0
+#define RCVHQ_RCV_TYPE_EAGER     1
+#define RCVHQ_RCV_TYPE_NON_KD    2
+#define RCVHQ_RCV_TYPE_ERROR     3
+
+#define QIB_HEADER_QUEUE_WORDS 9
+
+/* functions for extracting fields from rcvhdrq entries for the driver.
+ */
+static inline __u32 qib_hdrget_err_flags(const __le32 *rbuf)
+{
+	return __le32_to_cpu(rbuf[1]) & QLOGIC_IB_RHF_H_ERR_MASK;
+}
+
+static inline __u32 qib_hdrget_rcv_type(const __le32 *rbuf)
+{
+	return (__le32_to_cpu(rbuf[0]) >> QLOGIC_IB_RHF_RCVTYPE_SHIFT) &
+		QLOGIC_IB_RHF_RCVTYPE_MASK;
+}
+
+static inline __u32 qib_hdrget_length_in_bytes(const __le32 *rbuf)
+{
+	return ((__le32_to_cpu(rbuf[0]) >> QLOGIC_IB_RHF_LENGTH_SHIFT) &
+		QLOGIC_IB_RHF_LENGTH_MASK) << 2;
+}
+
+static inline __u32 qib_hdrget_index(const __le32 *rbuf)
+{
+	return (__le32_to_cpu(rbuf[0]) >> QLOGIC_IB_RHF_EGRINDEX_SHIFT) &
+		QLOGIC_IB_RHF_EGRINDEX_MASK;
+}
+
+static inline __u32 qib_hdrget_seq(const __le32 *rbuf)
+{
+	return (__le32_to_cpu(rbuf[1]) >> QLOGIC_IB_RHF_SEQ_SHIFT) &
+		QLOGIC_IB_RHF_SEQ_MASK;
+}
+
+static inline __u32 qib_hdrget_offset(const __le32 *rbuf)
+{
+	return (__le32_to_cpu(rbuf[1]) >> QLOGIC_IB_RHF_HDRQ_OFFSET_SHIFT) &
+		QLOGIC_IB_RHF_HDRQ_OFFSET_MASK;
+}
+
+static inline __u32 qib_hdrget_use_egr_buf(const __le32 *rbuf)
+{
+	return __le32_to_cpu(rbuf[0]) & QLOGIC_IB_RHF_L_USE_EGR;
+}
+
+static inline __u32 qib_hdrget_qib_ver(__le32 hdrword)
+{
+	return (__le32_to_cpu(hdrword) >> QLOGIC_IB_I_VERS_SHIFT) &
+		QLOGIC_IB_I_VERS_MASK;
+}
+
+#endif                          /* _QIB_COMMON_H */
diff --git a/drivers/infiniband/hw/qib/qib_cq.c b/drivers/infiniband/hw/qib/qib_cq.c
new file mode 100644
index 0000000..ab4e11c
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_cq.c
@@ -0,0 +1,540 @@
+/*
+ * Copyright (c) 2013 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2006, 2007, 2008, 2010 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/kthread.h>
+
+#include "qib_verbs.h"
+#include "qib.h"
+
+/**
+ * qib_cq_enter - add a new entry to the completion queue
+ * @cq: completion queue
+ * @entry: work completion entry to add
+ * @sig: true if @entry is a solicitated entry
+ *
+ * This may be called with qp->s_lock held.
+ */
+void qib_cq_enter(struct qib_cq *cq, struct ib_wc *entry, int solicited)
+{
+	struct qib_cq_wc *wc;
+	unsigned long flags;
+	u32 head;
+	u32 next;
+
+	spin_lock_irqsave(&cq->lock, flags);
+
+	/*
+	 * Note that the head pointer might be writable by user processes.
+	 * Take care to verify it is a sane value.
+	 */
+	wc = cq->queue;
+	head = wc->head;
+	if (head >= (unsigned) cq->ibcq.cqe) {
+		head = cq->ibcq.cqe;
+		next = 0;
+	} else
+		next = head + 1;
+	if (unlikely(next == wc->tail)) {
+		spin_unlock_irqrestore(&cq->lock, flags);
+		if (cq->ibcq.event_handler) {
+			struct ib_event ev;
+
+			ev.device = cq->ibcq.device;
+			ev.element.cq = &cq->ibcq;
+			ev.event = IB_EVENT_CQ_ERR;
+			cq->ibcq.event_handler(&ev, cq->ibcq.cq_context);
+		}
+		return;
+	}
+	if (cq->ip) {
+		wc->uqueue[head].wr_id = entry->wr_id;
+		wc->uqueue[head].status = entry->status;
+		wc->uqueue[head].opcode = entry->opcode;
+		wc->uqueue[head].vendor_err = entry->vendor_err;
+		wc->uqueue[head].byte_len = entry->byte_len;
+		wc->uqueue[head].ex.imm_data =
+			(__u32 __force)entry->ex.imm_data;
+		wc->uqueue[head].qp_num = entry->qp->qp_num;
+		wc->uqueue[head].src_qp = entry->src_qp;
+		wc->uqueue[head].wc_flags = entry->wc_flags;
+		wc->uqueue[head].pkey_index = entry->pkey_index;
+		wc->uqueue[head].slid = entry->slid;
+		wc->uqueue[head].sl = entry->sl;
+		wc->uqueue[head].dlid_path_bits = entry->dlid_path_bits;
+		wc->uqueue[head].port_num = entry->port_num;
+		/* Make sure entry is written before the head index. */
+		smp_wmb();
+	} else
+		wc->kqueue[head] = *entry;
+	wc->head = next;
+
+	if (cq->notify == IB_CQ_NEXT_COMP ||
+	    (cq->notify == IB_CQ_SOLICITED &&
+	     (solicited || entry->status != IB_WC_SUCCESS))) {
+		struct kthread_worker *worker;
+		/*
+		 * This will cause send_complete() to be called in
+		 * another thread.
+		 */
+		smp_rmb();
+		worker = cq->dd->worker;
+		if (likely(worker)) {
+			cq->notify = IB_CQ_NONE;
+			cq->triggered++;
+			queue_kthread_work(worker, &cq->comptask);
+		}
+	}
+
+	spin_unlock_irqrestore(&cq->lock, flags);
+}
+
+/**
+ * qib_poll_cq - poll for work completion entries
+ * @ibcq: the completion queue to poll
+ * @num_entries: the maximum number of entries to return
+ * @entry: pointer to array where work completions are placed
+ *
+ * Returns the number of completion entries polled.
+ *
+ * This may be called from interrupt context.  Also called by ib_poll_cq()
+ * in the generic verbs code.
+ */
+int qib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry)
+{
+	struct qib_cq *cq = to_icq(ibcq);
+	struct qib_cq_wc *wc;
+	unsigned long flags;
+	int npolled;
+	u32 tail;
+
+	/* The kernel can only poll a kernel completion queue */
+	if (cq->ip) {
+		npolled = -EINVAL;
+		goto bail;
+	}
+
+	spin_lock_irqsave(&cq->lock, flags);
+
+	wc = cq->queue;
+	tail = wc->tail;
+	if (tail > (u32) cq->ibcq.cqe)
+		tail = (u32) cq->ibcq.cqe;
+	for (npolled = 0; npolled < num_entries; ++npolled, ++entry) {
+		if (tail == wc->head)
+			break;
+		/* The kernel doesn't need a RMB since it has the lock. */
+		*entry = wc->kqueue[tail];
+		if (tail >= cq->ibcq.cqe)
+			tail = 0;
+		else
+			tail++;
+	}
+	wc->tail = tail;
+
+	spin_unlock_irqrestore(&cq->lock, flags);
+
+bail:
+	return npolled;
+}
+
+static void send_complete(struct kthread_work *work)
+{
+	struct qib_cq *cq = container_of(work, struct qib_cq, comptask);
+
+	/*
+	 * The completion handler will most likely rearm the notification
+	 * and poll for all pending entries.  If a new completion entry
+	 * is added while we are in this routine, queue_work()
+	 * won't call us again until we return so we check triggered to
+	 * see if we need to call the handler again.
+	 */
+	for (;;) {
+		u8 triggered = cq->triggered;
+
+		/*
+		 * IPoIB connected mode assumes the callback is from a
+		 * soft IRQ. We simulate this by blocking "bottom halves".
+		 * See the implementation for ipoib_cm_handle_tx_wc(),
+		 * netif_tx_lock_bh() and netif_tx_lock().
+		 */
+		local_bh_disable();
+		cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context);
+		local_bh_enable();
+
+		if (cq->triggered == triggered)
+			return;
+	}
+}
+
+/**
+ * qib_create_cq - create a completion queue
+ * @ibdev: the device this completion queue is attached to
+ * @entries: the minimum size of the completion queue
+ * @context: unused by the QLogic_IB driver
+ * @udata: user data for libibverbs.so
+ *
+ * Returns a pointer to the completion queue or negative errno values
+ * for failure.
+ *
+ * Called by ib_create_cq() in the generic verbs code.
+ */
+struct ib_cq *qib_create_cq(struct ib_device *ibdev, int entries,
+			    int comp_vector, struct ib_ucontext *context,
+			    struct ib_udata *udata)
+{
+	struct qib_ibdev *dev = to_idev(ibdev);
+	struct qib_cq *cq;
+	struct qib_cq_wc *wc;
+	struct ib_cq *ret;
+	u32 sz;
+
+	if (entries < 1 || entries > ib_qib_max_cqes) {
+		ret = ERR_PTR(-EINVAL);
+		goto done;
+	}
+
+	/* Allocate the completion queue structure. */
+	cq = kmalloc(sizeof(*cq), GFP_KERNEL);
+	if (!cq) {
+		ret = ERR_PTR(-ENOMEM);
+		goto done;
+	}
+
+	/*
+	 * Allocate the completion queue entries and head/tail pointers.
+	 * This is allocated separately so that it can be resized and
+	 * also mapped into user space.
+	 * We need to use vmalloc() in order to support mmap and large
+	 * numbers of entries.
+	 */
+	sz = sizeof(*wc);
+	if (udata && udata->outlen >= sizeof(__u64))
+		sz += sizeof(struct ib_uverbs_wc) * (entries + 1);
+	else
+		sz += sizeof(struct ib_wc) * (entries + 1);
+	wc = vmalloc_user(sz);
+	if (!wc) {
+		ret = ERR_PTR(-ENOMEM);
+		goto bail_cq;
+	}
+
+	/*
+	 * Return the address of the WC as the offset to mmap.
+	 * See qib_mmap() for details.
+	 */
+	if (udata && udata->outlen >= sizeof(__u64)) {
+		int err;
+
+		cq->ip = qib_create_mmap_info(dev, sz, context, wc);
+		if (!cq->ip) {
+			ret = ERR_PTR(-ENOMEM);
+			goto bail_wc;
+		}
+
+		err = ib_copy_to_udata(udata, &cq->ip->offset,
+				       sizeof(cq->ip->offset));
+		if (err) {
+			ret = ERR_PTR(err);
+			goto bail_ip;
+		}
+	} else
+		cq->ip = NULL;
+
+	spin_lock(&dev->n_cqs_lock);
+	if (dev->n_cqs_allocated == ib_qib_max_cqs) {
+		spin_unlock(&dev->n_cqs_lock);
+		ret = ERR_PTR(-ENOMEM);
+		goto bail_ip;
+	}
+
+	dev->n_cqs_allocated++;
+	spin_unlock(&dev->n_cqs_lock);
+
+	if (cq->ip) {
+		spin_lock_irq(&dev->pending_lock);
+		list_add(&cq->ip->pending_mmaps, &dev->pending_mmaps);
+		spin_unlock_irq(&dev->pending_lock);
+	}
+
+	/*
+	 * ib_create_cq() will initialize cq->ibcq except for cq->ibcq.cqe.
+	 * The number of entries should be >= the number requested or return
+	 * an error.
+	 */
+	cq->dd = dd_from_dev(dev);
+	cq->ibcq.cqe = entries;
+	cq->notify = IB_CQ_NONE;
+	cq->triggered = 0;
+	spin_lock_init(&cq->lock);
+	init_kthread_work(&cq->comptask, send_complete);
+	wc->head = 0;
+	wc->tail = 0;
+	cq->queue = wc;
+
+	ret = &cq->ibcq;
+
+	goto done;
+
+bail_ip:
+	kfree(cq->ip);
+bail_wc:
+	vfree(wc);
+bail_cq:
+	kfree(cq);
+done:
+	return ret;
+}
+
+/**
+ * qib_destroy_cq - destroy a completion queue
+ * @ibcq: the completion queue to destroy.
+ *
+ * Returns 0 for success.
+ *
+ * Called by ib_destroy_cq() in the generic verbs code.
+ */
+int qib_destroy_cq(struct ib_cq *ibcq)
+{
+	struct qib_ibdev *dev = to_idev(ibcq->device);
+	struct qib_cq *cq = to_icq(ibcq);
+
+	flush_kthread_work(&cq->comptask);
+	spin_lock(&dev->n_cqs_lock);
+	dev->n_cqs_allocated--;
+	spin_unlock(&dev->n_cqs_lock);
+	if (cq->ip)
+		kref_put(&cq->ip->ref, qib_release_mmap_info);
+	else
+		vfree(cq->queue);
+	kfree(cq);
+
+	return 0;
+}
+
+/**
+ * qib_req_notify_cq - change the notification type for a completion queue
+ * @ibcq: the completion queue
+ * @notify_flags: the type of notification to request
+ *
+ * Returns 0 for success.
+ *
+ * This may be called from interrupt context.  Also called by
+ * ib_req_notify_cq() in the generic verbs code.
+ */
+int qib_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags)
+{
+	struct qib_cq *cq = to_icq(ibcq);
+	unsigned long flags;
+	int ret = 0;
+
+	spin_lock_irqsave(&cq->lock, flags);
+	/*
+	 * Don't change IB_CQ_NEXT_COMP to IB_CQ_SOLICITED but allow
+	 * any other transitions (see C11-31 and C11-32 in ch. 11.4.2.2).
+	 */
+	if (cq->notify != IB_CQ_NEXT_COMP)
+		cq->notify = notify_flags & IB_CQ_SOLICITED_MASK;
+
+	if ((notify_flags & IB_CQ_REPORT_MISSED_EVENTS) &&
+	    cq->queue->head != cq->queue->tail)
+		ret = 1;
+
+	spin_unlock_irqrestore(&cq->lock, flags);
+
+	return ret;
+}
+
+/**
+ * qib_resize_cq - change the size of the CQ
+ * @ibcq: the completion queue
+ *
+ * Returns 0 for success.
+ */
+int qib_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata)
+{
+	struct qib_cq *cq = to_icq(ibcq);
+	struct qib_cq_wc *old_wc;
+	struct qib_cq_wc *wc;
+	u32 head, tail, n;
+	int ret;
+	u32 sz;
+
+	if (cqe < 1 || cqe > ib_qib_max_cqes) {
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	/*
+	 * Need to use vmalloc() if we want to support large #s of entries.
+	 */
+	sz = sizeof(*wc);
+	if (udata && udata->outlen >= sizeof(__u64))
+		sz += sizeof(struct ib_uverbs_wc) * (cqe + 1);
+	else
+		sz += sizeof(struct ib_wc) * (cqe + 1);
+	wc = vmalloc_user(sz);
+	if (!wc) {
+		ret = -ENOMEM;
+		goto bail;
+	}
+
+	/* Check that we can write the offset to mmap. */
+	if (udata && udata->outlen >= sizeof(__u64)) {
+		__u64 offset = 0;
+
+		ret = ib_copy_to_udata(udata, &offset, sizeof(offset));
+		if (ret)
+			goto bail_free;
+	}
+
+	spin_lock_irq(&cq->lock);
+	/*
+	 * Make sure head and tail are sane since they
+	 * might be user writable.
+	 */
+	old_wc = cq->queue;
+	head = old_wc->head;
+	if (head > (u32) cq->ibcq.cqe)
+		head = (u32) cq->ibcq.cqe;
+	tail = old_wc->tail;
+	if (tail > (u32) cq->ibcq.cqe)
+		tail = (u32) cq->ibcq.cqe;
+	if (head < tail)
+		n = cq->ibcq.cqe + 1 + head - tail;
+	else
+		n = head - tail;
+	if (unlikely((u32)cqe < n)) {
+		ret = -EINVAL;
+		goto bail_unlock;
+	}
+	for (n = 0; tail != head; n++) {
+		if (cq->ip)
+			wc->uqueue[n] = old_wc->uqueue[tail];
+		else
+			wc->kqueue[n] = old_wc->kqueue[tail];
+		if (tail == (u32) cq->ibcq.cqe)
+			tail = 0;
+		else
+			tail++;
+	}
+	cq->ibcq.cqe = cqe;
+	wc->head = n;
+	wc->tail = 0;
+	cq->queue = wc;
+	spin_unlock_irq(&cq->lock);
+
+	vfree(old_wc);
+
+	if (cq->ip) {
+		struct qib_ibdev *dev = to_idev(ibcq->device);
+		struct qib_mmap_info *ip = cq->ip;
+
+		qib_update_mmap_info(dev, ip, sz, wc);
+
+		/*
+		 * Return the offset to mmap.
+		 * See qib_mmap() for details.
+		 */
+		if (udata && udata->outlen >= sizeof(__u64)) {
+			ret = ib_copy_to_udata(udata, &ip->offset,
+					       sizeof(ip->offset));
+			if (ret)
+				goto bail;
+		}
+
+		spin_lock_irq(&dev->pending_lock);
+		if (list_empty(&ip->pending_mmaps))
+			list_add(&ip->pending_mmaps, &dev->pending_mmaps);
+		spin_unlock_irq(&dev->pending_lock);
+	}
+
+	ret = 0;
+	goto bail;
+
+bail_unlock:
+	spin_unlock_irq(&cq->lock);
+bail_free:
+	vfree(wc);
+bail:
+	return ret;
+}
+
+int qib_cq_init(struct qib_devdata *dd)
+{
+	int ret = 0;
+	int cpu;
+	struct task_struct *task;
+
+	if (dd->worker)
+		return 0;
+	dd->worker = kzalloc(sizeof(*dd->worker), GFP_KERNEL);
+	if (!dd->worker)
+		return -ENOMEM;
+	init_kthread_worker(dd->worker);
+	task = kthread_create_on_node(
+		kthread_worker_fn,
+		dd->worker,
+		dd->assigned_node_id,
+		"qib_cq%d", dd->unit);
+	if (IS_ERR(task))
+		goto task_fail;
+	cpu = cpumask_first(cpumask_of_node(dd->assigned_node_id));
+	kthread_bind(task, cpu);
+	wake_up_process(task);
+out:
+	return ret;
+task_fail:
+	ret = PTR_ERR(task);
+	kfree(dd->worker);
+	dd->worker = NULL;
+	goto out;
+}
+
+void qib_cq_exit(struct qib_devdata *dd)
+{
+	struct kthread_worker *worker;
+
+	worker = dd->worker;
+	if (!worker)
+		return;
+	/* blocks future queuing from send_complete() */
+	dd->worker = NULL;
+	smp_wmb();
+	flush_kthread_worker(worker);
+	kthread_stop(worker->task);
+	kfree(worker);
+}
diff --git a/drivers/infiniband/hw/qib/qib_debugfs.c b/drivers/infiniband/hw/qib/qib_debugfs.c
new file mode 100644
index 0000000..6abd3ed
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_debugfs.c
@@ -0,0 +1,284 @@
+#ifdef CONFIG_DEBUG_FS
+/*
+ * Copyright (c) 2013 Intel Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <linux/kernel.h>
+#include <linux/export.h>
+
+#include "qib.h"
+#include "qib_verbs.h"
+#include "qib_debugfs.h"
+
+static struct dentry *qib_dbg_root;
+
+#define DEBUGFS_FILE(name) \
+static const struct seq_operations _##name##_seq_ops = { \
+	.start = _##name##_seq_start, \
+	.next  = _##name##_seq_next, \
+	.stop  = _##name##_seq_stop, \
+	.show  = _##name##_seq_show \
+}; \
+static int _##name##_open(struct inode *inode, struct file *s) \
+{ \
+	struct seq_file *seq; \
+	int ret; \
+	ret =  seq_open(s, &_##name##_seq_ops); \
+	if (ret) \
+		return ret; \
+	seq = s->private_data; \
+	seq->private = inode->i_private; \
+	return 0; \
+} \
+static const struct file_operations _##name##_file_ops = { \
+	.owner   = THIS_MODULE, \
+	.open    = _##name##_open, \
+	.read    = seq_read, \
+	.llseek  = seq_lseek, \
+	.release = seq_release \
+};
+
+#define DEBUGFS_FILE_CREATE(name) \
+do { \
+	struct dentry *ent; \
+	ent = debugfs_create_file(#name , 0400, ibd->qib_ibdev_dbg, \
+		ibd, &_##name##_file_ops); \
+	if (!ent) \
+		pr_warn("create of " #name " failed\n"); \
+} while (0)
+
+static void *_opcode_stats_seq_start(struct seq_file *s, loff_t *pos)
+{
+	struct qib_opcode_stats_perctx *opstats;
+
+	if (*pos >= ARRAY_SIZE(opstats->stats))
+		return NULL;
+	return pos;
+}
+
+static void *_opcode_stats_seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+	struct qib_opcode_stats_perctx *opstats;
+
+	++*pos;
+	if (*pos >= ARRAY_SIZE(opstats->stats))
+		return NULL;
+	return pos;
+}
+
+
+static void _opcode_stats_seq_stop(struct seq_file *s, void *v)
+{
+	/* nothing allocated */
+}
+
+static int _opcode_stats_seq_show(struct seq_file *s, void *v)
+{
+	loff_t *spos = v;
+	loff_t i = *spos, j;
+	u64 n_packets = 0, n_bytes = 0;
+	struct qib_ibdev *ibd = (struct qib_ibdev *)s->private;
+	struct qib_devdata *dd = dd_from_dev(ibd);
+
+	for (j = 0; j < dd->first_user_ctxt; j++) {
+		if (!dd->rcd[j])
+			continue;
+		n_packets += dd->rcd[j]->opstats->stats[i].n_packets;
+		n_bytes += dd->rcd[j]->opstats->stats[i].n_bytes;
+	}
+	if (!n_packets && !n_bytes)
+		return SEQ_SKIP;
+	seq_printf(s, "%02llx %llu/%llu\n", i,
+		(unsigned long long) n_packets,
+		(unsigned long long) n_bytes);
+
+	return 0;
+}
+
+DEBUGFS_FILE(opcode_stats)
+
+static void *_ctx_stats_seq_start(struct seq_file *s, loff_t *pos)
+{
+	struct qib_ibdev *ibd = (struct qib_ibdev *)s->private;
+	struct qib_devdata *dd = dd_from_dev(ibd);
+
+	if (!*pos)
+		return SEQ_START_TOKEN;
+	if (*pos >= dd->first_user_ctxt)
+		return NULL;
+	return pos;
+}
+
+static void *_ctx_stats_seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+	struct qib_ibdev *ibd = (struct qib_ibdev *)s->private;
+	struct qib_devdata *dd = dd_from_dev(ibd);
+
+	if (v == SEQ_START_TOKEN)
+		return pos;
+
+	++*pos;
+	if (*pos >= dd->first_user_ctxt)
+		return NULL;
+	return pos;
+}
+
+static void _ctx_stats_seq_stop(struct seq_file *s, void *v)
+{
+	/* nothing allocated */
+}
+
+static int _ctx_stats_seq_show(struct seq_file *s, void *v)
+{
+	loff_t *spos;
+	loff_t i, j;
+	u64 n_packets = 0;
+	struct qib_ibdev *ibd = (struct qib_ibdev *)s->private;
+	struct qib_devdata *dd = dd_from_dev(ibd);
+
+	if (v == SEQ_START_TOKEN) {
+		seq_puts(s, "Ctx:npkts\n");
+		return 0;
+	}
+
+	spos = v;
+	i = *spos;
+
+	if (!dd->rcd[i])
+		return SEQ_SKIP;
+
+	for (j = 0; j < ARRAY_SIZE(dd->rcd[i]->opstats->stats); j++)
+		n_packets += dd->rcd[i]->opstats->stats[j].n_packets;
+
+	if (!n_packets)
+		return SEQ_SKIP;
+
+	seq_printf(s, "  %llu:%llu\n", i, n_packets);
+	return 0;
+}
+
+DEBUGFS_FILE(ctx_stats)
+
+static void *_qp_stats_seq_start(struct seq_file *s, loff_t *pos)
+{
+	struct qib_qp_iter *iter;
+	loff_t n = *pos;
+
+	rcu_read_lock();
+	iter = qib_qp_iter_init(s->private);
+	if (!iter)
+		return NULL;
+
+	while (n--) {
+		if (qib_qp_iter_next(iter)) {
+			kfree(iter);
+			return NULL;
+		}
+	}
+
+	return iter;
+}
+
+static void *_qp_stats_seq_next(struct seq_file *s, void *iter_ptr,
+				   loff_t *pos)
+{
+	struct qib_qp_iter *iter = iter_ptr;
+
+	(*pos)++;
+
+	if (qib_qp_iter_next(iter)) {
+		kfree(iter);
+		return NULL;
+	}
+
+	return iter;
+}
+
+static void _qp_stats_seq_stop(struct seq_file *s, void *iter_ptr)
+{
+	rcu_read_unlock();
+}
+
+static int _qp_stats_seq_show(struct seq_file *s, void *iter_ptr)
+{
+	struct qib_qp_iter *iter = iter_ptr;
+
+	if (!iter)
+		return 0;
+
+	qib_qp_iter_print(s, iter);
+
+	return 0;
+}
+
+DEBUGFS_FILE(qp_stats)
+
+void qib_dbg_ibdev_init(struct qib_ibdev *ibd)
+{
+	char name[10];
+
+	snprintf(name, sizeof(name), "qib%d", dd_from_dev(ibd)->unit);
+	ibd->qib_ibdev_dbg = debugfs_create_dir(name, qib_dbg_root);
+	if (!ibd->qib_ibdev_dbg) {
+		pr_warn("create of %s failed\n", name);
+		return;
+	}
+	DEBUGFS_FILE_CREATE(opcode_stats);
+	DEBUGFS_FILE_CREATE(ctx_stats);
+	DEBUGFS_FILE_CREATE(qp_stats);
+	return;
+}
+
+void qib_dbg_ibdev_exit(struct qib_ibdev *ibd)
+{
+	if (!qib_dbg_root)
+		goto out;
+	debugfs_remove_recursive(ibd->qib_ibdev_dbg);
+out:
+	ibd->qib_ibdev_dbg = NULL;
+}
+
+void qib_dbg_init(void)
+{
+	qib_dbg_root = debugfs_create_dir(QIB_DRV_NAME, NULL);
+	if (!qib_dbg_root)
+		pr_warn("init of debugfs failed\n");
+}
+
+void qib_dbg_exit(void)
+{
+	debugfs_remove_recursive(qib_dbg_root);
+	qib_dbg_root = NULL;
+}
+
+#endif
+
diff --git a/drivers/infiniband/hw/qib/qib_debugfs.h b/drivers/infiniband/hw/qib/qib_debugfs.h
new file mode 100644
index 0000000..7ae983a
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_debugfs.h
@@ -0,0 +1,45 @@
+#ifndef _QIB_DEBUGFS_H
+#define _QIB_DEBUGFS_H
+
+#ifdef CONFIG_DEBUG_FS
+/*
+ * Copyright (c) 2013 Intel Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+struct qib_ibdev;
+void qib_dbg_ibdev_init(struct qib_ibdev *ibd);
+void qib_dbg_ibdev_exit(struct qib_ibdev *ibd);
+void qib_dbg_init(void);
+void qib_dbg_exit(void);
+
+#endif
+
+#endif                          /* _QIB_DEBUGFS_H */
diff --git a/drivers/infiniband/hw/qib/qib_diag.c b/drivers/infiniband/hw/qib/qib_diag.c
new file mode 100644
index 0000000..5dfda4c
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_diag.c
@@ -0,0 +1,911 @@
+/*
+ * Copyright (c) 2012 Intel Corporation. All rights reserved.
+ * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * This file contains support for diagnostic functions.  It is accessed by
+ * opening the qib_diag device, normally minor number 129.  Diagnostic use
+ * of the QLogic_IB chip may render the chip or board unusable until the
+ * driver is unloaded, or in some cases, until the system is rebooted.
+ *
+ * Accesses to the chip through this interface are not similar to going
+ * through the /sys/bus/pci resource mmap interface.
+ */
+
+#include <linux/io.h>
+#include <linux/pci.h>
+#include <linux/poll.h>
+#include <linux/vmalloc.h>
+#include <linux/export.h>
+#include <linux/fs.h>
+#include <linux/uaccess.h>
+
+#include "qib.h"
+#include "qib_common.h"
+
+#undef pr_fmt
+#define pr_fmt(fmt) QIB_DRV_NAME ": " fmt
+
+/*
+ * Each client that opens the diag device must read then write
+ * offset 0, to prevent lossage from random cat or od. diag_state
+ * sequences this "handshake".
+ */
+enum diag_state { UNUSED = 0, OPENED, INIT, READY };
+
+/* State for an individual client. PID so children cannot abuse handshake */
+static struct qib_diag_client {
+	struct qib_diag_client *next;
+	struct qib_devdata *dd;
+	pid_t pid;
+	enum diag_state state;
+} *client_pool;
+
+/*
+ * Get a client struct. Recycled if possible, else kmalloc.
+ * Must be called with qib_mutex held
+ */
+static struct qib_diag_client *get_client(struct qib_devdata *dd)
+{
+	struct qib_diag_client *dc;
+
+	dc = client_pool;
+	if (dc)
+		/* got from pool remove it and use */
+		client_pool = dc->next;
+	else
+		/* None in pool, alloc and init */
+		dc = kmalloc(sizeof *dc, GFP_KERNEL);
+
+	if (dc) {
+		dc->next = NULL;
+		dc->dd = dd;
+		dc->pid = current->pid;
+		dc->state = OPENED;
+	}
+	return dc;
+}
+
+/*
+ * Return to pool. Must be called with qib_mutex held
+ */
+static void return_client(struct qib_diag_client *dc)
+{
+	struct qib_devdata *dd = dc->dd;
+	struct qib_diag_client *tdc, *rdc;
+
+	rdc = NULL;
+	if (dc == dd->diag_client) {
+		dd->diag_client = dc->next;
+		rdc = dc;
+	} else {
+		tdc = dc->dd->diag_client;
+		while (tdc) {
+			if (dc == tdc->next) {
+				tdc->next = dc->next;
+				rdc = dc;
+				break;
+			}
+			tdc = tdc->next;
+		}
+	}
+	if (rdc) {
+		rdc->state = UNUSED;
+		rdc->dd = NULL;
+		rdc->pid = 0;
+		rdc->next = client_pool;
+		client_pool = rdc;
+	}
+}
+
+static int qib_diag_open(struct inode *in, struct file *fp);
+static int qib_diag_release(struct inode *in, struct file *fp);
+static ssize_t qib_diag_read(struct file *fp, char __user *data,
+			     size_t count, loff_t *off);
+static ssize_t qib_diag_write(struct file *fp, const char __user *data,
+			      size_t count, loff_t *off);
+
+static const struct file_operations diag_file_ops = {
+	.owner = THIS_MODULE,
+	.write = qib_diag_write,
+	.read = qib_diag_read,
+	.open = qib_diag_open,
+	.release = qib_diag_release,
+	.llseek = default_llseek,
+};
+
+static atomic_t diagpkt_count = ATOMIC_INIT(0);
+static struct cdev *diagpkt_cdev;
+static struct device *diagpkt_device;
+
+static ssize_t qib_diagpkt_write(struct file *fp, const char __user *data,
+				 size_t count, loff_t *off);
+
+static const struct file_operations diagpkt_file_ops = {
+	.owner = THIS_MODULE,
+	.write = qib_diagpkt_write,
+	.llseek = noop_llseek,
+};
+
+int qib_diag_add(struct qib_devdata *dd)
+{
+	char name[16];
+	int ret = 0;
+
+	if (atomic_inc_return(&diagpkt_count) == 1) {
+		ret = qib_cdev_init(QIB_DIAGPKT_MINOR, "ipath_diagpkt",
+				    &diagpkt_file_ops, &diagpkt_cdev,
+				    &diagpkt_device);
+		if (ret)
+			goto done;
+	}
+
+	snprintf(name, sizeof(name), "ipath_diag%d", dd->unit);
+	ret = qib_cdev_init(QIB_DIAG_MINOR_BASE + dd->unit, name,
+			    &diag_file_ops, &dd->diag_cdev,
+			    &dd->diag_device);
+done:
+	return ret;
+}
+
+static void qib_unregister_observers(struct qib_devdata *dd);
+
+void qib_diag_remove(struct qib_devdata *dd)
+{
+	struct qib_diag_client *dc;
+
+	if (atomic_dec_and_test(&diagpkt_count))
+		qib_cdev_cleanup(&diagpkt_cdev, &diagpkt_device);
+
+	qib_cdev_cleanup(&dd->diag_cdev, &dd->diag_device);
+
+	/*
+	 * Return all diag_clients of this device. There should be none,
+	 * as we are "guaranteed" that no clients are still open
+	 */
+	while (dd->diag_client)
+		return_client(dd->diag_client);
+
+	/* Now clean up all unused client structs */
+	while (client_pool) {
+		dc = client_pool;
+		client_pool = dc->next;
+		kfree(dc);
+	}
+	/* Clean up observer list */
+	qib_unregister_observers(dd);
+}
+
+/* qib_remap_ioaddr32 - remap an offset into chip address space to __iomem *
+ *
+ * @dd: the qlogic_ib device
+ * @offs: the offset in chip-space
+ * @cntp: Pointer to max (byte) count for transfer starting at offset
+ * This returns a u32 __iomem * so it can be used for both 64 and 32-bit
+ * mapping. It is needed because with the use of PAT for control of
+ * write-combining, the logically contiguous address-space of the chip
+ * may be split into virtually non-contiguous spaces, with different
+ * attributes, which are them mapped to contiguous physical space
+ * based from the first BAR.
+ *
+ * The code below makes the same assumptions as were made in
+ * init_chip_wc_pat() (qib_init.c), copied here:
+ * Assumes chip address space looks like:
+ *		- kregs + sregs + cregs + uregs (in any order)
+ *		- piobufs (2K and 4K bufs in either order)
+ *	or:
+ *		- kregs + sregs + cregs (in any order)
+ *		- piobufs (2K and 4K bufs in either order)
+ *		- uregs
+ *
+ * If cntp is non-NULL, returns how many bytes from offset can be accessed
+ * Returns 0 if the offset is not mapped.
+ */
+static u32 __iomem *qib_remap_ioaddr32(struct qib_devdata *dd, u32 offset,
+				       u32 *cntp)
+{
+	u32 kreglen;
+	u32 snd_bottom, snd_lim = 0;
+	u32 __iomem *krb32 = (u32 __iomem *)dd->kregbase;
+	u32 __iomem *map = NULL;
+	u32 cnt = 0;
+	u32 tot4k, offs4k;
+
+	/* First, simplest case, offset is within the first map. */
+	kreglen = (dd->kregend - dd->kregbase) * sizeof(u64);
+	if (offset < kreglen) {
+		map = krb32 + (offset / sizeof(u32));
+		cnt = kreglen - offset;
+		goto mapped;
+	}
+
+	/*
+	 * Next check for user regs, the next most common case,
+	 * and a cheap check because if they are not in the first map
+	 * they are last in chip.
+	 */
+	if (dd->userbase) {
+		/* If user regs mapped, they are after send, so set limit. */
+		u32 ulim = (dd->cfgctxts * dd->ureg_align) + dd->uregbase;
+		if (!dd->piovl15base)
+			snd_lim = dd->uregbase;
+		krb32 = (u32 __iomem *)dd->userbase;
+		if (offset >= dd->uregbase && offset < ulim) {
+			map = krb32 + (offset - dd->uregbase) / sizeof(u32);
+			cnt = ulim - offset;
+			goto mapped;
+		}
+	}
+
+	/*
+	 * Lastly, check for offset within Send Buffers.
+	 * This is gnarly because struct devdata is deliberately vague
+	 * about things like 7322 VL15 buffers, and we are not in
+	 * chip-specific code here, so should not make many assumptions.
+	 * The one we _do_ make is that the only chip that has more sndbufs
+	 * than we admit is the 7322, and it has userregs above that, so
+	 * we know the snd_lim.
+	 */
+	/* Assume 2K buffers are first. */
+	snd_bottom = dd->pio2k_bufbase;
+	if (snd_lim == 0) {
+		u32 tot2k = dd->piobcnt2k * ALIGN(dd->piosize2k, dd->palign);
+		snd_lim = snd_bottom + tot2k;
+	}
+	/* If 4k buffers exist, account for them by bumping
+	 * appropriate limit.
+	 */
+	tot4k = dd->piobcnt4k * dd->align4k;
+	offs4k = dd->piobufbase >> 32;
+	if (dd->piobcnt4k) {
+		if (snd_bottom > offs4k)
+			snd_bottom = offs4k;
+		else {
+			/* 4k above 2k. Bump snd_lim, if needed*/
+			if (!dd->userbase || dd->piovl15base)
+				snd_lim = offs4k + tot4k;
+		}
+	}
+	/*
+	 * Judgement call: can we ignore the space between SendBuffs and
+	 * UserRegs, where we would like to see vl15 buffs, but not more?
+	 */
+	if (offset >= snd_bottom && offset < snd_lim) {
+		offset -= snd_bottom;
+		map = (u32 __iomem *)dd->piobase + (offset / sizeof(u32));
+		cnt = snd_lim - offset;
+	}
+
+	if (!map && offs4k && dd->piovl15base) {
+		snd_lim = offs4k + tot4k + 2 * dd->align4k;
+		if (offset >= (offs4k + tot4k) && offset < snd_lim) {
+			map = (u32 __iomem *)dd->piovl15base +
+				((offset - (offs4k + tot4k)) / sizeof(u32));
+			cnt = snd_lim - offset;
+		}
+	}
+
+mapped:
+	if (cntp)
+		*cntp = cnt;
+	return map;
+}
+
+/*
+ * qib_read_umem64 - read a 64-bit quantity from the chip into user space
+ * @dd: the qlogic_ib device
+ * @uaddr: the location to store the data in user memory
+ * @regoffs: the offset from BAR0 (_NOT_ full pointer, anymore)
+ * @count: number of bytes to copy (multiple of 32 bits)
+ *
+ * This function also localizes all chip memory accesses.
+ * The copy should be written such that we read full cacheline packets
+ * from the chip.  This is usually used for a single qword
+ *
+ * NOTE:  This assumes the chip address is 64-bit aligned.
+ */
+static int qib_read_umem64(struct qib_devdata *dd, void __user *uaddr,
+			   u32 regoffs, size_t count)
+{
+	const u64 __iomem *reg_addr;
+	const u64 __iomem *reg_end;
+	u32 limit;
+	int ret;
+
+	reg_addr = (const u64 __iomem *)qib_remap_ioaddr32(dd, regoffs, &limit);
+	if (reg_addr == NULL || limit == 0 || !(dd->flags & QIB_PRESENT)) {
+		ret = -EINVAL;
+		goto bail;
+	}
+	if (count >= limit)
+		count = limit;
+	reg_end = reg_addr + (count / sizeof(u64));
+
+	/* not very efficient, but it works for now */
+	while (reg_addr < reg_end) {
+		u64 data = readq(reg_addr);
+
+		if (copy_to_user(uaddr, &data, sizeof(u64))) {
+			ret = -EFAULT;
+			goto bail;
+		}
+		reg_addr++;
+		uaddr += sizeof(u64);
+	}
+	ret = 0;
+bail:
+	return ret;
+}
+
+/*
+ * qib_write_umem64 - write a 64-bit quantity to the chip from user space
+ * @dd: the qlogic_ib device
+ * @regoffs: the offset from BAR0 (_NOT_ full pointer, anymore)
+ * @uaddr: the source of the data in user memory
+ * @count: the number of bytes to copy (multiple of 32 bits)
+ *
+ * This is usually used for a single qword
+ * NOTE:  This assumes the chip address is 64-bit aligned.
+ */
+
+static int qib_write_umem64(struct qib_devdata *dd, u32 regoffs,
+			    const void __user *uaddr, size_t count)
+{
+	u64 __iomem *reg_addr;
+	const u64 __iomem *reg_end;
+	u32 limit;
+	int ret;
+
+	reg_addr = (u64 __iomem *)qib_remap_ioaddr32(dd, regoffs, &limit);
+	if (reg_addr == NULL || limit == 0 || !(dd->flags & QIB_PRESENT)) {
+		ret = -EINVAL;
+		goto bail;
+	}
+	if (count >= limit)
+		count = limit;
+	reg_end = reg_addr + (count / sizeof(u64));
+
+	/* not very efficient, but it works for now */
+	while (reg_addr < reg_end) {
+		u64 data;
+		if (copy_from_user(&data, uaddr, sizeof(data))) {
+			ret = -EFAULT;
+			goto bail;
+		}
+		writeq(data, reg_addr);
+
+		reg_addr++;
+		uaddr += sizeof(u64);
+	}
+	ret = 0;
+bail:
+	return ret;
+}
+
+/*
+ * qib_read_umem32 - read a 32-bit quantity from the chip into user space
+ * @dd: the qlogic_ib device
+ * @uaddr: the location to store the data in user memory
+ * @regoffs: the offset from BAR0 (_NOT_ full pointer, anymore)
+ * @count: number of bytes to copy
+ *
+ * read 32 bit values, not 64 bit; for memories that only
+ * support 32 bit reads; usually a single dword.
+ */
+static int qib_read_umem32(struct qib_devdata *dd, void __user *uaddr,
+			   u32 regoffs, size_t count)
+{
+	const u32 __iomem *reg_addr;
+	const u32 __iomem *reg_end;
+	u32 limit;
+	int ret;
+
+	reg_addr = qib_remap_ioaddr32(dd, regoffs, &limit);
+	if (reg_addr == NULL || limit == 0 || !(dd->flags & QIB_PRESENT)) {
+		ret = -EINVAL;
+		goto bail;
+	}
+	if (count >= limit)
+		count = limit;
+	reg_end = reg_addr + (count / sizeof(u32));
+
+	/* not very efficient, but it works for now */
+	while (reg_addr < reg_end) {
+		u32 data = readl(reg_addr);
+
+		if (copy_to_user(uaddr, &data, sizeof(data))) {
+			ret = -EFAULT;
+			goto bail;
+		}
+
+		reg_addr++;
+		uaddr += sizeof(u32);
+
+	}
+	ret = 0;
+bail:
+	return ret;
+}
+
+/*
+ * qib_write_umem32 - write a 32-bit quantity to the chip from user space
+ * @dd: the qlogic_ib device
+ * @regoffs: the offset from BAR0 (_NOT_ full pointer, anymore)
+ * @uaddr: the source of the data in user memory
+ * @count: number of bytes to copy
+ *
+ * write 32 bit values, not 64 bit; for memories that only
+ * support 32 bit write; usually a single dword.
+ */
+
+static int qib_write_umem32(struct qib_devdata *dd, u32 regoffs,
+			    const void __user *uaddr, size_t count)
+{
+	u32 __iomem *reg_addr;
+	const u32 __iomem *reg_end;
+	u32 limit;
+	int ret;
+
+	reg_addr = qib_remap_ioaddr32(dd, regoffs, &limit);
+	if (reg_addr == NULL || limit == 0 || !(dd->flags & QIB_PRESENT)) {
+		ret = -EINVAL;
+		goto bail;
+	}
+	if (count >= limit)
+		count = limit;
+	reg_end = reg_addr + (count / sizeof(u32));
+
+	while (reg_addr < reg_end) {
+		u32 data;
+
+		if (copy_from_user(&data, uaddr, sizeof(data))) {
+			ret = -EFAULT;
+			goto bail;
+		}
+		writel(data, reg_addr);
+
+		reg_addr++;
+		uaddr += sizeof(u32);
+	}
+	ret = 0;
+bail:
+	return ret;
+}
+
+static int qib_diag_open(struct inode *in, struct file *fp)
+{
+	int unit = iminor(in) - QIB_DIAG_MINOR_BASE;
+	struct qib_devdata *dd;
+	struct qib_diag_client *dc;
+	int ret;
+
+	mutex_lock(&qib_mutex);
+
+	dd = qib_lookup(unit);
+
+	if (dd == NULL || !(dd->flags & QIB_PRESENT) ||
+	    !dd->kregbase) {
+		ret = -ENODEV;
+		goto bail;
+	}
+
+	dc = get_client(dd);
+	if (!dc) {
+		ret = -ENOMEM;
+		goto bail;
+	}
+	dc->next = dd->diag_client;
+	dd->diag_client = dc;
+	fp->private_data = dc;
+	ret = 0;
+bail:
+	mutex_unlock(&qib_mutex);
+
+	return ret;
+}
+
+/**
+ * qib_diagpkt_write - write an IB packet
+ * @fp: the diag data device file pointer
+ * @data: qib_diag_pkt structure saying where to get the packet
+ * @count: size of data to write
+ * @off: unused by this code
+ */
+static ssize_t qib_diagpkt_write(struct file *fp,
+				 const char __user *data,
+				 size_t count, loff_t *off)
+{
+	u32 __iomem *piobuf;
+	u32 plen, pbufn, maxlen_reserve;
+	struct qib_diag_xpkt dp;
+	u32 *tmpbuf = NULL;
+	struct qib_devdata *dd;
+	struct qib_pportdata *ppd;
+	ssize_t ret = 0;
+
+	if (count != sizeof(dp)) {
+		ret = -EINVAL;
+		goto bail;
+	}
+	if (copy_from_user(&dp, data, sizeof(dp))) {
+		ret = -EFAULT;
+		goto bail;
+	}
+
+	dd = qib_lookup(dp.unit);
+	if (!dd || !(dd->flags & QIB_PRESENT) || !dd->kregbase) {
+		ret = -ENODEV;
+		goto bail;
+	}
+	if (!(dd->flags & QIB_INITTED)) {
+		/* no hardware, freeze, etc. */
+		ret = -ENODEV;
+		goto bail;
+	}
+
+	if (dp.version != _DIAG_XPKT_VERS) {
+		qib_dev_err(dd, "Invalid version %u for diagpkt_write\n",
+			    dp.version);
+		ret = -EINVAL;
+		goto bail;
+	}
+	/* send count must be an exact number of dwords */
+	if (dp.len & 3) {
+		ret = -EINVAL;
+		goto bail;
+	}
+	if (!dp.port || dp.port > dd->num_pports) {
+		ret = -EINVAL;
+		goto bail;
+	}
+	ppd = &dd->pport[dp.port - 1];
+
+	/*
+	 * need total length before first word written, plus 2 Dwords. One Dword
+	 * is for padding so we get the full user data when not aligned on
+	 * a word boundary. The other Dword is to make sure we have room for the
+	 * ICRC which gets tacked on later.
+	 */
+	maxlen_reserve = 2 * sizeof(u32);
+	if (dp.len > ppd->ibmaxlen - maxlen_reserve) {
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	plen = sizeof(u32) + dp.len;
+
+	tmpbuf = vmalloc(plen);
+	if (!tmpbuf) {
+		qib_devinfo(dd->pcidev,
+			"Unable to allocate tmp buffer, failing\n");
+		ret = -ENOMEM;
+		goto bail;
+	}
+
+	if (copy_from_user(tmpbuf,
+			   (const void __user *) (unsigned long) dp.data,
+			   dp.len)) {
+		ret = -EFAULT;
+		goto bail;
+	}
+
+	plen >>= 2;             /* in dwords */
+
+	if (dp.pbc_wd == 0)
+		dp.pbc_wd = plen;
+
+	piobuf = dd->f_getsendbuf(ppd, dp.pbc_wd, &pbufn);
+	if (!piobuf) {
+		ret = -EBUSY;
+		goto bail;
+	}
+	/* disarm it just to be extra sure */
+	dd->f_sendctrl(dd->pport, QIB_SENDCTRL_DISARM_BUF(pbufn));
+
+	/* disable header check on pbufn for this packet */
+	dd->f_txchk_change(dd, pbufn, 1, TXCHK_CHG_TYPE_DIS1, NULL);
+
+	writeq(dp.pbc_wd, piobuf);
+	/*
+	 * Copy all but the trigger word, then flush, so it's written
+	 * to chip before trigger word, then write trigger word, then
+	 * flush again, so packet is sent.
+	 */
+	if (dd->flags & QIB_PIO_FLUSH_WC) {
+		qib_flush_wc();
+		qib_pio_copy(piobuf + 2, tmpbuf, plen - 1);
+		qib_flush_wc();
+		__raw_writel(tmpbuf[plen - 1], piobuf + plen + 1);
+	} else
+		qib_pio_copy(piobuf + 2, tmpbuf, plen);
+
+	if (dd->flags & QIB_USE_SPCL_TRIG) {
+		u32 spcl_off = (pbufn >= dd->piobcnt2k) ? 2047 : 1023;
+
+		qib_flush_wc();
+		__raw_writel(0xaebecede, piobuf + spcl_off);
+	}
+
+	/*
+	 * Ensure buffer is written to the chip, then re-enable
+	 * header checks (if supported by chip).  The txchk
+	 * code will ensure seen by chip before returning.
+	 */
+	qib_flush_wc();
+	qib_sendbuf_done(dd, pbufn);
+	dd->f_txchk_change(dd, pbufn, 1, TXCHK_CHG_TYPE_ENAB1, NULL);
+
+	ret = sizeof(dp);
+
+bail:
+	vfree(tmpbuf);
+	return ret;
+}
+
+static int qib_diag_release(struct inode *in, struct file *fp)
+{
+	mutex_lock(&qib_mutex);
+	return_client(fp->private_data);
+	fp->private_data = NULL;
+	mutex_unlock(&qib_mutex);
+	return 0;
+}
+
+/*
+ * Chip-specific code calls to register its interest in
+ * a specific range.
+ */
+struct diag_observer_list_elt {
+	struct diag_observer_list_elt *next;
+	const struct diag_observer *op;
+};
+
+int qib_register_observer(struct qib_devdata *dd,
+			  const struct diag_observer *op)
+{
+	struct diag_observer_list_elt *olp;
+	unsigned long flags;
+
+	if (!dd || !op)
+		return -EINVAL;
+	olp = vmalloc(sizeof *olp);
+	if (!olp) {
+		pr_err("vmalloc for observer failed\n");
+		return -ENOMEM;
+	}
+
+	spin_lock_irqsave(&dd->qib_diag_trans_lock, flags);
+	olp->op = op;
+	olp->next = dd->diag_observer_list;
+	dd->diag_observer_list = olp;
+	spin_unlock_irqrestore(&dd->qib_diag_trans_lock, flags);
+
+	return 0;
+}
+
+/* Remove all registered observers when device is closed */
+static void qib_unregister_observers(struct qib_devdata *dd)
+{
+	struct diag_observer_list_elt *olp;
+	unsigned long flags;
+
+	spin_lock_irqsave(&dd->qib_diag_trans_lock, flags);
+	olp = dd->diag_observer_list;
+	while (olp) {
+		/* Pop one observer, let go of lock */
+		dd->diag_observer_list = olp->next;
+		spin_unlock_irqrestore(&dd->qib_diag_trans_lock, flags);
+		vfree(olp);
+		/* try again. */
+		spin_lock_irqsave(&dd->qib_diag_trans_lock, flags);
+		olp = dd->diag_observer_list;
+	}
+	spin_unlock_irqrestore(&dd->qib_diag_trans_lock, flags);
+}
+
+/*
+ * Find the observer, if any, for the specified address. Initial implementation
+ * is simple stack of observers. This must be called with diag transaction
+ * lock held.
+ */
+static const struct diag_observer *diag_get_observer(struct qib_devdata *dd,
+						     u32 addr)
+{
+	struct diag_observer_list_elt *olp;
+	const struct diag_observer *op = NULL;
+
+	olp = dd->diag_observer_list;
+	while (olp) {
+		op = olp->op;
+		if (addr >= op->bottom && addr <= op->top)
+			break;
+		olp = olp->next;
+	}
+	if (!olp)
+		op = NULL;
+
+	return op;
+}
+
+static ssize_t qib_diag_read(struct file *fp, char __user *data,
+			     size_t count, loff_t *off)
+{
+	struct qib_diag_client *dc = fp->private_data;
+	struct qib_devdata *dd = dc->dd;
+	void __iomem *kreg_base;
+	ssize_t ret;
+
+	if (dc->pid != current->pid) {
+		ret = -EPERM;
+		goto bail;
+	}
+
+	kreg_base = dd->kregbase;
+
+	if (count == 0)
+		ret = 0;
+	else if ((count % 4) || (*off % 4))
+		/* address or length is not 32-bit aligned, hence invalid */
+		ret = -EINVAL;
+	else if (dc->state < READY && (*off || count != 8))
+		ret = -EINVAL;  /* prevent cat /dev/qib_diag* */
+	else {
+		unsigned long flags;
+		u64 data64 = 0;
+		int use_32;
+		const struct diag_observer *op;
+
+		use_32 = (count % 8) || (*off % 8);
+		ret = -1;
+		spin_lock_irqsave(&dd->qib_diag_trans_lock, flags);
+		/*
+		 * Check for observer on this address range.
+		 * we only support a single 32 or 64-bit read
+		 * via observer, currently.
+		 */
+		op = diag_get_observer(dd, *off);
+		if (op) {
+			u32 offset = *off;
+			ret = op->hook(dd, op, offset, &data64, 0, use_32);
+		}
+		/*
+		 * We need to release lock before any copy_to_user(),
+		 * whether implicit in qib_read_umem* or explicit below.
+		 */
+		spin_unlock_irqrestore(&dd->qib_diag_trans_lock, flags);
+		if (!op) {
+			if (use_32)
+				/*
+				 * Address or length is not 64-bit aligned;
+				 * do 32-bit rd
+				 */
+				ret = qib_read_umem32(dd, data, (u32) *off,
+						      count);
+			else
+				ret = qib_read_umem64(dd, data, (u32) *off,
+						      count);
+		} else if (ret == count) {
+			/* Below finishes case where observer existed */
+			ret = copy_to_user(data, &data64, use_32 ?
+					   sizeof(u32) : sizeof(u64));
+			if (ret)
+				ret = -EFAULT;
+		}
+	}
+
+	if (ret >= 0) {
+		*off += count;
+		ret = count;
+		if (dc->state == OPENED)
+			dc->state = INIT;
+	}
+bail:
+	return ret;
+}
+
+static ssize_t qib_diag_write(struct file *fp, const char __user *data,
+			      size_t count, loff_t *off)
+{
+	struct qib_diag_client *dc = fp->private_data;
+	struct qib_devdata *dd = dc->dd;
+	void __iomem *kreg_base;
+	ssize_t ret;
+
+	if (dc->pid != current->pid) {
+		ret = -EPERM;
+		goto bail;
+	}
+
+	kreg_base = dd->kregbase;
+
+	if (count == 0)
+		ret = 0;
+	else if ((count % 4) || (*off % 4))
+		/* address or length is not 32-bit aligned, hence invalid */
+		ret = -EINVAL;
+	else if (dc->state < READY &&
+		((*off || count != 8) || dc->state != INIT))
+		/* No writes except second-step of init seq */
+		ret = -EINVAL;  /* before any other write allowed */
+	else {
+		unsigned long flags;
+		const struct diag_observer *op = NULL;
+		int use_32 =  (count % 8) || (*off % 8);
+
+		/*
+		 * Check for observer on this address range.
+		 * We only support a single 32 or 64-bit write
+		 * via observer, currently. This helps, because
+		 * we would otherwise have to jump through hoops
+		 * to make "diag transaction" meaningful when we
+		 * cannot do a copy_from_user while holding the lock.
+		 */
+		if (count == 4 || count == 8) {
+			u64 data64;
+			u32 offset = *off;
+			ret = copy_from_user(&data64, data, count);
+			if (ret) {
+				ret = -EFAULT;
+				goto bail;
+			}
+			spin_lock_irqsave(&dd->qib_diag_trans_lock, flags);
+			op = diag_get_observer(dd, *off);
+			if (op)
+				ret = op->hook(dd, op, offset, &data64, ~0Ull,
+					       use_32);
+			spin_unlock_irqrestore(&dd->qib_diag_trans_lock, flags);
+		}
+
+		if (!op) {
+			if (use_32)
+				/*
+				 * Address or length is not 64-bit aligned;
+				 * do 32-bit write
+				 */
+				ret = qib_write_umem32(dd, (u32) *off, data,
+						       count);
+			else
+				ret = qib_write_umem64(dd, (u32) *off, data,
+						       count);
+		}
+	}
+
+	if (ret >= 0) {
+		*off += count;
+		ret = count;
+		if (dc->state == INIT)
+			dc->state = READY; /* all read/write OK now */
+	}
+bail:
+	return ret;
+}
diff --git a/drivers/infiniband/hw/qib/qib_dma.c b/drivers/infiniband/hw/qib/qib_dma.c
new file mode 100644
index 0000000..59fe092
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_dma.c
@@ -0,0 +1,169 @@
+/*
+ * Copyright (c) 2006, 2009, 2010 QLogic, Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/types.h>
+#include <linux/scatterlist.h>
+
+#include "qib_verbs.h"
+
+#define BAD_DMA_ADDRESS ((u64) 0)
+
+/*
+ * The following functions implement driver specific replacements
+ * for the ib_dma_*() functions.
+ *
+ * These functions return kernel virtual addresses instead of
+ * device bus addresses since the driver uses the CPU to copy
+ * data instead of using hardware DMA.
+ */
+
+static int qib_mapping_error(struct ib_device *dev, u64 dma_addr)
+{
+	return dma_addr == BAD_DMA_ADDRESS;
+}
+
+static u64 qib_dma_map_single(struct ib_device *dev, void *cpu_addr,
+			      size_t size, enum dma_data_direction direction)
+{
+	BUG_ON(!valid_dma_direction(direction));
+	return (u64) cpu_addr;
+}
+
+static void qib_dma_unmap_single(struct ib_device *dev, u64 addr, size_t size,
+				 enum dma_data_direction direction)
+{
+	BUG_ON(!valid_dma_direction(direction));
+}
+
+static u64 qib_dma_map_page(struct ib_device *dev, struct page *page,
+			    unsigned long offset, size_t size,
+			    enum dma_data_direction direction)
+{
+	u64 addr;
+
+	BUG_ON(!valid_dma_direction(direction));
+
+	if (offset + size > PAGE_SIZE) {
+		addr = BAD_DMA_ADDRESS;
+		goto done;
+	}
+
+	addr = (u64) page_address(page);
+	if (addr)
+		addr += offset;
+	/* TODO: handle highmem pages */
+
+done:
+	return addr;
+}
+
+static void qib_dma_unmap_page(struct ib_device *dev, u64 addr, size_t size,
+			       enum dma_data_direction direction)
+{
+	BUG_ON(!valid_dma_direction(direction));
+}
+
+static int qib_map_sg(struct ib_device *dev, struct scatterlist *sgl,
+		      int nents, enum dma_data_direction direction)
+{
+	struct scatterlist *sg;
+	u64 addr;
+	int i;
+	int ret = nents;
+
+	BUG_ON(!valid_dma_direction(direction));
+
+	for_each_sg(sgl, sg, nents, i) {
+		addr = (u64) page_address(sg_page(sg));
+		/* TODO: handle highmem pages */
+		if (!addr) {
+			ret = 0;
+			break;
+		}
+		sg->dma_address = addr + sg->offset;
+#ifdef CONFIG_NEED_SG_DMA_LENGTH
+		sg->dma_length = sg->length;
+#endif
+	}
+	return ret;
+}
+
+static void qib_unmap_sg(struct ib_device *dev,
+			 struct scatterlist *sg, int nents,
+			 enum dma_data_direction direction)
+{
+	BUG_ON(!valid_dma_direction(direction));
+}
+
+static void qib_sync_single_for_cpu(struct ib_device *dev, u64 addr,
+				    size_t size, enum dma_data_direction dir)
+{
+}
+
+static void qib_sync_single_for_device(struct ib_device *dev, u64 addr,
+				       size_t size,
+				       enum dma_data_direction dir)
+{
+}
+
+static void *qib_dma_alloc_coherent(struct ib_device *dev, size_t size,
+				    u64 *dma_handle, gfp_t flag)
+{
+	struct page *p;
+	void *addr = NULL;
+
+	p = alloc_pages(flag, get_order(size));
+	if (p)
+		addr = page_address(p);
+	if (dma_handle)
+		*dma_handle = (u64) addr;
+	return addr;
+}
+
+static void qib_dma_free_coherent(struct ib_device *dev, size_t size,
+				  void *cpu_addr, u64 dma_handle)
+{
+	free_pages((unsigned long) cpu_addr, get_order(size));
+}
+
+struct ib_dma_mapping_ops qib_dma_mapping_ops = {
+	.mapping_error = qib_mapping_error,
+	.map_single = qib_dma_map_single,
+	.unmap_single = qib_dma_unmap_single,
+	.map_page = qib_dma_map_page,
+	.unmap_page = qib_dma_unmap_page,
+	.map_sg = qib_map_sg,
+	.unmap_sg = qib_unmap_sg,
+	.sync_single_for_cpu = qib_sync_single_for_cpu,
+	.sync_single_for_device = qib_sync_single_for_device,
+	.alloc_coherent = qib_dma_alloc_coherent,
+	.free_coherent = qib_dma_free_coherent
+};
diff --git a/drivers/infiniband/hw/qib/qib_driver.c b/drivers/infiniband/hw/qib/qib_driver.c
new file mode 100644
index 0000000..5bee08f
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_driver.c
@@ -0,0 +1,817 @@
+/*
+ * Copyright (c) 2013 Intel Corporation. All rights reserved.
+ * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/spinlock.h>
+#include <linux/pci.h>
+#include <linux/io.h>
+#include <linux/delay.h>
+#include <linux/netdevice.h>
+#include <linux/vmalloc.h>
+#include <linux/module.h>
+#include <linux/prefetch.h>
+
+#include "qib.h"
+
+/*
+ * The size has to be longer than this string, so we can append
+ * board/chip information to it in the init code.
+ */
+const char ib_qib_version[] = QIB_DRIVER_VERSION "\n";
+
+DEFINE_SPINLOCK(qib_devs_lock);
+LIST_HEAD(qib_dev_list);
+DEFINE_MUTEX(qib_mutex);	/* general driver use */
+
+unsigned qib_ibmtu;
+module_param_named(ibmtu, qib_ibmtu, uint, S_IRUGO);
+MODULE_PARM_DESC(ibmtu, "Set max IB MTU (0=2KB, 1=256, 2=512, ... 5=4096");
+
+unsigned qib_compat_ddr_negotiate = 1;
+module_param_named(compat_ddr_negotiate, qib_compat_ddr_negotiate, uint,
+		   S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(compat_ddr_negotiate,
+		 "Attempt pre-IBTA 1.2 DDR speed negotiation");
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_AUTHOR("Intel <ibsupport@intel.com>");
+MODULE_DESCRIPTION("Intel IB driver");
+MODULE_VERSION(QIB_DRIVER_VERSION);
+
+/*
+ * QIB_PIO_MAXIBHDR is the max IB header size allowed for in our
+ * PIO send buffers.  This is well beyond anything currently
+ * defined in the InfiniBand spec.
+ */
+#define QIB_PIO_MAXIBHDR 128
+
+/*
+ * QIB_MAX_PKT_RCV is the max # if packets processed per receive interrupt.
+ */
+#define QIB_MAX_PKT_RECV 64
+
+struct qlogic_ib_stats qib_stats;
+
+const char *qib_get_unit_name(int unit)
+{
+	static char iname[16];
+
+	snprintf(iname, sizeof iname, "infinipath%u", unit);
+	return iname;
+}
+
+/*
+ * Return count of units with at least one port ACTIVE.
+ */
+int qib_count_active_units(void)
+{
+	struct qib_devdata *dd;
+	struct qib_pportdata *ppd;
+	unsigned long flags;
+	int pidx, nunits_active = 0;
+
+	spin_lock_irqsave(&qib_devs_lock, flags);
+	list_for_each_entry(dd, &qib_dev_list, list) {
+		if (!(dd->flags & QIB_PRESENT) || !dd->kregbase)
+			continue;
+		for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+			ppd = dd->pport + pidx;
+			if (ppd->lid && (ppd->lflags & (QIBL_LINKINIT |
+					 QIBL_LINKARMED | QIBL_LINKACTIVE))) {
+				nunits_active++;
+				break;
+			}
+		}
+	}
+	spin_unlock_irqrestore(&qib_devs_lock, flags);
+	return nunits_active;
+}
+
+/*
+ * Return count of all units, optionally return in arguments
+ * the number of usable (present) units, and the number of
+ * ports that are up.
+ */
+int qib_count_units(int *npresentp, int *nupp)
+{
+	int nunits = 0, npresent = 0, nup = 0;
+	struct qib_devdata *dd;
+	unsigned long flags;
+	int pidx;
+	struct qib_pportdata *ppd;
+
+	spin_lock_irqsave(&qib_devs_lock, flags);
+
+	list_for_each_entry(dd, &qib_dev_list, list) {
+		nunits++;
+		if ((dd->flags & QIB_PRESENT) && dd->kregbase)
+			npresent++;
+		for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+			ppd = dd->pport + pidx;
+			if (ppd->lid && (ppd->lflags & (QIBL_LINKINIT |
+					 QIBL_LINKARMED | QIBL_LINKACTIVE)))
+				nup++;
+		}
+	}
+
+	spin_unlock_irqrestore(&qib_devs_lock, flags);
+
+	if (npresentp)
+		*npresentp = npresent;
+	if (nupp)
+		*nupp = nup;
+
+	return nunits;
+}
+
+/**
+ * qib_wait_linkstate - wait for an IB link state change to occur
+ * @dd: the qlogic_ib device
+ * @state: the state to wait for
+ * @msecs: the number of milliseconds to wait
+ *
+ * wait up to msecs milliseconds for IB link state change to occur for
+ * now, take the easy polling route.  Currently used only by
+ * qib_set_linkstate.  Returns 0 if state reached, otherwise
+ * -ETIMEDOUT state can have multiple states set, for any of several
+ * transitions.
+ */
+int qib_wait_linkstate(struct qib_pportdata *ppd, u32 state, int msecs)
+{
+	int ret;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ppd->lflags_lock, flags);
+	if (ppd->state_wanted) {
+		spin_unlock_irqrestore(&ppd->lflags_lock, flags);
+		ret = -EBUSY;
+		goto bail;
+	}
+	ppd->state_wanted = state;
+	spin_unlock_irqrestore(&ppd->lflags_lock, flags);
+	wait_event_interruptible_timeout(ppd->state_wait,
+					 (ppd->lflags & state),
+					 msecs_to_jiffies(msecs));
+	spin_lock_irqsave(&ppd->lflags_lock, flags);
+	ppd->state_wanted = 0;
+	spin_unlock_irqrestore(&ppd->lflags_lock, flags);
+
+	if (!(ppd->lflags & state))
+		ret = -ETIMEDOUT;
+	else
+		ret = 0;
+bail:
+	return ret;
+}
+
+int qib_set_linkstate(struct qib_pportdata *ppd, u8 newstate)
+{
+	u32 lstate;
+	int ret;
+	struct qib_devdata *dd = ppd->dd;
+	unsigned long flags;
+
+	switch (newstate) {
+	case QIB_IB_LINKDOWN_ONLY:
+		dd->f_set_ib_cfg(ppd, QIB_IB_CFG_LSTATE,
+				 IB_LINKCMD_DOWN | IB_LINKINITCMD_NOP);
+		/* don't wait */
+		ret = 0;
+		goto bail;
+
+	case QIB_IB_LINKDOWN:
+		dd->f_set_ib_cfg(ppd, QIB_IB_CFG_LSTATE,
+				 IB_LINKCMD_DOWN | IB_LINKINITCMD_POLL);
+		/* don't wait */
+		ret = 0;
+		goto bail;
+
+	case QIB_IB_LINKDOWN_SLEEP:
+		dd->f_set_ib_cfg(ppd, QIB_IB_CFG_LSTATE,
+				 IB_LINKCMD_DOWN | IB_LINKINITCMD_SLEEP);
+		/* don't wait */
+		ret = 0;
+		goto bail;
+
+	case QIB_IB_LINKDOWN_DISABLE:
+		dd->f_set_ib_cfg(ppd, QIB_IB_CFG_LSTATE,
+				 IB_LINKCMD_DOWN | IB_LINKINITCMD_DISABLE);
+		/* don't wait */
+		ret = 0;
+		goto bail;
+
+	case QIB_IB_LINKARM:
+		if (ppd->lflags & QIBL_LINKARMED) {
+			ret = 0;
+			goto bail;
+		}
+		if (!(ppd->lflags & (QIBL_LINKINIT | QIBL_LINKACTIVE))) {
+			ret = -EINVAL;
+			goto bail;
+		}
+		/*
+		 * Since the port can be ACTIVE when we ask for ARMED,
+		 * clear QIBL_LINKV so we can wait for a transition.
+		 * If the link isn't ARMED, then something else happened
+		 * and there is no point waiting for ARMED.
+		 */
+		spin_lock_irqsave(&ppd->lflags_lock, flags);
+		ppd->lflags &= ~QIBL_LINKV;
+		spin_unlock_irqrestore(&ppd->lflags_lock, flags);
+		dd->f_set_ib_cfg(ppd, QIB_IB_CFG_LSTATE,
+				 IB_LINKCMD_ARMED | IB_LINKINITCMD_NOP);
+		lstate = QIBL_LINKV;
+		break;
+
+	case QIB_IB_LINKACTIVE:
+		if (ppd->lflags & QIBL_LINKACTIVE) {
+			ret = 0;
+			goto bail;
+		}
+		if (!(ppd->lflags & QIBL_LINKARMED)) {
+			ret = -EINVAL;
+			goto bail;
+		}
+		dd->f_set_ib_cfg(ppd, QIB_IB_CFG_LSTATE,
+				 IB_LINKCMD_ACTIVE | IB_LINKINITCMD_NOP);
+		lstate = QIBL_LINKACTIVE;
+		break;
+
+	default:
+		ret = -EINVAL;
+		goto bail;
+	}
+	ret = qib_wait_linkstate(ppd, lstate, 10);
+
+bail:
+	return ret;
+}
+
+/*
+ * Get address of eager buffer from it's index (allocated in chunks, not
+ * contiguous).
+ */
+static inline void *qib_get_egrbuf(const struct qib_ctxtdata *rcd, u32 etail)
+{
+	const u32 chunk = etail >> rcd->rcvegrbufs_perchunk_shift;
+	const u32 idx =  etail & ((u32)rcd->rcvegrbufs_perchunk - 1);
+
+	return rcd->rcvegrbuf[chunk] + (idx << rcd->dd->rcvegrbufsize_shift);
+}
+
+/*
+ * Returns 1 if error was a CRC, else 0.
+ * Needed for some chip's synthesized error counters.
+ */
+static u32 qib_rcv_hdrerr(struct qib_ctxtdata *rcd, struct qib_pportdata *ppd,
+			  u32 ctxt, u32 eflags, u32 l, u32 etail,
+			  __le32 *rhf_addr, struct qib_message_header *rhdr)
+{
+	u32 ret = 0;
+
+	if (eflags & (QLOGIC_IB_RHF_H_ICRCERR | QLOGIC_IB_RHF_H_VCRCERR))
+		ret = 1;
+	else if (eflags == QLOGIC_IB_RHF_H_TIDERR) {
+		/* For TIDERR and RC QPs premptively schedule a NAK */
+		struct qib_ib_header *hdr = (struct qib_ib_header *) rhdr;
+		struct qib_other_headers *ohdr = NULL;
+		struct qib_ibport *ibp = &ppd->ibport_data;
+		struct qib_qp *qp = NULL;
+		u32 tlen = qib_hdrget_length_in_bytes(rhf_addr);
+		u16 lid  = be16_to_cpu(hdr->lrh[1]);
+		int lnh = be16_to_cpu(hdr->lrh[0]) & 3;
+		u32 qp_num;
+		u32 opcode;
+		u32 psn;
+		int diff;
+
+		/* Sanity check packet */
+		if (tlen < 24)
+			goto drop;
+
+		if (lid < QIB_MULTICAST_LID_BASE) {
+			lid &= ~((1 << ppd->lmc) - 1);
+			if (unlikely(lid != ppd->lid))
+				goto drop;
+		}
+
+		/* Check for GRH */
+		if (lnh == QIB_LRH_BTH)
+			ohdr = &hdr->u.oth;
+		else if (lnh == QIB_LRH_GRH) {
+			u32 vtf;
+
+			ohdr = &hdr->u.l.oth;
+			if (hdr->u.l.grh.next_hdr != IB_GRH_NEXT_HDR)
+				goto drop;
+			vtf = be32_to_cpu(hdr->u.l.grh.version_tclass_flow);
+			if ((vtf >> IB_GRH_VERSION_SHIFT) != IB_GRH_VERSION)
+				goto drop;
+		} else
+			goto drop;
+
+		/* Get opcode and PSN from packet */
+		opcode = be32_to_cpu(ohdr->bth[0]);
+		opcode >>= 24;
+		psn = be32_to_cpu(ohdr->bth[2]);
+
+		/* Get the destination QP number. */
+		qp_num = be32_to_cpu(ohdr->bth[1]) & QIB_QPN_MASK;
+		if (qp_num != QIB_MULTICAST_QPN) {
+			int ruc_res;
+			qp = qib_lookup_qpn(ibp, qp_num);
+			if (!qp)
+				goto drop;
+
+			/*
+			 * Handle only RC QPs - for other QP types drop error
+			 * packet.
+			 */
+			spin_lock(&qp->r_lock);
+
+			/* Check for valid receive state. */
+			if (!(ib_qib_state_ops[qp->state] &
+			      QIB_PROCESS_RECV_OK)) {
+				ibp->n_pkt_drops++;
+				goto unlock;
+			}
+
+			switch (qp->ibqp.qp_type) {
+			case IB_QPT_RC:
+				ruc_res =
+					qib_ruc_check_hdr(
+						ibp, hdr,
+						lnh == QIB_LRH_GRH,
+						qp,
+						be32_to_cpu(ohdr->bth[0]));
+				if (ruc_res)
+					goto unlock;
+
+				/* Only deal with RDMA Writes for now */
+				if (opcode <
+				    IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST) {
+					diff = qib_cmp24(psn, qp->r_psn);
+					if (!qp->r_nak_state && diff >= 0) {
+						ibp->n_rc_seqnak++;
+						qp->r_nak_state =
+							IB_NAK_PSN_ERROR;
+						/* Use the expected PSN. */
+						qp->r_ack_psn = qp->r_psn;
+						/*
+						 * Wait to send the sequence
+						 * NAK until all packets
+						 * in the receive queue have
+						 * been processed.
+						 * Otherwise, we end up
+						 * propagating congestion.
+						 */
+						if (list_empty(&qp->rspwait)) {
+							qp->r_flags |=
+								QIB_R_RSP_NAK;
+							atomic_inc(
+								&qp->refcount);
+							list_add_tail(
+							 &qp->rspwait,
+							 &rcd->qp_wait_list);
+						}
+					} /* Out of sequence NAK */
+				} /* QP Request NAKs */
+				break;
+			case IB_QPT_SMI:
+			case IB_QPT_GSI:
+			case IB_QPT_UD:
+			case IB_QPT_UC:
+			default:
+				/* For now don't handle any other QP types */
+				break;
+			}
+
+unlock:
+			spin_unlock(&qp->r_lock);
+			/*
+			 * Notify qib_destroy_qp() if it is waiting
+			 * for us to finish.
+			 */
+			if (atomic_dec_and_test(&qp->refcount))
+				wake_up(&qp->wait);
+		} /* Unicast QP */
+	} /* Valid packet with TIDErr */
+
+drop:
+	return ret;
+}
+
+/*
+ * qib_kreceive - receive a packet
+ * @rcd: the qlogic_ib context
+ * @llic: gets count of good packets needed to clear lli,
+ *          (used with chips that need need to track crcs for lli)
+ *
+ * called from interrupt handler for errors or receive interrupt
+ * Returns number of CRC error packets, needed by some chips for
+ * local link integrity tracking.   crcs are adjusted down by following
+ * good packets, if any, and count of good packets is also tracked.
+ */
+u32 qib_kreceive(struct qib_ctxtdata *rcd, u32 *llic, u32 *npkts)
+{
+	struct qib_devdata *dd = rcd->dd;
+	struct qib_pportdata *ppd = rcd->ppd;
+	__le32 *rhf_addr;
+	void *ebuf;
+	const u32 rsize = dd->rcvhdrentsize;        /* words */
+	const u32 maxcnt = dd->rcvhdrcnt * rsize;   /* words */
+	u32 etail = -1, l, hdrqtail;
+	struct qib_message_header *hdr;
+	u32 eflags, etype, tlen, i = 0, updegr = 0, crcs = 0;
+	int last;
+	u64 lval;
+	struct qib_qp *qp, *nqp;
+
+	l = rcd->head;
+	rhf_addr = (__le32 *) rcd->rcvhdrq + l + dd->rhf_offset;
+	if (dd->flags & QIB_NODMA_RTAIL) {
+		u32 seq = qib_hdrget_seq(rhf_addr);
+		if (seq != rcd->seq_cnt)
+			goto bail;
+		hdrqtail = 0;
+	} else {
+		hdrqtail = qib_get_rcvhdrtail(rcd);
+		if (l == hdrqtail)
+			goto bail;
+		smp_rmb();  /* prevent speculative reads of dma'ed hdrq */
+	}
+
+	for (last = 0, i = 1; !last; i += !last) {
+		hdr = dd->f_get_msgheader(dd, rhf_addr);
+		eflags = qib_hdrget_err_flags(rhf_addr);
+		etype = qib_hdrget_rcv_type(rhf_addr);
+		/* total length */
+		tlen = qib_hdrget_length_in_bytes(rhf_addr);
+		ebuf = NULL;
+		if ((dd->flags & QIB_NODMA_RTAIL) ?
+		    qib_hdrget_use_egr_buf(rhf_addr) :
+		    (etype != RCVHQ_RCV_TYPE_EXPECTED)) {
+			etail = qib_hdrget_index(rhf_addr);
+			updegr = 1;
+			if (tlen > sizeof(*hdr) ||
+			    etype >= RCVHQ_RCV_TYPE_NON_KD) {
+				ebuf = qib_get_egrbuf(rcd, etail);
+				prefetch_range(ebuf, tlen - sizeof(*hdr));
+			}
+		}
+		if (!eflags) {
+			u16 lrh_len = be16_to_cpu(hdr->lrh[2]) << 2;
+
+			if (lrh_len != tlen) {
+				qib_stats.sps_lenerrs++;
+				goto move_along;
+			}
+		}
+		if (etype == RCVHQ_RCV_TYPE_NON_KD && !eflags &&
+		    ebuf == NULL &&
+		    tlen > (dd->rcvhdrentsize - 2 + 1 -
+				qib_hdrget_offset(rhf_addr)) << 2) {
+			goto move_along;
+		}
+
+		/*
+		 * Both tiderr and qibhdrerr are set for all plain IB
+		 * packets; only qibhdrerr should be set.
+		 */
+		if (unlikely(eflags))
+			crcs += qib_rcv_hdrerr(rcd, ppd, rcd->ctxt, eflags, l,
+					       etail, rhf_addr, hdr);
+		else if (etype == RCVHQ_RCV_TYPE_NON_KD) {
+			qib_ib_rcv(rcd, hdr, ebuf, tlen);
+			if (crcs)
+				crcs--;
+			else if (llic && *llic)
+				--*llic;
+		}
+move_along:
+		l += rsize;
+		if (l >= maxcnt)
+			l = 0;
+		if (i == QIB_MAX_PKT_RECV)
+			last = 1;
+
+		rhf_addr = (__le32 *) rcd->rcvhdrq + l + dd->rhf_offset;
+		if (dd->flags & QIB_NODMA_RTAIL) {
+			u32 seq = qib_hdrget_seq(rhf_addr);
+
+			if (++rcd->seq_cnt > 13)
+				rcd->seq_cnt = 1;
+			if (seq != rcd->seq_cnt)
+				last = 1;
+		} else if (l == hdrqtail)
+			last = 1;
+		/*
+		 * Update head regs etc., every 16 packets, if not last pkt,
+		 * to help prevent rcvhdrq overflows, when many packets
+		 * are processed and queue is nearly full.
+		 * Don't request an interrupt for intermediate updates.
+		 */
+		lval = l;
+		if (!last && !(i & 0xf)) {
+			dd->f_update_usrhead(rcd, lval, updegr, etail, i);
+			updegr = 0;
+		}
+	}
+	/*
+	 * Notify qib_destroy_qp() if it is waiting
+	 * for lookaside_qp to finish.
+	 */
+	if (rcd->lookaside_qp) {
+		if (atomic_dec_and_test(&rcd->lookaside_qp->refcount))
+			wake_up(&rcd->lookaside_qp->wait);
+		rcd->lookaside_qp = NULL;
+	}
+
+	rcd->head = l;
+
+	/*
+	 * Iterate over all QPs waiting to respond.
+	 * The list won't change since the IRQ is only run on one CPU.
+	 */
+	list_for_each_entry_safe(qp, nqp, &rcd->qp_wait_list, rspwait) {
+		list_del_init(&qp->rspwait);
+		if (qp->r_flags & QIB_R_RSP_NAK) {
+			qp->r_flags &= ~QIB_R_RSP_NAK;
+			qib_send_rc_ack(qp);
+		}
+		if (qp->r_flags & QIB_R_RSP_SEND) {
+			unsigned long flags;
+
+			qp->r_flags &= ~QIB_R_RSP_SEND;
+			spin_lock_irqsave(&qp->s_lock, flags);
+			if (ib_qib_state_ops[qp->state] &
+					QIB_PROCESS_OR_FLUSH_SEND)
+				qib_schedule_send(qp);
+			spin_unlock_irqrestore(&qp->s_lock, flags);
+		}
+		if (atomic_dec_and_test(&qp->refcount))
+			wake_up(&qp->wait);
+	}
+
+bail:
+	/* Report number of packets consumed */
+	if (npkts)
+		*npkts = i;
+
+	/*
+	 * Always write head at end, and setup rcv interrupt, even
+	 * if no packets were processed.
+	 */
+	lval = (u64)rcd->head | dd->rhdrhead_intr_off;
+	dd->f_update_usrhead(rcd, lval, updegr, etail, i);
+	return crcs;
+}
+
+/**
+ * qib_set_mtu - set the MTU
+ * @ppd: the perport data
+ * @arg: the new MTU
+ *
+ * We can handle "any" incoming size, the issue here is whether we
+ * need to restrict our outgoing size.   For now, we don't do any
+ * sanity checking on this, and we don't deal with what happens to
+ * programs that are already running when the size changes.
+ * NOTE: changing the MTU will usually cause the IBC to go back to
+ * link INIT state...
+ */
+int qib_set_mtu(struct qib_pportdata *ppd, u16 arg)
+{
+	u32 piosize;
+	int ret, chk;
+
+	if (arg != 256 && arg != 512 && arg != 1024 && arg != 2048 &&
+	    arg != 4096) {
+		ret = -EINVAL;
+		goto bail;
+	}
+	chk = ib_mtu_enum_to_int(qib_ibmtu);
+	if (chk > 0 && arg > chk) {
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	piosize = ppd->ibmaxlen;
+	ppd->ibmtu = arg;
+
+	if (arg >= (piosize - QIB_PIO_MAXIBHDR)) {
+		/* Only if it's not the initial value (or reset to it) */
+		if (piosize != ppd->init_ibmaxlen) {
+			if (arg > piosize && arg <= ppd->init_ibmaxlen)
+				piosize = ppd->init_ibmaxlen - 2 * sizeof(u32);
+			ppd->ibmaxlen = piosize;
+		}
+	} else if ((arg + QIB_PIO_MAXIBHDR) != ppd->ibmaxlen) {
+		piosize = arg + QIB_PIO_MAXIBHDR - 2 * sizeof(u32);
+		ppd->ibmaxlen = piosize;
+	}
+
+	ppd->dd->f_set_ib_cfg(ppd, QIB_IB_CFG_MTU, 0);
+
+	ret = 0;
+
+bail:
+	return ret;
+}
+
+int qib_set_lid(struct qib_pportdata *ppd, u32 lid, u8 lmc)
+{
+	struct qib_devdata *dd = ppd->dd;
+	ppd->lid = lid;
+	ppd->lmc = lmc;
+
+	dd->f_set_ib_cfg(ppd, QIB_IB_CFG_LIDLMC,
+			 lid | (~((1U << lmc) - 1)) << 16);
+
+	qib_devinfo(dd->pcidev, "IB%u:%u got a lid: 0x%x\n",
+		    dd->unit, ppd->port, lid);
+
+	return 0;
+}
+
+/*
+ * Following deal with the "obviously simple" task of overriding the state
+ * of the LEDS, which normally indicate link physical and logical status.
+ * The complications arise in dealing with different hardware mappings
+ * and the board-dependent routine being called from interrupts.
+ * and then there's the requirement to _flash_ them.
+ */
+#define LED_OVER_FREQ_SHIFT 8
+#define LED_OVER_FREQ_MASK (0xFF<<LED_OVER_FREQ_SHIFT)
+/* Below is "non-zero" to force override, but both actual LEDs are off */
+#define LED_OVER_BOTH_OFF (8)
+
+static void qib_run_led_override(unsigned long opaque)
+{
+	struct qib_pportdata *ppd = (struct qib_pportdata *)opaque;
+	struct qib_devdata *dd = ppd->dd;
+	int timeoff;
+	int ph_idx;
+
+	if (!(dd->flags & QIB_INITTED))
+		return;
+
+	ph_idx = ppd->led_override_phase++ & 1;
+	ppd->led_override = ppd->led_override_vals[ph_idx];
+	timeoff = ppd->led_override_timeoff;
+
+	dd->f_setextled(ppd, 1);
+	/*
+	 * don't re-fire the timer if user asked for it to be off; we let
+	 * it fire one more time after they turn it off to simplify
+	 */
+	if (ppd->led_override_vals[0] || ppd->led_override_vals[1])
+		mod_timer(&ppd->led_override_timer, jiffies + timeoff);
+}
+
+void qib_set_led_override(struct qib_pportdata *ppd, unsigned int val)
+{
+	struct qib_devdata *dd = ppd->dd;
+	int timeoff, freq;
+
+	if (!(dd->flags & QIB_INITTED))
+		return;
+
+	/* First check if we are blinking. If not, use 1HZ polling */
+	timeoff = HZ;
+	freq = (val & LED_OVER_FREQ_MASK) >> LED_OVER_FREQ_SHIFT;
+
+	if (freq) {
+		/* For blink, set each phase from one nybble of val */
+		ppd->led_override_vals[0] = val & 0xF;
+		ppd->led_override_vals[1] = (val >> 4) & 0xF;
+		timeoff = (HZ << 4)/freq;
+	} else {
+		/* Non-blink set both phases the same. */
+		ppd->led_override_vals[0] = val & 0xF;
+		ppd->led_override_vals[1] = val & 0xF;
+	}
+	ppd->led_override_timeoff = timeoff;
+
+	/*
+	 * If the timer has not already been started, do so. Use a "quick"
+	 * timeout so the function will be called soon, to look at our request.
+	 */
+	if (atomic_inc_return(&ppd->led_override_timer_active) == 1) {
+		/* Need to start timer */
+		init_timer(&ppd->led_override_timer);
+		ppd->led_override_timer.function = qib_run_led_override;
+		ppd->led_override_timer.data = (unsigned long) ppd;
+		ppd->led_override_timer.expires = jiffies + 1;
+		add_timer(&ppd->led_override_timer);
+	} else {
+		if (ppd->led_override_vals[0] || ppd->led_override_vals[1])
+			mod_timer(&ppd->led_override_timer, jiffies + 1);
+		atomic_dec(&ppd->led_override_timer_active);
+	}
+}
+
+/**
+ * qib_reset_device - reset the chip if possible
+ * @unit: the device to reset
+ *
+ * Whether or not reset is successful, we attempt to re-initialize the chip
+ * (that is, much like a driver unload/reload).  We clear the INITTED flag
+ * so that the various entry points will fail until we reinitialize.  For
+ * now, we only allow this if no user contexts are open that use chip resources
+ */
+int qib_reset_device(int unit)
+{
+	int ret, i;
+	struct qib_devdata *dd = qib_lookup(unit);
+	struct qib_pportdata *ppd;
+	unsigned long flags;
+	int pidx;
+
+	if (!dd) {
+		ret = -ENODEV;
+		goto bail;
+	}
+
+	qib_devinfo(dd->pcidev, "Reset on unit %u requested\n", unit);
+
+	if (!dd->kregbase || !(dd->flags & QIB_PRESENT)) {
+		qib_devinfo(dd->pcidev,
+			"Invalid unit number %u or not initialized or not present\n",
+			unit);
+		ret = -ENXIO;
+		goto bail;
+	}
+
+	spin_lock_irqsave(&dd->uctxt_lock, flags);
+	if (dd->rcd)
+		for (i = dd->first_user_ctxt; i < dd->cfgctxts; i++) {
+			if (!dd->rcd[i] || !dd->rcd[i]->cnt)
+				continue;
+			spin_unlock_irqrestore(&dd->uctxt_lock, flags);
+			ret = -EBUSY;
+			goto bail;
+		}
+	spin_unlock_irqrestore(&dd->uctxt_lock, flags);
+
+	for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+		ppd = dd->pport + pidx;
+		if (atomic_read(&ppd->led_override_timer_active)) {
+			/* Need to stop LED timer, _then_ shut off LEDs */
+			del_timer_sync(&ppd->led_override_timer);
+			atomic_set(&ppd->led_override_timer_active, 0);
+		}
+
+		/* Shut off LEDs after we are sure timer is not running */
+		ppd->led_override = LED_OVER_BOTH_OFF;
+		dd->f_setextled(ppd, 0);
+		if (dd->flags & QIB_HAS_SEND_DMA)
+			qib_teardown_sdma(ppd);
+	}
+
+	ret = dd->f_reset(dd);
+	if (ret == 1)
+		ret = qib_init(dd, 1);
+	else
+		ret = -EAGAIN;
+	if (ret)
+		qib_dev_err(dd,
+			"Reinitialize unit %u after reset failed with %d\n",
+			unit, ret);
+	else
+		qib_devinfo(dd->pcidev,
+			"Reinitialized unit %u after resetting\n",
+			unit);
+
+bail:
+	return ret;
+}
diff --git a/drivers/infiniband/hw/qib/qib_eeprom.c b/drivers/infiniband/hw/qib/qib_eeprom.c
new file mode 100644
index 0000000..4d5d71a
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_eeprom.c
@@ -0,0 +1,456 @@
+/*
+ * Copyright (c) 2012 Intel Corporation. All rights reserved.
+ * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/delay.h>
+#include <linux/pci.h>
+#include <linux/vmalloc.h>
+
+#include "qib.h"
+
+/*
+ * Functions specific to the serial EEPROM on cards handled by ib_qib.
+ * The actual serail interface code is in qib_twsi.c. This file is a client
+ */
+
+/**
+ * qib_eeprom_read - receives bytes from the eeprom via I2C
+ * @dd: the qlogic_ib device
+ * @eeprom_offset: address to read from
+ * @buffer: where to store result
+ * @len: number of bytes to receive
+ */
+int qib_eeprom_read(struct qib_devdata *dd, u8 eeprom_offset,
+		    void *buff, int len)
+{
+	int ret;
+
+	ret = mutex_lock_interruptible(&dd->eep_lock);
+	if (!ret) {
+		ret = qib_twsi_reset(dd);
+		if (ret)
+			qib_dev_err(dd, "EEPROM Reset for read failed\n");
+		else
+			ret = qib_twsi_blk_rd(dd, dd->twsi_eeprom_dev,
+					      eeprom_offset, buff, len);
+		mutex_unlock(&dd->eep_lock);
+	}
+
+	return ret;
+}
+
+/*
+ * Actually update the eeprom, first doing write enable if
+ * needed, then restoring write enable state.
+ * Must be called with eep_lock held
+ */
+static int eeprom_write_with_enable(struct qib_devdata *dd, u8 offset,
+		     const void *buf, int len)
+{
+	int ret, pwen;
+
+	pwen = dd->f_eeprom_wen(dd, 1);
+	ret = qib_twsi_reset(dd);
+	if (ret)
+		qib_dev_err(dd, "EEPROM Reset for write failed\n");
+	else
+		ret = qib_twsi_blk_wr(dd, dd->twsi_eeprom_dev,
+				      offset, buf, len);
+	dd->f_eeprom_wen(dd, pwen);
+	return ret;
+}
+
+/**
+ * qib_eeprom_write - writes data to the eeprom via I2C
+ * @dd: the qlogic_ib device
+ * @eeprom_offset: where to place data
+ * @buffer: data to write
+ * @len: number of bytes to write
+ */
+int qib_eeprom_write(struct qib_devdata *dd, u8 eeprom_offset,
+		     const void *buff, int len)
+{
+	int ret;
+
+	ret = mutex_lock_interruptible(&dd->eep_lock);
+	if (!ret) {
+		ret = eeprom_write_with_enable(dd, eeprom_offset, buff, len);
+		mutex_unlock(&dd->eep_lock);
+	}
+
+	return ret;
+}
+
+static u8 flash_csum(struct qib_flash *ifp, int adjust)
+{
+	u8 *ip = (u8 *) ifp;
+	u8 csum = 0, len;
+
+	/*
+	 * Limit length checksummed to max length of actual data.
+	 * Checksum of erased eeprom will still be bad, but we avoid
+	 * reading past the end of the buffer we were passed.
+	 */
+	len = ifp->if_length;
+	if (len > sizeof(struct qib_flash))
+		len = sizeof(struct qib_flash);
+	while (len--)
+		csum += *ip++;
+	csum -= ifp->if_csum;
+	csum = ~csum;
+	if (adjust)
+		ifp->if_csum = csum;
+
+	return csum;
+}
+
+/**
+ * qib_get_eeprom_info- get the GUID et al. from the TSWI EEPROM device
+ * @dd: the qlogic_ib device
+ *
+ * We have the capability to use the nguid field, and get
+ * the guid from the first chip's flash, to use for all of them.
+ */
+void qib_get_eeprom_info(struct qib_devdata *dd)
+{
+	void *buf;
+	struct qib_flash *ifp;
+	__be64 guid;
+	int len, eep_stat;
+	u8 csum, *bguid;
+	int t = dd->unit;
+	struct qib_devdata *dd0 = qib_lookup(0);
+
+	if (t && dd0->nguid > 1 && t <= dd0->nguid) {
+		u8 oguid;
+		dd->base_guid = dd0->base_guid;
+		bguid = (u8 *) &dd->base_guid;
+
+		oguid = bguid[7];
+		bguid[7] += t;
+		if (oguid > bguid[7]) {
+			if (bguid[6] == 0xff) {
+				if (bguid[5] == 0xff) {
+					qib_dev_err(dd,
+						"Can't set %s GUID from base, wraps to OUI!\n",
+						qib_get_unit_name(t));
+					dd->base_guid = 0;
+					goto bail;
+				}
+				bguid[5]++;
+			}
+			bguid[6]++;
+		}
+		dd->nguid = 1;
+		goto bail;
+	}
+
+	/*
+	 * Read full flash, not just currently used part, since it may have
+	 * been written with a newer definition.
+	 * */
+	len = sizeof(struct qib_flash);
+	buf = vmalloc(len);
+	if (!buf) {
+		qib_dev_err(dd,
+			"Couldn't allocate memory to read %u bytes from eeprom for GUID\n",
+			len);
+		goto bail;
+	}
+
+	/*
+	 * Use "public" eeprom read function, which does locking and
+	 * figures out device. This will migrate to chip-specific.
+	 */
+	eep_stat = qib_eeprom_read(dd, 0, buf, len);
+
+	if (eep_stat) {
+		qib_dev_err(dd, "Failed reading GUID from eeprom\n");
+		goto done;
+	}
+	ifp = (struct qib_flash *)buf;
+
+	csum = flash_csum(ifp, 0);
+	if (csum != ifp->if_csum) {
+		qib_devinfo(dd->pcidev,
+			"Bad I2C flash checksum: 0x%x, not 0x%x\n",
+			csum, ifp->if_csum);
+		goto done;
+	}
+	if (*(__be64 *) ifp->if_guid == cpu_to_be64(0) ||
+	    *(__be64 *) ifp->if_guid == ~cpu_to_be64(0)) {
+		qib_dev_err(dd,
+			"Invalid GUID %llx from flash; ignoring\n",
+			*(unsigned long long *) ifp->if_guid);
+		/* don't allow GUID if all 0 or all 1's */
+		goto done;
+	}
+
+	/* complain, but allow it */
+	if (*(u64 *) ifp->if_guid == 0x100007511000000ULL)
+		qib_devinfo(dd->pcidev,
+			"Warning, GUID %llx is default, probably not correct!\n",
+			*(unsigned long long *) ifp->if_guid);
+
+	bguid = ifp->if_guid;
+	if (!bguid[0] && !bguid[1] && !bguid[2]) {
+		/*
+		 * Original incorrect GUID format in flash; fix in
+		 * core copy, by shifting up 2 octets; don't need to
+		 * change top octet, since both it and shifted are 0.
+		 */
+		bguid[1] = bguid[3];
+		bguid[2] = bguid[4];
+		bguid[3] = 0;
+		bguid[4] = 0;
+		guid = *(__be64 *) ifp->if_guid;
+	} else
+		guid = *(__be64 *) ifp->if_guid;
+	dd->base_guid = guid;
+	dd->nguid = ifp->if_numguid;
+	/*
+	 * Things are slightly complicated by the desire to transparently
+	 * support both the Pathscale 10-digit serial number and the QLogic
+	 * 13-character version.
+	 */
+	if ((ifp->if_fversion > 1) && ifp->if_sprefix[0] &&
+	    ((u8 *) ifp->if_sprefix)[0] != 0xFF) {
+		char *snp = dd->serial;
+
+		/*
+		 * This board has a Serial-prefix, which is stored
+		 * elsewhere for backward-compatibility.
+		 */
+		memcpy(snp, ifp->if_sprefix, sizeof ifp->if_sprefix);
+		snp[sizeof ifp->if_sprefix] = '\0';
+		len = strlen(snp);
+		snp += len;
+		len = (sizeof dd->serial) - len;
+		if (len > sizeof ifp->if_serial)
+			len = sizeof ifp->if_serial;
+		memcpy(snp, ifp->if_serial, len);
+	} else
+		memcpy(dd->serial, ifp->if_serial,
+		       sizeof ifp->if_serial);
+	if (!strstr(ifp->if_comment, "Tested successfully"))
+		qib_dev_err(dd,
+			"Board SN %s did not pass functional test: %s\n",
+			dd->serial, ifp->if_comment);
+
+	memcpy(&dd->eep_st_errs, &ifp->if_errcntp, QIB_EEP_LOG_CNT);
+	/*
+	 * Power-on (actually "active") hours are kept as little-endian value
+	 * in EEPROM, but as seconds in a (possibly as small as 24-bit)
+	 * atomic_t while running.
+	 */
+	atomic_set(&dd->active_time, 0);
+	dd->eep_hrs = ifp->if_powerhour[0] | (ifp->if_powerhour[1] << 8);
+
+done:
+	vfree(buf);
+
+bail:;
+}
+
+/**
+ * qib_update_eeprom_log - copy active-time and error counters to eeprom
+ * @dd: the qlogic_ib device
+ *
+ * Although the time is kept as seconds in the qib_devdata struct, it is
+ * rounded to hours for re-write, as we have only 16 bits in EEPROM.
+ * First-cut code reads whole (expected) struct qib_flash, modifies,
+ * re-writes. Future direction: read/write only what we need, assuming
+ * that the EEPROM had to have been "good enough" for driver init, and
+ * if not, we aren't making it worse.
+ *
+ */
+int qib_update_eeprom_log(struct qib_devdata *dd)
+{
+	void *buf;
+	struct qib_flash *ifp;
+	int len, hi_water;
+	uint32_t new_time, new_hrs;
+	u8 csum;
+	int ret, idx;
+	unsigned long flags;
+
+	/* first, check if we actually need to do anything. */
+	ret = 0;
+	for (idx = 0; idx < QIB_EEP_LOG_CNT; ++idx) {
+		if (dd->eep_st_new_errs[idx]) {
+			ret = 1;
+			break;
+		}
+	}
+	new_time = atomic_read(&dd->active_time);
+
+	if (ret == 0 && new_time < 3600)
+		goto bail;
+
+	/*
+	 * The quick-check above determined that there is something worthy
+	 * of logging, so get current contents and do a more detailed idea.
+	 * read full flash, not just currently used part, since it may have
+	 * been written with a newer definition
+	 */
+	len = sizeof(struct qib_flash);
+	buf = vmalloc(len);
+	ret = 1;
+	if (!buf) {
+		qib_dev_err(dd,
+			"Couldn't allocate memory to read %u bytes from eeprom for logging\n",
+			len);
+		goto bail;
+	}
+
+	/* Grab semaphore and read current EEPROM. If we get an
+	 * error, let go, but if not, keep it until we finish write.
+	 */
+	ret = mutex_lock_interruptible(&dd->eep_lock);
+	if (ret) {
+		qib_dev_err(dd, "Unable to acquire EEPROM for logging\n");
+		goto free_bail;
+	}
+	ret = qib_twsi_blk_rd(dd, dd->twsi_eeprom_dev, 0, buf, len);
+	if (ret) {
+		mutex_unlock(&dd->eep_lock);
+		qib_dev_err(dd, "Unable read EEPROM for logging\n");
+		goto free_bail;
+	}
+	ifp = (struct qib_flash *)buf;
+
+	csum = flash_csum(ifp, 0);
+	if (csum != ifp->if_csum) {
+		mutex_unlock(&dd->eep_lock);
+		qib_dev_err(dd, "EEPROM cks err (0x%02X, S/B 0x%02X)\n",
+			    csum, ifp->if_csum);
+		ret = 1;
+		goto free_bail;
+	}
+	hi_water = 0;
+	spin_lock_irqsave(&dd->eep_st_lock, flags);
+	for (idx = 0; idx < QIB_EEP_LOG_CNT; ++idx) {
+		int new_val = dd->eep_st_new_errs[idx];
+		if (new_val) {
+			/*
+			 * If we have seen any errors, add to EEPROM values
+			 * We need to saturate at 0xFF (255) and we also
+			 * would need to adjust the checksum if we were
+			 * trying to minimize EEPROM traffic
+			 * Note that we add to actual current count in EEPROM,
+			 * in case it was altered while we were running.
+			 */
+			new_val += ifp->if_errcntp[idx];
+			if (new_val > 0xFF)
+				new_val = 0xFF;
+			if (ifp->if_errcntp[idx] != new_val) {
+				ifp->if_errcntp[idx] = new_val;
+				hi_water = offsetof(struct qib_flash,
+						    if_errcntp) + idx;
+			}
+			/*
+			 * update our shadow (used to minimize EEPROM
+			 * traffic), to match what we are about to write.
+			 */
+			dd->eep_st_errs[idx] = new_val;
+			dd->eep_st_new_errs[idx] = 0;
+		}
+	}
+	/*
+	 * Now update active-time. We would like to round to the nearest hour
+	 * but unless atomic_t are sure to be proper signed ints we cannot,
+	 * because we need to account for what we "transfer" to EEPROM and
+	 * if we log an hour at 31 minutes, then we would need to set
+	 * active_time to -29 to accurately count the _next_ hour.
+	 */
+	if (new_time >= 3600) {
+		new_hrs = new_time / 3600;
+		atomic_sub((new_hrs * 3600), &dd->active_time);
+		new_hrs += dd->eep_hrs;
+		if (new_hrs > 0xFFFF)
+			new_hrs = 0xFFFF;
+		dd->eep_hrs = new_hrs;
+		if ((new_hrs & 0xFF) != ifp->if_powerhour[0]) {
+			ifp->if_powerhour[0] = new_hrs & 0xFF;
+			hi_water = offsetof(struct qib_flash, if_powerhour);
+		}
+		if ((new_hrs >> 8) != ifp->if_powerhour[1]) {
+			ifp->if_powerhour[1] = new_hrs >> 8;
+			hi_water = offsetof(struct qib_flash, if_powerhour) + 1;
+		}
+	}
+	/*
+	 * There is a tiny possibility that we could somehow fail to write
+	 * the EEPROM after updating our shadows, but problems from holding
+	 * the spinlock too long are a much bigger issue.
+	 */
+	spin_unlock_irqrestore(&dd->eep_st_lock, flags);
+	if (hi_water) {
+		/* we made some change to the data, uopdate cksum and write */
+		csum = flash_csum(ifp, 1);
+		ret = eeprom_write_with_enable(dd, 0, buf, hi_water + 1);
+	}
+	mutex_unlock(&dd->eep_lock);
+	if (ret)
+		qib_dev_err(dd, "Failed updating EEPROM\n");
+
+free_bail:
+	vfree(buf);
+bail:
+	return ret;
+}
+
+/**
+ * qib_inc_eeprom_err - increment one of the four error counters
+ * that are logged to EEPROM.
+ * @dd: the qlogic_ib device
+ * @eidx: 0..3, the counter to increment
+ * @incr: how much to add
+ *
+ * Each counter is 8-bits, and saturates at 255 (0xFF). They
+ * are copied to the EEPROM (aka flash) whenever qib_update_eeprom_log()
+ * is called, but it can only be called in a context that allows sleep.
+ * This function can be called even at interrupt level.
+ */
+void qib_inc_eeprom_err(struct qib_devdata *dd, u32 eidx, u32 incr)
+{
+	uint new_val;
+	unsigned long flags;
+
+	spin_lock_irqsave(&dd->eep_st_lock, flags);
+	new_val = dd->eep_st_new_errs[eidx] + incr;
+	if (new_val > 255)
+		new_val = 255;
+	dd->eep_st_new_errs[eidx] = new_val;
+	spin_unlock_irqrestore(&dd->eep_st_lock, flags);
+}
diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c
new file mode 100644
index 0000000..b15e34e
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_file_ops.c
@@ -0,0 +1,2410 @@
+/*
+ * Copyright (c) 2012, 2013 Intel Corporation. All rights reserved.
+ * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/pci.h>
+#include <linux/poll.h>
+#include <linux/cdev.h>
+#include <linux/swap.h>
+#include <linux/vmalloc.h>
+#include <linux/highmem.h>
+#include <linux/io.h>
+#include <linux/aio.h>
+#include <linux/jiffies.h>
+#include <asm/pgtable.h>
+#include <linux/delay.h>
+#include <linux/export.h>
+
+#include "qib.h"
+#include "qib_common.h"
+#include "qib_user_sdma.h"
+
+#undef pr_fmt
+#define pr_fmt(fmt) QIB_DRV_NAME ": " fmt
+
+static int qib_open(struct inode *, struct file *);
+static int qib_close(struct inode *, struct file *);
+static ssize_t qib_write(struct file *, const char __user *, size_t, loff_t *);
+static ssize_t qib_aio_write(struct kiocb *, const struct iovec *,
+			     unsigned long, loff_t);
+static unsigned int qib_poll(struct file *, struct poll_table_struct *);
+static int qib_mmapf(struct file *, struct vm_area_struct *);
+
+static const struct file_operations qib_file_ops = {
+	.owner = THIS_MODULE,
+	.write = qib_write,
+	.aio_write = qib_aio_write,
+	.open = qib_open,
+	.release = qib_close,
+	.poll = qib_poll,
+	.mmap = qib_mmapf,
+	.llseek = noop_llseek,
+};
+
+/*
+ * Convert kernel virtual addresses to physical addresses so they don't
+ * potentially conflict with the chip addresses used as mmap offsets.
+ * It doesn't really matter what mmap offset we use as long as we can
+ * interpret it correctly.
+ */
+static u64 cvt_kvaddr(void *p)
+{
+	struct page *page;
+	u64 paddr = 0;
+
+	page = vmalloc_to_page(p);
+	if (page)
+		paddr = page_to_pfn(page) << PAGE_SHIFT;
+
+	return paddr;
+}
+
+static int qib_get_base_info(struct file *fp, void __user *ubase,
+			     size_t ubase_size)
+{
+	struct qib_ctxtdata *rcd = ctxt_fp(fp);
+	int ret = 0;
+	struct qib_base_info *kinfo = NULL;
+	struct qib_devdata *dd = rcd->dd;
+	struct qib_pportdata *ppd = rcd->ppd;
+	unsigned subctxt_cnt;
+	int shared, master;
+	size_t sz;
+
+	subctxt_cnt = rcd->subctxt_cnt;
+	if (!subctxt_cnt) {
+		shared = 0;
+		master = 0;
+		subctxt_cnt = 1;
+	} else {
+		shared = 1;
+		master = !subctxt_fp(fp);
+	}
+
+	sz = sizeof(*kinfo);
+	/* If context sharing is not requested, allow the old size structure */
+	if (!shared)
+		sz -= 7 * sizeof(u64);
+	if (ubase_size < sz) {
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	kinfo = kzalloc(sizeof(*kinfo), GFP_KERNEL);
+	if (kinfo == NULL) {
+		ret = -ENOMEM;
+		goto bail;
+	}
+
+	ret = dd->f_get_base_info(rcd, kinfo);
+	if (ret < 0)
+		goto bail;
+
+	kinfo->spi_rcvhdr_cnt = dd->rcvhdrcnt;
+	kinfo->spi_rcvhdrent_size = dd->rcvhdrentsize;
+	kinfo->spi_tidegrcnt = rcd->rcvegrcnt;
+	kinfo->spi_rcv_egrbufsize = dd->rcvegrbufsize;
+	/*
+	 * have to mmap whole thing
+	 */
+	kinfo->spi_rcv_egrbuftotlen =
+		rcd->rcvegrbuf_chunks * rcd->rcvegrbuf_size;
+	kinfo->spi_rcv_egrperchunk = rcd->rcvegrbufs_perchunk;
+	kinfo->spi_rcv_egrchunksize = kinfo->spi_rcv_egrbuftotlen /
+		rcd->rcvegrbuf_chunks;
+	kinfo->spi_tidcnt = dd->rcvtidcnt / subctxt_cnt;
+	if (master)
+		kinfo->spi_tidcnt += dd->rcvtidcnt % subctxt_cnt;
+	/*
+	 * for this use, may be cfgctxts summed over all chips that
+	 * are are configured and present
+	 */
+	kinfo->spi_nctxts = dd->cfgctxts;
+	/* unit (chip/board) our context is on */
+	kinfo->spi_unit = dd->unit;
+	kinfo->spi_port = ppd->port;
+	/* for now, only a single page */
+	kinfo->spi_tid_maxsize = PAGE_SIZE;
+
+	/*
+	 * Doing this per context, and based on the skip value, etc.  This has
+	 * to be the actual buffer size, since the protocol code treats it
+	 * as an array.
+	 *
+	 * These have to be set to user addresses in the user code via mmap.
+	 * These values are used on return to user code for the mmap target
+	 * addresses only.  For 32 bit, same 44 bit address problem, so use
+	 * the physical address, not virtual.  Before 2.6.11, using the
+	 * page_address() macro worked, but in 2.6.11, even that returns the
+	 * full 64 bit address (upper bits all 1's).  So far, using the
+	 * physical addresses (or chip offsets, for chip mapping) works, but
+	 * no doubt some future kernel release will change that, and we'll be
+	 * on to yet another method of dealing with this.
+	 * Normally only one of rcvhdr_tailaddr or rhf_offset is useful
+	 * since the chips with non-zero rhf_offset don't normally
+	 * enable tail register updates to host memory, but for testing,
+	 * both can be enabled and used.
+	 */
+	kinfo->spi_rcvhdr_base = (u64) rcd->rcvhdrq_phys;
+	kinfo->spi_rcvhdr_tailaddr = (u64) rcd->rcvhdrqtailaddr_phys;
+	kinfo->spi_rhf_offset = dd->rhf_offset;
+	kinfo->spi_rcv_egrbufs = (u64) rcd->rcvegr_phys;
+	kinfo->spi_pioavailaddr = (u64) dd->pioavailregs_phys;
+	/* setup per-unit (not port) status area for user programs */
+	kinfo->spi_status = (u64) kinfo->spi_pioavailaddr +
+		(char *) ppd->statusp -
+		(char *) dd->pioavailregs_dma;
+	kinfo->spi_uregbase = (u64) dd->uregbase + dd->ureg_align * rcd->ctxt;
+	if (!shared) {
+		kinfo->spi_piocnt = rcd->piocnt;
+		kinfo->spi_piobufbase = (u64) rcd->piobufs;
+		kinfo->spi_sendbuf_status = cvt_kvaddr(rcd->user_event_mask);
+	} else if (master) {
+		kinfo->spi_piocnt = (rcd->piocnt / subctxt_cnt) +
+				    (rcd->piocnt % subctxt_cnt);
+		/* Master's PIO buffers are after all the slave's */
+		kinfo->spi_piobufbase = (u64) rcd->piobufs +
+			dd->palign *
+			(rcd->piocnt - kinfo->spi_piocnt);
+	} else {
+		unsigned slave = subctxt_fp(fp) - 1;
+
+		kinfo->spi_piocnt = rcd->piocnt / subctxt_cnt;
+		kinfo->spi_piobufbase = (u64) rcd->piobufs +
+			dd->palign * kinfo->spi_piocnt * slave;
+	}
+
+	if (shared) {
+		kinfo->spi_sendbuf_status =
+			cvt_kvaddr(&rcd->user_event_mask[subctxt_fp(fp)]);
+		/* only spi_subctxt_* fields should be set in this block! */
+		kinfo->spi_subctxt_uregbase = cvt_kvaddr(rcd->subctxt_uregbase);
+
+		kinfo->spi_subctxt_rcvegrbuf =
+			cvt_kvaddr(rcd->subctxt_rcvegrbuf);
+		kinfo->spi_subctxt_rcvhdr_base =
+			cvt_kvaddr(rcd->subctxt_rcvhdr_base);
+	}
+
+	/*
+	 * All user buffers are 2KB buffers.  If we ever support
+	 * giving 4KB buffers to user processes, this will need some
+	 * work.  Can't use piobufbase directly, because it has
+	 * both 2K and 4K buffer base values.
+	 */
+	kinfo->spi_pioindex = (kinfo->spi_piobufbase - dd->pio2k_bufbase) /
+		dd->palign;
+	kinfo->spi_pioalign = dd->palign;
+	kinfo->spi_qpair = QIB_KD_QP;
+	/*
+	 * user mode PIO buffers are always 2KB, even when 4KB can
+	 * be received, and sent via the kernel; this is ibmaxlen
+	 * for 2K MTU.
+	 */
+	kinfo->spi_piosize = dd->piosize2k - 2 * sizeof(u32);
+	kinfo->spi_mtu = ppd->ibmaxlen; /* maxlen, not ibmtu */
+	kinfo->spi_ctxt = rcd->ctxt;
+	kinfo->spi_subctxt = subctxt_fp(fp);
+	kinfo->spi_sw_version = QIB_KERN_SWVERSION;
+	kinfo->spi_sw_version |= 1U << 31; /* QLogic-built, not kernel.org */
+	kinfo->spi_hw_version = dd->revision;
+
+	if (master)
+		kinfo->spi_runtime_flags |= QIB_RUNTIME_MASTER;
+
+	sz = (ubase_size < sizeof(*kinfo)) ? ubase_size : sizeof(*kinfo);
+	if (copy_to_user(ubase, kinfo, sz))
+		ret = -EFAULT;
+bail:
+	kfree(kinfo);
+	return ret;
+}
+
+/**
+ * qib_tid_update - update a context TID
+ * @rcd: the context
+ * @fp: the qib device file
+ * @ti: the TID information
+ *
+ * The new implementation as of Oct 2004 is that the driver assigns
+ * the tid and returns it to the caller.   To reduce search time, we
+ * keep a cursor for each context, walking the shadow tid array to find
+ * one that's not in use.
+ *
+ * For now, if we can't allocate the full list, we fail, although
+ * in the long run, we'll allocate as many as we can, and the
+ * caller will deal with that by trying the remaining pages later.
+ * That means that when we fail, we have to mark the tids as not in
+ * use again, in our shadow copy.
+ *
+ * It's up to the caller to free the tids when they are done.
+ * We'll unlock the pages as they free them.
+ *
+ * Also, right now we are locking one page at a time, but since
+ * the intended use of this routine is for a single group of
+ * virtually contiguous pages, that should change to improve
+ * performance.
+ */
+static int qib_tid_update(struct qib_ctxtdata *rcd, struct file *fp,
+			  const struct qib_tid_info *ti)
+{
+	int ret = 0, ntids;
+	u32 tid, ctxttid, cnt, i, tidcnt, tidoff;
+	u16 *tidlist;
+	struct qib_devdata *dd = rcd->dd;
+	u64 physaddr;
+	unsigned long vaddr;
+	u64 __iomem *tidbase;
+	unsigned long tidmap[8];
+	struct page **pagep = NULL;
+	unsigned subctxt = subctxt_fp(fp);
+
+	if (!dd->pageshadow) {
+		ret = -ENOMEM;
+		goto done;
+	}
+
+	cnt = ti->tidcnt;
+	if (!cnt) {
+		ret = -EFAULT;
+		goto done;
+	}
+	ctxttid = rcd->ctxt * dd->rcvtidcnt;
+	if (!rcd->subctxt_cnt) {
+		tidcnt = dd->rcvtidcnt;
+		tid = rcd->tidcursor;
+		tidoff = 0;
+	} else if (!subctxt) {
+		tidcnt = (dd->rcvtidcnt / rcd->subctxt_cnt) +
+			 (dd->rcvtidcnt % rcd->subctxt_cnt);
+		tidoff = dd->rcvtidcnt - tidcnt;
+		ctxttid += tidoff;
+		tid = tidcursor_fp(fp);
+	} else {
+		tidcnt = dd->rcvtidcnt / rcd->subctxt_cnt;
+		tidoff = tidcnt * (subctxt - 1);
+		ctxttid += tidoff;
+		tid = tidcursor_fp(fp);
+	}
+	if (cnt > tidcnt) {
+		/* make sure it all fits in tid_pg_list */
+		qib_devinfo(dd->pcidev,
+			"Process tried to allocate %u TIDs, only trying max (%u)\n",
+			cnt, tidcnt);
+		cnt = tidcnt;
+	}
+	pagep = (struct page **) rcd->tid_pg_list;
+	tidlist = (u16 *) &pagep[dd->rcvtidcnt];
+	pagep += tidoff;
+	tidlist += tidoff;
+
+	memset(tidmap, 0, sizeof(tidmap));
+	/* before decrement; chip actual # */
+	ntids = tidcnt;
+	tidbase = (u64 __iomem *) (((char __iomem *) dd->kregbase) +
+				   dd->rcvtidbase +
+				   ctxttid * sizeof(*tidbase));
+
+	/* virtual address of first page in transfer */
+	vaddr = ti->tidvaddr;
+	if (!access_ok(VERIFY_WRITE, (void __user *) vaddr,
+		       cnt * PAGE_SIZE)) {
+		ret = -EFAULT;
+		goto done;
+	}
+	ret = qib_get_user_pages(vaddr, cnt, pagep);
+	if (ret) {
+		/*
+		 * if (ret == -EBUSY)
+		 * We can't continue because the pagep array won't be
+		 * initialized. This should never happen,
+		 * unless perhaps the user has mpin'ed the pages
+		 * themselves.
+		 */
+		qib_devinfo(dd->pcidev,
+			 "Failed to lock addr %p, %u pages: "
+			 "errno %d\n", (void *) vaddr, cnt, -ret);
+		goto done;
+	}
+	for (i = 0; i < cnt; i++, vaddr += PAGE_SIZE) {
+		for (; ntids--; tid++) {
+			if (tid == tidcnt)
+				tid = 0;
+			if (!dd->pageshadow[ctxttid + tid])
+				break;
+		}
+		if (ntids < 0) {
+			/*
+			 * Oops, wrapped all the way through their TIDs,
+			 * and didn't have enough free; see comments at
+			 * start of routine
+			 */
+			i--;    /* last tidlist[i] not filled in */
+			ret = -ENOMEM;
+			break;
+		}
+		tidlist[i] = tid + tidoff;
+		/* we "know" system pages and TID pages are same size */
+		dd->pageshadow[ctxttid + tid] = pagep[i];
+		dd->physshadow[ctxttid + tid] =
+			qib_map_page(dd->pcidev, pagep[i], 0, PAGE_SIZE,
+				     PCI_DMA_FROMDEVICE);
+		/*
+		 * don't need atomic or it's overhead
+		 */
+		__set_bit(tid, tidmap);
+		physaddr = dd->physshadow[ctxttid + tid];
+		/* PERFORMANCE: below should almost certainly be cached */
+		dd->f_put_tid(dd, &tidbase[tid],
+				  RCVHQ_RCV_TYPE_EXPECTED, physaddr);
+		/*
+		 * don't check this tid in qib_ctxtshadow, since we
+		 * just filled it in; start with the next one.
+		 */
+		tid++;
+	}
+
+	if (ret) {
+		u32 limit;
+cleanup:
+		/* jump here if copy out of updated info failed... */
+		/* same code that's in qib_free_tid() */
+		limit = sizeof(tidmap) * BITS_PER_BYTE;
+		if (limit > tidcnt)
+			/* just in case size changes in future */
+			limit = tidcnt;
+		tid = find_first_bit((const unsigned long *)tidmap, limit);
+		for (; tid < limit; tid++) {
+			if (!test_bit(tid, tidmap))
+				continue;
+			if (dd->pageshadow[ctxttid + tid]) {
+				dma_addr_t phys;
+
+				phys = dd->physshadow[ctxttid + tid];
+				dd->physshadow[ctxttid + tid] = dd->tidinvalid;
+				/* PERFORMANCE: below should almost certainly
+				 * be cached
+				 */
+				dd->f_put_tid(dd, &tidbase[tid],
+					      RCVHQ_RCV_TYPE_EXPECTED,
+					      dd->tidinvalid);
+				pci_unmap_page(dd->pcidev, phys, PAGE_SIZE,
+					       PCI_DMA_FROMDEVICE);
+				dd->pageshadow[ctxttid + tid] = NULL;
+			}
+		}
+		qib_release_user_pages(pagep, cnt);
+	} else {
+		/*
+		 * Copy the updated array, with qib_tid's filled in, back
+		 * to user.  Since we did the copy in already, this "should
+		 * never fail" If it does, we have to clean up...
+		 */
+		if (copy_to_user((void __user *)
+				 (unsigned long) ti->tidlist,
+				 tidlist, cnt * sizeof(*tidlist))) {
+			ret = -EFAULT;
+			goto cleanup;
+		}
+		if (copy_to_user((void __user *) (unsigned long) ti->tidmap,
+				 tidmap, sizeof tidmap)) {
+			ret = -EFAULT;
+			goto cleanup;
+		}
+		if (tid == tidcnt)
+			tid = 0;
+		if (!rcd->subctxt_cnt)
+			rcd->tidcursor = tid;
+		else
+			tidcursor_fp(fp) = tid;
+	}
+
+done:
+	return ret;
+}
+
+/**
+ * qib_tid_free - free a context TID
+ * @rcd: the context
+ * @subctxt: the subcontext
+ * @ti: the TID info
+ *
+ * right now we are unlocking one page at a time, but since
+ * the intended use of this routine is for a single group of
+ * virtually contiguous pages, that should change to improve
+ * performance.  We check that the TID is in range for this context
+ * but otherwise don't check validity; if user has an error and
+ * frees the wrong tid, it's only their own data that can thereby
+ * be corrupted.  We do check that the TID was in use, for sanity
+ * We always use our idea of the saved address, not the address that
+ * they pass in to us.
+ */
+static int qib_tid_free(struct qib_ctxtdata *rcd, unsigned subctxt,
+			const struct qib_tid_info *ti)
+{
+	int ret = 0;
+	u32 tid, ctxttid, cnt, limit, tidcnt;
+	struct qib_devdata *dd = rcd->dd;
+	u64 __iomem *tidbase;
+	unsigned long tidmap[8];
+
+	if (!dd->pageshadow) {
+		ret = -ENOMEM;
+		goto done;
+	}
+
+	if (copy_from_user(tidmap, (void __user *)(unsigned long)ti->tidmap,
+			   sizeof tidmap)) {
+		ret = -EFAULT;
+		goto done;
+	}
+
+	ctxttid = rcd->ctxt * dd->rcvtidcnt;
+	if (!rcd->subctxt_cnt)
+		tidcnt = dd->rcvtidcnt;
+	else if (!subctxt) {
+		tidcnt = (dd->rcvtidcnt / rcd->subctxt_cnt) +
+			 (dd->rcvtidcnt % rcd->subctxt_cnt);
+		ctxttid += dd->rcvtidcnt - tidcnt;
+	} else {
+		tidcnt = dd->rcvtidcnt / rcd->subctxt_cnt;
+		ctxttid += tidcnt * (subctxt - 1);
+	}
+	tidbase = (u64 __iomem *) ((char __iomem *)(dd->kregbase) +
+				   dd->rcvtidbase +
+				   ctxttid * sizeof(*tidbase));
+
+	limit = sizeof(tidmap) * BITS_PER_BYTE;
+	if (limit > tidcnt)
+		/* just in case size changes in future */
+		limit = tidcnt;
+	tid = find_first_bit(tidmap, limit);
+	for (cnt = 0; tid < limit; tid++) {
+		/*
+		 * small optimization; if we detect a run of 3 or so without
+		 * any set, use find_first_bit again.  That's mainly to
+		 * accelerate the case where we wrapped, so we have some at
+		 * the beginning, and some at the end, and a big gap
+		 * in the middle.
+		 */
+		if (!test_bit(tid, tidmap))
+			continue;
+		cnt++;
+		if (dd->pageshadow[ctxttid + tid]) {
+			struct page *p;
+			dma_addr_t phys;
+
+			p = dd->pageshadow[ctxttid + tid];
+			dd->pageshadow[ctxttid + tid] = NULL;
+			phys = dd->physshadow[ctxttid + tid];
+			dd->physshadow[ctxttid + tid] = dd->tidinvalid;
+			/* PERFORMANCE: below should almost certainly be
+			 * cached
+			 */
+			dd->f_put_tid(dd, &tidbase[tid],
+				      RCVHQ_RCV_TYPE_EXPECTED, dd->tidinvalid);
+			pci_unmap_page(dd->pcidev, phys, PAGE_SIZE,
+				       PCI_DMA_FROMDEVICE);
+			qib_release_user_pages(&p, 1);
+		}
+	}
+done:
+	return ret;
+}
+
+/**
+ * qib_set_part_key - set a partition key
+ * @rcd: the context
+ * @key: the key
+ *
+ * We can have up to 4 active at a time (other than the default, which is
+ * always allowed).  This is somewhat tricky, since multiple contexts may set
+ * the same key, so we reference count them, and clean up at exit.  All 4
+ * partition keys are packed into a single qlogic_ib register.  It's an
+ * error for a process to set the same pkey multiple times.  We provide no
+ * mechanism to de-allocate a pkey at this time, we may eventually need to
+ * do that.  I've used the atomic operations, and no locking, and only make
+ * a single pass through what's available.  This should be more than
+ * adequate for some time. I'll think about spinlocks or the like if and as
+ * it's necessary.
+ */
+static int qib_set_part_key(struct qib_ctxtdata *rcd, u16 key)
+{
+	struct qib_pportdata *ppd = rcd->ppd;
+	int i, any = 0, pidx = -1;
+	u16 lkey = key & 0x7FFF;
+	int ret;
+
+	if (lkey == (QIB_DEFAULT_P_KEY & 0x7FFF)) {
+		/* nothing to do; this key always valid */
+		ret = 0;
+		goto bail;
+	}
+
+	if (!lkey) {
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	/*
+	 * Set the full membership bit, because it has to be
+	 * set in the register or the packet, and it seems
+	 * cleaner to set in the register than to force all
+	 * callers to set it.
+	 */
+	key |= 0x8000;
+
+	for (i = 0; i < ARRAY_SIZE(rcd->pkeys); i++) {
+		if (!rcd->pkeys[i] && pidx == -1)
+			pidx = i;
+		if (rcd->pkeys[i] == key) {
+			ret = -EEXIST;
+			goto bail;
+		}
+	}
+	if (pidx == -1) {
+		ret = -EBUSY;
+		goto bail;
+	}
+	for (any = i = 0; i < ARRAY_SIZE(ppd->pkeys); i++) {
+		if (!ppd->pkeys[i]) {
+			any++;
+			continue;
+		}
+		if (ppd->pkeys[i] == key) {
+			atomic_t *pkrefs = &ppd->pkeyrefs[i];
+
+			if (atomic_inc_return(pkrefs) > 1) {
+				rcd->pkeys[pidx] = key;
+				ret = 0;
+				goto bail;
+			} else {
+				/*
+				 * lost race, decrement count, catch below
+				 */
+				atomic_dec(pkrefs);
+				any++;
+			}
+		}
+		if ((ppd->pkeys[i] & 0x7FFF) == lkey) {
+			/*
+			 * It makes no sense to have both the limited and
+			 * full membership PKEY set at the same time since
+			 * the unlimited one will disable the limited one.
+			 */
+			ret = -EEXIST;
+			goto bail;
+		}
+	}
+	if (!any) {
+		ret = -EBUSY;
+		goto bail;
+	}
+	for (any = i = 0; i < ARRAY_SIZE(ppd->pkeys); i++) {
+		if (!ppd->pkeys[i] &&
+		    atomic_inc_return(&ppd->pkeyrefs[i]) == 1) {
+			rcd->pkeys[pidx] = key;
+			ppd->pkeys[i] = key;
+			(void) ppd->dd->f_set_ib_cfg(ppd, QIB_IB_CFG_PKEYS, 0);
+			ret = 0;
+			goto bail;
+		}
+	}
+	ret = -EBUSY;
+
+bail:
+	return ret;
+}
+
+/**
+ * qib_manage_rcvq - manage a context's receive queue
+ * @rcd: the context
+ * @subctxt: the subcontext
+ * @start_stop: action to carry out
+ *
+ * start_stop == 0 disables receive on the context, for use in queue
+ * overflow conditions.  start_stop==1 re-enables, to be used to
+ * re-init the software copy of the head register
+ */
+static int qib_manage_rcvq(struct qib_ctxtdata *rcd, unsigned subctxt,
+			   int start_stop)
+{
+	struct qib_devdata *dd = rcd->dd;
+	unsigned int rcvctrl_op;
+
+	if (subctxt)
+		goto bail;
+	/* atomically clear receive enable ctxt. */
+	if (start_stop) {
+		/*
+		 * On enable, force in-memory copy of the tail register to
+		 * 0, so that protocol code doesn't have to worry about
+		 * whether or not the chip has yet updated the in-memory
+		 * copy or not on return from the system call. The chip
+		 * always resets it's tail register back to 0 on a
+		 * transition from disabled to enabled.
+		 */
+		if (rcd->rcvhdrtail_kvaddr)
+			qib_clear_rcvhdrtail(rcd);
+		rcvctrl_op = QIB_RCVCTRL_CTXT_ENB;
+	} else
+		rcvctrl_op = QIB_RCVCTRL_CTXT_DIS;
+	dd->f_rcvctrl(rcd->ppd, rcvctrl_op, rcd->ctxt);
+	/* always; new head should be equal to new tail; see above */
+bail:
+	return 0;
+}
+
+static void qib_clean_part_key(struct qib_ctxtdata *rcd,
+			       struct qib_devdata *dd)
+{
+	int i, j, pchanged = 0;
+	u64 oldpkey;
+	struct qib_pportdata *ppd = rcd->ppd;
+
+	/* for debugging only */
+	oldpkey = (u64) ppd->pkeys[0] |
+		((u64) ppd->pkeys[1] << 16) |
+		((u64) ppd->pkeys[2] << 32) |
+		((u64) ppd->pkeys[3] << 48);
+
+	for (i = 0; i < ARRAY_SIZE(rcd->pkeys); i++) {
+		if (!rcd->pkeys[i])
+			continue;
+		for (j = 0; j < ARRAY_SIZE(ppd->pkeys); j++) {
+			/* check for match independent of the global bit */
+			if ((ppd->pkeys[j] & 0x7fff) !=
+			    (rcd->pkeys[i] & 0x7fff))
+				continue;
+			if (atomic_dec_and_test(&ppd->pkeyrefs[j])) {
+				ppd->pkeys[j] = 0;
+				pchanged++;
+			}
+			break;
+		}
+		rcd->pkeys[i] = 0;
+	}
+	if (pchanged)
+		(void) ppd->dd->f_set_ib_cfg(ppd, QIB_IB_CFG_PKEYS, 0);
+}
+
+/* common code for the mappings on dma_alloc_coherent mem */
+static int qib_mmap_mem(struct vm_area_struct *vma, struct qib_ctxtdata *rcd,
+			unsigned len, void *kvaddr, u32 write_ok, char *what)
+{
+	struct qib_devdata *dd = rcd->dd;
+	unsigned long pfn;
+	int ret;
+
+	if ((vma->vm_end - vma->vm_start) > len) {
+		qib_devinfo(dd->pcidev,
+			 "FAIL on %s: len %lx > %x\n", what,
+			 vma->vm_end - vma->vm_start, len);
+		ret = -EFAULT;
+		goto bail;
+	}
+
+	/*
+	 * shared context user code requires rcvhdrq mapped r/w, others
+	 * only allowed readonly mapping.
+	 */
+	if (!write_ok) {
+		if (vma->vm_flags & VM_WRITE) {
+			qib_devinfo(dd->pcidev,
+				 "%s must be mapped readonly\n", what);
+			ret = -EPERM;
+			goto bail;
+		}
+
+		/* don't allow them to later change with mprotect */
+		vma->vm_flags &= ~VM_MAYWRITE;
+	}
+
+	pfn = virt_to_phys(kvaddr) >> PAGE_SHIFT;
+	ret = remap_pfn_range(vma, vma->vm_start, pfn,
+			      len, vma->vm_page_prot);
+	if (ret)
+		qib_devinfo(dd->pcidev,
+			"%s ctxt%u mmap of %lx, %x bytes failed: %d\n",
+			what, rcd->ctxt, pfn, len, ret);
+bail:
+	return ret;
+}
+
+static int mmap_ureg(struct vm_area_struct *vma, struct qib_devdata *dd,
+		     u64 ureg)
+{
+	unsigned long phys;
+	unsigned long sz;
+	int ret;
+
+	/*
+	 * This is real hardware, so use io_remap.  This is the mechanism
+	 * for the user process to update the head registers for their ctxt
+	 * in the chip.
+	 */
+	sz = dd->flags & QIB_HAS_HDRSUPP ? 2 * PAGE_SIZE : PAGE_SIZE;
+	if ((vma->vm_end - vma->vm_start) > sz) {
+		qib_devinfo(dd->pcidev,
+			"FAIL mmap userreg: reqlen %lx > PAGE\n",
+			vma->vm_end - vma->vm_start);
+		ret = -EFAULT;
+	} else {
+		phys = dd->physaddr + ureg;
+		vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+
+		vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND;
+		ret = io_remap_pfn_range(vma, vma->vm_start,
+					 phys >> PAGE_SHIFT,
+					 vma->vm_end - vma->vm_start,
+					 vma->vm_page_prot);
+	}
+	return ret;
+}
+
+static int mmap_piobufs(struct vm_area_struct *vma,
+			struct qib_devdata *dd,
+			struct qib_ctxtdata *rcd,
+			unsigned piobufs, unsigned piocnt)
+{
+	unsigned long phys;
+	int ret;
+
+	/*
+	 * When we map the PIO buffers in the chip, we want to map them as
+	 * writeonly, no read possible; unfortunately, x86 doesn't allow
+	 * for this in hardware, but we still prevent users from asking
+	 * for it.
+	 */
+	if ((vma->vm_end - vma->vm_start) > (piocnt * dd->palign)) {
+		qib_devinfo(dd->pcidev,
+			"FAIL mmap piobufs: reqlen %lx > PAGE\n",
+			 vma->vm_end - vma->vm_start);
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	phys = dd->physaddr + piobufs;
+
+#if defined(__powerpc__)
+	/* There isn't a generic way to specify writethrough mappings */
+	pgprot_val(vma->vm_page_prot) |= _PAGE_NO_CACHE;
+	pgprot_val(vma->vm_page_prot) |= _PAGE_WRITETHRU;
+	pgprot_val(vma->vm_page_prot) &= ~_PAGE_GUARDED;
+#endif
+
+	/*
+	 * don't allow them to later change to readable with mprotect (for when
+	 * not initially mapped readable, as is normally the case)
+	 */
+	vma->vm_flags &= ~VM_MAYREAD;
+	vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND;
+
+	if (qib_wc_pat)
+		vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
+
+	ret = io_remap_pfn_range(vma, vma->vm_start, phys >> PAGE_SHIFT,
+				 vma->vm_end - vma->vm_start,
+				 vma->vm_page_prot);
+bail:
+	return ret;
+}
+
+static int mmap_rcvegrbufs(struct vm_area_struct *vma,
+			   struct qib_ctxtdata *rcd)
+{
+	struct qib_devdata *dd = rcd->dd;
+	unsigned long start, size;
+	size_t total_size, i;
+	unsigned long pfn;
+	int ret;
+
+	size = rcd->rcvegrbuf_size;
+	total_size = rcd->rcvegrbuf_chunks * size;
+	if ((vma->vm_end - vma->vm_start) > total_size) {
+		qib_devinfo(dd->pcidev,
+			"FAIL on egr bufs: reqlen %lx > actual %lx\n",
+			 vma->vm_end - vma->vm_start,
+			 (unsigned long) total_size);
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	if (vma->vm_flags & VM_WRITE) {
+		qib_devinfo(dd->pcidev,
+			"Can't map eager buffers as writable (flags=%lx)\n",
+			vma->vm_flags);
+		ret = -EPERM;
+		goto bail;
+	}
+	/* don't allow them to later change to writeable with mprotect */
+	vma->vm_flags &= ~VM_MAYWRITE;
+
+	start = vma->vm_start;
+
+	for (i = 0; i < rcd->rcvegrbuf_chunks; i++, start += size) {
+		pfn = virt_to_phys(rcd->rcvegrbuf[i]) >> PAGE_SHIFT;
+		ret = remap_pfn_range(vma, start, pfn, size,
+				      vma->vm_page_prot);
+		if (ret < 0)
+			goto bail;
+	}
+	ret = 0;
+
+bail:
+	return ret;
+}
+
+/*
+ * qib_file_vma_fault - handle a VMA page fault.
+ */
+static int qib_file_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct page *page;
+
+	page = vmalloc_to_page((void *)(vmf->pgoff << PAGE_SHIFT));
+	if (!page)
+		return VM_FAULT_SIGBUS;
+
+	get_page(page);
+	vmf->page = page;
+
+	return 0;
+}
+
+static struct vm_operations_struct qib_file_vm_ops = {
+	.fault = qib_file_vma_fault,
+};
+
+static int mmap_kvaddr(struct vm_area_struct *vma, u64 pgaddr,
+		       struct qib_ctxtdata *rcd, unsigned subctxt)
+{
+	struct qib_devdata *dd = rcd->dd;
+	unsigned subctxt_cnt;
+	unsigned long len;
+	void *addr;
+	size_t size;
+	int ret = 0;
+
+	subctxt_cnt = rcd->subctxt_cnt;
+	size = rcd->rcvegrbuf_chunks * rcd->rcvegrbuf_size;
+
+	/*
+	 * Each process has all the subctxt uregbase, rcvhdrq, and
+	 * rcvegrbufs mmapped - as an array for all the processes,
+	 * and also separately for this process.
+	 */
+	if (pgaddr == cvt_kvaddr(rcd->subctxt_uregbase)) {
+		addr = rcd->subctxt_uregbase;
+		size = PAGE_SIZE * subctxt_cnt;
+	} else if (pgaddr == cvt_kvaddr(rcd->subctxt_rcvhdr_base)) {
+		addr = rcd->subctxt_rcvhdr_base;
+		size = rcd->rcvhdrq_size * subctxt_cnt;
+	} else if (pgaddr == cvt_kvaddr(rcd->subctxt_rcvegrbuf)) {
+		addr = rcd->subctxt_rcvegrbuf;
+		size *= subctxt_cnt;
+	} else if (pgaddr == cvt_kvaddr(rcd->subctxt_uregbase +
+					PAGE_SIZE * subctxt)) {
+		addr = rcd->subctxt_uregbase + PAGE_SIZE * subctxt;
+		size = PAGE_SIZE;
+	} else if (pgaddr == cvt_kvaddr(rcd->subctxt_rcvhdr_base +
+					rcd->rcvhdrq_size * subctxt)) {
+		addr = rcd->subctxt_rcvhdr_base +
+			rcd->rcvhdrq_size * subctxt;
+		size = rcd->rcvhdrq_size;
+	} else if (pgaddr == cvt_kvaddr(&rcd->user_event_mask[subctxt])) {
+		addr = rcd->user_event_mask;
+		size = PAGE_SIZE;
+	} else if (pgaddr == cvt_kvaddr(rcd->subctxt_rcvegrbuf +
+					size * subctxt)) {
+		addr = rcd->subctxt_rcvegrbuf + size * subctxt;
+		/* rcvegrbufs are read-only on the slave */
+		if (vma->vm_flags & VM_WRITE) {
+			qib_devinfo(dd->pcidev,
+				 "Can't map eager buffers as "
+				 "writable (flags=%lx)\n", vma->vm_flags);
+			ret = -EPERM;
+			goto bail;
+		}
+		/*
+		 * Don't allow permission to later change to writeable
+		 * with mprotect.
+		 */
+		vma->vm_flags &= ~VM_MAYWRITE;
+	} else
+		goto bail;
+	len = vma->vm_end - vma->vm_start;
+	if (len > size) {
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	vma->vm_pgoff = (unsigned long) addr >> PAGE_SHIFT;
+	vma->vm_ops = &qib_file_vm_ops;
+	vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
+	ret = 1;
+
+bail:
+	return ret;
+}
+
+/**
+ * qib_mmapf - mmap various structures into user space
+ * @fp: the file pointer
+ * @vma: the VM area
+ *
+ * We use this to have a shared buffer between the kernel and the user code
+ * for the rcvhdr queue, egr buffers, and the per-context user regs and pio
+ * buffers in the chip.  We have the open and close entries so we can bump
+ * the ref count and keep the driver from being unloaded while still mapped.
+ */
+static int qib_mmapf(struct file *fp, struct vm_area_struct *vma)
+{
+	struct qib_ctxtdata *rcd;
+	struct qib_devdata *dd;
+	u64 pgaddr, ureg;
+	unsigned piobufs, piocnt;
+	int ret, match = 1;
+
+	rcd = ctxt_fp(fp);
+	if (!rcd || !(vma->vm_flags & VM_SHARED)) {
+		ret = -EINVAL;
+		goto bail;
+	}
+	dd = rcd->dd;
+
+	/*
+	 * This is the qib_do_user_init() code, mapping the shared buffers
+	 * and per-context user registers into the user process. The address
+	 * referred to by vm_pgoff is the file offset passed via mmap().
+	 * For shared contexts, this is the kernel vmalloc() address of the
+	 * pages to share with the master.
+	 * For non-shared or master ctxts, this is a physical address.
+	 * We only do one mmap for each space mapped.
+	 */
+	pgaddr = vma->vm_pgoff << PAGE_SHIFT;
+
+	/*
+	 * Check for 0 in case one of the allocations failed, but user
+	 * called mmap anyway.
+	 */
+	if (!pgaddr)  {
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	/*
+	 * Physical addresses must fit in 40 bits for our hardware.
+	 * Check for kernel virtual addresses first, anything else must
+	 * match a HW or memory address.
+	 */
+	ret = mmap_kvaddr(vma, pgaddr, rcd, subctxt_fp(fp));
+	if (ret) {
+		if (ret > 0)
+			ret = 0;
+		goto bail;
+	}
+
+	ureg = dd->uregbase + dd->ureg_align * rcd->ctxt;
+	if (!rcd->subctxt_cnt) {
+		/* ctxt is not shared */
+		piocnt = rcd->piocnt;
+		piobufs = rcd->piobufs;
+	} else if (!subctxt_fp(fp)) {
+		/* caller is the master */
+		piocnt = (rcd->piocnt / rcd->subctxt_cnt) +
+			 (rcd->piocnt % rcd->subctxt_cnt);
+		piobufs = rcd->piobufs +
+			dd->palign * (rcd->piocnt - piocnt);
+	} else {
+		unsigned slave = subctxt_fp(fp) - 1;
+
+		/* caller is a slave */
+		piocnt = rcd->piocnt / rcd->subctxt_cnt;
+		piobufs = rcd->piobufs + dd->palign * piocnt * slave;
+	}
+
+	if (pgaddr == ureg)
+		ret = mmap_ureg(vma, dd, ureg);
+	else if (pgaddr == piobufs)
+		ret = mmap_piobufs(vma, dd, rcd, piobufs, piocnt);
+	else if (pgaddr == dd->pioavailregs_phys)
+		/* in-memory copy of pioavail registers */
+		ret = qib_mmap_mem(vma, rcd, PAGE_SIZE,
+				   (void *) dd->pioavailregs_dma, 0,
+				   "pioavail registers");
+	else if (pgaddr == rcd->rcvegr_phys)
+		ret = mmap_rcvegrbufs(vma, rcd);
+	else if (pgaddr == (u64) rcd->rcvhdrq_phys)
+		/*
+		 * The rcvhdrq itself; multiple pages, contiguous
+		 * from an i/o perspective.  Shared contexts need
+		 * to map r/w, so we allow writing.
+		 */
+		ret = qib_mmap_mem(vma, rcd, rcd->rcvhdrq_size,
+				   rcd->rcvhdrq, 1, "rcvhdrq");
+	else if (pgaddr == (u64) rcd->rcvhdrqtailaddr_phys)
+		/* in-memory copy of rcvhdrq tail register */
+		ret = qib_mmap_mem(vma, rcd, PAGE_SIZE,
+				   rcd->rcvhdrtail_kvaddr, 0,
+				   "rcvhdrq tail");
+	else
+		match = 0;
+	if (!match)
+		ret = -EINVAL;
+
+	vma->vm_private_data = NULL;
+
+	if (ret < 0)
+		qib_devinfo(dd->pcidev,
+			 "mmap Failure %d: off %llx len %lx\n",
+			 -ret, (unsigned long long)pgaddr,
+			 vma->vm_end - vma->vm_start);
+bail:
+	return ret;
+}
+
+static unsigned int qib_poll_urgent(struct qib_ctxtdata *rcd,
+				    struct file *fp,
+				    struct poll_table_struct *pt)
+{
+	struct qib_devdata *dd = rcd->dd;
+	unsigned pollflag;
+
+	poll_wait(fp, &rcd->wait, pt);
+
+	spin_lock_irq(&dd->uctxt_lock);
+	if (rcd->urgent != rcd->urgent_poll) {
+		pollflag = POLLIN | POLLRDNORM;
+		rcd->urgent_poll = rcd->urgent;
+	} else {
+		pollflag = 0;
+		set_bit(QIB_CTXT_WAITING_URG, &rcd->flag);
+	}
+	spin_unlock_irq(&dd->uctxt_lock);
+
+	return pollflag;
+}
+
+static unsigned int qib_poll_next(struct qib_ctxtdata *rcd,
+				  struct file *fp,
+				  struct poll_table_struct *pt)
+{
+	struct qib_devdata *dd = rcd->dd;
+	unsigned pollflag;
+
+	poll_wait(fp, &rcd->wait, pt);
+
+	spin_lock_irq(&dd->uctxt_lock);
+	if (dd->f_hdrqempty(rcd)) {
+		set_bit(QIB_CTXT_WAITING_RCV, &rcd->flag);
+		dd->f_rcvctrl(rcd->ppd, QIB_RCVCTRL_INTRAVAIL_ENB, rcd->ctxt);
+		pollflag = 0;
+	} else
+		pollflag = POLLIN | POLLRDNORM;
+	spin_unlock_irq(&dd->uctxt_lock);
+
+	return pollflag;
+}
+
+static unsigned int qib_poll(struct file *fp, struct poll_table_struct *pt)
+{
+	struct qib_ctxtdata *rcd;
+	unsigned pollflag;
+
+	rcd = ctxt_fp(fp);
+	if (!rcd)
+		pollflag = POLLERR;
+	else if (rcd->poll_type == QIB_POLL_TYPE_URGENT)
+		pollflag = qib_poll_urgent(rcd, fp, pt);
+	else  if (rcd->poll_type == QIB_POLL_TYPE_ANYRCV)
+		pollflag = qib_poll_next(rcd, fp, pt);
+	else /* invalid */
+		pollflag = POLLERR;
+
+	return pollflag;
+}
+
+static void assign_ctxt_affinity(struct file *fp, struct qib_devdata *dd)
+{
+	struct qib_filedata *fd = fp->private_data;
+	const unsigned int weight = cpumask_weight(&current->cpus_allowed);
+	const struct cpumask *local_mask = cpumask_of_pcibus(dd->pcidev->bus);
+	int local_cpu;
+
+	/*
+	 * If process has NOT already set it's affinity, select and
+	 * reserve a processor for it on the local NUMA node.
+	 */
+	if ((weight >= qib_cpulist_count) &&
+		(cpumask_weight(local_mask) <= qib_cpulist_count)) {
+		for_each_cpu(local_cpu, local_mask)
+			if (!test_and_set_bit(local_cpu, qib_cpulist)) {
+				fd->rec_cpu_num = local_cpu;
+				return;
+			}
+	}
+
+	/*
+	 * If process has NOT already set it's affinity, select and
+	 * reserve a processor for it, as a rendevous for all
+	 * users of the driver.  If they don't actually later
+	 * set affinity to this cpu, or set it to some other cpu,
+	 * it just means that sooner or later we don't recommend
+	 * a cpu, and let the scheduler do it's best.
+	 */
+	if (weight >= qib_cpulist_count) {
+		int cpu;
+		cpu = find_first_zero_bit(qib_cpulist,
+					  qib_cpulist_count);
+		if (cpu == qib_cpulist_count)
+			qib_dev_err(dd,
+			"no cpus avail for affinity PID %u\n",
+			current->pid);
+		else {
+			__set_bit(cpu, qib_cpulist);
+			fd->rec_cpu_num = cpu;
+		}
+	}
+}
+
+/*
+ * Check that userland and driver are compatible for subcontexts.
+ */
+static int qib_compatible_subctxts(int user_swmajor, int user_swminor)
+{
+	/* this code is written long-hand for clarity */
+	if (QIB_USER_SWMAJOR != user_swmajor) {
+		/* no promise of compatibility if major mismatch */
+		return 0;
+	}
+	if (QIB_USER_SWMAJOR == 1) {
+		switch (QIB_USER_SWMINOR) {
+		case 0:
+		case 1:
+		case 2:
+			/* no subctxt implementation so cannot be compatible */
+			return 0;
+		case 3:
+			/* 3 is only compatible with itself */
+			return user_swminor == 3;
+		default:
+			/* >= 4 are compatible (or are expected to be) */
+			return user_swminor <= QIB_USER_SWMINOR;
+		}
+	}
+	/* make no promises yet for future major versions */
+	return 0;
+}
+
+static int init_subctxts(struct qib_devdata *dd,
+			 struct qib_ctxtdata *rcd,
+			 const struct qib_user_info *uinfo)
+{
+	int ret = 0;
+	unsigned num_subctxts;
+	size_t size;
+
+	/*
+	 * If the user is requesting zero subctxts,
+	 * skip the subctxt allocation.
+	 */
+	if (uinfo->spu_subctxt_cnt <= 0)
+		goto bail;
+	num_subctxts = uinfo->spu_subctxt_cnt;
+
+	/* Check for subctxt compatibility */
+	if (!qib_compatible_subctxts(uinfo->spu_userversion >> 16,
+		uinfo->spu_userversion & 0xffff)) {
+		qib_devinfo(dd->pcidev,
+			 "Mismatched user version (%d.%d) and driver "
+			 "version (%d.%d) while context sharing. Ensure "
+			 "that driver and library are from the same "
+			 "release.\n",
+			 (int) (uinfo->spu_userversion >> 16),
+			 (int) (uinfo->spu_userversion & 0xffff),
+			 QIB_USER_SWMAJOR, QIB_USER_SWMINOR);
+		goto bail;
+	}
+	if (num_subctxts > QLOGIC_IB_MAX_SUBCTXT) {
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	rcd->subctxt_uregbase = vmalloc_user(PAGE_SIZE * num_subctxts);
+	if (!rcd->subctxt_uregbase) {
+		ret = -ENOMEM;
+		goto bail;
+	}
+	/* Note: rcd->rcvhdrq_size isn't initialized yet. */
+	size = ALIGN(dd->rcvhdrcnt * dd->rcvhdrentsize *
+		     sizeof(u32), PAGE_SIZE) * num_subctxts;
+	rcd->subctxt_rcvhdr_base = vmalloc_user(size);
+	if (!rcd->subctxt_rcvhdr_base) {
+		ret = -ENOMEM;
+		goto bail_ureg;
+	}
+
+	rcd->subctxt_rcvegrbuf = vmalloc_user(rcd->rcvegrbuf_chunks *
+					      rcd->rcvegrbuf_size *
+					      num_subctxts);
+	if (!rcd->subctxt_rcvegrbuf) {
+		ret = -ENOMEM;
+		goto bail_rhdr;
+	}
+
+	rcd->subctxt_cnt = uinfo->spu_subctxt_cnt;
+	rcd->subctxt_id = uinfo->spu_subctxt_id;
+	rcd->active_slaves = 1;
+	rcd->redirect_seq_cnt = 1;
+	set_bit(QIB_CTXT_MASTER_UNINIT, &rcd->flag);
+	goto bail;
+
+bail_rhdr:
+	vfree(rcd->subctxt_rcvhdr_base);
+bail_ureg:
+	vfree(rcd->subctxt_uregbase);
+	rcd->subctxt_uregbase = NULL;
+bail:
+	return ret;
+}
+
+static int setup_ctxt(struct qib_pportdata *ppd, int ctxt,
+		      struct file *fp, const struct qib_user_info *uinfo)
+{
+	struct qib_filedata *fd = fp->private_data;
+	struct qib_devdata *dd = ppd->dd;
+	struct qib_ctxtdata *rcd;
+	void *ptmp = NULL;
+	int ret;
+	int numa_id;
+
+	assign_ctxt_affinity(fp, dd);
+
+	numa_id = qib_numa_aware ? ((fd->rec_cpu_num != -1) ?
+		cpu_to_node(fd->rec_cpu_num) :
+		numa_node_id()) : dd->assigned_node_id;
+
+	rcd = qib_create_ctxtdata(ppd, ctxt, numa_id);
+
+	/*
+	 * Allocate memory for use in qib_tid_update() at open to
+	 * reduce cost of expected send setup per message segment
+	 */
+	if (rcd)
+		ptmp = kmalloc(dd->rcvtidcnt * sizeof(u16) +
+			       dd->rcvtidcnt * sizeof(struct page **),
+			       GFP_KERNEL);
+
+	if (!rcd || !ptmp) {
+		qib_dev_err(dd,
+			"Unable to allocate ctxtdata memory, failing open\n");
+		ret = -ENOMEM;
+		goto bailerr;
+	}
+	rcd->userversion = uinfo->spu_userversion;
+	ret = init_subctxts(dd, rcd, uinfo);
+	if (ret)
+		goto bailerr;
+	rcd->tid_pg_list = ptmp;
+	rcd->pid = current->pid;
+	init_waitqueue_head(&dd->rcd[ctxt]->wait);
+	strlcpy(rcd->comm, current->comm, sizeof(rcd->comm));
+	ctxt_fp(fp) = rcd;
+	qib_stats.sps_ctxts++;
+	dd->freectxts--;
+	ret = 0;
+	goto bail;
+
+bailerr:
+	if (fd->rec_cpu_num != -1)
+		__clear_bit(fd->rec_cpu_num, qib_cpulist);
+
+	dd->rcd[ctxt] = NULL;
+	kfree(rcd);
+	kfree(ptmp);
+bail:
+	return ret;
+}
+
+static inline int usable(struct qib_pportdata *ppd)
+{
+	struct qib_devdata *dd = ppd->dd;
+
+	return dd && (dd->flags & QIB_PRESENT) && dd->kregbase && ppd->lid &&
+		(ppd->lflags & QIBL_LINKACTIVE);
+}
+
+/*
+ * Select a context on the given device, either using a requested port
+ * or the port based on the context number.
+ */
+static int choose_port_ctxt(struct file *fp, struct qib_devdata *dd, u32 port,
+			    const struct qib_user_info *uinfo)
+{
+	struct qib_pportdata *ppd = NULL;
+	int ret, ctxt;
+
+	if (port) {
+		if (!usable(dd->pport + port - 1)) {
+			ret = -ENETDOWN;
+			goto done;
+		} else
+			ppd = dd->pport + port - 1;
+	}
+	for (ctxt = dd->first_user_ctxt; ctxt < dd->cfgctxts && dd->rcd[ctxt];
+	     ctxt++)
+		;
+	if (ctxt == dd->cfgctxts) {
+		ret = -EBUSY;
+		goto done;
+	}
+	if (!ppd) {
+		u32 pidx = ctxt % dd->num_pports;
+		if (usable(dd->pport + pidx))
+			ppd = dd->pport + pidx;
+		else {
+			for (pidx = 0; pidx < dd->num_pports && !ppd;
+			     pidx++)
+				if (usable(dd->pport + pidx))
+					ppd = dd->pport + pidx;
+		}
+	}
+	ret = ppd ? setup_ctxt(ppd, ctxt, fp, uinfo) : -ENETDOWN;
+done:
+	return ret;
+}
+
+static int find_free_ctxt(int unit, struct file *fp,
+			  const struct qib_user_info *uinfo)
+{
+	struct qib_devdata *dd = qib_lookup(unit);
+	int ret;
+
+	if (!dd || (uinfo->spu_port && uinfo->spu_port > dd->num_pports))
+		ret = -ENODEV;
+	else
+		ret = choose_port_ctxt(fp, dd, uinfo->spu_port, uinfo);
+
+	return ret;
+}
+
+static int get_a_ctxt(struct file *fp, const struct qib_user_info *uinfo,
+		      unsigned alg)
+{
+	struct qib_devdata *udd = NULL;
+	int ret = 0, devmax, npresent, nup, ndev, dusable = 0, i;
+	u32 port = uinfo->spu_port, ctxt;
+
+	devmax = qib_count_units(&npresent, &nup);
+	if (!npresent) {
+		ret = -ENXIO;
+		goto done;
+	}
+	if (nup == 0) {
+		ret = -ENETDOWN;
+		goto done;
+	}
+
+	if (alg == QIB_PORT_ALG_ACROSS) {
+		unsigned inuse = ~0U;
+		/* find device (with ACTIVE ports) with fewest ctxts in use */
+		for (ndev = 0; ndev < devmax; ndev++) {
+			struct qib_devdata *dd = qib_lookup(ndev);
+			unsigned cused = 0, cfree = 0, pusable = 0;
+			if (!dd)
+				continue;
+			if (port && port <= dd->num_pports &&
+			    usable(dd->pport + port - 1))
+				pusable = 1;
+			else
+				for (i = 0; i < dd->num_pports; i++)
+					if (usable(dd->pport + i))
+						pusable++;
+			if (!pusable)
+				continue;
+			for (ctxt = dd->first_user_ctxt; ctxt < dd->cfgctxts;
+			     ctxt++)
+				if (dd->rcd[ctxt])
+					cused++;
+				else
+					cfree++;
+			if (cfree && cused < inuse) {
+				udd = dd;
+				inuse = cused;
+			}
+		}
+		if (udd) {
+			ret = choose_port_ctxt(fp, udd, port, uinfo);
+			goto done;
+		}
+	} else {
+		for (ndev = 0; ndev < devmax; ndev++) {
+			struct qib_devdata *dd = qib_lookup(ndev);
+			if (dd) {
+				ret = choose_port_ctxt(fp, dd, port, uinfo);
+				if (!ret)
+					goto done;
+				if (ret == -EBUSY)
+					dusable++;
+			}
+		}
+	}
+	ret = dusable ? -EBUSY : -ENETDOWN;
+
+done:
+	return ret;
+}
+
+static int find_shared_ctxt(struct file *fp,
+			    const struct qib_user_info *uinfo)
+{
+	int devmax, ndev, i;
+	int ret = 0;
+
+	devmax = qib_count_units(NULL, NULL);
+
+	for (ndev = 0; ndev < devmax; ndev++) {
+		struct qib_devdata *dd = qib_lookup(ndev);
+
+		/* device portion of usable() */
+		if (!(dd && (dd->flags & QIB_PRESENT) && dd->kregbase))
+			continue;
+		for (i = dd->first_user_ctxt; i < dd->cfgctxts; i++) {
+			struct qib_ctxtdata *rcd = dd->rcd[i];
+
+			/* Skip ctxts which are not yet open */
+			if (!rcd || !rcd->cnt)
+				continue;
+			/* Skip ctxt if it doesn't match the requested one */
+			if (rcd->subctxt_id != uinfo->spu_subctxt_id)
+				continue;
+			/* Verify the sharing process matches the master */
+			if (rcd->subctxt_cnt != uinfo->spu_subctxt_cnt ||
+			    rcd->userversion != uinfo->spu_userversion ||
+			    rcd->cnt >= rcd->subctxt_cnt) {
+				ret = -EINVAL;
+				goto done;
+			}
+			ctxt_fp(fp) = rcd;
+			subctxt_fp(fp) = rcd->cnt++;
+			rcd->subpid[subctxt_fp(fp)] = current->pid;
+			tidcursor_fp(fp) = 0;
+			rcd->active_slaves |= 1 << subctxt_fp(fp);
+			ret = 1;
+			goto done;
+		}
+	}
+
+done:
+	return ret;
+}
+
+static int qib_open(struct inode *in, struct file *fp)
+{
+	/* The real work is performed later in qib_assign_ctxt() */
+	fp->private_data = kzalloc(sizeof(struct qib_filedata), GFP_KERNEL);
+	if (fp->private_data) /* no cpu affinity by default */
+		((struct qib_filedata *)fp->private_data)->rec_cpu_num = -1;
+	return fp->private_data ? 0 : -ENOMEM;
+}
+
+static int find_hca(unsigned int cpu, int *unit)
+{
+	int ret = 0, devmax, npresent, nup, ndev;
+
+	*unit = -1;
+
+	devmax = qib_count_units(&npresent, &nup);
+	if (!npresent) {
+		ret = -ENXIO;
+		goto done;
+	}
+	if (!nup) {
+		ret = -ENETDOWN;
+		goto done;
+	}
+	for (ndev = 0; ndev < devmax; ndev++) {
+		struct qib_devdata *dd = qib_lookup(ndev);
+		if (dd) {
+			if (pcibus_to_node(dd->pcidev->bus) < 0) {
+				ret = -EINVAL;
+				goto done;
+			}
+			if (cpu_to_node(cpu) ==
+				pcibus_to_node(dd->pcidev->bus)) {
+				*unit = ndev;
+				goto done;
+			}
+		}
+	}
+done:
+	return ret;
+}
+
+static int do_qib_user_sdma_queue_create(struct file *fp)
+{
+	struct qib_filedata *fd = fp->private_data;
+	struct qib_ctxtdata *rcd = fd->rcd;
+	struct qib_devdata *dd = rcd->dd;
+
+	if (dd->flags & QIB_HAS_SEND_DMA) {
+
+		fd->pq = qib_user_sdma_queue_create(&dd->pcidev->dev,
+						    dd->unit,
+						    rcd->ctxt,
+						    fd->subctxt);
+		if (!fd->pq)
+			return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/*
+ * Get ctxt early, so can set affinity prior to memory allocation.
+ */
+static int qib_assign_ctxt(struct file *fp, const struct qib_user_info *uinfo)
+{
+	int ret;
+	int i_minor;
+	unsigned swmajor, swminor, alg = QIB_PORT_ALG_ACROSS;
+
+	/* Check to be sure we haven't already initialized this file */
+	if (ctxt_fp(fp)) {
+		ret = -EINVAL;
+		goto done;
+	}
+
+	/* for now, if major version is different, bail */
+	swmajor = uinfo->spu_userversion >> 16;
+	if (swmajor != QIB_USER_SWMAJOR) {
+		ret = -ENODEV;
+		goto done;
+	}
+
+	swminor = uinfo->spu_userversion & 0xffff;
+
+	if (swminor >= 11 && uinfo->spu_port_alg < QIB_PORT_ALG_COUNT)
+		alg = uinfo->spu_port_alg;
+
+	mutex_lock(&qib_mutex);
+
+	if (qib_compatible_subctxts(swmajor, swminor) &&
+	    uinfo->spu_subctxt_cnt) {
+		ret = find_shared_ctxt(fp, uinfo);
+		if (ret > 0) {
+			ret = do_qib_user_sdma_queue_create(fp);
+			if (!ret)
+				assign_ctxt_affinity(fp, (ctxt_fp(fp))->dd);
+			goto done_ok;
+		}
+	}
+
+	i_minor = iminor(file_inode(fp)) - QIB_USER_MINOR_BASE;
+	if (i_minor)
+		ret = find_free_ctxt(i_minor - 1, fp, uinfo);
+	else {
+		int unit;
+		const unsigned int cpu = cpumask_first(&current->cpus_allowed);
+		const unsigned int weight =
+			cpumask_weight(&current->cpus_allowed);
+
+		if (weight == 1 && !test_bit(cpu, qib_cpulist))
+			if (!find_hca(cpu, &unit) && unit >= 0)
+				if (!find_free_ctxt(unit, fp, uinfo)) {
+					ret = 0;
+					goto done_chk_sdma;
+				}
+		ret = get_a_ctxt(fp, uinfo, alg);
+	}
+
+done_chk_sdma:
+	if (!ret)
+		ret = do_qib_user_sdma_queue_create(fp);
+done_ok:
+	mutex_unlock(&qib_mutex);
+
+done:
+	return ret;
+}
+
+
+static int qib_do_user_init(struct file *fp,
+			    const struct qib_user_info *uinfo)
+{
+	int ret;
+	struct qib_ctxtdata *rcd = ctxt_fp(fp);
+	struct qib_devdata *dd;
+	unsigned uctxt;
+
+	/* Subctxts don't need to initialize anything since master did it. */
+	if (subctxt_fp(fp)) {
+		ret = wait_event_interruptible(rcd->wait,
+			!test_bit(QIB_CTXT_MASTER_UNINIT, &rcd->flag));
+		goto bail;
+	}
+
+	dd = rcd->dd;
+
+	/* some ctxts may get extra buffers, calculate that here */
+	uctxt = rcd->ctxt - dd->first_user_ctxt;
+	if (uctxt < dd->ctxts_extrabuf) {
+		rcd->piocnt = dd->pbufsctxt + 1;
+		rcd->pio_base = rcd->piocnt * uctxt;
+	} else {
+		rcd->piocnt = dd->pbufsctxt;
+		rcd->pio_base = rcd->piocnt * uctxt +
+			dd->ctxts_extrabuf;
+	}
+
+	/*
+	 * All user buffers are 2KB buffers.  If we ever support
+	 * giving 4KB buffers to user processes, this will need some
+	 * work.  Can't use piobufbase directly, because it has
+	 * both 2K and 4K buffer base values.  So check and handle.
+	 */
+	if ((rcd->pio_base + rcd->piocnt) > dd->piobcnt2k) {
+		if (rcd->pio_base >= dd->piobcnt2k) {
+			qib_dev_err(dd,
+				    "%u:ctxt%u: no 2KB buffers available\n",
+				    dd->unit, rcd->ctxt);
+			ret = -ENOBUFS;
+			goto bail;
+		}
+		rcd->piocnt = dd->piobcnt2k - rcd->pio_base;
+		qib_dev_err(dd, "Ctxt%u: would use 4KB bufs, using %u\n",
+			    rcd->ctxt, rcd->piocnt);
+	}
+
+	rcd->piobufs = dd->pio2k_bufbase + rcd->pio_base * dd->palign;
+	qib_chg_pioavailkernel(dd, rcd->pio_base, rcd->piocnt,
+			       TXCHK_CHG_TYPE_USER, rcd);
+	/*
+	 * try to ensure that processes start up with consistent avail update
+	 * for their own range, at least.   If system very quiet, it might
+	 * have the in-memory copy out of date at startup for this range of
+	 * buffers, when a context gets re-used.  Do after the chg_pioavail
+	 * and before the rest of setup, so it's "almost certain" the dma
+	 * will have occurred (can't 100% guarantee, but should be many
+	 * decimals of 9s, with this ordering), given how much else happens
+	 * after this.
+	 */
+	dd->f_sendctrl(dd->pport, QIB_SENDCTRL_AVAIL_BLIP);
+
+	/*
+	 * Now allocate the rcvhdr Q and eager TIDs; skip the TID
+	 * array for time being.  If rcd->ctxt > chip-supported,
+	 * we need to do extra stuff here to handle by handling overflow
+	 * through ctxt 0, someday
+	 */
+	ret = qib_create_rcvhdrq(dd, rcd);
+	if (!ret)
+		ret = qib_setup_eagerbufs(rcd);
+	if (ret)
+		goto bail_pio;
+
+	rcd->tidcursor = 0; /* start at beginning after open */
+
+	/* initialize poll variables... */
+	rcd->urgent = 0;
+	rcd->urgent_poll = 0;
+
+	/*
+	 * Now enable the ctxt for receive.
+	 * For chips that are set to DMA the tail register to memory
+	 * when they change (and when the update bit transitions from
+	 * 0 to 1.  So for those chips, we turn it off and then back on.
+	 * This will (very briefly) affect any other open ctxts, but the
+	 * duration is very short, and therefore isn't an issue.  We
+	 * explicitly set the in-memory tail copy to 0 beforehand, so we
+	 * don't have to wait to be sure the DMA update has happened
+	 * (chip resets head/tail to 0 on transition to enable).
+	 */
+	if (rcd->rcvhdrtail_kvaddr)
+		qib_clear_rcvhdrtail(rcd);
+
+	dd->f_rcvctrl(rcd->ppd, QIB_RCVCTRL_CTXT_ENB | QIB_RCVCTRL_TIDFLOW_ENB,
+		      rcd->ctxt);
+
+	/* Notify any waiting slaves */
+	if (rcd->subctxt_cnt) {
+		clear_bit(QIB_CTXT_MASTER_UNINIT, &rcd->flag);
+		wake_up(&rcd->wait);
+	}
+	return 0;
+
+bail_pio:
+	qib_chg_pioavailkernel(dd, rcd->pio_base, rcd->piocnt,
+			       TXCHK_CHG_TYPE_KERN, rcd);
+bail:
+	return ret;
+}
+
+/**
+ * unlock_exptid - unlock any expected TID entries context still had in use
+ * @rcd: ctxt
+ *
+ * We don't actually update the chip here, because we do a bulk update
+ * below, using f_clear_tids.
+ */
+static void unlock_expected_tids(struct qib_ctxtdata *rcd)
+{
+	struct qib_devdata *dd = rcd->dd;
+	int ctxt_tidbase = rcd->ctxt * dd->rcvtidcnt;
+	int i, cnt = 0, maxtid = ctxt_tidbase + dd->rcvtidcnt;
+
+	for (i = ctxt_tidbase; i < maxtid; i++) {
+		struct page *p = dd->pageshadow[i];
+		dma_addr_t phys;
+
+		if (!p)
+			continue;
+
+		phys = dd->physshadow[i];
+		dd->physshadow[i] = dd->tidinvalid;
+		dd->pageshadow[i] = NULL;
+		pci_unmap_page(dd->pcidev, phys, PAGE_SIZE,
+			       PCI_DMA_FROMDEVICE);
+		qib_release_user_pages(&p, 1);
+		cnt++;
+	}
+}
+
+static int qib_close(struct inode *in, struct file *fp)
+{
+	int ret = 0;
+	struct qib_filedata *fd;
+	struct qib_ctxtdata *rcd;
+	struct qib_devdata *dd;
+	unsigned long flags;
+	unsigned ctxt;
+	pid_t pid;
+
+	mutex_lock(&qib_mutex);
+
+	fd = fp->private_data;
+	fp->private_data = NULL;
+	rcd = fd->rcd;
+	if (!rcd) {
+		mutex_unlock(&qib_mutex);
+		goto bail;
+	}
+
+	dd = rcd->dd;
+
+	/* ensure all pio buffer writes in progress are flushed */
+	qib_flush_wc();
+
+	/* drain user sdma queue */
+	if (fd->pq) {
+		qib_user_sdma_queue_drain(rcd->ppd, fd->pq);
+		qib_user_sdma_queue_destroy(fd->pq);
+	}
+
+	if (fd->rec_cpu_num != -1)
+		__clear_bit(fd->rec_cpu_num, qib_cpulist);
+
+	if (--rcd->cnt) {
+		/*
+		 * XXX If the master closes the context before the slave(s),
+		 * revoke the mmap for the eager receive queue so
+		 * the slave(s) don't wait for receive data forever.
+		 */
+		rcd->active_slaves &= ~(1 << fd->subctxt);
+		rcd->subpid[fd->subctxt] = 0;
+		mutex_unlock(&qib_mutex);
+		goto bail;
+	}
+
+	/* early; no interrupt users after this */
+	spin_lock_irqsave(&dd->uctxt_lock, flags);
+	ctxt = rcd->ctxt;
+	dd->rcd[ctxt] = NULL;
+	pid = rcd->pid;
+	rcd->pid = 0;
+	spin_unlock_irqrestore(&dd->uctxt_lock, flags);
+
+	if (rcd->rcvwait_to || rcd->piowait_to ||
+	    rcd->rcvnowait || rcd->pionowait) {
+		rcd->rcvwait_to = 0;
+		rcd->piowait_to = 0;
+		rcd->rcvnowait = 0;
+		rcd->pionowait = 0;
+	}
+	if (rcd->flag)
+		rcd->flag = 0;
+
+	if (dd->kregbase) {
+		/* atomically clear receive enable ctxt and intr avail. */
+		dd->f_rcvctrl(rcd->ppd, QIB_RCVCTRL_CTXT_DIS |
+				  QIB_RCVCTRL_INTRAVAIL_DIS, ctxt);
+
+		/* clean up the pkeys for this ctxt user */
+		qib_clean_part_key(rcd, dd);
+		qib_disarm_piobufs(dd, rcd->pio_base, rcd->piocnt);
+		qib_chg_pioavailkernel(dd, rcd->pio_base,
+				       rcd->piocnt, TXCHK_CHG_TYPE_KERN, NULL);
+
+		dd->f_clear_tids(dd, rcd);
+
+		if (dd->pageshadow)
+			unlock_expected_tids(rcd);
+		qib_stats.sps_ctxts--;
+		dd->freectxts++;
+	}
+
+	mutex_unlock(&qib_mutex);
+	qib_free_ctxtdata(dd, rcd); /* after releasing the mutex */
+
+bail:
+	kfree(fd);
+	return ret;
+}
+
+static int qib_ctxt_info(struct file *fp, struct qib_ctxt_info __user *uinfo)
+{
+	struct qib_ctxt_info info;
+	int ret;
+	size_t sz;
+	struct qib_ctxtdata *rcd = ctxt_fp(fp);
+	struct qib_filedata *fd;
+
+	fd = fp->private_data;
+
+	info.num_active = qib_count_active_units();
+	info.unit = rcd->dd->unit;
+	info.port = rcd->ppd->port;
+	info.ctxt = rcd->ctxt;
+	info.subctxt =  subctxt_fp(fp);
+	/* Number of user ctxts available for this device. */
+	info.num_ctxts = rcd->dd->cfgctxts - rcd->dd->first_user_ctxt;
+	info.num_subctxts = rcd->subctxt_cnt;
+	info.rec_cpu = fd->rec_cpu_num;
+	sz = sizeof(info);
+
+	if (copy_to_user(uinfo, &info, sz)) {
+		ret = -EFAULT;
+		goto bail;
+	}
+	ret = 0;
+
+bail:
+	return ret;
+}
+
+static int qib_sdma_get_inflight(struct qib_user_sdma_queue *pq,
+				 u32 __user *inflightp)
+{
+	const u32 val = qib_user_sdma_inflight_counter(pq);
+
+	if (put_user(val, inflightp))
+		return -EFAULT;
+
+	return 0;
+}
+
+static int qib_sdma_get_complete(struct qib_pportdata *ppd,
+				 struct qib_user_sdma_queue *pq,
+				 u32 __user *completep)
+{
+	u32 val;
+	int err;
+
+	if (!pq)
+		return -EINVAL;
+
+	err = qib_user_sdma_make_progress(ppd, pq);
+	if (err < 0)
+		return err;
+
+	val = qib_user_sdma_complete_counter(pq);
+	if (put_user(val, completep))
+		return -EFAULT;
+
+	return 0;
+}
+
+static int disarm_req_delay(struct qib_ctxtdata *rcd)
+{
+	int ret = 0;
+
+	if (!usable(rcd->ppd)) {
+		int i;
+		/*
+		 * if link is down, or otherwise not usable, delay
+		 * the caller up to 30 seconds, so we don't thrash
+		 * in trying to get the chip back to ACTIVE, and
+		 * set flag so they make the call again.
+		 */
+		if (rcd->user_event_mask) {
+			/*
+			 * subctxt_cnt is 0 if not shared, so do base
+			 * separately, first, then remaining subctxt, if any
+			 */
+			set_bit(_QIB_EVENT_DISARM_BUFS_BIT,
+				&rcd->user_event_mask[0]);
+			for (i = 1; i < rcd->subctxt_cnt; i++)
+				set_bit(_QIB_EVENT_DISARM_BUFS_BIT,
+					&rcd->user_event_mask[i]);
+		}
+		for (i = 0; !usable(rcd->ppd) && i < 300; i++)
+			msleep(100);
+		ret = -ENETDOWN;
+	}
+	return ret;
+}
+
+/*
+ * Find all user contexts in use, and set the specified bit in their
+ * event mask.
+ * See also find_ctxt() for a similar use, that is specific to send buffers.
+ */
+int qib_set_uevent_bits(struct qib_pportdata *ppd, const int evtbit)
+{
+	struct qib_ctxtdata *rcd;
+	unsigned ctxt;
+	int ret = 0;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ppd->dd->uctxt_lock, flags);
+	for (ctxt = ppd->dd->first_user_ctxt; ctxt < ppd->dd->cfgctxts;
+	     ctxt++) {
+		rcd = ppd->dd->rcd[ctxt];
+		if (!rcd)
+			continue;
+		if (rcd->user_event_mask) {
+			int i;
+			/*
+			 * subctxt_cnt is 0 if not shared, so do base
+			 * separately, first, then remaining subctxt, if any
+			 */
+			set_bit(evtbit, &rcd->user_event_mask[0]);
+			for (i = 1; i < rcd->subctxt_cnt; i++)
+				set_bit(evtbit, &rcd->user_event_mask[i]);
+		}
+		ret = 1;
+		break;
+	}
+	spin_unlock_irqrestore(&ppd->dd->uctxt_lock, flags);
+
+	return ret;
+}
+
+/*
+ * clear the event notifier events for this context.
+ * For the DISARM_BUFS case, we also take action (this obsoletes
+ * the older QIB_CMD_DISARM_BUFS, but we keep it for backwards
+ * compatibility.
+ * Other bits don't currently require actions, just atomically clear.
+ * User process then performs actions appropriate to bit having been
+ * set, if desired, and checks again in future.
+ */
+static int qib_user_event_ack(struct qib_ctxtdata *rcd, int subctxt,
+			      unsigned long events)
+{
+	int ret = 0, i;
+
+	for (i = 0; i <= _QIB_MAX_EVENT_BIT; i++) {
+		if (!test_bit(i, &events))
+			continue;
+		if (i == _QIB_EVENT_DISARM_BUFS_BIT) {
+			(void)qib_disarm_piobufs_ifneeded(rcd);
+			ret = disarm_req_delay(rcd);
+		} else
+			clear_bit(i, &rcd->user_event_mask[subctxt]);
+	}
+	return ret;
+}
+
+static ssize_t qib_write(struct file *fp, const char __user *data,
+			 size_t count, loff_t *off)
+{
+	const struct qib_cmd __user *ucmd;
+	struct qib_ctxtdata *rcd;
+	const void __user *src;
+	size_t consumed, copy = 0;
+	struct qib_cmd cmd;
+	ssize_t ret = 0;
+	void *dest;
+
+	if (count < sizeof(cmd.type)) {
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	ucmd = (const struct qib_cmd __user *) data;
+
+	if (copy_from_user(&cmd.type, &ucmd->type, sizeof(cmd.type))) {
+		ret = -EFAULT;
+		goto bail;
+	}
+
+	consumed = sizeof(cmd.type);
+
+	switch (cmd.type) {
+	case QIB_CMD_ASSIGN_CTXT:
+	case QIB_CMD_USER_INIT:
+		copy = sizeof(cmd.cmd.user_info);
+		dest = &cmd.cmd.user_info;
+		src = &ucmd->cmd.user_info;
+		break;
+
+	case QIB_CMD_RECV_CTRL:
+		copy = sizeof(cmd.cmd.recv_ctrl);
+		dest = &cmd.cmd.recv_ctrl;
+		src = &ucmd->cmd.recv_ctrl;
+		break;
+
+	case QIB_CMD_CTXT_INFO:
+		copy = sizeof(cmd.cmd.ctxt_info);
+		dest = &cmd.cmd.ctxt_info;
+		src = &ucmd->cmd.ctxt_info;
+		break;
+
+	case QIB_CMD_TID_UPDATE:
+	case QIB_CMD_TID_FREE:
+		copy = sizeof(cmd.cmd.tid_info);
+		dest = &cmd.cmd.tid_info;
+		src = &ucmd->cmd.tid_info;
+		break;
+
+	case QIB_CMD_SET_PART_KEY:
+		copy = sizeof(cmd.cmd.part_key);
+		dest = &cmd.cmd.part_key;
+		src = &ucmd->cmd.part_key;
+		break;
+
+	case QIB_CMD_DISARM_BUFS:
+	case QIB_CMD_PIOAVAILUPD: /* force an update of PIOAvail reg */
+		copy = 0;
+		src = NULL;
+		dest = NULL;
+		break;
+
+	case QIB_CMD_POLL_TYPE:
+		copy = sizeof(cmd.cmd.poll_type);
+		dest = &cmd.cmd.poll_type;
+		src = &ucmd->cmd.poll_type;
+		break;
+
+	case QIB_CMD_ARMLAUNCH_CTRL:
+		copy = sizeof(cmd.cmd.armlaunch_ctrl);
+		dest = &cmd.cmd.armlaunch_ctrl;
+		src = &ucmd->cmd.armlaunch_ctrl;
+		break;
+
+	case QIB_CMD_SDMA_INFLIGHT:
+		copy = sizeof(cmd.cmd.sdma_inflight);
+		dest = &cmd.cmd.sdma_inflight;
+		src = &ucmd->cmd.sdma_inflight;
+		break;
+
+	case QIB_CMD_SDMA_COMPLETE:
+		copy = sizeof(cmd.cmd.sdma_complete);
+		dest = &cmd.cmd.sdma_complete;
+		src = &ucmd->cmd.sdma_complete;
+		break;
+
+	case QIB_CMD_ACK_EVENT:
+		copy = sizeof(cmd.cmd.event_mask);
+		dest = &cmd.cmd.event_mask;
+		src = &ucmd->cmd.event_mask;
+		break;
+
+	default:
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	if (copy) {
+		if ((count - consumed) < copy) {
+			ret = -EINVAL;
+			goto bail;
+		}
+		if (copy_from_user(dest, src, copy)) {
+			ret = -EFAULT;
+			goto bail;
+		}
+		consumed += copy;
+	}
+
+	rcd = ctxt_fp(fp);
+	if (!rcd && cmd.type != QIB_CMD_ASSIGN_CTXT) {
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	switch (cmd.type) {
+	case QIB_CMD_ASSIGN_CTXT:
+		ret = qib_assign_ctxt(fp, &cmd.cmd.user_info);
+		if (ret)
+			goto bail;
+		break;
+
+	case QIB_CMD_USER_INIT:
+		ret = qib_do_user_init(fp, &cmd.cmd.user_info);
+		if (ret)
+			goto bail;
+		ret = qib_get_base_info(fp, (void __user *) (unsigned long)
+					cmd.cmd.user_info.spu_base_info,
+					cmd.cmd.user_info.spu_base_info_size);
+		break;
+
+	case QIB_CMD_RECV_CTRL:
+		ret = qib_manage_rcvq(rcd, subctxt_fp(fp), cmd.cmd.recv_ctrl);
+		break;
+
+	case QIB_CMD_CTXT_INFO:
+		ret = qib_ctxt_info(fp, (struct qib_ctxt_info __user *)
+				    (unsigned long) cmd.cmd.ctxt_info);
+		break;
+
+	case QIB_CMD_TID_UPDATE:
+		ret = qib_tid_update(rcd, fp, &cmd.cmd.tid_info);
+		break;
+
+	case QIB_CMD_TID_FREE:
+		ret = qib_tid_free(rcd, subctxt_fp(fp), &cmd.cmd.tid_info);
+		break;
+
+	case QIB_CMD_SET_PART_KEY:
+		ret = qib_set_part_key(rcd, cmd.cmd.part_key);
+		break;
+
+	case QIB_CMD_DISARM_BUFS:
+		(void)qib_disarm_piobufs_ifneeded(rcd);
+		ret = disarm_req_delay(rcd);
+		break;
+
+	case QIB_CMD_PIOAVAILUPD:
+		qib_force_pio_avail_update(rcd->dd);
+		break;
+
+	case QIB_CMD_POLL_TYPE:
+		rcd->poll_type = cmd.cmd.poll_type;
+		break;
+
+	case QIB_CMD_ARMLAUNCH_CTRL:
+		rcd->dd->f_set_armlaunch(rcd->dd, cmd.cmd.armlaunch_ctrl);
+		break;
+
+	case QIB_CMD_SDMA_INFLIGHT:
+		ret = qib_sdma_get_inflight(user_sdma_queue_fp(fp),
+					    (u32 __user *) (unsigned long)
+					    cmd.cmd.sdma_inflight);
+		break;
+
+	case QIB_CMD_SDMA_COMPLETE:
+		ret = qib_sdma_get_complete(rcd->ppd,
+					    user_sdma_queue_fp(fp),
+					    (u32 __user *) (unsigned long)
+					    cmd.cmd.sdma_complete);
+		break;
+
+	case QIB_CMD_ACK_EVENT:
+		ret = qib_user_event_ack(rcd, subctxt_fp(fp),
+					 cmd.cmd.event_mask);
+		break;
+	}
+
+	if (ret >= 0)
+		ret = consumed;
+
+bail:
+	return ret;
+}
+
+static ssize_t qib_aio_write(struct kiocb *iocb, const struct iovec *iov,
+			     unsigned long dim, loff_t off)
+{
+	struct qib_filedata *fp = iocb->ki_filp->private_data;
+	struct qib_ctxtdata *rcd = ctxt_fp(iocb->ki_filp);
+	struct qib_user_sdma_queue *pq = fp->pq;
+
+	if (!dim || !pq)
+		return -EINVAL;
+
+	return qib_user_sdma_writev(rcd, pq, iov, dim);
+}
+
+static struct class *qib_class;
+static dev_t qib_dev;
+
+int qib_cdev_init(int minor, const char *name,
+		  const struct file_operations *fops,
+		  struct cdev **cdevp, struct device **devp)
+{
+	const dev_t dev = MKDEV(MAJOR(qib_dev), minor);
+	struct cdev *cdev;
+	struct device *device = NULL;
+	int ret;
+
+	cdev = cdev_alloc();
+	if (!cdev) {
+		pr_err("Could not allocate cdev for minor %d, %s\n",
+		       minor, name);
+		ret = -ENOMEM;
+		goto done;
+	}
+
+	cdev->owner = THIS_MODULE;
+	cdev->ops = fops;
+	kobject_set_name(&cdev->kobj, name);
+
+	ret = cdev_add(cdev, dev, 1);
+	if (ret < 0) {
+		pr_err("Could not add cdev for minor %d, %s (err %d)\n",
+		       minor, name, -ret);
+		goto err_cdev;
+	}
+
+	device = device_create(qib_class, NULL, dev, NULL, "%s", name);
+	if (!IS_ERR(device))
+		goto done;
+	ret = PTR_ERR(device);
+	device = NULL;
+	pr_err("Could not create device for minor %d, %s (err %d)\n",
+	       minor, name, -ret);
+err_cdev:
+	cdev_del(cdev);
+	cdev = NULL;
+done:
+	*cdevp = cdev;
+	*devp = device;
+	return ret;
+}
+
+void qib_cdev_cleanup(struct cdev **cdevp, struct device **devp)
+{
+	struct device *device = *devp;
+
+	if (device) {
+		device_unregister(device);
+		*devp = NULL;
+	}
+
+	if (*cdevp) {
+		cdev_del(*cdevp);
+		*cdevp = NULL;
+	}
+}
+
+static struct cdev *wildcard_cdev;
+static struct device *wildcard_device;
+
+int __init qib_dev_init(void)
+{
+	int ret;
+
+	ret = alloc_chrdev_region(&qib_dev, 0, QIB_NMINORS, QIB_DRV_NAME);
+	if (ret < 0) {
+		pr_err("Could not allocate chrdev region (err %d)\n", -ret);
+		goto done;
+	}
+
+	qib_class = class_create(THIS_MODULE, "ipath");
+	if (IS_ERR(qib_class)) {
+		ret = PTR_ERR(qib_class);
+		pr_err("Could not create device class (err %d)\n", -ret);
+		unregister_chrdev_region(qib_dev, QIB_NMINORS);
+	}
+
+done:
+	return ret;
+}
+
+void qib_dev_cleanup(void)
+{
+	if (qib_class) {
+		class_destroy(qib_class);
+		qib_class = NULL;
+	}
+
+	unregister_chrdev_region(qib_dev, QIB_NMINORS);
+}
+
+static atomic_t user_count = ATOMIC_INIT(0);
+
+static void qib_user_remove(struct qib_devdata *dd)
+{
+	if (atomic_dec_return(&user_count) == 0)
+		qib_cdev_cleanup(&wildcard_cdev, &wildcard_device);
+
+	qib_cdev_cleanup(&dd->user_cdev, &dd->user_device);
+}
+
+static int qib_user_add(struct qib_devdata *dd)
+{
+	char name[10];
+	int ret;
+
+	if (atomic_inc_return(&user_count) == 1) {
+		ret = qib_cdev_init(0, "ipath", &qib_file_ops,
+				    &wildcard_cdev, &wildcard_device);
+		if (ret)
+			goto done;
+	}
+
+	snprintf(name, sizeof(name), "ipath%d", dd->unit);
+	ret = qib_cdev_init(dd->unit + 1, name, &qib_file_ops,
+			    &dd->user_cdev, &dd->user_device);
+	if (ret)
+		qib_user_remove(dd);
+done:
+	return ret;
+}
+
+/*
+ * Create per-unit files in /dev
+ */
+int qib_device_create(struct qib_devdata *dd)
+{
+	int r, ret;
+
+	r = qib_user_add(dd);
+	ret = qib_diag_add(dd);
+	if (r && !ret)
+		ret = r;
+	return ret;
+}
+
+/*
+ * Remove per-unit files in /dev
+ * void, core kernel returns no errors for this stuff
+ */
+void qib_device_remove(struct qib_devdata *dd)
+{
+	qib_user_remove(dd);
+	qib_diag_remove(dd);
+}
diff --git a/drivers/infiniband/hw/qib/qib_fs.c b/drivers/infiniband/hw/qib/qib_fs.c
new file mode 100644
index 0000000..8185458
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_fs.c
@@ -0,0 +1,620 @@
+/*
+ * Copyright (c) 2012 Intel Corporation. All rights reserved.
+ * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/pagemap.h>
+#include <linux/init.h>
+#include <linux/namei.h>
+
+#include "qib.h"
+
+#define QIBFS_MAGIC 0x726a77
+
+static struct super_block *qib_super;
+
+#define private2dd(file) (file_inode(file)->i_private)
+
+static int qibfs_mknod(struct inode *dir, struct dentry *dentry,
+		       umode_t mode, const struct file_operations *fops,
+		       void *data)
+{
+	int error;
+	struct inode *inode = new_inode(dir->i_sb);
+
+	if (!inode) {
+		error = -EPERM;
+		goto bail;
+	}
+
+	inode->i_ino = get_next_ino();
+	inode->i_mode = mode;
+	inode->i_uid = GLOBAL_ROOT_UID;
+	inode->i_gid = GLOBAL_ROOT_GID;
+	inode->i_blocks = 0;
+	inode->i_atime = CURRENT_TIME;
+	inode->i_mtime = inode->i_atime;
+	inode->i_ctime = inode->i_atime;
+	inode->i_private = data;
+	if (S_ISDIR(mode)) {
+		inode->i_op = &simple_dir_inode_operations;
+		inc_nlink(inode);
+		inc_nlink(dir);
+	}
+
+	inode->i_fop = fops;
+
+	d_instantiate(dentry, inode);
+	error = 0;
+
+bail:
+	return error;
+}
+
+static int create_file(const char *name, umode_t mode,
+		       struct dentry *parent, struct dentry **dentry,
+		       const struct file_operations *fops, void *data)
+{
+	int error;
+
+	mutex_lock(&parent->d_inode->i_mutex);
+	*dentry = lookup_one_len(name, parent, strlen(name));
+	if (!IS_ERR(*dentry))
+		error = qibfs_mknod(parent->d_inode, *dentry,
+				    mode, fops, data);
+	else
+		error = PTR_ERR(*dentry);
+	mutex_unlock(&parent->d_inode->i_mutex);
+
+	return error;
+}
+
+static ssize_t driver_stats_read(struct file *file, char __user *buf,
+				 size_t count, loff_t *ppos)
+{
+	qib_stats.sps_ints = qib_sps_ints();
+	return simple_read_from_buffer(buf, count, ppos, &qib_stats,
+				       sizeof qib_stats);
+}
+
+/*
+ * driver stats field names, one line per stat, single string.  Used by
+ * programs like ipathstats to print the stats in a way which works for
+ * different versions of drivers, without changing program source.
+ * if qlogic_ib_stats changes, this needs to change.  Names need to be
+ * 12 chars or less (w/o newline), for proper display by ipathstats utility.
+ */
+static const char qib_statnames[] =
+	"KernIntr\n"
+	"ErrorIntr\n"
+	"Tx_Errs\n"
+	"Rcv_Errs\n"
+	"H/W_Errs\n"
+	"NoPIOBufs\n"
+	"CtxtsOpen\n"
+	"RcvLen_Errs\n"
+	"EgrBufFull\n"
+	"EgrHdrFull\n"
+	;
+
+static ssize_t driver_names_read(struct file *file, char __user *buf,
+				 size_t count, loff_t *ppos)
+{
+	return simple_read_from_buffer(buf, count, ppos, qib_statnames,
+		sizeof qib_statnames - 1); /* no null */
+}
+
+static const struct file_operations driver_ops[] = {
+	{ .read = driver_stats_read, .llseek = generic_file_llseek, },
+	{ .read = driver_names_read, .llseek = generic_file_llseek, },
+};
+
+/* read the per-device counters */
+static ssize_t dev_counters_read(struct file *file, char __user *buf,
+				 size_t count, loff_t *ppos)
+{
+	u64 *counters;
+	size_t avail;
+	struct qib_devdata *dd = private2dd(file);
+
+	avail = dd->f_read_cntrs(dd, *ppos, NULL, &counters);
+	return simple_read_from_buffer(buf, count, ppos, counters, avail);
+}
+
+/* read the per-device counters */
+static ssize_t dev_names_read(struct file *file, char __user *buf,
+			      size_t count, loff_t *ppos)
+{
+	char *names;
+	size_t avail;
+	struct qib_devdata *dd = private2dd(file);
+
+	avail = dd->f_read_cntrs(dd, *ppos, &names, NULL);
+	return simple_read_from_buffer(buf, count, ppos, names, avail);
+}
+
+static const struct file_operations cntr_ops[] = {
+	{ .read = dev_counters_read, .llseek = generic_file_llseek, },
+	{ .read = dev_names_read, .llseek = generic_file_llseek, },
+};
+
+/*
+ * Could use file_inode(file)->i_ino to figure out which file,
+ * instead of separate routine for each, but for now, this works...
+ */
+
+/* read the per-port names (same for each port) */
+static ssize_t portnames_read(struct file *file, char __user *buf,
+			      size_t count, loff_t *ppos)
+{
+	char *names;
+	size_t avail;
+	struct qib_devdata *dd = private2dd(file);
+
+	avail = dd->f_read_portcntrs(dd, *ppos, 0, &names, NULL);
+	return simple_read_from_buffer(buf, count, ppos, names, avail);
+}
+
+/* read the per-port counters for port 1 (pidx 0) */
+static ssize_t portcntrs_1_read(struct file *file, char __user *buf,
+				size_t count, loff_t *ppos)
+{
+	u64 *counters;
+	size_t avail;
+	struct qib_devdata *dd = private2dd(file);
+
+	avail = dd->f_read_portcntrs(dd, *ppos, 0, NULL, &counters);
+	return simple_read_from_buffer(buf, count, ppos, counters, avail);
+}
+
+/* read the per-port counters for port 2 (pidx 1) */
+static ssize_t portcntrs_2_read(struct file *file, char __user *buf,
+				size_t count, loff_t *ppos)
+{
+	u64 *counters;
+	size_t avail;
+	struct qib_devdata *dd = private2dd(file);
+
+	avail = dd->f_read_portcntrs(dd, *ppos, 1, NULL, &counters);
+	return simple_read_from_buffer(buf, count, ppos, counters, avail);
+}
+
+static const struct file_operations portcntr_ops[] = {
+	{ .read = portnames_read, .llseek = generic_file_llseek, },
+	{ .read = portcntrs_1_read, .llseek = generic_file_llseek, },
+	{ .read = portcntrs_2_read, .llseek = generic_file_llseek, },
+};
+
+/*
+ * read the per-port QSFP data for port 1 (pidx 0)
+ */
+static ssize_t qsfp_1_read(struct file *file, char __user *buf,
+			   size_t count, loff_t *ppos)
+{
+	struct qib_devdata *dd = private2dd(file);
+	char *tmp;
+	int ret;
+
+	tmp = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!tmp)
+		return -ENOMEM;
+
+	ret = qib_qsfp_dump(dd->pport, tmp, PAGE_SIZE);
+	if (ret > 0)
+		ret = simple_read_from_buffer(buf, count, ppos, tmp, ret);
+	kfree(tmp);
+	return ret;
+}
+
+/*
+ * read the per-port QSFP data for port 2 (pidx 1)
+ */
+static ssize_t qsfp_2_read(struct file *file, char __user *buf,
+			   size_t count, loff_t *ppos)
+{
+	struct qib_devdata *dd = private2dd(file);
+	char *tmp;
+	int ret;
+
+	if (dd->num_pports < 2)
+		return -ENODEV;
+
+	tmp = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!tmp)
+		return -ENOMEM;
+
+	ret = qib_qsfp_dump(dd->pport + 1, tmp, PAGE_SIZE);
+	if (ret > 0)
+		ret = simple_read_from_buffer(buf, count, ppos, tmp, ret);
+	kfree(tmp);
+	return ret;
+}
+
+static const struct file_operations qsfp_ops[] = {
+	{ .read = qsfp_1_read, .llseek = generic_file_llseek, },
+	{ .read = qsfp_2_read, .llseek = generic_file_llseek, },
+};
+
+static ssize_t flash_read(struct file *file, char __user *buf,
+			  size_t count, loff_t *ppos)
+{
+	struct qib_devdata *dd;
+	ssize_t ret;
+	loff_t pos;
+	char *tmp;
+
+	pos = *ppos;
+
+	if (pos < 0) {
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	if (pos >= sizeof(struct qib_flash)) {
+		ret = 0;
+		goto bail;
+	}
+
+	if (count > sizeof(struct qib_flash) - pos)
+		count = sizeof(struct qib_flash) - pos;
+
+	tmp = kmalloc(count, GFP_KERNEL);
+	if (!tmp) {
+		ret = -ENOMEM;
+		goto bail;
+	}
+
+	dd = private2dd(file);
+	if (qib_eeprom_read(dd, pos, tmp, count)) {
+		qib_dev_err(dd, "failed to read from flash\n");
+		ret = -ENXIO;
+		goto bail_tmp;
+	}
+
+	if (copy_to_user(buf, tmp, count)) {
+		ret = -EFAULT;
+		goto bail_tmp;
+	}
+
+	*ppos = pos + count;
+	ret = count;
+
+bail_tmp:
+	kfree(tmp);
+
+bail:
+	return ret;
+}
+
+static ssize_t flash_write(struct file *file, const char __user *buf,
+			   size_t count, loff_t *ppos)
+{
+	struct qib_devdata *dd;
+	ssize_t ret;
+	loff_t pos;
+	char *tmp;
+
+	pos = *ppos;
+
+	if (pos != 0) {
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	if (count != sizeof(struct qib_flash)) {
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	tmp = kmalloc(count, GFP_KERNEL);
+	if (!tmp) {
+		ret = -ENOMEM;
+		goto bail;
+	}
+
+	if (copy_from_user(tmp, buf, count)) {
+		ret = -EFAULT;
+		goto bail_tmp;
+	}
+
+	dd = private2dd(file);
+	if (qib_eeprom_write(dd, pos, tmp, count)) {
+		ret = -ENXIO;
+		qib_dev_err(dd, "failed to write to flash\n");
+		goto bail_tmp;
+	}
+
+	*ppos = pos + count;
+	ret = count;
+
+bail_tmp:
+	kfree(tmp);
+
+bail:
+	return ret;
+}
+
+static const struct file_operations flash_ops = {
+	.read = flash_read,
+	.write = flash_write,
+	.llseek = default_llseek,
+};
+
+static int add_cntr_files(struct super_block *sb, struct qib_devdata *dd)
+{
+	struct dentry *dir, *tmp;
+	char unit[10];
+	int ret, i;
+
+	/* create the per-unit directory */
+	snprintf(unit, sizeof unit, "%u", dd->unit);
+	ret = create_file(unit, S_IFDIR|S_IRUGO|S_IXUGO, sb->s_root, &dir,
+			  &simple_dir_operations, dd);
+	if (ret) {
+		pr_err("create_file(%s) failed: %d\n", unit, ret);
+		goto bail;
+	}
+
+	/* create the files in the new directory */
+	ret = create_file("counters", S_IFREG|S_IRUGO, dir, &tmp,
+			  &cntr_ops[0], dd);
+	if (ret) {
+		pr_err("create_file(%s/counters) failed: %d\n",
+		       unit, ret);
+		goto bail;
+	}
+	ret = create_file("counter_names", S_IFREG|S_IRUGO, dir, &tmp,
+			  &cntr_ops[1], dd);
+	if (ret) {
+		pr_err("create_file(%s/counter_names) failed: %d\n",
+		       unit, ret);
+		goto bail;
+	}
+	ret = create_file("portcounter_names", S_IFREG|S_IRUGO, dir, &tmp,
+			  &portcntr_ops[0], dd);
+	if (ret) {
+		pr_err("create_file(%s/%s) failed: %d\n",
+		       unit, "portcounter_names", ret);
+		goto bail;
+	}
+	for (i = 1; i <= dd->num_pports; i++) {
+		char fname[24];
+
+		sprintf(fname, "port%dcounters", i);
+		/* create the files in the new directory */
+		ret = create_file(fname, S_IFREG|S_IRUGO, dir, &tmp,
+				  &portcntr_ops[i], dd);
+		if (ret) {
+			pr_err("create_file(%s/%s) failed: %d\n",
+				unit, fname, ret);
+			goto bail;
+		}
+		if (!(dd->flags & QIB_HAS_QSFP))
+			continue;
+		sprintf(fname, "qsfp%d", i);
+		ret = create_file(fname, S_IFREG|S_IRUGO, dir, &tmp,
+				  &qsfp_ops[i - 1], dd);
+		if (ret) {
+			pr_err("create_file(%s/%s) failed: %d\n",
+				unit, fname, ret);
+			goto bail;
+		}
+	}
+
+	ret = create_file("flash", S_IFREG|S_IWUSR|S_IRUGO, dir, &tmp,
+			  &flash_ops, dd);
+	if (ret)
+		pr_err("create_file(%s/flash) failed: %d\n",
+			unit, ret);
+bail:
+	return ret;
+}
+
+static int remove_file(struct dentry *parent, char *name)
+{
+	struct dentry *tmp;
+	int ret;
+
+	tmp = lookup_one_len(name, parent, strlen(name));
+
+	if (IS_ERR(tmp)) {
+		ret = PTR_ERR(tmp);
+		goto bail;
+	}
+
+	spin_lock(&tmp->d_lock);
+	if (!(d_unhashed(tmp) && tmp->d_inode)) {
+		__d_drop(tmp);
+		spin_unlock(&tmp->d_lock);
+		simple_unlink(parent->d_inode, tmp);
+	} else {
+		spin_unlock(&tmp->d_lock);
+	}
+	dput(tmp);
+
+	ret = 0;
+bail:
+	/*
+	 * We don't expect clients to care about the return value, but
+	 * it's there if they need it.
+	 */
+	return ret;
+}
+
+static int remove_device_files(struct super_block *sb,
+			       struct qib_devdata *dd)
+{
+	struct dentry *dir, *root;
+	char unit[10];
+	int ret, i;
+
+	root = dget(sb->s_root);
+	mutex_lock(&root->d_inode->i_mutex);
+	snprintf(unit, sizeof unit, "%u", dd->unit);
+	dir = lookup_one_len(unit, root, strlen(unit));
+
+	if (IS_ERR(dir)) {
+		ret = PTR_ERR(dir);
+		pr_err("Lookup of %s failed\n", unit);
+		goto bail;
+	}
+
+	mutex_lock(&dir->d_inode->i_mutex);
+	remove_file(dir, "counters");
+	remove_file(dir, "counter_names");
+	remove_file(dir, "portcounter_names");
+	for (i = 0; i < dd->num_pports; i++) {
+		char fname[24];
+
+		sprintf(fname, "port%dcounters", i + 1);
+		remove_file(dir, fname);
+		if (dd->flags & QIB_HAS_QSFP) {
+			sprintf(fname, "qsfp%d", i + 1);
+			remove_file(dir, fname);
+		}
+	}
+	remove_file(dir, "flash");
+	mutex_unlock(&dir->d_inode->i_mutex);
+	ret = simple_rmdir(root->d_inode, dir);
+	d_delete(dir);
+	dput(dir);
+
+bail:
+	mutex_unlock(&root->d_inode->i_mutex);
+	dput(root);
+	return ret;
+}
+
+/*
+ * This fills everything in when the fs is mounted, to handle umount/mount
+ * after device init.  The direct add_cntr_files() call handles adding
+ * them from the init code, when the fs is already mounted.
+ */
+static int qibfs_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct qib_devdata *dd, *tmp;
+	unsigned long flags;
+	int ret;
+
+	static struct tree_descr files[] = {
+		[2] = {"driver_stats", &driver_ops[0], S_IRUGO},
+		[3] = {"driver_stats_names", &driver_ops[1], S_IRUGO},
+		{""},
+	};
+
+	ret = simple_fill_super(sb, QIBFS_MAGIC, files);
+	if (ret) {
+		pr_err("simple_fill_super failed: %d\n", ret);
+		goto bail;
+	}
+
+	spin_lock_irqsave(&qib_devs_lock, flags);
+
+	list_for_each_entry_safe(dd, tmp, &qib_dev_list, list) {
+		spin_unlock_irqrestore(&qib_devs_lock, flags);
+		ret = add_cntr_files(sb, dd);
+		if (ret)
+			goto bail;
+		spin_lock_irqsave(&qib_devs_lock, flags);
+	}
+
+	spin_unlock_irqrestore(&qib_devs_lock, flags);
+
+bail:
+	return ret;
+}
+
+static struct dentry *qibfs_mount(struct file_system_type *fs_type, int flags,
+			const char *dev_name, void *data)
+{
+	struct dentry *ret;
+	ret = mount_single(fs_type, flags, data, qibfs_fill_super);
+	if (!IS_ERR(ret))
+		qib_super = ret->d_sb;
+	return ret;
+}
+
+static void qibfs_kill_super(struct super_block *s)
+{
+	kill_litter_super(s);
+	qib_super = NULL;
+}
+
+int qibfs_add(struct qib_devdata *dd)
+{
+	int ret;
+
+	/*
+	 * On first unit initialized, qib_super will not yet exist
+	 * because nobody has yet tried to mount the filesystem, so
+	 * we can't consider that to be an error; if an error occurs
+	 * during the mount, that will get a complaint, so this is OK.
+	 * add_cntr_files() for all units is done at mount from
+	 * qibfs_fill_super(), so one way or another, everything works.
+	 */
+	if (qib_super == NULL)
+		ret = 0;
+	else
+		ret = add_cntr_files(qib_super, dd);
+	return ret;
+}
+
+int qibfs_remove(struct qib_devdata *dd)
+{
+	int ret = 0;
+
+	if (qib_super)
+		ret = remove_device_files(qib_super, dd);
+
+	return ret;
+}
+
+static struct file_system_type qibfs_fs_type = {
+	.owner =        THIS_MODULE,
+	.name =         "ipathfs",
+	.mount =        qibfs_mount,
+	.kill_sb =      qibfs_kill_super,
+};
+MODULE_ALIAS_FS("ipathfs");
+
+int __init qib_init_qibfs(void)
+{
+	return register_filesystem(&qibfs_fs_type);
+}
+
+int __exit qib_exit_qibfs(void)
+{
+	return unregister_filesystem(&qibfs_fs_type);
+}
diff --git a/drivers/infiniband/hw/qib/qib_iba6120.c b/drivers/infiniband/hw/qib/qib_iba6120.c
new file mode 100644
index 0000000..d68266a
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_iba6120.c
@@ -0,0 +1,3599 @@
+/*
+ * Copyright (c) 2013 Intel Corporation. All rights reserved.
+ * Copyright (c) 2006, 2007, 2008, 2009, 2010 QLogic Corporation.
+ * All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * This file contains all of the code that is specific to the
+ * QLogic_IB 6120 PCIe chip.
+ */
+
+#include <linux/interrupt.h>
+#include <linux/pci.h>
+#include <linux/delay.h>
+#include <rdma/ib_verbs.h>
+
+#include "qib.h"
+#include "qib_6120_regs.h"
+
+static void qib_6120_setup_setextled(struct qib_pportdata *, u32);
+static void sendctrl_6120_mod(struct qib_pportdata *ppd, u32 op);
+static u8 qib_6120_phys_portstate(u64);
+static u32 qib_6120_iblink_state(u64);
+
+/*
+ * This file contains all the chip-specific register information and
+ * access functions for the Intel Intel_IB PCI-Express chip.
+ *
+ */
+
+/* KREG_IDX uses machine-generated #defines */
+#define KREG_IDX(regname) (QIB_6120_##regname##_OFFS / sizeof(u64))
+
+/* Use defines to tie machine-generated names to lower-case names */
+#define kr_extctrl KREG_IDX(EXTCtrl)
+#define kr_extstatus KREG_IDX(EXTStatus)
+#define kr_gpio_clear KREG_IDX(GPIOClear)
+#define kr_gpio_mask KREG_IDX(GPIOMask)
+#define kr_gpio_out KREG_IDX(GPIOOut)
+#define kr_gpio_status KREG_IDX(GPIOStatus)
+#define kr_rcvctrl KREG_IDX(RcvCtrl)
+#define kr_sendctrl KREG_IDX(SendCtrl)
+#define kr_partitionkey KREG_IDX(RcvPartitionKey)
+#define kr_hwdiagctrl KREG_IDX(HwDiagCtrl)
+#define kr_ibcstatus KREG_IDX(IBCStatus)
+#define kr_ibcctrl KREG_IDX(IBCCtrl)
+#define kr_sendbuffererror KREG_IDX(SendBufErr0)
+#define kr_rcvbthqp KREG_IDX(RcvBTHQP)
+#define kr_counterregbase KREG_IDX(CntrRegBase)
+#define kr_palign KREG_IDX(PageAlign)
+#define kr_rcvegrbase KREG_IDX(RcvEgrBase)
+#define kr_rcvegrcnt KREG_IDX(RcvEgrCnt)
+#define kr_rcvhdrcnt KREG_IDX(RcvHdrCnt)
+#define kr_rcvhdrentsize KREG_IDX(RcvHdrEntSize)
+#define kr_rcvhdrsize KREG_IDX(RcvHdrSize)
+#define kr_rcvtidbase KREG_IDX(RcvTIDBase)
+#define kr_rcvtidcnt KREG_IDX(RcvTIDCnt)
+#define kr_scratch KREG_IDX(Scratch)
+#define kr_sendctrl KREG_IDX(SendCtrl)
+#define kr_sendpioavailaddr KREG_IDX(SendPIOAvailAddr)
+#define kr_sendpiobufbase KREG_IDX(SendPIOBufBase)
+#define kr_sendpiobufcnt KREG_IDX(SendPIOBufCnt)
+#define kr_sendpiosize KREG_IDX(SendPIOSize)
+#define kr_sendregbase KREG_IDX(SendRegBase)
+#define kr_userregbase KREG_IDX(UserRegBase)
+#define kr_control KREG_IDX(Control)
+#define kr_intclear KREG_IDX(IntClear)
+#define kr_intmask KREG_IDX(IntMask)
+#define kr_intstatus KREG_IDX(IntStatus)
+#define kr_errclear KREG_IDX(ErrClear)
+#define kr_errmask KREG_IDX(ErrMask)
+#define kr_errstatus KREG_IDX(ErrStatus)
+#define kr_hwerrclear KREG_IDX(HwErrClear)
+#define kr_hwerrmask KREG_IDX(HwErrMask)
+#define kr_hwerrstatus KREG_IDX(HwErrStatus)
+#define kr_revision KREG_IDX(Revision)
+#define kr_portcnt KREG_IDX(PortCnt)
+#define kr_serdes_cfg0 KREG_IDX(SerdesCfg0)
+#define kr_serdes_cfg1 (kr_serdes_cfg0 + 1)
+#define kr_serdes_stat KREG_IDX(SerdesStat)
+#define kr_xgxs_cfg KREG_IDX(XGXSCfg)
+
+/* These must only be written via qib_write_kreg_ctxt() */
+#define kr_rcvhdraddr KREG_IDX(RcvHdrAddr0)
+#define kr_rcvhdrtailaddr KREG_IDX(RcvHdrTailAddr0)
+
+#define CREG_IDX(regname) ((QIB_6120_##regname##_OFFS - \
+			QIB_6120_LBIntCnt_OFFS) / sizeof(u64))
+
+#define cr_badformat CREG_IDX(RxBadFormatCnt)
+#define cr_erricrc CREG_IDX(RxICRCErrCnt)
+#define cr_errlink CREG_IDX(RxLinkProblemCnt)
+#define cr_errlpcrc CREG_IDX(RxLPCRCErrCnt)
+#define cr_errpkey CREG_IDX(RxPKeyMismatchCnt)
+#define cr_rcvflowctrl_err CREG_IDX(RxFlowCtrlErrCnt)
+#define cr_err_rlen CREG_IDX(RxLenErrCnt)
+#define cr_errslen CREG_IDX(TxLenErrCnt)
+#define cr_errtidfull CREG_IDX(RxTIDFullErrCnt)
+#define cr_errtidvalid CREG_IDX(RxTIDValidErrCnt)
+#define cr_errvcrc CREG_IDX(RxVCRCErrCnt)
+#define cr_ibstatuschange CREG_IDX(IBStatusChangeCnt)
+#define cr_lbint CREG_IDX(LBIntCnt)
+#define cr_invalidrlen CREG_IDX(RxMaxMinLenErrCnt)
+#define cr_invalidslen CREG_IDX(TxMaxMinLenErrCnt)
+#define cr_lbflowstall CREG_IDX(LBFlowStallCnt)
+#define cr_pktrcv CREG_IDX(RxDataPktCnt)
+#define cr_pktrcvflowctrl CREG_IDX(RxFlowPktCnt)
+#define cr_pktsend CREG_IDX(TxDataPktCnt)
+#define cr_pktsendflow CREG_IDX(TxFlowPktCnt)
+#define cr_portovfl CREG_IDX(RxP0HdrEgrOvflCnt)
+#define cr_rcvebp CREG_IDX(RxEBPCnt)
+#define cr_rcvovfl CREG_IDX(RxBufOvflCnt)
+#define cr_senddropped CREG_IDX(TxDroppedPktCnt)
+#define cr_sendstall CREG_IDX(TxFlowStallCnt)
+#define cr_sendunderrun CREG_IDX(TxUnderrunCnt)
+#define cr_wordrcv CREG_IDX(RxDwordCnt)
+#define cr_wordsend CREG_IDX(TxDwordCnt)
+#define cr_txunsupvl CREG_IDX(TxUnsupVLErrCnt)
+#define cr_rxdroppkt CREG_IDX(RxDroppedPktCnt)
+#define cr_iblinkerrrecov CREG_IDX(IBLinkErrRecoveryCnt)
+#define cr_iblinkdown CREG_IDX(IBLinkDownedCnt)
+#define cr_ibsymbolerr CREG_IDX(IBSymbolErrCnt)
+
+#define SYM_RMASK(regname, fldname) ((u64)              \
+	QIB_6120_##regname##_##fldname##_RMASK)
+#define SYM_MASK(regname, fldname) ((u64)               \
+	QIB_6120_##regname##_##fldname##_RMASK <<       \
+	 QIB_6120_##regname##_##fldname##_LSB)
+#define SYM_LSB(regname, fldname) (QIB_6120_##regname##_##fldname##_LSB)
+
+#define SYM_FIELD(value, regname, fldname) ((u64) \
+	(((value) >> SYM_LSB(regname, fldname)) & \
+	 SYM_RMASK(regname, fldname)))
+#define ERR_MASK(fldname) SYM_MASK(ErrMask, fldname##Mask)
+#define HWE_MASK(fldname) SYM_MASK(HwErrMask, fldname##Mask)
+
+/* link training states, from IBC */
+#define IB_6120_LT_STATE_DISABLED        0x00
+#define IB_6120_LT_STATE_LINKUP          0x01
+#define IB_6120_LT_STATE_POLLACTIVE      0x02
+#define IB_6120_LT_STATE_POLLQUIET       0x03
+#define IB_6120_LT_STATE_SLEEPDELAY      0x04
+#define IB_6120_LT_STATE_SLEEPQUIET      0x05
+#define IB_6120_LT_STATE_CFGDEBOUNCE     0x08
+#define IB_6120_LT_STATE_CFGRCVFCFG      0x09
+#define IB_6120_LT_STATE_CFGWAITRMT      0x0a
+#define IB_6120_LT_STATE_CFGIDLE 0x0b
+#define IB_6120_LT_STATE_RECOVERRETRAIN  0x0c
+#define IB_6120_LT_STATE_RECOVERWAITRMT  0x0e
+#define IB_6120_LT_STATE_RECOVERIDLE     0x0f
+
+/* link state machine states from IBC */
+#define IB_6120_L_STATE_DOWN             0x0
+#define IB_6120_L_STATE_INIT             0x1
+#define IB_6120_L_STATE_ARM              0x2
+#define IB_6120_L_STATE_ACTIVE           0x3
+#define IB_6120_L_STATE_ACT_DEFER        0x4
+
+static const u8 qib_6120_physportstate[0x20] = {
+	[IB_6120_LT_STATE_DISABLED] = IB_PHYSPORTSTATE_DISABLED,
+	[IB_6120_LT_STATE_LINKUP] = IB_PHYSPORTSTATE_LINKUP,
+	[IB_6120_LT_STATE_POLLACTIVE] = IB_PHYSPORTSTATE_POLL,
+	[IB_6120_LT_STATE_POLLQUIET] = IB_PHYSPORTSTATE_POLL,
+	[IB_6120_LT_STATE_SLEEPDELAY] = IB_PHYSPORTSTATE_SLEEP,
+	[IB_6120_LT_STATE_SLEEPQUIET] = IB_PHYSPORTSTATE_SLEEP,
+	[IB_6120_LT_STATE_CFGDEBOUNCE] =
+		IB_PHYSPORTSTATE_CFG_TRAIN,
+	[IB_6120_LT_STATE_CFGRCVFCFG] =
+		IB_PHYSPORTSTATE_CFG_TRAIN,
+	[IB_6120_LT_STATE_CFGWAITRMT] =
+		IB_PHYSPORTSTATE_CFG_TRAIN,
+	[IB_6120_LT_STATE_CFGIDLE] = IB_PHYSPORTSTATE_CFG_TRAIN,
+	[IB_6120_LT_STATE_RECOVERRETRAIN] =
+		IB_PHYSPORTSTATE_LINK_ERR_RECOVER,
+	[IB_6120_LT_STATE_RECOVERWAITRMT] =
+		IB_PHYSPORTSTATE_LINK_ERR_RECOVER,
+	[IB_6120_LT_STATE_RECOVERIDLE] =
+		IB_PHYSPORTSTATE_LINK_ERR_RECOVER,
+	[0x10] = IB_PHYSPORTSTATE_CFG_TRAIN,
+	[0x11] = IB_PHYSPORTSTATE_CFG_TRAIN,
+	[0x12] = IB_PHYSPORTSTATE_CFG_TRAIN,
+	[0x13] = IB_PHYSPORTSTATE_CFG_TRAIN,
+	[0x14] = IB_PHYSPORTSTATE_CFG_TRAIN,
+	[0x15] = IB_PHYSPORTSTATE_CFG_TRAIN,
+	[0x16] = IB_PHYSPORTSTATE_CFG_TRAIN,
+	[0x17] = IB_PHYSPORTSTATE_CFG_TRAIN
+};
+
+
+struct qib_chip_specific {
+	u64 __iomem *cregbase;
+	u64 *cntrs;
+	u64 *portcntrs;
+	void *dummy_hdrq;   /* used after ctxt close */
+	dma_addr_t dummy_hdrq_phys;
+	spinlock_t kernel_tid_lock; /* no back to back kernel TID writes */
+	spinlock_t user_tid_lock; /* no back to back user TID writes */
+	spinlock_t rcvmod_lock; /* protect rcvctrl shadow changes */
+	spinlock_t gpio_lock; /* RMW of shadows/regs for ExtCtrl and GPIO */
+	u64 hwerrmask;
+	u64 errormask;
+	u64 gpio_out; /* shadow of kr_gpio_out, for rmw ops */
+	u64 gpio_mask; /* shadow the gpio mask register */
+	u64 extctrl; /* shadow the gpio output enable, etc... */
+	/*
+	 * these 5 fields are used to establish deltas for IB symbol
+	 * errors and linkrecovery errors.  They can be reported on
+	 * some chips during link negotiation prior to INIT, and with
+	 * DDR when faking DDR negotiations with non-IBTA switches.
+	 * The chip counters are adjusted at driver unload if there is
+	 * a non-zero delta.
+	 */
+	u64 ibdeltainprog;
+	u64 ibsymdelta;
+	u64 ibsymsnap;
+	u64 iblnkerrdelta;
+	u64 iblnkerrsnap;
+	u64 ibcctrl; /* shadow for kr_ibcctrl */
+	u32 lastlinkrecov; /* link recovery issue */
+	int irq;
+	u32 cntrnamelen;
+	u32 portcntrnamelen;
+	u32 ncntrs;
+	u32 nportcntrs;
+	/* used with gpio interrupts to implement IB counters */
+	u32 rxfc_unsupvl_errs;
+	u32 overrun_thresh_errs;
+	/*
+	 * these count only cases where _successive_ LocalLinkIntegrity
+	 * errors were seen in the receive headers of IB standard packets
+	 */
+	u32 lli_errs;
+	u32 lli_counter;
+	u64 lli_thresh;
+	u64 sword; /* total dwords sent (sample result) */
+	u64 rword; /* total dwords received (sample result) */
+	u64 spkts; /* total packets sent (sample result) */
+	u64 rpkts; /* total packets received (sample result) */
+	u64 xmit_wait; /* # of ticks no data sent (sample result) */
+	struct timer_list pma_timer;
+	char emsgbuf[128];
+	char bitsmsgbuf[64];
+	u8 pma_sample_status;
+};
+
+/* ibcctrl bits */
+#define QLOGIC_IB_IBCC_LINKINITCMD_DISABLE 1
+/* cycle through TS1/TS2 till OK */
+#define QLOGIC_IB_IBCC_LINKINITCMD_POLL 2
+/* wait for TS1, then go on */
+#define QLOGIC_IB_IBCC_LINKINITCMD_SLEEP 3
+#define QLOGIC_IB_IBCC_LINKINITCMD_SHIFT 16
+
+#define QLOGIC_IB_IBCC_LINKCMD_DOWN 1           /* move to 0x11 */
+#define QLOGIC_IB_IBCC_LINKCMD_ARMED 2          /* move to 0x21 */
+#define QLOGIC_IB_IBCC_LINKCMD_ACTIVE 3 /* move to 0x31 */
+#define QLOGIC_IB_IBCC_LINKCMD_SHIFT 18
+
+/*
+ * We could have a single register get/put routine, that takes a group type,
+ * but this is somewhat clearer and cleaner.  It also gives us some error
+ * checking.  64 bit register reads should always work, but are inefficient
+ * on opteron (the northbridge always generates 2 separate HT 32 bit reads),
+ * so we use kreg32 wherever possible.  User register and counter register
+ * reads are always 32 bit reads, so only one form of those routines.
+ */
+
+/**
+ * qib_read_ureg32 - read 32-bit virtualized per-context register
+ * @dd: device
+ * @regno: register number
+ * @ctxt: context number
+ *
+ * Return the contents of a register that is virtualized to be per context.
+ * Returns -1 on errors (not distinguishable from valid contents at
+ * runtime; we may add a separate error variable at some point).
+ */
+static inline u32 qib_read_ureg32(const struct qib_devdata *dd,
+				  enum qib_ureg regno, int ctxt)
+{
+	if (!dd->kregbase || !(dd->flags & QIB_PRESENT))
+		return 0;
+
+	if (dd->userbase)
+		return readl(regno + (u64 __iomem *)
+			     ((char __iomem *)dd->userbase +
+			      dd->ureg_align * ctxt));
+	else
+		return readl(regno + (u64 __iomem *)
+			     (dd->uregbase +
+			      (char __iomem *)dd->kregbase +
+			      dd->ureg_align * ctxt));
+}
+
+/**
+ * qib_write_ureg - write 32-bit virtualized per-context register
+ * @dd: device
+ * @regno: register number
+ * @value: value
+ * @ctxt: context
+ *
+ * Write the contents of a register that is virtualized to be per context.
+ */
+static inline void qib_write_ureg(const struct qib_devdata *dd,
+				  enum qib_ureg regno, u64 value, int ctxt)
+{
+	u64 __iomem *ubase;
+	if (dd->userbase)
+		ubase = (u64 __iomem *)
+			((char __iomem *) dd->userbase +
+			 dd->ureg_align * ctxt);
+	else
+		ubase = (u64 __iomem *)
+			(dd->uregbase +
+			 (char __iomem *) dd->kregbase +
+			 dd->ureg_align * ctxt);
+
+	if (dd->kregbase && (dd->flags & QIB_PRESENT))
+		writeq(value, &ubase[regno]);
+}
+
+static inline u32 qib_read_kreg32(const struct qib_devdata *dd,
+				  const u16 regno)
+{
+	if (!dd->kregbase || !(dd->flags & QIB_PRESENT))
+		return -1;
+	return readl((u32 __iomem *)&dd->kregbase[regno]);
+}
+
+static inline u64 qib_read_kreg64(const struct qib_devdata *dd,
+				  const u16 regno)
+{
+	if (!dd->kregbase || !(dd->flags & QIB_PRESENT))
+		return -1;
+
+	return readq(&dd->kregbase[regno]);
+}
+
+static inline void qib_write_kreg(const struct qib_devdata *dd,
+				  const u16 regno, u64 value)
+{
+	if (dd->kregbase && (dd->flags & QIB_PRESENT))
+		writeq(value, &dd->kregbase[regno]);
+}
+
+/**
+ * qib_write_kreg_ctxt - write a device's per-ctxt 64-bit kernel register
+ * @dd: the qlogic_ib device
+ * @regno: the register number to write
+ * @ctxt: the context containing the register
+ * @value: the value to write
+ */
+static inline void qib_write_kreg_ctxt(const struct qib_devdata *dd,
+				       const u16 regno, unsigned ctxt,
+				       u64 value)
+{
+	qib_write_kreg(dd, regno + ctxt, value);
+}
+
+static inline void write_6120_creg(const struct qib_devdata *dd,
+				   u16 regno, u64 value)
+{
+	if (dd->cspec->cregbase && (dd->flags & QIB_PRESENT))
+		writeq(value, &dd->cspec->cregbase[regno]);
+}
+
+static inline u64 read_6120_creg(const struct qib_devdata *dd, u16 regno)
+{
+	if (!dd->cspec->cregbase || !(dd->flags & QIB_PRESENT))
+		return 0;
+	return readq(&dd->cspec->cregbase[regno]);
+}
+
+static inline u32 read_6120_creg32(const struct qib_devdata *dd, u16 regno)
+{
+	if (!dd->cspec->cregbase || !(dd->flags & QIB_PRESENT))
+		return 0;
+	return readl(&dd->cspec->cregbase[regno]);
+}
+
+/* kr_control bits */
+#define QLOGIC_IB_C_RESET 1U
+
+/* kr_intstatus, kr_intclear, kr_intmask bits */
+#define QLOGIC_IB_I_RCVURG_MASK ((1U << 5) - 1)
+#define QLOGIC_IB_I_RCVURG_SHIFT 0
+#define QLOGIC_IB_I_RCVAVAIL_MASK ((1U << 5) - 1)
+#define QLOGIC_IB_I_RCVAVAIL_SHIFT 12
+
+#define QLOGIC_IB_C_FREEZEMODE 0x00000002
+#define QLOGIC_IB_C_LINKENABLE 0x00000004
+#define QLOGIC_IB_I_ERROR               0x0000000080000000ULL
+#define QLOGIC_IB_I_SPIOSENT            0x0000000040000000ULL
+#define QLOGIC_IB_I_SPIOBUFAVAIL        0x0000000020000000ULL
+#define QLOGIC_IB_I_GPIO                0x0000000010000000ULL
+#define QLOGIC_IB_I_BITSEXTANT \
+		((QLOGIC_IB_I_RCVURG_MASK << QLOGIC_IB_I_RCVURG_SHIFT) | \
+		(QLOGIC_IB_I_RCVAVAIL_MASK << \
+		 QLOGIC_IB_I_RCVAVAIL_SHIFT) | \
+		QLOGIC_IB_I_ERROR | QLOGIC_IB_I_SPIOSENT | \
+		QLOGIC_IB_I_SPIOBUFAVAIL | QLOGIC_IB_I_GPIO)
+
+/* kr_hwerrclear, kr_hwerrmask, kr_hwerrstatus, bits */
+#define QLOGIC_IB_HWE_PCIEMEMPARITYERR_MASK  0x000000000000003fULL
+#define QLOGIC_IB_HWE_PCIEMEMPARITYERR_SHIFT 0
+#define QLOGIC_IB_HWE_PCIEPOISONEDTLP      0x0000000010000000ULL
+#define QLOGIC_IB_HWE_PCIECPLTIMEOUT       0x0000000020000000ULL
+#define QLOGIC_IB_HWE_PCIEBUSPARITYXTLH    0x0000000040000000ULL
+#define QLOGIC_IB_HWE_PCIEBUSPARITYXADM    0x0000000080000000ULL
+#define QLOGIC_IB_HWE_PCIEBUSPARITYRADM    0x0000000100000000ULL
+#define QLOGIC_IB_HWE_COREPLL_FBSLIP       0x0080000000000000ULL
+#define QLOGIC_IB_HWE_COREPLL_RFSLIP       0x0100000000000000ULL
+#define QLOGIC_IB_HWE_PCIE1PLLFAILED       0x0400000000000000ULL
+#define QLOGIC_IB_HWE_PCIE0PLLFAILED       0x0800000000000000ULL
+#define QLOGIC_IB_HWE_SERDESPLLFAILED      0x1000000000000000ULL
+
+
+/* kr_extstatus bits */
+#define QLOGIC_IB_EXTS_FREQSEL 0x2
+#define QLOGIC_IB_EXTS_SERDESSEL 0x4
+#define QLOGIC_IB_EXTS_MEMBIST_ENDTEST     0x0000000000004000
+#define QLOGIC_IB_EXTS_MEMBIST_FOUND       0x0000000000008000
+
+/* kr_xgxsconfig bits */
+#define QLOGIC_IB_XGXS_RESET          0x5ULL
+
+#define _QIB_GPIO_SDA_NUM 1
+#define _QIB_GPIO_SCL_NUM 0
+
+/* Bits in GPIO for the added IB link interrupts */
+#define GPIO_RXUVL_BIT 3
+#define GPIO_OVRUN_BIT 4
+#define GPIO_LLI_BIT 5
+#define GPIO_ERRINTR_MASK 0x38
+
+
+#define QLOGIC_IB_RT_BUFSIZE_MASK 0xe0000000ULL
+#define QLOGIC_IB_RT_BUFSIZE_SHIFTVAL(tid) \
+	((((tid) & QLOGIC_IB_RT_BUFSIZE_MASK) >> 29) + 11 - 1)
+#define QLOGIC_IB_RT_BUFSIZE(tid) (1 << QLOGIC_IB_RT_BUFSIZE_SHIFTVAL(tid))
+#define QLOGIC_IB_RT_IS_VALID(tid) \
+	(((tid) & QLOGIC_IB_RT_BUFSIZE_MASK) && \
+	 ((((tid) & QLOGIC_IB_RT_BUFSIZE_MASK) != QLOGIC_IB_RT_BUFSIZE_MASK)))
+#define QLOGIC_IB_RT_ADDR_MASK 0x1FFFFFFFULL /* 29 bits valid */
+#define QLOGIC_IB_RT_ADDR_SHIFT 10
+
+#define QLOGIC_IB_R_INTRAVAIL_SHIFT 16
+#define QLOGIC_IB_R_TAILUPD_SHIFT 31
+#define IBA6120_R_PKEY_DIS_SHIFT 30
+
+#define PBC_6120_VL15_SEND_CTRL (1ULL << 31) /* pbc; VL15; link_buf only */
+
+#define IBCBUSFRSPCPARITYERR HWE_MASK(IBCBusFromSPCParityErr)
+#define IBCBUSTOSPCPARITYERR HWE_MASK(IBCBusToSPCParityErr)
+
+#define SYM_MASK_BIT(regname, fldname, bit) ((u64) \
+	((1ULL << (SYM_LSB(regname, fldname) + (bit)))))
+
+#define TXEMEMPARITYERR_PIOBUF \
+	SYM_MASK_BIT(HwErrMask, TXEMemParityErrMask, 0)
+#define TXEMEMPARITYERR_PIOPBC \
+	SYM_MASK_BIT(HwErrMask, TXEMemParityErrMask, 1)
+#define TXEMEMPARITYERR_PIOLAUNCHFIFO \
+	SYM_MASK_BIT(HwErrMask, TXEMemParityErrMask, 2)
+
+#define RXEMEMPARITYERR_RCVBUF \
+	SYM_MASK_BIT(HwErrMask, RXEMemParityErrMask, 0)
+#define RXEMEMPARITYERR_LOOKUPQ \
+	SYM_MASK_BIT(HwErrMask, RXEMemParityErrMask, 1)
+#define RXEMEMPARITYERR_EXPTID \
+	SYM_MASK_BIT(HwErrMask, RXEMemParityErrMask, 2)
+#define RXEMEMPARITYERR_EAGERTID \
+	SYM_MASK_BIT(HwErrMask, RXEMemParityErrMask, 3)
+#define RXEMEMPARITYERR_FLAGBUF \
+	SYM_MASK_BIT(HwErrMask, RXEMemParityErrMask, 4)
+#define RXEMEMPARITYERR_DATAINFO \
+	SYM_MASK_BIT(HwErrMask, RXEMemParityErrMask, 5)
+#define RXEMEMPARITYERR_HDRINFO \
+	SYM_MASK_BIT(HwErrMask, RXEMemParityErrMask, 6)
+
+/* 6120 specific hardware errors... */
+static const struct qib_hwerror_msgs qib_6120_hwerror_msgs[] = {
+	/* generic hardware errors */
+	QLOGIC_IB_HWE_MSG(IBCBUSFRSPCPARITYERR, "QIB2IB Parity"),
+	QLOGIC_IB_HWE_MSG(IBCBUSTOSPCPARITYERR, "IB2QIB Parity"),
+
+	QLOGIC_IB_HWE_MSG(TXEMEMPARITYERR_PIOBUF,
+			  "TXE PIOBUF Memory Parity"),
+	QLOGIC_IB_HWE_MSG(TXEMEMPARITYERR_PIOPBC,
+			  "TXE PIOPBC Memory Parity"),
+	QLOGIC_IB_HWE_MSG(TXEMEMPARITYERR_PIOLAUNCHFIFO,
+			  "TXE PIOLAUNCHFIFO Memory Parity"),
+
+	QLOGIC_IB_HWE_MSG(RXEMEMPARITYERR_RCVBUF,
+			  "RXE RCVBUF Memory Parity"),
+	QLOGIC_IB_HWE_MSG(RXEMEMPARITYERR_LOOKUPQ,
+			  "RXE LOOKUPQ Memory Parity"),
+	QLOGIC_IB_HWE_MSG(RXEMEMPARITYERR_EAGERTID,
+			  "RXE EAGERTID Memory Parity"),
+	QLOGIC_IB_HWE_MSG(RXEMEMPARITYERR_EXPTID,
+			  "RXE EXPTID Memory Parity"),
+	QLOGIC_IB_HWE_MSG(RXEMEMPARITYERR_FLAGBUF,
+			  "RXE FLAGBUF Memory Parity"),
+	QLOGIC_IB_HWE_MSG(RXEMEMPARITYERR_DATAINFO,
+			  "RXE DATAINFO Memory Parity"),
+	QLOGIC_IB_HWE_MSG(RXEMEMPARITYERR_HDRINFO,
+			  "RXE HDRINFO Memory Parity"),
+
+	/* chip-specific hardware errors */
+	QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIEPOISONEDTLP,
+			  "PCIe Poisoned TLP"),
+	QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIECPLTIMEOUT,
+			  "PCIe completion timeout"),
+	/*
+	 * In practice, it's unlikely wthat we'll see PCIe PLL, or bus
+	 * parity or memory parity error failures, because most likely we
+	 * won't be able to talk to the core of the chip.  Nonetheless, we
+	 * might see them, if they are in parts of the PCIe core that aren't
+	 * essential.
+	 */
+	QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIE1PLLFAILED,
+			  "PCIePLL1"),
+	QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIE0PLLFAILED,
+			  "PCIePLL0"),
+	QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIEBUSPARITYXTLH,
+			  "PCIe XTLH core parity"),
+	QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIEBUSPARITYXADM,
+			  "PCIe ADM TX core parity"),
+	QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIEBUSPARITYRADM,
+			  "PCIe ADM RX core parity"),
+	QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_SERDESPLLFAILED,
+			  "SerDes PLL"),
+};
+
+#define TXE_PIO_PARITY (TXEMEMPARITYERR_PIOBUF | TXEMEMPARITYERR_PIOPBC)
+#define _QIB_PLL_FAIL (QLOGIC_IB_HWE_COREPLL_FBSLIP |   \
+		QLOGIC_IB_HWE_COREPLL_RFSLIP)
+
+	/* variables for sanity checking interrupt and errors */
+#define IB_HWE_BITSEXTANT \
+	(HWE_MASK(RXEMemParityErr) |					\
+	 HWE_MASK(TXEMemParityErr) |					\
+	 (QLOGIC_IB_HWE_PCIEMEMPARITYERR_MASK <<			\
+	  QLOGIC_IB_HWE_PCIEMEMPARITYERR_SHIFT) |			\
+	 QLOGIC_IB_HWE_PCIE1PLLFAILED |					\
+	 QLOGIC_IB_HWE_PCIE0PLLFAILED |					\
+	 QLOGIC_IB_HWE_PCIEPOISONEDTLP |				\
+	 QLOGIC_IB_HWE_PCIECPLTIMEOUT |					\
+	 QLOGIC_IB_HWE_PCIEBUSPARITYXTLH |				\
+	 QLOGIC_IB_HWE_PCIEBUSPARITYXADM |				\
+	 QLOGIC_IB_HWE_PCIEBUSPARITYRADM |				\
+	 HWE_MASK(PowerOnBISTFailed) |					\
+	 QLOGIC_IB_HWE_COREPLL_FBSLIP |					\
+	 QLOGIC_IB_HWE_COREPLL_RFSLIP |					\
+	 QLOGIC_IB_HWE_SERDESPLLFAILED |				\
+	 HWE_MASK(IBCBusToSPCParityErr) |				\
+	 HWE_MASK(IBCBusFromSPCParityErr))
+
+#define IB_E_BITSEXTANT \
+	(ERR_MASK(RcvFormatErr) | ERR_MASK(RcvVCRCErr) |		\
+	 ERR_MASK(RcvICRCErr) | ERR_MASK(RcvMinPktLenErr) |		\
+	 ERR_MASK(RcvMaxPktLenErr) | ERR_MASK(RcvLongPktLenErr) |	\
+	 ERR_MASK(RcvShortPktLenErr) | ERR_MASK(RcvUnexpectedCharErr) | \
+	 ERR_MASK(RcvUnsupportedVLErr) | ERR_MASK(RcvEBPErr) |		\
+	 ERR_MASK(RcvIBFlowErr) | ERR_MASK(RcvBadVersionErr) |		\
+	 ERR_MASK(RcvEgrFullErr) | ERR_MASK(RcvHdrFullErr) |		\
+	 ERR_MASK(RcvBadTidErr) | ERR_MASK(RcvHdrLenErr) |		\
+	 ERR_MASK(RcvHdrErr) | ERR_MASK(RcvIBLostLinkErr) |		\
+	 ERR_MASK(SendMinPktLenErr) | ERR_MASK(SendMaxPktLenErr) |	\
+	 ERR_MASK(SendUnderRunErr) | ERR_MASK(SendPktLenErr) |		\
+	 ERR_MASK(SendDroppedSmpPktErr) |				\
+	 ERR_MASK(SendDroppedDataPktErr) |				\
+	 ERR_MASK(SendPioArmLaunchErr) |				\
+	 ERR_MASK(SendUnexpectedPktNumErr) |				\
+	 ERR_MASK(SendUnsupportedVLErr) | ERR_MASK(IBStatusChanged) |	\
+	 ERR_MASK(InvalidAddrErr) | ERR_MASK(ResetNegated) |		\
+	 ERR_MASK(HardwareErr))
+
+#define QLOGIC_IB_E_PKTERRS ( \
+		ERR_MASK(SendPktLenErr) |				\
+		ERR_MASK(SendDroppedDataPktErr) |			\
+		ERR_MASK(RcvVCRCErr) |					\
+		ERR_MASK(RcvICRCErr) |					\
+		ERR_MASK(RcvShortPktLenErr) |				\
+		ERR_MASK(RcvEBPErr))
+
+/* These are all rcv-related errors which we want to count for stats */
+#define E_SUM_PKTERRS						\
+	(ERR_MASK(RcvHdrLenErr) | ERR_MASK(RcvBadTidErr) |		\
+	 ERR_MASK(RcvBadVersionErr) | ERR_MASK(RcvHdrErr) |		\
+	 ERR_MASK(RcvLongPktLenErr) | ERR_MASK(RcvShortPktLenErr) |	\
+	 ERR_MASK(RcvMaxPktLenErr) | ERR_MASK(RcvMinPktLenErr) |	\
+	 ERR_MASK(RcvFormatErr) | ERR_MASK(RcvUnsupportedVLErr) |	\
+	 ERR_MASK(RcvUnexpectedCharErr) | ERR_MASK(RcvEBPErr))
+
+/* These are all send-related errors which we want to count for stats */
+#define E_SUM_ERRS							\
+	(ERR_MASK(SendPioArmLaunchErr) |				\
+	 ERR_MASK(SendUnexpectedPktNumErr) |				\
+	 ERR_MASK(SendDroppedDataPktErr) |				\
+	 ERR_MASK(SendDroppedSmpPktErr) |				\
+	 ERR_MASK(SendMaxPktLenErr) | ERR_MASK(SendUnsupportedVLErr) |	\
+	 ERR_MASK(SendMinPktLenErr) | ERR_MASK(SendPktLenErr) |		\
+	 ERR_MASK(InvalidAddrErr))
+
+/*
+ * this is similar to E_SUM_ERRS, but can't ignore armlaunch, don't ignore
+ * errors not related to freeze and cancelling buffers.  Can't ignore
+ * armlaunch because could get more while still cleaning up, and need
+ * to cancel those as they happen.
+ */
+#define E_SPKT_ERRS_IGNORE \
+	(ERR_MASK(SendDroppedDataPktErr) |				\
+	 ERR_MASK(SendDroppedSmpPktErr) |				\
+	 ERR_MASK(SendMaxPktLenErr) | ERR_MASK(SendMinPktLenErr) |	\
+	 ERR_MASK(SendPktLenErr))
+
+/*
+ * these are errors that can occur when the link changes state while
+ * a packet is being sent or received.  This doesn't cover things
+ * like EBP or VCRC that can be the result of a sending having the
+ * link change state, so we receive a "known bad" packet.
+ */
+#define E_SUM_LINK_PKTERRS		\
+	(ERR_MASK(SendDroppedDataPktErr) |				\
+	 ERR_MASK(SendDroppedSmpPktErr) |				\
+	 ERR_MASK(SendMinPktLenErr) | ERR_MASK(SendPktLenErr) |		\
+	 ERR_MASK(RcvShortPktLenErr) | ERR_MASK(RcvMinPktLenErr) |	\
+	 ERR_MASK(RcvUnexpectedCharErr))
+
+static void qib_6120_put_tid_2(struct qib_devdata *, u64 __iomem *,
+			       u32, unsigned long);
+
+/*
+ * On platforms using this chip, and not having ordered WC stores, we
+ * can get TXE parity errors due to speculative reads to the PIO buffers,
+ * and this, due to a chip issue can result in (many) false parity error
+ * reports.  So it's a debug print on those, and an info print on systems
+ * where the speculative reads don't occur.
+ */
+static void qib_6120_txe_recover(struct qib_devdata *dd)
+{
+	if (!qib_unordered_wc())
+		qib_devinfo(dd->pcidev,
+			    "Recovering from TXE PIO parity error\n");
+}
+
+/* enable/disable chip from delivering interrupts */
+static void qib_6120_set_intr_state(struct qib_devdata *dd, u32 enable)
+{
+	if (enable) {
+		if (dd->flags & QIB_BADINTR)
+			return;
+		qib_write_kreg(dd, kr_intmask, ~0ULL);
+		/* force re-interrupt of any pending interrupts. */
+		qib_write_kreg(dd, kr_intclear, 0ULL);
+	} else
+		qib_write_kreg(dd, kr_intmask, 0ULL);
+}
+
+/*
+ * Try to cleanup as much as possible for anything that might have gone
+ * wrong while in freeze mode, such as pio buffers being written by user
+ * processes (causing armlaunch), send errors due to going into freeze mode,
+ * etc., and try to avoid causing extra interrupts while doing so.
+ * Forcibly update the in-memory pioavail register copies after cleanup
+ * because the chip won't do it while in freeze mode (the register values
+ * themselves are kept correct).
+ * Make sure that we don't lose any important interrupts by using the chip
+ * feature that says that writing 0 to a bit in *clear that is set in
+ * *status will cause an interrupt to be generated again (if allowed by
+ * the *mask value).
+ * This is in chip-specific code because of all of the register accesses,
+ * even though the details are similar on most chips
+ */
+static void qib_6120_clear_freeze(struct qib_devdata *dd)
+{
+	/* disable error interrupts, to avoid confusion */
+	qib_write_kreg(dd, kr_errmask, 0ULL);
+
+	/* also disable interrupts; errormask is sometimes overwriten */
+	qib_6120_set_intr_state(dd, 0);
+
+	qib_cancel_sends(dd->pport);
+
+	/* clear the freeze, and be sure chip saw it */
+	qib_write_kreg(dd, kr_control, dd->control);
+	qib_read_kreg32(dd, kr_scratch);
+
+	/* force in-memory update now we are out of freeze */
+	qib_force_pio_avail_update(dd);
+
+	/*
+	 * force new interrupt if any hwerr, error or interrupt bits are
+	 * still set, and clear "safe" send packet errors related to freeze
+	 * and cancelling sends.  Re-enable error interrupts before possible
+	 * force of re-interrupt on pending interrupts.
+	 */
+	qib_write_kreg(dd, kr_hwerrclear, 0ULL);
+	qib_write_kreg(dd, kr_errclear, E_SPKT_ERRS_IGNORE);
+	qib_write_kreg(dd, kr_errmask, dd->cspec->errormask);
+	qib_6120_set_intr_state(dd, 1);
+}
+
+/**
+ * qib_handle_6120_hwerrors - display hardware errors.
+ * @dd: the qlogic_ib device
+ * @msg: the output buffer
+ * @msgl: the size of the output buffer
+ *
+ * Use same msg buffer as regular errors to avoid excessive stack
+ * use.  Most hardware errors are catastrophic, but for right now,
+ * we'll print them and continue.  Reuse the same message buffer as
+ * handle_6120_errors() to avoid excessive stack usage.
+ */
+static void qib_handle_6120_hwerrors(struct qib_devdata *dd, char *msg,
+				     size_t msgl)
+{
+	u64 hwerrs;
+	u32 bits, ctrl;
+	int isfatal = 0;
+	char *bitsmsg;
+	int log_idx;
+
+	hwerrs = qib_read_kreg64(dd, kr_hwerrstatus);
+	if (!hwerrs)
+		return;
+	if (hwerrs == ~0ULL) {
+		qib_dev_err(dd,
+			"Read of hardware error status failed (all bits set); ignoring\n");
+		return;
+	}
+	qib_stats.sps_hwerrs++;
+
+	/* Always clear the error status register, except MEMBISTFAIL,
+	 * regardless of whether we continue or stop using the chip.
+	 * We want that set so we know it failed, even across driver reload.
+	 * We'll still ignore it in the hwerrmask.  We do this partly for
+	 * diagnostics, but also for support */
+	qib_write_kreg(dd, kr_hwerrclear,
+		       hwerrs & ~HWE_MASK(PowerOnBISTFailed));
+
+	hwerrs &= dd->cspec->hwerrmask;
+
+	/* We log some errors to EEPROM, check if we have any of those. */
+	for (log_idx = 0; log_idx < QIB_EEP_LOG_CNT; ++log_idx)
+		if (hwerrs & dd->eep_st_masks[log_idx].hwerrs_to_log)
+			qib_inc_eeprom_err(dd, log_idx, 1);
+
+	/*
+	 * Make sure we get this much out, unless told to be quiet,
+	 * or it's occurred within the last 5 seconds.
+	 */
+	if (hwerrs & ~(TXE_PIO_PARITY | RXEMEMPARITYERR_EAGERTID))
+		qib_devinfo(dd->pcidev,
+			"Hardware error: hwerr=0x%llx (cleared)\n",
+			(unsigned long long) hwerrs);
+
+	if (hwerrs & ~IB_HWE_BITSEXTANT)
+		qib_dev_err(dd,
+			"hwerror interrupt with unknown errors %llx set\n",
+			(unsigned long long)(hwerrs & ~IB_HWE_BITSEXTANT));
+
+	ctrl = qib_read_kreg32(dd, kr_control);
+	if ((ctrl & QLOGIC_IB_C_FREEZEMODE) && !dd->diag_client) {
+		/*
+		 * Parity errors in send memory are recoverable,
+		 * just cancel the send (if indicated in * sendbuffererror),
+		 * count the occurrence, unfreeze (if no other handled
+		 * hardware error bits are set), and continue. They can
+		 * occur if a processor speculative read is done to the PIO
+		 * buffer while we are sending a packet, for example.
+		 */
+		if (hwerrs & TXE_PIO_PARITY) {
+			qib_6120_txe_recover(dd);
+			hwerrs &= ~TXE_PIO_PARITY;
+		}
+
+		if (!hwerrs) {
+			static u32 freeze_cnt;
+
+			freeze_cnt++;
+			qib_6120_clear_freeze(dd);
+		} else
+			isfatal = 1;
+	}
+
+	*msg = '\0';
+
+	if (hwerrs & HWE_MASK(PowerOnBISTFailed)) {
+		isfatal = 1;
+		strlcat(msg,
+			"[Memory BIST test failed, InfiniPath hardware unusable]",
+			msgl);
+		/* ignore from now on, so disable until driver reloaded */
+		dd->cspec->hwerrmask &= ~HWE_MASK(PowerOnBISTFailed);
+		qib_write_kreg(dd, kr_hwerrmask, dd->cspec->hwerrmask);
+	}
+
+	qib_format_hwerrors(hwerrs, qib_6120_hwerror_msgs,
+			    ARRAY_SIZE(qib_6120_hwerror_msgs), msg, msgl);
+
+	bitsmsg = dd->cspec->bitsmsgbuf;
+	if (hwerrs & (QLOGIC_IB_HWE_PCIEMEMPARITYERR_MASK <<
+		      QLOGIC_IB_HWE_PCIEMEMPARITYERR_SHIFT)) {
+		bits = (u32) ((hwerrs >>
+			       QLOGIC_IB_HWE_PCIEMEMPARITYERR_SHIFT) &
+			      QLOGIC_IB_HWE_PCIEMEMPARITYERR_MASK);
+		snprintf(bitsmsg, sizeof dd->cspec->bitsmsgbuf,
+			 "[PCIe Mem Parity Errs %x] ", bits);
+		strlcat(msg, bitsmsg, msgl);
+	}
+
+	if (hwerrs & _QIB_PLL_FAIL) {
+		isfatal = 1;
+		snprintf(bitsmsg, sizeof dd->cspec->bitsmsgbuf,
+			 "[PLL failed (%llx), InfiniPath hardware unusable]",
+			 (unsigned long long) hwerrs & _QIB_PLL_FAIL);
+		strlcat(msg, bitsmsg, msgl);
+		/* ignore from now on, so disable until driver reloaded */
+		dd->cspec->hwerrmask &= ~(hwerrs & _QIB_PLL_FAIL);
+		qib_write_kreg(dd, kr_hwerrmask, dd->cspec->hwerrmask);
+	}
+
+	if (hwerrs & QLOGIC_IB_HWE_SERDESPLLFAILED) {
+		/*
+		 * If it occurs, it is left masked since the external
+		 * interface is unused
+		 */
+		dd->cspec->hwerrmask &= ~QLOGIC_IB_HWE_SERDESPLLFAILED;
+		qib_write_kreg(dd, kr_hwerrmask, dd->cspec->hwerrmask);
+	}
+
+	if (hwerrs)
+		/*
+		 * if any set that we aren't ignoring; only
+		 * make the complaint once, in case it's stuck
+		 * or recurring, and we get here multiple
+		 * times.
+		 */
+		qib_dev_err(dd, "%s hardware error\n", msg);
+	else
+		*msg = 0; /* recovered from all of them */
+
+	if (isfatal && !dd->diag_client) {
+		qib_dev_err(dd,
+			"Fatal Hardware Error, no longer usable, SN %.16s\n",
+			dd->serial);
+		/*
+		 * for /sys status file and user programs to print; if no
+		 * trailing brace is copied, we'll know it was truncated.
+		 */
+		if (dd->freezemsg)
+			snprintf(dd->freezemsg, dd->freezelen,
+				 "{%s}", msg);
+		qib_disable_after_error(dd);
+	}
+}
+
+/*
+ * Decode the error status into strings, deciding whether to always
+ * print * it or not depending on "normal packet errors" vs everything
+ * else.   Return 1 if "real" errors, otherwise 0 if only packet
+ * errors, so caller can decide what to print with the string.
+ */
+static int qib_decode_6120_err(struct qib_devdata *dd, char *buf, size_t blen,
+			       u64 err)
+{
+	int iserr = 1;
+
+	*buf = '\0';
+	if (err & QLOGIC_IB_E_PKTERRS) {
+		if (!(err & ~QLOGIC_IB_E_PKTERRS))
+			iserr = 0;
+		if ((err & ERR_MASK(RcvICRCErr)) &&
+		    !(err&(ERR_MASK(RcvVCRCErr)|ERR_MASK(RcvEBPErr))))
+			strlcat(buf, "CRC ", blen);
+		if (!iserr)
+			goto done;
+	}
+	if (err & ERR_MASK(RcvHdrLenErr))
+		strlcat(buf, "rhdrlen ", blen);
+	if (err & ERR_MASK(RcvBadTidErr))
+		strlcat(buf, "rbadtid ", blen);
+	if (err & ERR_MASK(RcvBadVersionErr))
+		strlcat(buf, "rbadversion ", blen);
+	if (err & ERR_MASK(RcvHdrErr))
+		strlcat(buf, "rhdr ", blen);
+	if (err & ERR_MASK(RcvLongPktLenErr))
+		strlcat(buf, "rlongpktlen ", blen);
+	if (err & ERR_MASK(RcvMaxPktLenErr))
+		strlcat(buf, "rmaxpktlen ", blen);
+	if (err & ERR_MASK(RcvMinPktLenErr))
+		strlcat(buf, "rminpktlen ", blen);
+	if (err & ERR_MASK(SendMinPktLenErr))
+		strlcat(buf, "sminpktlen ", blen);
+	if (err & ERR_MASK(RcvFormatErr))
+		strlcat(buf, "rformaterr ", blen);
+	if (err & ERR_MASK(RcvUnsupportedVLErr))
+		strlcat(buf, "runsupvl ", blen);
+	if (err & ERR_MASK(RcvUnexpectedCharErr))
+		strlcat(buf, "runexpchar ", blen);
+	if (err & ERR_MASK(RcvIBFlowErr))
+		strlcat(buf, "ribflow ", blen);
+	if (err & ERR_MASK(SendUnderRunErr))
+		strlcat(buf, "sunderrun ", blen);
+	if (err & ERR_MASK(SendPioArmLaunchErr))
+		strlcat(buf, "spioarmlaunch ", blen);
+	if (err & ERR_MASK(SendUnexpectedPktNumErr))
+		strlcat(buf, "sunexperrpktnum ", blen);
+	if (err & ERR_MASK(SendDroppedSmpPktErr))
+		strlcat(buf, "sdroppedsmppkt ", blen);
+	if (err & ERR_MASK(SendMaxPktLenErr))
+		strlcat(buf, "smaxpktlen ", blen);
+	if (err & ERR_MASK(SendUnsupportedVLErr))
+		strlcat(buf, "sunsupVL ", blen);
+	if (err & ERR_MASK(InvalidAddrErr))
+		strlcat(buf, "invalidaddr ", blen);
+	if (err & ERR_MASK(RcvEgrFullErr))
+		strlcat(buf, "rcvegrfull ", blen);
+	if (err & ERR_MASK(RcvHdrFullErr))
+		strlcat(buf, "rcvhdrfull ", blen);
+	if (err & ERR_MASK(IBStatusChanged))
+		strlcat(buf, "ibcstatuschg ", blen);
+	if (err & ERR_MASK(RcvIBLostLinkErr))
+		strlcat(buf, "riblostlink ", blen);
+	if (err & ERR_MASK(HardwareErr))
+		strlcat(buf, "hardware ", blen);
+	if (err & ERR_MASK(ResetNegated))
+		strlcat(buf, "reset ", blen);
+done:
+	return iserr;
+}
+
+/*
+ * Called when we might have an error that is specific to a particular
+ * PIO buffer, and may need to cancel that buffer, so it can be re-used.
+ */
+static void qib_disarm_6120_senderrbufs(struct qib_pportdata *ppd)
+{
+	unsigned long sbuf[2];
+	struct qib_devdata *dd = ppd->dd;
+
+	/*
+	 * It's possible that sendbuffererror could have bits set; might
+	 * have already done this as a result of hardware error handling.
+	 */
+	sbuf[0] = qib_read_kreg64(dd, kr_sendbuffererror);
+	sbuf[1] = qib_read_kreg64(dd, kr_sendbuffererror + 1);
+
+	if (sbuf[0] || sbuf[1])
+		qib_disarm_piobufs_set(dd, sbuf,
+				       dd->piobcnt2k + dd->piobcnt4k);
+}
+
+static int chk_6120_linkrecovery(struct qib_devdata *dd, u64 ibcs)
+{
+	int ret = 1;
+	u32 ibstate = qib_6120_iblink_state(ibcs);
+	u32 linkrecov = read_6120_creg32(dd, cr_iblinkerrrecov);
+
+	if (linkrecov != dd->cspec->lastlinkrecov) {
+		/* and no more until active again */
+		dd->cspec->lastlinkrecov = 0;
+		qib_set_linkstate(dd->pport, QIB_IB_LINKDOWN);
+		ret = 0;
+	}
+	if (ibstate == IB_PORT_ACTIVE)
+		dd->cspec->lastlinkrecov =
+			read_6120_creg32(dd, cr_iblinkerrrecov);
+	return ret;
+}
+
+static void handle_6120_errors(struct qib_devdata *dd, u64 errs)
+{
+	char *msg;
+	u64 ignore_this_time = 0;
+	u64 iserr = 0;
+	int log_idx;
+	struct qib_pportdata *ppd = dd->pport;
+	u64 mask;
+
+	/* don't report errors that are masked */
+	errs &= dd->cspec->errormask;
+	msg = dd->cspec->emsgbuf;
+
+	/* do these first, they are most important */
+	if (errs & ERR_MASK(HardwareErr))
+		qib_handle_6120_hwerrors(dd, msg, sizeof dd->cspec->emsgbuf);
+	else
+		for (log_idx = 0; log_idx < QIB_EEP_LOG_CNT; ++log_idx)
+			if (errs & dd->eep_st_masks[log_idx].errs_to_log)
+				qib_inc_eeprom_err(dd, log_idx, 1);
+
+	if (errs & ~IB_E_BITSEXTANT)
+		qib_dev_err(dd,
+			"error interrupt with unknown errors %llx set\n",
+			(unsigned long long) (errs & ~IB_E_BITSEXTANT));
+
+	if (errs & E_SUM_ERRS) {
+		qib_disarm_6120_senderrbufs(ppd);
+		if ((errs & E_SUM_LINK_PKTERRS) &&
+		    !(ppd->lflags & QIBL_LINKACTIVE)) {
+			/*
+			 * This can happen when trying to bring the link
+			 * up, but the IB link changes state at the "wrong"
+			 * time. The IB logic then complains that the packet
+			 * isn't valid.  We don't want to confuse people, so
+			 * we just don't print them, except at debug
+			 */
+			ignore_this_time = errs & E_SUM_LINK_PKTERRS;
+		}
+	} else if ((errs & E_SUM_LINK_PKTERRS) &&
+		   !(ppd->lflags & QIBL_LINKACTIVE)) {
+		/*
+		 * This can happen when SMA is trying to bring the link
+		 * up, but the IB link changes state at the "wrong" time.
+		 * The IB logic then complains that the packet isn't
+		 * valid.  We don't want to confuse people, so we just
+		 * don't print them, except at debug
+		 */
+		ignore_this_time = errs & E_SUM_LINK_PKTERRS;
+	}
+
+	qib_write_kreg(dd, kr_errclear, errs);
+
+	errs &= ~ignore_this_time;
+	if (!errs)
+		goto done;
+
+	/*
+	 * The ones we mask off are handled specially below
+	 * or above.
+	 */
+	mask = ERR_MASK(IBStatusChanged) | ERR_MASK(RcvEgrFullErr) |
+		ERR_MASK(RcvHdrFullErr) | ERR_MASK(HardwareErr);
+	qib_decode_6120_err(dd, msg, sizeof dd->cspec->emsgbuf, errs & ~mask);
+
+	if (errs & E_SUM_PKTERRS)
+		qib_stats.sps_rcverrs++;
+	if (errs & E_SUM_ERRS)
+		qib_stats.sps_txerrs++;
+
+	iserr = errs & ~(E_SUM_PKTERRS | QLOGIC_IB_E_PKTERRS);
+
+	if (errs & ERR_MASK(IBStatusChanged)) {
+		u64 ibcs = qib_read_kreg64(dd, kr_ibcstatus);
+		u32 ibstate = qib_6120_iblink_state(ibcs);
+		int handle = 1;
+
+		if (ibstate != IB_PORT_INIT && dd->cspec->lastlinkrecov)
+			handle = chk_6120_linkrecovery(dd, ibcs);
+		/*
+		 * Since going into a recovery state causes the link state
+		 * to go down and since recovery is transitory, it is better
+		 * if we "miss" ever seeing the link training state go into
+		 * recovery (i.e., ignore this transition for link state
+		 * special handling purposes) without updating lastibcstat.
+		 */
+		if (handle && qib_6120_phys_portstate(ibcs) ==
+					    IB_PHYSPORTSTATE_LINK_ERR_RECOVER)
+			handle = 0;
+		if (handle)
+			qib_handle_e_ibstatuschanged(ppd, ibcs);
+	}
+
+	if (errs & ERR_MASK(ResetNegated)) {
+		qib_dev_err(dd,
+			"Got reset, requires re-init (unload and reload driver)\n");
+		dd->flags &= ~QIB_INITTED;  /* needs re-init */
+		/* mark as having had error */
+		*dd->devstatusp |= QIB_STATUS_HWERROR;
+		*dd->pport->statusp &= ~QIB_STATUS_IB_CONF;
+	}
+
+	if (*msg && iserr)
+		qib_dev_porterr(dd, ppd->port, "%s error\n", msg);
+
+	if (ppd->state_wanted & ppd->lflags)
+		wake_up_interruptible(&ppd->state_wait);
+
+	/*
+	 * If there were hdrq or egrfull errors, wake up any processes
+	 * waiting in poll.  We used to try to check which contexts had
+	 * the overflow, but given the cost of that and the chip reads
+	 * to support it, it's better to just wake everybody up if we
+	 * get an overflow; waiters can poll again if it's not them.
+	 */
+	if (errs & (ERR_MASK(RcvEgrFullErr) | ERR_MASK(RcvHdrFullErr))) {
+		qib_handle_urcv(dd, ~0U);
+		if (errs & ERR_MASK(RcvEgrFullErr))
+			qib_stats.sps_buffull++;
+		else
+			qib_stats.sps_hdrfull++;
+	}
+done:
+	return;
+}
+
+/**
+ * qib_6120_init_hwerrors - enable hardware errors
+ * @dd: the qlogic_ib device
+ *
+ * now that we have finished initializing everything that might reasonably
+ * cause a hardware error, and cleared those errors bits as they occur,
+ * we can enable hardware errors in the mask (potentially enabling
+ * freeze mode), and enable hardware errors as errors (along with
+ * everything else) in errormask
+ */
+static void qib_6120_init_hwerrors(struct qib_devdata *dd)
+{
+	u64 val;
+	u64 extsval;
+
+	extsval = qib_read_kreg64(dd, kr_extstatus);
+
+	if (!(extsval & QLOGIC_IB_EXTS_MEMBIST_ENDTEST))
+		qib_dev_err(dd, "MemBIST did not complete!\n");
+
+	/* init so all hwerrors interrupt, and enter freeze, ajdust below */
+	val = ~0ULL;
+	if (dd->minrev < 2) {
+		/*
+		 * Avoid problem with internal interface bus parity
+		 * checking. Fixed in Rev2.
+		 */
+		val &= ~QLOGIC_IB_HWE_PCIEBUSPARITYRADM;
+	}
+	/* avoid some intel cpu's speculative read freeze mode issue */
+	val &= ~TXEMEMPARITYERR_PIOBUF;
+
+	dd->cspec->hwerrmask = val;
+
+	qib_write_kreg(dd, kr_hwerrclear, ~HWE_MASK(PowerOnBISTFailed));
+	qib_write_kreg(dd, kr_hwerrmask, dd->cspec->hwerrmask);
+
+	/* clear all */
+	qib_write_kreg(dd, kr_errclear, ~0ULL);
+	/* enable errors that are masked, at least this first time. */
+	qib_write_kreg(dd, kr_errmask, ~0ULL);
+	dd->cspec->errormask = qib_read_kreg64(dd, kr_errmask);
+	/* clear any interrupts up to this point (ints still not enabled) */
+	qib_write_kreg(dd, kr_intclear, ~0ULL);
+
+	qib_write_kreg(dd, kr_rcvbthqp,
+		       dd->qpn_mask << (QIB_6120_RcvBTHQP_BTHQP_Mask_LSB - 1) |
+		       QIB_KD_QP);
+}
+
+/*
+ * Disable and enable the armlaunch error.  Used for PIO bandwidth testing
+ * on chips that are count-based, rather than trigger-based.  There is no
+ * reference counting, but that's also fine, given the intended use.
+ * Only chip-specific because it's all register accesses
+ */
+static void qib_set_6120_armlaunch(struct qib_devdata *dd, u32 enable)
+{
+	if (enable) {
+		qib_write_kreg(dd, kr_errclear,
+			       ERR_MASK(SendPioArmLaunchErr));
+		dd->cspec->errormask |= ERR_MASK(SendPioArmLaunchErr);
+	} else
+		dd->cspec->errormask &= ~ERR_MASK(SendPioArmLaunchErr);
+	qib_write_kreg(dd, kr_errmask, dd->cspec->errormask);
+}
+
+/*
+ * Formerly took parameter <which> in pre-shifted,
+ * pre-merged form with LinkCmd and LinkInitCmd
+ * together, and assuming the zero was NOP.
+ */
+static void qib_set_ib_6120_lstate(struct qib_pportdata *ppd, u16 linkcmd,
+				   u16 linitcmd)
+{
+	u64 mod_wd;
+	struct qib_devdata *dd = ppd->dd;
+	unsigned long flags;
+
+	if (linitcmd == QLOGIC_IB_IBCC_LINKINITCMD_DISABLE) {
+		/*
+		 * If we are told to disable, note that so link-recovery
+		 * code does not attempt to bring us back up.
+		 */
+		spin_lock_irqsave(&ppd->lflags_lock, flags);
+		ppd->lflags |= QIBL_IB_LINK_DISABLED;
+		spin_unlock_irqrestore(&ppd->lflags_lock, flags);
+	} else if (linitcmd || linkcmd == QLOGIC_IB_IBCC_LINKCMD_DOWN) {
+		/*
+		 * Any other linkinitcmd will lead to LINKDOWN and then
+		 * to INIT (if all is well), so clear flag to let
+		 * link-recovery code attempt to bring us back up.
+		 */
+		spin_lock_irqsave(&ppd->lflags_lock, flags);
+		ppd->lflags &= ~QIBL_IB_LINK_DISABLED;
+		spin_unlock_irqrestore(&ppd->lflags_lock, flags);
+	}
+
+	mod_wd = (linkcmd << QLOGIC_IB_IBCC_LINKCMD_SHIFT) |
+		(linitcmd << QLOGIC_IB_IBCC_LINKINITCMD_SHIFT);
+
+	qib_write_kreg(dd, kr_ibcctrl, dd->cspec->ibcctrl | mod_wd);
+	/* write to chip to prevent back-to-back writes of control reg */
+	qib_write_kreg(dd, kr_scratch, 0);
+}
+
+/**
+ * qib_6120_bringup_serdes - bring up the serdes
+ * @dd: the qlogic_ib device
+ */
+static int qib_6120_bringup_serdes(struct qib_pportdata *ppd)
+{
+	struct qib_devdata *dd = ppd->dd;
+	u64 val, config1, prev_val, hwstat, ibc;
+
+	/* Put IBC in reset, sends disabled */
+	dd->control &= ~QLOGIC_IB_C_LINKENABLE;
+	qib_write_kreg(dd, kr_control, 0ULL);
+
+	dd->cspec->ibdeltainprog = 1;
+	dd->cspec->ibsymsnap = read_6120_creg32(dd, cr_ibsymbolerr);
+	dd->cspec->iblnkerrsnap = read_6120_creg32(dd, cr_iblinkerrrecov);
+
+	/* flowcontrolwatermark is in units of KBytes */
+	ibc = 0x5ULL << SYM_LSB(IBCCtrl, FlowCtrlWaterMark);
+	/*
+	 * How often flowctrl sent.  More or less in usecs; balance against
+	 * watermark value, so that in theory senders always get a flow
+	 * control update in time to not let the IB link go idle.
+	 */
+	ibc |= 0x3ULL << SYM_LSB(IBCCtrl, FlowCtrlPeriod);
+	/* max error tolerance */
+	dd->cspec->lli_thresh = 0xf;
+	ibc |= (u64) dd->cspec->lli_thresh << SYM_LSB(IBCCtrl, PhyerrThreshold);
+	/* use "real" buffer space for */
+	ibc |= 4ULL << SYM_LSB(IBCCtrl, CreditScale);
+	/* IB credit flow control. */
+	ibc |= 0xfULL << SYM_LSB(IBCCtrl, OverrunThreshold);
+	/*
+	 * set initial max size pkt IBC will send, including ICRC; it's the
+	 * PIO buffer size in dwords, less 1; also see qib_set_mtu()
+	 */
+	ibc |= ((u64)(ppd->ibmaxlen >> 2) + 1) << SYM_LSB(IBCCtrl, MaxPktLen);
+	dd->cspec->ibcctrl = ibc; /* without linkcmd or linkinitcmd! */
+
+	/* initially come up waiting for TS1, without sending anything. */
+	val = dd->cspec->ibcctrl | (QLOGIC_IB_IBCC_LINKINITCMD_DISABLE <<
+		QLOGIC_IB_IBCC_LINKINITCMD_SHIFT);
+	qib_write_kreg(dd, kr_ibcctrl, val);
+
+	val = qib_read_kreg64(dd, kr_serdes_cfg0);
+	config1 = qib_read_kreg64(dd, kr_serdes_cfg1);
+
+	/*
+	 * Force reset on, also set rxdetect enable.  Must do before reading
+	 * serdesstatus at least for simulation, or some of the bits in
+	 * serdes status will come back as undefined and cause simulation
+	 * failures
+	 */
+	val |= SYM_MASK(SerdesCfg0, ResetPLL) |
+		SYM_MASK(SerdesCfg0, RxDetEnX) |
+		(SYM_MASK(SerdesCfg0, L1PwrDnA) |
+		 SYM_MASK(SerdesCfg0, L1PwrDnB) |
+		 SYM_MASK(SerdesCfg0, L1PwrDnC) |
+		 SYM_MASK(SerdesCfg0, L1PwrDnD));
+	qib_write_kreg(dd, kr_serdes_cfg0, val);
+	/* be sure chip saw it */
+	qib_read_kreg64(dd, kr_scratch);
+	udelay(5);              /* need pll reset set at least for a bit */
+	/*
+	 * after PLL is reset, set the per-lane Resets and TxIdle and
+	 * clear the PLL reset and rxdetect (to get falling edge).
+	 * Leave L1PWR bits set (permanently)
+	 */
+	val &= ~(SYM_MASK(SerdesCfg0, RxDetEnX) |
+		 SYM_MASK(SerdesCfg0, ResetPLL) |
+		 (SYM_MASK(SerdesCfg0, L1PwrDnA) |
+		  SYM_MASK(SerdesCfg0, L1PwrDnB) |
+		  SYM_MASK(SerdesCfg0, L1PwrDnC) |
+		  SYM_MASK(SerdesCfg0, L1PwrDnD)));
+	val |= (SYM_MASK(SerdesCfg0, ResetA) |
+		SYM_MASK(SerdesCfg0, ResetB) |
+		SYM_MASK(SerdesCfg0, ResetC) |
+		SYM_MASK(SerdesCfg0, ResetD)) |
+		SYM_MASK(SerdesCfg0, TxIdeEnX);
+	qib_write_kreg(dd, kr_serdes_cfg0, val);
+	/* be sure chip saw it */
+	(void) qib_read_kreg64(dd, kr_scratch);
+	/* need PLL reset clear for at least 11 usec before lane
+	 * resets cleared; give it a few more to be sure */
+	udelay(15);
+	val &= ~((SYM_MASK(SerdesCfg0, ResetA) |
+		  SYM_MASK(SerdesCfg0, ResetB) |
+		  SYM_MASK(SerdesCfg0, ResetC) |
+		  SYM_MASK(SerdesCfg0, ResetD)) |
+		 SYM_MASK(SerdesCfg0, TxIdeEnX));
+
+	qib_write_kreg(dd, kr_serdes_cfg0, val);
+	/* be sure chip saw it */
+	(void) qib_read_kreg64(dd, kr_scratch);
+
+	val = qib_read_kreg64(dd, kr_xgxs_cfg);
+	prev_val = val;
+	if (val & QLOGIC_IB_XGXS_RESET)
+		val &= ~QLOGIC_IB_XGXS_RESET;
+	if (SYM_FIELD(val, XGXSCfg, polarity_inv) != ppd->rx_pol_inv) {
+		/* need to compensate for Tx inversion in partner */
+		val &= ~SYM_MASK(XGXSCfg, polarity_inv);
+		val |= (u64)ppd->rx_pol_inv << SYM_LSB(XGXSCfg, polarity_inv);
+	}
+	if (val != prev_val)
+		qib_write_kreg(dd, kr_xgxs_cfg, val);
+
+	val = qib_read_kreg64(dd, kr_serdes_cfg0);
+
+	/* clear current and de-emphasis bits */
+	config1 &= ~0x0ffffffff00ULL;
+	/* set current to 20ma */
+	config1 |= 0x00000000000ULL;
+	/* set de-emphasis to -5.68dB */
+	config1 |= 0x0cccc000000ULL;
+	qib_write_kreg(dd, kr_serdes_cfg1, config1);
+
+	/* base and port guid same for single port */
+	ppd->guid = dd->base_guid;
+
+	/*
+	 * the process of setting and un-resetting the serdes normally
+	 * causes a serdes PLL error, so check for that and clear it
+	 * here.  Also clearr hwerr bit in errstatus, but not others.
+	 */
+	hwstat = qib_read_kreg64(dd, kr_hwerrstatus);
+	if (hwstat) {
+		/* should just have PLL, clear all set, in an case */
+		qib_write_kreg(dd, kr_hwerrclear, hwstat);
+		qib_write_kreg(dd, kr_errclear, ERR_MASK(HardwareErr));
+	}
+
+	dd->control |= QLOGIC_IB_C_LINKENABLE;
+	dd->control &= ~QLOGIC_IB_C_FREEZEMODE;
+	qib_write_kreg(dd, kr_control, dd->control);
+
+	return 0;
+}
+
+/**
+ * qib_6120_quiet_serdes - set serdes to txidle
+ * @ppd: physical port of the qlogic_ib device
+ * Called when driver is being unloaded
+ */
+static void qib_6120_quiet_serdes(struct qib_pportdata *ppd)
+{
+	struct qib_devdata *dd = ppd->dd;
+	u64 val;
+
+	qib_set_ib_6120_lstate(ppd, 0, QLOGIC_IB_IBCC_LINKINITCMD_DISABLE);
+
+	/* disable IBC */
+	dd->control &= ~QLOGIC_IB_C_LINKENABLE;
+	qib_write_kreg(dd, kr_control,
+		       dd->control | QLOGIC_IB_C_FREEZEMODE);
+
+	if (dd->cspec->ibsymdelta || dd->cspec->iblnkerrdelta ||
+	    dd->cspec->ibdeltainprog) {
+		u64 diagc;
+
+		/* enable counter writes */
+		diagc = qib_read_kreg64(dd, kr_hwdiagctrl);
+		qib_write_kreg(dd, kr_hwdiagctrl,
+			       diagc | SYM_MASK(HwDiagCtrl, CounterWrEnable));
+
+		if (dd->cspec->ibsymdelta || dd->cspec->ibdeltainprog) {
+			val = read_6120_creg32(dd, cr_ibsymbolerr);
+			if (dd->cspec->ibdeltainprog)
+				val -= val - dd->cspec->ibsymsnap;
+			val -= dd->cspec->ibsymdelta;
+			write_6120_creg(dd, cr_ibsymbolerr, val);
+		}
+		if (dd->cspec->iblnkerrdelta || dd->cspec->ibdeltainprog) {
+			val = read_6120_creg32(dd, cr_iblinkerrrecov);
+			if (dd->cspec->ibdeltainprog)
+				val -= val - dd->cspec->iblnkerrsnap;
+			val -= dd->cspec->iblnkerrdelta;
+			write_6120_creg(dd, cr_iblinkerrrecov, val);
+		}
+
+		/* and disable counter writes */
+		qib_write_kreg(dd, kr_hwdiagctrl, diagc);
+	}
+
+	val = qib_read_kreg64(dd, kr_serdes_cfg0);
+	val |= SYM_MASK(SerdesCfg0, TxIdeEnX);
+	qib_write_kreg(dd, kr_serdes_cfg0, val);
+}
+
+/**
+ * qib_6120_setup_setextled - set the state of the two external LEDs
+ * @dd: the qlogic_ib device
+ * @on: whether the link is up or not
+ *
+ * The exact combo of LEDs if on is true is determined by looking
+ * at the ibcstatus.
+
+ * These LEDs indicate the physical and logical state of IB link.
+ * For this chip (at least with recommended board pinouts), LED1
+ * is Yellow (logical state) and LED2 is Green (physical state),
+ *
+ * Note:  We try to match the Mellanox HCA LED behavior as best
+ * we can.  Green indicates physical link state is OK (something is
+ * plugged in, and we can train).
+ * Amber indicates the link is logically up (ACTIVE).
+ * Mellanox further blinks the amber LED to indicate data packet
+ * activity, but we have no hardware support for that, so it would
+ * require waking up every 10-20 msecs and checking the counters
+ * on the chip, and then turning the LED off if appropriate.  That's
+ * visible overhead, so not something we will do.
+ *
+ */
+static void qib_6120_setup_setextled(struct qib_pportdata *ppd, u32 on)
+{
+	u64 extctl, val, lst, ltst;
+	unsigned long flags;
+	struct qib_devdata *dd = ppd->dd;
+
+	/*
+	 * The diags use the LED to indicate diag info, so we leave
+	 * the external LED alone when the diags are running.
+	 */
+	if (dd->diag_client)
+		return;
+
+	/* Allow override of LED display for, e.g. Locating system in rack */
+	if (ppd->led_override) {
+		ltst = (ppd->led_override & QIB_LED_PHYS) ?
+			IB_PHYSPORTSTATE_LINKUP : IB_PHYSPORTSTATE_DISABLED,
+		lst = (ppd->led_override & QIB_LED_LOG) ?
+			IB_PORT_ACTIVE : IB_PORT_DOWN;
+	} else if (on) {
+		val = qib_read_kreg64(dd, kr_ibcstatus);
+		ltst = qib_6120_phys_portstate(val);
+		lst = qib_6120_iblink_state(val);
+	} else {
+		ltst = 0;
+		lst = 0;
+	}
+
+	spin_lock_irqsave(&dd->cspec->gpio_lock, flags);
+	extctl = dd->cspec->extctrl & ~(SYM_MASK(EXTCtrl, LEDPriPortGreenOn) |
+				 SYM_MASK(EXTCtrl, LEDPriPortYellowOn));
+
+	if (ltst == IB_PHYSPORTSTATE_LINKUP)
+		extctl |= SYM_MASK(EXTCtrl, LEDPriPortYellowOn);
+	if (lst == IB_PORT_ACTIVE)
+		extctl |= SYM_MASK(EXTCtrl, LEDPriPortGreenOn);
+	dd->cspec->extctrl = extctl;
+	qib_write_kreg(dd, kr_extctrl, extctl);
+	spin_unlock_irqrestore(&dd->cspec->gpio_lock, flags);
+}
+
+static void qib_6120_free_irq(struct qib_devdata *dd)
+{
+	if (dd->cspec->irq) {
+		free_irq(dd->cspec->irq, dd);
+		dd->cspec->irq = 0;
+	}
+	qib_nomsi(dd);
+}
+
+/**
+ * qib_6120_setup_cleanup - clean up any per-chip chip-specific stuff
+ * @dd: the qlogic_ib device
+ *
+ * This is called during driver unload.
+*/
+static void qib_6120_setup_cleanup(struct qib_devdata *dd)
+{
+	qib_6120_free_irq(dd);
+	kfree(dd->cspec->cntrs);
+	kfree(dd->cspec->portcntrs);
+	if (dd->cspec->dummy_hdrq) {
+		dma_free_coherent(&dd->pcidev->dev,
+				  ALIGN(dd->rcvhdrcnt *
+					dd->rcvhdrentsize *
+					sizeof(u32), PAGE_SIZE),
+				  dd->cspec->dummy_hdrq,
+				  dd->cspec->dummy_hdrq_phys);
+		dd->cspec->dummy_hdrq = NULL;
+	}
+}
+
+static void qib_wantpiobuf_6120_intr(struct qib_devdata *dd, u32 needint)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&dd->sendctrl_lock, flags);
+	if (needint)
+		dd->sendctrl |= SYM_MASK(SendCtrl, PIOIntBufAvail);
+	else
+		dd->sendctrl &= ~SYM_MASK(SendCtrl, PIOIntBufAvail);
+	qib_write_kreg(dd, kr_sendctrl, dd->sendctrl);
+	qib_write_kreg(dd, kr_scratch, 0ULL);
+	spin_unlock_irqrestore(&dd->sendctrl_lock, flags);
+}
+
+/*
+ * handle errors and unusual events first, separate function
+ * to improve cache hits for fast path interrupt handling
+ */
+static noinline void unlikely_6120_intr(struct qib_devdata *dd, u64 istat)
+{
+	if (unlikely(istat & ~QLOGIC_IB_I_BITSEXTANT))
+		qib_dev_err(dd, "interrupt with unknown interrupts %Lx set\n",
+			    istat & ~QLOGIC_IB_I_BITSEXTANT);
+
+	if (istat & QLOGIC_IB_I_ERROR) {
+		u64 estat = 0;
+
+		qib_stats.sps_errints++;
+		estat = qib_read_kreg64(dd, kr_errstatus);
+		if (!estat)
+			qib_devinfo(dd->pcidev,
+				"error interrupt (%Lx), but no error bits set!\n",
+				istat);
+		handle_6120_errors(dd, estat);
+	}
+
+	if (istat & QLOGIC_IB_I_GPIO) {
+		u32 gpiostatus;
+		u32 to_clear = 0;
+
+		/*
+		 * GPIO_3..5 on IBA6120 Rev2 chips indicate
+		 * errors that we need to count.
+		 */
+		gpiostatus = qib_read_kreg32(dd, kr_gpio_status);
+		/* First the error-counter case. */
+		if (gpiostatus & GPIO_ERRINTR_MASK) {
+			/* want to clear the bits we see asserted. */
+			to_clear |= (gpiostatus & GPIO_ERRINTR_MASK);
+
+			/*
+			 * Count appropriately, clear bits out of our copy,
+			 * as they have been "handled".
+			 */
+			if (gpiostatus & (1 << GPIO_RXUVL_BIT))
+				dd->cspec->rxfc_unsupvl_errs++;
+			if (gpiostatus & (1 << GPIO_OVRUN_BIT))
+				dd->cspec->overrun_thresh_errs++;
+			if (gpiostatus & (1 << GPIO_LLI_BIT))
+				dd->cspec->lli_errs++;
+			gpiostatus &= ~GPIO_ERRINTR_MASK;
+		}
+		if (gpiostatus) {
+			/*
+			 * Some unexpected bits remain. If they could have
+			 * caused the interrupt, complain and clear.
+			 * To avoid repetition of this condition, also clear
+			 * the mask. It is almost certainly due to error.
+			 */
+			const u32 mask = qib_read_kreg32(dd, kr_gpio_mask);
+
+			/*
+			 * Also check that the chip reflects our shadow,
+			 * and report issues, If they caused the interrupt.
+			 * we will suppress by refreshing from the shadow.
+			 */
+			if (mask & gpiostatus) {
+				to_clear |= (gpiostatus & mask);
+				dd->cspec->gpio_mask &= ~(gpiostatus & mask);
+				qib_write_kreg(dd, kr_gpio_mask,
+					       dd->cspec->gpio_mask);
+			}
+		}
+		if (to_clear)
+			qib_write_kreg(dd, kr_gpio_clear, (u64) to_clear);
+	}
+}
+
+static irqreturn_t qib_6120intr(int irq, void *data)
+{
+	struct qib_devdata *dd = data;
+	irqreturn_t ret;
+	u32 istat, ctxtrbits, rmask, crcs = 0;
+	unsigned i;
+
+	if ((dd->flags & (QIB_PRESENT | QIB_BADINTR)) != QIB_PRESENT) {
+		/*
+		 * This return value is not great, but we do not want the
+		 * interrupt core code to remove our interrupt handler
+		 * because we don't appear to be handling an interrupt
+		 * during a chip reset.
+		 */
+		ret = IRQ_HANDLED;
+		goto bail;
+	}
+
+	istat = qib_read_kreg32(dd, kr_intstatus);
+
+	if (unlikely(!istat)) {
+		ret = IRQ_NONE; /* not our interrupt, or already handled */
+		goto bail;
+	}
+	if (unlikely(istat == -1)) {
+		qib_bad_intrstatus(dd);
+		/* don't know if it was our interrupt or not */
+		ret = IRQ_NONE;
+		goto bail;
+	}
+
+	this_cpu_inc(*dd->int_counter);
+
+	if (unlikely(istat & (~QLOGIC_IB_I_BITSEXTANT |
+			      QLOGIC_IB_I_GPIO | QLOGIC_IB_I_ERROR)))
+		unlikely_6120_intr(dd, istat);
+
+	/*
+	 * Clear the interrupt bits we found set, relatively early, so we
+	 * "know" know the chip will have seen this by the time we process
+	 * the queue, and will re-interrupt if necessary.  The processor
+	 * itself won't take the interrupt again until we return.
+	 */
+	qib_write_kreg(dd, kr_intclear, istat);
+
+	/*
+	 * Handle kernel receive queues before checking for pio buffers
+	 * available since receives can overflow; piobuf waiters can afford
+	 * a few extra cycles, since they were waiting anyway.
+	 */
+	ctxtrbits = istat &
+		((QLOGIC_IB_I_RCVAVAIL_MASK << QLOGIC_IB_I_RCVAVAIL_SHIFT) |
+		 (QLOGIC_IB_I_RCVURG_MASK << QLOGIC_IB_I_RCVURG_SHIFT));
+	if (ctxtrbits) {
+		rmask = (1U << QLOGIC_IB_I_RCVAVAIL_SHIFT) |
+			(1U << QLOGIC_IB_I_RCVURG_SHIFT);
+		for (i = 0; i < dd->first_user_ctxt; i++) {
+			if (ctxtrbits & rmask) {
+				ctxtrbits &= ~rmask;
+				crcs += qib_kreceive(dd->rcd[i],
+						     &dd->cspec->lli_counter,
+						     NULL);
+			}
+			rmask <<= 1;
+		}
+		if (crcs) {
+			u32 cntr = dd->cspec->lli_counter;
+			cntr += crcs;
+			if (cntr) {
+				if (cntr > dd->cspec->lli_thresh) {
+					dd->cspec->lli_counter = 0;
+					dd->cspec->lli_errs++;
+				} else
+					dd->cspec->lli_counter += cntr;
+			}
+		}
+
+
+		if (ctxtrbits) {
+			ctxtrbits =
+				(ctxtrbits >> QLOGIC_IB_I_RCVAVAIL_SHIFT) |
+				(ctxtrbits >> QLOGIC_IB_I_RCVURG_SHIFT);
+			qib_handle_urcv(dd, ctxtrbits);
+		}
+	}
+
+	if ((istat & QLOGIC_IB_I_SPIOBUFAVAIL) && (dd->flags & QIB_INITTED))
+		qib_ib_piobufavail(dd);
+
+	ret = IRQ_HANDLED;
+bail:
+	return ret;
+}
+
+/*
+ * Set up our chip-specific interrupt handler
+ * The interrupt type has already been setup, so
+ * we just need to do the registration and error checking.
+ */
+static void qib_setup_6120_interrupt(struct qib_devdata *dd)
+{
+	/*
+	 * If the chip supports added error indication via GPIO pins,
+	 * enable interrupts on those bits so the interrupt routine
+	 * can count the events. Also set flag so interrupt routine
+	 * can know they are expected.
+	 */
+	if (SYM_FIELD(dd->revision, Revision_R,
+		      ChipRevMinor) > 1) {
+		/* Rev2+ reports extra errors via internal GPIO pins */
+		dd->cspec->gpio_mask |= GPIO_ERRINTR_MASK;
+		qib_write_kreg(dd, kr_gpio_mask, dd->cspec->gpio_mask);
+	}
+
+	if (!dd->cspec->irq)
+		qib_dev_err(dd,
+			"irq is 0, BIOS error?  Interrupts won't work\n");
+	else {
+		int ret;
+		ret = request_irq(dd->cspec->irq, qib_6120intr, 0,
+				  QIB_DRV_NAME, dd);
+		if (ret)
+			qib_dev_err(dd,
+				"Couldn't setup interrupt (irq=%d): %d\n",
+				dd->cspec->irq, ret);
+	}
+}
+
+/**
+ * pe_boardname - fill in the board name
+ * @dd: the qlogic_ib device
+ *
+ * info is based on the board revision register
+ */
+static void pe_boardname(struct qib_devdata *dd)
+{
+	char *n;
+	u32 boardid, namelen;
+
+	boardid = SYM_FIELD(dd->revision, Revision,
+			    BoardID);
+
+	switch (boardid) {
+	case 2:
+		n = "InfiniPath_QLE7140";
+		break;
+	default:
+		qib_dev_err(dd, "Unknown 6120 board with ID %u\n", boardid);
+		n = "Unknown_InfiniPath_6120";
+		break;
+	}
+	namelen = strlen(n) + 1;
+	dd->boardname = kmalloc(namelen, GFP_KERNEL);
+	if (!dd->boardname)
+		qib_dev_err(dd, "Failed allocation for board name: %s\n", n);
+	else
+		snprintf(dd->boardname, namelen, "%s", n);
+
+	if (dd->majrev != 4 || !dd->minrev || dd->minrev > 2)
+		qib_dev_err(dd,
+			"Unsupported InfiniPath hardware revision %u.%u!\n",
+			dd->majrev, dd->minrev);
+
+	snprintf(dd->boardversion, sizeof(dd->boardversion),
+		 "ChipABI %u.%u, %s, InfiniPath%u %u.%u, SW Compat %u\n",
+		 QIB_CHIP_VERS_MAJ, QIB_CHIP_VERS_MIN, dd->boardname,
+		 (unsigned)SYM_FIELD(dd->revision, Revision_R, Arch),
+		 dd->majrev, dd->minrev,
+		 (unsigned)SYM_FIELD(dd->revision, Revision_R, SW));
+
+}
+
+/*
+ * This routine sleeps, so it can only be called from user context, not
+ * from interrupt context.  If we need interrupt context, we can split
+ * it into two routines.
+ */
+static int qib_6120_setup_reset(struct qib_devdata *dd)
+{
+	u64 val;
+	int i;
+	int ret;
+	u16 cmdval;
+	u8 int_line, clinesz;
+
+	qib_pcie_getcmd(dd, &cmdval, &int_line, &clinesz);
+
+	/* Use ERROR so it shows up in logs, etc. */
+	qib_dev_err(dd, "Resetting InfiniPath unit %u\n", dd->unit);
+
+	/* no interrupts till re-initted */
+	qib_6120_set_intr_state(dd, 0);
+
+	dd->cspec->ibdeltainprog = 0;
+	dd->cspec->ibsymdelta = 0;
+	dd->cspec->iblnkerrdelta = 0;
+
+	/*
+	 * Keep chip from being accessed until we are ready.  Use
+	 * writeq() directly, to allow the write even though QIB_PRESENT
+	 * isn't set.
+	 */
+	dd->flags &= ~(QIB_INITTED | QIB_PRESENT);
+	/* so we check interrupts work again */
+	dd->z_int_counter = qib_int_counter(dd);
+	val = dd->control | QLOGIC_IB_C_RESET;
+	writeq(val, &dd->kregbase[kr_control]);
+	mb(); /* prevent compiler re-ordering around actual reset */
+
+	for (i = 1; i <= 5; i++) {
+		/*
+		 * Allow MBIST, etc. to complete; longer on each retry.
+		 * We sometimes get machine checks from bus timeout if no
+		 * response, so for now, make it *really* long.
+		 */
+		msleep(1000 + (1 + i) * 2000);
+
+		qib_pcie_reenable(dd, cmdval, int_line, clinesz);
+
+		/*
+		 * Use readq directly, so we don't need to mark it as PRESENT
+		 * until we get a successful indication that all is well.
+		 */
+		val = readq(&dd->kregbase[kr_revision]);
+		if (val == dd->revision) {
+			dd->flags |= QIB_PRESENT; /* it's back */
+			ret = qib_reinit_intr(dd);
+			goto bail;
+		}
+	}
+	ret = 0; /* failed */
+
+bail:
+	if (ret) {
+		if (qib_pcie_params(dd, dd->lbus_width, NULL, NULL))
+			qib_dev_err(dd,
+				"Reset failed to setup PCIe or interrupts; continuing anyway\n");
+		/* clear the reset error, init error/hwerror mask */
+		qib_6120_init_hwerrors(dd);
+		/* for Rev2 error interrupts; nop for rev 1 */
+		qib_write_kreg(dd, kr_gpio_mask, dd->cspec->gpio_mask);
+		/* clear the reset error, init error/hwerror mask */
+		qib_6120_init_hwerrors(dd);
+	}
+	return ret;
+}
+
+/**
+ * qib_6120_put_tid - write a TID in chip
+ * @dd: the qlogic_ib device
+ * @tidptr: pointer to the expected TID (in chip) to update
+ * @tidtype: RCVHQ_RCV_TYPE_EAGER (1) for eager, RCVHQ_RCV_TYPE_EXPECTED (0)
+ * for expected
+ * @pa: physical address of in memory buffer; tidinvalid if freeing
+ *
+ * This exists as a separate routine to allow for special locking etc.
+ * It's used for both the full cleanup on exit, as well as the normal
+ * setup and teardown.
+ */
+static void qib_6120_put_tid(struct qib_devdata *dd, u64 __iomem *tidptr,
+			     u32 type, unsigned long pa)
+{
+	u32 __iomem *tidp32 = (u32 __iomem *)tidptr;
+	unsigned long flags;
+	int tidx;
+	spinlock_t *tidlockp; /* select appropriate spinlock */
+
+	if (!dd->kregbase)
+		return;
+
+	if (pa != dd->tidinvalid) {
+		if (pa & ((1U << 11) - 1)) {
+			qib_dev_err(dd, "Physaddr %lx not 2KB aligned!\n",
+				    pa);
+			return;
+		}
+		pa >>= 11;
+		if (pa & ~QLOGIC_IB_RT_ADDR_MASK) {
+			qib_dev_err(dd,
+				"Physical page address 0x%lx larger than supported\n",
+				pa);
+			return;
+		}
+
+		if (type == RCVHQ_RCV_TYPE_EAGER)
+			pa |= dd->tidtemplate;
+		else /* for now, always full 4KB page */
+			pa |= 2 << 29;
+	}
+
+	/*
+	 * Avoid chip issue by writing the scratch register
+	 * before and after the TID, and with an io write barrier.
+	 * We use a spinlock around the writes, so they can't intermix
+	 * with other TID (eager or expected) writes (the chip problem
+	 * is triggered by back to back TID writes). Unfortunately, this
+	 * call can be done from interrupt level for the ctxt 0 eager TIDs,
+	 * so we have to use irqsave locks.
+	 */
+	/*
+	 * Assumes tidptr always > egrtidbase
+	 * if type == RCVHQ_RCV_TYPE_EAGER.
+	 */
+	tidx = tidptr - dd->egrtidbase;
+
+	tidlockp = (type == RCVHQ_RCV_TYPE_EAGER && tidx < dd->rcvhdrcnt)
+		? &dd->cspec->kernel_tid_lock : &dd->cspec->user_tid_lock;
+	spin_lock_irqsave(tidlockp, flags);
+	qib_write_kreg(dd, kr_scratch, 0xfeeddeaf);
+	writel(pa, tidp32);
+	qib_write_kreg(dd, kr_scratch, 0xdeadbeef);
+	mmiowb();
+	spin_unlock_irqrestore(tidlockp, flags);
+}
+
+/**
+ * qib_6120_put_tid_2 - write a TID in chip, Revision 2 or higher
+ * @dd: the qlogic_ib device
+ * @tidptr: pointer to the expected TID (in chip) to update
+ * @tidtype: RCVHQ_RCV_TYPE_EAGER (1) for eager, RCVHQ_RCV_TYPE_EXPECTED (0)
+ * for expected
+ * @pa: physical address of in memory buffer; tidinvalid if freeing
+ *
+ * This exists as a separate routine to allow for selection of the
+ * appropriate "flavor". The static calls in cleanup just use the
+ * revision-agnostic form, as they are not performance critical.
+ */
+static void qib_6120_put_tid_2(struct qib_devdata *dd, u64 __iomem *tidptr,
+			       u32 type, unsigned long pa)
+{
+	u32 __iomem *tidp32 = (u32 __iomem *)tidptr;
+	u32 tidx;
+
+	if (!dd->kregbase)
+		return;
+
+	if (pa != dd->tidinvalid) {
+		if (pa & ((1U << 11) - 1)) {
+			qib_dev_err(dd, "Physaddr %lx not 2KB aligned!\n",
+				    pa);
+			return;
+		}
+		pa >>= 11;
+		if (pa & ~QLOGIC_IB_RT_ADDR_MASK) {
+			qib_dev_err(dd,
+				"Physical page address 0x%lx larger than supported\n",
+				pa);
+			return;
+		}
+
+		if (type == RCVHQ_RCV_TYPE_EAGER)
+			pa |= dd->tidtemplate;
+		else /* for now, always full 4KB page */
+			pa |= 2 << 29;
+	}
+	tidx = tidptr - dd->egrtidbase;
+	writel(pa, tidp32);
+	mmiowb();
+}
+
+
+/**
+ * qib_6120_clear_tids - clear all TID entries for a context, expected and eager
+ * @dd: the qlogic_ib device
+ * @ctxt: the context
+ *
+ * clear all TID entries for a context, expected and eager.
+ * Used from qib_close().  On this chip, TIDs are only 32 bits,
+ * not 64, but they are still on 64 bit boundaries, so tidbase
+ * is declared as u64 * for the pointer math, even though we write 32 bits
+ */
+static void qib_6120_clear_tids(struct qib_devdata *dd,
+				struct qib_ctxtdata *rcd)
+{
+	u64 __iomem *tidbase;
+	unsigned long tidinv;
+	u32 ctxt;
+	int i;
+
+	if (!dd->kregbase || !rcd)
+		return;
+
+	ctxt = rcd->ctxt;
+
+	tidinv = dd->tidinvalid;
+	tidbase = (u64 __iomem *)
+		((char __iomem *)(dd->kregbase) +
+		 dd->rcvtidbase +
+		 ctxt * dd->rcvtidcnt * sizeof(*tidbase));
+
+	for (i = 0; i < dd->rcvtidcnt; i++)
+		/* use func pointer because could be one of two funcs */
+		dd->f_put_tid(dd, &tidbase[i], RCVHQ_RCV_TYPE_EXPECTED,
+				  tidinv);
+
+	tidbase = (u64 __iomem *)
+		((char __iomem *)(dd->kregbase) +
+		 dd->rcvegrbase +
+		 rcd->rcvegr_tid_base * sizeof(*tidbase));
+
+	for (i = 0; i < rcd->rcvegrcnt; i++)
+		/* use func pointer because could be one of two funcs */
+		dd->f_put_tid(dd, &tidbase[i], RCVHQ_RCV_TYPE_EAGER,
+				  tidinv);
+}
+
+/**
+ * qib_6120_tidtemplate - setup constants for TID updates
+ * @dd: the qlogic_ib device
+ *
+ * We setup stuff that we use a lot, to avoid calculating each time
+ */
+static void qib_6120_tidtemplate(struct qib_devdata *dd)
+{
+	u32 egrsize = dd->rcvegrbufsize;
+
+	/*
+	 * For now, we always allocate 4KB buffers (at init) so we can
+	 * receive max size packets.  We may want a module parameter to
+	 * specify 2KB or 4KB and/or make be per ctxt instead of per device
+	 * for those who want to reduce memory footprint.  Note that the
+	 * rcvhdrentsize size must be large enough to hold the largest
+	 * IB header (currently 96 bytes) that we expect to handle (plus of
+	 * course the 2 dwords of RHF).
+	 */
+	if (egrsize == 2048)
+		dd->tidtemplate = 1U << 29;
+	else if (egrsize == 4096)
+		dd->tidtemplate = 2U << 29;
+	dd->tidinvalid = 0;
+}
+
+int __attribute__((weak)) qib_unordered_wc(void)
+{
+	return 0;
+}
+
+/**
+ * qib_6120_get_base_info - set chip-specific flags for user code
+ * @rcd: the qlogic_ib ctxt
+ * @kbase: qib_base_info pointer
+ *
+ * We set the PCIE flag because the lower bandwidth on PCIe vs
+ * HyperTransport can affect some user packet algorithms.
+ */
+static int qib_6120_get_base_info(struct qib_ctxtdata *rcd,
+				  struct qib_base_info *kinfo)
+{
+	if (qib_unordered_wc())
+		kinfo->spi_runtime_flags |= QIB_RUNTIME_FORCE_WC_ORDER;
+
+	kinfo->spi_runtime_flags |= QIB_RUNTIME_PCIE |
+		QIB_RUNTIME_FORCE_PIOAVAIL | QIB_RUNTIME_PIO_REGSWAPPED;
+	return 0;
+}
+
+
+static struct qib_message_header *
+qib_6120_get_msgheader(struct qib_devdata *dd, __le32 *rhf_addr)
+{
+	return (struct qib_message_header *)
+		&rhf_addr[sizeof(u64) / sizeof(u32)];
+}
+
+static void qib_6120_config_ctxts(struct qib_devdata *dd)
+{
+	dd->ctxtcnt = qib_read_kreg32(dd, kr_portcnt);
+	if (qib_n_krcv_queues > 1) {
+		dd->first_user_ctxt = qib_n_krcv_queues * dd->num_pports;
+		if (dd->first_user_ctxt > dd->ctxtcnt)
+			dd->first_user_ctxt = dd->ctxtcnt;
+		dd->qpn_mask = dd->first_user_ctxt <= 2 ? 2 : 6;
+	} else
+		dd->first_user_ctxt = dd->num_pports;
+	dd->n_krcv_queues = dd->first_user_ctxt;
+}
+
+static void qib_update_6120_usrhead(struct qib_ctxtdata *rcd, u64 hd,
+				    u32 updegr, u32 egrhd, u32 npkts)
+{
+	if (updegr)
+		qib_write_ureg(rcd->dd, ur_rcvegrindexhead, egrhd, rcd->ctxt);
+	mmiowb();
+	qib_write_ureg(rcd->dd, ur_rcvhdrhead, hd, rcd->ctxt);
+	mmiowb();
+}
+
+static u32 qib_6120_hdrqempty(struct qib_ctxtdata *rcd)
+{
+	u32 head, tail;
+
+	head = qib_read_ureg32(rcd->dd, ur_rcvhdrhead, rcd->ctxt);
+	if (rcd->rcvhdrtail_kvaddr)
+		tail = qib_get_rcvhdrtail(rcd);
+	else
+		tail = qib_read_ureg32(rcd->dd, ur_rcvhdrtail, rcd->ctxt);
+	return head == tail;
+}
+
+/*
+ * Used when we close any ctxt, for DMA already in flight
+ * at close.  Can't be done until we know hdrq size, so not
+ * early in chip init.
+ */
+static void alloc_dummy_hdrq(struct qib_devdata *dd)
+{
+	dd->cspec->dummy_hdrq = dma_alloc_coherent(&dd->pcidev->dev,
+					dd->rcd[0]->rcvhdrq_size,
+					&dd->cspec->dummy_hdrq_phys,
+					GFP_ATOMIC | __GFP_COMP);
+	if (!dd->cspec->dummy_hdrq) {
+		qib_devinfo(dd->pcidev, "Couldn't allocate dummy hdrq\n");
+		/* fallback to just 0'ing */
+		dd->cspec->dummy_hdrq_phys = 0UL;
+	}
+}
+
+/*
+ * Modify the RCVCTRL register in chip-specific way. This
+ * is a function because bit positions and (future) register
+ * location is chip-specific, but the needed operations are
+ * generic. <op> is a bit-mask because we often want to
+ * do multiple modifications.
+ */
+static void rcvctrl_6120_mod(struct qib_pportdata *ppd, unsigned int op,
+			     int ctxt)
+{
+	struct qib_devdata *dd = ppd->dd;
+	u64 mask, val;
+	unsigned long flags;
+
+	spin_lock_irqsave(&dd->cspec->rcvmod_lock, flags);
+
+	if (op & QIB_RCVCTRL_TAILUPD_ENB)
+		dd->rcvctrl |= (1ULL << QLOGIC_IB_R_TAILUPD_SHIFT);
+	if (op & QIB_RCVCTRL_TAILUPD_DIS)
+		dd->rcvctrl &= ~(1ULL << QLOGIC_IB_R_TAILUPD_SHIFT);
+	if (op & QIB_RCVCTRL_PKEY_ENB)
+		dd->rcvctrl &= ~(1ULL << IBA6120_R_PKEY_DIS_SHIFT);
+	if (op & QIB_RCVCTRL_PKEY_DIS)
+		dd->rcvctrl |= (1ULL << IBA6120_R_PKEY_DIS_SHIFT);
+	if (ctxt < 0)
+		mask = (1ULL << dd->ctxtcnt) - 1;
+	else
+		mask = (1ULL << ctxt);
+	if (op & QIB_RCVCTRL_CTXT_ENB) {
+		/* always done for specific ctxt */
+		dd->rcvctrl |= (mask << SYM_LSB(RcvCtrl, PortEnable));
+		if (!(dd->flags & QIB_NODMA_RTAIL))
+			dd->rcvctrl |= 1ULL << QLOGIC_IB_R_TAILUPD_SHIFT;
+		/* Write these registers before the context is enabled. */
+		qib_write_kreg_ctxt(dd, kr_rcvhdrtailaddr, ctxt,
+			dd->rcd[ctxt]->rcvhdrqtailaddr_phys);
+		qib_write_kreg_ctxt(dd, kr_rcvhdraddr, ctxt,
+			dd->rcd[ctxt]->rcvhdrq_phys);
+
+		if (ctxt == 0 && !dd->cspec->dummy_hdrq)
+			alloc_dummy_hdrq(dd);
+	}
+	if (op & QIB_RCVCTRL_CTXT_DIS)
+		dd->rcvctrl &= ~(mask << SYM_LSB(RcvCtrl, PortEnable));
+	if (op & QIB_RCVCTRL_INTRAVAIL_ENB)
+		dd->rcvctrl |= (mask << QLOGIC_IB_R_INTRAVAIL_SHIFT);
+	if (op & QIB_RCVCTRL_INTRAVAIL_DIS)
+		dd->rcvctrl &= ~(mask << QLOGIC_IB_R_INTRAVAIL_SHIFT);
+	qib_write_kreg(dd, kr_rcvctrl, dd->rcvctrl);
+	if ((op & QIB_RCVCTRL_INTRAVAIL_ENB) && dd->rhdrhead_intr_off) {
+		/* arm rcv interrupt */
+		val = qib_read_ureg32(dd, ur_rcvhdrhead, ctxt) |
+			dd->rhdrhead_intr_off;
+		qib_write_ureg(dd, ur_rcvhdrhead, val, ctxt);
+	}
+	if (op & QIB_RCVCTRL_CTXT_ENB) {
+		/*
+		 * Init the context registers also; if we were
+		 * disabled, tail and head should both be zero
+		 * already from the enable, but since we don't
+		 * know, we have to do it explicitly.
+		 */
+		val = qib_read_ureg32(dd, ur_rcvegrindextail, ctxt);
+		qib_write_ureg(dd, ur_rcvegrindexhead, val, ctxt);
+
+		val = qib_read_ureg32(dd, ur_rcvhdrtail, ctxt);
+		dd->rcd[ctxt]->head = val;
+		/* If kctxt, interrupt on next receive. */
+		if (ctxt < dd->first_user_ctxt)
+			val |= dd->rhdrhead_intr_off;
+		qib_write_ureg(dd, ur_rcvhdrhead, val, ctxt);
+	}
+	if (op & QIB_RCVCTRL_CTXT_DIS) {
+		/*
+		 * Be paranoid, and never write 0's to these, just use an
+		 * unused page.  Of course,
+		 * rcvhdraddr points to a large chunk of memory, so this
+		 * could still trash things, but at least it won't trash
+		 * page 0, and by disabling the ctxt, it should stop "soon",
+		 * even if a packet or two is in already in flight after we
+		 * disabled the ctxt.  Only 6120 has this issue.
+		 */
+		if (ctxt >= 0) {
+			qib_write_kreg_ctxt(dd, kr_rcvhdrtailaddr, ctxt,
+					    dd->cspec->dummy_hdrq_phys);
+			qib_write_kreg_ctxt(dd, kr_rcvhdraddr, ctxt,
+					    dd->cspec->dummy_hdrq_phys);
+		} else {
+			unsigned i;
+
+			for (i = 0; i < dd->cfgctxts; i++) {
+				qib_write_kreg_ctxt(dd, kr_rcvhdrtailaddr,
+					    i, dd->cspec->dummy_hdrq_phys);
+				qib_write_kreg_ctxt(dd, kr_rcvhdraddr,
+					    i, dd->cspec->dummy_hdrq_phys);
+			}
+		}
+	}
+	spin_unlock_irqrestore(&dd->cspec->rcvmod_lock, flags);
+}
+
+/*
+ * Modify the SENDCTRL register in chip-specific way. This
+ * is a function there may be multiple such registers with
+ * slightly different layouts. Only operations actually used
+ * are implemented yet.
+ * Chip requires no back-back sendctrl writes, so write
+ * scratch register after writing sendctrl
+ */
+static void sendctrl_6120_mod(struct qib_pportdata *ppd, u32 op)
+{
+	struct qib_devdata *dd = ppd->dd;
+	u64 tmp_dd_sendctrl;
+	unsigned long flags;
+
+	spin_lock_irqsave(&dd->sendctrl_lock, flags);
+
+	/* First the ones that are "sticky", saved in shadow */
+	if (op & QIB_SENDCTRL_CLEAR)
+		dd->sendctrl = 0;
+	if (op & QIB_SENDCTRL_SEND_DIS)
+		dd->sendctrl &= ~SYM_MASK(SendCtrl, PIOEnable);
+	else if (op & QIB_SENDCTRL_SEND_ENB)
+		dd->sendctrl |= SYM_MASK(SendCtrl, PIOEnable);
+	if (op & QIB_SENDCTRL_AVAIL_DIS)
+		dd->sendctrl &= ~SYM_MASK(SendCtrl, PIOBufAvailUpd);
+	else if (op & QIB_SENDCTRL_AVAIL_ENB)
+		dd->sendctrl |= SYM_MASK(SendCtrl, PIOBufAvailUpd);
+
+	if (op & QIB_SENDCTRL_DISARM_ALL) {
+		u32 i, last;
+
+		tmp_dd_sendctrl = dd->sendctrl;
+		/*
+		 * disarm any that are not yet launched, disabling sends
+		 * and updates until done.
+		 */
+		last = dd->piobcnt2k + dd->piobcnt4k;
+		tmp_dd_sendctrl &=
+			~(SYM_MASK(SendCtrl, PIOEnable) |
+			  SYM_MASK(SendCtrl, PIOBufAvailUpd));
+		for (i = 0; i < last; i++) {
+			qib_write_kreg(dd, kr_sendctrl, tmp_dd_sendctrl |
+				       SYM_MASK(SendCtrl, Disarm) | i);
+			qib_write_kreg(dd, kr_scratch, 0);
+		}
+	}
+
+	tmp_dd_sendctrl = dd->sendctrl;
+
+	if (op & QIB_SENDCTRL_FLUSH)
+		tmp_dd_sendctrl |= SYM_MASK(SendCtrl, Abort);
+	if (op & QIB_SENDCTRL_DISARM)
+		tmp_dd_sendctrl |= SYM_MASK(SendCtrl, Disarm) |
+			((op & QIB_6120_SendCtrl_DisarmPIOBuf_RMASK) <<
+			 SYM_LSB(SendCtrl, DisarmPIOBuf));
+	if (op & QIB_SENDCTRL_AVAIL_BLIP)
+		tmp_dd_sendctrl &= ~SYM_MASK(SendCtrl, PIOBufAvailUpd);
+
+	qib_write_kreg(dd, kr_sendctrl, tmp_dd_sendctrl);
+	qib_write_kreg(dd, kr_scratch, 0);
+
+	if (op & QIB_SENDCTRL_AVAIL_BLIP) {
+		qib_write_kreg(dd, kr_sendctrl, dd->sendctrl);
+		qib_write_kreg(dd, kr_scratch, 0);
+	}
+
+	spin_unlock_irqrestore(&dd->sendctrl_lock, flags);
+
+	if (op & QIB_SENDCTRL_FLUSH) {
+		u32 v;
+		/*
+		 * ensure writes have hit chip, then do a few
+		 * more reads, to allow DMA of pioavail registers
+		 * to occur, so in-memory copy is in sync with
+		 * the chip.  Not always safe to sleep.
+		 */
+		v = qib_read_kreg32(dd, kr_scratch);
+		qib_write_kreg(dd, kr_scratch, v);
+		v = qib_read_kreg32(dd, kr_scratch);
+		qib_write_kreg(dd, kr_scratch, v);
+		qib_read_kreg32(dd, kr_scratch);
+	}
+}
+
+/**
+ * qib_portcntr_6120 - read a per-port counter
+ * @dd: the qlogic_ib device
+ * @creg: the counter to snapshot
+ */
+static u64 qib_portcntr_6120(struct qib_pportdata *ppd, u32 reg)
+{
+	u64 ret = 0ULL;
+	struct qib_devdata *dd = ppd->dd;
+	u16 creg;
+	/* 0xffff for unimplemented or synthesized counters */
+	static const u16 xlator[] = {
+		[QIBPORTCNTR_PKTSEND] = cr_pktsend,
+		[QIBPORTCNTR_WORDSEND] = cr_wordsend,
+		[QIBPORTCNTR_PSXMITDATA] = 0xffff,
+		[QIBPORTCNTR_PSXMITPKTS] = 0xffff,
+		[QIBPORTCNTR_PSXMITWAIT] = 0xffff,
+		[QIBPORTCNTR_SENDSTALL] = cr_sendstall,
+		[QIBPORTCNTR_PKTRCV] = cr_pktrcv,
+		[QIBPORTCNTR_PSRCVDATA] = 0xffff,
+		[QIBPORTCNTR_PSRCVPKTS] = 0xffff,
+		[QIBPORTCNTR_RCVEBP] = cr_rcvebp,
+		[QIBPORTCNTR_RCVOVFL] = cr_rcvovfl,
+		[QIBPORTCNTR_WORDRCV] = cr_wordrcv,
+		[QIBPORTCNTR_RXDROPPKT] = cr_rxdroppkt,
+		[QIBPORTCNTR_RXLOCALPHYERR] = 0xffff,
+		[QIBPORTCNTR_RXVLERR] = 0xffff,
+		[QIBPORTCNTR_ERRICRC] = cr_erricrc,
+		[QIBPORTCNTR_ERRVCRC] = cr_errvcrc,
+		[QIBPORTCNTR_ERRLPCRC] = cr_errlpcrc,
+		[QIBPORTCNTR_BADFORMAT] = cr_badformat,
+		[QIBPORTCNTR_ERR_RLEN] = cr_err_rlen,
+		[QIBPORTCNTR_IBSYMBOLERR] = cr_ibsymbolerr,
+		[QIBPORTCNTR_INVALIDRLEN] = cr_invalidrlen,
+		[QIBPORTCNTR_UNSUPVL] = cr_txunsupvl,
+		[QIBPORTCNTR_EXCESSBUFOVFL] = 0xffff,
+		[QIBPORTCNTR_ERRLINK] = cr_errlink,
+		[QIBPORTCNTR_IBLINKDOWN] = cr_iblinkdown,
+		[QIBPORTCNTR_IBLINKERRRECOV] = cr_iblinkerrrecov,
+		[QIBPORTCNTR_LLI] = 0xffff,
+		[QIBPORTCNTR_PSINTERVAL] = 0xffff,
+		[QIBPORTCNTR_PSSTART] = 0xffff,
+		[QIBPORTCNTR_PSSTAT] = 0xffff,
+		[QIBPORTCNTR_VL15PKTDROP] = 0xffff,
+		[QIBPORTCNTR_ERRPKEY] = cr_errpkey,
+		[QIBPORTCNTR_KHDROVFL] = 0xffff,
+	};
+
+	if (reg >= ARRAY_SIZE(xlator)) {
+		qib_devinfo(ppd->dd->pcidev,
+			 "Unimplemented portcounter %u\n", reg);
+		goto done;
+	}
+	creg = xlator[reg];
+
+	/* handle counters requests not implemented as chip counters */
+	if (reg == QIBPORTCNTR_LLI)
+		ret = dd->cspec->lli_errs;
+	else if (reg == QIBPORTCNTR_EXCESSBUFOVFL)
+		ret = dd->cspec->overrun_thresh_errs;
+	else if (reg == QIBPORTCNTR_KHDROVFL) {
+		int i;
+
+		/* sum over all kernel contexts */
+		for (i = 0; i < dd->first_user_ctxt; i++)
+			ret += read_6120_creg32(dd, cr_portovfl + i);
+	} else if (reg == QIBPORTCNTR_PSSTAT)
+		ret = dd->cspec->pma_sample_status;
+	if (creg == 0xffff)
+		goto done;
+
+	/*
+	 * only fast incrementing counters are 64bit; use 32 bit reads to
+	 * avoid two independent reads when on opteron
+	 */
+	if (creg == cr_wordsend || creg == cr_wordrcv ||
+	    creg == cr_pktsend || creg == cr_pktrcv)
+		ret = read_6120_creg(dd, creg);
+	else
+		ret = read_6120_creg32(dd, creg);
+	if (creg == cr_ibsymbolerr) {
+		if (dd->cspec->ibdeltainprog)
+			ret -= ret - dd->cspec->ibsymsnap;
+		ret -= dd->cspec->ibsymdelta;
+	} else if (creg == cr_iblinkerrrecov) {
+		if (dd->cspec->ibdeltainprog)
+			ret -= ret - dd->cspec->iblnkerrsnap;
+		ret -= dd->cspec->iblnkerrdelta;
+	}
+	if (reg == QIBPORTCNTR_RXDROPPKT) /* add special cased count */
+		ret += dd->cspec->rxfc_unsupvl_errs;
+
+done:
+	return ret;
+}
+
+/*
+ * Device counter names (not port-specific), one line per stat,
+ * single string.  Used by utilities like ipathstats to print the stats
+ * in a way which works for different versions of drivers, without changing
+ * the utility.  Names need to be 12 chars or less (w/o newline), for proper
+ * display by utility.
+ * Non-error counters are first.
+ * Start of "error" conters is indicated by a leading "E " on the first
+ * "error" counter, and doesn't count in label length.
+ * The EgrOvfl list needs to be last so we truncate them at the configured
+ * context count for the device.
+ * cntr6120indices contains the corresponding register indices.
+ */
+static const char cntr6120names[] =
+	"Interrupts\n"
+	"HostBusStall\n"
+	"E RxTIDFull\n"
+	"RxTIDInvalid\n"
+	"Ctxt0EgrOvfl\n"
+	"Ctxt1EgrOvfl\n"
+	"Ctxt2EgrOvfl\n"
+	"Ctxt3EgrOvfl\n"
+	"Ctxt4EgrOvfl\n";
+
+static const size_t cntr6120indices[] = {
+	cr_lbint,
+	cr_lbflowstall,
+	cr_errtidfull,
+	cr_errtidvalid,
+	cr_portovfl + 0,
+	cr_portovfl + 1,
+	cr_portovfl + 2,
+	cr_portovfl + 3,
+	cr_portovfl + 4,
+};
+
+/*
+ * same as cntr6120names and cntr6120indices, but for port-specific counters.
+ * portcntr6120indices is somewhat complicated by some registers needing
+ * adjustments of various kinds, and those are ORed with _PORT_VIRT_FLAG
+ */
+static const char portcntr6120names[] =
+	"TxPkt\n"
+	"TxFlowPkt\n"
+	"TxWords\n"
+	"RxPkt\n"
+	"RxFlowPkt\n"
+	"RxWords\n"
+	"TxFlowStall\n"
+	"E IBStatusChng\n"
+	"IBLinkDown\n"
+	"IBLnkRecov\n"
+	"IBRxLinkErr\n"
+	"IBSymbolErr\n"
+	"RxLLIErr\n"
+	"RxBadFormat\n"
+	"RxBadLen\n"
+	"RxBufOvrfl\n"
+	"RxEBP\n"
+	"RxFlowCtlErr\n"
+	"RxICRCerr\n"
+	"RxLPCRCerr\n"
+	"RxVCRCerr\n"
+	"RxInvalLen\n"
+	"RxInvalPKey\n"
+	"RxPktDropped\n"
+	"TxBadLength\n"
+	"TxDropped\n"
+	"TxInvalLen\n"
+	"TxUnderrun\n"
+	"TxUnsupVL\n"
+	;
+
+#define _PORT_VIRT_FLAG 0x8000 /* "virtual", need adjustments */
+static const size_t portcntr6120indices[] = {
+	QIBPORTCNTR_PKTSEND | _PORT_VIRT_FLAG,
+	cr_pktsendflow,
+	QIBPORTCNTR_WORDSEND | _PORT_VIRT_FLAG,
+	QIBPORTCNTR_PKTRCV | _PORT_VIRT_FLAG,
+	cr_pktrcvflowctrl,
+	QIBPORTCNTR_WORDRCV | _PORT_VIRT_FLAG,
+	QIBPORTCNTR_SENDSTALL | _PORT_VIRT_FLAG,
+	cr_ibstatuschange,
+	QIBPORTCNTR_IBLINKDOWN | _PORT_VIRT_FLAG,
+	QIBPORTCNTR_IBLINKERRRECOV | _PORT_VIRT_FLAG,
+	QIBPORTCNTR_ERRLINK | _PORT_VIRT_FLAG,
+	QIBPORTCNTR_IBSYMBOLERR | _PORT_VIRT_FLAG,
+	QIBPORTCNTR_LLI | _PORT_VIRT_FLAG,
+	QIBPORTCNTR_BADFORMAT | _PORT_VIRT_FLAG,
+	QIBPORTCNTR_ERR_RLEN | _PORT_VIRT_FLAG,
+	QIBPORTCNTR_RCVOVFL | _PORT_VIRT_FLAG,
+	QIBPORTCNTR_RCVEBP | _PORT_VIRT_FLAG,
+	cr_rcvflowctrl_err,
+	QIBPORTCNTR_ERRICRC | _PORT_VIRT_FLAG,
+	QIBPORTCNTR_ERRLPCRC | _PORT_VIRT_FLAG,
+	QIBPORTCNTR_ERRVCRC | _PORT_VIRT_FLAG,
+	QIBPORTCNTR_INVALIDRLEN | _PORT_VIRT_FLAG,
+	QIBPORTCNTR_ERRPKEY | _PORT_VIRT_FLAG,
+	QIBPORTCNTR_RXDROPPKT | _PORT_VIRT_FLAG,
+	cr_invalidslen,
+	cr_senddropped,
+	cr_errslen,
+	cr_sendunderrun,
+	cr_txunsupvl,
+};
+
+/* do all the setup to make the counter reads efficient later */
+static void init_6120_cntrnames(struct qib_devdata *dd)
+{
+	int i, j = 0;
+	char *s;
+
+	for (i = 0, s = (char *)cntr6120names; s && j <= dd->cfgctxts;
+	     i++) {
+		/* we always have at least one counter before the egrovfl */
+		if (!j && !strncmp("Ctxt0EgrOvfl", s + 1, 12))
+			j = 1;
+		s = strchr(s + 1, '\n');
+		if (s && j)
+			j++;
+	}
+	dd->cspec->ncntrs = i;
+	if (!s)
+		/* full list; size is without terminating null */
+		dd->cspec->cntrnamelen = sizeof(cntr6120names) - 1;
+	else
+		dd->cspec->cntrnamelen = 1 + s - cntr6120names;
+	dd->cspec->cntrs = kmalloc(dd->cspec->ncntrs
+		* sizeof(u64), GFP_KERNEL);
+	if (!dd->cspec->cntrs)
+		qib_dev_err(dd, "Failed allocation for counters\n");
+
+	for (i = 0, s = (char *)portcntr6120names; s; i++)
+		s = strchr(s + 1, '\n');
+	dd->cspec->nportcntrs = i - 1;
+	dd->cspec->portcntrnamelen = sizeof(portcntr6120names) - 1;
+	dd->cspec->portcntrs = kmalloc(dd->cspec->nportcntrs
+		* sizeof(u64), GFP_KERNEL);
+	if (!dd->cspec->portcntrs)
+		qib_dev_err(dd, "Failed allocation for portcounters\n");
+}
+
+static u32 qib_read_6120cntrs(struct qib_devdata *dd, loff_t pos, char **namep,
+			      u64 **cntrp)
+{
+	u32 ret;
+
+	if (namep) {
+		ret = dd->cspec->cntrnamelen;
+		if (pos >= ret)
+			ret = 0; /* final read after getting everything */
+		else
+			*namep = (char *)cntr6120names;
+	} else {
+		u64 *cntr = dd->cspec->cntrs;
+		int i;
+
+		ret = dd->cspec->ncntrs * sizeof(u64);
+		if (!cntr || pos >= ret) {
+			/* everything read, or couldn't get memory */
+			ret = 0;
+			goto done;
+		}
+		if (pos >= ret) {
+			ret = 0; /* final read after getting everything */
+			goto done;
+		}
+		*cntrp = cntr;
+		for (i = 0; i < dd->cspec->ncntrs; i++)
+			*cntr++ = read_6120_creg32(dd, cntr6120indices[i]);
+	}
+done:
+	return ret;
+}
+
+static u32 qib_read_6120portcntrs(struct qib_devdata *dd, loff_t pos, u32 port,
+				  char **namep, u64 **cntrp)
+{
+	u32 ret;
+
+	if (namep) {
+		ret = dd->cspec->portcntrnamelen;
+		if (pos >= ret)
+			ret = 0; /* final read after getting everything */
+		else
+			*namep = (char *)portcntr6120names;
+	} else {
+		u64 *cntr = dd->cspec->portcntrs;
+		struct qib_pportdata *ppd = &dd->pport[port];
+		int i;
+
+		ret = dd->cspec->nportcntrs * sizeof(u64);
+		if (!cntr || pos >= ret) {
+			/* everything read, or couldn't get memory */
+			ret = 0;
+			goto done;
+		}
+		*cntrp = cntr;
+		for (i = 0; i < dd->cspec->nportcntrs; i++) {
+			if (portcntr6120indices[i] & _PORT_VIRT_FLAG)
+				*cntr++ = qib_portcntr_6120(ppd,
+					portcntr6120indices[i] &
+					~_PORT_VIRT_FLAG);
+			else
+				*cntr++ = read_6120_creg32(dd,
+					   portcntr6120indices[i]);
+		}
+	}
+done:
+	return ret;
+}
+
+static void qib_chk_6120_errormask(struct qib_devdata *dd)
+{
+	static u32 fixed;
+	u32 ctrl;
+	unsigned long errormask;
+	unsigned long hwerrs;
+
+	if (!dd->cspec->errormask || !(dd->flags & QIB_INITTED))
+		return;
+
+	errormask = qib_read_kreg64(dd, kr_errmask);
+
+	if (errormask == dd->cspec->errormask)
+		return;
+	fixed++;
+
+	hwerrs = qib_read_kreg64(dd, kr_hwerrstatus);
+	ctrl = qib_read_kreg32(dd, kr_control);
+
+	qib_write_kreg(dd, kr_errmask,
+		dd->cspec->errormask);
+
+	if ((hwerrs & dd->cspec->hwerrmask) ||
+	    (ctrl & QLOGIC_IB_C_FREEZEMODE)) {
+		qib_write_kreg(dd, kr_hwerrclear, 0ULL);
+		qib_write_kreg(dd, kr_errclear, 0ULL);
+		/* force re-interrupt of pending events, just in case */
+		qib_write_kreg(dd, kr_intclear, 0ULL);
+		qib_devinfo(dd->pcidev,
+			 "errormask fixed(%u) %lx->%lx, ctrl %x hwerr %lx\n",
+			 fixed, errormask, (unsigned long)dd->cspec->errormask,
+			 ctrl, hwerrs);
+	}
+}
+
+/**
+ * qib_get_faststats - get word counters from chip before they overflow
+ * @opaque - contains a pointer to the qlogic_ib device qib_devdata
+ *
+ * This needs more work; in particular, decision on whether we really
+ * need traffic_wds done the way it is
+ * called from add_timer
+ */
+static void qib_get_6120_faststats(unsigned long opaque)
+{
+	struct qib_devdata *dd = (struct qib_devdata *) opaque;
+	struct qib_pportdata *ppd = dd->pport;
+	unsigned long flags;
+	u64 traffic_wds;
+
+	/*
+	 * don't access the chip while running diags, or memory diags can
+	 * fail
+	 */
+	if (!(dd->flags & QIB_INITTED) || dd->diag_client)
+		/* but re-arm the timer, for diags case; won't hurt other */
+		goto done;
+
+	/*
+	 * We now try to maintain an activity timer, based on traffic
+	 * exceeding a threshold, so we need to check the word-counts
+	 * even if they are 64-bit.
+	 */
+	traffic_wds = qib_portcntr_6120(ppd, cr_wordsend) +
+		qib_portcntr_6120(ppd, cr_wordrcv);
+	spin_lock_irqsave(&dd->eep_st_lock, flags);
+	traffic_wds -= dd->traffic_wds;
+	dd->traffic_wds += traffic_wds;
+	if (traffic_wds  >= QIB_TRAFFIC_ACTIVE_THRESHOLD)
+		atomic_add(5, &dd->active_time); /* S/B #define */
+	spin_unlock_irqrestore(&dd->eep_st_lock, flags);
+
+	qib_chk_6120_errormask(dd);
+done:
+	mod_timer(&dd->stats_timer, jiffies + HZ * ACTIVITY_TIMER);
+}
+
+/* no interrupt fallback for these chips */
+static int qib_6120_nointr_fallback(struct qib_devdata *dd)
+{
+	return 0;
+}
+
+/*
+ * reset the XGXS (between serdes and IBC).  Slightly less intrusive
+ * than resetting the IBC or external link state, and useful in some
+ * cases to cause some retraining.  To do this right, we reset IBC
+ * as well.
+ */
+static void qib_6120_xgxs_reset(struct qib_pportdata *ppd)
+{
+	u64 val, prev_val;
+	struct qib_devdata *dd = ppd->dd;
+
+	prev_val = qib_read_kreg64(dd, kr_xgxs_cfg);
+	val = prev_val | QLOGIC_IB_XGXS_RESET;
+	prev_val &= ~QLOGIC_IB_XGXS_RESET; /* be sure */
+	qib_write_kreg(dd, kr_control,
+		       dd->control & ~QLOGIC_IB_C_LINKENABLE);
+	qib_write_kreg(dd, kr_xgxs_cfg, val);
+	qib_read_kreg32(dd, kr_scratch);
+	qib_write_kreg(dd, kr_xgxs_cfg, prev_val);
+	qib_write_kreg(dd, kr_control, dd->control);
+}
+
+static int qib_6120_get_ib_cfg(struct qib_pportdata *ppd, int which)
+{
+	int ret;
+
+	switch (which) {
+	case QIB_IB_CFG_LWID:
+		ret = ppd->link_width_active;
+		break;
+
+	case QIB_IB_CFG_SPD:
+		ret = ppd->link_speed_active;
+		break;
+
+	case QIB_IB_CFG_LWID_ENB:
+		ret = ppd->link_width_enabled;
+		break;
+
+	case QIB_IB_CFG_SPD_ENB:
+		ret = ppd->link_speed_enabled;
+		break;
+
+	case QIB_IB_CFG_OP_VLS:
+		ret = ppd->vls_operational;
+		break;
+
+	case QIB_IB_CFG_VL_HIGH_CAP:
+		ret = 0;
+		break;
+
+	case QIB_IB_CFG_VL_LOW_CAP:
+		ret = 0;
+		break;
+
+	case QIB_IB_CFG_OVERRUN_THRESH: /* IB overrun threshold */
+		ret = SYM_FIELD(ppd->dd->cspec->ibcctrl, IBCCtrl,
+				OverrunThreshold);
+		break;
+
+	case QIB_IB_CFG_PHYERR_THRESH: /* IB PHY error threshold */
+		ret = SYM_FIELD(ppd->dd->cspec->ibcctrl, IBCCtrl,
+				PhyerrThreshold);
+		break;
+
+	case QIB_IB_CFG_LINKDEFAULT: /* IB link default (sleep/poll) */
+		/* will only take effect when the link state changes */
+		ret = (ppd->dd->cspec->ibcctrl &
+		       SYM_MASK(IBCCtrl, LinkDownDefaultState)) ?
+			IB_LINKINITCMD_SLEEP : IB_LINKINITCMD_POLL;
+		break;
+
+	case QIB_IB_CFG_HRTBT: /* Get Heartbeat off/enable/auto */
+		ret = 0; /* no heartbeat on this chip */
+		break;
+
+	case QIB_IB_CFG_PMA_TICKS:
+		ret = 250; /* 1 usec. */
+		break;
+
+	default:
+		ret =  -EINVAL;
+		break;
+	}
+	return ret;
+}
+
+/*
+ * We assume range checking is already done, if needed.
+ */
+static int qib_6120_set_ib_cfg(struct qib_pportdata *ppd, int which, u32 val)
+{
+	struct qib_devdata *dd = ppd->dd;
+	int ret = 0;
+	u64 val64;
+	u16 lcmd, licmd;
+
+	switch (which) {
+	case QIB_IB_CFG_LWID_ENB:
+		ppd->link_width_enabled = val;
+		break;
+
+	case QIB_IB_CFG_SPD_ENB:
+		ppd->link_speed_enabled = val;
+		break;
+
+	case QIB_IB_CFG_OVERRUN_THRESH: /* IB overrun threshold */
+		val64 = SYM_FIELD(dd->cspec->ibcctrl, IBCCtrl,
+				  OverrunThreshold);
+		if (val64 != val) {
+			dd->cspec->ibcctrl &=
+				~SYM_MASK(IBCCtrl, OverrunThreshold);
+			dd->cspec->ibcctrl |= (u64) val <<
+				SYM_LSB(IBCCtrl, OverrunThreshold);
+			qib_write_kreg(dd, kr_ibcctrl, dd->cspec->ibcctrl);
+			qib_write_kreg(dd, kr_scratch, 0);
+		}
+		break;
+
+	case QIB_IB_CFG_PHYERR_THRESH: /* IB PHY error threshold */
+		val64 = SYM_FIELD(dd->cspec->ibcctrl, IBCCtrl,
+				  PhyerrThreshold);
+		if (val64 != val) {
+			dd->cspec->ibcctrl &=
+				~SYM_MASK(IBCCtrl, PhyerrThreshold);
+			dd->cspec->ibcctrl |= (u64) val <<
+				SYM_LSB(IBCCtrl, PhyerrThreshold);
+			qib_write_kreg(dd, kr_ibcctrl, dd->cspec->ibcctrl);
+			qib_write_kreg(dd, kr_scratch, 0);
+		}
+		break;
+
+	case QIB_IB_CFG_PKEYS: /* update pkeys */
+		val64 = (u64) ppd->pkeys[0] | ((u64) ppd->pkeys[1] << 16) |
+			((u64) ppd->pkeys[2] << 32) |
+			((u64) ppd->pkeys[3] << 48);
+		qib_write_kreg(dd, kr_partitionkey, val64);
+		break;
+
+	case QIB_IB_CFG_LINKDEFAULT: /* IB link default (sleep/poll) */
+		/* will only take effect when the link state changes */
+		if (val == IB_LINKINITCMD_POLL)
+			dd->cspec->ibcctrl &=
+				~SYM_MASK(IBCCtrl, LinkDownDefaultState);
+		else /* SLEEP */
+			dd->cspec->ibcctrl |=
+				SYM_MASK(IBCCtrl, LinkDownDefaultState);
+		qib_write_kreg(dd, kr_ibcctrl, dd->cspec->ibcctrl);
+		qib_write_kreg(dd, kr_scratch, 0);
+		break;
+
+	case QIB_IB_CFG_MTU: /* update the MTU in IBC */
+		/*
+		 * Update our housekeeping variables, and set IBC max
+		 * size, same as init code; max IBC is max we allow in
+		 * buffer, less the qword pbc, plus 1 for ICRC, in dwords
+		 * Set even if it's unchanged, print debug message only
+		 * on changes.
+		 */
+		val = (ppd->ibmaxlen >> 2) + 1;
+		dd->cspec->ibcctrl &= ~SYM_MASK(IBCCtrl, MaxPktLen);
+		dd->cspec->ibcctrl |= (u64)val <<
+			SYM_LSB(IBCCtrl, MaxPktLen);
+		qib_write_kreg(dd, kr_ibcctrl, dd->cspec->ibcctrl);
+		qib_write_kreg(dd, kr_scratch, 0);
+		break;
+
+	case QIB_IB_CFG_LSTATE: /* set the IB link state */
+		switch (val & 0xffff0000) {
+		case IB_LINKCMD_DOWN:
+			lcmd = QLOGIC_IB_IBCC_LINKCMD_DOWN;
+			if (!dd->cspec->ibdeltainprog) {
+				dd->cspec->ibdeltainprog = 1;
+				dd->cspec->ibsymsnap =
+					read_6120_creg32(dd, cr_ibsymbolerr);
+				dd->cspec->iblnkerrsnap =
+					read_6120_creg32(dd, cr_iblinkerrrecov);
+			}
+			break;
+
+		case IB_LINKCMD_ARMED:
+			lcmd = QLOGIC_IB_IBCC_LINKCMD_ARMED;
+			break;
+
+		case IB_LINKCMD_ACTIVE:
+			lcmd = QLOGIC_IB_IBCC_LINKCMD_ACTIVE;
+			break;
+
+		default:
+			ret = -EINVAL;
+			qib_dev_err(dd, "bad linkcmd req 0x%x\n", val >> 16);
+			goto bail;
+		}
+		switch (val & 0xffff) {
+		case IB_LINKINITCMD_NOP:
+			licmd = 0;
+			break;
+
+		case IB_LINKINITCMD_POLL:
+			licmd = QLOGIC_IB_IBCC_LINKINITCMD_POLL;
+			break;
+
+		case IB_LINKINITCMD_SLEEP:
+			licmd = QLOGIC_IB_IBCC_LINKINITCMD_SLEEP;
+			break;
+
+		case IB_LINKINITCMD_DISABLE:
+			licmd = QLOGIC_IB_IBCC_LINKINITCMD_DISABLE;
+			break;
+
+		default:
+			ret = -EINVAL;
+			qib_dev_err(dd, "bad linkinitcmd req 0x%x\n",
+				    val & 0xffff);
+			goto bail;
+		}
+		qib_set_ib_6120_lstate(ppd, lcmd, licmd);
+		goto bail;
+
+	case QIB_IB_CFG_HRTBT:
+		ret = -EINVAL;
+		break;
+
+	default:
+		ret = -EINVAL;
+	}
+bail:
+	return ret;
+}
+
+static int qib_6120_set_loopback(struct qib_pportdata *ppd, const char *what)
+{
+	int ret = 0;
+	if (!strncmp(what, "ibc", 3)) {
+		ppd->dd->cspec->ibcctrl |= SYM_MASK(IBCCtrl, Loopback);
+		qib_devinfo(ppd->dd->pcidev, "Enabling IB%u:%u IBC loopback\n",
+			 ppd->dd->unit, ppd->port);
+	} else if (!strncmp(what, "off", 3)) {
+		ppd->dd->cspec->ibcctrl &= ~SYM_MASK(IBCCtrl, Loopback);
+		qib_devinfo(ppd->dd->pcidev,
+			"Disabling IB%u:%u IBC loopback (normal)\n",
+			ppd->dd->unit, ppd->port);
+	} else
+		ret = -EINVAL;
+	if (!ret) {
+		qib_write_kreg(ppd->dd, kr_ibcctrl, ppd->dd->cspec->ibcctrl);
+		qib_write_kreg(ppd->dd, kr_scratch, 0);
+	}
+	return ret;
+}
+
+static void pma_6120_timer(unsigned long data)
+{
+	struct qib_pportdata *ppd = (struct qib_pportdata *)data;
+	struct qib_chip_specific *cs = ppd->dd->cspec;
+	struct qib_ibport *ibp = &ppd->ibport_data;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ibp->lock, flags);
+	if (cs->pma_sample_status == IB_PMA_SAMPLE_STATUS_STARTED) {
+		cs->pma_sample_status = IB_PMA_SAMPLE_STATUS_RUNNING;
+		qib_snapshot_counters(ppd, &cs->sword, &cs->rword,
+				      &cs->spkts, &cs->rpkts, &cs->xmit_wait);
+		mod_timer(&cs->pma_timer,
+			  jiffies + usecs_to_jiffies(ibp->pma_sample_interval));
+	} else if (cs->pma_sample_status == IB_PMA_SAMPLE_STATUS_RUNNING) {
+		u64 ta, tb, tc, td, te;
+
+		cs->pma_sample_status = IB_PMA_SAMPLE_STATUS_DONE;
+		qib_snapshot_counters(ppd, &ta, &tb, &tc, &td, &te);
+
+		cs->sword = ta - cs->sword;
+		cs->rword = tb - cs->rword;
+		cs->spkts = tc - cs->spkts;
+		cs->rpkts = td - cs->rpkts;
+		cs->xmit_wait = te - cs->xmit_wait;
+	}
+	spin_unlock_irqrestore(&ibp->lock, flags);
+}
+
+/*
+ * Note that the caller has the ibp->lock held.
+ */
+static void qib_set_cntr_6120_sample(struct qib_pportdata *ppd, u32 intv,
+				     u32 start)
+{
+	struct qib_chip_specific *cs = ppd->dd->cspec;
+
+	if (start && intv) {
+		cs->pma_sample_status = IB_PMA_SAMPLE_STATUS_STARTED;
+		mod_timer(&cs->pma_timer, jiffies + usecs_to_jiffies(start));
+	} else if (intv) {
+		cs->pma_sample_status = IB_PMA_SAMPLE_STATUS_RUNNING;
+		qib_snapshot_counters(ppd, &cs->sword, &cs->rword,
+				      &cs->spkts, &cs->rpkts, &cs->xmit_wait);
+		mod_timer(&cs->pma_timer, jiffies + usecs_to_jiffies(intv));
+	} else {
+		cs->pma_sample_status = IB_PMA_SAMPLE_STATUS_DONE;
+		cs->sword = 0;
+		cs->rword = 0;
+		cs->spkts = 0;
+		cs->rpkts = 0;
+		cs->xmit_wait = 0;
+	}
+}
+
+static u32 qib_6120_iblink_state(u64 ibcs)
+{
+	u32 state = (u32)SYM_FIELD(ibcs, IBCStatus, LinkState);
+
+	switch (state) {
+	case IB_6120_L_STATE_INIT:
+		state = IB_PORT_INIT;
+		break;
+	case IB_6120_L_STATE_ARM:
+		state = IB_PORT_ARMED;
+		break;
+	case IB_6120_L_STATE_ACTIVE:
+		/* fall through */
+	case IB_6120_L_STATE_ACT_DEFER:
+		state = IB_PORT_ACTIVE;
+		break;
+	default: /* fall through */
+	case IB_6120_L_STATE_DOWN:
+		state = IB_PORT_DOWN;
+		break;
+	}
+	return state;
+}
+
+/* returns the IBTA port state, rather than the IBC link training state */
+static u8 qib_6120_phys_portstate(u64 ibcs)
+{
+	u8 state = (u8)SYM_FIELD(ibcs, IBCStatus, LinkTrainingState);
+	return qib_6120_physportstate[state];
+}
+
+static int qib_6120_ib_updown(struct qib_pportdata *ppd, int ibup, u64 ibcs)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&ppd->lflags_lock, flags);
+	ppd->lflags &= ~QIBL_IB_FORCE_NOTIFY;
+	spin_unlock_irqrestore(&ppd->lflags_lock, flags);
+
+	if (ibup) {
+		if (ppd->dd->cspec->ibdeltainprog) {
+			ppd->dd->cspec->ibdeltainprog = 0;
+			ppd->dd->cspec->ibsymdelta +=
+				read_6120_creg32(ppd->dd, cr_ibsymbolerr) -
+					ppd->dd->cspec->ibsymsnap;
+			ppd->dd->cspec->iblnkerrdelta +=
+				read_6120_creg32(ppd->dd, cr_iblinkerrrecov) -
+					ppd->dd->cspec->iblnkerrsnap;
+		}
+		qib_hol_init(ppd);
+	} else {
+		ppd->dd->cspec->lli_counter = 0;
+		if (!ppd->dd->cspec->ibdeltainprog) {
+			ppd->dd->cspec->ibdeltainprog = 1;
+			ppd->dd->cspec->ibsymsnap =
+				read_6120_creg32(ppd->dd, cr_ibsymbolerr);
+			ppd->dd->cspec->iblnkerrsnap =
+				read_6120_creg32(ppd->dd, cr_iblinkerrrecov);
+		}
+		qib_hol_down(ppd);
+	}
+
+	qib_6120_setup_setextled(ppd, ibup);
+
+	return 0;
+}
+
+/* Does read/modify/write to appropriate registers to
+ * set output and direction bits selected by mask.
+ * these are in their canonical postions (e.g. lsb of
+ * dir will end up in D48 of extctrl on existing chips).
+ * returns contents of GP Inputs.
+ */
+static int gpio_6120_mod(struct qib_devdata *dd, u32 out, u32 dir, u32 mask)
+{
+	u64 read_val, new_out;
+	unsigned long flags;
+
+	if (mask) {
+		/* some bits being written, lock access to GPIO */
+		dir &= mask;
+		out &= mask;
+		spin_lock_irqsave(&dd->cspec->gpio_lock, flags);
+		dd->cspec->extctrl &= ~((u64)mask << SYM_LSB(EXTCtrl, GPIOOe));
+		dd->cspec->extctrl |= ((u64) dir << SYM_LSB(EXTCtrl, GPIOOe));
+		new_out = (dd->cspec->gpio_out & ~mask) | out;
+
+		qib_write_kreg(dd, kr_extctrl, dd->cspec->extctrl);
+		qib_write_kreg(dd, kr_gpio_out, new_out);
+		dd->cspec->gpio_out = new_out;
+		spin_unlock_irqrestore(&dd->cspec->gpio_lock, flags);
+	}
+	/*
+	 * It is unlikely that a read at this time would get valid
+	 * data on a pin whose direction line was set in the same
+	 * call to this function. We include the read here because
+	 * that allows us to potentially combine a change on one pin with
+	 * a read on another, and because the old code did something like
+	 * this.
+	 */
+	read_val = qib_read_kreg64(dd, kr_extstatus);
+	return SYM_FIELD(read_val, EXTStatus, GPIOIn);
+}
+
+/*
+ * Read fundamental info we need to use the chip.  These are
+ * the registers that describe chip capabilities, and are
+ * saved in shadow registers.
+ */
+static void get_6120_chip_params(struct qib_devdata *dd)
+{
+	u64 val;
+	u32 piobufs;
+	int mtu;
+
+	dd->uregbase = qib_read_kreg32(dd, kr_userregbase);
+
+	dd->rcvtidcnt = qib_read_kreg32(dd, kr_rcvtidcnt);
+	dd->rcvtidbase = qib_read_kreg32(dd, kr_rcvtidbase);
+	dd->rcvegrbase = qib_read_kreg32(dd, kr_rcvegrbase);
+	dd->palign = qib_read_kreg32(dd, kr_palign);
+	dd->piobufbase = qib_read_kreg64(dd, kr_sendpiobufbase);
+	dd->pio2k_bufbase = dd->piobufbase & 0xffffffff;
+
+	dd->rcvhdrcnt = qib_read_kreg32(dd, kr_rcvegrcnt);
+
+	val = qib_read_kreg64(dd, kr_sendpiosize);
+	dd->piosize2k = val & ~0U;
+	dd->piosize4k = val >> 32;
+
+	mtu = ib_mtu_enum_to_int(qib_ibmtu);
+	if (mtu == -1)
+		mtu = QIB_DEFAULT_MTU;
+	dd->pport->ibmtu = (u32)mtu;
+
+	val = qib_read_kreg64(dd, kr_sendpiobufcnt);
+	dd->piobcnt2k = val & ~0U;
+	dd->piobcnt4k = val >> 32;
+	dd->last_pio = dd->piobcnt4k + dd->piobcnt2k - 1;
+	/* these may be adjusted in init_chip_wc_pat() */
+	dd->pio2kbase = (u32 __iomem *)
+		(((char __iomem *)dd->kregbase) + dd->pio2k_bufbase);
+	if (dd->piobcnt4k) {
+		dd->pio4kbase = (u32 __iomem *)
+			(((char __iomem *) dd->kregbase) +
+			 (dd->piobufbase >> 32));
+		/*
+		 * 4K buffers take 2 pages; we use roundup just to be
+		 * paranoid; we calculate it once here, rather than on
+		 * ever buf allocate
+		 */
+		dd->align4k = ALIGN(dd->piosize4k, dd->palign);
+	}
+
+	piobufs = dd->piobcnt4k + dd->piobcnt2k;
+
+	dd->pioavregs = ALIGN(piobufs, sizeof(u64) * BITS_PER_BYTE / 2) /
+		(sizeof(u64) * BITS_PER_BYTE / 2);
+}
+
+/*
+ * The chip base addresses in cspec and cpspec have to be set
+ * after possible init_chip_wc_pat(), rather than in
+ * get_6120_chip_params(), so split out as separate function
+ */
+static void set_6120_baseaddrs(struct qib_devdata *dd)
+{
+	u32 cregbase;
+	cregbase = qib_read_kreg32(dd, kr_counterregbase);
+	dd->cspec->cregbase = (u64 __iomem *)
+		((char __iomem *) dd->kregbase + cregbase);
+
+	dd->egrtidbase = (u64 __iomem *)
+		((char __iomem *) dd->kregbase + dd->rcvegrbase);
+}
+
+/*
+ * Write the final few registers that depend on some of the
+ * init setup.  Done late in init, just before bringing up
+ * the serdes.
+ */
+static int qib_late_6120_initreg(struct qib_devdata *dd)
+{
+	int ret = 0;
+	u64 val;
+
+	qib_write_kreg(dd, kr_rcvhdrentsize, dd->rcvhdrentsize);
+	qib_write_kreg(dd, kr_rcvhdrsize, dd->rcvhdrsize);
+	qib_write_kreg(dd, kr_rcvhdrcnt, dd->rcvhdrcnt);
+	qib_write_kreg(dd, kr_sendpioavailaddr, dd->pioavailregs_phys);
+	val = qib_read_kreg64(dd, kr_sendpioavailaddr);
+	if (val != dd->pioavailregs_phys) {
+		qib_dev_err(dd,
+			"Catastrophic software error, SendPIOAvailAddr written as %lx, read back as %llx\n",
+			(unsigned long) dd->pioavailregs_phys,
+			(unsigned long long) val);
+		ret = -EINVAL;
+	}
+	return ret;
+}
+
+static int init_6120_variables(struct qib_devdata *dd)
+{
+	int ret = 0;
+	struct qib_pportdata *ppd;
+	u32 sbufs;
+
+	ppd = (struct qib_pportdata *)(dd + 1);
+	dd->pport = ppd;
+	dd->num_pports = 1;
+
+	dd->cspec = (struct qib_chip_specific *)(ppd + dd->num_pports);
+	ppd->cpspec = NULL; /* not used in this chip */
+
+	spin_lock_init(&dd->cspec->kernel_tid_lock);
+	spin_lock_init(&dd->cspec->user_tid_lock);
+	spin_lock_init(&dd->cspec->rcvmod_lock);
+	spin_lock_init(&dd->cspec->gpio_lock);
+
+	/* we haven't yet set QIB_PRESENT, so use read directly */
+	dd->revision = readq(&dd->kregbase[kr_revision]);
+
+	if ((dd->revision & 0xffffffffU) == 0xffffffffU) {
+		qib_dev_err(dd,
+			"Revision register read failure, giving up initialization\n");
+		ret = -ENODEV;
+		goto bail;
+	}
+	dd->flags |= QIB_PRESENT;  /* now register routines work */
+
+	dd->majrev = (u8) SYM_FIELD(dd->revision, Revision_R,
+				    ChipRevMajor);
+	dd->minrev = (u8) SYM_FIELD(dd->revision, Revision_R,
+				    ChipRevMinor);
+
+	get_6120_chip_params(dd);
+	pe_boardname(dd); /* fill in boardname */
+
+	/*
+	 * GPIO bits for TWSI data and clock,
+	 * used for serial EEPROM.
+	 */
+	dd->gpio_sda_num = _QIB_GPIO_SDA_NUM;
+	dd->gpio_scl_num = _QIB_GPIO_SCL_NUM;
+	dd->twsi_eeprom_dev = QIB_TWSI_NO_DEV;
+
+	if (qib_unordered_wc())
+		dd->flags |= QIB_PIO_FLUSH_WC;
+
+	/*
+	 * EEPROM error log 0 is TXE Parity errors. 1 is RXE Parity.
+	 * 2 is Some Misc, 3 is reserved for future.
+	 */
+	dd->eep_st_masks[0].hwerrs_to_log = HWE_MASK(TXEMemParityErr);
+
+	/* Ignore errors in PIO/PBC on systems with unordered write-combining */
+	if (qib_unordered_wc())
+		dd->eep_st_masks[0].hwerrs_to_log &= ~TXE_PIO_PARITY;
+
+	dd->eep_st_masks[1].hwerrs_to_log = HWE_MASK(RXEMemParityErr);
+
+	dd->eep_st_masks[2].errs_to_log = ERR_MASK(ResetNegated);
+
+	ret = qib_init_pportdata(ppd, dd, 0, 1);
+	if (ret)
+		goto bail;
+	ppd->link_width_supported = IB_WIDTH_1X | IB_WIDTH_4X;
+	ppd->link_speed_supported = QIB_IB_SDR;
+	ppd->link_width_enabled = IB_WIDTH_4X;
+	ppd->link_speed_enabled = ppd->link_speed_supported;
+	/* these can't change for this chip, so set once */
+	ppd->link_width_active = ppd->link_width_enabled;
+	ppd->link_speed_active = ppd->link_speed_enabled;
+	ppd->vls_supported = IB_VL_VL0;
+	ppd->vls_operational = ppd->vls_supported;
+
+	dd->rcvhdrentsize = QIB_RCVHDR_ENTSIZE;
+	dd->rcvhdrsize = QIB_DFLT_RCVHDRSIZE;
+	dd->rhf_offset = 0;
+
+	/* we always allocate at least 2048 bytes for eager buffers */
+	ret = ib_mtu_enum_to_int(qib_ibmtu);
+	dd->rcvegrbufsize = ret != -1 ? max(ret, 2048) : QIB_DEFAULT_MTU;
+	BUG_ON(!is_power_of_2(dd->rcvegrbufsize));
+	dd->rcvegrbufsize_shift = ilog2(dd->rcvegrbufsize);
+
+	qib_6120_tidtemplate(dd);
+
+	/*
+	 * We can request a receive interrupt for 1 or
+	 * more packets from current offset.  For now, we set this
+	 * up for a single packet.
+	 */
+	dd->rhdrhead_intr_off = 1ULL << 32;
+
+	/* setup the stats timer; the add_timer is done at end of init */
+	init_timer(&dd->stats_timer);
+	dd->stats_timer.function = qib_get_6120_faststats;
+	dd->stats_timer.data = (unsigned long) dd;
+
+	init_timer(&dd->cspec->pma_timer);
+	dd->cspec->pma_timer.function = pma_6120_timer;
+	dd->cspec->pma_timer.data = (unsigned long) ppd;
+
+	dd->ureg_align = qib_read_kreg32(dd, kr_palign);
+
+	dd->piosize2kmax_dwords = dd->piosize2k >> 2;
+	qib_6120_config_ctxts(dd);
+	qib_set_ctxtcnt(dd);
+
+	if (qib_wc_pat) {
+		ret = init_chip_wc_pat(dd, 0);
+		if (ret)
+			goto bail;
+	}
+	set_6120_baseaddrs(dd); /* set chip access pointers now */
+
+	ret = 0;
+	if (qib_mini_init)
+		goto bail;
+
+	qib_num_cfg_vls = 1; /* if any 6120's, only one VL */
+
+	ret = qib_create_ctxts(dd);
+	init_6120_cntrnames(dd);
+
+	/* use all of 4KB buffers for the kernel, otherwise 16 */
+	sbufs = dd->piobcnt4k ?  dd->piobcnt4k : 16;
+
+	dd->lastctxt_piobuf = dd->piobcnt2k + dd->piobcnt4k - sbufs;
+	dd->pbufsctxt = dd->lastctxt_piobuf /
+		(dd->cfgctxts - dd->first_user_ctxt);
+
+	if (ret)
+		goto bail;
+bail:
+	return ret;
+}
+
+/*
+ * For this chip, we want to use the same buffer every time
+ * when we are trying to bring the link up (they are always VL15
+ * packets).  At that link state the packet should always go out immediately
+ * (or at least be discarded at the tx interface if the link is down).
+ * If it doesn't, and the buffer isn't available, that means some other
+ * sender has gotten ahead of us, and is preventing our packet from going
+ * out.  In that case, we flush all packets, and try again.  If that still
+ * fails, we fail the request, and hope things work the next time around.
+ *
+ * We don't need very complicated heuristics on whether the packet had
+ * time to go out or not, since even at SDR 1X, it goes out in very short
+ * time periods, covered by the chip reads done here and as part of the
+ * flush.
+ */
+static u32 __iomem *get_6120_link_buf(struct qib_pportdata *ppd, u32 *bnum)
+{
+	u32 __iomem *buf;
+	u32 lbuf = ppd->dd->piobcnt2k + ppd->dd->piobcnt4k - 1;
+
+	/*
+	 * always blip to get avail list updated, since it's almost
+	 * always needed, and is fairly cheap.
+	 */
+	sendctrl_6120_mod(ppd->dd->pport, QIB_SENDCTRL_AVAIL_BLIP);
+	qib_read_kreg64(ppd->dd, kr_scratch); /* extra chip flush */
+	buf = qib_getsendbuf_range(ppd->dd, bnum, lbuf, lbuf);
+	if (buf)
+		goto done;
+
+	sendctrl_6120_mod(ppd, QIB_SENDCTRL_DISARM_ALL | QIB_SENDCTRL_FLUSH |
+			  QIB_SENDCTRL_AVAIL_BLIP);
+	ppd->dd->upd_pio_shadow  = 1; /* update our idea of what's busy */
+	qib_read_kreg64(ppd->dd, kr_scratch); /* extra chip flush */
+	buf = qib_getsendbuf_range(ppd->dd, bnum, lbuf, lbuf);
+done:
+	return buf;
+}
+
+static u32 __iomem *qib_6120_getsendbuf(struct qib_pportdata *ppd, u64 pbc,
+					u32 *pbufnum)
+{
+	u32 first, last, plen = pbc & QIB_PBC_LENGTH_MASK;
+	struct qib_devdata *dd = ppd->dd;
+	u32 __iomem *buf;
+
+	if (((pbc >> 32) & PBC_6120_VL15_SEND_CTRL) &&
+		!(ppd->lflags & (QIBL_IB_AUTONEG_INPROG | QIBL_LINKACTIVE)))
+		buf = get_6120_link_buf(ppd, pbufnum);
+	else {
+
+		if ((plen + 1) > dd->piosize2kmax_dwords)
+			first = dd->piobcnt2k;
+		else
+			first = 0;
+		/* try 4k if all 2k busy, so same last for both sizes */
+		last = dd->piobcnt2k + dd->piobcnt4k - 1;
+		buf = qib_getsendbuf_range(dd, pbufnum, first, last);
+	}
+	return buf;
+}
+
+static int init_sdma_6120_regs(struct qib_pportdata *ppd)
+{
+	return -ENODEV;
+}
+
+static u16 qib_sdma_6120_gethead(struct qib_pportdata *ppd)
+{
+	return 0;
+}
+
+static int qib_sdma_6120_busy(struct qib_pportdata *ppd)
+{
+	return 0;
+}
+
+static void qib_sdma_update_6120_tail(struct qib_pportdata *ppd, u16 tail)
+{
+}
+
+static void qib_6120_sdma_sendctrl(struct qib_pportdata *ppd, unsigned op)
+{
+}
+
+static void qib_sdma_set_6120_desc_cnt(struct qib_pportdata *ppd, unsigned cnt)
+{
+}
+
+/*
+ * the pbc doesn't need a VL15 indicator, but we need it for link_buf.
+ * The chip ignores the bit if set.
+ */
+static u32 qib_6120_setpbc_control(struct qib_pportdata *ppd, u32 plen,
+				   u8 srate, u8 vl)
+{
+	return vl == 15 ? PBC_6120_VL15_SEND_CTRL : 0;
+}
+
+static void qib_6120_initvl15_bufs(struct qib_devdata *dd)
+{
+}
+
+static void qib_6120_init_ctxt(struct qib_ctxtdata *rcd)
+{
+	rcd->rcvegrcnt = rcd->dd->rcvhdrcnt;
+	rcd->rcvegr_tid_base = rcd->ctxt * rcd->rcvegrcnt;
+}
+
+static void qib_6120_txchk_change(struct qib_devdata *dd, u32 start,
+	u32 len, u32 avail, struct qib_ctxtdata *rcd)
+{
+}
+
+static void writescratch(struct qib_devdata *dd, u32 val)
+{
+	(void) qib_write_kreg(dd, kr_scratch, val);
+}
+
+static int qib_6120_tempsense_rd(struct qib_devdata *dd, int regnum)
+{
+	return -ENXIO;
+}
+
+#ifdef CONFIG_INFINIBAND_QIB_DCA
+static int qib_6120_notify_dca(struct qib_devdata *dd, unsigned long event)
+{
+	return 0;
+}
+#endif
+
+/* Dummy function, as 6120 boards never disable EEPROM Write */
+static int qib_6120_eeprom_wen(struct qib_devdata *dd, int wen)
+{
+	return 1;
+}
+
+/**
+ * qib_init_iba6120_funcs - set up the chip-specific function pointers
+ * @pdev: pci_dev of the qlogic_ib device
+ * @ent: pci_device_id matching this chip
+ *
+ * This is global, and is called directly at init to set up the
+ * chip-specific function pointers for later use.
+ *
+ * It also allocates/partially-inits the qib_devdata struct for
+ * this device.
+ */
+struct qib_devdata *qib_init_iba6120_funcs(struct pci_dev *pdev,
+					   const struct pci_device_id *ent)
+{
+	struct qib_devdata *dd;
+	int ret;
+
+	dd = qib_alloc_devdata(pdev, sizeof(struct qib_pportdata) +
+			       sizeof(struct qib_chip_specific));
+	if (IS_ERR(dd))
+		goto bail;
+
+	dd->f_bringup_serdes    = qib_6120_bringup_serdes;
+	dd->f_cleanup           = qib_6120_setup_cleanup;
+	dd->f_clear_tids        = qib_6120_clear_tids;
+	dd->f_free_irq          = qib_6120_free_irq;
+	dd->f_get_base_info     = qib_6120_get_base_info;
+	dd->f_get_msgheader     = qib_6120_get_msgheader;
+	dd->f_getsendbuf        = qib_6120_getsendbuf;
+	dd->f_gpio_mod          = gpio_6120_mod;
+	dd->f_eeprom_wen	= qib_6120_eeprom_wen;
+	dd->f_hdrqempty         = qib_6120_hdrqempty;
+	dd->f_ib_updown         = qib_6120_ib_updown;
+	dd->f_init_ctxt         = qib_6120_init_ctxt;
+	dd->f_initvl15_bufs     = qib_6120_initvl15_bufs;
+	dd->f_intr_fallback     = qib_6120_nointr_fallback;
+	dd->f_late_initreg      = qib_late_6120_initreg;
+	dd->f_setpbc_control    = qib_6120_setpbc_control;
+	dd->f_portcntr          = qib_portcntr_6120;
+	dd->f_put_tid           = (dd->minrev >= 2) ?
+				      qib_6120_put_tid_2 :
+				      qib_6120_put_tid;
+	dd->f_quiet_serdes      = qib_6120_quiet_serdes;
+	dd->f_rcvctrl           = rcvctrl_6120_mod;
+	dd->f_read_cntrs        = qib_read_6120cntrs;
+	dd->f_read_portcntrs    = qib_read_6120portcntrs;
+	dd->f_reset             = qib_6120_setup_reset;
+	dd->f_init_sdma_regs    = init_sdma_6120_regs;
+	dd->f_sdma_busy         = qib_sdma_6120_busy;
+	dd->f_sdma_gethead      = qib_sdma_6120_gethead;
+	dd->f_sdma_sendctrl     = qib_6120_sdma_sendctrl;
+	dd->f_sdma_set_desc_cnt = qib_sdma_set_6120_desc_cnt;
+	dd->f_sdma_update_tail  = qib_sdma_update_6120_tail;
+	dd->f_sendctrl          = sendctrl_6120_mod;
+	dd->f_set_armlaunch     = qib_set_6120_armlaunch;
+	dd->f_set_cntr_sample   = qib_set_cntr_6120_sample;
+	dd->f_iblink_state      = qib_6120_iblink_state;
+	dd->f_ibphys_portstate  = qib_6120_phys_portstate;
+	dd->f_get_ib_cfg        = qib_6120_get_ib_cfg;
+	dd->f_set_ib_cfg        = qib_6120_set_ib_cfg;
+	dd->f_set_ib_loopback   = qib_6120_set_loopback;
+	dd->f_set_intr_state    = qib_6120_set_intr_state;
+	dd->f_setextled         = qib_6120_setup_setextled;
+	dd->f_txchk_change      = qib_6120_txchk_change;
+	dd->f_update_usrhead    = qib_update_6120_usrhead;
+	dd->f_wantpiobuf_intr   = qib_wantpiobuf_6120_intr;
+	dd->f_xgxs_reset        = qib_6120_xgxs_reset;
+	dd->f_writescratch      = writescratch;
+	dd->f_tempsense_rd	= qib_6120_tempsense_rd;
+#ifdef CONFIG_INFINIBAND_QIB_DCA
+	dd->f_notify_dca = qib_6120_notify_dca;
+#endif
+	/*
+	 * Do remaining pcie setup and save pcie values in dd.
+	 * Any error printing is already done by the init code.
+	 * On return, we have the chip mapped and accessible,
+	 * but chip registers are not set up until start of
+	 * init_6120_variables.
+	 */
+	ret = qib_pcie_ddinit(dd, pdev, ent);
+	if (ret < 0)
+		goto bail_free;
+
+	/* initialize chip-specific variables */
+	ret = init_6120_variables(dd);
+	if (ret)
+		goto bail_cleanup;
+
+	if (qib_mini_init)
+		goto bail;
+
+	if (qib_pcie_params(dd, 8, NULL, NULL))
+		qib_dev_err(dd,
+			"Failed to setup PCIe or interrupts; continuing anyway\n");
+	dd->cspec->irq = pdev->irq; /* save IRQ */
+
+	/* clear diagctrl register, in case diags were running and crashed */
+	qib_write_kreg(dd, kr_hwdiagctrl, 0);
+
+	if (qib_read_kreg64(dd, kr_hwerrstatus) &
+	    QLOGIC_IB_HWE_SERDESPLLFAILED)
+		qib_write_kreg(dd, kr_hwerrclear,
+			       QLOGIC_IB_HWE_SERDESPLLFAILED);
+
+	/* setup interrupt handler (interrupt type handled above) */
+	qib_setup_6120_interrupt(dd);
+	/* Note that qpn_mask is set by qib_6120_config_ctxts() first */
+	qib_6120_init_hwerrors(dd);
+
+	goto bail;
+
+bail_cleanup:
+	qib_pcie_ddcleanup(dd);
+bail_free:
+	qib_free_devdata(dd);
+	dd = ERR_PTR(ret);
+bail:
+	return dd;
+}
diff --git a/drivers/infiniband/hw/qib/qib_iba7220.c b/drivers/infiniband/hw/qib/qib_iba7220.c
new file mode 100644
index 0000000..7dec89f
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_iba7220.c
@@ -0,0 +1,4659 @@
+/*
+ * Copyright (c) 2006, 2007, 2008, 2009, 2010 QLogic Corporation.
+ * All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * This file contains all of the code that is specific to the
+ * QLogic_IB 7220 chip (except that specific to the SerDes)
+ */
+
+#include <linux/interrupt.h>
+#include <linux/pci.h>
+#include <linux/delay.h>
+#include <linux/module.h>
+#include <linux/io.h>
+#include <rdma/ib_verbs.h>
+
+#include "qib.h"
+#include "qib_7220.h"
+
+static void qib_setup_7220_setextled(struct qib_pportdata *, u32);
+static void qib_7220_handle_hwerrors(struct qib_devdata *, char *, size_t);
+static void sendctrl_7220_mod(struct qib_pportdata *ppd, u32 op);
+static u32 qib_7220_iblink_state(u64);
+static u8 qib_7220_phys_portstate(u64);
+static void qib_sdma_update_7220_tail(struct qib_pportdata *, u16);
+static void qib_set_ib_7220_lstate(struct qib_pportdata *, u16, u16);
+
+/*
+ * This file contains almost all the chip-specific register information and
+ * access functions for the QLogic QLogic_IB 7220 PCI-Express chip, with the
+ * exception of SerDes support, which in in qib_sd7220.c.
+ */
+
+/* Below uses machine-generated qib_chipnum_regs.h file */
+#define KREG_IDX(regname) (QIB_7220_##regname##_OFFS / sizeof(u64))
+
+/* Use defines to tie machine-generated names to lower-case names */
+#define kr_control KREG_IDX(Control)
+#define kr_counterregbase KREG_IDX(CntrRegBase)
+#define kr_errclear KREG_IDX(ErrClear)
+#define kr_errmask KREG_IDX(ErrMask)
+#define kr_errstatus KREG_IDX(ErrStatus)
+#define kr_extctrl KREG_IDX(EXTCtrl)
+#define kr_extstatus KREG_IDX(EXTStatus)
+#define kr_gpio_clear KREG_IDX(GPIOClear)
+#define kr_gpio_mask KREG_IDX(GPIOMask)
+#define kr_gpio_out KREG_IDX(GPIOOut)
+#define kr_gpio_status KREG_IDX(GPIOStatus)
+#define kr_hrtbt_guid KREG_IDX(HRTBT_GUID)
+#define kr_hwdiagctrl KREG_IDX(HwDiagCtrl)
+#define kr_hwerrclear KREG_IDX(HwErrClear)
+#define kr_hwerrmask KREG_IDX(HwErrMask)
+#define kr_hwerrstatus KREG_IDX(HwErrStatus)
+#define kr_ibcctrl KREG_IDX(IBCCtrl)
+#define kr_ibcddrctrl KREG_IDX(IBCDDRCtrl)
+#define kr_ibcddrstatus KREG_IDX(IBCDDRStatus)
+#define kr_ibcstatus KREG_IDX(IBCStatus)
+#define kr_ibserdesctrl KREG_IDX(IBSerDesCtrl)
+#define kr_intclear KREG_IDX(IntClear)
+#define kr_intmask KREG_IDX(IntMask)
+#define kr_intstatus KREG_IDX(IntStatus)
+#define kr_ncmodectrl KREG_IDX(IBNCModeCtrl)
+#define kr_palign KREG_IDX(PageAlign)
+#define kr_partitionkey KREG_IDX(RcvPartitionKey)
+#define kr_portcnt KREG_IDX(PortCnt)
+#define kr_rcvbthqp KREG_IDX(RcvBTHQP)
+#define kr_rcvctrl KREG_IDX(RcvCtrl)
+#define kr_rcvegrbase KREG_IDX(RcvEgrBase)
+#define kr_rcvegrcnt KREG_IDX(RcvEgrCnt)
+#define kr_rcvhdrcnt KREG_IDX(RcvHdrCnt)
+#define kr_rcvhdrentsize KREG_IDX(RcvHdrEntSize)
+#define kr_rcvhdrsize KREG_IDX(RcvHdrSize)
+#define kr_rcvpktledcnt KREG_IDX(RcvPktLEDCnt)
+#define kr_rcvtidbase KREG_IDX(RcvTIDBase)
+#define kr_rcvtidcnt KREG_IDX(RcvTIDCnt)
+#define kr_revision KREG_IDX(Revision)
+#define kr_scratch KREG_IDX(Scratch)
+#define kr_sendbuffererror KREG_IDX(SendBufErr0)
+#define kr_sendctrl KREG_IDX(SendCtrl)
+#define kr_senddmabase KREG_IDX(SendDmaBase)
+#define kr_senddmabufmask0 KREG_IDX(SendDmaBufMask0)
+#define kr_senddmabufmask1 (KREG_IDX(SendDmaBufMask0) + 1)
+#define kr_senddmabufmask2 (KREG_IDX(SendDmaBufMask0) + 2)
+#define kr_senddmahead KREG_IDX(SendDmaHead)
+#define kr_senddmaheadaddr KREG_IDX(SendDmaHeadAddr)
+#define kr_senddmalengen KREG_IDX(SendDmaLenGen)
+#define kr_senddmastatus KREG_IDX(SendDmaStatus)
+#define kr_senddmatail KREG_IDX(SendDmaTail)
+#define kr_sendpioavailaddr KREG_IDX(SendBufAvailAddr)
+#define kr_sendpiobufbase KREG_IDX(SendBufBase)
+#define kr_sendpiobufcnt KREG_IDX(SendBufCnt)
+#define kr_sendpiosize KREG_IDX(SendBufSize)
+#define kr_sendregbase KREG_IDX(SendRegBase)
+#define kr_userregbase KREG_IDX(UserRegBase)
+#define kr_xgxs_cfg KREG_IDX(XGXSCfg)
+
+/* These must only be written via qib_write_kreg_ctxt() */
+#define kr_rcvhdraddr KREG_IDX(RcvHdrAddr0)
+#define kr_rcvhdrtailaddr KREG_IDX(RcvHdrTailAddr0)
+
+
+#define CREG_IDX(regname) ((QIB_7220_##regname##_OFFS - \
+			QIB_7220_LBIntCnt_OFFS) / sizeof(u64))
+
+#define cr_badformat CREG_IDX(RxVersionErrCnt)
+#define cr_erricrc CREG_IDX(RxICRCErrCnt)
+#define cr_errlink CREG_IDX(RxLinkMalformCnt)
+#define cr_errlpcrc CREG_IDX(RxLPCRCErrCnt)
+#define cr_errpkey CREG_IDX(RxPKeyMismatchCnt)
+#define cr_rcvflowctrl_err CREG_IDX(RxFlowCtrlViolCnt)
+#define cr_err_rlen CREG_IDX(RxLenErrCnt)
+#define cr_errslen CREG_IDX(TxLenErrCnt)
+#define cr_errtidfull CREG_IDX(RxTIDFullErrCnt)
+#define cr_errtidvalid CREG_IDX(RxTIDValidErrCnt)
+#define cr_errvcrc CREG_IDX(RxVCRCErrCnt)
+#define cr_ibstatuschange CREG_IDX(IBStatusChangeCnt)
+#define cr_lbint CREG_IDX(LBIntCnt)
+#define cr_invalidrlen CREG_IDX(RxMaxMinLenErrCnt)
+#define cr_invalidslen CREG_IDX(TxMaxMinLenErrCnt)
+#define cr_lbflowstall CREG_IDX(LBFlowStallCnt)
+#define cr_pktrcv CREG_IDX(RxDataPktCnt)
+#define cr_pktrcvflowctrl CREG_IDX(RxFlowPktCnt)
+#define cr_pktsend CREG_IDX(TxDataPktCnt)
+#define cr_pktsendflow CREG_IDX(TxFlowPktCnt)
+#define cr_portovfl CREG_IDX(RxP0HdrEgrOvflCnt)
+#define cr_rcvebp CREG_IDX(RxEBPCnt)
+#define cr_rcvovfl CREG_IDX(RxBufOvflCnt)
+#define cr_senddropped CREG_IDX(TxDroppedPktCnt)
+#define cr_sendstall CREG_IDX(TxFlowStallCnt)
+#define cr_sendunderrun CREG_IDX(TxUnderrunCnt)
+#define cr_wordrcv CREG_IDX(RxDwordCnt)
+#define cr_wordsend CREG_IDX(TxDwordCnt)
+#define cr_txunsupvl CREG_IDX(TxUnsupVLErrCnt)
+#define cr_rxdroppkt CREG_IDX(RxDroppedPktCnt)
+#define cr_iblinkerrrecov CREG_IDX(IBLinkErrRecoveryCnt)
+#define cr_iblinkdown CREG_IDX(IBLinkDownedCnt)
+#define cr_ibsymbolerr CREG_IDX(IBSymbolErrCnt)
+#define cr_vl15droppedpkt CREG_IDX(RxVL15DroppedPktCnt)
+#define cr_rxotherlocalphyerr CREG_IDX(RxOtherLocalPhyErrCnt)
+#define cr_excessbufferovfl CREG_IDX(ExcessBufferOvflCnt)
+#define cr_locallinkintegrityerr CREG_IDX(LocalLinkIntegrityErrCnt)
+#define cr_rxvlerr CREG_IDX(RxVlErrCnt)
+#define cr_rxdlidfltr CREG_IDX(RxDlidFltrCnt)
+#define cr_psstat CREG_IDX(PSStat)
+#define cr_psstart CREG_IDX(PSStart)
+#define cr_psinterval CREG_IDX(PSInterval)
+#define cr_psrcvdatacount CREG_IDX(PSRcvDataCount)
+#define cr_psrcvpktscount CREG_IDX(PSRcvPktsCount)
+#define cr_psxmitdatacount CREG_IDX(PSXmitDataCount)
+#define cr_psxmitpktscount CREG_IDX(PSXmitPktsCount)
+#define cr_psxmitwaitcount CREG_IDX(PSXmitWaitCount)
+#define cr_txsdmadesc CREG_IDX(TxSDmaDescCnt)
+#define cr_pcieretrydiag CREG_IDX(PcieRetryBufDiagQwordCnt)
+
+#define SYM_RMASK(regname, fldname) ((u64)              \
+	QIB_7220_##regname##_##fldname##_RMASK)
+#define SYM_MASK(regname, fldname) ((u64)               \
+	QIB_7220_##regname##_##fldname##_RMASK <<       \
+	 QIB_7220_##regname##_##fldname##_LSB)
+#define SYM_LSB(regname, fldname) (QIB_7220_##regname##_##fldname##_LSB)
+#define SYM_FIELD(value, regname, fldname) ((u64) \
+	(((value) >> SYM_LSB(regname, fldname)) & \
+	 SYM_RMASK(regname, fldname)))
+#define ERR_MASK(fldname) SYM_MASK(ErrMask, fldname##Mask)
+#define HWE_MASK(fldname) SYM_MASK(HwErrMask, fldname##Mask)
+
+/* ibcctrl bits */
+#define QLOGIC_IB_IBCC_LINKINITCMD_DISABLE 1
+/* cycle through TS1/TS2 till OK */
+#define QLOGIC_IB_IBCC_LINKINITCMD_POLL 2
+/* wait for TS1, then go on */
+#define QLOGIC_IB_IBCC_LINKINITCMD_SLEEP 3
+#define QLOGIC_IB_IBCC_LINKINITCMD_SHIFT 16
+
+#define QLOGIC_IB_IBCC_LINKCMD_DOWN 1           /* move to 0x11 */
+#define QLOGIC_IB_IBCC_LINKCMD_ARMED 2          /* move to 0x21 */
+#define QLOGIC_IB_IBCC_LINKCMD_ACTIVE 3 /* move to 0x31 */
+
+#define BLOB_7220_IBCHG 0x81
+
+/*
+ * We could have a single register get/put routine, that takes a group type,
+ * but this is somewhat clearer and cleaner.  It also gives us some error
+ * checking.  64 bit register reads should always work, but are inefficient
+ * on opteron (the northbridge always generates 2 separate HT 32 bit reads),
+ * so we use kreg32 wherever possible.  User register and counter register
+ * reads are always 32 bit reads, so only one form of those routines.
+ */
+
+/**
+ * qib_read_ureg32 - read 32-bit virtualized per-context register
+ * @dd: device
+ * @regno: register number
+ * @ctxt: context number
+ *
+ * Return the contents of a register that is virtualized to be per context.
+ * Returns -1 on errors (not distinguishable from valid contents at
+ * runtime; we may add a separate error variable at some point).
+ */
+static inline u32 qib_read_ureg32(const struct qib_devdata *dd,
+				  enum qib_ureg regno, int ctxt)
+{
+	if (!dd->kregbase || !(dd->flags & QIB_PRESENT))
+		return 0;
+
+	if (dd->userbase)
+		return readl(regno + (u64 __iomem *)
+			     ((char __iomem *)dd->userbase +
+			      dd->ureg_align * ctxt));
+	else
+		return readl(regno + (u64 __iomem *)
+			     (dd->uregbase +
+			      (char __iomem *)dd->kregbase +
+			      dd->ureg_align * ctxt));
+}
+
+/**
+ * qib_write_ureg - write 32-bit virtualized per-context register
+ * @dd: device
+ * @regno: register number
+ * @value: value
+ * @ctxt: context
+ *
+ * Write the contents of a register that is virtualized to be per context.
+ */
+static inline void qib_write_ureg(const struct qib_devdata *dd,
+				  enum qib_ureg regno, u64 value, int ctxt)
+{
+	u64 __iomem *ubase;
+
+	if (dd->userbase)
+		ubase = (u64 __iomem *)
+			((char __iomem *) dd->userbase +
+			 dd->ureg_align * ctxt);
+	else
+		ubase = (u64 __iomem *)
+			(dd->uregbase +
+			 (char __iomem *) dd->kregbase +
+			 dd->ureg_align * ctxt);
+
+	if (dd->kregbase && (dd->flags & QIB_PRESENT))
+		writeq(value, &ubase[regno]);
+}
+
+/**
+ * qib_write_kreg_ctxt - write a device's per-ctxt 64-bit kernel register
+ * @dd: the qlogic_ib device
+ * @regno: the register number to write
+ * @ctxt: the context containing the register
+ * @value: the value to write
+ */
+static inline void qib_write_kreg_ctxt(const struct qib_devdata *dd,
+				       const u16 regno, unsigned ctxt,
+				       u64 value)
+{
+	qib_write_kreg(dd, regno + ctxt, value);
+}
+
+static inline void write_7220_creg(const struct qib_devdata *dd,
+				   u16 regno, u64 value)
+{
+	if (dd->cspec->cregbase && (dd->flags & QIB_PRESENT))
+		writeq(value, &dd->cspec->cregbase[regno]);
+}
+
+static inline u64 read_7220_creg(const struct qib_devdata *dd, u16 regno)
+{
+	if (!dd->cspec->cregbase || !(dd->flags & QIB_PRESENT))
+		return 0;
+	return readq(&dd->cspec->cregbase[regno]);
+}
+
+static inline u32 read_7220_creg32(const struct qib_devdata *dd, u16 regno)
+{
+	if (!dd->cspec->cregbase || !(dd->flags & QIB_PRESENT))
+		return 0;
+	return readl(&dd->cspec->cregbase[regno]);
+}
+
+/* kr_revision bits */
+#define QLOGIC_IB_R_EMULATORREV_MASK ((1ULL << 22) - 1)
+#define QLOGIC_IB_R_EMULATORREV_SHIFT 40
+
+/* kr_control bits */
+#define QLOGIC_IB_C_RESET (1U << 7)
+
+/* kr_intstatus, kr_intclear, kr_intmask bits */
+#define QLOGIC_IB_I_RCVURG_MASK ((1ULL << 17) - 1)
+#define QLOGIC_IB_I_RCVURG_SHIFT 32
+#define QLOGIC_IB_I_RCVAVAIL_MASK ((1ULL << 17) - 1)
+#define QLOGIC_IB_I_RCVAVAIL_SHIFT 0
+#define QLOGIC_IB_I_SERDESTRIMDONE (1ULL << 27)
+
+#define QLOGIC_IB_C_FREEZEMODE 0x00000002
+#define QLOGIC_IB_C_LINKENABLE 0x00000004
+
+#define QLOGIC_IB_I_SDMAINT             0x8000000000000000ULL
+#define QLOGIC_IB_I_SDMADISABLED        0x4000000000000000ULL
+#define QLOGIC_IB_I_ERROR               0x0000000080000000ULL
+#define QLOGIC_IB_I_SPIOSENT            0x0000000040000000ULL
+#define QLOGIC_IB_I_SPIOBUFAVAIL        0x0000000020000000ULL
+#define QLOGIC_IB_I_GPIO                0x0000000010000000ULL
+
+/* variables for sanity checking interrupt and errors */
+#define QLOGIC_IB_I_BITSEXTANT \
+		(QLOGIC_IB_I_SDMAINT | QLOGIC_IB_I_SDMADISABLED | \
+		(QLOGIC_IB_I_RCVURG_MASK << QLOGIC_IB_I_RCVURG_SHIFT) | \
+		(QLOGIC_IB_I_RCVAVAIL_MASK << \
+		 QLOGIC_IB_I_RCVAVAIL_SHIFT) | \
+		QLOGIC_IB_I_ERROR | QLOGIC_IB_I_SPIOSENT | \
+		QLOGIC_IB_I_SPIOBUFAVAIL | QLOGIC_IB_I_GPIO | \
+		QLOGIC_IB_I_SERDESTRIMDONE)
+
+#define IB_HWE_BITSEXTANT \
+	       (HWE_MASK(RXEMemParityErr) | \
+		HWE_MASK(TXEMemParityErr) | \
+		(QLOGIC_IB_HWE_PCIEMEMPARITYERR_MASK <<	 \
+		 QLOGIC_IB_HWE_PCIEMEMPARITYERR_SHIFT) | \
+		QLOGIC_IB_HWE_PCIE1PLLFAILED | \
+		QLOGIC_IB_HWE_PCIE0PLLFAILED | \
+		QLOGIC_IB_HWE_PCIEPOISONEDTLP | \
+		QLOGIC_IB_HWE_PCIECPLTIMEOUT | \
+		QLOGIC_IB_HWE_PCIEBUSPARITYXTLH | \
+		QLOGIC_IB_HWE_PCIEBUSPARITYXADM | \
+		QLOGIC_IB_HWE_PCIEBUSPARITYRADM | \
+		HWE_MASK(PowerOnBISTFailed) |	  \
+		QLOGIC_IB_HWE_COREPLL_FBSLIP | \
+		QLOGIC_IB_HWE_COREPLL_RFSLIP | \
+		QLOGIC_IB_HWE_SERDESPLLFAILED | \
+		HWE_MASK(IBCBusToSPCParityErr) | \
+		HWE_MASK(IBCBusFromSPCParityErr) | \
+		QLOGIC_IB_HWE_PCIECPLDATAQUEUEERR | \
+		QLOGIC_IB_HWE_PCIECPLHDRQUEUEERR | \
+		QLOGIC_IB_HWE_SDMAMEMREADERR | \
+		QLOGIC_IB_HWE_CLK_UC_PLLNOTLOCKED | \
+		QLOGIC_IB_HWE_PCIESERDESQ0PCLKNOTDETECT | \
+		QLOGIC_IB_HWE_PCIESERDESQ1PCLKNOTDETECT | \
+		QLOGIC_IB_HWE_PCIESERDESQ2PCLKNOTDETECT | \
+		QLOGIC_IB_HWE_PCIESERDESQ3PCLKNOTDETECT | \
+		QLOGIC_IB_HWE_DDSRXEQMEMORYPARITYERR | \
+		QLOGIC_IB_HWE_IB_UC_MEMORYPARITYERR | \
+		QLOGIC_IB_HWE_PCIE_UC_OCT0MEMORYPARITYERR | \
+		QLOGIC_IB_HWE_PCIE_UC_OCT1MEMORYPARITYERR)
+
+#define IB_E_BITSEXTANT							\
+	(ERR_MASK(RcvFormatErr) | ERR_MASK(RcvVCRCErr) |		\
+	 ERR_MASK(RcvICRCErr) | ERR_MASK(RcvMinPktLenErr) |		\
+	 ERR_MASK(RcvMaxPktLenErr) | ERR_MASK(RcvLongPktLenErr) |	\
+	 ERR_MASK(RcvShortPktLenErr) | ERR_MASK(RcvUnexpectedCharErr) | \
+	 ERR_MASK(RcvUnsupportedVLErr) | ERR_MASK(RcvEBPErr) |		\
+	 ERR_MASK(RcvIBFlowErr) | ERR_MASK(RcvBadVersionErr) |		\
+	 ERR_MASK(RcvEgrFullErr) | ERR_MASK(RcvHdrFullErr) |		\
+	 ERR_MASK(RcvBadTidErr) | ERR_MASK(RcvHdrLenErr) |		\
+	 ERR_MASK(RcvHdrErr) | ERR_MASK(RcvIBLostLinkErr) |		\
+	 ERR_MASK(SendSpecialTriggerErr) |				\
+	 ERR_MASK(SDmaDisabledErr) | ERR_MASK(SendMinPktLenErr) |	\
+	 ERR_MASK(SendMaxPktLenErr) | ERR_MASK(SendUnderRunErr) |	\
+	 ERR_MASK(SendPktLenErr) | ERR_MASK(SendDroppedSmpPktErr) |	\
+	 ERR_MASK(SendDroppedDataPktErr) |				\
+	 ERR_MASK(SendPioArmLaunchErr) |				\
+	 ERR_MASK(SendUnexpectedPktNumErr) |				\
+	 ERR_MASK(SendUnsupportedVLErr) | ERR_MASK(SendBufMisuseErr) |	\
+	 ERR_MASK(SDmaGenMismatchErr) | ERR_MASK(SDmaOutOfBoundErr) |	\
+	 ERR_MASK(SDmaTailOutOfBoundErr) | ERR_MASK(SDmaBaseErr) |	\
+	 ERR_MASK(SDma1stDescErr) | ERR_MASK(SDmaRpyTagErr) |		\
+	 ERR_MASK(SDmaDwEnErr) | ERR_MASK(SDmaMissingDwErr) |		\
+	 ERR_MASK(SDmaUnexpDataErr) |					\
+	 ERR_MASK(IBStatusChanged) | ERR_MASK(InvalidAddrErr) |		\
+	 ERR_MASK(ResetNegated) | ERR_MASK(HardwareErr) |		\
+	 ERR_MASK(SDmaDescAddrMisalignErr) |				\
+	 ERR_MASK(InvalidEEPCmd))
+
+/* kr_hwerrclear, kr_hwerrmask, kr_hwerrstatus, bits */
+#define QLOGIC_IB_HWE_PCIEMEMPARITYERR_MASK  0x00000000000000ffULL
+#define QLOGIC_IB_HWE_PCIEMEMPARITYERR_SHIFT 0
+#define QLOGIC_IB_HWE_PCIEPOISONEDTLP      0x0000000010000000ULL
+#define QLOGIC_IB_HWE_PCIECPLTIMEOUT       0x0000000020000000ULL
+#define QLOGIC_IB_HWE_PCIEBUSPARITYXTLH    0x0000000040000000ULL
+#define QLOGIC_IB_HWE_PCIEBUSPARITYXADM    0x0000000080000000ULL
+#define QLOGIC_IB_HWE_PCIEBUSPARITYRADM    0x0000000100000000ULL
+#define QLOGIC_IB_HWE_COREPLL_FBSLIP       0x0080000000000000ULL
+#define QLOGIC_IB_HWE_COREPLL_RFSLIP       0x0100000000000000ULL
+#define QLOGIC_IB_HWE_PCIE1PLLFAILED       0x0400000000000000ULL
+#define QLOGIC_IB_HWE_PCIE0PLLFAILED       0x0800000000000000ULL
+#define QLOGIC_IB_HWE_SERDESPLLFAILED      0x1000000000000000ULL
+/* specific to this chip */
+#define QLOGIC_IB_HWE_PCIECPLDATAQUEUEERR         0x0000000000000040ULL
+#define QLOGIC_IB_HWE_PCIECPLHDRQUEUEERR          0x0000000000000080ULL
+#define QLOGIC_IB_HWE_SDMAMEMREADERR              0x0000000010000000ULL
+#define QLOGIC_IB_HWE_CLK_UC_PLLNOTLOCKED          0x2000000000000000ULL
+#define QLOGIC_IB_HWE_PCIESERDESQ0PCLKNOTDETECT   0x0100000000000000ULL
+#define QLOGIC_IB_HWE_PCIESERDESQ1PCLKNOTDETECT   0x0200000000000000ULL
+#define QLOGIC_IB_HWE_PCIESERDESQ2PCLKNOTDETECT   0x0400000000000000ULL
+#define QLOGIC_IB_HWE_PCIESERDESQ3PCLKNOTDETECT   0x0800000000000000ULL
+#define QLOGIC_IB_HWE_DDSRXEQMEMORYPARITYERR       0x0000008000000000ULL
+#define QLOGIC_IB_HWE_IB_UC_MEMORYPARITYERR        0x0000004000000000ULL
+#define QLOGIC_IB_HWE_PCIE_UC_OCT0MEMORYPARITYERR 0x0000001000000000ULL
+#define QLOGIC_IB_HWE_PCIE_UC_OCT1MEMORYPARITYERR 0x0000002000000000ULL
+
+#define IBA7220_IBCC_LINKCMD_SHIFT 19
+
+/* kr_ibcddrctrl bits */
+#define IBA7220_IBC_DLIDLMC_MASK        0xFFFFFFFFUL
+#define IBA7220_IBC_DLIDLMC_SHIFT       32
+
+#define IBA7220_IBC_HRTBT_MASK  (SYM_RMASK(IBCDDRCtrl, HRTBT_AUTO) | \
+				 SYM_RMASK(IBCDDRCtrl, HRTBT_ENB))
+#define IBA7220_IBC_HRTBT_SHIFT SYM_LSB(IBCDDRCtrl, HRTBT_ENB)
+
+#define IBA7220_IBC_LANE_REV_SUPPORTED (1<<8)
+#define IBA7220_IBC_LREV_MASK   1
+#define IBA7220_IBC_LREV_SHIFT  8
+#define IBA7220_IBC_RXPOL_MASK  1
+#define IBA7220_IBC_RXPOL_SHIFT 7
+#define IBA7220_IBC_WIDTH_SHIFT 5
+#define IBA7220_IBC_WIDTH_MASK  0x3
+#define IBA7220_IBC_WIDTH_1X_ONLY       (0 << IBA7220_IBC_WIDTH_SHIFT)
+#define IBA7220_IBC_WIDTH_4X_ONLY       (1 << IBA7220_IBC_WIDTH_SHIFT)
+#define IBA7220_IBC_WIDTH_AUTONEG       (2 << IBA7220_IBC_WIDTH_SHIFT)
+#define IBA7220_IBC_SPEED_AUTONEG       (1 << 1)
+#define IBA7220_IBC_SPEED_SDR           (1 << 2)
+#define IBA7220_IBC_SPEED_DDR           (1 << 3)
+#define IBA7220_IBC_SPEED_AUTONEG_MASK  (0x7 << 1)
+#define IBA7220_IBC_IBTA_1_2_MASK       (1)
+
+/* kr_ibcddrstatus */
+/* link latency shift is 0, don't bother defining */
+#define IBA7220_DDRSTAT_LINKLAT_MASK    0x3ffffff
+
+/* kr_extstatus bits */
+#define QLOGIC_IB_EXTS_FREQSEL 0x2
+#define QLOGIC_IB_EXTS_SERDESSEL 0x4
+#define QLOGIC_IB_EXTS_MEMBIST_ENDTEST     0x0000000000004000
+#define QLOGIC_IB_EXTS_MEMBIST_DISABLED    0x0000000000008000
+
+/* kr_xgxsconfig bits */
+#define QLOGIC_IB_XGXS_RESET          0x5ULL
+#define QLOGIC_IB_XGXS_FC_SAFE        (1ULL << 63)
+
+/* kr_rcvpktledcnt */
+#define IBA7220_LEDBLINK_ON_SHIFT 32 /* 4ns period on after packet */
+#define IBA7220_LEDBLINK_OFF_SHIFT 0 /* 4ns period off before next on */
+
+#define _QIB_GPIO_SDA_NUM 1
+#define _QIB_GPIO_SCL_NUM 0
+#define QIB_TWSI_EEPROM_DEV 0xA2 /* All Production 7220 cards. */
+#define QIB_TWSI_TEMP_DEV 0x98
+
+/* HW counter clock is at 4nsec */
+#define QIB_7220_PSXMITWAIT_CHECK_RATE 4000
+
+#define IBA7220_R_INTRAVAIL_SHIFT 17
+#define IBA7220_R_PKEY_DIS_SHIFT 34
+#define IBA7220_R_TAILUPD_SHIFT 35
+#define IBA7220_R_CTXTCFG_SHIFT 36
+
+#define IBA7220_HDRHEAD_PKTINT_SHIFT 32 /* interrupt cnt in upper 32 bits */
+
+/*
+ * the size bits give us 2^N, in KB units.  0 marks as invalid,
+ * and 7 is reserved.  We currently use only 2KB and 4KB
+ */
+#define IBA7220_TID_SZ_SHIFT 37 /* shift to 3bit size selector */
+#define IBA7220_TID_SZ_2K (1UL << IBA7220_TID_SZ_SHIFT) /* 2KB */
+#define IBA7220_TID_SZ_4K (2UL << IBA7220_TID_SZ_SHIFT) /* 4KB */
+#define IBA7220_TID_PA_SHIFT 11U /* TID addr in chip stored w/o low bits */
+#define PBC_7220_VL15_SEND (1ULL << 63) /* pbc; VL15, no credit check */
+#define PBC_7220_VL15_SEND_CTRL (1ULL << 31) /* control version of same */
+
+#define AUTONEG_TRIES 5 /* sequential retries to negotiate DDR */
+
+/* packet rate matching delay multiplier */
+static u8 rate_to_delay[2][2] = {
+	/* 1x, 4x */
+	{   8, 2 }, /* SDR */
+	{   4, 1 }  /* DDR */
+};
+
+static u8 ib_rate_to_delay[IB_RATE_120_GBPS + 1] = {
+	[IB_RATE_2_5_GBPS] = 8,
+	[IB_RATE_5_GBPS] = 4,
+	[IB_RATE_10_GBPS] = 2,
+	[IB_RATE_20_GBPS] = 1
+};
+
+#define IBA7220_LINKSPEED_SHIFT SYM_LSB(IBCStatus, LinkSpeedActive)
+#define IBA7220_LINKWIDTH_SHIFT SYM_LSB(IBCStatus, LinkWidthActive)
+
+/* link training states, from IBC */
+#define IB_7220_LT_STATE_DISABLED        0x00
+#define IB_7220_LT_STATE_LINKUP          0x01
+#define IB_7220_LT_STATE_POLLACTIVE      0x02
+#define IB_7220_LT_STATE_POLLQUIET       0x03
+#define IB_7220_LT_STATE_SLEEPDELAY      0x04
+#define IB_7220_LT_STATE_SLEEPQUIET      0x05
+#define IB_7220_LT_STATE_CFGDEBOUNCE     0x08
+#define IB_7220_LT_STATE_CFGRCVFCFG      0x09
+#define IB_7220_LT_STATE_CFGWAITRMT      0x0a
+#define IB_7220_LT_STATE_CFGIDLE 0x0b
+#define IB_7220_LT_STATE_RECOVERRETRAIN  0x0c
+#define IB_7220_LT_STATE_RECOVERWAITRMT  0x0e
+#define IB_7220_LT_STATE_RECOVERIDLE     0x0f
+
+/* link state machine states from IBC */
+#define IB_7220_L_STATE_DOWN             0x0
+#define IB_7220_L_STATE_INIT             0x1
+#define IB_7220_L_STATE_ARM              0x2
+#define IB_7220_L_STATE_ACTIVE           0x3
+#define IB_7220_L_STATE_ACT_DEFER        0x4
+
+static const u8 qib_7220_physportstate[0x20] = {
+	[IB_7220_LT_STATE_DISABLED] = IB_PHYSPORTSTATE_DISABLED,
+	[IB_7220_LT_STATE_LINKUP] = IB_PHYSPORTSTATE_LINKUP,
+	[IB_7220_LT_STATE_POLLACTIVE] = IB_PHYSPORTSTATE_POLL,
+	[IB_7220_LT_STATE_POLLQUIET] = IB_PHYSPORTSTATE_POLL,
+	[IB_7220_LT_STATE_SLEEPDELAY] = IB_PHYSPORTSTATE_SLEEP,
+	[IB_7220_LT_STATE_SLEEPQUIET] = IB_PHYSPORTSTATE_SLEEP,
+	[IB_7220_LT_STATE_CFGDEBOUNCE] =
+		IB_PHYSPORTSTATE_CFG_TRAIN,
+	[IB_7220_LT_STATE_CFGRCVFCFG] =
+		IB_PHYSPORTSTATE_CFG_TRAIN,
+	[IB_7220_LT_STATE_CFGWAITRMT] =
+		IB_PHYSPORTSTATE_CFG_TRAIN,
+	[IB_7220_LT_STATE_CFGIDLE] = IB_PHYSPORTSTATE_CFG_TRAIN,
+	[IB_7220_LT_STATE_RECOVERRETRAIN] =
+		IB_PHYSPORTSTATE_LINK_ERR_RECOVER,
+	[IB_7220_LT_STATE_RECOVERWAITRMT] =
+		IB_PHYSPORTSTATE_LINK_ERR_RECOVER,
+	[IB_7220_LT_STATE_RECOVERIDLE] =
+		IB_PHYSPORTSTATE_LINK_ERR_RECOVER,
+	[0x10] = IB_PHYSPORTSTATE_CFG_TRAIN,
+	[0x11] = IB_PHYSPORTSTATE_CFG_TRAIN,
+	[0x12] = IB_PHYSPORTSTATE_CFG_TRAIN,
+	[0x13] = IB_PHYSPORTSTATE_CFG_TRAIN,
+	[0x14] = IB_PHYSPORTSTATE_CFG_TRAIN,
+	[0x15] = IB_PHYSPORTSTATE_CFG_TRAIN,
+	[0x16] = IB_PHYSPORTSTATE_CFG_TRAIN,
+	[0x17] = IB_PHYSPORTSTATE_CFG_TRAIN
+};
+
+int qib_special_trigger;
+module_param_named(special_trigger, qib_special_trigger, int, S_IRUGO);
+MODULE_PARM_DESC(special_trigger, "Enable SpecialTrigger arm/launch");
+
+#define IBCBUSFRSPCPARITYERR HWE_MASK(IBCBusFromSPCParityErr)
+#define IBCBUSTOSPCPARITYERR HWE_MASK(IBCBusToSPCParityErr)
+
+#define SYM_MASK_BIT(regname, fldname, bit) ((u64) \
+	(1ULL << (SYM_LSB(regname, fldname) + (bit))))
+
+#define TXEMEMPARITYERR_PIOBUF \
+	SYM_MASK_BIT(HwErrMask, TXEMemParityErrMask, 0)
+#define TXEMEMPARITYERR_PIOPBC \
+	SYM_MASK_BIT(HwErrMask, TXEMemParityErrMask, 1)
+#define TXEMEMPARITYERR_PIOLAUNCHFIFO \
+	SYM_MASK_BIT(HwErrMask, TXEMemParityErrMask, 2)
+
+#define RXEMEMPARITYERR_RCVBUF \
+	SYM_MASK_BIT(HwErrMask, RXEMemParityErrMask, 0)
+#define RXEMEMPARITYERR_LOOKUPQ \
+	SYM_MASK_BIT(HwErrMask, RXEMemParityErrMask, 1)
+#define RXEMEMPARITYERR_EXPTID \
+	SYM_MASK_BIT(HwErrMask, RXEMemParityErrMask, 2)
+#define RXEMEMPARITYERR_EAGERTID \
+	SYM_MASK_BIT(HwErrMask, RXEMemParityErrMask, 3)
+#define RXEMEMPARITYERR_FLAGBUF \
+	SYM_MASK_BIT(HwErrMask, RXEMemParityErrMask, 4)
+#define RXEMEMPARITYERR_DATAINFO \
+	SYM_MASK_BIT(HwErrMask, RXEMemParityErrMask, 5)
+#define RXEMEMPARITYERR_HDRINFO \
+	SYM_MASK_BIT(HwErrMask, RXEMemParityErrMask, 6)
+
+/* 7220 specific hardware errors... */
+static const struct qib_hwerror_msgs qib_7220_hwerror_msgs[] = {
+	/* generic hardware errors */
+	QLOGIC_IB_HWE_MSG(IBCBUSFRSPCPARITYERR, "QIB2IB Parity"),
+	QLOGIC_IB_HWE_MSG(IBCBUSTOSPCPARITYERR, "IB2QIB Parity"),
+
+	QLOGIC_IB_HWE_MSG(TXEMEMPARITYERR_PIOBUF,
+			  "TXE PIOBUF Memory Parity"),
+	QLOGIC_IB_HWE_MSG(TXEMEMPARITYERR_PIOPBC,
+			  "TXE PIOPBC Memory Parity"),
+	QLOGIC_IB_HWE_MSG(TXEMEMPARITYERR_PIOLAUNCHFIFO,
+			  "TXE PIOLAUNCHFIFO Memory Parity"),
+
+	QLOGIC_IB_HWE_MSG(RXEMEMPARITYERR_RCVBUF,
+			  "RXE RCVBUF Memory Parity"),
+	QLOGIC_IB_HWE_MSG(RXEMEMPARITYERR_LOOKUPQ,
+			  "RXE LOOKUPQ Memory Parity"),
+	QLOGIC_IB_HWE_MSG(RXEMEMPARITYERR_EAGERTID,
+			  "RXE EAGERTID Memory Parity"),
+	QLOGIC_IB_HWE_MSG(RXEMEMPARITYERR_EXPTID,
+			  "RXE EXPTID Memory Parity"),
+	QLOGIC_IB_HWE_MSG(RXEMEMPARITYERR_FLAGBUF,
+			  "RXE FLAGBUF Memory Parity"),
+	QLOGIC_IB_HWE_MSG(RXEMEMPARITYERR_DATAINFO,
+			  "RXE DATAINFO Memory Parity"),
+	QLOGIC_IB_HWE_MSG(RXEMEMPARITYERR_HDRINFO,
+			  "RXE HDRINFO Memory Parity"),
+
+	/* chip-specific hardware errors */
+	QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIEPOISONEDTLP,
+			  "PCIe Poisoned TLP"),
+	QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIECPLTIMEOUT,
+			  "PCIe completion timeout"),
+	/*
+	 * In practice, it's unlikely wthat we'll see PCIe PLL, or bus
+	 * parity or memory parity error failures, because most likely we
+	 * won't be able to talk to the core of the chip.  Nonetheless, we
+	 * might see them, if they are in parts of the PCIe core that aren't
+	 * essential.
+	 */
+	QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIE1PLLFAILED,
+			  "PCIePLL1"),
+	QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIE0PLLFAILED,
+			  "PCIePLL0"),
+	QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIEBUSPARITYXTLH,
+			  "PCIe XTLH core parity"),
+	QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIEBUSPARITYXADM,
+			  "PCIe ADM TX core parity"),
+	QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIEBUSPARITYRADM,
+			  "PCIe ADM RX core parity"),
+	QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_SERDESPLLFAILED,
+			  "SerDes PLL"),
+	QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIECPLDATAQUEUEERR,
+			  "PCIe cpl header queue"),
+	QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIECPLHDRQUEUEERR,
+			  "PCIe cpl data queue"),
+	QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_SDMAMEMREADERR,
+			  "Send DMA memory read"),
+	QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_CLK_UC_PLLNOTLOCKED,
+			  "uC PLL clock not locked"),
+	QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIESERDESQ0PCLKNOTDETECT,
+			  "PCIe serdes Q0 no clock"),
+	QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIESERDESQ1PCLKNOTDETECT,
+			  "PCIe serdes Q1 no clock"),
+	QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIESERDESQ2PCLKNOTDETECT,
+			  "PCIe serdes Q2 no clock"),
+	QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIESERDESQ3PCLKNOTDETECT,
+			  "PCIe serdes Q3 no clock"),
+	QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_DDSRXEQMEMORYPARITYERR,
+			  "DDS RXEQ memory parity"),
+	QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_IB_UC_MEMORYPARITYERR,
+			  "IB uC memory parity"),
+	QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIE_UC_OCT0MEMORYPARITYERR,
+			  "PCIe uC oct0 memory parity"),
+	QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIE_UC_OCT1MEMORYPARITYERR,
+			  "PCIe uC oct1 memory parity"),
+};
+
+#define RXE_PARITY (RXEMEMPARITYERR_EAGERTID|RXEMEMPARITYERR_EXPTID)
+
+#define QLOGIC_IB_E_PKTERRS (\
+		ERR_MASK(SendPktLenErr) |				\
+		ERR_MASK(SendDroppedDataPktErr) |			\
+		ERR_MASK(RcvVCRCErr) |					\
+		ERR_MASK(RcvICRCErr) |					\
+		ERR_MASK(RcvShortPktLenErr) |				\
+		ERR_MASK(RcvEBPErr))
+
+/* Convenience for decoding Send DMA errors */
+#define QLOGIC_IB_E_SDMAERRS ( \
+		ERR_MASK(SDmaGenMismatchErr) |				\
+		ERR_MASK(SDmaOutOfBoundErr) |				\
+		ERR_MASK(SDmaTailOutOfBoundErr) | ERR_MASK(SDmaBaseErr) | \
+		ERR_MASK(SDma1stDescErr) | ERR_MASK(SDmaRpyTagErr) |	\
+		ERR_MASK(SDmaDwEnErr) | ERR_MASK(SDmaMissingDwErr) |	\
+		ERR_MASK(SDmaUnexpDataErr) |				\
+		ERR_MASK(SDmaDescAddrMisalignErr) |			\
+		ERR_MASK(SDmaDisabledErr) |				\
+		ERR_MASK(SendBufMisuseErr))
+
+/* These are all rcv-related errors which we want to count for stats */
+#define E_SUM_PKTERRS \
+	(ERR_MASK(RcvHdrLenErr) | ERR_MASK(RcvBadTidErr) |		\
+	 ERR_MASK(RcvBadVersionErr) | ERR_MASK(RcvHdrErr) |		\
+	 ERR_MASK(RcvLongPktLenErr) | ERR_MASK(RcvShortPktLenErr) |	\
+	 ERR_MASK(RcvMaxPktLenErr) | ERR_MASK(RcvMinPktLenErr) |	\
+	 ERR_MASK(RcvFormatErr) | ERR_MASK(RcvUnsupportedVLErr) |	\
+	 ERR_MASK(RcvUnexpectedCharErr) | ERR_MASK(RcvEBPErr))
+
+/* These are all send-related errors which we want to count for stats */
+#define E_SUM_ERRS \
+	(ERR_MASK(SendPioArmLaunchErr) | ERR_MASK(SendUnexpectedPktNumErr) | \
+	 ERR_MASK(SendDroppedDataPktErr) | ERR_MASK(SendDroppedSmpPktErr) | \
+	 ERR_MASK(SendMaxPktLenErr) | ERR_MASK(SendUnsupportedVLErr) |	\
+	 ERR_MASK(SendMinPktLenErr) | ERR_MASK(SendPktLenErr) |		\
+	 ERR_MASK(InvalidAddrErr))
+
+/*
+ * this is similar to E_SUM_ERRS, but can't ignore armlaunch, don't ignore
+ * errors not related to freeze and cancelling buffers.  Can't ignore
+ * armlaunch because could get more while still cleaning up, and need
+ * to cancel those as they happen.
+ */
+#define E_SPKT_ERRS_IGNORE \
+	(ERR_MASK(SendDroppedDataPktErr) | ERR_MASK(SendDroppedSmpPktErr) | \
+	 ERR_MASK(SendMaxPktLenErr) | ERR_MASK(SendMinPktLenErr) |	\
+	 ERR_MASK(SendPktLenErr))
+
+/*
+ * these are errors that can occur when the link changes state while
+ * a packet is being sent or received.  This doesn't cover things
+ * like EBP or VCRC that can be the result of a sending having the
+ * link change state, so we receive a "known bad" packet.
+ */
+#define E_SUM_LINK_PKTERRS \
+	(ERR_MASK(SendDroppedDataPktErr) | ERR_MASK(SendDroppedSmpPktErr) | \
+	 ERR_MASK(SendMinPktLenErr) | ERR_MASK(SendPktLenErr) |		\
+	 ERR_MASK(RcvShortPktLenErr) | ERR_MASK(RcvMinPktLenErr) |	\
+	 ERR_MASK(RcvUnexpectedCharErr))
+
+static void autoneg_7220_work(struct work_struct *);
+static u32 __iomem *qib_7220_getsendbuf(struct qib_pportdata *, u64, u32 *);
+
+/*
+ * Called when we might have an error that is specific to a particular
+ * PIO buffer, and may need to cancel that buffer, so it can be re-used.
+ * because we don't need to force the update of pioavail.
+ */
+static void qib_disarm_7220_senderrbufs(struct qib_pportdata *ppd)
+{
+	unsigned long sbuf[3];
+	struct qib_devdata *dd = ppd->dd;
+
+	/*
+	 * It's possible that sendbuffererror could have bits set; might
+	 * have already done this as a result of hardware error handling.
+	 */
+	/* read these before writing errorclear */
+	sbuf[0] = qib_read_kreg64(dd, kr_sendbuffererror);
+	sbuf[1] = qib_read_kreg64(dd, kr_sendbuffererror + 1);
+	sbuf[2] = qib_read_kreg64(dd, kr_sendbuffererror + 2);
+
+	if (sbuf[0] || sbuf[1] || sbuf[2])
+		qib_disarm_piobufs_set(dd, sbuf,
+				       dd->piobcnt2k + dd->piobcnt4k);
+}
+
+static void qib_7220_txe_recover(struct qib_devdata *dd)
+{
+	qib_devinfo(dd->pcidev, "Recovering from TXE PIO parity error\n");
+	qib_disarm_7220_senderrbufs(dd->pport);
+}
+
+/*
+ * This is called with interrupts disabled and sdma_lock held.
+ */
+static void qib_7220_sdma_sendctrl(struct qib_pportdata *ppd, unsigned op)
+{
+	struct qib_devdata *dd = ppd->dd;
+	u64 set_sendctrl = 0;
+	u64 clr_sendctrl = 0;
+
+	if (op & QIB_SDMA_SENDCTRL_OP_ENABLE)
+		set_sendctrl |= SYM_MASK(SendCtrl, SDmaEnable);
+	else
+		clr_sendctrl |= SYM_MASK(SendCtrl, SDmaEnable);
+
+	if (op & QIB_SDMA_SENDCTRL_OP_INTENABLE)
+		set_sendctrl |= SYM_MASK(SendCtrl, SDmaIntEnable);
+	else
+		clr_sendctrl |= SYM_MASK(SendCtrl, SDmaIntEnable);
+
+	if (op & QIB_SDMA_SENDCTRL_OP_HALT)
+		set_sendctrl |= SYM_MASK(SendCtrl, SDmaHalt);
+	else
+		clr_sendctrl |= SYM_MASK(SendCtrl, SDmaHalt);
+
+	spin_lock(&dd->sendctrl_lock);
+
+	dd->sendctrl |= set_sendctrl;
+	dd->sendctrl &= ~clr_sendctrl;
+
+	qib_write_kreg(dd, kr_sendctrl, dd->sendctrl);
+	qib_write_kreg(dd, kr_scratch, 0);
+
+	spin_unlock(&dd->sendctrl_lock);
+}
+
+static void qib_decode_7220_sdma_errs(struct qib_pportdata *ppd,
+				      u64 err, char *buf, size_t blen)
+{
+	static const struct {
+		u64 err;
+		const char *msg;
+	} errs[] = {
+		{ ERR_MASK(SDmaGenMismatchErr),
+		  "SDmaGenMismatch" },
+		{ ERR_MASK(SDmaOutOfBoundErr),
+		  "SDmaOutOfBound" },
+		{ ERR_MASK(SDmaTailOutOfBoundErr),
+		  "SDmaTailOutOfBound" },
+		{ ERR_MASK(SDmaBaseErr),
+		  "SDmaBase" },
+		{ ERR_MASK(SDma1stDescErr),
+		  "SDma1stDesc" },
+		{ ERR_MASK(SDmaRpyTagErr),
+		  "SDmaRpyTag" },
+		{ ERR_MASK(SDmaDwEnErr),
+		  "SDmaDwEn" },
+		{ ERR_MASK(SDmaMissingDwErr),
+		  "SDmaMissingDw" },
+		{ ERR_MASK(SDmaUnexpDataErr),
+		  "SDmaUnexpData" },
+		{ ERR_MASK(SDmaDescAddrMisalignErr),
+		  "SDmaDescAddrMisalign" },
+		{ ERR_MASK(SendBufMisuseErr),
+		  "SendBufMisuse" },
+		{ ERR_MASK(SDmaDisabledErr),
+		  "SDmaDisabled" },
+	};
+	int i;
+	size_t bidx = 0;
+
+	for (i = 0; i < ARRAY_SIZE(errs); i++) {
+		if (err & errs[i].err)
+			bidx += scnprintf(buf + bidx, blen - bidx,
+					 "%s ", errs[i].msg);
+	}
+}
+
+/*
+ * This is called as part of link down clean up so disarm and flush
+ * all send buffers so that SMP packets can be sent.
+ */
+static void qib_7220_sdma_hw_clean_up(struct qib_pportdata *ppd)
+{
+	/* This will trigger the Abort interrupt */
+	sendctrl_7220_mod(ppd, QIB_SENDCTRL_DISARM_ALL | QIB_SENDCTRL_FLUSH |
+			  QIB_SENDCTRL_AVAIL_BLIP);
+	ppd->dd->upd_pio_shadow  = 1; /* update our idea of what's busy */
+}
+
+static void qib_sdma_7220_setlengen(struct qib_pportdata *ppd)
+{
+	/*
+	 * Set SendDmaLenGen and clear and set
+	 * the MSB of the generation count to enable generation checking
+	 * and load the internal generation counter.
+	 */
+	qib_write_kreg(ppd->dd, kr_senddmalengen, ppd->sdma_descq_cnt);
+	qib_write_kreg(ppd->dd, kr_senddmalengen,
+		       ppd->sdma_descq_cnt |
+		       (1ULL << QIB_7220_SendDmaLenGen_Generation_MSB));
+}
+
+static void qib_7220_sdma_hw_start_up(struct qib_pportdata *ppd)
+{
+	qib_sdma_7220_setlengen(ppd);
+	qib_sdma_update_7220_tail(ppd, 0); /* Set SendDmaTail */
+	ppd->sdma_head_dma[0] = 0;
+}
+
+#define DISABLES_SDMA (							\
+		ERR_MASK(SDmaDisabledErr) |				\
+		ERR_MASK(SDmaBaseErr) |					\
+		ERR_MASK(SDmaTailOutOfBoundErr) |			\
+		ERR_MASK(SDmaOutOfBoundErr) |				\
+		ERR_MASK(SDma1stDescErr) |				\
+		ERR_MASK(SDmaRpyTagErr) |				\
+		ERR_MASK(SDmaGenMismatchErr) |				\
+		ERR_MASK(SDmaDescAddrMisalignErr) |			\
+		ERR_MASK(SDmaMissingDwErr) |				\
+		ERR_MASK(SDmaDwEnErr))
+
+static void sdma_7220_errors(struct qib_pportdata *ppd, u64 errs)
+{
+	unsigned long flags;
+	struct qib_devdata *dd = ppd->dd;
+	char *msg;
+
+	errs &= QLOGIC_IB_E_SDMAERRS;
+
+	msg = dd->cspec->sdmamsgbuf;
+	qib_decode_7220_sdma_errs(ppd, errs, msg, sizeof dd->cspec->sdmamsgbuf);
+	spin_lock_irqsave(&ppd->sdma_lock, flags);
+
+	if (errs & ERR_MASK(SendBufMisuseErr)) {
+		unsigned long sbuf[3];
+
+		sbuf[0] = qib_read_kreg64(dd, kr_sendbuffererror);
+		sbuf[1] = qib_read_kreg64(dd, kr_sendbuffererror + 1);
+		sbuf[2] = qib_read_kreg64(dd, kr_sendbuffererror + 2);
+
+		qib_dev_err(ppd->dd,
+			    "IB%u:%u SendBufMisuse: %04lx %016lx %016lx\n",
+			    ppd->dd->unit, ppd->port, sbuf[2], sbuf[1],
+			    sbuf[0]);
+	}
+
+	if (errs & ERR_MASK(SDmaUnexpDataErr))
+		qib_dev_err(dd, "IB%u:%u SDmaUnexpData\n", ppd->dd->unit,
+			    ppd->port);
+
+	switch (ppd->sdma_state.current_state) {
+	case qib_sdma_state_s00_hw_down:
+		/* not expecting any interrupts */
+		break;
+
+	case qib_sdma_state_s10_hw_start_up_wait:
+		/* handled in intr path */
+		break;
+
+	case qib_sdma_state_s20_idle:
+		/* not expecting any interrupts */
+		break;
+
+	case qib_sdma_state_s30_sw_clean_up_wait:
+		/* not expecting any interrupts */
+		break;
+
+	case qib_sdma_state_s40_hw_clean_up_wait:
+		if (errs & ERR_MASK(SDmaDisabledErr))
+			__qib_sdma_process_event(ppd,
+				qib_sdma_event_e50_hw_cleaned);
+		break;
+
+	case qib_sdma_state_s50_hw_halt_wait:
+		/* handled in intr path */
+		break;
+
+	case qib_sdma_state_s99_running:
+		if (errs & DISABLES_SDMA)
+			__qib_sdma_process_event(ppd,
+				qib_sdma_event_e7220_err_halted);
+		break;
+	}
+
+	spin_unlock_irqrestore(&ppd->sdma_lock, flags);
+}
+
+/*
+ * Decode the error status into strings, deciding whether to always
+ * print * it or not depending on "normal packet errors" vs everything
+ * else.   Return 1 if "real" errors, otherwise 0 if only packet
+ * errors, so caller can decide what to print with the string.
+ */
+static int qib_decode_7220_err(struct qib_devdata *dd, char *buf, size_t blen,
+			       u64 err)
+{
+	int iserr = 1;
+
+	*buf = '\0';
+	if (err & QLOGIC_IB_E_PKTERRS) {
+		if (!(err & ~QLOGIC_IB_E_PKTERRS))
+			iserr = 0;
+		if ((err & ERR_MASK(RcvICRCErr)) &&
+		    !(err & (ERR_MASK(RcvVCRCErr) | ERR_MASK(RcvEBPErr))))
+			strlcat(buf, "CRC ", blen);
+		if (!iserr)
+			goto done;
+	}
+	if (err & ERR_MASK(RcvHdrLenErr))
+		strlcat(buf, "rhdrlen ", blen);
+	if (err & ERR_MASK(RcvBadTidErr))
+		strlcat(buf, "rbadtid ", blen);
+	if (err & ERR_MASK(RcvBadVersionErr))
+		strlcat(buf, "rbadversion ", blen);
+	if (err & ERR_MASK(RcvHdrErr))
+		strlcat(buf, "rhdr ", blen);
+	if (err & ERR_MASK(SendSpecialTriggerErr))
+		strlcat(buf, "sendspecialtrigger ", blen);
+	if (err & ERR_MASK(RcvLongPktLenErr))
+		strlcat(buf, "rlongpktlen ", blen);
+	if (err & ERR_MASK(RcvMaxPktLenErr))
+		strlcat(buf, "rmaxpktlen ", blen);
+	if (err & ERR_MASK(RcvMinPktLenErr))
+		strlcat(buf, "rminpktlen ", blen);
+	if (err & ERR_MASK(SendMinPktLenErr))
+		strlcat(buf, "sminpktlen ", blen);
+	if (err & ERR_MASK(RcvFormatErr))
+		strlcat(buf, "rformaterr ", blen);
+	if (err & ERR_MASK(RcvUnsupportedVLErr))
+		strlcat(buf, "runsupvl ", blen);
+	if (err & ERR_MASK(RcvUnexpectedCharErr))
+		strlcat(buf, "runexpchar ", blen);
+	if (err & ERR_MASK(RcvIBFlowErr))
+		strlcat(buf, "ribflow ", blen);
+	if (err & ERR_MASK(SendUnderRunErr))
+		strlcat(buf, "sunderrun ", blen);
+	if (err & ERR_MASK(SendPioArmLaunchErr))
+		strlcat(buf, "spioarmlaunch ", blen);
+	if (err & ERR_MASK(SendUnexpectedPktNumErr))
+		strlcat(buf, "sunexperrpktnum ", blen);
+	if (err & ERR_MASK(SendDroppedSmpPktErr))
+		strlcat(buf, "sdroppedsmppkt ", blen);
+	if (err & ERR_MASK(SendMaxPktLenErr))
+		strlcat(buf, "smaxpktlen ", blen);
+	if (err & ERR_MASK(SendUnsupportedVLErr))
+		strlcat(buf, "sunsupVL ", blen);
+	if (err & ERR_MASK(InvalidAddrErr))
+		strlcat(buf, "invalidaddr ", blen);
+	if (err & ERR_MASK(RcvEgrFullErr))
+		strlcat(buf, "rcvegrfull ", blen);
+	if (err & ERR_MASK(RcvHdrFullErr))
+		strlcat(buf, "rcvhdrfull ", blen);
+	if (err & ERR_MASK(IBStatusChanged))
+		strlcat(buf, "ibcstatuschg ", blen);
+	if (err & ERR_MASK(RcvIBLostLinkErr))
+		strlcat(buf, "riblostlink ", blen);
+	if (err & ERR_MASK(HardwareErr))
+		strlcat(buf, "hardware ", blen);
+	if (err & ERR_MASK(ResetNegated))
+		strlcat(buf, "reset ", blen);
+	if (err & QLOGIC_IB_E_SDMAERRS)
+		qib_decode_7220_sdma_errs(dd->pport, err, buf, blen);
+	if (err & ERR_MASK(InvalidEEPCmd))
+		strlcat(buf, "invalideepromcmd ", blen);
+done:
+	return iserr;
+}
+
+static void reenable_7220_chase(unsigned long opaque)
+{
+	struct qib_pportdata *ppd = (struct qib_pportdata *)opaque;
+	ppd->cpspec->chase_timer.expires = 0;
+	qib_set_ib_7220_lstate(ppd, QLOGIC_IB_IBCC_LINKCMD_DOWN,
+		QLOGIC_IB_IBCC_LINKINITCMD_POLL);
+}
+
+static void handle_7220_chase(struct qib_pportdata *ppd, u64 ibcst)
+{
+	u8 ibclt;
+	unsigned long tnow;
+
+	ibclt = (u8)SYM_FIELD(ibcst, IBCStatus, LinkTrainingState);
+
+	/*
+	 * Detect and handle the state chase issue, where we can
+	 * get stuck if we are unlucky on timing on both sides of
+	 * the link.   If we are, we disable, set a timer, and
+	 * then re-enable.
+	 */
+	switch (ibclt) {
+	case IB_7220_LT_STATE_CFGRCVFCFG:
+	case IB_7220_LT_STATE_CFGWAITRMT:
+	case IB_7220_LT_STATE_TXREVLANES:
+	case IB_7220_LT_STATE_CFGENH:
+		tnow = jiffies;
+		if (ppd->cpspec->chase_end &&
+		    time_after(tnow, ppd->cpspec->chase_end)) {
+			ppd->cpspec->chase_end = 0;
+			qib_set_ib_7220_lstate(ppd,
+				QLOGIC_IB_IBCC_LINKCMD_DOWN,
+				QLOGIC_IB_IBCC_LINKINITCMD_DISABLE);
+			ppd->cpspec->chase_timer.expires = jiffies +
+				QIB_CHASE_DIS_TIME;
+			add_timer(&ppd->cpspec->chase_timer);
+		} else if (!ppd->cpspec->chase_end)
+			ppd->cpspec->chase_end = tnow + QIB_CHASE_TIME;
+		break;
+
+	default:
+		ppd->cpspec->chase_end = 0;
+		break;
+	}
+}
+
+static void handle_7220_errors(struct qib_devdata *dd, u64 errs)
+{
+	char *msg;
+	u64 ignore_this_time = 0;
+	u64 iserr = 0;
+	int log_idx;
+	struct qib_pportdata *ppd = dd->pport;
+	u64 mask;
+
+	/* don't report errors that are masked */
+	errs &= dd->cspec->errormask;
+	msg = dd->cspec->emsgbuf;
+
+	/* do these first, they are most important */
+	if (errs & ERR_MASK(HardwareErr))
+		qib_7220_handle_hwerrors(dd, msg, sizeof dd->cspec->emsgbuf);
+	else
+		for (log_idx = 0; log_idx < QIB_EEP_LOG_CNT; ++log_idx)
+			if (errs & dd->eep_st_masks[log_idx].errs_to_log)
+				qib_inc_eeprom_err(dd, log_idx, 1);
+
+	if (errs & QLOGIC_IB_E_SDMAERRS)
+		sdma_7220_errors(ppd, errs);
+
+	if (errs & ~IB_E_BITSEXTANT)
+		qib_dev_err(dd,
+			"error interrupt with unknown errors %llx set\n",
+			(unsigned long long) (errs & ~IB_E_BITSEXTANT));
+
+	if (errs & E_SUM_ERRS) {
+		qib_disarm_7220_senderrbufs(ppd);
+		if ((errs & E_SUM_LINK_PKTERRS) &&
+		    !(ppd->lflags & QIBL_LINKACTIVE)) {
+			/*
+			 * This can happen when trying to bring the link
+			 * up, but the IB link changes state at the "wrong"
+			 * time. The IB logic then complains that the packet
+			 * isn't valid.  We don't want to confuse people, so
+			 * we just don't print them, except at debug
+			 */
+			ignore_this_time = errs & E_SUM_LINK_PKTERRS;
+		}
+	} else if ((errs & E_SUM_LINK_PKTERRS) &&
+		   !(ppd->lflags & QIBL_LINKACTIVE)) {
+		/*
+		 * This can happen when SMA is trying to bring the link
+		 * up, but the IB link changes state at the "wrong" time.
+		 * The IB logic then complains that the packet isn't
+		 * valid.  We don't want to confuse people, so we just
+		 * don't print them, except at debug
+		 */
+		ignore_this_time = errs & E_SUM_LINK_PKTERRS;
+	}
+
+	qib_write_kreg(dd, kr_errclear, errs);
+
+	errs &= ~ignore_this_time;
+	if (!errs)
+		goto done;
+
+	/*
+	 * The ones we mask off are handled specially below
+	 * or above.  Also mask SDMADISABLED by default as it
+	 * is too chatty.
+	 */
+	mask = ERR_MASK(IBStatusChanged) |
+		ERR_MASK(RcvEgrFullErr) | ERR_MASK(RcvHdrFullErr) |
+		ERR_MASK(HardwareErr) | ERR_MASK(SDmaDisabledErr);
+
+	qib_decode_7220_err(dd, msg, sizeof dd->cspec->emsgbuf, errs & ~mask);
+
+	if (errs & E_SUM_PKTERRS)
+		qib_stats.sps_rcverrs++;
+	if (errs & E_SUM_ERRS)
+		qib_stats.sps_txerrs++;
+	iserr = errs & ~(E_SUM_PKTERRS | QLOGIC_IB_E_PKTERRS |
+			 ERR_MASK(SDmaDisabledErr));
+
+	if (errs & ERR_MASK(IBStatusChanged)) {
+		u64 ibcs;
+
+		ibcs = qib_read_kreg64(dd, kr_ibcstatus);
+		if (!(ppd->lflags & QIBL_IB_AUTONEG_INPROG))
+			handle_7220_chase(ppd, ibcs);
+
+		/* Update our picture of width and speed from chip */
+		ppd->link_width_active =
+			((ibcs >> IBA7220_LINKWIDTH_SHIFT) & 1) ?
+			    IB_WIDTH_4X : IB_WIDTH_1X;
+		ppd->link_speed_active =
+			((ibcs >> IBA7220_LINKSPEED_SHIFT) & 1) ?
+			    QIB_IB_DDR : QIB_IB_SDR;
+
+		/*
+		 * Since going into a recovery state causes the link state
+		 * to go down and since recovery is transitory, it is better
+		 * if we "miss" ever seeing the link training state go into
+		 * recovery (i.e., ignore this transition for link state
+		 * special handling purposes) without updating lastibcstat.
+		 */
+		if (qib_7220_phys_portstate(ibcs) !=
+					    IB_PHYSPORTSTATE_LINK_ERR_RECOVER)
+			qib_handle_e_ibstatuschanged(ppd, ibcs);
+	}
+
+	if (errs & ERR_MASK(ResetNegated)) {
+		qib_dev_err(dd,
+			"Got reset, requires re-init (unload and reload driver)\n");
+		dd->flags &= ~QIB_INITTED;  /* needs re-init */
+		/* mark as having had error */
+		*dd->devstatusp |= QIB_STATUS_HWERROR;
+		*dd->pport->statusp &= ~QIB_STATUS_IB_CONF;
+	}
+
+	if (*msg && iserr)
+		qib_dev_porterr(dd, ppd->port, "%s error\n", msg);
+
+	if (ppd->state_wanted & ppd->lflags)
+		wake_up_interruptible(&ppd->state_wait);
+
+	/*
+	 * If there were hdrq or egrfull errors, wake up any processes
+	 * waiting in poll.  We used to try to check which contexts had
+	 * the overflow, but given the cost of that and the chip reads
+	 * to support it, it's better to just wake everybody up if we
+	 * get an overflow; waiters can poll again if it's not them.
+	 */
+	if (errs & (ERR_MASK(RcvEgrFullErr) | ERR_MASK(RcvHdrFullErr))) {
+		qib_handle_urcv(dd, ~0U);
+		if (errs & ERR_MASK(RcvEgrFullErr))
+			qib_stats.sps_buffull++;
+		else
+			qib_stats.sps_hdrfull++;
+	}
+done:
+	return;
+}
+
+/* enable/disable chip from delivering interrupts */
+static void qib_7220_set_intr_state(struct qib_devdata *dd, u32 enable)
+{
+	if (enable) {
+		if (dd->flags & QIB_BADINTR)
+			return;
+		qib_write_kreg(dd, kr_intmask, ~0ULL);
+		/* force re-interrupt of any pending interrupts. */
+		qib_write_kreg(dd, kr_intclear, 0ULL);
+	} else
+		qib_write_kreg(dd, kr_intmask, 0ULL);
+}
+
+/*
+ * Try to cleanup as much as possible for anything that might have gone
+ * wrong while in freeze mode, such as pio buffers being written by user
+ * processes (causing armlaunch), send errors due to going into freeze mode,
+ * etc., and try to avoid causing extra interrupts while doing so.
+ * Forcibly update the in-memory pioavail register copies after cleanup
+ * because the chip won't do it while in freeze mode (the register values
+ * themselves are kept correct).
+ * Make sure that we don't lose any important interrupts by using the chip
+ * feature that says that writing 0 to a bit in *clear that is set in
+ * *status will cause an interrupt to be generated again (if allowed by
+ * the *mask value).
+ * This is in chip-specific code because of all of the register accesses,
+ * even though the details are similar on most chips.
+ */
+static void qib_7220_clear_freeze(struct qib_devdata *dd)
+{
+	/* disable error interrupts, to avoid confusion */
+	qib_write_kreg(dd, kr_errmask, 0ULL);
+
+	/* also disable interrupts; errormask is sometimes overwriten */
+	qib_7220_set_intr_state(dd, 0);
+
+	qib_cancel_sends(dd->pport);
+
+	/* clear the freeze, and be sure chip saw it */
+	qib_write_kreg(dd, kr_control, dd->control);
+	qib_read_kreg32(dd, kr_scratch);
+
+	/* force in-memory update now we are out of freeze */
+	qib_force_pio_avail_update(dd);
+
+	/*
+	 * force new interrupt if any hwerr, error or interrupt bits are
+	 * still set, and clear "safe" send packet errors related to freeze
+	 * and cancelling sends.  Re-enable error interrupts before possible
+	 * force of re-interrupt on pending interrupts.
+	 */
+	qib_write_kreg(dd, kr_hwerrclear, 0ULL);
+	qib_write_kreg(dd, kr_errclear, E_SPKT_ERRS_IGNORE);
+	qib_write_kreg(dd, kr_errmask, dd->cspec->errormask);
+	qib_7220_set_intr_state(dd, 1);
+}
+
+/**
+ * qib_7220_handle_hwerrors - display hardware errors.
+ * @dd: the qlogic_ib device
+ * @msg: the output buffer
+ * @msgl: the size of the output buffer
+ *
+ * Use same msg buffer as regular errors to avoid excessive stack
+ * use.  Most hardware errors are catastrophic, but for right now,
+ * we'll print them and continue.  We reuse the same message buffer as
+ * handle_7220_errors() to avoid excessive stack usage.
+ */
+static void qib_7220_handle_hwerrors(struct qib_devdata *dd, char *msg,
+				     size_t msgl)
+{
+	u64 hwerrs;
+	u32 bits, ctrl;
+	int isfatal = 0;
+	char *bitsmsg;
+	int log_idx;
+
+	hwerrs = qib_read_kreg64(dd, kr_hwerrstatus);
+	if (!hwerrs)
+		goto bail;
+	if (hwerrs == ~0ULL) {
+		qib_dev_err(dd,
+			"Read of hardware error status failed (all bits set); ignoring\n");
+		goto bail;
+	}
+	qib_stats.sps_hwerrs++;
+
+	/*
+	 * Always clear the error status register, except MEMBISTFAIL,
+	 * regardless of whether we continue or stop using the chip.
+	 * We want that set so we know it failed, even across driver reload.
+	 * We'll still ignore it in the hwerrmask.  We do this partly for
+	 * diagnostics, but also for support.
+	 */
+	qib_write_kreg(dd, kr_hwerrclear,
+		       hwerrs & ~HWE_MASK(PowerOnBISTFailed));
+
+	hwerrs &= dd->cspec->hwerrmask;
+
+	/* We log some errors to EEPROM, check if we have any of those. */
+	for (log_idx = 0; log_idx < QIB_EEP_LOG_CNT; ++log_idx)
+		if (hwerrs & dd->eep_st_masks[log_idx].hwerrs_to_log)
+			qib_inc_eeprom_err(dd, log_idx, 1);
+	if (hwerrs & ~(TXEMEMPARITYERR_PIOBUF | TXEMEMPARITYERR_PIOPBC |
+		       RXE_PARITY))
+		qib_devinfo(dd->pcidev,
+			"Hardware error: hwerr=0x%llx (cleared)\n",
+			(unsigned long long) hwerrs);
+
+	if (hwerrs & ~IB_HWE_BITSEXTANT)
+		qib_dev_err(dd,
+			"hwerror interrupt with unknown errors %llx set\n",
+			(unsigned long long) (hwerrs & ~IB_HWE_BITSEXTANT));
+
+	if (hwerrs & QLOGIC_IB_HWE_IB_UC_MEMORYPARITYERR)
+		qib_sd7220_clr_ibpar(dd);
+
+	ctrl = qib_read_kreg32(dd, kr_control);
+	if ((ctrl & QLOGIC_IB_C_FREEZEMODE) && !dd->diag_client) {
+		/*
+		 * Parity errors in send memory are recoverable by h/w
+		 * just do housekeeping, exit freeze mode and continue.
+		 */
+		if (hwerrs & (TXEMEMPARITYERR_PIOBUF |
+			      TXEMEMPARITYERR_PIOPBC)) {
+			qib_7220_txe_recover(dd);
+			hwerrs &= ~(TXEMEMPARITYERR_PIOBUF |
+				    TXEMEMPARITYERR_PIOPBC);
+		}
+		if (hwerrs)
+			isfatal = 1;
+		else
+			qib_7220_clear_freeze(dd);
+	}
+
+	*msg = '\0';
+
+	if (hwerrs & HWE_MASK(PowerOnBISTFailed)) {
+		isfatal = 1;
+		strlcat(msg,
+			"[Memory BIST test failed, InfiniPath hardware unusable]",
+			msgl);
+		/* ignore from now on, so disable until driver reloaded */
+		dd->cspec->hwerrmask &= ~HWE_MASK(PowerOnBISTFailed);
+		qib_write_kreg(dd, kr_hwerrmask, dd->cspec->hwerrmask);
+	}
+
+	qib_format_hwerrors(hwerrs, qib_7220_hwerror_msgs,
+			    ARRAY_SIZE(qib_7220_hwerror_msgs), msg, msgl);
+
+	bitsmsg = dd->cspec->bitsmsgbuf;
+	if (hwerrs & (QLOGIC_IB_HWE_PCIEMEMPARITYERR_MASK <<
+		      QLOGIC_IB_HWE_PCIEMEMPARITYERR_SHIFT)) {
+		bits = (u32) ((hwerrs >>
+			       QLOGIC_IB_HWE_PCIEMEMPARITYERR_SHIFT) &
+			      QLOGIC_IB_HWE_PCIEMEMPARITYERR_MASK);
+		snprintf(bitsmsg, sizeof dd->cspec->bitsmsgbuf,
+			 "[PCIe Mem Parity Errs %x] ", bits);
+		strlcat(msg, bitsmsg, msgl);
+	}
+
+#define _QIB_PLL_FAIL (QLOGIC_IB_HWE_COREPLL_FBSLIP |   \
+			 QLOGIC_IB_HWE_COREPLL_RFSLIP)
+
+	if (hwerrs & _QIB_PLL_FAIL) {
+		isfatal = 1;
+		snprintf(bitsmsg, sizeof dd->cspec->bitsmsgbuf,
+			 "[PLL failed (%llx), InfiniPath hardware unusable]",
+			 (unsigned long long) hwerrs & _QIB_PLL_FAIL);
+		strlcat(msg, bitsmsg, msgl);
+		/* ignore from now on, so disable until driver reloaded */
+		dd->cspec->hwerrmask &= ~(hwerrs & _QIB_PLL_FAIL);
+		qib_write_kreg(dd, kr_hwerrmask, dd->cspec->hwerrmask);
+	}
+
+	if (hwerrs & QLOGIC_IB_HWE_SERDESPLLFAILED) {
+		/*
+		 * If it occurs, it is left masked since the eternal
+		 * interface is unused.
+		 */
+		dd->cspec->hwerrmask &= ~QLOGIC_IB_HWE_SERDESPLLFAILED;
+		qib_write_kreg(dd, kr_hwerrmask, dd->cspec->hwerrmask);
+	}
+
+	qib_dev_err(dd, "%s hardware error\n", msg);
+
+	if (isfatal && !dd->diag_client) {
+		qib_dev_err(dd,
+			"Fatal Hardware Error, no longer usable, SN %.16s\n",
+			dd->serial);
+		/*
+		 * For /sys status file and user programs to print; if no
+		 * trailing brace is copied, we'll know it was truncated.
+		 */
+		if (dd->freezemsg)
+			snprintf(dd->freezemsg, dd->freezelen,
+				 "{%s}", msg);
+		qib_disable_after_error(dd);
+	}
+bail:;
+}
+
+/**
+ * qib_7220_init_hwerrors - enable hardware errors
+ * @dd: the qlogic_ib device
+ *
+ * now that we have finished initializing everything that might reasonably
+ * cause a hardware error, and cleared those errors bits as they occur,
+ * we can enable hardware errors in the mask (potentially enabling
+ * freeze mode), and enable hardware errors as errors (along with
+ * everything else) in errormask
+ */
+static void qib_7220_init_hwerrors(struct qib_devdata *dd)
+{
+	u64 val;
+	u64 extsval;
+
+	extsval = qib_read_kreg64(dd, kr_extstatus);
+
+	if (!(extsval & (QLOGIC_IB_EXTS_MEMBIST_ENDTEST |
+			 QLOGIC_IB_EXTS_MEMBIST_DISABLED)))
+		qib_dev_err(dd, "MemBIST did not complete!\n");
+	if (extsval & QLOGIC_IB_EXTS_MEMBIST_DISABLED)
+		qib_devinfo(dd->pcidev, "MemBIST is disabled.\n");
+
+	val = ~0ULL;    /* default to all hwerrors become interrupts, */
+
+	val &= ~QLOGIC_IB_HWE_IB_UC_MEMORYPARITYERR;
+	dd->cspec->hwerrmask = val;
+
+	qib_write_kreg(dd, kr_hwerrclear, ~HWE_MASK(PowerOnBISTFailed));
+	qib_write_kreg(dd, kr_hwerrmask, dd->cspec->hwerrmask);
+
+	/* clear all */
+	qib_write_kreg(dd, kr_errclear, ~0ULL);
+	/* enable errors that are masked, at least this first time. */
+	qib_write_kreg(dd, kr_errmask, ~0ULL);
+	dd->cspec->errormask = qib_read_kreg64(dd, kr_errmask);
+	/* clear any interrupts up to this point (ints still not enabled) */
+	qib_write_kreg(dd, kr_intclear, ~0ULL);
+}
+
+/*
+ * Disable and enable the armlaunch error.  Used for PIO bandwidth testing
+ * on chips that are count-based, rather than trigger-based.  There is no
+ * reference counting, but that's also fine, given the intended use.
+ * Only chip-specific because it's all register accesses
+ */
+static void qib_set_7220_armlaunch(struct qib_devdata *dd, u32 enable)
+{
+	if (enable) {
+		qib_write_kreg(dd, kr_errclear, ERR_MASK(SendPioArmLaunchErr));
+		dd->cspec->errormask |= ERR_MASK(SendPioArmLaunchErr);
+	} else
+		dd->cspec->errormask &= ~ERR_MASK(SendPioArmLaunchErr);
+	qib_write_kreg(dd, kr_errmask, dd->cspec->errormask);
+}
+
+/*
+ * Formerly took parameter <which> in pre-shifted,
+ * pre-merged form with LinkCmd and LinkInitCmd
+ * together, and assuming the zero was NOP.
+ */
+static void qib_set_ib_7220_lstate(struct qib_pportdata *ppd, u16 linkcmd,
+				   u16 linitcmd)
+{
+	u64 mod_wd;
+	struct qib_devdata *dd = ppd->dd;
+	unsigned long flags;
+
+	if (linitcmd == QLOGIC_IB_IBCC_LINKINITCMD_DISABLE) {
+		/*
+		 * If we are told to disable, note that so link-recovery
+		 * code does not attempt to bring us back up.
+		 */
+		spin_lock_irqsave(&ppd->lflags_lock, flags);
+		ppd->lflags |= QIBL_IB_LINK_DISABLED;
+		spin_unlock_irqrestore(&ppd->lflags_lock, flags);
+	} else if (linitcmd || linkcmd == QLOGIC_IB_IBCC_LINKCMD_DOWN) {
+		/*
+		 * Any other linkinitcmd will lead to LINKDOWN and then
+		 * to INIT (if all is well), so clear flag to let
+		 * link-recovery code attempt to bring us back up.
+		 */
+		spin_lock_irqsave(&ppd->lflags_lock, flags);
+		ppd->lflags &= ~QIBL_IB_LINK_DISABLED;
+		spin_unlock_irqrestore(&ppd->lflags_lock, flags);
+	}
+
+	mod_wd = (linkcmd << IBA7220_IBCC_LINKCMD_SHIFT) |
+		(linitcmd << QLOGIC_IB_IBCC_LINKINITCMD_SHIFT);
+
+	qib_write_kreg(dd, kr_ibcctrl, ppd->cpspec->ibcctrl | mod_wd);
+	/* write to chip to prevent back-to-back writes of ibc reg */
+	qib_write_kreg(dd, kr_scratch, 0);
+}
+
+/*
+ * All detailed interaction with the SerDes has been moved to qib_sd7220.c
+ *
+ * The portion of IBA7220-specific bringup_serdes() that actually deals with
+ * registers and memory within the SerDes itself is qib_sd7220_init().
+ */
+
+/**
+ * qib_7220_bringup_serdes - bring up the serdes
+ * @ppd: physical port on the qlogic_ib device
+ */
+static int qib_7220_bringup_serdes(struct qib_pportdata *ppd)
+{
+	struct qib_devdata *dd = ppd->dd;
+	u64 val, prev_val, guid, ibc;
+	int ret = 0;
+
+	/* Put IBC in reset, sends disabled */
+	dd->control &= ~QLOGIC_IB_C_LINKENABLE;
+	qib_write_kreg(dd, kr_control, 0ULL);
+
+	if (qib_compat_ddr_negotiate) {
+		ppd->cpspec->ibdeltainprog = 1;
+		ppd->cpspec->ibsymsnap = read_7220_creg32(dd, cr_ibsymbolerr);
+		ppd->cpspec->iblnkerrsnap =
+			read_7220_creg32(dd, cr_iblinkerrrecov);
+	}
+
+	/* flowcontrolwatermark is in units of KBytes */
+	ibc = 0x5ULL << SYM_LSB(IBCCtrl, FlowCtrlWaterMark);
+	/*
+	 * How often flowctrl sent.  More or less in usecs; balance against
+	 * watermark value, so that in theory senders always get a flow
+	 * control update in time to not let the IB link go idle.
+	 */
+	ibc |= 0x3ULL << SYM_LSB(IBCCtrl, FlowCtrlPeriod);
+	/* max error tolerance */
+	ibc |= 0xfULL << SYM_LSB(IBCCtrl, PhyerrThreshold);
+	/* use "real" buffer space for */
+	ibc |= 4ULL << SYM_LSB(IBCCtrl, CreditScale);
+	/* IB credit flow control. */
+	ibc |= 0xfULL << SYM_LSB(IBCCtrl, OverrunThreshold);
+	/*
+	 * set initial max size pkt IBC will send, including ICRC; it's the
+	 * PIO buffer size in dwords, less 1; also see qib_set_mtu()
+	 */
+	ibc |= ((u64)(ppd->ibmaxlen >> 2) + 1) << SYM_LSB(IBCCtrl, MaxPktLen);
+	ppd->cpspec->ibcctrl = ibc; /* without linkcmd or linkinitcmd! */
+
+	/* initially come up waiting for TS1, without sending anything. */
+	val = ppd->cpspec->ibcctrl | (QLOGIC_IB_IBCC_LINKINITCMD_DISABLE <<
+		QLOGIC_IB_IBCC_LINKINITCMD_SHIFT);
+	qib_write_kreg(dd, kr_ibcctrl, val);
+
+	if (!ppd->cpspec->ibcddrctrl) {
+		/* not on re-init after reset */
+		ppd->cpspec->ibcddrctrl = qib_read_kreg64(dd, kr_ibcddrctrl);
+
+		if (ppd->link_speed_enabled == (QIB_IB_SDR | QIB_IB_DDR))
+			ppd->cpspec->ibcddrctrl |=
+				IBA7220_IBC_SPEED_AUTONEG_MASK |
+				IBA7220_IBC_IBTA_1_2_MASK;
+		else
+			ppd->cpspec->ibcddrctrl |=
+				ppd->link_speed_enabled == QIB_IB_DDR ?
+				IBA7220_IBC_SPEED_DDR : IBA7220_IBC_SPEED_SDR;
+		if ((ppd->link_width_enabled & (IB_WIDTH_1X | IB_WIDTH_4X)) ==
+		    (IB_WIDTH_1X | IB_WIDTH_4X))
+			ppd->cpspec->ibcddrctrl |= IBA7220_IBC_WIDTH_AUTONEG;
+		else
+			ppd->cpspec->ibcddrctrl |=
+				ppd->link_width_enabled == IB_WIDTH_4X ?
+				IBA7220_IBC_WIDTH_4X_ONLY :
+				IBA7220_IBC_WIDTH_1X_ONLY;
+
+		/* always enable these on driver reload, not sticky */
+		ppd->cpspec->ibcddrctrl |=
+			IBA7220_IBC_RXPOL_MASK << IBA7220_IBC_RXPOL_SHIFT;
+		ppd->cpspec->ibcddrctrl |=
+			IBA7220_IBC_HRTBT_MASK << IBA7220_IBC_HRTBT_SHIFT;
+
+		/* enable automatic lane reversal detection for receive */
+		ppd->cpspec->ibcddrctrl |= IBA7220_IBC_LANE_REV_SUPPORTED;
+	} else
+		/* write to chip to prevent back-to-back writes of ibc reg */
+		qib_write_kreg(dd, kr_scratch, 0);
+
+	qib_write_kreg(dd, kr_ibcddrctrl, ppd->cpspec->ibcddrctrl);
+	qib_write_kreg(dd, kr_scratch, 0);
+
+	qib_write_kreg(dd, kr_ncmodectrl, 0Ull);
+	qib_write_kreg(dd, kr_scratch, 0);
+
+	ret = qib_sd7220_init(dd);
+
+	val = qib_read_kreg64(dd, kr_xgxs_cfg);
+	prev_val = val;
+	val |= QLOGIC_IB_XGXS_FC_SAFE;
+	if (val != prev_val) {
+		qib_write_kreg(dd, kr_xgxs_cfg, val);
+		qib_read_kreg32(dd, kr_scratch);
+	}
+	if (val & QLOGIC_IB_XGXS_RESET)
+		val &= ~QLOGIC_IB_XGXS_RESET;
+	if (val != prev_val)
+		qib_write_kreg(dd, kr_xgxs_cfg, val);
+
+	/* first time through, set port guid */
+	if (!ppd->guid)
+		ppd->guid = dd->base_guid;
+	guid = be64_to_cpu(ppd->guid);
+
+	qib_write_kreg(dd, kr_hrtbt_guid, guid);
+	if (!ret) {
+		dd->control |= QLOGIC_IB_C_LINKENABLE;
+		qib_write_kreg(dd, kr_control, dd->control);
+	} else
+		/* write to chip to prevent back-to-back writes of ibc reg */
+		qib_write_kreg(dd, kr_scratch, 0);
+	return ret;
+}
+
+/**
+ * qib_7220_quiet_serdes - set serdes to txidle
+ * @ppd: physical port of the qlogic_ib device
+ * Called when driver is being unloaded
+ */
+static void qib_7220_quiet_serdes(struct qib_pportdata *ppd)
+{
+	u64 val;
+	struct qib_devdata *dd = ppd->dd;
+	unsigned long flags;
+
+	/* disable IBC */
+	dd->control &= ~QLOGIC_IB_C_LINKENABLE;
+	qib_write_kreg(dd, kr_control,
+		       dd->control | QLOGIC_IB_C_FREEZEMODE);
+
+	ppd->cpspec->chase_end = 0;
+	if (ppd->cpspec->chase_timer.data) /* if initted */
+		del_timer_sync(&ppd->cpspec->chase_timer);
+
+	if (ppd->cpspec->ibsymdelta || ppd->cpspec->iblnkerrdelta ||
+	    ppd->cpspec->ibdeltainprog) {
+		u64 diagc;
+
+		/* enable counter writes */
+		diagc = qib_read_kreg64(dd, kr_hwdiagctrl);
+		qib_write_kreg(dd, kr_hwdiagctrl,
+			       diagc | SYM_MASK(HwDiagCtrl, CounterWrEnable));
+
+		if (ppd->cpspec->ibsymdelta || ppd->cpspec->ibdeltainprog) {
+			val = read_7220_creg32(dd, cr_ibsymbolerr);
+			if (ppd->cpspec->ibdeltainprog)
+				val -= val - ppd->cpspec->ibsymsnap;
+			val -= ppd->cpspec->ibsymdelta;
+			write_7220_creg(dd, cr_ibsymbolerr, val);
+		}
+		if (ppd->cpspec->iblnkerrdelta || ppd->cpspec->ibdeltainprog) {
+			val = read_7220_creg32(dd, cr_iblinkerrrecov);
+			if (ppd->cpspec->ibdeltainprog)
+				val -= val - ppd->cpspec->iblnkerrsnap;
+			val -= ppd->cpspec->iblnkerrdelta;
+			write_7220_creg(dd, cr_iblinkerrrecov, val);
+		}
+
+		/* and disable counter writes */
+		qib_write_kreg(dd, kr_hwdiagctrl, diagc);
+	}
+	qib_set_ib_7220_lstate(ppd, 0, QLOGIC_IB_IBCC_LINKINITCMD_DISABLE);
+
+	spin_lock_irqsave(&ppd->lflags_lock, flags);
+	ppd->lflags &= ~QIBL_IB_AUTONEG_INPROG;
+	spin_unlock_irqrestore(&ppd->lflags_lock, flags);
+	wake_up(&ppd->cpspec->autoneg_wait);
+	cancel_delayed_work_sync(&ppd->cpspec->autoneg_work);
+
+	shutdown_7220_relock_poll(ppd->dd);
+	val = qib_read_kreg64(ppd->dd, kr_xgxs_cfg);
+	val |= QLOGIC_IB_XGXS_RESET;
+	qib_write_kreg(ppd->dd, kr_xgxs_cfg, val);
+}
+
+/**
+ * qib_setup_7220_setextled - set the state of the two external LEDs
+ * @dd: the qlogic_ib device
+ * @on: whether the link is up or not
+ *
+ * The exact combo of LEDs if on is true is determined by looking
+ * at the ibcstatus.
+ *
+ * These LEDs indicate the physical and logical state of IB link.
+ * For this chip (at least with recommended board pinouts), LED1
+ * is Yellow (logical state) and LED2 is Green (physical state),
+ *
+ * Note:  We try to match the Mellanox HCA LED behavior as best
+ * we can.  Green indicates physical link state is OK (something is
+ * plugged in, and we can train).
+ * Amber indicates the link is logically up (ACTIVE).
+ * Mellanox further blinks the amber LED to indicate data packet
+ * activity, but we have no hardware support for that, so it would
+ * require waking up every 10-20 msecs and checking the counters
+ * on the chip, and then turning the LED off if appropriate.  That's
+ * visible overhead, so not something we will do.
+ *
+ */
+static void qib_setup_7220_setextled(struct qib_pportdata *ppd, u32 on)
+{
+	struct qib_devdata *dd = ppd->dd;
+	u64 extctl, ledblink = 0, val, lst, ltst;
+	unsigned long flags;
+
+	/*
+	 * The diags use the LED to indicate diag info, so we leave
+	 * the external LED alone when the diags are running.
+	 */
+	if (dd->diag_client)
+		return;
+
+	if (ppd->led_override) {
+		ltst = (ppd->led_override & QIB_LED_PHYS) ?
+			IB_PHYSPORTSTATE_LINKUP : IB_PHYSPORTSTATE_DISABLED,
+		lst = (ppd->led_override & QIB_LED_LOG) ?
+			IB_PORT_ACTIVE : IB_PORT_DOWN;
+	} else if (on) {
+		val = qib_read_kreg64(dd, kr_ibcstatus);
+		ltst = qib_7220_phys_portstate(val);
+		lst = qib_7220_iblink_state(val);
+	} else {
+		ltst = 0;
+		lst = 0;
+	}
+
+	spin_lock_irqsave(&dd->cspec->gpio_lock, flags);
+	extctl = dd->cspec->extctrl & ~(SYM_MASK(EXTCtrl, LEDPriPortGreenOn) |
+				 SYM_MASK(EXTCtrl, LEDPriPortYellowOn));
+	if (ltst == IB_PHYSPORTSTATE_LINKUP) {
+		extctl |= SYM_MASK(EXTCtrl, LEDPriPortGreenOn);
+		/*
+		 * counts are in chip clock (4ns) periods.
+		 * This is 1/16 sec (66.6ms) on,
+		 * 3/16 sec (187.5 ms) off, with packets rcvd
+		 */
+		ledblink = ((66600 * 1000UL / 4) << IBA7220_LEDBLINK_ON_SHIFT)
+			| ((187500 * 1000UL / 4) << IBA7220_LEDBLINK_OFF_SHIFT);
+	}
+	if (lst == IB_PORT_ACTIVE)
+		extctl |= SYM_MASK(EXTCtrl, LEDPriPortYellowOn);
+	dd->cspec->extctrl = extctl;
+	qib_write_kreg(dd, kr_extctrl, extctl);
+	spin_unlock_irqrestore(&dd->cspec->gpio_lock, flags);
+
+	if (ledblink) /* blink the LED on packet receive */
+		qib_write_kreg(dd, kr_rcvpktledcnt, ledblink);
+}
+
+static void qib_7220_free_irq(struct qib_devdata *dd)
+{
+	if (dd->cspec->irq) {
+		free_irq(dd->cspec->irq, dd);
+		dd->cspec->irq = 0;
+	}
+	qib_nomsi(dd);
+}
+
+/*
+ * qib_setup_7220_cleanup - clean up any per-chip chip-specific stuff
+ * @dd: the qlogic_ib device
+ *
+ * This is called during driver unload.
+ *
+ */
+static void qib_setup_7220_cleanup(struct qib_devdata *dd)
+{
+	qib_7220_free_irq(dd);
+	kfree(dd->cspec->cntrs);
+	kfree(dd->cspec->portcntrs);
+}
+
+/*
+ * This is only called for SDmaInt.
+ * SDmaDisabled is handled on the error path.
+ */
+static void sdma_7220_intr(struct qib_pportdata *ppd, u64 istat)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&ppd->sdma_lock, flags);
+
+	switch (ppd->sdma_state.current_state) {
+	case qib_sdma_state_s00_hw_down:
+		break;
+
+	case qib_sdma_state_s10_hw_start_up_wait:
+		__qib_sdma_process_event(ppd, qib_sdma_event_e20_hw_started);
+		break;
+
+	case qib_sdma_state_s20_idle:
+		break;
+
+	case qib_sdma_state_s30_sw_clean_up_wait:
+		break;
+
+	case qib_sdma_state_s40_hw_clean_up_wait:
+		break;
+
+	case qib_sdma_state_s50_hw_halt_wait:
+		__qib_sdma_process_event(ppd, qib_sdma_event_e60_hw_halted);
+		break;
+
+	case qib_sdma_state_s99_running:
+		/* too chatty to print here */
+		__qib_sdma_intr(ppd);
+		break;
+	}
+	spin_unlock_irqrestore(&ppd->sdma_lock, flags);
+}
+
+static void qib_wantpiobuf_7220_intr(struct qib_devdata *dd, u32 needint)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&dd->sendctrl_lock, flags);
+	if (needint) {
+		if (!(dd->sendctrl & SYM_MASK(SendCtrl, SendBufAvailUpd)))
+			goto done;
+		/*
+		 * blip the availupd off, next write will be on, so
+		 * we ensure an avail update, regardless of threshold or
+		 * buffers becoming free, whenever we want an interrupt
+		 */
+		qib_write_kreg(dd, kr_sendctrl, dd->sendctrl &
+			~SYM_MASK(SendCtrl, SendBufAvailUpd));
+		qib_write_kreg(dd, kr_scratch, 0ULL);
+		dd->sendctrl |= SYM_MASK(SendCtrl, SendIntBufAvail);
+	} else
+		dd->sendctrl &= ~SYM_MASK(SendCtrl, SendIntBufAvail);
+	qib_write_kreg(dd, kr_sendctrl, dd->sendctrl);
+	qib_write_kreg(dd, kr_scratch, 0ULL);
+done:
+	spin_unlock_irqrestore(&dd->sendctrl_lock, flags);
+}
+
+/*
+ * Handle errors and unusual events first, separate function
+ * to improve cache hits for fast path interrupt handling.
+ */
+static noinline void unlikely_7220_intr(struct qib_devdata *dd, u64 istat)
+{
+	if (unlikely(istat & ~QLOGIC_IB_I_BITSEXTANT))
+		qib_dev_err(dd,
+			    "interrupt with unknown interrupts %Lx set\n",
+			    istat & ~QLOGIC_IB_I_BITSEXTANT);
+
+	if (istat & QLOGIC_IB_I_GPIO) {
+		u32 gpiostatus;
+
+		/*
+		 * Boards for this chip currently don't use GPIO interrupts,
+		 * so clear by writing GPIOstatus to GPIOclear, and complain
+		 * to alert developer. To avoid endless repeats, clear
+		 * the bits in the mask, since there is some kind of
+		 * programming error or chip problem.
+		 */
+		gpiostatus = qib_read_kreg32(dd, kr_gpio_status);
+		/*
+		 * In theory, writing GPIOstatus to GPIOclear could
+		 * have a bad side-effect on some diagnostic that wanted
+		 * to poll for a status-change, but the various shadows
+		 * make that problematic at best. Diags will just suppress
+		 * all GPIO interrupts during such tests.
+		 */
+		qib_write_kreg(dd, kr_gpio_clear, gpiostatus);
+
+		if (gpiostatus) {
+			const u32 mask = qib_read_kreg32(dd, kr_gpio_mask);
+			u32 gpio_irq = mask & gpiostatus;
+
+			/*
+			 * A bit set in status and (chip) Mask register
+			 * would cause an interrupt. Since we are not
+			 * expecting any, report it. Also check that the
+			 * chip reflects our shadow, report issues,
+			 * and refresh from the shadow.
+			 */
+			/*
+			 * Clear any troublemakers, and update chip
+			 * from shadow
+			 */
+			dd->cspec->gpio_mask &= ~gpio_irq;
+			qib_write_kreg(dd, kr_gpio_mask, dd->cspec->gpio_mask);
+		}
+	}
+
+	if (istat & QLOGIC_IB_I_ERROR) {
+		u64 estat;
+
+		qib_stats.sps_errints++;
+		estat = qib_read_kreg64(dd, kr_errstatus);
+		if (!estat)
+			qib_devinfo(dd->pcidev,
+				"error interrupt (%Lx), but no error bits set!\n",
+				istat);
+		else
+			handle_7220_errors(dd, estat);
+	}
+}
+
+static irqreturn_t qib_7220intr(int irq, void *data)
+{
+	struct qib_devdata *dd = data;
+	irqreturn_t ret;
+	u64 istat;
+	u64 ctxtrbits;
+	u64 rmask;
+	unsigned i;
+
+	if ((dd->flags & (QIB_PRESENT | QIB_BADINTR)) != QIB_PRESENT) {
+		/*
+		 * This return value is not great, but we do not want the
+		 * interrupt core code to remove our interrupt handler
+		 * because we don't appear to be handling an interrupt
+		 * during a chip reset.
+		 */
+		ret = IRQ_HANDLED;
+		goto bail;
+	}
+
+	istat = qib_read_kreg64(dd, kr_intstatus);
+
+	if (unlikely(!istat)) {
+		ret = IRQ_NONE; /* not our interrupt, or already handled */
+		goto bail;
+	}
+	if (unlikely(istat == -1)) {
+		qib_bad_intrstatus(dd);
+		/* don't know if it was our interrupt or not */
+		ret = IRQ_NONE;
+		goto bail;
+	}
+
+	this_cpu_inc(*dd->int_counter);
+	if (unlikely(istat & (~QLOGIC_IB_I_BITSEXTANT |
+			      QLOGIC_IB_I_GPIO | QLOGIC_IB_I_ERROR)))
+		unlikely_7220_intr(dd, istat);
+
+	/*
+	 * Clear the interrupt bits we found set, relatively early, so we
+	 * "know" know the chip will have seen this by the time we process
+	 * the queue, and will re-interrupt if necessary.  The processor
+	 * itself won't take the interrupt again until we return.
+	 */
+	qib_write_kreg(dd, kr_intclear, istat);
+
+	/*
+	 * Handle kernel receive queues before checking for pio buffers
+	 * available since receives can overflow; piobuf waiters can afford
+	 * a few extra cycles, since they were waiting anyway.
+	 */
+	ctxtrbits = istat &
+		((QLOGIC_IB_I_RCVAVAIL_MASK << QLOGIC_IB_I_RCVAVAIL_SHIFT) |
+		 (QLOGIC_IB_I_RCVURG_MASK << QLOGIC_IB_I_RCVURG_SHIFT));
+	if (ctxtrbits) {
+		rmask = (1ULL << QLOGIC_IB_I_RCVAVAIL_SHIFT) |
+			(1ULL << QLOGIC_IB_I_RCVURG_SHIFT);
+		for (i = 0; i < dd->first_user_ctxt; i++) {
+			if (ctxtrbits & rmask) {
+				ctxtrbits &= ~rmask;
+				qib_kreceive(dd->rcd[i], NULL, NULL);
+			}
+			rmask <<= 1;
+		}
+		if (ctxtrbits) {
+			ctxtrbits =
+				(ctxtrbits >> QLOGIC_IB_I_RCVAVAIL_SHIFT) |
+				(ctxtrbits >> QLOGIC_IB_I_RCVURG_SHIFT);
+			qib_handle_urcv(dd, ctxtrbits);
+		}
+	}
+
+	/* only call for SDmaInt */
+	if (istat & QLOGIC_IB_I_SDMAINT)
+		sdma_7220_intr(dd->pport, istat);
+
+	if ((istat & QLOGIC_IB_I_SPIOBUFAVAIL) && (dd->flags & QIB_INITTED))
+		qib_ib_piobufavail(dd);
+
+	ret = IRQ_HANDLED;
+bail:
+	return ret;
+}
+
+/*
+ * Set up our chip-specific interrupt handler.
+ * The interrupt type has already been setup, so
+ * we just need to do the registration and error checking.
+ * If we are using MSI interrupts, we may fall back to
+ * INTx later, if the interrupt handler doesn't get called
+ * within 1/2 second (see verify_interrupt()).
+ */
+static void qib_setup_7220_interrupt(struct qib_devdata *dd)
+{
+	if (!dd->cspec->irq)
+		qib_dev_err(dd,
+			"irq is 0, BIOS error?  Interrupts won't work\n");
+	else {
+		int ret = request_irq(dd->cspec->irq, qib_7220intr,
+			dd->msi_lo ? 0 : IRQF_SHARED,
+			QIB_DRV_NAME, dd);
+
+		if (ret)
+			qib_dev_err(dd,
+				"Couldn't setup %s interrupt (irq=%d): %d\n",
+				dd->msi_lo ?  "MSI" : "INTx",
+				dd->cspec->irq, ret);
+	}
+}
+
+/**
+ * qib_7220_boardname - fill in the board name
+ * @dd: the qlogic_ib device
+ *
+ * info is based on the board revision register
+ */
+static void qib_7220_boardname(struct qib_devdata *dd)
+{
+	char *n;
+	u32 boardid, namelen;
+
+	boardid = SYM_FIELD(dd->revision, Revision,
+			    BoardID);
+
+	switch (boardid) {
+	case 1:
+		n = "InfiniPath_QLE7240";
+		break;
+	case 2:
+		n = "InfiniPath_QLE7280";
+		break;
+	default:
+		qib_dev_err(dd, "Unknown 7220 board with ID %u\n", boardid);
+		n = "Unknown_InfiniPath_7220";
+		break;
+	}
+
+	namelen = strlen(n) + 1;
+	dd->boardname = kmalloc(namelen, GFP_KERNEL);
+	if (!dd->boardname)
+		qib_dev_err(dd, "Failed allocation for board name: %s\n", n);
+	else
+		snprintf(dd->boardname, namelen, "%s", n);
+
+	if (dd->majrev != 5 || !dd->minrev || dd->minrev > 2)
+		qib_dev_err(dd,
+			"Unsupported InfiniPath hardware revision %u.%u!\n",
+			dd->majrev, dd->minrev);
+
+	snprintf(dd->boardversion, sizeof(dd->boardversion),
+		 "ChipABI %u.%u, %s, InfiniPath%u %u.%u, SW Compat %u\n",
+		 QIB_CHIP_VERS_MAJ, QIB_CHIP_VERS_MIN, dd->boardname,
+		 (unsigned)SYM_FIELD(dd->revision, Revision_R, Arch),
+		 dd->majrev, dd->minrev,
+		 (unsigned)SYM_FIELD(dd->revision, Revision_R, SW));
+}
+
+/*
+ * This routine sleeps, so it can only be called from user context, not
+ * from interrupt context.
+ */
+static int qib_setup_7220_reset(struct qib_devdata *dd)
+{
+	u64 val;
+	int i;
+	int ret;
+	u16 cmdval;
+	u8 int_line, clinesz;
+	unsigned long flags;
+
+	qib_pcie_getcmd(dd, &cmdval, &int_line, &clinesz);
+
+	/* Use dev_err so it shows up in logs, etc. */
+	qib_dev_err(dd, "Resetting InfiniPath unit %u\n", dd->unit);
+
+	/* no interrupts till re-initted */
+	qib_7220_set_intr_state(dd, 0);
+
+	dd->pport->cpspec->ibdeltainprog = 0;
+	dd->pport->cpspec->ibsymdelta = 0;
+	dd->pport->cpspec->iblnkerrdelta = 0;
+
+	/*
+	 * Keep chip from being accessed until we are ready.  Use
+	 * writeq() directly, to allow the write even though QIB_PRESENT
+	 * isn't set.
+	 */
+	dd->flags &= ~(QIB_INITTED | QIB_PRESENT);
+	/* so we check interrupts work again */
+	dd->z_int_counter = qib_int_counter(dd);
+	val = dd->control | QLOGIC_IB_C_RESET;
+	writeq(val, &dd->kregbase[kr_control]);
+	mb(); /* prevent compiler reordering around actual reset */
+
+	for (i = 1; i <= 5; i++) {
+		/*
+		 * Allow MBIST, etc. to complete; longer on each retry.
+		 * We sometimes get machine checks from bus timeout if no
+		 * response, so for now, make it *really* long.
+		 */
+		msleep(1000 + (1 + i) * 2000);
+
+		qib_pcie_reenable(dd, cmdval, int_line, clinesz);
+
+		/*
+		 * Use readq directly, so we don't need to mark it as PRESENT
+		 * until we get a successful indication that all is well.
+		 */
+		val = readq(&dd->kregbase[kr_revision]);
+		if (val == dd->revision) {
+			dd->flags |= QIB_PRESENT; /* it's back */
+			ret = qib_reinit_intr(dd);
+			goto bail;
+		}
+	}
+	ret = 0; /* failed */
+
+bail:
+	if (ret) {
+		if (qib_pcie_params(dd, dd->lbus_width, NULL, NULL))
+			qib_dev_err(dd,
+				"Reset failed to setup PCIe or interrupts; continuing anyway\n");
+
+		/* hold IBC in reset, no sends, etc till later */
+		qib_write_kreg(dd, kr_control, 0ULL);
+
+		/* clear the reset error, init error/hwerror mask */
+		qib_7220_init_hwerrors(dd);
+
+		/* do setup similar to speed or link-width changes */
+		if (dd->pport->cpspec->ibcddrctrl & IBA7220_IBC_IBTA_1_2_MASK)
+			dd->cspec->presets_needed = 1;
+		spin_lock_irqsave(&dd->pport->lflags_lock, flags);
+		dd->pport->lflags |= QIBL_IB_FORCE_NOTIFY;
+		dd->pport->lflags &= ~QIBL_IB_AUTONEG_FAILED;
+		spin_unlock_irqrestore(&dd->pport->lflags_lock, flags);
+	}
+
+	return ret;
+}
+
+/**
+ * qib_7220_put_tid - write a TID to the chip
+ * @dd: the qlogic_ib device
+ * @tidptr: pointer to the expected TID (in chip) to update
+ * @tidtype: 0 for eager, 1 for expected
+ * @pa: physical address of in memory buffer; tidinvalid if freeing
+ */
+static void qib_7220_put_tid(struct qib_devdata *dd, u64 __iomem *tidptr,
+			     u32 type, unsigned long pa)
+{
+	if (pa != dd->tidinvalid) {
+		u64 chippa = pa >> IBA7220_TID_PA_SHIFT;
+
+		/* paranoia checks */
+		if (pa != (chippa << IBA7220_TID_PA_SHIFT)) {
+			qib_dev_err(dd, "Physaddr %lx not 2KB aligned!\n",
+				    pa);
+			return;
+		}
+		if (chippa >= (1UL << IBA7220_TID_SZ_SHIFT)) {
+			qib_dev_err(dd,
+				"Physical page address 0x%lx larger than supported\n",
+				pa);
+			return;
+		}
+
+		if (type == RCVHQ_RCV_TYPE_EAGER)
+			chippa |= dd->tidtemplate;
+		else /* for now, always full 4KB page */
+			chippa |= IBA7220_TID_SZ_4K;
+		pa = chippa;
+	}
+	writeq(pa, tidptr);
+	mmiowb();
+}
+
+/**
+ * qib_7220_clear_tids - clear all TID entries for a ctxt, expected and eager
+ * @dd: the qlogic_ib device
+ * @ctxt: the ctxt
+ *
+ * clear all TID entries for a ctxt, expected and eager.
+ * Used from qib_close().  On this chip, TIDs are only 32 bits,
+ * not 64, but they are still on 64 bit boundaries, so tidbase
+ * is declared as u64 * for the pointer math, even though we write 32 bits
+ */
+static void qib_7220_clear_tids(struct qib_devdata *dd,
+				struct qib_ctxtdata *rcd)
+{
+	u64 __iomem *tidbase;
+	unsigned long tidinv;
+	u32 ctxt;
+	int i;
+
+	if (!dd->kregbase || !rcd)
+		return;
+
+	ctxt = rcd->ctxt;
+
+	tidinv = dd->tidinvalid;
+	tidbase = (u64 __iomem *)
+		((char __iomem *)(dd->kregbase) +
+		 dd->rcvtidbase +
+		 ctxt * dd->rcvtidcnt * sizeof(*tidbase));
+
+	for (i = 0; i < dd->rcvtidcnt; i++)
+		qib_7220_put_tid(dd, &tidbase[i], RCVHQ_RCV_TYPE_EXPECTED,
+				 tidinv);
+
+	tidbase = (u64 __iomem *)
+		((char __iomem *)(dd->kregbase) +
+		 dd->rcvegrbase +
+		 rcd->rcvegr_tid_base * sizeof(*tidbase));
+
+	for (i = 0; i < rcd->rcvegrcnt; i++)
+		qib_7220_put_tid(dd, &tidbase[i], RCVHQ_RCV_TYPE_EAGER,
+				 tidinv);
+}
+
+/**
+ * qib_7220_tidtemplate - setup constants for TID updates
+ * @dd: the qlogic_ib device
+ *
+ * We setup stuff that we use a lot, to avoid calculating each time
+ */
+static void qib_7220_tidtemplate(struct qib_devdata *dd)
+{
+	if (dd->rcvegrbufsize == 2048)
+		dd->tidtemplate = IBA7220_TID_SZ_2K;
+	else if (dd->rcvegrbufsize == 4096)
+		dd->tidtemplate = IBA7220_TID_SZ_4K;
+	dd->tidinvalid = 0;
+}
+
+/**
+ * qib_init_7220_get_base_info - set chip-specific flags for user code
+ * @rcd: the qlogic_ib ctxt
+ * @kbase: qib_base_info pointer
+ *
+ * We set the PCIE flag because the lower bandwidth on PCIe vs
+ * HyperTransport can affect some user packet algorithims.
+ */
+static int qib_7220_get_base_info(struct qib_ctxtdata *rcd,
+				  struct qib_base_info *kinfo)
+{
+	kinfo->spi_runtime_flags |= QIB_RUNTIME_PCIE |
+		QIB_RUNTIME_NODMA_RTAIL | QIB_RUNTIME_SDMA;
+
+	if (rcd->dd->flags & QIB_USE_SPCL_TRIG)
+		kinfo->spi_runtime_flags |= QIB_RUNTIME_SPECIAL_TRIGGER;
+
+	return 0;
+}
+
+static struct qib_message_header *
+qib_7220_get_msgheader(struct qib_devdata *dd, __le32 *rhf_addr)
+{
+	u32 offset = qib_hdrget_offset(rhf_addr);
+
+	return (struct qib_message_header *)
+		(rhf_addr - dd->rhf_offset + offset);
+}
+
+static void qib_7220_config_ctxts(struct qib_devdata *dd)
+{
+	unsigned long flags;
+	u32 nchipctxts;
+
+	nchipctxts = qib_read_kreg32(dd, kr_portcnt);
+	dd->cspec->numctxts = nchipctxts;
+	if (qib_n_krcv_queues > 1) {
+		dd->qpn_mask = 0x3e;
+		dd->first_user_ctxt = qib_n_krcv_queues * dd->num_pports;
+		if (dd->first_user_ctxt > nchipctxts)
+			dd->first_user_ctxt = nchipctxts;
+	} else
+		dd->first_user_ctxt = dd->num_pports;
+	dd->n_krcv_queues = dd->first_user_ctxt;
+
+	if (!qib_cfgctxts) {
+		int nctxts = dd->first_user_ctxt + num_online_cpus();
+
+		if (nctxts <= 5)
+			dd->ctxtcnt = 5;
+		else if (nctxts <= 9)
+			dd->ctxtcnt = 9;
+		else if (nctxts <= nchipctxts)
+			dd->ctxtcnt = nchipctxts;
+	} else if (qib_cfgctxts <= nchipctxts)
+		dd->ctxtcnt = qib_cfgctxts;
+	if (!dd->ctxtcnt) /* none of the above, set to max */
+		dd->ctxtcnt = nchipctxts;
+
+	/*
+	 * Chip can be configured for 5, 9, or 17 ctxts, and choice
+	 * affects number of eager TIDs per ctxt (1K, 2K, 4K).
+	 * Lock to be paranoid about later motion, etc.
+	 */
+	spin_lock_irqsave(&dd->cspec->rcvmod_lock, flags);
+	if (dd->ctxtcnt > 9)
+		dd->rcvctrl |= 2ULL << IBA7220_R_CTXTCFG_SHIFT;
+	else if (dd->ctxtcnt > 5)
+		dd->rcvctrl |= 1ULL << IBA7220_R_CTXTCFG_SHIFT;
+	/* else configure for default 5 receive ctxts */
+	if (dd->qpn_mask)
+		dd->rcvctrl |= 1ULL << QIB_7220_RcvCtrl_RcvQPMapEnable_LSB;
+	qib_write_kreg(dd, kr_rcvctrl, dd->rcvctrl);
+	spin_unlock_irqrestore(&dd->cspec->rcvmod_lock, flags);
+
+	/* kr_rcvegrcnt changes based on the number of contexts enabled */
+	dd->cspec->rcvegrcnt = qib_read_kreg32(dd, kr_rcvegrcnt);
+	dd->rcvhdrcnt = max(dd->cspec->rcvegrcnt, IBA7220_KRCVEGRCNT);
+}
+
+static int qib_7220_get_ib_cfg(struct qib_pportdata *ppd, int which)
+{
+	int lsb, ret = 0;
+	u64 maskr; /* right-justified mask */
+
+	switch (which) {
+	case QIB_IB_CFG_LWID_ENB: /* Get allowed Link-width */
+		ret = ppd->link_width_enabled;
+		goto done;
+
+	case QIB_IB_CFG_LWID: /* Get currently active Link-width */
+		ret = ppd->link_width_active;
+		goto done;
+
+	case QIB_IB_CFG_SPD_ENB: /* Get allowed Link speeds */
+		ret = ppd->link_speed_enabled;
+		goto done;
+
+	case QIB_IB_CFG_SPD: /* Get current Link spd */
+		ret = ppd->link_speed_active;
+		goto done;
+
+	case QIB_IB_CFG_RXPOL_ENB: /* Get Auto-RX-polarity enable */
+		lsb = IBA7220_IBC_RXPOL_SHIFT;
+		maskr = IBA7220_IBC_RXPOL_MASK;
+		break;
+
+	case QIB_IB_CFG_LREV_ENB: /* Get Auto-Lane-reversal enable */
+		lsb = IBA7220_IBC_LREV_SHIFT;
+		maskr = IBA7220_IBC_LREV_MASK;
+		break;
+
+	case QIB_IB_CFG_LINKLATENCY:
+		ret = qib_read_kreg64(ppd->dd, kr_ibcddrstatus)
+			& IBA7220_DDRSTAT_LINKLAT_MASK;
+		goto done;
+
+	case QIB_IB_CFG_OP_VLS:
+		ret = ppd->vls_operational;
+		goto done;
+
+	case QIB_IB_CFG_VL_HIGH_CAP:
+		ret = 0;
+		goto done;
+
+	case QIB_IB_CFG_VL_LOW_CAP:
+		ret = 0;
+		goto done;
+
+	case QIB_IB_CFG_OVERRUN_THRESH: /* IB overrun threshold */
+		ret = SYM_FIELD(ppd->cpspec->ibcctrl, IBCCtrl,
+				OverrunThreshold);
+		goto done;
+
+	case QIB_IB_CFG_PHYERR_THRESH: /* IB PHY error threshold */
+		ret = SYM_FIELD(ppd->cpspec->ibcctrl, IBCCtrl,
+				PhyerrThreshold);
+		goto done;
+
+	case QIB_IB_CFG_LINKDEFAULT: /* IB link default (sleep/poll) */
+		/* will only take effect when the link state changes */
+		ret = (ppd->cpspec->ibcctrl &
+		       SYM_MASK(IBCCtrl, LinkDownDefaultState)) ?
+			IB_LINKINITCMD_SLEEP : IB_LINKINITCMD_POLL;
+		goto done;
+
+	case QIB_IB_CFG_HRTBT: /* Get Heartbeat off/enable/auto */
+		lsb = IBA7220_IBC_HRTBT_SHIFT;
+		maskr = IBA7220_IBC_HRTBT_MASK;
+		break;
+
+	case QIB_IB_CFG_PMA_TICKS:
+		/*
+		 * 0x00 = 10x link transfer rate or 4 nsec. for 2.5Gbs
+		 * Since the clock is always 250MHz, the value is 1 or 0.
+		 */
+		ret = (ppd->link_speed_active == QIB_IB_DDR);
+		goto done;
+
+	default:
+		ret = -EINVAL;
+		goto done;
+	}
+	ret = (int)((ppd->cpspec->ibcddrctrl >> lsb) & maskr);
+done:
+	return ret;
+}
+
+static int qib_7220_set_ib_cfg(struct qib_pportdata *ppd, int which, u32 val)
+{
+	struct qib_devdata *dd = ppd->dd;
+	u64 maskr; /* right-justified mask */
+	int lsb, ret = 0, setforce = 0;
+	u16 lcmd, licmd;
+	unsigned long flags;
+	u32 tmp = 0;
+
+	switch (which) {
+	case QIB_IB_CFG_LIDLMC:
+		/*
+		 * Set LID and LMC. Combined to avoid possible hazard
+		 * caller puts LMC in 16MSbits, DLID in 16LSbits of val
+		 */
+		lsb = IBA7220_IBC_DLIDLMC_SHIFT;
+		maskr = IBA7220_IBC_DLIDLMC_MASK;
+		break;
+
+	case QIB_IB_CFG_LWID_ENB: /* set allowed Link-width */
+		/*
+		 * As with speed, only write the actual register if
+		 * the link is currently down, otherwise takes effect
+		 * on next link change.
+		 */
+		ppd->link_width_enabled = val;
+		if (!(ppd->lflags & QIBL_LINKDOWN))
+			goto bail;
+		/*
+		 * We set the QIBL_IB_FORCE_NOTIFY bit so updown
+		 * will get called because we want update
+		 * link_width_active, and the change may not take
+		 * effect for some time (if we are in POLL), so this
+		 * flag will force the updown routine to be called
+		 * on the next ibstatuschange down interrupt, even
+		 * if it's not an down->up transition.
+		 */
+		val--; /* convert from IB to chip */
+		maskr = IBA7220_IBC_WIDTH_MASK;
+		lsb = IBA7220_IBC_WIDTH_SHIFT;
+		setforce = 1;
+		break;
+
+	case QIB_IB_CFG_SPD_ENB: /* set allowed Link speeds */
+		/*
+		 * If we turn off IB1.2, need to preset SerDes defaults,
+		 * but not right now. Set a flag for the next time
+		 * we command the link down.  As with width, only write the
+		 * actual register if the link is currently down, otherwise
+		 * takes effect on next link change.  Since setting is being
+		 * explicitly requested (via MAD or sysfs), clear autoneg
+		 * failure status if speed autoneg is enabled.
+		 */
+		ppd->link_speed_enabled = val;
+		if ((ppd->cpspec->ibcddrctrl & IBA7220_IBC_IBTA_1_2_MASK) &&
+		    !(val & (val - 1)))
+			dd->cspec->presets_needed = 1;
+		if (!(ppd->lflags & QIBL_LINKDOWN))
+			goto bail;
+		/*
+		 * We set the QIBL_IB_FORCE_NOTIFY bit so updown
+		 * will get called because we want update
+		 * link_speed_active, and the change may not take
+		 * effect for some time (if we are in POLL), so this
+		 * flag will force the updown routine to be called
+		 * on the next ibstatuschange down interrupt, even
+		 * if it's not an down->up transition.
+		 */
+		if (val == (QIB_IB_SDR | QIB_IB_DDR)) {
+			val = IBA7220_IBC_SPEED_AUTONEG_MASK |
+				IBA7220_IBC_IBTA_1_2_MASK;
+			spin_lock_irqsave(&ppd->lflags_lock, flags);
+			ppd->lflags &= ~QIBL_IB_AUTONEG_FAILED;
+			spin_unlock_irqrestore(&ppd->lflags_lock, flags);
+		} else
+			val = val == QIB_IB_DDR ?
+				IBA7220_IBC_SPEED_DDR : IBA7220_IBC_SPEED_SDR;
+		maskr = IBA7220_IBC_SPEED_AUTONEG_MASK |
+			IBA7220_IBC_IBTA_1_2_MASK;
+		/* IBTA 1.2 mode + speed bits are contiguous */
+		lsb = SYM_LSB(IBCDDRCtrl, IB_ENHANCED_MODE);
+		setforce = 1;
+		break;
+
+	case QIB_IB_CFG_RXPOL_ENB: /* set Auto-RX-polarity enable */
+		lsb = IBA7220_IBC_RXPOL_SHIFT;
+		maskr = IBA7220_IBC_RXPOL_MASK;
+		break;
+
+	case QIB_IB_CFG_LREV_ENB: /* set Auto-Lane-reversal enable */
+		lsb = IBA7220_IBC_LREV_SHIFT;
+		maskr = IBA7220_IBC_LREV_MASK;
+		break;
+
+	case QIB_IB_CFG_OVERRUN_THRESH: /* IB overrun threshold */
+		maskr = SYM_FIELD(ppd->cpspec->ibcctrl, IBCCtrl,
+				  OverrunThreshold);
+		if (maskr != val) {
+			ppd->cpspec->ibcctrl &=
+				~SYM_MASK(IBCCtrl, OverrunThreshold);
+			ppd->cpspec->ibcctrl |= (u64) val <<
+				SYM_LSB(IBCCtrl, OverrunThreshold);
+			qib_write_kreg(dd, kr_ibcctrl, ppd->cpspec->ibcctrl);
+			qib_write_kreg(dd, kr_scratch, 0);
+		}
+		goto bail;
+
+	case QIB_IB_CFG_PHYERR_THRESH: /* IB PHY error threshold */
+		maskr = SYM_FIELD(ppd->cpspec->ibcctrl, IBCCtrl,
+				  PhyerrThreshold);
+		if (maskr != val) {
+			ppd->cpspec->ibcctrl &=
+				~SYM_MASK(IBCCtrl, PhyerrThreshold);
+			ppd->cpspec->ibcctrl |= (u64) val <<
+				SYM_LSB(IBCCtrl, PhyerrThreshold);
+			qib_write_kreg(dd, kr_ibcctrl, ppd->cpspec->ibcctrl);
+			qib_write_kreg(dd, kr_scratch, 0);
+		}
+		goto bail;
+
+	case QIB_IB_CFG_PKEYS: /* update pkeys */
+		maskr = (u64) ppd->pkeys[0] | ((u64) ppd->pkeys[1] << 16) |
+			((u64) ppd->pkeys[2] << 32) |
+			((u64) ppd->pkeys[3] << 48);
+		qib_write_kreg(dd, kr_partitionkey, maskr);
+		goto bail;
+
+	case QIB_IB_CFG_LINKDEFAULT: /* IB link default (sleep/poll) */
+		/* will only take effect when the link state changes */
+		if (val == IB_LINKINITCMD_POLL)
+			ppd->cpspec->ibcctrl &=
+				~SYM_MASK(IBCCtrl, LinkDownDefaultState);
+		else /* SLEEP */
+			ppd->cpspec->ibcctrl |=
+				SYM_MASK(IBCCtrl, LinkDownDefaultState);
+		qib_write_kreg(dd, kr_ibcctrl, ppd->cpspec->ibcctrl);
+		qib_write_kreg(dd, kr_scratch, 0);
+		goto bail;
+
+	case QIB_IB_CFG_MTU: /* update the MTU in IBC */
+		/*
+		 * Update our housekeeping variables, and set IBC max
+		 * size, same as init code; max IBC is max we allow in
+		 * buffer, less the qword pbc, plus 1 for ICRC, in dwords
+		 * Set even if it's unchanged, print debug message only
+		 * on changes.
+		 */
+		val = (ppd->ibmaxlen >> 2) + 1;
+		ppd->cpspec->ibcctrl &= ~SYM_MASK(IBCCtrl, MaxPktLen);
+		ppd->cpspec->ibcctrl |= (u64)val << SYM_LSB(IBCCtrl, MaxPktLen);
+		qib_write_kreg(dd, kr_ibcctrl, ppd->cpspec->ibcctrl);
+		qib_write_kreg(dd, kr_scratch, 0);
+		goto bail;
+
+	case QIB_IB_CFG_LSTATE: /* set the IB link state */
+		switch (val & 0xffff0000) {
+		case IB_LINKCMD_DOWN:
+			lcmd = QLOGIC_IB_IBCC_LINKCMD_DOWN;
+			if (!ppd->cpspec->ibdeltainprog &&
+			    qib_compat_ddr_negotiate) {
+				ppd->cpspec->ibdeltainprog = 1;
+				ppd->cpspec->ibsymsnap =
+					read_7220_creg32(dd, cr_ibsymbolerr);
+				ppd->cpspec->iblnkerrsnap =
+					read_7220_creg32(dd, cr_iblinkerrrecov);
+			}
+			break;
+
+		case IB_LINKCMD_ARMED:
+			lcmd = QLOGIC_IB_IBCC_LINKCMD_ARMED;
+			break;
+
+		case IB_LINKCMD_ACTIVE:
+			lcmd = QLOGIC_IB_IBCC_LINKCMD_ACTIVE;
+			break;
+
+		default:
+			ret = -EINVAL;
+			qib_dev_err(dd, "bad linkcmd req 0x%x\n", val >> 16);
+			goto bail;
+		}
+		switch (val & 0xffff) {
+		case IB_LINKINITCMD_NOP:
+			licmd = 0;
+			break;
+
+		case IB_LINKINITCMD_POLL:
+			licmd = QLOGIC_IB_IBCC_LINKINITCMD_POLL;
+			break;
+
+		case IB_LINKINITCMD_SLEEP:
+			licmd = QLOGIC_IB_IBCC_LINKINITCMD_SLEEP;
+			break;
+
+		case IB_LINKINITCMD_DISABLE:
+			licmd = QLOGIC_IB_IBCC_LINKINITCMD_DISABLE;
+			ppd->cpspec->chase_end = 0;
+			/*
+			 * stop state chase counter and timer, if running.
+			 * wait forpending timer, but don't clear .data (ppd)!
+			 */
+			if (ppd->cpspec->chase_timer.expires) {
+				del_timer_sync(&ppd->cpspec->chase_timer);
+				ppd->cpspec->chase_timer.expires = 0;
+			}
+			break;
+
+		default:
+			ret = -EINVAL;
+			qib_dev_err(dd, "bad linkinitcmd req 0x%x\n",
+				    val & 0xffff);
+			goto bail;
+		}
+		qib_set_ib_7220_lstate(ppd, lcmd, licmd);
+
+		maskr = IBA7220_IBC_WIDTH_MASK;
+		lsb = IBA7220_IBC_WIDTH_SHIFT;
+		tmp = (ppd->cpspec->ibcddrctrl >> lsb) & maskr;
+		/* If the width active on the chip does not match the
+		 * width in the shadow register, write the new active
+		 * width to the chip.
+		 * We don't have to worry about speed as the speed is taken
+		 * care of by set_7220_ibspeed_fast called by ib_updown.
+		 */
+		if (ppd->link_width_enabled-1 != tmp) {
+			ppd->cpspec->ibcddrctrl &= ~(maskr << lsb);
+			ppd->cpspec->ibcddrctrl |=
+				(((u64)(ppd->link_width_enabled-1) & maskr) <<
+				 lsb);
+			qib_write_kreg(dd, kr_ibcddrctrl,
+				       ppd->cpspec->ibcddrctrl);
+			qib_write_kreg(dd, kr_scratch, 0);
+			spin_lock_irqsave(&ppd->lflags_lock, flags);
+			ppd->lflags |= QIBL_IB_FORCE_NOTIFY;
+			spin_unlock_irqrestore(&ppd->lflags_lock, flags);
+		}
+		goto bail;
+
+	case QIB_IB_CFG_HRTBT: /* set Heartbeat off/enable/auto */
+		if (val > IBA7220_IBC_HRTBT_MASK) {
+			ret = -EINVAL;
+			goto bail;
+		}
+		lsb = IBA7220_IBC_HRTBT_SHIFT;
+		maskr = IBA7220_IBC_HRTBT_MASK;
+		break;
+
+	default:
+		ret = -EINVAL;
+		goto bail;
+	}
+	ppd->cpspec->ibcddrctrl &= ~(maskr << lsb);
+	ppd->cpspec->ibcddrctrl |= (((u64) val & maskr) << lsb);
+	qib_write_kreg(dd, kr_ibcddrctrl, ppd->cpspec->ibcddrctrl);
+	qib_write_kreg(dd, kr_scratch, 0);
+	if (setforce) {
+		spin_lock_irqsave(&ppd->lflags_lock, flags);
+		ppd->lflags |= QIBL_IB_FORCE_NOTIFY;
+		spin_unlock_irqrestore(&ppd->lflags_lock, flags);
+	}
+bail:
+	return ret;
+}
+
+static int qib_7220_set_loopback(struct qib_pportdata *ppd, const char *what)
+{
+	int ret = 0;
+	u64 val, ddr;
+
+	if (!strncmp(what, "ibc", 3)) {
+		ppd->cpspec->ibcctrl |= SYM_MASK(IBCCtrl, Loopback);
+		val = 0; /* disable heart beat, so link will come up */
+		qib_devinfo(ppd->dd->pcidev, "Enabling IB%u:%u IBC loopback\n",
+			 ppd->dd->unit, ppd->port);
+	} else if (!strncmp(what, "off", 3)) {
+		ppd->cpspec->ibcctrl &= ~SYM_MASK(IBCCtrl, Loopback);
+		/* enable heart beat again */
+		val = IBA7220_IBC_HRTBT_MASK << IBA7220_IBC_HRTBT_SHIFT;
+		qib_devinfo(ppd->dd->pcidev,
+			"Disabling IB%u:%u IBC loopback (normal)\n",
+			ppd->dd->unit, ppd->port);
+	} else
+		ret = -EINVAL;
+	if (!ret) {
+		qib_write_kreg(ppd->dd, kr_ibcctrl, ppd->cpspec->ibcctrl);
+		ddr = ppd->cpspec->ibcddrctrl & ~(IBA7220_IBC_HRTBT_MASK
+					     << IBA7220_IBC_HRTBT_SHIFT);
+		ppd->cpspec->ibcddrctrl = ddr | val;
+		qib_write_kreg(ppd->dd, kr_ibcddrctrl,
+			       ppd->cpspec->ibcddrctrl);
+		qib_write_kreg(ppd->dd, kr_scratch, 0);
+	}
+	return ret;
+}
+
+static void qib_update_7220_usrhead(struct qib_ctxtdata *rcd, u64 hd,
+				    u32 updegr, u32 egrhd, u32 npkts)
+{
+	if (updegr)
+		qib_write_ureg(rcd->dd, ur_rcvegrindexhead, egrhd, rcd->ctxt);
+	mmiowb();
+	qib_write_ureg(rcd->dd, ur_rcvhdrhead, hd, rcd->ctxt);
+	mmiowb();
+}
+
+static u32 qib_7220_hdrqempty(struct qib_ctxtdata *rcd)
+{
+	u32 head, tail;
+
+	head = qib_read_ureg32(rcd->dd, ur_rcvhdrhead, rcd->ctxt);
+	if (rcd->rcvhdrtail_kvaddr)
+		tail = qib_get_rcvhdrtail(rcd);
+	else
+		tail = qib_read_ureg32(rcd->dd, ur_rcvhdrtail, rcd->ctxt);
+	return head == tail;
+}
+
+/*
+ * Modify the RCVCTRL register in chip-specific way. This
+ * is a function because bit positions and (future) register
+ * location is chip-specifc, but the needed operations are
+ * generic. <op> is a bit-mask because we often want to
+ * do multiple modifications.
+ */
+static void rcvctrl_7220_mod(struct qib_pportdata *ppd, unsigned int op,
+			     int ctxt)
+{
+	struct qib_devdata *dd = ppd->dd;
+	u64 mask, val;
+	unsigned long flags;
+
+	spin_lock_irqsave(&dd->cspec->rcvmod_lock, flags);
+	if (op & QIB_RCVCTRL_TAILUPD_ENB)
+		dd->rcvctrl |= (1ULL << IBA7220_R_TAILUPD_SHIFT);
+	if (op & QIB_RCVCTRL_TAILUPD_DIS)
+		dd->rcvctrl &= ~(1ULL << IBA7220_R_TAILUPD_SHIFT);
+	if (op & QIB_RCVCTRL_PKEY_ENB)
+		dd->rcvctrl &= ~(1ULL << IBA7220_R_PKEY_DIS_SHIFT);
+	if (op & QIB_RCVCTRL_PKEY_DIS)
+		dd->rcvctrl |= (1ULL << IBA7220_R_PKEY_DIS_SHIFT);
+	if (ctxt < 0)
+		mask = (1ULL << dd->ctxtcnt) - 1;
+	else
+		mask = (1ULL << ctxt);
+	if (op & QIB_RCVCTRL_CTXT_ENB) {
+		/* always done for specific ctxt */
+		dd->rcvctrl |= (mask << SYM_LSB(RcvCtrl, PortEnable));
+		if (!(dd->flags & QIB_NODMA_RTAIL))
+			dd->rcvctrl |= 1ULL << IBA7220_R_TAILUPD_SHIFT;
+		/* Write these registers before the context is enabled. */
+		qib_write_kreg_ctxt(dd, kr_rcvhdrtailaddr, ctxt,
+			dd->rcd[ctxt]->rcvhdrqtailaddr_phys);
+		qib_write_kreg_ctxt(dd, kr_rcvhdraddr, ctxt,
+			dd->rcd[ctxt]->rcvhdrq_phys);
+		dd->rcd[ctxt]->seq_cnt = 1;
+	}
+	if (op & QIB_RCVCTRL_CTXT_DIS)
+		dd->rcvctrl &= ~(mask << SYM_LSB(RcvCtrl, PortEnable));
+	if (op & QIB_RCVCTRL_INTRAVAIL_ENB)
+		dd->rcvctrl |= (mask << IBA7220_R_INTRAVAIL_SHIFT);
+	if (op & QIB_RCVCTRL_INTRAVAIL_DIS)
+		dd->rcvctrl &= ~(mask << IBA7220_R_INTRAVAIL_SHIFT);
+	qib_write_kreg(dd, kr_rcvctrl, dd->rcvctrl);
+	if ((op & QIB_RCVCTRL_INTRAVAIL_ENB) && dd->rhdrhead_intr_off) {
+		/* arm rcv interrupt */
+		val = qib_read_ureg32(dd, ur_rcvhdrhead, ctxt) |
+			dd->rhdrhead_intr_off;
+		qib_write_ureg(dd, ur_rcvhdrhead, val, ctxt);
+	}
+	if (op & QIB_RCVCTRL_CTXT_ENB) {
+		/*
+		 * Init the context registers also; if we were
+		 * disabled, tail and head should both be zero
+		 * already from the enable, but since we don't
+		 * know, we have to do it explicitly.
+		 */
+		val = qib_read_ureg32(dd, ur_rcvegrindextail, ctxt);
+		qib_write_ureg(dd, ur_rcvegrindexhead, val, ctxt);
+
+		val = qib_read_ureg32(dd, ur_rcvhdrtail, ctxt);
+		dd->rcd[ctxt]->head = val;
+		/* If kctxt, interrupt on next receive. */
+		if (ctxt < dd->first_user_ctxt)
+			val |= dd->rhdrhead_intr_off;
+		qib_write_ureg(dd, ur_rcvhdrhead, val, ctxt);
+	}
+	if (op & QIB_RCVCTRL_CTXT_DIS) {
+		if (ctxt >= 0) {
+			qib_write_kreg_ctxt(dd, kr_rcvhdrtailaddr, ctxt, 0);
+			qib_write_kreg_ctxt(dd, kr_rcvhdraddr, ctxt, 0);
+		} else {
+			unsigned i;
+
+			for (i = 0; i < dd->cfgctxts; i++) {
+				qib_write_kreg_ctxt(dd, kr_rcvhdrtailaddr,
+						    i, 0);
+				qib_write_kreg_ctxt(dd, kr_rcvhdraddr, i, 0);
+			}
+		}
+	}
+	spin_unlock_irqrestore(&dd->cspec->rcvmod_lock, flags);
+}
+
+/*
+ * Modify the SENDCTRL register in chip-specific way. This
+ * is a function there may be multiple such registers with
+ * slightly different layouts. To start, we assume the
+ * "canonical" register layout of the first chips.
+ * Chip requires no back-back sendctrl writes, so write
+ * scratch register after writing sendctrl
+ */
+static void sendctrl_7220_mod(struct qib_pportdata *ppd, u32 op)
+{
+	struct qib_devdata *dd = ppd->dd;
+	u64 tmp_dd_sendctrl;
+	unsigned long flags;
+
+	spin_lock_irqsave(&dd->sendctrl_lock, flags);
+
+	/* First the ones that are "sticky", saved in shadow */
+	if (op & QIB_SENDCTRL_CLEAR)
+		dd->sendctrl = 0;
+	if (op & QIB_SENDCTRL_SEND_DIS)
+		dd->sendctrl &= ~SYM_MASK(SendCtrl, SPioEnable);
+	else if (op & QIB_SENDCTRL_SEND_ENB) {
+		dd->sendctrl |= SYM_MASK(SendCtrl, SPioEnable);
+		if (dd->flags & QIB_USE_SPCL_TRIG)
+			dd->sendctrl |= SYM_MASK(SendCtrl,
+						 SSpecialTriggerEn);
+	}
+	if (op & QIB_SENDCTRL_AVAIL_DIS)
+		dd->sendctrl &= ~SYM_MASK(SendCtrl, SendBufAvailUpd);
+	else if (op & QIB_SENDCTRL_AVAIL_ENB)
+		dd->sendctrl |= SYM_MASK(SendCtrl, SendBufAvailUpd);
+
+	if (op & QIB_SENDCTRL_DISARM_ALL) {
+		u32 i, last;
+
+		tmp_dd_sendctrl = dd->sendctrl;
+		/*
+		 * disarm any that are not yet launched, disabling sends
+		 * and updates until done.
+		 */
+		last = dd->piobcnt2k + dd->piobcnt4k;
+		tmp_dd_sendctrl &=
+			~(SYM_MASK(SendCtrl, SPioEnable) |
+			  SYM_MASK(SendCtrl, SendBufAvailUpd));
+		for (i = 0; i < last; i++) {
+			qib_write_kreg(dd, kr_sendctrl,
+				       tmp_dd_sendctrl |
+				       SYM_MASK(SendCtrl, Disarm) | i);
+			qib_write_kreg(dd, kr_scratch, 0);
+		}
+	}
+
+	tmp_dd_sendctrl = dd->sendctrl;
+
+	if (op & QIB_SENDCTRL_FLUSH)
+		tmp_dd_sendctrl |= SYM_MASK(SendCtrl, Abort);
+	if (op & QIB_SENDCTRL_DISARM)
+		tmp_dd_sendctrl |= SYM_MASK(SendCtrl, Disarm) |
+			((op & QIB_7220_SendCtrl_DisarmPIOBuf_RMASK) <<
+			 SYM_LSB(SendCtrl, DisarmPIOBuf));
+	if ((op & QIB_SENDCTRL_AVAIL_BLIP) &&
+	    (dd->sendctrl & SYM_MASK(SendCtrl, SendBufAvailUpd)))
+		tmp_dd_sendctrl &= ~SYM_MASK(SendCtrl, SendBufAvailUpd);
+
+	qib_write_kreg(dd, kr_sendctrl, tmp_dd_sendctrl);
+	qib_write_kreg(dd, kr_scratch, 0);
+
+	if (op & QIB_SENDCTRL_AVAIL_BLIP) {
+		qib_write_kreg(dd, kr_sendctrl, dd->sendctrl);
+		qib_write_kreg(dd, kr_scratch, 0);
+	}
+
+	spin_unlock_irqrestore(&dd->sendctrl_lock, flags);
+
+	if (op & QIB_SENDCTRL_FLUSH) {
+		u32 v;
+		/*
+		 * ensure writes have hit chip, then do a few
+		 * more reads, to allow DMA of pioavail registers
+		 * to occur, so in-memory copy is in sync with
+		 * the chip.  Not always safe to sleep.
+		 */
+		v = qib_read_kreg32(dd, kr_scratch);
+		qib_write_kreg(dd, kr_scratch, v);
+		v = qib_read_kreg32(dd, kr_scratch);
+		qib_write_kreg(dd, kr_scratch, v);
+		qib_read_kreg32(dd, kr_scratch);
+	}
+}
+
+/**
+ * qib_portcntr_7220 - read a per-port counter
+ * @dd: the qlogic_ib device
+ * @creg: the counter to snapshot
+ */
+static u64 qib_portcntr_7220(struct qib_pportdata *ppd, u32 reg)
+{
+	u64 ret = 0ULL;
+	struct qib_devdata *dd = ppd->dd;
+	u16 creg;
+	/* 0xffff for unimplemented or synthesized counters */
+	static const u16 xlator[] = {
+		[QIBPORTCNTR_PKTSEND] = cr_pktsend,
+		[QIBPORTCNTR_WORDSEND] = cr_wordsend,
+		[QIBPORTCNTR_PSXMITDATA] = cr_psxmitdatacount,
+		[QIBPORTCNTR_PSXMITPKTS] = cr_psxmitpktscount,
+		[QIBPORTCNTR_PSXMITWAIT] = cr_psxmitwaitcount,
+		[QIBPORTCNTR_SENDSTALL] = cr_sendstall,
+		[QIBPORTCNTR_PKTRCV] = cr_pktrcv,
+		[QIBPORTCNTR_PSRCVDATA] = cr_psrcvdatacount,
+		[QIBPORTCNTR_PSRCVPKTS] = cr_psrcvpktscount,
+		[QIBPORTCNTR_RCVEBP] = cr_rcvebp,
+		[QIBPORTCNTR_RCVOVFL] = cr_rcvovfl,
+		[QIBPORTCNTR_WORDRCV] = cr_wordrcv,
+		[QIBPORTCNTR_RXDROPPKT] = cr_rxdroppkt,
+		[QIBPORTCNTR_RXLOCALPHYERR] = cr_rxotherlocalphyerr,
+		[QIBPORTCNTR_RXVLERR] = cr_rxvlerr,
+		[QIBPORTCNTR_ERRICRC] = cr_erricrc,
+		[QIBPORTCNTR_ERRVCRC] = cr_errvcrc,
+		[QIBPORTCNTR_ERRLPCRC] = cr_errlpcrc,
+		[QIBPORTCNTR_BADFORMAT] = cr_badformat,
+		[QIBPORTCNTR_ERR_RLEN] = cr_err_rlen,
+		[QIBPORTCNTR_IBSYMBOLERR] = cr_ibsymbolerr,
+		[QIBPORTCNTR_INVALIDRLEN] = cr_invalidrlen,
+		[QIBPORTCNTR_UNSUPVL] = cr_txunsupvl,
+		[QIBPORTCNTR_EXCESSBUFOVFL] = cr_excessbufferovfl,
+		[QIBPORTCNTR_ERRLINK] = cr_errlink,
+		[QIBPORTCNTR_IBLINKDOWN] = cr_iblinkdown,
+		[QIBPORTCNTR_IBLINKERRRECOV] = cr_iblinkerrrecov,
+		[QIBPORTCNTR_LLI] = cr_locallinkintegrityerr,
+		[QIBPORTCNTR_PSINTERVAL] = cr_psinterval,
+		[QIBPORTCNTR_PSSTART] = cr_psstart,
+		[QIBPORTCNTR_PSSTAT] = cr_psstat,
+		[QIBPORTCNTR_VL15PKTDROP] = cr_vl15droppedpkt,
+		[QIBPORTCNTR_ERRPKEY] = cr_errpkey,
+		[QIBPORTCNTR_KHDROVFL] = 0xffff,
+	};
+
+	if (reg >= ARRAY_SIZE(xlator)) {
+		qib_devinfo(ppd->dd->pcidev,
+			 "Unimplemented portcounter %u\n", reg);
+		goto done;
+	}
+	creg = xlator[reg];
+
+	if (reg == QIBPORTCNTR_KHDROVFL) {
+		int i;
+
+		/* sum over all kernel contexts */
+		for (i = 0; i < dd->first_user_ctxt; i++)
+			ret += read_7220_creg32(dd, cr_portovfl + i);
+	}
+	if (creg == 0xffff)
+		goto done;
+
+	/*
+	 * only fast incrementing counters are 64bit; use 32 bit reads to
+	 * avoid two independent reads when on opteron
+	 */
+	if ((creg == cr_wordsend || creg == cr_wordrcv ||
+	     creg == cr_pktsend || creg == cr_pktrcv))
+		ret = read_7220_creg(dd, creg);
+	else
+		ret = read_7220_creg32(dd, creg);
+	if (creg == cr_ibsymbolerr) {
+		if (dd->pport->cpspec->ibdeltainprog)
+			ret -= ret - ppd->cpspec->ibsymsnap;
+		ret -= dd->pport->cpspec->ibsymdelta;
+	} else if (creg == cr_iblinkerrrecov) {
+		if (dd->pport->cpspec->ibdeltainprog)
+			ret -= ret - ppd->cpspec->iblnkerrsnap;
+		ret -= dd->pport->cpspec->iblnkerrdelta;
+	}
+done:
+	return ret;
+}
+
+/*
+ * Device counter names (not port-specific), one line per stat,
+ * single string.  Used by utilities like ipathstats to print the stats
+ * in a way which works for different versions of drivers, without changing
+ * the utility.  Names need to be 12 chars or less (w/o newline), for proper
+ * display by utility.
+ * Non-error counters are first.
+ * Start of "error" conters is indicated by a leading "E " on the first
+ * "error" counter, and doesn't count in label length.
+ * The EgrOvfl list needs to be last so we truncate them at the configured
+ * context count for the device.
+ * cntr7220indices contains the corresponding register indices.
+ */
+static const char cntr7220names[] =
+	"Interrupts\n"
+	"HostBusStall\n"
+	"E RxTIDFull\n"
+	"RxTIDInvalid\n"
+	"Ctxt0EgrOvfl\n"
+	"Ctxt1EgrOvfl\n"
+	"Ctxt2EgrOvfl\n"
+	"Ctxt3EgrOvfl\n"
+	"Ctxt4EgrOvfl\n"
+	"Ctxt5EgrOvfl\n"
+	"Ctxt6EgrOvfl\n"
+	"Ctxt7EgrOvfl\n"
+	"Ctxt8EgrOvfl\n"
+	"Ctxt9EgrOvfl\n"
+	"Ctx10EgrOvfl\n"
+	"Ctx11EgrOvfl\n"
+	"Ctx12EgrOvfl\n"
+	"Ctx13EgrOvfl\n"
+	"Ctx14EgrOvfl\n"
+	"Ctx15EgrOvfl\n"
+	"Ctx16EgrOvfl\n";
+
+static const size_t cntr7220indices[] = {
+	cr_lbint,
+	cr_lbflowstall,
+	cr_errtidfull,
+	cr_errtidvalid,
+	cr_portovfl + 0,
+	cr_portovfl + 1,
+	cr_portovfl + 2,
+	cr_portovfl + 3,
+	cr_portovfl + 4,
+	cr_portovfl + 5,
+	cr_portovfl + 6,
+	cr_portovfl + 7,
+	cr_portovfl + 8,
+	cr_portovfl + 9,
+	cr_portovfl + 10,
+	cr_portovfl + 11,
+	cr_portovfl + 12,
+	cr_portovfl + 13,
+	cr_portovfl + 14,
+	cr_portovfl + 15,
+	cr_portovfl + 16,
+};
+
+/*
+ * same as cntr7220names and cntr7220indices, but for port-specific counters.
+ * portcntr7220indices is somewhat complicated by some registers needing
+ * adjustments of various kinds, and those are ORed with _PORT_VIRT_FLAG
+ */
+static const char portcntr7220names[] =
+	"TxPkt\n"
+	"TxFlowPkt\n"
+	"TxWords\n"
+	"RxPkt\n"
+	"RxFlowPkt\n"
+	"RxWords\n"
+	"TxFlowStall\n"
+	"TxDmaDesc\n"  /* 7220 and 7322-only */
+	"E RxDlidFltr\n"  /* 7220 and 7322-only */
+	"IBStatusChng\n"
+	"IBLinkDown\n"
+	"IBLnkRecov\n"
+	"IBRxLinkErr\n"
+	"IBSymbolErr\n"
+	"RxLLIErr\n"
+	"RxBadFormat\n"
+	"RxBadLen\n"
+	"RxBufOvrfl\n"
+	"RxEBP\n"
+	"RxFlowCtlErr\n"
+	"RxICRCerr\n"
+	"RxLPCRCerr\n"
+	"RxVCRCerr\n"
+	"RxInvalLen\n"
+	"RxInvalPKey\n"
+	"RxPktDropped\n"
+	"TxBadLength\n"
+	"TxDropped\n"
+	"TxInvalLen\n"
+	"TxUnderrun\n"
+	"TxUnsupVL\n"
+	"RxLclPhyErr\n" /* 7220 and 7322-only */
+	"RxVL15Drop\n" /* 7220 and 7322-only */
+	"RxVlErr\n" /* 7220 and 7322-only */
+	"XcessBufOvfl\n" /* 7220 and 7322-only */
+	;
+
+#define _PORT_VIRT_FLAG 0x8000 /* "virtual", need adjustments */
+static const size_t portcntr7220indices[] = {
+	QIBPORTCNTR_PKTSEND | _PORT_VIRT_FLAG,
+	cr_pktsendflow,
+	QIBPORTCNTR_WORDSEND | _PORT_VIRT_FLAG,
+	QIBPORTCNTR_PKTRCV | _PORT_VIRT_FLAG,
+	cr_pktrcvflowctrl,
+	QIBPORTCNTR_WORDRCV | _PORT_VIRT_FLAG,
+	QIBPORTCNTR_SENDSTALL | _PORT_VIRT_FLAG,
+	cr_txsdmadesc,
+	cr_rxdlidfltr,
+	cr_ibstatuschange,
+	QIBPORTCNTR_IBLINKDOWN | _PORT_VIRT_FLAG,
+	QIBPORTCNTR_IBLINKERRRECOV | _PORT_VIRT_FLAG,
+	QIBPORTCNTR_ERRLINK | _PORT_VIRT_FLAG,
+	QIBPORTCNTR_IBSYMBOLERR | _PORT_VIRT_FLAG,
+	QIBPORTCNTR_LLI | _PORT_VIRT_FLAG,
+	QIBPORTCNTR_BADFORMAT | _PORT_VIRT_FLAG,
+	QIBPORTCNTR_ERR_RLEN | _PORT_VIRT_FLAG,
+	QIBPORTCNTR_RCVOVFL | _PORT_VIRT_FLAG,
+	QIBPORTCNTR_RCVEBP | _PORT_VIRT_FLAG,
+	cr_rcvflowctrl_err,
+	QIBPORTCNTR_ERRICRC | _PORT_VIRT_FLAG,
+	QIBPORTCNTR_ERRLPCRC | _PORT_VIRT_FLAG,
+	QIBPORTCNTR_ERRVCRC | _PORT_VIRT_FLAG,
+	QIBPORTCNTR_INVALIDRLEN | _PORT_VIRT_FLAG,
+	QIBPORTCNTR_ERRPKEY | _PORT_VIRT_FLAG,
+	QIBPORTCNTR_RXDROPPKT | _PORT_VIRT_FLAG,
+	cr_invalidslen,
+	cr_senddropped,
+	cr_errslen,
+	cr_sendunderrun,
+	cr_txunsupvl,
+	QIBPORTCNTR_RXLOCALPHYERR | _PORT_VIRT_FLAG,
+	QIBPORTCNTR_VL15PKTDROP | _PORT_VIRT_FLAG,
+	QIBPORTCNTR_RXVLERR | _PORT_VIRT_FLAG,
+	QIBPORTCNTR_EXCESSBUFOVFL | _PORT_VIRT_FLAG,
+};
+
+/* do all the setup to make the counter reads efficient later */
+static void init_7220_cntrnames(struct qib_devdata *dd)
+{
+	int i, j = 0;
+	char *s;
+
+	for (i = 0, s = (char *)cntr7220names; s && j <= dd->cfgctxts;
+	     i++) {
+		/* we always have at least one counter before the egrovfl */
+		if (!j && !strncmp("Ctxt0EgrOvfl", s + 1, 12))
+			j = 1;
+		s = strchr(s + 1, '\n');
+		if (s && j)
+			j++;
+	}
+	dd->cspec->ncntrs = i;
+	if (!s)
+		/* full list; size is without terminating null */
+		dd->cspec->cntrnamelen = sizeof(cntr7220names) - 1;
+	else
+		dd->cspec->cntrnamelen = 1 + s - cntr7220names;
+	dd->cspec->cntrs = kmalloc(dd->cspec->ncntrs
+		* sizeof(u64), GFP_KERNEL);
+	if (!dd->cspec->cntrs)
+		qib_dev_err(dd, "Failed allocation for counters\n");
+
+	for (i = 0, s = (char *)portcntr7220names; s; i++)
+		s = strchr(s + 1, '\n');
+	dd->cspec->nportcntrs = i - 1;
+	dd->cspec->portcntrnamelen = sizeof(portcntr7220names) - 1;
+	dd->cspec->portcntrs = kmalloc(dd->cspec->nportcntrs
+		* sizeof(u64), GFP_KERNEL);
+	if (!dd->cspec->portcntrs)
+		qib_dev_err(dd, "Failed allocation for portcounters\n");
+}
+
+static u32 qib_read_7220cntrs(struct qib_devdata *dd, loff_t pos, char **namep,
+			      u64 **cntrp)
+{
+	u32 ret;
+
+	if (!dd->cspec->cntrs) {
+		ret = 0;
+		goto done;
+	}
+
+	if (namep) {
+		*namep = (char *)cntr7220names;
+		ret = dd->cspec->cntrnamelen;
+		if (pos >= ret)
+			ret = 0; /* final read after getting everything */
+	} else {
+		u64 *cntr = dd->cspec->cntrs;
+		int i;
+
+		ret = dd->cspec->ncntrs * sizeof(u64);
+		if (!cntr || pos >= ret) {
+			/* everything read, or couldn't get memory */
+			ret = 0;
+			goto done;
+		}
+
+		*cntrp = cntr;
+		for (i = 0; i < dd->cspec->ncntrs; i++)
+			*cntr++ = read_7220_creg32(dd, cntr7220indices[i]);
+	}
+done:
+	return ret;
+}
+
+static u32 qib_read_7220portcntrs(struct qib_devdata *dd, loff_t pos, u32 port,
+				  char **namep, u64 **cntrp)
+{
+	u32 ret;
+
+	if (!dd->cspec->portcntrs) {
+		ret = 0;
+		goto done;
+	}
+	if (namep) {
+		*namep = (char *)portcntr7220names;
+		ret = dd->cspec->portcntrnamelen;
+		if (pos >= ret)
+			ret = 0; /* final read after getting everything */
+	} else {
+		u64 *cntr = dd->cspec->portcntrs;
+		struct qib_pportdata *ppd = &dd->pport[port];
+		int i;
+
+		ret = dd->cspec->nportcntrs * sizeof(u64);
+		if (!cntr || pos >= ret) {
+			/* everything read, or couldn't get memory */
+			ret = 0;
+			goto done;
+		}
+		*cntrp = cntr;
+		for (i = 0; i < dd->cspec->nportcntrs; i++) {
+			if (portcntr7220indices[i] & _PORT_VIRT_FLAG)
+				*cntr++ = qib_portcntr_7220(ppd,
+					portcntr7220indices[i] &
+					~_PORT_VIRT_FLAG);
+			else
+				*cntr++ = read_7220_creg32(dd,
+					   portcntr7220indices[i]);
+		}
+	}
+done:
+	return ret;
+}
+
+/**
+ * qib_get_7220_faststats - get word counters from chip before they overflow
+ * @opaque - contains a pointer to the qlogic_ib device qib_devdata
+ *
+ * This needs more work; in particular, decision on whether we really
+ * need traffic_wds done the way it is
+ * called from add_timer
+ */
+static void qib_get_7220_faststats(unsigned long opaque)
+{
+	struct qib_devdata *dd = (struct qib_devdata *) opaque;
+	struct qib_pportdata *ppd = dd->pport;
+	unsigned long flags;
+	u64 traffic_wds;
+
+	/*
+	 * don't access the chip while running diags, or memory diags can
+	 * fail
+	 */
+	if (!(dd->flags & QIB_INITTED) || dd->diag_client)
+		/* but re-arm the timer, for diags case; won't hurt other */
+		goto done;
+
+	/*
+	 * We now try to maintain an activity timer, based on traffic
+	 * exceeding a threshold, so we need to check the word-counts
+	 * even if they are 64-bit.
+	 */
+	traffic_wds = qib_portcntr_7220(ppd, cr_wordsend) +
+		qib_portcntr_7220(ppd, cr_wordrcv);
+	spin_lock_irqsave(&dd->eep_st_lock, flags);
+	traffic_wds -= dd->traffic_wds;
+	dd->traffic_wds += traffic_wds;
+	if (traffic_wds  >= QIB_TRAFFIC_ACTIVE_THRESHOLD)
+		atomic_add(5, &dd->active_time); /* S/B #define */
+	spin_unlock_irqrestore(&dd->eep_st_lock, flags);
+done:
+	mod_timer(&dd->stats_timer, jiffies + HZ * ACTIVITY_TIMER);
+}
+
+/*
+ * If we are using MSI, try to fallback to INTx.
+ */
+static int qib_7220_intr_fallback(struct qib_devdata *dd)
+{
+	if (!dd->msi_lo)
+		return 0;
+
+	qib_devinfo(dd->pcidev,
+		"MSI interrupt not detected, trying INTx interrupts\n");
+	qib_7220_free_irq(dd);
+	qib_enable_intx(dd->pcidev);
+	/*
+	 * Some newer kernels require free_irq before disable_msi,
+	 * and irq can be changed during disable and INTx enable
+	 * and we need to therefore use the pcidev->irq value,
+	 * not our saved MSI value.
+	 */
+	dd->cspec->irq = dd->pcidev->irq;
+	qib_setup_7220_interrupt(dd);
+	return 1;
+}
+
+/*
+ * Reset the XGXS (between serdes and IBC).  Slightly less intrusive
+ * than resetting the IBC or external link state, and useful in some
+ * cases to cause some retraining.  To do this right, we reset IBC
+ * as well.
+ */
+static void qib_7220_xgxs_reset(struct qib_pportdata *ppd)
+{
+	u64 val, prev_val;
+	struct qib_devdata *dd = ppd->dd;
+
+	prev_val = qib_read_kreg64(dd, kr_xgxs_cfg);
+	val = prev_val | QLOGIC_IB_XGXS_RESET;
+	prev_val &= ~QLOGIC_IB_XGXS_RESET; /* be sure */
+	qib_write_kreg(dd, kr_control,
+		       dd->control & ~QLOGIC_IB_C_LINKENABLE);
+	qib_write_kreg(dd, kr_xgxs_cfg, val);
+	qib_read_kreg32(dd, kr_scratch);
+	qib_write_kreg(dd, kr_xgxs_cfg, prev_val);
+	qib_write_kreg(dd, kr_control, dd->control);
+}
+
+/*
+ * For this chip, we want to use the same buffer every time
+ * when we are trying to bring the link up (they are always VL15
+ * packets).  At that link state the packet should always go out immediately
+ * (or at least be discarded at the tx interface if the link is down).
+ * If it doesn't, and the buffer isn't available, that means some other
+ * sender has gotten ahead of us, and is preventing our packet from going
+ * out.  In that case, we flush all packets, and try again.  If that still
+ * fails, we fail the request, and hope things work the next time around.
+ *
+ * We don't need very complicated heuristics on whether the packet had
+ * time to go out or not, since even at SDR 1X, it goes out in very short
+ * time periods, covered by the chip reads done here and as part of the
+ * flush.
+ */
+static u32 __iomem *get_7220_link_buf(struct qib_pportdata *ppd, u32 *bnum)
+{
+	u32 __iomem *buf;
+	u32 lbuf = ppd->dd->cspec->lastbuf_for_pio;
+	int do_cleanup;
+	unsigned long flags;
+
+	/*
+	 * always blip to get avail list updated, since it's almost
+	 * always needed, and is fairly cheap.
+	 */
+	sendctrl_7220_mod(ppd->dd->pport, QIB_SENDCTRL_AVAIL_BLIP);
+	qib_read_kreg64(ppd->dd, kr_scratch); /* extra chip flush */
+	buf = qib_getsendbuf_range(ppd->dd, bnum, lbuf, lbuf);
+	if (buf)
+		goto done;
+
+	spin_lock_irqsave(&ppd->sdma_lock, flags);
+	if (ppd->sdma_state.current_state == qib_sdma_state_s20_idle &&
+	    ppd->sdma_state.current_state != qib_sdma_state_s00_hw_down) {
+		__qib_sdma_process_event(ppd, qib_sdma_event_e00_go_hw_down);
+		do_cleanup = 0;
+	} else {
+		do_cleanup = 1;
+		qib_7220_sdma_hw_clean_up(ppd);
+	}
+	spin_unlock_irqrestore(&ppd->sdma_lock, flags);
+
+	if (do_cleanup) {
+		qib_read_kreg64(ppd->dd, kr_scratch); /* extra chip flush */
+		buf = qib_getsendbuf_range(ppd->dd, bnum, lbuf, lbuf);
+	}
+done:
+	return buf;
+}
+
+/*
+ * This code for non-IBTA-compliant IB speed negotiation is only known to
+ * work for the SDR to DDR transition, and only between an HCA and a switch
+ * with recent firmware.  It is based on observed heuristics, rather than
+ * actual knowledge of the non-compliant speed negotiation.
+ * It has a number of hard-coded fields, since the hope is to rewrite this
+ * when a spec is available on how the negoation is intended to work.
+ */
+static void autoneg_7220_sendpkt(struct qib_pportdata *ppd, u32 *hdr,
+				 u32 dcnt, u32 *data)
+{
+	int i;
+	u64 pbc;
+	u32 __iomem *piobuf;
+	u32 pnum;
+	struct qib_devdata *dd = ppd->dd;
+
+	i = 0;
+	pbc = 7 + dcnt + 1; /* 7 dword header, dword data, icrc */
+	pbc |= PBC_7220_VL15_SEND;
+	while (!(piobuf = get_7220_link_buf(ppd, &pnum))) {
+		if (i++ > 5)
+			return;
+		udelay(2);
+	}
+	sendctrl_7220_mod(dd->pport, QIB_SENDCTRL_DISARM_BUF(pnum));
+	writeq(pbc, piobuf);
+	qib_flush_wc();
+	qib_pio_copy(piobuf + 2, hdr, 7);
+	qib_pio_copy(piobuf + 9, data, dcnt);
+	if (dd->flags & QIB_USE_SPCL_TRIG) {
+		u32 spcl_off = (pnum >= dd->piobcnt2k) ? 2047 : 1023;
+
+		qib_flush_wc();
+		__raw_writel(0xaebecede, piobuf + spcl_off);
+	}
+	qib_flush_wc();
+	qib_sendbuf_done(dd, pnum);
+}
+
+/*
+ * _start packet gets sent twice at start, _done gets sent twice at end
+ */
+static void autoneg_7220_send(struct qib_pportdata *ppd, int which)
+{
+	struct qib_devdata *dd = ppd->dd;
+	static u32 swapped;
+	u32 dw, i, hcnt, dcnt, *data;
+	static u32 hdr[7] = { 0xf002ffff, 0x48ffff, 0x6400abba };
+	static u32 madpayload_start[0x40] = {
+		0x1810103, 0x1, 0x0, 0x0, 0x2c90000, 0x2c9, 0x0, 0x0,
+		0xffffffff, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+		0x1, 0x1388, 0x15e, 0x1, /* rest 0's */
+		};
+	static u32 madpayload_done[0x40] = {
+		0x1810103, 0x1, 0x0, 0x0, 0x2c90000, 0x2c9, 0x0, 0x0,
+		0xffffffff, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+		0x40000001, 0x1388, 0x15e, /* rest 0's */
+		};
+
+	dcnt = ARRAY_SIZE(madpayload_start);
+	hcnt = ARRAY_SIZE(hdr);
+	if (!swapped) {
+		/* for maintainability, do it at runtime */
+		for (i = 0; i < hcnt; i++) {
+			dw = (__force u32) cpu_to_be32(hdr[i]);
+			hdr[i] = dw;
+		}
+		for (i = 0; i < dcnt; i++) {
+			dw = (__force u32) cpu_to_be32(madpayload_start[i]);
+			madpayload_start[i] = dw;
+			dw = (__force u32) cpu_to_be32(madpayload_done[i]);
+			madpayload_done[i] = dw;
+		}
+		swapped = 1;
+	}
+
+	data = which ? madpayload_done : madpayload_start;
+
+	autoneg_7220_sendpkt(ppd, hdr, dcnt, data);
+	qib_read_kreg64(dd, kr_scratch);
+	udelay(2);
+	autoneg_7220_sendpkt(ppd, hdr, dcnt, data);
+	qib_read_kreg64(dd, kr_scratch);
+	udelay(2);
+}
+
+/*
+ * Do the absolute minimum to cause an IB speed change, and make it
+ * ready, but don't actually trigger the change.   The caller will
+ * do that when ready (if link is in Polling training state, it will
+ * happen immediately, otherwise when link next goes down)
+ *
+ * This routine should only be used as part of the DDR autonegotation
+ * code for devices that are not compliant with IB 1.2 (or code that
+ * fixes things up for same).
+ *
+ * When link has gone down, and autoneg enabled, or autoneg has
+ * failed and we give up until next time we set both speeds, and
+ * then we want IBTA enabled as well as "use max enabled speed.
+ */
+static void set_7220_ibspeed_fast(struct qib_pportdata *ppd, u32 speed)
+{
+	ppd->cpspec->ibcddrctrl &= ~(IBA7220_IBC_SPEED_AUTONEG_MASK |
+		IBA7220_IBC_IBTA_1_2_MASK);
+
+	if (speed == (QIB_IB_SDR | QIB_IB_DDR))
+		ppd->cpspec->ibcddrctrl |= IBA7220_IBC_SPEED_AUTONEG_MASK |
+			IBA7220_IBC_IBTA_1_2_MASK;
+	else
+		ppd->cpspec->ibcddrctrl |= speed == QIB_IB_DDR ?
+			IBA7220_IBC_SPEED_DDR : IBA7220_IBC_SPEED_SDR;
+
+	qib_write_kreg(ppd->dd, kr_ibcddrctrl, ppd->cpspec->ibcddrctrl);
+	qib_write_kreg(ppd->dd, kr_scratch, 0);
+}
+
+/*
+ * This routine is only used when we are not talking to another
+ * IB 1.2-compliant device that we think can do DDR.
+ * (This includes all existing switch chips as of Oct 2007.)
+ * 1.2-compliant devices go directly to DDR prior to reaching INIT
+ */
+static void try_7220_autoneg(struct qib_pportdata *ppd)
+{
+	unsigned long flags;
+
+	/*
+	 * Required for older non-IB1.2 DDR switches.  Newer
+	 * non-IB-compliant switches don't need it, but so far,
+	 * aren't bothered by it either.  "Magic constant"
+	 */
+	qib_write_kreg(ppd->dd, kr_ncmodectrl, 0x3b9dc07);
+
+	spin_lock_irqsave(&ppd->lflags_lock, flags);
+	ppd->lflags |= QIBL_IB_AUTONEG_INPROG;
+	spin_unlock_irqrestore(&ppd->lflags_lock, flags);
+	autoneg_7220_send(ppd, 0);
+	set_7220_ibspeed_fast(ppd, QIB_IB_DDR);
+
+	toggle_7220_rclkrls(ppd->dd);
+	/* 2 msec is minimum length of a poll cycle */
+	queue_delayed_work(ib_wq, &ppd->cpspec->autoneg_work,
+			   msecs_to_jiffies(2));
+}
+
+/*
+ * Handle the empirically determined mechanism for auto-negotiation
+ * of DDR speed with switches.
+ */
+static void autoneg_7220_work(struct work_struct *work)
+{
+	struct qib_pportdata *ppd;
+	struct qib_devdata *dd;
+	u64 startms;
+	u32 i;
+	unsigned long flags;
+
+	ppd = &container_of(work, struct qib_chippport_specific,
+			    autoneg_work.work)->pportdata;
+	dd = ppd->dd;
+
+	startms = jiffies_to_msecs(jiffies);
+
+	/*
+	 * Busy wait for this first part, it should be at most a
+	 * few hundred usec, since we scheduled ourselves for 2msec.
+	 */
+	for (i = 0; i < 25; i++) {
+		if (SYM_FIELD(ppd->lastibcstat, IBCStatus, LinkTrainingState)
+		     == IB_7220_LT_STATE_POLLQUIET) {
+			qib_set_linkstate(ppd, QIB_IB_LINKDOWN_DISABLE);
+			break;
+		}
+		udelay(100);
+	}
+
+	if (!(ppd->lflags & QIBL_IB_AUTONEG_INPROG))
+		goto done; /* we got there early or told to stop */
+
+	/* we expect this to timeout */
+	if (wait_event_timeout(ppd->cpspec->autoneg_wait,
+			       !(ppd->lflags & QIBL_IB_AUTONEG_INPROG),
+			       msecs_to_jiffies(90)))
+		goto done;
+
+	toggle_7220_rclkrls(dd);
+
+	/* we expect this to timeout */
+	if (wait_event_timeout(ppd->cpspec->autoneg_wait,
+			       !(ppd->lflags & QIBL_IB_AUTONEG_INPROG),
+			       msecs_to_jiffies(1700)))
+		goto done;
+
+	set_7220_ibspeed_fast(ppd, QIB_IB_SDR);
+	toggle_7220_rclkrls(dd);
+
+	/*
+	 * Wait up to 250 msec for link to train and get to INIT at DDR;
+	 * this should terminate early.
+	 */
+	wait_event_timeout(ppd->cpspec->autoneg_wait,
+		!(ppd->lflags & QIBL_IB_AUTONEG_INPROG),
+		msecs_to_jiffies(250));
+done:
+	if (ppd->lflags & QIBL_IB_AUTONEG_INPROG) {
+		spin_lock_irqsave(&ppd->lflags_lock, flags);
+		ppd->lflags &= ~QIBL_IB_AUTONEG_INPROG;
+		if (dd->cspec->autoneg_tries == AUTONEG_TRIES) {
+			ppd->lflags |= QIBL_IB_AUTONEG_FAILED;
+			dd->cspec->autoneg_tries = 0;
+		}
+		spin_unlock_irqrestore(&ppd->lflags_lock, flags);
+		set_7220_ibspeed_fast(ppd, ppd->link_speed_enabled);
+	}
+}
+
+static u32 qib_7220_iblink_state(u64 ibcs)
+{
+	u32 state = (u32)SYM_FIELD(ibcs, IBCStatus, LinkState);
+
+	switch (state) {
+	case IB_7220_L_STATE_INIT:
+		state = IB_PORT_INIT;
+		break;
+	case IB_7220_L_STATE_ARM:
+		state = IB_PORT_ARMED;
+		break;
+	case IB_7220_L_STATE_ACTIVE:
+		/* fall through */
+	case IB_7220_L_STATE_ACT_DEFER:
+		state = IB_PORT_ACTIVE;
+		break;
+	default: /* fall through */
+	case IB_7220_L_STATE_DOWN:
+		state = IB_PORT_DOWN;
+		break;
+	}
+	return state;
+}
+
+/* returns the IBTA port state, rather than the IBC link training state */
+static u8 qib_7220_phys_portstate(u64 ibcs)
+{
+	u8 state = (u8)SYM_FIELD(ibcs, IBCStatus, LinkTrainingState);
+	return qib_7220_physportstate[state];
+}
+
+static int qib_7220_ib_updown(struct qib_pportdata *ppd, int ibup, u64 ibcs)
+{
+	int ret = 0, symadj = 0;
+	struct qib_devdata *dd = ppd->dd;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ppd->lflags_lock, flags);
+	ppd->lflags &= ~QIBL_IB_FORCE_NOTIFY;
+	spin_unlock_irqrestore(&ppd->lflags_lock, flags);
+
+	if (!ibup) {
+		/*
+		 * When the link goes down we don't want AEQ running, so it
+		 * won't interfere with IBC training, etc., and we need
+		 * to go back to the static SerDes preset values.
+		 */
+		if (!(ppd->lflags & (QIBL_IB_AUTONEG_FAILED |
+				     QIBL_IB_AUTONEG_INPROG)))
+			set_7220_ibspeed_fast(ppd, ppd->link_speed_enabled);
+		if (!(ppd->lflags & QIBL_IB_AUTONEG_INPROG)) {
+			qib_sd7220_presets(dd);
+			qib_cancel_sends(ppd); /* initial disarm, etc. */
+			spin_lock_irqsave(&ppd->sdma_lock, flags);
+			if (__qib_sdma_running(ppd))
+				__qib_sdma_process_event(ppd,
+					qib_sdma_event_e70_go_idle);
+			spin_unlock_irqrestore(&ppd->sdma_lock, flags);
+		}
+		/* this might better in qib_sd7220_presets() */
+		set_7220_relock_poll(dd, ibup);
+	} else {
+		if (qib_compat_ddr_negotiate &&
+		    !(ppd->lflags & (QIBL_IB_AUTONEG_FAILED |
+				     QIBL_IB_AUTONEG_INPROG)) &&
+		    ppd->link_speed_active == QIB_IB_SDR &&
+		    (ppd->link_speed_enabled & (QIB_IB_DDR | QIB_IB_SDR)) ==
+		    (QIB_IB_DDR | QIB_IB_SDR) &&
+		    dd->cspec->autoneg_tries < AUTONEG_TRIES) {
+			/* we are SDR, and DDR auto-negotiation enabled */
+			++dd->cspec->autoneg_tries;
+			if (!ppd->cpspec->ibdeltainprog) {
+				ppd->cpspec->ibdeltainprog = 1;
+				ppd->cpspec->ibsymsnap = read_7220_creg32(dd,
+					cr_ibsymbolerr);
+				ppd->cpspec->iblnkerrsnap = read_7220_creg32(dd,
+					cr_iblinkerrrecov);
+			}
+			try_7220_autoneg(ppd);
+			ret = 1; /* no other IB status change processing */
+		} else if ((ppd->lflags & QIBL_IB_AUTONEG_INPROG) &&
+			   ppd->link_speed_active == QIB_IB_SDR) {
+			autoneg_7220_send(ppd, 1);
+			set_7220_ibspeed_fast(ppd, QIB_IB_DDR);
+			udelay(2);
+			toggle_7220_rclkrls(dd);
+			ret = 1; /* no other IB status change processing */
+		} else {
+			if ((ppd->lflags & QIBL_IB_AUTONEG_INPROG) &&
+			    (ppd->link_speed_active & QIB_IB_DDR)) {
+				spin_lock_irqsave(&ppd->lflags_lock, flags);
+				ppd->lflags &= ~(QIBL_IB_AUTONEG_INPROG |
+						 QIBL_IB_AUTONEG_FAILED);
+				spin_unlock_irqrestore(&ppd->lflags_lock,
+						       flags);
+				dd->cspec->autoneg_tries = 0;
+				/* re-enable SDR, for next link down */
+				set_7220_ibspeed_fast(ppd,
+						      ppd->link_speed_enabled);
+				wake_up(&ppd->cpspec->autoneg_wait);
+				symadj = 1;
+			} else if (ppd->lflags & QIBL_IB_AUTONEG_FAILED) {
+				/*
+				 * Clear autoneg failure flag, and do setup
+				 * so we'll try next time link goes down and
+				 * back to INIT (possibly connected to a
+				 * different device).
+				 */
+				spin_lock_irqsave(&ppd->lflags_lock, flags);
+				ppd->lflags &= ~QIBL_IB_AUTONEG_FAILED;
+				spin_unlock_irqrestore(&ppd->lflags_lock,
+						       flags);
+				ppd->cpspec->ibcddrctrl |=
+					IBA7220_IBC_IBTA_1_2_MASK;
+				qib_write_kreg(dd, kr_ncmodectrl, 0);
+				symadj = 1;
+			}
+		}
+
+		if (!(ppd->lflags & QIBL_IB_AUTONEG_INPROG))
+			symadj = 1;
+
+		if (!ret) {
+			ppd->delay_mult = rate_to_delay
+			    [(ibcs >> IBA7220_LINKSPEED_SHIFT) & 1]
+			    [(ibcs >> IBA7220_LINKWIDTH_SHIFT) & 1];
+
+			set_7220_relock_poll(dd, ibup);
+			spin_lock_irqsave(&ppd->sdma_lock, flags);
+			/*
+			 * Unlike 7322, the 7220 needs this, due to lack of
+			 * interrupt in some cases when we have sdma active
+			 * when the link goes down.
+			 */
+			if (ppd->sdma_state.current_state !=
+			    qib_sdma_state_s20_idle)
+				__qib_sdma_process_event(ppd,
+					qib_sdma_event_e00_go_hw_down);
+			spin_unlock_irqrestore(&ppd->sdma_lock, flags);
+		}
+	}
+
+	if (symadj) {
+		if (ppd->cpspec->ibdeltainprog) {
+			ppd->cpspec->ibdeltainprog = 0;
+			ppd->cpspec->ibsymdelta += read_7220_creg32(ppd->dd,
+				cr_ibsymbolerr) - ppd->cpspec->ibsymsnap;
+			ppd->cpspec->iblnkerrdelta += read_7220_creg32(ppd->dd,
+				cr_iblinkerrrecov) - ppd->cpspec->iblnkerrsnap;
+		}
+	} else if (!ibup && qib_compat_ddr_negotiate &&
+		   !ppd->cpspec->ibdeltainprog &&
+			!(ppd->lflags & QIBL_IB_AUTONEG_INPROG)) {
+		ppd->cpspec->ibdeltainprog = 1;
+		ppd->cpspec->ibsymsnap = read_7220_creg32(ppd->dd,
+							  cr_ibsymbolerr);
+		ppd->cpspec->iblnkerrsnap = read_7220_creg32(ppd->dd,
+						     cr_iblinkerrrecov);
+	}
+
+	if (!ret)
+		qib_setup_7220_setextled(ppd, ibup);
+	return ret;
+}
+
+/*
+ * Does read/modify/write to appropriate registers to
+ * set output and direction bits selected by mask.
+ * these are in their canonical postions (e.g. lsb of
+ * dir will end up in D48 of extctrl on existing chips).
+ * returns contents of GP Inputs.
+ */
+static int gpio_7220_mod(struct qib_devdata *dd, u32 out, u32 dir, u32 mask)
+{
+	u64 read_val, new_out;
+	unsigned long flags;
+
+	if (mask) {
+		/* some bits being written, lock access to GPIO */
+		dir &= mask;
+		out &= mask;
+		spin_lock_irqsave(&dd->cspec->gpio_lock, flags);
+		dd->cspec->extctrl &= ~((u64)mask << SYM_LSB(EXTCtrl, GPIOOe));
+		dd->cspec->extctrl |= ((u64) dir << SYM_LSB(EXTCtrl, GPIOOe));
+		new_out = (dd->cspec->gpio_out & ~mask) | out;
+
+		qib_write_kreg(dd, kr_extctrl, dd->cspec->extctrl);
+		qib_write_kreg(dd, kr_gpio_out, new_out);
+		dd->cspec->gpio_out = new_out;
+		spin_unlock_irqrestore(&dd->cspec->gpio_lock, flags);
+	}
+	/*
+	 * It is unlikely that a read at this time would get valid
+	 * data on a pin whose direction line was set in the same
+	 * call to this function. We include the read here because
+	 * that allows us to potentially combine a change on one pin with
+	 * a read on another, and because the old code did something like
+	 * this.
+	 */
+	read_val = qib_read_kreg64(dd, kr_extstatus);
+	return SYM_FIELD(read_val, EXTStatus, GPIOIn);
+}
+
+/*
+ * Read fundamental info we need to use the chip.  These are
+ * the registers that describe chip capabilities, and are
+ * saved in shadow registers.
+ */
+static void get_7220_chip_params(struct qib_devdata *dd)
+{
+	u64 val;
+	u32 piobufs;
+	int mtu;
+
+	dd->uregbase = qib_read_kreg32(dd, kr_userregbase);
+
+	dd->rcvtidcnt = qib_read_kreg32(dd, kr_rcvtidcnt);
+	dd->rcvtidbase = qib_read_kreg32(dd, kr_rcvtidbase);
+	dd->rcvegrbase = qib_read_kreg32(dd, kr_rcvegrbase);
+	dd->palign = qib_read_kreg32(dd, kr_palign);
+	dd->piobufbase = qib_read_kreg64(dd, kr_sendpiobufbase);
+	dd->pio2k_bufbase = dd->piobufbase & 0xffffffff;
+
+	val = qib_read_kreg64(dd, kr_sendpiosize);
+	dd->piosize2k = val & ~0U;
+	dd->piosize4k = val >> 32;
+
+	mtu = ib_mtu_enum_to_int(qib_ibmtu);
+	if (mtu == -1)
+		mtu = QIB_DEFAULT_MTU;
+	dd->pport->ibmtu = (u32)mtu;
+
+	val = qib_read_kreg64(dd, kr_sendpiobufcnt);
+	dd->piobcnt2k = val & ~0U;
+	dd->piobcnt4k = val >> 32;
+	/* these may be adjusted in init_chip_wc_pat() */
+	dd->pio2kbase = (u32 __iomem *)
+		((char __iomem *) dd->kregbase + dd->pio2k_bufbase);
+	if (dd->piobcnt4k) {
+		dd->pio4kbase = (u32 __iomem *)
+			((char __iomem *) dd->kregbase +
+			 (dd->piobufbase >> 32));
+		/*
+		 * 4K buffers take 2 pages; we use roundup just to be
+		 * paranoid; we calculate it once here, rather than on
+		 * ever buf allocate
+		 */
+		dd->align4k = ALIGN(dd->piosize4k, dd->palign);
+	}
+
+	piobufs = dd->piobcnt4k + dd->piobcnt2k;
+
+	dd->pioavregs = ALIGN(piobufs, sizeof(u64) * BITS_PER_BYTE / 2) /
+		(sizeof(u64) * BITS_PER_BYTE / 2);
+}
+
+/*
+ * The chip base addresses in cspec and cpspec have to be set
+ * after possible init_chip_wc_pat(), rather than in
+ * qib_get_7220_chip_params(), so split out as separate function
+ */
+static void set_7220_baseaddrs(struct qib_devdata *dd)
+{
+	u32 cregbase;
+	/* init after possible re-map in init_chip_wc_pat() */
+	cregbase = qib_read_kreg32(dd, kr_counterregbase);
+	dd->cspec->cregbase = (u64 __iomem *)
+		((char __iomem *) dd->kregbase + cregbase);
+
+	dd->egrtidbase = (u64 __iomem *)
+		((char __iomem *) dd->kregbase + dd->rcvegrbase);
+}
+
+
+#define SENDCTRL_SHADOWED (SYM_MASK(SendCtrl, SendIntBufAvail) |	\
+			   SYM_MASK(SendCtrl, SPioEnable) |		\
+			   SYM_MASK(SendCtrl, SSpecialTriggerEn) |	\
+			   SYM_MASK(SendCtrl, SendBufAvailUpd) |	\
+			   SYM_MASK(SendCtrl, AvailUpdThld) |		\
+			   SYM_MASK(SendCtrl, SDmaEnable) |		\
+			   SYM_MASK(SendCtrl, SDmaIntEnable) |		\
+			   SYM_MASK(SendCtrl, SDmaHalt) |		\
+			   SYM_MASK(SendCtrl, SDmaSingleDescriptor))
+
+static int sendctrl_hook(struct qib_devdata *dd,
+			 const struct diag_observer *op,
+			 u32 offs, u64 *data, u64 mask, int only_32)
+{
+	unsigned long flags;
+	unsigned idx = offs / sizeof(u64);
+	u64 local_data, all_bits;
+
+	if (idx != kr_sendctrl) {
+		qib_dev_err(dd, "SendCtrl Hook called with offs %X, %s-bit\n",
+			    offs, only_32 ? "32" : "64");
+		return 0;
+	}
+
+	all_bits = ~0ULL;
+	if (only_32)
+		all_bits >>= 32;
+	spin_lock_irqsave(&dd->sendctrl_lock, flags);
+	if ((mask & all_bits) != all_bits) {
+		/*
+		 * At least some mask bits are zero, so we need
+		 * to read. The judgement call is whether from
+		 * reg or shadow. First-cut: read reg, and complain
+		 * if any bits which should be shadowed are different
+		 * from their shadowed value.
+		 */
+		if (only_32)
+			local_data = (u64)qib_read_kreg32(dd, idx);
+		else
+			local_data = qib_read_kreg64(dd, idx);
+		qib_dev_err(dd, "Sendctrl -> %X, Shad -> %X\n",
+			    (u32)local_data, (u32)dd->sendctrl);
+		if ((local_data & SENDCTRL_SHADOWED) !=
+		    (dd->sendctrl & SENDCTRL_SHADOWED))
+			qib_dev_err(dd, "Sendctrl read: %X shadow is %X\n",
+				(u32)local_data, (u32) dd->sendctrl);
+		*data = (local_data & ~mask) | (*data & mask);
+	}
+	if (mask) {
+		/*
+		 * At least some mask bits are one, so we need
+		 * to write, but only shadow some bits.
+		 */
+		u64 sval, tval; /* Shadowed, transient */
+
+		/*
+		 * New shadow val is bits we don't want to touch,
+		 * ORed with bits we do, that are intended for shadow.
+		 */
+		sval = (dd->sendctrl & ~mask);
+		sval |= *data & SENDCTRL_SHADOWED & mask;
+		dd->sendctrl = sval;
+		tval = sval | (*data & ~SENDCTRL_SHADOWED & mask);
+		qib_dev_err(dd, "Sendctrl <- %X, Shad <- %X\n",
+			    (u32)tval, (u32)sval);
+		qib_write_kreg(dd, kr_sendctrl, tval);
+		qib_write_kreg(dd, kr_scratch, 0Ull);
+	}
+	spin_unlock_irqrestore(&dd->sendctrl_lock, flags);
+
+	return only_32 ? 4 : 8;
+}
+
+static const struct diag_observer sendctrl_observer = {
+	sendctrl_hook, kr_sendctrl * sizeof(u64),
+	kr_sendctrl * sizeof(u64)
+};
+
+/*
+ * write the final few registers that depend on some of the
+ * init setup.  Done late in init, just before bringing up
+ * the serdes.
+ */
+static int qib_late_7220_initreg(struct qib_devdata *dd)
+{
+	int ret = 0;
+	u64 val;
+
+	qib_write_kreg(dd, kr_rcvhdrentsize, dd->rcvhdrentsize);
+	qib_write_kreg(dd, kr_rcvhdrsize, dd->rcvhdrsize);
+	qib_write_kreg(dd, kr_rcvhdrcnt, dd->rcvhdrcnt);
+	qib_write_kreg(dd, kr_sendpioavailaddr, dd->pioavailregs_phys);
+	val = qib_read_kreg64(dd, kr_sendpioavailaddr);
+	if (val != dd->pioavailregs_phys) {
+		qib_dev_err(dd,
+			"Catastrophic software error, SendPIOAvailAddr written as %lx, read back as %llx\n",
+			(unsigned long) dd->pioavailregs_phys,
+			(unsigned long long) val);
+		ret = -EINVAL;
+	}
+	qib_register_observer(dd, &sendctrl_observer);
+	return ret;
+}
+
+static int qib_init_7220_variables(struct qib_devdata *dd)
+{
+	struct qib_chippport_specific *cpspec;
+	struct qib_pportdata *ppd;
+	int ret = 0;
+	u32 sbufs, updthresh;
+
+	cpspec = (struct qib_chippport_specific *)(dd + 1);
+	ppd = &cpspec->pportdata;
+	dd->pport = ppd;
+	dd->num_pports = 1;
+
+	dd->cspec = (struct qib_chip_specific *)(cpspec + dd->num_pports);
+	ppd->cpspec = cpspec;
+
+	spin_lock_init(&dd->cspec->sdepb_lock);
+	spin_lock_init(&dd->cspec->rcvmod_lock);
+	spin_lock_init(&dd->cspec->gpio_lock);
+
+	/* we haven't yet set QIB_PRESENT, so use read directly */
+	dd->revision = readq(&dd->kregbase[kr_revision]);
+
+	if ((dd->revision & 0xffffffffU) == 0xffffffffU) {
+		qib_dev_err(dd,
+			"Revision register read failure, giving up initialization\n");
+		ret = -ENODEV;
+		goto bail;
+	}
+	dd->flags |= QIB_PRESENT;  /* now register routines work */
+
+	dd->majrev = (u8) SYM_FIELD(dd->revision, Revision_R,
+				    ChipRevMajor);
+	dd->minrev = (u8) SYM_FIELD(dd->revision, Revision_R,
+				    ChipRevMinor);
+
+	get_7220_chip_params(dd);
+	qib_7220_boardname(dd);
+
+	/*
+	 * GPIO bits for TWSI data and clock,
+	 * used for serial EEPROM.
+	 */
+	dd->gpio_sda_num = _QIB_GPIO_SDA_NUM;
+	dd->gpio_scl_num = _QIB_GPIO_SCL_NUM;
+	dd->twsi_eeprom_dev = QIB_TWSI_EEPROM_DEV;
+
+	dd->flags |= QIB_HAS_INTX | QIB_HAS_LINK_LATENCY |
+		QIB_NODMA_RTAIL | QIB_HAS_THRESH_UPDATE;
+	dd->flags |= qib_special_trigger ?
+		QIB_USE_SPCL_TRIG : QIB_HAS_SEND_DMA;
+
+	/*
+	 * EEPROM error log 0 is TXE Parity errors. 1 is RXE Parity.
+	 * 2 is Some Misc, 3 is reserved for future.
+	 */
+	dd->eep_st_masks[0].hwerrs_to_log = HWE_MASK(TXEMemParityErr);
+
+	dd->eep_st_masks[1].hwerrs_to_log = HWE_MASK(RXEMemParityErr);
+
+	dd->eep_st_masks[2].errs_to_log = ERR_MASK(ResetNegated);
+
+	init_waitqueue_head(&cpspec->autoneg_wait);
+	INIT_DELAYED_WORK(&cpspec->autoneg_work, autoneg_7220_work);
+
+	ret = qib_init_pportdata(ppd, dd, 0, 1);
+	if (ret)
+		goto bail;
+	ppd->link_width_supported = IB_WIDTH_1X | IB_WIDTH_4X;
+	ppd->link_speed_supported = QIB_IB_SDR | QIB_IB_DDR;
+
+	ppd->link_width_enabled = ppd->link_width_supported;
+	ppd->link_speed_enabled = ppd->link_speed_supported;
+	/*
+	 * Set the initial values to reasonable default, will be set
+	 * for real when link is up.
+	 */
+	ppd->link_width_active = IB_WIDTH_4X;
+	ppd->link_speed_active = QIB_IB_SDR;
+	ppd->delay_mult = rate_to_delay[0][1];
+	ppd->vls_supported = IB_VL_VL0;
+	ppd->vls_operational = ppd->vls_supported;
+
+	if (!qib_mini_init)
+		qib_write_kreg(dd, kr_rcvbthqp, QIB_KD_QP);
+
+	init_timer(&ppd->cpspec->chase_timer);
+	ppd->cpspec->chase_timer.function = reenable_7220_chase;
+	ppd->cpspec->chase_timer.data = (unsigned long)ppd;
+
+	qib_num_cfg_vls = 1; /* if any 7220's, only one VL */
+
+	dd->rcvhdrentsize = QIB_RCVHDR_ENTSIZE;
+	dd->rcvhdrsize = QIB_DFLT_RCVHDRSIZE;
+	dd->rhf_offset =
+		dd->rcvhdrentsize - sizeof(u64) / sizeof(u32);
+
+	/* we always allocate at least 2048 bytes for eager buffers */
+	ret = ib_mtu_enum_to_int(qib_ibmtu);
+	dd->rcvegrbufsize = ret != -1 ? max(ret, 2048) : QIB_DEFAULT_MTU;
+	BUG_ON(!is_power_of_2(dd->rcvegrbufsize));
+	dd->rcvegrbufsize_shift = ilog2(dd->rcvegrbufsize);
+
+	qib_7220_tidtemplate(dd);
+
+	/*
+	 * We can request a receive interrupt for 1 or
+	 * more packets from current offset.  For now, we set this
+	 * up for a single packet.
+	 */
+	dd->rhdrhead_intr_off = 1ULL << 32;
+
+	/* setup the stats timer; the add_timer is done at end of init */
+	init_timer(&dd->stats_timer);
+	dd->stats_timer.function = qib_get_7220_faststats;
+	dd->stats_timer.data = (unsigned long) dd;
+	dd->stats_timer.expires = jiffies + ACTIVITY_TIMER * HZ;
+
+	/*
+	 * Control[4] has been added to change the arbitration within
+	 * the SDMA engine between favoring data fetches over descriptor
+	 * fetches.  qib_sdma_fetch_arb==0 gives data fetches priority.
+	 */
+	if (qib_sdma_fetch_arb)
+		dd->control |= 1 << 4;
+
+	dd->ureg_align = 0x10000;  /* 64KB alignment */
+
+	dd->piosize2kmax_dwords = (dd->piosize2k >> 2)-1;
+	qib_7220_config_ctxts(dd);
+	qib_set_ctxtcnt(dd);  /* needed for PAT setup */
+
+	if (qib_wc_pat) {
+		ret = init_chip_wc_pat(dd, 0);
+		if (ret)
+			goto bail;
+	}
+	set_7220_baseaddrs(dd); /* set chip access pointers now */
+
+	ret = 0;
+	if (qib_mini_init)
+		goto bail;
+
+	ret = qib_create_ctxts(dd);
+	init_7220_cntrnames(dd);
+
+	/* use all of 4KB buffers for the kernel SDMA, zero if !SDMA.
+	 * reserve the update threshold amount for other kernel use, such
+	 * as sending SMI, MAD, and ACKs, or 3, whichever is greater,
+	 * unless we aren't enabling SDMA, in which case we want to use
+	 * all the 4k bufs for the kernel.
+	 * if this was less than the update threshold, we could wait
+	 * a long time for an update.  Coded this way because we
+	 * sometimes change the update threshold for various reasons,
+	 * and we want this to remain robust.
+	 */
+	updthresh = 8U; /* update threshold */
+	if (dd->flags & QIB_HAS_SEND_DMA) {
+		dd->cspec->sdmabufcnt =  dd->piobcnt4k;
+		sbufs = updthresh > 3 ? updthresh : 3;
+	} else {
+		dd->cspec->sdmabufcnt = 0;
+		sbufs = dd->piobcnt4k;
+	}
+
+	dd->cspec->lastbuf_for_pio = dd->piobcnt2k + dd->piobcnt4k -
+		dd->cspec->sdmabufcnt;
+	dd->lastctxt_piobuf = dd->cspec->lastbuf_for_pio - sbufs;
+	dd->cspec->lastbuf_for_pio--; /* range is <= , not < */
+	dd->last_pio = dd->cspec->lastbuf_for_pio;
+	dd->pbufsctxt = dd->lastctxt_piobuf /
+		(dd->cfgctxts - dd->first_user_ctxt);
+
+	/*
+	 * if we are at 16 user contexts, we will have one 7 sbufs
+	 * per context, so drop the update threshold to match.  We
+	 * want to update before we actually run out, at low pbufs/ctxt
+	 * so give ourselves some margin
+	 */
+	if ((dd->pbufsctxt - 2) < updthresh)
+		updthresh = dd->pbufsctxt - 2;
+
+	dd->cspec->updthresh_dflt = updthresh;
+	dd->cspec->updthresh = updthresh;
+
+	/* before full enable, no interrupts, no locking needed */
+	dd->sendctrl |= (updthresh & SYM_RMASK(SendCtrl, AvailUpdThld))
+			     << SYM_LSB(SendCtrl, AvailUpdThld);
+
+	dd->psxmitwait_supported = 1;
+	dd->psxmitwait_check_rate = QIB_7220_PSXMITWAIT_CHECK_RATE;
+bail:
+	return ret;
+}
+
+static u32 __iomem *qib_7220_getsendbuf(struct qib_pportdata *ppd, u64 pbc,
+					u32 *pbufnum)
+{
+	u32 first, last, plen = pbc & QIB_PBC_LENGTH_MASK;
+	struct qib_devdata *dd = ppd->dd;
+	u32 __iomem *buf;
+
+	if (((pbc >> 32) & PBC_7220_VL15_SEND_CTRL) &&
+		!(ppd->lflags & (QIBL_IB_AUTONEG_INPROG | QIBL_LINKACTIVE)))
+		buf = get_7220_link_buf(ppd, pbufnum);
+	else {
+		if ((plen + 1) > dd->piosize2kmax_dwords)
+			first = dd->piobcnt2k;
+		else
+			first = 0;
+		/* try 4k if all 2k busy, so same last for both sizes */
+		last = dd->cspec->lastbuf_for_pio;
+		buf = qib_getsendbuf_range(dd, pbufnum, first, last);
+	}
+	return buf;
+}
+
+/* these 2 "counters" are really control registers, and are always RW */
+static void qib_set_cntr_7220_sample(struct qib_pportdata *ppd, u32 intv,
+				     u32 start)
+{
+	write_7220_creg(ppd->dd, cr_psinterval, intv);
+	write_7220_creg(ppd->dd, cr_psstart, start);
+}
+
+/*
+ * NOTE: no real attempt is made to generalize the SDMA stuff.
+ * At some point "soon" we will have a new more generalized
+ * set of sdma interface, and then we'll clean this up.
+ */
+
+/* Must be called with sdma_lock held, or before init finished */
+static void qib_sdma_update_7220_tail(struct qib_pportdata *ppd, u16 tail)
+{
+	/* Commit writes to memory and advance the tail on the chip */
+	wmb();
+	ppd->sdma_descq_tail = tail;
+	qib_write_kreg(ppd->dd, kr_senddmatail, tail);
+}
+
+static void qib_sdma_set_7220_desc_cnt(struct qib_pportdata *ppd, unsigned cnt)
+{
+}
+
+static struct sdma_set_state_action sdma_7220_action_table[] = {
+	[qib_sdma_state_s00_hw_down] = {
+		.op_enable = 0,
+		.op_intenable = 0,
+		.op_halt = 0,
+		.go_s99_running_tofalse = 1,
+	},
+	[qib_sdma_state_s10_hw_start_up_wait] = {
+		.op_enable = 1,
+		.op_intenable = 1,
+		.op_halt = 1,
+	},
+	[qib_sdma_state_s20_idle] = {
+		.op_enable = 1,
+		.op_intenable = 1,
+		.op_halt = 1,
+	},
+	[qib_sdma_state_s30_sw_clean_up_wait] = {
+		.op_enable = 0,
+		.op_intenable = 1,
+		.op_halt = 0,
+	},
+	[qib_sdma_state_s40_hw_clean_up_wait] = {
+		.op_enable = 1,
+		.op_intenable = 1,
+		.op_halt = 1,
+	},
+	[qib_sdma_state_s50_hw_halt_wait] = {
+		.op_enable = 1,
+		.op_intenable = 1,
+		.op_halt = 1,
+	},
+	[qib_sdma_state_s99_running] = {
+		.op_enable = 1,
+		.op_intenable = 1,
+		.op_halt = 0,
+		.go_s99_running_totrue = 1,
+	},
+};
+
+static void qib_7220_sdma_init_early(struct qib_pportdata *ppd)
+{
+	ppd->sdma_state.set_state_action = sdma_7220_action_table;
+}
+
+static int init_sdma_7220_regs(struct qib_pportdata *ppd)
+{
+	struct qib_devdata *dd = ppd->dd;
+	unsigned i, n;
+	u64 senddmabufmask[3] = { 0 };
+
+	/* Set SendDmaBase */
+	qib_write_kreg(dd, kr_senddmabase, ppd->sdma_descq_phys);
+	qib_sdma_7220_setlengen(ppd);
+	qib_sdma_update_7220_tail(ppd, 0); /* Set SendDmaTail */
+	/* Set SendDmaHeadAddr */
+	qib_write_kreg(dd, kr_senddmaheadaddr, ppd->sdma_head_phys);
+
+	/*
+	 * Reserve all the former "kernel" piobufs, using high number range
+	 * so we get as many 4K buffers as possible
+	 */
+	n = dd->piobcnt2k + dd->piobcnt4k;
+	i = n - dd->cspec->sdmabufcnt;
+
+	for (; i < n; ++i) {
+		unsigned word = i / 64;
+		unsigned bit = i & 63;
+
+		BUG_ON(word >= 3);
+		senddmabufmask[word] |= 1ULL << bit;
+	}
+	qib_write_kreg(dd, kr_senddmabufmask0, senddmabufmask[0]);
+	qib_write_kreg(dd, kr_senddmabufmask1, senddmabufmask[1]);
+	qib_write_kreg(dd, kr_senddmabufmask2, senddmabufmask[2]);
+
+	ppd->sdma_state.first_sendbuf = i;
+	ppd->sdma_state.last_sendbuf = n;
+
+	return 0;
+}
+
+/* sdma_lock must be held */
+static u16 qib_sdma_7220_gethead(struct qib_pportdata *ppd)
+{
+	struct qib_devdata *dd = ppd->dd;
+	int sane;
+	int use_dmahead;
+	u16 swhead;
+	u16 swtail;
+	u16 cnt;
+	u16 hwhead;
+
+	use_dmahead = __qib_sdma_running(ppd) &&
+		(dd->flags & QIB_HAS_SDMA_TIMEOUT);
+retry:
+	hwhead = use_dmahead ?
+		(u16)le64_to_cpu(*ppd->sdma_head_dma) :
+		(u16)qib_read_kreg32(dd, kr_senddmahead);
+
+	swhead = ppd->sdma_descq_head;
+	swtail = ppd->sdma_descq_tail;
+	cnt = ppd->sdma_descq_cnt;
+
+	if (swhead < swtail) {
+		/* not wrapped */
+		sane = (hwhead >= swhead) & (hwhead <= swtail);
+	} else if (swhead > swtail) {
+		/* wrapped around */
+		sane = ((hwhead >= swhead) && (hwhead < cnt)) ||
+			(hwhead <= swtail);
+	} else {
+		/* empty */
+		sane = (hwhead == swhead);
+	}
+
+	if (unlikely(!sane)) {
+		if (use_dmahead) {
+			/* try one more time, directly from the register */
+			use_dmahead = 0;
+			goto retry;
+		}
+		/* assume no progress */
+		hwhead = swhead;
+	}
+
+	return hwhead;
+}
+
+static int qib_sdma_7220_busy(struct qib_pportdata *ppd)
+{
+	u64 hwstatus = qib_read_kreg64(ppd->dd, kr_senddmastatus);
+
+	return (hwstatus & SYM_MASK(SendDmaStatus, ScoreBoardDrainInProg)) ||
+	       (hwstatus & SYM_MASK(SendDmaStatus, AbortInProg)) ||
+	       (hwstatus & SYM_MASK(SendDmaStatus, InternalSDmaEnable)) ||
+	       !(hwstatus & SYM_MASK(SendDmaStatus, ScbEmpty));
+}
+
+/*
+ * Compute the amount of delay before sending the next packet if the
+ * port's send rate differs from the static rate set for the QP.
+ * Since the delay affects this packet but the amount of the delay is
+ * based on the length of the previous packet, use the last delay computed
+ * and save the delay count for this packet to be used next time
+ * we get here.
+ */
+static u32 qib_7220_setpbc_control(struct qib_pportdata *ppd, u32 plen,
+				   u8 srate, u8 vl)
+{
+	u8 snd_mult = ppd->delay_mult;
+	u8 rcv_mult = ib_rate_to_delay[srate];
+	u32 ret = ppd->cpspec->last_delay_mult;
+
+	ppd->cpspec->last_delay_mult = (rcv_mult > snd_mult) ?
+		(plen * (rcv_mult - snd_mult) + 1) >> 1 : 0;
+
+	/* Indicate VL15, if necessary */
+	if (vl == 15)
+		ret |= PBC_7220_VL15_SEND_CTRL;
+	return ret;
+}
+
+static void qib_7220_initvl15_bufs(struct qib_devdata *dd)
+{
+}
+
+static void qib_7220_init_ctxt(struct qib_ctxtdata *rcd)
+{
+	if (!rcd->ctxt) {
+		rcd->rcvegrcnt = IBA7220_KRCVEGRCNT;
+		rcd->rcvegr_tid_base = 0;
+	} else {
+		rcd->rcvegrcnt = rcd->dd->cspec->rcvegrcnt;
+		rcd->rcvegr_tid_base = IBA7220_KRCVEGRCNT +
+			(rcd->ctxt - 1) * rcd->rcvegrcnt;
+	}
+}
+
+static void qib_7220_txchk_change(struct qib_devdata *dd, u32 start,
+				  u32 len, u32 which, struct qib_ctxtdata *rcd)
+{
+	int i;
+	unsigned long flags;
+
+	switch (which) {
+	case TXCHK_CHG_TYPE_KERN:
+		/* see if we need to raise avail update threshold */
+		spin_lock_irqsave(&dd->uctxt_lock, flags);
+		for (i = dd->first_user_ctxt;
+		     dd->cspec->updthresh != dd->cspec->updthresh_dflt
+		     && i < dd->cfgctxts; i++)
+			if (dd->rcd[i] && dd->rcd[i]->subctxt_cnt &&
+			   ((dd->rcd[i]->piocnt / dd->rcd[i]->subctxt_cnt) - 1)
+			   < dd->cspec->updthresh_dflt)
+				break;
+		spin_unlock_irqrestore(&dd->uctxt_lock, flags);
+		if (i == dd->cfgctxts) {
+			spin_lock_irqsave(&dd->sendctrl_lock, flags);
+			dd->cspec->updthresh = dd->cspec->updthresh_dflt;
+			dd->sendctrl &= ~SYM_MASK(SendCtrl, AvailUpdThld);
+			dd->sendctrl |= (dd->cspec->updthresh &
+					 SYM_RMASK(SendCtrl, AvailUpdThld)) <<
+					   SYM_LSB(SendCtrl, AvailUpdThld);
+			spin_unlock_irqrestore(&dd->sendctrl_lock, flags);
+			sendctrl_7220_mod(dd->pport, QIB_SENDCTRL_AVAIL_BLIP);
+		}
+		break;
+	case TXCHK_CHG_TYPE_USER:
+		spin_lock_irqsave(&dd->sendctrl_lock, flags);
+		if (rcd && rcd->subctxt_cnt && ((rcd->piocnt
+			/ rcd->subctxt_cnt) - 1) < dd->cspec->updthresh) {
+			dd->cspec->updthresh = (rcd->piocnt /
+						rcd->subctxt_cnt) - 1;
+			dd->sendctrl &= ~SYM_MASK(SendCtrl, AvailUpdThld);
+			dd->sendctrl |= (dd->cspec->updthresh &
+					SYM_RMASK(SendCtrl, AvailUpdThld))
+					<< SYM_LSB(SendCtrl, AvailUpdThld);
+			spin_unlock_irqrestore(&dd->sendctrl_lock, flags);
+			sendctrl_7220_mod(dd->pport, QIB_SENDCTRL_AVAIL_BLIP);
+		} else
+			spin_unlock_irqrestore(&dd->sendctrl_lock, flags);
+		break;
+	}
+}
+
+static void writescratch(struct qib_devdata *dd, u32 val)
+{
+	qib_write_kreg(dd, kr_scratch, val);
+}
+
+#define VALID_TS_RD_REG_MASK 0xBF
+/**
+ * qib_7220_tempsense_read - read register of temp sensor via TWSI
+ * @dd: the qlogic_ib device
+ * @regnum: register to read from
+ *
+ * returns reg contents (0..255) or < 0 for error
+ */
+static int qib_7220_tempsense_rd(struct qib_devdata *dd, int regnum)
+{
+	int ret;
+	u8 rdata;
+
+	if (regnum > 7) {
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	/* return a bogus value for (the one) register we do not have */
+	if (!((1 << regnum) & VALID_TS_RD_REG_MASK)) {
+		ret = 0;
+		goto bail;
+	}
+
+	ret = mutex_lock_interruptible(&dd->eep_lock);
+	if (ret)
+		goto bail;
+
+	ret = qib_twsi_blk_rd(dd, QIB_TWSI_TEMP_DEV, regnum, &rdata, 1);
+	if (!ret)
+		ret = rdata;
+
+	mutex_unlock(&dd->eep_lock);
+
+	/*
+	 * There are three possibilities here:
+	 * ret is actual value (0..255)
+	 * ret is -ENXIO or -EINVAL from twsi code or this file
+	 * ret is -EINTR from mutex_lock_interruptible.
+	 */
+bail:
+	return ret;
+}
+
+#ifdef CONFIG_INFINIBAND_QIB_DCA
+static int qib_7220_notify_dca(struct qib_devdata *dd, unsigned long event)
+{
+	return 0;
+}
+#endif
+
+/* Dummy function, as 7220 boards never disable EEPROM Write */
+static int qib_7220_eeprom_wen(struct qib_devdata *dd, int wen)
+{
+	return 1;
+}
+
+/**
+ * qib_init_iba7220_funcs - set up the chip-specific function pointers
+ * @dev: the pci_dev for qlogic_ib device
+ * @ent: pci_device_id struct for this dev
+ *
+ * This is global, and is called directly at init to set up the
+ * chip-specific function pointers for later use.
+ */
+struct qib_devdata *qib_init_iba7220_funcs(struct pci_dev *pdev,
+					   const struct pci_device_id *ent)
+{
+	struct qib_devdata *dd;
+	int ret;
+	u32 boardid, minwidth;
+
+	dd = qib_alloc_devdata(pdev, sizeof(struct qib_chip_specific) +
+		sizeof(struct qib_chippport_specific));
+	if (IS_ERR(dd))
+		goto bail;
+
+	dd->f_bringup_serdes    = qib_7220_bringup_serdes;
+	dd->f_cleanup           = qib_setup_7220_cleanup;
+	dd->f_clear_tids        = qib_7220_clear_tids;
+	dd->f_free_irq          = qib_7220_free_irq;
+	dd->f_get_base_info     = qib_7220_get_base_info;
+	dd->f_get_msgheader     = qib_7220_get_msgheader;
+	dd->f_getsendbuf        = qib_7220_getsendbuf;
+	dd->f_gpio_mod          = gpio_7220_mod;
+	dd->f_eeprom_wen        = qib_7220_eeprom_wen;
+	dd->f_hdrqempty         = qib_7220_hdrqempty;
+	dd->f_ib_updown         = qib_7220_ib_updown;
+	dd->f_init_ctxt         = qib_7220_init_ctxt;
+	dd->f_initvl15_bufs     = qib_7220_initvl15_bufs;
+	dd->f_intr_fallback     = qib_7220_intr_fallback;
+	dd->f_late_initreg      = qib_late_7220_initreg;
+	dd->f_setpbc_control    = qib_7220_setpbc_control;
+	dd->f_portcntr          = qib_portcntr_7220;
+	dd->f_put_tid           = qib_7220_put_tid;
+	dd->f_quiet_serdes      = qib_7220_quiet_serdes;
+	dd->f_rcvctrl           = rcvctrl_7220_mod;
+	dd->f_read_cntrs        = qib_read_7220cntrs;
+	dd->f_read_portcntrs    = qib_read_7220portcntrs;
+	dd->f_reset             = qib_setup_7220_reset;
+	dd->f_init_sdma_regs    = init_sdma_7220_regs;
+	dd->f_sdma_busy         = qib_sdma_7220_busy;
+	dd->f_sdma_gethead      = qib_sdma_7220_gethead;
+	dd->f_sdma_sendctrl     = qib_7220_sdma_sendctrl;
+	dd->f_sdma_set_desc_cnt = qib_sdma_set_7220_desc_cnt;
+	dd->f_sdma_update_tail  = qib_sdma_update_7220_tail;
+	dd->f_sdma_hw_clean_up  = qib_7220_sdma_hw_clean_up;
+	dd->f_sdma_hw_start_up  = qib_7220_sdma_hw_start_up;
+	dd->f_sdma_init_early   = qib_7220_sdma_init_early;
+	dd->f_sendctrl          = sendctrl_7220_mod;
+	dd->f_set_armlaunch     = qib_set_7220_armlaunch;
+	dd->f_set_cntr_sample   = qib_set_cntr_7220_sample;
+	dd->f_iblink_state      = qib_7220_iblink_state;
+	dd->f_ibphys_portstate  = qib_7220_phys_portstate;
+	dd->f_get_ib_cfg        = qib_7220_get_ib_cfg;
+	dd->f_set_ib_cfg        = qib_7220_set_ib_cfg;
+	dd->f_set_ib_loopback   = qib_7220_set_loopback;
+	dd->f_set_intr_state    = qib_7220_set_intr_state;
+	dd->f_setextled         = qib_setup_7220_setextled;
+	dd->f_txchk_change      = qib_7220_txchk_change;
+	dd->f_update_usrhead    = qib_update_7220_usrhead;
+	dd->f_wantpiobuf_intr   = qib_wantpiobuf_7220_intr;
+	dd->f_xgxs_reset        = qib_7220_xgxs_reset;
+	dd->f_writescratch      = writescratch;
+	dd->f_tempsense_rd	= qib_7220_tempsense_rd;
+#ifdef CONFIG_INFINIBAND_QIB_DCA
+	dd->f_notify_dca = qib_7220_notify_dca;
+#endif
+	/*
+	 * Do remaining pcie setup and save pcie values in dd.
+	 * Any error printing is already done by the init code.
+	 * On return, we have the chip mapped, but chip registers
+	 * are not set up until start of qib_init_7220_variables.
+	 */
+	ret = qib_pcie_ddinit(dd, pdev, ent);
+	if (ret < 0)
+		goto bail_free;
+
+	/* initialize chip-specific variables */
+	ret = qib_init_7220_variables(dd);
+	if (ret)
+		goto bail_cleanup;
+
+	if (qib_mini_init)
+		goto bail;
+
+	boardid = SYM_FIELD(dd->revision, Revision,
+			    BoardID);
+	switch (boardid) {
+	case 0:
+	case 2:
+	case 10:
+	case 12:
+		minwidth = 16; /* x16 capable boards */
+		break;
+	default:
+		minwidth = 8; /* x8 capable boards */
+		break;
+	}
+	if (qib_pcie_params(dd, minwidth, NULL, NULL))
+		qib_dev_err(dd,
+			"Failed to setup PCIe or interrupts; continuing anyway\n");
+
+	/* save IRQ for possible later use */
+	dd->cspec->irq = pdev->irq;
+
+	if (qib_read_kreg64(dd, kr_hwerrstatus) &
+	    QLOGIC_IB_HWE_SERDESPLLFAILED)
+		qib_write_kreg(dd, kr_hwerrclear,
+			       QLOGIC_IB_HWE_SERDESPLLFAILED);
+
+	/* setup interrupt handler (interrupt type handled above) */
+	qib_setup_7220_interrupt(dd);
+	qib_7220_init_hwerrors(dd);
+
+	/* clear diagctrl register, in case diags were running and crashed */
+	qib_write_kreg(dd, kr_hwdiagctrl, 0);
+
+	goto bail;
+
+bail_cleanup:
+	qib_pcie_ddcleanup(dd);
+bail_free:
+	qib_free_devdata(dd);
+	dd = ERR_PTR(ret);
+bail:
+	return dd;
+}
diff --git a/drivers/infiniband/hw/qib/qib_iba7322.c b/drivers/infiniband/hw/qib/qib_iba7322.c
new file mode 100644
index 0000000..a7eb325
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_iba7322.c
@@ -0,0 +1,8554 @@
+/*
+ * Copyright (c) 2012 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2008 - 2012 QLogic Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * This file contains all of the code that is specific to the
+ * InfiniPath 7322 chip
+ */
+
+#include <linux/interrupt.h>
+#include <linux/pci.h>
+#include <linux/delay.h>
+#include <linux/io.h>
+#include <linux/jiffies.h>
+#include <linux/module.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_smi.h>
+#ifdef CONFIG_INFINIBAND_QIB_DCA
+#include <linux/dca.h>
+#endif
+
+#include "qib.h"
+#include "qib_7322_regs.h"
+#include "qib_qsfp.h"
+
+#include "qib_mad.h"
+#include "qib_verbs.h"
+
+#undef pr_fmt
+#define pr_fmt(fmt) QIB_DRV_NAME " " fmt
+
+static void qib_setup_7322_setextled(struct qib_pportdata *, u32);
+static void qib_7322_handle_hwerrors(struct qib_devdata *, char *, size_t);
+static void sendctrl_7322_mod(struct qib_pportdata *ppd, u32 op);
+static irqreturn_t qib_7322intr(int irq, void *data);
+static irqreturn_t qib_7322bufavail(int irq, void *data);
+static irqreturn_t sdma_intr(int irq, void *data);
+static irqreturn_t sdma_idle_intr(int irq, void *data);
+static irqreturn_t sdma_progress_intr(int irq, void *data);
+static irqreturn_t sdma_cleanup_intr(int irq, void *data);
+static void qib_7322_txchk_change(struct qib_devdata *, u32, u32, u32,
+				  struct qib_ctxtdata *rcd);
+static u8 qib_7322_phys_portstate(u64);
+static u32 qib_7322_iblink_state(u64);
+static void qib_set_ib_7322_lstate(struct qib_pportdata *ppd, u16 linkcmd,
+				   u16 linitcmd);
+static void force_h1(struct qib_pportdata *);
+static void adj_tx_serdes(struct qib_pportdata *);
+static u32 qib_7322_setpbc_control(struct qib_pportdata *, u32, u8, u8);
+static void qib_7322_mini_pcs_reset(struct qib_pportdata *);
+
+static u32 ahb_mod(struct qib_devdata *, int, int, int, u32, u32);
+static void ibsd_wr_allchans(struct qib_pportdata *, int, unsigned, unsigned);
+static void serdes_7322_los_enable(struct qib_pportdata *, int);
+static int serdes_7322_init_old(struct qib_pportdata *);
+static int serdes_7322_init_new(struct qib_pportdata *);
+static void dump_sdma_7322_state(struct qib_pportdata *);
+
+#define BMASK(msb, lsb) (((1 << ((msb) + 1 - (lsb))) - 1) << (lsb))
+
+/* LE2 serdes values for different cases */
+#define LE2_DEFAULT 5
+#define LE2_5m 4
+#define LE2_QME 0
+
+/* Below is special-purpose, so only really works for the IB SerDes blocks. */
+#define IBSD(hw_pidx) (hw_pidx + 2)
+
+/* these are variables for documentation and experimentation purposes */
+static const unsigned rcv_int_timeout = 375;
+static const unsigned rcv_int_count = 16;
+static const unsigned sdma_idle_cnt = 64;
+
+/* Time to stop altering Rx Equalization parameters, after link up. */
+#define RXEQ_DISABLE_MSECS 2500
+
+/*
+ * Number of VLs we are configured to use (to allow for more
+ * credits per vl, etc.)
+ */
+ushort qib_num_cfg_vls = 2;
+module_param_named(num_vls, qib_num_cfg_vls, ushort, S_IRUGO);
+MODULE_PARM_DESC(num_vls, "Set number of Virtual Lanes to use (1-8)");
+
+static ushort qib_chase = 1;
+module_param_named(chase, qib_chase, ushort, S_IRUGO);
+MODULE_PARM_DESC(chase, "Enable state chase handling");
+
+static ushort qib_long_atten = 10; /* 10 dB ~= 5m length */
+module_param_named(long_attenuation, qib_long_atten, ushort, S_IRUGO);
+MODULE_PARM_DESC(long_attenuation, \
+		 "attenuation cutoff (dB) for long copper cable setup");
+
+static ushort qib_singleport;
+module_param_named(singleport, qib_singleport, ushort, S_IRUGO);
+MODULE_PARM_DESC(singleport, "Use only IB port 1; more per-port buffer space");
+
+static ushort qib_krcvq01_no_msi;
+module_param_named(krcvq01_no_msi, qib_krcvq01_no_msi, ushort, S_IRUGO);
+MODULE_PARM_DESC(krcvq01_no_msi, "No MSI for kctx < 2");
+
+/*
+ * Receive header queue sizes
+ */
+static unsigned qib_rcvhdrcnt;
+module_param_named(rcvhdrcnt, qib_rcvhdrcnt, uint, S_IRUGO);
+MODULE_PARM_DESC(rcvhdrcnt, "receive header count");
+
+static unsigned qib_rcvhdrsize;
+module_param_named(rcvhdrsize, qib_rcvhdrsize, uint, S_IRUGO);
+MODULE_PARM_DESC(rcvhdrsize, "receive header size in 32-bit words");
+
+static unsigned qib_rcvhdrentsize;
+module_param_named(rcvhdrentsize, qib_rcvhdrentsize, uint, S_IRUGO);
+MODULE_PARM_DESC(rcvhdrentsize, "receive header entry size in 32-bit words");
+
+#define MAX_ATTEN_LEN 64 /* plenty for any real system */
+/* for read back, default index is ~5m copper cable */
+static char txselect_list[MAX_ATTEN_LEN] = "10";
+static struct kparam_string kp_txselect = {
+	.string = txselect_list,
+	.maxlen = MAX_ATTEN_LEN
+};
+static int  setup_txselect(const char *, struct kernel_param *);
+module_param_call(txselect, setup_txselect, param_get_string,
+		  &kp_txselect, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(txselect, \
+		 "Tx serdes indices (for no QSFP or invalid QSFP data)");
+
+#define BOARD_QME7342 5
+#define BOARD_QMH7342 6
+#define IS_QMH(dd) (SYM_FIELD((dd)->revision, Revision, BoardID) == \
+		    BOARD_QMH7342)
+#define IS_QME(dd) (SYM_FIELD((dd)->revision, Revision, BoardID) == \
+		    BOARD_QME7342)
+
+#define KREG_IDX(regname)     (QIB_7322_##regname##_OFFS / sizeof(u64))
+
+#define KREG_IBPORT_IDX(regname) ((QIB_7322_##regname##_0_OFFS / sizeof(u64)))
+
+#define MASK_ACROSS(lsb, msb) \
+	(((1ULL << ((msb) + 1 - (lsb))) - 1) << (lsb))
+
+#define SYM_RMASK(regname, fldname) ((u64)              \
+	QIB_7322_##regname##_##fldname##_RMASK)
+
+#define SYM_MASK(regname, fldname) ((u64)               \
+	QIB_7322_##regname##_##fldname##_RMASK <<       \
+	 QIB_7322_##regname##_##fldname##_LSB)
+
+#define SYM_FIELD(value, regname, fldname) ((u64)	\
+	(((value) >> SYM_LSB(regname, fldname)) &	\
+	 SYM_RMASK(regname, fldname)))
+
+/* useful for things like LaFifoEmpty_0...7, TxCreditOK_0...7, etc. */
+#define SYM_FIELD_ACROSS(value, regname, fldname, nbits) \
+	(((value) >> SYM_LSB(regname, fldname)) & MASK_ACROSS(0, nbits))
+
+#define HWE_MASK(fldname) SYM_MASK(HwErrMask, fldname##Mask)
+#define ERR_MASK(fldname) SYM_MASK(ErrMask, fldname##Mask)
+#define ERR_MASK_N(fldname) SYM_MASK(ErrMask_0, fldname##Mask)
+#define INT_MASK(fldname) SYM_MASK(IntMask, fldname##IntMask)
+#define INT_MASK_P(fldname, port) SYM_MASK(IntMask, fldname##IntMask##_##port)
+/* Below because most, but not all, fields of IntMask have that full suffix */
+#define INT_MASK_PM(fldname, port) SYM_MASK(IntMask, fldname##Mask##_##port)
+
+
+#define SYM_LSB(regname, fldname) (QIB_7322_##regname##_##fldname##_LSB)
+
+/*
+ * the size bits give us 2^N, in KB units.  0 marks as invalid,
+ * and 7 is reserved.  We currently use only 2KB and 4KB
+ */
+#define IBA7322_TID_SZ_SHIFT QIB_7322_RcvTIDArray0_RT_BufSize_LSB
+#define IBA7322_TID_SZ_2K (1UL<<IBA7322_TID_SZ_SHIFT) /* 2KB */
+#define IBA7322_TID_SZ_4K (2UL<<IBA7322_TID_SZ_SHIFT) /* 4KB */
+#define IBA7322_TID_PA_SHIFT 11U /* TID addr in chip stored w/o low bits */
+
+#define SendIBSLIDAssignMask \
+	QIB_7322_SendIBSLIDAssign_0_SendIBSLIDAssign_15_0_RMASK
+#define SendIBSLMCMask \
+	QIB_7322_SendIBSLIDMask_0_SendIBSLIDMask_15_0_RMASK
+
+#define ExtLED_IB1_YEL SYM_MASK(EXTCtrl, LEDPort0YellowOn)
+#define ExtLED_IB1_GRN SYM_MASK(EXTCtrl, LEDPort0GreenOn)
+#define ExtLED_IB2_YEL SYM_MASK(EXTCtrl, LEDPort1YellowOn)
+#define ExtLED_IB2_GRN SYM_MASK(EXTCtrl, LEDPort1GreenOn)
+#define ExtLED_IB1_MASK (ExtLED_IB1_YEL | ExtLED_IB1_GRN)
+#define ExtLED_IB2_MASK (ExtLED_IB2_YEL | ExtLED_IB2_GRN)
+
+#define _QIB_GPIO_SDA_NUM 1
+#define _QIB_GPIO_SCL_NUM 0
+#define QIB_EEPROM_WEN_NUM 14
+#define QIB_TWSI_EEPROM_DEV 0xA2 /* All Production 7322 cards. */
+
+/* HW counter clock is at 4nsec */
+#define QIB_7322_PSXMITWAIT_CHECK_RATE 4000
+
+/* full speed IB port 1 only */
+#define PORT_SPD_CAP (QIB_IB_SDR | QIB_IB_DDR | QIB_IB_QDR)
+#define PORT_SPD_CAP_SHIFT 3
+
+/* full speed featuremask, both ports */
+#define DUAL_PORT_CAP (PORT_SPD_CAP | (PORT_SPD_CAP << PORT_SPD_CAP_SHIFT))
+
+/*
+ * This file contains almost all the chip-specific register information and
+ * access functions for the FAKED QLogic InfiniPath 7322 PCI-Express chip.
+ */
+
+/* Use defines to tie machine-generated names to lower-case names */
+#define kr_contextcnt KREG_IDX(ContextCnt)
+#define kr_control KREG_IDX(Control)
+#define kr_counterregbase KREG_IDX(CntrRegBase)
+#define kr_errclear KREG_IDX(ErrClear)
+#define kr_errmask KREG_IDX(ErrMask)
+#define kr_errstatus KREG_IDX(ErrStatus)
+#define kr_extctrl KREG_IDX(EXTCtrl)
+#define kr_extstatus KREG_IDX(EXTStatus)
+#define kr_gpio_clear KREG_IDX(GPIOClear)
+#define kr_gpio_mask KREG_IDX(GPIOMask)
+#define kr_gpio_out KREG_IDX(GPIOOut)
+#define kr_gpio_status KREG_IDX(GPIOStatus)
+#define kr_hwdiagctrl KREG_IDX(HwDiagCtrl)
+#define kr_debugportval KREG_IDX(DebugPortValueReg)
+#define kr_fmask KREG_IDX(feature_mask)
+#define kr_act_fmask KREG_IDX(active_feature_mask)
+#define kr_hwerrclear KREG_IDX(HwErrClear)
+#define kr_hwerrmask KREG_IDX(HwErrMask)
+#define kr_hwerrstatus KREG_IDX(HwErrStatus)
+#define kr_intclear KREG_IDX(IntClear)
+#define kr_intmask KREG_IDX(IntMask)
+#define kr_intredirect KREG_IDX(IntRedirect0)
+#define kr_intstatus KREG_IDX(IntStatus)
+#define kr_pagealign KREG_IDX(PageAlign)
+#define kr_rcvavailtimeout KREG_IDX(RcvAvailTimeOut0)
+#define kr_rcvctrl KREG_IDX(RcvCtrl) /* Common, but chip also has per-port */
+#define kr_rcvegrbase KREG_IDX(RcvEgrBase)
+#define kr_rcvegrcnt KREG_IDX(RcvEgrCnt)
+#define kr_rcvhdrcnt KREG_IDX(RcvHdrCnt)
+#define kr_rcvhdrentsize KREG_IDX(RcvHdrEntSize)
+#define kr_rcvhdrsize KREG_IDX(RcvHdrSize)
+#define kr_rcvtidbase KREG_IDX(RcvTIDBase)
+#define kr_rcvtidcnt KREG_IDX(RcvTIDCnt)
+#define kr_revision KREG_IDX(Revision)
+#define kr_scratch KREG_IDX(Scratch)
+#define kr_sendbuffererror KREG_IDX(SendBufErr0) /* and base for 1 and 2 */
+#define kr_sendcheckmask KREG_IDX(SendCheckMask0) /* and 1, 2 */
+#define kr_sendctrl KREG_IDX(SendCtrl)
+#define kr_sendgrhcheckmask KREG_IDX(SendGRHCheckMask0) /* and 1, 2 */
+#define kr_sendibpktmask KREG_IDX(SendIBPacketMask0) /* and 1, 2 */
+#define kr_sendpioavailaddr KREG_IDX(SendBufAvailAddr)
+#define kr_sendpiobufbase KREG_IDX(SendBufBase)
+#define kr_sendpiobufcnt KREG_IDX(SendBufCnt)
+#define kr_sendpiosize KREG_IDX(SendBufSize)
+#define kr_sendregbase KREG_IDX(SendRegBase)
+#define kr_sendbufavail0 KREG_IDX(SendBufAvail0)
+#define kr_userregbase KREG_IDX(UserRegBase)
+#define kr_intgranted KREG_IDX(Int_Granted)
+#define kr_vecclr_wo_int KREG_IDX(vec_clr_without_int)
+#define kr_intblocked KREG_IDX(IntBlocked)
+#define kr_r_access KREG_IDX(SPC_JTAG_ACCESS_REG)
+
+/*
+ * per-port kernel registers.  Access only with qib_read_kreg_port()
+ * or qib_write_kreg_port()
+ */
+#define krp_errclear KREG_IBPORT_IDX(ErrClear)
+#define krp_errmask KREG_IBPORT_IDX(ErrMask)
+#define krp_errstatus KREG_IBPORT_IDX(ErrStatus)
+#define krp_highprio_0 KREG_IBPORT_IDX(HighPriority0)
+#define krp_highprio_limit KREG_IBPORT_IDX(HighPriorityLimit)
+#define krp_hrtbt_guid KREG_IBPORT_IDX(HRTBT_GUID)
+#define krp_ib_pcsconfig KREG_IBPORT_IDX(IBPCSConfig)
+#define krp_ibcctrl_a KREG_IBPORT_IDX(IBCCtrlA)
+#define krp_ibcctrl_b KREG_IBPORT_IDX(IBCCtrlB)
+#define krp_ibcctrl_c KREG_IBPORT_IDX(IBCCtrlC)
+#define krp_ibcstatus_a KREG_IBPORT_IDX(IBCStatusA)
+#define krp_ibcstatus_b KREG_IBPORT_IDX(IBCStatusB)
+#define krp_txestatus KREG_IBPORT_IDX(TXEStatus)
+#define krp_lowprio_0 KREG_IBPORT_IDX(LowPriority0)
+#define krp_ncmodectrl KREG_IBPORT_IDX(IBNCModeCtrl)
+#define krp_partitionkey KREG_IBPORT_IDX(RcvPartitionKey)
+#define krp_psinterval KREG_IBPORT_IDX(PSInterval)
+#define krp_psstart KREG_IBPORT_IDX(PSStart)
+#define krp_psstat KREG_IBPORT_IDX(PSStat)
+#define krp_rcvbthqp KREG_IBPORT_IDX(RcvBTHQP)
+#define krp_rcvctrl KREG_IBPORT_IDX(RcvCtrl)
+#define krp_rcvpktledcnt KREG_IBPORT_IDX(RcvPktLEDCnt)
+#define krp_rcvqpmaptable KREG_IBPORT_IDX(RcvQPMapTableA)
+#define krp_rxcreditvl0 KREG_IBPORT_IDX(RxCreditVL0)
+#define krp_rxcreditvl15 (KREG_IBPORT_IDX(RxCreditVL0)+15)
+#define krp_sendcheckcontrol KREG_IBPORT_IDX(SendCheckControl)
+#define krp_sendctrl KREG_IBPORT_IDX(SendCtrl)
+#define krp_senddmabase KREG_IBPORT_IDX(SendDmaBase)
+#define krp_senddmabufmask0 KREG_IBPORT_IDX(SendDmaBufMask0)
+#define krp_senddmabufmask1 (KREG_IBPORT_IDX(SendDmaBufMask0) + 1)
+#define krp_senddmabufmask2 (KREG_IBPORT_IDX(SendDmaBufMask0) + 2)
+#define krp_senddmabuf_use0 KREG_IBPORT_IDX(SendDmaBufUsed0)
+#define krp_senddmabuf_use1 (KREG_IBPORT_IDX(SendDmaBufUsed0) + 1)
+#define krp_senddmabuf_use2 (KREG_IBPORT_IDX(SendDmaBufUsed0) + 2)
+#define krp_senddmadesccnt KREG_IBPORT_IDX(SendDmaDescCnt)
+#define krp_senddmahead KREG_IBPORT_IDX(SendDmaHead)
+#define krp_senddmaheadaddr KREG_IBPORT_IDX(SendDmaHeadAddr)
+#define krp_senddmaidlecnt KREG_IBPORT_IDX(SendDmaIdleCnt)
+#define krp_senddmalengen KREG_IBPORT_IDX(SendDmaLenGen)
+#define krp_senddmaprioritythld KREG_IBPORT_IDX(SendDmaPriorityThld)
+#define krp_senddmareloadcnt KREG_IBPORT_IDX(SendDmaReloadCnt)
+#define krp_senddmastatus KREG_IBPORT_IDX(SendDmaStatus)
+#define krp_senddmatail KREG_IBPORT_IDX(SendDmaTail)
+#define krp_sendhdrsymptom KREG_IBPORT_IDX(SendHdrErrSymptom)
+#define krp_sendslid KREG_IBPORT_IDX(SendIBSLIDAssign)
+#define krp_sendslidmask KREG_IBPORT_IDX(SendIBSLIDMask)
+#define krp_ibsdtestiftx KREG_IBPORT_IDX(IB_SDTEST_IF_TX)
+#define krp_adapt_dis_timer KREG_IBPORT_IDX(ADAPT_DISABLE_TIMER_THRESHOLD)
+#define krp_tx_deemph_override KREG_IBPORT_IDX(IBSD_TX_DEEMPHASIS_OVERRIDE)
+#define krp_serdesctrl KREG_IBPORT_IDX(IBSerdesCtrl)
+
+/*
+ * Per-context kernel registers.  Access only with qib_read_kreg_ctxt()
+ * or qib_write_kreg_ctxt()
+ */
+#define krc_rcvhdraddr KREG_IDX(RcvHdrAddr0)
+#define krc_rcvhdrtailaddr KREG_IDX(RcvHdrTailAddr0)
+
+/*
+ * TID Flow table, per context.  Reduces
+ * number of hdrq updates to one per flow (or on errors).
+ * context 0 and 1 share same memory, but have distinct
+ * addresses.  Since for now, we never use expected sends
+ * on kernel contexts, we don't worry about that (we initialize
+ * those entries for ctxt 0/1 on driver load twice, for example).
+ */
+#define NUM_TIDFLOWS_CTXT 0x20 /* 0x20 per context; have to hardcode */
+#define ur_rcvflowtable (KREG_IDX(RcvTIDFlowTable0) - KREG_IDX(RcvHdrTail0))
+
+/* these are the error bits in the tid flows, and are W1C */
+#define TIDFLOW_ERRBITS  ( \
+	(SYM_MASK(RcvTIDFlowTable0, GenMismatch) << \
+	SYM_LSB(RcvTIDFlowTable0, GenMismatch)) | \
+	(SYM_MASK(RcvTIDFlowTable0, SeqMismatch) << \
+	SYM_LSB(RcvTIDFlowTable0, SeqMismatch)))
+
+/* Most (not all) Counters are per-IBport.
+ * Requires LBIntCnt is at offset 0 in the group
+ */
+#define CREG_IDX(regname) \
+((QIB_7322_##regname##_0_OFFS - QIB_7322_LBIntCnt_OFFS) / sizeof(u64))
+
+#define crp_badformat CREG_IDX(RxVersionErrCnt)
+#define crp_err_rlen CREG_IDX(RxLenErrCnt)
+#define crp_erricrc CREG_IDX(RxICRCErrCnt)
+#define crp_errlink CREG_IDX(RxLinkMalformCnt)
+#define crp_errlpcrc CREG_IDX(RxLPCRCErrCnt)
+#define crp_errpkey CREG_IDX(RxPKeyMismatchCnt)
+#define crp_errvcrc CREG_IDX(RxVCRCErrCnt)
+#define crp_excessbufferovfl CREG_IDX(ExcessBufferOvflCnt)
+#define crp_iblinkdown CREG_IDX(IBLinkDownedCnt)
+#define crp_iblinkerrrecov CREG_IDX(IBLinkErrRecoveryCnt)
+#define crp_ibstatuschange CREG_IDX(IBStatusChangeCnt)
+#define crp_ibsymbolerr CREG_IDX(IBSymbolErrCnt)
+#define crp_invalidrlen CREG_IDX(RxMaxMinLenErrCnt)
+#define crp_locallinkintegrityerr CREG_IDX(LocalLinkIntegrityErrCnt)
+#define crp_pktrcv CREG_IDX(RxDataPktCnt)
+#define crp_pktrcvflowctrl CREG_IDX(RxFlowPktCnt)
+#define crp_pktsend CREG_IDX(TxDataPktCnt)
+#define crp_pktsendflow CREG_IDX(TxFlowPktCnt)
+#define crp_psrcvdatacount CREG_IDX(PSRcvDataCount)
+#define crp_psrcvpktscount CREG_IDX(PSRcvPktsCount)
+#define crp_psxmitdatacount CREG_IDX(PSXmitDataCount)
+#define crp_psxmitpktscount CREG_IDX(PSXmitPktsCount)
+#define crp_psxmitwaitcount CREG_IDX(PSXmitWaitCount)
+#define crp_rcvebp CREG_IDX(RxEBPCnt)
+#define crp_rcvflowctrlviol CREG_IDX(RxFlowCtrlViolCnt)
+#define crp_rcvovfl CREG_IDX(RxBufOvflCnt)
+#define crp_rxdlidfltr CREG_IDX(RxDlidFltrCnt)
+#define crp_rxdroppkt CREG_IDX(RxDroppedPktCnt)
+#define crp_rxotherlocalphyerr CREG_IDX(RxOtherLocalPhyErrCnt)
+#define crp_rxqpinvalidctxt CREG_IDX(RxQPInvalidContextCnt)
+#define crp_rxvlerr CREG_IDX(RxVlErrCnt)
+#define crp_sendstall CREG_IDX(TxFlowStallCnt)
+#define crp_txdroppedpkt CREG_IDX(TxDroppedPktCnt)
+#define crp_txhdrerr CREG_IDX(TxHeadersErrCnt)
+#define crp_txlenerr CREG_IDX(TxLenErrCnt)
+#define crp_txminmaxlenerr CREG_IDX(TxMaxMinLenErrCnt)
+#define crp_txsdmadesc CREG_IDX(TxSDmaDescCnt)
+#define crp_txunderrun CREG_IDX(TxUnderrunCnt)
+#define crp_txunsupvl CREG_IDX(TxUnsupVLErrCnt)
+#define crp_vl15droppedpkt CREG_IDX(RxVL15DroppedPktCnt)
+#define crp_wordrcv CREG_IDX(RxDwordCnt)
+#define crp_wordsend CREG_IDX(TxDwordCnt)
+#define crp_tx_creditstalls CREG_IDX(TxCreditUpToDateTimeOut)
+
+/* these are the (few) counters that are not port-specific */
+#define CREG_DEVIDX(regname) ((QIB_7322_##regname##_OFFS - \
+			QIB_7322_LBIntCnt_OFFS) / sizeof(u64))
+#define cr_base_egrovfl CREG_DEVIDX(RxP0HdrEgrOvflCnt)
+#define cr_lbint CREG_DEVIDX(LBIntCnt)
+#define cr_lbstall CREG_DEVIDX(LBFlowStallCnt)
+#define cr_pcieretrydiag CREG_DEVIDX(PcieRetryBufDiagQwordCnt)
+#define cr_rxtidflowdrop CREG_DEVIDX(RxTidFlowDropCnt)
+#define cr_tidfull CREG_DEVIDX(RxTIDFullErrCnt)
+#define cr_tidinvalid CREG_DEVIDX(RxTIDValidErrCnt)
+
+/* no chip register for # of IB ports supported, so define */
+#define NUM_IB_PORTS 2
+
+/* 1 VL15 buffer per hardware IB port, no register for this, so define */
+#define NUM_VL15_BUFS NUM_IB_PORTS
+
+/*
+ * context 0 and 1 are special, and there is no chip register that
+ * defines this value, so we have to define it here.
+ * These are all allocated to either 0 or 1 for single port
+ * hardware configuration, otherwise each gets half
+ */
+#define KCTXT0_EGRCNT 2048
+
+/* values for vl and port fields in PBC, 7322-specific */
+#define PBC_PORT_SEL_LSB 26
+#define PBC_PORT_SEL_RMASK 1
+#define PBC_VL_NUM_LSB 27
+#define PBC_VL_NUM_RMASK 7
+#define PBC_7322_VL15_SEND (1ULL << 63) /* pbc; VL15, no credit check */
+#define PBC_7322_VL15_SEND_CTRL (1ULL << 31) /* control version of same */
+
+static u8 ib_rate_to_delay[IB_RATE_120_GBPS + 1] = {
+	[IB_RATE_2_5_GBPS] = 16,
+	[IB_RATE_5_GBPS] = 8,
+	[IB_RATE_10_GBPS] = 4,
+	[IB_RATE_20_GBPS] = 2,
+	[IB_RATE_30_GBPS] = 2,
+	[IB_RATE_40_GBPS] = 1
+};
+
+#define IBA7322_LINKSPEED_SHIFT SYM_LSB(IBCStatusA_0, LinkSpeedActive)
+#define IBA7322_LINKWIDTH_SHIFT SYM_LSB(IBCStatusA_0, LinkWidthActive)
+
+/* link training states, from IBC */
+#define IB_7322_LT_STATE_DISABLED        0x00
+#define IB_7322_LT_STATE_LINKUP          0x01
+#define IB_7322_LT_STATE_POLLACTIVE      0x02
+#define IB_7322_LT_STATE_POLLQUIET       0x03
+#define IB_7322_LT_STATE_SLEEPDELAY      0x04
+#define IB_7322_LT_STATE_SLEEPQUIET      0x05
+#define IB_7322_LT_STATE_CFGDEBOUNCE     0x08
+#define IB_7322_LT_STATE_CFGRCVFCFG      0x09
+#define IB_7322_LT_STATE_CFGWAITRMT      0x0a
+#define IB_7322_LT_STATE_CFGIDLE         0x0b
+#define IB_7322_LT_STATE_RECOVERRETRAIN  0x0c
+#define IB_7322_LT_STATE_TXREVLANES      0x0d
+#define IB_7322_LT_STATE_RECOVERWAITRMT  0x0e
+#define IB_7322_LT_STATE_RECOVERIDLE     0x0f
+#define IB_7322_LT_STATE_CFGENH          0x10
+#define IB_7322_LT_STATE_CFGTEST         0x11
+#define IB_7322_LT_STATE_CFGWAITRMTTEST  0x12
+#define IB_7322_LT_STATE_CFGWAITENH      0x13
+
+/* link state machine states from IBC */
+#define IB_7322_L_STATE_DOWN             0x0
+#define IB_7322_L_STATE_INIT             0x1
+#define IB_7322_L_STATE_ARM              0x2
+#define IB_7322_L_STATE_ACTIVE           0x3
+#define IB_7322_L_STATE_ACT_DEFER        0x4
+
+static const u8 qib_7322_physportstate[0x20] = {
+	[IB_7322_LT_STATE_DISABLED] = IB_PHYSPORTSTATE_DISABLED,
+	[IB_7322_LT_STATE_LINKUP] = IB_PHYSPORTSTATE_LINKUP,
+	[IB_7322_LT_STATE_POLLACTIVE] = IB_PHYSPORTSTATE_POLL,
+	[IB_7322_LT_STATE_POLLQUIET] = IB_PHYSPORTSTATE_POLL,
+	[IB_7322_LT_STATE_SLEEPDELAY] = IB_PHYSPORTSTATE_SLEEP,
+	[IB_7322_LT_STATE_SLEEPQUIET] = IB_PHYSPORTSTATE_SLEEP,
+	[IB_7322_LT_STATE_CFGDEBOUNCE] = IB_PHYSPORTSTATE_CFG_TRAIN,
+	[IB_7322_LT_STATE_CFGRCVFCFG] =
+		IB_PHYSPORTSTATE_CFG_TRAIN,
+	[IB_7322_LT_STATE_CFGWAITRMT] =
+		IB_PHYSPORTSTATE_CFG_TRAIN,
+	[IB_7322_LT_STATE_CFGIDLE] = IB_PHYSPORTSTATE_CFG_IDLE,
+	[IB_7322_LT_STATE_RECOVERRETRAIN] =
+		IB_PHYSPORTSTATE_LINK_ERR_RECOVER,
+	[IB_7322_LT_STATE_RECOVERWAITRMT] =
+		IB_PHYSPORTSTATE_LINK_ERR_RECOVER,
+	[IB_7322_LT_STATE_RECOVERIDLE] =
+		IB_PHYSPORTSTATE_LINK_ERR_RECOVER,
+	[IB_7322_LT_STATE_CFGENH] = IB_PHYSPORTSTATE_CFG_ENH,
+	[IB_7322_LT_STATE_CFGTEST] = IB_PHYSPORTSTATE_CFG_TRAIN,
+	[IB_7322_LT_STATE_CFGWAITRMTTEST] =
+		IB_PHYSPORTSTATE_CFG_TRAIN,
+	[IB_7322_LT_STATE_CFGWAITENH] =
+		IB_PHYSPORTSTATE_CFG_WAIT_ENH,
+	[0x14] = IB_PHYSPORTSTATE_CFG_TRAIN,
+	[0x15] = IB_PHYSPORTSTATE_CFG_TRAIN,
+	[0x16] = IB_PHYSPORTSTATE_CFG_TRAIN,
+	[0x17] = IB_PHYSPORTSTATE_CFG_TRAIN
+};
+
+#ifdef CONFIG_INFINIBAND_QIB_DCA
+struct qib_irq_notify {
+	int rcv;
+	void *arg;
+	struct irq_affinity_notify notify;
+};
+#endif
+
+struct qib_chip_specific {
+	u64 __iomem *cregbase;
+	u64 *cntrs;
+	spinlock_t rcvmod_lock; /* protect rcvctrl shadow changes */
+	spinlock_t gpio_lock; /* RMW of shadows/regs for ExtCtrl and GPIO */
+	u64 main_int_mask;      /* clear bits which have dedicated handlers */
+	u64 int_enable_mask;  /* for per port interrupts in single port mode */
+	u64 errormask;
+	u64 hwerrmask;
+	u64 gpio_out; /* shadow of kr_gpio_out, for rmw ops */
+	u64 gpio_mask; /* shadow the gpio mask register */
+	u64 extctrl; /* shadow the gpio output enable, etc... */
+	u32 ncntrs;
+	u32 nportcntrs;
+	u32 cntrnamelen;
+	u32 portcntrnamelen;
+	u32 numctxts;
+	u32 rcvegrcnt;
+	u32 updthresh; /* current AvailUpdThld */
+	u32 updthresh_dflt; /* default AvailUpdThld */
+	u32 r1;
+	int irq;
+	u32 num_msix_entries;
+	u32 sdmabufcnt;
+	u32 lastbuf_for_pio;
+	u32 stay_in_freeze;
+	u32 recovery_ports_initted;
+#ifdef CONFIG_INFINIBAND_QIB_DCA
+	u32 dca_ctrl;
+	int rhdr_cpu[18];
+	int sdma_cpu[2];
+	u64 dca_rcvhdr_ctrl[5]; /* B, C, D, E, F */
+#endif
+	struct qib_msix_entry *msix_entries;
+	unsigned long *sendchkenable;
+	unsigned long *sendgrhchk;
+	unsigned long *sendibchk;
+	u32 rcvavail_timeout[18];
+	char emsgbuf[128]; /* for device error interrupt msg buffer */
+};
+
+/* Table of entries in "human readable" form Tx Emphasis. */
+struct txdds_ent {
+	u8 amp;
+	u8 pre;
+	u8 main;
+	u8 post;
+};
+
+struct vendor_txdds_ent {
+	u8 oui[QSFP_VOUI_LEN];
+	u8 *partnum;
+	struct txdds_ent sdr;
+	struct txdds_ent ddr;
+	struct txdds_ent qdr;
+};
+
+static void write_tx_serdes_param(struct qib_pportdata *, struct txdds_ent *);
+
+#define TXDDS_TABLE_SZ 16 /* number of entries per speed in onchip table */
+#define TXDDS_EXTRA_SZ 18 /* number of extra tx settings entries */
+#define TXDDS_MFG_SZ 2    /* number of mfg tx settings entries */
+#define SERDES_CHANS 4 /* yes, it's obvious, but one less magic number */
+
+#define H1_FORCE_VAL 8
+#define H1_FORCE_QME 1 /*  may be overridden via setup_txselect() */
+#define H1_FORCE_QMH 7 /*  may be overridden via setup_txselect() */
+
+/* The static and dynamic registers are paired, and the pairs indexed by spd */
+#define krp_static_adapt_dis(spd) (KREG_IBPORT_IDX(ADAPT_DISABLE_STATIC_SDR) \
+	+ ((spd) * 2))
+
+#define QDR_DFE_DISABLE_DELAY 4000 /* msec after LINKUP */
+#define QDR_STATIC_ADAPT_DOWN 0xf0f0f0f0ULL /* link down, H1-H4 QDR adapts */
+#define QDR_STATIC_ADAPT_DOWN_R1 0ULL /* r1 link down, H1-H4 QDR adapts */
+#define QDR_STATIC_ADAPT_INIT 0xffffffffffULL /* up, disable H0,H1-8, LE */
+#define QDR_STATIC_ADAPT_INIT_R1 0xf0ffffffffULL /* r1 up, disable H0,H1-8 */
+
+struct qib_chippport_specific {
+	u64 __iomem *kpregbase;
+	u64 __iomem *cpregbase;
+	u64 *portcntrs;
+	struct qib_pportdata *ppd;
+	wait_queue_head_t autoneg_wait;
+	struct delayed_work autoneg_work;
+	struct delayed_work ipg_work;
+	struct timer_list chase_timer;
+	/*
+	 * these 5 fields are used to establish deltas for IB symbol
+	 * errors and linkrecovery errors.  They can be reported on
+	 * some chips during link negotiation prior to INIT, and with
+	 * DDR when faking DDR negotiations with non-IBTA switches.
+	 * The chip counters are adjusted at driver unload if there is
+	 * a non-zero delta.
+	 */
+	u64 ibdeltainprog;
+	u64 ibsymdelta;
+	u64 ibsymsnap;
+	u64 iblnkerrdelta;
+	u64 iblnkerrsnap;
+	u64 iblnkdownsnap;
+	u64 iblnkdowndelta;
+	u64 ibmalfdelta;
+	u64 ibmalfsnap;
+	u64 ibcctrl_a; /* krp_ibcctrl_a shadow */
+	u64 ibcctrl_b; /* krp_ibcctrl_b shadow */
+	unsigned long qdr_dfe_time;
+	unsigned long chase_end;
+	u32 autoneg_tries;
+	u32 recovery_init;
+	u32 qdr_dfe_on;
+	u32 qdr_reforce;
+	/*
+	 * Per-bay per-channel rcv QMH H1 values and Tx values for QDR.
+	 * entry zero is unused, to simplify indexing
+	 */
+	u8 h1_val;
+	u8 no_eep;  /* txselect table index to use if no qsfp info */
+	u8 ipg_tries;
+	u8 ibmalfusesnap;
+	struct qib_qsfp_data qsfp_data;
+	char epmsgbuf[192]; /* for port error interrupt msg buffer */
+	char sdmamsgbuf[192]; /* for per-port sdma error messages */
+};
+
+static struct {
+	const char *name;
+	irq_handler_t handler;
+	int lsb;
+	int port; /* 0 if not port-specific, else port # */
+	int dca;
+} irq_table[] = {
+	{ "", qib_7322intr, -1, 0, 0 },
+	{ " (buf avail)", qib_7322bufavail,
+		SYM_LSB(IntStatus, SendBufAvail), 0, 0},
+	{ " (sdma 0)", sdma_intr,
+		SYM_LSB(IntStatus, SDmaInt_0), 1, 1 },
+	{ " (sdma 1)", sdma_intr,
+		SYM_LSB(IntStatus, SDmaInt_1), 2, 1 },
+	{ " (sdmaI 0)", sdma_idle_intr,
+		SYM_LSB(IntStatus, SDmaIdleInt_0), 1, 1},
+	{ " (sdmaI 1)", sdma_idle_intr,
+		SYM_LSB(IntStatus, SDmaIdleInt_1), 2, 1},
+	{ " (sdmaP 0)", sdma_progress_intr,
+		SYM_LSB(IntStatus, SDmaProgressInt_0), 1, 1 },
+	{ " (sdmaP 1)", sdma_progress_intr,
+		SYM_LSB(IntStatus, SDmaProgressInt_1), 2, 1 },
+	{ " (sdmaC 0)", sdma_cleanup_intr,
+		SYM_LSB(IntStatus, SDmaCleanupDone_0), 1, 0 },
+	{ " (sdmaC 1)", sdma_cleanup_intr,
+		SYM_LSB(IntStatus, SDmaCleanupDone_1), 2 , 0},
+};
+
+#ifdef CONFIG_INFINIBAND_QIB_DCA
+
+static const struct dca_reg_map {
+	int     shadow_inx;
+	int     lsb;
+	u64     mask;
+	u16     regno;
+} dca_rcvhdr_reg_map[] = {
+	{ 0, SYM_LSB(DCACtrlB, RcvHdrq0DCAOPH),
+	   ~SYM_MASK(DCACtrlB, RcvHdrq0DCAOPH) , KREG_IDX(DCACtrlB) },
+	{ 0, SYM_LSB(DCACtrlB, RcvHdrq1DCAOPH),
+	   ~SYM_MASK(DCACtrlB, RcvHdrq1DCAOPH) , KREG_IDX(DCACtrlB) },
+	{ 0, SYM_LSB(DCACtrlB, RcvHdrq2DCAOPH),
+	   ~SYM_MASK(DCACtrlB, RcvHdrq2DCAOPH) , KREG_IDX(DCACtrlB) },
+	{ 0, SYM_LSB(DCACtrlB, RcvHdrq3DCAOPH),
+	   ~SYM_MASK(DCACtrlB, RcvHdrq3DCAOPH) , KREG_IDX(DCACtrlB) },
+	{ 1, SYM_LSB(DCACtrlC, RcvHdrq4DCAOPH),
+	   ~SYM_MASK(DCACtrlC, RcvHdrq4DCAOPH) , KREG_IDX(DCACtrlC) },
+	{ 1, SYM_LSB(DCACtrlC, RcvHdrq5DCAOPH),
+	   ~SYM_MASK(DCACtrlC, RcvHdrq5DCAOPH) , KREG_IDX(DCACtrlC) },
+	{ 1, SYM_LSB(DCACtrlC, RcvHdrq6DCAOPH),
+	   ~SYM_MASK(DCACtrlC, RcvHdrq6DCAOPH) , KREG_IDX(DCACtrlC) },
+	{ 1, SYM_LSB(DCACtrlC, RcvHdrq7DCAOPH),
+	   ~SYM_MASK(DCACtrlC, RcvHdrq7DCAOPH) , KREG_IDX(DCACtrlC) },
+	{ 2, SYM_LSB(DCACtrlD, RcvHdrq8DCAOPH),
+	   ~SYM_MASK(DCACtrlD, RcvHdrq8DCAOPH) , KREG_IDX(DCACtrlD) },
+	{ 2, SYM_LSB(DCACtrlD, RcvHdrq9DCAOPH),
+	   ~SYM_MASK(DCACtrlD, RcvHdrq9DCAOPH) , KREG_IDX(DCACtrlD) },
+	{ 2, SYM_LSB(DCACtrlD, RcvHdrq10DCAOPH),
+	   ~SYM_MASK(DCACtrlD, RcvHdrq10DCAOPH) , KREG_IDX(DCACtrlD) },
+	{ 2, SYM_LSB(DCACtrlD, RcvHdrq11DCAOPH),
+	   ~SYM_MASK(DCACtrlD, RcvHdrq11DCAOPH) , KREG_IDX(DCACtrlD) },
+	{ 3, SYM_LSB(DCACtrlE, RcvHdrq12DCAOPH),
+	   ~SYM_MASK(DCACtrlE, RcvHdrq12DCAOPH) , KREG_IDX(DCACtrlE) },
+	{ 3, SYM_LSB(DCACtrlE, RcvHdrq13DCAOPH),
+	   ~SYM_MASK(DCACtrlE, RcvHdrq13DCAOPH) , KREG_IDX(DCACtrlE) },
+	{ 3, SYM_LSB(DCACtrlE, RcvHdrq14DCAOPH),
+	   ~SYM_MASK(DCACtrlE, RcvHdrq14DCAOPH) , KREG_IDX(DCACtrlE) },
+	{ 3, SYM_LSB(DCACtrlE, RcvHdrq15DCAOPH),
+	   ~SYM_MASK(DCACtrlE, RcvHdrq15DCAOPH) , KREG_IDX(DCACtrlE) },
+	{ 4, SYM_LSB(DCACtrlF, RcvHdrq16DCAOPH),
+	   ~SYM_MASK(DCACtrlF, RcvHdrq16DCAOPH) , KREG_IDX(DCACtrlF) },
+	{ 4, SYM_LSB(DCACtrlF, RcvHdrq17DCAOPH),
+	   ~SYM_MASK(DCACtrlF, RcvHdrq17DCAOPH) , KREG_IDX(DCACtrlF) },
+};
+#endif
+
+/* ibcctrl bits */
+#define QLOGIC_IB_IBCC_LINKINITCMD_DISABLE 1
+/* cycle through TS1/TS2 till OK */
+#define QLOGIC_IB_IBCC_LINKINITCMD_POLL 2
+/* wait for TS1, then go on */
+#define QLOGIC_IB_IBCC_LINKINITCMD_SLEEP 3
+#define QLOGIC_IB_IBCC_LINKINITCMD_SHIFT 16
+
+#define QLOGIC_IB_IBCC_LINKCMD_DOWN 1           /* move to 0x11 */
+#define QLOGIC_IB_IBCC_LINKCMD_ARMED 2          /* move to 0x21 */
+#define QLOGIC_IB_IBCC_LINKCMD_ACTIVE 3 /* move to 0x31 */
+
+#define BLOB_7322_IBCHG 0x101
+
+static inline void qib_write_kreg(const struct qib_devdata *dd,
+				  const u32 regno, u64 value);
+static inline u32 qib_read_kreg32(const struct qib_devdata *, const u32);
+static void write_7322_initregs(struct qib_devdata *);
+static void write_7322_init_portregs(struct qib_pportdata *);
+static void setup_7322_link_recovery(struct qib_pportdata *, u32);
+static void check_7322_rxe_status(struct qib_pportdata *);
+static u32 __iomem *qib_7322_getsendbuf(struct qib_pportdata *, u64, u32 *);
+#ifdef CONFIG_INFINIBAND_QIB_DCA
+static void qib_setup_dca(struct qib_devdata *dd);
+static void setup_dca_notifier(struct qib_devdata *dd,
+			       struct qib_msix_entry *m);
+static void reset_dca_notifier(struct qib_devdata *dd,
+			       struct qib_msix_entry *m);
+#endif
+
+/**
+ * qib_read_ureg32 - read 32-bit virtualized per-context register
+ * @dd: device
+ * @regno: register number
+ * @ctxt: context number
+ *
+ * Return the contents of a register that is virtualized to be per context.
+ * Returns -1 on errors (not distinguishable from valid contents at
+ * runtime; we may add a separate error variable at some point).
+ */
+static inline u32 qib_read_ureg32(const struct qib_devdata *dd,
+				  enum qib_ureg regno, int ctxt)
+{
+	if (!dd->kregbase || !(dd->flags & QIB_PRESENT))
+		return 0;
+	return readl(regno + (u64 __iomem *)(
+		(dd->ureg_align * ctxt) + (dd->userbase ?
+		 (char __iomem *)dd->userbase :
+		 (char __iomem *)dd->kregbase + dd->uregbase)));
+}
+
+/**
+ * qib_read_ureg - read virtualized per-context register
+ * @dd: device
+ * @regno: register number
+ * @ctxt: context number
+ *
+ * Return the contents of a register that is virtualized to be per context.
+ * Returns -1 on errors (not distinguishable from valid contents at
+ * runtime; we may add a separate error variable at some point).
+ */
+static inline u64 qib_read_ureg(const struct qib_devdata *dd,
+				enum qib_ureg regno, int ctxt)
+{
+
+	if (!dd->kregbase || !(dd->flags & QIB_PRESENT))
+		return 0;
+	return readq(regno + (u64 __iomem *)(
+		(dd->ureg_align * ctxt) + (dd->userbase ?
+		 (char __iomem *)dd->userbase :
+		 (char __iomem *)dd->kregbase + dd->uregbase)));
+}
+
+/**
+ * qib_write_ureg - write virtualized per-context register
+ * @dd: device
+ * @regno: register number
+ * @value: value
+ * @ctxt: context
+ *
+ * Write the contents of a register that is virtualized to be per context.
+ */
+static inline void qib_write_ureg(const struct qib_devdata *dd,
+				  enum qib_ureg regno, u64 value, int ctxt)
+{
+	u64 __iomem *ubase;
+	if (dd->userbase)
+		ubase = (u64 __iomem *)
+			((char __iomem *) dd->userbase +
+			 dd->ureg_align * ctxt);
+	else
+		ubase = (u64 __iomem *)
+			(dd->uregbase +
+			 (char __iomem *) dd->kregbase +
+			 dd->ureg_align * ctxt);
+
+	if (dd->kregbase && (dd->flags & QIB_PRESENT))
+		writeq(value, &ubase[regno]);
+}
+
+static inline u32 qib_read_kreg32(const struct qib_devdata *dd,
+				  const u32 regno)
+{
+	if (!dd->kregbase || !(dd->flags & QIB_PRESENT))
+		return -1;
+	return readl((u32 __iomem *) &dd->kregbase[regno]);
+}
+
+static inline u64 qib_read_kreg64(const struct qib_devdata *dd,
+				  const u32 regno)
+{
+	if (!dd->kregbase || !(dd->flags & QIB_PRESENT))
+		return -1;
+	return readq(&dd->kregbase[regno]);
+}
+
+static inline void qib_write_kreg(const struct qib_devdata *dd,
+				  const u32 regno, u64 value)
+{
+	if (dd->kregbase && (dd->flags & QIB_PRESENT))
+		writeq(value, &dd->kregbase[regno]);
+}
+
+/*
+ * not many sanity checks for the port-specific kernel register routines,
+ * since they are only used when it's known to be safe.
+*/
+static inline u64 qib_read_kreg_port(const struct qib_pportdata *ppd,
+				     const u16 regno)
+{
+	if (!ppd->cpspec->kpregbase || !(ppd->dd->flags & QIB_PRESENT))
+		return 0ULL;
+	return readq(&ppd->cpspec->kpregbase[regno]);
+}
+
+static inline void qib_write_kreg_port(const struct qib_pportdata *ppd,
+				       const u16 regno, u64 value)
+{
+	if (ppd->cpspec && ppd->dd && ppd->cpspec->kpregbase &&
+	    (ppd->dd->flags & QIB_PRESENT))
+		writeq(value, &ppd->cpspec->kpregbase[regno]);
+}
+
+/**
+ * qib_write_kreg_ctxt - write a device's per-ctxt 64-bit kernel register
+ * @dd: the qlogic_ib device
+ * @regno: the register number to write
+ * @ctxt: the context containing the register
+ * @value: the value to write
+ */
+static inline void qib_write_kreg_ctxt(const struct qib_devdata *dd,
+				       const u16 regno, unsigned ctxt,
+				       u64 value)
+{
+	qib_write_kreg(dd, regno + ctxt, value);
+}
+
+static inline u64 read_7322_creg(const struct qib_devdata *dd, u16 regno)
+{
+	if (!dd->cspec->cregbase || !(dd->flags & QIB_PRESENT))
+		return 0;
+	return readq(&dd->cspec->cregbase[regno]);
+
+
+}
+
+static inline u32 read_7322_creg32(const struct qib_devdata *dd, u16 regno)
+{
+	if (!dd->cspec->cregbase || !(dd->flags & QIB_PRESENT))
+		return 0;
+	return readl(&dd->cspec->cregbase[regno]);
+
+
+}
+
+static inline void write_7322_creg_port(const struct qib_pportdata *ppd,
+					u16 regno, u64 value)
+{
+	if (ppd->cpspec && ppd->cpspec->cpregbase &&
+	    (ppd->dd->flags & QIB_PRESENT))
+		writeq(value, &ppd->cpspec->cpregbase[regno]);
+}
+
+static inline u64 read_7322_creg_port(const struct qib_pportdata *ppd,
+				      u16 regno)
+{
+	if (!ppd->cpspec || !ppd->cpspec->cpregbase ||
+	    !(ppd->dd->flags & QIB_PRESENT))
+		return 0;
+	return readq(&ppd->cpspec->cpregbase[regno]);
+}
+
+static inline u32 read_7322_creg32_port(const struct qib_pportdata *ppd,
+					u16 regno)
+{
+	if (!ppd->cpspec || !ppd->cpspec->cpregbase ||
+	    !(ppd->dd->flags & QIB_PRESENT))
+		return 0;
+	return readl(&ppd->cpspec->cpregbase[regno]);
+}
+
+/* bits in Control register */
+#define QLOGIC_IB_C_RESET SYM_MASK(Control, SyncReset)
+#define QLOGIC_IB_C_SDMAFETCHPRIOEN SYM_MASK(Control, SDmaDescFetchPriorityEn)
+
+/* bits in general interrupt regs */
+#define QIB_I_RCVURG_LSB SYM_LSB(IntMask, RcvUrg0IntMask)
+#define QIB_I_RCVURG_RMASK MASK_ACROSS(0, 17)
+#define QIB_I_RCVURG_MASK (QIB_I_RCVURG_RMASK << QIB_I_RCVURG_LSB)
+#define QIB_I_RCVAVAIL_LSB SYM_LSB(IntMask, RcvAvail0IntMask)
+#define QIB_I_RCVAVAIL_RMASK MASK_ACROSS(0, 17)
+#define QIB_I_RCVAVAIL_MASK (QIB_I_RCVAVAIL_RMASK << QIB_I_RCVAVAIL_LSB)
+#define QIB_I_C_ERROR INT_MASK(Err)
+
+#define QIB_I_SPIOSENT (INT_MASK_P(SendDone, 0) | INT_MASK_P(SendDone, 1))
+#define QIB_I_SPIOBUFAVAIL INT_MASK(SendBufAvail)
+#define QIB_I_GPIO INT_MASK(AssertGPIO)
+#define QIB_I_P_SDMAINT(pidx) \
+	(INT_MASK_P(SDma, pidx) | INT_MASK_P(SDmaIdle, pidx) | \
+	 INT_MASK_P(SDmaProgress, pidx) | \
+	 INT_MASK_PM(SDmaCleanupDone, pidx))
+
+/* Interrupt bits that are "per port" */
+#define QIB_I_P_BITSEXTANT(pidx) \
+	(INT_MASK_P(Err, pidx) | INT_MASK_P(SendDone, pidx) | \
+	INT_MASK_P(SDma, pidx) | INT_MASK_P(SDmaIdle, pidx) | \
+	INT_MASK_P(SDmaProgress, pidx) | \
+	INT_MASK_PM(SDmaCleanupDone, pidx))
+
+/* Interrupt bits that are common to a device */
+/* currently unused: QIB_I_SPIOSENT */
+#define QIB_I_C_BITSEXTANT \
+	(QIB_I_RCVURG_MASK | QIB_I_RCVAVAIL_MASK | \
+	QIB_I_SPIOSENT | \
+	QIB_I_C_ERROR | QIB_I_SPIOBUFAVAIL | QIB_I_GPIO)
+
+#define QIB_I_BITSEXTANT (QIB_I_C_BITSEXTANT | \
+	QIB_I_P_BITSEXTANT(0) | QIB_I_P_BITSEXTANT(1))
+
+/*
+ * Error bits that are "per port".
+ */
+#define QIB_E_P_IBSTATUSCHANGED ERR_MASK_N(IBStatusChanged)
+#define QIB_E_P_SHDR ERR_MASK_N(SHeadersErr)
+#define QIB_E_P_VL15_BUF_MISUSE ERR_MASK_N(VL15BufMisuseErr)
+#define QIB_E_P_SND_BUF_MISUSE ERR_MASK_N(SendBufMisuseErr)
+#define QIB_E_P_SUNSUPVL ERR_MASK_N(SendUnsupportedVLErr)
+#define QIB_E_P_SUNEXP_PKTNUM ERR_MASK_N(SendUnexpectedPktNumErr)
+#define QIB_E_P_SDROP_DATA ERR_MASK_N(SendDroppedDataPktErr)
+#define QIB_E_P_SDROP_SMP ERR_MASK_N(SendDroppedSmpPktErr)
+#define QIB_E_P_SPKTLEN ERR_MASK_N(SendPktLenErr)
+#define QIB_E_P_SUNDERRUN ERR_MASK_N(SendUnderRunErr)
+#define QIB_E_P_SMAXPKTLEN ERR_MASK_N(SendMaxPktLenErr)
+#define QIB_E_P_SMINPKTLEN ERR_MASK_N(SendMinPktLenErr)
+#define QIB_E_P_RIBLOSTLINK ERR_MASK_N(RcvIBLostLinkErr)
+#define QIB_E_P_RHDR ERR_MASK_N(RcvHdrErr)
+#define QIB_E_P_RHDRLEN ERR_MASK_N(RcvHdrLenErr)
+#define QIB_E_P_RBADTID ERR_MASK_N(RcvBadTidErr)
+#define QIB_E_P_RBADVERSION ERR_MASK_N(RcvBadVersionErr)
+#define QIB_E_P_RIBFLOW ERR_MASK_N(RcvIBFlowErr)
+#define QIB_E_P_REBP ERR_MASK_N(RcvEBPErr)
+#define QIB_E_P_RUNSUPVL ERR_MASK_N(RcvUnsupportedVLErr)
+#define QIB_E_P_RUNEXPCHAR ERR_MASK_N(RcvUnexpectedCharErr)
+#define QIB_E_P_RSHORTPKTLEN ERR_MASK_N(RcvShortPktLenErr)
+#define QIB_E_P_RLONGPKTLEN ERR_MASK_N(RcvLongPktLenErr)
+#define QIB_E_P_RMAXPKTLEN ERR_MASK_N(RcvMaxPktLenErr)
+#define QIB_E_P_RMINPKTLEN ERR_MASK_N(RcvMinPktLenErr)
+#define QIB_E_P_RICRC ERR_MASK_N(RcvICRCErr)
+#define QIB_E_P_RVCRC ERR_MASK_N(RcvVCRCErr)
+#define QIB_E_P_RFORMATERR ERR_MASK_N(RcvFormatErr)
+
+#define QIB_E_P_SDMA1STDESC ERR_MASK_N(SDma1stDescErr)
+#define QIB_E_P_SDMABASE ERR_MASK_N(SDmaBaseErr)
+#define QIB_E_P_SDMADESCADDRMISALIGN ERR_MASK_N(SDmaDescAddrMisalignErr)
+#define QIB_E_P_SDMADWEN ERR_MASK_N(SDmaDwEnErr)
+#define QIB_E_P_SDMAGENMISMATCH ERR_MASK_N(SDmaGenMismatchErr)
+#define QIB_E_P_SDMAHALT ERR_MASK_N(SDmaHaltErr)
+#define QIB_E_P_SDMAMISSINGDW ERR_MASK_N(SDmaMissingDwErr)
+#define QIB_E_P_SDMAOUTOFBOUND ERR_MASK_N(SDmaOutOfBoundErr)
+#define QIB_E_P_SDMARPYTAG ERR_MASK_N(SDmaRpyTagErr)
+#define QIB_E_P_SDMATAILOUTOFBOUND ERR_MASK_N(SDmaTailOutOfBoundErr)
+#define QIB_E_P_SDMAUNEXPDATA ERR_MASK_N(SDmaUnexpDataErr)
+
+/* Error bits that are common to a device */
+#define QIB_E_RESET ERR_MASK(ResetNegated)
+#define QIB_E_HARDWARE ERR_MASK(HardwareErr)
+#define QIB_E_INVALIDADDR ERR_MASK(InvalidAddrErr)
+
+
+/*
+ * Per chip (rather than per-port) errors.  Most either do
+ * nothing but trigger a print (because they self-recover, or
+ * always occur in tandem with other errors that handle the
+ * issue), or because they indicate errors with no recovery,
+ * but we want to know that they happened.
+ */
+#define QIB_E_SBUF_VL15_MISUSE ERR_MASK(SBufVL15MisUseErr)
+#define QIB_E_BADEEP ERR_MASK(InvalidEEPCmd)
+#define QIB_E_VLMISMATCH ERR_MASK(SendVLMismatchErr)
+#define QIB_E_ARMLAUNCH ERR_MASK(SendArmLaunchErr)
+#define QIB_E_SPCLTRIG ERR_MASK(SendSpecialTriggerErr)
+#define QIB_E_RRCVHDRFULL ERR_MASK(RcvHdrFullErr)
+#define QIB_E_RRCVEGRFULL ERR_MASK(RcvEgrFullErr)
+#define QIB_E_RCVCTXTSHARE ERR_MASK(RcvContextShareErr)
+
+/* SDMA chip errors (not per port)
+ * QIB_E_SDMA_BUF_DUP needs no special handling, because we will also get
+ * the SDMAHALT error immediately, so we just print the dup error via the
+ * E_AUTO mechanism.  This is true of most of the per-port fatal errors
+ * as well, but since this is port-independent, by definition, it's
+ * handled a bit differently.  SDMA_VL15 and SDMA_WRONG_PORT are per
+ * packet send errors, and so are handled in the same manner as other
+ * per-packet errors.
+ */
+#define QIB_E_SDMA_VL15 ERR_MASK(SDmaVL15Err)
+#define QIB_E_SDMA_WRONG_PORT ERR_MASK(SDmaWrongPortErr)
+#define QIB_E_SDMA_BUF_DUP ERR_MASK(SDmaBufMaskDuplicateErr)
+
+/*
+ * Below functionally equivalent to legacy QLOGIC_IB_E_PKTERRS
+ * it is used to print "common" packet errors.
+ */
+#define QIB_E_P_PKTERRS (QIB_E_P_SPKTLEN |\
+	QIB_E_P_SDROP_DATA | QIB_E_P_RVCRC |\
+	QIB_E_P_RICRC | QIB_E_P_RSHORTPKTLEN |\
+	QIB_E_P_VL15_BUF_MISUSE | QIB_E_P_SHDR | \
+	QIB_E_P_REBP)
+
+/* Error Bits that Packet-related (Receive, per-port) */
+#define QIB_E_P_RPKTERRS (\
+	QIB_E_P_RHDRLEN | QIB_E_P_RBADTID | \
+	QIB_E_P_RBADVERSION | QIB_E_P_RHDR | \
+	QIB_E_P_RLONGPKTLEN | QIB_E_P_RSHORTPKTLEN |\
+	QIB_E_P_RMAXPKTLEN | QIB_E_P_RMINPKTLEN | \
+	QIB_E_P_RFORMATERR | QIB_E_P_RUNSUPVL | \
+	QIB_E_P_RUNEXPCHAR | QIB_E_P_RIBFLOW | QIB_E_P_REBP)
+
+/*
+ * Error bits that are Send-related (per port)
+ * (ARMLAUNCH excluded from E_SPKTERRS because it gets special handling).
+ * All of these potentially need to have a buffer disarmed
+ */
+#define QIB_E_P_SPKTERRS (\
+	QIB_E_P_SUNEXP_PKTNUM |\
+	QIB_E_P_SDROP_DATA | QIB_E_P_SDROP_SMP |\
+	QIB_E_P_SMAXPKTLEN |\
+	QIB_E_P_VL15_BUF_MISUSE | QIB_E_P_SHDR | \
+	QIB_E_P_SMINPKTLEN | QIB_E_P_SPKTLEN | \
+	QIB_E_P_SND_BUF_MISUSE | QIB_E_P_SUNSUPVL)
+
+#define QIB_E_SPKTERRS ( \
+		QIB_E_SBUF_VL15_MISUSE | QIB_E_VLMISMATCH | \
+		ERR_MASK_N(SendUnsupportedVLErr) |			\
+		QIB_E_SPCLTRIG | QIB_E_SDMA_VL15 | QIB_E_SDMA_WRONG_PORT)
+
+#define QIB_E_P_SDMAERRS ( \
+	QIB_E_P_SDMAHALT | \
+	QIB_E_P_SDMADESCADDRMISALIGN | \
+	QIB_E_P_SDMAUNEXPDATA | \
+	QIB_E_P_SDMAMISSINGDW | \
+	QIB_E_P_SDMADWEN | \
+	QIB_E_P_SDMARPYTAG | \
+	QIB_E_P_SDMA1STDESC | \
+	QIB_E_P_SDMABASE | \
+	QIB_E_P_SDMATAILOUTOFBOUND | \
+	QIB_E_P_SDMAOUTOFBOUND | \
+	QIB_E_P_SDMAGENMISMATCH)
+
+/*
+ * This sets some bits more than once, but makes it more obvious which
+ * bits are not handled under other categories, and the repeat definition
+ * is not a problem.
+ */
+#define QIB_E_P_BITSEXTANT ( \
+	QIB_E_P_SPKTERRS | QIB_E_P_PKTERRS | QIB_E_P_RPKTERRS | \
+	QIB_E_P_RIBLOSTLINK | QIB_E_P_IBSTATUSCHANGED | \
+	QIB_E_P_SND_BUF_MISUSE | QIB_E_P_SUNDERRUN | \
+	QIB_E_P_SHDR | QIB_E_P_VL15_BUF_MISUSE | QIB_E_P_SDMAERRS \
+	)
+
+/*
+ * These are errors that can occur when the link
+ * changes state while a packet is being sent or received.  This doesn't
+ * cover things like EBP or VCRC that can be the result of a sending
+ * having the link change state, so we receive a "known bad" packet.
+ * All of these are "per port", so renamed:
+ */
+#define QIB_E_P_LINK_PKTERRS (\
+	QIB_E_P_SDROP_DATA | QIB_E_P_SDROP_SMP |\
+	QIB_E_P_SMINPKTLEN | QIB_E_P_SPKTLEN |\
+	QIB_E_P_RSHORTPKTLEN | QIB_E_P_RMINPKTLEN |\
+	QIB_E_P_RUNEXPCHAR)
+
+/*
+ * This sets some bits more than once, but makes it more obvious which
+ * bits are not handled under other categories (such as QIB_E_SPKTERRS),
+ * and the repeat definition is not a problem.
+ */
+#define QIB_E_C_BITSEXTANT (\
+	QIB_E_HARDWARE | QIB_E_INVALIDADDR | QIB_E_BADEEP |\
+	QIB_E_ARMLAUNCH | QIB_E_VLMISMATCH | QIB_E_RRCVHDRFULL |\
+	QIB_E_RRCVEGRFULL | QIB_E_RESET | QIB_E_SBUF_VL15_MISUSE)
+
+/* Likewise Neuter E_SPKT_ERRS_IGNORE */
+#define E_SPKT_ERRS_IGNORE 0
+
+#define QIB_EXTS_MEMBIST_DISABLED \
+	SYM_MASK(EXTStatus, MemBISTDisabled)
+#define QIB_EXTS_MEMBIST_ENDTEST \
+	SYM_MASK(EXTStatus, MemBISTEndTest)
+
+#define QIB_E_SPIOARMLAUNCH \
+	ERR_MASK(SendArmLaunchErr)
+
+#define IBA7322_IBCC_LINKINITCMD_MASK SYM_RMASK(IBCCtrlA_0, LinkInitCmd)
+#define IBA7322_IBCC_LINKCMD_SHIFT SYM_LSB(IBCCtrlA_0, LinkCmd)
+
+/*
+ * IBTA_1_2 is set when multiple speeds are enabled (normal),
+ * and also if forced QDR (only QDR enabled).  It's enabled for the
+ * forced QDR case so that scrambling will be enabled by the TS3
+ * exchange, when supported by both sides of the link.
+ */
+#define IBA7322_IBC_IBTA_1_2_MASK SYM_MASK(IBCCtrlB_0, IB_ENHANCED_MODE)
+#define IBA7322_IBC_MAX_SPEED_MASK SYM_MASK(IBCCtrlB_0, SD_SPEED)
+#define IBA7322_IBC_SPEED_QDR SYM_MASK(IBCCtrlB_0, SD_SPEED_QDR)
+#define IBA7322_IBC_SPEED_DDR SYM_MASK(IBCCtrlB_0, SD_SPEED_DDR)
+#define IBA7322_IBC_SPEED_SDR SYM_MASK(IBCCtrlB_0, SD_SPEED_SDR)
+#define IBA7322_IBC_SPEED_MASK (SYM_MASK(IBCCtrlB_0, SD_SPEED_SDR) | \
+	SYM_MASK(IBCCtrlB_0, SD_SPEED_DDR) | SYM_MASK(IBCCtrlB_0, SD_SPEED_QDR))
+#define IBA7322_IBC_SPEED_LSB SYM_LSB(IBCCtrlB_0, SD_SPEED_SDR)
+
+#define IBA7322_LEDBLINK_OFF_SHIFT SYM_LSB(RcvPktLEDCnt_0, OFFperiod)
+#define IBA7322_LEDBLINK_ON_SHIFT SYM_LSB(RcvPktLEDCnt_0, ONperiod)
+
+#define IBA7322_IBC_WIDTH_AUTONEG SYM_MASK(IBCCtrlB_0, IB_NUM_CHANNELS)
+#define IBA7322_IBC_WIDTH_4X_ONLY (1<<SYM_LSB(IBCCtrlB_0, IB_NUM_CHANNELS))
+#define IBA7322_IBC_WIDTH_1X_ONLY (0<<SYM_LSB(IBCCtrlB_0, IB_NUM_CHANNELS))
+
+#define IBA7322_IBC_RXPOL_MASK SYM_MASK(IBCCtrlB_0, IB_POLARITY_REV_SUPP)
+#define IBA7322_IBC_RXPOL_LSB SYM_LSB(IBCCtrlB_0, IB_POLARITY_REV_SUPP)
+#define IBA7322_IBC_HRTBT_MASK (SYM_MASK(IBCCtrlB_0, HRTBT_AUTO) | \
+	SYM_MASK(IBCCtrlB_0, HRTBT_ENB))
+#define IBA7322_IBC_HRTBT_RMASK (IBA7322_IBC_HRTBT_MASK >> \
+	SYM_LSB(IBCCtrlB_0, HRTBT_ENB))
+#define IBA7322_IBC_HRTBT_LSB SYM_LSB(IBCCtrlB_0, HRTBT_ENB)
+
+#define IBA7322_REDIRECT_VEC_PER_REG 12
+
+#define IBA7322_SENDCHK_PKEY SYM_MASK(SendCheckControl_0, PKey_En)
+#define IBA7322_SENDCHK_BTHQP SYM_MASK(SendCheckControl_0, BTHQP_En)
+#define IBA7322_SENDCHK_SLID SYM_MASK(SendCheckControl_0, SLID_En)
+#define IBA7322_SENDCHK_RAW_IPV6 SYM_MASK(SendCheckControl_0, RawIPV6_En)
+#define IBA7322_SENDCHK_MINSZ SYM_MASK(SendCheckControl_0, PacketTooSmall_En)
+
+#define AUTONEG_TRIES 3 /* sequential retries to negotiate DDR */
+
+#define HWE_AUTO(fldname) { .mask = SYM_MASK(HwErrMask, fldname##Mask), \
+	.msg = #fldname , .sz = sizeof(#fldname) }
+#define HWE_AUTO_P(fldname, port) { .mask = SYM_MASK(HwErrMask, \
+	fldname##Mask##_##port), .msg = #fldname , .sz = sizeof(#fldname) }
+static const struct qib_hwerror_msgs qib_7322_hwerror_msgs[] = {
+	HWE_AUTO_P(IBSerdesPClkNotDetect, 1),
+	HWE_AUTO_P(IBSerdesPClkNotDetect, 0),
+	HWE_AUTO(PCIESerdesPClkNotDetect),
+	HWE_AUTO(PowerOnBISTFailed),
+	HWE_AUTO(TempsenseTholdReached),
+	HWE_AUTO(MemoryErr),
+	HWE_AUTO(PCIeBusParityErr),
+	HWE_AUTO(PcieCplTimeout),
+	HWE_AUTO(PciePoisonedTLP),
+	HWE_AUTO_P(SDmaMemReadErr, 1),
+	HWE_AUTO_P(SDmaMemReadErr, 0),
+	HWE_AUTO_P(IBCBusFromSPCParityErr, 1),
+	HWE_AUTO_P(IBCBusToSPCParityErr, 1),
+	HWE_AUTO_P(IBCBusFromSPCParityErr, 0),
+	HWE_AUTO(statusValidNoEop),
+	HWE_AUTO(LATriggered),
+	{ .mask = 0, .sz = 0 }
+};
+
+#define E_AUTO(fldname) { .mask = SYM_MASK(ErrMask, fldname##Mask), \
+	.msg = #fldname, .sz = sizeof(#fldname) }
+#define E_P_AUTO(fldname) { .mask = SYM_MASK(ErrMask_0, fldname##Mask), \
+	.msg = #fldname, .sz = sizeof(#fldname) }
+static const struct qib_hwerror_msgs qib_7322error_msgs[] = {
+	E_AUTO(RcvEgrFullErr),
+	E_AUTO(RcvHdrFullErr),
+	E_AUTO(ResetNegated),
+	E_AUTO(HardwareErr),
+	E_AUTO(InvalidAddrErr),
+	E_AUTO(SDmaVL15Err),
+	E_AUTO(SBufVL15MisUseErr),
+	E_AUTO(InvalidEEPCmd),
+	E_AUTO(RcvContextShareErr),
+	E_AUTO(SendVLMismatchErr),
+	E_AUTO(SendArmLaunchErr),
+	E_AUTO(SendSpecialTriggerErr),
+	E_AUTO(SDmaWrongPortErr),
+	E_AUTO(SDmaBufMaskDuplicateErr),
+	{ .mask = 0, .sz = 0 }
+};
+
+static const struct  qib_hwerror_msgs qib_7322p_error_msgs[] = {
+	E_P_AUTO(IBStatusChanged),
+	E_P_AUTO(SHeadersErr),
+	E_P_AUTO(VL15BufMisuseErr),
+	/*
+	 * SDmaHaltErr is not really an error, make it clearer;
+	 */
+	{.mask = SYM_MASK(ErrMask_0, SDmaHaltErrMask), .msg = "SDmaHalted",
+		.sz = 11},
+	E_P_AUTO(SDmaDescAddrMisalignErr),
+	E_P_AUTO(SDmaUnexpDataErr),
+	E_P_AUTO(SDmaMissingDwErr),
+	E_P_AUTO(SDmaDwEnErr),
+	E_P_AUTO(SDmaRpyTagErr),
+	E_P_AUTO(SDma1stDescErr),
+	E_P_AUTO(SDmaBaseErr),
+	E_P_AUTO(SDmaTailOutOfBoundErr),
+	E_P_AUTO(SDmaOutOfBoundErr),
+	E_P_AUTO(SDmaGenMismatchErr),
+	E_P_AUTO(SendBufMisuseErr),
+	E_P_AUTO(SendUnsupportedVLErr),
+	E_P_AUTO(SendUnexpectedPktNumErr),
+	E_P_AUTO(SendDroppedDataPktErr),
+	E_P_AUTO(SendDroppedSmpPktErr),
+	E_P_AUTO(SendPktLenErr),
+	E_P_AUTO(SendUnderRunErr),
+	E_P_AUTO(SendMaxPktLenErr),
+	E_P_AUTO(SendMinPktLenErr),
+	E_P_AUTO(RcvIBLostLinkErr),
+	E_P_AUTO(RcvHdrErr),
+	E_P_AUTO(RcvHdrLenErr),
+	E_P_AUTO(RcvBadTidErr),
+	E_P_AUTO(RcvBadVersionErr),
+	E_P_AUTO(RcvIBFlowErr),
+	E_P_AUTO(RcvEBPErr),
+	E_P_AUTO(RcvUnsupportedVLErr),
+	E_P_AUTO(RcvUnexpectedCharErr),
+	E_P_AUTO(RcvShortPktLenErr),
+	E_P_AUTO(RcvLongPktLenErr),
+	E_P_AUTO(RcvMaxPktLenErr),
+	E_P_AUTO(RcvMinPktLenErr),
+	E_P_AUTO(RcvICRCErr),
+	E_P_AUTO(RcvVCRCErr),
+	E_P_AUTO(RcvFormatErr),
+	{ .mask = 0, .sz = 0 }
+};
+
+/*
+ * Below generates "auto-message" for interrupts not specific to any port or
+ * context
+ */
+#define INTR_AUTO(fldname) { .mask = SYM_MASK(IntMask, fldname##Mask), \
+	.msg = #fldname, .sz = sizeof(#fldname) }
+/* Below generates "auto-message" for interrupts specific to a port */
+#define INTR_AUTO_P(fldname) { .mask = MASK_ACROSS(\
+	SYM_LSB(IntMask, fldname##Mask##_0), \
+	SYM_LSB(IntMask, fldname##Mask##_1)), \
+	.msg = #fldname "_P", .sz = sizeof(#fldname "_P") }
+/* For some reason, the SerDesTrimDone bits are reversed */
+#define INTR_AUTO_PI(fldname) { .mask = MASK_ACROSS(\
+	SYM_LSB(IntMask, fldname##Mask##_1), \
+	SYM_LSB(IntMask, fldname##Mask##_0)), \
+	.msg = #fldname "_P", .sz = sizeof(#fldname "_P") }
+/*
+ * Below generates "auto-message" for interrupts specific to a context,
+ * with ctxt-number appended
+ */
+#define INTR_AUTO_C(fldname) { .mask = MASK_ACROSS(\
+	SYM_LSB(IntMask, fldname##0IntMask), \
+	SYM_LSB(IntMask, fldname##17IntMask)), \
+	.msg = #fldname "_C", .sz = sizeof(#fldname "_C") }
+
+static const struct  qib_hwerror_msgs qib_7322_intr_msgs[] = {
+	INTR_AUTO_P(SDmaInt),
+	INTR_AUTO_P(SDmaProgressInt),
+	INTR_AUTO_P(SDmaIdleInt),
+	INTR_AUTO_P(SDmaCleanupDone),
+	INTR_AUTO_C(RcvUrg),
+	INTR_AUTO_P(ErrInt),
+	INTR_AUTO(ErrInt),      /* non-port-specific errs */
+	INTR_AUTO(AssertGPIOInt),
+	INTR_AUTO_P(SendDoneInt),
+	INTR_AUTO(SendBufAvailInt),
+	INTR_AUTO_C(RcvAvail),
+	{ .mask = 0, .sz = 0 }
+};
+
+#define TXSYMPTOM_AUTO_P(fldname) \
+	{ .mask = SYM_MASK(SendHdrErrSymptom_0, fldname), \
+	.msg = #fldname, .sz = sizeof(#fldname) }
+static const struct  qib_hwerror_msgs hdrchk_msgs[] = {
+	TXSYMPTOM_AUTO_P(NonKeyPacket),
+	TXSYMPTOM_AUTO_P(GRHFail),
+	TXSYMPTOM_AUTO_P(PkeyFail),
+	TXSYMPTOM_AUTO_P(QPFail),
+	TXSYMPTOM_AUTO_P(SLIDFail),
+	TXSYMPTOM_AUTO_P(RawIPV6),
+	TXSYMPTOM_AUTO_P(PacketTooSmall),
+	{ .mask = 0, .sz = 0 }
+};
+
+#define IBA7322_HDRHEAD_PKTINT_SHIFT 32 /* interrupt cnt in upper 32 bits */
+
+/*
+ * Called when we might have an error that is specific to a particular
+ * PIO buffer, and may need to cancel that buffer, so it can be re-used,
+ * because we don't need to force the update of pioavail
+ */
+static void qib_disarm_7322_senderrbufs(struct qib_pportdata *ppd)
+{
+	struct qib_devdata *dd = ppd->dd;
+	u32 i;
+	int any;
+	u32 piobcnt = dd->piobcnt2k + dd->piobcnt4k + NUM_VL15_BUFS;
+	u32 regcnt = (piobcnt + BITS_PER_LONG - 1) / BITS_PER_LONG;
+	unsigned long sbuf[4];
+
+	/*
+	 * It's possible that sendbuffererror could have bits set; might
+	 * have already done this as a result of hardware error handling.
+	 */
+	any = 0;
+	for (i = 0; i < regcnt; ++i) {
+		sbuf[i] = qib_read_kreg64(dd, kr_sendbuffererror + i);
+		if (sbuf[i]) {
+			any = 1;
+			qib_write_kreg(dd, kr_sendbuffererror + i, sbuf[i]);
+		}
+	}
+
+	if (any)
+		qib_disarm_piobufs_set(dd, sbuf, piobcnt);
+}
+
+/* No txe_recover yet, if ever */
+
+/* No decode__errors yet */
+static void err_decode(char *msg, size_t len, u64 errs,
+		       const struct qib_hwerror_msgs *msp)
+{
+	u64 these, lmask;
+	int took, multi, n = 0;
+
+	while (errs && msp && msp->mask) {
+		multi = (msp->mask & (msp->mask - 1));
+		while (errs & msp->mask) {
+			these = (errs & msp->mask);
+			lmask = (these & (these - 1)) ^ these;
+			if (len) {
+				if (n++) {
+					/* separate the strings */
+					*msg++ = ',';
+					len--;
+				}
+				BUG_ON(!msp->sz);
+				/* msp->sz counts the nul */
+				took = min_t(size_t, msp->sz - (size_t)1, len);
+				memcpy(msg,  msp->msg, took);
+				len -= took;
+				msg += took;
+				if (len)
+					*msg = '\0';
+			}
+			errs &= ~lmask;
+			if (len && multi) {
+				/* More than one bit this mask */
+				int idx = -1;
+
+				while (lmask & msp->mask) {
+					++idx;
+					lmask >>= 1;
+				}
+				took = scnprintf(msg, len, "_%d", idx);
+				len -= took;
+				msg += took;
+			}
+		}
+		++msp;
+	}
+	/* If some bits are left, show in hex. */
+	if (len && errs)
+		snprintf(msg, len, "%sMORE:%llX", n ? "," : "",
+			(unsigned long long) errs);
+}
+
+/* only called if r1 set */
+static void flush_fifo(struct qib_pportdata *ppd)
+{
+	struct qib_devdata *dd = ppd->dd;
+	u32 __iomem *piobuf;
+	u32 bufn;
+	u32 *hdr;
+	u64 pbc;
+	const unsigned hdrwords = 7;
+	static struct qib_ib_header ibhdr = {
+		.lrh[0] = cpu_to_be16(0xF000 | QIB_LRH_BTH),
+		.lrh[1] = IB_LID_PERMISSIVE,
+		.lrh[2] = cpu_to_be16(hdrwords + SIZE_OF_CRC),
+		.lrh[3] = IB_LID_PERMISSIVE,
+		.u.oth.bth[0] = cpu_to_be32(
+			(IB_OPCODE_UD_SEND_ONLY << 24) | QIB_DEFAULT_P_KEY),
+		.u.oth.bth[1] = cpu_to_be32(0),
+		.u.oth.bth[2] = cpu_to_be32(0),
+		.u.oth.u.ud.deth[0] = cpu_to_be32(0),
+		.u.oth.u.ud.deth[1] = cpu_to_be32(0),
+	};
+
+	/*
+	 * Send a dummy VL15 packet to flush the launch FIFO.
+	 * This will not actually be sent since the TxeBypassIbc bit is set.
+	 */
+	pbc = PBC_7322_VL15_SEND |
+		(((u64)ppd->hw_pidx) << (PBC_PORT_SEL_LSB + 32)) |
+		(hdrwords + SIZE_OF_CRC);
+	piobuf = qib_7322_getsendbuf(ppd, pbc, &bufn);
+	if (!piobuf)
+		return;
+	writeq(pbc, piobuf);
+	hdr = (u32 *) &ibhdr;
+	if (dd->flags & QIB_PIO_FLUSH_WC) {
+		qib_flush_wc();
+		qib_pio_copy(piobuf + 2, hdr, hdrwords - 1);
+		qib_flush_wc();
+		__raw_writel(hdr[hdrwords - 1], piobuf + hdrwords + 1);
+		qib_flush_wc();
+	} else
+		qib_pio_copy(piobuf + 2, hdr, hdrwords);
+	qib_sendbuf_done(dd, bufn);
+}
+
+/*
+ * This is called with interrupts disabled and sdma_lock held.
+ */
+static void qib_7322_sdma_sendctrl(struct qib_pportdata *ppd, unsigned op)
+{
+	struct qib_devdata *dd = ppd->dd;
+	u64 set_sendctrl = 0;
+	u64 clr_sendctrl = 0;
+
+	if (op & QIB_SDMA_SENDCTRL_OP_ENABLE)
+		set_sendctrl |= SYM_MASK(SendCtrl_0, SDmaEnable);
+	else
+		clr_sendctrl |= SYM_MASK(SendCtrl_0, SDmaEnable);
+
+	if (op & QIB_SDMA_SENDCTRL_OP_INTENABLE)
+		set_sendctrl |= SYM_MASK(SendCtrl_0, SDmaIntEnable);
+	else
+		clr_sendctrl |= SYM_MASK(SendCtrl_0, SDmaIntEnable);
+
+	if (op & QIB_SDMA_SENDCTRL_OP_HALT)
+		set_sendctrl |= SYM_MASK(SendCtrl_0, SDmaHalt);
+	else
+		clr_sendctrl |= SYM_MASK(SendCtrl_0, SDmaHalt);
+
+	if (op & QIB_SDMA_SENDCTRL_OP_DRAIN)
+		set_sendctrl |= SYM_MASK(SendCtrl_0, TxeBypassIbc) |
+				SYM_MASK(SendCtrl_0, TxeAbortIbc) |
+				SYM_MASK(SendCtrl_0, TxeDrainRmFifo);
+	else
+		clr_sendctrl |= SYM_MASK(SendCtrl_0, TxeBypassIbc) |
+				SYM_MASK(SendCtrl_0, TxeAbortIbc) |
+				SYM_MASK(SendCtrl_0, TxeDrainRmFifo);
+
+	spin_lock(&dd->sendctrl_lock);
+
+	/* If we are draining everything, block sends first */
+	if (op & QIB_SDMA_SENDCTRL_OP_DRAIN) {
+		ppd->p_sendctrl &= ~SYM_MASK(SendCtrl_0, SendEnable);
+		qib_write_kreg_port(ppd, krp_sendctrl, ppd->p_sendctrl);
+		qib_write_kreg(dd, kr_scratch, 0);
+	}
+
+	ppd->p_sendctrl |= set_sendctrl;
+	ppd->p_sendctrl &= ~clr_sendctrl;
+
+	if (op & QIB_SDMA_SENDCTRL_OP_CLEANUP)
+		qib_write_kreg_port(ppd, krp_sendctrl,
+				    ppd->p_sendctrl |
+				    SYM_MASK(SendCtrl_0, SDmaCleanup));
+	else
+		qib_write_kreg_port(ppd, krp_sendctrl, ppd->p_sendctrl);
+	qib_write_kreg(dd, kr_scratch, 0);
+
+	if (op & QIB_SDMA_SENDCTRL_OP_DRAIN) {
+		ppd->p_sendctrl |= SYM_MASK(SendCtrl_0, SendEnable);
+		qib_write_kreg_port(ppd, krp_sendctrl, ppd->p_sendctrl);
+		qib_write_kreg(dd, kr_scratch, 0);
+	}
+
+	spin_unlock(&dd->sendctrl_lock);
+
+	if ((op & QIB_SDMA_SENDCTRL_OP_DRAIN) && ppd->dd->cspec->r1)
+		flush_fifo(ppd);
+}
+
+static void qib_7322_sdma_hw_clean_up(struct qib_pportdata *ppd)
+{
+	__qib_sdma_process_event(ppd, qib_sdma_event_e50_hw_cleaned);
+}
+
+static void qib_sdma_7322_setlengen(struct qib_pportdata *ppd)
+{
+	/*
+	 * Set SendDmaLenGen and clear and set
+	 * the MSB of the generation count to enable generation checking
+	 * and load the internal generation counter.
+	 */
+	qib_write_kreg_port(ppd, krp_senddmalengen, ppd->sdma_descq_cnt);
+	qib_write_kreg_port(ppd, krp_senddmalengen,
+			    ppd->sdma_descq_cnt |
+			    (1ULL << QIB_7322_SendDmaLenGen_0_Generation_MSB));
+}
+
+/*
+ * Must be called with sdma_lock held, or before init finished.
+ */
+static void qib_sdma_update_7322_tail(struct qib_pportdata *ppd, u16 tail)
+{
+	/* Commit writes to memory and advance the tail on the chip */
+	wmb();
+	ppd->sdma_descq_tail = tail;
+	qib_write_kreg_port(ppd, krp_senddmatail, tail);
+}
+
+/*
+ * This is called with interrupts disabled and sdma_lock held.
+ */
+static void qib_7322_sdma_hw_start_up(struct qib_pportdata *ppd)
+{
+	/*
+	 * Drain all FIFOs.
+	 * The hardware doesn't require this but we do it so that verbs
+	 * and user applications don't wait for link active to send stale
+	 * data.
+	 */
+	sendctrl_7322_mod(ppd, QIB_SENDCTRL_FLUSH);
+
+	qib_sdma_7322_setlengen(ppd);
+	qib_sdma_update_7322_tail(ppd, 0); /* Set SendDmaTail */
+	ppd->sdma_head_dma[0] = 0;
+	qib_7322_sdma_sendctrl(ppd,
+		ppd->sdma_state.current_op | QIB_SDMA_SENDCTRL_OP_CLEANUP);
+}
+
+#define DISABLES_SDMA ( \
+	QIB_E_P_SDMAHALT | \
+	QIB_E_P_SDMADESCADDRMISALIGN | \
+	QIB_E_P_SDMAMISSINGDW | \
+	QIB_E_P_SDMADWEN | \
+	QIB_E_P_SDMARPYTAG | \
+	QIB_E_P_SDMA1STDESC | \
+	QIB_E_P_SDMABASE | \
+	QIB_E_P_SDMATAILOUTOFBOUND | \
+	QIB_E_P_SDMAOUTOFBOUND | \
+	QIB_E_P_SDMAGENMISMATCH)
+
+static void sdma_7322_p_errors(struct qib_pportdata *ppd, u64 errs)
+{
+	unsigned long flags;
+	struct qib_devdata *dd = ppd->dd;
+
+	errs &= QIB_E_P_SDMAERRS;
+	err_decode(ppd->cpspec->sdmamsgbuf, sizeof(ppd->cpspec->sdmamsgbuf),
+		   errs, qib_7322p_error_msgs);
+
+	if (errs & QIB_E_P_SDMAUNEXPDATA)
+		qib_dev_err(dd, "IB%u:%u SDmaUnexpData\n", dd->unit,
+			    ppd->port);
+
+	spin_lock_irqsave(&ppd->sdma_lock, flags);
+
+	if (errs != QIB_E_P_SDMAHALT) {
+		/* SDMA errors have QIB_E_P_SDMAHALT and another bit set */
+		qib_dev_porterr(dd, ppd->port,
+			"SDMA %s 0x%016llx %s\n",
+			qib_sdma_state_names[ppd->sdma_state.current_state],
+			errs, ppd->cpspec->sdmamsgbuf);
+		dump_sdma_7322_state(ppd);
+	}
+
+	switch (ppd->sdma_state.current_state) {
+	case qib_sdma_state_s00_hw_down:
+		break;
+
+	case qib_sdma_state_s10_hw_start_up_wait:
+		if (errs & QIB_E_P_SDMAHALT)
+			__qib_sdma_process_event(ppd,
+				qib_sdma_event_e20_hw_started);
+		break;
+
+	case qib_sdma_state_s20_idle:
+		break;
+
+	case qib_sdma_state_s30_sw_clean_up_wait:
+		break;
+
+	case qib_sdma_state_s40_hw_clean_up_wait:
+		if (errs & QIB_E_P_SDMAHALT)
+			__qib_sdma_process_event(ppd,
+				qib_sdma_event_e50_hw_cleaned);
+		break;
+
+	case qib_sdma_state_s50_hw_halt_wait:
+		if (errs & QIB_E_P_SDMAHALT)
+			__qib_sdma_process_event(ppd,
+				qib_sdma_event_e60_hw_halted);
+		break;
+
+	case qib_sdma_state_s99_running:
+		__qib_sdma_process_event(ppd, qib_sdma_event_e7322_err_halted);
+		__qib_sdma_process_event(ppd, qib_sdma_event_e60_hw_halted);
+		break;
+	}
+
+	spin_unlock_irqrestore(&ppd->sdma_lock, flags);
+}
+
+/*
+ * handle per-device errors (not per-port errors)
+ */
+static noinline void handle_7322_errors(struct qib_devdata *dd)
+{
+	char *msg;
+	u64 iserr = 0;
+	u64 errs;
+	u64 mask;
+	int log_idx;
+
+	qib_stats.sps_errints++;
+	errs = qib_read_kreg64(dd, kr_errstatus);
+	if (!errs) {
+		qib_devinfo(dd->pcidev,
+			"device error interrupt, but no error bits set!\n");
+		goto done;
+	}
+
+	/* don't report errors that are masked */
+	errs &= dd->cspec->errormask;
+	msg = dd->cspec->emsgbuf;
+
+	/* do these first, they are most important */
+	if (errs & QIB_E_HARDWARE) {
+		*msg = '\0';
+		qib_7322_handle_hwerrors(dd, msg, sizeof dd->cspec->emsgbuf);
+	} else
+		for (log_idx = 0; log_idx < QIB_EEP_LOG_CNT; ++log_idx)
+			if (errs & dd->eep_st_masks[log_idx].errs_to_log)
+				qib_inc_eeprom_err(dd, log_idx, 1);
+
+	if (errs & QIB_E_SPKTERRS) {
+		qib_disarm_7322_senderrbufs(dd->pport);
+		qib_stats.sps_txerrs++;
+	} else if (errs & QIB_E_INVALIDADDR)
+		qib_stats.sps_txerrs++;
+	else if (errs & QIB_E_ARMLAUNCH) {
+		qib_stats.sps_txerrs++;
+		qib_disarm_7322_senderrbufs(dd->pport);
+	}
+	qib_write_kreg(dd, kr_errclear, errs);
+
+	/*
+	 * The ones we mask off are handled specially below
+	 * or above.  Also mask SDMADISABLED by default as it
+	 * is too chatty.
+	 */
+	mask = QIB_E_HARDWARE;
+	*msg = '\0';
+
+	err_decode(msg, sizeof dd->cspec->emsgbuf, errs & ~mask,
+		   qib_7322error_msgs);
+
+	/*
+	 * Getting reset is a tragedy for all ports. Mark the device
+	 * _and_ the ports as "offline" in way meaningful to each.
+	 */
+	if (errs & QIB_E_RESET) {
+		int pidx;
+
+		qib_dev_err(dd,
+			"Got reset, requires re-init (unload and reload driver)\n");
+		dd->flags &= ~QIB_INITTED;  /* needs re-init */
+		/* mark as having had error */
+		*dd->devstatusp |= QIB_STATUS_HWERROR;
+		for (pidx = 0; pidx < dd->num_pports; ++pidx)
+			if (dd->pport[pidx].link_speed_supported)
+				*dd->pport[pidx].statusp &= ~QIB_STATUS_IB_CONF;
+	}
+
+	if (*msg && iserr)
+		qib_dev_err(dd, "%s error\n", msg);
+
+	/*
+	 * If there were hdrq or egrfull errors, wake up any processes
+	 * waiting in poll.  We used to try to check which contexts had
+	 * the overflow, but given the cost of that and the chip reads
+	 * to support it, it's better to just wake everybody up if we
+	 * get an overflow; waiters can poll again if it's not them.
+	 */
+	if (errs & (ERR_MASK(RcvEgrFullErr) | ERR_MASK(RcvHdrFullErr))) {
+		qib_handle_urcv(dd, ~0U);
+		if (errs & ERR_MASK(RcvEgrFullErr))
+			qib_stats.sps_buffull++;
+		else
+			qib_stats.sps_hdrfull++;
+	}
+
+done:
+	return;
+}
+
+static void qib_error_tasklet(unsigned long data)
+{
+	struct qib_devdata *dd = (struct qib_devdata *)data;
+
+	handle_7322_errors(dd);
+	qib_write_kreg(dd, kr_errmask, dd->cspec->errormask);
+}
+
+static void reenable_chase(unsigned long opaque)
+{
+	struct qib_pportdata *ppd = (struct qib_pportdata *)opaque;
+
+	ppd->cpspec->chase_timer.expires = 0;
+	qib_set_ib_7322_lstate(ppd, QLOGIC_IB_IBCC_LINKCMD_DOWN,
+		QLOGIC_IB_IBCC_LINKINITCMD_POLL);
+}
+
+static void disable_chase(struct qib_pportdata *ppd, unsigned long tnow,
+		u8 ibclt)
+{
+	ppd->cpspec->chase_end = 0;
+
+	if (!qib_chase)
+		return;
+
+	qib_set_ib_7322_lstate(ppd, QLOGIC_IB_IBCC_LINKCMD_DOWN,
+		QLOGIC_IB_IBCC_LINKINITCMD_DISABLE);
+	ppd->cpspec->chase_timer.expires = jiffies + QIB_CHASE_DIS_TIME;
+	add_timer(&ppd->cpspec->chase_timer);
+}
+
+static void handle_serdes_issues(struct qib_pportdata *ppd, u64 ibcst)
+{
+	u8 ibclt;
+	unsigned long tnow;
+
+	ibclt = (u8)SYM_FIELD(ibcst, IBCStatusA_0, LinkTrainingState);
+
+	/*
+	 * Detect and handle the state chase issue, where we can
+	 * get stuck if we are unlucky on timing on both sides of
+	 * the link.   If we are, we disable, set a timer, and
+	 * then re-enable.
+	 */
+	switch (ibclt) {
+	case IB_7322_LT_STATE_CFGRCVFCFG:
+	case IB_7322_LT_STATE_CFGWAITRMT:
+	case IB_7322_LT_STATE_TXREVLANES:
+	case IB_7322_LT_STATE_CFGENH:
+		tnow = jiffies;
+		if (ppd->cpspec->chase_end &&
+		     time_after(tnow, ppd->cpspec->chase_end))
+			disable_chase(ppd, tnow, ibclt);
+		else if (!ppd->cpspec->chase_end)
+			ppd->cpspec->chase_end = tnow + QIB_CHASE_TIME;
+		break;
+	default:
+		ppd->cpspec->chase_end = 0;
+		break;
+	}
+
+	if (((ibclt >= IB_7322_LT_STATE_CFGTEST &&
+	      ibclt <= IB_7322_LT_STATE_CFGWAITENH) ||
+	     ibclt == IB_7322_LT_STATE_LINKUP) &&
+	    (ibcst & SYM_MASK(IBCStatusA_0, LinkSpeedQDR))) {
+		force_h1(ppd);
+		ppd->cpspec->qdr_reforce = 1;
+		if (!ppd->dd->cspec->r1)
+			serdes_7322_los_enable(ppd, 0);
+	} else if (ppd->cpspec->qdr_reforce &&
+		(ibcst & SYM_MASK(IBCStatusA_0, LinkSpeedQDR)) &&
+		 (ibclt == IB_7322_LT_STATE_CFGENH ||
+		ibclt == IB_7322_LT_STATE_CFGIDLE ||
+		ibclt == IB_7322_LT_STATE_LINKUP))
+		force_h1(ppd);
+
+	if ((IS_QMH(ppd->dd) || IS_QME(ppd->dd)) &&
+	    ppd->link_speed_enabled == QIB_IB_QDR &&
+	    (ibclt == IB_7322_LT_STATE_CFGTEST ||
+	     ibclt == IB_7322_LT_STATE_CFGENH ||
+	     (ibclt >= IB_7322_LT_STATE_POLLACTIVE &&
+	      ibclt <= IB_7322_LT_STATE_SLEEPQUIET)))
+		adj_tx_serdes(ppd);
+
+	if (ibclt != IB_7322_LT_STATE_LINKUP) {
+		u8 ltstate = qib_7322_phys_portstate(ibcst);
+		u8 pibclt = (u8)SYM_FIELD(ppd->lastibcstat, IBCStatusA_0,
+					  LinkTrainingState);
+		if (!ppd->dd->cspec->r1 &&
+		    pibclt == IB_7322_LT_STATE_LINKUP &&
+		    ltstate != IB_PHYSPORTSTATE_LINK_ERR_RECOVER &&
+		    ltstate != IB_PHYSPORTSTATE_RECOVERY_RETRAIN &&
+		    ltstate != IB_PHYSPORTSTATE_RECOVERY_WAITRMT &&
+		    ltstate != IB_PHYSPORTSTATE_RECOVERY_IDLE)
+			/* If the link went down (but no into recovery,
+			 * turn LOS back on */
+			serdes_7322_los_enable(ppd, 1);
+		if (!ppd->cpspec->qdr_dfe_on &&
+		    ibclt <= IB_7322_LT_STATE_SLEEPQUIET) {
+			ppd->cpspec->qdr_dfe_on = 1;
+			ppd->cpspec->qdr_dfe_time = 0;
+			/* On link down, reenable QDR adaptation */
+			qib_write_kreg_port(ppd, krp_static_adapt_dis(2),
+					    ppd->dd->cspec->r1 ?
+					    QDR_STATIC_ADAPT_DOWN_R1 :
+					    QDR_STATIC_ADAPT_DOWN);
+			pr_info(
+				"IB%u:%u re-enabled QDR adaptation ibclt %x\n",
+				ppd->dd->unit, ppd->port, ibclt);
+		}
+	}
+}
+
+static int qib_7322_set_ib_cfg(struct qib_pportdata *, int, u32);
+
+/*
+ * This is per-pport error handling.
+ * will likely get it's own MSIx interrupt (one for each port,
+ * although just a single handler).
+ */
+static noinline void handle_7322_p_errors(struct qib_pportdata *ppd)
+{
+	char *msg;
+	u64 ignore_this_time = 0, iserr = 0, errs, fmask;
+	struct qib_devdata *dd = ppd->dd;
+
+	/* do this as soon as possible */
+	fmask = qib_read_kreg64(dd, kr_act_fmask);
+	if (!fmask)
+		check_7322_rxe_status(ppd);
+
+	errs = qib_read_kreg_port(ppd, krp_errstatus);
+	if (!errs)
+		qib_devinfo(dd->pcidev,
+			 "Port%d error interrupt, but no error bits set!\n",
+			 ppd->port);
+	if (!fmask)
+		errs &= ~QIB_E_P_IBSTATUSCHANGED;
+	if (!errs)
+		goto done;
+
+	msg = ppd->cpspec->epmsgbuf;
+	*msg = '\0';
+
+	if (errs & ~QIB_E_P_BITSEXTANT) {
+		err_decode(msg, sizeof ppd->cpspec->epmsgbuf,
+			   errs & ~QIB_E_P_BITSEXTANT, qib_7322p_error_msgs);
+		if (!*msg)
+			snprintf(msg, sizeof ppd->cpspec->epmsgbuf,
+				 "no others");
+		qib_dev_porterr(dd, ppd->port,
+			"error interrupt with unknown errors 0x%016Lx set (and %s)\n",
+			(errs & ~QIB_E_P_BITSEXTANT), msg);
+		*msg = '\0';
+	}
+
+	if (errs & QIB_E_P_SHDR) {
+		u64 symptom;
+
+		/* determine cause, then write to clear */
+		symptom = qib_read_kreg_port(ppd, krp_sendhdrsymptom);
+		qib_write_kreg_port(ppd, krp_sendhdrsymptom, 0);
+		err_decode(msg, sizeof ppd->cpspec->epmsgbuf, symptom,
+			   hdrchk_msgs);
+		*msg = '\0';
+		/* senderrbuf cleared in SPKTERRS below */
+	}
+
+	if (errs & QIB_E_P_SPKTERRS) {
+		if ((errs & QIB_E_P_LINK_PKTERRS) &&
+		    !(ppd->lflags & QIBL_LINKACTIVE)) {
+			/*
+			 * This can happen when trying to bring the link
+			 * up, but the IB link changes state at the "wrong"
+			 * time. The IB logic then complains that the packet
+			 * isn't valid.  We don't want to confuse people, so
+			 * we just don't print them, except at debug
+			 */
+			err_decode(msg, sizeof ppd->cpspec->epmsgbuf,
+				   (errs & QIB_E_P_LINK_PKTERRS),
+				   qib_7322p_error_msgs);
+			*msg = '\0';
+			ignore_this_time = errs & QIB_E_P_LINK_PKTERRS;
+		}
+		qib_disarm_7322_senderrbufs(ppd);
+	} else if ((errs & QIB_E_P_LINK_PKTERRS) &&
+		   !(ppd->lflags & QIBL_LINKACTIVE)) {
+		/*
+		 * This can happen when SMA is trying to bring the link
+		 * up, but the IB link changes state at the "wrong" time.
+		 * The IB logic then complains that the packet isn't
+		 * valid.  We don't want to confuse people, so we just
+		 * don't print them, except at debug
+		 */
+		err_decode(msg, sizeof ppd->cpspec->epmsgbuf, errs,
+			   qib_7322p_error_msgs);
+		ignore_this_time = errs & QIB_E_P_LINK_PKTERRS;
+		*msg = '\0';
+	}
+
+	qib_write_kreg_port(ppd, krp_errclear, errs);
+
+	errs &= ~ignore_this_time;
+	if (!errs)
+		goto done;
+
+	if (errs & QIB_E_P_RPKTERRS)
+		qib_stats.sps_rcverrs++;
+	if (errs & QIB_E_P_SPKTERRS)
+		qib_stats.sps_txerrs++;
+
+	iserr = errs & ~(QIB_E_P_RPKTERRS | QIB_E_P_PKTERRS);
+
+	if (errs & QIB_E_P_SDMAERRS)
+		sdma_7322_p_errors(ppd, errs);
+
+	if (errs & QIB_E_P_IBSTATUSCHANGED) {
+		u64 ibcs;
+		u8 ltstate;
+
+		ibcs = qib_read_kreg_port(ppd, krp_ibcstatus_a);
+		ltstate = qib_7322_phys_portstate(ibcs);
+
+		if (!(ppd->lflags & QIBL_IB_AUTONEG_INPROG))
+			handle_serdes_issues(ppd, ibcs);
+		if (!(ppd->cpspec->ibcctrl_a &
+		      SYM_MASK(IBCCtrlA_0, IBStatIntReductionEn))) {
+			/*
+			 * We got our interrupt, so init code should be
+			 * happy and not try alternatives. Now squelch
+			 * other "chatter" from link-negotiation (pre Init)
+			 */
+			ppd->cpspec->ibcctrl_a |=
+				SYM_MASK(IBCCtrlA_0, IBStatIntReductionEn);
+			qib_write_kreg_port(ppd, krp_ibcctrl_a,
+					    ppd->cpspec->ibcctrl_a);
+		}
+
+		/* Update our picture of width and speed from chip */
+		ppd->link_width_active =
+			(ibcs & SYM_MASK(IBCStatusA_0, LinkWidthActive)) ?
+			    IB_WIDTH_4X : IB_WIDTH_1X;
+		ppd->link_speed_active = (ibcs & SYM_MASK(IBCStatusA_0,
+			LinkSpeedQDR)) ? QIB_IB_QDR : (ibcs &
+			  SYM_MASK(IBCStatusA_0, LinkSpeedActive)) ?
+				   QIB_IB_DDR : QIB_IB_SDR;
+
+		if ((ppd->lflags & QIBL_IB_LINK_DISABLED) && ltstate !=
+		    IB_PHYSPORTSTATE_DISABLED)
+			qib_set_ib_7322_lstate(ppd, 0,
+			       QLOGIC_IB_IBCC_LINKINITCMD_DISABLE);
+		else
+			/*
+			 * Since going into a recovery state causes the link
+			 * state to go down and since recovery is transitory,
+			 * it is better if we "miss" ever seeing the link
+			 * training state go into recovery (i.e., ignore this
+			 * transition for link state special handling purposes)
+			 * without updating lastibcstat.
+			 */
+			if (ltstate != IB_PHYSPORTSTATE_LINK_ERR_RECOVER &&
+			    ltstate != IB_PHYSPORTSTATE_RECOVERY_RETRAIN &&
+			    ltstate != IB_PHYSPORTSTATE_RECOVERY_WAITRMT &&
+			    ltstate != IB_PHYSPORTSTATE_RECOVERY_IDLE)
+				qib_handle_e_ibstatuschanged(ppd, ibcs);
+	}
+	if (*msg && iserr)
+		qib_dev_porterr(dd, ppd->port, "%s error\n", msg);
+
+	if (ppd->state_wanted & ppd->lflags)
+		wake_up_interruptible(&ppd->state_wait);
+done:
+	return;
+}
+
+/* enable/disable chip from delivering interrupts */
+static void qib_7322_set_intr_state(struct qib_devdata *dd, u32 enable)
+{
+	if (enable) {
+		if (dd->flags & QIB_BADINTR)
+			return;
+		qib_write_kreg(dd, kr_intmask, dd->cspec->int_enable_mask);
+		/* cause any pending enabled interrupts to be re-delivered */
+		qib_write_kreg(dd, kr_intclear, 0ULL);
+		if (dd->cspec->num_msix_entries) {
+			/* and same for MSIx */
+			u64 val = qib_read_kreg64(dd, kr_intgranted);
+			if (val)
+				qib_write_kreg(dd, kr_intgranted, val);
+		}
+	} else
+		qib_write_kreg(dd, kr_intmask, 0ULL);
+}
+
+/*
+ * Try to cleanup as much as possible for anything that might have gone
+ * wrong while in freeze mode, such as pio buffers being written by user
+ * processes (causing armlaunch), send errors due to going into freeze mode,
+ * etc., and try to avoid causing extra interrupts while doing so.
+ * Forcibly update the in-memory pioavail register copies after cleanup
+ * because the chip won't do it while in freeze mode (the register values
+ * themselves are kept correct).
+ * Make sure that we don't lose any important interrupts by using the chip
+ * feature that says that writing 0 to a bit in *clear that is set in
+ * *status will cause an interrupt to be generated again (if allowed by
+ * the *mask value).
+ * This is in chip-specific code because of all of the register accesses,
+ * even though the details are similar on most chips.
+ */
+static void qib_7322_clear_freeze(struct qib_devdata *dd)
+{
+	int pidx;
+
+	/* disable error interrupts, to avoid confusion */
+	qib_write_kreg(dd, kr_errmask, 0ULL);
+
+	for (pidx = 0; pidx < dd->num_pports; ++pidx)
+		if (dd->pport[pidx].link_speed_supported)
+			qib_write_kreg_port(dd->pport + pidx, krp_errmask,
+					    0ULL);
+
+	/* also disable interrupts; errormask is sometimes overwriten */
+	qib_7322_set_intr_state(dd, 0);
+
+	/* clear the freeze, and be sure chip saw it */
+	qib_write_kreg(dd, kr_control, dd->control);
+	qib_read_kreg32(dd, kr_scratch);
+
+	/*
+	 * Force new interrupt if any hwerr, error or interrupt bits are
+	 * still set, and clear "safe" send packet errors related to freeze
+	 * and cancelling sends.  Re-enable error interrupts before possible
+	 * force of re-interrupt on pending interrupts.
+	 */
+	qib_write_kreg(dd, kr_hwerrclear, 0ULL);
+	qib_write_kreg(dd, kr_errclear, E_SPKT_ERRS_IGNORE);
+	qib_write_kreg(dd, kr_errmask, dd->cspec->errormask);
+	/* We need to purge per-port errs and reset mask, too */
+	for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+		if (!dd->pport[pidx].link_speed_supported)
+			continue;
+		qib_write_kreg_port(dd->pport + pidx, krp_errclear, ~0Ull);
+		qib_write_kreg_port(dd->pport + pidx, krp_errmask, ~0Ull);
+	}
+	qib_7322_set_intr_state(dd, 1);
+}
+
+/* no error handling to speak of */
+/**
+ * qib_7322_handle_hwerrors - display hardware errors.
+ * @dd: the qlogic_ib device
+ * @msg: the output buffer
+ * @msgl: the size of the output buffer
+ *
+ * Use same msg buffer as regular errors to avoid excessive stack
+ * use.  Most hardware errors are catastrophic, but for right now,
+ * we'll print them and continue.  We reuse the same message buffer as
+ * qib_handle_errors() to avoid excessive stack usage.
+ */
+static void qib_7322_handle_hwerrors(struct qib_devdata *dd, char *msg,
+				     size_t msgl)
+{
+	u64 hwerrs;
+	u32 ctrl;
+	int isfatal = 0;
+
+	hwerrs = qib_read_kreg64(dd, kr_hwerrstatus);
+	if (!hwerrs)
+		goto bail;
+	if (hwerrs == ~0ULL) {
+		qib_dev_err(dd,
+			"Read of hardware error status failed (all bits set); ignoring\n");
+		goto bail;
+	}
+	qib_stats.sps_hwerrs++;
+
+	/* Always clear the error status register, except BIST fail */
+	qib_write_kreg(dd, kr_hwerrclear, hwerrs &
+		       ~HWE_MASK(PowerOnBISTFailed));
+
+	hwerrs &= dd->cspec->hwerrmask;
+
+	/* no EEPROM logging, yet */
+
+	if (hwerrs)
+		qib_devinfo(dd->pcidev,
+			"Hardware error: hwerr=0x%llx (cleared)\n",
+			(unsigned long long) hwerrs);
+
+	ctrl = qib_read_kreg32(dd, kr_control);
+	if ((ctrl & SYM_MASK(Control, FreezeMode)) && !dd->diag_client) {
+		/*
+		 * No recovery yet...
+		 */
+		if ((hwerrs & ~HWE_MASK(LATriggered)) ||
+		    dd->cspec->stay_in_freeze) {
+			/*
+			 * If any set that we aren't ignoring only make the
+			 * complaint once, in case it's stuck or recurring,
+			 * and we get here multiple times
+			 * Force link down, so switch knows, and
+			 * LEDs are turned off.
+			 */
+			if (dd->flags & QIB_INITTED)
+				isfatal = 1;
+		} else
+			qib_7322_clear_freeze(dd);
+	}
+
+	if (hwerrs & HWE_MASK(PowerOnBISTFailed)) {
+		isfatal = 1;
+		strlcpy(msg,
+			"[Memory BIST test failed, InfiniPath hardware unusable]",
+			msgl);
+		/* ignore from now on, so disable until driver reloaded */
+		dd->cspec->hwerrmask &= ~HWE_MASK(PowerOnBISTFailed);
+		qib_write_kreg(dd, kr_hwerrmask, dd->cspec->hwerrmask);
+	}
+
+	err_decode(msg, msgl, hwerrs, qib_7322_hwerror_msgs);
+
+	/* Ignore esoteric PLL failures et al. */
+
+	qib_dev_err(dd, "%s hardware error\n", msg);
+
+	if (hwerrs &
+		   (SYM_MASK(HwErrMask, SDmaMemReadErrMask_0) |
+		    SYM_MASK(HwErrMask, SDmaMemReadErrMask_1))) {
+		int pidx = 0;
+		int err;
+		unsigned long flags;
+		struct qib_pportdata *ppd = dd->pport;
+		for (; pidx < dd->num_pports; ++pidx, ppd++) {
+			err = 0;
+			if (pidx == 0 && (hwerrs &
+				SYM_MASK(HwErrMask, SDmaMemReadErrMask_0)))
+				err++;
+			if (pidx == 1 && (hwerrs &
+				SYM_MASK(HwErrMask, SDmaMemReadErrMask_1)))
+				err++;
+			if (err) {
+				spin_lock_irqsave(&ppd->sdma_lock, flags);
+				dump_sdma_7322_state(ppd);
+				spin_unlock_irqrestore(&ppd->sdma_lock, flags);
+			}
+		}
+	}
+
+	if (isfatal && !dd->diag_client) {
+		qib_dev_err(dd,
+			"Fatal Hardware Error, no longer usable, SN %.16s\n",
+			dd->serial);
+		/*
+		 * for /sys status file and user programs to print; if no
+		 * trailing brace is copied, we'll know it was truncated.
+		 */
+		if (dd->freezemsg)
+			snprintf(dd->freezemsg, dd->freezelen,
+				 "{%s}", msg);
+		qib_disable_after_error(dd);
+	}
+bail:;
+}
+
+/**
+ * qib_7322_init_hwerrors - enable hardware errors
+ * @dd: the qlogic_ib device
+ *
+ * now that we have finished initializing everything that might reasonably
+ * cause a hardware error, and cleared those errors bits as they occur,
+ * we can enable hardware errors in the mask (potentially enabling
+ * freeze mode), and enable hardware errors as errors (along with
+ * everything else) in errormask
+ */
+static void qib_7322_init_hwerrors(struct qib_devdata *dd)
+{
+	int pidx;
+	u64 extsval;
+
+	extsval = qib_read_kreg64(dd, kr_extstatus);
+	if (!(extsval & (QIB_EXTS_MEMBIST_DISABLED |
+			 QIB_EXTS_MEMBIST_ENDTEST)))
+		qib_dev_err(dd, "MemBIST did not complete!\n");
+
+	/* never clear BIST failure, so reported on each driver load */
+	qib_write_kreg(dd, kr_hwerrclear, ~HWE_MASK(PowerOnBISTFailed));
+	qib_write_kreg(dd, kr_hwerrmask, dd->cspec->hwerrmask);
+
+	/* clear all */
+	qib_write_kreg(dd, kr_errclear, ~0ULL);
+	/* enable errors that are masked, at least this first time. */
+	qib_write_kreg(dd, kr_errmask, ~0ULL);
+	dd->cspec->errormask = qib_read_kreg64(dd, kr_errmask);
+	for (pidx = 0; pidx < dd->num_pports; ++pidx)
+		if (dd->pport[pidx].link_speed_supported)
+			qib_write_kreg_port(dd->pport + pidx, krp_errmask,
+					    ~0ULL);
+}
+
+/*
+ * Disable and enable the armlaunch error.  Used for PIO bandwidth testing
+ * on chips that are count-based, rather than trigger-based.  There is no
+ * reference counting, but that's also fine, given the intended use.
+ * Only chip-specific because it's all register accesses
+ */
+static void qib_set_7322_armlaunch(struct qib_devdata *dd, u32 enable)
+{
+	if (enable) {
+		qib_write_kreg(dd, kr_errclear, QIB_E_SPIOARMLAUNCH);
+		dd->cspec->errormask |= QIB_E_SPIOARMLAUNCH;
+	} else
+		dd->cspec->errormask &= ~QIB_E_SPIOARMLAUNCH;
+	qib_write_kreg(dd, kr_errmask, dd->cspec->errormask);
+}
+
+/*
+ * Formerly took parameter <which> in pre-shifted,
+ * pre-merged form with LinkCmd and LinkInitCmd
+ * together, and assuming the zero was NOP.
+ */
+static void qib_set_ib_7322_lstate(struct qib_pportdata *ppd, u16 linkcmd,
+				   u16 linitcmd)
+{
+	u64 mod_wd;
+	struct qib_devdata *dd = ppd->dd;
+	unsigned long flags;
+
+	if (linitcmd == QLOGIC_IB_IBCC_LINKINITCMD_DISABLE) {
+		/*
+		 * If we are told to disable, note that so link-recovery
+		 * code does not attempt to bring us back up.
+		 * Also reset everything that we can, so we start
+		 * completely clean when re-enabled (before we
+		 * actually issue the disable to the IBC)
+		 */
+		qib_7322_mini_pcs_reset(ppd);
+		spin_lock_irqsave(&ppd->lflags_lock, flags);
+		ppd->lflags |= QIBL_IB_LINK_DISABLED;
+		spin_unlock_irqrestore(&ppd->lflags_lock, flags);
+	} else if (linitcmd || linkcmd == QLOGIC_IB_IBCC_LINKCMD_DOWN) {
+		/*
+		 * Any other linkinitcmd will lead to LINKDOWN and then
+		 * to INIT (if all is well), so clear flag to let
+		 * link-recovery code attempt to bring us back up.
+		 */
+		spin_lock_irqsave(&ppd->lflags_lock, flags);
+		ppd->lflags &= ~QIBL_IB_LINK_DISABLED;
+		spin_unlock_irqrestore(&ppd->lflags_lock, flags);
+		/*
+		 * Clear status change interrupt reduction so the
+		 * new state is seen.
+		 */
+		ppd->cpspec->ibcctrl_a &=
+			~SYM_MASK(IBCCtrlA_0, IBStatIntReductionEn);
+	}
+
+	mod_wd = (linkcmd << IBA7322_IBCC_LINKCMD_SHIFT) |
+		(linitcmd << QLOGIC_IB_IBCC_LINKINITCMD_SHIFT);
+
+	qib_write_kreg_port(ppd, krp_ibcctrl_a, ppd->cpspec->ibcctrl_a |
+			    mod_wd);
+	/* write to chip to prevent back-to-back writes of ibc reg */
+	qib_write_kreg(dd, kr_scratch, 0);
+
+}
+
+/*
+ * The total RCV buffer memory is 64KB, used for both ports, and is
+ * in units of 64 bytes (same as IB flow control credit unit).
+ * The consumedVL unit in the same registers are in 32 byte units!
+ * So, a VL15 packet needs 4.50 IB credits, and 9 rx buffer chunks,
+ * and we can therefore allocate just 9 IB credits for 2 VL15 packets
+ * in krp_rxcreditvl15, rather than 10.
+ */
+#define RCV_BUF_UNITSZ 64
+#define NUM_RCV_BUF_UNITS(dd) ((64 * 1024) / (RCV_BUF_UNITSZ * dd->num_pports))
+
+static void set_vls(struct qib_pportdata *ppd)
+{
+	int i, numvls, totcred, cred_vl, vl0extra;
+	struct qib_devdata *dd = ppd->dd;
+	u64 val;
+
+	numvls = qib_num_vls(ppd->vls_operational);
+
+	/*
+	 * Set up per-VL credits. Below is kluge based on these assumptions:
+	 * 1) port is disabled at the time early_init is called.
+	 * 2) give VL15 17 credits, for two max-plausible packets.
+	 * 3) Give VL0-N the rest, with any rounding excess used for VL0
+	 */
+	/* 2 VL15 packets @ 288 bytes each (including IB headers) */
+	totcred = NUM_RCV_BUF_UNITS(dd);
+	cred_vl = (2 * 288 + RCV_BUF_UNITSZ - 1) / RCV_BUF_UNITSZ;
+	totcred -= cred_vl;
+	qib_write_kreg_port(ppd, krp_rxcreditvl15, (u64) cred_vl);
+	cred_vl = totcred / numvls;
+	vl0extra = totcred - cred_vl * numvls;
+	qib_write_kreg_port(ppd, krp_rxcreditvl0, cred_vl + vl0extra);
+	for (i = 1; i < numvls; i++)
+		qib_write_kreg_port(ppd, krp_rxcreditvl0 + i, cred_vl);
+	for (; i < 8; i++) /* no buffer space for other VLs */
+		qib_write_kreg_port(ppd, krp_rxcreditvl0 + i, 0);
+
+	/* Notify IBC that credits need to be recalculated */
+	val = qib_read_kreg_port(ppd, krp_ibsdtestiftx);
+	val |= SYM_MASK(IB_SDTEST_IF_TX_0, CREDIT_CHANGE);
+	qib_write_kreg_port(ppd, krp_ibsdtestiftx, val);
+	qib_write_kreg(dd, kr_scratch, 0ULL);
+	val &= ~SYM_MASK(IB_SDTEST_IF_TX_0, CREDIT_CHANGE);
+	qib_write_kreg_port(ppd, krp_ibsdtestiftx, val);
+
+	for (i = 0; i < numvls; i++)
+		val = qib_read_kreg_port(ppd, krp_rxcreditvl0 + i);
+	val = qib_read_kreg_port(ppd, krp_rxcreditvl15);
+
+	/* Change the number of operational VLs */
+	ppd->cpspec->ibcctrl_a = (ppd->cpspec->ibcctrl_a &
+				~SYM_MASK(IBCCtrlA_0, NumVLane)) |
+		((u64)(numvls - 1) << SYM_LSB(IBCCtrlA_0, NumVLane));
+	qib_write_kreg_port(ppd, krp_ibcctrl_a, ppd->cpspec->ibcctrl_a);
+	qib_write_kreg(dd, kr_scratch, 0ULL);
+}
+
+/*
+ * The code that deals with actual SerDes is in serdes_7322_init().
+ * Compared to the code for iba7220, it is minimal.
+ */
+static int serdes_7322_init(struct qib_pportdata *ppd);
+
+/**
+ * qib_7322_bringup_serdes - bring up the serdes
+ * @ppd: physical port on the qlogic_ib device
+ */
+static int qib_7322_bringup_serdes(struct qib_pportdata *ppd)
+{
+	struct qib_devdata *dd = ppd->dd;
+	u64 val, guid, ibc;
+	unsigned long flags;
+	int ret = 0;
+
+	/*
+	 * SerDes model not in Pd, but still need to
+	 * set up much of IBCCtrl and IBCDDRCtrl; move elsewhere
+	 * eventually.
+	 */
+	/* Put IBC in reset, sends disabled (should be in reset already) */
+	ppd->cpspec->ibcctrl_a &= ~SYM_MASK(IBCCtrlA_0, IBLinkEn);
+	qib_write_kreg_port(ppd, krp_ibcctrl_a, ppd->cpspec->ibcctrl_a);
+	qib_write_kreg(dd, kr_scratch, 0ULL);
+
+	/* ensure previous Tx parameters are not still forced */
+	qib_write_kreg_port(ppd, krp_tx_deemph_override,
+		SYM_MASK(IBSD_TX_DEEMPHASIS_OVERRIDE_0,
+		reset_tx_deemphasis_override));
+
+	if (qib_compat_ddr_negotiate) {
+		ppd->cpspec->ibdeltainprog = 1;
+		ppd->cpspec->ibsymsnap = read_7322_creg32_port(ppd,
+						crp_ibsymbolerr);
+		ppd->cpspec->iblnkerrsnap = read_7322_creg32_port(ppd,
+						crp_iblinkerrrecov);
+	}
+
+	/* flowcontrolwatermark is in units of KBytes */
+	ibc = 0x5ULL << SYM_LSB(IBCCtrlA_0, FlowCtrlWaterMark);
+	/*
+	 * Flow control is sent this often, even if no changes in
+	 * buffer space occur.  Units are 128ns for this chip.
+	 * Set to 3usec.
+	 */
+	ibc |= 24ULL << SYM_LSB(IBCCtrlA_0, FlowCtrlPeriod);
+	/* max error tolerance */
+	ibc |= 0xfULL << SYM_LSB(IBCCtrlA_0, PhyerrThreshold);
+	/* IB credit flow control. */
+	ibc |= 0xfULL << SYM_LSB(IBCCtrlA_0, OverrunThreshold);
+	/*
+	 * set initial max size pkt IBC will send, including ICRC; it's the
+	 * PIO buffer size in dwords, less 1; also see qib_set_mtu()
+	 */
+	ibc |= ((u64)(ppd->ibmaxlen >> 2) + 1) <<
+		SYM_LSB(IBCCtrlA_0, MaxPktLen);
+	ppd->cpspec->ibcctrl_a = ibc; /* without linkcmd or linkinitcmd! */
+
+	/*
+	 * Reset the PCS interface to the serdes (and also ibc, which is still
+	 * in reset from above).  Writes new value of ibcctrl_a as last step.
+	 */
+	qib_7322_mini_pcs_reset(ppd);
+
+	if (!ppd->cpspec->ibcctrl_b) {
+		unsigned lse = ppd->link_speed_enabled;
+
+		/*
+		 * Not on re-init after reset, establish shadow
+		 * and force initial config.
+		 */
+		ppd->cpspec->ibcctrl_b = qib_read_kreg_port(ppd,
+							     krp_ibcctrl_b);
+		ppd->cpspec->ibcctrl_b &= ~(IBA7322_IBC_SPEED_QDR |
+				IBA7322_IBC_SPEED_DDR |
+				IBA7322_IBC_SPEED_SDR |
+				IBA7322_IBC_WIDTH_AUTONEG |
+				SYM_MASK(IBCCtrlB_0, IB_LANE_REV_SUPPORTED));
+		if (lse & (lse - 1)) /* Muliple speeds enabled */
+			ppd->cpspec->ibcctrl_b |=
+				(lse << IBA7322_IBC_SPEED_LSB) |
+				IBA7322_IBC_IBTA_1_2_MASK |
+				IBA7322_IBC_MAX_SPEED_MASK;
+		else
+			ppd->cpspec->ibcctrl_b |= (lse == QIB_IB_QDR) ?
+				IBA7322_IBC_SPEED_QDR |
+				 IBA7322_IBC_IBTA_1_2_MASK :
+				(lse == QIB_IB_DDR) ?
+					IBA7322_IBC_SPEED_DDR :
+					IBA7322_IBC_SPEED_SDR;
+		if ((ppd->link_width_enabled & (IB_WIDTH_1X | IB_WIDTH_4X)) ==
+		    (IB_WIDTH_1X | IB_WIDTH_4X))
+			ppd->cpspec->ibcctrl_b |= IBA7322_IBC_WIDTH_AUTONEG;
+		else
+			ppd->cpspec->ibcctrl_b |=
+				ppd->link_width_enabled == IB_WIDTH_4X ?
+				IBA7322_IBC_WIDTH_4X_ONLY :
+				IBA7322_IBC_WIDTH_1X_ONLY;
+
+		/* always enable these on driver reload, not sticky */
+		ppd->cpspec->ibcctrl_b |= (IBA7322_IBC_RXPOL_MASK |
+			IBA7322_IBC_HRTBT_MASK);
+	}
+	qib_write_kreg_port(ppd, krp_ibcctrl_b, ppd->cpspec->ibcctrl_b);
+
+	/* setup so we have more time at CFGTEST to change H1 */
+	val = qib_read_kreg_port(ppd, krp_ibcctrl_c);
+	val &= ~SYM_MASK(IBCCtrlC_0, IB_FRONT_PORCH);
+	val |= 0xfULL << SYM_LSB(IBCCtrlC_0, IB_FRONT_PORCH);
+	qib_write_kreg_port(ppd, krp_ibcctrl_c, val);
+
+	serdes_7322_init(ppd);
+
+	guid = be64_to_cpu(ppd->guid);
+	if (!guid) {
+		if (dd->base_guid)
+			guid = be64_to_cpu(dd->base_guid) + ppd->port - 1;
+		ppd->guid = cpu_to_be64(guid);
+	}
+
+	qib_write_kreg_port(ppd, krp_hrtbt_guid, guid);
+	/* write to chip to prevent back-to-back writes of ibc reg */
+	qib_write_kreg(dd, kr_scratch, 0);
+
+	/* Enable port */
+	ppd->cpspec->ibcctrl_a |= SYM_MASK(IBCCtrlA_0, IBLinkEn);
+	set_vls(ppd);
+
+	/* initially come up DISABLED, without sending anything. */
+	val = ppd->cpspec->ibcctrl_a | (QLOGIC_IB_IBCC_LINKINITCMD_DISABLE <<
+					QLOGIC_IB_IBCC_LINKINITCMD_SHIFT);
+	qib_write_kreg_port(ppd, krp_ibcctrl_a, val);
+	qib_write_kreg(dd, kr_scratch, 0ULL);
+	/* clear the linkinit cmds */
+	ppd->cpspec->ibcctrl_a = val & ~SYM_MASK(IBCCtrlA_0, LinkInitCmd);
+
+	/* be paranoid against later code motion, etc. */
+	spin_lock_irqsave(&dd->cspec->rcvmod_lock, flags);
+	ppd->p_rcvctrl |= SYM_MASK(RcvCtrl_0, RcvIBPortEnable);
+	qib_write_kreg_port(ppd, krp_rcvctrl, ppd->p_rcvctrl);
+	spin_unlock_irqrestore(&dd->cspec->rcvmod_lock, flags);
+
+	/* Also enable IBSTATUSCHG interrupt.  */
+	val = qib_read_kreg_port(ppd, krp_errmask);
+	qib_write_kreg_port(ppd, krp_errmask,
+		val | ERR_MASK_N(IBStatusChanged));
+
+	/* Always zero until we start messing with SerDes for real */
+	return ret;
+}
+
+/**
+ * qib_7322_quiet_serdes - set serdes to txidle
+ * @dd: the qlogic_ib device
+ * Called when driver is being unloaded
+ */
+static void qib_7322_mini_quiet_serdes(struct qib_pportdata *ppd)
+{
+	u64 val;
+	unsigned long flags;
+
+	qib_set_ib_7322_lstate(ppd, 0, QLOGIC_IB_IBCC_LINKINITCMD_DISABLE);
+
+	spin_lock_irqsave(&ppd->lflags_lock, flags);
+	ppd->lflags &= ~QIBL_IB_AUTONEG_INPROG;
+	spin_unlock_irqrestore(&ppd->lflags_lock, flags);
+	wake_up(&ppd->cpspec->autoneg_wait);
+	cancel_delayed_work_sync(&ppd->cpspec->autoneg_work);
+	if (ppd->dd->cspec->r1)
+		cancel_delayed_work_sync(&ppd->cpspec->ipg_work);
+
+	ppd->cpspec->chase_end = 0;
+	if (ppd->cpspec->chase_timer.data) /* if initted */
+		del_timer_sync(&ppd->cpspec->chase_timer);
+
+	/*
+	 * Despite the name, actually disables IBC as well. Do it when
+	 * we are as sure as possible that no more packets can be
+	 * received, following the down and the PCS reset.
+	 * The actual disabling happens in qib_7322_mini_pci_reset(),
+	 * along with the PCS being reset.
+	 */
+	ppd->cpspec->ibcctrl_a &= ~SYM_MASK(IBCCtrlA_0, IBLinkEn);
+	qib_7322_mini_pcs_reset(ppd);
+
+	/*
+	 * Update the adjusted counters so the adjustment persists
+	 * across driver reload.
+	 */
+	if (ppd->cpspec->ibsymdelta || ppd->cpspec->iblnkerrdelta ||
+	    ppd->cpspec->ibdeltainprog || ppd->cpspec->iblnkdowndelta) {
+		struct qib_devdata *dd = ppd->dd;
+		u64 diagc;
+
+		/* enable counter writes */
+		diagc = qib_read_kreg64(dd, kr_hwdiagctrl);
+		qib_write_kreg(dd, kr_hwdiagctrl,
+			       diagc | SYM_MASK(HwDiagCtrl, CounterWrEnable));
+
+		if (ppd->cpspec->ibsymdelta || ppd->cpspec->ibdeltainprog) {
+			val = read_7322_creg32_port(ppd, crp_ibsymbolerr);
+			if (ppd->cpspec->ibdeltainprog)
+				val -= val - ppd->cpspec->ibsymsnap;
+			val -= ppd->cpspec->ibsymdelta;
+			write_7322_creg_port(ppd, crp_ibsymbolerr, val);
+		}
+		if (ppd->cpspec->iblnkerrdelta || ppd->cpspec->ibdeltainprog) {
+			val = read_7322_creg32_port(ppd, crp_iblinkerrrecov);
+			if (ppd->cpspec->ibdeltainprog)
+				val -= val - ppd->cpspec->iblnkerrsnap;
+			val -= ppd->cpspec->iblnkerrdelta;
+			write_7322_creg_port(ppd, crp_iblinkerrrecov, val);
+		}
+		if (ppd->cpspec->iblnkdowndelta) {
+			val = read_7322_creg32_port(ppd, crp_iblinkdown);
+			val += ppd->cpspec->iblnkdowndelta;
+			write_7322_creg_port(ppd, crp_iblinkdown, val);
+		}
+		/*
+		 * No need to save ibmalfdelta since IB perfcounters
+		 * are cleared on driver reload.
+		 */
+
+		/* and disable counter writes */
+		qib_write_kreg(dd, kr_hwdiagctrl, diagc);
+	}
+}
+
+/**
+ * qib_setup_7322_setextled - set the state of the two external LEDs
+ * @ppd: physical port on the qlogic_ib device
+ * @on: whether the link is up or not
+ *
+ * The exact combo of LEDs if on is true is determined by looking
+ * at the ibcstatus.
+ *
+ * These LEDs indicate the physical and logical state of IB link.
+ * For this chip (at least with recommended board pinouts), LED1
+ * is Yellow (logical state) and LED2 is Green (physical state),
+ *
+ * Note:  We try to match the Mellanox HCA LED behavior as best
+ * we can.  Green indicates physical link state is OK (something is
+ * plugged in, and we can train).
+ * Amber indicates the link is logically up (ACTIVE).
+ * Mellanox further blinks the amber LED to indicate data packet
+ * activity, but we have no hardware support for that, so it would
+ * require waking up every 10-20 msecs and checking the counters
+ * on the chip, and then turning the LED off if appropriate.  That's
+ * visible overhead, so not something we will do.
+ */
+static void qib_setup_7322_setextled(struct qib_pportdata *ppd, u32 on)
+{
+	struct qib_devdata *dd = ppd->dd;
+	u64 extctl, ledblink = 0, val;
+	unsigned long flags;
+	int yel, grn;
+
+	/*
+	 * The diags use the LED to indicate diag info, so we leave
+	 * the external LED alone when the diags are running.
+	 */
+	if (dd->diag_client)
+		return;
+
+	/* Allow override of LED display for, e.g. Locating system in rack */
+	if (ppd->led_override) {
+		grn = (ppd->led_override & QIB_LED_PHYS);
+		yel = (ppd->led_override & QIB_LED_LOG);
+	} else if (on) {
+		val = qib_read_kreg_port(ppd, krp_ibcstatus_a);
+		grn = qib_7322_phys_portstate(val) ==
+			IB_PHYSPORTSTATE_LINKUP;
+		yel = qib_7322_iblink_state(val) == IB_PORT_ACTIVE;
+	} else {
+		grn = 0;
+		yel = 0;
+	}
+
+	spin_lock_irqsave(&dd->cspec->gpio_lock, flags);
+	extctl = dd->cspec->extctrl & (ppd->port == 1 ?
+		~ExtLED_IB1_MASK : ~ExtLED_IB2_MASK);
+	if (grn) {
+		extctl |= ppd->port == 1 ? ExtLED_IB1_GRN : ExtLED_IB2_GRN;
+		/*
+		 * Counts are in chip clock (4ns) periods.
+		 * This is 1/16 sec (66.6ms) on,
+		 * 3/16 sec (187.5 ms) off, with packets rcvd.
+		 */
+		ledblink = ((66600 * 1000UL / 4) << IBA7322_LEDBLINK_ON_SHIFT) |
+			((187500 * 1000UL / 4) << IBA7322_LEDBLINK_OFF_SHIFT);
+	}
+	if (yel)
+		extctl |= ppd->port == 1 ? ExtLED_IB1_YEL : ExtLED_IB2_YEL;
+	dd->cspec->extctrl = extctl;
+	qib_write_kreg(dd, kr_extctrl, dd->cspec->extctrl);
+	spin_unlock_irqrestore(&dd->cspec->gpio_lock, flags);
+
+	if (ledblink) /* blink the LED on packet receive */
+		qib_write_kreg_port(ppd, krp_rcvpktledcnt, ledblink);
+}
+
+#ifdef CONFIG_INFINIBAND_QIB_DCA
+
+static int qib_7322_notify_dca(struct qib_devdata *dd, unsigned long event)
+{
+	switch (event) {
+	case DCA_PROVIDER_ADD:
+		if (dd->flags & QIB_DCA_ENABLED)
+			break;
+		if (!dca_add_requester(&dd->pcidev->dev)) {
+			qib_devinfo(dd->pcidev, "DCA enabled\n");
+			dd->flags |= QIB_DCA_ENABLED;
+			qib_setup_dca(dd);
+		}
+		break;
+	case DCA_PROVIDER_REMOVE:
+		if (dd->flags & QIB_DCA_ENABLED) {
+			dca_remove_requester(&dd->pcidev->dev);
+			dd->flags &= ~QIB_DCA_ENABLED;
+			dd->cspec->dca_ctrl = 0;
+			qib_write_kreg(dd, KREG_IDX(DCACtrlA),
+				dd->cspec->dca_ctrl);
+		}
+		break;
+	}
+	return 0;
+}
+
+static void qib_update_rhdrq_dca(struct qib_ctxtdata *rcd, int cpu)
+{
+	struct qib_devdata *dd = rcd->dd;
+	struct qib_chip_specific *cspec = dd->cspec;
+
+	if (!(dd->flags & QIB_DCA_ENABLED))
+		return;
+	if (cspec->rhdr_cpu[rcd->ctxt] != cpu) {
+		const struct dca_reg_map *rmp;
+
+		cspec->rhdr_cpu[rcd->ctxt] = cpu;
+		rmp = &dca_rcvhdr_reg_map[rcd->ctxt];
+		cspec->dca_rcvhdr_ctrl[rmp->shadow_inx] &= rmp->mask;
+		cspec->dca_rcvhdr_ctrl[rmp->shadow_inx] |=
+			(u64) dca3_get_tag(&dd->pcidev->dev, cpu) << rmp->lsb;
+		qib_devinfo(dd->pcidev,
+			"Ctxt %d cpu %d dca %llx\n", rcd->ctxt, cpu,
+			(long long) cspec->dca_rcvhdr_ctrl[rmp->shadow_inx]);
+		qib_write_kreg(dd, rmp->regno,
+			       cspec->dca_rcvhdr_ctrl[rmp->shadow_inx]);
+		cspec->dca_ctrl |= SYM_MASK(DCACtrlA, RcvHdrqDCAEnable);
+		qib_write_kreg(dd, KREG_IDX(DCACtrlA), cspec->dca_ctrl);
+	}
+}
+
+static void qib_update_sdma_dca(struct qib_pportdata *ppd, int cpu)
+{
+	struct qib_devdata *dd = ppd->dd;
+	struct qib_chip_specific *cspec = dd->cspec;
+	unsigned pidx = ppd->port - 1;
+
+	if (!(dd->flags & QIB_DCA_ENABLED))
+		return;
+	if (cspec->sdma_cpu[pidx] != cpu) {
+		cspec->sdma_cpu[pidx] = cpu;
+		cspec->dca_rcvhdr_ctrl[4] &= ~(ppd->hw_pidx ?
+			SYM_MASK(DCACtrlF, SendDma1DCAOPH) :
+			SYM_MASK(DCACtrlF, SendDma0DCAOPH));
+		cspec->dca_rcvhdr_ctrl[4] |=
+			(u64) dca3_get_tag(&dd->pcidev->dev, cpu) <<
+				(ppd->hw_pidx ?
+					SYM_LSB(DCACtrlF, SendDma1DCAOPH) :
+					SYM_LSB(DCACtrlF, SendDma0DCAOPH));
+		qib_devinfo(dd->pcidev,
+			"sdma %d cpu %d dca %llx\n", ppd->hw_pidx, cpu,
+			(long long) cspec->dca_rcvhdr_ctrl[4]);
+		qib_write_kreg(dd, KREG_IDX(DCACtrlF),
+			       cspec->dca_rcvhdr_ctrl[4]);
+		cspec->dca_ctrl |= ppd->hw_pidx ?
+			SYM_MASK(DCACtrlA, SendDMAHead1DCAEnable) :
+			SYM_MASK(DCACtrlA, SendDMAHead0DCAEnable);
+		qib_write_kreg(dd, KREG_IDX(DCACtrlA), cspec->dca_ctrl);
+	}
+}
+
+static void qib_setup_dca(struct qib_devdata *dd)
+{
+	struct qib_chip_specific *cspec = dd->cspec;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(cspec->rhdr_cpu); i++)
+		cspec->rhdr_cpu[i] = -1;
+	for (i = 0; i < ARRAY_SIZE(cspec->sdma_cpu); i++)
+		cspec->sdma_cpu[i] = -1;
+	cspec->dca_rcvhdr_ctrl[0] =
+		(1ULL << SYM_LSB(DCACtrlB, RcvHdrq0DCAXfrCnt)) |
+		(1ULL << SYM_LSB(DCACtrlB, RcvHdrq1DCAXfrCnt)) |
+		(1ULL << SYM_LSB(DCACtrlB, RcvHdrq2DCAXfrCnt)) |
+		(1ULL << SYM_LSB(DCACtrlB, RcvHdrq3DCAXfrCnt));
+	cspec->dca_rcvhdr_ctrl[1] =
+		(1ULL << SYM_LSB(DCACtrlC, RcvHdrq4DCAXfrCnt)) |
+		(1ULL << SYM_LSB(DCACtrlC, RcvHdrq5DCAXfrCnt)) |
+		(1ULL << SYM_LSB(DCACtrlC, RcvHdrq6DCAXfrCnt)) |
+		(1ULL << SYM_LSB(DCACtrlC, RcvHdrq7DCAXfrCnt));
+	cspec->dca_rcvhdr_ctrl[2] =
+		(1ULL << SYM_LSB(DCACtrlD, RcvHdrq8DCAXfrCnt)) |
+		(1ULL << SYM_LSB(DCACtrlD, RcvHdrq9DCAXfrCnt)) |
+		(1ULL << SYM_LSB(DCACtrlD, RcvHdrq10DCAXfrCnt)) |
+		(1ULL << SYM_LSB(DCACtrlD, RcvHdrq11DCAXfrCnt));
+	cspec->dca_rcvhdr_ctrl[3] =
+		(1ULL << SYM_LSB(DCACtrlE, RcvHdrq12DCAXfrCnt)) |
+		(1ULL << SYM_LSB(DCACtrlE, RcvHdrq13DCAXfrCnt)) |
+		(1ULL << SYM_LSB(DCACtrlE, RcvHdrq14DCAXfrCnt)) |
+		(1ULL << SYM_LSB(DCACtrlE, RcvHdrq15DCAXfrCnt));
+	cspec->dca_rcvhdr_ctrl[4] =
+		(1ULL << SYM_LSB(DCACtrlF, RcvHdrq16DCAXfrCnt)) |
+		(1ULL << SYM_LSB(DCACtrlF, RcvHdrq17DCAXfrCnt));
+	for (i = 0; i < ARRAY_SIZE(cspec->sdma_cpu); i++)
+		qib_write_kreg(dd, KREG_IDX(DCACtrlB) + i,
+			       cspec->dca_rcvhdr_ctrl[i]);
+	for (i = 0; i < cspec->num_msix_entries; i++)
+		setup_dca_notifier(dd, &cspec->msix_entries[i]);
+}
+
+static void qib_irq_notifier_notify(struct irq_affinity_notify *notify,
+			     const cpumask_t *mask)
+{
+	struct qib_irq_notify *n =
+		container_of(notify, struct qib_irq_notify, notify);
+	int cpu = cpumask_first(mask);
+
+	if (n->rcv) {
+		struct qib_ctxtdata *rcd = (struct qib_ctxtdata *)n->arg;
+		qib_update_rhdrq_dca(rcd, cpu);
+	} else {
+		struct qib_pportdata *ppd = (struct qib_pportdata *)n->arg;
+		qib_update_sdma_dca(ppd, cpu);
+	}
+}
+
+static void qib_irq_notifier_release(struct kref *ref)
+{
+	struct qib_irq_notify *n =
+		container_of(ref, struct qib_irq_notify, notify.kref);
+	struct qib_devdata *dd;
+
+	if (n->rcv) {
+		struct qib_ctxtdata *rcd = (struct qib_ctxtdata *)n->arg;
+		dd = rcd->dd;
+	} else {
+		struct qib_pportdata *ppd = (struct qib_pportdata *)n->arg;
+		dd = ppd->dd;
+	}
+	qib_devinfo(dd->pcidev,
+		"release on HCA notify 0x%p n 0x%p\n", ref, n);
+	kfree(n);
+}
+#endif
+
+/*
+ * Disable MSIx interrupt if enabled, call generic MSIx code
+ * to cleanup, and clear pending MSIx interrupts.
+ * Used for fallback to INTx, after reset, and when MSIx setup fails.
+ */
+static void qib_7322_nomsix(struct qib_devdata *dd)
+{
+	u64 intgranted;
+	int n;
+
+	dd->cspec->main_int_mask = ~0ULL;
+	n = dd->cspec->num_msix_entries;
+	if (n) {
+		int i;
+
+		dd->cspec->num_msix_entries = 0;
+		for (i = 0; i < n; i++) {
+#ifdef CONFIG_INFINIBAND_QIB_DCA
+			reset_dca_notifier(dd, &dd->cspec->msix_entries[i]);
+#endif
+			irq_set_affinity_hint(
+			  dd->cspec->msix_entries[i].msix.vector, NULL);
+			free_cpumask_var(dd->cspec->msix_entries[i].mask);
+			free_irq(dd->cspec->msix_entries[i].msix.vector,
+			   dd->cspec->msix_entries[i].arg);
+		}
+		qib_nomsix(dd);
+	}
+	/* make sure no MSIx interrupts are left pending */
+	intgranted = qib_read_kreg64(dd, kr_intgranted);
+	if (intgranted)
+		qib_write_kreg(dd, kr_intgranted, intgranted);
+}
+
+static void qib_7322_free_irq(struct qib_devdata *dd)
+{
+	if (dd->cspec->irq) {
+		free_irq(dd->cspec->irq, dd);
+		dd->cspec->irq = 0;
+	}
+	qib_7322_nomsix(dd);
+}
+
+static void qib_setup_7322_cleanup(struct qib_devdata *dd)
+{
+	int i;
+
+#ifdef CONFIG_INFINIBAND_QIB_DCA
+	if (dd->flags & QIB_DCA_ENABLED) {
+		dca_remove_requester(&dd->pcidev->dev);
+		dd->flags &= ~QIB_DCA_ENABLED;
+		dd->cspec->dca_ctrl = 0;
+		qib_write_kreg(dd, KREG_IDX(DCACtrlA), dd->cspec->dca_ctrl);
+	}
+#endif
+
+	qib_7322_free_irq(dd);
+	kfree(dd->cspec->cntrs);
+	kfree(dd->cspec->sendchkenable);
+	kfree(dd->cspec->sendgrhchk);
+	kfree(dd->cspec->sendibchk);
+	kfree(dd->cspec->msix_entries);
+	for (i = 0; i < dd->num_pports; i++) {
+		unsigned long flags;
+		u32 mask = QSFP_GPIO_MOD_PRS_N |
+			(QSFP_GPIO_MOD_PRS_N << QSFP_GPIO_PORT2_SHIFT);
+
+		kfree(dd->pport[i].cpspec->portcntrs);
+		if (dd->flags & QIB_HAS_QSFP) {
+			spin_lock_irqsave(&dd->cspec->gpio_lock, flags);
+			dd->cspec->gpio_mask &= ~mask;
+			qib_write_kreg(dd, kr_gpio_mask, dd->cspec->gpio_mask);
+			spin_unlock_irqrestore(&dd->cspec->gpio_lock, flags);
+			qib_qsfp_deinit(&dd->pport[i].cpspec->qsfp_data);
+		}
+		if (dd->pport[i].ibport_data.smi_ah)
+			ib_destroy_ah(&dd->pport[i].ibport_data.smi_ah->ibah);
+	}
+}
+
+/* handle SDMA interrupts */
+static void sdma_7322_intr(struct qib_devdata *dd, u64 istat)
+{
+	struct qib_pportdata *ppd0 = &dd->pport[0];
+	struct qib_pportdata *ppd1 = &dd->pport[1];
+	u64 intr0 = istat & (INT_MASK_P(SDma, 0) |
+		INT_MASK_P(SDmaIdle, 0) | INT_MASK_P(SDmaProgress, 0));
+	u64 intr1 = istat & (INT_MASK_P(SDma, 1) |
+		INT_MASK_P(SDmaIdle, 1) | INT_MASK_P(SDmaProgress, 1));
+
+	if (intr0)
+		qib_sdma_intr(ppd0);
+	if (intr1)
+		qib_sdma_intr(ppd1);
+
+	if (istat & INT_MASK_PM(SDmaCleanupDone, 0))
+		qib_sdma_process_event(ppd0, qib_sdma_event_e20_hw_started);
+	if (istat & INT_MASK_PM(SDmaCleanupDone, 1))
+		qib_sdma_process_event(ppd1, qib_sdma_event_e20_hw_started);
+}
+
+/*
+ * Set or clear the Send buffer available interrupt enable bit.
+ */
+static void qib_wantpiobuf_7322_intr(struct qib_devdata *dd, u32 needint)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&dd->sendctrl_lock, flags);
+	if (needint)
+		dd->sendctrl |= SYM_MASK(SendCtrl, SendIntBufAvail);
+	else
+		dd->sendctrl &= ~SYM_MASK(SendCtrl, SendIntBufAvail);
+	qib_write_kreg(dd, kr_sendctrl, dd->sendctrl);
+	qib_write_kreg(dd, kr_scratch, 0ULL);
+	spin_unlock_irqrestore(&dd->sendctrl_lock, flags);
+}
+
+/*
+ * Somehow got an interrupt with reserved bits set in interrupt status.
+ * Print a message so we know it happened, then clear them.
+ * keep mainline interrupt handler cache-friendly
+ */
+static noinline void unknown_7322_ibits(struct qib_devdata *dd, u64 istat)
+{
+	u64 kills;
+	char msg[128];
+
+	kills = istat & ~QIB_I_BITSEXTANT;
+	qib_dev_err(dd,
+		"Clearing reserved interrupt(s) 0x%016llx: %s\n",
+		(unsigned long long) kills, msg);
+	qib_write_kreg(dd, kr_intmask, (dd->cspec->int_enable_mask & ~kills));
+}
+
+/* keep mainline interrupt handler cache-friendly */
+static noinline void unknown_7322_gpio_intr(struct qib_devdata *dd)
+{
+	u32 gpiostatus;
+	int handled = 0;
+	int pidx;
+
+	/*
+	 * Boards for this chip currently don't use GPIO interrupts,
+	 * so clear by writing GPIOstatus to GPIOclear, and complain
+	 * to developer.  To avoid endless repeats, clear
+	 * the bits in the mask, since there is some kind of
+	 * programming error or chip problem.
+	 */
+	gpiostatus = qib_read_kreg32(dd, kr_gpio_status);
+	/*
+	 * In theory, writing GPIOstatus to GPIOclear could
+	 * have a bad side-effect on some diagnostic that wanted
+	 * to poll for a status-change, but the various shadows
+	 * make that problematic at best. Diags will just suppress
+	 * all GPIO interrupts during such tests.
+	 */
+	qib_write_kreg(dd, kr_gpio_clear, gpiostatus);
+	/*
+	 * Check for QSFP MOD_PRS changes
+	 * only works for single port if IB1 != pidx1
+	 */
+	for (pidx = 0; pidx < dd->num_pports && (dd->flags & QIB_HAS_QSFP);
+	     ++pidx) {
+		struct qib_pportdata *ppd;
+		struct qib_qsfp_data *qd;
+		u32 mask;
+		if (!dd->pport[pidx].link_speed_supported)
+			continue;
+		mask = QSFP_GPIO_MOD_PRS_N;
+		ppd = dd->pport + pidx;
+		mask <<= (QSFP_GPIO_PORT2_SHIFT * ppd->hw_pidx);
+		if (gpiostatus & dd->cspec->gpio_mask & mask) {
+			u64 pins;
+			qd = &ppd->cpspec->qsfp_data;
+			gpiostatus &= ~mask;
+			pins = qib_read_kreg64(dd, kr_extstatus);
+			pins >>= SYM_LSB(EXTStatus, GPIOIn);
+			if (!(pins & mask)) {
+				++handled;
+				qd->t_insert = jiffies;
+				queue_work(ib_wq, &qd->work);
+			}
+		}
+	}
+
+	if (gpiostatus && !handled) {
+		const u32 mask = qib_read_kreg32(dd, kr_gpio_mask);
+		u32 gpio_irq = mask & gpiostatus;
+
+		/*
+		 * Clear any troublemakers, and update chip from shadow
+		 */
+		dd->cspec->gpio_mask &= ~gpio_irq;
+		qib_write_kreg(dd, kr_gpio_mask, dd->cspec->gpio_mask);
+	}
+}
+
+/*
+ * Handle errors and unusual events first, separate function
+ * to improve cache hits for fast path interrupt handling.
+ */
+static noinline void unlikely_7322_intr(struct qib_devdata *dd, u64 istat)
+{
+	if (istat & ~QIB_I_BITSEXTANT)
+		unknown_7322_ibits(dd, istat);
+	if (istat & QIB_I_GPIO)
+		unknown_7322_gpio_intr(dd);
+	if (istat & QIB_I_C_ERROR) {
+		qib_write_kreg(dd, kr_errmask, 0ULL);
+		tasklet_schedule(&dd->error_tasklet);
+	}
+	if (istat & INT_MASK_P(Err, 0) && dd->rcd[0])
+		handle_7322_p_errors(dd->rcd[0]->ppd);
+	if (istat & INT_MASK_P(Err, 1) && dd->rcd[1])
+		handle_7322_p_errors(dd->rcd[1]->ppd);
+}
+
+/*
+ * Dynamically adjust the rcv int timeout for a context based on incoming
+ * packet rate.
+ */
+static void adjust_rcv_timeout(struct qib_ctxtdata *rcd, int npkts)
+{
+	struct qib_devdata *dd = rcd->dd;
+	u32 timeout = dd->cspec->rcvavail_timeout[rcd->ctxt];
+
+	/*
+	 * Dynamically adjust idle timeout on chip
+	 * based on number of packets processed.
+	 */
+	if (npkts < rcv_int_count && timeout > 2)
+		timeout >>= 1;
+	else if (npkts >= rcv_int_count && timeout < rcv_int_timeout)
+		timeout = min(timeout << 1, rcv_int_timeout);
+	else
+		return;
+
+	dd->cspec->rcvavail_timeout[rcd->ctxt] = timeout;
+	qib_write_kreg(dd, kr_rcvavailtimeout + rcd->ctxt, timeout);
+}
+
+/*
+ * This is the main interrupt handler.
+ * It will normally only be used for low frequency interrupts but may
+ * have to handle all interrupts if INTx is enabled or fewer than normal
+ * MSIx interrupts were allocated.
+ * This routine should ignore the interrupt bits for any of the
+ * dedicated MSIx handlers.
+ */
+static irqreturn_t qib_7322intr(int irq, void *data)
+{
+	struct qib_devdata *dd = data;
+	irqreturn_t ret;
+	u64 istat;
+	u64 ctxtrbits;
+	u64 rmask;
+	unsigned i;
+	u32 npkts;
+
+	if ((dd->flags & (QIB_PRESENT | QIB_BADINTR)) != QIB_PRESENT) {
+		/*
+		 * This return value is not great, but we do not want the
+		 * interrupt core code to remove our interrupt handler
+		 * because we don't appear to be handling an interrupt
+		 * during a chip reset.
+		 */
+		ret = IRQ_HANDLED;
+		goto bail;
+	}
+
+	istat = qib_read_kreg64(dd, kr_intstatus);
+
+	if (unlikely(istat == ~0ULL)) {
+		qib_bad_intrstatus(dd);
+		qib_dev_err(dd, "Interrupt status all f's, skipping\n");
+		/* don't know if it was our interrupt or not */
+		ret = IRQ_NONE;
+		goto bail;
+	}
+
+	istat &= dd->cspec->main_int_mask;
+	if (unlikely(!istat)) {
+		/* already handled, or shared and not us */
+		ret = IRQ_NONE;
+		goto bail;
+	}
+
+	this_cpu_inc(*dd->int_counter);
+
+	/* handle "errors" of various kinds first, device ahead of port */
+	if (unlikely(istat & (~QIB_I_BITSEXTANT | QIB_I_GPIO |
+			      QIB_I_C_ERROR | INT_MASK_P(Err, 0) |
+			      INT_MASK_P(Err, 1))))
+		unlikely_7322_intr(dd, istat);
+
+	/*
+	 * Clear the interrupt bits we found set, relatively early, so we
+	 * "know" know the chip will have seen this by the time we process
+	 * the queue, and will re-interrupt if necessary.  The processor
+	 * itself won't take the interrupt again until we return.
+	 */
+	qib_write_kreg(dd, kr_intclear, istat);
+
+	/*
+	 * Handle kernel receive queues before checking for pio buffers
+	 * available since receives can overflow; piobuf waiters can afford
+	 * a few extra cycles, since they were waiting anyway.
+	 */
+	ctxtrbits = istat & (QIB_I_RCVAVAIL_MASK | QIB_I_RCVURG_MASK);
+	if (ctxtrbits) {
+		rmask = (1ULL << QIB_I_RCVAVAIL_LSB) |
+			(1ULL << QIB_I_RCVURG_LSB);
+		for (i = 0; i < dd->first_user_ctxt; i++) {
+			if (ctxtrbits & rmask) {
+				ctxtrbits &= ~rmask;
+				if (dd->rcd[i])
+					qib_kreceive(dd->rcd[i], NULL, &npkts);
+			}
+			rmask <<= 1;
+		}
+		if (ctxtrbits) {
+			ctxtrbits = (ctxtrbits >> QIB_I_RCVAVAIL_LSB) |
+				(ctxtrbits >> QIB_I_RCVURG_LSB);
+			qib_handle_urcv(dd, ctxtrbits);
+		}
+	}
+
+	if (istat & (QIB_I_P_SDMAINT(0) | QIB_I_P_SDMAINT(1)))
+		sdma_7322_intr(dd, istat);
+
+	if ((istat & QIB_I_SPIOBUFAVAIL) && (dd->flags & QIB_INITTED))
+		qib_ib_piobufavail(dd);
+
+	ret = IRQ_HANDLED;
+bail:
+	return ret;
+}
+
+/*
+ * Dedicated receive packet available interrupt handler.
+ */
+static irqreturn_t qib_7322pintr(int irq, void *data)
+{
+	struct qib_ctxtdata *rcd = data;
+	struct qib_devdata *dd = rcd->dd;
+	u32 npkts;
+
+	if ((dd->flags & (QIB_PRESENT | QIB_BADINTR)) != QIB_PRESENT)
+		/*
+		 * This return value is not great, but we do not want the
+		 * interrupt core code to remove our interrupt handler
+		 * because we don't appear to be handling an interrupt
+		 * during a chip reset.
+		 */
+		return IRQ_HANDLED;
+
+	this_cpu_inc(*dd->int_counter);
+
+	/* Clear the interrupt bit we expect to be set. */
+	qib_write_kreg(dd, kr_intclear, ((1ULL << QIB_I_RCVAVAIL_LSB) |
+		       (1ULL << QIB_I_RCVURG_LSB)) << rcd->ctxt);
+
+	qib_kreceive(rcd, NULL, &npkts);
+
+	return IRQ_HANDLED;
+}
+
+/*
+ * Dedicated Send buffer available interrupt handler.
+ */
+static irqreturn_t qib_7322bufavail(int irq, void *data)
+{
+	struct qib_devdata *dd = data;
+
+	if ((dd->flags & (QIB_PRESENT | QIB_BADINTR)) != QIB_PRESENT)
+		/*
+		 * This return value is not great, but we do not want the
+		 * interrupt core code to remove our interrupt handler
+		 * because we don't appear to be handling an interrupt
+		 * during a chip reset.
+		 */
+		return IRQ_HANDLED;
+
+	this_cpu_inc(*dd->int_counter);
+
+	/* Clear the interrupt bit we expect to be set. */
+	qib_write_kreg(dd, kr_intclear, QIB_I_SPIOBUFAVAIL);
+
+	/* qib_ib_piobufavail() will clear the want PIO interrupt if needed */
+	if (dd->flags & QIB_INITTED)
+		qib_ib_piobufavail(dd);
+	else
+		qib_wantpiobuf_7322_intr(dd, 0);
+
+	return IRQ_HANDLED;
+}
+
+/*
+ * Dedicated Send DMA interrupt handler.
+ */
+static irqreturn_t sdma_intr(int irq, void *data)
+{
+	struct qib_pportdata *ppd = data;
+	struct qib_devdata *dd = ppd->dd;
+
+	if ((dd->flags & (QIB_PRESENT | QIB_BADINTR)) != QIB_PRESENT)
+		/*
+		 * This return value is not great, but we do not want the
+		 * interrupt core code to remove our interrupt handler
+		 * because we don't appear to be handling an interrupt
+		 * during a chip reset.
+		 */
+		return IRQ_HANDLED;
+
+	this_cpu_inc(*dd->int_counter);
+
+	/* Clear the interrupt bit we expect to be set. */
+	qib_write_kreg(dd, kr_intclear, ppd->hw_pidx ?
+		       INT_MASK_P(SDma, 1) : INT_MASK_P(SDma, 0));
+	qib_sdma_intr(ppd);
+
+	return IRQ_HANDLED;
+}
+
+/*
+ * Dedicated Send DMA idle interrupt handler.
+ */
+static irqreturn_t sdma_idle_intr(int irq, void *data)
+{
+	struct qib_pportdata *ppd = data;
+	struct qib_devdata *dd = ppd->dd;
+
+	if ((dd->flags & (QIB_PRESENT | QIB_BADINTR)) != QIB_PRESENT)
+		/*
+		 * This return value is not great, but we do not want the
+		 * interrupt core code to remove our interrupt handler
+		 * because we don't appear to be handling an interrupt
+		 * during a chip reset.
+		 */
+		return IRQ_HANDLED;
+
+	this_cpu_inc(*dd->int_counter);
+
+	/* Clear the interrupt bit we expect to be set. */
+	qib_write_kreg(dd, kr_intclear, ppd->hw_pidx ?
+		       INT_MASK_P(SDmaIdle, 1) : INT_MASK_P(SDmaIdle, 0));
+	qib_sdma_intr(ppd);
+
+	return IRQ_HANDLED;
+}
+
+/*
+ * Dedicated Send DMA progress interrupt handler.
+ */
+static irqreturn_t sdma_progress_intr(int irq, void *data)
+{
+	struct qib_pportdata *ppd = data;
+	struct qib_devdata *dd = ppd->dd;
+
+	if ((dd->flags & (QIB_PRESENT | QIB_BADINTR)) != QIB_PRESENT)
+		/*
+		 * This return value is not great, but we do not want the
+		 * interrupt core code to remove our interrupt handler
+		 * because we don't appear to be handling an interrupt
+		 * during a chip reset.
+		 */
+		return IRQ_HANDLED;
+
+	this_cpu_inc(*dd->int_counter);
+
+	/* Clear the interrupt bit we expect to be set. */
+	qib_write_kreg(dd, kr_intclear, ppd->hw_pidx ?
+		       INT_MASK_P(SDmaProgress, 1) :
+		       INT_MASK_P(SDmaProgress, 0));
+	qib_sdma_intr(ppd);
+
+	return IRQ_HANDLED;
+}
+
+/*
+ * Dedicated Send DMA cleanup interrupt handler.
+ */
+static irqreturn_t sdma_cleanup_intr(int irq, void *data)
+{
+	struct qib_pportdata *ppd = data;
+	struct qib_devdata *dd = ppd->dd;
+
+	if ((dd->flags & (QIB_PRESENT | QIB_BADINTR)) != QIB_PRESENT)
+		/*
+		 * This return value is not great, but we do not want the
+		 * interrupt core code to remove our interrupt handler
+		 * because we don't appear to be handling an interrupt
+		 * during a chip reset.
+		 */
+		return IRQ_HANDLED;
+
+	this_cpu_inc(*dd->int_counter);
+
+	/* Clear the interrupt bit we expect to be set. */
+	qib_write_kreg(dd, kr_intclear, ppd->hw_pidx ?
+		       INT_MASK_PM(SDmaCleanupDone, 1) :
+		       INT_MASK_PM(SDmaCleanupDone, 0));
+	qib_sdma_process_event(ppd, qib_sdma_event_e20_hw_started);
+
+	return IRQ_HANDLED;
+}
+
+#ifdef CONFIG_INFINIBAND_QIB_DCA
+
+static void reset_dca_notifier(struct qib_devdata *dd, struct qib_msix_entry *m)
+{
+	if (!m->dca)
+		return;
+	qib_devinfo(dd->pcidev,
+		"Disabling notifier on HCA %d irq %d\n",
+		dd->unit,
+		m->msix.vector);
+	irq_set_affinity_notifier(
+		m->msix.vector,
+		NULL);
+	m->notifier = NULL;
+}
+
+static void setup_dca_notifier(struct qib_devdata *dd, struct qib_msix_entry *m)
+{
+	struct qib_irq_notify *n;
+
+	if (!m->dca)
+		return;
+	n = kzalloc(sizeof(*n), GFP_KERNEL);
+	if (n) {
+		int ret;
+
+		m->notifier = n;
+		n->notify.irq = m->msix.vector;
+		n->notify.notify = qib_irq_notifier_notify;
+		n->notify.release = qib_irq_notifier_release;
+		n->arg = m->arg;
+		n->rcv = m->rcv;
+		qib_devinfo(dd->pcidev,
+			"set notifier irq %d rcv %d notify %p\n",
+			n->notify.irq, n->rcv, &n->notify);
+		ret = irq_set_affinity_notifier(
+				n->notify.irq,
+				&n->notify);
+		if (ret) {
+			m->notifier = NULL;
+			kfree(n);
+		}
+	}
+}
+
+#endif
+
+/*
+ * Set up our chip-specific interrupt handler.
+ * The interrupt type has already been setup, so
+ * we just need to do the registration and error checking.
+ * If we are using MSIx interrupts, we may fall back to
+ * INTx later, if the interrupt handler doesn't get called
+ * within 1/2 second (see verify_interrupt()).
+ */
+static void qib_setup_7322_interrupt(struct qib_devdata *dd, int clearpend)
+{
+	int ret, i, msixnum;
+	u64 redirect[6];
+	u64 mask;
+	const struct cpumask *local_mask;
+	int firstcpu, secondcpu = 0, currrcvcpu = 0;
+
+	if (!dd->num_pports)
+		return;
+
+	if (clearpend) {
+		/*
+		 * if not switching interrupt types, be sure interrupts are
+		 * disabled, and then clear anything pending at this point,
+		 * because we are starting clean.
+		 */
+		qib_7322_set_intr_state(dd, 0);
+
+		/* clear the reset error, init error/hwerror mask */
+		qib_7322_init_hwerrors(dd);
+
+		/* clear any interrupt bits that might be set */
+		qib_write_kreg(dd, kr_intclear, ~0ULL);
+
+		/* make sure no pending MSIx intr, and clear diag reg */
+		qib_write_kreg(dd, kr_intgranted, ~0ULL);
+		qib_write_kreg(dd, kr_vecclr_wo_int, ~0ULL);
+	}
+
+	if (!dd->cspec->num_msix_entries) {
+		/* Try to get INTx interrupt */
+try_intx:
+		if (!dd->pcidev->irq) {
+			qib_dev_err(dd,
+				"irq is 0, BIOS error?  Interrupts won't work\n");
+			goto bail;
+		}
+		ret = request_irq(dd->pcidev->irq, qib_7322intr,
+				  IRQF_SHARED, QIB_DRV_NAME, dd);
+		if (ret) {
+			qib_dev_err(dd,
+				"Couldn't setup INTx interrupt (irq=%d): %d\n",
+				dd->pcidev->irq, ret);
+			goto bail;
+		}
+		dd->cspec->irq = dd->pcidev->irq;
+		dd->cspec->main_int_mask = ~0ULL;
+		goto bail;
+	}
+
+	/* Try to get MSIx interrupts */
+	memset(redirect, 0, sizeof redirect);
+	mask = ~0ULL;
+	msixnum = 0;
+	local_mask = cpumask_of_pcibus(dd->pcidev->bus);
+	firstcpu = cpumask_first(local_mask);
+	if (firstcpu >= nr_cpu_ids ||
+			cpumask_weight(local_mask) == num_online_cpus()) {
+		local_mask = topology_core_cpumask(0);
+		firstcpu = cpumask_first(local_mask);
+	}
+	if (firstcpu < nr_cpu_ids) {
+		secondcpu = cpumask_next(firstcpu, local_mask);
+		if (secondcpu >= nr_cpu_ids)
+			secondcpu = firstcpu;
+		currrcvcpu = secondcpu;
+	}
+	for (i = 0; msixnum < dd->cspec->num_msix_entries; i++) {
+		irq_handler_t handler;
+		void *arg;
+		u64 val;
+		int lsb, reg, sh;
+#ifdef CONFIG_INFINIBAND_QIB_DCA
+		int dca = 0;
+#endif
+
+		dd->cspec->msix_entries[msixnum].
+			name[sizeof(dd->cspec->msix_entries[msixnum].name) - 1]
+			= '\0';
+		if (i < ARRAY_SIZE(irq_table)) {
+			if (irq_table[i].port) {
+				/* skip if for a non-configured port */
+				if (irq_table[i].port > dd->num_pports)
+					continue;
+				arg = dd->pport + irq_table[i].port - 1;
+			} else
+				arg = dd;
+#ifdef CONFIG_INFINIBAND_QIB_DCA
+			dca = irq_table[i].dca;
+#endif
+			lsb = irq_table[i].lsb;
+			handler = irq_table[i].handler;
+			snprintf(dd->cspec->msix_entries[msixnum].name,
+				sizeof(dd->cspec->msix_entries[msixnum].name)
+				 - 1,
+				QIB_DRV_NAME "%d%s", dd->unit,
+				irq_table[i].name);
+		} else {
+			unsigned ctxt;
+
+			ctxt = i - ARRAY_SIZE(irq_table);
+			/* per krcvq context receive interrupt */
+			arg = dd->rcd[ctxt];
+			if (!arg)
+				continue;
+			if (qib_krcvq01_no_msi && ctxt < 2)
+				continue;
+#ifdef CONFIG_INFINIBAND_QIB_DCA
+			dca = 1;
+#endif
+			lsb = QIB_I_RCVAVAIL_LSB + ctxt;
+			handler = qib_7322pintr;
+			snprintf(dd->cspec->msix_entries[msixnum].name,
+				sizeof(dd->cspec->msix_entries[msixnum].name)
+				 - 1,
+				QIB_DRV_NAME "%d (kctx)", dd->unit);
+		}
+		ret = request_irq(
+			dd->cspec->msix_entries[msixnum].msix.vector,
+			handler, 0, dd->cspec->msix_entries[msixnum].name,
+			arg);
+		if (ret) {
+			/*
+			 * Shouldn't happen since the enable said we could
+			 * have as many as we are trying to setup here.
+			 */
+			qib_dev_err(dd,
+				"Couldn't setup MSIx interrupt (vec=%d, irq=%d): %d\n",
+				msixnum,
+				dd->cspec->msix_entries[msixnum].msix.vector,
+				ret);
+			qib_7322_nomsix(dd);
+			goto try_intx;
+		}
+		dd->cspec->msix_entries[msixnum].arg = arg;
+#ifdef CONFIG_INFINIBAND_QIB_DCA
+		dd->cspec->msix_entries[msixnum].dca = dca;
+		dd->cspec->msix_entries[msixnum].rcv =
+			handler == qib_7322pintr;
+#endif
+		if (lsb >= 0) {
+			reg = lsb / IBA7322_REDIRECT_VEC_PER_REG;
+			sh = (lsb % IBA7322_REDIRECT_VEC_PER_REG) *
+				SYM_LSB(IntRedirect0, vec1);
+			mask &= ~(1ULL << lsb);
+			redirect[reg] |= ((u64) msixnum) << sh;
+		}
+		val = qib_read_kreg64(dd, 2 * msixnum + 1 +
+			(QIB_7322_MsixTable_OFFS / sizeof(u64)));
+		if (firstcpu < nr_cpu_ids &&
+			zalloc_cpumask_var(
+				&dd->cspec->msix_entries[msixnum].mask,
+				GFP_KERNEL)) {
+			if (handler == qib_7322pintr) {
+				cpumask_set_cpu(currrcvcpu,
+					dd->cspec->msix_entries[msixnum].mask);
+				currrcvcpu = cpumask_next(currrcvcpu,
+					local_mask);
+				if (currrcvcpu >= nr_cpu_ids)
+					currrcvcpu = secondcpu;
+			} else {
+				cpumask_set_cpu(firstcpu,
+					dd->cspec->msix_entries[msixnum].mask);
+			}
+			irq_set_affinity_hint(
+				dd->cspec->msix_entries[msixnum].msix.vector,
+				dd->cspec->msix_entries[msixnum].mask);
+		}
+		msixnum++;
+	}
+	/* Initialize the vector mapping */
+	for (i = 0; i < ARRAY_SIZE(redirect); i++)
+		qib_write_kreg(dd, kr_intredirect + i, redirect[i]);
+	dd->cspec->main_int_mask = mask;
+	tasklet_init(&dd->error_tasklet, qib_error_tasklet,
+		(unsigned long)dd);
+bail:;
+}
+
+/**
+ * qib_7322_boardname - fill in the board name and note features
+ * @dd: the qlogic_ib device
+ *
+ * info will be based on the board revision register
+ */
+static unsigned qib_7322_boardname(struct qib_devdata *dd)
+{
+	/* Will need enumeration of board-types here */
+	char *n;
+	u32 boardid, namelen;
+	unsigned features = DUAL_PORT_CAP;
+
+	boardid = SYM_FIELD(dd->revision, Revision, BoardID);
+
+	switch (boardid) {
+	case 0:
+		n = "InfiniPath_QLE7342_Emulation";
+		break;
+	case 1:
+		n = "InfiniPath_QLE7340";
+		dd->flags |= QIB_HAS_QSFP;
+		features = PORT_SPD_CAP;
+		break;
+	case 2:
+		n = "InfiniPath_QLE7342";
+		dd->flags |= QIB_HAS_QSFP;
+		break;
+	case 3:
+		n = "InfiniPath_QMI7342";
+		break;
+	case 4:
+		n = "InfiniPath_Unsupported7342";
+		qib_dev_err(dd, "Unsupported version of QMH7342\n");
+		features = 0;
+		break;
+	case BOARD_QMH7342:
+		n = "InfiniPath_QMH7342";
+		features = 0x24;
+		break;
+	case BOARD_QME7342:
+		n = "InfiniPath_QME7342";
+		break;
+	case 8:
+		n = "InfiniPath_QME7362";
+		dd->flags |= QIB_HAS_QSFP;
+		break;
+	case 15:
+		n = "InfiniPath_QLE7342_TEST";
+		dd->flags |= QIB_HAS_QSFP;
+		break;
+	default:
+		n = "InfiniPath_QLE73xy_UNKNOWN";
+		qib_dev_err(dd, "Unknown 7322 board type %u\n", boardid);
+		break;
+	}
+	dd->board_atten = 1; /* index into txdds_Xdr */
+
+	namelen = strlen(n) + 1;
+	dd->boardname = kmalloc(namelen, GFP_KERNEL);
+	if (!dd->boardname)
+		qib_dev_err(dd, "Failed allocation for board name: %s\n", n);
+	else
+		snprintf(dd->boardname, namelen, "%s", n);
+
+	snprintf(dd->boardversion, sizeof(dd->boardversion),
+		 "ChipABI %u.%u, %s, InfiniPath%u %u.%u, SW Compat %u\n",
+		 QIB_CHIP_VERS_MAJ, QIB_CHIP_VERS_MIN, dd->boardname,
+		 (unsigned)SYM_FIELD(dd->revision, Revision_R, Arch),
+		 dd->majrev, dd->minrev,
+		 (unsigned)SYM_FIELD(dd->revision, Revision_R, SW));
+
+	if (qib_singleport && (features >> PORT_SPD_CAP_SHIFT) & PORT_SPD_CAP) {
+		qib_devinfo(dd->pcidev,
+			"IB%u: Forced to single port mode by module parameter\n",
+			dd->unit);
+		features &= PORT_SPD_CAP;
+	}
+
+	return features;
+}
+
+/*
+ * This routine sleeps, so it can only be called from user context, not
+ * from interrupt context.
+ */
+static int qib_do_7322_reset(struct qib_devdata *dd)
+{
+	u64 val;
+	u64 *msix_vecsave;
+	int i, msix_entries, ret = 1;
+	u16 cmdval;
+	u8 int_line, clinesz;
+	unsigned long flags;
+
+	/* Use dev_err so it shows up in logs, etc. */
+	qib_dev_err(dd, "Resetting InfiniPath unit %u\n", dd->unit);
+
+	qib_pcie_getcmd(dd, &cmdval, &int_line, &clinesz);
+
+	msix_entries = dd->cspec->num_msix_entries;
+
+	/* no interrupts till re-initted */
+	qib_7322_set_intr_state(dd, 0);
+
+	if (msix_entries) {
+		qib_7322_nomsix(dd);
+		/* can be up to 512 bytes, too big for stack */
+		msix_vecsave = kmalloc(2 * dd->cspec->num_msix_entries *
+			sizeof(u64), GFP_KERNEL);
+		if (!msix_vecsave)
+			qib_dev_err(dd, "No mem to save MSIx data\n");
+	} else
+		msix_vecsave = NULL;
+
+	/*
+	 * Core PCI (as of 2.6.18) doesn't save or rewrite the full vector
+	 * info that is set up by the BIOS, so we have to save and restore
+	 * it ourselves.   There is some risk something could change it,
+	 * after we save it, but since we have disabled the MSIx, it
+	 * shouldn't be touched...
+	 */
+	for (i = 0; i < msix_entries; i++) {
+		u64 vecaddr, vecdata;
+		vecaddr = qib_read_kreg64(dd, 2 * i +
+				  (QIB_7322_MsixTable_OFFS / sizeof(u64)));
+		vecdata = qib_read_kreg64(dd, 1 + 2 * i +
+				  (QIB_7322_MsixTable_OFFS / sizeof(u64)));
+		if (msix_vecsave) {
+			msix_vecsave[2 * i] = vecaddr;
+			/* save it without the masked bit set */
+			msix_vecsave[1 + 2 * i] = vecdata & ~0x100000000ULL;
+		}
+	}
+
+	dd->pport->cpspec->ibdeltainprog = 0;
+	dd->pport->cpspec->ibsymdelta = 0;
+	dd->pport->cpspec->iblnkerrdelta = 0;
+	dd->pport->cpspec->ibmalfdelta = 0;
+	/* so we check interrupts work again */
+	dd->z_int_counter = qib_int_counter(dd);
+
+	/*
+	 * Keep chip from being accessed until we are ready.  Use
+	 * writeq() directly, to allow the write even though QIB_PRESENT
+	 * isn't set.
+	 */
+	dd->flags &= ~(QIB_INITTED | QIB_PRESENT | QIB_BADINTR);
+	dd->flags |= QIB_DOING_RESET;
+	val = dd->control | QLOGIC_IB_C_RESET;
+	writeq(val, &dd->kregbase[kr_control]);
+
+	for (i = 1; i <= 5; i++) {
+		/*
+		 * Allow MBIST, etc. to complete; longer on each retry.
+		 * We sometimes get machine checks from bus timeout if no
+		 * response, so for now, make it *really* long.
+		 */
+		msleep(1000 + (1 + i) * 3000);
+
+		qib_pcie_reenable(dd, cmdval, int_line, clinesz);
+
+		/*
+		 * Use readq directly, so we don't need to mark it as PRESENT
+		 * until we get a successful indication that all is well.
+		 */
+		val = readq(&dd->kregbase[kr_revision]);
+		if (val == dd->revision)
+			break;
+		if (i == 5) {
+			qib_dev_err(dd,
+				"Failed to initialize after reset, unusable\n");
+			ret = 0;
+			goto  bail;
+		}
+	}
+
+	dd->flags |= QIB_PRESENT; /* it's back */
+
+	if (msix_entries) {
+		/* restore the MSIx vector address and data if saved above */
+		for (i = 0; i < msix_entries; i++) {
+			dd->cspec->msix_entries[i].msix.entry = i;
+			if (!msix_vecsave || !msix_vecsave[2 * i])
+				continue;
+			qib_write_kreg(dd, 2 * i +
+				(QIB_7322_MsixTable_OFFS / sizeof(u64)),
+				msix_vecsave[2 * i]);
+			qib_write_kreg(dd, 1 + 2 * i +
+				(QIB_7322_MsixTable_OFFS / sizeof(u64)),
+				msix_vecsave[1 + 2 * i]);
+		}
+	}
+
+	/* initialize the remaining registers.  */
+	for (i = 0; i < dd->num_pports; ++i)
+		write_7322_init_portregs(&dd->pport[i]);
+	write_7322_initregs(dd);
+
+	if (qib_pcie_params(dd, dd->lbus_width,
+			    &dd->cspec->num_msix_entries,
+			    dd->cspec->msix_entries))
+		qib_dev_err(dd,
+			"Reset failed to setup PCIe or interrupts; continuing anyway\n");
+
+	qib_setup_7322_interrupt(dd, 1);
+
+	for (i = 0; i < dd->num_pports; ++i) {
+		struct qib_pportdata *ppd = &dd->pport[i];
+
+		spin_lock_irqsave(&ppd->lflags_lock, flags);
+		ppd->lflags |= QIBL_IB_FORCE_NOTIFY;
+		ppd->lflags &= ~QIBL_IB_AUTONEG_FAILED;
+		spin_unlock_irqrestore(&ppd->lflags_lock, flags);
+	}
+
+bail:
+	dd->flags &= ~QIB_DOING_RESET; /* OK or not, no longer resetting */
+	kfree(msix_vecsave);
+	return ret;
+}
+
+/**
+ * qib_7322_put_tid - write a TID to the chip
+ * @dd: the qlogic_ib device
+ * @tidptr: pointer to the expected TID (in chip) to update
+ * @tidtype: 0 for eager, 1 for expected
+ * @pa: physical address of in memory buffer; tidinvalid if freeing
+ */
+static void qib_7322_put_tid(struct qib_devdata *dd, u64 __iomem *tidptr,
+			     u32 type, unsigned long pa)
+{
+	if (!(dd->flags & QIB_PRESENT))
+		return;
+	if (pa != dd->tidinvalid) {
+		u64 chippa = pa >> IBA7322_TID_PA_SHIFT;
+
+		/* paranoia checks */
+		if (pa != (chippa << IBA7322_TID_PA_SHIFT)) {
+			qib_dev_err(dd, "Physaddr %lx not 2KB aligned!\n",
+				    pa);
+			return;
+		}
+		if (chippa >= (1UL << IBA7322_TID_SZ_SHIFT)) {
+			qib_dev_err(dd,
+				"Physical page address 0x%lx larger than supported\n",
+				pa);
+			return;
+		}
+
+		if (type == RCVHQ_RCV_TYPE_EAGER)
+			chippa |= dd->tidtemplate;
+		else /* for now, always full 4KB page */
+			chippa |= IBA7322_TID_SZ_4K;
+		pa = chippa;
+	}
+	writeq(pa, tidptr);
+	mmiowb();
+}
+
+/**
+ * qib_7322_clear_tids - clear all TID entries for a ctxt, expected and eager
+ * @dd: the qlogic_ib device
+ * @ctxt: the ctxt
+ *
+ * clear all TID entries for a ctxt, expected and eager.
+ * Used from qib_close().
+ */
+static void qib_7322_clear_tids(struct qib_devdata *dd,
+				struct qib_ctxtdata *rcd)
+{
+	u64 __iomem *tidbase;
+	unsigned long tidinv;
+	u32 ctxt;
+	int i;
+
+	if (!dd->kregbase || !rcd)
+		return;
+
+	ctxt = rcd->ctxt;
+
+	tidinv = dd->tidinvalid;
+	tidbase = (u64 __iomem *)
+		((char __iomem *) dd->kregbase +
+		 dd->rcvtidbase +
+		 ctxt * dd->rcvtidcnt * sizeof(*tidbase));
+
+	for (i = 0; i < dd->rcvtidcnt; i++)
+		qib_7322_put_tid(dd, &tidbase[i], RCVHQ_RCV_TYPE_EXPECTED,
+				 tidinv);
+
+	tidbase = (u64 __iomem *)
+		((char __iomem *) dd->kregbase +
+		 dd->rcvegrbase +
+		 rcd->rcvegr_tid_base * sizeof(*tidbase));
+
+	for (i = 0; i < rcd->rcvegrcnt; i++)
+		qib_7322_put_tid(dd, &tidbase[i], RCVHQ_RCV_TYPE_EAGER,
+				 tidinv);
+}
+
+/**
+ * qib_7322_tidtemplate - setup constants for TID updates
+ * @dd: the qlogic_ib device
+ *
+ * We setup stuff that we use a lot, to avoid calculating each time
+ */
+static void qib_7322_tidtemplate(struct qib_devdata *dd)
+{
+	/*
+	 * For now, we always allocate 4KB buffers (at init) so we can
+	 * receive max size packets.  We may want a module parameter to
+	 * specify 2KB or 4KB and/or make it per port instead of per device
+	 * for those who want to reduce memory footprint.  Note that the
+	 * rcvhdrentsize size must be large enough to hold the largest
+	 * IB header (currently 96 bytes) that we expect to handle (plus of
+	 * course the 2 dwords of RHF).
+	 */
+	if (dd->rcvegrbufsize == 2048)
+		dd->tidtemplate = IBA7322_TID_SZ_2K;
+	else if (dd->rcvegrbufsize == 4096)
+		dd->tidtemplate = IBA7322_TID_SZ_4K;
+	dd->tidinvalid = 0;
+}
+
+/**
+ * qib_init_7322_get_base_info - set chip-specific flags for user code
+ * @rcd: the qlogic_ib ctxt
+ * @kbase: qib_base_info pointer
+ *
+ * We set the PCIE flag because the lower bandwidth on PCIe vs
+ * HyperTransport can affect some user packet algorithims.
+ */
+
+static int qib_7322_get_base_info(struct qib_ctxtdata *rcd,
+				  struct qib_base_info *kinfo)
+{
+	kinfo->spi_runtime_flags |= QIB_RUNTIME_CTXT_MSB_IN_QP |
+		QIB_RUNTIME_PCIE | QIB_RUNTIME_NODMA_RTAIL |
+		QIB_RUNTIME_HDRSUPP | QIB_RUNTIME_SDMA;
+	if (rcd->dd->cspec->r1)
+		kinfo->spi_runtime_flags |= QIB_RUNTIME_RCHK;
+	if (rcd->dd->flags & QIB_USE_SPCL_TRIG)
+		kinfo->spi_runtime_flags |= QIB_RUNTIME_SPECIAL_TRIGGER;
+
+	return 0;
+}
+
+static struct qib_message_header *
+qib_7322_get_msgheader(struct qib_devdata *dd, __le32 *rhf_addr)
+{
+	u32 offset = qib_hdrget_offset(rhf_addr);
+
+	return (struct qib_message_header *)
+		(rhf_addr - dd->rhf_offset + offset);
+}
+
+/*
+ * Configure number of contexts.
+ */
+static void qib_7322_config_ctxts(struct qib_devdata *dd)
+{
+	unsigned long flags;
+	u32 nchipctxts;
+
+	nchipctxts = qib_read_kreg32(dd, kr_contextcnt);
+	dd->cspec->numctxts = nchipctxts;
+	if (qib_n_krcv_queues > 1 && dd->num_pports) {
+		dd->first_user_ctxt = NUM_IB_PORTS +
+			(qib_n_krcv_queues - 1) * dd->num_pports;
+		if (dd->first_user_ctxt > nchipctxts)
+			dd->first_user_ctxt = nchipctxts;
+		dd->n_krcv_queues = dd->first_user_ctxt / dd->num_pports;
+	} else {
+		dd->first_user_ctxt = NUM_IB_PORTS;
+		dd->n_krcv_queues = 1;
+	}
+
+	if (!qib_cfgctxts) {
+		int nctxts = dd->first_user_ctxt + num_online_cpus();
+
+		if (nctxts <= 6)
+			dd->ctxtcnt = 6;
+		else if (nctxts <= 10)
+			dd->ctxtcnt = 10;
+		else if (nctxts <= nchipctxts)
+			dd->ctxtcnt = nchipctxts;
+	} else if (qib_cfgctxts < dd->num_pports)
+		dd->ctxtcnt = dd->num_pports;
+	else if (qib_cfgctxts <= nchipctxts)
+		dd->ctxtcnt = qib_cfgctxts;
+	if (!dd->ctxtcnt) /* none of the above, set to max */
+		dd->ctxtcnt = nchipctxts;
+
+	/*
+	 * Chip can be configured for 6, 10, or 18 ctxts, and choice
+	 * affects number of eager TIDs per ctxt (1K, 2K, 4K).
+	 * Lock to be paranoid about later motion, etc.
+	 */
+	spin_lock_irqsave(&dd->cspec->rcvmod_lock, flags);
+	if (dd->ctxtcnt > 10)
+		dd->rcvctrl |= 2ULL << SYM_LSB(RcvCtrl, ContextCfg);
+	else if (dd->ctxtcnt > 6)
+		dd->rcvctrl |= 1ULL << SYM_LSB(RcvCtrl, ContextCfg);
+	/* else configure for default 6 receive ctxts */
+
+	/* The XRC opcode is 5. */
+	dd->rcvctrl |= 5ULL << SYM_LSB(RcvCtrl, XrcTypeCode);
+
+	/*
+	 * RcvCtrl *must* be written here so that the
+	 * chip understands how to change rcvegrcnt below.
+	 */
+	qib_write_kreg(dd, kr_rcvctrl, dd->rcvctrl);
+	spin_unlock_irqrestore(&dd->cspec->rcvmod_lock, flags);
+
+	/* kr_rcvegrcnt changes based on the number of contexts enabled */
+	dd->cspec->rcvegrcnt = qib_read_kreg32(dd, kr_rcvegrcnt);
+	if (qib_rcvhdrcnt)
+		dd->rcvhdrcnt = max(dd->cspec->rcvegrcnt, qib_rcvhdrcnt);
+	else
+		dd->rcvhdrcnt = 2 * max(dd->cspec->rcvegrcnt,
+				    dd->num_pports > 1 ? 1024U : 2048U);
+}
+
+static int qib_7322_get_ib_cfg(struct qib_pportdata *ppd, int which)
+{
+
+	int lsb, ret = 0;
+	u64 maskr; /* right-justified mask */
+
+	switch (which) {
+
+	case QIB_IB_CFG_LWID_ENB: /* Get allowed Link-width */
+		ret = ppd->link_width_enabled;
+		goto done;
+
+	case QIB_IB_CFG_LWID: /* Get currently active Link-width */
+		ret = ppd->link_width_active;
+		goto done;
+
+	case QIB_IB_CFG_SPD_ENB: /* Get allowed Link speeds */
+		ret = ppd->link_speed_enabled;
+		goto done;
+
+	case QIB_IB_CFG_SPD: /* Get current Link spd */
+		ret = ppd->link_speed_active;
+		goto done;
+
+	case QIB_IB_CFG_RXPOL_ENB: /* Get Auto-RX-polarity enable */
+		lsb = SYM_LSB(IBCCtrlB_0, IB_POLARITY_REV_SUPP);
+		maskr = SYM_RMASK(IBCCtrlB_0, IB_POLARITY_REV_SUPP);
+		break;
+
+	case QIB_IB_CFG_LREV_ENB: /* Get Auto-Lane-reversal enable */
+		lsb = SYM_LSB(IBCCtrlB_0, IB_LANE_REV_SUPPORTED);
+		maskr = SYM_RMASK(IBCCtrlB_0, IB_LANE_REV_SUPPORTED);
+		break;
+
+	case QIB_IB_CFG_LINKLATENCY:
+		ret = qib_read_kreg_port(ppd, krp_ibcstatus_b) &
+			SYM_MASK(IBCStatusB_0, LinkRoundTripLatency);
+		goto done;
+
+	case QIB_IB_CFG_OP_VLS:
+		ret = ppd->vls_operational;
+		goto done;
+
+	case QIB_IB_CFG_VL_HIGH_CAP:
+		ret = 16;
+		goto done;
+
+	case QIB_IB_CFG_VL_LOW_CAP:
+		ret = 16;
+		goto done;
+
+	case QIB_IB_CFG_OVERRUN_THRESH: /* IB overrun threshold */
+		ret = SYM_FIELD(ppd->cpspec->ibcctrl_a, IBCCtrlA_0,
+				OverrunThreshold);
+		goto done;
+
+	case QIB_IB_CFG_PHYERR_THRESH: /* IB PHY error threshold */
+		ret = SYM_FIELD(ppd->cpspec->ibcctrl_a, IBCCtrlA_0,
+				PhyerrThreshold);
+		goto done;
+
+	case QIB_IB_CFG_LINKDEFAULT: /* IB link default (sleep/poll) */
+		/* will only take effect when the link state changes */
+		ret = (ppd->cpspec->ibcctrl_a &
+		       SYM_MASK(IBCCtrlA_0, LinkDownDefaultState)) ?
+			IB_LINKINITCMD_SLEEP : IB_LINKINITCMD_POLL;
+		goto done;
+
+	case QIB_IB_CFG_HRTBT: /* Get Heartbeat off/enable/auto */
+		lsb = IBA7322_IBC_HRTBT_LSB;
+		maskr = IBA7322_IBC_HRTBT_RMASK; /* OR of AUTO and ENB */
+		break;
+
+	case QIB_IB_CFG_PMA_TICKS:
+		/*
+		 * 0x00 = 10x link transfer rate or 4 nsec. for 2.5Gbs
+		 * Since the clock is always 250MHz, the value is 3, 1 or 0.
+		 */
+		if (ppd->link_speed_active == QIB_IB_QDR)
+			ret = 3;
+		else if (ppd->link_speed_active == QIB_IB_DDR)
+			ret = 1;
+		else
+			ret = 0;
+		goto done;
+
+	default:
+		ret = -EINVAL;
+		goto done;
+	}
+	ret = (int)((ppd->cpspec->ibcctrl_b >> lsb) & maskr);
+done:
+	return ret;
+}
+
+/*
+ * Below again cribbed liberally from older version. Do not lean
+ * heavily on it.
+ */
+#define IBA7322_IBC_DLIDLMC_SHIFT QIB_7322_IBCCtrlB_0_IB_DLID_LSB
+#define IBA7322_IBC_DLIDLMC_MASK (QIB_7322_IBCCtrlB_0_IB_DLID_RMASK \
+	| (QIB_7322_IBCCtrlB_0_IB_DLID_MASK_RMASK << 16))
+
+static int qib_7322_set_ib_cfg(struct qib_pportdata *ppd, int which, u32 val)
+{
+	struct qib_devdata *dd = ppd->dd;
+	u64 maskr; /* right-justified mask */
+	int lsb, ret = 0;
+	u16 lcmd, licmd;
+	unsigned long flags;
+
+	switch (which) {
+	case QIB_IB_CFG_LIDLMC:
+		/*
+		 * Set LID and LMC. Combined to avoid possible hazard
+		 * caller puts LMC in 16MSbits, DLID in 16LSbits of val
+		 */
+		lsb = IBA7322_IBC_DLIDLMC_SHIFT;
+		maskr = IBA7322_IBC_DLIDLMC_MASK;
+		/*
+		 * For header-checking, the SLID in the packet will
+		 * be masked with SendIBSLMCMask, and compared
+		 * with SendIBSLIDAssignMask. Make sure we do not
+		 * set any bits not covered by the mask, or we get
+		 * false-positives.
+		 */
+		qib_write_kreg_port(ppd, krp_sendslid,
+				    val & (val >> 16) & SendIBSLIDAssignMask);
+		qib_write_kreg_port(ppd, krp_sendslidmask,
+				    (val >> 16) & SendIBSLMCMask);
+		break;
+
+	case QIB_IB_CFG_LWID_ENB: /* set allowed Link-width */
+		ppd->link_width_enabled = val;
+		/* convert IB value to chip register value */
+		if (val == IB_WIDTH_1X)
+			val = 0;
+		else if (val == IB_WIDTH_4X)
+			val = 1;
+		else
+			val = 3;
+		maskr = SYM_RMASK(IBCCtrlB_0, IB_NUM_CHANNELS);
+		lsb = SYM_LSB(IBCCtrlB_0, IB_NUM_CHANNELS);
+		break;
+
+	case QIB_IB_CFG_SPD_ENB: /* set allowed Link speeds */
+		/*
+		 * As with width, only write the actual register if the
+		 * link is currently down, otherwise takes effect on next
+		 * link change.  Since setting is being explicitly requested
+		 * (via MAD or sysfs), clear autoneg failure status if speed
+		 * autoneg is enabled.
+		 */
+		ppd->link_speed_enabled = val;
+		val <<= IBA7322_IBC_SPEED_LSB;
+		maskr = IBA7322_IBC_SPEED_MASK | IBA7322_IBC_IBTA_1_2_MASK |
+			IBA7322_IBC_MAX_SPEED_MASK;
+		if (val & (val - 1)) {
+			/* Muliple speeds enabled */
+			val |= IBA7322_IBC_IBTA_1_2_MASK |
+				IBA7322_IBC_MAX_SPEED_MASK;
+			spin_lock_irqsave(&ppd->lflags_lock, flags);
+			ppd->lflags &= ~QIBL_IB_AUTONEG_FAILED;
+			spin_unlock_irqrestore(&ppd->lflags_lock, flags);
+		} else if (val & IBA7322_IBC_SPEED_QDR)
+			val |= IBA7322_IBC_IBTA_1_2_MASK;
+		/* IBTA 1.2 mode + min/max + speed bits are contiguous */
+		lsb = SYM_LSB(IBCCtrlB_0, IB_ENHANCED_MODE);
+		break;
+
+	case QIB_IB_CFG_RXPOL_ENB: /* set Auto-RX-polarity enable */
+		lsb = SYM_LSB(IBCCtrlB_0, IB_POLARITY_REV_SUPP);
+		maskr = SYM_RMASK(IBCCtrlB_0, IB_POLARITY_REV_SUPP);
+		break;
+
+	case QIB_IB_CFG_LREV_ENB: /* set Auto-Lane-reversal enable */
+		lsb = SYM_LSB(IBCCtrlB_0, IB_LANE_REV_SUPPORTED);
+		maskr = SYM_RMASK(IBCCtrlB_0, IB_LANE_REV_SUPPORTED);
+		break;
+
+	case QIB_IB_CFG_OVERRUN_THRESH: /* IB overrun threshold */
+		maskr = SYM_FIELD(ppd->cpspec->ibcctrl_a, IBCCtrlA_0,
+				  OverrunThreshold);
+		if (maskr != val) {
+			ppd->cpspec->ibcctrl_a &=
+				~SYM_MASK(IBCCtrlA_0, OverrunThreshold);
+			ppd->cpspec->ibcctrl_a |= (u64) val <<
+				SYM_LSB(IBCCtrlA_0, OverrunThreshold);
+			qib_write_kreg_port(ppd, krp_ibcctrl_a,
+					    ppd->cpspec->ibcctrl_a);
+			qib_write_kreg(dd, kr_scratch, 0ULL);
+		}
+		goto bail;
+
+	case QIB_IB_CFG_PHYERR_THRESH: /* IB PHY error threshold */
+		maskr = SYM_FIELD(ppd->cpspec->ibcctrl_a, IBCCtrlA_0,
+				  PhyerrThreshold);
+		if (maskr != val) {
+			ppd->cpspec->ibcctrl_a &=
+				~SYM_MASK(IBCCtrlA_0, PhyerrThreshold);
+			ppd->cpspec->ibcctrl_a |= (u64) val <<
+				SYM_LSB(IBCCtrlA_0, PhyerrThreshold);
+			qib_write_kreg_port(ppd, krp_ibcctrl_a,
+					    ppd->cpspec->ibcctrl_a);
+			qib_write_kreg(dd, kr_scratch, 0ULL);
+		}
+		goto bail;
+
+	case QIB_IB_CFG_PKEYS: /* update pkeys */
+		maskr = (u64) ppd->pkeys[0] | ((u64) ppd->pkeys[1] << 16) |
+			((u64) ppd->pkeys[2] << 32) |
+			((u64) ppd->pkeys[3] << 48);
+		qib_write_kreg_port(ppd, krp_partitionkey, maskr);
+		goto bail;
+
+	case QIB_IB_CFG_LINKDEFAULT: /* IB link default (sleep/poll) */
+		/* will only take effect when the link state changes */
+		if (val == IB_LINKINITCMD_POLL)
+			ppd->cpspec->ibcctrl_a &=
+				~SYM_MASK(IBCCtrlA_0, LinkDownDefaultState);
+		else /* SLEEP */
+			ppd->cpspec->ibcctrl_a |=
+				SYM_MASK(IBCCtrlA_0, LinkDownDefaultState);
+		qib_write_kreg_port(ppd, krp_ibcctrl_a, ppd->cpspec->ibcctrl_a);
+		qib_write_kreg(dd, kr_scratch, 0ULL);
+		goto bail;
+
+	case QIB_IB_CFG_MTU: /* update the MTU in IBC */
+		/*
+		 * Update our housekeeping variables, and set IBC max
+		 * size, same as init code; max IBC is max we allow in
+		 * buffer, less the qword pbc, plus 1 for ICRC, in dwords
+		 * Set even if it's unchanged, print debug message only
+		 * on changes.
+		 */
+		val = (ppd->ibmaxlen >> 2) + 1;
+		ppd->cpspec->ibcctrl_a &= ~SYM_MASK(IBCCtrlA_0, MaxPktLen);
+		ppd->cpspec->ibcctrl_a |= (u64)val <<
+			SYM_LSB(IBCCtrlA_0, MaxPktLen);
+		qib_write_kreg_port(ppd, krp_ibcctrl_a,
+				    ppd->cpspec->ibcctrl_a);
+		qib_write_kreg(dd, kr_scratch, 0ULL);
+		goto bail;
+
+	case QIB_IB_CFG_LSTATE: /* set the IB link state */
+		switch (val & 0xffff0000) {
+		case IB_LINKCMD_DOWN:
+			lcmd = QLOGIC_IB_IBCC_LINKCMD_DOWN;
+			ppd->cpspec->ibmalfusesnap = 1;
+			ppd->cpspec->ibmalfsnap = read_7322_creg32_port(ppd,
+				crp_errlink);
+			if (!ppd->cpspec->ibdeltainprog &&
+			    qib_compat_ddr_negotiate) {
+				ppd->cpspec->ibdeltainprog = 1;
+				ppd->cpspec->ibsymsnap =
+					read_7322_creg32_port(ppd,
+							      crp_ibsymbolerr);
+				ppd->cpspec->iblnkerrsnap =
+					read_7322_creg32_port(ppd,
+						      crp_iblinkerrrecov);
+			}
+			break;
+
+		case IB_LINKCMD_ARMED:
+			lcmd = QLOGIC_IB_IBCC_LINKCMD_ARMED;
+			if (ppd->cpspec->ibmalfusesnap) {
+				ppd->cpspec->ibmalfusesnap = 0;
+				ppd->cpspec->ibmalfdelta +=
+					read_7322_creg32_port(ppd,
+							      crp_errlink) -
+					ppd->cpspec->ibmalfsnap;
+			}
+			break;
+
+		case IB_LINKCMD_ACTIVE:
+			lcmd = QLOGIC_IB_IBCC_LINKCMD_ACTIVE;
+			break;
+
+		default:
+			ret = -EINVAL;
+			qib_dev_err(dd, "bad linkcmd req 0x%x\n", val >> 16);
+			goto bail;
+		}
+		switch (val & 0xffff) {
+		case IB_LINKINITCMD_NOP:
+			licmd = 0;
+			break;
+
+		case IB_LINKINITCMD_POLL:
+			licmd = QLOGIC_IB_IBCC_LINKINITCMD_POLL;
+			break;
+
+		case IB_LINKINITCMD_SLEEP:
+			licmd = QLOGIC_IB_IBCC_LINKINITCMD_SLEEP;
+			break;
+
+		case IB_LINKINITCMD_DISABLE:
+			licmd = QLOGIC_IB_IBCC_LINKINITCMD_DISABLE;
+			ppd->cpspec->chase_end = 0;
+			/*
+			 * stop state chase counter and timer, if running.
+			 * wait forpending timer, but don't clear .data (ppd)!
+			 */
+			if (ppd->cpspec->chase_timer.expires) {
+				del_timer_sync(&ppd->cpspec->chase_timer);
+				ppd->cpspec->chase_timer.expires = 0;
+			}
+			break;
+
+		default:
+			ret = -EINVAL;
+			qib_dev_err(dd, "bad linkinitcmd req 0x%x\n",
+				    val & 0xffff);
+			goto bail;
+		}
+		qib_set_ib_7322_lstate(ppd, lcmd, licmd);
+		goto bail;
+
+	case QIB_IB_CFG_OP_VLS:
+		if (ppd->vls_operational != val) {
+			ppd->vls_operational = val;
+			set_vls(ppd);
+		}
+		goto bail;
+
+	case QIB_IB_CFG_VL_HIGH_LIMIT:
+		qib_write_kreg_port(ppd, krp_highprio_limit, val);
+		goto bail;
+
+	case QIB_IB_CFG_HRTBT: /* set Heartbeat off/enable/auto */
+		if (val > 3) {
+			ret = -EINVAL;
+			goto bail;
+		}
+		lsb = IBA7322_IBC_HRTBT_LSB;
+		maskr = IBA7322_IBC_HRTBT_RMASK; /* OR of AUTO and ENB */
+		break;
+
+	case QIB_IB_CFG_PORT:
+		/* val is the port number of the switch we are connected to. */
+		if (ppd->dd->cspec->r1) {
+			cancel_delayed_work(&ppd->cpspec->ipg_work);
+			ppd->cpspec->ipg_tries = 0;
+		}
+		goto bail;
+
+	default:
+		ret = -EINVAL;
+		goto bail;
+	}
+	ppd->cpspec->ibcctrl_b &= ~(maskr << lsb);
+	ppd->cpspec->ibcctrl_b |= (((u64) val & maskr) << lsb);
+	qib_write_kreg_port(ppd, krp_ibcctrl_b, ppd->cpspec->ibcctrl_b);
+	qib_write_kreg(dd, kr_scratch, 0);
+bail:
+	return ret;
+}
+
+static int qib_7322_set_loopback(struct qib_pportdata *ppd, const char *what)
+{
+	int ret = 0;
+	u64 val, ctrlb;
+
+	/* only IBC loopback, may add serdes and xgxs loopbacks later */
+	if (!strncmp(what, "ibc", 3)) {
+		ppd->cpspec->ibcctrl_a |= SYM_MASK(IBCCtrlA_0,
+						       Loopback);
+		val = 0; /* disable heart beat, so link will come up */
+		qib_devinfo(ppd->dd->pcidev, "Enabling IB%u:%u IBC loopback\n",
+			 ppd->dd->unit, ppd->port);
+	} else if (!strncmp(what, "off", 3)) {
+		ppd->cpspec->ibcctrl_a &= ~SYM_MASK(IBCCtrlA_0,
+							Loopback);
+		/* enable heart beat again */
+		val = IBA7322_IBC_HRTBT_RMASK << IBA7322_IBC_HRTBT_LSB;
+		qib_devinfo(ppd->dd->pcidev,
+			"Disabling IB%u:%u IBC loopback (normal)\n",
+			ppd->dd->unit, ppd->port);
+	} else
+		ret = -EINVAL;
+	if (!ret) {
+		qib_write_kreg_port(ppd, krp_ibcctrl_a,
+				    ppd->cpspec->ibcctrl_a);
+		ctrlb = ppd->cpspec->ibcctrl_b & ~(IBA7322_IBC_HRTBT_MASK
+					     << IBA7322_IBC_HRTBT_LSB);
+		ppd->cpspec->ibcctrl_b = ctrlb | val;
+		qib_write_kreg_port(ppd, krp_ibcctrl_b,
+				    ppd->cpspec->ibcctrl_b);
+		qib_write_kreg(ppd->dd, kr_scratch, 0);
+	}
+	return ret;
+}
+
+static void get_vl_weights(struct qib_pportdata *ppd, unsigned regno,
+			   struct ib_vl_weight_elem *vl)
+{
+	unsigned i;
+
+	for (i = 0; i < 16; i++, regno++, vl++) {
+		u32 val = qib_read_kreg_port(ppd, regno);
+
+		vl->vl = (val >> SYM_LSB(LowPriority0_0, VirtualLane)) &
+			SYM_RMASK(LowPriority0_0, VirtualLane);
+		vl->weight = (val >> SYM_LSB(LowPriority0_0, Weight)) &
+			SYM_RMASK(LowPriority0_0, Weight);
+	}
+}
+
+static void set_vl_weights(struct qib_pportdata *ppd, unsigned regno,
+			   struct ib_vl_weight_elem *vl)
+{
+	unsigned i;
+
+	for (i = 0; i < 16; i++, regno++, vl++) {
+		u64 val;
+
+		val = ((vl->vl & SYM_RMASK(LowPriority0_0, VirtualLane)) <<
+			SYM_LSB(LowPriority0_0, VirtualLane)) |
+		      ((vl->weight & SYM_RMASK(LowPriority0_0, Weight)) <<
+			SYM_LSB(LowPriority0_0, Weight));
+		qib_write_kreg_port(ppd, regno, val);
+	}
+	if (!(ppd->p_sendctrl & SYM_MASK(SendCtrl_0, IBVLArbiterEn))) {
+		struct qib_devdata *dd = ppd->dd;
+		unsigned long flags;
+
+		spin_lock_irqsave(&dd->sendctrl_lock, flags);
+		ppd->p_sendctrl |= SYM_MASK(SendCtrl_0, IBVLArbiterEn);
+		qib_write_kreg_port(ppd, krp_sendctrl, ppd->p_sendctrl);
+		qib_write_kreg(dd, kr_scratch, 0);
+		spin_unlock_irqrestore(&dd->sendctrl_lock, flags);
+	}
+}
+
+static int qib_7322_get_ib_table(struct qib_pportdata *ppd, int which, void *t)
+{
+	switch (which) {
+	case QIB_IB_TBL_VL_HIGH_ARB:
+		get_vl_weights(ppd, krp_highprio_0, t);
+		break;
+
+	case QIB_IB_TBL_VL_LOW_ARB:
+		get_vl_weights(ppd, krp_lowprio_0, t);
+		break;
+
+	default:
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static int qib_7322_set_ib_table(struct qib_pportdata *ppd, int which, void *t)
+{
+	switch (which) {
+	case QIB_IB_TBL_VL_HIGH_ARB:
+		set_vl_weights(ppd, krp_highprio_0, t);
+		break;
+
+	case QIB_IB_TBL_VL_LOW_ARB:
+		set_vl_weights(ppd, krp_lowprio_0, t);
+		break;
+
+	default:
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static void qib_update_7322_usrhead(struct qib_ctxtdata *rcd, u64 hd,
+				    u32 updegr, u32 egrhd, u32 npkts)
+{
+	/*
+	 * Need to write timeout register before updating rcvhdrhead to ensure
+	 * that the timer is enabled on reception of a packet.
+	 */
+	if (hd >> IBA7322_HDRHEAD_PKTINT_SHIFT)
+		adjust_rcv_timeout(rcd, npkts);
+	if (updegr)
+		qib_write_ureg(rcd->dd, ur_rcvegrindexhead, egrhd, rcd->ctxt);
+	mmiowb();
+	qib_write_ureg(rcd->dd, ur_rcvhdrhead, hd, rcd->ctxt);
+	qib_write_ureg(rcd->dd, ur_rcvhdrhead, hd, rcd->ctxt);
+	mmiowb();
+}
+
+static u32 qib_7322_hdrqempty(struct qib_ctxtdata *rcd)
+{
+	u32 head, tail;
+
+	head = qib_read_ureg32(rcd->dd, ur_rcvhdrhead, rcd->ctxt);
+	if (rcd->rcvhdrtail_kvaddr)
+		tail = qib_get_rcvhdrtail(rcd);
+	else
+		tail = qib_read_ureg32(rcd->dd, ur_rcvhdrtail, rcd->ctxt);
+	return head == tail;
+}
+
+#define RCVCTRL_COMMON_MODS (QIB_RCVCTRL_CTXT_ENB | \
+	QIB_RCVCTRL_CTXT_DIS | \
+	QIB_RCVCTRL_TIDFLOW_ENB | \
+	QIB_RCVCTRL_TIDFLOW_DIS | \
+	QIB_RCVCTRL_TAILUPD_ENB | \
+	QIB_RCVCTRL_TAILUPD_DIS | \
+	QIB_RCVCTRL_INTRAVAIL_ENB | \
+	QIB_RCVCTRL_INTRAVAIL_DIS | \
+	QIB_RCVCTRL_BP_ENB | \
+	QIB_RCVCTRL_BP_DIS)
+
+#define RCVCTRL_PORT_MODS (QIB_RCVCTRL_CTXT_ENB | \
+	QIB_RCVCTRL_CTXT_DIS | \
+	QIB_RCVCTRL_PKEY_DIS | \
+	QIB_RCVCTRL_PKEY_ENB)
+
+/*
+ * Modify the RCVCTRL register in chip-specific way. This
+ * is a function because bit positions and (future) register
+ * location is chip-specifc, but the needed operations are
+ * generic. <op> is a bit-mask because we often want to
+ * do multiple modifications.
+ */
+static void rcvctrl_7322_mod(struct qib_pportdata *ppd, unsigned int op,
+			     int ctxt)
+{
+	struct qib_devdata *dd = ppd->dd;
+	struct qib_ctxtdata *rcd;
+	u64 mask, val;
+	unsigned long flags;
+
+	spin_lock_irqsave(&dd->cspec->rcvmod_lock, flags);
+
+	if (op & QIB_RCVCTRL_TIDFLOW_ENB)
+		dd->rcvctrl |= SYM_MASK(RcvCtrl, TidFlowEnable);
+	if (op & QIB_RCVCTRL_TIDFLOW_DIS)
+		dd->rcvctrl &= ~SYM_MASK(RcvCtrl, TidFlowEnable);
+	if (op & QIB_RCVCTRL_TAILUPD_ENB)
+		dd->rcvctrl |= SYM_MASK(RcvCtrl, TailUpd);
+	if (op & QIB_RCVCTRL_TAILUPD_DIS)
+		dd->rcvctrl &= ~SYM_MASK(RcvCtrl, TailUpd);
+	if (op & QIB_RCVCTRL_PKEY_ENB)
+		ppd->p_rcvctrl &= ~SYM_MASK(RcvCtrl_0, RcvPartitionKeyDisable);
+	if (op & QIB_RCVCTRL_PKEY_DIS)
+		ppd->p_rcvctrl |= SYM_MASK(RcvCtrl_0, RcvPartitionKeyDisable);
+	if (ctxt < 0) {
+		mask = (1ULL << dd->ctxtcnt) - 1;
+		rcd = NULL;
+	} else {
+		mask = (1ULL << ctxt);
+		rcd = dd->rcd[ctxt];
+	}
+	if ((op & QIB_RCVCTRL_CTXT_ENB) && rcd) {
+		ppd->p_rcvctrl |=
+			(mask << SYM_LSB(RcvCtrl_0, ContextEnableKernel));
+		if (!(dd->flags & QIB_NODMA_RTAIL)) {
+			op |= QIB_RCVCTRL_TAILUPD_ENB; /* need reg write */
+			dd->rcvctrl |= SYM_MASK(RcvCtrl, TailUpd);
+		}
+		/* Write these registers before the context is enabled. */
+		qib_write_kreg_ctxt(dd, krc_rcvhdrtailaddr, ctxt,
+				    rcd->rcvhdrqtailaddr_phys);
+		qib_write_kreg_ctxt(dd, krc_rcvhdraddr, ctxt,
+				    rcd->rcvhdrq_phys);
+		rcd->seq_cnt = 1;
+	}
+	if (op & QIB_RCVCTRL_CTXT_DIS)
+		ppd->p_rcvctrl &=
+			~(mask << SYM_LSB(RcvCtrl_0, ContextEnableKernel));
+	if (op & QIB_RCVCTRL_BP_ENB)
+		dd->rcvctrl |= mask << SYM_LSB(RcvCtrl, dontDropRHQFull);
+	if (op & QIB_RCVCTRL_BP_DIS)
+		dd->rcvctrl &= ~(mask << SYM_LSB(RcvCtrl, dontDropRHQFull));
+	if (op & QIB_RCVCTRL_INTRAVAIL_ENB)
+		dd->rcvctrl |= (mask << SYM_LSB(RcvCtrl, IntrAvail));
+	if (op & QIB_RCVCTRL_INTRAVAIL_DIS)
+		dd->rcvctrl &= ~(mask << SYM_LSB(RcvCtrl, IntrAvail));
+	/*
+	 * Decide which registers to write depending on the ops enabled.
+	 * Special case is "flush" (no bits set at all)
+	 * which needs to write both.
+	 */
+	if (op == 0 || (op & RCVCTRL_COMMON_MODS))
+		qib_write_kreg(dd, kr_rcvctrl, dd->rcvctrl);
+	if (op == 0 || (op & RCVCTRL_PORT_MODS))
+		qib_write_kreg_port(ppd, krp_rcvctrl, ppd->p_rcvctrl);
+	if ((op & QIB_RCVCTRL_CTXT_ENB) && dd->rcd[ctxt]) {
+		/*
+		 * Init the context registers also; if we were
+		 * disabled, tail and head should both be zero
+		 * already from the enable, but since we don't
+		 * know, we have to do it explicitly.
+		 */
+		val = qib_read_ureg32(dd, ur_rcvegrindextail, ctxt);
+		qib_write_ureg(dd, ur_rcvegrindexhead, val, ctxt);
+
+		/* be sure enabling write seen; hd/tl should be 0 */
+		(void) qib_read_kreg32(dd, kr_scratch);
+		val = qib_read_ureg32(dd, ur_rcvhdrtail, ctxt);
+		dd->rcd[ctxt]->head = val;
+		/* If kctxt, interrupt on next receive. */
+		if (ctxt < dd->first_user_ctxt)
+			val |= dd->rhdrhead_intr_off;
+		qib_write_ureg(dd, ur_rcvhdrhead, val, ctxt);
+	} else if ((op & QIB_RCVCTRL_INTRAVAIL_ENB) &&
+		dd->rcd[ctxt] && dd->rhdrhead_intr_off) {
+		/* arm rcv interrupt */
+		val = dd->rcd[ctxt]->head | dd->rhdrhead_intr_off;
+		qib_write_ureg(dd, ur_rcvhdrhead, val, ctxt);
+	}
+	if (op & QIB_RCVCTRL_CTXT_DIS) {
+		unsigned f;
+
+		/* Now that the context is disabled, clear these registers. */
+		if (ctxt >= 0) {
+			qib_write_kreg_ctxt(dd, krc_rcvhdrtailaddr, ctxt, 0);
+			qib_write_kreg_ctxt(dd, krc_rcvhdraddr, ctxt, 0);
+			for (f = 0; f < NUM_TIDFLOWS_CTXT; f++)
+				qib_write_ureg(dd, ur_rcvflowtable + f,
+					       TIDFLOW_ERRBITS, ctxt);
+		} else {
+			unsigned i;
+
+			for (i = 0; i < dd->cfgctxts; i++) {
+				qib_write_kreg_ctxt(dd, krc_rcvhdrtailaddr,
+						    i, 0);
+				qib_write_kreg_ctxt(dd, krc_rcvhdraddr, i, 0);
+				for (f = 0; f < NUM_TIDFLOWS_CTXT; f++)
+					qib_write_ureg(dd, ur_rcvflowtable + f,
+						       TIDFLOW_ERRBITS, i);
+			}
+		}
+	}
+	spin_unlock_irqrestore(&dd->cspec->rcvmod_lock, flags);
+}
+
+/*
+ * Modify the SENDCTRL register in chip-specific way. This
+ * is a function where there are multiple such registers with
+ * slightly different layouts.
+ * The chip doesn't allow back-to-back sendctrl writes, so write
+ * the scratch register after writing sendctrl.
+ *
+ * Which register is written depends on the operation.
+ * Most operate on the common register, while
+ * SEND_ENB and SEND_DIS operate on the per-port ones.
+ * SEND_ENB is included in common because it can change SPCL_TRIG
+ */
+#define SENDCTRL_COMMON_MODS (\
+	QIB_SENDCTRL_CLEAR | \
+	QIB_SENDCTRL_AVAIL_DIS | \
+	QIB_SENDCTRL_AVAIL_ENB | \
+	QIB_SENDCTRL_AVAIL_BLIP | \
+	QIB_SENDCTRL_DISARM | \
+	QIB_SENDCTRL_DISARM_ALL | \
+	QIB_SENDCTRL_SEND_ENB)
+
+#define SENDCTRL_PORT_MODS (\
+	QIB_SENDCTRL_CLEAR | \
+	QIB_SENDCTRL_SEND_ENB | \
+	QIB_SENDCTRL_SEND_DIS | \
+	QIB_SENDCTRL_FLUSH)
+
+static void sendctrl_7322_mod(struct qib_pportdata *ppd, u32 op)
+{
+	struct qib_devdata *dd = ppd->dd;
+	u64 tmp_dd_sendctrl;
+	unsigned long flags;
+
+	spin_lock_irqsave(&dd->sendctrl_lock, flags);
+
+	/* First the dd ones that are "sticky", saved in shadow */
+	if (op & QIB_SENDCTRL_CLEAR)
+		dd->sendctrl = 0;
+	if (op & QIB_SENDCTRL_AVAIL_DIS)
+		dd->sendctrl &= ~SYM_MASK(SendCtrl, SendBufAvailUpd);
+	else if (op & QIB_SENDCTRL_AVAIL_ENB) {
+		dd->sendctrl |= SYM_MASK(SendCtrl, SendBufAvailUpd);
+		if (dd->flags & QIB_USE_SPCL_TRIG)
+			dd->sendctrl |= SYM_MASK(SendCtrl, SpecialTriggerEn);
+	}
+
+	/* Then the ppd ones that are "sticky", saved in shadow */
+	if (op & QIB_SENDCTRL_SEND_DIS)
+		ppd->p_sendctrl &= ~SYM_MASK(SendCtrl_0, SendEnable);
+	else if (op & QIB_SENDCTRL_SEND_ENB)
+		ppd->p_sendctrl |= SYM_MASK(SendCtrl_0, SendEnable);
+
+	if (op & QIB_SENDCTRL_DISARM_ALL) {
+		u32 i, last;
+
+		tmp_dd_sendctrl = dd->sendctrl;
+		last = dd->piobcnt2k + dd->piobcnt4k + NUM_VL15_BUFS;
+		/*
+		 * Disarm any buffers that are not yet launched,
+		 * disabling updates until done.
+		 */
+		tmp_dd_sendctrl &= ~SYM_MASK(SendCtrl, SendBufAvailUpd);
+		for (i = 0; i < last; i++) {
+			qib_write_kreg(dd, kr_sendctrl,
+				       tmp_dd_sendctrl |
+				       SYM_MASK(SendCtrl, Disarm) | i);
+			qib_write_kreg(dd, kr_scratch, 0);
+		}
+	}
+
+	if (op & QIB_SENDCTRL_FLUSH) {
+		u64 tmp_ppd_sendctrl = ppd->p_sendctrl;
+
+		/*
+		 * Now drain all the fifos.  The Abort bit should never be
+		 * needed, so for now, at least, we don't use it.
+		 */
+		tmp_ppd_sendctrl |=
+			SYM_MASK(SendCtrl_0, TxeDrainRmFifo) |
+			SYM_MASK(SendCtrl_0, TxeDrainLaFifo) |
+			SYM_MASK(SendCtrl_0, TxeBypassIbc);
+		qib_write_kreg_port(ppd, krp_sendctrl, tmp_ppd_sendctrl);
+		qib_write_kreg(dd, kr_scratch, 0);
+	}
+
+	tmp_dd_sendctrl = dd->sendctrl;
+
+	if (op & QIB_SENDCTRL_DISARM)
+		tmp_dd_sendctrl |= SYM_MASK(SendCtrl, Disarm) |
+			((op & QIB_7322_SendCtrl_DisarmSendBuf_RMASK) <<
+			 SYM_LSB(SendCtrl, DisarmSendBuf));
+	if ((op & QIB_SENDCTRL_AVAIL_BLIP) &&
+	    (dd->sendctrl & SYM_MASK(SendCtrl, SendBufAvailUpd)))
+		tmp_dd_sendctrl &= ~SYM_MASK(SendCtrl, SendBufAvailUpd);
+
+	if (op == 0 || (op & SENDCTRL_COMMON_MODS)) {
+		qib_write_kreg(dd, kr_sendctrl, tmp_dd_sendctrl);
+		qib_write_kreg(dd, kr_scratch, 0);
+	}
+
+	if (op == 0 || (op & SENDCTRL_PORT_MODS)) {
+		qib_write_kreg_port(ppd, krp_sendctrl, ppd->p_sendctrl);
+		qib_write_kreg(dd, kr_scratch, 0);
+	}
+
+	if (op & QIB_SENDCTRL_AVAIL_BLIP) {
+		qib_write_kreg(dd, kr_sendctrl, dd->sendctrl);
+		qib_write_kreg(dd, kr_scratch, 0);
+	}
+
+	spin_unlock_irqrestore(&dd->sendctrl_lock, flags);
+
+	if (op & QIB_SENDCTRL_FLUSH) {
+		u32 v;
+		/*
+		 * ensure writes have hit chip, then do a few
+		 * more reads, to allow DMA of pioavail registers
+		 * to occur, so in-memory copy is in sync with
+		 * the chip.  Not always safe to sleep.
+		 */
+		v = qib_read_kreg32(dd, kr_scratch);
+		qib_write_kreg(dd, kr_scratch, v);
+		v = qib_read_kreg32(dd, kr_scratch);
+		qib_write_kreg(dd, kr_scratch, v);
+		qib_read_kreg32(dd, kr_scratch);
+	}
+}
+
+#define _PORT_VIRT_FLAG 0x8000U /* "virtual", need adjustments */
+#define _PORT_64BIT_FLAG 0x10000U /* not "virtual", but 64bit */
+#define _PORT_CNTR_IDXMASK 0x7fffU /* mask off flags above */
+
+/**
+ * qib_portcntr_7322 - read a per-port chip counter
+ * @ppd: the qlogic_ib pport
+ * @creg: the counter to read (not a chip offset)
+ */
+static u64 qib_portcntr_7322(struct qib_pportdata *ppd, u32 reg)
+{
+	struct qib_devdata *dd = ppd->dd;
+	u64 ret = 0ULL;
+	u16 creg;
+	/* 0xffff for unimplemented or synthesized counters */
+	static const u32 xlator[] = {
+		[QIBPORTCNTR_PKTSEND] = crp_pktsend | _PORT_64BIT_FLAG,
+		[QIBPORTCNTR_WORDSEND] = crp_wordsend | _PORT_64BIT_FLAG,
+		[QIBPORTCNTR_PSXMITDATA] = crp_psxmitdatacount,
+		[QIBPORTCNTR_PSXMITPKTS] = crp_psxmitpktscount,
+		[QIBPORTCNTR_PSXMITWAIT] = crp_psxmitwaitcount,
+		[QIBPORTCNTR_SENDSTALL] = crp_sendstall,
+		[QIBPORTCNTR_PKTRCV] = crp_pktrcv | _PORT_64BIT_FLAG,
+		[QIBPORTCNTR_PSRCVDATA] = crp_psrcvdatacount,
+		[QIBPORTCNTR_PSRCVPKTS] = crp_psrcvpktscount,
+		[QIBPORTCNTR_RCVEBP] = crp_rcvebp,
+		[QIBPORTCNTR_RCVOVFL] = crp_rcvovfl,
+		[QIBPORTCNTR_WORDRCV] = crp_wordrcv | _PORT_64BIT_FLAG,
+		[QIBPORTCNTR_RXDROPPKT] = 0xffff, /* not needed  for 7322 */
+		[QIBPORTCNTR_RXLOCALPHYERR] = crp_rxotherlocalphyerr,
+		[QIBPORTCNTR_RXVLERR] = crp_rxvlerr,
+		[QIBPORTCNTR_ERRICRC] = crp_erricrc,
+		[QIBPORTCNTR_ERRVCRC] = crp_errvcrc,
+		[QIBPORTCNTR_ERRLPCRC] = crp_errlpcrc,
+		[QIBPORTCNTR_BADFORMAT] = crp_badformat,
+		[QIBPORTCNTR_ERR_RLEN] = crp_err_rlen,
+		[QIBPORTCNTR_IBSYMBOLERR] = crp_ibsymbolerr,
+		[QIBPORTCNTR_INVALIDRLEN] = crp_invalidrlen,
+		[QIBPORTCNTR_UNSUPVL] = crp_txunsupvl,
+		[QIBPORTCNTR_EXCESSBUFOVFL] = crp_excessbufferovfl,
+		[QIBPORTCNTR_ERRLINK] = crp_errlink,
+		[QIBPORTCNTR_IBLINKDOWN] = crp_iblinkdown,
+		[QIBPORTCNTR_IBLINKERRRECOV] = crp_iblinkerrrecov,
+		[QIBPORTCNTR_LLI] = crp_locallinkintegrityerr,
+		[QIBPORTCNTR_VL15PKTDROP] = crp_vl15droppedpkt,
+		[QIBPORTCNTR_ERRPKEY] = crp_errpkey,
+		/*
+		 * the next 3 aren't really counters, but were implemented
+		 * as counters in older chips, so still get accessed as
+		 * though they were counters from this code.
+		 */
+		[QIBPORTCNTR_PSINTERVAL] = krp_psinterval,
+		[QIBPORTCNTR_PSSTART] = krp_psstart,
+		[QIBPORTCNTR_PSSTAT] = krp_psstat,
+		/* pseudo-counter, summed for all ports */
+		[QIBPORTCNTR_KHDROVFL] = 0xffff,
+	};
+
+	if (reg >= ARRAY_SIZE(xlator)) {
+		qib_devinfo(ppd->dd->pcidev,
+			 "Unimplemented portcounter %u\n", reg);
+		goto done;
+	}
+	creg = xlator[reg] & _PORT_CNTR_IDXMASK;
+
+	/* handle non-counters and special cases first */
+	if (reg == QIBPORTCNTR_KHDROVFL) {
+		int i;
+
+		/* sum over all kernel contexts (skip if mini_init) */
+		for (i = 0; dd->rcd && i < dd->first_user_ctxt; i++) {
+			struct qib_ctxtdata *rcd = dd->rcd[i];
+
+			if (!rcd || rcd->ppd != ppd)
+				continue;
+			ret += read_7322_creg32(dd, cr_base_egrovfl + i);
+		}
+		goto done;
+	} else if (reg == QIBPORTCNTR_RXDROPPKT) {
+		/*
+		 * Used as part of the synthesis of port_rcv_errors
+		 * in the verbs code for IBTA counters.  Not needed for 7322,
+		 * because all the errors are already counted by other cntrs.
+		 */
+		goto done;
+	} else if (reg == QIBPORTCNTR_PSINTERVAL ||
+		   reg == QIBPORTCNTR_PSSTART || reg == QIBPORTCNTR_PSSTAT) {
+		/* were counters in older chips, now per-port kernel regs */
+		ret = qib_read_kreg_port(ppd, creg);
+		goto done;
+	}
+
+	/*
+	 * Only fast increment counters are 64 bits; use 32 bit reads to
+	 * avoid two independent reads when on Opteron.
+	 */
+	if (xlator[reg] & _PORT_64BIT_FLAG)
+		ret = read_7322_creg_port(ppd, creg);
+	else
+		ret = read_7322_creg32_port(ppd, creg);
+	if (creg == crp_ibsymbolerr) {
+		if (ppd->cpspec->ibdeltainprog)
+			ret -= ret - ppd->cpspec->ibsymsnap;
+		ret -= ppd->cpspec->ibsymdelta;
+	} else if (creg == crp_iblinkerrrecov) {
+		if (ppd->cpspec->ibdeltainprog)
+			ret -= ret - ppd->cpspec->iblnkerrsnap;
+		ret -= ppd->cpspec->iblnkerrdelta;
+	} else if (creg == crp_errlink)
+		ret -= ppd->cpspec->ibmalfdelta;
+	else if (creg == crp_iblinkdown)
+		ret += ppd->cpspec->iblnkdowndelta;
+done:
+	return ret;
+}
+
+/*
+ * Device counter names (not port-specific), one line per stat,
+ * single string.  Used by utilities like ipathstats to print the stats
+ * in a way which works for different versions of drivers, without changing
+ * the utility.  Names need to be 12 chars or less (w/o newline), for proper
+ * display by utility.
+ * Non-error counters are first.
+ * Start of "error" conters is indicated by a leading "E " on the first
+ * "error" counter, and doesn't count in label length.
+ * The EgrOvfl list needs to be last so we truncate them at the configured
+ * context count for the device.
+ * cntr7322indices contains the corresponding register indices.
+ */
+static const char cntr7322names[] =
+	"Interrupts\n"
+	"HostBusStall\n"
+	"E RxTIDFull\n"
+	"RxTIDInvalid\n"
+	"RxTIDFloDrop\n" /* 7322 only */
+	"Ctxt0EgrOvfl\n"
+	"Ctxt1EgrOvfl\n"
+	"Ctxt2EgrOvfl\n"
+	"Ctxt3EgrOvfl\n"
+	"Ctxt4EgrOvfl\n"
+	"Ctxt5EgrOvfl\n"
+	"Ctxt6EgrOvfl\n"
+	"Ctxt7EgrOvfl\n"
+	"Ctxt8EgrOvfl\n"
+	"Ctxt9EgrOvfl\n"
+	"Ctx10EgrOvfl\n"
+	"Ctx11EgrOvfl\n"
+	"Ctx12EgrOvfl\n"
+	"Ctx13EgrOvfl\n"
+	"Ctx14EgrOvfl\n"
+	"Ctx15EgrOvfl\n"
+	"Ctx16EgrOvfl\n"
+	"Ctx17EgrOvfl\n"
+	;
+
+static const u32 cntr7322indices[] = {
+	cr_lbint | _PORT_64BIT_FLAG,
+	cr_lbstall | _PORT_64BIT_FLAG,
+	cr_tidfull,
+	cr_tidinvalid,
+	cr_rxtidflowdrop,
+	cr_base_egrovfl + 0,
+	cr_base_egrovfl + 1,
+	cr_base_egrovfl + 2,
+	cr_base_egrovfl + 3,
+	cr_base_egrovfl + 4,
+	cr_base_egrovfl + 5,
+	cr_base_egrovfl + 6,
+	cr_base_egrovfl + 7,
+	cr_base_egrovfl + 8,
+	cr_base_egrovfl + 9,
+	cr_base_egrovfl + 10,
+	cr_base_egrovfl + 11,
+	cr_base_egrovfl + 12,
+	cr_base_egrovfl + 13,
+	cr_base_egrovfl + 14,
+	cr_base_egrovfl + 15,
+	cr_base_egrovfl + 16,
+	cr_base_egrovfl + 17,
+};
+
+/*
+ * same as cntr7322names and cntr7322indices, but for port-specific counters.
+ * portcntr7322indices is somewhat complicated by some registers needing
+ * adjustments of various kinds, and those are ORed with _PORT_VIRT_FLAG
+ */
+static const char portcntr7322names[] =
+	"TxPkt\n"
+	"TxFlowPkt\n"
+	"TxWords\n"
+	"RxPkt\n"
+	"RxFlowPkt\n"
+	"RxWords\n"
+	"TxFlowStall\n"
+	"TxDmaDesc\n"  /* 7220 and 7322-only */
+	"E RxDlidFltr\n"  /* 7220 and 7322-only */
+	"IBStatusChng\n"
+	"IBLinkDown\n"
+	"IBLnkRecov\n"
+	"IBRxLinkErr\n"
+	"IBSymbolErr\n"
+	"RxLLIErr\n"
+	"RxBadFormat\n"
+	"RxBadLen\n"
+	"RxBufOvrfl\n"
+	"RxEBP\n"
+	"RxFlowCtlErr\n"
+	"RxICRCerr\n"
+	"RxLPCRCerr\n"
+	"RxVCRCerr\n"
+	"RxInvalLen\n"
+	"RxInvalPKey\n"
+	"RxPktDropped\n"
+	"TxBadLength\n"
+	"TxDropped\n"
+	"TxInvalLen\n"
+	"TxUnderrun\n"
+	"TxUnsupVL\n"
+	"RxLclPhyErr\n" /* 7220 and 7322-only from here down */
+	"RxVL15Drop\n"
+	"RxVlErr\n"
+	"XcessBufOvfl\n"
+	"RxQPBadCtxt\n" /* 7322-only from here down */
+	"TXBadHeader\n"
+	;
+
+static const u32 portcntr7322indices[] = {
+	QIBPORTCNTR_PKTSEND | _PORT_VIRT_FLAG,
+	crp_pktsendflow,
+	QIBPORTCNTR_WORDSEND | _PORT_VIRT_FLAG,
+	QIBPORTCNTR_PKTRCV | _PORT_VIRT_FLAG,
+	crp_pktrcvflowctrl,
+	QIBPORTCNTR_WORDRCV | _PORT_VIRT_FLAG,
+	QIBPORTCNTR_SENDSTALL | _PORT_VIRT_FLAG,
+	crp_txsdmadesc | _PORT_64BIT_FLAG,
+	crp_rxdlidfltr,
+	crp_ibstatuschange,
+	QIBPORTCNTR_IBLINKDOWN | _PORT_VIRT_FLAG,
+	QIBPORTCNTR_IBLINKERRRECOV | _PORT_VIRT_FLAG,
+	QIBPORTCNTR_ERRLINK | _PORT_VIRT_FLAG,
+	QIBPORTCNTR_IBSYMBOLERR | _PORT_VIRT_FLAG,
+	QIBPORTCNTR_LLI | _PORT_VIRT_FLAG,
+	QIBPORTCNTR_BADFORMAT | _PORT_VIRT_FLAG,
+	QIBPORTCNTR_ERR_RLEN | _PORT_VIRT_FLAG,
+	QIBPORTCNTR_RCVOVFL | _PORT_VIRT_FLAG,
+	QIBPORTCNTR_RCVEBP | _PORT_VIRT_FLAG,
+	crp_rcvflowctrlviol,
+	QIBPORTCNTR_ERRICRC | _PORT_VIRT_FLAG,
+	QIBPORTCNTR_ERRLPCRC | _PORT_VIRT_FLAG,
+	QIBPORTCNTR_ERRVCRC | _PORT_VIRT_FLAG,
+	QIBPORTCNTR_INVALIDRLEN | _PORT_VIRT_FLAG,
+	QIBPORTCNTR_ERRPKEY | _PORT_VIRT_FLAG,
+	QIBPORTCNTR_RXDROPPKT | _PORT_VIRT_FLAG,
+	crp_txminmaxlenerr,
+	crp_txdroppedpkt,
+	crp_txlenerr,
+	crp_txunderrun,
+	crp_txunsupvl,
+	QIBPORTCNTR_RXLOCALPHYERR | _PORT_VIRT_FLAG,
+	QIBPORTCNTR_VL15PKTDROP | _PORT_VIRT_FLAG,
+	QIBPORTCNTR_RXVLERR | _PORT_VIRT_FLAG,
+	QIBPORTCNTR_EXCESSBUFOVFL | _PORT_VIRT_FLAG,
+	crp_rxqpinvalidctxt,
+	crp_txhdrerr,
+};
+
+/* do all the setup to make the counter reads efficient later */
+static void init_7322_cntrnames(struct qib_devdata *dd)
+{
+	int i, j = 0;
+	char *s;
+
+	for (i = 0, s = (char *)cntr7322names; s && j <= dd->cfgctxts;
+	     i++) {
+		/* we always have at least one counter before the egrovfl */
+		if (!j && !strncmp("Ctxt0EgrOvfl", s + 1, 12))
+			j = 1;
+		s = strchr(s + 1, '\n');
+		if (s && j)
+			j++;
+	}
+	dd->cspec->ncntrs = i;
+	if (!s)
+		/* full list; size is without terminating null */
+		dd->cspec->cntrnamelen = sizeof(cntr7322names) - 1;
+	else
+		dd->cspec->cntrnamelen = 1 + s - cntr7322names;
+	dd->cspec->cntrs = kmalloc(dd->cspec->ncntrs
+		* sizeof(u64), GFP_KERNEL);
+	if (!dd->cspec->cntrs)
+		qib_dev_err(dd, "Failed allocation for counters\n");
+
+	for (i = 0, s = (char *)portcntr7322names; s; i++)
+		s = strchr(s + 1, '\n');
+	dd->cspec->nportcntrs = i - 1;
+	dd->cspec->portcntrnamelen = sizeof(portcntr7322names) - 1;
+	for (i = 0; i < dd->num_pports; ++i) {
+		dd->pport[i].cpspec->portcntrs = kmalloc(dd->cspec->nportcntrs
+			* sizeof(u64), GFP_KERNEL);
+		if (!dd->pport[i].cpspec->portcntrs)
+			qib_dev_err(dd,
+				"Failed allocation for portcounters\n");
+	}
+}
+
+static u32 qib_read_7322cntrs(struct qib_devdata *dd, loff_t pos, char **namep,
+			      u64 **cntrp)
+{
+	u32 ret;
+
+	if (namep) {
+		ret = dd->cspec->cntrnamelen;
+		if (pos >= ret)
+			ret = 0; /* final read after getting everything */
+		else
+			*namep = (char *) cntr7322names;
+	} else {
+		u64 *cntr = dd->cspec->cntrs;
+		int i;
+
+		ret = dd->cspec->ncntrs * sizeof(u64);
+		if (!cntr || pos >= ret) {
+			/* everything read, or couldn't get memory */
+			ret = 0;
+			goto done;
+		}
+		*cntrp = cntr;
+		for (i = 0; i < dd->cspec->ncntrs; i++)
+			if (cntr7322indices[i] & _PORT_64BIT_FLAG)
+				*cntr++ = read_7322_creg(dd,
+							 cntr7322indices[i] &
+							 _PORT_CNTR_IDXMASK);
+			else
+				*cntr++ = read_7322_creg32(dd,
+							   cntr7322indices[i]);
+	}
+done:
+	return ret;
+}
+
+static u32 qib_read_7322portcntrs(struct qib_devdata *dd, loff_t pos, u32 port,
+				  char **namep, u64 **cntrp)
+{
+	u32 ret;
+
+	if (namep) {
+		ret = dd->cspec->portcntrnamelen;
+		if (pos >= ret)
+			ret = 0; /* final read after getting everything */
+		else
+			*namep = (char *)portcntr7322names;
+	} else {
+		struct qib_pportdata *ppd = &dd->pport[port];
+		u64 *cntr = ppd->cpspec->portcntrs;
+		int i;
+
+		ret = dd->cspec->nportcntrs * sizeof(u64);
+		if (!cntr || pos >= ret) {
+			/* everything read, or couldn't get memory */
+			ret = 0;
+			goto done;
+		}
+		*cntrp = cntr;
+		for (i = 0; i < dd->cspec->nportcntrs; i++) {
+			if (portcntr7322indices[i] & _PORT_VIRT_FLAG)
+				*cntr++ = qib_portcntr_7322(ppd,
+					portcntr7322indices[i] &
+					_PORT_CNTR_IDXMASK);
+			else if (portcntr7322indices[i] & _PORT_64BIT_FLAG)
+				*cntr++ = read_7322_creg_port(ppd,
+					   portcntr7322indices[i] &
+					    _PORT_CNTR_IDXMASK);
+			else
+				*cntr++ = read_7322_creg32_port(ppd,
+					   portcntr7322indices[i]);
+		}
+	}
+done:
+	return ret;
+}
+
+/**
+ * qib_get_7322_faststats - get word counters from chip before they overflow
+ * @opaque - contains a pointer to the qlogic_ib device qib_devdata
+ *
+ * VESTIGIAL IBA7322 has no "small fast counters", so the only
+ * real purpose of this function is to maintain the notion of
+ * "active time", which in turn is only logged into the eeprom,
+ * which we don;t have, yet, for 7322-based boards.
+ *
+ * called from add_timer
+ */
+static void qib_get_7322_faststats(unsigned long opaque)
+{
+	struct qib_devdata *dd = (struct qib_devdata *) opaque;
+	struct qib_pportdata *ppd;
+	unsigned long flags;
+	u64 traffic_wds;
+	int pidx;
+
+	for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+		ppd = dd->pport + pidx;
+
+		/*
+		 * If port isn't enabled or not operational ports, or
+		 * diags is running (can cause memory diags to fail)
+		 * skip this port this time.
+		 */
+		if (!ppd->link_speed_supported || !(dd->flags & QIB_INITTED)
+		    || dd->diag_client)
+			continue;
+
+		/*
+		 * Maintain an activity timer, based on traffic
+		 * exceeding a threshold, so we need to check the word-counts
+		 * even if they are 64-bit.
+		 */
+		traffic_wds = qib_portcntr_7322(ppd, QIBPORTCNTR_WORDRCV) +
+			qib_portcntr_7322(ppd, QIBPORTCNTR_WORDSEND);
+		spin_lock_irqsave(&ppd->dd->eep_st_lock, flags);
+		traffic_wds -= ppd->dd->traffic_wds;
+		ppd->dd->traffic_wds += traffic_wds;
+		if (traffic_wds >= QIB_TRAFFIC_ACTIVE_THRESHOLD)
+			atomic_add(ACTIVITY_TIMER, &ppd->dd->active_time);
+		spin_unlock_irqrestore(&ppd->dd->eep_st_lock, flags);
+		if (ppd->cpspec->qdr_dfe_on && (ppd->link_speed_active &
+						QIB_IB_QDR) &&
+		    (ppd->lflags & (QIBL_LINKINIT | QIBL_LINKARMED |
+				    QIBL_LINKACTIVE)) &&
+		    ppd->cpspec->qdr_dfe_time &&
+		    time_is_before_jiffies(ppd->cpspec->qdr_dfe_time)) {
+			ppd->cpspec->qdr_dfe_on = 0;
+
+			qib_write_kreg_port(ppd, krp_static_adapt_dis(2),
+					    ppd->dd->cspec->r1 ?
+					    QDR_STATIC_ADAPT_INIT_R1 :
+					    QDR_STATIC_ADAPT_INIT);
+			force_h1(ppd);
+		}
+	}
+	mod_timer(&dd->stats_timer, jiffies + HZ * ACTIVITY_TIMER);
+}
+
+/*
+ * If we were using MSIx, try to fallback to INTx.
+ */
+static int qib_7322_intr_fallback(struct qib_devdata *dd)
+{
+	if (!dd->cspec->num_msix_entries)
+		return 0; /* already using INTx */
+
+	qib_devinfo(dd->pcidev,
+		"MSIx interrupt not detected, trying INTx interrupts\n");
+	qib_7322_nomsix(dd);
+	qib_enable_intx(dd->pcidev);
+	qib_setup_7322_interrupt(dd, 0);
+	return 1;
+}
+
+/*
+ * Reset the XGXS (between serdes and IBC).  Slightly less intrusive
+ * than resetting the IBC or external link state, and useful in some
+ * cases to cause some retraining.  To do this right, we reset IBC
+ * as well, then return to previous state (which may be still in reset)
+ * NOTE: some callers of this "know" this writes the current value
+ * of cpspec->ibcctrl_a as part of it's operation, so if that changes,
+ * check all callers.
+ */
+static void qib_7322_mini_pcs_reset(struct qib_pportdata *ppd)
+{
+	u64 val;
+	struct qib_devdata *dd = ppd->dd;
+	const u64 reset_bits = SYM_MASK(IBPCSConfig_0, xcv_rreset) |
+		SYM_MASK(IBPCSConfig_0, xcv_treset) |
+		SYM_MASK(IBPCSConfig_0, tx_rx_reset);
+
+	val = qib_read_kreg_port(ppd, krp_ib_pcsconfig);
+	qib_write_kreg(dd, kr_hwerrmask,
+		       dd->cspec->hwerrmask & ~HWE_MASK(statusValidNoEop));
+	qib_write_kreg_port(ppd, krp_ibcctrl_a,
+			    ppd->cpspec->ibcctrl_a &
+			    ~SYM_MASK(IBCCtrlA_0, IBLinkEn));
+
+	qib_write_kreg_port(ppd, krp_ib_pcsconfig, val | reset_bits);
+	qib_read_kreg32(dd, kr_scratch);
+	qib_write_kreg_port(ppd, krp_ib_pcsconfig, val & ~reset_bits);
+	qib_write_kreg_port(ppd, krp_ibcctrl_a, ppd->cpspec->ibcctrl_a);
+	qib_write_kreg(dd, kr_scratch, 0ULL);
+	qib_write_kreg(dd, kr_hwerrclear,
+		       SYM_MASK(HwErrClear, statusValidNoEopClear));
+	qib_write_kreg(dd, kr_hwerrmask, dd->cspec->hwerrmask);
+}
+
+/*
+ * This code for non-IBTA-compliant IB speed negotiation is only known to
+ * work for the SDR to DDR transition, and only between an HCA and a switch
+ * with recent firmware.  It is based on observed heuristics, rather than
+ * actual knowledge of the non-compliant speed negotiation.
+ * It has a number of hard-coded fields, since the hope is to rewrite this
+ * when a spec is available on how the negoation is intended to work.
+ */
+static void autoneg_7322_sendpkt(struct qib_pportdata *ppd, u32 *hdr,
+				 u32 dcnt, u32 *data)
+{
+	int i;
+	u64 pbc;
+	u32 __iomem *piobuf;
+	u32 pnum, control, len;
+	struct qib_devdata *dd = ppd->dd;
+
+	i = 0;
+	len = 7 + dcnt + 1; /* 7 dword header, dword data, icrc */
+	control = qib_7322_setpbc_control(ppd, len, 0, 15);
+	pbc = ((u64) control << 32) | len;
+	while (!(piobuf = qib_7322_getsendbuf(ppd, pbc, &pnum))) {
+		if (i++ > 15)
+			return;
+		udelay(2);
+	}
+	/* disable header check on this packet, since it can't be valid */
+	dd->f_txchk_change(dd, pnum, 1, TXCHK_CHG_TYPE_DIS1, NULL);
+	writeq(pbc, piobuf);
+	qib_flush_wc();
+	qib_pio_copy(piobuf + 2, hdr, 7);
+	qib_pio_copy(piobuf + 9, data, dcnt);
+	if (dd->flags & QIB_USE_SPCL_TRIG) {
+		u32 spcl_off = (pnum >= dd->piobcnt2k) ? 2047 : 1023;
+
+		qib_flush_wc();
+		__raw_writel(0xaebecede, piobuf + spcl_off);
+	}
+	qib_flush_wc();
+	qib_sendbuf_done(dd, pnum);
+	/* and re-enable hdr check */
+	dd->f_txchk_change(dd, pnum, 1, TXCHK_CHG_TYPE_ENAB1, NULL);
+}
+
+/*
+ * _start packet gets sent twice at start, _done gets sent twice at end
+ */
+static void qib_autoneg_7322_send(struct qib_pportdata *ppd, int which)
+{
+	struct qib_devdata *dd = ppd->dd;
+	static u32 swapped;
+	u32 dw, i, hcnt, dcnt, *data;
+	static u32 hdr[7] = { 0xf002ffff, 0x48ffff, 0x6400abba };
+	static u32 madpayload_start[0x40] = {
+		0x1810103, 0x1, 0x0, 0x0, 0x2c90000, 0x2c9, 0x0, 0x0,
+		0xffffffff, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+		0x1, 0x1388, 0x15e, 0x1, /* rest 0's */
+		};
+	static u32 madpayload_done[0x40] = {
+		0x1810103, 0x1, 0x0, 0x0, 0x2c90000, 0x2c9, 0x0, 0x0,
+		0xffffffff, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+		0x40000001, 0x1388, 0x15e, /* rest 0's */
+		};
+
+	dcnt = ARRAY_SIZE(madpayload_start);
+	hcnt = ARRAY_SIZE(hdr);
+	if (!swapped) {
+		/* for maintainability, do it at runtime */
+		for (i = 0; i < hcnt; i++) {
+			dw = (__force u32) cpu_to_be32(hdr[i]);
+			hdr[i] = dw;
+		}
+		for (i = 0; i < dcnt; i++) {
+			dw = (__force u32) cpu_to_be32(madpayload_start[i]);
+			madpayload_start[i] = dw;
+			dw = (__force u32) cpu_to_be32(madpayload_done[i]);
+			madpayload_done[i] = dw;
+		}
+		swapped = 1;
+	}
+
+	data = which ? madpayload_done : madpayload_start;
+
+	autoneg_7322_sendpkt(ppd, hdr, dcnt, data);
+	qib_read_kreg64(dd, kr_scratch);
+	udelay(2);
+	autoneg_7322_sendpkt(ppd, hdr, dcnt, data);
+	qib_read_kreg64(dd, kr_scratch);
+	udelay(2);
+}
+
+/*
+ * Do the absolute minimum to cause an IB speed change, and make it
+ * ready, but don't actually trigger the change.   The caller will
+ * do that when ready (if link is in Polling training state, it will
+ * happen immediately, otherwise when link next goes down)
+ *
+ * This routine should only be used as part of the DDR autonegotation
+ * code for devices that are not compliant with IB 1.2 (or code that
+ * fixes things up for same).
+ *
+ * When link has gone down, and autoneg enabled, or autoneg has
+ * failed and we give up until next time we set both speeds, and
+ * then we want IBTA enabled as well as "use max enabled speed.
+ */
+static void set_7322_ibspeed_fast(struct qib_pportdata *ppd, u32 speed)
+{
+	u64 newctrlb;
+	newctrlb = ppd->cpspec->ibcctrl_b & ~(IBA7322_IBC_SPEED_MASK |
+				    IBA7322_IBC_IBTA_1_2_MASK |
+				    IBA7322_IBC_MAX_SPEED_MASK);
+
+	if (speed & (speed - 1)) /* multiple speeds */
+		newctrlb |= (speed << IBA7322_IBC_SPEED_LSB) |
+				    IBA7322_IBC_IBTA_1_2_MASK |
+				    IBA7322_IBC_MAX_SPEED_MASK;
+	else
+		newctrlb |= speed == QIB_IB_QDR ?
+			IBA7322_IBC_SPEED_QDR | IBA7322_IBC_IBTA_1_2_MASK :
+			((speed == QIB_IB_DDR ?
+			  IBA7322_IBC_SPEED_DDR : IBA7322_IBC_SPEED_SDR));
+
+	if (newctrlb == ppd->cpspec->ibcctrl_b)
+		return;
+
+	ppd->cpspec->ibcctrl_b = newctrlb;
+	qib_write_kreg_port(ppd, krp_ibcctrl_b, ppd->cpspec->ibcctrl_b);
+	qib_write_kreg(ppd->dd, kr_scratch, 0);
+}
+
+/*
+ * This routine is only used when we are not talking to another
+ * IB 1.2-compliant device that we think can do DDR.
+ * (This includes all existing switch chips as of Oct 2007.)
+ * 1.2-compliant devices go directly to DDR prior to reaching INIT
+ */
+static void try_7322_autoneg(struct qib_pportdata *ppd)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&ppd->lflags_lock, flags);
+	ppd->lflags |= QIBL_IB_AUTONEG_INPROG;
+	spin_unlock_irqrestore(&ppd->lflags_lock, flags);
+	qib_autoneg_7322_send(ppd, 0);
+	set_7322_ibspeed_fast(ppd, QIB_IB_DDR);
+	qib_7322_mini_pcs_reset(ppd);
+	/* 2 msec is minimum length of a poll cycle */
+	queue_delayed_work(ib_wq, &ppd->cpspec->autoneg_work,
+			   msecs_to_jiffies(2));
+}
+
+/*
+ * Handle the empirically determined mechanism for auto-negotiation
+ * of DDR speed with switches.
+ */
+static void autoneg_7322_work(struct work_struct *work)
+{
+	struct qib_pportdata *ppd;
+	struct qib_devdata *dd;
+	u64 startms;
+	u32 i;
+	unsigned long flags;
+
+	ppd = container_of(work, struct qib_chippport_specific,
+			    autoneg_work.work)->ppd;
+	dd = ppd->dd;
+
+	startms = jiffies_to_msecs(jiffies);
+
+	/*
+	 * Busy wait for this first part, it should be at most a
+	 * few hundred usec, since we scheduled ourselves for 2msec.
+	 */
+	for (i = 0; i < 25; i++) {
+		if (SYM_FIELD(ppd->lastibcstat, IBCStatusA_0, LinkState)
+		     == IB_7322_LT_STATE_POLLQUIET) {
+			qib_set_linkstate(ppd, QIB_IB_LINKDOWN_DISABLE);
+			break;
+		}
+		udelay(100);
+	}
+
+	if (!(ppd->lflags & QIBL_IB_AUTONEG_INPROG))
+		goto done; /* we got there early or told to stop */
+
+	/* we expect this to timeout */
+	if (wait_event_timeout(ppd->cpspec->autoneg_wait,
+			       !(ppd->lflags & QIBL_IB_AUTONEG_INPROG),
+			       msecs_to_jiffies(90)))
+		goto done;
+	qib_7322_mini_pcs_reset(ppd);
+
+	/* we expect this to timeout */
+	if (wait_event_timeout(ppd->cpspec->autoneg_wait,
+			       !(ppd->lflags & QIBL_IB_AUTONEG_INPROG),
+			       msecs_to_jiffies(1700)))
+		goto done;
+	qib_7322_mini_pcs_reset(ppd);
+
+	set_7322_ibspeed_fast(ppd, QIB_IB_SDR);
+
+	/*
+	 * Wait up to 250 msec for link to train and get to INIT at DDR;
+	 * this should terminate early.
+	 */
+	wait_event_timeout(ppd->cpspec->autoneg_wait,
+		!(ppd->lflags & QIBL_IB_AUTONEG_INPROG),
+		msecs_to_jiffies(250));
+done:
+	if (ppd->lflags & QIBL_IB_AUTONEG_INPROG) {
+		spin_lock_irqsave(&ppd->lflags_lock, flags);
+		ppd->lflags &= ~QIBL_IB_AUTONEG_INPROG;
+		if (ppd->cpspec->autoneg_tries == AUTONEG_TRIES) {
+			ppd->lflags |= QIBL_IB_AUTONEG_FAILED;
+			ppd->cpspec->autoneg_tries = 0;
+		}
+		spin_unlock_irqrestore(&ppd->lflags_lock, flags);
+		set_7322_ibspeed_fast(ppd, ppd->link_speed_enabled);
+	}
+}
+
+/*
+ * This routine is used to request IPG set in the QLogic switch.
+ * Only called if r1.
+ */
+static void try_7322_ipg(struct qib_pportdata *ppd)
+{
+	struct qib_ibport *ibp = &ppd->ibport_data;
+	struct ib_mad_send_buf *send_buf;
+	struct ib_mad_agent *agent;
+	struct ib_smp *smp;
+	unsigned delay;
+	int ret;
+
+	agent = ibp->send_agent;
+	if (!agent)
+		goto retry;
+
+	send_buf = ib_create_send_mad(agent, 0, 0, 0, IB_MGMT_MAD_HDR,
+				      IB_MGMT_MAD_DATA, GFP_ATOMIC);
+	if (IS_ERR(send_buf))
+		goto retry;
+
+	if (!ibp->smi_ah) {
+		struct ib_ah *ah;
+
+		ah = qib_create_qp0_ah(ibp, be16_to_cpu(IB_LID_PERMISSIVE));
+		if (IS_ERR(ah))
+			ret = PTR_ERR(ah);
+		else {
+			send_buf->ah = ah;
+			ibp->smi_ah = to_iah(ah);
+			ret = 0;
+		}
+	} else {
+		send_buf->ah = &ibp->smi_ah->ibah;
+		ret = 0;
+	}
+
+	smp = send_buf->mad;
+	smp->base_version = IB_MGMT_BASE_VERSION;
+	smp->mgmt_class = IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE;
+	smp->class_version = 1;
+	smp->method = IB_MGMT_METHOD_SEND;
+	smp->hop_cnt = 1;
+	smp->attr_id = QIB_VENDOR_IPG;
+	smp->attr_mod = 0;
+
+	if (!ret)
+		ret = ib_post_send_mad(send_buf, NULL);
+	if (ret)
+		ib_free_send_mad(send_buf);
+retry:
+	delay = 2 << ppd->cpspec->ipg_tries;
+	queue_delayed_work(ib_wq, &ppd->cpspec->ipg_work,
+			   msecs_to_jiffies(delay));
+}
+
+/*
+ * Timeout handler for setting IPG.
+ * Only called if r1.
+ */
+static void ipg_7322_work(struct work_struct *work)
+{
+	struct qib_pportdata *ppd;
+
+	ppd = container_of(work, struct qib_chippport_specific,
+			   ipg_work.work)->ppd;
+	if ((ppd->lflags & (QIBL_LINKINIT | QIBL_LINKARMED | QIBL_LINKACTIVE))
+	    && ++ppd->cpspec->ipg_tries <= 10)
+		try_7322_ipg(ppd);
+}
+
+static u32 qib_7322_iblink_state(u64 ibcs)
+{
+	u32 state = (u32)SYM_FIELD(ibcs, IBCStatusA_0, LinkState);
+
+	switch (state) {
+	case IB_7322_L_STATE_INIT:
+		state = IB_PORT_INIT;
+		break;
+	case IB_7322_L_STATE_ARM:
+		state = IB_PORT_ARMED;
+		break;
+	case IB_7322_L_STATE_ACTIVE:
+		/* fall through */
+	case IB_7322_L_STATE_ACT_DEFER:
+		state = IB_PORT_ACTIVE;
+		break;
+	default: /* fall through */
+	case IB_7322_L_STATE_DOWN:
+		state = IB_PORT_DOWN;
+		break;
+	}
+	return state;
+}
+
+/* returns the IBTA port state, rather than the IBC link training state */
+static u8 qib_7322_phys_portstate(u64 ibcs)
+{
+	u8 state = (u8)SYM_FIELD(ibcs, IBCStatusA_0, LinkTrainingState);
+	return qib_7322_physportstate[state];
+}
+
+static int qib_7322_ib_updown(struct qib_pportdata *ppd, int ibup, u64 ibcs)
+{
+	int ret = 0, symadj = 0;
+	unsigned long flags;
+	int mult;
+
+	spin_lock_irqsave(&ppd->lflags_lock, flags);
+	ppd->lflags &= ~QIBL_IB_FORCE_NOTIFY;
+	spin_unlock_irqrestore(&ppd->lflags_lock, flags);
+
+	/* Update our picture of width and speed from chip */
+	if (ibcs & SYM_MASK(IBCStatusA_0, LinkSpeedQDR)) {
+		ppd->link_speed_active = QIB_IB_QDR;
+		mult = 4;
+	} else if (ibcs & SYM_MASK(IBCStatusA_0, LinkSpeedActive)) {
+		ppd->link_speed_active = QIB_IB_DDR;
+		mult = 2;
+	} else {
+		ppd->link_speed_active = QIB_IB_SDR;
+		mult = 1;
+	}
+	if (ibcs & SYM_MASK(IBCStatusA_0, LinkWidthActive)) {
+		ppd->link_width_active = IB_WIDTH_4X;
+		mult *= 4;
+	} else
+		ppd->link_width_active = IB_WIDTH_1X;
+	ppd->delay_mult = ib_rate_to_delay[mult_to_ib_rate(mult)];
+
+	if (!ibup) {
+		u64 clr;
+
+		/* Link went down. */
+		/* do IPG MAD again after linkdown, even if last time failed */
+		ppd->cpspec->ipg_tries = 0;
+		clr = qib_read_kreg_port(ppd, krp_ibcstatus_b) &
+			(SYM_MASK(IBCStatusB_0, heartbeat_timed_out) |
+			 SYM_MASK(IBCStatusB_0, heartbeat_crosstalk));
+		if (clr)
+			qib_write_kreg_port(ppd, krp_ibcstatus_b, clr);
+		if (!(ppd->lflags & (QIBL_IB_AUTONEG_FAILED |
+				     QIBL_IB_AUTONEG_INPROG)))
+			set_7322_ibspeed_fast(ppd, ppd->link_speed_enabled);
+		if (!(ppd->lflags & QIBL_IB_AUTONEG_INPROG)) {
+			struct qib_qsfp_data *qd =
+				&ppd->cpspec->qsfp_data;
+			/* unlock the Tx settings, speed may change */
+			qib_write_kreg_port(ppd, krp_tx_deemph_override,
+				SYM_MASK(IBSD_TX_DEEMPHASIS_OVERRIDE_0,
+				reset_tx_deemphasis_override));
+			qib_cancel_sends(ppd);
+			/* on link down, ensure sane pcs state */
+			qib_7322_mini_pcs_reset(ppd);
+			/* schedule the qsfp refresh which should turn the link
+			   off */
+			if (ppd->dd->flags & QIB_HAS_QSFP) {
+				qd->t_insert = jiffies;
+				queue_work(ib_wq, &qd->work);
+			}
+			spin_lock_irqsave(&ppd->sdma_lock, flags);
+			if (__qib_sdma_running(ppd))
+				__qib_sdma_process_event(ppd,
+					qib_sdma_event_e70_go_idle);
+			spin_unlock_irqrestore(&ppd->sdma_lock, flags);
+		}
+		clr = read_7322_creg32_port(ppd, crp_iblinkdown);
+		if (clr == ppd->cpspec->iblnkdownsnap)
+			ppd->cpspec->iblnkdowndelta++;
+	} else {
+		if (qib_compat_ddr_negotiate &&
+		    !(ppd->lflags & (QIBL_IB_AUTONEG_FAILED |
+				     QIBL_IB_AUTONEG_INPROG)) &&
+		    ppd->link_speed_active == QIB_IB_SDR &&
+		    (ppd->link_speed_enabled & QIB_IB_DDR)
+		    && ppd->cpspec->autoneg_tries < AUTONEG_TRIES) {
+			/* we are SDR, and auto-negotiation enabled */
+			++ppd->cpspec->autoneg_tries;
+			if (!ppd->cpspec->ibdeltainprog) {
+				ppd->cpspec->ibdeltainprog = 1;
+				ppd->cpspec->ibsymdelta +=
+					read_7322_creg32_port(ppd,
+						crp_ibsymbolerr) -
+						ppd->cpspec->ibsymsnap;
+				ppd->cpspec->iblnkerrdelta +=
+					read_7322_creg32_port(ppd,
+						crp_iblinkerrrecov) -
+						ppd->cpspec->iblnkerrsnap;
+			}
+			try_7322_autoneg(ppd);
+			ret = 1; /* no other IB status change processing */
+		} else if ((ppd->lflags & QIBL_IB_AUTONEG_INPROG) &&
+			   ppd->link_speed_active == QIB_IB_SDR) {
+			qib_autoneg_7322_send(ppd, 1);
+			set_7322_ibspeed_fast(ppd, QIB_IB_DDR);
+			qib_7322_mini_pcs_reset(ppd);
+			udelay(2);
+			ret = 1; /* no other IB status change processing */
+		} else if ((ppd->lflags & QIBL_IB_AUTONEG_INPROG) &&
+			   (ppd->link_speed_active & QIB_IB_DDR)) {
+			spin_lock_irqsave(&ppd->lflags_lock, flags);
+			ppd->lflags &= ~(QIBL_IB_AUTONEG_INPROG |
+					 QIBL_IB_AUTONEG_FAILED);
+			spin_unlock_irqrestore(&ppd->lflags_lock, flags);
+			ppd->cpspec->autoneg_tries = 0;
+			/* re-enable SDR, for next link down */
+			set_7322_ibspeed_fast(ppd, ppd->link_speed_enabled);
+			wake_up(&ppd->cpspec->autoneg_wait);
+			symadj = 1;
+		} else if (ppd->lflags & QIBL_IB_AUTONEG_FAILED) {
+			/*
+			 * Clear autoneg failure flag, and do setup
+			 * so we'll try next time link goes down and
+			 * back to INIT (possibly connected to a
+			 * different device).
+			 */
+			spin_lock_irqsave(&ppd->lflags_lock, flags);
+			ppd->lflags &= ~QIBL_IB_AUTONEG_FAILED;
+			spin_unlock_irqrestore(&ppd->lflags_lock, flags);
+			ppd->cpspec->ibcctrl_b |= IBA7322_IBC_IBTA_1_2_MASK;
+			symadj = 1;
+		}
+		if (!(ppd->lflags & QIBL_IB_AUTONEG_INPROG)) {
+			symadj = 1;
+			if (ppd->dd->cspec->r1 && ppd->cpspec->ipg_tries <= 10)
+				try_7322_ipg(ppd);
+			if (!ppd->cpspec->recovery_init)
+				setup_7322_link_recovery(ppd, 0);
+			ppd->cpspec->qdr_dfe_time = jiffies +
+				msecs_to_jiffies(QDR_DFE_DISABLE_DELAY);
+		}
+		ppd->cpspec->ibmalfusesnap = 0;
+		ppd->cpspec->ibmalfsnap = read_7322_creg32_port(ppd,
+			crp_errlink);
+	}
+	if (symadj) {
+		ppd->cpspec->iblnkdownsnap =
+			read_7322_creg32_port(ppd, crp_iblinkdown);
+		if (ppd->cpspec->ibdeltainprog) {
+			ppd->cpspec->ibdeltainprog = 0;
+			ppd->cpspec->ibsymdelta += read_7322_creg32_port(ppd,
+				crp_ibsymbolerr) - ppd->cpspec->ibsymsnap;
+			ppd->cpspec->iblnkerrdelta += read_7322_creg32_port(ppd,
+				crp_iblinkerrrecov) - ppd->cpspec->iblnkerrsnap;
+		}
+	} else if (!ibup && qib_compat_ddr_negotiate &&
+		   !ppd->cpspec->ibdeltainprog &&
+			!(ppd->lflags & QIBL_IB_AUTONEG_INPROG)) {
+		ppd->cpspec->ibdeltainprog = 1;
+		ppd->cpspec->ibsymsnap = read_7322_creg32_port(ppd,
+			crp_ibsymbolerr);
+		ppd->cpspec->iblnkerrsnap = read_7322_creg32_port(ppd,
+			crp_iblinkerrrecov);
+	}
+
+	if (!ret)
+		qib_setup_7322_setextled(ppd, ibup);
+	return ret;
+}
+
+/*
+ * Does read/modify/write to appropriate registers to
+ * set output and direction bits selected by mask.
+ * these are in their canonical postions (e.g. lsb of
+ * dir will end up in D48 of extctrl on existing chips).
+ * returns contents of GP Inputs.
+ */
+static int gpio_7322_mod(struct qib_devdata *dd, u32 out, u32 dir, u32 mask)
+{
+	u64 read_val, new_out;
+	unsigned long flags;
+
+	if (mask) {
+		/* some bits being written, lock access to GPIO */
+		dir &= mask;
+		out &= mask;
+		spin_lock_irqsave(&dd->cspec->gpio_lock, flags);
+		dd->cspec->extctrl &= ~((u64)mask << SYM_LSB(EXTCtrl, GPIOOe));
+		dd->cspec->extctrl |= ((u64) dir << SYM_LSB(EXTCtrl, GPIOOe));
+		new_out = (dd->cspec->gpio_out & ~mask) | out;
+
+		qib_write_kreg(dd, kr_extctrl, dd->cspec->extctrl);
+		qib_write_kreg(dd, kr_gpio_out, new_out);
+		dd->cspec->gpio_out = new_out;
+		spin_unlock_irqrestore(&dd->cspec->gpio_lock, flags);
+	}
+	/*
+	 * It is unlikely that a read at this time would get valid
+	 * data on a pin whose direction line was set in the same
+	 * call to this function. We include the read here because
+	 * that allows us to potentially combine a change on one pin with
+	 * a read on another, and because the old code did something like
+	 * this.
+	 */
+	read_val = qib_read_kreg64(dd, kr_extstatus);
+	return SYM_FIELD(read_val, EXTStatus, GPIOIn);
+}
+
+/* Enable writes to config EEPROM, if possible. Returns previous state */
+static int qib_7322_eeprom_wen(struct qib_devdata *dd, int wen)
+{
+	int prev_wen;
+	u32 mask;
+
+	mask = 1 << QIB_EEPROM_WEN_NUM;
+	prev_wen = ~gpio_7322_mod(dd, 0, 0, 0) >> QIB_EEPROM_WEN_NUM;
+	gpio_7322_mod(dd, wen ? 0 : mask, mask, mask);
+
+	return prev_wen & 1;
+}
+
+/*
+ * Read fundamental info we need to use the chip.  These are
+ * the registers that describe chip capabilities, and are
+ * saved in shadow registers.
+ */
+static void get_7322_chip_params(struct qib_devdata *dd)
+{
+	u64 val;
+	u32 piobufs;
+	int mtu;
+
+	dd->palign = qib_read_kreg32(dd, kr_pagealign);
+
+	dd->uregbase = qib_read_kreg32(dd, kr_userregbase);
+
+	dd->rcvtidcnt = qib_read_kreg32(dd, kr_rcvtidcnt);
+	dd->rcvtidbase = qib_read_kreg32(dd, kr_rcvtidbase);
+	dd->rcvegrbase = qib_read_kreg32(dd, kr_rcvegrbase);
+	dd->piobufbase = qib_read_kreg64(dd, kr_sendpiobufbase);
+	dd->pio2k_bufbase = dd->piobufbase & 0xffffffff;
+
+	val = qib_read_kreg64(dd, kr_sendpiobufcnt);
+	dd->piobcnt2k = val & ~0U;
+	dd->piobcnt4k = val >> 32;
+	val = qib_read_kreg64(dd, kr_sendpiosize);
+	dd->piosize2k = val & ~0U;
+	dd->piosize4k = val >> 32;
+
+	mtu = ib_mtu_enum_to_int(qib_ibmtu);
+	if (mtu == -1)
+		mtu = QIB_DEFAULT_MTU;
+	dd->pport[0].ibmtu = (u32)mtu;
+	dd->pport[1].ibmtu = (u32)mtu;
+
+	/* these may be adjusted in init_chip_wc_pat() */
+	dd->pio2kbase = (u32 __iomem *)
+		((char __iomem *) dd->kregbase + dd->pio2k_bufbase);
+	dd->pio4kbase = (u32 __iomem *)
+		((char __iomem *) dd->kregbase +
+		 (dd->piobufbase >> 32));
+	/*
+	 * 4K buffers take 2 pages; we use roundup just to be
+	 * paranoid; we calculate it once here, rather than on
+	 * ever buf allocate
+	 */
+	dd->align4k = ALIGN(dd->piosize4k, dd->palign);
+
+	piobufs = dd->piobcnt4k + dd->piobcnt2k + NUM_VL15_BUFS;
+
+	dd->pioavregs = ALIGN(piobufs, sizeof(u64) * BITS_PER_BYTE / 2) /
+		(sizeof(u64) * BITS_PER_BYTE / 2);
+}
+
+/*
+ * The chip base addresses in cspec and cpspec have to be set
+ * after possible init_chip_wc_pat(), rather than in
+ * get_7322_chip_params(), so split out as separate function
+ */
+static void qib_7322_set_baseaddrs(struct qib_devdata *dd)
+{
+	u32 cregbase;
+	cregbase = qib_read_kreg32(dd, kr_counterregbase);
+
+	dd->cspec->cregbase = (u64 __iomem *)(cregbase +
+		(char __iomem *)dd->kregbase);
+
+	dd->egrtidbase = (u64 __iomem *)
+		((char __iomem *) dd->kregbase + dd->rcvegrbase);
+
+	/* port registers are defined as relative to base of chip */
+	dd->pport[0].cpspec->kpregbase =
+		(u64 __iomem *)((char __iomem *)dd->kregbase);
+	dd->pport[1].cpspec->kpregbase =
+		(u64 __iomem *)(dd->palign +
+		(char __iomem *)dd->kregbase);
+	dd->pport[0].cpspec->cpregbase =
+		(u64 __iomem *)(qib_read_kreg_port(&dd->pport[0],
+		kr_counterregbase) + (char __iomem *)dd->kregbase);
+	dd->pport[1].cpspec->cpregbase =
+		(u64 __iomem *)(qib_read_kreg_port(&dd->pport[1],
+		kr_counterregbase) + (char __iomem *)dd->kregbase);
+}
+
+/*
+ * This is a fairly special-purpose observer, so we only support
+ * the port-specific parts of SendCtrl
+ */
+
+#define SENDCTRL_SHADOWED (SYM_MASK(SendCtrl_0, SendEnable) |		\
+			   SYM_MASK(SendCtrl_0, SDmaEnable) |		\
+			   SYM_MASK(SendCtrl_0, SDmaIntEnable) |	\
+			   SYM_MASK(SendCtrl_0, SDmaSingleDescriptor) | \
+			   SYM_MASK(SendCtrl_0, SDmaHalt) |		\
+			   SYM_MASK(SendCtrl_0, IBVLArbiterEn) |	\
+			   SYM_MASK(SendCtrl_0, ForceCreditUpToDate))
+
+static int sendctrl_hook(struct qib_devdata *dd,
+			 const struct diag_observer *op, u32 offs,
+			 u64 *data, u64 mask, int only_32)
+{
+	unsigned long flags;
+	unsigned idx;
+	unsigned pidx;
+	struct qib_pportdata *ppd = NULL;
+	u64 local_data, all_bits;
+
+	/*
+	 * The fixed correspondence between Physical ports and pports is
+	 * severed. We need to hunt for the ppd that corresponds
+	 * to the offset we got. And we have to do that without admitting
+	 * we know the stride, apparently.
+	 */
+	for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+		u64 __iomem *psptr;
+		u32 psoffs;
+
+		ppd = dd->pport + pidx;
+		if (!ppd->cpspec->kpregbase)
+			continue;
+
+		psptr = ppd->cpspec->kpregbase + krp_sendctrl;
+		psoffs = (u32) (psptr - dd->kregbase) * sizeof(*psptr);
+		if (psoffs == offs)
+			break;
+	}
+
+	/* If pport is not being managed by driver, just avoid shadows. */
+	if (pidx >= dd->num_pports)
+		ppd = NULL;
+
+	/* In any case, "idx" is flat index in kreg space */
+	idx = offs / sizeof(u64);
+
+	all_bits = ~0ULL;
+	if (only_32)
+		all_bits >>= 32;
+
+	spin_lock_irqsave(&dd->sendctrl_lock, flags);
+	if (!ppd || (mask & all_bits) != all_bits) {
+		/*
+		 * At least some mask bits are zero, so we need
+		 * to read. The judgement call is whether from
+		 * reg or shadow. First-cut: read reg, and complain
+		 * if any bits which should be shadowed are different
+		 * from their shadowed value.
+		 */
+		if (only_32)
+			local_data = (u64)qib_read_kreg32(dd, idx);
+		else
+			local_data = qib_read_kreg64(dd, idx);
+		*data = (local_data & ~mask) | (*data & mask);
+	}
+	if (mask) {
+		/*
+		 * At least some mask bits are one, so we need
+		 * to write, but only shadow some bits.
+		 */
+		u64 sval, tval; /* Shadowed, transient */
+
+		/*
+		 * New shadow val is bits we don't want to touch,
+		 * ORed with bits we do, that are intended for shadow.
+		 */
+		if (ppd) {
+			sval = ppd->p_sendctrl & ~mask;
+			sval |= *data & SENDCTRL_SHADOWED & mask;
+			ppd->p_sendctrl = sval;
+		} else
+			sval = *data & SENDCTRL_SHADOWED & mask;
+		tval = sval | (*data & ~SENDCTRL_SHADOWED & mask);
+		qib_write_kreg(dd, idx, tval);
+		qib_write_kreg(dd, kr_scratch, 0Ull);
+	}
+	spin_unlock_irqrestore(&dd->sendctrl_lock, flags);
+	return only_32 ? 4 : 8;
+}
+
+static const struct diag_observer sendctrl_0_observer = {
+	sendctrl_hook, KREG_IDX(SendCtrl_0) * sizeof(u64),
+	KREG_IDX(SendCtrl_0) * sizeof(u64)
+};
+
+static const struct diag_observer sendctrl_1_observer = {
+	sendctrl_hook, KREG_IDX(SendCtrl_1) * sizeof(u64),
+	KREG_IDX(SendCtrl_1) * sizeof(u64)
+};
+
+static ushort sdma_fetch_prio = 8;
+module_param_named(sdma_fetch_prio, sdma_fetch_prio, ushort, S_IRUGO);
+MODULE_PARM_DESC(sdma_fetch_prio, "SDMA descriptor fetch priority");
+
+/* Besides logging QSFP events, we set appropriate TxDDS values */
+static void init_txdds_table(struct qib_pportdata *ppd, int override);
+
+static void qsfp_7322_event(struct work_struct *work)
+{
+	struct qib_qsfp_data *qd;
+	struct qib_pportdata *ppd;
+	unsigned long pwrup;
+	unsigned long flags;
+	int ret;
+	u32 le2;
+
+	qd = container_of(work, struct qib_qsfp_data, work);
+	ppd = qd->ppd;
+	pwrup = qd->t_insert +
+		msecs_to_jiffies(QSFP_PWR_LAG_MSEC - QSFP_MODPRS_LAG_MSEC);
+
+	/* Delay for 20 msecs to allow ModPrs resistor to setup */
+	mdelay(QSFP_MODPRS_LAG_MSEC);
+
+	if (!qib_qsfp_mod_present(ppd)) {
+		ppd->cpspec->qsfp_data.modpresent = 0;
+		/* Set the physical link to disabled */
+		qib_set_ib_7322_lstate(ppd, 0,
+				       QLOGIC_IB_IBCC_LINKINITCMD_DISABLE);
+		spin_lock_irqsave(&ppd->lflags_lock, flags);
+		ppd->lflags &= ~QIBL_LINKV;
+		spin_unlock_irqrestore(&ppd->lflags_lock, flags);
+	} else {
+		/*
+		 * Some QSFP's not only do not respond until the full power-up
+		 * time, but may behave badly if we try. So hold off responding
+		 * to insertion.
+		 */
+		while (1) {
+			if (time_is_before_jiffies(pwrup))
+				break;
+			msleep(20);
+		}
+
+		ret = qib_refresh_qsfp_cache(ppd, &qd->cache);
+
+		/*
+		 * Need to change LE2 back to defaults if we couldn't
+		 * read the cable type (to handle cable swaps), so do this
+		 * even on failure to read cable information.  We don't
+		 * get here for QME, so IS_QME check not needed here.
+		 */
+		if (!ret && !ppd->dd->cspec->r1) {
+			if (QSFP_IS_ACTIVE_FAR(qd->cache.tech))
+				le2 = LE2_QME;
+			else if (qd->cache.atten[1] >= qib_long_atten &&
+				 QSFP_IS_CU(qd->cache.tech))
+				le2 = LE2_5m;
+			else
+				le2 = LE2_DEFAULT;
+		} else
+			le2 = LE2_DEFAULT;
+		ibsd_wr_allchans(ppd, 13, (le2 << 7), BMASK(9, 7));
+		/*
+		 * We always change parameteters, since we can choose
+		 * values for cables without eeproms, and the cable may have
+		 * changed from a cable with full or partial eeprom content
+		 * to one with partial or no content.
+		 */
+		init_txdds_table(ppd, 0);
+		/* The physical link is being re-enabled only when the
+		 * previous state was DISABLED and the VALID bit is not
+		 * set. This should only happen when  the cable has been
+		 * physically pulled. */
+		if (!ppd->cpspec->qsfp_data.modpresent &&
+		    (ppd->lflags & (QIBL_LINKV | QIBL_IB_LINK_DISABLED))) {
+			ppd->cpspec->qsfp_data.modpresent = 1;
+			qib_set_ib_7322_lstate(ppd, 0,
+				QLOGIC_IB_IBCC_LINKINITCMD_SLEEP);
+			spin_lock_irqsave(&ppd->lflags_lock, flags);
+			ppd->lflags |= QIBL_LINKV;
+			spin_unlock_irqrestore(&ppd->lflags_lock, flags);
+		}
+	}
+}
+
+/*
+ * There is little we can do but complain to the user if QSFP
+ * initialization fails.
+ */
+static void qib_init_7322_qsfp(struct qib_pportdata *ppd)
+{
+	unsigned long flags;
+	struct qib_qsfp_data *qd = &ppd->cpspec->qsfp_data;
+	struct qib_devdata *dd = ppd->dd;
+	u64 mod_prs_bit = QSFP_GPIO_MOD_PRS_N;
+
+	mod_prs_bit <<= (QSFP_GPIO_PORT2_SHIFT * ppd->hw_pidx);
+	qd->ppd = ppd;
+	qib_qsfp_init(qd, qsfp_7322_event);
+	spin_lock_irqsave(&dd->cspec->gpio_lock, flags);
+	dd->cspec->extctrl |= (mod_prs_bit << SYM_LSB(EXTCtrl, GPIOInvert));
+	dd->cspec->gpio_mask |= mod_prs_bit;
+	qib_write_kreg(dd, kr_extctrl, dd->cspec->extctrl);
+	qib_write_kreg(dd, kr_gpio_mask, dd->cspec->gpio_mask);
+	spin_unlock_irqrestore(&dd->cspec->gpio_lock, flags);
+}
+
+/*
+ * called at device initialization time, and also if the txselect
+ * module parameter is changed.  This is used for cables that don't
+ * have valid QSFP EEPROMs (not present, or attenuation is zero).
+ * We initialize to the default, then if there is a specific
+ * unit,port match, we use that (and set it immediately, for the
+ * current speed, if the link is at INIT or better).
+ * String format is "default# unit#,port#=# ... u,p=#", separators must
+ * be a SPACE character.  A newline terminates.  The u,p=# tuples may
+ * optionally have "u,p=#,#", where the final # is the H1 value
+ * The last specific match is used (actually, all are used, but last
+ * one is the one that winds up set); if none at all, fall back on default.
+ */
+static void set_no_qsfp_atten(struct qib_devdata *dd, int change)
+{
+	char *nxt, *str;
+	u32 pidx, unit, port, deflt, h1;
+	unsigned long val;
+	int any = 0, seth1;
+	int txdds_size;
+
+	str = txselect_list;
+
+	/* default number is validated in setup_txselect() */
+	deflt = simple_strtoul(str, &nxt, 0);
+	for (pidx = 0; pidx < dd->num_pports; ++pidx)
+		dd->pport[pidx].cpspec->no_eep = deflt;
+
+	txdds_size = TXDDS_TABLE_SZ + TXDDS_EXTRA_SZ;
+	if (IS_QME(dd) || IS_QMH(dd))
+		txdds_size += TXDDS_MFG_SZ;
+
+	while (*nxt && nxt[1]) {
+		str = ++nxt;
+		unit = simple_strtoul(str, &nxt, 0);
+		if (nxt == str || !*nxt || *nxt != ',') {
+			while (*nxt && *nxt++ != ' ') /* skip to next, if any */
+				;
+			continue;
+		}
+		str = ++nxt;
+		port = simple_strtoul(str, &nxt, 0);
+		if (nxt == str || *nxt != '=') {
+			while (*nxt && *nxt++ != ' ') /* skip to next, if any */
+				;
+			continue;
+		}
+		str = ++nxt;
+		val = simple_strtoul(str, &nxt, 0);
+		if (nxt == str) {
+			while (*nxt && *nxt++ != ' ') /* skip to next, if any */
+				;
+			continue;
+		}
+		if (val >= txdds_size)
+			continue;
+		seth1 = 0;
+		h1 = 0; /* gcc thinks it might be used uninitted */
+		if (*nxt == ',' && nxt[1]) {
+			str = ++nxt;
+			h1 = (u32)simple_strtoul(str, &nxt, 0);
+			if (nxt == str)
+				while (*nxt && *nxt++ != ' ') /* skip */
+					;
+			else
+				seth1 = 1;
+		}
+		for (pidx = 0; dd->unit == unit && pidx < dd->num_pports;
+		     ++pidx) {
+			struct qib_pportdata *ppd = &dd->pport[pidx];
+
+			if (ppd->port != port || !ppd->link_speed_supported)
+				continue;
+			ppd->cpspec->no_eep = val;
+			if (seth1)
+				ppd->cpspec->h1_val = h1;
+			/* now change the IBC and serdes, overriding generic */
+			init_txdds_table(ppd, 1);
+			/* Re-enable the physical state machine on mezz boards
+			 * now that the correct settings have been set.
+			 * QSFP boards are handles by the QSFP event handler */
+			if (IS_QMH(dd) || IS_QME(dd))
+				qib_set_ib_7322_lstate(ppd, 0,
+					    QLOGIC_IB_IBCC_LINKINITCMD_SLEEP);
+			any++;
+		}
+		if (*nxt == '\n')
+			break; /* done */
+	}
+	if (change && !any) {
+		/* no specific setting, use the default.
+		 * Change the IBC and serdes, but since it's
+		 * general, don't override specific settings.
+		 */
+		for (pidx = 0; pidx < dd->num_pports; ++pidx)
+			if (dd->pport[pidx].link_speed_supported)
+				init_txdds_table(&dd->pport[pidx], 0);
+	}
+}
+
+/* handle the txselect parameter changing */
+static int setup_txselect(const char *str, struct kernel_param *kp)
+{
+	struct qib_devdata *dd;
+	unsigned long val;
+	char *n;
+	if (strlen(str) >= MAX_ATTEN_LEN) {
+		pr_info("txselect_values string too long\n");
+		return -ENOSPC;
+	}
+	val = simple_strtoul(str, &n, 0);
+	if (n == str || val >= (TXDDS_TABLE_SZ + TXDDS_EXTRA_SZ +
+				TXDDS_MFG_SZ)) {
+		pr_info("txselect_values must start with a number < %d\n",
+			TXDDS_TABLE_SZ + TXDDS_EXTRA_SZ + TXDDS_MFG_SZ);
+		return -EINVAL;
+	}
+	strcpy(txselect_list, str);
+
+	list_for_each_entry(dd, &qib_dev_list, list)
+		if (dd->deviceid == PCI_DEVICE_ID_QLOGIC_IB_7322)
+			set_no_qsfp_atten(dd, 1);
+	return 0;
+}
+
+/*
+ * Write the final few registers that depend on some of the
+ * init setup.  Done late in init, just before bringing up
+ * the serdes.
+ */
+static int qib_late_7322_initreg(struct qib_devdata *dd)
+{
+	int ret = 0, n;
+	u64 val;
+
+	qib_write_kreg(dd, kr_rcvhdrentsize, dd->rcvhdrentsize);
+	qib_write_kreg(dd, kr_rcvhdrsize, dd->rcvhdrsize);
+	qib_write_kreg(dd, kr_rcvhdrcnt, dd->rcvhdrcnt);
+	qib_write_kreg(dd, kr_sendpioavailaddr, dd->pioavailregs_phys);
+	val = qib_read_kreg64(dd, kr_sendpioavailaddr);
+	if (val != dd->pioavailregs_phys) {
+		qib_dev_err(dd,
+			"Catastrophic software error, SendPIOAvailAddr written as %lx, read back as %llx\n",
+			(unsigned long) dd->pioavailregs_phys,
+			(unsigned long long) val);
+		ret = -EINVAL;
+	}
+
+	n = dd->piobcnt2k + dd->piobcnt4k + NUM_VL15_BUFS;
+	qib_7322_txchk_change(dd, 0, n, TXCHK_CHG_TYPE_KERN, NULL);
+	/* driver sends get pkey, lid, etc. checking also, to catch bugs */
+	qib_7322_txchk_change(dd, 0, n, TXCHK_CHG_TYPE_ENAB1, NULL);
+
+	qib_register_observer(dd, &sendctrl_0_observer);
+	qib_register_observer(dd, &sendctrl_1_observer);
+
+	dd->control &= ~QLOGIC_IB_C_SDMAFETCHPRIOEN;
+	qib_write_kreg(dd, kr_control, dd->control);
+	/*
+	 * Set SendDmaFetchPriority and init Tx params, including
+	 * QSFP handler on boards that have QSFP.
+	 * First set our default attenuation entry for cables that
+	 * don't have valid attenuation.
+	 */
+	set_no_qsfp_atten(dd, 0);
+	for (n = 0; n < dd->num_pports; ++n) {
+		struct qib_pportdata *ppd = dd->pport + n;
+
+		qib_write_kreg_port(ppd, krp_senddmaprioritythld,
+				    sdma_fetch_prio & 0xf);
+		/* Initialize qsfp if present on board. */
+		if (dd->flags & QIB_HAS_QSFP)
+			qib_init_7322_qsfp(ppd);
+	}
+	dd->control |= QLOGIC_IB_C_SDMAFETCHPRIOEN;
+	qib_write_kreg(dd, kr_control, dd->control);
+
+	return ret;
+}
+
+/* per IB port errors.  */
+#define SENDCTRL_PIBP (MASK_ACROSS(0, 1) | MASK_ACROSS(3, 3) | \
+	MASK_ACROSS(8, 15))
+#define RCVCTRL_PIBP (MASK_ACROSS(0, 17) | MASK_ACROSS(39, 41))
+#define ERRS_PIBP (MASK_ACROSS(57, 58) | MASK_ACROSS(54, 54) | \
+	MASK_ACROSS(36, 49) | MASK_ACROSS(29, 34) | MASK_ACROSS(14, 17) | \
+	MASK_ACROSS(0, 11))
+
+/*
+ * Write the initialization per-port registers that need to be done at
+ * driver load and after reset completes (i.e., that aren't done as part
+ * of other init procedures called from qib_init.c).
+ * Some of these should be redundant on reset, but play safe.
+ */
+static void write_7322_init_portregs(struct qib_pportdata *ppd)
+{
+	u64 val;
+	int i;
+
+	if (!ppd->link_speed_supported) {
+		/* no buffer credits for this port */
+		for (i = 1; i < 8; i++)
+			qib_write_kreg_port(ppd, krp_rxcreditvl0 + i, 0);
+		qib_write_kreg_port(ppd, krp_ibcctrl_b, 0);
+		qib_write_kreg(ppd->dd, kr_scratch, 0);
+		return;
+	}
+
+	/*
+	 * Set the number of supported virtual lanes in IBC,
+	 * for flow control packet handling on unsupported VLs
+	 */
+	val = qib_read_kreg_port(ppd, krp_ibsdtestiftx);
+	val &= ~SYM_MASK(IB_SDTEST_IF_TX_0, VL_CAP);
+	val |= (u64)(ppd->vls_supported - 1) <<
+		SYM_LSB(IB_SDTEST_IF_TX_0, VL_CAP);
+	qib_write_kreg_port(ppd, krp_ibsdtestiftx, val);
+
+	qib_write_kreg_port(ppd, krp_rcvbthqp, QIB_KD_QP);
+
+	/* enable tx header checking */
+	qib_write_kreg_port(ppd, krp_sendcheckcontrol, IBA7322_SENDCHK_PKEY |
+			    IBA7322_SENDCHK_BTHQP | IBA7322_SENDCHK_SLID |
+			    IBA7322_SENDCHK_RAW_IPV6 | IBA7322_SENDCHK_MINSZ);
+
+	qib_write_kreg_port(ppd, krp_ncmodectrl,
+		SYM_MASK(IBNCModeCtrl_0, ScrambleCapLocal));
+
+	/*
+	 * Unconditionally clear the bufmask bits.  If SDMA is
+	 * enabled, we'll set them appropriately later.
+	 */
+	qib_write_kreg_port(ppd, krp_senddmabufmask0, 0);
+	qib_write_kreg_port(ppd, krp_senddmabufmask1, 0);
+	qib_write_kreg_port(ppd, krp_senddmabufmask2, 0);
+	if (ppd->dd->cspec->r1)
+		ppd->p_sendctrl |= SYM_MASK(SendCtrl_0, ForceCreditUpToDate);
+}
+
+/*
+ * Write the initialization per-device registers that need to be done at
+ * driver load and after reset completes (i.e., that aren't done as part
+ * of other init procedures called from qib_init.c).  Also write per-port
+ * registers that are affected by overall device config, such as QP mapping
+ * Some of these should be redundant on reset, but play safe.
+ */
+static void write_7322_initregs(struct qib_devdata *dd)
+{
+	struct qib_pportdata *ppd;
+	int i, pidx;
+	u64 val;
+
+	/* Set Multicast QPs received by port 2 to map to context one. */
+	qib_write_kreg(dd, KREG_IDX(RcvQPMulticastContext_1), 1);
+
+	for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+		unsigned n, regno;
+		unsigned long flags;
+
+		if (dd->n_krcv_queues < 2 ||
+			!dd->pport[pidx].link_speed_supported)
+			continue;
+
+		ppd = &dd->pport[pidx];
+
+		/* be paranoid against later code motion, etc. */
+		spin_lock_irqsave(&dd->cspec->rcvmod_lock, flags);
+		ppd->p_rcvctrl |= SYM_MASK(RcvCtrl_0, RcvQPMapEnable);
+		spin_unlock_irqrestore(&dd->cspec->rcvmod_lock, flags);
+
+		/* Initialize QP to context mapping */
+		regno = krp_rcvqpmaptable;
+		val = 0;
+		if (dd->num_pports > 1)
+			n = dd->first_user_ctxt / dd->num_pports;
+		else
+			n = dd->first_user_ctxt - 1;
+		for (i = 0; i < 32; ) {
+			unsigned ctxt;
+
+			if (dd->num_pports > 1)
+				ctxt = (i % n) * dd->num_pports + pidx;
+			else if (i % n)
+				ctxt = (i % n) + 1;
+			else
+				ctxt = ppd->hw_pidx;
+			val |= ctxt << (5 * (i % 6));
+			i++;
+			if (i % 6 == 0) {
+				qib_write_kreg_port(ppd, regno, val);
+				val = 0;
+				regno++;
+			}
+		}
+		qib_write_kreg_port(ppd, regno, val);
+	}
+
+	/*
+	 * Setup up interrupt mitigation for kernel contexts, but
+	 * not user contexts (user contexts use interrupts when
+	 * stalled waiting for any packet, so want those interrupts
+	 * right away).
+	 */
+	for (i = 0; i < dd->first_user_ctxt; i++) {
+		dd->cspec->rcvavail_timeout[i] = rcv_int_timeout;
+		qib_write_kreg(dd, kr_rcvavailtimeout + i, rcv_int_timeout);
+	}
+
+	/*
+	 * Initialize  as (disabled) rcvflow tables.  Application code
+	 * will setup each flow as it uses the flow.
+	 * Doesn't clear any of the error bits that might be set.
+	 */
+	val = TIDFLOW_ERRBITS; /* these are W1C */
+	for (i = 0; i < dd->cfgctxts; i++) {
+		int flow;
+		for (flow = 0; flow < NUM_TIDFLOWS_CTXT; flow++)
+			qib_write_ureg(dd, ur_rcvflowtable+flow, val, i);
+	}
+
+	/*
+	 * dual cards init to dual port recovery, single port cards to
+	 * the one port.  Dual port cards may later adjust to 1 port,
+	 * and then back to dual port if both ports are connected
+	 * */
+	if (dd->num_pports)
+		setup_7322_link_recovery(dd->pport, dd->num_pports > 1);
+}
+
+static int qib_init_7322_variables(struct qib_devdata *dd)
+{
+	struct qib_pportdata *ppd;
+	unsigned features, pidx, sbufcnt;
+	int ret, mtu;
+	u32 sbufs, updthresh;
+
+	/* pport structs are contiguous, allocated after devdata */
+	ppd = (struct qib_pportdata *)(dd + 1);
+	dd->pport = ppd;
+	ppd[0].dd = dd;
+	ppd[1].dd = dd;
+
+	dd->cspec = (struct qib_chip_specific *)(ppd + 2);
+
+	ppd[0].cpspec = (struct qib_chippport_specific *)(dd->cspec + 1);
+	ppd[1].cpspec = &ppd[0].cpspec[1];
+	ppd[0].cpspec->ppd = &ppd[0]; /* for autoneg_7322_work() */
+	ppd[1].cpspec->ppd = &ppd[1]; /* for autoneg_7322_work() */
+
+	spin_lock_init(&dd->cspec->rcvmod_lock);
+	spin_lock_init(&dd->cspec->gpio_lock);
+
+	/* we haven't yet set QIB_PRESENT, so use read directly */
+	dd->revision = readq(&dd->kregbase[kr_revision]);
+
+	if ((dd->revision & 0xffffffffU) == 0xffffffffU) {
+		qib_dev_err(dd,
+			"Revision register read failure, giving up initialization\n");
+		ret = -ENODEV;
+		goto bail;
+	}
+	dd->flags |= QIB_PRESENT;  /* now register routines work */
+
+	dd->majrev = (u8) SYM_FIELD(dd->revision, Revision_R, ChipRevMajor);
+	dd->minrev = (u8) SYM_FIELD(dd->revision, Revision_R, ChipRevMinor);
+	dd->cspec->r1 = dd->minrev == 1;
+
+	get_7322_chip_params(dd);
+	features = qib_7322_boardname(dd);
+
+	/* now that piobcnt2k and 4k set, we can allocate these */
+	sbufcnt = dd->piobcnt2k + dd->piobcnt4k +
+		NUM_VL15_BUFS + BITS_PER_LONG - 1;
+	sbufcnt /= BITS_PER_LONG;
+	dd->cspec->sendchkenable = kmalloc(sbufcnt *
+		sizeof(*dd->cspec->sendchkenable), GFP_KERNEL);
+	dd->cspec->sendgrhchk = kmalloc(sbufcnt *
+		sizeof(*dd->cspec->sendgrhchk), GFP_KERNEL);
+	dd->cspec->sendibchk = kmalloc(sbufcnt *
+		sizeof(*dd->cspec->sendibchk), GFP_KERNEL);
+	if (!dd->cspec->sendchkenable || !dd->cspec->sendgrhchk ||
+		!dd->cspec->sendibchk) {
+		qib_dev_err(dd, "Failed allocation for hdrchk bitmaps\n");
+		ret = -ENOMEM;
+		goto bail;
+	}
+
+	ppd = dd->pport;
+
+	/*
+	 * GPIO bits for TWSI data and clock,
+	 * used for serial EEPROM.
+	 */
+	dd->gpio_sda_num = _QIB_GPIO_SDA_NUM;
+	dd->gpio_scl_num = _QIB_GPIO_SCL_NUM;
+	dd->twsi_eeprom_dev = QIB_TWSI_EEPROM_DEV;
+
+	dd->flags |= QIB_HAS_INTX | QIB_HAS_LINK_LATENCY |
+		QIB_NODMA_RTAIL | QIB_HAS_VLSUPP | QIB_HAS_HDRSUPP |
+		QIB_HAS_THRESH_UPDATE |
+		(sdma_idle_cnt ? QIB_HAS_SDMA_TIMEOUT : 0);
+	dd->flags |= qib_special_trigger ?
+		QIB_USE_SPCL_TRIG : QIB_HAS_SEND_DMA;
+
+	/*
+	 * Setup initial values.  These may change when PAT is enabled, but
+	 * we need these to do initial chip register accesses.
+	 */
+	qib_7322_set_baseaddrs(dd);
+
+	mtu = ib_mtu_enum_to_int(qib_ibmtu);
+	if (mtu == -1)
+		mtu = QIB_DEFAULT_MTU;
+
+	dd->cspec->int_enable_mask = QIB_I_BITSEXTANT;
+	/* all hwerrors become interrupts, unless special purposed */
+	dd->cspec->hwerrmask = ~0ULL;
+	/*  link_recovery setup causes these errors, so ignore them,
+	 *  other than clearing them when they occur */
+	dd->cspec->hwerrmask &=
+		~(SYM_MASK(HwErrMask, IBSerdesPClkNotDetectMask_0) |
+		  SYM_MASK(HwErrMask, IBSerdesPClkNotDetectMask_1) |
+		  HWE_MASK(LATriggered));
+
+	for (pidx = 0; pidx < NUM_IB_PORTS; ++pidx) {
+		struct qib_chippport_specific *cp = ppd->cpspec;
+		ppd->link_speed_supported = features & PORT_SPD_CAP;
+		features >>=  PORT_SPD_CAP_SHIFT;
+		if (!ppd->link_speed_supported) {
+			/* single port mode (7340, or configured) */
+			dd->skip_kctxt_mask |= 1 << pidx;
+			if (pidx == 0) {
+				/* Make sure port is disabled. */
+				qib_write_kreg_port(ppd, krp_rcvctrl, 0);
+				qib_write_kreg_port(ppd, krp_ibcctrl_a, 0);
+				ppd[0] = ppd[1];
+				dd->cspec->hwerrmask &= ~(SYM_MASK(HwErrMask,
+						  IBSerdesPClkNotDetectMask_0)
+						  | SYM_MASK(HwErrMask,
+						  SDmaMemReadErrMask_0));
+				dd->cspec->int_enable_mask &= ~(
+				     SYM_MASK(IntMask, SDmaCleanupDoneMask_0) |
+				     SYM_MASK(IntMask, SDmaIdleIntMask_0) |
+				     SYM_MASK(IntMask, SDmaProgressIntMask_0) |
+				     SYM_MASK(IntMask, SDmaIntMask_0) |
+				     SYM_MASK(IntMask, ErrIntMask_0) |
+				     SYM_MASK(IntMask, SendDoneIntMask_0));
+			} else {
+				/* Make sure port is disabled. */
+				qib_write_kreg_port(ppd, krp_rcvctrl, 0);
+				qib_write_kreg_port(ppd, krp_ibcctrl_a, 0);
+				dd->cspec->hwerrmask &= ~(SYM_MASK(HwErrMask,
+						  IBSerdesPClkNotDetectMask_1)
+						  | SYM_MASK(HwErrMask,
+						  SDmaMemReadErrMask_1));
+				dd->cspec->int_enable_mask &= ~(
+				     SYM_MASK(IntMask, SDmaCleanupDoneMask_1) |
+				     SYM_MASK(IntMask, SDmaIdleIntMask_1) |
+				     SYM_MASK(IntMask, SDmaProgressIntMask_1) |
+				     SYM_MASK(IntMask, SDmaIntMask_1) |
+				     SYM_MASK(IntMask, ErrIntMask_1) |
+				     SYM_MASK(IntMask, SendDoneIntMask_1));
+			}
+			continue;
+		}
+
+		dd->num_pports++;
+		ret = qib_init_pportdata(ppd, dd, pidx, dd->num_pports);
+		if (ret) {
+			dd->num_pports--;
+			goto bail;
+		}
+
+		ppd->link_width_supported = IB_WIDTH_1X | IB_WIDTH_4X;
+		ppd->link_width_enabled = IB_WIDTH_4X;
+		ppd->link_speed_enabled = ppd->link_speed_supported;
+		/*
+		 * Set the initial values to reasonable default, will be set
+		 * for real when link is up.
+		 */
+		ppd->link_width_active = IB_WIDTH_4X;
+		ppd->link_speed_active = QIB_IB_SDR;
+		ppd->delay_mult = ib_rate_to_delay[IB_RATE_10_GBPS];
+		switch (qib_num_cfg_vls) {
+		case 1:
+			ppd->vls_supported = IB_VL_VL0;
+			break;
+		case 2:
+			ppd->vls_supported = IB_VL_VL0_1;
+			break;
+		default:
+			qib_devinfo(dd->pcidev,
+				    "Invalid num_vls %u, using 4 VLs\n",
+				    qib_num_cfg_vls);
+			qib_num_cfg_vls = 4;
+			/* fall through */
+		case 4:
+			ppd->vls_supported = IB_VL_VL0_3;
+			break;
+		case 8:
+			if (mtu <= 2048)
+				ppd->vls_supported = IB_VL_VL0_7;
+			else {
+				qib_devinfo(dd->pcidev,
+					    "Invalid num_vls %u for MTU %d "
+					    ", using 4 VLs\n",
+					    qib_num_cfg_vls, mtu);
+				ppd->vls_supported = IB_VL_VL0_3;
+				qib_num_cfg_vls = 4;
+			}
+			break;
+		}
+		ppd->vls_operational = ppd->vls_supported;
+
+		init_waitqueue_head(&cp->autoneg_wait);
+		INIT_DELAYED_WORK(&cp->autoneg_work,
+				  autoneg_7322_work);
+		if (ppd->dd->cspec->r1)
+			INIT_DELAYED_WORK(&cp->ipg_work, ipg_7322_work);
+
+		/*
+		 * For Mez and similar cards, no qsfp info, so do
+		 * the "cable info" setup here.  Can be overridden
+		 * in adapter-specific routines.
+		 */
+		if (!(dd->flags & QIB_HAS_QSFP)) {
+			if (!IS_QMH(dd) && !IS_QME(dd))
+				qib_devinfo(dd->pcidev,
+					"IB%u:%u: Unknown mezzanine card type\n",
+					dd->unit, ppd->port);
+			cp->h1_val = IS_QMH(dd) ? H1_FORCE_QMH : H1_FORCE_QME;
+			/*
+			 * Choose center value as default tx serdes setting
+			 * until changed through module parameter.
+			 */
+			ppd->cpspec->no_eep = IS_QMH(dd) ?
+				TXDDS_TABLE_SZ + 2 : TXDDS_TABLE_SZ + 4;
+		} else
+			cp->h1_val = H1_FORCE_VAL;
+
+		/* Avoid writes to chip for mini_init */
+		if (!qib_mini_init)
+			write_7322_init_portregs(ppd);
+
+		init_timer(&cp->chase_timer);
+		cp->chase_timer.function = reenable_chase;
+		cp->chase_timer.data = (unsigned long)ppd;
+
+		ppd++;
+	}
+
+	dd->rcvhdrentsize = qib_rcvhdrentsize ?
+		qib_rcvhdrentsize : QIB_RCVHDR_ENTSIZE;
+	dd->rcvhdrsize = qib_rcvhdrsize ?
+		qib_rcvhdrsize : QIB_DFLT_RCVHDRSIZE;
+	dd->rhf_offset = dd->rcvhdrentsize - sizeof(u64) / sizeof(u32);
+
+	/* we always allocate at least 2048 bytes for eager buffers */
+	dd->rcvegrbufsize = max(mtu, 2048);
+	BUG_ON(!is_power_of_2(dd->rcvegrbufsize));
+	dd->rcvegrbufsize_shift = ilog2(dd->rcvegrbufsize);
+
+	qib_7322_tidtemplate(dd);
+
+	/*
+	 * We can request a receive interrupt for 1 or
+	 * more packets from current offset.
+	 */
+	dd->rhdrhead_intr_off =
+		(u64) rcv_int_count << IBA7322_HDRHEAD_PKTINT_SHIFT;
+
+	/* setup the stats timer; the add_timer is done at end of init */
+	init_timer(&dd->stats_timer);
+	dd->stats_timer.function = qib_get_7322_faststats;
+	dd->stats_timer.data = (unsigned long) dd;
+
+	dd->ureg_align = 0x10000;  /* 64KB alignment */
+
+	dd->piosize2kmax_dwords = dd->piosize2k >> 2;
+
+	qib_7322_config_ctxts(dd);
+	qib_set_ctxtcnt(dd);
+
+	if (qib_wc_pat) {
+		resource_size_t vl15off;
+		/*
+		 * We do not set WC on the VL15 buffers to avoid
+		 * a rare problem with unaligned writes from
+		 * interrupt-flushed store buffers, so we need
+		 * to map those separately here.  We can't solve
+		 * this for the rarely used mtrr case.
+		 */
+		ret = init_chip_wc_pat(dd, 0);
+		if (ret)
+			goto bail;
+
+		/* vl15 buffers start just after the 4k buffers */
+		vl15off = dd->physaddr + (dd->piobufbase >> 32) +
+			dd->piobcnt4k * dd->align4k;
+		dd->piovl15base	= ioremap_nocache(vl15off,
+						  NUM_VL15_BUFS * dd->align4k);
+		if (!dd->piovl15base) {
+			ret = -ENOMEM;
+			goto bail;
+		}
+	}
+	qib_7322_set_baseaddrs(dd); /* set chip access pointers now */
+
+	ret = 0;
+	if (qib_mini_init)
+		goto bail;
+	if (!dd->num_pports) {
+		qib_dev_err(dd, "No ports enabled, giving up initialization\n");
+		goto bail; /* no error, so can still figure out why err */
+	}
+
+	write_7322_initregs(dd);
+	ret = qib_create_ctxts(dd);
+	init_7322_cntrnames(dd);
+
+	updthresh = 8U; /* update threshold */
+
+	/* use all of 4KB buffers for the kernel SDMA, zero if !SDMA.
+	 * reserve the update threshold amount for other kernel use, such
+	 * as sending SMI, MAD, and ACKs, or 3, whichever is greater,
+	 * unless we aren't enabling SDMA, in which case we want to use
+	 * all the 4k bufs for the kernel.
+	 * if this was less than the update threshold, we could wait
+	 * a long time for an update.  Coded this way because we
+	 * sometimes change the update threshold for various reasons,
+	 * and we want this to remain robust.
+	 */
+	if (dd->flags & QIB_HAS_SEND_DMA) {
+		dd->cspec->sdmabufcnt = dd->piobcnt4k;
+		sbufs = updthresh > 3 ? updthresh : 3;
+	} else {
+		dd->cspec->sdmabufcnt = 0;
+		sbufs = dd->piobcnt4k;
+	}
+	dd->cspec->lastbuf_for_pio = dd->piobcnt2k + dd->piobcnt4k -
+		dd->cspec->sdmabufcnt;
+	dd->lastctxt_piobuf = dd->cspec->lastbuf_for_pio - sbufs;
+	dd->cspec->lastbuf_for_pio--; /* range is <= , not < */
+	dd->last_pio = dd->cspec->lastbuf_for_pio;
+	dd->pbufsctxt = (dd->cfgctxts > dd->first_user_ctxt) ?
+		dd->lastctxt_piobuf / (dd->cfgctxts - dd->first_user_ctxt) : 0;
+
+	/*
+	 * If we have 16 user contexts, we will have 7 sbufs
+	 * per context, so reduce the update threshold to match.  We
+	 * want to update before we actually run out, at low pbufs/ctxt
+	 * so give ourselves some margin.
+	 */
+	if (dd->pbufsctxt >= 2 && dd->pbufsctxt - 2 < updthresh)
+		updthresh = dd->pbufsctxt - 2;
+	dd->cspec->updthresh_dflt = updthresh;
+	dd->cspec->updthresh = updthresh;
+
+	/* before full enable, no interrupts, no locking needed */
+	dd->sendctrl |= ((updthresh & SYM_RMASK(SendCtrl, AvailUpdThld))
+			     << SYM_LSB(SendCtrl, AvailUpdThld)) |
+			SYM_MASK(SendCtrl, SendBufAvailPad64Byte);
+
+	dd->psxmitwait_supported = 1;
+	dd->psxmitwait_check_rate = QIB_7322_PSXMITWAIT_CHECK_RATE;
+bail:
+	if (!dd->ctxtcnt)
+		dd->ctxtcnt = 1; /* for other initialization code */
+
+	return ret;
+}
+
+static u32 __iomem *qib_7322_getsendbuf(struct qib_pportdata *ppd, u64 pbc,
+					u32 *pbufnum)
+{
+	u32 first, last, plen = pbc & QIB_PBC_LENGTH_MASK;
+	struct qib_devdata *dd = ppd->dd;
+
+	/* last is same for 2k and 4k, because we use 4k if all 2k busy */
+	if (pbc & PBC_7322_VL15_SEND) {
+		first = dd->piobcnt2k + dd->piobcnt4k + ppd->hw_pidx;
+		last = first;
+	} else {
+		if ((plen + 1) > dd->piosize2kmax_dwords)
+			first = dd->piobcnt2k;
+		else
+			first = 0;
+		last = dd->cspec->lastbuf_for_pio;
+	}
+	return qib_getsendbuf_range(dd, pbufnum, first, last);
+}
+
+static void qib_set_cntr_7322_sample(struct qib_pportdata *ppd, u32 intv,
+				     u32 start)
+{
+	qib_write_kreg_port(ppd, krp_psinterval, intv);
+	qib_write_kreg_port(ppd, krp_psstart, start);
+}
+
+/*
+ * Must be called with sdma_lock held, or before init finished.
+ */
+static void qib_sdma_set_7322_desc_cnt(struct qib_pportdata *ppd, unsigned cnt)
+{
+	qib_write_kreg_port(ppd, krp_senddmadesccnt, cnt);
+}
+
+/*
+ * sdma_lock should be acquired before calling this routine
+ */
+static void dump_sdma_7322_state(struct qib_pportdata *ppd)
+{
+	u64 reg, reg1, reg2;
+
+	reg = qib_read_kreg_port(ppd, krp_senddmastatus);
+	qib_dev_porterr(ppd->dd, ppd->port,
+		"SDMA senddmastatus: 0x%016llx\n", reg);
+
+	reg = qib_read_kreg_port(ppd, krp_sendctrl);
+	qib_dev_porterr(ppd->dd, ppd->port,
+		"SDMA sendctrl: 0x%016llx\n", reg);
+
+	reg = qib_read_kreg_port(ppd, krp_senddmabase);
+	qib_dev_porterr(ppd->dd, ppd->port,
+		"SDMA senddmabase: 0x%016llx\n", reg);
+
+	reg = qib_read_kreg_port(ppd, krp_senddmabufmask0);
+	reg1 = qib_read_kreg_port(ppd, krp_senddmabufmask1);
+	reg2 = qib_read_kreg_port(ppd, krp_senddmabufmask2);
+	qib_dev_porterr(ppd->dd, ppd->port,
+		"SDMA senddmabufmask 0:%llx  1:%llx  2:%llx\n",
+		 reg, reg1, reg2);
+
+	/* get bufuse bits, clear them, and print them again if non-zero */
+	reg = qib_read_kreg_port(ppd, krp_senddmabuf_use0);
+	qib_write_kreg_port(ppd, krp_senddmabuf_use0, reg);
+	reg1 = qib_read_kreg_port(ppd, krp_senddmabuf_use1);
+	qib_write_kreg_port(ppd, krp_senddmabuf_use0, reg1);
+	reg2 = qib_read_kreg_port(ppd, krp_senddmabuf_use2);
+	qib_write_kreg_port(ppd, krp_senddmabuf_use0, reg2);
+	/* 0 and 1 should always be zero, so print as short form */
+	qib_dev_porterr(ppd->dd, ppd->port,
+		 "SDMA current senddmabuf_use 0:%llx  1:%llx  2:%llx\n",
+		 reg, reg1, reg2);
+	reg = qib_read_kreg_port(ppd, krp_senddmabuf_use0);
+	reg1 = qib_read_kreg_port(ppd, krp_senddmabuf_use1);
+	reg2 = qib_read_kreg_port(ppd, krp_senddmabuf_use2);
+	/* 0 and 1 should always be zero, so print as short form */
+	qib_dev_porterr(ppd->dd, ppd->port,
+		 "SDMA cleared senddmabuf_use 0:%llx  1:%llx  2:%llx\n",
+		 reg, reg1, reg2);
+
+	reg = qib_read_kreg_port(ppd, krp_senddmatail);
+	qib_dev_porterr(ppd->dd, ppd->port,
+		"SDMA senddmatail: 0x%016llx\n", reg);
+
+	reg = qib_read_kreg_port(ppd, krp_senddmahead);
+	qib_dev_porterr(ppd->dd, ppd->port,
+		"SDMA senddmahead: 0x%016llx\n", reg);
+
+	reg = qib_read_kreg_port(ppd, krp_senddmaheadaddr);
+	qib_dev_porterr(ppd->dd, ppd->port,
+		"SDMA senddmaheadaddr: 0x%016llx\n", reg);
+
+	reg = qib_read_kreg_port(ppd, krp_senddmalengen);
+	qib_dev_porterr(ppd->dd, ppd->port,
+		"SDMA senddmalengen: 0x%016llx\n", reg);
+
+	reg = qib_read_kreg_port(ppd, krp_senddmadesccnt);
+	qib_dev_porterr(ppd->dd, ppd->port,
+		"SDMA senddmadesccnt: 0x%016llx\n", reg);
+
+	reg = qib_read_kreg_port(ppd, krp_senddmaidlecnt);
+	qib_dev_porterr(ppd->dd, ppd->port,
+		"SDMA senddmaidlecnt: 0x%016llx\n", reg);
+
+	reg = qib_read_kreg_port(ppd, krp_senddmaprioritythld);
+	qib_dev_porterr(ppd->dd, ppd->port,
+		"SDMA senddmapriorityhld: 0x%016llx\n", reg);
+
+	reg = qib_read_kreg_port(ppd, krp_senddmareloadcnt);
+	qib_dev_porterr(ppd->dd, ppd->port,
+		"SDMA senddmareloadcnt: 0x%016llx\n", reg);
+
+	dump_sdma_state(ppd);
+}
+
+static struct sdma_set_state_action sdma_7322_action_table[] = {
+	[qib_sdma_state_s00_hw_down] = {
+		.go_s99_running_tofalse = 1,
+		.op_enable = 0,
+		.op_intenable = 0,
+		.op_halt = 0,
+		.op_drain = 0,
+	},
+	[qib_sdma_state_s10_hw_start_up_wait] = {
+		.op_enable = 0,
+		.op_intenable = 1,
+		.op_halt = 1,
+		.op_drain = 0,
+	},
+	[qib_sdma_state_s20_idle] = {
+		.op_enable = 1,
+		.op_intenable = 1,
+		.op_halt = 1,
+		.op_drain = 0,
+	},
+	[qib_sdma_state_s30_sw_clean_up_wait] = {
+		.op_enable = 0,
+		.op_intenable = 1,
+		.op_halt = 1,
+		.op_drain = 0,
+	},
+	[qib_sdma_state_s40_hw_clean_up_wait] = {
+		.op_enable = 1,
+		.op_intenable = 1,
+		.op_halt = 1,
+		.op_drain = 0,
+	},
+	[qib_sdma_state_s50_hw_halt_wait] = {
+		.op_enable = 1,
+		.op_intenable = 1,
+		.op_halt = 1,
+		.op_drain = 1,
+	},
+	[qib_sdma_state_s99_running] = {
+		.op_enable = 1,
+		.op_intenable = 1,
+		.op_halt = 0,
+		.op_drain = 0,
+		.go_s99_running_totrue = 1,
+	},
+};
+
+static void qib_7322_sdma_init_early(struct qib_pportdata *ppd)
+{
+	ppd->sdma_state.set_state_action = sdma_7322_action_table;
+}
+
+static int init_sdma_7322_regs(struct qib_pportdata *ppd)
+{
+	struct qib_devdata *dd = ppd->dd;
+	unsigned lastbuf, erstbuf;
+	u64 senddmabufmask[3] = { 0 };
+	int n, ret = 0;
+
+	qib_write_kreg_port(ppd, krp_senddmabase, ppd->sdma_descq_phys);
+	qib_sdma_7322_setlengen(ppd);
+	qib_sdma_update_7322_tail(ppd, 0); /* Set SendDmaTail */
+	qib_write_kreg_port(ppd, krp_senddmareloadcnt, sdma_idle_cnt);
+	qib_write_kreg_port(ppd, krp_senddmadesccnt, 0);
+	qib_write_kreg_port(ppd, krp_senddmaheadaddr, ppd->sdma_head_phys);
+
+	if (dd->num_pports)
+		n = dd->cspec->sdmabufcnt / dd->num_pports; /* no remainder */
+	else
+		n = dd->cspec->sdmabufcnt; /* failsafe for init */
+	erstbuf = (dd->piobcnt2k + dd->piobcnt4k) -
+		((dd->num_pports == 1 || ppd->port == 2) ? n :
+		dd->cspec->sdmabufcnt);
+	lastbuf = erstbuf + n;
+
+	ppd->sdma_state.first_sendbuf = erstbuf;
+	ppd->sdma_state.last_sendbuf = lastbuf;
+	for (; erstbuf < lastbuf; ++erstbuf) {
+		unsigned word = erstbuf / BITS_PER_LONG;
+		unsigned bit = erstbuf & (BITS_PER_LONG - 1);
+
+		BUG_ON(word >= 3);
+		senddmabufmask[word] |= 1ULL << bit;
+	}
+	qib_write_kreg_port(ppd, krp_senddmabufmask0, senddmabufmask[0]);
+	qib_write_kreg_port(ppd, krp_senddmabufmask1, senddmabufmask[1]);
+	qib_write_kreg_port(ppd, krp_senddmabufmask2, senddmabufmask[2]);
+	return ret;
+}
+
+/* sdma_lock must be held */
+static u16 qib_sdma_7322_gethead(struct qib_pportdata *ppd)
+{
+	struct qib_devdata *dd = ppd->dd;
+	int sane;
+	int use_dmahead;
+	u16 swhead;
+	u16 swtail;
+	u16 cnt;
+	u16 hwhead;
+
+	use_dmahead = __qib_sdma_running(ppd) &&
+		(dd->flags & QIB_HAS_SDMA_TIMEOUT);
+retry:
+	hwhead = use_dmahead ?
+		(u16) le64_to_cpu(*ppd->sdma_head_dma) :
+		(u16) qib_read_kreg_port(ppd, krp_senddmahead);
+
+	swhead = ppd->sdma_descq_head;
+	swtail = ppd->sdma_descq_tail;
+	cnt = ppd->sdma_descq_cnt;
+
+	if (swhead < swtail)
+		/* not wrapped */
+		sane = (hwhead >= swhead) & (hwhead <= swtail);
+	else if (swhead > swtail)
+		/* wrapped around */
+		sane = ((hwhead >= swhead) && (hwhead < cnt)) ||
+			(hwhead <= swtail);
+	else
+		/* empty */
+		sane = (hwhead == swhead);
+
+	if (unlikely(!sane)) {
+		if (use_dmahead) {
+			/* try one more time, directly from the register */
+			use_dmahead = 0;
+			goto retry;
+		}
+		/* proceed as if no progress */
+		hwhead = swhead;
+	}
+
+	return hwhead;
+}
+
+static int qib_sdma_7322_busy(struct qib_pportdata *ppd)
+{
+	u64 hwstatus = qib_read_kreg_port(ppd, krp_senddmastatus);
+
+	return (hwstatus & SYM_MASK(SendDmaStatus_0, ScoreBoardDrainInProg)) ||
+	       (hwstatus & SYM_MASK(SendDmaStatus_0, HaltInProg)) ||
+	       !(hwstatus & SYM_MASK(SendDmaStatus_0, InternalSDmaHalt)) ||
+	       !(hwstatus & SYM_MASK(SendDmaStatus_0, ScbEmpty));
+}
+
+/*
+ * Compute the amount of delay before sending the next packet if the
+ * port's send rate differs from the static rate set for the QP.
+ * The delay affects the next packet and the amount of the delay is
+ * based on the length of the this packet.
+ */
+static u32 qib_7322_setpbc_control(struct qib_pportdata *ppd, u32 plen,
+				   u8 srate, u8 vl)
+{
+	u8 snd_mult = ppd->delay_mult;
+	u8 rcv_mult = ib_rate_to_delay[srate];
+	u32 ret;
+
+	ret = rcv_mult > snd_mult ? ((plen + 1) >> 1) * snd_mult : 0;
+
+	/* Indicate VL15, else set the VL in the control word */
+	if (vl == 15)
+		ret |= PBC_7322_VL15_SEND_CTRL;
+	else
+		ret |= vl << PBC_VL_NUM_LSB;
+	ret |= ((u32)(ppd->hw_pidx)) << PBC_PORT_SEL_LSB;
+
+	return ret;
+}
+
+/*
+ * Enable the per-port VL15 send buffers for use.
+ * They follow the rest of the buffers, without a config parameter.
+ * This was in initregs, but that is done before the shadow
+ * is set up, and this has to be done after the shadow is
+ * set up.
+ */
+static void qib_7322_initvl15_bufs(struct qib_devdata *dd)
+{
+	unsigned vl15bufs;
+
+	vl15bufs = dd->piobcnt2k + dd->piobcnt4k;
+	qib_chg_pioavailkernel(dd, vl15bufs, NUM_VL15_BUFS,
+			       TXCHK_CHG_TYPE_KERN, NULL);
+}
+
+static void qib_7322_init_ctxt(struct qib_ctxtdata *rcd)
+{
+	if (rcd->ctxt < NUM_IB_PORTS) {
+		if (rcd->dd->num_pports > 1) {
+			rcd->rcvegrcnt = KCTXT0_EGRCNT / 2;
+			rcd->rcvegr_tid_base = rcd->ctxt ? rcd->rcvegrcnt : 0;
+		} else {
+			rcd->rcvegrcnt = KCTXT0_EGRCNT;
+			rcd->rcvegr_tid_base = 0;
+		}
+	} else {
+		rcd->rcvegrcnt = rcd->dd->cspec->rcvegrcnt;
+		rcd->rcvegr_tid_base = KCTXT0_EGRCNT +
+			(rcd->ctxt - NUM_IB_PORTS) * rcd->rcvegrcnt;
+	}
+}
+
+#define QTXSLEEPS 5000
+static void qib_7322_txchk_change(struct qib_devdata *dd, u32 start,
+				  u32 len, u32 which, struct qib_ctxtdata *rcd)
+{
+	int i;
+	const int last = start + len - 1;
+	const int lastr = last / BITS_PER_LONG;
+	u32 sleeps = 0;
+	int wait = rcd != NULL;
+	unsigned long flags;
+
+	while (wait) {
+		unsigned long shadow;
+		int cstart, previ = -1;
+
+		/*
+		 * when flipping from kernel to user, we can't change
+		 * the checking type if the buffer is allocated to the
+		 * driver.   It's OK the other direction, because it's
+		 * from close, and we have just disarm'ed all the
+		 * buffers.  All the kernel to kernel changes are also
+		 * OK.
+		 */
+		for (cstart = start; cstart <= last; cstart++) {
+			i = ((2 * cstart) + QLOGIC_IB_SENDPIOAVAIL_BUSY_SHIFT)
+				/ BITS_PER_LONG;
+			if (i != previ) {
+				shadow = (unsigned long)
+					le64_to_cpu(dd->pioavailregs_dma[i]);
+				previ = i;
+			}
+			if (test_bit(((2 * cstart) +
+				      QLOGIC_IB_SENDPIOAVAIL_BUSY_SHIFT)
+				     % BITS_PER_LONG, &shadow))
+				break;
+		}
+
+		if (cstart > last)
+			break;
+
+		if (sleeps == QTXSLEEPS)
+			break;
+		/* make sure we see an updated copy next time around */
+		sendctrl_7322_mod(dd->pport, QIB_SENDCTRL_AVAIL_BLIP);
+		sleeps++;
+		msleep(20);
+	}
+
+	switch (which) {
+	case TXCHK_CHG_TYPE_DIS1:
+		/*
+		 * disable checking on a range; used by diags; just
+		 * one buffer, but still written generically
+		 */
+		for (i = start; i <= last; i++)
+			clear_bit(i, dd->cspec->sendchkenable);
+		break;
+
+	case TXCHK_CHG_TYPE_ENAB1:
+		/*
+		 * (re)enable checking on a range; used by diags; just
+		 * one buffer, but still written generically; read
+		 * scratch to be sure buffer actually triggered, not
+		 * just flushed from processor.
+		 */
+		qib_read_kreg32(dd, kr_scratch);
+		for (i = start; i <= last; i++)
+			set_bit(i, dd->cspec->sendchkenable);
+		break;
+
+	case TXCHK_CHG_TYPE_KERN:
+		/* usable by kernel */
+		for (i = start; i <= last; i++) {
+			set_bit(i, dd->cspec->sendibchk);
+			clear_bit(i, dd->cspec->sendgrhchk);
+		}
+		spin_lock_irqsave(&dd->uctxt_lock, flags);
+		/* see if we need to raise avail update threshold */
+		for (i = dd->first_user_ctxt;
+		     dd->cspec->updthresh != dd->cspec->updthresh_dflt
+		     && i < dd->cfgctxts; i++)
+			if (dd->rcd[i] && dd->rcd[i]->subctxt_cnt &&
+			   ((dd->rcd[i]->piocnt / dd->rcd[i]->subctxt_cnt) - 1)
+			   < dd->cspec->updthresh_dflt)
+				break;
+		spin_unlock_irqrestore(&dd->uctxt_lock, flags);
+		if (i == dd->cfgctxts) {
+			spin_lock_irqsave(&dd->sendctrl_lock, flags);
+			dd->cspec->updthresh = dd->cspec->updthresh_dflt;
+			dd->sendctrl &= ~SYM_MASK(SendCtrl, AvailUpdThld);
+			dd->sendctrl |= (dd->cspec->updthresh &
+					 SYM_RMASK(SendCtrl, AvailUpdThld)) <<
+					   SYM_LSB(SendCtrl, AvailUpdThld);
+			spin_unlock_irqrestore(&dd->sendctrl_lock, flags);
+			sendctrl_7322_mod(dd->pport, QIB_SENDCTRL_AVAIL_BLIP);
+		}
+		break;
+
+	case TXCHK_CHG_TYPE_USER:
+		/* for user process */
+		for (i = start; i <= last; i++) {
+			clear_bit(i, dd->cspec->sendibchk);
+			set_bit(i, dd->cspec->sendgrhchk);
+		}
+		spin_lock_irqsave(&dd->sendctrl_lock, flags);
+		if (rcd && rcd->subctxt_cnt && ((rcd->piocnt
+			/ rcd->subctxt_cnt) - 1) < dd->cspec->updthresh) {
+			dd->cspec->updthresh = (rcd->piocnt /
+						rcd->subctxt_cnt) - 1;
+			dd->sendctrl &= ~SYM_MASK(SendCtrl, AvailUpdThld);
+			dd->sendctrl |= (dd->cspec->updthresh &
+					SYM_RMASK(SendCtrl, AvailUpdThld))
+					<< SYM_LSB(SendCtrl, AvailUpdThld);
+			spin_unlock_irqrestore(&dd->sendctrl_lock, flags);
+			sendctrl_7322_mod(dd->pport, QIB_SENDCTRL_AVAIL_BLIP);
+		} else
+			spin_unlock_irqrestore(&dd->sendctrl_lock, flags);
+		break;
+
+	default:
+		break;
+	}
+
+	for (i = start / BITS_PER_LONG; which >= 2 && i <= lastr; ++i)
+		qib_write_kreg(dd, kr_sendcheckmask + i,
+			       dd->cspec->sendchkenable[i]);
+
+	for (i = start / BITS_PER_LONG; which < 2 && i <= lastr; ++i) {
+		qib_write_kreg(dd, kr_sendgrhcheckmask + i,
+			       dd->cspec->sendgrhchk[i]);
+		qib_write_kreg(dd, kr_sendibpktmask + i,
+			       dd->cspec->sendibchk[i]);
+	}
+
+	/*
+	 * Be sure whatever we did was seen by the chip and acted upon,
+	 * before we return.  Mostly important for which >= 2.
+	 */
+	qib_read_kreg32(dd, kr_scratch);
+}
+
+
+/* useful for trigger analyzers, etc. */
+static void writescratch(struct qib_devdata *dd, u32 val)
+{
+	qib_write_kreg(dd, kr_scratch, val);
+}
+
+/* Dummy for now, use chip regs soon */
+static int qib_7322_tempsense_rd(struct qib_devdata *dd, int regnum)
+{
+	return -ENXIO;
+}
+
+/**
+ * qib_init_iba7322_funcs - set up the chip-specific function pointers
+ * @dev: the pci_dev for qlogic_ib device
+ * @ent: pci_device_id struct for this dev
+ *
+ * Also allocates, inits, and returns the devdata struct for this
+ * device instance
+ *
+ * This is global, and is called directly at init to set up the
+ * chip-specific function pointers for later use.
+ */
+struct qib_devdata *qib_init_iba7322_funcs(struct pci_dev *pdev,
+					   const struct pci_device_id *ent)
+{
+	struct qib_devdata *dd;
+	int ret, i;
+	u32 tabsize, actual_cnt = 0;
+
+	dd = qib_alloc_devdata(pdev,
+		NUM_IB_PORTS * sizeof(struct qib_pportdata) +
+		sizeof(struct qib_chip_specific) +
+		NUM_IB_PORTS * sizeof(struct qib_chippport_specific));
+	if (IS_ERR(dd))
+		goto bail;
+
+	dd->f_bringup_serdes    = qib_7322_bringup_serdes;
+	dd->f_cleanup           = qib_setup_7322_cleanup;
+	dd->f_clear_tids        = qib_7322_clear_tids;
+	dd->f_free_irq          = qib_7322_free_irq;
+	dd->f_get_base_info     = qib_7322_get_base_info;
+	dd->f_get_msgheader     = qib_7322_get_msgheader;
+	dd->f_getsendbuf        = qib_7322_getsendbuf;
+	dd->f_gpio_mod          = gpio_7322_mod;
+	dd->f_eeprom_wen        = qib_7322_eeprom_wen;
+	dd->f_hdrqempty         = qib_7322_hdrqempty;
+	dd->f_ib_updown         = qib_7322_ib_updown;
+	dd->f_init_ctxt         = qib_7322_init_ctxt;
+	dd->f_initvl15_bufs     = qib_7322_initvl15_bufs;
+	dd->f_intr_fallback     = qib_7322_intr_fallback;
+	dd->f_late_initreg      = qib_late_7322_initreg;
+	dd->f_setpbc_control    = qib_7322_setpbc_control;
+	dd->f_portcntr          = qib_portcntr_7322;
+	dd->f_put_tid           = qib_7322_put_tid;
+	dd->f_quiet_serdes      = qib_7322_mini_quiet_serdes;
+	dd->f_rcvctrl           = rcvctrl_7322_mod;
+	dd->f_read_cntrs        = qib_read_7322cntrs;
+	dd->f_read_portcntrs    = qib_read_7322portcntrs;
+	dd->f_reset             = qib_do_7322_reset;
+	dd->f_init_sdma_regs    = init_sdma_7322_regs;
+	dd->f_sdma_busy         = qib_sdma_7322_busy;
+	dd->f_sdma_gethead      = qib_sdma_7322_gethead;
+	dd->f_sdma_sendctrl     = qib_7322_sdma_sendctrl;
+	dd->f_sdma_set_desc_cnt = qib_sdma_set_7322_desc_cnt;
+	dd->f_sdma_update_tail  = qib_sdma_update_7322_tail;
+	dd->f_sendctrl          = sendctrl_7322_mod;
+	dd->f_set_armlaunch     = qib_set_7322_armlaunch;
+	dd->f_set_cntr_sample   = qib_set_cntr_7322_sample;
+	dd->f_iblink_state      = qib_7322_iblink_state;
+	dd->f_ibphys_portstate  = qib_7322_phys_portstate;
+	dd->f_get_ib_cfg        = qib_7322_get_ib_cfg;
+	dd->f_set_ib_cfg        = qib_7322_set_ib_cfg;
+	dd->f_set_ib_loopback   = qib_7322_set_loopback;
+	dd->f_get_ib_table      = qib_7322_get_ib_table;
+	dd->f_set_ib_table      = qib_7322_set_ib_table;
+	dd->f_set_intr_state    = qib_7322_set_intr_state;
+	dd->f_setextled         = qib_setup_7322_setextled;
+	dd->f_txchk_change      = qib_7322_txchk_change;
+	dd->f_update_usrhead    = qib_update_7322_usrhead;
+	dd->f_wantpiobuf_intr   = qib_wantpiobuf_7322_intr;
+	dd->f_xgxs_reset        = qib_7322_mini_pcs_reset;
+	dd->f_sdma_hw_clean_up  = qib_7322_sdma_hw_clean_up;
+	dd->f_sdma_hw_start_up  = qib_7322_sdma_hw_start_up;
+	dd->f_sdma_init_early   = qib_7322_sdma_init_early;
+	dd->f_writescratch      = writescratch;
+	dd->f_tempsense_rd	= qib_7322_tempsense_rd;
+#ifdef CONFIG_INFINIBAND_QIB_DCA
+	dd->f_notify_dca	= qib_7322_notify_dca;
+#endif
+	/*
+	 * Do remaining PCIe setup and save PCIe values in dd.
+	 * Any error printing is already done by the init code.
+	 * On return, we have the chip mapped, but chip registers
+	 * are not set up until start of qib_init_7322_variables.
+	 */
+	ret = qib_pcie_ddinit(dd, pdev, ent);
+	if (ret < 0)
+		goto bail_free;
+
+	/* initialize chip-specific variables */
+	ret = qib_init_7322_variables(dd);
+	if (ret)
+		goto bail_cleanup;
+
+	if (qib_mini_init || !dd->num_pports)
+		goto bail;
+
+	/*
+	 * Determine number of vectors we want; depends on port count
+	 * and number of configured kernel receive queues actually used.
+	 * Should also depend on whether sdma is enabled or not, but
+	 * that's such a rare testing case it's not worth worrying about.
+	 */
+	tabsize = dd->first_user_ctxt + ARRAY_SIZE(irq_table);
+	for (i = 0; i < tabsize; i++)
+		if ((i < ARRAY_SIZE(irq_table) &&
+		     irq_table[i].port <= dd->num_pports) ||
+		    (i >= ARRAY_SIZE(irq_table) &&
+		     dd->rcd[i - ARRAY_SIZE(irq_table)]))
+			actual_cnt++;
+	/* reduce by ctxt's < 2 */
+	if (qib_krcvq01_no_msi)
+		actual_cnt -= dd->num_pports;
+
+	tabsize = actual_cnt;
+	dd->cspec->msix_entries = kzalloc(tabsize *
+			sizeof(struct qib_msix_entry), GFP_KERNEL);
+	if (!dd->cspec->msix_entries) {
+		qib_dev_err(dd, "No memory for MSIx table\n");
+		tabsize = 0;
+	}
+	for (i = 0; i < tabsize; i++)
+		dd->cspec->msix_entries[i].msix.entry = i;
+
+	if (qib_pcie_params(dd, 8, &tabsize, dd->cspec->msix_entries))
+		qib_dev_err(dd,
+			"Failed to setup PCIe or interrupts; continuing anyway\n");
+	/* may be less than we wanted, if not enough available */
+	dd->cspec->num_msix_entries = tabsize;
+
+	/* setup interrupt handler */
+	qib_setup_7322_interrupt(dd, 1);
+
+	/* clear diagctrl register, in case diags were running and crashed */
+	qib_write_kreg(dd, kr_hwdiagctrl, 0);
+#ifdef CONFIG_INFINIBAND_QIB_DCA
+	if (!dca_add_requester(&pdev->dev)) {
+		qib_devinfo(dd->pcidev, "DCA enabled\n");
+		dd->flags |= QIB_DCA_ENABLED;
+		qib_setup_dca(dd);
+	}
+#endif
+	goto bail;
+
+bail_cleanup:
+	qib_pcie_ddcleanup(dd);
+bail_free:
+	qib_free_devdata(dd);
+	dd = ERR_PTR(ret);
+bail:
+	return dd;
+}
+
+/*
+ * Set the table entry at the specified index from the table specifed.
+ * There are 3 * TXDDS_TABLE_SZ entries in all per port, with the first
+ * TXDDS_TABLE_SZ for SDR, the next for DDR, and the last for QDR.
+ * 'idx' below addresses the correct entry, while its 4 LSBs select the
+ * corresponding entry (one of TXDDS_TABLE_SZ) from the selected table.
+ */
+#define DDS_ENT_AMP_LSB 14
+#define DDS_ENT_MAIN_LSB 9
+#define DDS_ENT_POST_LSB 5
+#define DDS_ENT_PRE_XTRA_LSB 3
+#define DDS_ENT_PRE_LSB 0
+
+/*
+ * Set one entry in the TxDDS table for spec'd port
+ * ridx picks one of the entries, while tp points
+ * to the appropriate table entry.
+ */
+static void set_txdds(struct qib_pportdata *ppd, int ridx,
+		      const struct txdds_ent *tp)
+{
+	struct qib_devdata *dd = ppd->dd;
+	u32 pack_ent;
+	int regidx;
+
+	/* Get correct offset in chip-space, and in source table */
+	regidx = KREG_IBPORT_IDX(IBSD_DDS_MAP_TABLE) + ridx;
+	/*
+	 * We do not use qib_write_kreg_port() because it was intended
+	 * only for registers in the lower "port specific" pages.
+	 * So do index calculation  by hand.
+	 */
+	if (ppd->hw_pidx)
+		regidx += (dd->palign / sizeof(u64));
+
+	pack_ent = tp->amp << DDS_ENT_AMP_LSB;
+	pack_ent |= tp->main << DDS_ENT_MAIN_LSB;
+	pack_ent |= tp->pre << DDS_ENT_PRE_LSB;
+	pack_ent |= tp->post << DDS_ENT_POST_LSB;
+	qib_write_kreg(dd, regidx, pack_ent);
+	/* Prevent back-to-back writes by hitting scratch */
+	qib_write_kreg(ppd->dd, kr_scratch, 0);
+}
+
+static const struct vendor_txdds_ent vendor_txdds[] = {
+	{ /* Amphenol 1m 30awg NoEq */
+		{ 0x41, 0x50, 0x48 }, "584470002       ",
+		{ 10,  0,  0,  5 }, { 10,  0,  0,  9 }, {  7,  1,  0, 13 },
+	},
+	{ /* Amphenol 3m 28awg NoEq */
+		{ 0x41, 0x50, 0x48 }, "584470004       ",
+		{  0,  0,  0,  8 }, {  0,  0,  0, 11 }, {  0,  1,  7, 15 },
+	},
+	{ /* Finisar 3m OM2 Optical */
+		{ 0x00, 0x90, 0x65 }, "FCBG410QB1C03-QL",
+		{  0,  0,  0,  3 }, {  0,  0,  0,  4 }, {  0,  0,  0, 13 },
+	},
+	{ /* Finisar 30m OM2 Optical */
+		{ 0x00, 0x90, 0x65 }, "FCBG410QB1C30-QL",
+		{  0,  0,  0,  1 }, {  0,  0,  0,  5 }, {  0,  0,  0, 11 },
+	},
+	{ /* Finisar Default OM2 Optical */
+		{ 0x00, 0x90, 0x65 }, NULL,
+		{  0,  0,  0,  2 }, {  0,  0,  0,  5 }, {  0,  0,  0, 12 },
+	},
+	{ /* Gore 1m 30awg NoEq */
+		{ 0x00, 0x21, 0x77 }, "QSN3300-1       ",
+		{  0,  0,  0,  6 }, {  0,  0,  0,  9 }, {  0,  1,  0, 15 },
+	},
+	{ /* Gore 2m 30awg NoEq */
+		{ 0x00, 0x21, 0x77 }, "QSN3300-2       ",
+		{  0,  0,  0,  8 }, {  0,  0,  0, 10 }, {  0,  1,  7, 15 },
+	},
+	{ /* Gore 1m 28awg NoEq */
+		{ 0x00, 0x21, 0x77 }, "QSN3800-1       ",
+		{  0,  0,  0,  6 }, {  0,  0,  0,  8 }, {  0,  1,  0, 15 },
+	},
+	{ /* Gore 3m 28awg NoEq */
+		{ 0x00, 0x21, 0x77 }, "QSN3800-3       ",
+		{  0,  0,  0,  9 }, {  0,  0,  0, 13 }, {  0,  1,  7, 15 },
+	},
+	{ /* Gore 5m 24awg Eq */
+		{ 0x00, 0x21, 0x77 }, "QSN7000-5       ",
+		{  0,  0,  0,  7 }, {  0,  0,  0,  9 }, {  0,  1,  3, 15 },
+	},
+	{ /* Gore 7m 24awg Eq */
+		{ 0x00, 0x21, 0x77 }, "QSN7000-7       ",
+		{  0,  0,  0,  9 }, {  0,  0,  0, 11 }, {  0,  2,  6, 15 },
+	},
+	{ /* Gore 5m 26awg Eq */
+		{ 0x00, 0x21, 0x77 }, "QSN7600-5       ",
+		{  0,  0,  0,  8 }, {  0,  0,  0, 11 }, {  0,  1,  9, 13 },
+	},
+	{ /* Gore 7m 26awg Eq */
+		{ 0x00, 0x21, 0x77 }, "QSN7600-7       ",
+		{  0,  0,  0,  8 }, {  0,  0,  0, 11 }, {  10,  1,  8, 15 },
+	},
+	{ /* Intersil 12m 24awg Active */
+		{ 0x00, 0x30, 0xB4 }, "QLX4000CQSFP1224",
+		{  0,  0,  0,  2 }, {  0,  0,  0,  5 }, {  0,  3,  0,  9 },
+	},
+	{ /* Intersil 10m 28awg Active */
+		{ 0x00, 0x30, 0xB4 }, "QLX4000CQSFP1028",
+		{  0,  0,  0,  6 }, {  0,  0,  0,  4 }, {  0,  2,  0,  2 },
+	},
+	{ /* Intersil 7m 30awg Active */
+		{ 0x00, 0x30, 0xB4 }, "QLX4000CQSFP0730",
+		{  0,  0,  0,  6 }, {  0,  0,  0,  4 }, {  0,  1,  0,  3 },
+	},
+	{ /* Intersil 5m 32awg Active */
+		{ 0x00, 0x30, 0xB4 }, "QLX4000CQSFP0532",
+		{  0,  0,  0,  6 }, {  0,  0,  0,  6 }, {  0,  2,  0,  8 },
+	},
+	{ /* Intersil Default Active */
+		{ 0x00, 0x30, 0xB4 }, NULL,
+		{  0,  0,  0,  6 }, {  0,  0,  0,  5 }, {  0,  2,  0,  5 },
+	},
+	{ /* Luxtera 20m Active Optical */
+		{ 0x00, 0x25, 0x63 }, NULL,
+		{  0,  0,  0,  5 }, {  0,  0,  0,  8 }, {  0,  2,  0,  12 },
+	},
+	{ /* Molex 1M Cu loopback */
+		{ 0x00, 0x09, 0x3A }, "74763-0025      ",
+		{  2,  2,  6, 15 }, {  2,  2,  6, 15 }, {  2,  2,  6, 15 },
+	},
+	{ /* Molex 2m 28awg NoEq */
+		{ 0x00, 0x09, 0x3A }, "74757-2201      ",
+		{  0,  0,  0,  6 }, {  0,  0,  0,  9 }, {  0,  1,  1, 15 },
+	},
+};
+
+static const struct txdds_ent txdds_sdr[TXDDS_TABLE_SZ] = {
+	/* amp, pre, main, post */
+	{  2, 2, 15,  6 },	/* Loopback */
+	{  0, 0,  0,  1 },	/*  2 dB */
+	{  0, 0,  0,  2 },	/*  3 dB */
+	{  0, 0,  0,  3 },	/*  4 dB */
+	{  0, 0,  0,  4 },	/*  5 dB */
+	{  0, 0,  0,  5 },	/*  6 dB */
+	{  0, 0,  0,  6 },	/*  7 dB */
+	{  0, 0,  0,  7 },	/*  8 dB */
+	{  0, 0,  0,  8 },	/*  9 dB */
+	{  0, 0,  0,  9 },	/* 10 dB */
+	{  0, 0,  0, 10 },	/* 11 dB */
+	{  0, 0,  0, 11 },	/* 12 dB */
+	{  0, 0,  0, 12 },	/* 13 dB */
+	{  0, 0,  0, 13 },	/* 14 dB */
+	{  0, 0,  0, 14 },	/* 15 dB */
+	{  0, 0,  0, 15 },	/* 16 dB */
+};
+
+static const struct txdds_ent txdds_ddr[TXDDS_TABLE_SZ] = {
+	/* amp, pre, main, post */
+	{  2, 2, 15,  6 },	/* Loopback */
+	{  0, 0,  0,  8 },	/*  2 dB */
+	{  0, 0,  0,  8 },	/*  3 dB */
+	{  0, 0,  0,  9 },	/*  4 dB */
+	{  0, 0,  0,  9 },	/*  5 dB */
+	{  0, 0,  0, 10 },	/*  6 dB */
+	{  0, 0,  0, 10 },	/*  7 dB */
+	{  0, 0,  0, 11 },	/*  8 dB */
+	{  0, 0,  0, 11 },	/*  9 dB */
+	{  0, 0,  0, 12 },	/* 10 dB */
+	{  0, 0,  0, 12 },	/* 11 dB */
+	{  0, 0,  0, 13 },	/* 12 dB */
+	{  0, 0,  0, 13 },	/* 13 dB */
+	{  0, 0,  0, 14 },	/* 14 dB */
+	{  0, 0,  0, 14 },	/* 15 dB */
+	{  0, 0,  0, 15 },	/* 16 dB */
+};
+
+static const struct txdds_ent txdds_qdr[TXDDS_TABLE_SZ] = {
+	/* amp, pre, main, post */
+	{  2, 2, 15,  6 },	/* Loopback */
+	{  0, 1,  0,  7 },	/*  2 dB (also QMH7342) */
+	{  0, 1,  0,  9 },	/*  3 dB (also QMH7342) */
+	{  0, 1,  0, 11 },	/*  4 dB */
+	{  0, 1,  0, 13 },	/*  5 dB */
+	{  0, 1,  0, 15 },	/*  6 dB */
+	{  0, 1,  3, 15 },	/*  7 dB */
+	{  0, 1,  7, 15 },	/*  8 dB */
+	{  0, 1,  7, 15 },	/*  9 dB */
+	{  0, 1,  8, 15 },	/* 10 dB */
+	{  0, 1,  9, 15 },	/* 11 dB */
+	{  0, 1, 10, 15 },	/* 12 dB */
+	{  0, 2,  6, 15 },	/* 13 dB */
+	{  0, 2,  7, 15 },	/* 14 dB */
+	{  0, 2,  8, 15 },	/* 15 dB */
+	{  0, 2,  9, 15 },	/* 16 dB */
+};
+
+/*
+ * extra entries for use with txselect, for indices >= TXDDS_TABLE_SZ.
+ * These are mostly used for mez cards going through connectors
+ * and backplane traces, but can be used to add other "unusual"
+ * table values as well.
+ */
+static const struct txdds_ent txdds_extra_sdr[TXDDS_EXTRA_SZ] = {
+	/* amp, pre, main, post */
+	{  0, 0, 0,  1 },	/* QMH7342 backplane settings */
+	{  0, 0, 0,  1 },	/* QMH7342 backplane settings */
+	{  0, 0, 0,  2 },	/* QMH7342 backplane settings */
+	{  0, 0, 0,  2 },	/* QMH7342 backplane settings */
+	{  0, 0, 0,  3 },	/* QMH7342 backplane settings */
+	{  0, 0, 0,  4 },	/* QMH7342 backplane settings */
+	{  0, 1, 4, 15 },	/* QME7342 backplane settings 1.0 */
+	{  0, 1, 3, 15 },	/* QME7342 backplane settings 1.0 */
+	{  0, 1, 0, 12 },	/* QME7342 backplane settings 1.0 */
+	{  0, 1, 0, 11 },	/* QME7342 backplane settings 1.0 */
+	{  0, 1, 0,  9 },	/* QME7342 backplane settings 1.0 */
+	{  0, 1, 0, 14 },	/* QME7342 backplane settings 1.0 */
+	{  0, 1, 2, 15 },	/* QME7342 backplane settings 1.0 */
+	{  0, 1, 0, 11 },       /* QME7342 backplane settings 1.1 */
+	{  0, 1, 0,  7 },       /* QME7342 backplane settings 1.1 */
+	{  0, 1, 0,  9 },       /* QME7342 backplane settings 1.1 */
+	{  0, 1, 0,  6 },       /* QME7342 backplane settings 1.1 */
+	{  0, 1, 0,  8 },       /* QME7342 backplane settings 1.1 */
+};
+
+static const struct txdds_ent txdds_extra_ddr[TXDDS_EXTRA_SZ] = {
+	/* amp, pre, main, post */
+	{  0, 0, 0,  7 },	/* QMH7342 backplane settings */
+	{  0, 0, 0,  7 },	/* QMH7342 backplane settings */
+	{  0, 0, 0,  8 },	/* QMH7342 backplane settings */
+	{  0, 0, 0,  8 },	/* QMH7342 backplane settings */
+	{  0, 0, 0,  9 },	/* QMH7342 backplane settings */
+	{  0, 0, 0, 10 },	/* QMH7342 backplane settings */
+	{  0, 1, 4, 15 },	/* QME7342 backplane settings 1.0 */
+	{  0, 1, 3, 15 },	/* QME7342 backplane settings 1.0 */
+	{  0, 1, 0, 12 },	/* QME7342 backplane settings 1.0 */
+	{  0, 1, 0, 11 },	/* QME7342 backplane settings 1.0 */
+	{  0, 1, 0,  9 },	/* QME7342 backplane settings 1.0 */
+	{  0, 1, 0, 14 },	/* QME7342 backplane settings 1.0 */
+	{  0, 1, 2, 15 },	/* QME7342 backplane settings 1.0 */
+	{  0, 1, 0, 11 },       /* QME7342 backplane settings 1.1 */
+	{  0, 1, 0,  7 },       /* QME7342 backplane settings 1.1 */
+	{  0, 1, 0,  9 },       /* QME7342 backplane settings 1.1 */
+	{  0, 1, 0,  6 },       /* QME7342 backplane settings 1.1 */
+	{  0, 1, 0,  8 },       /* QME7342 backplane settings 1.1 */
+};
+
+static const struct txdds_ent txdds_extra_qdr[TXDDS_EXTRA_SZ] = {
+	/* amp, pre, main, post */
+	{  0, 1,  0,  4 },	/* QMH7342 backplane settings */
+	{  0, 1,  0,  5 },	/* QMH7342 backplane settings */
+	{  0, 1,  0,  6 },	/* QMH7342 backplane settings */
+	{  0, 1,  0,  8 },	/* QMH7342 backplane settings */
+	{  0, 1,  0, 10 },	/* QMH7342 backplane settings */
+	{  0, 1,  0, 12 },	/* QMH7342 backplane settings */
+	{  0, 1,  4, 15 },	/* QME7342 backplane settings 1.0 */
+	{  0, 1,  3, 15 },	/* QME7342 backplane settings 1.0 */
+	{  0, 1,  0, 12 },	/* QME7342 backplane settings 1.0 */
+	{  0, 1,  0, 11 },	/* QME7342 backplane settings 1.0 */
+	{  0, 1,  0,  9 },	/* QME7342 backplane settings 1.0 */
+	{  0, 1,  0, 14 },	/* QME7342 backplane settings 1.0 */
+	{  0, 1,  2, 15 },	/* QME7342 backplane settings 1.0 */
+	{  0, 1,  0, 11 },      /* QME7342 backplane settings 1.1 */
+	{  0, 1,  0,  7 },      /* QME7342 backplane settings 1.1 */
+	{  0, 1,  0,  9 },      /* QME7342 backplane settings 1.1 */
+	{  0, 1,  0,  6 },      /* QME7342 backplane settings 1.1 */
+	{  0, 1,  0,  8 },      /* QME7342 backplane settings 1.1 */
+};
+
+static const struct txdds_ent txdds_extra_mfg[TXDDS_MFG_SZ] = {
+	/* amp, pre, main, post */
+	{ 0, 0, 0, 0 },         /* QME7342 mfg settings */
+	{ 0, 0, 0, 6 },         /* QME7342 P2 mfg settings */
+};
+
+static const struct txdds_ent *get_atten_table(const struct txdds_ent *txdds,
+					       unsigned atten)
+{
+	/*
+	 * The attenuation table starts at 2dB for entry 1,
+	 * with entry 0 being the loopback entry.
+	 */
+	if (atten <= 2)
+		atten = 1;
+	else if (atten > TXDDS_TABLE_SZ)
+		atten = TXDDS_TABLE_SZ - 1;
+	else
+		atten--;
+	return txdds + atten;
+}
+
+/*
+ * if override is set, the module parameter txselect has a value
+ * for this specific port, so use it, rather than our normal mechanism.
+ */
+static void find_best_ent(struct qib_pportdata *ppd,
+			  const struct txdds_ent **sdr_dds,
+			  const struct txdds_ent **ddr_dds,
+			  const struct txdds_ent **qdr_dds, int override)
+{
+	struct qib_qsfp_cache *qd = &ppd->cpspec->qsfp_data.cache;
+	int idx;
+
+	/* Search table of known cables */
+	for (idx = 0; !override && idx < ARRAY_SIZE(vendor_txdds); ++idx) {
+		const struct vendor_txdds_ent *v = vendor_txdds + idx;
+
+		if (!memcmp(v->oui, qd->oui, QSFP_VOUI_LEN) &&
+		    (!v->partnum ||
+		     !memcmp(v->partnum, qd->partnum, QSFP_PN_LEN))) {
+			*sdr_dds = &v->sdr;
+			*ddr_dds = &v->ddr;
+			*qdr_dds = &v->qdr;
+			return;
+		}
+	}
+
+	/* Active cables don't have attenuation so we only set SERDES
+	 * settings to account for the attenuation of the board traces. */
+	if (!override && QSFP_IS_ACTIVE(qd->tech)) {
+		*sdr_dds = txdds_sdr + ppd->dd->board_atten;
+		*ddr_dds = txdds_ddr + ppd->dd->board_atten;
+		*qdr_dds = txdds_qdr + ppd->dd->board_atten;
+		return;
+	}
+
+	if (!override && QSFP_HAS_ATTEN(qd->tech) && (qd->atten[0] ||
+						      qd->atten[1])) {
+		*sdr_dds = get_atten_table(txdds_sdr, qd->atten[0]);
+		*ddr_dds = get_atten_table(txdds_ddr, qd->atten[0]);
+		*qdr_dds = get_atten_table(txdds_qdr, qd->atten[1]);
+		return;
+	} else if (ppd->cpspec->no_eep < TXDDS_TABLE_SZ) {
+		/*
+		 * If we have no (or incomplete) data from the cable
+		 * EEPROM, or no QSFP, or override is set, use the
+		 * module parameter value to index into the attentuation
+		 * table.
+		 */
+		idx = ppd->cpspec->no_eep;
+		*sdr_dds = &txdds_sdr[idx];
+		*ddr_dds = &txdds_ddr[idx];
+		*qdr_dds = &txdds_qdr[idx];
+	} else if (ppd->cpspec->no_eep < (TXDDS_TABLE_SZ + TXDDS_EXTRA_SZ)) {
+		/* similar to above, but index into the "extra" table. */
+		idx = ppd->cpspec->no_eep - TXDDS_TABLE_SZ;
+		*sdr_dds = &txdds_extra_sdr[idx];
+		*ddr_dds = &txdds_extra_ddr[idx];
+		*qdr_dds = &txdds_extra_qdr[idx];
+	} else if ((IS_QME(ppd->dd) || IS_QMH(ppd->dd)) &&
+		   ppd->cpspec->no_eep < (TXDDS_TABLE_SZ + TXDDS_EXTRA_SZ +
+					  TXDDS_MFG_SZ)) {
+		idx = ppd->cpspec->no_eep - (TXDDS_TABLE_SZ + TXDDS_EXTRA_SZ);
+		pr_info("IB%u:%u use idx %u into txdds_mfg\n",
+			ppd->dd->unit, ppd->port, idx);
+		*sdr_dds = &txdds_extra_mfg[idx];
+		*ddr_dds = &txdds_extra_mfg[idx];
+		*qdr_dds = &txdds_extra_mfg[idx];
+	} else {
+		/* this shouldn't happen, it's range checked */
+		*sdr_dds = txdds_sdr + qib_long_atten;
+		*ddr_dds = txdds_ddr + qib_long_atten;
+		*qdr_dds = txdds_qdr + qib_long_atten;
+	}
+}
+
+static void init_txdds_table(struct qib_pportdata *ppd, int override)
+{
+	const struct txdds_ent *sdr_dds, *ddr_dds, *qdr_dds;
+	struct txdds_ent *dds;
+	int idx;
+	int single_ent = 0;
+
+	find_best_ent(ppd, &sdr_dds, &ddr_dds, &qdr_dds, override);
+
+	/* for mez cards or override, use the selected value for all entries */
+	if (!(ppd->dd->flags & QIB_HAS_QSFP) || override)
+		single_ent = 1;
+
+	/* Fill in the first entry with the best entry found. */
+	set_txdds(ppd, 0, sdr_dds);
+	set_txdds(ppd, TXDDS_TABLE_SZ, ddr_dds);
+	set_txdds(ppd, 2 * TXDDS_TABLE_SZ, qdr_dds);
+	if (ppd->lflags & (QIBL_LINKINIT | QIBL_LINKARMED |
+		QIBL_LINKACTIVE)) {
+		dds = (struct txdds_ent *)(ppd->link_speed_active ==
+					   QIB_IB_QDR ?  qdr_dds :
+					   (ppd->link_speed_active ==
+					    QIB_IB_DDR ? ddr_dds : sdr_dds));
+		write_tx_serdes_param(ppd, dds);
+	}
+
+	/* Fill in the remaining entries with the default table values. */
+	for (idx = 1; idx < ARRAY_SIZE(txdds_sdr); ++idx) {
+		set_txdds(ppd, idx, single_ent ? sdr_dds : txdds_sdr + idx);
+		set_txdds(ppd, idx + TXDDS_TABLE_SZ,
+			  single_ent ? ddr_dds : txdds_ddr + idx);
+		set_txdds(ppd, idx + 2 * TXDDS_TABLE_SZ,
+			  single_ent ? qdr_dds : txdds_qdr + idx);
+	}
+}
+
+#define KR_AHB_ACC KREG_IDX(ahb_access_ctrl)
+#define KR_AHB_TRANS KREG_IDX(ahb_transaction_reg)
+#define AHB_TRANS_RDY SYM_MASK(ahb_transaction_reg, ahb_rdy)
+#define AHB_ADDR_LSB SYM_LSB(ahb_transaction_reg, ahb_address)
+#define AHB_DATA_LSB SYM_LSB(ahb_transaction_reg, ahb_data)
+#define AHB_WR SYM_MASK(ahb_transaction_reg, write_not_read)
+#define AHB_TRANS_TRIES 10
+
+/*
+ * The chan argument is 0=chan0, 1=chan1, 2=pll, 3=chan2, 4=chan4,
+ * 5=subsystem which is why most calls have "chan + chan >> 1"
+ * for the channel argument.
+ */
+static u32 ahb_mod(struct qib_devdata *dd, int quad, int chan, int addr,
+		    u32 data, u32 mask)
+{
+	u32 rd_data, wr_data, sz_mask;
+	u64 trans, acc, prev_acc;
+	u32 ret = 0xBAD0BAD;
+	int tries;
+
+	prev_acc = qib_read_kreg64(dd, KR_AHB_ACC);
+	/* From this point on, make sure we return access */
+	acc = (quad << 1) | 1;
+	qib_write_kreg(dd, KR_AHB_ACC, acc);
+
+	for (tries = 1; tries < AHB_TRANS_TRIES; ++tries) {
+		trans = qib_read_kreg64(dd, KR_AHB_TRANS);
+		if (trans & AHB_TRANS_RDY)
+			break;
+	}
+	if (tries >= AHB_TRANS_TRIES) {
+		qib_dev_err(dd, "No ahb_rdy in %d tries\n", AHB_TRANS_TRIES);
+		goto bail;
+	}
+
+	/* If mask is not all 1s, we need to read, but different SerDes
+	 * entities have different sizes
+	 */
+	sz_mask = (1UL << ((quad == 1) ? 32 : 16)) - 1;
+	wr_data = data & mask & sz_mask;
+	if ((~mask & sz_mask) != 0) {
+		trans = ((chan << 6) | addr) << (AHB_ADDR_LSB + 1);
+		qib_write_kreg(dd, KR_AHB_TRANS, trans);
+
+		for (tries = 1; tries < AHB_TRANS_TRIES; ++tries) {
+			trans = qib_read_kreg64(dd, KR_AHB_TRANS);
+			if (trans & AHB_TRANS_RDY)
+				break;
+		}
+		if (tries >= AHB_TRANS_TRIES) {
+			qib_dev_err(dd, "No Rd ahb_rdy in %d tries\n",
+				    AHB_TRANS_TRIES);
+			goto bail;
+		}
+		/* Re-read in case host split reads and read data first */
+		trans = qib_read_kreg64(dd, KR_AHB_TRANS);
+		rd_data = (uint32_t)(trans >> AHB_DATA_LSB);
+		wr_data |= (rd_data & ~mask & sz_mask);
+	}
+
+	/* If mask is not zero, we need to write. */
+	if (mask & sz_mask) {
+		trans = ((chan << 6) | addr) << (AHB_ADDR_LSB + 1);
+		trans |= ((uint64_t)wr_data << AHB_DATA_LSB);
+		trans |= AHB_WR;
+		qib_write_kreg(dd, KR_AHB_TRANS, trans);
+
+		for (tries = 1; tries < AHB_TRANS_TRIES; ++tries) {
+			trans = qib_read_kreg64(dd, KR_AHB_TRANS);
+			if (trans & AHB_TRANS_RDY)
+				break;
+		}
+		if (tries >= AHB_TRANS_TRIES) {
+			qib_dev_err(dd, "No Wr ahb_rdy in %d tries\n",
+				    AHB_TRANS_TRIES);
+			goto bail;
+		}
+	}
+	ret = wr_data;
+bail:
+	qib_write_kreg(dd, KR_AHB_ACC, prev_acc);
+	return ret;
+}
+
+static void ibsd_wr_allchans(struct qib_pportdata *ppd, int addr, unsigned data,
+			     unsigned mask)
+{
+	struct qib_devdata *dd = ppd->dd;
+	int chan;
+	u32 rbc;
+
+	for (chan = 0; chan < SERDES_CHANS; ++chan) {
+		ahb_mod(dd, IBSD(ppd->hw_pidx), (chan + (chan >> 1)), addr,
+			data, mask);
+		rbc = ahb_mod(dd, IBSD(ppd->hw_pidx), (chan + (chan >> 1)),
+			      addr, 0, 0);
+	}
+}
+
+static void serdes_7322_los_enable(struct qib_pportdata *ppd, int enable)
+{
+	u64 data = qib_read_kreg_port(ppd, krp_serdesctrl);
+	u8 state = SYM_FIELD(data, IBSerdesCtrl_0, RXLOSEN);
+
+	if (enable && !state) {
+		pr_info("IB%u:%u Turning LOS on\n",
+			ppd->dd->unit, ppd->port);
+		data |= SYM_MASK(IBSerdesCtrl_0, RXLOSEN);
+	} else if (!enable && state) {
+		pr_info("IB%u:%u Turning LOS off\n",
+			ppd->dd->unit, ppd->port);
+		data &= ~SYM_MASK(IBSerdesCtrl_0, RXLOSEN);
+	}
+	qib_write_kreg_port(ppd, krp_serdesctrl, data);
+}
+
+static int serdes_7322_init(struct qib_pportdata *ppd)
+{
+	int ret = 0;
+	if (ppd->dd->cspec->r1)
+		ret = serdes_7322_init_old(ppd);
+	else
+		ret = serdes_7322_init_new(ppd);
+	return ret;
+}
+
+static int serdes_7322_init_old(struct qib_pportdata *ppd)
+{
+	u32 le_val;
+
+	/*
+	 * Initialize the Tx DDS tables.  Also done every QSFP event,
+	 * for adapters with QSFP
+	 */
+	init_txdds_table(ppd, 0);
+
+	/* ensure no tx overrides from earlier driver loads */
+	qib_write_kreg_port(ppd, krp_tx_deemph_override,
+		SYM_MASK(IBSD_TX_DEEMPHASIS_OVERRIDE_0,
+		reset_tx_deemphasis_override));
+
+	/* Patch some SerDes defaults to "Better for IB" */
+	/* Timing Loop Bandwidth: cdr_timing[11:9] = 0 */
+	ibsd_wr_allchans(ppd, 2, 0, BMASK(11, 9));
+
+	/* Termination: rxtermctrl_r2d addr 11 bits [12:11] = 1 */
+	ibsd_wr_allchans(ppd, 11, (1 << 11), BMASK(12, 11));
+	/* Enable LE2: rxle2en_r2a addr 13 bit [6] = 1 */
+	ibsd_wr_allchans(ppd, 13, (1 << 6), (1 << 6));
+
+	/* May be overridden in qsfp_7322_event */
+	le_val = IS_QME(ppd->dd) ? LE2_QME : LE2_DEFAULT;
+	ibsd_wr_allchans(ppd, 13, (le_val << 7), BMASK(9, 7));
+
+	/* enable LE1 adaptation for all but QME, which is disabled */
+	le_val = IS_QME(ppd->dd) ? 0 : 1;
+	ibsd_wr_allchans(ppd, 13, (le_val << 5), (1 << 5));
+
+	/* Clear cmode-override, may be set from older driver */
+	ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 10, 0 << 14, 1 << 14);
+
+	/* Timing Recovery: rxtapsel addr 5 bits [9:8] = 0 */
+	ibsd_wr_allchans(ppd, 5, (0 << 8), BMASK(9, 8));
+
+	/* setup LoS params; these are subsystem, so chan == 5 */
+	/* LoS filter threshold_count on, ch 0-3, set to 8 */
+	ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 5, 8 << 11, BMASK(14, 11));
+	ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 7, 8 << 4, BMASK(7, 4));
+	ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 8, 8 << 11, BMASK(14, 11));
+	ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 10, 8 << 4, BMASK(7, 4));
+
+	/* LoS filter threshold_count off, ch 0-3, set to 4 */
+	ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 6, 4 << 0, BMASK(3, 0));
+	ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 7, 4 << 8, BMASK(11, 8));
+	ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 9, 4 << 0, BMASK(3, 0));
+	ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 10, 4 << 8, BMASK(11, 8));
+
+	/* LoS filter select enabled */
+	ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 9, 1 << 15, 1 << 15);
+
+	/* LoS target data:  SDR=4, DDR=2, QDR=1 */
+	ibsd_wr_allchans(ppd, 14, (1 << 3), BMASK(5, 3)); /* QDR */
+	ibsd_wr_allchans(ppd, 20, (2 << 10), BMASK(12, 10)); /* DDR */
+	ibsd_wr_allchans(ppd, 20, (4 << 13), BMASK(15, 13)); /* SDR */
+
+	serdes_7322_los_enable(ppd, 1);
+
+	/* rxbistena; set 0 to avoid effects of it switch later */
+	ibsd_wr_allchans(ppd, 9, 0 << 15, 1 << 15);
+
+	/* Configure 4 DFE taps, and only they adapt */
+	ibsd_wr_allchans(ppd, 16, 0 << 0, BMASK(1, 0));
+
+	/* gain hi stop 32 (22) (6:1) lo stop 7 (10:7) target 22 (13) (15:11) */
+	le_val = (ppd->dd->cspec->r1 || IS_QME(ppd->dd)) ? 0xb6c0 : 0x6bac;
+	ibsd_wr_allchans(ppd, 21, le_val, 0xfffe);
+
+	/*
+	 * Set receive adaptation mode.  SDR and DDR adaptation are
+	 * always on, and QDR is initially enabled; later disabled.
+	 */
+	qib_write_kreg_port(ppd, krp_static_adapt_dis(0), 0ULL);
+	qib_write_kreg_port(ppd, krp_static_adapt_dis(1), 0ULL);
+	qib_write_kreg_port(ppd, krp_static_adapt_dis(2),
+			    ppd->dd->cspec->r1 ?
+			    QDR_STATIC_ADAPT_DOWN_R1 : QDR_STATIC_ADAPT_DOWN);
+	ppd->cpspec->qdr_dfe_on = 1;
+
+	/* FLoop LOS gate: PPM filter  enabled */
+	ibsd_wr_allchans(ppd, 38, 0 << 10, 1 << 10);
+
+	/* rx offset center enabled */
+	ibsd_wr_allchans(ppd, 12, 1 << 4, 1 << 4);
+
+	if (!ppd->dd->cspec->r1) {
+		ibsd_wr_allchans(ppd, 12, 1 << 12, 1 << 12);
+		ibsd_wr_allchans(ppd, 12, 2 << 8, 0x0f << 8);
+	}
+
+	/* Set the frequency loop bandwidth to 15 */
+	ibsd_wr_allchans(ppd, 2, 15 << 5, BMASK(8, 5));
+
+	return 0;
+}
+
+static int serdes_7322_init_new(struct qib_pportdata *ppd)
+{
+	unsigned long tend;
+	u32 le_val, rxcaldone;
+	int chan, chan_done = (1 << SERDES_CHANS) - 1;
+
+	/* Clear cmode-override, may be set from older driver */
+	ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 10, 0 << 14, 1 << 14);
+
+	/* ensure no tx overrides from earlier driver loads */
+	qib_write_kreg_port(ppd, krp_tx_deemph_override,
+		SYM_MASK(IBSD_TX_DEEMPHASIS_OVERRIDE_0,
+		reset_tx_deemphasis_override));
+
+	/* START OF LSI SUGGESTED SERDES BRINGUP */
+	/* Reset - Calibration Setup */
+	/*       Stop DFE adaptaion */
+	ibsd_wr_allchans(ppd, 1, 0, BMASK(9, 1));
+	/*       Disable LE1 */
+	ibsd_wr_allchans(ppd, 13, 0, BMASK(5, 5));
+	/*       Disable autoadapt for LE1 */
+	ibsd_wr_allchans(ppd, 1, 0, BMASK(15, 15));
+	/*       Disable LE2 */
+	ibsd_wr_allchans(ppd, 13, 0, BMASK(6, 6));
+	/*       Disable VGA */
+	ibsd_wr_allchans(ppd, 5, 0, BMASK(0, 0));
+	/*       Disable AFE Offset Cancel */
+	ibsd_wr_allchans(ppd, 12, 0, BMASK(12, 12));
+	/*       Disable Timing Loop */
+	ibsd_wr_allchans(ppd, 2, 0, BMASK(3, 3));
+	/*       Disable Frequency Loop */
+	ibsd_wr_allchans(ppd, 2, 0, BMASK(4, 4));
+	/*       Disable Baseline Wander Correction */
+	ibsd_wr_allchans(ppd, 13, 0, BMASK(13, 13));
+	/*       Disable RX Calibration */
+	ibsd_wr_allchans(ppd, 4, 0, BMASK(10, 10));
+	/*       Disable RX Offset Calibration */
+	ibsd_wr_allchans(ppd, 12, 0, BMASK(4, 4));
+	/*       Select BB CDR */
+	ibsd_wr_allchans(ppd, 2, (1 << 15), BMASK(15, 15));
+	/*       CDR Step Size */
+	ibsd_wr_allchans(ppd, 5, 0, BMASK(9, 8));
+	/*       Enable phase Calibration */
+	ibsd_wr_allchans(ppd, 12, (1 << 5), BMASK(5, 5));
+	/*       DFE Bandwidth [2:14-12] */
+	ibsd_wr_allchans(ppd, 2, (4 << 12), BMASK(14, 12));
+	/*       DFE Config (4 taps only) */
+	ibsd_wr_allchans(ppd, 16, 0, BMASK(1, 0));
+	/*       Gain Loop Bandwidth */
+	if (!ppd->dd->cspec->r1) {
+		ibsd_wr_allchans(ppd, 12, 1 << 12, BMASK(12, 12));
+		ibsd_wr_allchans(ppd, 12, 2 << 8, BMASK(11, 8));
+	} else {
+		ibsd_wr_allchans(ppd, 19, (3 << 11), BMASK(13, 11));
+	}
+	/*       Baseline Wander Correction Gain [13:4-0] (leave as default) */
+	/*       Baseline Wander Correction Gain [3:7-5] (leave as default) */
+	/*       Data Rate Select [5:7-6] (leave as default) */
+	/*       RX Parallel Word Width [3:10-8] (leave as default) */
+
+	/* RX REST */
+	/*       Single- or Multi-channel reset */
+	/*       RX Analog reset */
+	/*       RX Digital reset */
+	ibsd_wr_allchans(ppd, 0, 0, BMASK(15, 13));
+	msleep(20);
+	/*       RX Analog reset */
+	ibsd_wr_allchans(ppd, 0, (1 << 14), BMASK(14, 14));
+	msleep(20);
+	/*       RX Digital reset */
+	ibsd_wr_allchans(ppd, 0, (1 << 13), BMASK(13, 13));
+	msleep(20);
+
+	/* setup LoS params; these are subsystem, so chan == 5 */
+	/* LoS filter threshold_count on, ch 0-3, set to 8 */
+	ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 5, 8 << 11, BMASK(14, 11));
+	ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 7, 8 << 4, BMASK(7, 4));
+	ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 8, 8 << 11, BMASK(14, 11));
+	ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 10, 8 << 4, BMASK(7, 4));
+
+	/* LoS filter threshold_count off, ch 0-3, set to 4 */
+	ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 6, 4 << 0, BMASK(3, 0));
+	ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 7, 4 << 8, BMASK(11, 8));
+	ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 9, 4 << 0, BMASK(3, 0));
+	ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 10, 4 << 8, BMASK(11, 8));
+
+	/* LoS filter select enabled */
+	ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 9, 1 << 15, 1 << 15);
+
+	/* LoS target data:  SDR=4, DDR=2, QDR=1 */
+	ibsd_wr_allchans(ppd, 14, (1 << 3), BMASK(5, 3)); /* QDR */
+	ibsd_wr_allchans(ppd, 20, (2 << 10), BMASK(12, 10)); /* DDR */
+	ibsd_wr_allchans(ppd, 20, (4 << 13), BMASK(15, 13)); /* SDR */
+
+	/* Turn on LOS on initial SERDES init */
+	serdes_7322_los_enable(ppd, 1);
+	/* FLoop LOS gate: PPM filter  enabled */
+	ibsd_wr_allchans(ppd, 38, 0 << 10, 1 << 10);
+
+	/* RX LATCH CALIBRATION */
+	/*       Enable Eyefinder Phase Calibration latch */
+	ibsd_wr_allchans(ppd, 15, 1, BMASK(0, 0));
+	/*       Enable RX Offset Calibration latch */
+	ibsd_wr_allchans(ppd, 12, (1 << 4), BMASK(4, 4));
+	msleep(20);
+	/*       Start Calibration */
+	ibsd_wr_allchans(ppd, 4, (1 << 10), BMASK(10, 10));
+	tend = jiffies + msecs_to_jiffies(500);
+	while (chan_done && !time_is_before_jiffies(tend)) {
+		msleep(20);
+		for (chan = 0; chan < SERDES_CHANS; ++chan) {
+			rxcaldone = ahb_mod(ppd->dd, IBSD(ppd->hw_pidx),
+					    (chan + (chan >> 1)),
+					    25, 0, 0);
+			if ((~rxcaldone & (u32)BMASK(9, 9)) == 0 &&
+			    (~chan_done & (1 << chan)) == 0)
+				chan_done &= ~(1 << chan);
+		}
+	}
+	if (chan_done) {
+		pr_info("Serdes %d calibration not done after .5 sec: 0x%x\n",
+			 IBSD(ppd->hw_pidx), chan_done);
+	} else {
+		for (chan = 0; chan < SERDES_CHANS; ++chan) {
+			rxcaldone = ahb_mod(ppd->dd, IBSD(ppd->hw_pidx),
+					    (chan + (chan >> 1)),
+					    25, 0, 0);
+			if ((~rxcaldone & (u32)BMASK(10, 10)) == 0)
+				pr_info("Serdes %d chan %d calibration failed\n",
+					IBSD(ppd->hw_pidx), chan);
+		}
+	}
+
+	/*       Turn off Calibration */
+	ibsd_wr_allchans(ppd, 4, 0, BMASK(10, 10));
+	msleep(20);
+
+	/* BRING RX UP */
+	/*       Set LE2 value (May be overridden in qsfp_7322_event) */
+	le_val = IS_QME(ppd->dd) ? LE2_QME : LE2_DEFAULT;
+	ibsd_wr_allchans(ppd, 13, (le_val << 7), BMASK(9, 7));
+	/*       Set LE2 Loop bandwidth */
+	ibsd_wr_allchans(ppd, 3, (7 << 5), BMASK(7, 5));
+	/*       Enable LE2 */
+	ibsd_wr_allchans(ppd, 13, (1 << 6), BMASK(6, 6));
+	msleep(20);
+	/*       Enable H0 only */
+	ibsd_wr_allchans(ppd, 1, 1, BMASK(9, 1));
+	/* gain hi stop 32 (22) (6:1) lo stop 7 (10:7) target 22 (13) (15:11) */
+	le_val = (ppd->dd->cspec->r1 || IS_QME(ppd->dd)) ? 0xb6c0 : 0x6bac;
+	ibsd_wr_allchans(ppd, 21, le_val, 0xfffe);
+	/*       Enable VGA */
+	ibsd_wr_allchans(ppd, 5, 0, BMASK(0, 0));
+	msleep(20);
+	/*       Set Frequency Loop Bandwidth */
+	ibsd_wr_allchans(ppd, 2, (15 << 5), BMASK(8, 5));
+	/*       Enable Frequency Loop */
+	ibsd_wr_allchans(ppd, 2, (1 << 4), BMASK(4, 4));
+	/*       Set Timing Loop Bandwidth */
+	ibsd_wr_allchans(ppd, 2, 0, BMASK(11, 9));
+	/*       Enable Timing Loop */
+	ibsd_wr_allchans(ppd, 2, (1 << 3), BMASK(3, 3));
+	msleep(50);
+	/*       Enable DFE
+	 *       Set receive adaptation mode.  SDR and DDR adaptation are
+	 *       always on, and QDR is initially enabled; later disabled.
+	 */
+	qib_write_kreg_port(ppd, krp_static_adapt_dis(0), 0ULL);
+	qib_write_kreg_port(ppd, krp_static_adapt_dis(1), 0ULL);
+	qib_write_kreg_port(ppd, krp_static_adapt_dis(2),
+			    ppd->dd->cspec->r1 ?
+			    QDR_STATIC_ADAPT_DOWN_R1 : QDR_STATIC_ADAPT_DOWN);
+	ppd->cpspec->qdr_dfe_on = 1;
+	/*       Disable LE1  */
+	ibsd_wr_allchans(ppd, 13, (0 << 5), (1 << 5));
+	/*       Disable auto adapt for LE1 */
+	ibsd_wr_allchans(ppd, 1, (0 << 15), BMASK(15, 15));
+	msleep(20);
+	/*       Enable AFE Offset Cancel */
+	ibsd_wr_allchans(ppd, 12, (1 << 12), BMASK(12, 12));
+	/*       Enable Baseline Wander Correction */
+	ibsd_wr_allchans(ppd, 12, (1 << 13), BMASK(13, 13));
+	/* Termination: rxtermctrl_r2d addr 11 bits [12:11] = 1 */
+	ibsd_wr_allchans(ppd, 11, (1 << 11), BMASK(12, 11));
+	/* VGA output common mode */
+	ibsd_wr_allchans(ppd, 12, (3 << 2), BMASK(3, 2));
+
+	/*
+	 * Initialize the Tx DDS tables.  Also done every QSFP event,
+	 * for adapters with QSFP
+	 */
+	init_txdds_table(ppd, 0);
+
+	return 0;
+}
+
+/* start adjust QMH serdes parameters */
+
+static void set_man_code(struct qib_pportdata *ppd, int chan, int code)
+{
+	ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), (chan + (chan >> 1)),
+		9, code << 9, 0x3f << 9);
+}
+
+static void set_man_mode_h1(struct qib_pportdata *ppd, int chan,
+	int enable, u32 tapenable)
+{
+	if (enable)
+		ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), (chan + (chan >> 1)),
+			1, 3 << 10, 0x1f << 10);
+	else
+		ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), (chan + (chan >> 1)),
+			1, 0, 0x1f << 10);
+}
+
+/* Set clock to 1, 0, 1, 0 */
+static void clock_man(struct qib_pportdata *ppd, int chan)
+{
+	ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), (chan + (chan >> 1)),
+		4, 0x4000, 0x4000);
+	ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), (chan + (chan >> 1)),
+		4, 0, 0x4000);
+	ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), (chan + (chan >> 1)),
+		4, 0x4000, 0x4000);
+	ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), (chan + (chan >> 1)),
+		4, 0, 0x4000);
+}
+
+/*
+ * write the current Tx serdes pre,post,main,amp settings into the serdes.
+ * The caller must pass the settings appropriate for the current speed,
+ * or not care if they are correct for the current speed.
+ */
+static void write_tx_serdes_param(struct qib_pportdata *ppd,
+				  struct txdds_ent *txdds)
+{
+	u64 deemph;
+
+	deemph = qib_read_kreg_port(ppd, krp_tx_deemph_override);
+	/* field names for amp, main, post, pre, respectively */
+	deemph &= ~(SYM_MASK(IBSD_TX_DEEMPHASIS_OVERRIDE_0, txampcntl_d2a) |
+		    SYM_MASK(IBSD_TX_DEEMPHASIS_OVERRIDE_0, txc0_ena) |
+		    SYM_MASK(IBSD_TX_DEEMPHASIS_OVERRIDE_0, txcp1_ena) |
+		    SYM_MASK(IBSD_TX_DEEMPHASIS_OVERRIDE_0, txcn1_ena));
+
+	deemph |= SYM_MASK(IBSD_TX_DEEMPHASIS_OVERRIDE_0,
+			   tx_override_deemphasis_select);
+	deemph |= (txdds->amp & SYM_RMASK(IBSD_TX_DEEMPHASIS_OVERRIDE_0,
+		    txampcntl_d2a)) << SYM_LSB(IBSD_TX_DEEMPHASIS_OVERRIDE_0,
+				       txampcntl_d2a);
+	deemph |= (txdds->main & SYM_RMASK(IBSD_TX_DEEMPHASIS_OVERRIDE_0,
+		     txc0_ena)) << SYM_LSB(IBSD_TX_DEEMPHASIS_OVERRIDE_0,
+				   txc0_ena);
+	deemph |= (txdds->post & SYM_RMASK(IBSD_TX_DEEMPHASIS_OVERRIDE_0,
+		     txcp1_ena)) << SYM_LSB(IBSD_TX_DEEMPHASIS_OVERRIDE_0,
+				    txcp1_ena);
+	deemph |= (txdds->pre & SYM_RMASK(IBSD_TX_DEEMPHASIS_OVERRIDE_0,
+		     txcn1_ena)) << SYM_LSB(IBSD_TX_DEEMPHASIS_OVERRIDE_0,
+				    txcn1_ena);
+	qib_write_kreg_port(ppd, krp_tx_deemph_override, deemph);
+}
+
+/*
+ * Set the parameters for mez cards on link bounce, so they are
+ * always exactly what was requested.  Similar logic to init_txdds
+ * but does just the serdes.
+ */
+static void adj_tx_serdes(struct qib_pportdata *ppd)
+{
+	const struct txdds_ent *sdr_dds, *ddr_dds, *qdr_dds;
+	struct txdds_ent *dds;
+
+	find_best_ent(ppd, &sdr_dds, &ddr_dds, &qdr_dds, 1);
+	dds = (struct txdds_ent *)(ppd->link_speed_active == QIB_IB_QDR ?
+		qdr_dds : (ppd->link_speed_active == QIB_IB_DDR ?
+				ddr_dds : sdr_dds));
+	write_tx_serdes_param(ppd, dds);
+}
+
+/* set QDR forced value for H1, if needed */
+static void force_h1(struct qib_pportdata *ppd)
+{
+	int chan;
+
+	ppd->cpspec->qdr_reforce = 0;
+	if (!ppd->dd->cspec->r1)
+		return;
+
+	for (chan = 0; chan < SERDES_CHANS; chan++) {
+		set_man_mode_h1(ppd, chan, 1, 0);
+		set_man_code(ppd, chan, ppd->cpspec->h1_val);
+		clock_man(ppd, chan);
+		set_man_mode_h1(ppd, chan, 0, 0);
+	}
+}
+
+#define SJA_EN SYM_MASK(SPC_JTAG_ACCESS_REG, SPC_JTAG_ACCESS_EN)
+#define BISTEN_LSB SYM_LSB(SPC_JTAG_ACCESS_REG, bist_en)
+
+#define R_OPCODE_LSB 3
+#define R_OP_NOP 0
+#define R_OP_SHIFT 2
+#define R_OP_UPDATE 3
+#define R_TDI_LSB 2
+#define R_TDO_LSB 1
+#define R_RDY 1
+
+static int qib_r_grab(struct qib_devdata *dd)
+{
+	u64 val;
+	val = SJA_EN;
+	qib_write_kreg(dd, kr_r_access, val);
+	qib_read_kreg32(dd, kr_scratch);
+	return 0;
+}
+
+/* qib_r_wait_for_rdy() not only waits for the ready bit, it
+ * returns the current state of R_TDO
+ */
+static int qib_r_wait_for_rdy(struct qib_devdata *dd)
+{
+	u64 val;
+	int timeout;
+	for (timeout = 0; timeout < 100 ; ++timeout) {
+		val = qib_read_kreg32(dd, kr_r_access);
+		if (val & R_RDY)
+			return (val >> R_TDO_LSB) & 1;
+	}
+	return -1;
+}
+
+static int qib_r_shift(struct qib_devdata *dd, int bisten,
+		       int len, u8 *inp, u8 *outp)
+{
+	u64 valbase, val;
+	int ret, pos;
+
+	valbase = SJA_EN | (bisten << BISTEN_LSB) |
+		(R_OP_SHIFT << R_OPCODE_LSB);
+	ret = qib_r_wait_for_rdy(dd);
+	if (ret < 0)
+		goto bail;
+	for (pos = 0; pos < len; ++pos) {
+		val = valbase;
+		if (outp) {
+			outp[pos >> 3] &= ~(1 << (pos & 7));
+			outp[pos >> 3] |= (ret << (pos & 7));
+		}
+		if (inp) {
+			int tdi = inp[pos >> 3] >> (pos & 7);
+			val |= ((tdi & 1) << R_TDI_LSB);
+		}
+		qib_write_kreg(dd, kr_r_access, val);
+		qib_read_kreg32(dd, kr_scratch);
+		ret = qib_r_wait_for_rdy(dd);
+		if (ret < 0)
+			break;
+	}
+	/* Restore to NOP between operations. */
+	val =  SJA_EN | (bisten << BISTEN_LSB);
+	qib_write_kreg(dd, kr_r_access, val);
+	qib_read_kreg32(dd, kr_scratch);
+	ret = qib_r_wait_for_rdy(dd);
+
+	if (ret >= 0)
+		ret = pos;
+bail:
+	return ret;
+}
+
+static int qib_r_update(struct qib_devdata *dd, int bisten)
+{
+	u64 val;
+	int ret;
+
+	val = SJA_EN | (bisten << BISTEN_LSB) | (R_OP_UPDATE << R_OPCODE_LSB);
+	ret = qib_r_wait_for_rdy(dd);
+	if (ret >= 0) {
+		qib_write_kreg(dd, kr_r_access, val);
+		qib_read_kreg32(dd, kr_scratch);
+	}
+	return ret;
+}
+
+#define BISTEN_PORT_SEL 15
+#define LEN_PORT_SEL 625
+#define BISTEN_AT 17
+#define LEN_AT 156
+#define BISTEN_ETM 16
+#define LEN_ETM 632
+
+#define BIT2BYTE(x) (((x) +  BITS_PER_BYTE - 1) / BITS_PER_BYTE)
+
+/* these are common for all IB port use cases. */
+static u8 reset_at[BIT2BYTE(LEN_AT)] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x20, 0x00,
+};
+static u8 reset_atetm[BIT2BYTE(LEN_ETM)] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x80, 0xe3, 0x81, 0x73, 0x3c, 0x70, 0x8e,
+	0x07, 0xce, 0xf1, 0xc0, 0x39, 0x1e, 0x38, 0xc7, 0x03, 0xe7,
+	0x78, 0xe0, 0x1c, 0x0f, 0x9c, 0x7f, 0x80, 0x73, 0x0f, 0x70,
+	0xde, 0x01, 0xce, 0x39, 0xc0, 0xf9, 0x06, 0x38, 0xd7, 0x00,
+	0xe7, 0x19, 0xe0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
+};
+static u8 at[BIT2BYTE(LEN_AT)] = {
+	0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x20, 0x00,
+};
+
+/* used for IB1 or IB2, only one in use */
+static u8 atetm_1port[BIT2BYTE(LEN_ETM)] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x10, 0xf2, 0x80, 0x83, 0x1e, 0x38, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x50, 0xf4, 0x41, 0x00, 0x18, 0x78, 0xc8, 0x03,
+	0x07, 0x7b, 0xa0, 0x3e, 0x00, 0x02, 0x00, 0x00, 0x18, 0x00,
+	0x18, 0x00, 0x00, 0x00, 0x00, 0x4b, 0x00, 0x00, 0x00,
+};
+
+/* used when both IB1 and IB2 are in use */
+static u8 atetm_2port[BIT2BYTE(LEN_ETM)] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x79,
+	0xc0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0xf8, 0x80, 0x83, 0x1e, 0x38, 0xe0, 0x03, 0x05,
+	0x7b, 0xa0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80,
+	0xa2, 0x0f, 0x50, 0xf4, 0x41, 0x00, 0x18, 0x78, 0xd1, 0x07,
+	0x02, 0x7c, 0x80, 0x3e, 0x00, 0x02, 0x00, 0x00, 0x3e, 0x00,
+	0x02, 0x00, 0x00, 0x00, 0x00, 0x64, 0x00, 0x00, 0x00,
+};
+
+/* used when only IB1 is in use */
+static u8 portsel_port1[BIT2BYTE(LEN_PORT_SEL)] = {
+	0x32, 0x65, 0xa4, 0x7b, 0x10, 0x98, 0xdc, 0xfe, 0x13, 0x13,
+	0x13, 0x13, 0x13, 0x13, 0x13, 0x13, 0x73, 0x0c, 0x0c, 0x0c,
+	0x0c, 0x0c, 0x13, 0x13, 0x13, 0x13, 0x13, 0x13, 0x13, 0x13,
+	0x13, 0x78, 0x78, 0x13, 0x13, 0x13, 0x13, 0x13, 0x13, 0x13,
+	0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x74, 0x32,
+	0x32, 0x32, 0x32, 0x32, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14,
+	0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14,
+	0x14, 0x14, 0x9f, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00,
+};
+
+/* used when only IB2 is in use */
+static u8 portsel_port2[BIT2BYTE(LEN_PORT_SEL)] = {
+	0x32, 0x65, 0xa4, 0x7b, 0x10, 0x98, 0xdc, 0xfe, 0x39, 0x39,
+	0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x73, 0x32, 0x32, 0x32,
+	0x32, 0x32, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39,
+	0x39, 0x78, 0x78, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39,
+	0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x74, 0x32,
+	0x32, 0x32, 0x32, 0x32, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a,
+	0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a,
+	0x3a, 0x3a, 0x9f, 0x01, 0x00, 0x00, 0x00, 0x00, 0x01,
+};
+
+/* used when both IB1 and IB2 are in use */
+static u8 portsel_2port[BIT2BYTE(LEN_PORT_SEL)] = {
+	0x32, 0xba, 0x54, 0x76, 0x10, 0x98, 0xdc, 0xfe, 0x13, 0x13,
+	0x13, 0x13, 0x13, 0x13, 0x13, 0x13, 0x73, 0x0c, 0x0c, 0x0c,
+	0x0c, 0x0c, 0x13, 0x13, 0x13, 0x13, 0x13, 0x13, 0x13, 0x13,
+	0x13, 0x13, 0x13, 0x13, 0x13, 0x13, 0x13, 0x13, 0x13, 0x13,
+	0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x74, 0x32,
+	0x32, 0x32, 0x32, 0x32, 0x14, 0x14, 0x14, 0x14, 0x14, 0x3a,
+	0x3a, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14,
+	0x14, 0x14, 0x9f, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00,
+};
+
+/*
+ * Do setup to properly handle IB link recovery; if port is zero, we
+ * are initializing to cover both ports; otherwise we are initializing
+ * to cover a single port card, or the port has reached INIT and we may
+ * need to switch coverage types.
+ */
+static void setup_7322_link_recovery(struct qib_pportdata *ppd, u32 both)
+{
+	u8 *portsel, *etm;
+	struct qib_devdata *dd = ppd->dd;
+
+	if (!ppd->dd->cspec->r1)
+		return;
+	if (!both) {
+		dd->cspec->recovery_ports_initted++;
+		ppd->cpspec->recovery_init = 1;
+	}
+	if (!both && dd->cspec->recovery_ports_initted == 1) {
+		portsel = ppd->port == 1 ? portsel_port1 : portsel_port2;
+		etm = atetm_1port;
+	} else {
+		portsel = portsel_2port;
+		etm = atetm_2port;
+	}
+
+	if (qib_r_grab(dd) < 0 ||
+		qib_r_shift(dd, BISTEN_ETM, LEN_ETM, reset_atetm, NULL) < 0 ||
+		qib_r_update(dd, BISTEN_ETM) < 0 ||
+		qib_r_shift(dd, BISTEN_AT, LEN_AT, reset_at, NULL) < 0 ||
+		qib_r_update(dd, BISTEN_AT) < 0 ||
+		qib_r_shift(dd, BISTEN_PORT_SEL, LEN_PORT_SEL,
+			    portsel, NULL) < 0 ||
+		qib_r_update(dd, BISTEN_PORT_SEL) < 0 ||
+		qib_r_shift(dd, BISTEN_AT, LEN_AT, at, NULL) < 0 ||
+		qib_r_update(dd, BISTEN_AT) < 0 ||
+		qib_r_shift(dd, BISTEN_ETM, LEN_ETM, etm, NULL) < 0 ||
+		qib_r_update(dd, BISTEN_ETM) < 0)
+		qib_dev_err(dd, "Failed IB link recovery setup\n");
+}
+
+static void check_7322_rxe_status(struct qib_pportdata *ppd)
+{
+	struct qib_devdata *dd = ppd->dd;
+	u64 fmask;
+
+	if (dd->cspec->recovery_ports_initted != 1)
+		return; /* rest doesn't apply to dualport */
+	qib_write_kreg(dd, kr_control, dd->control |
+		       SYM_MASK(Control, FreezeMode));
+	(void)qib_read_kreg64(dd, kr_scratch);
+	udelay(3); /* ibcreset asserted 400ns, be sure that's over */
+	fmask = qib_read_kreg64(dd, kr_act_fmask);
+	if (!fmask) {
+		/*
+		 * require a powercycle before we'll work again, and make
+		 * sure we get no more interrupts, and don't turn off
+		 * freeze.
+		 */
+		ppd->dd->cspec->stay_in_freeze = 1;
+		qib_7322_set_intr_state(ppd->dd, 0);
+		qib_write_kreg(dd, kr_fmask, 0ULL);
+		qib_dev_err(dd, "HCA unusable until powercycled\n");
+		return; /* eventually reset */
+	}
+
+	qib_write_kreg(ppd->dd, kr_hwerrclear,
+	    SYM_MASK(HwErrClear, IBSerdesPClkNotDetectClear_1));
+
+	/* don't do the full clear_freeze(), not needed for this */
+	qib_write_kreg(dd, kr_control, dd->control);
+	qib_read_kreg32(dd, kr_scratch);
+	/* take IBC out of reset */
+	if (ppd->link_speed_supported) {
+		ppd->cpspec->ibcctrl_a &=
+			~SYM_MASK(IBCCtrlA_0, IBStatIntReductionEn);
+		qib_write_kreg_port(ppd, krp_ibcctrl_a,
+				    ppd->cpspec->ibcctrl_a);
+		qib_read_kreg32(dd, kr_scratch);
+		if (ppd->lflags & QIBL_IB_LINK_DISABLED)
+			qib_set_ib_7322_lstate(ppd, 0,
+				QLOGIC_IB_IBCC_LINKINITCMD_DISABLE);
+	}
+}
diff --git a/drivers/infiniband/hw/qib/qib_init.c b/drivers/infiniband/hw/qib/qib_init.c
new file mode 100644
index 0000000..729da39
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_init.c
@@ -0,0 +1,1857 @@
+/*
+ * Copyright (c) 2012, 2013 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/pci.h>
+#include <linux/netdevice.h>
+#include <linux/vmalloc.h>
+#include <linux/delay.h>
+#include <linux/idr.h>
+#include <linux/module.h>
+#include <linux/printk.h>
+#ifdef CONFIG_INFINIBAND_QIB_DCA
+#include <linux/dca.h>
+#endif
+
+#include "qib.h"
+#include "qib_common.h"
+#include "qib_mad.h"
+#ifdef CONFIG_DEBUG_FS
+#include "qib_debugfs.h"
+#include "qib_verbs.h"
+#endif
+
+#undef pr_fmt
+#define pr_fmt(fmt) QIB_DRV_NAME ": " fmt
+
+/*
+ * min buffers we want to have per context, after driver
+ */
+#define QIB_MIN_USER_CTXT_BUFCNT 7
+
+#define QLOGIC_IB_R_SOFTWARE_MASK 0xFF
+#define QLOGIC_IB_R_SOFTWARE_SHIFT 24
+#define QLOGIC_IB_R_EMULATOR_MASK (1ULL<<62)
+
+/*
+ * Number of ctxts we are configured to use (to allow for more pio
+ * buffers per ctxt, etc.)  Zero means use chip value.
+ */
+ushort qib_cfgctxts;
+module_param_named(cfgctxts, qib_cfgctxts, ushort, S_IRUGO);
+MODULE_PARM_DESC(cfgctxts, "Set max number of contexts to use");
+
+unsigned qib_numa_aware;
+module_param_named(numa_aware, qib_numa_aware, uint, S_IRUGO);
+MODULE_PARM_DESC(numa_aware,
+	"0 -> PSM allocation close to HCA, 1 -> PSM allocation local to process");
+
+/*
+ * If set, do not write to any regs if avoidable, hack to allow
+ * check for deranged default register values.
+ */
+ushort qib_mini_init;
+module_param_named(mini_init, qib_mini_init, ushort, S_IRUGO);
+MODULE_PARM_DESC(mini_init, "If set, do minimal diag init");
+
+unsigned qib_n_krcv_queues;
+module_param_named(krcvqs, qib_n_krcv_queues, uint, S_IRUGO);
+MODULE_PARM_DESC(krcvqs, "number of kernel receive queues per IB port");
+
+unsigned qib_cc_table_size;
+module_param_named(cc_table_size, qib_cc_table_size, uint, S_IRUGO);
+MODULE_PARM_DESC(cc_table_size, "Congestion control table entries 0 (CCA disabled - default), min = 128, max = 1984");
+/*
+ * qib_wc_pat parameter:
+ *      0 is WC via MTRR
+ *      1 is WC via PAT
+ *      If PAT initialization fails, code reverts back to MTRR
+ */
+unsigned qib_wc_pat = 1; /* default (1) is to use PAT, not MTRR */
+module_param_named(wc_pat, qib_wc_pat, uint, S_IRUGO);
+MODULE_PARM_DESC(wc_pat, "enable write-combining via PAT mechanism");
+
+static void verify_interrupt(unsigned long);
+
+static struct idr qib_unit_table;
+u32 qib_cpulist_count;
+unsigned long *qib_cpulist;
+
+/* set number of contexts we'll actually use */
+void qib_set_ctxtcnt(struct qib_devdata *dd)
+{
+	if (!qib_cfgctxts) {
+		dd->cfgctxts = dd->first_user_ctxt + num_online_cpus();
+		if (dd->cfgctxts > dd->ctxtcnt)
+			dd->cfgctxts = dd->ctxtcnt;
+	} else if (qib_cfgctxts < dd->num_pports)
+		dd->cfgctxts = dd->ctxtcnt;
+	else if (qib_cfgctxts <= dd->ctxtcnt)
+		dd->cfgctxts = qib_cfgctxts;
+	else
+		dd->cfgctxts = dd->ctxtcnt;
+	dd->freectxts = (dd->first_user_ctxt > dd->cfgctxts) ? 0 :
+		dd->cfgctxts - dd->first_user_ctxt;
+}
+
+/*
+ * Common code for creating the receive context array.
+ */
+int qib_create_ctxts(struct qib_devdata *dd)
+{
+	unsigned i;
+	int local_node_id = pcibus_to_node(dd->pcidev->bus);
+
+	if (local_node_id < 0)
+		local_node_id = numa_node_id();
+	dd->assigned_node_id = local_node_id;
+
+	/*
+	 * Allocate full ctxtcnt array, rather than just cfgctxts, because
+	 * cleanup iterates across all possible ctxts.
+	 */
+	dd->rcd = kzalloc(sizeof(*dd->rcd) * dd->ctxtcnt, GFP_KERNEL);
+	if (!dd->rcd) {
+		qib_dev_err(dd,
+			"Unable to allocate ctxtdata array, failing\n");
+		return -ENOMEM;
+	}
+
+	/* create (one or more) kctxt */
+	for (i = 0; i < dd->first_user_ctxt; ++i) {
+		struct qib_pportdata *ppd;
+		struct qib_ctxtdata *rcd;
+
+		if (dd->skip_kctxt_mask & (1 << i))
+			continue;
+
+		ppd = dd->pport + (i % dd->num_pports);
+
+		rcd = qib_create_ctxtdata(ppd, i, dd->assigned_node_id);
+		if (!rcd) {
+			qib_dev_err(dd,
+				"Unable to allocate ctxtdata for Kernel ctxt, failing\n");
+			kfree(dd->rcd);
+			dd->rcd = NULL;
+			return -ENOMEM;
+		}
+		rcd->pkeys[0] = QIB_DEFAULT_P_KEY;
+		rcd->seq_cnt = 1;
+	}
+	return 0;
+}
+
+/*
+ * Common code for user and kernel context setup.
+ */
+struct qib_ctxtdata *qib_create_ctxtdata(struct qib_pportdata *ppd, u32 ctxt,
+	int node_id)
+{
+	struct qib_devdata *dd = ppd->dd;
+	struct qib_ctxtdata *rcd;
+
+	rcd = kzalloc_node(sizeof(*rcd), GFP_KERNEL, node_id);
+	if (rcd) {
+		INIT_LIST_HEAD(&rcd->qp_wait_list);
+		rcd->node_id = node_id;
+		rcd->ppd = ppd;
+		rcd->dd = dd;
+		rcd->cnt = 1;
+		rcd->ctxt = ctxt;
+		dd->rcd[ctxt] = rcd;
+#ifdef CONFIG_DEBUG_FS
+		if (ctxt < dd->first_user_ctxt) { /* N/A for PSM contexts */
+			rcd->opstats = kzalloc_node(sizeof(*rcd->opstats),
+				GFP_KERNEL, node_id);
+			if (!rcd->opstats) {
+				kfree(rcd);
+				qib_dev_err(dd,
+					"Unable to allocate per ctxt stats buffer\n");
+				return NULL;
+			}
+		}
+#endif
+		dd->f_init_ctxt(rcd);
+
+		/*
+		 * To avoid wasting a lot of memory, we allocate 32KB chunks
+		 * of physically contiguous memory, advance through it until
+		 * used up and then allocate more.  Of course, we need
+		 * memory to store those extra pointers, now.  32KB seems to
+		 * be the most that is "safe" under memory pressure
+		 * (creating large files and then copying them over
+		 * NFS while doing lots of MPI jobs).  The OOM killer can
+		 * get invoked, even though we say we can sleep and this can
+		 * cause significant system problems....
+		 */
+		rcd->rcvegrbuf_size = 0x8000;
+		rcd->rcvegrbufs_perchunk =
+			rcd->rcvegrbuf_size / dd->rcvegrbufsize;
+		rcd->rcvegrbuf_chunks = (rcd->rcvegrcnt +
+			rcd->rcvegrbufs_perchunk - 1) /
+			rcd->rcvegrbufs_perchunk;
+		BUG_ON(!is_power_of_2(rcd->rcvegrbufs_perchunk));
+		rcd->rcvegrbufs_perchunk_shift =
+			ilog2(rcd->rcvegrbufs_perchunk);
+	}
+	return rcd;
+}
+
+/*
+ * Common code for initializing the physical port structure.
+ */
+int qib_init_pportdata(struct qib_pportdata *ppd, struct qib_devdata *dd,
+			u8 hw_pidx, u8 port)
+{
+	int size;
+	ppd->dd = dd;
+	ppd->hw_pidx = hw_pidx;
+	ppd->port = port; /* IB port number, not index */
+
+	spin_lock_init(&ppd->sdma_lock);
+	spin_lock_init(&ppd->lflags_lock);
+	spin_lock_init(&ppd->cc_shadow_lock);
+	init_waitqueue_head(&ppd->state_wait);
+
+	init_timer(&ppd->symerr_clear_timer);
+	ppd->symerr_clear_timer.function = qib_clear_symerror_on_linkup;
+	ppd->symerr_clear_timer.data = (unsigned long)ppd;
+
+	ppd->qib_wq = NULL;
+	ppd->ibport_data.pmastats =
+		alloc_percpu(struct qib_pma_counters);
+	if (!ppd->ibport_data.pmastats)
+		return -ENOMEM;
+
+	if (qib_cc_table_size < IB_CCT_MIN_ENTRIES)
+		goto bail;
+
+	ppd->cc_supported_table_entries = min(max_t(int, qib_cc_table_size,
+		IB_CCT_MIN_ENTRIES), IB_CCT_ENTRIES*IB_CC_TABLE_CAP_DEFAULT);
+
+	ppd->cc_max_table_entries =
+		ppd->cc_supported_table_entries/IB_CCT_ENTRIES;
+
+	size = IB_CC_TABLE_CAP_DEFAULT * sizeof(struct ib_cc_table_entry)
+		* IB_CCT_ENTRIES;
+	ppd->ccti_entries = kzalloc(size, GFP_KERNEL);
+	if (!ppd->ccti_entries) {
+		qib_dev_err(dd,
+		  "failed to allocate congestion control table for port %d!\n",
+		  port);
+		goto bail;
+	}
+
+	size = IB_CC_CCS_ENTRIES * sizeof(struct ib_cc_congestion_entry);
+	ppd->congestion_entries = kzalloc(size, GFP_KERNEL);
+	if (!ppd->congestion_entries) {
+		qib_dev_err(dd,
+		 "failed to allocate congestion setting list for port %d!\n",
+		 port);
+		goto bail_1;
+	}
+
+	size = sizeof(struct cc_table_shadow);
+	ppd->ccti_entries_shadow = kzalloc(size, GFP_KERNEL);
+	if (!ppd->ccti_entries_shadow) {
+		qib_dev_err(dd,
+		 "failed to allocate shadow ccti list for port %d!\n",
+		 port);
+		goto bail_2;
+	}
+
+	size = sizeof(struct ib_cc_congestion_setting_attr);
+	ppd->congestion_entries_shadow = kzalloc(size, GFP_KERNEL);
+	if (!ppd->congestion_entries_shadow) {
+		qib_dev_err(dd,
+		 "failed to allocate shadow congestion setting list for port %d!\n",
+		 port);
+		goto bail_3;
+	}
+
+	return 0;
+
+bail_3:
+	kfree(ppd->ccti_entries_shadow);
+	ppd->ccti_entries_shadow = NULL;
+bail_2:
+	kfree(ppd->congestion_entries);
+	ppd->congestion_entries = NULL;
+bail_1:
+	kfree(ppd->ccti_entries);
+	ppd->ccti_entries = NULL;
+bail:
+	/* User is intentionally disabling the congestion control agent */
+	if (!qib_cc_table_size)
+		return 0;
+
+	if (qib_cc_table_size < IB_CCT_MIN_ENTRIES) {
+		qib_cc_table_size = 0;
+		qib_dev_err(dd,
+		 "Congestion Control table size %d less than minimum %d for port %d\n",
+		 qib_cc_table_size, IB_CCT_MIN_ENTRIES, port);
+	}
+
+	qib_dev_err(dd, "Congestion Control Agent disabled for port %d\n",
+		port);
+	return 0;
+}
+
+static int init_pioavailregs(struct qib_devdata *dd)
+{
+	int ret, pidx;
+	u64 *status_page;
+
+	dd->pioavailregs_dma = dma_alloc_coherent(
+		&dd->pcidev->dev, PAGE_SIZE, &dd->pioavailregs_phys,
+		GFP_KERNEL);
+	if (!dd->pioavailregs_dma) {
+		qib_dev_err(dd,
+			"failed to allocate PIOavail reg area in memory\n");
+		ret = -ENOMEM;
+		goto done;
+	}
+
+	/*
+	 * We really want L2 cache aligned, but for current CPUs of
+	 * interest, they are the same.
+	 */
+	status_page = (u64 *)
+		((char *) dd->pioavailregs_dma +
+		 ((2 * L1_CACHE_BYTES +
+		   dd->pioavregs * sizeof(u64)) & ~L1_CACHE_BYTES));
+	/* device status comes first, for backwards compatibility */
+	dd->devstatusp = status_page;
+	*status_page++ = 0;
+	for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+		dd->pport[pidx].statusp = status_page;
+		*status_page++ = 0;
+	}
+
+	/*
+	 * Setup buffer to hold freeze and other messages, accessible to
+	 * apps, following statusp.  This is per-unit, not per port.
+	 */
+	dd->freezemsg = (char *) status_page;
+	*dd->freezemsg = 0;
+	/* length of msg buffer is "whatever is left" */
+	ret = (char *) status_page - (char *) dd->pioavailregs_dma;
+	dd->freezelen = PAGE_SIZE - ret;
+
+	ret = 0;
+
+done:
+	return ret;
+}
+
+/**
+ * init_shadow_tids - allocate the shadow TID array
+ * @dd: the qlogic_ib device
+ *
+ * allocate the shadow TID array, so we can qib_munlock previous
+ * entries.  It may make more sense to move the pageshadow to the
+ * ctxt data structure, so we only allocate memory for ctxts actually
+ * in use, since we at 8k per ctxt, now.
+ * We don't want failures here to prevent use of the driver/chip,
+ * so no return value.
+ */
+static void init_shadow_tids(struct qib_devdata *dd)
+{
+	struct page **pages;
+	dma_addr_t *addrs;
+
+	pages = vzalloc(dd->cfgctxts * dd->rcvtidcnt * sizeof(struct page *));
+	if (!pages) {
+		qib_dev_err(dd,
+			"failed to allocate shadow page * array, no expected sends!\n");
+		goto bail;
+	}
+
+	addrs = vzalloc(dd->cfgctxts * dd->rcvtidcnt * sizeof(dma_addr_t));
+	if (!addrs) {
+		qib_dev_err(dd,
+			"failed to allocate shadow dma handle array, no expected sends!\n");
+		goto bail_free;
+	}
+
+	dd->pageshadow = pages;
+	dd->physshadow = addrs;
+	return;
+
+bail_free:
+	vfree(pages);
+bail:
+	dd->pageshadow = NULL;
+}
+
+/*
+ * Do initialization for device that is only needed on
+ * first detect, not on resets.
+ */
+static int loadtime_init(struct qib_devdata *dd)
+{
+	int ret = 0;
+
+	if (((dd->revision >> QLOGIC_IB_R_SOFTWARE_SHIFT) &
+	     QLOGIC_IB_R_SOFTWARE_MASK) != QIB_CHIP_SWVERSION) {
+		qib_dev_err(dd,
+			"Driver only handles version %d, chip swversion is %d (%llx), failng\n",
+			QIB_CHIP_SWVERSION,
+			(int)(dd->revision >>
+				QLOGIC_IB_R_SOFTWARE_SHIFT) &
+				QLOGIC_IB_R_SOFTWARE_MASK,
+			(unsigned long long) dd->revision);
+		ret = -ENOSYS;
+		goto done;
+	}
+
+	if (dd->revision & QLOGIC_IB_R_EMULATOR_MASK)
+		qib_devinfo(dd->pcidev, "%s", dd->boardversion);
+
+	spin_lock_init(&dd->pioavail_lock);
+	spin_lock_init(&dd->sendctrl_lock);
+	spin_lock_init(&dd->uctxt_lock);
+	spin_lock_init(&dd->qib_diag_trans_lock);
+	spin_lock_init(&dd->eep_st_lock);
+	mutex_init(&dd->eep_lock);
+
+	if (qib_mini_init)
+		goto done;
+
+	ret = init_pioavailregs(dd);
+	init_shadow_tids(dd);
+
+	qib_get_eeprom_info(dd);
+
+	/* setup time (don't start yet) to verify we got interrupt */
+	init_timer(&dd->intrchk_timer);
+	dd->intrchk_timer.function = verify_interrupt;
+	dd->intrchk_timer.data = (unsigned long) dd;
+
+	ret = qib_cq_init(dd);
+done:
+	return ret;
+}
+
+/**
+ * init_after_reset - re-initialize after a reset
+ * @dd: the qlogic_ib device
+ *
+ * sanity check at least some of the values after reset, and
+ * ensure no receive or transmit (explicitly, in case reset
+ * failed
+ */
+static int init_after_reset(struct qib_devdata *dd)
+{
+	int i;
+
+	/*
+	 * Ensure chip does no sends or receives, tail updates, or
+	 * pioavail updates while we re-initialize.  This is mostly
+	 * for the driver data structures, not chip registers.
+	 */
+	for (i = 0; i < dd->num_pports; ++i) {
+		/*
+		 * ctxt == -1 means "all contexts". Only really safe for
+		 * _dis_abling things, as here.
+		 */
+		dd->f_rcvctrl(dd->pport + i, QIB_RCVCTRL_CTXT_DIS |
+				  QIB_RCVCTRL_INTRAVAIL_DIS |
+				  QIB_RCVCTRL_TAILUPD_DIS, -1);
+		/* Redundant across ports for some, but no big deal.  */
+		dd->f_sendctrl(dd->pport + i, QIB_SENDCTRL_SEND_DIS |
+			QIB_SENDCTRL_AVAIL_DIS);
+	}
+
+	return 0;
+}
+
+static void enable_chip(struct qib_devdata *dd)
+{
+	u64 rcvmask;
+	int i;
+
+	/*
+	 * Enable PIO send, and update of PIOavail regs to memory.
+	 */
+	for (i = 0; i < dd->num_pports; ++i)
+		dd->f_sendctrl(dd->pport + i, QIB_SENDCTRL_SEND_ENB |
+			QIB_SENDCTRL_AVAIL_ENB);
+	/*
+	 * Enable kernel ctxts' receive and receive interrupt.
+	 * Other ctxts done as user opens and inits them.
+	 */
+	rcvmask = QIB_RCVCTRL_CTXT_ENB | QIB_RCVCTRL_INTRAVAIL_ENB;
+	rcvmask |= (dd->flags & QIB_NODMA_RTAIL) ?
+		  QIB_RCVCTRL_TAILUPD_DIS : QIB_RCVCTRL_TAILUPD_ENB;
+	for (i = 0; dd->rcd && i < dd->first_user_ctxt; ++i) {
+		struct qib_ctxtdata *rcd = dd->rcd[i];
+
+		if (rcd)
+			dd->f_rcvctrl(rcd->ppd, rcvmask, i);
+	}
+}
+
+static void verify_interrupt(unsigned long opaque)
+{
+	struct qib_devdata *dd = (struct qib_devdata *) opaque;
+	u64 int_counter;
+
+	if (!dd)
+		return; /* being torn down */
+
+	/*
+	 * If we don't have a lid or any interrupts, let the user know and
+	 * don't bother checking again.
+	 */
+	int_counter = qib_int_counter(dd) - dd->z_int_counter;
+	if (int_counter == 0) {
+		if (!dd->f_intr_fallback(dd))
+			dev_err(&dd->pcidev->dev,
+				"No interrupts detected, not usable.\n");
+		else /* re-arm the timer to see if fallback works */
+			mod_timer(&dd->intrchk_timer, jiffies + HZ/2);
+	}
+}
+
+static void init_piobuf_state(struct qib_devdata *dd)
+{
+	int i, pidx;
+	u32 uctxts;
+
+	/*
+	 * Ensure all buffers are free, and fifos empty.  Buffers
+	 * are common, so only do once for port 0.
+	 *
+	 * After enable and qib_chg_pioavailkernel so we can safely
+	 * enable pioavail updates and PIOENABLE.  After this, packets
+	 * are ready and able to go out.
+	 */
+	dd->f_sendctrl(dd->pport, QIB_SENDCTRL_DISARM_ALL);
+	for (pidx = 0; pidx < dd->num_pports; ++pidx)
+		dd->f_sendctrl(dd->pport + pidx, QIB_SENDCTRL_FLUSH);
+
+	/*
+	 * If not all sendbufs are used, add the one to each of the lower
+	 * numbered contexts.  pbufsctxt and lastctxt_piobuf are
+	 * calculated in chip-specific code because it may cause some
+	 * chip-specific adjustments to be made.
+	 */
+	uctxts = dd->cfgctxts - dd->first_user_ctxt;
+	dd->ctxts_extrabuf = dd->pbufsctxt ?
+		dd->lastctxt_piobuf - (dd->pbufsctxt * uctxts) : 0;
+
+	/*
+	 * Set up the shadow copies of the piobufavail registers,
+	 * which we compare against the chip registers for now, and
+	 * the in memory DMA'ed copies of the registers.
+	 * By now pioavail updates to memory should have occurred, so
+	 * copy them into our working/shadow registers; this is in
+	 * case something went wrong with abort, but mostly to get the
+	 * initial values of the generation bit correct.
+	 */
+	for (i = 0; i < dd->pioavregs; i++) {
+		__le64 tmp;
+
+		tmp = dd->pioavailregs_dma[i];
+		/*
+		 * Don't need to worry about pioavailkernel here
+		 * because we will call qib_chg_pioavailkernel() later
+		 * in initialization, to busy out buffers as needed.
+		 */
+		dd->pioavailshadow[i] = le64_to_cpu(tmp);
+	}
+	while (i < ARRAY_SIZE(dd->pioavailshadow))
+		dd->pioavailshadow[i++] = 0; /* for debugging sanity */
+
+	/* after pioavailshadow is setup */
+	qib_chg_pioavailkernel(dd, 0, dd->piobcnt2k + dd->piobcnt4k,
+			       TXCHK_CHG_TYPE_KERN, NULL);
+	dd->f_initvl15_bufs(dd);
+}
+
+/**
+ * qib_create_workqueues - create per port workqueues
+ * @dd: the qlogic_ib device
+ */
+static int qib_create_workqueues(struct qib_devdata *dd)
+{
+	int pidx;
+	struct qib_pportdata *ppd;
+
+	for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+		ppd = dd->pport + pidx;
+		if (!ppd->qib_wq) {
+			char wq_name[8]; /* 3 + 2 + 1 + 1 + 1 */
+			snprintf(wq_name, sizeof(wq_name), "qib%d_%d",
+				dd->unit, pidx);
+			ppd->qib_wq =
+				create_singlethread_workqueue(wq_name);
+			if (!ppd->qib_wq)
+				goto wq_error;
+		}
+	}
+	return 0;
+wq_error:
+	pr_err("create_singlethread_workqueue failed for port %d\n",
+		pidx + 1);
+	for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+		ppd = dd->pport + pidx;
+		if (ppd->qib_wq) {
+			destroy_workqueue(ppd->qib_wq);
+			ppd->qib_wq = NULL;
+		}
+	}
+	return -ENOMEM;
+}
+
+static void qib_free_pportdata(struct qib_pportdata *ppd)
+{
+	free_percpu(ppd->ibport_data.pmastats);
+	ppd->ibport_data.pmastats = NULL;
+}
+
+/**
+ * qib_init - do the actual initialization sequence on the chip
+ * @dd: the qlogic_ib device
+ * @reinit: reinitializing, so don't allocate new memory
+ *
+ * Do the actual initialization sequence on the chip.  This is done
+ * both from the init routine called from the PCI infrastructure, and
+ * when we reset the chip, or detect that it was reset internally,
+ * or it's administratively re-enabled.
+ *
+ * Memory allocation here and in called routines is only done in
+ * the first case (reinit == 0).  We have to be careful, because even
+ * without memory allocation, we need to re-write all the chip registers
+ * TIDs, etc. after the reset or enable has completed.
+ */
+int qib_init(struct qib_devdata *dd, int reinit)
+{
+	int ret = 0, pidx, lastfail = 0;
+	u32 portok = 0;
+	unsigned i;
+	struct qib_ctxtdata *rcd;
+	struct qib_pportdata *ppd;
+	unsigned long flags;
+
+	/* Set linkstate to unknown, so we can watch for a transition. */
+	for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+		ppd = dd->pport + pidx;
+		spin_lock_irqsave(&ppd->lflags_lock, flags);
+		ppd->lflags &= ~(QIBL_LINKACTIVE | QIBL_LINKARMED |
+				 QIBL_LINKDOWN | QIBL_LINKINIT |
+				 QIBL_LINKV);
+		spin_unlock_irqrestore(&ppd->lflags_lock, flags);
+	}
+
+	if (reinit)
+		ret = init_after_reset(dd);
+	else
+		ret = loadtime_init(dd);
+	if (ret)
+		goto done;
+
+	/* Bypass most chip-init, to get to device creation */
+	if (qib_mini_init)
+		return 0;
+
+	ret = dd->f_late_initreg(dd);
+	if (ret)
+		goto done;
+
+	/* dd->rcd can be NULL if early init failed */
+	for (i = 0; dd->rcd && i < dd->first_user_ctxt; ++i) {
+		/*
+		 * Set up the (kernel) rcvhdr queue and egr TIDs.  If doing
+		 * re-init, the simplest way to handle this is to free
+		 * existing, and re-allocate.
+		 * Need to re-create rest of ctxt 0 ctxtdata as well.
+		 */
+		rcd = dd->rcd[i];
+		if (!rcd)
+			continue;
+
+		lastfail = qib_create_rcvhdrq(dd, rcd);
+		if (!lastfail)
+			lastfail = qib_setup_eagerbufs(rcd);
+		if (lastfail) {
+			qib_dev_err(dd,
+				"failed to allocate kernel ctxt's rcvhdrq and/or egr bufs\n");
+			continue;
+		}
+	}
+
+	for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+		int mtu;
+		if (lastfail)
+			ret = lastfail;
+		ppd = dd->pport + pidx;
+		mtu = ib_mtu_enum_to_int(qib_ibmtu);
+		if (mtu == -1) {
+			mtu = QIB_DEFAULT_MTU;
+			qib_ibmtu = 0; /* don't leave invalid value */
+		}
+		/* set max we can ever have for this driver load */
+		ppd->init_ibmaxlen = min(mtu > 2048 ?
+					 dd->piosize4k : dd->piosize2k,
+					 dd->rcvegrbufsize +
+					 (dd->rcvhdrentsize << 2));
+		/*
+		 * Have to initialize ibmaxlen, but this will normally
+		 * change immediately in qib_set_mtu().
+		 */
+		ppd->ibmaxlen = ppd->init_ibmaxlen;
+		qib_set_mtu(ppd, mtu);
+
+		spin_lock_irqsave(&ppd->lflags_lock, flags);
+		ppd->lflags |= QIBL_IB_LINK_DISABLED;
+		spin_unlock_irqrestore(&ppd->lflags_lock, flags);
+
+		lastfail = dd->f_bringup_serdes(ppd);
+		if (lastfail) {
+			qib_devinfo(dd->pcidev,
+				 "Failed to bringup IB port %u\n", ppd->port);
+			lastfail = -ENETDOWN;
+			continue;
+		}
+
+		portok++;
+	}
+
+	if (!portok) {
+		/* none of the ports initialized */
+		if (!ret && lastfail)
+			ret = lastfail;
+		else if (!ret)
+			ret = -ENETDOWN;
+		/* but continue on, so we can debug cause */
+	}
+
+	enable_chip(dd);
+
+	init_piobuf_state(dd);
+
+done:
+	if (!ret) {
+		/* chip is OK for user apps; mark it as initialized */
+		for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+			ppd = dd->pport + pidx;
+			/*
+			 * Set status even if port serdes is not initialized
+			 * so that diags will work.
+			 */
+			*ppd->statusp |= QIB_STATUS_CHIP_PRESENT |
+				QIB_STATUS_INITTED;
+			if (!ppd->link_speed_enabled)
+				continue;
+			if (dd->flags & QIB_HAS_SEND_DMA)
+				ret = qib_setup_sdma(ppd);
+			init_timer(&ppd->hol_timer);
+			ppd->hol_timer.function = qib_hol_event;
+			ppd->hol_timer.data = (unsigned long)ppd;
+			ppd->hol_state = QIB_HOL_UP;
+		}
+
+		/* now we can enable all interrupts from the chip */
+		dd->f_set_intr_state(dd, 1);
+
+		/*
+		 * Setup to verify we get an interrupt, and fallback
+		 * to an alternate if necessary and possible.
+		 */
+		mod_timer(&dd->intrchk_timer, jiffies + HZ/2);
+		/* start stats retrieval timer */
+		mod_timer(&dd->stats_timer, jiffies + HZ * ACTIVITY_TIMER);
+	}
+
+	/* if ret is non-zero, we probably should do some cleanup here... */
+	return ret;
+}
+
+/*
+ * These next two routines are placeholders in case we don't have per-arch
+ * code for controlling write combining.  If explicit control of write
+ * combining is not available, performance will probably be awful.
+ */
+
+int __attribute__((weak)) qib_enable_wc(struct qib_devdata *dd)
+{
+	return -EOPNOTSUPP;
+}
+
+void __attribute__((weak)) qib_disable_wc(struct qib_devdata *dd)
+{
+}
+
+static inline struct qib_devdata *__qib_lookup(int unit)
+{
+	return idr_find(&qib_unit_table, unit);
+}
+
+struct qib_devdata *qib_lookup(int unit)
+{
+	struct qib_devdata *dd;
+	unsigned long flags;
+
+	spin_lock_irqsave(&qib_devs_lock, flags);
+	dd = __qib_lookup(unit);
+	spin_unlock_irqrestore(&qib_devs_lock, flags);
+
+	return dd;
+}
+
+/*
+ * Stop the timers during unit shutdown, or after an error late
+ * in initialization.
+ */
+static void qib_stop_timers(struct qib_devdata *dd)
+{
+	struct qib_pportdata *ppd;
+	int pidx;
+
+	if (dd->stats_timer.data) {
+		del_timer_sync(&dd->stats_timer);
+		dd->stats_timer.data = 0;
+	}
+	if (dd->intrchk_timer.data) {
+		del_timer_sync(&dd->intrchk_timer);
+		dd->intrchk_timer.data = 0;
+	}
+	for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+		ppd = dd->pport + pidx;
+		if (ppd->hol_timer.data)
+			del_timer_sync(&ppd->hol_timer);
+		if (ppd->led_override_timer.data) {
+			del_timer_sync(&ppd->led_override_timer);
+			atomic_set(&ppd->led_override_timer_active, 0);
+		}
+		if (ppd->symerr_clear_timer.data)
+			del_timer_sync(&ppd->symerr_clear_timer);
+	}
+}
+
+/**
+ * qib_shutdown_device - shut down a device
+ * @dd: the qlogic_ib device
+ *
+ * This is called to make the device quiet when we are about to
+ * unload the driver, and also when the device is administratively
+ * disabled.   It does not free any data structures.
+ * Everything it does has to be setup again by qib_init(dd, 1)
+ */
+static void qib_shutdown_device(struct qib_devdata *dd)
+{
+	struct qib_pportdata *ppd;
+	unsigned pidx;
+
+	for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+		ppd = dd->pport + pidx;
+
+		spin_lock_irq(&ppd->lflags_lock);
+		ppd->lflags &= ~(QIBL_LINKDOWN | QIBL_LINKINIT |
+				 QIBL_LINKARMED | QIBL_LINKACTIVE |
+				 QIBL_LINKV);
+		spin_unlock_irq(&ppd->lflags_lock);
+		*ppd->statusp &= ~(QIB_STATUS_IB_CONF | QIB_STATUS_IB_READY);
+	}
+	dd->flags &= ~QIB_INITTED;
+
+	/* mask interrupts, but not errors */
+	dd->f_set_intr_state(dd, 0);
+
+	for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+		ppd = dd->pport + pidx;
+		dd->f_rcvctrl(ppd, QIB_RCVCTRL_TAILUPD_DIS |
+				   QIB_RCVCTRL_CTXT_DIS |
+				   QIB_RCVCTRL_INTRAVAIL_DIS |
+				   QIB_RCVCTRL_PKEY_ENB, -1);
+		/*
+		 * Gracefully stop all sends allowing any in progress to
+		 * trickle out first.
+		 */
+		dd->f_sendctrl(ppd, QIB_SENDCTRL_CLEAR);
+	}
+
+	/*
+	 * Enough for anything that's going to trickle out to have actually
+	 * done so.
+	 */
+	udelay(20);
+
+	for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+		ppd = dd->pport + pidx;
+		dd->f_setextled(ppd, 0); /* make sure LEDs are off */
+
+		if (dd->flags & QIB_HAS_SEND_DMA)
+			qib_teardown_sdma(ppd);
+
+		dd->f_sendctrl(ppd, QIB_SENDCTRL_AVAIL_DIS |
+				    QIB_SENDCTRL_SEND_DIS);
+		/*
+		 * Clear SerdesEnable.
+		 * We can't count on interrupts since we are stopping.
+		 */
+		dd->f_quiet_serdes(ppd);
+
+		if (ppd->qib_wq) {
+			destroy_workqueue(ppd->qib_wq);
+			ppd->qib_wq = NULL;
+		}
+		qib_free_pportdata(ppd);
+	}
+
+	qib_update_eeprom_log(dd);
+}
+
+/**
+ * qib_free_ctxtdata - free a context's allocated data
+ * @dd: the qlogic_ib device
+ * @rcd: the ctxtdata structure
+ *
+ * free up any allocated data for a context
+ * This should not touch anything that would affect a simultaneous
+ * re-allocation of context data, because it is called after qib_mutex
+ * is released (and can be called from reinit as well).
+ * It should never change any chip state, or global driver state.
+ */
+void qib_free_ctxtdata(struct qib_devdata *dd, struct qib_ctxtdata *rcd)
+{
+	if (!rcd)
+		return;
+
+	if (rcd->rcvhdrq) {
+		dma_free_coherent(&dd->pcidev->dev, rcd->rcvhdrq_size,
+				  rcd->rcvhdrq, rcd->rcvhdrq_phys);
+		rcd->rcvhdrq = NULL;
+		if (rcd->rcvhdrtail_kvaddr) {
+			dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE,
+					  rcd->rcvhdrtail_kvaddr,
+					  rcd->rcvhdrqtailaddr_phys);
+			rcd->rcvhdrtail_kvaddr = NULL;
+		}
+	}
+	if (rcd->rcvegrbuf) {
+		unsigned e;
+
+		for (e = 0; e < rcd->rcvegrbuf_chunks; e++) {
+			void *base = rcd->rcvegrbuf[e];
+			size_t size = rcd->rcvegrbuf_size;
+
+			dma_free_coherent(&dd->pcidev->dev, size,
+					  base, rcd->rcvegrbuf_phys[e]);
+		}
+		kfree(rcd->rcvegrbuf);
+		rcd->rcvegrbuf = NULL;
+		kfree(rcd->rcvegrbuf_phys);
+		rcd->rcvegrbuf_phys = NULL;
+		rcd->rcvegrbuf_chunks = 0;
+	}
+
+	kfree(rcd->tid_pg_list);
+	vfree(rcd->user_event_mask);
+	vfree(rcd->subctxt_uregbase);
+	vfree(rcd->subctxt_rcvegrbuf);
+	vfree(rcd->subctxt_rcvhdr_base);
+#ifdef CONFIG_DEBUG_FS
+	kfree(rcd->opstats);
+	rcd->opstats = NULL;
+#endif
+	kfree(rcd);
+}
+
+/*
+ * Perform a PIO buffer bandwidth write test, to verify proper system
+ * configuration.  Even when all the setup calls work, occasionally
+ * BIOS or other issues can prevent write combining from working, or
+ * can cause other bandwidth problems to the chip.
+ *
+ * This test simply writes the same buffer over and over again, and
+ * measures close to the peak bandwidth to the chip (not testing
+ * data bandwidth to the wire).   On chips that use an address-based
+ * trigger to send packets to the wire, this is easy.  On chips that
+ * use a count to trigger, we want to make sure that the packet doesn't
+ * go out on the wire, or trigger flow control checks.
+ */
+static void qib_verify_pioperf(struct qib_devdata *dd)
+{
+	u32 pbnum, cnt, lcnt;
+	u32 __iomem *piobuf;
+	u32 *addr;
+	u64 msecs, emsecs;
+
+	piobuf = dd->f_getsendbuf(dd->pport, 0ULL, &pbnum);
+	if (!piobuf) {
+		qib_devinfo(dd->pcidev,
+			 "No PIObufs for checking perf, skipping\n");
+		return;
+	}
+
+	/*
+	 * Enough to give us a reasonable test, less than piobuf size, and
+	 * likely multiple of store buffer length.
+	 */
+	cnt = 1024;
+
+	addr = vmalloc(cnt);
+	if (!addr) {
+		qib_devinfo(dd->pcidev,
+			 "Couldn't get memory for checking PIO perf,"
+			 " skipping\n");
+		goto done;
+	}
+
+	preempt_disable();  /* we want reasonably accurate elapsed time */
+	msecs = 1 + jiffies_to_msecs(jiffies);
+	for (lcnt = 0; lcnt < 10000U; lcnt++) {
+		/* wait until we cross msec boundary */
+		if (jiffies_to_msecs(jiffies) >= msecs)
+			break;
+		udelay(1);
+	}
+
+	dd->f_set_armlaunch(dd, 0);
+
+	/*
+	 * length 0, no dwords actually sent
+	 */
+	writeq(0, piobuf);
+	qib_flush_wc();
+
+	/*
+	 * This is only roughly accurate, since even with preempt we
+	 * still take interrupts that could take a while.   Running for
+	 * >= 5 msec seems to get us "close enough" to accurate values.
+	 */
+	msecs = jiffies_to_msecs(jiffies);
+	for (emsecs = lcnt = 0; emsecs <= 5UL; lcnt++) {
+		qib_pio_copy(piobuf + 64, addr, cnt >> 2);
+		emsecs = jiffies_to_msecs(jiffies) - msecs;
+	}
+
+	/* 1 GiB/sec, slightly over IB SDR line rate */
+	if (lcnt < (emsecs * 1024U))
+		qib_dev_err(dd,
+			    "Performance problem: bandwidth to PIO buffers is only %u MiB/sec\n",
+			    lcnt / (u32) emsecs);
+
+	preempt_enable();
+
+	vfree(addr);
+
+done:
+	/* disarm piobuf, so it's available again */
+	dd->f_sendctrl(dd->pport, QIB_SENDCTRL_DISARM_BUF(pbnum));
+	qib_sendbuf_done(dd, pbnum);
+	dd->f_set_armlaunch(dd, 1);
+}
+
+void qib_free_devdata(struct qib_devdata *dd)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&qib_devs_lock, flags);
+	idr_remove(&qib_unit_table, dd->unit);
+	list_del(&dd->list);
+	spin_unlock_irqrestore(&qib_devs_lock, flags);
+
+#ifdef CONFIG_DEBUG_FS
+	qib_dbg_ibdev_exit(&dd->verbs_dev);
+#endif
+	free_percpu(dd->int_counter);
+	ib_dealloc_device(&dd->verbs_dev.ibdev);
+}
+
+u64 qib_int_counter(struct qib_devdata *dd)
+{
+	int cpu;
+	u64 int_counter = 0;
+
+	for_each_possible_cpu(cpu)
+		int_counter += *per_cpu_ptr(dd->int_counter, cpu);
+	return int_counter;
+}
+
+u64 qib_sps_ints(void)
+{
+	unsigned long flags;
+	struct qib_devdata *dd;
+	u64 sps_ints = 0;
+
+	spin_lock_irqsave(&qib_devs_lock, flags);
+	list_for_each_entry(dd, &qib_dev_list, list) {
+		sps_ints += qib_int_counter(dd);
+	}
+	spin_unlock_irqrestore(&qib_devs_lock, flags);
+	return sps_ints;
+}
+
+/*
+ * Allocate our primary per-unit data structure.  Must be done via verbs
+ * allocator, because the verbs cleanup process both does cleanup and
+ * free of the data structure.
+ * "extra" is for chip-specific data.
+ *
+ * Use the idr mechanism to get a unit number for this unit.
+ */
+struct qib_devdata *qib_alloc_devdata(struct pci_dev *pdev, size_t extra)
+{
+	unsigned long flags;
+	struct qib_devdata *dd;
+	int ret;
+
+	dd = (struct qib_devdata *) ib_alloc_device(sizeof(*dd) + extra);
+	if (!dd)
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD(&dd->list);
+
+	idr_preload(GFP_KERNEL);
+	spin_lock_irqsave(&qib_devs_lock, flags);
+
+	ret = idr_alloc(&qib_unit_table, dd, 0, 0, GFP_NOWAIT);
+	if (ret >= 0) {
+		dd->unit = ret;
+		list_add(&dd->list, &qib_dev_list);
+	}
+
+	spin_unlock_irqrestore(&qib_devs_lock, flags);
+	idr_preload_end();
+
+	if (ret < 0) {
+		qib_early_err(&pdev->dev,
+			      "Could not allocate unit ID: error %d\n", -ret);
+		goto bail;
+	}
+	dd->int_counter = alloc_percpu(u64);
+	if (!dd->int_counter) {
+		ret = -ENOMEM;
+		qib_early_err(&pdev->dev,
+			      "Could not allocate per-cpu int_counter\n");
+		goto bail;
+	}
+
+	if (!qib_cpulist_count) {
+		u32 count = num_online_cpus();
+		qib_cpulist = kzalloc(BITS_TO_LONGS(count) *
+				      sizeof(long), GFP_KERNEL);
+		if (qib_cpulist)
+			qib_cpulist_count = count;
+		else
+			qib_early_err(&pdev->dev,
+				"Could not alloc cpulist info, cpu affinity might be wrong\n");
+	}
+#ifdef CONFIG_DEBUG_FS
+	qib_dbg_ibdev_init(&dd->verbs_dev);
+#endif
+	return dd;
+bail:
+	if (!list_empty(&dd->list))
+		list_del_init(&dd->list);
+	ib_dealloc_device(&dd->verbs_dev.ibdev);
+	return ERR_PTR(ret);;
+}
+
+/*
+ * Called from freeze mode handlers, and from PCI error
+ * reporting code.  Should be paranoid about state of
+ * system and data structures.
+ */
+void qib_disable_after_error(struct qib_devdata *dd)
+{
+	if (dd->flags & QIB_INITTED) {
+		u32 pidx;
+
+		dd->flags &= ~QIB_INITTED;
+		if (dd->pport)
+			for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+				struct qib_pportdata *ppd;
+
+				ppd = dd->pport + pidx;
+				if (dd->flags & QIB_PRESENT) {
+					qib_set_linkstate(ppd,
+						QIB_IB_LINKDOWN_DISABLE);
+					dd->f_setextled(ppd, 0);
+				}
+				*ppd->statusp &= ~QIB_STATUS_IB_READY;
+			}
+	}
+
+	/*
+	 * Mark as having had an error for driver, and also
+	 * for /sys and status word mapped to user programs.
+	 * This marks unit as not usable, until reset.
+	 */
+	if (dd->devstatusp)
+		*dd->devstatusp |= QIB_STATUS_HWERROR;
+}
+
+static void qib_remove_one(struct pci_dev *);
+static int qib_init_one(struct pci_dev *, const struct pci_device_id *);
+
+#define DRIVER_LOAD_MSG "Intel " QIB_DRV_NAME " loaded: "
+#define PFX QIB_DRV_NAME ": "
+
+static const struct pci_device_id qib_pci_tbl[] = {
+	{ PCI_DEVICE(PCI_VENDOR_ID_PATHSCALE, PCI_DEVICE_ID_QLOGIC_IB_6120) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_QLOGIC, PCI_DEVICE_ID_QLOGIC_IB_7220) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_QLOGIC, PCI_DEVICE_ID_QLOGIC_IB_7322) },
+	{ 0, }
+};
+
+MODULE_DEVICE_TABLE(pci, qib_pci_tbl);
+
+static struct pci_driver qib_driver = {
+	.name = QIB_DRV_NAME,
+	.probe = qib_init_one,
+	.remove = qib_remove_one,
+	.id_table = qib_pci_tbl,
+	.err_handler = &qib_pci_err_handler,
+};
+
+#ifdef CONFIG_INFINIBAND_QIB_DCA
+
+static int qib_notify_dca(struct notifier_block *, unsigned long, void *);
+static struct notifier_block dca_notifier = {
+	.notifier_call  = qib_notify_dca,
+	.next           = NULL,
+	.priority       = 0
+};
+
+static int qib_notify_dca_device(struct device *device, void *data)
+{
+	struct qib_devdata *dd = dev_get_drvdata(device);
+	unsigned long event = *(unsigned long *)data;
+
+	return dd->f_notify_dca(dd, event);
+}
+
+static int qib_notify_dca(struct notifier_block *nb, unsigned long event,
+					  void *p)
+{
+	int rval;
+
+	rval = driver_for_each_device(&qib_driver.driver, NULL,
+				      &event, qib_notify_dca_device);
+	return rval ? NOTIFY_BAD : NOTIFY_DONE;
+}
+
+#endif
+
+/*
+ * Do all the generic driver unit- and chip-independent memory
+ * allocation and initialization.
+ */
+static int __init qib_ib_init(void)
+{
+	int ret;
+
+	ret = qib_dev_init();
+	if (ret)
+		goto bail;
+
+	/*
+	 * These must be called before the driver is registered with
+	 * the PCI subsystem.
+	 */
+	idr_init(&qib_unit_table);
+
+#ifdef CONFIG_INFINIBAND_QIB_DCA
+	dca_register_notify(&dca_notifier);
+#endif
+#ifdef CONFIG_DEBUG_FS
+	qib_dbg_init();
+#endif
+	ret = pci_register_driver(&qib_driver);
+	if (ret < 0) {
+		pr_err("Unable to register driver: error %d\n", -ret);
+		goto bail_dev;
+	}
+
+	/* not fatal if it doesn't work */
+	if (qib_init_qibfs())
+		pr_err("Unable to register ipathfs\n");
+	goto bail; /* all OK */
+
+bail_dev:
+#ifdef CONFIG_INFINIBAND_QIB_DCA
+	dca_unregister_notify(&dca_notifier);
+#endif
+#ifdef CONFIG_DEBUG_FS
+	qib_dbg_exit();
+#endif
+	idr_destroy(&qib_unit_table);
+	qib_dev_cleanup();
+bail:
+	return ret;
+}
+
+module_init(qib_ib_init);
+
+/*
+ * Do the non-unit driver cleanup, memory free, etc. at unload.
+ */
+static void __exit qib_ib_cleanup(void)
+{
+	int ret;
+
+	ret = qib_exit_qibfs();
+	if (ret)
+		pr_err(
+			"Unable to cleanup counter filesystem: error %d\n",
+			-ret);
+
+#ifdef CONFIG_INFINIBAND_QIB_DCA
+	dca_unregister_notify(&dca_notifier);
+#endif
+	pci_unregister_driver(&qib_driver);
+#ifdef CONFIG_DEBUG_FS
+	qib_dbg_exit();
+#endif
+
+	qib_cpulist_count = 0;
+	kfree(qib_cpulist);
+
+	idr_destroy(&qib_unit_table);
+	qib_dev_cleanup();
+}
+
+module_exit(qib_ib_cleanup);
+
+/* this can only be called after a successful initialization */
+static void cleanup_device_data(struct qib_devdata *dd)
+{
+	int ctxt;
+	int pidx;
+	struct qib_ctxtdata **tmp;
+	unsigned long flags;
+
+	/* users can't do anything more with chip */
+	for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+		if (dd->pport[pidx].statusp)
+			*dd->pport[pidx].statusp &= ~QIB_STATUS_CHIP_PRESENT;
+
+		spin_lock(&dd->pport[pidx].cc_shadow_lock);
+
+		kfree(dd->pport[pidx].congestion_entries);
+		dd->pport[pidx].congestion_entries = NULL;
+		kfree(dd->pport[pidx].ccti_entries);
+		dd->pport[pidx].ccti_entries = NULL;
+		kfree(dd->pport[pidx].ccti_entries_shadow);
+		dd->pport[pidx].ccti_entries_shadow = NULL;
+		kfree(dd->pport[pidx].congestion_entries_shadow);
+		dd->pport[pidx].congestion_entries_shadow = NULL;
+
+		spin_unlock(&dd->pport[pidx].cc_shadow_lock);
+	}
+
+	if (!qib_wc_pat)
+		qib_disable_wc(dd);
+
+	if (dd->pioavailregs_dma) {
+		dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE,
+				  (void *) dd->pioavailregs_dma,
+				  dd->pioavailregs_phys);
+		dd->pioavailregs_dma = NULL;
+	}
+
+	if (dd->pageshadow) {
+		struct page **tmpp = dd->pageshadow;
+		dma_addr_t *tmpd = dd->physshadow;
+		int i;
+
+		for (ctxt = 0; ctxt < dd->cfgctxts; ctxt++) {
+			int ctxt_tidbase = ctxt * dd->rcvtidcnt;
+			int maxtid = ctxt_tidbase + dd->rcvtidcnt;
+
+			for (i = ctxt_tidbase; i < maxtid; i++) {
+				if (!tmpp[i])
+					continue;
+				pci_unmap_page(dd->pcidev, tmpd[i],
+					       PAGE_SIZE, PCI_DMA_FROMDEVICE);
+				qib_release_user_pages(&tmpp[i], 1);
+				tmpp[i] = NULL;
+			}
+		}
+
+		dd->pageshadow = NULL;
+		vfree(tmpp);
+		dd->physshadow = NULL;
+		vfree(tmpd);
+	}
+
+	/*
+	 * Free any resources still in use (usually just kernel contexts)
+	 * at unload; we do for ctxtcnt, because that's what we allocate.
+	 * We acquire lock to be really paranoid that rcd isn't being
+	 * accessed from some interrupt-related code (that should not happen,
+	 * but best to be sure).
+	 */
+	spin_lock_irqsave(&dd->uctxt_lock, flags);
+	tmp = dd->rcd;
+	dd->rcd = NULL;
+	spin_unlock_irqrestore(&dd->uctxt_lock, flags);
+	for (ctxt = 0; tmp && ctxt < dd->ctxtcnt; ctxt++) {
+		struct qib_ctxtdata *rcd = tmp[ctxt];
+
+		tmp[ctxt] = NULL; /* debugging paranoia */
+		qib_free_ctxtdata(dd, rcd);
+	}
+	kfree(tmp);
+	kfree(dd->boardname);
+	qib_cq_exit(dd);
+}
+
+/*
+ * Clean up on unit shutdown, or error during unit load after
+ * successful initialization.
+ */
+static void qib_postinit_cleanup(struct qib_devdata *dd)
+{
+	/*
+	 * Clean up chip-specific stuff.
+	 * We check for NULL here, because it's outside
+	 * the kregbase check, and we need to call it
+	 * after the free_irq.  Thus it's possible that
+	 * the function pointers were never initialized.
+	 */
+	if (dd->f_cleanup)
+		dd->f_cleanup(dd);
+
+	qib_pcie_ddcleanup(dd);
+
+	cleanup_device_data(dd);
+
+	qib_free_devdata(dd);
+}
+
+static int qib_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
+{
+	int ret, j, pidx, initfail;
+	struct qib_devdata *dd = NULL;
+
+	ret = qib_pcie_init(pdev, ent);
+	if (ret)
+		goto bail;
+
+	/*
+	 * Do device-specific initialiation, function table setup, dd
+	 * allocation, etc.
+	 */
+	switch (ent->device) {
+	case PCI_DEVICE_ID_QLOGIC_IB_6120:
+#ifdef CONFIG_PCI_MSI
+		dd = qib_init_iba6120_funcs(pdev, ent);
+#else
+		qib_early_err(&pdev->dev,
+			"Intel PCIE device 0x%x cannot work if CONFIG_PCI_MSI is not enabled\n",
+			ent->device);
+		dd = ERR_PTR(-ENODEV);
+#endif
+		break;
+
+	case PCI_DEVICE_ID_QLOGIC_IB_7220:
+		dd = qib_init_iba7220_funcs(pdev, ent);
+		break;
+
+	case PCI_DEVICE_ID_QLOGIC_IB_7322:
+		dd = qib_init_iba7322_funcs(pdev, ent);
+		break;
+
+	default:
+		qib_early_err(&pdev->dev,
+			"Failing on unknown Intel deviceid 0x%x\n",
+			ent->device);
+		ret = -ENODEV;
+	}
+
+	if (IS_ERR(dd))
+		ret = PTR_ERR(dd);
+	if (ret)
+		goto bail; /* error already printed */
+
+	ret = qib_create_workqueues(dd);
+	if (ret)
+		goto bail;
+
+	/* do the generic initialization */
+	initfail = qib_init(dd, 0);
+
+	ret = qib_register_ib_device(dd);
+
+	/*
+	 * Now ready for use.  this should be cleared whenever we
+	 * detect a reset, or initiate one.  If earlier failure,
+	 * we still create devices, so diags, etc. can be used
+	 * to determine cause of problem.
+	 */
+	if (!qib_mini_init && !initfail && !ret)
+		dd->flags |= QIB_INITTED;
+
+	j = qib_device_create(dd);
+	if (j)
+		qib_dev_err(dd, "Failed to create /dev devices: %d\n", -j);
+	j = qibfs_add(dd);
+	if (j)
+		qib_dev_err(dd, "Failed filesystem setup for counters: %d\n",
+			    -j);
+
+	if (qib_mini_init || initfail || ret) {
+		qib_stop_timers(dd);
+		flush_workqueue(ib_wq);
+		for (pidx = 0; pidx < dd->num_pports; ++pidx)
+			dd->f_quiet_serdes(dd->pport + pidx);
+		if (qib_mini_init)
+			goto bail;
+		if (!j) {
+			(void) qibfs_remove(dd);
+			qib_device_remove(dd);
+		}
+		if (!ret)
+			qib_unregister_ib_device(dd);
+		qib_postinit_cleanup(dd);
+		if (initfail)
+			ret = initfail;
+		goto bail;
+	}
+
+	if (!qib_wc_pat) {
+		ret = qib_enable_wc(dd);
+		if (ret) {
+			qib_dev_err(dd,
+				"Write combining not enabled (err %d): performance may be poor\n",
+				-ret);
+			ret = 0;
+		}
+	}
+
+	qib_verify_pioperf(dd);
+bail:
+	return ret;
+}
+
+static void qib_remove_one(struct pci_dev *pdev)
+{
+	struct qib_devdata *dd = pci_get_drvdata(pdev);
+	int ret;
+
+	/* unregister from IB core */
+	qib_unregister_ib_device(dd);
+
+	/*
+	 * Disable the IB link, disable interrupts on the device,
+	 * clear dma engines, etc.
+	 */
+	if (!qib_mini_init)
+		qib_shutdown_device(dd);
+
+	qib_stop_timers(dd);
+
+	/* wait until all of our (qsfp) queue_work() calls complete */
+	flush_workqueue(ib_wq);
+
+	ret = qibfs_remove(dd);
+	if (ret)
+		qib_dev_err(dd, "Failed counters filesystem cleanup: %d\n",
+			    -ret);
+
+	qib_device_remove(dd);
+
+	qib_postinit_cleanup(dd);
+}
+
+/**
+ * qib_create_rcvhdrq - create a receive header queue
+ * @dd: the qlogic_ib device
+ * @rcd: the context data
+ *
+ * This must be contiguous memory (from an i/o perspective), and must be
+ * DMA'able (which means for some systems, it will go through an IOMMU,
+ * or be forced into a low address range).
+ */
+int qib_create_rcvhdrq(struct qib_devdata *dd, struct qib_ctxtdata *rcd)
+{
+	unsigned amt;
+	int old_node_id;
+
+	if (!rcd->rcvhdrq) {
+		dma_addr_t phys_hdrqtail;
+		gfp_t gfp_flags;
+
+		amt = ALIGN(dd->rcvhdrcnt * dd->rcvhdrentsize *
+			    sizeof(u32), PAGE_SIZE);
+		gfp_flags = (rcd->ctxt >= dd->first_user_ctxt) ?
+			GFP_USER : GFP_KERNEL;
+
+		old_node_id = dev_to_node(&dd->pcidev->dev);
+		set_dev_node(&dd->pcidev->dev, rcd->node_id);
+		rcd->rcvhdrq = dma_alloc_coherent(
+			&dd->pcidev->dev, amt, &rcd->rcvhdrq_phys,
+			gfp_flags | __GFP_COMP);
+		set_dev_node(&dd->pcidev->dev, old_node_id);
+
+		if (!rcd->rcvhdrq) {
+			qib_dev_err(dd,
+				"attempt to allocate %d bytes for ctxt %u rcvhdrq failed\n",
+				amt, rcd->ctxt);
+			goto bail;
+		}
+
+		if (rcd->ctxt >= dd->first_user_ctxt) {
+			rcd->user_event_mask = vmalloc_user(PAGE_SIZE);
+			if (!rcd->user_event_mask)
+				goto bail_free_hdrq;
+		}
+
+		if (!(dd->flags & QIB_NODMA_RTAIL)) {
+			set_dev_node(&dd->pcidev->dev, rcd->node_id);
+			rcd->rcvhdrtail_kvaddr = dma_alloc_coherent(
+				&dd->pcidev->dev, PAGE_SIZE, &phys_hdrqtail,
+				gfp_flags);
+			set_dev_node(&dd->pcidev->dev, old_node_id);
+			if (!rcd->rcvhdrtail_kvaddr)
+				goto bail_free;
+			rcd->rcvhdrqtailaddr_phys = phys_hdrqtail;
+		}
+
+		rcd->rcvhdrq_size = amt;
+	}
+
+	/* clear for security and sanity on each use */
+	memset(rcd->rcvhdrq, 0, rcd->rcvhdrq_size);
+	if (rcd->rcvhdrtail_kvaddr)
+		memset(rcd->rcvhdrtail_kvaddr, 0, PAGE_SIZE);
+	return 0;
+
+bail_free:
+	qib_dev_err(dd,
+		"attempt to allocate 1 page for ctxt %u rcvhdrqtailaddr failed\n",
+		rcd->ctxt);
+	vfree(rcd->user_event_mask);
+	rcd->user_event_mask = NULL;
+bail_free_hdrq:
+	dma_free_coherent(&dd->pcidev->dev, amt, rcd->rcvhdrq,
+			  rcd->rcvhdrq_phys);
+	rcd->rcvhdrq = NULL;
+bail:
+	return -ENOMEM;
+}
+
+/**
+ * allocate eager buffers, both kernel and user contexts.
+ * @rcd: the context we are setting up.
+ *
+ * Allocate the eager TID buffers and program them into hip.
+ * They are no longer completely contiguous, we do multiple allocation
+ * calls.  Otherwise we get the OOM code involved, by asking for too
+ * much per call, with disastrous results on some kernels.
+ */
+int qib_setup_eagerbufs(struct qib_ctxtdata *rcd)
+{
+	struct qib_devdata *dd = rcd->dd;
+	unsigned e, egrcnt, egrperchunk, chunk, egrsize, egroff;
+	size_t size;
+	gfp_t gfp_flags;
+	int old_node_id;
+
+	/*
+	 * GFP_USER, but without GFP_FS, so buffer cache can be
+	 * coalesced (we hope); otherwise, even at order 4,
+	 * heavy filesystem activity makes these fail, and we can
+	 * use compound pages.
+	 */
+	gfp_flags = __GFP_WAIT | __GFP_IO | __GFP_COMP;
+
+	egrcnt = rcd->rcvegrcnt;
+	egroff = rcd->rcvegr_tid_base;
+	egrsize = dd->rcvegrbufsize;
+
+	chunk = rcd->rcvegrbuf_chunks;
+	egrperchunk = rcd->rcvegrbufs_perchunk;
+	size = rcd->rcvegrbuf_size;
+	if (!rcd->rcvegrbuf) {
+		rcd->rcvegrbuf =
+			kzalloc_node(chunk * sizeof(rcd->rcvegrbuf[0]),
+				GFP_KERNEL, rcd->node_id);
+		if (!rcd->rcvegrbuf)
+			goto bail;
+	}
+	if (!rcd->rcvegrbuf_phys) {
+		rcd->rcvegrbuf_phys =
+			kmalloc_node(chunk * sizeof(rcd->rcvegrbuf_phys[0]),
+				GFP_KERNEL, rcd->node_id);
+		if (!rcd->rcvegrbuf_phys)
+			goto bail_rcvegrbuf;
+	}
+	for (e = 0; e < rcd->rcvegrbuf_chunks; e++) {
+		if (rcd->rcvegrbuf[e])
+			continue;
+
+		old_node_id = dev_to_node(&dd->pcidev->dev);
+		set_dev_node(&dd->pcidev->dev, rcd->node_id);
+		rcd->rcvegrbuf[e] =
+			dma_alloc_coherent(&dd->pcidev->dev, size,
+					   &rcd->rcvegrbuf_phys[e],
+					   gfp_flags);
+		set_dev_node(&dd->pcidev->dev, old_node_id);
+		if (!rcd->rcvegrbuf[e])
+			goto bail_rcvegrbuf_phys;
+	}
+
+	rcd->rcvegr_phys = rcd->rcvegrbuf_phys[0];
+
+	for (e = chunk = 0; chunk < rcd->rcvegrbuf_chunks; chunk++) {
+		dma_addr_t pa = rcd->rcvegrbuf_phys[chunk];
+		unsigned i;
+
+		/* clear for security and sanity on each use */
+		memset(rcd->rcvegrbuf[chunk], 0, size);
+
+		for (i = 0; e < egrcnt && i < egrperchunk; e++, i++) {
+			dd->f_put_tid(dd, e + egroff +
+					  (u64 __iomem *)
+					  ((char __iomem *)
+					   dd->kregbase +
+					   dd->rcvegrbase),
+					  RCVHQ_RCV_TYPE_EAGER, pa);
+			pa += egrsize;
+		}
+		cond_resched(); /* don't hog the cpu */
+	}
+
+	return 0;
+
+bail_rcvegrbuf_phys:
+	for (e = 0; e < rcd->rcvegrbuf_chunks && rcd->rcvegrbuf[e]; e++)
+		dma_free_coherent(&dd->pcidev->dev, size,
+				  rcd->rcvegrbuf[e], rcd->rcvegrbuf_phys[e]);
+	kfree(rcd->rcvegrbuf_phys);
+	rcd->rcvegrbuf_phys = NULL;
+bail_rcvegrbuf:
+	kfree(rcd->rcvegrbuf);
+	rcd->rcvegrbuf = NULL;
+bail:
+	return -ENOMEM;
+}
+
+/*
+ * Note: Changes to this routine should be mirrored
+ * for the diagnostics routine qib_remap_ioaddr32().
+ * There is also related code for VL15 buffers in qib_init_7322_variables().
+ * The teardown code that unmaps is in qib_pcie_ddcleanup()
+ */
+int init_chip_wc_pat(struct qib_devdata *dd, u32 vl15buflen)
+{
+	u64 __iomem *qib_kregbase = NULL;
+	void __iomem *qib_piobase = NULL;
+	u64 __iomem *qib_userbase = NULL;
+	u64 qib_kreglen;
+	u64 qib_pio2koffset = dd->piobufbase & 0xffffffff;
+	u64 qib_pio4koffset = dd->piobufbase >> 32;
+	u64 qib_pio2klen = dd->piobcnt2k * dd->palign;
+	u64 qib_pio4klen = dd->piobcnt4k * dd->align4k;
+	u64 qib_physaddr = dd->physaddr;
+	u64 qib_piolen;
+	u64 qib_userlen = 0;
+
+	/*
+	 * Free the old mapping because the kernel will try to reuse the
+	 * old mapping and not create a new mapping with the
+	 * write combining attribute.
+	 */
+	iounmap(dd->kregbase);
+	dd->kregbase = NULL;
+
+	/*
+	 * Assumes chip address space looks like:
+	 *	- kregs + sregs + cregs + uregs (in any order)
+	 *	- piobufs (2K and 4K bufs in either order)
+	 * or:
+	 *	- kregs + sregs + cregs (in any order)
+	 *	- piobufs (2K and 4K bufs in either order)
+	 *	- uregs
+	 */
+	if (dd->piobcnt4k == 0) {
+		qib_kreglen = qib_pio2koffset;
+		qib_piolen = qib_pio2klen;
+	} else if (qib_pio2koffset < qib_pio4koffset) {
+		qib_kreglen = qib_pio2koffset;
+		qib_piolen = qib_pio4koffset + qib_pio4klen - qib_kreglen;
+	} else {
+		qib_kreglen = qib_pio4koffset;
+		qib_piolen = qib_pio2koffset + qib_pio2klen - qib_kreglen;
+	}
+	qib_piolen += vl15buflen;
+	/* Map just the configured ports (not all hw ports) */
+	if (dd->uregbase > qib_kreglen)
+		qib_userlen = dd->ureg_align * dd->cfgctxts;
+
+	/* Sanity checks passed, now create the new mappings */
+	qib_kregbase = ioremap_nocache(qib_physaddr, qib_kreglen);
+	if (!qib_kregbase)
+		goto bail;
+
+	qib_piobase = ioremap_wc(qib_physaddr + qib_kreglen, qib_piolen);
+	if (!qib_piobase)
+		goto bail_kregbase;
+
+	if (qib_userlen) {
+		qib_userbase = ioremap_nocache(qib_physaddr + dd->uregbase,
+					       qib_userlen);
+		if (!qib_userbase)
+			goto bail_piobase;
+	}
+
+	dd->kregbase = qib_kregbase;
+	dd->kregend = (u64 __iomem *)
+		((char __iomem *) qib_kregbase + qib_kreglen);
+	dd->piobase = qib_piobase;
+	dd->pio2kbase = (void __iomem *)
+		(((char __iomem *) dd->piobase) +
+		 qib_pio2koffset - qib_kreglen);
+	if (dd->piobcnt4k)
+		dd->pio4kbase = (void __iomem *)
+			(((char __iomem *) dd->piobase) +
+			 qib_pio4koffset - qib_kreglen);
+	if (qib_userlen)
+		/* ureg will now be accessed relative to dd->userbase */
+		dd->userbase = qib_userbase;
+	return 0;
+
+bail_piobase:
+	iounmap(qib_piobase);
+bail_kregbase:
+	iounmap(qib_kregbase);
+bail:
+	return -ENOMEM;
+}
diff --git a/drivers/infiniband/hw/qib/qib_intr.c b/drivers/infiniband/hw/qib/qib_intr.c
new file mode 100644
index 0000000..f4918f2
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_intr.c
@@ -0,0 +1,241 @@
+/*
+ * Copyright (c) 2006, 2007, 2008, 2009, 2010 QLogic Corporation.
+ * All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/pci.h>
+#include <linux/delay.h>
+
+#include "qib.h"
+#include "qib_common.h"
+
+/**
+ * qib_format_hwmsg - format a single hwerror message
+ * @msg message buffer
+ * @msgl length of message buffer
+ * @hwmsg message to add to message buffer
+ */
+static void qib_format_hwmsg(char *msg, size_t msgl, const char *hwmsg)
+{
+	strlcat(msg, "[", msgl);
+	strlcat(msg, hwmsg, msgl);
+	strlcat(msg, "]", msgl);
+}
+
+/**
+ * qib_format_hwerrors - format hardware error messages for display
+ * @hwerrs hardware errors bit vector
+ * @hwerrmsgs hardware error descriptions
+ * @nhwerrmsgs number of hwerrmsgs
+ * @msg message buffer
+ * @msgl message buffer length
+ */
+void qib_format_hwerrors(u64 hwerrs, const struct qib_hwerror_msgs *hwerrmsgs,
+			 size_t nhwerrmsgs, char *msg, size_t msgl)
+{
+	int i;
+
+	for (i = 0; i < nhwerrmsgs; i++)
+		if (hwerrs & hwerrmsgs[i].mask)
+			qib_format_hwmsg(msg, msgl, hwerrmsgs[i].msg);
+}
+
+static void signal_ib_event(struct qib_pportdata *ppd, enum ib_event_type ev)
+{
+	struct ib_event event;
+	struct qib_devdata *dd = ppd->dd;
+
+	event.device = &dd->verbs_dev.ibdev;
+	event.element.port_num = ppd->port;
+	event.event = ev;
+	ib_dispatch_event(&event);
+}
+
+void qib_handle_e_ibstatuschanged(struct qib_pportdata *ppd, u64 ibcs)
+{
+	struct qib_devdata *dd = ppd->dd;
+	unsigned long flags;
+	u32 lstate;
+	u8 ltstate;
+	enum ib_event_type ev = 0;
+
+	lstate = dd->f_iblink_state(ibcs); /* linkstate */
+	ltstate = dd->f_ibphys_portstate(ibcs);
+
+	/*
+	 * If linkstate transitions into INIT from any of the various down
+	 * states, or if it transitions from any of the up (INIT or better)
+	 * states into any of the down states (except link recovery), then
+	 * call the chip-specific code to take appropriate actions.
+	 *
+	 * ppd->lflags could be 0 if this is the first time the interrupt
+	 * handlers has been called but the link is already up.
+	 */
+	if (lstate >= IB_PORT_INIT &&
+	    (!ppd->lflags || (ppd->lflags & QIBL_LINKDOWN)) &&
+	    ltstate == IB_PHYSPORTSTATE_LINKUP) {
+		/* transitioned to UP */
+		if (dd->f_ib_updown(ppd, 1, ibcs))
+			goto skip_ibchange; /* chip-code handled */
+	} else if (ppd->lflags & (QIBL_LINKINIT | QIBL_LINKARMED |
+		   QIBL_LINKACTIVE | QIBL_IB_FORCE_NOTIFY)) {
+		if (ltstate != IB_PHYSPORTSTATE_LINKUP &&
+		    ltstate <= IB_PHYSPORTSTATE_CFG_TRAIN &&
+		    dd->f_ib_updown(ppd, 0, ibcs))
+			goto skip_ibchange; /* chip-code handled */
+		qib_set_uevent_bits(ppd, _QIB_EVENT_LINKDOWN_BIT);
+	}
+
+	if (lstate != IB_PORT_DOWN) {
+		/* lstate is INIT, ARMED, or ACTIVE */
+		if (lstate != IB_PORT_ACTIVE) {
+			*ppd->statusp &= ~QIB_STATUS_IB_READY;
+			if (ppd->lflags & QIBL_LINKACTIVE)
+				ev = IB_EVENT_PORT_ERR;
+			spin_lock_irqsave(&ppd->lflags_lock, flags);
+			if (lstate == IB_PORT_ARMED) {
+				ppd->lflags |= QIBL_LINKARMED | QIBL_LINKV;
+				ppd->lflags &= ~(QIBL_LINKINIT |
+					QIBL_LINKDOWN | QIBL_LINKACTIVE);
+			} else {
+				ppd->lflags |= QIBL_LINKINIT | QIBL_LINKV;
+				ppd->lflags &= ~(QIBL_LINKARMED |
+					QIBL_LINKDOWN | QIBL_LINKACTIVE);
+			}
+			spin_unlock_irqrestore(&ppd->lflags_lock, flags);
+			/* start a 75msec timer to clear symbol errors */
+			mod_timer(&ppd->symerr_clear_timer,
+				  msecs_to_jiffies(75));
+		} else if (ltstate == IB_PHYSPORTSTATE_LINKUP &&
+			   !(ppd->lflags & QIBL_LINKACTIVE)) {
+			/* active, but not active defered */
+			qib_hol_up(ppd); /* useful only for 6120 now */
+			*ppd->statusp |=
+				QIB_STATUS_IB_READY | QIB_STATUS_IB_CONF;
+			qib_clear_symerror_on_linkup((unsigned long)ppd);
+			spin_lock_irqsave(&ppd->lflags_lock, flags);
+			ppd->lflags |= QIBL_LINKACTIVE | QIBL_LINKV;
+			ppd->lflags &= ~(QIBL_LINKINIT |
+				QIBL_LINKDOWN | QIBL_LINKARMED);
+			spin_unlock_irqrestore(&ppd->lflags_lock, flags);
+			if (dd->flags & QIB_HAS_SEND_DMA)
+				qib_sdma_process_event(ppd,
+					qib_sdma_event_e30_go_running);
+			ev = IB_EVENT_PORT_ACTIVE;
+			dd->f_setextled(ppd, 1);
+		}
+	} else { /* down */
+		if (ppd->lflags & QIBL_LINKACTIVE)
+			ev = IB_EVENT_PORT_ERR;
+		spin_lock_irqsave(&ppd->lflags_lock, flags);
+		ppd->lflags |= QIBL_LINKDOWN | QIBL_LINKV;
+		ppd->lflags &= ~(QIBL_LINKINIT |
+				 QIBL_LINKACTIVE | QIBL_LINKARMED);
+		spin_unlock_irqrestore(&ppd->lflags_lock, flags);
+		*ppd->statusp &= ~QIB_STATUS_IB_READY;
+	}
+
+skip_ibchange:
+	ppd->lastibcstat = ibcs;
+	if (ev)
+		signal_ib_event(ppd, ev);
+	return;
+}
+
+void qib_clear_symerror_on_linkup(unsigned long opaque)
+{
+	struct qib_pportdata *ppd = (struct qib_pportdata *)opaque;
+
+	if (ppd->lflags & QIBL_LINKACTIVE)
+		return;
+
+	ppd->ibport_data.z_symbol_error_counter =
+		ppd->dd->f_portcntr(ppd, QIBPORTCNTR_IBSYMBOLERR);
+}
+
+/*
+ * Handle receive interrupts for user ctxts; this means a user
+ * process was waiting for a packet to arrive, and didn't want
+ * to poll.
+ */
+void qib_handle_urcv(struct qib_devdata *dd, u64 ctxtr)
+{
+	struct qib_ctxtdata *rcd;
+	unsigned long flags;
+	int i;
+
+	spin_lock_irqsave(&dd->uctxt_lock, flags);
+	for (i = dd->first_user_ctxt; dd->rcd && i < dd->cfgctxts; i++) {
+		if (!(ctxtr & (1ULL << i)))
+			continue;
+		rcd = dd->rcd[i];
+		if (!rcd || !rcd->cnt)
+			continue;
+
+		if (test_and_clear_bit(QIB_CTXT_WAITING_RCV, &rcd->flag)) {
+			wake_up_interruptible(&rcd->wait);
+			dd->f_rcvctrl(rcd->ppd, QIB_RCVCTRL_INTRAVAIL_DIS,
+				      rcd->ctxt);
+		} else if (test_and_clear_bit(QIB_CTXT_WAITING_URG,
+					      &rcd->flag)) {
+			rcd->urgent++;
+			wake_up_interruptible(&rcd->wait);
+		}
+	}
+	spin_unlock_irqrestore(&dd->uctxt_lock, flags);
+}
+
+void qib_bad_intrstatus(struct qib_devdata *dd)
+{
+	static int allbits;
+
+	/* separate routine, for better optimization of qib_intr() */
+
+	/*
+	 * We print the message and disable interrupts, in hope of
+	 * having a better chance of debugging the problem.
+	 */
+	qib_dev_err(dd,
+		"Read of chip interrupt status failed disabling interrupts\n");
+	if (allbits++) {
+		/* disable interrupt delivery, something is very wrong */
+		if (allbits == 2)
+			dd->f_set_intr_state(dd, 0);
+		if (allbits == 3) {
+			qib_dev_err(dd,
+				"2nd bad interrupt status, unregistering interrupts\n");
+			dd->flags |= QIB_BADINTR;
+			dd->flags &= ~QIB_INITTED;
+			dd->f_free_irq(dd);
+		}
+	}
+}
diff --git a/drivers/infiniband/hw/qib/qib_keys.c b/drivers/infiniband/hw/qib/qib_keys.c
new file mode 100644
index 0000000..3b9afcc
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_keys.c
@@ -0,0 +1,387 @@
+/*
+ * Copyright (c) 2006, 2007, 2009 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "qib.h"
+
+/**
+ * qib_alloc_lkey - allocate an lkey
+ * @mr: memory region that this lkey protects
+ * @dma_region: 0->normal key, 1->restricted DMA key
+ *
+ * Returns 0 if successful, otherwise returns -errno.
+ *
+ * Increments mr reference count as required.
+ *
+ * Sets the lkey field mr for non-dma regions.
+ *
+ */
+
+int qib_alloc_lkey(struct qib_mregion *mr, int dma_region)
+{
+	unsigned long flags;
+	u32 r;
+	u32 n;
+	int ret = 0;
+	struct qib_ibdev *dev = to_idev(mr->pd->device);
+	struct qib_lkey_table *rkt = &dev->lk_table;
+
+	spin_lock_irqsave(&rkt->lock, flags);
+
+	/* special case for dma_mr lkey == 0 */
+	if (dma_region) {
+		struct qib_mregion *tmr;
+
+		tmr = rcu_access_pointer(dev->dma_mr);
+		if (!tmr) {
+			qib_get_mr(mr);
+			rcu_assign_pointer(dev->dma_mr, mr);
+			mr->lkey_published = 1;
+		}
+		goto success;
+	}
+
+	/* Find the next available LKEY */
+	r = rkt->next;
+	n = r;
+	for (;;) {
+		if (rkt->table[r] == NULL)
+			break;
+		r = (r + 1) & (rkt->max - 1);
+		if (r == n)
+			goto bail;
+	}
+	rkt->next = (r + 1) & (rkt->max - 1);
+	/*
+	 * Make sure lkey is never zero which is reserved to indicate an
+	 * unrestricted LKEY.
+	 */
+	rkt->gen++;
+	mr->lkey = (r << (32 - ib_qib_lkey_table_size)) |
+		((((1 << (24 - ib_qib_lkey_table_size)) - 1) & rkt->gen)
+		 << 8);
+	if (mr->lkey == 0) {
+		mr->lkey |= 1 << 8;
+		rkt->gen++;
+	}
+	qib_get_mr(mr);
+	rcu_assign_pointer(rkt->table[r], mr);
+	mr->lkey_published = 1;
+success:
+	spin_unlock_irqrestore(&rkt->lock, flags);
+out:
+	return ret;
+bail:
+	spin_unlock_irqrestore(&rkt->lock, flags);
+	ret = -ENOMEM;
+	goto out;
+}
+
+/**
+ * qib_free_lkey - free an lkey
+ * @mr: mr to free from tables
+ */
+void qib_free_lkey(struct qib_mregion *mr)
+{
+	unsigned long flags;
+	u32 lkey = mr->lkey;
+	u32 r;
+	struct qib_ibdev *dev = to_idev(mr->pd->device);
+	struct qib_lkey_table *rkt = &dev->lk_table;
+
+	spin_lock_irqsave(&rkt->lock, flags);
+	if (!mr->lkey_published)
+		goto out;
+	if (lkey == 0)
+		rcu_assign_pointer(dev->dma_mr, NULL);
+	else {
+		r = lkey >> (32 - ib_qib_lkey_table_size);
+		rcu_assign_pointer(rkt->table[r], NULL);
+	}
+	qib_put_mr(mr);
+	mr->lkey_published = 0;
+out:
+	spin_unlock_irqrestore(&rkt->lock, flags);
+}
+
+/**
+ * qib_lkey_ok - check IB SGE for validity and initialize
+ * @rkt: table containing lkey to check SGE against
+ * @pd: protection domain
+ * @isge: outgoing internal SGE
+ * @sge: SGE to check
+ * @acc: access flags
+ *
+ * Return 1 if valid and successful, otherwise returns 0.
+ *
+ * increments the reference count upon success
+ *
+ * Check the IB SGE for validity and initialize our internal version
+ * of it.
+ */
+int qib_lkey_ok(struct qib_lkey_table *rkt, struct qib_pd *pd,
+		struct qib_sge *isge, struct ib_sge *sge, int acc)
+{
+	struct qib_mregion *mr;
+	unsigned n, m;
+	size_t off;
+
+	/*
+	 * We use LKEY == zero for kernel virtual addresses
+	 * (see qib_get_dma_mr and qib_dma.c).
+	 */
+	rcu_read_lock();
+	if (sge->lkey == 0) {
+		struct qib_ibdev *dev = to_idev(pd->ibpd.device);
+
+		if (pd->user)
+			goto bail;
+		mr = rcu_dereference(dev->dma_mr);
+		if (!mr)
+			goto bail;
+		if (unlikely(!atomic_inc_not_zero(&mr->refcount)))
+			goto bail;
+		rcu_read_unlock();
+
+		isge->mr = mr;
+		isge->vaddr = (void *) sge->addr;
+		isge->length = sge->length;
+		isge->sge_length = sge->length;
+		isge->m = 0;
+		isge->n = 0;
+		goto ok;
+	}
+	mr = rcu_dereference(
+		rkt->table[(sge->lkey >> (32 - ib_qib_lkey_table_size))]);
+	if (unlikely(!mr || mr->lkey != sge->lkey || mr->pd != &pd->ibpd))
+		goto bail;
+
+	off = sge->addr - mr->user_base;
+	if (unlikely(sge->addr < mr->user_base ||
+		     off + sge->length > mr->length ||
+		     (mr->access_flags & acc) != acc))
+		goto bail;
+	if (unlikely(!atomic_inc_not_zero(&mr->refcount)))
+		goto bail;
+	rcu_read_unlock();
+
+	off += mr->offset;
+	if (mr->page_shift) {
+		/*
+		page sizes are uniform power of 2 so no loop is necessary
+		entries_spanned_by_off is the number of times the loop below
+		would have executed.
+		*/
+		size_t entries_spanned_by_off;
+
+		entries_spanned_by_off = off >> mr->page_shift;
+		off -= (entries_spanned_by_off << mr->page_shift);
+		m = entries_spanned_by_off/QIB_SEGSZ;
+		n = entries_spanned_by_off%QIB_SEGSZ;
+	} else {
+		m = 0;
+		n = 0;
+		while (off >= mr->map[m]->segs[n].length) {
+			off -= mr->map[m]->segs[n].length;
+			n++;
+			if (n >= QIB_SEGSZ) {
+				m++;
+				n = 0;
+			}
+		}
+	}
+	isge->mr = mr;
+	isge->vaddr = mr->map[m]->segs[n].vaddr + off;
+	isge->length = mr->map[m]->segs[n].length - off;
+	isge->sge_length = sge->length;
+	isge->m = m;
+	isge->n = n;
+ok:
+	return 1;
+bail:
+	rcu_read_unlock();
+	return 0;
+}
+
+/**
+ * qib_rkey_ok - check the IB virtual address, length, and RKEY
+ * @qp: qp for validation
+ * @sge: SGE state
+ * @len: length of data
+ * @vaddr: virtual address to place data
+ * @rkey: rkey to check
+ * @acc: access flags
+ *
+ * Return 1 if successful, otherwise 0.
+ *
+ * increments the reference count upon success
+ */
+int qib_rkey_ok(struct qib_qp *qp, struct qib_sge *sge,
+		u32 len, u64 vaddr, u32 rkey, int acc)
+{
+	struct qib_lkey_table *rkt = &to_idev(qp->ibqp.device)->lk_table;
+	struct qib_mregion *mr;
+	unsigned n, m;
+	size_t off;
+
+	/*
+	 * We use RKEY == zero for kernel virtual addresses
+	 * (see qib_get_dma_mr and qib_dma.c).
+	 */
+	rcu_read_lock();
+	if (rkey == 0) {
+		struct qib_pd *pd = to_ipd(qp->ibqp.pd);
+		struct qib_ibdev *dev = to_idev(pd->ibpd.device);
+
+		if (pd->user)
+			goto bail;
+		mr = rcu_dereference(dev->dma_mr);
+		if (!mr)
+			goto bail;
+		if (unlikely(!atomic_inc_not_zero(&mr->refcount)))
+			goto bail;
+		rcu_read_unlock();
+
+		sge->mr = mr;
+		sge->vaddr = (void *) vaddr;
+		sge->length = len;
+		sge->sge_length = len;
+		sge->m = 0;
+		sge->n = 0;
+		goto ok;
+	}
+
+	mr = rcu_dereference(
+		rkt->table[(rkey >> (32 - ib_qib_lkey_table_size))]);
+	if (unlikely(!mr || mr->lkey != rkey || qp->ibqp.pd != mr->pd))
+		goto bail;
+
+	off = vaddr - mr->iova;
+	if (unlikely(vaddr < mr->iova || off + len > mr->length ||
+		     (mr->access_flags & acc) == 0))
+		goto bail;
+	if (unlikely(!atomic_inc_not_zero(&mr->refcount)))
+		goto bail;
+	rcu_read_unlock();
+
+	off += mr->offset;
+	if (mr->page_shift) {
+		/*
+		page sizes are uniform power of 2 so no loop is necessary
+		entries_spanned_by_off is the number of times the loop below
+		would have executed.
+		*/
+		size_t entries_spanned_by_off;
+
+		entries_spanned_by_off = off >> mr->page_shift;
+		off -= (entries_spanned_by_off << mr->page_shift);
+		m = entries_spanned_by_off/QIB_SEGSZ;
+		n = entries_spanned_by_off%QIB_SEGSZ;
+	} else {
+		m = 0;
+		n = 0;
+		while (off >= mr->map[m]->segs[n].length) {
+			off -= mr->map[m]->segs[n].length;
+			n++;
+			if (n >= QIB_SEGSZ) {
+				m++;
+				n = 0;
+			}
+		}
+	}
+	sge->mr = mr;
+	sge->vaddr = mr->map[m]->segs[n].vaddr + off;
+	sge->length = mr->map[m]->segs[n].length - off;
+	sge->sge_length = len;
+	sge->m = m;
+	sge->n = n;
+ok:
+	return 1;
+bail:
+	rcu_read_unlock();
+	return 0;
+}
+
+/*
+ * Initialize the memory region specified by the work reqeust.
+ */
+int qib_fast_reg_mr(struct qib_qp *qp, struct ib_send_wr *wr)
+{
+	struct qib_lkey_table *rkt = &to_idev(qp->ibqp.device)->lk_table;
+	struct qib_pd *pd = to_ipd(qp->ibqp.pd);
+	struct qib_mregion *mr;
+	u32 rkey = wr->wr.fast_reg.rkey;
+	unsigned i, n, m;
+	int ret = -EINVAL;
+	unsigned long flags;
+	u64 *page_list;
+	size_t ps;
+
+	spin_lock_irqsave(&rkt->lock, flags);
+	if (pd->user || rkey == 0)
+		goto bail;
+
+	mr = rcu_dereference_protected(
+		rkt->table[(rkey >> (32 - ib_qib_lkey_table_size))],
+		lockdep_is_held(&rkt->lock));
+	if (unlikely(mr == NULL || qp->ibqp.pd != mr->pd))
+		goto bail;
+
+	if (wr->wr.fast_reg.page_list_len > mr->max_segs)
+		goto bail;
+
+	ps = 1UL << wr->wr.fast_reg.page_shift;
+	if (wr->wr.fast_reg.length > ps * wr->wr.fast_reg.page_list_len)
+		goto bail;
+
+	mr->user_base = wr->wr.fast_reg.iova_start;
+	mr->iova = wr->wr.fast_reg.iova_start;
+	mr->lkey = rkey;
+	mr->length = wr->wr.fast_reg.length;
+	mr->access_flags = wr->wr.fast_reg.access_flags;
+	page_list = wr->wr.fast_reg.page_list->page_list;
+	m = 0;
+	n = 0;
+	for (i = 0; i < wr->wr.fast_reg.page_list_len; i++) {
+		mr->map[m]->segs[n].vaddr = (void *) page_list[i];
+		mr->map[m]->segs[n].length = ps;
+		if (++n == QIB_SEGSZ) {
+			m++;
+			n = 0;
+		}
+	}
+
+	ret = 0;
+bail:
+	spin_unlock_irqrestore(&rkt->lock, flags);
+	return ret;
+}
diff --git a/drivers/infiniband/hw/qib/qib_mad.c b/drivers/infiniband/hw/qib/qib_mad.c
new file mode 100644
index 0000000..636be11
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_mad.c
@@ -0,0 +1,2533 @@
+/*
+ * Copyright (c) 2012 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <rdma/ib_smi.h>
+
+#include "qib.h"
+#include "qib_mad.h"
+
+static int reply(struct ib_smp *smp)
+{
+	/*
+	 * The verbs framework will handle the directed/LID route
+	 * packet changes.
+	 */
+	smp->method = IB_MGMT_METHOD_GET_RESP;
+	if (smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+		smp->status |= IB_SMP_DIRECTION;
+	return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
+}
+
+static int reply_failure(struct ib_smp *smp)
+{
+	/*
+	 * The verbs framework will handle the directed/LID route
+	 * packet changes.
+	 */
+	smp->method = IB_MGMT_METHOD_GET_RESP;
+	if (smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+		smp->status |= IB_SMP_DIRECTION;
+	return IB_MAD_RESULT_FAILURE | IB_MAD_RESULT_REPLY;
+}
+
+static void qib_send_trap(struct qib_ibport *ibp, void *data, unsigned len)
+{
+	struct ib_mad_send_buf *send_buf;
+	struct ib_mad_agent *agent;
+	struct ib_smp *smp;
+	int ret;
+	unsigned long flags;
+	unsigned long timeout;
+
+	agent = ibp->send_agent;
+	if (!agent)
+		return;
+
+	/* o14-3.2.1 */
+	if (!(ppd_from_ibp(ibp)->lflags & QIBL_LINKACTIVE))
+		return;
+
+	/* o14-2 */
+	if (ibp->trap_timeout && time_before(jiffies, ibp->trap_timeout))
+		return;
+
+	send_buf = ib_create_send_mad(agent, 0, 0, 0, IB_MGMT_MAD_HDR,
+				      IB_MGMT_MAD_DATA, GFP_ATOMIC);
+	if (IS_ERR(send_buf))
+		return;
+
+	smp = send_buf->mad;
+	smp->base_version = IB_MGMT_BASE_VERSION;
+	smp->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED;
+	smp->class_version = 1;
+	smp->method = IB_MGMT_METHOD_TRAP;
+	ibp->tid++;
+	smp->tid = cpu_to_be64(ibp->tid);
+	smp->attr_id = IB_SMP_ATTR_NOTICE;
+	/* o14-1: smp->mkey = 0; */
+	memcpy(smp->data, data, len);
+
+	spin_lock_irqsave(&ibp->lock, flags);
+	if (!ibp->sm_ah) {
+		if (ibp->sm_lid != be16_to_cpu(IB_LID_PERMISSIVE)) {
+			struct ib_ah *ah;
+
+			ah = qib_create_qp0_ah(ibp, ibp->sm_lid);
+			if (IS_ERR(ah))
+				ret = PTR_ERR(ah);
+			else {
+				send_buf->ah = ah;
+				ibp->sm_ah = to_iah(ah);
+				ret = 0;
+			}
+		} else
+			ret = -EINVAL;
+	} else {
+		send_buf->ah = &ibp->sm_ah->ibah;
+		ret = 0;
+	}
+	spin_unlock_irqrestore(&ibp->lock, flags);
+
+	if (!ret)
+		ret = ib_post_send_mad(send_buf, NULL);
+	if (!ret) {
+		/* 4.096 usec. */
+		timeout = (4096 * (1UL << ibp->subnet_timeout)) / 1000;
+		ibp->trap_timeout = jiffies + usecs_to_jiffies(timeout);
+	} else {
+		ib_free_send_mad(send_buf);
+		ibp->trap_timeout = 0;
+	}
+}
+
+/*
+ * Send a bad [PQ]_Key trap (ch. 14.3.8).
+ */
+void qib_bad_pqkey(struct qib_ibport *ibp, __be16 trap_num, u32 key, u32 sl,
+		   u32 qp1, u32 qp2, __be16 lid1, __be16 lid2)
+{
+	struct ib_mad_notice_attr data;
+
+	if (trap_num == IB_NOTICE_TRAP_BAD_PKEY)
+		ibp->pkey_violations++;
+	else
+		ibp->qkey_violations++;
+	ibp->n_pkt_drops++;
+
+	/* Send violation trap */
+	data.generic_type = IB_NOTICE_TYPE_SECURITY;
+	data.prod_type_msb = 0;
+	data.prod_type_lsb = IB_NOTICE_PROD_CA;
+	data.trap_num = trap_num;
+	data.issuer_lid = cpu_to_be16(ppd_from_ibp(ibp)->lid);
+	data.toggle_count = 0;
+	memset(&data.details, 0, sizeof data.details);
+	data.details.ntc_257_258.lid1 = lid1;
+	data.details.ntc_257_258.lid2 = lid2;
+	data.details.ntc_257_258.key = cpu_to_be32(key);
+	data.details.ntc_257_258.sl_qp1 = cpu_to_be32((sl << 28) | qp1);
+	data.details.ntc_257_258.qp2 = cpu_to_be32(qp2);
+
+	qib_send_trap(ibp, &data, sizeof data);
+}
+
+/*
+ * Send a bad M_Key trap (ch. 14.3.9).
+ */
+static void qib_bad_mkey(struct qib_ibport *ibp, struct ib_smp *smp)
+{
+	struct ib_mad_notice_attr data;
+
+	/* Send violation trap */
+	data.generic_type = IB_NOTICE_TYPE_SECURITY;
+	data.prod_type_msb = 0;
+	data.prod_type_lsb = IB_NOTICE_PROD_CA;
+	data.trap_num = IB_NOTICE_TRAP_BAD_MKEY;
+	data.issuer_lid = cpu_to_be16(ppd_from_ibp(ibp)->lid);
+	data.toggle_count = 0;
+	memset(&data.details, 0, sizeof data.details);
+	data.details.ntc_256.lid = data.issuer_lid;
+	data.details.ntc_256.method = smp->method;
+	data.details.ntc_256.attr_id = smp->attr_id;
+	data.details.ntc_256.attr_mod = smp->attr_mod;
+	data.details.ntc_256.mkey = smp->mkey;
+	if (smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
+		u8 hop_cnt;
+
+		data.details.ntc_256.dr_slid = smp->dr_slid;
+		data.details.ntc_256.dr_trunc_hop = IB_NOTICE_TRAP_DR_NOTICE;
+		hop_cnt = smp->hop_cnt;
+		if (hop_cnt > ARRAY_SIZE(data.details.ntc_256.dr_rtn_path)) {
+			data.details.ntc_256.dr_trunc_hop |=
+				IB_NOTICE_TRAP_DR_TRUNC;
+			hop_cnt = ARRAY_SIZE(data.details.ntc_256.dr_rtn_path);
+		}
+		data.details.ntc_256.dr_trunc_hop |= hop_cnt;
+		memcpy(data.details.ntc_256.dr_rtn_path, smp->return_path,
+		       hop_cnt);
+	}
+
+	qib_send_trap(ibp, &data, sizeof data);
+}
+
+/*
+ * Send a Port Capability Mask Changed trap (ch. 14.3.11).
+ */
+void qib_cap_mask_chg(struct qib_ibport *ibp)
+{
+	struct ib_mad_notice_attr data;
+
+	data.generic_type = IB_NOTICE_TYPE_INFO;
+	data.prod_type_msb = 0;
+	data.prod_type_lsb = IB_NOTICE_PROD_CA;
+	data.trap_num = IB_NOTICE_TRAP_CAP_MASK_CHG;
+	data.issuer_lid = cpu_to_be16(ppd_from_ibp(ibp)->lid);
+	data.toggle_count = 0;
+	memset(&data.details, 0, sizeof data.details);
+	data.details.ntc_144.lid = data.issuer_lid;
+	data.details.ntc_144.new_cap_mask = cpu_to_be32(ibp->port_cap_flags);
+
+	qib_send_trap(ibp, &data, sizeof data);
+}
+
+/*
+ * Send a System Image GUID Changed trap (ch. 14.3.12).
+ */
+void qib_sys_guid_chg(struct qib_ibport *ibp)
+{
+	struct ib_mad_notice_attr data;
+
+	data.generic_type = IB_NOTICE_TYPE_INFO;
+	data.prod_type_msb = 0;
+	data.prod_type_lsb = IB_NOTICE_PROD_CA;
+	data.trap_num = IB_NOTICE_TRAP_SYS_GUID_CHG;
+	data.issuer_lid = cpu_to_be16(ppd_from_ibp(ibp)->lid);
+	data.toggle_count = 0;
+	memset(&data.details, 0, sizeof data.details);
+	data.details.ntc_145.lid = data.issuer_lid;
+	data.details.ntc_145.new_sys_guid = ib_qib_sys_image_guid;
+
+	qib_send_trap(ibp, &data, sizeof data);
+}
+
+/*
+ * Send a Node Description Changed trap (ch. 14.3.13).
+ */
+void qib_node_desc_chg(struct qib_ibport *ibp)
+{
+	struct ib_mad_notice_attr data;
+
+	data.generic_type = IB_NOTICE_TYPE_INFO;
+	data.prod_type_msb = 0;
+	data.prod_type_lsb = IB_NOTICE_PROD_CA;
+	data.trap_num = IB_NOTICE_TRAP_CAP_MASK_CHG;
+	data.issuer_lid = cpu_to_be16(ppd_from_ibp(ibp)->lid);
+	data.toggle_count = 0;
+	memset(&data.details, 0, sizeof data.details);
+	data.details.ntc_144.lid = data.issuer_lid;
+	data.details.ntc_144.local_changes = 1;
+	data.details.ntc_144.change_flags = IB_NOTICE_TRAP_NODE_DESC_CHG;
+
+	qib_send_trap(ibp, &data, sizeof data);
+}
+
+static int subn_get_nodedescription(struct ib_smp *smp,
+				    struct ib_device *ibdev)
+{
+	if (smp->attr_mod)
+		smp->status |= IB_SMP_INVALID_FIELD;
+
+	memcpy(smp->data, ibdev->node_desc, sizeof(smp->data));
+
+	return reply(smp);
+}
+
+static int subn_get_nodeinfo(struct ib_smp *smp, struct ib_device *ibdev,
+			     u8 port)
+{
+	struct ib_node_info *nip = (struct ib_node_info *)&smp->data;
+	struct qib_devdata *dd = dd_from_ibdev(ibdev);
+	u32 vendor, majrev, minrev;
+	unsigned pidx = port - 1; /* IB number port from 1, hdw from 0 */
+
+	/* GUID 0 is illegal */
+	if (smp->attr_mod || pidx >= dd->num_pports ||
+	    dd->pport[pidx].guid == 0)
+		smp->status |= IB_SMP_INVALID_FIELD;
+	else
+		nip->port_guid = dd->pport[pidx].guid;
+
+	nip->base_version = 1;
+	nip->class_version = 1;
+	nip->node_type = 1;     /* channel adapter */
+	nip->num_ports = ibdev->phys_port_cnt;
+	/* This is already in network order */
+	nip->sys_guid = ib_qib_sys_image_guid;
+	nip->node_guid = dd->pport->guid; /* Use first-port GUID as node */
+	nip->partition_cap = cpu_to_be16(qib_get_npkeys(dd));
+	nip->device_id = cpu_to_be16(dd->deviceid);
+	majrev = dd->majrev;
+	minrev = dd->minrev;
+	nip->revision = cpu_to_be32((majrev << 16) | minrev);
+	nip->local_port_num = port;
+	vendor = dd->vendorid;
+	nip->vendor_id[0] = QIB_SRC_OUI_1;
+	nip->vendor_id[1] = QIB_SRC_OUI_2;
+	nip->vendor_id[2] = QIB_SRC_OUI_3;
+
+	return reply(smp);
+}
+
+static int subn_get_guidinfo(struct ib_smp *smp, struct ib_device *ibdev,
+			     u8 port)
+{
+	struct qib_devdata *dd = dd_from_ibdev(ibdev);
+	u32 startgx = 8 * be32_to_cpu(smp->attr_mod);
+	__be64 *p = (__be64 *) smp->data;
+	unsigned pidx = port - 1; /* IB number port from 1, hdw from 0 */
+
+	/* 32 blocks of 8 64-bit GUIDs per block */
+
+	memset(smp->data, 0, sizeof(smp->data));
+
+	if (startgx == 0 && pidx < dd->num_pports) {
+		struct qib_pportdata *ppd = dd->pport + pidx;
+		struct qib_ibport *ibp = &ppd->ibport_data;
+		__be64 g = ppd->guid;
+		unsigned i;
+
+		/* GUID 0 is illegal */
+		if (g == 0)
+			smp->status |= IB_SMP_INVALID_FIELD;
+		else {
+			/* The first is a copy of the read-only HW GUID. */
+			p[0] = g;
+			for (i = 1; i < QIB_GUIDS_PER_PORT; i++)
+				p[i] = ibp->guids[i - 1];
+		}
+	} else
+		smp->status |= IB_SMP_INVALID_FIELD;
+
+	return reply(smp);
+}
+
+static void set_link_width_enabled(struct qib_pportdata *ppd, u32 w)
+{
+	(void) ppd->dd->f_set_ib_cfg(ppd, QIB_IB_CFG_LWID_ENB, w);
+}
+
+static void set_link_speed_enabled(struct qib_pportdata *ppd, u32 s)
+{
+	(void) ppd->dd->f_set_ib_cfg(ppd, QIB_IB_CFG_SPD_ENB, s);
+}
+
+static int get_overrunthreshold(struct qib_pportdata *ppd)
+{
+	return ppd->dd->f_get_ib_cfg(ppd, QIB_IB_CFG_OVERRUN_THRESH);
+}
+
+/**
+ * set_overrunthreshold - set the overrun threshold
+ * @ppd: the physical port data
+ * @n: the new threshold
+ *
+ * Note that this will only take effect when the link state changes.
+ */
+static int set_overrunthreshold(struct qib_pportdata *ppd, unsigned n)
+{
+	(void) ppd->dd->f_set_ib_cfg(ppd, QIB_IB_CFG_OVERRUN_THRESH,
+					 (u32)n);
+	return 0;
+}
+
+static int get_phyerrthreshold(struct qib_pportdata *ppd)
+{
+	return ppd->dd->f_get_ib_cfg(ppd, QIB_IB_CFG_PHYERR_THRESH);
+}
+
+/**
+ * set_phyerrthreshold - set the physical error threshold
+ * @ppd: the physical port data
+ * @n: the new threshold
+ *
+ * Note that this will only take effect when the link state changes.
+ */
+static int set_phyerrthreshold(struct qib_pportdata *ppd, unsigned n)
+{
+	(void) ppd->dd->f_set_ib_cfg(ppd, QIB_IB_CFG_PHYERR_THRESH,
+					 (u32)n);
+	return 0;
+}
+
+/**
+ * get_linkdowndefaultstate - get the default linkdown state
+ * @ppd: the physical port data
+ *
+ * Returns zero if the default is POLL, 1 if the default is SLEEP.
+ */
+static int get_linkdowndefaultstate(struct qib_pportdata *ppd)
+{
+	return ppd->dd->f_get_ib_cfg(ppd, QIB_IB_CFG_LINKDEFAULT) ==
+		IB_LINKINITCMD_SLEEP;
+}
+
+static int check_mkey(struct qib_ibport *ibp, struct ib_smp *smp, int mad_flags)
+{
+	int valid_mkey = 0;
+	int ret = 0;
+
+	/* Is the mkey in the process of expiring? */
+	if (ibp->mkey_lease_timeout &&
+	    time_after_eq(jiffies, ibp->mkey_lease_timeout)) {
+		/* Clear timeout and mkey protection field. */
+		ibp->mkey_lease_timeout = 0;
+		ibp->mkeyprot = 0;
+	}
+
+	if ((mad_flags & IB_MAD_IGNORE_MKEY) ||  ibp->mkey == 0 ||
+	    ibp->mkey == smp->mkey)
+		valid_mkey = 1;
+
+	/* Unset lease timeout on any valid Get/Set/TrapRepress */
+	if (valid_mkey && ibp->mkey_lease_timeout &&
+	    (smp->method == IB_MGMT_METHOD_GET ||
+	     smp->method == IB_MGMT_METHOD_SET ||
+	     smp->method == IB_MGMT_METHOD_TRAP_REPRESS))
+		ibp->mkey_lease_timeout = 0;
+
+	if (!valid_mkey) {
+		switch (smp->method) {
+		case IB_MGMT_METHOD_GET:
+			/* Bad mkey not a violation below level 2 */
+			if (ibp->mkeyprot < 2)
+				break;
+		case IB_MGMT_METHOD_SET:
+		case IB_MGMT_METHOD_TRAP_REPRESS:
+			if (ibp->mkey_violations != 0xFFFF)
+				++ibp->mkey_violations;
+			if (!ibp->mkey_lease_timeout && ibp->mkey_lease_period)
+				ibp->mkey_lease_timeout = jiffies +
+					ibp->mkey_lease_period * HZ;
+			/* Generate a trap notice. */
+			qib_bad_mkey(ibp, smp);
+			ret = 1;
+		}
+	}
+
+	return ret;
+}
+
+static int subn_get_portinfo(struct ib_smp *smp, struct ib_device *ibdev,
+			     u8 port)
+{
+	struct qib_devdata *dd;
+	struct qib_pportdata *ppd;
+	struct qib_ibport *ibp;
+	struct ib_port_info *pip = (struct ib_port_info *)smp->data;
+	u8 mtu;
+	int ret;
+	u32 state;
+	u32 port_num = be32_to_cpu(smp->attr_mod);
+
+	if (port_num == 0)
+		port_num = port;
+	else {
+		if (port_num > ibdev->phys_port_cnt) {
+			smp->status |= IB_SMP_INVALID_FIELD;
+			ret = reply(smp);
+			goto bail;
+		}
+		if (port_num != port) {
+			ibp = to_iport(ibdev, port_num);
+			ret = check_mkey(ibp, smp, 0);
+			if (ret) {
+				ret = IB_MAD_RESULT_FAILURE;
+				goto bail;
+			}
+		}
+	}
+
+	dd = dd_from_ibdev(ibdev);
+	/* IB numbers ports from 1, hdw from 0 */
+	ppd = dd->pport + (port_num - 1);
+	ibp = &ppd->ibport_data;
+
+	/* Clear all fields.  Only set the non-zero fields. */
+	memset(smp->data, 0, sizeof(smp->data));
+
+	/* Only return the mkey if the protection field allows it. */
+	if (!(smp->method == IB_MGMT_METHOD_GET &&
+	      ibp->mkey != smp->mkey &&
+	      ibp->mkeyprot == 1))
+		pip->mkey = ibp->mkey;
+	pip->gid_prefix = ibp->gid_prefix;
+	pip->lid = cpu_to_be16(ppd->lid);
+	pip->sm_lid = cpu_to_be16(ibp->sm_lid);
+	pip->cap_mask = cpu_to_be32(ibp->port_cap_flags);
+	/* pip->diag_code; */
+	pip->mkey_lease_period = cpu_to_be16(ibp->mkey_lease_period);
+	pip->local_port_num = port;
+	pip->link_width_enabled = ppd->link_width_enabled;
+	pip->link_width_supported = ppd->link_width_supported;
+	pip->link_width_active = ppd->link_width_active;
+	state = dd->f_iblink_state(ppd->lastibcstat);
+	pip->linkspeed_portstate = ppd->link_speed_supported << 4 | state;
+
+	pip->portphysstate_linkdown =
+		(dd->f_ibphys_portstate(ppd->lastibcstat) << 4) |
+		(get_linkdowndefaultstate(ppd) ? 1 : 2);
+	pip->mkeyprot_resv_lmc = (ibp->mkeyprot << 6) | ppd->lmc;
+	pip->linkspeedactive_enabled = (ppd->link_speed_active << 4) |
+		ppd->link_speed_enabled;
+	switch (ppd->ibmtu) {
+	default: /* something is wrong; fall through */
+	case 4096:
+		mtu = IB_MTU_4096;
+		break;
+	case 2048:
+		mtu = IB_MTU_2048;
+		break;
+	case 1024:
+		mtu = IB_MTU_1024;
+		break;
+	case 512:
+		mtu = IB_MTU_512;
+		break;
+	case 256:
+		mtu = IB_MTU_256;
+		break;
+	}
+	pip->neighbormtu_mastersmsl = (mtu << 4) | ibp->sm_sl;
+	pip->vlcap_inittype = ppd->vls_supported << 4;  /* InitType = 0 */
+	pip->vl_high_limit = ibp->vl_high_limit;
+	pip->vl_arb_high_cap =
+		dd->f_get_ib_cfg(ppd, QIB_IB_CFG_VL_HIGH_CAP);
+	pip->vl_arb_low_cap =
+		dd->f_get_ib_cfg(ppd, QIB_IB_CFG_VL_LOW_CAP);
+	/* InitTypeReply = 0 */
+	pip->inittypereply_mtucap = qib_ibmtu ? qib_ibmtu : IB_MTU_4096;
+	/* HCAs ignore VLStallCount and HOQLife */
+	/* pip->vlstallcnt_hoqlife; */
+	pip->operationalvl_pei_peo_fpi_fpo =
+		dd->f_get_ib_cfg(ppd, QIB_IB_CFG_OP_VLS) << 4;
+	pip->mkey_violations = cpu_to_be16(ibp->mkey_violations);
+	/* P_KeyViolations are counted by hardware. */
+	pip->pkey_violations = cpu_to_be16(ibp->pkey_violations);
+	pip->qkey_violations = cpu_to_be16(ibp->qkey_violations);
+	/* Only the hardware GUID is supported for now */
+	pip->guid_cap = QIB_GUIDS_PER_PORT;
+	pip->clientrereg_resv_subnetto = ibp->subnet_timeout;
+	/* 32.768 usec. response time (guessing) */
+	pip->resv_resptimevalue = 3;
+	pip->localphyerrors_overrunerrors =
+		(get_phyerrthreshold(ppd) << 4) |
+		get_overrunthreshold(ppd);
+	/* pip->max_credit_hint; */
+	if (ibp->port_cap_flags & IB_PORT_LINK_LATENCY_SUP) {
+		u32 v;
+
+		v = dd->f_get_ib_cfg(ppd, QIB_IB_CFG_LINKLATENCY);
+		pip->link_roundtrip_latency[0] = v >> 16;
+		pip->link_roundtrip_latency[1] = v >> 8;
+		pip->link_roundtrip_latency[2] = v;
+	}
+
+	ret = reply(smp);
+
+bail:
+	return ret;
+}
+
+/**
+ * get_pkeys - return the PKEY table
+ * @dd: the qlogic_ib device
+ * @port: the IB port number
+ * @pkeys: the pkey table is placed here
+ */
+static int get_pkeys(struct qib_devdata *dd, u8 port, u16 *pkeys)
+{
+	struct qib_pportdata *ppd = dd->pport + port - 1;
+	/*
+	 * always a kernel context, no locking needed.
+	 * If we get here with ppd setup, no need to check
+	 * that pd is valid.
+	 */
+	struct qib_ctxtdata *rcd = dd->rcd[ppd->hw_pidx];
+
+	memcpy(pkeys, rcd->pkeys, sizeof(rcd->pkeys));
+
+	return 0;
+}
+
+static int subn_get_pkeytable(struct ib_smp *smp, struct ib_device *ibdev,
+			      u8 port)
+{
+	u32 startpx = 32 * (be32_to_cpu(smp->attr_mod) & 0xffff);
+	u16 *p = (u16 *) smp->data;
+	__be16 *q = (__be16 *) smp->data;
+
+	/* 64 blocks of 32 16-bit P_Key entries */
+
+	memset(smp->data, 0, sizeof(smp->data));
+	if (startpx == 0) {
+		struct qib_devdata *dd = dd_from_ibdev(ibdev);
+		unsigned i, n = qib_get_npkeys(dd);
+
+		get_pkeys(dd, port, p);
+
+		for (i = 0; i < n; i++)
+			q[i] = cpu_to_be16(p[i]);
+	} else
+		smp->status |= IB_SMP_INVALID_FIELD;
+
+	return reply(smp);
+}
+
+static int subn_set_guidinfo(struct ib_smp *smp, struct ib_device *ibdev,
+			     u8 port)
+{
+	struct qib_devdata *dd = dd_from_ibdev(ibdev);
+	u32 startgx = 8 * be32_to_cpu(smp->attr_mod);
+	__be64 *p = (__be64 *) smp->data;
+	unsigned pidx = port - 1; /* IB number port from 1, hdw from 0 */
+
+	/* 32 blocks of 8 64-bit GUIDs per block */
+
+	if (startgx == 0 && pidx < dd->num_pports) {
+		struct qib_pportdata *ppd = dd->pport + pidx;
+		struct qib_ibport *ibp = &ppd->ibport_data;
+		unsigned i;
+
+		/* The first entry is read-only. */
+		for (i = 1; i < QIB_GUIDS_PER_PORT; i++)
+			ibp->guids[i - 1] = p[i];
+	} else
+		smp->status |= IB_SMP_INVALID_FIELD;
+
+	/* The only GUID we support is the first read-only entry. */
+	return subn_get_guidinfo(smp, ibdev, port);
+}
+
+/**
+ * subn_set_portinfo - set port information
+ * @smp: the incoming SM packet
+ * @ibdev: the infiniband device
+ * @port: the port on the device
+ *
+ * Set Portinfo (see ch. 14.2.5.6).
+ */
+static int subn_set_portinfo(struct ib_smp *smp, struct ib_device *ibdev,
+			     u8 port)
+{
+	struct ib_port_info *pip = (struct ib_port_info *)smp->data;
+	struct ib_event event;
+	struct qib_devdata *dd;
+	struct qib_pportdata *ppd;
+	struct qib_ibport *ibp;
+	u8 clientrereg = (pip->clientrereg_resv_subnetto & 0x80);
+	unsigned long flags;
+	u16 lid, smlid;
+	u8 lwe;
+	u8 lse;
+	u8 state;
+	u8 vls;
+	u8 msl;
+	u16 lstate;
+	int ret, ore, mtu;
+	u32 port_num = be32_to_cpu(smp->attr_mod);
+
+	if (port_num == 0)
+		port_num = port;
+	else {
+		if (port_num > ibdev->phys_port_cnt)
+			goto err;
+		/* Port attributes can only be set on the receiving port */
+		if (port_num != port)
+			goto get_only;
+	}
+
+	dd = dd_from_ibdev(ibdev);
+	/* IB numbers ports from 1, hdw from 0 */
+	ppd = dd->pport + (port_num - 1);
+	ibp = &ppd->ibport_data;
+	event.device = ibdev;
+	event.element.port_num = port;
+
+	ibp->mkey = pip->mkey;
+	ibp->gid_prefix = pip->gid_prefix;
+	ibp->mkey_lease_period = be16_to_cpu(pip->mkey_lease_period);
+
+	lid = be16_to_cpu(pip->lid);
+	/* Must be a valid unicast LID address. */
+	if (lid == 0 || lid >= QIB_MULTICAST_LID_BASE)
+		smp->status |= IB_SMP_INVALID_FIELD;
+	else if (ppd->lid != lid || ppd->lmc != (pip->mkeyprot_resv_lmc & 7)) {
+		if (ppd->lid != lid)
+			qib_set_uevent_bits(ppd, _QIB_EVENT_LID_CHANGE_BIT);
+		if (ppd->lmc != (pip->mkeyprot_resv_lmc & 7))
+			qib_set_uevent_bits(ppd, _QIB_EVENT_LMC_CHANGE_BIT);
+		qib_set_lid(ppd, lid, pip->mkeyprot_resv_lmc & 7);
+		event.event = IB_EVENT_LID_CHANGE;
+		ib_dispatch_event(&event);
+	}
+
+	smlid = be16_to_cpu(pip->sm_lid);
+	msl = pip->neighbormtu_mastersmsl & 0xF;
+	/* Must be a valid unicast LID address. */
+	if (smlid == 0 || smlid >= QIB_MULTICAST_LID_BASE)
+		smp->status |= IB_SMP_INVALID_FIELD;
+	else if (smlid != ibp->sm_lid || msl != ibp->sm_sl) {
+		spin_lock_irqsave(&ibp->lock, flags);
+		if (ibp->sm_ah) {
+			if (smlid != ibp->sm_lid)
+				ibp->sm_ah->attr.dlid = smlid;
+			if (msl != ibp->sm_sl)
+				ibp->sm_ah->attr.sl = msl;
+		}
+		spin_unlock_irqrestore(&ibp->lock, flags);
+		if (smlid != ibp->sm_lid)
+			ibp->sm_lid = smlid;
+		if (msl != ibp->sm_sl)
+			ibp->sm_sl = msl;
+		event.event = IB_EVENT_SM_CHANGE;
+		ib_dispatch_event(&event);
+	}
+
+	/* Allow 1x or 4x to be set (see 14.2.6.6). */
+	lwe = pip->link_width_enabled;
+	if (lwe) {
+		if (lwe == 0xFF)
+			set_link_width_enabled(ppd, ppd->link_width_supported);
+		else if (lwe >= 16 || (lwe & ~ppd->link_width_supported))
+			smp->status |= IB_SMP_INVALID_FIELD;
+		else if (lwe != ppd->link_width_enabled)
+			set_link_width_enabled(ppd, lwe);
+	}
+
+	lse = pip->linkspeedactive_enabled & 0xF;
+	if (lse) {
+		/*
+		 * The IB 1.2 spec. only allows link speed values
+		 * 1, 3, 5, 7, 15.  1.2.1 extended to allow specific
+		 * speeds.
+		 */
+		if (lse == 15)
+			set_link_speed_enabled(ppd,
+					       ppd->link_speed_supported);
+		else if (lse >= 8 || (lse & ~ppd->link_speed_supported))
+			smp->status |= IB_SMP_INVALID_FIELD;
+		else if (lse != ppd->link_speed_enabled)
+			set_link_speed_enabled(ppd, lse);
+	}
+
+	/* Set link down default state. */
+	switch (pip->portphysstate_linkdown & 0xF) {
+	case 0: /* NOP */
+		break;
+	case 1: /* SLEEP */
+		(void) dd->f_set_ib_cfg(ppd, QIB_IB_CFG_LINKDEFAULT,
+					IB_LINKINITCMD_SLEEP);
+		break;
+	case 2: /* POLL */
+		(void) dd->f_set_ib_cfg(ppd, QIB_IB_CFG_LINKDEFAULT,
+					IB_LINKINITCMD_POLL);
+		break;
+	default:
+		smp->status |= IB_SMP_INVALID_FIELD;
+	}
+
+	ibp->mkeyprot = pip->mkeyprot_resv_lmc >> 6;
+	ibp->vl_high_limit = pip->vl_high_limit;
+	(void) dd->f_set_ib_cfg(ppd, QIB_IB_CFG_VL_HIGH_LIMIT,
+				    ibp->vl_high_limit);
+
+	mtu = ib_mtu_enum_to_int((pip->neighbormtu_mastersmsl >> 4) & 0xF);
+	if (mtu == -1)
+		smp->status |= IB_SMP_INVALID_FIELD;
+	else
+		qib_set_mtu(ppd, mtu);
+
+	/* Set operational VLs */
+	vls = (pip->operationalvl_pei_peo_fpi_fpo >> 4) & 0xF;
+	if (vls) {
+		if (vls > ppd->vls_supported)
+			smp->status |= IB_SMP_INVALID_FIELD;
+		else
+			(void) dd->f_set_ib_cfg(ppd, QIB_IB_CFG_OP_VLS, vls);
+	}
+
+	if (pip->mkey_violations == 0)
+		ibp->mkey_violations = 0;
+
+	if (pip->pkey_violations == 0)
+		ibp->pkey_violations = 0;
+
+	if (pip->qkey_violations == 0)
+		ibp->qkey_violations = 0;
+
+	ore = pip->localphyerrors_overrunerrors;
+	if (set_phyerrthreshold(ppd, (ore >> 4) & 0xF))
+		smp->status |= IB_SMP_INVALID_FIELD;
+
+	if (set_overrunthreshold(ppd, (ore & 0xF)))
+		smp->status |= IB_SMP_INVALID_FIELD;
+
+	ibp->subnet_timeout = pip->clientrereg_resv_subnetto & 0x1F;
+
+	/*
+	 * Do the port state change now that the other link parameters
+	 * have been set.
+	 * Changing the port physical state only makes sense if the link
+	 * is down or is being set to down.
+	 */
+	state = pip->linkspeed_portstate & 0xF;
+	lstate = (pip->portphysstate_linkdown >> 4) & 0xF;
+	if (lstate && !(state == IB_PORT_DOWN || state == IB_PORT_NOP))
+		smp->status |= IB_SMP_INVALID_FIELD;
+
+	/*
+	 * Only state changes of DOWN, ARM, and ACTIVE are valid
+	 * and must be in the correct state to take effect (see 7.2.6).
+	 */
+	switch (state) {
+	case IB_PORT_NOP:
+		if (lstate == 0)
+			break;
+		/* FALLTHROUGH */
+	case IB_PORT_DOWN:
+		if (lstate == 0)
+			lstate = QIB_IB_LINKDOWN_ONLY;
+		else if (lstate == 1)
+			lstate = QIB_IB_LINKDOWN_SLEEP;
+		else if (lstate == 2)
+			lstate = QIB_IB_LINKDOWN;
+		else if (lstate == 3)
+			lstate = QIB_IB_LINKDOWN_DISABLE;
+		else {
+			smp->status |= IB_SMP_INVALID_FIELD;
+			break;
+		}
+		spin_lock_irqsave(&ppd->lflags_lock, flags);
+		ppd->lflags &= ~QIBL_LINKV;
+		spin_unlock_irqrestore(&ppd->lflags_lock, flags);
+		qib_set_linkstate(ppd, lstate);
+		/*
+		 * Don't send a reply if the response would be sent
+		 * through the disabled port.
+		 */
+		if (lstate == QIB_IB_LINKDOWN_DISABLE && smp->hop_cnt) {
+			ret = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+			goto done;
+		}
+		qib_wait_linkstate(ppd, QIBL_LINKV, 10);
+		break;
+	case IB_PORT_ARMED:
+		qib_set_linkstate(ppd, QIB_IB_LINKARM);
+		break;
+	case IB_PORT_ACTIVE:
+		qib_set_linkstate(ppd, QIB_IB_LINKACTIVE);
+		break;
+	default:
+		smp->status |= IB_SMP_INVALID_FIELD;
+	}
+
+	if (clientrereg) {
+		event.event = IB_EVENT_CLIENT_REREGISTER;
+		ib_dispatch_event(&event);
+	}
+
+	ret = subn_get_portinfo(smp, ibdev, port);
+
+	/* restore re-reg bit per o14-12.2.1 */
+	pip->clientrereg_resv_subnetto |= clientrereg;
+
+	goto get_only;
+
+err:
+	smp->status |= IB_SMP_INVALID_FIELD;
+get_only:
+	ret = subn_get_portinfo(smp, ibdev, port);
+done:
+	return ret;
+}
+
+/**
+ * rm_pkey - decrecment the reference count for the given PKEY
+ * @dd: the qlogic_ib device
+ * @key: the PKEY index
+ *
+ * Return true if this was the last reference and the hardware table entry
+ * needs to be changed.
+ */
+static int rm_pkey(struct qib_pportdata *ppd, u16 key)
+{
+	int i;
+	int ret;
+
+	for (i = 0; i < ARRAY_SIZE(ppd->pkeys); i++) {
+		if (ppd->pkeys[i] != key)
+			continue;
+		if (atomic_dec_and_test(&ppd->pkeyrefs[i])) {
+			ppd->pkeys[i] = 0;
+			ret = 1;
+			goto bail;
+		}
+		break;
+	}
+
+	ret = 0;
+
+bail:
+	return ret;
+}
+
+/**
+ * add_pkey - add the given PKEY to the hardware table
+ * @dd: the qlogic_ib device
+ * @key: the PKEY
+ *
+ * Return an error code if unable to add the entry, zero if no change,
+ * or 1 if the hardware PKEY register needs to be updated.
+ */
+static int add_pkey(struct qib_pportdata *ppd, u16 key)
+{
+	int i;
+	u16 lkey = key & 0x7FFF;
+	int any = 0;
+	int ret;
+
+	if (lkey == 0x7FFF) {
+		ret = 0;
+		goto bail;
+	}
+
+	/* Look for an empty slot or a matching PKEY. */
+	for (i = 0; i < ARRAY_SIZE(ppd->pkeys); i++) {
+		if (!ppd->pkeys[i]) {
+			any++;
+			continue;
+		}
+		/* If it matches exactly, try to increment the ref count */
+		if (ppd->pkeys[i] == key) {
+			if (atomic_inc_return(&ppd->pkeyrefs[i]) > 1) {
+				ret = 0;
+				goto bail;
+			}
+			/* Lost the race. Look for an empty slot below. */
+			atomic_dec(&ppd->pkeyrefs[i]);
+			any++;
+		}
+		/*
+		 * It makes no sense to have both the limited and unlimited
+		 * PKEY set at the same time since the unlimited one will
+		 * disable the limited one.
+		 */
+		if ((ppd->pkeys[i] & 0x7FFF) == lkey) {
+			ret = -EEXIST;
+			goto bail;
+		}
+	}
+	if (!any) {
+		ret = -EBUSY;
+		goto bail;
+	}
+	for (i = 0; i < ARRAY_SIZE(ppd->pkeys); i++) {
+		if (!ppd->pkeys[i] &&
+		    atomic_inc_return(&ppd->pkeyrefs[i]) == 1) {
+			/* for qibstats, etc. */
+			ppd->pkeys[i] = key;
+			ret = 1;
+			goto bail;
+		}
+	}
+	ret = -EBUSY;
+
+bail:
+	return ret;
+}
+
+/**
+ * set_pkeys - set the PKEY table for ctxt 0
+ * @dd: the qlogic_ib device
+ * @port: the IB port number
+ * @pkeys: the PKEY table
+ */
+static int set_pkeys(struct qib_devdata *dd, u8 port, u16 *pkeys)
+{
+	struct qib_pportdata *ppd;
+	struct qib_ctxtdata *rcd;
+	int i;
+	int changed = 0;
+
+	/*
+	 * IB port one/two always maps to context zero/one,
+	 * always a kernel context, no locking needed
+	 * If we get here with ppd setup, no need to check
+	 * that rcd is valid.
+	 */
+	ppd = dd->pport + (port - 1);
+	rcd = dd->rcd[ppd->hw_pidx];
+
+	for (i = 0; i < ARRAY_SIZE(rcd->pkeys); i++) {
+		u16 key = pkeys[i];
+		u16 okey = rcd->pkeys[i];
+
+		if (key == okey)
+			continue;
+		/*
+		 * The value of this PKEY table entry is changing.
+		 * Remove the old entry in the hardware's array of PKEYs.
+		 */
+		if (okey & 0x7FFF)
+			changed |= rm_pkey(ppd, okey);
+		if (key & 0x7FFF) {
+			int ret = add_pkey(ppd, key);
+
+			if (ret < 0)
+				key = 0;
+			else
+				changed |= ret;
+		}
+		rcd->pkeys[i] = key;
+	}
+	if (changed) {
+		struct ib_event event;
+
+		(void) dd->f_set_ib_cfg(ppd, QIB_IB_CFG_PKEYS, 0);
+
+		event.event = IB_EVENT_PKEY_CHANGE;
+		event.device = &dd->verbs_dev.ibdev;
+		event.element.port_num = port;
+		ib_dispatch_event(&event);
+	}
+	return 0;
+}
+
+static int subn_set_pkeytable(struct ib_smp *smp, struct ib_device *ibdev,
+			      u8 port)
+{
+	u32 startpx = 32 * (be32_to_cpu(smp->attr_mod) & 0xffff);
+	__be16 *p = (__be16 *) smp->data;
+	u16 *q = (u16 *) smp->data;
+	struct qib_devdata *dd = dd_from_ibdev(ibdev);
+	unsigned i, n = qib_get_npkeys(dd);
+
+	for (i = 0; i < n; i++)
+		q[i] = be16_to_cpu(p[i]);
+
+	if (startpx != 0 || set_pkeys(dd, port, q) != 0)
+		smp->status |= IB_SMP_INVALID_FIELD;
+
+	return subn_get_pkeytable(smp, ibdev, port);
+}
+
+static int subn_get_sl_to_vl(struct ib_smp *smp, struct ib_device *ibdev,
+			     u8 port)
+{
+	struct qib_ibport *ibp = to_iport(ibdev, port);
+	u8 *p = (u8 *) smp->data;
+	unsigned i;
+
+	memset(smp->data, 0, sizeof(smp->data));
+
+	if (!(ibp->port_cap_flags & IB_PORT_SL_MAP_SUP))
+		smp->status |= IB_SMP_UNSUP_METHOD;
+	else
+		for (i = 0; i < ARRAY_SIZE(ibp->sl_to_vl); i += 2)
+			*p++ = (ibp->sl_to_vl[i] << 4) | ibp->sl_to_vl[i + 1];
+
+	return reply(smp);
+}
+
+static int subn_set_sl_to_vl(struct ib_smp *smp, struct ib_device *ibdev,
+			     u8 port)
+{
+	struct qib_ibport *ibp = to_iport(ibdev, port);
+	u8 *p = (u8 *) smp->data;
+	unsigned i;
+
+	if (!(ibp->port_cap_flags & IB_PORT_SL_MAP_SUP)) {
+		smp->status |= IB_SMP_UNSUP_METHOD;
+		return reply(smp);
+	}
+
+	for (i = 0; i < ARRAY_SIZE(ibp->sl_to_vl); i += 2, p++) {
+		ibp->sl_to_vl[i] = *p >> 4;
+		ibp->sl_to_vl[i + 1] = *p & 0xF;
+	}
+	qib_set_uevent_bits(ppd_from_ibp(to_iport(ibdev, port)),
+			    _QIB_EVENT_SL2VL_CHANGE_BIT);
+
+	return subn_get_sl_to_vl(smp, ibdev, port);
+}
+
+static int subn_get_vl_arb(struct ib_smp *smp, struct ib_device *ibdev,
+			   u8 port)
+{
+	unsigned which = be32_to_cpu(smp->attr_mod) >> 16;
+	struct qib_pportdata *ppd = ppd_from_ibp(to_iport(ibdev, port));
+
+	memset(smp->data, 0, sizeof(smp->data));
+
+	if (ppd->vls_supported == IB_VL_VL0)
+		smp->status |= IB_SMP_UNSUP_METHOD;
+	else if (which == IB_VLARB_LOWPRI_0_31)
+		(void) ppd->dd->f_get_ib_table(ppd, QIB_IB_TBL_VL_LOW_ARB,
+						   smp->data);
+	else if (which == IB_VLARB_HIGHPRI_0_31)
+		(void) ppd->dd->f_get_ib_table(ppd, QIB_IB_TBL_VL_HIGH_ARB,
+						   smp->data);
+	else
+		smp->status |= IB_SMP_INVALID_FIELD;
+
+	return reply(smp);
+}
+
+static int subn_set_vl_arb(struct ib_smp *smp, struct ib_device *ibdev,
+			   u8 port)
+{
+	unsigned which = be32_to_cpu(smp->attr_mod) >> 16;
+	struct qib_pportdata *ppd = ppd_from_ibp(to_iport(ibdev, port));
+
+	if (ppd->vls_supported == IB_VL_VL0)
+		smp->status |= IB_SMP_UNSUP_METHOD;
+	else if (which == IB_VLARB_LOWPRI_0_31)
+		(void) ppd->dd->f_set_ib_table(ppd, QIB_IB_TBL_VL_LOW_ARB,
+						   smp->data);
+	else if (which == IB_VLARB_HIGHPRI_0_31)
+		(void) ppd->dd->f_set_ib_table(ppd, QIB_IB_TBL_VL_HIGH_ARB,
+						   smp->data);
+	else
+		smp->status |= IB_SMP_INVALID_FIELD;
+
+	return subn_get_vl_arb(smp, ibdev, port);
+}
+
+static int subn_trap_repress(struct ib_smp *smp, struct ib_device *ibdev,
+			     u8 port)
+{
+	/*
+	 * For now, we only send the trap once so no need to process this.
+	 * o13-6, o13-7,
+	 * o14-3.a4 The SMA shall not send any message in response to a valid
+	 * SubnTrapRepress() message.
+	 */
+	return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+}
+
+static int pma_get_classportinfo(struct ib_pma_mad *pmp,
+				 struct ib_device *ibdev)
+{
+	struct ib_class_port_info *p =
+		(struct ib_class_port_info *)pmp->data;
+	struct qib_devdata *dd = dd_from_ibdev(ibdev);
+
+	memset(pmp->data, 0, sizeof(pmp->data));
+
+	if (pmp->mad_hdr.attr_mod != 0)
+		pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+
+	/* Note that AllPortSelect is not valid */
+	p->base_version = 1;
+	p->class_version = 1;
+	p->capability_mask = IB_PMA_CLASS_CAP_EXT_WIDTH;
+	/*
+	 * Set the most significant bit of CM2 to indicate support for
+	 * congestion statistics
+	 */
+	p->reserved[0] = dd->psxmitwait_supported << 7;
+	/*
+	 * Expected response time is 4.096 usec. * 2^18 == 1.073741824 sec.
+	 */
+	p->resp_time_value = 18;
+
+	return reply((struct ib_smp *) pmp);
+}
+
+static int pma_get_portsamplescontrol(struct ib_pma_mad *pmp,
+				      struct ib_device *ibdev, u8 port)
+{
+	struct ib_pma_portsamplescontrol *p =
+		(struct ib_pma_portsamplescontrol *)pmp->data;
+	struct qib_ibdev *dev = to_idev(ibdev);
+	struct qib_devdata *dd = dd_from_dev(dev);
+	struct qib_ibport *ibp = to_iport(ibdev, port);
+	struct qib_pportdata *ppd = ppd_from_ibp(ibp);
+	unsigned long flags;
+	u8 port_select = p->port_select;
+
+	memset(pmp->data, 0, sizeof(pmp->data));
+
+	p->port_select = port_select;
+	if (pmp->mad_hdr.attr_mod != 0 || port_select != port) {
+		pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+		goto bail;
+	}
+	spin_lock_irqsave(&ibp->lock, flags);
+	p->tick = dd->f_get_ib_cfg(ppd, QIB_IB_CFG_PMA_TICKS);
+	p->sample_status = dd->f_portcntr(ppd, QIBPORTCNTR_PSSTAT);
+	p->counter_width = 4;   /* 32 bit counters */
+	p->counter_mask0_9 = COUNTER_MASK0_9;
+	p->sample_start = cpu_to_be32(ibp->pma_sample_start);
+	p->sample_interval = cpu_to_be32(ibp->pma_sample_interval);
+	p->tag = cpu_to_be16(ibp->pma_tag);
+	p->counter_select[0] = ibp->pma_counter_select[0];
+	p->counter_select[1] = ibp->pma_counter_select[1];
+	p->counter_select[2] = ibp->pma_counter_select[2];
+	p->counter_select[3] = ibp->pma_counter_select[3];
+	p->counter_select[4] = ibp->pma_counter_select[4];
+	spin_unlock_irqrestore(&ibp->lock, flags);
+
+bail:
+	return reply((struct ib_smp *) pmp);
+}
+
+static int pma_set_portsamplescontrol(struct ib_pma_mad *pmp,
+				      struct ib_device *ibdev, u8 port)
+{
+	struct ib_pma_portsamplescontrol *p =
+		(struct ib_pma_portsamplescontrol *)pmp->data;
+	struct qib_ibdev *dev = to_idev(ibdev);
+	struct qib_devdata *dd = dd_from_dev(dev);
+	struct qib_ibport *ibp = to_iport(ibdev, port);
+	struct qib_pportdata *ppd = ppd_from_ibp(ibp);
+	unsigned long flags;
+	u8 status, xmit_flags;
+	int ret;
+
+	if (pmp->mad_hdr.attr_mod != 0 || p->port_select != port) {
+		pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+		ret = reply((struct ib_smp *) pmp);
+		goto bail;
+	}
+
+	spin_lock_irqsave(&ibp->lock, flags);
+
+	/* Port Sampling code owns the PS* HW counters */
+	xmit_flags = ppd->cong_stats.flags;
+	ppd->cong_stats.flags = IB_PMA_CONG_HW_CONTROL_SAMPLE;
+	status = dd->f_portcntr(ppd, QIBPORTCNTR_PSSTAT);
+	if (status == IB_PMA_SAMPLE_STATUS_DONE ||
+	    (status == IB_PMA_SAMPLE_STATUS_RUNNING &&
+	     xmit_flags == IB_PMA_CONG_HW_CONTROL_TIMER)) {
+		ibp->pma_sample_start = be32_to_cpu(p->sample_start);
+		ibp->pma_sample_interval = be32_to_cpu(p->sample_interval);
+		ibp->pma_tag = be16_to_cpu(p->tag);
+		ibp->pma_counter_select[0] = p->counter_select[0];
+		ibp->pma_counter_select[1] = p->counter_select[1];
+		ibp->pma_counter_select[2] = p->counter_select[2];
+		ibp->pma_counter_select[3] = p->counter_select[3];
+		ibp->pma_counter_select[4] = p->counter_select[4];
+		dd->f_set_cntr_sample(ppd, ibp->pma_sample_interval,
+				      ibp->pma_sample_start);
+	}
+	spin_unlock_irqrestore(&ibp->lock, flags);
+
+	ret = pma_get_portsamplescontrol(pmp, ibdev, port);
+
+bail:
+	return ret;
+}
+
+static u64 get_counter(struct qib_ibport *ibp, struct qib_pportdata *ppd,
+		       __be16 sel)
+{
+	u64 ret;
+
+	switch (sel) {
+	case IB_PMA_PORT_XMIT_DATA:
+		ret = ppd->dd->f_portcntr(ppd, QIBPORTCNTR_PSXMITDATA);
+		break;
+	case IB_PMA_PORT_RCV_DATA:
+		ret = ppd->dd->f_portcntr(ppd, QIBPORTCNTR_PSRCVDATA);
+		break;
+	case IB_PMA_PORT_XMIT_PKTS:
+		ret = ppd->dd->f_portcntr(ppd, QIBPORTCNTR_PSXMITPKTS);
+		break;
+	case IB_PMA_PORT_RCV_PKTS:
+		ret = ppd->dd->f_portcntr(ppd, QIBPORTCNTR_PSRCVPKTS);
+		break;
+	case IB_PMA_PORT_XMIT_WAIT:
+		ret = ppd->dd->f_portcntr(ppd, QIBPORTCNTR_PSXMITWAIT);
+		break;
+	default:
+		ret = 0;
+	}
+
+	return ret;
+}
+
+/* This function assumes that the xmit_wait lock is already held */
+static u64 xmit_wait_get_value_delta(struct qib_pportdata *ppd)
+{
+	u32 delta;
+
+	delta = get_counter(&ppd->ibport_data, ppd,
+			    IB_PMA_PORT_XMIT_WAIT);
+	return ppd->cong_stats.counter + delta;
+}
+
+static void cache_hw_sample_counters(struct qib_pportdata *ppd)
+{
+	struct qib_ibport *ibp = &ppd->ibport_data;
+
+	ppd->cong_stats.counter_cache.psxmitdata =
+		get_counter(ibp, ppd, IB_PMA_PORT_XMIT_DATA);
+	ppd->cong_stats.counter_cache.psrcvdata =
+		get_counter(ibp, ppd, IB_PMA_PORT_RCV_DATA);
+	ppd->cong_stats.counter_cache.psxmitpkts =
+		get_counter(ibp, ppd, IB_PMA_PORT_XMIT_PKTS);
+	ppd->cong_stats.counter_cache.psrcvpkts =
+		get_counter(ibp, ppd, IB_PMA_PORT_RCV_PKTS);
+	ppd->cong_stats.counter_cache.psxmitwait =
+		get_counter(ibp, ppd, IB_PMA_PORT_XMIT_WAIT);
+}
+
+static u64 get_cache_hw_sample_counters(struct qib_pportdata *ppd,
+					__be16 sel)
+{
+	u64 ret;
+
+	switch (sel) {
+	case IB_PMA_PORT_XMIT_DATA:
+		ret = ppd->cong_stats.counter_cache.psxmitdata;
+		break;
+	case IB_PMA_PORT_RCV_DATA:
+		ret = ppd->cong_stats.counter_cache.psrcvdata;
+		break;
+	case IB_PMA_PORT_XMIT_PKTS:
+		ret = ppd->cong_stats.counter_cache.psxmitpkts;
+		break;
+	case IB_PMA_PORT_RCV_PKTS:
+		ret = ppd->cong_stats.counter_cache.psrcvpkts;
+		break;
+	case IB_PMA_PORT_XMIT_WAIT:
+		ret = ppd->cong_stats.counter_cache.psxmitwait;
+		break;
+	default:
+		ret = 0;
+	}
+
+	return ret;
+}
+
+static int pma_get_portsamplesresult(struct ib_pma_mad *pmp,
+				     struct ib_device *ibdev, u8 port)
+{
+	struct ib_pma_portsamplesresult *p =
+		(struct ib_pma_portsamplesresult *)pmp->data;
+	struct qib_ibdev *dev = to_idev(ibdev);
+	struct qib_devdata *dd = dd_from_dev(dev);
+	struct qib_ibport *ibp = to_iport(ibdev, port);
+	struct qib_pportdata *ppd = ppd_from_ibp(ibp);
+	unsigned long flags;
+	u8 status;
+	int i;
+
+	memset(pmp->data, 0, sizeof(pmp->data));
+	spin_lock_irqsave(&ibp->lock, flags);
+	p->tag = cpu_to_be16(ibp->pma_tag);
+	if (ppd->cong_stats.flags == IB_PMA_CONG_HW_CONTROL_TIMER)
+		p->sample_status = IB_PMA_SAMPLE_STATUS_DONE;
+	else {
+		status = dd->f_portcntr(ppd, QIBPORTCNTR_PSSTAT);
+		p->sample_status = cpu_to_be16(status);
+		if (status == IB_PMA_SAMPLE_STATUS_DONE) {
+			cache_hw_sample_counters(ppd);
+			ppd->cong_stats.counter =
+				xmit_wait_get_value_delta(ppd);
+			dd->f_set_cntr_sample(ppd,
+					      QIB_CONG_TIMER_PSINTERVAL, 0);
+			ppd->cong_stats.flags = IB_PMA_CONG_HW_CONTROL_TIMER;
+		}
+	}
+	for (i = 0; i < ARRAY_SIZE(ibp->pma_counter_select); i++)
+		p->counter[i] = cpu_to_be32(
+			get_cache_hw_sample_counters(
+				ppd, ibp->pma_counter_select[i]));
+	spin_unlock_irqrestore(&ibp->lock, flags);
+
+	return reply((struct ib_smp *) pmp);
+}
+
+static int pma_get_portsamplesresult_ext(struct ib_pma_mad *pmp,
+					 struct ib_device *ibdev, u8 port)
+{
+	struct ib_pma_portsamplesresult_ext *p =
+		(struct ib_pma_portsamplesresult_ext *)pmp->data;
+	struct qib_ibdev *dev = to_idev(ibdev);
+	struct qib_devdata *dd = dd_from_dev(dev);
+	struct qib_ibport *ibp = to_iport(ibdev, port);
+	struct qib_pportdata *ppd = ppd_from_ibp(ibp);
+	unsigned long flags;
+	u8 status;
+	int i;
+
+	/* Port Sampling code owns the PS* HW counters */
+	memset(pmp->data, 0, sizeof(pmp->data));
+	spin_lock_irqsave(&ibp->lock, flags);
+	p->tag = cpu_to_be16(ibp->pma_tag);
+	if (ppd->cong_stats.flags == IB_PMA_CONG_HW_CONTROL_TIMER)
+		p->sample_status = IB_PMA_SAMPLE_STATUS_DONE;
+	else {
+		status = dd->f_portcntr(ppd, QIBPORTCNTR_PSSTAT);
+		p->sample_status = cpu_to_be16(status);
+		/* 64 bits */
+		p->extended_width = cpu_to_be32(0x80000000);
+		if (status == IB_PMA_SAMPLE_STATUS_DONE) {
+			cache_hw_sample_counters(ppd);
+			ppd->cong_stats.counter =
+				xmit_wait_get_value_delta(ppd);
+			dd->f_set_cntr_sample(ppd,
+					      QIB_CONG_TIMER_PSINTERVAL, 0);
+			ppd->cong_stats.flags = IB_PMA_CONG_HW_CONTROL_TIMER;
+		}
+	}
+	for (i = 0; i < ARRAY_SIZE(ibp->pma_counter_select); i++)
+		p->counter[i] = cpu_to_be64(
+			get_cache_hw_sample_counters(
+				ppd, ibp->pma_counter_select[i]));
+	spin_unlock_irqrestore(&ibp->lock, flags);
+
+	return reply((struct ib_smp *) pmp);
+}
+
+static int pma_get_portcounters(struct ib_pma_mad *pmp,
+				struct ib_device *ibdev, u8 port)
+{
+	struct ib_pma_portcounters *p = (struct ib_pma_portcounters *)
+		pmp->data;
+	struct qib_ibport *ibp = to_iport(ibdev, port);
+	struct qib_pportdata *ppd = ppd_from_ibp(ibp);
+	struct qib_verbs_counters cntrs;
+	u8 port_select = p->port_select;
+
+	qib_get_counters(ppd, &cntrs);
+
+	/* Adjust counters for any resets done. */
+	cntrs.symbol_error_counter -= ibp->z_symbol_error_counter;
+	cntrs.link_error_recovery_counter -=
+		ibp->z_link_error_recovery_counter;
+	cntrs.link_downed_counter -= ibp->z_link_downed_counter;
+	cntrs.port_rcv_errors -= ibp->z_port_rcv_errors;
+	cntrs.port_rcv_remphys_errors -= ibp->z_port_rcv_remphys_errors;
+	cntrs.port_xmit_discards -= ibp->z_port_xmit_discards;
+	cntrs.port_xmit_data -= ibp->z_port_xmit_data;
+	cntrs.port_rcv_data -= ibp->z_port_rcv_data;
+	cntrs.port_xmit_packets -= ibp->z_port_xmit_packets;
+	cntrs.port_rcv_packets -= ibp->z_port_rcv_packets;
+	cntrs.local_link_integrity_errors -=
+		ibp->z_local_link_integrity_errors;
+	cntrs.excessive_buffer_overrun_errors -=
+		ibp->z_excessive_buffer_overrun_errors;
+	cntrs.vl15_dropped -= ibp->z_vl15_dropped;
+	cntrs.vl15_dropped += ibp->n_vl15_dropped;
+
+	memset(pmp->data, 0, sizeof(pmp->data));
+
+	p->port_select = port_select;
+	if (pmp->mad_hdr.attr_mod != 0 || port_select != port)
+		pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+
+	if (cntrs.symbol_error_counter > 0xFFFFUL)
+		p->symbol_error_counter = cpu_to_be16(0xFFFF);
+	else
+		p->symbol_error_counter =
+			cpu_to_be16((u16)cntrs.symbol_error_counter);
+	if (cntrs.link_error_recovery_counter > 0xFFUL)
+		p->link_error_recovery_counter = 0xFF;
+	else
+		p->link_error_recovery_counter =
+			(u8)cntrs.link_error_recovery_counter;
+	if (cntrs.link_downed_counter > 0xFFUL)
+		p->link_downed_counter = 0xFF;
+	else
+		p->link_downed_counter = (u8)cntrs.link_downed_counter;
+	if (cntrs.port_rcv_errors > 0xFFFFUL)
+		p->port_rcv_errors = cpu_to_be16(0xFFFF);
+	else
+		p->port_rcv_errors =
+			cpu_to_be16((u16) cntrs.port_rcv_errors);
+	if (cntrs.port_rcv_remphys_errors > 0xFFFFUL)
+		p->port_rcv_remphys_errors = cpu_to_be16(0xFFFF);
+	else
+		p->port_rcv_remphys_errors =
+			cpu_to_be16((u16)cntrs.port_rcv_remphys_errors);
+	if (cntrs.port_xmit_discards > 0xFFFFUL)
+		p->port_xmit_discards = cpu_to_be16(0xFFFF);
+	else
+		p->port_xmit_discards =
+			cpu_to_be16((u16)cntrs.port_xmit_discards);
+	if (cntrs.local_link_integrity_errors > 0xFUL)
+		cntrs.local_link_integrity_errors = 0xFUL;
+	if (cntrs.excessive_buffer_overrun_errors > 0xFUL)
+		cntrs.excessive_buffer_overrun_errors = 0xFUL;
+	p->link_overrun_errors = (cntrs.local_link_integrity_errors << 4) |
+		cntrs.excessive_buffer_overrun_errors;
+	if (cntrs.vl15_dropped > 0xFFFFUL)
+		p->vl15_dropped = cpu_to_be16(0xFFFF);
+	else
+		p->vl15_dropped = cpu_to_be16((u16)cntrs.vl15_dropped);
+	if (cntrs.port_xmit_data > 0xFFFFFFFFUL)
+		p->port_xmit_data = cpu_to_be32(0xFFFFFFFF);
+	else
+		p->port_xmit_data = cpu_to_be32((u32)cntrs.port_xmit_data);
+	if (cntrs.port_rcv_data > 0xFFFFFFFFUL)
+		p->port_rcv_data = cpu_to_be32(0xFFFFFFFF);
+	else
+		p->port_rcv_data = cpu_to_be32((u32)cntrs.port_rcv_data);
+	if (cntrs.port_xmit_packets > 0xFFFFFFFFUL)
+		p->port_xmit_packets = cpu_to_be32(0xFFFFFFFF);
+	else
+		p->port_xmit_packets =
+			cpu_to_be32((u32)cntrs.port_xmit_packets);
+	if (cntrs.port_rcv_packets > 0xFFFFFFFFUL)
+		p->port_rcv_packets = cpu_to_be32(0xFFFFFFFF);
+	else
+		p->port_rcv_packets =
+			cpu_to_be32((u32) cntrs.port_rcv_packets);
+
+	return reply((struct ib_smp *) pmp);
+}
+
+static int pma_get_portcounters_cong(struct ib_pma_mad *pmp,
+				     struct ib_device *ibdev, u8 port)
+{
+	/* Congestion PMA packets start at offset 24 not 64 */
+	struct ib_pma_portcounters_cong *p =
+		(struct ib_pma_portcounters_cong *)pmp->reserved;
+	struct qib_verbs_counters cntrs;
+	struct qib_ibport *ibp = to_iport(ibdev, port);
+	struct qib_pportdata *ppd = ppd_from_ibp(ibp);
+	struct qib_devdata *dd = dd_from_ppd(ppd);
+	u32 port_select = be32_to_cpu(pmp->mad_hdr.attr_mod) & 0xFF;
+	u64 xmit_wait_counter;
+	unsigned long flags;
+
+	/*
+	 * This check is performed only in the GET method because the
+	 * SET method ends up calling this anyway.
+	 */
+	if (!dd->psxmitwait_supported)
+		pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR;
+	if (port_select != port)
+		pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+
+	qib_get_counters(ppd, &cntrs);
+	spin_lock_irqsave(&ppd->ibport_data.lock, flags);
+	xmit_wait_counter = xmit_wait_get_value_delta(ppd);
+	spin_unlock_irqrestore(&ppd->ibport_data.lock, flags);
+
+	/* Adjust counters for any resets done. */
+	cntrs.symbol_error_counter -= ibp->z_symbol_error_counter;
+	cntrs.link_error_recovery_counter -=
+		ibp->z_link_error_recovery_counter;
+	cntrs.link_downed_counter -= ibp->z_link_downed_counter;
+	cntrs.port_rcv_errors -= ibp->z_port_rcv_errors;
+	cntrs.port_rcv_remphys_errors -=
+		ibp->z_port_rcv_remphys_errors;
+	cntrs.port_xmit_discards -= ibp->z_port_xmit_discards;
+	cntrs.local_link_integrity_errors -=
+		ibp->z_local_link_integrity_errors;
+	cntrs.excessive_buffer_overrun_errors -=
+		ibp->z_excessive_buffer_overrun_errors;
+	cntrs.vl15_dropped -= ibp->z_vl15_dropped;
+	cntrs.vl15_dropped += ibp->n_vl15_dropped;
+	cntrs.port_xmit_data -= ibp->z_port_xmit_data;
+	cntrs.port_rcv_data -= ibp->z_port_rcv_data;
+	cntrs.port_xmit_packets -= ibp->z_port_xmit_packets;
+	cntrs.port_rcv_packets -= ibp->z_port_rcv_packets;
+
+	memset(pmp->reserved, 0, sizeof(pmp->reserved) +
+	       sizeof(pmp->data));
+
+	/*
+	 * Set top 3 bits to indicate interval in picoseconds in
+	 * remaining bits.
+	 */
+	p->port_check_rate =
+		cpu_to_be16((QIB_XMIT_RATE_PICO << 13) |
+			    (dd->psxmitwait_check_rate &
+			     ~(QIB_XMIT_RATE_PICO << 13)));
+	p->port_adr_events = cpu_to_be64(0);
+	p->port_xmit_wait = cpu_to_be64(xmit_wait_counter);
+	p->port_xmit_data = cpu_to_be64(cntrs.port_xmit_data);
+	p->port_rcv_data = cpu_to_be64(cntrs.port_rcv_data);
+	p->port_xmit_packets =
+		cpu_to_be64(cntrs.port_xmit_packets);
+	p->port_rcv_packets =
+		cpu_to_be64(cntrs.port_rcv_packets);
+	if (cntrs.symbol_error_counter > 0xFFFFUL)
+		p->symbol_error_counter = cpu_to_be16(0xFFFF);
+	else
+		p->symbol_error_counter =
+			cpu_to_be16(
+				(u16)cntrs.symbol_error_counter);
+	if (cntrs.link_error_recovery_counter > 0xFFUL)
+		p->link_error_recovery_counter = 0xFF;
+	else
+		p->link_error_recovery_counter =
+			(u8)cntrs.link_error_recovery_counter;
+	if (cntrs.link_downed_counter > 0xFFUL)
+		p->link_downed_counter = 0xFF;
+	else
+		p->link_downed_counter =
+			(u8)cntrs.link_downed_counter;
+	if (cntrs.port_rcv_errors > 0xFFFFUL)
+		p->port_rcv_errors = cpu_to_be16(0xFFFF);
+	else
+		p->port_rcv_errors =
+			cpu_to_be16((u16) cntrs.port_rcv_errors);
+	if (cntrs.port_rcv_remphys_errors > 0xFFFFUL)
+		p->port_rcv_remphys_errors = cpu_to_be16(0xFFFF);
+	else
+		p->port_rcv_remphys_errors =
+			cpu_to_be16(
+				(u16)cntrs.port_rcv_remphys_errors);
+	if (cntrs.port_xmit_discards > 0xFFFFUL)
+		p->port_xmit_discards = cpu_to_be16(0xFFFF);
+	else
+		p->port_xmit_discards =
+			cpu_to_be16((u16)cntrs.port_xmit_discards);
+	if (cntrs.local_link_integrity_errors > 0xFUL)
+		cntrs.local_link_integrity_errors = 0xFUL;
+	if (cntrs.excessive_buffer_overrun_errors > 0xFUL)
+		cntrs.excessive_buffer_overrun_errors = 0xFUL;
+	p->link_overrun_errors = (cntrs.local_link_integrity_errors << 4) |
+		cntrs.excessive_buffer_overrun_errors;
+	if (cntrs.vl15_dropped > 0xFFFFUL)
+		p->vl15_dropped = cpu_to_be16(0xFFFF);
+	else
+		p->vl15_dropped = cpu_to_be16((u16)cntrs.vl15_dropped);
+
+	return reply((struct ib_smp *)pmp);
+}
+
+static void qib_snapshot_pmacounters(
+	struct qib_ibport *ibp,
+	struct qib_pma_counters *pmacounters)
+{
+	struct qib_pma_counters *p;
+	int cpu;
+
+	memset(pmacounters, 0, sizeof(*pmacounters));
+	for_each_possible_cpu(cpu) {
+		p = per_cpu_ptr(ibp->pmastats, cpu);
+		pmacounters->n_unicast_xmit += p->n_unicast_xmit;
+		pmacounters->n_unicast_rcv += p->n_unicast_rcv;
+		pmacounters->n_multicast_xmit += p->n_multicast_xmit;
+		pmacounters->n_multicast_rcv += p->n_multicast_rcv;
+	}
+}
+
+static int pma_get_portcounters_ext(struct ib_pma_mad *pmp,
+				    struct ib_device *ibdev, u8 port)
+{
+	struct ib_pma_portcounters_ext *p =
+		(struct ib_pma_portcounters_ext *)pmp->data;
+	struct qib_ibport *ibp = to_iport(ibdev, port);
+	struct qib_pportdata *ppd = ppd_from_ibp(ibp);
+	u64 swords, rwords, spkts, rpkts, xwait;
+	struct qib_pma_counters pma;
+	u8 port_select = p->port_select;
+
+	memset(pmp->data, 0, sizeof(pmp->data));
+
+	p->port_select = port_select;
+	if (pmp->mad_hdr.attr_mod != 0 || port_select != port) {
+		pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+		goto bail;
+	}
+
+	qib_snapshot_counters(ppd, &swords, &rwords, &spkts, &rpkts, &xwait);
+
+	/* Adjust counters for any resets done. */
+	swords -= ibp->z_port_xmit_data;
+	rwords -= ibp->z_port_rcv_data;
+	spkts -= ibp->z_port_xmit_packets;
+	rpkts -= ibp->z_port_rcv_packets;
+
+	p->port_xmit_data = cpu_to_be64(swords);
+	p->port_rcv_data = cpu_to_be64(rwords);
+	p->port_xmit_packets = cpu_to_be64(spkts);
+	p->port_rcv_packets = cpu_to_be64(rpkts);
+
+	qib_snapshot_pmacounters(ibp, &pma);
+
+	p->port_unicast_xmit_packets = cpu_to_be64(pma.n_unicast_xmit
+		- ibp->z_unicast_xmit);
+	p->port_unicast_rcv_packets = cpu_to_be64(pma.n_unicast_rcv
+		- ibp->z_unicast_rcv);
+	p->port_multicast_xmit_packets = cpu_to_be64(pma.n_multicast_xmit
+		- ibp->z_multicast_xmit);
+	p->port_multicast_rcv_packets = cpu_to_be64(pma.n_multicast_rcv
+		- ibp->z_multicast_rcv);
+
+bail:
+	return reply((struct ib_smp *) pmp);
+}
+
+static int pma_set_portcounters(struct ib_pma_mad *pmp,
+				struct ib_device *ibdev, u8 port)
+{
+	struct ib_pma_portcounters *p = (struct ib_pma_portcounters *)
+		pmp->data;
+	struct qib_ibport *ibp = to_iport(ibdev, port);
+	struct qib_pportdata *ppd = ppd_from_ibp(ibp);
+	struct qib_verbs_counters cntrs;
+
+	/*
+	 * Since the HW doesn't support clearing counters, we save the
+	 * current count and subtract it from future responses.
+	 */
+	qib_get_counters(ppd, &cntrs);
+
+	if (p->counter_select & IB_PMA_SEL_SYMBOL_ERROR)
+		ibp->z_symbol_error_counter = cntrs.symbol_error_counter;
+
+	if (p->counter_select & IB_PMA_SEL_LINK_ERROR_RECOVERY)
+		ibp->z_link_error_recovery_counter =
+			cntrs.link_error_recovery_counter;
+
+	if (p->counter_select & IB_PMA_SEL_LINK_DOWNED)
+		ibp->z_link_downed_counter = cntrs.link_downed_counter;
+
+	if (p->counter_select & IB_PMA_SEL_PORT_RCV_ERRORS)
+		ibp->z_port_rcv_errors = cntrs.port_rcv_errors;
+
+	if (p->counter_select & IB_PMA_SEL_PORT_RCV_REMPHYS_ERRORS)
+		ibp->z_port_rcv_remphys_errors =
+			cntrs.port_rcv_remphys_errors;
+
+	if (p->counter_select & IB_PMA_SEL_PORT_XMIT_DISCARDS)
+		ibp->z_port_xmit_discards = cntrs.port_xmit_discards;
+
+	if (p->counter_select & IB_PMA_SEL_LOCAL_LINK_INTEGRITY_ERRORS)
+		ibp->z_local_link_integrity_errors =
+			cntrs.local_link_integrity_errors;
+
+	if (p->counter_select & IB_PMA_SEL_EXCESSIVE_BUFFER_OVERRUNS)
+		ibp->z_excessive_buffer_overrun_errors =
+			cntrs.excessive_buffer_overrun_errors;
+
+	if (p->counter_select & IB_PMA_SEL_PORT_VL15_DROPPED) {
+		ibp->n_vl15_dropped = 0;
+		ibp->z_vl15_dropped = cntrs.vl15_dropped;
+	}
+
+	if (p->counter_select & IB_PMA_SEL_PORT_XMIT_DATA)
+		ibp->z_port_xmit_data = cntrs.port_xmit_data;
+
+	if (p->counter_select & IB_PMA_SEL_PORT_RCV_DATA)
+		ibp->z_port_rcv_data = cntrs.port_rcv_data;
+
+	if (p->counter_select & IB_PMA_SEL_PORT_XMIT_PACKETS)
+		ibp->z_port_xmit_packets = cntrs.port_xmit_packets;
+
+	if (p->counter_select & IB_PMA_SEL_PORT_RCV_PACKETS)
+		ibp->z_port_rcv_packets = cntrs.port_rcv_packets;
+
+	return pma_get_portcounters(pmp, ibdev, port);
+}
+
+static int pma_set_portcounters_cong(struct ib_pma_mad *pmp,
+				     struct ib_device *ibdev, u8 port)
+{
+	struct qib_ibport *ibp = to_iport(ibdev, port);
+	struct qib_pportdata *ppd = ppd_from_ibp(ibp);
+	struct qib_devdata *dd = dd_from_ppd(ppd);
+	struct qib_verbs_counters cntrs;
+	u32 counter_select = (be32_to_cpu(pmp->mad_hdr.attr_mod) >> 24) & 0xFF;
+	int ret = 0;
+	unsigned long flags;
+
+	qib_get_counters(ppd, &cntrs);
+	/* Get counter values before we save them */
+	ret = pma_get_portcounters_cong(pmp, ibdev, port);
+
+	if (counter_select & IB_PMA_SEL_CONG_XMIT) {
+		spin_lock_irqsave(&ppd->ibport_data.lock, flags);
+		ppd->cong_stats.counter = 0;
+		dd->f_set_cntr_sample(ppd, QIB_CONG_TIMER_PSINTERVAL,
+				      0x0);
+		spin_unlock_irqrestore(&ppd->ibport_data.lock, flags);
+	}
+	if (counter_select & IB_PMA_SEL_CONG_PORT_DATA) {
+		ibp->z_port_xmit_data = cntrs.port_xmit_data;
+		ibp->z_port_rcv_data = cntrs.port_rcv_data;
+		ibp->z_port_xmit_packets = cntrs.port_xmit_packets;
+		ibp->z_port_rcv_packets = cntrs.port_rcv_packets;
+	}
+	if (counter_select & IB_PMA_SEL_CONG_ALL) {
+		ibp->z_symbol_error_counter =
+			cntrs.symbol_error_counter;
+		ibp->z_link_error_recovery_counter =
+			cntrs.link_error_recovery_counter;
+		ibp->z_link_downed_counter =
+			cntrs.link_downed_counter;
+		ibp->z_port_rcv_errors = cntrs.port_rcv_errors;
+		ibp->z_port_rcv_remphys_errors =
+			cntrs.port_rcv_remphys_errors;
+		ibp->z_port_xmit_discards =
+			cntrs.port_xmit_discards;
+		ibp->z_local_link_integrity_errors =
+			cntrs.local_link_integrity_errors;
+		ibp->z_excessive_buffer_overrun_errors =
+			cntrs.excessive_buffer_overrun_errors;
+		ibp->n_vl15_dropped = 0;
+		ibp->z_vl15_dropped = cntrs.vl15_dropped;
+	}
+
+	return ret;
+}
+
+static int pma_set_portcounters_ext(struct ib_pma_mad *pmp,
+				    struct ib_device *ibdev, u8 port)
+{
+	struct ib_pma_portcounters *p = (struct ib_pma_portcounters *)
+		pmp->data;
+	struct qib_ibport *ibp = to_iport(ibdev, port);
+	struct qib_pportdata *ppd = ppd_from_ibp(ibp);
+	u64 swords, rwords, spkts, rpkts, xwait;
+	struct qib_pma_counters pma;
+
+	qib_snapshot_counters(ppd, &swords, &rwords, &spkts, &rpkts, &xwait);
+
+	if (p->counter_select & IB_PMA_SELX_PORT_XMIT_DATA)
+		ibp->z_port_xmit_data = swords;
+
+	if (p->counter_select & IB_PMA_SELX_PORT_RCV_DATA)
+		ibp->z_port_rcv_data = rwords;
+
+	if (p->counter_select & IB_PMA_SELX_PORT_XMIT_PACKETS)
+		ibp->z_port_xmit_packets = spkts;
+
+	if (p->counter_select & IB_PMA_SELX_PORT_RCV_PACKETS)
+		ibp->z_port_rcv_packets = rpkts;
+
+	qib_snapshot_pmacounters(ibp, &pma);
+
+	if (p->counter_select & IB_PMA_SELX_PORT_UNI_XMIT_PACKETS)
+		ibp->z_unicast_xmit = pma.n_unicast_xmit;
+
+	if (p->counter_select & IB_PMA_SELX_PORT_UNI_RCV_PACKETS)
+		ibp->z_unicast_rcv = pma.n_unicast_rcv;
+
+	if (p->counter_select & IB_PMA_SELX_PORT_MULTI_XMIT_PACKETS)
+		ibp->z_multicast_xmit = pma.n_multicast_xmit;
+
+	if (p->counter_select & IB_PMA_SELX_PORT_MULTI_RCV_PACKETS)
+		ibp->z_multicast_rcv = pma.n_multicast_rcv;
+
+	return pma_get_portcounters_ext(pmp, ibdev, port);
+}
+
+static int process_subn(struct ib_device *ibdev, int mad_flags,
+			u8 port, struct ib_mad *in_mad,
+			struct ib_mad *out_mad)
+{
+	struct ib_smp *smp = (struct ib_smp *)out_mad;
+	struct qib_ibport *ibp = to_iport(ibdev, port);
+	struct qib_pportdata *ppd = ppd_from_ibp(ibp);
+	int ret;
+
+	*out_mad = *in_mad;
+	if (smp->class_version != 1) {
+		smp->status |= IB_SMP_UNSUP_VERSION;
+		ret = reply(smp);
+		goto bail;
+	}
+
+	ret = check_mkey(ibp, smp, mad_flags);
+	if (ret) {
+		u32 port_num = be32_to_cpu(smp->attr_mod);
+
+		/*
+		 * If this is a get/set portinfo, we already check the
+		 * M_Key if the MAD is for another port and the M_Key
+		 * is OK on the receiving port. This check is needed
+		 * to increment the error counters when the M_Key
+		 * fails to match on *both* ports.
+		 */
+		if (in_mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO &&
+		    (smp->method == IB_MGMT_METHOD_GET ||
+		     smp->method == IB_MGMT_METHOD_SET) &&
+		    port_num && port_num <= ibdev->phys_port_cnt &&
+		    port != port_num)
+			(void) check_mkey(to_iport(ibdev, port_num), smp, 0);
+		ret = IB_MAD_RESULT_FAILURE;
+		goto bail;
+	}
+
+	switch (smp->method) {
+	case IB_MGMT_METHOD_GET:
+		switch (smp->attr_id) {
+		case IB_SMP_ATTR_NODE_DESC:
+			ret = subn_get_nodedescription(smp, ibdev);
+			goto bail;
+		case IB_SMP_ATTR_NODE_INFO:
+			ret = subn_get_nodeinfo(smp, ibdev, port);
+			goto bail;
+		case IB_SMP_ATTR_GUID_INFO:
+			ret = subn_get_guidinfo(smp, ibdev, port);
+			goto bail;
+		case IB_SMP_ATTR_PORT_INFO:
+			ret = subn_get_portinfo(smp, ibdev, port);
+			goto bail;
+		case IB_SMP_ATTR_PKEY_TABLE:
+			ret = subn_get_pkeytable(smp, ibdev, port);
+			goto bail;
+		case IB_SMP_ATTR_SL_TO_VL_TABLE:
+			ret = subn_get_sl_to_vl(smp, ibdev, port);
+			goto bail;
+		case IB_SMP_ATTR_VL_ARB_TABLE:
+			ret = subn_get_vl_arb(smp, ibdev, port);
+			goto bail;
+		case IB_SMP_ATTR_SM_INFO:
+			if (ibp->port_cap_flags & IB_PORT_SM_DISABLED) {
+				ret = IB_MAD_RESULT_SUCCESS |
+					IB_MAD_RESULT_CONSUMED;
+				goto bail;
+			}
+			if (ibp->port_cap_flags & IB_PORT_SM) {
+				ret = IB_MAD_RESULT_SUCCESS;
+				goto bail;
+			}
+			/* FALLTHROUGH */
+		default:
+			smp->status |= IB_SMP_UNSUP_METH_ATTR;
+			ret = reply(smp);
+			goto bail;
+		}
+
+	case IB_MGMT_METHOD_SET:
+		switch (smp->attr_id) {
+		case IB_SMP_ATTR_GUID_INFO:
+			ret = subn_set_guidinfo(smp, ibdev, port);
+			goto bail;
+		case IB_SMP_ATTR_PORT_INFO:
+			ret = subn_set_portinfo(smp, ibdev, port);
+			goto bail;
+		case IB_SMP_ATTR_PKEY_TABLE:
+			ret = subn_set_pkeytable(smp, ibdev, port);
+			goto bail;
+		case IB_SMP_ATTR_SL_TO_VL_TABLE:
+			ret = subn_set_sl_to_vl(smp, ibdev, port);
+			goto bail;
+		case IB_SMP_ATTR_VL_ARB_TABLE:
+			ret = subn_set_vl_arb(smp, ibdev, port);
+			goto bail;
+		case IB_SMP_ATTR_SM_INFO:
+			if (ibp->port_cap_flags & IB_PORT_SM_DISABLED) {
+				ret = IB_MAD_RESULT_SUCCESS |
+					IB_MAD_RESULT_CONSUMED;
+				goto bail;
+			}
+			if (ibp->port_cap_flags & IB_PORT_SM) {
+				ret = IB_MAD_RESULT_SUCCESS;
+				goto bail;
+			}
+			/* FALLTHROUGH */
+		default:
+			smp->status |= IB_SMP_UNSUP_METH_ATTR;
+			ret = reply(smp);
+			goto bail;
+		}
+
+	case IB_MGMT_METHOD_TRAP_REPRESS:
+		if (smp->attr_id == IB_SMP_ATTR_NOTICE)
+			ret = subn_trap_repress(smp, ibdev, port);
+		else {
+			smp->status |= IB_SMP_UNSUP_METH_ATTR;
+			ret = reply(smp);
+		}
+		goto bail;
+
+	case IB_MGMT_METHOD_TRAP:
+	case IB_MGMT_METHOD_REPORT:
+	case IB_MGMT_METHOD_REPORT_RESP:
+	case IB_MGMT_METHOD_GET_RESP:
+		/*
+		 * The ib_mad module will call us to process responses
+		 * before checking for other consumers.
+		 * Just tell the caller to process it normally.
+		 */
+		ret = IB_MAD_RESULT_SUCCESS;
+		goto bail;
+
+	case IB_MGMT_METHOD_SEND:
+		if (ib_get_smp_direction(smp) &&
+		    smp->attr_id == QIB_VENDOR_IPG) {
+			ppd->dd->f_set_ib_cfg(ppd, QIB_IB_CFG_PORT,
+					      smp->data[0]);
+			ret = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+		} else
+			ret = IB_MAD_RESULT_SUCCESS;
+		goto bail;
+
+	default:
+		smp->status |= IB_SMP_UNSUP_METHOD;
+		ret = reply(smp);
+	}
+
+bail:
+	return ret;
+}
+
+static int process_perf(struct ib_device *ibdev, u8 port,
+			struct ib_mad *in_mad,
+			struct ib_mad *out_mad)
+{
+	struct ib_pma_mad *pmp = (struct ib_pma_mad *)out_mad;
+	int ret;
+
+	*out_mad = *in_mad;
+	if (pmp->mad_hdr.class_version != 1) {
+		pmp->mad_hdr.status |= IB_SMP_UNSUP_VERSION;
+		ret = reply((struct ib_smp *) pmp);
+		goto bail;
+	}
+
+	switch (pmp->mad_hdr.method) {
+	case IB_MGMT_METHOD_GET:
+		switch (pmp->mad_hdr.attr_id) {
+		case IB_PMA_CLASS_PORT_INFO:
+			ret = pma_get_classportinfo(pmp, ibdev);
+			goto bail;
+		case IB_PMA_PORT_SAMPLES_CONTROL:
+			ret = pma_get_portsamplescontrol(pmp, ibdev, port);
+			goto bail;
+		case IB_PMA_PORT_SAMPLES_RESULT:
+			ret = pma_get_portsamplesresult(pmp, ibdev, port);
+			goto bail;
+		case IB_PMA_PORT_SAMPLES_RESULT_EXT:
+			ret = pma_get_portsamplesresult_ext(pmp, ibdev, port);
+			goto bail;
+		case IB_PMA_PORT_COUNTERS:
+			ret = pma_get_portcounters(pmp, ibdev, port);
+			goto bail;
+		case IB_PMA_PORT_COUNTERS_EXT:
+			ret = pma_get_portcounters_ext(pmp, ibdev, port);
+			goto bail;
+		case IB_PMA_PORT_COUNTERS_CONG:
+			ret = pma_get_portcounters_cong(pmp, ibdev, port);
+			goto bail;
+		default:
+			pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR;
+			ret = reply((struct ib_smp *) pmp);
+			goto bail;
+		}
+
+	case IB_MGMT_METHOD_SET:
+		switch (pmp->mad_hdr.attr_id) {
+		case IB_PMA_PORT_SAMPLES_CONTROL:
+			ret = pma_set_portsamplescontrol(pmp, ibdev, port);
+			goto bail;
+		case IB_PMA_PORT_COUNTERS:
+			ret = pma_set_portcounters(pmp, ibdev, port);
+			goto bail;
+		case IB_PMA_PORT_COUNTERS_EXT:
+			ret = pma_set_portcounters_ext(pmp, ibdev, port);
+			goto bail;
+		case IB_PMA_PORT_COUNTERS_CONG:
+			ret = pma_set_portcounters_cong(pmp, ibdev, port);
+			goto bail;
+		default:
+			pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR;
+			ret = reply((struct ib_smp *) pmp);
+			goto bail;
+		}
+
+	case IB_MGMT_METHOD_TRAP:
+	case IB_MGMT_METHOD_GET_RESP:
+		/*
+		 * The ib_mad module will call us to process responses
+		 * before checking for other consumers.
+		 * Just tell the caller to process it normally.
+		 */
+		ret = IB_MAD_RESULT_SUCCESS;
+		goto bail;
+
+	default:
+		pmp->mad_hdr.status |= IB_SMP_UNSUP_METHOD;
+		ret = reply((struct ib_smp *) pmp);
+	}
+
+bail:
+	return ret;
+}
+
+static int cc_get_classportinfo(struct ib_cc_mad *ccp,
+				struct ib_device *ibdev)
+{
+	struct ib_cc_classportinfo_attr *p =
+		(struct ib_cc_classportinfo_attr *)ccp->mgmt_data;
+
+	memset(ccp->mgmt_data, 0, sizeof(ccp->mgmt_data));
+
+	p->base_version = 1;
+	p->class_version = 1;
+	p->cap_mask = 0;
+
+	/*
+	 * Expected response time is 4.096 usec. * 2^18 == 1.073741824 sec.
+	 */
+	p->resp_time_value = 18;
+
+	return reply((struct ib_smp *) ccp);
+}
+
+static int cc_get_congestion_info(struct ib_cc_mad *ccp,
+				struct ib_device *ibdev, u8 port)
+{
+	struct ib_cc_info_attr *p =
+		(struct ib_cc_info_attr *)ccp->mgmt_data;
+	struct qib_ibport *ibp = to_iport(ibdev, port);
+	struct qib_pportdata *ppd = ppd_from_ibp(ibp);
+
+	memset(ccp->mgmt_data, 0, sizeof(ccp->mgmt_data));
+
+	p->congestion_info = 0;
+	p->control_table_cap = ppd->cc_max_table_entries;
+
+	return reply((struct ib_smp *) ccp);
+}
+
+static int cc_get_congestion_setting(struct ib_cc_mad *ccp,
+				struct ib_device *ibdev, u8 port)
+{
+	int i;
+	struct ib_cc_congestion_setting_attr *p =
+		(struct ib_cc_congestion_setting_attr *)ccp->mgmt_data;
+	struct qib_ibport *ibp = to_iport(ibdev, port);
+	struct qib_pportdata *ppd = ppd_from_ibp(ibp);
+	struct ib_cc_congestion_entry_shadow *entries;
+
+	memset(ccp->mgmt_data, 0, sizeof(ccp->mgmt_data));
+
+	spin_lock(&ppd->cc_shadow_lock);
+
+	entries = ppd->congestion_entries_shadow->entries;
+	p->port_control = cpu_to_be16(
+		ppd->congestion_entries_shadow->port_control);
+	p->control_map = cpu_to_be16(
+		ppd->congestion_entries_shadow->control_map);
+	for (i = 0; i < IB_CC_CCS_ENTRIES; i++) {
+		p->entries[i].ccti_increase = entries[i].ccti_increase;
+		p->entries[i].ccti_timer = cpu_to_be16(entries[i].ccti_timer);
+		p->entries[i].trigger_threshold = entries[i].trigger_threshold;
+		p->entries[i].ccti_min = entries[i].ccti_min;
+	}
+
+	spin_unlock(&ppd->cc_shadow_lock);
+
+	return reply((struct ib_smp *) ccp);
+}
+
+static int cc_get_congestion_control_table(struct ib_cc_mad *ccp,
+				struct ib_device *ibdev, u8 port)
+{
+	struct ib_cc_table_attr *p =
+		(struct ib_cc_table_attr *)ccp->mgmt_data;
+	struct qib_ibport *ibp = to_iport(ibdev, port);
+	struct qib_pportdata *ppd = ppd_from_ibp(ibp);
+	u32 cct_block_index = be32_to_cpu(ccp->attr_mod);
+	u32 max_cct_block;
+	u32 cct_entry;
+	struct ib_cc_table_entry_shadow *entries;
+	int i;
+
+	/* Is the table index more than what is supported? */
+	if (cct_block_index > IB_CC_TABLE_CAP_DEFAULT - 1)
+		goto bail;
+
+	memset(ccp->mgmt_data, 0, sizeof(ccp->mgmt_data));
+
+	spin_lock(&ppd->cc_shadow_lock);
+
+	max_cct_block =
+		(ppd->ccti_entries_shadow->ccti_last_entry + 1)/IB_CCT_ENTRIES;
+	max_cct_block = max_cct_block ? max_cct_block - 1 : 0;
+
+	if (cct_block_index > max_cct_block) {
+		spin_unlock(&ppd->cc_shadow_lock);
+		goto bail;
+	}
+
+	ccp->attr_mod = cpu_to_be32(cct_block_index);
+
+	cct_entry = IB_CCT_ENTRIES * (cct_block_index + 1);
+
+	cct_entry--;
+
+	p->ccti_limit = cpu_to_be16(cct_entry);
+
+	entries = &ppd->ccti_entries_shadow->
+			entries[IB_CCT_ENTRIES * cct_block_index];
+	cct_entry %= IB_CCT_ENTRIES;
+
+	for (i = 0; i <= cct_entry; i++)
+		p->ccti_entries[i].entry = cpu_to_be16(entries[i].entry);
+
+	spin_unlock(&ppd->cc_shadow_lock);
+
+	return reply((struct ib_smp *) ccp);
+
+bail:
+	return reply_failure((struct ib_smp *) ccp);
+}
+
+static int cc_set_congestion_setting(struct ib_cc_mad *ccp,
+				struct ib_device *ibdev, u8 port)
+{
+	struct ib_cc_congestion_setting_attr *p =
+		(struct ib_cc_congestion_setting_attr *)ccp->mgmt_data;
+	struct qib_ibport *ibp = to_iport(ibdev, port);
+	struct qib_pportdata *ppd = ppd_from_ibp(ibp);
+	int i;
+
+	ppd->cc_sl_control_map = be16_to_cpu(p->control_map);
+
+	for (i = 0; i < IB_CC_CCS_ENTRIES; i++) {
+		ppd->congestion_entries[i].ccti_increase =
+			p->entries[i].ccti_increase;
+
+		ppd->congestion_entries[i].ccti_timer =
+			be16_to_cpu(p->entries[i].ccti_timer);
+
+		ppd->congestion_entries[i].trigger_threshold =
+			p->entries[i].trigger_threshold;
+
+		ppd->congestion_entries[i].ccti_min =
+			p->entries[i].ccti_min;
+	}
+
+	return reply((struct ib_smp *) ccp);
+}
+
+static int cc_set_congestion_control_table(struct ib_cc_mad *ccp,
+				struct ib_device *ibdev, u8 port)
+{
+	struct ib_cc_table_attr *p =
+		(struct ib_cc_table_attr *)ccp->mgmt_data;
+	struct qib_ibport *ibp = to_iport(ibdev, port);
+	struct qib_pportdata *ppd = ppd_from_ibp(ibp);
+	u32 cct_block_index = be32_to_cpu(ccp->attr_mod);
+	u32 cct_entry;
+	struct ib_cc_table_entry_shadow *entries;
+	int i;
+
+	/* Is the table index more than what is supported? */
+	if (cct_block_index > IB_CC_TABLE_CAP_DEFAULT - 1)
+		goto bail;
+
+	/* If this packet is the first in the sequence then
+	 * zero the total table entry count.
+	 */
+	if (be16_to_cpu(p->ccti_limit) < IB_CCT_ENTRIES)
+		ppd->total_cct_entry = 0;
+
+	cct_entry = (be16_to_cpu(p->ccti_limit))%IB_CCT_ENTRIES;
+
+	/* ccti_limit is 0 to 63 */
+	ppd->total_cct_entry += (cct_entry + 1);
+
+	if (ppd->total_cct_entry > ppd->cc_supported_table_entries)
+		goto bail;
+
+	ppd->ccti_limit = be16_to_cpu(p->ccti_limit);
+
+	entries = ppd->ccti_entries + (IB_CCT_ENTRIES * cct_block_index);
+
+	for (i = 0; i <= cct_entry; i++)
+		entries[i].entry = be16_to_cpu(p->ccti_entries[i].entry);
+
+	spin_lock(&ppd->cc_shadow_lock);
+
+	ppd->ccti_entries_shadow->ccti_last_entry = ppd->total_cct_entry - 1;
+	memcpy(ppd->ccti_entries_shadow->entries, ppd->ccti_entries,
+		(ppd->total_cct_entry * sizeof(struct ib_cc_table_entry)));
+
+	ppd->congestion_entries_shadow->port_control = IB_CC_CCS_PC_SL_BASED;
+	ppd->congestion_entries_shadow->control_map = ppd->cc_sl_control_map;
+	memcpy(ppd->congestion_entries_shadow->entries, ppd->congestion_entries,
+		IB_CC_CCS_ENTRIES * sizeof(struct ib_cc_congestion_entry));
+
+	spin_unlock(&ppd->cc_shadow_lock);
+
+	return reply((struct ib_smp *) ccp);
+
+bail:
+	return reply_failure((struct ib_smp *) ccp);
+}
+
+static int check_cc_key(struct qib_ibport *ibp,
+			struct ib_cc_mad *ccp, int mad_flags)
+{
+	return 0;
+}
+
+static int process_cc(struct ib_device *ibdev, int mad_flags,
+			u8 port, struct ib_mad *in_mad,
+			struct ib_mad *out_mad)
+{
+	struct ib_cc_mad *ccp = (struct ib_cc_mad *)out_mad;
+	struct qib_ibport *ibp = to_iport(ibdev, port);
+	int ret;
+
+	*out_mad = *in_mad;
+
+	if (ccp->class_version != 2) {
+		ccp->status |= IB_SMP_UNSUP_VERSION;
+		ret = reply((struct ib_smp *)ccp);
+		goto bail;
+	}
+
+	ret = check_cc_key(ibp, ccp, mad_flags);
+	if (ret)
+		goto bail;
+
+	switch (ccp->method) {
+	case IB_MGMT_METHOD_GET:
+		switch (ccp->attr_id) {
+		case IB_CC_ATTR_CLASSPORTINFO:
+			ret = cc_get_classportinfo(ccp, ibdev);
+			goto bail;
+
+		case IB_CC_ATTR_CONGESTION_INFO:
+			ret = cc_get_congestion_info(ccp, ibdev, port);
+			goto bail;
+
+		case IB_CC_ATTR_CA_CONGESTION_SETTING:
+			ret = cc_get_congestion_setting(ccp, ibdev, port);
+			goto bail;
+
+		case IB_CC_ATTR_CONGESTION_CONTROL_TABLE:
+			ret = cc_get_congestion_control_table(ccp, ibdev, port);
+			goto bail;
+
+			/* FALLTHROUGH */
+		default:
+			ccp->status |= IB_SMP_UNSUP_METH_ATTR;
+			ret = reply((struct ib_smp *) ccp);
+			goto bail;
+		}
+
+	case IB_MGMT_METHOD_SET:
+		switch (ccp->attr_id) {
+		case IB_CC_ATTR_CA_CONGESTION_SETTING:
+			ret = cc_set_congestion_setting(ccp, ibdev, port);
+			goto bail;
+
+		case IB_CC_ATTR_CONGESTION_CONTROL_TABLE:
+			ret = cc_set_congestion_control_table(ccp, ibdev, port);
+			goto bail;
+
+			/* FALLTHROUGH */
+		default:
+			ccp->status |= IB_SMP_UNSUP_METH_ATTR;
+			ret = reply((struct ib_smp *) ccp);
+			goto bail;
+		}
+
+	case IB_MGMT_METHOD_GET_RESP:
+		/*
+		 * The ib_mad module will call us to process responses
+		 * before checking for other consumers.
+		 * Just tell the caller to process it normally.
+		 */
+		ret = IB_MAD_RESULT_SUCCESS;
+		goto bail;
+
+	case IB_MGMT_METHOD_TRAP:
+	default:
+		ccp->status |= IB_SMP_UNSUP_METHOD;
+		ret = reply((struct ib_smp *) ccp);
+	}
+
+bail:
+	return ret;
+}
+
+/**
+ * qib_process_mad - process an incoming MAD packet
+ * @ibdev: the infiniband device this packet came in on
+ * @mad_flags: MAD flags
+ * @port: the port number this packet came in on
+ * @in_wc: the work completion entry for this packet
+ * @in_grh: the global route header for this packet
+ * @in_mad: the incoming MAD
+ * @out_mad: any outgoing MAD reply
+ *
+ * Returns IB_MAD_RESULT_SUCCESS if this is a MAD that we are not
+ * interested in processing.
+ *
+ * Note that the verbs framework has already done the MAD sanity checks,
+ * and hop count/pointer updating for IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE
+ * MADs.
+ *
+ * This is called by the ib_mad module.
+ */
+int qib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port,
+		    struct ib_wc *in_wc, struct ib_grh *in_grh,
+		    struct ib_mad *in_mad, struct ib_mad *out_mad)
+{
+	int ret;
+	struct qib_ibport *ibp = to_iport(ibdev, port);
+	struct qib_pportdata *ppd = ppd_from_ibp(ibp);
+
+	switch (in_mad->mad_hdr.mgmt_class) {
+	case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE:
+	case IB_MGMT_CLASS_SUBN_LID_ROUTED:
+		ret = process_subn(ibdev, mad_flags, port, in_mad, out_mad);
+		goto bail;
+
+	case IB_MGMT_CLASS_PERF_MGMT:
+		ret = process_perf(ibdev, port, in_mad, out_mad);
+		goto bail;
+
+	case IB_MGMT_CLASS_CONG_MGMT:
+		if (!ppd->congestion_entries_shadow ||
+			 !qib_cc_table_size) {
+			ret = IB_MAD_RESULT_SUCCESS;
+			goto bail;
+		}
+		ret = process_cc(ibdev, mad_flags, port, in_mad, out_mad);
+		goto bail;
+
+	default:
+		ret = IB_MAD_RESULT_SUCCESS;
+	}
+
+bail:
+	return ret;
+}
+
+static void send_handler(struct ib_mad_agent *agent,
+			 struct ib_mad_send_wc *mad_send_wc)
+{
+	ib_free_send_mad(mad_send_wc->send_buf);
+}
+
+static void xmit_wait_timer_func(unsigned long opaque)
+{
+	struct qib_pportdata *ppd = (struct qib_pportdata *)opaque;
+	struct qib_devdata *dd = dd_from_ppd(ppd);
+	unsigned long flags;
+	u8 status;
+
+	spin_lock_irqsave(&ppd->ibport_data.lock, flags);
+	if (ppd->cong_stats.flags == IB_PMA_CONG_HW_CONTROL_SAMPLE) {
+		status = dd->f_portcntr(ppd, QIBPORTCNTR_PSSTAT);
+		if (status == IB_PMA_SAMPLE_STATUS_DONE) {
+			/* save counter cache */
+			cache_hw_sample_counters(ppd);
+			ppd->cong_stats.flags = IB_PMA_CONG_HW_CONTROL_TIMER;
+		} else
+			goto done;
+	}
+	ppd->cong_stats.counter = xmit_wait_get_value_delta(ppd);
+	dd->f_set_cntr_sample(ppd, QIB_CONG_TIMER_PSINTERVAL, 0x0);
+done:
+	spin_unlock_irqrestore(&ppd->ibport_data.lock, flags);
+	mod_timer(&ppd->cong_stats.timer, jiffies + HZ);
+}
+
+int qib_create_agents(struct qib_ibdev *dev)
+{
+	struct qib_devdata *dd = dd_from_dev(dev);
+	struct ib_mad_agent *agent;
+	struct qib_ibport *ibp;
+	int p;
+	int ret;
+
+	for (p = 0; p < dd->num_pports; p++) {
+		ibp = &dd->pport[p].ibport_data;
+		agent = ib_register_mad_agent(&dev->ibdev, p + 1, IB_QPT_SMI,
+					      NULL, 0, send_handler,
+					      NULL, NULL, 0);
+		if (IS_ERR(agent)) {
+			ret = PTR_ERR(agent);
+			goto err;
+		}
+
+		/* Initialize xmit_wait structure */
+		dd->pport[p].cong_stats.counter = 0;
+		init_timer(&dd->pport[p].cong_stats.timer);
+		dd->pport[p].cong_stats.timer.function = xmit_wait_timer_func;
+		dd->pport[p].cong_stats.timer.data =
+			(unsigned long)(&dd->pport[p]);
+		dd->pport[p].cong_stats.timer.expires = 0;
+		add_timer(&dd->pport[p].cong_stats.timer);
+
+		ibp->send_agent = agent;
+	}
+
+	return 0;
+
+err:
+	for (p = 0; p < dd->num_pports; p++) {
+		ibp = &dd->pport[p].ibport_data;
+		if (ibp->send_agent) {
+			agent = ibp->send_agent;
+			ibp->send_agent = NULL;
+			ib_unregister_mad_agent(agent);
+		}
+	}
+
+	return ret;
+}
+
+void qib_free_agents(struct qib_ibdev *dev)
+{
+	struct qib_devdata *dd = dd_from_dev(dev);
+	struct ib_mad_agent *agent;
+	struct qib_ibport *ibp;
+	int p;
+
+	for (p = 0; p < dd->num_pports; p++) {
+		ibp = &dd->pport[p].ibport_data;
+		if (ibp->send_agent) {
+			agent = ibp->send_agent;
+			ibp->send_agent = NULL;
+			ib_unregister_mad_agent(agent);
+		}
+		if (ibp->sm_ah) {
+			ib_destroy_ah(&ibp->sm_ah->ibah);
+			ibp->sm_ah = NULL;
+		}
+		if (dd->pport[p].cong_stats.timer.data)
+			del_timer_sync(&dd->pport[p].cong_stats.timer);
+	}
+}
diff --git a/drivers/infiniband/hw/qib/qib_mad.h b/drivers/infiniband/hw/qib/qib_mad.h
new file mode 100644
index 0000000..941d4d5
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_mad.h
@@ -0,0 +1,431 @@
+/*
+ * Copyright (c) 2012 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef _QIB_MAD_H
+#define _QIB_MAD_H
+
+#include <rdma/ib_pma.h>
+
+#define IB_SMP_UNSUP_VERSION    cpu_to_be16(0x0004)
+#define IB_SMP_UNSUP_METHOD     cpu_to_be16(0x0008)
+#define IB_SMP_UNSUP_METH_ATTR  cpu_to_be16(0x000C)
+#define IB_SMP_INVALID_FIELD    cpu_to_be16(0x001C)
+
+struct ib_node_info {
+	u8 base_version;
+	u8 class_version;
+	u8 node_type;
+	u8 num_ports;
+	__be64 sys_guid;
+	__be64 node_guid;
+	__be64 port_guid;
+	__be16 partition_cap;
+	__be16 device_id;
+	__be32 revision;
+	u8 local_port_num;
+	u8 vendor_id[3];
+} __packed;
+
+struct ib_mad_notice_attr {
+	u8 generic_type;
+	u8 prod_type_msb;
+	__be16 prod_type_lsb;
+	__be16 trap_num;
+	__be16 issuer_lid;
+	__be16 toggle_count;
+
+	union {
+		struct {
+			u8	details[54];
+		} raw_data;
+
+		struct {
+			__be16	reserved;
+			__be16	lid;		/* where violation happened */
+			u8	port_num;	/* where violation happened */
+		} __packed ntc_129_131;
+
+		struct {
+			__be16	reserved;
+			__be16	lid;		/* LID where change occurred */
+			u8	reserved2;
+			u8	local_changes;	/* low bit - local changes */
+			__be32	new_cap_mask;	/* new capability mask */
+			u8	reserved3;
+			u8	change_flags;	/* low 3 bits only */
+		} __packed ntc_144;
+
+		struct {
+			__be16	reserved;
+			__be16	lid;		/* lid where sys guid changed */
+			__be16	reserved2;
+			__be64	new_sys_guid;
+		} __packed ntc_145;
+
+		struct {
+			__be16	reserved;
+			__be16	lid;
+			__be16	dr_slid;
+			u8	method;
+			u8	reserved2;
+			__be16	attr_id;
+			__be32	attr_mod;
+			__be64	mkey;
+			u8	reserved3;
+			u8	dr_trunc_hop;
+			u8	dr_rtn_path[30];
+		} __packed ntc_256;
+
+		struct {
+			__be16		reserved;
+			__be16		lid1;
+			__be16		lid2;
+			__be32		key;
+			__be32		sl_qp1;	/* SL: high 4 bits */
+			__be32		qp2;	/* high 8 bits reserved */
+			union ib_gid	gid1;
+			union ib_gid	gid2;
+		} __packed ntc_257_258;
+
+	} details;
+};
+
+/*
+ * Generic trap/notice types
+ */
+#define IB_NOTICE_TYPE_FATAL	0x80
+#define IB_NOTICE_TYPE_URGENT	0x81
+#define IB_NOTICE_TYPE_SECURITY	0x82
+#define IB_NOTICE_TYPE_SM	0x83
+#define IB_NOTICE_TYPE_INFO	0x84
+
+/*
+ * Generic trap/notice producers
+ */
+#define IB_NOTICE_PROD_CA		cpu_to_be16(1)
+#define IB_NOTICE_PROD_SWITCH		cpu_to_be16(2)
+#define IB_NOTICE_PROD_ROUTER		cpu_to_be16(3)
+#define IB_NOTICE_PROD_CLASS_MGR	cpu_to_be16(4)
+
+/*
+ * Generic trap/notice numbers
+ */
+#define IB_NOTICE_TRAP_LLI_THRESH	cpu_to_be16(129)
+#define IB_NOTICE_TRAP_EBO_THRESH	cpu_to_be16(130)
+#define IB_NOTICE_TRAP_FLOW_UPDATE	cpu_to_be16(131)
+#define IB_NOTICE_TRAP_CAP_MASK_CHG	cpu_to_be16(144)
+#define IB_NOTICE_TRAP_SYS_GUID_CHG	cpu_to_be16(145)
+#define IB_NOTICE_TRAP_BAD_MKEY		cpu_to_be16(256)
+#define IB_NOTICE_TRAP_BAD_PKEY		cpu_to_be16(257)
+#define IB_NOTICE_TRAP_BAD_QKEY		cpu_to_be16(258)
+
+/*
+ * Repress trap/notice flags
+ */
+#define IB_NOTICE_REPRESS_LLI_THRESH	(1 << 0)
+#define IB_NOTICE_REPRESS_EBO_THRESH	(1 << 1)
+#define IB_NOTICE_REPRESS_FLOW_UPDATE	(1 << 2)
+#define IB_NOTICE_REPRESS_CAP_MASK_CHG	(1 << 3)
+#define IB_NOTICE_REPRESS_SYS_GUID_CHG	(1 << 4)
+#define IB_NOTICE_REPRESS_BAD_MKEY	(1 << 5)
+#define IB_NOTICE_REPRESS_BAD_PKEY	(1 << 6)
+#define IB_NOTICE_REPRESS_BAD_QKEY	(1 << 7)
+
+/*
+ * Generic trap/notice other local changes flags (trap 144).
+ */
+#define IB_NOTICE_TRAP_LSE_CHG		0x04	/* Link Speed Enable changed */
+#define IB_NOTICE_TRAP_LWE_CHG		0x02	/* Link Width Enable changed */
+#define IB_NOTICE_TRAP_NODE_DESC_CHG	0x01
+
+/*
+ * Generic trap/notice M_Key volation flags in dr_trunc_hop (trap 256).
+ */
+#define IB_NOTICE_TRAP_DR_NOTICE	0x80
+#define IB_NOTICE_TRAP_DR_TRUNC		0x40
+
+struct ib_vl_weight_elem {
+	u8      vl;     /* Only low 4 bits, upper 4 bits reserved */
+	u8      weight;
+};
+
+#define IB_VLARB_LOWPRI_0_31    1
+#define IB_VLARB_LOWPRI_32_63   2
+#define IB_VLARB_HIGHPRI_0_31   3
+#define IB_VLARB_HIGHPRI_32_63  4
+
+#define IB_PMA_PORT_COUNTERS_CONG       cpu_to_be16(0xFF00)
+
+struct ib_pma_portcounters_cong {
+	u8 reserved;
+	u8 reserved1;
+	__be16 port_check_rate;
+	__be16 symbol_error_counter;
+	u8 link_error_recovery_counter;
+	u8 link_downed_counter;
+	__be16 port_rcv_errors;
+	__be16 port_rcv_remphys_errors;
+	__be16 port_rcv_switch_relay_errors;
+	__be16 port_xmit_discards;
+	u8 port_xmit_constraint_errors;
+	u8 port_rcv_constraint_errors;
+	u8 reserved2;
+	u8 link_overrun_errors; /* LocalLink: 7:4, BufferOverrun: 3:0 */
+	__be16 reserved3;
+	__be16 vl15_dropped;
+	__be64 port_xmit_data;
+	__be64 port_rcv_data;
+	__be64 port_xmit_packets;
+	__be64 port_rcv_packets;
+	__be64 port_xmit_wait;
+	__be64 port_adr_events;
+} __packed;
+
+#define IB_PMA_CONG_HW_CONTROL_TIMER            0x00
+#define IB_PMA_CONG_HW_CONTROL_SAMPLE           0x01
+
+#define QIB_XMIT_RATE_UNSUPPORTED               0x0
+#define QIB_XMIT_RATE_PICO                      0x7
+/* number of 4nsec cycles equaling 2secs */
+#define QIB_CONG_TIMER_PSINTERVAL               0x1DCD64EC
+
+#define IB_PMA_SEL_CONG_ALL                     0x01
+#define IB_PMA_SEL_CONG_PORT_DATA               0x02
+#define IB_PMA_SEL_CONG_XMIT                    0x04
+#define IB_PMA_SEL_CONG_ROUTING                 0x08
+
+/*
+ * Congestion control class attributes
+ */
+#define IB_CC_ATTR_CLASSPORTINFO			cpu_to_be16(0x0001)
+#define IB_CC_ATTR_NOTICE				cpu_to_be16(0x0002)
+#define IB_CC_ATTR_CONGESTION_INFO			cpu_to_be16(0x0011)
+#define IB_CC_ATTR_CONGESTION_KEY_INFO			cpu_to_be16(0x0012)
+#define IB_CC_ATTR_CONGESTION_LOG			cpu_to_be16(0x0013)
+#define IB_CC_ATTR_SWITCH_CONGESTION_SETTING		cpu_to_be16(0x0014)
+#define IB_CC_ATTR_SWITCH_PORT_CONGESTION_SETTING	cpu_to_be16(0x0015)
+#define IB_CC_ATTR_CA_CONGESTION_SETTING		cpu_to_be16(0x0016)
+#define IB_CC_ATTR_CONGESTION_CONTROL_TABLE		cpu_to_be16(0x0017)
+#define IB_CC_ATTR_TIME_STAMP				cpu_to_be16(0x0018)
+
+/* generalizations for threshold values */
+#define IB_CC_THRESHOLD_NONE 0x0
+#define IB_CC_THRESHOLD_MIN  0x1
+#define IB_CC_THRESHOLD_MAX  0xf
+
+/* CCA MAD header constants */
+#define IB_CC_MAD_LOGDATA_LEN 32
+#define IB_CC_MAD_MGMTDATA_LEN 192
+
+struct ib_cc_mad {
+	u8	base_version;
+	u8	mgmt_class;
+	u8	class_version;
+	u8	method;
+	__be16	status;
+	__be16	class_specific;
+	__be64	tid;
+	__be16	attr_id;
+	__be16	resv;
+	__be32	attr_mod;
+	__be64 cckey;
+
+	/* For CongestionLog attribute only */
+	u8 log_data[IB_CC_MAD_LOGDATA_LEN];
+
+	u8 mgmt_data[IB_CC_MAD_MGMTDATA_LEN];
+} __packed;
+
+/*
+ * Congestion Control class portinfo capability mask bits
+ */
+#define IB_CC_CPI_CM_TRAP_GEN		cpu_to_be16(1 << 0)
+#define IB_CC_CPI_CM_GET_SET_NOTICE	cpu_to_be16(1 << 1)
+#define IB_CC_CPI_CM_CAP2		cpu_to_be16(1 << 2)
+#define IB_CC_CPI_CM_ENHANCEDPORT0_CC	cpu_to_be16(1 << 8)
+
+struct ib_cc_classportinfo_attr {
+	u8 base_version;
+	u8 class_version;
+	__be16 cap_mask;
+	u8 reserved[3];
+	u8 resp_time_value;     /* only lower 5 bits */
+	union ib_gid redirect_gid;
+	__be32 redirect_tc_sl_fl;       /* 8, 4, 20 bits respectively */
+	__be16 redirect_lid;
+	__be16 redirect_pkey;
+	__be32 redirect_qp;     /* only lower 24 bits */
+	__be32 redirect_qkey;
+	union ib_gid trap_gid;
+	__be32 trap_tc_sl_fl;   /* 8, 4, 20 bits respectively */
+	__be16 trap_lid;
+	__be16 trap_pkey;
+	__be32 trap_hl_qp;      /* 8, 24 bits respectively */
+	__be32 trap_qkey;
+} __packed;
+
+/* Congestion control traps */
+#define IB_CC_TRAP_KEY_VIOLATION 0x0000
+
+struct ib_cc_trap_key_violation_attr {
+	__be16 source_lid;
+	u8 method;
+	u8 reserved1;
+	__be16 attrib_id;
+	__be32 attrib_mod;
+	__be32 qp;
+	__be64 cckey;
+	u8 sgid[16];
+	u8 padding[24];
+} __packed;
+
+/* Congestion info flags */
+#define IB_CC_CI_FLAGS_CREDIT_STARVATION 0x1
+#define IB_CC_TABLE_CAP_DEFAULT 31
+
+struct ib_cc_info_attr {
+	__be16 congestion_info;
+	u8  control_table_cap; /* Multiple of 64 entry unit CCTs */
+} __packed;
+
+struct ib_cc_key_info_attr {
+	__be64 cckey;
+	u8  protect;
+	__be16 lease_period;
+	__be16 violations;
+} __packed;
+
+#define IB_CC_CL_CA_LOGEVENTS_LEN 208
+
+struct ib_cc_log_attr {
+	u8 log_type;
+	u8 congestion_flags;
+	__be16 threshold_event_counter;
+	__be16 threshold_congestion_event_map;
+	__be16 current_time_stamp;
+	u8 log_events[IB_CC_CL_CA_LOGEVENTS_LEN];
+} __packed;
+
+#define IB_CC_CLEC_SERVICETYPE_RC 0x0
+#define IB_CC_CLEC_SERVICETYPE_UC 0x1
+#define IB_CC_CLEC_SERVICETYPE_RD 0x2
+#define IB_CC_CLEC_SERVICETYPE_UD 0x3
+
+struct ib_cc_log_event {
+	u8 local_qp_cn_entry;
+	u8 remote_qp_number_cn_entry[3];
+	u8  sl_cn_entry:4;
+	u8  service_type_cn_entry:4;
+	__be32 remote_lid_cn_entry;
+	__be32 timestamp_cn_entry;
+} __packed;
+
+/* Sixteen congestion entries */
+#define IB_CC_CCS_ENTRIES 16
+
+/* Port control flags */
+#define IB_CC_CCS_PC_SL_BASED 0x01
+
+struct ib_cc_congestion_entry {
+	u8 ccti_increase;
+	__be16 ccti_timer;
+	u8 trigger_threshold;
+	u8 ccti_min; /* min CCTI for cc table */
+} __packed;
+
+struct ib_cc_congestion_entry_shadow {
+	u8 ccti_increase;
+	u16 ccti_timer;
+	u8 trigger_threshold;
+	u8 ccti_min; /* min CCTI for cc table */
+} __packed;
+
+struct ib_cc_congestion_setting_attr {
+	__be16 port_control;
+	__be16 control_map;
+	struct ib_cc_congestion_entry entries[IB_CC_CCS_ENTRIES];
+} __packed;
+
+struct ib_cc_congestion_setting_attr_shadow {
+	u16 port_control;
+	u16 control_map;
+	struct ib_cc_congestion_entry_shadow entries[IB_CC_CCS_ENTRIES];
+} __packed;
+
+#define IB_CC_TABLE_ENTRY_INCREASE_DEFAULT 1
+#define IB_CC_TABLE_ENTRY_TIMER_DEFAULT 1
+
+/* 64 Congestion Control table entries in a single MAD */
+#define IB_CCT_ENTRIES 64
+#define IB_CCT_MIN_ENTRIES (IB_CCT_ENTRIES * 2)
+
+struct ib_cc_table_entry {
+	__be16 entry; /* shift:2, multiplier:14 */
+};
+
+struct ib_cc_table_entry_shadow {
+	u16 entry; /* shift:2, multiplier:14 */
+};
+
+struct ib_cc_table_attr {
+	__be16 ccti_limit; /* max CCTI for cc table */
+	struct ib_cc_table_entry ccti_entries[IB_CCT_ENTRIES];
+} __packed;
+
+struct ib_cc_table_attr_shadow {
+	u16 ccti_limit; /* max CCTI for cc table */
+	struct ib_cc_table_entry_shadow ccti_entries[IB_CCT_ENTRIES];
+} __packed;
+
+#define CC_TABLE_SHADOW_MAX \
+	(IB_CC_TABLE_CAP_DEFAULT * IB_CCT_ENTRIES)
+
+struct cc_table_shadow {
+	u16 ccti_last_entry;
+	struct ib_cc_table_entry_shadow entries[CC_TABLE_SHADOW_MAX];
+} __packed;
+
+/*
+ * The PortSamplesControl.CounterMasks field is an array of 3 bit fields
+ * which specify the N'th counter's capabilities. See ch. 16.1.3.2.
+ * We support 5 counters which only count the mandatory quantities.
+ */
+#define COUNTER_MASK(q, n) (q << ((9 - n) * 3))
+#define COUNTER_MASK0_9 \
+	cpu_to_be32(COUNTER_MASK(1, 0) | \
+		    COUNTER_MASK(1, 1) | \
+		    COUNTER_MASK(1, 2) | \
+		    COUNTER_MASK(1, 3) | \
+		    COUNTER_MASK(1, 4))
+
+#endif				/* _QIB_MAD_H */
diff --git a/drivers/infiniband/hw/qib/qib_mmap.c b/drivers/infiniband/hw/qib/qib_mmap.c
new file mode 100644
index 0000000..8b73a11
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_mmap.c
@@ -0,0 +1,174 @@
+/*
+ * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <asm/pgtable.h>
+
+#include "qib_verbs.h"
+
+/**
+ * qib_release_mmap_info - free mmap info structure
+ * @ref: a pointer to the kref within struct qib_mmap_info
+ */
+void qib_release_mmap_info(struct kref *ref)
+{
+	struct qib_mmap_info *ip =
+		container_of(ref, struct qib_mmap_info, ref);
+	struct qib_ibdev *dev = to_idev(ip->context->device);
+
+	spin_lock_irq(&dev->pending_lock);
+	list_del(&ip->pending_mmaps);
+	spin_unlock_irq(&dev->pending_lock);
+
+	vfree(ip->obj);
+	kfree(ip);
+}
+
+/*
+ * open and close keep track of how many times the CQ is mapped,
+ * to avoid releasing it.
+ */
+static void qib_vma_open(struct vm_area_struct *vma)
+{
+	struct qib_mmap_info *ip = vma->vm_private_data;
+
+	kref_get(&ip->ref);
+}
+
+static void qib_vma_close(struct vm_area_struct *vma)
+{
+	struct qib_mmap_info *ip = vma->vm_private_data;
+
+	kref_put(&ip->ref, qib_release_mmap_info);
+}
+
+static struct vm_operations_struct qib_vm_ops = {
+	.open =     qib_vma_open,
+	.close =    qib_vma_close,
+};
+
+/**
+ * qib_mmap - create a new mmap region
+ * @context: the IB user context of the process making the mmap() call
+ * @vma: the VMA to be initialized
+ * Return zero if the mmap is OK. Otherwise, return an errno.
+ */
+int qib_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
+{
+	struct qib_ibdev *dev = to_idev(context->device);
+	unsigned long offset = vma->vm_pgoff << PAGE_SHIFT;
+	unsigned long size = vma->vm_end - vma->vm_start;
+	struct qib_mmap_info *ip, *pp;
+	int ret = -EINVAL;
+
+	/*
+	 * Search the device's list of objects waiting for a mmap call.
+	 * Normally, this list is very short since a call to create a
+	 * CQ, QP, or SRQ is soon followed by a call to mmap().
+	 */
+	spin_lock_irq(&dev->pending_lock);
+	list_for_each_entry_safe(ip, pp, &dev->pending_mmaps,
+				 pending_mmaps) {
+		/* Only the creator is allowed to mmap the object */
+		if (context != ip->context || (__u64) offset != ip->offset)
+			continue;
+		/* Don't allow a mmap larger than the object. */
+		if (size > ip->size)
+			break;
+
+		list_del_init(&ip->pending_mmaps);
+		spin_unlock_irq(&dev->pending_lock);
+
+		ret = remap_vmalloc_range(vma, ip->obj, 0);
+		if (ret)
+			goto done;
+		vma->vm_ops = &qib_vm_ops;
+		vma->vm_private_data = ip;
+		qib_vma_open(vma);
+		goto done;
+	}
+	spin_unlock_irq(&dev->pending_lock);
+done:
+	return ret;
+}
+
+/*
+ * Allocate information for qib_mmap
+ */
+struct qib_mmap_info *qib_create_mmap_info(struct qib_ibdev *dev,
+					   u32 size,
+					   struct ib_ucontext *context,
+					   void *obj) {
+	struct qib_mmap_info *ip;
+
+	ip = kmalloc(sizeof *ip, GFP_KERNEL);
+	if (!ip)
+		goto bail;
+
+	size = PAGE_ALIGN(size);
+
+	spin_lock_irq(&dev->mmap_offset_lock);
+	if (dev->mmap_offset == 0)
+		dev->mmap_offset = PAGE_SIZE;
+	ip->offset = dev->mmap_offset;
+	dev->mmap_offset += size;
+	spin_unlock_irq(&dev->mmap_offset_lock);
+
+	INIT_LIST_HEAD(&ip->pending_mmaps);
+	ip->size = size;
+	ip->context = context;
+	ip->obj = obj;
+	kref_init(&ip->ref);
+
+bail:
+	return ip;
+}
+
+void qib_update_mmap_info(struct qib_ibdev *dev, struct qib_mmap_info *ip,
+			  u32 size, void *obj)
+{
+	size = PAGE_ALIGN(size);
+
+	spin_lock_irq(&dev->mmap_offset_lock);
+	if (dev->mmap_offset == 0)
+		dev->mmap_offset = PAGE_SIZE;
+	ip->offset = dev->mmap_offset;
+	dev->mmap_offset += size;
+	spin_unlock_irq(&dev->mmap_offset_lock);
+
+	ip->size = size;
+	ip->obj = obj;
+}
diff --git a/drivers/infiniband/hw/qib/qib_mr.c b/drivers/infiniband/hw/qib/qib_mr.c
new file mode 100644
index 0000000..9bbb553
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_mr.c
@@ -0,0 +1,532 @@
+/*
+ * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <rdma/ib_umem.h>
+#include <rdma/ib_smi.h>
+
+#include "qib.h"
+
+/* Fast memory region */
+struct qib_fmr {
+	struct ib_fmr ibfmr;
+	struct qib_mregion mr;        /* must be last */
+};
+
+static inline struct qib_fmr *to_ifmr(struct ib_fmr *ibfmr)
+{
+	return container_of(ibfmr, struct qib_fmr, ibfmr);
+}
+
+static int init_qib_mregion(struct qib_mregion *mr, struct ib_pd *pd,
+	int count)
+{
+	int m, i = 0;
+	int rval = 0;
+
+	m = (count + QIB_SEGSZ - 1) / QIB_SEGSZ;
+	for (; i < m; i++) {
+		mr->map[i] = kzalloc(sizeof *mr->map[0], GFP_KERNEL);
+		if (!mr->map[i])
+			goto bail;
+	}
+	mr->mapsz = m;
+	init_completion(&mr->comp);
+	/* count returning the ptr to user */
+	atomic_set(&mr->refcount, 1);
+	mr->pd = pd;
+	mr->max_segs = count;
+out:
+	return rval;
+bail:
+	while (i)
+		kfree(mr->map[--i]);
+	rval = -ENOMEM;
+	goto out;
+}
+
+static void deinit_qib_mregion(struct qib_mregion *mr)
+{
+	int i = mr->mapsz;
+
+	mr->mapsz = 0;
+	while (i)
+		kfree(mr->map[--i]);
+}
+
+
+/**
+ * qib_get_dma_mr - get a DMA memory region
+ * @pd: protection domain for this memory region
+ * @acc: access flags
+ *
+ * Returns the memory region on success, otherwise returns an errno.
+ * Note that all DMA addresses should be created via the
+ * struct ib_dma_mapping_ops functions (see qib_dma.c).
+ */
+struct ib_mr *qib_get_dma_mr(struct ib_pd *pd, int acc)
+{
+	struct qib_mr *mr = NULL;
+	struct ib_mr *ret;
+	int rval;
+
+	if (to_ipd(pd)->user) {
+		ret = ERR_PTR(-EPERM);
+		goto bail;
+	}
+
+	mr = kzalloc(sizeof *mr, GFP_KERNEL);
+	if (!mr) {
+		ret = ERR_PTR(-ENOMEM);
+		goto bail;
+	}
+
+	rval = init_qib_mregion(&mr->mr, pd, 0);
+	if (rval) {
+		ret = ERR_PTR(rval);
+		goto bail;
+	}
+
+
+	rval = qib_alloc_lkey(&mr->mr, 1);
+	if (rval) {
+		ret = ERR_PTR(rval);
+		goto bail_mregion;
+	}
+
+	mr->mr.access_flags = acc;
+	ret = &mr->ibmr;
+done:
+	return ret;
+
+bail_mregion:
+	deinit_qib_mregion(&mr->mr);
+bail:
+	kfree(mr);
+	goto done;
+}
+
+static struct qib_mr *alloc_mr(int count, struct ib_pd *pd)
+{
+	struct qib_mr *mr;
+	int rval = -ENOMEM;
+	int m;
+
+	/* Allocate struct plus pointers to first level page tables. */
+	m = (count + QIB_SEGSZ - 1) / QIB_SEGSZ;
+	mr = kzalloc(sizeof *mr + m * sizeof mr->mr.map[0], GFP_KERNEL);
+	if (!mr)
+		goto bail;
+
+	rval = init_qib_mregion(&mr->mr, pd, count);
+	if (rval)
+		goto bail;
+	/*
+	 * ib_reg_phys_mr() will initialize mr->ibmr except for
+	 * lkey and rkey.
+	 */
+	rval = qib_alloc_lkey(&mr->mr, 0);
+	if (rval)
+		goto bail_mregion;
+	mr->ibmr.lkey = mr->mr.lkey;
+	mr->ibmr.rkey = mr->mr.lkey;
+done:
+	return mr;
+
+bail_mregion:
+	deinit_qib_mregion(&mr->mr);
+bail:
+	kfree(mr);
+	mr = ERR_PTR(rval);
+	goto done;
+}
+
+/**
+ * qib_reg_phys_mr - register a physical memory region
+ * @pd: protection domain for this memory region
+ * @buffer_list: pointer to the list of physical buffers to register
+ * @num_phys_buf: the number of physical buffers to register
+ * @iova_start: the starting address passed over IB which maps to this MR
+ *
+ * Returns the memory region on success, otherwise returns an errno.
+ */
+struct ib_mr *qib_reg_phys_mr(struct ib_pd *pd,
+			      struct ib_phys_buf *buffer_list,
+			      int num_phys_buf, int acc, u64 *iova_start)
+{
+	struct qib_mr *mr;
+	int n, m, i;
+	struct ib_mr *ret;
+
+	mr = alloc_mr(num_phys_buf, pd);
+	if (IS_ERR(mr)) {
+		ret = (struct ib_mr *)mr;
+		goto bail;
+	}
+
+	mr->mr.user_base = *iova_start;
+	mr->mr.iova = *iova_start;
+	mr->mr.access_flags = acc;
+
+	m = 0;
+	n = 0;
+	for (i = 0; i < num_phys_buf; i++) {
+		mr->mr.map[m]->segs[n].vaddr = (void *) buffer_list[i].addr;
+		mr->mr.map[m]->segs[n].length = buffer_list[i].size;
+		mr->mr.length += buffer_list[i].size;
+		n++;
+		if (n == QIB_SEGSZ) {
+			m++;
+			n = 0;
+		}
+	}
+
+	ret = &mr->ibmr;
+
+bail:
+	return ret;
+}
+
+/**
+ * qib_reg_user_mr - register a userspace memory region
+ * @pd: protection domain for this memory region
+ * @start: starting userspace address
+ * @length: length of region to register
+ * @mr_access_flags: access flags for this memory region
+ * @udata: unused by the QLogic_IB driver
+ *
+ * Returns the memory region on success, otherwise returns an errno.
+ */
+struct ib_mr *qib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+			      u64 virt_addr, int mr_access_flags,
+			      struct ib_udata *udata)
+{
+	struct qib_mr *mr;
+	struct ib_umem *umem;
+	struct scatterlist *sg;
+	int n, m, entry;
+	struct ib_mr *ret;
+
+	if (length == 0) {
+		ret = ERR_PTR(-EINVAL);
+		goto bail;
+	}
+
+	umem = ib_umem_get(pd->uobject->context, start, length,
+			   mr_access_flags, 0);
+	if (IS_ERR(umem))
+		return (void *) umem;
+
+	n = umem->nmap;
+
+	mr = alloc_mr(n, pd);
+	if (IS_ERR(mr)) {
+		ret = (struct ib_mr *)mr;
+		ib_umem_release(umem);
+		goto bail;
+	}
+
+	mr->mr.user_base = start;
+	mr->mr.iova = virt_addr;
+	mr->mr.length = length;
+	mr->mr.offset = umem->offset;
+	mr->mr.access_flags = mr_access_flags;
+	mr->umem = umem;
+
+	if (is_power_of_2(umem->page_size))
+		mr->mr.page_shift = ilog2(umem->page_size);
+	m = 0;
+	n = 0;
+	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
+			void *vaddr;
+
+			vaddr = page_address(sg_page(sg));
+			if (!vaddr) {
+				ret = ERR_PTR(-EINVAL);
+				goto bail;
+			}
+			mr->mr.map[m]->segs[n].vaddr = vaddr;
+			mr->mr.map[m]->segs[n].length = umem->page_size;
+			n++;
+			if (n == QIB_SEGSZ) {
+				m++;
+				n = 0;
+			}
+	}
+	ret = &mr->ibmr;
+
+bail:
+	return ret;
+}
+
+/**
+ * qib_dereg_mr - unregister and free a memory region
+ * @ibmr: the memory region to free
+ *
+ * Returns 0 on success.
+ *
+ * Note that this is called to free MRs created by qib_get_dma_mr()
+ * or qib_reg_user_mr().
+ */
+int qib_dereg_mr(struct ib_mr *ibmr)
+{
+	struct qib_mr *mr = to_imr(ibmr);
+	int ret = 0;
+	unsigned long timeout;
+
+	qib_free_lkey(&mr->mr);
+
+	qib_put_mr(&mr->mr); /* will set completion if last */
+	timeout = wait_for_completion_timeout(&mr->mr.comp,
+		5 * HZ);
+	if (!timeout) {
+		qib_get_mr(&mr->mr);
+		ret = -EBUSY;
+		goto out;
+	}
+	deinit_qib_mregion(&mr->mr);
+	if (mr->umem)
+		ib_umem_release(mr->umem);
+	kfree(mr);
+out:
+	return ret;
+}
+
+/*
+ * Allocate a memory region usable with the
+ * IB_WR_FAST_REG_MR send work request.
+ *
+ * Return the memory region on success, otherwise return an errno.
+ */
+struct ib_mr *qib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len)
+{
+	struct qib_mr *mr;
+
+	mr = alloc_mr(max_page_list_len, pd);
+	if (IS_ERR(mr))
+		return (struct ib_mr *)mr;
+
+	return &mr->ibmr;
+}
+
+struct ib_fast_reg_page_list *
+qib_alloc_fast_reg_page_list(struct ib_device *ibdev, int page_list_len)
+{
+	unsigned size = page_list_len * sizeof(u64);
+	struct ib_fast_reg_page_list *pl;
+
+	if (size > PAGE_SIZE)
+		return ERR_PTR(-EINVAL);
+
+	pl = kzalloc(sizeof *pl, GFP_KERNEL);
+	if (!pl)
+		return ERR_PTR(-ENOMEM);
+
+	pl->page_list = kzalloc(size, GFP_KERNEL);
+	if (!pl->page_list)
+		goto err_free;
+
+	return pl;
+
+err_free:
+	kfree(pl);
+	return ERR_PTR(-ENOMEM);
+}
+
+void qib_free_fast_reg_page_list(struct ib_fast_reg_page_list *pl)
+{
+	kfree(pl->page_list);
+	kfree(pl);
+}
+
+/**
+ * qib_alloc_fmr - allocate a fast memory region
+ * @pd: the protection domain for this memory region
+ * @mr_access_flags: access flags for this memory region
+ * @fmr_attr: fast memory region attributes
+ *
+ * Returns the memory region on success, otherwise returns an errno.
+ */
+struct ib_fmr *qib_alloc_fmr(struct ib_pd *pd, int mr_access_flags,
+			     struct ib_fmr_attr *fmr_attr)
+{
+	struct qib_fmr *fmr;
+	int m;
+	struct ib_fmr *ret;
+	int rval = -ENOMEM;
+
+	/* Allocate struct plus pointers to first level page tables. */
+	m = (fmr_attr->max_pages + QIB_SEGSZ - 1) / QIB_SEGSZ;
+	fmr = kzalloc(sizeof *fmr + m * sizeof fmr->mr.map[0], GFP_KERNEL);
+	if (!fmr)
+		goto bail;
+
+	rval = init_qib_mregion(&fmr->mr, pd, fmr_attr->max_pages);
+	if (rval)
+		goto bail;
+
+	/*
+	 * ib_alloc_fmr() will initialize fmr->ibfmr except for lkey &
+	 * rkey.
+	 */
+	rval = qib_alloc_lkey(&fmr->mr, 0);
+	if (rval)
+		goto bail_mregion;
+	fmr->ibfmr.rkey = fmr->mr.lkey;
+	fmr->ibfmr.lkey = fmr->mr.lkey;
+	/*
+	 * Resources are allocated but no valid mapping (RKEY can't be
+	 * used).
+	 */
+	fmr->mr.access_flags = mr_access_flags;
+	fmr->mr.max_segs = fmr_attr->max_pages;
+	fmr->mr.page_shift = fmr_attr->page_shift;
+
+	ret = &fmr->ibfmr;
+done:
+	return ret;
+
+bail_mregion:
+	deinit_qib_mregion(&fmr->mr);
+bail:
+	kfree(fmr);
+	ret = ERR_PTR(rval);
+	goto done;
+}
+
+/**
+ * qib_map_phys_fmr - set up a fast memory region
+ * @ibmfr: the fast memory region to set up
+ * @page_list: the list of pages to associate with the fast memory region
+ * @list_len: the number of pages to associate with the fast memory region
+ * @iova: the virtual address of the start of the fast memory region
+ *
+ * This may be called from interrupt context.
+ */
+
+int qib_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list,
+		     int list_len, u64 iova)
+{
+	struct qib_fmr *fmr = to_ifmr(ibfmr);
+	struct qib_lkey_table *rkt;
+	unsigned long flags;
+	int m, n, i;
+	u32 ps;
+	int ret;
+
+	i = atomic_read(&fmr->mr.refcount);
+	if (i > 2)
+		return -EBUSY;
+
+	if (list_len > fmr->mr.max_segs) {
+		ret = -EINVAL;
+		goto bail;
+	}
+	rkt = &to_idev(ibfmr->device)->lk_table;
+	spin_lock_irqsave(&rkt->lock, flags);
+	fmr->mr.user_base = iova;
+	fmr->mr.iova = iova;
+	ps = 1 << fmr->mr.page_shift;
+	fmr->mr.length = list_len * ps;
+	m = 0;
+	n = 0;
+	for (i = 0; i < list_len; i++) {
+		fmr->mr.map[m]->segs[n].vaddr = (void *) page_list[i];
+		fmr->mr.map[m]->segs[n].length = ps;
+		if (++n == QIB_SEGSZ) {
+			m++;
+			n = 0;
+		}
+	}
+	spin_unlock_irqrestore(&rkt->lock, flags);
+	ret = 0;
+
+bail:
+	return ret;
+}
+
+/**
+ * qib_unmap_fmr - unmap fast memory regions
+ * @fmr_list: the list of fast memory regions to unmap
+ *
+ * Returns 0 on success.
+ */
+int qib_unmap_fmr(struct list_head *fmr_list)
+{
+	struct qib_fmr *fmr;
+	struct qib_lkey_table *rkt;
+	unsigned long flags;
+
+	list_for_each_entry(fmr, fmr_list, ibfmr.list) {
+		rkt = &to_idev(fmr->ibfmr.device)->lk_table;
+		spin_lock_irqsave(&rkt->lock, flags);
+		fmr->mr.user_base = 0;
+		fmr->mr.iova = 0;
+		fmr->mr.length = 0;
+		spin_unlock_irqrestore(&rkt->lock, flags);
+	}
+	return 0;
+}
+
+/**
+ * qib_dealloc_fmr - deallocate a fast memory region
+ * @ibfmr: the fast memory region to deallocate
+ *
+ * Returns 0 on success.
+ */
+int qib_dealloc_fmr(struct ib_fmr *ibfmr)
+{
+	struct qib_fmr *fmr = to_ifmr(ibfmr);
+	int ret = 0;
+	unsigned long timeout;
+
+	qib_free_lkey(&fmr->mr);
+	qib_put_mr(&fmr->mr); /* will set completion if last */
+	timeout = wait_for_completion_timeout(&fmr->mr.comp,
+		5 * HZ);
+	if (!timeout) {
+		qib_get_mr(&fmr->mr);
+		ret = -EBUSY;
+		goto out;
+	}
+	deinit_qib_mregion(&fmr->mr);
+	kfree(fmr);
+out:
+	return ret;
+}
+
+void mr_rcu_callback(struct rcu_head *list)
+{
+	struct qib_mregion *mr = container_of(list, struct qib_mregion, list);
+
+	complete(&mr->comp);
+}
diff --git a/drivers/infiniband/hw/qib/qib_pcie.c b/drivers/infiniband/hw/qib/qib_pcie.c
new file mode 100644
index 0000000..61a0046
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_pcie.c
@@ -0,0 +1,715 @@
+/*
+ * Copyright (c) 2008, 2009 QLogic Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/pci.h>
+#include <linux/io.h>
+#include <linux/delay.h>
+#include <linux/vmalloc.h>
+#include <linux/aer.h>
+#include <linux/module.h>
+
+#include "qib.h"
+
+/*
+ * This file contains PCIe utility routines that are common to the
+ * various QLogic InfiniPath adapters
+ */
+
+/*
+ * Code to adjust PCIe capabilities.
+ * To minimize the change footprint, we call it
+ * from qib_pcie_params, which every chip-specific
+ * file calls, even though this violates some
+ * expectations of harmlessness.
+ */
+static void qib_tune_pcie_caps(struct qib_devdata *);
+static void qib_tune_pcie_coalesce(struct qib_devdata *);
+
+/*
+ * Do all the common PCIe setup and initialization.
+ * devdata is not yet allocated, and is not allocated until after this
+ * routine returns success.  Therefore qib_dev_err() can't be used for error
+ * printing.
+ */
+int qib_pcie_init(struct pci_dev *pdev, const struct pci_device_id *ent)
+{
+	int ret;
+
+	ret = pci_enable_device(pdev);
+	if (ret) {
+		/*
+		 * This can happen (in theory) iff:
+		 * We did a chip reset, and then failed to reprogram the
+		 * BAR, or the chip reset due to an internal error.  We then
+		 * unloaded the driver and reloaded it.
+		 *
+		 * Both reset cases set the BAR back to initial state.  For
+		 * the latter case, the AER sticky error bit at offset 0x718
+		 * should be set, but the Linux kernel doesn't yet know
+		 * about that, it appears.  If the original BAR was retained
+		 * in the kernel data structures, this may be OK.
+		 */
+		qib_early_err(&pdev->dev, "pci enable failed: error %d\n",
+			      -ret);
+		goto done;
+	}
+
+	ret = pci_request_regions(pdev, QIB_DRV_NAME);
+	if (ret) {
+		qib_devinfo(pdev, "pci_request_regions fails: err %d\n", -ret);
+		goto bail;
+	}
+
+	ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
+	if (ret) {
+		/*
+		 * If the 64 bit setup fails, try 32 bit.  Some systems
+		 * do not setup 64 bit maps on systems with 2GB or less
+		 * memory installed.
+		 */
+		ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
+		if (ret) {
+			qib_devinfo(pdev, "Unable to set DMA mask: %d\n", ret);
+			goto bail;
+		}
+		ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32));
+	} else
+		ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
+	if (ret) {
+		qib_early_err(&pdev->dev,
+			      "Unable to set DMA consistent mask: %d\n", ret);
+		goto bail;
+	}
+
+	pci_set_master(pdev);
+	ret = pci_enable_pcie_error_reporting(pdev);
+	if (ret) {
+		qib_early_err(&pdev->dev,
+			      "Unable to enable pcie error reporting: %d\n",
+			      ret);
+		ret = 0;
+	}
+	goto done;
+
+bail:
+	pci_disable_device(pdev);
+	pci_release_regions(pdev);
+done:
+	return ret;
+}
+
+/*
+ * Do remaining PCIe setup, once dd is allocated, and save away
+ * fields required to re-initialize after a chip reset, or for
+ * various other purposes
+ */
+int qib_pcie_ddinit(struct qib_devdata *dd, struct pci_dev *pdev,
+		    const struct pci_device_id *ent)
+{
+	unsigned long len;
+	resource_size_t addr;
+
+	dd->pcidev = pdev;
+	pci_set_drvdata(pdev, dd);
+
+	addr = pci_resource_start(pdev, 0);
+	len = pci_resource_len(pdev, 0);
+
+#if defined(__powerpc__)
+	/* There isn't a generic way to specify writethrough mappings */
+	dd->kregbase = __ioremap(addr, len, _PAGE_NO_CACHE | _PAGE_WRITETHRU);
+#else
+	dd->kregbase = ioremap_nocache(addr, len);
+#endif
+
+	if (!dd->kregbase)
+		return -ENOMEM;
+
+	dd->kregend = (u64 __iomem *)((void __iomem *) dd->kregbase + len);
+	dd->physaddr = addr;        /* used for io_remap, etc. */
+
+	/*
+	 * Save BARs to rewrite after device reset.  Save all 64 bits of
+	 * BAR, just in case.
+	 */
+	dd->pcibar0 = addr;
+	dd->pcibar1 = addr >> 32;
+	dd->deviceid = ent->device; /* save for later use */
+	dd->vendorid = ent->vendor;
+
+	return 0;
+}
+
+/*
+ * Do PCIe cleanup, after chip-specific cleanup, etc.  Just prior
+ * to releasing the dd memory.
+ * void because none of the core pcie cleanup returns are void
+ */
+void qib_pcie_ddcleanup(struct qib_devdata *dd)
+{
+	u64 __iomem *base = (void __iomem *) dd->kregbase;
+
+	dd->kregbase = NULL;
+	iounmap(base);
+	if (dd->piobase)
+		iounmap(dd->piobase);
+	if (dd->userbase)
+		iounmap(dd->userbase);
+	if (dd->piovl15base)
+		iounmap(dd->piovl15base);
+
+	pci_disable_device(dd->pcidev);
+	pci_release_regions(dd->pcidev);
+
+	pci_set_drvdata(dd->pcidev, NULL);
+}
+
+static void qib_msix_setup(struct qib_devdata *dd, int pos, u32 *msixcnt,
+			   struct qib_msix_entry *qib_msix_entry)
+{
+	int ret;
+	int nvec = *msixcnt;
+	struct msix_entry *msix_entry;
+	int i;
+
+	ret = pci_msix_vec_count(dd->pcidev);
+	if (ret < 0)
+		goto do_intx;
+
+	nvec = min(nvec, ret);
+
+	/* We can't pass qib_msix_entry array to qib_msix_setup
+	 * so use a dummy msix_entry array and copy the allocated
+	 * irq back to the qib_msix_entry array. */
+	msix_entry = kmalloc(nvec * sizeof(*msix_entry), GFP_KERNEL);
+	if (!msix_entry)
+		goto do_intx;
+
+	for (i = 0; i < nvec; i++)
+		msix_entry[i] = qib_msix_entry[i].msix;
+
+	ret = pci_enable_msix_range(dd->pcidev, msix_entry, 1, nvec);
+	if (ret < 0)
+		goto free_msix_entry;
+	else
+		nvec = ret;
+
+	for (i = 0; i < nvec; i++)
+		qib_msix_entry[i].msix = msix_entry[i];
+
+	kfree(msix_entry);
+	*msixcnt = nvec;
+	return;
+
+free_msix_entry:
+	kfree(msix_entry);
+
+do_intx:
+	qib_dev_err(dd, "pci_enable_msix_range %d vectors failed: %d, "
+			"falling back to INTx\n", nvec, ret);
+	*msixcnt = 0;
+	qib_enable_intx(dd->pcidev);
+}
+
+/**
+ * We save the msi lo and hi values, so we can restore them after
+ * chip reset (the kernel PCI infrastructure doesn't yet handle that
+ * correctly.
+ */
+static int qib_msi_setup(struct qib_devdata *dd, int pos)
+{
+	struct pci_dev *pdev = dd->pcidev;
+	u16 control;
+	int ret;
+
+	ret = pci_enable_msi(pdev);
+	if (ret)
+		qib_dev_err(dd,
+			"pci_enable_msi failed: %d, interrupts may not work\n",
+			ret);
+	/* continue even if it fails, we may still be OK... */
+
+	pci_read_config_dword(pdev, pos + PCI_MSI_ADDRESS_LO,
+			      &dd->msi_lo);
+	pci_read_config_dword(pdev, pos + PCI_MSI_ADDRESS_HI,
+			      &dd->msi_hi);
+	pci_read_config_word(pdev, pos + PCI_MSI_FLAGS, &control);
+	/* now save the data (vector) info */
+	pci_read_config_word(pdev, pos + ((control & PCI_MSI_FLAGS_64BIT)
+				    ? 12 : 8),
+			     &dd->msi_data);
+	return ret;
+}
+
+int qib_pcie_params(struct qib_devdata *dd, u32 minw, u32 *nent,
+		    struct qib_msix_entry *entry)
+{
+	u16 linkstat, speed;
+	int pos = 0, ret = 1;
+
+	if (!pci_is_pcie(dd->pcidev)) {
+		qib_dev_err(dd, "Can't find PCI Express capability!\n");
+		/* set up something... */
+		dd->lbus_width = 1;
+		dd->lbus_speed = 2500; /* Gen1, 2.5GHz */
+		goto bail;
+	}
+
+	pos = dd->pcidev->msix_cap;
+	if (nent && *nent && pos) {
+		qib_msix_setup(dd, pos, nent, entry);
+		ret = 0; /* did it, either MSIx or INTx */
+	} else {
+		pos = dd->pcidev->msi_cap;
+		if (pos)
+			ret = qib_msi_setup(dd, pos);
+		else
+			qib_dev_err(dd, "No PCI MSI or MSIx capability!\n");
+	}
+	if (!pos)
+		qib_enable_intx(dd->pcidev);
+
+	pcie_capability_read_word(dd->pcidev, PCI_EXP_LNKSTA, &linkstat);
+	/*
+	 * speed is bits 0-3, linkwidth is bits 4-8
+	 * no defines for them in headers
+	 */
+	speed = linkstat & 0xf;
+	linkstat >>= 4;
+	linkstat &= 0x1f;
+	dd->lbus_width = linkstat;
+
+	switch (speed) {
+	case 1:
+		dd->lbus_speed = 2500; /* Gen1, 2.5GHz */
+		break;
+	case 2:
+		dd->lbus_speed = 5000; /* Gen1, 5GHz */
+		break;
+	default: /* not defined, assume gen1 */
+		dd->lbus_speed = 2500;
+		break;
+	}
+
+	/*
+	 * Check against expected pcie width and complain if "wrong"
+	 * on first initialization, not afterwards (i.e., reset).
+	 */
+	if (minw && linkstat < minw)
+		qib_dev_err(dd,
+			    "PCIe width %u (x%u HCA), performance reduced\n",
+			    linkstat, minw);
+
+	qib_tune_pcie_caps(dd);
+
+	qib_tune_pcie_coalesce(dd);
+
+bail:
+	/* fill in string, even on errors */
+	snprintf(dd->lbus_info, sizeof(dd->lbus_info),
+		 "PCIe,%uMHz,x%u\n", dd->lbus_speed, dd->lbus_width);
+	return ret;
+}
+
+/*
+ * Setup pcie interrupt stuff again after a reset.  I'd like to just call
+ * pci_enable_msi() again for msi, but when I do that,
+ * the MSI enable bit doesn't get set in the command word, and
+ * we switch to to a different interrupt vector, which is confusing,
+ * so I instead just do it all inline.  Perhaps somehow can tie this
+ * into the PCIe hotplug support at some point
+ */
+int qib_reinit_intr(struct qib_devdata *dd)
+{
+	int pos;
+	u16 control;
+	int ret = 0;
+
+	/* If we aren't using MSI, don't restore it */
+	if (!dd->msi_lo)
+		goto bail;
+
+	pos = dd->pcidev->msi_cap;
+	if (!pos) {
+		qib_dev_err(dd,
+			"Can't find MSI capability, can't restore MSI settings\n");
+		ret = 0;
+		/* nothing special for MSIx, just MSI */
+		goto bail;
+	}
+	pci_write_config_dword(dd->pcidev, pos + PCI_MSI_ADDRESS_LO,
+			       dd->msi_lo);
+	pci_write_config_dword(dd->pcidev, pos + PCI_MSI_ADDRESS_HI,
+			       dd->msi_hi);
+	pci_read_config_word(dd->pcidev, pos + PCI_MSI_FLAGS, &control);
+	if (!(control & PCI_MSI_FLAGS_ENABLE)) {
+		control |= PCI_MSI_FLAGS_ENABLE;
+		pci_write_config_word(dd->pcidev, pos + PCI_MSI_FLAGS,
+				      control);
+	}
+	/* now rewrite the data (vector) info */
+	pci_write_config_word(dd->pcidev, pos +
+			      ((control & PCI_MSI_FLAGS_64BIT) ? 12 : 8),
+			      dd->msi_data);
+	ret = 1;
+bail:
+	if (!ret && (dd->flags & QIB_HAS_INTX)) {
+		qib_enable_intx(dd->pcidev);
+		ret = 1;
+	}
+
+	/* and now set the pci master bit again */
+	pci_set_master(dd->pcidev);
+
+	return ret;
+}
+
+/*
+ * Disable msi interrupt if enabled, and clear msi_lo.
+ * This is used primarily for the fallback to INTx, but
+ * is also used in reinit after reset, and during cleanup.
+ */
+void qib_nomsi(struct qib_devdata *dd)
+{
+	dd->msi_lo = 0;
+	pci_disable_msi(dd->pcidev);
+}
+
+/*
+ * Same as qib_nosmi, but for MSIx.
+ */
+void qib_nomsix(struct qib_devdata *dd)
+{
+	pci_disable_msix(dd->pcidev);
+}
+
+/*
+ * Similar to pci_intx(pdev, 1), except that we make sure
+ * msi(x) is off.
+ */
+void qib_enable_intx(struct pci_dev *pdev)
+{
+	u16 cw, new;
+	int pos;
+
+	/* first, turn on INTx */
+	pci_read_config_word(pdev, PCI_COMMAND, &cw);
+	new = cw & ~PCI_COMMAND_INTX_DISABLE;
+	if (new != cw)
+		pci_write_config_word(pdev, PCI_COMMAND, new);
+
+	pos = pdev->msi_cap;
+	if (pos) {
+		/* then turn off MSI */
+		pci_read_config_word(pdev, pos + PCI_MSI_FLAGS, &cw);
+		new = cw & ~PCI_MSI_FLAGS_ENABLE;
+		if (new != cw)
+			pci_write_config_word(pdev, pos + PCI_MSI_FLAGS, new);
+	}
+	pos = pdev->msix_cap;
+	if (pos) {
+		/* then turn off MSIx */
+		pci_read_config_word(pdev, pos + PCI_MSIX_FLAGS, &cw);
+		new = cw & ~PCI_MSIX_FLAGS_ENABLE;
+		if (new != cw)
+			pci_write_config_word(pdev, pos + PCI_MSIX_FLAGS, new);
+	}
+}
+
+/*
+ * These two routines are helper routines for the device reset code
+ * to move all the pcie code out of the chip-specific driver code.
+ */
+void qib_pcie_getcmd(struct qib_devdata *dd, u16 *cmd, u8 *iline, u8 *cline)
+{
+	pci_read_config_word(dd->pcidev, PCI_COMMAND, cmd);
+	pci_read_config_byte(dd->pcidev, PCI_INTERRUPT_LINE, iline);
+	pci_read_config_byte(dd->pcidev, PCI_CACHE_LINE_SIZE, cline);
+}
+
+void qib_pcie_reenable(struct qib_devdata *dd, u16 cmd, u8 iline, u8 cline)
+{
+	int r;
+	r = pci_write_config_dword(dd->pcidev, PCI_BASE_ADDRESS_0,
+				   dd->pcibar0);
+	if (r)
+		qib_dev_err(dd, "rewrite of BAR0 failed: %d\n", r);
+	r = pci_write_config_dword(dd->pcidev, PCI_BASE_ADDRESS_1,
+				   dd->pcibar1);
+	if (r)
+		qib_dev_err(dd, "rewrite of BAR1 failed: %d\n", r);
+	/* now re-enable memory access, and restore cosmetic settings */
+	pci_write_config_word(dd->pcidev, PCI_COMMAND, cmd);
+	pci_write_config_byte(dd->pcidev, PCI_INTERRUPT_LINE, iline);
+	pci_write_config_byte(dd->pcidev, PCI_CACHE_LINE_SIZE, cline);
+	r = pci_enable_device(dd->pcidev);
+	if (r)
+		qib_dev_err(dd,
+			"pci_enable_device failed after reset: %d\n", r);
+}
+
+
+static int qib_pcie_coalesce;
+module_param_named(pcie_coalesce, qib_pcie_coalesce, int, S_IRUGO);
+MODULE_PARM_DESC(pcie_coalesce, "tune PCIe colescing on some Intel chipsets");
+
+/*
+ * Enable PCIe completion and data coalescing, on Intel 5x00 and 7300
+ * chipsets.   This is known to be unsafe for some revisions of some
+ * of these chipsets, with some BIOS settings, and enabling it on those
+ * systems may result in the system crashing, and/or data corruption.
+ */
+static void qib_tune_pcie_coalesce(struct qib_devdata *dd)
+{
+	int r;
+	struct pci_dev *parent;
+	u16 devid;
+	u32 mask, bits, val;
+
+	if (!qib_pcie_coalesce)
+		return;
+
+	/* Find out supported and configured values for parent (root) */
+	parent = dd->pcidev->bus->self;
+	if (parent->bus->parent) {
+		qib_devinfo(dd->pcidev, "Parent not root\n");
+		return;
+	}
+	if (!pci_is_pcie(parent))
+		return;
+	if (parent->vendor != 0x8086)
+		return;
+
+	/*
+	 *  - bit 12: Max_rdcmp_Imt_EN: need to set to 1
+	 *  - bit 11: COALESCE_FORCE: need to set to 0
+	 *  - bit 10: COALESCE_EN: need to set to 1
+	 *  (but limitations on some on some chipsets)
+	 *
+	 *  On the Intel 5000, 5100, and 7300 chipsets, there is
+	 *  also: - bit 25:24: COALESCE_MODE, need to set to 0
+	 */
+	devid = parent->device;
+	if (devid >= 0x25e2 && devid <= 0x25fa) {
+		/* 5000 P/V/X/Z */
+		if (parent->revision <= 0xb2)
+			bits = 1U << 10;
+		else
+			bits = 7U << 10;
+		mask = (3U << 24) | (7U << 10);
+	} else if (devid >= 0x65e2 && devid <= 0x65fa) {
+		/* 5100 */
+		bits = 1U << 10;
+		mask = (3U << 24) | (7U << 10);
+	} else if (devid >= 0x4021 && devid <= 0x402e) {
+		/* 5400 */
+		bits = 7U << 10;
+		mask = 7U << 10;
+	} else if (devid >= 0x3604 && devid <= 0x360a) {
+		/* 7300 */
+		bits = 7U << 10;
+		mask = (3U << 24) | (7U << 10);
+	} else {
+		/* not one of the chipsets that we know about */
+		return;
+	}
+	pci_read_config_dword(parent, 0x48, &val);
+	val &= ~mask;
+	val |= bits;
+	r = pci_write_config_dword(parent, 0x48, val);
+}
+
+/*
+ * BIOS may not set PCIe bus-utilization parameters for best performance.
+ * Check and optionally adjust them to maximize our throughput.
+ */
+static int qib_pcie_caps;
+module_param_named(pcie_caps, qib_pcie_caps, int, S_IRUGO);
+MODULE_PARM_DESC(pcie_caps, "Max PCIe tuning: Payload (0..3), ReadReq (4..7)");
+
+static void qib_tune_pcie_caps(struct qib_devdata *dd)
+{
+	struct pci_dev *parent;
+	u16 rc_mpss, rc_mps, ep_mpss, ep_mps;
+	u16 rc_mrrs, ep_mrrs, max_mrrs;
+
+	/* Find out supported and configured values for parent (root) */
+	parent = dd->pcidev->bus->self;
+	if (!pci_is_root_bus(parent->bus)) {
+		qib_devinfo(dd->pcidev, "Parent not root\n");
+		return;
+	}
+
+	if (!pci_is_pcie(parent) || !pci_is_pcie(dd->pcidev))
+		return;
+
+	rc_mpss = parent->pcie_mpss;
+	rc_mps = ffs(pcie_get_mps(parent)) - 8;
+	/* Find out supported and configured values for endpoint (us) */
+	ep_mpss = dd->pcidev->pcie_mpss;
+	ep_mps = ffs(pcie_get_mps(dd->pcidev)) - 8;
+
+	/* Find max payload supported by root, endpoint */
+	if (rc_mpss > ep_mpss)
+		rc_mpss = ep_mpss;
+
+	/* If Supported greater than limit in module param, limit it */
+	if (rc_mpss > (qib_pcie_caps & 7))
+		rc_mpss = qib_pcie_caps & 7;
+	/* If less than (allowed, supported), bump root payload */
+	if (rc_mpss > rc_mps) {
+		rc_mps = rc_mpss;
+		pcie_set_mps(parent, 128 << rc_mps);
+	}
+	/* If less than (allowed, supported), bump endpoint payload */
+	if (rc_mpss > ep_mps) {
+		ep_mps = rc_mpss;
+		pcie_set_mps(dd->pcidev, 128 << ep_mps);
+	}
+
+	/*
+	 * Now the Read Request size.
+	 * No field for max supported, but PCIe spec limits it to 4096,
+	 * which is code '5' (log2(4096) - 7)
+	 */
+	max_mrrs = 5;
+	if (max_mrrs > ((qib_pcie_caps >> 4) & 7))
+		max_mrrs = (qib_pcie_caps >> 4) & 7;
+
+	max_mrrs = 128 << max_mrrs;
+	rc_mrrs = pcie_get_readrq(parent);
+	ep_mrrs = pcie_get_readrq(dd->pcidev);
+
+	if (max_mrrs > rc_mrrs) {
+		rc_mrrs = max_mrrs;
+		pcie_set_readrq(parent, rc_mrrs);
+	}
+	if (max_mrrs > ep_mrrs) {
+		ep_mrrs = max_mrrs;
+		pcie_set_readrq(dd->pcidev, ep_mrrs);
+	}
+}
+/* End of PCIe capability tuning */
+
+/*
+ * From here through qib_pci_err_handler definition is invoked via
+ * PCI error infrastructure, registered via pci
+ */
+static pci_ers_result_t
+qib_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
+{
+	struct qib_devdata *dd = pci_get_drvdata(pdev);
+	pci_ers_result_t ret = PCI_ERS_RESULT_RECOVERED;
+
+	switch (state) {
+	case pci_channel_io_normal:
+		qib_devinfo(pdev, "State Normal, ignoring\n");
+		break;
+
+	case pci_channel_io_frozen:
+		qib_devinfo(pdev, "State Frozen, requesting reset\n");
+		pci_disable_device(pdev);
+		ret = PCI_ERS_RESULT_NEED_RESET;
+		break;
+
+	case pci_channel_io_perm_failure:
+		qib_devinfo(pdev, "State Permanent Failure, disabling\n");
+		if (dd) {
+			/* no more register accesses! */
+			dd->flags &= ~QIB_PRESENT;
+			qib_disable_after_error(dd);
+		}
+		 /* else early, or other problem */
+		ret =  PCI_ERS_RESULT_DISCONNECT;
+		break;
+
+	default: /* shouldn't happen */
+		qib_devinfo(pdev, "QIB PCI errors detected (state %d)\n",
+			state);
+		break;
+	}
+	return ret;
+}
+
+static pci_ers_result_t
+qib_pci_mmio_enabled(struct pci_dev *pdev)
+{
+	u64 words = 0U;
+	struct qib_devdata *dd = pci_get_drvdata(pdev);
+	pci_ers_result_t ret = PCI_ERS_RESULT_RECOVERED;
+
+	if (dd && dd->pport) {
+		words = dd->f_portcntr(dd->pport, QIBPORTCNTR_WORDRCV);
+		if (words == ~0ULL)
+			ret = PCI_ERS_RESULT_NEED_RESET;
+	}
+	qib_devinfo(pdev,
+		"QIB mmio_enabled function called, read wordscntr %Lx, returning %d\n",
+		words, ret);
+	return  ret;
+}
+
+static pci_ers_result_t
+qib_pci_slot_reset(struct pci_dev *pdev)
+{
+	qib_devinfo(pdev, "QIB slot_reset function called, ignored\n");
+	return PCI_ERS_RESULT_CAN_RECOVER;
+}
+
+static pci_ers_result_t
+qib_pci_link_reset(struct pci_dev *pdev)
+{
+	qib_devinfo(pdev, "QIB link_reset function called, ignored\n");
+	return PCI_ERS_RESULT_CAN_RECOVER;
+}
+
+static void
+qib_pci_resume(struct pci_dev *pdev)
+{
+	struct qib_devdata *dd = pci_get_drvdata(pdev);
+	qib_devinfo(pdev, "QIB resume function called\n");
+	pci_cleanup_aer_uncorrect_error_status(pdev);
+	/*
+	 * Running jobs will fail, since it's asynchronous
+	 * unlike sysfs-requested reset.   Better than
+	 * doing nothing.
+	 */
+	qib_init(dd, 1); /* same as re-init after reset */
+}
+
+const struct pci_error_handlers qib_pci_err_handler = {
+	.error_detected = qib_pci_error_detected,
+	.mmio_enabled = qib_pci_mmio_enabled,
+	.link_reset = qib_pci_link_reset,
+	.slot_reset = qib_pci_slot_reset,
+	.resume = qib_pci_resume,
+};
diff --git a/drivers/infiniband/hw/qib/qib_pio_copy.c b/drivers/infiniband/hw/qib/qib_pio_copy.c
new file mode 100644
index 0000000..10b8c44
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_pio_copy.c
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2009 QLogic Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "qib.h"
+
+/**
+ * qib_pio_copy - copy data to MMIO space, in multiples of 32-bits
+ * @to: destination, in MMIO space (must be 64-bit aligned)
+ * @from: source (must be 64-bit aligned)
+ * @count: number of 32-bit quantities to copy
+ *
+ * Copy data from kernel space to MMIO space, in multiples of 32 bits at a
+ * time.  Order of access is not guaranteed, nor is a memory barrier
+ * performed afterwards.
+ */
+void qib_pio_copy(void __iomem *to, const void *from, size_t count)
+{
+#ifdef CONFIG_64BIT
+	u64 __iomem *dst = to;
+	const u64 *src = from;
+	const u64 *end = src + (count >> 1);
+
+	while (src < end)
+		__raw_writeq(*src++, dst++);
+	if (count & 1)
+		__raw_writel(*(const u32 *)src, dst);
+#else
+	u32 __iomem *dst = to;
+	const u32 *src = from;
+	const u32 *end = src + count;
+
+	while (src < end)
+		__raw_writel(*src++, dst++);
+#endif
+}
diff --git a/drivers/infiniband/hw/qib/qib_qp.c b/drivers/infiniband/hw/qib/qib_qp.c
new file mode 100644
index 0000000..6ddc026
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_qp.c
@@ -0,0 +1,1376 @@
+/*
+ * Copyright (c) 2012, 2013 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2006 - 2012 QLogic Corporation.  * All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/err.h>
+#include <linux/vmalloc.h>
+#include <linux/jhash.h>
+#ifdef CONFIG_DEBUG_FS
+#include <linux/seq_file.h>
+#endif
+
+#include "qib.h"
+
+#define BITS_PER_PAGE           (PAGE_SIZE*BITS_PER_BYTE)
+#define BITS_PER_PAGE_MASK      (BITS_PER_PAGE-1)
+
+static inline unsigned mk_qpn(struct qib_qpn_table *qpt,
+			      struct qpn_map *map, unsigned off)
+{
+	return (map - qpt->map) * BITS_PER_PAGE + off;
+}
+
+static inline unsigned find_next_offset(struct qib_qpn_table *qpt,
+					struct qpn_map *map, unsigned off,
+					unsigned n)
+{
+	if (qpt->mask) {
+		off++;
+		if (((off & qpt->mask) >> 1) >= n)
+			off = (off | qpt->mask) + 2;
+	} else
+		off = find_next_zero_bit(map->page, BITS_PER_PAGE, off);
+	return off;
+}
+
+/*
+ * Convert the AETH credit code into the number of credits.
+ */
+static u32 credit_table[31] = {
+	0,                      /* 0 */
+	1,                      /* 1 */
+	2,                      /* 2 */
+	3,                      /* 3 */
+	4,                      /* 4 */
+	6,                      /* 5 */
+	8,                      /* 6 */
+	12,                     /* 7 */
+	16,                     /* 8 */
+	24,                     /* 9 */
+	32,                     /* A */
+	48,                     /* B */
+	64,                     /* C */
+	96,                     /* D */
+	128,                    /* E */
+	192,                    /* F */
+	256,                    /* 10 */
+	384,                    /* 11 */
+	512,                    /* 12 */
+	768,                    /* 13 */
+	1024,                   /* 14 */
+	1536,                   /* 15 */
+	2048,                   /* 16 */
+	3072,                   /* 17 */
+	4096,                   /* 18 */
+	6144,                   /* 19 */
+	8192,                   /* 1A */
+	12288,                  /* 1B */
+	16384,                  /* 1C */
+	24576,                  /* 1D */
+	32768                   /* 1E */
+};
+
+static void get_map_page(struct qib_qpn_table *qpt, struct qpn_map *map)
+{
+	unsigned long page = get_zeroed_page(GFP_KERNEL);
+
+	/*
+	 * Free the page if someone raced with us installing it.
+	 */
+
+	spin_lock(&qpt->lock);
+	if (map->page)
+		free_page(page);
+	else
+		map->page = (void *)page;
+	spin_unlock(&qpt->lock);
+}
+
+/*
+ * Allocate the next available QPN or
+ * zero/one for QP type IB_QPT_SMI/IB_QPT_GSI.
+ */
+static int alloc_qpn(struct qib_devdata *dd, struct qib_qpn_table *qpt,
+		     enum ib_qp_type type, u8 port)
+{
+	u32 i, offset, max_scan, qpn;
+	struct qpn_map *map;
+	u32 ret;
+
+	if (type == IB_QPT_SMI || type == IB_QPT_GSI) {
+		unsigned n;
+
+		ret = type == IB_QPT_GSI;
+		n = 1 << (ret + 2 * (port - 1));
+		spin_lock(&qpt->lock);
+		if (qpt->flags & n)
+			ret = -EINVAL;
+		else
+			qpt->flags |= n;
+		spin_unlock(&qpt->lock);
+		goto bail;
+	}
+
+	qpn = qpt->last + 2;
+	if (qpn >= QPN_MAX)
+		qpn = 2;
+	if (qpt->mask && ((qpn & qpt->mask) >> 1) >= dd->n_krcv_queues)
+		qpn = (qpn | qpt->mask) + 2;
+	offset = qpn & BITS_PER_PAGE_MASK;
+	map = &qpt->map[qpn / BITS_PER_PAGE];
+	max_scan = qpt->nmaps - !offset;
+	for (i = 0;;) {
+		if (unlikely(!map->page)) {
+			get_map_page(qpt, map);
+			if (unlikely(!map->page))
+				break;
+		}
+		do {
+			if (!test_and_set_bit(offset, map->page)) {
+				qpt->last = qpn;
+				ret = qpn;
+				goto bail;
+			}
+			offset = find_next_offset(qpt, map, offset,
+				dd->n_krcv_queues);
+			qpn = mk_qpn(qpt, map, offset);
+			/*
+			 * This test differs from alloc_pidmap().
+			 * If find_next_offset() does find a zero
+			 * bit, we don't need to check for QPN
+			 * wrapping around past our starting QPN.
+			 * We just need to be sure we don't loop
+			 * forever.
+			 */
+		} while (offset < BITS_PER_PAGE && qpn < QPN_MAX);
+		/*
+		 * In order to keep the number of pages allocated to a
+		 * minimum, we scan the all existing pages before increasing
+		 * the size of the bitmap table.
+		 */
+		if (++i > max_scan) {
+			if (qpt->nmaps == QPNMAP_ENTRIES)
+				break;
+			map = &qpt->map[qpt->nmaps++];
+			offset = 0;
+		} else if (map < &qpt->map[qpt->nmaps]) {
+			++map;
+			offset = 0;
+		} else {
+			map = &qpt->map[0];
+			offset = 2;
+		}
+		qpn = mk_qpn(qpt, map, offset);
+	}
+
+	ret = -ENOMEM;
+
+bail:
+	return ret;
+}
+
+static void free_qpn(struct qib_qpn_table *qpt, u32 qpn)
+{
+	struct qpn_map *map;
+
+	map = qpt->map + qpn / BITS_PER_PAGE;
+	if (map->page)
+		clear_bit(qpn & BITS_PER_PAGE_MASK, map->page);
+}
+
+static inline unsigned qpn_hash(struct qib_ibdev *dev, u32 qpn)
+{
+	return jhash_1word(qpn, dev->qp_rnd) &
+		(dev->qp_table_size - 1);
+}
+
+
+/*
+ * Put the QP into the hash table.
+ * The hash table holds a reference to the QP.
+ */
+static void insert_qp(struct qib_ibdev *dev, struct qib_qp *qp)
+{
+	struct qib_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+	unsigned long flags;
+	unsigned n = qpn_hash(dev, qp->ibqp.qp_num);
+
+	atomic_inc(&qp->refcount);
+	spin_lock_irqsave(&dev->qpt_lock, flags);
+
+	if (qp->ibqp.qp_num == 0)
+		rcu_assign_pointer(ibp->qp0, qp);
+	else if (qp->ibqp.qp_num == 1)
+		rcu_assign_pointer(ibp->qp1, qp);
+	else {
+		qp->next = dev->qp_table[n];
+		rcu_assign_pointer(dev->qp_table[n], qp);
+	}
+
+	spin_unlock_irqrestore(&dev->qpt_lock, flags);
+}
+
+/*
+ * Remove the QP from the table so it can't be found asynchronously by
+ * the receive interrupt routine.
+ */
+static void remove_qp(struct qib_ibdev *dev, struct qib_qp *qp)
+{
+	struct qib_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+	unsigned n = qpn_hash(dev, qp->ibqp.qp_num);
+	unsigned long flags;
+	int removed = 1;
+
+	spin_lock_irqsave(&dev->qpt_lock, flags);
+
+	if (rcu_dereference_protected(ibp->qp0,
+			lockdep_is_held(&dev->qpt_lock)) == qp) {
+		rcu_assign_pointer(ibp->qp0, NULL);
+	} else if (rcu_dereference_protected(ibp->qp1,
+			lockdep_is_held(&dev->qpt_lock)) == qp) {
+		rcu_assign_pointer(ibp->qp1, NULL);
+	} else {
+		struct qib_qp *q;
+		struct qib_qp __rcu **qpp;
+
+		removed = 0;
+		qpp = &dev->qp_table[n];
+		for (; (q = rcu_dereference_protected(*qpp,
+				lockdep_is_held(&dev->qpt_lock))) != NULL;
+				qpp = &q->next)
+			if (q == qp) {
+				rcu_assign_pointer(*qpp,
+					rcu_dereference_protected(qp->next,
+					 lockdep_is_held(&dev->qpt_lock)));
+				removed = 1;
+				break;
+			}
+	}
+
+	spin_unlock_irqrestore(&dev->qpt_lock, flags);
+	if (removed) {
+		synchronize_rcu();
+		atomic_dec(&qp->refcount);
+	}
+}
+
+/**
+ * qib_free_all_qps - check for QPs still in use
+ * @qpt: the QP table to empty
+ *
+ * There should not be any QPs still in use.
+ * Free memory for table.
+ */
+unsigned qib_free_all_qps(struct qib_devdata *dd)
+{
+	struct qib_ibdev *dev = &dd->verbs_dev;
+	unsigned long flags;
+	struct qib_qp *qp;
+	unsigned n, qp_inuse = 0;
+
+	for (n = 0; n < dd->num_pports; n++) {
+		struct qib_ibport *ibp = &dd->pport[n].ibport_data;
+
+		if (!qib_mcast_tree_empty(ibp))
+			qp_inuse++;
+		rcu_read_lock();
+		if (rcu_dereference(ibp->qp0))
+			qp_inuse++;
+		if (rcu_dereference(ibp->qp1))
+			qp_inuse++;
+		rcu_read_unlock();
+	}
+
+	spin_lock_irqsave(&dev->qpt_lock, flags);
+	for (n = 0; n < dev->qp_table_size; n++) {
+		qp = rcu_dereference_protected(dev->qp_table[n],
+			lockdep_is_held(&dev->qpt_lock));
+		rcu_assign_pointer(dev->qp_table[n], NULL);
+
+		for (; qp; qp = rcu_dereference_protected(qp->next,
+					lockdep_is_held(&dev->qpt_lock)))
+			qp_inuse++;
+	}
+	spin_unlock_irqrestore(&dev->qpt_lock, flags);
+	synchronize_rcu();
+
+	return qp_inuse;
+}
+
+/**
+ * qib_lookup_qpn - return the QP with the given QPN
+ * @qpt: the QP table
+ * @qpn: the QP number to look up
+ *
+ * The caller is responsible for decrementing the QP reference count
+ * when done.
+ */
+struct qib_qp *qib_lookup_qpn(struct qib_ibport *ibp, u32 qpn)
+{
+	struct qib_qp *qp = NULL;
+
+	rcu_read_lock();
+	if (unlikely(qpn <= 1)) {
+		if (qpn == 0)
+			qp = rcu_dereference(ibp->qp0);
+		else
+			qp = rcu_dereference(ibp->qp1);
+		if (qp)
+			atomic_inc(&qp->refcount);
+	} else {
+		struct qib_ibdev *dev = &ppd_from_ibp(ibp)->dd->verbs_dev;
+		unsigned n = qpn_hash(dev, qpn);
+
+		for (qp = rcu_dereference(dev->qp_table[n]); qp;
+			qp = rcu_dereference(qp->next))
+			if (qp->ibqp.qp_num == qpn) {
+				atomic_inc(&qp->refcount);
+				break;
+			}
+	}
+	rcu_read_unlock();
+	return qp;
+}
+
+/**
+ * qib_reset_qp - initialize the QP state to the reset state
+ * @qp: the QP to reset
+ * @type: the QP type
+ */
+static void qib_reset_qp(struct qib_qp *qp, enum ib_qp_type type)
+{
+	qp->remote_qpn = 0;
+	qp->qkey = 0;
+	qp->qp_access_flags = 0;
+	atomic_set(&qp->s_dma_busy, 0);
+	qp->s_flags &= QIB_S_SIGNAL_REQ_WR;
+	qp->s_hdrwords = 0;
+	qp->s_wqe = NULL;
+	qp->s_draining = 0;
+	qp->s_next_psn = 0;
+	qp->s_last_psn = 0;
+	qp->s_sending_psn = 0;
+	qp->s_sending_hpsn = 0;
+	qp->s_psn = 0;
+	qp->r_psn = 0;
+	qp->r_msn = 0;
+	if (type == IB_QPT_RC) {
+		qp->s_state = IB_OPCODE_RC_SEND_LAST;
+		qp->r_state = IB_OPCODE_RC_SEND_LAST;
+	} else {
+		qp->s_state = IB_OPCODE_UC_SEND_LAST;
+		qp->r_state = IB_OPCODE_UC_SEND_LAST;
+	}
+	qp->s_ack_state = IB_OPCODE_RC_ACKNOWLEDGE;
+	qp->r_nak_state = 0;
+	qp->r_aflags = 0;
+	qp->r_flags = 0;
+	qp->s_head = 0;
+	qp->s_tail = 0;
+	qp->s_cur = 0;
+	qp->s_acked = 0;
+	qp->s_last = 0;
+	qp->s_ssn = 1;
+	qp->s_lsn = 0;
+	qp->s_mig_state = IB_MIG_MIGRATED;
+	memset(qp->s_ack_queue, 0, sizeof(qp->s_ack_queue));
+	qp->r_head_ack_queue = 0;
+	qp->s_tail_ack_queue = 0;
+	qp->s_num_rd_atomic = 0;
+	if (qp->r_rq.wq) {
+		qp->r_rq.wq->head = 0;
+		qp->r_rq.wq->tail = 0;
+	}
+	qp->r_sge.num_sge = 0;
+}
+
+static void clear_mr_refs(struct qib_qp *qp, int clr_sends)
+{
+	unsigned n;
+
+	if (test_and_clear_bit(QIB_R_REWIND_SGE, &qp->r_aflags))
+		qib_put_ss(&qp->s_rdma_read_sge);
+
+	qib_put_ss(&qp->r_sge);
+
+	if (clr_sends) {
+		while (qp->s_last != qp->s_head) {
+			struct qib_swqe *wqe = get_swqe_ptr(qp, qp->s_last);
+			unsigned i;
+
+			for (i = 0; i < wqe->wr.num_sge; i++) {
+				struct qib_sge *sge = &wqe->sg_list[i];
+
+				qib_put_mr(sge->mr);
+			}
+			if (qp->ibqp.qp_type == IB_QPT_UD ||
+			    qp->ibqp.qp_type == IB_QPT_SMI ||
+			    qp->ibqp.qp_type == IB_QPT_GSI)
+				atomic_dec(&to_iah(wqe->wr.wr.ud.ah)->refcount);
+			if (++qp->s_last >= qp->s_size)
+				qp->s_last = 0;
+		}
+		if (qp->s_rdma_mr) {
+			qib_put_mr(qp->s_rdma_mr);
+			qp->s_rdma_mr = NULL;
+		}
+	}
+
+	if (qp->ibqp.qp_type != IB_QPT_RC)
+		return;
+
+	for (n = 0; n < ARRAY_SIZE(qp->s_ack_queue); n++) {
+		struct qib_ack_entry *e = &qp->s_ack_queue[n];
+
+		if (e->opcode == IB_OPCODE_RC_RDMA_READ_REQUEST &&
+		    e->rdma_sge.mr) {
+			qib_put_mr(e->rdma_sge.mr);
+			e->rdma_sge.mr = NULL;
+		}
+	}
+}
+
+/**
+ * qib_error_qp - put a QP into the error state
+ * @qp: the QP to put into the error state
+ * @err: the receive completion error to signal if a RWQE is active
+ *
+ * Flushes both send and receive work queues.
+ * Returns true if last WQE event should be generated.
+ * The QP r_lock and s_lock should be held and interrupts disabled.
+ * If we are already in error state, just return.
+ */
+int qib_error_qp(struct qib_qp *qp, enum ib_wc_status err)
+{
+	struct qib_ibdev *dev = to_idev(qp->ibqp.device);
+	struct ib_wc wc;
+	int ret = 0;
+
+	if (qp->state == IB_QPS_ERR || qp->state == IB_QPS_RESET)
+		goto bail;
+
+	qp->state = IB_QPS_ERR;
+
+	if (qp->s_flags & (QIB_S_TIMER | QIB_S_WAIT_RNR)) {
+		qp->s_flags &= ~(QIB_S_TIMER | QIB_S_WAIT_RNR);
+		del_timer(&qp->s_timer);
+	}
+
+	if (qp->s_flags & QIB_S_ANY_WAIT_SEND)
+		qp->s_flags &= ~QIB_S_ANY_WAIT_SEND;
+
+	spin_lock(&dev->pending_lock);
+	if (!list_empty(&qp->iowait) && !(qp->s_flags & QIB_S_BUSY)) {
+		qp->s_flags &= ~QIB_S_ANY_WAIT_IO;
+		list_del_init(&qp->iowait);
+	}
+	spin_unlock(&dev->pending_lock);
+
+	if (!(qp->s_flags & QIB_S_BUSY)) {
+		qp->s_hdrwords = 0;
+		if (qp->s_rdma_mr) {
+			qib_put_mr(qp->s_rdma_mr);
+			qp->s_rdma_mr = NULL;
+		}
+		if (qp->s_tx) {
+			qib_put_txreq(qp->s_tx);
+			qp->s_tx = NULL;
+		}
+	}
+
+	/* Schedule the sending tasklet to drain the send work queue. */
+	if (qp->s_last != qp->s_head)
+		qib_schedule_send(qp);
+
+	clear_mr_refs(qp, 0);
+
+	memset(&wc, 0, sizeof(wc));
+	wc.qp = &qp->ibqp;
+	wc.opcode = IB_WC_RECV;
+
+	if (test_and_clear_bit(QIB_R_WRID_VALID, &qp->r_aflags)) {
+		wc.wr_id = qp->r_wr_id;
+		wc.status = err;
+		qib_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1);
+	}
+	wc.status = IB_WC_WR_FLUSH_ERR;
+
+	if (qp->r_rq.wq) {
+		struct qib_rwq *wq;
+		u32 head;
+		u32 tail;
+
+		spin_lock(&qp->r_rq.lock);
+
+		/* sanity check pointers before trusting them */
+		wq = qp->r_rq.wq;
+		head = wq->head;
+		if (head >= qp->r_rq.size)
+			head = 0;
+		tail = wq->tail;
+		if (tail >= qp->r_rq.size)
+			tail = 0;
+		while (tail != head) {
+			wc.wr_id = get_rwqe_ptr(&qp->r_rq, tail)->wr_id;
+			if (++tail >= qp->r_rq.size)
+				tail = 0;
+			qib_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1);
+		}
+		wq->tail = tail;
+
+		spin_unlock(&qp->r_rq.lock);
+	} else if (qp->ibqp.event_handler)
+		ret = 1;
+
+bail:
+	return ret;
+}
+
+/**
+ * qib_modify_qp - modify the attributes of a queue pair
+ * @ibqp: the queue pair who's attributes we're modifying
+ * @attr: the new attributes
+ * @attr_mask: the mask of attributes to modify
+ * @udata: user data for libibverbs.so
+ *
+ * Returns 0 on success, otherwise returns an errno.
+ */
+int qib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+		  int attr_mask, struct ib_udata *udata)
+{
+	struct qib_ibdev *dev = to_idev(ibqp->device);
+	struct qib_qp *qp = to_iqp(ibqp);
+	enum ib_qp_state cur_state, new_state;
+	struct ib_event ev;
+	int lastwqe = 0;
+	int mig = 0;
+	int ret;
+	u32 pmtu = 0; /* for gcc warning only */
+
+	spin_lock_irq(&qp->r_lock);
+	spin_lock(&qp->s_lock);
+
+	cur_state = attr_mask & IB_QP_CUR_STATE ?
+		attr->cur_qp_state : qp->state;
+	new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;
+
+	if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type,
+				attr_mask, IB_LINK_LAYER_UNSPECIFIED))
+		goto inval;
+
+	if (attr_mask & IB_QP_AV) {
+		if (attr->ah_attr.dlid >= QIB_MULTICAST_LID_BASE)
+			goto inval;
+		if (qib_check_ah(qp->ibqp.device, &attr->ah_attr))
+			goto inval;
+	}
+
+	if (attr_mask & IB_QP_ALT_PATH) {
+		if (attr->alt_ah_attr.dlid >= QIB_MULTICAST_LID_BASE)
+			goto inval;
+		if (qib_check_ah(qp->ibqp.device, &attr->alt_ah_attr))
+			goto inval;
+		if (attr->alt_pkey_index >= qib_get_npkeys(dd_from_dev(dev)))
+			goto inval;
+	}
+
+	if (attr_mask & IB_QP_PKEY_INDEX)
+		if (attr->pkey_index >= qib_get_npkeys(dd_from_dev(dev)))
+			goto inval;
+
+	if (attr_mask & IB_QP_MIN_RNR_TIMER)
+		if (attr->min_rnr_timer > 31)
+			goto inval;
+
+	if (attr_mask & IB_QP_PORT)
+		if (qp->ibqp.qp_type == IB_QPT_SMI ||
+		    qp->ibqp.qp_type == IB_QPT_GSI ||
+		    attr->port_num == 0 ||
+		    attr->port_num > ibqp->device->phys_port_cnt)
+			goto inval;
+
+	if (attr_mask & IB_QP_DEST_QPN)
+		if (attr->dest_qp_num > QIB_QPN_MASK)
+			goto inval;
+
+	if (attr_mask & IB_QP_RETRY_CNT)
+		if (attr->retry_cnt > 7)
+			goto inval;
+
+	if (attr_mask & IB_QP_RNR_RETRY)
+		if (attr->rnr_retry > 7)
+			goto inval;
+
+	/*
+	 * Don't allow invalid path_mtu values.  OK to set greater
+	 * than the active mtu (or even the max_cap, if we have tuned
+	 * that to a small mtu.  We'll set qp->path_mtu
+	 * to the lesser of requested attribute mtu and active,
+	 * for packetizing messages.
+	 * Note that the QP port has to be set in INIT and MTU in RTR.
+	 */
+	if (attr_mask & IB_QP_PATH_MTU) {
+		struct qib_devdata *dd = dd_from_dev(dev);
+		int mtu, pidx = qp->port_num - 1;
+
+		mtu = ib_mtu_enum_to_int(attr->path_mtu);
+		if (mtu == -1)
+			goto inval;
+		if (mtu > dd->pport[pidx].ibmtu) {
+			switch (dd->pport[pidx].ibmtu) {
+			case 4096:
+				pmtu = IB_MTU_4096;
+				break;
+			case 2048:
+				pmtu = IB_MTU_2048;
+				break;
+			case 1024:
+				pmtu = IB_MTU_1024;
+				break;
+			case 512:
+				pmtu = IB_MTU_512;
+				break;
+			case 256:
+				pmtu = IB_MTU_256;
+				break;
+			default:
+				pmtu = IB_MTU_2048;
+			}
+		} else
+			pmtu = attr->path_mtu;
+	}
+
+	if (attr_mask & IB_QP_PATH_MIG_STATE) {
+		if (attr->path_mig_state == IB_MIG_REARM) {
+			if (qp->s_mig_state == IB_MIG_ARMED)
+				goto inval;
+			if (new_state != IB_QPS_RTS)
+				goto inval;
+		} else if (attr->path_mig_state == IB_MIG_MIGRATED) {
+			if (qp->s_mig_state == IB_MIG_REARM)
+				goto inval;
+			if (new_state != IB_QPS_RTS && new_state != IB_QPS_SQD)
+				goto inval;
+			if (qp->s_mig_state == IB_MIG_ARMED)
+				mig = 1;
+		} else
+			goto inval;
+	}
+
+	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
+		if (attr->max_dest_rd_atomic > QIB_MAX_RDMA_ATOMIC)
+			goto inval;
+
+	switch (new_state) {
+	case IB_QPS_RESET:
+		if (qp->state != IB_QPS_RESET) {
+			qp->state = IB_QPS_RESET;
+			spin_lock(&dev->pending_lock);
+			if (!list_empty(&qp->iowait))
+				list_del_init(&qp->iowait);
+			spin_unlock(&dev->pending_lock);
+			qp->s_flags &= ~(QIB_S_TIMER | QIB_S_ANY_WAIT);
+			spin_unlock(&qp->s_lock);
+			spin_unlock_irq(&qp->r_lock);
+			/* Stop the sending work queue and retry timer */
+			cancel_work_sync(&qp->s_work);
+			del_timer_sync(&qp->s_timer);
+			wait_event(qp->wait_dma, !atomic_read(&qp->s_dma_busy));
+			if (qp->s_tx) {
+				qib_put_txreq(qp->s_tx);
+				qp->s_tx = NULL;
+			}
+			remove_qp(dev, qp);
+			wait_event(qp->wait, !atomic_read(&qp->refcount));
+			spin_lock_irq(&qp->r_lock);
+			spin_lock(&qp->s_lock);
+			clear_mr_refs(qp, 1);
+			qib_reset_qp(qp, ibqp->qp_type);
+		}
+		break;
+
+	case IB_QPS_RTR:
+		/* Allow event to retrigger if QP set to RTR more than once */
+		qp->r_flags &= ~QIB_R_COMM_EST;
+		qp->state = new_state;
+		break;
+
+	case IB_QPS_SQD:
+		qp->s_draining = qp->s_last != qp->s_cur;
+		qp->state = new_state;
+		break;
+
+	case IB_QPS_SQE:
+		if (qp->ibqp.qp_type == IB_QPT_RC)
+			goto inval;
+		qp->state = new_state;
+		break;
+
+	case IB_QPS_ERR:
+		lastwqe = qib_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+		break;
+
+	default:
+		qp->state = new_state;
+		break;
+	}
+
+	if (attr_mask & IB_QP_PKEY_INDEX)
+		qp->s_pkey_index = attr->pkey_index;
+
+	if (attr_mask & IB_QP_PORT)
+		qp->port_num = attr->port_num;
+
+	if (attr_mask & IB_QP_DEST_QPN)
+		qp->remote_qpn = attr->dest_qp_num;
+
+	if (attr_mask & IB_QP_SQ_PSN) {
+		qp->s_next_psn = attr->sq_psn & QIB_PSN_MASK;
+		qp->s_psn = qp->s_next_psn;
+		qp->s_sending_psn = qp->s_next_psn;
+		qp->s_last_psn = qp->s_next_psn - 1;
+		qp->s_sending_hpsn = qp->s_last_psn;
+	}
+
+	if (attr_mask & IB_QP_RQ_PSN)
+		qp->r_psn = attr->rq_psn & QIB_PSN_MASK;
+
+	if (attr_mask & IB_QP_ACCESS_FLAGS)
+		qp->qp_access_flags = attr->qp_access_flags;
+
+	if (attr_mask & IB_QP_AV) {
+		qp->remote_ah_attr = attr->ah_attr;
+		qp->s_srate = attr->ah_attr.static_rate;
+	}
+
+	if (attr_mask & IB_QP_ALT_PATH) {
+		qp->alt_ah_attr = attr->alt_ah_attr;
+		qp->s_alt_pkey_index = attr->alt_pkey_index;
+	}
+
+	if (attr_mask & IB_QP_PATH_MIG_STATE) {
+		qp->s_mig_state = attr->path_mig_state;
+		if (mig) {
+			qp->remote_ah_attr = qp->alt_ah_attr;
+			qp->port_num = qp->alt_ah_attr.port_num;
+			qp->s_pkey_index = qp->s_alt_pkey_index;
+		}
+	}
+
+	if (attr_mask & IB_QP_PATH_MTU) {
+		qp->path_mtu = pmtu;
+		qp->pmtu = ib_mtu_enum_to_int(pmtu);
+	}
+
+	if (attr_mask & IB_QP_RETRY_CNT) {
+		qp->s_retry_cnt = attr->retry_cnt;
+		qp->s_retry = attr->retry_cnt;
+	}
+
+	if (attr_mask & IB_QP_RNR_RETRY) {
+		qp->s_rnr_retry_cnt = attr->rnr_retry;
+		qp->s_rnr_retry = attr->rnr_retry;
+	}
+
+	if (attr_mask & IB_QP_MIN_RNR_TIMER)
+		qp->r_min_rnr_timer = attr->min_rnr_timer;
+
+	if (attr_mask & IB_QP_TIMEOUT) {
+		qp->timeout = attr->timeout;
+		qp->timeout_jiffies =
+			usecs_to_jiffies((4096UL * (1UL << qp->timeout)) /
+				1000UL);
+	}
+
+	if (attr_mask & IB_QP_QKEY)
+		qp->qkey = attr->qkey;
+
+	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
+		qp->r_max_rd_atomic = attr->max_dest_rd_atomic;
+
+	if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC)
+		qp->s_max_rd_atomic = attr->max_rd_atomic;
+
+	spin_unlock(&qp->s_lock);
+	spin_unlock_irq(&qp->r_lock);
+
+	if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
+		insert_qp(dev, qp);
+
+	if (lastwqe) {
+		ev.device = qp->ibqp.device;
+		ev.element.qp = &qp->ibqp;
+		ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
+		qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
+	}
+	if (mig) {
+		ev.device = qp->ibqp.device;
+		ev.element.qp = &qp->ibqp;
+		ev.event = IB_EVENT_PATH_MIG;
+		qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
+	}
+	ret = 0;
+	goto bail;
+
+inval:
+	spin_unlock(&qp->s_lock);
+	spin_unlock_irq(&qp->r_lock);
+	ret = -EINVAL;
+
+bail:
+	return ret;
+}
+
+int qib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+		 int attr_mask, struct ib_qp_init_attr *init_attr)
+{
+	struct qib_qp *qp = to_iqp(ibqp);
+
+	attr->qp_state = qp->state;
+	attr->cur_qp_state = attr->qp_state;
+	attr->path_mtu = qp->path_mtu;
+	attr->path_mig_state = qp->s_mig_state;
+	attr->qkey = qp->qkey;
+	attr->rq_psn = qp->r_psn & QIB_PSN_MASK;
+	attr->sq_psn = qp->s_next_psn & QIB_PSN_MASK;
+	attr->dest_qp_num = qp->remote_qpn;
+	attr->qp_access_flags = qp->qp_access_flags;
+	attr->cap.max_send_wr = qp->s_size - 1;
+	attr->cap.max_recv_wr = qp->ibqp.srq ? 0 : qp->r_rq.size - 1;
+	attr->cap.max_send_sge = qp->s_max_sge;
+	attr->cap.max_recv_sge = qp->r_rq.max_sge;
+	attr->cap.max_inline_data = 0;
+	attr->ah_attr = qp->remote_ah_attr;
+	attr->alt_ah_attr = qp->alt_ah_attr;
+	attr->pkey_index = qp->s_pkey_index;
+	attr->alt_pkey_index = qp->s_alt_pkey_index;
+	attr->en_sqd_async_notify = 0;
+	attr->sq_draining = qp->s_draining;
+	attr->max_rd_atomic = qp->s_max_rd_atomic;
+	attr->max_dest_rd_atomic = qp->r_max_rd_atomic;
+	attr->min_rnr_timer = qp->r_min_rnr_timer;
+	attr->port_num = qp->port_num;
+	attr->timeout = qp->timeout;
+	attr->retry_cnt = qp->s_retry_cnt;
+	attr->rnr_retry = qp->s_rnr_retry_cnt;
+	attr->alt_port_num = qp->alt_ah_attr.port_num;
+	attr->alt_timeout = qp->alt_timeout;
+
+	init_attr->event_handler = qp->ibqp.event_handler;
+	init_attr->qp_context = qp->ibqp.qp_context;
+	init_attr->send_cq = qp->ibqp.send_cq;
+	init_attr->recv_cq = qp->ibqp.recv_cq;
+	init_attr->srq = qp->ibqp.srq;
+	init_attr->cap = attr->cap;
+	if (qp->s_flags & QIB_S_SIGNAL_REQ_WR)
+		init_attr->sq_sig_type = IB_SIGNAL_REQ_WR;
+	else
+		init_attr->sq_sig_type = IB_SIGNAL_ALL_WR;
+	init_attr->qp_type = qp->ibqp.qp_type;
+	init_attr->port_num = qp->port_num;
+	return 0;
+}
+
+/**
+ * qib_compute_aeth - compute the AETH (syndrome + MSN)
+ * @qp: the queue pair to compute the AETH for
+ *
+ * Returns the AETH.
+ */
+__be32 qib_compute_aeth(struct qib_qp *qp)
+{
+	u32 aeth = qp->r_msn & QIB_MSN_MASK;
+
+	if (qp->ibqp.srq) {
+		/*
+		 * Shared receive queues don't generate credits.
+		 * Set the credit field to the invalid value.
+		 */
+		aeth |= QIB_AETH_CREDIT_INVAL << QIB_AETH_CREDIT_SHIFT;
+	} else {
+		u32 min, max, x;
+		u32 credits;
+		struct qib_rwq *wq = qp->r_rq.wq;
+		u32 head;
+		u32 tail;
+
+		/* sanity check pointers before trusting them */
+		head = wq->head;
+		if (head >= qp->r_rq.size)
+			head = 0;
+		tail = wq->tail;
+		if (tail >= qp->r_rq.size)
+			tail = 0;
+		/*
+		 * Compute the number of credits available (RWQEs).
+		 * XXX Not holding the r_rq.lock here so there is a small
+		 * chance that the pair of reads are not atomic.
+		 */
+		credits = head - tail;
+		if ((int)credits < 0)
+			credits += qp->r_rq.size;
+		/*
+		 * Binary search the credit table to find the code to
+		 * use.
+		 */
+		min = 0;
+		max = 31;
+		for (;;) {
+			x = (min + max) / 2;
+			if (credit_table[x] == credits)
+				break;
+			if (credit_table[x] > credits)
+				max = x;
+			else if (min == x)
+				break;
+			else
+				min = x;
+		}
+		aeth |= x << QIB_AETH_CREDIT_SHIFT;
+	}
+	return cpu_to_be32(aeth);
+}
+
+/**
+ * qib_create_qp - create a queue pair for a device
+ * @ibpd: the protection domain who's device we create the queue pair for
+ * @init_attr: the attributes of the queue pair
+ * @udata: user data for libibverbs.so
+ *
+ * Returns the queue pair on success, otherwise returns an errno.
+ *
+ * Called by the ib_create_qp() core verbs function.
+ */
+struct ib_qp *qib_create_qp(struct ib_pd *ibpd,
+			    struct ib_qp_init_attr *init_attr,
+			    struct ib_udata *udata)
+{
+	struct qib_qp *qp;
+	int err;
+	struct qib_swqe *swq = NULL;
+	struct qib_ibdev *dev;
+	struct qib_devdata *dd;
+	size_t sz;
+	size_t sg_list_sz;
+	struct ib_qp *ret;
+
+	if (init_attr->cap.max_send_sge > ib_qib_max_sges ||
+	    init_attr->cap.max_send_wr > ib_qib_max_qp_wrs ||
+	    init_attr->create_flags) {
+		ret = ERR_PTR(-EINVAL);
+		goto bail;
+	}
+
+	/* Check receive queue parameters if no SRQ is specified. */
+	if (!init_attr->srq) {
+		if (init_attr->cap.max_recv_sge > ib_qib_max_sges ||
+		    init_attr->cap.max_recv_wr > ib_qib_max_qp_wrs) {
+			ret = ERR_PTR(-EINVAL);
+			goto bail;
+		}
+		if (init_attr->cap.max_send_sge +
+		    init_attr->cap.max_send_wr +
+		    init_attr->cap.max_recv_sge +
+		    init_attr->cap.max_recv_wr == 0) {
+			ret = ERR_PTR(-EINVAL);
+			goto bail;
+		}
+	}
+
+	switch (init_attr->qp_type) {
+	case IB_QPT_SMI:
+	case IB_QPT_GSI:
+		if (init_attr->port_num == 0 ||
+		    init_attr->port_num > ibpd->device->phys_port_cnt) {
+			ret = ERR_PTR(-EINVAL);
+			goto bail;
+		}
+	case IB_QPT_UC:
+	case IB_QPT_RC:
+	case IB_QPT_UD:
+		sz = sizeof(struct qib_sge) *
+			init_attr->cap.max_send_sge +
+			sizeof(struct qib_swqe);
+		swq = vmalloc((init_attr->cap.max_send_wr + 1) * sz);
+		if (swq == NULL) {
+			ret = ERR_PTR(-ENOMEM);
+			goto bail;
+		}
+		sz = sizeof(*qp);
+		sg_list_sz = 0;
+		if (init_attr->srq) {
+			struct qib_srq *srq = to_isrq(init_attr->srq);
+
+			if (srq->rq.max_sge > 1)
+				sg_list_sz = sizeof(*qp->r_sg_list) *
+					(srq->rq.max_sge - 1);
+		} else if (init_attr->cap.max_recv_sge > 1)
+			sg_list_sz = sizeof(*qp->r_sg_list) *
+				(init_attr->cap.max_recv_sge - 1);
+		qp = kzalloc(sz + sg_list_sz, GFP_KERNEL);
+		if (!qp) {
+			ret = ERR_PTR(-ENOMEM);
+			goto bail_swq;
+		}
+		RCU_INIT_POINTER(qp->next, NULL);
+		qp->s_hdr = kzalloc(sizeof(*qp->s_hdr), GFP_KERNEL);
+		if (!qp->s_hdr) {
+			ret = ERR_PTR(-ENOMEM);
+			goto bail_qp;
+		}
+		qp->timeout_jiffies =
+			usecs_to_jiffies((4096UL * (1UL << qp->timeout)) /
+				1000UL);
+		if (init_attr->srq)
+			sz = 0;
+		else {
+			qp->r_rq.size = init_attr->cap.max_recv_wr + 1;
+			qp->r_rq.max_sge = init_attr->cap.max_recv_sge;
+			sz = (sizeof(struct ib_sge) * qp->r_rq.max_sge) +
+				sizeof(struct qib_rwqe);
+			qp->r_rq.wq = vmalloc_user(sizeof(struct qib_rwq) +
+						   qp->r_rq.size * sz);
+			if (!qp->r_rq.wq) {
+				ret = ERR_PTR(-ENOMEM);
+				goto bail_qp;
+			}
+		}
+
+		/*
+		 * ib_create_qp() will initialize qp->ibqp
+		 * except for qp->ibqp.qp_num.
+		 */
+		spin_lock_init(&qp->r_lock);
+		spin_lock_init(&qp->s_lock);
+		spin_lock_init(&qp->r_rq.lock);
+		atomic_set(&qp->refcount, 0);
+		init_waitqueue_head(&qp->wait);
+		init_waitqueue_head(&qp->wait_dma);
+		init_timer(&qp->s_timer);
+		qp->s_timer.data = (unsigned long)qp;
+		INIT_WORK(&qp->s_work, qib_do_send);
+		INIT_LIST_HEAD(&qp->iowait);
+		INIT_LIST_HEAD(&qp->rspwait);
+		qp->state = IB_QPS_RESET;
+		qp->s_wq = swq;
+		qp->s_size = init_attr->cap.max_send_wr + 1;
+		qp->s_max_sge = init_attr->cap.max_send_sge;
+		if (init_attr->sq_sig_type == IB_SIGNAL_REQ_WR)
+			qp->s_flags = QIB_S_SIGNAL_REQ_WR;
+		dev = to_idev(ibpd->device);
+		dd = dd_from_dev(dev);
+		err = alloc_qpn(dd, &dev->qpn_table, init_attr->qp_type,
+				init_attr->port_num);
+		if (err < 0) {
+			ret = ERR_PTR(err);
+			vfree(qp->r_rq.wq);
+			goto bail_qp;
+		}
+		qp->ibqp.qp_num = err;
+		qp->port_num = init_attr->port_num;
+		qib_reset_qp(qp, init_attr->qp_type);
+		break;
+
+	default:
+		/* Don't support raw QPs */
+		ret = ERR_PTR(-ENOSYS);
+		goto bail;
+	}
+
+	init_attr->cap.max_inline_data = 0;
+
+	/*
+	 * Return the address of the RWQ as the offset to mmap.
+	 * See qib_mmap() for details.
+	 */
+	if (udata && udata->outlen >= sizeof(__u64)) {
+		if (!qp->r_rq.wq) {
+			__u64 offset = 0;
+
+			err = ib_copy_to_udata(udata, &offset,
+					       sizeof(offset));
+			if (err) {
+				ret = ERR_PTR(err);
+				goto bail_ip;
+			}
+		} else {
+			u32 s = sizeof(struct qib_rwq) + qp->r_rq.size * sz;
+
+			qp->ip = qib_create_mmap_info(dev, s,
+						      ibpd->uobject->context,
+						      qp->r_rq.wq);
+			if (!qp->ip) {
+				ret = ERR_PTR(-ENOMEM);
+				goto bail_ip;
+			}
+
+			err = ib_copy_to_udata(udata, &(qp->ip->offset),
+					       sizeof(qp->ip->offset));
+			if (err) {
+				ret = ERR_PTR(err);
+				goto bail_ip;
+			}
+		}
+	}
+
+	spin_lock(&dev->n_qps_lock);
+	if (dev->n_qps_allocated == ib_qib_max_qps) {
+		spin_unlock(&dev->n_qps_lock);
+		ret = ERR_PTR(-ENOMEM);
+		goto bail_ip;
+	}
+
+	dev->n_qps_allocated++;
+	spin_unlock(&dev->n_qps_lock);
+
+	if (qp->ip) {
+		spin_lock_irq(&dev->pending_lock);
+		list_add(&qp->ip->pending_mmaps, &dev->pending_mmaps);
+		spin_unlock_irq(&dev->pending_lock);
+	}
+
+	ret = &qp->ibqp;
+	goto bail;
+
+bail_ip:
+	if (qp->ip)
+		kref_put(&qp->ip->ref, qib_release_mmap_info);
+	else
+		vfree(qp->r_rq.wq);
+	free_qpn(&dev->qpn_table, qp->ibqp.qp_num);
+bail_qp:
+	kfree(qp->s_hdr);
+	kfree(qp);
+bail_swq:
+	vfree(swq);
+bail:
+	return ret;
+}
+
+/**
+ * qib_destroy_qp - destroy a queue pair
+ * @ibqp: the queue pair to destroy
+ *
+ * Returns 0 on success.
+ *
+ * Note that this can be called while the QP is actively sending or
+ * receiving!
+ */
+int qib_destroy_qp(struct ib_qp *ibqp)
+{
+	struct qib_qp *qp = to_iqp(ibqp);
+	struct qib_ibdev *dev = to_idev(ibqp->device);
+
+	/* Make sure HW and driver activity is stopped. */
+	spin_lock_irq(&qp->s_lock);
+	if (qp->state != IB_QPS_RESET) {
+		qp->state = IB_QPS_RESET;
+		spin_lock(&dev->pending_lock);
+		if (!list_empty(&qp->iowait))
+			list_del_init(&qp->iowait);
+		spin_unlock(&dev->pending_lock);
+		qp->s_flags &= ~(QIB_S_TIMER | QIB_S_ANY_WAIT);
+		spin_unlock_irq(&qp->s_lock);
+		cancel_work_sync(&qp->s_work);
+		del_timer_sync(&qp->s_timer);
+		wait_event(qp->wait_dma, !atomic_read(&qp->s_dma_busy));
+		if (qp->s_tx) {
+			qib_put_txreq(qp->s_tx);
+			qp->s_tx = NULL;
+		}
+		remove_qp(dev, qp);
+		wait_event(qp->wait, !atomic_read(&qp->refcount));
+		clear_mr_refs(qp, 1);
+	} else
+		spin_unlock_irq(&qp->s_lock);
+
+	/* all user's cleaned up, mark it available */
+	free_qpn(&dev->qpn_table, qp->ibqp.qp_num);
+	spin_lock(&dev->n_qps_lock);
+	dev->n_qps_allocated--;
+	spin_unlock(&dev->n_qps_lock);
+
+	if (qp->ip)
+		kref_put(&qp->ip->ref, qib_release_mmap_info);
+	else
+		vfree(qp->r_rq.wq);
+	vfree(qp->s_wq);
+	kfree(qp->s_hdr);
+	kfree(qp);
+	return 0;
+}
+
+/**
+ * qib_init_qpn_table - initialize the QP number table for a device
+ * @qpt: the QPN table
+ */
+void qib_init_qpn_table(struct qib_devdata *dd, struct qib_qpn_table *qpt)
+{
+	spin_lock_init(&qpt->lock);
+	qpt->last = 1;          /* start with QPN 2 */
+	qpt->nmaps = 1;
+	qpt->mask = dd->qpn_mask;
+}
+
+/**
+ * qib_free_qpn_table - free the QP number table for a device
+ * @qpt: the QPN table
+ */
+void qib_free_qpn_table(struct qib_qpn_table *qpt)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(qpt->map); i++)
+		if (qpt->map[i].page)
+			free_page((unsigned long) qpt->map[i].page);
+}
+
+/**
+ * qib_get_credit - flush the send work queue of a QP
+ * @qp: the qp who's send work queue to flush
+ * @aeth: the Acknowledge Extended Transport Header
+ *
+ * The QP s_lock should be held.
+ */
+void qib_get_credit(struct qib_qp *qp, u32 aeth)
+{
+	u32 credit = (aeth >> QIB_AETH_CREDIT_SHIFT) & QIB_AETH_CREDIT_MASK;
+
+	/*
+	 * If the credit is invalid, we can send
+	 * as many packets as we like.  Otherwise, we have to
+	 * honor the credit field.
+	 */
+	if (credit == QIB_AETH_CREDIT_INVAL) {
+		if (!(qp->s_flags & QIB_S_UNLIMITED_CREDIT)) {
+			qp->s_flags |= QIB_S_UNLIMITED_CREDIT;
+			if (qp->s_flags & QIB_S_WAIT_SSN_CREDIT) {
+				qp->s_flags &= ~QIB_S_WAIT_SSN_CREDIT;
+				qib_schedule_send(qp);
+			}
+		}
+	} else if (!(qp->s_flags & QIB_S_UNLIMITED_CREDIT)) {
+		/* Compute new LSN (i.e., MSN + credit) */
+		credit = (aeth + credit_table[credit]) & QIB_MSN_MASK;
+		if (qib_cmp24(credit, qp->s_lsn) > 0) {
+			qp->s_lsn = credit;
+			if (qp->s_flags & QIB_S_WAIT_SSN_CREDIT) {
+				qp->s_flags &= ~QIB_S_WAIT_SSN_CREDIT;
+				qib_schedule_send(qp);
+			}
+		}
+	}
+}
+
+#ifdef CONFIG_DEBUG_FS
+
+struct qib_qp_iter {
+	struct qib_ibdev *dev;
+	struct qib_qp *qp;
+	int n;
+};
+
+struct qib_qp_iter *qib_qp_iter_init(struct qib_ibdev *dev)
+{
+	struct qib_qp_iter *iter;
+
+	iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+	if (!iter)
+		return NULL;
+
+	iter->dev = dev;
+	if (qib_qp_iter_next(iter)) {
+		kfree(iter);
+		return NULL;
+	}
+
+	return iter;
+}
+
+int qib_qp_iter_next(struct qib_qp_iter *iter)
+{
+	struct qib_ibdev *dev = iter->dev;
+	int n = iter->n;
+	int ret = 1;
+	struct qib_qp *pqp = iter->qp;
+	struct qib_qp *qp;
+
+	for (; n < dev->qp_table_size; n++) {
+		if (pqp)
+			qp = rcu_dereference(pqp->next);
+		else
+			qp = rcu_dereference(dev->qp_table[n]);
+		pqp = qp;
+		if (qp) {
+			iter->qp = qp;
+			iter->n = n;
+			return 0;
+		}
+	}
+	return ret;
+}
+
+static const char * const qp_type_str[] = {
+	"SMI", "GSI", "RC", "UC", "UD",
+};
+
+void qib_qp_iter_print(struct seq_file *s, struct qib_qp_iter *iter)
+{
+	struct qib_swqe *wqe;
+	struct qib_qp *qp = iter->qp;
+
+	wqe = get_swqe_ptr(qp, qp->s_last);
+	seq_printf(s,
+		   "N %d QP%u %s %u %u %u f=%x %u %u %u %u %u PSN %x %x %x %x %x (%u %u %u %u %u %u) QP%u LID %x\n",
+		   iter->n,
+		   qp->ibqp.qp_num,
+		   qp_type_str[qp->ibqp.qp_type],
+		   qp->state,
+		   wqe->wr.opcode,
+		   qp->s_hdrwords,
+		   qp->s_flags,
+		   atomic_read(&qp->s_dma_busy),
+		   !list_empty(&qp->iowait),
+		   qp->timeout,
+		   wqe->ssn,
+		   qp->s_lsn,
+		   qp->s_last_psn,
+		   qp->s_psn, qp->s_next_psn,
+		   qp->s_sending_psn, qp->s_sending_hpsn,
+		   qp->s_last, qp->s_acked, qp->s_cur,
+		   qp->s_tail, qp->s_head, qp->s_size,
+		   qp->remote_qpn,
+		   qp->remote_ah_attr.dlid);
+}
+
+#endif
diff --git a/drivers/infiniband/hw/qib/qib_qsfp.c b/drivers/infiniband/hw/qib/qib_qsfp.c
new file mode 100644
index 0000000..fa71b1e
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_qsfp.c
@@ -0,0 +1,556 @@
+/*
+ * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/delay.h>
+#include <linux/pci.h>
+#include <linux/vmalloc.h>
+
+#include "qib.h"
+#include "qib_qsfp.h"
+
+/*
+ * QSFP support for ib_qib driver, using "Two Wire Serial Interface" driver
+ * in qib_twsi.c
+ */
+#define QSFP_MAX_RETRY 4
+
+static int qsfp_read(struct qib_pportdata *ppd, int addr, void *bp, int len)
+{
+	struct qib_devdata *dd = ppd->dd;
+	u32 out, mask;
+	int ret, cnt, pass = 0;
+	int stuck = 0;
+	u8 *buff = bp;
+
+	ret = mutex_lock_interruptible(&dd->eep_lock);
+	if (ret)
+		goto no_unlock;
+
+	if (dd->twsi_eeprom_dev == QIB_TWSI_NO_DEV) {
+		ret = -ENXIO;
+		goto bail;
+	}
+
+	/*
+	 * We presume, if we are called at all, that this board has
+	 * QSFP. This is on the same i2c chain as the legacy parts,
+	 * but only responds if the module is selected via GPIO pins.
+	 * Further, there are very long setup and hold requirements
+	 * on MODSEL.
+	 */
+	mask = QSFP_GPIO_MOD_SEL_N | QSFP_GPIO_MOD_RST_N | QSFP_GPIO_LP_MODE;
+	out = QSFP_GPIO_MOD_RST_N | QSFP_GPIO_LP_MODE;
+	if (ppd->hw_pidx) {
+		mask <<= QSFP_GPIO_PORT2_SHIFT;
+		out <<= QSFP_GPIO_PORT2_SHIFT;
+	}
+
+	dd->f_gpio_mod(dd, out, mask, mask);
+
+	/*
+	 * Module could take up to 2 Msec to respond to MOD_SEL, and there
+	 * is no way to tell if it is ready, so we must wait.
+	 */
+	msleep(2);
+
+	/* Make sure TWSI bus is in sane state. */
+	ret = qib_twsi_reset(dd);
+	if (ret) {
+		qib_dev_porterr(dd, ppd->port,
+				"QSFP interface Reset for read failed\n");
+		ret = -EIO;
+		stuck = 1;
+		goto deselect;
+	}
+
+	/* All QSFP modules are at A0 */
+
+	cnt = 0;
+	while (cnt < len) {
+		unsigned in_page;
+		int wlen = len - cnt;
+		in_page = addr % QSFP_PAGESIZE;
+		if ((in_page + wlen) > QSFP_PAGESIZE)
+			wlen = QSFP_PAGESIZE - in_page;
+		ret = qib_twsi_blk_rd(dd, QSFP_DEV, addr, buff + cnt, wlen);
+		/* Some QSFP's fail first try. Retry as experiment */
+		if (ret && cnt == 0 && ++pass < QSFP_MAX_RETRY)
+			continue;
+		if (ret) {
+			/* qib_twsi_blk_rd() 1 for error, else 0 */
+			ret = -EIO;
+			goto deselect;
+		}
+		addr += wlen;
+		cnt += wlen;
+	}
+	ret = cnt;
+
+deselect:
+	/*
+	 * Module could take up to 10 uSec after transfer before
+	 * ready to respond to MOD_SEL negation, and there is no way
+	 * to tell if it is ready, so we must wait.
+	 */
+	udelay(10);
+	/* set QSFP MODSEL, RST. LP all high */
+	dd->f_gpio_mod(dd, mask, mask, mask);
+
+	/*
+	 * Module could take up to 2 Msec to respond to MOD_SEL
+	 * going away, and there is no way to tell if it is ready.
+	 * so we must wait.
+	 */
+	if (stuck)
+		qib_dev_err(dd, "QSFP interface bus stuck non-idle\n");
+
+	if (pass >= QSFP_MAX_RETRY && ret)
+		qib_dev_porterr(dd, ppd->port, "QSFP failed even retrying\n");
+	else if (pass)
+		qib_dev_porterr(dd, ppd->port, "QSFP retries: %d\n", pass);
+
+	msleep(2);
+
+bail:
+	mutex_unlock(&dd->eep_lock);
+
+no_unlock:
+	return ret;
+}
+
+/*
+ * qsfp_write
+ * We do not ordinarily write the QSFP, but this is needed to select
+ * the page on non-flat QSFPs, and possibly later unusual cases
+ */
+static int qib_qsfp_write(struct qib_pportdata *ppd, int addr, void *bp,
+			  int len)
+{
+	struct qib_devdata *dd = ppd->dd;
+	u32 out, mask;
+	int ret, cnt;
+	u8 *buff = bp;
+
+	ret = mutex_lock_interruptible(&dd->eep_lock);
+	if (ret)
+		goto no_unlock;
+
+	if (dd->twsi_eeprom_dev == QIB_TWSI_NO_DEV) {
+		ret = -ENXIO;
+		goto bail;
+	}
+
+	/*
+	 * We presume, if we are called at all, that this board has
+	 * QSFP. This is on the same i2c chain as the legacy parts,
+	 * but only responds if the module is selected via GPIO pins.
+	 * Further, there are very long setup and hold requirements
+	 * on MODSEL.
+	 */
+	mask = QSFP_GPIO_MOD_SEL_N | QSFP_GPIO_MOD_RST_N | QSFP_GPIO_LP_MODE;
+	out = QSFP_GPIO_MOD_RST_N | QSFP_GPIO_LP_MODE;
+	if (ppd->hw_pidx) {
+		mask <<= QSFP_GPIO_PORT2_SHIFT;
+		out <<= QSFP_GPIO_PORT2_SHIFT;
+	}
+	dd->f_gpio_mod(dd, out, mask, mask);
+
+	/*
+	 * Module could take up to 2 Msec to respond to MOD_SEL,
+	 * and there is no way to tell if it is ready, so we must wait.
+	 */
+	msleep(2);
+
+	/* Make sure TWSI bus is in sane state. */
+	ret = qib_twsi_reset(dd);
+	if (ret) {
+		qib_dev_porterr(dd, ppd->port,
+				"QSFP interface Reset for write failed\n");
+		ret = -EIO;
+		goto deselect;
+	}
+
+	/* All QSFP modules are at A0 */
+
+	cnt = 0;
+	while (cnt < len) {
+		unsigned in_page;
+		int wlen = len - cnt;
+		in_page = addr % QSFP_PAGESIZE;
+		if ((in_page + wlen) > QSFP_PAGESIZE)
+			wlen = QSFP_PAGESIZE - in_page;
+		ret = qib_twsi_blk_wr(dd, QSFP_DEV, addr, buff + cnt, wlen);
+		if (ret) {
+			/* qib_twsi_blk_wr() 1 for error, else 0 */
+			ret = -EIO;
+			goto deselect;
+		}
+		addr += wlen;
+		cnt += wlen;
+	}
+	ret = cnt;
+
+deselect:
+	/*
+	 * Module could take up to 10 uSec after transfer before
+	 * ready to respond to MOD_SEL negation, and there is no way
+	 * to tell if it is ready, so we must wait.
+	 */
+	udelay(10);
+	/* set QSFP MODSEL, RST, LP high */
+	dd->f_gpio_mod(dd, mask, mask, mask);
+	/*
+	 * Module could take up to 2 Msec to respond to MOD_SEL
+	 * going away, and there is no way to tell if it is ready.
+	 * so we must wait.
+	 */
+	msleep(2);
+
+bail:
+	mutex_unlock(&dd->eep_lock);
+
+no_unlock:
+	return ret;
+}
+
+/*
+ * For validation, we want to check the checksums, even of the
+ * fields we do not otherwise use. This function reads the bytes from
+ * <first> to <next-1> and returns the 8lsbs of the sum, or <0 for errors
+ */
+static int qsfp_cks(struct qib_pportdata *ppd, int first, int next)
+{
+	int ret;
+	u16 cks;
+	u8 bval;
+
+	cks = 0;
+	while (first < next) {
+		ret = qsfp_read(ppd, first, &bval, 1);
+		if (ret < 0)
+			goto bail;
+		cks += bval;
+		++first;
+	}
+	ret = cks & 0xFF;
+bail:
+	return ret;
+
+}
+
+int qib_refresh_qsfp_cache(struct qib_pportdata *ppd, struct qib_qsfp_cache *cp)
+{
+	int ret;
+	int idx;
+	u16 cks;
+	u8 peek[4];
+
+	/* ensure sane contents on invalid reads, for cable swaps */
+	memset(cp, 0, sizeof(*cp));
+
+	if (!qib_qsfp_mod_present(ppd)) {
+		ret = -ENODEV;
+		goto bail;
+	}
+
+	ret = qsfp_read(ppd, 0, peek, 3);
+	if (ret < 0)
+		goto bail;
+	if ((peek[0] & 0xFE) != 0x0C)
+		qib_dev_porterr(ppd->dd, ppd->port,
+				"QSFP byte0 is 0x%02X, S/B 0x0C/D\n", peek[0]);
+
+	if ((peek[2] & 2) == 0) {
+		/*
+		 * If cable is paged, rather than "flat memory", we need to
+		 * set the page to zero, Even if it already appears to be zero.
+		 */
+		u8 poke = 0;
+		ret = qib_qsfp_write(ppd, 127, &poke, 1);
+		udelay(50);
+		if (ret != 1) {
+			qib_dev_porterr(ppd->dd, ppd->port,
+					"Failed QSFP Page set\n");
+			goto bail;
+		}
+	}
+
+	ret = qsfp_read(ppd, QSFP_MOD_ID_OFFS, &cp->id, 1);
+	if (ret < 0)
+		goto bail;
+	if ((cp->id & 0xFE) != 0x0C)
+		qib_dev_porterr(ppd->dd, ppd->port,
+				"QSFP ID byte is 0x%02X, S/B 0x0C/D\n", cp->id);
+	cks = cp->id;
+
+	ret = qsfp_read(ppd, QSFP_MOD_PWR_OFFS, &cp->pwr, 1);
+	if (ret < 0)
+		goto bail;
+	cks += cp->pwr;
+
+	ret = qsfp_cks(ppd, QSFP_MOD_PWR_OFFS + 1, QSFP_MOD_LEN_OFFS);
+	if (ret < 0)
+		goto bail;
+	cks += ret;
+
+	ret = qsfp_read(ppd, QSFP_MOD_LEN_OFFS, &cp->len, 1);
+	if (ret < 0)
+		goto bail;
+	cks += cp->len;
+
+	ret = qsfp_read(ppd, QSFP_MOD_TECH_OFFS, &cp->tech, 1);
+	if (ret < 0)
+		goto bail;
+	cks += cp->tech;
+
+	ret = qsfp_read(ppd, QSFP_VEND_OFFS, &cp->vendor, QSFP_VEND_LEN);
+	if (ret < 0)
+		goto bail;
+	for (idx = 0; idx < QSFP_VEND_LEN; ++idx)
+		cks += cp->vendor[idx];
+
+	ret = qsfp_read(ppd, QSFP_IBXCV_OFFS, &cp->xt_xcv, 1);
+	if (ret < 0)
+		goto bail;
+	cks += cp->xt_xcv;
+
+	ret = qsfp_read(ppd, QSFP_VOUI_OFFS, &cp->oui, QSFP_VOUI_LEN);
+	if (ret < 0)
+		goto bail;
+	for (idx = 0; idx < QSFP_VOUI_LEN; ++idx)
+		cks += cp->oui[idx];
+
+	ret = qsfp_read(ppd, QSFP_PN_OFFS, &cp->partnum, QSFP_PN_LEN);
+	if (ret < 0)
+		goto bail;
+	for (idx = 0; idx < QSFP_PN_LEN; ++idx)
+		cks += cp->partnum[idx];
+
+	ret = qsfp_read(ppd, QSFP_REV_OFFS, &cp->rev, QSFP_REV_LEN);
+	if (ret < 0)
+		goto bail;
+	for (idx = 0; idx < QSFP_REV_LEN; ++idx)
+		cks += cp->rev[idx];
+
+	ret = qsfp_read(ppd, QSFP_ATTEN_OFFS, &cp->atten, QSFP_ATTEN_LEN);
+	if (ret < 0)
+		goto bail;
+	for (idx = 0; idx < QSFP_ATTEN_LEN; ++idx)
+		cks += cp->atten[idx];
+
+	ret = qsfp_cks(ppd, QSFP_ATTEN_OFFS + QSFP_ATTEN_LEN, QSFP_CC_OFFS);
+	if (ret < 0)
+		goto bail;
+	cks += ret;
+
+	cks &= 0xFF;
+	ret = qsfp_read(ppd, QSFP_CC_OFFS, &cp->cks1, 1);
+	if (ret < 0)
+		goto bail;
+	if (cks != cp->cks1)
+		qib_dev_porterr(ppd->dd, ppd->port,
+				"QSFP cks1 is %02X, computed %02X\n", cp->cks1,
+				cks);
+
+	/* Second checksum covers 192 to (serial, date, lot) */
+	ret = qsfp_cks(ppd, QSFP_CC_OFFS + 1, QSFP_SN_OFFS);
+	if (ret < 0)
+		goto bail;
+	cks = ret;
+
+	ret = qsfp_read(ppd, QSFP_SN_OFFS, &cp->serial, QSFP_SN_LEN);
+	if (ret < 0)
+		goto bail;
+	for (idx = 0; idx < QSFP_SN_LEN; ++idx)
+		cks += cp->serial[idx];
+
+	ret = qsfp_read(ppd, QSFP_DATE_OFFS, &cp->date, QSFP_DATE_LEN);
+	if (ret < 0)
+		goto bail;
+	for (idx = 0; idx < QSFP_DATE_LEN; ++idx)
+		cks += cp->date[idx];
+
+	ret = qsfp_read(ppd, QSFP_LOT_OFFS, &cp->lot, QSFP_LOT_LEN);
+	if (ret < 0)
+		goto bail;
+	for (idx = 0; idx < QSFP_LOT_LEN; ++idx)
+		cks += cp->lot[idx];
+
+	ret = qsfp_cks(ppd, QSFP_LOT_OFFS + QSFP_LOT_LEN, QSFP_CC_EXT_OFFS);
+	if (ret < 0)
+		goto bail;
+	cks += ret;
+
+	ret = qsfp_read(ppd, QSFP_CC_EXT_OFFS, &cp->cks2, 1);
+	if (ret < 0)
+		goto bail;
+	cks &= 0xFF;
+	if (cks != cp->cks2)
+		qib_dev_porterr(ppd->dd, ppd->port,
+				"QSFP cks2 is %02X, computed %02X\n", cp->cks2,
+				cks);
+	return 0;
+
+bail:
+	cp->id = 0;
+	return ret;
+}
+
+const char * const qib_qsfp_devtech[16] = {
+	"850nm VCSEL", "1310nm VCSEL", "1550nm VCSEL", "1310nm FP",
+	"1310nm DFB", "1550nm DFB", "1310nm EML", "1550nm EML",
+	"Cu Misc", "1490nm DFB", "Cu NoEq", "Cu Eq",
+	"Undef", "Cu Active BothEq", "Cu FarEq", "Cu NearEq"
+};
+
+#define QSFP_DUMP_CHUNK 16 /* Holds longest string */
+#define QSFP_DEFAULT_HDR_CNT 224
+
+static const char *pwr_codes = "1.5W2.0W2.5W3.5W";
+
+int qib_qsfp_mod_present(struct qib_pportdata *ppd)
+{
+	u32 mask;
+	int ret;
+
+	mask = QSFP_GPIO_MOD_PRS_N <<
+		(ppd->hw_pidx * QSFP_GPIO_PORT2_SHIFT);
+	ret = ppd->dd->f_gpio_mod(ppd->dd, 0, 0, 0);
+
+	return !((ret & mask) >>
+		 ((ppd->hw_pidx * QSFP_GPIO_PORT2_SHIFT) + 3));
+}
+
+/*
+ * Initialize structures that control access to QSFP. Called once per port
+ * on cards that support QSFP.
+ */
+void qib_qsfp_init(struct qib_qsfp_data *qd,
+		   void (*fevent)(struct work_struct *))
+{
+	u32 mask, highs;
+
+	struct qib_devdata *dd = qd->ppd->dd;
+
+	/* Initialize work struct for later QSFP events */
+	INIT_WORK(&qd->work, fevent);
+
+	/*
+	 * Later, we may want more validation. For now, just set up pins and
+	 * blip reset. If module is present, call qib_refresh_qsfp_cache(),
+	 * to do further init.
+	 */
+	mask = QSFP_GPIO_MOD_SEL_N | QSFP_GPIO_MOD_RST_N | QSFP_GPIO_LP_MODE;
+	highs = mask - QSFP_GPIO_MOD_RST_N;
+	if (qd->ppd->hw_pidx) {
+		mask <<= QSFP_GPIO_PORT2_SHIFT;
+		highs <<= QSFP_GPIO_PORT2_SHIFT;
+	}
+	dd->f_gpio_mod(dd, highs, mask, mask);
+	udelay(20); /* Generous RST dwell */
+
+	dd->f_gpio_mod(dd, mask, mask, mask);
+	return;
+}
+
+void qib_qsfp_deinit(struct qib_qsfp_data *qd)
+{
+	/*
+	 * There is nothing to do here for now.  our work is scheduled
+	 * with queue_work(), and flush_workqueue() from remove_one
+	 * will block until all work setup with queue_work()
+	 * completes.
+	 */
+}
+
+int qib_qsfp_dump(struct qib_pportdata *ppd, char *buf, int len)
+{
+	struct qib_qsfp_cache cd;
+	u8 bin_buff[QSFP_DUMP_CHUNK];
+	char lenstr[6];
+	int sofar, ret;
+	int bidx = 0;
+
+	sofar = 0;
+	ret = qib_refresh_qsfp_cache(ppd, &cd);
+	if (ret < 0)
+		goto bail;
+
+	lenstr[0] = ' ';
+	lenstr[1] = '\0';
+	if (QSFP_IS_CU(cd.tech))
+		sprintf(lenstr, "%dM ", cd.len);
+
+	sofar += scnprintf(buf + sofar, len - sofar, "PWR:%.3sW\n", pwr_codes +
+			   (QSFP_PWR(cd.pwr) * 4));
+
+	sofar += scnprintf(buf + sofar, len - sofar, "TECH:%s%s\n", lenstr,
+			   qib_qsfp_devtech[cd.tech >> 4]);
+
+	sofar += scnprintf(buf + sofar, len - sofar, "Vendor:%.*s\n",
+			   QSFP_VEND_LEN, cd.vendor);
+
+	sofar += scnprintf(buf + sofar, len - sofar, "OUI:%06X\n",
+			   QSFP_OUI(cd.oui));
+
+	sofar += scnprintf(buf + sofar, len - sofar, "Part#:%.*s\n",
+			   QSFP_PN_LEN, cd.partnum);
+	sofar += scnprintf(buf + sofar, len - sofar, "Rev:%.*s\n",
+			   QSFP_REV_LEN, cd.rev);
+	if (QSFP_IS_CU(cd.tech))
+		sofar += scnprintf(buf + sofar, len - sofar, "Atten:%d, %d\n",
+				   QSFP_ATTEN_SDR(cd.atten),
+				   QSFP_ATTEN_DDR(cd.atten));
+	sofar += scnprintf(buf + sofar, len - sofar, "Serial:%.*s\n",
+			   QSFP_SN_LEN, cd.serial);
+	sofar += scnprintf(buf + sofar, len - sofar, "Date:%.*s\n",
+			   QSFP_DATE_LEN, cd.date);
+	sofar += scnprintf(buf + sofar, len - sofar, "Lot:%.*s\n",
+			   QSFP_LOT_LEN, cd.date);
+
+	while (bidx < QSFP_DEFAULT_HDR_CNT) {
+		int iidx;
+		ret = qsfp_read(ppd, bidx, bin_buff, QSFP_DUMP_CHUNK);
+		if (ret < 0)
+			goto bail;
+		for (iidx = 0; iidx < ret; ++iidx) {
+			sofar += scnprintf(buf + sofar, len-sofar, " %02X",
+				bin_buff[iidx]);
+		}
+		sofar += scnprintf(buf + sofar, len - sofar, "\n");
+		bidx += QSFP_DUMP_CHUNK;
+	}
+	ret = sofar;
+bail:
+	return ret;
+}
diff --git a/drivers/infiniband/hw/qib/qib_qsfp.h b/drivers/infiniband/hw/qib/qib_qsfp.h
new file mode 100644
index 0000000..91908f5
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_qsfp.h
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/* QSFP support common definitions, for ib_qib driver */
+
+#define QSFP_DEV 0xA0
+#define QSFP_PWR_LAG_MSEC 2000
+#define QSFP_MODPRS_LAG_MSEC 20
+
+/*
+ * Below are masks for various QSFP signals, for Port 1.
+ * Port2 equivalents are shifted by QSFP_GPIO_PORT2_SHIFT.
+ * _N means asserted low
+ */
+#define QSFP_GPIO_MOD_SEL_N (4)
+#define QSFP_GPIO_MOD_PRS_N (8)
+#define QSFP_GPIO_INT_N (0x10)
+#define QSFP_GPIO_MOD_RST_N (0x20)
+#define QSFP_GPIO_LP_MODE (0x40)
+#define QSFP_GPIO_PORT2_SHIFT 5
+
+#define QSFP_PAGESIZE 128
+/* Defined fields that QLogic requires of qualified cables */
+/* Byte 0 is Identifier, not checked */
+/* Byte 1 is reserved "status MSB" */
+/* Byte 2 is "status LSB" We only care that D2 "Flat Mem" is set. */
+/*
+ * Rest of first 128 not used, although 127 is reserved for page select
+ * if module is not "Flat memory".
+ */
+/* Byte 128 is Identifier: must be 0x0c for QSFP, or 0x0d for QSFP+ */
+#define QSFP_MOD_ID_OFFS 128
+/*
+ * Byte 129 is "Extended Identifier". We only care about D7,D6: Power class
+ *  0:1.5W, 1:2.0W, 2:2.5W, 3:3.5W
+ */
+#define QSFP_MOD_PWR_OFFS 129
+/* Byte 130 is Connector type. Not QLogic req'd */
+/* Bytes 131..138 are Transceiver types, bit maps for various tech, none IB */
+/* Byte 139 is encoding. code 0x01 is 8b10b. Not QLogic req'd */
+/* byte 140 is nominal bit-rate, in units of 100Mbits/sec Not QLogic req'd */
+/* Byte 141 is Extended Rate Select. Not QLogic req'd */
+/* Bytes 142..145 are lengths for various fiber types. Not QLogic req'd */
+/* Byte 146 is length for Copper. Units of 1 meter */
+#define QSFP_MOD_LEN_OFFS 146
+/*
+ * Byte 147 is Device technology. D0..3 not Qlogc req'd
+ * D4..7 select from 15 choices, translated by table:
+ */
+#define QSFP_MOD_TECH_OFFS 147
+extern const char *const qib_qsfp_devtech[16];
+/* Active Equalization includes fiber, copper full EQ, and copper near Eq */
+#define QSFP_IS_ACTIVE(tech) ((0xA2FF >> ((tech) >> 4)) & 1)
+/* Active Equalization includes fiber, copper full EQ, and copper far Eq */
+#define QSFP_IS_ACTIVE_FAR(tech) ((0x32FF >> ((tech) >> 4)) & 1)
+/* Attenuation should be valid for copper other than full/near Eq */
+#define QSFP_HAS_ATTEN(tech) ((0x4D00 >> ((tech) >> 4)) & 1)
+/* Length is only valid if technology is "copper" */
+#define QSFP_IS_CU(tech) ((0xED00 >> ((tech) >> 4)) & 1)
+#define QSFP_TECH_1490 9
+
+#define QSFP_OUI(oui) (((unsigned)oui[0] << 16) | ((unsigned)oui[1] << 8) | \
+			oui[2])
+#define QSFP_OUI_AMPHENOL 0x415048
+#define QSFP_OUI_FINISAR  0x009065
+#define QSFP_OUI_GORE     0x002177
+
+/* Bytes 148..163 are Vendor Name, Left-justified Blank-filled */
+#define QSFP_VEND_OFFS 148
+#define QSFP_VEND_LEN 16
+/* Byte 164 is IB Extended tranceiver codes Bits D0..3 are SDR,DDR,QDR,EDR */
+#define QSFP_IBXCV_OFFS 164
+/* Bytes 165..167 are Vendor OUI number */
+#define QSFP_VOUI_OFFS 165
+#define QSFP_VOUI_LEN 3
+/* Bytes 168..183 are Vendor Part Number, string */
+#define QSFP_PN_OFFS 168
+#define QSFP_PN_LEN 16
+/* Bytes 184,185 are Vendor Rev. Left Justified, Blank-filled */
+#define QSFP_REV_OFFS 184
+#define QSFP_REV_LEN 2
+/*
+ * Bytes 186,187 are Wavelength, if Optical. Not Qlogic req'd
+ *  If copper, they are attenuation in dB:
+ * Byte 186 is at 2.5Gb/sec (SDR), Byte 187 at 5.0Gb/sec (DDR)
+ */
+#define QSFP_ATTEN_OFFS 186
+#define QSFP_ATTEN_LEN 2
+/* Bytes 188,189 are Wavelength tolerance, not QLogic req'd */
+/* Byte 190 is Max Case Temp. Not QLogic req'd */
+/* Byte 191 is LSB of sum of bytes 128..190. Not QLogic req'd */
+#define QSFP_CC_OFFS 191
+/* Bytes 192..195 are Options implemented in qsfp. Not Qlogic req'd */
+/* Bytes 196..211 are Serial Number, String */
+#define QSFP_SN_OFFS 196
+#define QSFP_SN_LEN 16
+/* Bytes 212..219 are date-code YYMMDD (MM==1 for Jan) */
+#define QSFP_DATE_OFFS 212
+#define QSFP_DATE_LEN 6
+/* Bytes 218,219 are optional lot-code, string */
+#define QSFP_LOT_OFFS 218
+#define QSFP_LOT_LEN 2
+/* Bytes 220, 221 indicate monitoring options, Not QLogic req'd */
+/* Byte 223 is LSB of sum of bytes 192..222 */
+#define QSFP_CC_EXT_OFFS 223
+
+/*
+ * struct qib_qsfp_data encapsulates state of QSFP device for one port.
+ * it will be part of port-chip-specific data if a board supports QSFP.
+ *
+ * Since multiple board-types use QSFP, and their pport_data structs
+ * differ (in the chip-specific section), we need a pointer to its head.
+ *
+ * Avoiding premature optimization, we will have one work_struct per port,
+ * and let the (increasingly inaccurately named) eep_lock arbitrate
+ * access to common resources.
+ *
+ */
+
+/*
+ * Hold the parts of the onboard EEPROM that we care about, so we aren't
+ * coonstantly bit-boffing
+ */
+struct qib_qsfp_cache {
+	u8 id;	/* must be 0x0C or 0x0D; 0 indicates invalid EEPROM read */
+	u8 pwr; /* in D6,7 */
+	u8 len;	/* in meters, Cu only */
+	u8 tech;
+	char vendor[QSFP_VEND_LEN];
+	u8 xt_xcv; /* Ext. tranceiver codes, 4 lsbs are IB speed supported */
+	u8 oui[QSFP_VOUI_LEN];
+	u8 partnum[QSFP_PN_LEN];
+	u8 rev[QSFP_REV_LEN];
+	u8 atten[QSFP_ATTEN_LEN];
+	u8 cks1;	/* Checksum of bytes 128..190 */
+	u8 serial[QSFP_SN_LEN];
+	u8 date[QSFP_DATE_LEN];
+	u8 lot[QSFP_LOT_LEN];
+	u8 cks2;	/* Checsum of bytes 192..222 */
+};
+
+#define QSFP_PWR(pbyte) (((pbyte) >> 6) & 3)
+#define QSFP_ATTEN_SDR(attenarray) (attenarray[0])
+#define QSFP_ATTEN_DDR(attenarray) (attenarray[1])
+
+struct qib_qsfp_data {
+	/* Helps to find our way */
+	struct qib_pportdata *ppd;
+	struct work_struct work;
+	struct qib_qsfp_cache cache;
+	unsigned long t_insert;
+	u8 modpresent;
+};
+
+extern int qib_refresh_qsfp_cache(struct qib_pportdata *ppd,
+				  struct qib_qsfp_cache *cp);
+extern int qib_qsfp_mod_present(struct qib_pportdata *ppd);
+extern void qib_qsfp_init(struct qib_qsfp_data *qd,
+			  void (*fevent)(struct work_struct *));
+extern void qib_qsfp_deinit(struct qib_qsfp_data *qd);
diff --git a/drivers/infiniband/hw/qib/qib_rc.c b/drivers/infiniband/hw/qib/qib_rc.c
new file mode 100644
index 0000000..2f25018
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_rc.c
@@ -0,0 +1,2290 @@
+/*
+ * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/io.h>
+
+#include "qib.h"
+
+/* cut down ridiculously long IB macro names */
+#define OP(x) IB_OPCODE_RC_##x
+
+static void rc_timeout(unsigned long arg);
+
+static u32 restart_sge(struct qib_sge_state *ss, struct qib_swqe *wqe,
+		       u32 psn, u32 pmtu)
+{
+	u32 len;
+
+	len = ((psn - wqe->psn) & QIB_PSN_MASK) * pmtu;
+	ss->sge = wqe->sg_list[0];
+	ss->sg_list = wqe->sg_list + 1;
+	ss->num_sge = wqe->wr.num_sge;
+	ss->total_len = wqe->length;
+	qib_skip_sge(ss, len, 0);
+	return wqe->length - len;
+}
+
+static void start_timer(struct qib_qp *qp)
+{
+	qp->s_flags |= QIB_S_TIMER;
+	qp->s_timer.function = rc_timeout;
+	/* 4.096 usec. * (1 << qp->timeout) */
+	qp->s_timer.expires = jiffies + qp->timeout_jiffies;
+	add_timer(&qp->s_timer);
+}
+
+/**
+ * qib_make_rc_ack - construct a response packet (ACK, NAK, or RDMA read)
+ * @dev: the device for this QP
+ * @qp: a pointer to the QP
+ * @ohdr: a pointer to the IB header being constructed
+ * @pmtu: the path MTU
+ *
+ * Return 1 if constructed; otherwise, return 0.
+ * Note that we are in the responder's side of the QP context.
+ * Note the QP s_lock must be held.
+ */
+static int qib_make_rc_ack(struct qib_ibdev *dev, struct qib_qp *qp,
+			   struct qib_other_headers *ohdr, u32 pmtu)
+{
+	struct qib_ack_entry *e;
+	u32 hwords;
+	u32 len;
+	u32 bth0;
+	u32 bth2;
+
+	/* Don't send an ACK if we aren't supposed to. */
+	if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK))
+		goto bail;
+
+	/* header size in 32-bit words LRH+BTH = (8+12)/4. */
+	hwords = 5;
+
+	switch (qp->s_ack_state) {
+	case OP(RDMA_READ_RESPONSE_LAST):
+	case OP(RDMA_READ_RESPONSE_ONLY):
+		e = &qp->s_ack_queue[qp->s_tail_ack_queue];
+		if (e->rdma_sge.mr) {
+			qib_put_mr(e->rdma_sge.mr);
+			e->rdma_sge.mr = NULL;
+		}
+		/* FALLTHROUGH */
+	case OP(ATOMIC_ACKNOWLEDGE):
+		/*
+		 * We can increment the tail pointer now that the last
+		 * response has been sent instead of only being
+		 * constructed.
+		 */
+		if (++qp->s_tail_ack_queue > QIB_MAX_RDMA_ATOMIC)
+			qp->s_tail_ack_queue = 0;
+		/* FALLTHROUGH */
+	case OP(SEND_ONLY):
+	case OP(ACKNOWLEDGE):
+		/* Check for no next entry in the queue. */
+		if (qp->r_head_ack_queue == qp->s_tail_ack_queue) {
+			if (qp->s_flags & QIB_S_ACK_PENDING)
+				goto normal;
+			goto bail;
+		}
+
+		e = &qp->s_ack_queue[qp->s_tail_ack_queue];
+		if (e->opcode == OP(RDMA_READ_REQUEST)) {
+			/*
+			 * If a RDMA read response is being resent and
+			 * we haven't seen the duplicate request yet,
+			 * then stop sending the remaining responses the
+			 * responder has seen until the requester resends it.
+			 */
+			len = e->rdma_sge.sge_length;
+			if (len && !e->rdma_sge.mr) {
+				qp->s_tail_ack_queue = qp->r_head_ack_queue;
+				goto bail;
+			}
+			/* Copy SGE state in case we need to resend */
+			qp->s_rdma_mr = e->rdma_sge.mr;
+			if (qp->s_rdma_mr)
+				qib_get_mr(qp->s_rdma_mr);
+			qp->s_ack_rdma_sge.sge = e->rdma_sge;
+			qp->s_ack_rdma_sge.num_sge = 1;
+			qp->s_cur_sge = &qp->s_ack_rdma_sge;
+			if (len > pmtu) {
+				len = pmtu;
+				qp->s_ack_state = OP(RDMA_READ_RESPONSE_FIRST);
+			} else {
+				qp->s_ack_state = OP(RDMA_READ_RESPONSE_ONLY);
+				e->sent = 1;
+			}
+			ohdr->u.aeth = qib_compute_aeth(qp);
+			hwords++;
+			qp->s_ack_rdma_psn = e->psn;
+			bth2 = qp->s_ack_rdma_psn++ & QIB_PSN_MASK;
+		} else {
+			/* COMPARE_SWAP or FETCH_ADD */
+			qp->s_cur_sge = NULL;
+			len = 0;
+			qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE);
+			ohdr->u.at.aeth = qib_compute_aeth(qp);
+			ohdr->u.at.atomic_ack_eth[0] =
+				cpu_to_be32(e->atomic_data >> 32);
+			ohdr->u.at.atomic_ack_eth[1] =
+				cpu_to_be32(e->atomic_data);
+			hwords += sizeof(ohdr->u.at) / sizeof(u32);
+			bth2 = e->psn & QIB_PSN_MASK;
+			e->sent = 1;
+		}
+		bth0 = qp->s_ack_state << 24;
+		break;
+
+	case OP(RDMA_READ_RESPONSE_FIRST):
+		qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE);
+		/* FALLTHROUGH */
+	case OP(RDMA_READ_RESPONSE_MIDDLE):
+		qp->s_cur_sge = &qp->s_ack_rdma_sge;
+		qp->s_rdma_mr = qp->s_ack_rdma_sge.sge.mr;
+		if (qp->s_rdma_mr)
+			qib_get_mr(qp->s_rdma_mr);
+		len = qp->s_ack_rdma_sge.sge.sge_length;
+		if (len > pmtu)
+			len = pmtu;
+		else {
+			ohdr->u.aeth = qib_compute_aeth(qp);
+			hwords++;
+			qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
+			e = &qp->s_ack_queue[qp->s_tail_ack_queue];
+			e->sent = 1;
+		}
+		bth0 = qp->s_ack_state << 24;
+		bth2 = qp->s_ack_rdma_psn++ & QIB_PSN_MASK;
+		break;
+
+	default:
+normal:
+		/*
+		 * Send a regular ACK.
+		 * Set the s_ack_state so we wait until after sending
+		 * the ACK before setting s_ack_state to ACKNOWLEDGE
+		 * (see above).
+		 */
+		qp->s_ack_state = OP(SEND_ONLY);
+		qp->s_flags &= ~QIB_S_ACK_PENDING;
+		qp->s_cur_sge = NULL;
+		if (qp->s_nak_state)
+			ohdr->u.aeth =
+				cpu_to_be32((qp->r_msn & QIB_MSN_MASK) |
+					    (qp->s_nak_state <<
+					     QIB_AETH_CREDIT_SHIFT));
+		else
+			ohdr->u.aeth = qib_compute_aeth(qp);
+		hwords++;
+		len = 0;
+		bth0 = OP(ACKNOWLEDGE) << 24;
+		bth2 = qp->s_ack_psn & QIB_PSN_MASK;
+	}
+	qp->s_rdma_ack_cnt++;
+	qp->s_hdrwords = hwords;
+	qp->s_cur_size = len;
+	qib_make_ruc_header(qp, ohdr, bth0, bth2);
+	return 1;
+
+bail:
+	qp->s_ack_state = OP(ACKNOWLEDGE);
+	qp->s_flags &= ~(QIB_S_RESP_PENDING | QIB_S_ACK_PENDING);
+	return 0;
+}
+
+/**
+ * qib_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC)
+ * @qp: a pointer to the QP
+ *
+ * Return 1 if constructed; otherwise, return 0.
+ */
+int qib_make_rc_req(struct qib_qp *qp)
+{
+	struct qib_ibdev *dev = to_idev(qp->ibqp.device);
+	struct qib_other_headers *ohdr;
+	struct qib_sge_state *ss;
+	struct qib_swqe *wqe;
+	u32 hwords;
+	u32 len;
+	u32 bth0;
+	u32 bth2;
+	u32 pmtu = qp->pmtu;
+	char newreq;
+	unsigned long flags;
+	int ret = 0;
+	int delta;
+
+	ohdr = &qp->s_hdr->u.oth;
+	if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
+		ohdr = &qp->s_hdr->u.l.oth;
+
+	/*
+	 * The lock is needed to synchronize between the sending tasklet,
+	 * the receive interrupt handler, and timeout resends.
+	 */
+	spin_lock_irqsave(&qp->s_lock, flags);
+
+	/* Sending responses has higher priority over sending requests. */
+	if ((qp->s_flags & QIB_S_RESP_PENDING) &&
+	    qib_make_rc_ack(dev, qp, ohdr, pmtu))
+		goto done;
+
+	if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_SEND_OK)) {
+		if (!(ib_qib_state_ops[qp->state] & QIB_FLUSH_SEND))
+			goto bail;
+		/* We are in the error state, flush the work request. */
+		if (qp->s_last == qp->s_head)
+			goto bail;
+		/* If DMAs are in progress, we can't flush immediately. */
+		if (atomic_read(&qp->s_dma_busy)) {
+			qp->s_flags |= QIB_S_WAIT_DMA;
+			goto bail;
+		}
+		wqe = get_swqe_ptr(qp, qp->s_last);
+		qib_send_complete(qp, wqe, qp->s_last != qp->s_acked ?
+			IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR);
+		/* will get called again */
+		goto done;
+	}
+
+	if (qp->s_flags & (QIB_S_WAIT_RNR | QIB_S_WAIT_ACK))
+		goto bail;
+
+	if (qib_cmp24(qp->s_psn, qp->s_sending_hpsn) <= 0) {
+		if (qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) <= 0) {
+			qp->s_flags |= QIB_S_WAIT_PSN;
+			goto bail;
+		}
+		qp->s_sending_psn = qp->s_psn;
+		qp->s_sending_hpsn = qp->s_psn - 1;
+	}
+
+	/* header size in 32-bit words LRH+BTH = (8+12)/4. */
+	hwords = 5;
+	bth0 = 0;
+
+	/* Send a request. */
+	wqe = get_swqe_ptr(qp, qp->s_cur);
+	switch (qp->s_state) {
+	default:
+		if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_NEXT_SEND_OK))
+			goto bail;
+		/*
+		 * Resend an old request or start a new one.
+		 *
+		 * We keep track of the current SWQE so that
+		 * we don't reset the "furthest progress" state
+		 * if we need to back up.
+		 */
+		newreq = 0;
+		if (qp->s_cur == qp->s_tail) {
+			/* Check if send work queue is empty. */
+			if (qp->s_tail == qp->s_head)
+				goto bail;
+			/*
+			 * If a fence is requested, wait for previous
+			 * RDMA read and atomic operations to finish.
+			 */
+			if ((wqe->wr.send_flags & IB_SEND_FENCE) &&
+			    qp->s_num_rd_atomic) {
+				qp->s_flags |= QIB_S_WAIT_FENCE;
+				goto bail;
+			}
+			wqe->psn = qp->s_next_psn;
+			newreq = 1;
+		}
+		/*
+		 * Note that we have to be careful not to modify the
+		 * original work request since we may need to resend
+		 * it.
+		 */
+		len = wqe->length;
+		ss = &qp->s_sge;
+		bth2 = qp->s_psn & QIB_PSN_MASK;
+		switch (wqe->wr.opcode) {
+		case IB_WR_SEND:
+		case IB_WR_SEND_WITH_IMM:
+			/* If no credit, return. */
+			if (!(qp->s_flags & QIB_S_UNLIMITED_CREDIT) &&
+			    qib_cmp24(wqe->ssn, qp->s_lsn + 1) > 0) {
+				qp->s_flags |= QIB_S_WAIT_SSN_CREDIT;
+				goto bail;
+			}
+			wqe->lpsn = wqe->psn;
+			if (len > pmtu) {
+				wqe->lpsn += (len - 1) / pmtu;
+				qp->s_state = OP(SEND_FIRST);
+				len = pmtu;
+				break;
+			}
+			if (wqe->wr.opcode == IB_WR_SEND)
+				qp->s_state = OP(SEND_ONLY);
+			else {
+				qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE);
+				/* Immediate data comes after the BTH */
+				ohdr->u.imm_data = wqe->wr.ex.imm_data;
+				hwords += 1;
+			}
+			if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+				bth0 |= IB_BTH_SOLICITED;
+			bth2 |= IB_BTH_REQ_ACK;
+			if (++qp->s_cur == qp->s_size)
+				qp->s_cur = 0;
+			break;
+
+		case IB_WR_RDMA_WRITE:
+			if (newreq && !(qp->s_flags & QIB_S_UNLIMITED_CREDIT))
+				qp->s_lsn++;
+			/* FALLTHROUGH */
+		case IB_WR_RDMA_WRITE_WITH_IMM:
+			/* If no credit, return. */
+			if (!(qp->s_flags & QIB_S_UNLIMITED_CREDIT) &&
+			    qib_cmp24(wqe->ssn, qp->s_lsn + 1) > 0) {
+				qp->s_flags |= QIB_S_WAIT_SSN_CREDIT;
+				goto bail;
+			}
+			ohdr->u.rc.reth.vaddr =
+				cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
+			ohdr->u.rc.reth.rkey =
+				cpu_to_be32(wqe->wr.wr.rdma.rkey);
+			ohdr->u.rc.reth.length = cpu_to_be32(len);
+			hwords += sizeof(struct ib_reth) / sizeof(u32);
+			wqe->lpsn = wqe->psn;
+			if (len > pmtu) {
+				wqe->lpsn += (len - 1) / pmtu;
+				qp->s_state = OP(RDMA_WRITE_FIRST);
+				len = pmtu;
+				break;
+			}
+			if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
+				qp->s_state = OP(RDMA_WRITE_ONLY);
+			else {
+				qp->s_state =
+					OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE);
+				/* Immediate data comes after RETH */
+				ohdr->u.rc.imm_data = wqe->wr.ex.imm_data;
+				hwords += 1;
+				if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+					bth0 |= IB_BTH_SOLICITED;
+			}
+			bth2 |= IB_BTH_REQ_ACK;
+			if (++qp->s_cur == qp->s_size)
+				qp->s_cur = 0;
+			break;
+
+		case IB_WR_RDMA_READ:
+			/*
+			 * Don't allow more operations to be started
+			 * than the QP limits allow.
+			 */
+			if (newreq) {
+				if (qp->s_num_rd_atomic >=
+				    qp->s_max_rd_atomic) {
+					qp->s_flags |= QIB_S_WAIT_RDMAR;
+					goto bail;
+				}
+				qp->s_num_rd_atomic++;
+				if (!(qp->s_flags & QIB_S_UNLIMITED_CREDIT))
+					qp->s_lsn++;
+				/*
+				 * Adjust s_next_psn to count the
+				 * expected number of responses.
+				 */
+				if (len > pmtu)
+					qp->s_next_psn += (len - 1) / pmtu;
+				wqe->lpsn = qp->s_next_psn++;
+			}
+			ohdr->u.rc.reth.vaddr =
+				cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
+			ohdr->u.rc.reth.rkey =
+				cpu_to_be32(wqe->wr.wr.rdma.rkey);
+			ohdr->u.rc.reth.length = cpu_to_be32(len);
+			qp->s_state = OP(RDMA_READ_REQUEST);
+			hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
+			ss = NULL;
+			len = 0;
+			bth2 |= IB_BTH_REQ_ACK;
+			if (++qp->s_cur == qp->s_size)
+				qp->s_cur = 0;
+			break;
+
+		case IB_WR_ATOMIC_CMP_AND_SWP:
+		case IB_WR_ATOMIC_FETCH_AND_ADD:
+			/*
+			 * Don't allow more operations to be started
+			 * than the QP limits allow.
+			 */
+			if (newreq) {
+				if (qp->s_num_rd_atomic >=
+				    qp->s_max_rd_atomic) {
+					qp->s_flags |= QIB_S_WAIT_RDMAR;
+					goto bail;
+				}
+				qp->s_num_rd_atomic++;
+				if (!(qp->s_flags & QIB_S_UNLIMITED_CREDIT))
+					qp->s_lsn++;
+				wqe->lpsn = wqe->psn;
+			}
+			if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
+				qp->s_state = OP(COMPARE_SWAP);
+				ohdr->u.atomic_eth.swap_data = cpu_to_be64(
+					wqe->wr.wr.atomic.swap);
+				ohdr->u.atomic_eth.compare_data = cpu_to_be64(
+					wqe->wr.wr.atomic.compare_add);
+			} else {
+				qp->s_state = OP(FETCH_ADD);
+				ohdr->u.atomic_eth.swap_data = cpu_to_be64(
+					wqe->wr.wr.atomic.compare_add);
+				ohdr->u.atomic_eth.compare_data = 0;
+			}
+			ohdr->u.atomic_eth.vaddr[0] = cpu_to_be32(
+				wqe->wr.wr.atomic.remote_addr >> 32);
+			ohdr->u.atomic_eth.vaddr[1] = cpu_to_be32(
+				wqe->wr.wr.atomic.remote_addr);
+			ohdr->u.atomic_eth.rkey = cpu_to_be32(
+				wqe->wr.wr.atomic.rkey);
+			hwords += sizeof(struct ib_atomic_eth) / sizeof(u32);
+			ss = NULL;
+			len = 0;
+			bth2 |= IB_BTH_REQ_ACK;
+			if (++qp->s_cur == qp->s_size)
+				qp->s_cur = 0;
+			break;
+
+		default:
+			goto bail;
+		}
+		qp->s_sge.sge = wqe->sg_list[0];
+		qp->s_sge.sg_list = wqe->sg_list + 1;
+		qp->s_sge.num_sge = wqe->wr.num_sge;
+		qp->s_sge.total_len = wqe->length;
+		qp->s_len = wqe->length;
+		if (newreq) {
+			qp->s_tail++;
+			if (qp->s_tail >= qp->s_size)
+				qp->s_tail = 0;
+		}
+		if (wqe->wr.opcode == IB_WR_RDMA_READ)
+			qp->s_psn = wqe->lpsn + 1;
+		else {
+			qp->s_psn++;
+			if (qib_cmp24(qp->s_psn, qp->s_next_psn) > 0)
+				qp->s_next_psn = qp->s_psn;
+		}
+		break;
+
+	case OP(RDMA_READ_RESPONSE_FIRST):
+		/*
+		 * qp->s_state is normally set to the opcode of the
+		 * last packet constructed for new requests and therefore
+		 * is never set to RDMA read response.
+		 * RDMA_READ_RESPONSE_FIRST is used by the ACK processing
+		 * thread to indicate a SEND needs to be restarted from an
+		 * earlier PSN without interferring with the sending thread.
+		 * See qib_restart_rc().
+		 */
+		qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu);
+		/* FALLTHROUGH */
+	case OP(SEND_FIRST):
+		qp->s_state = OP(SEND_MIDDLE);
+		/* FALLTHROUGH */
+	case OP(SEND_MIDDLE):
+		bth2 = qp->s_psn++ & QIB_PSN_MASK;
+		if (qib_cmp24(qp->s_psn, qp->s_next_psn) > 0)
+			qp->s_next_psn = qp->s_psn;
+		ss = &qp->s_sge;
+		len = qp->s_len;
+		if (len > pmtu) {
+			len = pmtu;
+			break;
+		}
+		if (wqe->wr.opcode == IB_WR_SEND)
+			qp->s_state = OP(SEND_LAST);
+		else {
+			qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);
+			/* Immediate data comes after the BTH */
+			ohdr->u.imm_data = wqe->wr.ex.imm_data;
+			hwords += 1;
+		}
+		if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+			bth0 |= IB_BTH_SOLICITED;
+		bth2 |= IB_BTH_REQ_ACK;
+		qp->s_cur++;
+		if (qp->s_cur >= qp->s_size)
+			qp->s_cur = 0;
+		break;
+
+	case OP(RDMA_READ_RESPONSE_LAST):
+		/*
+		 * qp->s_state is normally set to the opcode of the
+		 * last packet constructed for new requests and therefore
+		 * is never set to RDMA read response.
+		 * RDMA_READ_RESPONSE_LAST is used by the ACK processing
+		 * thread to indicate a RDMA write needs to be restarted from
+		 * an earlier PSN without interferring with the sending thread.
+		 * See qib_restart_rc().
+		 */
+		qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu);
+		/* FALLTHROUGH */
+	case OP(RDMA_WRITE_FIRST):
+		qp->s_state = OP(RDMA_WRITE_MIDDLE);
+		/* FALLTHROUGH */
+	case OP(RDMA_WRITE_MIDDLE):
+		bth2 = qp->s_psn++ & QIB_PSN_MASK;
+		if (qib_cmp24(qp->s_psn, qp->s_next_psn) > 0)
+			qp->s_next_psn = qp->s_psn;
+		ss = &qp->s_sge;
+		len = qp->s_len;
+		if (len > pmtu) {
+			len = pmtu;
+			break;
+		}
+		if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
+			qp->s_state = OP(RDMA_WRITE_LAST);
+		else {
+			qp->s_state = OP(RDMA_WRITE_LAST_WITH_IMMEDIATE);
+			/* Immediate data comes after the BTH */
+			ohdr->u.imm_data = wqe->wr.ex.imm_data;
+			hwords += 1;
+			if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+				bth0 |= IB_BTH_SOLICITED;
+		}
+		bth2 |= IB_BTH_REQ_ACK;
+		qp->s_cur++;
+		if (qp->s_cur >= qp->s_size)
+			qp->s_cur = 0;
+		break;
+
+	case OP(RDMA_READ_RESPONSE_MIDDLE):
+		/*
+		 * qp->s_state is normally set to the opcode of the
+		 * last packet constructed for new requests and therefore
+		 * is never set to RDMA read response.
+		 * RDMA_READ_RESPONSE_MIDDLE is used by the ACK processing
+		 * thread to indicate a RDMA read needs to be restarted from
+		 * an earlier PSN without interferring with the sending thread.
+		 * See qib_restart_rc().
+		 */
+		len = ((qp->s_psn - wqe->psn) & QIB_PSN_MASK) * pmtu;
+		ohdr->u.rc.reth.vaddr =
+			cpu_to_be64(wqe->wr.wr.rdma.remote_addr + len);
+		ohdr->u.rc.reth.rkey =
+			cpu_to_be32(wqe->wr.wr.rdma.rkey);
+		ohdr->u.rc.reth.length = cpu_to_be32(wqe->length - len);
+		qp->s_state = OP(RDMA_READ_REQUEST);
+		hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
+		bth2 = (qp->s_psn & QIB_PSN_MASK) | IB_BTH_REQ_ACK;
+		qp->s_psn = wqe->lpsn + 1;
+		ss = NULL;
+		len = 0;
+		qp->s_cur++;
+		if (qp->s_cur == qp->s_size)
+			qp->s_cur = 0;
+		break;
+	}
+	qp->s_sending_hpsn = bth2;
+	delta = (((int) bth2 - (int) wqe->psn) << 8) >> 8;
+	if (delta && delta % QIB_PSN_CREDIT == 0)
+		bth2 |= IB_BTH_REQ_ACK;
+	if (qp->s_flags & QIB_S_SEND_ONE) {
+		qp->s_flags &= ~QIB_S_SEND_ONE;
+		qp->s_flags |= QIB_S_WAIT_ACK;
+		bth2 |= IB_BTH_REQ_ACK;
+	}
+	qp->s_len -= len;
+	qp->s_hdrwords = hwords;
+	qp->s_cur_sge = ss;
+	qp->s_cur_size = len;
+	qib_make_ruc_header(qp, ohdr, bth0 | (qp->s_state << 24), bth2);
+done:
+	ret = 1;
+	goto unlock;
+
+bail:
+	qp->s_flags &= ~QIB_S_BUSY;
+unlock:
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+	return ret;
+}
+
+/**
+ * qib_send_rc_ack - Construct an ACK packet and send it
+ * @qp: a pointer to the QP
+ *
+ * This is called from qib_rc_rcv() and qib_kreceive().
+ * Note that RDMA reads and atomics are handled in the
+ * send side QP state and tasklet.
+ */
+void qib_send_rc_ack(struct qib_qp *qp)
+{
+	struct qib_devdata *dd = dd_from_ibdev(qp->ibqp.device);
+	struct qib_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+	struct qib_pportdata *ppd = ppd_from_ibp(ibp);
+	u64 pbc;
+	u16 lrh0;
+	u32 bth0;
+	u32 hwords;
+	u32 pbufn;
+	u32 __iomem *piobuf;
+	struct qib_ib_header hdr;
+	struct qib_other_headers *ohdr;
+	u32 control;
+	unsigned long flags;
+
+	spin_lock_irqsave(&qp->s_lock, flags);
+
+	if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK))
+		goto unlock;
+
+	/* Don't send ACK or NAK if a RDMA read or atomic is pending. */
+	if ((qp->s_flags & QIB_S_RESP_PENDING) || qp->s_rdma_ack_cnt)
+		goto queue_ack;
+
+	/* Construct the header with s_lock held so APM doesn't change it. */
+	ohdr = &hdr.u.oth;
+	lrh0 = QIB_LRH_BTH;
+	/* header size in 32-bit words LRH+BTH+AETH = (8+12+4)/4. */
+	hwords = 6;
+	if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) {
+		hwords += qib_make_grh(ibp, &hdr.u.l.grh,
+				       &qp->remote_ah_attr.grh, hwords, 0);
+		ohdr = &hdr.u.l.oth;
+		lrh0 = QIB_LRH_GRH;
+	}
+	/* read pkey_index w/o lock (its atomic) */
+	bth0 = qib_get_pkey(ibp, qp->s_pkey_index) | (OP(ACKNOWLEDGE) << 24);
+	if (qp->s_mig_state == IB_MIG_MIGRATED)
+		bth0 |= IB_BTH_MIG_REQ;
+	if (qp->r_nak_state)
+		ohdr->u.aeth = cpu_to_be32((qp->r_msn & QIB_MSN_MASK) |
+					    (qp->r_nak_state <<
+					     QIB_AETH_CREDIT_SHIFT));
+	else
+		ohdr->u.aeth = qib_compute_aeth(qp);
+	lrh0 |= ibp->sl_to_vl[qp->remote_ah_attr.sl] << 12 |
+		qp->remote_ah_attr.sl << 4;
+	hdr.lrh[0] = cpu_to_be16(lrh0);
+	hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid);
+	hdr.lrh[2] = cpu_to_be16(hwords + SIZE_OF_CRC);
+	hdr.lrh[3] = cpu_to_be16(ppd->lid | qp->remote_ah_attr.src_path_bits);
+	ohdr->bth[0] = cpu_to_be32(bth0);
+	ohdr->bth[1] = cpu_to_be32(qp->remote_qpn);
+	ohdr->bth[2] = cpu_to_be32(qp->r_ack_psn & QIB_PSN_MASK);
+
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+
+	/* Don't try to send ACKs if the link isn't ACTIVE */
+	if (!(ppd->lflags & QIBL_LINKACTIVE))
+		goto done;
+
+	control = dd->f_setpbc_control(ppd, hwords + SIZE_OF_CRC,
+				       qp->s_srate, lrh0 >> 12);
+	/* length is + 1 for the control dword */
+	pbc = ((u64) control << 32) | (hwords + 1);
+
+	piobuf = dd->f_getsendbuf(ppd, pbc, &pbufn);
+	if (!piobuf) {
+		/*
+		 * We are out of PIO buffers at the moment.
+		 * Pass responsibility for sending the ACK to the
+		 * send tasklet so that when a PIO buffer becomes
+		 * available, the ACK is sent ahead of other outgoing
+		 * packets.
+		 */
+		spin_lock_irqsave(&qp->s_lock, flags);
+		goto queue_ack;
+	}
+
+	/*
+	 * Write the pbc.
+	 * We have to flush after the PBC for correctness
+	 * on some cpus or WC buffer can be written out of order.
+	 */
+	writeq(pbc, piobuf);
+
+	if (dd->flags & QIB_PIO_FLUSH_WC) {
+		u32 *hdrp = (u32 *) &hdr;
+
+		qib_flush_wc();
+		qib_pio_copy(piobuf + 2, hdrp, hwords - 1);
+		qib_flush_wc();
+		__raw_writel(hdrp[hwords - 1], piobuf + hwords + 1);
+	} else
+		qib_pio_copy(piobuf + 2, (u32 *) &hdr, hwords);
+
+	if (dd->flags & QIB_USE_SPCL_TRIG) {
+		u32 spcl_off = (pbufn >= dd->piobcnt2k) ? 2047 : 1023;
+
+		qib_flush_wc();
+		__raw_writel(0xaebecede, piobuf + spcl_off);
+	}
+
+	qib_flush_wc();
+	qib_sendbuf_done(dd, pbufn);
+
+	this_cpu_inc(ibp->pmastats->n_unicast_xmit);
+	goto done;
+
+queue_ack:
+	if (ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK) {
+		ibp->n_rc_qacks++;
+		qp->s_flags |= QIB_S_ACK_PENDING | QIB_S_RESP_PENDING;
+		qp->s_nak_state = qp->r_nak_state;
+		qp->s_ack_psn = qp->r_ack_psn;
+
+		/* Schedule the send tasklet. */
+		qib_schedule_send(qp);
+	}
+unlock:
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+done:
+	return;
+}
+
+/**
+ * reset_psn - reset the QP state to send starting from PSN
+ * @qp: the QP
+ * @psn: the packet sequence number to restart at
+ *
+ * This is called from qib_rc_rcv() to process an incoming RC ACK
+ * for the given QP.
+ * Called at interrupt level with the QP s_lock held.
+ */
+static void reset_psn(struct qib_qp *qp, u32 psn)
+{
+	u32 n = qp->s_acked;
+	struct qib_swqe *wqe = get_swqe_ptr(qp, n);
+	u32 opcode;
+
+	qp->s_cur = n;
+
+	/*
+	 * If we are starting the request from the beginning,
+	 * let the normal send code handle initialization.
+	 */
+	if (qib_cmp24(psn, wqe->psn) <= 0) {
+		qp->s_state = OP(SEND_LAST);
+		goto done;
+	}
+
+	/* Find the work request opcode corresponding to the given PSN. */
+	opcode = wqe->wr.opcode;
+	for (;;) {
+		int diff;
+
+		if (++n == qp->s_size)
+			n = 0;
+		if (n == qp->s_tail)
+			break;
+		wqe = get_swqe_ptr(qp, n);
+		diff = qib_cmp24(psn, wqe->psn);
+		if (diff < 0)
+			break;
+		qp->s_cur = n;
+		/*
+		 * If we are starting the request from the beginning,
+		 * let the normal send code handle initialization.
+		 */
+		if (diff == 0) {
+			qp->s_state = OP(SEND_LAST);
+			goto done;
+		}
+		opcode = wqe->wr.opcode;
+	}
+
+	/*
+	 * Set the state to restart in the middle of a request.
+	 * Don't change the s_sge, s_cur_sge, or s_cur_size.
+	 * See qib_make_rc_req().
+	 */
+	switch (opcode) {
+	case IB_WR_SEND:
+	case IB_WR_SEND_WITH_IMM:
+		qp->s_state = OP(RDMA_READ_RESPONSE_FIRST);
+		break;
+
+	case IB_WR_RDMA_WRITE:
+	case IB_WR_RDMA_WRITE_WITH_IMM:
+		qp->s_state = OP(RDMA_READ_RESPONSE_LAST);
+		break;
+
+	case IB_WR_RDMA_READ:
+		qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE);
+		break;
+
+	default:
+		/*
+		 * This case shouldn't happen since its only
+		 * one PSN per req.
+		 */
+		qp->s_state = OP(SEND_LAST);
+	}
+done:
+	qp->s_psn = psn;
+	/*
+	 * Set QIB_S_WAIT_PSN as qib_rc_complete() may start the timer
+	 * asynchronously before the send tasklet can get scheduled.
+	 * Doing it in qib_make_rc_req() is too late.
+	 */
+	if ((qib_cmp24(qp->s_psn, qp->s_sending_hpsn) <= 0) &&
+	    (qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) <= 0))
+		qp->s_flags |= QIB_S_WAIT_PSN;
+}
+
+/*
+ * Back up requester to resend the last un-ACKed request.
+ * The QP r_lock and s_lock should be held and interrupts disabled.
+ */
+static void qib_restart_rc(struct qib_qp *qp, u32 psn, int wait)
+{
+	struct qib_swqe *wqe = get_swqe_ptr(qp, qp->s_acked);
+	struct qib_ibport *ibp;
+
+	if (qp->s_retry == 0) {
+		if (qp->s_mig_state == IB_MIG_ARMED) {
+			qib_migrate_qp(qp);
+			qp->s_retry = qp->s_retry_cnt;
+		} else if (qp->s_last == qp->s_acked) {
+			qib_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR);
+			qib_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+			return;
+		} else /* XXX need to handle delayed completion */
+			return;
+	} else
+		qp->s_retry--;
+
+	ibp = to_iport(qp->ibqp.device, qp->port_num);
+	if (wqe->wr.opcode == IB_WR_RDMA_READ)
+		ibp->n_rc_resends++;
+	else
+		ibp->n_rc_resends += (qp->s_psn - psn) & QIB_PSN_MASK;
+
+	qp->s_flags &= ~(QIB_S_WAIT_FENCE | QIB_S_WAIT_RDMAR |
+			 QIB_S_WAIT_SSN_CREDIT | QIB_S_WAIT_PSN |
+			 QIB_S_WAIT_ACK);
+	if (wait)
+		qp->s_flags |= QIB_S_SEND_ONE;
+	reset_psn(qp, psn);
+}
+
+/*
+ * This is called from s_timer for missing responses.
+ */
+static void rc_timeout(unsigned long arg)
+{
+	struct qib_qp *qp = (struct qib_qp *)arg;
+	struct qib_ibport *ibp;
+	unsigned long flags;
+
+	spin_lock_irqsave(&qp->r_lock, flags);
+	spin_lock(&qp->s_lock);
+	if (qp->s_flags & QIB_S_TIMER) {
+		ibp = to_iport(qp->ibqp.device, qp->port_num);
+		ibp->n_rc_timeouts++;
+		qp->s_flags &= ~QIB_S_TIMER;
+		del_timer(&qp->s_timer);
+		qib_restart_rc(qp, qp->s_last_psn + 1, 1);
+		qib_schedule_send(qp);
+	}
+	spin_unlock(&qp->s_lock);
+	spin_unlock_irqrestore(&qp->r_lock, flags);
+}
+
+/*
+ * This is called from s_timer for RNR timeouts.
+ */
+void qib_rc_rnr_retry(unsigned long arg)
+{
+	struct qib_qp *qp = (struct qib_qp *)arg;
+	unsigned long flags;
+
+	spin_lock_irqsave(&qp->s_lock, flags);
+	if (qp->s_flags & QIB_S_WAIT_RNR) {
+		qp->s_flags &= ~QIB_S_WAIT_RNR;
+		del_timer(&qp->s_timer);
+		qib_schedule_send(qp);
+	}
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+}
+
+/*
+ * Set qp->s_sending_psn to the next PSN after the given one.
+ * This would be psn+1 except when RDMA reads are present.
+ */
+static void reset_sending_psn(struct qib_qp *qp, u32 psn)
+{
+	struct qib_swqe *wqe;
+	u32 n = qp->s_last;
+
+	/* Find the work request corresponding to the given PSN. */
+	for (;;) {
+		wqe = get_swqe_ptr(qp, n);
+		if (qib_cmp24(psn, wqe->lpsn) <= 0) {
+			if (wqe->wr.opcode == IB_WR_RDMA_READ)
+				qp->s_sending_psn = wqe->lpsn + 1;
+			else
+				qp->s_sending_psn = psn + 1;
+			break;
+		}
+		if (++n == qp->s_size)
+			n = 0;
+		if (n == qp->s_tail)
+			break;
+	}
+}
+
+/*
+ * This should be called with the QP s_lock held and interrupts disabled.
+ */
+void qib_rc_send_complete(struct qib_qp *qp, struct qib_ib_header *hdr)
+{
+	struct qib_other_headers *ohdr;
+	struct qib_swqe *wqe;
+	struct ib_wc wc;
+	unsigned i;
+	u32 opcode;
+	u32 psn;
+
+	if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_OR_FLUSH_SEND))
+		return;
+
+	/* Find out where the BTH is */
+	if ((be16_to_cpu(hdr->lrh[0]) & 3) == QIB_LRH_BTH)
+		ohdr = &hdr->u.oth;
+	else
+		ohdr = &hdr->u.l.oth;
+
+	opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
+	if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
+	    opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
+		WARN_ON(!qp->s_rdma_ack_cnt);
+		qp->s_rdma_ack_cnt--;
+		return;
+	}
+
+	psn = be32_to_cpu(ohdr->bth[2]);
+	reset_sending_psn(qp, psn);
+
+	/*
+	 * Start timer after a packet requesting an ACK has been sent and
+	 * there are still requests that haven't been acked.
+	 */
+	if ((psn & IB_BTH_REQ_ACK) && qp->s_acked != qp->s_tail &&
+	    !(qp->s_flags & (QIB_S_TIMER | QIB_S_WAIT_RNR | QIB_S_WAIT_PSN)) &&
+	    (ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK))
+		start_timer(qp);
+
+	while (qp->s_last != qp->s_acked) {
+		wqe = get_swqe_ptr(qp, qp->s_last);
+		if (qib_cmp24(wqe->lpsn, qp->s_sending_psn) >= 0 &&
+		    qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) <= 0)
+			break;
+		for (i = 0; i < wqe->wr.num_sge; i++) {
+			struct qib_sge *sge = &wqe->sg_list[i];
+
+			qib_put_mr(sge->mr);
+		}
+		/* Post a send completion queue entry if requested. */
+		if (!(qp->s_flags & QIB_S_SIGNAL_REQ_WR) ||
+		    (wqe->wr.send_flags & IB_SEND_SIGNALED)) {
+			memset(&wc, 0, sizeof wc);
+			wc.wr_id = wqe->wr.wr_id;
+			wc.status = IB_WC_SUCCESS;
+			wc.opcode = ib_qib_wc_opcode[wqe->wr.opcode];
+			wc.byte_len = wqe->length;
+			wc.qp = &qp->ibqp;
+			qib_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0);
+		}
+		if (++qp->s_last >= qp->s_size)
+			qp->s_last = 0;
+	}
+	/*
+	 * If we were waiting for sends to complete before resending,
+	 * and they are now complete, restart sending.
+	 */
+	if (qp->s_flags & QIB_S_WAIT_PSN &&
+	    qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {
+		qp->s_flags &= ~QIB_S_WAIT_PSN;
+		qp->s_sending_psn = qp->s_psn;
+		qp->s_sending_hpsn = qp->s_psn - 1;
+		qib_schedule_send(qp);
+	}
+}
+
+static inline void update_last_psn(struct qib_qp *qp, u32 psn)
+{
+	qp->s_last_psn = psn;
+}
+
+/*
+ * Generate a SWQE completion.
+ * This is similar to qib_send_complete but has to check to be sure
+ * that the SGEs are not being referenced if the SWQE is being resent.
+ */
+static struct qib_swqe *do_rc_completion(struct qib_qp *qp,
+					 struct qib_swqe *wqe,
+					 struct qib_ibport *ibp)
+{
+	struct ib_wc wc;
+	unsigned i;
+
+	/*
+	 * Don't decrement refcount and don't generate a
+	 * completion if the SWQE is being resent until the send
+	 * is finished.
+	 */
+	if (qib_cmp24(wqe->lpsn, qp->s_sending_psn) < 0 ||
+	    qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {
+		for (i = 0; i < wqe->wr.num_sge; i++) {
+			struct qib_sge *sge = &wqe->sg_list[i];
+
+			qib_put_mr(sge->mr);
+		}
+		/* Post a send completion queue entry if requested. */
+		if (!(qp->s_flags & QIB_S_SIGNAL_REQ_WR) ||
+		    (wqe->wr.send_flags & IB_SEND_SIGNALED)) {
+			memset(&wc, 0, sizeof wc);
+			wc.wr_id = wqe->wr.wr_id;
+			wc.status = IB_WC_SUCCESS;
+			wc.opcode = ib_qib_wc_opcode[wqe->wr.opcode];
+			wc.byte_len = wqe->length;
+			wc.qp = &qp->ibqp;
+			qib_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0);
+		}
+		if (++qp->s_last >= qp->s_size)
+			qp->s_last = 0;
+	} else
+		ibp->n_rc_delayed_comp++;
+
+	qp->s_retry = qp->s_retry_cnt;
+	update_last_psn(qp, wqe->lpsn);
+
+	/*
+	 * If we are completing a request which is in the process of
+	 * being resent, we can stop resending it since we know the
+	 * responder has already seen it.
+	 */
+	if (qp->s_acked == qp->s_cur) {
+		if (++qp->s_cur >= qp->s_size)
+			qp->s_cur = 0;
+		qp->s_acked = qp->s_cur;
+		wqe = get_swqe_ptr(qp, qp->s_cur);
+		if (qp->s_acked != qp->s_tail) {
+			qp->s_state = OP(SEND_LAST);
+			qp->s_psn = wqe->psn;
+		}
+	} else {
+		if (++qp->s_acked >= qp->s_size)
+			qp->s_acked = 0;
+		if (qp->state == IB_QPS_SQD && qp->s_acked == qp->s_cur)
+			qp->s_draining = 0;
+		wqe = get_swqe_ptr(qp, qp->s_acked);
+	}
+	return wqe;
+}
+
+/**
+ * do_rc_ack - process an incoming RC ACK
+ * @qp: the QP the ACK came in on
+ * @psn: the packet sequence number of the ACK
+ * @opcode: the opcode of the request that resulted in the ACK
+ *
+ * This is called from qib_rc_rcv_resp() to process an incoming RC ACK
+ * for the given QP.
+ * Called at interrupt level with the QP s_lock held.
+ * Returns 1 if OK, 0 if current operation should be aborted (NAK).
+ */
+static int do_rc_ack(struct qib_qp *qp, u32 aeth, u32 psn, int opcode,
+		     u64 val, struct qib_ctxtdata *rcd)
+{
+	struct qib_ibport *ibp;
+	enum ib_wc_status status;
+	struct qib_swqe *wqe;
+	int ret = 0;
+	u32 ack_psn;
+	int diff;
+
+	/* Remove QP from retry timer */
+	if (qp->s_flags & (QIB_S_TIMER | QIB_S_WAIT_RNR)) {
+		qp->s_flags &= ~(QIB_S_TIMER | QIB_S_WAIT_RNR);
+		del_timer(&qp->s_timer);
+	}
+
+	/*
+	 * Note that NAKs implicitly ACK outstanding SEND and RDMA write
+	 * requests and implicitly NAK RDMA read and atomic requests issued
+	 * before the NAK'ed request.  The MSN won't include the NAK'ed
+	 * request but will include an ACK'ed request(s).
+	 */
+	ack_psn = psn;
+	if (aeth >> 29)
+		ack_psn--;
+	wqe = get_swqe_ptr(qp, qp->s_acked);
+	ibp = to_iport(qp->ibqp.device, qp->port_num);
+
+	/*
+	 * The MSN might be for a later WQE than the PSN indicates so
+	 * only complete WQEs that the PSN finishes.
+	 */
+	while ((diff = qib_cmp24(ack_psn, wqe->lpsn)) >= 0) {
+		/*
+		 * RDMA_READ_RESPONSE_ONLY is a special case since
+		 * we want to generate completion events for everything
+		 * before the RDMA read, copy the data, then generate
+		 * the completion for the read.
+		 */
+		if (wqe->wr.opcode == IB_WR_RDMA_READ &&
+		    opcode == OP(RDMA_READ_RESPONSE_ONLY) &&
+		    diff == 0) {
+			ret = 1;
+			goto bail;
+		}
+		/*
+		 * If this request is a RDMA read or atomic, and the ACK is
+		 * for a later operation, this ACK NAKs the RDMA read or
+		 * atomic.  In other words, only a RDMA_READ_LAST or ONLY
+		 * can ACK a RDMA read and likewise for atomic ops.  Note
+		 * that the NAK case can only happen if relaxed ordering is
+		 * used and requests are sent after an RDMA read or atomic
+		 * is sent but before the response is received.
+		 */
+		if ((wqe->wr.opcode == IB_WR_RDMA_READ &&
+		     (opcode != OP(RDMA_READ_RESPONSE_LAST) || diff != 0)) ||
+		    ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+		      wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) &&
+		     (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0))) {
+			/* Retry this request. */
+			if (!(qp->r_flags & QIB_R_RDMAR_SEQ)) {
+				qp->r_flags |= QIB_R_RDMAR_SEQ;
+				qib_restart_rc(qp, qp->s_last_psn + 1, 0);
+				if (list_empty(&qp->rspwait)) {
+					qp->r_flags |= QIB_R_RSP_SEND;
+					atomic_inc(&qp->refcount);
+					list_add_tail(&qp->rspwait,
+						      &rcd->qp_wait_list);
+				}
+			}
+			/*
+			 * No need to process the ACK/NAK since we are
+			 * restarting an earlier request.
+			 */
+			goto bail;
+		}
+		if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+		    wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) {
+			u64 *vaddr = wqe->sg_list[0].vaddr;
+			*vaddr = val;
+		}
+		if (qp->s_num_rd_atomic &&
+		    (wqe->wr.opcode == IB_WR_RDMA_READ ||
+		     wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+		     wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)) {
+			qp->s_num_rd_atomic--;
+			/* Restart sending task if fence is complete */
+			if ((qp->s_flags & QIB_S_WAIT_FENCE) &&
+			    !qp->s_num_rd_atomic) {
+				qp->s_flags &= ~(QIB_S_WAIT_FENCE |
+						 QIB_S_WAIT_ACK);
+				qib_schedule_send(qp);
+			} else if (qp->s_flags & QIB_S_WAIT_RDMAR) {
+				qp->s_flags &= ~(QIB_S_WAIT_RDMAR |
+						 QIB_S_WAIT_ACK);
+				qib_schedule_send(qp);
+			}
+		}
+		wqe = do_rc_completion(qp, wqe, ibp);
+		if (qp->s_acked == qp->s_tail)
+			break;
+	}
+
+	switch (aeth >> 29) {
+	case 0:         /* ACK */
+		ibp->n_rc_acks++;
+		if (qp->s_acked != qp->s_tail) {
+			/*
+			 * We are expecting more ACKs so
+			 * reset the retransmit timer.
+			 */
+			start_timer(qp);
+			/*
+			 * We can stop resending the earlier packets and
+			 * continue with the next packet the receiver wants.
+			 */
+			if (qib_cmp24(qp->s_psn, psn) <= 0)
+				reset_psn(qp, psn + 1);
+		} else if (qib_cmp24(qp->s_psn, psn) <= 0) {
+			qp->s_state = OP(SEND_LAST);
+			qp->s_psn = psn + 1;
+		}
+		if (qp->s_flags & QIB_S_WAIT_ACK) {
+			qp->s_flags &= ~QIB_S_WAIT_ACK;
+			qib_schedule_send(qp);
+		}
+		qib_get_credit(qp, aeth);
+		qp->s_rnr_retry = qp->s_rnr_retry_cnt;
+		qp->s_retry = qp->s_retry_cnt;
+		update_last_psn(qp, psn);
+		ret = 1;
+		goto bail;
+
+	case 1:         /* RNR NAK */
+		ibp->n_rnr_naks++;
+		if (qp->s_acked == qp->s_tail)
+			goto bail;
+		if (qp->s_flags & QIB_S_WAIT_RNR)
+			goto bail;
+		if (qp->s_rnr_retry == 0) {
+			status = IB_WC_RNR_RETRY_EXC_ERR;
+			goto class_b;
+		}
+		if (qp->s_rnr_retry_cnt < 7)
+			qp->s_rnr_retry--;
+
+		/* The last valid PSN is the previous PSN. */
+		update_last_psn(qp, psn - 1);
+
+		ibp->n_rc_resends += (qp->s_psn - psn) & QIB_PSN_MASK;
+
+		reset_psn(qp, psn);
+
+		qp->s_flags &= ~(QIB_S_WAIT_SSN_CREDIT | QIB_S_WAIT_ACK);
+		qp->s_flags |= QIB_S_WAIT_RNR;
+		qp->s_timer.function = qib_rc_rnr_retry;
+		qp->s_timer.expires = jiffies + usecs_to_jiffies(
+			ib_qib_rnr_table[(aeth >> QIB_AETH_CREDIT_SHIFT) &
+					   QIB_AETH_CREDIT_MASK]);
+		add_timer(&qp->s_timer);
+		goto bail;
+
+	case 3:         /* NAK */
+		if (qp->s_acked == qp->s_tail)
+			goto bail;
+		/* The last valid PSN is the previous PSN. */
+		update_last_psn(qp, psn - 1);
+		switch ((aeth >> QIB_AETH_CREDIT_SHIFT) &
+			QIB_AETH_CREDIT_MASK) {
+		case 0: /* PSN sequence error */
+			ibp->n_seq_naks++;
+			/*
+			 * Back up to the responder's expected PSN.
+			 * Note that we might get a NAK in the middle of an
+			 * RDMA READ response which terminates the RDMA
+			 * READ.
+			 */
+			qib_restart_rc(qp, psn, 0);
+			qib_schedule_send(qp);
+			break;
+
+		case 1: /* Invalid Request */
+			status = IB_WC_REM_INV_REQ_ERR;
+			ibp->n_other_naks++;
+			goto class_b;
+
+		case 2: /* Remote Access Error */
+			status = IB_WC_REM_ACCESS_ERR;
+			ibp->n_other_naks++;
+			goto class_b;
+
+		case 3: /* Remote Operation Error */
+			status = IB_WC_REM_OP_ERR;
+			ibp->n_other_naks++;
+class_b:
+			if (qp->s_last == qp->s_acked) {
+				qib_send_complete(qp, wqe, status);
+				qib_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+			}
+			break;
+
+		default:
+			/* Ignore other reserved NAK error codes */
+			goto reserved;
+		}
+		qp->s_retry = qp->s_retry_cnt;
+		qp->s_rnr_retry = qp->s_rnr_retry_cnt;
+		goto bail;
+
+	default:                /* 2: reserved */
+reserved:
+		/* Ignore reserved NAK codes. */
+		goto bail;
+	}
+
+bail:
+	return ret;
+}
+
+/*
+ * We have seen an out of sequence RDMA read middle or last packet.
+ * This ACKs SENDs and RDMA writes up to the first RDMA read or atomic SWQE.
+ */
+static void rdma_seq_err(struct qib_qp *qp, struct qib_ibport *ibp, u32 psn,
+			 struct qib_ctxtdata *rcd)
+{
+	struct qib_swqe *wqe;
+
+	/* Remove QP from retry timer */
+	if (qp->s_flags & (QIB_S_TIMER | QIB_S_WAIT_RNR)) {
+		qp->s_flags &= ~(QIB_S_TIMER | QIB_S_WAIT_RNR);
+		del_timer(&qp->s_timer);
+	}
+
+	wqe = get_swqe_ptr(qp, qp->s_acked);
+
+	while (qib_cmp24(psn, wqe->lpsn) > 0) {
+		if (wqe->wr.opcode == IB_WR_RDMA_READ ||
+		    wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+		    wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)
+			break;
+		wqe = do_rc_completion(qp, wqe, ibp);
+	}
+
+	ibp->n_rdma_seq++;
+	qp->r_flags |= QIB_R_RDMAR_SEQ;
+	qib_restart_rc(qp, qp->s_last_psn + 1, 0);
+	if (list_empty(&qp->rspwait)) {
+		qp->r_flags |= QIB_R_RSP_SEND;
+		atomic_inc(&qp->refcount);
+		list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
+	}
+}
+
+/**
+ * qib_rc_rcv_resp - process an incoming RC response packet
+ * @ibp: the port this packet came in on
+ * @ohdr: the other headers for this packet
+ * @data: the packet data
+ * @tlen: the packet length
+ * @qp: the QP for this packet
+ * @opcode: the opcode for this packet
+ * @psn: the packet sequence number for this packet
+ * @hdrsize: the header length
+ * @pmtu: the path MTU
+ *
+ * This is called from qib_rc_rcv() to process an incoming RC response
+ * packet for the given QP.
+ * Called at interrupt level.
+ */
+static void qib_rc_rcv_resp(struct qib_ibport *ibp,
+			    struct qib_other_headers *ohdr,
+			    void *data, u32 tlen,
+			    struct qib_qp *qp,
+			    u32 opcode,
+			    u32 psn, u32 hdrsize, u32 pmtu,
+			    struct qib_ctxtdata *rcd)
+{
+	struct qib_swqe *wqe;
+	struct qib_pportdata *ppd = ppd_from_ibp(ibp);
+	enum ib_wc_status status;
+	unsigned long flags;
+	int diff;
+	u32 pad;
+	u32 aeth;
+	u64 val;
+
+	if (opcode != OP(RDMA_READ_RESPONSE_MIDDLE)) {
+		/*
+		 * If ACK'd PSN on SDMA busy list try to make progress to
+		 * reclaim SDMA credits.
+		 */
+		if ((qib_cmp24(psn, qp->s_sending_psn) >= 0) &&
+		    (qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) <= 0)) {
+
+			/*
+			 * If send tasklet not running attempt to progress
+			 * SDMA queue.
+			 */
+			if (!(qp->s_flags & QIB_S_BUSY)) {
+				/* Acquire SDMA Lock */
+				spin_lock_irqsave(&ppd->sdma_lock, flags);
+				/* Invoke sdma make progress */
+				qib_sdma_make_progress(ppd);
+				/* Release SDMA Lock */
+				spin_unlock_irqrestore(&ppd->sdma_lock, flags);
+			}
+		}
+	}
+
+	spin_lock_irqsave(&qp->s_lock, flags);
+	if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK))
+		goto ack_done;
+
+	/* Ignore invalid responses. */
+	if (qib_cmp24(psn, qp->s_next_psn) >= 0)
+		goto ack_done;
+
+	/* Ignore duplicate responses. */
+	diff = qib_cmp24(psn, qp->s_last_psn);
+	if (unlikely(diff <= 0)) {
+		/* Update credits for "ghost" ACKs */
+		if (diff == 0 && opcode == OP(ACKNOWLEDGE)) {
+			aeth = be32_to_cpu(ohdr->u.aeth);
+			if ((aeth >> 29) == 0)
+				qib_get_credit(qp, aeth);
+		}
+		goto ack_done;
+	}
+
+	/*
+	 * Skip everything other than the PSN we expect, if we are waiting
+	 * for a reply to a restarted RDMA read or atomic op.
+	 */
+	if (qp->r_flags & QIB_R_RDMAR_SEQ) {
+		if (qib_cmp24(psn, qp->s_last_psn + 1) != 0)
+			goto ack_done;
+		qp->r_flags &= ~QIB_R_RDMAR_SEQ;
+	}
+
+	if (unlikely(qp->s_acked == qp->s_tail))
+		goto ack_done;
+	wqe = get_swqe_ptr(qp, qp->s_acked);
+	status = IB_WC_SUCCESS;
+
+	switch (opcode) {
+	case OP(ACKNOWLEDGE):
+	case OP(ATOMIC_ACKNOWLEDGE):
+	case OP(RDMA_READ_RESPONSE_FIRST):
+		aeth = be32_to_cpu(ohdr->u.aeth);
+		if (opcode == OP(ATOMIC_ACKNOWLEDGE)) {
+			__be32 *p = ohdr->u.at.atomic_ack_eth;
+
+			val = ((u64) be32_to_cpu(p[0]) << 32) |
+				be32_to_cpu(p[1]);
+		} else
+			val = 0;
+		if (!do_rc_ack(qp, aeth, psn, opcode, val, rcd) ||
+		    opcode != OP(RDMA_READ_RESPONSE_FIRST))
+			goto ack_done;
+		hdrsize += 4;
+		wqe = get_swqe_ptr(qp, qp->s_acked);
+		if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
+			goto ack_op_err;
+		/*
+		 * If this is a response to a resent RDMA read, we
+		 * have to be careful to copy the data to the right
+		 * location.
+		 */
+		qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
+						  wqe, psn, pmtu);
+		goto read_middle;
+
+	case OP(RDMA_READ_RESPONSE_MIDDLE):
+		/* no AETH, no ACK */
+		if (unlikely(qib_cmp24(psn, qp->s_last_psn + 1)))
+			goto ack_seq_err;
+		if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
+			goto ack_op_err;
+read_middle:
+		if (unlikely(tlen != (hdrsize + pmtu + 4)))
+			goto ack_len_err;
+		if (unlikely(pmtu >= qp->s_rdma_read_len))
+			goto ack_len_err;
+
+		/*
+		 * We got a response so update the timeout.
+		 * 4.096 usec. * (1 << qp->timeout)
+		 */
+		qp->s_flags |= QIB_S_TIMER;
+		mod_timer(&qp->s_timer, jiffies + qp->timeout_jiffies);
+		if (qp->s_flags & QIB_S_WAIT_ACK) {
+			qp->s_flags &= ~QIB_S_WAIT_ACK;
+			qib_schedule_send(qp);
+		}
+
+		if (opcode == OP(RDMA_READ_RESPONSE_MIDDLE))
+			qp->s_retry = qp->s_retry_cnt;
+
+		/*
+		 * Update the RDMA receive state but do the copy w/o
+		 * holding the locks and blocking interrupts.
+		 */
+		qp->s_rdma_read_len -= pmtu;
+		update_last_psn(qp, psn);
+		spin_unlock_irqrestore(&qp->s_lock, flags);
+		qib_copy_sge(&qp->s_rdma_read_sge, data, pmtu, 0);
+		goto bail;
+
+	case OP(RDMA_READ_RESPONSE_ONLY):
+		aeth = be32_to_cpu(ohdr->u.aeth);
+		if (!do_rc_ack(qp, aeth, psn, opcode, 0, rcd))
+			goto ack_done;
+		/* Get the number of bytes the message was padded by. */
+		pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+		/*
+		 * Check that the data size is >= 0 && <= pmtu.
+		 * Remember to account for the AETH header (4) and
+		 * ICRC (4).
+		 */
+		if (unlikely(tlen < (hdrsize + pad + 8)))
+			goto ack_len_err;
+		/*
+		 * If this is a response to a resent RDMA read, we
+		 * have to be careful to copy the data to the right
+		 * location.
+		 */
+		wqe = get_swqe_ptr(qp, qp->s_acked);
+		qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
+						  wqe, psn, pmtu);
+		goto read_last;
+
+	case OP(RDMA_READ_RESPONSE_LAST):
+		/* ACKs READ req. */
+		if (unlikely(qib_cmp24(psn, qp->s_last_psn + 1)))
+			goto ack_seq_err;
+		if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
+			goto ack_op_err;
+		/* Get the number of bytes the message was padded by. */
+		pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+		/*
+		 * Check that the data size is >= 1 && <= pmtu.
+		 * Remember to account for the AETH header (4) and
+		 * ICRC (4).
+		 */
+		if (unlikely(tlen <= (hdrsize + pad + 8)))
+			goto ack_len_err;
+read_last:
+		tlen -= hdrsize + pad + 8;
+		if (unlikely(tlen != qp->s_rdma_read_len))
+			goto ack_len_err;
+		aeth = be32_to_cpu(ohdr->u.aeth);
+		qib_copy_sge(&qp->s_rdma_read_sge, data, tlen, 0);
+		WARN_ON(qp->s_rdma_read_sge.num_sge);
+		(void) do_rc_ack(qp, aeth, psn,
+				 OP(RDMA_READ_RESPONSE_LAST), 0, rcd);
+		goto ack_done;
+	}
+
+ack_op_err:
+	status = IB_WC_LOC_QP_OP_ERR;
+	goto ack_err;
+
+ack_seq_err:
+	rdma_seq_err(qp, ibp, psn, rcd);
+	goto ack_done;
+
+ack_len_err:
+	status = IB_WC_LOC_LEN_ERR;
+ack_err:
+	if (qp->s_last == qp->s_acked) {
+		qib_send_complete(qp, wqe, status);
+		qib_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+	}
+ack_done:
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+bail:
+	return;
+}
+
+/**
+ * qib_rc_rcv_error - process an incoming duplicate or error RC packet
+ * @ohdr: the other headers for this packet
+ * @data: the packet data
+ * @qp: the QP for this packet
+ * @opcode: the opcode for this packet
+ * @psn: the packet sequence number for this packet
+ * @diff: the difference between the PSN and the expected PSN
+ *
+ * This is called from qib_rc_rcv() to process an unexpected
+ * incoming RC packet for the given QP.
+ * Called at interrupt level.
+ * Return 1 if no more processing is needed; otherwise return 0 to
+ * schedule a response to be sent.
+ */
+static int qib_rc_rcv_error(struct qib_other_headers *ohdr,
+			    void *data,
+			    struct qib_qp *qp,
+			    u32 opcode,
+			    u32 psn,
+			    int diff,
+			    struct qib_ctxtdata *rcd)
+{
+	struct qib_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+	struct qib_ack_entry *e;
+	unsigned long flags;
+	u8 i, prev;
+	int old_req;
+
+	if (diff > 0) {
+		/*
+		 * Packet sequence error.
+		 * A NAK will ACK earlier sends and RDMA writes.
+		 * Don't queue the NAK if we already sent one.
+		 */
+		if (!qp->r_nak_state) {
+			ibp->n_rc_seqnak++;
+			qp->r_nak_state = IB_NAK_PSN_ERROR;
+			/* Use the expected PSN. */
+			qp->r_ack_psn = qp->r_psn;
+			/*
+			 * Wait to send the sequence NAK until all packets
+			 * in the receive queue have been processed.
+			 * Otherwise, we end up propagating congestion.
+			 */
+			if (list_empty(&qp->rspwait)) {
+				qp->r_flags |= QIB_R_RSP_NAK;
+				atomic_inc(&qp->refcount);
+				list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
+			}
+		}
+		goto done;
+	}
+
+	/*
+	 * Handle a duplicate request.  Don't re-execute SEND, RDMA
+	 * write or atomic op.  Don't NAK errors, just silently drop
+	 * the duplicate request.  Note that r_sge, r_len, and
+	 * r_rcv_len may be in use so don't modify them.
+	 *
+	 * We are supposed to ACK the earliest duplicate PSN but we
+	 * can coalesce an outstanding duplicate ACK.  We have to
+	 * send the earliest so that RDMA reads can be restarted at
+	 * the requester's expected PSN.
+	 *
+	 * First, find where this duplicate PSN falls within the
+	 * ACKs previously sent.
+	 * old_req is true if there is an older response that is scheduled
+	 * to be sent before sending this one.
+	 */
+	e = NULL;
+	old_req = 1;
+	ibp->n_rc_dupreq++;
+
+	spin_lock_irqsave(&qp->s_lock, flags);
+
+	for (i = qp->r_head_ack_queue; ; i = prev) {
+		if (i == qp->s_tail_ack_queue)
+			old_req = 0;
+		if (i)
+			prev = i - 1;
+		else
+			prev = QIB_MAX_RDMA_ATOMIC;
+		if (prev == qp->r_head_ack_queue) {
+			e = NULL;
+			break;
+		}
+		e = &qp->s_ack_queue[prev];
+		if (!e->opcode) {
+			e = NULL;
+			break;
+		}
+		if (qib_cmp24(psn, e->psn) >= 0) {
+			if (prev == qp->s_tail_ack_queue &&
+			    qib_cmp24(psn, e->lpsn) <= 0)
+				old_req = 0;
+			break;
+		}
+	}
+	switch (opcode) {
+	case OP(RDMA_READ_REQUEST): {
+		struct ib_reth *reth;
+		u32 offset;
+		u32 len;
+
+		/*
+		 * If we didn't find the RDMA read request in the ack queue,
+		 * we can ignore this request.
+		 */
+		if (!e || e->opcode != OP(RDMA_READ_REQUEST))
+			goto unlock_done;
+		/* RETH comes after BTH */
+		reth = &ohdr->u.rc.reth;
+		/*
+		 * Address range must be a subset of the original
+		 * request and start on pmtu boundaries.
+		 * We reuse the old ack_queue slot since the requester
+		 * should not back up and request an earlier PSN for the
+		 * same request.
+		 */
+		offset = ((psn - e->psn) & QIB_PSN_MASK) *
+			qp->pmtu;
+		len = be32_to_cpu(reth->length);
+		if (unlikely(offset + len != e->rdma_sge.sge_length))
+			goto unlock_done;
+		if (e->rdma_sge.mr) {
+			qib_put_mr(e->rdma_sge.mr);
+			e->rdma_sge.mr = NULL;
+		}
+		if (len != 0) {
+			u32 rkey = be32_to_cpu(reth->rkey);
+			u64 vaddr = be64_to_cpu(reth->vaddr);
+			int ok;
+
+			ok = qib_rkey_ok(qp, &e->rdma_sge, len, vaddr, rkey,
+					 IB_ACCESS_REMOTE_READ);
+			if (unlikely(!ok))
+				goto unlock_done;
+		} else {
+			e->rdma_sge.vaddr = NULL;
+			e->rdma_sge.length = 0;
+			e->rdma_sge.sge_length = 0;
+		}
+		e->psn = psn;
+		if (old_req)
+			goto unlock_done;
+		qp->s_tail_ack_queue = prev;
+		break;
+	}
+
+	case OP(COMPARE_SWAP):
+	case OP(FETCH_ADD): {
+		/*
+		 * If we didn't find the atomic request in the ack queue
+		 * or the send tasklet is already backed up to send an
+		 * earlier entry, we can ignore this request.
+		 */
+		if (!e || e->opcode != (u8) opcode || old_req)
+			goto unlock_done;
+		qp->s_tail_ack_queue = prev;
+		break;
+	}
+
+	default:
+		/*
+		 * Ignore this operation if it doesn't request an ACK
+		 * or an earlier RDMA read or atomic is going to be resent.
+		 */
+		if (!(psn & IB_BTH_REQ_ACK) || old_req)
+			goto unlock_done;
+		/*
+		 * Resend the most recent ACK if this request is
+		 * after all the previous RDMA reads and atomics.
+		 */
+		if (i == qp->r_head_ack_queue) {
+			spin_unlock_irqrestore(&qp->s_lock, flags);
+			qp->r_nak_state = 0;
+			qp->r_ack_psn = qp->r_psn - 1;
+			goto send_ack;
+		}
+		/*
+		 * Try to send a simple ACK to work around a Mellanox bug
+		 * which doesn't accept a RDMA read response or atomic
+		 * response as an ACK for earlier SENDs or RDMA writes.
+		 */
+		if (!(qp->s_flags & QIB_S_RESP_PENDING)) {
+			spin_unlock_irqrestore(&qp->s_lock, flags);
+			qp->r_nak_state = 0;
+			qp->r_ack_psn = qp->s_ack_queue[i].psn - 1;
+			goto send_ack;
+		}
+		/*
+		 * Resend the RDMA read or atomic op which
+		 * ACKs this duplicate request.
+		 */
+		qp->s_tail_ack_queue = i;
+		break;
+	}
+	qp->s_ack_state = OP(ACKNOWLEDGE);
+	qp->s_flags |= QIB_S_RESP_PENDING;
+	qp->r_nak_state = 0;
+	qib_schedule_send(qp);
+
+unlock_done:
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+done:
+	return 1;
+
+send_ack:
+	return 0;
+}
+
+void qib_rc_error(struct qib_qp *qp, enum ib_wc_status err)
+{
+	unsigned long flags;
+	int lastwqe;
+
+	spin_lock_irqsave(&qp->s_lock, flags);
+	lastwqe = qib_error_qp(qp, err);
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+
+	if (lastwqe) {
+		struct ib_event ev;
+
+		ev.device = qp->ibqp.device;
+		ev.element.qp = &qp->ibqp;
+		ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
+		qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
+	}
+}
+
+static inline void qib_update_ack_queue(struct qib_qp *qp, unsigned n)
+{
+	unsigned next;
+
+	next = n + 1;
+	if (next > QIB_MAX_RDMA_ATOMIC)
+		next = 0;
+	qp->s_tail_ack_queue = next;
+	qp->s_ack_state = OP(ACKNOWLEDGE);
+}
+
+/**
+ * qib_rc_rcv - process an incoming RC packet
+ * @rcd: the context pointer
+ * @hdr: the header of this packet
+ * @has_grh: true if the header has a GRH
+ * @data: the packet data
+ * @tlen: the packet length
+ * @qp: the QP for this packet
+ *
+ * This is called from qib_qp_rcv() to process an incoming RC packet
+ * for the given QP.
+ * Called at interrupt level.
+ */
+void qib_rc_rcv(struct qib_ctxtdata *rcd, struct qib_ib_header *hdr,
+		int has_grh, void *data, u32 tlen, struct qib_qp *qp)
+{
+	struct qib_ibport *ibp = &rcd->ppd->ibport_data;
+	struct qib_other_headers *ohdr;
+	u32 opcode;
+	u32 hdrsize;
+	u32 psn;
+	u32 pad;
+	struct ib_wc wc;
+	u32 pmtu = qp->pmtu;
+	int diff;
+	struct ib_reth *reth;
+	unsigned long flags;
+	int ret;
+
+	/* Check for GRH */
+	if (!has_grh) {
+		ohdr = &hdr->u.oth;
+		hdrsize = 8 + 12;       /* LRH + BTH */
+	} else {
+		ohdr = &hdr->u.l.oth;
+		hdrsize = 8 + 40 + 12;  /* LRH + GRH + BTH */
+	}
+
+	opcode = be32_to_cpu(ohdr->bth[0]);
+	if (qib_ruc_check_hdr(ibp, hdr, has_grh, qp, opcode))
+		return;
+
+	psn = be32_to_cpu(ohdr->bth[2]);
+	opcode >>= 24;
+
+	/*
+	 * Process responses (ACKs) before anything else.  Note that the
+	 * packet sequence number will be for something in the send work
+	 * queue rather than the expected receive packet sequence number.
+	 * In other words, this QP is the requester.
+	 */
+	if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
+	    opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
+		qib_rc_rcv_resp(ibp, ohdr, data, tlen, qp, opcode, psn,
+				hdrsize, pmtu, rcd);
+		return;
+	}
+
+	/* Compute 24 bits worth of difference. */
+	diff = qib_cmp24(psn, qp->r_psn);
+	if (unlikely(diff)) {
+		if (qib_rc_rcv_error(ohdr, data, qp, opcode, psn, diff, rcd))
+			return;
+		goto send_ack;
+	}
+
+	/* Check for opcode sequence errors. */
+	switch (qp->r_state) {
+	case OP(SEND_FIRST):
+	case OP(SEND_MIDDLE):
+		if (opcode == OP(SEND_MIDDLE) ||
+		    opcode == OP(SEND_LAST) ||
+		    opcode == OP(SEND_LAST_WITH_IMMEDIATE))
+			break;
+		goto nack_inv;
+
+	case OP(RDMA_WRITE_FIRST):
+	case OP(RDMA_WRITE_MIDDLE):
+		if (opcode == OP(RDMA_WRITE_MIDDLE) ||
+		    opcode == OP(RDMA_WRITE_LAST) ||
+		    opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
+			break;
+		goto nack_inv;
+
+	default:
+		if (opcode == OP(SEND_MIDDLE) ||
+		    opcode == OP(SEND_LAST) ||
+		    opcode == OP(SEND_LAST_WITH_IMMEDIATE) ||
+		    opcode == OP(RDMA_WRITE_MIDDLE) ||
+		    opcode == OP(RDMA_WRITE_LAST) ||
+		    opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
+			goto nack_inv;
+		/*
+		 * Note that it is up to the requester to not send a new
+		 * RDMA read or atomic operation before receiving an ACK
+		 * for the previous operation.
+		 */
+		break;
+	}
+
+	if (qp->state == IB_QPS_RTR && !(qp->r_flags & QIB_R_COMM_EST)) {
+		qp->r_flags |= QIB_R_COMM_EST;
+		if (qp->ibqp.event_handler) {
+			struct ib_event ev;
+
+			ev.device = qp->ibqp.device;
+			ev.element.qp = &qp->ibqp;
+			ev.event = IB_EVENT_COMM_EST;
+			qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
+		}
+	}
+
+	/* OK, process the packet. */
+	switch (opcode) {
+	case OP(SEND_FIRST):
+		ret = qib_get_rwqe(qp, 0);
+		if (ret < 0)
+			goto nack_op_err;
+		if (!ret)
+			goto rnr_nak;
+		qp->r_rcv_len = 0;
+		/* FALLTHROUGH */
+	case OP(SEND_MIDDLE):
+	case OP(RDMA_WRITE_MIDDLE):
+send_middle:
+		/* Check for invalid length PMTU or posted rwqe len. */
+		if (unlikely(tlen != (hdrsize + pmtu + 4)))
+			goto nack_inv;
+		qp->r_rcv_len += pmtu;
+		if (unlikely(qp->r_rcv_len > qp->r_len))
+			goto nack_inv;
+		qib_copy_sge(&qp->r_sge, data, pmtu, 1);
+		break;
+
+	case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
+		/* consume RWQE */
+		ret = qib_get_rwqe(qp, 1);
+		if (ret < 0)
+			goto nack_op_err;
+		if (!ret)
+			goto rnr_nak;
+		goto send_last_imm;
+
+	case OP(SEND_ONLY):
+	case OP(SEND_ONLY_WITH_IMMEDIATE):
+		ret = qib_get_rwqe(qp, 0);
+		if (ret < 0)
+			goto nack_op_err;
+		if (!ret)
+			goto rnr_nak;
+		qp->r_rcv_len = 0;
+		if (opcode == OP(SEND_ONLY))
+			goto no_immediate_data;
+		/* FALLTHROUGH for SEND_ONLY_WITH_IMMEDIATE */
+	case OP(SEND_LAST_WITH_IMMEDIATE):
+send_last_imm:
+		wc.ex.imm_data = ohdr->u.imm_data;
+		hdrsize += 4;
+		wc.wc_flags = IB_WC_WITH_IMM;
+		goto send_last;
+	case OP(SEND_LAST):
+	case OP(RDMA_WRITE_LAST):
+no_immediate_data:
+		wc.wc_flags = 0;
+		wc.ex.imm_data = 0;
+send_last:
+		/* Get the number of bytes the message was padded by. */
+		pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+		/* Check for invalid length. */
+		/* XXX LAST len should be >= 1 */
+		if (unlikely(tlen < (hdrsize + pad + 4)))
+			goto nack_inv;
+		/* Don't count the CRC. */
+		tlen -= (hdrsize + pad + 4);
+		wc.byte_len = tlen + qp->r_rcv_len;
+		if (unlikely(wc.byte_len > qp->r_len))
+			goto nack_inv;
+		qib_copy_sge(&qp->r_sge, data, tlen, 1);
+		qib_put_ss(&qp->r_sge);
+		qp->r_msn++;
+		if (!test_and_clear_bit(QIB_R_WRID_VALID, &qp->r_aflags))
+			break;
+		wc.wr_id = qp->r_wr_id;
+		wc.status = IB_WC_SUCCESS;
+		if (opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE) ||
+		    opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE))
+			wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
+		else
+			wc.opcode = IB_WC_RECV;
+		wc.qp = &qp->ibqp;
+		wc.src_qp = qp->remote_qpn;
+		wc.slid = qp->remote_ah_attr.dlid;
+		wc.sl = qp->remote_ah_attr.sl;
+		/* zero fields that are N/A */
+		wc.vendor_err = 0;
+		wc.pkey_index = 0;
+		wc.dlid_path_bits = 0;
+		wc.port_num = 0;
+		/* Signal completion event if the solicited bit is set. */
+		qib_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
+			     (ohdr->bth[0] &
+			      cpu_to_be32(IB_BTH_SOLICITED)) != 0);
+		break;
+
+	case OP(RDMA_WRITE_FIRST):
+	case OP(RDMA_WRITE_ONLY):
+	case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):
+		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
+			goto nack_inv;
+		/* consume RWQE */
+		reth = &ohdr->u.rc.reth;
+		hdrsize += sizeof(*reth);
+		qp->r_len = be32_to_cpu(reth->length);
+		qp->r_rcv_len = 0;
+		qp->r_sge.sg_list = NULL;
+		if (qp->r_len != 0) {
+			u32 rkey = be32_to_cpu(reth->rkey);
+			u64 vaddr = be64_to_cpu(reth->vaddr);
+			int ok;
+
+			/* Check rkey & NAK */
+			ok = qib_rkey_ok(qp, &qp->r_sge.sge, qp->r_len, vaddr,
+					 rkey, IB_ACCESS_REMOTE_WRITE);
+			if (unlikely(!ok))
+				goto nack_acc;
+			qp->r_sge.num_sge = 1;
+		} else {
+			qp->r_sge.num_sge = 0;
+			qp->r_sge.sge.mr = NULL;
+			qp->r_sge.sge.vaddr = NULL;
+			qp->r_sge.sge.length = 0;
+			qp->r_sge.sge.sge_length = 0;
+		}
+		if (opcode == OP(RDMA_WRITE_FIRST))
+			goto send_middle;
+		else if (opcode == OP(RDMA_WRITE_ONLY))
+			goto no_immediate_data;
+		ret = qib_get_rwqe(qp, 1);
+		if (ret < 0)
+			goto nack_op_err;
+		if (!ret)
+			goto rnr_nak;
+		wc.ex.imm_data = ohdr->u.rc.imm_data;
+		hdrsize += 4;
+		wc.wc_flags = IB_WC_WITH_IMM;
+		goto send_last;
+
+	case OP(RDMA_READ_REQUEST): {
+		struct qib_ack_entry *e;
+		u32 len;
+		u8 next;
+
+		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
+			goto nack_inv;
+		next = qp->r_head_ack_queue + 1;
+		/* s_ack_queue is size QIB_MAX_RDMA_ATOMIC+1 so use > not >= */
+		if (next > QIB_MAX_RDMA_ATOMIC)
+			next = 0;
+		spin_lock_irqsave(&qp->s_lock, flags);
+		if (unlikely(next == qp->s_tail_ack_queue)) {
+			if (!qp->s_ack_queue[next].sent)
+				goto nack_inv_unlck;
+			qib_update_ack_queue(qp, next);
+		}
+		e = &qp->s_ack_queue[qp->r_head_ack_queue];
+		if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) {
+			qib_put_mr(e->rdma_sge.mr);
+			e->rdma_sge.mr = NULL;
+		}
+		reth = &ohdr->u.rc.reth;
+		len = be32_to_cpu(reth->length);
+		if (len) {
+			u32 rkey = be32_to_cpu(reth->rkey);
+			u64 vaddr = be64_to_cpu(reth->vaddr);
+			int ok;
+
+			/* Check rkey & NAK */
+			ok = qib_rkey_ok(qp, &e->rdma_sge, len, vaddr,
+					 rkey, IB_ACCESS_REMOTE_READ);
+			if (unlikely(!ok))
+				goto nack_acc_unlck;
+			/*
+			 * Update the next expected PSN.  We add 1 later
+			 * below, so only add the remainder here.
+			 */
+			if (len > pmtu)
+				qp->r_psn += (len - 1) / pmtu;
+		} else {
+			e->rdma_sge.mr = NULL;
+			e->rdma_sge.vaddr = NULL;
+			e->rdma_sge.length = 0;
+			e->rdma_sge.sge_length = 0;
+		}
+		e->opcode = opcode;
+		e->sent = 0;
+		e->psn = psn;
+		e->lpsn = qp->r_psn;
+		/*
+		 * We need to increment the MSN here instead of when we
+		 * finish sending the result since a duplicate request would
+		 * increment it more than once.
+		 */
+		qp->r_msn++;
+		qp->r_psn++;
+		qp->r_state = opcode;
+		qp->r_nak_state = 0;
+		qp->r_head_ack_queue = next;
+
+		/* Schedule the send tasklet. */
+		qp->s_flags |= QIB_S_RESP_PENDING;
+		qib_schedule_send(qp);
+
+		goto sunlock;
+	}
+
+	case OP(COMPARE_SWAP):
+	case OP(FETCH_ADD): {
+		struct ib_atomic_eth *ateth;
+		struct qib_ack_entry *e;
+		u64 vaddr;
+		atomic64_t *maddr;
+		u64 sdata;
+		u32 rkey;
+		u8 next;
+
+		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC)))
+			goto nack_inv;
+		next = qp->r_head_ack_queue + 1;
+		if (next > QIB_MAX_RDMA_ATOMIC)
+			next = 0;
+		spin_lock_irqsave(&qp->s_lock, flags);
+		if (unlikely(next == qp->s_tail_ack_queue)) {
+			if (!qp->s_ack_queue[next].sent)
+				goto nack_inv_unlck;
+			qib_update_ack_queue(qp, next);
+		}
+		e = &qp->s_ack_queue[qp->r_head_ack_queue];
+		if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) {
+			qib_put_mr(e->rdma_sge.mr);
+			e->rdma_sge.mr = NULL;
+		}
+		ateth = &ohdr->u.atomic_eth;
+		vaddr = ((u64) be32_to_cpu(ateth->vaddr[0]) << 32) |
+			be32_to_cpu(ateth->vaddr[1]);
+		if (unlikely(vaddr & (sizeof(u64) - 1)))
+			goto nack_inv_unlck;
+		rkey = be32_to_cpu(ateth->rkey);
+		/* Check rkey & NAK */
+		if (unlikely(!qib_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64),
+					  vaddr, rkey,
+					  IB_ACCESS_REMOTE_ATOMIC)))
+			goto nack_acc_unlck;
+		/* Perform atomic OP and save result. */
+		maddr = (atomic64_t *) qp->r_sge.sge.vaddr;
+		sdata = be64_to_cpu(ateth->swap_data);
+		e->atomic_data = (opcode == OP(FETCH_ADD)) ?
+			(u64) atomic64_add_return(sdata, maddr) - sdata :
+			(u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr,
+				      be64_to_cpu(ateth->compare_data),
+				      sdata);
+		qib_put_mr(qp->r_sge.sge.mr);
+		qp->r_sge.num_sge = 0;
+		e->opcode = opcode;
+		e->sent = 0;
+		e->psn = psn;
+		e->lpsn = psn;
+		qp->r_msn++;
+		qp->r_psn++;
+		qp->r_state = opcode;
+		qp->r_nak_state = 0;
+		qp->r_head_ack_queue = next;
+
+		/* Schedule the send tasklet. */
+		qp->s_flags |= QIB_S_RESP_PENDING;
+		qib_schedule_send(qp);
+
+		goto sunlock;
+	}
+
+	default:
+		/* NAK unknown opcodes. */
+		goto nack_inv;
+	}
+	qp->r_psn++;
+	qp->r_state = opcode;
+	qp->r_ack_psn = psn;
+	qp->r_nak_state = 0;
+	/* Send an ACK if requested or required. */
+	if (psn & (1 << 31))
+		goto send_ack;
+	return;
+
+rnr_nak:
+	qp->r_nak_state = IB_RNR_NAK | qp->r_min_rnr_timer;
+	qp->r_ack_psn = qp->r_psn;
+	/* Queue RNR NAK for later */
+	if (list_empty(&qp->rspwait)) {
+		qp->r_flags |= QIB_R_RSP_NAK;
+		atomic_inc(&qp->refcount);
+		list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
+	}
+	return;
+
+nack_op_err:
+	qib_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
+	qp->r_nak_state = IB_NAK_REMOTE_OPERATIONAL_ERROR;
+	qp->r_ack_psn = qp->r_psn;
+	/* Queue NAK for later */
+	if (list_empty(&qp->rspwait)) {
+		qp->r_flags |= QIB_R_RSP_NAK;
+		atomic_inc(&qp->refcount);
+		list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
+	}
+	return;
+
+nack_inv_unlck:
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+nack_inv:
+	qib_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
+	qp->r_nak_state = IB_NAK_INVALID_REQUEST;
+	qp->r_ack_psn = qp->r_psn;
+	/* Queue NAK for later */
+	if (list_empty(&qp->rspwait)) {
+		qp->r_flags |= QIB_R_RSP_NAK;
+		atomic_inc(&qp->refcount);
+		list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
+	}
+	return;
+
+nack_acc_unlck:
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+nack_acc:
+	qib_rc_error(qp, IB_WC_LOC_PROT_ERR);
+	qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
+	qp->r_ack_psn = qp->r_psn;
+send_ack:
+	qib_send_rc_ack(qp);
+	return;
+
+sunlock:
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+}
diff --git a/drivers/infiniband/hw/qib/qib_ruc.c b/drivers/infiniband/hw/qib/qib_ruc.c
new file mode 100644
index 0000000..4c07a8b
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_ruc.c
@@ -0,0 +1,819 @@
+/*
+ * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/spinlock.h>
+
+#include "qib.h"
+#include "qib_mad.h"
+
+/*
+ * Convert the AETH RNR timeout code into the number of microseconds.
+ */
+const u32 ib_qib_rnr_table[32] = {
+	655360,	/* 00: 655.36 */
+	10,	/* 01:    .01 */
+	20,	/* 02     .02 */
+	30,	/* 03:    .03 */
+	40,	/* 04:    .04 */
+	60,	/* 05:    .06 */
+	80,	/* 06:    .08 */
+	120,	/* 07:    .12 */
+	160,	/* 08:    .16 */
+	240,	/* 09:    .24 */
+	320,	/* 0A:    .32 */
+	480,	/* 0B:    .48 */
+	640,	/* 0C:    .64 */
+	960,	/* 0D:    .96 */
+	1280,	/* 0E:   1.28 */
+	1920,	/* 0F:   1.92 */
+	2560,	/* 10:   2.56 */
+	3840,	/* 11:   3.84 */
+	5120,	/* 12:   5.12 */
+	7680,	/* 13:   7.68 */
+	10240,	/* 14:  10.24 */
+	15360,	/* 15:  15.36 */
+	20480,	/* 16:  20.48 */
+	30720,	/* 17:  30.72 */
+	40960,	/* 18:  40.96 */
+	61440,	/* 19:  61.44 */
+	81920,	/* 1A:  81.92 */
+	122880,	/* 1B: 122.88 */
+	163840,	/* 1C: 163.84 */
+	245760,	/* 1D: 245.76 */
+	327680,	/* 1E: 327.68 */
+	491520	/* 1F: 491.52 */
+};
+
+/*
+ * Validate a RWQE and fill in the SGE state.
+ * Return 1 if OK.
+ */
+static int qib_init_sge(struct qib_qp *qp, struct qib_rwqe *wqe)
+{
+	int i, j, ret;
+	struct ib_wc wc;
+	struct qib_lkey_table *rkt;
+	struct qib_pd *pd;
+	struct qib_sge_state *ss;
+
+	rkt = &to_idev(qp->ibqp.device)->lk_table;
+	pd = to_ipd(qp->ibqp.srq ? qp->ibqp.srq->pd : qp->ibqp.pd);
+	ss = &qp->r_sge;
+	ss->sg_list = qp->r_sg_list;
+	qp->r_len = 0;
+	for (i = j = 0; i < wqe->num_sge; i++) {
+		if (wqe->sg_list[i].length == 0)
+			continue;
+		/* Check LKEY */
+		if (!qib_lkey_ok(rkt, pd, j ? &ss->sg_list[j - 1] : &ss->sge,
+				 &wqe->sg_list[i], IB_ACCESS_LOCAL_WRITE))
+			goto bad_lkey;
+		qp->r_len += wqe->sg_list[i].length;
+		j++;
+	}
+	ss->num_sge = j;
+	ss->total_len = qp->r_len;
+	ret = 1;
+	goto bail;
+
+bad_lkey:
+	while (j) {
+		struct qib_sge *sge = --j ? &ss->sg_list[j - 1] : &ss->sge;
+
+		qib_put_mr(sge->mr);
+	}
+	ss->num_sge = 0;
+	memset(&wc, 0, sizeof(wc));
+	wc.wr_id = wqe->wr_id;
+	wc.status = IB_WC_LOC_PROT_ERR;
+	wc.opcode = IB_WC_RECV;
+	wc.qp = &qp->ibqp;
+	/* Signal solicited completion event. */
+	qib_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1);
+	ret = 0;
+bail:
+	return ret;
+}
+
+/**
+ * qib_get_rwqe - copy the next RWQE into the QP's RWQE
+ * @qp: the QP
+ * @wr_id_only: update qp->r_wr_id only, not qp->r_sge
+ *
+ * Return -1 if there is a local error, 0 if no RWQE is available,
+ * otherwise return 1.
+ *
+ * Can be called from interrupt level.
+ */
+int qib_get_rwqe(struct qib_qp *qp, int wr_id_only)
+{
+	unsigned long flags;
+	struct qib_rq *rq;
+	struct qib_rwq *wq;
+	struct qib_srq *srq;
+	struct qib_rwqe *wqe;
+	void (*handler)(struct ib_event *, void *);
+	u32 tail;
+	int ret;
+
+	if (qp->ibqp.srq) {
+		srq = to_isrq(qp->ibqp.srq);
+		handler = srq->ibsrq.event_handler;
+		rq = &srq->rq;
+	} else {
+		srq = NULL;
+		handler = NULL;
+		rq = &qp->r_rq;
+	}
+
+	spin_lock_irqsave(&rq->lock, flags);
+	if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK)) {
+		ret = 0;
+		goto unlock;
+	}
+
+	wq = rq->wq;
+	tail = wq->tail;
+	/* Validate tail before using it since it is user writable. */
+	if (tail >= rq->size)
+		tail = 0;
+	if (unlikely(tail == wq->head)) {
+		ret = 0;
+		goto unlock;
+	}
+	/* Make sure entry is read after head index is read. */
+	smp_rmb();
+	wqe = get_rwqe_ptr(rq, tail);
+	/*
+	 * Even though we update the tail index in memory, the verbs
+	 * consumer is not supposed to post more entries until a
+	 * completion is generated.
+	 */
+	if (++tail >= rq->size)
+		tail = 0;
+	wq->tail = tail;
+	if (!wr_id_only && !qib_init_sge(qp, wqe)) {
+		ret = -1;
+		goto unlock;
+	}
+	qp->r_wr_id = wqe->wr_id;
+
+	ret = 1;
+	set_bit(QIB_R_WRID_VALID, &qp->r_aflags);
+	if (handler) {
+		u32 n;
+
+		/*
+		 * Validate head pointer value and compute
+		 * the number of remaining WQEs.
+		 */
+		n = wq->head;
+		if (n >= rq->size)
+			n = 0;
+		if (n < tail)
+			n += rq->size - tail;
+		else
+			n -= tail;
+		if (n < srq->limit) {
+			struct ib_event ev;
+
+			srq->limit = 0;
+			spin_unlock_irqrestore(&rq->lock, flags);
+			ev.device = qp->ibqp.device;
+			ev.element.srq = qp->ibqp.srq;
+			ev.event = IB_EVENT_SRQ_LIMIT_REACHED;
+			handler(&ev, srq->ibsrq.srq_context);
+			goto bail;
+		}
+	}
+unlock:
+	spin_unlock_irqrestore(&rq->lock, flags);
+bail:
+	return ret;
+}
+
+/*
+ * Switch to alternate path.
+ * The QP s_lock should be held and interrupts disabled.
+ */
+void qib_migrate_qp(struct qib_qp *qp)
+{
+	struct ib_event ev;
+
+	qp->s_mig_state = IB_MIG_MIGRATED;
+	qp->remote_ah_attr = qp->alt_ah_attr;
+	qp->port_num = qp->alt_ah_attr.port_num;
+	qp->s_pkey_index = qp->s_alt_pkey_index;
+
+	ev.device = qp->ibqp.device;
+	ev.element.qp = &qp->ibqp;
+	ev.event = IB_EVENT_PATH_MIG;
+	qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
+}
+
+static __be64 get_sguid(struct qib_ibport *ibp, unsigned index)
+{
+	if (!index) {
+		struct qib_pportdata *ppd = ppd_from_ibp(ibp);
+
+		return ppd->guid;
+	} else
+		return ibp->guids[index - 1];
+}
+
+static int gid_ok(union ib_gid *gid, __be64 gid_prefix, __be64 id)
+{
+	return (gid->global.interface_id == id &&
+		(gid->global.subnet_prefix == gid_prefix ||
+		 gid->global.subnet_prefix == IB_DEFAULT_GID_PREFIX));
+}
+
+/*
+ *
+ * This should be called with the QP r_lock held.
+ *
+ * The s_lock will be acquired around the qib_migrate_qp() call.
+ */
+int qib_ruc_check_hdr(struct qib_ibport *ibp, struct qib_ib_header *hdr,
+		      int has_grh, struct qib_qp *qp, u32 bth0)
+{
+	__be64 guid;
+	unsigned long flags;
+
+	if (qp->s_mig_state == IB_MIG_ARMED && (bth0 & IB_BTH_MIG_REQ)) {
+		if (!has_grh) {
+			if (qp->alt_ah_attr.ah_flags & IB_AH_GRH)
+				goto err;
+		} else {
+			if (!(qp->alt_ah_attr.ah_flags & IB_AH_GRH))
+				goto err;
+			guid = get_sguid(ibp, qp->alt_ah_attr.grh.sgid_index);
+			if (!gid_ok(&hdr->u.l.grh.dgid, ibp->gid_prefix, guid))
+				goto err;
+			if (!gid_ok(&hdr->u.l.grh.sgid,
+			    qp->alt_ah_attr.grh.dgid.global.subnet_prefix,
+			    qp->alt_ah_attr.grh.dgid.global.interface_id))
+				goto err;
+		}
+		if (!qib_pkey_ok((u16)bth0,
+				 qib_get_pkey(ibp, qp->s_alt_pkey_index))) {
+			qib_bad_pqkey(ibp, IB_NOTICE_TRAP_BAD_PKEY,
+				      (u16)bth0,
+				      (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF,
+				      0, qp->ibqp.qp_num,
+				      hdr->lrh[3], hdr->lrh[1]);
+			goto err;
+		}
+		/* Validate the SLID. See Ch. 9.6.1.5 and 17.2.8 */
+		if (be16_to_cpu(hdr->lrh[3]) != qp->alt_ah_attr.dlid ||
+		    ppd_from_ibp(ibp)->port != qp->alt_ah_attr.port_num)
+			goto err;
+		spin_lock_irqsave(&qp->s_lock, flags);
+		qib_migrate_qp(qp);
+		spin_unlock_irqrestore(&qp->s_lock, flags);
+	} else {
+		if (!has_grh) {
+			if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
+				goto err;
+		} else {
+			if (!(qp->remote_ah_attr.ah_flags & IB_AH_GRH))
+				goto err;
+			guid = get_sguid(ibp,
+					 qp->remote_ah_attr.grh.sgid_index);
+			if (!gid_ok(&hdr->u.l.grh.dgid, ibp->gid_prefix, guid))
+				goto err;
+			if (!gid_ok(&hdr->u.l.grh.sgid,
+			    qp->remote_ah_attr.grh.dgid.global.subnet_prefix,
+			    qp->remote_ah_attr.grh.dgid.global.interface_id))
+				goto err;
+		}
+		if (!qib_pkey_ok((u16)bth0,
+				 qib_get_pkey(ibp, qp->s_pkey_index))) {
+			qib_bad_pqkey(ibp, IB_NOTICE_TRAP_BAD_PKEY,
+				      (u16)bth0,
+				      (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF,
+				      0, qp->ibqp.qp_num,
+				      hdr->lrh[3], hdr->lrh[1]);
+			goto err;
+		}
+		/* Validate the SLID. See Ch. 9.6.1.5 */
+		if (be16_to_cpu(hdr->lrh[3]) != qp->remote_ah_attr.dlid ||
+		    ppd_from_ibp(ibp)->port != qp->port_num)
+			goto err;
+		if (qp->s_mig_state == IB_MIG_REARM &&
+		    !(bth0 & IB_BTH_MIG_REQ))
+			qp->s_mig_state = IB_MIG_ARMED;
+	}
+
+	return 0;
+
+err:
+	return 1;
+}
+
+/**
+ * qib_ruc_loopback - handle UC and RC lookback requests
+ * @sqp: the sending QP
+ *
+ * This is called from qib_do_send() to
+ * forward a WQE addressed to the same HCA.
+ * Note that although we are single threaded due to the tasklet, we still
+ * have to protect against post_send().  We don't have to worry about
+ * receive interrupts since this is a connected protocol and all packets
+ * will pass through here.
+ */
+static void qib_ruc_loopback(struct qib_qp *sqp)
+{
+	struct qib_ibport *ibp = to_iport(sqp->ibqp.device, sqp->port_num);
+	struct qib_qp *qp;
+	struct qib_swqe *wqe;
+	struct qib_sge *sge;
+	unsigned long flags;
+	struct ib_wc wc;
+	u64 sdata;
+	atomic64_t *maddr;
+	enum ib_wc_status send_status;
+	int release;
+	int ret;
+
+	/*
+	 * Note that we check the responder QP state after
+	 * checking the requester's state.
+	 */
+	qp = qib_lookup_qpn(ibp, sqp->remote_qpn);
+
+	spin_lock_irqsave(&sqp->s_lock, flags);
+
+	/* Return if we are already busy processing a work request. */
+	if ((sqp->s_flags & (QIB_S_BUSY | QIB_S_ANY_WAIT)) ||
+	    !(ib_qib_state_ops[sqp->state] & QIB_PROCESS_OR_FLUSH_SEND))
+		goto unlock;
+
+	sqp->s_flags |= QIB_S_BUSY;
+
+again:
+	if (sqp->s_last == sqp->s_head)
+		goto clr_busy;
+	wqe = get_swqe_ptr(sqp, sqp->s_last);
+
+	/* Return if it is not OK to start a new work reqeust. */
+	if (!(ib_qib_state_ops[sqp->state] & QIB_PROCESS_NEXT_SEND_OK)) {
+		if (!(ib_qib_state_ops[sqp->state] & QIB_FLUSH_SEND))
+			goto clr_busy;
+		/* We are in the error state, flush the work request. */
+		send_status = IB_WC_WR_FLUSH_ERR;
+		goto flush_send;
+	}
+
+	/*
+	 * We can rely on the entry not changing without the s_lock
+	 * being held until we update s_last.
+	 * We increment s_cur to indicate s_last is in progress.
+	 */
+	if (sqp->s_last == sqp->s_cur) {
+		if (++sqp->s_cur >= sqp->s_size)
+			sqp->s_cur = 0;
+	}
+	spin_unlock_irqrestore(&sqp->s_lock, flags);
+
+	if (!qp || !(ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK) ||
+	    qp->ibqp.qp_type != sqp->ibqp.qp_type) {
+		ibp->n_pkt_drops++;
+		/*
+		 * For RC, the requester would timeout and retry so
+		 * shortcut the timeouts and just signal too many retries.
+		 */
+		if (sqp->ibqp.qp_type == IB_QPT_RC)
+			send_status = IB_WC_RETRY_EXC_ERR;
+		else
+			send_status = IB_WC_SUCCESS;
+		goto serr;
+	}
+
+	memset(&wc, 0, sizeof wc);
+	send_status = IB_WC_SUCCESS;
+
+	release = 1;
+	sqp->s_sge.sge = wqe->sg_list[0];
+	sqp->s_sge.sg_list = wqe->sg_list + 1;
+	sqp->s_sge.num_sge = wqe->wr.num_sge;
+	sqp->s_len = wqe->length;
+	switch (wqe->wr.opcode) {
+	case IB_WR_SEND_WITH_IMM:
+		wc.wc_flags = IB_WC_WITH_IMM;
+		wc.ex.imm_data = wqe->wr.ex.imm_data;
+		/* FALLTHROUGH */
+	case IB_WR_SEND:
+		ret = qib_get_rwqe(qp, 0);
+		if (ret < 0)
+			goto op_err;
+		if (!ret)
+			goto rnr_nak;
+		break;
+
+	case IB_WR_RDMA_WRITE_WITH_IMM:
+		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
+			goto inv_err;
+		wc.wc_flags = IB_WC_WITH_IMM;
+		wc.ex.imm_data = wqe->wr.ex.imm_data;
+		ret = qib_get_rwqe(qp, 1);
+		if (ret < 0)
+			goto op_err;
+		if (!ret)
+			goto rnr_nak;
+		/* FALLTHROUGH */
+	case IB_WR_RDMA_WRITE:
+		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
+			goto inv_err;
+		if (wqe->length == 0)
+			break;
+		if (unlikely(!qib_rkey_ok(qp, &qp->r_sge.sge, wqe->length,
+					  wqe->wr.wr.rdma.remote_addr,
+					  wqe->wr.wr.rdma.rkey,
+					  IB_ACCESS_REMOTE_WRITE)))
+			goto acc_err;
+		qp->r_sge.sg_list = NULL;
+		qp->r_sge.num_sge = 1;
+		qp->r_sge.total_len = wqe->length;
+		break;
+
+	case IB_WR_RDMA_READ:
+		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
+			goto inv_err;
+		if (unlikely(!qib_rkey_ok(qp, &sqp->s_sge.sge, wqe->length,
+					  wqe->wr.wr.rdma.remote_addr,
+					  wqe->wr.wr.rdma.rkey,
+					  IB_ACCESS_REMOTE_READ)))
+			goto acc_err;
+		release = 0;
+		sqp->s_sge.sg_list = NULL;
+		sqp->s_sge.num_sge = 1;
+		qp->r_sge.sge = wqe->sg_list[0];
+		qp->r_sge.sg_list = wqe->sg_list + 1;
+		qp->r_sge.num_sge = wqe->wr.num_sge;
+		qp->r_sge.total_len = wqe->length;
+		break;
+
+	case IB_WR_ATOMIC_CMP_AND_SWP:
+	case IB_WR_ATOMIC_FETCH_AND_ADD:
+		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC)))
+			goto inv_err;
+		if (unlikely(!qib_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64),
+					  wqe->wr.wr.atomic.remote_addr,
+					  wqe->wr.wr.atomic.rkey,
+					  IB_ACCESS_REMOTE_ATOMIC)))
+			goto acc_err;
+		/* Perform atomic OP and save result. */
+		maddr = (atomic64_t *) qp->r_sge.sge.vaddr;
+		sdata = wqe->wr.wr.atomic.compare_add;
+		*(u64 *) sqp->s_sge.sge.vaddr =
+			(wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) ?
+			(u64) atomic64_add_return(sdata, maddr) - sdata :
+			(u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr,
+				      sdata, wqe->wr.wr.atomic.swap);
+		qib_put_mr(qp->r_sge.sge.mr);
+		qp->r_sge.num_sge = 0;
+		goto send_comp;
+
+	default:
+		send_status = IB_WC_LOC_QP_OP_ERR;
+		goto serr;
+	}
+
+	sge = &sqp->s_sge.sge;
+	while (sqp->s_len) {
+		u32 len = sqp->s_len;
+
+		if (len > sge->length)
+			len = sge->length;
+		if (len > sge->sge_length)
+			len = sge->sge_length;
+		BUG_ON(len == 0);
+		qib_copy_sge(&qp->r_sge, sge->vaddr, len, release);
+		sge->vaddr += len;
+		sge->length -= len;
+		sge->sge_length -= len;
+		if (sge->sge_length == 0) {
+			if (!release)
+				qib_put_mr(sge->mr);
+			if (--sqp->s_sge.num_sge)
+				*sge = *sqp->s_sge.sg_list++;
+		} else if (sge->length == 0 && sge->mr->lkey) {
+			if (++sge->n >= QIB_SEGSZ) {
+				if (++sge->m >= sge->mr->mapsz)
+					break;
+				sge->n = 0;
+			}
+			sge->vaddr =
+				sge->mr->map[sge->m]->segs[sge->n].vaddr;
+			sge->length =
+				sge->mr->map[sge->m]->segs[sge->n].length;
+		}
+		sqp->s_len -= len;
+	}
+	if (release)
+		qib_put_ss(&qp->r_sge);
+
+	if (!test_and_clear_bit(QIB_R_WRID_VALID, &qp->r_aflags))
+		goto send_comp;
+
+	if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM)
+		wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
+	else
+		wc.opcode = IB_WC_RECV;
+	wc.wr_id = qp->r_wr_id;
+	wc.status = IB_WC_SUCCESS;
+	wc.byte_len = wqe->length;
+	wc.qp = &qp->ibqp;
+	wc.src_qp = qp->remote_qpn;
+	wc.slid = qp->remote_ah_attr.dlid;
+	wc.sl = qp->remote_ah_attr.sl;
+	wc.port_num = 1;
+	/* Signal completion event if the solicited bit is set. */
+	qib_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
+		       wqe->wr.send_flags & IB_SEND_SOLICITED);
+
+send_comp:
+	spin_lock_irqsave(&sqp->s_lock, flags);
+	ibp->n_loop_pkts++;
+flush_send:
+	sqp->s_rnr_retry = sqp->s_rnr_retry_cnt;
+	qib_send_complete(sqp, wqe, send_status);
+	goto again;
+
+rnr_nak:
+	/* Handle RNR NAK */
+	if (qp->ibqp.qp_type == IB_QPT_UC)
+		goto send_comp;
+	ibp->n_rnr_naks++;
+	/*
+	 * Note: we don't need the s_lock held since the BUSY flag
+	 * makes this single threaded.
+	 */
+	if (sqp->s_rnr_retry == 0) {
+		send_status = IB_WC_RNR_RETRY_EXC_ERR;
+		goto serr;
+	}
+	if (sqp->s_rnr_retry_cnt < 7)
+		sqp->s_rnr_retry--;
+	spin_lock_irqsave(&sqp->s_lock, flags);
+	if (!(ib_qib_state_ops[sqp->state] & QIB_PROCESS_RECV_OK))
+		goto clr_busy;
+	sqp->s_flags |= QIB_S_WAIT_RNR;
+	sqp->s_timer.function = qib_rc_rnr_retry;
+	sqp->s_timer.expires = jiffies +
+		usecs_to_jiffies(ib_qib_rnr_table[qp->r_min_rnr_timer]);
+	add_timer(&sqp->s_timer);
+	goto clr_busy;
+
+op_err:
+	send_status = IB_WC_REM_OP_ERR;
+	wc.status = IB_WC_LOC_QP_OP_ERR;
+	goto err;
+
+inv_err:
+	send_status = IB_WC_REM_INV_REQ_ERR;
+	wc.status = IB_WC_LOC_QP_OP_ERR;
+	goto err;
+
+acc_err:
+	send_status = IB_WC_REM_ACCESS_ERR;
+	wc.status = IB_WC_LOC_PROT_ERR;
+err:
+	/* responder goes to error state */
+	qib_rc_error(qp, wc.status);
+
+serr:
+	spin_lock_irqsave(&sqp->s_lock, flags);
+	qib_send_complete(sqp, wqe, send_status);
+	if (sqp->ibqp.qp_type == IB_QPT_RC) {
+		int lastwqe = qib_error_qp(sqp, IB_WC_WR_FLUSH_ERR);
+
+		sqp->s_flags &= ~QIB_S_BUSY;
+		spin_unlock_irqrestore(&sqp->s_lock, flags);
+		if (lastwqe) {
+			struct ib_event ev;
+
+			ev.device = sqp->ibqp.device;
+			ev.element.qp = &sqp->ibqp;
+			ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
+			sqp->ibqp.event_handler(&ev, sqp->ibqp.qp_context);
+		}
+		goto done;
+	}
+clr_busy:
+	sqp->s_flags &= ~QIB_S_BUSY;
+unlock:
+	spin_unlock_irqrestore(&sqp->s_lock, flags);
+done:
+	if (qp && atomic_dec_and_test(&qp->refcount))
+		wake_up(&qp->wait);
+}
+
+/**
+ * qib_make_grh - construct a GRH header
+ * @ibp: a pointer to the IB port
+ * @hdr: a pointer to the GRH header being constructed
+ * @grh: the global route address to send to
+ * @hwords: the number of 32 bit words of header being sent
+ * @nwords: the number of 32 bit words of data being sent
+ *
+ * Return the size of the header in 32 bit words.
+ */
+u32 qib_make_grh(struct qib_ibport *ibp, struct ib_grh *hdr,
+		 struct ib_global_route *grh, u32 hwords, u32 nwords)
+{
+	hdr->version_tclass_flow =
+		cpu_to_be32((IB_GRH_VERSION << IB_GRH_VERSION_SHIFT) |
+			    (grh->traffic_class << IB_GRH_TCLASS_SHIFT) |
+			    (grh->flow_label << IB_GRH_FLOW_SHIFT));
+	hdr->paylen = cpu_to_be16((hwords - 2 + nwords + SIZE_OF_CRC) << 2);
+	/* next_hdr is defined by C8-7 in ch. 8.4.1 */
+	hdr->next_hdr = IB_GRH_NEXT_HDR;
+	hdr->hop_limit = grh->hop_limit;
+	/* The SGID is 32-bit aligned. */
+	hdr->sgid.global.subnet_prefix = ibp->gid_prefix;
+	hdr->sgid.global.interface_id = grh->sgid_index ?
+		ibp->guids[grh->sgid_index - 1] : ppd_from_ibp(ibp)->guid;
+	hdr->dgid = grh->dgid;
+
+	/* GRH header size in 32-bit words. */
+	return sizeof(struct ib_grh) / sizeof(u32);
+}
+
+void qib_make_ruc_header(struct qib_qp *qp, struct qib_other_headers *ohdr,
+			 u32 bth0, u32 bth2)
+{
+	struct qib_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+	u16 lrh0;
+	u32 nwords;
+	u32 extra_bytes;
+
+	/* Construct the header. */
+	extra_bytes = -qp->s_cur_size & 3;
+	nwords = (qp->s_cur_size + extra_bytes) >> 2;
+	lrh0 = QIB_LRH_BTH;
+	if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) {
+		qp->s_hdrwords += qib_make_grh(ibp, &qp->s_hdr->u.l.grh,
+					       &qp->remote_ah_attr.grh,
+					       qp->s_hdrwords, nwords);
+		lrh0 = QIB_LRH_GRH;
+	}
+	lrh0 |= ibp->sl_to_vl[qp->remote_ah_attr.sl] << 12 |
+		qp->remote_ah_attr.sl << 4;
+	qp->s_hdr->lrh[0] = cpu_to_be16(lrh0);
+	qp->s_hdr->lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid);
+	qp->s_hdr->lrh[2] = cpu_to_be16(qp->s_hdrwords + nwords + SIZE_OF_CRC);
+	qp->s_hdr->lrh[3] = cpu_to_be16(ppd_from_ibp(ibp)->lid |
+				       qp->remote_ah_attr.src_path_bits);
+	bth0 |= qib_get_pkey(ibp, qp->s_pkey_index);
+	bth0 |= extra_bytes << 20;
+	if (qp->s_mig_state == IB_MIG_MIGRATED)
+		bth0 |= IB_BTH_MIG_REQ;
+	ohdr->bth[0] = cpu_to_be32(bth0);
+	ohdr->bth[1] = cpu_to_be32(qp->remote_qpn);
+	ohdr->bth[2] = cpu_to_be32(bth2);
+	this_cpu_inc(ibp->pmastats->n_unicast_xmit);
+}
+
+/**
+ * qib_do_send - perform a send on a QP
+ * @work: contains a pointer to the QP
+ *
+ * Process entries in the send work queue until credit or queue is
+ * exhausted.  Only allow one CPU to send a packet per QP (tasklet).
+ * Otherwise, two threads could send packets out of order.
+ */
+void qib_do_send(struct work_struct *work)
+{
+	struct qib_qp *qp = container_of(work, struct qib_qp, s_work);
+	struct qib_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+	struct qib_pportdata *ppd = ppd_from_ibp(ibp);
+	int (*make_req)(struct qib_qp *qp);
+	unsigned long flags;
+
+	if ((qp->ibqp.qp_type == IB_QPT_RC ||
+	     qp->ibqp.qp_type == IB_QPT_UC) &&
+	    (qp->remote_ah_attr.dlid & ~((1 << ppd->lmc) - 1)) == ppd->lid) {
+		qib_ruc_loopback(qp);
+		return;
+	}
+
+	if (qp->ibqp.qp_type == IB_QPT_RC)
+		make_req = qib_make_rc_req;
+	else if (qp->ibqp.qp_type == IB_QPT_UC)
+		make_req = qib_make_uc_req;
+	else
+		make_req = qib_make_ud_req;
+
+	spin_lock_irqsave(&qp->s_lock, flags);
+
+	/* Return if we are already busy processing a work request. */
+	if (!qib_send_ok(qp)) {
+		spin_unlock_irqrestore(&qp->s_lock, flags);
+		return;
+	}
+
+	qp->s_flags |= QIB_S_BUSY;
+
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+
+	do {
+		/* Check for a constructed packet to be sent. */
+		if (qp->s_hdrwords != 0) {
+			/*
+			 * If the packet cannot be sent now, return and
+			 * the send tasklet will be woken up later.
+			 */
+			if (qib_verbs_send(qp, qp->s_hdr, qp->s_hdrwords,
+					   qp->s_cur_sge, qp->s_cur_size))
+				break;
+			/* Record that s_hdr is empty. */
+			qp->s_hdrwords = 0;
+		}
+	} while (make_req(qp));
+}
+
+/*
+ * This should be called with s_lock held.
+ */
+void qib_send_complete(struct qib_qp *qp, struct qib_swqe *wqe,
+		       enum ib_wc_status status)
+{
+	u32 old_last, last;
+	unsigned i;
+
+	if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_OR_FLUSH_SEND))
+		return;
+
+	for (i = 0; i < wqe->wr.num_sge; i++) {
+		struct qib_sge *sge = &wqe->sg_list[i];
+
+		qib_put_mr(sge->mr);
+	}
+	if (qp->ibqp.qp_type == IB_QPT_UD ||
+	    qp->ibqp.qp_type == IB_QPT_SMI ||
+	    qp->ibqp.qp_type == IB_QPT_GSI)
+		atomic_dec(&to_iah(wqe->wr.wr.ud.ah)->refcount);
+
+	/* See ch. 11.2.4.1 and 10.7.3.1 */
+	if (!(qp->s_flags & QIB_S_SIGNAL_REQ_WR) ||
+	    (wqe->wr.send_flags & IB_SEND_SIGNALED) ||
+	    status != IB_WC_SUCCESS) {
+		struct ib_wc wc;
+
+		memset(&wc, 0, sizeof wc);
+		wc.wr_id = wqe->wr.wr_id;
+		wc.status = status;
+		wc.opcode = ib_qib_wc_opcode[wqe->wr.opcode];
+		wc.qp = &qp->ibqp;
+		if (status == IB_WC_SUCCESS)
+			wc.byte_len = wqe->length;
+		qib_cq_enter(to_icq(qp->ibqp.send_cq), &wc,
+			     status != IB_WC_SUCCESS);
+	}
+
+	last = qp->s_last;
+	old_last = last;
+	if (++last >= qp->s_size)
+		last = 0;
+	qp->s_last = last;
+	if (qp->s_acked == old_last)
+		qp->s_acked = last;
+	if (qp->s_cur == old_last)
+		qp->s_cur = last;
+	if (qp->s_tail == old_last)
+		qp->s_tail = last;
+	if (qp->state == IB_QPS_SQD && last == qp->s_cur)
+		qp->s_draining = 0;
+}
diff --git a/drivers/infiniband/hw/qib/qib_sd7220.c b/drivers/infiniband/hw/qib/qib_sd7220.c
new file mode 100644
index 0000000..911205d
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_sd7220.c
@@ -0,0 +1,1449 @@
+/*
+ * Copyright (c) 2013 Intel Corporation. All rights reserved.
+ * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * This file contains all of the code that is specific to the SerDes
+ * on the QLogic_IB 7220 chip.
+ */
+
+#include <linux/pci.h>
+#include <linux/delay.h>
+#include <linux/module.h>
+#include <linux/firmware.h>
+
+#include "qib.h"
+#include "qib_7220.h"
+
+#define SD7220_FW_NAME "qlogic/sd7220.fw"
+MODULE_FIRMWARE(SD7220_FW_NAME);
+
+/*
+ * Same as in qib_iba7220.c, but just the registers needed here.
+ * Could move whole set to qib_7220.h, but decided better to keep
+ * local.
+ */
+#define KREG_IDX(regname) (QIB_7220_##regname##_OFFS / sizeof(u64))
+#define kr_hwerrclear KREG_IDX(HwErrClear)
+#define kr_hwerrmask KREG_IDX(HwErrMask)
+#define kr_hwerrstatus KREG_IDX(HwErrStatus)
+#define kr_ibcstatus KREG_IDX(IBCStatus)
+#define kr_ibserdesctrl KREG_IDX(IBSerDesCtrl)
+#define kr_scratch KREG_IDX(Scratch)
+#define kr_xgxs_cfg KREG_IDX(XGXSCfg)
+/* these are used only here, not in qib_iba7220.c */
+#define kr_ibsd_epb_access_ctrl KREG_IDX(ibsd_epb_access_ctrl)
+#define kr_ibsd_epb_transaction_reg KREG_IDX(ibsd_epb_transaction_reg)
+#define kr_pciesd_epb_transaction_reg KREG_IDX(pciesd_epb_transaction_reg)
+#define kr_pciesd_epb_access_ctrl KREG_IDX(pciesd_epb_access_ctrl)
+#define kr_serdes_ddsrxeq0 KREG_IDX(SerDes_DDSRXEQ0)
+
+/*
+ * The IBSerDesMappTable is a memory that holds values to be stored in
+ * various SerDes registers by IBC.
+ */
+#define kr_serdes_maptable KREG_IDX(IBSerDesMappTable)
+
+/*
+ * Below used for sdnum parameter, selecting one of the two sections
+ * used for PCIe, or the single SerDes used for IB.
+ */
+#define PCIE_SERDES0 0
+#define PCIE_SERDES1 1
+
+/*
+ * The EPB requires addressing in a particular form. EPB_LOC() is intended
+ * to make #definitions a little more readable.
+ */
+#define EPB_ADDR_SHF 8
+#define EPB_LOC(chn, elt, reg) \
+	(((elt & 0xf) | ((chn & 7) << 4) | ((reg & 0x3f) << 9)) << \
+	 EPB_ADDR_SHF)
+#define EPB_IB_QUAD0_CS_SHF (25)
+#define EPB_IB_QUAD0_CS (1U <<  EPB_IB_QUAD0_CS_SHF)
+#define EPB_IB_UC_CS_SHF (26)
+#define EPB_PCIE_UC_CS_SHF (27)
+#define EPB_GLOBAL_WR (1U << (EPB_ADDR_SHF + 8))
+
+/* Forward declarations. */
+static int qib_sd7220_reg_mod(struct qib_devdata *dd, int sdnum, u32 loc,
+			      u32 data, u32 mask);
+static int ibsd_mod_allchnls(struct qib_devdata *dd, int loc, int val,
+			     int mask);
+static int qib_sd_trimdone_poll(struct qib_devdata *dd);
+static void qib_sd_trimdone_monitor(struct qib_devdata *dd, const char *where);
+static int qib_sd_setvals(struct qib_devdata *dd);
+static int qib_sd_early(struct qib_devdata *dd);
+static int qib_sd_dactrim(struct qib_devdata *dd);
+static int qib_internal_presets(struct qib_devdata *dd);
+/* Tweak the register (CMUCTRL5) that contains the TRIMSELF controls */
+static int qib_sd_trimself(struct qib_devdata *dd, int val);
+static int epb_access(struct qib_devdata *dd, int sdnum, int claim);
+static int qib_sd7220_ib_load(struct qib_devdata *dd,
+			      const struct firmware *fw);
+static int qib_sd7220_ib_vfy(struct qib_devdata *dd,
+			     const struct firmware *fw);
+
+/*
+ * Below keeps track of whether the "once per power-on" initialization has
+ * been done, because uC code Version 1.32.17 or higher allows the uC to
+ * be reset at will, and Automatic Equalization may require it. So the
+ * state of the reset "pin", is no longer valid. Instead, we check for the
+ * actual uC code having been loaded.
+ */
+static int qib_ibsd_ucode_loaded(struct qib_pportdata *ppd,
+				 const struct firmware *fw)
+{
+	struct qib_devdata *dd = ppd->dd;
+
+	if (!dd->cspec->serdes_first_init_done &&
+	    qib_sd7220_ib_vfy(dd, fw) > 0)
+		dd->cspec->serdes_first_init_done = 1;
+	return dd->cspec->serdes_first_init_done;
+}
+
+/* repeat #define for local use. "Real" #define is in qib_iba7220.c */
+#define QLOGIC_IB_HWE_IB_UC_MEMORYPARITYERR      0x0000004000000000ULL
+#define IB_MPREG5 (EPB_LOC(6, 0, 0xE) | (1L << EPB_IB_UC_CS_SHF))
+#define IB_MPREG6 (EPB_LOC(6, 0, 0xF) | (1U << EPB_IB_UC_CS_SHF))
+#define UC_PAR_CLR_D 8
+#define UC_PAR_CLR_M 0xC
+#define IB_CTRL2(chn) (EPB_LOC(chn, 7, 3) | EPB_IB_QUAD0_CS)
+#define START_EQ1(chan) EPB_LOC(chan, 7, 0x27)
+
+void qib_sd7220_clr_ibpar(struct qib_devdata *dd)
+{
+	int ret;
+
+	/* clear, then re-enable parity errs */
+	ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES, IB_MPREG6,
+		UC_PAR_CLR_D, UC_PAR_CLR_M);
+	if (ret < 0) {
+		qib_dev_err(dd, "Failed clearing IBSerDes Parity err\n");
+		goto bail;
+	}
+	ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES, IB_MPREG6, 0,
+		UC_PAR_CLR_M);
+
+	qib_read_kreg32(dd, kr_scratch);
+	udelay(4);
+	qib_write_kreg(dd, kr_hwerrclear,
+		QLOGIC_IB_HWE_IB_UC_MEMORYPARITYERR);
+	qib_read_kreg32(dd, kr_scratch);
+bail:
+	return;
+}
+
+/*
+ * After a reset or other unusual event, the epb interface may need
+ * to be re-synchronized, between the host and the uC.
+ * returns <0 for failure to resync within IBSD_RESYNC_TRIES (not expected)
+ */
+#define IBSD_RESYNC_TRIES 3
+#define IB_PGUDP(chn) (EPB_LOC((chn), 2, 1) | EPB_IB_QUAD0_CS)
+#define IB_CMUDONE(chn) (EPB_LOC((chn), 7, 0xF) | EPB_IB_QUAD0_CS)
+
+static int qib_resync_ibepb(struct qib_devdata *dd)
+{
+	int ret, pat, tries, chn;
+	u32 loc;
+
+	ret = -1;
+	chn = 0;
+	for (tries = 0; tries < (4 * IBSD_RESYNC_TRIES); ++tries) {
+		loc = IB_PGUDP(chn);
+		ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES, loc, 0, 0);
+		if (ret < 0) {
+			qib_dev_err(dd, "Failed read in resync\n");
+			continue;
+		}
+		if (ret != 0xF0 && ret != 0x55 && tries == 0)
+			qib_dev_err(dd, "unexpected pattern in resync\n");
+		pat = ret ^ 0xA5; /* alternate F0 and 55 */
+		ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES, loc, pat, 0xFF);
+		if (ret < 0) {
+			qib_dev_err(dd, "Failed write in resync\n");
+			continue;
+		}
+		ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES, loc, 0, 0);
+		if (ret < 0) {
+			qib_dev_err(dd, "Failed re-read in resync\n");
+			continue;
+		}
+		if (ret != pat) {
+			qib_dev_err(dd, "Failed compare1 in resync\n");
+			continue;
+		}
+		loc = IB_CMUDONE(chn);
+		ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES, loc, 0, 0);
+		if (ret < 0) {
+			qib_dev_err(dd, "Failed CMUDONE rd in resync\n");
+			continue;
+		}
+		if ((ret & 0x70) != ((chn << 4) | 0x40)) {
+			qib_dev_err(dd, "Bad CMUDONE value %02X, chn %d\n",
+				    ret, chn);
+			continue;
+		}
+		if (++chn == 4)
+			break;  /* Success */
+	}
+	return (ret > 0) ? 0 : ret;
+}
+
+/*
+ * Localize the stuff that should be done to change IB uC reset
+ * returns <0 for errors.
+ */
+static int qib_ibsd_reset(struct qib_devdata *dd, int assert_rst)
+{
+	u64 rst_val;
+	int ret = 0;
+	unsigned long flags;
+
+	rst_val = qib_read_kreg64(dd, kr_ibserdesctrl);
+	if (assert_rst) {
+		/*
+		 * Vendor recommends "interrupting" uC before reset, to
+		 * minimize possible glitches.
+		 */
+		spin_lock_irqsave(&dd->cspec->sdepb_lock, flags);
+		epb_access(dd, IB_7220_SERDES, 1);
+		rst_val |= 1ULL;
+		/* Squelch possible parity error from _asserting_ reset */
+		qib_write_kreg(dd, kr_hwerrmask,
+			       dd->cspec->hwerrmask &
+			       ~QLOGIC_IB_HWE_IB_UC_MEMORYPARITYERR);
+		qib_write_kreg(dd, kr_ibserdesctrl, rst_val);
+		/* flush write, delay to ensure it took effect */
+		qib_read_kreg32(dd, kr_scratch);
+		udelay(2);
+		/* once it's reset, can remove interrupt */
+		epb_access(dd, IB_7220_SERDES, -1);
+		spin_unlock_irqrestore(&dd->cspec->sdepb_lock, flags);
+	} else {
+		/*
+		 * Before we de-assert reset, we need to deal with
+		 * possible glitch on the Parity-error line.
+		 * Suppress it around the reset, both in chip-level
+		 * hwerrmask and in IB uC control reg. uC will allow
+		 * it again during startup.
+		 */
+		u64 val;
+		rst_val &= ~(1ULL);
+		qib_write_kreg(dd, kr_hwerrmask,
+			       dd->cspec->hwerrmask &
+			       ~QLOGIC_IB_HWE_IB_UC_MEMORYPARITYERR);
+
+		ret = qib_resync_ibepb(dd);
+		if (ret < 0)
+			qib_dev_err(dd, "unable to re-sync IB EPB\n");
+
+		/* set uC control regs to suppress parity errs */
+		ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES, IB_MPREG5, 1, 1);
+		if (ret < 0)
+			goto bail;
+		/* IB uC code past Version 1.32.17 allow suppression of wdog */
+		ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES, IB_MPREG6, 0x80,
+			0x80);
+		if (ret < 0) {
+			qib_dev_err(dd, "Failed to set WDOG disable\n");
+			goto bail;
+		}
+		qib_write_kreg(dd, kr_ibserdesctrl, rst_val);
+		/* flush write, delay for startup */
+		qib_read_kreg32(dd, kr_scratch);
+		udelay(1);
+		/* clear, then re-enable parity errs */
+		qib_sd7220_clr_ibpar(dd);
+		val = qib_read_kreg64(dd, kr_hwerrstatus);
+		if (val & QLOGIC_IB_HWE_IB_UC_MEMORYPARITYERR) {
+			qib_dev_err(dd, "IBUC Parity still set after RST\n");
+			dd->cspec->hwerrmask &=
+				~QLOGIC_IB_HWE_IB_UC_MEMORYPARITYERR;
+		}
+		qib_write_kreg(dd, kr_hwerrmask,
+			dd->cspec->hwerrmask);
+	}
+
+bail:
+	return ret;
+}
+
+static void qib_sd_trimdone_monitor(struct qib_devdata *dd,
+	const char *where)
+{
+	int ret, chn, baduns;
+	u64 val;
+
+	if (!where)
+		where = "?";
+
+	/* give time for reset to settle out in EPB */
+	udelay(2);
+
+	ret = qib_resync_ibepb(dd);
+	if (ret < 0)
+		qib_dev_err(dd, "not able to re-sync IB EPB (%s)\n", where);
+
+	/* Do "sacrificial read" to get EPB in sane state after reset */
+	ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES, IB_CTRL2(0), 0, 0);
+	if (ret < 0)
+		qib_dev_err(dd, "Failed TRIMDONE 1st read, (%s)\n", where);
+
+	/* Check/show "summary" Trim-done bit in IBCStatus */
+	val = qib_read_kreg64(dd, kr_ibcstatus);
+	if (!(val & (1ULL << 11)))
+		qib_dev_err(dd, "IBCS TRIMDONE clear (%s)\n", where);
+	/*
+	 * Do "dummy read/mod/wr" to get EPB in sane state after reset
+	 * The default value for MPREG6 is 0.
+	 */
+	udelay(2);
+
+	ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES, IB_MPREG6, 0x80, 0x80);
+	if (ret < 0)
+		qib_dev_err(dd, "Failed Dummy RMW, (%s)\n", where);
+	udelay(10);
+
+	baduns = 0;
+
+	for (chn = 3; chn >= 0; --chn) {
+		/* Read CTRL reg for each channel to check TRIMDONE */
+		ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES,
+			IB_CTRL2(chn), 0, 0);
+		if (ret < 0)
+			qib_dev_err(dd,
+				"Failed checking TRIMDONE, chn %d (%s)\n",
+				chn, where);
+
+		if (!(ret & 0x10)) {
+			int probe;
+
+			baduns |= (1 << chn);
+			qib_dev_err(dd,
+				"TRIMDONE cleared on chn %d (%02X). (%s)\n",
+				chn, ret, where);
+			probe = qib_sd7220_reg_mod(dd, IB_7220_SERDES,
+				IB_PGUDP(0), 0, 0);
+			qib_dev_err(dd, "probe is %d (%02X)\n",
+				probe, probe);
+			probe = qib_sd7220_reg_mod(dd, IB_7220_SERDES,
+				IB_CTRL2(chn), 0, 0);
+			qib_dev_err(dd, "re-read: %d (%02X)\n",
+				probe, probe);
+			ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES,
+				IB_CTRL2(chn), 0x10, 0x10);
+			if (ret < 0)
+				qib_dev_err(dd,
+					"Err on TRIMDONE rewrite1\n");
+		}
+	}
+	for (chn = 3; chn >= 0; --chn) {
+		/* Read CTRL reg for each channel to check TRIMDONE */
+		if (baduns & (1 << chn)) {
+			qib_dev_err(dd,
+				"Resetting TRIMDONE on chn %d (%s)\n",
+				chn, where);
+			ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES,
+				IB_CTRL2(chn), 0x10, 0x10);
+			if (ret < 0)
+				qib_dev_err(dd,
+					"Failed re-setting TRIMDONE, chn %d (%s)\n",
+					chn, where);
+		}
+	}
+}
+
+/*
+ * Below is portion of IBA7220-specific bringup_serdes() that actually
+ * deals with registers and memory within the SerDes itself.
+ * Post IB uC code version 1.32.17, was_reset being 1 is not really
+ * informative, so we double-check.
+ */
+int qib_sd7220_init(struct qib_devdata *dd)
+{
+	const struct firmware *fw;
+	int ret = 1; /* default to failure */
+	int first_reset, was_reset;
+
+	/* SERDES MPU reset recorded in D0 */
+	was_reset = (qib_read_kreg64(dd, kr_ibserdesctrl) & 1);
+	if (!was_reset) {
+		/* entered with reset not asserted, we need to do it */
+		qib_ibsd_reset(dd, 1);
+		qib_sd_trimdone_monitor(dd, "Driver-reload");
+	}
+
+	ret = request_firmware(&fw, SD7220_FW_NAME, &dd->pcidev->dev);
+	if (ret) {
+		qib_dev_err(dd, "Failed to load IB SERDES image\n");
+		goto done;
+	}
+
+	/* Substitute our deduced value for was_reset */
+	ret = qib_ibsd_ucode_loaded(dd->pport, fw);
+	if (ret < 0)
+		goto bail;
+
+	first_reset = !ret; /* First reset if IBSD uCode not yet loaded */
+	/*
+	 * Alter some regs per vendor latest doc, reset-defaults
+	 * are not right for IB.
+	 */
+	ret = qib_sd_early(dd);
+	if (ret < 0) {
+		qib_dev_err(dd, "Failed to set IB SERDES early defaults\n");
+		goto bail;
+	}
+	/*
+	 * Set DAC manual trim IB.
+	 * We only do this once after chip has been reset (usually
+	 * same as once per system boot).
+	 */
+	if (first_reset) {
+		ret = qib_sd_dactrim(dd);
+		if (ret < 0) {
+			qib_dev_err(dd, "Failed IB SERDES DAC trim\n");
+			goto bail;
+		}
+	}
+	/*
+	 * Set various registers (DDS and RXEQ) that will be
+	 * controlled by IBC (in 1.2 mode) to reasonable preset values
+	 * Calling the "internal" version avoids the "check for needed"
+	 * and "trimdone monitor" that might be counter-productive.
+	 */
+	ret = qib_internal_presets(dd);
+	if (ret < 0) {
+		qib_dev_err(dd, "Failed to set IB SERDES presets\n");
+		goto bail;
+	}
+	ret = qib_sd_trimself(dd, 0x80);
+	if (ret < 0) {
+		qib_dev_err(dd, "Failed to set IB SERDES TRIMSELF\n");
+		goto bail;
+	}
+
+	/* Load image, then try to verify */
+	ret = 0;        /* Assume success */
+	if (first_reset) {
+		int vfy;
+		int trim_done;
+
+		ret = qib_sd7220_ib_load(dd, fw);
+		if (ret < 0) {
+			qib_dev_err(dd, "Failed to load IB SERDES image\n");
+			goto bail;
+		} else {
+			/* Loaded image, try to verify */
+			vfy = qib_sd7220_ib_vfy(dd, fw);
+			if (vfy != ret) {
+				qib_dev_err(dd, "SERDES PRAM VFY failed\n");
+				goto bail;
+			} /* end if verified */
+		} /* end if loaded */
+
+		/*
+		 * Loaded and verified. Almost good...
+		 * hold "success" in ret
+		 */
+		ret = 0;
+		/*
+		 * Prev steps all worked, continue bringup
+		 * De-assert RESET to uC, only in first reset, to allow
+		 * trimming.
+		 *
+		 * Since our default setup sets START_EQ1 to
+		 * PRESET, we need to clear that for this very first run.
+		 */
+		ret = ibsd_mod_allchnls(dd, START_EQ1(0), 0, 0x38);
+		if (ret < 0) {
+			qib_dev_err(dd, "Failed clearing START_EQ1\n");
+			goto bail;
+		}
+
+		qib_ibsd_reset(dd, 0);
+		/*
+		 * If this is not the first reset, trimdone should be set
+		 * already. We may need to check about this.
+		 */
+		trim_done = qib_sd_trimdone_poll(dd);
+		/*
+		 * Whether or not trimdone succeeded, we need to put the
+		 * uC back into reset to avoid a possible fight with the
+		 * IBC state-machine.
+		 */
+		qib_ibsd_reset(dd, 1);
+
+		if (!trim_done) {
+			qib_dev_err(dd, "No TRIMDONE seen\n");
+			goto bail;
+		}
+		/*
+		 * DEBUG: check each time we reset if trimdone bits have
+		 * gotten cleared, and re-set them.
+		 */
+		qib_sd_trimdone_monitor(dd, "First-reset");
+		/* Remember so we do not re-do the load, dactrim, etc. */
+		dd->cspec->serdes_first_init_done = 1;
+	}
+	/*
+	 * setup for channel training and load values for
+	 * RxEq and DDS in tables used by IBC in IB1.2 mode
+	 */
+	ret = 0;
+	if (qib_sd_setvals(dd) >= 0)
+		goto done;
+bail:
+	ret = 1;
+done:
+	/* start relock timer regardless, but start at 1 second */
+	set_7220_relock_poll(dd, -1);
+
+	release_firmware(fw);
+	return ret;
+}
+
+#define EPB_ACC_REQ 1
+#define EPB_ACC_GNT 0x100
+#define EPB_DATA_MASK 0xFF
+#define EPB_RD (1ULL << 24)
+#define EPB_TRANS_RDY (1ULL << 31)
+#define EPB_TRANS_ERR (1ULL << 30)
+#define EPB_TRANS_TRIES 5
+
+/*
+ * query, claim, release ownership of the EPB (External Parallel Bus)
+ * for a specified SERDES.
+ * the "claim" parameter is >0 to claim, <0 to release, 0 to query.
+ * Returns <0 for errors, >0 if we had ownership, else 0.
+ */
+static int epb_access(struct qib_devdata *dd, int sdnum, int claim)
+{
+	u16 acc;
+	u64 accval;
+	int owned = 0;
+	u64 oct_sel = 0;
+
+	switch (sdnum) {
+	case IB_7220_SERDES:
+		/*
+		 * The IB SERDES "ownership" is fairly simple. A single each
+		 * request/grant.
+		 */
+		acc = kr_ibsd_epb_access_ctrl;
+		break;
+
+	case PCIE_SERDES0:
+	case PCIE_SERDES1:
+		/* PCIe SERDES has two "octants", need to select which */
+		acc = kr_pciesd_epb_access_ctrl;
+		oct_sel = (2 << (sdnum - PCIE_SERDES0));
+		break;
+
+	default:
+		return 0;
+	}
+
+	/* Make sure any outstanding transaction was seen */
+	qib_read_kreg32(dd, kr_scratch);
+	udelay(15);
+
+	accval = qib_read_kreg32(dd, acc);
+
+	owned = !!(accval & EPB_ACC_GNT);
+	if (claim < 0) {
+		/* Need to release */
+		u64 pollval;
+		/*
+		 * The only writeable bits are the request and CS.
+		 * Both should be clear
+		 */
+		u64 newval = 0;
+		qib_write_kreg(dd, acc, newval);
+		/* First read after write is not trustworthy */
+		pollval = qib_read_kreg32(dd, acc);
+		udelay(5);
+		pollval = qib_read_kreg32(dd, acc);
+		if (pollval & EPB_ACC_GNT)
+			owned = -1;
+	} else if (claim > 0) {
+		/* Need to claim */
+		u64 pollval;
+		u64 newval = EPB_ACC_REQ | oct_sel;
+		qib_write_kreg(dd, acc, newval);
+		/* First read after write is not trustworthy */
+		pollval = qib_read_kreg32(dd, acc);
+		udelay(5);
+		pollval = qib_read_kreg32(dd, acc);
+		if (!(pollval & EPB_ACC_GNT))
+			owned = -1;
+	}
+	return owned;
+}
+
+/*
+ * Lemma to deal with race condition of write..read to epb regs
+ */
+static int epb_trans(struct qib_devdata *dd, u16 reg, u64 i_val, u64 *o_vp)
+{
+	int tries;
+	u64 transval;
+
+	qib_write_kreg(dd, reg, i_val);
+	/* Throw away first read, as RDY bit may be stale */
+	transval = qib_read_kreg64(dd, reg);
+
+	for (tries = EPB_TRANS_TRIES; tries; --tries) {
+		transval = qib_read_kreg32(dd, reg);
+		if (transval & EPB_TRANS_RDY)
+			break;
+		udelay(5);
+	}
+	if (transval & EPB_TRANS_ERR)
+		return -1;
+	if (tries > 0 && o_vp)
+		*o_vp = transval;
+	return tries;
+}
+
+/**
+ * qib_sd7220_reg_mod - modify SERDES register
+ * @dd: the qlogic_ib device
+ * @sdnum: which SERDES to access
+ * @loc: location - channel, element, register, as packed by EPB_LOC() macro.
+ * @wd: Write Data - value to set in register
+ * @mask: ones where data should be spliced into reg.
+ *
+ * Basic register read/modify/write, with un-needed acesses elided. That is,
+ * a mask of zero will prevent write, while a mask of 0xFF will prevent read.
+ * returns current (presumed, if a write was done) contents of selected
+ * register, or <0 if errors.
+ */
+static int qib_sd7220_reg_mod(struct qib_devdata *dd, int sdnum, u32 loc,
+			      u32 wd, u32 mask)
+{
+	u16 trans;
+	u64 transval;
+	int owned;
+	int tries, ret;
+	unsigned long flags;
+
+	switch (sdnum) {
+	case IB_7220_SERDES:
+		trans = kr_ibsd_epb_transaction_reg;
+		break;
+
+	case PCIE_SERDES0:
+	case PCIE_SERDES1:
+		trans = kr_pciesd_epb_transaction_reg;
+		break;
+
+	default:
+		return -1;
+	}
+
+	/*
+	 * All access is locked in software (vs other host threads) and
+	 * hardware (vs uC access).
+	 */
+	spin_lock_irqsave(&dd->cspec->sdepb_lock, flags);
+
+	owned = epb_access(dd, sdnum, 1);
+	if (owned < 0) {
+		spin_unlock_irqrestore(&dd->cspec->sdepb_lock, flags);
+		return -1;
+	}
+	ret = 0;
+	for (tries = EPB_TRANS_TRIES; tries; --tries) {
+		transval = qib_read_kreg32(dd, trans);
+		if (transval & EPB_TRANS_RDY)
+			break;
+		udelay(5);
+	}
+
+	if (tries > 0) {
+		tries = 1;      /* to make read-skip work */
+		if (mask != 0xFF) {
+			/*
+			 * Not a pure write, so need to read.
+			 * loc encodes chip-select as well as address
+			 */
+			transval = loc | EPB_RD;
+			tries = epb_trans(dd, trans, transval, &transval);
+		}
+		if (tries > 0 && mask != 0) {
+			/*
+			 * Not a pure read, so need to write.
+			 */
+			wd = (wd & mask) | (transval & ~mask);
+			transval = loc | (wd & EPB_DATA_MASK);
+			tries = epb_trans(dd, trans, transval, &transval);
+		}
+	}
+	/* else, failed to see ready, what error-handling? */
+
+	/*
+	 * Release bus. Failure is an error.
+	 */
+	if (epb_access(dd, sdnum, -1) < 0)
+		ret = -1;
+	else
+		ret = transval & EPB_DATA_MASK;
+
+	spin_unlock_irqrestore(&dd->cspec->sdepb_lock, flags);
+	if (tries <= 0)
+		ret = -1;
+	return ret;
+}
+
+#define EPB_ROM_R (2)
+#define EPB_ROM_W (1)
+/*
+ * Below, all uC-related, use appropriate UC_CS, depending
+ * on which SerDes is used.
+ */
+#define EPB_UC_CTL EPB_LOC(6, 0, 0)
+#define EPB_MADDRL EPB_LOC(6, 0, 2)
+#define EPB_MADDRH EPB_LOC(6, 0, 3)
+#define EPB_ROMDATA EPB_LOC(6, 0, 4)
+#define EPB_RAMDATA EPB_LOC(6, 0, 5)
+
+/* Transfer date to/from uC Program RAM of IB or PCIe SerDes */
+static int qib_sd7220_ram_xfer(struct qib_devdata *dd, int sdnum, u32 loc,
+			       u8 *buf, int cnt, int rd_notwr)
+{
+	u16 trans;
+	u64 transval;
+	u64 csbit;
+	int owned;
+	int tries;
+	int sofar;
+	int addr;
+	int ret;
+	unsigned long flags;
+	const char *op;
+
+	/* Pick appropriate transaction reg and "Chip select" for this serdes */
+	switch (sdnum) {
+	case IB_7220_SERDES:
+		csbit = 1ULL << EPB_IB_UC_CS_SHF;
+		trans = kr_ibsd_epb_transaction_reg;
+		break;
+
+	case PCIE_SERDES0:
+	case PCIE_SERDES1:
+		/* PCIe SERDES has uC "chip select" in different bit, too */
+		csbit = 1ULL << EPB_PCIE_UC_CS_SHF;
+		trans = kr_pciesd_epb_transaction_reg;
+		break;
+
+	default:
+		return -1;
+	}
+
+	op = rd_notwr ? "Rd" : "Wr";
+	spin_lock_irqsave(&dd->cspec->sdepb_lock, flags);
+
+	owned = epb_access(dd, sdnum, 1);
+	if (owned < 0) {
+		spin_unlock_irqrestore(&dd->cspec->sdepb_lock, flags);
+		return -1;
+	}
+
+	/*
+	 * In future code, we may need to distinguish several address ranges,
+	 * and select various memories based on this. For now, just trim
+	 * "loc" (location including address and memory select) to
+	 * "addr" (address within memory). we will only support PRAM
+	 * The memory is 8KB.
+	 */
+	addr = loc & 0x1FFF;
+	for (tries = EPB_TRANS_TRIES; tries; --tries) {
+		transval = qib_read_kreg32(dd, trans);
+		if (transval & EPB_TRANS_RDY)
+			break;
+		udelay(5);
+	}
+
+	sofar = 0;
+	if (tries > 0) {
+		/*
+		 * Every "memory" access is doubly-indirect.
+		 * We set two bytes of address, then read/write
+		 * one or mores bytes of data.
+		 */
+
+		/* First, we set control to "Read" or "Write" */
+		transval = csbit | EPB_UC_CTL |
+			(rd_notwr ? EPB_ROM_R : EPB_ROM_W);
+		tries = epb_trans(dd, trans, transval, &transval);
+		while (tries > 0 && sofar < cnt) {
+			if (!sofar) {
+				/* Only set address at start of chunk */
+				int addrbyte = (addr + sofar) >> 8;
+				transval = csbit | EPB_MADDRH | addrbyte;
+				tries = epb_trans(dd, trans, transval,
+						  &transval);
+				if (tries <= 0)
+					break;
+				addrbyte = (addr + sofar) & 0xFF;
+				transval = csbit | EPB_MADDRL | addrbyte;
+				tries = epb_trans(dd, trans, transval,
+						 &transval);
+				if (tries <= 0)
+					break;
+			}
+
+			if (rd_notwr)
+				transval = csbit | EPB_ROMDATA | EPB_RD;
+			else
+				transval = csbit | EPB_ROMDATA | buf[sofar];
+			tries = epb_trans(dd, trans, transval, &transval);
+			if (tries <= 0)
+				break;
+			if (rd_notwr)
+				buf[sofar] = transval & EPB_DATA_MASK;
+			++sofar;
+		}
+		/* Finally, clear control-bit for Read or Write */
+		transval = csbit | EPB_UC_CTL;
+		tries = epb_trans(dd, trans, transval, &transval);
+	}
+
+	ret = sofar;
+	/* Release bus. Failure is an error */
+	if (epb_access(dd, sdnum, -1) < 0)
+		ret = -1;
+
+	spin_unlock_irqrestore(&dd->cspec->sdepb_lock, flags);
+	if (tries <= 0)
+		ret = -1;
+	return ret;
+}
+
+#define PROG_CHUNK 64
+
+static int qib_sd7220_prog_ld(struct qib_devdata *dd, int sdnum,
+			      const u8 *img, int len, int offset)
+{
+	int cnt, sofar, req;
+
+	sofar = 0;
+	while (sofar < len) {
+		req = len - sofar;
+		if (req > PROG_CHUNK)
+			req = PROG_CHUNK;
+		cnt = qib_sd7220_ram_xfer(dd, sdnum, offset + sofar,
+					  (u8 *)img + sofar, req, 0);
+		if (cnt < req) {
+			sofar = -1;
+			break;
+		}
+		sofar += req;
+	}
+	return sofar;
+}
+
+#define VFY_CHUNK 64
+#define SD_PRAM_ERROR_LIMIT 42
+
+static int qib_sd7220_prog_vfy(struct qib_devdata *dd, int sdnum,
+			       const u8 *img, int len, int offset)
+{
+	int cnt, sofar, req, idx, errors;
+	unsigned char readback[VFY_CHUNK];
+
+	errors = 0;
+	sofar = 0;
+	while (sofar < len) {
+		req = len - sofar;
+		if (req > VFY_CHUNK)
+			req = VFY_CHUNK;
+		cnt = qib_sd7220_ram_xfer(dd, sdnum, sofar + offset,
+					  readback, req, 1);
+		if (cnt < req) {
+			/* failed in read itself */
+			sofar = -1;
+			break;
+		}
+		for (idx = 0; idx < cnt; ++idx) {
+			if (readback[idx] != img[idx+sofar])
+				++errors;
+		}
+		sofar += cnt;
+	}
+	return errors ? -errors : sofar;
+}
+
+static int
+qib_sd7220_ib_load(struct qib_devdata *dd, const struct firmware *fw)
+{
+	return qib_sd7220_prog_ld(dd, IB_7220_SERDES, fw->data, fw->size, 0);
+}
+
+static int
+qib_sd7220_ib_vfy(struct qib_devdata *dd, const struct firmware *fw)
+{
+	return qib_sd7220_prog_vfy(dd, IB_7220_SERDES, fw->data, fw->size, 0);
+}
+
+/*
+ * IRQ not set up at this point in init, so we poll.
+ */
+#define IB_SERDES_TRIM_DONE (1ULL << 11)
+#define TRIM_TMO (30)
+
+static int qib_sd_trimdone_poll(struct qib_devdata *dd)
+{
+	int trim_tmo, ret;
+	uint64_t val;
+
+	/*
+	 * Default to failure, so IBC will not start
+	 * without IB_SERDES_TRIM_DONE.
+	 */
+	ret = 0;
+	for (trim_tmo = 0; trim_tmo < TRIM_TMO; ++trim_tmo) {
+		val = qib_read_kreg64(dd, kr_ibcstatus);
+		if (val & IB_SERDES_TRIM_DONE) {
+			ret = 1;
+			break;
+		}
+		msleep(10);
+	}
+	if (trim_tmo >= TRIM_TMO) {
+		qib_dev_err(dd, "No TRIMDONE in %d tries\n", trim_tmo);
+		ret = 0;
+	}
+	return ret;
+}
+
+#define TX_FAST_ELT (9)
+
+/*
+ * Set the "negotiation" values for SERDES. These are used by the IB1.2
+ * link negotiation. Macros below are attempt to keep the values a
+ * little more human-editable.
+ * First, values related to Drive De-emphasis Settings.
+ */
+
+#define NUM_DDS_REGS 6
+#define DDS_REG_MAP 0x76A910 /* LSB-first list of regs (in elt 9) to mod */
+
+#define DDS_VAL(amp_d, main_d, ipst_d, ipre_d, amp_s, main_s, ipst_s, ipre_s) \
+	{ { ((amp_d & 0x1F) << 1) | 1, ((amp_s & 0x1F) << 1) | 1, \
+	  (main_d << 3) | 4 | (ipre_d >> 2), \
+	  (main_s << 3) | 4 | (ipre_s >> 2), \
+	  ((ipst_d & 0xF) << 1) | ((ipre_d & 3) << 6) | 0x21, \
+	  ((ipst_s & 0xF) << 1) | ((ipre_s & 3) << 6) | 0x21 } }
+
+static struct dds_init {
+	uint8_t reg_vals[NUM_DDS_REGS];
+} dds_init_vals[] = {
+	/*       DDR(FDR)       SDR(HDR)   */
+	/* Vendor recommends below for 3m cable */
+#define DDS_3M 0
+	DDS_VAL(31, 19, 12, 0, 29, 22,  9, 0),
+	DDS_VAL(31, 12, 15, 4, 31, 15, 15, 1),
+	DDS_VAL(31, 13, 15, 3, 31, 16, 15, 0),
+	DDS_VAL(31, 14, 15, 2, 31, 17, 14, 0),
+	DDS_VAL(31, 15, 15, 1, 31, 18, 13, 0),
+	DDS_VAL(31, 16, 15, 0, 31, 19, 12, 0),
+	DDS_VAL(31, 17, 14, 0, 31, 20, 11, 0),
+	DDS_VAL(31, 18, 13, 0, 30, 21, 10, 0),
+	DDS_VAL(31, 20, 11, 0, 28, 23,  8, 0),
+	DDS_VAL(31, 21, 10, 0, 27, 24,  7, 0),
+	DDS_VAL(31, 22,  9, 0, 26, 25,  6, 0),
+	DDS_VAL(30, 23,  8, 0, 25, 26,  5, 0),
+	DDS_VAL(29, 24,  7, 0, 23, 27,  4, 0),
+	/* Vendor recommends below for 1m cable */
+#define DDS_1M 13
+	DDS_VAL(28, 25,  6, 0, 21, 28,  3, 0),
+	DDS_VAL(27, 26,  5, 0, 19, 29,  2, 0),
+	DDS_VAL(25, 27,  4, 0, 17, 30,  1, 0)
+};
+
+/*
+ * Now the RXEQ section of the table.
+ */
+/* Hardware packs an element number and register address thus: */
+#define RXEQ_INIT_RDESC(elt, addr) (((elt) & 0xF) | ((addr) << 4))
+#define RXEQ_VAL(elt, adr, val0, val1, val2, val3) \
+	{RXEQ_INIT_RDESC((elt), (adr)), {(val0), (val1), (val2), (val3)} }
+
+#define RXEQ_VAL_ALL(elt, adr, val)  \
+	{RXEQ_INIT_RDESC((elt), (adr)), {(val), (val), (val), (val)} }
+
+#define RXEQ_SDR_DFELTH 0
+#define RXEQ_SDR_TLTH 0
+#define RXEQ_SDR_G1CNT_Z1CNT 0x11
+#define RXEQ_SDR_ZCNT 23
+
+static struct rxeq_init {
+	u16 rdesc;      /* in form used in SerDesDDSRXEQ */
+	u8  rdata[4];
+} rxeq_init_vals[] = {
+	/* Set Rcv Eq. to Preset node */
+	RXEQ_VAL_ALL(7, 0x27, 0x10),
+	/* Set DFELTHFDR/HDR thresholds */
+	RXEQ_VAL(7, 8,    0, 0, 0, 0), /* FDR, was 0, 1, 2, 3 */
+	RXEQ_VAL(7, 0x21, 0, 0, 0, 0), /* HDR */
+	/* Set TLTHFDR/HDR theshold */
+	RXEQ_VAL(7, 9,    2, 2, 2, 2), /* FDR, was 0, 2, 4, 6 */
+	RXEQ_VAL(7, 0x23, 2, 2, 2, 2), /* HDR, was  0, 1, 2, 3 */
+	/* Set Preamp setting 2 (ZFR/ZCNT) */
+	RXEQ_VAL(7, 0x1B, 12, 12, 12, 12), /* FDR, was 12, 16, 20, 24 */
+	RXEQ_VAL(7, 0x1C, 12, 12, 12, 12), /* HDR, was 12, 16, 20, 24 */
+	/* Set Preamp DC gain and Setting 1 (GFR/GHR) */
+	RXEQ_VAL(7, 0x1E, 16, 16, 16, 16), /* FDR, was 16, 17, 18, 20 */
+	RXEQ_VAL(7, 0x1F, 16, 16, 16, 16), /* HDR, was 16, 17, 18, 20 */
+	/* Toggle RELOCK (in VCDL_CTRL0) to lock to data */
+	RXEQ_VAL_ALL(6, 6, 0x20), /* Set D5 High */
+	RXEQ_VAL_ALL(6, 6, 0), /* Set D5 Low */
+};
+
+/* There are 17 values from vendor, but IBC only accesses the first 16 */
+#define DDS_ROWS (16)
+#define RXEQ_ROWS ARRAY_SIZE(rxeq_init_vals)
+
+static int qib_sd_setvals(struct qib_devdata *dd)
+{
+	int idx, midx;
+	int min_idx;     /* Minimum index for this portion of table */
+	uint32_t dds_reg_map;
+	u64 __iomem *taddr, *iaddr;
+	uint64_t data;
+	uint64_t sdctl;
+
+	taddr = dd->kregbase + kr_serdes_maptable;
+	iaddr = dd->kregbase + kr_serdes_ddsrxeq0;
+
+	/*
+	 * Init the DDS section of the table.
+	 * Each "row" of the table provokes NUM_DDS_REG writes, to the
+	 * registers indicated in DDS_REG_MAP.
+	 */
+	sdctl = qib_read_kreg64(dd, kr_ibserdesctrl);
+	sdctl = (sdctl & ~(0x1f << 8)) | (NUM_DDS_REGS << 8);
+	sdctl = (sdctl & ~(0x1f << 13)) | (RXEQ_ROWS << 13);
+	qib_write_kreg(dd, kr_ibserdesctrl, sdctl);
+
+	/*
+	 * Iterate down table within loop for each register to store.
+	 */
+	dds_reg_map = DDS_REG_MAP;
+	for (idx = 0; idx < NUM_DDS_REGS; ++idx) {
+		data = ((dds_reg_map & 0xF) << 4) | TX_FAST_ELT;
+		writeq(data, iaddr + idx);
+		mmiowb();
+		qib_read_kreg32(dd, kr_scratch);
+		dds_reg_map >>= 4;
+		for (midx = 0; midx < DDS_ROWS; ++midx) {
+			u64 __iomem *daddr = taddr + ((midx << 4) + idx);
+			data = dds_init_vals[midx].reg_vals[idx];
+			writeq(data, daddr);
+			mmiowb();
+			qib_read_kreg32(dd, kr_scratch);
+		} /* End inner for (vals for this reg, each row) */
+	} /* end outer for (regs to be stored) */
+
+	/*
+	 * Init the RXEQ section of the table.
+	 * This runs in a different order, as the pattern of
+	 * register references is more complex, but there are only
+	 * four "data" values per register.
+	 */
+	min_idx = idx; /* RXEQ indices pick up where DDS left off */
+	taddr += 0x100; /* RXEQ data is in second half of table */
+	/* Iterate through RXEQ register addresses */
+	for (idx = 0; idx < RXEQ_ROWS; ++idx) {
+		int didx; /* "destination" */
+		int vidx;
+
+		/* didx is offset by min_idx to address RXEQ range of regs */
+		didx = idx + min_idx;
+		/* Store the next RXEQ register address */
+		writeq(rxeq_init_vals[idx].rdesc, iaddr + didx);
+		mmiowb();
+		qib_read_kreg32(dd, kr_scratch);
+		/* Iterate through RXEQ values */
+		for (vidx = 0; vidx < 4; vidx++) {
+			data = rxeq_init_vals[idx].rdata[vidx];
+			writeq(data, taddr + (vidx << 6) + idx);
+			mmiowb();
+			qib_read_kreg32(dd, kr_scratch);
+		}
+	} /* end outer for (Reg-writes for RXEQ) */
+	return 0;
+}
+
+#define CMUCTRL5 EPB_LOC(7, 0, 0x15)
+#define RXHSCTRL0(chan) EPB_LOC(chan, 6, 0)
+#define VCDL_DAC2(chan) EPB_LOC(chan, 6, 5)
+#define VCDL_CTRL0(chan) EPB_LOC(chan, 6, 6)
+#define VCDL_CTRL2(chan) EPB_LOC(chan, 6, 8)
+#define START_EQ2(chan) EPB_LOC(chan, 7, 0x28)
+
+/*
+ * Repeat a "store" across all channels of the IB SerDes.
+ * Although nominally it inherits the "read value" of the last
+ * channel it modified, the only really useful return is <0 for
+ * failure, >= 0 for success. The parameter 'loc' is assumed to
+ * be the location in some channel of the register to be modified
+ * The caller can specify use of the "gang write" option of EPB,
+ * in which case we use the specified channel data for any fields
+ * not explicitely written.
+ */
+static int ibsd_mod_allchnls(struct qib_devdata *dd, int loc, int val,
+			     int mask)
+{
+	int ret = -1;
+	int chnl;
+
+	if (loc & EPB_GLOBAL_WR) {
+		/*
+		 * Our caller has assured us that we can set all four
+		 * channels at once. Trust that. If mask is not 0xFF,
+		 * we will read the _specified_ channel for our starting
+		 * value.
+		 */
+		loc |= (1U << EPB_IB_QUAD0_CS_SHF);
+		chnl = (loc >> (4 + EPB_ADDR_SHF)) & 7;
+		if (mask != 0xFF) {
+			ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES,
+						 loc & ~EPB_GLOBAL_WR, 0, 0);
+			if (ret < 0) {
+				int sloc = loc >> EPB_ADDR_SHF;
+
+				qib_dev_err(dd,
+					"pre-read failed: elt %d, addr 0x%X, chnl %d\n",
+					(sloc & 0xF),
+					(sloc >> 9) & 0x3f, chnl);
+				return ret;
+			}
+			val = (ret & ~mask) | (val & mask);
+		}
+		loc &=  ~(7 << (4+EPB_ADDR_SHF));
+		ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES, loc, val, 0xFF);
+		if (ret < 0) {
+			int sloc = loc >> EPB_ADDR_SHF;
+
+			qib_dev_err(dd,
+				"Global WR failed: elt %d, addr 0x%X, val %02X\n",
+				(sloc & 0xF), (sloc >> 9) & 0x3f, val);
+		}
+		return ret;
+	}
+	/* Clear "channel" and set CS so we can simply iterate */
+	loc &=  ~(7 << (4+EPB_ADDR_SHF));
+	loc |= (1U << EPB_IB_QUAD0_CS_SHF);
+	for (chnl = 0; chnl < 4; ++chnl) {
+		int cloc = loc | (chnl << (4+EPB_ADDR_SHF));
+
+		ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES, cloc, val, mask);
+		if (ret < 0) {
+			int sloc = loc >> EPB_ADDR_SHF;
+
+			qib_dev_err(dd,
+				"Write failed: elt %d, addr 0x%X, chnl %d, val 0x%02X, mask 0x%02X\n",
+				(sloc & 0xF), (sloc >> 9) & 0x3f, chnl,
+				val & 0xFF, mask & 0xFF);
+			break;
+		}
+	}
+	return ret;
+}
+
+/*
+ * Set the Tx values normally modified by IBC in IB1.2 mode to default
+ * values, as gotten from first row of init table.
+ */
+static int set_dds_vals(struct qib_devdata *dd, struct dds_init *ddi)
+{
+	int ret;
+	int idx, reg, data;
+	uint32_t regmap;
+
+	regmap = DDS_REG_MAP;
+	for (idx = 0; idx < NUM_DDS_REGS; ++idx) {
+		reg = (regmap & 0xF);
+		regmap >>= 4;
+		data = ddi->reg_vals[idx];
+		/* Vendor says RMW not needed for these regs, use 0xFF mask */
+		ret = ibsd_mod_allchnls(dd, EPB_LOC(0, 9, reg), data, 0xFF);
+		if (ret < 0)
+			break;
+	}
+	return ret;
+}
+
+/*
+ * Set the Rx values normally modified by IBC in IB1.2 mode to default
+ * values, as gotten from selected column of init table.
+ */
+static int set_rxeq_vals(struct qib_devdata *dd, int vsel)
+{
+	int ret;
+	int ridx;
+	int cnt = ARRAY_SIZE(rxeq_init_vals);
+
+	for (ridx = 0; ridx < cnt; ++ridx) {
+		int elt, reg, val, loc;
+
+		elt = rxeq_init_vals[ridx].rdesc & 0xF;
+		reg = rxeq_init_vals[ridx].rdesc >> 4;
+		loc = EPB_LOC(0, elt, reg);
+		val = rxeq_init_vals[ridx].rdata[vsel];
+		/* mask of 0xFF, because hardware does full-byte store. */
+		ret = ibsd_mod_allchnls(dd, loc, val, 0xFF);
+		if (ret < 0)
+			break;
+	}
+	return ret;
+}
+
+/*
+ * Set the default values (row 0) for DDR Driver Demphasis.
+ * we do this initially and whenever we turn off IB-1.2
+ *
+ * The "default" values for Rx equalization are also stored to
+ * SerDes registers. Formerly (and still default), we used set 2.
+ * For experimenting with cables and link-partners, we allow changing
+ * that via a module parameter.
+ */
+static unsigned qib_rxeq_set = 2;
+module_param_named(rxeq_default_set, qib_rxeq_set, uint,
+		   S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(rxeq_default_set,
+		 "Which set [0..3] of Rx Equalization values is default");
+
+static int qib_internal_presets(struct qib_devdata *dd)
+{
+	int ret = 0;
+
+	ret = set_dds_vals(dd, dds_init_vals + DDS_3M);
+
+	if (ret < 0)
+		qib_dev_err(dd, "Failed to set default DDS values\n");
+	ret = set_rxeq_vals(dd, qib_rxeq_set & 3);
+	if (ret < 0)
+		qib_dev_err(dd, "Failed to set default RXEQ values\n");
+	return ret;
+}
+
+int qib_sd7220_presets(struct qib_devdata *dd)
+{
+	int ret = 0;
+
+	if (!dd->cspec->presets_needed)
+		return ret;
+	dd->cspec->presets_needed = 0;
+	/* Assert uC reset, so we don't clash with it. */
+	qib_ibsd_reset(dd, 1);
+	udelay(2);
+	qib_sd_trimdone_monitor(dd, "link-down");
+
+	ret = qib_internal_presets(dd);
+	return ret;
+}
+
+static int qib_sd_trimself(struct qib_devdata *dd, int val)
+{
+	int loc = CMUCTRL5 | (1U << EPB_IB_QUAD0_CS_SHF);
+
+	return qib_sd7220_reg_mod(dd, IB_7220_SERDES, loc, val, 0xFF);
+}
+
+static int qib_sd_early(struct qib_devdata *dd)
+{
+	int ret;
+
+	ret = ibsd_mod_allchnls(dd, RXHSCTRL0(0) | EPB_GLOBAL_WR, 0xD4, 0xFF);
+	if (ret < 0)
+		goto bail;
+	ret = ibsd_mod_allchnls(dd, START_EQ1(0) | EPB_GLOBAL_WR, 0x10, 0xFF);
+	if (ret < 0)
+		goto bail;
+	ret = ibsd_mod_allchnls(dd, START_EQ2(0) | EPB_GLOBAL_WR, 0x30, 0xFF);
+bail:
+	return ret;
+}
+
+#define BACTRL(chnl) EPB_LOC(chnl, 6, 0x0E)
+#define LDOUTCTRL1(chnl) EPB_LOC(chnl, 7, 6)
+#define RXHSSTATUS(chnl) EPB_LOC(chnl, 6, 0xF)
+
+static int qib_sd_dactrim(struct qib_devdata *dd)
+{
+	int ret;
+
+	ret = ibsd_mod_allchnls(dd, VCDL_DAC2(0) | EPB_GLOBAL_WR, 0x2D, 0xFF);
+	if (ret < 0)
+		goto bail;
+
+	/* more fine-tuning of what will be default */
+	ret = ibsd_mod_allchnls(dd, VCDL_CTRL2(0), 3, 0xF);
+	if (ret < 0)
+		goto bail;
+
+	ret = ibsd_mod_allchnls(dd, BACTRL(0) | EPB_GLOBAL_WR, 0x40, 0xFF);
+	if (ret < 0)
+		goto bail;
+
+	ret = ibsd_mod_allchnls(dd, LDOUTCTRL1(0) | EPB_GLOBAL_WR, 0x04, 0xFF);
+	if (ret < 0)
+		goto bail;
+
+	ret = ibsd_mod_allchnls(dd, RXHSSTATUS(0) | EPB_GLOBAL_WR, 0x04, 0xFF);
+	if (ret < 0)
+		goto bail;
+
+	/*
+	 * Delay for max possible number of steps, with slop.
+	 * Each step is about 4usec.
+	 */
+	udelay(415);
+
+	ret = ibsd_mod_allchnls(dd, LDOUTCTRL1(0) | EPB_GLOBAL_WR, 0x00, 0xFF);
+
+bail:
+	return ret;
+}
+
+#define RELOCK_FIRST_MS 3
+#define RXLSPPM(chan) EPB_LOC(chan, 0, 2)
+void toggle_7220_rclkrls(struct qib_devdata *dd)
+{
+	int loc = RXLSPPM(0) | EPB_GLOBAL_WR;
+	int ret;
+
+	ret = ibsd_mod_allchnls(dd, loc, 0, 0x80);
+	if (ret < 0)
+		qib_dev_err(dd, "RCLKRLS failed to clear D7\n");
+	else {
+		udelay(1);
+		ibsd_mod_allchnls(dd, loc, 0x80, 0x80);
+	}
+	/* And again for good measure */
+	udelay(1);
+	ret = ibsd_mod_allchnls(dd, loc, 0, 0x80);
+	if (ret < 0)
+		qib_dev_err(dd, "RCLKRLS failed to clear D7\n");
+	else {
+		udelay(1);
+		ibsd_mod_allchnls(dd, loc, 0x80, 0x80);
+	}
+	/* Now reset xgxs and IBC to complete the recovery */
+	dd->f_xgxs_reset(dd->pport);
+}
+
+/*
+ * Shut down the timer that polls for relock occasions, if needed
+ * this is "hooked" from qib_7220_quiet_serdes(), which is called
+ * just before qib_shutdown_device() in qib_driver.c shuts down all
+ * the other timers
+ */
+void shutdown_7220_relock_poll(struct qib_devdata *dd)
+{
+	if (dd->cspec->relock_timer_active)
+		del_timer_sync(&dd->cspec->relock_timer);
+}
+
+static unsigned qib_relock_by_timer = 1;
+module_param_named(relock_by_timer, qib_relock_by_timer, uint,
+		   S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(relock_by_timer, "Allow relock attempt if link not up");
+
+static void qib_run_relock(unsigned long opaque)
+{
+	struct qib_devdata *dd = (struct qib_devdata *)opaque;
+	struct qib_pportdata *ppd = dd->pport;
+	struct qib_chip_specific *cs = dd->cspec;
+	int timeoff;
+
+	/*
+	 * Check link-training state for "stuck" state, when down.
+	 * if found, try relock and schedule another try at
+	 * exponentially growing delay, maxed at one second.
+	 * if not stuck, our work is done.
+	 */
+	if ((dd->flags & QIB_INITTED) && !(ppd->lflags &
+	    (QIBL_IB_AUTONEG_INPROG | QIBL_LINKINIT | QIBL_LINKARMED |
+	     QIBL_LINKACTIVE))) {
+		if (qib_relock_by_timer) {
+			if (!(ppd->lflags & QIBL_IB_LINK_DISABLED))
+				toggle_7220_rclkrls(dd);
+		}
+		/* re-set timer for next check */
+		timeoff = cs->relock_interval << 1;
+		if (timeoff > HZ)
+			timeoff = HZ;
+		cs->relock_interval = timeoff;
+	} else
+		timeoff = HZ;
+	mod_timer(&cs->relock_timer, jiffies + timeoff);
+}
+
+void set_7220_relock_poll(struct qib_devdata *dd, int ibup)
+{
+	struct qib_chip_specific *cs = dd->cspec;
+
+	if (ibup) {
+		/* We are now up, relax timer to 1 second interval */
+		if (cs->relock_timer_active) {
+			cs->relock_interval = HZ;
+			mod_timer(&cs->relock_timer, jiffies + HZ);
+		}
+	} else {
+		/* Transition to down, (re-)set timer to short interval. */
+		unsigned int timeout;
+
+		timeout = msecs_to_jiffies(RELOCK_FIRST_MS);
+		if (timeout == 0)
+			timeout = 1;
+		/* If timer has not yet been started, do so. */
+		if (!cs->relock_timer_active) {
+			cs->relock_timer_active = 1;
+			init_timer(&cs->relock_timer);
+			cs->relock_timer.function = qib_run_relock;
+			cs->relock_timer.data = (unsigned long) dd;
+			cs->relock_interval = timeout;
+			cs->relock_timer.expires = jiffies + timeout;
+			add_timer(&cs->relock_timer);
+		} else {
+			cs->relock_interval = timeout;
+			mod_timer(&cs->relock_timer, jiffies + timeout);
+		}
+	}
+}
diff --git a/drivers/infiniband/hw/qib/qib_sdma.c b/drivers/infiniband/hw/qib/qib_sdma.c
new file mode 100644
index 0000000..c6d6a54
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_sdma.c
@@ -0,0 +1,1039 @@
+/*
+ * Copyright (c) 2012 Intel Corporation. All rights reserved.
+ * Copyright (c) 2007 - 2012 QLogic Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/spinlock.h>
+#include <linux/netdevice.h>
+#include <linux/moduleparam.h>
+
+#include "qib.h"
+#include "qib_common.h"
+
+/* default pio off, sdma on */
+static ushort sdma_descq_cnt = 256;
+module_param_named(sdma_descq_cnt, sdma_descq_cnt, ushort, S_IRUGO);
+MODULE_PARM_DESC(sdma_descq_cnt, "Number of SDMA descq entries");
+
+/*
+ * Bits defined in the send DMA descriptor.
+ */
+#define SDMA_DESC_LAST          (1ULL << 11)
+#define SDMA_DESC_FIRST         (1ULL << 12)
+#define SDMA_DESC_DMA_HEAD      (1ULL << 13)
+#define SDMA_DESC_USE_LARGE_BUF (1ULL << 14)
+#define SDMA_DESC_INTR          (1ULL << 15)
+#define SDMA_DESC_COUNT_LSB     16
+#define SDMA_DESC_GEN_LSB       30
+
+char *qib_sdma_state_names[] = {
+	[qib_sdma_state_s00_hw_down]          = "s00_HwDown",
+	[qib_sdma_state_s10_hw_start_up_wait] = "s10_HwStartUpWait",
+	[qib_sdma_state_s20_idle]             = "s20_Idle",
+	[qib_sdma_state_s30_sw_clean_up_wait] = "s30_SwCleanUpWait",
+	[qib_sdma_state_s40_hw_clean_up_wait] = "s40_HwCleanUpWait",
+	[qib_sdma_state_s50_hw_halt_wait]     = "s50_HwHaltWait",
+	[qib_sdma_state_s99_running]          = "s99_Running",
+};
+
+char *qib_sdma_event_names[] = {
+	[qib_sdma_event_e00_go_hw_down]   = "e00_GoHwDown",
+	[qib_sdma_event_e10_go_hw_start]  = "e10_GoHwStart",
+	[qib_sdma_event_e20_hw_started]   = "e20_HwStarted",
+	[qib_sdma_event_e30_go_running]   = "e30_GoRunning",
+	[qib_sdma_event_e40_sw_cleaned]   = "e40_SwCleaned",
+	[qib_sdma_event_e50_hw_cleaned]   = "e50_HwCleaned",
+	[qib_sdma_event_e60_hw_halted]    = "e60_HwHalted",
+	[qib_sdma_event_e70_go_idle]      = "e70_GoIdle",
+	[qib_sdma_event_e7220_err_halted] = "e7220_ErrHalted",
+	[qib_sdma_event_e7322_err_halted] = "e7322_ErrHalted",
+	[qib_sdma_event_e90_timer_tick]   = "e90_TimerTick",
+};
+
+/* declare all statics here rather than keep sorting */
+static int alloc_sdma(struct qib_pportdata *);
+static void sdma_complete(struct kref *);
+static void sdma_finalput(struct qib_sdma_state *);
+static void sdma_get(struct qib_sdma_state *);
+static void sdma_put(struct qib_sdma_state *);
+static void sdma_set_state(struct qib_pportdata *, enum qib_sdma_states);
+static void sdma_start_sw_clean_up(struct qib_pportdata *);
+static void sdma_sw_clean_up_task(unsigned long);
+static void unmap_desc(struct qib_pportdata *, unsigned);
+
+static void sdma_get(struct qib_sdma_state *ss)
+{
+	kref_get(&ss->kref);
+}
+
+static void sdma_complete(struct kref *kref)
+{
+	struct qib_sdma_state *ss =
+		container_of(kref, struct qib_sdma_state, kref);
+
+	complete(&ss->comp);
+}
+
+static void sdma_put(struct qib_sdma_state *ss)
+{
+	kref_put(&ss->kref, sdma_complete);
+}
+
+static void sdma_finalput(struct qib_sdma_state *ss)
+{
+	sdma_put(ss);
+	wait_for_completion(&ss->comp);
+}
+
+/*
+ * Complete all the sdma requests on the active list, in the correct
+ * order, and with appropriate processing.   Called when cleaning up
+ * after sdma shutdown, and when new sdma requests are submitted for
+ * a link that is down.   This matches what is done for requests
+ * that complete normally, it's just the full list.
+ *
+ * Must be called with sdma_lock held
+ */
+static void clear_sdma_activelist(struct qib_pportdata *ppd)
+{
+	struct qib_sdma_txreq *txp, *txp_next;
+
+	list_for_each_entry_safe(txp, txp_next, &ppd->sdma_activelist, list) {
+		list_del_init(&txp->list);
+		if (txp->flags & QIB_SDMA_TXREQ_F_FREEDESC) {
+			unsigned idx;
+
+			idx = txp->start_idx;
+			while (idx != txp->next_descq_idx) {
+				unmap_desc(ppd, idx);
+				if (++idx == ppd->sdma_descq_cnt)
+					idx = 0;
+			}
+		}
+		if (txp->callback)
+			(*txp->callback)(txp, QIB_SDMA_TXREQ_S_ABORTED);
+	}
+}
+
+static void sdma_sw_clean_up_task(unsigned long opaque)
+{
+	struct qib_pportdata *ppd = (struct qib_pportdata *) opaque;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ppd->sdma_lock, flags);
+
+	/*
+	 * At this point, the following should always be true:
+	 * - We are halted, so no more descriptors are getting retired.
+	 * - We are not running, so no one is submitting new work.
+	 * - Only we can send the e40_sw_cleaned, so we can't start
+	 *   running again until we say so.  So, the active list and
+	 *   descq are ours to play with.
+	 */
+
+	/* Process all retired requests. */
+	qib_sdma_make_progress(ppd);
+
+	clear_sdma_activelist(ppd);
+
+	/*
+	 * Resync count of added and removed.  It is VERY important that
+	 * sdma_descq_removed NEVER decrement - user_sdma depends on it.
+	 */
+	ppd->sdma_descq_removed = ppd->sdma_descq_added;
+
+	/*
+	 * Reset our notion of head and tail.
+	 * Note that the HW registers will be reset when switching states
+	 * due to calling __qib_sdma_process_event() below.
+	 */
+	ppd->sdma_descq_tail = 0;
+	ppd->sdma_descq_head = 0;
+	ppd->sdma_head_dma[0] = 0;
+	ppd->sdma_generation = 0;
+
+	__qib_sdma_process_event(ppd, qib_sdma_event_e40_sw_cleaned);
+
+	spin_unlock_irqrestore(&ppd->sdma_lock, flags);
+}
+
+/*
+ * This is called when changing to state qib_sdma_state_s10_hw_start_up_wait
+ * as a result of send buffer errors or send DMA descriptor errors.
+ * We want to disarm the buffers in these cases.
+ */
+static void sdma_hw_start_up(struct qib_pportdata *ppd)
+{
+	struct qib_sdma_state *ss = &ppd->sdma_state;
+	unsigned bufno;
+
+	for (bufno = ss->first_sendbuf; bufno < ss->last_sendbuf; ++bufno)
+		ppd->dd->f_sendctrl(ppd, QIB_SENDCTRL_DISARM_BUF(bufno));
+
+	ppd->dd->f_sdma_hw_start_up(ppd);
+}
+
+static void sdma_sw_tear_down(struct qib_pportdata *ppd)
+{
+	struct qib_sdma_state *ss = &ppd->sdma_state;
+
+	/* Releasing this reference means the state machine has stopped. */
+	sdma_put(ss);
+}
+
+static void sdma_start_sw_clean_up(struct qib_pportdata *ppd)
+{
+	tasklet_hi_schedule(&ppd->sdma_sw_clean_up_task);
+}
+
+static void sdma_set_state(struct qib_pportdata *ppd,
+	enum qib_sdma_states next_state)
+{
+	struct qib_sdma_state *ss = &ppd->sdma_state;
+	struct sdma_set_state_action *action = ss->set_state_action;
+	unsigned op = 0;
+
+	/* debugging bookkeeping */
+	ss->previous_state = ss->current_state;
+	ss->previous_op = ss->current_op;
+
+	ss->current_state = next_state;
+
+	if (action[next_state].op_enable)
+		op |= QIB_SDMA_SENDCTRL_OP_ENABLE;
+
+	if (action[next_state].op_intenable)
+		op |= QIB_SDMA_SENDCTRL_OP_INTENABLE;
+
+	if (action[next_state].op_halt)
+		op |= QIB_SDMA_SENDCTRL_OP_HALT;
+
+	if (action[next_state].op_drain)
+		op |= QIB_SDMA_SENDCTRL_OP_DRAIN;
+
+	if (action[next_state].go_s99_running_tofalse)
+		ss->go_s99_running = 0;
+
+	if (action[next_state].go_s99_running_totrue)
+		ss->go_s99_running = 1;
+
+	ss->current_op = op;
+
+	ppd->dd->f_sdma_sendctrl(ppd, ss->current_op);
+}
+
+static void unmap_desc(struct qib_pportdata *ppd, unsigned head)
+{
+	__le64 *descqp = &ppd->sdma_descq[head].qw[0];
+	u64 desc[2];
+	dma_addr_t addr;
+	size_t len;
+
+	desc[0] = le64_to_cpu(descqp[0]);
+	desc[1] = le64_to_cpu(descqp[1]);
+
+	addr = (desc[1] << 32) | (desc[0] >> 32);
+	len = (desc[0] >> 14) & (0x7ffULL << 2);
+	dma_unmap_single(&ppd->dd->pcidev->dev, addr, len, DMA_TO_DEVICE);
+}
+
+static int alloc_sdma(struct qib_pportdata *ppd)
+{
+	ppd->sdma_descq_cnt = sdma_descq_cnt;
+	if (!ppd->sdma_descq_cnt)
+		ppd->sdma_descq_cnt = 256;
+
+	/* Allocate memory for SendDMA descriptor FIFO */
+	ppd->sdma_descq = dma_alloc_coherent(&ppd->dd->pcidev->dev,
+		ppd->sdma_descq_cnt * sizeof(u64[2]), &ppd->sdma_descq_phys,
+		GFP_KERNEL);
+
+	if (!ppd->sdma_descq) {
+		qib_dev_err(ppd->dd,
+			"failed to allocate SendDMA descriptor FIFO memory\n");
+		goto bail;
+	}
+
+	/* Allocate memory for DMA of head register to memory */
+	ppd->sdma_head_dma = dma_alloc_coherent(&ppd->dd->pcidev->dev,
+		PAGE_SIZE, &ppd->sdma_head_phys, GFP_KERNEL);
+	if (!ppd->sdma_head_dma) {
+		qib_dev_err(ppd->dd,
+			"failed to allocate SendDMA head memory\n");
+		goto cleanup_descq;
+	}
+	ppd->sdma_head_dma[0] = 0;
+	return 0;
+
+cleanup_descq:
+	dma_free_coherent(&ppd->dd->pcidev->dev,
+		ppd->sdma_descq_cnt * sizeof(u64[2]), (void *)ppd->sdma_descq,
+		ppd->sdma_descq_phys);
+	ppd->sdma_descq = NULL;
+	ppd->sdma_descq_phys = 0;
+bail:
+	ppd->sdma_descq_cnt = 0;
+	return -ENOMEM;
+}
+
+static void free_sdma(struct qib_pportdata *ppd)
+{
+	struct qib_devdata *dd = ppd->dd;
+
+	if (ppd->sdma_head_dma) {
+		dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE,
+				  (void *)ppd->sdma_head_dma,
+				  ppd->sdma_head_phys);
+		ppd->sdma_head_dma = NULL;
+		ppd->sdma_head_phys = 0;
+	}
+
+	if (ppd->sdma_descq) {
+		dma_free_coherent(&dd->pcidev->dev,
+				  ppd->sdma_descq_cnt * sizeof(u64[2]),
+				  ppd->sdma_descq, ppd->sdma_descq_phys);
+		ppd->sdma_descq = NULL;
+		ppd->sdma_descq_phys = 0;
+	}
+}
+
+static inline void make_sdma_desc(struct qib_pportdata *ppd,
+				  u64 *sdmadesc, u64 addr, u64 dwlen,
+				  u64 dwoffset)
+{
+
+	WARN_ON(addr & 3);
+	/* SDmaPhyAddr[47:32] */
+	sdmadesc[1] = addr >> 32;
+	/* SDmaPhyAddr[31:0] */
+	sdmadesc[0] = (addr & 0xfffffffcULL) << 32;
+	/* SDmaGeneration[1:0] */
+	sdmadesc[0] |= (ppd->sdma_generation & 3ULL) <<
+		SDMA_DESC_GEN_LSB;
+	/* SDmaDwordCount[10:0] */
+	sdmadesc[0] |= (dwlen & 0x7ffULL) << SDMA_DESC_COUNT_LSB;
+	/* SDmaBufOffset[12:2] */
+	sdmadesc[0] |= dwoffset & 0x7ffULL;
+}
+
+/* sdma_lock must be held */
+int qib_sdma_make_progress(struct qib_pportdata *ppd)
+{
+	struct list_head *lp = NULL;
+	struct qib_sdma_txreq *txp = NULL;
+	struct qib_devdata *dd = ppd->dd;
+	int progress = 0;
+	u16 hwhead;
+	u16 idx = 0;
+
+	hwhead = dd->f_sdma_gethead(ppd);
+
+	/* The reason for some of the complexity of this code is that
+	 * not all descriptors have corresponding txps.  So, we have to
+	 * be able to skip over descs until we wander into the range of
+	 * the next txp on the list.
+	 */
+
+	if (!list_empty(&ppd->sdma_activelist)) {
+		lp = ppd->sdma_activelist.next;
+		txp = list_entry(lp, struct qib_sdma_txreq, list);
+		idx = txp->start_idx;
+	}
+
+	while (ppd->sdma_descq_head != hwhead) {
+		/* if desc is part of this txp, unmap if needed */
+		if (txp && (txp->flags & QIB_SDMA_TXREQ_F_FREEDESC) &&
+		    (idx == ppd->sdma_descq_head)) {
+			unmap_desc(ppd, ppd->sdma_descq_head);
+			if (++idx == ppd->sdma_descq_cnt)
+				idx = 0;
+		}
+
+		/* increment dequed desc count */
+		ppd->sdma_descq_removed++;
+
+		/* advance head, wrap if needed */
+		if (++ppd->sdma_descq_head == ppd->sdma_descq_cnt)
+			ppd->sdma_descq_head = 0;
+
+		/* if now past this txp's descs, do the callback */
+		if (txp && txp->next_descq_idx == ppd->sdma_descq_head) {
+			/* remove from active list */
+			list_del_init(&txp->list);
+			if (txp->callback)
+				(*txp->callback)(txp, QIB_SDMA_TXREQ_S_OK);
+			/* see if there is another txp */
+			if (list_empty(&ppd->sdma_activelist))
+				txp = NULL;
+			else {
+				lp = ppd->sdma_activelist.next;
+				txp = list_entry(lp, struct qib_sdma_txreq,
+					list);
+				idx = txp->start_idx;
+			}
+		}
+		progress = 1;
+	}
+	if (progress)
+		qib_verbs_sdma_desc_avail(ppd, qib_sdma_descq_freecnt(ppd));
+	return progress;
+}
+
+/*
+ * This is called from interrupt context.
+ */
+void qib_sdma_intr(struct qib_pportdata *ppd)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&ppd->sdma_lock, flags);
+
+	__qib_sdma_intr(ppd);
+
+	spin_unlock_irqrestore(&ppd->sdma_lock, flags);
+}
+
+void __qib_sdma_intr(struct qib_pportdata *ppd)
+{
+	if (__qib_sdma_running(ppd)) {
+		qib_sdma_make_progress(ppd);
+		if (!list_empty(&ppd->sdma_userpending))
+			qib_user_sdma_send_desc(ppd, &ppd->sdma_userpending);
+	}
+}
+
+int qib_setup_sdma(struct qib_pportdata *ppd)
+{
+	struct qib_devdata *dd = ppd->dd;
+	unsigned long flags;
+	int ret = 0;
+
+	ret = alloc_sdma(ppd);
+	if (ret)
+		goto bail;
+
+	/* set consistent sdma state */
+	ppd->dd->f_sdma_init_early(ppd);
+	spin_lock_irqsave(&ppd->sdma_lock, flags);
+	sdma_set_state(ppd, qib_sdma_state_s00_hw_down);
+	spin_unlock_irqrestore(&ppd->sdma_lock, flags);
+
+	/* set up reference counting */
+	kref_init(&ppd->sdma_state.kref);
+	init_completion(&ppd->sdma_state.comp);
+
+	ppd->sdma_generation = 0;
+	ppd->sdma_descq_head = 0;
+	ppd->sdma_descq_removed = 0;
+	ppd->sdma_descq_added = 0;
+
+	ppd->sdma_intrequest = 0;
+	INIT_LIST_HEAD(&ppd->sdma_userpending);
+
+	INIT_LIST_HEAD(&ppd->sdma_activelist);
+
+	tasklet_init(&ppd->sdma_sw_clean_up_task, sdma_sw_clean_up_task,
+		(unsigned long)ppd);
+
+	ret = dd->f_init_sdma_regs(ppd);
+	if (ret)
+		goto bail_alloc;
+
+	qib_sdma_process_event(ppd, qib_sdma_event_e10_go_hw_start);
+
+	return 0;
+
+bail_alloc:
+	qib_teardown_sdma(ppd);
+bail:
+	return ret;
+}
+
+void qib_teardown_sdma(struct qib_pportdata *ppd)
+{
+	qib_sdma_process_event(ppd, qib_sdma_event_e00_go_hw_down);
+
+	/*
+	 * This waits for the state machine to exit so it is not
+	 * necessary to kill the sdma_sw_clean_up_task to make sure
+	 * it is not running.
+	 */
+	sdma_finalput(&ppd->sdma_state);
+
+	free_sdma(ppd);
+}
+
+int qib_sdma_running(struct qib_pportdata *ppd)
+{
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&ppd->sdma_lock, flags);
+	ret = __qib_sdma_running(ppd);
+	spin_unlock_irqrestore(&ppd->sdma_lock, flags);
+
+	return ret;
+}
+
+/*
+ * Complete a request when sdma not running; likely only request
+ * but to simplify the code, always queue it, then process the full
+ * activelist.  We process the entire list to ensure that this particular
+ * request does get it's callback, but in the correct order.
+ * Must be called with sdma_lock held
+ */
+static void complete_sdma_err_req(struct qib_pportdata *ppd,
+				  struct qib_verbs_txreq *tx)
+{
+	atomic_inc(&tx->qp->s_dma_busy);
+	/* no sdma descriptors, so no unmap_desc */
+	tx->txreq.start_idx = 0;
+	tx->txreq.next_descq_idx = 0;
+	list_add_tail(&tx->txreq.list, &ppd->sdma_activelist);
+	clear_sdma_activelist(ppd);
+}
+
+/*
+ * This function queues one IB packet onto the send DMA queue per call.
+ * The caller is responsible for checking:
+ * 1) The number of send DMA descriptor entries is less than the size of
+ *    the descriptor queue.
+ * 2) The IB SGE addresses and lengths are 32-bit aligned
+ *    (except possibly the last SGE's length)
+ * 3) The SGE addresses are suitable for passing to dma_map_single().
+ */
+int qib_sdma_verbs_send(struct qib_pportdata *ppd,
+			struct qib_sge_state *ss, u32 dwords,
+			struct qib_verbs_txreq *tx)
+{
+	unsigned long flags;
+	struct qib_sge *sge;
+	struct qib_qp *qp;
+	int ret = 0;
+	u16 tail;
+	__le64 *descqp;
+	u64 sdmadesc[2];
+	u32 dwoffset;
+	dma_addr_t addr;
+
+	spin_lock_irqsave(&ppd->sdma_lock, flags);
+
+retry:
+	if (unlikely(!__qib_sdma_running(ppd))) {
+		complete_sdma_err_req(ppd, tx);
+		goto unlock;
+	}
+
+	if (tx->txreq.sg_count > qib_sdma_descq_freecnt(ppd)) {
+		if (qib_sdma_make_progress(ppd))
+			goto retry;
+		if (ppd->dd->flags & QIB_HAS_SDMA_TIMEOUT)
+			ppd->dd->f_sdma_set_desc_cnt(ppd,
+					ppd->sdma_descq_cnt / 2);
+		goto busy;
+	}
+
+	dwoffset = tx->hdr_dwords;
+	make_sdma_desc(ppd, sdmadesc, (u64) tx->txreq.addr, dwoffset, 0);
+
+	sdmadesc[0] |= SDMA_DESC_FIRST;
+	if (tx->txreq.flags & QIB_SDMA_TXREQ_F_USELARGEBUF)
+		sdmadesc[0] |= SDMA_DESC_USE_LARGE_BUF;
+
+	/* write to the descq */
+	tail = ppd->sdma_descq_tail;
+	descqp = &ppd->sdma_descq[tail].qw[0];
+	*descqp++ = cpu_to_le64(sdmadesc[0]);
+	*descqp++ = cpu_to_le64(sdmadesc[1]);
+
+	/* increment the tail */
+	if (++tail == ppd->sdma_descq_cnt) {
+		tail = 0;
+		descqp = &ppd->sdma_descq[0].qw[0];
+		++ppd->sdma_generation;
+	}
+
+	tx->txreq.start_idx = tail;
+
+	sge = &ss->sge;
+	while (dwords) {
+		u32 dw;
+		u32 len;
+
+		len = dwords << 2;
+		if (len > sge->length)
+			len = sge->length;
+		if (len > sge->sge_length)
+			len = sge->sge_length;
+		BUG_ON(len == 0);
+		dw = (len + 3) >> 2;
+		addr = dma_map_single(&ppd->dd->pcidev->dev, sge->vaddr,
+				      dw << 2, DMA_TO_DEVICE);
+		if (dma_mapping_error(&ppd->dd->pcidev->dev, addr))
+			goto unmap;
+		sdmadesc[0] = 0;
+		make_sdma_desc(ppd, sdmadesc, (u64) addr, dw, dwoffset);
+		/* SDmaUseLargeBuf has to be set in every descriptor */
+		if (tx->txreq.flags & QIB_SDMA_TXREQ_F_USELARGEBUF)
+			sdmadesc[0] |= SDMA_DESC_USE_LARGE_BUF;
+		/* write to the descq */
+		*descqp++ = cpu_to_le64(sdmadesc[0]);
+		*descqp++ = cpu_to_le64(sdmadesc[1]);
+
+		/* increment the tail */
+		if (++tail == ppd->sdma_descq_cnt) {
+			tail = 0;
+			descqp = &ppd->sdma_descq[0].qw[0];
+			++ppd->sdma_generation;
+		}
+		sge->vaddr += len;
+		sge->length -= len;
+		sge->sge_length -= len;
+		if (sge->sge_length == 0) {
+			if (--ss->num_sge)
+				*sge = *ss->sg_list++;
+		} else if (sge->length == 0 && sge->mr->lkey) {
+			if (++sge->n >= QIB_SEGSZ) {
+				if (++sge->m >= sge->mr->mapsz)
+					break;
+				sge->n = 0;
+			}
+			sge->vaddr =
+				sge->mr->map[sge->m]->segs[sge->n].vaddr;
+			sge->length =
+				sge->mr->map[sge->m]->segs[sge->n].length;
+		}
+
+		dwoffset += dw;
+		dwords -= dw;
+	}
+
+	if (!tail)
+		descqp = &ppd->sdma_descq[ppd->sdma_descq_cnt].qw[0];
+	descqp -= 2;
+	descqp[0] |= cpu_to_le64(SDMA_DESC_LAST);
+	if (tx->txreq.flags & QIB_SDMA_TXREQ_F_HEADTOHOST)
+		descqp[0] |= cpu_to_le64(SDMA_DESC_DMA_HEAD);
+	if (tx->txreq.flags & QIB_SDMA_TXREQ_F_INTREQ)
+		descqp[0] |= cpu_to_le64(SDMA_DESC_INTR);
+
+	atomic_inc(&tx->qp->s_dma_busy);
+	tx->txreq.next_descq_idx = tail;
+	ppd->dd->f_sdma_update_tail(ppd, tail);
+	ppd->sdma_descq_added += tx->txreq.sg_count;
+	list_add_tail(&tx->txreq.list, &ppd->sdma_activelist);
+	goto unlock;
+
+unmap:
+	for (;;) {
+		if (!tail)
+			tail = ppd->sdma_descq_cnt - 1;
+		else
+			tail--;
+		if (tail == ppd->sdma_descq_tail)
+			break;
+		unmap_desc(ppd, tail);
+	}
+	qp = tx->qp;
+	qib_put_txreq(tx);
+	spin_lock(&qp->r_lock);
+	spin_lock(&qp->s_lock);
+	if (qp->ibqp.qp_type == IB_QPT_RC) {
+		/* XXX what about error sending RDMA read responses? */
+		if (ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK)
+			qib_error_qp(qp, IB_WC_GENERAL_ERR);
+	} else if (qp->s_wqe)
+		qib_send_complete(qp, qp->s_wqe, IB_WC_GENERAL_ERR);
+	spin_unlock(&qp->s_lock);
+	spin_unlock(&qp->r_lock);
+	/* return zero to process the next send work request */
+	goto unlock;
+
+busy:
+	qp = tx->qp;
+	spin_lock(&qp->s_lock);
+	if (ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK) {
+		struct qib_ibdev *dev;
+
+		/*
+		 * If we couldn't queue the DMA request, save the info
+		 * and try again later rather than destroying the
+		 * buffer and undoing the side effects of the copy.
+		 */
+		tx->ss = ss;
+		tx->dwords = dwords;
+		qp->s_tx = tx;
+		dev = &ppd->dd->verbs_dev;
+		spin_lock(&dev->pending_lock);
+		if (list_empty(&qp->iowait)) {
+			struct qib_ibport *ibp;
+
+			ibp = &ppd->ibport_data;
+			ibp->n_dmawait++;
+			qp->s_flags |= QIB_S_WAIT_DMA_DESC;
+			list_add_tail(&qp->iowait, &dev->dmawait);
+		}
+		spin_unlock(&dev->pending_lock);
+		qp->s_flags &= ~QIB_S_BUSY;
+		spin_unlock(&qp->s_lock);
+		ret = -EBUSY;
+	} else {
+		spin_unlock(&qp->s_lock);
+		qib_put_txreq(tx);
+	}
+unlock:
+	spin_unlock_irqrestore(&ppd->sdma_lock, flags);
+	return ret;
+}
+
+/*
+ * sdma_lock should be acquired before calling this routine
+ */
+void dump_sdma_state(struct qib_pportdata *ppd)
+{
+	struct qib_sdma_desc *descq;
+	struct qib_sdma_txreq *txp, *txpnext;
+	__le64 *descqp;
+	u64 desc[2];
+	u64 addr;
+	u16 gen, dwlen, dwoffset;
+	u16 head, tail, cnt;
+
+	head = ppd->sdma_descq_head;
+	tail = ppd->sdma_descq_tail;
+	cnt = qib_sdma_descq_freecnt(ppd);
+	descq = ppd->sdma_descq;
+
+	qib_dev_porterr(ppd->dd, ppd->port,
+		"SDMA ppd->sdma_descq_head: %u\n", head);
+	qib_dev_porterr(ppd->dd, ppd->port,
+		"SDMA ppd->sdma_descq_tail: %u\n", tail);
+	qib_dev_porterr(ppd->dd, ppd->port,
+		"SDMA sdma_descq_freecnt: %u\n", cnt);
+
+	/* print info for each entry in the descriptor queue */
+	while (head != tail) {
+		char flags[6] = { 'x', 'x', 'x', 'x', 'x', 0 };
+
+		descqp = &descq[head].qw[0];
+		desc[0] = le64_to_cpu(descqp[0]);
+		desc[1] = le64_to_cpu(descqp[1]);
+		flags[0] = (desc[0] & 1<<15) ? 'I' : '-';
+		flags[1] = (desc[0] & 1<<14) ? 'L' : 'S';
+		flags[2] = (desc[0] & 1<<13) ? 'H' : '-';
+		flags[3] = (desc[0] & 1<<12) ? 'F' : '-';
+		flags[4] = (desc[0] & 1<<11) ? 'L' : '-';
+		addr = (desc[1] << 32) | ((desc[0] >> 32) & 0xfffffffcULL);
+		gen = (desc[0] >> 30) & 3ULL;
+		dwlen = (desc[0] >> 14) & (0x7ffULL << 2);
+		dwoffset = (desc[0] & 0x7ffULL) << 2;
+		qib_dev_porterr(ppd->dd, ppd->port,
+			"SDMA sdmadesc[%u]: flags:%s addr:0x%016llx gen:%u len:%u bytes offset:%u bytes\n",
+			 head, flags, addr, gen, dwlen, dwoffset);
+		if (++head == ppd->sdma_descq_cnt)
+			head = 0;
+	}
+
+	/* print dma descriptor indices from the TX requests */
+	list_for_each_entry_safe(txp, txpnext, &ppd->sdma_activelist,
+				 list)
+		qib_dev_porterr(ppd->dd, ppd->port,
+			"SDMA txp->start_idx: %u txp->next_descq_idx: %u\n",
+			txp->start_idx, txp->next_descq_idx);
+}
+
+void qib_sdma_process_event(struct qib_pportdata *ppd,
+	enum qib_sdma_events event)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&ppd->sdma_lock, flags);
+
+	__qib_sdma_process_event(ppd, event);
+
+	if (ppd->sdma_state.current_state == qib_sdma_state_s99_running)
+		qib_verbs_sdma_desc_avail(ppd, qib_sdma_descq_freecnt(ppd));
+
+	spin_unlock_irqrestore(&ppd->sdma_lock, flags);
+}
+
+void __qib_sdma_process_event(struct qib_pportdata *ppd,
+	enum qib_sdma_events event)
+{
+	struct qib_sdma_state *ss = &ppd->sdma_state;
+
+	switch (ss->current_state) {
+	case qib_sdma_state_s00_hw_down:
+		switch (event) {
+		case qib_sdma_event_e00_go_hw_down:
+			break;
+		case qib_sdma_event_e30_go_running:
+			/*
+			 * If down, but running requested (usually result
+			 * of link up, then we need to start up.
+			 * This can happen when hw down is requested while
+			 * bringing the link up with traffic active on
+			 * 7220, e.g. */
+			ss->go_s99_running = 1;
+			/* fall through and start dma engine */
+		case qib_sdma_event_e10_go_hw_start:
+			/* This reference means the state machine is started */
+			sdma_get(&ppd->sdma_state);
+			sdma_set_state(ppd,
+				       qib_sdma_state_s10_hw_start_up_wait);
+			break;
+		case qib_sdma_event_e20_hw_started:
+			break;
+		case qib_sdma_event_e40_sw_cleaned:
+			sdma_sw_tear_down(ppd);
+			break;
+		case qib_sdma_event_e50_hw_cleaned:
+			break;
+		case qib_sdma_event_e60_hw_halted:
+			break;
+		case qib_sdma_event_e70_go_idle:
+			break;
+		case qib_sdma_event_e7220_err_halted:
+			break;
+		case qib_sdma_event_e7322_err_halted:
+			break;
+		case qib_sdma_event_e90_timer_tick:
+			break;
+		}
+		break;
+
+	case qib_sdma_state_s10_hw_start_up_wait:
+		switch (event) {
+		case qib_sdma_event_e00_go_hw_down:
+			sdma_set_state(ppd, qib_sdma_state_s00_hw_down);
+			sdma_sw_tear_down(ppd);
+			break;
+		case qib_sdma_event_e10_go_hw_start:
+			break;
+		case qib_sdma_event_e20_hw_started:
+			sdma_set_state(ppd, ss->go_s99_running ?
+				       qib_sdma_state_s99_running :
+				       qib_sdma_state_s20_idle);
+			break;
+		case qib_sdma_event_e30_go_running:
+			ss->go_s99_running = 1;
+			break;
+		case qib_sdma_event_e40_sw_cleaned:
+			break;
+		case qib_sdma_event_e50_hw_cleaned:
+			break;
+		case qib_sdma_event_e60_hw_halted:
+			break;
+		case qib_sdma_event_e70_go_idle:
+			ss->go_s99_running = 0;
+			break;
+		case qib_sdma_event_e7220_err_halted:
+			break;
+		case qib_sdma_event_e7322_err_halted:
+			break;
+		case qib_sdma_event_e90_timer_tick:
+			break;
+		}
+		break;
+
+	case qib_sdma_state_s20_idle:
+		switch (event) {
+		case qib_sdma_event_e00_go_hw_down:
+			sdma_set_state(ppd, qib_sdma_state_s00_hw_down);
+			sdma_sw_tear_down(ppd);
+			break;
+		case qib_sdma_event_e10_go_hw_start:
+			break;
+		case qib_sdma_event_e20_hw_started:
+			break;
+		case qib_sdma_event_e30_go_running:
+			sdma_set_state(ppd, qib_sdma_state_s99_running);
+			ss->go_s99_running = 1;
+			break;
+		case qib_sdma_event_e40_sw_cleaned:
+			break;
+		case qib_sdma_event_e50_hw_cleaned:
+			break;
+		case qib_sdma_event_e60_hw_halted:
+			break;
+		case qib_sdma_event_e70_go_idle:
+			break;
+		case qib_sdma_event_e7220_err_halted:
+			break;
+		case qib_sdma_event_e7322_err_halted:
+			break;
+		case qib_sdma_event_e90_timer_tick:
+			break;
+		}
+		break;
+
+	case qib_sdma_state_s30_sw_clean_up_wait:
+		switch (event) {
+		case qib_sdma_event_e00_go_hw_down:
+			sdma_set_state(ppd, qib_sdma_state_s00_hw_down);
+			break;
+		case qib_sdma_event_e10_go_hw_start:
+			break;
+		case qib_sdma_event_e20_hw_started:
+			break;
+		case qib_sdma_event_e30_go_running:
+			ss->go_s99_running = 1;
+			break;
+		case qib_sdma_event_e40_sw_cleaned:
+			sdma_set_state(ppd,
+				       qib_sdma_state_s10_hw_start_up_wait);
+			sdma_hw_start_up(ppd);
+			break;
+		case qib_sdma_event_e50_hw_cleaned:
+			break;
+		case qib_sdma_event_e60_hw_halted:
+			break;
+		case qib_sdma_event_e70_go_idle:
+			ss->go_s99_running = 0;
+			break;
+		case qib_sdma_event_e7220_err_halted:
+			break;
+		case qib_sdma_event_e7322_err_halted:
+			break;
+		case qib_sdma_event_e90_timer_tick:
+			break;
+		}
+		break;
+
+	case qib_sdma_state_s40_hw_clean_up_wait:
+		switch (event) {
+		case qib_sdma_event_e00_go_hw_down:
+			sdma_set_state(ppd, qib_sdma_state_s00_hw_down);
+			sdma_start_sw_clean_up(ppd);
+			break;
+		case qib_sdma_event_e10_go_hw_start:
+			break;
+		case qib_sdma_event_e20_hw_started:
+			break;
+		case qib_sdma_event_e30_go_running:
+			ss->go_s99_running = 1;
+			break;
+		case qib_sdma_event_e40_sw_cleaned:
+			break;
+		case qib_sdma_event_e50_hw_cleaned:
+			sdma_set_state(ppd,
+				       qib_sdma_state_s30_sw_clean_up_wait);
+			sdma_start_sw_clean_up(ppd);
+			break;
+		case qib_sdma_event_e60_hw_halted:
+			break;
+		case qib_sdma_event_e70_go_idle:
+			ss->go_s99_running = 0;
+			break;
+		case qib_sdma_event_e7220_err_halted:
+			break;
+		case qib_sdma_event_e7322_err_halted:
+			break;
+		case qib_sdma_event_e90_timer_tick:
+			break;
+		}
+		break;
+
+	case qib_sdma_state_s50_hw_halt_wait:
+		switch (event) {
+		case qib_sdma_event_e00_go_hw_down:
+			sdma_set_state(ppd, qib_sdma_state_s00_hw_down);
+			sdma_start_sw_clean_up(ppd);
+			break;
+		case qib_sdma_event_e10_go_hw_start:
+			break;
+		case qib_sdma_event_e20_hw_started:
+			break;
+		case qib_sdma_event_e30_go_running:
+			ss->go_s99_running = 1;
+			break;
+		case qib_sdma_event_e40_sw_cleaned:
+			break;
+		case qib_sdma_event_e50_hw_cleaned:
+			break;
+		case qib_sdma_event_e60_hw_halted:
+			sdma_set_state(ppd,
+				       qib_sdma_state_s40_hw_clean_up_wait);
+			ppd->dd->f_sdma_hw_clean_up(ppd);
+			break;
+		case qib_sdma_event_e70_go_idle:
+			ss->go_s99_running = 0;
+			break;
+		case qib_sdma_event_e7220_err_halted:
+			break;
+		case qib_sdma_event_e7322_err_halted:
+			break;
+		case qib_sdma_event_e90_timer_tick:
+			break;
+		}
+		break;
+
+	case qib_sdma_state_s99_running:
+		switch (event) {
+		case qib_sdma_event_e00_go_hw_down:
+			sdma_set_state(ppd, qib_sdma_state_s00_hw_down);
+			sdma_start_sw_clean_up(ppd);
+			break;
+		case qib_sdma_event_e10_go_hw_start:
+			break;
+		case qib_sdma_event_e20_hw_started:
+			break;
+		case qib_sdma_event_e30_go_running:
+			break;
+		case qib_sdma_event_e40_sw_cleaned:
+			break;
+		case qib_sdma_event_e50_hw_cleaned:
+			break;
+		case qib_sdma_event_e60_hw_halted:
+			sdma_set_state(ppd,
+				       qib_sdma_state_s30_sw_clean_up_wait);
+			sdma_start_sw_clean_up(ppd);
+			break;
+		case qib_sdma_event_e70_go_idle:
+			sdma_set_state(ppd, qib_sdma_state_s50_hw_halt_wait);
+			ss->go_s99_running = 0;
+			break;
+		case qib_sdma_event_e7220_err_halted:
+			sdma_set_state(ppd,
+				       qib_sdma_state_s30_sw_clean_up_wait);
+			sdma_start_sw_clean_up(ppd);
+			break;
+		case qib_sdma_event_e7322_err_halted:
+			sdma_set_state(ppd, qib_sdma_state_s50_hw_halt_wait);
+			break;
+		case qib_sdma_event_e90_timer_tick:
+			break;
+		}
+		break;
+	}
+
+	ss->last_event = event;
+}
diff --git a/drivers/infiniband/hw/qib/qib_srq.c b/drivers/infiniband/hw/qib/qib_srq.c
new file mode 100644
index 0000000..d623593
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_srq.c
@@ -0,0 +1,380 @@
+/*
+ * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+#include "qib_verbs.h"
+
+/**
+ * qib_post_srq_receive - post a receive on a shared receive queue
+ * @ibsrq: the SRQ to post the receive on
+ * @wr: the list of work requests to post
+ * @bad_wr: A pointer to the first WR to cause a problem is put here
+ *
+ * This may be called from interrupt context.
+ */
+int qib_post_srq_receive(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
+			 struct ib_recv_wr **bad_wr)
+{
+	struct qib_srq *srq = to_isrq(ibsrq);
+	struct qib_rwq *wq;
+	unsigned long flags;
+	int ret;
+
+	for (; wr; wr = wr->next) {
+		struct qib_rwqe *wqe;
+		u32 next;
+		int i;
+
+		if ((unsigned) wr->num_sge > srq->rq.max_sge) {
+			*bad_wr = wr;
+			ret = -EINVAL;
+			goto bail;
+		}
+
+		spin_lock_irqsave(&srq->rq.lock, flags);
+		wq = srq->rq.wq;
+		next = wq->head + 1;
+		if (next >= srq->rq.size)
+			next = 0;
+		if (next == wq->tail) {
+			spin_unlock_irqrestore(&srq->rq.lock, flags);
+			*bad_wr = wr;
+			ret = -ENOMEM;
+			goto bail;
+		}
+
+		wqe = get_rwqe_ptr(&srq->rq, wq->head);
+		wqe->wr_id = wr->wr_id;
+		wqe->num_sge = wr->num_sge;
+		for (i = 0; i < wr->num_sge; i++)
+			wqe->sg_list[i] = wr->sg_list[i];
+		/* Make sure queue entry is written before the head index. */
+		smp_wmb();
+		wq->head = next;
+		spin_unlock_irqrestore(&srq->rq.lock, flags);
+	}
+	ret = 0;
+
+bail:
+	return ret;
+}
+
+/**
+ * qib_create_srq - create a shared receive queue
+ * @ibpd: the protection domain of the SRQ to create
+ * @srq_init_attr: the attributes of the SRQ
+ * @udata: data from libibverbs when creating a user SRQ
+ */
+struct ib_srq *qib_create_srq(struct ib_pd *ibpd,
+			      struct ib_srq_init_attr *srq_init_attr,
+			      struct ib_udata *udata)
+{
+	struct qib_ibdev *dev = to_idev(ibpd->device);
+	struct qib_srq *srq;
+	u32 sz;
+	struct ib_srq *ret;
+
+	if (srq_init_attr->srq_type != IB_SRQT_BASIC) {
+		ret = ERR_PTR(-ENOSYS);
+		goto done;
+	}
+
+	if (srq_init_attr->attr.max_sge == 0 ||
+	    srq_init_attr->attr.max_sge > ib_qib_max_srq_sges ||
+	    srq_init_attr->attr.max_wr == 0 ||
+	    srq_init_attr->attr.max_wr > ib_qib_max_srq_wrs) {
+		ret = ERR_PTR(-EINVAL);
+		goto done;
+	}
+
+	srq = kmalloc(sizeof(*srq), GFP_KERNEL);
+	if (!srq) {
+		ret = ERR_PTR(-ENOMEM);
+		goto done;
+	}
+
+	/*
+	 * Need to use vmalloc() if we want to support large #s of entries.
+	 */
+	srq->rq.size = srq_init_attr->attr.max_wr + 1;
+	srq->rq.max_sge = srq_init_attr->attr.max_sge;
+	sz = sizeof(struct ib_sge) * srq->rq.max_sge +
+		sizeof(struct qib_rwqe);
+	srq->rq.wq = vmalloc_user(sizeof(struct qib_rwq) + srq->rq.size * sz);
+	if (!srq->rq.wq) {
+		ret = ERR_PTR(-ENOMEM);
+		goto bail_srq;
+	}
+
+	/*
+	 * Return the address of the RWQ as the offset to mmap.
+	 * See qib_mmap() for details.
+	 */
+	if (udata && udata->outlen >= sizeof(__u64)) {
+		int err;
+		u32 s = sizeof(struct qib_rwq) + srq->rq.size * sz;
+
+		srq->ip =
+		    qib_create_mmap_info(dev, s, ibpd->uobject->context,
+					 srq->rq.wq);
+		if (!srq->ip) {
+			ret = ERR_PTR(-ENOMEM);
+			goto bail_wq;
+		}
+
+		err = ib_copy_to_udata(udata, &srq->ip->offset,
+				       sizeof(srq->ip->offset));
+		if (err) {
+			ret = ERR_PTR(err);
+			goto bail_ip;
+		}
+	} else
+		srq->ip = NULL;
+
+	/*
+	 * ib_create_srq() will initialize srq->ibsrq.
+	 */
+	spin_lock_init(&srq->rq.lock);
+	srq->rq.wq->head = 0;
+	srq->rq.wq->tail = 0;
+	srq->limit = srq_init_attr->attr.srq_limit;
+
+	spin_lock(&dev->n_srqs_lock);
+	if (dev->n_srqs_allocated == ib_qib_max_srqs) {
+		spin_unlock(&dev->n_srqs_lock);
+		ret = ERR_PTR(-ENOMEM);
+		goto bail_ip;
+	}
+
+	dev->n_srqs_allocated++;
+	spin_unlock(&dev->n_srqs_lock);
+
+	if (srq->ip) {
+		spin_lock_irq(&dev->pending_lock);
+		list_add(&srq->ip->pending_mmaps, &dev->pending_mmaps);
+		spin_unlock_irq(&dev->pending_lock);
+	}
+
+	ret = &srq->ibsrq;
+	goto done;
+
+bail_ip:
+	kfree(srq->ip);
+bail_wq:
+	vfree(srq->rq.wq);
+bail_srq:
+	kfree(srq);
+done:
+	return ret;
+}
+
+/**
+ * qib_modify_srq - modify a shared receive queue
+ * @ibsrq: the SRQ to modify
+ * @attr: the new attributes of the SRQ
+ * @attr_mask: indicates which attributes to modify
+ * @udata: user data for libibverbs.so
+ */
+int qib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
+		   enum ib_srq_attr_mask attr_mask,
+		   struct ib_udata *udata)
+{
+	struct qib_srq *srq = to_isrq(ibsrq);
+	struct qib_rwq *wq;
+	int ret = 0;
+
+	if (attr_mask & IB_SRQ_MAX_WR) {
+		struct qib_rwq *owq;
+		struct qib_rwqe *p;
+		u32 sz, size, n, head, tail;
+
+		/* Check that the requested sizes are below the limits. */
+		if ((attr->max_wr > ib_qib_max_srq_wrs) ||
+		    ((attr_mask & IB_SRQ_LIMIT) ?
+		     attr->srq_limit : srq->limit) > attr->max_wr) {
+			ret = -EINVAL;
+			goto bail;
+		}
+
+		sz = sizeof(struct qib_rwqe) +
+			srq->rq.max_sge * sizeof(struct ib_sge);
+		size = attr->max_wr + 1;
+		wq = vmalloc_user(sizeof(struct qib_rwq) + size * sz);
+		if (!wq) {
+			ret = -ENOMEM;
+			goto bail;
+		}
+
+		/* Check that we can write the offset to mmap. */
+		if (udata && udata->inlen >= sizeof(__u64)) {
+			__u64 offset_addr;
+			__u64 offset = 0;
+
+			ret = ib_copy_from_udata(&offset_addr, udata,
+						 sizeof(offset_addr));
+			if (ret)
+				goto bail_free;
+			udata->outbuf =
+				(void __user *) (unsigned long) offset_addr;
+			ret = ib_copy_to_udata(udata, &offset,
+					       sizeof(offset));
+			if (ret)
+				goto bail_free;
+		}
+
+		spin_lock_irq(&srq->rq.lock);
+		/*
+		 * validate head and tail pointer values and compute
+		 * the number of remaining WQEs.
+		 */
+		owq = srq->rq.wq;
+		head = owq->head;
+		tail = owq->tail;
+		if (head >= srq->rq.size || tail >= srq->rq.size) {
+			ret = -EINVAL;
+			goto bail_unlock;
+		}
+		n = head;
+		if (n < tail)
+			n += srq->rq.size - tail;
+		else
+			n -= tail;
+		if (size <= n) {
+			ret = -EINVAL;
+			goto bail_unlock;
+		}
+		n = 0;
+		p = wq->wq;
+		while (tail != head) {
+			struct qib_rwqe *wqe;
+			int i;
+
+			wqe = get_rwqe_ptr(&srq->rq, tail);
+			p->wr_id = wqe->wr_id;
+			p->num_sge = wqe->num_sge;
+			for (i = 0; i < wqe->num_sge; i++)
+				p->sg_list[i] = wqe->sg_list[i];
+			n++;
+			p = (struct qib_rwqe *)((char *) p + sz);
+			if (++tail >= srq->rq.size)
+				tail = 0;
+		}
+		srq->rq.wq = wq;
+		srq->rq.size = size;
+		wq->head = n;
+		wq->tail = 0;
+		if (attr_mask & IB_SRQ_LIMIT)
+			srq->limit = attr->srq_limit;
+		spin_unlock_irq(&srq->rq.lock);
+
+		vfree(owq);
+
+		if (srq->ip) {
+			struct qib_mmap_info *ip = srq->ip;
+			struct qib_ibdev *dev = to_idev(srq->ibsrq.device);
+			u32 s = sizeof(struct qib_rwq) + size * sz;
+
+			qib_update_mmap_info(dev, ip, s, wq);
+
+			/*
+			 * Return the offset to mmap.
+			 * See qib_mmap() for details.
+			 */
+			if (udata && udata->inlen >= sizeof(__u64)) {
+				ret = ib_copy_to_udata(udata, &ip->offset,
+						       sizeof(ip->offset));
+				if (ret)
+					goto bail;
+			}
+
+			/*
+			 * Put user mapping info onto the pending list
+			 * unless it already is on the list.
+			 */
+			spin_lock_irq(&dev->pending_lock);
+			if (list_empty(&ip->pending_mmaps))
+				list_add(&ip->pending_mmaps,
+					 &dev->pending_mmaps);
+			spin_unlock_irq(&dev->pending_lock);
+		}
+	} else if (attr_mask & IB_SRQ_LIMIT) {
+		spin_lock_irq(&srq->rq.lock);
+		if (attr->srq_limit >= srq->rq.size)
+			ret = -EINVAL;
+		else
+			srq->limit = attr->srq_limit;
+		spin_unlock_irq(&srq->rq.lock);
+	}
+	goto bail;
+
+bail_unlock:
+	spin_unlock_irq(&srq->rq.lock);
+bail_free:
+	vfree(wq);
+bail:
+	return ret;
+}
+
+int qib_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr)
+{
+	struct qib_srq *srq = to_isrq(ibsrq);
+
+	attr->max_wr = srq->rq.size - 1;
+	attr->max_sge = srq->rq.max_sge;
+	attr->srq_limit = srq->limit;
+	return 0;
+}
+
+/**
+ * qib_destroy_srq - destroy a shared receive queue
+ * @ibsrq: the SRQ to destroy
+ */
+int qib_destroy_srq(struct ib_srq *ibsrq)
+{
+	struct qib_srq *srq = to_isrq(ibsrq);
+	struct qib_ibdev *dev = to_idev(ibsrq->device);
+
+	spin_lock(&dev->n_srqs_lock);
+	dev->n_srqs_allocated--;
+	spin_unlock(&dev->n_srqs_lock);
+	if (srq->ip)
+		kref_put(&srq->ip->ref, qib_release_mmap_info);
+	else
+		vfree(srq->rq.wq);
+	kfree(srq);
+
+	return 0;
+}
diff --git a/drivers/infiniband/hw/qib/qib_sysfs.c b/drivers/infiniband/hw/qib/qib_sysfs.c
new file mode 100644
index 0000000..3c8e4e3
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_sysfs.c
@@ -0,0 +1,842 @@
+/*
+ * Copyright (c) 2012 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/ctype.h>
+
+#include "qib.h"
+#include "qib_mad.h"
+
+/* start of per-port functions */
+/*
+ * Get/Set heartbeat enable. OR of 1=enabled, 2=auto
+ */
+static ssize_t show_hrtbt_enb(struct qib_pportdata *ppd, char *buf)
+{
+	struct qib_devdata *dd = ppd->dd;
+	int ret;
+
+	ret = dd->f_get_ib_cfg(ppd, QIB_IB_CFG_HRTBT);
+	ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret);
+	return ret;
+}
+
+static ssize_t store_hrtbt_enb(struct qib_pportdata *ppd, const char *buf,
+			       size_t count)
+{
+	struct qib_devdata *dd = ppd->dd;
+	int ret;
+	u16 val;
+
+	ret = kstrtou16(buf, 0, &val);
+	if (ret) {
+		qib_dev_err(dd, "attempt to set invalid Heartbeat enable\n");
+		return ret;
+	}
+
+	/*
+	 * Set the "intentional" heartbeat enable per either of
+	 * "Enable" and "Auto", as these are normally set together.
+	 * This bit is consulted when leaving loopback mode,
+	 * because entering loopback mode overrides it and automatically
+	 * disables heartbeat.
+	 */
+	ret = dd->f_set_ib_cfg(ppd, QIB_IB_CFG_HRTBT, val);
+	return ret < 0 ? ret : count;
+}
+
+static ssize_t store_loopback(struct qib_pportdata *ppd, const char *buf,
+			      size_t count)
+{
+	struct qib_devdata *dd = ppd->dd;
+	int ret = count, r;
+
+	r = dd->f_set_ib_loopback(ppd, buf);
+	if (r < 0)
+		ret = r;
+
+	return ret;
+}
+
+static ssize_t store_led_override(struct qib_pportdata *ppd, const char *buf,
+				  size_t count)
+{
+	struct qib_devdata *dd = ppd->dd;
+	int ret;
+	u16 val;
+
+	ret = kstrtou16(buf, 0, &val);
+	if (ret) {
+		qib_dev_err(dd, "attempt to set invalid LED override\n");
+		return ret;
+	}
+
+	qib_set_led_override(ppd, val);
+	return count;
+}
+
+static ssize_t show_status(struct qib_pportdata *ppd, char *buf)
+{
+	ssize_t ret;
+
+	if (!ppd->statusp)
+		ret = -EINVAL;
+	else
+		ret = scnprintf(buf, PAGE_SIZE, "0x%llx\n",
+				(unsigned long long) *(ppd->statusp));
+	return ret;
+}
+
+/*
+ * For userland compatibility, these offsets must remain fixed.
+ * They are strings for QIB_STATUS_*
+ */
+static const char * const qib_status_str[] = {
+	"Initted",
+	"",
+	"",
+	"",
+	"",
+	"Present",
+	"IB_link_up",
+	"IB_configured",
+	"",
+	"Fatal_Hardware_Error",
+	NULL,
+};
+
+static ssize_t show_status_str(struct qib_pportdata *ppd, char *buf)
+{
+	int i, any;
+	u64 s;
+	ssize_t ret;
+
+	if (!ppd->statusp) {
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	s = *(ppd->statusp);
+	*buf = '\0';
+	for (any = i = 0; s && qib_status_str[i]; i++) {
+		if (s & 1) {
+			/* if overflow */
+			if (any && strlcat(buf, " ", PAGE_SIZE) >= PAGE_SIZE)
+				break;
+			if (strlcat(buf, qib_status_str[i], PAGE_SIZE) >=
+					PAGE_SIZE)
+				break;
+			any = 1;
+		}
+		s >>= 1;
+	}
+	if (any)
+		strlcat(buf, "\n", PAGE_SIZE);
+
+	ret = strlen(buf);
+
+bail:
+	return ret;
+}
+
+/* end of per-port functions */
+
+/*
+ * Start of per-port file structures and support code
+ * Because we are fitting into other infrastructure, we have to supply the
+ * full set of kobject/sysfs_ops structures and routines.
+ */
+#define QIB_PORT_ATTR(name, mode, show, store) \
+	static struct qib_port_attr qib_port_attr_##name = \
+		__ATTR(name, mode, show, store)
+
+struct qib_port_attr {
+	struct attribute attr;
+	ssize_t (*show)(struct qib_pportdata *, char *);
+	ssize_t (*store)(struct qib_pportdata *, const char *, size_t);
+};
+
+QIB_PORT_ATTR(loopback, S_IWUSR, NULL, store_loopback);
+QIB_PORT_ATTR(led_override, S_IWUSR, NULL, store_led_override);
+QIB_PORT_ATTR(hrtbt_enable, S_IWUSR | S_IRUGO, show_hrtbt_enb,
+	      store_hrtbt_enb);
+QIB_PORT_ATTR(status, S_IRUGO, show_status, NULL);
+QIB_PORT_ATTR(status_str, S_IRUGO, show_status_str, NULL);
+
+static struct attribute *port_default_attributes[] = {
+	&qib_port_attr_loopback.attr,
+	&qib_port_attr_led_override.attr,
+	&qib_port_attr_hrtbt_enable.attr,
+	&qib_port_attr_status.attr,
+	&qib_port_attr_status_str.attr,
+	NULL
+};
+
+/*
+ * Start of per-port congestion control structures and support code
+ */
+
+/*
+ * Congestion control table size followed by table entries
+ */
+static ssize_t read_cc_table_bin(struct file *filp, struct kobject *kobj,
+		struct bin_attribute *bin_attr,
+		char *buf, loff_t pos, size_t count)
+{
+	int ret;
+	struct qib_pportdata *ppd =
+		container_of(kobj, struct qib_pportdata, pport_cc_kobj);
+
+	if (!qib_cc_table_size || !ppd->ccti_entries_shadow)
+		return -EINVAL;
+
+	ret = ppd->total_cct_entry * sizeof(struct ib_cc_table_entry_shadow)
+		 + sizeof(__be16);
+
+	if (pos > ret)
+		return -EINVAL;
+
+	if (count > ret - pos)
+		count = ret - pos;
+
+	if (!count)
+		return count;
+
+	spin_lock(&ppd->cc_shadow_lock);
+	memcpy(buf, ppd->ccti_entries_shadow, count);
+	spin_unlock(&ppd->cc_shadow_lock);
+
+	return count;
+}
+
+static void qib_port_release(struct kobject *kobj)
+{
+	/* nothing to do since memory is freed by qib_free_devdata() */
+}
+
+static struct kobj_type qib_port_cc_ktype = {
+	.release = qib_port_release,
+};
+
+static struct bin_attribute cc_table_bin_attr = {
+	.attr = {.name = "cc_table_bin", .mode = 0444},
+	.read = read_cc_table_bin,
+	.size = PAGE_SIZE,
+};
+
+/*
+ * Congestion settings: port control, control map and an array of 16
+ * entries for the congestion entries - increase, timer, event log
+ * trigger threshold and the minimum injection rate delay.
+ */
+static ssize_t read_cc_setting_bin(struct file *filp, struct kobject *kobj,
+		struct bin_attribute *bin_attr,
+		char *buf, loff_t pos, size_t count)
+{
+	int ret;
+	struct qib_pportdata *ppd =
+		container_of(kobj, struct qib_pportdata, pport_cc_kobj);
+
+	if (!qib_cc_table_size || !ppd->congestion_entries_shadow)
+		return -EINVAL;
+
+	ret = sizeof(struct ib_cc_congestion_setting_attr_shadow);
+
+	if (pos > ret)
+		return -EINVAL;
+	if (count > ret - pos)
+		count = ret - pos;
+
+	if (!count)
+		return count;
+
+	spin_lock(&ppd->cc_shadow_lock);
+	memcpy(buf, ppd->congestion_entries_shadow, count);
+	spin_unlock(&ppd->cc_shadow_lock);
+
+	return count;
+}
+
+static struct bin_attribute cc_setting_bin_attr = {
+	.attr = {.name = "cc_settings_bin", .mode = 0444},
+	.read = read_cc_setting_bin,
+	.size = PAGE_SIZE,
+};
+
+
+static ssize_t qib_portattr_show(struct kobject *kobj,
+	struct attribute *attr, char *buf)
+{
+	struct qib_port_attr *pattr =
+		container_of(attr, struct qib_port_attr, attr);
+	struct qib_pportdata *ppd =
+		container_of(kobj, struct qib_pportdata, pport_kobj);
+
+	return pattr->show(ppd, buf);
+}
+
+static ssize_t qib_portattr_store(struct kobject *kobj,
+	struct attribute *attr, const char *buf, size_t len)
+{
+	struct qib_port_attr *pattr =
+		container_of(attr, struct qib_port_attr, attr);
+	struct qib_pportdata *ppd =
+		container_of(kobj, struct qib_pportdata, pport_kobj);
+
+	return pattr->store(ppd, buf, len);
+}
+
+
+static const struct sysfs_ops qib_port_ops = {
+	.show = qib_portattr_show,
+	.store = qib_portattr_store,
+};
+
+static struct kobj_type qib_port_ktype = {
+	.release = qib_port_release,
+	.sysfs_ops = &qib_port_ops,
+	.default_attrs = port_default_attributes
+};
+
+/* Start sl2vl */
+
+#define QIB_SL2VL_ATTR(N) \
+	static struct qib_sl2vl_attr qib_sl2vl_attr_##N = { \
+		.attr = { .name = __stringify(N), .mode = 0444 }, \
+		.sl = N \
+	}
+
+struct qib_sl2vl_attr {
+	struct attribute attr;
+	int sl;
+};
+
+QIB_SL2VL_ATTR(0);
+QIB_SL2VL_ATTR(1);
+QIB_SL2VL_ATTR(2);
+QIB_SL2VL_ATTR(3);
+QIB_SL2VL_ATTR(4);
+QIB_SL2VL_ATTR(5);
+QIB_SL2VL_ATTR(6);
+QIB_SL2VL_ATTR(7);
+QIB_SL2VL_ATTR(8);
+QIB_SL2VL_ATTR(9);
+QIB_SL2VL_ATTR(10);
+QIB_SL2VL_ATTR(11);
+QIB_SL2VL_ATTR(12);
+QIB_SL2VL_ATTR(13);
+QIB_SL2VL_ATTR(14);
+QIB_SL2VL_ATTR(15);
+
+static struct attribute *sl2vl_default_attributes[] = {
+	&qib_sl2vl_attr_0.attr,
+	&qib_sl2vl_attr_1.attr,
+	&qib_sl2vl_attr_2.attr,
+	&qib_sl2vl_attr_3.attr,
+	&qib_sl2vl_attr_4.attr,
+	&qib_sl2vl_attr_5.attr,
+	&qib_sl2vl_attr_6.attr,
+	&qib_sl2vl_attr_7.attr,
+	&qib_sl2vl_attr_8.attr,
+	&qib_sl2vl_attr_9.attr,
+	&qib_sl2vl_attr_10.attr,
+	&qib_sl2vl_attr_11.attr,
+	&qib_sl2vl_attr_12.attr,
+	&qib_sl2vl_attr_13.attr,
+	&qib_sl2vl_attr_14.attr,
+	&qib_sl2vl_attr_15.attr,
+	NULL
+};
+
+static ssize_t sl2vl_attr_show(struct kobject *kobj, struct attribute *attr,
+			       char *buf)
+{
+	struct qib_sl2vl_attr *sattr =
+		container_of(attr, struct qib_sl2vl_attr, attr);
+	struct qib_pportdata *ppd =
+		container_of(kobj, struct qib_pportdata, sl2vl_kobj);
+	struct qib_ibport *qibp = &ppd->ibport_data;
+
+	return sprintf(buf, "%u\n", qibp->sl_to_vl[sattr->sl]);
+}
+
+static const struct sysfs_ops qib_sl2vl_ops = {
+	.show = sl2vl_attr_show,
+};
+
+static struct kobj_type qib_sl2vl_ktype = {
+	.release = qib_port_release,
+	.sysfs_ops = &qib_sl2vl_ops,
+	.default_attrs = sl2vl_default_attributes
+};
+
+/* End sl2vl */
+
+/* Start diag_counters */
+
+#define QIB_DIAGC_ATTR(N) \
+	static struct qib_diagc_attr qib_diagc_attr_##N = { \
+		.attr = { .name = __stringify(N), .mode = 0664 }, \
+		.counter = offsetof(struct qib_ibport, n_##N) \
+	}
+
+struct qib_diagc_attr {
+	struct attribute attr;
+	size_t counter;
+};
+
+QIB_DIAGC_ATTR(rc_resends);
+QIB_DIAGC_ATTR(rc_acks);
+QIB_DIAGC_ATTR(rc_qacks);
+QIB_DIAGC_ATTR(rc_delayed_comp);
+QIB_DIAGC_ATTR(seq_naks);
+QIB_DIAGC_ATTR(rdma_seq);
+QIB_DIAGC_ATTR(rnr_naks);
+QIB_DIAGC_ATTR(other_naks);
+QIB_DIAGC_ATTR(rc_timeouts);
+QIB_DIAGC_ATTR(loop_pkts);
+QIB_DIAGC_ATTR(pkt_drops);
+QIB_DIAGC_ATTR(dmawait);
+QIB_DIAGC_ATTR(unaligned);
+QIB_DIAGC_ATTR(rc_dupreq);
+QIB_DIAGC_ATTR(rc_seqnak);
+
+static struct attribute *diagc_default_attributes[] = {
+	&qib_diagc_attr_rc_resends.attr,
+	&qib_diagc_attr_rc_acks.attr,
+	&qib_diagc_attr_rc_qacks.attr,
+	&qib_diagc_attr_rc_delayed_comp.attr,
+	&qib_diagc_attr_seq_naks.attr,
+	&qib_diagc_attr_rdma_seq.attr,
+	&qib_diagc_attr_rnr_naks.attr,
+	&qib_diagc_attr_other_naks.attr,
+	&qib_diagc_attr_rc_timeouts.attr,
+	&qib_diagc_attr_loop_pkts.attr,
+	&qib_diagc_attr_pkt_drops.attr,
+	&qib_diagc_attr_dmawait.attr,
+	&qib_diagc_attr_unaligned.attr,
+	&qib_diagc_attr_rc_dupreq.attr,
+	&qib_diagc_attr_rc_seqnak.attr,
+	NULL
+};
+
+static ssize_t diagc_attr_show(struct kobject *kobj, struct attribute *attr,
+			       char *buf)
+{
+	struct qib_diagc_attr *dattr =
+		container_of(attr, struct qib_diagc_attr, attr);
+	struct qib_pportdata *ppd =
+		container_of(kobj, struct qib_pportdata, diagc_kobj);
+	struct qib_ibport *qibp = &ppd->ibport_data;
+
+	return sprintf(buf, "%u\n", *(u32 *)((char *)qibp + dattr->counter));
+}
+
+static ssize_t diagc_attr_store(struct kobject *kobj, struct attribute *attr,
+				const char *buf, size_t size)
+{
+	struct qib_diagc_attr *dattr =
+		container_of(attr, struct qib_diagc_attr, attr);
+	struct qib_pportdata *ppd =
+		container_of(kobj, struct qib_pportdata, diagc_kobj);
+	struct qib_ibport *qibp = &ppd->ibport_data;
+	u32 val;
+	int ret;
+
+	ret = kstrtou32(buf, 0, &val);
+	if (ret)
+		return ret;
+	*(u32 *)((char *) qibp + dattr->counter) = val;
+	return size;
+}
+
+static const struct sysfs_ops qib_diagc_ops = {
+	.show = diagc_attr_show,
+	.store = diagc_attr_store,
+};
+
+static struct kobj_type qib_diagc_ktype = {
+	.release = qib_port_release,
+	.sysfs_ops = &qib_diagc_ops,
+	.default_attrs = diagc_default_attributes
+};
+
+/* End diag_counters */
+
+/* end of per-port file structures and support code */
+
+/*
+ * Start of per-unit (or driver, in some cases, but replicated
+ * per unit) functions (these get a device *)
+ */
+static ssize_t show_rev(struct device *device, struct device_attribute *attr,
+			char *buf)
+{
+	struct qib_ibdev *dev =
+		container_of(device, struct qib_ibdev, ibdev.dev);
+
+	return sprintf(buf, "%x\n", dd_from_dev(dev)->minrev);
+}
+
+static ssize_t show_hca(struct device *device, struct device_attribute *attr,
+			char *buf)
+{
+	struct qib_ibdev *dev =
+		container_of(device, struct qib_ibdev, ibdev.dev);
+	struct qib_devdata *dd = dd_from_dev(dev);
+	int ret;
+
+	if (!dd->boardname)
+		ret = -EINVAL;
+	else
+		ret = scnprintf(buf, PAGE_SIZE, "%s\n", dd->boardname);
+	return ret;
+}
+
+static ssize_t show_version(struct device *device,
+			    struct device_attribute *attr, char *buf)
+{
+	/* The string printed here is already newline-terminated. */
+	return scnprintf(buf, PAGE_SIZE, "%s", (char *)ib_qib_version);
+}
+
+static ssize_t show_boardversion(struct device *device,
+				 struct device_attribute *attr, char *buf)
+{
+	struct qib_ibdev *dev =
+		container_of(device, struct qib_ibdev, ibdev.dev);
+	struct qib_devdata *dd = dd_from_dev(dev);
+
+	/* The string printed here is already newline-terminated. */
+	return scnprintf(buf, PAGE_SIZE, "%s", dd->boardversion);
+}
+
+
+static ssize_t show_localbus_info(struct device *device,
+				  struct device_attribute *attr, char *buf)
+{
+	struct qib_ibdev *dev =
+		container_of(device, struct qib_ibdev, ibdev.dev);
+	struct qib_devdata *dd = dd_from_dev(dev);
+
+	/* The string printed here is already newline-terminated. */
+	return scnprintf(buf, PAGE_SIZE, "%s", dd->lbus_info);
+}
+
+
+static ssize_t show_nctxts(struct device *device,
+			   struct device_attribute *attr, char *buf)
+{
+	struct qib_ibdev *dev =
+		container_of(device, struct qib_ibdev, ibdev.dev);
+	struct qib_devdata *dd = dd_from_dev(dev);
+
+	/* Return the number of user ports (contexts) available. */
+	/* The calculation below deals with a special case where
+	 * cfgctxts is set to 1 on a single-port board. */
+	return scnprintf(buf, PAGE_SIZE, "%u\n",
+			(dd->first_user_ctxt > dd->cfgctxts) ? 0 :
+			(dd->cfgctxts - dd->first_user_ctxt));
+}
+
+static ssize_t show_nfreectxts(struct device *device,
+			   struct device_attribute *attr, char *buf)
+{
+	struct qib_ibdev *dev =
+		container_of(device, struct qib_ibdev, ibdev.dev);
+	struct qib_devdata *dd = dd_from_dev(dev);
+
+	/* Return the number of free user ports (contexts) available. */
+	return scnprintf(buf, PAGE_SIZE, "%u\n", dd->freectxts);
+}
+
+static ssize_t show_serial(struct device *device,
+			   struct device_attribute *attr, char *buf)
+{
+	struct qib_ibdev *dev =
+		container_of(device, struct qib_ibdev, ibdev.dev);
+	struct qib_devdata *dd = dd_from_dev(dev);
+
+	buf[sizeof dd->serial] = '\0';
+	memcpy(buf, dd->serial, sizeof dd->serial);
+	strcat(buf, "\n");
+	return strlen(buf);
+}
+
+static ssize_t store_chip_reset(struct device *device,
+				struct device_attribute *attr, const char *buf,
+				size_t count)
+{
+	struct qib_ibdev *dev =
+		container_of(device, struct qib_ibdev, ibdev.dev);
+	struct qib_devdata *dd = dd_from_dev(dev);
+	int ret;
+
+	if (count < 5 || memcmp(buf, "reset", 5) || !dd->diag_client) {
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	ret = qib_reset_device(dd->unit);
+bail:
+	return ret < 0 ? ret : count;
+}
+
+static ssize_t show_logged_errs(struct device *device,
+				struct device_attribute *attr, char *buf)
+{
+	struct qib_ibdev *dev =
+		container_of(device, struct qib_ibdev, ibdev.dev);
+	struct qib_devdata *dd = dd_from_dev(dev);
+	int idx, count;
+
+	/* force consistency with actual EEPROM */
+	if (qib_update_eeprom_log(dd) != 0)
+		return -ENXIO;
+
+	count = 0;
+	for (idx = 0; idx < QIB_EEP_LOG_CNT; ++idx) {
+		count += scnprintf(buf + count, PAGE_SIZE - count, "%d%c",
+				   dd->eep_st_errs[idx],
+				   idx == (QIB_EEP_LOG_CNT - 1) ? '\n' : ' ');
+	}
+
+	return count;
+}
+
+/*
+ * Dump tempsense regs. in decimal, to ease shell-scripts.
+ */
+static ssize_t show_tempsense(struct device *device,
+			      struct device_attribute *attr, char *buf)
+{
+	struct qib_ibdev *dev =
+		container_of(device, struct qib_ibdev, ibdev.dev);
+	struct qib_devdata *dd = dd_from_dev(dev);
+	int ret;
+	int idx;
+	u8 regvals[8];
+
+	ret = -ENXIO;
+	for (idx = 0; idx < 8; ++idx) {
+		if (idx == 6)
+			continue;
+		ret = dd->f_tempsense_rd(dd, idx);
+		if (ret < 0)
+			break;
+		regvals[idx] = ret;
+	}
+	if (idx == 8)
+		ret = scnprintf(buf, PAGE_SIZE, "%d %d %02X %02X %d %d\n",
+				*(signed char *)(regvals),
+				*(signed char *)(regvals + 1),
+				regvals[2], regvals[3],
+				*(signed char *)(regvals + 5),
+				*(signed char *)(regvals + 7));
+	return ret;
+}
+
+/*
+ * end of per-unit (or driver, in some cases, but replicated
+ * per unit) functions
+ */
+
+/* start of per-unit file structures and support code */
+static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
+static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL);
+static DEVICE_ATTR(board_id, S_IRUGO, show_hca, NULL);
+static DEVICE_ATTR(version, S_IRUGO, show_version, NULL);
+static DEVICE_ATTR(nctxts, S_IRUGO, show_nctxts, NULL);
+static DEVICE_ATTR(nfreectxts, S_IRUGO, show_nfreectxts, NULL);
+static DEVICE_ATTR(serial, S_IRUGO, show_serial, NULL);
+static DEVICE_ATTR(boardversion, S_IRUGO, show_boardversion, NULL);
+static DEVICE_ATTR(logged_errors, S_IRUGO, show_logged_errs, NULL);
+static DEVICE_ATTR(tempsense, S_IRUGO, show_tempsense, NULL);
+static DEVICE_ATTR(localbus_info, S_IRUGO, show_localbus_info, NULL);
+static DEVICE_ATTR(chip_reset, S_IWUSR, NULL, store_chip_reset);
+
+static struct device_attribute *qib_attributes[] = {
+	&dev_attr_hw_rev,
+	&dev_attr_hca_type,
+	&dev_attr_board_id,
+	&dev_attr_version,
+	&dev_attr_nctxts,
+	&dev_attr_nfreectxts,
+	&dev_attr_serial,
+	&dev_attr_boardversion,
+	&dev_attr_logged_errors,
+	&dev_attr_tempsense,
+	&dev_attr_localbus_info,
+	&dev_attr_chip_reset,
+};
+
+int qib_create_port_files(struct ib_device *ibdev, u8 port_num,
+			  struct kobject *kobj)
+{
+	struct qib_pportdata *ppd;
+	struct qib_devdata *dd = dd_from_ibdev(ibdev);
+	int ret;
+
+	if (!port_num || port_num > dd->num_pports) {
+		qib_dev_err(dd,
+			"Skipping infiniband class with invalid port %u\n",
+			port_num);
+		ret = -ENODEV;
+		goto bail;
+	}
+	ppd = &dd->pport[port_num - 1];
+
+	ret = kobject_init_and_add(&ppd->pport_kobj, &qib_port_ktype, kobj,
+				   "linkcontrol");
+	if (ret) {
+		qib_dev_err(dd,
+			"Skipping linkcontrol sysfs info, (err %d) port %u\n",
+			ret, port_num);
+		goto bail;
+	}
+	kobject_uevent(&ppd->pport_kobj, KOBJ_ADD);
+
+	ret = kobject_init_and_add(&ppd->sl2vl_kobj, &qib_sl2vl_ktype, kobj,
+				   "sl2vl");
+	if (ret) {
+		qib_dev_err(dd,
+			"Skipping sl2vl sysfs info, (err %d) port %u\n",
+			ret, port_num);
+		goto bail_link;
+	}
+	kobject_uevent(&ppd->sl2vl_kobj, KOBJ_ADD);
+
+	ret = kobject_init_and_add(&ppd->diagc_kobj, &qib_diagc_ktype, kobj,
+				   "diag_counters");
+	if (ret) {
+		qib_dev_err(dd,
+			"Skipping diag_counters sysfs info, (err %d) port %u\n",
+			ret, port_num);
+		goto bail_sl;
+	}
+	kobject_uevent(&ppd->diagc_kobj, KOBJ_ADD);
+
+	if (!qib_cc_table_size || !ppd->congestion_entries_shadow)
+		return 0;
+
+	ret = kobject_init_and_add(&ppd->pport_cc_kobj, &qib_port_cc_ktype,
+				kobj, "CCMgtA");
+	if (ret) {
+		qib_dev_err(dd,
+		 "Skipping Congestion Control sysfs info, (err %d) port %u\n",
+		 ret, port_num);
+		goto bail_diagc;
+	}
+
+	kobject_uevent(&ppd->pport_cc_kobj, KOBJ_ADD);
+
+	ret = sysfs_create_bin_file(&ppd->pport_cc_kobj,
+				&cc_setting_bin_attr);
+	if (ret) {
+		qib_dev_err(dd,
+		 "Skipping Congestion Control setting sysfs info, (err %d) port %u\n",
+		 ret, port_num);
+		goto bail_cc;
+	}
+
+	ret = sysfs_create_bin_file(&ppd->pport_cc_kobj,
+				&cc_table_bin_attr);
+	if (ret) {
+		qib_dev_err(dd,
+		 "Skipping Congestion Control table sysfs info, (err %d) port %u\n",
+		 ret, port_num);
+		goto bail_cc_entry_bin;
+	}
+
+	qib_devinfo(dd->pcidev,
+		"IB%u: Congestion Control Agent enabled for port %d\n",
+		dd->unit, port_num);
+
+	return 0;
+
+bail_cc_entry_bin:
+	sysfs_remove_bin_file(&ppd->pport_cc_kobj, &cc_setting_bin_attr);
+bail_cc:
+	kobject_put(&ppd->pport_cc_kobj);
+bail_diagc:
+	kobject_put(&ppd->diagc_kobj);
+bail_sl:
+	kobject_put(&ppd->sl2vl_kobj);
+bail_link:
+	kobject_put(&ppd->pport_kobj);
+bail:
+	return ret;
+}
+
+/*
+ * Register and create our files in /sys/class/infiniband.
+ */
+int qib_verbs_register_sysfs(struct qib_devdata *dd)
+{
+	struct ib_device *dev = &dd->verbs_dev.ibdev;
+	int i, ret;
+
+	for (i = 0; i < ARRAY_SIZE(qib_attributes); ++i) {
+		ret = device_create_file(&dev->dev, qib_attributes[i]);
+		if (ret)
+			goto bail;
+	}
+
+	return 0;
+bail:
+	for (i = 0; i < ARRAY_SIZE(qib_attributes); ++i)
+		device_remove_file(&dev->dev, qib_attributes[i]);
+	return ret;
+}
+
+/*
+ * Unregister and remove our files in /sys/class/infiniband.
+ */
+void qib_verbs_unregister_sysfs(struct qib_devdata *dd)
+{
+	struct qib_pportdata *ppd;
+	int i;
+
+	for (i = 0; i < dd->num_pports; i++) {
+		ppd = &dd->pport[i];
+		if (qib_cc_table_size &&
+			ppd->congestion_entries_shadow) {
+			sysfs_remove_bin_file(&ppd->pport_cc_kobj,
+				&cc_setting_bin_attr);
+			sysfs_remove_bin_file(&ppd->pport_cc_kobj,
+				&cc_table_bin_attr);
+			kobject_put(&ppd->pport_cc_kobj);
+		}
+		kobject_put(&ppd->sl2vl_kobj);
+		kobject_put(&ppd->pport_kobj);
+	}
+}
diff --git a/drivers/infiniband/hw/qib/qib_twsi.c b/drivers/infiniband/hw/qib/qib_twsi.c
new file mode 100644
index 0000000..647f7be
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_twsi.c
@@ -0,0 +1,500 @@
+/*
+ * Copyright (c) 2012 Intel Corporation. All rights reserved.
+ * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/delay.h>
+#include <linux/pci.h>
+#include <linux/vmalloc.h>
+
+#include "qib.h"
+
+/*
+ * QLogic_IB "Two Wire Serial Interface" driver.
+ * Originally written for a not-quite-i2c serial eeprom, which is
+ * still used on some supported boards. Later boards have added a
+ * variety of other uses, most board-specific, so the bit-boffing
+ * part has been split off to this file, while the other parts
+ * have been moved to chip-specific files.
+ *
+ * We have also dropped all pretense of fully generic (e.g. pretend
+ * we don't know whether '1' is the higher voltage) interface, as
+ * the restrictions of the generic i2c interface (e.g. no access from
+ * driver itself) make it unsuitable for this use.
+ */
+
+#define READ_CMD 1
+#define WRITE_CMD 0
+
+/**
+ * i2c_wait_for_writes - wait for a write
+ * @dd: the qlogic_ib device
+ *
+ * We use this instead of udelay directly, so we can make sure
+ * that previous register writes have been flushed all the way
+ * to the chip.  Since we are delaying anyway, the cost doesn't
+ * hurt, and makes the bit twiddling more regular
+ */
+static void i2c_wait_for_writes(struct qib_devdata *dd)
+{
+	/*
+	 * implicit read of EXTStatus is as good as explicit
+	 * read of scratch, if all we want to do is flush
+	 * writes.
+	 */
+	dd->f_gpio_mod(dd, 0, 0, 0);
+	rmb(); /* inlined, so prevent compiler reordering */
+}
+
+/*
+ * QSFP modules are allowed to hold SCL low for 500uSec. Allow twice that
+ * for "almost compliant" modules
+ */
+#define SCL_WAIT_USEC 1000
+
+/* BUF_WAIT is time bus must be free between STOP or ACK and to next START.
+ * Should be 20, but some chips need more.
+ */
+#define TWSI_BUF_WAIT_USEC 60
+
+static void scl_out(struct qib_devdata *dd, u8 bit)
+{
+	u32 mask;
+
+	udelay(1);
+
+	mask = 1UL << dd->gpio_scl_num;
+
+	/* SCL is meant to be bare-drain, so never set "OUT", just DIR */
+	dd->f_gpio_mod(dd, 0, bit ? 0 : mask, mask);
+
+	/*
+	 * Allow for slow slaves by simple
+	 * delay for falling edge, sampling on rise.
+	 */
+	if (!bit)
+		udelay(2);
+	else {
+		int rise_usec;
+		for (rise_usec = SCL_WAIT_USEC; rise_usec > 0; rise_usec -= 2) {
+			if (mask & dd->f_gpio_mod(dd, 0, 0, 0))
+				break;
+			udelay(2);
+		}
+		if (rise_usec <= 0)
+			qib_dev_err(dd, "SCL interface stuck low > %d uSec\n",
+				    SCL_WAIT_USEC);
+	}
+	i2c_wait_for_writes(dd);
+}
+
+static void sda_out(struct qib_devdata *dd, u8 bit)
+{
+	u32 mask;
+
+	mask = 1UL << dd->gpio_sda_num;
+
+	/* SDA is meant to be bare-drain, so never set "OUT", just DIR */
+	dd->f_gpio_mod(dd, 0, bit ? 0 : mask, mask);
+
+	i2c_wait_for_writes(dd);
+	udelay(2);
+}
+
+static u8 sda_in(struct qib_devdata *dd, int wait)
+{
+	int bnum;
+	u32 read_val, mask;
+
+	bnum = dd->gpio_sda_num;
+	mask = (1UL << bnum);
+	/* SDA is meant to be bare-drain, so never set "OUT", just DIR */
+	dd->f_gpio_mod(dd, 0, 0, mask);
+	read_val = dd->f_gpio_mod(dd, 0, 0, 0);
+	if (wait)
+		i2c_wait_for_writes(dd);
+	return (read_val & mask) >> bnum;
+}
+
+/**
+ * i2c_ackrcv - see if ack following write is true
+ * @dd: the qlogic_ib device
+ */
+static int i2c_ackrcv(struct qib_devdata *dd)
+{
+	u8 ack_received;
+
+	/* AT ENTRY SCL = LOW */
+	/* change direction, ignore data */
+	ack_received = sda_in(dd, 1);
+	scl_out(dd, 1);
+	ack_received = sda_in(dd, 1) == 0;
+	scl_out(dd, 0);
+	return ack_received;
+}
+
+static void stop_cmd(struct qib_devdata *dd);
+
+/**
+ * rd_byte - read a byte, sending STOP on last, else ACK
+ * @dd: the qlogic_ib device
+ *
+ * Returns byte shifted out of device
+ */
+static int rd_byte(struct qib_devdata *dd, int last)
+{
+	int bit_cntr, data;
+
+	data = 0;
+
+	for (bit_cntr = 7; bit_cntr >= 0; --bit_cntr) {
+		data <<= 1;
+		scl_out(dd, 1);
+		data |= sda_in(dd, 0);
+		scl_out(dd, 0);
+	}
+	if (last) {
+		scl_out(dd, 1);
+		stop_cmd(dd);
+	} else {
+		sda_out(dd, 0);
+		scl_out(dd, 1);
+		scl_out(dd, 0);
+		sda_out(dd, 1);
+	}
+	return data;
+}
+
+/**
+ * wr_byte - write a byte, one bit at a time
+ * @dd: the qlogic_ib device
+ * @data: the byte to write
+ *
+ * Returns 0 if we got the following ack, otherwise 1
+ */
+static int wr_byte(struct qib_devdata *dd, u8 data)
+{
+	int bit_cntr;
+	u8 bit;
+
+	for (bit_cntr = 7; bit_cntr >= 0; bit_cntr--) {
+		bit = (data >> bit_cntr) & 1;
+		sda_out(dd, bit);
+		scl_out(dd, 1);
+		scl_out(dd, 0);
+	}
+	return (!i2c_ackrcv(dd)) ? 1 : 0;
+}
+
+/*
+ * issue TWSI start sequence:
+ * (both clock/data high, clock high, data low while clock is high)
+ */
+static void start_seq(struct qib_devdata *dd)
+{
+	sda_out(dd, 1);
+	scl_out(dd, 1);
+	sda_out(dd, 0);
+	udelay(1);
+	scl_out(dd, 0);
+}
+
+/**
+ * stop_seq - transmit the stop sequence
+ * @dd: the qlogic_ib device
+ *
+ * (both clock/data low, clock high, data high while clock is high)
+ */
+static void stop_seq(struct qib_devdata *dd)
+{
+	scl_out(dd, 0);
+	sda_out(dd, 0);
+	scl_out(dd, 1);
+	sda_out(dd, 1);
+}
+
+/**
+ * stop_cmd - transmit the stop condition
+ * @dd: the qlogic_ib device
+ *
+ * (both clock/data low, clock high, data high while clock is high)
+ */
+static void stop_cmd(struct qib_devdata *dd)
+{
+	stop_seq(dd);
+	udelay(TWSI_BUF_WAIT_USEC);
+}
+
+/**
+ * qib_twsi_reset - reset I2C communication
+ * @dd: the qlogic_ib device
+ */
+
+int qib_twsi_reset(struct qib_devdata *dd)
+{
+	int clock_cycles_left = 9;
+	int was_high = 0;
+	u32 pins, mask;
+
+	/* Both SCL and SDA should be high. If not, there
+	 * is something wrong.
+	 */
+	mask = (1UL << dd->gpio_scl_num) | (1UL << dd->gpio_sda_num);
+
+	/*
+	 * Force pins to desired innocuous state.
+	 * This is the default power-on state with out=0 and dir=0,
+	 * So tri-stated and should be floating high (barring HW problems)
+	 */
+	dd->f_gpio_mod(dd, 0, 0, mask);
+
+	/*
+	 * Clock nine times to get all listeners into a sane state.
+	 * If SDA does not go high at any point, we are wedged.
+	 * One vendor recommends then issuing START followed by STOP.
+	 * we cannot use our "normal" functions to do that, because
+	 * if SCL drops between them, another vendor's part will
+	 * wedge, dropping SDA and keeping it low forever, at the end of
+	 * the next transaction (even if it was not the device addressed).
+	 * So our START and STOP take place with SCL held high.
+	 */
+	while (clock_cycles_left--) {
+		scl_out(dd, 0);
+		scl_out(dd, 1);
+		/* Note if SDA is high, but keep clocking to sync slave */
+		was_high |= sda_in(dd, 0);
+	}
+
+	if (was_high) {
+		/*
+		 * We saw a high, which we hope means the slave is sync'd.
+		 * Issue START, STOP, pause for T_BUF.
+		 */
+
+		pins = dd->f_gpio_mod(dd, 0, 0, 0);
+		if ((pins & mask) != mask)
+			qib_dev_err(dd, "GPIO pins not at rest: %d\n",
+				    pins & mask);
+		/* Drop SDA to issue START */
+		udelay(1); /* Guarantee .6 uSec setup */
+		sda_out(dd, 0);
+		udelay(1); /* Guarantee .6 uSec hold */
+		/* At this point, SCL is high, SDA low. Raise SDA for STOP */
+		sda_out(dd, 1);
+		udelay(TWSI_BUF_WAIT_USEC);
+	}
+
+	return !was_high;
+}
+
+#define QIB_TWSI_START 0x100
+#define QIB_TWSI_STOP 0x200
+
+/* Write byte to TWSI, optionally prefixed with START or suffixed with
+ * STOP.
+ * returns 0 if OK (ACK received), else != 0
+ */
+static int qib_twsi_wr(struct qib_devdata *dd, int data, int flags)
+{
+	int ret = 1;
+	if (flags & QIB_TWSI_START)
+		start_seq(dd);
+
+	ret = wr_byte(dd, data); /* Leaves SCL low (from i2c_ackrcv()) */
+
+	if (flags & QIB_TWSI_STOP)
+		stop_cmd(dd);
+	return ret;
+}
+
+/* Added functionality for IBA7220-based cards */
+#define QIB_TEMP_DEV 0x98
+
+/*
+ * qib_twsi_blk_rd
+ * Formerly called qib_eeprom_internal_read, and only used for eeprom,
+ * but now the general interface for data transfer from twsi devices.
+ * One vestige of its former role is that it recognizes a device
+ * QIB_TWSI_NO_DEV and does the correct operation for the legacy part,
+ * which responded to all TWSI device codes, interpreting them as
+ * address within device. On all other devices found on board handled by
+ * this driver, the device is followed by a one-byte "address" which selects
+ * the "register" or "offset" within the device from which data should
+ * be read.
+ */
+int qib_twsi_blk_rd(struct qib_devdata *dd, int dev, int addr,
+		    void *buffer, int len)
+{
+	int ret;
+	u8 *bp = buffer;
+
+	ret = 1;
+
+	if (dev == QIB_TWSI_NO_DEV) {
+		/* legacy not-really-I2C */
+		addr = (addr << 1) | READ_CMD;
+		ret = qib_twsi_wr(dd, addr, QIB_TWSI_START);
+	} else {
+		/* Actual I2C */
+		ret = qib_twsi_wr(dd, dev | WRITE_CMD, QIB_TWSI_START);
+		if (ret) {
+			stop_cmd(dd);
+			ret = 1;
+			goto bail;
+		}
+		/*
+		 * SFF spec claims we do _not_ stop after the addr
+		 * but simply issue a start with the "read" dev-addr.
+		 * Since we are implicitely waiting for ACK here,
+		 * we need t_buf (nominally 20uSec) before that start,
+		 * and cannot rely on the delay built in to the STOP
+		 */
+		ret = qib_twsi_wr(dd, addr, 0);
+		udelay(TWSI_BUF_WAIT_USEC);
+
+		if (ret) {
+			qib_dev_err(dd,
+				"Failed to write interface read addr %02X\n",
+				addr);
+			ret = 1;
+			goto bail;
+		}
+		ret = qib_twsi_wr(dd, dev | READ_CMD, QIB_TWSI_START);
+	}
+	if (ret) {
+		stop_cmd(dd);
+		ret = 1;
+		goto bail;
+	}
+
+	/*
+	 * block devices keeps clocking data out as long as we ack,
+	 * automatically incrementing the address. Some have "pages"
+	 * whose boundaries will not be crossed, but the handling
+	 * of these is left to the caller, who is in a better
+	 * position to know.
+	 */
+	while (len-- > 0) {
+		/*
+		 * Get and store data, sending ACK if length remaining,
+		 * else STOP
+		 */
+		*bp++ = rd_byte(dd, !len);
+	}
+
+	ret = 0;
+
+bail:
+	return ret;
+}
+
+/*
+ * qib_twsi_blk_wr
+ * Formerly called qib_eeprom_internal_write, and only used for eeprom,
+ * but now the general interface for data transfer to twsi devices.
+ * One vestige of its former role is that it recognizes a device
+ * QIB_TWSI_NO_DEV and does the correct operation for the legacy part,
+ * which responded to all TWSI device codes, interpreting them as
+ * address within device. On all other devices found on board handled by
+ * this driver, the device is followed by a one-byte "address" which selects
+ * the "register" or "offset" within the device to which data should
+ * be written.
+ */
+int qib_twsi_blk_wr(struct qib_devdata *dd, int dev, int addr,
+		    const void *buffer, int len)
+{
+	int sub_len;
+	const u8 *bp = buffer;
+	int max_wait_time, i;
+	int ret;
+	ret = 1;
+
+	while (len > 0) {
+		if (dev == QIB_TWSI_NO_DEV) {
+			if (qib_twsi_wr(dd, (addr << 1) | WRITE_CMD,
+					QIB_TWSI_START)) {
+				goto failed_write;
+			}
+		} else {
+			/* Real I2C */
+			if (qib_twsi_wr(dd, dev | WRITE_CMD, QIB_TWSI_START))
+				goto failed_write;
+			ret = qib_twsi_wr(dd, addr, 0);
+			if (ret) {
+				qib_dev_err(dd,
+					"Failed to write interface write addr %02X\n",
+					addr);
+				goto failed_write;
+			}
+		}
+
+		sub_len = min(len, 4);
+		addr += sub_len;
+		len -= sub_len;
+
+		for (i = 0; i < sub_len; i++)
+			if (qib_twsi_wr(dd, *bp++, 0))
+				goto failed_write;
+
+		stop_cmd(dd);
+
+		/*
+		 * Wait for write complete by waiting for a successful
+		 * read (the chip replies with a zero after the write
+		 * cmd completes, and before it writes to the eeprom.
+		 * The startcmd for the read will fail the ack until
+		 * the writes have completed.   We do this inline to avoid
+		 * the debug prints that are in the real read routine
+		 * if the startcmd fails.
+		 * We also use the proper device address, so it doesn't matter
+		 * whether we have real eeprom_dev. Legacy likes any address.
+		 */
+		max_wait_time = 100;
+		while (qib_twsi_wr(dd, dev | READ_CMD, QIB_TWSI_START)) {
+			stop_cmd(dd);
+			if (!--max_wait_time)
+				goto failed_write;
+		}
+		/* now read (and ignore) the resulting byte */
+		rd_byte(dd, 1);
+	}
+
+	ret = 0;
+	goto bail;
+
+failed_write:
+	stop_cmd(dd);
+	ret = 1;
+
+bail:
+	return ret;
+}
diff --git a/drivers/infiniband/hw/qib/qib_tx.c b/drivers/infiniband/hw/qib/qib_tx.c
new file mode 100644
index 0000000..31d3561
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_tx.c
@@ -0,0 +1,571 @@
+/*
+ * Copyright (c) 2008, 2009, 2010 QLogic Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/spinlock.h>
+#include <linux/pci.h>
+#include <linux/io.h>
+#include <linux/delay.h>
+#include <linux/netdevice.h>
+#include <linux/vmalloc.h>
+#include <linux/moduleparam.h>
+
+#include "qib.h"
+
+static unsigned qib_hol_timeout_ms = 3000;
+module_param_named(hol_timeout_ms, qib_hol_timeout_ms, uint, S_IRUGO);
+MODULE_PARM_DESC(hol_timeout_ms,
+		 "duration of user app suspension after link failure");
+
+unsigned qib_sdma_fetch_arb = 1;
+module_param_named(fetch_arb, qib_sdma_fetch_arb, uint, S_IRUGO);
+MODULE_PARM_DESC(fetch_arb, "IBA7220: change SDMA descriptor arbitration");
+
+/**
+ * qib_disarm_piobufs - cancel a range of PIO buffers
+ * @dd: the qlogic_ib device
+ * @first: the first PIO buffer to cancel
+ * @cnt: the number of PIO buffers to cancel
+ *
+ * Cancel a range of PIO buffers. Used at user process close,
+ * in case it died while writing to a PIO buffer.
+ */
+void qib_disarm_piobufs(struct qib_devdata *dd, unsigned first, unsigned cnt)
+{
+	unsigned long flags;
+	unsigned i;
+	unsigned last;
+
+	last = first + cnt;
+	spin_lock_irqsave(&dd->pioavail_lock, flags);
+	for (i = first; i < last; i++) {
+		__clear_bit(i, dd->pio_need_disarm);
+		dd->f_sendctrl(dd->pport, QIB_SENDCTRL_DISARM_BUF(i));
+	}
+	spin_unlock_irqrestore(&dd->pioavail_lock, flags);
+}
+
+/*
+ * This is called by a user process when it sees the DISARM_BUFS event
+ * bit is set.
+ */
+int qib_disarm_piobufs_ifneeded(struct qib_ctxtdata *rcd)
+{
+	struct qib_devdata *dd = rcd->dd;
+	unsigned i;
+	unsigned last;
+	unsigned n = 0;
+
+	last = rcd->pio_base + rcd->piocnt;
+	/*
+	 * Don't need uctxt_lock here, since user has called in to us.
+	 * Clear at start in case more interrupts set bits while we
+	 * are disarming
+	 */
+	if (rcd->user_event_mask) {
+		/*
+		 * subctxt_cnt is 0 if not shared, so do base
+		 * separately, first, then remaining subctxt, if any
+		 */
+		clear_bit(_QIB_EVENT_DISARM_BUFS_BIT, &rcd->user_event_mask[0]);
+		for (i = 1; i < rcd->subctxt_cnt; i++)
+			clear_bit(_QIB_EVENT_DISARM_BUFS_BIT,
+				  &rcd->user_event_mask[i]);
+	}
+	spin_lock_irq(&dd->pioavail_lock);
+	for (i = rcd->pio_base; i < last; i++) {
+		if (__test_and_clear_bit(i, dd->pio_need_disarm)) {
+			n++;
+			dd->f_sendctrl(rcd->ppd, QIB_SENDCTRL_DISARM_BUF(i));
+		}
+	}
+	spin_unlock_irq(&dd->pioavail_lock);
+	return 0;
+}
+
+static struct qib_pportdata *is_sdma_buf(struct qib_devdata *dd, unsigned i)
+{
+	struct qib_pportdata *ppd;
+	unsigned pidx;
+
+	for (pidx = 0; pidx < dd->num_pports; pidx++) {
+		ppd = dd->pport + pidx;
+		if (i >= ppd->sdma_state.first_sendbuf &&
+		    i < ppd->sdma_state.last_sendbuf)
+			return ppd;
+	}
+	return NULL;
+}
+
+/*
+ * Return true if send buffer is being used by a user context.
+ * Sets  _QIB_EVENT_DISARM_BUFS_BIT in user_event_mask as a side effect
+ */
+static int find_ctxt(struct qib_devdata *dd, unsigned bufn)
+{
+	struct qib_ctxtdata *rcd;
+	unsigned ctxt;
+	int ret = 0;
+
+	spin_lock(&dd->uctxt_lock);
+	for (ctxt = dd->first_user_ctxt; ctxt < dd->cfgctxts; ctxt++) {
+		rcd = dd->rcd[ctxt];
+		if (!rcd || bufn < rcd->pio_base ||
+		    bufn >= rcd->pio_base + rcd->piocnt)
+			continue;
+		if (rcd->user_event_mask) {
+			int i;
+			/*
+			 * subctxt_cnt is 0 if not shared, so do base
+			 * separately, first, then remaining subctxt, if any
+			 */
+			set_bit(_QIB_EVENT_DISARM_BUFS_BIT,
+				&rcd->user_event_mask[0]);
+			for (i = 1; i < rcd->subctxt_cnt; i++)
+				set_bit(_QIB_EVENT_DISARM_BUFS_BIT,
+					&rcd->user_event_mask[i]);
+		}
+		ret = 1;
+		break;
+	}
+	spin_unlock(&dd->uctxt_lock);
+
+	return ret;
+}
+
+/*
+ * Disarm a set of send buffers.  If the buffer might be actively being
+ * written to, mark the buffer to be disarmed later when it is not being
+ * written to.
+ *
+ * This should only be called from the IRQ error handler.
+ */
+void qib_disarm_piobufs_set(struct qib_devdata *dd, unsigned long *mask,
+			    unsigned cnt)
+{
+	struct qib_pportdata *ppd, *pppd[QIB_MAX_IB_PORTS];
+	unsigned i;
+	unsigned long flags;
+
+	for (i = 0; i < dd->num_pports; i++)
+		pppd[i] = NULL;
+
+	for (i = 0; i < cnt; i++) {
+		int which;
+		if (!test_bit(i, mask))
+			continue;
+		/*
+		 * If the buffer is owned by the DMA hardware,
+		 * reset the DMA engine.
+		 */
+		ppd = is_sdma_buf(dd, i);
+		if (ppd) {
+			pppd[ppd->port] = ppd;
+			continue;
+		}
+		/*
+		 * If the kernel is writing the buffer or the buffer is
+		 * owned by a user process, we can't clear it yet.
+		 */
+		spin_lock_irqsave(&dd->pioavail_lock, flags);
+		if (test_bit(i, dd->pio_writing) ||
+		    (!test_bit(i << 1, dd->pioavailkernel) &&
+		     find_ctxt(dd, i))) {
+			__set_bit(i, dd->pio_need_disarm);
+			which = 0;
+		} else {
+			which = 1;
+			dd->f_sendctrl(dd->pport, QIB_SENDCTRL_DISARM_BUF(i));
+		}
+		spin_unlock_irqrestore(&dd->pioavail_lock, flags);
+	}
+
+	/* do cancel_sends once per port that had sdma piobufs in error */
+	for (i = 0; i < dd->num_pports; i++)
+		if (pppd[i])
+			qib_cancel_sends(pppd[i]);
+}
+
+/**
+ * update_send_bufs - update shadow copy of the PIO availability map
+ * @dd: the qlogic_ib device
+ *
+ * called whenever our local copy indicates we have run out of send buffers
+ */
+static void update_send_bufs(struct qib_devdata *dd)
+{
+	unsigned long flags;
+	unsigned i;
+	const unsigned piobregs = dd->pioavregs;
+
+	/*
+	 * If the generation (check) bits have changed, then we update the
+	 * busy bit for the corresponding PIO buffer.  This algorithm will
+	 * modify positions to the value they already have in some cases
+	 * (i.e., no change), but it's faster than changing only the bits
+	 * that have changed.
+	 *
+	 * We would like to do this atomicly, to avoid spinlocks in the
+	 * critical send path, but that's not really possible, given the
+	 * type of changes, and that this routine could be called on
+	 * multiple cpu's simultaneously, so we lock in this routine only,
+	 * to avoid conflicting updates; all we change is the shadow, and
+	 * it's a single 64 bit memory location, so by definition the update
+	 * is atomic in terms of what other cpu's can see in testing the
+	 * bits.  The spin_lock overhead isn't too bad, since it only
+	 * happens when all buffers are in use, so only cpu overhead, not
+	 * latency or bandwidth is affected.
+	 */
+	if (!dd->pioavailregs_dma)
+		return;
+	spin_lock_irqsave(&dd->pioavail_lock, flags);
+	for (i = 0; i < piobregs; i++) {
+		u64 pchbusy, pchg, piov, pnew;
+
+		piov = le64_to_cpu(dd->pioavailregs_dma[i]);
+		pchg = dd->pioavailkernel[i] &
+			~(dd->pioavailshadow[i] ^ piov);
+		pchbusy = pchg << QLOGIC_IB_SENDPIOAVAIL_BUSY_SHIFT;
+		if (pchg && (pchbusy & dd->pioavailshadow[i])) {
+			pnew = dd->pioavailshadow[i] & ~pchbusy;
+			pnew |= piov & pchbusy;
+			dd->pioavailshadow[i] = pnew;
+		}
+	}
+	spin_unlock_irqrestore(&dd->pioavail_lock, flags);
+}
+
+/*
+ * Debugging code and stats updates if no pio buffers available.
+ */
+static noinline void no_send_bufs(struct qib_devdata *dd)
+{
+	dd->upd_pio_shadow = 1;
+
+	/* not atomic, but if we lose a stat count in a while, that's OK */
+	qib_stats.sps_nopiobufs++;
+}
+
+/*
+ * Common code for normal driver send buffer allocation, and reserved
+ * allocation.
+ *
+ * Do appropriate marking as busy, etc.
+ * Returns buffer pointer if one is found, otherwise NULL.
+ */
+u32 __iomem *qib_getsendbuf_range(struct qib_devdata *dd, u32 *pbufnum,
+				  u32 first, u32 last)
+{
+	unsigned i, j, updated = 0;
+	unsigned nbufs;
+	unsigned long flags;
+	unsigned long *shadow = dd->pioavailshadow;
+	u32 __iomem *buf;
+
+	if (!(dd->flags & QIB_PRESENT))
+		return NULL;
+
+	nbufs = last - first + 1; /* number in range to check */
+	if (dd->upd_pio_shadow) {
+update_shadow:
+		/*
+		 * Minor optimization.  If we had no buffers on last call,
+		 * start out by doing the update; continue and do scan even
+		 * if no buffers were updated, to be paranoid.
+		 */
+		update_send_bufs(dd);
+		updated++;
+	}
+	i = first;
+	/*
+	 * While test_and_set_bit() is atomic, we do that and then the
+	 * change_bit(), and the pair is not.  See if this is the cause
+	 * of the remaining armlaunch errors.
+	 */
+	spin_lock_irqsave(&dd->pioavail_lock, flags);
+	if (dd->last_pio >= first && dd->last_pio <= last)
+		i = dd->last_pio + 1;
+	if (!first)
+		/* adjust to min possible  */
+		nbufs = last - dd->min_kernel_pio + 1;
+	for (j = 0; j < nbufs; j++, i++) {
+		if (i > last)
+			i = !first ? dd->min_kernel_pio : first;
+		if (__test_and_set_bit((2 * i) + 1, shadow))
+			continue;
+		/* flip generation bit */
+		__change_bit(2 * i, shadow);
+		/* remember that the buffer can be written to now */
+		__set_bit(i, dd->pio_writing);
+		if (!first && first != last) /* first == last on VL15, avoid */
+			dd->last_pio = i;
+		break;
+	}
+	spin_unlock_irqrestore(&dd->pioavail_lock, flags);
+
+	if (j == nbufs) {
+		if (!updated)
+			/*
+			 * First time through; shadow exhausted, but may be
+			 * buffers available, try an update and then rescan.
+			 */
+			goto update_shadow;
+		no_send_bufs(dd);
+		buf = NULL;
+	} else {
+		if (i < dd->piobcnt2k)
+			buf = (u32 __iomem *)(dd->pio2kbase +
+				i * dd->palign);
+		else if (i < dd->piobcnt2k + dd->piobcnt4k || !dd->piovl15base)
+			buf = (u32 __iomem *)(dd->pio4kbase +
+				(i - dd->piobcnt2k) * dd->align4k);
+		else
+			buf = (u32 __iomem *)(dd->piovl15base +
+				(i - (dd->piobcnt2k + dd->piobcnt4k)) *
+				dd->align4k);
+		if (pbufnum)
+			*pbufnum = i;
+		dd->upd_pio_shadow = 0;
+	}
+
+	return buf;
+}
+
+/*
+ * Record that the caller is finished writing to the buffer so we don't
+ * disarm it while it is being written and disarm it now if needed.
+ */
+void qib_sendbuf_done(struct qib_devdata *dd, unsigned n)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&dd->pioavail_lock, flags);
+	__clear_bit(n, dd->pio_writing);
+	if (__test_and_clear_bit(n, dd->pio_need_disarm))
+		dd->f_sendctrl(dd->pport, QIB_SENDCTRL_DISARM_BUF(n));
+	spin_unlock_irqrestore(&dd->pioavail_lock, flags);
+}
+
+/**
+ * qib_chg_pioavailkernel - change which send buffers are available for kernel
+ * @dd: the qlogic_ib device
+ * @start: the starting send buffer number
+ * @len: the number of send buffers
+ * @avail: true if the buffers are available for kernel use, false otherwise
+ */
+void qib_chg_pioavailkernel(struct qib_devdata *dd, unsigned start,
+	unsigned len, u32 avail, struct qib_ctxtdata *rcd)
+{
+	unsigned long flags;
+	unsigned end;
+	unsigned ostart = start;
+
+	/* There are two bits per send buffer (busy and generation) */
+	start *= 2;
+	end = start + len * 2;
+
+	spin_lock_irqsave(&dd->pioavail_lock, flags);
+	/* Set or clear the busy bit in the shadow. */
+	while (start < end) {
+		if (avail) {
+			unsigned long dma;
+			int i;
+
+			/*
+			 * The BUSY bit will never be set, because we disarm
+			 * the user buffers before we hand them back to the
+			 * kernel.  We do have to make sure the generation
+			 * bit is set correctly in shadow, since it could
+			 * have changed many times while allocated to user.
+			 * We can't use the bitmap functions on the full
+			 * dma array because it is always little-endian, so
+			 * we have to flip to host-order first.
+			 * BITS_PER_LONG is slightly wrong, since it's
+			 * always 64 bits per register in chip...
+			 * We only work on 64 bit kernels, so that's OK.
+			 */
+			i = start / BITS_PER_LONG;
+			__clear_bit(QLOGIC_IB_SENDPIOAVAIL_BUSY_SHIFT + start,
+				    dd->pioavailshadow);
+			dma = (unsigned long)
+				le64_to_cpu(dd->pioavailregs_dma[i]);
+			if (test_bit((QLOGIC_IB_SENDPIOAVAIL_CHECK_SHIFT +
+				      start) % BITS_PER_LONG, &dma))
+				__set_bit(QLOGIC_IB_SENDPIOAVAIL_CHECK_SHIFT +
+					  start, dd->pioavailshadow);
+			else
+				__clear_bit(QLOGIC_IB_SENDPIOAVAIL_CHECK_SHIFT
+					    + start, dd->pioavailshadow);
+			__set_bit(start, dd->pioavailkernel);
+			if ((start >> 1) < dd->min_kernel_pio)
+				dd->min_kernel_pio = start >> 1;
+		} else {
+			__set_bit(start + QLOGIC_IB_SENDPIOAVAIL_BUSY_SHIFT,
+				  dd->pioavailshadow);
+			__clear_bit(start, dd->pioavailkernel);
+			if ((start >> 1) > dd->min_kernel_pio)
+				dd->min_kernel_pio = start >> 1;
+		}
+		start += 2;
+	}
+
+	if (dd->min_kernel_pio > 0 && dd->last_pio < dd->min_kernel_pio - 1)
+		dd->last_pio = dd->min_kernel_pio - 1;
+	spin_unlock_irqrestore(&dd->pioavail_lock, flags);
+
+	dd->f_txchk_change(dd, ostart, len, avail, rcd);
+}
+
+/*
+ * Flush all sends that might be in the ready to send state, as well as any
+ * that are in the process of being sent.  Used whenever we need to be
+ * sure the send side is idle.  Cleans up all buffer state by canceling
+ * all pio buffers, and issuing an abort, which cleans up anything in the
+ * launch fifo.  The cancel is superfluous on some chip versions, but
+ * it's safer to always do it.
+ * PIOAvail bits are updated by the chip as if a normal send had happened.
+ */
+void qib_cancel_sends(struct qib_pportdata *ppd)
+{
+	struct qib_devdata *dd = ppd->dd;
+	struct qib_ctxtdata *rcd;
+	unsigned long flags;
+	unsigned ctxt;
+	unsigned i;
+	unsigned last;
+
+	/*
+	 * Tell PSM to disarm buffers again before trying to reuse them.
+	 * We need to be sure the rcd doesn't change out from under us
+	 * while we do so.  We hold the two locks sequentially.  We might
+	 * needlessly set some need_disarm bits as a result, if the
+	 * context is closed after we release the uctxt_lock, but that's
+	 * fairly benign, and safer than nesting the locks.
+	 */
+	for (ctxt = dd->first_user_ctxt; ctxt < dd->cfgctxts; ctxt++) {
+		spin_lock_irqsave(&dd->uctxt_lock, flags);
+		rcd = dd->rcd[ctxt];
+		if (rcd && rcd->ppd == ppd) {
+			last = rcd->pio_base + rcd->piocnt;
+			if (rcd->user_event_mask) {
+				/*
+				 * subctxt_cnt is 0 if not shared, so do base
+				 * separately, first, then remaining subctxt,
+				 * if any
+				 */
+				set_bit(_QIB_EVENT_DISARM_BUFS_BIT,
+					&rcd->user_event_mask[0]);
+				for (i = 1; i < rcd->subctxt_cnt; i++)
+					set_bit(_QIB_EVENT_DISARM_BUFS_BIT,
+						&rcd->user_event_mask[i]);
+			}
+			i = rcd->pio_base;
+			spin_unlock_irqrestore(&dd->uctxt_lock, flags);
+			spin_lock_irqsave(&dd->pioavail_lock, flags);
+			for (; i < last; i++)
+				__set_bit(i, dd->pio_need_disarm);
+			spin_unlock_irqrestore(&dd->pioavail_lock, flags);
+		} else
+			spin_unlock_irqrestore(&dd->uctxt_lock, flags);
+	}
+
+	if (!(dd->flags & QIB_HAS_SEND_DMA))
+		dd->f_sendctrl(ppd, QIB_SENDCTRL_DISARM_ALL |
+				    QIB_SENDCTRL_FLUSH);
+}
+
+/*
+ * Force an update of in-memory copy of the pioavail registers, when
+ * needed for any of a variety of reasons.
+ * If already off, this routine is a nop, on the assumption that the
+ * caller (or set of callers) will "do the right thing".
+ * This is a per-device operation, so just the first port.
+ */
+void qib_force_pio_avail_update(struct qib_devdata *dd)
+{
+	dd->f_sendctrl(dd->pport, QIB_SENDCTRL_AVAIL_BLIP);
+}
+
+void qib_hol_down(struct qib_pportdata *ppd)
+{
+	/*
+	 * Cancel sends when the link goes DOWN so that we aren't doing it
+	 * at INIT when we might be trying to send SMI packets.
+	 */
+	if (!(ppd->lflags & QIBL_IB_AUTONEG_INPROG))
+		qib_cancel_sends(ppd);
+}
+
+/*
+ * Link is at INIT.
+ * We start the HoL timer so we can detect stuck packets blocking SMP replies.
+ * Timer may already be running, so use mod_timer, not add_timer.
+ */
+void qib_hol_init(struct qib_pportdata *ppd)
+{
+	if (ppd->hol_state != QIB_HOL_INIT) {
+		ppd->hol_state = QIB_HOL_INIT;
+		mod_timer(&ppd->hol_timer,
+			  jiffies + msecs_to_jiffies(qib_hol_timeout_ms));
+	}
+}
+
+/*
+ * Link is up, continue any user processes, and ensure timer
+ * is a nop, if running.  Let timer keep running, if set; it
+ * will nop when it sees the link is up.
+ */
+void qib_hol_up(struct qib_pportdata *ppd)
+{
+	ppd->hol_state = QIB_HOL_UP;
+}
+
+/*
+ * This is only called via the timer.
+ */
+void qib_hol_event(unsigned long opaque)
+{
+	struct qib_pportdata *ppd = (struct qib_pportdata *)opaque;
+
+	/* If hardware error, etc, skip. */
+	if (!(ppd->dd->flags & QIB_INITTED))
+		return;
+
+	if (ppd->hol_state != QIB_HOL_UP) {
+		/*
+		 * Try to flush sends in case a stuck packet is blocking
+		 * SMP replies.
+		 */
+		qib_hol_down(ppd);
+		mod_timer(&ppd->hol_timer,
+			  jiffies + msecs_to_jiffies(qib_hol_timeout_ms));
+	}
+}
diff --git a/drivers/infiniband/hw/qib/qib_uc.c b/drivers/infiniband/hw/qib/qib_uc.c
new file mode 100644
index 0000000..aa3a803
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_uc.c
@@ -0,0 +1,536 @@
+/*
+ * Copyright (c) 2006, 2007, 2008, 2009, 2010 QLogic Corporation.
+ * All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "qib.h"
+
+/* cut down ridiculously long IB macro names */
+#define OP(x) IB_OPCODE_UC_##x
+
+/**
+ * qib_make_uc_req - construct a request packet (SEND, RDMA write)
+ * @qp: a pointer to the QP
+ *
+ * Return 1 if constructed; otherwise, return 0.
+ */
+int qib_make_uc_req(struct qib_qp *qp)
+{
+	struct qib_other_headers *ohdr;
+	struct qib_swqe *wqe;
+	unsigned long flags;
+	u32 hwords;
+	u32 bth0;
+	u32 len;
+	u32 pmtu = qp->pmtu;
+	int ret = 0;
+
+	spin_lock_irqsave(&qp->s_lock, flags);
+
+	if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_SEND_OK)) {
+		if (!(ib_qib_state_ops[qp->state] & QIB_FLUSH_SEND))
+			goto bail;
+		/* We are in the error state, flush the work request. */
+		if (qp->s_last == qp->s_head)
+			goto bail;
+		/* If DMAs are in progress, we can't flush immediately. */
+		if (atomic_read(&qp->s_dma_busy)) {
+			qp->s_flags |= QIB_S_WAIT_DMA;
+			goto bail;
+		}
+		wqe = get_swqe_ptr(qp, qp->s_last);
+		qib_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR);
+		goto done;
+	}
+
+	ohdr = &qp->s_hdr->u.oth;
+	if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
+		ohdr = &qp->s_hdr->u.l.oth;
+
+	/* header size in 32-bit words LRH+BTH = (8+12)/4. */
+	hwords = 5;
+	bth0 = 0;
+
+	/* Get the next send request. */
+	wqe = get_swqe_ptr(qp, qp->s_cur);
+	qp->s_wqe = NULL;
+	switch (qp->s_state) {
+	default:
+		if (!(ib_qib_state_ops[qp->state] &
+		    QIB_PROCESS_NEXT_SEND_OK))
+			goto bail;
+		/* Check if send work queue is empty. */
+		if (qp->s_cur == qp->s_head)
+			goto bail;
+		/*
+		 * Start a new request.
+		 */
+		wqe->psn = qp->s_next_psn;
+		qp->s_psn = qp->s_next_psn;
+		qp->s_sge.sge = wqe->sg_list[0];
+		qp->s_sge.sg_list = wqe->sg_list + 1;
+		qp->s_sge.num_sge = wqe->wr.num_sge;
+		qp->s_sge.total_len = wqe->length;
+		len = wqe->length;
+		qp->s_len = len;
+		switch (wqe->wr.opcode) {
+		case IB_WR_SEND:
+		case IB_WR_SEND_WITH_IMM:
+			if (len > pmtu) {
+				qp->s_state = OP(SEND_FIRST);
+				len = pmtu;
+				break;
+			}
+			if (wqe->wr.opcode == IB_WR_SEND)
+				qp->s_state = OP(SEND_ONLY);
+			else {
+				qp->s_state =
+					OP(SEND_ONLY_WITH_IMMEDIATE);
+				/* Immediate data comes after the BTH */
+				ohdr->u.imm_data = wqe->wr.ex.imm_data;
+				hwords += 1;
+			}
+			if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+				bth0 |= IB_BTH_SOLICITED;
+			qp->s_wqe = wqe;
+			if (++qp->s_cur >= qp->s_size)
+				qp->s_cur = 0;
+			break;
+
+		case IB_WR_RDMA_WRITE:
+		case IB_WR_RDMA_WRITE_WITH_IMM:
+			ohdr->u.rc.reth.vaddr =
+				cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
+			ohdr->u.rc.reth.rkey =
+				cpu_to_be32(wqe->wr.wr.rdma.rkey);
+			ohdr->u.rc.reth.length = cpu_to_be32(len);
+			hwords += sizeof(struct ib_reth) / 4;
+			if (len > pmtu) {
+				qp->s_state = OP(RDMA_WRITE_FIRST);
+				len = pmtu;
+				break;
+			}
+			if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
+				qp->s_state = OP(RDMA_WRITE_ONLY);
+			else {
+				qp->s_state =
+					OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE);
+				/* Immediate data comes after the RETH */
+				ohdr->u.rc.imm_data = wqe->wr.ex.imm_data;
+				hwords += 1;
+				if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+					bth0 |= IB_BTH_SOLICITED;
+			}
+			qp->s_wqe = wqe;
+			if (++qp->s_cur >= qp->s_size)
+				qp->s_cur = 0;
+			break;
+
+		default:
+			goto bail;
+		}
+		break;
+
+	case OP(SEND_FIRST):
+		qp->s_state = OP(SEND_MIDDLE);
+		/* FALLTHROUGH */
+	case OP(SEND_MIDDLE):
+		len = qp->s_len;
+		if (len > pmtu) {
+			len = pmtu;
+			break;
+		}
+		if (wqe->wr.opcode == IB_WR_SEND)
+			qp->s_state = OP(SEND_LAST);
+		else {
+			qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);
+			/* Immediate data comes after the BTH */
+			ohdr->u.imm_data = wqe->wr.ex.imm_data;
+			hwords += 1;
+		}
+		if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+			bth0 |= IB_BTH_SOLICITED;
+		qp->s_wqe = wqe;
+		if (++qp->s_cur >= qp->s_size)
+			qp->s_cur = 0;
+		break;
+
+	case OP(RDMA_WRITE_FIRST):
+		qp->s_state = OP(RDMA_WRITE_MIDDLE);
+		/* FALLTHROUGH */
+	case OP(RDMA_WRITE_MIDDLE):
+		len = qp->s_len;
+		if (len > pmtu) {
+			len = pmtu;
+			break;
+		}
+		if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
+			qp->s_state = OP(RDMA_WRITE_LAST);
+		else {
+			qp->s_state =
+				OP(RDMA_WRITE_LAST_WITH_IMMEDIATE);
+			/* Immediate data comes after the BTH */
+			ohdr->u.imm_data = wqe->wr.ex.imm_data;
+			hwords += 1;
+			if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+				bth0 |= IB_BTH_SOLICITED;
+		}
+		qp->s_wqe = wqe;
+		if (++qp->s_cur >= qp->s_size)
+			qp->s_cur = 0;
+		break;
+	}
+	qp->s_len -= len;
+	qp->s_hdrwords = hwords;
+	qp->s_cur_sge = &qp->s_sge;
+	qp->s_cur_size = len;
+	qib_make_ruc_header(qp, ohdr, bth0 | (qp->s_state << 24),
+			    qp->s_next_psn++ & QIB_PSN_MASK);
+done:
+	ret = 1;
+	goto unlock;
+
+bail:
+	qp->s_flags &= ~QIB_S_BUSY;
+unlock:
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+	return ret;
+}
+
+/**
+ * qib_uc_rcv - handle an incoming UC packet
+ * @ibp: the port the packet came in on
+ * @hdr: the header of the packet
+ * @has_grh: true if the packet has a GRH
+ * @data: the packet data
+ * @tlen: the length of the packet
+ * @qp: the QP for this packet.
+ *
+ * This is called from qib_qp_rcv() to process an incoming UC packet
+ * for the given QP.
+ * Called at interrupt level.
+ */
+void qib_uc_rcv(struct qib_ibport *ibp, struct qib_ib_header *hdr,
+		int has_grh, void *data, u32 tlen, struct qib_qp *qp)
+{
+	struct qib_other_headers *ohdr;
+	u32 opcode;
+	u32 hdrsize;
+	u32 psn;
+	u32 pad;
+	struct ib_wc wc;
+	u32 pmtu = qp->pmtu;
+	struct ib_reth *reth;
+	int ret;
+
+	/* Check for GRH */
+	if (!has_grh) {
+		ohdr = &hdr->u.oth;
+		hdrsize = 8 + 12;       /* LRH + BTH */
+	} else {
+		ohdr = &hdr->u.l.oth;
+		hdrsize = 8 + 40 + 12;  /* LRH + GRH + BTH */
+	}
+
+	opcode = be32_to_cpu(ohdr->bth[0]);
+	if (qib_ruc_check_hdr(ibp, hdr, has_grh, qp, opcode))
+		return;
+
+	psn = be32_to_cpu(ohdr->bth[2]);
+	opcode >>= 24;
+
+	/* Compare the PSN verses the expected PSN. */
+	if (unlikely(qib_cmp24(psn, qp->r_psn) != 0)) {
+		/*
+		 * Handle a sequence error.
+		 * Silently drop any current message.
+		 */
+		qp->r_psn = psn;
+inv:
+		if (qp->r_state == OP(SEND_FIRST) ||
+		    qp->r_state == OP(SEND_MIDDLE)) {
+			set_bit(QIB_R_REWIND_SGE, &qp->r_aflags);
+			qp->r_sge.num_sge = 0;
+		} else
+			qib_put_ss(&qp->r_sge);
+		qp->r_state = OP(SEND_LAST);
+		switch (opcode) {
+		case OP(SEND_FIRST):
+		case OP(SEND_ONLY):
+		case OP(SEND_ONLY_WITH_IMMEDIATE):
+			goto send_first;
+
+		case OP(RDMA_WRITE_FIRST):
+		case OP(RDMA_WRITE_ONLY):
+		case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):
+			goto rdma_first;
+
+		default:
+			goto drop;
+		}
+	}
+
+	/* Check for opcode sequence errors. */
+	switch (qp->r_state) {
+	case OP(SEND_FIRST):
+	case OP(SEND_MIDDLE):
+		if (opcode == OP(SEND_MIDDLE) ||
+		    opcode == OP(SEND_LAST) ||
+		    opcode == OP(SEND_LAST_WITH_IMMEDIATE))
+			break;
+		goto inv;
+
+	case OP(RDMA_WRITE_FIRST):
+	case OP(RDMA_WRITE_MIDDLE):
+		if (opcode == OP(RDMA_WRITE_MIDDLE) ||
+		    opcode == OP(RDMA_WRITE_LAST) ||
+		    opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
+			break;
+		goto inv;
+
+	default:
+		if (opcode == OP(SEND_FIRST) ||
+		    opcode == OP(SEND_ONLY) ||
+		    opcode == OP(SEND_ONLY_WITH_IMMEDIATE) ||
+		    opcode == OP(RDMA_WRITE_FIRST) ||
+		    opcode == OP(RDMA_WRITE_ONLY) ||
+		    opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE))
+			break;
+		goto inv;
+	}
+
+	if (qp->state == IB_QPS_RTR && !(qp->r_flags & QIB_R_COMM_EST)) {
+		qp->r_flags |= QIB_R_COMM_EST;
+		if (qp->ibqp.event_handler) {
+			struct ib_event ev;
+
+			ev.device = qp->ibqp.device;
+			ev.element.qp = &qp->ibqp;
+			ev.event = IB_EVENT_COMM_EST;
+			qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
+		}
+	}
+
+	/* OK, process the packet. */
+	switch (opcode) {
+	case OP(SEND_FIRST):
+	case OP(SEND_ONLY):
+	case OP(SEND_ONLY_WITH_IMMEDIATE):
+send_first:
+		if (test_and_clear_bit(QIB_R_REWIND_SGE, &qp->r_aflags))
+			qp->r_sge = qp->s_rdma_read_sge;
+		else {
+			ret = qib_get_rwqe(qp, 0);
+			if (ret < 0)
+				goto op_err;
+			if (!ret)
+				goto drop;
+			/*
+			 * qp->s_rdma_read_sge will be the owner
+			 * of the mr references.
+			 */
+			qp->s_rdma_read_sge = qp->r_sge;
+		}
+		qp->r_rcv_len = 0;
+		if (opcode == OP(SEND_ONLY))
+			goto no_immediate_data;
+		else if (opcode == OP(SEND_ONLY_WITH_IMMEDIATE))
+			goto send_last_imm;
+		/* FALLTHROUGH */
+	case OP(SEND_MIDDLE):
+		/* Check for invalid length PMTU or posted rwqe len. */
+		if (unlikely(tlen != (hdrsize + pmtu + 4)))
+			goto rewind;
+		qp->r_rcv_len += pmtu;
+		if (unlikely(qp->r_rcv_len > qp->r_len))
+			goto rewind;
+		qib_copy_sge(&qp->r_sge, data, pmtu, 0);
+		break;
+
+	case OP(SEND_LAST_WITH_IMMEDIATE):
+send_last_imm:
+		wc.ex.imm_data = ohdr->u.imm_data;
+		hdrsize += 4;
+		wc.wc_flags = IB_WC_WITH_IMM;
+		goto send_last;
+	case OP(SEND_LAST):
+no_immediate_data:
+		wc.ex.imm_data = 0;
+		wc.wc_flags = 0;
+send_last:
+		/* Get the number of bytes the message was padded by. */
+		pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+		/* Check for invalid length. */
+		/* XXX LAST len should be >= 1 */
+		if (unlikely(tlen < (hdrsize + pad + 4)))
+			goto rewind;
+		/* Don't count the CRC. */
+		tlen -= (hdrsize + pad + 4);
+		wc.byte_len = tlen + qp->r_rcv_len;
+		if (unlikely(wc.byte_len > qp->r_len))
+			goto rewind;
+		wc.opcode = IB_WC_RECV;
+		qib_copy_sge(&qp->r_sge, data, tlen, 0);
+		qib_put_ss(&qp->s_rdma_read_sge);
+last_imm:
+		wc.wr_id = qp->r_wr_id;
+		wc.status = IB_WC_SUCCESS;
+		wc.qp = &qp->ibqp;
+		wc.src_qp = qp->remote_qpn;
+		wc.slid = qp->remote_ah_attr.dlid;
+		wc.sl = qp->remote_ah_attr.sl;
+		/* zero fields that are N/A */
+		wc.vendor_err = 0;
+		wc.pkey_index = 0;
+		wc.dlid_path_bits = 0;
+		wc.port_num = 0;
+		/* Signal completion event if the solicited bit is set. */
+		qib_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
+			     (ohdr->bth[0] &
+				cpu_to_be32(IB_BTH_SOLICITED)) != 0);
+		break;
+
+	case OP(RDMA_WRITE_FIRST):
+	case OP(RDMA_WRITE_ONLY):
+	case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE): /* consume RWQE */
+rdma_first:
+		if (unlikely(!(qp->qp_access_flags &
+			       IB_ACCESS_REMOTE_WRITE))) {
+			goto drop;
+		}
+		reth = &ohdr->u.rc.reth;
+		hdrsize += sizeof(*reth);
+		qp->r_len = be32_to_cpu(reth->length);
+		qp->r_rcv_len = 0;
+		qp->r_sge.sg_list = NULL;
+		if (qp->r_len != 0) {
+			u32 rkey = be32_to_cpu(reth->rkey);
+			u64 vaddr = be64_to_cpu(reth->vaddr);
+			int ok;
+
+			/* Check rkey */
+			ok = qib_rkey_ok(qp, &qp->r_sge.sge, qp->r_len,
+					 vaddr, rkey, IB_ACCESS_REMOTE_WRITE);
+			if (unlikely(!ok))
+				goto drop;
+			qp->r_sge.num_sge = 1;
+		} else {
+			qp->r_sge.num_sge = 0;
+			qp->r_sge.sge.mr = NULL;
+			qp->r_sge.sge.vaddr = NULL;
+			qp->r_sge.sge.length = 0;
+			qp->r_sge.sge.sge_length = 0;
+		}
+		if (opcode == OP(RDMA_WRITE_ONLY))
+			goto rdma_last;
+		else if (opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE)) {
+			wc.ex.imm_data = ohdr->u.rc.imm_data;
+			goto rdma_last_imm;
+		}
+		/* FALLTHROUGH */
+	case OP(RDMA_WRITE_MIDDLE):
+		/* Check for invalid length PMTU or posted rwqe len. */
+		if (unlikely(tlen != (hdrsize + pmtu + 4)))
+			goto drop;
+		qp->r_rcv_len += pmtu;
+		if (unlikely(qp->r_rcv_len > qp->r_len))
+			goto drop;
+		qib_copy_sge(&qp->r_sge, data, pmtu, 1);
+		break;
+
+	case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
+		wc.ex.imm_data = ohdr->u.imm_data;
+rdma_last_imm:
+		hdrsize += 4;
+		wc.wc_flags = IB_WC_WITH_IMM;
+
+		/* Get the number of bytes the message was padded by. */
+		pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+		/* Check for invalid length. */
+		/* XXX LAST len should be >= 1 */
+		if (unlikely(tlen < (hdrsize + pad + 4)))
+			goto drop;
+		/* Don't count the CRC. */
+		tlen -= (hdrsize + pad + 4);
+		if (unlikely(tlen + qp->r_rcv_len != qp->r_len))
+			goto drop;
+		if (test_and_clear_bit(QIB_R_REWIND_SGE, &qp->r_aflags))
+			qib_put_ss(&qp->s_rdma_read_sge);
+		else {
+			ret = qib_get_rwqe(qp, 1);
+			if (ret < 0)
+				goto op_err;
+			if (!ret)
+				goto drop;
+		}
+		wc.byte_len = qp->r_len;
+		wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
+		qib_copy_sge(&qp->r_sge, data, tlen, 1);
+		qib_put_ss(&qp->r_sge);
+		goto last_imm;
+
+	case OP(RDMA_WRITE_LAST):
+rdma_last:
+		/* Get the number of bytes the message was padded by. */
+		pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+		/* Check for invalid length. */
+		/* XXX LAST len should be >= 1 */
+		if (unlikely(tlen < (hdrsize + pad + 4)))
+			goto drop;
+		/* Don't count the CRC. */
+		tlen -= (hdrsize + pad + 4);
+		if (unlikely(tlen + qp->r_rcv_len != qp->r_len))
+			goto drop;
+		qib_copy_sge(&qp->r_sge, data, tlen, 1);
+		qib_put_ss(&qp->r_sge);
+		break;
+
+	default:
+		/* Drop packet for unknown opcodes. */
+		goto drop;
+	}
+	qp->r_psn++;
+	qp->r_state = opcode;
+	return;
+
+rewind:
+	set_bit(QIB_R_REWIND_SGE, &qp->r_aflags);
+	qp->r_sge.num_sge = 0;
+drop:
+	ibp->n_pkt_drops++;
+	return;
+
+op_err:
+	qib_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
+	return;
+
+}
diff --git a/drivers/infiniband/hw/qib/qib_ud.c b/drivers/infiniband/hw/qib/qib_ud.c
new file mode 100644
index 0000000..aaf7039
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_ud.c
@@ -0,0 +1,590 @@
+/*
+ * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <rdma/ib_smi.h>
+
+#include "qib.h"
+#include "qib_mad.h"
+
+/**
+ * qib_ud_loopback - handle send on loopback QPs
+ * @sqp: the sending QP
+ * @swqe: the send work request
+ *
+ * This is called from qib_make_ud_req() to forward a WQE addressed
+ * to the same HCA.
+ * Note that the receive interrupt handler may be calling qib_ud_rcv()
+ * while this is being called.
+ */
+static void qib_ud_loopback(struct qib_qp *sqp, struct qib_swqe *swqe)
+{
+	struct qib_ibport *ibp = to_iport(sqp->ibqp.device, sqp->port_num);
+	struct qib_pportdata *ppd;
+	struct qib_qp *qp;
+	struct ib_ah_attr *ah_attr;
+	unsigned long flags;
+	struct qib_sge_state ssge;
+	struct qib_sge *sge;
+	struct ib_wc wc;
+	u32 length;
+	enum ib_qp_type sqptype, dqptype;
+
+	qp = qib_lookup_qpn(ibp, swqe->wr.wr.ud.remote_qpn);
+	if (!qp) {
+		ibp->n_pkt_drops++;
+		return;
+	}
+
+	sqptype = sqp->ibqp.qp_type == IB_QPT_GSI ?
+			IB_QPT_UD : sqp->ibqp.qp_type;
+	dqptype = qp->ibqp.qp_type == IB_QPT_GSI ?
+			IB_QPT_UD : qp->ibqp.qp_type;
+
+	if (dqptype != sqptype ||
+	    !(ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK)) {
+		ibp->n_pkt_drops++;
+		goto drop;
+	}
+
+	ah_attr = &to_iah(swqe->wr.wr.ud.ah)->attr;
+	ppd = ppd_from_ibp(ibp);
+
+	if (qp->ibqp.qp_num > 1) {
+		u16 pkey1;
+		u16 pkey2;
+		u16 lid;
+
+		pkey1 = qib_get_pkey(ibp, sqp->s_pkey_index);
+		pkey2 = qib_get_pkey(ibp, qp->s_pkey_index);
+		if (unlikely(!qib_pkey_ok(pkey1, pkey2))) {
+			lid = ppd->lid | (ah_attr->src_path_bits &
+					  ((1 << ppd->lmc) - 1));
+			qib_bad_pqkey(ibp, IB_NOTICE_TRAP_BAD_PKEY, pkey1,
+				      ah_attr->sl,
+				      sqp->ibqp.qp_num, qp->ibqp.qp_num,
+				      cpu_to_be16(lid),
+				      cpu_to_be16(ah_attr->dlid));
+			goto drop;
+		}
+	}
+
+	/*
+	 * Check that the qkey matches (except for QP0, see 9.6.1.4.1).
+	 * Qkeys with the high order bit set mean use the
+	 * qkey from the QP context instead of the WR (see 10.2.5).
+	 */
+	if (qp->ibqp.qp_num) {
+		u32 qkey;
+
+		qkey = (int)swqe->wr.wr.ud.remote_qkey < 0 ?
+			sqp->qkey : swqe->wr.wr.ud.remote_qkey;
+		if (unlikely(qkey != qp->qkey)) {
+			u16 lid;
+
+			lid = ppd->lid | (ah_attr->src_path_bits &
+					  ((1 << ppd->lmc) - 1));
+			qib_bad_pqkey(ibp, IB_NOTICE_TRAP_BAD_QKEY, qkey,
+				      ah_attr->sl,
+				      sqp->ibqp.qp_num, qp->ibqp.qp_num,
+				      cpu_to_be16(lid),
+				      cpu_to_be16(ah_attr->dlid));
+			goto drop;
+		}
+	}
+
+	/*
+	 * A GRH is expected to precede the data even if not
+	 * present on the wire.
+	 */
+	length = swqe->length;
+	memset(&wc, 0, sizeof wc);
+	wc.byte_len = length + sizeof(struct ib_grh);
+
+	if (swqe->wr.opcode == IB_WR_SEND_WITH_IMM) {
+		wc.wc_flags = IB_WC_WITH_IMM;
+		wc.ex.imm_data = swqe->wr.ex.imm_data;
+	}
+
+	spin_lock_irqsave(&qp->r_lock, flags);
+
+	/*
+	 * Get the next work request entry to find where to put the data.
+	 */
+	if (qp->r_flags & QIB_R_REUSE_SGE)
+		qp->r_flags &= ~QIB_R_REUSE_SGE;
+	else {
+		int ret;
+
+		ret = qib_get_rwqe(qp, 0);
+		if (ret < 0) {
+			qib_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
+			goto bail_unlock;
+		}
+		if (!ret) {
+			if (qp->ibqp.qp_num == 0)
+				ibp->n_vl15_dropped++;
+			goto bail_unlock;
+		}
+	}
+	/* Silently drop packets which are too big. */
+	if (unlikely(wc.byte_len > qp->r_len)) {
+		qp->r_flags |= QIB_R_REUSE_SGE;
+		ibp->n_pkt_drops++;
+		goto bail_unlock;
+	}
+
+	if (ah_attr->ah_flags & IB_AH_GRH) {
+		qib_copy_sge(&qp->r_sge, &ah_attr->grh,
+			     sizeof(struct ib_grh), 1);
+		wc.wc_flags |= IB_WC_GRH;
+	} else
+		qib_skip_sge(&qp->r_sge, sizeof(struct ib_grh), 1);
+	ssge.sg_list = swqe->sg_list + 1;
+	ssge.sge = *swqe->sg_list;
+	ssge.num_sge = swqe->wr.num_sge;
+	sge = &ssge.sge;
+	while (length) {
+		u32 len = sge->length;
+
+		if (len > length)
+			len = length;
+		if (len > sge->sge_length)
+			len = sge->sge_length;
+		BUG_ON(len == 0);
+		qib_copy_sge(&qp->r_sge, sge->vaddr, len, 1);
+		sge->vaddr += len;
+		sge->length -= len;
+		sge->sge_length -= len;
+		if (sge->sge_length == 0) {
+			if (--ssge.num_sge)
+				*sge = *ssge.sg_list++;
+		} else if (sge->length == 0 && sge->mr->lkey) {
+			if (++sge->n >= QIB_SEGSZ) {
+				if (++sge->m >= sge->mr->mapsz)
+					break;
+				sge->n = 0;
+			}
+			sge->vaddr =
+				sge->mr->map[sge->m]->segs[sge->n].vaddr;
+			sge->length =
+				sge->mr->map[sge->m]->segs[sge->n].length;
+		}
+		length -= len;
+	}
+	qib_put_ss(&qp->r_sge);
+	if (!test_and_clear_bit(QIB_R_WRID_VALID, &qp->r_aflags))
+		goto bail_unlock;
+	wc.wr_id = qp->r_wr_id;
+	wc.status = IB_WC_SUCCESS;
+	wc.opcode = IB_WC_RECV;
+	wc.qp = &qp->ibqp;
+	wc.src_qp = sqp->ibqp.qp_num;
+	wc.pkey_index = qp->ibqp.qp_type == IB_QPT_GSI ?
+		swqe->wr.wr.ud.pkey_index : 0;
+	wc.slid = ppd->lid | (ah_attr->src_path_bits & ((1 << ppd->lmc) - 1));
+	wc.sl = ah_attr->sl;
+	wc.dlid_path_bits = ah_attr->dlid & ((1 << ppd->lmc) - 1);
+	wc.port_num = qp->port_num;
+	/* Signal completion event if the solicited bit is set. */
+	qib_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
+		     swqe->wr.send_flags & IB_SEND_SOLICITED);
+	ibp->n_loop_pkts++;
+bail_unlock:
+	spin_unlock_irqrestore(&qp->r_lock, flags);
+drop:
+	if (atomic_dec_and_test(&qp->refcount))
+		wake_up(&qp->wait);
+}
+
+/**
+ * qib_make_ud_req - construct a UD request packet
+ * @qp: the QP
+ *
+ * Return 1 if constructed; otherwise, return 0.
+ */
+int qib_make_ud_req(struct qib_qp *qp)
+{
+	struct qib_other_headers *ohdr;
+	struct ib_ah_attr *ah_attr;
+	struct qib_pportdata *ppd;
+	struct qib_ibport *ibp;
+	struct qib_swqe *wqe;
+	unsigned long flags;
+	u32 nwords;
+	u32 extra_bytes;
+	u32 bth0;
+	u16 lrh0;
+	u16 lid;
+	int ret = 0;
+	int next_cur;
+
+	spin_lock_irqsave(&qp->s_lock, flags);
+
+	if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_NEXT_SEND_OK)) {
+		if (!(ib_qib_state_ops[qp->state] & QIB_FLUSH_SEND))
+			goto bail;
+		/* We are in the error state, flush the work request. */
+		if (qp->s_last == qp->s_head)
+			goto bail;
+		/* If DMAs are in progress, we can't flush immediately. */
+		if (atomic_read(&qp->s_dma_busy)) {
+			qp->s_flags |= QIB_S_WAIT_DMA;
+			goto bail;
+		}
+		wqe = get_swqe_ptr(qp, qp->s_last);
+		qib_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR);
+		goto done;
+	}
+
+	if (qp->s_cur == qp->s_head)
+		goto bail;
+
+	wqe = get_swqe_ptr(qp, qp->s_cur);
+	next_cur = qp->s_cur + 1;
+	if (next_cur >= qp->s_size)
+		next_cur = 0;
+
+	/* Construct the header. */
+	ibp = to_iport(qp->ibqp.device, qp->port_num);
+	ppd = ppd_from_ibp(ibp);
+	ah_attr = &to_iah(wqe->wr.wr.ud.ah)->attr;
+	if (ah_attr->dlid >= QIB_MULTICAST_LID_BASE) {
+		if (ah_attr->dlid != QIB_PERMISSIVE_LID)
+			this_cpu_inc(ibp->pmastats->n_multicast_xmit);
+		else
+			this_cpu_inc(ibp->pmastats->n_unicast_xmit);
+	} else {
+		this_cpu_inc(ibp->pmastats->n_unicast_xmit);
+		lid = ah_attr->dlid & ~((1 << ppd->lmc) - 1);
+		if (unlikely(lid == ppd->lid)) {
+			/*
+			 * If DMAs are in progress, we can't generate
+			 * a completion for the loopback packet since
+			 * it would be out of order.
+			 * XXX Instead of waiting, we could queue a
+			 * zero length descriptor so we get a callback.
+			 */
+			if (atomic_read(&qp->s_dma_busy)) {
+				qp->s_flags |= QIB_S_WAIT_DMA;
+				goto bail;
+			}
+			qp->s_cur = next_cur;
+			spin_unlock_irqrestore(&qp->s_lock, flags);
+			qib_ud_loopback(qp, wqe);
+			spin_lock_irqsave(&qp->s_lock, flags);
+			qib_send_complete(qp, wqe, IB_WC_SUCCESS);
+			goto done;
+		}
+	}
+
+	qp->s_cur = next_cur;
+	extra_bytes = -wqe->length & 3;
+	nwords = (wqe->length + extra_bytes) >> 2;
+
+	/* header size in 32-bit words LRH+BTH+DETH = (8+12+8)/4. */
+	qp->s_hdrwords = 7;
+	qp->s_cur_size = wqe->length;
+	qp->s_cur_sge = &qp->s_sge;
+	qp->s_srate = ah_attr->static_rate;
+	qp->s_wqe = wqe;
+	qp->s_sge.sge = wqe->sg_list[0];
+	qp->s_sge.sg_list = wqe->sg_list + 1;
+	qp->s_sge.num_sge = wqe->wr.num_sge;
+	qp->s_sge.total_len = wqe->length;
+
+	if (ah_attr->ah_flags & IB_AH_GRH) {
+		/* Header size in 32-bit words. */
+		qp->s_hdrwords += qib_make_grh(ibp, &qp->s_hdr->u.l.grh,
+					       &ah_attr->grh,
+					       qp->s_hdrwords, nwords);
+		lrh0 = QIB_LRH_GRH;
+		ohdr = &qp->s_hdr->u.l.oth;
+		/*
+		 * Don't worry about sending to locally attached multicast
+		 * QPs.  It is unspecified by the spec. what happens.
+		 */
+	} else {
+		/* Header size in 32-bit words. */
+		lrh0 = QIB_LRH_BTH;
+		ohdr = &qp->s_hdr->u.oth;
+	}
+	if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) {
+		qp->s_hdrwords++;
+		ohdr->u.ud.imm_data = wqe->wr.ex.imm_data;
+		bth0 = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE << 24;
+	} else
+		bth0 = IB_OPCODE_UD_SEND_ONLY << 24;
+	lrh0 |= ah_attr->sl << 4;
+	if (qp->ibqp.qp_type == IB_QPT_SMI)
+		lrh0 |= 0xF000; /* Set VL (see ch. 13.5.3.1) */
+	else
+		lrh0 |= ibp->sl_to_vl[ah_attr->sl] << 12;
+	qp->s_hdr->lrh[0] = cpu_to_be16(lrh0);
+	qp->s_hdr->lrh[1] = cpu_to_be16(ah_attr->dlid);  /* DEST LID */
+	qp->s_hdr->lrh[2] = cpu_to_be16(qp->s_hdrwords + nwords + SIZE_OF_CRC);
+	lid = ppd->lid;
+	if (lid) {
+		lid |= ah_attr->src_path_bits & ((1 << ppd->lmc) - 1);
+		qp->s_hdr->lrh[3] = cpu_to_be16(lid);
+	} else
+		qp->s_hdr->lrh[3] = IB_LID_PERMISSIVE;
+	if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+		bth0 |= IB_BTH_SOLICITED;
+	bth0 |= extra_bytes << 20;
+	bth0 |= qp->ibqp.qp_type == IB_QPT_SMI ? QIB_DEFAULT_P_KEY :
+		qib_get_pkey(ibp, qp->ibqp.qp_type == IB_QPT_GSI ?
+			     wqe->wr.wr.ud.pkey_index : qp->s_pkey_index);
+	ohdr->bth[0] = cpu_to_be32(bth0);
+	/*
+	 * Use the multicast QP if the destination LID is a multicast LID.
+	 */
+	ohdr->bth[1] = ah_attr->dlid >= QIB_MULTICAST_LID_BASE &&
+		ah_attr->dlid != QIB_PERMISSIVE_LID ?
+		cpu_to_be32(QIB_MULTICAST_QPN) :
+		cpu_to_be32(wqe->wr.wr.ud.remote_qpn);
+	ohdr->bth[2] = cpu_to_be32(qp->s_next_psn++ & QIB_PSN_MASK);
+	/*
+	 * Qkeys with the high order bit set mean use the
+	 * qkey from the QP context instead of the WR (see 10.2.5).
+	 */
+	ohdr->u.ud.deth[0] = cpu_to_be32((int)wqe->wr.wr.ud.remote_qkey < 0 ?
+					 qp->qkey : wqe->wr.wr.ud.remote_qkey);
+	ohdr->u.ud.deth[1] = cpu_to_be32(qp->ibqp.qp_num);
+
+done:
+	ret = 1;
+	goto unlock;
+
+bail:
+	qp->s_flags &= ~QIB_S_BUSY;
+unlock:
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+	return ret;
+}
+
+static unsigned qib_lookup_pkey(struct qib_ibport *ibp, u16 pkey)
+{
+	struct qib_pportdata *ppd = ppd_from_ibp(ibp);
+	struct qib_devdata *dd = ppd->dd;
+	unsigned ctxt = ppd->hw_pidx;
+	unsigned i;
+
+	pkey &= 0x7fff;	/* remove limited/full membership bit */
+
+	for (i = 0; i < ARRAY_SIZE(dd->rcd[ctxt]->pkeys); ++i)
+		if ((dd->rcd[ctxt]->pkeys[i] & 0x7fff) == pkey)
+			return i;
+
+	/*
+	 * Should not get here, this means hardware failed to validate pkeys.
+	 * Punt and return index 0.
+	 */
+	return 0;
+}
+
+/**
+ * qib_ud_rcv - receive an incoming UD packet
+ * @ibp: the port the packet came in on
+ * @hdr: the packet header
+ * @has_grh: true if the packet has a GRH
+ * @data: the packet data
+ * @tlen: the packet length
+ * @qp: the QP the packet came on
+ *
+ * This is called from qib_qp_rcv() to process an incoming UD packet
+ * for the given QP.
+ * Called at interrupt level.
+ */
+void qib_ud_rcv(struct qib_ibport *ibp, struct qib_ib_header *hdr,
+		int has_grh, void *data, u32 tlen, struct qib_qp *qp)
+{
+	struct qib_other_headers *ohdr;
+	int opcode;
+	u32 hdrsize;
+	u32 pad;
+	struct ib_wc wc;
+	u32 qkey;
+	u32 src_qp;
+	u16 dlid;
+
+	/* Check for GRH */
+	if (!has_grh) {
+		ohdr = &hdr->u.oth;
+		hdrsize = 8 + 12 + 8;   /* LRH + BTH + DETH */
+	} else {
+		ohdr = &hdr->u.l.oth;
+		hdrsize = 8 + 40 + 12 + 8; /* LRH + GRH + BTH + DETH */
+	}
+	qkey = be32_to_cpu(ohdr->u.ud.deth[0]);
+	src_qp = be32_to_cpu(ohdr->u.ud.deth[1]) & QIB_QPN_MASK;
+
+	/*
+	 * Get the number of bytes the message was padded by
+	 * and drop incomplete packets.
+	 */
+	pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+	if (unlikely(tlen < (hdrsize + pad + 4)))
+		goto drop;
+
+	tlen -= hdrsize + pad + 4;
+
+	/*
+	 * Check that the permissive LID is only used on QP0
+	 * and the QKEY matches (see 9.6.1.4.1 and 9.6.1.5.1).
+	 */
+	if (qp->ibqp.qp_num) {
+		if (unlikely(hdr->lrh[1] == IB_LID_PERMISSIVE ||
+			     hdr->lrh[3] == IB_LID_PERMISSIVE))
+			goto drop;
+		if (qp->ibqp.qp_num > 1) {
+			u16 pkey1, pkey2;
+
+			pkey1 = be32_to_cpu(ohdr->bth[0]);
+			pkey2 = qib_get_pkey(ibp, qp->s_pkey_index);
+			if (unlikely(!qib_pkey_ok(pkey1, pkey2))) {
+				qib_bad_pqkey(ibp, IB_NOTICE_TRAP_BAD_PKEY,
+					      pkey1,
+					      (be16_to_cpu(hdr->lrh[0]) >> 4) &
+						0xF,
+					      src_qp, qp->ibqp.qp_num,
+					      hdr->lrh[3], hdr->lrh[1]);
+				return;
+			}
+		}
+		if (unlikely(qkey != qp->qkey)) {
+			qib_bad_pqkey(ibp, IB_NOTICE_TRAP_BAD_QKEY, qkey,
+				      (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF,
+				      src_qp, qp->ibqp.qp_num,
+				      hdr->lrh[3], hdr->lrh[1]);
+			return;
+		}
+		/* Drop invalid MAD packets (see 13.5.3.1). */
+		if (unlikely(qp->ibqp.qp_num == 1 &&
+			     (tlen != 256 ||
+			      (be16_to_cpu(hdr->lrh[0]) >> 12) == 15)))
+			goto drop;
+	} else {
+		struct ib_smp *smp;
+
+		/* Drop invalid MAD packets (see 13.5.3.1). */
+		if (tlen != 256 || (be16_to_cpu(hdr->lrh[0]) >> 12) != 15)
+			goto drop;
+		smp = (struct ib_smp *) data;
+		if ((hdr->lrh[1] == IB_LID_PERMISSIVE ||
+		     hdr->lrh[3] == IB_LID_PERMISSIVE) &&
+		    smp->mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+			goto drop;
+	}
+
+	/*
+	 * The opcode is in the low byte when its in network order
+	 * (top byte when in host order).
+	 */
+	opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
+	if (qp->ibqp.qp_num > 1 &&
+	    opcode == IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE) {
+		wc.ex.imm_data = ohdr->u.ud.imm_data;
+		wc.wc_flags = IB_WC_WITH_IMM;
+		tlen -= sizeof(u32);
+	} else if (opcode == IB_OPCODE_UD_SEND_ONLY) {
+		wc.ex.imm_data = 0;
+		wc.wc_flags = 0;
+	} else
+		goto drop;
+
+	/*
+	 * A GRH is expected to precede the data even if not
+	 * present on the wire.
+	 */
+	wc.byte_len = tlen + sizeof(struct ib_grh);
+
+	/*
+	 * Get the next work request entry to find where to put the data.
+	 */
+	if (qp->r_flags & QIB_R_REUSE_SGE)
+		qp->r_flags &= ~QIB_R_REUSE_SGE;
+	else {
+		int ret;
+
+		ret = qib_get_rwqe(qp, 0);
+		if (ret < 0) {
+			qib_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
+			return;
+		}
+		if (!ret) {
+			if (qp->ibqp.qp_num == 0)
+				ibp->n_vl15_dropped++;
+			return;
+		}
+	}
+	/* Silently drop packets which are too big. */
+	if (unlikely(wc.byte_len > qp->r_len)) {
+		qp->r_flags |= QIB_R_REUSE_SGE;
+		goto drop;
+	}
+	if (has_grh) {
+		qib_copy_sge(&qp->r_sge, &hdr->u.l.grh,
+			     sizeof(struct ib_grh), 1);
+		wc.wc_flags |= IB_WC_GRH;
+	} else
+		qib_skip_sge(&qp->r_sge, sizeof(struct ib_grh), 1);
+	qib_copy_sge(&qp->r_sge, data, wc.byte_len - sizeof(struct ib_grh), 1);
+	qib_put_ss(&qp->r_sge);
+	if (!test_and_clear_bit(QIB_R_WRID_VALID, &qp->r_aflags))
+		return;
+	wc.wr_id = qp->r_wr_id;
+	wc.status = IB_WC_SUCCESS;
+	wc.opcode = IB_WC_RECV;
+	wc.vendor_err = 0;
+	wc.qp = &qp->ibqp;
+	wc.src_qp = src_qp;
+	wc.pkey_index = qp->ibqp.qp_type == IB_QPT_GSI ?
+		qib_lookup_pkey(ibp, be32_to_cpu(ohdr->bth[0])) : 0;
+	wc.slid = be16_to_cpu(hdr->lrh[3]);
+	wc.sl = (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF;
+	dlid = be16_to_cpu(hdr->lrh[1]);
+	/*
+	 * Save the LMC lower bits if the destination LID is a unicast LID.
+	 */
+	wc.dlid_path_bits = dlid >= QIB_MULTICAST_LID_BASE ? 0 :
+		dlid & ((1 << ppd_from_ibp(ibp)->lmc) - 1);
+	wc.port_num = qp->port_num;
+	/* Signal completion event if the solicited bit is set. */
+	qib_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
+		     (ohdr->bth[0] &
+			cpu_to_be32(IB_BTH_SOLICITED)) != 0);
+	return;
+
+drop:
+	ibp->n_pkt_drops++;
+}
diff --git a/drivers/infiniband/hw/qib/qib_user_pages.c b/drivers/infiniband/hw/qib/qib_user_pages.c
new file mode 100644
index 0000000..74f90b2
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_user_pages.c
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mm.h>
+#include <linux/device.h>
+
+#include "qib.h"
+
+static void __qib_release_user_pages(struct page **p, size_t num_pages,
+				     int dirty)
+{
+	size_t i;
+
+	for (i = 0; i < num_pages; i++) {
+		if (dirty)
+			set_page_dirty_lock(p[i]);
+		put_page(p[i]);
+	}
+}
+
+/*
+ * Call with current->mm->mmap_sem held.
+ */
+static int __qib_get_user_pages(unsigned long start_page, size_t num_pages,
+				struct page **p)
+{
+	unsigned long lock_limit;
+	size_t got;
+	int ret;
+
+	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
+	if (num_pages > lock_limit && !capable(CAP_IPC_LOCK)) {
+		ret = -ENOMEM;
+		goto bail;
+	}
+
+	for (got = 0; got < num_pages; got += ret) {
+		ret = get_user_pages(current, current->mm,
+				     start_page + got * PAGE_SIZE,
+				     num_pages - got, 1, 1,
+				     p + got, NULL);
+		if (ret < 0)
+			goto bail_release;
+	}
+
+	current->mm->pinned_vm += num_pages;
+
+	ret = 0;
+	goto bail;
+
+bail_release:
+	__qib_release_user_pages(p, got, 0);
+bail:
+	return ret;
+}
+
+/**
+ * qib_map_page - a safety wrapper around pci_map_page()
+ *
+ * A dma_addr of all 0's is interpreted by the chip as "disabled".
+ * Unfortunately, it can also be a valid dma_addr returned on some
+ * architectures.
+ *
+ * The powerpc iommu assigns dma_addrs in ascending order, so we don't
+ * have to bother with retries or mapping a dummy page to insure we
+ * don't just get the same mapping again.
+ *
+ * I'm sure we won't be so lucky with other iommu's, so FIXME.
+ */
+dma_addr_t qib_map_page(struct pci_dev *hwdev, struct page *page,
+			unsigned long offset, size_t size, int direction)
+{
+	dma_addr_t phys;
+
+	phys = pci_map_page(hwdev, page, offset, size, direction);
+
+	if (phys == 0) {
+		pci_unmap_page(hwdev, phys, size, direction);
+		phys = pci_map_page(hwdev, page, offset, size, direction);
+		/*
+		 * FIXME: If we get 0 again, we should keep this page,
+		 * map another, then free the 0 page.
+		 */
+	}
+
+	return phys;
+}
+
+/**
+ * qib_get_user_pages - lock user pages into memory
+ * @start_page: the start page
+ * @num_pages: the number of pages
+ * @p: the output page structures
+ *
+ * This function takes a given start page (page aligned user virtual
+ * address) and pins it and the following specified number of pages.  For
+ * now, num_pages is always 1, but that will probably change at some point
+ * (because caller is doing expected sends on a single virtually contiguous
+ * buffer, so we can do all pages at once).
+ */
+int qib_get_user_pages(unsigned long start_page, size_t num_pages,
+		       struct page **p)
+{
+	int ret;
+
+	down_write(&current->mm->mmap_sem);
+
+	ret = __qib_get_user_pages(start_page, num_pages, p);
+
+	up_write(&current->mm->mmap_sem);
+
+	return ret;
+}
+
+void qib_release_user_pages(struct page **p, size_t num_pages)
+{
+	if (current->mm) /* during close after signal, mm can be NULL */
+		down_write(&current->mm->mmap_sem);
+
+	__qib_release_user_pages(p, num_pages, 1);
+
+	if (current->mm) {
+		current->mm->pinned_vm -= num_pages;
+		up_write(&current->mm->mmap_sem);
+	}
+}
diff --git a/drivers/infiniband/hw/qib/qib_user_sdma.c b/drivers/infiniband/hw/qib/qib_user_sdma.c
new file mode 100644
index 0000000..d2806ca
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_user_sdma.c
@@ -0,0 +1,1465 @@
+/*
+ * Copyright (c) 2007, 2008, 2009 QLogic Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/mm.h>
+#include <linux/types.h>
+#include <linux/device.h>
+#include <linux/dmapool.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/highmem.h>
+#include <linux/io.h>
+#include <linux/uio.h>
+#include <linux/rbtree.h>
+#include <linux/spinlock.h>
+#include <linux/delay.h>
+
+#include "qib.h"
+#include "qib_user_sdma.h"
+
+/* minimum size of header */
+#define QIB_USER_SDMA_MIN_HEADER_LENGTH 64
+/* expected size of headers (for dma_pool) */
+#define QIB_USER_SDMA_EXP_HEADER_LENGTH 64
+/* attempt to drain the queue for 5secs */
+#define QIB_USER_SDMA_DRAIN_TIMEOUT 500
+
+/*
+ * track how many times a process open this driver.
+ */
+static struct rb_root qib_user_sdma_rb_root = RB_ROOT;
+
+struct qib_user_sdma_rb_node {
+	struct rb_node node;
+	int refcount;
+	pid_t pid;
+};
+
+struct qib_user_sdma_pkt {
+	struct list_head list;  /* list element */
+
+	u8  tiddma;		/* if this is NEW tid-sdma */
+	u8  largepkt;		/* this is large pkt from kmalloc */
+	u16 frag_size;		/* frag size used by PSM */
+	u16 index;              /* last header index or push index */
+	u16 naddr;              /* dimension of addr (1..3) ... */
+	u16 addrlimit;		/* addr array size */
+	u16 tidsmidx;		/* current tidsm index */
+	u16 tidsmcount;		/* tidsm array item count */
+	u16 payload_size;	/* payload size so far for header */
+	u32 bytes_togo;		/* bytes for processing */
+	u32 counter;            /* sdma pkts queued counter for this entry */
+	struct qib_tid_session_member *tidsm;	/* tid session member array */
+	struct qib_user_sdma_queue *pq;	/* which pq this pkt belongs to */
+	u64 added;              /* global descq number of entries */
+
+	struct {
+		u16 offset;                     /* offset for kvaddr, addr */
+		u16 length;                     /* length in page */
+		u16 first_desc;			/* first desc */
+		u16 last_desc;			/* last desc */
+		u16 put_page;                   /* should we put_page? */
+		u16 dma_mapped;                 /* is page dma_mapped? */
+		u16 dma_length;			/* for dma_unmap_page() */
+		u16 padding;
+		struct page *page;              /* may be NULL (coherent mem) */
+		void *kvaddr;                   /* FIXME: only for pio hack */
+		dma_addr_t addr;
+	} addr[4];   /* max pages, any more and we coalesce */
+};
+
+struct qib_user_sdma_queue {
+	/*
+	 * pkts sent to dma engine are queued on this
+	 * list head.  the type of the elements of this
+	 * list are struct qib_user_sdma_pkt...
+	 */
+	struct list_head sent;
+
+	/*
+	 * Because above list will be accessed by both process and
+	 * signal handler, we need a spinlock for it.
+	 */
+	spinlock_t sent_lock ____cacheline_aligned_in_smp;
+
+	/* headers with expected length are allocated from here... */
+	char header_cache_name[64];
+	struct dma_pool *header_cache;
+
+	/* packets are allocated from the slab cache... */
+	char pkt_slab_name[64];
+	struct kmem_cache *pkt_slab;
+
+	/* as packets go on the queued queue, they are counted... */
+	u32 counter;
+	u32 sent_counter;
+	/* pending packets, not sending yet */
+	u32 num_pending;
+	/* sending packets, not complete yet */
+	u32 num_sending;
+	/* global descq number of entry of last sending packet */
+	u64 added;
+
+	/* dma page table */
+	struct rb_root dma_pages_root;
+
+	struct qib_user_sdma_rb_node *sdma_rb_node;
+
+	/* protect everything above... */
+	struct mutex lock;
+};
+
+static struct qib_user_sdma_rb_node *
+qib_user_sdma_rb_search(struct rb_root *root, pid_t pid)
+{
+	struct qib_user_sdma_rb_node *sdma_rb_node;
+	struct rb_node *node = root->rb_node;
+
+	while (node) {
+		sdma_rb_node = container_of(node,
+			struct qib_user_sdma_rb_node, node);
+		if (pid < sdma_rb_node->pid)
+			node = node->rb_left;
+		else if (pid > sdma_rb_node->pid)
+			node = node->rb_right;
+		else
+			return sdma_rb_node;
+	}
+	return NULL;
+}
+
+static int
+qib_user_sdma_rb_insert(struct rb_root *root, struct qib_user_sdma_rb_node *new)
+{
+	struct rb_node **node = &(root->rb_node);
+	struct rb_node *parent = NULL;
+	struct qib_user_sdma_rb_node *got;
+
+	while (*node) {
+		got = container_of(*node, struct qib_user_sdma_rb_node, node);
+		parent = *node;
+		if (new->pid < got->pid)
+			node = &((*node)->rb_left);
+		else if (new->pid > got->pid)
+			node = &((*node)->rb_right);
+		else
+			return 0;
+	}
+
+	rb_link_node(&new->node, parent, node);
+	rb_insert_color(&new->node, root);
+	return 1;
+}
+
+struct qib_user_sdma_queue *
+qib_user_sdma_queue_create(struct device *dev, int unit, int ctxt, int sctxt)
+{
+	struct qib_user_sdma_queue *pq =
+		kmalloc(sizeof(struct qib_user_sdma_queue), GFP_KERNEL);
+	struct qib_user_sdma_rb_node *sdma_rb_node;
+
+	if (!pq)
+		goto done;
+
+	pq->counter = 0;
+	pq->sent_counter = 0;
+	pq->num_pending = 0;
+	pq->num_sending = 0;
+	pq->added = 0;
+	pq->sdma_rb_node = NULL;
+
+	INIT_LIST_HEAD(&pq->sent);
+	spin_lock_init(&pq->sent_lock);
+	mutex_init(&pq->lock);
+
+	snprintf(pq->pkt_slab_name, sizeof(pq->pkt_slab_name),
+		 "qib-user-sdma-pkts-%u-%02u.%02u", unit, ctxt, sctxt);
+	pq->pkt_slab = kmem_cache_create(pq->pkt_slab_name,
+					 sizeof(struct qib_user_sdma_pkt),
+					 0, 0, NULL);
+
+	if (!pq->pkt_slab)
+		goto err_kfree;
+
+	snprintf(pq->header_cache_name, sizeof(pq->header_cache_name),
+		 "qib-user-sdma-headers-%u-%02u.%02u", unit, ctxt, sctxt);
+	pq->header_cache = dma_pool_create(pq->header_cache_name,
+					   dev,
+					   QIB_USER_SDMA_EXP_HEADER_LENGTH,
+					   4, 0);
+	if (!pq->header_cache)
+		goto err_slab;
+
+	pq->dma_pages_root = RB_ROOT;
+
+	sdma_rb_node = qib_user_sdma_rb_search(&qib_user_sdma_rb_root,
+					current->pid);
+	if (sdma_rb_node) {
+		sdma_rb_node->refcount++;
+	} else {
+		int ret;
+		sdma_rb_node = kmalloc(sizeof(
+			struct qib_user_sdma_rb_node), GFP_KERNEL);
+		if (!sdma_rb_node)
+			goto err_rb;
+
+		sdma_rb_node->refcount = 1;
+		sdma_rb_node->pid = current->pid;
+
+		ret = qib_user_sdma_rb_insert(&qib_user_sdma_rb_root,
+					sdma_rb_node);
+		BUG_ON(ret == 0);
+	}
+	pq->sdma_rb_node = sdma_rb_node;
+
+	goto done;
+
+err_rb:
+	dma_pool_destroy(pq->header_cache);
+err_slab:
+	kmem_cache_destroy(pq->pkt_slab);
+err_kfree:
+	kfree(pq);
+	pq = NULL;
+
+done:
+	return pq;
+}
+
+static void qib_user_sdma_init_frag(struct qib_user_sdma_pkt *pkt,
+				    int i, u16 offset, u16 len,
+				    u16 first_desc, u16 last_desc,
+				    u16 put_page, u16 dma_mapped,
+				    struct page *page, void *kvaddr,
+				    dma_addr_t dma_addr, u16 dma_length)
+{
+	pkt->addr[i].offset = offset;
+	pkt->addr[i].length = len;
+	pkt->addr[i].first_desc = first_desc;
+	pkt->addr[i].last_desc = last_desc;
+	pkt->addr[i].put_page = put_page;
+	pkt->addr[i].dma_mapped = dma_mapped;
+	pkt->addr[i].page = page;
+	pkt->addr[i].kvaddr = kvaddr;
+	pkt->addr[i].addr = dma_addr;
+	pkt->addr[i].dma_length = dma_length;
+}
+
+static void *qib_user_sdma_alloc_header(struct qib_user_sdma_queue *pq,
+				size_t len, dma_addr_t *dma_addr)
+{
+	void *hdr;
+
+	if (len == QIB_USER_SDMA_EXP_HEADER_LENGTH)
+		hdr = dma_pool_alloc(pq->header_cache, GFP_KERNEL,
+					     dma_addr);
+	else
+		hdr = NULL;
+
+	if (!hdr) {
+		hdr = kmalloc(len, GFP_KERNEL);
+		if (!hdr)
+			return NULL;
+
+		*dma_addr = 0;
+	}
+
+	return hdr;
+}
+
+static int qib_user_sdma_page_to_frags(const struct qib_devdata *dd,
+				       struct qib_user_sdma_queue *pq,
+				       struct qib_user_sdma_pkt *pkt,
+				       struct page *page, u16 put,
+				       u16 offset, u16 len, void *kvaddr)
+{
+	__le16 *pbc16;
+	void *pbcvaddr;
+	struct qib_message_header *hdr;
+	u16 newlen, pbclen, lastdesc, dma_mapped;
+	u32 vcto;
+	union qib_seqnum seqnum;
+	dma_addr_t pbcdaddr;
+	dma_addr_t dma_addr =
+		dma_map_page(&dd->pcidev->dev,
+			page, offset, len, DMA_TO_DEVICE);
+	int ret = 0;
+
+	if (dma_mapping_error(&dd->pcidev->dev, dma_addr)) {
+		/*
+		 * dma mapping error, pkt has not managed
+		 * this page yet, return the page here so
+		 * the caller can ignore this page.
+		 */
+		if (put) {
+			put_page(page);
+		} else {
+			/* coalesce case */
+			kunmap(page);
+			__free_page(page);
+		}
+		ret = -ENOMEM;
+		goto done;
+	}
+	offset = 0;
+	dma_mapped = 1;
+
+
+next_fragment:
+
+	/*
+	 * In tid-sdma, the transfer length is restricted by
+	 * receiver side current tid page length.
+	 */
+	if (pkt->tiddma && len > pkt->tidsm[pkt->tidsmidx].length)
+		newlen = pkt->tidsm[pkt->tidsmidx].length;
+	else
+		newlen = len;
+
+	/*
+	 * Then the transfer length is restricted by MTU.
+	 * the last descriptor flag is determined by:
+	 * 1. the current packet is at frag size length.
+	 * 2. the current tid page is done if tid-sdma.
+	 * 3. there is no more byte togo if sdma.
+	 */
+	lastdesc = 0;
+	if ((pkt->payload_size + newlen) >= pkt->frag_size) {
+		newlen = pkt->frag_size - pkt->payload_size;
+		lastdesc = 1;
+	} else if (pkt->tiddma) {
+		if (newlen == pkt->tidsm[pkt->tidsmidx].length)
+			lastdesc = 1;
+	} else {
+		if (newlen == pkt->bytes_togo)
+			lastdesc = 1;
+	}
+
+	/* fill the next fragment in this page */
+	qib_user_sdma_init_frag(pkt, pkt->naddr, /* index */
+		offset, newlen,		/* offset, len */
+		0, lastdesc,		/* first last desc */
+		put, dma_mapped,	/* put page, dma mapped */
+		page, kvaddr,		/* struct page, virt addr */
+		dma_addr, len);		/* dma addr, dma length */
+	pkt->bytes_togo -= newlen;
+	pkt->payload_size += newlen;
+	pkt->naddr++;
+	if (pkt->naddr == pkt->addrlimit) {
+		ret = -EFAULT;
+		goto done;
+	}
+
+	/* If there is no more byte togo. (lastdesc==1) */
+	if (pkt->bytes_togo == 0) {
+		/* The packet is done, header is not dma mapped yet.
+		 * it should be from kmalloc */
+		if (!pkt->addr[pkt->index].addr) {
+			pkt->addr[pkt->index].addr =
+				dma_map_single(&dd->pcidev->dev,
+					pkt->addr[pkt->index].kvaddr,
+					pkt->addr[pkt->index].dma_length,
+					DMA_TO_DEVICE);
+			if (dma_mapping_error(&dd->pcidev->dev,
+					pkt->addr[pkt->index].addr)) {
+				ret = -ENOMEM;
+				goto done;
+			}
+			pkt->addr[pkt->index].dma_mapped = 1;
+		}
+
+		goto done;
+	}
+
+	/* If tid-sdma, advance tid info. */
+	if (pkt->tiddma) {
+		pkt->tidsm[pkt->tidsmidx].length -= newlen;
+		if (pkt->tidsm[pkt->tidsmidx].length) {
+			pkt->tidsm[pkt->tidsmidx].offset += newlen;
+		} else {
+			pkt->tidsmidx++;
+			if (pkt->tidsmidx == pkt->tidsmcount) {
+				ret = -EFAULT;
+				goto done;
+			}
+		}
+	}
+
+	/*
+	 * If this is NOT the last descriptor. (newlen==len)
+	 * the current packet is not done yet, but the current
+	 * send side page is done.
+	 */
+	if (lastdesc == 0)
+		goto done;
+
+	/*
+	 * If running this driver under PSM with message size
+	 * fitting into one transfer unit, it is not possible
+	 * to pass this line. otherwise, it is a buggggg.
+	 */
+
+	/*
+	 * Since the current packet is done, and there are more
+	 * bytes togo, we need to create a new sdma header, copying
+	 * from previous sdma header and modify both.
+	 */
+	pbclen = pkt->addr[pkt->index].length;
+	pbcvaddr = qib_user_sdma_alloc_header(pq, pbclen, &pbcdaddr);
+	if (!pbcvaddr) {
+		ret = -ENOMEM;
+		goto done;
+	}
+	/* Copy the previous sdma header to new sdma header */
+	pbc16 = (__le16 *)pkt->addr[pkt->index].kvaddr;
+	memcpy(pbcvaddr, pbc16, pbclen);
+
+	/* Modify the previous sdma header */
+	hdr = (struct qib_message_header *)&pbc16[4];
+
+	/* New pbc length */
+	pbc16[0] = cpu_to_le16(le16_to_cpu(pbc16[0])-(pkt->bytes_togo>>2));
+
+	/* New packet length */
+	hdr->lrh[2] = cpu_to_be16(le16_to_cpu(pbc16[0]));
+
+	if (pkt->tiddma) {
+		/* turn on the header suppression */
+		hdr->iph.pkt_flags =
+			cpu_to_le16(le16_to_cpu(hdr->iph.pkt_flags)|0x2);
+		/* turn off ACK_REQ: 0x04 and EXPECTED_DONE: 0x20 */
+		hdr->flags &= ~(0x04|0x20);
+	} else {
+		/* turn off extra bytes: 20-21 bits */
+		hdr->bth[0] = cpu_to_be32(be32_to_cpu(hdr->bth[0])&0xFFCFFFFF);
+		/* turn off ACK_REQ: 0x04 */
+		hdr->flags &= ~(0x04);
+	}
+
+	/* New kdeth checksum */
+	vcto = le32_to_cpu(hdr->iph.ver_ctxt_tid_offset);
+	hdr->iph.chksum = cpu_to_le16(QIB_LRH_BTH +
+		be16_to_cpu(hdr->lrh[2]) -
+		((vcto>>16)&0xFFFF) - (vcto&0xFFFF) -
+		le16_to_cpu(hdr->iph.pkt_flags));
+
+	/* The packet is done, header is not dma mapped yet.
+	 * it should be from kmalloc */
+	if (!pkt->addr[pkt->index].addr) {
+		pkt->addr[pkt->index].addr =
+			dma_map_single(&dd->pcidev->dev,
+				pkt->addr[pkt->index].kvaddr,
+				pkt->addr[pkt->index].dma_length,
+				DMA_TO_DEVICE);
+		if (dma_mapping_error(&dd->pcidev->dev,
+				pkt->addr[pkt->index].addr)) {
+			ret = -ENOMEM;
+			goto done;
+		}
+		pkt->addr[pkt->index].dma_mapped = 1;
+	}
+
+	/* Modify the new sdma header */
+	pbc16 = (__le16 *)pbcvaddr;
+	hdr = (struct qib_message_header *)&pbc16[4];
+
+	/* New pbc length */
+	pbc16[0] = cpu_to_le16(le16_to_cpu(pbc16[0])-(pkt->payload_size>>2));
+
+	/* New packet length */
+	hdr->lrh[2] = cpu_to_be16(le16_to_cpu(pbc16[0]));
+
+	if (pkt->tiddma) {
+		/* Set new tid and offset for new sdma header */
+		hdr->iph.ver_ctxt_tid_offset = cpu_to_le32(
+			(le32_to_cpu(hdr->iph.ver_ctxt_tid_offset)&0xFF000000) +
+			(pkt->tidsm[pkt->tidsmidx].tid<<QLOGIC_IB_I_TID_SHIFT) +
+			(pkt->tidsm[pkt->tidsmidx].offset>>2));
+	} else {
+		/* Middle protocol new packet offset */
+		hdr->uwords[2] += pkt->payload_size;
+	}
+
+	/* New kdeth checksum */
+	vcto = le32_to_cpu(hdr->iph.ver_ctxt_tid_offset);
+	hdr->iph.chksum = cpu_to_le16(QIB_LRH_BTH +
+		be16_to_cpu(hdr->lrh[2]) -
+		((vcto>>16)&0xFFFF) - (vcto&0xFFFF) -
+		le16_to_cpu(hdr->iph.pkt_flags));
+
+	/* Next sequence number in new sdma header */
+	seqnum.val = be32_to_cpu(hdr->bth[2]);
+	if (pkt->tiddma)
+		seqnum.seq++;
+	else
+		seqnum.pkt++;
+	hdr->bth[2] = cpu_to_be32(seqnum.val);
+
+	/* Init new sdma header. */
+	qib_user_sdma_init_frag(pkt, pkt->naddr, /* index */
+		0, pbclen,		/* offset, len */
+		1, 0,			/* first last desc */
+		0, 0,			/* put page, dma mapped */
+		NULL, pbcvaddr,		/* struct page, virt addr */
+		pbcdaddr, pbclen);	/* dma addr, dma length */
+	pkt->index = pkt->naddr;
+	pkt->payload_size = 0;
+	pkt->naddr++;
+	if (pkt->naddr == pkt->addrlimit) {
+		ret = -EFAULT;
+		goto done;
+	}
+
+	/* Prepare for next fragment in this page */
+	if (newlen != len) {
+		if (dma_mapped) {
+			put = 0;
+			dma_mapped = 0;
+			page = NULL;
+			kvaddr = NULL;
+		}
+		len -= newlen;
+		offset += newlen;
+
+		goto next_fragment;
+	}
+
+done:
+	return ret;
+}
+
+/* we've too many pages in the iovec, coalesce to a single page */
+static int qib_user_sdma_coalesce(const struct qib_devdata *dd,
+				  struct qib_user_sdma_queue *pq,
+				  struct qib_user_sdma_pkt *pkt,
+				  const struct iovec *iov,
+				  unsigned long niov)
+{
+	int ret = 0;
+	struct page *page = alloc_page(GFP_KERNEL);
+	void *mpage_save;
+	char *mpage;
+	int i;
+	int len = 0;
+
+	if (!page) {
+		ret = -ENOMEM;
+		goto done;
+	}
+
+	mpage = kmap(page);
+	mpage_save = mpage;
+	for (i = 0; i < niov; i++) {
+		int cfur;
+
+		cfur = copy_from_user(mpage,
+				      iov[i].iov_base, iov[i].iov_len);
+		if (cfur) {
+			ret = -EFAULT;
+			goto free_unmap;
+		}
+
+		mpage += iov[i].iov_len;
+		len += iov[i].iov_len;
+	}
+
+	ret = qib_user_sdma_page_to_frags(dd, pq, pkt,
+			page, 0, 0, len, mpage_save);
+	goto done;
+
+free_unmap:
+	kunmap(page);
+	__free_page(page);
+done:
+	return ret;
+}
+
+/*
+ * How many pages in this iovec element?
+ */
+static int qib_user_sdma_num_pages(const struct iovec *iov)
+{
+	const unsigned long addr  = (unsigned long) iov->iov_base;
+	const unsigned long  len  = iov->iov_len;
+	const unsigned long spage = addr & PAGE_MASK;
+	const unsigned long epage = (addr + len - 1) & PAGE_MASK;
+
+	return 1 + ((epage - spage) >> PAGE_SHIFT);
+}
+
+static void qib_user_sdma_free_pkt_frag(struct device *dev,
+					struct qib_user_sdma_queue *pq,
+					struct qib_user_sdma_pkt *pkt,
+					int frag)
+{
+	const int i = frag;
+
+	if (pkt->addr[i].page) {
+		/* only user data has page */
+		if (pkt->addr[i].dma_mapped)
+			dma_unmap_page(dev,
+				       pkt->addr[i].addr,
+				       pkt->addr[i].dma_length,
+				       DMA_TO_DEVICE);
+
+		if (pkt->addr[i].kvaddr)
+			kunmap(pkt->addr[i].page);
+
+		if (pkt->addr[i].put_page)
+			put_page(pkt->addr[i].page);
+		else
+			__free_page(pkt->addr[i].page);
+	} else if (pkt->addr[i].kvaddr) {
+		/* for headers */
+		if (pkt->addr[i].dma_mapped) {
+			/* from kmalloc & dma mapped */
+			dma_unmap_single(dev,
+				       pkt->addr[i].addr,
+				       pkt->addr[i].dma_length,
+				       DMA_TO_DEVICE);
+			kfree(pkt->addr[i].kvaddr);
+		} else if (pkt->addr[i].addr) {
+			/* free coherent mem from cache... */
+			dma_pool_free(pq->header_cache,
+			      pkt->addr[i].kvaddr, pkt->addr[i].addr);
+		} else {
+			/* from kmalloc but not dma mapped */
+			kfree(pkt->addr[i].kvaddr);
+		}
+	}
+}
+
+/* return number of pages pinned... */
+static int qib_user_sdma_pin_pages(const struct qib_devdata *dd,
+				   struct qib_user_sdma_queue *pq,
+				   struct qib_user_sdma_pkt *pkt,
+				   unsigned long addr, int tlen, int npages)
+{
+	struct page *pages[8];
+	int i, j;
+	int ret = 0;
+
+	while (npages) {
+		if (npages > 8)
+			j = 8;
+		else
+			j = npages;
+
+		ret = get_user_pages_fast(addr, j, 0, pages);
+		if (ret != j) {
+			i = 0;
+			j = ret;
+			ret = -ENOMEM;
+			goto free_pages;
+		}
+
+		for (i = 0; i < j; i++) {
+			/* map the pages... */
+			unsigned long fofs = addr & ~PAGE_MASK;
+			int flen = ((fofs + tlen) > PAGE_SIZE) ?
+				(PAGE_SIZE - fofs) : tlen;
+
+			ret = qib_user_sdma_page_to_frags(dd, pq, pkt,
+				pages[i], 1, fofs, flen, NULL);
+			if (ret < 0) {
+				/* current page has beed taken
+				 * care of inside above call.
+				 */
+				i++;
+				goto free_pages;
+			}
+
+			addr += flen;
+			tlen -= flen;
+		}
+
+		npages -= j;
+	}
+
+	goto done;
+
+	/* if error, return all pages not managed by pkt */
+free_pages:
+	while (i < j)
+		put_page(pages[i++]);
+
+done:
+	return ret;
+}
+
+static int qib_user_sdma_pin_pkt(const struct qib_devdata *dd,
+				 struct qib_user_sdma_queue *pq,
+				 struct qib_user_sdma_pkt *pkt,
+				 const struct iovec *iov,
+				 unsigned long niov)
+{
+	int ret = 0;
+	unsigned long idx;
+
+	for (idx = 0; idx < niov; idx++) {
+		const int npages = qib_user_sdma_num_pages(iov + idx);
+		const unsigned long addr = (unsigned long) iov[idx].iov_base;
+
+		ret = qib_user_sdma_pin_pages(dd, pq, pkt, addr,
+					      iov[idx].iov_len, npages);
+		if (ret < 0)
+			goto free_pkt;
+	}
+
+	goto done;
+
+free_pkt:
+	/* we need to ignore the first entry here */
+	for (idx = 1; idx < pkt->naddr; idx++)
+		qib_user_sdma_free_pkt_frag(&dd->pcidev->dev, pq, pkt, idx);
+
+	/* need to dma unmap the first entry, this is to restore to
+	 * the original state so that caller can free the memory in
+	 * error condition. Caller does not know if dma mapped or not*/
+	if (pkt->addr[0].dma_mapped) {
+		dma_unmap_single(&dd->pcidev->dev,
+		       pkt->addr[0].addr,
+		       pkt->addr[0].dma_length,
+		       DMA_TO_DEVICE);
+		pkt->addr[0].addr = 0;
+		pkt->addr[0].dma_mapped = 0;
+	}
+
+done:
+	return ret;
+}
+
+static int qib_user_sdma_init_payload(const struct qib_devdata *dd,
+				      struct qib_user_sdma_queue *pq,
+				      struct qib_user_sdma_pkt *pkt,
+				      const struct iovec *iov,
+				      unsigned long niov, int npages)
+{
+	int ret = 0;
+
+	if (pkt->frag_size == pkt->bytes_togo &&
+			npages >= ARRAY_SIZE(pkt->addr))
+		ret = qib_user_sdma_coalesce(dd, pq, pkt, iov, niov);
+	else
+		ret = qib_user_sdma_pin_pkt(dd, pq, pkt, iov, niov);
+
+	return ret;
+}
+
+/* free a packet list -- return counter value of last packet */
+static void qib_user_sdma_free_pkt_list(struct device *dev,
+					struct qib_user_sdma_queue *pq,
+					struct list_head *list)
+{
+	struct qib_user_sdma_pkt *pkt, *pkt_next;
+
+	list_for_each_entry_safe(pkt, pkt_next, list, list) {
+		int i;
+
+		for (i = 0; i < pkt->naddr; i++)
+			qib_user_sdma_free_pkt_frag(dev, pq, pkt, i);
+
+		if (pkt->largepkt)
+			kfree(pkt);
+		else
+			kmem_cache_free(pq->pkt_slab, pkt);
+	}
+	INIT_LIST_HEAD(list);
+}
+
+/*
+ * copy headers, coalesce etc -- pq->lock must be held
+ *
+ * we queue all the packets to list, returning the
+ * number of bytes total.  list must be empty initially,
+ * as, if there is an error we clean it...
+ */
+static int qib_user_sdma_queue_pkts(const struct qib_devdata *dd,
+				    struct qib_pportdata *ppd,
+				    struct qib_user_sdma_queue *pq,
+				    const struct iovec *iov,
+				    unsigned long niov,
+				    struct list_head *list,
+				    int *maxpkts, int *ndesc)
+{
+	unsigned long idx = 0;
+	int ret = 0;
+	int npkts = 0;
+	__le32 *pbc;
+	dma_addr_t dma_addr;
+	struct qib_user_sdma_pkt *pkt = NULL;
+	size_t len;
+	size_t nw;
+	u32 counter = pq->counter;
+	u16 frag_size;
+
+	while (idx < niov && npkts < *maxpkts) {
+		const unsigned long addr = (unsigned long) iov[idx].iov_base;
+		const unsigned long idx_save = idx;
+		unsigned pktnw;
+		unsigned pktnwc;
+		int nfrags = 0;
+		int npages = 0;
+		int bytes_togo = 0;
+		int tiddma = 0;
+		int cfur;
+
+		len = iov[idx].iov_len;
+		nw = len >> 2;
+
+		if (len < QIB_USER_SDMA_MIN_HEADER_LENGTH ||
+		    len > PAGE_SIZE || len & 3 || addr & 3) {
+			ret = -EINVAL;
+			goto free_list;
+		}
+
+		pbc = qib_user_sdma_alloc_header(pq, len, &dma_addr);
+		if (!pbc) {
+			ret = -ENOMEM;
+			goto free_list;
+		}
+
+		cfur = copy_from_user(pbc, iov[idx].iov_base, len);
+		if (cfur) {
+			ret = -EFAULT;
+			goto free_pbc;
+		}
+
+		/*
+		 * This assignment is a bit strange.  it's because the
+		 * the pbc counts the number of 32 bit words in the full
+		 * packet _except_ the first word of the pbc itself...
+		 */
+		pktnwc = nw - 1;
+
+		/*
+		 * pktnw computation yields the number of 32 bit words
+		 * that the caller has indicated in the PBC.  note that
+		 * this is one less than the total number of words that
+		 * goes to the send DMA engine as the first 32 bit word
+		 * of the PBC itself is not counted.  Armed with this count,
+		 * we can verify that the packet is consistent with the
+		 * iovec lengths.
+		 */
+		pktnw = le32_to_cpu(*pbc) & 0xFFFF;
+		if (pktnw < pktnwc) {
+			ret = -EINVAL;
+			goto free_pbc;
+		}
+
+		idx++;
+		while (pktnwc < pktnw && idx < niov) {
+			const size_t slen = iov[idx].iov_len;
+			const unsigned long faddr =
+				(unsigned long) iov[idx].iov_base;
+
+			if (slen & 3 || faddr & 3 || !slen) {
+				ret = -EINVAL;
+				goto free_pbc;
+			}
+
+			npages += qib_user_sdma_num_pages(&iov[idx]);
+
+			bytes_togo += slen;
+			pktnwc += slen >> 2;
+			idx++;
+			nfrags++;
+		}
+
+		if (pktnwc != pktnw) {
+			ret = -EINVAL;
+			goto free_pbc;
+		}
+
+		frag_size = ((le32_to_cpu(*pbc))>>16) & 0xFFFF;
+		if (((frag_size ? frag_size : bytes_togo) + len) >
+						ppd->ibmaxlen) {
+			ret = -EINVAL;
+			goto free_pbc;
+		}
+
+		if (frag_size) {
+			int pktsize, tidsmsize, n;
+
+			n = npages*((2*PAGE_SIZE/frag_size)+1);
+			pktsize = sizeof(*pkt) + sizeof(pkt->addr[0])*n;
+
+			/*
+			 * Determine if this is tid-sdma or just sdma.
+			 */
+			tiddma = (((le32_to_cpu(pbc[7])>>
+				QLOGIC_IB_I_TID_SHIFT)&
+				QLOGIC_IB_I_TID_MASK) !=
+				QLOGIC_IB_I_TID_MASK);
+
+			if (tiddma)
+				tidsmsize = iov[idx].iov_len;
+			else
+				tidsmsize = 0;
+
+			pkt = kmalloc(pktsize+tidsmsize, GFP_KERNEL);
+			if (!pkt) {
+				ret = -ENOMEM;
+				goto free_pbc;
+			}
+			pkt->largepkt = 1;
+			pkt->frag_size = frag_size;
+			pkt->addrlimit = n + ARRAY_SIZE(pkt->addr);
+
+			if (tiddma) {
+				char *tidsm = (char *)pkt + pktsize;
+				cfur = copy_from_user(tidsm,
+					iov[idx].iov_base, tidsmsize);
+				if (cfur) {
+					ret = -EFAULT;
+					goto free_pkt;
+				}
+				pkt->tidsm =
+					(struct qib_tid_session_member *)tidsm;
+				pkt->tidsmcount = tidsmsize/
+					sizeof(struct qib_tid_session_member);
+				pkt->tidsmidx = 0;
+				idx++;
+			}
+
+			/*
+			 * pbc 'fill1' field is borrowed to pass frag size,
+			 * we need to clear it after picking frag size, the
+			 * hardware requires this field to be zero.
+			 */
+			*pbc = cpu_to_le32(le32_to_cpu(*pbc) & 0x0000FFFF);
+		} else {
+			pkt = kmem_cache_alloc(pq->pkt_slab, GFP_KERNEL);
+			if (!pkt) {
+				ret = -ENOMEM;
+				goto free_pbc;
+			}
+			pkt->largepkt = 0;
+			pkt->frag_size = bytes_togo;
+			pkt->addrlimit = ARRAY_SIZE(pkt->addr);
+		}
+		pkt->bytes_togo = bytes_togo;
+		pkt->payload_size = 0;
+		pkt->counter = counter;
+		pkt->tiddma = tiddma;
+
+		/* setup the first header */
+		qib_user_sdma_init_frag(pkt, 0, /* index */
+			0, len,		/* offset, len */
+			1, 0,		/* first last desc */
+			0, 0,		/* put page, dma mapped */
+			NULL, pbc,	/* struct page, virt addr */
+			dma_addr, len);	/* dma addr, dma length */
+		pkt->index = 0;
+		pkt->naddr = 1;
+
+		if (nfrags) {
+			ret = qib_user_sdma_init_payload(dd, pq, pkt,
+							 iov + idx_save + 1,
+							 nfrags, npages);
+			if (ret < 0)
+				goto free_pkt;
+		} else {
+			/* since there is no payload, mark the
+			 * header as the last desc. */
+			pkt->addr[0].last_desc = 1;
+
+			if (dma_addr == 0) {
+				/*
+				 * the header is not dma mapped yet.
+				 * it should be from kmalloc.
+				 */
+				dma_addr = dma_map_single(&dd->pcidev->dev,
+					pbc, len, DMA_TO_DEVICE);
+				if (dma_mapping_error(&dd->pcidev->dev,
+								dma_addr)) {
+					ret = -ENOMEM;
+					goto free_pkt;
+				}
+				pkt->addr[0].addr = dma_addr;
+				pkt->addr[0].dma_mapped = 1;
+			}
+		}
+
+		counter++;
+		npkts++;
+		pkt->pq = pq;
+		pkt->index = 0; /* reset index for push on hw */
+		*ndesc += pkt->naddr;
+
+		list_add_tail(&pkt->list, list);
+	}
+
+	*maxpkts = npkts;
+	ret = idx;
+	goto done;
+
+free_pkt:
+	if (pkt->largepkt)
+		kfree(pkt);
+	else
+		kmem_cache_free(pq->pkt_slab, pkt);
+free_pbc:
+	if (dma_addr)
+		dma_pool_free(pq->header_cache, pbc, dma_addr);
+	else
+		kfree(pbc);
+free_list:
+	qib_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, list);
+done:
+	return ret;
+}
+
+static void qib_user_sdma_set_complete_counter(struct qib_user_sdma_queue *pq,
+					       u32 c)
+{
+	pq->sent_counter = c;
+}
+
+/* try to clean out queue -- needs pq->lock */
+static int qib_user_sdma_queue_clean(struct qib_pportdata *ppd,
+				     struct qib_user_sdma_queue *pq)
+{
+	struct qib_devdata *dd = ppd->dd;
+	struct list_head free_list;
+	struct qib_user_sdma_pkt *pkt;
+	struct qib_user_sdma_pkt *pkt_prev;
+	unsigned long flags;
+	int ret = 0;
+
+	if (!pq->num_sending)
+		return 0;
+
+	INIT_LIST_HEAD(&free_list);
+
+	/*
+	 * We need this spin lock here because interrupt handler
+	 * might modify this list in qib_user_sdma_send_desc(), also
+	 * we can not get interrupted, otherwise it is a deadlock.
+	 */
+	spin_lock_irqsave(&pq->sent_lock, flags);
+	list_for_each_entry_safe(pkt, pkt_prev, &pq->sent, list) {
+		s64 descd = ppd->sdma_descq_removed - pkt->added;
+
+		if (descd < 0)
+			break;
+
+		list_move_tail(&pkt->list, &free_list);
+
+		/* one more packet cleaned */
+		ret++;
+		pq->num_sending--;
+	}
+	spin_unlock_irqrestore(&pq->sent_lock, flags);
+
+	if (!list_empty(&free_list)) {
+		u32 counter;
+
+		pkt = list_entry(free_list.prev,
+				 struct qib_user_sdma_pkt, list);
+		counter = pkt->counter;
+
+		qib_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, &free_list);
+		qib_user_sdma_set_complete_counter(pq, counter);
+	}
+
+	return ret;
+}
+
+void qib_user_sdma_queue_destroy(struct qib_user_sdma_queue *pq)
+{
+	if (!pq)
+		return;
+
+	pq->sdma_rb_node->refcount--;
+	if (pq->sdma_rb_node->refcount == 0) {
+		rb_erase(&pq->sdma_rb_node->node, &qib_user_sdma_rb_root);
+		kfree(pq->sdma_rb_node);
+	}
+	dma_pool_destroy(pq->header_cache);
+	kmem_cache_destroy(pq->pkt_slab);
+	kfree(pq);
+}
+
+/* clean descriptor queue, returns > 0 if some elements cleaned */
+static int qib_user_sdma_hwqueue_clean(struct qib_pportdata *ppd)
+{
+	int ret;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ppd->sdma_lock, flags);
+	ret = qib_sdma_make_progress(ppd);
+	spin_unlock_irqrestore(&ppd->sdma_lock, flags);
+
+	return ret;
+}
+
+/* we're in close, drain packets so that we can cleanup successfully... */
+void qib_user_sdma_queue_drain(struct qib_pportdata *ppd,
+			       struct qib_user_sdma_queue *pq)
+{
+	struct qib_devdata *dd = ppd->dd;
+	unsigned long flags;
+	int i;
+
+	if (!pq)
+		return;
+
+	for (i = 0; i < QIB_USER_SDMA_DRAIN_TIMEOUT; i++) {
+		mutex_lock(&pq->lock);
+		if (!pq->num_pending && !pq->num_sending) {
+			mutex_unlock(&pq->lock);
+			break;
+		}
+		qib_user_sdma_hwqueue_clean(ppd);
+		qib_user_sdma_queue_clean(ppd, pq);
+		mutex_unlock(&pq->lock);
+		msleep(10);
+	}
+
+	if (pq->num_pending || pq->num_sending) {
+		struct qib_user_sdma_pkt *pkt;
+		struct qib_user_sdma_pkt *pkt_prev;
+		struct list_head free_list;
+
+		mutex_lock(&pq->lock);
+		spin_lock_irqsave(&ppd->sdma_lock, flags);
+		/*
+		 * Since we hold sdma_lock, it is safe without sent_lock.
+		 */
+		if (pq->num_pending) {
+			list_for_each_entry_safe(pkt, pkt_prev,
+					&ppd->sdma_userpending, list) {
+				if (pkt->pq == pq) {
+					list_move_tail(&pkt->list, &pq->sent);
+					pq->num_pending--;
+					pq->num_sending++;
+				}
+			}
+		}
+		spin_unlock_irqrestore(&ppd->sdma_lock, flags);
+
+		qib_dev_err(dd, "user sdma lists not empty: forcing!\n");
+		INIT_LIST_HEAD(&free_list);
+		list_splice_init(&pq->sent, &free_list);
+		pq->num_sending = 0;
+		qib_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, &free_list);
+		mutex_unlock(&pq->lock);
+	}
+}
+
+static inline __le64 qib_sdma_make_desc0(u8 gen,
+					 u64 addr, u64 dwlen, u64 dwoffset)
+{
+	return cpu_to_le64(/* SDmaPhyAddr[31:0] */
+			   ((addr & 0xfffffffcULL) << 32) |
+			   /* SDmaGeneration[1:0] */
+			   ((gen & 3ULL) << 30) |
+			   /* SDmaDwordCount[10:0] */
+			   ((dwlen & 0x7ffULL) << 16) |
+			   /* SDmaBufOffset[12:2] */
+			   (dwoffset & 0x7ffULL));
+}
+
+static inline __le64 qib_sdma_make_first_desc0(__le64 descq)
+{
+	return descq | cpu_to_le64(1ULL << 12);
+}
+
+static inline __le64 qib_sdma_make_last_desc0(__le64 descq)
+{
+					      /* last */  /* dma head */
+	return descq | cpu_to_le64(1ULL << 11 | 1ULL << 13);
+}
+
+static inline __le64 qib_sdma_make_desc1(u64 addr)
+{
+	/* SDmaPhyAddr[47:32] */
+	return cpu_to_le64(addr >> 32);
+}
+
+static void qib_user_sdma_send_frag(struct qib_pportdata *ppd,
+				    struct qib_user_sdma_pkt *pkt, int idx,
+				    unsigned ofs, u16 tail, u8 gen)
+{
+	const u64 addr = (u64) pkt->addr[idx].addr +
+		(u64) pkt->addr[idx].offset;
+	const u64 dwlen = (u64) pkt->addr[idx].length / 4;
+	__le64 *descqp;
+	__le64 descq0;
+
+	descqp = &ppd->sdma_descq[tail].qw[0];
+
+	descq0 = qib_sdma_make_desc0(gen, addr, dwlen, ofs);
+	if (pkt->addr[idx].first_desc)
+		descq0 = qib_sdma_make_first_desc0(descq0);
+	if (pkt->addr[idx].last_desc) {
+		descq0 = qib_sdma_make_last_desc0(descq0);
+		if (ppd->sdma_intrequest) {
+			descq0 |= cpu_to_le64(1ULL << 15);
+			ppd->sdma_intrequest = 0;
+		}
+	}
+
+	descqp[0] = descq0;
+	descqp[1] = qib_sdma_make_desc1(addr);
+}
+
+void qib_user_sdma_send_desc(struct qib_pportdata *ppd,
+				struct list_head *pktlist)
+{
+	struct qib_devdata *dd = ppd->dd;
+	u16 nfree, nsent;
+	u16 tail, tail_c;
+	u8 gen, gen_c;
+
+	nfree = qib_sdma_descq_freecnt(ppd);
+	if (!nfree)
+		return;
+
+retry:
+	nsent = 0;
+	tail_c = tail = ppd->sdma_descq_tail;
+	gen_c = gen = ppd->sdma_generation;
+	while (!list_empty(pktlist)) {
+		struct qib_user_sdma_pkt *pkt =
+			list_entry(pktlist->next, struct qib_user_sdma_pkt,
+				   list);
+		int i, j, c = 0;
+		unsigned ofs = 0;
+		u16 dtail = tail;
+
+		for (i = pkt->index; i < pkt->naddr && nfree; i++) {
+			qib_user_sdma_send_frag(ppd, pkt, i, ofs, tail, gen);
+			ofs += pkt->addr[i].length >> 2;
+
+			if (++tail == ppd->sdma_descq_cnt) {
+				tail = 0;
+				++gen;
+				ppd->sdma_intrequest = 1;
+			} else if (tail == (ppd->sdma_descq_cnt>>1)) {
+				ppd->sdma_intrequest = 1;
+			}
+			nfree--;
+			if (pkt->addr[i].last_desc == 0)
+				continue;
+
+			/*
+			 * If the packet is >= 2KB mtu equivalent, we
+			 * have to use the large buffers, and have to
+			 * mark each descriptor as part of a large
+			 * buffer packet.
+			 */
+			if (ofs > dd->piosize2kmax_dwords) {
+				for (j = pkt->index; j <= i; j++) {
+					ppd->sdma_descq[dtail].qw[0] |=
+						cpu_to_le64(1ULL << 14);
+					if (++dtail == ppd->sdma_descq_cnt)
+						dtail = 0;
+				}
+			}
+			c += i + 1 - pkt->index;
+			pkt->index = i + 1; /* index for next first */
+			tail_c = dtail = tail;
+			gen_c = gen;
+			ofs = 0;  /* reset for next packet */
+		}
+
+		ppd->sdma_descq_added += c;
+		nsent += c;
+		if (pkt->index == pkt->naddr) {
+			pkt->added = ppd->sdma_descq_added;
+			pkt->pq->added = pkt->added;
+			pkt->pq->num_pending--;
+			spin_lock(&pkt->pq->sent_lock);
+			pkt->pq->num_sending++;
+			list_move_tail(&pkt->list, &pkt->pq->sent);
+			spin_unlock(&pkt->pq->sent_lock);
+		}
+		if (!nfree || (nsent<<2) > ppd->sdma_descq_cnt)
+			break;
+	}
+
+	/* advance the tail on the chip if necessary */
+	if (ppd->sdma_descq_tail != tail_c) {
+		ppd->sdma_generation = gen_c;
+		dd->f_sdma_update_tail(ppd, tail_c);
+	}
+
+	if (nfree && !list_empty(pktlist))
+		goto retry;
+
+	return;
+}
+
+/* pq->lock must be held, get packets on the wire... */
+static int qib_user_sdma_push_pkts(struct qib_pportdata *ppd,
+				 struct qib_user_sdma_queue *pq,
+				 struct list_head *pktlist, int count)
+{
+	unsigned long flags;
+
+	if (unlikely(!(ppd->lflags & QIBL_LINKACTIVE)))
+		return -ECOMM;
+
+	/* non-blocking mode */
+	if (pq->sdma_rb_node->refcount > 1) {
+		spin_lock_irqsave(&ppd->sdma_lock, flags);
+		if (unlikely(!__qib_sdma_running(ppd))) {
+			spin_unlock_irqrestore(&ppd->sdma_lock, flags);
+			return -ECOMM;
+		}
+		pq->num_pending += count;
+		list_splice_tail_init(pktlist, &ppd->sdma_userpending);
+		qib_user_sdma_send_desc(ppd, &ppd->sdma_userpending);
+		spin_unlock_irqrestore(&ppd->sdma_lock, flags);
+		return 0;
+	}
+
+	/* In this case, descriptors from this process are not
+	 * linked to ppd pending queue, interrupt handler
+	 * won't update this process, it is OK to directly
+	 * modify without sdma lock.
+	 */
+
+
+	pq->num_pending += count;
+	/*
+	 * Blocking mode for single rail process, we must
+	 * release/regain sdma_lock to give other process
+	 * chance to make progress. This is important for
+	 * performance.
+	 */
+	do {
+		spin_lock_irqsave(&ppd->sdma_lock, flags);
+		if (unlikely(!__qib_sdma_running(ppd))) {
+			spin_unlock_irqrestore(&ppd->sdma_lock, flags);
+			return -ECOMM;
+		}
+		qib_user_sdma_send_desc(ppd, pktlist);
+		if (!list_empty(pktlist))
+			qib_sdma_make_progress(ppd);
+		spin_unlock_irqrestore(&ppd->sdma_lock, flags);
+	} while (!list_empty(pktlist));
+
+	return 0;
+}
+
+int qib_user_sdma_writev(struct qib_ctxtdata *rcd,
+			 struct qib_user_sdma_queue *pq,
+			 const struct iovec *iov,
+			 unsigned long dim)
+{
+	struct qib_devdata *dd = rcd->dd;
+	struct qib_pportdata *ppd = rcd->ppd;
+	int ret = 0;
+	struct list_head list;
+	int npkts = 0;
+
+	INIT_LIST_HEAD(&list);
+
+	mutex_lock(&pq->lock);
+
+	/* why not -ECOMM like qib_user_sdma_push_pkts() below? */
+	if (!qib_sdma_running(ppd))
+		goto done_unlock;
+
+	/* if I have packets not complete yet */
+	if (pq->added > ppd->sdma_descq_removed)
+		qib_user_sdma_hwqueue_clean(ppd);
+	/* if I have complete packets to be freed */
+	if (pq->num_sending)
+		qib_user_sdma_queue_clean(ppd, pq);
+
+	while (dim) {
+		int mxp = 1;
+		int ndesc = 0;
+
+		ret = qib_user_sdma_queue_pkts(dd, ppd, pq,
+				iov, dim, &list, &mxp, &ndesc);
+		if (ret < 0)
+			goto done_unlock;
+		else {
+			dim -= ret;
+			iov += ret;
+		}
+
+		/* force packets onto the sdma hw queue... */
+		if (!list_empty(&list)) {
+			/*
+			 * Lazily clean hw queue.
+			 */
+			if (qib_sdma_descq_freecnt(ppd) < ndesc) {
+				qib_user_sdma_hwqueue_clean(ppd);
+				if (pq->num_sending)
+					qib_user_sdma_queue_clean(ppd, pq);
+			}
+
+			ret = qib_user_sdma_push_pkts(ppd, pq, &list, mxp);
+			if (ret < 0)
+				goto done_unlock;
+			else {
+				npkts += mxp;
+				pq->counter += mxp;
+			}
+		}
+	}
+
+done_unlock:
+	if (!list_empty(&list))
+		qib_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, &list);
+	mutex_unlock(&pq->lock);
+
+	return (ret < 0) ? ret : npkts;
+}
+
+int qib_user_sdma_make_progress(struct qib_pportdata *ppd,
+				struct qib_user_sdma_queue *pq)
+{
+	int ret = 0;
+
+	mutex_lock(&pq->lock);
+	qib_user_sdma_hwqueue_clean(ppd);
+	ret = qib_user_sdma_queue_clean(ppd, pq);
+	mutex_unlock(&pq->lock);
+
+	return ret;
+}
+
+u32 qib_user_sdma_complete_counter(const struct qib_user_sdma_queue *pq)
+{
+	return pq ? pq->sent_counter : 0;
+}
+
+u32 qib_user_sdma_inflight_counter(struct qib_user_sdma_queue *pq)
+{
+	return pq ? pq->counter : 0;
+}
diff --git a/drivers/infiniband/hw/qib/qib_user_sdma.h b/drivers/infiniband/hw/qib/qib_user_sdma.h
new file mode 100644
index 0000000..ce8cbaf
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_user_sdma.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2007, 2008 QLogic Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/device.h>
+
+struct qib_user_sdma_queue;
+
+struct qib_user_sdma_queue *
+qib_user_sdma_queue_create(struct device *dev, int unit, int port, int sport);
+void qib_user_sdma_queue_destroy(struct qib_user_sdma_queue *pq);
+
+int qib_user_sdma_writev(struct qib_ctxtdata *pd,
+			 struct qib_user_sdma_queue *pq,
+			 const struct iovec *iov,
+			 unsigned long dim);
+
+int qib_user_sdma_make_progress(struct qib_pportdata *ppd,
+				struct qib_user_sdma_queue *pq);
+
+void qib_user_sdma_queue_drain(struct qib_pportdata *ppd,
+			       struct qib_user_sdma_queue *pq);
+
+u32 qib_user_sdma_complete_counter(const struct qib_user_sdma_queue *pq);
+u32 qib_user_sdma_inflight_counter(struct qib_user_sdma_queue *pq);
diff --git a/drivers/infiniband/hw/qib/qib_verbs.c b/drivers/infiniband/hw/qib/qib_verbs.c
new file mode 100644
index 0000000..9bcfbd8
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_verbs.c
@@ -0,0 +1,2336 @@
+/*
+ * Copyright (c) 2012, 2013 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <rdma/ib_mad.h>
+#include <rdma/ib_user_verbs.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/utsname.h>
+#include <linux/rculist.h>
+#include <linux/mm.h>
+#include <linux/random.h>
+
+#include "qib.h"
+#include "qib_common.h"
+
+static unsigned int ib_qib_qp_table_size = 256;
+module_param_named(qp_table_size, ib_qib_qp_table_size, uint, S_IRUGO);
+MODULE_PARM_DESC(qp_table_size, "QP table size");
+
+unsigned int ib_qib_lkey_table_size = 16;
+module_param_named(lkey_table_size, ib_qib_lkey_table_size, uint,
+		   S_IRUGO);
+MODULE_PARM_DESC(lkey_table_size,
+		 "LKEY table size in bits (2^n, 1 <= n <= 23)");
+
+static unsigned int ib_qib_max_pds = 0xFFFF;
+module_param_named(max_pds, ib_qib_max_pds, uint, S_IRUGO);
+MODULE_PARM_DESC(max_pds,
+		 "Maximum number of protection domains to support");
+
+static unsigned int ib_qib_max_ahs = 0xFFFF;
+module_param_named(max_ahs, ib_qib_max_ahs, uint, S_IRUGO);
+MODULE_PARM_DESC(max_ahs, "Maximum number of address handles to support");
+
+unsigned int ib_qib_max_cqes = 0x2FFFF;
+module_param_named(max_cqes, ib_qib_max_cqes, uint, S_IRUGO);
+MODULE_PARM_DESC(max_cqes,
+		 "Maximum number of completion queue entries to support");
+
+unsigned int ib_qib_max_cqs = 0x1FFFF;
+module_param_named(max_cqs, ib_qib_max_cqs, uint, S_IRUGO);
+MODULE_PARM_DESC(max_cqs, "Maximum number of completion queues to support");
+
+unsigned int ib_qib_max_qp_wrs = 0x3FFF;
+module_param_named(max_qp_wrs, ib_qib_max_qp_wrs, uint, S_IRUGO);
+MODULE_PARM_DESC(max_qp_wrs, "Maximum number of QP WRs to support");
+
+unsigned int ib_qib_max_qps = 16384;
+module_param_named(max_qps, ib_qib_max_qps, uint, S_IRUGO);
+MODULE_PARM_DESC(max_qps, "Maximum number of QPs to support");
+
+unsigned int ib_qib_max_sges = 0x60;
+module_param_named(max_sges, ib_qib_max_sges, uint, S_IRUGO);
+MODULE_PARM_DESC(max_sges, "Maximum number of SGEs to support");
+
+unsigned int ib_qib_max_mcast_grps = 16384;
+module_param_named(max_mcast_grps, ib_qib_max_mcast_grps, uint, S_IRUGO);
+MODULE_PARM_DESC(max_mcast_grps,
+		 "Maximum number of multicast groups to support");
+
+unsigned int ib_qib_max_mcast_qp_attached = 16;
+module_param_named(max_mcast_qp_attached, ib_qib_max_mcast_qp_attached,
+		   uint, S_IRUGO);
+MODULE_PARM_DESC(max_mcast_qp_attached,
+		 "Maximum number of attached QPs to support");
+
+unsigned int ib_qib_max_srqs = 1024;
+module_param_named(max_srqs, ib_qib_max_srqs, uint, S_IRUGO);
+MODULE_PARM_DESC(max_srqs, "Maximum number of SRQs to support");
+
+unsigned int ib_qib_max_srq_sges = 128;
+module_param_named(max_srq_sges, ib_qib_max_srq_sges, uint, S_IRUGO);
+MODULE_PARM_DESC(max_srq_sges, "Maximum number of SRQ SGEs to support");
+
+unsigned int ib_qib_max_srq_wrs = 0x1FFFF;
+module_param_named(max_srq_wrs, ib_qib_max_srq_wrs, uint, S_IRUGO);
+MODULE_PARM_DESC(max_srq_wrs, "Maximum number of SRQ WRs support");
+
+static unsigned int ib_qib_disable_sma;
+module_param_named(disable_sma, ib_qib_disable_sma, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(disable_sma, "Disable the SMA");
+
+/*
+ * Note that it is OK to post send work requests in the SQE and ERR
+ * states; qib_do_send() will process them and generate error
+ * completions as per IB 1.2 C10-96.
+ */
+const int ib_qib_state_ops[IB_QPS_ERR + 1] = {
+	[IB_QPS_RESET] = 0,
+	[IB_QPS_INIT] = QIB_POST_RECV_OK,
+	[IB_QPS_RTR] = QIB_POST_RECV_OK | QIB_PROCESS_RECV_OK,
+	[IB_QPS_RTS] = QIB_POST_RECV_OK | QIB_PROCESS_RECV_OK |
+	    QIB_POST_SEND_OK | QIB_PROCESS_SEND_OK |
+	    QIB_PROCESS_NEXT_SEND_OK,
+	[IB_QPS_SQD] = QIB_POST_RECV_OK | QIB_PROCESS_RECV_OK |
+	    QIB_POST_SEND_OK | QIB_PROCESS_SEND_OK,
+	[IB_QPS_SQE] = QIB_POST_RECV_OK | QIB_PROCESS_RECV_OK |
+	    QIB_POST_SEND_OK | QIB_FLUSH_SEND,
+	[IB_QPS_ERR] = QIB_POST_RECV_OK | QIB_FLUSH_RECV |
+	    QIB_POST_SEND_OK | QIB_FLUSH_SEND,
+};
+
+struct qib_ucontext {
+	struct ib_ucontext ibucontext;
+};
+
+static inline struct qib_ucontext *to_iucontext(struct ib_ucontext
+						  *ibucontext)
+{
+	return container_of(ibucontext, struct qib_ucontext, ibucontext);
+}
+
+/*
+ * Translate ib_wr_opcode into ib_wc_opcode.
+ */
+const enum ib_wc_opcode ib_qib_wc_opcode[] = {
+	[IB_WR_RDMA_WRITE] = IB_WC_RDMA_WRITE,
+	[IB_WR_RDMA_WRITE_WITH_IMM] = IB_WC_RDMA_WRITE,
+	[IB_WR_SEND] = IB_WC_SEND,
+	[IB_WR_SEND_WITH_IMM] = IB_WC_SEND,
+	[IB_WR_RDMA_READ] = IB_WC_RDMA_READ,
+	[IB_WR_ATOMIC_CMP_AND_SWP] = IB_WC_COMP_SWAP,
+	[IB_WR_ATOMIC_FETCH_AND_ADD] = IB_WC_FETCH_ADD
+};
+
+/*
+ * System image GUID.
+ */
+__be64 ib_qib_sys_image_guid;
+
+/**
+ * qib_copy_sge - copy data to SGE memory
+ * @ss: the SGE state
+ * @data: the data to copy
+ * @length: the length of the data
+ */
+void qib_copy_sge(struct qib_sge_state *ss, void *data, u32 length, int release)
+{
+	struct qib_sge *sge = &ss->sge;
+
+	while (length) {
+		u32 len = sge->length;
+
+		if (len > length)
+			len = length;
+		if (len > sge->sge_length)
+			len = sge->sge_length;
+		BUG_ON(len == 0);
+		memcpy(sge->vaddr, data, len);
+		sge->vaddr += len;
+		sge->length -= len;
+		sge->sge_length -= len;
+		if (sge->sge_length == 0) {
+			if (release)
+				qib_put_mr(sge->mr);
+			if (--ss->num_sge)
+				*sge = *ss->sg_list++;
+		} else if (sge->length == 0 && sge->mr->lkey) {
+			if (++sge->n >= QIB_SEGSZ) {
+				if (++sge->m >= sge->mr->mapsz)
+					break;
+				sge->n = 0;
+			}
+			sge->vaddr =
+				sge->mr->map[sge->m]->segs[sge->n].vaddr;
+			sge->length =
+				sge->mr->map[sge->m]->segs[sge->n].length;
+		}
+		data += len;
+		length -= len;
+	}
+}
+
+/**
+ * qib_skip_sge - skip over SGE memory - XXX almost dup of prev func
+ * @ss: the SGE state
+ * @length: the number of bytes to skip
+ */
+void qib_skip_sge(struct qib_sge_state *ss, u32 length, int release)
+{
+	struct qib_sge *sge = &ss->sge;
+
+	while (length) {
+		u32 len = sge->length;
+
+		if (len > length)
+			len = length;
+		if (len > sge->sge_length)
+			len = sge->sge_length;
+		BUG_ON(len == 0);
+		sge->vaddr += len;
+		sge->length -= len;
+		sge->sge_length -= len;
+		if (sge->sge_length == 0) {
+			if (release)
+				qib_put_mr(sge->mr);
+			if (--ss->num_sge)
+				*sge = *ss->sg_list++;
+		} else if (sge->length == 0 && sge->mr->lkey) {
+			if (++sge->n >= QIB_SEGSZ) {
+				if (++sge->m >= sge->mr->mapsz)
+					break;
+				sge->n = 0;
+			}
+			sge->vaddr =
+				sge->mr->map[sge->m]->segs[sge->n].vaddr;
+			sge->length =
+				sge->mr->map[sge->m]->segs[sge->n].length;
+		}
+		length -= len;
+	}
+}
+
+/*
+ * Count the number of DMA descriptors needed to send length bytes of data.
+ * Don't modify the qib_sge_state to get the count.
+ * Return zero if any of the segments is not aligned.
+ */
+static u32 qib_count_sge(struct qib_sge_state *ss, u32 length)
+{
+	struct qib_sge *sg_list = ss->sg_list;
+	struct qib_sge sge = ss->sge;
+	u8 num_sge = ss->num_sge;
+	u32 ndesc = 1;  /* count the header */
+
+	while (length) {
+		u32 len = sge.length;
+
+		if (len > length)
+			len = length;
+		if (len > sge.sge_length)
+			len = sge.sge_length;
+		BUG_ON(len == 0);
+		if (((long) sge.vaddr & (sizeof(u32) - 1)) ||
+		    (len != length && (len & (sizeof(u32) - 1)))) {
+			ndesc = 0;
+			break;
+		}
+		ndesc++;
+		sge.vaddr += len;
+		sge.length -= len;
+		sge.sge_length -= len;
+		if (sge.sge_length == 0) {
+			if (--num_sge)
+				sge = *sg_list++;
+		} else if (sge.length == 0 && sge.mr->lkey) {
+			if (++sge.n >= QIB_SEGSZ) {
+				if (++sge.m >= sge.mr->mapsz)
+					break;
+				sge.n = 0;
+			}
+			sge.vaddr =
+				sge.mr->map[sge.m]->segs[sge.n].vaddr;
+			sge.length =
+				sge.mr->map[sge.m]->segs[sge.n].length;
+		}
+		length -= len;
+	}
+	return ndesc;
+}
+
+/*
+ * Copy from the SGEs to the data buffer.
+ */
+static void qib_copy_from_sge(void *data, struct qib_sge_state *ss, u32 length)
+{
+	struct qib_sge *sge = &ss->sge;
+
+	while (length) {
+		u32 len = sge->length;
+
+		if (len > length)
+			len = length;
+		if (len > sge->sge_length)
+			len = sge->sge_length;
+		BUG_ON(len == 0);
+		memcpy(data, sge->vaddr, len);
+		sge->vaddr += len;
+		sge->length -= len;
+		sge->sge_length -= len;
+		if (sge->sge_length == 0) {
+			if (--ss->num_sge)
+				*sge = *ss->sg_list++;
+		} else if (sge->length == 0 && sge->mr->lkey) {
+			if (++sge->n >= QIB_SEGSZ) {
+				if (++sge->m >= sge->mr->mapsz)
+					break;
+				sge->n = 0;
+			}
+			sge->vaddr =
+				sge->mr->map[sge->m]->segs[sge->n].vaddr;
+			sge->length =
+				sge->mr->map[sge->m]->segs[sge->n].length;
+		}
+		data += len;
+		length -= len;
+	}
+}
+
+/**
+ * qib_post_one_send - post one RC, UC, or UD send work request
+ * @qp: the QP to post on
+ * @wr: the work request to send
+ */
+static int qib_post_one_send(struct qib_qp *qp, struct ib_send_wr *wr,
+	int *scheduled)
+{
+	struct qib_swqe *wqe;
+	u32 next;
+	int i;
+	int j;
+	int acc;
+	int ret;
+	unsigned long flags;
+	struct qib_lkey_table *rkt;
+	struct qib_pd *pd;
+
+	spin_lock_irqsave(&qp->s_lock, flags);
+
+	/* Check that state is OK to post send. */
+	if (unlikely(!(ib_qib_state_ops[qp->state] & QIB_POST_SEND_OK)))
+		goto bail_inval;
+
+	/* IB spec says that num_sge == 0 is OK. */
+	if (wr->num_sge > qp->s_max_sge)
+		goto bail_inval;
+
+	/*
+	 * Don't allow RDMA reads or atomic operations on UC or
+	 * undefined operations.
+	 * Make sure buffer is large enough to hold the result for atomics.
+	 */
+	if (wr->opcode == IB_WR_FAST_REG_MR) {
+		if (qib_fast_reg_mr(qp, wr))
+			goto bail_inval;
+	} else if (qp->ibqp.qp_type == IB_QPT_UC) {
+		if ((unsigned) wr->opcode >= IB_WR_RDMA_READ)
+			goto bail_inval;
+	} else if (qp->ibqp.qp_type != IB_QPT_RC) {
+		/* Check IB_QPT_SMI, IB_QPT_GSI, IB_QPT_UD opcode */
+		if (wr->opcode != IB_WR_SEND &&
+		    wr->opcode != IB_WR_SEND_WITH_IMM)
+			goto bail_inval;
+		/* Check UD destination address PD */
+		if (qp->ibqp.pd != wr->wr.ud.ah->pd)
+			goto bail_inval;
+	} else if ((unsigned) wr->opcode > IB_WR_ATOMIC_FETCH_AND_ADD)
+		goto bail_inval;
+	else if (wr->opcode >= IB_WR_ATOMIC_CMP_AND_SWP &&
+		   (wr->num_sge == 0 ||
+		    wr->sg_list[0].length < sizeof(u64) ||
+		    wr->sg_list[0].addr & (sizeof(u64) - 1)))
+		goto bail_inval;
+	else if (wr->opcode >= IB_WR_RDMA_READ && !qp->s_max_rd_atomic)
+		goto bail_inval;
+
+	next = qp->s_head + 1;
+	if (next >= qp->s_size)
+		next = 0;
+	if (next == qp->s_last) {
+		ret = -ENOMEM;
+		goto bail;
+	}
+
+	rkt = &to_idev(qp->ibqp.device)->lk_table;
+	pd = to_ipd(qp->ibqp.pd);
+	wqe = get_swqe_ptr(qp, qp->s_head);
+	wqe->wr = *wr;
+	wqe->length = 0;
+	j = 0;
+	if (wr->num_sge) {
+		acc = wr->opcode >= IB_WR_RDMA_READ ?
+			IB_ACCESS_LOCAL_WRITE : 0;
+		for (i = 0; i < wr->num_sge; i++) {
+			u32 length = wr->sg_list[i].length;
+			int ok;
+
+			if (length == 0)
+				continue;
+			ok = qib_lkey_ok(rkt, pd, &wqe->sg_list[j],
+					 &wr->sg_list[i], acc);
+			if (!ok)
+				goto bail_inval_free;
+			wqe->length += length;
+			j++;
+		}
+		wqe->wr.num_sge = j;
+	}
+	if (qp->ibqp.qp_type == IB_QPT_UC ||
+	    qp->ibqp.qp_type == IB_QPT_RC) {
+		if (wqe->length > 0x80000000U)
+			goto bail_inval_free;
+	} else if (wqe->length > (dd_from_ibdev(qp->ibqp.device)->pport +
+				  qp->port_num - 1)->ibmtu)
+		goto bail_inval_free;
+	else
+		atomic_inc(&to_iah(wr->wr.ud.ah)->refcount);
+	wqe->ssn = qp->s_ssn++;
+	qp->s_head = next;
+
+	ret = 0;
+	goto bail;
+
+bail_inval_free:
+	while (j) {
+		struct qib_sge *sge = &wqe->sg_list[--j];
+
+		qib_put_mr(sge->mr);
+	}
+bail_inval:
+	ret = -EINVAL;
+bail:
+	if (!ret && !wr->next &&
+	 !qib_sdma_empty(
+	   dd_from_ibdev(qp->ibqp.device)->pport + qp->port_num - 1)) {
+		qib_schedule_send(qp);
+		*scheduled = 1;
+	}
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+	return ret;
+}
+
+/**
+ * qib_post_send - post a send on a QP
+ * @ibqp: the QP to post the send on
+ * @wr: the list of work requests to post
+ * @bad_wr: the first bad WR is put here
+ *
+ * This may be called from interrupt context.
+ */
+static int qib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+			 struct ib_send_wr **bad_wr)
+{
+	struct qib_qp *qp = to_iqp(ibqp);
+	int err = 0;
+	int scheduled = 0;
+
+	for (; wr; wr = wr->next) {
+		err = qib_post_one_send(qp, wr, &scheduled);
+		if (err) {
+			*bad_wr = wr;
+			goto bail;
+		}
+	}
+
+	/* Try to do the send work in the caller's context. */
+	if (!scheduled)
+		qib_do_send(&qp->s_work);
+
+bail:
+	return err;
+}
+
+/**
+ * qib_post_receive - post a receive on a QP
+ * @ibqp: the QP to post the receive on
+ * @wr: the WR to post
+ * @bad_wr: the first bad WR is put here
+ *
+ * This may be called from interrupt context.
+ */
+static int qib_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+			    struct ib_recv_wr **bad_wr)
+{
+	struct qib_qp *qp = to_iqp(ibqp);
+	struct qib_rwq *wq = qp->r_rq.wq;
+	unsigned long flags;
+	int ret;
+
+	/* Check that state is OK to post receive. */
+	if (!(ib_qib_state_ops[qp->state] & QIB_POST_RECV_OK) || !wq) {
+		*bad_wr = wr;
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	for (; wr; wr = wr->next) {
+		struct qib_rwqe *wqe;
+		u32 next;
+		int i;
+
+		if ((unsigned) wr->num_sge > qp->r_rq.max_sge) {
+			*bad_wr = wr;
+			ret = -EINVAL;
+			goto bail;
+		}
+
+		spin_lock_irqsave(&qp->r_rq.lock, flags);
+		next = wq->head + 1;
+		if (next >= qp->r_rq.size)
+			next = 0;
+		if (next == wq->tail) {
+			spin_unlock_irqrestore(&qp->r_rq.lock, flags);
+			*bad_wr = wr;
+			ret = -ENOMEM;
+			goto bail;
+		}
+
+		wqe = get_rwqe_ptr(&qp->r_rq, wq->head);
+		wqe->wr_id = wr->wr_id;
+		wqe->num_sge = wr->num_sge;
+		for (i = 0; i < wr->num_sge; i++)
+			wqe->sg_list[i] = wr->sg_list[i];
+		/* Make sure queue entry is written before the head index. */
+		smp_wmb();
+		wq->head = next;
+		spin_unlock_irqrestore(&qp->r_rq.lock, flags);
+	}
+	ret = 0;
+
+bail:
+	return ret;
+}
+
+/**
+ * qib_qp_rcv - processing an incoming packet on a QP
+ * @rcd: the context pointer
+ * @hdr: the packet header
+ * @has_grh: true if the packet has a GRH
+ * @data: the packet data
+ * @tlen: the packet length
+ * @qp: the QP the packet came on
+ *
+ * This is called from qib_ib_rcv() to process an incoming packet
+ * for the given QP.
+ * Called at interrupt level.
+ */
+static void qib_qp_rcv(struct qib_ctxtdata *rcd, struct qib_ib_header *hdr,
+		       int has_grh, void *data, u32 tlen, struct qib_qp *qp)
+{
+	struct qib_ibport *ibp = &rcd->ppd->ibport_data;
+
+	spin_lock(&qp->r_lock);
+
+	/* Check for valid receive state. */
+	if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK)) {
+		ibp->n_pkt_drops++;
+		goto unlock;
+	}
+
+	switch (qp->ibqp.qp_type) {
+	case IB_QPT_SMI:
+	case IB_QPT_GSI:
+		if (ib_qib_disable_sma)
+			break;
+		/* FALLTHROUGH */
+	case IB_QPT_UD:
+		qib_ud_rcv(ibp, hdr, has_grh, data, tlen, qp);
+		break;
+
+	case IB_QPT_RC:
+		qib_rc_rcv(rcd, hdr, has_grh, data, tlen, qp);
+		break;
+
+	case IB_QPT_UC:
+		qib_uc_rcv(ibp, hdr, has_grh, data, tlen, qp);
+		break;
+
+	default:
+		break;
+	}
+
+unlock:
+	spin_unlock(&qp->r_lock);
+}
+
+/**
+ * qib_ib_rcv - process an incoming packet
+ * @rcd: the context pointer
+ * @rhdr: the header of the packet
+ * @data: the packet payload
+ * @tlen: the packet length
+ *
+ * This is called from qib_kreceive() to process an incoming packet at
+ * interrupt level. Tlen is the length of the header + data + CRC in bytes.
+ */
+void qib_ib_rcv(struct qib_ctxtdata *rcd, void *rhdr, void *data, u32 tlen)
+{
+	struct qib_pportdata *ppd = rcd->ppd;
+	struct qib_ibport *ibp = &ppd->ibport_data;
+	struct qib_ib_header *hdr = rhdr;
+	struct qib_other_headers *ohdr;
+	struct qib_qp *qp;
+	u32 qp_num;
+	int lnh;
+	u8 opcode;
+	u16 lid;
+
+	/* 24 == LRH+BTH+CRC */
+	if (unlikely(tlen < 24))
+		goto drop;
+
+	/* Check for a valid destination LID (see ch. 7.11.1). */
+	lid = be16_to_cpu(hdr->lrh[1]);
+	if (lid < QIB_MULTICAST_LID_BASE) {
+		lid &= ~((1 << ppd->lmc) - 1);
+		if (unlikely(lid != ppd->lid))
+			goto drop;
+	}
+
+	/* Check for GRH */
+	lnh = be16_to_cpu(hdr->lrh[0]) & 3;
+	if (lnh == QIB_LRH_BTH)
+		ohdr = &hdr->u.oth;
+	else if (lnh == QIB_LRH_GRH) {
+		u32 vtf;
+
+		ohdr = &hdr->u.l.oth;
+		if (hdr->u.l.grh.next_hdr != IB_GRH_NEXT_HDR)
+			goto drop;
+		vtf = be32_to_cpu(hdr->u.l.grh.version_tclass_flow);
+		if ((vtf >> IB_GRH_VERSION_SHIFT) != IB_GRH_VERSION)
+			goto drop;
+	} else
+		goto drop;
+
+	opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0x7f;
+#ifdef CONFIG_DEBUG_FS
+	rcd->opstats->stats[opcode].n_bytes += tlen;
+	rcd->opstats->stats[opcode].n_packets++;
+#endif
+
+	/* Get the destination QP number. */
+	qp_num = be32_to_cpu(ohdr->bth[1]) & QIB_QPN_MASK;
+	if (qp_num == QIB_MULTICAST_QPN) {
+		struct qib_mcast *mcast;
+		struct qib_mcast_qp *p;
+
+		if (lnh != QIB_LRH_GRH)
+			goto drop;
+		mcast = qib_mcast_find(ibp, &hdr->u.l.grh.dgid);
+		if (mcast == NULL)
+			goto drop;
+		this_cpu_inc(ibp->pmastats->n_multicast_rcv);
+		list_for_each_entry_rcu(p, &mcast->qp_list, list)
+			qib_qp_rcv(rcd, hdr, 1, data, tlen, p->qp);
+		/*
+		 * Notify qib_multicast_detach() if it is waiting for us
+		 * to finish.
+		 */
+		if (atomic_dec_return(&mcast->refcount) <= 1)
+			wake_up(&mcast->wait);
+	} else {
+		if (rcd->lookaside_qp) {
+			if (rcd->lookaside_qpn != qp_num) {
+				if (atomic_dec_and_test(
+					&rcd->lookaside_qp->refcount))
+					wake_up(
+					 &rcd->lookaside_qp->wait);
+				rcd->lookaside_qp = NULL;
+			}
+		}
+		if (!rcd->lookaside_qp) {
+			qp = qib_lookup_qpn(ibp, qp_num);
+			if (!qp)
+				goto drop;
+			rcd->lookaside_qp = qp;
+			rcd->lookaside_qpn = qp_num;
+		} else
+			qp = rcd->lookaside_qp;
+		this_cpu_inc(ibp->pmastats->n_unicast_rcv);
+		qib_qp_rcv(rcd, hdr, lnh == QIB_LRH_GRH, data, tlen, qp);
+	}
+	return;
+
+drop:
+	ibp->n_pkt_drops++;
+}
+
+/*
+ * This is called from a timer to check for QPs
+ * which need kernel memory in order to send a packet.
+ */
+static void mem_timer(unsigned long data)
+{
+	struct qib_ibdev *dev = (struct qib_ibdev *) data;
+	struct list_head *list = &dev->memwait;
+	struct qib_qp *qp = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&dev->pending_lock, flags);
+	if (!list_empty(list)) {
+		qp = list_entry(list->next, struct qib_qp, iowait);
+		list_del_init(&qp->iowait);
+		atomic_inc(&qp->refcount);
+		if (!list_empty(list))
+			mod_timer(&dev->mem_timer, jiffies + 1);
+	}
+	spin_unlock_irqrestore(&dev->pending_lock, flags);
+
+	if (qp) {
+		spin_lock_irqsave(&qp->s_lock, flags);
+		if (qp->s_flags & QIB_S_WAIT_KMEM) {
+			qp->s_flags &= ~QIB_S_WAIT_KMEM;
+			qib_schedule_send(qp);
+		}
+		spin_unlock_irqrestore(&qp->s_lock, flags);
+		if (atomic_dec_and_test(&qp->refcount))
+			wake_up(&qp->wait);
+	}
+}
+
+static void update_sge(struct qib_sge_state *ss, u32 length)
+{
+	struct qib_sge *sge = &ss->sge;
+
+	sge->vaddr += length;
+	sge->length -= length;
+	sge->sge_length -= length;
+	if (sge->sge_length == 0) {
+		if (--ss->num_sge)
+			*sge = *ss->sg_list++;
+	} else if (sge->length == 0 && sge->mr->lkey) {
+		if (++sge->n >= QIB_SEGSZ) {
+			if (++sge->m >= sge->mr->mapsz)
+				return;
+			sge->n = 0;
+		}
+		sge->vaddr = sge->mr->map[sge->m]->segs[sge->n].vaddr;
+		sge->length = sge->mr->map[sge->m]->segs[sge->n].length;
+	}
+}
+
+#ifdef __LITTLE_ENDIAN
+static inline u32 get_upper_bits(u32 data, u32 shift)
+{
+	return data >> shift;
+}
+
+static inline u32 set_upper_bits(u32 data, u32 shift)
+{
+	return data << shift;
+}
+
+static inline u32 clear_upper_bytes(u32 data, u32 n, u32 off)
+{
+	data <<= ((sizeof(u32) - n) * BITS_PER_BYTE);
+	data >>= ((sizeof(u32) - n - off) * BITS_PER_BYTE);
+	return data;
+}
+#else
+static inline u32 get_upper_bits(u32 data, u32 shift)
+{
+	return data << shift;
+}
+
+static inline u32 set_upper_bits(u32 data, u32 shift)
+{
+	return data >> shift;
+}
+
+static inline u32 clear_upper_bytes(u32 data, u32 n, u32 off)
+{
+	data >>= ((sizeof(u32) - n) * BITS_PER_BYTE);
+	data <<= ((sizeof(u32) - n - off) * BITS_PER_BYTE);
+	return data;
+}
+#endif
+
+static void copy_io(u32 __iomem *piobuf, struct qib_sge_state *ss,
+		    u32 length, unsigned flush_wc)
+{
+	u32 extra = 0;
+	u32 data = 0;
+	u32 last;
+
+	while (1) {
+		u32 len = ss->sge.length;
+		u32 off;
+
+		if (len > length)
+			len = length;
+		if (len > ss->sge.sge_length)
+			len = ss->sge.sge_length;
+		BUG_ON(len == 0);
+		/* If the source address is not aligned, try to align it. */
+		off = (unsigned long)ss->sge.vaddr & (sizeof(u32) - 1);
+		if (off) {
+			u32 *addr = (u32 *)((unsigned long)ss->sge.vaddr &
+					    ~(sizeof(u32) - 1));
+			u32 v = get_upper_bits(*addr, off * BITS_PER_BYTE);
+			u32 y;
+
+			y = sizeof(u32) - off;
+			if (len > y)
+				len = y;
+			if (len + extra >= sizeof(u32)) {
+				data |= set_upper_bits(v, extra *
+						       BITS_PER_BYTE);
+				len = sizeof(u32) - extra;
+				if (len == length) {
+					last = data;
+					break;
+				}
+				__raw_writel(data, piobuf);
+				piobuf++;
+				extra = 0;
+				data = 0;
+			} else {
+				/* Clear unused upper bytes */
+				data |= clear_upper_bytes(v, len, extra);
+				if (len == length) {
+					last = data;
+					break;
+				}
+				extra += len;
+			}
+		} else if (extra) {
+			/* Source address is aligned. */
+			u32 *addr = (u32 *) ss->sge.vaddr;
+			int shift = extra * BITS_PER_BYTE;
+			int ushift = 32 - shift;
+			u32 l = len;
+
+			while (l >= sizeof(u32)) {
+				u32 v = *addr;
+
+				data |= set_upper_bits(v, shift);
+				__raw_writel(data, piobuf);
+				data = get_upper_bits(v, ushift);
+				piobuf++;
+				addr++;
+				l -= sizeof(u32);
+			}
+			/*
+			 * We still have 'extra' number of bytes leftover.
+			 */
+			if (l) {
+				u32 v = *addr;
+
+				if (l + extra >= sizeof(u32)) {
+					data |= set_upper_bits(v, shift);
+					len -= l + extra - sizeof(u32);
+					if (len == length) {
+						last = data;
+						break;
+					}
+					__raw_writel(data, piobuf);
+					piobuf++;
+					extra = 0;
+					data = 0;
+				} else {
+					/* Clear unused upper bytes */
+					data |= clear_upper_bytes(v, l, extra);
+					if (len == length) {
+						last = data;
+						break;
+					}
+					extra += l;
+				}
+			} else if (len == length) {
+				last = data;
+				break;
+			}
+		} else if (len == length) {
+			u32 w;
+
+			/*
+			 * Need to round up for the last dword in the
+			 * packet.
+			 */
+			w = (len + 3) >> 2;
+			qib_pio_copy(piobuf, ss->sge.vaddr, w - 1);
+			piobuf += w - 1;
+			last = ((u32 *) ss->sge.vaddr)[w - 1];
+			break;
+		} else {
+			u32 w = len >> 2;
+
+			qib_pio_copy(piobuf, ss->sge.vaddr, w);
+			piobuf += w;
+
+			extra = len & (sizeof(u32) - 1);
+			if (extra) {
+				u32 v = ((u32 *) ss->sge.vaddr)[w];
+
+				/* Clear unused upper bytes */
+				data = clear_upper_bytes(v, extra, 0);
+			}
+		}
+		update_sge(ss, len);
+		length -= len;
+	}
+	/* Update address before sending packet. */
+	update_sge(ss, length);
+	if (flush_wc) {
+		/* must flush early everything before trigger word */
+		qib_flush_wc();
+		__raw_writel(last, piobuf);
+		/* be sure trigger word is written */
+		qib_flush_wc();
+	} else
+		__raw_writel(last, piobuf);
+}
+
+static noinline struct qib_verbs_txreq *__get_txreq(struct qib_ibdev *dev,
+					   struct qib_qp *qp)
+{
+	struct qib_verbs_txreq *tx;
+	unsigned long flags;
+
+	spin_lock_irqsave(&qp->s_lock, flags);
+	spin_lock(&dev->pending_lock);
+
+	if (!list_empty(&dev->txreq_free)) {
+		struct list_head *l = dev->txreq_free.next;
+
+		list_del(l);
+		spin_unlock(&dev->pending_lock);
+		spin_unlock_irqrestore(&qp->s_lock, flags);
+		tx = list_entry(l, struct qib_verbs_txreq, txreq.list);
+	} else {
+		if (ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK &&
+		    list_empty(&qp->iowait)) {
+			dev->n_txwait++;
+			qp->s_flags |= QIB_S_WAIT_TX;
+			list_add_tail(&qp->iowait, &dev->txwait);
+		}
+		qp->s_flags &= ~QIB_S_BUSY;
+		spin_unlock(&dev->pending_lock);
+		spin_unlock_irqrestore(&qp->s_lock, flags);
+		tx = ERR_PTR(-EBUSY);
+	}
+	return tx;
+}
+
+static inline struct qib_verbs_txreq *get_txreq(struct qib_ibdev *dev,
+					 struct qib_qp *qp)
+{
+	struct qib_verbs_txreq *tx;
+	unsigned long flags;
+
+	spin_lock_irqsave(&dev->pending_lock, flags);
+	/* assume the list non empty */
+	if (likely(!list_empty(&dev->txreq_free))) {
+		struct list_head *l = dev->txreq_free.next;
+
+		list_del(l);
+		spin_unlock_irqrestore(&dev->pending_lock, flags);
+		tx = list_entry(l, struct qib_verbs_txreq, txreq.list);
+	} else {
+		/* call slow path to get the extra lock */
+		spin_unlock_irqrestore(&dev->pending_lock, flags);
+		tx =  __get_txreq(dev, qp);
+	}
+	return tx;
+}
+
+void qib_put_txreq(struct qib_verbs_txreq *tx)
+{
+	struct qib_ibdev *dev;
+	struct qib_qp *qp;
+	unsigned long flags;
+
+	qp = tx->qp;
+	dev = to_idev(qp->ibqp.device);
+
+	if (atomic_dec_and_test(&qp->refcount))
+		wake_up(&qp->wait);
+	if (tx->mr) {
+		qib_put_mr(tx->mr);
+		tx->mr = NULL;
+	}
+	if (tx->txreq.flags & QIB_SDMA_TXREQ_F_FREEBUF) {
+		tx->txreq.flags &= ~QIB_SDMA_TXREQ_F_FREEBUF;
+		dma_unmap_single(&dd_from_dev(dev)->pcidev->dev,
+				 tx->txreq.addr, tx->hdr_dwords << 2,
+				 DMA_TO_DEVICE);
+		kfree(tx->align_buf);
+	}
+
+	spin_lock_irqsave(&dev->pending_lock, flags);
+
+	/* Put struct back on free list */
+	list_add(&tx->txreq.list, &dev->txreq_free);
+
+	if (!list_empty(&dev->txwait)) {
+		/* Wake up first QP wanting a free struct */
+		qp = list_entry(dev->txwait.next, struct qib_qp, iowait);
+		list_del_init(&qp->iowait);
+		atomic_inc(&qp->refcount);
+		spin_unlock_irqrestore(&dev->pending_lock, flags);
+
+		spin_lock_irqsave(&qp->s_lock, flags);
+		if (qp->s_flags & QIB_S_WAIT_TX) {
+			qp->s_flags &= ~QIB_S_WAIT_TX;
+			qib_schedule_send(qp);
+		}
+		spin_unlock_irqrestore(&qp->s_lock, flags);
+
+		if (atomic_dec_and_test(&qp->refcount))
+			wake_up(&qp->wait);
+	} else
+		spin_unlock_irqrestore(&dev->pending_lock, flags);
+}
+
+/*
+ * This is called when there are send DMA descriptors that might be
+ * available.
+ *
+ * This is called with ppd->sdma_lock held.
+ */
+void qib_verbs_sdma_desc_avail(struct qib_pportdata *ppd, unsigned avail)
+{
+	struct qib_qp *qp, *nqp;
+	struct qib_qp *qps[20];
+	struct qib_ibdev *dev;
+	unsigned i, n;
+
+	n = 0;
+	dev = &ppd->dd->verbs_dev;
+	spin_lock(&dev->pending_lock);
+
+	/* Search wait list for first QP wanting DMA descriptors. */
+	list_for_each_entry_safe(qp, nqp, &dev->dmawait, iowait) {
+		if (qp->port_num != ppd->port)
+			continue;
+		if (n == ARRAY_SIZE(qps))
+			break;
+		if (qp->s_tx->txreq.sg_count > avail)
+			break;
+		avail -= qp->s_tx->txreq.sg_count;
+		list_del_init(&qp->iowait);
+		atomic_inc(&qp->refcount);
+		qps[n++] = qp;
+	}
+
+	spin_unlock(&dev->pending_lock);
+
+	for (i = 0; i < n; i++) {
+		qp = qps[i];
+		spin_lock(&qp->s_lock);
+		if (qp->s_flags & QIB_S_WAIT_DMA_DESC) {
+			qp->s_flags &= ~QIB_S_WAIT_DMA_DESC;
+			qib_schedule_send(qp);
+		}
+		spin_unlock(&qp->s_lock);
+		if (atomic_dec_and_test(&qp->refcount))
+			wake_up(&qp->wait);
+	}
+}
+
+/*
+ * This is called with ppd->sdma_lock held.
+ */
+static void sdma_complete(struct qib_sdma_txreq *cookie, int status)
+{
+	struct qib_verbs_txreq *tx =
+		container_of(cookie, struct qib_verbs_txreq, txreq);
+	struct qib_qp *qp = tx->qp;
+
+	spin_lock(&qp->s_lock);
+	if (tx->wqe)
+		qib_send_complete(qp, tx->wqe, IB_WC_SUCCESS);
+	else if (qp->ibqp.qp_type == IB_QPT_RC) {
+		struct qib_ib_header *hdr;
+
+		if (tx->txreq.flags & QIB_SDMA_TXREQ_F_FREEBUF)
+			hdr = &tx->align_buf->hdr;
+		else {
+			struct qib_ibdev *dev = to_idev(qp->ibqp.device);
+
+			hdr = &dev->pio_hdrs[tx->hdr_inx].hdr;
+		}
+		qib_rc_send_complete(qp, hdr);
+	}
+	if (atomic_dec_and_test(&qp->s_dma_busy)) {
+		if (qp->state == IB_QPS_RESET)
+			wake_up(&qp->wait_dma);
+		else if (qp->s_flags & QIB_S_WAIT_DMA) {
+			qp->s_flags &= ~QIB_S_WAIT_DMA;
+			qib_schedule_send(qp);
+		}
+	}
+	spin_unlock(&qp->s_lock);
+
+	qib_put_txreq(tx);
+}
+
+static int wait_kmem(struct qib_ibdev *dev, struct qib_qp *qp)
+{
+	unsigned long flags;
+	int ret = 0;
+
+	spin_lock_irqsave(&qp->s_lock, flags);
+	if (ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK) {
+		spin_lock(&dev->pending_lock);
+		if (list_empty(&qp->iowait)) {
+			if (list_empty(&dev->memwait))
+				mod_timer(&dev->mem_timer, jiffies + 1);
+			qp->s_flags |= QIB_S_WAIT_KMEM;
+			list_add_tail(&qp->iowait, &dev->memwait);
+		}
+		spin_unlock(&dev->pending_lock);
+		qp->s_flags &= ~QIB_S_BUSY;
+		ret = -EBUSY;
+	}
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+
+	return ret;
+}
+
+static int qib_verbs_send_dma(struct qib_qp *qp, struct qib_ib_header *hdr,
+			      u32 hdrwords, struct qib_sge_state *ss, u32 len,
+			      u32 plen, u32 dwords)
+{
+	struct qib_ibdev *dev = to_idev(qp->ibqp.device);
+	struct qib_devdata *dd = dd_from_dev(dev);
+	struct qib_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+	struct qib_pportdata *ppd = ppd_from_ibp(ibp);
+	struct qib_verbs_txreq *tx;
+	struct qib_pio_header *phdr;
+	u32 control;
+	u32 ndesc;
+	int ret;
+
+	tx = qp->s_tx;
+	if (tx) {
+		qp->s_tx = NULL;
+		/* resend previously constructed packet */
+		ret = qib_sdma_verbs_send(ppd, tx->ss, tx->dwords, tx);
+		goto bail;
+	}
+
+	tx = get_txreq(dev, qp);
+	if (IS_ERR(tx))
+		goto bail_tx;
+
+	control = dd->f_setpbc_control(ppd, plen, qp->s_srate,
+				       be16_to_cpu(hdr->lrh[0]) >> 12);
+	tx->qp = qp;
+	atomic_inc(&qp->refcount);
+	tx->wqe = qp->s_wqe;
+	tx->mr = qp->s_rdma_mr;
+	if (qp->s_rdma_mr)
+		qp->s_rdma_mr = NULL;
+	tx->txreq.callback = sdma_complete;
+	if (dd->flags & QIB_HAS_SDMA_TIMEOUT)
+		tx->txreq.flags = QIB_SDMA_TXREQ_F_HEADTOHOST;
+	else
+		tx->txreq.flags = QIB_SDMA_TXREQ_F_INTREQ;
+	if (plen + 1 > dd->piosize2kmax_dwords)
+		tx->txreq.flags |= QIB_SDMA_TXREQ_F_USELARGEBUF;
+
+	if (len) {
+		/*
+		 * Don't try to DMA if it takes more descriptors than
+		 * the queue holds.
+		 */
+		ndesc = qib_count_sge(ss, len);
+		if (ndesc >= ppd->sdma_descq_cnt)
+			ndesc = 0;
+	} else
+		ndesc = 1;
+	if (ndesc) {
+		phdr = &dev->pio_hdrs[tx->hdr_inx];
+		phdr->pbc[0] = cpu_to_le32(plen);
+		phdr->pbc[1] = cpu_to_le32(control);
+		memcpy(&phdr->hdr, hdr, hdrwords << 2);
+		tx->txreq.flags |= QIB_SDMA_TXREQ_F_FREEDESC;
+		tx->txreq.sg_count = ndesc;
+		tx->txreq.addr = dev->pio_hdrs_phys +
+			tx->hdr_inx * sizeof(struct qib_pio_header);
+		tx->hdr_dwords = hdrwords + 2; /* add PBC length */
+		ret = qib_sdma_verbs_send(ppd, ss, dwords, tx);
+		goto bail;
+	}
+
+	/* Allocate a buffer and copy the header and payload to it. */
+	tx->hdr_dwords = plen + 1;
+	phdr = kmalloc(tx->hdr_dwords << 2, GFP_ATOMIC);
+	if (!phdr)
+		goto err_tx;
+	phdr->pbc[0] = cpu_to_le32(plen);
+	phdr->pbc[1] = cpu_to_le32(control);
+	memcpy(&phdr->hdr, hdr, hdrwords << 2);
+	qib_copy_from_sge((u32 *) &phdr->hdr + hdrwords, ss, len);
+
+	tx->txreq.addr = dma_map_single(&dd->pcidev->dev, phdr,
+					tx->hdr_dwords << 2, DMA_TO_DEVICE);
+	if (dma_mapping_error(&dd->pcidev->dev, tx->txreq.addr))
+		goto map_err;
+	tx->align_buf = phdr;
+	tx->txreq.flags |= QIB_SDMA_TXREQ_F_FREEBUF;
+	tx->txreq.sg_count = 1;
+	ret = qib_sdma_verbs_send(ppd, NULL, 0, tx);
+	goto unaligned;
+
+map_err:
+	kfree(phdr);
+err_tx:
+	qib_put_txreq(tx);
+	ret = wait_kmem(dev, qp);
+unaligned:
+	ibp->n_unaligned++;
+bail:
+	return ret;
+bail_tx:
+	ret = PTR_ERR(tx);
+	goto bail;
+}
+
+/*
+ * If we are now in the error state, return zero to flush the
+ * send work request.
+ */
+static int no_bufs_available(struct qib_qp *qp)
+{
+	struct qib_ibdev *dev = to_idev(qp->ibqp.device);
+	struct qib_devdata *dd;
+	unsigned long flags;
+	int ret = 0;
+
+	/*
+	 * Note that as soon as want_buffer() is called and
+	 * possibly before it returns, qib_ib_piobufavail()
+	 * could be called. Therefore, put QP on the I/O wait list before
+	 * enabling the PIO avail interrupt.
+	 */
+	spin_lock_irqsave(&qp->s_lock, flags);
+	if (ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK) {
+		spin_lock(&dev->pending_lock);
+		if (list_empty(&qp->iowait)) {
+			dev->n_piowait++;
+			qp->s_flags |= QIB_S_WAIT_PIO;
+			list_add_tail(&qp->iowait, &dev->piowait);
+			dd = dd_from_dev(dev);
+			dd->f_wantpiobuf_intr(dd, 1);
+		}
+		spin_unlock(&dev->pending_lock);
+		qp->s_flags &= ~QIB_S_BUSY;
+		ret = -EBUSY;
+	}
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+	return ret;
+}
+
+static int qib_verbs_send_pio(struct qib_qp *qp, struct qib_ib_header *ibhdr,
+			      u32 hdrwords, struct qib_sge_state *ss, u32 len,
+			      u32 plen, u32 dwords)
+{
+	struct qib_devdata *dd = dd_from_ibdev(qp->ibqp.device);
+	struct qib_pportdata *ppd = dd->pport + qp->port_num - 1;
+	u32 *hdr = (u32 *) ibhdr;
+	u32 __iomem *piobuf_orig;
+	u32 __iomem *piobuf;
+	u64 pbc;
+	unsigned long flags;
+	unsigned flush_wc;
+	u32 control;
+	u32 pbufn;
+
+	control = dd->f_setpbc_control(ppd, plen, qp->s_srate,
+		be16_to_cpu(ibhdr->lrh[0]) >> 12);
+	pbc = ((u64) control << 32) | plen;
+	piobuf = dd->f_getsendbuf(ppd, pbc, &pbufn);
+	if (unlikely(piobuf == NULL))
+		return no_bufs_available(qp);
+
+	/*
+	 * Write the pbc.
+	 * We have to flush after the PBC for correctness on some cpus
+	 * or WC buffer can be written out of order.
+	 */
+	writeq(pbc, piobuf);
+	piobuf_orig = piobuf;
+	piobuf += 2;
+
+	flush_wc = dd->flags & QIB_PIO_FLUSH_WC;
+	if (len == 0) {
+		/*
+		 * If there is just the header portion, must flush before
+		 * writing last word of header for correctness, and after
+		 * the last header word (trigger word).
+		 */
+		if (flush_wc) {
+			qib_flush_wc();
+			qib_pio_copy(piobuf, hdr, hdrwords - 1);
+			qib_flush_wc();
+			__raw_writel(hdr[hdrwords - 1], piobuf + hdrwords - 1);
+			qib_flush_wc();
+		} else
+			qib_pio_copy(piobuf, hdr, hdrwords);
+		goto done;
+	}
+
+	if (flush_wc)
+		qib_flush_wc();
+	qib_pio_copy(piobuf, hdr, hdrwords);
+	piobuf += hdrwords;
+
+	/* The common case is aligned and contained in one segment. */
+	if (likely(ss->num_sge == 1 && len <= ss->sge.length &&
+		   !((unsigned long)ss->sge.vaddr & (sizeof(u32) - 1)))) {
+		u32 *addr = (u32 *) ss->sge.vaddr;
+
+		/* Update address before sending packet. */
+		update_sge(ss, len);
+		if (flush_wc) {
+			qib_pio_copy(piobuf, addr, dwords - 1);
+			/* must flush early everything before trigger word */
+			qib_flush_wc();
+			__raw_writel(addr[dwords - 1], piobuf + dwords - 1);
+			/* be sure trigger word is written */
+			qib_flush_wc();
+		} else
+			qib_pio_copy(piobuf, addr, dwords);
+		goto done;
+	}
+	copy_io(piobuf, ss, len, flush_wc);
+done:
+	if (dd->flags & QIB_USE_SPCL_TRIG) {
+		u32 spcl_off = (pbufn >= dd->piobcnt2k) ? 2047 : 1023;
+		qib_flush_wc();
+		__raw_writel(0xaebecede, piobuf_orig + spcl_off);
+	}
+	qib_sendbuf_done(dd, pbufn);
+	if (qp->s_rdma_mr) {
+		qib_put_mr(qp->s_rdma_mr);
+		qp->s_rdma_mr = NULL;
+	}
+	if (qp->s_wqe) {
+		spin_lock_irqsave(&qp->s_lock, flags);
+		qib_send_complete(qp, qp->s_wqe, IB_WC_SUCCESS);
+		spin_unlock_irqrestore(&qp->s_lock, flags);
+	} else if (qp->ibqp.qp_type == IB_QPT_RC) {
+		spin_lock_irqsave(&qp->s_lock, flags);
+		qib_rc_send_complete(qp, ibhdr);
+		spin_unlock_irqrestore(&qp->s_lock, flags);
+	}
+	return 0;
+}
+
+/**
+ * qib_verbs_send - send a packet
+ * @qp: the QP to send on
+ * @hdr: the packet header
+ * @hdrwords: the number of 32-bit words in the header
+ * @ss: the SGE to send
+ * @len: the length of the packet in bytes
+ *
+ * Return zero if packet is sent or queued OK.
+ * Return non-zero and clear qp->s_flags QIB_S_BUSY otherwise.
+ */
+int qib_verbs_send(struct qib_qp *qp, struct qib_ib_header *hdr,
+		   u32 hdrwords, struct qib_sge_state *ss, u32 len)
+{
+	struct qib_devdata *dd = dd_from_ibdev(qp->ibqp.device);
+	u32 plen;
+	int ret;
+	u32 dwords = (len + 3) >> 2;
+
+	/*
+	 * Calculate the send buffer trigger address.
+	 * The +1 counts for the pbc control dword following the pbc length.
+	 */
+	plen = hdrwords + dwords + 1;
+
+	/*
+	 * VL15 packets (IB_QPT_SMI) will always use PIO, so we
+	 * can defer SDMA restart until link goes ACTIVE without
+	 * worrying about just how we got there.
+	 */
+	if (qp->ibqp.qp_type == IB_QPT_SMI ||
+	    !(dd->flags & QIB_HAS_SEND_DMA))
+		ret = qib_verbs_send_pio(qp, hdr, hdrwords, ss, len,
+					 plen, dwords);
+	else
+		ret = qib_verbs_send_dma(qp, hdr, hdrwords, ss, len,
+					 plen, dwords);
+
+	return ret;
+}
+
+int qib_snapshot_counters(struct qib_pportdata *ppd, u64 *swords,
+			  u64 *rwords, u64 *spkts, u64 *rpkts,
+			  u64 *xmit_wait)
+{
+	int ret;
+	struct qib_devdata *dd = ppd->dd;
+
+	if (!(dd->flags & QIB_PRESENT)) {
+		/* no hardware, freeze, etc. */
+		ret = -EINVAL;
+		goto bail;
+	}
+	*swords = dd->f_portcntr(ppd, QIBPORTCNTR_WORDSEND);
+	*rwords = dd->f_portcntr(ppd, QIBPORTCNTR_WORDRCV);
+	*spkts = dd->f_portcntr(ppd, QIBPORTCNTR_PKTSEND);
+	*rpkts = dd->f_portcntr(ppd, QIBPORTCNTR_PKTRCV);
+	*xmit_wait = dd->f_portcntr(ppd, QIBPORTCNTR_SENDSTALL);
+
+	ret = 0;
+
+bail:
+	return ret;
+}
+
+/**
+ * qib_get_counters - get various chip counters
+ * @dd: the qlogic_ib device
+ * @cntrs: counters are placed here
+ *
+ * Return the counters needed by recv_pma_get_portcounters().
+ */
+int qib_get_counters(struct qib_pportdata *ppd,
+		     struct qib_verbs_counters *cntrs)
+{
+	int ret;
+
+	if (!(ppd->dd->flags & QIB_PRESENT)) {
+		/* no hardware, freeze, etc. */
+		ret = -EINVAL;
+		goto bail;
+	}
+	cntrs->symbol_error_counter =
+		ppd->dd->f_portcntr(ppd, QIBPORTCNTR_IBSYMBOLERR);
+	cntrs->link_error_recovery_counter =
+		ppd->dd->f_portcntr(ppd, QIBPORTCNTR_IBLINKERRRECOV);
+	/*
+	 * The link downed counter counts when the other side downs the
+	 * connection.  We add in the number of times we downed the link
+	 * due to local link integrity errors to compensate.
+	 */
+	cntrs->link_downed_counter =
+		ppd->dd->f_portcntr(ppd, QIBPORTCNTR_IBLINKDOWN);
+	cntrs->port_rcv_errors =
+		ppd->dd->f_portcntr(ppd, QIBPORTCNTR_RXDROPPKT) +
+		ppd->dd->f_portcntr(ppd, QIBPORTCNTR_RCVOVFL) +
+		ppd->dd->f_portcntr(ppd, QIBPORTCNTR_ERR_RLEN) +
+		ppd->dd->f_portcntr(ppd, QIBPORTCNTR_INVALIDRLEN) +
+		ppd->dd->f_portcntr(ppd, QIBPORTCNTR_ERRLINK) +
+		ppd->dd->f_portcntr(ppd, QIBPORTCNTR_ERRICRC) +
+		ppd->dd->f_portcntr(ppd, QIBPORTCNTR_ERRVCRC) +
+		ppd->dd->f_portcntr(ppd, QIBPORTCNTR_ERRLPCRC) +
+		ppd->dd->f_portcntr(ppd, QIBPORTCNTR_BADFORMAT);
+	cntrs->port_rcv_errors +=
+		ppd->dd->f_portcntr(ppd, QIBPORTCNTR_RXLOCALPHYERR);
+	cntrs->port_rcv_errors +=
+		ppd->dd->f_portcntr(ppd, QIBPORTCNTR_RXVLERR);
+	cntrs->port_rcv_remphys_errors =
+		ppd->dd->f_portcntr(ppd, QIBPORTCNTR_RCVEBP);
+	cntrs->port_xmit_discards =
+		ppd->dd->f_portcntr(ppd, QIBPORTCNTR_UNSUPVL);
+	cntrs->port_xmit_data = ppd->dd->f_portcntr(ppd,
+			QIBPORTCNTR_WORDSEND);
+	cntrs->port_rcv_data = ppd->dd->f_portcntr(ppd,
+			QIBPORTCNTR_WORDRCV);
+	cntrs->port_xmit_packets = ppd->dd->f_portcntr(ppd,
+			QIBPORTCNTR_PKTSEND);
+	cntrs->port_rcv_packets = ppd->dd->f_portcntr(ppd,
+			QIBPORTCNTR_PKTRCV);
+	cntrs->local_link_integrity_errors =
+		ppd->dd->f_portcntr(ppd, QIBPORTCNTR_LLI);
+	cntrs->excessive_buffer_overrun_errors =
+		ppd->dd->f_portcntr(ppd, QIBPORTCNTR_EXCESSBUFOVFL);
+	cntrs->vl15_dropped =
+		ppd->dd->f_portcntr(ppd, QIBPORTCNTR_VL15PKTDROP);
+
+	ret = 0;
+
+bail:
+	return ret;
+}
+
+/**
+ * qib_ib_piobufavail - callback when a PIO buffer is available
+ * @dd: the device pointer
+ *
+ * This is called from qib_intr() at interrupt level when a PIO buffer is
+ * available after qib_verbs_send() returned an error that no buffers were
+ * available. Disable the interrupt if there are no more QPs waiting.
+ */
+void qib_ib_piobufavail(struct qib_devdata *dd)
+{
+	struct qib_ibdev *dev = &dd->verbs_dev;
+	struct list_head *list;
+	struct qib_qp *qps[5];
+	struct qib_qp *qp;
+	unsigned long flags;
+	unsigned i, n;
+
+	list = &dev->piowait;
+	n = 0;
+
+	/*
+	 * Note: checking that the piowait list is empty and clearing
+	 * the buffer available interrupt needs to be atomic or we
+	 * could end up with QPs on the wait list with the interrupt
+	 * disabled.
+	 */
+	spin_lock_irqsave(&dev->pending_lock, flags);
+	while (!list_empty(list)) {
+		if (n == ARRAY_SIZE(qps))
+			goto full;
+		qp = list_entry(list->next, struct qib_qp, iowait);
+		list_del_init(&qp->iowait);
+		atomic_inc(&qp->refcount);
+		qps[n++] = qp;
+	}
+	dd->f_wantpiobuf_intr(dd, 0);
+full:
+	spin_unlock_irqrestore(&dev->pending_lock, flags);
+
+	for (i = 0; i < n; i++) {
+		qp = qps[i];
+
+		spin_lock_irqsave(&qp->s_lock, flags);
+		if (qp->s_flags & QIB_S_WAIT_PIO) {
+			qp->s_flags &= ~QIB_S_WAIT_PIO;
+			qib_schedule_send(qp);
+		}
+		spin_unlock_irqrestore(&qp->s_lock, flags);
+
+		/* Notify qib_destroy_qp() if it is waiting. */
+		if (atomic_dec_and_test(&qp->refcount))
+			wake_up(&qp->wait);
+	}
+}
+
+static int qib_query_device(struct ib_device *ibdev,
+			    struct ib_device_attr *props)
+{
+	struct qib_devdata *dd = dd_from_ibdev(ibdev);
+	struct qib_ibdev *dev = to_idev(ibdev);
+
+	memset(props, 0, sizeof(*props));
+
+	props->device_cap_flags = IB_DEVICE_BAD_PKEY_CNTR |
+		IB_DEVICE_BAD_QKEY_CNTR | IB_DEVICE_SHUTDOWN_PORT |
+		IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_RC_RNR_NAK_GEN |
+		IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SRQ_RESIZE;
+	props->page_size_cap = PAGE_SIZE;
+	props->vendor_id =
+		QIB_SRC_OUI_1 << 16 | QIB_SRC_OUI_2 << 8 | QIB_SRC_OUI_3;
+	props->vendor_part_id = dd->deviceid;
+	props->hw_ver = dd->minrev;
+	props->sys_image_guid = ib_qib_sys_image_guid;
+	props->max_mr_size = ~0ULL;
+	props->max_qp = ib_qib_max_qps;
+	props->max_qp_wr = ib_qib_max_qp_wrs;
+	props->max_sge = ib_qib_max_sges;
+	props->max_cq = ib_qib_max_cqs;
+	props->max_ah = ib_qib_max_ahs;
+	props->max_cqe = ib_qib_max_cqes;
+	props->max_mr = dev->lk_table.max;
+	props->max_fmr = dev->lk_table.max;
+	props->max_map_per_fmr = 32767;
+	props->max_pd = ib_qib_max_pds;
+	props->max_qp_rd_atom = QIB_MAX_RDMA_ATOMIC;
+	props->max_qp_init_rd_atom = 255;
+	/* props->max_res_rd_atom */
+	props->max_srq = ib_qib_max_srqs;
+	props->max_srq_wr = ib_qib_max_srq_wrs;
+	props->max_srq_sge = ib_qib_max_srq_sges;
+	/* props->local_ca_ack_delay */
+	props->atomic_cap = IB_ATOMIC_GLOB;
+	props->max_pkeys = qib_get_npkeys(dd);
+	props->max_mcast_grp = ib_qib_max_mcast_grps;
+	props->max_mcast_qp_attach = ib_qib_max_mcast_qp_attached;
+	props->max_total_mcast_qp_attach = props->max_mcast_qp_attach *
+		props->max_mcast_grp;
+
+	return 0;
+}
+
+static int qib_query_port(struct ib_device *ibdev, u8 port,
+			  struct ib_port_attr *props)
+{
+	struct qib_devdata *dd = dd_from_ibdev(ibdev);
+	struct qib_ibport *ibp = to_iport(ibdev, port);
+	struct qib_pportdata *ppd = ppd_from_ibp(ibp);
+	enum ib_mtu mtu;
+	u16 lid = ppd->lid;
+
+	memset(props, 0, sizeof(*props));
+	props->lid = lid ? lid : be16_to_cpu(IB_LID_PERMISSIVE);
+	props->lmc = ppd->lmc;
+	props->sm_lid = ibp->sm_lid;
+	props->sm_sl = ibp->sm_sl;
+	props->state = dd->f_iblink_state(ppd->lastibcstat);
+	props->phys_state = dd->f_ibphys_portstate(ppd->lastibcstat);
+	props->port_cap_flags = ibp->port_cap_flags;
+	props->gid_tbl_len = QIB_GUIDS_PER_PORT;
+	props->max_msg_sz = 0x80000000;
+	props->pkey_tbl_len = qib_get_npkeys(dd);
+	props->bad_pkey_cntr = ibp->pkey_violations;
+	props->qkey_viol_cntr = ibp->qkey_violations;
+	props->active_width = ppd->link_width_active;
+	/* See rate_show() */
+	props->active_speed = ppd->link_speed_active;
+	props->max_vl_num = qib_num_vls(ppd->vls_supported);
+	props->init_type_reply = 0;
+
+	props->max_mtu = qib_ibmtu ? qib_ibmtu : IB_MTU_4096;
+	switch (ppd->ibmtu) {
+	case 4096:
+		mtu = IB_MTU_4096;
+		break;
+	case 2048:
+		mtu = IB_MTU_2048;
+		break;
+	case 1024:
+		mtu = IB_MTU_1024;
+		break;
+	case 512:
+		mtu = IB_MTU_512;
+		break;
+	case 256:
+		mtu = IB_MTU_256;
+		break;
+	default:
+		mtu = IB_MTU_2048;
+	}
+	props->active_mtu = mtu;
+	props->subnet_timeout = ibp->subnet_timeout;
+
+	return 0;
+}
+
+static int qib_modify_device(struct ib_device *device,
+			     int device_modify_mask,
+			     struct ib_device_modify *device_modify)
+{
+	struct qib_devdata *dd = dd_from_ibdev(device);
+	unsigned i;
+	int ret;
+
+	if (device_modify_mask & ~(IB_DEVICE_MODIFY_SYS_IMAGE_GUID |
+				   IB_DEVICE_MODIFY_NODE_DESC)) {
+		ret = -EOPNOTSUPP;
+		goto bail;
+	}
+
+	if (device_modify_mask & IB_DEVICE_MODIFY_NODE_DESC) {
+		memcpy(device->node_desc, device_modify->node_desc, 64);
+		for (i = 0; i < dd->num_pports; i++) {
+			struct qib_ibport *ibp = &dd->pport[i].ibport_data;
+
+			qib_node_desc_chg(ibp);
+		}
+	}
+
+	if (device_modify_mask & IB_DEVICE_MODIFY_SYS_IMAGE_GUID) {
+		ib_qib_sys_image_guid =
+			cpu_to_be64(device_modify->sys_image_guid);
+		for (i = 0; i < dd->num_pports; i++) {
+			struct qib_ibport *ibp = &dd->pport[i].ibport_data;
+
+			qib_sys_guid_chg(ibp);
+		}
+	}
+
+	ret = 0;
+
+bail:
+	return ret;
+}
+
+static int qib_modify_port(struct ib_device *ibdev, u8 port,
+			   int port_modify_mask, struct ib_port_modify *props)
+{
+	struct qib_ibport *ibp = to_iport(ibdev, port);
+	struct qib_pportdata *ppd = ppd_from_ibp(ibp);
+
+	ibp->port_cap_flags |= props->set_port_cap_mask;
+	ibp->port_cap_flags &= ~props->clr_port_cap_mask;
+	if (props->set_port_cap_mask || props->clr_port_cap_mask)
+		qib_cap_mask_chg(ibp);
+	if (port_modify_mask & IB_PORT_SHUTDOWN)
+		qib_set_linkstate(ppd, QIB_IB_LINKDOWN);
+	if (port_modify_mask & IB_PORT_RESET_QKEY_CNTR)
+		ibp->qkey_violations = 0;
+	return 0;
+}
+
+static int qib_query_gid(struct ib_device *ibdev, u8 port,
+			 int index, union ib_gid *gid)
+{
+	struct qib_devdata *dd = dd_from_ibdev(ibdev);
+	int ret = 0;
+
+	if (!port || port > dd->num_pports)
+		ret = -EINVAL;
+	else {
+		struct qib_ibport *ibp = to_iport(ibdev, port);
+		struct qib_pportdata *ppd = ppd_from_ibp(ibp);
+
+		gid->global.subnet_prefix = ibp->gid_prefix;
+		if (index == 0)
+			gid->global.interface_id = ppd->guid;
+		else if (index < QIB_GUIDS_PER_PORT)
+			gid->global.interface_id = ibp->guids[index - 1];
+		else
+			ret = -EINVAL;
+	}
+
+	return ret;
+}
+
+static struct ib_pd *qib_alloc_pd(struct ib_device *ibdev,
+				  struct ib_ucontext *context,
+				  struct ib_udata *udata)
+{
+	struct qib_ibdev *dev = to_idev(ibdev);
+	struct qib_pd *pd;
+	struct ib_pd *ret;
+
+	/*
+	 * This is actually totally arbitrary.  Some correctness tests
+	 * assume there's a maximum number of PDs that can be allocated.
+	 * We don't actually have this limit, but we fail the test if
+	 * we allow allocations of more than we report for this value.
+	 */
+
+	pd = kmalloc(sizeof *pd, GFP_KERNEL);
+	if (!pd) {
+		ret = ERR_PTR(-ENOMEM);
+		goto bail;
+	}
+
+	spin_lock(&dev->n_pds_lock);
+	if (dev->n_pds_allocated == ib_qib_max_pds) {
+		spin_unlock(&dev->n_pds_lock);
+		kfree(pd);
+		ret = ERR_PTR(-ENOMEM);
+		goto bail;
+	}
+
+	dev->n_pds_allocated++;
+	spin_unlock(&dev->n_pds_lock);
+
+	/* ib_alloc_pd() will initialize pd->ibpd. */
+	pd->user = udata != NULL;
+
+	ret = &pd->ibpd;
+
+bail:
+	return ret;
+}
+
+static int qib_dealloc_pd(struct ib_pd *ibpd)
+{
+	struct qib_pd *pd = to_ipd(ibpd);
+	struct qib_ibdev *dev = to_idev(ibpd->device);
+
+	spin_lock(&dev->n_pds_lock);
+	dev->n_pds_allocated--;
+	spin_unlock(&dev->n_pds_lock);
+
+	kfree(pd);
+
+	return 0;
+}
+
+int qib_check_ah(struct ib_device *ibdev, struct ib_ah_attr *ah_attr)
+{
+	/* A multicast address requires a GRH (see ch. 8.4.1). */
+	if (ah_attr->dlid >= QIB_MULTICAST_LID_BASE &&
+	    ah_attr->dlid != QIB_PERMISSIVE_LID &&
+	    !(ah_attr->ah_flags & IB_AH_GRH))
+		goto bail;
+	if ((ah_attr->ah_flags & IB_AH_GRH) &&
+	    ah_attr->grh.sgid_index >= QIB_GUIDS_PER_PORT)
+		goto bail;
+	if (ah_attr->dlid == 0)
+		goto bail;
+	if (ah_attr->port_num < 1 ||
+	    ah_attr->port_num > ibdev->phys_port_cnt)
+		goto bail;
+	if (ah_attr->static_rate != IB_RATE_PORT_CURRENT &&
+	    ib_rate_to_mult(ah_attr->static_rate) < 0)
+		goto bail;
+	if (ah_attr->sl > 15)
+		goto bail;
+	return 0;
+bail:
+	return -EINVAL;
+}
+
+/**
+ * qib_create_ah - create an address handle
+ * @pd: the protection domain
+ * @ah_attr: the attributes of the AH
+ *
+ * This may be called from interrupt context.
+ */
+static struct ib_ah *qib_create_ah(struct ib_pd *pd,
+				   struct ib_ah_attr *ah_attr)
+{
+	struct qib_ah *ah;
+	struct ib_ah *ret;
+	struct qib_ibdev *dev = to_idev(pd->device);
+	unsigned long flags;
+
+	if (qib_check_ah(pd->device, ah_attr)) {
+		ret = ERR_PTR(-EINVAL);
+		goto bail;
+	}
+
+	ah = kmalloc(sizeof *ah, GFP_ATOMIC);
+	if (!ah) {
+		ret = ERR_PTR(-ENOMEM);
+		goto bail;
+	}
+
+	spin_lock_irqsave(&dev->n_ahs_lock, flags);
+	if (dev->n_ahs_allocated == ib_qib_max_ahs) {
+		spin_unlock_irqrestore(&dev->n_ahs_lock, flags);
+		kfree(ah);
+		ret = ERR_PTR(-ENOMEM);
+		goto bail;
+	}
+
+	dev->n_ahs_allocated++;
+	spin_unlock_irqrestore(&dev->n_ahs_lock, flags);
+
+	/* ib_create_ah() will initialize ah->ibah. */
+	ah->attr = *ah_attr;
+	atomic_set(&ah->refcount, 0);
+
+	ret = &ah->ibah;
+
+bail:
+	return ret;
+}
+
+struct ib_ah *qib_create_qp0_ah(struct qib_ibport *ibp, u16 dlid)
+{
+	struct ib_ah_attr attr;
+	struct ib_ah *ah = ERR_PTR(-EINVAL);
+	struct qib_qp *qp0;
+
+	memset(&attr, 0, sizeof attr);
+	attr.dlid = dlid;
+	attr.port_num = ppd_from_ibp(ibp)->port;
+	rcu_read_lock();
+	qp0 = rcu_dereference(ibp->qp0);
+	if (qp0)
+		ah = ib_create_ah(qp0->ibqp.pd, &attr);
+	rcu_read_unlock();
+	return ah;
+}
+
+/**
+ * qib_destroy_ah - destroy an address handle
+ * @ibah: the AH to destroy
+ *
+ * This may be called from interrupt context.
+ */
+static int qib_destroy_ah(struct ib_ah *ibah)
+{
+	struct qib_ibdev *dev = to_idev(ibah->device);
+	struct qib_ah *ah = to_iah(ibah);
+	unsigned long flags;
+
+	if (atomic_read(&ah->refcount) != 0)
+		return -EBUSY;
+
+	spin_lock_irqsave(&dev->n_ahs_lock, flags);
+	dev->n_ahs_allocated--;
+	spin_unlock_irqrestore(&dev->n_ahs_lock, flags);
+
+	kfree(ah);
+
+	return 0;
+}
+
+static int qib_modify_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr)
+{
+	struct qib_ah *ah = to_iah(ibah);
+
+	if (qib_check_ah(ibah->device, ah_attr))
+		return -EINVAL;
+
+	ah->attr = *ah_attr;
+
+	return 0;
+}
+
+static int qib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr)
+{
+	struct qib_ah *ah = to_iah(ibah);
+
+	*ah_attr = ah->attr;
+
+	return 0;
+}
+
+/**
+ * qib_get_npkeys - return the size of the PKEY table for context 0
+ * @dd: the qlogic_ib device
+ */
+unsigned qib_get_npkeys(struct qib_devdata *dd)
+{
+	return ARRAY_SIZE(dd->rcd[0]->pkeys);
+}
+
+/*
+ * Return the indexed PKEY from the port PKEY table.
+ * No need to validate rcd[ctxt]; the port is setup if we are here.
+ */
+unsigned qib_get_pkey(struct qib_ibport *ibp, unsigned index)
+{
+	struct qib_pportdata *ppd = ppd_from_ibp(ibp);
+	struct qib_devdata *dd = ppd->dd;
+	unsigned ctxt = ppd->hw_pidx;
+	unsigned ret;
+
+	/* dd->rcd null if mini_init or some init failures */
+	if (!dd->rcd || index >= ARRAY_SIZE(dd->rcd[ctxt]->pkeys))
+		ret = 0;
+	else
+		ret = dd->rcd[ctxt]->pkeys[index];
+
+	return ret;
+}
+
+static int qib_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
+			  u16 *pkey)
+{
+	struct qib_devdata *dd = dd_from_ibdev(ibdev);
+	int ret;
+
+	if (index >= qib_get_npkeys(dd)) {
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	*pkey = qib_get_pkey(to_iport(ibdev, port), index);
+	ret = 0;
+
+bail:
+	return ret;
+}
+
+/**
+ * qib_alloc_ucontext - allocate a ucontest
+ * @ibdev: the infiniband device
+ * @udata: not used by the QLogic_IB driver
+ */
+
+static struct ib_ucontext *qib_alloc_ucontext(struct ib_device *ibdev,
+					      struct ib_udata *udata)
+{
+	struct qib_ucontext *context;
+	struct ib_ucontext *ret;
+
+	context = kmalloc(sizeof *context, GFP_KERNEL);
+	if (!context) {
+		ret = ERR_PTR(-ENOMEM);
+		goto bail;
+	}
+
+	ret = &context->ibucontext;
+
+bail:
+	return ret;
+}
+
+static int qib_dealloc_ucontext(struct ib_ucontext *context)
+{
+	kfree(to_iucontext(context));
+	return 0;
+}
+
+static void init_ibport(struct qib_pportdata *ppd)
+{
+	struct qib_verbs_counters cntrs;
+	struct qib_ibport *ibp = &ppd->ibport_data;
+
+	spin_lock_init(&ibp->lock);
+	/* Set the prefix to the default value (see ch. 4.1.1) */
+	ibp->gid_prefix = IB_DEFAULT_GID_PREFIX;
+	ibp->sm_lid = be16_to_cpu(IB_LID_PERMISSIVE);
+	ibp->port_cap_flags = IB_PORT_SYS_IMAGE_GUID_SUP |
+		IB_PORT_CLIENT_REG_SUP | IB_PORT_SL_MAP_SUP |
+		IB_PORT_TRAP_SUP | IB_PORT_AUTO_MIGR_SUP |
+		IB_PORT_DR_NOTICE_SUP | IB_PORT_CAP_MASK_NOTICE_SUP |
+		IB_PORT_OTHER_LOCAL_CHANGES_SUP;
+	if (ppd->dd->flags & QIB_HAS_LINK_LATENCY)
+		ibp->port_cap_flags |= IB_PORT_LINK_LATENCY_SUP;
+	ibp->pma_counter_select[0] = IB_PMA_PORT_XMIT_DATA;
+	ibp->pma_counter_select[1] = IB_PMA_PORT_RCV_DATA;
+	ibp->pma_counter_select[2] = IB_PMA_PORT_XMIT_PKTS;
+	ibp->pma_counter_select[3] = IB_PMA_PORT_RCV_PKTS;
+	ibp->pma_counter_select[4] = IB_PMA_PORT_XMIT_WAIT;
+
+	/* Snapshot current HW counters to "clear" them. */
+	qib_get_counters(ppd, &cntrs);
+	ibp->z_symbol_error_counter = cntrs.symbol_error_counter;
+	ibp->z_link_error_recovery_counter =
+		cntrs.link_error_recovery_counter;
+	ibp->z_link_downed_counter = cntrs.link_downed_counter;
+	ibp->z_port_rcv_errors = cntrs.port_rcv_errors;
+	ibp->z_port_rcv_remphys_errors = cntrs.port_rcv_remphys_errors;
+	ibp->z_port_xmit_discards = cntrs.port_xmit_discards;
+	ibp->z_port_xmit_data = cntrs.port_xmit_data;
+	ibp->z_port_rcv_data = cntrs.port_rcv_data;
+	ibp->z_port_xmit_packets = cntrs.port_xmit_packets;
+	ibp->z_port_rcv_packets = cntrs.port_rcv_packets;
+	ibp->z_local_link_integrity_errors =
+		cntrs.local_link_integrity_errors;
+	ibp->z_excessive_buffer_overrun_errors =
+		cntrs.excessive_buffer_overrun_errors;
+	ibp->z_vl15_dropped = cntrs.vl15_dropped;
+	RCU_INIT_POINTER(ibp->qp0, NULL);
+	RCU_INIT_POINTER(ibp->qp1, NULL);
+}
+
+/**
+ * qib_register_ib_device - register our device with the infiniband core
+ * @dd: the device data structure
+ * Return the allocated qib_ibdev pointer or NULL on error.
+ */
+int qib_register_ib_device(struct qib_devdata *dd)
+{
+	struct qib_ibdev *dev = &dd->verbs_dev;
+	struct ib_device *ibdev = &dev->ibdev;
+	struct qib_pportdata *ppd = dd->pport;
+	unsigned i, lk_tab_size;
+	int ret;
+
+	dev->qp_table_size = ib_qib_qp_table_size;
+	get_random_bytes(&dev->qp_rnd, sizeof(dev->qp_rnd));
+	dev->qp_table = kmalloc(dev->qp_table_size * sizeof *dev->qp_table,
+				GFP_KERNEL);
+	if (!dev->qp_table) {
+		ret = -ENOMEM;
+		goto err_qpt;
+	}
+	for (i = 0; i < dev->qp_table_size; i++)
+		RCU_INIT_POINTER(dev->qp_table[i], NULL);
+
+	for (i = 0; i < dd->num_pports; i++)
+		init_ibport(ppd + i);
+
+	/* Only need to initialize non-zero fields. */
+	spin_lock_init(&dev->qpt_lock);
+	spin_lock_init(&dev->n_pds_lock);
+	spin_lock_init(&dev->n_ahs_lock);
+	spin_lock_init(&dev->n_cqs_lock);
+	spin_lock_init(&dev->n_qps_lock);
+	spin_lock_init(&dev->n_srqs_lock);
+	spin_lock_init(&dev->n_mcast_grps_lock);
+	init_timer(&dev->mem_timer);
+	dev->mem_timer.function = mem_timer;
+	dev->mem_timer.data = (unsigned long) dev;
+
+	qib_init_qpn_table(dd, &dev->qpn_table);
+
+	/*
+	 * The top ib_qib_lkey_table_size bits are used to index the
+	 * table.  The lower 8 bits can be owned by the user (copied from
+	 * the LKEY).  The remaining bits act as a generation number or tag.
+	 */
+	spin_lock_init(&dev->lk_table.lock);
+	dev->lk_table.max = 1 << ib_qib_lkey_table_size;
+	lk_tab_size = dev->lk_table.max * sizeof(*dev->lk_table.table);
+	dev->lk_table.table = (struct qib_mregion __rcu **)
+		__get_free_pages(GFP_KERNEL, get_order(lk_tab_size));
+	if (dev->lk_table.table == NULL) {
+		ret = -ENOMEM;
+		goto err_lk;
+	}
+	RCU_INIT_POINTER(dev->dma_mr, NULL);
+	for (i = 0; i < dev->lk_table.max; i++)
+		RCU_INIT_POINTER(dev->lk_table.table[i], NULL);
+	INIT_LIST_HEAD(&dev->pending_mmaps);
+	spin_lock_init(&dev->pending_lock);
+	dev->mmap_offset = PAGE_SIZE;
+	spin_lock_init(&dev->mmap_offset_lock);
+	INIT_LIST_HEAD(&dev->piowait);
+	INIT_LIST_HEAD(&dev->dmawait);
+	INIT_LIST_HEAD(&dev->txwait);
+	INIT_LIST_HEAD(&dev->memwait);
+	INIT_LIST_HEAD(&dev->txreq_free);
+
+	if (ppd->sdma_descq_cnt) {
+		dev->pio_hdrs = dma_alloc_coherent(&dd->pcidev->dev,
+						ppd->sdma_descq_cnt *
+						sizeof(struct qib_pio_header),
+						&dev->pio_hdrs_phys,
+						GFP_KERNEL);
+		if (!dev->pio_hdrs) {
+			ret = -ENOMEM;
+			goto err_hdrs;
+		}
+	}
+
+	for (i = 0; i < ppd->sdma_descq_cnt; i++) {
+		struct qib_verbs_txreq *tx;
+
+		tx = kzalloc(sizeof *tx, GFP_KERNEL);
+		if (!tx) {
+			ret = -ENOMEM;
+			goto err_tx;
+		}
+		tx->hdr_inx = i;
+		list_add(&tx->txreq.list, &dev->txreq_free);
+	}
+
+	/*
+	 * The system image GUID is supposed to be the same for all
+	 * IB HCAs in a single system but since there can be other
+	 * device types in the system, we can't be sure this is unique.
+	 */
+	if (!ib_qib_sys_image_guid)
+		ib_qib_sys_image_guid = ppd->guid;
+
+	strlcpy(ibdev->name, "qib%d", IB_DEVICE_NAME_MAX);
+	ibdev->owner = THIS_MODULE;
+	ibdev->node_guid = ppd->guid;
+	ibdev->uverbs_abi_ver = QIB_UVERBS_ABI_VERSION;
+	ibdev->uverbs_cmd_mask =
+		(1ull << IB_USER_VERBS_CMD_GET_CONTEXT)         |
+		(1ull << IB_USER_VERBS_CMD_QUERY_DEVICE)        |
+		(1ull << IB_USER_VERBS_CMD_QUERY_PORT)          |
+		(1ull << IB_USER_VERBS_CMD_ALLOC_PD)            |
+		(1ull << IB_USER_VERBS_CMD_DEALLOC_PD)          |
+		(1ull << IB_USER_VERBS_CMD_CREATE_AH)           |
+		(1ull << IB_USER_VERBS_CMD_MODIFY_AH)           |
+		(1ull << IB_USER_VERBS_CMD_QUERY_AH)            |
+		(1ull << IB_USER_VERBS_CMD_DESTROY_AH)          |
+		(1ull << IB_USER_VERBS_CMD_REG_MR)              |
+		(1ull << IB_USER_VERBS_CMD_DEREG_MR)            |
+		(1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
+		(1ull << IB_USER_VERBS_CMD_CREATE_CQ)           |
+		(1ull << IB_USER_VERBS_CMD_RESIZE_CQ)           |
+		(1ull << IB_USER_VERBS_CMD_DESTROY_CQ)          |
+		(1ull << IB_USER_VERBS_CMD_POLL_CQ)             |
+		(1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ)       |
+		(1ull << IB_USER_VERBS_CMD_CREATE_QP)           |
+		(1ull << IB_USER_VERBS_CMD_QUERY_QP)            |
+		(1ull << IB_USER_VERBS_CMD_MODIFY_QP)           |
+		(1ull << IB_USER_VERBS_CMD_DESTROY_QP)          |
+		(1ull << IB_USER_VERBS_CMD_POST_SEND)           |
+		(1ull << IB_USER_VERBS_CMD_POST_RECV)           |
+		(1ull << IB_USER_VERBS_CMD_ATTACH_MCAST)        |
+		(1ull << IB_USER_VERBS_CMD_DETACH_MCAST)        |
+		(1ull << IB_USER_VERBS_CMD_CREATE_SRQ)          |
+		(1ull << IB_USER_VERBS_CMD_MODIFY_SRQ)          |
+		(1ull << IB_USER_VERBS_CMD_QUERY_SRQ)           |
+		(1ull << IB_USER_VERBS_CMD_DESTROY_SRQ)         |
+		(1ull << IB_USER_VERBS_CMD_POST_SRQ_RECV);
+	ibdev->node_type = RDMA_NODE_IB_CA;
+	ibdev->phys_port_cnt = dd->num_pports;
+	ibdev->num_comp_vectors = 1;
+	ibdev->dma_device = &dd->pcidev->dev;
+	ibdev->query_device = qib_query_device;
+	ibdev->modify_device = qib_modify_device;
+	ibdev->query_port = qib_query_port;
+	ibdev->modify_port = qib_modify_port;
+	ibdev->query_pkey = qib_query_pkey;
+	ibdev->query_gid = qib_query_gid;
+	ibdev->alloc_ucontext = qib_alloc_ucontext;
+	ibdev->dealloc_ucontext = qib_dealloc_ucontext;
+	ibdev->alloc_pd = qib_alloc_pd;
+	ibdev->dealloc_pd = qib_dealloc_pd;
+	ibdev->create_ah = qib_create_ah;
+	ibdev->destroy_ah = qib_destroy_ah;
+	ibdev->modify_ah = qib_modify_ah;
+	ibdev->query_ah = qib_query_ah;
+	ibdev->create_srq = qib_create_srq;
+	ibdev->modify_srq = qib_modify_srq;
+	ibdev->query_srq = qib_query_srq;
+	ibdev->destroy_srq = qib_destroy_srq;
+	ibdev->create_qp = qib_create_qp;
+	ibdev->modify_qp = qib_modify_qp;
+	ibdev->query_qp = qib_query_qp;
+	ibdev->destroy_qp = qib_destroy_qp;
+	ibdev->post_send = qib_post_send;
+	ibdev->post_recv = qib_post_receive;
+	ibdev->post_srq_recv = qib_post_srq_receive;
+	ibdev->create_cq = qib_create_cq;
+	ibdev->destroy_cq = qib_destroy_cq;
+	ibdev->resize_cq = qib_resize_cq;
+	ibdev->poll_cq = qib_poll_cq;
+	ibdev->req_notify_cq = qib_req_notify_cq;
+	ibdev->get_dma_mr = qib_get_dma_mr;
+	ibdev->reg_phys_mr = qib_reg_phys_mr;
+	ibdev->reg_user_mr = qib_reg_user_mr;
+	ibdev->dereg_mr = qib_dereg_mr;
+	ibdev->alloc_fast_reg_mr = qib_alloc_fast_reg_mr;
+	ibdev->alloc_fast_reg_page_list = qib_alloc_fast_reg_page_list;
+	ibdev->free_fast_reg_page_list = qib_free_fast_reg_page_list;
+	ibdev->alloc_fmr = qib_alloc_fmr;
+	ibdev->map_phys_fmr = qib_map_phys_fmr;
+	ibdev->unmap_fmr = qib_unmap_fmr;
+	ibdev->dealloc_fmr = qib_dealloc_fmr;
+	ibdev->attach_mcast = qib_multicast_attach;
+	ibdev->detach_mcast = qib_multicast_detach;
+	ibdev->process_mad = qib_process_mad;
+	ibdev->mmap = qib_mmap;
+	ibdev->dma_ops = &qib_dma_mapping_ops;
+
+	snprintf(ibdev->node_desc, sizeof(ibdev->node_desc),
+		 "Intel Infiniband HCA %s", init_utsname()->nodename);
+
+	ret = ib_register_device(ibdev, qib_create_port_files);
+	if (ret)
+		goto err_reg;
+
+	ret = qib_create_agents(dev);
+	if (ret)
+		goto err_agents;
+
+	ret = qib_verbs_register_sysfs(dd);
+	if (ret)
+		goto err_class;
+
+	goto bail;
+
+err_class:
+	qib_free_agents(dev);
+err_agents:
+	ib_unregister_device(ibdev);
+err_reg:
+err_tx:
+	while (!list_empty(&dev->txreq_free)) {
+		struct list_head *l = dev->txreq_free.next;
+		struct qib_verbs_txreq *tx;
+
+		list_del(l);
+		tx = list_entry(l, struct qib_verbs_txreq, txreq.list);
+		kfree(tx);
+	}
+	if (ppd->sdma_descq_cnt)
+		dma_free_coherent(&dd->pcidev->dev,
+				  ppd->sdma_descq_cnt *
+					sizeof(struct qib_pio_header),
+				  dev->pio_hdrs, dev->pio_hdrs_phys);
+err_hdrs:
+	free_pages((unsigned long) dev->lk_table.table, get_order(lk_tab_size));
+err_lk:
+	kfree(dev->qp_table);
+err_qpt:
+	qib_dev_err(dd, "cannot register verbs: %d!\n", -ret);
+bail:
+	return ret;
+}
+
+void qib_unregister_ib_device(struct qib_devdata *dd)
+{
+	struct qib_ibdev *dev = &dd->verbs_dev;
+	struct ib_device *ibdev = &dev->ibdev;
+	u32 qps_inuse;
+	unsigned lk_tab_size;
+
+	qib_verbs_unregister_sysfs(dd);
+
+	qib_free_agents(dev);
+
+	ib_unregister_device(ibdev);
+
+	if (!list_empty(&dev->piowait))
+		qib_dev_err(dd, "piowait list not empty!\n");
+	if (!list_empty(&dev->dmawait))
+		qib_dev_err(dd, "dmawait list not empty!\n");
+	if (!list_empty(&dev->txwait))
+		qib_dev_err(dd, "txwait list not empty!\n");
+	if (!list_empty(&dev->memwait))
+		qib_dev_err(dd, "memwait list not empty!\n");
+	if (dev->dma_mr)
+		qib_dev_err(dd, "DMA MR not NULL!\n");
+
+	qps_inuse = qib_free_all_qps(dd);
+	if (qps_inuse)
+		qib_dev_err(dd, "QP memory leak! %u still in use\n",
+			    qps_inuse);
+
+	del_timer_sync(&dev->mem_timer);
+	qib_free_qpn_table(&dev->qpn_table);
+	while (!list_empty(&dev->txreq_free)) {
+		struct list_head *l = dev->txreq_free.next;
+		struct qib_verbs_txreq *tx;
+
+		list_del(l);
+		tx = list_entry(l, struct qib_verbs_txreq, txreq.list);
+		kfree(tx);
+	}
+	if (dd->pport->sdma_descq_cnt)
+		dma_free_coherent(&dd->pcidev->dev,
+				  dd->pport->sdma_descq_cnt *
+					sizeof(struct qib_pio_header),
+				  dev->pio_hdrs, dev->pio_hdrs_phys);
+	lk_tab_size = dev->lk_table.max * sizeof(*dev->lk_table.table);
+	free_pages((unsigned long) dev->lk_table.table,
+		   get_order(lk_tab_size));
+	kfree(dev->qp_table);
+}
+
+/*
+ * This must be called with s_lock held.
+ */
+void qib_schedule_send(struct qib_qp *qp)
+{
+	if (qib_send_ok(qp)) {
+		struct qib_ibport *ibp =
+			to_iport(qp->ibqp.device, qp->port_num);
+		struct qib_pportdata *ppd = ppd_from_ibp(ibp);
+
+		queue_work(ppd->qib_wq, &qp->s_work);
+	}
+}
diff --git a/drivers/infiniband/hw/qib/qib_verbs.h b/drivers/infiniband/hw/qib/qib_verbs.h
new file mode 100644
index 0000000..bfc8948
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_verbs.h
@@ -0,0 +1,1173 @@
+/*
+ * Copyright (c) 2012, 2013 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef QIB_VERBS_H
+#define QIB_VERBS_H
+
+#include <linux/types.h>
+#include <linux/spinlock.h>
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/kref.h>
+#include <linux/workqueue.h>
+#include <linux/kthread.h>
+#include <linux/completion.h>
+#include <rdma/ib_pack.h>
+#include <rdma/ib_user_verbs.h>
+
+struct qib_ctxtdata;
+struct qib_pportdata;
+struct qib_devdata;
+struct qib_verbs_txreq;
+
+#define QIB_MAX_RDMA_ATOMIC     16
+#define QIB_GUIDS_PER_PORT	5
+
+#define QPN_MAX                 (1 << 24)
+#define QPNMAP_ENTRIES          (QPN_MAX / PAGE_SIZE / BITS_PER_BYTE)
+
+/*
+ * Increment this value if any changes that break userspace ABI
+ * compatibility are made.
+ */
+#define QIB_UVERBS_ABI_VERSION       2
+
+/*
+ * Define an ib_cq_notify value that is not valid so we know when CQ
+ * notifications are armed.
+ */
+#define IB_CQ_NONE      (IB_CQ_NEXT_COMP + 1)
+
+#define IB_SEQ_NAK	(3 << 29)
+
+/* AETH NAK opcode values */
+#define IB_RNR_NAK                      0x20
+#define IB_NAK_PSN_ERROR                0x60
+#define IB_NAK_INVALID_REQUEST          0x61
+#define IB_NAK_REMOTE_ACCESS_ERROR      0x62
+#define IB_NAK_REMOTE_OPERATIONAL_ERROR 0x63
+#define IB_NAK_INVALID_RD_REQUEST       0x64
+
+/* Flags for checking QP state (see ib_qib_state_ops[]) */
+#define QIB_POST_SEND_OK                0x01
+#define QIB_POST_RECV_OK                0x02
+#define QIB_PROCESS_RECV_OK             0x04
+#define QIB_PROCESS_SEND_OK             0x08
+#define QIB_PROCESS_NEXT_SEND_OK        0x10
+#define QIB_FLUSH_SEND			0x20
+#define QIB_FLUSH_RECV			0x40
+#define QIB_PROCESS_OR_FLUSH_SEND \
+	(QIB_PROCESS_SEND_OK | QIB_FLUSH_SEND)
+
+/* IB Performance Manager status values */
+#define IB_PMA_SAMPLE_STATUS_DONE       0x00
+#define IB_PMA_SAMPLE_STATUS_STARTED    0x01
+#define IB_PMA_SAMPLE_STATUS_RUNNING    0x02
+
+/* Mandatory IB performance counter select values. */
+#define IB_PMA_PORT_XMIT_DATA   cpu_to_be16(0x0001)
+#define IB_PMA_PORT_RCV_DATA    cpu_to_be16(0x0002)
+#define IB_PMA_PORT_XMIT_PKTS   cpu_to_be16(0x0003)
+#define IB_PMA_PORT_RCV_PKTS    cpu_to_be16(0x0004)
+#define IB_PMA_PORT_XMIT_WAIT   cpu_to_be16(0x0005)
+
+#define QIB_VENDOR_IPG		cpu_to_be16(0xFFA0)
+
+#define IB_BTH_REQ_ACK		(1 << 31)
+#define IB_BTH_SOLICITED	(1 << 23)
+#define IB_BTH_MIG_REQ		(1 << 22)
+
+/* XXX Should be defined in ib_verbs.h enum ib_port_cap_flags */
+#define IB_PORT_OTHER_LOCAL_CHANGES_SUP (1 << 26)
+
+#define IB_GRH_VERSION		6
+#define IB_GRH_VERSION_MASK	0xF
+#define IB_GRH_VERSION_SHIFT	28
+#define IB_GRH_TCLASS_MASK	0xFF
+#define IB_GRH_TCLASS_SHIFT	20
+#define IB_GRH_FLOW_MASK	0xFFFFF
+#define IB_GRH_FLOW_SHIFT	0
+#define IB_GRH_NEXT_HDR		0x1B
+
+#define IB_DEFAULT_GID_PREFIX	cpu_to_be64(0xfe80000000000000ULL)
+
+/* Values for set/get portinfo VLCap OperationalVLs */
+#define IB_VL_VL0       1
+#define IB_VL_VL0_1     2
+#define IB_VL_VL0_3     3
+#define IB_VL_VL0_7     4
+#define IB_VL_VL0_14    5
+
+static inline int qib_num_vls(int vls)
+{
+	switch (vls) {
+	default:
+	case IB_VL_VL0:
+		return 1;
+	case IB_VL_VL0_1:
+		return 2;
+	case IB_VL_VL0_3:
+		return 4;
+	case IB_VL_VL0_7:
+		return 8;
+	case IB_VL_VL0_14:
+		return 15;
+	}
+}
+
+struct ib_reth {
+	__be64 vaddr;
+	__be32 rkey;
+	__be32 length;
+} __packed;
+
+struct ib_atomic_eth {
+	__be32 vaddr[2];        /* unaligned so access as 2 32-bit words */
+	__be32 rkey;
+	__be64 swap_data;
+	__be64 compare_data;
+} __packed;
+
+struct qib_other_headers {
+	__be32 bth[3];
+	union {
+		struct {
+			__be32 deth[2];
+			__be32 imm_data;
+		} ud;
+		struct {
+			struct ib_reth reth;
+			__be32 imm_data;
+		} rc;
+		struct {
+			__be32 aeth;
+			__be32 atomic_ack_eth[2];
+		} at;
+		__be32 imm_data;
+		__be32 aeth;
+		struct ib_atomic_eth atomic_eth;
+	} u;
+} __packed;
+
+/*
+ * Note that UD packets with a GRH header are 8+40+12+8 = 68 bytes
+ * long (72 w/ imm_data).  Only the first 56 bytes of the IB header
+ * will be in the eager header buffer.  The remaining 12 or 16 bytes
+ * are in the data buffer.
+ */
+struct qib_ib_header {
+	__be16 lrh[4];
+	union {
+		struct {
+			struct ib_grh grh;
+			struct qib_other_headers oth;
+		} l;
+		struct qib_other_headers oth;
+	} u;
+} __packed;
+
+struct qib_pio_header {
+	__le32 pbc[2];
+	struct qib_ib_header hdr;
+} __packed;
+
+/*
+ * There is one struct qib_mcast for each multicast GID.
+ * All attached QPs are then stored as a list of
+ * struct qib_mcast_qp.
+ */
+struct qib_mcast_qp {
+	struct list_head list;
+	struct qib_qp *qp;
+};
+
+struct qib_mcast {
+	struct rb_node rb_node;
+	union ib_gid mgid;
+	struct list_head qp_list;
+	wait_queue_head_t wait;
+	atomic_t refcount;
+	int n_attached;
+};
+
+/* Protection domain */
+struct qib_pd {
+	struct ib_pd ibpd;
+	int user;               /* non-zero if created from user space */
+};
+
+/* Address Handle */
+struct qib_ah {
+	struct ib_ah ibah;
+	struct ib_ah_attr attr;
+	atomic_t refcount;
+};
+
+/*
+ * This structure is used by qib_mmap() to validate an offset
+ * when an mmap() request is made.  The vm_area_struct then uses
+ * this as its vm_private_data.
+ */
+struct qib_mmap_info {
+	struct list_head pending_mmaps;
+	struct ib_ucontext *context;
+	void *obj;
+	__u64 offset;
+	struct kref ref;
+	unsigned size;
+};
+
+/*
+ * This structure is used to contain the head pointer, tail pointer,
+ * and completion queue entries as a single memory allocation so
+ * it can be mmap'ed into user space.
+ */
+struct qib_cq_wc {
+	u32 head;               /* index of next entry to fill */
+	u32 tail;               /* index of next ib_poll_cq() entry */
+	union {
+		/* these are actually size ibcq.cqe + 1 */
+		struct ib_uverbs_wc uqueue[0];
+		struct ib_wc kqueue[0];
+	};
+};
+
+/*
+ * The completion queue structure.
+ */
+struct qib_cq {
+	struct ib_cq ibcq;
+	struct kthread_work comptask;
+	struct qib_devdata *dd;
+	spinlock_t lock; /* protect changes in this struct */
+	u8 notify;
+	u8 triggered;
+	struct qib_cq_wc *queue;
+	struct qib_mmap_info *ip;
+};
+
+/*
+ * A segment is a linear region of low physical memory.
+ * XXX Maybe we should use phys addr here and kmap()/kunmap().
+ * Used by the verbs layer.
+ */
+struct qib_seg {
+	void *vaddr;
+	size_t length;
+};
+
+/* The number of qib_segs that fit in a page. */
+#define QIB_SEGSZ     (PAGE_SIZE / sizeof(struct qib_seg))
+
+struct qib_segarray {
+	struct qib_seg segs[QIB_SEGSZ];
+};
+
+struct qib_mregion {
+	struct ib_pd *pd;       /* shares refcnt of ibmr.pd */
+	u64 user_base;          /* User's address for this region */
+	u64 iova;               /* IB start address of this region */
+	size_t length;
+	u32 lkey;
+	u32 offset;             /* offset (bytes) to start of region */
+	int access_flags;
+	u32 max_segs;           /* number of qib_segs in all the arrays */
+	u32 mapsz;              /* size of the map array */
+	u8  page_shift;         /* 0 - non unform/non powerof2 sizes */
+	u8  lkey_published;     /* in global table */
+	struct completion comp; /* complete when refcount goes to zero */
+	struct rcu_head list;
+	atomic_t refcount;
+	struct qib_segarray *map[0];    /* the segments */
+};
+
+/*
+ * These keep track of the copy progress within a memory region.
+ * Used by the verbs layer.
+ */
+struct qib_sge {
+	struct qib_mregion *mr;
+	void *vaddr;            /* kernel virtual address of segment */
+	u32 sge_length;         /* length of the SGE */
+	u32 length;             /* remaining length of the segment */
+	u16 m;                  /* current index: mr->map[m] */
+	u16 n;                  /* current index: mr->map[m]->segs[n] */
+};
+
+/* Memory region */
+struct qib_mr {
+	struct ib_mr ibmr;
+	struct ib_umem *umem;
+	struct qib_mregion mr;  /* must be last */
+};
+
+/*
+ * Send work request queue entry.
+ * The size of the sg_list is determined when the QP is created and stored
+ * in qp->s_max_sge.
+ */
+struct qib_swqe {
+	struct ib_send_wr wr;   /* don't use wr.sg_list */
+	u32 psn;                /* first packet sequence number */
+	u32 lpsn;               /* last packet sequence number */
+	u32 ssn;                /* send sequence number */
+	u32 length;             /* total length of data in sg_list */
+	struct qib_sge sg_list[0];
+};
+
+/*
+ * Receive work request queue entry.
+ * The size of the sg_list is determined when the QP (or SRQ) is created
+ * and stored in qp->r_rq.max_sge (or srq->rq.max_sge).
+ */
+struct qib_rwqe {
+	u64 wr_id;
+	u8 num_sge;
+	struct ib_sge sg_list[0];
+};
+
+/*
+ * This structure is used to contain the head pointer, tail pointer,
+ * and receive work queue entries as a single memory allocation so
+ * it can be mmap'ed into user space.
+ * Note that the wq array elements are variable size so you can't
+ * just index into the array to get the N'th element;
+ * use get_rwqe_ptr() instead.
+ */
+struct qib_rwq {
+	u32 head;               /* new work requests posted to the head */
+	u32 tail;               /* receives pull requests from here. */
+	struct qib_rwqe wq[0];
+};
+
+struct qib_rq {
+	struct qib_rwq *wq;
+	u32 size;               /* size of RWQE array */
+	u8 max_sge;
+	spinlock_t lock /* protect changes in this struct */
+		____cacheline_aligned_in_smp;
+};
+
+struct qib_srq {
+	struct ib_srq ibsrq;
+	struct qib_rq rq;
+	struct qib_mmap_info *ip;
+	/* send signal when number of RWQEs < limit */
+	u32 limit;
+};
+
+struct qib_sge_state {
+	struct qib_sge *sg_list;      /* next SGE to be used if any */
+	struct qib_sge sge;   /* progress state for the current SGE */
+	u32 total_len;
+	u8 num_sge;
+};
+
+/*
+ * This structure holds the information that the send tasklet needs
+ * to send a RDMA read response or atomic operation.
+ */
+struct qib_ack_entry {
+	u8 opcode;
+	u8 sent;
+	u32 psn;
+	u32 lpsn;
+	union {
+		struct qib_sge rdma_sge;
+		u64 atomic_data;
+	};
+};
+
+/*
+ * Variables prefixed with s_ are for the requester (sender).
+ * Variables prefixed with r_ are for the responder (receiver).
+ * Variables prefixed with ack_ are for responder replies.
+ *
+ * Common variables are protected by both r_rq.lock and s_lock in that order
+ * which only happens in modify_qp() or changing the QP 'state'.
+ */
+struct qib_qp {
+	struct ib_qp ibqp;
+	/* read mostly fields above and below */
+	struct ib_ah_attr remote_ah_attr;
+	struct ib_ah_attr alt_ah_attr;
+	struct qib_qp __rcu *next;            /* link list for QPN hash table */
+	struct qib_swqe *s_wq;  /* send work queue */
+	struct qib_mmap_info *ip;
+	struct qib_ib_header *s_hdr;     /* next packet header to send */
+	unsigned long timeout_jiffies;  /* computed from timeout */
+
+	enum ib_mtu path_mtu;
+	u32 remote_qpn;
+	u32 pmtu;		/* decoded from path_mtu */
+	u32 qkey;               /* QKEY for this QP (for UD or RD) */
+	u32 s_size;             /* send work queue size */
+	u32 s_rnr_timeout;      /* number of milliseconds for RNR timeout */
+
+	u8 state;               /* QP state */
+	u8 qp_access_flags;
+	u8 alt_timeout;         /* Alternate path timeout for this QP */
+	u8 timeout;             /* Timeout for this QP */
+	u8 s_srate;
+	u8 s_mig_state;
+	u8 port_num;
+	u8 s_pkey_index;        /* PKEY index to use */
+	u8 s_alt_pkey_index;    /* Alternate path PKEY index to use */
+	u8 r_max_rd_atomic;     /* max number of RDMA read/atomic to receive */
+	u8 s_max_rd_atomic;     /* max number of RDMA read/atomic to send */
+	u8 s_retry_cnt;         /* number of times to retry */
+	u8 s_rnr_retry_cnt;
+	u8 r_min_rnr_timer;     /* retry timeout value for RNR NAKs */
+	u8 s_max_sge;           /* size of s_wq->sg_list */
+	u8 s_draining;
+
+	/* start of read/write fields */
+
+	atomic_t refcount ____cacheline_aligned_in_smp;
+	wait_queue_head_t wait;
+
+
+	struct qib_ack_entry s_ack_queue[QIB_MAX_RDMA_ATOMIC + 1]
+		____cacheline_aligned_in_smp;
+	struct qib_sge_state s_rdma_read_sge;
+
+	spinlock_t r_lock ____cacheline_aligned_in_smp;      /* used for APM */
+	unsigned long r_aflags;
+	u64 r_wr_id;            /* ID for current receive WQE */
+	u32 r_ack_psn;          /* PSN for next ACK or atomic ACK */
+	u32 r_len;              /* total length of r_sge */
+	u32 r_rcv_len;          /* receive data len processed */
+	u32 r_psn;              /* expected rcv packet sequence number */
+	u32 r_msn;              /* message sequence number */
+
+	u8 r_state;             /* opcode of last packet received */
+	u8 r_flags;
+	u8 r_head_ack_queue;    /* index into s_ack_queue[] */
+
+	struct list_head rspwait;       /* link for waititing to respond */
+
+	struct qib_sge_state r_sge;     /* current receive data */
+	struct qib_rq r_rq;             /* receive work queue */
+
+	spinlock_t s_lock ____cacheline_aligned_in_smp;
+	struct qib_sge_state *s_cur_sge;
+	u32 s_flags;
+	struct qib_verbs_txreq *s_tx;
+	struct qib_swqe *s_wqe;
+	struct qib_sge_state s_sge;     /* current send request data */
+	struct qib_mregion *s_rdma_mr;
+	atomic_t s_dma_busy;
+	u32 s_cur_size;         /* size of send packet in bytes */
+	u32 s_len;              /* total length of s_sge */
+	u32 s_rdma_read_len;    /* total length of s_rdma_read_sge */
+	u32 s_next_psn;         /* PSN for next request */
+	u32 s_last_psn;         /* last response PSN processed */
+	u32 s_sending_psn;      /* lowest PSN that is being sent */
+	u32 s_sending_hpsn;     /* highest PSN that is being sent */
+	u32 s_psn;              /* current packet sequence number */
+	u32 s_ack_rdma_psn;     /* PSN for sending RDMA read responses */
+	u32 s_ack_psn;          /* PSN for acking sends and RDMA writes */
+	u32 s_head;             /* new entries added here */
+	u32 s_tail;             /* next entry to process */
+	u32 s_cur;              /* current work queue entry */
+	u32 s_acked;            /* last un-ACK'ed entry */
+	u32 s_last;             /* last completed entry */
+	u32 s_ssn;              /* SSN of tail entry */
+	u32 s_lsn;              /* limit sequence number (credit) */
+	u16 s_hdrwords;         /* size of s_hdr in 32 bit words */
+	u16 s_rdma_ack_cnt;
+	u8 s_state;             /* opcode of last packet sent */
+	u8 s_ack_state;         /* opcode of packet to ACK */
+	u8 s_nak_state;         /* non-zero if NAK is pending */
+	u8 r_nak_state;         /* non-zero if NAK is pending */
+	u8 s_retry;             /* requester retry counter */
+	u8 s_rnr_retry;         /* requester RNR retry counter */
+	u8 s_num_rd_atomic;     /* number of RDMA read/atomic pending */
+	u8 s_tail_ack_queue;    /* index into s_ack_queue[] */
+
+	struct qib_sge_state s_ack_rdma_sge;
+	struct timer_list s_timer;
+	struct list_head iowait;        /* link for wait PIO buf */
+
+	struct work_struct s_work;
+
+	wait_queue_head_t wait_dma;
+
+	struct qib_sge r_sg_list[0] /* verified SGEs */
+		____cacheline_aligned_in_smp;
+};
+
+/*
+ * Atomic bit definitions for r_aflags.
+ */
+#define QIB_R_WRID_VALID        0
+#define QIB_R_REWIND_SGE        1
+
+/*
+ * Bit definitions for r_flags.
+ */
+#define QIB_R_REUSE_SGE 0x01
+#define QIB_R_RDMAR_SEQ 0x02
+#define QIB_R_RSP_NAK   0x04
+#define QIB_R_RSP_SEND  0x08
+#define QIB_R_COMM_EST  0x10
+
+/*
+ * Bit definitions for s_flags.
+ *
+ * QIB_S_SIGNAL_REQ_WR - set if QP send WRs contain completion signaled
+ * QIB_S_BUSY - send tasklet is processing the QP
+ * QIB_S_TIMER - the RC retry timer is active
+ * QIB_S_ACK_PENDING - an ACK is waiting to be sent after RDMA read/atomics
+ * QIB_S_WAIT_FENCE - waiting for all prior RDMA read or atomic SWQEs
+ *                         before processing the next SWQE
+ * QIB_S_WAIT_RDMAR - waiting for a RDMA read or atomic SWQE to complete
+ *                         before processing the next SWQE
+ * QIB_S_WAIT_RNR - waiting for RNR timeout
+ * QIB_S_WAIT_SSN_CREDIT - waiting for RC credits to process next SWQE
+ * QIB_S_WAIT_DMA - waiting for send DMA queue to drain before generating
+ *                  next send completion entry not via send DMA
+ * QIB_S_WAIT_PIO - waiting for a send buffer to be available
+ * QIB_S_WAIT_TX - waiting for a struct qib_verbs_txreq to be available
+ * QIB_S_WAIT_DMA_DESC - waiting for DMA descriptors to be available
+ * QIB_S_WAIT_KMEM - waiting for kernel memory to be available
+ * QIB_S_WAIT_PSN - waiting for a packet to exit the send DMA queue
+ * QIB_S_WAIT_ACK - waiting for an ACK packet before sending more requests
+ * QIB_S_SEND_ONE - send one packet, request ACK, then wait for ACK
+ */
+#define QIB_S_SIGNAL_REQ_WR	0x0001
+#define QIB_S_BUSY		0x0002
+#define QIB_S_TIMER		0x0004
+#define QIB_S_RESP_PENDING	0x0008
+#define QIB_S_ACK_PENDING	0x0010
+#define QIB_S_WAIT_FENCE	0x0020
+#define QIB_S_WAIT_RDMAR	0x0040
+#define QIB_S_WAIT_RNR		0x0080
+#define QIB_S_WAIT_SSN_CREDIT	0x0100
+#define QIB_S_WAIT_DMA		0x0200
+#define QIB_S_WAIT_PIO		0x0400
+#define QIB_S_WAIT_TX		0x0800
+#define QIB_S_WAIT_DMA_DESC	0x1000
+#define QIB_S_WAIT_KMEM		0x2000
+#define QIB_S_WAIT_PSN		0x4000
+#define QIB_S_WAIT_ACK		0x8000
+#define QIB_S_SEND_ONE		0x10000
+#define QIB_S_UNLIMITED_CREDIT	0x20000
+
+/*
+ * Wait flags that would prevent any packet type from being sent.
+ */
+#define QIB_S_ANY_WAIT_IO (QIB_S_WAIT_PIO | QIB_S_WAIT_TX | \
+	QIB_S_WAIT_DMA_DESC | QIB_S_WAIT_KMEM)
+
+/*
+ * Wait flags that would prevent send work requests from making progress.
+ */
+#define QIB_S_ANY_WAIT_SEND (QIB_S_WAIT_FENCE | QIB_S_WAIT_RDMAR | \
+	QIB_S_WAIT_RNR | QIB_S_WAIT_SSN_CREDIT | QIB_S_WAIT_DMA | \
+	QIB_S_WAIT_PSN | QIB_S_WAIT_ACK)
+
+#define QIB_S_ANY_WAIT (QIB_S_ANY_WAIT_IO | QIB_S_ANY_WAIT_SEND)
+
+#define QIB_PSN_CREDIT  16
+
+/*
+ * Since struct qib_swqe is not a fixed size, we can't simply index into
+ * struct qib_qp.s_wq.  This function does the array index computation.
+ */
+static inline struct qib_swqe *get_swqe_ptr(struct qib_qp *qp,
+					      unsigned n)
+{
+	return (struct qib_swqe *)((char *)qp->s_wq +
+				     (sizeof(struct qib_swqe) +
+				      qp->s_max_sge *
+				      sizeof(struct qib_sge)) * n);
+}
+
+/*
+ * Since struct qib_rwqe is not a fixed size, we can't simply index into
+ * struct qib_rwq.wq.  This function does the array index computation.
+ */
+static inline struct qib_rwqe *get_rwqe_ptr(struct qib_rq *rq, unsigned n)
+{
+	return (struct qib_rwqe *)
+		((char *) rq->wq->wq +
+		 (sizeof(struct qib_rwqe) +
+		  rq->max_sge * sizeof(struct ib_sge)) * n);
+}
+
+/*
+ * QPN-map pages start out as NULL, they get allocated upon
+ * first use and are never deallocated. This way,
+ * large bitmaps are not allocated unless large numbers of QPs are used.
+ */
+struct qpn_map {
+	void *page;
+};
+
+struct qib_qpn_table {
+	spinlock_t lock; /* protect changes in this struct */
+	unsigned flags;         /* flags for QP0/1 allocated for each port */
+	u32 last;               /* last QP number allocated */
+	u32 nmaps;              /* size of the map table */
+	u16 limit;
+	u16 mask;
+	/* bit map of free QP numbers other than 0/1 */
+	struct qpn_map map[QPNMAP_ENTRIES];
+};
+
+struct qib_lkey_table {
+	spinlock_t lock; /* protect changes in this struct */
+	u32 next;               /* next unused index (speeds search) */
+	u32 gen;                /* generation count */
+	u32 max;                /* size of the table */
+	struct qib_mregion __rcu **table;
+};
+
+struct qib_opcode_stats {
+	u64 n_packets;          /* number of packets */
+	u64 n_bytes;            /* total number of bytes */
+};
+
+struct qib_opcode_stats_perctx {
+	struct qib_opcode_stats stats[128];
+};
+
+struct qib_pma_counters {
+	u64 n_unicast_xmit;     /* total unicast packets sent */
+	u64 n_unicast_rcv;      /* total unicast packets received */
+	u64 n_multicast_xmit;   /* total multicast packets sent */
+	u64 n_multicast_rcv;    /* total multicast packets received */
+};
+
+struct qib_ibport {
+	struct qib_qp __rcu *qp0;
+	struct qib_qp __rcu *qp1;
+	struct ib_mad_agent *send_agent;	/* agent for SMI (traps) */
+	struct qib_ah *sm_ah;
+	struct qib_ah *smi_ah;
+	struct rb_root mcast_tree;
+	spinlock_t lock;		/* protect changes in this struct */
+
+	/* non-zero when timer is set */
+	unsigned long mkey_lease_timeout;
+	unsigned long trap_timeout;
+	__be64 gid_prefix;      /* in network order */
+	__be64 mkey;
+	__be64 guids[QIB_GUIDS_PER_PORT	- 1];	/* writable GUIDs */
+	u64 tid;		/* TID for traps */
+	struct qib_pma_counters __percpu *pmastats;
+	u64 z_unicast_xmit;     /* starting count for PMA */
+	u64 z_unicast_rcv;      /* starting count for PMA */
+	u64 z_multicast_xmit;   /* starting count for PMA */
+	u64 z_multicast_rcv;    /* starting count for PMA */
+	u64 z_symbol_error_counter;             /* starting count for PMA */
+	u64 z_link_error_recovery_counter;      /* starting count for PMA */
+	u64 z_link_downed_counter;              /* starting count for PMA */
+	u64 z_port_rcv_errors;                  /* starting count for PMA */
+	u64 z_port_rcv_remphys_errors;          /* starting count for PMA */
+	u64 z_port_xmit_discards;               /* starting count for PMA */
+	u64 z_port_xmit_data;                   /* starting count for PMA */
+	u64 z_port_rcv_data;                    /* starting count for PMA */
+	u64 z_port_xmit_packets;                /* starting count for PMA */
+	u64 z_port_rcv_packets;                 /* starting count for PMA */
+	u32 z_local_link_integrity_errors;      /* starting count for PMA */
+	u32 z_excessive_buffer_overrun_errors;  /* starting count for PMA */
+	u32 z_vl15_dropped;                     /* starting count for PMA */
+	u32 n_rc_resends;
+	u32 n_rc_acks;
+	u32 n_rc_qacks;
+	u32 n_rc_delayed_comp;
+	u32 n_seq_naks;
+	u32 n_rdma_seq;
+	u32 n_rnr_naks;
+	u32 n_other_naks;
+	u32 n_loop_pkts;
+	u32 n_pkt_drops;
+	u32 n_vl15_dropped;
+	u32 n_rc_timeouts;
+	u32 n_dmawait;
+	u32 n_unaligned;
+	u32 n_rc_dupreq;
+	u32 n_rc_seqnak;
+	u32 port_cap_flags;
+	u32 pma_sample_start;
+	u32 pma_sample_interval;
+	__be16 pma_counter_select[5];
+	u16 pma_tag;
+	u16 pkey_violations;
+	u16 qkey_violations;
+	u16 mkey_violations;
+	u16 mkey_lease_period;
+	u16 sm_lid;
+	u16 repress_traps;
+	u8 sm_sl;
+	u8 mkeyprot;
+	u8 subnet_timeout;
+	u8 vl_high_limit;
+	u8 sl_to_vl[16];
+
+};
+
+
+struct qib_ibdev {
+	struct ib_device ibdev;
+	struct list_head pending_mmaps;
+	spinlock_t mmap_offset_lock; /* protect mmap_offset */
+	u32 mmap_offset;
+	struct qib_mregion __rcu *dma_mr;
+
+	/* QP numbers are shared by all IB ports */
+	struct qib_qpn_table qpn_table;
+	struct qib_lkey_table lk_table;
+	struct list_head piowait;       /* list for wait PIO buf */
+	struct list_head dmawait;	/* list for wait DMA */
+	struct list_head txwait;        /* list for wait qib_verbs_txreq */
+	struct list_head memwait;       /* list for wait kernel memory */
+	struct list_head txreq_free;
+	struct timer_list mem_timer;
+	struct qib_qp __rcu **qp_table;
+	struct qib_pio_header *pio_hdrs;
+	dma_addr_t pio_hdrs_phys;
+	/* list of QPs waiting for RNR timer */
+	spinlock_t pending_lock; /* protect wait lists, PMA counters, etc. */
+	u32 qp_table_size; /* size of the hash table */
+	u32 qp_rnd; /* random bytes for hash */
+	spinlock_t qpt_lock;
+
+	u32 n_piowait;
+	u32 n_txwait;
+
+	u32 n_pds_allocated;    /* number of PDs allocated for device */
+	spinlock_t n_pds_lock;
+	u32 n_ahs_allocated;    /* number of AHs allocated for device */
+	spinlock_t n_ahs_lock;
+	u32 n_cqs_allocated;    /* number of CQs allocated for device */
+	spinlock_t n_cqs_lock;
+	u32 n_qps_allocated;    /* number of QPs allocated for device */
+	spinlock_t n_qps_lock;
+	u32 n_srqs_allocated;   /* number of SRQs allocated for device */
+	spinlock_t n_srqs_lock;
+	u32 n_mcast_grps_allocated; /* number of mcast groups allocated */
+	spinlock_t n_mcast_grps_lock;
+#ifdef CONFIG_DEBUG_FS
+	/* per HCA debugfs */
+	struct dentry *qib_ibdev_dbg;
+#endif
+};
+
+struct qib_verbs_counters {
+	u64 symbol_error_counter;
+	u64 link_error_recovery_counter;
+	u64 link_downed_counter;
+	u64 port_rcv_errors;
+	u64 port_rcv_remphys_errors;
+	u64 port_xmit_discards;
+	u64 port_xmit_data;
+	u64 port_rcv_data;
+	u64 port_xmit_packets;
+	u64 port_rcv_packets;
+	u32 local_link_integrity_errors;
+	u32 excessive_buffer_overrun_errors;
+	u32 vl15_dropped;
+};
+
+static inline struct qib_mr *to_imr(struct ib_mr *ibmr)
+{
+	return container_of(ibmr, struct qib_mr, ibmr);
+}
+
+static inline struct qib_pd *to_ipd(struct ib_pd *ibpd)
+{
+	return container_of(ibpd, struct qib_pd, ibpd);
+}
+
+static inline struct qib_ah *to_iah(struct ib_ah *ibah)
+{
+	return container_of(ibah, struct qib_ah, ibah);
+}
+
+static inline struct qib_cq *to_icq(struct ib_cq *ibcq)
+{
+	return container_of(ibcq, struct qib_cq, ibcq);
+}
+
+static inline struct qib_srq *to_isrq(struct ib_srq *ibsrq)
+{
+	return container_of(ibsrq, struct qib_srq, ibsrq);
+}
+
+static inline struct qib_qp *to_iqp(struct ib_qp *ibqp)
+{
+	return container_of(ibqp, struct qib_qp, ibqp);
+}
+
+static inline struct qib_ibdev *to_idev(struct ib_device *ibdev)
+{
+	return container_of(ibdev, struct qib_ibdev, ibdev);
+}
+
+/*
+ * Send if not busy or waiting for I/O and either
+ * a RC response is pending or we can process send work requests.
+ */
+static inline int qib_send_ok(struct qib_qp *qp)
+{
+	return !(qp->s_flags & (QIB_S_BUSY | QIB_S_ANY_WAIT_IO)) &&
+		(qp->s_hdrwords || (qp->s_flags & QIB_S_RESP_PENDING) ||
+		 !(qp->s_flags & QIB_S_ANY_WAIT_SEND));
+}
+
+/*
+ * This must be called with s_lock held.
+ */
+void qib_schedule_send(struct qib_qp *qp);
+
+static inline int qib_pkey_ok(u16 pkey1, u16 pkey2)
+{
+	u16 p1 = pkey1 & 0x7FFF;
+	u16 p2 = pkey2 & 0x7FFF;
+
+	/*
+	 * Low 15 bits must be non-zero and match, and
+	 * one of the two must be a full member.
+	 */
+	return p1 && p1 == p2 && ((__s16)pkey1 < 0 || (__s16)pkey2 < 0);
+}
+
+void qib_bad_pqkey(struct qib_ibport *ibp, __be16 trap_num, u32 key, u32 sl,
+		   u32 qp1, u32 qp2, __be16 lid1, __be16 lid2);
+void qib_cap_mask_chg(struct qib_ibport *ibp);
+void qib_sys_guid_chg(struct qib_ibport *ibp);
+void qib_node_desc_chg(struct qib_ibport *ibp);
+int qib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
+		    struct ib_wc *in_wc, struct ib_grh *in_grh,
+		    struct ib_mad *in_mad, struct ib_mad *out_mad);
+int qib_create_agents(struct qib_ibdev *dev);
+void qib_free_agents(struct qib_ibdev *dev);
+
+/*
+ * Compare the lower 24 bits of the two values.
+ * Returns an integer <, ==, or > than zero.
+ */
+static inline int qib_cmp24(u32 a, u32 b)
+{
+	return (((int) a) - ((int) b)) << 8;
+}
+
+struct qib_mcast *qib_mcast_find(struct qib_ibport *ibp, union ib_gid *mgid);
+
+int qib_snapshot_counters(struct qib_pportdata *ppd, u64 *swords,
+			  u64 *rwords, u64 *spkts, u64 *rpkts,
+			  u64 *xmit_wait);
+
+int qib_get_counters(struct qib_pportdata *ppd,
+		     struct qib_verbs_counters *cntrs);
+
+int qib_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid);
+
+int qib_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid);
+
+int qib_mcast_tree_empty(struct qib_ibport *ibp);
+
+__be32 qib_compute_aeth(struct qib_qp *qp);
+
+struct qib_qp *qib_lookup_qpn(struct qib_ibport *ibp, u32 qpn);
+
+struct ib_qp *qib_create_qp(struct ib_pd *ibpd,
+			    struct ib_qp_init_attr *init_attr,
+			    struct ib_udata *udata);
+
+int qib_destroy_qp(struct ib_qp *ibqp);
+
+int qib_error_qp(struct qib_qp *qp, enum ib_wc_status err);
+
+int qib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+		  int attr_mask, struct ib_udata *udata);
+
+int qib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+		 int attr_mask, struct ib_qp_init_attr *init_attr);
+
+unsigned qib_free_all_qps(struct qib_devdata *dd);
+
+void qib_init_qpn_table(struct qib_devdata *dd, struct qib_qpn_table *qpt);
+
+void qib_free_qpn_table(struct qib_qpn_table *qpt);
+
+#ifdef CONFIG_DEBUG_FS
+
+struct qib_qp_iter;
+
+struct qib_qp_iter *qib_qp_iter_init(struct qib_ibdev *dev);
+
+int qib_qp_iter_next(struct qib_qp_iter *iter);
+
+void qib_qp_iter_print(struct seq_file *s, struct qib_qp_iter *iter);
+
+#endif
+
+void qib_get_credit(struct qib_qp *qp, u32 aeth);
+
+unsigned qib_pkt_delay(u32 plen, u8 snd_mult, u8 rcv_mult);
+
+void qib_verbs_sdma_desc_avail(struct qib_pportdata *ppd, unsigned avail);
+
+void qib_put_txreq(struct qib_verbs_txreq *tx);
+
+int qib_verbs_send(struct qib_qp *qp, struct qib_ib_header *hdr,
+		   u32 hdrwords, struct qib_sge_state *ss, u32 len);
+
+void qib_copy_sge(struct qib_sge_state *ss, void *data, u32 length,
+		  int release);
+
+void qib_skip_sge(struct qib_sge_state *ss, u32 length, int release);
+
+void qib_uc_rcv(struct qib_ibport *ibp, struct qib_ib_header *hdr,
+		int has_grh, void *data, u32 tlen, struct qib_qp *qp);
+
+void qib_rc_rcv(struct qib_ctxtdata *rcd, struct qib_ib_header *hdr,
+		int has_grh, void *data, u32 tlen, struct qib_qp *qp);
+
+int qib_check_ah(struct ib_device *ibdev, struct ib_ah_attr *ah_attr);
+
+struct ib_ah *qib_create_qp0_ah(struct qib_ibport *ibp, u16 dlid);
+
+void qib_rc_rnr_retry(unsigned long arg);
+
+void qib_rc_send_complete(struct qib_qp *qp, struct qib_ib_header *hdr);
+
+void qib_rc_error(struct qib_qp *qp, enum ib_wc_status err);
+
+int qib_post_ud_send(struct qib_qp *qp, struct ib_send_wr *wr);
+
+void qib_ud_rcv(struct qib_ibport *ibp, struct qib_ib_header *hdr,
+		int has_grh, void *data, u32 tlen, struct qib_qp *qp);
+
+int qib_alloc_lkey(struct qib_mregion *mr, int dma_region);
+
+void qib_free_lkey(struct qib_mregion *mr);
+
+int qib_lkey_ok(struct qib_lkey_table *rkt, struct qib_pd *pd,
+		struct qib_sge *isge, struct ib_sge *sge, int acc);
+
+int qib_rkey_ok(struct qib_qp *qp, struct qib_sge *sge,
+		u32 len, u64 vaddr, u32 rkey, int acc);
+
+int qib_post_srq_receive(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
+			 struct ib_recv_wr **bad_wr);
+
+struct ib_srq *qib_create_srq(struct ib_pd *ibpd,
+			      struct ib_srq_init_attr *srq_init_attr,
+			      struct ib_udata *udata);
+
+int qib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
+		   enum ib_srq_attr_mask attr_mask,
+		   struct ib_udata *udata);
+
+int qib_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr);
+
+int qib_destroy_srq(struct ib_srq *ibsrq);
+
+int qib_cq_init(struct qib_devdata *dd);
+
+void qib_cq_exit(struct qib_devdata *dd);
+
+void qib_cq_enter(struct qib_cq *cq, struct ib_wc *entry, int sig);
+
+int qib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry);
+
+struct ib_cq *qib_create_cq(struct ib_device *ibdev, int entries,
+			    int comp_vector, struct ib_ucontext *context,
+			    struct ib_udata *udata);
+
+int qib_destroy_cq(struct ib_cq *ibcq);
+
+int qib_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags);
+
+int qib_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata);
+
+struct ib_mr *qib_get_dma_mr(struct ib_pd *pd, int acc);
+
+struct ib_mr *qib_reg_phys_mr(struct ib_pd *pd,
+			      struct ib_phys_buf *buffer_list,
+			      int num_phys_buf, int acc, u64 *iova_start);
+
+struct ib_mr *qib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+			      u64 virt_addr, int mr_access_flags,
+			      struct ib_udata *udata);
+
+int qib_dereg_mr(struct ib_mr *ibmr);
+
+struct ib_mr *qib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len);
+
+struct ib_fast_reg_page_list *qib_alloc_fast_reg_page_list(
+				struct ib_device *ibdev, int page_list_len);
+
+void qib_free_fast_reg_page_list(struct ib_fast_reg_page_list *pl);
+
+int qib_fast_reg_mr(struct qib_qp *qp, struct ib_send_wr *wr);
+
+struct ib_fmr *qib_alloc_fmr(struct ib_pd *pd, int mr_access_flags,
+			     struct ib_fmr_attr *fmr_attr);
+
+int qib_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list,
+		     int list_len, u64 iova);
+
+int qib_unmap_fmr(struct list_head *fmr_list);
+
+int qib_dealloc_fmr(struct ib_fmr *ibfmr);
+
+static inline void qib_get_mr(struct qib_mregion *mr)
+{
+	atomic_inc(&mr->refcount);
+}
+
+void mr_rcu_callback(struct rcu_head *list);
+
+static inline void qib_put_mr(struct qib_mregion *mr)
+{
+	if (unlikely(atomic_dec_and_test(&mr->refcount)))
+		call_rcu(&mr->list, mr_rcu_callback);
+}
+
+static inline void qib_put_ss(struct qib_sge_state *ss)
+{
+	while (ss->num_sge) {
+		qib_put_mr(ss->sge.mr);
+		if (--ss->num_sge)
+			ss->sge = *ss->sg_list++;
+	}
+}
+
+
+void qib_release_mmap_info(struct kref *ref);
+
+struct qib_mmap_info *qib_create_mmap_info(struct qib_ibdev *dev, u32 size,
+					   struct ib_ucontext *context,
+					   void *obj);
+
+void qib_update_mmap_info(struct qib_ibdev *dev, struct qib_mmap_info *ip,
+			  u32 size, void *obj);
+
+int qib_mmap(struct ib_ucontext *context, struct vm_area_struct *vma);
+
+int qib_get_rwqe(struct qib_qp *qp, int wr_id_only);
+
+void qib_migrate_qp(struct qib_qp *qp);
+
+int qib_ruc_check_hdr(struct qib_ibport *ibp, struct qib_ib_header *hdr,
+		      int has_grh, struct qib_qp *qp, u32 bth0);
+
+u32 qib_make_grh(struct qib_ibport *ibp, struct ib_grh *hdr,
+		 struct ib_global_route *grh, u32 hwords, u32 nwords);
+
+void qib_make_ruc_header(struct qib_qp *qp, struct qib_other_headers *ohdr,
+			 u32 bth0, u32 bth2);
+
+void qib_do_send(struct work_struct *work);
+
+void qib_send_complete(struct qib_qp *qp, struct qib_swqe *wqe,
+		       enum ib_wc_status status);
+
+void qib_send_rc_ack(struct qib_qp *qp);
+
+int qib_make_rc_req(struct qib_qp *qp);
+
+int qib_make_uc_req(struct qib_qp *qp);
+
+int qib_make_ud_req(struct qib_qp *qp);
+
+int qib_register_ib_device(struct qib_devdata *);
+
+void qib_unregister_ib_device(struct qib_devdata *);
+
+void qib_ib_rcv(struct qib_ctxtdata *, void *, void *, u32);
+
+void qib_ib_piobufavail(struct qib_devdata *);
+
+unsigned qib_get_npkeys(struct qib_devdata *);
+
+unsigned qib_get_pkey(struct qib_ibport *, unsigned);
+
+extern const enum ib_wc_opcode ib_qib_wc_opcode[];
+
+/*
+ * Below  HCA-independent IB PhysPortState values, returned
+ * by the f_ibphys_portstate() routine.
+ */
+#define IB_PHYSPORTSTATE_SLEEP 1
+#define IB_PHYSPORTSTATE_POLL 2
+#define IB_PHYSPORTSTATE_DISABLED 3
+#define IB_PHYSPORTSTATE_CFG_TRAIN 4
+#define IB_PHYSPORTSTATE_LINKUP 5
+#define IB_PHYSPORTSTATE_LINK_ERR_RECOVER 6
+#define IB_PHYSPORTSTATE_CFG_DEBOUNCE 8
+#define IB_PHYSPORTSTATE_CFG_IDLE 0xB
+#define IB_PHYSPORTSTATE_RECOVERY_RETRAIN 0xC
+#define IB_PHYSPORTSTATE_RECOVERY_WAITRMT 0xE
+#define IB_PHYSPORTSTATE_RECOVERY_IDLE 0xF
+#define IB_PHYSPORTSTATE_CFG_ENH 0x10
+#define IB_PHYSPORTSTATE_CFG_WAIT_ENH 0x13
+
+extern const int ib_qib_state_ops[];
+
+extern __be64 ib_qib_sys_image_guid;    /* in network order */
+
+extern unsigned int ib_qib_lkey_table_size;
+
+extern unsigned int ib_qib_max_cqes;
+
+extern unsigned int ib_qib_max_cqs;
+
+extern unsigned int ib_qib_max_qp_wrs;
+
+extern unsigned int ib_qib_max_qps;
+
+extern unsigned int ib_qib_max_sges;
+
+extern unsigned int ib_qib_max_mcast_grps;
+
+extern unsigned int ib_qib_max_mcast_qp_attached;
+
+extern unsigned int ib_qib_max_srqs;
+
+extern unsigned int ib_qib_max_srq_sges;
+
+extern unsigned int ib_qib_max_srq_wrs;
+
+extern const u32 ib_qib_rnr_table[];
+
+extern struct ib_dma_mapping_ops qib_dma_mapping_ops;
+
+#endif                          /* QIB_VERBS_H */
diff --git a/drivers/infiniband/hw/qib/qib_verbs_mcast.c b/drivers/infiniband/hw/qib/qib_verbs_mcast.c
new file mode 100644
index 0000000..dabb697
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_verbs_mcast.c
@@ -0,0 +1,368 @@
+/*
+ * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/rculist.h>
+
+#include "qib.h"
+
+/**
+ * qib_mcast_qp_alloc - alloc a struct to link a QP to mcast GID struct
+ * @qp: the QP to link
+ */
+static struct qib_mcast_qp *qib_mcast_qp_alloc(struct qib_qp *qp)
+{
+	struct qib_mcast_qp *mqp;
+
+	mqp = kmalloc(sizeof *mqp, GFP_KERNEL);
+	if (!mqp)
+		goto bail;
+
+	mqp->qp = qp;
+	atomic_inc(&qp->refcount);
+
+bail:
+	return mqp;
+}
+
+static void qib_mcast_qp_free(struct qib_mcast_qp *mqp)
+{
+	struct qib_qp *qp = mqp->qp;
+
+	/* Notify qib_destroy_qp() if it is waiting. */
+	if (atomic_dec_and_test(&qp->refcount))
+		wake_up(&qp->wait);
+
+	kfree(mqp);
+}
+
+/**
+ * qib_mcast_alloc - allocate the multicast GID structure
+ * @mgid: the multicast GID
+ *
+ * A list of QPs will be attached to this structure.
+ */
+static struct qib_mcast *qib_mcast_alloc(union ib_gid *mgid)
+{
+	struct qib_mcast *mcast;
+
+	mcast = kmalloc(sizeof *mcast, GFP_KERNEL);
+	if (!mcast)
+		goto bail;
+
+	mcast->mgid = *mgid;
+	INIT_LIST_HEAD(&mcast->qp_list);
+	init_waitqueue_head(&mcast->wait);
+	atomic_set(&mcast->refcount, 0);
+	mcast->n_attached = 0;
+
+bail:
+	return mcast;
+}
+
+static void qib_mcast_free(struct qib_mcast *mcast)
+{
+	struct qib_mcast_qp *p, *tmp;
+
+	list_for_each_entry_safe(p, tmp, &mcast->qp_list, list)
+		qib_mcast_qp_free(p);
+
+	kfree(mcast);
+}
+
+/**
+ * qib_mcast_find - search the global table for the given multicast GID
+ * @ibp: the IB port structure
+ * @mgid: the multicast GID to search for
+ *
+ * Returns NULL if not found.
+ *
+ * The caller is responsible for decrementing the reference count if found.
+ */
+struct qib_mcast *qib_mcast_find(struct qib_ibport *ibp, union ib_gid *mgid)
+{
+	struct rb_node *n;
+	unsigned long flags;
+	struct qib_mcast *mcast;
+
+	spin_lock_irqsave(&ibp->lock, flags);
+	n = ibp->mcast_tree.rb_node;
+	while (n) {
+		int ret;
+
+		mcast = rb_entry(n, struct qib_mcast, rb_node);
+
+		ret = memcmp(mgid->raw, mcast->mgid.raw,
+			     sizeof(union ib_gid));
+		if (ret < 0)
+			n = n->rb_left;
+		else if (ret > 0)
+			n = n->rb_right;
+		else {
+			atomic_inc(&mcast->refcount);
+			spin_unlock_irqrestore(&ibp->lock, flags);
+			goto bail;
+		}
+	}
+	spin_unlock_irqrestore(&ibp->lock, flags);
+
+	mcast = NULL;
+
+bail:
+	return mcast;
+}
+
+/**
+ * qib_mcast_add - insert mcast GID into table and attach QP struct
+ * @mcast: the mcast GID table
+ * @mqp: the QP to attach
+ *
+ * Return zero if both were added.  Return EEXIST if the GID was already in
+ * the table but the QP was added.  Return ESRCH if the QP was already
+ * attached and neither structure was added.
+ */
+static int qib_mcast_add(struct qib_ibdev *dev, struct qib_ibport *ibp,
+			 struct qib_mcast *mcast, struct qib_mcast_qp *mqp)
+{
+	struct rb_node **n = &ibp->mcast_tree.rb_node;
+	struct rb_node *pn = NULL;
+	int ret;
+
+	spin_lock_irq(&ibp->lock);
+
+	while (*n) {
+		struct qib_mcast *tmcast;
+		struct qib_mcast_qp *p;
+
+		pn = *n;
+		tmcast = rb_entry(pn, struct qib_mcast, rb_node);
+
+		ret = memcmp(mcast->mgid.raw, tmcast->mgid.raw,
+			     sizeof(union ib_gid));
+		if (ret < 0) {
+			n = &pn->rb_left;
+			continue;
+		}
+		if (ret > 0) {
+			n = &pn->rb_right;
+			continue;
+		}
+
+		/* Search the QP list to see if this is already there. */
+		list_for_each_entry_rcu(p, &tmcast->qp_list, list) {
+			if (p->qp == mqp->qp) {
+				ret = ESRCH;
+				goto bail;
+			}
+		}
+		if (tmcast->n_attached == ib_qib_max_mcast_qp_attached) {
+			ret = ENOMEM;
+			goto bail;
+		}
+
+		tmcast->n_attached++;
+
+		list_add_tail_rcu(&mqp->list, &tmcast->qp_list);
+		ret = EEXIST;
+		goto bail;
+	}
+
+	spin_lock(&dev->n_mcast_grps_lock);
+	if (dev->n_mcast_grps_allocated == ib_qib_max_mcast_grps) {
+		spin_unlock(&dev->n_mcast_grps_lock);
+		ret = ENOMEM;
+		goto bail;
+	}
+
+	dev->n_mcast_grps_allocated++;
+	spin_unlock(&dev->n_mcast_grps_lock);
+
+	mcast->n_attached++;
+
+	list_add_tail_rcu(&mqp->list, &mcast->qp_list);
+
+	atomic_inc(&mcast->refcount);
+	rb_link_node(&mcast->rb_node, pn, n);
+	rb_insert_color(&mcast->rb_node, &ibp->mcast_tree);
+
+	ret = 0;
+
+bail:
+	spin_unlock_irq(&ibp->lock);
+
+	return ret;
+}
+
+int qib_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+	struct qib_qp *qp = to_iqp(ibqp);
+	struct qib_ibdev *dev = to_idev(ibqp->device);
+	struct qib_ibport *ibp;
+	struct qib_mcast *mcast;
+	struct qib_mcast_qp *mqp;
+	int ret;
+
+	if (ibqp->qp_num <= 1 || qp->state == IB_QPS_RESET) {
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	/*
+	 * Allocate data structures since its better to do this outside of
+	 * spin locks and it will most likely be needed.
+	 */
+	mcast = qib_mcast_alloc(gid);
+	if (mcast == NULL) {
+		ret = -ENOMEM;
+		goto bail;
+	}
+	mqp = qib_mcast_qp_alloc(qp);
+	if (mqp == NULL) {
+		qib_mcast_free(mcast);
+		ret = -ENOMEM;
+		goto bail;
+	}
+	ibp = to_iport(ibqp->device, qp->port_num);
+	switch (qib_mcast_add(dev, ibp, mcast, mqp)) {
+	case ESRCH:
+		/* Neither was used: OK to attach the same QP twice. */
+		qib_mcast_qp_free(mqp);
+		qib_mcast_free(mcast);
+		break;
+
+	case EEXIST:            /* The mcast wasn't used */
+		qib_mcast_free(mcast);
+		break;
+
+	case ENOMEM:
+		/* Exceeded the maximum number of mcast groups. */
+		qib_mcast_qp_free(mqp);
+		qib_mcast_free(mcast);
+		ret = -ENOMEM;
+		goto bail;
+
+	default:
+		break;
+	}
+
+	ret = 0;
+
+bail:
+	return ret;
+}
+
+int qib_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+	struct qib_qp *qp = to_iqp(ibqp);
+	struct qib_ibdev *dev = to_idev(ibqp->device);
+	struct qib_ibport *ibp = to_iport(ibqp->device, qp->port_num);
+	struct qib_mcast *mcast = NULL;
+	struct qib_mcast_qp *p, *tmp;
+	struct rb_node *n;
+	int last = 0;
+	int ret;
+
+	if (ibqp->qp_num <= 1 || qp->state == IB_QPS_RESET) {
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	spin_lock_irq(&ibp->lock);
+
+	/* Find the GID in the mcast table. */
+	n = ibp->mcast_tree.rb_node;
+	while (1) {
+		if (n == NULL) {
+			spin_unlock_irq(&ibp->lock);
+			ret = -EINVAL;
+			goto bail;
+		}
+
+		mcast = rb_entry(n, struct qib_mcast, rb_node);
+		ret = memcmp(gid->raw, mcast->mgid.raw,
+			     sizeof(union ib_gid));
+		if (ret < 0)
+			n = n->rb_left;
+		else if (ret > 0)
+			n = n->rb_right;
+		else
+			break;
+	}
+
+	/* Search the QP list. */
+	list_for_each_entry_safe(p, tmp, &mcast->qp_list, list) {
+		if (p->qp != qp)
+			continue;
+		/*
+		 * We found it, so remove it, but don't poison the forward
+		 * link until we are sure there are no list walkers.
+		 */
+		list_del_rcu(&p->list);
+		mcast->n_attached--;
+
+		/* If this was the last attached QP, remove the GID too. */
+		if (list_empty(&mcast->qp_list)) {
+			rb_erase(&mcast->rb_node, &ibp->mcast_tree);
+			last = 1;
+		}
+		break;
+	}
+
+	spin_unlock_irq(&ibp->lock);
+
+	if (p) {
+		/*
+		 * Wait for any list walkers to finish before freeing the
+		 * list element.
+		 */
+		wait_event(mcast->wait, atomic_read(&mcast->refcount) <= 1);
+		qib_mcast_qp_free(p);
+	}
+	if (last) {
+		atomic_dec(&mcast->refcount);
+		wait_event(mcast->wait, !atomic_read(&mcast->refcount));
+		qib_mcast_free(mcast);
+		spin_lock_irq(&dev->n_mcast_grps_lock);
+		dev->n_mcast_grps_allocated--;
+		spin_unlock_irq(&dev->n_mcast_grps_lock);
+	}
+
+	ret = 0;
+
+bail:
+	return ret;
+}
+
+int qib_mcast_tree_empty(struct qib_ibport *ibp)
+{
+	return ibp->mcast_tree.rb_node == NULL;
+}
diff --git a/drivers/infiniband/hw/qib/qib_wc_ppc64.c b/drivers/infiniband/hw/qib/qib_wc_ppc64.c
new file mode 100644
index 0000000..673cf4c
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_wc_ppc64.c
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * This file is conditionally built on PowerPC only.  Otherwise weak symbol
+ * versions of the functions exported from here are used.
+ */
+
+#include "qib.h"
+
+/**
+ * qib_enable_wc - enable write combining for MMIO writes to the device
+ * @dd: qlogic_ib device
+ *
+ * Nothing to do on PowerPC, so just return without error.
+ */
+int qib_enable_wc(struct qib_devdata *dd)
+{
+	return 0;
+}
+
+/**
+ * qib_unordered_wc - indicate whether write combining is unordered
+ *
+ * Because our performance depends on our ability to do write
+ * combining mmio writes in the most efficient way, we need to
+ * know if we are on a processor that may reorder stores when
+ * write combining.
+ */
+int qib_unordered_wc(void)
+{
+	return 1;
+}
diff --git a/drivers/infiniband/hw/qib/qib_wc_x86_64.c b/drivers/infiniband/hw/qib/qib_wc_x86_64.c
new file mode 100644
index 0000000..1d7281c
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_wc_x86_64.c
@@ -0,0 +1,171 @@
+/*
+ * Copyright (c) 2012 Intel Corporation. All rights reserved.
+ * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * This file is conditionally built on x86_64 only.  Otherwise weak symbol
+ * versions of the functions exported from here are used.
+ */
+
+#include <linux/pci.h>
+#include <asm/mtrr.h>
+#include <asm/processor.h>
+
+#include "qib.h"
+
+/**
+ * qib_enable_wc - enable write combining for MMIO writes to the device
+ * @dd: qlogic_ib device
+ *
+ * This routine is x86_64-specific; it twiddles the CPU's MTRRs to enable
+ * write combining.
+ */
+int qib_enable_wc(struct qib_devdata *dd)
+{
+	int ret = 0;
+	u64 pioaddr, piolen;
+	unsigned bits;
+	const unsigned long addr = pci_resource_start(dd->pcidev, 0);
+	const size_t len = pci_resource_len(dd->pcidev, 0);
+
+	/*
+	 * Set the PIO buffers to be WCCOMB, so we get HT bursts to the
+	 * chip.  Linux (possibly the hardware) requires it to be on a power
+	 * of 2 address matching the length (which has to be a power of 2).
+	 * For rev1, that means the base address, for rev2, it will be just
+	 * the PIO buffers themselves.
+	 * For chips with two sets of buffers, the calculations are
+	 * somewhat more complicated; we need to sum, and the piobufbase
+	 * register has both offsets, 2K in low 32 bits, 4K in high 32 bits.
+	 * The buffers are still packed, so a single range covers both.
+	 */
+	if (dd->piobcnt2k && dd->piobcnt4k) {
+		/* 2 sizes for chip */
+		unsigned long pio2kbase, pio4kbase;
+		pio2kbase = dd->piobufbase & 0xffffffffUL;
+		pio4kbase = (dd->piobufbase >> 32) & 0xffffffffUL;
+		if (pio2kbase < pio4kbase) {
+			/* all current chips */
+			pioaddr = addr + pio2kbase;
+			piolen = pio4kbase - pio2kbase +
+				dd->piobcnt4k * dd->align4k;
+		} else {
+			pioaddr = addr + pio4kbase;
+			piolen = pio2kbase - pio4kbase +
+				dd->piobcnt2k * dd->palign;
+		}
+	} else {  /* single buffer size (2K, currently) */
+		pioaddr = addr + dd->piobufbase;
+		piolen = dd->piobcnt2k * dd->palign +
+			dd->piobcnt4k * dd->align4k;
+	}
+
+	for (bits = 0; !(piolen & (1ULL << bits)); bits++)
+		/* do nothing */ ;
+
+	if (piolen != (1ULL << bits)) {
+		piolen >>= bits;
+		while (piolen >>= 1)
+			bits++;
+		piolen = 1ULL << (bits + 1);
+	}
+	if (pioaddr & (piolen - 1)) {
+		u64 atmp;
+		atmp = pioaddr & ~(piolen - 1);
+		if (atmp < addr || (atmp + piolen) > (addr + len)) {
+			qib_dev_err(dd,
+				"No way to align address/size (%llx/%llx), no WC mtrr\n",
+				(unsigned long long) atmp,
+				(unsigned long long) piolen << 1);
+			ret = -ENODEV;
+		} else {
+			pioaddr = atmp;
+			piolen <<= 1;
+		}
+	}
+
+	if (!ret) {
+		int cookie;
+
+		cookie = mtrr_add(pioaddr, piolen, MTRR_TYPE_WRCOMB, 0);
+		if (cookie < 0) {
+			{
+				qib_devinfo(dd->pcidev,
+					 "mtrr_add()  WC for PIO bufs failed (%d)\n",
+					 cookie);
+				ret = -EINVAL;
+			}
+		} else {
+			dd->wc_cookie = cookie;
+			dd->wc_base = (unsigned long) pioaddr;
+			dd->wc_len = (unsigned long) piolen;
+		}
+	}
+
+	return ret;
+}
+
+/**
+ * qib_disable_wc - disable write combining for MMIO writes to the device
+ * @dd: qlogic_ib device
+ */
+void qib_disable_wc(struct qib_devdata *dd)
+{
+	if (dd->wc_cookie) {
+		int r;
+
+		r = mtrr_del(dd->wc_cookie, dd->wc_base,
+			     dd->wc_len);
+		if (r < 0)
+			qib_devinfo(dd->pcidev,
+				 "mtrr_del(%lx, %lx, %lx) failed: %d\n",
+				 dd->wc_cookie, dd->wc_base,
+				 dd->wc_len, r);
+		dd->wc_cookie = 0; /* even on failure */
+	}
+}
+
+/**
+ * qib_unordered_wc - indicate whether write combining is ordered
+ *
+ * Because our performance depends on our ability to do write combining mmio
+ * writes in the most efficient way, we need to know if we are on an Intel
+ * or AMD x86_64 processor.  AMD x86_64 processors flush WC buffers out in
+ * the order completed, and so no special flushing is required to get
+ * correct ordering.  Intel processors, however, will flush write buffers
+ * out in "random" orders, and so explicit ordering is needed at times.
+ */
+int qib_unordered_wc(void)
+{
+	return boot_cpu_data.x86_vendor != X86_VENDOR_AMD;
+}
diff --git a/drivers/infiniband/hw/usnic/Kconfig b/drivers/infiniband/hw/usnic/Kconfig
new file mode 100644
index 0000000..29ab11c
--- /dev/null
+++ b/drivers/infiniband/hw/usnic/Kconfig
@@ -0,0 +1,10 @@
+config INFINIBAND_USNIC
+	tristate "Verbs support for Cisco VIC"
+	depends on NETDEVICES && ETHERNET && INET && PCI && INTEL_IOMMU
+	select ENIC
+	select NET_VENDOR_CISCO
+	select PCI_IOV
+	select INFINIBAND_USER_ACCESS
+	---help---
+	  This is a low-level driver for Cisco's Virtual Interface
+	  Cards (VICs), including the VIC 1240 and 1280 cards.
diff --git a/drivers/infiniband/hw/usnic/Makefile b/drivers/infiniband/hw/usnic/Makefile
new file mode 100644
index 0000000..99fb2db
--- /dev/null
+++ b/drivers/infiniband/hw/usnic/Makefile
@@ -0,0 +1,15 @@
+ccflags-y := -Idrivers/net/ethernet/cisco/enic
+
+obj-$(CONFIG_INFINIBAND_USNIC)+= usnic_verbs.o
+
+usnic_verbs-y=\
+usnic_fwd.o \
+usnic_transport.o \
+usnic_uiom.o \
+usnic_uiom_interval_tree.o \
+usnic_vnic.o \
+usnic_ib_main.o \
+usnic_ib_qp_grp.o \
+usnic_ib_sysfs.o \
+usnic_ib_verbs.o \
+usnic_debugfs.o \
diff --git a/drivers/infiniband/hw/usnic/usnic.h b/drivers/infiniband/hw/usnic/usnic.h
new file mode 100644
index 0000000..5be13d8
--- /dev/null
+++ b/drivers/infiniband/hw/usnic/usnic.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved.
+ *
+ * This program is free software; you may redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef USNIC_H_
+#define USNIC_H_
+
+#define DRV_NAME	"usnic_verbs"
+
+#define PCI_DEVICE_ID_CISCO_VIC_USPACE_NIC	0x00cf	/* User space NIC */
+
+#define DRV_VERSION    "1.0.3"
+#define DRV_RELDATE    "December 19, 2013"
+
+#endif /* USNIC_H_ */
diff --git a/drivers/infiniband/hw/usnic/usnic_abi.h b/drivers/infiniband/hw/usnic/usnic_abi.h
new file mode 100644
index 0000000..04a6622
--- /dev/null
+++ b/drivers/infiniband/hw/usnic/usnic_abi.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved.
+ *
+ * This program is free software; you may redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+
+#ifndef USNIC_ABI_H
+#define USNIC_ABI_H
+
+/* ABI between userspace and kernel */
+#define USNIC_UVERBS_ABI_VERSION	4
+
+#define USNIC_QP_GRP_MAX_WQS		8
+#define USNIC_QP_GRP_MAX_RQS		8
+#define USNIC_QP_GRP_MAX_CQS		16
+
+enum usnic_transport_type {
+	USNIC_TRANSPORT_UNKNOWN		= 0,
+	USNIC_TRANSPORT_ROCE_CUSTOM	= 1,
+	USNIC_TRANSPORT_IPV4_UDP	= 2,
+	USNIC_TRANSPORT_MAX		= 3,
+};
+
+struct usnic_transport_spec {
+	enum usnic_transport_type	trans_type;
+	union {
+		struct {
+			uint16_t	port_num;
+		} usnic_roce;
+		struct {
+			uint32_t	sock_fd;
+		} udp;
+	};
+};
+
+struct usnic_ib_create_qp_cmd {
+	struct usnic_transport_spec	spec;
+};
+
+/*TODO: Future - usnic_modify_qp needs to pass in generic filters */
+struct usnic_ib_create_qp_resp {
+	u32				vfid;
+	u32				qp_grp_id;
+	u64				bar_bus_addr;
+	u32				bar_len;
+/*
+ * WQ, RQ, CQ are explicity specified bc exposing a generic resources inteface
+ * expands the scope of ABI to many files.
+ */
+	u32				wq_cnt;
+	u32				rq_cnt;
+	u32				cq_cnt;
+	u32				wq_idx[USNIC_QP_GRP_MAX_WQS];
+	u32				rq_idx[USNIC_QP_GRP_MAX_RQS];
+	u32				cq_idx[USNIC_QP_GRP_MAX_CQS];
+	u32				transport;
+	u32				reserved[9];
+};
+
+#endif /* USNIC_ABI_H */
diff --git a/drivers/infiniband/hw/usnic/usnic_common_pkt_hdr.h b/drivers/infiniband/hw/usnic/usnic_common_pkt_hdr.h
new file mode 100644
index 0000000..3935672
--- /dev/null
+++ b/drivers/infiniband/hw/usnic/usnic_common_pkt_hdr.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved.
+ *
+ * This program is free software; you may redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef USNIC_CMN_PKT_HDR_H
+#define USNIC_CMN_PKT_HDR_H
+
+#define USNIC_ROCE_ETHERTYPE		(0x8915)
+#define USNIC_ROCE_GRH_VER              (8)
+#define USNIC_PROTO_VER                 (1)
+#define USNIC_ROCE_GRH_VER_SHIFT        (4)
+
+#endif /* USNIC_COMMON_PKT_HDR_H */
diff --git a/drivers/infiniband/hw/usnic/usnic_common_util.h b/drivers/infiniband/hw/usnic/usnic_common_util.h
new file mode 100644
index 0000000..9d737ed
--- /dev/null
+++ b/drivers/infiniband/hw/usnic/usnic_common_util.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved.
+ *
+ * This program is free software; you may redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef USNIC_CMN_UTIL_H
+#define USNIC_CMN_UTIL_H
+
+static inline void
+usnic_mac_to_gid(const char *const mac, char *raw_gid)
+{
+	raw_gid[0] = 0xfe;
+	raw_gid[1] = 0x80;
+	memset(&raw_gid[2], 0, 6);
+	raw_gid[8] = mac[0]^2;
+	raw_gid[9] = mac[1];
+	raw_gid[10] = mac[2];
+	raw_gid[11] = 0xff;
+	raw_gid[12] = 0xfe;
+	raw_gid[13] = mac[3];
+	raw_gid[14] = mac[4];
+	raw_gid[15] = mac[5];
+}
+
+static inline void
+usnic_mac_ip_to_gid(const char *const mac, const __be32 inaddr, char *raw_gid)
+{
+	raw_gid[0] = 0xfe;
+	raw_gid[1] = 0x80;
+	memset(&raw_gid[2], 0, 2);
+	memcpy(&raw_gid[4], &inaddr, 4);
+	raw_gid[8] = mac[0]^2;
+	raw_gid[9] = mac[1];
+	raw_gid[10] = mac[2];
+	raw_gid[11] = 0xff;
+	raw_gid[12] = 0xfe;
+	raw_gid[13] = mac[3];
+	raw_gid[14] = mac[4];
+	raw_gid[15] = mac[5];
+}
+
+static inline void
+usnic_write_gid_if_id_from_mac(char *mac, char *raw_gid)
+{
+	raw_gid[8] = mac[0]^2;
+	raw_gid[9] = mac[1];
+	raw_gid[10] = mac[2];
+	raw_gid[11] = 0xff;
+	raw_gid[12] = 0xfe;
+	raw_gid[13] = mac[3];
+	raw_gid[14] = mac[4];
+	raw_gid[15] = mac[5];
+}
+
+#endif /* USNIC_COMMON_UTIL_H */
diff --git a/drivers/infiniband/hw/usnic/usnic_debugfs.c b/drivers/infiniband/hw/usnic/usnic_debugfs.c
new file mode 100644
index 0000000..5d13860
--- /dev/null
+++ b/drivers/infiniband/hw/usnic/usnic_debugfs.c
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved.
+ *
+ * This program is free software; you may redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/debugfs.h>
+#include <linux/module.h>
+
+#include "usnic.h"
+#include "usnic_log.h"
+#include "usnic_debugfs.h"
+#include "usnic_ib_qp_grp.h"
+#include "usnic_transport.h"
+
+static struct dentry *debugfs_root;
+static struct dentry *flows_dentry;
+
+static ssize_t usnic_debugfs_buildinfo_read(struct file *f, char __user *data,
+						size_t count, loff_t *ppos)
+{
+	char buf[500];
+	int res;
+
+	if (*ppos > 0)
+		return 0;
+
+	res = scnprintf(buf, sizeof(buf),
+			"version:       %s\n"
+			"build date:    %s\n",
+			DRV_VERSION, DRV_RELDATE);
+
+	return simple_read_from_buffer(data, count, ppos, buf, res);
+}
+
+static const struct file_operations usnic_debugfs_buildinfo_ops = {
+	.owner = THIS_MODULE,
+	.open = simple_open,
+	.read = usnic_debugfs_buildinfo_read
+};
+
+static ssize_t flowinfo_read(struct file *f, char __user *data,
+				size_t count, loff_t *ppos)
+{
+	struct usnic_ib_qp_grp_flow *qp_flow;
+	int n;
+	int left;
+	char *ptr;
+	char buf[512];
+
+	qp_flow = f->private_data;
+	ptr = buf;
+	left = count;
+
+	if (*ppos > 0)
+		return 0;
+
+	spin_lock(&qp_flow->qp_grp->lock);
+	n = scnprintf(ptr, left,
+			"QP Grp ID: %d Transport: %s ",
+			qp_flow->qp_grp->grp_id,
+			usnic_transport_to_str(qp_flow->trans_type));
+	UPDATE_PTR_LEFT(n, ptr, left);
+	if (qp_flow->trans_type == USNIC_TRANSPORT_ROCE_CUSTOM) {
+		n = scnprintf(ptr, left, "Port_Num:%hu\n",
+					qp_flow->usnic_roce.port_num);
+		UPDATE_PTR_LEFT(n, ptr, left);
+	} else if (qp_flow->trans_type == USNIC_TRANSPORT_IPV4_UDP) {
+		n = usnic_transport_sock_to_str(ptr, left,
+				qp_flow->udp.sock);
+		UPDATE_PTR_LEFT(n, ptr, left);
+		n = scnprintf(ptr, left, "\n");
+		UPDATE_PTR_LEFT(n, ptr, left);
+	}
+	spin_unlock(&qp_flow->qp_grp->lock);
+
+	return simple_read_from_buffer(data, count, ppos, buf, ptr - buf);
+}
+
+static const struct file_operations flowinfo_ops = {
+	.owner = THIS_MODULE,
+	.open = simple_open,
+	.read = flowinfo_read,
+};
+
+void usnic_debugfs_init(void)
+{
+	debugfs_root = debugfs_create_dir(DRV_NAME, NULL);
+	if (IS_ERR(debugfs_root)) {
+		usnic_err("Failed to create debugfs root dir, check if debugfs is enabled in kernel configuration\n");
+		goto out_clear_root;
+	}
+
+	flows_dentry = debugfs_create_dir("flows", debugfs_root);
+	if (IS_ERR_OR_NULL(flows_dentry)) {
+		usnic_err("Failed to create debugfs flow dir with err %ld\n",
+				PTR_ERR(flows_dentry));
+		goto out_free_root;
+	}
+
+	debugfs_create_file("build-info", S_IRUGO, debugfs_root,
+				NULL, &usnic_debugfs_buildinfo_ops);
+	return;
+
+out_free_root:
+	debugfs_remove_recursive(debugfs_root);
+out_clear_root:
+	debugfs_root = NULL;
+}
+
+void usnic_debugfs_exit(void)
+{
+	if (!debugfs_root)
+		return;
+
+	debugfs_remove_recursive(debugfs_root);
+	debugfs_root = NULL;
+}
+
+void usnic_debugfs_flow_add(struct usnic_ib_qp_grp_flow *qp_flow)
+{
+	if (IS_ERR_OR_NULL(flows_dentry))
+		return;
+
+	scnprintf(qp_flow->dentry_name, sizeof(qp_flow->dentry_name),
+			"%u", qp_flow->flow->flow_id);
+	qp_flow->dbgfs_dentry = debugfs_create_file(qp_flow->dentry_name,
+							S_IRUGO,
+							flows_dentry,
+							qp_flow,
+							&flowinfo_ops);
+	if (IS_ERR_OR_NULL(qp_flow->dbgfs_dentry)) {
+		usnic_err("Failed to create dbg fs entry for flow %u\n",
+				qp_flow->flow->flow_id);
+	}
+}
+
+void usnic_debugfs_flow_remove(struct usnic_ib_qp_grp_flow *qp_flow)
+{
+	if (!IS_ERR_OR_NULL(qp_flow->dbgfs_dentry))
+		debugfs_remove(qp_flow->dbgfs_dentry);
+}
diff --git a/drivers/infiniband/hw/usnic/usnic_debugfs.h b/drivers/infiniband/hw/usnic/usnic_debugfs.h
new file mode 100644
index 0000000..4087d24
--- /dev/null
+++ b/drivers/infiniband/hw/usnic/usnic_debugfs.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved.
+ *
+ * This program is free software; you may redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#ifndef USNIC_DEBUGFS_H_
+#define USNIC_DEBUGFS_H_
+
+#include "usnic_ib_qp_grp.h"
+
+void usnic_debugfs_init(void);
+
+void usnic_debugfs_exit(void);
+void usnic_debugfs_flow_add(struct usnic_ib_qp_grp_flow *qp_flow);
+void usnic_debugfs_flow_remove(struct usnic_ib_qp_grp_flow *qp_flow);
+
+#endif /*!USNIC_DEBUGFS_H_ */
diff --git a/drivers/infiniband/hw/usnic/usnic_fwd.c b/drivers/infiniband/hw/usnic/usnic_fwd.c
new file mode 100644
index 0000000..e3c9bd9
--- /dev/null
+++ b/drivers/infiniband/hw/usnic/usnic_fwd.c
@@ -0,0 +1,350 @@
+/*
+ * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved.
+ *
+ * This program is free software; you may redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/netdevice.h>
+#include <linux/pci.h>
+
+#include "enic_api.h"
+#include "usnic_common_pkt_hdr.h"
+#include "usnic_fwd.h"
+#include "usnic_log.h"
+
+static int usnic_fwd_devcmd_locked(struct usnic_fwd_dev *ufdev, int vnic_idx,
+					enum vnic_devcmd_cmd cmd, u64 *a0,
+					u64 *a1)
+{
+	int status;
+	struct net_device *netdev = ufdev->netdev;
+
+	lockdep_assert_held(&ufdev->lock);
+
+	status = enic_api_devcmd_proxy_by_index(netdev,
+			vnic_idx,
+			cmd,
+			a0, a1,
+			1000);
+	if (status) {
+		if (status == ERR_EINVAL && cmd == CMD_DEL_FILTER) {
+			usnic_dbg("Dev %s vnic idx %u cmd %u already deleted",
+					ufdev->name, vnic_idx, cmd);
+		} else {
+			usnic_err("Dev %s vnic idx %u cmd %u failed with status %d\n",
+					ufdev->name, vnic_idx, cmd,
+					status);
+		}
+	} else {
+		usnic_dbg("Dev %s vnic idx %u cmd %u success",
+				ufdev->name, vnic_idx, cmd);
+	}
+
+	return status;
+}
+
+static int usnic_fwd_devcmd(struct usnic_fwd_dev *ufdev, int vnic_idx,
+				enum vnic_devcmd_cmd cmd, u64 *a0, u64 *a1)
+{
+	int status;
+
+	spin_lock(&ufdev->lock);
+	status = usnic_fwd_devcmd_locked(ufdev, vnic_idx, cmd, a0, a1);
+	spin_unlock(&ufdev->lock);
+
+	return status;
+}
+
+struct usnic_fwd_dev *usnic_fwd_dev_alloc(struct pci_dev *pdev)
+{
+	struct usnic_fwd_dev *ufdev;
+
+	ufdev = kzalloc(sizeof(*ufdev), GFP_KERNEL);
+	if (!ufdev)
+		return NULL;
+
+	ufdev->pdev = pdev;
+	ufdev->netdev = pci_get_drvdata(pdev);
+	spin_lock_init(&ufdev->lock);
+	strncpy(ufdev->name, netdev_name(ufdev->netdev),
+			sizeof(ufdev->name) - 1);
+
+	return ufdev;
+}
+
+void usnic_fwd_dev_free(struct usnic_fwd_dev *ufdev)
+{
+	kfree(ufdev);
+}
+
+void usnic_fwd_set_mac(struct usnic_fwd_dev *ufdev, char mac[ETH_ALEN])
+{
+	spin_lock(&ufdev->lock);
+	memcpy(&ufdev->mac, mac, sizeof(ufdev->mac));
+	spin_unlock(&ufdev->lock);
+}
+
+int usnic_fwd_add_ipaddr(struct usnic_fwd_dev *ufdev, __be32 inaddr)
+{
+	int status;
+
+	spin_lock(&ufdev->lock);
+	if (ufdev->inaddr == 0) {
+		ufdev->inaddr = inaddr;
+		status = 0;
+	} else {
+		status = -EFAULT;
+	}
+	spin_unlock(&ufdev->lock);
+
+	return status;
+}
+
+void usnic_fwd_del_ipaddr(struct usnic_fwd_dev *ufdev)
+{
+	spin_lock(&ufdev->lock);
+	ufdev->inaddr = 0;
+	spin_unlock(&ufdev->lock);
+}
+
+void usnic_fwd_carrier_up(struct usnic_fwd_dev *ufdev)
+{
+	spin_lock(&ufdev->lock);
+	ufdev->link_up = 1;
+	spin_unlock(&ufdev->lock);
+}
+
+void usnic_fwd_carrier_down(struct usnic_fwd_dev *ufdev)
+{
+	spin_lock(&ufdev->lock);
+	ufdev->link_up = 0;
+	spin_unlock(&ufdev->lock);
+}
+
+void usnic_fwd_set_mtu(struct usnic_fwd_dev *ufdev, unsigned int mtu)
+{
+	spin_lock(&ufdev->lock);
+	ufdev->mtu = mtu;
+	spin_unlock(&ufdev->lock);
+}
+
+static int usnic_fwd_dev_ready_locked(struct usnic_fwd_dev *ufdev)
+{
+	lockdep_assert_held(&ufdev->lock);
+
+	if (!ufdev->link_up)
+		return -EPERM;
+
+	return 0;
+}
+
+static int validate_filter_locked(struct usnic_fwd_dev *ufdev,
+					struct filter *filter)
+{
+
+	lockdep_assert_held(&ufdev->lock);
+
+	if (filter->type == FILTER_IPV4_5TUPLE) {
+		if (!(filter->u.ipv4.flags & FILTER_FIELD_5TUP_DST_AD))
+			return -EACCES;
+		if (!(filter->u.ipv4.flags & FILTER_FIELD_5TUP_DST_PT))
+			return -EBUSY;
+		else if (ufdev->inaddr == 0)
+			return -EINVAL;
+		else if (filter->u.ipv4.dst_port == 0)
+			return -ERANGE;
+		else if (ntohl(ufdev->inaddr) != filter->u.ipv4.dst_addr)
+			return -EFAULT;
+		else
+			return 0;
+	}
+
+	return 0;
+}
+
+static void fill_tlv(struct filter_tlv *tlv, struct filter *filter,
+		struct filter_action *action)
+{
+	tlv->type = CLSF_TLV_FILTER;
+	tlv->length = sizeof(struct filter);
+	*((struct filter *)&tlv->val) = *filter;
+
+	tlv = (struct filter_tlv *)((char *)tlv + sizeof(struct filter_tlv) +
+			sizeof(struct filter));
+	tlv->type = CLSF_TLV_ACTION;
+	tlv->length = sizeof(struct filter_action);
+	*((struct filter_action *)&tlv->val) = *action;
+}
+
+struct usnic_fwd_flow*
+usnic_fwd_alloc_flow(struct usnic_fwd_dev *ufdev, struct filter *filter,
+				struct usnic_filter_action *uaction)
+{
+	struct filter_tlv *tlv;
+	struct pci_dev *pdev;
+	struct usnic_fwd_flow *flow;
+	uint64_t a0, a1;
+	uint64_t tlv_size;
+	dma_addr_t tlv_pa;
+	int status;
+
+	pdev = ufdev->pdev;
+	tlv_size = (2*sizeof(struct filter_tlv) + sizeof(struct filter) +
+			sizeof(struct filter_action));
+
+	flow = kzalloc(sizeof(*flow), GFP_ATOMIC);
+	if (!flow)
+		return ERR_PTR(-ENOMEM);
+
+	tlv = pci_alloc_consistent(pdev, tlv_size, &tlv_pa);
+	if (!tlv) {
+		usnic_err("Failed to allocate memory\n");
+		status = -ENOMEM;
+		goto out_free_flow;
+	}
+
+	fill_tlv(tlv, filter, &uaction->action);
+
+	spin_lock(&ufdev->lock);
+	status = usnic_fwd_dev_ready_locked(ufdev);
+	if (status) {
+		usnic_err("Forwarding dev %s not ready with status %d\n",
+				ufdev->name, status);
+		goto out_free_tlv;
+	}
+
+	status = validate_filter_locked(ufdev, filter);
+	if (status) {
+		usnic_err("Failed to validate filter with status %d\n",
+				status);
+		goto out_free_tlv;
+	}
+
+	/* Issue Devcmd */
+	a0 = tlv_pa;
+	a1 = tlv_size;
+	status = usnic_fwd_devcmd_locked(ufdev, uaction->vnic_idx,
+						CMD_ADD_FILTER, &a0, &a1);
+	if (status) {
+		usnic_err("VF %s Filter add failed with status:%d",
+				ufdev->name, status);
+		status = -EFAULT;
+		goto out_free_tlv;
+	} else {
+		usnic_dbg("VF %s FILTER ID:%llu", ufdev->name, a0);
+	}
+
+	flow->flow_id = (uint32_t) a0;
+	flow->vnic_idx = uaction->vnic_idx;
+	flow->ufdev = ufdev;
+
+out_free_tlv:
+	spin_unlock(&ufdev->lock);
+	pci_free_consistent(pdev, tlv_size, tlv, tlv_pa);
+	if (!status)
+		return flow;
+out_free_flow:
+	kfree(flow);
+	return ERR_PTR(status);
+}
+
+int usnic_fwd_dealloc_flow(struct usnic_fwd_flow *flow)
+{
+	int status;
+	u64 a0, a1;
+
+	a0 = flow->flow_id;
+
+	status = usnic_fwd_devcmd(flow->ufdev, flow->vnic_idx,
+					CMD_DEL_FILTER, &a0, &a1);
+	if (status) {
+		if (status == ERR_EINVAL) {
+			usnic_dbg("Filter %u already deleted for VF Idx %u pf: %s status: %d",
+					flow->flow_id, flow->vnic_idx,
+					flow->ufdev->name, status);
+		} else {
+			usnic_err("PF %s VF Idx %u Filter: %u FILTER DELETE failed with status %d",
+					flow->ufdev->name, flow->vnic_idx,
+					flow->flow_id, status);
+		}
+		status = 0;
+		/*
+		 * Log the error and fake success to the caller because if
+		 * a flow fails to be deleted in the firmware, it is an
+		 * unrecoverable error.
+		 */
+	} else {
+		usnic_dbg("PF %s VF Idx %u Filter: %u FILTER DELETED",
+				flow->ufdev->name, flow->vnic_idx,
+				flow->flow_id);
+	}
+
+	kfree(flow);
+	return status;
+}
+
+int usnic_fwd_enable_qp(struct usnic_fwd_dev *ufdev, int vnic_idx, int qp_idx)
+{
+	int status;
+	struct net_device *pf_netdev;
+	u64 a0, a1;
+
+	pf_netdev = ufdev->netdev;
+	a0 = qp_idx;
+	a1 = CMD_QP_RQWQ;
+
+	status = usnic_fwd_devcmd(ufdev, vnic_idx, CMD_QP_ENABLE,
+						&a0, &a1);
+	if (status) {
+		usnic_err("PF %s VNIC Index %u RQ Index: %u ENABLE Failed with status %d",
+				netdev_name(pf_netdev),
+				vnic_idx,
+				qp_idx,
+				status);
+	} else {
+		usnic_dbg("PF %s VNIC Index %u RQ Index: %u ENABLED",
+				netdev_name(pf_netdev),
+				vnic_idx, qp_idx);
+	}
+
+	return status;
+}
+
+int usnic_fwd_disable_qp(struct usnic_fwd_dev *ufdev, int vnic_idx, int qp_idx)
+{
+	int status;
+	u64 a0, a1;
+	struct net_device *pf_netdev;
+
+	pf_netdev = ufdev->netdev;
+	a0 = qp_idx;
+	a1 = CMD_QP_RQWQ;
+
+	status = usnic_fwd_devcmd(ufdev, vnic_idx, CMD_QP_DISABLE,
+			&a0, &a1);
+	if (status) {
+		usnic_err("PF %s VNIC Index %u RQ Index: %u DISABLE Failed with status %d",
+				netdev_name(pf_netdev),
+				vnic_idx,
+				qp_idx,
+				status);
+	} else {
+		usnic_dbg("PF %s VNIC Index %u RQ Index: %u DISABLED",
+				netdev_name(pf_netdev),
+				vnic_idx,
+				qp_idx);
+	}
+
+	return status;
+}
diff --git a/drivers/infiniband/hw/usnic/usnic_fwd.h b/drivers/infiniband/hw/usnic/usnic_fwd.h
new file mode 100644
index 0000000..93713a2
--- /dev/null
+++ b/drivers/infiniband/hw/usnic/usnic_fwd.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved.
+ *
+ * This program is free software; you may redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef USNIC_FWD_H_
+#define USNIC_FWD_H_
+
+#include <linux/if.h>
+#include <linux/netdevice.h>
+#include <linux/pci.h>
+#include <linux/in.h>
+
+#include "usnic_abi.h"
+#include "usnic_common_pkt_hdr.h"
+#include "vnic_devcmd.h"
+
+struct usnic_fwd_dev {
+	struct pci_dev			*pdev;
+	struct net_device		*netdev;
+	spinlock_t			lock;
+	/*
+	 * The following fields can be read directly off the device.
+	 * However, they should be set by a accessor function, except name,
+	 * which cannot be changed.
+	 */
+	bool				link_up;
+	char				mac[ETH_ALEN];
+	unsigned int			mtu;
+	__be32				inaddr;
+	char				name[IFNAMSIZ+1];
+};
+
+struct usnic_fwd_flow {
+	uint32_t			flow_id;
+	struct usnic_fwd_dev		*ufdev;
+	unsigned int			vnic_idx;
+};
+
+struct usnic_filter_action {
+	int				vnic_idx;
+	struct filter_action		action;
+};
+
+struct usnic_fwd_dev *usnic_fwd_dev_alloc(struct pci_dev *pdev);
+void usnic_fwd_dev_free(struct usnic_fwd_dev *ufdev);
+
+void usnic_fwd_set_mac(struct usnic_fwd_dev *ufdev, char mac[ETH_ALEN]);
+int usnic_fwd_add_ipaddr(struct usnic_fwd_dev *ufdev, __be32 inaddr);
+void usnic_fwd_del_ipaddr(struct usnic_fwd_dev *ufdev);
+void usnic_fwd_carrier_up(struct usnic_fwd_dev *ufdev);
+void usnic_fwd_carrier_down(struct usnic_fwd_dev *ufdev);
+void usnic_fwd_set_mtu(struct usnic_fwd_dev *ufdev, unsigned int mtu);
+
+/*
+ * Allocate a flow on this forwarding device. Whoever calls this function,
+ * must monitor netdev events on ufdev's netdevice. If NETDEV_REBOOT or
+ * NETDEV_DOWN is seen, flow will no longer function and must be
+ * immediately freed by calling usnic_dealloc_flow.
+ */
+struct usnic_fwd_flow*
+usnic_fwd_alloc_flow(struct usnic_fwd_dev *ufdev, struct filter *filter,
+				struct usnic_filter_action *action);
+int usnic_fwd_dealloc_flow(struct usnic_fwd_flow *flow);
+int usnic_fwd_enable_qp(struct usnic_fwd_dev *ufdev, int vnic_idx, int qp_idx);
+int usnic_fwd_disable_qp(struct usnic_fwd_dev *ufdev, int vnic_idx, int qp_idx);
+
+static inline void usnic_fwd_init_usnic_filter(struct filter *filter,
+						uint32_t usnic_id)
+{
+	filter->type = FILTER_USNIC_ID;
+	filter->u.usnic.ethtype = USNIC_ROCE_ETHERTYPE;
+	filter->u.usnic.flags = FILTER_FIELD_USNIC_ETHTYPE |
+				FILTER_FIELD_USNIC_ID |
+				FILTER_FIELD_USNIC_PROTO;
+	filter->u.usnic.proto_version = (USNIC_ROCE_GRH_VER <<
+					 USNIC_ROCE_GRH_VER_SHIFT) |
+					 USNIC_PROTO_VER;
+	filter->u.usnic.usnic_id = usnic_id;
+}
+
+static inline void usnic_fwd_init_udp_filter(struct filter *filter,
+						uint32_t daddr, uint16_t dport)
+{
+	filter->type = FILTER_IPV4_5TUPLE;
+	filter->u.ipv4.flags = FILTER_FIELD_5TUP_PROTO;
+	filter->u.ipv4.protocol = PROTO_UDP;
+
+	if (daddr) {
+		filter->u.ipv4.flags |= FILTER_FIELD_5TUP_DST_AD;
+		filter->u.ipv4.dst_addr = daddr;
+	}
+
+	if (dport) {
+		filter->u.ipv4.flags |= FILTER_FIELD_5TUP_DST_PT;
+		filter->u.ipv4.dst_port = dport;
+	}
+}
+
+#endif /* !USNIC_FWD_H_ */
diff --git a/drivers/infiniband/hw/usnic/usnic_ib.h b/drivers/infiniband/hw/usnic/usnic_ib.h
new file mode 100644
index 0000000..e5a9297
--- /dev/null
+++ b/drivers/infiniband/hw/usnic/usnic_ib.h
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved.
+ *
+ * This program is free software; you may redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef USNIC_IB_H_
+#define USNIC_IB_H_
+
+#include <linux/iommu.h>
+#include <linux/netdevice.h>
+
+#include <rdma/ib_verbs.h>
+
+
+#include "usnic.h"
+#include "usnic_abi.h"
+#include "usnic_vnic.h"
+
+#define USNIC_IB_PORT_CNT		1
+#define USNIC_IB_NUM_COMP_VECTORS	1
+
+extern unsigned int usnic_ib_share_vf;
+
+struct usnic_ib_ucontext {
+	struct ib_ucontext		ibucontext;
+	/* Protected by usnic_ib_dev->usdev_lock */
+	struct list_head		qp_grp_list;
+	struct list_head		link;
+};
+
+struct usnic_ib_pd {
+	struct ib_pd			ibpd;
+	struct usnic_uiom_pd		*umem_pd;
+};
+
+struct usnic_ib_mr {
+	struct ib_mr			ibmr;
+	struct usnic_uiom_reg		*umem;
+};
+
+struct usnic_ib_dev {
+	struct ib_device		ib_dev;
+	struct pci_dev			*pdev;
+	struct net_device		*netdev;
+	struct usnic_fwd_dev		*ufdev;
+	struct list_head		ib_dev_link;
+	struct list_head		vf_dev_list;
+	struct list_head		ctx_list;
+	struct mutex			usdev_lock;
+
+	/* provisioning information */
+	struct kref			vf_cnt;
+	unsigned int			vf_res_cnt[USNIC_VNIC_RES_TYPE_MAX];
+
+	/* sysfs vars for QPN reporting */
+	struct kobject *qpn_kobj;
+};
+
+struct usnic_ib_vf {
+	struct usnic_ib_dev		*pf;
+	spinlock_t			lock;
+	struct usnic_vnic		*vnic;
+	unsigned int			qp_grp_ref_cnt;
+	struct usnic_ib_pd		*pd;
+	struct list_head		link;
+};
+
+static inline
+struct usnic_ib_dev *to_usdev(struct ib_device *ibdev)
+{
+	return container_of(ibdev, struct usnic_ib_dev, ib_dev);
+}
+
+static inline
+struct usnic_ib_ucontext *to_ucontext(struct ib_ucontext *ibucontext)
+{
+	return container_of(ibucontext, struct usnic_ib_ucontext, ibucontext);
+}
+
+static inline
+struct usnic_ib_pd *to_upd(struct ib_pd *ibpd)
+{
+	return container_of(ibpd, struct usnic_ib_pd, ibpd);
+}
+
+static inline
+struct usnic_ib_ucontext *to_uucontext(struct ib_ucontext *ibucontext)
+{
+	return container_of(ibucontext, struct usnic_ib_ucontext, ibucontext);
+}
+
+static inline
+struct usnic_ib_mr *to_umr(struct ib_mr *ibmr)
+{
+	return container_of(ibmr, struct usnic_ib_mr, ibmr);
+}
+void usnic_ib_log_vf(struct usnic_ib_vf *vf);
+
+#define UPDATE_PTR_LEFT(N, P, L)			\
+do {							\
+	L -= (N);					\
+	P += (N);					\
+} while (0)
+
+#endif /* USNIC_IB_H_ */
diff --git a/drivers/infiniband/hw/usnic/usnic_ib_main.c b/drivers/infiniband/hw/usnic/usnic_ib_main.c
new file mode 100644
index 0000000..0d0f986
--- /dev/null
+++ b/drivers/infiniband/hw/usnic/usnic_ib_main.c
@@ -0,0 +1,682 @@
+/*
+ * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved.
+ *
+ * This program is free software; you may redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Author: Upinder Malhi <umalhi@cisco.com>
+ * Author: Anant Deepak <anadeepa@cisco.com>
+ * Author: Cesare Cantu' <cantuc@cisco.com>
+ * Author: Jeff Squyres <jsquyres@cisco.com>
+ * Author: Kiran Thirumalai <kithirum@cisco.com>
+ * Author: Xuyang Wang <xuywang@cisco.com>
+ * Author: Reese Faucette <rfaucett@cisco.com>
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/inetdevice.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/errno.h>
+#include <linux/pci.h>
+#include <linux/netdevice.h>
+
+#include <rdma/ib_user_verbs.h>
+#include <rdma/ib_addr.h>
+
+#include "usnic_abi.h"
+#include "usnic_common_util.h"
+#include "usnic_ib.h"
+#include "usnic_ib_qp_grp.h"
+#include "usnic_log.h"
+#include "usnic_fwd.h"
+#include "usnic_debugfs.h"
+#include "usnic_ib_verbs.h"
+#include "usnic_transport.h"
+#include "usnic_uiom.h"
+#include "usnic_ib_sysfs.h"
+
+unsigned int usnic_log_lvl = USNIC_LOG_LVL_ERR;
+unsigned int usnic_ib_share_vf = 1;
+
+static const char usnic_version[] =
+	DRV_NAME ": Cisco VIC (USNIC) Verbs Driver v"
+	DRV_VERSION " (" DRV_RELDATE ")\n";
+
+static DEFINE_MUTEX(usnic_ib_ibdev_list_lock);
+static LIST_HEAD(usnic_ib_ibdev_list);
+
+/* Callback dump funcs */
+static int usnic_ib_dump_vf_hdr(void *obj, char *buf, int buf_sz)
+{
+	struct usnic_ib_vf *vf = obj;
+	return scnprintf(buf, buf_sz, "PF: %s ", vf->pf->ib_dev.name);
+}
+/* End callback dump funcs */
+
+static void usnic_ib_dump_vf(struct usnic_ib_vf *vf, char *buf, int buf_sz)
+{
+	usnic_vnic_dump(vf->vnic, buf, buf_sz, vf,
+			usnic_ib_dump_vf_hdr,
+			usnic_ib_qp_grp_dump_hdr, usnic_ib_qp_grp_dump_rows);
+}
+
+void usnic_ib_log_vf(struct usnic_ib_vf *vf)
+{
+	char buf[1000];
+	usnic_ib_dump_vf(vf, buf, sizeof(buf));
+	usnic_dbg("%s\n", buf);
+}
+
+/* Start of netdev section */
+static inline const char *usnic_ib_netdev_event_to_string(unsigned long event)
+{
+	const char *event2str[] = {"NETDEV_NONE", "NETDEV_UP", "NETDEV_DOWN",
+		"NETDEV_REBOOT", "NETDEV_CHANGE",
+		"NETDEV_REGISTER", "NETDEV_UNREGISTER", "NETDEV_CHANGEMTU",
+		"NETDEV_CHANGEADDR", "NETDEV_GOING_DOWN", "NETDEV_FEAT_CHANGE",
+		"NETDEV_BONDING_FAILOVER", "NETDEV_PRE_UP",
+		"NETDEV_PRE_TYPE_CHANGE", "NETDEV_POST_TYPE_CHANGE",
+		"NETDEV_POST_INT", "NETDEV_UNREGISTER_FINAL", "NETDEV_RELEASE",
+		"NETDEV_NOTIFY_PEERS", "NETDEV_JOIN"
+	};
+
+	if (event >= ARRAY_SIZE(event2str))
+		return "UNKNOWN_NETDEV_EVENT";
+	else
+		return event2str[event];
+}
+
+static void usnic_ib_qp_grp_modify_active_to_err(struct usnic_ib_dev *us_ibdev)
+{
+	struct usnic_ib_ucontext *ctx;
+	struct usnic_ib_qp_grp *qp_grp;
+	enum ib_qp_state cur_state;
+	int status;
+
+	BUG_ON(!mutex_is_locked(&us_ibdev->usdev_lock));
+
+	list_for_each_entry(ctx, &us_ibdev->ctx_list, link) {
+		list_for_each_entry(qp_grp, &ctx->qp_grp_list, link) {
+			cur_state = qp_grp->state;
+			if (cur_state == IB_QPS_INIT ||
+				cur_state == IB_QPS_RTR ||
+				cur_state == IB_QPS_RTS) {
+				status = usnic_ib_qp_grp_modify(qp_grp,
+								IB_QPS_ERR,
+								NULL);
+				if (status) {
+					usnic_err("Failed to transistion qp grp %u from %s to %s\n",
+						qp_grp->grp_id,
+						usnic_ib_qp_grp_state_to_string
+						(cur_state),
+						usnic_ib_qp_grp_state_to_string
+						(IB_QPS_ERR));
+				}
+			}
+		}
+	}
+}
+
+static void usnic_ib_handle_usdev_event(struct usnic_ib_dev *us_ibdev,
+					unsigned long event)
+{
+	struct net_device *netdev;
+	struct ib_event ib_event;
+
+	memset(&ib_event, 0, sizeof(ib_event));
+
+	mutex_lock(&us_ibdev->usdev_lock);
+	netdev = us_ibdev->netdev;
+	switch (event) {
+	case NETDEV_REBOOT:
+		usnic_info("PF Reset on %s\n", us_ibdev->ib_dev.name);
+		usnic_ib_qp_grp_modify_active_to_err(us_ibdev);
+		ib_event.event = IB_EVENT_PORT_ERR;
+		ib_event.device = &us_ibdev->ib_dev;
+		ib_event.element.port_num = 1;
+		ib_dispatch_event(&ib_event);
+		break;
+	case NETDEV_UP:
+	case NETDEV_DOWN:
+	case NETDEV_CHANGE:
+		if (!us_ibdev->ufdev->link_up &&
+				netif_carrier_ok(netdev)) {
+			usnic_fwd_carrier_up(us_ibdev->ufdev);
+			usnic_info("Link UP on %s\n", us_ibdev->ib_dev.name);
+			ib_event.event = IB_EVENT_PORT_ACTIVE;
+			ib_event.device = &us_ibdev->ib_dev;
+			ib_event.element.port_num = 1;
+			ib_dispatch_event(&ib_event);
+		} else if (us_ibdev->ufdev->link_up &&
+				!netif_carrier_ok(netdev)) {
+			usnic_fwd_carrier_down(us_ibdev->ufdev);
+			usnic_info("Link DOWN on %s\n", us_ibdev->ib_dev.name);
+			usnic_ib_qp_grp_modify_active_to_err(us_ibdev);
+			ib_event.event = IB_EVENT_PORT_ERR;
+			ib_event.device = &us_ibdev->ib_dev;
+			ib_event.element.port_num = 1;
+			ib_dispatch_event(&ib_event);
+		} else {
+			usnic_dbg("Ignoring %s on %s\n",
+					usnic_ib_netdev_event_to_string(event),
+					us_ibdev->ib_dev.name);
+		}
+		break;
+	case NETDEV_CHANGEADDR:
+		if (!memcmp(us_ibdev->ufdev->mac, netdev->dev_addr,
+				sizeof(us_ibdev->ufdev->mac))) {
+			usnic_dbg("Ignoring addr change on %s\n",
+					us_ibdev->ib_dev.name);
+		} else {
+			usnic_info(" %s old mac: %pM new mac: %pM\n",
+					us_ibdev->ib_dev.name,
+					us_ibdev->ufdev->mac,
+					netdev->dev_addr);
+			usnic_fwd_set_mac(us_ibdev->ufdev, netdev->dev_addr);
+			usnic_ib_qp_grp_modify_active_to_err(us_ibdev);
+			ib_event.event = IB_EVENT_GID_CHANGE;
+			ib_event.device = &us_ibdev->ib_dev;
+			ib_event.element.port_num = 1;
+			ib_dispatch_event(&ib_event);
+		}
+
+		break;
+	case NETDEV_CHANGEMTU:
+		if (us_ibdev->ufdev->mtu != netdev->mtu) {
+			usnic_info("MTU Change on %s old: %u new: %u\n",
+					us_ibdev->ib_dev.name,
+					us_ibdev->ufdev->mtu, netdev->mtu);
+			usnic_fwd_set_mtu(us_ibdev->ufdev, netdev->mtu);
+			usnic_ib_qp_grp_modify_active_to_err(us_ibdev);
+		} else {
+			usnic_dbg("Ignoring MTU change on %s\n",
+					us_ibdev->ib_dev.name);
+		}
+		break;
+	default:
+		usnic_dbg("Ignoring event %s on %s",
+				usnic_ib_netdev_event_to_string(event),
+				us_ibdev->ib_dev.name);
+	}
+	mutex_unlock(&us_ibdev->usdev_lock);
+}
+
+static int usnic_ib_netdevice_event(struct notifier_block *notifier,
+					unsigned long event, void *ptr)
+{
+	struct usnic_ib_dev *us_ibdev;
+
+	struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
+
+	mutex_lock(&usnic_ib_ibdev_list_lock);
+	list_for_each_entry(us_ibdev, &usnic_ib_ibdev_list, ib_dev_link) {
+		if (us_ibdev->netdev == netdev) {
+			usnic_ib_handle_usdev_event(us_ibdev, event);
+			break;
+		}
+	}
+	mutex_unlock(&usnic_ib_ibdev_list_lock);
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block usnic_ib_netdevice_notifier = {
+	.notifier_call = usnic_ib_netdevice_event
+};
+/* End of netdev section */
+
+/* Start of inet section */
+static int usnic_ib_handle_inet_event(struct usnic_ib_dev *us_ibdev,
+					unsigned long event, void *ptr)
+{
+	struct in_ifaddr *ifa = ptr;
+	struct ib_event ib_event;
+
+	mutex_lock(&us_ibdev->usdev_lock);
+
+	switch (event) {
+	case NETDEV_DOWN:
+		usnic_info("%s via ip notifiers",
+				usnic_ib_netdev_event_to_string(event));
+		usnic_fwd_del_ipaddr(us_ibdev->ufdev);
+		usnic_ib_qp_grp_modify_active_to_err(us_ibdev);
+		ib_event.event = IB_EVENT_GID_CHANGE;
+		ib_event.device = &us_ibdev->ib_dev;
+		ib_event.element.port_num = 1;
+		ib_dispatch_event(&ib_event);
+		break;
+	case NETDEV_UP:
+		usnic_fwd_add_ipaddr(us_ibdev->ufdev, ifa->ifa_address);
+		usnic_info("%s via ip notifiers: ip %pI4",
+				usnic_ib_netdev_event_to_string(event),
+				&us_ibdev->ufdev->inaddr);
+		ib_event.event = IB_EVENT_GID_CHANGE;
+		ib_event.device = &us_ibdev->ib_dev;
+		ib_event.element.port_num = 1;
+		ib_dispatch_event(&ib_event);
+		break;
+	default:
+		usnic_info("Ignoring event %s on %s",
+				usnic_ib_netdev_event_to_string(event),
+				us_ibdev->ib_dev.name);
+	}
+	mutex_unlock(&us_ibdev->usdev_lock);
+
+	return NOTIFY_DONE;
+}
+
+static int usnic_ib_inetaddr_event(struct notifier_block *notifier,
+					unsigned long event, void *ptr)
+{
+	struct usnic_ib_dev *us_ibdev;
+	struct in_ifaddr *ifa = ptr;
+	struct net_device *netdev = ifa->ifa_dev->dev;
+
+	mutex_lock(&usnic_ib_ibdev_list_lock);
+	list_for_each_entry(us_ibdev, &usnic_ib_ibdev_list, ib_dev_link) {
+		if (us_ibdev->netdev == netdev) {
+			usnic_ib_handle_inet_event(us_ibdev, event, ptr);
+			break;
+		}
+	}
+	mutex_unlock(&usnic_ib_ibdev_list_lock);
+
+	return NOTIFY_DONE;
+}
+static struct notifier_block usnic_ib_inetaddr_notifier = {
+	.notifier_call = usnic_ib_inetaddr_event
+};
+/* End of inet section*/
+
+/* Start of PF discovery section */
+static void *usnic_ib_device_add(struct pci_dev *dev)
+{
+	struct usnic_ib_dev *us_ibdev;
+	union ib_gid gid;
+	struct in_ifaddr *in;
+	struct net_device *netdev;
+
+	usnic_dbg("\n");
+	netdev = pci_get_drvdata(dev);
+
+	us_ibdev = (struct usnic_ib_dev *)ib_alloc_device(sizeof(*us_ibdev));
+	if (IS_ERR_OR_NULL(us_ibdev)) {
+		usnic_err("Device %s context alloc failed\n",
+				netdev_name(pci_get_drvdata(dev)));
+		return ERR_PTR(us_ibdev ? PTR_ERR(us_ibdev) : -EFAULT);
+	}
+
+	us_ibdev->ufdev = usnic_fwd_dev_alloc(dev);
+	if (IS_ERR_OR_NULL(us_ibdev->ufdev)) {
+		usnic_err("Failed to alloc ufdev for %s with err %ld\n",
+				pci_name(dev), PTR_ERR(us_ibdev->ufdev));
+		goto err_dealloc;
+	}
+
+	mutex_init(&us_ibdev->usdev_lock);
+	INIT_LIST_HEAD(&us_ibdev->vf_dev_list);
+	INIT_LIST_HEAD(&us_ibdev->ctx_list);
+
+	us_ibdev->pdev = dev;
+	us_ibdev->netdev = pci_get_drvdata(dev);
+	us_ibdev->ib_dev.owner = THIS_MODULE;
+	us_ibdev->ib_dev.node_type = RDMA_NODE_USNIC_UDP;
+	us_ibdev->ib_dev.phys_port_cnt = USNIC_IB_PORT_CNT;
+	us_ibdev->ib_dev.num_comp_vectors = USNIC_IB_NUM_COMP_VECTORS;
+	us_ibdev->ib_dev.dma_device = &dev->dev;
+	us_ibdev->ib_dev.uverbs_abi_ver = USNIC_UVERBS_ABI_VERSION;
+	strlcpy(us_ibdev->ib_dev.name, "usnic_%d", IB_DEVICE_NAME_MAX);
+
+	us_ibdev->ib_dev.uverbs_cmd_mask =
+		(1ull << IB_USER_VERBS_CMD_GET_CONTEXT) |
+		(1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) |
+		(1ull << IB_USER_VERBS_CMD_QUERY_PORT) |
+		(1ull << IB_USER_VERBS_CMD_ALLOC_PD) |
+		(1ull << IB_USER_VERBS_CMD_DEALLOC_PD) |
+		(1ull << IB_USER_VERBS_CMD_REG_MR) |
+		(1ull << IB_USER_VERBS_CMD_DEREG_MR) |
+		(1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
+		(1ull << IB_USER_VERBS_CMD_CREATE_CQ) |
+		(1ull << IB_USER_VERBS_CMD_DESTROY_CQ) |
+		(1ull << IB_USER_VERBS_CMD_CREATE_QP) |
+		(1ull << IB_USER_VERBS_CMD_MODIFY_QP) |
+		(1ull << IB_USER_VERBS_CMD_QUERY_QP) |
+		(1ull << IB_USER_VERBS_CMD_DESTROY_QP) |
+		(1ull << IB_USER_VERBS_CMD_ATTACH_MCAST) |
+		(1ull << IB_USER_VERBS_CMD_DETACH_MCAST) |
+		(1ull << IB_USER_VERBS_CMD_OPEN_QP);
+
+	us_ibdev->ib_dev.query_device = usnic_ib_query_device;
+	us_ibdev->ib_dev.query_port = usnic_ib_query_port;
+	us_ibdev->ib_dev.query_pkey = usnic_ib_query_pkey;
+	us_ibdev->ib_dev.query_gid = usnic_ib_query_gid;
+	us_ibdev->ib_dev.get_link_layer = usnic_ib_port_link_layer;
+	us_ibdev->ib_dev.alloc_pd = usnic_ib_alloc_pd;
+	us_ibdev->ib_dev.dealloc_pd = usnic_ib_dealloc_pd;
+	us_ibdev->ib_dev.create_qp = usnic_ib_create_qp;
+	us_ibdev->ib_dev.modify_qp = usnic_ib_modify_qp;
+	us_ibdev->ib_dev.query_qp = usnic_ib_query_qp;
+	us_ibdev->ib_dev.destroy_qp = usnic_ib_destroy_qp;
+	us_ibdev->ib_dev.create_cq = usnic_ib_create_cq;
+	us_ibdev->ib_dev.destroy_cq = usnic_ib_destroy_cq;
+	us_ibdev->ib_dev.reg_user_mr = usnic_ib_reg_mr;
+	us_ibdev->ib_dev.dereg_mr = usnic_ib_dereg_mr;
+	us_ibdev->ib_dev.alloc_ucontext = usnic_ib_alloc_ucontext;
+	us_ibdev->ib_dev.dealloc_ucontext = usnic_ib_dealloc_ucontext;
+	us_ibdev->ib_dev.mmap = usnic_ib_mmap;
+	us_ibdev->ib_dev.create_ah = usnic_ib_create_ah;
+	us_ibdev->ib_dev.destroy_ah = usnic_ib_destroy_ah;
+	us_ibdev->ib_dev.post_send = usnic_ib_post_send;
+	us_ibdev->ib_dev.post_recv = usnic_ib_post_recv;
+	us_ibdev->ib_dev.poll_cq = usnic_ib_poll_cq;
+	us_ibdev->ib_dev.req_notify_cq = usnic_ib_req_notify_cq;
+	us_ibdev->ib_dev.get_dma_mr = usnic_ib_get_dma_mr;
+
+
+	if (ib_register_device(&us_ibdev->ib_dev, NULL))
+		goto err_fwd_dealloc;
+
+	usnic_fwd_set_mtu(us_ibdev->ufdev, us_ibdev->netdev->mtu);
+	usnic_fwd_set_mac(us_ibdev->ufdev, us_ibdev->netdev->dev_addr);
+	if (netif_carrier_ok(us_ibdev->netdev))
+		usnic_fwd_carrier_up(us_ibdev->ufdev);
+
+	in = ((struct in_device *)(netdev->ip_ptr))->ifa_list;
+	if (in != NULL)
+		usnic_fwd_add_ipaddr(us_ibdev->ufdev, in->ifa_address);
+
+	usnic_mac_ip_to_gid(us_ibdev->netdev->perm_addr,
+				us_ibdev->ufdev->inaddr, &gid.raw[0]);
+	memcpy(&us_ibdev->ib_dev.node_guid, &gid.global.interface_id,
+		sizeof(gid.global.interface_id));
+	kref_init(&us_ibdev->vf_cnt);
+
+	usnic_info("Added ibdev: %s netdev: %s with mac %pM Link: %u MTU: %u\n",
+			us_ibdev->ib_dev.name, netdev_name(us_ibdev->netdev),
+			us_ibdev->ufdev->mac, us_ibdev->ufdev->link_up,
+			us_ibdev->ufdev->mtu);
+	return us_ibdev;
+
+err_fwd_dealloc:
+	usnic_fwd_dev_free(us_ibdev->ufdev);
+err_dealloc:
+	usnic_err("failed -- deallocing device\n");
+	ib_dealloc_device(&us_ibdev->ib_dev);
+	return NULL;
+}
+
+static void usnic_ib_device_remove(struct usnic_ib_dev *us_ibdev)
+{
+	usnic_info("Unregistering %s\n", us_ibdev->ib_dev.name);
+	usnic_ib_sysfs_unregister_usdev(us_ibdev);
+	usnic_fwd_dev_free(us_ibdev->ufdev);
+	ib_unregister_device(&us_ibdev->ib_dev);
+	ib_dealloc_device(&us_ibdev->ib_dev);
+}
+
+static void usnic_ib_undiscover_pf(struct kref *kref)
+{
+	struct usnic_ib_dev *us_ibdev, *tmp;
+	struct pci_dev *dev;
+	bool found = false;
+
+	dev = container_of(kref, struct usnic_ib_dev, vf_cnt)->pdev;
+	mutex_lock(&usnic_ib_ibdev_list_lock);
+	list_for_each_entry_safe(us_ibdev, tmp,
+				&usnic_ib_ibdev_list, ib_dev_link) {
+		if (us_ibdev->pdev == dev) {
+			list_del(&us_ibdev->ib_dev_link);
+			usnic_ib_device_remove(us_ibdev);
+			found = true;
+			break;
+		}
+	}
+
+	WARN(!found, "Failed to remove PF %s\n", pci_name(dev));
+
+	mutex_unlock(&usnic_ib_ibdev_list_lock);
+}
+
+static struct usnic_ib_dev *usnic_ib_discover_pf(struct usnic_vnic *vnic)
+{
+	struct usnic_ib_dev *us_ibdev;
+	struct pci_dev *parent_pci, *vf_pci;
+	int err;
+
+	vf_pci = usnic_vnic_get_pdev(vnic);
+	parent_pci = pci_physfn(vf_pci);
+
+	BUG_ON(!parent_pci);
+
+	mutex_lock(&usnic_ib_ibdev_list_lock);
+	list_for_each_entry(us_ibdev, &usnic_ib_ibdev_list, ib_dev_link) {
+		if (us_ibdev->pdev == parent_pci) {
+			kref_get(&us_ibdev->vf_cnt);
+			goto out;
+		}
+	}
+
+	us_ibdev = usnic_ib_device_add(parent_pci);
+	if (IS_ERR_OR_NULL(us_ibdev)) {
+		us_ibdev = us_ibdev ? us_ibdev : ERR_PTR(-EFAULT);
+		goto out;
+	}
+
+	err = usnic_ib_sysfs_register_usdev(us_ibdev);
+	if (err) {
+		usnic_ib_device_remove(us_ibdev);
+		us_ibdev = ERR_PTR(err);
+		goto out;
+	}
+
+	list_add(&us_ibdev->ib_dev_link, &usnic_ib_ibdev_list);
+out:
+	mutex_unlock(&usnic_ib_ibdev_list_lock);
+	return us_ibdev;
+}
+/* End of PF discovery section */
+
+/* Start of PCI section */
+
+static const struct pci_device_id usnic_ib_pci_ids[] = {
+	{PCI_DEVICE(PCI_VENDOR_ID_CISCO, PCI_DEVICE_ID_CISCO_VIC_USPACE_NIC)},
+	{0,}
+};
+
+static int usnic_ib_pci_probe(struct pci_dev *pdev,
+				const struct pci_device_id *id)
+{
+	int err;
+	struct usnic_ib_dev *pf;
+	struct usnic_ib_vf *vf;
+	enum usnic_vnic_res_type res_type;
+
+	vf = kzalloc(sizeof(*vf), GFP_KERNEL);
+	if (!vf)
+		return -ENOMEM;
+
+	err = pci_enable_device(pdev);
+	if (err) {
+		usnic_err("Failed to enable %s with err %d\n",
+				pci_name(pdev), err);
+		goto out_clean_vf;
+	}
+
+	err = pci_request_regions(pdev, DRV_NAME);
+	if (err) {
+		usnic_err("Failed to request region for %s with err %d\n",
+				pci_name(pdev), err);
+		goto out_disable_device;
+	}
+
+	pci_set_master(pdev);
+	pci_set_drvdata(pdev, vf);
+
+	vf->vnic = usnic_vnic_alloc(pdev);
+	if (IS_ERR_OR_NULL(vf->vnic)) {
+		err = vf->vnic ? PTR_ERR(vf->vnic) : -ENOMEM;
+		usnic_err("Failed to alloc vnic for %s with err %d\n",
+				pci_name(pdev), err);
+		goto out_release_regions;
+	}
+
+	pf = usnic_ib_discover_pf(vf->vnic);
+	if (IS_ERR_OR_NULL(pf)) {
+		usnic_err("Failed to discover pf of vnic %s with err%ld\n",
+				pci_name(pdev), PTR_ERR(pf));
+		err = pf ? PTR_ERR(pf) : -EFAULT;
+		goto out_clean_vnic;
+	}
+
+	vf->pf = pf;
+	spin_lock_init(&vf->lock);
+	mutex_lock(&pf->usdev_lock);
+	list_add_tail(&vf->link, &pf->vf_dev_list);
+	/*
+	 * Save max settings (will be same for each VF, easier to re-write than
+	 * to say "if (!set) { set_values(); set=1; }
+	 */
+	for (res_type = USNIC_VNIC_RES_TYPE_EOL+1;
+			res_type < USNIC_VNIC_RES_TYPE_MAX;
+			res_type++) {
+		pf->vf_res_cnt[res_type] = usnic_vnic_res_cnt(vf->vnic,
+								res_type);
+	}
+
+	mutex_unlock(&pf->usdev_lock);
+
+	usnic_info("Registering usnic VF %s into PF %s\n", pci_name(pdev),
+			pf->ib_dev.name);
+	usnic_ib_log_vf(vf);
+	return 0;
+
+out_clean_vnic:
+	usnic_vnic_free(vf->vnic);
+out_release_regions:
+	pci_set_drvdata(pdev, NULL);
+	pci_clear_master(pdev);
+	pci_release_regions(pdev);
+out_disable_device:
+	pci_disable_device(pdev);
+out_clean_vf:
+	kfree(vf);
+	return err;
+}
+
+static void usnic_ib_pci_remove(struct pci_dev *pdev)
+{
+	struct usnic_ib_vf *vf = pci_get_drvdata(pdev);
+	struct usnic_ib_dev *pf = vf->pf;
+
+	mutex_lock(&pf->usdev_lock);
+	list_del(&vf->link);
+	mutex_unlock(&pf->usdev_lock);
+
+	kref_put(&pf->vf_cnt, usnic_ib_undiscover_pf);
+	usnic_vnic_free(vf->vnic);
+	pci_set_drvdata(pdev, NULL);
+	pci_clear_master(pdev);
+	pci_release_regions(pdev);
+	pci_disable_device(pdev);
+	kfree(vf);
+
+	usnic_info("Removed VF %s\n", pci_name(pdev));
+}
+
+/* PCI driver entry points */
+static struct pci_driver usnic_ib_pci_driver = {
+	.name = DRV_NAME,
+	.id_table = usnic_ib_pci_ids,
+	.probe = usnic_ib_pci_probe,
+	.remove = usnic_ib_pci_remove,
+};
+/* End of PCI section */
+
+/* Start of module section */
+static int __init usnic_ib_init(void)
+{
+	int err;
+
+	printk_once(KERN_INFO "%s", usnic_version);
+
+	err = usnic_uiom_init(DRV_NAME);
+	if (err) {
+		usnic_err("Unable to initalize umem with err %d\n", err);
+		return err;
+	}
+
+	if (pci_register_driver(&usnic_ib_pci_driver)) {
+		usnic_err("Unable to register with PCI\n");
+		goto out_umem_fini;
+	}
+
+	err = register_netdevice_notifier(&usnic_ib_netdevice_notifier);
+	if (err) {
+		usnic_err("Failed to register netdev notifier\n");
+		goto out_pci_unreg;
+	}
+
+	err = register_inetaddr_notifier(&usnic_ib_inetaddr_notifier);
+	if (err) {
+		usnic_err("Failed to register inet addr notifier\n");
+		goto out_unreg_netdev_notifier;
+	}
+
+	err = usnic_transport_init();
+	if (err) {
+		usnic_err("Failed to initialize transport\n");
+		goto out_unreg_inetaddr_notifier;
+	}
+
+	usnic_debugfs_init();
+
+	return 0;
+
+out_unreg_inetaddr_notifier:
+	unregister_inetaddr_notifier(&usnic_ib_inetaddr_notifier);
+out_unreg_netdev_notifier:
+	unregister_netdevice_notifier(&usnic_ib_netdevice_notifier);
+out_pci_unreg:
+	pci_unregister_driver(&usnic_ib_pci_driver);
+out_umem_fini:
+	usnic_uiom_fini();
+
+	return err;
+}
+
+static void __exit usnic_ib_destroy(void)
+{
+	usnic_dbg("\n");
+	usnic_debugfs_exit();
+	usnic_transport_fini();
+	unregister_inetaddr_notifier(&usnic_ib_inetaddr_notifier);
+	unregister_netdevice_notifier(&usnic_ib_netdevice_notifier);
+	pci_unregister_driver(&usnic_ib_pci_driver);
+	usnic_uiom_fini();
+}
+
+MODULE_DESCRIPTION("Cisco VIC (usNIC) Verbs Driver");
+MODULE_AUTHOR("Upinder Malhi <umalhi@cisco.com>");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_VERSION(DRV_VERSION);
+module_param(usnic_log_lvl, uint, S_IRUGO | S_IWUSR);
+module_param(usnic_ib_share_vf, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(usnic_log_lvl, " Off=0, Err=1, Info=2, Debug=3");
+MODULE_PARM_DESC(usnic_ib_share_vf, "Off=0, On=1 VF sharing amongst QPs");
+MODULE_DEVICE_TABLE(pci, usnic_ib_pci_ids);
+
+module_init(usnic_ib_init);
+module_exit(usnic_ib_destroy);
+/* End of module section */
diff --git a/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.c b/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.c
new file mode 100644
index 0000000..db3588d
--- /dev/null
+++ b/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.c
@@ -0,0 +1,761 @@
+/*
+ * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved.
+ *
+ * This program is free software; you may redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/bug.h>
+#include <linux/errno.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+
+#include "usnic_log.h"
+#include "usnic_vnic.h"
+#include "usnic_fwd.h"
+#include "usnic_uiom.h"
+#include "usnic_debugfs.h"
+#include "usnic_ib_qp_grp.h"
+#include "usnic_ib_sysfs.h"
+#include "usnic_transport.h"
+
+#define DFLT_RQ_IDX	0
+
+const char *usnic_ib_qp_grp_state_to_string(enum ib_qp_state state)
+{
+	switch (state) {
+	case IB_QPS_RESET:
+		return "Rst";
+	case IB_QPS_INIT:
+		return "Init";
+	case IB_QPS_RTR:
+		return "RTR";
+	case IB_QPS_RTS:
+		return "RTS";
+	case IB_QPS_SQD:
+		return "SQD";
+	case IB_QPS_SQE:
+		return "SQE";
+	case IB_QPS_ERR:
+		return "ERR";
+	default:
+		return "UNKOWN STATE";
+
+	}
+}
+
+int usnic_ib_qp_grp_dump_hdr(char *buf, int buf_sz)
+{
+	return scnprintf(buf, buf_sz, "|QPN\t|State\t|PID\t|VF Idx\t|Fil ID");
+}
+
+int usnic_ib_qp_grp_dump_rows(void *obj, char *buf, int buf_sz)
+{
+	struct usnic_ib_qp_grp *qp_grp = obj;
+	struct usnic_ib_qp_grp_flow *default_flow;
+	if (obj) {
+		default_flow = list_first_entry(&qp_grp->flows_lst,
+					struct usnic_ib_qp_grp_flow, link);
+		return scnprintf(buf, buf_sz, "|%d\t|%s\t|%d\t|%hu\t|%d",
+					qp_grp->ibqp.qp_num,
+					usnic_ib_qp_grp_state_to_string(
+							qp_grp->state),
+					qp_grp->owner_pid,
+					usnic_vnic_get_index(qp_grp->vf->vnic),
+					default_flow->flow->flow_id);
+	} else {
+		return scnprintf(buf, buf_sz, "|N/A\t|N/A\t|N/A\t|N/A\t|N/A");
+	}
+}
+
+static struct usnic_vnic_res_chunk *
+get_qp_res_chunk(struct usnic_ib_qp_grp *qp_grp)
+{
+	lockdep_assert_held(&qp_grp->lock);
+	/*
+	 * The QP res chunk, used to derive qp indices,
+	 * are just indices of the RQs
+	 */
+	return usnic_ib_qp_grp_get_chunk(qp_grp, USNIC_VNIC_RES_TYPE_RQ);
+}
+
+static int enable_qp_grp(struct usnic_ib_qp_grp *qp_grp)
+{
+
+	int status;
+	int i, vnic_idx;
+	struct usnic_vnic_res_chunk *res_chunk;
+	struct usnic_vnic_res *res;
+
+	lockdep_assert_held(&qp_grp->lock);
+
+	vnic_idx = usnic_vnic_get_index(qp_grp->vf->vnic);
+
+	res_chunk = get_qp_res_chunk(qp_grp);
+	if (IS_ERR_OR_NULL(res_chunk)) {
+		usnic_err("Unable to get qp res with err %ld\n",
+				PTR_ERR(res_chunk));
+		return res_chunk ? PTR_ERR(res_chunk) : -ENOMEM;
+	}
+
+	for (i = 0; i < res_chunk->cnt; i++) {
+		res = res_chunk->res[i];
+		status = usnic_fwd_enable_qp(qp_grp->ufdev, vnic_idx,
+						res->vnic_idx);
+		if (status) {
+			usnic_err("Failed to enable qp %d of %s:%d\n with err %d\n",
+					res->vnic_idx, qp_grp->ufdev->name,
+					vnic_idx, status);
+			goto out_err;
+		}
+	}
+
+	return 0;
+
+out_err:
+	for (i--; i >= 0; i--) {
+		res = res_chunk->res[i];
+		usnic_fwd_disable_qp(qp_grp->ufdev, vnic_idx,
+					res->vnic_idx);
+	}
+
+	return status;
+}
+
+static int disable_qp_grp(struct usnic_ib_qp_grp *qp_grp)
+{
+	int i, vnic_idx;
+	struct usnic_vnic_res_chunk *res_chunk;
+	struct usnic_vnic_res *res;
+	int status = 0;
+
+	lockdep_assert_held(&qp_grp->lock);
+	vnic_idx = usnic_vnic_get_index(qp_grp->vf->vnic);
+
+	res_chunk = get_qp_res_chunk(qp_grp);
+	if (IS_ERR_OR_NULL(res_chunk)) {
+		usnic_err("Unable to get qp res with err %ld\n",
+			PTR_ERR(res_chunk));
+		return res_chunk ? PTR_ERR(res_chunk) : -ENOMEM;
+	}
+
+	for (i = 0; i < res_chunk->cnt; i++) {
+		res = res_chunk->res[i];
+		status = usnic_fwd_disable_qp(qp_grp->ufdev, vnic_idx,
+						res->vnic_idx);
+		if (status) {
+			usnic_err("Failed to disable rq %d of %s:%d\n with err %d\n",
+					res->vnic_idx,
+					qp_grp->ufdev->name,
+					vnic_idx, status);
+		}
+	}
+
+	return status;
+
+}
+
+static int init_filter_action(struct usnic_ib_qp_grp *qp_grp,
+				struct usnic_filter_action *uaction)
+{
+	struct usnic_vnic_res_chunk *res_chunk;
+
+	res_chunk = usnic_ib_qp_grp_get_chunk(qp_grp, USNIC_VNIC_RES_TYPE_RQ);
+	if (IS_ERR_OR_NULL(res_chunk)) {
+		usnic_err("Unable to get %s with err %ld\n",
+			usnic_vnic_res_type_to_str(USNIC_VNIC_RES_TYPE_RQ),
+			PTR_ERR(res_chunk));
+		return res_chunk ? PTR_ERR(res_chunk) : -ENOMEM;
+	}
+
+	uaction->vnic_idx = usnic_vnic_get_index(qp_grp->vf->vnic);
+	uaction->action.type = FILTER_ACTION_RQ_STEERING;
+	uaction->action.u.rq_idx = res_chunk->res[DFLT_RQ_IDX]->vnic_idx;
+
+	return 0;
+}
+
+static struct usnic_ib_qp_grp_flow*
+create_roce_custom_flow(struct usnic_ib_qp_grp *qp_grp,
+			struct usnic_transport_spec *trans_spec)
+{
+	uint16_t port_num;
+	int err;
+	struct filter filter;
+	struct usnic_filter_action uaction;
+	struct usnic_ib_qp_grp_flow *qp_flow;
+	struct usnic_fwd_flow *flow;
+	enum usnic_transport_type trans_type;
+
+	trans_type = trans_spec->trans_type;
+	port_num = trans_spec->usnic_roce.port_num;
+
+	/* Reserve Port */
+	port_num = usnic_transport_rsrv_port(trans_type, port_num);
+	if (port_num == 0)
+		return ERR_PTR(-EINVAL);
+
+	/* Create Flow */
+	usnic_fwd_init_usnic_filter(&filter, port_num);
+	err = init_filter_action(qp_grp, &uaction);
+	if (err)
+		goto out_unreserve_port;
+
+	flow = usnic_fwd_alloc_flow(qp_grp->ufdev, &filter, &uaction);
+	if (IS_ERR_OR_NULL(flow)) {
+		usnic_err("Unable to alloc flow failed with err %ld\n",
+				PTR_ERR(flow));
+		err = flow ? PTR_ERR(flow) : -EFAULT;
+		goto out_unreserve_port;
+	}
+
+	/* Create Flow Handle */
+	qp_flow = kzalloc(sizeof(*qp_flow), GFP_ATOMIC);
+	if (IS_ERR_OR_NULL(qp_flow)) {
+		err = qp_flow ? PTR_ERR(qp_flow) : -ENOMEM;
+		goto out_dealloc_flow;
+	}
+	qp_flow->flow = flow;
+	qp_flow->trans_type = trans_type;
+	qp_flow->usnic_roce.port_num = port_num;
+	qp_flow->qp_grp = qp_grp;
+	return qp_flow;
+
+out_dealloc_flow:
+	usnic_fwd_dealloc_flow(flow);
+out_unreserve_port:
+	usnic_transport_unrsrv_port(trans_type, port_num);
+	return ERR_PTR(err);
+}
+
+static void release_roce_custom_flow(struct usnic_ib_qp_grp_flow *qp_flow)
+{
+	usnic_fwd_dealloc_flow(qp_flow->flow);
+	usnic_transport_unrsrv_port(qp_flow->trans_type,
+					qp_flow->usnic_roce.port_num);
+	kfree(qp_flow);
+}
+
+static struct usnic_ib_qp_grp_flow*
+create_udp_flow(struct usnic_ib_qp_grp *qp_grp,
+		struct usnic_transport_spec *trans_spec)
+{
+	struct socket *sock;
+	int sock_fd;
+	int err;
+	struct filter filter;
+	struct usnic_filter_action uaction;
+	struct usnic_ib_qp_grp_flow *qp_flow;
+	struct usnic_fwd_flow *flow;
+	enum usnic_transport_type trans_type;
+	uint32_t addr;
+	uint16_t port_num;
+	int proto;
+
+	trans_type = trans_spec->trans_type;
+	sock_fd = trans_spec->udp.sock_fd;
+
+	/* Get and check socket */
+	sock = usnic_transport_get_socket(sock_fd);
+	if (IS_ERR_OR_NULL(sock))
+		return ERR_CAST(sock);
+
+	err = usnic_transport_sock_get_addr(sock, &proto, &addr, &port_num);
+	if (err)
+		goto out_put_sock;
+
+	if (proto != IPPROTO_UDP) {
+		usnic_err("Protocol for fd %d is not UDP", sock_fd);
+		err = -EPERM;
+		goto out_put_sock;
+	}
+
+	/* Create flow */
+	usnic_fwd_init_udp_filter(&filter, addr, port_num);
+	err = init_filter_action(qp_grp, &uaction);
+	if (err)
+		goto out_put_sock;
+
+	flow = usnic_fwd_alloc_flow(qp_grp->ufdev, &filter, &uaction);
+	if (IS_ERR_OR_NULL(flow)) {
+		usnic_err("Unable to alloc flow failed with err %ld\n",
+				PTR_ERR(flow));
+		err = flow ? PTR_ERR(flow) : -EFAULT;
+		goto out_put_sock;
+	}
+
+	/* Create qp_flow */
+	qp_flow = kzalloc(sizeof(*qp_flow), GFP_ATOMIC);
+	if (IS_ERR_OR_NULL(qp_flow)) {
+		err = qp_flow ? PTR_ERR(qp_flow) : -ENOMEM;
+		goto out_dealloc_flow;
+	}
+	qp_flow->flow = flow;
+	qp_flow->trans_type = trans_type;
+	qp_flow->udp.sock = sock;
+	qp_flow->qp_grp = qp_grp;
+	return qp_flow;
+
+out_dealloc_flow:
+	usnic_fwd_dealloc_flow(flow);
+out_put_sock:
+	usnic_transport_put_socket(sock);
+	return ERR_PTR(err);
+}
+
+static void release_udp_flow(struct usnic_ib_qp_grp_flow *qp_flow)
+{
+	usnic_fwd_dealloc_flow(qp_flow->flow);
+	usnic_transport_put_socket(qp_flow->udp.sock);
+	kfree(qp_flow);
+}
+
+static struct usnic_ib_qp_grp_flow*
+create_and_add_flow(struct usnic_ib_qp_grp *qp_grp,
+			struct usnic_transport_spec *trans_spec)
+{
+	struct usnic_ib_qp_grp_flow *qp_flow;
+	enum usnic_transport_type trans_type;
+
+	trans_type = trans_spec->trans_type;
+	switch (trans_type) {
+	case USNIC_TRANSPORT_ROCE_CUSTOM:
+		qp_flow = create_roce_custom_flow(qp_grp, trans_spec);
+		break;
+	case USNIC_TRANSPORT_IPV4_UDP:
+		qp_flow = create_udp_flow(qp_grp, trans_spec);
+		break;
+	default:
+		usnic_err("Unsupported transport %u\n",
+				trans_spec->trans_type);
+		return ERR_PTR(-EINVAL);
+	}
+
+	if (!IS_ERR_OR_NULL(qp_flow)) {
+		list_add_tail(&qp_flow->link, &qp_grp->flows_lst);
+		usnic_debugfs_flow_add(qp_flow);
+	}
+
+
+	return qp_flow;
+}
+
+static void release_and_remove_flow(struct usnic_ib_qp_grp_flow *qp_flow)
+{
+	usnic_debugfs_flow_remove(qp_flow);
+	list_del(&qp_flow->link);
+
+	switch (qp_flow->trans_type) {
+	case USNIC_TRANSPORT_ROCE_CUSTOM:
+		release_roce_custom_flow(qp_flow);
+		break;
+	case USNIC_TRANSPORT_IPV4_UDP:
+		release_udp_flow(qp_flow);
+		break;
+	default:
+		WARN(1, "Unsupported transport %u\n",
+				qp_flow->trans_type);
+		break;
+	}
+}
+
+static void release_and_remove_all_flows(struct usnic_ib_qp_grp *qp_grp)
+{
+	struct usnic_ib_qp_grp_flow *qp_flow, *tmp;
+	list_for_each_entry_safe(qp_flow, tmp, &qp_grp->flows_lst, link)
+		release_and_remove_flow(qp_flow);
+}
+
+int usnic_ib_qp_grp_modify(struct usnic_ib_qp_grp *qp_grp,
+				enum ib_qp_state new_state,
+				void *data)
+{
+	int status = 0;
+	int vnic_idx;
+	struct ib_event ib_event;
+	enum ib_qp_state old_state;
+	struct usnic_transport_spec *trans_spec;
+	struct usnic_ib_qp_grp_flow *qp_flow;
+
+	old_state = qp_grp->state;
+	vnic_idx = usnic_vnic_get_index(qp_grp->vf->vnic);
+	trans_spec = (struct usnic_transport_spec *) data;
+
+	spin_lock(&qp_grp->lock);
+	switch (new_state) {
+	case IB_QPS_RESET:
+		switch (old_state) {
+		case IB_QPS_RESET:
+			/* NO-OP */
+			break;
+		case IB_QPS_INIT:
+			release_and_remove_all_flows(qp_grp);
+			status = 0;
+			break;
+		case IB_QPS_RTR:
+		case IB_QPS_RTS:
+		case IB_QPS_ERR:
+			status = disable_qp_grp(qp_grp);
+			release_and_remove_all_flows(qp_grp);
+			break;
+		default:
+			status = -EINVAL;
+		}
+		break;
+	case IB_QPS_INIT:
+		switch (old_state) {
+		case IB_QPS_RESET:
+			if (trans_spec) {
+				qp_flow = create_and_add_flow(qp_grp,
+								trans_spec);
+				if (IS_ERR_OR_NULL(qp_flow)) {
+					status = qp_flow ? PTR_ERR(qp_flow) : -EFAULT;
+					break;
+				}
+			} else {
+				/*
+				 * Optional to specify filters.
+				 */
+				status = 0;
+			}
+			break;
+		case IB_QPS_INIT:
+			if (trans_spec) {
+				qp_flow = create_and_add_flow(qp_grp,
+								trans_spec);
+				if (IS_ERR_OR_NULL(qp_flow)) {
+					status = qp_flow ? PTR_ERR(qp_flow) : -EFAULT;
+					break;
+				}
+			} else {
+				/*
+				 * Doesn't make sense to go into INIT state
+				 * from INIT state w/o adding filters.
+				 */
+				status = -EINVAL;
+			}
+			break;
+		case IB_QPS_RTR:
+			status = disable_qp_grp(qp_grp);
+			break;
+		case IB_QPS_RTS:
+			status = disable_qp_grp(qp_grp);
+			break;
+		default:
+			status = -EINVAL;
+		}
+		break;
+	case IB_QPS_RTR:
+		switch (old_state) {
+		case IB_QPS_INIT:
+			status = enable_qp_grp(qp_grp);
+			break;
+		default:
+			status = -EINVAL;
+		}
+		break;
+	case IB_QPS_RTS:
+		switch (old_state) {
+		case IB_QPS_RTR:
+			/* NO-OP FOR NOW */
+			break;
+		default:
+			status = -EINVAL;
+		}
+		break;
+	case IB_QPS_ERR:
+		ib_event.device = &qp_grp->vf->pf->ib_dev;
+		ib_event.element.qp = &qp_grp->ibqp;
+		ib_event.event = IB_EVENT_QP_FATAL;
+
+		switch (old_state) {
+		case IB_QPS_RESET:
+			qp_grp->ibqp.event_handler(&ib_event,
+					qp_grp->ibqp.qp_context);
+			break;
+		case IB_QPS_INIT:
+			release_and_remove_all_flows(qp_grp);
+			qp_grp->ibqp.event_handler(&ib_event,
+					qp_grp->ibqp.qp_context);
+			break;
+		case IB_QPS_RTR:
+		case IB_QPS_RTS:
+			status = disable_qp_grp(qp_grp);
+			release_and_remove_all_flows(qp_grp);
+			qp_grp->ibqp.event_handler(&ib_event,
+					qp_grp->ibqp.qp_context);
+			break;
+		default:
+			status = -EINVAL;
+		}
+		break;
+	default:
+		status = -EINVAL;
+	}
+	spin_unlock(&qp_grp->lock);
+
+	if (!status) {
+		qp_grp->state = new_state;
+		usnic_info("Transistioned %u from %s to %s",
+		qp_grp->grp_id,
+		usnic_ib_qp_grp_state_to_string(old_state),
+		usnic_ib_qp_grp_state_to_string(new_state));
+	} else {
+		usnic_err("Failed to transition %u from %s to %s",
+		qp_grp->grp_id,
+		usnic_ib_qp_grp_state_to_string(old_state),
+		usnic_ib_qp_grp_state_to_string(new_state));
+	}
+
+	return status;
+}
+
+static struct usnic_vnic_res_chunk**
+alloc_res_chunk_list(struct usnic_vnic *vnic,
+			struct usnic_vnic_res_spec *res_spec, void *owner_obj)
+{
+	enum usnic_vnic_res_type res_type;
+	struct usnic_vnic_res_chunk **res_chunk_list;
+	int err, i, res_cnt, res_lst_sz;
+
+	for (res_lst_sz = 0;
+		res_spec->resources[res_lst_sz].type != USNIC_VNIC_RES_TYPE_EOL;
+		res_lst_sz++) {
+		/* Do Nothing */
+	}
+
+	res_chunk_list = kzalloc(sizeof(*res_chunk_list)*(res_lst_sz+1),
+					GFP_ATOMIC);
+	if (!res_chunk_list)
+		return ERR_PTR(-ENOMEM);
+
+	for (i = 0; res_spec->resources[i].type != USNIC_VNIC_RES_TYPE_EOL;
+		i++) {
+		res_type = res_spec->resources[i].type;
+		res_cnt = res_spec->resources[i].cnt;
+
+		res_chunk_list[i] = usnic_vnic_get_resources(vnic, res_type,
+					res_cnt, owner_obj);
+		if (IS_ERR_OR_NULL(res_chunk_list[i])) {
+			err = res_chunk_list[i] ?
+					PTR_ERR(res_chunk_list[i]) : -ENOMEM;
+			usnic_err("Failed to get %s from %s with err %d\n",
+				usnic_vnic_res_type_to_str(res_type),
+				usnic_vnic_pci_name(vnic),
+				err);
+			goto out_free_res;
+		}
+	}
+
+	return res_chunk_list;
+
+out_free_res:
+	for (i--; i > 0; i--)
+		usnic_vnic_put_resources(res_chunk_list[i]);
+	kfree(res_chunk_list);
+	return ERR_PTR(err);
+}
+
+static void free_qp_grp_res(struct usnic_vnic_res_chunk **res_chunk_list)
+{
+	int i;
+	for (i = 0; res_chunk_list[i]; i++)
+		usnic_vnic_put_resources(res_chunk_list[i]);
+	kfree(res_chunk_list);
+}
+
+static int qp_grp_and_vf_bind(struct usnic_ib_vf *vf,
+				struct usnic_ib_pd *pd,
+				struct usnic_ib_qp_grp *qp_grp)
+{
+	int err;
+	struct pci_dev *pdev;
+
+	lockdep_assert_held(&vf->lock);
+
+	pdev = usnic_vnic_get_pdev(vf->vnic);
+	if (vf->qp_grp_ref_cnt == 0) {
+		err = usnic_uiom_attach_dev_to_pd(pd->umem_pd, &pdev->dev);
+		if (err) {
+			usnic_err("Failed to attach %s to domain\n",
+					pci_name(pdev));
+			return err;
+		}
+		vf->pd = pd;
+	}
+	vf->qp_grp_ref_cnt++;
+
+	WARN_ON(vf->pd != pd);
+	qp_grp->vf = vf;
+
+	return 0;
+}
+
+static void qp_grp_and_vf_unbind(struct usnic_ib_qp_grp *qp_grp)
+{
+	struct pci_dev *pdev;
+	struct usnic_ib_pd *pd;
+
+	lockdep_assert_held(&qp_grp->vf->lock);
+
+	pd = qp_grp->vf->pd;
+	pdev = usnic_vnic_get_pdev(qp_grp->vf->vnic);
+	if (--qp_grp->vf->qp_grp_ref_cnt == 0) {
+		qp_grp->vf->pd = NULL;
+		usnic_uiom_detach_dev_from_pd(pd->umem_pd, &pdev->dev);
+	}
+	qp_grp->vf = NULL;
+}
+
+static void log_spec(struct usnic_vnic_res_spec *res_spec)
+{
+	char buf[512];
+	usnic_vnic_spec_dump(buf, sizeof(buf), res_spec);
+	usnic_dbg("%s\n", buf);
+}
+
+static int qp_grp_id_from_flow(struct usnic_ib_qp_grp_flow *qp_flow,
+				uint32_t *id)
+{
+	enum usnic_transport_type trans_type = qp_flow->trans_type;
+	int err;
+	uint16_t port_num = 0;
+
+	switch (trans_type) {
+	case USNIC_TRANSPORT_ROCE_CUSTOM:
+		*id = qp_flow->usnic_roce.port_num;
+		break;
+	case USNIC_TRANSPORT_IPV4_UDP:
+		err = usnic_transport_sock_get_addr(qp_flow->udp.sock,
+							NULL, NULL,
+							&port_num);
+		if (err)
+			return err;
+		/*
+		 * Copy port_num to stack first and then to *id,
+		 * so that the short to int cast works for little
+		 * and big endian systems.
+		 */
+		*id = port_num;
+		break;
+	default:
+		usnic_err("Unsupported transport %u\n", trans_type);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+struct usnic_ib_qp_grp *
+usnic_ib_qp_grp_create(struct usnic_fwd_dev *ufdev, struct usnic_ib_vf *vf,
+			struct usnic_ib_pd *pd,
+			struct usnic_vnic_res_spec *res_spec,
+			struct usnic_transport_spec *transport_spec)
+{
+	struct usnic_ib_qp_grp *qp_grp;
+	int err;
+	enum usnic_transport_type transport = transport_spec->trans_type;
+	struct usnic_ib_qp_grp_flow *qp_flow;
+
+	lockdep_assert_held(&vf->lock);
+
+	err = usnic_vnic_res_spec_satisfied(&min_transport_spec[transport],
+						res_spec);
+	if (err) {
+		usnic_err("Spec does not meet miniumum req for transport %d\n",
+				transport);
+		log_spec(res_spec);
+		return ERR_PTR(err);
+	}
+
+	qp_grp = kzalloc(sizeof(*qp_grp), GFP_ATOMIC);
+	if (!qp_grp) {
+		usnic_err("Unable to alloc qp_grp - Out of memory\n");
+		return NULL;
+	}
+
+	qp_grp->res_chunk_list = alloc_res_chunk_list(vf->vnic, res_spec,
+							qp_grp);
+	if (IS_ERR_OR_NULL(qp_grp->res_chunk_list)) {
+		err = qp_grp->res_chunk_list ?
+				PTR_ERR(qp_grp->res_chunk_list) : -ENOMEM;
+		usnic_err("Unable to alloc res for %d with err %d\n",
+				qp_grp->grp_id, err);
+		goto out_free_qp_grp;
+	}
+
+	err = qp_grp_and_vf_bind(vf, pd, qp_grp);
+	if (err)
+		goto out_free_res;
+
+	INIT_LIST_HEAD(&qp_grp->flows_lst);
+	spin_lock_init(&qp_grp->lock);
+	qp_grp->ufdev = ufdev;
+	qp_grp->state = IB_QPS_RESET;
+	qp_grp->owner_pid = current->pid;
+
+	qp_flow = create_and_add_flow(qp_grp, transport_spec);
+	if (IS_ERR_OR_NULL(qp_flow)) {
+		usnic_err("Unable to create and add flow with err %ld\n",
+				PTR_ERR(qp_flow));
+		err = qp_flow ? PTR_ERR(qp_flow) : -EFAULT;
+		goto out_qp_grp_vf_unbind;
+	}
+
+	err = qp_grp_id_from_flow(qp_flow, &qp_grp->grp_id);
+	if (err)
+		goto out_release_flow;
+	qp_grp->ibqp.qp_num = qp_grp->grp_id;
+
+	usnic_ib_sysfs_qpn_add(qp_grp);
+
+	return qp_grp;
+
+out_release_flow:
+	release_and_remove_flow(qp_flow);
+out_qp_grp_vf_unbind:
+	qp_grp_and_vf_unbind(qp_grp);
+out_free_res:
+	free_qp_grp_res(qp_grp->res_chunk_list);
+out_free_qp_grp:
+	kfree(qp_grp);
+
+	return ERR_PTR(err);
+}
+
+void usnic_ib_qp_grp_destroy(struct usnic_ib_qp_grp *qp_grp)
+{
+
+	WARN_ON(qp_grp->state != IB_QPS_RESET);
+	lockdep_assert_held(&qp_grp->vf->lock);
+
+	release_and_remove_all_flows(qp_grp);
+	usnic_ib_sysfs_qpn_remove(qp_grp);
+	qp_grp_and_vf_unbind(qp_grp);
+	free_qp_grp_res(qp_grp->res_chunk_list);
+	kfree(qp_grp);
+}
+
+struct usnic_vnic_res_chunk*
+usnic_ib_qp_grp_get_chunk(struct usnic_ib_qp_grp *qp_grp,
+				enum usnic_vnic_res_type res_type)
+{
+	int i;
+
+	for (i = 0; qp_grp->res_chunk_list[i]; i++) {
+		if (qp_grp->res_chunk_list[i]->type == res_type)
+			return qp_grp->res_chunk_list[i];
+	}
+
+	return ERR_PTR(-EINVAL);
+}
diff --git a/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.h b/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.h
new file mode 100644
index 0000000..b0aafe8
--- /dev/null
+++ b/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.h
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved.
+ *
+ * This program is free software; you may redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef USNIC_IB_QP_GRP_H_
+#define USNIC_IB_QP_GRP_H_
+
+#include <linux/debugfs.h>
+#include <rdma/ib_verbs.h>
+
+#include "usnic_ib.h"
+#include "usnic_abi.h"
+#include "usnic_fwd.h"
+#include "usnic_vnic.h"
+
+/*
+ * The qp group struct represents all the hw resources needed to present a ib_qp
+ */
+struct usnic_ib_qp_grp {
+	struct ib_qp				ibqp;
+	enum ib_qp_state			state;
+	int					grp_id;
+
+	struct usnic_fwd_dev			*ufdev;
+	struct usnic_ib_ucontext		*ctx;
+	struct list_head			flows_lst;
+
+	struct usnic_vnic_res_chunk		**res_chunk_list;
+
+	pid_t					owner_pid;
+	struct usnic_ib_vf			*vf;
+	struct list_head			link;
+
+	spinlock_t				lock;
+
+	struct kobject				kobj;
+};
+
+struct usnic_ib_qp_grp_flow {
+	struct usnic_fwd_flow		*flow;
+	enum usnic_transport_type	trans_type;
+	union {
+		struct {
+			uint16_t	port_num;
+		} usnic_roce;
+		struct {
+			struct socket	*sock;
+		} udp;
+	};
+	struct usnic_ib_qp_grp		*qp_grp;
+	struct list_head		link;
+
+	/* Debug FS */
+	struct dentry			*dbgfs_dentry;
+	char				dentry_name[32];
+};
+
+static const struct
+usnic_vnic_res_spec min_transport_spec[USNIC_TRANSPORT_MAX] = {
+	{ /*USNIC_TRANSPORT_UNKNOWN*/
+		.resources = {
+			{.type = USNIC_VNIC_RES_TYPE_EOL,	.cnt = 0,},
+		},
+	},
+	{ /*USNIC_TRANSPORT_ROCE_CUSTOM*/
+		.resources = {
+			{.type = USNIC_VNIC_RES_TYPE_WQ,	.cnt = 1,},
+			{.type = USNIC_VNIC_RES_TYPE_RQ,	.cnt = 1,},
+			{.type = USNIC_VNIC_RES_TYPE_CQ,	.cnt = 1,},
+			{.type = USNIC_VNIC_RES_TYPE_EOL,	.cnt = 0,},
+		},
+	},
+	{ /*USNIC_TRANSPORT_IPV4_UDP*/
+		.resources = {
+			{.type = USNIC_VNIC_RES_TYPE_WQ,	.cnt = 1,},
+			{.type = USNIC_VNIC_RES_TYPE_RQ,	.cnt = 1,},
+			{.type = USNIC_VNIC_RES_TYPE_CQ,	.cnt = 1,},
+			{.type = USNIC_VNIC_RES_TYPE_EOL,	.cnt = 0,},
+		},
+	},
+};
+
+const char *usnic_ib_qp_grp_state_to_string(enum ib_qp_state state);
+int usnic_ib_qp_grp_dump_hdr(char *buf, int buf_sz);
+int usnic_ib_qp_grp_dump_rows(void *obj, char *buf, int buf_sz);
+struct usnic_ib_qp_grp *
+usnic_ib_qp_grp_create(struct usnic_fwd_dev *ufdev, struct usnic_ib_vf *vf,
+			struct usnic_ib_pd *pd,
+			struct usnic_vnic_res_spec *res_spec,
+			struct usnic_transport_spec *trans_spec);
+void usnic_ib_qp_grp_destroy(struct usnic_ib_qp_grp *qp_grp);
+int usnic_ib_qp_grp_modify(struct usnic_ib_qp_grp *qp_grp,
+				enum ib_qp_state new_state,
+				void *data);
+struct usnic_vnic_res_chunk
+*usnic_ib_qp_grp_get_chunk(struct usnic_ib_qp_grp *qp_grp,
+				enum usnic_vnic_res_type type);
+static inline
+struct usnic_ib_qp_grp *to_uqp_grp(struct ib_qp *ibqp)
+{
+	return container_of(ibqp, struct usnic_ib_qp_grp, ibqp);
+}
+#endif /* USNIC_IB_QP_GRP_H_ */
diff --git a/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c b/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c
new file mode 100644
index 0000000..27dc67c
--- /dev/null
+++ b/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c
@@ -0,0 +1,341 @@
+/*
+ * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved.
+ *
+ * This program is free software; you may redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+
+#include <rdma/ib_user_verbs.h>
+#include <rdma/ib_addr.h>
+
+#include "usnic_common_util.h"
+#include "usnic_ib.h"
+#include "usnic_ib_qp_grp.h"
+#include "usnic_vnic.h"
+#include "usnic_ib_verbs.h"
+#include "usnic_log.h"
+
+static ssize_t usnic_ib_show_fw_ver(struct device *device,
+					struct device_attribute *attr,
+					char *buf)
+{
+	struct usnic_ib_dev *us_ibdev =
+		container_of(device, struct usnic_ib_dev, ib_dev.dev);
+	struct ethtool_drvinfo info;
+
+	mutex_lock(&us_ibdev->usdev_lock);
+	us_ibdev->netdev->ethtool_ops->get_drvinfo(us_ibdev->netdev, &info);
+	mutex_unlock(&us_ibdev->usdev_lock);
+
+	return scnprintf(buf, PAGE_SIZE, "%s\n", info.fw_version);
+}
+
+static ssize_t usnic_ib_show_board(struct device *device,
+					struct device_attribute *attr,
+					char *buf)
+{
+	struct usnic_ib_dev *us_ibdev =
+		container_of(device, struct usnic_ib_dev, ib_dev.dev);
+	unsigned short subsystem_device_id;
+
+	mutex_lock(&us_ibdev->usdev_lock);
+	subsystem_device_id = us_ibdev->pdev->subsystem_device;
+	mutex_unlock(&us_ibdev->usdev_lock);
+
+	return scnprintf(buf, PAGE_SIZE, "%hu\n", subsystem_device_id);
+}
+
+/*
+ * Report the configuration for this PF
+ */
+static ssize_t
+usnic_ib_show_config(struct device *device, struct device_attribute *attr,
+			char *buf)
+{
+	struct usnic_ib_dev *us_ibdev;
+	char *ptr;
+	unsigned left;
+	unsigned n;
+	enum usnic_vnic_res_type res_type;
+
+	us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev.dev);
+
+	/* Buffer space limit is 1 page */
+	ptr = buf;
+	left = PAGE_SIZE;
+
+	mutex_lock(&us_ibdev->usdev_lock);
+	if (atomic_read(&us_ibdev->vf_cnt.refcount) > 0) {
+		char *busname;
+
+		/*
+		 * bus name seems to come with annoying prefix.
+		 * Remove it if it is predictable
+		 */
+		busname = us_ibdev->pdev->bus->name;
+		if (strncmp(busname, "PCI Bus ", 8) == 0)
+			busname += 8;
+
+		n = scnprintf(ptr, left,
+			"%s: %s:%d.%d, %s, %pM, %u VFs\n Per VF:",
+			us_ibdev->ib_dev.name,
+			busname,
+			PCI_SLOT(us_ibdev->pdev->devfn),
+			PCI_FUNC(us_ibdev->pdev->devfn),
+			netdev_name(us_ibdev->netdev),
+			us_ibdev->ufdev->mac,
+			atomic_read(&us_ibdev->vf_cnt.refcount));
+		UPDATE_PTR_LEFT(n, ptr, left);
+
+		for (res_type = USNIC_VNIC_RES_TYPE_EOL;
+				res_type < USNIC_VNIC_RES_TYPE_MAX;
+				res_type++) {
+			if (us_ibdev->vf_res_cnt[res_type] == 0)
+				continue;
+			n = scnprintf(ptr, left, " %d %s%s",
+				us_ibdev->vf_res_cnt[res_type],
+				usnic_vnic_res_type_to_str(res_type),
+				(res_type < (USNIC_VNIC_RES_TYPE_MAX - 1)) ?
+				 "," : "");
+			UPDATE_PTR_LEFT(n, ptr, left);
+		}
+		n = scnprintf(ptr, left, "\n");
+		UPDATE_PTR_LEFT(n, ptr, left);
+	} else {
+		n = scnprintf(ptr, left, "%s: no VFs\n",
+				us_ibdev->ib_dev.name);
+		UPDATE_PTR_LEFT(n, ptr, left);
+	}
+	mutex_unlock(&us_ibdev->usdev_lock);
+
+	return ptr - buf;
+}
+
+static ssize_t
+usnic_ib_show_iface(struct device *device, struct device_attribute *attr,
+			char *buf)
+{
+	struct usnic_ib_dev *us_ibdev;
+
+	us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev.dev);
+
+	return scnprintf(buf, PAGE_SIZE, "%s\n",
+			netdev_name(us_ibdev->netdev));
+}
+
+static ssize_t
+usnic_ib_show_max_vf(struct device *device, struct device_attribute *attr,
+			char *buf)
+{
+	struct usnic_ib_dev *us_ibdev;
+
+	us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev.dev);
+
+	return scnprintf(buf, PAGE_SIZE, "%u\n",
+			atomic_read(&us_ibdev->vf_cnt.refcount));
+}
+
+static ssize_t
+usnic_ib_show_qp_per_vf(struct device *device, struct device_attribute *attr,
+			char *buf)
+{
+	struct usnic_ib_dev *us_ibdev;
+	int qp_per_vf;
+
+	us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev.dev);
+	qp_per_vf = max(us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_WQ],
+			us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_RQ]);
+
+	return scnprintf(buf, PAGE_SIZE,
+				"%d\n", qp_per_vf);
+}
+
+static ssize_t
+usnic_ib_show_cq_per_vf(struct device *device, struct device_attribute *attr,
+			char *buf)
+{
+	struct usnic_ib_dev *us_ibdev;
+
+	us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev.dev);
+
+	return scnprintf(buf, PAGE_SIZE, "%d\n",
+			us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_CQ]);
+}
+
+static DEVICE_ATTR(fw_ver, S_IRUGO, usnic_ib_show_fw_ver, NULL);
+static DEVICE_ATTR(board_id, S_IRUGO, usnic_ib_show_board, NULL);
+static DEVICE_ATTR(config, S_IRUGO, usnic_ib_show_config, NULL);
+static DEVICE_ATTR(iface, S_IRUGO, usnic_ib_show_iface, NULL);
+static DEVICE_ATTR(max_vf, S_IRUGO, usnic_ib_show_max_vf, NULL);
+static DEVICE_ATTR(qp_per_vf, S_IRUGO, usnic_ib_show_qp_per_vf, NULL);
+static DEVICE_ATTR(cq_per_vf, S_IRUGO, usnic_ib_show_cq_per_vf, NULL);
+
+static struct device_attribute *usnic_class_attributes[] = {
+	&dev_attr_fw_ver,
+	&dev_attr_board_id,
+	&dev_attr_config,
+	&dev_attr_iface,
+	&dev_attr_max_vf,
+	&dev_attr_qp_per_vf,
+	&dev_attr_cq_per_vf,
+};
+
+struct qpn_attribute {
+	struct attribute attr;
+	ssize_t (*show)(struct usnic_ib_qp_grp *, char *buf);
+};
+
+/*
+ * Definitions for supporting QPN entries in sysfs
+ */
+static ssize_t
+usnic_ib_qpn_attr_show(struct kobject *kobj, struct attribute *attr, char *buf)
+{
+	struct usnic_ib_qp_grp *qp_grp;
+	struct qpn_attribute *qpn_attr;
+
+	qp_grp = container_of(kobj, struct usnic_ib_qp_grp, kobj);
+	qpn_attr = container_of(attr, struct qpn_attribute, attr);
+
+	return qpn_attr->show(qp_grp, buf);
+}
+
+static const struct sysfs_ops usnic_ib_qpn_sysfs_ops = {
+	.show = usnic_ib_qpn_attr_show
+};
+
+#define QPN_ATTR_RO(NAME) \
+struct qpn_attribute qpn_attr_##NAME = __ATTR_RO(NAME)
+
+static ssize_t context_show(struct usnic_ib_qp_grp *qp_grp, char *buf)
+{
+	return scnprintf(buf, PAGE_SIZE, "0x%p\n", qp_grp->ctx);
+}
+
+static ssize_t summary_show(struct usnic_ib_qp_grp *qp_grp, char *buf)
+{
+	int i, j, n;
+	int left;
+	char *ptr;
+	struct usnic_vnic_res_chunk *res_chunk;
+	struct usnic_vnic_res *vnic_res;
+
+	left = PAGE_SIZE;
+	ptr = buf;
+
+	n = scnprintf(ptr, left,
+			"QPN: %d State: (%s) PID: %u VF Idx: %hu ",
+			qp_grp->ibqp.qp_num,
+			usnic_ib_qp_grp_state_to_string(qp_grp->state),
+			qp_grp->owner_pid,
+			usnic_vnic_get_index(qp_grp->vf->vnic));
+	UPDATE_PTR_LEFT(n, ptr, left);
+
+	for (i = 0; qp_grp->res_chunk_list[i]; i++) {
+		res_chunk = qp_grp->res_chunk_list[i];
+		for (j = 0; j < res_chunk->cnt; j++) {
+			vnic_res = res_chunk->res[j];
+			n = scnprintf(ptr, left, "%s[%d] ",
+				usnic_vnic_res_type_to_str(vnic_res->type),
+				vnic_res->vnic_idx);
+			UPDATE_PTR_LEFT(n, ptr, left);
+		}
+	}
+
+	n = scnprintf(ptr, left, "\n");
+	UPDATE_PTR_LEFT(n, ptr, left);
+
+	return ptr - buf;
+}
+
+static QPN_ATTR_RO(context);
+static QPN_ATTR_RO(summary);
+
+static struct attribute *usnic_ib_qpn_default_attrs[] = {
+	&qpn_attr_context.attr,
+	&qpn_attr_summary.attr,
+	NULL
+};
+
+static struct kobj_type usnic_ib_qpn_type = {
+	.sysfs_ops = &usnic_ib_qpn_sysfs_ops,
+	.default_attrs = usnic_ib_qpn_default_attrs
+};
+
+int usnic_ib_sysfs_register_usdev(struct usnic_ib_dev *us_ibdev)
+{
+	int i;
+	int err;
+	for (i = 0; i < ARRAY_SIZE(usnic_class_attributes); ++i) {
+		err = device_create_file(&us_ibdev->ib_dev.dev,
+						usnic_class_attributes[i]);
+		if (err) {
+			usnic_err("Failed to create device file %d for %s eith err %d",
+				i, us_ibdev->ib_dev.name, err);
+			return -EINVAL;
+		}
+	}
+
+	/* create kernel object for looking at individual QPs */
+	kobject_get(&us_ibdev->ib_dev.dev.kobj);
+	us_ibdev->qpn_kobj = kobject_create_and_add("qpn",
+			&us_ibdev->ib_dev.dev.kobj);
+	if (us_ibdev->qpn_kobj == NULL) {
+		kobject_put(&us_ibdev->ib_dev.dev.kobj);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+void usnic_ib_sysfs_unregister_usdev(struct usnic_ib_dev *us_ibdev)
+{
+	int i;
+	for (i = 0; i < ARRAY_SIZE(usnic_class_attributes); ++i) {
+		device_remove_file(&us_ibdev->ib_dev.dev,
+					usnic_class_attributes[i]);
+	}
+
+	kobject_put(us_ibdev->qpn_kobj);
+}
+
+void usnic_ib_sysfs_qpn_add(struct usnic_ib_qp_grp *qp_grp)
+{
+	struct usnic_ib_dev *us_ibdev;
+	int err;
+
+	us_ibdev = qp_grp->vf->pf;
+
+	err = kobject_init_and_add(&qp_grp->kobj, &usnic_ib_qpn_type,
+			kobject_get(us_ibdev->qpn_kobj),
+			"%d", qp_grp->grp_id);
+	if (err) {
+		kobject_put(us_ibdev->qpn_kobj);
+		return;
+	}
+}
+
+void usnic_ib_sysfs_qpn_remove(struct usnic_ib_qp_grp *qp_grp)
+{
+	struct usnic_ib_dev *us_ibdev;
+
+	us_ibdev = qp_grp->vf->pf;
+
+	kobject_put(&qp_grp->kobj);
+	kobject_put(us_ibdev->qpn_kobj);
+}
diff --git a/drivers/infiniband/hw/usnic/usnic_ib_sysfs.h b/drivers/infiniband/hw/usnic/usnic_ib_sysfs.h
new file mode 100644
index 0000000..0d09b49
--- /dev/null
+++ b/drivers/infiniband/hw/usnic/usnic_ib_sysfs.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved.
+ *
+ * This program is free software; you may redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef USNIC_IB_SYSFS_H_
+#define USNIC_IB_SYSFS_H_
+
+#include "usnic_ib.h"
+
+int usnic_ib_sysfs_register_usdev(struct usnic_ib_dev *us_ibdev);
+void usnic_ib_sysfs_unregister_usdev(struct usnic_ib_dev *us_ibdev);
+void usnic_ib_sysfs_qpn_add(struct usnic_ib_qp_grp *qp_grp);
+void usnic_ib_sysfs_qpn_remove(struct usnic_ib_qp_grp *qp_grp);
+
+#endif /* !USNIC_IB_SYSFS_H_ */
diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c
new file mode 100644
index 0000000..53bd6a2
--- /dev/null
+++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c
@@ -0,0 +1,768 @@
+/*
+ * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved.
+ *
+ * This program is free software; you may redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/errno.h>
+
+#include <rdma/ib_user_verbs.h>
+#include <rdma/ib_addr.h>
+
+#include "usnic_abi.h"
+#include "usnic_ib.h"
+#include "usnic_common_util.h"
+#include "usnic_ib_qp_grp.h"
+#include "usnic_fwd.h"
+#include "usnic_log.h"
+#include "usnic_uiom.h"
+#include "usnic_transport.h"
+
+#define USNIC_DEFAULT_TRANSPORT USNIC_TRANSPORT_ROCE_CUSTOM
+
+static void usnic_ib_fw_string_to_u64(char *fw_ver_str, u64 *fw_ver)
+{
+	*fw_ver = (u64) *fw_ver_str;
+}
+
+static int usnic_ib_fill_create_qp_resp(struct usnic_ib_qp_grp *qp_grp,
+					struct ib_udata *udata)
+{
+	struct usnic_ib_dev *us_ibdev;
+	struct usnic_ib_create_qp_resp resp;
+	struct pci_dev *pdev;
+	struct vnic_dev_bar *bar;
+	struct usnic_vnic_res_chunk *chunk;
+	struct usnic_ib_qp_grp_flow *default_flow;
+	int i, err;
+
+	memset(&resp, 0, sizeof(resp));
+
+	us_ibdev = qp_grp->vf->pf;
+	pdev = usnic_vnic_get_pdev(qp_grp->vf->vnic);
+	if (!pdev) {
+		usnic_err("Failed to get pdev of qp_grp %d\n",
+				qp_grp->grp_id);
+		return -EFAULT;
+	}
+
+	bar = usnic_vnic_get_bar(qp_grp->vf->vnic, 0);
+	if (!bar) {
+		usnic_err("Failed to get bar0 of qp_grp %d vf %s",
+				qp_grp->grp_id, pci_name(pdev));
+		return -EFAULT;
+	}
+
+	resp.vfid = usnic_vnic_get_index(qp_grp->vf->vnic);
+	resp.bar_bus_addr = bar->bus_addr;
+	resp.bar_len = bar->len;
+
+	chunk = usnic_ib_qp_grp_get_chunk(qp_grp, USNIC_VNIC_RES_TYPE_RQ);
+	if (IS_ERR_OR_NULL(chunk)) {
+		usnic_err("Failed to get chunk %s for qp_grp %d with err %ld\n",
+			usnic_vnic_res_type_to_str(USNIC_VNIC_RES_TYPE_RQ),
+			qp_grp->grp_id,
+			PTR_ERR(chunk));
+		return chunk ? PTR_ERR(chunk) : -ENOMEM;
+	}
+
+	WARN_ON(chunk->type != USNIC_VNIC_RES_TYPE_RQ);
+	resp.rq_cnt = chunk->cnt;
+	for (i = 0; i < chunk->cnt; i++)
+		resp.rq_idx[i] = chunk->res[i]->vnic_idx;
+
+	chunk = usnic_ib_qp_grp_get_chunk(qp_grp, USNIC_VNIC_RES_TYPE_WQ);
+	if (IS_ERR_OR_NULL(chunk)) {
+		usnic_err("Failed to get chunk %s for qp_grp %d with err %ld\n",
+			usnic_vnic_res_type_to_str(USNIC_VNIC_RES_TYPE_WQ),
+			qp_grp->grp_id,
+			PTR_ERR(chunk));
+		return chunk ? PTR_ERR(chunk) : -ENOMEM;
+	}
+
+	WARN_ON(chunk->type != USNIC_VNIC_RES_TYPE_WQ);
+	resp.wq_cnt = chunk->cnt;
+	for (i = 0; i < chunk->cnt; i++)
+		resp.wq_idx[i] = chunk->res[i]->vnic_idx;
+
+	chunk = usnic_ib_qp_grp_get_chunk(qp_grp, USNIC_VNIC_RES_TYPE_CQ);
+	if (IS_ERR_OR_NULL(chunk)) {
+		usnic_err("Failed to get chunk %s for qp_grp %d with err %ld\n",
+			usnic_vnic_res_type_to_str(USNIC_VNIC_RES_TYPE_CQ),
+			qp_grp->grp_id,
+			PTR_ERR(chunk));
+		return chunk ? PTR_ERR(chunk) : -ENOMEM;
+	}
+
+	WARN_ON(chunk->type != USNIC_VNIC_RES_TYPE_CQ);
+	resp.cq_cnt = chunk->cnt;
+	for (i = 0; i < chunk->cnt; i++)
+		resp.cq_idx[i] = chunk->res[i]->vnic_idx;
+
+	default_flow = list_first_entry(&qp_grp->flows_lst,
+					struct usnic_ib_qp_grp_flow, link);
+	resp.transport = default_flow->trans_type;
+
+	err = ib_copy_to_udata(udata, &resp, sizeof(resp));
+	if (err) {
+		usnic_err("Failed to copy udata for %s", us_ibdev->ib_dev.name);
+		return err;
+	}
+
+	return 0;
+}
+
+static struct usnic_ib_qp_grp*
+find_free_vf_and_create_qp_grp(struct usnic_ib_dev *us_ibdev,
+				struct usnic_ib_pd *pd,
+				struct usnic_transport_spec *trans_spec,
+				struct usnic_vnic_res_spec *res_spec)
+{
+	struct usnic_ib_vf *vf;
+	struct usnic_vnic *vnic;
+	struct usnic_ib_qp_grp *qp_grp;
+	struct device *dev, **dev_list;
+	int i, found = 0;
+
+	BUG_ON(!mutex_is_locked(&us_ibdev->usdev_lock));
+
+	if (list_empty(&us_ibdev->vf_dev_list)) {
+		usnic_info("No vfs to allocate\n");
+		return NULL;
+	}
+
+	if (usnic_ib_share_vf) {
+		/* Try to find resouces on a used vf which is in pd */
+		dev_list = usnic_uiom_get_dev_list(pd->umem_pd);
+		for (i = 0; dev_list[i]; i++) {
+			dev = dev_list[i];
+			vf = pci_get_drvdata(to_pci_dev(dev));
+			spin_lock(&vf->lock);
+			vnic = vf->vnic;
+			if (!usnic_vnic_check_room(vnic, res_spec)) {
+				usnic_dbg("Found used vnic %s from %s\n",
+						us_ibdev->ib_dev.name,
+						pci_name(usnic_vnic_get_pdev(
+									vnic)));
+				found = 1;
+				break;
+			}
+			spin_unlock(&vf->lock);
+
+		}
+		usnic_uiom_free_dev_list(dev_list);
+	}
+
+	if (!found) {
+		/* Try to find resources on an unused vf */
+		list_for_each_entry(vf, &us_ibdev->vf_dev_list, link) {
+			spin_lock(&vf->lock);
+			vnic = vf->vnic;
+			if (vf->qp_grp_ref_cnt == 0 &&
+				usnic_vnic_check_room(vnic, res_spec) == 0) {
+				found = 1;
+				break;
+			}
+			spin_unlock(&vf->lock);
+		}
+	}
+
+	if (!found) {
+		usnic_info("No free qp grp found on %s\n",
+				us_ibdev->ib_dev.name);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	qp_grp = usnic_ib_qp_grp_create(us_ibdev->ufdev, vf, pd, res_spec,
+						trans_spec);
+	spin_unlock(&vf->lock);
+	if (IS_ERR_OR_NULL(qp_grp)) {
+		usnic_err("Failed to allocate qp_grp\n");
+		return ERR_PTR(qp_grp ? PTR_ERR(qp_grp) : -ENOMEM);
+	}
+
+	return qp_grp;
+}
+
+static void qp_grp_destroy(struct usnic_ib_qp_grp *qp_grp)
+{
+	struct usnic_ib_vf *vf = qp_grp->vf;
+
+	WARN_ON(qp_grp->state != IB_QPS_RESET);
+
+	spin_lock(&vf->lock);
+	usnic_ib_qp_grp_destroy(qp_grp);
+	spin_unlock(&vf->lock);
+}
+
+static void eth_speed_to_ib_speed(int speed, u8 *active_speed,
+					u8 *active_width)
+{
+	if (speed <= 10000) {
+		*active_width = IB_WIDTH_1X;
+		*active_speed = IB_SPEED_FDR10;
+	} else if (speed <= 20000) {
+		*active_width = IB_WIDTH_4X;
+		*active_speed = IB_SPEED_DDR;
+	} else if (speed <= 30000) {
+		*active_width = IB_WIDTH_4X;
+		*active_speed = IB_SPEED_QDR;
+	} else if (speed <= 40000) {
+		*active_width = IB_WIDTH_4X;
+		*active_speed = IB_SPEED_FDR10;
+	} else {
+		*active_width = IB_WIDTH_4X;
+		*active_speed = IB_SPEED_EDR;
+	}
+}
+
+static int create_qp_validate_user_data(struct usnic_ib_create_qp_cmd cmd)
+{
+	if (cmd.spec.trans_type <= USNIC_TRANSPORT_UNKNOWN ||
+			cmd.spec.trans_type >= USNIC_TRANSPORT_MAX)
+		return -EINVAL;
+
+	return 0;
+}
+
+/* Start of ib callback functions */
+
+enum rdma_link_layer usnic_ib_port_link_layer(struct ib_device *device,
+						u8 port_num)
+{
+	return IB_LINK_LAYER_ETHERNET;
+}
+
+int usnic_ib_query_device(struct ib_device *ibdev,
+				struct ib_device_attr *props)
+{
+	struct usnic_ib_dev *us_ibdev = to_usdev(ibdev);
+	union ib_gid gid;
+	struct ethtool_drvinfo info;
+	struct ethtool_cmd cmd;
+	int qp_per_vf;
+
+	usnic_dbg("\n");
+	mutex_lock(&us_ibdev->usdev_lock);
+	us_ibdev->netdev->ethtool_ops->get_drvinfo(us_ibdev->netdev, &info);
+	us_ibdev->netdev->ethtool_ops->get_settings(us_ibdev->netdev, &cmd);
+	memset(props, 0, sizeof(*props));
+	usnic_mac_ip_to_gid(us_ibdev->ufdev->mac, us_ibdev->ufdev->inaddr,
+			&gid.raw[0]);
+	memcpy(&props->sys_image_guid, &gid.global.interface_id,
+		sizeof(gid.global.interface_id));
+	usnic_ib_fw_string_to_u64(&info.fw_version[0], &props->fw_ver);
+	props->max_mr_size = USNIC_UIOM_MAX_MR_SIZE;
+	props->page_size_cap = USNIC_UIOM_PAGE_SIZE;
+	props->vendor_id = PCI_VENDOR_ID_CISCO;
+	props->vendor_part_id = PCI_DEVICE_ID_CISCO_VIC_USPACE_NIC;
+	props->hw_ver = us_ibdev->pdev->subsystem_device;
+	qp_per_vf = max(us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_WQ],
+			us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_RQ]);
+	props->max_qp = qp_per_vf *
+		atomic_read(&us_ibdev->vf_cnt.refcount);
+	props->device_cap_flags = IB_DEVICE_PORT_ACTIVE_EVENT |
+		IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_BLOCK_MULTICAST_LOOPBACK;
+	props->max_cq = us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_CQ] *
+		atomic_read(&us_ibdev->vf_cnt.refcount);
+	props->max_pd = USNIC_UIOM_MAX_PD_CNT;
+	props->max_mr = USNIC_UIOM_MAX_MR_CNT;
+	props->local_ca_ack_delay = 0;
+	props->max_pkeys = 0;
+	props->atomic_cap = IB_ATOMIC_NONE;
+	props->masked_atomic_cap = props->atomic_cap;
+	props->max_qp_rd_atom = 0;
+	props->max_qp_init_rd_atom = 0;
+	props->max_res_rd_atom = 0;
+	props->max_srq = 0;
+	props->max_srq_wr = 0;
+	props->max_srq_sge = 0;
+	props->max_fast_reg_page_list_len = 0;
+	props->max_mcast_grp = 0;
+	props->max_mcast_qp_attach = 0;
+	props->max_total_mcast_qp_attach = 0;
+	props->max_map_per_fmr = 0;
+	/* Owned by Userspace
+	 * max_qp_wr, max_sge, max_sge_rd, max_cqe */
+	mutex_unlock(&us_ibdev->usdev_lock);
+
+	return 0;
+}
+
+int usnic_ib_query_port(struct ib_device *ibdev, u8 port,
+				struct ib_port_attr *props)
+{
+	struct usnic_ib_dev *us_ibdev = to_usdev(ibdev);
+	struct ethtool_cmd cmd;
+
+	usnic_dbg("\n");
+
+	mutex_lock(&us_ibdev->usdev_lock);
+	us_ibdev->netdev->ethtool_ops->get_settings(us_ibdev->netdev, &cmd);
+	memset(props, 0, sizeof(*props));
+
+	props->lid = 0;
+	props->lmc = 1;
+	props->sm_lid = 0;
+	props->sm_sl = 0;
+
+	if (!us_ibdev->ufdev->link_up) {
+		props->state = IB_PORT_DOWN;
+		props->phys_state = 3;
+	} else if (!us_ibdev->ufdev->inaddr) {
+		props->state = IB_PORT_INIT;
+		props->phys_state = 4;
+	} else {
+		props->state = IB_PORT_ACTIVE;
+		props->phys_state = 5;
+	}
+
+	props->port_cap_flags = 0;
+	props->gid_tbl_len = 1;
+	props->pkey_tbl_len = 1;
+	props->bad_pkey_cntr = 0;
+	props->qkey_viol_cntr = 0;
+	eth_speed_to_ib_speed(cmd.speed, &props->active_speed,
+				&props->active_width);
+	props->max_mtu = IB_MTU_4096;
+	props->active_mtu = iboe_get_mtu(us_ibdev->ufdev->mtu);
+	/* Userspace will adjust for hdrs */
+	props->max_msg_sz = us_ibdev->ufdev->mtu;
+	props->max_vl_num = 1;
+	mutex_unlock(&us_ibdev->usdev_lock);
+
+	return 0;
+}
+
+int usnic_ib_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr,
+				int qp_attr_mask,
+				struct ib_qp_init_attr *qp_init_attr)
+{
+	struct usnic_ib_qp_grp *qp_grp;
+	struct usnic_ib_vf *vf;
+	int err;
+
+	usnic_dbg("\n");
+
+	memset(qp_attr, 0, sizeof(*qp_attr));
+	memset(qp_init_attr, 0, sizeof(*qp_init_attr));
+
+	qp_grp = to_uqp_grp(qp);
+	vf = qp_grp->vf;
+	mutex_lock(&vf->pf->usdev_lock);
+	usnic_dbg("\n");
+	qp_attr->qp_state = qp_grp->state;
+	qp_attr->cur_qp_state = qp_grp->state;
+
+	switch (qp_grp->ibqp.qp_type) {
+	case IB_QPT_UD:
+		qp_attr->qkey = 0;
+		break;
+	default:
+		usnic_err("Unexpected qp_type %d\n", qp_grp->ibqp.qp_type);
+		err = -EINVAL;
+		goto err_out;
+	}
+
+	mutex_unlock(&vf->pf->usdev_lock);
+	return 0;
+
+err_out:
+	mutex_unlock(&vf->pf->usdev_lock);
+	return err;
+}
+
+int usnic_ib_query_gid(struct ib_device *ibdev, u8 port, int index,
+				union ib_gid *gid)
+{
+
+	struct usnic_ib_dev *us_ibdev = to_usdev(ibdev);
+	usnic_dbg("\n");
+
+	if (index > 1)
+		return -EINVAL;
+
+	mutex_lock(&us_ibdev->usdev_lock);
+	memset(&(gid->raw[0]), 0, sizeof(gid->raw));
+	usnic_mac_ip_to_gid(us_ibdev->ufdev->mac, us_ibdev->ufdev->inaddr,
+			&gid->raw[0]);
+	mutex_unlock(&us_ibdev->usdev_lock);
+
+	return 0;
+}
+
+int usnic_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
+				u16 *pkey)
+{
+	if (index > 1)
+		return -EINVAL;
+
+	*pkey = 0xffff;
+	return 0;
+}
+
+struct ib_pd *usnic_ib_alloc_pd(struct ib_device *ibdev,
+					struct ib_ucontext *context,
+					struct ib_udata *udata)
+{
+	struct usnic_ib_pd *pd;
+	void *umem_pd;
+
+	usnic_dbg("\n");
+
+	pd = kzalloc(sizeof(*pd), GFP_KERNEL);
+	if (!pd)
+		return ERR_PTR(-ENOMEM);
+
+	umem_pd = pd->umem_pd = usnic_uiom_alloc_pd();
+	if (IS_ERR_OR_NULL(umem_pd)) {
+		kfree(pd);
+		return ERR_PTR(umem_pd ? PTR_ERR(umem_pd) : -ENOMEM);
+	}
+
+	usnic_info("domain 0x%p allocated for context 0x%p and device %s\n",
+			pd, context, ibdev->name);
+	return &pd->ibpd;
+}
+
+int usnic_ib_dealloc_pd(struct ib_pd *pd)
+{
+	usnic_info("freeing domain 0x%p\n", pd);
+
+	usnic_uiom_dealloc_pd((to_upd(pd))->umem_pd);
+	kfree(pd);
+	return 0;
+}
+
+struct ib_qp *usnic_ib_create_qp(struct ib_pd *pd,
+					struct ib_qp_init_attr *init_attr,
+					struct ib_udata *udata)
+{
+	int err;
+	struct usnic_ib_dev *us_ibdev;
+	struct usnic_ib_qp_grp *qp_grp;
+	struct usnic_ib_ucontext *ucontext;
+	int cq_cnt;
+	struct usnic_vnic_res_spec res_spec;
+	struct usnic_ib_create_qp_cmd cmd;
+	struct usnic_transport_spec trans_spec;
+
+	usnic_dbg("\n");
+
+	ucontext = to_uucontext(pd->uobject->context);
+	us_ibdev = to_usdev(pd->device);
+
+	if (init_attr->create_flags)
+		return ERR_PTR(-EINVAL);
+
+	err = ib_copy_from_udata(&cmd, udata, sizeof(cmd));
+	if (err) {
+		usnic_err("%s: cannot copy udata for create_qp\n",
+				us_ibdev->ib_dev.name);
+		return ERR_PTR(-EINVAL);
+	}
+
+	err = create_qp_validate_user_data(cmd);
+	if (err) {
+		usnic_err("%s: Failed to validate user data\n",
+				us_ibdev->ib_dev.name);
+		return ERR_PTR(-EINVAL);
+	}
+
+	if (init_attr->qp_type != IB_QPT_UD) {
+		usnic_err("%s asked to make a non-UD QP: %d\n",
+				us_ibdev->ib_dev.name, init_attr->qp_type);
+		return ERR_PTR(-EINVAL);
+	}
+
+	trans_spec = cmd.spec;
+	mutex_lock(&us_ibdev->usdev_lock);
+	cq_cnt = (init_attr->send_cq == init_attr->recv_cq) ? 1 : 2;
+	res_spec = min_transport_spec[trans_spec.trans_type];
+	usnic_vnic_res_spec_update(&res_spec, USNIC_VNIC_RES_TYPE_CQ, cq_cnt);
+	qp_grp = find_free_vf_and_create_qp_grp(us_ibdev, to_upd(pd),
+						&trans_spec,
+						&res_spec);
+	if (IS_ERR_OR_NULL(qp_grp)) {
+		err = qp_grp ? PTR_ERR(qp_grp) : -ENOMEM;
+		goto out_release_mutex;
+	}
+
+	err = usnic_ib_fill_create_qp_resp(qp_grp, udata);
+	if (err) {
+		err = -EBUSY;
+		goto out_release_qp_grp;
+	}
+
+	qp_grp->ctx = ucontext;
+	list_add_tail(&qp_grp->link, &ucontext->qp_grp_list);
+	usnic_ib_log_vf(qp_grp->vf);
+	mutex_unlock(&us_ibdev->usdev_lock);
+	return &qp_grp->ibqp;
+
+out_release_qp_grp:
+	qp_grp_destroy(qp_grp);
+out_release_mutex:
+	mutex_unlock(&us_ibdev->usdev_lock);
+	return ERR_PTR(err);
+}
+
+int usnic_ib_destroy_qp(struct ib_qp *qp)
+{
+	struct usnic_ib_qp_grp *qp_grp;
+	struct usnic_ib_vf *vf;
+
+	usnic_dbg("\n");
+
+	qp_grp = to_uqp_grp(qp);
+	vf = qp_grp->vf;
+	mutex_lock(&vf->pf->usdev_lock);
+	if (usnic_ib_qp_grp_modify(qp_grp, IB_QPS_RESET, NULL)) {
+		usnic_err("Failed to move qp grp %u to reset\n",
+				qp_grp->grp_id);
+	}
+
+	list_del(&qp_grp->link);
+	qp_grp_destroy(qp_grp);
+	mutex_unlock(&vf->pf->usdev_lock);
+
+	return 0;
+}
+
+int usnic_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+				int attr_mask, struct ib_udata *udata)
+{
+	struct usnic_ib_qp_grp *qp_grp;
+	int status;
+	usnic_dbg("\n");
+
+	qp_grp = to_uqp_grp(ibqp);
+
+	/* TODO: Future Support All States */
+	mutex_lock(&qp_grp->vf->pf->usdev_lock);
+	if ((attr_mask & IB_QP_STATE) && attr->qp_state == IB_QPS_INIT) {
+		status = usnic_ib_qp_grp_modify(qp_grp, IB_QPS_INIT, NULL);
+	} else if ((attr_mask & IB_QP_STATE) && attr->qp_state == IB_QPS_RTR) {
+		status = usnic_ib_qp_grp_modify(qp_grp, IB_QPS_RTR, NULL);
+	} else if ((attr_mask & IB_QP_STATE) && attr->qp_state == IB_QPS_RTS) {
+		status = usnic_ib_qp_grp_modify(qp_grp, IB_QPS_RTS, NULL);
+	} else {
+		usnic_err("Unexpected combination mask: %u state: %u\n",
+				attr_mask & IB_QP_STATE, attr->qp_state);
+		status = -EINVAL;
+	}
+
+	mutex_unlock(&qp_grp->vf->pf->usdev_lock);
+	return status;
+}
+
+struct ib_cq *usnic_ib_create_cq(struct ib_device *ibdev, int entries,
+					int vector, struct ib_ucontext *context,
+					struct ib_udata *udata)
+{
+	struct ib_cq *cq;
+
+	usnic_dbg("\n");
+	cq = kzalloc(sizeof(*cq), GFP_KERNEL);
+	if (!cq)
+		return ERR_PTR(-EBUSY);
+
+	return cq;
+}
+
+int usnic_ib_destroy_cq(struct ib_cq *cq)
+{
+	usnic_dbg("\n");
+	kfree(cq);
+	return 0;
+}
+
+struct ib_mr *usnic_ib_reg_mr(struct ib_pd *pd, u64 start, u64 length,
+					u64 virt_addr, int access_flags,
+					struct ib_udata *udata)
+{
+	struct usnic_ib_mr *mr;
+	int err;
+
+	usnic_dbg("start 0x%llx va 0x%llx length 0x%llx\n", start,
+			virt_addr, length);
+
+	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+	if (IS_ERR_OR_NULL(mr))
+		return ERR_PTR(mr ? PTR_ERR(mr) : -ENOMEM);
+
+	mr->umem = usnic_uiom_reg_get(to_upd(pd)->umem_pd, start, length,
+					access_flags, 0);
+	if (IS_ERR_OR_NULL(mr->umem)) {
+		err = mr->umem ? PTR_ERR(mr->umem) : -EFAULT;
+		goto err_free;
+	}
+
+	mr->ibmr.lkey = mr->ibmr.rkey = 0;
+	return &mr->ibmr;
+
+err_free:
+	kfree(mr);
+	return ERR_PTR(err);
+}
+
+int usnic_ib_dereg_mr(struct ib_mr *ibmr)
+{
+	struct usnic_ib_mr *mr = to_umr(ibmr);
+
+	usnic_dbg("va 0x%lx length 0x%zx\n", mr->umem->va, mr->umem->length);
+
+	usnic_uiom_reg_release(mr->umem, ibmr->pd->uobject->context->closing);
+	kfree(mr);
+	return 0;
+}
+
+struct ib_ucontext *usnic_ib_alloc_ucontext(struct ib_device *ibdev,
+							struct ib_udata *udata)
+{
+	struct usnic_ib_ucontext *context;
+	struct usnic_ib_dev *us_ibdev = to_usdev(ibdev);
+	usnic_dbg("\n");
+
+	context = kmalloc(sizeof(*context), GFP_KERNEL);
+	if (!context)
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD(&context->qp_grp_list);
+	mutex_lock(&us_ibdev->usdev_lock);
+	list_add_tail(&context->link, &us_ibdev->ctx_list);
+	mutex_unlock(&us_ibdev->usdev_lock);
+
+	return &context->ibucontext;
+}
+
+int usnic_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
+{
+	struct usnic_ib_ucontext *context = to_uucontext(ibcontext);
+	struct usnic_ib_dev *us_ibdev = to_usdev(ibcontext->device);
+	usnic_dbg("\n");
+
+	mutex_lock(&us_ibdev->usdev_lock);
+	BUG_ON(!list_empty(&context->qp_grp_list));
+	list_del(&context->link);
+	mutex_unlock(&us_ibdev->usdev_lock);
+	kfree(context);
+	return 0;
+}
+
+int usnic_ib_mmap(struct ib_ucontext *context,
+				struct vm_area_struct *vma)
+{
+	struct usnic_ib_ucontext *uctx = to_ucontext(context);
+	struct usnic_ib_dev *us_ibdev;
+	struct usnic_ib_qp_grp *qp_grp;
+	struct usnic_ib_vf *vf;
+	struct vnic_dev_bar *bar;
+	dma_addr_t bus_addr;
+	unsigned int len;
+	unsigned int vfid;
+
+	usnic_dbg("\n");
+
+	us_ibdev = to_usdev(context->device);
+	vma->vm_flags |= VM_IO;
+	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+	vfid = vma->vm_pgoff;
+	usnic_dbg("Page Offset %lu PAGE_SHIFT %u VFID %u\n",
+			vma->vm_pgoff, PAGE_SHIFT, vfid);
+
+	mutex_lock(&us_ibdev->usdev_lock);
+	list_for_each_entry(qp_grp, &uctx->qp_grp_list, link) {
+		vf = qp_grp->vf;
+		if (usnic_vnic_get_index(vf->vnic) == vfid) {
+			bar = usnic_vnic_get_bar(vf->vnic, 0);
+			if ((vma->vm_end - vma->vm_start) != bar->len) {
+				usnic_err("Bar0 Len %lu - Request map %lu\n",
+						bar->len,
+						vma->vm_end - vma->vm_start);
+				mutex_unlock(&us_ibdev->usdev_lock);
+				return -EINVAL;
+			}
+			bus_addr = bar->bus_addr;
+			len = bar->len;
+			usnic_dbg("bus: %pa vaddr: %p size: %ld\n",
+					&bus_addr, bar->vaddr, bar->len);
+			mutex_unlock(&us_ibdev->usdev_lock);
+
+			return remap_pfn_range(vma,
+						vma->vm_start,
+						bus_addr >> PAGE_SHIFT,
+						len, vma->vm_page_prot);
+		}
+	}
+
+	mutex_unlock(&us_ibdev->usdev_lock);
+	usnic_err("No VF %u found\n", vfid);
+	return -EINVAL;
+}
+
+/* In ib callbacks section -  Start of stub funcs */
+struct ib_ah *usnic_ib_create_ah(struct ib_pd *pd,
+					struct ib_ah_attr *ah_attr)
+{
+	usnic_dbg("\n");
+	return ERR_PTR(-EPERM);
+}
+
+int usnic_ib_destroy_ah(struct ib_ah *ah)
+{
+	usnic_dbg("\n");
+	return -EINVAL;
+}
+
+int usnic_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+				struct ib_send_wr **bad_wr)
+{
+	usnic_dbg("\n");
+	return -EINVAL;
+}
+
+int usnic_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+				struct ib_recv_wr **bad_wr)
+{
+	usnic_dbg("\n");
+	return -EINVAL;
+}
+
+int usnic_ib_poll_cq(struct ib_cq *ibcq, int num_entries,
+				struct ib_wc *wc)
+{
+	usnic_dbg("\n");
+	return -EINVAL;
+}
+
+int usnic_ib_req_notify_cq(struct ib_cq *cq,
+					enum ib_cq_notify_flags flags)
+{
+	usnic_dbg("\n");
+	return -EINVAL;
+}
+
+struct ib_mr *usnic_ib_get_dma_mr(struct ib_pd *pd, int acc)
+{
+	usnic_dbg("\n");
+	return ERR_PTR(-ENOMEM);
+}
+
+
+/* In ib callbacks section - End of stub funcs */
+/* End of ib callbacks section */
diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.h b/drivers/infiniband/hw/usnic/usnic_ib_verbs.h
new file mode 100644
index 0000000..bb864f5
--- /dev/null
+++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved.
+ *
+ * This program is free software; you may redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef USNIC_IB_VERBS_H_
+#define USNIC_IB_VERBS_H_
+
+#include "usnic_ib.h"
+
+enum rdma_link_layer usnic_ib_port_link_layer(struct ib_device *device,
+						u8 port_num);
+int usnic_ib_query_device(struct ib_device *ibdev,
+				struct ib_device_attr *props);
+int usnic_ib_query_port(struct ib_device *ibdev, u8 port,
+				struct ib_port_attr *props);
+int usnic_ib_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr,
+				int qp_attr_mask,
+				struct ib_qp_init_attr *qp_init_attr);
+int usnic_ib_query_gid(struct ib_device *ibdev, u8 port, int index,
+				union ib_gid *gid);
+int usnic_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
+				u16 *pkey);
+struct ib_pd *usnic_ib_alloc_pd(struct ib_device *ibdev,
+				struct ib_ucontext *context,
+				struct ib_udata *udata);
+int usnic_ib_dealloc_pd(struct ib_pd *pd);
+struct ib_qp *usnic_ib_create_qp(struct ib_pd *pd,
+					struct ib_qp_init_attr *init_attr,
+					struct ib_udata *udata);
+int usnic_ib_destroy_qp(struct ib_qp *qp);
+int usnic_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+				int attr_mask, struct ib_udata *udata);
+struct ib_cq *usnic_ib_create_cq(struct ib_device *ibdev, int entries,
+					int vector, struct ib_ucontext *context,
+					struct ib_udata *udata);
+int usnic_ib_destroy_cq(struct ib_cq *cq);
+struct ib_mr *usnic_ib_reg_mr(struct ib_pd *pd, u64 start, u64 length,
+				u64 virt_addr, int access_flags,
+				struct ib_udata *udata);
+int usnic_ib_dereg_mr(struct ib_mr *ibmr);
+struct ib_ucontext *usnic_ib_alloc_ucontext(struct ib_device *ibdev,
+						struct ib_udata *udata);
+int usnic_ib_dealloc_ucontext(struct ib_ucontext *ibcontext);
+int usnic_ib_mmap(struct ib_ucontext *context,
+			struct vm_area_struct *vma);
+struct ib_ah *usnic_ib_create_ah(struct ib_pd *pd,
+					struct ib_ah_attr *ah_attr);
+int usnic_ib_destroy_ah(struct ib_ah *ah);
+int usnic_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+			struct ib_send_wr **bad_wr);
+int usnic_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+			struct ib_recv_wr **bad_wr);
+int usnic_ib_poll_cq(struct ib_cq *ibcq, int num_entries,
+			struct ib_wc *wc);
+int usnic_ib_req_notify_cq(struct ib_cq *cq,
+				enum ib_cq_notify_flags flags);
+struct ib_mr *usnic_ib_get_dma_mr(struct ib_pd *pd, int acc);
+#endif /* !USNIC_IB_VERBS_H */
diff --git a/drivers/infiniband/hw/usnic/usnic_log.h b/drivers/infiniband/hw/usnic/usnic_log.h
new file mode 100644
index 0000000..75777a6
--- /dev/null
+++ b/drivers/infiniband/hw/usnic/usnic_log.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved.
+ *
+ * This program is free software; you may redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef USNIC_LOG_H_
+#define USNIC_LOG_H_
+
+#include "usnic.h"
+
+extern unsigned int usnic_log_lvl;
+
+#define USNIC_LOG_LVL_NONE		(0)
+#define USNIC_LOG_LVL_ERR		(1)
+#define USNIC_LOG_LVL_INFO		(2)
+#define USNIC_LOG_LVL_DBG		(3)
+
+#define usnic_printk(lvl, args...) \
+	do { \
+		printk(lvl "%s:%s:%d: ", DRV_NAME, __func__, \
+				__LINE__); \
+		printk(args); \
+	} while (0)
+
+#define usnic_dbg(args...) \
+	do { \
+		if (unlikely(usnic_log_lvl >= USNIC_LOG_LVL_DBG)) { \
+			usnic_printk(KERN_INFO, args); \
+	} \
+} while (0)
+
+#define usnic_info(args...) \
+do { \
+	if (usnic_log_lvl >= USNIC_LOG_LVL_INFO) { \
+			usnic_printk(KERN_INFO, args); \
+	} \
+} while (0)
+
+#define usnic_err(args...) \
+	do { \
+		if (usnic_log_lvl >= USNIC_LOG_LVL_ERR) { \
+			usnic_printk(KERN_ERR, args); \
+		} \
+	} while (0)
+#endif /* !USNIC_LOG_H_ */
diff --git a/drivers/infiniband/hw/usnic/usnic_transport.c b/drivers/infiniband/hw/usnic/usnic_transport.c
new file mode 100644
index 0000000..ddef6f7
--- /dev/null
+++ b/drivers/infiniband/hw/usnic/usnic_transport.c
@@ -0,0 +1,202 @@
+/*
+ * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved.
+ *
+ * This program is free software; you may redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/bitmap.h>
+#include <linux/file.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <net/inet_sock.h>
+
+#include "usnic_transport.h"
+#include "usnic_log.h"
+
+/* ROCE */
+static unsigned long *roce_bitmap;
+static u16 roce_next_port = 1;
+#define ROCE_BITMAP_SZ ((1 << (8 /*CHAR_BIT*/ * sizeof(u16)))/8 /*CHAR BIT*/)
+static DEFINE_SPINLOCK(roce_bitmap_lock);
+
+const char *usnic_transport_to_str(enum usnic_transport_type type)
+{
+	switch (type) {
+	case USNIC_TRANSPORT_UNKNOWN:
+		return "Unknown";
+	case USNIC_TRANSPORT_ROCE_CUSTOM:
+		return "roce custom";
+	case USNIC_TRANSPORT_IPV4_UDP:
+		return "IPv4 UDP";
+	case USNIC_TRANSPORT_MAX:
+		return "Max?";
+	default:
+		return "Not known";
+	}
+}
+
+int usnic_transport_sock_to_str(char *buf, int buf_sz,
+					struct socket *sock)
+{
+	int err;
+	uint32_t addr;
+	uint16_t port;
+	int proto;
+
+	memset(buf, 0, buf_sz);
+	err = usnic_transport_sock_get_addr(sock, &proto, &addr, &port);
+	if (err)
+		return 0;
+
+	return scnprintf(buf, buf_sz, "Proto:%u Addr:%pI4h Port:%hu",
+			proto, &addr, port);
+}
+
+/*
+ * reserve a port number.  if "0" specified, we will try to pick one
+ * starting at roce_next_port.  roce_next_port will take on the values
+ * 1..4096
+ */
+u16 usnic_transport_rsrv_port(enum usnic_transport_type type, u16 port_num)
+{
+	if (type == USNIC_TRANSPORT_ROCE_CUSTOM) {
+		spin_lock(&roce_bitmap_lock);
+		if (!port_num) {
+			port_num = bitmap_find_next_zero_area(roce_bitmap,
+						ROCE_BITMAP_SZ,
+						roce_next_port /* start */,
+						1 /* nr */,
+						0 /* align */);
+			roce_next_port = (port_num & 4095) + 1;
+		} else if (test_bit(port_num, roce_bitmap)) {
+			usnic_err("Failed to allocate port for %s\n",
+					usnic_transport_to_str(type));
+			spin_unlock(&roce_bitmap_lock);
+			goto out_fail;
+		}
+		bitmap_set(roce_bitmap, port_num, 1);
+		spin_unlock(&roce_bitmap_lock);
+	} else {
+		usnic_err("Failed to allocate port - transport %s unsupported\n",
+				usnic_transport_to_str(type));
+		goto out_fail;
+	}
+
+	usnic_dbg("Allocating port %hu for %s\n", port_num,
+			usnic_transport_to_str(type));
+	return port_num;
+
+out_fail:
+	return 0;
+}
+
+void usnic_transport_unrsrv_port(enum usnic_transport_type type, u16 port_num)
+{
+	if (type == USNIC_TRANSPORT_ROCE_CUSTOM) {
+		spin_lock(&roce_bitmap_lock);
+		if (!port_num) {
+			usnic_err("Unreserved unvalid port num 0 for %s\n",
+					usnic_transport_to_str(type));
+			goto out_roce_custom;
+		}
+
+		if (!test_bit(port_num, roce_bitmap)) {
+			usnic_err("Unreserving invalid %hu for %s\n",
+					port_num,
+					usnic_transport_to_str(type));
+			goto out_roce_custom;
+		}
+		bitmap_clear(roce_bitmap, port_num, 1);
+		usnic_dbg("Freeing port %hu for %s\n", port_num,
+				usnic_transport_to_str(type));
+out_roce_custom:
+		spin_unlock(&roce_bitmap_lock);
+	} else {
+		usnic_err("Freeing invalid port %hu for %d\n", port_num, type);
+	}
+}
+
+struct socket *usnic_transport_get_socket(int sock_fd)
+{
+	struct socket *sock;
+	int err;
+	char buf[25];
+
+	/* sockfd_lookup will internally do a fget */
+	sock = sockfd_lookup(sock_fd, &err);
+	if (!sock) {
+		usnic_err("Unable to lookup socket for fd %d with err %d\n",
+				sock_fd, err);
+		return ERR_PTR(-ENOENT);
+	}
+
+	usnic_transport_sock_to_str(buf, sizeof(buf), sock);
+	usnic_dbg("Get sock %s\n", buf);
+
+	return sock;
+}
+
+void usnic_transport_put_socket(struct socket *sock)
+{
+	char buf[100];
+
+	usnic_transport_sock_to_str(buf, sizeof(buf), sock);
+	usnic_dbg("Put sock %s\n", buf);
+	sockfd_put(sock);
+}
+
+int usnic_transport_sock_get_addr(struct socket *sock, int *proto,
+					uint32_t *addr, uint16_t *port)
+{
+	int len;
+	int err;
+	struct sockaddr_in sock_addr;
+
+	err = sock->ops->getname(sock,
+				(struct sockaddr *)&sock_addr,
+				&len, 0);
+	if (err)
+		return err;
+
+	if (sock_addr.sin_family != AF_INET)
+		return -EINVAL;
+
+	if (proto)
+		*proto = sock->sk->sk_protocol;
+	if (port)
+		*port = ntohs(((struct sockaddr_in *)&sock_addr)->sin_port);
+	if (addr)
+		*addr = ntohl(((struct sockaddr_in *)
+					&sock_addr)->sin_addr.s_addr);
+
+	return 0;
+}
+
+int usnic_transport_init(void)
+{
+	roce_bitmap = kzalloc(ROCE_BITMAP_SZ, GFP_KERNEL);
+	if (!roce_bitmap) {
+		usnic_err("Failed to allocate bit map");
+		return -ENOMEM;
+	}
+
+	/* Do not ever allocate bit 0, hence set it here */
+	bitmap_set(roce_bitmap, 0, 1);
+	return 0;
+}
+
+void usnic_transport_fini(void)
+{
+	kfree(roce_bitmap);
+}
diff --git a/drivers/infiniband/hw/usnic/usnic_transport.h b/drivers/infiniband/hw/usnic/usnic_transport.h
new file mode 100644
index 0000000..7e5dc6d
--- /dev/null
+++ b/drivers/infiniband/hw/usnic/usnic_transport.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved.
+ *
+ * This program is free software; you may redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef USNIC_TRANSPORT_H_
+#define USNIC_TRANSPORT_H_
+
+#include "usnic_abi.h"
+
+const char *usnic_transport_to_str(enum usnic_transport_type trans_type);
+/*
+ * Returns number of bytes written, excluding null terminator. If
+ * nothing was written, the function returns 0.
+ */
+int usnic_transport_sock_to_str(char *buf, int buf_sz,
+					struct socket *sock);
+/*
+ * Reserve a port. If "port_num" is set, then the function will try
+ * to reserve that particular port.
+ */
+u16 usnic_transport_rsrv_port(enum usnic_transport_type type, u16 port_num);
+void usnic_transport_unrsrv_port(enum usnic_transport_type type, u16 port_num);
+/*
+ * Do a fget on the socket refered to by sock_fd and returns the socket.
+ * Socket will not be destroyed before usnic_transport_put_socket has
+ * been called.
+ */
+struct socket *usnic_transport_get_socket(int sock_fd);
+void usnic_transport_put_socket(struct socket *sock);
+/*
+ * Call usnic_transport_get_socket before calling *_sock_get_addr
+ */
+int usnic_transport_sock_get_addr(struct socket *sock, int *proto,
+					uint32_t *addr, uint16_t *port);
+int usnic_transport_init(void);
+void usnic_transport_fini(void);
+#endif /* !USNIC_TRANSPORT_H */
diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c
new file mode 100644
index 0000000..417de1f
--- /dev/null
+++ b/drivers/infiniband/hw/usnic/usnic_uiom.c
@@ -0,0 +1,604 @@
+/*
+ * Copyright (c) 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2013 Cisco Systems.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mm.h>
+#include <linux/dma-mapping.h>
+#include <linux/sched.h>
+#include <linux/hugetlb.h>
+#include <linux/dma-attrs.h>
+#include <linux/iommu.h>
+#include <linux/workqueue.h>
+#include <linux/list.h>
+#include <linux/pci.h>
+
+#include "usnic_log.h"
+#include "usnic_uiom.h"
+#include "usnic_uiom_interval_tree.h"
+
+static struct workqueue_struct *usnic_uiom_wq;
+
+#define USNIC_UIOM_PAGE_CHUNK						\
+	((PAGE_SIZE - offsetof(struct usnic_uiom_chunk, page_list))	/\
+	((void *) &((struct usnic_uiom_chunk *) 0)->page_list[1] -	\
+	(void *) &((struct usnic_uiom_chunk *) 0)->page_list[0]))
+
+static void usnic_uiom_reg_account(struct work_struct *work)
+{
+	struct usnic_uiom_reg *umem = container_of(work,
+						struct usnic_uiom_reg, work);
+
+	down_write(&umem->mm->mmap_sem);
+	umem->mm->locked_vm -= umem->diff;
+	up_write(&umem->mm->mmap_sem);
+	mmput(umem->mm);
+	kfree(umem);
+}
+
+static int usnic_uiom_dma_fault(struct iommu_domain *domain,
+				struct device *dev,
+				unsigned long iova, int flags,
+				void *token)
+{
+	usnic_err("Device %s iommu fault domain 0x%pK va 0x%lx flags 0x%x\n",
+		dev_name(dev),
+		domain, iova, flags);
+	return -ENOSYS;
+}
+
+static void usnic_uiom_put_pages(struct list_head *chunk_list, int dirty)
+{
+	struct usnic_uiom_chunk *chunk, *tmp;
+	struct page *page;
+	struct scatterlist *sg;
+	int i;
+	dma_addr_t pa;
+
+	list_for_each_entry_safe(chunk, tmp, chunk_list, list) {
+		for_each_sg(chunk->page_list, sg, chunk->nents, i) {
+			page = sg_page(sg);
+			pa = sg_phys(sg);
+			if (dirty)
+				set_page_dirty_lock(page);
+			put_page(page);
+			usnic_dbg("pa: %pa\n", &pa);
+		}
+		kfree(chunk);
+	}
+}
+
+static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable,
+				int dmasync, struct list_head *chunk_list)
+{
+	struct page **page_list;
+	struct scatterlist *sg;
+	struct usnic_uiom_chunk *chunk;
+	unsigned long locked;
+	unsigned long lock_limit;
+	unsigned long cur_base;
+	unsigned long npages;
+	int ret;
+	int off;
+	int i;
+	int flags;
+	dma_addr_t pa;
+	DEFINE_DMA_ATTRS(attrs);
+
+	if (dmasync)
+		dma_set_attr(DMA_ATTR_WRITE_BARRIER, &attrs);
+
+	if (!can_do_mlock())
+		return -EPERM;
+
+	INIT_LIST_HEAD(chunk_list);
+
+	page_list = (struct page **) __get_free_page(GFP_KERNEL);
+	if (!page_list)
+		return -ENOMEM;
+
+	npages = PAGE_ALIGN(size + (addr & ~PAGE_MASK)) >> PAGE_SHIFT;
+
+	down_write(&current->mm->mmap_sem);
+
+	locked = npages + current->mm->locked_vm;
+	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
+	if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	flags = IOMMU_READ | IOMMU_CACHE;
+	flags |= (writable) ? IOMMU_WRITE : 0;
+	cur_base = addr & PAGE_MASK;
+	ret = 0;
+
+	while (npages) {
+		ret = get_user_pages(current, current->mm, cur_base,
+					min_t(unsigned long, npages,
+					PAGE_SIZE / sizeof(struct page *)),
+					1, !writable, page_list, NULL);
+
+		if (ret < 0)
+			goto out;
+
+		npages -= ret;
+		off = 0;
+
+		while (ret) {
+			chunk = kmalloc(sizeof(*chunk) +
+					sizeof(struct scatterlist) *
+					min_t(int, ret, USNIC_UIOM_PAGE_CHUNK),
+					GFP_KERNEL);
+			if (!chunk) {
+				ret = -ENOMEM;
+				goto out;
+			}
+
+			chunk->nents = min_t(int, ret, USNIC_UIOM_PAGE_CHUNK);
+			sg_init_table(chunk->page_list, chunk->nents);
+			for_each_sg(chunk->page_list, sg, chunk->nents, i) {
+				sg_set_page(sg, page_list[i + off],
+						PAGE_SIZE, 0);
+				pa = sg_phys(sg);
+				usnic_dbg("va: 0x%lx pa: %pa\n",
+						cur_base + i*PAGE_SIZE, &pa);
+			}
+			cur_base += chunk->nents * PAGE_SIZE;
+			ret -= chunk->nents;
+			off += chunk->nents;
+			list_add_tail(&chunk->list, chunk_list);
+		}
+
+		ret = 0;
+	}
+
+out:
+	if (ret < 0)
+		usnic_uiom_put_pages(chunk_list, 0);
+	else
+		current->mm->locked_vm = locked;
+
+	up_write(&current->mm->mmap_sem);
+	free_page((unsigned long) page_list);
+	return ret;
+}
+
+static void usnic_uiom_unmap_sorted_intervals(struct list_head *intervals,
+						struct usnic_uiom_pd *pd)
+{
+	struct usnic_uiom_interval_node *interval, *tmp;
+	long unsigned va, size;
+
+	list_for_each_entry_safe(interval, tmp, intervals, link) {
+		va = interval->start << PAGE_SHIFT;
+		size = ((interval->last - interval->start) + 1) << PAGE_SHIFT;
+		while (size > 0) {
+			/* Workaround for RH 970401 */
+			usnic_dbg("va 0x%lx size 0x%lx", va, PAGE_SIZE);
+			iommu_unmap(pd->domain, va, PAGE_SIZE);
+			va += PAGE_SIZE;
+			size -= PAGE_SIZE;
+		}
+	}
+}
+
+static void __usnic_uiom_reg_release(struct usnic_uiom_pd *pd,
+					struct usnic_uiom_reg *uiomr,
+					int dirty)
+{
+	int npages;
+	unsigned long vpn_start, vpn_last;
+	struct usnic_uiom_interval_node *interval, *tmp;
+	int writable = 0;
+	LIST_HEAD(rm_intervals);
+
+	npages = PAGE_ALIGN(uiomr->length + uiomr->offset) >> PAGE_SHIFT;
+	vpn_start = (uiomr->va & PAGE_MASK) >> PAGE_SHIFT;
+	vpn_last = vpn_start + npages - 1;
+
+	spin_lock(&pd->lock);
+	usnic_uiom_remove_interval(&pd->rb_root, vpn_start,
+					vpn_last, &rm_intervals);
+	usnic_uiom_unmap_sorted_intervals(&rm_intervals, pd);
+
+	list_for_each_entry_safe(interval, tmp, &rm_intervals, link) {
+		if (interval->flags & IOMMU_WRITE)
+			writable = 1;
+		list_del(&interval->link);
+		kfree(interval);
+	}
+
+	usnic_uiom_put_pages(&uiomr->chunk_list, dirty & writable);
+	spin_unlock(&pd->lock);
+}
+
+static int usnic_uiom_map_sorted_intervals(struct list_head *intervals,
+						struct usnic_uiom_reg *uiomr)
+{
+	int i, err;
+	size_t size;
+	struct usnic_uiom_chunk *chunk;
+	struct usnic_uiom_interval_node *interval_node;
+	dma_addr_t pa;
+	dma_addr_t pa_start = 0;
+	dma_addr_t pa_end = 0;
+	long int va_start = -EINVAL;
+	struct usnic_uiom_pd *pd = uiomr->pd;
+	long int va = uiomr->va & PAGE_MASK;
+	int flags = IOMMU_READ | IOMMU_CACHE;
+
+	flags |= (uiomr->writable) ? IOMMU_WRITE : 0;
+	chunk = list_first_entry(&uiomr->chunk_list, struct usnic_uiom_chunk,
+									list);
+	list_for_each_entry(interval_node, intervals, link) {
+iter_chunk:
+		for (i = 0; i < chunk->nents; i++, va += PAGE_SIZE) {
+			pa = sg_phys(&chunk->page_list[i]);
+			if ((va >> PAGE_SHIFT) < interval_node->start)
+				continue;
+
+			if ((va >> PAGE_SHIFT) == interval_node->start) {
+				/* First page of the interval */
+				va_start = va;
+				pa_start = pa;
+				pa_end = pa;
+			}
+
+			WARN_ON(va_start == -EINVAL);
+
+			if ((pa_end + PAGE_SIZE != pa) &&
+					(pa != pa_start)) {
+				/* PAs are not contiguous */
+				size = pa_end - pa_start + PAGE_SIZE;
+				usnic_dbg("va 0x%lx pa %pa size 0x%zx flags 0x%x",
+					va_start, &pa_start, size, flags);
+				err = iommu_map(pd->domain, va_start, pa_start,
+							size, flags);
+				if (err) {
+					usnic_err("Failed to map va 0x%lx pa %pa size 0x%zx with err %d\n",
+						va_start, &pa_start, size, err);
+					goto err_out;
+				}
+				va_start = va;
+				pa_start = pa;
+				pa_end = pa;
+			}
+
+			if ((va >> PAGE_SHIFT) == interval_node->last) {
+				/* Last page of the interval */
+				size = pa - pa_start + PAGE_SIZE;
+				usnic_dbg("va 0x%lx pa %pa size 0x%zx flags 0x%x\n",
+					va_start, &pa_start, size, flags);
+				err = iommu_map(pd->domain, va_start, pa_start,
+						size, flags);
+				if (err) {
+					usnic_err("Failed to map va 0x%lx pa %pa size 0x%zx with err %d\n",
+						va_start, &pa_start, size, err);
+					goto err_out;
+				}
+				break;
+			}
+
+			if (pa != pa_start)
+				pa_end += PAGE_SIZE;
+		}
+
+		if (i == chunk->nents) {
+			/*
+			 * Hit last entry of the chunk,
+			 * hence advance to next chunk
+			 */
+			chunk = list_first_entry(&chunk->list,
+							struct usnic_uiom_chunk,
+							list);
+			goto iter_chunk;
+		}
+	}
+
+	return 0;
+
+err_out:
+	usnic_uiom_unmap_sorted_intervals(intervals, pd);
+	return err;
+}
+
+struct usnic_uiom_reg *usnic_uiom_reg_get(struct usnic_uiom_pd *pd,
+						unsigned long addr, size_t size,
+						int writable, int dmasync)
+{
+	struct usnic_uiom_reg *uiomr;
+	unsigned long va_base, vpn_start, vpn_last;
+	unsigned long npages;
+	int offset, err;
+	LIST_HEAD(sorted_diff_intervals);
+
+	/*
+	 * Intel IOMMU map throws an error if a translation entry is
+	 * changed from read to write.  This module may not unmap
+	 * and then remap the entry after fixing the permission
+	 * b/c this open up a small windows where hw DMA may page fault
+	 * Hence, make all entries to be writable.
+	 */
+	writable = 1;
+
+	va_base = addr & PAGE_MASK;
+	offset = addr & ~PAGE_MASK;
+	npages = PAGE_ALIGN(size + offset) >> PAGE_SHIFT;
+	vpn_start = (addr & PAGE_MASK) >> PAGE_SHIFT;
+	vpn_last = vpn_start + npages - 1;
+
+	uiomr = kmalloc(sizeof(*uiomr), GFP_KERNEL);
+	if (!uiomr)
+		return ERR_PTR(-ENOMEM);
+
+	uiomr->va = va_base;
+	uiomr->offset = offset;
+	uiomr->length = size;
+	uiomr->writable = writable;
+	uiomr->pd = pd;
+
+	err = usnic_uiom_get_pages(addr, size, writable, dmasync,
+					&uiomr->chunk_list);
+	if (err) {
+		usnic_err("Failed get_pages vpn [0x%lx,0x%lx] err %d\n",
+				vpn_start, vpn_last, err);
+		goto out_free_uiomr;
+	}
+
+	spin_lock(&pd->lock);
+	err = usnic_uiom_get_intervals_diff(vpn_start, vpn_last,
+						(writable) ? IOMMU_WRITE : 0,
+						IOMMU_WRITE,
+						&pd->rb_root,
+						&sorted_diff_intervals);
+	if (err) {
+		usnic_err("Failed disjoint interval vpn [0x%lx,0x%lx] err %d\n",
+						vpn_start, vpn_last, err);
+		goto out_put_pages;
+	}
+
+	err = usnic_uiom_map_sorted_intervals(&sorted_diff_intervals, uiomr);
+	if (err) {
+		usnic_err("Failed map interval vpn [0x%lx,0x%lx] err %d\n",
+						vpn_start, vpn_last, err);
+		goto out_put_intervals;
+
+	}
+
+	err = usnic_uiom_insert_interval(&pd->rb_root, vpn_start, vpn_last,
+					(writable) ? IOMMU_WRITE : 0);
+	if (err) {
+		usnic_err("Failed insert interval vpn [0x%lx,0x%lx] err %d\n",
+						vpn_start, vpn_last, err);
+		goto out_unmap_intervals;
+	}
+
+	usnic_uiom_put_interval_set(&sorted_diff_intervals);
+	spin_unlock(&pd->lock);
+
+	return uiomr;
+
+out_unmap_intervals:
+	usnic_uiom_unmap_sorted_intervals(&sorted_diff_intervals, pd);
+out_put_intervals:
+	usnic_uiom_put_interval_set(&sorted_diff_intervals);
+out_put_pages:
+	usnic_uiom_put_pages(&uiomr->chunk_list, 0);
+	spin_unlock(&pd->lock);
+out_free_uiomr:
+	kfree(uiomr);
+	return ERR_PTR(err);
+}
+
+void usnic_uiom_reg_release(struct usnic_uiom_reg *uiomr, int closing)
+{
+	struct mm_struct *mm;
+	unsigned long diff;
+
+	__usnic_uiom_reg_release(uiomr->pd, uiomr, 1);
+
+	mm = get_task_mm(current);
+	if (!mm) {
+		kfree(uiomr);
+		return;
+	}
+
+	diff = PAGE_ALIGN(uiomr->length + uiomr->offset) >> PAGE_SHIFT;
+
+	/*
+	 * We may be called with the mm's mmap_sem already held.  This
+	 * can happen when a userspace munmap() is the call that drops
+	 * the last reference to our file and calls our release
+	 * method.  If there are memory regions to destroy, we'll end
+	 * up here and not be able to take the mmap_sem.  In that case
+	 * we defer the vm_locked accounting to the system workqueue.
+	 */
+	if (closing) {
+		if (!down_write_trylock(&mm->mmap_sem)) {
+			INIT_WORK(&uiomr->work, usnic_uiom_reg_account);
+			uiomr->mm = mm;
+			uiomr->diff = diff;
+
+			queue_work(usnic_uiom_wq, &uiomr->work);
+			return;
+		}
+	} else
+		down_write(&mm->mmap_sem);
+
+	current->mm->locked_vm -= diff;
+	up_write(&mm->mmap_sem);
+	mmput(mm);
+	kfree(uiomr);
+}
+
+struct usnic_uiom_pd *usnic_uiom_alloc_pd(void)
+{
+	struct usnic_uiom_pd *pd;
+	void *domain;
+
+	pd = kzalloc(sizeof(*pd), GFP_KERNEL);
+	if (!pd)
+		return ERR_PTR(-ENOMEM);
+
+	pd->domain = domain = iommu_domain_alloc(&pci_bus_type);
+	if (IS_ERR_OR_NULL(domain)) {
+		usnic_err("Failed to allocate IOMMU domain with err %ld\n",
+				PTR_ERR(pd->domain));
+		kfree(pd);
+		return ERR_PTR(domain ? PTR_ERR(domain) : -ENOMEM);
+	}
+
+	iommu_set_fault_handler(pd->domain, usnic_uiom_dma_fault, NULL);
+
+	spin_lock_init(&pd->lock);
+	INIT_LIST_HEAD(&pd->devs);
+
+	return pd;
+}
+
+void usnic_uiom_dealloc_pd(struct usnic_uiom_pd *pd)
+{
+	iommu_domain_free(pd->domain);
+	kfree(pd);
+}
+
+int usnic_uiom_attach_dev_to_pd(struct usnic_uiom_pd *pd, struct device *dev)
+{
+	struct usnic_uiom_dev *uiom_dev;
+	int err;
+
+	uiom_dev = kzalloc(sizeof(*uiom_dev), GFP_ATOMIC);
+	if (!uiom_dev)
+		return -ENOMEM;
+	uiom_dev->dev = dev;
+
+	err = iommu_attach_device(pd->domain, dev);
+	if (err)
+		goto out_free_dev;
+
+	if (!iommu_capable(dev->bus, IOMMU_CAP_CACHE_COHERENCY)) {
+		usnic_err("IOMMU of %s does not support cache coherency\n",
+				dev_name(dev));
+		err = -EINVAL;
+		goto out_detach_device;
+	}
+
+	spin_lock(&pd->lock);
+	list_add_tail(&uiom_dev->link, &pd->devs);
+	pd->dev_cnt++;
+	spin_unlock(&pd->lock);
+
+	return 0;
+
+out_detach_device:
+	iommu_detach_device(pd->domain, dev);
+out_free_dev:
+	kfree(uiom_dev);
+	return err;
+}
+
+void usnic_uiom_detach_dev_from_pd(struct usnic_uiom_pd *pd, struct device *dev)
+{
+	struct usnic_uiom_dev *uiom_dev;
+	int found = 0;
+
+	spin_lock(&pd->lock);
+	list_for_each_entry(uiom_dev, &pd->devs, link) {
+		if (uiom_dev->dev == dev) {
+			found = 1;
+			break;
+		}
+	}
+
+	if (!found) {
+		usnic_err("Unable to free dev %s - not found\n",
+				dev_name(dev));
+		spin_unlock(&pd->lock);
+		return;
+	}
+
+	list_del(&uiom_dev->link);
+	pd->dev_cnt--;
+	spin_unlock(&pd->lock);
+
+	return iommu_detach_device(pd->domain, dev);
+}
+
+struct device **usnic_uiom_get_dev_list(struct usnic_uiom_pd *pd)
+{
+	struct usnic_uiom_dev *uiom_dev;
+	struct device **devs;
+	int i = 0;
+
+	spin_lock(&pd->lock);
+	devs = kcalloc(pd->dev_cnt + 1, sizeof(*devs), GFP_ATOMIC);
+	if (!devs) {
+		devs = ERR_PTR(-ENOMEM);
+		goto out;
+	}
+
+	list_for_each_entry(uiom_dev, &pd->devs, link) {
+		devs[i++] = uiom_dev->dev;
+	}
+out:
+	spin_unlock(&pd->lock);
+	return devs;
+}
+
+void usnic_uiom_free_dev_list(struct device **devs)
+{
+	kfree(devs);
+}
+
+int usnic_uiom_init(char *drv_name)
+{
+	if (!iommu_present(&pci_bus_type)) {
+		usnic_err("IOMMU required but not present or enabled.  USNIC QPs will not function w/o enabling IOMMU\n");
+		return -EPERM;
+	}
+
+	usnic_uiom_wq = create_workqueue(drv_name);
+	if (!usnic_uiom_wq) {
+		usnic_err("Unable to alloc wq for drv %s\n", drv_name);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+void usnic_uiom_fini(void)
+{
+	flush_workqueue(usnic_uiom_wq);
+	destroy_workqueue(usnic_uiom_wq);
+}
diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.h b/drivers/infiniband/hw/usnic/usnic_uiom.h
new file mode 100644
index 0000000..7044099
--- /dev/null
+++ b/drivers/infiniband/hw/usnic/usnic_uiom.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved.
+ *
+ * This program is free software; you may redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef USNIC_UIOM_H_
+#define USNIC_UIOM_H_
+
+#include <linux/list.h>
+#include <linux/scatterlist.h>
+
+#include "usnic_uiom_interval_tree.h"
+
+#define USNIC_UIOM_READ			(1)
+#define USNIC_UIOM_WRITE		(2)
+
+#define USNIC_UIOM_MAX_PD_CNT		(1000)
+#define USNIC_UIOM_MAX_MR_CNT		(1000000)
+#define USNIC_UIOM_MAX_MR_SIZE		(~0UL)
+#define USNIC_UIOM_PAGE_SIZE		(PAGE_SIZE)
+
+struct usnic_uiom_dev {
+	struct device			*dev;
+	struct list_head		link;
+};
+
+struct usnic_uiom_pd {
+	struct iommu_domain		*domain;
+	spinlock_t			lock;
+	struct rb_root			rb_root;
+	struct list_head		devs;
+	int				dev_cnt;
+};
+
+struct usnic_uiom_reg {
+	struct usnic_uiom_pd		*pd;
+	unsigned long			va;
+	size_t				length;
+	int				offset;
+	int				page_size;
+	int				writable;
+	struct list_head		chunk_list;
+	struct work_struct		work;
+	struct mm_struct		*mm;
+	unsigned long			diff;
+};
+
+struct usnic_uiom_chunk {
+	struct list_head		list;
+	int				nents;
+	struct scatterlist		page_list[0];
+};
+
+struct usnic_uiom_pd *usnic_uiom_alloc_pd(void);
+void usnic_uiom_dealloc_pd(struct usnic_uiom_pd *pd);
+int usnic_uiom_attach_dev_to_pd(struct usnic_uiom_pd *pd, struct device *dev);
+void usnic_uiom_detach_dev_from_pd(struct usnic_uiom_pd *pd,
+					struct device *dev);
+struct device **usnic_uiom_get_dev_list(struct usnic_uiom_pd *pd);
+void usnic_uiom_free_dev_list(struct device **devs);
+struct usnic_uiom_reg *usnic_uiom_reg_get(struct usnic_uiom_pd *pd,
+						unsigned long addr, size_t size,
+						int access, int dmasync);
+void usnic_uiom_reg_release(struct usnic_uiom_reg *uiomr, int closing);
+int usnic_uiom_init(char *drv_name);
+void usnic_uiom_fini(void);
+#endif /* USNIC_UIOM_H_ */
diff --git a/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.c b/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.c
new file mode 100644
index 0000000..3a4288e
--- /dev/null
+++ b/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.c
@@ -0,0 +1,254 @@
+/*
+ * Copyright (c) 2014, Cisco Systems, Inc. All rights reserved.
+ *
+ * This program is free software; you may redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/list_sort.h>
+
+#include <linux/interval_tree_generic.h>
+#include "usnic_uiom_interval_tree.h"
+
+#define START(node) ((node)->start)
+#define LAST(node) ((node)->last)
+
+#define MAKE_NODE(node, start, end, ref_cnt, flags, err, err_out)	\
+		do {							\
+			node = usnic_uiom_interval_node_alloc(start,	\
+					end, ref_cnt, flags);		\
+				if (!node) {				\
+					err = -ENOMEM;			\
+					goto err_out;			\
+				}					\
+		} while (0)
+
+#define MARK_FOR_ADD(node, list) (list_add_tail(&node->link, list))
+
+#define MAKE_NODE_AND_APPEND(node, start, end, ref_cnt, flags, err,	\
+				err_out, list)				\
+				do {					\
+					MAKE_NODE(node, start, end,	\
+						ref_cnt, flags, err,	\
+						err_out);		\
+					MARK_FOR_ADD(node, list);	\
+				} while (0)
+
+#define FLAGS_EQUAL(flags1, flags2, mask)				\
+			(((flags1) & (mask)) == ((flags2) & (mask)))
+
+static struct usnic_uiom_interval_node*
+usnic_uiom_interval_node_alloc(long int start, long int last, int ref_cnt,
+				int flags)
+{
+	struct usnic_uiom_interval_node *interval = kzalloc(sizeof(*interval),
+								GFP_ATOMIC);
+	if (!interval)
+		return NULL;
+
+	interval->start = start;
+	interval->last = last;
+	interval->flags = flags;
+	interval->ref_cnt = ref_cnt;
+
+	return interval;
+}
+
+static int interval_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+	struct usnic_uiom_interval_node *node_a, *node_b;
+
+	node_a = list_entry(a, struct usnic_uiom_interval_node, link);
+	node_b = list_entry(b, struct usnic_uiom_interval_node, link);
+
+	/* long to int */
+	if (node_a->start < node_b->start)
+		return -1;
+	else if (node_a->start > node_b->start)
+		return 1;
+
+	return 0;
+}
+
+static void
+find_intervals_intersection_sorted(struct rb_root *root, unsigned long start,
+					unsigned long last,
+					struct list_head *list)
+{
+	struct usnic_uiom_interval_node *node;
+
+	INIT_LIST_HEAD(list);
+
+	for (node = usnic_uiom_interval_tree_iter_first(root, start, last);
+		node;
+		node = usnic_uiom_interval_tree_iter_next(node, start, last))
+		list_add_tail(&node->link, list);
+
+	list_sort(NULL, list, interval_cmp);
+}
+
+int usnic_uiom_get_intervals_diff(unsigned long start, unsigned long last,
+					int flags, int flag_mask,
+					struct rb_root *root,
+					struct list_head *diff_set)
+{
+	struct usnic_uiom_interval_node *interval, *tmp;
+	int err = 0;
+	long int pivot = start;
+	LIST_HEAD(intersection_set);
+
+	INIT_LIST_HEAD(diff_set);
+
+	find_intervals_intersection_sorted(root, start, last,
+						&intersection_set);
+
+	list_for_each_entry(interval, &intersection_set, link) {
+		if (pivot < interval->start) {
+			MAKE_NODE_AND_APPEND(tmp, pivot, interval->start - 1,
+						1, flags, err, err_out,
+						diff_set);
+			pivot = interval->start;
+		}
+
+		/*
+		 * Invariant: Set [start, pivot] is either in diff_set or root,
+		 * but not in both.
+		 */
+
+		if (pivot > interval->last) {
+			continue;
+		} else if (pivot <= interval->last &&
+				FLAGS_EQUAL(interval->flags, flags,
+				flag_mask)) {
+			pivot = interval->last + 1;
+		}
+	}
+
+	if (pivot <= last)
+		MAKE_NODE_AND_APPEND(tmp, pivot, last, 1, flags, err, err_out,
+					diff_set);
+
+	return 0;
+
+err_out:
+	list_for_each_entry_safe(interval, tmp, diff_set, link) {
+		list_del(&interval->link);
+		kfree(interval);
+	}
+
+	return err;
+}
+
+void usnic_uiom_put_interval_set(struct list_head *intervals)
+{
+	struct usnic_uiom_interval_node *interval, *tmp;
+	list_for_each_entry_safe(interval, tmp, intervals, link)
+		kfree(interval);
+}
+
+int usnic_uiom_insert_interval(struct rb_root *root, unsigned long start,
+				unsigned long last, int flags)
+{
+	struct usnic_uiom_interval_node *interval, *tmp;
+	unsigned long istart, ilast;
+	int iref_cnt, iflags;
+	unsigned long lpivot = start;
+	int err = 0;
+	LIST_HEAD(to_add);
+	LIST_HEAD(intersection_set);
+
+	find_intervals_intersection_sorted(root, start, last,
+						&intersection_set);
+
+	list_for_each_entry(interval, &intersection_set, link) {
+		/*
+		 * Invariant - lpivot is the left edge of next interval to be
+		 * inserted
+		 */
+		istart = interval->start;
+		ilast = interval->last;
+		iref_cnt = interval->ref_cnt;
+		iflags = interval->flags;
+
+		if (istart < lpivot) {
+			MAKE_NODE_AND_APPEND(tmp, istart, lpivot - 1, iref_cnt,
+						iflags, err, err_out, &to_add);
+		} else if (istart > lpivot) {
+			MAKE_NODE_AND_APPEND(tmp, lpivot, istart - 1, 1, flags,
+						err, err_out, &to_add);
+			lpivot = istart;
+		} else {
+			lpivot = istart;
+		}
+
+		if (ilast > last) {
+			MAKE_NODE_AND_APPEND(tmp, lpivot, last, iref_cnt + 1,
+						iflags | flags, err, err_out,
+						&to_add);
+			MAKE_NODE_AND_APPEND(tmp, last + 1, ilast, iref_cnt,
+						iflags, err, err_out, &to_add);
+		} else {
+			MAKE_NODE_AND_APPEND(tmp, lpivot, ilast, iref_cnt + 1,
+						iflags | flags, err, err_out,
+						&to_add);
+		}
+
+		lpivot = ilast + 1;
+	}
+
+	if (lpivot <= last)
+		MAKE_NODE_AND_APPEND(tmp, lpivot, last, 1, flags, err, err_out,
+					&to_add);
+
+	list_for_each_entry_safe(interval, tmp, &intersection_set, link) {
+		usnic_uiom_interval_tree_remove(interval, root);
+		kfree(interval);
+	}
+
+	list_for_each_entry(interval, &to_add, link)
+		usnic_uiom_interval_tree_insert(interval, root);
+
+	return 0;
+
+err_out:
+	list_for_each_entry_safe(interval, tmp, &to_add, link)
+		kfree(interval);
+
+	return err;
+}
+
+void usnic_uiom_remove_interval(struct rb_root *root, unsigned long start,
+				unsigned long last, struct list_head *removed)
+{
+	struct usnic_uiom_interval_node *interval;
+
+	for (interval = usnic_uiom_interval_tree_iter_first(root, start, last);
+			interval;
+			interval = usnic_uiom_interval_tree_iter_next(interval,
+									start,
+									last)) {
+		if (--interval->ref_cnt == 0)
+			list_add_tail(&interval->link, removed);
+	}
+
+	list_for_each_entry(interval, removed, link)
+		usnic_uiom_interval_tree_remove(interval, root);
+}
+
+INTERVAL_TREE_DEFINE(struct usnic_uiom_interval_node, rb,
+			unsigned long, __subtree_last,
+			START, LAST, , usnic_uiom_interval_tree)
diff --git a/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.h b/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.h
new file mode 100644
index 0000000..d4f752e
--- /dev/null
+++ b/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved.
+ *
+ * This program is free software; you may redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef USNIC_UIOM_INTERVAL_TREE_H_
+#define USNIC_UIOM_INTERVAL_TREE_H_
+
+#include <linux/rbtree.h>
+
+struct usnic_uiom_interval_node {
+	struct rb_node			rb;
+	struct list_head		link;
+	unsigned long			start;
+	unsigned long			last;
+	unsigned long			__subtree_last;
+	unsigned int			ref_cnt;
+	int				flags;
+};
+
+extern void
+usnic_uiom_interval_tree_insert(struct usnic_uiom_interval_node *node,
+					struct rb_root *root);
+extern void
+usnic_uiom_interval_tree_remove(struct usnic_uiom_interval_node *node,
+					struct rb_root *root);
+extern struct usnic_uiom_interval_node *
+usnic_uiom_interval_tree_iter_first(struct rb_root *root,
+					unsigned long start,
+					unsigned long last);
+extern struct usnic_uiom_interval_node *
+usnic_uiom_interval_tree_iter_next(struct usnic_uiom_interval_node *node,
+			unsigned long start, unsigned long last);
+/*
+ * Inserts {start...last} into {root}.  If there are overlaps,
+ * nodes will be broken up and merged
+ */
+int usnic_uiom_insert_interval(struct rb_root *root,
+				unsigned long start, unsigned long last,
+				int flags);
+/*
+ * Removed {start...last} from {root}.  The nodes removed are returned in
+ * 'removed.' The caller is responsibile for freeing memory of nodes in
+ * 'removed.'
+ */
+void usnic_uiom_remove_interval(struct rb_root *root,
+				unsigned long start, unsigned long last,
+				struct list_head *removed);
+/*
+ * Returns {start...last} - {root} (relative complement of {start...last} in
+ * {root}) in diff_set sorted ascendingly
+ */
+int usnic_uiom_get_intervals_diff(unsigned long start,
+					unsigned long last, int flags,
+					int flag_mask,
+					struct rb_root *root,
+					struct list_head *diff_set);
+/* Call this to free diff_set returned by usnic_uiom_get_intervals_diff */
+void usnic_uiom_put_interval_set(struct list_head *intervals);
+#endif /* USNIC_UIOM_INTERVAL_TREE_H_ */
diff --git a/drivers/infiniband/hw/usnic/usnic_vnic.c b/drivers/infiniband/hw/usnic/usnic_vnic.c
new file mode 100644
index 0000000..656b88c
--- /dev/null
+++ b/drivers/infiniband/hw/usnic/usnic_vnic.c
@@ -0,0 +1,467 @@
+/*
+ * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved.
+ *
+ * This program is free software; you may redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/errno.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+
+#include "usnic_ib.h"
+#include "vnic_resource.h"
+#include "usnic_log.h"
+#include "usnic_vnic.h"
+
+struct usnic_vnic {
+	struct vnic_dev			*vdev;
+	struct vnic_dev_bar		bar[PCI_NUM_RESOURCES];
+	struct usnic_vnic_res_chunk	chunks[USNIC_VNIC_RES_TYPE_MAX];
+	spinlock_t			res_lock;
+};
+
+static enum vnic_res_type _to_vnic_res_type(enum usnic_vnic_res_type res_type)
+{
+#define DEFINE_USNIC_VNIC_RES_AT(usnic_vnic_res_t, vnic_res_type, desc, val) \
+		vnic_res_type,
+#define DEFINE_USNIC_VNIC_RES(usnic_vnic_res_t, vnic_res_type, desc) \
+		vnic_res_type,
+	static enum vnic_res_type usnic_vnic_type_2_vnic_type[] = {
+						USNIC_VNIC_RES_TYPES};
+#undef DEFINE_USNIC_VNIC_RES
+#undef DEFINE_USNIC_VNIC_RES_AT
+
+	if (res_type >= USNIC_VNIC_RES_TYPE_MAX)
+		return RES_TYPE_MAX;
+
+	return usnic_vnic_type_2_vnic_type[res_type];
+}
+
+const char *usnic_vnic_res_type_to_str(enum usnic_vnic_res_type res_type)
+{
+#define DEFINE_USNIC_VNIC_RES_AT(usnic_vnic_res_t, vnic_res_type, desc, val) \
+		desc,
+#define DEFINE_USNIC_VNIC_RES(usnic_vnic_res_t, vnic_res_type, desc) \
+		desc,
+	static const char * const usnic_vnic_res_type_desc[] = {
+						USNIC_VNIC_RES_TYPES};
+#undef DEFINE_USNIC_VNIC_RES
+#undef DEFINE_USNIC_VNIC_RES_AT
+
+	if (res_type >= USNIC_VNIC_RES_TYPE_MAX)
+		return "unknown";
+
+	return usnic_vnic_res_type_desc[res_type];
+
+}
+
+const char *usnic_vnic_pci_name(struct usnic_vnic *vnic)
+{
+	return pci_name(usnic_vnic_get_pdev(vnic));
+}
+
+int usnic_vnic_dump(struct usnic_vnic *vnic, char *buf,
+			int buf_sz,
+			void *hdr_obj,
+			int (*printtitle)(void *, char*, int),
+			int (*printcols)(char *, int),
+			int (*printrow)(void *, char *, int))
+{
+	struct usnic_vnic_res_chunk *chunk;
+	struct usnic_vnic_res *res;
+	struct vnic_dev_bar *bar0;
+	int i, j, offset;
+
+	offset = 0;
+	bar0 = usnic_vnic_get_bar(vnic, 0);
+	offset += scnprintf(buf + offset, buf_sz - offset,
+			"VF:%hu BAR0 bus_addr=%pa vaddr=0x%p size=%ld ",
+			usnic_vnic_get_index(vnic),
+			&bar0->bus_addr,
+			bar0->vaddr, bar0->len);
+	if (printtitle)
+		offset += printtitle(hdr_obj, buf + offset, buf_sz - offset);
+	offset += scnprintf(buf + offset, buf_sz - offset, "\n");
+	offset += scnprintf(buf + offset, buf_sz - offset,
+			"|RES\t|CTRL_PIN\t\t|IN_USE\t");
+	if (printcols)
+		offset += printcols(buf + offset, buf_sz - offset);
+	offset += scnprintf(buf + offset, buf_sz - offset, "\n");
+
+	spin_lock(&vnic->res_lock);
+	for (i = 0; i < ARRAY_SIZE(vnic->chunks); i++) {
+		chunk = &vnic->chunks[i];
+		for (j = 0; j < chunk->cnt; j++) {
+			res = chunk->res[j];
+			offset += scnprintf(buf + offset, buf_sz - offset,
+					"|%s[%u]\t|0x%p\t|%u\t",
+					usnic_vnic_res_type_to_str(res->type),
+					res->vnic_idx, res->ctrl, !!res->owner);
+			if (printrow) {
+				offset += printrow(res->owner, buf + offset,
+							buf_sz - offset);
+			}
+			offset += scnprintf(buf + offset, buf_sz - offset,
+						"\n");
+		}
+	}
+	spin_unlock(&vnic->res_lock);
+	return offset;
+}
+
+void usnic_vnic_res_spec_update(struct usnic_vnic_res_spec *spec,
+				enum usnic_vnic_res_type trgt_type,
+				u16 cnt)
+{
+	int i;
+
+	for (i = 0; i < USNIC_VNIC_RES_TYPE_MAX; i++) {
+		if (spec->resources[i].type == trgt_type) {
+			spec->resources[i].cnt = cnt;
+			return;
+		}
+	}
+
+	WARN_ON(1);
+}
+
+int usnic_vnic_res_spec_satisfied(const struct usnic_vnic_res_spec *min_spec,
+					struct usnic_vnic_res_spec *res_spec)
+{
+	int found, i, j;
+
+	for (i = 0; i < USNIC_VNIC_RES_TYPE_MAX; i++) {
+		found = 0;
+
+		for (j = 0; j < USNIC_VNIC_RES_TYPE_MAX; j++) {
+			if (res_spec->resources[i].type !=
+				min_spec->resources[i].type)
+				continue;
+			found = 1;
+			if (min_spec->resources[i].cnt >
+					res_spec->resources[i].cnt)
+				return -EINVAL;
+			break;
+		}
+
+		if (!found)
+			return -EINVAL;
+	}
+	return 0;
+}
+
+int usnic_vnic_spec_dump(char *buf, int buf_sz,
+				struct usnic_vnic_res_spec *res_spec)
+{
+	enum usnic_vnic_res_type res_type;
+	int res_cnt;
+	int i;
+	int offset = 0;
+
+	for (i = 0; i < USNIC_VNIC_RES_TYPE_MAX; i++) {
+		res_type = res_spec->resources[i].type;
+		res_cnt = res_spec->resources[i].cnt;
+		offset += scnprintf(buf + offset, buf_sz - offset,
+				"Res: %s Cnt: %d ",
+				usnic_vnic_res_type_to_str(res_type),
+				res_cnt);
+	}
+
+	return offset;
+}
+
+int usnic_vnic_check_room(struct usnic_vnic *vnic,
+				struct usnic_vnic_res_spec *res_spec)
+{
+	int i;
+	enum usnic_vnic_res_type res_type;
+	int res_cnt;
+
+	for (i = 0; i < USNIC_VNIC_RES_TYPE_MAX; i++) {
+		res_type = res_spec->resources[i].type;
+		res_cnt = res_spec->resources[i].cnt;
+
+		if (res_type == USNIC_VNIC_RES_TYPE_EOL)
+			break;
+
+		if (res_cnt > usnic_vnic_res_free_cnt(vnic, res_type))
+			return -EBUSY;
+	}
+
+	return 0;
+}
+
+int usnic_vnic_res_cnt(struct usnic_vnic *vnic,
+				enum usnic_vnic_res_type type)
+{
+	return vnic->chunks[type].cnt;
+}
+
+int usnic_vnic_res_free_cnt(struct usnic_vnic *vnic,
+				enum usnic_vnic_res_type type)
+{
+	return vnic->chunks[type].free_cnt;
+}
+
+struct usnic_vnic_res_chunk *
+usnic_vnic_get_resources(struct usnic_vnic *vnic, enum usnic_vnic_res_type type,
+				int cnt, void *owner)
+{
+	struct usnic_vnic_res_chunk *src, *ret;
+	struct usnic_vnic_res *res;
+	int i;
+
+	if (usnic_vnic_res_free_cnt(vnic, type) < cnt || cnt < 1 || !owner)
+		return ERR_PTR(-EINVAL);
+
+	ret = kzalloc(sizeof(*ret), GFP_ATOMIC);
+	if (!ret) {
+		usnic_err("Failed to allocate chunk for %s - Out of memory\n",
+				usnic_vnic_pci_name(vnic));
+		return ERR_PTR(-ENOMEM);
+	}
+
+	ret->res = kzalloc(sizeof(*(ret->res))*cnt, GFP_ATOMIC);
+	if (!ret->res) {
+		usnic_err("Failed to allocate resources for %s. Out of memory\n",
+				usnic_vnic_pci_name(vnic));
+		kfree(ret);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	spin_lock(&vnic->res_lock);
+	src = &vnic->chunks[type];
+	for (i = 0; i < src->cnt && ret->cnt < cnt; i++) {
+		res = src->res[i];
+		if (!res->owner) {
+			src->free_cnt--;
+			res->owner = owner;
+			ret->res[ret->cnt++] = res;
+		}
+	}
+
+	spin_unlock(&vnic->res_lock);
+	ret->type = type;
+	ret->vnic = vnic;
+	WARN_ON(ret->cnt != cnt);
+
+	return ret;
+}
+
+void usnic_vnic_put_resources(struct usnic_vnic_res_chunk *chunk)
+{
+
+	struct usnic_vnic_res *res;
+	int i;
+	struct usnic_vnic *vnic = chunk->vnic;
+
+	spin_lock(&vnic->res_lock);
+	while ((i = --chunk->cnt) >= 0) {
+		res = chunk->res[i];
+		chunk->res[i] = NULL;
+		res->owner = NULL;
+		vnic->chunks[res->type].free_cnt++;
+	}
+	spin_unlock(&vnic->res_lock);
+
+	kfree(chunk->res);
+	kfree(chunk);
+}
+
+u16 usnic_vnic_get_index(struct usnic_vnic *vnic)
+{
+	return usnic_vnic_get_pdev(vnic)->devfn - 1;
+}
+
+static int usnic_vnic_alloc_res_chunk(struct usnic_vnic *vnic,
+					enum usnic_vnic_res_type type,
+					struct usnic_vnic_res_chunk *chunk)
+{
+	int cnt, err, i;
+	struct usnic_vnic_res *res;
+
+	cnt = vnic_dev_get_res_count(vnic->vdev, _to_vnic_res_type(type));
+	if (cnt < 1)
+		return -EINVAL;
+
+	chunk->cnt = chunk->free_cnt = cnt;
+	chunk->res = kzalloc(sizeof(*(chunk->res))*cnt, GFP_KERNEL);
+	if (!chunk->res)
+		return -ENOMEM;
+
+	for (i = 0; i < cnt; i++) {
+		res = kzalloc(sizeof(*res), GFP_KERNEL);
+		if (!res) {
+			err = -ENOMEM;
+			goto fail;
+		}
+		res->type = type;
+		res->vnic_idx = i;
+		res->vnic = vnic;
+		res->ctrl = vnic_dev_get_res(vnic->vdev,
+						_to_vnic_res_type(type), i);
+		chunk->res[i] = res;
+	}
+
+	chunk->vnic = vnic;
+	return 0;
+fail:
+	for (i--; i >= 0; i--)
+		kfree(chunk->res[i]);
+	kfree(chunk->res);
+	return err;
+}
+
+static void usnic_vnic_free_res_chunk(struct usnic_vnic_res_chunk *chunk)
+{
+	int i;
+	for (i = 0; i < chunk->cnt; i++)
+		kfree(chunk->res[i]);
+	kfree(chunk->res);
+}
+
+static int usnic_vnic_discover_resources(struct pci_dev *pdev,
+						struct usnic_vnic *vnic)
+{
+	enum usnic_vnic_res_type res_type;
+	int i;
+	int err = 0;
+
+	for (i = 0; i < ARRAY_SIZE(vnic->bar); i++) {
+		if (!(pci_resource_flags(pdev, i) & IORESOURCE_MEM))
+			continue;
+		vnic->bar[i].len = pci_resource_len(pdev, i);
+		vnic->bar[i].vaddr = pci_iomap(pdev, i, vnic->bar[i].len);
+		if (!vnic->bar[i].vaddr) {
+			usnic_err("Cannot memory-map BAR %d, aborting\n",
+					i);
+			err = -ENODEV;
+			goto out_clean_bar;
+		}
+		vnic->bar[i].bus_addr = pci_resource_start(pdev, i);
+	}
+
+	vnic->vdev = vnic_dev_register(NULL, pdev, pdev, vnic->bar,
+			ARRAY_SIZE(vnic->bar));
+	if (!vnic->vdev) {
+		usnic_err("Failed to register device %s\n",
+				pci_name(pdev));
+		err = -EINVAL;
+		goto out_clean_bar;
+	}
+
+	for (res_type = USNIC_VNIC_RES_TYPE_EOL + 1;
+			res_type < USNIC_VNIC_RES_TYPE_MAX; res_type++) {
+		err = usnic_vnic_alloc_res_chunk(vnic, res_type,
+						&vnic->chunks[res_type]);
+		if (err) {
+			usnic_err("Failed to alloc res %s with err %d\n",
+					usnic_vnic_res_type_to_str(res_type),
+					err);
+			goto out_clean_chunks;
+		}
+	}
+
+	return 0;
+
+out_clean_chunks:
+	for (res_type--; res_type > USNIC_VNIC_RES_TYPE_EOL; res_type--)
+		usnic_vnic_free_res_chunk(&vnic->chunks[res_type]);
+	vnic_dev_unregister(vnic->vdev);
+out_clean_bar:
+	for (i = 0; i < ARRAY_SIZE(vnic->bar); i++) {
+		if (!(pci_resource_flags(pdev, i) & IORESOURCE_MEM))
+			continue;
+		if (!vnic->bar[i].vaddr)
+			break;
+
+		iounmap(vnic->bar[i].vaddr);
+	}
+
+	return err;
+}
+
+struct pci_dev *usnic_vnic_get_pdev(struct usnic_vnic *vnic)
+{
+	return vnic_dev_get_pdev(vnic->vdev);
+}
+
+struct vnic_dev_bar *usnic_vnic_get_bar(struct usnic_vnic *vnic,
+				int bar_num)
+{
+	return (bar_num < ARRAY_SIZE(vnic->bar)) ? &vnic->bar[bar_num] : NULL;
+}
+
+static void usnic_vnic_release_resources(struct usnic_vnic *vnic)
+{
+	int i;
+	struct pci_dev *pdev;
+	enum usnic_vnic_res_type res_type;
+
+	pdev = usnic_vnic_get_pdev(vnic);
+
+	for (res_type = USNIC_VNIC_RES_TYPE_EOL + 1;
+			res_type < USNIC_VNIC_RES_TYPE_MAX; res_type++)
+		usnic_vnic_free_res_chunk(&vnic->chunks[res_type]);
+
+	vnic_dev_unregister(vnic->vdev);
+
+	for (i = 0; i < ARRAY_SIZE(vnic->bar); i++) {
+		if (!(pci_resource_flags(pdev, i) & IORESOURCE_MEM))
+			continue;
+		iounmap(vnic->bar[i].vaddr);
+	}
+}
+
+struct usnic_vnic *usnic_vnic_alloc(struct pci_dev *pdev)
+{
+	struct usnic_vnic *vnic;
+	int err = 0;
+
+	if (!pci_is_enabled(pdev)) {
+		usnic_err("PCI dev %s is disabled\n", pci_name(pdev));
+		return ERR_PTR(-EINVAL);
+	}
+
+	vnic = kzalloc(sizeof(*vnic), GFP_KERNEL);
+	if (!vnic) {
+		usnic_err("Failed to alloc vnic for %s - out of memory\n",
+				pci_name(pdev));
+		return ERR_PTR(-ENOMEM);
+	}
+
+	spin_lock_init(&vnic->res_lock);
+
+	err = usnic_vnic_discover_resources(pdev, vnic);
+	if (err) {
+		usnic_err("Failed to discover %s resources with err %d\n",
+				pci_name(pdev), err);
+		goto out_free_vnic;
+	}
+
+	usnic_dbg("Allocated vnic for %s\n", usnic_vnic_pci_name(vnic));
+
+	return vnic;
+
+out_free_vnic:
+	kfree(vnic);
+
+	return ERR_PTR(err);
+}
+
+void usnic_vnic_free(struct usnic_vnic *vnic)
+{
+	usnic_vnic_release_resources(vnic);
+	kfree(vnic);
+}
diff --git a/drivers/infiniband/hw/usnic/usnic_vnic.h b/drivers/infiniband/hw/usnic/usnic_vnic.h
new file mode 100644
index 0000000..14d931a
--- /dev/null
+++ b/drivers/infiniband/hw/usnic/usnic_vnic.h
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved.
+ *
+ * This program is free software; you may redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef USNIC_VNIC_H_
+#define USNIC_VNIC_H_
+
+#include <linux/pci.h>
+
+#include "vnic_dev.h"
+
+/*                      =USNIC_VNIC_RES_TYPE= =VNIC_RES=   =DESC= */
+#define USNIC_VNIC_RES_TYPES \
+	DEFINE_USNIC_VNIC_RES_AT(EOL, RES_TYPE_EOL, "EOL", 0) \
+	DEFINE_USNIC_VNIC_RES(WQ, RES_TYPE_WQ, "WQ") \
+	DEFINE_USNIC_VNIC_RES(RQ, RES_TYPE_RQ, "RQ") \
+	DEFINE_USNIC_VNIC_RES(CQ, RES_TYPE_CQ, "CQ") \
+	DEFINE_USNIC_VNIC_RES(INTR, RES_TYPE_INTR_CTRL, "INT") \
+	DEFINE_USNIC_VNIC_RES(MAX, RES_TYPE_MAX, "MAX")\
+
+#define DEFINE_USNIC_VNIC_RES_AT(usnic_vnic_res_t, vnic_res_type, desc, val) \
+	USNIC_VNIC_RES_TYPE_##usnic_vnic_res_t = val,
+#define DEFINE_USNIC_VNIC_RES(usnic_vnic_res_t, vnic_res_type, desc) \
+	USNIC_VNIC_RES_TYPE_##usnic_vnic_res_t,
+enum usnic_vnic_res_type {
+	USNIC_VNIC_RES_TYPES
+};
+#undef DEFINE_USNIC_VNIC_RES
+#undef DEFINE_USNIC_VNIC_RES_AT
+
+struct usnic_vnic_res {
+	enum usnic_vnic_res_type	type;
+	unsigned int			vnic_idx;
+	struct usnic_vnic		*vnic;
+	void __iomem			*ctrl;
+	void				*owner;
+};
+
+struct usnic_vnic_res_chunk {
+	enum usnic_vnic_res_type	type;
+	int				cnt;
+	int				free_cnt;
+	struct usnic_vnic_res		**res;
+	struct usnic_vnic		*vnic;
+};
+
+struct usnic_vnic_res_desc {
+	enum usnic_vnic_res_type	type;
+	uint16_t			cnt;
+};
+
+struct usnic_vnic_res_spec {
+	struct usnic_vnic_res_desc resources[USNIC_VNIC_RES_TYPE_MAX];
+};
+
+const char *usnic_vnic_res_type_to_str(enum usnic_vnic_res_type res_type);
+const char *usnic_vnic_pci_name(struct usnic_vnic *vnic);
+int usnic_vnic_dump(struct usnic_vnic *vnic, char *buf, int buf_sz,
+			void *hdr_obj,
+			int (*printtitle)(void *, char*, int),
+			int (*printcols)(char *, int),
+			int (*printrow)(void *, char *, int));
+void usnic_vnic_res_spec_update(struct usnic_vnic_res_spec *spec,
+				enum usnic_vnic_res_type trgt_type,
+				u16 cnt);
+int usnic_vnic_res_spec_satisfied(const struct usnic_vnic_res_spec *min_spec,
+					struct usnic_vnic_res_spec *res_spec);
+int usnic_vnic_spec_dump(char *buf, int buf_sz,
+				struct usnic_vnic_res_spec *res_spec);
+int usnic_vnic_check_room(struct usnic_vnic *vnic,
+				struct usnic_vnic_res_spec *res_spec);
+int usnic_vnic_res_cnt(struct usnic_vnic *vnic,
+				enum usnic_vnic_res_type type);
+int usnic_vnic_res_free_cnt(struct usnic_vnic *vnic,
+				enum usnic_vnic_res_type type);
+struct usnic_vnic_res_chunk *
+usnic_vnic_get_resources(struct usnic_vnic *vnic,
+				enum usnic_vnic_res_type type,
+				int cnt,
+				void *owner);
+void usnic_vnic_put_resources(struct usnic_vnic_res_chunk *chunk);
+struct pci_dev *usnic_vnic_get_pdev(struct usnic_vnic *vnic);
+struct vnic_dev_bar *usnic_vnic_get_bar(struct usnic_vnic *vnic,
+				int bar_num);
+struct usnic_vnic *usnic_vnic_alloc(struct pci_dev *pdev);
+void usnic_vnic_free(struct usnic_vnic *vnic);
+u16 usnic_vnic_get_index(struct usnic_vnic *vnic);
+
+#endif /*!USNIC_VNIC_H_*/
diff --git a/drivers/infiniband/ulp/Makefile b/drivers/infiniband/ulp/Makefile
new file mode 100644
index 0000000..f3c7dcf
--- /dev/null
+++ b/drivers/infiniband/ulp/Makefile
@@ -0,0 +1,5 @@
+obj-$(CONFIG_INFINIBAND_IPOIB)		+= ipoib/
+obj-$(CONFIG_INFINIBAND_SRP)		+= srp/
+obj-$(CONFIG_INFINIBAND_SRPT)		+= srpt/
+obj-$(CONFIG_INFINIBAND_ISER)		+= iser/
+obj-$(CONFIG_INFINIBAND_ISERT)		+= isert/
diff --git a/drivers/infiniband/ulp/ipoib/Kconfig b/drivers/infiniband/ulp/ipoib/Kconfig
new file mode 100644
index 0000000..cda8eac
--- /dev/null
+++ b/drivers/infiniband/ulp/ipoib/Kconfig
@@ -0,0 +1,49 @@
+config INFINIBAND_IPOIB
+	tristate "IP-over-InfiniBand"
+	depends on NETDEVICES && INET && (IPV6 || IPV6=n)
+	---help---
+	  Support for the IP-over-InfiniBand protocol (IPoIB). This
+	  transports IP packets over InfiniBand so you can use your IB
+	  device as a fancy NIC.
+
+	  See Documentation/infiniband/ipoib.txt for more information
+
+config INFINIBAND_IPOIB_CM
+	bool "IP-over-InfiniBand Connected Mode support"
+	depends on INFINIBAND_IPOIB
+	default n
+	---help---
+	  This option enables support for IPoIB connected mode.  After
+	  enabling this option, you need to switch to connected mode
+	  through /sys/class/net/ibXXX/mode to actually create
+	  connections, and then increase the interface MTU with
+	  e.g. ifconfig ib0 mtu 65520.
+
+	  WARNING: Enabling connected mode will trigger some packet
+	  drops for multicast and UD mode traffic from this interface,
+	  unless you limit mtu for these destinations to 2044.
+
+config INFINIBAND_IPOIB_DEBUG
+	bool "IP-over-InfiniBand debugging" if EXPERT
+	depends on INFINIBAND_IPOIB
+	default y
+	---help---
+	  This option causes debugging code to be compiled into the
+	  IPoIB driver.  The output can be turned on via the
+	  debug_level and mcast_debug_level module parameters (which
+	  can also be set after the driver is loaded through sysfs).
+
+	  This option also creates a directory tree under ipoib/ in
+	  debugfs, which contains files that expose debugging
+	  information about IB multicast groups used by the IPoIB
+	  driver.
+
+config INFINIBAND_IPOIB_DEBUG_DATA
+	bool "IP-over-InfiniBand data path debugging"
+	depends on INFINIBAND_IPOIB_DEBUG
+	---help---
+	  This option compiles debugging code into the data path
+	  of the IPoIB driver.  The output can be turned on via the
+	  data_debug_level module parameter; however, even with output
+	  turned off, this debugging code will have some performance
+	  impact.
diff --git a/drivers/infiniband/ulp/ipoib/Makefile b/drivers/infiniband/ulp/ipoib/Makefile
new file mode 100644
index 0000000..e5430dd
--- /dev/null
+++ b/drivers/infiniband/ulp/ipoib/Makefile
@@ -0,0 +1,12 @@
+obj-$(CONFIG_INFINIBAND_IPOIB)			+= ib_ipoib.o
+
+ib_ipoib-y					:= ipoib_main.o \
+						   ipoib_ib.o \
+						   ipoib_multicast.o \
+						   ipoib_verbs.o \
+						   ipoib_vlan.o \
+						   ipoib_ethtool.o \
+						   ipoib_netlink.o
+ib_ipoib-$(CONFIG_INFINIBAND_IPOIB_CM)		+= ipoib_cm.o
+ib_ipoib-$(CONFIG_INFINIBAND_IPOIB_DEBUG)	+= ipoib_fs.o
+
diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
new file mode 100644
index 0000000..d7562be
--- /dev/null
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -0,0 +1,773 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _IPOIB_H
+#define _IPOIB_H
+
+#include <linux/list.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/workqueue.h>
+#include <linux/kref.h>
+#include <linux/if_infiniband.h>
+#include <linux/mutex.h>
+
+#include <net/neighbour.h>
+#include <net/sch_generic.h>
+
+#include <linux/atomic.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_pack.h>
+#include <rdma/ib_sa.h>
+#include <linux/sched.h>
+
+/* constants */
+
+enum ipoib_flush_level {
+	IPOIB_FLUSH_LIGHT,
+	IPOIB_FLUSH_NORMAL,
+	IPOIB_FLUSH_HEAVY
+};
+
+enum {
+	IPOIB_ENCAP_LEN		  = 4,
+
+	IPOIB_UD_HEAD_SIZE	  = IB_GRH_BYTES + IPOIB_ENCAP_LEN,
+	IPOIB_UD_RX_SG		  = 2, /* max buffer needed for 4K mtu */
+
+	IPOIB_CM_MTU		  = 0x10000 - 0x10, /* padding to align header to 16 */
+	IPOIB_CM_BUF_SIZE	  = IPOIB_CM_MTU  + IPOIB_ENCAP_LEN,
+	IPOIB_CM_HEAD_SIZE	  = IPOIB_CM_BUF_SIZE % PAGE_SIZE,
+	IPOIB_CM_RX_SG		  = ALIGN(IPOIB_CM_BUF_SIZE, PAGE_SIZE) / PAGE_SIZE,
+	IPOIB_RX_RING_SIZE	  = 256,
+	IPOIB_TX_RING_SIZE	  = 128,
+	IPOIB_MAX_QUEUE_SIZE	  = 8192,
+	IPOIB_MIN_QUEUE_SIZE	  = 2,
+	IPOIB_CM_MAX_CONN_QP	  = 4096,
+
+	IPOIB_NUM_WC		  = 4,
+
+	IPOIB_MAX_PATH_REC_QUEUE  = 3,
+	IPOIB_MAX_MCAST_QUEUE	  = 3,
+
+	IPOIB_FLAG_OPER_UP	  = 0,
+	IPOIB_FLAG_INITIALIZED	  = 1,
+	IPOIB_FLAG_ADMIN_UP	  = 2,
+	IPOIB_PKEY_ASSIGNED	  = 3,
+	IPOIB_FLAG_SUBINTERFACE	  = 5,
+	IPOIB_MCAST_RUN		  = 6,
+	IPOIB_STOP_REAPER	  = 7,
+	IPOIB_FLAG_ADMIN_CM	  = 9,
+	IPOIB_FLAG_UMCAST	  = 10,
+	IPOIB_STOP_NEIGH_GC	  = 11,
+	IPOIB_NEIGH_TBL_FLUSH	  = 12,
+
+	IPOIB_MAX_BACKOFF_SECONDS = 16,
+
+	IPOIB_MCAST_FLAG_FOUND	  = 0,	/* used in set_multicast_list */
+	IPOIB_MCAST_FLAG_SENDONLY = 1,
+	IPOIB_MCAST_FLAG_BUSY	  = 2,	/* joining or already joined */
+	IPOIB_MCAST_FLAG_ATTACHED = 3,
+	IPOIB_MCAST_JOIN_STARTED  = 4,
+
+	MAX_SEND_CQE		  = 16,
+	IPOIB_CM_COPYBREAK	  = 256,
+
+	IPOIB_NON_CHILD		  = 0,
+	IPOIB_LEGACY_CHILD	  = 1,
+	IPOIB_RTNL_CHILD	  = 2,
+};
+
+#define	IPOIB_OP_RECV   (1ul << 31)
+#ifdef CONFIG_INFINIBAND_IPOIB_CM
+#define	IPOIB_OP_CM     (1ul << 30)
+#else
+#define	IPOIB_OP_CM     (0)
+#endif
+
+#define IPOIB_QPN_MASK ((__force u32) cpu_to_be32(0xFFFFFF))
+
+/* structs */
+
+struct ipoib_header {
+	__be16	proto;
+	u16	reserved;
+};
+
+struct ipoib_cb {
+	struct qdisc_skb_cb	qdisc_cb;
+	u8			hwaddr[INFINIBAND_ALEN];
+};
+
+static inline struct ipoib_cb *ipoib_skb_cb(const struct sk_buff *skb)
+{
+	BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct ipoib_cb));
+	return (struct ipoib_cb *)skb->cb;
+}
+
+/* Used for all multicast joins (broadcast, IPv4 mcast and IPv6 mcast) */
+struct ipoib_mcast {
+	struct ib_sa_mcmember_rec mcmember;
+	struct ib_sa_multicast	 *mc;
+	struct ipoib_ah		 *ah;
+
+	struct rb_node    rb_node;
+	struct list_head  list;
+
+	unsigned long created;
+	unsigned long backoff;
+
+	unsigned long flags;
+	unsigned char logcount;
+
+	struct list_head  neigh_list;
+
+	struct sk_buff_head pkt_queue;
+
+	struct net_device *dev;
+	struct completion done;
+};
+
+struct ipoib_rx_buf {
+	struct sk_buff *skb;
+	u64		mapping[IPOIB_UD_RX_SG];
+};
+
+struct ipoib_tx_buf {
+	struct sk_buff *skb;
+	u64		mapping[MAX_SKB_FRAGS + 1];
+};
+
+struct ipoib_cm_tx_buf {
+	struct sk_buff *skb;
+	u64		mapping;
+};
+
+struct ib_cm_id;
+
+struct ipoib_cm_data {
+	__be32 qpn; /* High byte MUST be ignored on receive */
+	__be32 mtu;
+};
+
+/*
+ * Quoting 10.3.1 Queue Pair and EE Context States:
+ *
+ * Note, for QPs that are associated with an SRQ, the Consumer should take the
+ * QP through the Error State before invoking a Destroy QP or a Modify QP to the
+ * Reset State.  The Consumer may invoke the Destroy QP without first performing
+ * a Modify QP to the Error State and waiting for the Affiliated Asynchronous
+ * Last WQE Reached Event. However, if the Consumer does not wait for the
+ * Affiliated Asynchronous Last WQE Reached Event, then WQE and Data Segment
+ * leakage may occur. Therefore, it is good programming practice to tear down a
+ * QP that is associated with an SRQ by using the following process:
+ *
+ * - Put the QP in the Error State
+ * - Wait for the Affiliated Asynchronous Last WQE Reached Event;
+ * - either:
+ *       drain the CQ by invoking the Poll CQ verb and either wait for CQ
+ *       to be empty or the number of Poll CQ operations has exceeded
+ *       CQ capacity size;
+ * - or
+ *       post another WR that completes on the same CQ and wait for this
+ *       WR to return as a WC;
+ * - and then invoke a Destroy QP or Reset QP.
+ *
+ * We use the second option and wait for a completion on the
+ * same CQ before destroying QPs attached to our SRQ.
+ */
+
+enum ipoib_cm_state {
+	IPOIB_CM_RX_LIVE,
+	IPOIB_CM_RX_ERROR, /* Ignored by stale task */
+	IPOIB_CM_RX_FLUSH  /* Last WQE Reached event observed */
+};
+
+struct ipoib_cm_rx {
+	struct ib_cm_id	       *id;
+	struct ib_qp	       *qp;
+	struct ipoib_cm_rx_buf *rx_ring;
+	struct list_head	list;
+	struct net_device      *dev;
+	unsigned long		jiffies;
+	enum ipoib_cm_state	state;
+	int			recv_count;
+};
+
+struct ipoib_cm_tx {
+	struct ib_cm_id	    *id;
+	struct ib_qp	    *qp;
+	struct list_head     list;
+	struct net_device   *dev;
+	struct ipoib_neigh  *neigh;
+	struct ipoib_path   *path;
+	struct ipoib_cm_tx_buf *tx_ring;
+	unsigned	     tx_head;
+	unsigned	     tx_tail;
+	unsigned long	     flags;
+	u32		     mtu;
+};
+
+struct ipoib_cm_rx_buf {
+	struct sk_buff *skb;
+	u64 mapping[IPOIB_CM_RX_SG];
+};
+
+struct ipoib_cm_dev_priv {
+	struct ib_srq	       *srq;
+	struct ipoib_cm_rx_buf *srq_ring;
+	struct ib_cm_id	       *id;
+	struct list_head	passive_ids;   /* state: LIVE */
+	struct list_head	rx_error_list; /* state: ERROR */
+	struct list_head	rx_flush_list; /* state: FLUSH, drain not started */
+	struct list_head	rx_drain_list; /* state: FLUSH, drain started */
+	struct list_head	rx_reap_list;  /* state: FLUSH, drain done */
+	struct work_struct      start_task;
+	struct work_struct      reap_task;
+	struct work_struct      skb_task;
+	struct work_struct      rx_reap_task;
+	struct delayed_work     stale_task;
+	struct sk_buff_head     skb_queue;
+	struct list_head	start_list;
+	struct list_head	reap_list;
+	struct ib_wc		ibwc[IPOIB_NUM_WC];
+	struct ib_sge		rx_sge[IPOIB_CM_RX_SG];
+	struct ib_recv_wr       rx_wr;
+	int			nonsrq_conn_qp;
+	int			max_cm_mtu;
+	int			num_frags;
+};
+
+struct ipoib_ethtool_st {
+	u16     coalesce_usecs;
+	u16     max_coalesced_frames;
+};
+
+struct ipoib_neigh_table;
+
+struct ipoib_neigh_hash {
+	struct ipoib_neigh_table       *ntbl;
+	struct ipoib_neigh __rcu      **buckets;
+	struct rcu_head			rcu;
+	u32				mask;
+	u32				size;
+};
+
+struct ipoib_neigh_table {
+	struct ipoib_neigh_hash __rcu  *htbl;
+	atomic_t			entries;
+	struct completion		flushed;
+	struct completion		deleted;
+};
+
+/*
+ * Device private locking: network stack tx_lock protects members used
+ * in TX fast path, lock protects everything else.  lock nests inside
+ * of tx_lock (ie tx_lock must be acquired first if needed).
+ */
+struct ipoib_dev_priv {
+	spinlock_t lock;
+
+	struct net_device *dev;
+
+	struct napi_struct napi;
+
+	unsigned long flags;
+
+	struct rw_semaphore vlan_rwsem;
+
+	struct rb_root  path_tree;
+	struct list_head path_list;
+
+	struct ipoib_neigh_table ntbl;
+
+	struct ipoib_mcast *broadcast;
+	struct list_head multicast_list;
+	struct rb_root multicast_tree;
+
+	struct delayed_work mcast_task;
+	struct work_struct carrier_on_task;
+	struct work_struct flush_light;
+	struct work_struct flush_normal;
+	struct work_struct flush_heavy;
+	struct work_struct restart_task;
+	struct delayed_work ah_reap_task;
+	struct delayed_work neigh_reap_task;
+	struct ib_device *ca;
+	u8		  port;
+	u16		  pkey;
+	u16		  pkey_index;
+	struct ib_pd	 *pd;
+	struct ib_mr	 *mr;
+	struct ib_cq	 *recv_cq;
+	struct ib_cq	 *send_cq;
+	struct ib_qp	 *qp;
+	u32		  qkey;
+
+	union ib_gid local_gid;
+	u16	     local_lid;
+
+	unsigned int admin_mtu;
+	unsigned int mcast_mtu;
+	unsigned int max_ib_mtu;
+
+	struct ipoib_rx_buf *rx_ring;
+
+	struct ipoib_tx_buf *tx_ring;
+	unsigned	     tx_head;
+	unsigned	     tx_tail;
+	struct ib_sge	     tx_sge[MAX_SKB_FRAGS + 1];
+	struct ib_send_wr    tx_wr;
+	unsigned	     tx_outstanding;
+	struct ib_wc	     send_wc[MAX_SEND_CQE];
+
+	struct ib_recv_wr    rx_wr;
+	struct ib_sge	     rx_sge[IPOIB_UD_RX_SG];
+
+	struct ib_wc ibwc[IPOIB_NUM_WC];
+
+	struct list_head dead_ahs;
+
+	struct ib_event_handler event_handler;
+
+	struct net_device *parent;
+	struct list_head child_intfs;
+	struct list_head list;
+	int    child_type;
+
+#ifdef CONFIG_INFINIBAND_IPOIB_CM
+	struct ipoib_cm_dev_priv cm;
+#endif
+
+#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
+	struct list_head fs_list;
+	struct dentry *mcg_dentry;
+	struct dentry *path_dentry;
+#endif
+	int	hca_caps;
+	struct ipoib_ethtool_st ethtool;
+	struct timer_list poll_timer;
+};
+
+struct ipoib_ah {
+	struct net_device *dev;
+	struct ib_ah	  *ah;
+	struct list_head   list;
+	struct kref	   ref;
+	unsigned	   last_send;
+};
+
+struct ipoib_path {
+	struct net_device    *dev;
+	struct ib_sa_path_rec pathrec;
+	struct ipoib_ah      *ah;
+	struct sk_buff_head   queue;
+
+	struct list_head      neigh_list;
+
+	int		      query_id;
+	struct ib_sa_query   *query;
+	struct completion     done;
+
+	struct rb_node	      rb_node;
+	struct list_head      list;
+	int  		      valid;
+};
+
+struct ipoib_neigh {
+	struct ipoib_ah    *ah;
+#ifdef CONFIG_INFINIBAND_IPOIB_CM
+	struct ipoib_cm_tx *cm;
+#endif
+	u8     daddr[INFINIBAND_ALEN];
+	struct sk_buff_head queue;
+
+	struct net_device *dev;
+
+	struct list_head    list;
+	struct ipoib_neigh __rcu *hnext;
+	struct rcu_head     rcu;
+	atomic_t	    refcnt;
+	unsigned long       alive;
+};
+
+#define IPOIB_UD_MTU(ib_mtu)		(ib_mtu - IPOIB_ENCAP_LEN)
+#define IPOIB_UD_BUF_SIZE(ib_mtu)	(ib_mtu + IB_GRH_BYTES)
+
+static inline int ipoib_ud_need_sg(unsigned int ib_mtu)
+{
+	return IPOIB_UD_BUF_SIZE(ib_mtu) > PAGE_SIZE;
+}
+
+void ipoib_neigh_dtor(struct ipoib_neigh *neigh);
+static inline void ipoib_neigh_put(struct ipoib_neigh *neigh)
+{
+	if (atomic_dec_and_test(&neigh->refcnt))
+		ipoib_neigh_dtor(neigh);
+}
+struct ipoib_neigh *ipoib_neigh_get(struct net_device *dev, u8 *daddr);
+struct ipoib_neigh *ipoib_neigh_alloc(u8 *daddr,
+				      struct net_device *dev);
+void ipoib_neigh_free(struct ipoib_neigh *neigh);
+void ipoib_del_neighs_by_gid(struct net_device *dev, u8 *gid);
+
+extern struct workqueue_struct *ipoib_workqueue;
+
+/* functions */
+
+int ipoib_poll(struct napi_struct *napi, int budget);
+void ipoib_ib_completion(struct ib_cq *cq, void *dev_ptr);
+void ipoib_send_comp_handler(struct ib_cq *cq, void *dev_ptr);
+
+struct ipoib_ah *ipoib_create_ah(struct net_device *dev,
+				 struct ib_pd *pd, struct ib_ah_attr *attr);
+void ipoib_free_ah(struct kref *kref);
+static inline void ipoib_put_ah(struct ipoib_ah *ah)
+{
+	kref_put(&ah->ref, ipoib_free_ah);
+}
+int ipoib_open(struct net_device *dev);
+int ipoib_add_pkey_attr(struct net_device *dev);
+int ipoib_add_umcast_attr(struct net_device *dev);
+
+void ipoib_send(struct net_device *dev, struct sk_buff *skb,
+		struct ipoib_ah *address, u32 qpn);
+void ipoib_reap_ah(struct work_struct *work);
+
+void ipoib_mark_paths_invalid(struct net_device *dev);
+void ipoib_flush_paths(struct net_device *dev);
+struct ipoib_dev_priv *ipoib_intf_alloc(const char *format);
+
+int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port);
+void ipoib_ib_dev_flush_light(struct work_struct *work);
+void ipoib_ib_dev_flush_normal(struct work_struct *work);
+void ipoib_ib_dev_flush_heavy(struct work_struct *work);
+void ipoib_pkey_event(struct work_struct *work);
+void ipoib_ib_dev_cleanup(struct net_device *dev);
+
+int ipoib_ib_dev_open(struct net_device *dev, int flush);
+int ipoib_ib_dev_up(struct net_device *dev);
+int ipoib_ib_dev_down(struct net_device *dev, int flush);
+int ipoib_ib_dev_stop(struct net_device *dev, int flush);
+void ipoib_pkey_dev_check_presence(struct net_device *dev);
+
+int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port);
+void ipoib_dev_cleanup(struct net_device *dev);
+
+void ipoib_mcast_join_task(struct work_struct *work);
+void ipoib_mcast_carrier_on_task(struct work_struct *work);
+void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb);
+
+void ipoib_mcast_restart_task(struct work_struct *work);
+int ipoib_mcast_start_thread(struct net_device *dev);
+int ipoib_mcast_stop_thread(struct net_device *dev, int flush);
+
+void ipoib_mcast_dev_down(struct net_device *dev);
+void ipoib_mcast_dev_flush(struct net_device *dev);
+
+#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
+struct ipoib_mcast_iter *ipoib_mcast_iter_init(struct net_device *dev);
+int ipoib_mcast_iter_next(struct ipoib_mcast_iter *iter);
+void ipoib_mcast_iter_read(struct ipoib_mcast_iter *iter,
+				  union ib_gid *gid,
+				  unsigned long *created,
+				  unsigned int *queuelen,
+				  unsigned int *complete,
+				  unsigned int *send_only);
+
+struct ipoib_path_iter *ipoib_path_iter_init(struct net_device *dev);
+int ipoib_path_iter_next(struct ipoib_path_iter *iter);
+void ipoib_path_iter_read(struct ipoib_path_iter *iter,
+			  struct ipoib_path *path);
+#endif
+
+int ipoib_mcast_attach(struct net_device *dev, u16 mlid,
+		       union ib_gid *mgid, int set_qkey);
+
+int ipoib_init_qp(struct net_device *dev);
+int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca);
+void ipoib_transport_dev_cleanup(struct net_device *dev);
+
+void ipoib_event(struct ib_event_handler *handler,
+		 struct ib_event *record);
+
+int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey);
+int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey);
+
+int __ipoib_vlan_add(struct ipoib_dev_priv *ppriv, struct ipoib_dev_priv *priv,
+		     u16 pkey, int child_type);
+
+int  __init ipoib_netlink_init(void);
+void __exit ipoib_netlink_fini(void);
+
+void ipoib_set_umcast(struct net_device *ndev, int umcast_val);
+int  ipoib_set_mode(struct net_device *dev, const char *buf);
+
+void ipoib_setup(struct net_device *dev);
+
+void ipoib_pkey_open(struct ipoib_dev_priv *priv);
+void ipoib_drain_cq(struct net_device *dev);
+
+void ipoib_set_ethtool_ops(struct net_device *dev);
+int ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca);
+
+#define IPOIB_FLAGS_RC		0x80
+#define IPOIB_FLAGS_UC		0x40
+
+/* We don't support UC connections at the moment */
+#define IPOIB_CM_SUPPORTED(ha)   (ha[0] & (IPOIB_FLAGS_RC))
+
+#ifdef CONFIG_INFINIBAND_IPOIB_CM
+
+extern int ipoib_max_conn_qp;
+
+static inline int ipoib_cm_admin_enabled(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	return IPOIB_CM_SUPPORTED(dev->dev_addr) &&
+		test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
+}
+
+static inline int ipoib_cm_enabled(struct net_device *dev, u8 *hwaddr)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	return IPOIB_CM_SUPPORTED(hwaddr) &&
+		test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
+}
+
+static inline int ipoib_cm_up(struct ipoib_neigh *neigh)
+
+{
+	return test_bit(IPOIB_FLAG_OPER_UP, &neigh->cm->flags);
+}
+
+static inline struct ipoib_cm_tx *ipoib_cm_get(struct ipoib_neigh *neigh)
+{
+	return neigh->cm;
+}
+
+static inline void ipoib_cm_set(struct ipoib_neigh *neigh, struct ipoib_cm_tx *tx)
+{
+	neigh->cm = tx;
+}
+
+static inline int ipoib_cm_has_srq(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	return !!priv->cm.srq;
+}
+
+static inline unsigned int ipoib_cm_max_mtu(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	return priv->cm.max_cm_mtu;
+}
+
+void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx);
+int ipoib_cm_dev_open(struct net_device *dev);
+void ipoib_cm_dev_stop(struct net_device *dev);
+int ipoib_cm_dev_init(struct net_device *dev);
+int ipoib_cm_add_mode_attr(struct net_device *dev);
+void ipoib_cm_dev_cleanup(struct net_device *dev);
+struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path *path,
+				    struct ipoib_neigh *neigh);
+void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx);
+void ipoib_cm_skb_too_long(struct net_device *dev, struct sk_buff *skb,
+			   unsigned int mtu);
+void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc);
+void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc);
+#else
+
+struct ipoib_cm_tx;
+
+#define ipoib_max_conn_qp 0
+
+static inline int ipoib_cm_admin_enabled(struct net_device *dev)
+{
+	return 0;
+}
+static inline int ipoib_cm_enabled(struct net_device *dev, u8 *hwaddr)
+
+{
+	return 0;
+}
+
+static inline int ipoib_cm_up(struct ipoib_neigh *neigh)
+
+{
+	return 0;
+}
+
+static inline struct ipoib_cm_tx *ipoib_cm_get(struct ipoib_neigh *neigh)
+{
+	return NULL;
+}
+
+static inline void ipoib_cm_set(struct ipoib_neigh *neigh, struct ipoib_cm_tx *tx)
+{
+}
+
+static inline int ipoib_cm_has_srq(struct net_device *dev)
+{
+	return 0;
+}
+
+static inline unsigned int ipoib_cm_max_mtu(struct net_device *dev)
+{
+	return 0;
+}
+
+static inline
+void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx)
+{
+	return;
+}
+
+static inline
+int ipoib_cm_dev_open(struct net_device *dev)
+{
+	return 0;
+}
+
+static inline
+void ipoib_cm_dev_stop(struct net_device *dev)
+{
+	return;
+}
+
+static inline
+int ipoib_cm_dev_init(struct net_device *dev)
+{
+	return -ENOSYS;
+}
+
+static inline
+void ipoib_cm_dev_cleanup(struct net_device *dev)
+{
+	return;
+}
+
+static inline
+struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path *path,
+				    struct ipoib_neigh *neigh)
+{
+	return NULL;
+}
+
+static inline
+void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx)
+{
+	return;
+}
+
+static inline
+int ipoib_cm_add_mode_attr(struct net_device *dev)
+{
+	return 0;
+}
+
+static inline void ipoib_cm_skb_too_long(struct net_device *dev, struct sk_buff *skb,
+					 unsigned int mtu)
+{
+	dev_kfree_skb_any(skb);
+}
+
+static inline void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
+{
+}
+
+static inline void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
+{
+}
+#endif
+
+#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
+void ipoib_create_debug_files(struct net_device *dev);
+void ipoib_delete_debug_files(struct net_device *dev);
+int ipoib_register_debugfs(void);
+void ipoib_unregister_debugfs(void);
+#else
+static inline void ipoib_create_debug_files(struct net_device *dev) { }
+static inline void ipoib_delete_debug_files(struct net_device *dev) { }
+static inline int ipoib_register_debugfs(void) { return 0; }
+static inline void ipoib_unregister_debugfs(void) { }
+#endif
+
+#define ipoib_printk(level, priv, format, arg...)	\
+	printk(level "%s: " format, ((struct ipoib_dev_priv *) priv)->dev->name , ## arg)
+#define ipoib_warn(priv, format, arg...)		\
+	ipoib_printk(KERN_WARNING, priv, format , ## arg)
+
+extern int ipoib_sendq_size;
+extern int ipoib_recvq_size;
+
+extern struct ib_sa_client ipoib_sa_client;
+
+#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
+extern int ipoib_debug_level;
+
+#define ipoib_dbg(priv, format, arg...)			\
+	do {						\
+		if (ipoib_debug_level > 0)			\
+			ipoib_printk(KERN_DEBUG, priv, format , ## arg); \
+	} while (0)
+#define ipoib_dbg_mcast(priv, format, arg...)		\
+	do {						\
+		if (mcast_debug_level > 0)		\
+			ipoib_printk(KERN_DEBUG, priv, format , ## arg); \
+	} while (0)
+#else /* CONFIG_INFINIBAND_IPOIB_DEBUG */
+#define ipoib_dbg(priv, format, arg...)			\
+	do { (void) (priv); } while (0)
+#define ipoib_dbg_mcast(priv, format, arg...)		\
+	do { (void) (priv); } while (0)
+#endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */
+
+#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA
+#define ipoib_dbg_data(priv, format, arg...)		\
+	do {						\
+		if (data_debug_level > 0)		\
+			ipoib_printk(KERN_DEBUG, priv, format , ## arg); \
+	} while (0)
+#else /* CONFIG_INFINIBAND_IPOIB_DEBUG_DATA */
+#define ipoib_dbg_data(priv, format, arg...)		\
+	do { (void) (priv); } while (0)
+#endif /* CONFIG_INFINIBAND_IPOIB_DEBUG_DATA */
+
+#define IPOIB_QPN(ha) (be32_to_cpup((__be32 *) ha) & 0xffffff)
+
+extern const char ipoib_driver_version[];
+
+#endif /* _IPOIB_H */
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
new file mode 100644
index 0000000..933efce
--- /dev/null
+++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -0,0 +1,1615 @@
+/*
+ * Copyright (c) 2006 Mellanox Technologies. All rights reserved
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <rdma/ib_cm.h>
+#include <net/dst.h>
+#include <net/icmp.h>
+#include <linux/icmpv6.h>
+#include <linux/delay.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/moduleparam.h>
+
+#include "ipoib.h"
+
+int ipoib_max_conn_qp = 128;
+
+module_param_named(max_nonsrq_conn_qp, ipoib_max_conn_qp, int, 0444);
+MODULE_PARM_DESC(max_nonsrq_conn_qp,
+		 "Max number of connected-mode QPs per interface "
+		 "(applied only if shared receive queue is not available)");
+
+#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA
+static int data_debug_level;
+
+module_param_named(cm_data_debug_level, data_debug_level, int, 0644);
+MODULE_PARM_DESC(cm_data_debug_level,
+		 "Enable data path debug tracing for connected mode if > 0");
+#endif
+
+#define IPOIB_CM_IETF_ID 0x1000000000000000ULL
+
+#define IPOIB_CM_RX_UPDATE_TIME (256 * HZ)
+#define IPOIB_CM_RX_TIMEOUT     (2 * 256 * HZ)
+#define IPOIB_CM_RX_DELAY       (3 * 256 * HZ)
+#define IPOIB_CM_RX_UPDATE_MASK (0x3)
+
+static struct ib_qp_attr ipoib_cm_err_attr = {
+	.qp_state = IB_QPS_ERR
+};
+
+#define IPOIB_CM_RX_DRAIN_WRID 0xffffffff
+
+static struct ib_send_wr ipoib_cm_rx_drain_wr = {
+	.wr_id = IPOIB_CM_RX_DRAIN_WRID,
+	.opcode = IB_WR_SEND,
+};
+
+static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id,
+			       struct ib_cm_event *event);
+
+static void ipoib_cm_dma_unmap_rx(struct ipoib_dev_priv *priv, int frags,
+				  u64 mapping[IPOIB_CM_RX_SG])
+{
+	int i;
+
+	ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_CM_HEAD_SIZE, DMA_FROM_DEVICE);
+
+	for (i = 0; i < frags; ++i)
+		ib_dma_unmap_page(priv->ca, mapping[i + 1], PAGE_SIZE, DMA_FROM_DEVICE);
+}
+
+static int ipoib_cm_post_receive_srq(struct net_device *dev, int id)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ib_recv_wr *bad_wr;
+	int i, ret;
+
+	priv->cm.rx_wr.wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV;
+
+	for (i = 0; i < priv->cm.num_frags; ++i)
+		priv->cm.rx_sge[i].addr = priv->cm.srq_ring[id].mapping[i];
+
+	ret = ib_post_srq_recv(priv->cm.srq, &priv->cm.rx_wr, &bad_wr);
+	if (unlikely(ret)) {
+		ipoib_warn(priv, "post srq failed for buf %d (%d)\n", id, ret);
+		ipoib_cm_dma_unmap_rx(priv, priv->cm.num_frags - 1,
+				      priv->cm.srq_ring[id].mapping);
+		dev_kfree_skb_any(priv->cm.srq_ring[id].skb);
+		priv->cm.srq_ring[id].skb = NULL;
+	}
+
+	return ret;
+}
+
+static int ipoib_cm_post_receive_nonsrq(struct net_device *dev,
+					struct ipoib_cm_rx *rx,
+					struct ib_recv_wr *wr,
+					struct ib_sge *sge, int id)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ib_recv_wr *bad_wr;
+	int i, ret;
+
+	wr->wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV;
+
+	for (i = 0; i < IPOIB_CM_RX_SG; ++i)
+		sge[i].addr = rx->rx_ring[id].mapping[i];
+
+	ret = ib_post_recv(rx->qp, wr, &bad_wr);
+	if (unlikely(ret)) {
+		ipoib_warn(priv, "post recv failed for buf %d (%d)\n", id, ret);
+		ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1,
+				      rx->rx_ring[id].mapping);
+		dev_kfree_skb_any(rx->rx_ring[id].skb);
+		rx->rx_ring[id].skb = NULL;
+	}
+
+	return ret;
+}
+
+static struct sk_buff *ipoib_cm_alloc_rx_skb(struct net_device *dev,
+					     struct ipoib_cm_rx_buf *rx_ring,
+					     int id, int frags,
+					     u64 mapping[IPOIB_CM_RX_SG],
+					     gfp_t gfp)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct sk_buff *skb;
+	int i;
+
+	skb = dev_alloc_skb(IPOIB_CM_HEAD_SIZE + 12);
+	if (unlikely(!skb))
+		return NULL;
+
+	/*
+	 * IPoIB adds a 4 byte header. So we need 12 more bytes to align the
+	 * IP header to a multiple of 16.
+	 */
+	skb_reserve(skb, 12);
+
+	mapping[0] = ib_dma_map_single(priv->ca, skb->data, IPOIB_CM_HEAD_SIZE,
+				       DMA_FROM_DEVICE);
+	if (unlikely(ib_dma_mapping_error(priv->ca, mapping[0]))) {
+		dev_kfree_skb_any(skb);
+		return NULL;
+	}
+
+	for (i = 0; i < frags; i++) {
+		struct page *page = alloc_page(gfp);
+
+		if (!page)
+			goto partial_error;
+		skb_fill_page_desc(skb, i, page, 0, PAGE_SIZE);
+
+		mapping[i + 1] = ib_dma_map_page(priv->ca, page,
+						 0, PAGE_SIZE, DMA_FROM_DEVICE);
+		if (unlikely(ib_dma_mapping_error(priv->ca, mapping[i + 1])))
+			goto partial_error;
+	}
+
+	rx_ring[id].skb = skb;
+	return skb;
+
+partial_error:
+
+	ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_CM_HEAD_SIZE, DMA_FROM_DEVICE);
+
+	for (; i > 0; --i)
+		ib_dma_unmap_page(priv->ca, mapping[i], PAGE_SIZE, DMA_FROM_DEVICE);
+
+	dev_kfree_skb_any(skb);
+	return NULL;
+}
+
+static void ipoib_cm_free_rx_ring(struct net_device *dev,
+				  struct ipoib_cm_rx_buf *rx_ring)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	int i;
+
+	for (i = 0; i < ipoib_recvq_size; ++i)
+		if (rx_ring[i].skb) {
+			ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1,
+					      rx_ring[i].mapping);
+			dev_kfree_skb_any(rx_ring[i].skb);
+		}
+
+	vfree(rx_ring);
+}
+
+static void ipoib_cm_start_rx_drain(struct ipoib_dev_priv *priv)
+{
+	struct ib_send_wr *bad_wr;
+	struct ipoib_cm_rx *p;
+
+	/* We only reserved 1 extra slot in CQ for drain WRs, so
+	 * make sure we have at most 1 outstanding WR. */
+	if (list_empty(&priv->cm.rx_flush_list) ||
+	    !list_empty(&priv->cm.rx_drain_list))
+		return;
+
+	/*
+	 * QPs on flush list are error state.  This way, a "flush
+	 * error" WC will be immediately generated for each WR we post.
+	 */
+	p = list_entry(priv->cm.rx_flush_list.next, typeof(*p), list);
+	if (ib_post_send(p->qp, &ipoib_cm_rx_drain_wr, &bad_wr))
+		ipoib_warn(priv, "failed to post drain wr\n");
+
+	list_splice_init(&priv->cm.rx_flush_list, &priv->cm.rx_drain_list);
+}
+
+static void ipoib_cm_rx_event_handler(struct ib_event *event, void *ctx)
+{
+	struct ipoib_cm_rx *p = ctx;
+	struct ipoib_dev_priv *priv = netdev_priv(p->dev);
+	unsigned long flags;
+
+	if (event->event != IB_EVENT_QP_LAST_WQE_REACHED)
+		return;
+
+	spin_lock_irqsave(&priv->lock, flags);
+	list_move(&p->list, &priv->cm.rx_flush_list);
+	p->state = IPOIB_CM_RX_FLUSH;
+	ipoib_cm_start_rx_drain(priv);
+	spin_unlock_irqrestore(&priv->lock, flags);
+}
+
+static struct ib_qp *ipoib_cm_create_rx_qp(struct net_device *dev,
+					   struct ipoib_cm_rx *p)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ib_qp_init_attr attr = {
+		.event_handler = ipoib_cm_rx_event_handler,
+		.send_cq = priv->recv_cq, /* For drain WR */
+		.recv_cq = priv->recv_cq,
+		.srq = priv->cm.srq,
+		.cap.max_send_wr = 1, /* For drain WR */
+		.cap.max_send_sge = 1, /* FIXME: 0 Seems not to work */
+		.sq_sig_type = IB_SIGNAL_ALL_WR,
+		.qp_type = IB_QPT_RC,
+		.qp_context = p,
+	};
+
+	if (!ipoib_cm_has_srq(dev)) {
+		attr.cap.max_recv_wr  = ipoib_recvq_size;
+		attr.cap.max_recv_sge = IPOIB_CM_RX_SG;
+	}
+
+	return ib_create_qp(priv->pd, &attr);
+}
+
+static int ipoib_cm_modify_rx_qp(struct net_device *dev,
+				 struct ib_cm_id *cm_id, struct ib_qp *qp,
+				 unsigned psn)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ib_qp_attr qp_attr;
+	int qp_attr_mask, ret;
+
+	qp_attr.qp_state = IB_QPS_INIT;
+	ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
+	if (ret) {
+		ipoib_warn(priv, "failed to init QP attr for INIT: %d\n", ret);
+		return ret;
+	}
+	ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
+	if (ret) {
+		ipoib_warn(priv, "failed to modify QP to INIT: %d\n", ret);
+		return ret;
+	}
+	qp_attr.qp_state = IB_QPS_RTR;
+	ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
+	if (ret) {
+		ipoib_warn(priv, "failed to init QP attr for RTR: %d\n", ret);
+		return ret;
+	}
+	qp_attr.rq_psn = psn;
+	ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
+	if (ret) {
+		ipoib_warn(priv, "failed to modify QP to RTR: %d\n", ret);
+		return ret;
+	}
+
+	/*
+	 * Current Mellanox HCA firmware won't generate completions
+	 * with error for drain WRs unless the QP has been moved to
+	 * RTS first. This work-around leaves a window where a QP has
+	 * moved to error asynchronously, but this will eventually get
+	 * fixed in firmware, so let's not error out if modify QP
+	 * fails.
+	 */
+	qp_attr.qp_state = IB_QPS_RTS;
+	ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
+	if (ret) {
+		ipoib_warn(priv, "failed to init QP attr for RTS: %d\n", ret);
+		return 0;
+	}
+	ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
+	if (ret) {
+		ipoib_warn(priv, "failed to modify QP to RTS: %d\n", ret);
+		return 0;
+	}
+
+	return 0;
+}
+
+static void ipoib_cm_init_rx_wr(struct net_device *dev,
+				struct ib_recv_wr *wr,
+				struct ib_sge *sge)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	int i;
+
+	for (i = 0; i < priv->cm.num_frags; ++i)
+		sge[i].lkey = priv->mr->lkey;
+
+	sge[0].length = IPOIB_CM_HEAD_SIZE;
+	for (i = 1; i < priv->cm.num_frags; ++i)
+		sge[i].length = PAGE_SIZE;
+
+	wr->next    = NULL;
+	wr->sg_list = sge;
+	wr->num_sge = priv->cm.num_frags;
+}
+
+static int ipoib_cm_nonsrq_init_rx(struct net_device *dev, struct ib_cm_id *cm_id,
+				   struct ipoib_cm_rx *rx)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct {
+		struct ib_recv_wr wr;
+		struct ib_sge sge[IPOIB_CM_RX_SG];
+	} *t;
+	int ret;
+	int i;
+
+	rx->rx_ring = vzalloc(ipoib_recvq_size * sizeof *rx->rx_ring);
+	if (!rx->rx_ring) {
+		printk(KERN_WARNING "%s: failed to allocate CM non-SRQ ring (%d entries)\n",
+		       priv->ca->name, ipoib_recvq_size);
+		return -ENOMEM;
+	}
+
+	t = kmalloc(sizeof *t, GFP_KERNEL);
+	if (!t) {
+		ret = -ENOMEM;
+		goto err_free;
+	}
+
+	ipoib_cm_init_rx_wr(dev, &t->wr, t->sge);
+
+	spin_lock_irq(&priv->lock);
+
+	if (priv->cm.nonsrq_conn_qp >= ipoib_max_conn_qp) {
+		spin_unlock_irq(&priv->lock);
+		ib_send_cm_rej(cm_id, IB_CM_REJ_NO_QP, NULL, 0, NULL, 0);
+		ret = -EINVAL;
+		goto err_free;
+	} else
+		++priv->cm.nonsrq_conn_qp;
+
+	spin_unlock_irq(&priv->lock);
+
+	for (i = 0; i < ipoib_recvq_size; ++i) {
+		if (!ipoib_cm_alloc_rx_skb(dev, rx->rx_ring, i, IPOIB_CM_RX_SG - 1,
+					   rx->rx_ring[i].mapping,
+					   GFP_KERNEL)) {
+			ipoib_warn(priv, "failed to allocate receive buffer %d\n", i);
+				ret = -ENOMEM;
+				goto err_count;
+		}
+		ret = ipoib_cm_post_receive_nonsrq(dev, rx, &t->wr, t->sge, i);
+		if (ret) {
+			ipoib_warn(priv, "ipoib_cm_post_receive_nonsrq "
+				   "failed for buf %d\n", i);
+			ret = -EIO;
+			goto err_count;
+		}
+	}
+
+	rx->recv_count = ipoib_recvq_size;
+
+	kfree(t);
+
+	return 0;
+
+err_count:
+	spin_lock_irq(&priv->lock);
+	--priv->cm.nonsrq_conn_qp;
+	spin_unlock_irq(&priv->lock);
+
+err_free:
+	kfree(t);
+	ipoib_cm_free_rx_ring(dev, rx->rx_ring);
+
+	return ret;
+}
+
+static int ipoib_cm_send_rep(struct net_device *dev, struct ib_cm_id *cm_id,
+			     struct ib_qp *qp, struct ib_cm_req_event_param *req,
+			     unsigned psn)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ipoib_cm_data data = {};
+	struct ib_cm_rep_param rep = {};
+
+	data.qpn = cpu_to_be32(priv->qp->qp_num);
+	data.mtu = cpu_to_be32(IPOIB_CM_BUF_SIZE);
+
+	rep.private_data = &data;
+	rep.private_data_len = sizeof data;
+	rep.flow_control = 0;
+	rep.rnr_retry_count = req->rnr_retry_count;
+	rep.srq = ipoib_cm_has_srq(dev);
+	rep.qp_num = qp->qp_num;
+	rep.starting_psn = psn;
+	return ib_send_cm_rep(cm_id, &rep);
+}
+
+static int ipoib_cm_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
+{
+	struct net_device *dev = cm_id->context;
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ipoib_cm_rx *p;
+	unsigned psn;
+	int ret;
+
+	ipoib_dbg(priv, "REQ arrived\n");
+	p = kzalloc(sizeof *p, GFP_KERNEL);
+	if (!p)
+		return -ENOMEM;
+	p->dev = dev;
+	p->id = cm_id;
+	cm_id->context = p;
+	p->state = IPOIB_CM_RX_LIVE;
+	p->jiffies = jiffies;
+	INIT_LIST_HEAD(&p->list);
+
+	p->qp = ipoib_cm_create_rx_qp(dev, p);
+	if (IS_ERR(p->qp)) {
+		ret = PTR_ERR(p->qp);
+		goto err_qp;
+	}
+
+	psn = prandom_u32() & 0xffffff;
+	ret = ipoib_cm_modify_rx_qp(dev, cm_id, p->qp, psn);
+	if (ret)
+		goto err_modify;
+
+	if (!ipoib_cm_has_srq(dev)) {
+		ret = ipoib_cm_nonsrq_init_rx(dev, cm_id, p);
+		if (ret)
+			goto err_modify;
+	}
+
+	spin_lock_irq(&priv->lock);
+	queue_delayed_work(ipoib_workqueue,
+			   &priv->cm.stale_task, IPOIB_CM_RX_DELAY);
+	/* Add this entry to passive ids list head, but do not re-add it
+	 * if IB_EVENT_QP_LAST_WQE_REACHED has moved it to flush list. */
+	p->jiffies = jiffies;
+	if (p->state == IPOIB_CM_RX_LIVE)
+		list_move(&p->list, &priv->cm.passive_ids);
+	spin_unlock_irq(&priv->lock);
+
+	ret = ipoib_cm_send_rep(dev, cm_id, p->qp, &event->param.req_rcvd, psn);
+	if (ret) {
+		ipoib_warn(priv, "failed to send REP: %d\n", ret);
+		if (ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE))
+			ipoib_warn(priv, "unable to move qp to error state\n");
+	}
+	return 0;
+
+err_modify:
+	ib_destroy_qp(p->qp);
+err_qp:
+	kfree(p);
+	return ret;
+}
+
+static int ipoib_cm_rx_handler(struct ib_cm_id *cm_id,
+			       struct ib_cm_event *event)
+{
+	struct ipoib_cm_rx *p;
+	struct ipoib_dev_priv *priv;
+
+	switch (event->event) {
+	case IB_CM_REQ_RECEIVED:
+		return ipoib_cm_req_handler(cm_id, event);
+	case IB_CM_DREQ_RECEIVED:
+		p = cm_id->context;
+		ib_send_cm_drep(cm_id, NULL, 0);
+		/* Fall through */
+	case IB_CM_REJ_RECEIVED:
+		p = cm_id->context;
+		priv = netdev_priv(p->dev);
+		if (ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE))
+			ipoib_warn(priv, "unable to move qp to error state\n");
+		/* Fall through */
+	default:
+		return 0;
+	}
+}
+/* Adjust length of skb with fragments to match received data */
+static void skb_put_frags(struct sk_buff *skb, unsigned int hdr_space,
+			  unsigned int length, struct sk_buff *toskb)
+{
+	int i, num_frags;
+	unsigned int size;
+
+	/* put header into skb */
+	size = min(length, hdr_space);
+	skb->tail += size;
+	skb->len += size;
+	length -= size;
+
+	num_frags = skb_shinfo(skb)->nr_frags;
+	for (i = 0; i < num_frags; i++) {
+		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+
+		if (length == 0) {
+			/* don't need this page */
+			skb_fill_page_desc(toskb, i, skb_frag_page(frag),
+					   0, PAGE_SIZE);
+			--skb_shinfo(skb)->nr_frags;
+		} else {
+			size = min(length, (unsigned) PAGE_SIZE);
+
+			skb_frag_size_set(frag, size);
+			skb->data_len += size;
+			skb->truesize += size;
+			skb->len += size;
+			length -= size;
+		}
+	}
+}
+
+void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ipoib_cm_rx_buf *rx_ring;
+	unsigned int wr_id = wc->wr_id & ~(IPOIB_OP_CM | IPOIB_OP_RECV);
+	struct sk_buff *skb, *newskb;
+	struct ipoib_cm_rx *p;
+	unsigned long flags;
+	u64 mapping[IPOIB_CM_RX_SG];
+	int frags;
+	int has_srq;
+	struct sk_buff *small_skb;
+
+	ipoib_dbg_data(priv, "cm recv completion: id %d, status: %d\n",
+		       wr_id, wc->status);
+
+	if (unlikely(wr_id >= ipoib_recvq_size)) {
+		if (wr_id == (IPOIB_CM_RX_DRAIN_WRID & ~(IPOIB_OP_CM | IPOIB_OP_RECV))) {
+			spin_lock_irqsave(&priv->lock, flags);
+			list_splice_init(&priv->cm.rx_drain_list, &priv->cm.rx_reap_list);
+			ipoib_cm_start_rx_drain(priv);
+			queue_work(ipoib_workqueue, &priv->cm.rx_reap_task);
+			spin_unlock_irqrestore(&priv->lock, flags);
+		} else
+			ipoib_warn(priv, "cm recv completion event with wrid %d (> %d)\n",
+				   wr_id, ipoib_recvq_size);
+		return;
+	}
+
+	p = wc->qp->qp_context;
+
+	has_srq = ipoib_cm_has_srq(dev);
+	rx_ring = has_srq ? priv->cm.srq_ring : p->rx_ring;
+
+	skb = rx_ring[wr_id].skb;
+
+	if (unlikely(wc->status != IB_WC_SUCCESS)) {
+		ipoib_dbg(priv, "cm recv error "
+			   "(status=%d, wrid=%d vend_err %x)\n",
+			   wc->status, wr_id, wc->vendor_err);
+		++dev->stats.rx_dropped;
+		if (has_srq)
+			goto repost;
+		else {
+			if (!--p->recv_count) {
+				spin_lock_irqsave(&priv->lock, flags);
+				list_move(&p->list, &priv->cm.rx_reap_list);
+				spin_unlock_irqrestore(&priv->lock, flags);
+				queue_work(ipoib_workqueue, &priv->cm.rx_reap_task);
+			}
+			return;
+		}
+	}
+
+	if (unlikely(!(wr_id & IPOIB_CM_RX_UPDATE_MASK))) {
+		if (p && time_after_eq(jiffies, p->jiffies + IPOIB_CM_RX_UPDATE_TIME)) {
+			spin_lock_irqsave(&priv->lock, flags);
+			p->jiffies = jiffies;
+			/* Move this entry to list head, but do not re-add it
+			 * if it has been moved out of list. */
+			if (p->state == IPOIB_CM_RX_LIVE)
+				list_move(&p->list, &priv->cm.passive_ids);
+			spin_unlock_irqrestore(&priv->lock, flags);
+		}
+	}
+
+	if (wc->byte_len < IPOIB_CM_COPYBREAK) {
+		int dlen = wc->byte_len;
+
+		small_skb = dev_alloc_skb(dlen + 12);
+		if (small_skb) {
+			skb_reserve(small_skb, 12);
+			ib_dma_sync_single_for_cpu(priv->ca, rx_ring[wr_id].mapping[0],
+						   dlen, DMA_FROM_DEVICE);
+			skb_copy_from_linear_data(skb, small_skb->data, dlen);
+			ib_dma_sync_single_for_device(priv->ca, rx_ring[wr_id].mapping[0],
+						      dlen, DMA_FROM_DEVICE);
+			skb_put(small_skb, dlen);
+			skb = small_skb;
+			goto copied;
+		}
+	}
+
+	frags = PAGE_ALIGN(wc->byte_len - min(wc->byte_len,
+					      (unsigned)IPOIB_CM_HEAD_SIZE)) / PAGE_SIZE;
+
+	newskb = ipoib_cm_alloc_rx_skb(dev, rx_ring, wr_id, frags,
+				       mapping, GFP_ATOMIC);
+	if (unlikely(!newskb)) {
+		/*
+		 * If we can't allocate a new RX buffer, dump
+		 * this packet and reuse the old buffer.
+		 */
+		ipoib_dbg(priv, "failed to allocate receive buffer %d\n", wr_id);
+		++dev->stats.rx_dropped;
+		goto repost;
+	}
+
+	ipoib_cm_dma_unmap_rx(priv, frags, rx_ring[wr_id].mapping);
+	memcpy(rx_ring[wr_id].mapping, mapping, (frags + 1) * sizeof *mapping);
+
+	ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n",
+		       wc->byte_len, wc->slid);
+
+	skb_put_frags(skb, IPOIB_CM_HEAD_SIZE, wc->byte_len, newskb);
+
+copied:
+	skb->protocol = ((struct ipoib_header *) skb->data)->proto;
+	skb_reset_mac_header(skb);
+	skb_pull(skb, IPOIB_ENCAP_LEN);
+
+	++dev->stats.rx_packets;
+	dev->stats.rx_bytes += skb->len;
+
+	skb->dev = dev;
+	/* XXX get correct PACKET_ type here */
+	skb->pkt_type = PACKET_HOST;
+	netif_receive_skb(skb);
+
+repost:
+	if (has_srq) {
+		if (unlikely(ipoib_cm_post_receive_srq(dev, wr_id)))
+			ipoib_warn(priv, "ipoib_cm_post_receive_srq failed "
+				   "for buf %d\n", wr_id);
+	} else {
+		if (unlikely(ipoib_cm_post_receive_nonsrq(dev, p,
+							  &priv->cm.rx_wr,
+							  priv->cm.rx_sge,
+							  wr_id))) {
+			--p->recv_count;
+			ipoib_warn(priv, "ipoib_cm_post_receive_nonsrq failed "
+				   "for buf %d\n", wr_id);
+		}
+	}
+}
+
+static inline int post_send(struct ipoib_dev_priv *priv,
+			    struct ipoib_cm_tx *tx,
+			    unsigned int wr_id,
+			    u64 addr, int len)
+{
+	struct ib_send_wr *bad_wr;
+
+	priv->tx_sge[0].addr          = addr;
+	priv->tx_sge[0].length        = len;
+
+	priv->tx_wr.num_sge	= 1;
+	priv->tx_wr.wr_id	= wr_id | IPOIB_OP_CM;
+
+	return ib_post_send(tx->qp, &priv->tx_wr, &bad_wr);
+}
+
+void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ipoib_cm_tx_buf *tx_req;
+	u64 addr;
+	int rc;
+
+	if (unlikely(skb->len > tx->mtu)) {
+		ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n",
+			   skb->len, tx->mtu);
+		++dev->stats.tx_dropped;
+		++dev->stats.tx_errors;
+		ipoib_cm_skb_too_long(dev, skb, tx->mtu - IPOIB_ENCAP_LEN);
+		return;
+	}
+
+	ipoib_dbg_data(priv, "sending packet: head 0x%x length %d connection 0x%x\n",
+		       tx->tx_head, skb->len, tx->qp->qp_num);
+
+	/*
+	 * We put the skb into the tx_ring _before_ we call post_send()
+	 * because it's entirely possible that the completion handler will
+	 * run before we execute anything after the post_send().  That
+	 * means we have to make sure everything is properly recorded and
+	 * our state is consistent before we call post_send().
+	 */
+	tx_req = &tx->tx_ring[tx->tx_head & (ipoib_sendq_size - 1)];
+	tx_req->skb = skb;
+	addr = ib_dma_map_single(priv->ca, skb->data, skb->len, DMA_TO_DEVICE);
+	if (unlikely(ib_dma_mapping_error(priv->ca, addr))) {
+		++dev->stats.tx_errors;
+		dev_kfree_skb_any(skb);
+		return;
+	}
+
+	tx_req->mapping = addr;
+
+	skb_orphan(skb);
+	skb_dst_drop(skb);
+
+	rc = post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1),
+		       addr, skb->len);
+	if (unlikely(rc)) {
+		ipoib_warn(priv, "post_send failed, error %d\n", rc);
+		++dev->stats.tx_errors;
+		ib_dma_unmap_single(priv->ca, addr, skb->len, DMA_TO_DEVICE);
+		dev_kfree_skb_any(skb);
+	} else {
+		dev->trans_start = jiffies;
+		++tx->tx_head;
+
+		if (++priv->tx_outstanding == ipoib_sendq_size) {
+			ipoib_dbg(priv, "TX ring 0x%x full, stopping kernel net queue\n",
+				  tx->qp->qp_num);
+			netif_stop_queue(dev);
+			rc = ib_req_notify_cq(priv->send_cq,
+				IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
+			if (rc < 0)
+				ipoib_warn(priv, "request notify on send CQ failed\n");
+			else if (rc)
+				ipoib_send_comp_handler(priv->send_cq, dev);
+		}
+	}
+}
+
+void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ipoib_cm_tx *tx = wc->qp->qp_context;
+	unsigned int wr_id = wc->wr_id & ~IPOIB_OP_CM;
+	struct ipoib_cm_tx_buf *tx_req;
+	unsigned long flags;
+
+	ipoib_dbg_data(priv, "cm send completion: id %d, status: %d\n",
+		       wr_id, wc->status);
+
+	if (unlikely(wr_id >= ipoib_sendq_size)) {
+		ipoib_warn(priv, "cm send completion event with wrid %d (> %d)\n",
+			   wr_id, ipoib_sendq_size);
+		return;
+	}
+
+	tx_req = &tx->tx_ring[wr_id];
+
+	ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len, DMA_TO_DEVICE);
+
+	/* FIXME: is this right? Shouldn't we only increment on success? */
+	++dev->stats.tx_packets;
+	dev->stats.tx_bytes += tx_req->skb->len;
+
+	dev_kfree_skb_any(tx_req->skb);
+
+	netif_tx_lock(dev);
+
+	++tx->tx_tail;
+	if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
+	    netif_queue_stopped(dev) &&
+	    test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
+		netif_wake_queue(dev);
+
+	if (wc->status != IB_WC_SUCCESS &&
+	    wc->status != IB_WC_WR_FLUSH_ERR) {
+		struct ipoib_neigh *neigh;
+
+		ipoib_dbg(priv, "failed cm send event "
+			   "(status=%d, wrid=%d vend_err %x)\n",
+			   wc->status, wr_id, wc->vendor_err);
+
+		spin_lock_irqsave(&priv->lock, flags);
+		neigh = tx->neigh;
+
+		if (neigh) {
+			neigh->cm = NULL;
+			ipoib_neigh_free(neigh);
+
+			tx->neigh = NULL;
+		}
+
+		if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
+			list_move(&tx->list, &priv->cm.reap_list);
+			queue_work(ipoib_workqueue, &priv->cm.reap_task);
+		}
+
+		clear_bit(IPOIB_FLAG_OPER_UP, &tx->flags);
+
+		spin_unlock_irqrestore(&priv->lock, flags);
+	}
+
+	netif_tx_unlock(dev);
+}
+
+int ipoib_cm_dev_open(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	int ret;
+
+	if (!IPOIB_CM_SUPPORTED(dev->dev_addr))
+		return 0;
+
+	priv->cm.id = ib_create_cm_id(priv->ca, ipoib_cm_rx_handler, dev);
+	if (IS_ERR(priv->cm.id)) {
+		printk(KERN_WARNING "%s: failed to create CM ID\n", priv->ca->name);
+		ret = PTR_ERR(priv->cm.id);
+		goto err_cm;
+	}
+
+	ret = ib_cm_listen(priv->cm.id, cpu_to_be64(IPOIB_CM_IETF_ID | priv->qp->qp_num),
+			   0, NULL);
+	if (ret) {
+		printk(KERN_WARNING "%s: failed to listen on ID 0x%llx\n", priv->ca->name,
+		       IPOIB_CM_IETF_ID | priv->qp->qp_num);
+		goto err_listen;
+	}
+
+	return 0;
+
+err_listen:
+	ib_destroy_cm_id(priv->cm.id);
+err_cm:
+	priv->cm.id = NULL;
+	return ret;
+}
+
+static void ipoib_cm_free_rx_reap_list(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ipoib_cm_rx *rx, *n;
+	LIST_HEAD(list);
+
+	spin_lock_irq(&priv->lock);
+	list_splice_init(&priv->cm.rx_reap_list, &list);
+	spin_unlock_irq(&priv->lock);
+
+	list_for_each_entry_safe(rx, n, &list, list) {
+		ib_destroy_cm_id(rx->id);
+		ib_destroy_qp(rx->qp);
+		if (!ipoib_cm_has_srq(dev)) {
+			ipoib_cm_free_rx_ring(priv->dev, rx->rx_ring);
+			spin_lock_irq(&priv->lock);
+			--priv->cm.nonsrq_conn_qp;
+			spin_unlock_irq(&priv->lock);
+		}
+		kfree(rx);
+	}
+}
+
+void ipoib_cm_dev_stop(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ipoib_cm_rx *p;
+	unsigned long begin;
+	int ret;
+
+	if (!IPOIB_CM_SUPPORTED(dev->dev_addr) || !priv->cm.id)
+		return;
+
+	ib_destroy_cm_id(priv->cm.id);
+	priv->cm.id = NULL;
+
+	spin_lock_irq(&priv->lock);
+	while (!list_empty(&priv->cm.passive_ids)) {
+		p = list_entry(priv->cm.passive_ids.next, typeof(*p), list);
+		list_move(&p->list, &priv->cm.rx_error_list);
+		p->state = IPOIB_CM_RX_ERROR;
+		spin_unlock_irq(&priv->lock);
+		ret = ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE);
+		if (ret)
+			ipoib_warn(priv, "unable to move qp to error state: %d\n", ret);
+		spin_lock_irq(&priv->lock);
+	}
+
+	/* Wait for all RX to be drained */
+	begin = jiffies;
+
+	while (!list_empty(&priv->cm.rx_error_list) ||
+	       !list_empty(&priv->cm.rx_flush_list) ||
+	       !list_empty(&priv->cm.rx_drain_list)) {
+		if (time_after(jiffies, begin + 5 * HZ)) {
+			ipoib_warn(priv, "RX drain timing out\n");
+
+			/*
+			 * assume the HW is wedged and just free up everything.
+			 */
+			list_splice_init(&priv->cm.rx_flush_list,
+					 &priv->cm.rx_reap_list);
+			list_splice_init(&priv->cm.rx_error_list,
+					 &priv->cm.rx_reap_list);
+			list_splice_init(&priv->cm.rx_drain_list,
+					 &priv->cm.rx_reap_list);
+			break;
+		}
+		spin_unlock_irq(&priv->lock);
+		msleep(1);
+		ipoib_drain_cq(dev);
+		spin_lock_irq(&priv->lock);
+	}
+
+	spin_unlock_irq(&priv->lock);
+
+	ipoib_cm_free_rx_reap_list(dev);
+
+	cancel_delayed_work(&priv->cm.stale_task);
+}
+
+static int ipoib_cm_rep_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
+{
+	struct ipoib_cm_tx *p = cm_id->context;
+	struct ipoib_dev_priv *priv = netdev_priv(p->dev);
+	struct ipoib_cm_data *data = event->private_data;
+	struct sk_buff_head skqueue;
+	struct ib_qp_attr qp_attr;
+	int qp_attr_mask, ret;
+	struct sk_buff *skb;
+
+	p->mtu = be32_to_cpu(data->mtu);
+
+	if (p->mtu <= IPOIB_ENCAP_LEN) {
+		ipoib_warn(priv, "Rejecting connection: mtu %d <= %d\n",
+			   p->mtu, IPOIB_ENCAP_LEN);
+		return -EINVAL;
+	}
+
+	qp_attr.qp_state = IB_QPS_RTR;
+	ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
+	if (ret) {
+		ipoib_warn(priv, "failed to init QP attr for RTR: %d\n", ret);
+		return ret;
+	}
+
+	qp_attr.rq_psn = 0 /* FIXME */;
+	ret = ib_modify_qp(p->qp, &qp_attr, qp_attr_mask);
+	if (ret) {
+		ipoib_warn(priv, "failed to modify QP to RTR: %d\n", ret);
+		return ret;
+	}
+
+	qp_attr.qp_state = IB_QPS_RTS;
+	ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
+	if (ret) {
+		ipoib_warn(priv, "failed to init QP attr for RTS: %d\n", ret);
+		return ret;
+	}
+	ret = ib_modify_qp(p->qp, &qp_attr, qp_attr_mask);
+	if (ret) {
+		ipoib_warn(priv, "failed to modify QP to RTS: %d\n", ret);
+		return ret;
+	}
+
+	skb_queue_head_init(&skqueue);
+
+	spin_lock_irq(&priv->lock);
+	set_bit(IPOIB_FLAG_OPER_UP, &p->flags);
+	if (p->neigh)
+		while ((skb = __skb_dequeue(&p->neigh->queue)))
+			__skb_queue_tail(&skqueue, skb);
+	spin_unlock_irq(&priv->lock);
+
+	while ((skb = __skb_dequeue(&skqueue))) {
+		skb->dev = p->dev;
+		if (dev_queue_xmit(skb))
+			ipoib_warn(priv, "dev_queue_xmit failed "
+				   "to requeue packet\n");
+	}
+
+	ret = ib_send_cm_rtu(cm_id, NULL, 0);
+	if (ret) {
+		ipoib_warn(priv, "failed to send RTU: %d\n", ret);
+		return ret;
+	}
+	return 0;
+}
+
+static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ipoib_cm_tx *tx)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ib_qp_init_attr attr = {
+		.send_cq		= priv->recv_cq,
+		.recv_cq		= priv->recv_cq,
+		.srq			= priv->cm.srq,
+		.cap.max_send_wr	= ipoib_sendq_size,
+		.cap.max_send_sge	= 1,
+		.sq_sig_type		= IB_SIGNAL_ALL_WR,
+		.qp_type		= IB_QPT_RC,
+		.qp_context		= tx,
+		.create_flags		= IB_QP_CREATE_USE_GFP_NOIO
+	};
+
+	struct ib_qp *tx_qp;
+
+	tx_qp = ib_create_qp(priv->pd, &attr);
+	if (PTR_ERR(tx_qp) == -EINVAL) {
+		ipoib_warn(priv, "can't use GFP_NOIO for QPs on device %s, using GFP_KERNEL\n",
+			   priv->ca->name);
+		attr.create_flags &= ~IB_QP_CREATE_USE_GFP_NOIO;
+		tx_qp = ib_create_qp(priv->pd, &attr);
+	}
+	return tx_qp;
+}
+
+static int ipoib_cm_send_req(struct net_device *dev,
+			     struct ib_cm_id *id, struct ib_qp *qp,
+			     u32 qpn,
+			     struct ib_sa_path_rec *pathrec)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ipoib_cm_data data = {};
+	struct ib_cm_req_param req = {};
+
+	data.qpn = cpu_to_be32(priv->qp->qp_num);
+	data.mtu = cpu_to_be32(IPOIB_CM_BUF_SIZE);
+
+	req.primary_path		= pathrec;
+	req.alternate_path		= NULL;
+	req.service_id			= cpu_to_be64(IPOIB_CM_IETF_ID | qpn);
+	req.qp_num			= qp->qp_num;
+	req.qp_type			= qp->qp_type;
+	req.private_data		= &data;
+	req.private_data_len		= sizeof data;
+	req.flow_control		= 0;
+
+	req.starting_psn		= 0; /* FIXME */
+
+	/*
+	 * Pick some arbitrary defaults here; we could make these
+	 * module parameters if anyone cared about setting them.
+	 */
+	req.responder_resources		= 4;
+	req.remote_cm_response_timeout	= 20;
+	req.local_cm_response_timeout	= 20;
+	req.retry_count			= 0; /* RFC draft warns against retries */
+	req.rnr_retry_count		= 0; /* RFC draft warns against retries */
+	req.max_cm_retries		= 15;
+	req.srq				= ipoib_cm_has_srq(dev);
+	return ib_send_cm_req(id, &req);
+}
+
+static int ipoib_cm_modify_tx_init(struct net_device *dev,
+				  struct ib_cm_id *cm_id, struct ib_qp *qp)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ib_qp_attr qp_attr;
+	int qp_attr_mask, ret;
+	ret = ib_find_pkey(priv->ca, priv->port, priv->pkey, &qp_attr.pkey_index);
+	if (ret) {
+		ipoib_warn(priv, "pkey 0x%x not found: %d\n", priv->pkey, ret);
+		return ret;
+	}
+
+	qp_attr.qp_state = IB_QPS_INIT;
+	qp_attr.qp_access_flags = IB_ACCESS_LOCAL_WRITE;
+	qp_attr.port_num = priv->port;
+	qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX | IB_QP_PORT;
+
+	ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
+	if (ret) {
+		ipoib_warn(priv, "failed to modify tx QP to INIT: %d\n", ret);
+		return ret;
+	}
+	return 0;
+}
+
+static int ipoib_cm_tx_init(struct ipoib_cm_tx *p, u32 qpn,
+			    struct ib_sa_path_rec *pathrec)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(p->dev);
+	int ret;
+
+	p->tx_ring = __vmalloc(ipoib_sendq_size * sizeof *p->tx_ring,
+			       GFP_NOIO, PAGE_KERNEL);
+	if (!p->tx_ring) {
+		ipoib_warn(priv, "failed to allocate tx ring\n");
+		ret = -ENOMEM;
+		goto err_tx;
+	}
+	memset(p->tx_ring, 0, ipoib_sendq_size * sizeof *p->tx_ring);
+
+	p->qp = ipoib_cm_create_tx_qp(p->dev, p);
+	if (IS_ERR(p->qp)) {
+		ret = PTR_ERR(p->qp);
+		ipoib_warn(priv, "failed to allocate tx qp: %d\n", ret);
+		goto err_qp;
+	}
+
+	p->id = ib_create_cm_id(priv->ca, ipoib_cm_tx_handler, p);
+	if (IS_ERR(p->id)) {
+		ret = PTR_ERR(p->id);
+		ipoib_warn(priv, "failed to create tx cm id: %d\n", ret);
+		goto err_id;
+	}
+
+	ret = ipoib_cm_modify_tx_init(p->dev, p->id,  p->qp);
+	if (ret) {
+		ipoib_warn(priv, "failed to modify tx qp to rtr: %d\n", ret);
+		goto err_modify;
+	}
+
+	ret = ipoib_cm_send_req(p->dev, p->id, p->qp, qpn, pathrec);
+	if (ret) {
+		ipoib_warn(priv, "failed to send cm req: %d\n", ret);
+		goto err_send_cm;
+	}
+
+	ipoib_dbg(priv, "Request connection 0x%x for gid %pI6 qpn 0x%x\n",
+		  p->qp->qp_num, pathrec->dgid.raw, qpn);
+
+	return 0;
+
+err_send_cm:
+err_modify:
+	ib_destroy_cm_id(p->id);
+err_id:
+	p->id = NULL;
+	ib_destroy_qp(p->qp);
+err_qp:
+	p->qp = NULL;
+	vfree(p->tx_ring);
+err_tx:
+	return ret;
+}
+
+static void ipoib_cm_tx_destroy(struct ipoib_cm_tx *p)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(p->dev);
+	struct ipoib_cm_tx_buf *tx_req;
+	unsigned long begin;
+
+	ipoib_dbg(priv, "Destroy active connection 0x%x head 0x%x tail 0x%x\n",
+		  p->qp ? p->qp->qp_num : 0, p->tx_head, p->tx_tail);
+
+	if (p->id)
+		ib_destroy_cm_id(p->id);
+
+	if (p->tx_ring) {
+		/* Wait for all sends to complete */
+		begin = jiffies;
+		while ((int) p->tx_tail - (int) p->tx_head < 0) {
+			if (time_after(jiffies, begin + 5 * HZ)) {
+				ipoib_warn(priv, "timing out; %d sends not completed\n",
+					   p->tx_head - p->tx_tail);
+				goto timeout;
+			}
+
+			msleep(1);
+		}
+	}
+
+timeout:
+
+	while ((int) p->tx_tail - (int) p->tx_head < 0) {
+		tx_req = &p->tx_ring[p->tx_tail & (ipoib_sendq_size - 1)];
+		ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len,
+				    DMA_TO_DEVICE);
+		dev_kfree_skb_any(tx_req->skb);
+		++p->tx_tail;
+		netif_tx_lock_bh(p->dev);
+		if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
+		    netif_queue_stopped(p->dev) &&
+		    test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
+			netif_wake_queue(p->dev);
+		netif_tx_unlock_bh(p->dev);
+	}
+
+	if (p->qp)
+		ib_destroy_qp(p->qp);
+
+	vfree(p->tx_ring);
+	kfree(p);
+}
+
+static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id,
+			       struct ib_cm_event *event)
+{
+	struct ipoib_cm_tx *tx = cm_id->context;
+	struct ipoib_dev_priv *priv = netdev_priv(tx->dev);
+	struct net_device *dev = priv->dev;
+	struct ipoib_neigh *neigh;
+	unsigned long flags;
+	int ret;
+
+	switch (event->event) {
+	case IB_CM_DREQ_RECEIVED:
+		ipoib_dbg(priv, "DREQ received.\n");
+		ib_send_cm_drep(cm_id, NULL, 0);
+		break;
+	case IB_CM_REP_RECEIVED:
+		ipoib_dbg(priv, "REP received.\n");
+		ret = ipoib_cm_rep_handler(cm_id, event);
+		if (ret)
+			ib_send_cm_rej(cm_id, IB_CM_REJ_CONSUMER_DEFINED,
+				       NULL, 0, NULL, 0);
+		break;
+	case IB_CM_REQ_ERROR:
+	case IB_CM_REJ_RECEIVED:
+	case IB_CM_TIMEWAIT_EXIT:
+		ipoib_dbg(priv, "CM error %d.\n", event->event);
+		netif_tx_lock_bh(dev);
+		spin_lock_irqsave(&priv->lock, flags);
+		neigh = tx->neigh;
+
+		if (neigh) {
+			neigh->cm = NULL;
+			ipoib_neigh_free(neigh);
+
+			tx->neigh = NULL;
+		}
+
+		if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
+			list_move(&tx->list, &priv->cm.reap_list);
+			queue_work(ipoib_workqueue, &priv->cm.reap_task);
+		}
+
+		spin_unlock_irqrestore(&priv->lock, flags);
+		netif_tx_unlock_bh(dev);
+		break;
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path *path,
+				       struct ipoib_neigh *neigh)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ipoib_cm_tx *tx;
+
+	tx = kzalloc(sizeof *tx, GFP_ATOMIC);
+	if (!tx)
+		return NULL;
+
+	neigh->cm = tx;
+	tx->neigh = neigh;
+	tx->path = path;
+	tx->dev = dev;
+	list_add(&tx->list, &priv->cm.start_list);
+	set_bit(IPOIB_FLAG_INITIALIZED, &tx->flags);
+	queue_work(ipoib_workqueue, &priv->cm.start_task);
+	return tx;
+}
+
+void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(tx->dev);
+	unsigned long flags;
+	if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
+		spin_lock_irqsave(&priv->lock, flags);
+		list_move(&tx->list, &priv->cm.reap_list);
+		queue_work(ipoib_workqueue, &priv->cm.reap_task);
+		ipoib_dbg(priv, "Reap connection for gid %pI6\n",
+			  tx->neigh->daddr + 4);
+		tx->neigh = NULL;
+		spin_unlock_irqrestore(&priv->lock, flags);
+	}
+}
+
+static void ipoib_cm_tx_start(struct work_struct *work)
+{
+	struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
+						   cm.start_task);
+	struct net_device *dev = priv->dev;
+	struct ipoib_neigh *neigh;
+	struct ipoib_cm_tx *p;
+	unsigned long flags;
+	int ret;
+
+	struct ib_sa_path_rec pathrec;
+	u32 qpn;
+
+	netif_tx_lock_bh(dev);
+	spin_lock_irqsave(&priv->lock, flags);
+
+	while (!list_empty(&priv->cm.start_list)) {
+		p = list_entry(priv->cm.start_list.next, typeof(*p), list);
+		list_del_init(&p->list);
+		neigh = p->neigh;
+		qpn = IPOIB_QPN(neigh->daddr);
+		memcpy(&pathrec, &p->path->pathrec, sizeof pathrec);
+
+		spin_unlock_irqrestore(&priv->lock, flags);
+		netif_tx_unlock_bh(dev);
+
+		ret = ipoib_cm_tx_init(p, qpn, &pathrec);
+
+		netif_tx_lock_bh(dev);
+		spin_lock_irqsave(&priv->lock, flags);
+
+		if (ret) {
+			neigh = p->neigh;
+			if (neigh) {
+				neigh->cm = NULL;
+				ipoib_neigh_free(neigh);
+			}
+			list_del(&p->list);
+			kfree(p);
+		}
+	}
+
+	spin_unlock_irqrestore(&priv->lock, flags);
+	netif_tx_unlock_bh(dev);
+}
+
+static void ipoib_cm_tx_reap(struct work_struct *work)
+{
+	struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
+						   cm.reap_task);
+	struct net_device *dev = priv->dev;
+	struct ipoib_cm_tx *p;
+	unsigned long flags;
+
+	netif_tx_lock_bh(dev);
+	spin_lock_irqsave(&priv->lock, flags);
+
+	while (!list_empty(&priv->cm.reap_list)) {
+		p = list_entry(priv->cm.reap_list.next, typeof(*p), list);
+		list_del(&p->list);
+		spin_unlock_irqrestore(&priv->lock, flags);
+		netif_tx_unlock_bh(dev);
+		ipoib_cm_tx_destroy(p);
+		netif_tx_lock_bh(dev);
+		spin_lock_irqsave(&priv->lock, flags);
+	}
+
+	spin_unlock_irqrestore(&priv->lock, flags);
+	netif_tx_unlock_bh(dev);
+}
+
+static void ipoib_cm_skb_reap(struct work_struct *work)
+{
+	struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
+						   cm.skb_task);
+	struct net_device *dev = priv->dev;
+	struct sk_buff *skb;
+	unsigned long flags;
+	unsigned mtu = priv->mcast_mtu;
+
+	netif_tx_lock_bh(dev);
+	spin_lock_irqsave(&priv->lock, flags);
+
+	while ((skb = skb_dequeue(&priv->cm.skb_queue))) {
+		spin_unlock_irqrestore(&priv->lock, flags);
+		netif_tx_unlock_bh(dev);
+
+		if (skb->protocol == htons(ETH_P_IP))
+			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
+#if IS_ENABLED(CONFIG_IPV6)
+		else if (skb->protocol == htons(ETH_P_IPV6))
+			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+#endif
+		dev_kfree_skb_any(skb);
+
+		netif_tx_lock_bh(dev);
+		spin_lock_irqsave(&priv->lock, flags);
+	}
+
+	spin_unlock_irqrestore(&priv->lock, flags);
+	netif_tx_unlock_bh(dev);
+}
+
+void ipoib_cm_skb_too_long(struct net_device *dev, struct sk_buff *skb,
+			   unsigned int mtu)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	int e = skb_queue_empty(&priv->cm.skb_queue);
+
+	if (skb_dst(skb))
+		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
+
+	skb_queue_tail(&priv->cm.skb_queue, skb);
+	if (e)
+		queue_work(ipoib_workqueue, &priv->cm.skb_task);
+}
+
+static void ipoib_cm_rx_reap(struct work_struct *work)
+{
+	ipoib_cm_free_rx_reap_list(container_of(work, struct ipoib_dev_priv,
+						cm.rx_reap_task)->dev);
+}
+
+static void ipoib_cm_stale_task(struct work_struct *work)
+{
+	struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
+						   cm.stale_task.work);
+	struct ipoib_cm_rx *p;
+	int ret;
+
+	spin_lock_irq(&priv->lock);
+	while (!list_empty(&priv->cm.passive_ids)) {
+		/* List is sorted by LRU, start from tail,
+		 * stop when we see a recently used entry */
+		p = list_entry(priv->cm.passive_ids.prev, typeof(*p), list);
+		if (time_before_eq(jiffies, p->jiffies + IPOIB_CM_RX_TIMEOUT))
+			break;
+		list_move(&p->list, &priv->cm.rx_error_list);
+		p->state = IPOIB_CM_RX_ERROR;
+		spin_unlock_irq(&priv->lock);
+		ret = ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE);
+		if (ret)
+			ipoib_warn(priv, "unable to move qp to error state: %d\n", ret);
+		spin_lock_irq(&priv->lock);
+	}
+
+	if (!list_empty(&priv->cm.passive_ids))
+		queue_delayed_work(ipoib_workqueue,
+				   &priv->cm.stale_task, IPOIB_CM_RX_DELAY);
+	spin_unlock_irq(&priv->lock);
+}
+
+
+static ssize_t show_mode(struct device *d, struct device_attribute *attr,
+			 char *buf)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(d));
+
+	if (test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags))
+		return sprintf(buf, "connected\n");
+	else
+		return sprintf(buf, "datagram\n");
+}
+
+static ssize_t set_mode(struct device *d, struct device_attribute *attr,
+			const char *buf, size_t count)
+{
+	struct net_device *dev = to_net_dev(d);
+	int ret;
+
+	if (!rtnl_trylock())
+		return restart_syscall();
+
+	ret = ipoib_set_mode(dev, buf);
+
+	rtnl_unlock();
+
+	if (!ret)
+		return count;
+
+	return ret;
+}
+
+static DEVICE_ATTR(mode, S_IWUSR | S_IRUGO, show_mode, set_mode);
+
+int ipoib_cm_add_mode_attr(struct net_device *dev)
+{
+	return device_create_file(&dev->dev, &dev_attr_mode);
+}
+
+static void ipoib_cm_create_srq(struct net_device *dev, int max_sge)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ib_srq_init_attr srq_init_attr = {
+		.srq_type = IB_SRQT_BASIC,
+		.attr = {
+			.max_wr  = ipoib_recvq_size,
+			.max_sge = max_sge
+		}
+	};
+
+	priv->cm.srq = ib_create_srq(priv->pd, &srq_init_attr);
+	if (IS_ERR(priv->cm.srq)) {
+		if (PTR_ERR(priv->cm.srq) != -ENOSYS)
+			printk(KERN_WARNING "%s: failed to allocate SRQ, error %ld\n",
+			       priv->ca->name, PTR_ERR(priv->cm.srq));
+		priv->cm.srq = NULL;
+		return;
+	}
+
+	priv->cm.srq_ring = vzalloc(ipoib_recvq_size * sizeof *priv->cm.srq_ring);
+	if (!priv->cm.srq_ring) {
+		printk(KERN_WARNING "%s: failed to allocate CM SRQ ring (%d entries)\n",
+		       priv->ca->name, ipoib_recvq_size);
+		ib_destroy_srq(priv->cm.srq);
+		priv->cm.srq = NULL;
+		return;
+	}
+
+}
+
+int ipoib_cm_dev_init(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	int i, ret;
+	struct ib_device_attr attr;
+
+	INIT_LIST_HEAD(&priv->cm.passive_ids);
+	INIT_LIST_HEAD(&priv->cm.reap_list);
+	INIT_LIST_HEAD(&priv->cm.start_list);
+	INIT_LIST_HEAD(&priv->cm.rx_error_list);
+	INIT_LIST_HEAD(&priv->cm.rx_flush_list);
+	INIT_LIST_HEAD(&priv->cm.rx_drain_list);
+	INIT_LIST_HEAD(&priv->cm.rx_reap_list);
+	INIT_WORK(&priv->cm.start_task, ipoib_cm_tx_start);
+	INIT_WORK(&priv->cm.reap_task, ipoib_cm_tx_reap);
+	INIT_WORK(&priv->cm.skb_task, ipoib_cm_skb_reap);
+	INIT_WORK(&priv->cm.rx_reap_task, ipoib_cm_rx_reap);
+	INIT_DELAYED_WORK(&priv->cm.stale_task, ipoib_cm_stale_task);
+
+	skb_queue_head_init(&priv->cm.skb_queue);
+
+	ret = ib_query_device(priv->ca, &attr);
+	if (ret) {
+		printk(KERN_WARNING "ib_query_device() failed with %d\n", ret);
+		return ret;
+	}
+
+	ipoib_dbg(priv, "max_srq_sge=%d\n", attr.max_srq_sge);
+
+	attr.max_srq_sge = min_t(int, IPOIB_CM_RX_SG, attr.max_srq_sge);
+	ipoib_cm_create_srq(dev, attr.max_srq_sge);
+	if (ipoib_cm_has_srq(dev)) {
+		priv->cm.max_cm_mtu = attr.max_srq_sge * PAGE_SIZE - 0x10;
+		priv->cm.num_frags  = attr.max_srq_sge;
+		ipoib_dbg(priv, "max_cm_mtu = 0x%x, num_frags=%d\n",
+			  priv->cm.max_cm_mtu, priv->cm.num_frags);
+	} else {
+		priv->cm.max_cm_mtu = IPOIB_CM_MTU;
+		priv->cm.num_frags  = IPOIB_CM_RX_SG;
+	}
+
+	ipoib_cm_init_rx_wr(dev, &priv->cm.rx_wr, priv->cm.rx_sge);
+
+	if (ipoib_cm_has_srq(dev)) {
+		for (i = 0; i < ipoib_recvq_size; ++i) {
+			if (!ipoib_cm_alloc_rx_skb(dev, priv->cm.srq_ring, i,
+						   priv->cm.num_frags - 1,
+						   priv->cm.srq_ring[i].mapping,
+						   GFP_KERNEL)) {
+				ipoib_warn(priv, "failed to allocate "
+					   "receive buffer %d\n", i);
+				ipoib_cm_dev_cleanup(dev);
+				return -ENOMEM;
+			}
+
+			if (ipoib_cm_post_receive_srq(dev, i)) {
+				ipoib_warn(priv, "ipoib_cm_post_receive_srq "
+					   "failed for buf %d\n", i);
+				ipoib_cm_dev_cleanup(dev);
+				return -EIO;
+			}
+		}
+	}
+
+	priv->dev->dev_addr[0] = IPOIB_FLAGS_RC;
+	return 0;
+}
+
+void ipoib_cm_dev_cleanup(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	int ret;
+
+	if (!priv->cm.srq)
+		return;
+
+	ipoib_dbg(priv, "Cleanup ipoib connected mode.\n");
+
+	ret = ib_destroy_srq(priv->cm.srq);
+	if (ret)
+		ipoib_warn(priv, "ib_destroy_srq failed: %d\n", ret);
+
+	priv->cm.srq = NULL;
+	if (!priv->cm.srq_ring)
+		return;
+
+	ipoib_cm_free_rx_ring(dev, priv->cm.srq_ring);
+	priv->cm.srq_ring = NULL;
+}
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c
new file mode 100644
index 0000000..078cadd
--- /dev/null
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/kernel.h>
+#include <linux/ethtool.h>
+#include <linux/netdevice.h>
+
+#include "ipoib.h"
+
+static void ipoib_get_drvinfo(struct net_device *netdev,
+			      struct ethtool_drvinfo *drvinfo)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(netdev);
+	struct ib_device_attr *attr;
+
+	attr = kmalloc(sizeof(*attr), GFP_KERNEL);
+	if (attr && !ib_query_device(priv->ca, attr))
+		snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version),
+			 "%d.%d.%d", (int)(attr->fw_ver >> 32),
+			 (int)(attr->fw_ver >> 16) & 0xffff,
+			 (int)attr->fw_ver & 0xffff);
+	kfree(attr);
+
+	strlcpy(drvinfo->bus_info, dev_name(priv->ca->dma_device),
+		sizeof(drvinfo->bus_info));
+
+	strlcpy(drvinfo->version, ipoib_driver_version,
+		sizeof(drvinfo->version));
+
+	strlcpy(drvinfo->driver, "ib_ipoib", sizeof(drvinfo->driver));
+}
+
+static int ipoib_get_coalesce(struct net_device *dev,
+			      struct ethtool_coalesce *coal)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	coal->rx_coalesce_usecs = priv->ethtool.coalesce_usecs;
+	coal->rx_max_coalesced_frames = priv->ethtool.max_coalesced_frames;
+
+	return 0;
+}
+
+static int ipoib_set_coalesce(struct net_device *dev,
+			      struct ethtool_coalesce *coal)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	int ret;
+
+	/*
+	 * These values are saved in the private data and returned
+	 * when ipoib_get_coalesce() is called
+	 */
+	if (coal->rx_coalesce_usecs       > 0xffff ||
+	    coal->rx_max_coalesced_frames > 0xffff)
+		return -EINVAL;
+
+	ret = ib_modify_cq(priv->recv_cq, coal->rx_max_coalesced_frames,
+			   coal->rx_coalesce_usecs);
+	if (ret && ret != -ENOSYS) {
+		ipoib_warn(priv, "failed modifying CQ (%d)\n", ret);
+		return ret;
+	}
+
+	priv->ethtool.coalesce_usecs       = coal->rx_coalesce_usecs;
+	priv->ethtool.max_coalesced_frames = coal->rx_max_coalesced_frames;
+
+	return 0;
+}
+
+static const struct ethtool_ops ipoib_ethtool_ops = {
+	.get_drvinfo		= ipoib_get_drvinfo,
+	.get_coalesce		= ipoib_get_coalesce,
+	.set_coalesce		= ipoib_set_coalesce,
+};
+
+void ipoib_set_ethtool_ops(struct net_device *dev)
+{
+	dev->ethtool_ops = &ipoib_ethtool_ops;
+}
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_fs.c b/drivers/infiniband/ulp/ipoib/ipoib_fs.c
new file mode 100644
index 0000000..6bd5740
--- /dev/null
+++ b/drivers/infiniband/ulp/ipoib/ipoib_fs.c
@@ -0,0 +1,297 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/err.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+
+struct file_operations;
+
+#include <linux/debugfs.h>
+#include <linux/export.h>
+
+#include "ipoib.h"
+
+static struct dentry *ipoib_root;
+
+static void format_gid(union ib_gid *gid, char *buf)
+{
+	int i, n;
+
+	for (n = 0, i = 0; i < 8; ++i) {
+		n += sprintf(buf + n, "%x",
+			     be16_to_cpu(((__be16 *) gid->raw)[i]));
+		if (i < 7)
+			buf[n++] = ':';
+	}
+}
+
+static void *ipoib_mcg_seq_start(struct seq_file *file, loff_t *pos)
+{
+	struct ipoib_mcast_iter *iter;
+	loff_t n = *pos;
+
+	iter = ipoib_mcast_iter_init(file->private);
+	if (!iter)
+		return NULL;
+
+	while (n--) {
+		if (ipoib_mcast_iter_next(iter)) {
+			kfree(iter);
+			return NULL;
+		}
+	}
+
+	return iter;
+}
+
+static void *ipoib_mcg_seq_next(struct seq_file *file, void *iter_ptr,
+				   loff_t *pos)
+{
+	struct ipoib_mcast_iter *iter = iter_ptr;
+
+	(*pos)++;
+
+	if (ipoib_mcast_iter_next(iter)) {
+		kfree(iter);
+		return NULL;
+	}
+
+	return iter;
+}
+
+static void ipoib_mcg_seq_stop(struct seq_file *file, void *iter_ptr)
+{
+	/* nothing for now */
+}
+
+static int ipoib_mcg_seq_show(struct seq_file *file, void *iter_ptr)
+{
+	struct ipoib_mcast_iter *iter = iter_ptr;
+	char gid_buf[sizeof "ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff"];
+	union ib_gid mgid;
+	unsigned long created;
+	unsigned int queuelen, complete, send_only;
+
+	if (!iter)
+		return 0;
+
+	ipoib_mcast_iter_read(iter, &mgid, &created, &queuelen,
+			      &complete, &send_only);
+
+	format_gid(&mgid, gid_buf);
+
+	seq_printf(file,
+		   "GID: %s\n"
+		   "  created: %10ld\n"
+		   "  queuelen: %9d\n"
+		   "  complete: %9s\n"
+		   "  send_only: %8s\n"
+		   "\n",
+		   gid_buf, created, queuelen,
+		   complete ? "yes" : "no",
+		   send_only ? "yes" : "no");
+
+	return 0;
+}
+
+static const struct seq_operations ipoib_mcg_seq_ops = {
+	.start = ipoib_mcg_seq_start,
+	.next  = ipoib_mcg_seq_next,
+	.stop  = ipoib_mcg_seq_stop,
+	.show  = ipoib_mcg_seq_show,
+};
+
+static int ipoib_mcg_open(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq;
+	int ret;
+
+	ret = seq_open(file, &ipoib_mcg_seq_ops);
+	if (ret)
+		return ret;
+
+	seq = file->private_data;
+	seq->private = inode->i_private;
+
+	return 0;
+}
+
+static const struct file_operations ipoib_mcg_fops = {
+	.owner   = THIS_MODULE,
+	.open    = ipoib_mcg_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release
+};
+
+static void *ipoib_path_seq_start(struct seq_file *file, loff_t *pos)
+{
+	struct ipoib_path_iter *iter;
+	loff_t n = *pos;
+
+	iter = ipoib_path_iter_init(file->private);
+	if (!iter)
+		return NULL;
+
+	while (n--) {
+		if (ipoib_path_iter_next(iter)) {
+			kfree(iter);
+			return NULL;
+		}
+	}
+
+	return iter;
+}
+
+static void *ipoib_path_seq_next(struct seq_file *file, void *iter_ptr,
+				   loff_t *pos)
+{
+	struct ipoib_path_iter *iter = iter_ptr;
+
+	(*pos)++;
+
+	if (ipoib_path_iter_next(iter)) {
+		kfree(iter);
+		return NULL;
+	}
+
+	return iter;
+}
+
+static void ipoib_path_seq_stop(struct seq_file *file, void *iter_ptr)
+{
+	/* nothing for now */
+}
+
+static int ipoib_path_seq_show(struct seq_file *file, void *iter_ptr)
+{
+	struct ipoib_path_iter *iter = iter_ptr;
+	char gid_buf[sizeof "ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff"];
+	struct ipoib_path path;
+	int rate;
+
+	if (!iter)
+		return 0;
+
+	ipoib_path_iter_read(iter, &path);
+
+	format_gid(&path.pathrec.dgid, gid_buf);
+
+	seq_printf(file,
+		   "GID: %s\n"
+		   "  complete: %6s\n",
+		   gid_buf, path.pathrec.dlid ? "yes" : "no");
+
+	if (path.pathrec.dlid) {
+		rate = ib_rate_to_mbps(path.pathrec.rate);
+
+		seq_printf(file,
+			   "  DLID:     0x%04x\n"
+			   "  SL: %12d\n"
+			   "  rate: %8d.%d Gb/sec\n",
+			   be16_to_cpu(path.pathrec.dlid),
+			   path.pathrec.sl,
+			   rate / 1000, rate % 1000);
+	}
+
+	seq_putc(file, '\n');
+
+	return 0;
+}
+
+static const struct seq_operations ipoib_path_seq_ops = {
+	.start = ipoib_path_seq_start,
+	.next  = ipoib_path_seq_next,
+	.stop  = ipoib_path_seq_stop,
+	.show  = ipoib_path_seq_show,
+};
+
+static int ipoib_path_open(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq;
+	int ret;
+
+	ret = seq_open(file, &ipoib_path_seq_ops);
+	if (ret)
+		return ret;
+
+	seq = file->private_data;
+	seq->private = inode->i_private;
+
+	return 0;
+}
+
+static const struct file_operations ipoib_path_fops = {
+	.owner   = THIS_MODULE,
+	.open    = ipoib_path_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release
+};
+
+void ipoib_create_debug_files(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	char name[IFNAMSIZ + sizeof "_path"];
+
+	snprintf(name, sizeof name, "%s_mcg", dev->name);
+	priv->mcg_dentry = debugfs_create_file(name, S_IFREG | S_IRUGO,
+					       ipoib_root, dev, &ipoib_mcg_fops);
+	if (!priv->mcg_dentry)
+		ipoib_warn(priv, "failed to create mcg debug file\n");
+
+	snprintf(name, sizeof name, "%s_path", dev->name);
+	priv->path_dentry = debugfs_create_file(name, S_IFREG | S_IRUGO,
+						ipoib_root, dev, &ipoib_path_fops);
+	if (!priv->path_dentry)
+		ipoib_warn(priv, "failed to create path debug file\n");
+}
+
+void ipoib_delete_debug_files(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	debugfs_remove(priv->mcg_dentry);
+	debugfs_remove(priv->path_dentry);
+}
+
+int ipoib_register_debugfs(void)
+{
+	ipoib_root = debugfs_create_dir("ipoib", NULL);
+	return ipoib_root ? 0 : -ENOMEM;
+}
+
+void ipoib_unregister_debugfs(void)
+{
+	debugfs_remove(ipoib_root);
+}
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
new file mode 100644
index 0000000..72626c3
--- /dev/null
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -0,0 +1,1106 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2004, 2005 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/delay.h>
+#include <linux/moduleparam.h>
+#include <linux/dma-mapping.h>
+#include <linux/slab.h>
+
+#include <linux/ip.h>
+#include <linux/tcp.h>
+
+#include "ipoib.h"
+
+#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA
+static int data_debug_level;
+
+module_param(data_debug_level, int, 0644);
+MODULE_PARM_DESC(data_debug_level,
+		 "Enable data path debug tracing if > 0");
+#endif
+
+static DEFINE_MUTEX(pkey_mutex);
+
+struct ipoib_ah *ipoib_create_ah(struct net_device *dev,
+				 struct ib_pd *pd, struct ib_ah_attr *attr)
+{
+	struct ipoib_ah *ah;
+	struct ib_ah *vah;
+
+	ah = kmalloc(sizeof *ah, GFP_KERNEL);
+	if (!ah)
+		return ERR_PTR(-ENOMEM);
+
+	ah->dev       = dev;
+	ah->last_send = 0;
+	kref_init(&ah->ref);
+
+	vah = ib_create_ah(pd, attr);
+	if (IS_ERR(vah)) {
+		kfree(ah);
+		ah = (struct ipoib_ah *)vah;
+	} else {
+		ah->ah = vah;
+		ipoib_dbg(netdev_priv(dev), "Created ah %p\n", ah->ah);
+	}
+
+	return ah;
+}
+
+void ipoib_free_ah(struct kref *kref)
+{
+	struct ipoib_ah *ah = container_of(kref, struct ipoib_ah, ref);
+	struct ipoib_dev_priv *priv = netdev_priv(ah->dev);
+
+	unsigned long flags;
+
+	spin_lock_irqsave(&priv->lock, flags);
+	list_add_tail(&ah->list, &priv->dead_ahs);
+	spin_unlock_irqrestore(&priv->lock, flags);
+}
+
+static void ipoib_ud_dma_unmap_rx(struct ipoib_dev_priv *priv,
+				  u64 mapping[IPOIB_UD_RX_SG])
+{
+	if (ipoib_ud_need_sg(priv->max_ib_mtu)) {
+		ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_UD_HEAD_SIZE,
+				    DMA_FROM_DEVICE);
+		ib_dma_unmap_page(priv->ca, mapping[1], PAGE_SIZE,
+				  DMA_FROM_DEVICE);
+	} else
+		ib_dma_unmap_single(priv->ca, mapping[0],
+				    IPOIB_UD_BUF_SIZE(priv->max_ib_mtu),
+				    DMA_FROM_DEVICE);
+}
+
+static void ipoib_ud_skb_put_frags(struct ipoib_dev_priv *priv,
+				   struct sk_buff *skb,
+				   unsigned int length)
+{
+	if (ipoib_ud_need_sg(priv->max_ib_mtu)) {
+		skb_frag_t *frag = &skb_shinfo(skb)->frags[0];
+		unsigned int size;
+		/*
+		 * There is only two buffers needed for max_payload = 4K,
+		 * first buf size is IPOIB_UD_HEAD_SIZE
+		 */
+		skb->tail += IPOIB_UD_HEAD_SIZE;
+		skb->len  += length;
+
+		size = length - IPOIB_UD_HEAD_SIZE;
+
+		skb_frag_size_set(frag, size);
+		skb->data_len += size;
+		skb->truesize += PAGE_SIZE;
+	} else
+		skb_put(skb, length);
+
+}
+
+static int ipoib_ib_post_receive(struct net_device *dev, int id)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ib_recv_wr *bad_wr;
+	int ret;
+
+	priv->rx_wr.wr_id   = id | IPOIB_OP_RECV;
+	priv->rx_sge[0].addr = priv->rx_ring[id].mapping[0];
+	priv->rx_sge[1].addr = priv->rx_ring[id].mapping[1];
+
+
+	ret = ib_post_recv(priv->qp, &priv->rx_wr, &bad_wr);
+	if (unlikely(ret)) {
+		ipoib_warn(priv, "receive failed for buf %d (%d)\n", id, ret);
+		ipoib_ud_dma_unmap_rx(priv, priv->rx_ring[id].mapping);
+		dev_kfree_skb_any(priv->rx_ring[id].skb);
+		priv->rx_ring[id].skb = NULL;
+	}
+
+	return ret;
+}
+
+static struct sk_buff *ipoib_alloc_rx_skb(struct net_device *dev, int id)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct sk_buff *skb;
+	int buf_size;
+	int tailroom;
+	u64 *mapping;
+
+	if (ipoib_ud_need_sg(priv->max_ib_mtu)) {
+		buf_size = IPOIB_UD_HEAD_SIZE;
+		tailroom = 128; /* reserve some tailroom for IP/TCP headers */
+	} else {
+		buf_size = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu);
+		tailroom = 0;
+	}
+
+	skb = dev_alloc_skb(buf_size + tailroom + 4);
+	if (unlikely(!skb))
+		return NULL;
+
+	/*
+	 * IB will leave a 40 byte gap for a GRH and IPoIB adds a 4 byte
+	 * header.  So we need 4 more bytes to get to 48 and align the
+	 * IP header to a multiple of 16.
+	 */
+	skb_reserve(skb, 4);
+
+	mapping = priv->rx_ring[id].mapping;
+	mapping[0] = ib_dma_map_single(priv->ca, skb->data, buf_size,
+				       DMA_FROM_DEVICE);
+	if (unlikely(ib_dma_mapping_error(priv->ca, mapping[0])))
+		goto error;
+
+	if (ipoib_ud_need_sg(priv->max_ib_mtu)) {
+		struct page *page = alloc_page(GFP_ATOMIC);
+		if (!page)
+			goto partial_error;
+		skb_fill_page_desc(skb, 0, page, 0, PAGE_SIZE);
+		mapping[1] =
+			ib_dma_map_page(priv->ca, page,
+					0, PAGE_SIZE, DMA_FROM_DEVICE);
+		if (unlikely(ib_dma_mapping_error(priv->ca, mapping[1])))
+			goto partial_error;
+	}
+
+	priv->rx_ring[id].skb = skb;
+	return skb;
+
+partial_error:
+	ib_dma_unmap_single(priv->ca, mapping[0], buf_size, DMA_FROM_DEVICE);
+error:
+	dev_kfree_skb_any(skb);
+	return NULL;
+}
+
+static int ipoib_ib_post_receives(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	int i;
+
+	for (i = 0; i < ipoib_recvq_size; ++i) {
+		if (!ipoib_alloc_rx_skb(dev, i)) {
+			ipoib_warn(priv, "failed to allocate receive buffer %d\n", i);
+			return -ENOMEM;
+		}
+		if (ipoib_ib_post_receive(dev, i)) {
+			ipoib_warn(priv, "ipoib_ib_post_receive failed for buf %d\n", i);
+			return -EIO;
+		}
+	}
+
+	return 0;
+}
+
+static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	unsigned int wr_id = wc->wr_id & ~IPOIB_OP_RECV;
+	struct sk_buff *skb;
+	u64 mapping[IPOIB_UD_RX_SG];
+	union ib_gid *dgid;
+
+	ipoib_dbg_data(priv, "recv completion: id %d, status: %d\n",
+		       wr_id, wc->status);
+
+	if (unlikely(wr_id >= ipoib_recvq_size)) {
+		ipoib_warn(priv, "recv completion event with wrid %d (> %d)\n",
+			   wr_id, ipoib_recvq_size);
+		return;
+	}
+
+	skb  = priv->rx_ring[wr_id].skb;
+
+	if (unlikely(wc->status != IB_WC_SUCCESS)) {
+		if (wc->status != IB_WC_WR_FLUSH_ERR)
+			ipoib_warn(priv, "failed recv event "
+				   "(status=%d, wrid=%d vend_err %x)\n",
+				   wc->status, wr_id, wc->vendor_err);
+		ipoib_ud_dma_unmap_rx(priv, priv->rx_ring[wr_id].mapping);
+		dev_kfree_skb_any(skb);
+		priv->rx_ring[wr_id].skb = NULL;
+		return;
+	}
+
+	/*
+	 * Drop packets that this interface sent, ie multicast packets
+	 * that the HCA has replicated.
+	 */
+	if (wc->slid == priv->local_lid && wc->src_qp == priv->qp->qp_num)
+		goto repost;
+
+	memcpy(mapping, priv->rx_ring[wr_id].mapping,
+	       IPOIB_UD_RX_SG * sizeof *mapping);
+
+	/*
+	 * If we can't allocate a new RX buffer, dump
+	 * this packet and reuse the old buffer.
+	 */
+	if (unlikely(!ipoib_alloc_rx_skb(dev, wr_id))) {
+		++dev->stats.rx_dropped;
+		goto repost;
+	}
+
+	ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n",
+		       wc->byte_len, wc->slid);
+
+	ipoib_ud_dma_unmap_rx(priv, mapping);
+	ipoib_ud_skb_put_frags(priv, skb, wc->byte_len);
+
+	/* First byte of dgid signals multicast when 0xff */
+	dgid = &((struct ib_grh *)skb->data)->dgid;
+
+	if (!(wc->wc_flags & IB_WC_GRH) || dgid->raw[0] != 0xff)
+		skb->pkt_type = PACKET_HOST;
+	else if (memcmp(dgid, dev->broadcast + 4, sizeof(union ib_gid)) == 0)
+		skb->pkt_type = PACKET_BROADCAST;
+	else
+		skb->pkt_type = PACKET_MULTICAST;
+
+	skb_pull(skb, IB_GRH_BYTES);
+
+	skb->protocol = ((struct ipoib_header *) skb->data)->proto;
+	skb_reset_mac_header(skb);
+	skb_pull(skb, IPOIB_ENCAP_LEN);
+
+	++dev->stats.rx_packets;
+	dev->stats.rx_bytes += skb->len;
+
+	skb->dev = dev;
+	if ((dev->features & NETIF_F_RXCSUM) &&
+			likely(wc->wc_flags & IB_WC_IP_CSUM_OK))
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+	napi_gro_receive(&priv->napi, skb);
+
+repost:
+	if (unlikely(ipoib_ib_post_receive(dev, wr_id)))
+		ipoib_warn(priv, "ipoib_ib_post_receive failed "
+			   "for buf %d\n", wr_id);
+}
+
+static int ipoib_dma_map_tx(struct ib_device *ca,
+			    struct ipoib_tx_buf *tx_req)
+{
+	struct sk_buff *skb = tx_req->skb;
+	u64 *mapping = tx_req->mapping;
+	int i;
+	int off;
+
+	if (skb_headlen(skb)) {
+		mapping[0] = ib_dma_map_single(ca, skb->data, skb_headlen(skb),
+					       DMA_TO_DEVICE);
+		if (unlikely(ib_dma_mapping_error(ca, mapping[0])))
+			return -EIO;
+
+		off = 1;
+	} else
+		off = 0;
+
+	for (i = 0; i < skb_shinfo(skb)->nr_frags; ++i) {
+		const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+		mapping[i + off] = ib_dma_map_page(ca,
+						 skb_frag_page(frag),
+						 frag->page_offset, skb_frag_size(frag),
+						 DMA_TO_DEVICE);
+		if (unlikely(ib_dma_mapping_error(ca, mapping[i + off])))
+			goto partial_error;
+	}
+	return 0;
+
+partial_error:
+	for (; i > 0; --i) {
+		const skb_frag_t *frag = &skb_shinfo(skb)->frags[i - 1];
+
+		ib_dma_unmap_page(ca, mapping[i - !off], skb_frag_size(frag), DMA_TO_DEVICE);
+	}
+
+	if (off)
+		ib_dma_unmap_single(ca, mapping[0], skb_headlen(skb), DMA_TO_DEVICE);
+
+	return -EIO;
+}
+
+static void ipoib_dma_unmap_tx(struct ib_device *ca,
+			       struct ipoib_tx_buf *tx_req)
+{
+	struct sk_buff *skb = tx_req->skb;
+	u64 *mapping = tx_req->mapping;
+	int i;
+	int off;
+
+	if (skb_headlen(skb)) {
+		ib_dma_unmap_single(ca, mapping[0], skb_headlen(skb), DMA_TO_DEVICE);
+		off = 1;
+	} else
+		off = 0;
+
+	for (i = 0; i < skb_shinfo(skb)->nr_frags; ++i) {
+		const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+
+		ib_dma_unmap_page(ca, mapping[i + off], skb_frag_size(frag),
+				  DMA_TO_DEVICE);
+	}
+}
+
+static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	unsigned int wr_id = wc->wr_id;
+	struct ipoib_tx_buf *tx_req;
+
+	ipoib_dbg_data(priv, "send completion: id %d, status: %d\n",
+		       wr_id, wc->status);
+
+	if (unlikely(wr_id >= ipoib_sendq_size)) {
+		ipoib_warn(priv, "send completion event with wrid %d (> %d)\n",
+			   wr_id, ipoib_sendq_size);
+		return;
+	}
+
+	tx_req = &priv->tx_ring[wr_id];
+
+	ipoib_dma_unmap_tx(priv->ca, tx_req);
+
+	++dev->stats.tx_packets;
+	dev->stats.tx_bytes += tx_req->skb->len;
+
+	dev_kfree_skb_any(tx_req->skb);
+
+	++priv->tx_tail;
+	if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
+	    netif_queue_stopped(dev) &&
+	    test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
+		netif_wake_queue(dev);
+
+	if (wc->status != IB_WC_SUCCESS &&
+	    wc->status != IB_WC_WR_FLUSH_ERR)
+		ipoib_warn(priv, "failed send event "
+			   "(status=%d, wrid=%d vend_err %x)\n",
+			   wc->status, wr_id, wc->vendor_err);
+}
+
+static int poll_tx(struct ipoib_dev_priv *priv)
+{
+	int n, i;
+
+	n = ib_poll_cq(priv->send_cq, MAX_SEND_CQE, priv->send_wc);
+	for (i = 0; i < n; ++i)
+		ipoib_ib_handle_tx_wc(priv->dev, priv->send_wc + i);
+
+	return n == MAX_SEND_CQE;
+}
+
+int ipoib_poll(struct napi_struct *napi, int budget)
+{
+	struct ipoib_dev_priv *priv = container_of(napi, struct ipoib_dev_priv, napi);
+	struct net_device *dev = priv->dev;
+	int done;
+	int t;
+	int n, i;
+
+	done  = 0;
+
+poll_more:
+	while (done < budget) {
+		int max = (budget - done);
+
+		t = min(IPOIB_NUM_WC, max);
+		n = ib_poll_cq(priv->recv_cq, t, priv->ibwc);
+
+		for (i = 0; i < n; i++) {
+			struct ib_wc *wc = priv->ibwc + i;
+
+			if (wc->wr_id & IPOIB_OP_RECV) {
+				++done;
+				if (wc->wr_id & IPOIB_OP_CM)
+					ipoib_cm_handle_rx_wc(dev, wc);
+				else
+					ipoib_ib_handle_rx_wc(dev, wc);
+			} else
+				ipoib_cm_handle_tx_wc(priv->dev, wc);
+		}
+
+		if (n != t)
+			break;
+	}
+
+	if (done < budget) {
+		napi_complete(napi);
+		if (unlikely(ib_req_notify_cq(priv->recv_cq,
+					      IB_CQ_NEXT_COMP |
+					      IB_CQ_REPORT_MISSED_EVENTS)) &&
+		    napi_reschedule(napi))
+			goto poll_more;
+	}
+
+	return done;
+}
+
+void ipoib_ib_completion(struct ib_cq *cq, void *dev_ptr)
+{
+	struct net_device *dev = dev_ptr;
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	napi_schedule(&priv->napi);
+}
+
+static void drain_tx_cq(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	netif_tx_lock(dev);
+	while (poll_tx(priv))
+		; /* nothing */
+
+	if (netif_queue_stopped(dev))
+		mod_timer(&priv->poll_timer, jiffies + 1);
+
+	netif_tx_unlock(dev);
+}
+
+void ipoib_send_comp_handler(struct ib_cq *cq, void *dev_ptr)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev_ptr);
+
+	mod_timer(&priv->poll_timer, jiffies);
+}
+
+static inline int post_send(struct ipoib_dev_priv *priv,
+			    unsigned int wr_id,
+			    struct ib_ah *address, u32 qpn,
+			    struct ipoib_tx_buf *tx_req,
+			    void *head, int hlen)
+{
+	struct ib_send_wr *bad_wr;
+	int i, off;
+	struct sk_buff *skb = tx_req->skb;
+	skb_frag_t *frags = skb_shinfo(skb)->frags;
+	int nr_frags = skb_shinfo(skb)->nr_frags;
+	u64 *mapping = tx_req->mapping;
+
+	if (skb_headlen(skb)) {
+		priv->tx_sge[0].addr         = mapping[0];
+		priv->tx_sge[0].length       = skb_headlen(skb);
+		off = 1;
+	} else
+		off = 0;
+
+	for (i = 0; i < nr_frags; ++i) {
+		priv->tx_sge[i + off].addr = mapping[i + off];
+		priv->tx_sge[i + off].length = skb_frag_size(&frags[i]);
+	}
+	priv->tx_wr.num_sge	     = nr_frags + off;
+	priv->tx_wr.wr_id 	     = wr_id;
+	priv->tx_wr.wr.ud.remote_qpn = qpn;
+	priv->tx_wr.wr.ud.ah 	     = address;
+
+	if (head) {
+		priv->tx_wr.wr.ud.mss	 = skb_shinfo(skb)->gso_size;
+		priv->tx_wr.wr.ud.header = head;
+		priv->tx_wr.wr.ud.hlen	 = hlen;
+		priv->tx_wr.opcode	 = IB_WR_LSO;
+	} else
+		priv->tx_wr.opcode	 = IB_WR_SEND;
+
+	return ib_post_send(priv->qp, &priv->tx_wr, &bad_wr);
+}
+
+void ipoib_send(struct net_device *dev, struct sk_buff *skb,
+		struct ipoib_ah *address, u32 qpn)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ipoib_tx_buf *tx_req;
+	int hlen, rc;
+	void *phead;
+
+	if (skb_is_gso(skb)) {
+		hlen = skb_transport_offset(skb) + tcp_hdrlen(skb);
+		phead = skb->data;
+		if (unlikely(!skb_pull(skb, hlen))) {
+			ipoib_warn(priv, "linear data too small\n");
+			++dev->stats.tx_dropped;
+			++dev->stats.tx_errors;
+			dev_kfree_skb_any(skb);
+			return;
+		}
+	} else {
+		if (unlikely(skb->len > priv->mcast_mtu + IPOIB_ENCAP_LEN)) {
+			ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n",
+				   skb->len, priv->mcast_mtu + IPOIB_ENCAP_LEN);
+			++dev->stats.tx_dropped;
+			++dev->stats.tx_errors;
+			ipoib_cm_skb_too_long(dev, skb, priv->mcast_mtu);
+			return;
+		}
+		phead = NULL;
+		hlen  = 0;
+	}
+
+	ipoib_dbg_data(priv, "sending packet, length=%d address=%p qpn=0x%06x\n",
+		       skb->len, address, qpn);
+
+	/*
+	 * We put the skb into the tx_ring _before_ we call post_send()
+	 * because it's entirely possible that the completion handler will
+	 * run before we execute anything after the post_send().  That
+	 * means we have to make sure everything is properly recorded and
+	 * our state is consistent before we call post_send().
+	 */
+	tx_req = &priv->tx_ring[priv->tx_head & (ipoib_sendq_size - 1)];
+	tx_req->skb = skb;
+	if (unlikely(ipoib_dma_map_tx(priv->ca, tx_req))) {
+		++dev->stats.tx_errors;
+		dev_kfree_skb_any(skb);
+		return;
+	}
+
+	if (skb->ip_summed == CHECKSUM_PARTIAL)
+		priv->tx_wr.send_flags |= IB_SEND_IP_CSUM;
+	else
+		priv->tx_wr.send_flags &= ~IB_SEND_IP_CSUM;
+
+	if (++priv->tx_outstanding == ipoib_sendq_size) {
+		ipoib_dbg(priv, "TX ring full, stopping kernel net queue\n");
+		if (ib_req_notify_cq(priv->send_cq, IB_CQ_NEXT_COMP))
+			ipoib_warn(priv, "request notify on send CQ failed\n");
+		netif_stop_queue(dev);
+	}
+
+	skb_orphan(skb);
+	skb_dst_drop(skb);
+
+	rc = post_send(priv, priv->tx_head & (ipoib_sendq_size - 1),
+		       address->ah, qpn, tx_req, phead, hlen);
+	if (unlikely(rc)) {
+		ipoib_warn(priv, "post_send failed, error %d\n", rc);
+		++dev->stats.tx_errors;
+		--priv->tx_outstanding;
+		ipoib_dma_unmap_tx(priv->ca, tx_req);
+		dev_kfree_skb_any(skb);
+		if (netif_queue_stopped(dev))
+			netif_wake_queue(dev);
+	} else {
+		dev->trans_start = jiffies;
+
+		address->last_send = priv->tx_head;
+		++priv->tx_head;
+	}
+
+	if (unlikely(priv->tx_outstanding > MAX_SEND_CQE))
+		while (poll_tx(priv))
+			; /* nothing */
+}
+
+static void __ipoib_reap_ah(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ipoib_ah *ah, *tah;
+	LIST_HEAD(remove_list);
+	unsigned long flags;
+
+	netif_tx_lock_bh(dev);
+	spin_lock_irqsave(&priv->lock, flags);
+
+	list_for_each_entry_safe(ah, tah, &priv->dead_ahs, list)
+		if ((int) priv->tx_tail - (int) ah->last_send >= 0) {
+			list_del(&ah->list);
+			ib_destroy_ah(ah->ah);
+			kfree(ah);
+		}
+
+	spin_unlock_irqrestore(&priv->lock, flags);
+	netif_tx_unlock_bh(dev);
+}
+
+void ipoib_reap_ah(struct work_struct *work)
+{
+	struct ipoib_dev_priv *priv =
+		container_of(work, struct ipoib_dev_priv, ah_reap_task.work);
+	struct net_device *dev = priv->dev;
+
+	__ipoib_reap_ah(dev);
+
+	if (!test_bit(IPOIB_STOP_REAPER, &priv->flags))
+		queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task,
+				   round_jiffies_relative(HZ));
+}
+
+static void ipoib_ib_tx_timer_func(unsigned long ctx)
+{
+	drain_tx_cq((struct net_device *)ctx);
+}
+
+int ipoib_ib_dev_open(struct net_device *dev, int flush)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	int ret;
+
+	ipoib_pkey_dev_check_presence(dev);
+
+	if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) {
+		ipoib_warn(priv, "P_Key 0x%04x is %s\n", priv->pkey,
+			   (!(priv->pkey & 0x7fff) ? "Invalid" : "not found"));
+		return -1;
+	}
+
+	ret = ipoib_init_qp(dev);
+	if (ret) {
+		ipoib_warn(priv, "ipoib_init_qp returned %d\n", ret);
+		return -1;
+	}
+
+	ret = ipoib_ib_post_receives(dev);
+	if (ret) {
+		ipoib_warn(priv, "ipoib_ib_post_receives returned %d\n", ret);
+		goto dev_stop;
+	}
+
+	ret = ipoib_cm_dev_open(dev);
+	if (ret) {
+		ipoib_warn(priv, "ipoib_cm_dev_open returned %d\n", ret);
+		goto dev_stop;
+	}
+
+	clear_bit(IPOIB_STOP_REAPER, &priv->flags);
+	queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task,
+			   round_jiffies_relative(HZ));
+
+	if (!test_and_set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags))
+		napi_enable(&priv->napi);
+
+	return 0;
+dev_stop:
+	if (!test_and_set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags))
+		napi_enable(&priv->napi);
+	ipoib_ib_dev_stop(dev, flush);
+	return -1;
+}
+
+void ipoib_pkey_dev_check_presence(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	if (!(priv->pkey & 0x7fff) ||
+	    ib_find_pkey(priv->ca, priv->port, priv->pkey,
+			 &priv->pkey_index))
+		clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+	else
+		set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+}
+
+int ipoib_ib_dev_up(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	ipoib_pkey_dev_check_presence(dev);
+
+	if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) {
+		ipoib_dbg(priv, "PKEY is not assigned.\n");
+		return 0;
+	}
+
+	set_bit(IPOIB_FLAG_OPER_UP, &priv->flags);
+
+	return ipoib_mcast_start_thread(dev);
+}
+
+int ipoib_ib_dev_down(struct net_device *dev, int flush)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	ipoib_dbg(priv, "downing ib_dev\n");
+
+	clear_bit(IPOIB_FLAG_OPER_UP, &priv->flags);
+	netif_carrier_off(dev);
+
+	ipoib_mcast_stop_thread(dev, flush);
+	ipoib_mcast_dev_flush(dev);
+
+	ipoib_flush_paths(dev);
+
+	return 0;
+}
+
+static int recvs_pending(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	int pending = 0;
+	int i;
+
+	for (i = 0; i < ipoib_recvq_size; ++i)
+		if (priv->rx_ring[i].skb)
+			++pending;
+
+	return pending;
+}
+
+void ipoib_drain_cq(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	int i, n;
+
+	/*
+	 * We call completion handling routines that expect to be
+	 * called from the BH-disabled NAPI poll context, so disable
+	 * BHs here too.
+	 */
+	local_bh_disable();
+
+	do {
+		n = ib_poll_cq(priv->recv_cq, IPOIB_NUM_WC, priv->ibwc);
+		for (i = 0; i < n; ++i) {
+			/*
+			 * Convert any successful completions to flush
+			 * errors to avoid passing packets up the
+			 * stack after bringing the device down.
+			 */
+			if (priv->ibwc[i].status == IB_WC_SUCCESS)
+				priv->ibwc[i].status = IB_WC_WR_FLUSH_ERR;
+
+			if (priv->ibwc[i].wr_id & IPOIB_OP_RECV) {
+				if (priv->ibwc[i].wr_id & IPOIB_OP_CM)
+					ipoib_cm_handle_rx_wc(dev, priv->ibwc + i);
+				else
+					ipoib_ib_handle_rx_wc(dev, priv->ibwc + i);
+			} else
+				ipoib_cm_handle_tx_wc(dev, priv->ibwc + i);
+		}
+	} while (n == IPOIB_NUM_WC);
+
+	while (poll_tx(priv))
+		; /* nothing */
+
+	local_bh_enable();
+}
+
+int ipoib_ib_dev_stop(struct net_device *dev, int flush)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ib_qp_attr qp_attr;
+	unsigned long begin;
+	struct ipoib_tx_buf *tx_req;
+	int i;
+
+	if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &priv->flags))
+		napi_disable(&priv->napi);
+
+	ipoib_cm_dev_stop(dev);
+
+	/*
+	 * Move our QP to the error state and then reinitialize in
+	 * when all work requests have completed or have been flushed.
+	 */
+	qp_attr.qp_state = IB_QPS_ERR;
+	if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE))
+		ipoib_warn(priv, "Failed to modify QP to ERROR state\n");
+
+	/* Wait for all sends and receives to complete */
+	begin = jiffies;
+
+	while (priv->tx_head != priv->tx_tail || recvs_pending(dev)) {
+		if (time_after(jiffies, begin + 5 * HZ)) {
+			ipoib_warn(priv, "timing out; %d sends %d receives not completed\n",
+				   priv->tx_head - priv->tx_tail, recvs_pending(dev));
+
+			/*
+			 * assume the HW is wedged and just free up
+			 * all our pending work requests.
+			 */
+			while ((int) priv->tx_tail - (int) priv->tx_head < 0) {
+				tx_req = &priv->tx_ring[priv->tx_tail &
+							(ipoib_sendq_size - 1)];
+				ipoib_dma_unmap_tx(priv->ca, tx_req);
+				dev_kfree_skb_any(tx_req->skb);
+				++priv->tx_tail;
+				--priv->tx_outstanding;
+			}
+
+			for (i = 0; i < ipoib_recvq_size; ++i) {
+				struct ipoib_rx_buf *rx_req;
+
+				rx_req = &priv->rx_ring[i];
+				if (!rx_req->skb)
+					continue;
+				ipoib_ud_dma_unmap_rx(priv,
+						      priv->rx_ring[i].mapping);
+				dev_kfree_skb_any(rx_req->skb);
+				rx_req->skb = NULL;
+			}
+
+			goto timeout;
+		}
+
+		ipoib_drain_cq(dev);
+
+		msleep(1);
+	}
+
+	ipoib_dbg(priv, "All sends and receives done.\n");
+
+timeout:
+	del_timer_sync(&priv->poll_timer);
+	qp_attr.qp_state = IB_QPS_RESET;
+	if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE))
+		ipoib_warn(priv, "Failed to modify QP to RESET state\n");
+
+	/* Wait for all AHs to be reaped */
+	set_bit(IPOIB_STOP_REAPER, &priv->flags);
+	cancel_delayed_work(&priv->ah_reap_task);
+	if (flush)
+		flush_workqueue(ipoib_workqueue);
+
+	begin = jiffies;
+
+	while (!list_empty(&priv->dead_ahs)) {
+		__ipoib_reap_ah(dev);
+
+		if (time_after(jiffies, begin + HZ)) {
+			ipoib_warn(priv, "timing out; will leak address handles\n");
+			break;
+		}
+
+		msleep(1);
+	}
+
+	ib_req_notify_cq(priv->recv_cq, IB_CQ_NEXT_COMP);
+
+	return 0;
+}
+
+int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	priv->ca = ca;
+	priv->port = port;
+	priv->qp = NULL;
+
+	if (ipoib_transport_dev_init(dev, ca)) {
+		printk(KERN_WARNING "%s: ipoib_transport_dev_init failed\n", ca->name);
+		return -ENODEV;
+	}
+
+	setup_timer(&priv->poll_timer, ipoib_ib_tx_timer_func,
+		    (unsigned long) dev);
+
+	if (dev->flags & IFF_UP) {
+		if (ipoib_ib_dev_open(dev, 1)) {
+			ipoib_transport_dev_cleanup(dev);
+			return -ENODEV;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Takes whatever value which is in pkey index 0 and updates priv->pkey
+ * returns 0 if the pkey value was changed.
+ */
+static inline int update_parent_pkey(struct ipoib_dev_priv *priv)
+{
+	int result;
+	u16 prev_pkey;
+
+	prev_pkey = priv->pkey;
+	result = ib_query_pkey(priv->ca, priv->port, 0, &priv->pkey);
+	if (result) {
+		ipoib_warn(priv, "ib_query_pkey port %d failed (ret = %d)\n",
+			   priv->port, result);
+		return result;
+	}
+
+	priv->pkey |= 0x8000;
+
+	if (prev_pkey != priv->pkey) {
+		ipoib_dbg(priv, "pkey changed from 0x%x to 0x%x\n",
+			  prev_pkey, priv->pkey);
+		/*
+		 * Update the pkey in the broadcast address, while making sure to set
+		 * the full membership bit, so that we join the right broadcast group.
+		 */
+		priv->dev->broadcast[8] = priv->pkey >> 8;
+		priv->dev->broadcast[9] = priv->pkey & 0xff;
+		return 0;
+	}
+
+	return 1;
+}
+/*
+ * returns 0 if pkey value was found in a different slot.
+ */
+static inline int update_child_pkey(struct ipoib_dev_priv *priv)
+{
+	u16 old_index = priv->pkey_index;
+
+	priv->pkey_index = 0;
+	ipoib_pkey_dev_check_presence(priv->dev);
+
+	if (test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags) &&
+	    (old_index == priv->pkey_index))
+		return 1;
+	return 0;
+}
+
+static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv,
+				enum ipoib_flush_level level)
+{
+	struct ipoib_dev_priv *cpriv;
+	struct net_device *dev = priv->dev;
+	int result;
+
+	down_read(&priv->vlan_rwsem);
+
+	/*
+	 * Flush any child interfaces too -- they might be up even if
+	 * the parent is down.
+	 */
+	list_for_each_entry(cpriv, &priv->child_intfs, list)
+		__ipoib_ib_dev_flush(cpriv, level);
+
+	up_read(&priv->vlan_rwsem);
+
+	if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags) &&
+	    level != IPOIB_FLUSH_HEAVY) {
+		ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_INITIALIZED not set.\n");
+		return;
+	}
+
+	if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) {
+		/* interface is down. update pkey and leave. */
+		if (level == IPOIB_FLUSH_HEAVY) {
+			if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags))
+				update_parent_pkey(priv);
+			else
+				update_child_pkey(priv);
+		}
+		ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_ADMIN_UP not set.\n");
+		return;
+	}
+
+	if (level == IPOIB_FLUSH_HEAVY) {
+		/* child devices chase their origin pkey value, while non-child
+		 * (parent) devices should always takes what present in pkey index 0
+		 */
+		if (test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
+			result = update_child_pkey(priv);
+			if (result) {
+				/* restart QP only if P_Key index is changed */
+				ipoib_dbg(priv, "Not flushing - P_Key index not changed.\n");
+				return;
+			}
+
+		} else {
+			result = update_parent_pkey(priv);
+			/* restart QP only if P_Key value changed */
+			if (result) {
+				ipoib_dbg(priv, "Not flushing - P_Key value not changed.\n");
+				return;
+			}
+		}
+	}
+
+	if (level == IPOIB_FLUSH_LIGHT) {
+		ipoib_mark_paths_invalid(dev);
+		ipoib_mcast_dev_flush(dev);
+	}
+
+	if (level >= IPOIB_FLUSH_NORMAL)
+		ipoib_ib_dev_down(dev, 0);
+
+	if (level == IPOIB_FLUSH_HEAVY) {
+		if (test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags))
+			ipoib_ib_dev_stop(dev, 0);
+		if (ipoib_ib_dev_open(dev, 0) != 0)
+			return;
+		if (netif_queue_stopped(dev))
+			netif_start_queue(dev);
+	}
+
+	/*
+	 * The device could have been brought down between the start and when
+	 * we get here, don't bring it back up if it's not configured up
+	 */
+	if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) {
+		if (level >= IPOIB_FLUSH_NORMAL)
+			ipoib_ib_dev_up(dev);
+		ipoib_mcast_restart_task(&priv->restart_task);
+	}
+}
+
+void ipoib_ib_dev_flush_light(struct work_struct *work)
+{
+	struct ipoib_dev_priv *priv =
+		container_of(work, struct ipoib_dev_priv, flush_light);
+
+	__ipoib_ib_dev_flush(priv, IPOIB_FLUSH_LIGHT);
+}
+
+void ipoib_ib_dev_flush_normal(struct work_struct *work)
+{
+	struct ipoib_dev_priv *priv =
+		container_of(work, struct ipoib_dev_priv, flush_normal);
+
+	__ipoib_ib_dev_flush(priv, IPOIB_FLUSH_NORMAL);
+}
+
+void ipoib_ib_dev_flush_heavy(struct work_struct *work)
+{
+	struct ipoib_dev_priv *priv =
+		container_of(work, struct ipoib_dev_priv, flush_heavy);
+
+	__ipoib_ib_dev_flush(priv, IPOIB_FLUSH_HEAVY);
+}
+
+void ipoib_ib_dev_cleanup(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	ipoib_dbg(priv, "cleaning up ib_dev\n");
+	/*
+	 * We must make sure there are no more (path) completions
+	 * that may wish to touch priv fields that are no longer valid
+	 */
+	ipoib_flush_paths(dev);
+
+	ipoib_mcast_stop_thread(dev, 1);
+	ipoib_mcast_dev_flush(dev);
+
+	ipoib_transport_dev_cleanup(dev);
+}
+
+
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
new file mode 100644
index 0000000..58b5aa3
--- /dev/null
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -0,0 +1,1793 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "ipoib.h"
+
+#include <linux/module.h>
+
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/vmalloc.h>
+
+#include <linux/if_arp.h>	/* For ARPHRD_xxx */
+
+#include <linux/ip.h>
+#include <linux/in.h>
+
+#include <linux/jhash.h>
+#include <net/arp.h>
+
+#define DRV_VERSION "1.0.0"
+
+const char ipoib_driver_version[] = DRV_VERSION;
+
+MODULE_AUTHOR("Roland Dreier");
+MODULE_DESCRIPTION("IP-over-InfiniBand net driver");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_VERSION(DRV_VERSION);
+
+int ipoib_sendq_size __read_mostly = IPOIB_TX_RING_SIZE;
+int ipoib_recvq_size __read_mostly = IPOIB_RX_RING_SIZE;
+
+module_param_named(send_queue_size, ipoib_sendq_size, int, 0444);
+MODULE_PARM_DESC(send_queue_size, "Number of descriptors in send queue");
+module_param_named(recv_queue_size, ipoib_recvq_size, int, 0444);
+MODULE_PARM_DESC(recv_queue_size, "Number of descriptors in receive queue");
+
+#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
+int ipoib_debug_level;
+
+module_param_named(debug_level, ipoib_debug_level, int, 0644);
+MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0");
+#endif
+
+struct ipoib_path_iter {
+	struct net_device *dev;
+	struct ipoib_path  path;
+};
+
+static const u8 ipv4_bcast_addr[] = {
+	0x00, 0xff, 0xff, 0xff,
+	0xff, 0x12, 0x40, 0x1b,	0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00,	0xff, 0xff, 0xff, 0xff
+};
+
+struct workqueue_struct *ipoib_workqueue;
+
+struct ib_sa_client ipoib_sa_client;
+
+static void ipoib_add_one(struct ib_device *device);
+static void ipoib_remove_one(struct ib_device *device);
+static void ipoib_neigh_reclaim(struct rcu_head *rp);
+
+static struct ib_client ipoib_client = {
+	.name   = "ipoib",
+	.add    = ipoib_add_one,
+	.remove = ipoib_remove_one
+};
+
+int ipoib_open(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	ipoib_dbg(priv, "bringing up interface\n");
+
+	netif_carrier_off(dev);
+
+	set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
+
+	if (ipoib_ib_dev_open(dev, 1)) {
+		if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags))
+			return 0;
+		goto err_disable;
+	}
+
+	if (ipoib_ib_dev_up(dev))
+		goto err_stop;
+
+	if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
+		struct ipoib_dev_priv *cpriv;
+
+		/* Bring up any child interfaces too */
+		down_read(&priv->vlan_rwsem);
+		list_for_each_entry(cpriv, &priv->child_intfs, list) {
+			int flags;
+
+			flags = cpriv->dev->flags;
+			if (flags & IFF_UP)
+				continue;
+
+			dev_change_flags(cpriv->dev, flags | IFF_UP);
+		}
+		up_read(&priv->vlan_rwsem);
+	}
+
+	netif_start_queue(dev);
+
+	return 0;
+
+err_stop:
+	ipoib_ib_dev_stop(dev, 1);
+
+err_disable:
+	clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
+
+	return -EINVAL;
+}
+
+static int ipoib_stop(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	ipoib_dbg(priv, "stopping interface\n");
+
+	clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
+
+	netif_stop_queue(dev);
+
+	ipoib_ib_dev_down(dev, 1);
+	ipoib_ib_dev_stop(dev, 0);
+
+	if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
+		struct ipoib_dev_priv *cpriv;
+
+		/* Bring down any child interfaces too */
+		down_read(&priv->vlan_rwsem);
+		list_for_each_entry(cpriv, &priv->child_intfs, list) {
+			int flags;
+
+			flags = cpriv->dev->flags;
+			if (!(flags & IFF_UP))
+				continue;
+
+			dev_change_flags(cpriv->dev, flags & ~IFF_UP);
+		}
+		up_read(&priv->vlan_rwsem);
+	}
+
+	return 0;
+}
+
+static void ipoib_uninit(struct net_device *dev)
+{
+	ipoib_dev_cleanup(dev);
+}
+
+static netdev_features_t ipoib_fix_features(struct net_device *dev, netdev_features_t features)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	if (test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags))
+		features &= ~(NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO);
+
+	return features;
+}
+
+static int ipoib_change_mtu(struct net_device *dev, int new_mtu)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	/* dev->mtu > 2K ==> connected mode */
+	if (ipoib_cm_admin_enabled(dev)) {
+		if (new_mtu > ipoib_cm_max_mtu(dev))
+			return -EINVAL;
+
+		if (new_mtu > priv->mcast_mtu)
+			ipoib_warn(priv, "mtu > %d will cause multicast packet drops.\n",
+				   priv->mcast_mtu);
+
+		dev->mtu = new_mtu;
+		return 0;
+	}
+
+	if (new_mtu > IPOIB_UD_MTU(priv->max_ib_mtu))
+		return -EINVAL;
+
+	priv->admin_mtu = new_mtu;
+
+	dev->mtu = min(priv->mcast_mtu, priv->admin_mtu);
+
+	return 0;
+}
+
+int ipoib_set_mode(struct net_device *dev, const char *buf)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	/* flush paths if we switch modes so that connections are restarted */
+	if (IPOIB_CM_SUPPORTED(dev->dev_addr) && !strcmp(buf, "connected\n")) {
+		set_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
+		ipoib_warn(priv, "enabling connected mode "
+			   "will cause multicast packet drops\n");
+		netdev_update_features(dev);
+		rtnl_unlock();
+		priv->tx_wr.send_flags &= ~IB_SEND_IP_CSUM;
+
+		ipoib_flush_paths(dev);
+		rtnl_lock();
+		return 0;
+	}
+
+	if (!strcmp(buf, "datagram\n")) {
+		clear_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
+		netdev_update_features(dev);
+		dev_set_mtu(dev, min(priv->mcast_mtu, dev->mtu));
+		rtnl_unlock();
+		ipoib_flush_paths(dev);
+		rtnl_lock();
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+static struct ipoib_path *__path_find(struct net_device *dev, void *gid)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct rb_node *n = priv->path_tree.rb_node;
+	struct ipoib_path *path;
+	int ret;
+
+	while (n) {
+		path = rb_entry(n, struct ipoib_path, rb_node);
+
+		ret = memcmp(gid, path->pathrec.dgid.raw,
+			     sizeof (union ib_gid));
+
+		if (ret < 0)
+			n = n->rb_left;
+		else if (ret > 0)
+			n = n->rb_right;
+		else
+			return path;
+	}
+
+	return NULL;
+}
+
+static int __path_add(struct net_device *dev, struct ipoib_path *path)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct rb_node **n = &priv->path_tree.rb_node;
+	struct rb_node *pn = NULL;
+	struct ipoib_path *tpath;
+	int ret;
+
+	while (*n) {
+		pn = *n;
+		tpath = rb_entry(pn, struct ipoib_path, rb_node);
+
+		ret = memcmp(path->pathrec.dgid.raw, tpath->pathrec.dgid.raw,
+			     sizeof (union ib_gid));
+		if (ret < 0)
+			n = &pn->rb_left;
+		else if (ret > 0)
+			n = &pn->rb_right;
+		else
+			return -EEXIST;
+	}
+
+	rb_link_node(&path->rb_node, pn, n);
+	rb_insert_color(&path->rb_node, &priv->path_tree);
+
+	list_add_tail(&path->list, &priv->path_list);
+
+	return 0;
+}
+
+static void path_free(struct net_device *dev, struct ipoib_path *path)
+{
+	struct sk_buff *skb;
+
+	while ((skb = __skb_dequeue(&path->queue)))
+		dev_kfree_skb_irq(skb);
+
+	ipoib_dbg(netdev_priv(dev), "path_free\n");
+
+	/* remove all neigh connected to this path */
+	ipoib_del_neighs_by_gid(dev, path->pathrec.dgid.raw);
+
+	if (path->ah)
+		ipoib_put_ah(path->ah);
+
+	kfree(path);
+}
+
+#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
+
+struct ipoib_path_iter *ipoib_path_iter_init(struct net_device *dev)
+{
+	struct ipoib_path_iter *iter;
+
+	iter = kmalloc(sizeof *iter, GFP_KERNEL);
+	if (!iter)
+		return NULL;
+
+	iter->dev = dev;
+	memset(iter->path.pathrec.dgid.raw, 0, 16);
+
+	if (ipoib_path_iter_next(iter)) {
+		kfree(iter);
+		return NULL;
+	}
+
+	return iter;
+}
+
+int ipoib_path_iter_next(struct ipoib_path_iter *iter)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(iter->dev);
+	struct rb_node *n;
+	struct ipoib_path *path;
+	int ret = 1;
+
+	spin_lock_irq(&priv->lock);
+
+	n = rb_first(&priv->path_tree);
+
+	while (n) {
+		path = rb_entry(n, struct ipoib_path, rb_node);
+
+		if (memcmp(iter->path.pathrec.dgid.raw, path->pathrec.dgid.raw,
+			   sizeof (union ib_gid)) < 0) {
+			iter->path = *path;
+			ret = 0;
+			break;
+		}
+
+		n = rb_next(n);
+	}
+
+	spin_unlock_irq(&priv->lock);
+
+	return ret;
+}
+
+void ipoib_path_iter_read(struct ipoib_path_iter *iter,
+			  struct ipoib_path *path)
+{
+	*path = iter->path;
+}
+
+#endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */
+
+void ipoib_mark_paths_invalid(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ipoib_path *path, *tp;
+
+	spin_lock_irq(&priv->lock);
+
+	list_for_each_entry_safe(path, tp, &priv->path_list, list) {
+		ipoib_dbg(priv, "mark path LID 0x%04x GID %pI6 invalid\n",
+			be16_to_cpu(path->pathrec.dlid),
+			path->pathrec.dgid.raw);
+		path->valid =  0;
+	}
+
+	spin_unlock_irq(&priv->lock);
+}
+
+void ipoib_flush_paths(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ipoib_path *path, *tp;
+	LIST_HEAD(remove_list);
+	unsigned long flags;
+
+	netif_tx_lock_bh(dev);
+	spin_lock_irqsave(&priv->lock, flags);
+
+	list_splice_init(&priv->path_list, &remove_list);
+
+	list_for_each_entry(path, &remove_list, list)
+		rb_erase(&path->rb_node, &priv->path_tree);
+
+	list_for_each_entry_safe(path, tp, &remove_list, list) {
+		if (path->query)
+			ib_sa_cancel_query(path->query_id, path->query);
+		spin_unlock_irqrestore(&priv->lock, flags);
+		netif_tx_unlock_bh(dev);
+		wait_for_completion(&path->done);
+		path_free(dev, path);
+		netif_tx_lock_bh(dev);
+		spin_lock_irqsave(&priv->lock, flags);
+	}
+
+	spin_unlock_irqrestore(&priv->lock, flags);
+	netif_tx_unlock_bh(dev);
+}
+
+static void path_rec_completion(int status,
+				struct ib_sa_path_rec *pathrec,
+				void *path_ptr)
+{
+	struct ipoib_path *path = path_ptr;
+	struct net_device *dev = path->dev;
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ipoib_ah *ah = NULL;
+	struct ipoib_ah *old_ah = NULL;
+	struct ipoib_neigh *neigh, *tn;
+	struct sk_buff_head skqueue;
+	struct sk_buff *skb;
+	unsigned long flags;
+
+	if (!status)
+		ipoib_dbg(priv, "PathRec LID 0x%04x for GID %pI6\n",
+			  be16_to_cpu(pathrec->dlid), pathrec->dgid.raw);
+	else
+		ipoib_dbg(priv, "PathRec status %d for GID %pI6\n",
+			  status, path->pathrec.dgid.raw);
+
+	skb_queue_head_init(&skqueue);
+
+	if (!status) {
+		struct ib_ah_attr av;
+
+		if (!ib_init_ah_from_path(priv->ca, priv->port, pathrec, &av))
+			ah = ipoib_create_ah(dev, priv->pd, &av);
+	}
+
+	spin_lock_irqsave(&priv->lock, flags);
+
+	if (!IS_ERR_OR_NULL(ah)) {
+		path->pathrec = *pathrec;
+
+		old_ah   = path->ah;
+		path->ah = ah;
+
+		ipoib_dbg(priv, "created address handle %p for LID 0x%04x, SL %d\n",
+			  ah, be16_to_cpu(pathrec->dlid), pathrec->sl);
+
+		while ((skb = __skb_dequeue(&path->queue)))
+			__skb_queue_tail(&skqueue, skb);
+
+		list_for_each_entry_safe(neigh, tn, &path->neigh_list, list) {
+			if (neigh->ah) {
+				WARN_ON(neigh->ah != old_ah);
+				/*
+				 * Dropping the ah reference inside
+				 * priv->lock is safe here, because we
+				 * will hold one more reference from
+				 * the original value of path->ah (ie
+				 * old_ah).
+				 */
+				ipoib_put_ah(neigh->ah);
+			}
+			kref_get(&path->ah->ref);
+			neigh->ah = path->ah;
+
+			if (ipoib_cm_enabled(dev, neigh->daddr)) {
+				if (!ipoib_cm_get(neigh))
+					ipoib_cm_set(neigh, ipoib_cm_create_tx(dev,
+									       path,
+									       neigh));
+				if (!ipoib_cm_get(neigh)) {
+					ipoib_neigh_free(neigh);
+					continue;
+				}
+			}
+
+			while ((skb = __skb_dequeue(&neigh->queue)))
+				__skb_queue_tail(&skqueue, skb);
+		}
+		path->valid = 1;
+	}
+
+	path->query = NULL;
+	complete(&path->done);
+
+	spin_unlock_irqrestore(&priv->lock, flags);
+
+	if (IS_ERR_OR_NULL(ah))
+		ipoib_del_neighs_by_gid(dev, path->pathrec.dgid.raw);
+
+	if (old_ah)
+		ipoib_put_ah(old_ah);
+
+	while ((skb = __skb_dequeue(&skqueue))) {
+		skb->dev = dev;
+		if (dev_queue_xmit(skb))
+			ipoib_warn(priv, "dev_queue_xmit failed "
+				   "to requeue packet\n");
+	}
+}
+
+static struct ipoib_path *path_rec_create(struct net_device *dev, void *gid)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ipoib_path *path;
+
+	if (!priv->broadcast)
+		return NULL;
+
+	path = kzalloc(sizeof *path, GFP_ATOMIC);
+	if (!path)
+		return NULL;
+
+	path->dev = dev;
+
+	skb_queue_head_init(&path->queue);
+
+	INIT_LIST_HEAD(&path->neigh_list);
+
+	memcpy(path->pathrec.dgid.raw, gid, sizeof (union ib_gid));
+	path->pathrec.sgid	    = priv->local_gid;
+	path->pathrec.pkey	    = cpu_to_be16(priv->pkey);
+	path->pathrec.numb_path     = 1;
+	path->pathrec.traffic_class = priv->broadcast->mcmember.traffic_class;
+
+	return path;
+}
+
+static int path_rec_start(struct net_device *dev,
+			  struct ipoib_path *path)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	ipoib_dbg(priv, "Start path record lookup for %pI6\n",
+		  path->pathrec.dgid.raw);
+
+	init_completion(&path->done);
+
+	path->query_id =
+		ib_sa_path_rec_get(&ipoib_sa_client, priv->ca, priv->port,
+				   &path->pathrec,
+				   IB_SA_PATH_REC_DGID		|
+				   IB_SA_PATH_REC_SGID		|
+				   IB_SA_PATH_REC_NUMB_PATH	|
+				   IB_SA_PATH_REC_TRAFFIC_CLASS |
+				   IB_SA_PATH_REC_PKEY,
+				   1000, GFP_ATOMIC,
+				   path_rec_completion,
+				   path, &path->query);
+	if (path->query_id < 0) {
+		ipoib_warn(priv, "ib_sa_path_rec_get failed: %d\n", path->query_id);
+		path->query = NULL;
+		complete(&path->done);
+		return path->query_id;
+	}
+
+	return 0;
+}
+
+static void neigh_add_path(struct sk_buff *skb, u8 *daddr,
+			   struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ipoib_path *path;
+	struct ipoib_neigh *neigh;
+	unsigned long flags;
+
+	spin_lock_irqsave(&priv->lock, flags);
+	neigh = ipoib_neigh_alloc(daddr, dev);
+	if (!neigh) {
+		spin_unlock_irqrestore(&priv->lock, flags);
+		++dev->stats.tx_dropped;
+		dev_kfree_skb_any(skb);
+		return;
+	}
+
+	path = __path_find(dev, daddr + 4);
+	if (!path) {
+		path = path_rec_create(dev, daddr + 4);
+		if (!path)
+			goto err_path;
+
+		__path_add(dev, path);
+	}
+
+	list_add_tail(&neigh->list, &path->neigh_list);
+
+	if (path->ah) {
+		kref_get(&path->ah->ref);
+		neigh->ah = path->ah;
+
+		if (ipoib_cm_enabled(dev, neigh->daddr)) {
+			if (!ipoib_cm_get(neigh))
+				ipoib_cm_set(neigh, ipoib_cm_create_tx(dev, path, neigh));
+			if (!ipoib_cm_get(neigh)) {
+				ipoib_neigh_free(neigh);
+				goto err_drop;
+			}
+			if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE)
+				__skb_queue_tail(&neigh->queue, skb);
+			else {
+				ipoib_warn(priv, "queue length limit %d. Packet drop.\n",
+					   skb_queue_len(&neigh->queue));
+				goto err_drop;
+			}
+		} else {
+			spin_unlock_irqrestore(&priv->lock, flags);
+			ipoib_send(dev, skb, path->ah, IPOIB_QPN(daddr));
+			ipoib_neigh_put(neigh);
+			return;
+		}
+	} else {
+		neigh->ah  = NULL;
+
+		if (!path->query && path_rec_start(dev, path))
+			goto err_path;
+
+		__skb_queue_tail(&neigh->queue, skb);
+	}
+
+	spin_unlock_irqrestore(&priv->lock, flags);
+	ipoib_neigh_put(neigh);
+	return;
+
+err_path:
+	ipoib_neigh_free(neigh);
+err_drop:
+	++dev->stats.tx_dropped;
+	dev_kfree_skb_any(skb);
+
+	spin_unlock_irqrestore(&priv->lock, flags);
+	ipoib_neigh_put(neigh);
+}
+
+static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,
+			     struct ipoib_cb *cb)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ipoib_path *path;
+	unsigned long flags;
+
+	spin_lock_irqsave(&priv->lock, flags);
+
+	path = __path_find(dev, cb->hwaddr + 4);
+	if (!path || !path->valid) {
+		int new_path = 0;
+
+		if (!path) {
+			path = path_rec_create(dev, cb->hwaddr + 4);
+			new_path = 1;
+		}
+		if (path) {
+			__skb_queue_tail(&path->queue, skb);
+
+			if (!path->query && path_rec_start(dev, path)) {
+				spin_unlock_irqrestore(&priv->lock, flags);
+				if (new_path)
+					path_free(dev, path);
+				return;
+			} else
+				__path_add(dev, path);
+		} else {
+			++dev->stats.tx_dropped;
+			dev_kfree_skb_any(skb);
+		}
+
+		spin_unlock_irqrestore(&priv->lock, flags);
+		return;
+	}
+
+	if (path->ah) {
+		ipoib_dbg(priv, "Send unicast ARP to %04x\n",
+			  be16_to_cpu(path->pathrec.dlid));
+
+		spin_unlock_irqrestore(&priv->lock, flags);
+		ipoib_send(dev, skb, path->ah, IPOIB_QPN(cb->hwaddr));
+		return;
+	} else if ((path->query || !path_rec_start(dev, path)) &&
+		   skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
+		__skb_queue_tail(&path->queue, skb);
+	} else {
+		++dev->stats.tx_dropped;
+		dev_kfree_skb_any(skb);
+	}
+
+	spin_unlock_irqrestore(&priv->lock, flags);
+}
+
+static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ipoib_neigh *neigh;
+	struct ipoib_cb *cb = ipoib_skb_cb(skb);
+	struct ipoib_header *header;
+	unsigned long flags;
+
+	header = (struct ipoib_header *) skb->data;
+
+	if (unlikely(cb->hwaddr[4] == 0xff)) {
+		/* multicast, arrange "if" according to probability */
+		if ((header->proto != htons(ETH_P_IP)) &&
+		    (header->proto != htons(ETH_P_IPV6)) &&
+		    (header->proto != htons(ETH_P_ARP)) &&
+		    (header->proto != htons(ETH_P_RARP)) &&
+		    (header->proto != htons(ETH_P_TIPC))) {
+			/* ethertype not supported by IPoIB */
+			++dev->stats.tx_dropped;
+			dev_kfree_skb_any(skb);
+			return NETDEV_TX_OK;
+		}
+		/* Add in the P_Key for multicast*/
+		cb->hwaddr[8] = (priv->pkey >> 8) & 0xff;
+		cb->hwaddr[9] = priv->pkey & 0xff;
+
+		neigh = ipoib_neigh_get(dev, cb->hwaddr);
+		if (likely(neigh))
+			goto send_using_neigh;
+		ipoib_mcast_send(dev, cb->hwaddr, skb);
+		return NETDEV_TX_OK;
+	}
+
+	/* unicast, arrange "switch" according to probability */
+	switch (header->proto) {
+	case htons(ETH_P_IP):
+	case htons(ETH_P_IPV6):
+	case htons(ETH_P_TIPC):
+		neigh = ipoib_neigh_get(dev, cb->hwaddr);
+		if (unlikely(!neigh)) {
+			neigh_add_path(skb, cb->hwaddr, dev);
+			return NETDEV_TX_OK;
+		}
+		break;
+	case htons(ETH_P_ARP):
+	case htons(ETH_P_RARP):
+		/* for unicast ARP and RARP should always perform path find */
+		unicast_arp_send(skb, dev, cb);
+		return NETDEV_TX_OK;
+	default:
+		/* ethertype not supported by IPoIB */
+		++dev->stats.tx_dropped;
+		dev_kfree_skb_any(skb);
+		return NETDEV_TX_OK;
+	}
+
+send_using_neigh:
+	/* note we now hold a ref to neigh */
+	if (ipoib_cm_get(neigh)) {
+		if (ipoib_cm_up(neigh)) {
+			ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
+			goto unref;
+		}
+	} else if (neigh->ah) {
+		ipoib_send(dev, skb, neigh->ah, IPOIB_QPN(cb->hwaddr));
+		goto unref;
+	}
+
+	if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
+		spin_lock_irqsave(&priv->lock, flags);
+		__skb_queue_tail(&neigh->queue, skb);
+		spin_unlock_irqrestore(&priv->lock, flags);
+	} else {
+		++dev->stats.tx_dropped;
+		dev_kfree_skb_any(skb);
+	}
+
+unref:
+	ipoib_neigh_put(neigh);
+
+	return NETDEV_TX_OK;
+}
+
+static void ipoib_timeout(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	ipoib_warn(priv, "transmit timeout: latency %d msecs\n",
+		   jiffies_to_msecs(jiffies - dev->trans_start));
+	ipoib_warn(priv, "queue stopped %d, tx_head %u, tx_tail %u\n",
+		   netif_queue_stopped(dev),
+		   priv->tx_head, priv->tx_tail);
+	/* XXX reset QP, etc. */
+}
+
+static int ipoib_hard_header(struct sk_buff *skb,
+			     struct net_device *dev,
+			     unsigned short type,
+			     const void *daddr, const void *saddr, unsigned len)
+{
+	struct ipoib_header *header;
+	struct ipoib_cb *cb = ipoib_skb_cb(skb);
+
+	header = (struct ipoib_header *) skb_push(skb, sizeof *header);
+
+	header->proto = htons(type);
+	header->reserved = 0;
+
+	/*
+	 * we don't rely on dst_entry structure,  always stuff the
+	 * destination address into skb->cb so we can figure out where
+	 * to send the packet later.
+	 */
+	memcpy(cb->hwaddr, daddr, INFINIBAND_ALEN);
+
+	return sizeof *header;
+}
+
+static void ipoib_set_mcast_list(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) {
+		ipoib_dbg(priv, "IPOIB_FLAG_OPER_UP not set");
+		return;
+	}
+
+	queue_work(ipoib_workqueue, &priv->restart_task);
+}
+
+static u32 ipoib_addr_hash(struct ipoib_neigh_hash *htbl, u8 *daddr)
+{
+	/*
+	 * Use only the address parts that contributes to spreading
+	 * The subnet prefix is not used as one can not connect to
+	 * same remote port (GUID) using the same remote QPN via two
+	 * different subnets.
+	 */
+	 /* qpn octets[1:4) & port GUID octets[12:20) */
+	u32 *d32 = (u32 *) daddr;
+	u32 hv;
+
+	hv = jhash_3words(d32[3], d32[4], IPOIB_QPN_MASK & d32[0], 0);
+	return hv & htbl->mask;
+}
+
+struct ipoib_neigh *ipoib_neigh_get(struct net_device *dev, u8 *daddr)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ipoib_neigh_table *ntbl = &priv->ntbl;
+	struct ipoib_neigh_hash *htbl;
+	struct ipoib_neigh *neigh = NULL;
+	u32 hash_val;
+
+	rcu_read_lock_bh();
+
+	htbl = rcu_dereference_bh(ntbl->htbl);
+
+	if (!htbl)
+		goto out_unlock;
+
+	hash_val = ipoib_addr_hash(htbl, daddr);
+	for (neigh = rcu_dereference_bh(htbl->buckets[hash_val]);
+	     neigh != NULL;
+	     neigh = rcu_dereference_bh(neigh->hnext)) {
+		if (memcmp(daddr, neigh->daddr, INFINIBAND_ALEN) == 0) {
+			/* found, take one ref on behalf of the caller */
+			if (!atomic_inc_not_zero(&neigh->refcnt)) {
+				/* deleted */
+				neigh = NULL;
+				goto out_unlock;
+			}
+			neigh->alive = jiffies;
+			goto out_unlock;
+		}
+	}
+
+out_unlock:
+	rcu_read_unlock_bh();
+	return neigh;
+}
+
+static void __ipoib_reap_neigh(struct ipoib_dev_priv *priv)
+{
+	struct ipoib_neigh_table *ntbl = &priv->ntbl;
+	struct ipoib_neigh_hash *htbl;
+	unsigned long neigh_obsolete;
+	unsigned long dt;
+	unsigned long flags;
+	int i;
+
+	if (test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags))
+		return;
+
+	spin_lock_irqsave(&priv->lock, flags);
+
+	htbl = rcu_dereference_protected(ntbl->htbl,
+					 lockdep_is_held(&priv->lock));
+
+	if (!htbl)
+		goto out_unlock;
+
+	/* neigh is obsolete if it was idle for two GC periods */
+	dt = 2 * arp_tbl.gc_interval;
+	neigh_obsolete = jiffies - dt;
+	/* handle possible race condition */
+	if (test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags))
+		goto out_unlock;
+
+	for (i = 0; i < htbl->size; i++) {
+		struct ipoib_neigh *neigh;
+		struct ipoib_neigh __rcu **np = &htbl->buckets[i];
+
+		while ((neigh = rcu_dereference_protected(*np,
+							  lockdep_is_held(&priv->lock))) != NULL) {
+			/* was the neigh idle for two GC periods */
+			if (time_after(neigh_obsolete, neigh->alive)) {
+				rcu_assign_pointer(*np,
+						   rcu_dereference_protected(neigh->hnext,
+									     lockdep_is_held(&priv->lock)));
+				/* remove from path/mc list */
+				list_del(&neigh->list);
+				call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
+			} else {
+				np = &neigh->hnext;
+			}
+
+		}
+	}
+
+out_unlock:
+	spin_unlock_irqrestore(&priv->lock, flags);
+}
+
+static void ipoib_reap_neigh(struct work_struct *work)
+{
+	struct ipoib_dev_priv *priv =
+		container_of(work, struct ipoib_dev_priv, neigh_reap_task.work);
+
+	__ipoib_reap_neigh(priv);
+
+	if (!test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags))
+		queue_delayed_work(ipoib_workqueue, &priv->neigh_reap_task,
+				   arp_tbl.gc_interval);
+}
+
+
+static struct ipoib_neigh *ipoib_neigh_ctor(u8 *daddr,
+				      struct net_device *dev)
+{
+	struct ipoib_neigh *neigh;
+
+	neigh = kzalloc(sizeof *neigh, GFP_ATOMIC);
+	if (!neigh)
+		return NULL;
+
+	neigh->dev = dev;
+	memcpy(&neigh->daddr, daddr, sizeof(neigh->daddr));
+	skb_queue_head_init(&neigh->queue);
+	INIT_LIST_HEAD(&neigh->list);
+	ipoib_cm_set(neigh, NULL);
+	/* one ref on behalf of the caller */
+	atomic_set(&neigh->refcnt, 1);
+
+	return neigh;
+}
+
+struct ipoib_neigh *ipoib_neigh_alloc(u8 *daddr,
+				      struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ipoib_neigh_table *ntbl = &priv->ntbl;
+	struct ipoib_neigh_hash *htbl;
+	struct ipoib_neigh *neigh;
+	u32 hash_val;
+
+	htbl = rcu_dereference_protected(ntbl->htbl,
+					 lockdep_is_held(&priv->lock));
+	if (!htbl) {
+		neigh = NULL;
+		goto out_unlock;
+	}
+
+	/* need to add a new neigh, but maybe some other thread succeeded?
+	 * recalc hash, maybe hash resize took place so we do a search
+	 */
+	hash_val = ipoib_addr_hash(htbl, daddr);
+	for (neigh = rcu_dereference_protected(htbl->buckets[hash_val],
+					       lockdep_is_held(&priv->lock));
+	     neigh != NULL;
+	     neigh = rcu_dereference_protected(neigh->hnext,
+					       lockdep_is_held(&priv->lock))) {
+		if (memcmp(daddr, neigh->daddr, INFINIBAND_ALEN) == 0) {
+			/* found, take one ref on behalf of the caller */
+			if (!atomic_inc_not_zero(&neigh->refcnt)) {
+				/* deleted */
+				neigh = NULL;
+				break;
+			}
+			neigh->alive = jiffies;
+			goto out_unlock;
+		}
+	}
+
+	neigh = ipoib_neigh_ctor(daddr, dev);
+	if (!neigh)
+		goto out_unlock;
+
+	/* one ref on behalf of the hash table */
+	atomic_inc(&neigh->refcnt);
+	neigh->alive = jiffies;
+	/* put in hash */
+	rcu_assign_pointer(neigh->hnext,
+			   rcu_dereference_protected(htbl->buckets[hash_val],
+						     lockdep_is_held(&priv->lock)));
+	rcu_assign_pointer(htbl->buckets[hash_val], neigh);
+	atomic_inc(&ntbl->entries);
+
+out_unlock:
+
+	return neigh;
+}
+
+void ipoib_neigh_dtor(struct ipoib_neigh *neigh)
+{
+	/* neigh reference count was dropprd to zero */
+	struct net_device *dev = neigh->dev;
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct sk_buff *skb;
+	if (neigh->ah)
+		ipoib_put_ah(neigh->ah);
+	while ((skb = __skb_dequeue(&neigh->queue))) {
+		++dev->stats.tx_dropped;
+		dev_kfree_skb_any(skb);
+	}
+	if (ipoib_cm_get(neigh))
+		ipoib_cm_destroy_tx(ipoib_cm_get(neigh));
+	ipoib_dbg(netdev_priv(dev),
+		  "neigh free for %06x %pI6\n",
+		  IPOIB_QPN(neigh->daddr),
+		  neigh->daddr + 4);
+	kfree(neigh);
+	if (atomic_dec_and_test(&priv->ntbl.entries)) {
+		if (test_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags))
+			complete(&priv->ntbl.flushed);
+	}
+}
+
+static void ipoib_neigh_reclaim(struct rcu_head *rp)
+{
+	/* Called as a result of removal from hash table */
+	struct ipoib_neigh *neigh = container_of(rp, struct ipoib_neigh, rcu);
+	/* note TX context may hold another ref */
+	ipoib_neigh_put(neigh);
+}
+
+void ipoib_neigh_free(struct ipoib_neigh *neigh)
+{
+	struct net_device *dev = neigh->dev;
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ipoib_neigh_table *ntbl = &priv->ntbl;
+	struct ipoib_neigh_hash *htbl;
+	struct ipoib_neigh __rcu **np;
+	struct ipoib_neigh *n;
+	u32 hash_val;
+
+	htbl = rcu_dereference_protected(ntbl->htbl,
+					lockdep_is_held(&priv->lock));
+	if (!htbl)
+		return;
+
+	hash_val = ipoib_addr_hash(htbl, neigh->daddr);
+	np = &htbl->buckets[hash_val];
+	for (n = rcu_dereference_protected(*np,
+					    lockdep_is_held(&priv->lock));
+	     n != NULL;
+	     n = rcu_dereference_protected(*np,
+					lockdep_is_held(&priv->lock))) {
+		if (n == neigh) {
+			/* found */
+			rcu_assign_pointer(*np,
+					   rcu_dereference_protected(neigh->hnext,
+								     lockdep_is_held(&priv->lock)));
+			/* remove from parent list */
+			list_del(&neigh->list);
+			call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
+			return;
+		} else {
+			np = &n->hnext;
+		}
+	}
+}
+
+static int ipoib_neigh_hash_init(struct ipoib_dev_priv *priv)
+{
+	struct ipoib_neigh_table *ntbl = &priv->ntbl;
+	struct ipoib_neigh_hash *htbl;
+	struct ipoib_neigh **buckets;
+	u32 size;
+
+	clear_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags);
+	ntbl->htbl = NULL;
+	htbl = kzalloc(sizeof(*htbl), GFP_KERNEL);
+	if (!htbl)
+		return -ENOMEM;
+	set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
+	size = roundup_pow_of_two(arp_tbl.gc_thresh3);
+	buckets = kzalloc(size * sizeof(*buckets), GFP_KERNEL);
+	if (!buckets) {
+		kfree(htbl);
+		return -ENOMEM;
+	}
+	htbl->size = size;
+	htbl->mask = (size - 1);
+	htbl->buckets = buckets;
+	ntbl->htbl = htbl;
+	htbl->ntbl = ntbl;
+	atomic_set(&ntbl->entries, 0);
+
+	/* start garbage collection */
+	clear_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
+	queue_delayed_work(ipoib_workqueue, &priv->neigh_reap_task,
+			   arp_tbl.gc_interval);
+
+	return 0;
+}
+
+static void neigh_hash_free_rcu(struct rcu_head *head)
+{
+	struct ipoib_neigh_hash *htbl = container_of(head,
+						    struct ipoib_neigh_hash,
+						    rcu);
+	struct ipoib_neigh __rcu **buckets = htbl->buckets;
+	struct ipoib_neigh_table *ntbl = htbl->ntbl;
+
+	kfree(buckets);
+	kfree(htbl);
+	complete(&ntbl->deleted);
+}
+
+void ipoib_del_neighs_by_gid(struct net_device *dev, u8 *gid)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ipoib_neigh_table *ntbl = &priv->ntbl;
+	struct ipoib_neigh_hash *htbl;
+	unsigned long flags;
+	int i;
+
+	/* remove all neigh connected to a given path or mcast */
+	spin_lock_irqsave(&priv->lock, flags);
+
+	htbl = rcu_dereference_protected(ntbl->htbl,
+					 lockdep_is_held(&priv->lock));
+
+	if (!htbl)
+		goto out_unlock;
+
+	for (i = 0; i < htbl->size; i++) {
+		struct ipoib_neigh *neigh;
+		struct ipoib_neigh __rcu **np = &htbl->buckets[i];
+
+		while ((neigh = rcu_dereference_protected(*np,
+							  lockdep_is_held(&priv->lock))) != NULL) {
+			/* delete neighs belong to this parent */
+			if (!memcmp(gid, neigh->daddr + 4, sizeof (union ib_gid))) {
+				rcu_assign_pointer(*np,
+						   rcu_dereference_protected(neigh->hnext,
+									     lockdep_is_held(&priv->lock)));
+				/* remove from parent list */
+				list_del(&neigh->list);
+				call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
+			} else {
+				np = &neigh->hnext;
+			}
+
+		}
+	}
+out_unlock:
+	spin_unlock_irqrestore(&priv->lock, flags);
+}
+
+static void ipoib_flush_neighs(struct ipoib_dev_priv *priv)
+{
+	struct ipoib_neigh_table *ntbl = &priv->ntbl;
+	struct ipoib_neigh_hash *htbl;
+	unsigned long flags;
+	int i, wait_flushed = 0;
+
+	init_completion(&priv->ntbl.flushed);
+
+	spin_lock_irqsave(&priv->lock, flags);
+
+	htbl = rcu_dereference_protected(ntbl->htbl,
+					lockdep_is_held(&priv->lock));
+	if (!htbl)
+		goto out_unlock;
+
+	wait_flushed = atomic_read(&priv->ntbl.entries);
+	if (!wait_flushed)
+		goto free_htbl;
+
+	for (i = 0; i < htbl->size; i++) {
+		struct ipoib_neigh *neigh;
+		struct ipoib_neigh __rcu **np = &htbl->buckets[i];
+
+		while ((neigh = rcu_dereference_protected(*np,
+				       lockdep_is_held(&priv->lock))) != NULL) {
+			rcu_assign_pointer(*np,
+					   rcu_dereference_protected(neigh->hnext,
+								     lockdep_is_held(&priv->lock)));
+			/* remove from path/mc list */
+			list_del(&neigh->list);
+			call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
+		}
+	}
+
+free_htbl:
+	rcu_assign_pointer(ntbl->htbl, NULL);
+	call_rcu(&htbl->rcu, neigh_hash_free_rcu);
+
+out_unlock:
+	spin_unlock_irqrestore(&priv->lock, flags);
+	if (wait_flushed)
+		wait_for_completion(&priv->ntbl.flushed);
+}
+
+static void ipoib_neigh_hash_uninit(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	int stopped;
+
+	ipoib_dbg(priv, "ipoib_neigh_hash_uninit\n");
+	init_completion(&priv->ntbl.deleted);
+	set_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags);
+
+	/* Stop GC if called at init fail need to cancel work */
+	stopped = test_and_set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
+	if (!stopped)
+		cancel_delayed_work(&priv->neigh_reap_task);
+
+	ipoib_flush_neighs(priv);
+
+	wait_for_completion(&priv->ntbl.deleted);
+}
+
+
+int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	if (ipoib_neigh_hash_init(priv) < 0)
+		goto out;
+	/* Allocate RX/TX "rings" to hold queued skbs */
+	priv->rx_ring =	kzalloc(ipoib_recvq_size * sizeof *priv->rx_ring,
+				GFP_KERNEL);
+	if (!priv->rx_ring) {
+		printk(KERN_WARNING "%s: failed to allocate RX ring (%d entries)\n",
+		       ca->name, ipoib_recvq_size);
+		goto out_neigh_hash_cleanup;
+	}
+
+	priv->tx_ring = vzalloc(ipoib_sendq_size * sizeof *priv->tx_ring);
+	if (!priv->tx_ring) {
+		printk(KERN_WARNING "%s: failed to allocate TX ring (%d entries)\n",
+		       ca->name, ipoib_sendq_size);
+		goto out_rx_ring_cleanup;
+	}
+
+	/* priv->tx_head, tx_tail & tx_outstanding are already 0 */
+
+	if (ipoib_ib_dev_init(dev, ca, port))
+		goto out_tx_ring_cleanup;
+
+	return 0;
+
+out_tx_ring_cleanup:
+	vfree(priv->tx_ring);
+
+out_rx_ring_cleanup:
+	kfree(priv->rx_ring);
+
+out_neigh_hash_cleanup:
+	ipoib_neigh_hash_uninit(dev);
+out:
+	return -ENOMEM;
+}
+
+void ipoib_dev_cleanup(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev), *cpriv, *tcpriv;
+	LIST_HEAD(head);
+
+	ASSERT_RTNL();
+
+	ipoib_delete_debug_files(dev);
+
+	/* Delete any child interfaces first */
+	list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) {
+		/* Stop GC on child */
+		set_bit(IPOIB_STOP_NEIGH_GC, &cpriv->flags);
+		cancel_delayed_work(&cpriv->neigh_reap_task);
+		unregister_netdevice_queue(cpriv->dev, &head);
+	}
+	unregister_netdevice_many(&head);
+
+	ipoib_ib_dev_cleanup(dev);
+
+	kfree(priv->rx_ring);
+	vfree(priv->tx_ring);
+
+	priv->rx_ring = NULL;
+	priv->tx_ring = NULL;
+
+	ipoib_neigh_hash_uninit(dev);
+}
+
+static const struct header_ops ipoib_header_ops = {
+	.create	= ipoib_hard_header,
+};
+
+static const struct net_device_ops ipoib_netdev_ops = {
+	.ndo_uninit		 = ipoib_uninit,
+	.ndo_open		 = ipoib_open,
+	.ndo_stop		 = ipoib_stop,
+	.ndo_change_mtu		 = ipoib_change_mtu,
+	.ndo_fix_features	 = ipoib_fix_features,
+	.ndo_start_xmit	 	 = ipoib_start_xmit,
+	.ndo_tx_timeout		 = ipoib_timeout,
+	.ndo_set_rx_mode	 = ipoib_set_mcast_list,
+};
+
+void ipoib_setup(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	dev->netdev_ops		 = &ipoib_netdev_ops;
+	dev->header_ops		 = &ipoib_header_ops;
+
+	ipoib_set_ethtool_ops(dev);
+
+	netif_napi_add(dev, &priv->napi, ipoib_poll, NAPI_POLL_WEIGHT);
+
+	dev->watchdog_timeo	 = HZ;
+
+	dev->flags		|= IFF_BROADCAST | IFF_MULTICAST;
+
+	dev->hard_header_len	 = IPOIB_ENCAP_LEN;
+	dev->addr_len		 = INFINIBAND_ALEN;
+	dev->type		 = ARPHRD_INFINIBAND;
+	dev->tx_queue_len	 = ipoib_sendq_size * 2;
+	dev->features		 = (NETIF_F_VLAN_CHALLENGED	|
+				    NETIF_F_HIGHDMA);
+	netif_keep_dst(dev);
+
+	memcpy(dev->broadcast, ipv4_bcast_addr, INFINIBAND_ALEN);
+
+	priv->dev = dev;
+
+	spin_lock_init(&priv->lock);
+
+	init_rwsem(&priv->vlan_rwsem);
+
+	INIT_LIST_HEAD(&priv->path_list);
+	INIT_LIST_HEAD(&priv->child_intfs);
+	INIT_LIST_HEAD(&priv->dead_ahs);
+	INIT_LIST_HEAD(&priv->multicast_list);
+
+	INIT_DELAYED_WORK(&priv->mcast_task,   ipoib_mcast_join_task);
+	INIT_WORK(&priv->carrier_on_task, ipoib_mcast_carrier_on_task);
+	INIT_WORK(&priv->flush_light,   ipoib_ib_dev_flush_light);
+	INIT_WORK(&priv->flush_normal,   ipoib_ib_dev_flush_normal);
+	INIT_WORK(&priv->flush_heavy,   ipoib_ib_dev_flush_heavy);
+	INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task);
+	INIT_DELAYED_WORK(&priv->ah_reap_task, ipoib_reap_ah);
+	INIT_DELAYED_WORK(&priv->neigh_reap_task, ipoib_reap_neigh);
+}
+
+struct ipoib_dev_priv *ipoib_intf_alloc(const char *name)
+{
+	struct net_device *dev;
+
+	dev = alloc_netdev((int)sizeof(struct ipoib_dev_priv), name,
+			   NET_NAME_UNKNOWN, ipoib_setup);
+	if (!dev)
+		return NULL;
+
+	return netdev_priv(dev);
+}
+
+static ssize_t show_pkey(struct device *dev,
+			 struct device_attribute *attr, char *buf)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(dev));
+
+	return sprintf(buf, "0x%04x\n", priv->pkey);
+}
+static DEVICE_ATTR(pkey, S_IRUGO, show_pkey, NULL);
+
+static ssize_t show_umcast(struct device *dev,
+			   struct device_attribute *attr, char *buf)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(dev));
+
+	return sprintf(buf, "%d\n", test_bit(IPOIB_FLAG_UMCAST, &priv->flags));
+}
+
+void ipoib_set_umcast(struct net_device *ndev, int umcast_val)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(ndev);
+
+	if (umcast_val > 0) {
+		set_bit(IPOIB_FLAG_UMCAST, &priv->flags);
+		ipoib_warn(priv, "ignoring multicast groups joined directly "
+				"by userspace\n");
+	} else
+		clear_bit(IPOIB_FLAG_UMCAST, &priv->flags);
+}
+
+static ssize_t set_umcast(struct device *dev,
+			  struct device_attribute *attr,
+			  const char *buf, size_t count)
+{
+	unsigned long umcast_val = simple_strtoul(buf, NULL, 0);
+
+	ipoib_set_umcast(to_net_dev(dev), umcast_val);
+
+	return count;
+}
+static DEVICE_ATTR(umcast, S_IWUSR | S_IRUGO, show_umcast, set_umcast);
+
+int ipoib_add_umcast_attr(struct net_device *dev)
+{
+	return device_create_file(&dev->dev, &dev_attr_umcast);
+}
+
+static ssize_t create_child(struct device *dev,
+			    struct device_attribute *attr,
+			    const char *buf, size_t count)
+{
+	int pkey;
+	int ret;
+
+	if (sscanf(buf, "%i", &pkey) != 1)
+		return -EINVAL;
+
+	if (pkey <= 0 || pkey > 0xffff || pkey == 0x8000)
+		return -EINVAL;
+
+	/*
+	 * Set the full membership bit, so that we join the right
+	 * broadcast group, etc.
+	 */
+	pkey |= 0x8000;
+
+	ret = ipoib_vlan_add(to_net_dev(dev), pkey);
+
+	return ret ? ret : count;
+}
+static DEVICE_ATTR(create_child, S_IWUSR, NULL, create_child);
+
+static ssize_t delete_child(struct device *dev,
+			    struct device_attribute *attr,
+			    const char *buf, size_t count)
+{
+	int pkey;
+	int ret;
+
+	if (sscanf(buf, "%i", &pkey) != 1)
+		return -EINVAL;
+
+	if (pkey < 0 || pkey > 0xffff)
+		return -EINVAL;
+
+	ret = ipoib_vlan_delete(to_net_dev(dev), pkey);
+
+	return ret ? ret : count;
+
+}
+static DEVICE_ATTR(delete_child, S_IWUSR, NULL, delete_child);
+
+int ipoib_add_pkey_attr(struct net_device *dev)
+{
+	return device_create_file(&dev->dev, &dev_attr_pkey);
+}
+
+int ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca)
+{
+	struct ib_device_attr *device_attr;
+	int result = -ENOMEM;
+
+	device_attr = kmalloc(sizeof *device_attr, GFP_KERNEL);
+	if (!device_attr) {
+		printk(KERN_WARNING "%s: allocation of %zu bytes failed\n",
+		       hca->name, sizeof *device_attr);
+		return result;
+	}
+
+	result = ib_query_device(hca, device_attr);
+	if (result) {
+		printk(KERN_WARNING "%s: ib_query_device failed (ret = %d)\n",
+		       hca->name, result);
+		kfree(device_attr);
+		return result;
+	}
+	priv->hca_caps = device_attr->device_cap_flags;
+
+	kfree(device_attr);
+
+	if (priv->hca_caps & IB_DEVICE_UD_IP_CSUM) {
+		priv->dev->hw_features = NETIF_F_SG |
+			NETIF_F_IP_CSUM | NETIF_F_RXCSUM;
+
+		if (priv->hca_caps & IB_DEVICE_UD_TSO)
+			priv->dev->hw_features |= NETIF_F_TSO;
+
+		priv->dev->features |= priv->dev->hw_features;
+	}
+
+	return 0;
+}
+
+static struct net_device *ipoib_add_port(const char *format,
+					 struct ib_device *hca, u8 port)
+{
+	struct ipoib_dev_priv *priv;
+	struct ib_port_attr attr;
+	int result = -ENOMEM;
+
+	priv = ipoib_intf_alloc(format);
+	if (!priv)
+		goto alloc_mem_failed;
+
+	SET_NETDEV_DEV(priv->dev, hca->dma_device);
+	priv->dev->dev_id = port - 1;
+
+	if (!ib_query_port(hca, port, &attr))
+		priv->max_ib_mtu = ib_mtu_enum_to_int(attr.max_mtu);
+	else {
+		printk(KERN_WARNING "%s: ib_query_port %d failed\n",
+		       hca->name, port);
+		goto device_init_failed;
+	}
+
+	/* MTU will be reset when mcast join happens */
+	priv->dev->mtu  = IPOIB_UD_MTU(priv->max_ib_mtu);
+	priv->mcast_mtu  = priv->admin_mtu = priv->dev->mtu;
+
+	priv->dev->neigh_priv_len = sizeof(struct ipoib_neigh);
+
+	result = ib_query_pkey(hca, port, 0, &priv->pkey);
+	if (result) {
+		printk(KERN_WARNING "%s: ib_query_pkey port %d failed (ret = %d)\n",
+		       hca->name, port, result);
+		goto device_init_failed;
+	}
+
+	if (ipoib_set_dev_features(priv, hca))
+		goto device_init_failed;
+
+	/*
+	 * Set the full membership bit, so that we join the right
+	 * broadcast group, etc.
+	 */
+	priv->pkey |= 0x8000;
+
+	priv->dev->broadcast[8] = priv->pkey >> 8;
+	priv->dev->broadcast[9] = priv->pkey & 0xff;
+
+	result = ib_query_gid(hca, port, 0, &priv->local_gid);
+	if (result) {
+		printk(KERN_WARNING "%s: ib_query_gid port %d failed (ret = %d)\n",
+		       hca->name, port, result);
+		goto device_init_failed;
+	} else
+		memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid));
+
+	result = ipoib_dev_init(priv->dev, hca, port);
+	if (result < 0) {
+		printk(KERN_WARNING "%s: failed to initialize port %d (ret = %d)\n",
+		       hca->name, port, result);
+		goto device_init_failed;
+	}
+
+	INIT_IB_EVENT_HANDLER(&priv->event_handler,
+			      priv->ca, ipoib_event);
+	result = ib_register_event_handler(&priv->event_handler);
+	if (result < 0) {
+		printk(KERN_WARNING "%s: ib_register_event_handler failed for "
+		       "port %d (ret = %d)\n",
+		       hca->name, port, result);
+		goto event_failed;
+	}
+
+	result = register_netdev(priv->dev);
+	if (result) {
+		printk(KERN_WARNING "%s: couldn't register ipoib port %d; error %d\n",
+		       hca->name, port, result);
+		goto register_failed;
+	}
+
+	ipoib_create_debug_files(priv->dev);
+
+	if (ipoib_cm_add_mode_attr(priv->dev))
+		goto sysfs_failed;
+	if (ipoib_add_pkey_attr(priv->dev))
+		goto sysfs_failed;
+	if (ipoib_add_umcast_attr(priv->dev))
+		goto sysfs_failed;
+	if (device_create_file(&priv->dev->dev, &dev_attr_create_child))
+		goto sysfs_failed;
+	if (device_create_file(&priv->dev->dev, &dev_attr_delete_child))
+		goto sysfs_failed;
+
+	return priv->dev;
+
+sysfs_failed:
+	ipoib_delete_debug_files(priv->dev);
+	unregister_netdev(priv->dev);
+
+register_failed:
+	ib_unregister_event_handler(&priv->event_handler);
+	/* Stop GC if started before flush */
+	set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
+	cancel_delayed_work(&priv->neigh_reap_task);
+	flush_workqueue(ipoib_workqueue);
+
+event_failed:
+	ipoib_dev_cleanup(priv->dev);
+
+device_init_failed:
+	free_netdev(priv->dev);
+
+alloc_mem_failed:
+	return ERR_PTR(result);
+}
+
+static void ipoib_add_one(struct ib_device *device)
+{
+	struct list_head *dev_list;
+	struct net_device *dev;
+	struct ipoib_dev_priv *priv;
+	int s, e, p;
+
+	if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
+		return;
+
+	dev_list = kmalloc(sizeof *dev_list, GFP_KERNEL);
+	if (!dev_list)
+		return;
+
+	INIT_LIST_HEAD(dev_list);
+
+	if (device->node_type == RDMA_NODE_IB_SWITCH) {
+		s = 0;
+		e = 0;
+	} else {
+		s = 1;
+		e = device->phys_port_cnt;
+	}
+
+	for (p = s; p <= e; ++p) {
+		if (rdma_port_get_link_layer(device, p) != IB_LINK_LAYER_INFINIBAND)
+			continue;
+		dev = ipoib_add_port("ib%d", device, p);
+		if (!IS_ERR(dev)) {
+			priv = netdev_priv(dev);
+			list_add_tail(&priv->list, dev_list);
+		}
+	}
+
+	ib_set_client_data(device, &ipoib_client, dev_list);
+}
+
+static void ipoib_remove_one(struct ib_device *device)
+{
+	struct ipoib_dev_priv *priv, *tmp;
+	struct list_head *dev_list;
+
+	if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
+		return;
+
+	dev_list = ib_get_client_data(device, &ipoib_client);
+	if (!dev_list)
+		return;
+
+	list_for_each_entry_safe(priv, tmp, dev_list, list) {
+		ib_unregister_event_handler(&priv->event_handler);
+
+		rtnl_lock();
+		dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP);
+		rtnl_unlock();
+
+		/* Stop GC */
+		set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
+		cancel_delayed_work(&priv->neigh_reap_task);
+		flush_workqueue(ipoib_workqueue);
+
+		unregister_netdev(priv->dev);
+		free_netdev(priv->dev);
+	}
+
+	kfree(dev_list);
+}
+
+static int __init ipoib_init_module(void)
+{
+	int ret;
+
+	ipoib_recvq_size = roundup_pow_of_two(ipoib_recvq_size);
+	ipoib_recvq_size = min(ipoib_recvq_size, IPOIB_MAX_QUEUE_SIZE);
+	ipoib_recvq_size = max(ipoib_recvq_size, IPOIB_MIN_QUEUE_SIZE);
+
+	ipoib_sendq_size = roundup_pow_of_two(ipoib_sendq_size);
+	ipoib_sendq_size = min(ipoib_sendq_size, IPOIB_MAX_QUEUE_SIZE);
+	ipoib_sendq_size = max3(ipoib_sendq_size, 2 * MAX_SEND_CQE, IPOIB_MIN_QUEUE_SIZE);
+#ifdef CONFIG_INFINIBAND_IPOIB_CM
+	ipoib_max_conn_qp = min(ipoib_max_conn_qp, IPOIB_CM_MAX_CONN_QP);
+#endif
+
+	/*
+	 * When copying small received packets, we only copy from the
+	 * linear data part of the SKB, so we rely on this condition.
+	 */
+	BUILD_BUG_ON(IPOIB_CM_COPYBREAK > IPOIB_CM_HEAD_SIZE);
+
+	ret = ipoib_register_debugfs();
+	if (ret)
+		return ret;
+
+	/*
+	 * We create our own workqueue mainly because we want to be
+	 * able to flush it when devices are being removed.  We can't
+	 * use schedule_work()/flush_scheduled_work() because both
+	 * unregister_netdev() and linkwatch_event take the rtnl lock,
+	 * so flush_scheduled_work() can deadlock during device
+	 * removal.
+	 */
+	ipoib_workqueue = create_singlethread_workqueue("ipoib");
+	if (!ipoib_workqueue) {
+		ret = -ENOMEM;
+		goto err_fs;
+	}
+
+	ib_sa_register_client(&ipoib_sa_client);
+
+	ret = ib_register_client(&ipoib_client);
+	if (ret)
+		goto err_sa;
+
+	ret = ipoib_netlink_init();
+	if (ret)
+		goto err_client;
+
+	return 0;
+
+err_client:
+	ib_unregister_client(&ipoib_client);
+
+err_sa:
+	ib_sa_unregister_client(&ipoib_sa_client);
+	destroy_workqueue(ipoib_workqueue);
+
+err_fs:
+	ipoib_unregister_debugfs();
+
+	return ret;
+}
+
+static void __exit ipoib_cleanup_module(void)
+{
+	ipoib_netlink_fini();
+	ib_unregister_client(&ipoib_client);
+	ib_sa_unregister_client(&ipoib_sa_client);
+	ipoib_unregister_debugfs();
+	destroy_workqueue(ipoib_workqueue);
+}
+
+module_init(ipoib_init_module);
+module_exit(ipoib_cleanup_module);
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
new file mode 100644
index 0000000..ffb83b5
--- /dev/null
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -0,0 +1,963 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/moduleparam.h>
+#include <linux/ip.h>
+#include <linux/in.h>
+#include <linux/igmp.h>
+#include <linux/inetdevice.h>
+#include <linux/delay.h>
+#include <linux/completion.h>
+#include <linux/slab.h>
+
+#include <net/dst.h>
+
+#include "ipoib.h"
+
+#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
+static int mcast_debug_level;
+
+module_param(mcast_debug_level, int, 0644);
+MODULE_PARM_DESC(mcast_debug_level,
+		 "Enable multicast debug tracing if > 0");
+#endif
+
+static DEFINE_MUTEX(mcast_mutex);
+
+struct ipoib_mcast_iter {
+	struct net_device *dev;
+	union ib_gid       mgid;
+	unsigned long      created;
+	unsigned int       queuelen;
+	unsigned int       complete;
+	unsigned int       send_only;
+};
+
+static void ipoib_mcast_free(struct ipoib_mcast *mcast)
+{
+	struct net_device *dev = mcast->dev;
+	int tx_dropped = 0;
+
+	ipoib_dbg_mcast(netdev_priv(dev), "deleting multicast group %pI6\n",
+			mcast->mcmember.mgid.raw);
+
+	/* remove all neigh connected to this mcast */
+	ipoib_del_neighs_by_gid(dev, mcast->mcmember.mgid.raw);
+
+	if (mcast->ah)
+		ipoib_put_ah(mcast->ah);
+
+	while (!skb_queue_empty(&mcast->pkt_queue)) {
+		++tx_dropped;
+		dev_kfree_skb_any(skb_dequeue(&mcast->pkt_queue));
+	}
+
+	netif_tx_lock_bh(dev);
+	dev->stats.tx_dropped += tx_dropped;
+	netif_tx_unlock_bh(dev);
+
+	kfree(mcast);
+}
+
+static struct ipoib_mcast *ipoib_mcast_alloc(struct net_device *dev,
+					     int can_sleep)
+{
+	struct ipoib_mcast *mcast;
+
+	mcast = kzalloc(sizeof *mcast, can_sleep ? GFP_KERNEL : GFP_ATOMIC);
+	if (!mcast)
+		return NULL;
+
+	mcast->dev = dev;
+	mcast->created = jiffies;
+	mcast->backoff = 1;
+
+	INIT_LIST_HEAD(&mcast->list);
+	INIT_LIST_HEAD(&mcast->neigh_list);
+	skb_queue_head_init(&mcast->pkt_queue);
+
+	return mcast;
+}
+
+static struct ipoib_mcast *__ipoib_mcast_find(struct net_device *dev, void *mgid)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct rb_node *n = priv->multicast_tree.rb_node;
+
+	while (n) {
+		struct ipoib_mcast *mcast;
+		int ret;
+
+		mcast = rb_entry(n, struct ipoib_mcast, rb_node);
+
+		ret = memcmp(mgid, mcast->mcmember.mgid.raw,
+			     sizeof (union ib_gid));
+		if (ret < 0)
+			n = n->rb_left;
+		else if (ret > 0)
+			n = n->rb_right;
+		else
+			return mcast;
+	}
+
+	return NULL;
+}
+
+static int __ipoib_mcast_add(struct net_device *dev, struct ipoib_mcast *mcast)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct rb_node **n = &priv->multicast_tree.rb_node, *pn = NULL;
+
+	while (*n) {
+		struct ipoib_mcast *tmcast;
+		int ret;
+
+		pn = *n;
+		tmcast = rb_entry(pn, struct ipoib_mcast, rb_node);
+
+		ret = memcmp(mcast->mcmember.mgid.raw, tmcast->mcmember.mgid.raw,
+			     sizeof (union ib_gid));
+		if (ret < 0)
+			n = &pn->rb_left;
+		else if (ret > 0)
+			n = &pn->rb_right;
+		else
+			return -EEXIST;
+	}
+
+	rb_link_node(&mcast->rb_node, pn, n);
+	rb_insert_color(&mcast->rb_node, &priv->multicast_tree);
+
+	return 0;
+}
+
+static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast,
+				   struct ib_sa_mcmember_rec *mcmember)
+{
+	struct net_device *dev = mcast->dev;
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ipoib_ah *ah;
+	int ret;
+	int set_qkey = 0;
+
+	mcast->mcmember = *mcmember;
+
+	/* Set the multicast MTU and cached Q_Key before we attach if it's
+	 * the broadcast group.
+	 */
+	if (!memcmp(mcast->mcmember.mgid.raw, priv->dev->broadcast + 4,
+		    sizeof (union ib_gid))) {
+		spin_lock_irq(&priv->lock);
+		if (!priv->broadcast) {
+			spin_unlock_irq(&priv->lock);
+			return -EAGAIN;
+		}
+		priv->mcast_mtu = IPOIB_UD_MTU(ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu));
+		priv->qkey = be32_to_cpu(priv->broadcast->mcmember.qkey);
+		spin_unlock_irq(&priv->lock);
+		priv->tx_wr.wr.ud.remote_qkey = priv->qkey;
+		set_qkey = 1;
+
+		if (!ipoib_cm_admin_enabled(dev)) {
+			rtnl_lock();
+			dev_set_mtu(dev, min(priv->mcast_mtu, priv->admin_mtu));
+			rtnl_unlock();
+		}
+	}
+
+	if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) {
+		if (test_and_set_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) {
+			ipoib_warn(priv, "multicast group %pI6 already attached\n",
+				   mcast->mcmember.mgid.raw);
+
+			return 0;
+		}
+
+		ret = ipoib_mcast_attach(dev, be16_to_cpu(mcast->mcmember.mlid),
+					 &mcast->mcmember.mgid, set_qkey);
+		if (ret < 0) {
+			ipoib_warn(priv, "couldn't attach QP to multicast group %pI6\n",
+				   mcast->mcmember.mgid.raw);
+
+			clear_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags);
+			return ret;
+		}
+	}
+
+	{
+		struct ib_ah_attr av = {
+			.dlid	       = be16_to_cpu(mcast->mcmember.mlid),
+			.port_num      = priv->port,
+			.sl	       = mcast->mcmember.sl,
+			.ah_flags      = IB_AH_GRH,
+			.static_rate   = mcast->mcmember.rate,
+			.grh	       = {
+				.flow_label    = be32_to_cpu(mcast->mcmember.flow_label),
+				.hop_limit     = mcast->mcmember.hop_limit,
+				.sgid_index    = 0,
+				.traffic_class = mcast->mcmember.traffic_class
+			}
+		};
+		av.grh.dgid = mcast->mcmember.mgid;
+
+		ah = ipoib_create_ah(dev, priv->pd, &av);
+		if (IS_ERR(ah)) {
+			ipoib_warn(priv, "ib_address_create failed %ld\n",
+				-PTR_ERR(ah));
+			/* use original error */
+			return PTR_ERR(ah);
+		} else {
+			spin_lock_irq(&priv->lock);
+			mcast->ah = ah;
+			spin_unlock_irq(&priv->lock);
+
+			ipoib_dbg_mcast(priv, "MGID %pI6 AV %p, LID 0x%04x, SL %d\n",
+					mcast->mcmember.mgid.raw,
+					mcast->ah->ah,
+					be16_to_cpu(mcast->mcmember.mlid),
+					mcast->mcmember.sl);
+		}
+	}
+
+	/* actually send any queued packets */
+	netif_tx_lock_bh(dev);
+	while (!skb_queue_empty(&mcast->pkt_queue)) {
+		struct sk_buff *skb = skb_dequeue(&mcast->pkt_queue);
+
+		netif_tx_unlock_bh(dev);
+
+		skb->dev = dev;
+		if (dev_queue_xmit(skb))
+			ipoib_warn(priv, "dev_queue_xmit failed to requeue packet\n");
+
+		netif_tx_lock_bh(dev);
+	}
+	netif_tx_unlock_bh(dev);
+
+	return 0;
+}
+
+static int
+ipoib_mcast_sendonly_join_complete(int status,
+				   struct ib_sa_multicast *multicast)
+{
+	struct ipoib_mcast *mcast = multicast->context;
+	struct net_device *dev = mcast->dev;
+
+	/* We trap for port events ourselves. */
+	if (status == -ENETRESET)
+		return 0;
+
+	if (!status)
+		status = ipoib_mcast_join_finish(mcast, &multicast->rec);
+
+	if (status) {
+		if (mcast->logcount++ < 20)
+			ipoib_dbg_mcast(netdev_priv(dev), "multicast join failed for %pI6, status %d\n",
+					mcast->mcmember.mgid.raw, status);
+
+		/* Flush out any queued packets */
+		netif_tx_lock_bh(dev);
+		while (!skb_queue_empty(&mcast->pkt_queue)) {
+			++dev->stats.tx_dropped;
+			dev_kfree_skb_any(skb_dequeue(&mcast->pkt_queue));
+		}
+		netif_tx_unlock_bh(dev);
+
+		/* Clear the busy flag so we try again */
+		status = test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY,
+					    &mcast->flags);
+	}
+	return status;
+}
+
+static int ipoib_mcast_sendonly_join(struct ipoib_mcast *mcast)
+{
+	struct net_device *dev = mcast->dev;
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ib_sa_mcmember_rec rec = {
+#if 0				/* Some SMs don't support send-only yet */
+		.join_state = 4
+#else
+		.join_state = 1
+#endif
+	};
+	int ret = 0;
+
+	if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) {
+		ipoib_dbg_mcast(priv, "device shutting down, no multicast joins\n");
+		return -ENODEV;
+	}
+
+	if (test_and_set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) {
+		ipoib_dbg_mcast(priv, "multicast entry busy, skipping\n");
+		return -EBUSY;
+	}
+
+	rec.mgid     = mcast->mcmember.mgid;
+	rec.port_gid = priv->local_gid;
+	rec.pkey     = cpu_to_be16(priv->pkey);
+
+	mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca,
+					 priv->port, &rec,
+					 IB_SA_MCMEMBER_REC_MGID	|
+					 IB_SA_MCMEMBER_REC_PORT_GID	|
+					 IB_SA_MCMEMBER_REC_PKEY	|
+					 IB_SA_MCMEMBER_REC_JOIN_STATE,
+					 GFP_ATOMIC,
+					 ipoib_mcast_sendonly_join_complete,
+					 mcast);
+	if (IS_ERR(mcast->mc)) {
+		ret = PTR_ERR(mcast->mc);
+		clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
+		ipoib_warn(priv, "ib_sa_join_multicast failed (ret = %d)\n",
+			   ret);
+	} else {
+		ipoib_dbg_mcast(priv, "no multicast record for %pI6, starting join\n",
+				mcast->mcmember.mgid.raw);
+	}
+
+	return ret;
+}
+
+void ipoib_mcast_carrier_on_task(struct work_struct *work)
+{
+	struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
+						   carrier_on_task);
+	struct ib_port_attr attr;
+
+	/*
+	 * Take rtnl_lock to avoid racing with ipoib_stop() and
+	 * turning the carrier back on while a device is being
+	 * removed.
+	 */
+	if (ib_query_port(priv->ca, priv->port, &attr) ||
+	    attr.state != IB_PORT_ACTIVE) {
+		ipoib_dbg(priv, "Keeping carrier off until IB port is active\n");
+		return;
+	}
+
+	rtnl_lock();
+	netif_carrier_on(priv->dev);
+	rtnl_unlock();
+}
+
+static int ipoib_mcast_join_complete(int status,
+				     struct ib_sa_multicast *multicast)
+{
+	struct ipoib_mcast *mcast = multicast->context;
+	struct net_device *dev = mcast->dev;
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	ipoib_dbg_mcast(priv, "join completion for %pI6 (status %d)\n",
+			mcast->mcmember.mgid.raw, status);
+
+	/* We trap for port events ourselves. */
+	if (status == -ENETRESET) {
+		status = 0;
+		goto out;
+	}
+
+	if (!status)
+		status = ipoib_mcast_join_finish(mcast, &multicast->rec);
+
+	if (!status) {
+		mcast->backoff = 1;
+		mutex_lock(&mcast_mutex);
+		if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
+			queue_delayed_work(ipoib_workqueue,
+					   &priv->mcast_task, 0);
+		mutex_unlock(&mcast_mutex);
+
+		/*
+		 * Defer carrier on work to ipoib_workqueue to avoid a
+		 * deadlock on rtnl_lock here.
+		 */
+		if (mcast == priv->broadcast)
+			queue_work(ipoib_workqueue, &priv->carrier_on_task);
+
+		status = 0;
+		goto out;
+	}
+
+	if (mcast->logcount++ < 20) {
+		if (status == -ETIMEDOUT || status == -EAGAIN) {
+			ipoib_dbg_mcast(priv, "multicast join failed for %pI6, status %d\n",
+					mcast->mcmember.mgid.raw, status);
+		} else {
+			ipoib_warn(priv, "multicast join failed for %pI6, status %d\n",
+				   mcast->mcmember.mgid.raw, status);
+		}
+	}
+
+	mcast->backoff *= 2;
+	if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS)
+		mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS;
+
+	/* Clear the busy flag so we try again */
+	status = test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
+
+	mutex_lock(&mcast_mutex);
+	spin_lock_irq(&priv->lock);
+	if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
+		queue_delayed_work(ipoib_workqueue, &priv->mcast_task,
+				   mcast->backoff * HZ);
+	spin_unlock_irq(&priv->lock);
+	mutex_unlock(&mcast_mutex);
+out:
+	complete(&mcast->done);
+	return status;
+}
+
+static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast,
+			     int create)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ib_sa_mcmember_rec rec = {
+		.join_state = 1
+	};
+	ib_sa_comp_mask comp_mask;
+	int ret = 0;
+
+	ipoib_dbg_mcast(priv, "joining MGID %pI6\n", mcast->mcmember.mgid.raw);
+
+	rec.mgid     = mcast->mcmember.mgid;
+	rec.port_gid = priv->local_gid;
+	rec.pkey     = cpu_to_be16(priv->pkey);
+
+	comp_mask =
+		IB_SA_MCMEMBER_REC_MGID		|
+		IB_SA_MCMEMBER_REC_PORT_GID	|
+		IB_SA_MCMEMBER_REC_PKEY		|
+		IB_SA_MCMEMBER_REC_JOIN_STATE;
+
+	if (create) {
+		comp_mask |=
+			IB_SA_MCMEMBER_REC_QKEY			|
+			IB_SA_MCMEMBER_REC_MTU_SELECTOR		|
+			IB_SA_MCMEMBER_REC_MTU			|
+			IB_SA_MCMEMBER_REC_TRAFFIC_CLASS	|
+			IB_SA_MCMEMBER_REC_RATE_SELECTOR	|
+			IB_SA_MCMEMBER_REC_RATE			|
+			IB_SA_MCMEMBER_REC_SL			|
+			IB_SA_MCMEMBER_REC_FLOW_LABEL		|
+			IB_SA_MCMEMBER_REC_HOP_LIMIT;
+
+		rec.qkey	  = priv->broadcast->mcmember.qkey;
+		rec.mtu_selector  = IB_SA_EQ;
+		rec.mtu		  = priv->broadcast->mcmember.mtu;
+		rec.traffic_class = priv->broadcast->mcmember.traffic_class;
+		rec.rate_selector = IB_SA_EQ;
+		rec.rate	  = priv->broadcast->mcmember.rate;
+		rec.sl		  = priv->broadcast->mcmember.sl;
+		rec.flow_label	  = priv->broadcast->mcmember.flow_label;
+		rec.hop_limit	  = priv->broadcast->mcmember.hop_limit;
+	}
+
+	set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
+	init_completion(&mcast->done);
+	set_bit(IPOIB_MCAST_JOIN_STARTED, &mcast->flags);
+
+	mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, priv->port,
+					 &rec, comp_mask, GFP_KERNEL,
+					 ipoib_mcast_join_complete, mcast);
+	if (IS_ERR(mcast->mc)) {
+		clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
+		complete(&mcast->done);
+		ret = PTR_ERR(mcast->mc);
+		ipoib_warn(priv, "ib_sa_join_multicast failed, status %d\n", ret);
+
+		mcast->backoff *= 2;
+		if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS)
+			mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS;
+
+		mutex_lock(&mcast_mutex);
+		if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
+			queue_delayed_work(ipoib_workqueue,
+					   &priv->mcast_task,
+					   mcast->backoff * HZ);
+		mutex_unlock(&mcast_mutex);
+	}
+}
+
+void ipoib_mcast_join_task(struct work_struct *work)
+{
+	struct ipoib_dev_priv *priv =
+		container_of(work, struct ipoib_dev_priv, mcast_task.work);
+	struct net_device *dev = priv->dev;
+	struct ib_port_attr port_attr;
+
+	if (!test_bit(IPOIB_MCAST_RUN, &priv->flags))
+		return;
+
+	if (ib_query_port(priv->ca, priv->port, &port_attr) ||
+	    port_attr.state != IB_PORT_ACTIVE) {
+		ipoib_dbg(priv, "port state is not ACTIVE (state = %d) suspending join task\n",
+			  port_attr.state);
+		return;
+	}
+	priv->local_lid = port_attr.lid;
+
+	if (ib_query_gid(priv->ca, priv->port, 0, &priv->local_gid))
+		ipoib_warn(priv, "ib_query_gid() failed\n");
+	else
+		memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid));
+
+	if (!priv->broadcast) {
+		struct ipoib_mcast *broadcast;
+
+		if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
+			return;
+
+		broadcast = ipoib_mcast_alloc(dev, 1);
+		if (!broadcast) {
+			ipoib_warn(priv, "failed to allocate broadcast group\n");
+			mutex_lock(&mcast_mutex);
+			if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
+				queue_delayed_work(ipoib_workqueue,
+						   &priv->mcast_task, HZ);
+			mutex_unlock(&mcast_mutex);
+			return;
+		}
+
+		spin_lock_irq(&priv->lock);
+		memcpy(broadcast->mcmember.mgid.raw, priv->dev->broadcast + 4,
+		       sizeof (union ib_gid));
+		priv->broadcast = broadcast;
+
+		__ipoib_mcast_add(dev, priv->broadcast);
+		spin_unlock_irq(&priv->lock);
+	}
+
+	if (!test_bit(IPOIB_MCAST_FLAG_ATTACHED, &priv->broadcast->flags)) {
+		if (!test_bit(IPOIB_MCAST_FLAG_BUSY, &priv->broadcast->flags))
+			ipoib_mcast_join(dev, priv->broadcast, 0);
+		return;
+	}
+
+	while (1) {
+		struct ipoib_mcast *mcast = NULL;
+
+		spin_lock_irq(&priv->lock);
+		list_for_each_entry(mcast, &priv->multicast_list, list) {
+			if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)
+			    && !test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)
+			    && !test_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) {
+				/* Found the next unjoined group */
+				break;
+			}
+		}
+		spin_unlock_irq(&priv->lock);
+
+		if (&mcast->list == &priv->multicast_list) {
+			/* All done */
+			break;
+		}
+
+		ipoib_mcast_join(dev, mcast, 1);
+		return;
+	}
+
+	ipoib_dbg_mcast(priv, "successfully joined all multicast groups\n");
+
+	clear_bit(IPOIB_MCAST_RUN, &priv->flags);
+}
+
+int ipoib_mcast_start_thread(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	ipoib_dbg_mcast(priv, "starting multicast thread\n");
+
+	mutex_lock(&mcast_mutex);
+	if (!test_and_set_bit(IPOIB_MCAST_RUN, &priv->flags))
+		queue_delayed_work(ipoib_workqueue, &priv->mcast_task, 0);
+	mutex_unlock(&mcast_mutex);
+
+	return 0;
+}
+
+int ipoib_mcast_stop_thread(struct net_device *dev, int flush)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	ipoib_dbg_mcast(priv, "stopping multicast thread\n");
+
+	mutex_lock(&mcast_mutex);
+	clear_bit(IPOIB_MCAST_RUN, &priv->flags);
+	cancel_delayed_work(&priv->mcast_task);
+	mutex_unlock(&mcast_mutex);
+
+	if (flush)
+		flush_workqueue(ipoib_workqueue);
+
+	return 0;
+}
+
+static int ipoib_mcast_leave(struct net_device *dev, struct ipoib_mcast *mcast)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	int ret = 0;
+
+	if (test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags))
+		ib_sa_free_multicast(mcast->mc);
+
+	if (test_and_clear_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) {
+		ipoib_dbg_mcast(priv, "leaving MGID %pI6\n",
+				mcast->mcmember.mgid.raw);
+
+		/* Remove ourselves from the multicast group */
+		ret = ib_detach_mcast(priv->qp, &mcast->mcmember.mgid,
+				      be16_to_cpu(mcast->mcmember.mlid));
+		if (ret)
+			ipoib_warn(priv, "ib_detach_mcast failed (result = %d)\n", ret);
+	}
+
+	return 0;
+}
+
+void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ipoib_mcast *mcast;
+	unsigned long flags;
+	void *mgid = daddr + 4;
+
+	spin_lock_irqsave(&priv->lock, flags);
+
+	if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)		||
+	    !priv->broadcast					||
+	    !test_bit(IPOIB_MCAST_FLAG_ATTACHED, &priv->broadcast->flags)) {
+		++dev->stats.tx_dropped;
+		dev_kfree_skb_any(skb);
+		goto unlock;
+	}
+
+	mcast = __ipoib_mcast_find(dev, mgid);
+	if (!mcast) {
+		/* Let's create a new send only group now */
+		ipoib_dbg_mcast(priv, "setting up send only multicast group for %pI6\n",
+				mgid);
+
+		mcast = ipoib_mcast_alloc(dev, 0);
+		if (!mcast) {
+			ipoib_warn(priv, "unable to allocate memory for "
+				   "multicast structure\n");
+			++dev->stats.tx_dropped;
+			dev_kfree_skb_any(skb);
+			goto out;
+		}
+
+		set_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags);
+		memcpy(mcast->mcmember.mgid.raw, mgid, sizeof (union ib_gid));
+		__ipoib_mcast_add(dev, mcast);
+		list_add_tail(&mcast->list, &priv->multicast_list);
+	}
+
+	if (!mcast->ah) {
+		if (skb_queue_len(&mcast->pkt_queue) < IPOIB_MAX_MCAST_QUEUE)
+			skb_queue_tail(&mcast->pkt_queue, skb);
+		else {
+			++dev->stats.tx_dropped;
+			dev_kfree_skb_any(skb);
+		}
+
+		if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags))
+			ipoib_dbg_mcast(priv, "no address vector, "
+					"but multicast join already started\n");
+		else if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags))
+			ipoib_mcast_sendonly_join(mcast);
+
+		/*
+		 * If lookup completes between here and out:, don't
+		 * want to send packet twice.
+		 */
+		mcast = NULL;
+	}
+
+out:
+	if (mcast && mcast->ah) {
+		struct ipoib_neigh *neigh;
+
+		spin_unlock_irqrestore(&priv->lock, flags);
+		neigh = ipoib_neigh_get(dev, daddr);
+		spin_lock_irqsave(&priv->lock, flags);
+		if (!neigh) {
+			neigh = ipoib_neigh_alloc(daddr, dev);
+			if (neigh) {
+				kref_get(&mcast->ah->ref);
+				neigh->ah	= mcast->ah;
+				list_add_tail(&neigh->list, &mcast->neigh_list);
+			}
+		}
+		spin_unlock_irqrestore(&priv->lock, flags);
+		ipoib_send(dev, skb, mcast->ah, IB_MULTICAST_QPN);
+		if (neigh)
+			ipoib_neigh_put(neigh);
+		return;
+	}
+
+unlock:
+	spin_unlock_irqrestore(&priv->lock, flags);
+}
+
+void ipoib_mcast_dev_flush(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	LIST_HEAD(remove_list);
+	struct ipoib_mcast *mcast, *tmcast;
+	unsigned long flags;
+
+	ipoib_dbg_mcast(priv, "flushing multicast list\n");
+
+	spin_lock_irqsave(&priv->lock, flags);
+
+	list_for_each_entry_safe(mcast, tmcast, &priv->multicast_list, list) {
+		list_del(&mcast->list);
+		rb_erase(&mcast->rb_node, &priv->multicast_tree);
+		list_add_tail(&mcast->list, &remove_list);
+	}
+
+	if (priv->broadcast) {
+		rb_erase(&priv->broadcast->rb_node, &priv->multicast_tree);
+		list_add_tail(&priv->broadcast->list, &remove_list);
+		priv->broadcast = NULL;
+	}
+
+	spin_unlock_irqrestore(&priv->lock, flags);
+
+	/* seperate between the wait to the leave*/
+	list_for_each_entry_safe(mcast, tmcast, &remove_list, list)
+		if (test_bit(IPOIB_MCAST_JOIN_STARTED, &mcast->flags))
+			wait_for_completion(&mcast->done);
+
+	list_for_each_entry_safe(mcast, tmcast, &remove_list, list) {
+		ipoib_mcast_leave(dev, mcast);
+		ipoib_mcast_free(mcast);
+	}
+}
+
+static int ipoib_mcast_addr_is_valid(const u8 *addr, const u8 *broadcast)
+{
+	/* reserved QPN, prefix, scope */
+	if (memcmp(addr, broadcast, 6))
+		return 0;
+	/* signature lower, pkey */
+	if (memcmp(addr + 7, broadcast + 7, 3))
+		return 0;
+	return 1;
+}
+
+void ipoib_mcast_restart_task(struct work_struct *work)
+{
+	struct ipoib_dev_priv *priv =
+		container_of(work, struct ipoib_dev_priv, restart_task);
+	struct net_device *dev = priv->dev;
+	struct netdev_hw_addr *ha;
+	struct ipoib_mcast *mcast, *tmcast;
+	LIST_HEAD(remove_list);
+	unsigned long flags;
+	struct ib_sa_mcmember_rec rec;
+
+	ipoib_dbg_mcast(priv, "restarting multicast task\n");
+
+	ipoib_mcast_stop_thread(dev, 0);
+
+	local_irq_save(flags);
+	netif_addr_lock(dev);
+	spin_lock(&priv->lock);
+
+	/*
+	 * Unfortunately, the networking core only gives us a list of all of
+	 * the multicast hardware addresses. We need to figure out which ones
+	 * are new and which ones have been removed
+	 */
+
+	/* Clear out the found flag */
+	list_for_each_entry(mcast, &priv->multicast_list, list)
+		clear_bit(IPOIB_MCAST_FLAG_FOUND, &mcast->flags);
+
+	/* Mark all of the entries that are found or don't exist */
+	netdev_for_each_mc_addr(ha, dev) {
+		union ib_gid mgid;
+
+		if (!ipoib_mcast_addr_is_valid(ha->addr, dev->broadcast))
+			continue;
+
+		memcpy(mgid.raw, ha->addr + 4, sizeof mgid);
+
+		mcast = __ipoib_mcast_find(dev, &mgid);
+		if (!mcast || test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) {
+			struct ipoib_mcast *nmcast;
+
+			/* ignore group which is directly joined by userspace */
+			if (test_bit(IPOIB_FLAG_UMCAST, &priv->flags) &&
+			    !ib_sa_get_mcmember_rec(priv->ca, priv->port, &mgid, &rec)) {
+				ipoib_dbg_mcast(priv, "ignoring multicast entry for mgid %pI6\n",
+						mgid.raw);
+				continue;
+			}
+
+			/* Not found or send-only group, let's add a new entry */
+			ipoib_dbg_mcast(priv, "adding multicast entry for mgid %pI6\n",
+					mgid.raw);
+
+			nmcast = ipoib_mcast_alloc(dev, 0);
+			if (!nmcast) {
+				ipoib_warn(priv, "unable to allocate memory for multicast structure\n");
+				continue;
+			}
+
+			set_bit(IPOIB_MCAST_FLAG_FOUND, &nmcast->flags);
+
+			nmcast->mcmember.mgid = mgid;
+
+			if (mcast) {
+				/* Destroy the send only entry */
+				list_move_tail(&mcast->list, &remove_list);
+
+				rb_replace_node(&mcast->rb_node,
+						&nmcast->rb_node,
+						&priv->multicast_tree);
+			} else
+				__ipoib_mcast_add(dev, nmcast);
+
+			list_add_tail(&nmcast->list, &priv->multicast_list);
+		}
+
+		if (mcast)
+			set_bit(IPOIB_MCAST_FLAG_FOUND, &mcast->flags);
+	}
+
+	/* Remove all of the entries don't exist anymore */
+	list_for_each_entry_safe(mcast, tmcast, &priv->multicast_list, list) {
+		if (!test_bit(IPOIB_MCAST_FLAG_FOUND, &mcast->flags) &&
+		    !test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) {
+			ipoib_dbg_mcast(priv, "deleting multicast group %pI6\n",
+					mcast->mcmember.mgid.raw);
+
+			rb_erase(&mcast->rb_node, &priv->multicast_tree);
+
+			/* Move to the remove list */
+			list_move_tail(&mcast->list, &remove_list);
+		}
+	}
+
+	spin_unlock(&priv->lock);
+	netif_addr_unlock(dev);
+	local_irq_restore(flags);
+
+	/* We have to cancel outside of the spinlock */
+	list_for_each_entry_safe(mcast, tmcast, &remove_list, list) {
+		ipoib_mcast_leave(mcast->dev, mcast);
+		ipoib_mcast_free(mcast);
+	}
+
+	if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
+		ipoib_mcast_start_thread(dev);
+}
+
+#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
+
+struct ipoib_mcast_iter *ipoib_mcast_iter_init(struct net_device *dev)
+{
+	struct ipoib_mcast_iter *iter;
+
+	iter = kmalloc(sizeof *iter, GFP_KERNEL);
+	if (!iter)
+		return NULL;
+
+	iter->dev = dev;
+	memset(iter->mgid.raw, 0, 16);
+
+	if (ipoib_mcast_iter_next(iter)) {
+		kfree(iter);
+		return NULL;
+	}
+
+	return iter;
+}
+
+int ipoib_mcast_iter_next(struct ipoib_mcast_iter *iter)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(iter->dev);
+	struct rb_node *n;
+	struct ipoib_mcast *mcast;
+	int ret = 1;
+
+	spin_lock_irq(&priv->lock);
+
+	n = rb_first(&priv->multicast_tree);
+
+	while (n) {
+		mcast = rb_entry(n, struct ipoib_mcast, rb_node);
+
+		if (memcmp(iter->mgid.raw, mcast->mcmember.mgid.raw,
+			   sizeof (union ib_gid)) < 0) {
+			iter->mgid      = mcast->mcmember.mgid;
+			iter->created   = mcast->created;
+			iter->queuelen  = skb_queue_len(&mcast->pkt_queue);
+			iter->complete  = !!mcast->ah;
+			iter->send_only = !!(mcast->flags & (1 << IPOIB_MCAST_FLAG_SENDONLY));
+
+			ret = 0;
+
+			break;
+		}
+
+		n = rb_next(n);
+	}
+
+	spin_unlock_irq(&priv->lock);
+
+	return ret;
+}
+
+void ipoib_mcast_iter_read(struct ipoib_mcast_iter *iter,
+			   union ib_gid *mgid,
+			   unsigned long *created,
+			   unsigned int *queuelen,
+			   unsigned int *complete,
+			   unsigned int *send_only)
+{
+	*mgid      = iter->mgid;
+	*created   = iter->created;
+	*queuelen  = iter->queuelen;
+	*complete  = iter->complete;
+	*send_only = iter->send_only;
+}
+
+#endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_netlink.c b/drivers/infiniband/ulp/ipoib/ipoib_netlink.c
new file mode 100644
index 0000000..cdc7df4
--- /dev/null
+++ b/drivers/infiniband/ulp/ipoib/ipoib_netlink.c
@@ -0,0 +1,182 @@
+/*
+ * Copyright (c) 2012 Mellanox Technologies. -  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>      /* For ARPHRD_xxx */
+#include <linux/module.h>
+#include <net/rtnetlink.h>
+#include "ipoib.h"
+
+static const struct nla_policy ipoib_policy[IFLA_IPOIB_MAX + 1] = {
+	[IFLA_IPOIB_PKEY]	= { .type = NLA_U16 },
+	[IFLA_IPOIB_MODE]	= { .type = NLA_U16 },
+	[IFLA_IPOIB_UMCAST]	= { .type = NLA_U16 },
+};
+
+static int ipoib_fill_info(struct sk_buff *skb, const struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	u16 val;
+
+	if (nla_put_u16(skb, IFLA_IPOIB_PKEY, priv->pkey))
+		goto nla_put_failure;
+
+	val = test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
+	if (nla_put_u16(skb, IFLA_IPOIB_MODE, val))
+		goto nla_put_failure;
+
+	val = test_bit(IPOIB_FLAG_UMCAST, &priv->flags);
+	if (nla_put_u16(skb, IFLA_IPOIB_UMCAST, val))
+		goto nla_put_failure;
+
+	return 0;
+
+nla_put_failure:
+	return -EMSGSIZE;
+}
+
+static int ipoib_changelink(struct net_device *dev,
+			    struct nlattr *tb[], struct nlattr *data[])
+{
+	u16 mode, umcast;
+	int ret = 0;
+
+	if (data[IFLA_IPOIB_MODE]) {
+		mode  = nla_get_u16(data[IFLA_IPOIB_MODE]);
+		if (mode == IPOIB_MODE_DATAGRAM)
+			ret = ipoib_set_mode(dev, "datagram\n");
+		else if (mode == IPOIB_MODE_CONNECTED)
+			ret = ipoib_set_mode(dev, "connected\n");
+		else
+			ret = -EINVAL;
+
+		if (ret < 0)
+			goto out_err;
+	}
+
+	if (data[IFLA_IPOIB_UMCAST]) {
+		umcast = nla_get_u16(data[IFLA_IPOIB_UMCAST]);
+		ipoib_set_umcast(dev, umcast);
+	}
+
+out_err:
+	return ret;
+}
+
+static int ipoib_new_child_link(struct net *src_net, struct net_device *dev,
+			       struct nlattr *tb[], struct nlattr *data[])
+{
+	struct net_device *pdev;
+	struct ipoib_dev_priv *ppriv;
+	u16 child_pkey;
+	int err;
+
+	if (!tb[IFLA_LINK])
+		return -EINVAL;
+
+	pdev = __dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK]));
+	if (!pdev || pdev->type != ARPHRD_INFINIBAND)
+		return -ENODEV;
+
+	ppriv = netdev_priv(pdev);
+
+	if (test_bit(IPOIB_FLAG_SUBINTERFACE, &ppriv->flags)) {
+		ipoib_warn(ppriv, "child creation disallowed for child devices\n");
+		return -EINVAL;
+	}
+
+	if (!data || !data[IFLA_IPOIB_PKEY]) {
+		ipoib_dbg(ppriv, "no pkey specified, using parent pkey\n");
+		child_pkey  = ppriv->pkey;
+	} else
+		child_pkey  = nla_get_u16(data[IFLA_IPOIB_PKEY]);
+
+	if (child_pkey == 0 || child_pkey == 0x8000)
+		return -EINVAL;
+
+	/*
+	 * Set the full membership bit, so that we join the right
+	 * broadcast group, etc.
+	 */
+	child_pkey |= 0x8000;
+
+	err = __ipoib_vlan_add(ppriv, netdev_priv(dev), child_pkey, IPOIB_RTNL_CHILD);
+
+	if (!err && data)
+		err = ipoib_changelink(dev, tb, data);
+	return err;
+}
+
+static void ipoib_unregister_child_dev(struct net_device *dev, struct list_head *head)
+{
+	struct ipoib_dev_priv *priv, *ppriv;
+
+	priv = netdev_priv(dev);
+	ppriv = netdev_priv(priv->parent);
+
+	down_write(&ppriv->vlan_rwsem);
+	unregister_netdevice_queue(dev, head);
+	list_del(&priv->list);
+	up_write(&ppriv->vlan_rwsem);
+}
+
+static size_t ipoib_get_size(const struct net_device *dev)
+{
+	return nla_total_size(2) +	/* IFLA_IPOIB_PKEY   */
+		nla_total_size(2) +	/* IFLA_IPOIB_MODE   */
+		nla_total_size(2);	/* IFLA_IPOIB_UMCAST */
+}
+
+static struct rtnl_link_ops ipoib_link_ops __read_mostly = {
+	.kind		= "ipoib",
+	.maxtype	= IFLA_IPOIB_MAX,
+	.policy		= ipoib_policy,
+	.priv_size	= sizeof(struct ipoib_dev_priv),
+	.setup		= ipoib_setup,
+	.newlink	= ipoib_new_child_link,
+	.changelink	= ipoib_changelink,
+	.dellink	= ipoib_unregister_child_dev,
+	.get_size	= ipoib_get_size,
+	.fill_info	= ipoib_fill_info,
+};
+
+int __init ipoib_netlink_init(void)
+{
+	return rtnl_link_register(&ipoib_link_ops);
+}
+
+void __exit ipoib_netlink_fini(void)
+{
+	rtnl_link_unregister(&ipoib_link_ops);
+}
+
+MODULE_ALIAS_RTNL_LINK("ipoib");
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
new file mode 100644
index 0000000..c56d5d4
--- /dev/null
+++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
@@ -0,0 +1,297 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/slab.h>
+
+#include "ipoib.h"
+
+int ipoib_mcast_attach(struct net_device *dev, u16 mlid, union ib_gid *mgid, int set_qkey)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ib_qp_attr *qp_attr = NULL;
+	int ret;
+	u16 pkey_index;
+
+	if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &pkey_index)) {
+		clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+		ret = -ENXIO;
+		goto out;
+	}
+	set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+
+	if (set_qkey) {
+		ret = -ENOMEM;
+		qp_attr = kmalloc(sizeof *qp_attr, GFP_KERNEL);
+		if (!qp_attr)
+			goto out;
+
+		/* set correct QKey for QP */
+		qp_attr->qkey = priv->qkey;
+		ret = ib_modify_qp(priv->qp, qp_attr, IB_QP_QKEY);
+		if (ret) {
+			ipoib_warn(priv, "failed to modify QP, ret = %d\n", ret);
+			goto out;
+		}
+	}
+
+	/* attach QP to multicast group */
+	ret = ib_attach_mcast(priv->qp, mgid, mlid);
+	if (ret)
+		ipoib_warn(priv, "failed to attach to multicast group, ret = %d\n", ret);
+
+out:
+	kfree(qp_attr);
+	return ret;
+}
+
+int ipoib_init_qp(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	int ret;
+	struct ib_qp_attr qp_attr;
+	int attr_mask;
+
+	if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags))
+		return -1;
+
+	qp_attr.qp_state = IB_QPS_INIT;
+	qp_attr.qkey = 0;
+	qp_attr.port_num = priv->port;
+	qp_attr.pkey_index = priv->pkey_index;
+	attr_mask =
+	    IB_QP_QKEY |
+	    IB_QP_PORT |
+	    IB_QP_PKEY_INDEX |
+	    IB_QP_STATE;
+	ret = ib_modify_qp(priv->qp, &qp_attr, attr_mask);
+	if (ret) {
+		ipoib_warn(priv, "failed to modify QP to init, ret = %d\n", ret);
+		goto out_fail;
+	}
+
+	qp_attr.qp_state = IB_QPS_RTR;
+	/* Can't set this in a INIT->RTR transition */
+	attr_mask &= ~IB_QP_PORT;
+	ret = ib_modify_qp(priv->qp, &qp_attr, attr_mask);
+	if (ret) {
+		ipoib_warn(priv, "failed to modify QP to RTR, ret = %d\n", ret);
+		goto out_fail;
+	}
+
+	qp_attr.qp_state = IB_QPS_RTS;
+	qp_attr.sq_psn = 0;
+	attr_mask |= IB_QP_SQ_PSN;
+	attr_mask &= ~IB_QP_PKEY_INDEX;
+	ret = ib_modify_qp(priv->qp, &qp_attr, attr_mask);
+	if (ret) {
+		ipoib_warn(priv, "failed to modify QP to RTS, ret = %d\n", ret);
+		goto out_fail;
+	}
+
+	return 0;
+
+out_fail:
+	qp_attr.qp_state = IB_QPS_RESET;
+	if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE))
+		ipoib_warn(priv, "Failed to modify QP to RESET state\n");
+
+	return ret;
+}
+
+int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ib_qp_init_attr init_attr = {
+		.cap = {
+			.max_send_wr  = ipoib_sendq_size,
+			.max_recv_wr  = ipoib_recvq_size,
+			.max_send_sge = 1,
+			.max_recv_sge = IPOIB_UD_RX_SG
+		},
+		.sq_sig_type = IB_SIGNAL_ALL_WR,
+		.qp_type     = IB_QPT_UD
+	};
+
+	int ret, size;
+	int i;
+
+	priv->pd = ib_alloc_pd(priv->ca);
+	if (IS_ERR(priv->pd)) {
+		printk(KERN_WARNING "%s: failed to allocate PD\n", ca->name);
+		return -ENODEV;
+	}
+
+	priv->mr = ib_get_dma_mr(priv->pd, IB_ACCESS_LOCAL_WRITE);
+	if (IS_ERR(priv->mr)) {
+		printk(KERN_WARNING "%s: ib_get_dma_mr failed\n", ca->name);
+		goto out_free_pd;
+	}
+
+	size = ipoib_recvq_size + 1;
+	ret = ipoib_cm_dev_init(dev);
+	if (!ret) {
+		size += ipoib_sendq_size;
+		if (ipoib_cm_has_srq(dev))
+			size += ipoib_recvq_size + 1; /* 1 extra for rx_drain_qp */
+		else
+			size += ipoib_recvq_size * ipoib_max_conn_qp;
+	}
+
+	priv->recv_cq = ib_create_cq(priv->ca, ipoib_ib_completion, NULL, dev, size, 0);
+	if (IS_ERR(priv->recv_cq)) {
+		printk(KERN_WARNING "%s: failed to create receive CQ\n", ca->name);
+		goto out_free_mr;
+	}
+
+	priv->send_cq = ib_create_cq(priv->ca, ipoib_send_comp_handler, NULL,
+				     dev, ipoib_sendq_size, 0);
+	if (IS_ERR(priv->send_cq)) {
+		printk(KERN_WARNING "%s: failed to create send CQ\n", ca->name);
+		goto out_free_recv_cq;
+	}
+
+	if (ib_req_notify_cq(priv->recv_cq, IB_CQ_NEXT_COMP))
+		goto out_free_send_cq;
+
+	init_attr.send_cq = priv->send_cq;
+	init_attr.recv_cq = priv->recv_cq;
+
+	if (priv->hca_caps & IB_DEVICE_UD_TSO)
+		init_attr.create_flags |= IB_QP_CREATE_IPOIB_UD_LSO;
+
+	if (priv->hca_caps & IB_DEVICE_BLOCK_MULTICAST_LOOPBACK)
+		init_attr.create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK;
+
+	if (priv->hca_caps & IB_DEVICE_MANAGED_FLOW_STEERING)
+		init_attr.create_flags |= IB_QP_CREATE_NETIF_QP;
+
+	if (dev->features & NETIF_F_SG)
+		init_attr.cap.max_send_sge = MAX_SKB_FRAGS + 1;
+
+	priv->qp = ib_create_qp(priv->pd, &init_attr);
+	if (IS_ERR(priv->qp)) {
+		printk(KERN_WARNING "%s: failed to create QP\n", ca->name);
+		goto out_free_send_cq;
+	}
+
+	priv->dev->dev_addr[1] = (priv->qp->qp_num >> 16) & 0xff;
+	priv->dev->dev_addr[2] = (priv->qp->qp_num >>  8) & 0xff;
+	priv->dev->dev_addr[3] = (priv->qp->qp_num      ) & 0xff;
+
+	for (i = 0; i < MAX_SKB_FRAGS + 1; ++i)
+		priv->tx_sge[i].lkey = priv->mr->lkey;
+
+	priv->tx_wr.opcode	= IB_WR_SEND;
+	priv->tx_wr.sg_list	= priv->tx_sge;
+	priv->tx_wr.send_flags	= IB_SEND_SIGNALED;
+
+	priv->rx_sge[0].lkey = priv->mr->lkey;
+	if (ipoib_ud_need_sg(priv->max_ib_mtu)) {
+		priv->rx_sge[0].length = IPOIB_UD_HEAD_SIZE;
+		priv->rx_sge[1].length = PAGE_SIZE;
+		priv->rx_sge[1].lkey = priv->mr->lkey;
+		priv->rx_wr.num_sge = IPOIB_UD_RX_SG;
+	} else {
+		priv->rx_sge[0].length = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu);
+		priv->rx_wr.num_sge = 1;
+	}
+	priv->rx_wr.next = NULL;
+	priv->rx_wr.sg_list = priv->rx_sge;
+
+	return 0;
+
+out_free_send_cq:
+	ib_destroy_cq(priv->send_cq);
+
+out_free_recv_cq:
+	ib_destroy_cq(priv->recv_cq);
+
+out_free_mr:
+	ib_dereg_mr(priv->mr);
+	ipoib_cm_dev_cleanup(dev);
+
+out_free_pd:
+	ib_dealloc_pd(priv->pd);
+	return -ENODEV;
+}
+
+void ipoib_transport_dev_cleanup(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	if (priv->qp) {
+		if (ib_destroy_qp(priv->qp))
+			ipoib_warn(priv, "ib_qp_destroy failed\n");
+
+		priv->qp = NULL;
+		clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+	}
+
+	if (ib_destroy_cq(priv->send_cq))
+		ipoib_warn(priv, "ib_cq_destroy (send) failed\n");
+
+	if (ib_destroy_cq(priv->recv_cq))
+		ipoib_warn(priv, "ib_cq_destroy (recv) failed\n");
+
+	ipoib_cm_dev_cleanup(dev);
+
+	if (ib_dereg_mr(priv->mr))
+		ipoib_warn(priv, "ib_dereg_mr failed\n");
+
+	if (ib_dealloc_pd(priv->pd))
+		ipoib_warn(priv, "ib_dealloc_pd failed\n");
+}
+
+void ipoib_event(struct ib_event_handler *handler,
+		 struct ib_event *record)
+{
+	struct ipoib_dev_priv *priv =
+		container_of(handler, struct ipoib_dev_priv, event_handler);
+
+	if (record->element.port_num != priv->port)
+		return;
+
+	ipoib_dbg(priv, "Event %d on device %s port %d\n", record->event,
+		  record->device->name, record->element.port_num);
+
+	if (record->event == IB_EVENT_SM_CHANGE ||
+	    record->event == IB_EVENT_CLIENT_REREGISTER) {
+		queue_work(ipoib_workqueue, &priv->flush_light);
+	} else if (record->event == IB_EVENT_PORT_ERR ||
+		   record->event == IB_EVENT_PORT_ACTIVE ||
+		   record->event == IB_EVENT_LID_CHANGE) {
+		queue_work(ipoib_workqueue, &priv->flush_normal);
+	} else if (record->event == IB_EVENT_PKEY_CHANGE) {
+		queue_work(ipoib_workqueue, &priv->flush_heavy);
+	}
+}
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
new file mode 100644
index 0000000..9fad7b5
--- /dev/null
+++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
@@ -0,0 +1,209 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+
+#include <linux/init.h>
+#include <linux/seq_file.h>
+
+#include <asm/uaccess.h>
+
+#include "ipoib.h"
+
+static ssize_t show_parent(struct device *d, struct device_attribute *attr,
+			   char *buf)
+{
+	struct net_device *dev = to_net_dev(d);
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	return sprintf(buf, "%s\n", priv->parent->name);
+}
+static DEVICE_ATTR(parent, S_IRUGO, show_parent, NULL);
+
+int __ipoib_vlan_add(struct ipoib_dev_priv *ppriv, struct ipoib_dev_priv *priv,
+		     u16 pkey, int type)
+{
+	int result;
+
+	priv->max_ib_mtu = ppriv->max_ib_mtu;
+	/* MTU will be reset when mcast join happens */
+	priv->dev->mtu   = IPOIB_UD_MTU(priv->max_ib_mtu);
+	priv->mcast_mtu  = priv->admin_mtu = priv->dev->mtu;
+	set_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags);
+
+	result = ipoib_set_dev_features(priv, ppriv->ca);
+	if (result)
+		goto err;
+
+	priv->pkey = pkey;
+
+	memcpy(priv->dev->dev_addr, ppriv->dev->dev_addr, INFINIBAND_ALEN);
+	priv->dev->broadcast[8] = pkey >> 8;
+	priv->dev->broadcast[9] = pkey & 0xff;
+
+	result = ipoib_dev_init(priv->dev, ppriv->ca, ppriv->port);
+	if (result < 0) {
+		ipoib_warn(ppriv, "failed to initialize subinterface: "
+			   "device %s, port %d",
+			   ppriv->ca->name, ppriv->port);
+		goto err;
+	}
+
+	result = register_netdevice(priv->dev);
+	if (result) {
+		ipoib_warn(priv, "failed to initialize; error %i", result);
+		goto register_failed;
+	}
+
+	priv->parent = ppriv->dev;
+
+	ipoib_create_debug_files(priv->dev);
+
+	/* RTNL childs don't need proprietary sysfs entries */
+	if (type == IPOIB_LEGACY_CHILD) {
+		if (ipoib_cm_add_mode_attr(priv->dev))
+			goto sysfs_failed;
+		if (ipoib_add_pkey_attr(priv->dev))
+			goto sysfs_failed;
+		if (ipoib_add_umcast_attr(priv->dev))
+			goto sysfs_failed;
+
+		if (device_create_file(&priv->dev->dev, &dev_attr_parent))
+			goto sysfs_failed;
+	}
+
+	priv->child_type  = type;
+	priv->dev->iflink = ppriv->dev->ifindex;
+	list_add_tail(&priv->list, &ppriv->child_intfs);
+
+	return 0;
+
+sysfs_failed:
+	result = -ENOMEM;
+	ipoib_delete_debug_files(priv->dev);
+	unregister_netdevice(priv->dev);
+
+register_failed:
+	ipoib_dev_cleanup(priv->dev);
+
+err:
+	return result;
+}
+
+int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey)
+{
+	struct ipoib_dev_priv *ppriv, *priv;
+	char intf_name[IFNAMSIZ];
+	struct ipoib_dev_priv *tpriv;
+	int result;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	ppriv = netdev_priv(pdev);
+
+	snprintf(intf_name, sizeof intf_name, "%s.%04x",
+		 ppriv->dev->name, pkey);
+	priv = ipoib_intf_alloc(intf_name);
+	if (!priv)
+		return -ENOMEM;
+
+	if (!rtnl_trylock())
+		return restart_syscall();
+
+	down_write(&ppriv->vlan_rwsem);
+
+	/*
+	 * First ensure this isn't a duplicate. We check the parent device and
+	 * then all of the legacy child interfaces to make sure the Pkey
+	 * doesn't match.
+	 */
+	if (ppriv->pkey == pkey) {
+		result = -ENOTUNIQ;
+		goto out;
+	}
+
+	list_for_each_entry(tpriv, &ppriv->child_intfs, list) {
+		if (tpriv->pkey == pkey &&
+		    tpriv->child_type == IPOIB_LEGACY_CHILD) {
+			result = -ENOTUNIQ;
+			goto out;
+		}
+	}
+
+	result = __ipoib_vlan_add(ppriv, priv, pkey, IPOIB_LEGACY_CHILD);
+
+out:
+	up_write(&ppriv->vlan_rwsem);
+
+	if (result)
+		free_netdev(priv->dev);
+
+	rtnl_unlock();
+
+	return result;
+}
+
+int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey)
+{
+	struct ipoib_dev_priv *ppriv, *priv, *tpriv;
+	struct net_device *dev = NULL;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	ppriv = netdev_priv(pdev);
+
+	if (!rtnl_trylock())
+		return restart_syscall();
+
+	down_write(&ppriv->vlan_rwsem);
+	list_for_each_entry_safe(priv, tpriv, &ppriv->child_intfs, list) {
+		if (priv->pkey == pkey &&
+		    priv->child_type == IPOIB_LEGACY_CHILD) {
+			unregister_netdevice(priv->dev);
+			list_del(&priv->list);
+			dev = priv->dev;
+			break;
+		}
+	}
+	up_write(&ppriv->vlan_rwsem);
+
+	rtnl_unlock();
+
+	if (dev) {
+		free_netdev(dev);
+		return 0;
+	}
+
+	return -ENODEV;
+}
diff --git a/drivers/infiniband/ulp/iser/Kconfig b/drivers/infiniband/ulp/iser/Kconfig
new file mode 100644
index 0000000..d00af71
--- /dev/null
+++ b/drivers/infiniband/ulp/iser/Kconfig
@@ -0,0 +1,12 @@
+config INFINIBAND_ISER
+	tristate "iSCSI Extensions for RDMA (iSER)"
+	depends on SCSI && INET && INFINIBAND_ADDR_TRANS
+	select SCSI_ISCSI_ATTRS
+	---help---
+	  Support for the iSCSI Extensions for RDMA (iSER) Protocol
+          over InfiniBand. This allows you to access storage devices
+          that speak iSCSI over iSER over InfiniBand.
+
+	  The iSER protocol is defined by IETF.
+	  See <http://www.ietf.org/rfc/rfc5046.txt>
+	  and <http://members.infinibandta.org/kwspub/spec/Annex_iSER.PDF>
diff --git a/drivers/infiniband/ulp/iser/Makefile b/drivers/infiniband/ulp/iser/Makefile
new file mode 100644
index 0000000..fe6cd15
--- /dev/null
+++ b/drivers/infiniband/ulp/iser/Makefile
@@ -0,0 +1,4 @@
+obj-$(CONFIG_INFINIBAND_ISER)	+= ib_iser.o
+
+ib_iser-y			:= iser_verbs.o iser_initiator.o iser_memory.o \
+				   iscsi_iser.o
diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c
new file mode 100644
index 0000000..f42ab14
--- /dev/null
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.c
@@ -0,0 +1,1038 @@
+/*
+ * iSCSI Initiator over iSER Data-Path
+ *
+ * Copyright (C) 2004 Dmitry Yusupov
+ * Copyright (C) 2004 Alex Aizman
+ * Copyright (C) 2005 Mike Christie
+ * Copyright (c) 2005, 2006 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2013-2014 Mellanox Technologies. All rights reserved.
+ * maintained by openib-general@openib.org
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *	- Redistributions of source code must retain the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer.
+ *
+ *	- Redistributions in binary form must reproduce the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer in the documentation and/or other materials
+ *	  provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Credits:
+ *	Christoph Hellwig
+ *	FUJITA Tomonori
+ *	Arne Redlich
+ *	Zhenyu Wang
+ * Modified by:
+ *      Erez Zilber
+ */
+
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/hardirq.h>
+#include <linux/kfifo.h>
+#include <linux/blkdev.h>
+#include <linux/init.h>
+#include <linux/ioctl.h>
+#include <linux/cdev.h>
+#include <linux/in.h>
+#include <linux/net.h>
+#include <linux/scatterlist.h>
+#include <linux/delay.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+
+#include <net/sock.h>
+
+#include <asm/uaccess.h>
+
+#include <scsi/scsi_cmnd.h>
+#include <scsi/scsi_device.h>
+#include <scsi/scsi_eh.h>
+#include <scsi/scsi_tcq.h>
+#include <scsi/scsi_host.h>
+#include <scsi/scsi.h>
+#include <scsi/scsi_transport_iscsi.h>
+
+#include "iscsi_iser.h"
+
+static struct scsi_host_template iscsi_iser_sht;
+static struct iscsi_transport iscsi_iser_transport;
+static struct scsi_transport_template *iscsi_iser_scsi_transport;
+
+static unsigned int iscsi_max_lun = 512;
+module_param_named(max_lun, iscsi_max_lun, uint, S_IRUGO);
+
+int iser_debug_level = 0;
+bool iser_pi_enable = false;
+int iser_pi_guard = 1;
+
+MODULE_DESCRIPTION("iSER (iSCSI Extensions for RDMA) Datamover");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_AUTHOR("Alex Nezhinsky, Dan Bar Dov, Or Gerlitz");
+MODULE_VERSION(DRV_VER);
+
+module_param_named(debug_level, iser_debug_level, int, 0644);
+MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0 (default:disabled)");
+
+module_param_named(pi_enable, iser_pi_enable, bool, 0644);
+MODULE_PARM_DESC(pi_enable, "Enable T10-PI offload support (default:disabled)");
+
+module_param_named(pi_guard, iser_pi_guard, int, 0644);
+MODULE_PARM_DESC(pi_guard, "T10-PI guard_type, 0:CRC|1:IP_CSUM (default:IP_CSUM)");
+
+static struct workqueue_struct *release_wq;
+struct iser_global ig;
+
+/*
+ * iscsi_iser_recv() - Process a successfull recv completion
+ * @conn:         iscsi connection
+ * @hdr:          iscsi header
+ * @rx_data:      buffer containing receive data payload
+ * @rx_data_len:  length of rx_data
+ *
+ * Notes: In case of data length errors or iscsi PDU completion failures
+ *        this routine will signal iscsi layer of connection failure.
+ */
+void
+iscsi_iser_recv(struct iscsi_conn *conn, struct iscsi_hdr *hdr,
+		char *rx_data, int rx_data_len)
+{
+	int rc = 0;
+	int datalen;
+	int ahslen;
+
+	/* verify PDU length */
+	datalen = ntoh24(hdr->dlength);
+	if (datalen > rx_data_len || (datalen + 4) < rx_data_len) {
+		iser_err("wrong datalen %d (hdr), %d (IB)\n",
+			datalen, rx_data_len);
+		rc = ISCSI_ERR_DATALEN;
+		goto error;
+	}
+
+	if (datalen != rx_data_len)
+		iser_dbg("aligned datalen (%d) hdr, %d (IB)\n",
+			datalen, rx_data_len);
+
+	/* read AHS */
+	ahslen = hdr->hlength * 4;
+
+	rc = iscsi_complete_pdu(conn, hdr, rx_data, rx_data_len);
+	if (rc && rc != ISCSI_ERR_NO_SCSI_CMD)
+		goto error;
+
+	return;
+error:
+	iscsi_conn_failure(conn, rc);
+}
+
+/**
+ * iscsi_iser_pdu_alloc() - allocate an iscsi-iser PDU
+ * @task:     iscsi task
+ * @opcode:   iscsi command opcode
+ *
+ * Netes: This routine can't fail, just assign iscsi task
+ *        hdr and max hdr size.
+ */
+static int
+iscsi_iser_pdu_alloc(struct iscsi_task *task, uint8_t opcode)
+{
+	struct iscsi_iser_task *iser_task = task->dd_data;
+
+	task->hdr = (struct iscsi_hdr *)&iser_task->desc.iscsi_header;
+	task->hdr_max = sizeof(iser_task->desc.iscsi_header);
+
+	return 0;
+}
+
+int iser_initialize_task_headers(struct iscsi_task *task,
+						struct iser_tx_desc *tx_desc)
+{
+	struct iser_conn       *iser_conn   = task->conn->dd_data;
+	struct iser_device *device = iser_conn->ib_conn.device;
+	struct iscsi_iser_task *iser_task = task->dd_data;
+	u64 dma_addr;
+
+	dma_addr = ib_dma_map_single(device->ib_device, (void *)tx_desc,
+				ISER_HEADERS_LEN, DMA_TO_DEVICE);
+	if (ib_dma_mapping_error(device->ib_device, dma_addr))
+		return -ENOMEM;
+
+	tx_desc->dma_addr = dma_addr;
+	tx_desc->tx_sg[0].addr   = tx_desc->dma_addr;
+	tx_desc->tx_sg[0].length = ISER_HEADERS_LEN;
+	tx_desc->tx_sg[0].lkey   = device->mr->lkey;
+
+	iser_task->iser_conn = iser_conn;
+	return 0;
+}
+
+/**
+ * iscsi_iser_task_init() - Initialize iscsi-iser task
+ * @task: iscsi task
+ *
+ * Initialize the task for the scsi command or mgmt command.
+ *
+ * Return: Returns zero on success or -ENOMEM when failing
+ *         to init task headers (dma mapping error).
+ */
+static int
+iscsi_iser_task_init(struct iscsi_task *task)
+{
+	struct iscsi_iser_task *iser_task = task->dd_data;
+
+	if (iser_initialize_task_headers(task, &iser_task->desc))
+			return -ENOMEM;
+
+	/* mgmt task */
+	if (!task->sc)
+		return 0;
+
+	iser_task->command_sent = 0;
+	iser_task_rdma_init(iser_task);
+	iser_task->sc = task->sc;
+
+	return 0;
+}
+
+/**
+ * iscsi_iser_mtask_xmit() - xmit management (immediate) task
+ * @conn: iscsi connection
+ * @task: task management task
+ *
+ * Notes:
+ *	The function can return -EAGAIN in which case caller must
+ *	call it again later, or recover. '0' return code means successful
+ *	xmit.
+ *
+ **/
+static int
+iscsi_iser_mtask_xmit(struct iscsi_conn *conn, struct iscsi_task *task)
+{
+	int error = 0;
+
+	iser_dbg("mtask xmit [cid %d itt 0x%x]\n", conn->id, task->itt);
+
+	error = iser_send_control(conn, task);
+
+	/* since iser xmits control with zero copy, tasks can not be recycled
+	 * right after sending them.
+	 * The recycling scheme is based on whether a response is expected
+	 * - if yes, the task is recycled at iscsi_complete_pdu
+	 * - if no,  the task is recycled at iser_snd_completion
+	 */
+	return error;
+}
+
+static int
+iscsi_iser_task_xmit_unsol_data(struct iscsi_conn *conn,
+				 struct iscsi_task *task)
+{
+	struct iscsi_r2t_info *r2t = &task->unsol_r2t;
+	struct iscsi_data hdr;
+	int error = 0;
+
+	/* Send data-out PDUs while there's still unsolicited data to send */
+	while (iscsi_task_has_unsol_data(task)) {
+		iscsi_prep_data_out_pdu(task, r2t, &hdr);
+		iser_dbg("Sending data-out: itt 0x%x, data count %d\n",
+			   hdr.itt, r2t->data_count);
+
+		/* the buffer description has been passed with the command */
+		/* Send the command */
+		error = iser_send_data_out(conn, task, &hdr);
+		if (error) {
+			r2t->datasn--;
+			goto iscsi_iser_task_xmit_unsol_data_exit;
+		}
+		r2t->sent += r2t->data_count;
+		iser_dbg("Need to send %d more as data-out PDUs\n",
+			   r2t->data_length - r2t->sent);
+	}
+
+iscsi_iser_task_xmit_unsol_data_exit:
+	return error;
+}
+
+/**
+ * iscsi_iser_task_xmit() - xmit iscsi-iser task
+ * @task: iscsi task
+ *
+ * Return: zero on success or escalates $error on failure.
+ */
+static int
+iscsi_iser_task_xmit(struct iscsi_task *task)
+{
+	struct iscsi_conn *conn = task->conn;
+	struct iscsi_iser_task *iser_task = task->dd_data;
+	int error = 0;
+
+	if (!task->sc)
+		return iscsi_iser_mtask_xmit(conn, task);
+
+	if (task->sc->sc_data_direction == DMA_TO_DEVICE) {
+		BUG_ON(scsi_bufflen(task->sc) == 0);
+
+		iser_dbg("cmd [itt %x total %d imm %d unsol_data %d\n",
+			   task->itt, scsi_bufflen(task->sc),
+			   task->imm_count, task->unsol_r2t.data_length);
+	}
+
+	iser_dbg("ctask xmit [cid %d itt 0x%x]\n",
+		   conn->id, task->itt);
+
+	/* Send the cmd PDU */
+	if (!iser_task->command_sent) {
+		error = iser_send_command(conn, task);
+		if (error)
+			goto iscsi_iser_task_xmit_exit;
+		iser_task->command_sent = 1;
+	}
+
+	/* Send unsolicited data-out PDU(s) if necessary */
+	if (iscsi_task_has_unsol_data(task))
+		error = iscsi_iser_task_xmit_unsol_data(conn, task);
+
+ iscsi_iser_task_xmit_exit:
+	return error;
+}
+
+/**
+ * iscsi_iser_cleanup_task() - cleanup an iscsi-iser task
+ * @task: iscsi task
+ *
+ * Notes: In case the RDMA device is already NULL (might have
+ *        been removed in DEVICE_REMOVAL CM event it will bail-out
+ *        without doing dma unmapping.
+ */
+static void iscsi_iser_cleanup_task(struct iscsi_task *task)
+{
+	struct iscsi_iser_task *iser_task = task->dd_data;
+	struct iser_tx_desc    *tx_desc   = &iser_task->desc;
+	struct iser_conn       *iser_conn	  = task->conn->dd_data;
+	struct iser_device *device = iser_conn->ib_conn.device;
+
+	/* DEVICE_REMOVAL event might have already released the device */
+	if (!device)
+		return;
+
+	ib_dma_unmap_single(device->ib_device,
+		tx_desc->dma_addr, ISER_HEADERS_LEN, DMA_TO_DEVICE);
+
+	/* mgmt tasks do not need special cleanup */
+	if (!task->sc)
+		return;
+
+	if (iser_task->status == ISER_TASK_STATUS_STARTED) {
+		iser_task->status = ISER_TASK_STATUS_COMPLETED;
+		iser_task_rdma_finalize(iser_task);
+	}
+}
+
+/**
+ * iscsi_iser_check_protection() - check protection information status of task.
+ * @task:     iscsi task
+ * @sector:   error sector if exsists (output)
+ *
+ * Return: zero if no data-integrity errors have occured
+ *         0x1: data-integrity error occured in the guard-block
+ *         0x2: data-integrity error occured in the reference tag
+ *         0x3: data-integrity error occured in the application tag
+ *
+ *         In addition the error sector is marked.
+ */
+static u8
+iscsi_iser_check_protection(struct iscsi_task *task, sector_t *sector)
+{
+	struct iscsi_iser_task *iser_task = task->dd_data;
+
+	if (iser_task->dir[ISER_DIR_IN])
+		return iser_check_task_pi_status(iser_task, ISER_DIR_IN,
+						 sector);
+	else
+		return iser_check_task_pi_status(iser_task, ISER_DIR_OUT,
+						 sector);
+}
+
+/**
+ * iscsi_iser_conn_create() - create a new iscsi-iser connection
+ * @cls_session: iscsi class connection
+ * @conn_idx:    connection index within the session (for MCS)
+ *
+ * Return: iscsi_cls_conn when iscsi_conn_setup succeeds or NULL
+ *         otherwise.
+ */
+static struct iscsi_cls_conn *
+iscsi_iser_conn_create(struct iscsi_cls_session *cls_session,
+		       uint32_t conn_idx)
+{
+	struct iscsi_conn *conn;
+	struct iscsi_cls_conn *cls_conn;
+
+	cls_conn = iscsi_conn_setup(cls_session, 0, conn_idx);
+	if (!cls_conn)
+		return NULL;
+	conn = cls_conn->dd_data;
+
+	/*
+	 * due to issues with the login code re iser sematics
+	 * this not set in iscsi_conn_setup - FIXME
+	 */
+	conn->max_recv_dlength = ISER_RECV_DATA_SEG_LEN;
+
+	return cls_conn;
+}
+
+/**
+ * iscsi_iser_conn_bind() - bind iscsi and iser connection structures
+ * @cls_session:     iscsi class session
+ * @cls_conn:        iscsi class connection
+ * @transport_eph:   transport end-point handle
+ * @is_leading:      indicate if this is the session leading connection (MCS)
+ *
+ * Return: zero on success, $error if iscsi_conn_bind fails and
+ *         -EINVAL in case end-point doesn't exsits anymore or iser connection
+ *         state is not UP (teardown already started).
+ */
+static int
+iscsi_iser_conn_bind(struct iscsi_cls_session *cls_session,
+		     struct iscsi_cls_conn *cls_conn,
+		     uint64_t transport_eph,
+		     int is_leading)
+{
+	struct iscsi_conn *conn = cls_conn->dd_data;
+	struct iser_conn *iser_conn;
+	struct iscsi_endpoint *ep;
+	int error;
+
+	error = iscsi_conn_bind(cls_session, cls_conn, is_leading);
+	if (error)
+		return error;
+
+	/* the transport ep handle comes from user space so it must be
+	 * verified against the global ib connections list */
+	ep = iscsi_lookup_endpoint(transport_eph);
+	if (!ep) {
+		iser_err("can't bind eph %llx\n",
+			 (unsigned long long)transport_eph);
+		return -EINVAL;
+	}
+	iser_conn = ep->dd_data;
+
+	mutex_lock(&iser_conn->state_mutex);
+	if (iser_conn->state != ISER_CONN_UP) {
+		error = -EINVAL;
+		iser_err("iser_conn %p state is %d, teardown started\n",
+			 iser_conn, iser_conn->state);
+		goto out;
+	}
+
+	error = iser_alloc_rx_descriptors(iser_conn, conn->session);
+	if (error)
+		goto out;
+
+	/* binds the iSER connection retrieved from the previously
+	 * connected ep_handle to the iSCSI layer connection. exchanges
+	 * connection pointers */
+	iser_info("binding iscsi conn %p to iser_conn %p\n", conn, iser_conn);
+
+	conn->dd_data = iser_conn;
+	iser_conn->iscsi_conn = conn;
+
+out:
+	mutex_unlock(&iser_conn->state_mutex);
+	return error;
+}
+
+/**
+ * iscsi_iser_conn_start() - start iscsi-iser connection
+ * @cls_conn: iscsi class connection
+ *
+ * Notes: Here iser intialize (or re-initialize) stop_completion as
+ *        from this point iscsi must call conn_stop in session/connection
+ *        teardown so iser transport must wait for it.
+ */
+static int
+iscsi_iser_conn_start(struct iscsi_cls_conn *cls_conn)
+{
+	struct iscsi_conn *iscsi_conn;
+	struct iser_conn *iser_conn;
+
+	iscsi_conn = cls_conn->dd_data;
+	iser_conn = iscsi_conn->dd_data;
+	reinit_completion(&iser_conn->stop_completion);
+
+	return iscsi_conn_start(cls_conn);
+}
+
+/**
+ * iscsi_iser_conn_stop() - stop iscsi-iser connection
+ * @cls_conn:  iscsi class connection
+ * @flag:      indicate if recover or terminate (passed as is)
+ *
+ * Notes: Calling iscsi_conn_stop might theoretically race with
+ *        DEVICE_REMOVAL event and dereference a previously freed RDMA device
+ *        handle, so we call it under iser the state lock to protect against
+ *        this kind of race.
+ */
+static void
+iscsi_iser_conn_stop(struct iscsi_cls_conn *cls_conn, int flag)
+{
+	struct iscsi_conn *conn = cls_conn->dd_data;
+	struct iser_conn *iser_conn = conn->dd_data;
+
+	iser_info("stopping iscsi_conn: %p, iser_conn: %p\n", conn, iser_conn);
+
+	/*
+	 * Userspace may have goofed up and not bound the connection or
+	 * might have only partially setup the connection.
+	 */
+	if (iser_conn) {
+		mutex_lock(&iser_conn->state_mutex);
+		iscsi_conn_stop(cls_conn, flag);
+		iser_conn_terminate(iser_conn);
+
+		/* unbind */
+		iser_conn->iscsi_conn = NULL;
+		conn->dd_data = NULL;
+
+		complete(&iser_conn->stop_completion);
+		mutex_unlock(&iser_conn->state_mutex);
+	} else {
+		iscsi_conn_stop(cls_conn, flag);
+	}
+}
+
+/**
+ * iscsi_iser_session_destroy() - destroy iscsi-iser session
+ * @cls_session: iscsi class session
+ *
+ * Removes and free iscsi host.
+ */
+static void
+iscsi_iser_session_destroy(struct iscsi_cls_session *cls_session)
+{
+	struct Scsi_Host *shost = iscsi_session_to_shost(cls_session);
+
+	iscsi_session_teardown(cls_session);
+	iscsi_host_remove(shost);
+	iscsi_host_free(shost);
+}
+
+static inline unsigned int
+iser_dif_prot_caps(int prot_caps)
+{
+	return ((prot_caps & IB_PROT_T10DIF_TYPE_1) ? SHOST_DIF_TYPE1_PROTECTION |
+						      SHOST_DIX_TYPE1_PROTECTION : 0) |
+	       ((prot_caps & IB_PROT_T10DIF_TYPE_2) ? SHOST_DIF_TYPE2_PROTECTION |
+						      SHOST_DIX_TYPE2_PROTECTION : 0) |
+	       ((prot_caps & IB_PROT_T10DIF_TYPE_3) ? SHOST_DIF_TYPE3_PROTECTION |
+						      SHOST_DIX_TYPE3_PROTECTION : 0);
+}
+
+/**
+ * iscsi_iser_session_create() - create an iscsi-iser session
+ * @ep:             iscsi end-point handle
+ * @cmds_max:       maximum commands in this session
+ * @qdepth:         session command queue depth
+ * @initial_cmdsn:  initiator command sequnce number
+ *
+ * Allocates and adds a scsi host, expose DIF supprot if
+ * exists, and sets up an iscsi session.
+ */
+static struct iscsi_cls_session *
+iscsi_iser_session_create(struct iscsi_endpoint *ep,
+			  uint16_t cmds_max, uint16_t qdepth,
+			  uint32_t initial_cmdsn)
+{
+	struct iscsi_cls_session *cls_session;
+	struct iscsi_session *session;
+	struct Scsi_Host *shost;
+	struct iser_conn *iser_conn = NULL;
+	struct ib_conn *ib_conn;
+
+	shost = iscsi_host_alloc(&iscsi_iser_sht, 0, 0);
+	if (!shost)
+		return NULL;
+	shost->transportt = iscsi_iser_scsi_transport;
+	shost->cmd_per_lun = qdepth;
+	shost->max_lun = iscsi_max_lun;
+	shost->max_id = 0;
+	shost->max_channel = 0;
+	shost->max_cmd_len = 16;
+
+	/*
+	 * older userspace tools (before 2.0-870) did not pass us
+	 * the leading conn's ep so this will be NULL;
+	 */
+	if (ep) {
+		iser_conn = ep->dd_data;
+		ib_conn = &iser_conn->ib_conn;
+		if (ib_conn->pi_support) {
+			u32 sig_caps = ib_conn->device->dev_attr.sig_prot_cap;
+
+			scsi_host_set_prot(shost, iser_dif_prot_caps(sig_caps));
+			if (iser_pi_guard)
+				scsi_host_set_guard(shost, SHOST_DIX_GUARD_IP);
+			else
+				scsi_host_set_guard(shost, SHOST_DIX_GUARD_CRC);
+		}
+	}
+
+	if (iscsi_host_add(shost, ep ?
+			   ib_conn->device->ib_device->dma_device : NULL))
+		goto free_host;
+
+	if (cmds_max > ISER_DEF_XMIT_CMDS_MAX) {
+		iser_info("cmds_max changed from %u to %u\n",
+			  cmds_max, ISER_DEF_XMIT_CMDS_MAX);
+		cmds_max = ISER_DEF_XMIT_CMDS_MAX;
+	}
+
+	cls_session = iscsi_session_setup(&iscsi_iser_transport, shost,
+					  cmds_max, 0,
+					  sizeof(struct iscsi_iser_task),
+					  initial_cmdsn, 0);
+	if (!cls_session)
+		goto remove_host;
+	session = cls_session->dd_data;
+
+	shost->can_queue = session->scsi_cmds_max;
+	return cls_session;
+
+remove_host:
+	iscsi_host_remove(shost);
+free_host:
+	iscsi_host_free(shost);
+	return NULL;
+}
+
+static int
+iscsi_iser_set_param(struct iscsi_cls_conn *cls_conn,
+		     enum iscsi_param param, char *buf, int buflen)
+{
+	int value;
+
+	switch (param) {
+	case ISCSI_PARAM_MAX_RECV_DLENGTH:
+		/* TBD */
+		break;
+	case ISCSI_PARAM_HDRDGST_EN:
+		sscanf(buf, "%d", &value);
+		if (value) {
+			iser_err("DataDigest wasn't negotiated to None\n");
+			return -EPROTO;
+		}
+		break;
+	case ISCSI_PARAM_DATADGST_EN:
+		sscanf(buf, "%d", &value);
+		if (value) {
+			iser_err("DataDigest wasn't negotiated to None\n");
+			return -EPROTO;
+		}
+		break;
+	case ISCSI_PARAM_IFMARKER_EN:
+		sscanf(buf, "%d", &value);
+		if (value) {
+			iser_err("IFMarker wasn't negotiated to No\n");
+			return -EPROTO;
+		}
+		break;
+	case ISCSI_PARAM_OFMARKER_EN:
+		sscanf(buf, "%d", &value);
+		if (value) {
+			iser_err("OFMarker wasn't negotiated to No\n");
+			return -EPROTO;
+		}
+		break;
+	default:
+		return iscsi_set_param(cls_conn, param, buf, buflen);
+	}
+
+	return 0;
+}
+
+/**
+ * iscsi_iser_set_param() - set class connection parameter
+ * @cls_conn:    iscsi class connection
+ * @stats:       iscsi stats to output
+ *
+ * Output connection statistics.
+ */
+static void
+iscsi_iser_conn_get_stats(struct iscsi_cls_conn *cls_conn, struct iscsi_stats *stats)
+{
+	struct iscsi_conn *conn = cls_conn->dd_data;
+
+	stats->txdata_octets = conn->txdata_octets;
+	stats->rxdata_octets = conn->rxdata_octets;
+	stats->scsicmd_pdus = conn->scsicmd_pdus_cnt;
+	stats->dataout_pdus = conn->dataout_pdus_cnt;
+	stats->scsirsp_pdus = conn->scsirsp_pdus_cnt;
+	stats->datain_pdus = conn->datain_pdus_cnt; /* always 0 */
+	stats->r2t_pdus = conn->r2t_pdus_cnt; /* always 0 */
+	stats->tmfcmd_pdus = conn->tmfcmd_pdus_cnt;
+	stats->tmfrsp_pdus = conn->tmfrsp_pdus_cnt;
+	stats->custom_length = 4;
+	strcpy(stats->custom[0].desc, "qp_tx_queue_full");
+	stats->custom[0].value = 0; /* TB iser_conn->qp_tx_queue_full; */
+	strcpy(stats->custom[1].desc, "fmr_map_not_avail");
+	stats->custom[1].value = 0; /* TB iser_conn->fmr_map_not_avail */;
+	strcpy(stats->custom[2].desc, "eh_abort_cnt");
+	stats->custom[2].value = conn->eh_abort_cnt;
+	strcpy(stats->custom[3].desc, "fmr_unalign_cnt");
+	stats->custom[3].value = conn->fmr_unalign_cnt;
+}
+
+static int iscsi_iser_get_ep_param(struct iscsi_endpoint *ep,
+				   enum iscsi_param param, char *buf)
+{
+	struct iser_conn *iser_conn = ep->dd_data;
+	int len;
+
+	switch (param) {
+	case ISCSI_PARAM_CONN_PORT:
+	case ISCSI_PARAM_CONN_ADDRESS:
+		if (!iser_conn || !iser_conn->ib_conn.cma_id)
+			return -ENOTCONN;
+
+		return iscsi_conn_get_addr_param((struct sockaddr_storage *)
+				&iser_conn->ib_conn.cma_id->route.addr.dst_addr,
+				param, buf);
+		break;
+	default:
+		return -ENOSYS;
+	}
+
+	return len;
+}
+
+/**
+ * iscsi_iser_ep_connect() - Initiate iSER connection establishment
+ * @shost:          scsi_host
+ * @dst_addr:       destination address
+ * @non-blocking:   indicate if routine can block
+ *
+ * Allocate an iscsi endpoint, an iser_conn structure and bind them.
+ * After that start RDMA connection establishment via rdma_cm. We
+ * don't allocate iser_conn embedded in iscsi_endpoint since in teardown
+ * the endpoint will be destroyed at ep_disconnect while iser_conn will
+ * cleanup its resources asynchronuously.
+ *
+ * Return: iscsi_endpoint created by iscsi layer or ERR_PTR(error)
+ *         if fails.
+ */
+static struct iscsi_endpoint *
+iscsi_iser_ep_connect(struct Scsi_Host *shost, struct sockaddr *dst_addr,
+		      int non_blocking)
+{
+	int err;
+	struct iser_conn *iser_conn;
+	struct iscsi_endpoint *ep;
+
+	ep = iscsi_create_endpoint(0);
+	if (!ep)
+		return ERR_PTR(-ENOMEM);
+
+	iser_conn = kzalloc(sizeof(*iser_conn), GFP_KERNEL);
+	if (!iser_conn) {
+		err = -ENOMEM;
+		goto failure;
+	}
+
+	ep->dd_data = iser_conn;
+	iser_conn->ep = ep;
+	iser_conn_init(iser_conn);
+
+	err = iser_connect(iser_conn, NULL, dst_addr, non_blocking);
+	if (err)
+		goto failure;
+
+	return ep;
+failure:
+	iscsi_destroy_endpoint(ep);
+	return ERR_PTR(err);
+}
+
+/**
+ * iscsi_iser_ep_poll() - poll for iser connection establishment to complete
+ * @ep:            iscsi endpoint (created at ep_connect)
+ * @timeout_ms:    polling timeout allowed in ms.
+ *
+ * This routine boils down to waiting for up_completion signaling
+ * that cma_id got CONNECTED event.
+ *
+ * Return: 1 if succeeded in connection establishment, 0 if timeout expired
+ *         (libiscsi will retry will kick in) or -1 if interrupted by signal
+ *         or more likely iser connection state transitioned to TEMINATING or
+ *         DOWN during the wait period.
+ */
+static int
+iscsi_iser_ep_poll(struct iscsi_endpoint *ep, int timeout_ms)
+{
+	struct iser_conn *iser_conn;
+	int rc;
+
+	iser_conn = ep->dd_data;
+	rc = wait_for_completion_interruptible_timeout(&iser_conn->up_completion,
+						       msecs_to_jiffies(timeout_ms));
+	/* if conn establishment failed, return error code to iscsi */
+	if (rc == 0) {
+		mutex_lock(&iser_conn->state_mutex);
+		if (iser_conn->state == ISER_CONN_TERMINATING ||
+		    iser_conn->state == ISER_CONN_DOWN)
+			rc = -1;
+		mutex_unlock(&iser_conn->state_mutex);
+	}
+
+	iser_info("ib conn %p rc = %d\n", iser_conn, rc);
+
+	if (rc > 0)
+		return 1; /* success, this is the equivalent of POLLOUT */
+	else if (!rc)
+		return 0; /* timeout */
+	else
+		return rc; /* signal */
+}
+
+/**
+ * iscsi_iser_ep_disconnect() - Initiate connection teardown process
+ * @ep:    iscsi endpoint handle
+ *
+ * This routine is not blocked by iser and RDMA termination process
+ * completion as we queue a deffered work for iser/RDMA destruction
+ * and cleanup or actually call it immediately in case we didn't pass
+ * iscsi conn bind/start stage, thus it is safe.
+ */
+static void
+iscsi_iser_ep_disconnect(struct iscsi_endpoint *ep)
+{
+	struct iser_conn *iser_conn;
+
+	iser_conn = ep->dd_data;
+	iser_info("ep %p iser conn %p state %d\n",
+		  ep, iser_conn, iser_conn->state);
+
+	mutex_lock(&iser_conn->state_mutex);
+	iser_conn_terminate(iser_conn);
+
+	/*
+	 * if iser_conn and iscsi_conn are bound, we must wait for
+	 * iscsi_conn_stop and flush errors completion before freeing
+	 * the iser resources. Otherwise we are safe to free resources
+	 * immediately.
+	 */
+	if (iser_conn->iscsi_conn) {
+		INIT_WORK(&iser_conn->release_work, iser_release_work);
+		queue_work(release_wq, &iser_conn->release_work);
+		mutex_unlock(&iser_conn->state_mutex);
+	} else {
+		iser_conn->state = ISER_CONN_DOWN;
+		mutex_unlock(&iser_conn->state_mutex);
+		iser_conn_release(iser_conn);
+	}
+	iscsi_destroy_endpoint(ep);
+}
+
+static umode_t iser_attr_is_visible(int param_type, int param)
+{
+	switch (param_type) {
+	case ISCSI_HOST_PARAM:
+		switch (param) {
+		case ISCSI_HOST_PARAM_NETDEV_NAME:
+		case ISCSI_HOST_PARAM_HWADDRESS:
+		case ISCSI_HOST_PARAM_INITIATOR_NAME:
+			return S_IRUGO;
+		default:
+			return 0;
+		}
+	case ISCSI_PARAM:
+		switch (param) {
+		case ISCSI_PARAM_MAX_RECV_DLENGTH:
+		case ISCSI_PARAM_MAX_XMIT_DLENGTH:
+		case ISCSI_PARAM_HDRDGST_EN:
+		case ISCSI_PARAM_DATADGST_EN:
+		case ISCSI_PARAM_CONN_ADDRESS:
+		case ISCSI_PARAM_CONN_PORT:
+		case ISCSI_PARAM_EXP_STATSN:
+		case ISCSI_PARAM_PERSISTENT_ADDRESS:
+		case ISCSI_PARAM_PERSISTENT_PORT:
+		case ISCSI_PARAM_PING_TMO:
+		case ISCSI_PARAM_RECV_TMO:
+		case ISCSI_PARAM_INITIAL_R2T_EN:
+		case ISCSI_PARAM_MAX_R2T:
+		case ISCSI_PARAM_IMM_DATA_EN:
+		case ISCSI_PARAM_FIRST_BURST:
+		case ISCSI_PARAM_MAX_BURST:
+		case ISCSI_PARAM_PDU_INORDER_EN:
+		case ISCSI_PARAM_DATASEQ_INORDER_EN:
+		case ISCSI_PARAM_TARGET_NAME:
+		case ISCSI_PARAM_TPGT:
+		case ISCSI_PARAM_USERNAME:
+		case ISCSI_PARAM_PASSWORD:
+		case ISCSI_PARAM_USERNAME_IN:
+		case ISCSI_PARAM_PASSWORD_IN:
+		case ISCSI_PARAM_FAST_ABORT:
+		case ISCSI_PARAM_ABORT_TMO:
+		case ISCSI_PARAM_LU_RESET_TMO:
+		case ISCSI_PARAM_TGT_RESET_TMO:
+		case ISCSI_PARAM_IFACE_NAME:
+		case ISCSI_PARAM_INITIATOR_NAME:
+		case ISCSI_PARAM_DISCOVERY_SESS:
+			return S_IRUGO;
+		default:
+			return 0;
+		}
+	}
+
+	return 0;
+}
+
+static struct scsi_host_template iscsi_iser_sht = {
+	.module                 = THIS_MODULE,
+	.name                   = "iSCSI Initiator over iSER",
+	.queuecommand           = iscsi_queuecommand,
+	.change_queue_depth	= iscsi_change_queue_depth,
+	.sg_tablesize           = ISCSI_ISER_SG_TABLESIZE,
+	.max_sectors		= 1024,
+	.cmd_per_lun            = ISER_DEF_CMD_PER_LUN,
+	.eh_abort_handler       = iscsi_eh_abort,
+	.eh_device_reset_handler= iscsi_eh_device_reset,
+	.eh_target_reset_handler = iscsi_eh_recover_target,
+	.target_alloc		= iscsi_target_alloc,
+	.use_clustering         = DISABLE_CLUSTERING,
+	.proc_name              = "iscsi_iser",
+	.this_id                = -1,
+};
+
+static struct iscsi_transport iscsi_iser_transport = {
+	.owner                  = THIS_MODULE,
+	.name                   = "iser",
+	.caps                   = CAP_RECOVERY_L0 | CAP_MULTI_R2T | CAP_TEXT_NEGO,
+	/* session management */
+	.create_session         = iscsi_iser_session_create,
+	.destroy_session        = iscsi_iser_session_destroy,
+	/* connection management */
+	.create_conn            = iscsi_iser_conn_create,
+	.bind_conn              = iscsi_iser_conn_bind,
+	.destroy_conn           = iscsi_conn_teardown,
+	.attr_is_visible	= iser_attr_is_visible,
+	.set_param              = iscsi_iser_set_param,
+	.get_conn_param		= iscsi_conn_get_param,
+	.get_ep_param		= iscsi_iser_get_ep_param,
+	.get_session_param	= iscsi_session_get_param,
+	.start_conn             = iscsi_iser_conn_start,
+	.stop_conn              = iscsi_iser_conn_stop,
+	/* iscsi host params */
+	.get_host_param		= iscsi_host_get_param,
+	.set_host_param		= iscsi_host_set_param,
+	/* IO */
+	.send_pdu		= iscsi_conn_send_pdu,
+	.get_stats		= iscsi_iser_conn_get_stats,
+	.init_task		= iscsi_iser_task_init,
+	.xmit_task		= iscsi_iser_task_xmit,
+	.cleanup_task		= iscsi_iser_cleanup_task,
+	.alloc_pdu		= iscsi_iser_pdu_alloc,
+	.check_protection	= iscsi_iser_check_protection,
+	/* recovery */
+	.session_recovery_timedout = iscsi_session_recovery_timedout,
+
+	.ep_connect             = iscsi_iser_ep_connect,
+	.ep_poll                = iscsi_iser_ep_poll,
+	.ep_disconnect          = iscsi_iser_ep_disconnect
+};
+
+static int __init iser_init(void)
+{
+	int err;
+
+	iser_dbg("Starting iSER datamover...\n");
+
+	if (iscsi_max_lun < 1) {
+		iser_err("Invalid max_lun value of %u\n", iscsi_max_lun);
+		return -EINVAL;
+	}
+
+	memset(&ig, 0, sizeof(struct iser_global));
+
+	ig.desc_cache = kmem_cache_create("iser_descriptors",
+					  sizeof(struct iser_tx_desc),
+					  0, SLAB_HWCACHE_ALIGN,
+					  NULL);
+	if (ig.desc_cache == NULL)
+		return -ENOMEM;
+
+	/* device init is called only after the first addr resolution */
+	mutex_init(&ig.device_list_mutex);
+	INIT_LIST_HEAD(&ig.device_list);
+	mutex_init(&ig.connlist_mutex);
+	INIT_LIST_HEAD(&ig.connlist);
+
+	release_wq = alloc_workqueue("release workqueue", 0, 0);
+	if (!release_wq) {
+		iser_err("failed to allocate release workqueue\n");
+		return -ENOMEM;
+	}
+
+	iscsi_iser_scsi_transport = iscsi_register_transport(
+							&iscsi_iser_transport);
+	if (!iscsi_iser_scsi_transport) {
+		iser_err("iscsi_register_transport failed\n");
+		err = -EINVAL;
+		goto register_transport_failure;
+	}
+
+	return 0;
+
+register_transport_failure:
+	kmem_cache_destroy(ig.desc_cache);
+
+	return err;
+}
+
+static void __exit iser_exit(void)
+{
+	struct iser_conn *iser_conn, *n;
+	int connlist_empty;
+
+	iser_dbg("Removing iSER datamover...\n");
+	destroy_workqueue(release_wq);
+
+	mutex_lock(&ig.connlist_mutex);
+	connlist_empty = list_empty(&ig.connlist);
+	mutex_unlock(&ig.connlist_mutex);
+
+	if (!connlist_empty) {
+		iser_err("Error cleanup stage completed but we still have iser "
+			 "connections, destroying them anyway.\n");
+		list_for_each_entry_safe(iser_conn, n, &ig.connlist,
+					 conn_list) {
+			iser_conn_release(iser_conn);
+		}
+	}
+
+	iscsi_unregister_transport(&iscsi_iser_transport);
+	kmem_cache_destroy(ig.desc_cache);
+}
+
+module_init(iser_init);
+module_exit(iser_exit);
diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h
new file mode 100644
index 0000000..cd4174c
--- /dev/null
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.h
@@ -0,0 +1,666 @@
+/*
+ * iSER transport for the Open iSCSI Initiator & iSER transport internals
+ *
+ * Copyright (C) 2004 Dmitry Yusupov
+ * Copyright (C) 2004 Alex Aizman
+ * Copyright (C) 2005 Mike Christie
+ * based on code maintained by open-iscsi@googlegroups.com
+ *
+ * Copyright (c) 2004, 2005, 2006 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2005, 2006 Cisco Systems.  All rights reserved.
+ * Copyright (c) 2013-2014 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *	- Redistributions of source code must retain the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer.
+ *
+ *	- Redistributions in binary form must reproduce the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer in the documentation and/or other materials
+ *	  provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ISCSI_ISER_H__
+#define __ISCSI_ISER_H__
+
+#include <linux/types.h>
+#include <linux/net.h>
+#include <linux/printk.h>
+#include <scsi/libiscsi.h>
+#include <scsi/scsi_transport_iscsi.h>
+#include <scsi/scsi_cmnd.h>
+#include <scsi/scsi_device.h>
+
+#include <linux/interrupt.h>
+#include <linux/wait.h>
+#include <linux/sched.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/dma-mapping.h>
+#include <linux/mutex.h>
+#include <linux/mempool.h>
+#include <linux/uio.h>
+
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_fmr_pool.h>
+#include <rdma/rdma_cm.h>
+
+#define DRV_NAME	"iser"
+#define PFX		DRV_NAME ": "
+#define DRV_VER		"1.4.8"
+
+#define iser_dbg(fmt, arg...)				 \
+	do {						 \
+		if (iser_debug_level > 2)		 \
+			printk(KERN_DEBUG PFX "%s: " fmt,\
+				__func__ , ## arg);	 \
+	} while (0)
+
+#define iser_warn(fmt, arg...)				\
+	do {						\
+		if (iser_debug_level > 0)		\
+			pr_warn(PFX "%s: " fmt,		\
+				__func__ , ## arg);	\
+	} while (0)
+
+#define iser_info(fmt, arg...)				\
+	do {						\
+		if (iser_debug_level > 1)		\
+			pr_info(PFX "%s: " fmt,		\
+				__func__ , ## arg);	\
+	} while (0)
+
+#define iser_err(fmt, arg...)				\
+	do {						\
+		printk(KERN_ERR PFX "%s: " fmt,		\
+		       __func__ , ## arg);		\
+	} while (0)
+
+#define SHIFT_4K	12
+#define SIZE_4K	(1ULL << SHIFT_4K)
+#define MASK_4K	(~(SIZE_4K-1))
+					/* support up to 512KB in one RDMA */
+#define ISCSI_ISER_SG_TABLESIZE         (0x80000 >> SHIFT_4K)
+#define ISER_DEF_XMIT_CMDS_DEFAULT		512
+#if ISCSI_DEF_XMIT_CMDS_MAX > ISER_DEF_XMIT_CMDS_DEFAULT
+	#define ISER_DEF_XMIT_CMDS_MAX		ISCSI_DEF_XMIT_CMDS_MAX
+#else
+	#define ISER_DEF_XMIT_CMDS_MAX		ISER_DEF_XMIT_CMDS_DEFAULT
+#endif
+#define ISER_DEF_CMD_PER_LUN		ISER_DEF_XMIT_CMDS_MAX
+
+/* QP settings */
+/* Maximal bounds on received asynchronous PDUs */
+#define ISER_MAX_RX_MISC_PDUS		4 /* NOOP_IN(2) , ASYNC_EVENT(2)   */
+
+#define ISER_MAX_TX_MISC_PDUS		6 /* NOOP_OUT(2), TEXT(1),         *
+					   * SCSI_TMFUNC(2), LOGOUT(1) */
+
+#define ISER_QP_MAX_RECV_DTOS		(ISER_DEF_XMIT_CMDS_MAX)
+
+#define ISER_MIN_POSTED_RX		(ISER_DEF_XMIT_CMDS_MAX >> 2)
+
+/* the max TX (send) WR supported by the iSER QP is defined by                 *
+ * max_send_wr = T * (1 + D) + C ; D is how many inflight dataouts we expect   *
+ * to have at max for SCSI command. The tx posting & completion handling code  *
+ * supports -EAGAIN scheme where tx is suspended till the QP has room for more *
+ * send WR. D=8 comes from 64K/8K                                              */
+
+#define ISER_INFLIGHT_DATAOUTS		8
+
+#define ISER_QP_MAX_REQ_DTOS		(ISER_DEF_XMIT_CMDS_MAX *    \
+					(1 + ISER_INFLIGHT_DATAOUTS) + \
+					ISER_MAX_TX_MISC_PDUS        + \
+					ISER_MAX_RX_MISC_PDUS)
+
+/* Max registration work requests per command */
+#define ISER_MAX_REG_WR_PER_CMD		5
+
+/* For Signature we don't support DATAOUTs so no need to make room for them */
+#define ISER_QP_SIG_MAX_REQ_DTOS	(ISER_DEF_XMIT_CMDS_MAX	*       \
+					(1 + ISER_MAX_REG_WR_PER_CMD) + \
+					ISER_MAX_TX_MISC_PDUS         + \
+					ISER_MAX_RX_MISC_PDUS)
+
+#define ISER_WC_BATCH_COUNT   16
+#define ISER_SIGNAL_CMD_COUNT 32
+
+#define ISER_VER			0x10
+#define ISER_WSV			0x08
+#define ISER_RSV			0x04
+
+#define ISER_FASTREG_LI_WRID		0xffffffffffffffffULL
+#define ISER_BEACON_WRID		0xfffffffffffffffeULL
+
+/**
+ * struct iser_hdr - iSER header
+ *
+ * @flags:        flags support (zbva, remote_inv)
+ * @rsvd:         reserved
+ * @write_stag:   write rkey
+ * @write_va:     write virtual address
+ * @reaf_stag:    read rkey
+ * @read_va:      read virtual address
+ */
+struct iser_hdr {
+	u8      flags;
+	u8      rsvd[3];
+	__be32  write_stag;
+	__be64  write_va;
+	__be32  read_stag;
+	__be64  read_va;
+} __attribute__((packed));
+
+
+#define ISER_ZBVA_NOT_SUPPORTED		0x80
+#define ISER_SEND_W_INV_NOT_SUPPORTED	0x40
+
+struct iser_cm_hdr {
+	u8      flags;
+	u8      rsvd[3];
+} __packed;
+
+/* Constant PDU lengths calculations */
+#define ISER_HEADERS_LEN  (sizeof(struct iser_hdr) + sizeof(struct iscsi_hdr))
+
+#define ISER_RECV_DATA_SEG_LEN	128
+#define ISER_RX_PAYLOAD_SIZE	(ISER_HEADERS_LEN + ISER_RECV_DATA_SEG_LEN)
+#define ISER_RX_LOGIN_SIZE	(ISER_HEADERS_LEN + ISCSI_DEF_MAX_RECV_SEG_LEN)
+
+/* Length of an object name string */
+#define ISER_OBJECT_NAME_SIZE		    64
+
+enum iser_conn_state {
+	ISER_CONN_INIT,		   /* descriptor allocd, no conn          */
+	ISER_CONN_PENDING,	   /* in the process of being established */
+	ISER_CONN_UP,		   /* up and running                      */
+	ISER_CONN_TERMINATING,	   /* in the process of being terminated  */
+	ISER_CONN_DOWN,		   /* shut down                           */
+	ISER_CONN_STATES_NUM
+};
+
+enum iser_task_status {
+	ISER_TASK_STATUS_INIT = 0,
+	ISER_TASK_STATUS_STARTED,
+	ISER_TASK_STATUS_COMPLETED
+};
+
+enum iser_data_dir {
+	ISER_DIR_IN = 0,	   /* to initiator */
+	ISER_DIR_OUT,		   /* from initiator */
+	ISER_DIRS_NUM
+};
+
+/**
+ * struct iser_data_buf - iSER data buffer
+ *
+ * @buf:          pointer to the sg list
+ * @size:         num entries of this sg
+ * @data_len:     total beffer byte len
+ * @dma_nents:    returned by dma_map_sg
+ * @copy_buf:     allocated copy buf for SGs unaligned
+ *                for rdma which are copied
+ * @sg_single:    SG-ified clone of a non SG SC or
+ *                unaligned SG
+ */
+struct iser_data_buf {
+	void               *buf;
+	unsigned int       size;
+	unsigned long      data_len;
+	unsigned int       dma_nents;
+	char               *copy_buf;
+	struct scatterlist sg_single;
+  };
+
+/* fwd declarations */
+struct iser_device;
+struct iscsi_iser_task;
+struct iscsi_endpoint;
+
+/**
+ * struct iser_mem_reg - iSER memory registration info
+ *
+ * @lkey:         MR local key
+ * @rkey:         MR remote key
+ * @va:           MR start address (buffer va)
+ * @len:          MR length
+ * @mem_h:        pointer to registration context (FMR/Fastreg)
+ * @is_mr:        indicates weather we registered the buffer
+ */
+struct iser_mem_reg {
+	u32  lkey;
+	u32  rkey;
+	u64  va;
+	u64  len;
+	void *mem_h;
+	int  is_mr;
+};
+
+/**
+ * struct iser_regd_buf - iSER buffer registration desc
+ *
+ * @reg:          memory registration info
+ * @virt_addr:    virtual address of buffer
+ * @device:       reference to iser device
+ * @direction:    dma direction (for dma_unmap)
+ * @data_size:    data buffer size in bytes
+ */
+struct iser_regd_buf {
+	struct iser_mem_reg     reg;
+	void                    *virt_addr;
+	struct iser_device      *device;
+	enum dma_data_direction direction;
+	unsigned int            data_size;
+};
+
+enum iser_desc_type {
+	ISCSI_TX_CONTROL ,
+	ISCSI_TX_SCSI_COMMAND,
+	ISCSI_TX_DATAOUT
+};
+
+/**
+ * struct iser_tx_desc - iSER TX descriptor (for send wr_id)
+ *
+ * @iser_header:   iser header
+ * @iscsi_header:  iscsi header
+ * @type:          command/control/dataout
+ * @dam_addr:      header buffer dma_address
+ * @tx_sg:         sg[0] points to iser/iscsi headers
+ *                 sg[1] optionally points to either of immediate data
+ *                 unsolicited data-out or control
+ * @num_sge:       number sges used on this TX task
+ */
+struct iser_tx_desc {
+	struct iser_hdr              iser_header;
+	struct iscsi_hdr             iscsi_header;
+	enum   iser_desc_type        type;
+	u64		             dma_addr;
+	struct ib_sge		     tx_sg[2];
+	int                          num_sge;
+};
+
+#define ISER_RX_PAD_SIZE	(256 - (ISER_RX_PAYLOAD_SIZE + \
+					sizeof(u64) + sizeof(struct ib_sge)))
+/**
+ * struct iser_rx_desc - iSER RX descriptor (for recv wr_id)
+ *
+ * @iser_header:   iser header
+ * @iscsi_header:  iscsi header
+ * @data:          received data segment
+ * @dma_addr:      receive buffer dma address
+ * @rx_sg:         ib_sge of receive buffer
+ * @pad:           for sense data TODO: Modify to maximum sense length supported
+ */
+struct iser_rx_desc {
+	struct iser_hdr              iser_header;
+	struct iscsi_hdr             iscsi_header;
+	char		             data[ISER_RECV_DATA_SEG_LEN];
+	u64		             dma_addr;
+	struct ib_sge		     rx_sg;
+	char		             pad[ISER_RX_PAD_SIZE];
+} __attribute__((packed));
+
+#define ISER_MAX_CQ 4
+
+struct iser_conn;
+struct ib_conn;
+struct iscsi_iser_task;
+
+/**
+ * struct iser_comp - iSER completion context
+ *
+ * @device:     pointer to device handle
+ * @cq:         completion queue
+ * @wcs:        work completion array
+ * @tasklet:    Tasklet handle
+ * @active_qps: Number of active QPs attached
+ *              to completion context
+ */
+struct iser_comp {
+	struct iser_device      *device;
+	struct ib_cq		*cq;
+	struct ib_wc		 wcs[ISER_WC_BATCH_COUNT];
+	struct tasklet_struct	 tasklet;
+	int                      active_qps;
+};
+
+/**
+ * struct iser_device - iSER device handle
+ *
+ * @ib_device:     RDMA device
+ * @pd:            Protection Domain for this device
+ * @dev_attr:      Device attributes container
+ * @mr:            Global DMA memory region
+ * @event_handler: IB events handle routine
+ * @ig_list:	   entry in devices list
+ * @refcount:      Reference counter, dominated by open iser connections
+ * @comps_used:    Number of completion contexts used, Min between online
+ *                 cpus and device max completion vectors
+ * @comps:         Dinamically allocated array of completion handlers
+ * Memory registration pool Function pointers (FMR or Fastreg):
+ *     @iser_alloc_rdma_reg_res: Allocation of memory regions pool
+ *     @iser_free_rdma_reg_res:  Free of memory regions pool
+ *     @iser_reg_rdma_mem:       Memory registration routine
+ *     @iser_unreg_rdma_mem:     Memory deregistration routine
+ */
+struct iser_device {
+	struct ib_device             *ib_device;
+	struct ib_pd	             *pd;
+	struct ib_device_attr	     dev_attr;
+	struct ib_mr	             *mr;
+	struct ib_event_handler      event_handler;
+	struct list_head             ig_list;
+	int                          refcount;
+	int			     comps_used;
+	struct iser_comp	     comps[ISER_MAX_CQ];
+	int                          (*iser_alloc_rdma_reg_res)(struct ib_conn *ib_conn,
+								unsigned cmds_max);
+	void                         (*iser_free_rdma_reg_res)(struct ib_conn *ib_conn);
+	int                          (*iser_reg_rdma_mem)(struct iscsi_iser_task *iser_task,
+							  enum iser_data_dir cmd_dir);
+	void                         (*iser_unreg_rdma_mem)(struct iscsi_iser_task *iser_task,
+							    enum iser_data_dir cmd_dir);
+};
+
+#define ISER_CHECK_GUARD	0xc0
+#define ISER_CHECK_REFTAG	0x0f
+#define ISER_CHECK_APPTAG	0x30
+
+enum iser_reg_indicator {
+	ISER_DATA_KEY_VALID	= 1 << 0,
+	ISER_PROT_KEY_VALID	= 1 << 1,
+	ISER_SIG_KEY_VALID	= 1 << 2,
+	ISER_FASTREG_PROTECTED	= 1 << 3,
+};
+
+/**
+ * struct iser_pi_context - Protection information context
+ *
+ * @prot_mr:        protection memory region
+ * @prot_frpl:      protection fastreg page list
+ * @sig_mr:         signature feature enabled memory region
+ */
+struct iser_pi_context {
+	struct ib_mr                   *prot_mr;
+	struct ib_fast_reg_page_list   *prot_frpl;
+	struct ib_mr                   *sig_mr;
+};
+
+/**
+ * struct fast_reg_descriptor - Fast registration descriptor
+ *
+ * @list:           entry in connection fastreg pool
+ * @data_mr:        data memory region
+ * @data_frpl:      data fastreg page list
+ * @pi_ctx:         protection information context
+ * @reg_indicators: fast registration indicators
+ */
+struct fast_reg_descriptor {
+	struct list_head		  list;
+	struct ib_mr			 *data_mr;
+	struct ib_fast_reg_page_list     *data_frpl;
+	struct iser_pi_context		 *pi_ctx;
+	u8				  reg_indicators;
+};
+
+/**
+ * struct ib_conn - Infiniband related objects
+ *
+ * @cma_id:              rdma_cm connection maneger handle
+ * @qp:                  Connection Queue-pair
+ * @post_recv_buf_count: post receive counter
+ * @rx_wr:               receive work request for batch posts
+ * @device:              reference to iser device
+ * @comp:                iser completion context
+ * @pi_support:          Indicate device T10-PI support
+ * @beacon:              beacon send wr to signal all flush errors were drained
+ * @flush_comp:          completes when all connection completions consumed
+ * @lock:                protects fmr/fastreg pool
+ * @union.fmr:
+ *     @pool:            FMR pool for fast registrations
+ *     @page_vec:        page vector to hold mapped commands pages
+ *                       used for registration
+ * @union.fastreg:
+ *     @pool:            Fast registration descriptors pool for fast
+ *                       registrations
+ *     @pool_size:       Size of pool
+ */
+struct ib_conn {
+	struct rdma_cm_id           *cma_id;
+	struct ib_qp	            *qp;
+	int                          post_recv_buf_count;
+	struct ib_recv_wr	     rx_wr[ISER_MIN_POSTED_RX];
+	struct iser_device          *device;
+	struct iser_comp	    *comp;
+	bool			     pi_support;
+	struct ib_send_wr	     beacon;
+	struct completion	     flush_comp;
+	spinlock_t		     lock;
+	union {
+		struct {
+			struct ib_fmr_pool      *pool;
+			struct iser_page_vec	*page_vec;
+		} fmr;
+		struct {
+			struct list_head	 pool;
+			int			 pool_size;
+		} fastreg;
+	};
+};
+
+/**
+ * struct iser_conn - iSER connection context
+ *
+ * @ib_conn:          connection RDMA resources
+ * @iscsi_conn:       link to matching iscsi connection
+ * @ep:               transport handle
+ * @state:            connection logical state
+ * @qp_max_recv_dtos: maximum number of data outs, corresponds
+ *                    to max number of post recvs
+ * @qp_max_recv_dtos_mask: (qp_max_recv_dtos - 1)
+ * @min_posted_rx:    (qp_max_recv_dtos >> 2)
+ * @name:             connection peer portal
+ * @release_work:     deffered work for release job
+ * @state_mutex:      protects iser onnection state
+ * @stop_completion:  conn_stop completion
+ * @ib_completion:    RDMA cleanup completion
+ * @up_completion:    connection establishment completed
+ *                    (state is ISER_CONN_UP)
+ * @conn_list:        entry in ig conn list
+ * @login_buf:        login data buffer (stores login parameters)
+ * @login_req_buf:    login request buffer
+ * @login_req_dma:    login request buffer dma address
+ * @login_resp_buf:   login response buffer
+ * @login_resp_dma:   login response buffer dma address
+ * @rx_desc_head:     head of rx_descs cyclic buffer
+ * @rx_descs:         rx buffers array (cyclic buffer)
+ * @num_rx_descs:     number of rx descriptors
+ */
+struct iser_conn {
+	struct ib_conn		     ib_conn;
+	struct iscsi_conn	     *iscsi_conn;
+	struct iscsi_endpoint	     *ep;
+	enum iser_conn_state	     state;
+	unsigned		     qp_max_recv_dtos;
+	unsigned		     qp_max_recv_dtos_mask;
+	unsigned		     min_posted_rx;
+	char 			     name[ISER_OBJECT_NAME_SIZE];
+	struct work_struct	     release_work;
+	struct mutex		     state_mutex;
+	struct completion	     stop_completion;
+	struct completion	     ib_completion;
+	struct completion	     up_completion;
+	struct list_head	     conn_list;
+
+	char  			     *login_buf;
+	char			     *login_req_buf, *login_resp_buf;
+	u64			     login_req_dma, login_resp_dma;
+	unsigned int 		     rx_desc_head;
+	struct iser_rx_desc	     *rx_descs;
+	u32                          num_rx_descs;
+};
+
+/**
+ * struct iscsi_iser_task - iser task context
+ *
+ * @desc:     TX descriptor
+ * @iser_conn:        link to iser connection
+ * @status:           current task status
+ * @sc:               link to scsi command
+ * @command_sent:     indicate if command was sent
+ * @dir:              iser data direction
+ * @rdma_regd:        task rdma registration desc
+ * @data:             iser data buffer desc
+ * @data_copy:        iser data copy buffer desc (bounce buffer)
+ * @prot:             iser protection buffer desc
+ * @prot_copy:        iser protection copy buffer desc (bounce buffer)
+ */
+struct iscsi_iser_task {
+	struct iser_tx_desc          desc;
+	struct iser_conn	     *iser_conn;
+	enum iser_task_status 	     status;
+	struct scsi_cmnd	     *sc;
+	int                          command_sent;
+	int                          dir[ISER_DIRS_NUM];
+	struct iser_regd_buf         rdma_regd[ISER_DIRS_NUM];
+	struct iser_data_buf         data[ISER_DIRS_NUM];
+	struct iser_data_buf         data_copy[ISER_DIRS_NUM];
+	struct iser_data_buf         prot[ISER_DIRS_NUM];
+	struct iser_data_buf         prot_copy[ISER_DIRS_NUM];
+};
+
+struct iser_page_vec {
+	u64 *pages;
+	int length;
+	int offset;
+	int data_size;
+};
+
+/**
+ * struct iser_global: iSER global context
+ *
+ * @device_list_mutex:    protects device_list
+ * @device_list:          iser devices global list
+ * @connlist_mutex:       protects connlist
+ * @connlist:             iser connections global list
+ * @desc_cache:           kmem cache for tx dataout
+ */
+struct iser_global {
+	struct mutex      device_list_mutex;
+	struct list_head  device_list;
+	struct mutex      connlist_mutex;
+	struct list_head  connlist;
+	struct kmem_cache *desc_cache;
+};
+
+extern struct iser_global ig;
+extern int iser_debug_level;
+extern bool iser_pi_enable;
+extern int iser_pi_guard;
+
+int iser_send_control(struct iscsi_conn *conn,
+		      struct iscsi_task *task);
+
+int iser_send_command(struct iscsi_conn *conn,
+		      struct iscsi_task *task);
+
+int iser_send_data_out(struct iscsi_conn *conn,
+		       struct iscsi_task *task,
+		       struct iscsi_data *hdr);
+
+void iscsi_iser_recv(struct iscsi_conn *conn,
+		     struct iscsi_hdr *hdr,
+		     char *rx_data,
+		     int rx_data_len);
+
+void iser_conn_init(struct iser_conn *iser_conn);
+
+void iser_conn_release(struct iser_conn *iser_conn);
+
+int iser_conn_terminate(struct iser_conn *iser_conn);
+
+void iser_release_work(struct work_struct *work);
+
+void iser_rcv_completion(struct iser_rx_desc *desc,
+			 unsigned long dto_xfer_len,
+			 struct ib_conn *ib_conn);
+
+void iser_snd_completion(struct iser_tx_desc *desc,
+			 struct ib_conn *ib_conn);
+
+void iser_task_rdma_init(struct iscsi_iser_task *task);
+
+void iser_task_rdma_finalize(struct iscsi_iser_task *task);
+
+void iser_free_rx_descriptors(struct iser_conn *iser_conn);
+
+void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_task *iser_task,
+				     struct iser_data_buf *mem,
+				     struct iser_data_buf *mem_copy,
+				     enum iser_data_dir cmd_dir);
+
+int  iser_reg_rdma_mem_fmr(struct iscsi_iser_task *task,
+			   enum iser_data_dir cmd_dir);
+int  iser_reg_rdma_mem_fastreg(struct iscsi_iser_task *task,
+			       enum iser_data_dir cmd_dir);
+
+int  iser_connect(struct iser_conn *iser_conn,
+		  struct sockaddr *src_addr,
+		  struct sockaddr *dst_addr,
+		  int non_blocking);
+
+int  iser_reg_page_vec(struct ib_conn *ib_conn,
+		       struct iser_page_vec *page_vec,
+		       struct iser_mem_reg *mem_reg);
+
+void iser_unreg_mem_fmr(struct iscsi_iser_task *iser_task,
+			enum iser_data_dir cmd_dir);
+void iser_unreg_mem_fastreg(struct iscsi_iser_task *iser_task,
+			    enum iser_data_dir cmd_dir);
+
+int  iser_post_recvl(struct iser_conn *iser_conn);
+int  iser_post_recvm(struct iser_conn *iser_conn, int count);
+int  iser_post_send(struct ib_conn *ib_conn, struct iser_tx_desc *tx_desc,
+		    bool signal);
+
+int iser_dma_map_task_data(struct iscsi_iser_task *iser_task,
+			   struct iser_data_buf *data,
+			   enum iser_data_dir iser_dir,
+			   enum dma_data_direction dma_dir);
+
+void iser_dma_unmap_task_data(struct iscsi_iser_task *iser_task,
+			      struct iser_data_buf *data);
+int  iser_initialize_task_headers(struct iscsi_task *task,
+			struct iser_tx_desc *tx_desc);
+int iser_alloc_rx_descriptors(struct iser_conn *iser_conn,
+			      struct iscsi_session *session);
+int iser_create_fmr_pool(struct ib_conn *ib_conn, unsigned cmds_max);
+void iser_free_fmr_pool(struct ib_conn *ib_conn);
+int iser_create_fastreg_pool(struct ib_conn *ib_conn, unsigned cmds_max);
+void iser_free_fastreg_pool(struct ib_conn *ib_conn);
+u8 iser_check_task_pi_status(struct iscsi_iser_task *iser_task,
+			     enum iser_data_dir cmd_dir, sector_t *sector);
+#endif
diff --git a/drivers/infiniband/ulp/iser/iser_initiator.c b/drivers/infiniband/ulp/iser/iser_initiator.c
new file mode 100644
index 0000000..5a489ea
--- /dev/null
+++ b/drivers/infiniband/ulp/iser/iser_initiator.c
@@ -0,0 +1,732 @@
+/*
+ * Copyright (c) 2004, 2005, 2006 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2013-2014 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *	- Redistributions of source code must retain the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer.
+ *
+ *	- Redistributions in binary form must reproduce the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer in the documentation and/or other materials
+ *	  provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/scatterlist.h>
+#include <linux/kfifo.h>
+#include <scsi/scsi_cmnd.h>
+#include <scsi/scsi_host.h>
+
+#include "iscsi_iser.h"
+
+/* Register user buffer memory and initialize passive rdma
+ *  dto descriptor. Data size is stored in
+ *  task->data[ISER_DIR_IN].data_len, Protection size
+ *  os stored in task->prot[ISER_DIR_IN].data_len
+ */
+static int iser_prepare_read_cmd(struct iscsi_task *task)
+
+{
+	struct iscsi_iser_task *iser_task = task->dd_data;
+	struct iser_device  *device = iser_task->iser_conn->ib_conn.device;
+	struct iser_regd_buf *regd_buf;
+	int err;
+	struct iser_hdr *hdr = &iser_task->desc.iser_header;
+	struct iser_data_buf *buf_in = &iser_task->data[ISER_DIR_IN];
+
+	err = iser_dma_map_task_data(iser_task,
+				     buf_in,
+				     ISER_DIR_IN,
+				     DMA_FROM_DEVICE);
+	if (err)
+		return err;
+
+	if (scsi_prot_sg_count(iser_task->sc)) {
+		struct iser_data_buf *pbuf_in = &iser_task->prot[ISER_DIR_IN];
+
+		err = iser_dma_map_task_data(iser_task,
+					     pbuf_in,
+					     ISER_DIR_IN,
+					     DMA_FROM_DEVICE);
+		if (err)
+			return err;
+	}
+
+	err = device->iser_reg_rdma_mem(iser_task, ISER_DIR_IN);
+	if (err) {
+		iser_err("Failed to set up Data-IN RDMA\n");
+		return err;
+	}
+	regd_buf = &iser_task->rdma_regd[ISER_DIR_IN];
+
+	hdr->flags    |= ISER_RSV;
+	hdr->read_stag = cpu_to_be32(regd_buf->reg.rkey);
+	hdr->read_va   = cpu_to_be64(regd_buf->reg.va);
+
+	iser_dbg("Cmd itt:%d READ tags RKEY:%#.4X VA:%#llX\n",
+		 task->itt, regd_buf->reg.rkey,
+		 (unsigned long long)regd_buf->reg.va);
+
+	return 0;
+}
+
+/* Register user buffer memory and initialize passive rdma
+ *  dto descriptor. Data size is stored in
+ *  task->data[ISER_DIR_OUT].data_len, Protection size
+ *  is stored at task->prot[ISER_DIR_OUT].data_len
+ */
+static int
+iser_prepare_write_cmd(struct iscsi_task *task,
+		       unsigned int imm_sz,
+		       unsigned int unsol_sz,
+		       unsigned int edtl)
+{
+	struct iscsi_iser_task *iser_task = task->dd_data;
+	struct iser_device  *device = iser_task->iser_conn->ib_conn.device;
+	struct iser_regd_buf *regd_buf;
+	int err;
+	struct iser_hdr *hdr = &iser_task->desc.iser_header;
+	struct iser_data_buf *buf_out = &iser_task->data[ISER_DIR_OUT];
+	struct ib_sge *tx_dsg = &iser_task->desc.tx_sg[1];
+
+	err = iser_dma_map_task_data(iser_task,
+				     buf_out,
+				     ISER_DIR_OUT,
+				     DMA_TO_DEVICE);
+	if (err)
+		return err;
+
+	if (scsi_prot_sg_count(iser_task->sc)) {
+		struct iser_data_buf *pbuf_out = &iser_task->prot[ISER_DIR_OUT];
+
+		err = iser_dma_map_task_data(iser_task,
+					     pbuf_out,
+					     ISER_DIR_OUT,
+					     DMA_TO_DEVICE);
+		if (err)
+			return err;
+	}
+
+	err = device->iser_reg_rdma_mem(iser_task, ISER_DIR_OUT);
+	if (err != 0) {
+		iser_err("Failed to register write cmd RDMA mem\n");
+		return err;
+	}
+
+	regd_buf = &iser_task->rdma_regd[ISER_DIR_OUT];
+
+	if (unsol_sz < edtl) {
+		hdr->flags     |= ISER_WSV;
+		hdr->write_stag = cpu_to_be32(regd_buf->reg.rkey);
+		hdr->write_va   = cpu_to_be64(regd_buf->reg.va + unsol_sz);
+
+		iser_dbg("Cmd itt:%d, WRITE tags, RKEY:%#.4X "
+			 "VA:%#llX + unsol:%d\n",
+			 task->itt, regd_buf->reg.rkey,
+			 (unsigned long long)regd_buf->reg.va, unsol_sz);
+	}
+
+	if (imm_sz > 0) {
+		iser_dbg("Cmd itt:%d, WRITE, adding imm.data sz: %d\n",
+			 task->itt, imm_sz);
+		tx_dsg->addr   = regd_buf->reg.va;
+		tx_dsg->length = imm_sz;
+		tx_dsg->lkey   = regd_buf->reg.lkey;
+		iser_task->desc.num_sge = 2;
+	}
+
+	return 0;
+}
+
+/* creates a new tx descriptor and adds header regd buffer */
+static void iser_create_send_desc(struct iser_conn	*iser_conn,
+				  struct iser_tx_desc	*tx_desc)
+{
+	struct iser_device *device = iser_conn->ib_conn.device;
+
+	ib_dma_sync_single_for_cpu(device->ib_device,
+		tx_desc->dma_addr, ISER_HEADERS_LEN, DMA_TO_DEVICE);
+
+	memset(&tx_desc->iser_header, 0, sizeof(struct iser_hdr));
+	tx_desc->iser_header.flags = ISER_VER;
+
+	tx_desc->num_sge = 1;
+
+	if (tx_desc->tx_sg[0].lkey != device->mr->lkey) {
+		tx_desc->tx_sg[0].lkey = device->mr->lkey;
+		iser_dbg("sdesc %p lkey mismatch, fixing\n", tx_desc);
+	}
+}
+
+static void iser_free_login_buf(struct iser_conn *iser_conn)
+{
+	struct iser_device *device = iser_conn->ib_conn.device;
+
+	if (!iser_conn->login_buf)
+		return;
+
+	if (iser_conn->login_req_dma)
+		ib_dma_unmap_single(device->ib_device,
+				    iser_conn->login_req_dma,
+				    ISCSI_DEF_MAX_RECV_SEG_LEN, DMA_TO_DEVICE);
+
+	if (iser_conn->login_resp_dma)
+		ib_dma_unmap_single(device->ib_device,
+				    iser_conn->login_resp_dma,
+				    ISER_RX_LOGIN_SIZE, DMA_FROM_DEVICE);
+
+	kfree(iser_conn->login_buf);
+
+	/* make sure we never redo any unmapping */
+	iser_conn->login_req_dma = 0;
+	iser_conn->login_resp_dma = 0;
+	iser_conn->login_buf = NULL;
+}
+
+static int iser_alloc_login_buf(struct iser_conn *iser_conn)
+{
+	struct iser_device *device = iser_conn->ib_conn.device;
+	int			req_err, resp_err;
+
+	BUG_ON(device == NULL);
+
+	iser_conn->login_buf = kmalloc(ISCSI_DEF_MAX_RECV_SEG_LEN +
+				     ISER_RX_LOGIN_SIZE, GFP_KERNEL);
+	if (!iser_conn->login_buf)
+		goto out_err;
+
+	iser_conn->login_req_buf  = iser_conn->login_buf;
+	iser_conn->login_resp_buf = iser_conn->login_buf +
+						ISCSI_DEF_MAX_RECV_SEG_LEN;
+
+	iser_conn->login_req_dma = ib_dma_map_single(device->ib_device,
+						     iser_conn->login_req_buf,
+						     ISCSI_DEF_MAX_RECV_SEG_LEN,
+						     DMA_TO_DEVICE);
+
+	iser_conn->login_resp_dma = ib_dma_map_single(device->ib_device,
+						      iser_conn->login_resp_buf,
+						      ISER_RX_LOGIN_SIZE,
+						      DMA_FROM_DEVICE);
+
+	req_err  = ib_dma_mapping_error(device->ib_device,
+					iser_conn->login_req_dma);
+	resp_err = ib_dma_mapping_error(device->ib_device,
+					iser_conn->login_resp_dma);
+
+	if (req_err || resp_err) {
+		if (req_err)
+			iser_conn->login_req_dma = 0;
+		if (resp_err)
+			iser_conn->login_resp_dma = 0;
+		goto free_login_buf;
+	}
+	return 0;
+
+free_login_buf:
+	iser_free_login_buf(iser_conn);
+
+out_err:
+	iser_err("unable to alloc or map login buf\n");
+	return -ENOMEM;
+}
+
+int iser_alloc_rx_descriptors(struct iser_conn *iser_conn,
+			      struct iscsi_session *session)
+{
+	int i, j;
+	u64 dma_addr;
+	struct iser_rx_desc *rx_desc;
+	struct ib_sge       *rx_sg;
+	struct ib_conn *ib_conn = &iser_conn->ib_conn;
+	struct iser_device *device = ib_conn->device;
+
+	iser_conn->qp_max_recv_dtos = session->cmds_max;
+	iser_conn->qp_max_recv_dtos_mask = session->cmds_max - 1; /* cmds_max is 2^N */
+	iser_conn->min_posted_rx = iser_conn->qp_max_recv_dtos >> 2;
+
+	if (device->iser_alloc_rdma_reg_res(ib_conn, session->scsi_cmds_max))
+		goto create_rdma_reg_res_failed;
+
+	if (iser_alloc_login_buf(iser_conn))
+		goto alloc_login_buf_fail;
+
+	iser_conn->num_rx_descs = session->cmds_max;
+	iser_conn->rx_descs = kmalloc(iser_conn->num_rx_descs *
+				sizeof(struct iser_rx_desc), GFP_KERNEL);
+	if (!iser_conn->rx_descs)
+		goto rx_desc_alloc_fail;
+
+	rx_desc = iser_conn->rx_descs;
+
+	for (i = 0; i < iser_conn->qp_max_recv_dtos; i++, rx_desc++)  {
+		dma_addr = ib_dma_map_single(device->ib_device, (void *)rx_desc,
+					ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE);
+		if (ib_dma_mapping_error(device->ib_device, dma_addr))
+			goto rx_desc_dma_map_failed;
+
+		rx_desc->dma_addr = dma_addr;
+
+		rx_sg = &rx_desc->rx_sg;
+		rx_sg->addr   = rx_desc->dma_addr;
+		rx_sg->length = ISER_RX_PAYLOAD_SIZE;
+		rx_sg->lkey   = device->mr->lkey;
+	}
+
+	iser_conn->rx_desc_head = 0;
+	return 0;
+
+rx_desc_dma_map_failed:
+	rx_desc = iser_conn->rx_descs;
+	for (j = 0; j < i; j++, rx_desc++)
+		ib_dma_unmap_single(device->ib_device, rx_desc->dma_addr,
+				    ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE);
+	kfree(iser_conn->rx_descs);
+	iser_conn->rx_descs = NULL;
+rx_desc_alloc_fail:
+	iser_free_login_buf(iser_conn);
+alloc_login_buf_fail:
+	device->iser_free_rdma_reg_res(ib_conn);
+create_rdma_reg_res_failed:
+	iser_err("failed allocating rx descriptors / data buffers\n");
+	return -ENOMEM;
+}
+
+void iser_free_rx_descriptors(struct iser_conn *iser_conn)
+{
+	int i;
+	struct iser_rx_desc *rx_desc;
+	struct ib_conn *ib_conn = &iser_conn->ib_conn;
+	struct iser_device *device = ib_conn->device;
+
+	if (!iser_conn->rx_descs)
+		goto free_login_buf;
+
+	if (device->iser_free_rdma_reg_res)
+		device->iser_free_rdma_reg_res(ib_conn);
+
+	rx_desc = iser_conn->rx_descs;
+	for (i = 0; i < iser_conn->qp_max_recv_dtos; i++, rx_desc++)
+		ib_dma_unmap_single(device->ib_device, rx_desc->dma_addr,
+				    ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE);
+	kfree(iser_conn->rx_descs);
+	/* make sure we never redo any unmapping */
+	iser_conn->rx_descs = NULL;
+
+free_login_buf:
+	iser_free_login_buf(iser_conn);
+}
+
+static int iser_post_rx_bufs(struct iscsi_conn *conn, struct iscsi_hdr *req)
+{
+	struct iser_conn *iser_conn = conn->dd_data;
+	struct ib_conn *ib_conn = &iser_conn->ib_conn;
+	struct iscsi_session *session = conn->session;
+
+	iser_dbg("req op %x flags %x\n", req->opcode, req->flags);
+	/* check if this is the last login - going to full feature phase */
+	if ((req->flags & ISCSI_FULL_FEATURE_PHASE) != ISCSI_FULL_FEATURE_PHASE)
+		return 0;
+
+	/*
+	 * Check that there is one posted recv buffer
+	 * (for the last login response).
+	 */
+	WARN_ON(ib_conn->post_recv_buf_count != 1);
+
+	if (session->discovery_sess) {
+		iser_info("Discovery session, re-using login RX buffer\n");
+		return 0;
+	} else
+		iser_info("Normal session, posting batch of RX %d buffers\n",
+			  iser_conn->min_posted_rx);
+
+	/* Initial post receive buffers */
+	if (iser_post_recvm(iser_conn, iser_conn->min_posted_rx))
+		return -ENOMEM;
+
+	return 0;
+}
+
+static inline bool iser_signal_comp(int sig_count)
+{
+	return ((sig_count % ISER_SIGNAL_CMD_COUNT) == 0);
+}
+
+/**
+ * iser_send_command - send command PDU
+ */
+int iser_send_command(struct iscsi_conn *conn,
+		      struct iscsi_task *task)
+{
+	struct iser_conn *iser_conn = conn->dd_data;
+	struct iscsi_iser_task *iser_task = task->dd_data;
+	unsigned long edtl;
+	int err;
+	struct iser_data_buf *data_buf, *prot_buf;
+	struct iscsi_scsi_req *hdr = (struct iscsi_scsi_req *)task->hdr;
+	struct scsi_cmnd *sc  =  task->sc;
+	struct iser_tx_desc *tx_desc = &iser_task->desc;
+	static unsigned sig_count;
+
+	edtl = ntohl(hdr->data_length);
+
+	/* build the tx desc regd header and add it to the tx desc dto */
+	tx_desc->type = ISCSI_TX_SCSI_COMMAND;
+	iser_create_send_desc(iser_conn, tx_desc);
+
+	if (hdr->flags & ISCSI_FLAG_CMD_READ) {
+		data_buf = &iser_task->data[ISER_DIR_IN];
+		prot_buf = &iser_task->prot[ISER_DIR_IN];
+	} else {
+		data_buf = &iser_task->data[ISER_DIR_OUT];
+		prot_buf = &iser_task->prot[ISER_DIR_OUT];
+	}
+
+	if (scsi_sg_count(sc)) { /* using a scatter list */
+		data_buf->buf  = scsi_sglist(sc);
+		data_buf->size = scsi_sg_count(sc);
+	}
+	data_buf->data_len = scsi_bufflen(sc);
+
+	if (scsi_prot_sg_count(sc)) {
+		prot_buf->buf  = scsi_prot_sglist(sc);
+		prot_buf->size = scsi_prot_sg_count(sc);
+		prot_buf->data_len = data_buf->data_len >>
+				     ilog2(sc->device->sector_size) * 8;
+	}
+
+	if (hdr->flags & ISCSI_FLAG_CMD_READ) {
+		err = iser_prepare_read_cmd(task);
+		if (err)
+			goto send_command_error;
+	}
+	if (hdr->flags & ISCSI_FLAG_CMD_WRITE) {
+		err = iser_prepare_write_cmd(task,
+					     task->imm_count,
+				             task->imm_count +
+					     task->unsol_r2t.data_length,
+					     edtl);
+		if (err)
+			goto send_command_error;
+	}
+
+	iser_task->status = ISER_TASK_STATUS_STARTED;
+
+	err = iser_post_send(&iser_conn->ib_conn, tx_desc,
+			     iser_signal_comp(++sig_count));
+	if (!err)
+		return 0;
+
+send_command_error:
+	iser_err("conn %p failed task->itt %d err %d\n",conn, task->itt, err);
+	return err;
+}
+
+/**
+ * iser_send_data_out - send data out PDU
+ */
+int iser_send_data_out(struct iscsi_conn *conn,
+		       struct iscsi_task *task,
+		       struct iscsi_data *hdr)
+{
+	struct iser_conn *iser_conn = conn->dd_data;
+	struct iscsi_iser_task *iser_task = task->dd_data;
+	struct iser_tx_desc *tx_desc = NULL;
+	struct iser_regd_buf *regd_buf;
+	unsigned long buf_offset;
+	unsigned long data_seg_len;
+	uint32_t itt;
+	int err = 0;
+	struct ib_sge *tx_dsg;
+
+	itt = (__force uint32_t)hdr->itt;
+	data_seg_len = ntoh24(hdr->dlength);
+	buf_offset   = ntohl(hdr->offset);
+
+	iser_dbg("%s itt %d dseg_len %d offset %d\n",
+		 __func__,(int)itt,(int)data_seg_len,(int)buf_offset);
+
+	tx_desc = kmem_cache_zalloc(ig.desc_cache, GFP_ATOMIC);
+	if (tx_desc == NULL) {
+		iser_err("Failed to alloc desc for post dataout\n");
+		return -ENOMEM;
+	}
+
+	tx_desc->type = ISCSI_TX_DATAOUT;
+	tx_desc->iser_header.flags = ISER_VER;
+	memcpy(&tx_desc->iscsi_header, hdr, sizeof(struct iscsi_hdr));
+
+	/* build the tx desc */
+	iser_initialize_task_headers(task, tx_desc);
+
+	regd_buf = &iser_task->rdma_regd[ISER_DIR_OUT];
+	tx_dsg = &tx_desc->tx_sg[1];
+	tx_dsg->addr    = regd_buf->reg.va + buf_offset;
+	tx_dsg->length  = data_seg_len;
+	tx_dsg->lkey    = regd_buf->reg.lkey;
+	tx_desc->num_sge = 2;
+
+	if (buf_offset + data_seg_len > iser_task->data[ISER_DIR_OUT].data_len) {
+		iser_err("Offset:%ld & DSL:%ld in Data-Out "
+			 "inconsistent with total len:%ld, itt:%d\n",
+			 buf_offset, data_seg_len,
+			 iser_task->data[ISER_DIR_OUT].data_len, itt);
+		err = -EINVAL;
+		goto send_data_out_error;
+	}
+	iser_dbg("data-out itt: %d, offset: %ld, sz: %ld\n",
+		 itt, buf_offset, data_seg_len);
+
+
+	err = iser_post_send(&iser_conn->ib_conn, tx_desc, true);
+	if (!err)
+		return 0;
+
+send_data_out_error:
+	kmem_cache_free(ig.desc_cache, tx_desc);
+	iser_err("conn %p failed err %d\n",conn, err);
+	return err;
+}
+
+int iser_send_control(struct iscsi_conn *conn,
+		      struct iscsi_task *task)
+{
+	struct iser_conn *iser_conn = conn->dd_data;
+	struct iscsi_iser_task *iser_task = task->dd_data;
+	struct iser_tx_desc *mdesc = &iser_task->desc;
+	unsigned long data_seg_len;
+	int err = 0;
+	struct iser_device *device;
+
+	/* build the tx desc regd header and add it to the tx desc dto */
+	mdesc->type = ISCSI_TX_CONTROL;
+	iser_create_send_desc(iser_conn, mdesc);
+
+	device = iser_conn->ib_conn.device;
+
+	data_seg_len = ntoh24(task->hdr->dlength);
+
+	if (data_seg_len > 0) {
+		struct ib_sge *tx_dsg = &mdesc->tx_sg[1];
+		if (task != conn->login_task) {
+			iser_err("data present on non login task!!!\n");
+			goto send_control_error;
+		}
+
+		ib_dma_sync_single_for_cpu(device->ib_device,
+			iser_conn->login_req_dma, task->data_count,
+			DMA_TO_DEVICE);
+
+		memcpy(iser_conn->login_req_buf, task->data, task->data_count);
+
+		ib_dma_sync_single_for_device(device->ib_device,
+			iser_conn->login_req_dma, task->data_count,
+			DMA_TO_DEVICE);
+
+		tx_dsg->addr    = iser_conn->login_req_dma;
+		tx_dsg->length  = task->data_count;
+		tx_dsg->lkey    = device->mr->lkey;
+		mdesc->num_sge = 2;
+	}
+
+	if (task == conn->login_task) {
+		iser_dbg("op %x dsl %lx, posting login rx buffer\n",
+			 task->hdr->opcode, data_seg_len);
+		err = iser_post_recvl(iser_conn);
+		if (err)
+			goto send_control_error;
+		err = iser_post_rx_bufs(conn, task->hdr);
+		if (err)
+			goto send_control_error;
+	}
+
+	err = iser_post_send(&iser_conn->ib_conn, mdesc, true);
+	if (!err)
+		return 0;
+
+send_control_error:
+	iser_err("conn %p failed err %d\n",conn, err);
+	return err;
+}
+
+/**
+ * iser_rcv_dto_completion - recv DTO completion
+ */
+void iser_rcv_completion(struct iser_rx_desc *rx_desc,
+			 unsigned long rx_xfer_len,
+			 struct ib_conn *ib_conn)
+{
+	struct iser_conn *iser_conn = container_of(ib_conn, struct iser_conn,
+						   ib_conn);
+	struct iscsi_hdr *hdr;
+	u64 rx_dma;
+	int rx_buflen, outstanding, count, err;
+
+	/* differentiate between login to all other PDUs */
+	if ((char *)rx_desc == iser_conn->login_resp_buf) {
+		rx_dma = iser_conn->login_resp_dma;
+		rx_buflen = ISER_RX_LOGIN_SIZE;
+	} else {
+		rx_dma = rx_desc->dma_addr;
+		rx_buflen = ISER_RX_PAYLOAD_SIZE;
+	}
+
+	ib_dma_sync_single_for_cpu(ib_conn->device->ib_device, rx_dma,
+				   rx_buflen, DMA_FROM_DEVICE);
+
+	hdr = &rx_desc->iscsi_header;
+
+	iser_dbg("op 0x%x itt 0x%x dlen %d\n", hdr->opcode,
+			hdr->itt, (int)(rx_xfer_len - ISER_HEADERS_LEN));
+
+	iscsi_iser_recv(iser_conn->iscsi_conn, hdr, rx_desc->data,
+			rx_xfer_len - ISER_HEADERS_LEN);
+
+	ib_dma_sync_single_for_device(ib_conn->device->ib_device, rx_dma,
+				      rx_buflen, DMA_FROM_DEVICE);
+
+	/* decrementing conn->post_recv_buf_count only --after-- freeing the   *
+	 * task eliminates the need to worry on tasks which are completed in   *
+	 * parallel to the execution of iser_conn_term. So the code that waits *
+	 * for the posted rx bufs refcount to become zero handles everything   */
+	ib_conn->post_recv_buf_count--;
+
+	if (rx_dma == iser_conn->login_resp_dma)
+		return;
+
+	outstanding = ib_conn->post_recv_buf_count;
+	if (outstanding + iser_conn->min_posted_rx <= iser_conn->qp_max_recv_dtos) {
+		count = min(iser_conn->qp_max_recv_dtos - outstanding,
+			    iser_conn->min_posted_rx);
+		err = iser_post_recvm(iser_conn, count);
+		if (err)
+			iser_err("posting %d rx bufs err %d\n", count, err);
+	}
+}
+
+void iser_snd_completion(struct iser_tx_desc *tx_desc,
+			struct ib_conn *ib_conn)
+{
+	struct iscsi_task *task;
+	struct iser_device *device = ib_conn->device;
+
+	if (tx_desc->type == ISCSI_TX_DATAOUT) {
+		ib_dma_unmap_single(device->ib_device, tx_desc->dma_addr,
+					ISER_HEADERS_LEN, DMA_TO_DEVICE);
+		kmem_cache_free(ig.desc_cache, tx_desc);
+		tx_desc = NULL;
+	}
+
+	if (tx_desc && tx_desc->type == ISCSI_TX_CONTROL) {
+		/* this arithmetic is legal by libiscsi dd_data allocation */
+		task = (void *) ((long)(void *)tx_desc -
+				  sizeof(struct iscsi_task));
+		if (task->hdr->itt == RESERVED_ITT)
+			iscsi_put_task(task);
+	}
+}
+
+void iser_task_rdma_init(struct iscsi_iser_task *iser_task)
+
+{
+	iser_task->status = ISER_TASK_STATUS_INIT;
+
+	iser_task->dir[ISER_DIR_IN] = 0;
+	iser_task->dir[ISER_DIR_OUT] = 0;
+
+	iser_task->data[ISER_DIR_IN].data_len  = 0;
+	iser_task->data[ISER_DIR_OUT].data_len = 0;
+
+	iser_task->prot[ISER_DIR_IN].data_len  = 0;
+	iser_task->prot[ISER_DIR_OUT].data_len = 0;
+
+	memset(&iser_task->rdma_regd[ISER_DIR_IN], 0,
+	       sizeof(struct iser_regd_buf));
+	memset(&iser_task->rdma_regd[ISER_DIR_OUT], 0,
+	       sizeof(struct iser_regd_buf));
+}
+
+void iser_task_rdma_finalize(struct iscsi_iser_task *iser_task)
+{
+	struct iser_device *device = iser_task->iser_conn->ib_conn.device;
+	int is_rdma_data_aligned = 1;
+	int is_rdma_prot_aligned = 1;
+	int prot_count = scsi_prot_sg_count(iser_task->sc);
+
+	/* if we were reading, copy back to unaligned sglist,
+	 * anyway dma_unmap and free the copy
+	 */
+	if (iser_task->data_copy[ISER_DIR_IN].copy_buf != NULL) {
+		is_rdma_data_aligned = 0;
+		iser_finalize_rdma_unaligned_sg(iser_task,
+						&iser_task->data[ISER_DIR_IN],
+						&iser_task->data_copy[ISER_DIR_IN],
+						ISER_DIR_IN);
+	}
+
+	if (iser_task->data_copy[ISER_DIR_OUT].copy_buf != NULL) {
+		is_rdma_data_aligned = 0;
+		iser_finalize_rdma_unaligned_sg(iser_task,
+						&iser_task->data[ISER_DIR_OUT],
+						&iser_task->data_copy[ISER_DIR_OUT],
+						ISER_DIR_OUT);
+	}
+
+	if (iser_task->prot_copy[ISER_DIR_IN].copy_buf != NULL) {
+		is_rdma_prot_aligned = 0;
+		iser_finalize_rdma_unaligned_sg(iser_task,
+						&iser_task->prot[ISER_DIR_IN],
+						&iser_task->prot_copy[ISER_DIR_IN],
+						ISER_DIR_IN);
+	}
+
+	if (iser_task->prot_copy[ISER_DIR_OUT].copy_buf != NULL) {
+		is_rdma_prot_aligned = 0;
+		iser_finalize_rdma_unaligned_sg(iser_task,
+						&iser_task->prot[ISER_DIR_OUT],
+						&iser_task->prot_copy[ISER_DIR_OUT],
+						ISER_DIR_OUT);
+	}
+
+	if (iser_task->dir[ISER_DIR_IN]) {
+		device->iser_unreg_rdma_mem(iser_task, ISER_DIR_IN);
+		if (is_rdma_data_aligned)
+			iser_dma_unmap_task_data(iser_task,
+						 &iser_task->data[ISER_DIR_IN]);
+		if (prot_count && is_rdma_prot_aligned)
+			iser_dma_unmap_task_data(iser_task,
+						 &iser_task->prot[ISER_DIR_IN]);
+	}
+
+	if (iser_task->dir[ISER_DIR_OUT]) {
+		device->iser_unreg_rdma_mem(iser_task, ISER_DIR_OUT);
+		if (is_rdma_data_aligned)
+			iser_dma_unmap_task_data(iser_task,
+						 &iser_task->data[ISER_DIR_OUT]);
+		if (prot_count && is_rdma_prot_aligned)
+			iser_dma_unmap_task_data(iser_task,
+						 &iser_task->prot[ISER_DIR_OUT]);
+	}
+}
diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c
new file mode 100644
index 0000000..6c5ce35
--- /dev/null
+++ b/drivers/infiniband/ulp/iser/iser_memory.c
@@ -0,0 +1,797 @@
+/*
+ * Copyright (c) 2004, 2005, 2006 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2013-2014 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *	- Redistributions of source code must retain the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer.
+ *
+ *	- Redistributions in binary form must reproduce the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer in the documentation and/or other materials
+ *	  provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/scatterlist.h>
+
+#include "iscsi_iser.h"
+
+#define ISER_KMALLOC_THRESHOLD 0x20000 /* 128K - kmalloc limit */
+
+/**
+ * iser_start_rdma_unaligned_sg
+ */
+static int iser_start_rdma_unaligned_sg(struct iscsi_iser_task *iser_task,
+					struct iser_data_buf *data,
+					struct iser_data_buf *data_copy,
+					enum iser_data_dir cmd_dir)
+{
+	struct ib_device *dev = iser_task->iser_conn->ib_conn.device->ib_device;
+	struct scatterlist *sgl = (struct scatterlist *)data->buf;
+	struct scatterlist *sg;
+	char *mem = NULL;
+	unsigned long  cmd_data_len = 0;
+	int dma_nents, i;
+
+	for_each_sg(sgl, sg, data->size, i)
+		cmd_data_len += ib_sg_dma_len(dev, sg);
+
+	if (cmd_data_len > ISER_KMALLOC_THRESHOLD)
+		mem = (void *)__get_free_pages(GFP_ATOMIC,
+		      ilog2(roundup_pow_of_two(cmd_data_len)) - PAGE_SHIFT);
+	else
+		mem = kmalloc(cmd_data_len, GFP_ATOMIC);
+
+	if (mem == NULL) {
+		iser_err("Failed to allocate mem size %d %d for copying sglist\n",
+			 data->size, (int)cmd_data_len);
+		return -ENOMEM;
+	}
+
+	if (cmd_dir == ISER_DIR_OUT) {
+		/* copy the unaligned sg the buffer which is used for RDMA */
+		int i;
+		char *p, *from;
+
+		sgl = (struct scatterlist *)data->buf;
+		p = mem;
+		for_each_sg(sgl, sg, data->size, i) {
+			from = kmap_atomic(sg_page(sg));
+			memcpy(p,
+			       from + sg->offset,
+			       sg->length);
+			kunmap_atomic(from);
+			p += sg->length;
+		}
+	}
+
+	sg_init_one(&data_copy->sg_single, mem, cmd_data_len);
+	data_copy->buf = &data_copy->sg_single;
+	data_copy->size = 1;
+	data_copy->copy_buf = mem;
+
+	dma_nents = ib_dma_map_sg(dev, &data_copy->sg_single, 1,
+				  (cmd_dir == ISER_DIR_OUT) ?
+				  DMA_TO_DEVICE : DMA_FROM_DEVICE);
+	BUG_ON(dma_nents == 0);
+
+	data_copy->dma_nents = dma_nents;
+	data_copy->data_len = cmd_data_len;
+
+	return 0;
+}
+
+/**
+ * iser_finalize_rdma_unaligned_sg
+ */
+
+void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_task *iser_task,
+				     struct iser_data_buf *data,
+				     struct iser_data_buf *data_copy,
+				     enum iser_data_dir cmd_dir)
+{
+	struct ib_device *dev;
+	unsigned long  cmd_data_len;
+
+	dev = iser_task->iser_conn->ib_conn.device->ib_device;
+
+	ib_dma_unmap_sg(dev, &data_copy->sg_single, 1,
+			(cmd_dir == ISER_DIR_OUT) ?
+			DMA_TO_DEVICE : DMA_FROM_DEVICE);
+
+	if (cmd_dir == ISER_DIR_IN) {
+		char *mem;
+		struct scatterlist *sgl, *sg;
+		unsigned char *p, *to;
+		unsigned int sg_size;
+		int i;
+
+		/* copy back read RDMA to unaligned sg */
+		mem = data_copy->copy_buf;
+
+		sgl = (struct scatterlist *)data->buf;
+		sg_size = data->size;
+
+		p = mem;
+		for_each_sg(sgl, sg, sg_size, i) {
+			to = kmap_atomic(sg_page(sg));
+			memcpy(to + sg->offset,
+			       p,
+			       sg->length);
+			kunmap_atomic(to);
+			p += sg->length;
+		}
+	}
+
+	cmd_data_len = data->data_len;
+
+	if (cmd_data_len > ISER_KMALLOC_THRESHOLD)
+		free_pages((unsigned long)data_copy->copy_buf,
+			   ilog2(roundup_pow_of_two(cmd_data_len)) - PAGE_SHIFT);
+	else
+		kfree(data_copy->copy_buf);
+
+	data_copy->copy_buf = NULL;
+}
+
+#define IS_4K_ALIGNED(addr)	((((unsigned long)addr) & ~MASK_4K) == 0)
+
+/**
+ * iser_sg_to_page_vec - Translates scatterlist entries to physical addresses
+ * and returns the length of resulting physical address array (may be less than
+ * the original due to possible compaction).
+ *
+ * we build a "page vec" under the assumption that the SG meets the RDMA
+ * alignment requirements. Other then the first and last SG elements, all
+ * the "internal" elements can be compacted into a list whose elements are
+ * dma addresses of physical pages. The code supports also the weird case
+ * where --few fragments of the same page-- are present in the SG as
+ * consecutive elements. Also, it handles one entry SG.
+ */
+
+static int iser_sg_to_page_vec(struct iser_data_buf *data,
+			       struct ib_device *ibdev, u64 *pages,
+			       int *offset, int *data_size)
+{
+	struct scatterlist *sg, *sgl = (struct scatterlist *)data->buf;
+	u64 start_addr, end_addr, page, chunk_start = 0;
+	unsigned long total_sz = 0;
+	unsigned int dma_len;
+	int i, new_chunk, cur_page, last_ent = data->dma_nents - 1;
+
+	/* compute the offset of first element */
+	*offset = (u64) sgl[0].offset & ~MASK_4K;
+
+	new_chunk = 1;
+	cur_page  = 0;
+	for_each_sg(sgl, sg, data->dma_nents, i) {
+		start_addr = ib_sg_dma_address(ibdev, sg);
+		if (new_chunk)
+			chunk_start = start_addr;
+		dma_len = ib_sg_dma_len(ibdev, sg);
+		end_addr = start_addr + dma_len;
+		total_sz += dma_len;
+
+		/* collect page fragments until aligned or end of SG list */
+		if (!IS_4K_ALIGNED(end_addr) && i < last_ent) {
+			new_chunk = 0;
+			continue;
+		}
+		new_chunk = 1;
+
+		/* address of the first page in the contiguous chunk;
+		   masking relevant for the very first SG entry,
+		   which might be unaligned */
+		page = chunk_start & MASK_4K;
+		do {
+			pages[cur_page++] = page;
+			page += SIZE_4K;
+		} while (page < end_addr);
+	}
+
+	*data_size = total_sz;
+	iser_dbg("page_vec->data_size:%d cur_page %d\n",
+		 *data_size, cur_page);
+	return cur_page;
+}
+
+
+/**
+ * iser_data_buf_aligned_len - Tries to determine the maximal correctly aligned
+ * for RDMA sub-list of a scatter-gather list of memory buffers, and  returns
+ * the number of entries which are aligned correctly. Supports the case where
+ * consecutive SG elements are actually fragments of the same physcial page.
+ */
+static int iser_data_buf_aligned_len(struct iser_data_buf *data,
+				      struct ib_device *ibdev)
+{
+	struct scatterlist *sgl, *sg, *next_sg = NULL;
+	u64 start_addr, end_addr;
+	int i, ret_len, start_check = 0;
+
+	if (data->dma_nents == 1)
+		return 1;
+
+	sgl = (struct scatterlist *)data->buf;
+	start_addr  = ib_sg_dma_address(ibdev, sgl);
+
+	for_each_sg(sgl, sg, data->dma_nents, i) {
+		if (start_check && !IS_4K_ALIGNED(start_addr))
+			break;
+
+		next_sg = sg_next(sg);
+		if (!next_sg)
+			break;
+
+		end_addr    = start_addr + ib_sg_dma_len(ibdev, sg);
+		start_addr  = ib_sg_dma_address(ibdev, next_sg);
+
+		if (end_addr == start_addr) {
+			start_check = 0;
+			continue;
+		} else
+			start_check = 1;
+
+		if (!IS_4K_ALIGNED(end_addr))
+			break;
+	}
+	ret_len = (next_sg) ? i : i+1;
+	iser_dbg("Found %d aligned entries out of %d in sg:0x%p\n",
+		 ret_len, data->dma_nents, data);
+	return ret_len;
+}
+
+static void iser_data_buf_dump(struct iser_data_buf *data,
+			       struct ib_device *ibdev)
+{
+	struct scatterlist *sgl = (struct scatterlist *)data->buf;
+	struct scatterlist *sg;
+	int i;
+
+	for_each_sg(sgl, sg, data->dma_nents, i)
+		iser_dbg("sg[%d] dma_addr:0x%lX page:0x%p "
+			 "off:0x%x sz:0x%x dma_len:0x%x\n",
+			 i, (unsigned long)ib_sg_dma_address(ibdev, sg),
+			 sg_page(sg), sg->offset,
+			 sg->length, ib_sg_dma_len(ibdev, sg));
+}
+
+static void iser_dump_page_vec(struct iser_page_vec *page_vec)
+{
+	int i;
+
+	iser_err("page vec length %d data size %d\n",
+		 page_vec->length, page_vec->data_size);
+	for (i = 0; i < page_vec->length; i++)
+		iser_err("%d %lx\n",i,(unsigned long)page_vec->pages[i]);
+}
+
+static void iser_page_vec_build(struct iser_data_buf *data,
+				struct iser_page_vec *page_vec,
+				struct ib_device *ibdev)
+{
+	int page_vec_len = 0;
+
+	page_vec->length = 0;
+	page_vec->offset = 0;
+
+	iser_dbg("Translating sg sz: %d\n", data->dma_nents);
+	page_vec_len = iser_sg_to_page_vec(data, ibdev, page_vec->pages,
+					   &page_vec->offset,
+					   &page_vec->data_size);
+	iser_dbg("sg len %d page_vec_len %d\n", data->dma_nents, page_vec_len);
+
+	page_vec->length = page_vec_len;
+
+	if (page_vec_len * SIZE_4K < page_vec->data_size) {
+		iser_err("page_vec too short to hold this SG\n");
+		iser_data_buf_dump(data, ibdev);
+		iser_dump_page_vec(page_vec);
+		BUG();
+	}
+}
+
+int iser_dma_map_task_data(struct iscsi_iser_task *iser_task,
+			    struct iser_data_buf *data,
+			    enum iser_data_dir iser_dir,
+			    enum dma_data_direction dma_dir)
+{
+	struct ib_device *dev;
+
+	iser_task->dir[iser_dir] = 1;
+	dev = iser_task->iser_conn->ib_conn.device->ib_device;
+
+	data->dma_nents = ib_dma_map_sg(dev, data->buf, data->size, dma_dir);
+	if (data->dma_nents == 0) {
+		iser_err("dma_map_sg failed!!!\n");
+		return -EINVAL;
+	}
+	return 0;
+}
+
+void iser_dma_unmap_task_data(struct iscsi_iser_task *iser_task,
+			      struct iser_data_buf *data)
+{
+	struct ib_device *dev;
+
+	dev = iser_task->iser_conn->ib_conn.device->ib_device;
+	ib_dma_unmap_sg(dev, data->buf, data->size, DMA_FROM_DEVICE);
+}
+
+static int fall_to_bounce_buf(struct iscsi_iser_task *iser_task,
+			      struct ib_device *ibdev,
+			      struct iser_data_buf *mem,
+			      struct iser_data_buf *mem_copy,
+			      enum iser_data_dir cmd_dir,
+			      int aligned_len)
+{
+	struct iscsi_conn    *iscsi_conn = iser_task->iser_conn->iscsi_conn;
+
+	iscsi_conn->fmr_unalign_cnt++;
+	iser_warn("rdma alignment violation (%d/%d aligned) or FMR not supported\n",
+		  aligned_len, mem->size);
+
+	if (iser_debug_level > 0)
+		iser_data_buf_dump(mem, ibdev);
+
+	/* unmap the command data before accessing it */
+	iser_dma_unmap_task_data(iser_task, mem);
+
+	/* allocate copy buf, if we are writing, copy the */
+	/* unaligned scatterlist, dma map the copy        */
+	if (iser_start_rdma_unaligned_sg(iser_task, mem, mem_copy, cmd_dir) != 0)
+		return -ENOMEM;
+
+	return 0;
+}
+
+/**
+ * iser_reg_rdma_mem_fmr - Registers memory intended for RDMA,
+ * using FMR (if possible) obtaining rkey and va
+ *
+ * returns 0 on success, errno code on failure
+ */
+int iser_reg_rdma_mem_fmr(struct iscsi_iser_task *iser_task,
+			  enum iser_data_dir cmd_dir)
+{
+	struct ib_conn *ib_conn = &iser_task->iser_conn->ib_conn;
+	struct iser_device   *device = ib_conn->device;
+	struct ib_device     *ibdev = device->ib_device;
+	struct iser_data_buf *mem = &iser_task->data[cmd_dir];
+	struct iser_regd_buf *regd_buf;
+	int aligned_len;
+	int err;
+	int i;
+	struct scatterlist *sg;
+
+	regd_buf = &iser_task->rdma_regd[cmd_dir];
+
+	aligned_len = iser_data_buf_aligned_len(mem, ibdev);
+	if (aligned_len != mem->dma_nents) {
+		err = fall_to_bounce_buf(iser_task, ibdev, mem,
+					 &iser_task->data_copy[cmd_dir],
+					 cmd_dir, aligned_len);
+		if (err) {
+			iser_err("failed to allocate bounce buffer\n");
+			return err;
+		}
+		mem = &iser_task->data_copy[cmd_dir];
+	}
+
+	/* if there a single dma entry, FMR is not needed */
+	if (mem->dma_nents == 1) {
+		sg = (struct scatterlist *)mem->buf;
+
+		regd_buf->reg.lkey = device->mr->lkey;
+		regd_buf->reg.rkey = device->mr->rkey;
+		regd_buf->reg.len  = ib_sg_dma_len(ibdev, &sg[0]);
+		regd_buf->reg.va   = ib_sg_dma_address(ibdev, &sg[0]);
+		regd_buf->reg.is_mr = 0;
+
+		iser_dbg("PHYSICAL Mem.register: lkey: 0x%08X rkey: 0x%08X  "
+			 "va: 0x%08lX sz: %ld]\n",
+			 (unsigned int)regd_buf->reg.lkey,
+			 (unsigned int)regd_buf->reg.rkey,
+			 (unsigned long)regd_buf->reg.va,
+			 (unsigned long)regd_buf->reg.len);
+	} else { /* use FMR for multiple dma entries */
+		iser_page_vec_build(mem, ib_conn->fmr.page_vec, ibdev);
+		err = iser_reg_page_vec(ib_conn, ib_conn->fmr.page_vec,
+					&regd_buf->reg);
+		if (err && err != -EAGAIN) {
+			iser_data_buf_dump(mem, ibdev);
+			iser_err("mem->dma_nents = %d (dlength = 0x%x)\n",
+				 mem->dma_nents,
+				 ntoh24(iser_task->desc.iscsi_header.dlength));
+			iser_err("page_vec: data_size = 0x%x, length = %d, offset = 0x%x\n",
+				 ib_conn->fmr.page_vec->data_size,
+				 ib_conn->fmr.page_vec->length,
+				 ib_conn->fmr.page_vec->offset);
+			for (i = 0; i < ib_conn->fmr.page_vec->length; i++)
+				iser_err("page_vec[%d] = 0x%llx\n", i,
+					 (unsigned long long)ib_conn->fmr.page_vec->pages[i]);
+		}
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
+static inline void
+iser_set_dif_domain(struct scsi_cmnd *sc, struct ib_sig_attrs *sig_attrs,
+		    struct ib_sig_domain *domain)
+{
+	domain->sig_type = IB_SIG_TYPE_T10_DIF;
+	domain->sig.dif.pi_interval = sc->device->sector_size;
+	domain->sig.dif.ref_tag = scsi_get_lba(sc) & 0xffffffff;
+	/*
+	 * At the moment we hard code those, but in the future
+	 * we will take them from sc.
+	 */
+	domain->sig.dif.apptag_check_mask = 0xffff;
+	domain->sig.dif.app_escape = true;
+	domain->sig.dif.ref_escape = true;
+	if (scsi_get_prot_type(sc) == SCSI_PROT_DIF_TYPE1 ||
+	    scsi_get_prot_type(sc) == SCSI_PROT_DIF_TYPE2)
+		domain->sig.dif.ref_remap = true;
+};
+
+static int
+iser_set_sig_attrs(struct scsi_cmnd *sc, struct ib_sig_attrs *sig_attrs)
+{
+	switch (scsi_get_prot_op(sc)) {
+	case SCSI_PROT_WRITE_INSERT:
+	case SCSI_PROT_READ_STRIP:
+		sig_attrs->mem.sig_type = IB_SIG_TYPE_NONE;
+		iser_set_dif_domain(sc, sig_attrs, &sig_attrs->wire);
+		sig_attrs->wire.sig.dif.bg_type = IB_T10DIF_CRC;
+		break;
+	case SCSI_PROT_READ_INSERT:
+	case SCSI_PROT_WRITE_STRIP:
+		sig_attrs->wire.sig_type = IB_SIG_TYPE_NONE;
+		iser_set_dif_domain(sc, sig_attrs, &sig_attrs->mem);
+		/*
+		 * At the moment we use this modparam to tell what is
+		 * the memory bg_type, in the future we will take it
+		 * from sc.
+		 */
+		sig_attrs->mem.sig.dif.bg_type = iser_pi_guard ? IB_T10DIF_CSUM :
+						 IB_T10DIF_CRC;
+		break;
+	case SCSI_PROT_READ_PASS:
+	case SCSI_PROT_WRITE_PASS:
+		iser_set_dif_domain(sc, sig_attrs, &sig_attrs->wire);
+		sig_attrs->wire.sig.dif.bg_type = IB_T10DIF_CRC;
+		iser_set_dif_domain(sc, sig_attrs, &sig_attrs->mem);
+		/*
+		 * At the moment we use this modparam to tell what is
+		 * the memory bg_type, in the future we will take it
+		 * from sc.
+		 */
+		sig_attrs->mem.sig.dif.bg_type = iser_pi_guard ? IB_T10DIF_CSUM :
+						 IB_T10DIF_CRC;
+		break;
+	default:
+		iser_err("Unsupported PI operation %d\n",
+			 scsi_get_prot_op(sc));
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int
+iser_set_prot_checks(struct scsi_cmnd *sc, u8 *mask)
+{
+	switch (scsi_get_prot_type(sc)) {
+	case SCSI_PROT_DIF_TYPE0:
+		break;
+	case SCSI_PROT_DIF_TYPE1:
+	case SCSI_PROT_DIF_TYPE2:
+		*mask = ISER_CHECK_GUARD | ISER_CHECK_REFTAG;
+		break;
+	case SCSI_PROT_DIF_TYPE3:
+		*mask = ISER_CHECK_GUARD;
+		break;
+	default:
+		iser_err("Unsupported protection type %d\n",
+			 scsi_get_prot_type(sc));
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int
+iser_reg_sig_mr(struct iscsi_iser_task *iser_task,
+		struct fast_reg_descriptor *desc, struct ib_sge *data_sge,
+		struct ib_sge *prot_sge, struct ib_sge *sig_sge)
+{
+	struct ib_conn *ib_conn = &iser_task->iser_conn->ib_conn;
+	struct iser_pi_context *pi_ctx = desc->pi_ctx;
+	struct ib_send_wr sig_wr, inv_wr;
+	struct ib_send_wr *bad_wr, *wr = NULL;
+	struct ib_sig_attrs sig_attrs;
+	int ret;
+	u32 key;
+
+	memset(&sig_attrs, 0, sizeof(sig_attrs));
+	ret = iser_set_sig_attrs(iser_task->sc, &sig_attrs);
+	if (ret)
+		goto err;
+
+	ret = iser_set_prot_checks(iser_task->sc, &sig_attrs.check_mask);
+	if (ret)
+		goto err;
+
+	if (!(desc->reg_indicators & ISER_SIG_KEY_VALID)) {
+		memset(&inv_wr, 0, sizeof(inv_wr));
+		inv_wr.opcode = IB_WR_LOCAL_INV;
+		inv_wr.wr_id = ISER_FASTREG_LI_WRID;
+		inv_wr.ex.invalidate_rkey = pi_ctx->sig_mr->rkey;
+		wr = &inv_wr;
+		/* Bump the key */
+		key = (u8)(pi_ctx->sig_mr->rkey & 0x000000FF);
+		ib_update_fast_reg_key(pi_ctx->sig_mr, ++key);
+	}
+
+	memset(&sig_wr, 0, sizeof(sig_wr));
+	sig_wr.opcode = IB_WR_REG_SIG_MR;
+	sig_wr.wr_id = ISER_FASTREG_LI_WRID;
+	sig_wr.sg_list = data_sge;
+	sig_wr.num_sge = 1;
+	sig_wr.wr.sig_handover.sig_attrs = &sig_attrs;
+	sig_wr.wr.sig_handover.sig_mr = pi_ctx->sig_mr;
+	if (scsi_prot_sg_count(iser_task->sc))
+		sig_wr.wr.sig_handover.prot = prot_sge;
+	sig_wr.wr.sig_handover.access_flags = IB_ACCESS_LOCAL_WRITE |
+					      IB_ACCESS_REMOTE_READ |
+					      IB_ACCESS_REMOTE_WRITE;
+
+	if (!wr)
+		wr = &sig_wr;
+	else
+		wr->next = &sig_wr;
+
+	ret = ib_post_send(ib_conn->qp, wr, &bad_wr);
+	if (ret) {
+		iser_err("reg_sig_mr failed, ret:%d\n", ret);
+		goto err;
+	}
+	desc->reg_indicators &= ~ISER_SIG_KEY_VALID;
+
+	sig_sge->lkey = pi_ctx->sig_mr->lkey;
+	sig_sge->addr = 0;
+	sig_sge->length = data_sge->length + prot_sge->length;
+	if (scsi_get_prot_op(iser_task->sc) == SCSI_PROT_WRITE_INSERT ||
+	    scsi_get_prot_op(iser_task->sc) == SCSI_PROT_READ_STRIP) {
+		sig_sge->length += (data_sge->length /
+				   iser_task->sc->device->sector_size) * 8;
+	}
+
+	iser_dbg("sig_sge: addr: 0x%llx  length: %u lkey: 0x%x\n",
+		 sig_sge->addr, sig_sge->length,
+		 sig_sge->lkey);
+err:
+	return ret;
+}
+
+static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task,
+			    struct iser_regd_buf *regd_buf,
+			    struct iser_data_buf *mem,
+			    enum iser_reg_indicator ind,
+			    struct ib_sge *sge)
+{
+	struct fast_reg_descriptor *desc = regd_buf->reg.mem_h;
+	struct ib_conn *ib_conn = &iser_task->iser_conn->ib_conn;
+	struct iser_device *device = ib_conn->device;
+	struct ib_device *ibdev = device->ib_device;
+	struct ib_mr *mr;
+	struct ib_fast_reg_page_list *frpl;
+	struct ib_send_wr fastreg_wr, inv_wr;
+	struct ib_send_wr *bad_wr, *wr = NULL;
+	u8 key;
+	int ret, offset, size, plen;
+
+	/* if there a single dma entry, dma mr suffices */
+	if (mem->dma_nents == 1) {
+		struct scatterlist *sg = (struct scatterlist *)mem->buf;
+
+		sge->lkey = device->mr->lkey;
+		sge->addr   = ib_sg_dma_address(ibdev, &sg[0]);
+		sge->length  = ib_sg_dma_len(ibdev, &sg[0]);
+
+		iser_dbg("Single DMA entry: lkey=0x%x, addr=0x%llx, length=0x%x\n",
+			 sge->lkey, sge->addr, sge->length);
+		return 0;
+	}
+
+	if (ind == ISER_DATA_KEY_VALID) {
+		mr = desc->data_mr;
+		frpl = desc->data_frpl;
+	} else {
+		mr = desc->pi_ctx->prot_mr;
+		frpl = desc->pi_ctx->prot_frpl;
+	}
+
+	plen = iser_sg_to_page_vec(mem, device->ib_device, frpl->page_list,
+				   &offset, &size);
+	if (plen * SIZE_4K < size) {
+		iser_err("fast reg page_list too short to hold this SG\n");
+		return -EINVAL;
+	}
+
+	if (!(desc->reg_indicators & ind)) {
+		memset(&inv_wr, 0, sizeof(inv_wr));
+		inv_wr.wr_id = ISER_FASTREG_LI_WRID;
+		inv_wr.opcode = IB_WR_LOCAL_INV;
+		inv_wr.ex.invalidate_rkey = mr->rkey;
+		wr = &inv_wr;
+		/* Bump the key */
+		key = (u8)(mr->rkey & 0x000000FF);
+		ib_update_fast_reg_key(mr, ++key);
+	}
+
+	/* Prepare FASTREG WR */
+	memset(&fastreg_wr, 0, sizeof(fastreg_wr));
+	fastreg_wr.wr_id = ISER_FASTREG_LI_WRID;
+	fastreg_wr.opcode = IB_WR_FAST_REG_MR;
+	fastreg_wr.wr.fast_reg.iova_start = frpl->page_list[0] + offset;
+	fastreg_wr.wr.fast_reg.page_list = frpl;
+	fastreg_wr.wr.fast_reg.page_list_len = plen;
+	fastreg_wr.wr.fast_reg.page_shift = SHIFT_4K;
+	fastreg_wr.wr.fast_reg.length = size;
+	fastreg_wr.wr.fast_reg.rkey = mr->rkey;
+	fastreg_wr.wr.fast_reg.access_flags = (IB_ACCESS_LOCAL_WRITE  |
+					       IB_ACCESS_REMOTE_WRITE |
+					       IB_ACCESS_REMOTE_READ);
+
+	if (!wr)
+		wr = &fastreg_wr;
+	else
+		wr->next = &fastreg_wr;
+
+	ret = ib_post_send(ib_conn->qp, wr, &bad_wr);
+	if (ret) {
+		iser_err("fast registration failed, ret:%d\n", ret);
+		return ret;
+	}
+	desc->reg_indicators &= ~ind;
+
+	sge->lkey = mr->lkey;
+	sge->addr = frpl->page_list[0] + offset;
+	sge->length = size;
+
+	return ret;
+}
+
+/**
+ * iser_reg_rdma_mem_fastreg - Registers memory intended for RDMA,
+ * using Fast Registration WR (if possible) obtaining rkey and va
+ *
+ * returns 0 on success, errno code on failure
+ */
+int iser_reg_rdma_mem_fastreg(struct iscsi_iser_task *iser_task,
+			      enum iser_data_dir cmd_dir)
+{
+	struct ib_conn *ib_conn = &iser_task->iser_conn->ib_conn;
+	struct iser_device *device = ib_conn->device;
+	struct ib_device *ibdev = device->ib_device;
+	struct iser_data_buf *mem = &iser_task->data[cmd_dir];
+	struct iser_regd_buf *regd_buf = &iser_task->rdma_regd[cmd_dir];
+	struct fast_reg_descriptor *desc = NULL;
+	struct ib_sge data_sge;
+	int err, aligned_len;
+	unsigned long flags;
+
+	aligned_len = iser_data_buf_aligned_len(mem, ibdev);
+	if (aligned_len != mem->dma_nents) {
+		err = fall_to_bounce_buf(iser_task, ibdev, mem,
+					 &iser_task->data_copy[cmd_dir],
+					 cmd_dir, aligned_len);
+		if (err) {
+			iser_err("failed to allocate bounce buffer\n");
+			return err;
+		}
+		mem = &iser_task->data_copy[cmd_dir];
+	}
+
+	if (mem->dma_nents != 1 ||
+	    scsi_get_prot_op(iser_task->sc) != SCSI_PROT_NORMAL) {
+		spin_lock_irqsave(&ib_conn->lock, flags);
+		desc = list_first_entry(&ib_conn->fastreg.pool,
+					struct fast_reg_descriptor, list);
+		list_del(&desc->list);
+		spin_unlock_irqrestore(&ib_conn->lock, flags);
+		regd_buf->reg.mem_h = desc;
+	}
+
+	err = iser_fast_reg_mr(iser_task, regd_buf, mem,
+			       ISER_DATA_KEY_VALID, &data_sge);
+	if (err)
+		goto err_reg;
+
+	if (scsi_get_prot_op(iser_task->sc) != SCSI_PROT_NORMAL) {
+		struct ib_sge prot_sge, sig_sge;
+
+		memset(&prot_sge, 0, sizeof(prot_sge));
+		if (scsi_prot_sg_count(iser_task->sc)) {
+			mem = &iser_task->prot[cmd_dir];
+			aligned_len = iser_data_buf_aligned_len(mem, ibdev);
+			if (aligned_len != mem->dma_nents) {
+				err = fall_to_bounce_buf(iser_task, ibdev, mem,
+							 &iser_task->prot_copy[cmd_dir],
+							 cmd_dir, aligned_len);
+				if (err) {
+					iser_err("failed to allocate bounce buffer\n");
+					return err;
+				}
+				mem = &iser_task->prot_copy[cmd_dir];
+			}
+
+			err = iser_fast_reg_mr(iser_task, regd_buf, mem,
+					       ISER_PROT_KEY_VALID, &prot_sge);
+			if (err)
+				goto err_reg;
+		}
+
+		err = iser_reg_sig_mr(iser_task, desc, &data_sge,
+				      &prot_sge, &sig_sge);
+		if (err) {
+			iser_err("Failed to register signature mr\n");
+			return err;
+		}
+		desc->reg_indicators |= ISER_FASTREG_PROTECTED;
+
+		regd_buf->reg.lkey = sig_sge.lkey;
+		regd_buf->reg.rkey = desc->pi_ctx->sig_mr->rkey;
+		regd_buf->reg.va = sig_sge.addr;
+		regd_buf->reg.len = sig_sge.length;
+		regd_buf->reg.is_mr = 1;
+	} else {
+		if (desc) {
+			regd_buf->reg.rkey = desc->data_mr->rkey;
+			regd_buf->reg.is_mr = 1;
+		} else {
+			regd_buf->reg.rkey = device->mr->rkey;
+			regd_buf->reg.is_mr = 0;
+		}
+
+		regd_buf->reg.lkey = data_sge.lkey;
+		regd_buf->reg.va = data_sge.addr;
+		regd_buf->reg.len = data_sge.length;
+	}
+
+	return 0;
+err_reg:
+	if (desc) {
+		spin_lock_irqsave(&ib_conn->lock, flags);
+		list_add_tail(&desc->list, &ib_conn->fastreg.pool);
+		spin_unlock_irqrestore(&ib_conn->lock, flags);
+	}
+
+	return err;
+}
diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c
new file mode 100644
index 0000000..67225bb
--- /dev/null
+++ b/drivers/infiniband/ulp/iser/iser_verbs.c
@@ -0,0 +1,1310 @@
+/*
+ * Copyright (c) 2004, 2005, 2006 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2005, 2006 Cisco Systems.  All rights reserved.
+ * Copyright (c) 2013-2014 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *	- Redistributions of source code must retain the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer.
+ *
+ *	- Redistributions in binary form must reproduce the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer in the documentation and/or other materials
+ *	  provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/delay.h>
+
+#include "iscsi_iser.h"
+
+#define ISCSI_ISER_MAX_CONN	8
+#define ISER_MAX_RX_LEN		(ISER_QP_MAX_RECV_DTOS * ISCSI_ISER_MAX_CONN)
+#define ISER_MAX_TX_LEN		(ISER_QP_MAX_REQ_DTOS  * ISCSI_ISER_MAX_CONN)
+#define ISER_MAX_CQ_LEN		(ISER_MAX_RX_LEN + ISER_MAX_TX_LEN + \
+				 ISCSI_ISER_MAX_CONN)
+
+static int iser_cq_poll_limit = 512;
+
+static void iser_cq_tasklet_fn(unsigned long data);
+static void iser_cq_callback(struct ib_cq *cq, void *cq_context);
+
+static void iser_cq_event_callback(struct ib_event *cause, void *context)
+{
+	iser_err("got cq event %d \n", cause->event);
+}
+
+static void iser_qp_event_callback(struct ib_event *cause, void *context)
+{
+	iser_err("got qp event %d\n",cause->event);
+}
+
+static void iser_event_handler(struct ib_event_handler *handler,
+				struct ib_event *event)
+{
+	iser_err("async event %d on device %s port %d\n", event->event,
+		event->device->name, event->element.port_num);
+}
+
+/**
+ * iser_create_device_ib_res - creates Protection Domain (PD), Completion
+ * Queue (CQ), DMA Memory Region (DMA MR) with the device associated with
+ * the adapator.
+ *
+ * returns 0 on success, -1 on failure
+ */
+static int iser_create_device_ib_res(struct iser_device *device)
+{
+	struct ib_device_attr *dev_attr = &device->dev_attr;
+	int ret, i;
+
+	ret = ib_query_device(device->ib_device, dev_attr);
+	if (ret) {
+		pr_warn("Query device failed for %s\n", device->ib_device->name);
+		return ret;
+	}
+
+	/* Assign function handles  - based on FMR support */
+	if (device->ib_device->alloc_fmr && device->ib_device->dealloc_fmr &&
+	    device->ib_device->map_phys_fmr && device->ib_device->unmap_fmr) {
+		iser_info("FMR supported, using FMR for registration\n");
+		device->iser_alloc_rdma_reg_res = iser_create_fmr_pool;
+		device->iser_free_rdma_reg_res = iser_free_fmr_pool;
+		device->iser_reg_rdma_mem = iser_reg_rdma_mem_fmr;
+		device->iser_unreg_rdma_mem = iser_unreg_mem_fmr;
+	} else
+	if (dev_attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) {
+		iser_info("FastReg supported, using FastReg for registration\n");
+		device->iser_alloc_rdma_reg_res = iser_create_fastreg_pool;
+		device->iser_free_rdma_reg_res = iser_free_fastreg_pool;
+		device->iser_reg_rdma_mem = iser_reg_rdma_mem_fastreg;
+		device->iser_unreg_rdma_mem = iser_unreg_mem_fastreg;
+	} else {
+		iser_err("IB device does not support FMRs nor FastRegs, can't register memory\n");
+		return -1;
+	}
+
+	device->comps_used = min(ISER_MAX_CQ,
+				 device->ib_device->num_comp_vectors);
+	iser_info("using %d CQs, device %s supports %d vectors\n",
+		  device->comps_used, device->ib_device->name,
+		  device->ib_device->num_comp_vectors);
+
+	device->pd = ib_alloc_pd(device->ib_device);
+	if (IS_ERR(device->pd))
+		goto pd_err;
+
+	for (i = 0; i < device->comps_used; i++) {
+		struct iser_comp *comp = &device->comps[i];
+
+		comp->device = device;
+		comp->cq = ib_create_cq(device->ib_device,
+					iser_cq_callback,
+					iser_cq_event_callback,
+					(void *)comp,
+					ISER_MAX_CQ_LEN, i);
+		if (IS_ERR(comp->cq)) {
+			comp->cq = NULL;
+			goto cq_err;
+		}
+
+		if (ib_req_notify_cq(comp->cq, IB_CQ_NEXT_COMP))
+			goto cq_err;
+
+		tasklet_init(&comp->tasklet, iser_cq_tasklet_fn,
+			     (unsigned long)comp);
+	}
+
+	device->mr = ib_get_dma_mr(device->pd, IB_ACCESS_LOCAL_WRITE |
+				   IB_ACCESS_REMOTE_WRITE |
+				   IB_ACCESS_REMOTE_READ);
+	if (IS_ERR(device->mr))
+		goto dma_mr_err;
+
+	INIT_IB_EVENT_HANDLER(&device->event_handler, device->ib_device,
+				iser_event_handler);
+	if (ib_register_event_handler(&device->event_handler))
+		goto handler_err;
+
+	return 0;
+
+handler_err:
+	ib_dereg_mr(device->mr);
+dma_mr_err:
+	for (i = 0; i < device->comps_used; i++)
+		tasklet_kill(&device->comps[i].tasklet);
+cq_err:
+	for (i = 0; i < device->comps_used; i++) {
+		struct iser_comp *comp = &device->comps[i];
+
+		if (comp->cq)
+			ib_destroy_cq(comp->cq);
+	}
+	ib_dealloc_pd(device->pd);
+pd_err:
+	iser_err("failed to allocate an IB resource\n");
+	return -1;
+}
+
+/**
+ * iser_free_device_ib_res - destroy/dealloc/dereg the DMA MR,
+ * CQ and PD created with the device associated with the adapator.
+ */
+static void iser_free_device_ib_res(struct iser_device *device)
+{
+	int i;
+	BUG_ON(device->mr == NULL);
+
+	for (i = 0; i < device->comps_used; i++) {
+		struct iser_comp *comp = &device->comps[i];
+
+		tasklet_kill(&comp->tasklet);
+		ib_destroy_cq(comp->cq);
+		comp->cq = NULL;
+	}
+
+	(void)ib_unregister_event_handler(&device->event_handler);
+	(void)ib_dereg_mr(device->mr);
+	(void)ib_dealloc_pd(device->pd);
+
+	device->mr = NULL;
+	device->pd = NULL;
+}
+
+/**
+ * iser_create_fmr_pool - Creates FMR pool and page_vector
+ *
+ * returns 0 on success, or errno code on failure
+ */
+int iser_create_fmr_pool(struct ib_conn *ib_conn, unsigned cmds_max)
+{
+	struct iser_device *device = ib_conn->device;
+	struct ib_fmr_pool_param params;
+	int ret = -ENOMEM;
+
+	ib_conn->fmr.page_vec = kmalloc(sizeof(*ib_conn->fmr.page_vec) +
+					(sizeof(u64)*(ISCSI_ISER_SG_TABLESIZE + 1)),
+					GFP_KERNEL);
+	if (!ib_conn->fmr.page_vec)
+		return ret;
+
+	ib_conn->fmr.page_vec->pages = (u64 *)(ib_conn->fmr.page_vec + 1);
+
+	params.page_shift        = SHIFT_4K;
+	/* when the first/last SG element are not start/end *
+	 * page aligned, the map whould be of N+1 pages     */
+	params.max_pages_per_fmr = ISCSI_ISER_SG_TABLESIZE + 1;
+	/* make the pool size twice the max number of SCSI commands *
+	 * the ML is expected to queue, watermark for unmap at 50%  */
+	params.pool_size	 = cmds_max * 2;
+	params.dirty_watermark	 = cmds_max;
+	params.cache		 = 0;
+	params.flush_function	 = NULL;
+	params.access		 = (IB_ACCESS_LOCAL_WRITE  |
+				    IB_ACCESS_REMOTE_WRITE |
+				    IB_ACCESS_REMOTE_READ);
+
+	ib_conn->fmr.pool = ib_create_fmr_pool(device->pd, &params);
+	if (!IS_ERR(ib_conn->fmr.pool))
+		return 0;
+
+	/* no FMR => no need for page_vec */
+	kfree(ib_conn->fmr.page_vec);
+	ib_conn->fmr.page_vec = NULL;
+
+	ret = PTR_ERR(ib_conn->fmr.pool);
+	ib_conn->fmr.pool = NULL;
+	if (ret != -ENOSYS) {
+		iser_err("FMR allocation failed, err %d\n", ret);
+		return ret;
+	} else {
+		iser_warn("FMRs are not supported, using unaligned mode\n");
+		return 0;
+	}
+}
+
+/**
+ * iser_free_fmr_pool - releases the FMR pool and page vec
+ */
+void iser_free_fmr_pool(struct ib_conn *ib_conn)
+{
+	iser_info("freeing conn %p fmr pool %p\n",
+		  ib_conn, ib_conn->fmr.pool);
+
+	if (ib_conn->fmr.pool != NULL)
+		ib_destroy_fmr_pool(ib_conn->fmr.pool);
+
+	ib_conn->fmr.pool = NULL;
+
+	kfree(ib_conn->fmr.page_vec);
+	ib_conn->fmr.page_vec = NULL;
+}
+
+static int
+iser_create_fastreg_desc(struct ib_device *ib_device, struct ib_pd *pd,
+			 bool pi_enable, struct fast_reg_descriptor *desc)
+{
+	int ret;
+
+	desc->data_frpl = ib_alloc_fast_reg_page_list(ib_device,
+						      ISCSI_ISER_SG_TABLESIZE + 1);
+	if (IS_ERR(desc->data_frpl)) {
+		ret = PTR_ERR(desc->data_frpl);
+		iser_err("Failed to allocate ib_fast_reg_page_list err=%d\n",
+			 ret);
+		return PTR_ERR(desc->data_frpl);
+	}
+
+	desc->data_mr = ib_alloc_fast_reg_mr(pd, ISCSI_ISER_SG_TABLESIZE + 1);
+	if (IS_ERR(desc->data_mr)) {
+		ret = PTR_ERR(desc->data_mr);
+		iser_err("Failed to allocate ib_fast_reg_mr err=%d\n", ret);
+		goto fast_reg_mr_failure;
+	}
+	desc->reg_indicators |= ISER_DATA_KEY_VALID;
+
+	if (pi_enable) {
+		struct ib_mr_init_attr mr_init_attr = {0};
+		struct iser_pi_context *pi_ctx = NULL;
+
+		desc->pi_ctx = kzalloc(sizeof(*desc->pi_ctx), GFP_KERNEL);
+		if (!desc->pi_ctx) {
+			iser_err("Failed to allocate pi context\n");
+			ret = -ENOMEM;
+			goto pi_ctx_alloc_failure;
+		}
+		pi_ctx = desc->pi_ctx;
+
+		pi_ctx->prot_frpl = ib_alloc_fast_reg_page_list(ib_device,
+						    ISCSI_ISER_SG_TABLESIZE);
+		if (IS_ERR(pi_ctx->prot_frpl)) {
+			ret = PTR_ERR(pi_ctx->prot_frpl);
+			iser_err("Failed to allocate prot frpl ret=%d\n",
+				 ret);
+			goto prot_frpl_failure;
+		}
+
+		pi_ctx->prot_mr = ib_alloc_fast_reg_mr(pd,
+						ISCSI_ISER_SG_TABLESIZE + 1);
+		if (IS_ERR(pi_ctx->prot_mr)) {
+			ret = PTR_ERR(pi_ctx->prot_mr);
+			iser_err("Failed to allocate prot frmr ret=%d\n",
+				 ret);
+			goto prot_mr_failure;
+		}
+		desc->reg_indicators |= ISER_PROT_KEY_VALID;
+
+		mr_init_attr.max_reg_descriptors = 2;
+		mr_init_attr.flags |= IB_MR_SIGNATURE_EN;
+		pi_ctx->sig_mr = ib_create_mr(pd, &mr_init_attr);
+		if (IS_ERR(pi_ctx->sig_mr)) {
+			ret = PTR_ERR(pi_ctx->sig_mr);
+			iser_err("Failed to allocate signature enabled mr err=%d\n",
+				 ret);
+			goto sig_mr_failure;
+		}
+		desc->reg_indicators |= ISER_SIG_KEY_VALID;
+	}
+	desc->reg_indicators &= ~ISER_FASTREG_PROTECTED;
+
+	iser_dbg("Create fr_desc %p page_list %p\n",
+		 desc, desc->data_frpl->page_list);
+
+	return 0;
+sig_mr_failure:
+	ib_dereg_mr(desc->pi_ctx->prot_mr);
+prot_mr_failure:
+	ib_free_fast_reg_page_list(desc->pi_ctx->prot_frpl);
+prot_frpl_failure:
+	kfree(desc->pi_ctx);
+pi_ctx_alloc_failure:
+	ib_dereg_mr(desc->data_mr);
+fast_reg_mr_failure:
+	ib_free_fast_reg_page_list(desc->data_frpl);
+
+	return ret;
+}
+
+/**
+ * iser_create_fastreg_pool - Creates pool of fast_reg descriptors
+ * for fast registration work requests.
+ * returns 0 on success, or errno code on failure
+ */
+int iser_create_fastreg_pool(struct ib_conn *ib_conn, unsigned cmds_max)
+{
+	struct iser_device *device = ib_conn->device;
+	struct fast_reg_descriptor *desc;
+	int i, ret;
+
+	INIT_LIST_HEAD(&ib_conn->fastreg.pool);
+	ib_conn->fastreg.pool_size = 0;
+	for (i = 0; i < cmds_max; i++) {
+		desc = kzalloc(sizeof(*desc), GFP_KERNEL);
+		if (!desc) {
+			iser_err("Failed to allocate a new fast_reg descriptor\n");
+			ret = -ENOMEM;
+			goto err;
+		}
+
+		ret = iser_create_fastreg_desc(device->ib_device, device->pd,
+					       ib_conn->pi_support, desc);
+		if (ret) {
+			iser_err("Failed to create fastreg descriptor err=%d\n",
+				 ret);
+			kfree(desc);
+			goto err;
+		}
+
+		list_add_tail(&desc->list, &ib_conn->fastreg.pool);
+		ib_conn->fastreg.pool_size++;
+	}
+
+	return 0;
+
+err:
+	iser_free_fastreg_pool(ib_conn);
+	return ret;
+}
+
+/**
+ * iser_free_fastreg_pool - releases the pool of fast_reg descriptors
+ */
+void iser_free_fastreg_pool(struct ib_conn *ib_conn)
+{
+	struct fast_reg_descriptor *desc, *tmp;
+	int i = 0;
+
+	if (list_empty(&ib_conn->fastreg.pool))
+		return;
+
+	iser_info("freeing conn %p fr pool\n", ib_conn);
+
+	list_for_each_entry_safe(desc, tmp, &ib_conn->fastreg.pool, list) {
+		list_del(&desc->list);
+		ib_free_fast_reg_page_list(desc->data_frpl);
+		ib_dereg_mr(desc->data_mr);
+		if (desc->pi_ctx) {
+			ib_free_fast_reg_page_list(desc->pi_ctx->prot_frpl);
+			ib_dereg_mr(desc->pi_ctx->prot_mr);
+			ib_destroy_mr(desc->pi_ctx->sig_mr);
+			kfree(desc->pi_ctx);
+		}
+		kfree(desc);
+		++i;
+	}
+
+	if (i < ib_conn->fastreg.pool_size)
+		iser_warn("pool still has %d regions registered\n",
+			  ib_conn->fastreg.pool_size - i);
+}
+
+/**
+ * iser_create_ib_conn_res - Queue-Pair (QP)
+ *
+ * returns 0 on success, -1 on failure
+ */
+static int iser_create_ib_conn_res(struct ib_conn *ib_conn)
+{
+	struct iser_device	*device;
+	struct ib_qp_init_attr	init_attr;
+	int			ret = -ENOMEM;
+	int index, min_index = 0;
+
+	BUG_ON(ib_conn->device == NULL);
+
+	device = ib_conn->device;
+
+	memset(&init_attr, 0, sizeof init_attr);
+
+	mutex_lock(&ig.connlist_mutex);
+	/* select the CQ with the minimal number of usages */
+	for (index = 0; index < device->comps_used; index++) {
+		if (device->comps[index].active_qps <
+		    device->comps[min_index].active_qps)
+			min_index = index;
+	}
+	ib_conn->comp = &device->comps[min_index];
+	ib_conn->comp->active_qps++;
+	mutex_unlock(&ig.connlist_mutex);
+	iser_info("cq index %d used for ib_conn %p\n", min_index, ib_conn);
+
+	init_attr.event_handler = iser_qp_event_callback;
+	init_attr.qp_context	= (void *)ib_conn;
+	init_attr.send_cq	= ib_conn->comp->cq;
+	init_attr.recv_cq	= ib_conn->comp->cq;
+	init_attr.cap.max_recv_wr  = ISER_QP_MAX_RECV_DTOS;
+	init_attr.cap.max_send_sge = 2;
+	init_attr.cap.max_recv_sge = 1;
+	init_attr.sq_sig_type	= IB_SIGNAL_REQ_WR;
+	init_attr.qp_type	= IB_QPT_RC;
+	if (ib_conn->pi_support) {
+		init_attr.cap.max_send_wr = ISER_QP_SIG_MAX_REQ_DTOS + 1;
+		init_attr.create_flags |= IB_QP_CREATE_SIGNATURE_EN;
+	} else {
+		init_attr.cap.max_send_wr  = ISER_QP_MAX_REQ_DTOS + 1;
+	}
+
+	ret = rdma_create_qp(ib_conn->cma_id, device->pd, &init_attr);
+	if (ret)
+		goto out_err;
+
+	ib_conn->qp = ib_conn->cma_id->qp;
+	iser_info("setting conn %p cma_id %p qp %p\n",
+		  ib_conn, ib_conn->cma_id,
+		  ib_conn->cma_id->qp);
+	return ret;
+
+out_err:
+	iser_err("unable to alloc mem or create resource, err %d\n", ret);
+	return ret;
+}
+
+/**
+ * based on the resolved device node GUID see if there already allocated
+ * device for this device. If there's no such, create one.
+ */
+static
+struct iser_device *iser_device_find_by_ib_device(struct rdma_cm_id *cma_id)
+{
+	struct iser_device *device;
+
+	mutex_lock(&ig.device_list_mutex);
+
+	list_for_each_entry(device, &ig.device_list, ig_list)
+		/* find if there's a match using the node GUID */
+		if (device->ib_device->node_guid == cma_id->device->node_guid)
+			goto inc_refcnt;
+
+	device = kzalloc(sizeof *device, GFP_KERNEL);
+	if (device == NULL)
+		goto out;
+
+	/* assign this device to the device */
+	device->ib_device = cma_id->device;
+	/* init the device and link it into ig device list */
+	if (iser_create_device_ib_res(device)) {
+		kfree(device);
+		device = NULL;
+		goto out;
+	}
+	list_add(&device->ig_list, &ig.device_list);
+
+inc_refcnt:
+	device->refcount++;
+out:
+	mutex_unlock(&ig.device_list_mutex);
+	return device;
+}
+
+/* if there's no demand for this device, release it */
+static void iser_device_try_release(struct iser_device *device)
+{
+	mutex_lock(&ig.device_list_mutex);
+	device->refcount--;
+	iser_info("device %p refcount %d\n", device, device->refcount);
+	if (!device->refcount) {
+		iser_free_device_ib_res(device);
+		list_del(&device->ig_list);
+		kfree(device);
+	}
+	mutex_unlock(&ig.device_list_mutex);
+}
+
+/**
+ * Called with state mutex held
+ **/
+static int iser_conn_state_comp_exch(struct iser_conn *iser_conn,
+				     enum iser_conn_state comp,
+				     enum iser_conn_state exch)
+{
+	int ret;
+
+	ret = (iser_conn->state == comp);
+	if (ret)
+		iser_conn->state = exch;
+
+	return ret;
+}
+
+void iser_release_work(struct work_struct *work)
+{
+	struct iser_conn *iser_conn;
+
+	iser_conn = container_of(work, struct iser_conn, release_work);
+
+	/* Wait for conn_stop to complete */
+	wait_for_completion(&iser_conn->stop_completion);
+	/* Wait for IB resouces cleanup to complete */
+	wait_for_completion(&iser_conn->ib_completion);
+
+	mutex_lock(&iser_conn->state_mutex);
+	iser_conn->state = ISER_CONN_DOWN;
+	mutex_unlock(&iser_conn->state_mutex);
+
+	iser_conn_release(iser_conn);
+}
+
+/**
+ * iser_free_ib_conn_res - release IB related resources
+ * @iser_conn: iser connection struct
+ * @destroy_device: indicator if we need to try to release
+ *     the iser device (only iscsi shutdown and DEVICE_REMOVAL
+ *     will use this.
+ *
+ * This routine is called with the iser state mutex held
+ * so the cm_id removal is out of here. It is Safe to
+ * be invoked multiple times.
+ */
+static void iser_free_ib_conn_res(struct iser_conn *iser_conn,
+				  bool destroy_device)
+{
+	struct ib_conn *ib_conn = &iser_conn->ib_conn;
+	struct iser_device *device = ib_conn->device;
+
+	iser_info("freeing conn %p cma_id %p qp %p\n",
+		  iser_conn, ib_conn->cma_id, ib_conn->qp);
+
+	iser_free_rx_descriptors(iser_conn);
+
+	if (ib_conn->qp != NULL) {
+		ib_conn->comp->active_qps--;
+		rdma_destroy_qp(ib_conn->cma_id);
+		ib_conn->qp = NULL;
+	}
+
+	if (destroy_device && device != NULL) {
+		iser_device_try_release(device);
+		ib_conn->device = NULL;
+	}
+}
+
+/**
+ * Frees all conn objects and deallocs conn descriptor
+ */
+void iser_conn_release(struct iser_conn *iser_conn)
+{
+	struct ib_conn *ib_conn = &iser_conn->ib_conn;
+
+	mutex_lock(&ig.connlist_mutex);
+	list_del(&iser_conn->conn_list);
+	mutex_unlock(&ig.connlist_mutex);
+
+	mutex_lock(&iser_conn->state_mutex);
+	if (iser_conn->state != ISER_CONN_DOWN)
+		iser_warn("iser conn %p state %d, expected state down.\n",
+			  iser_conn, iser_conn->state);
+	/*
+	 * In case we never got to bind stage, we still need to
+	 * release IB resources (which is safe to call more than once).
+	 */
+	iser_free_ib_conn_res(iser_conn, true);
+	mutex_unlock(&iser_conn->state_mutex);
+
+	if (ib_conn->cma_id != NULL) {
+		rdma_destroy_id(ib_conn->cma_id);
+		ib_conn->cma_id = NULL;
+	}
+
+	kfree(iser_conn);
+}
+
+/**
+ * triggers start of the disconnect procedures and wait for them to be done
+ * Called with state mutex held
+ */
+int iser_conn_terminate(struct iser_conn *iser_conn)
+{
+	struct ib_conn *ib_conn = &iser_conn->ib_conn;
+	struct ib_send_wr *bad_wr;
+	int err = 0;
+
+	/* terminate the iser conn only if the conn state is UP */
+	if (!iser_conn_state_comp_exch(iser_conn, ISER_CONN_UP,
+				       ISER_CONN_TERMINATING))
+		return 0;
+
+	iser_info("iser_conn %p state %d\n", iser_conn, iser_conn->state);
+
+	/* suspend queuing of new iscsi commands */
+	if (iser_conn->iscsi_conn)
+		iscsi_suspend_queue(iser_conn->iscsi_conn);
+
+	/*
+	 * In case we didn't already clean up the cma_id (peer initiated
+	 * a disconnection), we need to Cause the CMA to change the QP
+	 * state to ERROR.
+	 */
+	if (ib_conn->cma_id) {
+		err = rdma_disconnect(ib_conn->cma_id);
+		if (err)
+			iser_err("Failed to disconnect, conn: 0x%p err %d\n",
+				 iser_conn, err);
+
+		/* post an indication that all flush errors were consumed */
+		err = ib_post_send(ib_conn->qp, &ib_conn->beacon, &bad_wr);
+		if (err)
+			iser_err("conn %p failed to post beacon", ib_conn);
+
+		wait_for_completion(&ib_conn->flush_comp);
+	}
+
+	return 1;
+}
+
+/**
+ * Called with state mutex held
+ **/
+static void iser_connect_error(struct rdma_cm_id *cma_id)
+{
+	struct iser_conn *iser_conn;
+
+	iser_conn = (struct iser_conn *)cma_id->context;
+	iser_conn->state = ISER_CONN_DOWN;
+}
+
+/**
+ * Called with state mutex held
+ **/
+static void iser_addr_handler(struct rdma_cm_id *cma_id)
+{
+	struct iser_device *device;
+	struct iser_conn   *iser_conn;
+	struct ib_conn   *ib_conn;
+	int    ret;
+
+	iser_conn = (struct iser_conn *)cma_id->context;
+	if (iser_conn->state != ISER_CONN_PENDING)
+		/* bailout */
+		return;
+
+	ib_conn = &iser_conn->ib_conn;
+	device = iser_device_find_by_ib_device(cma_id);
+	if (!device) {
+		iser_err("device lookup/creation failed\n");
+		iser_connect_error(cma_id);
+		return;
+	}
+
+	ib_conn->device = device;
+
+	/* connection T10-PI support */
+	if (iser_pi_enable) {
+		if (!(device->dev_attr.device_cap_flags &
+		      IB_DEVICE_SIGNATURE_HANDOVER)) {
+			iser_warn("T10-PI requested but not supported on %s, "
+				  "continue without T10-PI\n",
+				  ib_conn->device->ib_device->name);
+			ib_conn->pi_support = false;
+		} else {
+			ib_conn->pi_support = true;
+		}
+	}
+
+	ret = rdma_resolve_route(cma_id, 1000);
+	if (ret) {
+		iser_err("resolve route failed: %d\n", ret);
+		iser_connect_error(cma_id);
+		return;
+	}
+}
+
+/**
+ * Called with state mutex held
+ **/
+static void iser_route_handler(struct rdma_cm_id *cma_id)
+{
+	struct rdma_conn_param conn_param;
+	int    ret;
+	struct iser_cm_hdr req_hdr;
+	struct iser_conn *iser_conn = (struct iser_conn *)cma_id->context;
+	struct ib_conn *ib_conn = &iser_conn->ib_conn;
+	struct iser_device *device = ib_conn->device;
+
+	if (iser_conn->state != ISER_CONN_PENDING)
+		/* bailout */
+		return;
+
+	ret = iser_create_ib_conn_res(ib_conn);
+	if (ret)
+		goto failure;
+
+	memset(&conn_param, 0, sizeof conn_param);
+	conn_param.responder_resources = device->dev_attr.max_qp_rd_atom;
+	conn_param.initiator_depth     = 1;
+	conn_param.retry_count	       = 7;
+	conn_param.rnr_retry_count     = 6;
+
+	memset(&req_hdr, 0, sizeof(req_hdr));
+	req_hdr.flags = (ISER_ZBVA_NOT_SUPPORTED |
+			ISER_SEND_W_INV_NOT_SUPPORTED);
+	conn_param.private_data		= (void *)&req_hdr;
+	conn_param.private_data_len	= sizeof(struct iser_cm_hdr);
+
+	ret = rdma_connect(cma_id, &conn_param);
+	if (ret) {
+		iser_err("failure connecting: %d\n", ret);
+		goto failure;
+	}
+
+	return;
+failure:
+	iser_connect_error(cma_id);
+}
+
+static void iser_connected_handler(struct rdma_cm_id *cma_id)
+{
+	struct iser_conn *iser_conn;
+	struct ib_qp_attr attr;
+	struct ib_qp_init_attr init_attr;
+
+	iser_conn = (struct iser_conn *)cma_id->context;
+	if (iser_conn->state != ISER_CONN_PENDING)
+		/* bailout */
+		return;
+
+	(void)ib_query_qp(cma_id->qp, &attr, ~0, &init_attr);
+	iser_info("remote qpn:%x my qpn:%x\n", attr.dest_qp_num, cma_id->qp->qp_num);
+
+	iser_conn->state = ISER_CONN_UP;
+	complete(&iser_conn->up_completion);
+}
+
+static void iser_disconnected_handler(struct rdma_cm_id *cma_id)
+{
+	struct iser_conn *iser_conn = (struct iser_conn *)cma_id->context;
+
+	if (iser_conn_terminate(iser_conn)) {
+		if (iser_conn->iscsi_conn)
+			iscsi_conn_failure(iser_conn->iscsi_conn,
+					   ISCSI_ERR_CONN_FAILED);
+		else
+			iser_err("iscsi_iser connection isn't bound\n");
+	}
+}
+
+static void iser_cleanup_handler(struct rdma_cm_id *cma_id,
+				 bool destroy_device)
+{
+	struct iser_conn *iser_conn = (struct iser_conn *)cma_id->context;
+
+	/*
+	 * We are not guaranteed that we visited disconnected_handler
+	 * by now, call it here to be safe that we handle CM drep
+	 * and flush errors.
+	 */
+	iser_disconnected_handler(cma_id);
+	iser_free_ib_conn_res(iser_conn, destroy_device);
+	complete(&iser_conn->ib_completion);
+};
+
+static int iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event)
+{
+	struct iser_conn *iser_conn;
+	int ret = 0;
+
+	iser_conn = (struct iser_conn *)cma_id->context;
+	iser_info("event %d status %d conn %p id %p\n",
+		  event->event, event->status, cma_id->context, cma_id);
+
+	mutex_lock(&iser_conn->state_mutex);
+	switch (event->event) {
+	case RDMA_CM_EVENT_ADDR_RESOLVED:
+		iser_addr_handler(cma_id);
+		break;
+	case RDMA_CM_EVENT_ROUTE_RESOLVED:
+		iser_route_handler(cma_id);
+		break;
+	case RDMA_CM_EVENT_ESTABLISHED:
+		iser_connected_handler(cma_id);
+		break;
+	case RDMA_CM_EVENT_ADDR_ERROR:
+	case RDMA_CM_EVENT_ROUTE_ERROR:
+	case RDMA_CM_EVENT_CONNECT_ERROR:
+	case RDMA_CM_EVENT_UNREACHABLE:
+	case RDMA_CM_EVENT_REJECTED:
+		iser_connect_error(cma_id);
+		break;
+	case RDMA_CM_EVENT_DISCONNECTED:
+	case RDMA_CM_EVENT_ADDR_CHANGE:
+		iser_disconnected_handler(cma_id);
+		break;
+	case RDMA_CM_EVENT_DEVICE_REMOVAL:
+		/*
+		 * we *must* destroy the device as we cannot rely
+		 * on iscsid to be around to initiate error handling.
+		 * also implicitly destroy the cma_id.
+		 */
+		iser_cleanup_handler(cma_id, true);
+		iser_conn->ib_conn.cma_id = NULL;
+		ret = 1;
+		break;
+	case RDMA_CM_EVENT_TIMEWAIT_EXIT:
+		iser_cleanup_handler(cma_id, false);
+		break;
+	default:
+		iser_err("Unexpected RDMA CM event (%d)\n", event->event);
+		break;
+	}
+	mutex_unlock(&iser_conn->state_mutex);
+
+	return ret;
+}
+
+void iser_conn_init(struct iser_conn *iser_conn)
+{
+	iser_conn->state = ISER_CONN_INIT;
+	iser_conn->ib_conn.post_recv_buf_count = 0;
+	init_completion(&iser_conn->ib_conn.flush_comp);
+	init_completion(&iser_conn->stop_completion);
+	init_completion(&iser_conn->ib_completion);
+	init_completion(&iser_conn->up_completion);
+	INIT_LIST_HEAD(&iser_conn->conn_list);
+	spin_lock_init(&iser_conn->ib_conn.lock);
+	mutex_init(&iser_conn->state_mutex);
+}
+
+ /**
+ * starts the process of connecting to the target
+ * sleeps until the connection is established or rejected
+ */
+int iser_connect(struct iser_conn   *iser_conn,
+		 struct sockaddr    *src_addr,
+		 struct sockaddr    *dst_addr,
+		 int                 non_blocking)
+{
+	struct ib_conn *ib_conn = &iser_conn->ib_conn;
+	int err = 0;
+
+	mutex_lock(&iser_conn->state_mutex);
+
+	sprintf(iser_conn->name, "%pISp", dst_addr);
+
+	iser_info("connecting to: %s\n", iser_conn->name);
+
+	/* the device is known only --after-- address resolution */
+	ib_conn->device = NULL;
+
+	iser_conn->state = ISER_CONN_PENDING;
+
+	ib_conn->beacon.wr_id = ISER_BEACON_WRID;
+	ib_conn->beacon.opcode = IB_WR_SEND;
+
+	ib_conn->cma_id = rdma_create_id(iser_cma_handler,
+					 (void *)iser_conn,
+					 RDMA_PS_TCP, IB_QPT_RC);
+	if (IS_ERR(ib_conn->cma_id)) {
+		err = PTR_ERR(ib_conn->cma_id);
+		iser_err("rdma_create_id failed: %d\n", err);
+		goto id_failure;
+	}
+
+	err = rdma_resolve_addr(ib_conn->cma_id, src_addr, dst_addr, 1000);
+	if (err) {
+		iser_err("rdma_resolve_addr failed: %d\n", err);
+		goto addr_failure;
+	}
+
+	if (!non_blocking) {
+		wait_for_completion_interruptible(&iser_conn->up_completion);
+
+		if (iser_conn->state != ISER_CONN_UP) {
+			err =  -EIO;
+			goto connect_failure;
+		}
+	}
+	mutex_unlock(&iser_conn->state_mutex);
+
+	mutex_lock(&ig.connlist_mutex);
+	list_add(&iser_conn->conn_list, &ig.connlist);
+	mutex_unlock(&ig.connlist_mutex);
+	return 0;
+
+id_failure:
+	ib_conn->cma_id = NULL;
+addr_failure:
+	iser_conn->state = ISER_CONN_DOWN;
+connect_failure:
+	mutex_unlock(&iser_conn->state_mutex);
+	iser_conn_release(iser_conn);
+	return err;
+}
+
+/**
+ * iser_reg_page_vec - Register physical memory
+ *
+ * returns: 0 on success, errno code on failure
+ */
+int iser_reg_page_vec(struct ib_conn *ib_conn,
+		      struct iser_page_vec *page_vec,
+		      struct iser_mem_reg  *mem_reg)
+{
+	struct ib_pool_fmr *mem;
+	u64		   io_addr;
+	u64		   *page_list;
+	int		   status;
+
+	page_list = page_vec->pages;
+	io_addr	  = page_list[0];
+
+	mem  = ib_fmr_pool_map_phys(ib_conn->fmr.pool,
+				    page_list,
+				    page_vec->length,
+				    io_addr);
+
+	if (IS_ERR(mem)) {
+		status = (int)PTR_ERR(mem);
+		iser_err("ib_fmr_pool_map_phys failed: %d\n", status);
+		return status;
+	}
+
+	mem_reg->lkey  = mem->fmr->lkey;
+	mem_reg->rkey  = mem->fmr->rkey;
+	mem_reg->len   = page_vec->length * SIZE_4K;
+	mem_reg->va    = io_addr;
+	mem_reg->is_mr = 1;
+	mem_reg->mem_h = (void *)mem;
+
+	mem_reg->va   += page_vec->offset;
+	mem_reg->len   = page_vec->data_size;
+
+	iser_dbg("PHYSICAL Mem.register, [PHYS p_array: 0x%p, sz: %d, "
+		 "entry[0]: (0x%08lx,%ld)] -> "
+		 "[lkey: 0x%08X mem_h: 0x%p va: 0x%08lX sz: %ld]\n",
+		 page_vec, page_vec->length,
+		 (unsigned long)page_vec->pages[0],
+		 (unsigned long)page_vec->data_size,
+		 (unsigned int)mem_reg->lkey, mem_reg->mem_h,
+		 (unsigned long)mem_reg->va, (unsigned long)mem_reg->len);
+	return 0;
+}
+
+/**
+ * Unregister (previosuly registered using FMR) memory.
+ * If memory is non-FMR does nothing.
+ */
+void iser_unreg_mem_fmr(struct iscsi_iser_task *iser_task,
+			enum iser_data_dir cmd_dir)
+{
+	struct iser_mem_reg *reg = &iser_task->rdma_regd[cmd_dir].reg;
+	int ret;
+
+	if (!reg->is_mr)
+		return;
+
+	iser_dbg("PHYSICAL Mem.Unregister mem_h %p\n",reg->mem_h);
+
+	ret = ib_fmr_pool_unmap((struct ib_pool_fmr *)reg->mem_h);
+	if (ret)
+		iser_err("ib_fmr_pool_unmap failed %d\n", ret);
+
+	reg->mem_h = NULL;
+}
+
+void iser_unreg_mem_fastreg(struct iscsi_iser_task *iser_task,
+			    enum iser_data_dir cmd_dir)
+{
+	struct iser_mem_reg *reg = &iser_task->rdma_regd[cmd_dir].reg;
+	struct iser_conn *iser_conn = iser_task->iser_conn;
+	struct ib_conn *ib_conn = &iser_conn->ib_conn;
+	struct fast_reg_descriptor *desc = reg->mem_h;
+
+	if (!reg->is_mr)
+		return;
+
+	reg->mem_h = NULL;
+	reg->is_mr = 0;
+	spin_lock_bh(&ib_conn->lock);
+	list_add_tail(&desc->list, &ib_conn->fastreg.pool);
+	spin_unlock_bh(&ib_conn->lock);
+}
+
+int iser_post_recvl(struct iser_conn *iser_conn)
+{
+	struct ib_recv_wr rx_wr, *rx_wr_failed;
+	struct ib_conn *ib_conn = &iser_conn->ib_conn;
+	struct ib_sge	  sge;
+	int ib_ret;
+
+	sge.addr   = iser_conn->login_resp_dma;
+	sge.length = ISER_RX_LOGIN_SIZE;
+	sge.lkey   = ib_conn->device->mr->lkey;
+
+	rx_wr.wr_id   = (unsigned long)iser_conn->login_resp_buf;
+	rx_wr.sg_list = &sge;
+	rx_wr.num_sge = 1;
+	rx_wr.next    = NULL;
+
+	ib_conn->post_recv_buf_count++;
+	ib_ret	= ib_post_recv(ib_conn->qp, &rx_wr, &rx_wr_failed);
+	if (ib_ret) {
+		iser_err("ib_post_recv failed ret=%d\n", ib_ret);
+		ib_conn->post_recv_buf_count--;
+	}
+	return ib_ret;
+}
+
+int iser_post_recvm(struct iser_conn *iser_conn, int count)
+{
+	struct ib_recv_wr *rx_wr, *rx_wr_failed;
+	int i, ib_ret;
+	struct ib_conn *ib_conn = &iser_conn->ib_conn;
+	unsigned int my_rx_head = iser_conn->rx_desc_head;
+	struct iser_rx_desc *rx_desc;
+
+	for (rx_wr = ib_conn->rx_wr, i = 0; i < count; i++, rx_wr++) {
+		rx_desc		= &iser_conn->rx_descs[my_rx_head];
+		rx_wr->wr_id	= (unsigned long)rx_desc;
+		rx_wr->sg_list	= &rx_desc->rx_sg;
+		rx_wr->num_sge	= 1;
+		rx_wr->next	= rx_wr + 1;
+		my_rx_head = (my_rx_head + 1) & iser_conn->qp_max_recv_dtos_mask;
+	}
+
+	rx_wr--;
+	rx_wr->next = NULL; /* mark end of work requests list */
+
+	ib_conn->post_recv_buf_count += count;
+	ib_ret	= ib_post_recv(ib_conn->qp, ib_conn->rx_wr, &rx_wr_failed);
+	if (ib_ret) {
+		iser_err("ib_post_recv failed ret=%d\n", ib_ret);
+		ib_conn->post_recv_buf_count -= count;
+	} else
+		iser_conn->rx_desc_head = my_rx_head;
+	return ib_ret;
+}
+
+
+/**
+ * iser_start_send - Initiate a Send DTO operation
+ *
+ * returns 0 on success, -1 on failure
+ */
+int iser_post_send(struct ib_conn *ib_conn, struct iser_tx_desc *tx_desc,
+		   bool signal)
+{
+	int		  ib_ret;
+	struct ib_send_wr send_wr, *send_wr_failed;
+
+	ib_dma_sync_single_for_device(ib_conn->device->ib_device,
+				      tx_desc->dma_addr, ISER_HEADERS_LEN,
+				      DMA_TO_DEVICE);
+
+	send_wr.next	   = NULL;
+	send_wr.wr_id	   = (unsigned long)tx_desc;
+	send_wr.sg_list	   = tx_desc->tx_sg;
+	send_wr.num_sge	   = tx_desc->num_sge;
+	send_wr.opcode	   = IB_WR_SEND;
+	send_wr.send_flags = signal ? IB_SEND_SIGNALED : 0;
+
+	ib_ret = ib_post_send(ib_conn->qp, &send_wr, &send_wr_failed);
+	if (ib_ret)
+		iser_err("ib_post_send failed, ret:%d\n", ib_ret);
+
+	return ib_ret;
+}
+
+/**
+ * is_iser_tx_desc - Indicate if the completion wr_id
+ *     is a TX descriptor or not.
+ * @iser_conn: iser connection
+ * @wr_id: completion WR identifier
+ *
+ * Since we cannot rely on wc opcode in FLUSH errors
+ * we must work around it by checking if the wr_id address
+ * falls in the iser connection rx_descs buffer. If so
+ * it is an RX descriptor, otherwize it is a TX.
+ */
+static inline bool
+is_iser_tx_desc(struct iser_conn *iser_conn, void *wr_id)
+{
+	void *start = iser_conn->rx_descs;
+	int len = iser_conn->num_rx_descs * sizeof(*iser_conn->rx_descs);
+
+	if (wr_id >= start && wr_id < start + len)
+		return false;
+
+	return true;
+}
+
+/**
+ * iser_handle_comp_error() - Handle error completion
+ * @ib_conn:   connection RDMA resources
+ * @wc:        work completion
+ *
+ * Notes: We may handle a FLUSH error completion and in this case
+ *        we only cleanup in case TX type was DATAOUT. For non-FLUSH
+ *        error completion we should also notify iscsi layer that
+ *        connection is failed (in case we passed bind stage).
+ */
+static void
+iser_handle_comp_error(struct ib_conn *ib_conn,
+		       struct ib_wc *wc)
+{
+	struct iser_conn *iser_conn = container_of(ib_conn, struct iser_conn,
+						   ib_conn);
+
+	if (wc->status != IB_WC_WR_FLUSH_ERR)
+		if (iser_conn->iscsi_conn)
+			iscsi_conn_failure(iser_conn->iscsi_conn,
+					   ISCSI_ERR_CONN_FAILED);
+
+	if (is_iser_tx_desc(iser_conn, (void *)wc->wr_id)) {
+		struct iser_tx_desc *desc = (struct iser_tx_desc *)wc->wr_id;
+
+		if (desc->type == ISCSI_TX_DATAOUT)
+			kmem_cache_free(ig.desc_cache, desc);
+	} else {
+		ib_conn->post_recv_buf_count--;
+	}
+}
+
+/**
+ * iser_handle_wc - handle a single work completion
+ * @wc: work completion
+ *
+ * Soft-IRQ context, work completion can be either
+ * SEND or RECV, and can turn out successful or
+ * with error (or flush error).
+ */
+static void iser_handle_wc(struct ib_wc *wc)
+{
+	struct ib_conn *ib_conn;
+	struct iser_tx_desc *tx_desc;
+	struct iser_rx_desc *rx_desc;
+
+	ib_conn = wc->qp->qp_context;
+	if (wc->status == IB_WC_SUCCESS) {
+		if (wc->opcode == IB_WC_RECV) {
+			rx_desc = (struct iser_rx_desc *)wc->wr_id;
+			iser_rcv_completion(rx_desc, wc->byte_len,
+					    ib_conn);
+		} else
+		if (wc->opcode == IB_WC_SEND) {
+			tx_desc = (struct iser_tx_desc *)wc->wr_id;
+			iser_snd_completion(tx_desc, ib_conn);
+		} else {
+			iser_err("Unknown wc opcode %d\n", wc->opcode);
+		}
+	} else {
+		if (wc->status != IB_WC_WR_FLUSH_ERR)
+			iser_err("wr id %llx status %d vend_err %x\n",
+				 wc->wr_id, wc->status, wc->vendor_err);
+		else
+			iser_dbg("flush error: wr id %llx\n", wc->wr_id);
+
+		if (wc->wr_id != ISER_FASTREG_LI_WRID &&
+		    wc->wr_id != ISER_BEACON_WRID)
+			iser_handle_comp_error(ib_conn, wc);
+
+		/* complete in case all flush errors were consumed */
+		if (wc->wr_id == ISER_BEACON_WRID)
+			complete(&ib_conn->flush_comp);
+	}
+}
+
+/**
+ * iser_cq_tasklet_fn - iSER completion polling loop
+ * @data: iSER completion context
+ *
+ * Soft-IRQ context, polling connection CQ until
+ * either CQ was empty or we exausted polling budget
+ */
+static void iser_cq_tasklet_fn(unsigned long data)
+{
+	struct iser_comp *comp = (struct iser_comp *)data;
+	struct ib_cq *cq = comp->cq;
+	struct ib_wc *const wcs = comp->wcs;
+	int i, n, completed = 0;
+
+	while ((n = ib_poll_cq(cq, ARRAY_SIZE(comp->wcs), wcs)) > 0) {
+		for (i = 0; i < n; i++)
+			iser_handle_wc(&wcs[i]);
+
+		completed += n;
+		if (completed >= iser_cq_poll_limit)
+			break;
+	}
+
+	/*
+	 * It is assumed here that arming CQ only once its empty
+	 * would not cause interrupts to be missed.
+	 */
+	ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+
+	iser_dbg("got %d completions\n", completed);
+}
+
+static void iser_cq_callback(struct ib_cq *cq, void *cq_context)
+{
+	struct iser_comp *comp = cq_context;
+
+	tasklet_schedule(&comp->tasklet);
+}
+
+u8 iser_check_task_pi_status(struct iscsi_iser_task *iser_task,
+			     enum iser_data_dir cmd_dir, sector_t *sector)
+{
+	struct iser_mem_reg *reg = &iser_task->rdma_regd[cmd_dir].reg;
+	struct fast_reg_descriptor *desc = reg->mem_h;
+	unsigned long sector_size = iser_task->sc->device->sector_size;
+	struct ib_mr_status mr_status;
+	int ret;
+
+	if (desc && desc->reg_indicators & ISER_FASTREG_PROTECTED) {
+		desc->reg_indicators &= ~ISER_FASTREG_PROTECTED;
+		ret = ib_check_mr_status(desc->pi_ctx->sig_mr,
+					 IB_MR_CHECK_SIG_STATUS, &mr_status);
+		if (ret) {
+			pr_err("ib_check_mr_status failed, ret %d\n", ret);
+			goto err;
+		}
+
+		if (mr_status.fail_status & IB_MR_CHECK_SIG_STATUS) {
+			sector_t sector_off = mr_status.sig_err.sig_err_offset;
+
+			do_div(sector_off, sector_size + 8);
+			*sector = scsi_get_lba(iser_task->sc) + sector_off;
+
+			pr_err("PI error found type %d at sector %llx "
+			       "expected %x vs actual %x\n",
+			       mr_status.sig_err.err_type,
+			       (unsigned long long)*sector,
+			       mr_status.sig_err.expected,
+			       mr_status.sig_err.actual);
+
+			switch (mr_status.sig_err.err_type) {
+			case IB_SIG_BAD_GUARD:
+				return 0x1;
+			case IB_SIG_BAD_REFTAG:
+				return 0x3;
+			case IB_SIG_BAD_APPTAG:
+				return 0x2;
+			}
+		}
+	}
+
+	return 0;
+err:
+	/* Not alot we can do here, return ambiguous guard error */
+	return 0x1;
+}
diff --git a/drivers/infiniband/ulp/isert/Kconfig b/drivers/infiniband/ulp/isert/Kconfig
new file mode 100644
index 0000000..02f9759
--- /dev/null
+++ b/drivers/infiniband/ulp/isert/Kconfig
@@ -0,0 +1,5 @@
+config INFINIBAND_ISERT
+	tristate "iSCSI Extensions for RDMA (iSER) target support"
+	depends on INET && INFINIBAND_ADDR_TRANS && TARGET_CORE && ISCSI_TARGET
+	---help---
+	Support for iSCSI Extensions for RDMA (iSER) Target on Infiniband fabrics.
diff --git a/drivers/infiniband/ulp/isert/Makefile b/drivers/infiniband/ulp/isert/Makefile
new file mode 100644
index 0000000..c8bf242
--- /dev/null
+++ b/drivers/infiniband/ulp/isert/Makefile
@@ -0,0 +1,2 @@
+ccflags-y		:= -Idrivers/target -Idrivers/target/iscsi
+obj-$(CONFIG_INFINIBAND_ISERT)	+= ib_isert.o
diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c
new file mode 100644
index 0000000..10641b7
--- /dev/null
+++ b/drivers/infiniband/ulp/isert/ib_isert.c
@@ -0,0 +1,3313 @@
+/*******************************************************************************
+ * This file contains iSCSI extentions for RDMA (iSER) Verbs
+ *
+ * (c) Copyright 2013 Datera, Inc.
+ *
+ * Nicholas A. Bellinger <nab@linux-iscsi.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ ****************************************************************************/
+
+#include <linux/string.h>
+#include <linux/module.h>
+#include <linux/scatterlist.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/llist.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/rdma_cm.h>
+#include <target/target_core_base.h>
+#include <target/target_core_fabric.h>
+#include <target/iscsi/iscsi_transport.h>
+#include <linux/semaphore.h>
+
+#include "isert_proto.h"
+#include "ib_isert.h"
+
+#define	ISERT_MAX_CONN		8
+#define ISER_MAX_RX_CQ_LEN	(ISERT_QP_MAX_RECV_DTOS * ISERT_MAX_CONN)
+#define ISER_MAX_TX_CQ_LEN	(ISERT_QP_MAX_REQ_DTOS  * ISERT_MAX_CONN)
+
+static DEFINE_MUTEX(device_list_mutex);
+static LIST_HEAD(device_list);
+static struct workqueue_struct *isert_rx_wq;
+static struct workqueue_struct *isert_comp_wq;
+
+static void
+isert_unmap_cmd(struct isert_cmd *isert_cmd, struct isert_conn *isert_conn);
+static int
+isert_map_rdma(struct iscsi_conn *conn, struct iscsi_cmd *cmd,
+	       struct isert_rdma_wr *wr);
+static void
+isert_unreg_rdma(struct isert_cmd *isert_cmd, struct isert_conn *isert_conn);
+static int
+isert_reg_rdma(struct iscsi_conn *conn, struct iscsi_cmd *cmd,
+	       struct isert_rdma_wr *wr);
+static int
+isert_put_response(struct iscsi_conn *conn, struct iscsi_cmd *cmd);
+
+static void
+isert_qp_event_callback(struct ib_event *e, void *context)
+{
+	struct isert_conn *isert_conn = (struct isert_conn *)context;
+
+	pr_err("isert_qp_event_callback event: %d\n", e->event);
+	switch (e->event) {
+	case IB_EVENT_COMM_EST:
+		rdma_notify(isert_conn->conn_cm_id, IB_EVENT_COMM_EST);
+		break;
+	case IB_EVENT_QP_LAST_WQE_REACHED:
+		pr_warn("Reached TX IB_EVENT_QP_LAST_WQE_REACHED:\n");
+		break;
+	default:
+		break;
+	}
+}
+
+static int
+isert_query_device(struct ib_device *ib_dev, struct ib_device_attr *devattr)
+{
+	int ret;
+
+	ret = ib_query_device(ib_dev, devattr);
+	if (ret) {
+		pr_err("ib_query_device() failed: %d\n", ret);
+		return ret;
+	}
+	pr_debug("devattr->max_sge: %d\n", devattr->max_sge);
+	pr_debug("devattr->max_sge_rd: %d\n", devattr->max_sge_rd);
+
+	return 0;
+}
+
+static int
+isert_conn_setup_qp(struct isert_conn *isert_conn, struct rdma_cm_id *cma_id,
+		    u8 protection)
+{
+	struct isert_device *device = isert_conn->conn_device;
+	struct ib_qp_init_attr attr;
+	int ret, index, min_index = 0;
+
+	mutex_lock(&device_list_mutex);
+	for (index = 0; index < device->cqs_used; index++)
+		if (device->cq_active_qps[index] <
+		    device->cq_active_qps[min_index])
+			min_index = index;
+	device->cq_active_qps[min_index]++;
+	pr_debug("isert_conn_setup_qp: Using min_index: %d\n", min_index);
+	mutex_unlock(&device_list_mutex);
+
+	memset(&attr, 0, sizeof(struct ib_qp_init_attr));
+	attr.event_handler = isert_qp_event_callback;
+	attr.qp_context = isert_conn;
+	attr.send_cq = device->dev_tx_cq[min_index];
+	attr.recv_cq = device->dev_rx_cq[min_index];
+	attr.cap.max_send_wr = ISERT_QP_MAX_REQ_DTOS;
+	attr.cap.max_recv_wr = ISERT_QP_MAX_RECV_DTOS;
+	/*
+	 * FIXME: Use devattr.max_sge - 2 for max_send_sge as
+	 * work-around for RDMA_READs with ConnectX-2.
+	 *
+	 * Also, still make sure to have at least two SGEs for
+	 * outgoing control PDU responses.
+	 */
+	attr.cap.max_send_sge = max(2, device->dev_attr.max_sge - 2);
+	isert_conn->max_sge = attr.cap.max_send_sge;
+
+	attr.cap.max_recv_sge = 1;
+	attr.sq_sig_type = IB_SIGNAL_REQ_WR;
+	attr.qp_type = IB_QPT_RC;
+	if (protection)
+		attr.create_flags |= IB_QP_CREATE_SIGNATURE_EN;
+
+	pr_debug("isert_conn_setup_qp cma_id->device: %p\n",
+		 cma_id->device);
+	pr_debug("isert_conn_setup_qp conn_pd->device: %p\n",
+		 isert_conn->conn_pd->device);
+
+	ret = rdma_create_qp(cma_id, isert_conn->conn_pd, &attr);
+	if (ret) {
+		pr_err("rdma_create_qp failed for cma_id %d\n", ret);
+		return ret;
+	}
+	isert_conn->conn_qp = cma_id->qp;
+	pr_debug("rdma_create_qp() returned success >>>>>>>>>>>>>>>>>>>>>>>>>.\n");
+
+	return 0;
+}
+
+static void
+isert_cq_event_callback(struct ib_event *e, void *context)
+{
+	pr_debug("isert_cq_event_callback event: %d\n", e->event);
+}
+
+static int
+isert_alloc_rx_descriptors(struct isert_conn *isert_conn)
+{
+	struct ib_device *ib_dev = isert_conn->conn_cm_id->device;
+	struct iser_rx_desc *rx_desc;
+	struct ib_sge *rx_sg;
+	u64 dma_addr;
+	int i, j;
+
+	isert_conn->conn_rx_descs = kzalloc(ISERT_QP_MAX_RECV_DTOS *
+				sizeof(struct iser_rx_desc), GFP_KERNEL);
+	if (!isert_conn->conn_rx_descs)
+		goto fail;
+
+	rx_desc = isert_conn->conn_rx_descs;
+
+	for (i = 0; i < ISERT_QP_MAX_RECV_DTOS; i++, rx_desc++)  {
+		dma_addr = ib_dma_map_single(ib_dev, (void *)rx_desc,
+					ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE);
+		if (ib_dma_mapping_error(ib_dev, dma_addr))
+			goto dma_map_fail;
+
+		rx_desc->dma_addr = dma_addr;
+
+		rx_sg = &rx_desc->rx_sg;
+		rx_sg->addr = rx_desc->dma_addr;
+		rx_sg->length = ISER_RX_PAYLOAD_SIZE;
+		rx_sg->lkey = isert_conn->conn_mr->lkey;
+	}
+
+	isert_conn->conn_rx_desc_head = 0;
+	return 0;
+
+dma_map_fail:
+	rx_desc = isert_conn->conn_rx_descs;
+	for (j = 0; j < i; j++, rx_desc++) {
+		ib_dma_unmap_single(ib_dev, rx_desc->dma_addr,
+				    ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE);
+	}
+	kfree(isert_conn->conn_rx_descs);
+	isert_conn->conn_rx_descs = NULL;
+fail:
+	return -ENOMEM;
+}
+
+static void
+isert_free_rx_descriptors(struct isert_conn *isert_conn)
+{
+	struct ib_device *ib_dev = isert_conn->conn_cm_id->device;
+	struct iser_rx_desc *rx_desc;
+	int i;
+
+	if (!isert_conn->conn_rx_descs)
+		return;
+
+	rx_desc = isert_conn->conn_rx_descs;
+	for (i = 0; i < ISERT_QP_MAX_RECV_DTOS; i++, rx_desc++)  {
+		ib_dma_unmap_single(ib_dev, rx_desc->dma_addr,
+				    ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE);
+	}
+
+	kfree(isert_conn->conn_rx_descs);
+	isert_conn->conn_rx_descs = NULL;
+}
+
+static void isert_cq_tx_work(struct work_struct *);
+static void isert_cq_tx_callback(struct ib_cq *, void *);
+static void isert_cq_rx_work(struct work_struct *);
+static void isert_cq_rx_callback(struct ib_cq *, void *);
+
+static int
+isert_create_device_ib_res(struct isert_device *device)
+{
+	struct ib_device *ib_dev = device->ib_device;
+	struct isert_cq_desc *cq_desc;
+	struct ib_device_attr *dev_attr;
+	int ret = 0, i, j;
+	int max_rx_cqe, max_tx_cqe;
+
+	dev_attr = &device->dev_attr;
+	ret = isert_query_device(ib_dev, dev_attr);
+	if (ret)
+		return ret;
+
+	max_rx_cqe = min(ISER_MAX_RX_CQ_LEN, dev_attr->max_cqe);
+	max_tx_cqe = min(ISER_MAX_TX_CQ_LEN, dev_attr->max_cqe);
+
+	/* asign function handlers */
+	if (dev_attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS &&
+	    dev_attr->device_cap_flags & IB_DEVICE_SIGNATURE_HANDOVER) {
+		device->use_fastreg = 1;
+		device->reg_rdma_mem = isert_reg_rdma;
+		device->unreg_rdma_mem = isert_unreg_rdma;
+	} else {
+		device->use_fastreg = 0;
+		device->reg_rdma_mem = isert_map_rdma;
+		device->unreg_rdma_mem = isert_unmap_cmd;
+	}
+
+	/* Check signature cap */
+	device->pi_capable = dev_attr->device_cap_flags &
+			     IB_DEVICE_SIGNATURE_HANDOVER ? true : false;
+
+	device->cqs_used = min_t(int, num_online_cpus(),
+				 device->ib_device->num_comp_vectors);
+	device->cqs_used = min(ISERT_MAX_CQ, device->cqs_used);
+	pr_debug("Using %d CQs, device %s supports %d vectors support "
+		 "Fast registration %d pi_capable %d\n",
+		 device->cqs_used, device->ib_device->name,
+		 device->ib_device->num_comp_vectors, device->use_fastreg,
+		 device->pi_capable);
+	device->cq_desc = kzalloc(sizeof(struct isert_cq_desc) *
+				device->cqs_used, GFP_KERNEL);
+	if (!device->cq_desc) {
+		pr_err("Unable to allocate device->cq_desc\n");
+		return -ENOMEM;
+	}
+	cq_desc = device->cq_desc;
+
+	for (i = 0; i < device->cqs_used; i++) {
+		cq_desc[i].device = device;
+		cq_desc[i].cq_index = i;
+
+		INIT_WORK(&cq_desc[i].cq_rx_work, isert_cq_rx_work);
+		device->dev_rx_cq[i] = ib_create_cq(device->ib_device,
+						isert_cq_rx_callback,
+						isert_cq_event_callback,
+						(void *)&cq_desc[i],
+						max_rx_cqe, i);
+		if (IS_ERR(device->dev_rx_cq[i])) {
+			ret = PTR_ERR(device->dev_rx_cq[i]);
+			device->dev_rx_cq[i] = NULL;
+			goto out_cq;
+		}
+
+		INIT_WORK(&cq_desc[i].cq_tx_work, isert_cq_tx_work);
+		device->dev_tx_cq[i] = ib_create_cq(device->ib_device,
+						isert_cq_tx_callback,
+						isert_cq_event_callback,
+						(void *)&cq_desc[i],
+						max_tx_cqe, i);
+		if (IS_ERR(device->dev_tx_cq[i])) {
+			ret = PTR_ERR(device->dev_tx_cq[i]);
+			device->dev_tx_cq[i] = NULL;
+			goto out_cq;
+		}
+
+		ret = ib_req_notify_cq(device->dev_rx_cq[i], IB_CQ_NEXT_COMP);
+		if (ret)
+			goto out_cq;
+
+		ret = ib_req_notify_cq(device->dev_tx_cq[i], IB_CQ_NEXT_COMP);
+		if (ret)
+			goto out_cq;
+	}
+
+	return 0;
+
+out_cq:
+	for (j = 0; j < i; j++) {
+		cq_desc = &device->cq_desc[j];
+
+		if (device->dev_rx_cq[j]) {
+			cancel_work_sync(&cq_desc->cq_rx_work);
+			ib_destroy_cq(device->dev_rx_cq[j]);
+		}
+		if (device->dev_tx_cq[j]) {
+			cancel_work_sync(&cq_desc->cq_tx_work);
+			ib_destroy_cq(device->dev_tx_cq[j]);
+		}
+	}
+	kfree(device->cq_desc);
+
+	return ret;
+}
+
+static void
+isert_free_device_ib_res(struct isert_device *device)
+{
+	struct isert_cq_desc *cq_desc;
+	int i;
+
+	for (i = 0; i < device->cqs_used; i++) {
+		cq_desc = &device->cq_desc[i];
+
+		cancel_work_sync(&cq_desc->cq_rx_work);
+		cancel_work_sync(&cq_desc->cq_tx_work);
+		ib_destroy_cq(device->dev_rx_cq[i]);
+		ib_destroy_cq(device->dev_tx_cq[i]);
+		device->dev_rx_cq[i] = NULL;
+		device->dev_tx_cq[i] = NULL;
+	}
+
+	kfree(device->cq_desc);
+}
+
+static void
+isert_device_try_release(struct isert_device *device)
+{
+	mutex_lock(&device_list_mutex);
+	device->refcount--;
+	if (!device->refcount) {
+		isert_free_device_ib_res(device);
+		list_del(&device->dev_node);
+		kfree(device);
+	}
+	mutex_unlock(&device_list_mutex);
+}
+
+static struct isert_device *
+isert_device_find_by_ib_dev(struct rdma_cm_id *cma_id)
+{
+	struct isert_device *device;
+	int ret;
+
+	mutex_lock(&device_list_mutex);
+	list_for_each_entry(device, &device_list, dev_node) {
+		if (device->ib_device->node_guid == cma_id->device->node_guid) {
+			device->refcount++;
+			mutex_unlock(&device_list_mutex);
+			return device;
+		}
+	}
+
+	device = kzalloc(sizeof(struct isert_device), GFP_KERNEL);
+	if (!device) {
+		mutex_unlock(&device_list_mutex);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	INIT_LIST_HEAD(&device->dev_node);
+
+	device->ib_device = cma_id->device;
+	ret = isert_create_device_ib_res(device);
+	if (ret) {
+		kfree(device);
+		mutex_unlock(&device_list_mutex);
+		return ERR_PTR(ret);
+	}
+
+	device->refcount++;
+	list_add_tail(&device->dev_node, &device_list);
+	mutex_unlock(&device_list_mutex);
+
+	return device;
+}
+
+static void
+isert_conn_free_fastreg_pool(struct isert_conn *isert_conn)
+{
+	struct fast_reg_descriptor *fr_desc, *tmp;
+	int i = 0;
+
+	if (list_empty(&isert_conn->conn_fr_pool))
+		return;
+
+	pr_debug("Freeing conn %p fastreg pool", isert_conn);
+
+	list_for_each_entry_safe(fr_desc, tmp,
+				 &isert_conn->conn_fr_pool, list) {
+		list_del(&fr_desc->list);
+		ib_free_fast_reg_page_list(fr_desc->data_frpl);
+		ib_dereg_mr(fr_desc->data_mr);
+		if (fr_desc->pi_ctx) {
+			ib_free_fast_reg_page_list(fr_desc->pi_ctx->prot_frpl);
+			ib_dereg_mr(fr_desc->pi_ctx->prot_mr);
+			ib_destroy_mr(fr_desc->pi_ctx->sig_mr);
+			kfree(fr_desc->pi_ctx);
+		}
+		kfree(fr_desc);
+		++i;
+	}
+
+	if (i < isert_conn->conn_fr_pool_size)
+		pr_warn("Pool still has %d regions registered\n",
+			isert_conn->conn_fr_pool_size - i);
+}
+
+static int
+isert_create_fr_desc(struct ib_device *ib_device, struct ib_pd *pd,
+		     struct fast_reg_descriptor *fr_desc, u8 protection)
+{
+	int ret;
+
+	fr_desc->data_frpl = ib_alloc_fast_reg_page_list(ib_device,
+							 ISCSI_ISER_SG_TABLESIZE);
+	if (IS_ERR(fr_desc->data_frpl)) {
+		pr_err("Failed to allocate data frpl err=%ld\n",
+		       PTR_ERR(fr_desc->data_frpl));
+		return PTR_ERR(fr_desc->data_frpl);
+	}
+
+	fr_desc->data_mr = ib_alloc_fast_reg_mr(pd, ISCSI_ISER_SG_TABLESIZE);
+	if (IS_ERR(fr_desc->data_mr)) {
+		pr_err("Failed to allocate data frmr err=%ld\n",
+		       PTR_ERR(fr_desc->data_mr));
+		ret = PTR_ERR(fr_desc->data_mr);
+		goto err_data_frpl;
+	}
+	pr_debug("Create fr_desc %p page_list %p\n",
+		 fr_desc, fr_desc->data_frpl->page_list);
+	fr_desc->ind |= ISERT_DATA_KEY_VALID;
+
+	if (protection) {
+		struct ib_mr_init_attr mr_init_attr = {0};
+		struct pi_context *pi_ctx;
+
+		fr_desc->pi_ctx = kzalloc(sizeof(*fr_desc->pi_ctx), GFP_KERNEL);
+		if (!fr_desc->pi_ctx) {
+			pr_err("Failed to allocate pi context\n");
+			ret = -ENOMEM;
+			goto err_data_mr;
+		}
+		pi_ctx = fr_desc->pi_ctx;
+
+		pi_ctx->prot_frpl = ib_alloc_fast_reg_page_list(ib_device,
+						    ISCSI_ISER_SG_TABLESIZE);
+		if (IS_ERR(pi_ctx->prot_frpl)) {
+			pr_err("Failed to allocate prot frpl err=%ld\n",
+			       PTR_ERR(pi_ctx->prot_frpl));
+			ret = PTR_ERR(pi_ctx->prot_frpl);
+			goto err_pi_ctx;
+		}
+
+		pi_ctx->prot_mr = ib_alloc_fast_reg_mr(pd, ISCSI_ISER_SG_TABLESIZE);
+		if (IS_ERR(pi_ctx->prot_mr)) {
+			pr_err("Failed to allocate prot frmr err=%ld\n",
+			       PTR_ERR(pi_ctx->prot_mr));
+			ret = PTR_ERR(pi_ctx->prot_mr);
+			goto err_prot_frpl;
+		}
+		fr_desc->ind |= ISERT_PROT_KEY_VALID;
+
+		mr_init_attr.max_reg_descriptors = 2;
+		mr_init_attr.flags |= IB_MR_SIGNATURE_EN;
+		pi_ctx->sig_mr = ib_create_mr(pd, &mr_init_attr);
+		if (IS_ERR(pi_ctx->sig_mr)) {
+			pr_err("Failed to allocate signature enabled mr err=%ld\n",
+			       PTR_ERR(pi_ctx->sig_mr));
+			ret = PTR_ERR(pi_ctx->sig_mr);
+			goto err_prot_mr;
+		}
+		fr_desc->ind |= ISERT_SIG_KEY_VALID;
+	}
+	fr_desc->ind &= ~ISERT_PROTECTED;
+
+	return 0;
+err_prot_mr:
+	ib_dereg_mr(fr_desc->pi_ctx->prot_mr);
+err_prot_frpl:
+	ib_free_fast_reg_page_list(fr_desc->pi_ctx->prot_frpl);
+err_pi_ctx:
+	kfree(fr_desc->pi_ctx);
+err_data_mr:
+	ib_dereg_mr(fr_desc->data_mr);
+err_data_frpl:
+	ib_free_fast_reg_page_list(fr_desc->data_frpl);
+
+	return ret;
+}
+
+static int
+isert_conn_create_fastreg_pool(struct isert_conn *isert_conn, u8 pi_support)
+{
+	struct fast_reg_descriptor *fr_desc;
+	struct isert_device *device = isert_conn->conn_device;
+	struct se_session *se_sess = isert_conn->conn->sess->se_sess;
+	struct se_node_acl *se_nacl = se_sess->se_node_acl;
+	int i, ret, tag_num;
+	/*
+	 * Setup the number of FRMRs based upon the number of tags
+	 * available to session in iscsi_target_locate_portal().
+	 */
+	tag_num = max_t(u32, ISCSIT_MIN_TAGS, se_nacl->queue_depth);
+	tag_num = (tag_num * 2) + ISCSIT_EXTRA_TAGS;
+
+	isert_conn->conn_fr_pool_size = 0;
+	for (i = 0; i < tag_num; i++) {
+		fr_desc = kzalloc(sizeof(*fr_desc), GFP_KERNEL);
+		if (!fr_desc) {
+			pr_err("Failed to allocate fast_reg descriptor\n");
+			ret = -ENOMEM;
+			goto err;
+		}
+
+		ret = isert_create_fr_desc(device->ib_device,
+					   isert_conn->conn_pd, fr_desc,
+					   pi_support);
+		if (ret) {
+			pr_err("Failed to create fastreg descriptor err=%d\n",
+			       ret);
+			kfree(fr_desc);
+			goto err;
+		}
+
+		list_add_tail(&fr_desc->list, &isert_conn->conn_fr_pool);
+		isert_conn->conn_fr_pool_size++;
+	}
+
+	pr_debug("Creating conn %p fastreg pool size=%d",
+		 isert_conn, isert_conn->conn_fr_pool_size);
+
+	return 0;
+
+err:
+	isert_conn_free_fastreg_pool(isert_conn);
+	return ret;
+}
+
+static int
+isert_connect_request(struct rdma_cm_id *cma_id, struct rdma_cm_event *event)
+{
+	struct iscsi_np *np = cma_id->context;
+	struct isert_np *isert_np = np->np_context;
+	struct isert_conn *isert_conn;
+	struct isert_device *device;
+	struct ib_device *ib_dev = cma_id->device;
+	int ret = 0;
+	u8 pi_support;
+
+	spin_lock_bh(&np->np_thread_lock);
+	if (!np->enabled) {
+		spin_unlock_bh(&np->np_thread_lock);
+		pr_debug("iscsi_np is not enabled, reject connect request\n");
+		return rdma_reject(cma_id, NULL, 0);
+	}
+	spin_unlock_bh(&np->np_thread_lock);
+
+	pr_debug("Entering isert_connect_request cma_id: %p, context: %p\n",
+		 cma_id, cma_id->context);
+
+	isert_conn = kzalloc(sizeof(struct isert_conn), GFP_KERNEL);
+	if (!isert_conn) {
+		pr_err("Unable to allocate isert_conn\n");
+		return -ENOMEM;
+	}
+	isert_conn->state = ISER_CONN_INIT;
+	INIT_LIST_HEAD(&isert_conn->conn_accept_node);
+	init_completion(&isert_conn->conn_login_comp);
+	init_completion(&isert_conn->conn_wait);
+	init_completion(&isert_conn->conn_wait_comp_err);
+	kref_init(&isert_conn->conn_kref);
+	mutex_init(&isert_conn->conn_mutex);
+	spin_lock_init(&isert_conn->conn_lock);
+	INIT_LIST_HEAD(&isert_conn->conn_fr_pool);
+
+	cma_id->context = isert_conn;
+	isert_conn->conn_cm_id = cma_id;
+
+	isert_conn->login_buf = kzalloc(ISCSI_DEF_MAX_RECV_SEG_LEN +
+					ISER_RX_LOGIN_SIZE, GFP_KERNEL);
+	if (!isert_conn->login_buf) {
+		pr_err("Unable to allocate isert_conn->login_buf\n");
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	isert_conn->login_req_buf = isert_conn->login_buf;
+	isert_conn->login_rsp_buf = isert_conn->login_buf +
+				    ISCSI_DEF_MAX_RECV_SEG_LEN;
+	pr_debug("Set login_buf: %p login_req_buf: %p login_rsp_buf: %p\n",
+		 isert_conn->login_buf, isert_conn->login_req_buf,
+		 isert_conn->login_rsp_buf);
+
+	isert_conn->login_req_dma = ib_dma_map_single(ib_dev,
+				(void *)isert_conn->login_req_buf,
+				ISCSI_DEF_MAX_RECV_SEG_LEN, DMA_FROM_DEVICE);
+
+	ret = ib_dma_mapping_error(ib_dev, isert_conn->login_req_dma);
+	if (ret) {
+		pr_err("ib_dma_mapping_error failed for login_req_dma: %d\n",
+		       ret);
+		isert_conn->login_req_dma = 0;
+		goto out_login_buf;
+	}
+
+	isert_conn->login_rsp_dma = ib_dma_map_single(ib_dev,
+					(void *)isert_conn->login_rsp_buf,
+					ISER_RX_LOGIN_SIZE, DMA_TO_DEVICE);
+
+	ret = ib_dma_mapping_error(ib_dev, isert_conn->login_rsp_dma);
+	if (ret) {
+		pr_err("ib_dma_mapping_error failed for login_rsp_dma: %d\n",
+		       ret);
+		isert_conn->login_rsp_dma = 0;
+		goto out_req_dma_map;
+	}
+
+	device = isert_device_find_by_ib_dev(cma_id);
+	if (IS_ERR(device)) {
+		ret = PTR_ERR(device);
+		goto out_rsp_dma_map;
+	}
+
+	/* Set max inflight RDMA READ requests */
+	isert_conn->initiator_depth = min_t(u8,
+				event->param.conn.initiator_depth,
+				device->dev_attr.max_qp_init_rd_atom);
+	pr_debug("Using initiator_depth: %u\n", isert_conn->initiator_depth);
+
+	isert_conn->conn_device = device;
+	isert_conn->conn_pd = ib_alloc_pd(isert_conn->conn_device->ib_device);
+	if (IS_ERR(isert_conn->conn_pd)) {
+		ret = PTR_ERR(isert_conn->conn_pd);
+		pr_err("ib_alloc_pd failed for conn %p: ret=%d\n",
+		       isert_conn, ret);
+		goto out_pd;
+	}
+
+	isert_conn->conn_mr = ib_get_dma_mr(isert_conn->conn_pd,
+					   IB_ACCESS_LOCAL_WRITE);
+	if (IS_ERR(isert_conn->conn_mr)) {
+		ret = PTR_ERR(isert_conn->conn_mr);
+		pr_err("ib_get_dma_mr failed for conn %p: ret=%d\n",
+		       isert_conn, ret);
+		goto out_mr;
+	}
+
+	pi_support = np->tpg_np->tpg->tpg_attrib.t10_pi;
+	if (pi_support && !device->pi_capable) {
+		pr_err("Protection information requested but not supported, "
+		       "rejecting connect request\n");
+		ret = rdma_reject(cma_id, NULL, 0);
+		goto out_mr;
+	}
+
+	ret = isert_conn_setup_qp(isert_conn, cma_id, pi_support);
+	if (ret)
+		goto out_conn_dev;
+
+	mutex_lock(&isert_np->np_accept_mutex);
+	list_add_tail(&isert_conn->conn_accept_node, &isert_np->np_accept_list);
+	mutex_unlock(&isert_np->np_accept_mutex);
+
+	pr_debug("isert_connect_request() up np_sem np: %p\n", np);
+	up(&isert_np->np_sem);
+	return 0;
+
+out_conn_dev:
+	ib_dereg_mr(isert_conn->conn_mr);
+out_mr:
+	ib_dealloc_pd(isert_conn->conn_pd);
+out_pd:
+	isert_device_try_release(device);
+out_rsp_dma_map:
+	ib_dma_unmap_single(ib_dev, isert_conn->login_rsp_dma,
+			    ISER_RX_LOGIN_SIZE, DMA_TO_DEVICE);
+out_req_dma_map:
+	ib_dma_unmap_single(ib_dev, isert_conn->login_req_dma,
+			    ISCSI_DEF_MAX_RECV_SEG_LEN, DMA_FROM_DEVICE);
+out_login_buf:
+	kfree(isert_conn->login_buf);
+out:
+	kfree(isert_conn);
+	return ret;
+}
+
+static void
+isert_connect_release(struct isert_conn *isert_conn)
+{
+	struct ib_device *ib_dev = isert_conn->conn_cm_id->device;
+	struct isert_device *device = isert_conn->conn_device;
+	int cq_index;
+
+	pr_debug("Entering isert_connect_release(): >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n");
+
+	if (device && device->use_fastreg)
+		isert_conn_free_fastreg_pool(isert_conn);
+
+	if (isert_conn->conn_qp) {
+		cq_index = ((struct isert_cq_desc *)
+			isert_conn->conn_qp->recv_cq->cq_context)->cq_index;
+		pr_debug("isert_connect_release: cq_index: %d\n", cq_index);
+		isert_conn->conn_device->cq_active_qps[cq_index]--;
+
+		rdma_destroy_qp(isert_conn->conn_cm_id);
+	}
+
+	isert_free_rx_descriptors(isert_conn);
+	rdma_destroy_id(isert_conn->conn_cm_id);
+
+	ib_dereg_mr(isert_conn->conn_mr);
+	ib_dealloc_pd(isert_conn->conn_pd);
+
+	if (isert_conn->login_buf) {
+		ib_dma_unmap_single(ib_dev, isert_conn->login_rsp_dma,
+				    ISER_RX_LOGIN_SIZE, DMA_TO_DEVICE);
+		ib_dma_unmap_single(ib_dev, isert_conn->login_req_dma,
+				    ISCSI_DEF_MAX_RECV_SEG_LEN,
+				    DMA_FROM_DEVICE);
+		kfree(isert_conn->login_buf);
+	}
+	kfree(isert_conn);
+
+	if (device)
+		isert_device_try_release(device);
+
+	pr_debug("Leaving isert_connect_release >>>>>>>>>>>>\n");
+}
+
+static void
+isert_connected_handler(struct rdma_cm_id *cma_id)
+{
+	struct isert_conn *isert_conn = cma_id->context;
+
+	kref_get(&isert_conn->conn_kref);
+}
+
+static void
+isert_release_conn_kref(struct kref *kref)
+{
+	struct isert_conn *isert_conn = container_of(kref,
+				struct isert_conn, conn_kref);
+
+	pr_debug("Calling isert_connect_release for final kref %s/%d\n",
+		 current->comm, current->pid);
+
+	isert_connect_release(isert_conn);
+}
+
+static void
+isert_put_conn(struct isert_conn *isert_conn)
+{
+	kref_put(&isert_conn->conn_kref, isert_release_conn_kref);
+}
+
+static void
+isert_disconnect_work(struct work_struct *work)
+{
+	struct isert_conn *isert_conn = container_of(work,
+				struct isert_conn, conn_logout_work);
+
+	pr_debug("isert_disconnect_work(): >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n");
+	mutex_lock(&isert_conn->conn_mutex);
+	if (isert_conn->state == ISER_CONN_UP)
+		isert_conn->state = ISER_CONN_TERMINATING;
+
+	if (isert_conn->post_recv_buf_count == 0 &&
+	    atomic_read(&isert_conn->post_send_buf_count) == 0) {
+		mutex_unlock(&isert_conn->conn_mutex);
+		goto wake_up;
+	}
+	if (!isert_conn->conn_cm_id) {
+		mutex_unlock(&isert_conn->conn_mutex);
+		isert_put_conn(isert_conn);
+		return;
+	}
+
+	if (isert_conn->disconnect) {
+		/* Send DREQ/DREP towards our initiator */
+		rdma_disconnect(isert_conn->conn_cm_id);
+	}
+
+	mutex_unlock(&isert_conn->conn_mutex);
+
+wake_up:
+	complete(&isert_conn->conn_wait);
+}
+
+static int
+isert_disconnected_handler(struct rdma_cm_id *cma_id, bool disconnect)
+{
+	struct isert_conn *isert_conn;
+
+	if (!cma_id->qp) {
+		struct isert_np *isert_np = cma_id->context;
+
+		isert_np->np_cm_id = NULL;
+		return -1;
+	}
+
+	isert_conn = (struct isert_conn *)cma_id->context;
+
+	isert_conn->disconnect = disconnect;
+	INIT_WORK(&isert_conn->conn_logout_work, isert_disconnect_work);
+	schedule_work(&isert_conn->conn_logout_work);
+
+	return 0;
+}
+
+static int
+isert_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event)
+{
+	int ret = 0;
+	bool disconnect = false;
+
+	pr_debug("isert_cma_handler: event %d status %d conn %p id %p\n",
+		 event->event, event->status, cma_id->context, cma_id);
+
+	switch (event->event) {
+	case RDMA_CM_EVENT_CONNECT_REQUEST:
+		ret = isert_connect_request(cma_id, event);
+		if (ret)
+			pr_err("isert_cma_handler failed RDMA_CM_EVENT: 0x%08x %d\n",
+				event->event, ret);
+		break;
+	case RDMA_CM_EVENT_ESTABLISHED:
+		isert_connected_handler(cma_id);
+		break;
+	case RDMA_CM_EVENT_ADDR_CHANGE:    /* FALLTHRU */
+	case RDMA_CM_EVENT_DISCONNECTED:   /* FALLTHRU */
+	case RDMA_CM_EVENT_DEVICE_REMOVAL: /* FALLTHRU */
+		disconnect = true;
+	case RDMA_CM_EVENT_TIMEWAIT_EXIT:  /* FALLTHRU */
+		ret = isert_disconnected_handler(cma_id, disconnect);
+		break;
+	case RDMA_CM_EVENT_CONNECT_ERROR:
+	default:
+		pr_err("Unhandled RDMA CMA event: %d\n", event->event);
+		break;
+	}
+
+	return ret;
+}
+
+static int
+isert_post_recv(struct isert_conn *isert_conn, u32 count)
+{
+	struct ib_recv_wr *rx_wr, *rx_wr_failed;
+	int i, ret;
+	unsigned int rx_head = isert_conn->conn_rx_desc_head;
+	struct iser_rx_desc *rx_desc;
+
+	for (rx_wr = isert_conn->conn_rx_wr, i = 0; i < count; i++, rx_wr++) {
+		rx_desc		= &isert_conn->conn_rx_descs[rx_head];
+		rx_wr->wr_id	= (unsigned long)rx_desc;
+		rx_wr->sg_list	= &rx_desc->rx_sg;
+		rx_wr->num_sge	= 1;
+		rx_wr->next	= rx_wr + 1;
+		rx_head = (rx_head + 1) & (ISERT_QP_MAX_RECV_DTOS - 1);
+	}
+
+	rx_wr--;
+	rx_wr->next = NULL; /* mark end of work requests list */
+
+	isert_conn->post_recv_buf_count += count;
+	ret = ib_post_recv(isert_conn->conn_qp, isert_conn->conn_rx_wr,
+				&rx_wr_failed);
+	if (ret) {
+		pr_err("ib_post_recv() failed with ret: %d\n", ret);
+		isert_conn->post_recv_buf_count -= count;
+	} else {
+		pr_debug("isert_post_recv(): Posted %d RX buffers\n", count);
+		isert_conn->conn_rx_desc_head = rx_head;
+	}
+	return ret;
+}
+
+static int
+isert_post_send(struct isert_conn *isert_conn, struct iser_tx_desc *tx_desc)
+{
+	struct ib_device *ib_dev = isert_conn->conn_cm_id->device;
+	struct ib_send_wr send_wr, *send_wr_failed;
+	int ret;
+
+	ib_dma_sync_single_for_device(ib_dev, tx_desc->dma_addr,
+				      ISER_HEADERS_LEN, DMA_TO_DEVICE);
+
+	send_wr.next	= NULL;
+	send_wr.wr_id	= (unsigned long)tx_desc;
+	send_wr.sg_list	= tx_desc->tx_sg;
+	send_wr.num_sge	= tx_desc->num_sge;
+	send_wr.opcode	= IB_WR_SEND;
+	send_wr.send_flags = IB_SEND_SIGNALED;
+
+	atomic_inc(&isert_conn->post_send_buf_count);
+
+	ret = ib_post_send(isert_conn->conn_qp, &send_wr, &send_wr_failed);
+	if (ret) {
+		pr_err("ib_post_send() failed, ret: %d\n", ret);
+		atomic_dec(&isert_conn->post_send_buf_count);
+	}
+
+	return ret;
+}
+
+static void
+isert_create_send_desc(struct isert_conn *isert_conn,
+		       struct isert_cmd *isert_cmd,
+		       struct iser_tx_desc *tx_desc)
+{
+	struct ib_device *ib_dev = isert_conn->conn_cm_id->device;
+
+	ib_dma_sync_single_for_cpu(ib_dev, tx_desc->dma_addr,
+				   ISER_HEADERS_LEN, DMA_TO_DEVICE);
+
+	memset(&tx_desc->iser_header, 0, sizeof(struct iser_hdr));
+	tx_desc->iser_header.flags = ISER_VER;
+
+	tx_desc->num_sge = 1;
+	tx_desc->isert_cmd = isert_cmd;
+
+	if (tx_desc->tx_sg[0].lkey != isert_conn->conn_mr->lkey) {
+		tx_desc->tx_sg[0].lkey = isert_conn->conn_mr->lkey;
+		pr_debug("tx_desc %p lkey mismatch, fixing\n", tx_desc);
+	}
+}
+
+static int
+isert_init_tx_hdrs(struct isert_conn *isert_conn,
+		   struct iser_tx_desc *tx_desc)
+{
+	struct ib_device *ib_dev = isert_conn->conn_cm_id->device;
+	u64 dma_addr;
+
+	dma_addr = ib_dma_map_single(ib_dev, (void *)tx_desc,
+			ISER_HEADERS_LEN, DMA_TO_DEVICE);
+	if (ib_dma_mapping_error(ib_dev, dma_addr)) {
+		pr_err("ib_dma_mapping_error() failed\n");
+		return -ENOMEM;
+	}
+
+	tx_desc->dma_addr = dma_addr;
+	tx_desc->tx_sg[0].addr	= tx_desc->dma_addr;
+	tx_desc->tx_sg[0].length = ISER_HEADERS_LEN;
+	tx_desc->tx_sg[0].lkey = isert_conn->conn_mr->lkey;
+
+	pr_debug("isert_init_tx_hdrs: Setup tx_sg[0].addr: 0x%llx length: %u"
+		 " lkey: 0x%08x\n", tx_desc->tx_sg[0].addr,
+		 tx_desc->tx_sg[0].length, tx_desc->tx_sg[0].lkey);
+
+	return 0;
+}
+
+static void
+isert_init_send_wr(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd,
+		   struct ib_send_wr *send_wr, bool coalesce)
+{
+	struct iser_tx_desc *tx_desc = &isert_cmd->tx_desc;
+
+	isert_cmd->rdma_wr.iser_ib_op = ISER_IB_SEND;
+	send_wr->wr_id = (unsigned long)&isert_cmd->tx_desc;
+	send_wr->opcode = IB_WR_SEND;
+	send_wr->sg_list = &tx_desc->tx_sg[0];
+	send_wr->num_sge = isert_cmd->tx_desc.num_sge;
+	/*
+	 * Coalesce send completion interrupts by only setting IB_SEND_SIGNALED
+	 * bit for every ISERT_COMP_BATCH_COUNT number of ib_post_send() calls.
+	 */
+	mutex_lock(&isert_conn->conn_mutex);
+	if (coalesce && isert_conn->state == ISER_CONN_UP &&
+	    ++isert_conn->conn_comp_batch < ISERT_COMP_BATCH_COUNT) {
+		tx_desc->llnode_active = true;
+		llist_add(&tx_desc->comp_llnode, &isert_conn->conn_comp_llist);
+		mutex_unlock(&isert_conn->conn_mutex);
+		return;
+	}
+	isert_conn->conn_comp_batch = 0;
+	tx_desc->comp_llnode_batch = llist_del_all(&isert_conn->conn_comp_llist);
+	mutex_unlock(&isert_conn->conn_mutex);
+
+	send_wr->send_flags = IB_SEND_SIGNALED;
+}
+
+static int
+isert_rdma_post_recvl(struct isert_conn *isert_conn)
+{
+	struct ib_recv_wr rx_wr, *rx_wr_fail;
+	struct ib_sge sge;
+	int ret;
+
+	memset(&sge, 0, sizeof(struct ib_sge));
+	sge.addr = isert_conn->login_req_dma;
+	sge.length = ISER_RX_LOGIN_SIZE;
+	sge.lkey = isert_conn->conn_mr->lkey;
+
+	pr_debug("Setup sge: addr: %llx length: %d 0x%08x\n",
+		sge.addr, sge.length, sge.lkey);
+
+	memset(&rx_wr, 0, sizeof(struct ib_recv_wr));
+	rx_wr.wr_id = (unsigned long)isert_conn->login_req_buf;
+	rx_wr.sg_list = &sge;
+	rx_wr.num_sge = 1;
+
+	isert_conn->post_recv_buf_count++;
+	ret = ib_post_recv(isert_conn->conn_qp, &rx_wr, &rx_wr_fail);
+	if (ret) {
+		pr_err("ib_post_recv() failed: %d\n", ret);
+		isert_conn->post_recv_buf_count--;
+	}
+
+	pr_debug("ib_post_recv(): returned success >>>>>>>>>>>>>>>>>>>>>>>>\n");
+	return ret;
+}
+
+static int
+isert_put_login_tx(struct iscsi_conn *conn, struct iscsi_login *login,
+		   u32 length)
+{
+	struct isert_conn *isert_conn = conn->context;
+	struct ib_device *ib_dev = isert_conn->conn_cm_id->device;
+	struct iser_tx_desc *tx_desc = &isert_conn->conn_login_tx_desc;
+	int ret;
+
+	isert_create_send_desc(isert_conn, NULL, tx_desc);
+
+	memcpy(&tx_desc->iscsi_header, &login->rsp[0],
+	       sizeof(struct iscsi_hdr));
+
+	isert_init_tx_hdrs(isert_conn, tx_desc);
+
+	if (length > 0) {
+		struct ib_sge *tx_dsg = &tx_desc->tx_sg[1];
+
+		ib_dma_sync_single_for_cpu(ib_dev, isert_conn->login_rsp_dma,
+					   length, DMA_TO_DEVICE);
+
+		memcpy(isert_conn->login_rsp_buf, login->rsp_buf, length);
+
+		ib_dma_sync_single_for_device(ib_dev, isert_conn->login_rsp_dma,
+					      length, DMA_TO_DEVICE);
+
+		tx_dsg->addr	= isert_conn->login_rsp_dma;
+		tx_dsg->length	= length;
+		tx_dsg->lkey	= isert_conn->conn_mr->lkey;
+		tx_desc->num_sge = 2;
+	}
+	if (!login->login_failed) {
+		if (login->login_complete) {
+			if (!conn->sess->sess_ops->SessionType &&
+			    isert_conn->conn_device->use_fastreg) {
+				/* Normal Session and fastreg is used */
+				u8 pi_support = login->np->tpg_np->tpg->tpg_attrib.t10_pi;
+
+				ret = isert_conn_create_fastreg_pool(isert_conn,
+								     pi_support);
+				if (ret) {
+					pr_err("Conn: %p failed to create"
+					       " fastreg pool\n", isert_conn);
+					return ret;
+				}
+			}
+
+			ret = isert_alloc_rx_descriptors(isert_conn);
+			if (ret)
+				return ret;
+
+			ret = isert_post_recv(isert_conn, ISERT_MIN_POSTED_RX);
+			if (ret)
+				return ret;
+
+			isert_conn->state = ISER_CONN_UP;
+			goto post_send;
+		}
+
+		ret = isert_rdma_post_recvl(isert_conn);
+		if (ret)
+			return ret;
+	}
+post_send:
+	ret = isert_post_send(isert_conn, tx_desc);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+static void
+isert_rx_login_req(struct iser_rx_desc *rx_desc, int rx_buflen,
+		   struct isert_conn *isert_conn)
+{
+	struct iscsi_conn *conn = isert_conn->conn;
+	struct iscsi_login *login = conn->conn_login;
+	int size;
+
+	if (!login) {
+		pr_err("conn->conn_login is NULL\n");
+		dump_stack();
+		return;
+	}
+
+	if (login->first_request) {
+		struct iscsi_login_req *login_req =
+			(struct iscsi_login_req *)&rx_desc->iscsi_header;
+		/*
+		 * Setup the initial iscsi_login values from the leading
+		 * login request PDU.
+		 */
+		login->leading_connection = (!login_req->tsih) ? 1 : 0;
+		login->current_stage =
+			(login_req->flags & ISCSI_FLAG_LOGIN_CURRENT_STAGE_MASK)
+			 >> 2;
+		login->version_min	= login_req->min_version;
+		login->version_max	= login_req->max_version;
+		memcpy(login->isid, login_req->isid, 6);
+		login->cmd_sn		= be32_to_cpu(login_req->cmdsn);
+		login->init_task_tag	= login_req->itt;
+		login->initial_exp_statsn = be32_to_cpu(login_req->exp_statsn);
+		login->cid		= be16_to_cpu(login_req->cid);
+		login->tsih		= be16_to_cpu(login_req->tsih);
+	}
+
+	memcpy(&login->req[0], (void *)&rx_desc->iscsi_header, ISCSI_HDR_LEN);
+
+	size = min(rx_buflen, MAX_KEY_VALUE_PAIRS);
+	pr_debug("Using login payload size: %d, rx_buflen: %d MAX_KEY_VALUE_PAIRS: %d\n",
+		 size, rx_buflen, MAX_KEY_VALUE_PAIRS);
+	memcpy(login->req_buf, &rx_desc->data[0], size);
+
+	if (login->first_request) {
+		complete(&isert_conn->conn_login_comp);
+		return;
+	}
+	schedule_delayed_work(&conn->login_work, 0);
+}
+
+static struct iscsi_cmd
+*isert_allocate_cmd(struct iscsi_conn *conn)
+{
+	struct isert_conn *isert_conn = (struct isert_conn *)conn->context;
+	struct isert_cmd *isert_cmd;
+	struct iscsi_cmd *cmd;
+
+	cmd = iscsit_allocate_cmd(conn, TASK_INTERRUPTIBLE);
+	if (!cmd) {
+		pr_err("Unable to allocate iscsi_cmd + isert_cmd\n");
+		return NULL;
+	}
+	isert_cmd = iscsit_priv_cmd(cmd);
+	isert_cmd->conn = isert_conn;
+	isert_cmd->iscsi_cmd = cmd;
+
+	return cmd;
+}
+
+static int
+isert_handle_scsi_cmd(struct isert_conn *isert_conn,
+		      struct isert_cmd *isert_cmd, struct iscsi_cmd *cmd,
+		      struct iser_rx_desc *rx_desc, unsigned char *buf)
+{
+	struct iscsi_conn *conn = isert_conn->conn;
+	struct iscsi_scsi_req *hdr = (struct iscsi_scsi_req *)buf;
+	struct scatterlist *sg;
+	int imm_data, imm_data_len, unsol_data, sg_nents, rc;
+	bool dump_payload = false;
+
+	rc = iscsit_setup_scsi_cmd(conn, cmd, buf);
+	if (rc < 0)
+		return rc;
+
+	imm_data = cmd->immediate_data;
+	imm_data_len = cmd->first_burst_len;
+	unsol_data = cmd->unsolicited_data;
+
+	rc = iscsit_process_scsi_cmd(conn, cmd, hdr);
+	if (rc < 0) {
+		return 0;
+	} else if (rc > 0) {
+		dump_payload = true;
+		goto sequence_cmd;
+	}
+
+	if (!imm_data)
+		return 0;
+
+	sg = &cmd->se_cmd.t_data_sg[0];
+	sg_nents = max(1UL, DIV_ROUND_UP(imm_data_len, PAGE_SIZE));
+
+	pr_debug("Copying Immediate SG: %p sg_nents: %u from %p imm_data_len: %d\n",
+		 sg, sg_nents, &rx_desc->data[0], imm_data_len);
+
+	sg_copy_from_buffer(sg, sg_nents, &rx_desc->data[0], imm_data_len);
+
+	cmd->write_data_done += imm_data_len;
+
+	if (cmd->write_data_done == cmd->se_cmd.data_length) {
+		spin_lock_bh(&cmd->istate_lock);
+		cmd->cmd_flags |= ICF_GOT_LAST_DATAOUT;
+		cmd->i_state = ISTATE_RECEIVED_LAST_DATAOUT;
+		spin_unlock_bh(&cmd->istate_lock);
+	}
+
+sequence_cmd:
+	rc = iscsit_sequence_cmd(conn, cmd, buf, hdr->cmdsn);
+
+	if (!rc && dump_payload == false && unsol_data)
+		iscsit_set_unsoliticed_dataout(cmd);
+	else if (dump_payload && imm_data)
+		target_put_sess_cmd(conn->sess->se_sess, &cmd->se_cmd);
+
+	return 0;
+}
+
+static int
+isert_handle_iscsi_dataout(struct isert_conn *isert_conn,
+			   struct iser_rx_desc *rx_desc, unsigned char *buf)
+{
+	struct scatterlist *sg_start;
+	struct iscsi_conn *conn = isert_conn->conn;
+	struct iscsi_cmd *cmd = NULL;
+	struct iscsi_data *hdr = (struct iscsi_data *)buf;
+	u32 unsol_data_len = ntoh24(hdr->dlength);
+	int rc, sg_nents, sg_off, page_off;
+
+	rc = iscsit_check_dataout_hdr(conn, buf, &cmd);
+	if (rc < 0)
+		return rc;
+	else if (!cmd)
+		return 0;
+	/*
+	 * FIXME: Unexpected unsolicited_data out
+	 */
+	if (!cmd->unsolicited_data) {
+		pr_err("Received unexpected solicited data payload\n");
+		dump_stack();
+		return -1;
+	}
+
+	pr_debug("Unsolicited DataOut unsol_data_len: %u, write_data_done: %u, data_length: %u\n",
+		 unsol_data_len, cmd->write_data_done, cmd->se_cmd.data_length);
+
+	sg_off = cmd->write_data_done / PAGE_SIZE;
+	sg_start = &cmd->se_cmd.t_data_sg[sg_off];
+	sg_nents = max(1UL, DIV_ROUND_UP(unsol_data_len, PAGE_SIZE));
+	page_off = cmd->write_data_done % PAGE_SIZE;
+	/*
+	 * FIXME: Non page-aligned unsolicited_data out
+	 */
+	if (page_off) {
+		pr_err("Received unexpected non-page aligned data payload\n");
+		dump_stack();
+		return -1;
+	}
+	pr_debug("Copying DataOut: sg_start: %p, sg_off: %u sg_nents: %u from %p %u\n",
+		 sg_start, sg_off, sg_nents, &rx_desc->data[0], unsol_data_len);
+
+	sg_copy_from_buffer(sg_start, sg_nents, &rx_desc->data[0],
+			    unsol_data_len);
+
+	rc = iscsit_check_dataout_payload(cmd, hdr, false);
+	if (rc < 0)
+		return rc;
+
+	return 0;
+}
+
+static int
+isert_handle_nop_out(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd,
+		     struct iscsi_cmd *cmd, struct iser_rx_desc *rx_desc,
+		     unsigned char *buf)
+{
+	struct iscsi_conn *conn = isert_conn->conn;
+	struct iscsi_nopout *hdr = (struct iscsi_nopout *)buf;
+	int rc;
+
+	rc = iscsit_setup_nop_out(conn, cmd, hdr);
+	if (rc < 0)
+		return rc;
+	/*
+	 * FIXME: Add support for NOPOUT payload using unsolicited RDMA payload
+	 */
+
+	return iscsit_process_nop_out(conn, cmd, hdr);
+}
+
+static int
+isert_handle_text_cmd(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd,
+		      struct iscsi_cmd *cmd, struct iser_rx_desc *rx_desc,
+		      struct iscsi_text *hdr)
+{
+	struct iscsi_conn *conn = isert_conn->conn;
+	u32 payload_length = ntoh24(hdr->dlength);
+	int rc;
+	unsigned char *text_in;
+
+	rc = iscsit_setup_text_cmd(conn, cmd, hdr);
+	if (rc < 0)
+		return rc;
+
+	text_in = kzalloc(payload_length, GFP_KERNEL);
+	if (!text_in) {
+		pr_err("Unable to allocate text_in of payload_length: %u\n",
+		       payload_length);
+		return -ENOMEM;
+	}
+	cmd->text_in_ptr = text_in;
+
+	memcpy(cmd->text_in_ptr, &rx_desc->data[0], payload_length);
+
+	return iscsit_process_text_cmd(conn, cmd, hdr);
+}
+
+static int
+isert_rx_opcode(struct isert_conn *isert_conn, struct iser_rx_desc *rx_desc,
+		uint32_t read_stag, uint64_t read_va,
+		uint32_t write_stag, uint64_t write_va)
+{
+	struct iscsi_hdr *hdr = &rx_desc->iscsi_header;
+	struct iscsi_conn *conn = isert_conn->conn;
+	struct iscsi_session *sess = conn->sess;
+	struct iscsi_cmd *cmd;
+	struct isert_cmd *isert_cmd;
+	int ret = -EINVAL;
+	u8 opcode = (hdr->opcode & ISCSI_OPCODE_MASK);
+
+	if (sess->sess_ops->SessionType &&
+	   (!(opcode & ISCSI_OP_TEXT) || !(opcode & ISCSI_OP_LOGOUT))) {
+		pr_err("Got illegal opcode: 0x%02x in SessionType=Discovery,"
+		       " ignoring\n", opcode);
+		return 0;
+	}
+
+	switch (opcode) {
+	case ISCSI_OP_SCSI_CMD:
+		cmd = isert_allocate_cmd(conn);
+		if (!cmd)
+			break;
+
+		isert_cmd = iscsit_priv_cmd(cmd);
+		isert_cmd->read_stag = read_stag;
+		isert_cmd->read_va = read_va;
+		isert_cmd->write_stag = write_stag;
+		isert_cmd->write_va = write_va;
+
+		ret = isert_handle_scsi_cmd(isert_conn, isert_cmd, cmd,
+					rx_desc, (unsigned char *)hdr);
+		break;
+	case ISCSI_OP_NOOP_OUT:
+		cmd = isert_allocate_cmd(conn);
+		if (!cmd)
+			break;
+
+		isert_cmd = iscsit_priv_cmd(cmd);
+		ret = isert_handle_nop_out(isert_conn, isert_cmd, cmd,
+					   rx_desc, (unsigned char *)hdr);
+		break;
+	case ISCSI_OP_SCSI_DATA_OUT:
+		ret = isert_handle_iscsi_dataout(isert_conn, rx_desc,
+						(unsigned char *)hdr);
+		break;
+	case ISCSI_OP_SCSI_TMFUNC:
+		cmd = isert_allocate_cmd(conn);
+		if (!cmd)
+			break;
+
+		ret = iscsit_handle_task_mgt_cmd(conn, cmd,
+						(unsigned char *)hdr);
+		break;
+	case ISCSI_OP_LOGOUT:
+		cmd = isert_allocate_cmd(conn);
+		if (!cmd)
+			break;
+
+		ret = iscsit_handle_logout_cmd(conn, cmd, (unsigned char *)hdr);
+		if (ret > 0)
+			wait_for_completion_timeout(&conn->conn_logout_comp,
+						    SECONDS_FOR_LOGOUT_COMP *
+						    HZ);
+		break;
+	case ISCSI_OP_TEXT:
+		cmd = isert_allocate_cmd(conn);
+		if (!cmd)
+			break;
+
+		isert_cmd = iscsit_priv_cmd(cmd);
+		ret = isert_handle_text_cmd(isert_conn, isert_cmd, cmd,
+					    rx_desc, (struct iscsi_text *)hdr);
+		break;
+	default:
+		pr_err("Got unknown iSCSI OpCode: 0x%02x\n", opcode);
+		dump_stack();
+		break;
+	}
+
+	return ret;
+}
+
+static void
+isert_rx_do_work(struct iser_rx_desc *rx_desc, struct isert_conn *isert_conn)
+{
+	struct iser_hdr *iser_hdr = &rx_desc->iser_header;
+	uint64_t read_va = 0, write_va = 0;
+	uint32_t read_stag = 0, write_stag = 0;
+	int rc;
+
+	switch (iser_hdr->flags & 0xF0) {
+	case ISCSI_CTRL:
+		if (iser_hdr->flags & ISER_RSV) {
+			read_stag = be32_to_cpu(iser_hdr->read_stag);
+			read_va = be64_to_cpu(iser_hdr->read_va);
+			pr_debug("ISER_RSV: read_stag: 0x%08x read_va: 0x%16llx\n",
+				 read_stag, (unsigned long long)read_va);
+		}
+		if (iser_hdr->flags & ISER_WSV) {
+			write_stag = be32_to_cpu(iser_hdr->write_stag);
+			write_va = be64_to_cpu(iser_hdr->write_va);
+			pr_debug("ISER_WSV: write__stag: 0x%08x write_va: 0x%16llx\n",
+				 write_stag, (unsigned long long)write_va);
+		}
+
+		pr_debug("ISER ISCSI_CTRL PDU\n");
+		break;
+	case ISER_HELLO:
+		pr_err("iSER Hello message\n");
+		break;
+	default:
+		pr_warn("Unknown iSER hdr flags: 0x%02x\n", iser_hdr->flags);
+		break;
+	}
+
+	rc = isert_rx_opcode(isert_conn, rx_desc,
+			     read_stag, read_va, write_stag, write_va);
+}
+
+static void
+isert_rx_completion(struct iser_rx_desc *desc, struct isert_conn *isert_conn,
+		    unsigned long xfer_len)
+{
+	struct ib_device *ib_dev = isert_conn->conn_cm_id->device;
+	struct iscsi_hdr *hdr;
+	u64 rx_dma;
+	int rx_buflen, outstanding;
+
+	if ((char *)desc == isert_conn->login_req_buf) {
+		rx_dma = isert_conn->login_req_dma;
+		rx_buflen = ISER_RX_LOGIN_SIZE;
+		pr_debug("ISER login_buf: Using rx_dma: 0x%llx, rx_buflen: %d\n",
+			 rx_dma, rx_buflen);
+	} else {
+		rx_dma = desc->dma_addr;
+		rx_buflen = ISER_RX_PAYLOAD_SIZE;
+		pr_debug("ISER req_buf: Using rx_dma: 0x%llx, rx_buflen: %d\n",
+			 rx_dma, rx_buflen);
+	}
+
+	ib_dma_sync_single_for_cpu(ib_dev, rx_dma, rx_buflen, DMA_FROM_DEVICE);
+
+	hdr = &desc->iscsi_header;
+	pr_debug("iSCSI opcode: 0x%02x, ITT: 0x%08x, flags: 0x%02x dlen: %d\n",
+		 hdr->opcode, hdr->itt, hdr->flags,
+		 (int)(xfer_len - ISER_HEADERS_LEN));
+
+	if ((char *)desc == isert_conn->login_req_buf)
+		isert_rx_login_req(desc, xfer_len - ISER_HEADERS_LEN,
+				   isert_conn);
+	else
+		isert_rx_do_work(desc, isert_conn);
+
+	ib_dma_sync_single_for_device(ib_dev, rx_dma, rx_buflen,
+				      DMA_FROM_DEVICE);
+
+	isert_conn->post_recv_buf_count--;
+	pr_debug("iSERT: Decremented post_recv_buf_count: %d\n",
+		 isert_conn->post_recv_buf_count);
+
+	if ((char *)desc == isert_conn->login_req_buf)
+		return;
+
+	outstanding = isert_conn->post_recv_buf_count;
+	if (outstanding + ISERT_MIN_POSTED_RX <= ISERT_QP_MAX_RECV_DTOS) {
+		int err, count = min(ISERT_QP_MAX_RECV_DTOS - outstanding,
+				ISERT_MIN_POSTED_RX);
+		err = isert_post_recv(isert_conn, count);
+		if (err) {
+			pr_err("isert_post_recv() count: %d failed, %d\n",
+			       count, err);
+		}
+	}
+}
+
+static int
+isert_map_data_buf(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd,
+		   struct scatterlist *sg, u32 nents, u32 length, u32 offset,
+		   enum iser_ib_op_code op, struct isert_data_buf *data)
+{
+	struct ib_device *ib_dev = isert_conn->conn_cm_id->device;
+
+	data->dma_dir = op == ISER_IB_RDMA_WRITE ?
+			      DMA_TO_DEVICE : DMA_FROM_DEVICE;
+
+	data->len = length - offset;
+	data->offset = offset;
+	data->sg_off = data->offset / PAGE_SIZE;
+
+	data->sg = &sg[data->sg_off];
+	data->nents = min_t(unsigned int, nents - data->sg_off,
+					  ISCSI_ISER_SG_TABLESIZE);
+	data->len = min_t(unsigned int, data->len, ISCSI_ISER_SG_TABLESIZE *
+					PAGE_SIZE);
+
+	data->dma_nents = ib_dma_map_sg(ib_dev, data->sg, data->nents,
+					data->dma_dir);
+	if (unlikely(!data->dma_nents)) {
+		pr_err("Cmd: unable to dma map SGs %p\n", sg);
+		return -EINVAL;
+	}
+
+	pr_debug("Mapped cmd: %p count: %u sg: %p sg_nents: %u rdma_len %d\n",
+		 isert_cmd, data->dma_nents, data->sg, data->nents, data->len);
+
+	return 0;
+}
+
+static void
+isert_unmap_data_buf(struct isert_conn *isert_conn, struct isert_data_buf *data)
+{
+	struct ib_device *ib_dev = isert_conn->conn_cm_id->device;
+
+	ib_dma_unmap_sg(ib_dev, data->sg, data->nents, data->dma_dir);
+	memset(data, 0, sizeof(*data));
+}
+
+
+
+static void
+isert_unmap_cmd(struct isert_cmd *isert_cmd, struct isert_conn *isert_conn)
+{
+	struct isert_rdma_wr *wr = &isert_cmd->rdma_wr;
+
+	pr_debug("isert_unmap_cmd: %p\n", isert_cmd);
+
+	if (wr->data.sg) {
+		pr_debug("isert_unmap_cmd: %p unmap_sg op\n", isert_cmd);
+		isert_unmap_data_buf(isert_conn, &wr->data);
+	}
+
+	if (wr->send_wr) {
+		pr_debug("isert_unmap_cmd: %p free send_wr\n", isert_cmd);
+		kfree(wr->send_wr);
+		wr->send_wr = NULL;
+	}
+
+	if (wr->ib_sge) {
+		pr_debug("isert_unmap_cmd: %p free ib_sge\n", isert_cmd);
+		kfree(wr->ib_sge);
+		wr->ib_sge = NULL;
+	}
+}
+
+static void
+isert_unreg_rdma(struct isert_cmd *isert_cmd, struct isert_conn *isert_conn)
+{
+	struct isert_rdma_wr *wr = &isert_cmd->rdma_wr;
+	LIST_HEAD(unmap_list);
+
+	pr_debug("unreg_fastreg_cmd: %p\n", isert_cmd);
+
+	if (wr->fr_desc) {
+		pr_debug("unreg_fastreg_cmd: %p free fr_desc %p\n",
+			 isert_cmd, wr->fr_desc);
+		if (wr->fr_desc->ind & ISERT_PROTECTED) {
+			isert_unmap_data_buf(isert_conn, &wr->prot);
+			wr->fr_desc->ind &= ~ISERT_PROTECTED;
+		}
+		spin_lock_bh(&isert_conn->conn_lock);
+		list_add_tail(&wr->fr_desc->list, &isert_conn->conn_fr_pool);
+		spin_unlock_bh(&isert_conn->conn_lock);
+		wr->fr_desc = NULL;
+	}
+
+	if (wr->data.sg) {
+		pr_debug("unreg_fastreg_cmd: %p unmap_sg op\n", isert_cmd);
+		isert_unmap_data_buf(isert_conn, &wr->data);
+	}
+
+	wr->ib_sge = NULL;
+	wr->send_wr = NULL;
+}
+
+static void
+isert_put_cmd(struct isert_cmd *isert_cmd, bool comp_err)
+{
+	struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd;
+	struct isert_conn *isert_conn = isert_cmd->conn;
+	struct iscsi_conn *conn = isert_conn->conn;
+	struct isert_device *device = isert_conn->conn_device;
+
+	pr_debug("Entering isert_put_cmd: %p\n", isert_cmd);
+
+	switch (cmd->iscsi_opcode) {
+	case ISCSI_OP_SCSI_CMD:
+		spin_lock_bh(&conn->cmd_lock);
+		if (!list_empty(&cmd->i_conn_node))
+			list_del_init(&cmd->i_conn_node);
+		spin_unlock_bh(&conn->cmd_lock);
+
+		if (cmd->data_direction == DMA_TO_DEVICE) {
+			iscsit_stop_dataout_timer(cmd);
+			/*
+			 * Check for special case during comp_err where
+			 * WRITE_PENDING has been handed off from core,
+			 * but requires an extra target_put_sess_cmd()
+			 * before transport_generic_free_cmd() below.
+			 */
+			if (comp_err &&
+			    cmd->se_cmd.t_state == TRANSPORT_WRITE_PENDING) {
+				struct se_cmd *se_cmd = &cmd->se_cmd;
+
+				target_put_sess_cmd(se_cmd->se_sess, se_cmd);
+			}
+		}
+
+		device->unreg_rdma_mem(isert_cmd, isert_conn);
+		transport_generic_free_cmd(&cmd->se_cmd, 0);
+		break;
+	case ISCSI_OP_SCSI_TMFUNC:
+		spin_lock_bh(&conn->cmd_lock);
+		if (!list_empty(&cmd->i_conn_node))
+			list_del_init(&cmd->i_conn_node);
+		spin_unlock_bh(&conn->cmd_lock);
+
+		transport_generic_free_cmd(&cmd->se_cmd, 0);
+		break;
+	case ISCSI_OP_REJECT:
+	case ISCSI_OP_NOOP_OUT:
+	case ISCSI_OP_TEXT:
+		spin_lock_bh(&conn->cmd_lock);
+		if (!list_empty(&cmd->i_conn_node))
+			list_del_init(&cmd->i_conn_node);
+		spin_unlock_bh(&conn->cmd_lock);
+
+		/*
+		 * Handle special case for REJECT when iscsi_add_reject*() has
+		 * overwritten the original iscsi_opcode assignment, and the
+		 * associated cmd->se_cmd needs to be released.
+		 */
+		if (cmd->se_cmd.se_tfo != NULL) {
+			pr_debug("Calling transport_generic_free_cmd from"
+				 " isert_put_cmd for 0x%02x\n",
+				 cmd->iscsi_opcode);
+			transport_generic_free_cmd(&cmd->se_cmd, 0);
+			break;
+		}
+		/*
+		 * Fall-through
+		 */
+	default:
+		iscsit_release_cmd(cmd);
+		break;
+	}
+}
+
+static void
+isert_unmap_tx_desc(struct iser_tx_desc *tx_desc, struct ib_device *ib_dev)
+{
+	if (tx_desc->dma_addr != 0) {
+		pr_debug("Calling ib_dma_unmap_single for tx_desc->dma_addr\n");
+		ib_dma_unmap_single(ib_dev, tx_desc->dma_addr,
+				    ISER_HEADERS_LEN, DMA_TO_DEVICE);
+		tx_desc->dma_addr = 0;
+	}
+}
+
+static void
+isert_completion_put(struct iser_tx_desc *tx_desc, struct isert_cmd *isert_cmd,
+		     struct ib_device *ib_dev, bool comp_err)
+{
+	if (isert_cmd->pdu_buf_dma != 0) {
+		pr_debug("Calling ib_dma_unmap_single for isert_cmd->pdu_buf_dma\n");
+		ib_dma_unmap_single(ib_dev, isert_cmd->pdu_buf_dma,
+				    isert_cmd->pdu_buf_len, DMA_TO_DEVICE);
+		isert_cmd->pdu_buf_dma = 0;
+	}
+
+	isert_unmap_tx_desc(tx_desc, ib_dev);
+	isert_put_cmd(isert_cmd, comp_err);
+}
+
+static int
+isert_check_pi_status(struct se_cmd *se_cmd, struct ib_mr *sig_mr)
+{
+	struct ib_mr_status mr_status;
+	int ret;
+
+	ret = ib_check_mr_status(sig_mr, IB_MR_CHECK_SIG_STATUS, &mr_status);
+	if (ret) {
+		pr_err("ib_check_mr_status failed, ret %d\n", ret);
+		goto fail_mr_status;
+	}
+
+	if (mr_status.fail_status & IB_MR_CHECK_SIG_STATUS) {
+		u64 sec_offset_err;
+		u32 block_size = se_cmd->se_dev->dev_attrib.block_size + 8;
+
+		switch (mr_status.sig_err.err_type) {
+		case IB_SIG_BAD_GUARD:
+			se_cmd->pi_err = TCM_LOGICAL_BLOCK_GUARD_CHECK_FAILED;
+			break;
+		case IB_SIG_BAD_REFTAG:
+			se_cmd->pi_err = TCM_LOGICAL_BLOCK_REF_TAG_CHECK_FAILED;
+			break;
+		case IB_SIG_BAD_APPTAG:
+			se_cmd->pi_err = TCM_LOGICAL_BLOCK_APP_TAG_CHECK_FAILED;
+			break;
+		}
+		sec_offset_err = mr_status.sig_err.sig_err_offset;
+		do_div(sec_offset_err, block_size);
+		se_cmd->bad_sector = sec_offset_err + se_cmd->t_task_lba;
+
+		pr_err("isert: PI error found type %d at sector 0x%llx "
+		       "expected 0x%x vs actual 0x%x\n",
+		       mr_status.sig_err.err_type,
+		       (unsigned long long)se_cmd->bad_sector,
+		       mr_status.sig_err.expected,
+		       mr_status.sig_err.actual);
+		ret = 1;
+	}
+
+fail_mr_status:
+	return ret;
+}
+
+static void
+isert_completion_rdma_write(struct iser_tx_desc *tx_desc,
+			    struct isert_cmd *isert_cmd)
+{
+	struct isert_rdma_wr *wr = &isert_cmd->rdma_wr;
+	struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd;
+	struct se_cmd *se_cmd = &cmd->se_cmd;
+	struct isert_conn *isert_conn = isert_cmd->conn;
+	struct isert_device *device = isert_conn->conn_device;
+	int ret = 0;
+
+	if (wr->fr_desc && wr->fr_desc->ind & ISERT_PROTECTED) {
+		ret = isert_check_pi_status(se_cmd,
+					    wr->fr_desc->pi_ctx->sig_mr);
+		wr->fr_desc->ind &= ~ISERT_PROTECTED;
+	}
+
+	device->unreg_rdma_mem(isert_cmd, isert_conn);
+	wr->send_wr_num = 0;
+	if (ret)
+		transport_send_check_condition_and_sense(se_cmd,
+							 se_cmd->pi_err, 0);
+	else
+		isert_put_response(isert_conn->conn, cmd);
+}
+
+static void
+isert_completion_rdma_read(struct iser_tx_desc *tx_desc,
+			   struct isert_cmd *isert_cmd)
+{
+	struct isert_rdma_wr *wr = &isert_cmd->rdma_wr;
+	struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd;
+	struct se_cmd *se_cmd = &cmd->se_cmd;
+	struct isert_conn *isert_conn = isert_cmd->conn;
+	struct isert_device *device = isert_conn->conn_device;
+	int ret = 0;
+
+	if (wr->fr_desc && wr->fr_desc->ind & ISERT_PROTECTED) {
+		ret = isert_check_pi_status(se_cmd,
+					    wr->fr_desc->pi_ctx->sig_mr);
+		wr->fr_desc->ind &= ~ISERT_PROTECTED;
+	}
+
+	iscsit_stop_dataout_timer(cmd);
+	device->unreg_rdma_mem(isert_cmd, isert_conn);
+	cmd->write_data_done = wr->data.len;
+	wr->send_wr_num = 0;
+
+	pr_debug("Cmd: %p RDMA_READ comp calling execute_cmd\n", isert_cmd);
+	spin_lock_bh(&cmd->istate_lock);
+	cmd->cmd_flags |= ICF_GOT_LAST_DATAOUT;
+	cmd->i_state = ISTATE_RECEIVED_LAST_DATAOUT;
+	spin_unlock_bh(&cmd->istate_lock);
+
+	if (ret)
+		transport_send_check_condition_and_sense(se_cmd,
+							 se_cmd->pi_err, 0);
+	else
+		target_execute_cmd(se_cmd);
+}
+
+static void
+isert_do_control_comp(struct work_struct *work)
+{
+	struct isert_cmd *isert_cmd = container_of(work,
+			struct isert_cmd, comp_work);
+	struct isert_conn *isert_conn = isert_cmd->conn;
+	struct ib_device *ib_dev = isert_conn->conn_cm_id->device;
+	struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd;
+
+	switch (cmd->i_state) {
+	case ISTATE_SEND_TASKMGTRSP:
+		pr_debug("Calling iscsit_tmr_post_handler >>>>>>>>>>>>>>>>>\n");
+
+		atomic_dec(&isert_conn->post_send_buf_count);
+		iscsit_tmr_post_handler(cmd, cmd->conn);
+
+		cmd->i_state = ISTATE_SENT_STATUS;
+		isert_completion_put(&isert_cmd->tx_desc, isert_cmd, ib_dev, false);
+		break;
+	case ISTATE_SEND_REJECT:
+		pr_debug("Got isert_do_control_comp ISTATE_SEND_REJECT: >>>\n");
+		atomic_dec(&isert_conn->post_send_buf_count);
+
+		cmd->i_state = ISTATE_SENT_STATUS;
+		isert_completion_put(&isert_cmd->tx_desc, isert_cmd, ib_dev, false);
+		break;
+	case ISTATE_SEND_LOGOUTRSP:
+		pr_debug("Calling iscsit_logout_post_handler >>>>>>>>>>>>>>\n");
+
+		atomic_dec(&isert_conn->post_send_buf_count);
+		iscsit_logout_post_handler(cmd, cmd->conn);
+		break;
+	case ISTATE_SEND_TEXTRSP:
+		atomic_dec(&isert_conn->post_send_buf_count);
+		cmd->i_state = ISTATE_SENT_STATUS;
+		isert_completion_put(&isert_cmd->tx_desc, isert_cmd, ib_dev, false);
+		break;
+	default:
+		pr_err("Unknown do_control_comp i_state %d\n", cmd->i_state);
+		dump_stack();
+		break;
+	}
+}
+
+static void
+isert_response_completion(struct iser_tx_desc *tx_desc,
+			  struct isert_cmd *isert_cmd,
+			  struct isert_conn *isert_conn,
+			  struct ib_device *ib_dev)
+{
+	struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd;
+	struct isert_rdma_wr *wr = &isert_cmd->rdma_wr;
+
+	if (cmd->i_state == ISTATE_SEND_TASKMGTRSP ||
+	    cmd->i_state == ISTATE_SEND_LOGOUTRSP ||
+	    cmd->i_state == ISTATE_SEND_REJECT ||
+	    cmd->i_state == ISTATE_SEND_TEXTRSP) {
+		isert_unmap_tx_desc(tx_desc, ib_dev);
+
+		INIT_WORK(&isert_cmd->comp_work, isert_do_control_comp);
+		queue_work(isert_comp_wq, &isert_cmd->comp_work);
+		return;
+	}
+
+	/**
+	 * If send_wr_num is 0 this means that we got
+	 * RDMA completion and we cleared it and we should
+	 * simply decrement the response post. else the
+	 * response is incorporated in send_wr_num, just
+	 * sub it.
+	 **/
+	if (wr->send_wr_num)
+		atomic_sub(wr->send_wr_num, &isert_conn->post_send_buf_count);
+	else
+		atomic_dec(&isert_conn->post_send_buf_count);
+
+	cmd->i_state = ISTATE_SENT_STATUS;
+	isert_completion_put(tx_desc, isert_cmd, ib_dev, false);
+}
+
+static void
+__isert_send_completion(struct iser_tx_desc *tx_desc,
+		        struct isert_conn *isert_conn)
+{
+	struct ib_device *ib_dev = isert_conn->conn_cm_id->device;
+	struct isert_cmd *isert_cmd = tx_desc->isert_cmd;
+	struct isert_rdma_wr *wr;
+
+	if (!isert_cmd) {
+		atomic_dec(&isert_conn->post_send_buf_count);
+		isert_unmap_tx_desc(tx_desc, ib_dev);
+		return;
+	}
+	wr = &isert_cmd->rdma_wr;
+
+	switch (wr->iser_ib_op) {
+	case ISER_IB_RECV:
+		pr_err("isert_send_completion: Got ISER_IB_RECV\n");
+		dump_stack();
+		break;
+	case ISER_IB_SEND:
+		pr_debug("isert_send_completion: Got ISER_IB_SEND\n");
+		isert_response_completion(tx_desc, isert_cmd,
+					  isert_conn, ib_dev);
+		break;
+	case ISER_IB_RDMA_WRITE:
+		pr_debug("isert_send_completion: Got ISER_IB_RDMA_WRITE\n");
+		atomic_sub(wr->send_wr_num, &isert_conn->post_send_buf_count);
+		isert_completion_rdma_write(tx_desc, isert_cmd);
+		break;
+	case ISER_IB_RDMA_READ:
+		pr_debug("isert_send_completion: Got ISER_IB_RDMA_READ:\n");
+
+		atomic_sub(wr->send_wr_num, &isert_conn->post_send_buf_count);
+		isert_completion_rdma_read(tx_desc, isert_cmd);
+		break;
+	default:
+		pr_err("Unknown wr->iser_ib_op: 0x%02x\n", wr->iser_ib_op);
+		dump_stack();
+		break;
+	}
+}
+
+static void
+isert_send_completion(struct iser_tx_desc *tx_desc,
+		      struct isert_conn *isert_conn)
+{
+	struct llist_node *llnode = tx_desc->comp_llnode_batch;
+	struct iser_tx_desc *t;
+	/*
+	 * Drain coalesced completion llist starting from comp_llnode_batch
+	 * setup in isert_init_send_wr(), and then complete trailing tx_desc.
+	 */
+	while (llnode) {
+		t = llist_entry(llnode, struct iser_tx_desc, comp_llnode);
+		llnode = llist_next(llnode);
+		__isert_send_completion(t, isert_conn);
+	}
+	__isert_send_completion(tx_desc, isert_conn);
+}
+
+static void
+isert_cq_drain_comp_llist(struct isert_conn *isert_conn, struct ib_device *ib_dev)
+{
+	struct llist_node *llnode;
+	struct isert_rdma_wr *wr;
+	struct iser_tx_desc *t;
+
+	mutex_lock(&isert_conn->conn_mutex);
+	llnode = llist_del_all(&isert_conn->conn_comp_llist);
+	isert_conn->conn_comp_batch = 0;
+	mutex_unlock(&isert_conn->conn_mutex);
+
+	while (llnode) {
+		t = llist_entry(llnode, struct iser_tx_desc, comp_llnode);
+		llnode = llist_next(llnode);
+		wr = &t->isert_cmd->rdma_wr;
+
+		/**
+		 * If send_wr_num is 0 this means that we got
+		 * RDMA completion and we cleared it and we should
+		 * simply decrement the response post. else the
+		 * response is incorporated in send_wr_num, just
+		 * sub it.
+		 **/
+		if (wr->send_wr_num)
+			atomic_sub(wr->send_wr_num,
+				   &isert_conn->post_send_buf_count);
+		else
+			atomic_dec(&isert_conn->post_send_buf_count);
+
+		isert_completion_put(t, t->isert_cmd, ib_dev, true);
+	}
+}
+
+static void
+isert_cq_tx_comp_err(struct iser_tx_desc *tx_desc, struct isert_conn *isert_conn)
+{
+	struct ib_device *ib_dev = isert_conn->conn_cm_id->device;
+	struct isert_cmd *isert_cmd = tx_desc->isert_cmd;
+	struct llist_node *llnode = tx_desc->comp_llnode_batch;
+	struct isert_rdma_wr *wr;
+	struct iser_tx_desc *t;
+
+	while (llnode) {
+		t = llist_entry(llnode, struct iser_tx_desc, comp_llnode);
+		llnode = llist_next(llnode);
+		wr = &t->isert_cmd->rdma_wr;
+
+		/**
+		 * If send_wr_num is 0 this means that we got
+		 * RDMA completion and we cleared it and we should
+		 * simply decrement the response post. else the
+		 * response is incorporated in send_wr_num, just
+		 * sub it.
+		 **/
+		if (wr->send_wr_num)
+			atomic_sub(wr->send_wr_num,
+				   &isert_conn->post_send_buf_count);
+		else
+			atomic_dec(&isert_conn->post_send_buf_count);
+
+		isert_completion_put(t, t->isert_cmd, ib_dev, true);
+	}
+	tx_desc->comp_llnode_batch = NULL;
+
+	if (!isert_cmd)
+		isert_unmap_tx_desc(tx_desc, ib_dev);
+	else
+		isert_completion_put(tx_desc, isert_cmd, ib_dev, true);
+}
+
+static void
+isert_cq_rx_comp_err(struct isert_conn *isert_conn)
+{
+	struct ib_device *ib_dev = isert_conn->conn_cm_id->device;
+	struct iscsi_conn *conn = isert_conn->conn;
+
+	if (isert_conn->post_recv_buf_count)
+		return;
+
+	isert_cq_drain_comp_llist(isert_conn, ib_dev);
+
+	if (conn->sess) {
+		target_sess_cmd_list_set_waiting(conn->sess->se_sess);
+		target_wait_for_sess_cmds(conn->sess->se_sess);
+	}
+
+	while (atomic_read(&isert_conn->post_send_buf_count))
+		msleep(3000);
+
+	mutex_lock(&isert_conn->conn_mutex);
+	isert_conn->state = ISER_CONN_DOWN;
+	mutex_unlock(&isert_conn->conn_mutex);
+
+	iscsit_cause_connection_reinstatement(isert_conn->conn, 0);
+
+	complete(&isert_conn->conn_wait_comp_err);
+}
+
+static void
+isert_cq_tx_work(struct work_struct *work)
+{
+	struct isert_cq_desc *cq_desc = container_of(work,
+				struct isert_cq_desc, cq_tx_work);
+	struct isert_device *device = cq_desc->device;
+	int cq_index = cq_desc->cq_index;
+	struct ib_cq *tx_cq = device->dev_tx_cq[cq_index];
+	struct isert_conn *isert_conn;
+	struct iser_tx_desc *tx_desc;
+	struct ib_wc wc;
+
+	while (ib_poll_cq(tx_cq, 1, &wc) == 1) {
+		tx_desc = (struct iser_tx_desc *)(unsigned long)wc.wr_id;
+		isert_conn = wc.qp->qp_context;
+
+		if (wc.status == IB_WC_SUCCESS) {
+			isert_send_completion(tx_desc, isert_conn);
+		} else {
+			pr_debug("TX wc.status != IB_WC_SUCCESS >>>>>>>>>>>>>>\n");
+			pr_debug("TX wc.status: 0x%08x\n", wc.status);
+			pr_debug("TX wc.vendor_err: 0x%08x\n", wc.vendor_err);
+
+			if (wc.wr_id != ISER_FASTREG_LI_WRID) {
+				if (tx_desc->llnode_active)
+					continue;
+
+				atomic_dec(&isert_conn->post_send_buf_count);
+				isert_cq_tx_comp_err(tx_desc, isert_conn);
+			}
+		}
+	}
+
+	ib_req_notify_cq(tx_cq, IB_CQ_NEXT_COMP);
+}
+
+static void
+isert_cq_tx_callback(struct ib_cq *cq, void *context)
+{
+	struct isert_cq_desc *cq_desc = (struct isert_cq_desc *)context;
+
+	queue_work(isert_comp_wq, &cq_desc->cq_tx_work);
+}
+
+static void
+isert_cq_rx_work(struct work_struct *work)
+{
+	struct isert_cq_desc *cq_desc = container_of(work,
+			struct isert_cq_desc, cq_rx_work);
+	struct isert_device *device = cq_desc->device;
+	int cq_index = cq_desc->cq_index;
+	struct ib_cq *rx_cq = device->dev_rx_cq[cq_index];
+	struct isert_conn *isert_conn;
+	struct iser_rx_desc *rx_desc;
+	struct ib_wc wc;
+	unsigned long xfer_len;
+
+	while (ib_poll_cq(rx_cq, 1, &wc) == 1) {
+		rx_desc = (struct iser_rx_desc *)(unsigned long)wc.wr_id;
+		isert_conn = wc.qp->qp_context;
+
+		if (wc.status == IB_WC_SUCCESS) {
+			xfer_len = (unsigned long)wc.byte_len;
+			isert_rx_completion(rx_desc, isert_conn, xfer_len);
+		} else {
+			pr_debug("RX wc.status != IB_WC_SUCCESS >>>>>>>>>>>>>>\n");
+			if (wc.status != IB_WC_WR_FLUSH_ERR) {
+				pr_debug("RX wc.status: 0x%08x\n", wc.status);
+				pr_debug("RX wc.vendor_err: 0x%08x\n",
+					 wc.vendor_err);
+			}
+			isert_conn->post_recv_buf_count--;
+			isert_cq_rx_comp_err(isert_conn);
+		}
+	}
+
+	ib_req_notify_cq(rx_cq, IB_CQ_NEXT_COMP);
+}
+
+static void
+isert_cq_rx_callback(struct ib_cq *cq, void *context)
+{
+	struct isert_cq_desc *cq_desc = (struct isert_cq_desc *)context;
+
+	queue_work(isert_rx_wq, &cq_desc->cq_rx_work);
+}
+
+static int
+isert_post_response(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd)
+{
+	struct ib_send_wr *wr_failed;
+	int ret;
+
+	atomic_inc(&isert_conn->post_send_buf_count);
+
+	ret = ib_post_send(isert_conn->conn_qp, &isert_cmd->tx_desc.send_wr,
+			   &wr_failed);
+	if (ret) {
+		pr_err("ib_post_send failed with %d\n", ret);
+		atomic_dec(&isert_conn->post_send_buf_count);
+		return ret;
+	}
+	return ret;
+}
+
+static int
+isert_put_response(struct iscsi_conn *conn, struct iscsi_cmd *cmd)
+{
+	struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd);
+	struct isert_conn *isert_conn = (struct isert_conn *)conn->context;
+	struct ib_send_wr *send_wr = &isert_cmd->tx_desc.send_wr;
+	struct iscsi_scsi_rsp *hdr = (struct iscsi_scsi_rsp *)
+				&isert_cmd->tx_desc.iscsi_header;
+
+	isert_create_send_desc(isert_conn, isert_cmd, &isert_cmd->tx_desc);
+	iscsit_build_rsp_pdu(cmd, conn, true, hdr);
+	isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc);
+	/*
+	 * Attach SENSE DATA payload to iSCSI Response PDU
+	 */
+	if (cmd->se_cmd.sense_buffer &&
+	    ((cmd->se_cmd.se_cmd_flags & SCF_TRANSPORT_TASK_SENSE) ||
+	    (cmd->se_cmd.se_cmd_flags & SCF_EMULATED_TASK_SENSE))) {
+		struct ib_device *ib_dev = isert_conn->conn_cm_id->device;
+		struct ib_sge *tx_dsg = &isert_cmd->tx_desc.tx_sg[1];
+		u32 padding, pdu_len;
+
+		put_unaligned_be16(cmd->se_cmd.scsi_sense_length,
+				   cmd->sense_buffer);
+		cmd->se_cmd.scsi_sense_length += sizeof(__be16);
+
+		padding = -(cmd->se_cmd.scsi_sense_length) & 3;
+		hton24(hdr->dlength, (u32)cmd->se_cmd.scsi_sense_length);
+		pdu_len = cmd->se_cmd.scsi_sense_length + padding;
+
+		isert_cmd->pdu_buf_dma = ib_dma_map_single(ib_dev,
+				(void *)cmd->sense_buffer, pdu_len,
+				DMA_TO_DEVICE);
+
+		isert_cmd->pdu_buf_len = pdu_len;
+		tx_dsg->addr	= isert_cmd->pdu_buf_dma;
+		tx_dsg->length	= pdu_len;
+		tx_dsg->lkey	= isert_conn->conn_mr->lkey;
+		isert_cmd->tx_desc.num_sge = 2;
+	}
+
+	isert_init_send_wr(isert_conn, isert_cmd, send_wr, false);
+
+	pr_debug("Posting SCSI Response IB_WR_SEND >>>>>>>>>>>>>>>>>>>>>>\n");
+
+	return isert_post_response(isert_conn, isert_cmd);
+}
+
+static void
+isert_aborted_task(struct iscsi_conn *conn, struct iscsi_cmd *cmd)
+{
+	struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd);
+	struct isert_conn *isert_conn = (struct isert_conn *)conn->context;
+	struct isert_device *device = isert_conn->conn_device;
+
+	spin_lock_bh(&conn->cmd_lock);
+	if (!list_empty(&cmd->i_conn_node))
+		list_del_init(&cmd->i_conn_node);
+	spin_unlock_bh(&conn->cmd_lock);
+
+	if (cmd->data_direction == DMA_TO_DEVICE)
+		iscsit_stop_dataout_timer(cmd);
+
+	device->unreg_rdma_mem(isert_cmd, isert_conn);
+}
+
+static enum target_prot_op
+isert_get_sup_prot_ops(struct iscsi_conn *conn)
+{
+	struct isert_conn *isert_conn = (struct isert_conn *)conn->context;
+	struct isert_device *device = isert_conn->conn_device;
+
+	if (device->pi_capable)
+		return TARGET_PROT_ALL;
+
+	return TARGET_PROT_NORMAL;
+}
+
+static int
+isert_put_nopin(struct iscsi_cmd *cmd, struct iscsi_conn *conn,
+		bool nopout_response)
+{
+	struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd);
+	struct isert_conn *isert_conn = (struct isert_conn *)conn->context;
+	struct ib_send_wr *send_wr = &isert_cmd->tx_desc.send_wr;
+
+	isert_create_send_desc(isert_conn, isert_cmd, &isert_cmd->tx_desc);
+	iscsit_build_nopin_rsp(cmd, conn, (struct iscsi_nopin *)
+			       &isert_cmd->tx_desc.iscsi_header,
+			       nopout_response);
+	isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc);
+	isert_init_send_wr(isert_conn, isert_cmd, send_wr, false);
+
+	pr_debug("Posting NOPIN Response IB_WR_SEND >>>>>>>>>>>>>>>>>>>>>>\n");
+
+	return isert_post_response(isert_conn, isert_cmd);
+}
+
+static int
+isert_put_logout_rsp(struct iscsi_cmd *cmd, struct iscsi_conn *conn)
+{
+	struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd);
+	struct isert_conn *isert_conn = (struct isert_conn *)conn->context;
+	struct ib_send_wr *send_wr = &isert_cmd->tx_desc.send_wr;
+
+	isert_create_send_desc(isert_conn, isert_cmd, &isert_cmd->tx_desc);
+	iscsit_build_logout_rsp(cmd, conn, (struct iscsi_logout_rsp *)
+				&isert_cmd->tx_desc.iscsi_header);
+	isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc);
+	isert_init_send_wr(isert_conn, isert_cmd, send_wr, false);
+
+	pr_debug("Posting Logout Response IB_WR_SEND >>>>>>>>>>>>>>>>>>>>>>\n");
+
+	return isert_post_response(isert_conn, isert_cmd);
+}
+
+static int
+isert_put_tm_rsp(struct iscsi_cmd *cmd, struct iscsi_conn *conn)
+{
+	struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd);
+	struct isert_conn *isert_conn = (struct isert_conn *)conn->context;
+	struct ib_send_wr *send_wr = &isert_cmd->tx_desc.send_wr;
+
+	isert_create_send_desc(isert_conn, isert_cmd, &isert_cmd->tx_desc);
+	iscsit_build_task_mgt_rsp(cmd, conn, (struct iscsi_tm_rsp *)
+				  &isert_cmd->tx_desc.iscsi_header);
+	isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc);
+	isert_init_send_wr(isert_conn, isert_cmd, send_wr, false);
+
+	pr_debug("Posting Task Management Response IB_WR_SEND >>>>>>>>>>>>>>>>>>>>>>\n");
+
+	return isert_post_response(isert_conn, isert_cmd);
+}
+
+static int
+isert_put_reject(struct iscsi_cmd *cmd, struct iscsi_conn *conn)
+{
+	struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd);
+	struct isert_conn *isert_conn = (struct isert_conn *)conn->context;
+	struct ib_send_wr *send_wr = &isert_cmd->tx_desc.send_wr;
+	struct ib_device *ib_dev = isert_conn->conn_cm_id->device;
+	struct ib_sge *tx_dsg = &isert_cmd->tx_desc.tx_sg[1];
+	struct iscsi_reject *hdr =
+		(struct iscsi_reject *)&isert_cmd->tx_desc.iscsi_header;
+
+	isert_create_send_desc(isert_conn, isert_cmd, &isert_cmd->tx_desc);
+	iscsit_build_reject(cmd, conn, hdr);
+	isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc);
+
+	hton24(hdr->dlength, ISCSI_HDR_LEN);
+	isert_cmd->pdu_buf_dma = ib_dma_map_single(ib_dev,
+			(void *)cmd->buf_ptr, ISCSI_HDR_LEN,
+			DMA_TO_DEVICE);
+	isert_cmd->pdu_buf_len = ISCSI_HDR_LEN;
+	tx_dsg->addr	= isert_cmd->pdu_buf_dma;
+	tx_dsg->length	= ISCSI_HDR_LEN;
+	tx_dsg->lkey	= isert_conn->conn_mr->lkey;
+	isert_cmd->tx_desc.num_sge = 2;
+
+	isert_init_send_wr(isert_conn, isert_cmd, send_wr, false);
+
+	pr_debug("Posting Reject IB_WR_SEND >>>>>>>>>>>>>>>>>>>>>>\n");
+
+	return isert_post_response(isert_conn, isert_cmd);
+}
+
+static int
+isert_put_text_rsp(struct iscsi_cmd *cmd, struct iscsi_conn *conn)
+{
+	struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd);
+	struct isert_conn *isert_conn = (struct isert_conn *)conn->context;
+	struct ib_send_wr *send_wr = &isert_cmd->tx_desc.send_wr;
+	struct iscsi_text_rsp *hdr =
+		(struct iscsi_text_rsp *)&isert_cmd->tx_desc.iscsi_header;
+	u32 txt_rsp_len;
+	int rc;
+
+	isert_create_send_desc(isert_conn, isert_cmd, &isert_cmd->tx_desc);
+	rc = iscsit_build_text_rsp(cmd, conn, hdr, ISCSI_INFINIBAND);
+	if (rc < 0)
+		return rc;
+
+	txt_rsp_len = rc;
+	isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc);
+
+	if (txt_rsp_len) {
+		struct ib_device *ib_dev = isert_conn->conn_cm_id->device;
+		struct ib_sge *tx_dsg = &isert_cmd->tx_desc.tx_sg[1];
+		void *txt_rsp_buf = cmd->buf_ptr;
+
+		isert_cmd->pdu_buf_dma = ib_dma_map_single(ib_dev,
+				txt_rsp_buf, txt_rsp_len, DMA_TO_DEVICE);
+
+		isert_cmd->pdu_buf_len = txt_rsp_len;
+		tx_dsg->addr	= isert_cmd->pdu_buf_dma;
+		tx_dsg->length	= txt_rsp_len;
+		tx_dsg->lkey	= isert_conn->conn_mr->lkey;
+		isert_cmd->tx_desc.num_sge = 2;
+	}
+	isert_init_send_wr(isert_conn, isert_cmd, send_wr, false);
+
+	pr_debug("Posting Text Response IB_WR_SEND >>>>>>>>>>>>>>>>>>>>>>\n");
+
+	return isert_post_response(isert_conn, isert_cmd);
+}
+
+static int
+isert_build_rdma_wr(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd,
+		    struct ib_sge *ib_sge, struct ib_send_wr *send_wr,
+		    u32 data_left, u32 offset)
+{
+	struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd;
+	struct scatterlist *sg_start, *tmp_sg;
+	struct ib_device *ib_dev = isert_conn->conn_cm_id->device;
+	u32 sg_off, page_off;
+	int i = 0, sg_nents;
+
+	sg_off = offset / PAGE_SIZE;
+	sg_start = &cmd->se_cmd.t_data_sg[sg_off];
+	sg_nents = min(cmd->se_cmd.t_data_nents - sg_off, isert_conn->max_sge);
+	page_off = offset % PAGE_SIZE;
+
+	send_wr->sg_list = ib_sge;
+	send_wr->num_sge = sg_nents;
+	send_wr->wr_id = (unsigned long)&isert_cmd->tx_desc;
+	/*
+	 * Perform mapping of TCM scatterlist memory ib_sge dma_addr.
+	 */
+	for_each_sg(sg_start, tmp_sg, sg_nents, i) {
+		pr_debug("ISER RDMA from SGL dma_addr: 0x%16llx dma_len: %u, page_off: %u\n",
+			 (unsigned long long)tmp_sg->dma_address,
+			 tmp_sg->length, page_off);
+
+		ib_sge->addr = ib_sg_dma_address(ib_dev, tmp_sg) + page_off;
+		ib_sge->length = min_t(u32, data_left,
+				ib_sg_dma_len(ib_dev, tmp_sg) - page_off);
+		ib_sge->lkey = isert_conn->conn_mr->lkey;
+
+		pr_debug("RDMA ib_sge: addr: 0x%16llx  length: %u lkey: %08x\n",
+			 ib_sge->addr, ib_sge->length, ib_sge->lkey);
+		page_off = 0;
+		data_left -= ib_sge->length;
+		ib_sge++;
+		pr_debug("Incrementing ib_sge pointer to %p\n", ib_sge);
+	}
+
+	pr_debug("Set outgoing sg_list: %p num_sg: %u from TCM SGLs\n",
+		 send_wr->sg_list, send_wr->num_sge);
+
+	return sg_nents;
+}
+
+static int
+isert_map_rdma(struct iscsi_conn *conn, struct iscsi_cmd *cmd,
+	       struct isert_rdma_wr *wr)
+{
+	struct se_cmd *se_cmd = &cmd->se_cmd;
+	struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd);
+	struct isert_conn *isert_conn = (struct isert_conn *)conn->context;
+	struct isert_data_buf *data = &wr->data;
+	struct ib_send_wr *send_wr;
+	struct ib_sge *ib_sge;
+	u32 offset, data_len, data_left, rdma_write_max, va_offset = 0;
+	int ret = 0, i, ib_sge_cnt;
+
+	isert_cmd->tx_desc.isert_cmd = isert_cmd;
+
+	offset = wr->iser_ib_op == ISER_IB_RDMA_READ ? cmd->write_data_done : 0;
+	ret = isert_map_data_buf(isert_conn, isert_cmd, se_cmd->t_data_sg,
+				 se_cmd->t_data_nents, se_cmd->data_length,
+				 offset, wr->iser_ib_op, &wr->data);
+	if (ret)
+		return ret;
+
+	data_left = data->len;
+	offset = data->offset;
+
+	ib_sge = kzalloc(sizeof(struct ib_sge) * data->nents, GFP_KERNEL);
+	if (!ib_sge) {
+		pr_warn("Unable to allocate ib_sge\n");
+		ret = -ENOMEM;
+		goto unmap_cmd;
+	}
+	wr->ib_sge = ib_sge;
+
+	wr->send_wr_num = DIV_ROUND_UP(data->nents, isert_conn->max_sge);
+	wr->send_wr = kzalloc(sizeof(struct ib_send_wr) * wr->send_wr_num,
+				GFP_KERNEL);
+	if (!wr->send_wr) {
+		pr_debug("Unable to allocate wr->send_wr\n");
+		ret = -ENOMEM;
+		goto unmap_cmd;
+	}
+
+	wr->isert_cmd = isert_cmd;
+	rdma_write_max = isert_conn->max_sge * PAGE_SIZE;
+
+	for (i = 0; i < wr->send_wr_num; i++) {
+		send_wr = &isert_cmd->rdma_wr.send_wr[i];
+		data_len = min(data_left, rdma_write_max);
+
+		send_wr->send_flags = 0;
+		if (wr->iser_ib_op == ISER_IB_RDMA_WRITE) {
+			send_wr->opcode = IB_WR_RDMA_WRITE;
+			send_wr->wr.rdma.remote_addr = isert_cmd->read_va + offset;
+			send_wr->wr.rdma.rkey = isert_cmd->read_stag;
+			if (i + 1 == wr->send_wr_num)
+				send_wr->next = &isert_cmd->tx_desc.send_wr;
+			else
+				send_wr->next = &wr->send_wr[i + 1];
+		} else {
+			send_wr->opcode = IB_WR_RDMA_READ;
+			send_wr->wr.rdma.remote_addr = isert_cmd->write_va + va_offset;
+			send_wr->wr.rdma.rkey = isert_cmd->write_stag;
+			if (i + 1 == wr->send_wr_num)
+				send_wr->send_flags = IB_SEND_SIGNALED;
+			else
+				send_wr->next = &wr->send_wr[i + 1];
+		}
+
+		ib_sge_cnt = isert_build_rdma_wr(isert_conn, isert_cmd, ib_sge,
+					send_wr, data_len, offset);
+		ib_sge += ib_sge_cnt;
+
+		offset += data_len;
+		va_offset += data_len;
+		data_left -= data_len;
+	}
+
+	return 0;
+unmap_cmd:
+	isert_unmap_data_buf(isert_conn, data);
+
+	return ret;
+}
+
+static int
+isert_map_fr_pagelist(struct ib_device *ib_dev,
+		      struct scatterlist *sg_start, int sg_nents, u64 *fr_pl)
+{
+	u64 start_addr, end_addr, page, chunk_start = 0;
+	struct scatterlist *tmp_sg;
+	int i = 0, new_chunk, last_ent, n_pages;
+
+	n_pages = 0;
+	new_chunk = 1;
+	last_ent = sg_nents - 1;
+	for_each_sg(sg_start, tmp_sg, sg_nents, i) {
+		start_addr = ib_sg_dma_address(ib_dev, tmp_sg);
+		if (new_chunk)
+			chunk_start = start_addr;
+		end_addr = start_addr + ib_sg_dma_len(ib_dev, tmp_sg);
+
+		pr_debug("SGL[%d] dma_addr: 0x%16llx len: %u\n",
+			 i, (unsigned long long)tmp_sg->dma_address,
+			 tmp_sg->length);
+
+		if ((end_addr & ~PAGE_MASK) && i < last_ent) {
+			new_chunk = 0;
+			continue;
+		}
+		new_chunk = 1;
+
+		page = chunk_start & PAGE_MASK;
+		do {
+			fr_pl[n_pages++] = page;
+			pr_debug("Mapped page_list[%d] page_addr: 0x%16llx\n",
+				 n_pages - 1, page);
+			page += PAGE_SIZE;
+		} while (page < end_addr);
+	}
+
+	return n_pages;
+}
+
+static int
+isert_fast_reg_mr(struct isert_conn *isert_conn,
+		  struct fast_reg_descriptor *fr_desc,
+		  struct isert_data_buf *mem,
+		  enum isert_indicator ind,
+		  struct ib_sge *sge)
+{
+	struct ib_device *ib_dev = isert_conn->conn_cm_id->device;
+	struct ib_mr *mr;
+	struct ib_fast_reg_page_list *frpl;
+	struct ib_send_wr fr_wr, inv_wr;
+	struct ib_send_wr *bad_wr, *wr = NULL;
+	int ret, pagelist_len;
+	u32 page_off;
+	u8 key;
+
+	if (mem->dma_nents == 1) {
+		sge->lkey = isert_conn->conn_mr->lkey;
+		sge->addr = ib_sg_dma_address(ib_dev, &mem->sg[0]);
+		sge->length = ib_sg_dma_len(ib_dev, &mem->sg[0]);
+		pr_debug("%s:%d sge: addr: 0x%llx  length: %u lkey: %x\n",
+			 __func__, __LINE__, sge->addr, sge->length,
+			 sge->lkey);
+		return 0;
+	}
+
+	if (ind == ISERT_DATA_KEY_VALID) {
+		/* Registering data buffer */
+		mr = fr_desc->data_mr;
+		frpl = fr_desc->data_frpl;
+	} else {
+		/* Registering protection buffer */
+		mr = fr_desc->pi_ctx->prot_mr;
+		frpl = fr_desc->pi_ctx->prot_frpl;
+	}
+
+	page_off = mem->offset % PAGE_SIZE;
+
+	pr_debug("Use fr_desc %p sg_nents %d offset %u\n",
+		 fr_desc, mem->nents, mem->offset);
+
+	pagelist_len = isert_map_fr_pagelist(ib_dev, mem->sg, mem->nents,
+					     &frpl->page_list[0]);
+
+	if (!(fr_desc->ind & ISERT_DATA_KEY_VALID)) {
+		memset(&inv_wr, 0, sizeof(inv_wr));
+		inv_wr.wr_id = ISER_FASTREG_LI_WRID;
+		inv_wr.opcode = IB_WR_LOCAL_INV;
+		inv_wr.ex.invalidate_rkey = mr->rkey;
+		wr = &inv_wr;
+		/* Bump the key */
+		key = (u8)(mr->rkey & 0x000000FF);
+		ib_update_fast_reg_key(mr, ++key);
+	}
+
+	/* Prepare FASTREG WR */
+	memset(&fr_wr, 0, sizeof(fr_wr));
+	fr_wr.wr_id = ISER_FASTREG_LI_WRID;
+	fr_wr.opcode = IB_WR_FAST_REG_MR;
+	fr_wr.wr.fast_reg.iova_start = frpl->page_list[0] + page_off;
+	fr_wr.wr.fast_reg.page_list = frpl;
+	fr_wr.wr.fast_reg.page_list_len = pagelist_len;
+	fr_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
+	fr_wr.wr.fast_reg.length = mem->len;
+	fr_wr.wr.fast_reg.rkey = mr->rkey;
+	fr_wr.wr.fast_reg.access_flags = IB_ACCESS_LOCAL_WRITE;
+
+	if (!wr)
+		wr = &fr_wr;
+	else
+		wr->next = &fr_wr;
+
+	ret = ib_post_send(isert_conn->conn_qp, wr, &bad_wr);
+	if (ret) {
+		pr_err("fast registration failed, ret:%d\n", ret);
+		return ret;
+	}
+	fr_desc->ind &= ~ind;
+
+	sge->lkey = mr->lkey;
+	sge->addr = frpl->page_list[0] + page_off;
+	sge->length = mem->len;
+
+	pr_debug("%s:%d sge: addr: 0x%llx  length: %u lkey: %x\n",
+		 __func__, __LINE__, sge->addr, sge->length,
+		 sge->lkey);
+
+	return ret;
+}
+
+static inline void
+isert_set_dif_domain(struct se_cmd *se_cmd, struct ib_sig_attrs *sig_attrs,
+		     struct ib_sig_domain *domain)
+{
+	domain->sig_type = IB_SIG_TYPE_T10_DIF;
+	domain->sig.dif.bg_type = IB_T10DIF_CRC;
+	domain->sig.dif.pi_interval = se_cmd->se_dev->dev_attrib.block_size;
+	domain->sig.dif.ref_tag = se_cmd->reftag_seed;
+	/*
+	 * At the moment we hard code those, but if in the future
+	 * the target core would like to use it, we will take it
+	 * from se_cmd.
+	 */
+	domain->sig.dif.apptag_check_mask = 0xffff;
+	domain->sig.dif.app_escape = true;
+	domain->sig.dif.ref_escape = true;
+	if (se_cmd->prot_type == TARGET_DIF_TYPE1_PROT ||
+	    se_cmd->prot_type == TARGET_DIF_TYPE2_PROT)
+		domain->sig.dif.ref_remap = true;
+};
+
+static int
+isert_set_sig_attrs(struct se_cmd *se_cmd, struct ib_sig_attrs *sig_attrs)
+{
+	switch (se_cmd->prot_op) {
+	case TARGET_PROT_DIN_INSERT:
+	case TARGET_PROT_DOUT_STRIP:
+		sig_attrs->mem.sig_type = IB_SIG_TYPE_NONE;
+		isert_set_dif_domain(se_cmd, sig_attrs, &sig_attrs->wire);
+		break;
+	case TARGET_PROT_DOUT_INSERT:
+	case TARGET_PROT_DIN_STRIP:
+		sig_attrs->wire.sig_type = IB_SIG_TYPE_NONE;
+		isert_set_dif_domain(se_cmd, sig_attrs, &sig_attrs->mem);
+		break;
+	case TARGET_PROT_DIN_PASS:
+	case TARGET_PROT_DOUT_PASS:
+		isert_set_dif_domain(se_cmd, sig_attrs, &sig_attrs->wire);
+		isert_set_dif_domain(se_cmd, sig_attrs, &sig_attrs->mem);
+		break;
+	default:
+		pr_err("Unsupported PI operation %d\n", se_cmd->prot_op);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static inline u8
+isert_set_prot_checks(u8 prot_checks)
+{
+	return (prot_checks & TARGET_DIF_CHECK_GUARD  ? 0xc0 : 0) |
+	       (prot_checks & TARGET_DIF_CHECK_REFTAG ? 0x30 : 0) |
+	       (prot_checks & TARGET_DIF_CHECK_REFTAG ? 0x0f : 0);
+}
+
+static int
+isert_reg_sig_mr(struct isert_conn *isert_conn, struct se_cmd *se_cmd,
+		 struct fast_reg_descriptor *fr_desc,
+		 struct ib_sge *data_sge, struct ib_sge *prot_sge,
+		 struct ib_sge *sig_sge)
+{
+	struct ib_send_wr sig_wr, inv_wr;
+	struct ib_send_wr *bad_wr, *wr = NULL;
+	struct pi_context *pi_ctx = fr_desc->pi_ctx;
+	struct ib_sig_attrs sig_attrs;
+	int ret;
+	u32 key;
+
+	memset(&sig_attrs, 0, sizeof(sig_attrs));
+	ret = isert_set_sig_attrs(se_cmd, &sig_attrs);
+	if (ret)
+		goto err;
+
+	sig_attrs.check_mask = isert_set_prot_checks(se_cmd->prot_checks);
+
+	if (!(fr_desc->ind & ISERT_SIG_KEY_VALID)) {
+		memset(&inv_wr, 0, sizeof(inv_wr));
+		inv_wr.opcode = IB_WR_LOCAL_INV;
+		inv_wr.wr_id = ISER_FASTREG_LI_WRID;
+		inv_wr.ex.invalidate_rkey = pi_ctx->sig_mr->rkey;
+		wr = &inv_wr;
+		/* Bump the key */
+		key = (u8)(pi_ctx->sig_mr->rkey & 0x000000FF);
+		ib_update_fast_reg_key(pi_ctx->sig_mr, ++key);
+	}
+
+	memset(&sig_wr, 0, sizeof(sig_wr));
+	sig_wr.opcode = IB_WR_REG_SIG_MR;
+	sig_wr.wr_id = ISER_FASTREG_LI_WRID;
+	sig_wr.sg_list = data_sge;
+	sig_wr.num_sge = 1;
+	sig_wr.wr.sig_handover.access_flags = IB_ACCESS_LOCAL_WRITE;
+	sig_wr.wr.sig_handover.sig_attrs = &sig_attrs;
+	sig_wr.wr.sig_handover.sig_mr = pi_ctx->sig_mr;
+	if (se_cmd->t_prot_sg)
+		sig_wr.wr.sig_handover.prot = prot_sge;
+
+	if (!wr)
+		wr = &sig_wr;
+	else
+		wr->next = &sig_wr;
+
+	ret = ib_post_send(isert_conn->conn_qp, wr, &bad_wr);
+	if (ret) {
+		pr_err("fast registration failed, ret:%d\n", ret);
+		goto err;
+	}
+	fr_desc->ind &= ~ISERT_SIG_KEY_VALID;
+
+	sig_sge->lkey = pi_ctx->sig_mr->lkey;
+	sig_sge->addr = 0;
+	sig_sge->length = se_cmd->data_length;
+	if (se_cmd->prot_op != TARGET_PROT_DIN_STRIP &&
+	    se_cmd->prot_op != TARGET_PROT_DOUT_INSERT)
+		/*
+		 * We have protection guards on the wire
+		 * so we need to set a larget transfer
+		 */
+		sig_sge->length += se_cmd->prot_length;
+
+	pr_debug("sig_sge: addr: 0x%llx  length: %u lkey: %x\n",
+		 sig_sge->addr, sig_sge->length,
+		 sig_sge->lkey);
+err:
+	return ret;
+}
+
+static int
+isert_reg_rdma(struct iscsi_conn *conn, struct iscsi_cmd *cmd,
+	       struct isert_rdma_wr *wr)
+{
+	struct se_cmd *se_cmd = &cmd->se_cmd;
+	struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd);
+	struct isert_conn *isert_conn = conn->context;
+	struct ib_sge data_sge;
+	struct ib_send_wr *send_wr;
+	struct fast_reg_descriptor *fr_desc = NULL;
+	u32 offset;
+	int ret = 0;
+	unsigned long flags;
+
+	isert_cmd->tx_desc.isert_cmd = isert_cmd;
+
+	offset = wr->iser_ib_op == ISER_IB_RDMA_READ ? cmd->write_data_done : 0;
+	ret = isert_map_data_buf(isert_conn, isert_cmd, se_cmd->t_data_sg,
+				 se_cmd->t_data_nents, se_cmd->data_length,
+				 offset, wr->iser_ib_op, &wr->data);
+	if (ret)
+		return ret;
+
+	if (wr->data.dma_nents != 1 ||
+	    se_cmd->prot_op != TARGET_PROT_NORMAL) {
+		spin_lock_irqsave(&isert_conn->conn_lock, flags);
+		fr_desc = list_first_entry(&isert_conn->conn_fr_pool,
+					   struct fast_reg_descriptor, list);
+		list_del(&fr_desc->list);
+		spin_unlock_irqrestore(&isert_conn->conn_lock, flags);
+		wr->fr_desc = fr_desc;
+	}
+
+	ret = isert_fast_reg_mr(isert_conn, fr_desc, &wr->data,
+				ISERT_DATA_KEY_VALID, &data_sge);
+	if (ret)
+		goto unmap_cmd;
+
+	if (se_cmd->prot_op != TARGET_PROT_NORMAL) {
+		struct ib_sge prot_sge, sig_sge;
+
+		if (se_cmd->t_prot_sg) {
+			ret = isert_map_data_buf(isert_conn, isert_cmd,
+						 se_cmd->t_prot_sg,
+						 se_cmd->t_prot_nents,
+						 se_cmd->prot_length,
+						 0, wr->iser_ib_op, &wr->prot);
+			if (ret)
+				goto unmap_cmd;
+
+			ret = isert_fast_reg_mr(isert_conn, fr_desc, &wr->prot,
+						ISERT_PROT_KEY_VALID, &prot_sge);
+			if (ret)
+				goto unmap_prot_cmd;
+		}
+
+		ret = isert_reg_sig_mr(isert_conn, se_cmd, fr_desc,
+				       &data_sge, &prot_sge, &sig_sge);
+		if (ret)
+			goto unmap_prot_cmd;
+
+		fr_desc->ind |= ISERT_PROTECTED;
+		memcpy(&wr->s_ib_sge, &sig_sge, sizeof(sig_sge));
+	} else
+		memcpy(&wr->s_ib_sge, &data_sge, sizeof(data_sge));
+
+	wr->ib_sge = &wr->s_ib_sge;
+	wr->send_wr_num = 1;
+	memset(&wr->s_send_wr, 0, sizeof(*send_wr));
+	wr->send_wr = &wr->s_send_wr;
+	wr->isert_cmd = isert_cmd;
+
+	send_wr = &isert_cmd->rdma_wr.s_send_wr;
+	send_wr->sg_list = &wr->s_ib_sge;
+	send_wr->num_sge = 1;
+	send_wr->wr_id = (unsigned long)&isert_cmd->tx_desc;
+	if (wr->iser_ib_op == ISER_IB_RDMA_WRITE) {
+		send_wr->opcode = IB_WR_RDMA_WRITE;
+		send_wr->wr.rdma.remote_addr = isert_cmd->read_va;
+		send_wr->wr.rdma.rkey = isert_cmd->read_stag;
+		send_wr->send_flags = se_cmd->prot_op == TARGET_PROT_NORMAL ?
+				      0 : IB_SEND_SIGNALED;
+	} else {
+		send_wr->opcode = IB_WR_RDMA_READ;
+		send_wr->wr.rdma.remote_addr = isert_cmd->write_va;
+		send_wr->wr.rdma.rkey = isert_cmd->write_stag;
+		send_wr->send_flags = IB_SEND_SIGNALED;
+	}
+
+	return 0;
+unmap_prot_cmd:
+	if (se_cmd->t_prot_sg)
+		isert_unmap_data_buf(isert_conn, &wr->prot);
+unmap_cmd:
+	if (fr_desc) {
+		spin_lock_irqsave(&isert_conn->conn_lock, flags);
+		list_add_tail(&fr_desc->list, &isert_conn->conn_fr_pool);
+		spin_unlock_irqrestore(&isert_conn->conn_lock, flags);
+	}
+	isert_unmap_data_buf(isert_conn, &wr->data);
+
+	return ret;
+}
+
+static int
+isert_put_datain(struct iscsi_conn *conn, struct iscsi_cmd *cmd)
+{
+	struct se_cmd *se_cmd = &cmd->se_cmd;
+	struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd);
+	struct isert_rdma_wr *wr = &isert_cmd->rdma_wr;
+	struct isert_conn *isert_conn = (struct isert_conn *)conn->context;
+	struct isert_device *device = isert_conn->conn_device;
+	struct ib_send_wr *wr_failed;
+	int rc;
+
+	pr_debug("Cmd: %p RDMA_WRITE data_length: %u\n",
+		 isert_cmd, se_cmd->data_length);
+	wr->iser_ib_op = ISER_IB_RDMA_WRITE;
+	rc = device->reg_rdma_mem(conn, cmd, wr);
+	if (rc) {
+		pr_err("Cmd: %p failed to prepare RDMA res\n", isert_cmd);
+		return rc;
+	}
+
+	if (se_cmd->prot_op == TARGET_PROT_NORMAL) {
+		/*
+		 * Build isert_conn->tx_desc for iSCSI response PDU and attach
+		 */
+		isert_create_send_desc(isert_conn, isert_cmd,
+				       &isert_cmd->tx_desc);
+		iscsit_build_rsp_pdu(cmd, conn, true, (struct iscsi_scsi_rsp *)
+				     &isert_cmd->tx_desc.iscsi_header);
+		isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc);
+		isert_init_send_wr(isert_conn, isert_cmd,
+				   &isert_cmd->tx_desc.send_wr, false);
+		isert_cmd->rdma_wr.s_send_wr.next = &isert_cmd->tx_desc.send_wr;
+		wr->send_wr_num += 1;
+	}
+
+	atomic_add(wr->send_wr_num, &isert_conn->post_send_buf_count);
+
+	rc = ib_post_send(isert_conn->conn_qp, wr->send_wr, &wr_failed);
+	if (rc) {
+		pr_warn("ib_post_send() failed for IB_WR_RDMA_WRITE\n");
+		atomic_sub(wr->send_wr_num, &isert_conn->post_send_buf_count);
+	}
+
+	if (se_cmd->prot_op == TARGET_PROT_NORMAL)
+		pr_debug("Cmd: %p posted RDMA_WRITE + Response for iSER Data "
+			 "READ\n", isert_cmd);
+	else
+		pr_debug("Cmd: %p posted RDMA_WRITE for iSER Data READ\n",
+			 isert_cmd);
+
+	return 1;
+}
+
+static int
+isert_get_dataout(struct iscsi_conn *conn, struct iscsi_cmd *cmd, bool recovery)
+{
+	struct se_cmd *se_cmd = &cmd->se_cmd;
+	struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd);
+	struct isert_rdma_wr *wr = &isert_cmd->rdma_wr;
+	struct isert_conn *isert_conn = (struct isert_conn *)conn->context;
+	struct isert_device *device = isert_conn->conn_device;
+	struct ib_send_wr *wr_failed;
+	int rc;
+
+	pr_debug("Cmd: %p RDMA_READ data_length: %u write_data_done: %u\n",
+		 isert_cmd, se_cmd->data_length, cmd->write_data_done);
+	wr->iser_ib_op = ISER_IB_RDMA_READ;
+	rc = device->reg_rdma_mem(conn, cmd, wr);
+	if (rc) {
+		pr_err("Cmd: %p failed to prepare RDMA res\n", isert_cmd);
+		return rc;
+	}
+
+	atomic_add(wr->send_wr_num, &isert_conn->post_send_buf_count);
+
+	rc = ib_post_send(isert_conn->conn_qp, wr->send_wr, &wr_failed);
+	if (rc) {
+		pr_warn("ib_post_send() failed for IB_WR_RDMA_READ\n");
+		atomic_sub(wr->send_wr_num, &isert_conn->post_send_buf_count);
+	}
+	pr_debug("Cmd: %p posted RDMA_READ memory for ISER Data WRITE\n",
+		 isert_cmd);
+
+	return 0;
+}
+
+static int
+isert_immediate_queue(struct iscsi_conn *conn, struct iscsi_cmd *cmd, int state)
+{
+	int ret;
+
+	switch (state) {
+	case ISTATE_SEND_NOPIN_WANT_RESPONSE:
+		ret = isert_put_nopin(cmd, conn, false);
+		break;
+	default:
+		pr_err("Unknown immediate state: 0x%02x\n", state);
+		ret = -EINVAL;
+		break;
+	}
+
+	return ret;
+}
+
+static int
+isert_response_queue(struct iscsi_conn *conn, struct iscsi_cmd *cmd, int state)
+{
+	int ret;
+
+	switch (state) {
+	case ISTATE_SEND_LOGOUTRSP:
+		ret = isert_put_logout_rsp(cmd, conn);
+		if (!ret) {
+			pr_debug("Returning iSER Logout -EAGAIN\n");
+			ret = -EAGAIN;
+		}
+		break;
+	case ISTATE_SEND_NOPIN:
+		ret = isert_put_nopin(cmd, conn, true);
+		break;
+	case ISTATE_SEND_TASKMGTRSP:
+		ret = isert_put_tm_rsp(cmd, conn);
+		break;
+	case ISTATE_SEND_REJECT:
+		ret = isert_put_reject(cmd, conn);
+		break;
+	case ISTATE_SEND_TEXTRSP:
+		ret = isert_put_text_rsp(cmd, conn);
+		break;
+	case ISTATE_SEND_STATUS:
+		/*
+		 * Special case for sending non GOOD SCSI status from TX thread
+		 * context during pre se_cmd excecution failure.
+		 */
+		ret = isert_put_response(conn, cmd);
+		break;
+	default:
+		pr_err("Unknown response state: 0x%02x\n", state);
+		ret = -EINVAL;
+		break;
+	}
+
+	return ret;
+}
+
+static int
+isert_setup_np(struct iscsi_np *np,
+	       struct __kernel_sockaddr_storage *ksockaddr)
+{
+	struct isert_np *isert_np;
+	struct rdma_cm_id *isert_lid;
+	struct sockaddr *sa;
+	int ret;
+
+	isert_np = kzalloc(sizeof(struct isert_np), GFP_KERNEL);
+	if (!isert_np) {
+		pr_err("Unable to allocate struct isert_np\n");
+		return -ENOMEM;
+	}
+	sema_init(&isert_np->np_sem, 0);
+	mutex_init(&isert_np->np_accept_mutex);
+	INIT_LIST_HEAD(&isert_np->np_accept_list);
+	init_completion(&isert_np->np_login_comp);
+
+	sa = (struct sockaddr *)ksockaddr;
+	pr_debug("ksockaddr: %p, sa: %p\n", ksockaddr, sa);
+	/*
+	 * Setup the np->np_sockaddr from the passed sockaddr setup
+	 * in iscsi_target_configfs.c code..
+	 */
+	memcpy(&np->np_sockaddr, ksockaddr,
+	       sizeof(struct __kernel_sockaddr_storage));
+
+	isert_lid = rdma_create_id(isert_cma_handler, np, RDMA_PS_TCP,
+				IB_QPT_RC);
+	if (IS_ERR(isert_lid)) {
+		pr_err("rdma_create_id() for isert_listen_handler failed: %ld\n",
+		       PTR_ERR(isert_lid));
+		ret = PTR_ERR(isert_lid);
+		goto out;
+	}
+
+	ret = rdma_bind_addr(isert_lid, sa);
+	if (ret) {
+		pr_err("rdma_bind_addr() for isert_lid failed: %d\n", ret);
+		goto out_lid;
+	}
+
+	ret = rdma_listen(isert_lid, ISERT_RDMA_LISTEN_BACKLOG);
+	if (ret) {
+		pr_err("rdma_listen() for isert_lid failed: %d\n", ret);
+		goto out_lid;
+	}
+
+	isert_np->np_cm_id = isert_lid;
+	np->np_context = isert_np;
+	pr_debug("Setup isert_lid->context: %p\n", isert_lid->context);
+
+	return 0;
+
+out_lid:
+	rdma_destroy_id(isert_lid);
+out:
+	kfree(isert_np);
+	return ret;
+}
+
+static int
+isert_rdma_accept(struct isert_conn *isert_conn)
+{
+	struct rdma_cm_id *cm_id = isert_conn->conn_cm_id;
+	struct rdma_conn_param cp;
+	int ret;
+
+	memset(&cp, 0, sizeof(struct rdma_conn_param));
+	cp.initiator_depth = isert_conn->initiator_depth;
+	cp.retry_count = 7;
+	cp.rnr_retry_count = 7;
+
+	pr_debug("Before rdma_accept >>>>>>>>>>>>>>>>>>>>.\n");
+
+	ret = rdma_accept(cm_id, &cp);
+	if (ret) {
+		pr_err("rdma_accept() failed with: %d\n", ret);
+		return ret;
+	}
+
+	pr_debug("After rdma_accept >>>>>>>>>>>>>>>>>>>>>.\n");
+
+	return 0;
+}
+
+static int
+isert_get_login_rx(struct iscsi_conn *conn, struct iscsi_login *login)
+{
+	struct isert_conn *isert_conn = (struct isert_conn *)conn->context;
+	int ret;
+
+	pr_debug("isert_get_login_rx before conn_login_comp conn: %p\n", conn);
+	/*
+	 * For login requests after the first PDU, isert_rx_login_req() will
+	 * kick schedule_delayed_work(&conn->login_work) as the packet is
+	 * received, which turns this callback from iscsi_target_do_login_rx()
+	 * into a NOP.
+	 */
+	if (!login->first_request)
+		return 0;
+
+	ret = wait_for_completion_interruptible(&isert_conn->conn_login_comp);
+	if (ret)
+		return ret;
+
+	pr_debug("isert_get_login_rx processing login->req: %p\n", login->req);
+	return 0;
+}
+
+static void
+isert_set_conn_info(struct iscsi_np *np, struct iscsi_conn *conn,
+		    struct isert_conn *isert_conn)
+{
+	struct rdma_cm_id *cm_id = isert_conn->conn_cm_id;
+	struct rdma_route *cm_route = &cm_id->route;
+	struct sockaddr_in *sock_in;
+	struct sockaddr_in6 *sock_in6;
+
+	conn->login_family = np->np_sockaddr.ss_family;
+
+	if (np->np_sockaddr.ss_family == AF_INET6) {
+		sock_in6 = (struct sockaddr_in6 *)&cm_route->addr.dst_addr;
+		snprintf(conn->login_ip, sizeof(conn->login_ip), "%pI6c",
+			 &sock_in6->sin6_addr.in6_u);
+		conn->login_port = ntohs(sock_in6->sin6_port);
+
+		sock_in6 = (struct sockaddr_in6 *)&cm_route->addr.src_addr;
+		snprintf(conn->local_ip, sizeof(conn->local_ip), "%pI6c",
+			 &sock_in6->sin6_addr.in6_u);
+		conn->local_port = ntohs(sock_in6->sin6_port);
+	} else {
+		sock_in = (struct sockaddr_in *)&cm_route->addr.dst_addr;
+		sprintf(conn->login_ip, "%pI4",
+			&sock_in->sin_addr.s_addr);
+		conn->login_port = ntohs(sock_in->sin_port);
+
+		sock_in = (struct sockaddr_in *)&cm_route->addr.src_addr;
+		sprintf(conn->local_ip, "%pI4",
+			&sock_in->sin_addr.s_addr);
+		conn->local_port = ntohs(sock_in->sin_port);
+	}
+}
+
+static int
+isert_accept_np(struct iscsi_np *np, struct iscsi_conn *conn)
+{
+	struct isert_np *isert_np = (struct isert_np *)np->np_context;
+	struct isert_conn *isert_conn;
+	int max_accept = 0, ret;
+
+accept_wait:
+	ret = down_interruptible(&isert_np->np_sem);
+	if (ret || max_accept > 5)
+		return -ENODEV;
+
+	spin_lock_bh(&np->np_thread_lock);
+	if (np->np_thread_state >= ISCSI_NP_THREAD_RESET) {
+		spin_unlock_bh(&np->np_thread_lock);
+		pr_debug("np_thread_state %d for isert_accept_np\n",
+			 np->np_thread_state);
+		/**
+		 * No point in stalling here when np_thread
+		 * is in state RESET/SHUTDOWN/EXIT - bail
+		 **/
+		return -ENODEV;
+	}
+	spin_unlock_bh(&np->np_thread_lock);
+
+	mutex_lock(&isert_np->np_accept_mutex);
+	if (list_empty(&isert_np->np_accept_list)) {
+		mutex_unlock(&isert_np->np_accept_mutex);
+		max_accept++;
+		goto accept_wait;
+	}
+	isert_conn = list_first_entry(&isert_np->np_accept_list,
+			struct isert_conn, conn_accept_node);
+	list_del_init(&isert_conn->conn_accept_node);
+	mutex_unlock(&isert_np->np_accept_mutex);
+
+	conn->context = isert_conn;
+	isert_conn->conn = conn;
+	max_accept = 0;
+
+	ret = isert_rdma_post_recvl(isert_conn);
+	if (ret)
+		return ret;
+
+	ret = isert_rdma_accept(isert_conn);
+	if (ret)
+		return ret;
+
+	isert_set_conn_info(np, conn, isert_conn);
+
+	pr_debug("Processing isert_accept_np: isert_conn: %p\n", isert_conn);
+	return 0;
+}
+
+static void
+isert_free_np(struct iscsi_np *np)
+{
+	struct isert_np *isert_np = (struct isert_np *)np->np_context;
+
+	if (isert_np->np_cm_id)
+		rdma_destroy_id(isert_np->np_cm_id);
+
+	np->np_context = NULL;
+	kfree(isert_np);
+}
+
+static void isert_wait_conn(struct iscsi_conn *conn)
+{
+	struct isert_conn *isert_conn = conn->context;
+
+	pr_debug("isert_wait_conn: Starting \n");
+
+	mutex_lock(&isert_conn->conn_mutex);
+	if (isert_conn->conn_cm_id && !isert_conn->disconnect) {
+		pr_debug("Calling rdma_disconnect from isert_wait_conn\n");
+		rdma_disconnect(isert_conn->conn_cm_id);
+	}
+	/*
+	 * Only wait for conn_wait_comp_err if the isert_conn made it
+	 * into full feature phase..
+	 */
+	if (isert_conn->state == ISER_CONN_INIT) {
+		mutex_unlock(&isert_conn->conn_mutex);
+		return;
+	}
+	if (isert_conn->state == ISER_CONN_UP)
+		isert_conn->state = ISER_CONN_TERMINATING;
+	mutex_unlock(&isert_conn->conn_mutex);
+
+	wait_for_completion(&isert_conn->conn_wait_comp_err);
+
+	wait_for_completion(&isert_conn->conn_wait);
+	isert_put_conn(isert_conn);
+}
+
+static void isert_free_conn(struct iscsi_conn *conn)
+{
+	struct isert_conn *isert_conn = conn->context;
+
+	isert_put_conn(isert_conn);
+}
+
+static struct iscsit_transport iser_target_transport = {
+	.name			= "IB/iSER",
+	.transport_type		= ISCSI_INFINIBAND,
+	.priv_size		= sizeof(struct isert_cmd),
+	.owner			= THIS_MODULE,
+	.iscsit_setup_np	= isert_setup_np,
+	.iscsit_accept_np	= isert_accept_np,
+	.iscsit_free_np		= isert_free_np,
+	.iscsit_wait_conn	= isert_wait_conn,
+	.iscsit_free_conn	= isert_free_conn,
+	.iscsit_get_login_rx	= isert_get_login_rx,
+	.iscsit_put_login_tx	= isert_put_login_tx,
+	.iscsit_immediate_queue	= isert_immediate_queue,
+	.iscsit_response_queue	= isert_response_queue,
+	.iscsit_get_dataout	= isert_get_dataout,
+	.iscsit_queue_data_in	= isert_put_datain,
+	.iscsit_queue_status	= isert_put_response,
+	.iscsit_aborted_task	= isert_aborted_task,
+	.iscsit_get_sup_prot_ops = isert_get_sup_prot_ops,
+};
+
+static int __init isert_init(void)
+{
+	int ret;
+
+	isert_rx_wq = alloc_workqueue("isert_rx_wq", 0, 0);
+	if (!isert_rx_wq) {
+		pr_err("Unable to allocate isert_rx_wq\n");
+		return -ENOMEM;
+	}
+
+	isert_comp_wq = alloc_workqueue("isert_comp_wq", 0, 0);
+	if (!isert_comp_wq) {
+		pr_err("Unable to allocate isert_comp_wq\n");
+		ret = -ENOMEM;
+		goto destroy_rx_wq;
+	}
+
+	iscsit_register_transport(&iser_target_transport);
+	pr_debug("iSER_TARGET[0] - Loaded iser_target_transport\n");
+	return 0;
+
+destroy_rx_wq:
+	destroy_workqueue(isert_rx_wq);
+	return ret;
+}
+
+static void __exit isert_exit(void)
+{
+	flush_scheduled_work();
+	destroy_workqueue(isert_comp_wq);
+	destroy_workqueue(isert_rx_wq);
+	iscsit_unregister_transport(&iser_target_transport);
+	pr_debug("iSER_TARGET[0] - Released iser_target_transport\n");
+}
+
+MODULE_DESCRIPTION("iSER-Target for mainline target infrastructure");
+MODULE_VERSION("0.1");
+MODULE_AUTHOR("nab@Linux-iSCSI.org");
+MODULE_LICENSE("GPL");
+
+module_init(isert_init);
+module_exit(isert_exit);
diff --git a/drivers/infiniband/ulp/isert/ib_isert.h b/drivers/infiniband/ulp/isert/ib_isert.h
new file mode 100644
index 0000000..04f51f7
--- /dev/null
+++ b/drivers/infiniband/ulp/isert/ib_isert.h
@@ -0,0 +1,190 @@
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/rdma_cm.h>
+
+#define ISERT_RDMA_LISTEN_BACKLOG	10
+#define ISCSI_ISER_SG_TABLESIZE		256
+#define ISER_FASTREG_LI_WRID		0xffffffffffffffffULL
+
+enum isert_desc_type {
+	ISCSI_TX_CONTROL,
+	ISCSI_TX_DATAIN
+};
+
+enum iser_ib_op_code {
+	ISER_IB_RECV,
+	ISER_IB_SEND,
+	ISER_IB_RDMA_WRITE,
+	ISER_IB_RDMA_READ,
+};
+
+enum iser_conn_state {
+	ISER_CONN_INIT,
+	ISER_CONN_UP,
+	ISER_CONN_TERMINATING,
+	ISER_CONN_DOWN,
+};
+
+struct iser_rx_desc {
+	struct iser_hdr iser_header;
+	struct iscsi_hdr iscsi_header;
+	char		data[ISER_RECV_DATA_SEG_LEN];
+	u64		dma_addr;
+	struct ib_sge	rx_sg;
+	char		pad[ISER_RX_PAD_SIZE];
+} __packed;
+
+struct iser_tx_desc {
+	struct iser_hdr iser_header;
+	struct iscsi_hdr iscsi_header;
+	enum isert_desc_type type;
+	u64		dma_addr;
+	struct ib_sge	tx_sg[2];
+	int		num_sge;
+	struct isert_cmd *isert_cmd;
+	struct llist_node *comp_llnode_batch;
+	struct llist_node comp_llnode;
+	bool		llnode_active;
+	struct ib_send_wr send_wr;
+} __packed;
+
+enum isert_indicator {
+	ISERT_PROTECTED		= 1 << 0,
+	ISERT_DATA_KEY_VALID	= 1 << 1,
+	ISERT_PROT_KEY_VALID	= 1 << 2,
+	ISERT_SIG_KEY_VALID	= 1 << 3,
+};
+
+struct pi_context {
+	struct ib_mr		       *prot_mr;
+	struct ib_fast_reg_page_list   *prot_frpl;
+	struct ib_mr		       *sig_mr;
+};
+
+struct fast_reg_descriptor {
+	struct list_head		list;
+	struct ib_mr		       *data_mr;
+	struct ib_fast_reg_page_list   *data_frpl;
+	u8				ind;
+	struct pi_context	       *pi_ctx;
+};
+
+struct isert_data_buf {
+	struct scatterlist     *sg;
+	int			nents;
+	u32			sg_off;
+	u32			len; /* cur_rdma_length */
+	u32			offset;
+	unsigned int		dma_nents;
+	enum dma_data_direction dma_dir;
+};
+
+struct isert_rdma_wr {
+	struct list_head	wr_list;
+	struct isert_cmd	*isert_cmd;
+	enum iser_ib_op_code	iser_ib_op;
+	struct ib_sge		*ib_sge;
+	struct ib_sge		s_ib_sge;
+	int			send_wr_num;
+	struct ib_send_wr	*send_wr;
+	struct ib_send_wr	s_send_wr;
+	struct isert_data_buf	data;
+	struct isert_data_buf	prot;
+	struct fast_reg_descriptor *fr_desc;
+};
+
+struct isert_cmd {
+	uint32_t		read_stag;
+	uint32_t		write_stag;
+	uint64_t		read_va;
+	uint64_t		write_va;
+	u64			pdu_buf_dma;
+	u32			pdu_buf_len;
+	u32			read_va_off;
+	u32			write_va_off;
+	u32			rdma_wr_num;
+	struct isert_conn	*conn;
+	struct iscsi_cmd	*iscsi_cmd;
+	struct iser_tx_desc	tx_desc;
+	struct isert_rdma_wr	rdma_wr;
+	struct work_struct	comp_work;
+};
+
+struct isert_device;
+
+struct isert_conn {
+	enum iser_conn_state	state;
+	int			post_recv_buf_count;
+	atomic_t		post_send_buf_count;
+	u32			responder_resources;
+	u32			initiator_depth;
+	u32			max_sge;
+	char			*login_buf;
+	char			*login_req_buf;
+	char			*login_rsp_buf;
+	u64			login_req_dma;
+	u64			login_rsp_dma;
+	unsigned int		conn_rx_desc_head;
+	struct iser_rx_desc	*conn_rx_descs;
+	struct ib_recv_wr	conn_rx_wr[ISERT_MIN_POSTED_RX];
+	struct iscsi_conn	*conn;
+	struct list_head	conn_accept_node;
+	struct completion	conn_login_comp;
+	struct iser_tx_desc	conn_login_tx_desc;
+	struct rdma_cm_id	*conn_cm_id;
+	struct ib_pd		*conn_pd;
+	struct ib_mr		*conn_mr;
+	struct ib_qp		*conn_qp;
+	struct isert_device	*conn_device;
+	struct work_struct	conn_logout_work;
+	struct mutex		conn_mutex;
+	struct completion	conn_wait;
+	struct completion	conn_wait_comp_err;
+	struct kref		conn_kref;
+	struct list_head	conn_fr_pool;
+	int			conn_fr_pool_size;
+	/* lock to protect fastreg pool */
+	spinlock_t		conn_lock;
+#define ISERT_COMP_BATCH_COUNT	8
+	int			conn_comp_batch;
+	struct llist_head	conn_comp_llist;
+	bool                    disconnect;
+};
+
+#define ISERT_MAX_CQ 64
+
+struct isert_cq_desc {
+	struct isert_device	*device;
+	int			cq_index;
+	struct work_struct	cq_rx_work;
+	struct work_struct	cq_tx_work;
+};
+
+struct isert_device {
+	int			use_fastreg;
+	bool			pi_capable;
+	int			cqs_used;
+	int			refcount;
+	int			cq_active_qps[ISERT_MAX_CQ];
+	struct ib_device	*ib_device;
+	struct ib_cq		*dev_rx_cq[ISERT_MAX_CQ];
+	struct ib_cq		*dev_tx_cq[ISERT_MAX_CQ];
+	struct isert_cq_desc	*cq_desc;
+	struct list_head	dev_node;
+	struct ib_device_attr	dev_attr;
+	int			(*reg_rdma_mem)(struct iscsi_conn *conn,
+						    struct iscsi_cmd *cmd,
+						    struct isert_rdma_wr *wr);
+	void			(*unreg_rdma_mem)(struct isert_cmd *isert_cmd,
+						  struct isert_conn *isert_conn);
+};
+
+struct isert_np {
+	struct semaphore	np_sem;
+	struct rdma_cm_id	*np_cm_id;
+	struct mutex		np_accept_mutex;
+	struct list_head	np_accept_list;
+	struct completion	np_login_comp;
+};
diff --git a/drivers/infiniband/ulp/isert/isert_proto.h b/drivers/infiniband/ulp/isert/isert_proto.h
new file mode 100644
index 0000000..4dccd31
--- /dev/null
+++ b/drivers/infiniband/ulp/isert/isert_proto.h
@@ -0,0 +1,47 @@
+/* From iscsi_iser.h */
+
+struct iser_hdr {
+	u8	flags;
+	u8	rsvd[3];
+	__be32	write_stag; /* write rkey */
+	__be64	write_va;
+	__be32	read_stag;  /* read rkey */
+	__be64	read_va;
+} __packed;
+
+/*Constant PDU lengths calculations */
+#define ISER_HEADERS_LEN  (sizeof(struct iser_hdr) + sizeof(struct iscsi_hdr))
+
+#define ISER_RECV_DATA_SEG_LEN  8192
+#define ISER_RX_PAYLOAD_SIZE    (ISER_HEADERS_LEN + ISER_RECV_DATA_SEG_LEN)
+#define ISER_RX_LOGIN_SIZE      (ISER_HEADERS_LEN + ISCSI_DEF_MAX_RECV_SEG_LEN)
+
+/* QP settings */
+/* Maximal bounds on received asynchronous PDUs */
+#define ISERT_MAX_TX_MISC_PDUS	4 /* NOOP_IN(2) , ASYNC_EVENT(2)   */
+
+#define ISERT_MAX_RX_MISC_PDUS	6 /* NOOP_OUT(2), TEXT(1),         *
+				   * SCSI_TMFUNC(2), LOGOUT(1) */
+
+#define ISCSI_DEF_XMIT_CMDS_MAX 128 /* from libiscsi.h, must be power of 2 */
+
+#define ISERT_QP_MAX_RECV_DTOS	(ISCSI_DEF_XMIT_CMDS_MAX)
+
+#define ISERT_MIN_POSTED_RX	(ISCSI_DEF_XMIT_CMDS_MAX >> 2)
+
+#define ISERT_INFLIGHT_DATAOUTS	8
+
+#define ISERT_QP_MAX_REQ_DTOS	(ISCSI_DEF_XMIT_CMDS_MAX *    \
+				(1 + ISERT_INFLIGHT_DATAOUTS) + \
+				ISERT_MAX_TX_MISC_PDUS	+ \
+				ISERT_MAX_RX_MISC_PDUS)
+
+#define ISER_RX_PAD_SIZE	(ISER_RECV_DATA_SEG_LEN + 4096 - \
+		(ISER_RX_PAYLOAD_SIZE + sizeof(u64) + sizeof(struct ib_sge)))
+
+#define ISER_VER	0x10
+#define ISER_WSV	0x08
+#define ISER_RSV	0x04
+#define ISCSI_CTRL	0x10
+#define ISER_HELLO	0x20
+#define ISER_HELLORPLY	0x30
diff --git a/drivers/infiniband/ulp/srp/Kbuild b/drivers/infiniband/ulp/srp/Kbuild
new file mode 100644
index 0000000..a16c73c
--- /dev/null
+++ b/drivers/infiniband/ulp/srp/Kbuild
@@ -0,0 +1 @@
+obj-$(CONFIG_INFINIBAND_SRP)			+= ib_srp.o
diff --git a/drivers/infiniband/ulp/srp/Kconfig b/drivers/infiniband/ulp/srp/Kconfig
new file mode 100644
index 0000000..c74ee96
--- /dev/null
+++ b/drivers/infiniband/ulp/srp/Kconfig
@@ -0,0 +1,12 @@
+config INFINIBAND_SRP
+	tristate "InfiniBand SCSI RDMA Protocol"
+	depends on SCSI
+	select SCSI_SRP_ATTRS
+	---help---
+	  Support for the SCSI RDMA Protocol over InfiniBand.  This
+	  allows you to access storage devices that speak SRP over
+	  InfiniBand.
+
+	  The SRP protocol is defined by the INCITS T10 technical
+	  committee.  See <http://www.t10.org/>.
+
diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
new file mode 100644
index 0000000..62d2a18
--- /dev/null
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -0,0 +1,3373 @@
+/*
+ * Copyright (c) 2005 Cisco Systems.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/err.h>
+#include <linux/string.h>
+#include <linux/parser.h>
+#include <linux/random.h>
+#include <linux/jiffies.h>
+
+#include <linux/atomic.h>
+
+#include <scsi/scsi.h>
+#include <scsi/scsi_device.h>
+#include <scsi/scsi_dbg.h>
+#include <scsi/scsi_tcq.h>
+#include <scsi/srp.h>
+#include <scsi/scsi_transport_srp.h>
+
+#include "ib_srp.h"
+
+#define DRV_NAME	"ib_srp"
+#define PFX		DRV_NAME ": "
+#define DRV_VERSION	"1.0"
+#define DRV_RELDATE	"July 1, 2013"
+
+MODULE_AUTHOR("Roland Dreier");
+MODULE_DESCRIPTION("InfiniBand SCSI RDMA Protocol initiator "
+		   "v" DRV_VERSION " (" DRV_RELDATE ")");
+MODULE_LICENSE("Dual BSD/GPL");
+
+static unsigned int srp_sg_tablesize;
+static unsigned int cmd_sg_entries;
+static unsigned int indirect_sg_entries;
+static bool allow_ext_sg;
+static bool prefer_fr;
+static bool register_always;
+static int topspin_workarounds = 1;
+
+module_param(srp_sg_tablesize, uint, 0444);
+MODULE_PARM_DESC(srp_sg_tablesize, "Deprecated name for cmd_sg_entries");
+
+module_param(cmd_sg_entries, uint, 0444);
+MODULE_PARM_DESC(cmd_sg_entries,
+		 "Default number of gather/scatter entries in the SRP command (default is 12, max 255)");
+
+module_param(indirect_sg_entries, uint, 0444);
+MODULE_PARM_DESC(indirect_sg_entries,
+		 "Default max number of gather/scatter entries (default is 12, max is " __stringify(SCSI_MAX_SG_CHAIN_SEGMENTS) ")");
+
+module_param(allow_ext_sg, bool, 0444);
+MODULE_PARM_DESC(allow_ext_sg,
+		  "Default behavior when there are more than cmd_sg_entries S/G entries after mapping; fails the request when false (default false)");
+
+module_param(topspin_workarounds, int, 0444);
+MODULE_PARM_DESC(topspin_workarounds,
+		 "Enable workarounds for Topspin/Cisco SRP target bugs if != 0");
+
+module_param(prefer_fr, bool, 0444);
+MODULE_PARM_DESC(prefer_fr,
+"Whether to use fast registration if both FMR and fast registration are supported");
+
+module_param(register_always, bool, 0444);
+MODULE_PARM_DESC(register_always,
+		 "Use memory registration even for contiguous memory regions");
+
+static struct kernel_param_ops srp_tmo_ops;
+
+static int srp_reconnect_delay = 10;
+module_param_cb(reconnect_delay, &srp_tmo_ops, &srp_reconnect_delay,
+		S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(reconnect_delay, "Time between successive reconnect attempts");
+
+static int srp_fast_io_fail_tmo = 15;
+module_param_cb(fast_io_fail_tmo, &srp_tmo_ops, &srp_fast_io_fail_tmo,
+		S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(fast_io_fail_tmo,
+		 "Number of seconds between the observation of a transport"
+		 " layer error and failing all I/O. \"off\" means that this"
+		 " functionality is disabled.");
+
+static int srp_dev_loss_tmo = 600;
+module_param_cb(dev_loss_tmo, &srp_tmo_ops, &srp_dev_loss_tmo,
+		S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(dev_loss_tmo,
+		 "Maximum number of seconds that the SRP transport should"
+		 " insulate transport layer errors. After this time has been"
+		 " exceeded the SCSI host is removed. Should be"
+		 " between 1 and " __stringify(SCSI_DEVICE_BLOCK_MAX_TIMEOUT)
+		 " if fast_io_fail_tmo has not been set. \"off\" means that"
+		 " this functionality is disabled.");
+
+static void srp_add_one(struct ib_device *device);
+static void srp_remove_one(struct ib_device *device);
+static void srp_recv_completion(struct ib_cq *cq, void *target_ptr);
+static void srp_send_completion(struct ib_cq *cq, void *target_ptr);
+static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event);
+
+static struct scsi_transport_template *ib_srp_transport_template;
+static struct workqueue_struct *srp_remove_wq;
+
+static struct ib_client srp_client = {
+	.name   = "srp",
+	.add    = srp_add_one,
+	.remove = srp_remove_one
+};
+
+static struct ib_sa_client srp_sa_client;
+
+static int srp_tmo_get(char *buffer, const struct kernel_param *kp)
+{
+	int tmo = *(int *)kp->arg;
+
+	if (tmo >= 0)
+		return sprintf(buffer, "%d", tmo);
+	else
+		return sprintf(buffer, "off");
+}
+
+static int srp_tmo_set(const char *val, const struct kernel_param *kp)
+{
+	int tmo, res;
+
+	if (strncmp(val, "off", 3) != 0) {
+		res = kstrtoint(val, 0, &tmo);
+		if (res)
+			goto out;
+	} else {
+		tmo = -1;
+	}
+	if (kp->arg == &srp_reconnect_delay)
+		res = srp_tmo_valid(tmo, srp_fast_io_fail_tmo,
+				    srp_dev_loss_tmo);
+	else if (kp->arg == &srp_fast_io_fail_tmo)
+		res = srp_tmo_valid(srp_reconnect_delay, tmo, srp_dev_loss_tmo);
+	else
+		res = srp_tmo_valid(srp_reconnect_delay, srp_fast_io_fail_tmo,
+				    tmo);
+	if (res)
+		goto out;
+	*(int *)kp->arg = tmo;
+
+out:
+	return res;
+}
+
+static struct kernel_param_ops srp_tmo_ops = {
+	.get = srp_tmo_get,
+	.set = srp_tmo_set,
+};
+
+static inline struct srp_target_port *host_to_target(struct Scsi_Host *host)
+{
+	return (struct srp_target_port *) host->hostdata;
+}
+
+static const char *srp_target_info(struct Scsi_Host *host)
+{
+	return host_to_target(host)->target_name;
+}
+
+static int srp_target_is_topspin(struct srp_target_port *target)
+{
+	static const u8 topspin_oui[3] = { 0x00, 0x05, 0xad };
+	static const u8 cisco_oui[3]   = { 0x00, 0x1b, 0x0d };
+
+	return topspin_workarounds &&
+		(!memcmp(&target->ioc_guid, topspin_oui, sizeof topspin_oui) ||
+		 !memcmp(&target->ioc_guid, cisco_oui, sizeof cisco_oui));
+}
+
+static struct srp_iu *srp_alloc_iu(struct srp_host *host, size_t size,
+				   gfp_t gfp_mask,
+				   enum dma_data_direction direction)
+{
+	struct srp_iu *iu;
+
+	iu = kmalloc(sizeof *iu, gfp_mask);
+	if (!iu)
+		goto out;
+
+	iu->buf = kzalloc(size, gfp_mask);
+	if (!iu->buf)
+		goto out_free_iu;
+
+	iu->dma = ib_dma_map_single(host->srp_dev->dev, iu->buf, size,
+				    direction);
+	if (ib_dma_mapping_error(host->srp_dev->dev, iu->dma))
+		goto out_free_buf;
+
+	iu->size      = size;
+	iu->direction = direction;
+
+	return iu;
+
+out_free_buf:
+	kfree(iu->buf);
+out_free_iu:
+	kfree(iu);
+out:
+	return NULL;
+}
+
+static void srp_free_iu(struct srp_host *host, struct srp_iu *iu)
+{
+	if (!iu)
+		return;
+
+	ib_dma_unmap_single(host->srp_dev->dev, iu->dma, iu->size,
+			    iu->direction);
+	kfree(iu->buf);
+	kfree(iu);
+}
+
+static void srp_qp_event(struct ib_event *event, void *context)
+{
+	pr_debug("QP event %d\n", event->event);
+}
+
+static int srp_init_qp(struct srp_target_port *target,
+		       struct ib_qp *qp)
+{
+	struct ib_qp_attr *attr;
+	int ret;
+
+	attr = kmalloc(sizeof *attr, GFP_KERNEL);
+	if (!attr)
+		return -ENOMEM;
+
+	ret = ib_find_pkey(target->srp_host->srp_dev->dev,
+			   target->srp_host->port,
+			   be16_to_cpu(target->path.pkey),
+			   &attr->pkey_index);
+	if (ret)
+		goto out;
+
+	attr->qp_state        = IB_QPS_INIT;
+	attr->qp_access_flags = (IB_ACCESS_REMOTE_READ |
+				    IB_ACCESS_REMOTE_WRITE);
+	attr->port_num        = target->srp_host->port;
+
+	ret = ib_modify_qp(qp, attr,
+			   IB_QP_STATE		|
+			   IB_QP_PKEY_INDEX	|
+			   IB_QP_ACCESS_FLAGS	|
+			   IB_QP_PORT);
+
+out:
+	kfree(attr);
+	return ret;
+}
+
+static int srp_new_cm_id(struct srp_target_port *target)
+{
+	struct ib_cm_id *new_cm_id;
+
+	new_cm_id = ib_create_cm_id(target->srp_host->srp_dev->dev,
+				    srp_cm_handler, target);
+	if (IS_ERR(new_cm_id))
+		return PTR_ERR(new_cm_id);
+
+	if (target->cm_id)
+		ib_destroy_cm_id(target->cm_id);
+	target->cm_id = new_cm_id;
+
+	return 0;
+}
+
+static struct ib_fmr_pool *srp_alloc_fmr_pool(struct srp_target_port *target)
+{
+	struct srp_device *dev = target->srp_host->srp_dev;
+	struct ib_fmr_pool_param fmr_param;
+
+	memset(&fmr_param, 0, sizeof(fmr_param));
+	fmr_param.pool_size	    = target->scsi_host->can_queue;
+	fmr_param.dirty_watermark   = fmr_param.pool_size / 4;
+	fmr_param.cache		    = 1;
+	fmr_param.max_pages_per_fmr = dev->max_pages_per_mr;
+	fmr_param.page_shift	    = ilog2(dev->mr_page_size);
+	fmr_param.access	    = (IB_ACCESS_LOCAL_WRITE |
+				       IB_ACCESS_REMOTE_WRITE |
+				       IB_ACCESS_REMOTE_READ);
+
+	return ib_create_fmr_pool(dev->pd, &fmr_param);
+}
+
+/**
+ * srp_destroy_fr_pool() - free the resources owned by a pool
+ * @pool: Fast registration pool to be destroyed.
+ */
+static void srp_destroy_fr_pool(struct srp_fr_pool *pool)
+{
+	int i;
+	struct srp_fr_desc *d;
+
+	if (!pool)
+		return;
+
+	for (i = 0, d = &pool->desc[0]; i < pool->size; i++, d++) {
+		if (d->frpl)
+			ib_free_fast_reg_page_list(d->frpl);
+		if (d->mr)
+			ib_dereg_mr(d->mr);
+	}
+	kfree(pool);
+}
+
+/**
+ * srp_create_fr_pool() - allocate and initialize a pool for fast registration
+ * @device:            IB device to allocate fast registration descriptors for.
+ * @pd:                Protection domain associated with the FR descriptors.
+ * @pool_size:         Number of descriptors to allocate.
+ * @max_page_list_len: Maximum fast registration work request page list length.
+ */
+static struct srp_fr_pool *srp_create_fr_pool(struct ib_device *device,
+					      struct ib_pd *pd, int pool_size,
+					      int max_page_list_len)
+{
+	struct srp_fr_pool *pool;
+	struct srp_fr_desc *d;
+	struct ib_mr *mr;
+	struct ib_fast_reg_page_list *frpl;
+	int i, ret = -EINVAL;
+
+	if (pool_size <= 0)
+		goto err;
+	ret = -ENOMEM;
+	pool = kzalloc(sizeof(struct srp_fr_pool) +
+		       pool_size * sizeof(struct srp_fr_desc), GFP_KERNEL);
+	if (!pool)
+		goto err;
+	pool->size = pool_size;
+	pool->max_page_list_len = max_page_list_len;
+	spin_lock_init(&pool->lock);
+	INIT_LIST_HEAD(&pool->free_list);
+
+	for (i = 0, d = &pool->desc[0]; i < pool->size; i++, d++) {
+		mr = ib_alloc_fast_reg_mr(pd, max_page_list_len);
+		if (IS_ERR(mr)) {
+			ret = PTR_ERR(mr);
+			goto destroy_pool;
+		}
+		d->mr = mr;
+		frpl = ib_alloc_fast_reg_page_list(device, max_page_list_len);
+		if (IS_ERR(frpl)) {
+			ret = PTR_ERR(frpl);
+			goto destroy_pool;
+		}
+		d->frpl = frpl;
+		list_add_tail(&d->entry, &pool->free_list);
+	}
+
+out:
+	return pool;
+
+destroy_pool:
+	srp_destroy_fr_pool(pool);
+
+err:
+	pool = ERR_PTR(ret);
+	goto out;
+}
+
+/**
+ * srp_fr_pool_get() - obtain a descriptor suitable for fast registration
+ * @pool: Pool to obtain descriptor from.
+ */
+static struct srp_fr_desc *srp_fr_pool_get(struct srp_fr_pool *pool)
+{
+	struct srp_fr_desc *d = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&pool->lock, flags);
+	if (!list_empty(&pool->free_list)) {
+		d = list_first_entry(&pool->free_list, typeof(*d), entry);
+		list_del(&d->entry);
+	}
+	spin_unlock_irqrestore(&pool->lock, flags);
+
+	return d;
+}
+
+/**
+ * srp_fr_pool_put() - put an FR descriptor back in the free list
+ * @pool: Pool the descriptor was allocated from.
+ * @desc: Pointer to an array of fast registration descriptor pointers.
+ * @n:    Number of descriptors to put back.
+ *
+ * Note: The caller must already have queued an invalidation request for
+ * desc->mr->rkey before calling this function.
+ */
+static void srp_fr_pool_put(struct srp_fr_pool *pool, struct srp_fr_desc **desc,
+			    int n)
+{
+	unsigned long flags;
+	int i;
+
+	spin_lock_irqsave(&pool->lock, flags);
+	for (i = 0; i < n; i++)
+		list_add(&desc[i]->entry, &pool->free_list);
+	spin_unlock_irqrestore(&pool->lock, flags);
+}
+
+static struct srp_fr_pool *srp_alloc_fr_pool(struct srp_target_port *target)
+{
+	struct srp_device *dev = target->srp_host->srp_dev;
+
+	return srp_create_fr_pool(dev->dev, dev->pd,
+				  target->scsi_host->can_queue,
+				  dev->max_pages_per_mr);
+}
+
+static int srp_create_target_ib(struct srp_target_port *target)
+{
+	struct srp_device *dev = target->srp_host->srp_dev;
+	struct ib_qp_init_attr *init_attr;
+	struct ib_cq *recv_cq, *send_cq;
+	struct ib_qp *qp;
+	struct ib_fmr_pool *fmr_pool = NULL;
+	struct srp_fr_pool *fr_pool = NULL;
+	const int m = 1 + dev->use_fast_reg;
+	int ret;
+
+	init_attr = kzalloc(sizeof *init_attr, GFP_KERNEL);
+	if (!init_attr)
+		return -ENOMEM;
+
+	recv_cq = ib_create_cq(dev->dev, srp_recv_completion, NULL, target,
+			       target->queue_size, target->comp_vector);
+	if (IS_ERR(recv_cq)) {
+		ret = PTR_ERR(recv_cq);
+		goto err;
+	}
+
+	send_cq = ib_create_cq(dev->dev, srp_send_completion, NULL, target,
+			       m * target->queue_size, target->comp_vector);
+	if (IS_ERR(send_cq)) {
+		ret = PTR_ERR(send_cq);
+		goto err_recv_cq;
+	}
+
+	ib_req_notify_cq(recv_cq, IB_CQ_NEXT_COMP);
+
+	init_attr->event_handler       = srp_qp_event;
+	init_attr->cap.max_send_wr     = m * target->queue_size;
+	init_attr->cap.max_recv_wr     = target->queue_size;
+	init_attr->cap.max_recv_sge    = 1;
+	init_attr->cap.max_send_sge    = 1;
+	init_attr->sq_sig_type         = IB_SIGNAL_REQ_WR;
+	init_attr->qp_type             = IB_QPT_RC;
+	init_attr->send_cq             = send_cq;
+	init_attr->recv_cq             = recv_cq;
+
+	qp = ib_create_qp(dev->pd, init_attr);
+	if (IS_ERR(qp)) {
+		ret = PTR_ERR(qp);
+		goto err_send_cq;
+	}
+
+	ret = srp_init_qp(target, qp);
+	if (ret)
+		goto err_qp;
+
+	if (dev->use_fast_reg && dev->has_fr) {
+		fr_pool = srp_alloc_fr_pool(target);
+		if (IS_ERR(fr_pool)) {
+			ret = PTR_ERR(fr_pool);
+			shost_printk(KERN_WARNING, target->scsi_host, PFX
+				     "FR pool allocation failed (%d)\n", ret);
+			goto err_qp;
+		}
+		if (target->fr_pool)
+			srp_destroy_fr_pool(target->fr_pool);
+		target->fr_pool = fr_pool;
+	} else if (!dev->use_fast_reg && dev->has_fmr) {
+		fmr_pool = srp_alloc_fmr_pool(target);
+		if (IS_ERR(fmr_pool)) {
+			ret = PTR_ERR(fmr_pool);
+			shost_printk(KERN_WARNING, target->scsi_host, PFX
+				     "FMR pool allocation failed (%d)\n", ret);
+			goto err_qp;
+		}
+		if (target->fmr_pool)
+			ib_destroy_fmr_pool(target->fmr_pool);
+		target->fmr_pool = fmr_pool;
+	}
+
+	if (target->qp)
+		ib_destroy_qp(target->qp);
+	if (target->recv_cq)
+		ib_destroy_cq(target->recv_cq);
+	if (target->send_cq)
+		ib_destroy_cq(target->send_cq);
+
+	target->qp = qp;
+	target->recv_cq = recv_cq;
+	target->send_cq = send_cq;
+
+	kfree(init_attr);
+	return 0;
+
+err_qp:
+	ib_destroy_qp(qp);
+
+err_send_cq:
+	ib_destroy_cq(send_cq);
+
+err_recv_cq:
+	ib_destroy_cq(recv_cq);
+
+err:
+	kfree(init_attr);
+	return ret;
+}
+
+/*
+ * Note: this function may be called without srp_alloc_iu_bufs() having been
+ * invoked. Hence the target->[rt]x_ring checks.
+ */
+static void srp_free_target_ib(struct srp_target_port *target)
+{
+	struct srp_device *dev = target->srp_host->srp_dev;
+	int i;
+
+	if (dev->use_fast_reg) {
+		if (target->fr_pool)
+			srp_destroy_fr_pool(target->fr_pool);
+	} else {
+		if (target->fmr_pool)
+			ib_destroy_fmr_pool(target->fmr_pool);
+	}
+	ib_destroy_qp(target->qp);
+	ib_destroy_cq(target->send_cq);
+	ib_destroy_cq(target->recv_cq);
+
+	target->qp = NULL;
+	target->send_cq = target->recv_cq = NULL;
+
+	if (target->rx_ring) {
+		for (i = 0; i < target->queue_size; ++i)
+			srp_free_iu(target->srp_host, target->rx_ring[i]);
+		kfree(target->rx_ring);
+		target->rx_ring = NULL;
+	}
+	if (target->tx_ring) {
+		for (i = 0; i < target->queue_size; ++i)
+			srp_free_iu(target->srp_host, target->tx_ring[i]);
+		kfree(target->tx_ring);
+		target->tx_ring = NULL;
+	}
+}
+
+static void srp_path_rec_completion(int status,
+				    struct ib_sa_path_rec *pathrec,
+				    void *target_ptr)
+{
+	struct srp_target_port *target = target_ptr;
+
+	target->status = status;
+	if (status)
+		shost_printk(KERN_ERR, target->scsi_host,
+			     PFX "Got failed path rec status %d\n", status);
+	else
+		target->path = *pathrec;
+	complete(&target->done);
+}
+
+static int srp_lookup_path(struct srp_target_port *target)
+{
+	int ret;
+
+	target->path.numb_path = 1;
+
+	init_completion(&target->done);
+
+	target->path_query_id = ib_sa_path_rec_get(&srp_sa_client,
+						   target->srp_host->srp_dev->dev,
+						   target->srp_host->port,
+						   &target->path,
+						   IB_SA_PATH_REC_SERVICE_ID	|
+						   IB_SA_PATH_REC_DGID		|
+						   IB_SA_PATH_REC_SGID		|
+						   IB_SA_PATH_REC_NUMB_PATH	|
+						   IB_SA_PATH_REC_PKEY,
+						   SRP_PATH_REC_TIMEOUT_MS,
+						   GFP_KERNEL,
+						   srp_path_rec_completion,
+						   target, &target->path_query);
+	if (target->path_query_id < 0)
+		return target->path_query_id;
+
+	ret = wait_for_completion_interruptible(&target->done);
+	if (ret < 0)
+		return ret;
+
+	if (target->status < 0)
+		shost_printk(KERN_WARNING, target->scsi_host,
+			     PFX "Path record query failed\n");
+
+	return target->status;
+}
+
+static int srp_send_req(struct srp_target_port *target)
+{
+	struct {
+		struct ib_cm_req_param param;
+		struct srp_login_req   priv;
+	} *req = NULL;
+	int status;
+
+	req = kzalloc(sizeof *req, GFP_KERNEL);
+	if (!req)
+		return -ENOMEM;
+
+	req->param.primary_path 	      = &target->path;
+	req->param.alternate_path 	      = NULL;
+	req->param.service_id 		      = target->service_id;
+	req->param.qp_num 		      = target->qp->qp_num;
+	req->param.qp_type 		      = target->qp->qp_type;
+	req->param.private_data 	      = &req->priv;
+	req->param.private_data_len 	      = sizeof req->priv;
+	req->param.flow_control 	      = 1;
+
+	get_random_bytes(&req->param.starting_psn, 4);
+	req->param.starting_psn 	     &= 0xffffff;
+
+	/*
+	 * Pick some arbitrary defaults here; we could make these
+	 * module parameters if anyone cared about setting them.
+	 */
+	req->param.responder_resources	      = 4;
+	req->param.remote_cm_response_timeout = 20;
+	req->param.local_cm_response_timeout  = 20;
+	req->param.retry_count                = target->tl_retry_count;
+	req->param.rnr_retry_count 	      = 7;
+	req->param.max_cm_retries 	      = 15;
+
+	req->priv.opcode     	= SRP_LOGIN_REQ;
+	req->priv.tag        	= 0;
+	req->priv.req_it_iu_len = cpu_to_be32(target->max_iu_len);
+	req->priv.req_buf_fmt 	= cpu_to_be16(SRP_BUF_FORMAT_DIRECT |
+					      SRP_BUF_FORMAT_INDIRECT);
+	/*
+	 * In the published SRP specification (draft rev. 16a), the
+	 * port identifier format is 8 bytes of ID extension followed
+	 * by 8 bytes of GUID.  Older drafts put the two halves in the
+	 * opposite order, so that the GUID comes first.
+	 *
+	 * Targets conforming to these obsolete drafts can be
+	 * recognized by the I/O Class they report.
+	 */
+	if (target->io_class == SRP_REV10_IB_IO_CLASS) {
+		memcpy(req->priv.initiator_port_id,
+		       &target->path.sgid.global.interface_id, 8);
+		memcpy(req->priv.initiator_port_id + 8,
+		       &target->initiator_ext, 8);
+		memcpy(req->priv.target_port_id,     &target->ioc_guid, 8);
+		memcpy(req->priv.target_port_id + 8, &target->id_ext, 8);
+	} else {
+		memcpy(req->priv.initiator_port_id,
+		       &target->initiator_ext, 8);
+		memcpy(req->priv.initiator_port_id + 8,
+		       &target->path.sgid.global.interface_id, 8);
+		memcpy(req->priv.target_port_id,     &target->id_ext, 8);
+		memcpy(req->priv.target_port_id + 8, &target->ioc_guid, 8);
+	}
+
+	/*
+	 * Topspin/Cisco SRP targets will reject our login unless we
+	 * zero out the first 8 bytes of our initiator port ID and set
+	 * the second 8 bytes to the local node GUID.
+	 */
+	if (srp_target_is_topspin(target)) {
+		shost_printk(KERN_DEBUG, target->scsi_host,
+			     PFX "Topspin/Cisco initiator port ID workaround "
+			     "activated for target GUID %016llx\n",
+			     (unsigned long long) be64_to_cpu(target->ioc_guid));
+		memset(req->priv.initiator_port_id, 0, 8);
+		memcpy(req->priv.initiator_port_id + 8,
+		       &target->srp_host->srp_dev->dev->node_guid, 8);
+	}
+
+	status = ib_send_cm_req(target->cm_id, &req->param);
+
+	kfree(req);
+
+	return status;
+}
+
+static bool srp_queue_remove_work(struct srp_target_port *target)
+{
+	bool changed = false;
+
+	spin_lock_irq(&target->lock);
+	if (target->state != SRP_TARGET_REMOVED) {
+		target->state = SRP_TARGET_REMOVED;
+		changed = true;
+	}
+	spin_unlock_irq(&target->lock);
+
+	if (changed)
+		queue_work(srp_remove_wq, &target->remove_work);
+
+	return changed;
+}
+
+static bool srp_change_conn_state(struct srp_target_port *target,
+				  bool connected)
+{
+	bool changed = false;
+
+	spin_lock_irq(&target->lock);
+	if (target->connected != connected) {
+		target->connected = connected;
+		changed = true;
+	}
+	spin_unlock_irq(&target->lock);
+
+	return changed;
+}
+
+static void srp_disconnect_target(struct srp_target_port *target)
+{
+	if (srp_change_conn_state(target, false)) {
+		/* XXX should send SRP_I_LOGOUT request */
+
+		if (ib_send_cm_dreq(target->cm_id, NULL, 0)) {
+			shost_printk(KERN_DEBUG, target->scsi_host,
+				     PFX "Sending CM DREQ failed\n");
+		}
+	}
+}
+
+static void srp_free_req_data(struct srp_target_port *target)
+{
+	struct srp_device *dev = target->srp_host->srp_dev;
+	struct ib_device *ibdev = dev->dev;
+	struct srp_request *req;
+	int i;
+
+	if (!target->req_ring)
+		return;
+
+	for (i = 0; i < target->req_ring_size; ++i) {
+		req = &target->req_ring[i];
+		if (dev->use_fast_reg)
+			kfree(req->fr_list);
+		else
+			kfree(req->fmr_list);
+		kfree(req->map_page);
+		if (req->indirect_dma_addr) {
+			ib_dma_unmap_single(ibdev, req->indirect_dma_addr,
+					    target->indirect_size,
+					    DMA_TO_DEVICE);
+		}
+		kfree(req->indirect_desc);
+	}
+
+	kfree(target->req_ring);
+	target->req_ring = NULL;
+}
+
+static int srp_alloc_req_data(struct srp_target_port *target)
+{
+	struct srp_device *srp_dev = target->srp_host->srp_dev;
+	struct ib_device *ibdev = srp_dev->dev;
+	struct srp_request *req;
+	void *mr_list;
+	dma_addr_t dma_addr;
+	int i, ret = -ENOMEM;
+
+	INIT_LIST_HEAD(&target->free_reqs);
+
+	target->req_ring = kzalloc(target->req_ring_size *
+				   sizeof(*target->req_ring), GFP_KERNEL);
+	if (!target->req_ring)
+		goto out;
+
+	for (i = 0; i < target->req_ring_size; ++i) {
+		req = &target->req_ring[i];
+		mr_list = kmalloc(target->cmd_sg_cnt * sizeof(void *),
+				  GFP_KERNEL);
+		if (!mr_list)
+			goto out;
+		if (srp_dev->use_fast_reg)
+			req->fr_list = mr_list;
+		else
+			req->fmr_list = mr_list;
+		req->map_page = kmalloc(srp_dev->max_pages_per_mr *
+					sizeof(void *), GFP_KERNEL);
+		if (!req->map_page)
+			goto out;
+		req->indirect_desc = kmalloc(target->indirect_size, GFP_KERNEL);
+		if (!req->indirect_desc)
+			goto out;
+
+		dma_addr = ib_dma_map_single(ibdev, req->indirect_desc,
+					     target->indirect_size,
+					     DMA_TO_DEVICE);
+		if (ib_dma_mapping_error(ibdev, dma_addr))
+			goto out;
+
+		req->indirect_dma_addr = dma_addr;
+		req->index = i;
+		list_add_tail(&req->list, &target->free_reqs);
+	}
+	ret = 0;
+
+out:
+	return ret;
+}
+
+/**
+ * srp_del_scsi_host_attr() - Remove attributes defined in the host template.
+ * @shost: SCSI host whose attributes to remove from sysfs.
+ *
+ * Note: Any attributes defined in the host template and that did not exist
+ * before invocation of this function will be ignored.
+ */
+static void srp_del_scsi_host_attr(struct Scsi_Host *shost)
+{
+	struct device_attribute **attr;
+
+	for (attr = shost->hostt->shost_attrs; attr && *attr; ++attr)
+		device_remove_file(&shost->shost_dev, *attr);
+}
+
+static void srp_remove_target(struct srp_target_port *target)
+{
+	WARN_ON_ONCE(target->state != SRP_TARGET_REMOVED);
+
+	srp_del_scsi_host_attr(target->scsi_host);
+	srp_rport_get(target->rport);
+	srp_remove_host(target->scsi_host);
+	scsi_remove_host(target->scsi_host);
+	srp_stop_rport_timers(target->rport);
+	srp_disconnect_target(target);
+	ib_destroy_cm_id(target->cm_id);
+	srp_free_target_ib(target);
+	cancel_work_sync(&target->tl_err_work);
+	srp_rport_put(target->rport);
+	srp_free_req_data(target);
+
+	spin_lock(&target->srp_host->target_lock);
+	list_del(&target->list);
+	spin_unlock(&target->srp_host->target_lock);
+
+	scsi_host_put(target->scsi_host);
+}
+
+static void srp_remove_work(struct work_struct *work)
+{
+	struct srp_target_port *target =
+		container_of(work, struct srp_target_port, remove_work);
+
+	WARN_ON_ONCE(target->state != SRP_TARGET_REMOVED);
+
+	srp_remove_target(target);
+}
+
+static void srp_rport_delete(struct srp_rport *rport)
+{
+	struct srp_target_port *target = rport->lld_data;
+
+	srp_queue_remove_work(target);
+}
+
+static int srp_connect_target(struct srp_target_port *target)
+{
+	int retries = 3;
+	int ret;
+
+	WARN_ON_ONCE(target->connected);
+
+	target->qp_in_error = false;
+
+	ret = srp_lookup_path(target);
+	if (ret)
+		return ret;
+
+	while (1) {
+		init_completion(&target->done);
+		ret = srp_send_req(target);
+		if (ret)
+			return ret;
+		ret = wait_for_completion_interruptible(&target->done);
+		if (ret < 0)
+			return ret;
+
+		/*
+		 * The CM event handling code will set status to
+		 * SRP_PORT_REDIRECT if we get a port redirect REJ
+		 * back, or SRP_DLID_REDIRECT if we get a lid/qp
+		 * redirect REJ back.
+		 */
+		switch (target->status) {
+		case 0:
+			srp_change_conn_state(target, true);
+			return 0;
+
+		case SRP_PORT_REDIRECT:
+			ret = srp_lookup_path(target);
+			if (ret)
+				return ret;
+			break;
+
+		case SRP_DLID_REDIRECT:
+			break;
+
+		case SRP_STALE_CONN:
+			/* Our current CM id was stale, and is now in timewait.
+			 * Try to reconnect with a new one.
+			 */
+			if (!retries-- || srp_new_cm_id(target)) {
+				shost_printk(KERN_ERR, target->scsi_host, PFX
+					     "giving up on stale connection\n");
+				target->status = -ECONNRESET;
+				return target->status;
+			}
+
+			shost_printk(KERN_ERR, target->scsi_host, PFX
+				     "retrying stale connection\n");
+			break;
+
+		default:
+			return target->status;
+		}
+	}
+}
+
+static int srp_inv_rkey(struct srp_target_port *target, u32 rkey)
+{
+	struct ib_send_wr *bad_wr;
+	struct ib_send_wr wr = {
+		.opcode		    = IB_WR_LOCAL_INV,
+		.wr_id		    = LOCAL_INV_WR_ID_MASK,
+		.next		    = NULL,
+		.num_sge	    = 0,
+		.send_flags	    = 0,
+		.ex.invalidate_rkey = rkey,
+	};
+
+	return ib_post_send(target->qp, &wr, &bad_wr);
+}
+
+static void srp_unmap_data(struct scsi_cmnd *scmnd,
+			   struct srp_target_port *target,
+			   struct srp_request *req)
+{
+	struct srp_device *dev = target->srp_host->srp_dev;
+	struct ib_device *ibdev = dev->dev;
+	int i, res;
+
+	if (!scsi_sglist(scmnd) ||
+	    (scmnd->sc_data_direction != DMA_TO_DEVICE &&
+	     scmnd->sc_data_direction != DMA_FROM_DEVICE))
+		return;
+
+	if (dev->use_fast_reg) {
+		struct srp_fr_desc **pfr;
+
+		for (i = req->nmdesc, pfr = req->fr_list; i > 0; i--, pfr++) {
+			res = srp_inv_rkey(target, (*pfr)->mr->rkey);
+			if (res < 0) {
+				shost_printk(KERN_ERR, target->scsi_host, PFX
+				  "Queueing INV WR for rkey %#x failed (%d)\n",
+				  (*pfr)->mr->rkey, res);
+				queue_work(system_long_wq,
+					   &target->tl_err_work);
+			}
+		}
+		if (req->nmdesc)
+			srp_fr_pool_put(target->fr_pool, req->fr_list,
+					req->nmdesc);
+	} else {
+		struct ib_pool_fmr **pfmr;
+
+		for (i = req->nmdesc, pfmr = req->fmr_list; i > 0; i--, pfmr++)
+			ib_fmr_pool_unmap(*pfmr);
+	}
+
+	ib_dma_unmap_sg(ibdev, scsi_sglist(scmnd), scsi_sg_count(scmnd),
+			scmnd->sc_data_direction);
+}
+
+/**
+ * srp_claim_req - Take ownership of the scmnd associated with a request.
+ * @target: SRP target port.
+ * @req: SRP request.
+ * @sdev: If not NULL, only take ownership for this SCSI device.
+ * @scmnd: If NULL, take ownership of @req->scmnd. If not NULL, only take
+ *         ownership of @req->scmnd if it equals @scmnd.
+ *
+ * Return value:
+ * Either NULL or a pointer to the SCSI command the caller became owner of.
+ */
+static struct scsi_cmnd *srp_claim_req(struct srp_target_port *target,
+				       struct srp_request *req,
+				       struct scsi_device *sdev,
+				       struct scsi_cmnd *scmnd)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&target->lock, flags);
+	if (req->scmnd &&
+	    (!sdev || req->scmnd->device == sdev) &&
+	    (!scmnd || req->scmnd == scmnd)) {
+		scmnd = req->scmnd;
+		req->scmnd = NULL;
+	} else {
+		scmnd = NULL;
+	}
+	spin_unlock_irqrestore(&target->lock, flags);
+
+	return scmnd;
+}
+
+/**
+ * srp_free_req() - Unmap data and add request to the free request list.
+ * @target: SRP target port.
+ * @req:    Request to be freed.
+ * @scmnd:  SCSI command associated with @req.
+ * @req_lim_delta: Amount to be added to @target->req_lim.
+ */
+static void srp_free_req(struct srp_target_port *target,
+			 struct srp_request *req, struct scsi_cmnd *scmnd,
+			 s32 req_lim_delta)
+{
+	unsigned long flags;
+
+	srp_unmap_data(scmnd, target, req);
+
+	spin_lock_irqsave(&target->lock, flags);
+	target->req_lim += req_lim_delta;
+	list_add_tail(&req->list, &target->free_reqs);
+	spin_unlock_irqrestore(&target->lock, flags);
+}
+
+static void srp_finish_req(struct srp_target_port *target,
+			   struct srp_request *req, struct scsi_device *sdev,
+			   int result)
+{
+	struct scsi_cmnd *scmnd = srp_claim_req(target, req, sdev, NULL);
+
+	if (scmnd) {
+		srp_free_req(target, req, scmnd, 0);
+		scmnd->result = result;
+		scmnd->scsi_done(scmnd);
+	}
+}
+
+static void srp_terminate_io(struct srp_rport *rport)
+{
+	struct srp_target_port *target = rport->lld_data;
+	struct Scsi_Host *shost = target->scsi_host;
+	struct scsi_device *sdev;
+	int i;
+
+	/*
+	 * Invoking srp_terminate_io() while srp_queuecommand() is running
+	 * is not safe. Hence the warning statement below.
+	 */
+	shost_for_each_device(sdev, shost)
+		WARN_ON_ONCE(sdev->request_queue->request_fn_active);
+
+	for (i = 0; i < target->req_ring_size; ++i) {
+		struct srp_request *req = &target->req_ring[i];
+		srp_finish_req(target, req, NULL, DID_TRANSPORT_FAILFAST << 16);
+	}
+}
+
+/*
+ * It is up to the caller to ensure that srp_rport_reconnect() calls are
+ * serialized and that no concurrent srp_queuecommand(), srp_abort(),
+ * srp_reset_device() or srp_reset_host() calls will occur while this function
+ * is in progress. One way to realize that is not to call this function
+ * directly but to call srp_reconnect_rport() instead since that last function
+ * serializes calls of this function via rport->mutex and also blocks
+ * srp_queuecommand() calls before invoking this function.
+ */
+static int srp_rport_reconnect(struct srp_rport *rport)
+{
+	struct srp_target_port *target = rport->lld_data;
+	int i, ret;
+
+	srp_disconnect_target(target);
+	/*
+	 * Now get a new local CM ID so that we avoid confusing the target in
+	 * case things are really fouled up. Doing so also ensures that all CM
+	 * callbacks will have finished before a new QP is allocated.
+	 */
+	ret = srp_new_cm_id(target);
+
+	for (i = 0; i < target->req_ring_size; ++i) {
+		struct srp_request *req = &target->req_ring[i];
+		srp_finish_req(target, req, NULL, DID_RESET << 16);
+	}
+
+	/*
+	 * Whether or not creating a new CM ID succeeded, create a new
+	 * QP. This guarantees that all callback functions for the old QP have
+	 * finished before any send requests are posted on the new QP.
+	 */
+	ret += srp_create_target_ib(target);
+
+	INIT_LIST_HEAD(&target->free_tx);
+	for (i = 0; i < target->queue_size; ++i)
+		list_add(&target->tx_ring[i]->list, &target->free_tx);
+
+	if (ret == 0)
+		ret = srp_connect_target(target);
+
+	if (ret == 0)
+		shost_printk(KERN_INFO, target->scsi_host,
+			     PFX "reconnect succeeded\n");
+
+	return ret;
+}
+
+static void srp_map_desc(struct srp_map_state *state, dma_addr_t dma_addr,
+			 unsigned int dma_len, u32 rkey)
+{
+	struct srp_direct_buf *desc = state->desc;
+
+	desc->va = cpu_to_be64(dma_addr);
+	desc->key = cpu_to_be32(rkey);
+	desc->len = cpu_to_be32(dma_len);
+
+	state->total_len += dma_len;
+	state->desc++;
+	state->ndesc++;
+}
+
+static int srp_map_finish_fmr(struct srp_map_state *state,
+			      struct srp_target_port *target)
+{
+	struct ib_pool_fmr *fmr;
+	u64 io_addr = 0;
+
+	fmr = ib_fmr_pool_map_phys(target->fmr_pool, state->pages,
+				   state->npages, io_addr);
+	if (IS_ERR(fmr))
+		return PTR_ERR(fmr);
+
+	*state->next_fmr++ = fmr;
+	state->nmdesc++;
+
+	srp_map_desc(state, 0, state->dma_len, fmr->fmr->rkey);
+
+	return 0;
+}
+
+static int srp_map_finish_fr(struct srp_map_state *state,
+			     struct srp_target_port *target)
+{
+	struct srp_device *dev = target->srp_host->srp_dev;
+	struct ib_send_wr *bad_wr;
+	struct ib_send_wr wr;
+	struct srp_fr_desc *desc;
+	u32 rkey;
+
+	desc = srp_fr_pool_get(target->fr_pool);
+	if (!desc)
+		return -ENOMEM;
+
+	rkey = ib_inc_rkey(desc->mr->rkey);
+	ib_update_fast_reg_key(desc->mr, rkey);
+
+	memcpy(desc->frpl->page_list, state->pages,
+	       sizeof(state->pages[0]) * state->npages);
+
+	memset(&wr, 0, sizeof(wr));
+	wr.opcode = IB_WR_FAST_REG_MR;
+	wr.wr_id = FAST_REG_WR_ID_MASK;
+	wr.wr.fast_reg.iova_start = state->base_dma_addr;
+	wr.wr.fast_reg.page_list = desc->frpl;
+	wr.wr.fast_reg.page_list_len = state->npages;
+	wr.wr.fast_reg.page_shift = ilog2(dev->mr_page_size);
+	wr.wr.fast_reg.length = state->dma_len;
+	wr.wr.fast_reg.access_flags = (IB_ACCESS_LOCAL_WRITE |
+				       IB_ACCESS_REMOTE_READ |
+				       IB_ACCESS_REMOTE_WRITE);
+	wr.wr.fast_reg.rkey = desc->mr->lkey;
+
+	*state->next_fr++ = desc;
+	state->nmdesc++;
+
+	srp_map_desc(state, state->base_dma_addr, state->dma_len,
+		     desc->mr->rkey);
+
+	return ib_post_send(target->qp, &wr, &bad_wr);
+}
+
+static int srp_finish_mapping(struct srp_map_state *state,
+			      struct srp_target_port *target)
+{
+	int ret = 0;
+
+	if (state->npages == 0)
+		return 0;
+
+	if (state->npages == 1 && !register_always)
+		srp_map_desc(state, state->base_dma_addr, state->dma_len,
+			     target->rkey);
+	else
+		ret = target->srp_host->srp_dev->use_fast_reg ?
+			srp_map_finish_fr(state, target) :
+			srp_map_finish_fmr(state, target);
+
+	if (ret == 0) {
+		state->npages = 0;
+		state->dma_len = 0;
+	}
+
+	return ret;
+}
+
+static void srp_map_update_start(struct srp_map_state *state,
+				 struct scatterlist *sg, int sg_index,
+				 dma_addr_t dma_addr)
+{
+	state->unmapped_sg = sg;
+	state->unmapped_index = sg_index;
+	state->unmapped_addr = dma_addr;
+}
+
+static int srp_map_sg_entry(struct srp_map_state *state,
+			    struct srp_target_port *target,
+			    struct scatterlist *sg, int sg_index,
+			    bool use_mr)
+{
+	struct srp_device *dev = target->srp_host->srp_dev;
+	struct ib_device *ibdev = dev->dev;
+	dma_addr_t dma_addr = ib_sg_dma_address(ibdev, sg);
+	unsigned int dma_len = ib_sg_dma_len(ibdev, sg);
+	unsigned int len;
+	int ret;
+
+	if (!dma_len)
+		return 0;
+
+	if (!use_mr) {
+		/*
+		 * Once we're in direct map mode for a request, we don't
+		 * go back to FMR or FR mode, so no need to update anything
+		 * other than the descriptor.
+		 */
+		srp_map_desc(state, dma_addr, dma_len, target->rkey);
+		return 0;
+	}
+
+	/*
+	 * Since not all RDMA HW drivers support non-zero page offsets for
+	 * FMR, if we start at an offset into a page, don't merge into the
+	 * current FMR mapping. Finish it out, and use the kernel's MR for
+	 * this sg entry.
+	 */
+	if ((!dev->use_fast_reg && dma_addr & ~dev->mr_page_mask) ||
+	    dma_len > dev->mr_max_size) {
+		ret = srp_finish_mapping(state, target);
+		if (ret)
+			return ret;
+
+		srp_map_desc(state, dma_addr, dma_len, target->rkey);
+		srp_map_update_start(state, NULL, 0, 0);
+		return 0;
+	}
+
+	/*
+	 * If this is the first sg that will be mapped via FMR or via FR, save
+	 * our position. We need to know the first unmapped entry, its index,
+	 * and the first unmapped address within that entry to be able to
+	 * restart mapping after an error.
+	 */
+	if (!state->unmapped_sg)
+		srp_map_update_start(state, sg, sg_index, dma_addr);
+
+	while (dma_len) {
+		unsigned offset = dma_addr & ~dev->mr_page_mask;
+		if (state->npages == dev->max_pages_per_mr || offset != 0) {
+			ret = srp_finish_mapping(state, target);
+			if (ret)
+				return ret;
+
+			srp_map_update_start(state, sg, sg_index, dma_addr);
+		}
+
+		len = min_t(unsigned int, dma_len, dev->mr_page_size - offset);
+
+		if (!state->npages)
+			state->base_dma_addr = dma_addr;
+		state->pages[state->npages++] = dma_addr & dev->mr_page_mask;
+		state->dma_len += len;
+		dma_addr += len;
+		dma_len -= len;
+	}
+
+	/*
+	 * If the last entry of the MR wasn't a full page, then we need to
+	 * close it out and start a new one -- we can only merge at page
+	 * boundries.
+	 */
+	ret = 0;
+	if (len != dev->mr_page_size) {
+		ret = srp_finish_mapping(state, target);
+		if (!ret)
+			srp_map_update_start(state, NULL, 0, 0);
+	}
+	return ret;
+}
+
+static int srp_map_sg(struct srp_map_state *state,
+		      struct srp_target_port *target, struct srp_request *req,
+		      struct scatterlist *scat, int count)
+{
+	struct srp_device *dev = target->srp_host->srp_dev;
+	struct ib_device *ibdev = dev->dev;
+	struct scatterlist *sg;
+	int i;
+	bool use_mr;
+
+	state->desc	= req->indirect_desc;
+	state->pages	= req->map_page;
+	if (dev->use_fast_reg) {
+		state->next_fr = req->fr_list;
+		use_mr = !!target->fr_pool;
+	} else {
+		state->next_fmr = req->fmr_list;
+		use_mr = !!target->fmr_pool;
+	}
+
+	for_each_sg(scat, sg, count, i) {
+		if (srp_map_sg_entry(state, target, sg, i, use_mr)) {
+			/*
+			 * Memory registration failed, so backtrack to the
+			 * first unmapped entry and continue on without using
+			 * memory registration.
+			 */
+			dma_addr_t dma_addr;
+			unsigned int dma_len;
+
+backtrack:
+			sg = state->unmapped_sg;
+			i = state->unmapped_index;
+
+			dma_addr = ib_sg_dma_address(ibdev, sg);
+			dma_len = ib_sg_dma_len(ibdev, sg);
+			dma_len -= (state->unmapped_addr - dma_addr);
+			dma_addr = state->unmapped_addr;
+			use_mr = false;
+			srp_map_desc(state, dma_addr, dma_len, target->rkey);
+		}
+	}
+
+	if (use_mr && srp_finish_mapping(state, target))
+		goto backtrack;
+
+	req->nmdesc = state->nmdesc;
+
+	return 0;
+}
+
+static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_target_port *target,
+			struct srp_request *req)
+{
+	struct scatterlist *scat;
+	struct srp_cmd *cmd = req->cmd->buf;
+	int len, nents, count;
+	struct srp_device *dev;
+	struct ib_device *ibdev;
+	struct srp_map_state state;
+	struct srp_indirect_buf *indirect_hdr;
+	u32 table_len;
+	u8 fmt;
+
+	if (!scsi_sglist(scmnd) || scmnd->sc_data_direction == DMA_NONE)
+		return sizeof (struct srp_cmd);
+
+	if (scmnd->sc_data_direction != DMA_FROM_DEVICE &&
+	    scmnd->sc_data_direction != DMA_TO_DEVICE) {
+		shost_printk(KERN_WARNING, target->scsi_host,
+			     PFX "Unhandled data direction %d\n",
+			     scmnd->sc_data_direction);
+		return -EINVAL;
+	}
+
+	nents = scsi_sg_count(scmnd);
+	scat  = scsi_sglist(scmnd);
+
+	dev = target->srp_host->srp_dev;
+	ibdev = dev->dev;
+
+	count = ib_dma_map_sg(ibdev, scat, nents, scmnd->sc_data_direction);
+	if (unlikely(count == 0))
+		return -EIO;
+
+	fmt = SRP_DATA_DESC_DIRECT;
+	len = sizeof (struct srp_cmd) +	sizeof (struct srp_direct_buf);
+
+	if (count == 1 && !register_always) {
+		/*
+		 * The midlayer only generated a single gather/scatter
+		 * entry, or DMA mapping coalesced everything to a
+		 * single entry.  So a direct descriptor along with
+		 * the DMA MR suffices.
+		 */
+		struct srp_direct_buf *buf = (void *) cmd->add_data;
+
+		buf->va  = cpu_to_be64(ib_sg_dma_address(ibdev, scat));
+		buf->key = cpu_to_be32(target->rkey);
+		buf->len = cpu_to_be32(ib_sg_dma_len(ibdev, scat));
+
+		req->nmdesc = 0;
+		goto map_complete;
+	}
+
+	/*
+	 * We have more than one scatter/gather entry, so build our indirect
+	 * descriptor table, trying to merge as many entries as we can.
+	 */
+	indirect_hdr = (void *) cmd->add_data;
+
+	ib_dma_sync_single_for_cpu(ibdev, req->indirect_dma_addr,
+				   target->indirect_size, DMA_TO_DEVICE);
+
+	memset(&state, 0, sizeof(state));
+	srp_map_sg(&state, target, req, scat, count);
+
+	/* We've mapped the request, now pull as much of the indirect
+	 * descriptor table as we can into the command buffer. If this
+	 * target is not using an external indirect table, we are
+	 * guaranteed to fit into the command, as the SCSI layer won't
+	 * give us more S/G entries than we allow.
+	 */
+	if (state.ndesc == 1) {
+		/*
+		 * Memory registration collapsed the sg-list into one entry,
+		 * so use a direct descriptor.
+		 */
+		struct srp_direct_buf *buf = (void *) cmd->add_data;
+
+		*buf = req->indirect_desc[0];
+		goto map_complete;
+	}
+
+	if (unlikely(target->cmd_sg_cnt < state.ndesc &&
+						!target->allow_ext_sg)) {
+		shost_printk(KERN_ERR, target->scsi_host,
+			     "Could not fit S/G list into SRP_CMD\n");
+		return -EIO;
+	}
+
+	count = min(state.ndesc, target->cmd_sg_cnt);
+	table_len = state.ndesc * sizeof (struct srp_direct_buf);
+
+	fmt = SRP_DATA_DESC_INDIRECT;
+	len = sizeof(struct srp_cmd) + sizeof (struct srp_indirect_buf);
+	len += count * sizeof (struct srp_direct_buf);
+
+	memcpy(indirect_hdr->desc_list, req->indirect_desc,
+	       count * sizeof (struct srp_direct_buf));
+
+	indirect_hdr->table_desc.va = cpu_to_be64(req->indirect_dma_addr);
+	indirect_hdr->table_desc.key = cpu_to_be32(target->rkey);
+	indirect_hdr->table_desc.len = cpu_to_be32(table_len);
+	indirect_hdr->len = cpu_to_be32(state.total_len);
+
+	if (scmnd->sc_data_direction == DMA_TO_DEVICE)
+		cmd->data_out_desc_cnt = count;
+	else
+		cmd->data_in_desc_cnt = count;
+
+	ib_dma_sync_single_for_device(ibdev, req->indirect_dma_addr, table_len,
+				      DMA_TO_DEVICE);
+
+map_complete:
+	if (scmnd->sc_data_direction == DMA_TO_DEVICE)
+		cmd->buf_fmt = fmt << 4;
+	else
+		cmd->buf_fmt = fmt;
+
+	return len;
+}
+
+/*
+ * Return an IU and possible credit to the free pool
+ */
+static void srp_put_tx_iu(struct srp_target_port *target, struct srp_iu *iu,
+			  enum srp_iu_type iu_type)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&target->lock, flags);
+	list_add(&iu->list, &target->free_tx);
+	if (iu_type != SRP_IU_RSP)
+		++target->req_lim;
+	spin_unlock_irqrestore(&target->lock, flags);
+}
+
+/*
+ * Must be called with target->lock held to protect req_lim and free_tx.
+ * If IU is not sent, it must be returned using srp_put_tx_iu().
+ *
+ * Note:
+ * An upper limit for the number of allocated information units for each
+ * request type is:
+ * - SRP_IU_CMD: SRP_CMD_SQ_SIZE, since the SCSI mid-layer never queues
+ *   more than Scsi_Host.can_queue requests.
+ * - SRP_IU_TSK_MGMT: SRP_TSK_MGMT_SQ_SIZE.
+ * - SRP_IU_RSP: 1, since a conforming SRP target never sends more than
+ *   one unanswered SRP request to an initiator.
+ */
+static struct srp_iu *__srp_get_tx_iu(struct srp_target_port *target,
+				      enum srp_iu_type iu_type)
+{
+	s32 rsv = (iu_type == SRP_IU_TSK_MGMT) ? 0 : SRP_TSK_MGMT_SQ_SIZE;
+	struct srp_iu *iu;
+
+	srp_send_completion(target->send_cq, target);
+
+	if (list_empty(&target->free_tx))
+		return NULL;
+
+	/* Initiator responses to target requests do not consume credits */
+	if (iu_type != SRP_IU_RSP) {
+		if (target->req_lim <= rsv) {
+			++target->zero_req_lim;
+			return NULL;
+		}
+
+		--target->req_lim;
+	}
+
+	iu = list_first_entry(&target->free_tx, struct srp_iu, list);
+	list_del(&iu->list);
+	return iu;
+}
+
+static int srp_post_send(struct srp_target_port *target,
+			 struct srp_iu *iu, int len)
+{
+	struct ib_sge list;
+	struct ib_send_wr wr, *bad_wr;
+
+	list.addr   = iu->dma;
+	list.length = len;
+	list.lkey   = target->lkey;
+
+	wr.next       = NULL;
+	wr.wr_id      = (uintptr_t) iu;
+	wr.sg_list    = &list;
+	wr.num_sge    = 1;
+	wr.opcode     = IB_WR_SEND;
+	wr.send_flags = IB_SEND_SIGNALED;
+
+	return ib_post_send(target->qp, &wr, &bad_wr);
+}
+
+static int srp_post_recv(struct srp_target_port *target, struct srp_iu *iu)
+{
+	struct ib_recv_wr wr, *bad_wr;
+	struct ib_sge list;
+
+	list.addr   = iu->dma;
+	list.length = iu->size;
+	list.lkey   = target->lkey;
+
+	wr.next     = NULL;
+	wr.wr_id    = (uintptr_t) iu;
+	wr.sg_list  = &list;
+	wr.num_sge  = 1;
+
+	return ib_post_recv(target->qp, &wr, &bad_wr);
+}
+
+static void srp_process_rsp(struct srp_target_port *target, struct srp_rsp *rsp)
+{
+	struct srp_request *req;
+	struct scsi_cmnd *scmnd;
+	unsigned long flags;
+
+	if (unlikely(rsp->tag & SRP_TAG_TSK_MGMT)) {
+		spin_lock_irqsave(&target->lock, flags);
+		target->req_lim += be32_to_cpu(rsp->req_lim_delta);
+		spin_unlock_irqrestore(&target->lock, flags);
+
+		target->tsk_mgmt_status = -1;
+		if (be32_to_cpu(rsp->resp_data_len) >= 4)
+			target->tsk_mgmt_status = rsp->data[3];
+		complete(&target->tsk_mgmt_done);
+	} else {
+		req = &target->req_ring[rsp->tag];
+		scmnd = srp_claim_req(target, req, NULL, NULL);
+		if (!scmnd) {
+			shost_printk(KERN_ERR, target->scsi_host,
+				     "Null scmnd for RSP w/tag %016llx\n",
+				     (unsigned long long) rsp->tag);
+
+			spin_lock_irqsave(&target->lock, flags);
+			target->req_lim += be32_to_cpu(rsp->req_lim_delta);
+			spin_unlock_irqrestore(&target->lock, flags);
+
+			return;
+		}
+		scmnd->result = rsp->status;
+
+		if (rsp->flags & SRP_RSP_FLAG_SNSVALID) {
+			memcpy(scmnd->sense_buffer, rsp->data +
+			       be32_to_cpu(rsp->resp_data_len),
+			       min_t(int, be32_to_cpu(rsp->sense_data_len),
+				     SCSI_SENSE_BUFFERSIZE));
+		}
+
+		if (unlikely(rsp->flags & SRP_RSP_FLAG_DIUNDER))
+			scsi_set_resid(scmnd, be32_to_cpu(rsp->data_in_res_cnt));
+		else if (unlikely(rsp->flags & SRP_RSP_FLAG_DIOVER))
+			scsi_set_resid(scmnd, -be32_to_cpu(rsp->data_in_res_cnt));
+		else if (unlikely(rsp->flags & SRP_RSP_FLAG_DOUNDER))
+			scsi_set_resid(scmnd, be32_to_cpu(rsp->data_out_res_cnt));
+		else if (unlikely(rsp->flags & SRP_RSP_FLAG_DOOVER))
+			scsi_set_resid(scmnd, -be32_to_cpu(rsp->data_out_res_cnt));
+
+		srp_free_req(target, req, scmnd,
+			     be32_to_cpu(rsp->req_lim_delta));
+
+		scmnd->host_scribble = NULL;
+		scmnd->scsi_done(scmnd);
+	}
+}
+
+static int srp_response_common(struct srp_target_port *target, s32 req_delta,
+			       void *rsp, int len)
+{
+	struct ib_device *dev = target->srp_host->srp_dev->dev;
+	unsigned long flags;
+	struct srp_iu *iu;
+	int err;
+
+	spin_lock_irqsave(&target->lock, flags);
+	target->req_lim += req_delta;
+	iu = __srp_get_tx_iu(target, SRP_IU_RSP);
+	spin_unlock_irqrestore(&target->lock, flags);
+
+	if (!iu) {
+		shost_printk(KERN_ERR, target->scsi_host, PFX
+			     "no IU available to send response\n");
+		return 1;
+	}
+
+	ib_dma_sync_single_for_cpu(dev, iu->dma, len, DMA_TO_DEVICE);
+	memcpy(iu->buf, rsp, len);
+	ib_dma_sync_single_for_device(dev, iu->dma, len, DMA_TO_DEVICE);
+
+	err = srp_post_send(target, iu, len);
+	if (err) {
+		shost_printk(KERN_ERR, target->scsi_host, PFX
+			     "unable to post response: %d\n", err);
+		srp_put_tx_iu(target, iu, SRP_IU_RSP);
+	}
+
+	return err;
+}
+
+static void srp_process_cred_req(struct srp_target_port *target,
+				 struct srp_cred_req *req)
+{
+	struct srp_cred_rsp rsp = {
+		.opcode = SRP_CRED_RSP,
+		.tag = req->tag,
+	};
+	s32 delta = be32_to_cpu(req->req_lim_delta);
+
+	if (srp_response_common(target, delta, &rsp, sizeof rsp))
+		shost_printk(KERN_ERR, target->scsi_host, PFX
+			     "problems processing SRP_CRED_REQ\n");
+}
+
+static void srp_process_aer_req(struct srp_target_port *target,
+				struct srp_aer_req *req)
+{
+	struct srp_aer_rsp rsp = {
+		.opcode = SRP_AER_RSP,
+		.tag = req->tag,
+	};
+	s32 delta = be32_to_cpu(req->req_lim_delta);
+
+	shost_printk(KERN_ERR, target->scsi_host, PFX
+		     "ignoring AER for LUN %llu\n", be64_to_cpu(req->lun));
+
+	if (srp_response_common(target, delta, &rsp, sizeof rsp))
+		shost_printk(KERN_ERR, target->scsi_host, PFX
+			     "problems processing SRP_AER_REQ\n");
+}
+
+static void srp_handle_recv(struct srp_target_port *target, struct ib_wc *wc)
+{
+	struct ib_device *dev = target->srp_host->srp_dev->dev;
+	struct srp_iu *iu = (struct srp_iu *) (uintptr_t) wc->wr_id;
+	int res;
+	u8 opcode;
+
+	ib_dma_sync_single_for_cpu(dev, iu->dma, target->max_ti_iu_len,
+				   DMA_FROM_DEVICE);
+
+	opcode = *(u8 *) iu->buf;
+
+	if (0) {
+		shost_printk(KERN_ERR, target->scsi_host,
+			     PFX "recv completion, opcode 0x%02x\n", opcode);
+		print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 8, 1,
+			       iu->buf, wc->byte_len, true);
+	}
+
+	switch (opcode) {
+	case SRP_RSP:
+		srp_process_rsp(target, iu->buf);
+		break;
+
+	case SRP_CRED_REQ:
+		srp_process_cred_req(target, iu->buf);
+		break;
+
+	case SRP_AER_REQ:
+		srp_process_aer_req(target, iu->buf);
+		break;
+
+	case SRP_T_LOGOUT:
+		/* XXX Handle target logout */
+		shost_printk(KERN_WARNING, target->scsi_host,
+			     PFX "Got target logout request\n");
+		break;
+
+	default:
+		shost_printk(KERN_WARNING, target->scsi_host,
+			     PFX "Unhandled SRP opcode 0x%02x\n", opcode);
+		break;
+	}
+
+	ib_dma_sync_single_for_device(dev, iu->dma, target->max_ti_iu_len,
+				      DMA_FROM_DEVICE);
+
+	res = srp_post_recv(target, iu);
+	if (res != 0)
+		shost_printk(KERN_ERR, target->scsi_host,
+			     PFX "Recv failed with error code %d\n", res);
+}
+
+/**
+ * srp_tl_err_work() - handle a transport layer error
+ * @work: Work structure embedded in an SRP target port.
+ *
+ * Note: This function may get invoked before the rport has been created,
+ * hence the target->rport test.
+ */
+static void srp_tl_err_work(struct work_struct *work)
+{
+	struct srp_target_port *target;
+
+	target = container_of(work, struct srp_target_port, tl_err_work);
+	if (target->rport)
+		srp_start_tl_fail_timers(target->rport);
+}
+
+static void srp_handle_qp_err(u64 wr_id, enum ib_wc_status wc_status,
+			      bool send_err, struct srp_target_port *target)
+{
+	if (target->connected && !target->qp_in_error) {
+		if (wr_id & LOCAL_INV_WR_ID_MASK) {
+			shost_printk(KERN_ERR, target->scsi_host, PFX
+				     "LOCAL_INV failed with status %d\n",
+				     wc_status);
+		} else if (wr_id & FAST_REG_WR_ID_MASK) {
+			shost_printk(KERN_ERR, target->scsi_host, PFX
+				     "FAST_REG_MR failed status %d\n",
+				     wc_status);
+		} else {
+			shost_printk(KERN_ERR, target->scsi_host,
+				     PFX "failed %s status %d for iu %p\n",
+				     send_err ? "send" : "receive",
+				     wc_status, (void *)(uintptr_t)wr_id);
+		}
+		queue_work(system_long_wq, &target->tl_err_work);
+	}
+	target->qp_in_error = true;
+}
+
+static void srp_recv_completion(struct ib_cq *cq, void *target_ptr)
+{
+	struct srp_target_port *target = target_ptr;
+	struct ib_wc wc;
+
+	ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+	while (ib_poll_cq(cq, 1, &wc) > 0) {
+		if (likely(wc.status == IB_WC_SUCCESS)) {
+			srp_handle_recv(target, &wc);
+		} else {
+			srp_handle_qp_err(wc.wr_id, wc.status, false, target);
+		}
+	}
+}
+
+static void srp_send_completion(struct ib_cq *cq, void *target_ptr)
+{
+	struct srp_target_port *target = target_ptr;
+	struct ib_wc wc;
+	struct srp_iu *iu;
+
+	while (ib_poll_cq(cq, 1, &wc) > 0) {
+		if (likely(wc.status == IB_WC_SUCCESS)) {
+			iu = (struct srp_iu *) (uintptr_t) wc.wr_id;
+			list_add(&iu->list, &target->free_tx);
+		} else {
+			srp_handle_qp_err(wc.wr_id, wc.status, true, target);
+		}
+	}
+}
+
+static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd)
+{
+	struct srp_target_port *target = host_to_target(shost);
+	struct srp_rport *rport = target->rport;
+	struct srp_request *req;
+	struct srp_iu *iu;
+	struct srp_cmd *cmd;
+	struct ib_device *dev;
+	unsigned long flags;
+	int len, ret;
+	const bool in_scsi_eh = !in_interrupt() && current == shost->ehandler;
+
+	/*
+	 * The SCSI EH thread is the only context from which srp_queuecommand()
+	 * can get invoked for blocked devices (SDEV_BLOCK /
+	 * SDEV_CREATED_BLOCK). Avoid racing with srp_reconnect_rport() by
+	 * locking the rport mutex if invoked from inside the SCSI EH.
+	 */
+	if (in_scsi_eh)
+		mutex_lock(&rport->mutex);
+
+	scmnd->result = srp_chkready(target->rport);
+	if (unlikely(scmnd->result))
+		goto err;
+
+	spin_lock_irqsave(&target->lock, flags);
+	iu = __srp_get_tx_iu(target, SRP_IU_CMD);
+	if (!iu)
+		goto err_unlock;
+
+	req = list_first_entry(&target->free_reqs, struct srp_request, list);
+	list_del(&req->list);
+	spin_unlock_irqrestore(&target->lock, flags);
+
+	dev = target->srp_host->srp_dev->dev;
+	ib_dma_sync_single_for_cpu(dev, iu->dma, target->max_iu_len,
+				   DMA_TO_DEVICE);
+
+	scmnd->host_scribble = (void *) req;
+
+	cmd = iu->buf;
+	memset(cmd, 0, sizeof *cmd);
+
+	cmd->opcode = SRP_CMD;
+	cmd->lun    = cpu_to_be64((u64) scmnd->device->lun << 48);
+	cmd->tag    = req->index;
+	memcpy(cmd->cdb, scmnd->cmnd, scmnd->cmd_len);
+
+	req->scmnd    = scmnd;
+	req->cmd      = iu;
+
+	len = srp_map_data(scmnd, target, req);
+	if (len < 0) {
+		shost_printk(KERN_ERR, target->scsi_host,
+			     PFX "Failed to map data (%d)\n", len);
+		/*
+		 * If we ran out of memory descriptors (-ENOMEM) because an
+		 * application is queuing many requests with more than
+		 * max_pages_per_mr sg-list elements, tell the SCSI mid-layer
+		 * to reduce queue depth temporarily.
+		 */
+		scmnd->result = len == -ENOMEM ?
+			DID_OK << 16 | QUEUE_FULL << 1 : DID_ERROR << 16;
+		goto err_iu;
+	}
+
+	ib_dma_sync_single_for_device(dev, iu->dma, target->max_iu_len,
+				      DMA_TO_DEVICE);
+
+	if (srp_post_send(target, iu, len)) {
+		shost_printk(KERN_ERR, target->scsi_host, PFX "Send failed\n");
+		goto err_unmap;
+	}
+
+	ret = 0;
+
+unlock_rport:
+	if (in_scsi_eh)
+		mutex_unlock(&rport->mutex);
+
+	return ret;
+
+err_unmap:
+	srp_unmap_data(scmnd, target, req);
+
+err_iu:
+	srp_put_tx_iu(target, iu, SRP_IU_CMD);
+
+	/*
+	 * Avoid that the loops that iterate over the request ring can
+	 * encounter a dangling SCSI command pointer.
+	 */
+	req->scmnd = NULL;
+
+	spin_lock_irqsave(&target->lock, flags);
+	list_add(&req->list, &target->free_reqs);
+
+err_unlock:
+	spin_unlock_irqrestore(&target->lock, flags);
+
+err:
+	if (scmnd->result) {
+		scmnd->scsi_done(scmnd);
+		ret = 0;
+	} else {
+		ret = SCSI_MLQUEUE_HOST_BUSY;
+	}
+
+	goto unlock_rport;
+}
+
+/*
+ * Note: the resources allocated in this function are freed in
+ * srp_free_target_ib().
+ */
+static int srp_alloc_iu_bufs(struct srp_target_port *target)
+{
+	int i;
+
+	target->rx_ring = kzalloc(target->queue_size * sizeof(*target->rx_ring),
+				  GFP_KERNEL);
+	if (!target->rx_ring)
+		goto err_no_ring;
+	target->tx_ring = kzalloc(target->queue_size * sizeof(*target->tx_ring),
+				  GFP_KERNEL);
+	if (!target->tx_ring)
+		goto err_no_ring;
+
+	for (i = 0; i < target->queue_size; ++i) {
+		target->rx_ring[i] = srp_alloc_iu(target->srp_host,
+						  target->max_ti_iu_len,
+						  GFP_KERNEL, DMA_FROM_DEVICE);
+		if (!target->rx_ring[i])
+			goto err;
+	}
+
+	for (i = 0; i < target->queue_size; ++i) {
+		target->tx_ring[i] = srp_alloc_iu(target->srp_host,
+						  target->max_iu_len,
+						  GFP_KERNEL, DMA_TO_DEVICE);
+		if (!target->tx_ring[i])
+			goto err;
+
+		list_add(&target->tx_ring[i]->list, &target->free_tx);
+	}
+
+	return 0;
+
+err:
+	for (i = 0; i < target->queue_size; ++i) {
+		srp_free_iu(target->srp_host, target->rx_ring[i]);
+		srp_free_iu(target->srp_host, target->tx_ring[i]);
+	}
+
+
+err_no_ring:
+	kfree(target->tx_ring);
+	target->tx_ring = NULL;
+	kfree(target->rx_ring);
+	target->rx_ring = NULL;
+
+	return -ENOMEM;
+}
+
+static uint32_t srp_compute_rq_tmo(struct ib_qp_attr *qp_attr, int attr_mask)
+{
+	uint64_t T_tr_ns, max_compl_time_ms;
+	uint32_t rq_tmo_jiffies;
+
+	/*
+	 * According to section 11.2.4.2 in the IBTA spec (Modify Queue Pair,
+	 * table 91), both the QP timeout and the retry count have to be set
+	 * for RC QP's during the RTR to RTS transition.
+	 */
+	WARN_ON_ONCE((attr_mask & (IB_QP_TIMEOUT | IB_QP_RETRY_CNT)) !=
+		     (IB_QP_TIMEOUT | IB_QP_RETRY_CNT));
+
+	/*
+	 * Set target->rq_tmo_jiffies to one second more than the largest time
+	 * it can take before an error completion is generated. See also
+	 * C9-140..142 in the IBTA spec for more information about how to
+	 * convert the QP Local ACK Timeout value to nanoseconds.
+	 */
+	T_tr_ns = 4096 * (1ULL << qp_attr->timeout);
+	max_compl_time_ms = qp_attr->retry_cnt * 4 * T_tr_ns;
+	do_div(max_compl_time_ms, NSEC_PER_MSEC);
+	rq_tmo_jiffies = msecs_to_jiffies(max_compl_time_ms + 1000);
+
+	return rq_tmo_jiffies;
+}
+
+static void srp_cm_rep_handler(struct ib_cm_id *cm_id,
+			       struct srp_login_rsp *lrsp,
+			       struct srp_target_port *target)
+{
+	struct ib_qp_attr *qp_attr = NULL;
+	int attr_mask = 0;
+	int ret;
+	int i;
+
+	if (lrsp->opcode == SRP_LOGIN_RSP) {
+		target->max_ti_iu_len = be32_to_cpu(lrsp->max_ti_iu_len);
+		target->req_lim       = be32_to_cpu(lrsp->req_lim_delta);
+
+		/*
+		 * Reserve credits for task management so we don't
+		 * bounce requests back to the SCSI mid-layer.
+		 */
+		target->scsi_host->can_queue
+			= min(target->req_lim - SRP_TSK_MGMT_SQ_SIZE,
+			      target->scsi_host->can_queue);
+		target->scsi_host->cmd_per_lun
+			= min_t(int, target->scsi_host->can_queue,
+				target->scsi_host->cmd_per_lun);
+	} else {
+		shost_printk(KERN_WARNING, target->scsi_host,
+			     PFX "Unhandled RSP opcode %#x\n", lrsp->opcode);
+		ret = -ECONNRESET;
+		goto error;
+	}
+
+	if (!target->rx_ring) {
+		ret = srp_alloc_iu_bufs(target);
+		if (ret)
+			goto error;
+	}
+
+	ret = -ENOMEM;
+	qp_attr = kmalloc(sizeof *qp_attr, GFP_KERNEL);
+	if (!qp_attr)
+		goto error;
+
+	qp_attr->qp_state = IB_QPS_RTR;
+	ret = ib_cm_init_qp_attr(cm_id, qp_attr, &attr_mask);
+	if (ret)
+		goto error_free;
+
+	ret = ib_modify_qp(target->qp, qp_attr, attr_mask);
+	if (ret)
+		goto error_free;
+
+	for (i = 0; i < target->queue_size; i++) {
+		struct srp_iu *iu = target->rx_ring[i];
+		ret = srp_post_recv(target, iu);
+		if (ret)
+			goto error_free;
+	}
+
+	qp_attr->qp_state = IB_QPS_RTS;
+	ret = ib_cm_init_qp_attr(cm_id, qp_attr, &attr_mask);
+	if (ret)
+		goto error_free;
+
+	target->rq_tmo_jiffies = srp_compute_rq_tmo(qp_attr, attr_mask);
+
+	ret = ib_modify_qp(target->qp, qp_attr, attr_mask);
+	if (ret)
+		goto error_free;
+
+	ret = ib_send_cm_rtu(cm_id, NULL, 0);
+
+error_free:
+	kfree(qp_attr);
+
+error:
+	target->status = ret;
+}
+
+static void srp_cm_rej_handler(struct ib_cm_id *cm_id,
+			       struct ib_cm_event *event,
+			       struct srp_target_port *target)
+{
+	struct Scsi_Host *shost = target->scsi_host;
+	struct ib_class_port_info *cpi;
+	int opcode;
+
+	switch (event->param.rej_rcvd.reason) {
+	case IB_CM_REJ_PORT_CM_REDIRECT:
+		cpi = event->param.rej_rcvd.ari;
+		target->path.dlid = cpi->redirect_lid;
+		target->path.pkey = cpi->redirect_pkey;
+		cm_id->remote_cm_qpn = be32_to_cpu(cpi->redirect_qp) & 0x00ffffff;
+		memcpy(target->path.dgid.raw, cpi->redirect_gid, 16);
+
+		target->status = target->path.dlid ?
+			SRP_DLID_REDIRECT : SRP_PORT_REDIRECT;
+		break;
+
+	case IB_CM_REJ_PORT_REDIRECT:
+		if (srp_target_is_topspin(target)) {
+			/*
+			 * Topspin/Cisco SRP gateways incorrectly send
+			 * reject reason code 25 when they mean 24
+			 * (port redirect).
+			 */
+			memcpy(target->path.dgid.raw,
+			       event->param.rej_rcvd.ari, 16);
+
+			shost_printk(KERN_DEBUG, shost,
+				     PFX "Topspin/Cisco redirect to target port GID %016llx%016llx\n",
+				     (unsigned long long) be64_to_cpu(target->path.dgid.global.subnet_prefix),
+				     (unsigned long long) be64_to_cpu(target->path.dgid.global.interface_id));
+
+			target->status = SRP_PORT_REDIRECT;
+		} else {
+			shost_printk(KERN_WARNING, shost,
+				     "  REJ reason: IB_CM_REJ_PORT_REDIRECT\n");
+			target->status = -ECONNRESET;
+		}
+		break;
+
+	case IB_CM_REJ_DUPLICATE_LOCAL_COMM_ID:
+		shost_printk(KERN_WARNING, shost,
+			    "  REJ reason: IB_CM_REJ_DUPLICATE_LOCAL_COMM_ID\n");
+		target->status = -ECONNRESET;
+		break;
+
+	case IB_CM_REJ_CONSUMER_DEFINED:
+		opcode = *(u8 *) event->private_data;
+		if (opcode == SRP_LOGIN_REJ) {
+			struct srp_login_rej *rej = event->private_data;
+			u32 reason = be32_to_cpu(rej->reason);
+
+			if (reason == SRP_LOGIN_REJ_REQ_IT_IU_LENGTH_TOO_LARGE)
+				shost_printk(KERN_WARNING, shost,
+					     PFX "SRP_LOGIN_REJ: requested max_it_iu_len too large\n");
+			else
+				shost_printk(KERN_WARNING, shost, PFX
+					     "SRP LOGIN from %pI6 to %pI6 REJECTED, reason 0x%08x\n",
+					     target->path.sgid.raw,
+					     target->orig_dgid, reason);
+		} else
+			shost_printk(KERN_WARNING, shost,
+				     "  REJ reason: IB_CM_REJ_CONSUMER_DEFINED,"
+				     " opcode 0x%02x\n", opcode);
+		target->status = -ECONNRESET;
+		break;
+
+	case IB_CM_REJ_STALE_CONN:
+		shost_printk(KERN_WARNING, shost, "  REJ reason: stale connection\n");
+		target->status = SRP_STALE_CONN;
+		break;
+
+	default:
+		shost_printk(KERN_WARNING, shost, "  REJ reason 0x%x\n",
+			     event->param.rej_rcvd.reason);
+		target->status = -ECONNRESET;
+	}
+}
+
+static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
+{
+	struct srp_target_port *target = cm_id->context;
+	int comp = 0;
+
+	switch (event->event) {
+	case IB_CM_REQ_ERROR:
+		shost_printk(KERN_DEBUG, target->scsi_host,
+			     PFX "Sending CM REQ failed\n");
+		comp = 1;
+		target->status = -ECONNRESET;
+		break;
+
+	case IB_CM_REP_RECEIVED:
+		comp = 1;
+		srp_cm_rep_handler(cm_id, event->private_data, target);
+		break;
+
+	case IB_CM_REJ_RECEIVED:
+		shost_printk(KERN_DEBUG, target->scsi_host, PFX "REJ received\n");
+		comp = 1;
+
+		srp_cm_rej_handler(cm_id, event, target);
+		break;
+
+	case IB_CM_DREQ_RECEIVED:
+		shost_printk(KERN_WARNING, target->scsi_host,
+			     PFX "DREQ received - connection closed\n");
+		srp_change_conn_state(target, false);
+		if (ib_send_cm_drep(cm_id, NULL, 0))
+			shost_printk(KERN_ERR, target->scsi_host,
+				     PFX "Sending CM DREP failed\n");
+		queue_work(system_long_wq, &target->tl_err_work);
+		break;
+
+	case IB_CM_TIMEWAIT_EXIT:
+		shost_printk(KERN_ERR, target->scsi_host,
+			     PFX "connection closed\n");
+		comp = 1;
+
+		target->status = 0;
+		break;
+
+	case IB_CM_MRA_RECEIVED:
+	case IB_CM_DREQ_ERROR:
+	case IB_CM_DREP_RECEIVED:
+		break;
+
+	default:
+		shost_printk(KERN_WARNING, target->scsi_host,
+			     PFX "Unhandled CM event %d\n", event->event);
+		break;
+	}
+
+	if (comp)
+		complete(&target->done);
+
+	return 0;
+}
+
+/**
+ * srp_change_queue_type - changing device queue tag type
+ * @sdev: scsi device struct
+ * @tag_type: requested tag type
+ *
+ * Returns queue tag type.
+ */
+static int
+srp_change_queue_type(struct scsi_device *sdev, int tag_type)
+{
+	if (sdev->tagged_supported) {
+		scsi_set_tag_type(sdev, tag_type);
+		if (tag_type)
+			scsi_activate_tcq(sdev, sdev->queue_depth);
+		else
+			scsi_deactivate_tcq(sdev, sdev->queue_depth);
+	} else
+		tag_type = 0;
+
+	return tag_type;
+}
+
+/**
+ * srp_change_queue_depth - setting device queue depth
+ * @sdev: scsi device struct
+ * @qdepth: requested queue depth
+ * @reason: SCSI_QDEPTH_DEFAULT/SCSI_QDEPTH_QFULL/SCSI_QDEPTH_RAMP_UP
+ * (see include/scsi/scsi_host.h for definition)
+ *
+ * Returns queue depth.
+ */
+static int
+srp_change_queue_depth(struct scsi_device *sdev, int qdepth, int reason)
+{
+	struct Scsi_Host *shost = sdev->host;
+	int max_depth;
+	if (reason == SCSI_QDEPTH_DEFAULT || reason == SCSI_QDEPTH_RAMP_UP) {
+		max_depth = shost->can_queue;
+		if (!sdev->tagged_supported)
+			max_depth = 1;
+		if (qdepth > max_depth)
+			qdepth = max_depth;
+		scsi_adjust_queue_depth(sdev, scsi_get_tag_type(sdev), qdepth);
+	} else if (reason == SCSI_QDEPTH_QFULL)
+		scsi_track_queue_full(sdev, qdepth);
+	else
+		return -EOPNOTSUPP;
+
+	return sdev->queue_depth;
+}
+
+static int srp_send_tsk_mgmt(struct srp_target_port *target,
+			     u64 req_tag, unsigned int lun, u8 func)
+{
+	struct srp_rport *rport = target->rport;
+	struct ib_device *dev = target->srp_host->srp_dev->dev;
+	struct srp_iu *iu;
+	struct srp_tsk_mgmt *tsk_mgmt;
+
+	if (!target->connected || target->qp_in_error)
+		return -1;
+
+	init_completion(&target->tsk_mgmt_done);
+
+	/*
+	 * Lock the rport mutex to avoid that srp_create_target_ib() is
+	 * invoked while a task management function is being sent.
+	 */
+	mutex_lock(&rport->mutex);
+	spin_lock_irq(&target->lock);
+	iu = __srp_get_tx_iu(target, SRP_IU_TSK_MGMT);
+	spin_unlock_irq(&target->lock);
+
+	if (!iu) {
+		mutex_unlock(&rport->mutex);
+
+		return -1;
+	}
+
+	ib_dma_sync_single_for_cpu(dev, iu->dma, sizeof *tsk_mgmt,
+				   DMA_TO_DEVICE);
+	tsk_mgmt = iu->buf;
+	memset(tsk_mgmt, 0, sizeof *tsk_mgmt);
+
+	tsk_mgmt->opcode 	= SRP_TSK_MGMT;
+	tsk_mgmt->lun		= cpu_to_be64((u64) lun << 48);
+	tsk_mgmt->tag		= req_tag | SRP_TAG_TSK_MGMT;
+	tsk_mgmt->tsk_mgmt_func = func;
+	tsk_mgmt->task_tag	= req_tag;
+
+	ib_dma_sync_single_for_device(dev, iu->dma, sizeof *tsk_mgmt,
+				      DMA_TO_DEVICE);
+	if (srp_post_send(target, iu, sizeof *tsk_mgmt)) {
+		srp_put_tx_iu(target, iu, SRP_IU_TSK_MGMT);
+		mutex_unlock(&rport->mutex);
+
+		return -1;
+	}
+	mutex_unlock(&rport->mutex);
+
+	if (!wait_for_completion_timeout(&target->tsk_mgmt_done,
+					 msecs_to_jiffies(SRP_ABORT_TIMEOUT_MS)))
+		return -1;
+
+	return 0;
+}
+
+static int srp_abort(struct scsi_cmnd *scmnd)
+{
+	struct srp_target_port *target = host_to_target(scmnd->device->host);
+	struct srp_request *req = (struct srp_request *) scmnd->host_scribble;
+	int ret;
+
+	shost_printk(KERN_ERR, target->scsi_host, "SRP abort called\n");
+
+	if (!req || !srp_claim_req(target, req, NULL, scmnd))
+		return SUCCESS;
+	if (srp_send_tsk_mgmt(target, req->index, scmnd->device->lun,
+			      SRP_TSK_ABORT_TASK) == 0)
+		ret = SUCCESS;
+	else if (target->rport->state == SRP_RPORT_LOST)
+		ret = FAST_IO_FAIL;
+	else
+		ret = FAILED;
+	srp_free_req(target, req, scmnd, 0);
+	scmnd->result = DID_ABORT << 16;
+	scmnd->scsi_done(scmnd);
+
+	return ret;
+}
+
+static int srp_reset_device(struct scsi_cmnd *scmnd)
+{
+	struct srp_target_port *target = host_to_target(scmnd->device->host);
+	int i;
+
+	shost_printk(KERN_ERR, target->scsi_host, "SRP reset_device called\n");
+
+	if (srp_send_tsk_mgmt(target, SRP_TAG_NO_REQ, scmnd->device->lun,
+			      SRP_TSK_LUN_RESET))
+		return FAILED;
+	if (target->tsk_mgmt_status)
+		return FAILED;
+
+	for (i = 0; i < target->req_ring_size; ++i) {
+		struct srp_request *req = &target->req_ring[i];
+		srp_finish_req(target, req, scmnd->device, DID_RESET << 16);
+	}
+
+	return SUCCESS;
+}
+
+static int srp_reset_host(struct scsi_cmnd *scmnd)
+{
+	struct srp_target_port *target = host_to_target(scmnd->device->host);
+
+	shost_printk(KERN_ERR, target->scsi_host, PFX "SRP reset_host called\n");
+
+	return srp_reconnect_rport(target->rport) == 0 ? SUCCESS : FAILED;
+}
+
+static int srp_slave_configure(struct scsi_device *sdev)
+{
+	struct Scsi_Host *shost = sdev->host;
+	struct srp_target_port *target = host_to_target(shost);
+	struct request_queue *q = sdev->request_queue;
+	unsigned long timeout;
+
+	if (sdev->type == TYPE_DISK) {
+		timeout = max_t(unsigned, 30 * HZ, target->rq_tmo_jiffies);
+		blk_queue_rq_timeout(q, timeout);
+	}
+
+	return 0;
+}
+
+static ssize_t show_id_ext(struct device *dev, struct device_attribute *attr,
+			   char *buf)
+{
+	struct srp_target_port *target = host_to_target(class_to_shost(dev));
+
+	return sprintf(buf, "0x%016llx\n",
+		       (unsigned long long) be64_to_cpu(target->id_ext));
+}
+
+static ssize_t show_ioc_guid(struct device *dev, struct device_attribute *attr,
+			     char *buf)
+{
+	struct srp_target_port *target = host_to_target(class_to_shost(dev));
+
+	return sprintf(buf, "0x%016llx\n",
+		       (unsigned long long) be64_to_cpu(target->ioc_guid));
+}
+
+static ssize_t show_service_id(struct device *dev,
+			       struct device_attribute *attr, char *buf)
+{
+	struct srp_target_port *target = host_to_target(class_to_shost(dev));
+
+	return sprintf(buf, "0x%016llx\n",
+		       (unsigned long long) be64_to_cpu(target->service_id));
+}
+
+static ssize_t show_pkey(struct device *dev, struct device_attribute *attr,
+			 char *buf)
+{
+	struct srp_target_port *target = host_to_target(class_to_shost(dev));
+
+	return sprintf(buf, "0x%04x\n", be16_to_cpu(target->path.pkey));
+}
+
+static ssize_t show_sgid(struct device *dev, struct device_attribute *attr,
+			 char *buf)
+{
+	struct srp_target_port *target = host_to_target(class_to_shost(dev));
+
+	return sprintf(buf, "%pI6\n", target->path.sgid.raw);
+}
+
+static ssize_t show_dgid(struct device *dev, struct device_attribute *attr,
+			 char *buf)
+{
+	struct srp_target_port *target = host_to_target(class_to_shost(dev));
+
+	return sprintf(buf, "%pI6\n", target->path.dgid.raw);
+}
+
+static ssize_t show_orig_dgid(struct device *dev,
+			      struct device_attribute *attr, char *buf)
+{
+	struct srp_target_port *target = host_to_target(class_to_shost(dev));
+
+	return sprintf(buf, "%pI6\n", target->orig_dgid);
+}
+
+static ssize_t show_req_lim(struct device *dev,
+			    struct device_attribute *attr, char *buf)
+{
+	struct srp_target_port *target = host_to_target(class_to_shost(dev));
+
+	return sprintf(buf, "%d\n", target->req_lim);
+}
+
+static ssize_t show_zero_req_lim(struct device *dev,
+				 struct device_attribute *attr, char *buf)
+{
+	struct srp_target_port *target = host_to_target(class_to_shost(dev));
+
+	return sprintf(buf, "%d\n", target->zero_req_lim);
+}
+
+static ssize_t show_local_ib_port(struct device *dev,
+				  struct device_attribute *attr, char *buf)
+{
+	struct srp_target_port *target = host_to_target(class_to_shost(dev));
+
+	return sprintf(buf, "%d\n", target->srp_host->port);
+}
+
+static ssize_t show_local_ib_device(struct device *dev,
+				    struct device_attribute *attr, char *buf)
+{
+	struct srp_target_port *target = host_to_target(class_to_shost(dev));
+
+	return sprintf(buf, "%s\n", target->srp_host->srp_dev->dev->name);
+}
+
+static ssize_t show_comp_vector(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	struct srp_target_port *target = host_to_target(class_to_shost(dev));
+
+	return sprintf(buf, "%d\n", target->comp_vector);
+}
+
+static ssize_t show_tl_retry_count(struct device *dev,
+				   struct device_attribute *attr, char *buf)
+{
+	struct srp_target_port *target = host_to_target(class_to_shost(dev));
+
+	return sprintf(buf, "%d\n", target->tl_retry_count);
+}
+
+static ssize_t show_cmd_sg_entries(struct device *dev,
+				   struct device_attribute *attr, char *buf)
+{
+	struct srp_target_port *target = host_to_target(class_to_shost(dev));
+
+	return sprintf(buf, "%u\n", target->cmd_sg_cnt);
+}
+
+static ssize_t show_allow_ext_sg(struct device *dev,
+				 struct device_attribute *attr, char *buf)
+{
+	struct srp_target_port *target = host_to_target(class_to_shost(dev));
+
+	return sprintf(buf, "%s\n", target->allow_ext_sg ? "true" : "false");
+}
+
+static DEVICE_ATTR(id_ext,	    S_IRUGO, show_id_ext,	   NULL);
+static DEVICE_ATTR(ioc_guid,	    S_IRUGO, show_ioc_guid,	   NULL);
+static DEVICE_ATTR(service_id,	    S_IRUGO, show_service_id,	   NULL);
+static DEVICE_ATTR(pkey,	    S_IRUGO, show_pkey,		   NULL);
+static DEVICE_ATTR(sgid,	    S_IRUGO, show_sgid,		   NULL);
+static DEVICE_ATTR(dgid,	    S_IRUGO, show_dgid,		   NULL);
+static DEVICE_ATTR(orig_dgid,	    S_IRUGO, show_orig_dgid,	   NULL);
+static DEVICE_ATTR(req_lim,         S_IRUGO, show_req_lim,         NULL);
+static DEVICE_ATTR(zero_req_lim,    S_IRUGO, show_zero_req_lim,	   NULL);
+static DEVICE_ATTR(local_ib_port,   S_IRUGO, show_local_ib_port,   NULL);
+static DEVICE_ATTR(local_ib_device, S_IRUGO, show_local_ib_device, NULL);
+static DEVICE_ATTR(comp_vector,     S_IRUGO, show_comp_vector,     NULL);
+static DEVICE_ATTR(tl_retry_count,  S_IRUGO, show_tl_retry_count,  NULL);
+static DEVICE_ATTR(cmd_sg_entries,  S_IRUGO, show_cmd_sg_entries,  NULL);
+static DEVICE_ATTR(allow_ext_sg,    S_IRUGO, show_allow_ext_sg,    NULL);
+
+static struct device_attribute *srp_host_attrs[] = {
+	&dev_attr_id_ext,
+	&dev_attr_ioc_guid,
+	&dev_attr_service_id,
+	&dev_attr_pkey,
+	&dev_attr_sgid,
+	&dev_attr_dgid,
+	&dev_attr_orig_dgid,
+	&dev_attr_req_lim,
+	&dev_attr_zero_req_lim,
+	&dev_attr_local_ib_port,
+	&dev_attr_local_ib_device,
+	&dev_attr_comp_vector,
+	&dev_attr_tl_retry_count,
+	&dev_attr_cmd_sg_entries,
+	&dev_attr_allow_ext_sg,
+	NULL
+};
+
+static struct scsi_host_template srp_template = {
+	.module				= THIS_MODULE,
+	.name				= "InfiniBand SRP initiator",
+	.proc_name			= DRV_NAME,
+	.slave_configure		= srp_slave_configure,
+	.info				= srp_target_info,
+	.queuecommand			= srp_queuecommand,
+	.change_queue_depth             = srp_change_queue_depth,
+	.change_queue_type              = srp_change_queue_type,
+	.eh_abort_handler		= srp_abort,
+	.eh_device_reset_handler	= srp_reset_device,
+	.eh_host_reset_handler		= srp_reset_host,
+	.skip_settle_delay		= true,
+	.sg_tablesize			= SRP_DEF_SG_TABLESIZE,
+	.can_queue			= SRP_DEFAULT_CMD_SQ_SIZE,
+	.this_id			= -1,
+	.cmd_per_lun			= SRP_DEFAULT_CMD_SQ_SIZE,
+	.use_clustering			= ENABLE_CLUSTERING,
+	.shost_attrs			= srp_host_attrs
+};
+
+static int srp_add_target(struct srp_host *host, struct srp_target_port *target)
+{
+	struct srp_rport_identifiers ids;
+	struct srp_rport *rport;
+
+	sprintf(target->target_name, "SRP.T10:%016llX",
+		 (unsigned long long) be64_to_cpu(target->id_ext));
+
+	if (scsi_add_host(target->scsi_host, host->srp_dev->dev->dma_device))
+		return -ENODEV;
+
+	memcpy(ids.port_id, &target->id_ext, 8);
+	memcpy(ids.port_id + 8, &target->ioc_guid, 8);
+	ids.roles = SRP_RPORT_ROLE_TARGET;
+	rport = srp_rport_add(target->scsi_host, &ids);
+	if (IS_ERR(rport)) {
+		scsi_remove_host(target->scsi_host);
+		return PTR_ERR(rport);
+	}
+
+	rport->lld_data = target;
+	target->rport = rport;
+
+	spin_lock(&host->target_lock);
+	list_add_tail(&target->list, &host->target_list);
+	spin_unlock(&host->target_lock);
+
+	target->state = SRP_TARGET_LIVE;
+
+	scsi_scan_target(&target->scsi_host->shost_gendev,
+			 0, target->scsi_id, SCAN_WILD_CARD, 0);
+
+	return 0;
+}
+
+static void srp_release_dev(struct device *dev)
+{
+	struct srp_host *host =
+		container_of(dev, struct srp_host, dev);
+
+	complete(&host->released);
+}
+
+static struct class srp_class = {
+	.name    = "infiniband_srp",
+	.dev_release = srp_release_dev
+};
+
+/**
+ * srp_conn_unique() - check whether the connection to a target is unique
+ * @host:   SRP host.
+ * @target: SRP target port.
+ */
+static bool srp_conn_unique(struct srp_host *host,
+			    struct srp_target_port *target)
+{
+	struct srp_target_port *t;
+	bool ret = false;
+
+	if (target->state == SRP_TARGET_REMOVED)
+		goto out;
+
+	ret = true;
+
+	spin_lock(&host->target_lock);
+	list_for_each_entry(t, &host->target_list, list) {
+		if (t != target &&
+		    target->id_ext == t->id_ext &&
+		    target->ioc_guid == t->ioc_guid &&
+		    target->initiator_ext == t->initiator_ext) {
+			ret = false;
+			break;
+		}
+	}
+	spin_unlock(&host->target_lock);
+
+out:
+	return ret;
+}
+
+/*
+ * Target ports are added by writing
+ *
+ *     id_ext=<SRP ID ext>,ioc_guid=<SRP IOC GUID>,dgid=<dest GID>,
+ *     pkey=<P_Key>,service_id=<service ID>
+ *
+ * to the add_target sysfs attribute.
+ */
+enum {
+	SRP_OPT_ERR		= 0,
+	SRP_OPT_ID_EXT		= 1 << 0,
+	SRP_OPT_IOC_GUID	= 1 << 1,
+	SRP_OPT_DGID		= 1 << 2,
+	SRP_OPT_PKEY		= 1 << 3,
+	SRP_OPT_SERVICE_ID	= 1 << 4,
+	SRP_OPT_MAX_SECT	= 1 << 5,
+	SRP_OPT_MAX_CMD_PER_LUN	= 1 << 6,
+	SRP_OPT_IO_CLASS	= 1 << 7,
+	SRP_OPT_INITIATOR_EXT	= 1 << 8,
+	SRP_OPT_CMD_SG_ENTRIES	= 1 << 9,
+	SRP_OPT_ALLOW_EXT_SG	= 1 << 10,
+	SRP_OPT_SG_TABLESIZE	= 1 << 11,
+	SRP_OPT_COMP_VECTOR	= 1 << 12,
+	SRP_OPT_TL_RETRY_COUNT	= 1 << 13,
+	SRP_OPT_QUEUE_SIZE	= 1 << 14,
+	SRP_OPT_ALL		= (SRP_OPT_ID_EXT	|
+				   SRP_OPT_IOC_GUID	|
+				   SRP_OPT_DGID		|
+				   SRP_OPT_PKEY		|
+				   SRP_OPT_SERVICE_ID),
+};
+
+static const match_table_t srp_opt_tokens = {
+	{ SRP_OPT_ID_EXT,		"id_ext=%s" 		},
+	{ SRP_OPT_IOC_GUID,		"ioc_guid=%s" 		},
+	{ SRP_OPT_DGID,			"dgid=%s" 		},
+	{ SRP_OPT_PKEY,			"pkey=%x" 		},
+	{ SRP_OPT_SERVICE_ID,		"service_id=%s"		},
+	{ SRP_OPT_MAX_SECT,		"max_sect=%d" 		},
+	{ SRP_OPT_MAX_CMD_PER_LUN,	"max_cmd_per_lun=%d" 	},
+	{ SRP_OPT_IO_CLASS,		"io_class=%x"		},
+	{ SRP_OPT_INITIATOR_EXT,	"initiator_ext=%s"	},
+	{ SRP_OPT_CMD_SG_ENTRIES,	"cmd_sg_entries=%u"	},
+	{ SRP_OPT_ALLOW_EXT_SG,		"allow_ext_sg=%u"	},
+	{ SRP_OPT_SG_TABLESIZE,		"sg_tablesize=%u"	},
+	{ SRP_OPT_COMP_VECTOR,		"comp_vector=%u"	},
+	{ SRP_OPT_TL_RETRY_COUNT,	"tl_retry_count=%u"	},
+	{ SRP_OPT_QUEUE_SIZE,		"queue_size=%d"		},
+	{ SRP_OPT_ERR,			NULL 			}
+};
+
+static int srp_parse_options(const char *buf, struct srp_target_port *target)
+{
+	char *options, *sep_opt;
+	char *p;
+	char dgid[3];
+	substring_t args[MAX_OPT_ARGS];
+	int opt_mask = 0;
+	int token;
+	int ret = -EINVAL;
+	int i;
+
+	options = kstrdup(buf, GFP_KERNEL);
+	if (!options)
+		return -ENOMEM;
+
+	sep_opt = options;
+	while ((p = strsep(&sep_opt, ",")) != NULL) {
+		if (!*p)
+			continue;
+
+		token = match_token(p, srp_opt_tokens, args);
+		opt_mask |= token;
+
+		switch (token) {
+		case SRP_OPT_ID_EXT:
+			p = match_strdup(args);
+			if (!p) {
+				ret = -ENOMEM;
+				goto out;
+			}
+			target->id_ext = cpu_to_be64(simple_strtoull(p, NULL, 16));
+			kfree(p);
+			break;
+
+		case SRP_OPT_IOC_GUID:
+			p = match_strdup(args);
+			if (!p) {
+				ret = -ENOMEM;
+				goto out;
+			}
+			target->ioc_guid = cpu_to_be64(simple_strtoull(p, NULL, 16));
+			kfree(p);
+			break;
+
+		case SRP_OPT_DGID:
+			p = match_strdup(args);
+			if (!p) {
+				ret = -ENOMEM;
+				goto out;
+			}
+			if (strlen(p) != 32) {
+				pr_warn("bad dest GID parameter '%s'\n", p);
+				kfree(p);
+				goto out;
+			}
+
+			for (i = 0; i < 16; ++i) {
+				strlcpy(dgid, p + i * 2, 3);
+				target->path.dgid.raw[i] = simple_strtoul(dgid, NULL, 16);
+			}
+			kfree(p);
+			memcpy(target->orig_dgid, target->path.dgid.raw, 16);
+			break;
+
+		case SRP_OPT_PKEY:
+			if (match_hex(args, &token)) {
+				pr_warn("bad P_Key parameter '%s'\n", p);
+				goto out;
+			}
+			target->path.pkey = cpu_to_be16(token);
+			break;
+
+		case SRP_OPT_SERVICE_ID:
+			p = match_strdup(args);
+			if (!p) {
+				ret = -ENOMEM;
+				goto out;
+			}
+			target->service_id = cpu_to_be64(simple_strtoull(p, NULL, 16));
+			target->path.service_id = target->service_id;
+			kfree(p);
+			break;
+
+		case SRP_OPT_MAX_SECT:
+			if (match_int(args, &token)) {
+				pr_warn("bad max sect parameter '%s'\n", p);
+				goto out;
+			}
+			target->scsi_host->max_sectors = token;
+			break;
+
+		case SRP_OPT_QUEUE_SIZE:
+			if (match_int(args, &token) || token < 1) {
+				pr_warn("bad queue_size parameter '%s'\n", p);
+				goto out;
+			}
+			target->scsi_host->can_queue = token;
+			target->queue_size = token + SRP_RSP_SQ_SIZE +
+					     SRP_TSK_MGMT_SQ_SIZE;
+			if (!(opt_mask & SRP_OPT_MAX_CMD_PER_LUN))
+				target->scsi_host->cmd_per_lun = token;
+			break;
+
+		case SRP_OPT_MAX_CMD_PER_LUN:
+			if (match_int(args, &token) || token < 1) {
+				pr_warn("bad max cmd_per_lun parameter '%s'\n",
+					p);
+				goto out;
+			}
+			target->scsi_host->cmd_per_lun = token;
+			break;
+
+		case SRP_OPT_IO_CLASS:
+			if (match_hex(args, &token)) {
+				pr_warn("bad IO class parameter '%s'\n", p);
+				goto out;
+			}
+			if (token != SRP_REV10_IB_IO_CLASS &&
+			    token != SRP_REV16A_IB_IO_CLASS) {
+				pr_warn("unknown IO class parameter value %x specified (use %x or %x).\n",
+					token, SRP_REV10_IB_IO_CLASS,
+					SRP_REV16A_IB_IO_CLASS);
+				goto out;
+			}
+			target->io_class = token;
+			break;
+
+		case SRP_OPT_INITIATOR_EXT:
+			p = match_strdup(args);
+			if (!p) {
+				ret = -ENOMEM;
+				goto out;
+			}
+			target->initiator_ext = cpu_to_be64(simple_strtoull(p, NULL, 16));
+			kfree(p);
+			break;
+
+		case SRP_OPT_CMD_SG_ENTRIES:
+			if (match_int(args, &token) || token < 1 || token > 255) {
+				pr_warn("bad max cmd_sg_entries parameter '%s'\n",
+					p);
+				goto out;
+			}
+			target->cmd_sg_cnt = token;
+			break;
+
+		case SRP_OPT_ALLOW_EXT_SG:
+			if (match_int(args, &token)) {
+				pr_warn("bad allow_ext_sg parameter '%s'\n", p);
+				goto out;
+			}
+			target->allow_ext_sg = !!token;
+			break;
+
+		case SRP_OPT_SG_TABLESIZE:
+			if (match_int(args, &token) || token < 1 ||
+					token > SCSI_MAX_SG_CHAIN_SEGMENTS) {
+				pr_warn("bad max sg_tablesize parameter '%s'\n",
+					p);
+				goto out;
+			}
+			target->sg_tablesize = token;
+			break;
+
+		case SRP_OPT_COMP_VECTOR:
+			if (match_int(args, &token) || token < 0) {
+				pr_warn("bad comp_vector parameter '%s'\n", p);
+				goto out;
+			}
+			target->comp_vector = token;
+			break;
+
+		case SRP_OPT_TL_RETRY_COUNT:
+			if (match_int(args, &token) || token < 2 || token > 7) {
+				pr_warn("bad tl_retry_count parameter '%s' (must be a number between 2 and 7)\n",
+					p);
+				goto out;
+			}
+			target->tl_retry_count = token;
+			break;
+
+		default:
+			pr_warn("unknown parameter or missing value '%s' in target creation request\n",
+				p);
+			goto out;
+		}
+	}
+
+	if ((opt_mask & SRP_OPT_ALL) == SRP_OPT_ALL)
+		ret = 0;
+	else
+		for (i = 0; i < ARRAY_SIZE(srp_opt_tokens); ++i)
+			if ((srp_opt_tokens[i].token & SRP_OPT_ALL) &&
+			    !(srp_opt_tokens[i].token & opt_mask))
+				pr_warn("target creation request is missing parameter '%s'\n",
+					srp_opt_tokens[i].pattern);
+
+	if (target->scsi_host->cmd_per_lun > target->scsi_host->can_queue
+	    && (opt_mask & SRP_OPT_MAX_CMD_PER_LUN))
+		pr_warn("cmd_per_lun = %d > queue_size = %d\n",
+			target->scsi_host->cmd_per_lun,
+			target->scsi_host->can_queue);
+
+out:
+	kfree(options);
+	return ret;
+}
+
+static ssize_t srp_create_target(struct device *dev,
+				 struct device_attribute *attr,
+				 const char *buf, size_t count)
+{
+	struct srp_host *host =
+		container_of(dev, struct srp_host, dev);
+	struct Scsi_Host *target_host;
+	struct srp_target_port *target;
+	struct srp_device *srp_dev = host->srp_dev;
+	struct ib_device *ibdev = srp_dev->dev;
+	int ret;
+
+	target_host = scsi_host_alloc(&srp_template,
+				      sizeof (struct srp_target_port));
+	if (!target_host)
+		return -ENOMEM;
+
+	target_host->transportt  = ib_srp_transport_template;
+	target_host->max_channel = 0;
+	target_host->max_id      = 1;
+	target_host->max_lun     = SRP_MAX_LUN;
+	target_host->max_cmd_len = sizeof ((struct srp_cmd *) (void *) 0L)->cdb;
+
+	target = host_to_target(target_host);
+
+	target->io_class	= SRP_REV16A_IB_IO_CLASS;
+	target->scsi_host	= target_host;
+	target->srp_host	= host;
+	target->lkey		= host->srp_dev->mr->lkey;
+	target->rkey		= host->srp_dev->mr->rkey;
+	target->cmd_sg_cnt	= cmd_sg_entries;
+	target->sg_tablesize	= indirect_sg_entries ? : cmd_sg_entries;
+	target->allow_ext_sg	= allow_ext_sg;
+	target->tl_retry_count	= 7;
+	target->queue_size	= SRP_DEFAULT_QUEUE_SIZE;
+
+	mutex_lock(&host->add_target_mutex);
+
+	ret = srp_parse_options(buf, target);
+	if (ret)
+		goto err;
+
+	target->req_ring_size = target->queue_size - SRP_TSK_MGMT_SQ_SIZE;
+
+	if (!srp_conn_unique(target->srp_host, target)) {
+		shost_printk(KERN_INFO, target->scsi_host,
+			     PFX "Already connected to target port with id_ext=%016llx;ioc_guid=%016llx;initiator_ext=%016llx\n",
+			     be64_to_cpu(target->id_ext),
+			     be64_to_cpu(target->ioc_guid),
+			     be64_to_cpu(target->initiator_ext));
+		ret = -EEXIST;
+		goto err;
+	}
+
+	if (!srp_dev->has_fmr && !srp_dev->has_fr && !target->allow_ext_sg &&
+	    target->cmd_sg_cnt < target->sg_tablesize) {
+		pr_warn("No MR pool and no external indirect descriptors, limiting sg_tablesize to cmd_sg_cnt\n");
+		target->sg_tablesize = target->cmd_sg_cnt;
+	}
+
+	target_host->sg_tablesize = target->sg_tablesize;
+	target->indirect_size = target->sg_tablesize *
+				sizeof (struct srp_direct_buf);
+	target->max_iu_len = sizeof (struct srp_cmd) +
+			     sizeof (struct srp_indirect_buf) +
+			     target->cmd_sg_cnt * sizeof (struct srp_direct_buf);
+
+	INIT_WORK(&target->tl_err_work, srp_tl_err_work);
+	INIT_WORK(&target->remove_work, srp_remove_work);
+	spin_lock_init(&target->lock);
+	INIT_LIST_HEAD(&target->free_tx);
+	ret = srp_alloc_req_data(target);
+	if (ret)
+		goto err_free_mem;
+
+	ret = ib_query_gid(ibdev, host->port, 0, &target->path.sgid);
+	if (ret)
+		goto err_free_mem;
+
+	ret = srp_create_target_ib(target);
+	if (ret)
+		goto err_free_mem;
+
+	ret = srp_new_cm_id(target);
+	if (ret)
+		goto err_free_ib;
+
+	ret = srp_connect_target(target);
+	if (ret) {
+		shost_printk(KERN_ERR, target->scsi_host,
+			     PFX "Connection failed\n");
+		goto err_cm_id;
+	}
+
+	ret = srp_add_target(host, target);
+	if (ret)
+		goto err_disconnect;
+
+	shost_printk(KERN_DEBUG, target->scsi_host, PFX
+		     "new target: id_ext %016llx ioc_guid %016llx pkey %04x service_id %016llx sgid %pI6 dgid %pI6\n",
+		     be64_to_cpu(target->id_ext),
+		     be64_to_cpu(target->ioc_guid),
+		     be16_to_cpu(target->path.pkey),
+		     be64_to_cpu(target->service_id),
+		     target->path.sgid.raw, target->path.dgid.raw);
+
+	ret = count;
+
+out:
+	mutex_unlock(&host->add_target_mutex);
+	return ret;
+
+err_disconnect:
+	srp_disconnect_target(target);
+
+err_cm_id:
+	ib_destroy_cm_id(target->cm_id);
+
+err_free_ib:
+	srp_free_target_ib(target);
+
+err_free_mem:
+	srp_free_req_data(target);
+
+err:
+	scsi_host_put(target_host);
+	goto out;
+}
+
+static DEVICE_ATTR(add_target, S_IWUSR, NULL, srp_create_target);
+
+static ssize_t show_ibdev(struct device *dev, struct device_attribute *attr,
+			  char *buf)
+{
+	struct srp_host *host = container_of(dev, struct srp_host, dev);
+
+	return sprintf(buf, "%s\n", host->srp_dev->dev->name);
+}
+
+static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL);
+
+static ssize_t show_port(struct device *dev, struct device_attribute *attr,
+			 char *buf)
+{
+	struct srp_host *host = container_of(dev, struct srp_host, dev);
+
+	return sprintf(buf, "%d\n", host->port);
+}
+
+static DEVICE_ATTR(port, S_IRUGO, show_port, NULL);
+
+static struct srp_host *srp_add_port(struct srp_device *device, u8 port)
+{
+	struct srp_host *host;
+
+	host = kzalloc(sizeof *host, GFP_KERNEL);
+	if (!host)
+		return NULL;
+
+	INIT_LIST_HEAD(&host->target_list);
+	spin_lock_init(&host->target_lock);
+	init_completion(&host->released);
+	mutex_init(&host->add_target_mutex);
+	host->srp_dev = device;
+	host->port = port;
+
+	host->dev.class = &srp_class;
+	host->dev.parent = device->dev->dma_device;
+	dev_set_name(&host->dev, "srp-%s-%d", device->dev->name, port);
+
+	if (device_register(&host->dev))
+		goto free_host;
+	if (device_create_file(&host->dev, &dev_attr_add_target))
+		goto err_class;
+	if (device_create_file(&host->dev, &dev_attr_ibdev))
+		goto err_class;
+	if (device_create_file(&host->dev, &dev_attr_port))
+		goto err_class;
+
+	return host;
+
+err_class:
+	device_unregister(&host->dev);
+
+free_host:
+	kfree(host);
+
+	return NULL;
+}
+
+static void srp_add_one(struct ib_device *device)
+{
+	struct srp_device *srp_dev;
+	struct ib_device_attr *dev_attr;
+	struct srp_host *host;
+	int mr_page_shift, s, e, p;
+	u64 max_pages_per_mr;
+
+	dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL);
+	if (!dev_attr)
+		return;
+
+	if (ib_query_device(device, dev_attr)) {
+		pr_warn("Query device failed for %s\n", device->name);
+		goto free_attr;
+	}
+
+	srp_dev = kmalloc(sizeof *srp_dev, GFP_KERNEL);
+	if (!srp_dev)
+		goto free_attr;
+
+	srp_dev->has_fmr = (device->alloc_fmr && device->dealloc_fmr &&
+			    device->map_phys_fmr && device->unmap_fmr);
+	srp_dev->has_fr = (dev_attr->device_cap_flags &
+			   IB_DEVICE_MEM_MGT_EXTENSIONS);
+	if (!srp_dev->has_fmr && !srp_dev->has_fr)
+		dev_warn(&device->dev, "neither FMR nor FR is supported\n");
+
+	srp_dev->use_fast_reg = (srp_dev->has_fr &&
+				 (!srp_dev->has_fmr || prefer_fr));
+
+	/*
+	 * Use the smallest page size supported by the HCA, down to a
+	 * minimum of 4096 bytes. We're unlikely to build large sglists
+	 * out of smaller entries.
+	 */
+	mr_page_shift		= max(12, ffs(dev_attr->page_size_cap) - 1);
+	srp_dev->mr_page_size	= 1 << mr_page_shift;
+	srp_dev->mr_page_mask	= ~((u64) srp_dev->mr_page_size - 1);
+	max_pages_per_mr	= dev_attr->max_mr_size;
+	do_div(max_pages_per_mr, srp_dev->mr_page_size);
+	srp_dev->max_pages_per_mr = min_t(u64, SRP_MAX_PAGES_PER_MR,
+					  max_pages_per_mr);
+	if (srp_dev->use_fast_reg) {
+		srp_dev->max_pages_per_mr =
+			min_t(u32, srp_dev->max_pages_per_mr,
+			      dev_attr->max_fast_reg_page_list_len);
+	}
+	srp_dev->mr_max_size	= srp_dev->mr_page_size *
+				   srp_dev->max_pages_per_mr;
+	pr_debug("%s: mr_page_shift = %d, dev_attr->max_mr_size = %#llx, dev_attr->max_fast_reg_page_list_len = %u, max_pages_per_mr = %d, mr_max_size = %#x\n",
+		 device->name, mr_page_shift, dev_attr->max_mr_size,
+		 dev_attr->max_fast_reg_page_list_len,
+		 srp_dev->max_pages_per_mr, srp_dev->mr_max_size);
+
+	INIT_LIST_HEAD(&srp_dev->dev_list);
+
+	srp_dev->dev = device;
+	srp_dev->pd  = ib_alloc_pd(device);
+	if (IS_ERR(srp_dev->pd))
+		goto free_dev;
+
+	srp_dev->mr = ib_get_dma_mr(srp_dev->pd,
+				    IB_ACCESS_LOCAL_WRITE |
+				    IB_ACCESS_REMOTE_READ |
+				    IB_ACCESS_REMOTE_WRITE);
+	if (IS_ERR(srp_dev->mr))
+		goto err_pd;
+
+	if (device->node_type == RDMA_NODE_IB_SWITCH) {
+		s = 0;
+		e = 0;
+	} else {
+		s = 1;
+		e = device->phys_port_cnt;
+	}
+
+	for (p = s; p <= e; ++p) {
+		host = srp_add_port(srp_dev, p);
+		if (host)
+			list_add_tail(&host->list, &srp_dev->dev_list);
+	}
+
+	ib_set_client_data(device, &srp_client, srp_dev);
+
+	goto free_attr;
+
+err_pd:
+	ib_dealloc_pd(srp_dev->pd);
+
+free_dev:
+	kfree(srp_dev);
+
+free_attr:
+	kfree(dev_attr);
+}
+
+static void srp_remove_one(struct ib_device *device)
+{
+	struct srp_device *srp_dev;
+	struct srp_host *host, *tmp_host;
+	struct srp_target_port *target;
+
+	srp_dev = ib_get_client_data(device, &srp_client);
+	if (!srp_dev)
+		return;
+
+	list_for_each_entry_safe(host, tmp_host, &srp_dev->dev_list, list) {
+		device_unregister(&host->dev);
+		/*
+		 * Wait for the sysfs entry to go away, so that no new
+		 * target ports can be created.
+		 */
+		wait_for_completion(&host->released);
+
+		/*
+		 * Remove all target ports.
+		 */
+		spin_lock(&host->target_lock);
+		list_for_each_entry(target, &host->target_list, list)
+			srp_queue_remove_work(target);
+		spin_unlock(&host->target_lock);
+
+		/*
+		 * Wait for tl_err and target port removal tasks.
+		 */
+		flush_workqueue(system_long_wq);
+		flush_workqueue(srp_remove_wq);
+
+		kfree(host);
+	}
+
+	ib_dereg_mr(srp_dev->mr);
+	ib_dealloc_pd(srp_dev->pd);
+
+	kfree(srp_dev);
+}
+
+static struct srp_function_template ib_srp_transport_functions = {
+	.has_rport_state	 = true,
+	.reset_timer_if_blocked	 = true,
+	.reconnect_delay	 = &srp_reconnect_delay,
+	.fast_io_fail_tmo	 = &srp_fast_io_fail_tmo,
+	.dev_loss_tmo		 = &srp_dev_loss_tmo,
+	.reconnect		 = srp_rport_reconnect,
+	.rport_delete		 = srp_rport_delete,
+	.terminate_rport_io	 = srp_terminate_io,
+};
+
+static int __init srp_init_module(void)
+{
+	int ret;
+
+	BUILD_BUG_ON(FIELD_SIZEOF(struct ib_wc, wr_id) < sizeof(void *));
+
+	if (srp_sg_tablesize) {
+		pr_warn("srp_sg_tablesize is deprecated, please use cmd_sg_entries\n");
+		if (!cmd_sg_entries)
+			cmd_sg_entries = srp_sg_tablesize;
+	}
+
+	if (!cmd_sg_entries)
+		cmd_sg_entries = SRP_DEF_SG_TABLESIZE;
+
+	if (cmd_sg_entries > 255) {
+		pr_warn("Clamping cmd_sg_entries to 255\n");
+		cmd_sg_entries = 255;
+	}
+
+	if (!indirect_sg_entries)
+		indirect_sg_entries = cmd_sg_entries;
+	else if (indirect_sg_entries < cmd_sg_entries) {
+		pr_warn("Bumping up indirect_sg_entries to match cmd_sg_entries (%u)\n",
+			cmd_sg_entries);
+		indirect_sg_entries = cmd_sg_entries;
+	}
+
+	srp_remove_wq = create_workqueue("srp_remove");
+	if (!srp_remove_wq) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = -ENOMEM;
+	ib_srp_transport_template =
+		srp_attach_transport(&ib_srp_transport_functions);
+	if (!ib_srp_transport_template)
+		goto destroy_wq;
+
+	ret = class_register(&srp_class);
+	if (ret) {
+		pr_err("couldn't register class infiniband_srp\n");
+		goto release_tr;
+	}
+
+	ib_sa_register_client(&srp_sa_client);
+
+	ret = ib_register_client(&srp_client);
+	if (ret) {
+		pr_err("couldn't register IB client\n");
+		goto unreg_sa;
+	}
+
+out:
+	return ret;
+
+unreg_sa:
+	ib_sa_unregister_client(&srp_sa_client);
+	class_unregister(&srp_class);
+
+release_tr:
+	srp_release_transport(ib_srp_transport_template);
+
+destroy_wq:
+	destroy_workqueue(srp_remove_wq);
+	goto out;
+}
+
+static void __exit srp_cleanup_module(void)
+{
+	ib_unregister_client(&srp_client);
+	ib_sa_unregister_client(&srp_sa_client);
+	class_unregister(&srp_class);
+	srp_release_transport(ib_srp_transport_template);
+	destroy_workqueue(srp_remove_wq);
+}
+
+module_init(srp_init_module);
+module_exit(srp_cleanup_module);
diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h
new file mode 100644
index 0000000..e46ecb1
--- /dev/null
+++ b/drivers/infiniband/ulp/srp/ib_srp.h
@@ -0,0 +1,279 @@
+/*
+ * Copyright (c) 2005 Cisco Systems.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef IB_SRP_H
+#define IB_SRP_H
+
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/scatterlist.h>
+
+#include <scsi/scsi_host.h>
+#include <scsi/scsi_cmnd.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_sa.h>
+#include <rdma/ib_cm.h>
+#include <rdma/ib_fmr_pool.h>
+
+enum {
+	SRP_PATH_REC_TIMEOUT_MS	= 1000,
+	SRP_ABORT_TIMEOUT_MS	= 5000,
+
+	SRP_PORT_REDIRECT	= 1,
+	SRP_DLID_REDIRECT	= 2,
+	SRP_STALE_CONN		= 3,
+
+	SRP_MAX_LUN		= 512,
+	SRP_DEF_SG_TABLESIZE	= 12,
+
+	SRP_DEFAULT_QUEUE_SIZE	= 1 << 6,
+	SRP_RSP_SQ_SIZE		= 1,
+	SRP_TSK_MGMT_SQ_SIZE	= 1,
+	SRP_DEFAULT_CMD_SQ_SIZE = SRP_DEFAULT_QUEUE_SIZE - SRP_RSP_SQ_SIZE -
+				  SRP_TSK_MGMT_SQ_SIZE,
+
+	SRP_TAG_NO_REQ		= ~0U,
+	SRP_TAG_TSK_MGMT	= 1U << 31,
+
+	SRP_MAX_PAGES_PER_MR	= 512,
+
+	LOCAL_INV_WR_ID_MASK	= 1,
+	FAST_REG_WR_ID_MASK	= 2,
+};
+
+enum srp_target_state {
+	SRP_TARGET_LIVE,
+	SRP_TARGET_REMOVED,
+};
+
+enum srp_iu_type {
+	SRP_IU_CMD,
+	SRP_IU_TSK_MGMT,
+	SRP_IU_RSP,
+};
+
+/*
+ * @mr_page_mask: HCA memory registration page mask.
+ * @mr_page_size: HCA memory registration page size.
+ * @mr_max_size: Maximum size in bytes of a single FMR / FR registration
+ *   request.
+ */
+struct srp_device {
+	struct list_head	dev_list;
+	struct ib_device       *dev;
+	struct ib_pd	       *pd;
+	struct ib_mr	       *mr;
+	u64			mr_page_mask;
+	int			mr_page_size;
+	int			mr_max_size;
+	int			max_pages_per_mr;
+	bool			has_fmr;
+	bool			has_fr;
+	bool			use_fast_reg;
+};
+
+struct srp_host {
+	struct srp_device      *srp_dev;
+	u8			port;
+	struct device		dev;
+	struct list_head	target_list;
+	spinlock_t		target_lock;
+	struct completion	released;
+	struct list_head	list;
+	struct mutex		add_target_mutex;
+};
+
+struct srp_request {
+	struct list_head	list;
+	struct scsi_cmnd       *scmnd;
+	struct srp_iu	       *cmd;
+	union {
+		struct ib_pool_fmr **fmr_list;
+		struct srp_fr_desc **fr_list;
+	};
+	u64		       *map_page;
+	struct srp_direct_buf  *indirect_desc;
+	dma_addr_t		indirect_dma_addr;
+	short			nmdesc;
+	short			index;
+};
+
+struct srp_target_port {
+	/* These are RW in the hot path, and commonly used together */
+	struct list_head	free_tx;
+	struct list_head	free_reqs;
+	spinlock_t		lock;
+	s32			req_lim;
+
+	/* These are read-only in the hot path */
+	struct ib_cq	       *send_cq ____cacheline_aligned_in_smp;
+	struct ib_cq	       *recv_cq;
+	struct ib_qp	       *qp;
+	union {
+		struct ib_fmr_pool     *fmr_pool;
+		struct srp_fr_pool     *fr_pool;
+	};
+	u32			lkey;
+	u32			rkey;
+	enum srp_target_state	state;
+	unsigned int		max_iu_len;
+	unsigned int		cmd_sg_cnt;
+	unsigned int		indirect_size;
+	bool			allow_ext_sg;
+
+	/* Everything above this point is used in the hot path of
+	 * command processing. Try to keep them packed into cachelines.
+	 */
+
+	__be64			id_ext;
+	__be64			ioc_guid;
+	__be64			service_id;
+	__be64			initiator_ext;
+	u16			io_class;
+	struct srp_host	       *srp_host;
+	struct Scsi_Host       *scsi_host;
+	struct srp_rport       *rport;
+	char			target_name[32];
+	unsigned int		scsi_id;
+	unsigned int		sg_tablesize;
+	int			queue_size;
+	int			req_ring_size;
+	int			comp_vector;
+	int			tl_retry_count;
+
+	struct ib_sa_path_rec	path;
+	__be16			orig_dgid[8];
+	struct ib_sa_query     *path_query;
+	int			path_query_id;
+
+	u32			rq_tmo_jiffies;
+	bool			connected;
+
+	struct ib_cm_id	       *cm_id;
+
+	int			max_ti_iu_len;
+
+	int			zero_req_lim;
+
+	struct srp_iu	       **tx_ring;
+	struct srp_iu	       **rx_ring;
+	struct srp_request	*req_ring;
+
+	struct work_struct	tl_err_work;
+	struct work_struct	remove_work;
+
+	struct list_head	list;
+	struct completion	done;
+	int			status;
+	bool			qp_in_error;
+
+	struct completion	tsk_mgmt_done;
+	u8			tsk_mgmt_status;
+};
+
+struct srp_iu {
+	struct list_head	list;
+	u64			dma;
+	void		       *buf;
+	size_t			size;
+	enum dma_data_direction	direction;
+};
+
+/**
+ * struct srp_fr_desc - fast registration work request arguments
+ * @entry: Entry in srp_fr_pool.free_list.
+ * @mr:    Memory region.
+ * @frpl:  Fast registration page list.
+ */
+struct srp_fr_desc {
+	struct list_head		entry;
+	struct ib_mr			*mr;
+	struct ib_fast_reg_page_list	*frpl;
+};
+
+/**
+ * struct srp_fr_pool - pool of fast registration descriptors
+ *
+ * An entry is available for allocation if and only if it occurs in @free_list.
+ *
+ * @size:      Number of descriptors in this pool.
+ * @max_page_list_len: Maximum fast registration work request page list length.
+ * @lock:      Protects free_list.
+ * @free_list: List of free descriptors.
+ * @desc:      Fast registration descriptor pool.
+ */
+struct srp_fr_pool {
+	int			size;
+	int			max_page_list_len;
+	spinlock_t		lock;
+	struct list_head	free_list;
+	struct srp_fr_desc	desc[0];
+};
+
+/**
+ * struct srp_map_state - per-request DMA memory mapping state
+ * @desc:	    Pointer to the element of the SRP buffer descriptor array
+ *		    that is being filled in.
+ * @pages:	    Array with DMA addresses of pages being considered for
+ *		    memory registration.
+ * @base_dma_addr:  DMA address of the first page that has not yet been mapped.
+ * @dma_len:	    Number of bytes that will be registered with the next
+ *		    FMR or FR memory registration call.
+ * @total_len:	    Total number of bytes in the sg-list being mapped.
+ * @npages:	    Number of page addresses in the pages[] array.
+ * @nmdesc:	    Number of FMR or FR memory descriptors used for mapping.
+ * @ndesc:	    Number of SRP buffer descriptors that have been filled in.
+ * @unmapped_sg:    First element of the sg-list that is mapped via FMR or FR.
+ * @unmapped_index: Index of the first element mapped via FMR or FR.
+ * @unmapped_addr:  DMA address of the first element mapped via FMR or FR.
+ */
+struct srp_map_state {
+	union {
+		struct ib_pool_fmr **next_fmr;
+		struct srp_fr_desc **next_fr;
+	};
+	struct srp_direct_buf  *desc;
+	u64		       *pages;
+	dma_addr_t		base_dma_addr;
+	u32			dma_len;
+	u32			total_len;
+	unsigned int		npages;
+	unsigned int		nmdesc;
+	unsigned int		ndesc;
+	struct scatterlist     *unmapped_sg;
+	int			unmapped_index;
+	dma_addr_t		unmapped_addr;
+};
+
+#endif /* IB_SRP_H */
diff --git a/drivers/infiniband/ulp/srpt/Kconfig b/drivers/infiniband/ulp/srpt/Kconfig
new file mode 100644
index 0000000..31ee83d
--- /dev/null
+++ b/drivers/infiniband/ulp/srpt/Kconfig
@@ -0,0 +1,12 @@
+config INFINIBAND_SRPT
+	tristate "InfiniBand SCSI RDMA Protocol target support"
+	depends on INFINIBAND && TARGET_CORE
+	---help---
+
+	  Support for the SCSI RDMA Protocol (SRP) Target driver. The
+	  SRP protocol is a protocol that allows an initiator to access
+	  a block storage device on another host (target) over a network
+	  that supports the RDMA protocol. Currently the RDMA protocol is
+	  supported by InfiniBand and by iWarp network hardware. More
+	  information about the SRP protocol can be found on the website
+	  of the INCITS T10 technical committee (http://www.t10.org/).
diff --git a/drivers/infiniband/ulp/srpt/Makefile b/drivers/infiniband/ulp/srpt/Makefile
new file mode 100644
index 0000000..e3ee4bd
--- /dev/null
+++ b/drivers/infiniband/ulp/srpt/Makefile
@@ -0,0 +1,2 @@
+ccflags-y			:= -Idrivers/target
+obj-$(CONFIG_INFINIBAND_SRPT)	+= ib_srpt.o
diff --git a/drivers/infiniband/ulp/srpt/ib_dm_mad.h b/drivers/infiniband/ulp/srpt/ib_dm_mad.h
new file mode 100644
index 0000000..fb1de1f
--- /dev/null
+++ b/drivers/infiniband/ulp/srpt/ib_dm_mad.h
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2006 - 2009 Mellanox Technology Inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef IB_DM_MAD_H
+#define IB_DM_MAD_H
+
+#include <linux/types.h>
+
+#include <rdma/ib_mad.h>
+
+enum {
+	/*
+	 * See also section 13.4.7 Status Field, table 115 MAD Common Status
+	 * Field Bit Values and also section 16.3.1.1 Status Field in the
+	 * InfiniBand Architecture Specification.
+	 */
+	DM_MAD_STATUS_UNSUP_METHOD = 0x0008,
+	DM_MAD_STATUS_UNSUP_METHOD_ATTR = 0x000c,
+	DM_MAD_STATUS_INVALID_FIELD = 0x001c,
+	DM_MAD_STATUS_NO_IOC = 0x0100,
+
+	/*
+	 * See also the Device Management chapter, section 16.3.3 Attributes,
+	 * table 279 Device Management Attributes in the InfiniBand
+	 * Architecture Specification.
+	 */
+	DM_ATTR_CLASS_PORT_INFO = 0x01,
+	DM_ATTR_IOU_INFO = 0x10,
+	DM_ATTR_IOC_PROFILE = 0x11,
+	DM_ATTR_SVC_ENTRIES = 0x12
+};
+
+struct ib_dm_hdr {
+	u8 reserved[28];
+};
+
+/*
+ * Structure of management datagram sent by the SRP target implementation.
+ * Contains a management datagram header, reliable multi-packet transaction
+ * protocol (RMPP) header and ib_dm_hdr. Notes:
+ * - The SRP target implementation does not use RMPP or ib_dm_hdr when sending
+ *   management datagrams.
+ * - The header size must be exactly 64 bytes (IB_MGMT_DEVICE_HDR), since this
+ *   is the header size that is passed to ib_create_send_mad() in ib_srpt.c.
+ * - The maximum supported size for a management datagram when not using RMPP
+ *   is 256 bytes -- 64 bytes header and 192 (IB_MGMT_DEVICE_DATA) bytes data.
+ */
+struct ib_dm_mad {
+	struct ib_mad_hdr mad_hdr;
+	struct ib_rmpp_hdr rmpp_hdr;
+	struct ib_dm_hdr dm_hdr;
+	u8 data[IB_MGMT_DEVICE_DATA];
+};
+
+/*
+ * IOUnitInfo as defined in section 16.3.3.3 IOUnitInfo of the InfiniBand
+ * Architecture Specification.
+ */
+struct ib_dm_iou_info {
+	__be16 change_id;
+	u8 max_controllers;
+	u8 op_rom;
+	u8 controller_list[128];
+};
+
+/*
+ * IOControllerprofile as defined in section 16.3.3.4 IOControllerProfile of
+ * the InfiniBand Architecture Specification.
+ */
+struct ib_dm_ioc_profile {
+	__be64 guid;
+	__be32 vendor_id;
+	__be32 device_id;
+	__be16 device_version;
+	__be16 reserved1;
+	__be32 subsys_vendor_id;
+	__be32 subsys_device_id;
+	__be16 io_class;
+	__be16 io_subclass;
+	__be16 protocol;
+	__be16 protocol_version;
+	__be16 service_conn;
+	__be16 initiators_supported;
+	__be16 send_queue_depth;
+	u8 reserved2;
+	u8 rdma_read_depth;
+	__be32 send_size;
+	__be32 rdma_size;
+	u8 op_cap_mask;
+	u8 svc_cap_mask;
+	u8 num_svc_entries;
+	u8 reserved3[9];
+	u8 id_string[64];
+};
+
+struct ib_dm_svc_entry {
+	u8 name[40];
+	__be64 id;
+};
+
+/*
+ * See also section 16.3.3.5 ServiceEntries in the InfiniBand Architecture
+ * Specification. See also section B.7, table B.8 in the T10 SRP r16a document.
+ */
+struct ib_dm_svc_entries {
+	struct ib_dm_svc_entry service_entries[4];
+};
+
+#endif
diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c
new file mode 100644
index 0000000..dc82968
--- /dev/null
+++ b/drivers/infiniband/ulp/srpt/ib_srpt.c
@@ -0,0 +1,4051 @@
+/*
+ * Copyright (c) 2006 - 2009 Mellanox Technology Inc.  All rights reserved.
+ * Copyright (C) 2008 - 2011 Bart Van Assche <bvanassche@acm.org>.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/err.h>
+#include <linux/ctype.h>
+#include <linux/kthread.h>
+#include <linux/string.h>
+#include <linux/delay.h>
+#include <linux/atomic.h>
+#include <scsi/scsi_tcq.h>
+#include <target/configfs_macros.h>
+#include <target/target_core_base.h>
+#include <target/target_core_fabric_configfs.h>
+#include <target/target_core_fabric.h>
+#include <target/target_core_configfs.h>
+#include "ib_srpt.h"
+
+/* Name of this kernel module. */
+#define DRV_NAME		"ib_srpt"
+#define DRV_VERSION		"2.0.0"
+#define DRV_RELDATE		"2011-02-14"
+
+#define SRPT_ID_STRING	"Linux SRP target"
+
+#undef pr_fmt
+#define pr_fmt(fmt) DRV_NAME " " fmt
+
+MODULE_AUTHOR("Vu Pham and Bart Van Assche");
+MODULE_DESCRIPTION("InfiniBand SCSI RDMA Protocol target "
+		   "v" DRV_VERSION " (" DRV_RELDATE ")");
+MODULE_LICENSE("Dual BSD/GPL");
+
+/*
+ * Global Variables
+ */
+
+static u64 srpt_service_guid;
+static DEFINE_SPINLOCK(srpt_dev_lock);	/* Protects srpt_dev_list. */
+static LIST_HEAD(srpt_dev_list);	/* List of srpt_device structures. */
+
+static unsigned srp_max_req_size = DEFAULT_MAX_REQ_SIZE;
+module_param(srp_max_req_size, int, 0444);
+MODULE_PARM_DESC(srp_max_req_size,
+		 "Maximum size of SRP request messages in bytes.");
+
+static int srpt_srq_size = DEFAULT_SRPT_SRQ_SIZE;
+module_param(srpt_srq_size, int, 0444);
+MODULE_PARM_DESC(srpt_srq_size,
+		 "Shared receive queue (SRQ) size.");
+
+static int srpt_get_u64_x(char *buffer, struct kernel_param *kp)
+{
+	return sprintf(buffer, "0x%016llx", *(u64 *)kp->arg);
+}
+module_param_call(srpt_service_guid, NULL, srpt_get_u64_x, &srpt_service_guid,
+		  0444);
+MODULE_PARM_DESC(srpt_service_guid,
+		 "Using this value for ioc_guid, id_ext, and cm_listen_id"
+		 " instead of using the node_guid of the first HCA.");
+
+static struct ib_client srpt_client;
+static struct target_fabric_configfs *srpt_target;
+static void srpt_release_channel(struct srpt_rdma_ch *ch);
+static int srpt_queue_status(struct se_cmd *cmd);
+
+/**
+ * opposite_dma_dir() - Swap DMA_TO_DEVICE and DMA_FROM_DEVICE.
+ */
+static inline
+enum dma_data_direction opposite_dma_dir(enum dma_data_direction dir)
+{
+	switch (dir) {
+	case DMA_TO_DEVICE:	return DMA_FROM_DEVICE;
+	case DMA_FROM_DEVICE:	return DMA_TO_DEVICE;
+	default:		return dir;
+	}
+}
+
+/**
+ * srpt_sdev_name() - Return the name associated with the HCA.
+ *
+ * Examples are ib0, ib1, ...
+ */
+static inline const char *srpt_sdev_name(struct srpt_device *sdev)
+{
+	return sdev->device->name;
+}
+
+static enum rdma_ch_state srpt_get_ch_state(struct srpt_rdma_ch *ch)
+{
+	unsigned long flags;
+	enum rdma_ch_state state;
+
+	spin_lock_irqsave(&ch->spinlock, flags);
+	state = ch->state;
+	spin_unlock_irqrestore(&ch->spinlock, flags);
+	return state;
+}
+
+static enum rdma_ch_state
+srpt_set_ch_state(struct srpt_rdma_ch *ch, enum rdma_ch_state new_state)
+{
+	unsigned long flags;
+	enum rdma_ch_state prev;
+
+	spin_lock_irqsave(&ch->spinlock, flags);
+	prev = ch->state;
+	ch->state = new_state;
+	spin_unlock_irqrestore(&ch->spinlock, flags);
+	return prev;
+}
+
+/**
+ * srpt_test_and_set_ch_state() - Test and set the channel state.
+ *
+ * Returns true if and only if the channel state has been set to the new state.
+ */
+static bool
+srpt_test_and_set_ch_state(struct srpt_rdma_ch *ch, enum rdma_ch_state old,
+			   enum rdma_ch_state new)
+{
+	unsigned long flags;
+	enum rdma_ch_state prev;
+
+	spin_lock_irqsave(&ch->spinlock, flags);
+	prev = ch->state;
+	if (prev == old)
+		ch->state = new;
+	spin_unlock_irqrestore(&ch->spinlock, flags);
+	return prev == old;
+}
+
+/**
+ * srpt_event_handler() - Asynchronous IB event callback function.
+ *
+ * Callback function called by the InfiniBand core when an asynchronous IB
+ * event occurs. This callback may occur in interrupt context. See also
+ * section 11.5.2, Set Asynchronous Event Handler in the InfiniBand
+ * Architecture Specification.
+ */
+static void srpt_event_handler(struct ib_event_handler *handler,
+			       struct ib_event *event)
+{
+	struct srpt_device *sdev;
+	struct srpt_port *sport;
+
+	sdev = ib_get_client_data(event->device, &srpt_client);
+	if (!sdev || sdev->device != event->device)
+		return;
+
+	pr_debug("ASYNC event= %d on device= %s\n", event->event,
+		 srpt_sdev_name(sdev));
+
+	switch (event->event) {
+	case IB_EVENT_PORT_ERR:
+		if (event->element.port_num <= sdev->device->phys_port_cnt) {
+			sport = &sdev->port[event->element.port_num - 1];
+			sport->lid = 0;
+			sport->sm_lid = 0;
+		}
+		break;
+	case IB_EVENT_PORT_ACTIVE:
+	case IB_EVENT_LID_CHANGE:
+	case IB_EVENT_PKEY_CHANGE:
+	case IB_EVENT_SM_CHANGE:
+	case IB_EVENT_CLIENT_REREGISTER:
+	case IB_EVENT_GID_CHANGE:
+		/* Refresh port data asynchronously. */
+		if (event->element.port_num <= sdev->device->phys_port_cnt) {
+			sport = &sdev->port[event->element.port_num - 1];
+			if (!sport->lid && !sport->sm_lid)
+				schedule_work(&sport->work);
+		}
+		break;
+	default:
+		printk(KERN_ERR "received unrecognized IB event %d\n",
+		       event->event);
+		break;
+	}
+}
+
+/**
+ * srpt_srq_event() - SRQ event callback function.
+ */
+static void srpt_srq_event(struct ib_event *event, void *ctx)
+{
+	printk(KERN_INFO "SRQ event %d\n", event->event);
+}
+
+/**
+ * srpt_qp_event() - QP event callback function.
+ */
+static void srpt_qp_event(struct ib_event *event, struct srpt_rdma_ch *ch)
+{
+	pr_debug("QP event %d on cm_id=%p sess_name=%s state=%d\n",
+		 event->event, ch->cm_id, ch->sess_name, srpt_get_ch_state(ch));
+
+	switch (event->event) {
+	case IB_EVENT_COMM_EST:
+		ib_cm_notify(ch->cm_id, event->event);
+		break;
+	case IB_EVENT_QP_LAST_WQE_REACHED:
+		if (srpt_test_and_set_ch_state(ch, CH_DRAINING,
+					       CH_RELEASING))
+			srpt_release_channel(ch);
+		else
+			pr_debug("%s: state %d - ignored LAST_WQE.\n",
+				 ch->sess_name, srpt_get_ch_state(ch));
+		break;
+	default:
+		printk(KERN_ERR "received unrecognized IB QP event %d\n",
+		       event->event);
+		break;
+	}
+}
+
+/**
+ * srpt_set_ioc() - Helper function for initializing an IOUnitInfo structure.
+ *
+ * @slot: one-based slot number.
+ * @value: four-bit value.
+ *
+ * Copies the lowest four bits of value in element slot of the array of four
+ * bit elements called c_list (controller list). The index slot is one-based.
+ */
+static void srpt_set_ioc(u8 *c_list, u32 slot, u8 value)
+{
+	u16 id;
+	u8 tmp;
+
+	id = (slot - 1) / 2;
+	if (slot & 0x1) {
+		tmp = c_list[id] & 0xf;
+		c_list[id] = (value << 4) | tmp;
+	} else {
+		tmp = c_list[id] & 0xf0;
+		c_list[id] = (value & 0xf) | tmp;
+	}
+}
+
+/**
+ * srpt_get_class_port_info() - Copy ClassPortInfo to a management datagram.
+ *
+ * See also section 16.3.3.1 ClassPortInfo in the InfiniBand Architecture
+ * Specification.
+ */
+static void srpt_get_class_port_info(struct ib_dm_mad *mad)
+{
+	struct ib_class_port_info *cif;
+
+	cif = (struct ib_class_port_info *)mad->data;
+	memset(cif, 0, sizeof *cif);
+	cif->base_version = 1;
+	cif->class_version = 1;
+	cif->resp_time_value = 20;
+
+	mad->mad_hdr.status = 0;
+}
+
+/**
+ * srpt_get_iou() - Write IOUnitInfo to a management datagram.
+ *
+ * See also section 16.3.3.3 IOUnitInfo in the InfiniBand Architecture
+ * Specification. See also section B.7, table B.6 in the SRP r16a document.
+ */
+static void srpt_get_iou(struct ib_dm_mad *mad)
+{
+	struct ib_dm_iou_info *ioui;
+	u8 slot;
+	int i;
+
+	ioui = (struct ib_dm_iou_info *)mad->data;
+	ioui->change_id = __constant_cpu_to_be16(1);
+	ioui->max_controllers = 16;
+
+	/* set present for slot 1 and empty for the rest */
+	srpt_set_ioc(ioui->controller_list, 1, 1);
+	for (i = 1, slot = 2; i < 16; i++, slot++)
+		srpt_set_ioc(ioui->controller_list, slot, 0);
+
+	mad->mad_hdr.status = 0;
+}
+
+/**
+ * srpt_get_ioc() - Write IOControllerprofile to a management datagram.
+ *
+ * See also section 16.3.3.4 IOControllerProfile in the InfiniBand
+ * Architecture Specification. See also section B.7, table B.7 in the SRP
+ * r16a document.
+ */
+static void srpt_get_ioc(struct srpt_port *sport, u32 slot,
+			 struct ib_dm_mad *mad)
+{
+	struct srpt_device *sdev = sport->sdev;
+	struct ib_dm_ioc_profile *iocp;
+
+	iocp = (struct ib_dm_ioc_profile *)mad->data;
+
+	if (!slot || slot > 16) {
+		mad->mad_hdr.status
+			= __constant_cpu_to_be16(DM_MAD_STATUS_INVALID_FIELD);
+		return;
+	}
+
+	if (slot > 2) {
+		mad->mad_hdr.status
+			= __constant_cpu_to_be16(DM_MAD_STATUS_NO_IOC);
+		return;
+	}
+
+	memset(iocp, 0, sizeof *iocp);
+	strcpy(iocp->id_string, SRPT_ID_STRING);
+	iocp->guid = cpu_to_be64(srpt_service_guid);
+	iocp->vendor_id = cpu_to_be32(sdev->dev_attr.vendor_id);
+	iocp->device_id = cpu_to_be32(sdev->dev_attr.vendor_part_id);
+	iocp->device_version = cpu_to_be16(sdev->dev_attr.hw_ver);
+	iocp->subsys_vendor_id = cpu_to_be32(sdev->dev_attr.vendor_id);
+	iocp->subsys_device_id = 0x0;
+	iocp->io_class = __constant_cpu_to_be16(SRP_REV16A_IB_IO_CLASS);
+	iocp->io_subclass = __constant_cpu_to_be16(SRP_IO_SUBCLASS);
+	iocp->protocol = __constant_cpu_to_be16(SRP_PROTOCOL);
+	iocp->protocol_version = __constant_cpu_to_be16(SRP_PROTOCOL_VERSION);
+	iocp->send_queue_depth = cpu_to_be16(sdev->srq_size);
+	iocp->rdma_read_depth = 4;
+	iocp->send_size = cpu_to_be32(srp_max_req_size);
+	iocp->rdma_size = cpu_to_be32(min(sport->port_attrib.srp_max_rdma_size,
+					  1U << 24));
+	iocp->num_svc_entries = 1;
+	iocp->op_cap_mask = SRP_SEND_TO_IOC | SRP_SEND_FROM_IOC |
+		SRP_RDMA_READ_FROM_IOC | SRP_RDMA_WRITE_FROM_IOC;
+
+	mad->mad_hdr.status = 0;
+}
+
+/**
+ * srpt_get_svc_entries() - Write ServiceEntries to a management datagram.
+ *
+ * See also section 16.3.3.5 ServiceEntries in the InfiniBand Architecture
+ * Specification. See also section B.7, table B.8 in the SRP r16a document.
+ */
+static void srpt_get_svc_entries(u64 ioc_guid,
+				 u16 slot, u8 hi, u8 lo, struct ib_dm_mad *mad)
+{
+	struct ib_dm_svc_entries *svc_entries;
+
+	WARN_ON(!ioc_guid);
+
+	if (!slot || slot > 16) {
+		mad->mad_hdr.status
+			= __constant_cpu_to_be16(DM_MAD_STATUS_INVALID_FIELD);
+		return;
+	}
+
+	if (slot > 2 || lo > hi || hi > 1) {
+		mad->mad_hdr.status
+			= __constant_cpu_to_be16(DM_MAD_STATUS_NO_IOC);
+		return;
+	}
+
+	svc_entries = (struct ib_dm_svc_entries *)mad->data;
+	memset(svc_entries, 0, sizeof *svc_entries);
+	svc_entries->service_entries[0].id = cpu_to_be64(ioc_guid);
+	snprintf(svc_entries->service_entries[0].name,
+		 sizeof(svc_entries->service_entries[0].name),
+		 "%s%016llx",
+		 SRP_SERVICE_NAME_PREFIX,
+		 ioc_guid);
+
+	mad->mad_hdr.status = 0;
+}
+
+/**
+ * srpt_mgmt_method_get() - Process a received management datagram.
+ * @sp:      source port through which the MAD has been received.
+ * @rq_mad:  received MAD.
+ * @rsp_mad: response MAD.
+ */
+static void srpt_mgmt_method_get(struct srpt_port *sp, struct ib_mad *rq_mad,
+				 struct ib_dm_mad *rsp_mad)
+{
+	u16 attr_id;
+	u32 slot;
+	u8 hi, lo;
+
+	attr_id = be16_to_cpu(rq_mad->mad_hdr.attr_id);
+	switch (attr_id) {
+	case DM_ATTR_CLASS_PORT_INFO:
+		srpt_get_class_port_info(rsp_mad);
+		break;
+	case DM_ATTR_IOU_INFO:
+		srpt_get_iou(rsp_mad);
+		break;
+	case DM_ATTR_IOC_PROFILE:
+		slot = be32_to_cpu(rq_mad->mad_hdr.attr_mod);
+		srpt_get_ioc(sp, slot, rsp_mad);
+		break;
+	case DM_ATTR_SVC_ENTRIES:
+		slot = be32_to_cpu(rq_mad->mad_hdr.attr_mod);
+		hi = (u8) ((slot >> 8) & 0xff);
+		lo = (u8) (slot & 0xff);
+		slot = (u16) ((slot >> 16) & 0xffff);
+		srpt_get_svc_entries(srpt_service_guid,
+				     slot, hi, lo, rsp_mad);
+		break;
+	default:
+		rsp_mad->mad_hdr.status =
+		    __constant_cpu_to_be16(DM_MAD_STATUS_UNSUP_METHOD_ATTR);
+		break;
+	}
+}
+
+/**
+ * srpt_mad_send_handler() - Post MAD-send callback function.
+ */
+static void srpt_mad_send_handler(struct ib_mad_agent *mad_agent,
+				  struct ib_mad_send_wc *mad_wc)
+{
+	ib_destroy_ah(mad_wc->send_buf->ah);
+	ib_free_send_mad(mad_wc->send_buf);
+}
+
+/**
+ * srpt_mad_recv_handler() - MAD reception callback function.
+ */
+static void srpt_mad_recv_handler(struct ib_mad_agent *mad_agent,
+				  struct ib_mad_recv_wc *mad_wc)
+{
+	struct srpt_port *sport = (struct srpt_port *)mad_agent->context;
+	struct ib_ah *ah;
+	struct ib_mad_send_buf *rsp;
+	struct ib_dm_mad *dm_mad;
+
+	if (!mad_wc || !mad_wc->recv_buf.mad)
+		return;
+
+	ah = ib_create_ah_from_wc(mad_agent->qp->pd, mad_wc->wc,
+				  mad_wc->recv_buf.grh, mad_agent->port_num);
+	if (IS_ERR(ah))
+		goto err;
+
+	BUILD_BUG_ON(offsetof(struct ib_dm_mad, data) != IB_MGMT_DEVICE_HDR);
+
+	rsp = ib_create_send_mad(mad_agent, mad_wc->wc->src_qp,
+				 mad_wc->wc->pkey_index, 0,
+				 IB_MGMT_DEVICE_HDR, IB_MGMT_DEVICE_DATA,
+				 GFP_KERNEL);
+	if (IS_ERR(rsp))
+		goto err_rsp;
+
+	rsp->ah = ah;
+
+	dm_mad = rsp->mad;
+	memcpy(dm_mad, mad_wc->recv_buf.mad, sizeof *dm_mad);
+	dm_mad->mad_hdr.method = IB_MGMT_METHOD_GET_RESP;
+	dm_mad->mad_hdr.status = 0;
+
+	switch (mad_wc->recv_buf.mad->mad_hdr.method) {
+	case IB_MGMT_METHOD_GET:
+		srpt_mgmt_method_get(sport, mad_wc->recv_buf.mad, dm_mad);
+		break;
+	case IB_MGMT_METHOD_SET:
+		dm_mad->mad_hdr.status =
+		    __constant_cpu_to_be16(DM_MAD_STATUS_UNSUP_METHOD_ATTR);
+		break;
+	default:
+		dm_mad->mad_hdr.status =
+		    __constant_cpu_to_be16(DM_MAD_STATUS_UNSUP_METHOD);
+		break;
+	}
+
+	if (!ib_post_send_mad(rsp, NULL)) {
+		ib_free_recv_mad(mad_wc);
+		/* will destroy_ah & free_send_mad in send completion */
+		return;
+	}
+
+	ib_free_send_mad(rsp);
+
+err_rsp:
+	ib_destroy_ah(ah);
+err:
+	ib_free_recv_mad(mad_wc);
+}
+
+/**
+ * srpt_refresh_port() - Configure a HCA port.
+ *
+ * Enable InfiniBand management datagram processing, update the cached sm_lid,
+ * lid and gid values, and register a callback function for processing MADs
+ * on the specified port.
+ *
+ * Note: It is safe to call this function more than once for the same port.
+ */
+static int srpt_refresh_port(struct srpt_port *sport)
+{
+	struct ib_mad_reg_req reg_req;
+	struct ib_port_modify port_modify;
+	struct ib_port_attr port_attr;
+	int ret;
+
+	memset(&port_modify, 0, sizeof port_modify);
+	port_modify.set_port_cap_mask = IB_PORT_DEVICE_MGMT_SUP;
+	port_modify.clr_port_cap_mask = 0;
+
+	ret = ib_modify_port(sport->sdev->device, sport->port, 0, &port_modify);
+	if (ret)
+		goto err_mod_port;
+
+	ret = ib_query_port(sport->sdev->device, sport->port, &port_attr);
+	if (ret)
+		goto err_query_port;
+
+	sport->sm_lid = port_attr.sm_lid;
+	sport->lid = port_attr.lid;
+
+	ret = ib_query_gid(sport->sdev->device, sport->port, 0, &sport->gid);
+	if (ret)
+		goto err_query_port;
+
+	if (!sport->mad_agent) {
+		memset(&reg_req, 0, sizeof reg_req);
+		reg_req.mgmt_class = IB_MGMT_CLASS_DEVICE_MGMT;
+		reg_req.mgmt_class_version = IB_MGMT_BASE_VERSION;
+		set_bit(IB_MGMT_METHOD_GET, reg_req.method_mask);
+		set_bit(IB_MGMT_METHOD_SET, reg_req.method_mask);
+
+		sport->mad_agent = ib_register_mad_agent(sport->sdev->device,
+							 sport->port,
+							 IB_QPT_GSI,
+							 &reg_req, 0,
+							 srpt_mad_send_handler,
+							 srpt_mad_recv_handler,
+							 sport, 0);
+		if (IS_ERR(sport->mad_agent)) {
+			ret = PTR_ERR(sport->mad_agent);
+			sport->mad_agent = NULL;
+			goto err_query_port;
+		}
+	}
+
+	return 0;
+
+err_query_port:
+
+	port_modify.set_port_cap_mask = 0;
+	port_modify.clr_port_cap_mask = IB_PORT_DEVICE_MGMT_SUP;
+	ib_modify_port(sport->sdev->device, sport->port, 0, &port_modify);
+
+err_mod_port:
+
+	return ret;
+}
+
+/**
+ * srpt_unregister_mad_agent() - Unregister MAD callback functions.
+ *
+ * Note: It is safe to call this function more than once for the same device.
+ */
+static void srpt_unregister_mad_agent(struct srpt_device *sdev)
+{
+	struct ib_port_modify port_modify = {
+		.clr_port_cap_mask = IB_PORT_DEVICE_MGMT_SUP,
+	};
+	struct srpt_port *sport;
+	int i;
+
+	for (i = 1; i <= sdev->device->phys_port_cnt; i++) {
+		sport = &sdev->port[i - 1];
+		WARN_ON(sport->port != i);
+		if (ib_modify_port(sdev->device, i, 0, &port_modify) < 0)
+			printk(KERN_ERR "disabling MAD processing failed.\n");
+		if (sport->mad_agent) {
+			ib_unregister_mad_agent(sport->mad_agent);
+			sport->mad_agent = NULL;
+		}
+	}
+}
+
+/**
+ * srpt_alloc_ioctx() - Allocate an SRPT I/O context structure.
+ */
+static struct srpt_ioctx *srpt_alloc_ioctx(struct srpt_device *sdev,
+					   int ioctx_size, int dma_size,
+					   enum dma_data_direction dir)
+{
+	struct srpt_ioctx *ioctx;
+
+	ioctx = kmalloc(ioctx_size, GFP_KERNEL);
+	if (!ioctx)
+		goto err;
+
+	ioctx->buf = kmalloc(dma_size, GFP_KERNEL);
+	if (!ioctx->buf)
+		goto err_free_ioctx;
+
+	ioctx->dma = ib_dma_map_single(sdev->device, ioctx->buf, dma_size, dir);
+	if (ib_dma_mapping_error(sdev->device, ioctx->dma))
+		goto err_free_buf;
+
+	return ioctx;
+
+err_free_buf:
+	kfree(ioctx->buf);
+err_free_ioctx:
+	kfree(ioctx);
+err:
+	return NULL;
+}
+
+/**
+ * srpt_free_ioctx() - Free an SRPT I/O context structure.
+ */
+static void srpt_free_ioctx(struct srpt_device *sdev, struct srpt_ioctx *ioctx,
+			    int dma_size, enum dma_data_direction dir)
+{
+	if (!ioctx)
+		return;
+
+	ib_dma_unmap_single(sdev->device, ioctx->dma, dma_size, dir);
+	kfree(ioctx->buf);
+	kfree(ioctx);
+}
+
+/**
+ * srpt_alloc_ioctx_ring() - Allocate a ring of SRPT I/O context structures.
+ * @sdev:       Device to allocate the I/O context ring for.
+ * @ring_size:  Number of elements in the I/O context ring.
+ * @ioctx_size: I/O context size.
+ * @dma_size:   DMA buffer size.
+ * @dir:        DMA data direction.
+ */
+static struct srpt_ioctx **srpt_alloc_ioctx_ring(struct srpt_device *sdev,
+				int ring_size, int ioctx_size,
+				int dma_size, enum dma_data_direction dir)
+{
+	struct srpt_ioctx **ring;
+	int i;
+
+	WARN_ON(ioctx_size != sizeof(struct srpt_recv_ioctx)
+		&& ioctx_size != sizeof(struct srpt_send_ioctx));
+
+	ring = kmalloc(ring_size * sizeof(ring[0]), GFP_KERNEL);
+	if (!ring)
+		goto out;
+	for (i = 0; i < ring_size; ++i) {
+		ring[i] = srpt_alloc_ioctx(sdev, ioctx_size, dma_size, dir);
+		if (!ring[i])
+			goto err;
+		ring[i]->index = i;
+	}
+	goto out;
+
+err:
+	while (--i >= 0)
+		srpt_free_ioctx(sdev, ring[i], dma_size, dir);
+	kfree(ring);
+	ring = NULL;
+out:
+	return ring;
+}
+
+/**
+ * srpt_free_ioctx_ring() - Free the ring of SRPT I/O context structures.
+ */
+static void srpt_free_ioctx_ring(struct srpt_ioctx **ioctx_ring,
+				 struct srpt_device *sdev, int ring_size,
+				 int dma_size, enum dma_data_direction dir)
+{
+	int i;
+
+	for (i = 0; i < ring_size; ++i)
+		srpt_free_ioctx(sdev, ioctx_ring[i], dma_size, dir);
+	kfree(ioctx_ring);
+}
+
+/**
+ * srpt_get_cmd_state() - Get the state of a SCSI command.
+ */
+static enum srpt_command_state srpt_get_cmd_state(struct srpt_send_ioctx *ioctx)
+{
+	enum srpt_command_state state;
+	unsigned long flags;
+
+	BUG_ON(!ioctx);
+
+	spin_lock_irqsave(&ioctx->spinlock, flags);
+	state = ioctx->state;
+	spin_unlock_irqrestore(&ioctx->spinlock, flags);
+	return state;
+}
+
+/**
+ * srpt_set_cmd_state() - Set the state of a SCSI command.
+ *
+ * Does not modify the state of aborted commands. Returns the previous command
+ * state.
+ */
+static enum srpt_command_state srpt_set_cmd_state(struct srpt_send_ioctx *ioctx,
+						  enum srpt_command_state new)
+{
+	enum srpt_command_state previous;
+	unsigned long flags;
+
+	BUG_ON(!ioctx);
+
+	spin_lock_irqsave(&ioctx->spinlock, flags);
+	previous = ioctx->state;
+	if (previous != SRPT_STATE_DONE)
+		ioctx->state = new;
+	spin_unlock_irqrestore(&ioctx->spinlock, flags);
+
+	return previous;
+}
+
+/**
+ * srpt_test_and_set_cmd_state() - Test and set the state of a command.
+ *
+ * Returns true if and only if the previous command state was equal to 'old'.
+ */
+static bool srpt_test_and_set_cmd_state(struct srpt_send_ioctx *ioctx,
+					enum srpt_command_state old,
+					enum srpt_command_state new)
+{
+	enum srpt_command_state previous;
+	unsigned long flags;
+
+	WARN_ON(!ioctx);
+	WARN_ON(old == SRPT_STATE_DONE);
+	WARN_ON(new == SRPT_STATE_NEW);
+
+	spin_lock_irqsave(&ioctx->spinlock, flags);
+	previous = ioctx->state;
+	if (previous == old)
+		ioctx->state = new;
+	spin_unlock_irqrestore(&ioctx->spinlock, flags);
+	return previous == old;
+}
+
+/**
+ * srpt_post_recv() - Post an IB receive request.
+ */
+static int srpt_post_recv(struct srpt_device *sdev,
+			  struct srpt_recv_ioctx *ioctx)
+{
+	struct ib_sge list;
+	struct ib_recv_wr wr, *bad_wr;
+
+	BUG_ON(!sdev);
+	wr.wr_id = encode_wr_id(SRPT_RECV, ioctx->ioctx.index);
+
+	list.addr = ioctx->ioctx.dma;
+	list.length = srp_max_req_size;
+	list.lkey = sdev->mr->lkey;
+
+	wr.next = NULL;
+	wr.sg_list = &list;
+	wr.num_sge = 1;
+
+	return ib_post_srq_recv(sdev->srq, &wr, &bad_wr);
+}
+
+/**
+ * srpt_post_send() - Post an IB send request.
+ *
+ * Returns zero upon success and a non-zero value upon failure.
+ */
+static int srpt_post_send(struct srpt_rdma_ch *ch,
+			  struct srpt_send_ioctx *ioctx, int len)
+{
+	struct ib_sge list;
+	struct ib_send_wr wr, *bad_wr;
+	struct srpt_device *sdev = ch->sport->sdev;
+	int ret;
+
+	atomic_inc(&ch->req_lim);
+
+	ret = -ENOMEM;
+	if (unlikely(atomic_dec_return(&ch->sq_wr_avail) < 0)) {
+		printk(KERN_WARNING "IB send queue full (needed 1)\n");
+		goto out;
+	}
+
+	ib_dma_sync_single_for_device(sdev->device, ioctx->ioctx.dma, len,
+				      DMA_TO_DEVICE);
+
+	list.addr = ioctx->ioctx.dma;
+	list.length = len;
+	list.lkey = sdev->mr->lkey;
+
+	wr.next = NULL;
+	wr.wr_id = encode_wr_id(SRPT_SEND, ioctx->ioctx.index);
+	wr.sg_list = &list;
+	wr.num_sge = 1;
+	wr.opcode = IB_WR_SEND;
+	wr.send_flags = IB_SEND_SIGNALED;
+
+	ret = ib_post_send(ch->qp, &wr, &bad_wr);
+
+out:
+	if (ret < 0) {
+		atomic_inc(&ch->sq_wr_avail);
+		atomic_dec(&ch->req_lim);
+	}
+	return ret;
+}
+
+/**
+ * srpt_get_desc_tbl() - Parse the data descriptors of an SRP_CMD request.
+ * @ioctx: Pointer to the I/O context associated with the request.
+ * @srp_cmd: Pointer to the SRP_CMD request data.
+ * @dir: Pointer to the variable to which the transfer direction will be
+ *   written.
+ * @data_len: Pointer to the variable to which the total data length of all
+ *   descriptors in the SRP_CMD request will be written.
+ *
+ * This function initializes ioctx->nrbuf and ioctx->r_bufs.
+ *
+ * Returns -EINVAL when the SRP_CMD request contains inconsistent descriptors;
+ * -ENOMEM when memory allocation fails and zero upon success.
+ */
+static int srpt_get_desc_tbl(struct srpt_send_ioctx *ioctx,
+			     struct srp_cmd *srp_cmd,
+			     enum dma_data_direction *dir, u64 *data_len)
+{
+	struct srp_indirect_buf *idb;
+	struct srp_direct_buf *db;
+	unsigned add_cdb_offset;
+	int ret;
+
+	/*
+	 * The pointer computations below will only be compiled correctly
+	 * if srp_cmd::add_data is declared as s8*, u8*, s8[] or u8[], so check
+	 * whether srp_cmd::add_data has been declared as a byte pointer.
+	 */
+	BUILD_BUG_ON(!__same_type(srp_cmd->add_data[0], (s8)0)
+		     && !__same_type(srp_cmd->add_data[0], (u8)0));
+
+	BUG_ON(!dir);
+	BUG_ON(!data_len);
+
+	ret = 0;
+	*data_len = 0;
+
+	/*
+	 * The lower four bits of the buffer format field contain the DATA-IN
+	 * buffer descriptor format, and the highest four bits contain the
+	 * DATA-OUT buffer descriptor format.
+	 */
+	*dir = DMA_NONE;
+	if (srp_cmd->buf_fmt & 0xf)
+		/* DATA-IN: transfer data from target to initiator (read). */
+		*dir = DMA_FROM_DEVICE;
+	else if (srp_cmd->buf_fmt >> 4)
+		/* DATA-OUT: transfer data from initiator to target (write). */
+		*dir = DMA_TO_DEVICE;
+
+	/*
+	 * According to the SRP spec, the lower two bits of the 'ADDITIONAL
+	 * CDB LENGTH' field are reserved and the size in bytes of this field
+	 * is four times the value specified in bits 3..7. Hence the "& ~3".
+	 */
+	add_cdb_offset = srp_cmd->add_cdb_len & ~3;
+	if (((srp_cmd->buf_fmt & 0xf) == SRP_DATA_DESC_DIRECT) ||
+	    ((srp_cmd->buf_fmt >> 4) == SRP_DATA_DESC_DIRECT)) {
+		ioctx->n_rbuf = 1;
+		ioctx->rbufs = &ioctx->single_rbuf;
+
+		db = (struct srp_direct_buf *)(srp_cmd->add_data
+					       + add_cdb_offset);
+		memcpy(ioctx->rbufs, db, sizeof *db);
+		*data_len = be32_to_cpu(db->len);
+	} else if (((srp_cmd->buf_fmt & 0xf) == SRP_DATA_DESC_INDIRECT) ||
+		   ((srp_cmd->buf_fmt >> 4) == SRP_DATA_DESC_INDIRECT)) {
+		idb = (struct srp_indirect_buf *)(srp_cmd->add_data
+						  + add_cdb_offset);
+
+		ioctx->n_rbuf = be32_to_cpu(idb->table_desc.len) / sizeof *db;
+
+		if (ioctx->n_rbuf >
+		    (srp_cmd->data_out_desc_cnt + srp_cmd->data_in_desc_cnt)) {
+			printk(KERN_ERR "received unsupported SRP_CMD request"
+			       " type (%u out + %u in != %u / %zu)\n",
+			       srp_cmd->data_out_desc_cnt,
+			       srp_cmd->data_in_desc_cnt,
+			       be32_to_cpu(idb->table_desc.len),
+			       sizeof(*db));
+			ioctx->n_rbuf = 0;
+			ret = -EINVAL;
+			goto out;
+		}
+
+		if (ioctx->n_rbuf == 1)
+			ioctx->rbufs = &ioctx->single_rbuf;
+		else {
+			ioctx->rbufs =
+				kmalloc(ioctx->n_rbuf * sizeof *db, GFP_ATOMIC);
+			if (!ioctx->rbufs) {
+				ioctx->n_rbuf = 0;
+				ret = -ENOMEM;
+				goto out;
+			}
+		}
+
+		db = idb->desc_list;
+		memcpy(ioctx->rbufs, db, ioctx->n_rbuf * sizeof *db);
+		*data_len = be32_to_cpu(idb->len);
+	}
+out:
+	return ret;
+}
+
+/**
+ * srpt_init_ch_qp() - Initialize queue pair attributes.
+ *
+ * Initialized the attributes of queue pair 'qp' by allowing local write,
+ * remote read and remote write. Also transitions 'qp' to state IB_QPS_INIT.
+ */
+static int srpt_init_ch_qp(struct srpt_rdma_ch *ch, struct ib_qp *qp)
+{
+	struct ib_qp_attr *attr;
+	int ret;
+
+	attr = kzalloc(sizeof *attr, GFP_KERNEL);
+	if (!attr)
+		return -ENOMEM;
+
+	attr->qp_state = IB_QPS_INIT;
+	attr->qp_access_flags = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ |
+	    IB_ACCESS_REMOTE_WRITE;
+	attr->port_num = ch->sport->port;
+	attr->pkey_index = 0;
+
+	ret = ib_modify_qp(qp, attr,
+			   IB_QP_STATE | IB_QP_ACCESS_FLAGS | IB_QP_PORT |
+			   IB_QP_PKEY_INDEX);
+
+	kfree(attr);
+	return ret;
+}
+
+/**
+ * srpt_ch_qp_rtr() - Change the state of a channel to 'ready to receive' (RTR).
+ * @ch: channel of the queue pair.
+ * @qp: queue pair to change the state of.
+ *
+ * Returns zero upon success and a negative value upon failure.
+ *
+ * Note: currently a struct ib_qp_attr takes 136 bytes on a 64-bit system.
+ * If this structure ever becomes larger, it might be necessary to allocate
+ * it dynamically instead of on the stack.
+ */
+static int srpt_ch_qp_rtr(struct srpt_rdma_ch *ch, struct ib_qp *qp)
+{
+	struct ib_qp_attr qp_attr;
+	int attr_mask;
+	int ret;
+
+	qp_attr.qp_state = IB_QPS_RTR;
+	ret = ib_cm_init_qp_attr(ch->cm_id, &qp_attr, &attr_mask);
+	if (ret)
+		goto out;
+
+	qp_attr.max_dest_rd_atomic = 4;
+
+	ret = ib_modify_qp(qp, &qp_attr, attr_mask);
+
+out:
+	return ret;
+}
+
+/**
+ * srpt_ch_qp_rts() - Change the state of a channel to 'ready to send' (RTS).
+ * @ch: channel of the queue pair.
+ * @qp: queue pair to change the state of.
+ *
+ * Returns zero upon success and a negative value upon failure.
+ *
+ * Note: currently a struct ib_qp_attr takes 136 bytes on a 64-bit system.
+ * If this structure ever becomes larger, it might be necessary to allocate
+ * it dynamically instead of on the stack.
+ */
+static int srpt_ch_qp_rts(struct srpt_rdma_ch *ch, struct ib_qp *qp)
+{
+	struct ib_qp_attr qp_attr;
+	int attr_mask;
+	int ret;
+
+	qp_attr.qp_state = IB_QPS_RTS;
+	ret = ib_cm_init_qp_attr(ch->cm_id, &qp_attr, &attr_mask);
+	if (ret)
+		goto out;
+
+	qp_attr.max_rd_atomic = 4;
+
+	ret = ib_modify_qp(qp, &qp_attr, attr_mask);
+
+out:
+	return ret;
+}
+
+/**
+ * srpt_ch_qp_err() - Set the channel queue pair state to 'error'.
+ */
+static int srpt_ch_qp_err(struct srpt_rdma_ch *ch)
+{
+	struct ib_qp_attr qp_attr;
+
+	qp_attr.qp_state = IB_QPS_ERR;
+	return ib_modify_qp(ch->qp, &qp_attr, IB_QP_STATE);
+}
+
+/**
+ * srpt_unmap_sg_to_ib_sge() - Unmap an IB SGE list.
+ */
+static void srpt_unmap_sg_to_ib_sge(struct srpt_rdma_ch *ch,
+				    struct srpt_send_ioctx *ioctx)
+{
+	struct scatterlist *sg;
+	enum dma_data_direction dir;
+
+	BUG_ON(!ch);
+	BUG_ON(!ioctx);
+	BUG_ON(ioctx->n_rdma && !ioctx->rdma_ius);
+
+	while (ioctx->n_rdma)
+		kfree(ioctx->rdma_ius[--ioctx->n_rdma].sge);
+
+	kfree(ioctx->rdma_ius);
+	ioctx->rdma_ius = NULL;
+
+	if (ioctx->mapped_sg_count) {
+		sg = ioctx->sg;
+		WARN_ON(!sg);
+		dir = ioctx->cmd.data_direction;
+		BUG_ON(dir == DMA_NONE);
+		ib_dma_unmap_sg(ch->sport->sdev->device, sg, ioctx->sg_cnt,
+				opposite_dma_dir(dir));
+		ioctx->mapped_sg_count = 0;
+	}
+}
+
+/**
+ * srpt_map_sg_to_ib_sge() - Map an SG list to an IB SGE list.
+ */
+static int srpt_map_sg_to_ib_sge(struct srpt_rdma_ch *ch,
+				 struct srpt_send_ioctx *ioctx)
+{
+	struct ib_device *dev = ch->sport->sdev->device;
+	struct se_cmd *cmd;
+	struct scatterlist *sg, *sg_orig;
+	int sg_cnt;
+	enum dma_data_direction dir;
+	struct rdma_iu *riu;
+	struct srp_direct_buf *db;
+	dma_addr_t dma_addr;
+	struct ib_sge *sge;
+	u64 raddr;
+	u32 rsize;
+	u32 tsize;
+	u32 dma_len;
+	int count, nrdma;
+	int i, j, k;
+
+	BUG_ON(!ch);
+	BUG_ON(!ioctx);
+	cmd = &ioctx->cmd;
+	dir = cmd->data_direction;
+	BUG_ON(dir == DMA_NONE);
+
+	ioctx->sg = sg = sg_orig = cmd->t_data_sg;
+	ioctx->sg_cnt = sg_cnt = cmd->t_data_nents;
+
+	count = ib_dma_map_sg(ch->sport->sdev->device, sg, sg_cnt,
+			      opposite_dma_dir(dir));
+	if (unlikely(!count))
+		return -EAGAIN;
+
+	ioctx->mapped_sg_count = count;
+
+	if (ioctx->rdma_ius && ioctx->n_rdma_ius)
+		nrdma = ioctx->n_rdma_ius;
+	else {
+		nrdma = (count + SRPT_DEF_SG_PER_WQE - 1) / SRPT_DEF_SG_PER_WQE
+			+ ioctx->n_rbuf;
+
+		ioctx->rdma_ius = kzalloc(nrdma * sizeof *riu, GFP_KERNEL);
+		if (!ioctx->rdma_ius)
+			goto free_mem;
+
+		ioctx->n_rdma_ius = nrdma;
+	}
+
+	db = ioctx->rbufs;
+	tsize = cmd->data_length;
+	dma_len = ib_sg_dma_len(dev, &sg[0]);
+	riu = ioctx->rdma_ius;
+
+	/*
+	 * For each remote desc - calculate the #ib_sge.
+	 * If #ib_sge < SRPT_DEF_SG_PER_WQE per rdma operation then
+	 *      each remote desc rdma_iu is required a rdma wr;
+	 * else
+	 *      we need to allocate extra rdma_iu to carry extra #ib_sge in
+	 *      another rdma wr
+	 */
+	for (i = 0, j = 0;
+	     j < count && i < ioctx->n_rbuf && tsize > 0; ++i, ++riu, ++db) {
+		rsize = be32_to_cpu(db->len);
+		raddr = be64_to_cpu(db->va);
+		riu->raddr = raddr;
+		riu->rkey = be32_to_cpu(db->key);
+		riu->sge_cnt = 0;
+
+		/* calculate how many sge required for this remote_buf */
+		while (rsize > 0 && tsize > 0) {
+
+			if (rsize >= dma_len) {
+				tsize -= dma_len;
+				rsize -= dma_len;
+				raddr += dma_len;
+
+				if (tsize > 0) {
+					++j;
+					if (j < count) {
+						sg = sg_next(sg);
+						dma_len = ib_sg_dma_len(
+								dev, sg);
+					}
+				}
+			} else {
+				tsize -= rsize;
+				dma_len -= rsize;
+				rsize = 0;
+			}
+
+			++riu->sge_cnt;
+
+			if (rsize > 0 && riu->sge_cnt == SRPT_DEF_SG_PER_WQE) {
+				++ioctx->n_rdma;
+				riu->sge =
+				    kmalloc(riu->sge_cnt * sizeof *riu->sge,
+					    GFP_KERNEL);
+				if (!riu->sge)
+					goto free_mem;
+
+				++riu;
+				riu->sge_cnt = 0;
+				riu->raddr = raddr;
+				riu->rkey = be32_to_cpu(db->key);
+			}
+		}
+
+		++ioctx->n_rdma;
+		riu->sge = kmalloc(riu->sge_cnt * sizeof *riu->sge,
+				   GFP_KERNEL);
+		if (!riu->sge)
+			goto free_mem;
+	}
+
+	db = ioctx->rbufs;
+	tsize = cmd->data_length;
+	riu = ioctx->rdma_ius;
+	sg = sg_orig;
+	dma_len = ib_sg_dma_len(dev, &sg[0]);
+	dma_addr = ib_sg_dma_address(dev, &sg[0]);
+
+	/* this second loop is really mapped sg_addres to rdma_iu->ib_sge */
+	for (i = 0, j = 0;
+	     j < count && i < ioctx->n_rbuf && tsize > 0; ++i, ++riu, ++db) {
+		rsize = be32_to_cpu(db->len);
+		sge = riu->sge;
+		k = 0;
+
+		while (rsize > 0 && tsize > 0) {
+			sge->addr = dma_addr;
+			sge->lkey = ch->sport->sdev->mr->lkey;
+
+			if (rsize >= dma_len) {
+				sge->length =
+					(tsize < dma_len) ? tsize : dma_len;
+				tsize -= dma_len;
+				rsize -= dma_len;
+
+				if (tsize > 0) {
+					++j;
+					if (j < count) {
+						sg = sg_next(sg);
+						dma_len = ib_sg_dma_len(
+								dev, sg);
+						dma_addr = ib_sg_dma_address(
+								dev, sg);
+					}
+				}
+			} else {
+				sge->length = (tsize < rsize) ? tsize : rsize;
+				tsize -= rsize;
+				dma_len -= rsize;
+				dma_addr += rsize;
+				rsize = 0;
+			}
+
+			++k;
+			if (k == riu->sge_cnt && rsize > 0 && tsize > 0) {
+				++riu;
+				sge = riu->sge;
+				k = 0;
+			} else if (rsize > 0 && tsize > 0)
+				++sge;
+		}
+	}
+
+	return 0;
+
+free_mem:
+	srpt_unmap_sg_to_ib_sge(ch, ioctx);
+
+	return -ENOMEM;
+}
+
+/**
+ * srpt_get_send_ioctx() - Obtain an I/O context for sending to the initiator.
+ */
+static struct srpt_send_ioctx *srpt_get_send_ioctx(struct srpt_rdma_ch *ch)
+{
+	struct srpt_send_ioctx *ioctx;
+	unsigned long flags;
+
+	BUG_ON(!ch);
+
+	ioctx = NULL;
+	spin_lock_irqsave(&ch->spinlock, flags);
+	if (!list_empty(&ch->free_list)) {
+		ioctx = list_first_entry(&ch->free_list,
+					 struct srpt_send_ioctx, free_list);
+		list_del(&ioctx->free_list);
+	}
+	spin_unlock_irqrestore(&ch->spinlock, flags);
+
+	if (!ioctx)
+		return ioctx;
+
+	BUG_ON(ioctx->ch != ch);
+	spin_lock_init(&ioctx->spinlock);
+	ioctx->state = SRPT_STATE_NEW;
+	ioctx->n_rbuf = 0;
+	ioctx->rbufs = NULL;
+	ioctx->n_rdma = 0;
+	ioctx->n_rdma_ius = 0;
+	ioctx->rdma_ius = NULL;
+	ioctx->mapped_sg_count = 0;
+	init_completion(&ioctx->tx_done);
+	ioctx->queue_status_only = false;
+	/*
+	 * transport_init_se_cmd() does not initialize all fields, so do it
+	 * here.
+	 */
+	memset(&ioctx->cmd, 0, sizeof(ioctx->cmd));
+	memset(&ioctx->sense_data, 0, sizeof(ioctx->sense_data));
+
+	return ioctx;
+}
+
+/**
+ * srpt_abort_cmd() - Abort a SCSI command.
+ * @ioctx:   I/O context associated with the SCSI command.
+ * @context: Preferred execution context.
+ */
+static int srpt_abort_cmd(struct srpt_send_ioctx *ioctx)
+{
+	enum srpt_command_state state;
+	unsigned long flags;
+
+	BUG_ON(!ioctx);
+
+	/*
+	 * If the command is in a state where the target core is waiting for
+	 * the ib_srpt driver, change the state to the next state. Changing
+	 * the state of the command from SRPT_STATE_NEED_DATA to
+	 * SRPT_STATE_DATA_IN ensures that srpt_xmit_response() will call this
+	 * function a second time.
+	 */
+
+	spin_lock_irqsave(&ioctx->spinlock, flags);
+	state = ioctx->state;
+	switch (state) {
+	case SRPT_STATE_NEED_DATA:
+		ioctx->state = SRPT_STATE_DATA_IN;
+		break;
+	case SRPT_STATE_DATA_IN:
+	case SRPT_STATE_CMD_RSP_SENT:
+	case SRPT_STATE_MGMT_RSP_SENT:
+		ioctx->state = SRPT_STATE_DONE;
+		break;
+	default:
+		break;
+	}
+	spin_unlock_irqrestore(&ioctx->spinlock, flags);
+
+	if (state == SRPT_STATE_DONE) {
+		struct srpt_rdma_ch *ch = ioctx->ch;
+
+		BUG_ON(ch->sess == NULL);
+
+		target_put_sess_cmd(ch->sess, &ioctx->cmd);
+		goto out;
+	}
+
+	pr_debug("Aborting cmd with state %d and tag %lld\n", state,
+		 ioctx->tag);
+
+	switch (state) {
+	case SRPT_STATE_NEW:
+	case SRPT_STATE_DATA_IN:
+	case SRPT_STATE_MGMT:
+		/*
+		 * Do nothing - defer abort processing until
+		 * srpt_queue_response() is invoked.
+		 */
+		WARN_ON(!transport_check_aborted_status(&ioctx->cmd, false));
+		break;
+	case SRPT_STATE_NEED_DATA:
+		/* DMA_TO_DEVICE (write) - RDMA read error. */
+
+		/* XXX(hch): this is a horrible layering violation.. */
+		spin_lock_irqsave(&ioctx->cmd.t_state_lock, flags);
+		ioctx->cmd.transport_state &= ~CMD_T_ACTIVE;
+		spin_unlock_irqrestore(&ioctx->cmd.t_state_lock, flags);
+		break;
+	case SRPT_STATE_CMD_RSP_SENT:
+		/*
+		 * SRP_RSP sending failed or the SRP_RSP send completion has
+		 * not been received in time.
+		 */
+		srpt_unmap_sg_to_ib_sge(ioctx->ch, ioctx);
+		target_put_sess_cmd(ioctx->ch->sess, &ioctx->cmd);
+		break;
+	case SRPT_STATE_MGMT_RSP_SENT:
+		srpt_set_cmd_state(ioctx, SRPT_STATE_DONE);
+		target_put_sess_cmd(ioctx->ch->sess, &ioctx->cmd);
+		break;
+	default:
+		WARN(1, "Unexpected command state (%d)", state);
+		break;
+	}
+
+out:
+	return state;
+}
+
+/**
+ * srpt_handle_send_err_comp() - Process an IB_WC_SEND error completion.
+ */
+static void srpt_handle_send_err_comp(struct srpt_rdma_ch *ch, u64 wr_id)
+{
+	struct srpt_send_ioctx *ioctx;
+	enum srpt_command_state state;
+	struct se_cmd *cmd;
+	u32 index;
+
+	atomic_inc(&ch->sq_wr_avail);
+
+	index = idx_from_wr_id(wr_id);
+	ioctx = ch->ioctx_ring[index];
+	state = srpt_get_cmd_state(ioctx);
+	cmd = &ioctx->cmd;
+
+	WARN_ON(state != SRPT_STATE_CMD_RSP_SENT
+		&& state != SRPT_STATE_MGMT_RSP_SENT
+		&& state != SRPT_STATE_NEED_DATA
+		&& state != SRPT_STATE_DONE);
+
+	/* If SRP_RSP sending failed, undo the ch->req_lim change. */
+	if (state == SRPT_STATE_CMD_RSP_SENT
+	    || state == SRPT_STATE_MGMT_RSP_SENT)
+		atomic_dec(&ch->req_lim);
+
+	srpt_abort_cmd(ioctx);
+}
+
+/**
+ * srpt_handle_send_comp() - Process an IB send completion notification.
+ */
+static void srpt_handle_send_comp(struct srpt_rdma_ch *ch,
+				  struct srpt_send_ioctx *ioctx)
+{
+	enum srpt_command_state state;
+
+	atomic_inc(&ch->sq_wr_avail);
+
+	state = srpt_set_cmd_state(ioctx, SRPT_STATE_DONE);
+
+	if (WARN_ON(state != SRPT_STATE_CMD_RSP_SENT
+		    && state != SRPT_STATE_MGMT_RSP_SENT
+		    && state != SRPT_STATE_DONE))
+		pr_debug("state = %d\n", state);
+
+	if (state != SRPT_STATE_DONE) {
+		srpt_unmap_sg_to_ib_sge(ch, ioctx);
+		transport_generic_free_cmd(&ioctx->cmd, 0);
+	} else {
+		printk(KERN_ERR "IB completion has been received too late for"
+		       " wr_id = %u.\n", ioctx->ioctx.index);
+	}
+}
+
+/**
+ * srpt_handle_rdma_comp() - Process an IB RDMA completion notification.
+ *
+ * XXX: what is now target_execute_cmd used to be asynchronous, and unmapping
+ * the data that has been transferred via IB RDMA had to be postponed until the
+ * check_stop_free() callback.  None of this is necessary anymore and needs to
+ * be cleaned up.
+ */
+static void srpt_handle_rdma_comp(struct srpt_rdma_ch *ch,
+				  struct srpt_send_ioctx *ioctx,
+				  enum srpt_opcode opcode)
+{
+	WARN_ON(ioctx->n_rdma <= 0);
+	atomic_add(ioctx->n_rdma, &ch->sq_wr_avail);
+
+	if (opcode == SRPT_RDMA_READ_LAST) {
+		if (srpt_test_and_set_cmd_state(ioctx, SRPT_STATE_NEED_DATA,
+						SRPT_STATE_DATA_IN))
+			target_execute_cmd(&ioctx->cmd);
+		else
+			printk(KERN_ERR "%s[%d]: wrong state = %d\n", __func__,
+			       __LINE__, srpt_get_cmd_state(ioctx));
+	} else if (opcode == SRPT_RDMA_ABORT) {
+		ioctx->rdma_aborted = true;
+	} else {
+		WARN(true, "unexpected opcode %d\n", opcode);
+	}
+}
+
+/**
+ * srpt_handle_rdma_err_comp() - Process an IB RDMA error completion.
+ */
+static void srpt_handle_rdma_err_comp(struct srpt_rdma_ch *ch,
+				      struct srpt_send_ioctx *ioctx,
+				      enum srpt_opcode opcode)
+{
+	struct se_cmd *cmd;
+	enum srpt_command_state state;
+
+	cmd = &ioctx->cmd;
+	state = srpt_get_cmd_state(ioctx);
+	switch (opcode) {
+	case SRPT_RDMA_READ_LAST:
+		if (ioctx->n_rdma <= 0) {
+			printk(KERN_ERR "Received invalid RDMA read"
+			       " error completion with idx %d\n",
+			       ioctx->ioctx.index);
+			break;
+		}
+		atomic_add(ioctx->n_rdma, &ch->sq_wr_avail);
+		if (state == SRPT_STATE_NEED_DATA)
+			srpt_abort_cmd(ioctx);
+		else
+			printk(KERN_ERR "%s[%d]: wrong state = %d\n",
+			       __func__, __LINE__, state);
+		break;
+	case SRPT_RDMA_WRITE_LAST:
+		break;
+	default:
+		printk(KERN_ERR "%s[%d]: opcode = %u\n", __func__,
+		       __LINE__, opcode);
+		break;
+	}
+}
+
+/**
+ * srpt_build_cmd_rsp() - Build an SRP_RSP response.
+ * @ch: RDMA channel through which the request has been received.
+ * @ioctx: I/O context associated with the SRP_CMD request. The response will
+ *   be built in the buffer ioctx->buf points at and hence this function will
+ *   overwrite the request data.
+ * @tag: tag of the request for which this response is being generated.
+ * @status: value for the STATUS field of the SRP_RSP information unit.
+ *
+ * Returns the size in bytes of the SRP_RSP response.
+ *
+ * An SRP_RSP response contains a SCSI status or service response. See also
+ * section 6.9 in the SRP r16a document for the format of an SRP_RSP
+ * response. See also SPC-2 for more information about sense data.
+ */
+static int srpt_build_cmd_rsp(struct srpt_rdma_ch *ch,
+			      struct srpt_send_ioctx *ioctx, u64 tag,
+			      int status)
+{
+	struct srp_rsp *srp_rsp;
+	const u8 *sense_data;
+	int sense_data_len, max_sense_len;
+
+	/*
+	 * The lowest bit of all SAM-3 status codes is zero (see also
+	 * paragraph 5.3 in SAM-3).
+	 */
+	WARN_ON(status & 1);
+
+	srp_rsp = ioctx->ioctx.buf;
+	BUG_ON(!srp_rsp);
+
+	sense_data = ioctx->sense_data;
+	sense_data_len = ioctx->cmd.scsi_sense_length;
+	WARN_ON(sense_data_len > sizeof(ioctx->sense_data));
+
+	memset(srp_rsp, 0, sizeof *srp_rsp);
+	srp_rsp->opcode = SRP_RSP;
+	srp_rsp->req_lim_delta =
+		__constant_cpu_to_be32(1 + atomic_xchg(&ch->req_lim_delta, 0));
+	srp_rsp->tag = tag;
+	srp_rsp->status = status;
+
+	if (sense_data_len) {
+		BUILD_BUG_ON(MIN_MAX_RSP_SIZE <= sizeof(*srp_rsp));
+		max_sense_len = ch->max_ti_iu_len - sizeof(*srp_rsp);
+		if (sense_data_len > max_sense_len) {
+			printk(KERN_WARNING "truncated sense data from %d to %d"
+			       " bytes\n", sense_data_len, max_sense_len);
+			sense_data_len = max_sense_len;
+		}
+
+		srp_rsp->flags |= SRP_RSP_FLAG_SNSVALID;
+		srp_rsp->sense_data_len = cpu_to_be32(sense_data_len);
+		memcpy(srp_rsp + 1, sense_data, sense_data_len);
+	}
+
+	return sizeof(*srp_rsp) + sense_data_len;
+}
+
+/**
+ * srpt_build_tskmgmt_rsp() - Build a task management response.
+ * @ch:       RDMA channel through which the request has been received.
+ * @ioctx:    I/O context in which the SRP_RSP response will be built.
+ * @rsp_code: RSP_CODE that will be stored in the response.
+ * @tag:      Tag of the request for which this response is being generated.
+ *
+ * Returns the size in bytes of the SRP_RSP response.
+ *
+ * An SRP_RSP response contains a SCSI status or service response. See also
+ * section 6.9 in the SRP r16a document for the format of an SRP_RSP
+ * response.
+ */
+static int srpt_build_tskmgmt_rsp(struct srpt_rdma_ch *ch,
+				  struct srpt_send_ioctx *ioctx,
+				  u8 rsp_code, u64 tag)
+{
+	struct srp_rsp *srp_rsp;
+	int resp_data_len;
+	int resp_len;
+
+	resp_data_len = 4;
+	resp_len = sizeof(*srp_rsp) + resp_data_len;
+
+	srp_rsp = ioctx->ioctx.buf;
+	BUG_ON(!srp_rsp);
+	memset(srp_rsp, 0, sizeof *srp_rsp);
+
+	srp_rsp->opcode = SRP_RSP;
+	srp_rsp->req_lim_delta = __constant_cpu_to_be32(1
+				    + atomic_xchg(&ch->req_lim_delta, 0));
+	srp_rsp->tag = tag;
+
+	srp_rsp->flags |= SRP_RSP_FLAG_RSPVALID;
+	srp_rsp->resp_data_len = cpu_to_be32(resp_data_len);
+	srp_rsp->data[3] = rsp_code;
+
+	return resp_len;
+}
+
+#define NO_SUCH_LUN ((uint64_t)-1LL)
+
+/*
+ * SCSI LUN addressing method. See also SAM-2 and the section about
+ * eight byte LUNs.
+ */
+enum scsi_lun_addr_method {
+	SCSI_LUN_ADDR_METHOD_PERIPHERAL   = 0,
+	SCSI_LUN_ADDR_METHOD_FLAT         = 1,
+	SCSI_LUN_ADDR_METHOD_LUN          = 2,
+	SCSI_LUN_ADDR_METHOD_EXTENDED_LUN = 3,
+};
+
+/*
+ * srpt_unpack_lun() - Convert from network LUN to linear LUN.
+ *
+ * Convert an 2-byte, 4-byte, 6-byte or 8-byte LUN structure in network byte
+ * order (big endian) to a linear LUN. Supports three LUN addressing methods:
+ * peripheral, flat and logical unit. See also SAM-2, section 4.9.4 (page 40).
+ */
+static uint64_t srpt_unpack_lun(const uint8_t *lun, int len)
+{
+	uint64_t res = NO_SUCH_LUN;
+	int addressing_method;
+
+	if (unlikely(len < 2)) {
+		printk(KERN_ERR "Illegal LUN length %d, expected 2 bytes or "
+		       "more", len);
+		goto out;
+	}
+
+	switch (len) {
+	case 8:
+		if ((*((__be64 *)lun) &
+		     __constant_cpu_to_be64(0x0000FFFFFFFFFFFFLL)) != 0)
+			goto out_err;
+		break;
+	case 4:
+		if (*((__be16 *)&lun[2]) != 0)
+			goto out_err;
+		break;
+	case 6:
+		if (*((__be32 *)&lun[2]) != 0)
+			goto out_err;
+		break;
+	case 2:
+		break;
+	default:
+		goto out_err;
+	}
+
+	addressing_method = (*lun) >> 6; /* highest two bits of byte 0 */
+	switch (addressing_method) {
+	case SCSI_LUN_ADDR_METHOD_PERIPHERAL:
+	case SCSI_LUN_ADDR_METHOD_FLAT:
+	case SCSI_LUN_ADDR_METHOD_LUN:
+		res = *(lun + 1) | (((*lun) & 0x3f) << 8);
+		break;
+
+	case SCSI_LUN_ADDR_METHOD_EXTENDED_LUN:
+	default:
+		printk(KERN_ERR "Unimplemented LUN addressing method %u",
+		       addressing_method);
+		break;
+	}
+
+out:
+	return res;
+
+out_err:
+	printk(KERN_ERR "Support for multi-level LUNs has not yet been"
+	       " implemented");
+	goto out;
+}
+
+static int srpt_check_stop_free(struct se_cmd *cmd)
+{
+	struct srpt_send_ioctx *ioctx = container_of(cmd,
+				struct srpt_send_ioctx, cmd);
+
+	return target_put_sess_cmd(ioctx->ch->sess, &ioctx->cmd);
+}
+
+/**
+ * srpt_handle_cmd() - Process SRP_CMD.
+ */
+static int srpt_handle_cmd(struct srpt_rdma_ch *ch,
+			   struct srpt_recv_ioctx *recv_ioctx,
+			   struct srpt_send_ioctx *send_ioctx)
+{
+	struct se_cmd *cmd;
+	struct srp_cmd *srp_cmd;
+	uint64_t unpacked_lun;
+	u64 data_len;
+	enum dma_data_direction dir;
+	sense_reason_t ret;
+	int rc;
+
+	BUG_ON(!send_ioctx);
+
+	srp_cmd = recv_ioctx->ioctx.buf;
+	cmd = &send_ioctx->cmd;
+	send_ioctx->tag = srp_cmd->tag;
+
+	switch (srp_cmd->task_attr) {
+	case SRP_CMD_SIMPLE_Q:
+		cmd->sam_task_attr = MSG_SIMPLE_TAG;
+		break;
+	case SRP_CMD_ORDERED_Q:
+	default:
+		cmd->sam_task_attr = MSG_ORDERED_TAG;
+		break;
+	case SRP_CMD_HEAD_OF_Q:
+		cmd->sam_task_attr = MSG_HEAD_TAG;
+		break;
+	case SRP_CMD_ACA:
+		cmd->sam_task_attr = MSG_ACA_TAG;
+		break;
+	}
+
+	if (srpt_get_desc_tbl(send_ioctx, srp_cmd, &dir, &data_len)) {
+		printk(KERN_ERR "0x%llx: parsing SRP descriptor table failed.\n",
+		       srp_cmd->tag);
+		ret = TCM_INVALID_CDB_FIELD;
+		goto send_sense;
+	}
+
+	unpacked_lun = srpt_unpack_lun((uint8_t *)&srp_cmd->lun,
+				       sizeof(srp_cmd->lun));
+	rc = target_submit_cmd(cmd, ch->sess, srp_cmd->cdb,
+			&send_ioctx->sense_data[0], unpacked_lun, data_len,
+			MSG_SIMPLE_TAG, dir, TARGET_SCF_ACK_KREF);
+	if (rc != 0) {
+		ret = TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE;
+		goto send_sense;
+	}
+	return 0;
+
+send_sense:
+	transport_send_check_condition_and_sense(cmd, ret, 0);
+	return -1;
+}
+
+/**
+ * srpt_rx_mgmt_fn_tag() - Process a task management function by tag.
+ * @ch: RDMA channel of the task management request.
+ * @fn: Task management function to perform.
+ * @req_tag: Tag of the SRP task management request.
+ * @mgmt_ioctx: I/O context of the task management request.
+ *
+ * Returns zero if the target core will process the task management
+ * request asynchronously.
+ *
+ * Note: It is assumed that the initiator serializes tag-based task management
+ * requests.
+ */
+static int srpt_rx_mgmt_fn_tag(struct srpt_send_ioctx *ioctx, u64 tag)
+{
+	struct srpt_device *sdev;
+	struct srpt_rdma_ch *ch;
+	struct srpt_send_ioctx *target;
+	int ret, i;
+
+	ret = -EINVAL;
+	ch = ioctx->ch;
+	BUG_ON(!ch);
+	BUG_ON(!ch->sport);
+	sdev = ch->sport->sdev;
+	BUG_ON(!sdev);
+	spin_lock_irq(&sdev->spinlock);
+	for (i = 0; i < ch->rq_size; ++i) {
+		target = ch->ioctx_ring[i];
+		if (target->cmd.se_lun == ioctx->cmd.se_lun &&
+		    target->tag == tag &&
+		    srpt_get_cmd_state(target) != SRPT_STATE_DONE) {
+			ret = 0;
+			/* now let the target core abort &target->cmd; */
+			break;
+		}
+	}
+	spin_unlock_irq(&sdev->spinlock);
+	return ret;
+}
+
+static int srp_tmr_to_tcm(int fn)
+{
+	switch (fn) {
+	case SRP_TSK_ABORT_TASK:
+		return TMR_ABORT_TASK;
+	case SRP_TSK_ABORT_TASK_SET:
+		return TMR_ABORT_TASK_SET;
+	case SRP_TSK_CLEAR_TASK_SET:
+		return TMR_CLEAR_TASK_SET;
+	case SRP_TSK_LUN_RESET:
+		return TMR_LUN_RESET;
+	case SRP_TSK_CLEAR_ACA:
+		return TMR_CLEAR_ACA;
+	default:
+		return -1;
+	}
+}
+
+/**
+ * srpt_handle_tsk_mgmt() - Process an SRP_TSK_MGMT information unit.
+ *
+ * Returns 0 if and only if the request will be processed by the target core.
+ *
+ * For more information about SRP_TSK_MGMT information units, see also section
+ * 6.7 in the SRP r16a document.
+ */
+static void srpt_handle_tsk_mgmt(struct srpt_rdma_ch *ch,
+				 struct srpt_recv_ioctx *recv_ioctx,
+				 struct srpt_send_ioctx *send_ioctx)
+{
+	struct srp_tsk_mgmt *srp_tsk;
+	struct se_cmd *cmd;
+	struct se_session *sess = ch->sess;
+	uint64_t unpacked_lun;
+	uint32_t tag = 0;
+	int tcm_tmr;
+	int rc;
+
+	BUG_ON(!send_ioctx);
+
+	srp_tsk = recv_ioctx->ioctx.buf;
+	cmd = &send_ioctx->cmd;
+
+	pr_debug("recv tsk_mgmt fn %d for task_tag %lld and cmd tag %lld"
+		 " cm_id %p sess %p\n", srp_tsk->tsk_mgmt_func,
+		 srp_tsk->task_tag, srp_tsk->tag, ch->cm_id, ch->sess);
+
+	srpt_set_cmd_state(send_ioctx, SRPT_STATE_MGMT);
+	send_ioctx->tag = srp_tsk->tag;
+	tcm_tmr = srp_tmr_to_tcm(srp_tsk->tsk_mgmt_func);
+	if (tcm_tmr < 0) {
+		send_ioctx->cmd.se_tmr_req->response =
+			TMR_TASK_MGMT_FUNCTION_NOT_SUPPORTED;
+		goto fail;
+	}
+	unpacked_lun = srpt_unpack_lun((uint8_t *)&srp_tsk->lun,
+				       sizeof(srp_tsk->lun));
+
+	if (srp_tsk->tsk_mgmt_func == SRP_TSK_ABORT_TASK) {
+		rc = srpt_rx_mgmt_fn_tag(send_ioctx, srp_tsk->task_tag);
+		if (rc < 0) {
+			send_ioctx->cmd.se_tmr_req->response =
+					TMR_TASK_DOES_NOT_EXIST;
+			goto fail;
+		}
+		tag = srp_tsk->task_tag;
+	}
+	rc = target_submit_tmr(&send_ioctx->cmd, sess, NULL, unpacked_lun,
+				srp_tsk, tcm_tmr, GFP_KERNEL, tag,
+				TARGET_SCF_ACK_KREF);
+	if (rc != 0) {
+		send_ioctx->cmd.se_tmr_req->response = TMR_FUNCTION_REJECTED;
+		goto fail;
+	}
+	return;
+fail:
+	transport_send_check_condition_and_sense(cmd, 0, 0); // XXX:
+}
+
+/**
+ * srpt_handle_new_iu() - Process a newly received information unit.
+ * @ch:    RDMA channel through which the information unit has been received.
+ * @ioctx: SRPT I/O context associated with the information unit.
+ */
+static void srpt_handle_new_iu(struct srpt_rdma_ch *ch,
+			       struct srpt_recv_ioctx *recv_ioctx,
+			       struct srpt_send_ioctx *send_ioctx)
+{
+	struct srp_cmd *srp_cmd;
+	enum rdma_ch_state ch_state;
+
+	BUG_ON(!ch);
+	BUG_ON(!recv_ioctx);
+
+	ib_dma_sync_single_for_cpu(ch->sport->sdev->device,
+				   recv_ioctx->ioctx.dma, srp_max_req_size,
+				   DMA_FROM_DEVICE);
+
+	ch_state = srpt_get_ch_state(ch);
+	if (unlikely(ch_state == CH_CONNECTING)) {
+		list_add_tail(&recv_ioctx->wait_list, &ch->cmd_wait_list);
+		goto out;
+	}
+
+	if (unlikely(ch_state != CH_LIVE))
+		goto out;
+
+	srp_cmd = recv_ioctx->ioctx.buf;
+	if (srp_cmd->opcode == SRP_CMD || srp_cmd->opcode == SRP_TSK_MGMT) {
+		if (!send_ioctx)
+			send_ioctx = srpt_get_send_ioctx(ch);
+		if (unlikely(!send_ioctx)) {
+			list_add_tail(&recv_ioctx->wait_list,
+				      &ch->cmd_wait_list);
+			goto out;
+		}
+	}
+
+	switch (srp_cmd->opcode) {
+	case SRP_CMD:
+		srpt_handle_cmd(ch, recv_ioctx, send_ioctx);
+		break;
+	case SRP_TSK_MGMT:
+		srpt_handle_tsk_mgmt(ch, recv_ioctx, send_ioctx);
+		break;
+	case SRP_I_LOGOUT:
+		printk(KERN_ERR "Not yet implemented: SRP_I_LOGOUT\n");
+		break;
+	case SRP_CRED_RSP:
+		pr_debug("received SRP_CRED_RSP\n");
+		break;
+	case SRP_AER_RSP:
+		pr_debug("received SRP_AER_RSP\n");
+		break;
+	case SRP_RSP:
+		printk(KERN_ERR "Received SRP_RSP\n");
+		break;
+	default:
+		printk(KERN_ERR "received IU with unknown opcode 0x%x\n",
+		       srp_cmd->opcode);
+		break;
+	}
+
+	srpt_post_recv(ch->sport->sdev, recv_ioctx);
+out:
+	return;
+}
+
+static void srpt_process_rcv_completion(struct ib_cq *cq,
+					struct srpt_rdma_ch *ch,
+					struct ib_wc *wc)
+{
+	struct srpt_device *sdev = ch->sport->sdev;
+	struct srpt_recv_ioctx *ioctx;
+	u32 index;
+
+	index = idx_from_wr_id(wc->wr_id);
+	if (wc->status == IB_WC_SUCCESS) {
+		int req_lim;
+
+		req_lim = atomic_dec_return(&ch->req_lim);
+		if (unlikely(req_lim < 0))
+			printk(KERN_ERR "req_lim = %d < 0\n", req_lim);
+		ioctx = sdev->ioctx_ring[index];
+		srpt_handle_new_iu(ch, ioctx, NULL);
+	} else {
+		printk(KERN_INFO "receiving failed for idx %u with status %d\n",
+		       index, wc->status);
+	}
+}
+
+/**
+ * srpt_process_send_completion() - Process an IB send completion.
+ *
+ * Note: Although this has not yet been observed during tests, at least in
+ * theory it is possible that the srpt_get_send_ioctx() call invoked by
+ * srpt_handle_new_iu() fails. This is possible because the req_lim_delta
+ * value in each response is set to one, and it is possible that this response
+ * makes the initiator send a new request before the send completion for that
+ * response has been processed. This could e.g. happen if the call to
+ * srpt_put_send_iotcx() is delayed because of a higher priority interrupt or
+ * if IB retransmission causes generation of the send completion to be
+ * delayed. Incoming information units for which srpt_get_send_ioctx() fails
+ * are queued on cmd_wait_list. The code below processes these delayed
+ * requests one at a time.
+ */
+static void srpt_process_send_completion(struct ib_cq *cq,
+					 struct srpt_rdma_ch *ch,
+					 struct ib_wc *wc)
+{
+	struct srpt_send_ioctx *send_ioctx;
+	uint32_t index;
+	enum srpt_opcode opcode;
+
+	index = idx_from_wr_id(wc->wr_id);
+	opcode = opcode_from_wr_id(wc->wr_id);
+	send_ioctx = ch->ioctx_ring[index];
+	if (wc->status == IB_WC_SUCCESS) {
+		if (opcode == SRPT_SEND)
+			srpt_handle_send_comp(ch, send_ioctx);
+		else {
+			WARN_ON(opcode != SRPT_RDMA_ABORT &&
+				wc->opcode != IB_WC_RDMA_READ);
+			srpt_handle_rdma_comp(ch, send_ioctx, opcode);
+		}
+	} else {
+		if (opcode == SRPT_SEND) {
+			printk(KERN_INFO "sending response for idx %u failed"
+			       " with status %d\n", index, wc->status);
+			srpt_handle_send_err_comp(ch, wc->wr_id);
+		} else if (opcode != SRPT_RDMA_MID) {
+			printk(KERN_INFO "RDMA t %d for idx %u failed with"
+				" status %d", opcode, index, wc->status);
+			srpt_handle_rdma_err_comp(ch, send_ioctx, opcode);
+		}
+	}
+
+	while (unlikely(opcode == SRPT_SEND
+			&& !list_empty(&ch->cmd_wait_list)
+			&& srpt_get_ch_state(ch) == CH_LIVE
+			&& (send_ioctx = srpt_get_send_ioctx(ch)) != NULL)) {
+		struct srpt_recv_ioctx *recv_ioctx;
+
+		recv_ioctx = list_first_entry(&ch->cmd_wait_list,
+					      struct srpt_recv_ioctx,
+					      wait_list);
+		list_del(&recv_ioctx->wait_list);
+		srpt_handle_new_iu(ch, recv_ioctx, send_ioctx);
+	}
+}
+
+static void srpt_process_completion(struct ib_cq *cq, struct srpt_rdma_ch *ch)
+{
+	struct ib_wc *const wc = ch->wc;
+	int i, n;
+
+	WARN_ON(cq != ch->cq);
+
+	ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+	while ((n = ib_poll_cq(cq, ARRAY_SIZE(ch->wc), wc)) > 0) {
+		for (i = 0; i < n; i++) {
+			if (opcode_from_wr_id(wc[i].wr_id) == SRPT_RECV)
+				srpt_process_rcv_completion(cq, ch, &wc[i]);
+			else
+				srpt_process_send_completion(cq, ch, &wc[i]);
+		}
+	}
+}
+
+/**
+ * srpt_completion() - IB completion queue callback function.
+ *
+ * Notes:
+ * - It is guaranteed that a completion handler will never be invoked
+ *   concurrently on two different CPUs for the same completion queue. See also
+ *   Documentation/infiniband/core_locking.txt and the implementation of
+ *   handle_edge_irq() in kernel/irq/chip.c.
+ * - When threaded IRQs are enabled, completion handlers are invoked in thread
+ *   context instead of interrupt context.
+ */
+static void srpt_completion(struct ib_cq *cq, void *ctx)
+{
+	struct srpt_rdma_ch *ch = ctx;
+
+	wake_up_interruptible(&ch->wait_queue);
+}
+
+static int srpt_compl_thread(void *arg)
+{
+	struct srpt_rdma_ch *ch;
+
+	/* Hibernation / freezing of the SRPT kernel thread is not supported. */
+	current->flags |= PF_NOFREEZE;
+
+	ch = arg;
+	BUG_ON(!ch);
+	printk(KERN_INFO "Session %s: kernel thread %s (PID %d) started\n",
+	       ch->sess_name, ch->thread->comm, current->pid);
+	while (!kthread_should_stop()) {
+		wait_event_interruptible(ch->wait_queue,
+			(srpt_process_completion(ch->cq, ch),
+			 kthread_should_stop()));
+	}
+	printk(KERN_INFO "Session %s: kernel thread %s (PID %d) stopped\n",
+	       ch->sess_name, ch->thread->comm, current->pid);
+	return 0;
+}
+
+/**
+ * srpt_create_ch_ib() - Create receive and send completion queues.
+ */
+static int srpt_create_ch_ib(struct srpt_rdma_ch *ch)
+{
+	struct ib_qp_init_attr *qp_init;
+	struct srpt_port *sport = ch->sport;
+	struct srpt_device *sdev = sport->sdev;
+	u32 srp_sq_size = sport->port_attrib.srp_sq_size;
+	int ret;
+
+	WARN_ON(ch->rq_size < 1);
+
+	ret = -ENOMEM;
+	qp_init = kzalloc(sizeof *qp_init, GFP_KERNEL);
+	if (!qp_init)
+		goto out;
+
+retry:
+	ch->cq = ib_create_cq(sdev->device, srpt_completion, NULL, ch,
+			      ch->rq_size + srp_sq_size, 0);
+	if (IS_ERR(ch->cq)) {
+		ret = PTR_ERR(ch->cq);
+		printk(KERN_ERR "failed to create CQ cqe= %d ret= %d\n",
+		       ch->rq_size + srp_sq_size, ret);
+		goto out;
+	}
+
+	qp_init->qp_context = (void *)ch;
+	qp_init->event_handler
+		= (void(*)(struct ib_event *, void*))srpt_qp_event;
+	qp_init->send_cq = ch->cq;
+	qp_init->recv_cq = ch->cq;
+	qp_init->srq = sdev->srq;
+	qp_init->sq_sig_type = IB_SIGNAL_REQ_WR;
+	qp_init->qp_type = IB_QPT_RC;
+	qp_init->cap.max_send_wr = srp_sq_size;
+	qp_init->cap.max_send_sge = SRPT_DEF_SG_PER_WQE;
+
+	ch->qp = ib_create_qp(sdev->pd, qp_init);
+	if (IS_ERR(ch->qp)) {
+		ret = PTR_ERR(ch->qp);
+		if (ret == -ENOMEM) {
+			srp_sq_size /= 2;
+			if (srp_sq_size >= MIN_SRPT_SQ_SIZE) {
+				ib_destroy_cq(ch->cq);
+				goto retry;
+			}
+		}
+		printk(KERN_ERR "failed to create_qp ret= %d\n", ret);
+		goto err_destroy_cq;
+	}
+
+	atomic_set(&ch->sq_wr_avail, qp_init->cap.max_send_wr);
+
+	pr_debug("%s: max_cqe= %d max_sge= %d sq_size = %d cm_id= %p\n",
+		 __func__, ch->cq->cqe, qp_init->cap.max_send_sge,
+		 qp_init->cap.max_send_wr, ch->cm_id);
+
+	ret = srpt_init_ch_qp(ch, ch->qp);
+	if (ret)
+		goto err_destroy_qp;
+
+	init_waitqueue_head(&ch->wait_queue);
+
+	pr_debug("creating thread for session %s\n", ch->sess_name);
+
+	ch->thread = kthread_run(srpt_compl_thread, ch, "ib_srpt_compl");
+	if (IS_ERR(ch->thread)) {
+		printk(KERN_ERR "failed to create kernel thread %ld\n",
+		       PTR_ERR(ch->thread));
+		ch->thread = NULL;
+		goto err_destroy_qp;
+	}
+
+out:
+	kfree(qp_init);
+	return ret;
+
+err_destroy_qp:
+	ib_destroy_qp(ch->qp);
+err_destroy_cq:
+	ib_destroy_cq(ch->cq);
+	goto out;
+}
+
+static void srpt_destroy_ch_ib(struct srpt_rdma_ch *ch)
+{
+	if (ch->thread)
+		kthread_stop(ch->thread);
+
+	ib_destroy_qp(ch->qp);
+	ib_destroy_cq(ch->cq);
+}
+
+/**
+ * __srpt_close_ch() - Close an RDMA channel by setting the QP error state.
+ *
+ * Reset the QP and make sure all resources associated with the channel will
+ * be deallocated at an appropriate time.
+ *
+ * Note: The caller must hold ch->sport->sdev->spinlock.
+ */
+static void __srpt_close_ch(struct srpt_rdma_ch *ch)
+{
+	struct srpt_device *sdev;
+	enum rdma_ch_state prev_state;
+	unsigned long flags;
+
+	sdev = ch->sport->sdev;
+
+	spin_lock_irqsave(&ch->spinlock, flags);
+	prev_state = ch->state;
+	switch (prev_state) {
+	case CH_CONNECTING:
+	case CH_LIVE:
+		ch->state = CH_DISCONNECTING;
+		break;
+	default:
+		break;
+	}
+	spin_unlock_irqrestore(&ch->spinlock, flags);
+
+	switch (prev_state) {
+	case CH_CONNECTING:
+		ib_send_cm_rej(ch->cm_id, IB_CM_REJ_NO_RESOURCES, NULL, 0,
+			       NULL, 0);
+		/* fall through */
+	case CH_LIVE:
+		if (ib_send_cm_dreq(ch->cm_id, NULL, 0) < 0)
+			printk(KERN_ERR "sending CM DREQ failed.\n");
+		break;
+	case CH_DISCONNECTING:
+		break;
+	case CH_DRAINING:
+	case CH_RELEASING:
+		break;
+	}
+}
+
+/**
+ * srpt_close_ch() - Close an RDMA channel.
+ */
+static void srpt_close_ch(struct srpt_rdma_ch *ch)
+{
+	struct srpt_device *sdev;
+
+	sdev = ch->sport->sdev;
+	spin_lock_irq(&sdev->spinlock);
+	__srpt_close_ch(ch);
+	spin_unlock_irq(&sdev->spinlock);
+}
+
+/**
+ * srpt_shutdown_session() - Whether or not a session may be shut down.
+ */
+static int srpt_shutdown_session(struct se_session *se_sess)
+{
+	struct srpt_rdma_ch *ch = se_sess->fabric_sess_ptr;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ch->spinlock, flags);
+	if (ch->in_shutdown) {
+		spin_unlock_irqrestore(&ch->spinlock, flags);
+		return true;
+	}
+
+	ch->in_shutdown = true;
+	target_sess_cmd_list_set_waiting(se_sess);
+	spin_unlock_irqrestore(&ch->spinlock, flags);
+
+	return true;
+}
+
+/**
+ * srpt_drain_channel() - Drain a channel by resetting the IB queue pair.
+ * @cm_id: Pointer to the CM ID of the channel to be drained.
+ *
+ * Note: Must be called from inside srpt_cm_handler to avoid a race between
+ * accessing sdev->spinlock and the call to kfree(sdev) in srpt_remove_one()
+ * (the caller of srpt_cm_handler holds the cm_id spinlock; srpt_remove_one()
+ * waits until all target sessions for the associated IB device have been
+ * unregistered and target session registration involves a call to
+ * ib_destroy_cm_id(), which locks the cm_id spinlock and hence waits until
+ * this function has finished).
+ */
+static void srpt_drain_channel(struct ib_cm_id *cm_id)
+{
+	struct srpt_device *sdev;
+	struct srpt_rdma_ch *ch;
+	int ret;
+	bool do_reset = false;
+
+	WARN_ON_ONCE(irqs_disabled());
+
+	sdev = cm_id->context;
+	BUG_ON(!sdev);
+	spin_lock_irq(&sdev->spinlock);
+	list_for_each_entry(ch, &sdev->rch_list, list) {
+		if (ch->cm_id == cm_id) {
+			do_reset = srpt_test_and_set_ch_state(ch,
+					CH_CONNECTING, CH_DRAINING) ||
+				   srpt_test_and_set_ch_state(ch,
+					CH_LIVE, CH_DRAINING) ||
+				   srpt_test_and_set_ch_state(ch,
+					CH_DISCONNECTING, CH_DRAINING);
+			break;
+		}
+	}
+	spin_unlock_irq(&sdev->spinlock);
+
+	if (do_reset) {
+		if (ch->sess)
+			srpt_shutdown_session(ch->sess);
+
+		ret = srpt_ch_qp_err(ch);
+		if (ret < 0)
+			printk(KERN_ERR "Setting queue pair in error state"
+			       " failed: %d\n", ret);
+	}
+}
+
+/**
+ * srpt_find_channel() - Look up an RDMA channel.
+ * @cm_id: Pointer to the CM ID of the channel to be looked up.
+ *
+ * Return NULL if no matching RDMA channel has been found.
+ */
+static struct srpt_rdma_ch *srpt_find_channel(struct srpt_device *sdev,
+					      struct ib_cm_id *cm_id)
+{
+	struct srpt_rdma_ch *ch;
+	bool found;
+
+	WARN_ON_ONCE(irqs_disabled());
+	BUG_ON(!sdev);
+
+	found = false;
+	spin_lock_irq(&sdev->spinlock);
+	list_for_each_entry(ch, &sdev->rch_list, list) {
+		if (ch->cm_id == cm_id) {
+			found = true;
+			break;
+		}
+	}
+	spin_unlock_irq(&sdev->spinlock);
+
+	return found ? ch : NULL;
+}
+
+/**
+ * srpt_release_channel() - Release channel resources.
+ *
+ * Schedules the actual release because:
+ * - Calling the ib_destroy_cm_id() call from inside an IB CM callback would
+ *   trigger a deadlock.
+ * - It is not safe to call TCM transport_* functions from interrupt context.
+ */
+static void srpt_release_channel(struct srpt_rdma_ch *ch)
+{
+	schedule_work(&ch->release_work);
+}
+
+static void srpt_release_channel_work(struct work_struct *w)
+{
+	struct srpt_rdma_ch *ch;
+	struct srpt_device *sdev;
+	struct se_session *se_sess;
+
+	ch = container_of(w, struct srpt_rdma_ch, release_work);
+	pr_debug("ch = %p; ch->sess = %p; release_done = %p\n", ch, ch->sess,
+		 ch->release_done);
+
+	sdev = ch->sport->sdev;
+	BUG_ON(!sdev);
+
+	se_sess = ch->sess;
+	BUG_ON(!se_sess);
+
+	target_wait_for_sess_cmds(se_sess);
+
+	transport_deregister_session_configfs(se_sess);
+	transport_deregister_session(se_sess);
+	ch->sess = NULL;
+
+	ib_destroy_cm_id(ch->cm_id);
+
+	srpt_destroy_ch_ib(ch);
+
+	srpt_free_ioctx_ring((struct srpt_ioctx **)ch->ioctx_ring,
+			     ch->sport->sdev, ch->rq_size,
+			     ch->rsp_size, DMA_TO_DEVICE);
+
+	spin_lock_irq(&sdev->spinlock);
+	list_del(&ch->list);
+	spin_unlock_irq(&sdev->spinlock);
+
+	if (ch->release_done)
+		complete(ch->release_done);
+
+	wake_up(&sdev->ch_releaseQ);
+
+	kfree(ch);
+}
+
+static struct srpt_node_acl *__srpt_lookup_acl(struct srpt_port *sport,
+					       u8 i_port_id[16])
+{
+	struct srpt_node_acl *nacl;
+
+	list_for_each_entry(nacl, &sport->port_acl_list, list)
+		if (memcmp(nacl->i_port_id, i_port_id,
+			   sizeof(nacl->i_port_id)) == 0)
+			return nacl;
+
+	return NULL;
+}
+
+static struct srpt_node_acl *srpt_lookup_acl(struct srpt_port *sport,
+					     u8 i_port_id[16])
+{
+	struct srpt_node_acl *nacl;
+
+	spin_lock_irq(&sport->port_acl_lock);
+	nacl = __srpt_lookup_acl(sport, i_port_id);
+	spin_unlock_irq(&sport->port_acl_lock);
+
+	return nacl;
+}
+
+/**
+ * srpt_cm_req_recv() - Process the event IB_CM_REQ_RECEIVED.
+ *
+ * Ownership of the cm_id is transferred to the target session if this
+ * functions returns zero. Otherwise the caller remains the owner of cm_id.
+ */
+static int srpt_cm_req_recv(struct ib_cm_id *cm_id,
+			    struct ib_cm_req_event_param *param,
+			    void *private_data)
+{
+	struct srpt_device *sdev = cm_id->context;
+	struct srpt_port *sport = &sdev->port[param->port - 1];
+	struct srp_login_req *req;
+	struct srp_login_rsp *rsp;
+	struct srp_login_rej *rej;
+	struct ib_cm_rep_param *rep_param;
+	struct srpt_rdma_ch *ch, *tmp_ch;
+	struct srpt_node_acl *nacl;
+	u32 it_iu_len;
+	int i;
+	int ret = 0;
+
+	WARN_ON_ONCE(irqs_disabled());
+
+	if (WARN_ON(!sdev || !private_data))
+		return -EINVAL;
+
+	req = (struct srp_login_req *)private_data;
+
+	it_iu_len = be32_to_cpu(req->req_it_iu_len);
+
+	printk(KERN_INFO "Received SRP_LOGIN_REQ with i_port_id 0x%llx:0x%llx,"
+	       " t_port_id 0x%llx:0x%llx and it_iu_len %d on port %d"
+	       " (guid=0x%llx:0x%llx)\n",
+	       be64_to_cpu(*(__be64 *)&req->initiator_port_id[0]),
+	       be64_to_cpu(*(__be64 *)&req->initiator_port_id[8]),
+	       be64_to_cpu(*(__be64 *)&req->target_port_id[0]),
+	       be64_to_cpu(*(__be64 *)&req->target_port_id[8]),
+	       it_iu_len,
+	       param->port,
+	       be64_to_cpu(*(__be64 *)&sdev->port[param->port - 1].gid.raw[0]),
+	       be64_to_cpu(*(__be64 *)&sdev->port[param->port - 1].gid.raw[8]));
+
+	rsp = kzalloc(sizeof *rsp, GFP_KERNEL);
+	rej = kzalloc(sizeof *rej, GFP_KERNEL);
+	rep_param = kzalloc(sizeof *rep_param, GFP_KERNEL);
+
+	if (!rsp || !rej || !rep_param) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	if (it_iu_len > srp_max_req_size || it_iu_len < 64) {
+		rej->reason = __constant_cpu_to_be32(
+				SRP_LOGIN_REJ_REQ_IT_IU_LENGTH_TOO_LARGE);
+		ret = -EINVAL;
+		printk(KERN_ERR "rejected SRP_LOGIN_REQ because its"
+		       " length (%d bytes) is out of range (%d .. %d)\n",
+		       it_iu_len, 64, srp_max_req_size);
+		goto reject;
+	}
+
+	if (!sport->enabled) {
+		rej->reason = __constant_cpu_to_be32(
+			     SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES);
+		ret = -EINVAL;
+		printk(KERN_ERR "rejected SRP_LOGIN_REQ because the target port"
+		       " has not yet been enabled\n");
+		goto reject;
+	}
+
+	if ((req->req_flags & SRP_MTCH_ACTION) == SRP_MULTICHAN_SINGLE) {
+		rsp->rsp_flags = SRP_LOGIN_RSP_MULTICHAN_NO_CHAN;
+
+		spin_lock_irq(&sdev->spinlock);
+
+		list_for_each_entry_safe(ch, tmp_ch, &sdev->rch_list, list) {
+			if (!memcmp(ch->i_port_id, req->initiator_port_id, 16)
+			    && !memcmp(ch->t_port_id, req->target_port_id, 16)
+			    && param->port == ch->sport->port
+			    && param->listen_id == ch->sport->sdev->cm_id
+			    && ch->cm_id) {
+				enum rdma_ch_state ch_state;
+
+				ch_state = srpt_get_ch_state(ch);
+				if (ch_state != CH_CONNECTING
+				    && ch_state != CH_LIVE)
+					continue;
+
+				/* found an existing channel */
+				pr_debug("Found existing channel %s"
+					 " cm_id= %p state= %d\n",
+					 ch->sess_name, ch->cm_id, ch_state);
+
+				__srpt_close_ch(ch);
+
+				rsp->rsp_flags =
+					SRP_LOGIN_RSP_MULTICHAN_TERMINATED;
+			}
+		}
+
+		spin_unlock_irq(&sdev->spinlock);
+
+	} else
+		rsp->rsp_flags = SRP_LOGIN_RSP_MULTICHAN_MAINTAINED;
+
+	if (*(__be64 *)req->target_port_id != cpu_to_be64(srpt_service_guid)
+	    || *(__be64 *)(req->target_port_id + 8) !=
+	       cpu_to_be64(srpt_service_guid)) {
+		rej->reason = __constant_cpu_to_be32(
+				SRP_LOGIN_REJ_UNABLE_ASSOCIATE_CHANNEL);
+		ret = -ENOMEM;
+		printk(KERN_ERR "rejected SRP_LOGIN_REQ because it"
+		       " has an invalid target port identifier.\n");
+		goto reject;
+	}
+
+	ch = kzalloc(sizeof *ch, GFP_KERNEL);
+	if (!ch) {
+		rej->reason = __constant_cpu_to_be32(
+					SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES);
+		printk(KERN_ERR "rejected SRP_LOGIN_REQ because no memory.\n");
+		ret = -ENOMEM;
+		goto reject;
+	}
+
+	INIT_WORK(&ch->release_work, srpt_release_channel_work);
+	memcpy(ch->i_port_id, req->initiator_port_id, 16);
+	memcpy(ch->t_port_id, req->target_port_id, 16);
+	ch->sport = &sdev->port[param->port - 1];
+	ch->cm_id = cm_id;
+	/*
+	 * Avoid QUEUE_FULL conditions by limiting the number of buffers used
+	 * for the SRP protocol to the command queue size.
+	 */
+	ch->rq_size = SRPT_RQ_SIZE;
+	spin_lock_init(&ch->spinlock);
+	ch->state = CH_CONNECTING;
+	INIT_LIST_HEAD(&ch->cmd_wait_list);
+	ch->rsp_size = ch->sport->port_attrib.srp_max_rsp_size;
+
+	ch->ioctx_ring = (struct srpt_send_ioctx **)
+		srpt_alloc_ioctx_ring(ch->sport->sdev, ch->rq_size,
+				      sizeof(*ch->ioctx_ring[0]),
+				      ch->rsp_size, DMA_TO_DEVICE);
+	if (!ch->ioctx_ring)
+		goto free_ch;
+
+	INIT_LIST_HEAD(&ch->free_list);
+	for (i = 0; i < ch->rq_size; i++) {
+		ch->ioctx_ring[i]->ch = ch;
+		list_add_tail(&ch->ioctx_ring[i]->free_list, &ch->free_list);
+	}
+
+	ret = srpt_create_ch_ib(ch);
+	if (ret) {
+		rej->reason = __constant_cpu_to_be32(
+				SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES);
+		printk(KERN_ERR "rejected SRP_LOGIN_REQ because creating"
+		       " a new RDMA channel failed.\n");
+		goto free_ring;
+	}
+
+	ret = srpt_ch_qp_rtr(ch, ch->qp);
+	if (ret) {
+		rej->reason = __constant_cpu_to_be32(
+				SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES);
+		printk(KERN_ERR "rejected SRP_LOGIN_REQ because enabling"
+		       " RTR failed (error code = %d)\n", ret);
+		goto destroy_ib;
+	}
+	/*
+	 * Use the initator port identifier as the session name.
+	 */
+	snprintf(ch->sess_name, sizeof(ch->sess_name), "0x%016llx%016llx",
+			be64_to_cpu(*(__be64 *)ch->i_port_id),
+			be64_to_cpu(*(__be64 *)(ch->i_port_id + 8)));
+
+	pr_debug("registering session %s\n", ch->sess_name);
+
+	nacl = srpt_lookup_acl(sport, ch->i_port_id);
+	if (!nacl) {
+		printk(KERN_INFO "Rejected login because no ACL has been"
+		       " configured yet for initiator %s.\n", ch->sess_name);
+		rej->reason = __constant_cpu_to_be32(
+				SRP_LOGIN_REJ_CHANNEL_LIMIT_REACHED);
+		goto destroy_ib;
+	}
+
+	ch->sess = transport_init_session(TARGET_PROT_NORMAL);
+	if (IS_ERR(ch->sess)) {
+		rej->reason = __constant_cpu_to_be32(
+				SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES);
+		pr_debug("Failed to create session\n");
+		goto deregister_session;
+	}
+	ch->sess->se_node_acl = &nacl->nacl;
+	transport_register_session(&sport->port_tpg_1, &nacl->nacl, ch->sess, ch);
+
+	pr_debug("Establish connection sess=%p name=%s cm_id=%p\n", ch->sess,
+		 ch->sess_name, ch->cm_id);
+
+	/* create srp_login_response */
+	rsp->opcode = SRP_LOGIN_RSP;
+	rsp->tag = req->tag;
+	rsp->max_it_iu_len = req->req_it_iu_len;
+	rsp->max_ti_iu_len = req->req_it_iu_len;
+	ch->max_ti_iu_len = it_iu_len;
+	rsp->buf_fmt = __constant_cpu_to_be16(SRP_BUF_FORMAT_DIRECT
+					      | SRP_BUF_FORMAT_INDIRECT);
+	rsp->req_lim_delta = cpu_to_be32(ch->rq_size);
+	atomic_set(&ch->req_lim, ch->rq_size);
+	atomic_set(&ch->req_lim_delta, 0);
+
+	/* create cm reply */
+	rep_param->qp_num = ch->qp->qp_num;
+	rep_param->private_data = (void *)rsp;
+	rep_param->private_data_len = sizeof *rsp;
+	rep_param->rnr_retry_count = 7;
+	rep_param->flow_control = 1;
+	rep_param->failover_accepted = 0;
+	rep_param->srq = 1;
+	rep_param->responder_resources = 4;
+	rep_param->initiator_depth = 4;
+
+	ret = ib_send_cm_rep(cm_id, rep_param);
+	if (ret) {
+		printk(KERN_ERR "sending SRP_LOGIN_REQ response failed"
+		       " (error code = %d)\n", ret);
+		goto release_channel;
+	}
+
+	spin_lock_irq(&sdev->spinlock);
+	list_add_tail(&ch->list, &sdev->rch_list);
+	spin_unlock_irq(&sdev->spinlock);
+
+	goto out;
+
+release_channel:
+	srpt_set_ch_state(ch, CH_RELEASING);
+	transport_deregister_session_configfs(ch->sess);
+
+deregister_session:
+	transport_deregister_session(ch->sess);
+	ch->sess = NULL;
+
+destroy_ib:
+	srpt_destroy_ch_ib(ch);
+
+free_ring:
+	srpt_free_ioctx_ring((struct srpt_ioctx **)ch->ioctx_ring,
+			     ch->sport->sdev, ch->rq_size,
+			     ch->rsp_size, DMA_TO_DEVICE);
+free_ch:
+	kfree(ch);
+
+reject:
+	rej->opcode = SRP_LOGIN_REJ;
+	rej->tag = req->tag;
+	rej->buf_fmt = __constant_cpu_to_be16(SRP_BUF_FORMAT_DIRECT
+					      | SRP_BUF_FORMAT_INDIRECT);
+
+	ib_send_cm_rej(cm_id, IB_CM_REJ_CONSUMER_DEFINED, NULL, 0,
+			     (void *)rej, sizeof *rej);
+
+out:
+	kfree(rep_param);
+	kfree(rsp);
+	kfree(rej);
+
+	return ret;
+}
+
+static void srpt_cm_rej_recv(struct ib_cm_id *cm_id)
+{
+	printk(KERN_INFO "Received IB REJ for cm_id %p.\n", cm_id);
+	srpt_drain_channel(cm_id);
+}
+
+/**
+ * srpt_cm_rtu_recv() - Process an IB_CM_RTU_RECEIVED or USER_ESTABLISHED event.
+ *
+ * An IB_CM_RTU_RECEIVED message indicates that the connection is established
+ * and that the recipient may begin transmitting (RTU = ready to use).
+ */
+static void srpt_cm_rtu_recv(struct ib_cm_id *cm_id)
+{
+	struct srpt_rdma_ch *ch;
+	int ret;
+
+	ch = srpt_find_channel(cm_id->context, cm_id);
+	BUG_ON(!ch);
+
+	if (srpt_test_and_set_ch_state(ch, CH_CONNECTING, CH_LIVE)) {
+		struct srpt_recv_ioctx *ioctx, *ioctx_tmp;
+
+		ret = srpt_ch_qp_rts(ch, ch->qp);
+
+		list_for_each_entry_safe(ioctx, ioctx_tmp, &ch->cmd_wait_list,
+					 wait_list) {
+			list_del(&ioctx->wait_list);
+			srpt_handle_new_iu(ch, ioctx, NULL);
+		}
+		if (ret)
+			srpt_close_ch(ch);
+	}
+}
+
+static void srpt_cm_timewait_exit(struct ib_cm_id *cm_id)
+{
+	printk(KERN_INFO "Received IB TimeWait exit for cm_id %p.\n", cm_id);
+	srpt_drain_channel(cm_id);
+}
+
+static void srpt_cm_rep_error(struct ib_cm_id *cm_id)
+{
+	printk(KERN_INFO "Received IB REP error for cm_id %p.\n", cm_id);
+	srpt_drain_channel(cm_id);
+}
+
+/**
+ * srpt_cm_dreq_recv() - Process reception of a DREQ message.
+ */
+static void srpt_cm_dreq_recv(struct ib_cm_id *cm_id)
+{
+	struct srpt_rdma_ch *ch;
+	unsigned long flags;
+	bool send_drep = false;
+
+	ch = srpt_find_channel(cm_id->context, cm_id);
+	BUG_ON(!ch);
+
+	pr_debug("cm_id= %p ch->state= %d\n", cm_id, srpt_get_ch_state(ch));
+
+	spin_lock_irqsave(&ch->spinlock, flags);
+	switch (ch->state) {
+	case CH_CONNECTING:
+	case CH_LIVE:
+		send_drep = true;
+		ch->state = CH_DISCONNECTING;
+		break;
+	case CH_DISCONNECTING:
+	case CH_DRAINING:
+	case CH_RELEASING:
+		WARN(true, "unexpected channel state %d\n", ch->state);
+		break;
+	}
+	spin_unlock_irqrestore(&ch->spinlock, flags);
+
+	if (send_drep) {
+		if (ib_send_cm_drep(ch->cm_id, NULL, 0) < 0)
+			printk(KERN_ERR "Sending IB DREP failed.\n");
+		printk(KERN_INFO "Received DREQ and sent DREP for session %s.\n",
+		       ch->sess_name);
+	}
+}
+
+/**
+ * srpt_cm_drep_recv() - Process reception of a DREP message.
+ */
+static void srpt_cm_drep_recv(struct ib_cm_id *cm_id)
+{
+	printk(KERN_INFO "Received InfiniBand DREP message for cm_id %p.\n",
+	       cm_id);
+	srpt_drain_channel(cm_id);
+}
+
+/**
+ * srpt_cm_handler() - IB connection manager callback function.
+ *
+ * A non-zero return value will cause the caller destroy the CM ID.
+ *
+ * Note: srpt_cm_handler() must only return a non-zero value when transferring
+ * ownership of the cm_id to a channel by srpt_cm_req_recv() failed. Returning
+ * a non-zero value in any other case will trigger a race with the
+ * ib_destroy_cm_id() call in srpt_release_channel().
+ */
+static int srpt_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
+{
+	int ret;
+
+	ret = 0;
+	switch (event->event) {
+	case IB_CM_REQ_RECEIVED:
+		ret = srpt_cm_req_recv(cm_id, &event->param.req_rcvd,
+				       event->private_data);
+		break;
+	case IB_CM_REJ_RECEIVED:
+		srpt_cm_rej_recv(cm_id);
+		break;
+	case IB_CM_RTU_RECEIVED:
+	case IB_CM_USER_ESTABLISHED:
+		srpt_cm_rtu_recv(cm_id);
+		break;
+	case IB_CM_DREQ_RECEIVED:
+		srpt_cm_dreq_recv(cm_id);
+		break;
+	case IB_CM_DREP_RECEIVED:
+		srpt_cm_drep_recv(cm_id);
+		break;
+	case IB_CM_TIMEWAIT_EXIT:
+		srpt_cm_timewait_exit(cm_id);
+		break;
+	case IB_CM_REP_ERROR:
+		srpt_cm_rep_error(cm_id);
+		break;
+	case IB_CM_DREQ_ERROR:
+		printk(KERN_INFO "Received IB DREQ ERROR event.\n");
+		break;
+	case IB_CM_MRA_RECEIVED:
+		printk(KERN_INFO "Received IB MRA event\n");
+		break;
+	default:
+		printk(KERN_ERR "received unrecognized IB CM event %d\n",
+		       event->event);
+		break;
+	}
+
+	return ret;
+}
+
+/**
+ * srpt_perform_rdmas() - Perform IB RDMA.
+ *
+ * Returns zero upon success or a negative number upon failure.
+ */
+static int srpt_perform_rdmas(struct srpt_rdma_ch *ch,
+			      struct srpt_send_ioctx *ioctx)
+{
+	struct ib_send_wr wr;
+	struct ib_send_wr *bad_wr;
+	struct rdma_iu *riu;
+	int i;
+	int ret;
+	int sq_wr_avail;
+	enum dma_data_direction dir;
+	const int n_rdma = ioctx->n_rdma;
+
+	dir = ioctx->cmd.data_direction;
+	if (dir == DMA_TO_DEVICE) {
+		/* write */
+		ret = -ENOMEM;
+		sq_wr_avail = atomic_sub_return(n_rdma, &ch->sq_wr_avail);
+		if (sq_wr_avail < 0) {
+			printk(KERN_WARNING "IB send queue full (needed %d)\n",
+			       n_rdma);
+			goto out;
+		}
+	}
+
+	ioctx->rdma_aborted = false;
+	ret = 0;
+	riu = ioctx->rdma_ius;
+	memset(&wr, 0, sizeof wr);
+
+	for (i = 0; i < n_rdma; ++i, ++riu) {
+		if (dir == DMA_FROM_DEVICE) {
+			wr.opcode = IB_WR_RDMA_WRITE;
+			wr.wr_id = encode_wr_id(i == n_rdma - 1 ?
+						SRPT_RDMA_WRITE_LAST :
+						SRPT_RDMA_MID,
+						ioctx->ioctx.index);
+		} else {
+			wr.opcode = IB_WR_RDMA_READ;
+			wr.wr_id = encode_wr_id(i == n_rdma - 1 ?
+						SRPT_RDMA_READ_LAST :
+						SRPT_RDMA_MID,
+						ioctx->ioctx.index);
+		}
+		wr.next = NULL;
+		wr.wr.rdma.remote_addr = riu->raddr;
+		wr.wr.rdma.rkey = riu->rkey;
+		wr.num_sge = riu->sge_cnt;
+		wr.sg_list = riu->sge;
+
+		/* only get completion event for the last rdma write */
+		if (i == (n_rdma - 1) && dir == DMA_TO_DEVICE)
+			wr.send_flags = IB_SEND_SIGNALED;
+
+		ret = ib_post_send(ch->qp, &wr, &bad_wr);
+		if (ret)
+			break;
+	}
+
+	if (ret)
+		printk(KERN_ERR "%s[%d]: ib_post_send() returned %d for %d/%d",
+				 __func__, __LINE__, ret, i, n_rdma);
+	if (ret && i > 0) {
+		wr.num_sge = 0;
+		wr.wr_id = encode_wr_id(SRPT_RDMA_ABORT, ioctx->ioctx.index);
+		wr.send_flags = IB_SEND_SIGNALED;
+		while (ch->state == CH_LIVE &&
+			ib_post_send(ch->qp, &wr, &bad_wr) != 0) {
+			printk(KERN_INFO "Trying to abort failed RDMA transfer [%d]",
+				ioctx->ioctx.index);
+			msleep(1000);
+		}
+		while (ch->state != CH_RELEASING && !ioctx->rdma_aborted) {
+			printk(KERN_INFO "Waiting until RDMA abort finished [%d]",
+				ioctx->ioctx.index);
+			msleep(1000);
+		}
+	}
+out:
+	if (unlikely(dir == DMA_TO_DEVICE && ret < 0))
+		atomic_add(n_rdma, &ch->sq_wr_avail);
+	return ret;
+}
+
+/**
+ * srpt_xfer_data() - Start data transfer from initiator to target.
+ */
+static int srpt_xfer_data(struct srpt_rdma_ch *ch,
+			  struct srpt_send_ioctx *ioctx)
+{
+	int ret;
+
+	ret = srpt_map_sg_to_ib_sge(ch, ioctx);
+	if (ret) {
+		printk(KERN_ERR "%s[%d] ret=%d\n", __func__, __LINE__, ret);
+		goto out;
+	}
+
+	ret = srpt_perform_rdmas(ch, ioctx);
+	if (ret) {
+		if (ret == -EAGAIN || ret == -ENOMEM)
+			printk(KERN_INFO "%s[%d] queue full -- ret=%d\n",
+				   __func__, __LINE__, ret);
+		else
+			printk(KERN_ERR "%s[%d] fatal error -- ret=%d\n",
+			       __func__, __LINE__, ret);
+		goto out_unmap;
+	}
+
+out:
+	return ret;
+out_unmap:
+	srpt_unmap_sg_to_ib_sge(ch, ioctx);
+	goto out;
+}
+
+static int srpt_write_pending_status(struct se_cmd *se_cmd)
+{
+	struct srpt_send_ioctx *ioctx;
+
+	ioctx = container_of(se_cmd, struct srpt_send_ioctx, cmd);
+	return srpt_get_cmd_state(ioctx) == SRPT_STATE_NEED_DATA;
+}
+
+/*
+ * srpt_write_pending() - Start data transfer from initiator to target (write).
+ */
+static int srpt_write_pending(struct se_cmd *se_cmd)
+{
+	struct srpt_rdma_ch *ch;
+	struct srpt_send_ioctx *ioctx;
+	enum srpt_command_state new_state;
+	enum rdma_ch_state ch_state;
+	int ret;
+
+	ioctx = container_of(se_cmd, struct srpt_send_ioctx, cmd);
+
+	new_state = srpt_set_cmd_state(ioctx, SRPT_STATE_NEED_DATA);
+	WARN_ON(new_state == SRPT_STATE_DONE);
+
+	ch = ioctx->ch;
+	BUG_ON(!ch);
+
+	ch_state = srpt_get_ch_state(ch);
+	switch (ch_state) {
+	case CH_CONNECTING:
+		WARN(true, "unexpected channel state %d\n", ch_state);
+		ret = -EINVAL;
+		goto out;
+	case CH_LIVE:
+		break;
+	case CH_DISCONNECTING:
+	case CH_DRAINING:
+	case CH_RELEASING:
+		pr_debug("cmd with tag %lld: channel disconnecting\n",
+			 ioctx->tag);
+		srpt_set_cmd_state(ioctx, SRPT_STATE_DATA_IN);
+		ret = -EINVAL;
+		goto out;
+	}
+	ret = srpt_xfer_data(ch, ioctx);
+
+out:
+	return ret;
+}
+
+static u8 tcm_to_srp_tsk_mgmt_status(const int tcm_mgmt_status)
+{
+	switch (tcm_mgmt_status) {
+	case TMR_FUNCTION_COMPLETE:
+		return SRP_TSK_MGMT_SUCCESS;
+	case TMR_FUNCTION_REJECTED:
+		return SRP_TSK_MGMT_FUNC_NOT_SUPP;
+	}
+	return SRP_TSK_MGMT_FAILED;
+}
+
+/**
+ * srpt_queue_response() - Transmits the response to a SCSI command.
+ *
+ * Callback function called by the TCM core. Must not block since it can be
+ * invoked on the context of the IB completion handler.
+ */
+static void srpt_queue_response(struct se_cmd *cmd)
+{
+	struct srpt_rdma_ch *ch;
+	struct srpt_send_ioctx *ioctx;
+	enum srpt_command_state state;
+	unsigned long flags;
+	int ret;
+	enum dma_data_direction dir;
+	int resp_len;
+	u8 srp_tm_status;
+
+	ioctx = container_of(cmd, struct srpt_send_ioctx, cmd);
+	ch = ioctx->ch;
+	BUG_ON(!ch);
+
+	spin_lock_irqsave(&ioctx->spinlock, flags);
+	state = ioctx->state;
+	switch (state) {
+	case SRPT_STATE_NEW:
+	case SRPT_STATE_DATA_IN:
+		ioctx->state = SRPT_STATE_CMD_RSP_SENT;
+		break;
+	case SRPT_STATE_MGMT:
+		ioctx->state = SRPT_STATE_MGMT_RSP_SENT;
+		break;
+	default:
+		WARN(true, "ch %p; cmd %d: unexpected command state %d\n",
+			ch, ioctx->ioctx.index, ioctx->state);
+		break;
+	}
+	spin_unlock_irqrestore(&ioctx->spinlock, flags);
+
+	if (unlikely(transport_check_aborted_status(&ioctx->cmd, false)
+		     || WARN_ON_ONCE(state == SRPT_STATE_CMD_RSP_SENT))) {
+		atomic_inc(&ch->req_lim_delta);
+		srpt_abort_cmd(ioctx);
+		return;
+	}
+
+	dir = ioctx->cmd.data_direction;
+
+	/* For read commands, transfer the data to the initiator. */
+	if (dir == DMA_FROM_DEVICE && ioctx->cmd.data_length &&
+	    !ioctx->queue_status_only) {
+		ret = srpt_xfer_data(ch, ioctx);
+		if (ret) {
+			printk(KERN_ERR "xfer_data failed for tag %llu\n",
+			       ioctx->tag);
+			return;
+		}
+	}
+
+	if (state != SRPT_STATE_MGMT)
+		resp_len = srpt_build_cmd_rsp(ch, ioctx, ioctx->tag,
+					      cmd->scsi_status);
+	else {
+		srp_tm_status
+			= tcm_to_srp_tsk_mgmt_status(cmd->se_tmr_req->response);
+		resp_len = srpt_build_tskmgmt_rsp(ch, ioctx, srp_tm_status,
+						 ioctx->tag);
+	}
+	ret = srpt_post_send(ch, ioctx, resp_len);
+	if (ret) {
+		printk(KERN_ERR "sending cmd response failed for tag %llu\n",
+		       ioctx->tag);
+		srpt_unmap_sg_to_ib_sge(ch, ioctx);
+		srpt_set_cmd_state(ioctx, SRPT_STATE_DONE);
+		target_put_sess_cmd(ioctx->ch->sess, &ioctx->cmd);
+	}
+}
+
+static int srpt_queue_data_in(struct se_cmd *cmd)
+{
+	srpt_queue_response(cmd);
+	return 0;
+}
+
+static void srpt_queue_tm_rsp(struct se_cmd *cmd)
+{
+	srpt_queue_response(cmd);
+}
+
+static void srpt_aborted_task(struct se_cmd *cmd)
+{
+	struct srpt_send_ioctx *ioctx = container_of(cmd,
+				struct srpt_send_ioctx, cmd);
+
+	srpt_unmap_sg_to_ib_sge(ioctx->ch, ioctx);
+}
+
+static int srpt_queue_status(struct se_cmd *cmd)
+{
+	struct srpt_send_ioctx *ioctx;
+
+	ioctx = container_of(cmd, struct srpt_send_ioctx, cmd);
+	BUG_ON(ioctx->sense_data != cmd->sense_buffer);
+	if (cmd->se_cmd_flags &
+	    (SCF_TRANSPORT_TASK_SENSE | SCF_EMULATED_TASK_SENSE))
+		WARN_ON(cmd->scsi_status != SAM_STAT_CHECK_CONDITION);
+	ioctx->queue_status_only = true;
+	srpt_queue_response(cmd);
+	return 0;
+}
+
+static void srpt_refresh_port_work(struct work_struct *work)
+{
+	struct srpt_port *sport = container_of(work, struct srpt_port, work);
+
+	srpt_refresh_port(sport);
+}
+
+static int srpt_ch_list_empty(struct srpt_device *sdev)
+{
+	int res;
+
+	spin_lock_irq(&sdev->spinlock);
+	res = list_empty(&sdev->rch_list);
+	spin_unlock_irq(&sdev->spinlock);
+
+	return res;
+}
+
+/**
+ * srpt_release_sdev() - Free the channel resources associated with a target.
+ */
+static int srpt_release_sdev(struct srpt_device *sdev)
+{
+	struct srpt_rdma_ch *ch, *tmp_ch;
+	int res;
+
+	WARN_ON_ONCE(irqs_disabled());
+
+	BUG_ON(!sdev);
+
+	spin_lock_irq(&sdev->spinlock);
+	list_for_each_entry_safe(ch, tmp_ch, &sdev->rch_list, list)
+		__srpt_close_ch(ch);
+	spin_unlock_irq(&sdev->spinlock);
+
+	res = wait_event_interruptible(sdev->ch_releaseQ,
+				       srpt_ch_list_empty(sdev));
+	if (res)
+		printk(KERN_ERR "%s: interrupted.\n", __func__);
+
+	return 0;
+}
+
+static struct srpt_port *__srpt_lookup_port(const char *name)
+{
+	struct ib_device *dev;
+	struct srpt_device *sdev;
+	struct srpt_port *sport;
+	int i;
+
+	list_for_each_entry(sdev, &srpt_dev_list, list) {
+		dev = sdev->device;
+		if (!dev)
+			continue;
+
+		for (i = 0; i < dev->phys_port_cnt; i++) {
+			sport = &sdev->port[i];
+
+			if (!strcmp(sport->port_guid, name))
+				return sport;
+		}
+	}
+
+	return NULL;
+}
+
+static struct srpt_port *srpt_lookup_port(const char *name)
+{
+	struct srpt_port *sport;
+
+	spin_lock(&srpt_dev_lock);
+	sport = __srpt_lookup_port(name);
+	spin_unlock(&srpt_dev_lock);
+
+	return sport;
+}
+
+/**
+ * srpt_add_one() - Infiniband device addition callback function.
+ */
+static void srpt_add_one(struct ib_device *device)
+{
+	struct srpt_device *sdev;
+	struct srpt_port *sport;
+	struct ib_srq_init_attr srq_attr;
+	int i;
+
+	pr_debug("device = %p, device->dma_ops = %p\n", device,
+		 device->dma_ops);
+
+	sdev = kzalloc(sizeof *sdev, GFP_KERNEL);
+	if (!sdev)
+		goto err;
+
+	sdev->device = device;
+	INIT_LIST_HEAD(&sdev->rch_list);
+	init_waitqueue_head(&sdev->ch_releaseQ);
+	spin_lock_init(&sdev->spinlock);
+
+	if (ib_query_device(device, &sdev->dev_attr))
+		goto free_dev;
+
+	sdev->pd = ib_alloc_pd(device);
+	if (IS_ERR(sdev->pd))
+		goto free_dev;
+
+	sdev->mr = ib_get_dma_mr(sdev->pd, IB_ACCESS_LOCAL_WRITE);
+	if (IS_ERR(sdev->mr))
+		goto err_pd;
+
+	sdev->srq_size = min(srpt_srq_size, sdev->dev_attr.max_srq_wr);
+
+	srq_attr.event_handler = srpt_srq_event;
+	srq_attr.srq_context = (void *)sdev;
+	srq_attr.attr.max_wr = sdev->srq_size;
+	srq_attr.attr.max_sge = 1;
+	srq_attr.attr.srq_limit = 0;
+	srq_attr.srq_type = IB_SRQT_BASIC;
+
+	sdev->srq = ib_create_srq(sdev->pd, &srq_attr);
+	if (IS_ERR(sdev->srq))
+		goto err_mr;
+
+	pr_debug("%s: create SRQ #wr= %d max_allow=%d dev= %s\n",
+		 __func__, sdev->srq_size, sdev->dev_attr.max_srq_wr,
+		 device->name);
+
+	if (!srpt_service_guid)
+		srpt_service_guid = be64_to_cpu(device->node_guid);
+
+	sdev->cm_id = ib_create_cm_id(device, srpt_cm_handler, sdev);
+	if (IS_ERR(sdev->cm_id))
+		goto err_srq;
+
+	/* print out target login information */
+	pr_debug("Target login info: id_ext=%016llx,ioc_guid=%016llx,"
+		 "pkey=ffff,service_id=%016llx\n", srpt_service_guid,
+		 srpt_service_guid, srpt_service_guid);
+
+	/*
+	 * We do not have a consistent service_id (ie. also id_ext of target_id)
+	 * to identify this target. We currently use the guid of the first HCA
+	 * in the system as service_id; therefore, the target_id will change
+	 * if this HCA is gone bad and replaced by different HCA
+	 */
+	if (ib_cm_listen(sdev->cm_id, cpu_to_be64(srpt_service_guid), 0, NULL))
+		goto err_cm;
+
+	INIT_IB_EVENT_HANDLER(&sdev->event_handler, sdev->device,
+			      srpt_event_handler);
+	if (ib_register_event_handler(&sdev->event_handler))
+		goto err_cm;
+
+	sdev->ioctx_ring = (struct srpt_recv_ioctx **)
+		srpt_alloc_ioctx_ring(sdev, sdev->srq_size,
+				      sizeof(*sdev->ioctx_ring[0]),
+				      srp_max_req_size, DMA_FROM_DEVICE);
+	if (!sdev->ioctx_ring)
+		goto err_event;
+
+	for (i = 0; i < sdev->srq_size; ++i)
+		srpt_post_recv(sdev, sdev->ioctx_ring[i]);
+
+	WARN_ON(sdev->device->phys_port_cnt > ARRAY_SIZE(sdev->port));
+
+	for (i = 1; i <= sdev->device->phys_port_cnt; i++) {
+		sport = &sdev->port[i - 1];
+		sport->sdev = sdev;
+		sport->port = i;
+		sport->port_attrib.srp_max_rdma_size = DEFAULT_MAX_RDMA_SIZE;
+		sport->port_attrib.srp_max_rsp_size = DEFAULT_MAX_RSP_SIZE;
+		sport->port_attrib.srp_sq_size = DEF_SRPT_SQ_SIZE;
+		INIT_WORK(&sport->work, srpt_refresh_port_work);
+		INIT_LIST_HEAD(&sport->port_acl_list);
+		spin_lock_init(&sport->port_acl_lock);
+
+		if (srpt_refresh_port(sport)) {
+			printk(KERN_ERR "MAD registration failed for %s-%d.\n",
+			       srpt_sdev_name(sdev), i);
+			goto err_ring;
+		}
+		snprintf(sport->port_guid, sizeof(sport->port_guid),
+			"0x%016llx%016llx",
+			be64_to_cpu(sport->gid.global.subnet_prefix),
+			be64_to_cpu(sport->gid.global.interface_id));
+	}
+
+	spin_lock(&srpt_dev_lock);
+	list_add_tail(&sdev->list, &srpt_dev_list);
+	spin_unlock(&srpt_dev_lock);
+
+out:
+	ib_set_client_data(device, &srpt_client, sdev);
+	pr_debug("added %s.\n", device->name);
+	return;
+
+err_ring:
+	srpt_free_ioctx_ring((struct srpt_ioctx **)sdev->ioctx_ring, sdev,
+			     sdev->srq_size, srp_max_req_size,
+			     DMA_FROM_DEVICE);
+err_event:
+	ib_unregister_event_handler(&sdev->event_handler);
+err_cm:
+	ib_destroy_cm_id(sdev->cm_id);
+err_srq:
+	ib_destroy_srq(sdev->srq);
+err_mr:
+	ib_dereg_mr(sdev->mr);
+err_pd:
+	ib_dealloc_pd(sdev->pd);
+free_dev:
+	kfree(sdev);
+err:
+	sdev = NULL;
+	printk(KERN_INFO "%s(%s) failed.\n", __func__, device->name);
+	goto out;
+}
+
+/**
+ * srpt_remove_one() - InfiniBand device removal callback function.
+ */
+static void srpt_remove_one(struct ib_device *device)
+{
+	struct srpt_device *sdev;
+	int i;
+
+	sdev = ib_get_client_data(device, &srpt_client);
+	if (!sdev) {
+		printk(KERN_INFO "%s(%s): nothing to do.\n", __func__,
+		       device->name);
+		return;
+	}
+
+	srpt_unregister_mad_agent(sdev);
+
+	ib_unregister_event_handler(&sdev->event_handler);
+
+	/* Cancel any work queued by the just unregistered IB event handler. */
+	for (i = 0; i < sdev->device->phys_port_cnt; i++)
+		cancel_work_sync(&sdev->port[i].work);
+
+	ib_destroy_cm_id(sdev->cm_id);
+
+	/*
+	 * Unregistering a target must happen after destroying sdev->cm_id
+	 * such that no new SRP_LOGIN_REQ information units can arrive while
+	 * destroying the target.
+	 */
+	spin_lock(&srpt_dev_lock);
+	list_del(&sdev->list);
+	spin_unlock(&srpt_dev_lock);
+	srpt_release_sdev(sdev);
+
+	ib_destroy_srq(sdev->srq);
+	ib_dereg_mr(sdev->mr);
+	ib_dealloc_pd(sdev->pd);
+
+	srpt_free_ioctx_ring((struct srpt_ioctx **)sdev->ioctx_ring, sdev,
+			     sdev->srq_size, srp_max_req_size, DMA_FROM_DEVICE);
+	sdev->ioctx_ring = NULL;
+	kfree(sdev);
+}
+
+static struct ib_client srpt_client = {
+	.name = DRV_NAME,
+	.add = srpt_add_one,
+	.remove = srpt_remove_one
+};
+
+static int srpt_check_true(struct se_portal_group *se_tpg)
+{
+	return 1;
+}
+
+static int srpt_check_false(struct se_portal_group *se_tpg)
+{
+	return 0;
+}
+
+static char *srpt_get_fabric_name(void)
+{
+	return "srpt";
+}
+
+static u8 srpt_get_fabric_proto_ident(struct se_portal_group *se_tpg)
+{
+	return SCSI_TRANSPORTID_PROTOCOLID_SRP;
+}
+
+static char *srpt_get_fabric_wwn(struct se_portal_group *tpg)
+{
+	struct srpt_port *sport = container_of(tpg, struct srpt_port, port_tpg_1);
+
+	return sport->port_guid;
+}
+
+static u16 srpt_get_tag(struct se_portal_group *tpg)
+{
+	return 1;
+}
+
+static u32 srpt_get_default_depth(struct se_portal_group *se_tpg)
+{
+	return 1;
+}
+
+static u32 srpt_get_pr_transport_id(struct se_portal_group *se_tpg,
+				    struct se_node_acl *se_nacl,
+				    struct t10_pr_registration *pr_reg,
+				    int *format_code, unsigned char *buf)
+{
+	struct srpt_node_acl *nacl;
+	struct spc_rdma_transport_id *tr_id;
+
+	nacl = container_of(se_nacl, struct srpt_node_acl, nacl);
+	tr_id = (void *)buf;
+	tr_id->protocol_identifier = SCSI_TRANSPORTID_PROTOCOLID_SRP;
+	memcpy(tr_id->i_port_id, nacl->i_port_id, sizeof(tr_id->i_port_id));
+	return sizeof(*tr_id);
+}
+
+static u32 srpt_get_pr_transport_id_len(struct se_portal_group *se_tpg,
+					struct se_node_acl *se_nacl,
+					struct t10_pr_registration *pr_reg,
+					int *format_code)
+{
+	*format_code = 0;
+	return sizeof(struct spc_rdma_transport_id);
+}
+
+static char *srpt_parse_pr_out_transport_id(struct se_portal_group *se_tpg,
+					    const char *buf, u32 *out_tid_len,
+					    char **port_nexus_ptr)
+{
+	struct spc_rdma_transport_id *tr_id;
+
+	*port_nexus_ptr = NULL;
+	*out_tid_len = sizeof(struct spc_rdma_transport_id);
+	tr_id = (void *)buf;
+	return (char *)tr_id->i_port_id;
+}
+
+static struct se_node_acl *srpt_alloc_fabric_acl(struct se_portal_group *se_tpg)
+{
+	struct srpt_node_acl *nacl;
+
+	nacl = kzalloc(sizeof(struct srpt_node_acl), GFP_KERNEL);
+	if (!nacl) {
+		printk(KERN_ERR "Unable to allocate struct srpt_node_acl\n");
+		return NULL;
+	}
+
+	return &nacl->nacl;
+}
+
+static void srpt_release_fabric_acl(struct se_portal_group *se_tpg,
+				    struct se_node_acl *se_nacl)
+{
+	struct srpt_node_acl *nacl;
+
+	nacl = container_of(se_nacl, struct srpt_node_acl, nacl);
+	kfree(nacl);
+}
+
+static u32 srpt_tpg_get_inst_index(struct se_portal_group *se_tpg)
+{
+	return 1;
+}
+
+static void srpt_release_cmd(struct se_cmd *se_cmd)
+{
+	struct srpt_send_ioctx *ioctx = container_of(se_cmd,
+				struct srpt_send_ioctx, cmd);
+	struct srpt_rdma_ch *ch = ioctx->ch;
+	unsigned long flags;
+
+	WARN_ON(ioctx->state != SRPT_STATE_DONE);
+	WARN_ON(ioctx->mapped_sg_count != 0);
+
+	if (ioctx->n_rbuf > 1) {
+		kfree(ioctx->rbufs);
+		ioctx->rbufs = NULL;
+		ioctx->n_rbuf = 0;
+	}
+
+	spin_lock_irqsave(&ch->spinlock, flags);
+	list_add(&ioctx->free_list, &ch->free_list);
+	spin_unlock_irqrestore(&ch->spinlock, flags);
+}
+
+/**
+ * srpt_close_session() - Forcibly close a session.
+ *
+ * Callback function invoked by the TCM core to clean up sessions associated
+ * with a node ACL when the user invokes
+ * rmdir /sys/kernel/config/target/$driver/$port/$tpg/acls/$i_port_id
+ */
+static void srpt_close_session(struct se_session *se_sess)
+{
+	DECLARE_COMPLETION_ONSTACK(release_done);
+	struct srpt_rdma_ch *ch;
+	struct srpt_device *sdev;
+	int res;
+
+	ch = se_sess->fabric_sess_ptr;
+	WARN_ON(ch->sess != se_sess);
+
+	pr_debug("ch %p state %d\n", ch, srpt_get_ch_state(ch));
+
+	sdev = ch->sport->sdev;
+	spin_lock_irq(&sdev->spinlock);
+	BUG_ON(ch->release_done);
+	ch->release_done = &release_done;
+	__srpt_close_ch(ch);
+	spin_unlock_irq(&sdev->spinlock);
+
+	res = wait_for_completion_timeout(&release_done, 60 * HZ);
+	WARN_ON(res <= 0);
+}
+
+/**
+ * srpt_sess_get_index() - Return the value of scsiAttIntrPortIndex (SCSI-MIB).
+ *
+ * A quote from RFC 4455 (SCSI-MIB) about this MIB object:
+ * This object represents an arbitrary integer used to uniquely identify a
+ * particular attached remote initiator port to a particular SCSI target port
+ * within a particular SCSI target device within a particular SCSI instance.
+ */
+static u32 srpt_sess_get_index(struct se_session *se_sess)
+{
+	return 0;
+}
+
+static void srpt_set_default_node_attrs(struct se_node_acl *nacl)
+{
+}
+
+static u32 srpt_get_task_tag(struct se_cmd *se_cmd)
+{
+	struct srpt_send_ioctx *ioctx;
+
+	ioctx = container_of(se_cmd, struct srpt_send_ioctx, cmd);
+	return ioctx->tag;
+}
+
+/* Note: only used from inside debug printk's by the TCM core. */
+static int srpt_get_tcm_cmd_state(struct se_cmd *se_cmd)
+{
+	struct srpt_send_ioctx *ioctx;
+
+	ioctx = container_of(se_cmd, struct srpt_send_ioctx, cmd);
+	return srpt_get_cmd_state(ioctx);
+}
+
+/**
+ * srpt_parse_i_port_id() - Parse an initiator port ID.
+ * @name: ASCII representation of a 128-bit initiator port ID.
+ * @i_port_id: Binary 128-bit port ID.
+ */
+static int srpt_parse_i_port_id(u8 i_port_id[16], const char *name)
+{
+	const char *p;
+	unsigned len, count, leading_zero_bytes;
+	int ret, rc;
+
+	p = name;
+	if (strncasecmp(p, "0x", 2) == 0)
+		p += 2;
+	ret = -EINVAL;
+	len = strlen(p);
+	if (len % 2)
+		goto out;
+	count = min(len / 2, 16U);
+	leading_zero_bytes = 16 - count;
+	memset(i_port_id, 0, leading_zero_bytes);
+	rc = hex2bin(i_port_id + leading_zero_bytes, p, count);
+	if (rc < 0)
+		pr_debug("hex2bin failed for srpt_parse_i_port_id: %d\n", rc);
+	ret = 0;
+out:
+	return ret;
+}
+
+/*
+ * configfs callback function invoked for
+ * mkdir /sys/kernel/config/target/$driver/$port/$tpg/acls/$i_port_id
+ */
+static struct se_node_acl *srpt_make_nodeacl(struct se_portal_group *tpg,
+					     struct config_group *group,
+					     const char *name)
+{
+	struct srpt_port *sport = container_of(tpg, struct srpt_port, port_tpg_1);
+	struct se_node_acl *se_nacl, *se_nacl_new;
+	struct srpt_node_acl *nacl;
+	int ret = 0;
+	u32 nexus_depth = 1;
+	u8 i_port_id[16];
+
+	if (srpt_parse_i_port_id(i_port_id, name) < 0) {
+		printk(KERN_ERR "invalid initiator port ID %s\n", name);
+		ret = -EINVAL;
+		goto err;
+	}
+
+	se_nacl_new = srpt_alloc_fabric_acl(tpg);
+	if (!se_nacl_new) {
+		ret = -ENOMEM;
+		goto err;
+	}
+	/*
+	 * nacl_new may be released by core_tpg_add_initiator_node_acl()
+	 * when converting a node ACL from demo mode to explict
+	 */
+	se_nacl = core_tpg_add_initiator_node_acl(tpg, se_nacl_new, name,
+						  nexus_depth);
+	if (IS_ERR(se_nacl)) {
+		ret = PTR_ERR(se_nacl);
+		goto err;
+	}
+	/* Locate our struct srpt_node_acl and set sdev and i_port_id. */
+	nacl = container_of(se_nacl, struct srpt_node_acl, nacl);
+	memcpy(&nacl->i_port_id[0], &i_port_id[0], 16);
+	nacl->sport = sport;
+
+	spin_lock_irq(&sport->port_acl_lock);
+	list_add_tail(&nacl->list, &sport->port_acl_list);
+	spin_unlock_irq(&sport->port_acl_lock);
+
+	return se_nacl;
+err:
+	return ERR_PTR(ret);
+}
+
+/*
+ * configfs callback function invoked for
+ * rmdir /sys/kernel/config/target/$driver/$port/$tpg/acls/$i_port_id
+ */
+static void srpt_drop_nodeacl(struct se_node_acl *se_nacl)
+{
+	struct srpt_node_acl *nacl;
+	struct srpt_device *sdev;
+	struct srpt_port *sport;
+
+	nacl = container_of(se_nacl, struct srpt_node_acl, nacl);
+	sport = nacl->sport;
+	sdev = sport->sdev;
+	spin_lock_irq(&sport->port_acl_lock);
+	list_del(&nacl->list);
+	spin_unlock_irq(&sport->port_acl_lock);
+	core_tpg_del_initiator_node_acl(&sport->port_tpg_1, se_nacl, 1);
+	srpt_release_fabric_acl(NULL, se_nacl);
+}
+
+static ssize_t srpt_tpg_attrib_show_srp_max_rdma_size(
+	struct se_portal_group *se_tpg,
+	char *page)
+{
+	struct srpt_port *sport = container_of(se_tpg, struct srpt_port, port_tpg_1);
+
+	return sprintf(page, "%u\n", sport->port_attrib.srp_max_rdma_size);
+}
+
+static ssize_t srpt_tpg_attrib_store_srp_max_rdma_size(
+	struct se_portal_group *se_tpg,
+	const char *page,
+	size_t count)
+{
+	struct srpt_port *sport = container_of(se_tpg, struct srpt_port, port_tpg_1);
+	unsigned long val;
+	int ret;
+
+	ret = kstrtoul(page, 0, &val);
+	if (ret < 0) {
+		pr_err("kstrtoul() failed with ret: %d\n", ret);
+		return -EINVAL;
+	}
+	if (val > MAX_SRPT_RDMA_SIZE) {
+		pr_err("val: %lu exceeds MAX_SRPT_RDMA_SIZE: %d\n", val,
+			MAX_SRPT_RDMA_SIZE);
+		return -EINVAL;
+	}
+	if (val < DEFAULT_MAX_RDMA_SIZE) {
+		pr_err("val: %lu smaller than DEFAULT_MAX_RDMA_SIZE: %d\n",
+			val, DEFAULT_MAX_RDMA_SIZE);
+		return -EINVAL;
+	}
+	sport->port_attrib.srp_max_rdma_size = val;
+
+	return count;
+}
+
+TF_TPG_ATTRIB_ATTR(srpt, srp_max_rdma_size, S_IRUGO | S_IWUSR);
+
+static ssize_t srpt_tpg_attrib_show_srp_max_rsp_size(
+	struct se_portal_group *se_tpg,
+	char *page)
+{
+	struct srpt_port *sport = container_of(se_tpg, struct srpt_port, port_tpg_1);
+
+	return sprintf(page, "%u\n", sport->port_attrib.srp_max_rsp_size);
+}
+
+static ssize_t srpt_tpg_attrib_store_srp_max_rsp_size(
+	struct se_portal_group *se_tpg,
+	const char *page,
+	size_t count)
+{
+	struct srpt_port *sport = container_of(se_tpg, struct srpt_port, port_tpg_1);
+	unsigned long val;
+	int ret;
+
+	ret = kstrtoul(page, 0, &val);
+	if (ret < 0) {
+		pr_err("kstrtoul() failed with ret: %d\n", ret);
+		return -EINVAL;
+	}
+	if (val > MAX_SRPT_RSP_SIZE) {
+		pr_err("val: %lu exceeds MAX_SRPT_RSP_SIZE: %d\n", val,
+			MAX_SRPT_RSP_SIZE);
+		return -EINVAL;
+	}
+	if (val < MIN_MAX_RSP_SIZE) {
+		pr_err("val: %lu smaller than MIN_MAX_RSP_SIZE: %d\n", val,
+			MIN_MAX_RSP_SIZE);
+		return -EINVAL;
+	}
+	sport->port_attrib.srp_max_rsp_size = val;
+
+	return count;
+}
+
+TF_TPG_ATTRIB_ATTR(srpt, srp_max_rsp_size, S_IRUGO | S_IWUSR);
+
+static ssize_t srpt_tpg_attrib_show_srp_sq_size(
+	struct se_portal_group *se_tpg,
+	char *page)
+{
+	struct srpt_port *sport = container_of(se_tpg, struct srpt_port, port_tpg_1);
+
+	return sprintf(page, "%u\n", sport->port_attrib.srp_sq_size);
+}
+
+static ssize_t srpt_tpg_attrib_store_srp_sq_size(
+	struct se_portal_group *se_tpg,
+	const char *page,
+	size_t count)
+{
+	struct srpt_port *sport = container_of(se_tpg, struct srpt_port, port_tpg_1);
+	unsigned long val;
+	int ret;
+
+	ret = kstrtoul(page, 0, &val);
+	if (ret < 0) {
+		pr_err("kstrtoul() failed with ret: %d\n", ret);
+		return -EINVAL;
+	}
+	if (val > MAX_SRPT_SRQ_SIZE) {
+		pr_err("val: %lu exceeds MAX_SRPT_SRQ_SIZE: %d\n", val,
+			MAX_SRPT_SRQ_SIZE);
+		return -EINVAL;
+	}
+	if (val < MIN_SRPT_SRQ_SIZE) {
+		pr_err("val: %lu smaller than MIN_SRPT_SRQ_SIZE: %d\n", val,
+			MIN_SRPT_SRQ_SIZE);
+		return -EINVAL;
+	}
+	sport->port_attrib.srp_sq_size = val;
+
+	return count;
+}
+
+TF_TPG_ATTRIB_ATTR(srpt, srp_sq_size, S_IRUGO | S_IWUSR);
+
+static struct configfs_attribute *srpt_tpg_attrib_attrs[] = {
+	&srpt_tpg_attrib_srp_max_rdma_size.attr,
+	&srpt_tpg_attrib_srp_max_rsp_size.attr,
+	&srpt_tpg_attrib_srp_sq_size.attr,
+	NULL,
+};
+
+static ssize_t srpt_tpg_show_enable(
+	struct se_portal_group *se_tpg,
+	char *page)
+{
+	struct srpt_port *sport = container_of(se_tpg, struct srpt_port, port_tpg_1);
+
+	return snprintf(page, PAGE_SIZE, "%d\n", (sport->enabled) ? 1: 0);
+}
+
+static ssize_t srpt_tpg_store_enable(
+	struct se_portal_group *se_tpg,
+	const char *page,
+	size_t count)
+{
+	struct srpt_port *sport = container_of(se_tpg, struct srpt_port, port_tpg_1);
+	unsigned long tmp;
+        int ret;
+
+	ret = kstrtoul(page, 0, &tmp);
+	if (ret < 0) {
+		printk(KERN_ERR "Unable to extract srpt_tpg_store_enable\n");
+		return -EINVAL;
+	}
+
+	if ((tmp != 0) && (tmp != 1)) {
+		printk(KERN_ERR "Illegal value for srpt_tpg_store_enable: %lu\n", tmp);
+		return -EINVAL;
+	}
+	if (tmp == 1)
+		sport->enabled = true;
+	else
+		sport->enabled = false;
+
+	return count;
+}
+
+TF_TPG_BASE_ATTR(srpt, enable, S_IRUGO | S_IWUSR);
+
+static struct configfs_attribute *srpt_tpg_attrs[] = {
+	&srpt_tpg_enable.attr,
+	NULL,
+};
+
+/**
+ * configfs callback invoked for
+ * mkdir /sys/kernel/config/target/$driver/$port/$tpg
+ */
+static struct se_portal_group *srpt_make_tpg(struct se_wwn *wwn,
+					     struct config_group *group,
+					     const char *name)
+{
+	struct srpt_port *sport = container_of(wwn, struct srpt_port, port_wwn);
+	int res;
+
+	/* Initialize sport->port_wwn and sport->port_tpg_1 */
+	res = core_tpg_register(&srpt_target->tf_ops, &sport->port_wwn,
+			&sport->port_tpg_1, sport, TRANSPORT_TPG_TYPE_NORMAL);
+	if (res)
+		return ERR_PTR(res);
+
+	return &sport->port_tpg_1;
+}
+
+/**
+ * configfs callback invoked for
+ * rmdir /sys/kernel/config/target/$driver/$port/$tpg
+ */
+static void srpt_drop_tpg(struct se_portal_group *tpg)
+{
+	struct srpt_port *sport = container_of(tpg,
+				struct srpt_port, port_tpg_1);
+
+	sport->enabled = false;
+	core_tpg_deregister(&sport->port_tpg_1);
+}
+
+/**
+ * configfs callback invoked for
+ * mkdir /sys/kernel/config/target/$driver/$port
+ */
+static struct se_wwn *srpt_make_tport(struct target_fabric_configfs *tf,
+				      struct config_group *group,
+				      const char *name)
+{
+	struct srpt_port *sport;
+	int ret;
+
+	sport = srpt_lookup_port(name);
+	pr_debug("make_tport(%s)\n", name);
+	ret = -EINVAL;
+	if (!sport)
+		goto err;
+
+	return &sport->port_wwn;
+
+err:
+	return ERR_PTR(ret);
+}
+
+/**
+ * configfs callback invoked for
+ * rmdir /sys/kernel/config/target/$driver/$port
+ */
+static void srpt_drop_tport(struct se_wwn *wwn)
+{
+	struct srpt_port *sport = container_of(wwn, struct srpt_port, port_wwn);
+
+	pr_debug("drop_tport(%s\n", config_item_name(&sport->port_wwn.wwn_group.cg_item));
+}
+
+static ssize_t srpt_wwn_show_attr_version(struct target_fabric_configfs *tf,
+					      char *buf)
+{
+	return scnprintf(buf, PAGE_SIZE, "%s\n", DRV_VERSION);
+}
+
+TF_WWN_ATTR_RO(srpt, version);
+
+static struct configfs_attribute *srpt_wwn_attrs[] = {
+	&srpt_wwn_version.attr,
+	NULL,
+};
+
+static struct target_core_fabric_ops srpt_template = {
+	.get_fabric_name		= srpt_get_fabric_name,
+	.get_fabric_proto_ident		= srpt_get_fabric_proto_ident,
+	.tpg_get_wwn			= srpt_get_fabric_wwn,
+	.tpg_get_tag			= srpt_get_tag,
+	.tpg_get_default_depth		= srpt_get_default_depth,
+	.tpg_get_pr_transport_id	= srpt_get_pr_transport_id,
+	.tpg_get_pr_transport_id_len	= srpt_get_pr_transport_id_len,
+	.tpg_parse_pr_out_transport_id	= srpt_parse_pr_out_transport_id,
+	.tpg_check_demo_mode		= srpt_check_false,
+	.tpg_check_demo_mode_cache	= srpt_check_true,
+	.tpg_check_demo_mode_write_protect = srpt_check_true,
+	.tpg_check_prod_mode_write_protect = srpt_check_false,
+	.tpg_alloc_fabric_acl		= srpt_alloc_fabric_acl,
+	.tpg_release_fabric_acl		= srpt_release_fabric_acl,
+	.tpg_get_inst_index		= srpt_tpg_get_inst_index,
+	.release_cmd			= srpt_release_cmd,
+	.check_stop_free		= srpt_check_stop_free,
+	.shutdown_session		= srpt_shutdown_session,
+	.close_session			= srpt_close_session,
+	.sess_get_index			= srpt_sess_get_index,
+	.sess_get_initiator_sid		= NULL,
+	.write_pending			= srpt_write_pending,
+	.write_pending_status		= srpt_write_pending_status,
+	.set_default_node_attributes	= srpt_set_default_node_attrs,
+	.get_task_tag			= srpt_get_task_tag,
+	.get_cmd_state			= srpt_get_tcm_cmd_state,
+	.queue_data_in			= srpt_queue_data_in,
+	.queue_status			= srpt_queue_status,
+	.queue_tm_rsp			= srpt_queue_tm_rsp,
+	.aborted_task			= srpt_aborted_task,
+	/*
+	 * Setup function pointers for generic logic in
+	 * target_core_fabric_configfs.c
+	 */
+	.fabric_make_wwn		= srpt_make_tport,
+	.fabric_drop_wwn		= srpt_drop_tport,
+	.fabric_make_tpg		= srpt_make_tpg,
+	.fabric_drop_tpg		= srpt_drop_tpg,
+	.fabric_post_link		= NULL,
+	.fabric_pre_unlink		= NULL,
+	.fabric_make_np			= NULL,
+	.fabric_drop_np			= NULL,
+	.fabric_make_nodeacl		= srpt_make_nodeacl,
+	.fabric_drop_nodeacl		= srpt_drop_nodeacl,
+};
+
+/**
+ * srpt_init_module() - Kernel module initialization.
+ *
+ * Note: Since ib_register_client() registers callback functions, and since at
+ * least one of these callback functions (srpt_add_one()) calls target core
+ * functions, this driver must be registered with the target core before
+ * ib_register_client() is called.
+ */
+static int __init srpt_init_module(void)
+{
+	int ret;
+
+	ret = -EINVAL;
+	if (srp_max_req_size < MIN_MAX_REQ_SIZE) {
+		printk(KERN_ERR "invalid value %d for kernel module parameter"
+		       " srp_max_req_size -- must be at least %d.\n",
+		       srp_max_req_size, MIN_MAX_REQ_SIZE);
+		goto out;
+	}
+
+	if (srpt_srq_size < MIN_SRPT_SRQ_SIZE
+	    || srpt_srq_size > MAX_SRPT_SRQ_SIZE) {
+		printk(KERN_ERR "invalid value %d for kernel module parameter"
+		       " srpt_srq_size -- must be in the range [%d..%d].\n",
+		       srpt_srq_size, MIN_SRPT_SRQ_SIZE, MAX_SRPT_SRQ_SIZE);
+		goto out;
+	}
+
+	srpt_target = target_fabric_configfs_init(THIS_MODULE, "srpt");
+	if (IS_ERR(srpt_target)) {
+		printk(KERN_ERR "couldn't register\n");
+		ret = PTR_ERR(srpt_target);
+		goto out;
+	}
+
+	srpt_target->tf_ops = srpt_template;
+
+	/*
+	 * Set up default attribute lists.
+	 */
+	srpt_target->tf_cit_tmpl.tfc_wwn_cit.ct_attrs = srpt_wwn_attrs;
+	srpt_target->tf_cit_tmpl.tfc_tpg_base_cit.ct_attrs = srpt_tpg_attrs;
+	srpt_target->tf_cit_tmpl.tfc_tpg_attrib_cit.ct_attrs = srpt_tpg_attrib_attrs;
+	srpt_target->tf_cit_tmpl.tfc_tpg_param_cit.ct_attrs = NULL;
+	srpt_target->tf_cit_tmpl.tfc_tpg_np_base_cit.ct_attrs = NULL;
+	srpt_target->tf_cit_tmpl.tfc_tpg_nacl_base_cit.ct_attrs = NULL;
+	srpt_target->tf_cit_tmpl.tfc_tpg_nacl_attrib_cit.ct_attrs = NULL;
+	srpt_target->tf_cit_tmpl.tfc_tpg_nacl_auth_cit.ct_attrs = NULL;
+	srpt_target->tf_cit_tmpl.tfc_tpg_nacl_param_cit.ct_attrs = NULL;
+
+	ret = target_fabric_configfs_register(srpt_target);
+	if (ret < 0) {
+		printk(KERN_ERR "couldn't register\n");
+		goto out_free_target;
+	}
+
+	ret = ib_register_client(&srpt_client);
+	if (ret) {
+		printk(KERN_ERR "couldn't register IB client\n");
+		goto out_unregister_target;
+	}
+
+	return 0;
+
+out_unregister_target:
+	target_fabric_configfs_deregister(srpt_target);
+	srpt_target = NULL;
+out_free_target:
+	if (srpt_target)
+		target_fabric_configfs_free(srpt_target);
+out:
+	return ret;
+}
+
+static void __exit srpt_cleanup_module(void)
+{
+	ib_unregister_client(&srpt_client);
+	target_fabric_configfs_deregister(srpt_target);
+	srpt_target = NULL;
+}
+
+module_init(srpt_init_module);
+module_exit(srpt_cleanup_module);
diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.h b/drivers/infiniband/ulp/srpt/ib_srpt.h
new file mode 100644
index 0000000..3dae156
--- /dev/null
+++ b/drivers/infiniband/ulp/srpt/ib_srpt.h
@@ -0,0 +1,443 @@
+/*
+ * Copyright (c) 2006 - 2009 Mellanox Technology Inc.  All rights reserved.
+ * Copyright (C) 2009 - 2010 Bart Van Assche <bvanassche@acm.org>.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef IB_SRPT_H
+#define IB_SRPT_H
+
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/wait.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_sa.h>
+#include <rdma/ib_cm.h>
+
+#include <scsi/srp.h>
+
+#include "ib_dm_mad.h"
+
+/*
+ * The prefix the ServiceName field must start with in the device management
+ * ServiceEntries attribute pair. See also the SRP specification.
+ */
+#define SRP_SERVICE_NAME_PREFIX		"SRP.T10:"
+
+enum {
+	/*
+	 * SRP IOControllerProfile attributes for SRP target ports that have
+	 * not been defined in <scsi/srp.h>. Source: section B.7, table B.7
+	 * in the SRP specification.
+	 */
+	SRP_PROTOCOL = 0x0108,
+	SRP_PROTOCOL_VERSION = 0x0001,
+	SRP_IO_SUBCLASS = 0x609e,
+	SRP_SEND_TO_IOC = 0x01,
+	SRP_SEND_FROM_IOC = 0x02,
+	SRP_RDMA_READ_FROM_IOC = 0x08,
+	SRP_RDMA_WRITE_FROM_IOC = 0x20,
+
+	/*
+	 * srp_login_cmd.req_flags bitmasks. See also table 9 in the SRP
+	 * specification.
+	 */
+	SRP_MTCH_ACTION = 0x03, /* MULTI-CHANNEL ACTION */
+	SRP_LOSOLNT = 0x10, /* logout solicited notification */
+	SRP_CRSOLNT = 0x20, /* credit request solicited notification */
+	SRP_AESOLNT = 0x40, /* asynchronous event solicited notification */
+
+	/*
+	 * srp_cmd.sol_nt / srp_tsk_mgmt.sol_not bitmasks. See also tables
+	 * 18 and 20 in the SRP specification.
+	 */
+	SRP_SCSOLNT = 0x02, /* SCSOLNT = successful solicited notification */
+	SRP_UCSOLNT = 0x04, /* UCSOLNT = unsuccessful solicited notification */
+
+	/*
+	 * srp_rsp.sol_not / srp_t_logout.sol_not bitmasks. See also tables
+	 * 16 and 22 in the SRP specification.
+	 */
+	SRP_SOLNT = 0x01, /* SOLNT = solicited notification */
+
+	/* See also table 24 in the SRP specification. */
+	SRP_TSK_MGMT_SUCCESS = 0x00,
+	SRP_TSK_MGMT_FUNC_NOT_SUPP = 0x04,
+	SRP_TSK_MGMT_FAILED = 0x05,
+
+	/* See also table 21 in the SRP specification. */
+	SRP_CMD_SIMPLE_Q = 0x0,
+	SRP_CMD_HEAD_OF_Q = 0x1,
+	SRP_CMD_ORDERED_Q = 0x2,
+	SRP_CMD_ACA = 0x4,
+
+	SRP_LOGIN_RSP_MULTICHAN_NO_CHAN = 0x0,
+	SRP_LOGIN_RSP_MULTICHAN_TERMINATED = 0x1,
+	SRP_LOGIN_RSP_MULTICHAN_MAINTAINED = 0x2,
+
+	SRPT_DEF_SG_TABLESIZE = 128,
+	SRPT_DEF_SG_PER_WQE = 16,
+
+	MIN_SRPT_SQ_SIZE = 16,
+	DEF_SRPT_SQ_SIZE = 4096,
+	SRPT_RQ_SIZE = 128,
+	MIN_SRPT_SRQ_SIZE = 4,
+	DEFAULT_SRPT_SRQ_SIZE = 4095,
+	MAX_SRPT_SRQ_SIZE = 65535,
+	MAX_SRPT_RDMA_SIZE = 1U << 24,
+	MAX_SRPT_RSP_SIZE = 1024,
+
+	MIN_MAX_REQ_SIZE = 996,
+	DEFAULT_MAX_REQ_SIZE
+		= sizeof(struct srp_cmd)/*48*/
+		+ sizeof(struct srp_indirect_buf)/*20*/
+		+ 128 * sizeof(struct srp_direct_buf)/*16*/,
+
+	MIN_MAX_RSP_SIZE = sizeof(struct srp_rsp)/*36*/ + 4,
+	DEFAULT_MAX_RSP_SIZE = 256, /* leaves 220 bytes for sense data */
+
+	DEFAULT_MAX_RDMA_SIZE = 65536,
+};
+
+enum srpt_opcode {
+	SRPT_RECV,
+	SRPT_SEND,
+	SRPT_RDMA_MID,
+	SRPT_RDMA_ABORT,
+	SRPT_RDMA_READ_LAST,
+	SRPT_RDMA_WRITE_LAST,
+};
+
+static inline u64 encode_wr_id(u8 opcode, u32 idx)
+{
+	return ((u64)opcode << 32) | idx;
+}
+static inline enum srpt_opcode opcode_from_wr_id(u64 wr_id)
+{
+	return wr_id >> 32;
+}
+static inline u32 idx_from_wr_id(u64 wr_id)
+{
+	return (u32)wr_id;
+}
+
+struct rdma_iu {
+	u64		raddr;
+	u32		rkey;
+	struct ib_sge	*sge;
+	u32		sge_cnt;
+	int		mem_id;
+};
+
+/**
+ * enum srpt_command_state - SCSI command state managed by SRPT.
+ * @SRPT_STATE_NEW:           New command arrived and is being processed.
+ * @SRPT_STATE_NEED_DATA:     Processing a write or bidir command and waiting
+ *                            for data arrival.
+ * @SRPT_STATE_DATA_IN:       Data for the write or bidir command arrived and is
+ *                            being processed.
+ * @SRPT_STATE_CMD_RSP_SENT:  SRP_RSP for SRP_CMD has been sent.
+ * @SRPT_STATE_MGMT:          Processing a SCSI task management command.
+ * @SRPT_STATE_MGMT_RSP_SENT: SRP_RSP for SRP_TSK_MGMT has been sent.
+ * @SRPT_STATE_DONE:          Command processing finished successfully, command
+ *                            processing has been aborted or command processing
+ *                            failed.
+ */
+enum srpt_command_state {
+	SRPT_STATE_NEW		 = 0,
+	SRPT_STATE_NEED_DATA	 = 1,
+	SRPT_STATE_DATA_IN	 = 2,
+	SRPT_STATE_CMD_RSP_SENT	 = 3,
+	SRPT_STATE_MGMT		 = 4,
+	SRPT_STATE_MGMT_RSP_SENT = 5,
+	SRPT_STATE_DONE		 = 6,
+};
+
+/**
+ * struct srpt_ioctx - Shared SRPT I/O context information.
+ * @buf:   Pointer to the buffer.
+ * @dma:   DMA address of the buffer.
+ * @index: Index of the I/O context in its ioctx_ring array.
+ */
+struct srpt_ioctx {
+	void			*buf;
+	dma_addr_t		dma;
+	uint32_t		index;
+};
+
+/**
+ * struct srpt_recv_ioctx - SRPT receive I/O context.
+ * @ioctx:     See above.
+ * @wait_list: Node for insertion in srpt_rdma_ch.cmd_wait_list.
+ */
+struct srpt_recv_ioctx {
+	struct srpt_ioctx	ioctx;
+	struct list_head	wait_list;
+};
+
+/**
+ * struct srpt_send_ioctx - SRPT send I/O context.
+ * @ioctx:       See above.
+ * @ch:          Channel pointer.
+ * @free_list:   Node in srpt_rdma_ch.free_list.
+ * @n_rbuf:      Number of data buffers in the received SRP command.
+ * @rbufs:       Pointer to SRP data buffer array.
+ * @single_rbuf: SRP data buffer if the command has only a single buffer.
+ * @sg:          Pointer to sg-list associated with this I/O context.
+ * @sg_cnt:      SG-list size.
+ * @mapped_sg_count: ib_dma_map_sg() return value.
+ * @n_rdma_ius:  Number of elements in the rdma_ius array.
+ * @rdma_ius:    Array with information about the RDMA mapping.
+ * @tag:         Tag of the received SRP information unit.
+ * @spinlock:    Protects 'state'.
+ * @state:       I/O context state.
+ * @rdma_aborted: If initiating a multipart RDMA transfer failed, whether
+ * 		 the already initiated transfers have finished.
+ * @cmd:         Target core command data structure.
+ * @sense_data:  SCSI sense data.
+ */
+struct srpt_send_ioctx {
+	struct srpt_ioctx	ioctx;
+	struct srpt_rdma_ch	*ch;
+	struct rdma_iu		*rdma_ius;
+	struct srp_direct_buf	*rbufs;
+	struct srp_direct_buf	single_rbuf;
+	struct scatterlist	*sg;
+	struct list_head	free_list;
+	spinlock_t		spinlock;
+	enum srpt_command_state	state;
+	bool			rdma_aborted;
+	struct se_cmd		cmd;
+	struct completion	tx_done;
+	u64			tag;
+	int			sg_cnt;
+	int			mapped_sg_count;
+	u16			n_rdma_ius;
+	u8			n_rdma;
+	u8			n_rbuf;
+	bool			queue_status_only;
+	u8			sense_data[SCSI_SENSE_BUFFERSIZE];
+};
+
+/**
+ * enum rdma_ch_state - SRP channel state.
+ * @CH_CONNECTING:	 QP is in RTR state; waiting for RTU.
+ * @CH_LIVE:		 QP is in RTS state.
+ * @CH_DISCONNECTING:    DREQ has been received; waiting for DREP
+ *                       or DREQ has been send and waiting for DREP
+ *                       or .
+ * @CH_DRAINING:	 QP is in ERR state; waiting for last WQE event.
+ * @CH_RELEASING:	 Last WQE event has been received; releasing resources.
+ */
+enum rdma_ch_state {
+	CH_CONNECTING,
+	CH_LIVE,
+	CH_DISCONNECTING,
+	CH_DRAINING,
+	CH_RELEASING
+};
+
+/**
+ * struct srpt_rdma_ch - RDMA channel.
+ * @wait_queue:    Allows the kernel thread to wait for more work.
+ * @thread:        Kernel thread that processes the IB queues associated with
+ *                 the channel.
+ * @cm_id:         IB CM ID associated with the channel.
+ * @qp:            IB queue pair used for communicating over this channel.
+ * @cq:            IB completion queue for this channel.
+ * @rq_size:       IB receive queue size.
+ * @rsp_size	   IB response message size in bytes.
+ * @sq_wr_avail:   number of work requests available in the send queue.
+ * @sport:         pointer to the information of the HCA port used by this
+ *                 channel.
+ * @i_port_id:     128-bit initiator port identifier copied from SRP_LOGIN_REQ.
+ * @t_port_id:     128-bit target port identifier copied from SRP_LOGIN_REQ.
+ * @max_ti_iu_len: maximum target-to-initiator information unit length.
+ * @req_lim:       request limit: maximum number of requests that may be sent
+ *                 by the initiator without having received a response.
+ * @req_lim_delta: Number of credits not yet sent back to the initiator.
+ * @spinlock:      Protects free_list and state.
+ * @free_list:     Head of list with free send I/O contexts.
+ * @state:         channel state. See also enum rdma_ch_state.
+ * @ioctx_ring:    Send ring.
+ * @wc:            IB work completion array for srpt_process_completion().
+ * @list:          Node for insertion in the srpt_device.rch_list list.
+ * @cmd_wait_list: List of SCSI commands that arrived before the RTU event. This
+ *                 list contains struct srpt_ioctx elements and is protected
+ *                 against concurrent modification by the cm_id spinlock.
+ * @sess:          Session information associated with this SRP channel.
+ * @sess_name:     Session name.
+ * @release_work:  Allows scheduling of srpt_release_channel().
+ * @release_done:  Enables waiting for srpt_release_channel() completion.
+ */
+struct srpt_rdma_ch {
+	wait_queue_head_t	wait_queue;
+	struct task_struct	*thread;
+	struct ib_cm_id		*cm_id;
+	struct ib_qp		*qp;
+	struct ib_cq		*cq;
+	int			rq_size;
+	u32			rsp_size;
+	atomic_t		sq_wr_avail;
+	struct srpt_port	*sport;
+	u8			i_port_id[16];
+	u8			t_port_id[16];
+	int			max_ti_iu_len;
+	atomic_t		req_lim;
+	atomic_t		req_lim_delta;
+	spinlock_t		spinlock;
+	struct list_head	free_list;
+	enum rdma_ch_state	state;
+	struct srpt_send_ioctx	**ioctx_ring;
+	struct ib_wc		wc[16];
+	struct list_head	list;
+	struct list_head	cmd_wait_list;
+	struct se_session	*sess;
+	u8			sess_name[36];
+	struct work_struct	release_work;
+	struct completion	*release_done;
+	bool			in_shutdown;
+};
+
+/**
+ * struct srpt_port_attib - Attributes for SRPT port
+ * @srp_max_rdma_size: Maximum size of SRP RDMA transfers for new connections.
+ * @srp_max_rsp_size: Maximum size of SRP response messages in bytes.
+ * @srp_sq_size: Shared receive queue (SRQ) size.
+ */
+struct srpt_port_attrib {
+	u32			srp_max_rdma_size;
+	u32			srp_max_rsp_size;
+	u32			srp_sq_size;
+};
+
+/**
+ * struct srpt_port - Information associated by SRPT with a single IB port.
+ * @sdev:      backpointer to the HCA information.
+ * @mad_agent: per-port management datagram processing information.
+ * @enabled:   Whether or not this target port is enabled.
+ * @port_guid: ASCII representation of Port GUID
+ * @port:      one-based port number.
+ * @sm_lid:    cached value of the port's sm_lid.
+ * @lid:       cached value of the port's lid.
+ * @gid:       cached value of the port's gid.
+ * @port_acl_lock spinlock for port_acl_list:
+ * @work:      work structure for refreshing the aforementioned cached values.
+ * @port_tpg_1 Target portal group = 1 data.
+ * @port_wwn:  Target core WWN data.
+ * @port_acl_list: Head of the list with all node ACLs for this port.
+ */
+struct srpt_port {
+	struct srpt_device	*sdev;
+	struct ib_mad_agent	*mad_agent;
+	bool			enabled;
+	u8			port_guid[64];
+	u8			port;
+	u16			sm_lid;
+	u16			lid;
+	union ib_gid		gid;
+	spinlock_t		port_acl_lock;
+	struct work_struct	work;
+	struct se_portal_group	port_tpg_1;
+	struct se_wwn		port_wwn;
+	struct list_head	port_acl_list;
+	struct srpt_port_attrib port_attrib;
+};
+
+/**
+ * struct srpt_device - Information associated by SRPT with a single HCA.
+ * @device:        Backpointer to the struct ib_device managed by the IB core.
+ * @pd:            IB protection domain.
+ * @mr:            L_Key (local key) with write access to all local memory.
+ * @srq:           Per-HCA SRQ (shared receive queue).
+ * @cm_id:         Connection identifier.
+ * @dev_attr:      Attributes of the InfiniBand device as obtained during the
+ *                 ib_client.add() callback.
+ * @srq_size:      SRQ size.
+ * @ioctx_ring:    Per-HCA SRQ.
+ * @rch_list:      Per-device channel list -- see also srpt_rdma_ch.list.
+ * @ch_releaseQ:   Enables waiting for removal from rch_list.
+ * @spinlock:      Protects rch_list and tpg.
+ * @port:          Information about the ports owned by this HCA.
+ * @event_handler: Per-HCA asynchronous IB event handler.
+ * @list:          Node in srpt_dev_list.
+ */
+struct srpt_device {
+	struct ib_device	*device;
+	struct ib_pd		*pd;
+	struct ib_mr		*mr;
+	struct ib_srq		*srq;
+	struct ib_cm_id		*cm_id;
+	struct ib_device_attr	dev_attr;
+	int			srq_size;
+	struct srpt_recv_ioctx	**ioctx_ring;
+	struct list_head	rch_list;
+	wait_queue_head_t	ch_releaseQ;
+	spinlock_t		spinlock;
+	struct srpt_port	port[2];
+	struct ib_event_handler	event_handler;
+	struct list_head	list;
+};
+
+/**
+ * struct srpt_node_acl - Per-initiator ACL data (managed via configfs).
+ * @i_port_id: 128-bit SRP initiator port ID.
+ * @sport:     port information.
+ * @nacl:      Target core node ACL information.
+ * @list:      Element of the per-HCA ACL list.
+ */
+struct srpt_node_acl {
+	u8			i_port_id[16];
+	struct srpt_port	*sport;
+	struct se_node_acl	nacl;
+	struct list_head	list;
+};
+
+/*
+ * SRP-releated SCSI persistent reservation definitions.
+ *
+ * See also SPC4r28, section 7.6.1 (Protocol specific parameters introduction).
+ * See also SPC4r28, section 7.6.4.5 (TransportID for initiator ports using
+ * SCSI over an RDMA interface).
+ */
+
+enum {
+	SCSI_TRANSPORTID_PROTOCOLID_SRP	= 4,
+};
+
+struct spc_rdma_transport_id {
+	uint8_t protocol_identifier;
+	uint8_t reserved[7];
+	uint8_t i_port_id[16];
+};
+
+#endif				/* IB_SRPT_H */
diff --git a/drivers/net/ethernet/chelsio/cxgb3/Makefile b/drivers/net/ethernet/chelsio/cxgb3/Makefile
new file mode 100644
index 0000000..29aff78
--- /dev/null
+++ b/drivers/net/ethernet/chelsio/cxgb3/Makefile
@@ -0,0 +1,8 @@
+#
+# Chelsio T3 driver
+#
+
+obj-$(CONFIG_CHELSIO_T3) += cxgb3.o
+
+cxgb3-objs := cxgb3_main.o ael1002.o vsc8211.o t3_hw.o mc5.o \
+	      xgmac.o sge.o l2t.o cxgb3_offload.o aq100x.o
diff --git a/drivers/net/ethernet/chelsio/cxgb3/adapter.h b/drivers/net/ethernet/chelsio/cxgb3/adapter.h
new file mode 100644
index 0000000..8b395b5
--- /dev/null
+++ b/drivers/net/ethernet/chelsio/cxgb3/adapter.h
@@ -0,0 +1,334 @@
+/*
+ * Copyright (c) 2003-2008 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/* This file should not be included directly.  Include common.h instead. */
+
+#ifndef __T3_ADAPTER_H__
+#define __T3_ADAPTER_H__
+
+#include <linux/pci.h>
+#include <linux/spinlock.h>
+#include <linux/interrupt.h>
+#include <linux/timer.h>
+#include <linux/cache.h>
+#include <linux/mutex.h>
+#include <linux/bitops.h>
+#include "t3cdev.h"
+#include <asm/io.h>
+
+struct adapter;
+struct sge_qset;
+struct port_info;
+
+enum mac_idx_types {
+	LAN_MAC_IDX	= 0,
+	SAN_MAC_IDX,
+
+	MAX_MAC_IDX
+};
+
+struct iscsi_config {
+	__u8	mac_addr[ETH_ALEN];
+	__u32	flags;
+	int (*send)(struct port_info *pi, struct sk_buff **skb);
+	int (*recv)(struct port_info *pi, struct sk_buff *skb);
+};
+
+struct port_info {
+	struct adapter *adapter;
+	struct sge_qset *qs;
+	u8 port_id;
+	u8 nqsets;
+	u8 first_qset;
+	struct cphy phy;
+	struct cmac mac;
+	struct link_config link_config;
+	struct net_device_stats netstats;
+	int activity;
+	__be32 iscsi_ipv4addr;
+	struct iscsi_config iscsic;
+
+	int link_fault; /* link fault was detected */
+};
+
+enum {				/* adapter flags */
+	FULL_INIT_DONE = (1 << 0),
+	USING_MSI = (1 << 1),
+	USING_MSIX = (1 << 2),
+	QUEUES_BOUND = (1 << 3),
+	TP_PARITY_INIT = (1 << 4),
+	NAPI_INIT = (1 << 5),
+};
+
+struct fl_pg_chunk {
+	struct page *page;
+	void *va;
+	unsigned int offset;
+	unsigned long *p_cnt;
+	dma_addr_t mapping;
+};
+
+struct rx_desc;
+struct rx_sw_desc;
+
+struct sge_fl {                     /* SGE per free-buffer list state */
+	unsigned int buf_size;      /* size of each Rx buffer */
+	unsigned int credits;       /* # of available Rx buffers */
+	unsigned int pend_cred;     /* new buffers since last FL DB ring */
+	unsigned int size;          /* capacity of free list */
+	unsigned int cidx;          /* consumer index */
+	unsigned int pidx;          /* producer index */
+	unsigned int gen;           /* free list generation */
+	struct fl_pg_chunk pg_chunk;/* page chunk cache */
+	unsigned int use_pages;     /* whether FL uses pages or sk_buffs */
+	unsigned int order;	    /* order of page allocations */
+	unsigned int alloc_size;    /* size of allocated buffer */
+	struct rx_desc *desc;       /* address of HW Rx descriptor ring */
+	struct rx_sw_desc *sdesc;   /* address of SW Rx descriptor ring */
+	dma_addr_t   phys_addr;     /* physical address of HW ring start */
+	unsigned int cntxt_id;      /* SGE context id for the free list */
+	unsigned long empty;        /* # of times queue ran out of buffers */
+	unsigned long alloc_failed; /* # of times buffer allocation failed */
+};
+
+/*
+ * Bundle size for grouping offload RX packets for delivery to the stack.
+ * Don't make this too big as we do prefetch on each packet in a bundle.
+ */
+# define RX_BUNDLE_SIZE 8
+
+struct rsp_desc;
+
+struct sge_rspq {		/* state for an SGE response queue */
+	unsigned int credits;	/* # of pending response credits */
+	unsigned int size;	/* capacity of response queue */
+	unsigned int cidx;	/* consumer index */
+	unsigned int gen;	/* current generation bit */
+	unsigned int polling;	/* is the queue serviced through NAPI? */
+	unsigned int holdoff_tmr;	/* interrupt holdoff timer in 100ns */
+	unsigned int next_holdoff;	/* holdoff time for next interrupt */
+	unsigned int rx_recycle_buf; /* whether recycling occurred
+					within current sop-eop */
+	struct rsp_desc *desc;	/* address of HW response ring */
+	dma_addr_t phys_addr;	/* physical address of the ring */
+	unsigned int cntxt_id;	/* SGE context id for the response q */
+	spinlock_t lock;	/* guards response processing */
+	struct sk_buff_head rx_queue; /* offload packet receive queue */
+	struct sk_buff *pg_skb; /* used to build frag list in napi handler */
+
+	unsigned long offload_pkts;
+	unsigned long offload_bundles;
+	unsigned long eth_pkts;	/* # of ethernet packets */
+	unsigned long pure_rsps;	/* # of pure (non-data) responses */
+	unsigned long imm_data;	/* responses with immediate data */
+	unsigned long rx_drops;	/* # of packets dropped due to no mem */
+	unsigned long async_notif; /* # of asynchronous notification events */
+	unsigned long empty;	/* # of times queue ran out of credits */
+	unsigned long nomem;	/* # of responses deferred due to no mem */
+	unsigned long unhandled_irqs;	/* # of spurious intrs */
+	unsigned long starved;
+	unsigned long restarted;
+};
+
+struct tx_desc;
+struct tx_sw_desc;
+
+struct sge_txq {		/* state for an SGE Tx queue */
+	unsigned long flags;	/* HW DMA fetch status */
+	unsigned int in_use;	/* # of in-use Tx descriptors */
+	unsigned int size;	/* # of descriptors */
+	unsigned int processed;	/* total # of descs HW has processed */
+	unsigned int cleaned;	/* total # of descs SW has reclaimed */
+	unsigned int stop_thres;	/* SW TX queue suspend threshold */
+	unsigned int cidx;	/* consumer index */
+	unsigned int pidx;	/* producer index */
+	unsigned int gen;	/* current value of generation bit */
+	unsigned int unacked;	/* Tx descriptors used since last COMPL */
+	struct tx_desc *desc;	/* address of HW Tx descriptor ring */
+	struct tx_sw_desc *sdesc;	/* address of SW Tx descriptor ring */
+	spinlock_t lock;	/* guards enqueueing of new packets */
+	unsigned int token;	/* WR token */
+	dma_addr_t phys_addr;	/* physical address of the ring */
+	struct sk_buff_head sendq;	/* List of backpressured offload packets */
+	struct tasklet_struct qresume_tsk;	/* restarts the queue */
+	unsigned int cntxt_id;	/* SGE context id for the Tx q */
+	unsigned long stops;	/* # of times q has been stopped */
+	unsigned long restarts;	/* # of queue restarts */
+};
+
+enum {				/* per port SGE statistics */
+	SGE_PSTAT_TSO,		/* # of TSO requests */
+	SGE_PSTAT_RX_CSUM_GOOD,	/* # of successful RX csum offloads */
+	SGE_PSTAT_TX_CSUM,	/* # of TX checksum offloads */
+	SGE_PSTAT_VLANEX,	/* # of VLAN tag extractions */
+	SGE_PSTAT_VLANINS,	/* # of VLAN tag insertions */
+
+	SGE_PSTAT_MAX		/* must be last */
+};
+
+struct napi_gro_fraginfo;
+
+struct sge_qset {		/* an SGE queue set */
+	struct adapter *adap;
+	struct napi_struct napi;
+	struct sge_rspq rspq;
+	struct sge_fl fl[SGE_RXQ_PER_SET];
+	struct sge_txq txq[SGE_TXQ_PER_SET];
+	int nomem;
+	void *lro_va;
+	struct net_device *netdev;
+	struct netdev_queue *tx_q;	/* associated netdev TX queue */
+	unsigned long txq_stopped;	/* which Tx queues are stopped */
+	struct timer_list tx_reclaim_timer;	/* reclaims TX buffers */
+	struct timer_list rx_reclaim_timer;	/* reclaims RX buffers */
+	unsigned long port_stats[SGE_PSTAT_MAX];
+} ____cacheline_aligned;
+
+struct sge {
+	struct sge_qset qs[SGE_QSETS];
+	spinlock_t reg_lock;	/* guards non-atomic SGE registers (eg context) */
+};
+
+struct adapter {
+	struct t3cdev tdev;
+	struct list_head adapter_list;
+	void __iomem *regs;
+	struct pci_dev *pdev;
+	unsigned long registered_device_map;
+	unsigned long open_device_map;
+	unsigned long flags;
+
+	const char *name;
+	int msg_enable;
+	unsigned int mmio_len;
+
+	struct adapter_params params;
+	unsigned int slow_intr_mask;
+	unsigned long irq_stats[IRQ_NUM_STATS];
+
+	int msix_nvectors;
+	struct {
+		unsigned short vec;
+		char desc[22];
+	} msix_info[SGE_QSETS + 1];
+
+	/* T3 modules */
+	struct sge sge;
+	struct mc7 pmrx;
+	struct mc7 pmtx;
+	struct mc7 cm;
+	struct mc5 mc5;
+
+	struct net_device *port[MAX_NPORTS];
+	unsigned int check_task_cnt;
+	struct delayed_work adap_check_task;
+	struct work_struct ext_intr_handler_task;
+	struct work_struct fatal_error_handler_task;
+	struct work_struct link_fault_handler_task;
+
+	struct work_struct db_full_task;
+	struct work_struct db_empty_task;
+	struct work_struct db_drop_task;
+
+	struct dentry *debugfs_root;
+
+	struct mutex mdio_lock;
+	spinlock_t stats_lock;
+	spinlock_t work_lock;
+
+	struct sk_buff *nofail_skb;
+};
+
+static inline u32 t3_read_reg(struct adapter *adapter, u32 reg_addr)
+{
+	u32 val = readl(adapter->regs + reg_addr);
+
+	CH_DBG(adapter, MMIO, "read register 0x%x value 0x%x\n", reg_addr, val);
+	return val;
+}
+
+static inline void t3_write_reg(struct adapter *adapter, u32 reg_addr, u32 val)
+{
+	CH_DBG(adapter, MMIO, "setting register 0x%x to 0x%x\n", reg_addr, val);
+	writel(val, adapter->regs + reg_addr);
+}
+
+static inline struct port_info *adap2pinfo(struct adapter *adap, int idx)
+{
+	return netdev_priv(adap->port[idx]);
+}
+
+static inline int phy2portid(struct cphy *phy)
+{
+	struct adapter *adap = phy->adapter;
+	struct port_info *port0 = adap2pinfo(adap, 0);
+
+	return &port0->phy == phy ? 0 : 1;
+}
+
+#define OFFLOAD_DEVMAP_BIT 15
+
+#define tdev2adap(d) container_of(d, struct adapter, tdev)
+
+static inline int offload_running(struct adapter *adapter)
+{
+	return test_bit(OFFLOAD_DEVMAP_BIT, &adapter->open_device_map);
+}
+
+int t3_offload_tx(struct t3cdev *tdev, struct sk_buff *skb);
+
+void t3_os_ext_intr_handler(struct adapter *adapter);
+void t3_os_link_changed(struct adapter *adapter, int port_id, int link_status,
+			int speed, int duplex, int fc);
+void t3_os_phymod_changed(struct adapter *adap, int port_id);
+void t3_os_link_fault(struct adapter *adapter, int port_id, int state);
+void t3_os_link_fault_handler(struct adapter *adapter, int port_id);
+
+void t3_sge_start(struct adapter *adap);
+void t3_sge_stop(struct adapter *adap);
+void t3_start_sge_timers(struct adapter *adap);
+void t3_stop_sge_timers(struct adapter *adap);
+void t3_free_sge_resources(struct adapter *adap);
+void t3_sge_err_intr_handler(struct adapter *adapter);
+irq_handler_t t3_intr_handler(struct adapter *adap, int polling);
+netdev_tx_t t3_eth_xmit(struct sk_buff *skb, struct net_device *dev);
+int t3_mgmt_tx(struct adapter *adap, struct sk_buff *skb);
+void t3_update_qset_coalesce(struct sge_qset *qs, const struct qset_params *p);
+int t3_sge_alloc_qset(struct adapter *adapter, unsigned int id, int nports,
+		      int irq_vec_idx, const struct qset_params *p,
+		      int ntxq, struct net_device *dev,
+		      struct netdev_queue *netdevq);
+extern struct workqueue_struct *cxgb3_wq;
+
+int t3_get_edc_fw(struct cphy *phy, int edc_idx, int size);
+
+#endif				/* __T3_ADAPTER_H__ */
diff --git a/drivers/net/ethernet/chelsio/cxgb3/ael1002.c b/drivers/net/ethernet/chelsio/cxgb3/ael1002.c
new file mode 100644
index 0000000..2028da9
--- /dev/null
+++ b/drivers/net/ethernet/chelsio/cxgb3/ael1002.c
@@ -0,0 +1,941 @@
+/*
+ * Copyright (c) 2005-2008 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "common.h"
+#include "regs.h"
+
+enum {
+	AEL100X_TX_CONFIG1 = 0xc002,
+	AEL1002_PWR_DOWN_HI = 0xc011,
+	AEL1002_PWR_DOWN_LO = 0xc012,
+	AEL1002_XFI_EQL = 0xc015,
+	AEL1002_LB_EN = 0xc017,
+	AEL_OPT_SETTINGS = 0xc017,
+	AEL_I2C_CTRL = 0xc30a,
+	AEL_I2C_DATA = 0xc30b,
+	AEL_I2C_STAT = 0xc30c,
+	AEL2005_GPIO_CTRL = 0xc214,
+	AEL2005_GPIO_STAT = 0xc215,
+
+	AEL2020_GPIO_INTR   = 0xc103,	/* Latch High (LH) */
+	AEL2020_GPIO_CTRL   = 0xc108,	/* Store Clear (SC) */
+	AEL2020_GPIO_STAT   = 0xc10c,	/* Read Only (RO) */
+	AEL2020_GPIO_CFG    = 0xc110,	/* Read Write (RW) */
+
+	AEL2020_GPIO_SDA    = 0,	/* IN: i2c serial data */
+	AEL2020_GPIO_MODDET = 1,	/* IN: Module Detect */
+	AEL2020_GPIO_0      = 3,	/* IN: unassigned */
+	AEL2020_GPIO_1      = 2,	/* OUT: unassigned */
+	AEL2020_GPIO_LSTAT  = AEL2020_GPIO_1, /* wired to link status LED */
+};
+
+enum { edc_none, edc_sr, edc_twinax };
+
+/* PHY module I2C device address */
+enum {
+	MODULE_DEV_ADDR	= 0xa0,
+	SFF_DEV_ADDR	= 0xa2,
+};
+
+/* PHY transceiver type */
+enum {
+	phy_transtype_unknown = 0,
+	phy_transtype_sfp     = 3,
+	phy_transtype_xfp     = 6,
+};
+
+#define AEL2005_MODDET_IRQ 4
+
+struct reg_val {
+	unsigned short mmd_addr;
+	unsigned short reg_addr;
+	unsigned short clear_bits;
+	unsigned short set_bits;
+};
+
+static int set_phy_regs(struct cphy *phy, const struct reg_val *rv)
+{
+	int err;
+
+	for (err = 0; rv->mmd_addr && !err; rv++) {
+		if (rv->clear_bits == 0xffff)
+			err = t3_mdio_write(phy, rv->mmd_addr, rv->reg_addr,
+					    rv->set_bits);
+		else
+			err = t3_mdio_change_bits(phy, rv->mmd_addr,
+						  rv->reg_addr, rv->clear_bits,
+						  rv->set_bits);
+	}
+	return err;
+}
+
+static void ael100x_txon(struct cphy *phy)
+{
+	int tx_on_gpio =
+		phy->mdio.prtad == 0 ? F_GPIO7_OUT_VAL : F_GPIO2_OUT_VAL;
+
+	msleep(100);
+	t3_set_reg_field(phy->adapter, A_T3DBG_GPIO_EN, 0, tx_on_gpio);
+	msleep(30);
+}
+
+/*
+ * Read an 8-bit word from a device attached to the PHY's i2c bus.
+ */
+static int ael_i2c_rd(struct cphy *phy, int dev_addr, int word_addr)
+{
+	int i, err;
+	unsigned int stat, data;
+
+	err = t3_mdio_write(phy, MDIO_MMD_PMAPMD, AEL_I2C_CTRL,
+			    (dev_addr << 8) | (1 << 8) | word_addr);
+	if (err)
+		return err;
+
+	for (i = 0; i < 200; i++) {
+		msleep(1);
+		err = t3_mdio_read(phy, MDIO_MMD_PMAPMD, AEL_I2C_STAT, &stat);
+		if (err)
+			return err;
+		if ((stat & 3) == 1) {
+			err = t3_mdio_read(phy, MDIO_MMD_PMAPMD, AEL_I2C_DATA,
+					   &data);
+			if (err)
+				return err;
+			return data >> 8;
+		}
+	}
+	CH_WARN(phy->adapter, "PHY %u i2c read of dev.addr %#x.%#x timed out\n",
+		phy->mdio.prtad, dev_addr, word_addr);
+	return -ETIMEDOUT;
+}
+
+static int ael1002_power_down(struct cphy *phy, int enable)
+{
+	int err;
+
+	err = t3_mdio_write(phy, MDIO_MMD_PMAPMD, MDIO_PMA_TXDIS, !!enable);
+	if (!err)
+		err = mdio_set_flag(&phy->mdio, phy->mdio.prtad,
+				    MDIO_MMD_PMAPMD, MDIO_CTRL1,
+				    MDIO_CTRL1_LPOWER, enable);
+	return err;
+}
+
+static int ael1002_reset(struct cphy *phy, int wait)
+{
+	int err;
+
+	if ((err = ael1002_power_down(phy, 0)) ||
+	    (err = t3_mdio_write(phy, MDIO_MMD_PMAPMD, AEL100X_TX_CONFIG1, 1)) ||
+	    (err = t3_mdio_write(phy, MDIO_MMD_PMAPMD, AEL1002_PWR_DOWN_HI, 0)) ||
+	    (err = t3_mdio_write(phy, MDIO_MMD_PMAPMD, AEL1002_PWR_DOWN_LO, 0)) ||
+	    (err = t3_mdio_write(phy, MDIO_MMD_PMAPMD, AEL1002_XFI_EQL, 0x18)) ||
+	    (err = t3_mdio_change_bits(phy, MDIO_MMD_PMAPMD, AEL1002_LB_EN,
+				       0, 1 << 5)))
+		return err;
+	return 0;
+}
+
+static int ael1002_intr_noop(struct cphy *phy)
+{
+	return 0;
+}
+
+/*
+ * Get link status for a 10GBASE-R device.
+ */
+static int get_link_status_r(struct cphy *phy, int *link_ok, int *speed,
+			     int *duplex, int *fc)
+{
+	if (link_ok) {
+		unsigned int stat0, stat1, stat2;
+		int err = t3_mdio_read(phy, MDIO_MMD_PMAPMD,
+				       MDIO_PMA_RXDET, &stat0);
+
+		if (!err)
+			err = t3_mdio_read(phy, MDIO_MMD_PCS,
+					   MDIO_PCS_10GBRT_STAT1, &stat1);
+		if (!err)
+			err = t3_mdio_read(phy, MDIO_MMD_PHYXS,
+					   MDIO_PHYXS_LNSTAT, &stat2);
+		if (err)
+			return err;
+		*link_ok = (stat0 & stat1 & (stat2 >> 12)) & 1;
+	}
+	if (speed)
+		*speed = SPEED_10000;
+	if (duplex)
+		*duplex = DUPLEX_FULL;
+	return 0;
+}
+
+static struct cphy_ops ael1002_ops = {
+	.reset = ael1002_reset,
+	.intr_enable = ael1002_intr_noop,
+	.intr_disable = ael1002_intr_noop,
+	.intr_clear = ael1002_intr_noop,
+	.intr_handler = ael1002_intr_noop,
+	.get_link_status = get_link_status_r,
+	.power_down = ael1002_power_down,
+	.mmds = MDIO_DEVS_PMAPMD | MDIO_DEVS_PCS | MDIO_DEVS_PHYXS,
+};
+
+int t3_ael1002_phy_prep(struct cphy *phy, struct adapter *adapter,
+			int phy_addr, const struct mdio_ops *mdio_ops)
+{
+	cphy_init(phy, adapter, phy_addr, &ael1002_ops, mdio_ops,
+		  SUPPORTED_10000baseT_Full | SUPPORTED_AUI | SUPPORTED_FIBRE,
+		   "10GBASE-R");
+	ael100x_txon(phy);
+	return 0;
+}
+
+static int ael1006_reset(struct cphy *phy, int wait)
+{
+	return t3_phy_reset(phy, MDIO_MMD_PMAPMD, wait);
+}
+
+static struct cphy_ops ael1006_ops = {
+	.reset = ael1006_reset,
+	.intr_enable = t3_phy_lasi_intr_enable,
+	.intr_disable = t3_phy_lasi_intr_disable,
+	.intr_clear = t3_phy_lasi_intr_clear,
+	.intr_handler = t3_phy_lasi_intr_handler,
+	.get_link_status = get_link_status_r,
+	.power_down = ael1002_power_down,
+	.mmds = MDIO_DEVS_PMAPMD | MDIO_DEVS_PCS | MDIO_DEVS_PHYXS,
+};
+
+int t3_ael1006_phy_prep(struct cphy *phy, struct adapter *adapter,
+			     int phy_addr, const struct mdio_ops *mdio_ops)
+{
+	cphy_init(phy, adapter, phy_addr, &ael1006_ops, mdio_ops,
+		  SUPPORTED_10000baseT_Full | SUPPORTED_AUI | SUPPORTED_FIBRE,
+		   "10GBASE-SR");
+	ael100x_txon(phy);
+	return 0;
+}
+
+/*
+ * Decode our module type.
+ */
+static int ael2xxx_get_module_type(struct cphy *phy, int delay_ms)
+{
+	int v;
+
+	if (delay_ms)
+		msleep(delay_ms);
+
+	/* see SFF-8472 for below */
+	v = ael_i2c_rd(phy, MODULE_DEV_ADDR, 3);
+	if (v < 0)
+		return v;
+
+	if (v == 0x10)
+		return phy_modtype_sr;
+	if (v == 0x20)
+		return phy_modtype_lr;
+	if (v == 0x40)
+		return phy_modtype_lrm;
+
+	v = ael_i2c_rd(phy, MODULE_DEV_ADDR, 6);
+	if (v < 0)
+		return v;
+	if (v != 4)
+		goto unknown;
+
+	v = ael_i2c_rd(phy, MODULE_DEV_ADDR, 10);
+	if (v < 0)
+		return v;
+
+	if (v & 0x80) {
+		v = ael_i2c_rd(phy, MODULE_DEV_ADDR, 0x12);
+		if (v < 0)
+			return v;
+		return v > 10 ? phy_modtype_twinax_long : phy_modtype_twinax;
+	}
+unknown:
+	return phy_modtype_unknown;
+}
+
+/*
+ * Code to support the Aeluros/NetLogic 2005 10Gb PHY.
+ */
+static int ael2005_setup_sr_edc(struct cphy *phy)
+{
+	static const struct reg_val regs[] = {
+		{ MDIO_MMD_PMAPMD, 0xc003, 0xffff, 0x181 },
+		{ MDIO_MMD_PMAPMD, 0xc010, 0xffff, 0x448a },
+		{ MDIO_MMD_PMAPMD, 0xc04a, 0xffff, 0x5200 },
+		{ 0, 0, 0, 0 }
+	};
+
+	int i, err;
+
+	err = set_phy_regs(phy, regs);
+	if (err)
+		return err;
+
+	msleep(50);
+
+	if (phy->priv != edc_sr)
+		err = t3_get_edc_fw(phy, EDC_OPT_AEL2005,
+				    EDC_OPT_AEL2005_SIZE);
+	if (err)
+		return err;
+
+	for (i = 0; i <  EDC_OPT_AEL2005_SIZE / sizeof(u16) && !err; i += 2)
+		err = t3_mdio_write(phy, MDIO_MMD_PMAPMD,
+				    phy->phy_cache[i],
+				    phy->phy_cache[i + 1]);
+	if (!err)
+		phy->priv = edc_sr;
+	return err;
+}
+
+static int ael2005_setup_twinax_edc(struct cphy *phy, int modtype)
+{
+	static const struct reg_val regs[] = {
+		{ MDIO_MMD_PMAPMD, 0xc04a, 0xffff, 0x5a00 },
+		{ 0, 0, 0, 0 }
+	};
+	static const struct reg_val preemphasis[] = {
+		{ MDIO_MMD_PMAPMD, 0xc014, 0xffff, 0xfe16 },
+		{ MDIO_MMD_PMAPMD, 0xc015, 0xffff, 0xa000 },
+		{ 0, 0, 0, 0 }
+	};
+	int i, err;
+
+	err = set_phy_regs(phy, regs);
+	if (!err && modtype == phy_modtype_twinax_long)
+		err = set_phy_regs(phy, preemphasis);
+	if (err)
+		return err;
+
+	msleep(50);
+
+	if (phy->priv != edc_twinax)
+		err = t3_get_edc_fw(phy, EDC_TWX_AEL2005,
+				    EDC_TWX_AEL2005_SIZE);
+	if (err)
+		return err;
+
+	for (i = 0; i <  EDC_TWX_AEL2005_SIZE / sizeof(u16) && !err; i += 2)
+		err = t3_mdio_write(phy, MDIO_MMD_PMAPMD,
+				    phy->phy_cache[i],
+				    phy->phy_cache[i + 1]);
+	if (!err)
+		phy->priv = edc_twinax;
+	return err;
+}
+
+static int ael2005_get_module_type(struct cphy *phy, int delay_ms)
+{
+	int v;
+	unsigned int stat;
+
+	v = t3_mdio_read(phy, MDIO_MMD_PMAPMD, AEL2005_GPIO_CTRL, &stat);
+	if (v)
+		return v;
+
+	if (stat & (1 << 8))			/* module absent */
+		return phy_modtype_none;
+
+	return ael2xxx_get_module_type(phy, delay_ms);
+}
+
+static int ael2005_intr_enable(struct cphy *phy)
+{
+	int err = t3_mdio_write(phy, MDIO_MMD_PMAPMD, AEL2005_GPIO_CTRL, 0x200);
+	return err ? err : t3_phy_lasi_intr_enable(phy);
+}
+
+static int ael2005_intr_disable(struct cphy *phy)
+{
+	int err = t3_mdio_write(phy, MDIO_MMD_PMAPMD, AEL2005_GPIO_CTRL, 0x100);
+	return err ? err : t3_phy_lasi_intr_disable(phy);
+}
+
+static int ael2005_intr_clear(struct cphy *phy)
+{
+	int err = t3_mdio_write(phy, MDIO_MMD_PMAPMD, AEL2005_GPIO_CTRL, 0xd00);
+	return err ? err : t3_phy_lasi_intr_clear(phy);
+}
+
+static int ael2005_reset(struct cphy *phy, int wait)
+{
+	static const struct reg_val regs0[] = {
+		{ MDIO_MMD_PMAPMD, 0xc001, 0, 1 << 5 },
+		{ MDIO_MMD_PMAPMD, 0xc017, 0, 1 << 5 },
+		{ MDIO_MMD_PMAPMD, 0xc013, 0xffff, 0xf341 },
+		{ MDIO_MMD_PMAPMD, 0xc210, 0xffff, 0x8000 },
+		{ MDIO_MMD_PMAPMD, 0xc210, 0xffff, 0x8100 },
+		{ MDIO_MMD_PMAPMD, 0xc210, 0xffff, 0x8000 },
+		{ MDIO_MMD_PMAPMD, 0xc210, 0xffff, 0 },
+		{ 0, 0, 0, 0 }
+	};
+	static const struct reg_val regs1[] = {
+		{ MDIO_MMD_PMAPMD, 0xca00, 0xffff, 0x0080 },
+		{ MDIO_MMD_PMAPMD, 0xca12, 0xffff, 0 },
+		{ 0, 0, 0, 0 }
+	};
+
+	int err;
+	unsigned int lasi_ctrl;
+
+	err = t3_mdio_read(phy, MDIO_MMD_PMAPMD, MDIO_PMA_LASI_CTRL,
+			   &lasi_ctrl);
+	if (err)
+		return err;
+
+	err = t3_phy_reset(phy, MDIO_MMD_PMAPMD, 0);
+	if (err)
+		return err;
+
+	msleep(125);
+	phy->priv = edc_none;
+	err = set_phy_regs(phy, regs0);
+	if (err)
+		return err;
+
+	msleep(50);
+
+	err = ael2005_get_module_type(phy, 0);
+	if (err < 0)
+		return err;
+	phy->modtype = err;
+
+	if (err == phy_modtype_twinax || err == phy_modtype_twinax_long)
+		err = ael2005_setup_twinax_edc(phy, err);
+	else
+		err = ael2005_setup_sr_edc(phy);
+	if (err)
+		return err;
+
+	err = set_phy_regs(phy, regs1);
+	if (err)
+		return err;
+
+	/* reset wipes out interrupts, reenable them if they were on */
+	if (lasi_ctrl & 1)
+		err = ael2005_intr_enable(phy);
+	return err;
+}
+
+static int ael2005_intr_handler(struct cphy *phy)
+{
+	unsigned int stat;
+	int ret, edc_needed, cause = 0;
+
+	ret = t3_mdio_read(phy, MDIO_MMD_PMAPMD, AEL2005_GPIO_STAT, &stat);
+	if (ret)
+		return ret;
+
+	if (stat & AEL2005_MODDET_IRQ) {
+		ret = t3_mdio_write(phy, MDIO_MMD_PMAPMD, AEL2005_GPIO_CTRL,
+				    0xd00);
+		if (ret)
+			return ret;
+
+		/* modules have max 300 ms init time after hot plug */
+		ret = ael2005_get_module_type(phy, 300);
+		if (ret < 0)
+			return ret;
+
+		phy->modtype = ret;
+		if (ret == phy_modtype_none)
+			edc_needed = phy->priv;       /* on unplug retain EDC */
+		else if (ret == phy_modtype_twinax ||
+			 ret == phy_modtype_twinax_long)
+			edc_needed = edc_twinax;
+		else
+			edc_needed = edc_sr;
+
+		if (edc_needed != phy->priv) {
+			ret = ael2005_reset(phy, 0);
+			return ret ? ret : cphy_cause_module_change;
+		}
+		cause = cphy_cause_module_change;
+	}
+
+	ret = t3_phy_lasi_intr_handler(phy);
+	if (ret < 0)
+		return ret;
+
+	ret |= cause;
+	return ret ? ret : cphy_cause_link_change;
+}
+
+static struct cphy_ops ael2005_ops = {
+	.reset           = ael2005_reset,
+	.intr_enable     = ael2005_intr_enable,
+	.intr_disable    = ael2005_intr_disable,
+	.intr_clear      = ael2005_intr_clear,
+	.intr_handler    = ael2005_intr_handler,
+	.get_link_status = get_link_status_r,
+	.power_down      = ael1002_power_down,
+	.mmds            = MDIO_DEVS_PMAPMD | MDIO_DEVS_PCS | MDIO_DEVS_PHYXS,
+};
+
+int t3_ael2005_phy_prep(struct cphy *phy, struct adapter *adapter,
+			int phy_addr, const struct mdio_ops *mdio_ops)
+{
+	cphy_init(phy, adapter, phy_addr, &ael2005_ops, mdio_ops,
+		  SUPPORTED_10000baseT_Full | SUPPORTED_AUI | SUPPORTED_FIBRE |
+		  SUPPORTED_IRQ, "10GBASE-R");
+	msleep(125);
+	return t3_mdio_change_bits(phy, MDIO_MMD_PMAPMD, AEL_OPT_SETTINGS, 0,
+				   1 << 5);
+}
+
+/*
+ * Setup EDC and other parameters for operation with an optical module.
+ */
+static int ael2020_setup_sr_edc(struct cphy *phy)
+{
+	static const struct reg_val regs[] = {
+		/* set CDR offset to 10 */
+		{ MDIO_MMD_PMAPMD, 0xcc01, 0xffff, 0x488a },
+
+		/* adjust 10G RX bias current */
+		{ MDIO_MMD_PMAPMD, 0xcb1b, 0xffff, 0x0200 },
+		{ MDIO_MMD_PMAPMD, 0xcb1c, 0xffff, 0x00f0 },
+		{ MDIO_MMD_PMAPMD, 0xcc06, 0xffff, 0x00e0 },
+
+		/* end */
+		{ 0, 0, 0, 0 }
+	};
+	int err;
+
+	err = set_phy_regs(phy, regs);
+	msleep(50);
+	if (err)
+		return err;
+
+	phy->priv = edc_sr;
+	return 0;
+}
+
+/*
+ * Setup EDC and other parameters for operation with an TWINAX module.
+ */
+static int ael2020_setup_twinax_edc(struct cphy *phy, int modtype)
+{
+	/* set uC to 40MHz */
+	static const struct reg_val uCclock40MHz[] = {
+		{ MDIO_MMD_PMAPMD, 0xff28, 0xffff, 0x4001 },
+		{ MDIO_MMD_PMAPMD, 0xff2a, 0xffff, 0x0002 },
+		{ 0, 0, 0, 0 }
+	};
+
+	/* activate uC clock */
+	static const struct reg_val uCclockActivate[] = {
+		{ MDIO_MMD_PMAPMD, 0xd000, 0xffff, 0x5200 },
+		{ 0, 0, 0, 0 }
+	};
+
+	/* set PC to start of SRAM and activate uC */
+	static const struct reg_val uCactivate[] = {
+		{ MDIO_MMD_PMAPMD, 0xd080, 0xffff, 0x0100 },
+		{ MDIO_MMD_PMAPMD, 0xd092, 0xffff, 0x0000 },
+		{ 0, 0, 0, 0 }
+	};
+	int i, err;
+
+	/* set uC clock and activate it */
+	err = set_phy_regs(phy, uCclock40MHz);
+	msleep(500);
+	if (err)
+		return err;
+	err = set_phy_regs(phy, uCclockActivate);
+	msleep(500);
+	if (err)
+		return err;
+
+	if (phy->priv != edc_twinax)
+		err = t3_get_edc_fw(phy, EDC_TWX_AEL2020,
+				    EDC_TWX_AEL2020_SIZE);
+	if (err)
+		return err;
+
+	for (i = 0; i <  EDC_TWX_AEL2020_SIZE / sizeof(u16) && !err; i += 2)
+		err = t3_mdio_write(phy, MDIO_MMD_PMAPMD,
+				    phy->phy_cache[i],
+				    phy->phy_cache[i + 1]);
+	/* activate uC */
+	err = set_phy_regs(phy, uCactivate);
+	if (!err)
+		phy->priv = edc_twinax;
+	return err;
+}
+
+/*
+ * Return Module Type.
+ */
+static int ael2020_get_module_type(struct cphy *phy, int delay_ms)
+{
+	int v;
+	unsigned int stat;
+
+	v = t3_mdio_read(phy, MDIO_MMD_PMAPMD, AEL2020_GPIO_STAT, &stat);
+	if (v)
+		return v;
+
+	if (stat & (0x1 << (AEL2020_GPIO_MODDET*4))) {
+		/* module absent */
+		return phy_modtype_none;
+	}
+
+	return ael2xxx_get_module_type(phy, delay_ms);
+}
+
+/*
+ * Enable PHY interrupts.  We enable "Module Detection" interrupts (on any
+ * state transition) and then generic Link Alarm Status Interrupt (LASI).
+ */
+static int ael2020_intr_enable(struct cphy *phy)
+{
+	static const struct reg_val regs[] = {
+		/* output Module's Loss Of Signal (LOS) to LED */
+		{ MDIO_MMD_PMAPMD, AEL2020_GPIO_CFG+AEL2020_GPIO_LSTAT,
+			0xffff, 0x4 },
+		{ MDIO_MMD_PMAPMD, AEL2020_GPIO_CTRL,
+			0xffff, 0x8 << (AEL2020_GPIO_LSTAT*4) },
+
+		 /* enable module detect status change interrupts */
+		{ MDIO_MMD_PMAPMD, AEL2020_GPIO_CTRL,
+			0xffff, 0x2 << (AEL2020_GPIO_MODDET*4) },
+
+		/* end */
+		{ 0, 0, 0, 0 }
+	};
+	int err, link_ok = 0;
+
+	/* set up "link status" LED and enable module change interrupts */
+	err = set_phy_regs(phy, regs);
+	if (err)
+		return err;
+
+	err = get_link_status_r(phy, &link_ok, NULL, NULL, NULL);
+	if (err)
+		return err;
+	if (link_ok)
+		t3_link_changed(phy->adapter,
+				phy2portid(phy));
+
+	err = t3_phy_lasi_intr_enable(phy);
+	if (err)
+		return err;
+
+	return 0;
+}
+
+/*
+ * Disable PHY interrupts.  The mirror of the above ...
+ */
+static int ael2020_intr_disable(struct cphy *phy)
+{
+	static const struct reg_val regs[] = {
+		/* reset "link status" LED to "off" */
+		{ MDIO_MMD_PMAPMD, AEL2020_GPIO_CTRL,
+			0xffff, 0xb << (AEL2020_GPIO_LSTAT*4) },
+
+		/* disable module detect status change interrupts */
+		{ MDIO_MMD_PMAPMD, AEL2020_GPIO_CTRL,
+			0xffff, 0x1 << (AEL2020_GPIO_MODDET*4) },
+
+		/* end */
+		{ 0, 0, 0, 0 }
+	};
+	int err;
+
+	/* turn off "link status" LED and disable module change interrupts */
+	err = set_phy_regs(phy, regs);
+	if (err)
+		return err;
+
+	return t3_phy_lasi_intr_disable(phy);
+}
+
+/*
+ * Clear PHY interrupt state.
+ */
+static int ael2020_intr_clear(struct cphy *phy)
+{
+	/*
+	 * The GPIO Interrupt register on the AEL2020 is a "Latching High"
+	 * (LH) register which is cleared to the current state when it's read.
+	 * Thus, we simply read the register and discard the result.
+	 */
+	unsigned int stat;
+	int err = t3_mdio_read(phy, MDIO_MMD_PMAPMD, AEL2020_GPIO_INTR, &stat);
+	return err ? err : t3_phy_lasi_intr_clear(phy);
+}
+
+static const struct reg_val ael2020_reset_regs[] = {
+	/* Erratum #2: CDRLOL asserted, causing PMA link down status */
+	{ MDIO_MMD_PMAPMD, 0xc003, 0xffff, 0x3101 },
+
+	/* force XAUI to send LF when RX_LOS is asserted */
+	{ MDIO_MMD_PMAPMD, 0xcd40, 0xffff, 0x0001 },
+
+	/* allow writes to transceiver module EEPROM on i2c bus */
+	{ MDIO_MMD_PMAPMD, 0xff02, 0xffff, 0x0023 },
+	{ MDIO_MMD_PMAPMD, 0xff03, 0xffff, 0x0000 },
+	{ MDIO_MMD_PMAPMD, 0xff04, 0xffff, 0x0000 },
+
+	/* end */
+	{ 0, 0, 0, 0 }
+};
+/*
+ * Reset the PHY and put it into a canonical operating state.
+ */
+static int ael2020_reset(struct cphy *phy, int wait)
+{
+	int err;
+	unsigned int lasi_ctrl;
+
+	/* grab current interrupt state */
+	err = t3_mdio_read(phy, MDIO_MMD_PMAPMD, MDIO_PMA_LASI_CTRL,
+			   &lasi_ctrl);
+	if (err)
+		return err;
+
+	err = t3_phy_reset(phy, MDIO_MMD_PMAPMD, 125);
+	if (err)
+		return err;
+	msleep(100);
+
+	/* basic initialization for all module types */
+	phy->priv = edc_none;
+	err = set_phy_regs(phy, ael2020_reset_regs);
+	if (err)
+		return err;
+
+	/* determine module type and perform appropriate initialization */
+	err = ael2020_get_module_type(phy, 0);
+	if (err < 0)
+		return err;
+	phy->modtype = (u8)err;
+	if (err == phy_modtype_twinax || err == phy_modtype_twinax_long)
+		err = ael2020_setup_twinax_edc(phy, err);
+	else
+		err = ael2020_setup_sr_edc(phy);
+	if (err)
+		return err;
+
+	/* reset wipes out interrupts, reenable them if they were on */
+	if (lasi_ctrl & 1)
+		err = ael2005_intr_enable(phy);
+	return err;
+}
+
+/*
+ * Handle a PHY interrupt.
+ */
+static int ael2020_intr_handler(struct cphy *phy)
+{
+	unsigned int stat;
+	int ret, edc_needed, cause = 0;
+
+	ret = t3_mdio_read(phy, MDIO_MMD_PMAPMD, AEL2020_GPIO_INTR, &stat);
+	if (ret)
+		return ret;
+
+	if (stat & (0x1 << AEL2020_GPIO_MODDET)) {
+		/* modules have max 300 ms init time after hot plug */
+		ret = ael2020_get_module_type(phy, 300);
+		if (ret < 0)
+			return ret;
+
+		phy->modtype = (u8)ret;
+		if (ret == phy_modtype_none)
+			edc_needed = phy->priv;       /* on unplug retain EDC */
+		else if (ret == phy_modtype_twinax ||
+			 ret == phy_modtype_twinax_long)
+			edc_needed = edc_twinax;
+		else
+			edc_needed = edc_sr;
+
+		if (edc_needed != phy->priv) {
+			ret = ael2020_reset(phy, 0);
+			return ret ? ret : cphy_cause_module_change;
+		}
+		cause = cphy_cause_module_change;
+	}
+
+	ret = t3_phy_lasi_intr_handler(phy);
+	if (ret < 0)
+		return ret;
+
+	ret |= cause;
+	return ret ? ret : cphy_cause_link_change;
+}
+
+static struct cphy_ops ael2020_ops = {
+	.reset           = ael2020_reset,
+	.intr_enable     = ael2020_intr_enable,
+	.intr_disable    = ael2020_intr_disable,
+	.intr_clear      = ael2020_intr_clear,
+	.intr_handler    = ael2020_intr_handler,
+	.get_link_status = get_link_status_r,
+	.power_down      = ael1002_power_down,
+	.mmds		 = MDIO_DEVS_PMAPMD | MDIO_DEVS_PCS | MDIO_DEVS_PHYXS,
+};
+
+int t3_ael2020_phy_prep(struct cphy *phy, struct adapter *adapter, int phy_addr,
+			const struct mdio_ops *mdio_ops)
+{
+	int err;
+
+	cphy_init(phy, adapter, phy_addr, &ael2020_ops, mdio_ops,
+		  SUPPORTED_10000baseT_Full | SUPPORTED_AUI | SUPPORTED_FIBRE |
+		  SUPPORTED_IRQ, "10GBASE-R");
+	msleep(125);
+
+	err = set_phy_regs(phy, ael2020_reset_regs);
+	if (err)
+		return err;
+	return 0;
+}
+
+/*
+ * Get link status for a 10GBASE-X device.
+ */
+static int get_link_status_x(struct cphy *phy, int *link_ok, int *speed,
+			     int *duplex, int *fc)
+{
+	if (link_ok) {
+		unsigned int stat0, stat1, stat2;
+		int err = t3_mdio_read(phy, MDIO_MMD_PMAPMD,
+				       MDIO_PMA_RXDET, &stat0);
+
+		if (!err)
+			err = t3_mdio_read(phy, MDIO_MMD_PCS,
+					   MDIO_PCS_10GBX_STAT1, &stat1);
+		if (!err)
+			err = t3_mdio_read(phy, MDIO_MMD_PHYXS,
+					   MDIO_PHYXS_LNSTAT, &stat2);
+		if (err)
+			return err;
+		*link_ok = (stat0 & (stat1 >> 12) & (stat2 >> 12)) & 1;
+	}
+	if (speed)
+		*speed = SPEED_10000;
+	if (duplex)
+		*duplex = DUPLEX_FULL;
+	return 0;
+}
+
+static struct cphy_ops qt2045_ops = {
+	.reset = ael1006_reset,
+	.intr_enable = t3_phy_lasi_intr_enable,
+	.intr_disable = t3_phy_lasi_intr_disable,
+	.intr_clear = t3_phy_lasi_intr_clear,
+	.intr_handler = t3_phy_lasi_intr_handler,
+	.get_link_status = get_link_status_x,
+	.power_down = ael1002_power_down,
+	.mmds = MDIO_DEVS_PMAPMD | MDIO_DEVS_PCS | MDIO_DEVS_PHYXS,
+};
+
+int t3_qt2045_phy_prep(struct cphy *phy, struct adapter *adapter,
+		       int phy_addr, const struct mdio_ops *mdio_ops)
+{
+	unsigned int stat;
+
+	cphy_init(phy, adapter, phy_addr, &qt2045_ops, mdio_ops,
+		  SUPPORTED_10000baseT_Full | SUPPORTED_AUI | SUPPORTED_TP,
+		  "10GBASE-CX4");
+
+	/*
+	 * Some cards where the PHY is supposed to be at address 0 actually
+	 * have it at 1.
+	 */
+	if (!phy_addr &&
+	    !t3_mdio_read(phy, MDIO_MMD_PMAPMD, MDIO_STAT1, &stat) &&
+	    stat == 0xffff)
+		phy->mdio.prtad = 1;
+	return 0;
+}
+
+static int xaui_direct_reset(struct cphy *phy, int wait)
+{
+	return 0;
+}
+
+static int xaui_direct_get_link_status(struct cphy *phy, int *link_ok,
+				       int *speed, int *duplex, int *fc)
+{
+	if (link_ok) {
+		unsigned int status;
+		int prtad = phy->mdio.prtad;
+
+		status = t3_read_reg(phy->adapter,
+				     XGM_REG(A_XGM_SERDES_STAT0, prtad)) |
+		    t3_read_reg(phy->adapter,
+				    XGM_REG(A_XGM_SERDES_STAT1, prtad)) |
+		    t3_read_reg(phy->adapter,
+				XGM_REG(A_XGM_SERDES_STAT2, prtad)) |
+		    t3_read_reg(phy->adapter,
+				XGM_REG(A_XGM_SERDES_STAT3, prtad));
+		*link_ok = !(status & F_LOWSIG0);
+	}
+	if (speed)
+		*speed = SPEED_10000;
+	if (duplex)
+		*duplex = DUPLEX_FULL;
+	return 0;
+}
+
+static int xaui_direct_power_down(struct cphy *phy, int enable)
+{
+	return 0;
+}
+
+static struct cphy_ops xaui_direct_ops = {
+	.reset = xaui_direct_reset,
+	.intr_enable = ael1002_intr_noop,
+	.intr_disable = ael1002_intr_noop,
+	.intr_clear = ael1002_intr_noop,
+	.intr_handler = ael1002_intr_noop,
+	.get_link_status = xaui_direct_get_link_status,
+	.power_down = xaui_direct_power_down,
+};
+
+int t3_xaui_direct_phy_prep(struct cphy *phy, struct adapter *adapter,
+			    int phy_addr, const struct mdio_ops *mdio_ops)
+{
+	cphy_init(phy, adapter, phy_addr, &xaui_direct_ops, mdio_ops,
+		  SUPPORTED_10000baseT_Full | SUPPORTED_AUI | SUPPORTED_TP,
+		  "10GBASE-CX4");
+	return 0;
+}
diff --git a/drivers/net/ethernet/chelsio/cxgb3/aq100x.c b/drivers/net/ethernet/chelsio/cxgb3/aq100x.c
new file mode 100644
index 0000000..341b7ef
--- /dev/null
+++ b/drivers/net/ethernet/chelsio/cxgb3/aq100x.c
@@ -0,0 +1,354 @@
+/*
+ * Copyright (c) 2005-2008 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "common.h"
+#include "regs.h"
+
+enum {
+	/* MDIO_DEV_PMA_PMD registers */
+	AQ_LINK_STAT	= 0xe800,
+	AQ_IMASK_PMA	= 0xf000,
+
+	/* MDIO_DEV_XGXS registers */
+	AQ_XAUI_RX_CFG	= 0xc400,
+	AQ_XAUI_TX_CFG	= 0xe400,
+
+	/* MDIO_DEV_ANEG registers */
+	AQ_1G_CTRL	= 0xc400,
+	AQ_ANEG_STAT	= 0xc800,
+
+	/* MDIO_DEV_VEND1 registers */
+	AQ_FW_VERSION	= 0x0020,
+	AQ_IFLAG_GLOBAL	= 0xfc00,
+	AQ_IMASK_GLOBAL	= 0xff00,
+};
+
+enum {
+	IMASK_PMA	= 1 << 2,
+	IMASK_GLOBAL	= 1 << 15,
+	ADV_1G_FULL	= 1 << 15,
+	ADV_1G_HALF	= 1 << 14,
+	ADV_10G_FULL	= 1 << 12,
+	AQ_RESET	= (1 << 14) | (1 << 15),
+	AQ_LOWPOWER	= 1 << 12,
+};
+
+static int aq100x_reset(struct cphy *phy, int wait)
+{
+	/*
+	 * Ignore the caller specified wait time; always wait for the reset to
+	 * complete. Can take up to 3s.
+	 */
+	int err = t3_phy_reset(phy, MDIO_MMD_VEND1, 3000);
+
+	if (err)
+		CH_WARN(phy->adapter, "PHY%d: reset failed (0x%x).\n",
+			phy->mdio.prtad, err);
+
+	return err;
+}
+
+static int aq100x_intr_enable(struct cphy *phy)
+{
+	int err = t3_mdio_write(phy, MDIO_MMD_PMAPMD, AQ_IMASK_PMA, IMASK_PMA);
+	if (err)
+		return err;
+
+	err = t3_mdio_write(phy, MDIO_MMD_VEND1, AQ_IMASK_GLOBAL, IMASK_GLOBAL);
+	return err;
+}
+
+static int aq100x_intr_disable(struct cphy *phy)
+{
+	return t3_mdio_write(phy, MDIO_MMD_VEND1, AQ_IMASK_GLOBAL, 0);
+}
+
+static int aq100x_intr_clear(struct cphy *phy)
+{
+	unsigned int v;
+
+	t3_mdio_read(phy, MDIO_MMD_VEND1, AQ_IFLAG_GLOBAL, &v);
+	t3_mdio_read(phy, MDIO_MMD_PMAPMD, MDIO_STAT1, &v);
+
+	return 0;
+}
+
+static int aq100x_intr_handler(struct cphy *phy)
+{
+	int err;
+	unsigned int cause, v;
+
+	err = t3_mdio_read(phy, MDIO_MMD_VEND1, AQ_IFLAG_GLOBAL, &cause);
+	if (err)
+		return err;
+
+	/* Read (and reset) the latching version of the status */
+	t3_mdio_read(phy, MDIO_MMD_PMAPMD, MDIO_STAT1, &v);
+
+	return cphy_cause_link_change;
+}
+
+static int aq100x_power_down(struct cphy *phy, int off)
+{
+	return mdio_set_flag(&phy->mdio, phy->mdio.prtad,
+			     MDIO_MMD_PMAPMD, MDIO_CTRL1,
+			     MDIO_CTRL1_LPOWER, off);
+}
+
+static int aq100x_autoneg_enable(struct cphy *phy)
+{
+	int err;
+
+	err = aq100x_power_down(phy, 0);
+	if (!err)
+		err = mdio_set_flag(&phy->mdio, phy->mdio.prtad,
+				    MDIO_MMD_AN, MDIO_CTRL1,
+				    BMCR_ANENABLE | BMCR_ANRESTART, 1);
+
+	return err;
+}
+
+static int aq100x_autoneg_restart(struct cphy *phy)
+{
+	int err;
+
+	err = aq100x_power_down(phy, 0);
+	if (!err)
+		err = mdio_set_flag(&phy->mdio, phy->mdio.prtad,
+				    MDIO_MMD_AN, MDIO_CTRL1,
+				    BMCR_ANENABLE | BMCR_ANRESTART, 1);
+
+	return err;
+}
+
+static int aq100x_advertise(struct cphy *phy, unsigned int advertise_map)
+{
+	unsigned int adv;
+	int err;
+
+	/* 10G advertisement */
+	adv = 0;
+	if (advertise_map & ADVERTISED_10000baseT_Full)
+		adv |= ADV_10G_FULL;
+	err = t3_mdio_change_bits(phy, MDIO_MMD_AN, MDIO_AN_10GBT_CTRL,
+				  ADV_10G_FULL, adv);
+	if (err)
+		return err;
+
+	/* 1G advertisement */
+	adv = 0;
+	if (advertise_map & ADVERTISED_1000baseT_Full)
+		adv |= ADV_1G_FULL;
+	if (advertise_map & ADVERTISED_1000baseT_Half)
+		adv |= ADV_1G_HALF;
+	err = t3_mdio_change_bits(phy, MDIO_MMD_AN, AQ_1G_CTRL,
+				  ADV_1G_FULL | ADV_1G_HALF, adv);
+	if (err)
+		return err;
+
+	/* 100M, pause advertisement */
+	adv = 0;
+	if (advertise_map & ADVERTISED_100baseT_Half)
+		adv |= ADVERTISE_100HALF;
+	if (advertise_map & ADVERTISED_100baseT_Full)
+		adv |= ADVERTISE_100FULL;
+	if (advertise_map & ADVERTISED_Pause)
+		adv |= ADVERTISE_PAUSE_CAP;
+	if (advertise_map & ADVERTISED_Asym_Pause)
+		adv |= ADVERTISE_PAUSE_ASYM;
+	err = t3_mdio_change_bits(phy, MDIO_MMD_AN, MDIO_AN_ADVERTISE,
+				  0xfe0, adv);
+
+	return err;
+}
+
+static int aq100x_set_loopback(struct cphy *phy, int mmd, int dir, int enable)
+{
+	return mdio_set_flag(&phy->mdio, phy->mdio.prtad,
+			     MDIO_MMD_PMAPMD, MDIO_CTRL1,
+			     BMCR_LOOPBACK, enable);
+}
+
+static int aq100x_set_speed_duplex(struct cphy *phy, int speed, int duplex)
+{
+	/* no can do */
+	return -1;
+}
+
+static int aq100x_get_link_status(struct cphy *phy, int *link_ok,
+				  int *speed, int *duplex, int *fc)
+{
+	int err;
+	unsigned int v;
+
+	if (link_ok) {
+		err = t3_mdio_read(phy, MDIO_MMD_PMAPMD, AQ_LINK_STAT, &v);
+		if (err)
+			return err;
+
+		*link_ok = v & 1;
+		if (!*link_ok)
+			return 0;
+	}
+
+	err = t3_mdio_read(phy, MDIO_MMD_AN, AQ_ANEG_STAT, &v);
+	if (err)
+		return err;
+
+	if (speed) {
+		switch (v & 0x6) {
+		case 0x6:
+			*speed = SPEED_10000;
+			break;
+		case 0x4:
+			*speed = SPEED_1000;
+			break;
+		case 0x2:
+			*speed = SPEED_100;
+			break;
+		case 0x0:
+			*speed = SPEED_10;
+			break;
+		}
+	}
+
+	if (duplex)
+		*duplex = v & 1 ? DUPLEX_FULL : DUPLEX_HALF;
+
+	return 0;
+}
+
+static struct cphy_ops aq100x_ops = {
+	.reset             = aq100x_reset,
+	.intr_enable       = aq100x_intr_enable,
+	.intr_disable      = aq100x_intr_disable,
+	.intr_clear        = aq100x_intr_clear,
+	.intr_handler      = aq100x_intr_handler,
+	.autoneg_enable    = aq100x_autoneg_enable,
+	.autoneg_restart   = aq100x_autoneg_restart,
+	.advertise         = aq100x_advertise,
+	.set_loopback      = aq100x_set_loopback,
+	.set_speed_duplex  = aq100x_set_speed_duplex,
+	.get_link_status   = aq100x_get_link_status,
+	.power_down        = aq100x_power_down,
+	.mmds 		   = MDIO_DEVS_PMAPMD | MDIO_DEVS_PCS | MDIO_DEVS_PHYXS,
+};
+
+int t3_aq100x_phy_prep(struct cphy *phy, struct adapter *adapter, int phy_addr,
+		       const struct mdio_ops *mdio_ops)
+{
+	unsigned int v, v2, gpio, wait;
+	int err;
+
+	cphy_init(phy, adapter, phy_addr, &aq100x_ops, mdio_ops,
+		  SUPPORTED_1000baseT_Full | SUPPORTED_10000baseT_Full |
+		  SUPPORTED_TP | SUPPORTED_Autoneg | SUPPORTED_AUI,
+		  "1000/10GBASE-T");
+
+	/*
+	 * The PHY has been out of reset ever since the system powered up.  So
+	 * we do a hard reset over here.
+	 */
+	gpio = phy_addr ? F_GPIO10_OUT_VAL : F_GPIO6_OUT_VAL;
+	t3_set_reg_field(adapter, A_T3DBG_GPIO_EN, gpio, 0);
+	msleep(1);
+	t3_set_reg_field(adapter, A_T3DBG_GPIO_EN, gpio, gpio);
+
+	/*
+	 * Give it enough time to load the firmware and get ready for mdio.
+	 */
+	msleep(1000);
+	wait = 500; /* in 10ms increments */
+	do {
+		err = t3_mdio_read(phy, MDIO_MMD_VEND1, MDIO_CTRL1, &v);
+		if (err || v == 0xffff) {
+
+			/* Allow prep_adapter to succeed when ffff is read */
+
+			CH_WARN(adapter, "PHY%d: reset failed (0x%x, 0x%x).\n",
+				phy_addr, err, v);
+			goto done;
+		}
+
+		v &= AQ_RESET;
+		if (v)
+			msleep(10);
+	} while (v && --wait);
+	if (v) {
+		CH_WARN(adapter, "PHY%d: reset timed out (0x%x).\n",
+			phy_addr, v);
+
+		goto done; /* let prep_adapter succeed */
+	}
+
+	/* Datasheet says 3s max but this has been observed */
+	wait = (500 - wait) * 10 + 1000;
+	if (wait > 3000)
+		CH_WARN(adapter, "PHY%d: reset took %ums\n", phy_addr, wait);
+
+	/* Firmware version check. */
+	t3_mdio_read(phy, MDIO_MMD_VEND1, AQ_FW_VERSION, &v);
+	if (v != 101)
+		CH_WARN(adapter, "PHY%d: unsupported firmware %d\n",
+			phy_addr, v);
+
+	/*
+	 * The PHY should start in really-low-power mode.  Prepare it for normal
+	 * operations.
+	 */
+	err = t3_mdio_read(phy, MDIO_MMD_VEND1, MDIO_CTRL1, &v);
+	if (err)
+		return err;
+	if (v & AQ_LOWPOWER) {
+		err = t3_mdio_change_bits(phy, MDIO_MMD_VEND1, MDIO_CTRL1,
+					  AQ_LOWPOWER, 0);
+		if (err)
+			return err;
+		msleep(10);
+	} else
+		CH_WARN(adapter, "PHY%d does not start in low power mode.\n",
+			phy_addr);
+
+	/*
+	 * Verify XAUI settings, but let prep succeed no matter what.
+	 */
+	v = v2 = 0;
+	t3_mdio_read(phy, MDIO_MMD_PHYXS, AQ_XAUI_RX_CFG, &v);
+	t3_mdio_read(phy, MDIO_MMD_PHYXS, AQ_XAUI_TX_CFG, &v2);
+	if (v != 0x1b || v2 != 0x1b)
+		CH_WARN(adapter,
+			"PHY%d: incorrect XAUI settings (0x%x, 0x%x).\n",
+			phy_addr, v, v2);
+
+done:
+	return err;
+}
diff --git a/drivers/net/ethernet/chelsio/cxgb3/common.h b/drivers/net/ethernet/chelsio/cxgb3/common.h
new file mode 100644
index 0000000..4424809
--- /dev/null
+++ b/drivers/net/ethernet/chelsio/cxgb3/common.h
@@ -0,0 +1,773 @@
+/*
+ * Copyright (c) 2005-2008 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __CHELSIO_COMMON_H
+#define __CHELSIO_COMMON_H
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/ctype.h>
+#include <linux/delay.h>
+#include <linux/netdevice.h>
+#include <linux/ethtool.h>
+#include <linux/mdio.h>
+#include "version.h"
+
+#define CH_ERR(adap, fmt, ...)   dev_err(&adap->pdev->dev, fmt, ##__VA_ARGS__)
+#define CH_WARN(adap, fmt, ...)  dev_warn(&adap->pdev->dev, fmt, ##__VA_ARGS__)
+#define CH_ALERT(adap, fmt, ...) dev_alert(&adap->pdev->dev, fmt, ##__VA_ARGS__)
+
+/*
+ * More powerful macro that selectively prints messages based on msg_enable.
+ * For info and debugging messages.
+ */
+#define CH_MSG(adapter, level, category, fmt, ...) do { \
+	if ((adapter)->msg_enable & NETIF_MSG_##category) \
+		dev_printk(KERN_##level, &adapter->pdev->dev, fmt, \
+			   ## __VA_ARGS__); \
+} while (0)
+
+#ifdef DEBUG
+# define CH_DBG(adapter, category, fmt, ...) \
+	CH_MSG(adapter, DEBUG, category, fmt, ## __VA_ARGS__)
+#else
+# define CH_DBG(adapter, category, fmt, ...)
+#endif
+
+/* Additional NETIF_MSG_* categories */
+#define NETIF_MSG_MMIO 0x8000000
+
+enum {
+	MAX_NPORTS = 2,		/* max # of ports */
+	MAX_FRAME_SIZE = 10240,	/* max MAC frame size, including header + FCS */
+	EEPROMSIZE = 8192,	/* Serial EEPROM size */
+	SERNUM_LEN     = 16,    /* Serial # length */
+	RSS_TABLE_SIZE = 64,	/* size of RSS lookup and mapping tables */
+	TCB_SIZE = 128,		/* TCB size */
+	NMTUS = 16,		/* size of MTU table */
+	NCCTRL_WIN = 32,	/* # of congestion control windows */
+	PROTO_SRAM_LINES = 128, /* size of TP sram */
+};
+
+#define MAX_RX_COALESCING_LEN 12288U
+
+enum {
+	PAUSE_RX = 1 << 0,
+	PAUSE_TX = 1 << 1,
+	PAUSE_AUTONEG = 1 << 2
+};
+
+enum {
+	SUPPORTED_IRQ      = 1 << 24
+};
+
+enum {				/* adapter interrupt-maintained statistics */
+	STAT_ULP_CH0_PBL_OOB,
+	STAT_ULP_CH1_PBL_OOB,
+	STAT_PCI_CORR_ECC,
+
+	IRQ_NUM_STATS		/* keep last */
+};
+
+#define TP_VERSION_MAJOR	1
+#define TP_VERSION_MINOR	1
+#define TP_VERSION_MICRO	0
+
+#define S_TP_VERSION_MAJOR		16
+#define M_TP_VERSION_MAJOR		0xFF
+#define V_TP_VERSION_MAJOR(x)		((x) << S_TP_VERSION_MAJOR)
+#define G_TP_VERSION_MAJOR(x)		\
+	    (((x) >> S_TP_VERSION_MAJOR) & M_TP_VERSION_MAJOR)
+
+#define S_TP_VERSION_MINOR		8
+#define M_TP_VERSION_MINOR		0xFF
+#define V_TP_VERSION_MINOR(x)		((x) << S_TP_VERSION_MINOR)
+#define G_TP_VERSION_MINOR(x)		\
+	    (((x) >> S_TP_VERSION_MINOR) & M_TP_VERSION_MINOR)
+
+#define S_TP_VERSION_MICRO		0
+#define M_TP_VERSION_MICRO		0xFF
+#define V_TP_VERSION_MICRO(x)		((x) << S_TP_VERSION_MICRO)
+#define G_TP_VERSION_MICRO(x)		\
+	    (((x) >> S_TP_VERSION_MICRO) & M_TP_VERSION_MICRO)
+
+enum {
+	SGE_QSETS = 8,		/* # of SGE Tx/Rx/RspQ sets */
+	SGE_RXQ_PER_SET = 2,	/* # of Rx queues per set */
+	SGE_TXQ_PER_SET = 3	/* # of Tx queues per set */
+};
+
+enum sge_context_type {		/* SGE egress context types */
+	SGE_CNTXT_RDMA = 0,
+	SGE_CNTXT_ETH = 2,
+	SGE_CNTXT_OFLD = 4,
+	SGE_CNTXT_CTRL = 5
+};
+
+enum {
+	AN_PKT_SIZE = 32,	/* async notification packet size */
+	IMMED_PKT_SIZE = 48	/* packet size for immediate data */
+};
+
+struct sg_ent {			/* SGE scatter/gather entry */
+	__be32 len[2];
+	__be64 addr[2];
+};
+
+#ifndef SGE_NUM_GENBITS
+/* Must be 1 or 2 */
+# define SGE_NUM_GENBITS 2
+#endif
+
+#define TX_DESC_FLITS 16U
+#define WR_FLITS (TX_DESC_FLITS + 1 - SGE_NUM_GENBITS)
+
+struct cphy;
+struct adapter;
+
+struct mdio_ops {
+	int (*read)(struct net_device *dev, int phy_addr, int mmd_addr,
+		    u16 reg_addr);
+	int (*write)(struct net_device *dev, int phy_addr, int mmd_addr,
+		     u16 reg_addr, u16 val);
+	unsigned mode_support;
+};
+
+struct adapter_info {
+	unsigned char nports0;        /* # of ports on channel 0 */
+	unsigned char nports1;        /* # of ports on channel 1 */
+	unsigned char phy_base_addr;	/* MDIO PHY base address */
+	unsigned int gpio_out;	/* GPIO output settings */
+	unsigned char gpio_intr[MAX_NPORTS]; /* GPIO PHY IRQ pins */
+	unsigned long caps;	/* adapter capabilities */
+	const struct mdio_ops *mdio_ops;	/* MDIO operations */
+	const char *desc;	/* product description */
+};
+
+struct mc5_stats {
+	unsigned long parity_err;
+	unsigned long active_rgn_full;
+	unsigned long nfa_srch_err;
+	unsigned long unknown_cmd;
+	unsigned long reqq_parity_err;
+	unsigned long dispq_parity_err;
+	unsigned long del_act_empty;
+};
+
+struct mc7_stats {
+	unsigned long corr_err;
+	unsigned long uncorr_err;
+	unsigned long parity_err;
+	unsigned long addr_err;
+};
+
+struct mac_stats {
+	u64 tx_octets;		/* total # of octets in good frames */
+	u64 tx_octets_bad;	/* total # of octets in error frames */
+	u64 tx_frames;		/* all good frames */
+	u64 tx_mcast_frames;	/* good multicast frames */
+	u64 tx_bcast_frames;	/* good broadcast frames */
+	u64 tx_pause;		/* # of transmitted pause frames */
+	u64 tx_deferred;	/* frames with deferred transmissions */
+	u64 tx_late_collisions;	/* # of late collisions */
+	u64 tx_total_collisions;	/* # of total collisions */
+	u64 tx_excess_collisions;	/* frame errors from excessive collissions */
+	u64 tx_underrun;	/* # of Tx FIFO underruns */
+	u64 tx_len_errs;	/* # of Tx length errors */
+	u64 tx_mac_internal_errs;	/* # of internal MAC errors on Tx */
+	u64 tx_excess_deferral;	/* # of frames with excessive deferral */
+	u64 tx_fcs_errs;	/* # of frames with bad FCS */
+
+	u64 tx_frames_64;	/* # of Tx frames in a particular range */
+	u64 tx_frames_65_127;
+	u64 tx_frames_128_255;
+	u64 tx_frames_256_511;
+	u64 tx_frames_512_1023;
+	u64 tx_frames_1024_1518;
+	u64 tx_frames_1519_max;
+
+	u64 rx_octets;		/* total # of octets in good frames */
+	u64 rx_octets_bad;	/* total # of octets in error frames */
+	u64 rx_frames;		/* all good frames */
+	u64 rx_mcast_frames;	/* good multicast frames */
+	u64 rx_bcast_frames;	/* good broadcast frames */
+	u64 rx_pause;		/* # of received pause frames */
+	u64 rx_fcs_errs;	/* # of received frames with bad FCS */
+	u64 rx_align_errs;	/* alignment errors */
+	u64 rx_symbol_errs;	/* symbol errors */
+	u64 rx_data_errs;	/* data errors */
+	u64 rx_sequence_errs;	/* sequence errors */
+	u64 rx_runt;		/* # of runt frames */
+	u64 rx_jabber;		/* # of jabber frames */
+	u64 rx_short;		/* # of short frames */
+	u64 rx_too_long;	/* # of oversized frames */
+	u64 rx_mac_internal_errs;	/* # of internal MAC errors on Rx */
+
+	u64 rx_frames_64;	/* # of Rx frames in a particular range */
+	u64 rx_frames_65_127;
+	u64 rx_frames_128_255;
+	u64 rx_frames_256_511;
+	u64 rx_frames_512_1023;
+	u64 rx_frames_1024_1518;
+	u64 rx_frames_1519_max;
+
+	u64 rx_cong_drops;	/* # of Rx drops due to SGE congestion */
+
+	unsigned long tx_fifo_parity_err;
+	unsigned long rx_fifo_parity_err;
+	unsigned long tx_fifo_urun;
+	unsigned long rx_fifo_ovfl;
+	unsigned long serdes_signal_loss;
+	unsigned long xaui_pcs_ctc_err;
+	unsigned long xaui_pcs_align_change;
+
+	unsigned long num_toggled; /* # times toggled TxEn due to stuck TX */
+	unsigned long num_resets;  /* # times reset due to stuck TX */
+
+	unsigned long link_faults;  /* # detected link faults */
+};
+
+struct tp_mib_stats {
+	u32 ipInReceive_hi;
+	u32 ipInReceive_lo;
+	u32 ipInHdrErrors_hi;
+	u32 ipInHdrErrors_lo;
+	u32 ipInAddrErrors_hi;
+	u32 ipInAddrErrors_lo;
+	u32 ipInUnknownProtos_hi;
+	u32 ipInUnknownProtos_lo;
+	u32 ipInDiscards_hi;
+	u32 ipInDiscards_lo;
+	u32 ipInDelivers_hi;
+	u32 ipInDelivers_lo;
+	u32 ipOutRequests_hi;
+	u32 ipOutRequests_lo;
+	u32 ipOutDiscards_hi;
+	u32 ipOutDiscards_lo;
+	u32 ipOutNoRoutes_hi;
+	u32 ipOutNoRoutes_lo;
+	u32 ipReasmTimeout;
+	u32 ipReasmReqds;
+	u32 ipReasmOKs;
+	u32 ipReasmFails;
+
+	u32 reserved[8];
+
+	u32 tcpActiveOpens;
+	u32 tcpPassiveOpens;
+	u32 tcpAttemptFails;
+	u32 tcpEstabResets;
+	u32 tcpOutRsts;
+	u32 tcpCurrEstab;
+	u32 tcpInSegs_hi;
+	u32 tcpInSegs_lo;
+	u32 tcpOutSegs_hi;
+	u32 tcpOutSegs_lo;
+	u32 tcpRetransSeg_hi;
+	u32 tcpRetransSeg_lo;
+	u32 tcpInErrs_hi;
+	u32 tcpInErrs_lo;
+	u32 tcpRtoMin;
+	u32 tcpRtoMax;
+};
+
+struct tp_params {
+	unsigned int nchan;	/* # of channels */
+	unsigned int pmrx_size;	/* total PMRX capacity */
+	unsigned int pmtx_size;	/* total PMTX capacity */
+	unsigned int cm_size;	/* total CM capacity */
+	unsigned int chan_rx_size;	/* per channel Rx size */
+	unsigned int chan_tx_size;	/* per channel Tx size */
+	unsigned int rx_pg_size;	/* Rx page size */
+	unsigned int tx_pg_size;	/* Tx page size */
+	unsigned int rx_num_pgs;	/* # of Rx pages */
+	unsigned int tx_num_pgs;	/* # of Tx pages */
+	unsigned int ntimer_qs;	/* # of timer queues */
+};
+
+struct qset_params {		/* SGE queue set parameters */
+	unsigned int polling;	/* polling/interrupt service for rspq */
+	unsigned int coalesce_usecs;	/* irq coalescing timer */
+	unsigned int rspq_size;	/* # of entries in response queue */
+	unsigned int fl_size;	/* # of entries in regular free list */
+	unsigned int jumbo_size;	/* # of entries in jumbo free list */
+	unsigned int txq_size[SGE_TXQ_PER_SET];	/* Tx queue sizes */
+	unsigned int cong_thres;	/* FL congestion threshold */
+	unsigned int vector;		/* Interrupt (line or vector) number */
+};
+
+struct sge_params {
+	unsigned int max_pkt_size;	/* max offload pkt size */
+	struct qset_params qset[SGE_QSETS];
+};
+
+struct mc5_params {
+	unsigned int mode;	/* selects MC5 width */
+	unsigned int nservers;	/* size of server region */
+	unsigned int nfilters;	/* size of filter region */
+	unsigned int nroutes;	/* size of routing region */
+};
+
+/* Default MC5 region sizes */
+enum {
+	DEFAULT_NSERVERS = 512,
+	DEFAULT_NFILTERS = 128
+};
+
+/* MC5 modes, these must be non-0 */
+enum {
+	MC5_MODE_144_BIT = 1,
+	MC5_MODE_72_BIT = 2
+};
+
+/* MC5 min active region size */
+enum { MC5_MIN_TIDS = 16 };
+
+struct vpd_params {
+	unsigned int cclk;
+	unsigned int mclk;
+	unsigned int uclk;
+	unsigned int mdc;
+	unsigned int mem_timing;
+	u8 sn[SERNUM_LEN + 1];
+	u8 eth_base[6];
+	u8 port_type[MAX_NPORTS];
+	unsigned short xauicfg[2];
+};
+
+struct pci_params {
+	unsigned int vpd_cap_addr;
+	unsigned short speed;
+	unsigned char width;
+	unsigned char variant;
+};
+
+enum {
+	PCI_VARIANT_PCI,
+	PCI_VARIANT_PCIX_MODE1_PARITY,
+	PCI_VARIANT_PCIX_MODE1_ECC,
+	PCI_VARIANT_PCIX_266_MODE2,
+	PCI_VARIANT_PCIE
+};
+
+struct adapter_params {
+	struct sge_params sge;
+	struct mc5_params mc5;
+	struct tp_params tp;
+	struct vpd_params vpd;
+	struct pci_params pci;
+
+	const struct adapter_info *info;
+
+	unsigned short mtus[NMTUS];
+	unsigned short a_wnd[NCCTRL_WIN];
+	unsigned short b_wnd[NCCTRL_WIN];
+
+	unsigned int nports;	/* # of ethernet ports */
+	unsigned int chan_map;  /* bitmap of in-use Tx channels */
+	unsigned int stats_update_period;	/* MAC stats accumulation period */
+	unsigned int linkpoll_period;	/* link poll period in 0.1s */
+	unsigned int rev;	/* chip revision */
+	unsigned int offload;
+};
+
+enum {					    /* chip revisions */
+	T3_REV_A  = 0,
+	T3_REV_B  = 2,
+	T3_REV_B2 = 3,
+	T3_REV_C  = 4,
+};
+
+struct trace_params {
+	u32 sip;
+	u32 sip_mask;
+	u32 dip;
+	u32 dip_mask;
+	u16 sport;
+	u16 sport_mask;
+	u16 dport;
+	u16 dport_mask;
+	u32 vlan:12;
+	u32 vlan_mask:12;
+	u32 intf:4;
+	u32 intf_mask:4;
+	u8 proto;
+	u8 proto_mask;
+};
+
+struct link_config {
+	unsigned int supported;	/* link capabilities */
+	unsigned int advertising;	/* advertised capabilities */
+	unsigned short requested_speed;	/* speed user has requested */
+	unsigned short speed;	/* actual link speed */
+	unsigned char requested_duplex;	/* duplex user has requested */
+	unsigned char duplex;	/* actual link duplex */
+	unsigned char requested_fc;	/* flow control user has requested */
+	unsigned char fc;	/* actual link flow control */
+	unsigned char autoneg;	/* autonegotiating? */
+	unsigned int link_ok;	/* link up? */
+};
+
+#define SPEED_INVALID   0xffff
+#define DUPLEX_INVALID  0xff
+
+struct mc5 {
+	struct adapter *adapter;
+	unsigned int tcam_size;
+	unsigned char part_type;
+	unsigned char parity_enabled;
+	unsigned char mode;
+	struct mc5_stats stats;
+};
+
+static inline unsigned int t3_mc5_size(const struct mc5 *p)
+{
+	return p->tcam_size;
+}
+
+struct mc7 {
+	struct adapter *adapter;	/* backpointer to adapter */
+	unsigned int size;	/* memory size in bytes */
+	unsigned int width;	/* MC7 interface width */
+	unsigned int offset;	/* register address offset for MC7 instance */
+	const char *name;	/* name of MC7 instance */
+	struct mc7_stats stats;	/* MC7 statistics */
+};
+
+static inline unsigned int t3_mc7_size(const struct mc7 *p)
+{
+	return p->size;
+}
+
+struct cmac {
+	struct adapter *adapter;
+	unsigned int offset;
+	unsigned int nucast;	/* # of address filters for unicast MACs */
+	unsigned int tx_tcnt;
+	unsigned int tx_xcnt;
+	u64 tx_mcnt;
+	unsigned int rx_xcnt;
+	unsigned int rx_ocnt;
+	u64 rx_mcnt;
+	unsigned int toggle_cnt;
+	unsigned int txen;
+	u64 rx_pause;
+	struct mac_stats stats;
+};
+
+enum {
+	MAC_DIRECTION_RX = 1,
+	MAC_DIRECTION_TX = 2,
+	MAC_RXFIFO_SIZE = 32768
+};
+
+/* PHY loopback direction */
+enum {
+	PHY_LOOPBACK_TX = 1,
+	PHY_LOOPBACK_RX = 2
+};
+
+/* PHY interrupt types */
+enum {
+	cphy_cause_link_change = 1,
+	cphy_cause_fifo_error = 2,
+	cphy_cause_module_change = 4,
+};
+
+/* PHY module types */
+enum {
+	phy_modtype_none,
+	phy_modtype_sr,
+	phy_modtype_lr,
+	phy_modtype_lrm,
+	phy_modtype_twinax,
+	phy_modtype_twinax_long,
+	phy_modtype_unknown
+};
+
+/* PHY operations */
+struct cphy_ops {
+	int (*reset)(struct cphy *phy, int wait);
+
+	int (*intr_enable)(struct cphy *phy);
+	int (*intr_disable)(struct cphy *phy);
+	int (*intr_clear)(struct cphy *phy);
+	int (*intr_handler)(struct cphy *phy);
+
+	int (*autoneg_enable)(struct cphy *phy);
+	int (*autoneg_restart)(struct cphy *phy);
+
+	int (*advertise)(struct cphy *phy, unsigned int advertise_map);
+	int (*set_loopback)(struct cphy *phy, int mmd, int dir, int enable);
+	int (*set_speed_duplex)(struct cphy *phy, int speed, int duplex);
+	int (*get_link_status)(struct cphy *phy, int *link_ok, int *speed,
+			       int *duplex, int *fc);
+	int (*power_down)(struct cphy *phy, int enable);
+
+	u32 mmds;
+};
+enum {
+	EDC_OPT_AEL2005 = 0,
+	EDC_OPT_AEL2005_SIZE = 1084,
+	EDC_TWX_AEL2005 = 1,
+	EDC_TWX_AEL2005_SIZE = 1464,
+	EDC_TWX_AEL2020 = 2,
+	EDC_TWX_AEL2020_SIZE = 1628,
+	EDC_MAX_SIZE = EDC_TWX_AEL2020_SIZE, /* Max cache size */
+};
+
+/* A PHY instance */
+struct cphy {
+	u8 modtype;			/* PHY module type */
+	short priv;			/* scratch pad */
+	unsigned int caps;		/* PHY capabilities */
+	struct adapter *adapter;	/* associated adapter */
+	const char *desc;		/* PHY description */
+	unsigned long fifo_errors;	/* FIFO over/under-flows */
+	const struct cphy_ops *ops;	/* PHY operations */
+	struct mdio_if_info mdio;
+	u16 phy_cache[EDC_MAX_SIZE];	/* EDC cache */
+};
+
+/* Convenience MDIO read/write wrappers */
+static inline int t3_mdio_read(struct cphy *phy, int mmd, int reg,
+			       unsigned int *valp)
+{
+	int rc = phy->mdio.mdio_read(phy->mdio.dev, phy->mdio.prtad, mmd, reg);
+	*valp = (rc >= 0) ? rc : -1;
+	return (rc >= 0) ? 0 : rc;
+}
+
+static inline int t3_mdio_write(struct cphy *phy, int mmd, int reg,
+				unsigned int val)
+{
+	return phy->mdio.mdio_write(phy->mdio.dev, phy->mdio.prtad, mmd,
+				    reg, val);
+}
+
+/* Convenience initializer */
+static inline void cphy_init(struct cphy *phy, struct adapter *adapter,
+			     int phy_addr, struct cphy_ops *phy_ops,
+			     const struct mdio_ops *mdio_ops,
+			      unsigned int caps, const char *desc)
+{
+	phy->caps = caps;
+	phy->adapter = adapter;
+	phy->desc = desc;
+	phy->ops = phy_ops;
+	if (mdio_ops) {
+		phy->mdio.prtad = phy_addr;
+		phy->mdio.mmds = phy_ops->mmds;
+		phy->mdio.mode_support = mdio_ops->mode_support;
+		phy->mdio.mdio_read = mdio_ops->read;
+		phy->mdio.mdio_write = mdio_ops->write;
+	}
+}
+
+/* Accumulate MAC statistics every 180 seconds.  For 1G we multiply by 10. */
+#define MAC_STATS_ACCUM_SECS 180
+
+#define XGM_REG(reg_addr, idx) \
+	((reg_addr) + (idx) * (XGMAC0_1_BASE_ADDR - XGMAC0_0_BASE_ADDR))
+
+struct addr_val_pair {
+	unsigned int reg_addr;
+	unsigned int val;
+};
+
+#include "adapter.h"
+
+#ifndef PCI_VENDOR_ID_CHELSIO
+# define PCI_VENDOR_ID_CHELSIO 0x1425
+#endif
+
+#define for_each_port(adapter, iter) \
+	for (iter = 0; iter < (adapter)->params.nports; ++iter)
+
+#define adapter_info(adap) ((adap)->params.info)
+
+static inline int uses_xaui(const struct adapter *adap)
+{
+	return adapter_info(adap)->caps & SUPPORTED_AUI;
+}
+
+static inline int is_10G(const struct adapter *adap)
+{
+	return adapter_info(adap)->caps & SUPPORTED_10000baseT_Full;
+}
+
+static inline int is_offload(const struct adapter *adap)
+{
+	return adap->params.offload;
+}
+
+static inline unsigned int core_ticks_per_usec(const struct adapter *adap)
+{
+	return adap->params.vpd.cclk / 1000;
+}
+
+static inline unsigned int is_pcie(const struct adapter *adap)
+{
+	return adap->params.pci.variant == PCI_VARIANT_PCIE;
+}
+
+void t3_set_reg_field(struct adapter *adap, unsigned int addr, u32 mask,
+		      u32 val);
+void t3_write_regs(struct adapter *adapter, const struct addr_val_pair *p,
+		   int n, unsigned int offset);
+int t3_wait_op_done_val(struct adapter *adapter, int reg, u32 mask,
+			int polarity, int attempts, int delay, u32 *valp);
+static inline int t3_wait_op_done(struct adapter *adapter, int reg, u32 mask,
+				  int polarity, int attempts, int delay)
+{
+	return t3_wait_op_done_val(adapter, reg, mask, polarity, attempts,
+				   delay, NULL);
+}
+int t3_mdio_change_bits(struct cphy *phy, int mmd, int reg, unsigned int clear,
+			unsigned int set);
+int t3_phy_reset(struct cphy *phy, int mmd, int wait);
+int t3_phy_advertise(struct cphy *phy, unsigned int advert);
+int t3_phy_advertise_fiber(struct cphy *phy, unsigned int advert);
+int t3_set_phy_speed_duplex(struct cphy *phy, int speed, int duplex);
+int t3_phy_lasi_intr_enable(struct cphy *phy);
+int t3_phy_lasi_intr_disable(struct cphy *phy);
+int t3_phy_lasi_intr_clear(struct cphy *phy);
+int t3_phy_lasi_intr_handler(struct cphy *phy);
+
+void t3_intr_enable(struct adapter *adapter);
+void t3_intr_disable(struct adapter *adapter);
+void t3_intr_clear(struct adapter *adapter);
+void t3_xgm_intr_enable(struct adapter *adapter, int idx);
+void t3_xgm_intr_disable(struct adapter *adapter, int idx);
+void t3_port_intr_enable(struct adapter *adapter, int idx);
+void t3_port_intr_disable(struct adapter *adapter, int idx);
+int t3_slow_intr_handler(struct adapter *adapter);
+int t3_phy_intr_handler(struct adapter *adapter);
+
+void t3_link_changed(struct adapter *adapter, int port_id);
+void t3_link_fault(struct adapter *adapter, int port_id);
+int t3_link_start(struct cphy *phy, struct cmac *mac, struct link_config *lc);
+const struct adapter_info *t3_get_adapter_info(unsigned int board_id);
+int t3_seeprom_read(struct adapter *adapter, u32 addr, __le32 *data);
+int t3_seeprom_write(struct adapter *adapter, u32 addr, __le32 data);
+int t3_seeprom_wp(struct adapter *adapter, int enable);
+int t3_get_tp_version(struct adapter *adapter, u32 *vers);
+int t3_check_tpsram_version(struct adapter *adapter);
+int t3_check_tpsram(struct adapter *adapter, const u8 *tp_ram,
+		    unsigned int size);
+int t3_set_proto_sram(struct adapter *adap, const u8 *data);
+int t3_load_fw(struct adapter *adapter, const u8 * fw_data, unsigned int size);
+int t3_get_fw_version(struct adapter *adapter, u32 *vers);
+int t3_check_fw_version(struct adapter *adapter);
+int t3_init_hw(struct adapter *adapter, u32 fw_params);
+int t3_reset_adapter(struct adapter *adapter);
+int t3_prep_adapter(struct adapter *adapter, const struct adapter_info *ai,
+		    int reset);
+int t3_replay_prep_adapter(struct adapter *adapter);
+void t3_led_ready(struct adapter *adapter);
+void t3_fatal_err(struct adapter *adapter);
+void t3_set_vlan_accel(struct adapter *adapter, unsigned int ports, int on);
+void t3_config_rss(struct adapter *adapter, unsigned int rss_config,
+		   const u8 * cpus, const u16 *rspq);
+int t3_cim_ctl_blk_read(struct adapter *adap, unsigned int addr,
+			unsigned int n, unsigned int *valp);
+int t3_mc7_bd_read(struct mc7 *mc7, unsigned int start, unsigned int n,
+		   u64 *buf);
+
+int t3_mac_reset(struct cmac *mac);
+void t3b_pcs_reset(struct cmac *mac);
+void t3_mac_disable_exact_filters(struct cmac *mac);
+void t3_mac_enable_exact_filters(struct cmac *mac);
+int t3_mac_enable(struct cmac *mac, int which);
+int t3_mac_disable(struct cmac *mac, int which);
+int t3_mac_set_mtu(struct cmac *mac, unsigned int mtu);
+int t3_mac_set_rx_mode(struct cmac *mac, struct net_device *dev);
+int t3_mac_set_address(struct cmac *mac, unsigned int idx, u8 addr[6]);
+int t3_mac_set_num_ucast(struct cmac *mac, int n);
+const struct mac_stats *t3_mac_update_stats(struct cmac *mac);
+int t3_mac_set_speed_duplex_fc(struct cmac *mac, int speed, int duplex, int fc);
+int t3b2_mac_watchdog_task(struct cmac *mac);
+
+void t3_mc5_prep(struct adapter *adapter, struct mc5 *mc5, int mode);
+int t3_mc5_init(struct mc5 *mc5, unsigned int nservers, unsigned int nfilters,
+		unsigned int nroutes);
+void t3_mc5_intr_handler(struct mc5 *mc5);
+
+void t3_tp_set_offload_mode(struct adapter *adap, int enable);
+void t3_tp_get_mib_stats(struct adapter *adap, struct tp_mib_stats *tps);
+void t3_load_mtus(struct adapter *adap, unsigned short mtus[NMTUS],
+		  unsigned short alpha[NCCTRL_WIN],
+		  unsigned short beta[NCCTRL_WIN], unsigned short mtu_cap);
+void t3_config_trace_filter(struct adapter *adapter,
+			    const struct trace_params *tp, int filter_index,
+			    int invert, int enable);
+int t3_config_sched(struct adapter *adap, unsigned int kbps, int sched);
+
+void t3_sge_prep(struct adapter *adap, struct sge_params *p);
+void t3_sge_init(struct adapter *adap, struct sge_params *p);
+int t3_sge_init_ecntxt(struct adapter *adapter, unsigned int id, int gts_enable,
+		       enum sge_context_type type, int respq, u64 base_addr,
+		       unsigned int size, unsigned int token, int gen,
+		       unsigned int cidx);
+int t3_sge_init_flcntxt(struct adapter *adapter, unsigned int id,
+			int gts_enable, u64 base_addr, unsigned int size,
+			unsigned int esize, unsigned int cong_thres, int gen,
+			unsigned int cidx);
+int t3_sge_init_rspcntxt(struct adapter *adapter, unsigned int id,
+			 int irq_vec_idx, u64 base_addr, unsigned int size,
+			 unsigned int fl_thres, int gen, unsigned int cidx);
+int t3_sge_init_cqcntxt(struct adapter *adapter, unsigned int id, u64 base_addr,
+			unsigned int size, int rspq, int ovfl_mode,
+			unsigned int credits, unsigned int credit_thres);
+int t3_sge_enable_ecntxt(struct adapter *adapter, unsigned int id, int enable);
+int t3_sge_disable_fl(struct adapter *adapter, unsigned int id);
+int t3_sge_disable_rspcntxt(struct adapter *adapter, unsigned int id);
+int t3_sge_disable_cqcntxt(struct adapter *adapter, unsigned int id);
+int t3_sge_cqcntxt_op(struct adapter *adapter, unsigned int id, unsigned int op,
+		      unsigned int credits);
+
+int t3_vsc8211_phy_prep(struct cphy *phy, struct adapter *adapter,
+			int phy_addr, const struct mdio_ops *mdio_ops);
+int t3_ael1002_phy_prep(struct cphy *phy, struct adapter *adapter,
+			int phy_addr, const struct mdio_ops *mdio_ops);
+int t3_ael1006_phy_prep(struct cphy *phy, struct adapter *adapter,
+			int phy_addr, const struct mdio_ops *mdio_ops);
+int t3_ael2005_phy_prep(struct cphy *phy, struct adapter *adapter,
+			int phy_addr, const struct mdio_ops *mdio_ops);
+int t3_ael2020_phy_prep(struct cphy *phy, struct adapter *adapter,
+			int phy_addr, const struct mdio_ops *mdio_ops);
+int t3_qt2045_phy_prep(struct cphy *phy, struct adapter *adapter, int phy_addr,
+		       const struct mdio_ops *mdio_ops);
+int t3_xaui_direct_phy_prep(struct cphy *phy, struct adapter *adapter,
+			    int phy_addr, const struct mdio_ops *mdio_ops);
+int t3_aq100x_phy_prep(struct cphy *phy, struct adapter *adapter,
+			    int phy_addr, const struct mdio_ops *mdio_ops);
+#endif				/* __CHELSIO_COMMON_H */
diff --git a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_ctl_defs.h b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_ctl_defs.h
new file mode 100644
index 0000000..369fe71
--- /dev/null
+++ b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_ctl_defs.h
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2003-2008 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef _CXGB3_OFFLOAD_CTL_DEFS_H
+#define _CXGB3_OFFLOAD_CTL_DEFS_H
+
+enum {
+	GET_MAX_OUTSTANDING_WR 	= 0,
+	GET_TX_MAX_CHUNK	= 1,
+	GET_TID_RANGE		= 2,
+	GET_STID_RANGE		= 3,
+	GET_RTBL_RANGE		= 4,
+	GET_L2T_CAPACITY	= 5,
+	GET_MTUS		= 6,
+	GET_WR_LEN		= 7,
+	GET_IFF_FROM_MAC	= 8,
+	GET_DDP_PARAMS		= 9,
+	GET_PORTS		= 10,
+
+	ULP_ISCSI_GET_PARAMS	= 11,
+	ULP_ISCSI_SET_PARAMS	= 12,
+
+	RDMA_GET_PARAMS		= 13,
+	RDMA_CQ_OP		= 14,
+	RDMA_CQ_SETUP		= 15,
+	RDMA_CQ_DISABLE		= 16,
+	RDMA_CTRL_QP_SETUP	= 17,
+	RDMA_GET_MEM		= 18,
+	RDMA_GET_MIB		= 19,
+
+	GET_RX_PAGE_INFO	= 50,
+	GET_ISCSI_IPV4ADDR	= 51,
+
+	GET_EMBEDDED_INFO	= 70,
+};
+
+/*
+ * Structure used to describe a TID range.  Valid TIDs are [base, base+num).
+ */
+struct tid_range {
+	unsigned int base;	/* first TID */
+	unsigned int num;	/* number of TIDs in range */
+};
+
+/*
+ * Structure used to request the size and contents of the MTU table.
+ */
+struct mtutab {
+	unsigned int size;	/* # of entries in the MTU table */
+	const unsigned short *mtus;	/* the MTU table values */
+};
+
+struct net_device;
+
+/*
+ * Structure used to request the adapter net_device owning a given MAC address.
+ */
+struct iff_mac {
+	struct net_device *dev;	/* the net_device */
+	const unsigned char *mac_addr;	/* MAC address to lookup */
+	u16 vlan_tag;
+};
+
+/* Structure used to request a port's iSCSI IPv4 address */
+struct iscsi_ipv4addr {
+	struct net_device *dev;	/* the net_device */
+	__be32 ipv4addr;	/* the return iSCSI IPv4 address */
+};
+
+struct pci_dev;
+
+/*
+ * Structure used to request the TCP DDP parameters.
+ */
+struct ddp_params {
+	unsigned int llimit;	/* TDDP region start address */
+	unsigned int ulimit;	/* TDDP region end address */
+	unsigned int tag_mask;	/* TDDP tag mask */
+	struct pci_dev *pdev;
+};
+
+struct adap_ports {
+	unsigned int nports;	/* number of ports on this adapter */
+	struct net_device *lldevs[2];
+};
+
+/*
+ * Structure used to return information to the iscsi layer.
+ */
+struct ulp_iscsi_info {
+	unsigned int offset;
+	unsigned int llimit;
+	unsigned int ulimit;
+	unsigned int tagmask;
+	u8 pgsz_factor[4];
+	unsigned int max_rxsz;
+	unsigned int max_txsz;
+	struct pci_dev *pdev;
+};
+
+/*
+ * Structure used to return information to the RDMA layer.
+ */
+struct rdma_info {
+	unsigned int tpt_base;	/* TPT base address */
+	unsigned int tpt_top;	/* TPT last entry address */
+	unsigned int pbl_base;	/* PBL base address */
+	unsigned int pbl_top;	/* PBL last entry address */
+	unsigned int rqt_base;	/* RQT base address */
+	unsigned int rqt_top;	/* RQT last entry address */
+	unsigned int udbell_len;	/* user doorbell region length */
+	unsigned long udbell_physbase;	/* user doorbell physical start addr */
+	void __iomem *kdb_addr;	/* kernel doorbell register address */
+	struct pci_dev *pdev;	/* associated PCI device */
+};
+
+/*
+ * Structure used to request an operation on an RDMA completion queue.
+ */
+struct rdma_cq_op {
+	unsigned int id;
+	unsigned int op;
+	unsigned int credits;
+};
+
+/*
+ * Structure used to setup RDMA completion queues.
+ */
+struct rdma_cq_setup {
+	unsigned int id;
+	unsigned long long base_addr;
+	unsigned int size;
+	unsigned int credits;
+	unsigned int credit_thres;
+	unsigned int ovfl_mode;
+};
+
+/*
+ * Structure used to setup the RDMA control egress context.
+ */
+struct rdma_ctrlqp_setup {
+	unsigned long long base_addr;
+	unsigned int size;
+};
+
+/*
+ * Offload TX/RX page information.
+ */
+struct ofld_page_info {
+	unsigned int page_size;  /* Page size, should be a power of 2 */
+	unsigned int num;        /* Number of pages */
+};
+
+/*
+ * Structure used to get firmware and protocol engine versions.
+ */
+struct ch_embedded_info {
+	u32 fw_vers;
+	u32 tp_vers;
+};
+#endif				/* _CXGB3_OFFLOAD_CTL_DEFS_H */
diff --git a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_defs.h b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_defs.h
new file mode 100644
index 0000000..920d918
--- /dev/null
+++ b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_defs.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2006-2008 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef _CHELSIO_DEFS_H
+#define _CHELSIO_DEFS_H
+
+#include <linux/skbuff.h>
+#include <net/tcp.h>
+
+#include "t3cdev.h"
+
+#include "cxgb3_offload.h"
+
+#define VALIDATE_TID 1
+
+void *cxgb_alloc_mem(unsigned long size);
+void cxgb_free_mem(void *addr);
+
+/*
+ * Map an ATID or STID to their entries in the corresponding TID tables.
+ */
+static inline union active_open_entry *atid2entry(const struct tid_info *t,
+						  unsigned int atid)
+{
+	return &t->atid_tab[atid - t->atid_base];
+}
+
+static inline union listen_entry *stid2entry(const struct tid_info *t,
+					     unsigned int stid)
+{
+	return &t->stid_tab[stid - t->stid_base];
+}
+
+/*
+ * Find the connection corresponding to a TID.
+ */
+static inline struct t3c_tid_entry *lookup_tid(const struct tid_info *t,
+					       unsigned int tid)
+{
+	struct t3c_tid_entry *t3c_tid = tid < t->ntids ?
+	    &(t->tid_tab[tid]) : NULL;
+
+	return (t3c_tid && t3c_tid->client) ? t3c_tid : NULL;
+}
+
+/*
+ * Find the connection corresponding to a server TID.
+ */
+static inline struct t3c_tid_entry *lookup_stid(const struct tid_info *t,
+						unsigned int tid)
+{
+	union listen_entry *e;
+
+	if (tid < t->stid_base || tid >= t->stid_base + t->nstids)
+		return NULL;
+
+	e = stid2entry(t, tid);
+	if ((void *)e->next >= (void *)t->tid_tab &&
+	    (void *)e->next < (void *)&t->atid_tab[t->natids])
+		return NULL;
+
+	return &e->t3c_tid;
+}
+
+/*
+ * Find the connection corresponding to an active-open TID.
+ */
+static inline struct t3c_tid_entry *lookup_atid(const struct tid_info *t,
+						unsigned int tid)
+{
+	union active_open_entry *e;
+
+	if (tid < t->atid_base || tid >= t->atid_base + t->natids)
+		return NULL;
+
+	e = atid2entry(t, tid);
+	if ((void *)e->next >= (void *)t->tid_tab &&
+	    (void *)e->next < (void *)&t->atid_tab[t->natids])
+		return NULL;
+
+	return &e->t3c_tid;
+}
+
+int attach_t3cdev(struct t3cdev *dev);
+void detach_t3cdev(struct t3cdev *dev);
+#endif
diff --git a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_ioctl.h b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_ioctl.h
new file mode 100644
index 0000000..b19e437
--- /dev/null
+++ b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_ioctl.h
@@ -0,0 +1,177 @@
+/*
+ * Copyright (c) 2003-2008 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __CHIOCTL_H__
+#define __CHIOCTL_H__
+
+/*
+ * Ioctl commands specific to this driver.
+ */
+enum {
+	CHELSIO_GETMTUTAB 		= 1029,
+	CHELSIO_SETMTUTAB 		= 1030,
+	CHELSIO_SET_PM 			= 1032,
+	CHELSIO_GET_PM			= 1033,
+	CHELSIO_GET_MEM			= 1038,
+	CHELSIO_LOAD_FW			= 1041,
+	CHELSIO_SET_TRACE_FILTER	= 1044,
+	CHELSIO_SET_QSET_PARAMS		= 1045,
+	CHELSIO_GET_QSET_PARAMS		= 1046,
+	CHELSIO_SET_QSET_NUM		= 1047,
+	CHELSIO_GET_QSET_NUM		= 1048,
+};
+
+struct ch_reg {
+	uint32_t cmd;
+	uint32_t addr;
+	uint32_t val;
+};
+
+struct ch_cntxt {
+	uint32_t cmd;
+	uint32_t cntxt_type;
+	uint32_t cntxt_id;
+	uint32_t data[4];
+};
+
+/* context types */
+enum { CNTXT_TYPE_EGRESS, CNTXT_TYPE_FL, CNTXT_TYPE_RSP, CNTXT_TYPE_CQ };
+
+struct ch_desc {
+	uint32_t cmd;
+	uint32_t queue_num;
+	uint32_t idx;
+	uint32_t size;
+	uint8_t data[128];
+};
+
+struct ch_mem_range {
+	uint32_t cmd;
+	uint32_t mem_id;
+	uint32_t addr;
+	uint32_t len;
+	uint32_t version;
+	uint8_t buf[0];
+};
+
+struct ch_qset_params {
+	uint32_t cmd;
+	uint32_t qset_idx;
+	int32_t txq_size[3];
+	int32_t rspq_size;
+	int32_t fl_size[2];
+	int32_t intr_lat;
+	int32_t polling;
+	int32_t lro;
+	int32_t cong_thres;
+	int32_t  vector;
+	int32_t  qnum;
+};
+
+struct ch_pktsched_params {
+	uint32_t cmd;
+	uint8_t sched;
+	uint8_t idx;
+	uint8_t min;
+	uint8_t max;
+	uint8_t binding;
+};
+
+#ifndef TCB_SIZE
+# define TCB_SIZE   128
+#endif
+
+/* TCB size in 32-bit words */
+#define TCB_WORDS (TCB_SIZE / 4)
+
+enum { MEM_CM, MEM_PMRX, MEM_PMTX };	/* ch_mem_range.mem_id values */
+
+struct ch_mtus {
+	uint32_t cmd;
+	uint32_t nmtus;
+	uint16_t mtus[NMTUS];
+};
+
+struct ch_pm {
+	uint32_t cmd;
+	uint32_t tx_pg_sz;
+	uint32_t tx_num_pg;
+	uint32_t rx_pg_sz;
+	uint32_t rx_num_pg;
+	uint32_t pm_total;
+};
+
+struct ch_tcam {
+	uint32_t cmd;
+	uint32_t tcam_size;
+	uint32_t nservers;
+	uint32_t nroutes;
+	uint32_t nfilters;
+};
+
+struct ch_tcb {
+	uint32_t cmd;
+	uint32_t tcb_index;
+	uint32_t tcb_data[TCB_WORDS];
+};
+
+struct ch_tcam_word {
+	uint32_t cmd;
+	uint32_t addr;
+	uint32_t buf[3];
+};
+
+struct ch_trace {
+	uint32_t cmd;
+	uint32_t sip;
+	uint32_t sip_mask;
+	uint32_t dip;
+	uint32_t dip_mask;
+	uint16_t sport;
+	uint16_t sport_mask;
+	uint16_t dport;
+	uint16_t dport_mask;
+	uint32_t vlan:12;
+	uint32_t vlan_mask:12;
+	uint32_t intf:4;
+	uint32_t intf_mask:4;
+	uint8_t proto;
+	uint8_t proto_mask;
+	uint8_t invert_match:1;
+	uint8_t config_tx:1;
+	uint8_t config_rx:1;
+	uint8_t trace_tx:1;
+	uint8_t trace_rx:1;
+};
+
+#define SIOCCHIOCTL SIOCDEVPRIVATE
+
+#endif
diff --git a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c
new file mode 100644
index 0000000..db76f70
--- /dev/null
+++ b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c
@@ -0,0 +1,3438 @@
+/*
+ * Copyright (c) 2003-2008 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/init.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/if_vlan.h>
+#include <linux/mdio.h>
+#include <linux/sockios.h>
+#include <linux/workqueue.h>
+#include <linux/proc_fs.h>
+#include <linux/rtnetlink.h>
+#include <linux/firmware.h>
+#include <linux/log2.h>
+#include <linux/stringify.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <asm/uaccess.h>
+
+#include "common.h"
+#include "cxgb3_ioctl.h"
+#include "regs.h"
+#include "cxgb3_offload.h"
+#include "version.h"
+
+#include "cxgb3_ctl_defs.h"
+#include "t3_cpl.h"
+#include "firmware_exports.h"
+
+enum {
+	MAX_TXQ_ENTRIES = 16384,
+	MAX_CTRL_TXQ_ENTRIES = 1024,
+	MAX_RSPQ_ENTRIES = 16384,
+	MAX_RX_BUFFERS = 16384,
+	MAX_RX_JUMBO_BUFFERS = 16384,
+	MIN_TXQ_ENTRIES = 4,
+	MIN_CTRL_TXQ_ENTRIES = 4,
+	MIN_RSPQ_ENTRIES = 32,
+	MIN_FL_ENTRIES = 32
+};
+
+#define PORT_MASK ((1 << MAX_NPORTS) - 1)
+
+#define DFLT_MSG_ENABLE (NETIF_MSG_DRV | NETIF_MSG_PROBE | NETIF_MSG_LINK | \
+			 NETIF_MSG_TIMER | NETIF_MSG_IFDOWN | NETIF_MSG_IFUP |\
+			 NETIF_MSG_RX_ERR | NETIF_MSG_TX_ERR)
+
+#define EEPROM_MAGIC 0x38E2F10C
+
+#define CH_DEVICE(devid, idx) \
+	{ PCI_VENDOR_ID_CHELSIO, devid, PCI_ANY_ID, PCI_ANY_ID, 0, 0, idx }
+
+static const struct pci_device_id cxgb3_pci_tbl[] = {
+	CH_DEVICE(0x20, 0),	/* PE9000 */
+	CH_DEVICE(0x21, 1),	/* T302E */
+	CH_DEVICE(0x22, 2),	/* T310E */
+	CH_DEVICE(0x23, 3),	/* T320X */
+	CH_DEVICE(0x24, 1),	/* T302X */
+	CH_DEVICE(0x25, 3),	/* T320E */
+	CH_DEVICE(0x26, 2),	/* T310X */
+	CH_DEVICE(0x30, 2),	/* T3B10 */
+	CH_DEVICE(0x31, 3),	/* T3B20 */
+	CH_DEVICE(0x32, 1),	/* T3B02 */
+	CH_DEVICE(0x35, 6),	/* T3C20-derived T3C10 */
+	CH_DEVICE(0x36, 3),	/* S320E-CR */
+	CH_DEVICE(0x37, 7),	/* N320E-G2 */
+	{0,}
+};
+
+MODULE_DESCRIPTION(DRV_DESC);
+MODULE_AUTHOR("Chelsio Communications");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_VERSION(DRV_VERSION);
+MODULE_DEVICE_TABLE(pci, cxgb3_pci_tbl);
+
+static int dflt_msg_enable = DFLT_MSG_ENABLE;
+
+module_param(dflt_msg_enable, int, 0644);
+MODULE_PARM_DESC(dflt_msg_enable, "Chelsio T3 default message enable bitmap");
+
+/*
+ * The driver uses the best interrupt scheme available on a platform in the
+ * order MSI-X, MSI, legacy pin interrupts.  This parameter determines which
+ * of these schemes the driver may consider as follows:
+ *
+ * msi = 2: choose from among all three options
+ * msi = 1: only consider MSI and pin interrupts
+ * msi = 0: force pin interrupts
+ */
+static int msi = 2;
+
+module_param(msi, int, 0644);
+MODULE_PARM_DESC(msi, "whether to use MSI or MSI-X");
+
+/*
+ * The driver enables offload as a default.
+ * To disable it, use ofld_disable = 1.
+ */
+
+static int ofld_disable = 0;
+
+module_param(ofld_disable, int, 0644);
+MODULE_PARM_DESC(ofld_disable, "whether to enable offload at init time or not");
+
+/*
+ * We have work elements that we need to cancel when an interface is taken
+ * down.  Normally the work elements would be executed by keventd but that
+ * can deadlock because of linkwatch.  If our close method takes the rtnl
+ * lock and linkwatch is ahead of our work elements in keventd, linkwatch
+ * will block keventd as it needs the rtnl lock, and we'll deadlock waiting
+ * for our work to complete.  Get our own work queue to solve this.
+ */
+struct workqueue_struct *cxgb3_wq;
+
+/**
+ *	link_report - show link status and link speed/duplex
+ *	@p: the port whose settings are to be reported
+ *
+ *	Shows the link status, speed, and duplex of a port.
+ */
+static void link_report(struct net_device *dev)
+{
+	if (!netif_carrier_ok(dev))
+		netdev_info(dev, "link down\n");
+	else {
+		const char *s = "10Mbps";
+		const struct port_info *p = netdev_priv(dev);
+
+		switch (p->link_config.speed) {
+		case SPEED_10000:
+			s = "10Gbps";
+			break;
+		case SPEED_1000:
+			s = "1000Mbps";
+			break;
+		case SPEED_100:
+			s = "100Mbps";
+			break;
+		}
+
+		netdev_info(dev, "link up, %s, %s-duplex\n",
+			    s, p->link_config.duplex == DUPLEX_FULL
+			    ? "full" : "half");
+	}
+}
+
+static void enable_tx_fifo_drain(struct adapter *adapter,
+				 struct port_info *pi)
+{
+	t3_set_reg_field(adapter, A_XGM_TXFIFO_CFG + pi->mac.offset, 0,
+			 F_ENDROPPKT);
+	t3_write_reg(adapter, A_XGM_RX_CTRL + pi->mac.offset, 0);
+	t3_write_reg(adapter, A_XGM_TX_CTRL + pi->mac.offset, F_TXEN);
+	t3_write_reg(adapter, A_XGM_RX_CTRL + pi->mac.offset, F_RXEN);
+}
+
+static void disable_tx_fifo_drain(struct adapter *adapter,
+				  struct port_info *pi)
+{
+	t3_set_reg_field(adapter, A_XGM_TXFIFO_CFG + pi->mac.offset,
+			 F_ENDROPPKT, 0);
+}
+
+void t3_os_link_fault(struct adapter *adap, int port_id, int state)
+{
+	struct net_device *dev = adap->port[port_id];
+	struct port_info *pi = netdev_priv(dev);
+
+	if (state == netif_carrier_ok(dev))
+		return;
+
+	if (state) {
+		struct cmac *mac = &pi->mac;
+
+		netif_carrier_on(dev);
+
+		disable_tx_fifo_drain(adap, pi);
+
+		/* Clear local faults */
+		t3_xgm_intr_disable(adap, pi->port_id);
+		t3_read_reg(adap, A_XGM_INT_STATUS +
+				    pi->mac.offset);
+		t3_write_reg(adap,
+			     A_XGM_INT_CAUSE + pi->mac.offset,
+			     F_XGM_INT);
+
+		t3_set_reg_field(adap,
+				 A_XGM_INT_ENABLE +
+				 pi->mac.offset,
+				 F_XGM_INT, F_XGM_INT);
+		t3_xgm_intr_enable(adap, pi->port_id);
+
+		t3_mac_enable(mac, MAC_DIRECTION_TX);
+	} else {
+		netif_carrier_off(dev);
+
+		/* Flush TX FIFO */
+		enable_tx_fifo_drain(adap, pi);
+	}
+	link_report(dev);
+}
+
+/**
+ *	t3_os_link_changed - handle link status changes
+ *	@adapter: the adapter associated with the link change
+ *	@port_id: the port index whose limk status has changed
+ *	@link_stat: the new status of the link
+ *	@speed: the new speed setting
+ *	@duplex: the new duplex setting
+ *	@pause: the new flow-control setting
+ *
+ *	This is the OS-dependent handler for link status changes.  The OS
+ *	neutral handler takes care of most of the processing for these events,
+ *	then calls this handler for any OS-specific processing.
+ */
+void t3_os_link_changed(struct adapter *adapter, int port_id, int link_stat,
+			int speed, int duplex, int pause)
+{
+	struct net_device *dev = adapter->port[port_id];
+	struct port_info *pi = netdev_priv(dev);
+	struct cmac *mac = &pi->mac;
+
+	/* Skip changes from disabled ports. */
+	if (!netif_running(dev))
+		return;
+
+	if (link_stat != netif_carrier_ok(dev)) {
+		if (link_stat) {
+			disable_tx_fifo_drain(adapter, pi);
+
+			t3_mac_enable(mac, MAC_DIRECTION_RX);
+
+			/* Clear local faults */
+			t3_xgm_intr_disable(adapter, pi->port_id);
+			t3_read_reg(adapter, A_XGM_INT_STATUS +
+				    pi->mac.offset);
+			t3_write_reg(adapter,
+				     A_XGM_INT_CAUSE + pi->mac.offset,
+				     F_XGM_INT);
+
+			t3_set_reg_field(adapter,
+					 A_XGM_INT_ENABLE + pi->mac.offset,
+					 F_XGM_INT, F_XGM_INT);
+			t3_xgm_intr_enable(adapter, pi->port_id);
+
+			netif_carrier_on(dev);
+		} else {
+			netif_carrier_off(dev);
+
+			t3_xgm_intr_disable(adapter, pi->port_id);
+			t3_read_reg(adapter, A_XGM_INT_STATUS + pi->mac.offset);
+			t3_set_reg_field(adapter,
+					 A_XGM_INT_ENABLE + pi->mac.offset,
+					 F_XGM_INT, 0);
+
+			if (is_10G(adapter))
+				pi->phy.ops->power_down(&pi->phy, 1);
+
+			t3_read_reg(adapter, A_XGM_INT_STATUS + pi->mac.offset);
+			t3_mac_disable(mac, MAC_DIRECTION_RX);
+			t3_link_start(&pi->phy, mac, &pi->link_config);
+
+			/* Flush TX FIFO */
+			enable_tx_fifo_drain(adapter, pi);
+		}
+
+		link_report(dev);
+	}
+}
+
+/**
+ *	t3_os_phymod_changed - handle PHY module changes
+ *	@phy: the PHY reporting the module change
+ *	@mod_type: new module type
+ *
+ *	This is the OS-dependent handler for PHY module changes.  It is
+ *	invoked when a PHY module is removed or inserted for any OS-specific
+ *	processing.
+ */
+void t3_os_phymod_changed(struct adapter *adap, int port_id)
+{
+	static const char *mod_str[] = {
+		NULL, "SR", "LR", "LRM", "TWINAX", "TWINAX", "unknown"
+	};
+
+	const struct net_device *dev = adap->port[port_id];
+	const struct port_info *pi = netdev_priv(dev);
+
+	if (pi->phy.modtype == phy_modtype_none)
+		netdev_info(dev, "PHY module unplugged\n");
+	else
+		netdev_info(dev, "%s PHY module inserted\n",
+			    mod_str[pi->phy.modtype]);
+}
+
+static void cxgb_set_rxmode(struct net_device *dev)
+{
+	struct port_info *pi = netdev_priv(dev);
+
+	t3_mac_set_rx_mode(&pi->mac, dev);
+}
+
+/**
+ *	link_start - enable a port
+ *	@dev: the device to enable
+ *
+ *	Performs the MAC and PHY actions needed to enable a port.
+ */
+static void link_start(struct net_device *dev)
+{
+	struct port_info *pi = netdev_priv(dev);
+	struct cmac *mac = &pi->mac;
+
+	t3_mac_reset(mac);
+	t3_mac_set_num_ucast(mac, MAX_MAC_IDX);
+	t3_mac_set_mtu(mac, dev->mtu);
+	t3_mac_set_address(mac, LAN_MAC_IDX, dev->dev_addr);
+	t3_mac_set_address(mac, SAN_MAC_IDX, pi->iscsic.mac_addr);
+	t3_mac_set_rx_mode(mac, dev);
+	t3_link_start(&pi->phy, mac, &pi->link_config);
+	t3_mac_enable(mac, MAC_DIRECTION_RX | MAC_DIRECTION_TX);
+}
+
+static inline void cxgb_disable_msi(struct adapter *adapter)
+{
+	if (adapter->flags & USING_MSIX) {
+		pci_disable_msix(adapter->pdev);
+		adapter->flags &= ~USING_MSIX;
+	} else if (adapter->flags & USING_MSI) {
+		pci_disable_msi(adapter->pdev);
+		adapter->flags &= ~USING_MSI;
+	}
+}
+
+/*
+ * Interrupt handler for asynchronous events used with MSI-X.
+ */
+static irqreturn_t t3_async_intr_handler(int irq, void *cookie)
+{
+	t3_slow_intr_handler(cookie);
+	return IRQ_HANDLED;
+}
+
+/*
+ * Name the MSI-X interrupts.
+ */
+static void name_msix_vecs(struct adapter *adap)
+{
+	int i, j, msi_idx = 1, n = sizeof(adap->msix_info[0].desc) - 1;
+
+	snprintf(adap->msix_info[0].desc, n, "%s", adap->name);
+	adap->msix_info[0].desc[n] = 0;
+
+	for_each_port(adap, j) {
+		struct net_device *d = adap->port[j];
+		const struct port_info *pi = netdev_priv(d);
+
+		for (i = 0; i < pi->nqsets; i++, msi_idx++) {
+			snprintf(adap->msix_info[msi_idx].desc, n,
+				 "%s-%d", d->name, pi->first_qset + i);
+			adap->msix_info[msi_idx].desc[n] = 0;
+		}
+	}
+}
+
+static int request_msix_data_irqs(struct adapter *adap)
+{
+	int i, j, err, qidx = 0;
+
+	for_each_port(adap, i) {
+		int nqsets = adap2pinfo(adap, i)->nqsets;
+
+		for (j = 0; j < nqsets; ++j) {
+			err = request_irq(adap->msix_info[qidx + 1].vec,
+					  t3_intr_handler(adap,
+							  adap->sge.qs[qidx].
+							  rspq.polling), 0,
+					  adap->msix_info[qidx + 1].desc,
+					  &adap->sge.qs[qidx]);
+			if (err) {
+				while (--qidx >= 0)
+					free_irq(adap->msix_info[qidx + 1].vec,
+						 &adap->sge.qs[qidx]);
+				return err;
+			}
+			qidx++;
+		}
+	}
+	return 0;
+}
+
+static void free_irq_resources(struct adapter *adapter)
+{
+	if (adapter->flags & USING_MSIX) {
+		int i, n = 0;
+
+		free_irq(adapter->msix_info[0].vec, adapter);
+		for_each_port(adapter, i)
+			n += adap2pinfo(adapter, i)->nqsets;
+
+		for (i = 0; i < n; ++i)
+			free_irq(adapter->msix_info[i + 1].vec,
+				 &adapter->sge.qs[i]);
+	} else
+		free_irq(adapter->pdev->irq, adapter);
+}
+
+static int await_mgmt_replies(struct adapter *adap, unsigned long init_cnt,
+			      unsigned long n)
+{
+	int attempts = 10;
+
+	while (adap->sge.qs[0].rspq.offload_pkts < init_cnt + n) {
+		if (!--attempts)
+			return -ETIMEDOUT;
+		msleep(10);
+	}
+	return 0;
+}
+
+static int init_tp_parity(struct adapter *adap)
+{
+	int i;
+	struct sk_buff *skb;
+	struct cpl_set_tcb_field *greq;
+	unsigned long cnt = adap->sge.qs[0].rspq.offload_pkts;
+
+	t3_tp_set_offload_mode(adap, 1);
+
+	for (i = 0; i < 16; i++) {
+		struct cpl_smt_write_req *req;
+
+		skb = alloc_skb(sizeof(*req), GFP_KERNEL);
+		if (!skb)
+			skb = adap->nofail_skb;
+		if (!skb)
+			goto alloc_skb_fail;
+
+		req = (struct cpl_smt_write_req *)__skb_put(skb, sizeof(*req));
+		memset(req, 0, sizeof(*req));
+		req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+		OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SMT_WRITE_REQ, i));
+		req->mtu_idx = NMTUS - 1;
+		req->iff = i;
+		t3_mgmt_tx(adap, skb);
+		if (skb == adap->nofail_skb) {
+			await_mgmt_replies(adap, cnt, i + 1);
+			adap->nofail_skb = alloc_skb(sizeof(*greq), GFP_KERNEL);
+			if (!adap->nofail_skb)
+				goto alloc_skb_fail;
+		}
+	}
+
+	for (i = 0; i < 2048; i++) {
+		struct cpl_l2t_write_req *req;
+
+		skb = alloc_skb(sizeof(*req), GFP_KERNEL);
+		if (!skb)
+			skb = adap->nofail_skb;
+		if (!skb)
+			goto alloc_skb_fail;
+
+		req = (struct cpl_l2t_write_req *)__skb_put(skb, sizeof(*req));
+		memset(req, 0, sizeof(*req));
+		req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+		OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_L2T_WRITE_REQ, i));
+		req->params = htonl(V_L2T_W_IDX(i));
+		t3_mgmt_tx(adap, skb);
+		if (skb == adap->nofail_skb) {
+			await_mgmt_replies(adap, cnt, 16 + i + 1);
+			adap->nofail_skb = alloc_skb(sizeof(*greq), GFP_KERNEL);
+			if (!adap->nofail_skb)
+				goto alloc_skb_fail;
+		}
+	}
+
+	for (i = 0; i < 2048; i++) {
+		struct cpl_rte_write_req *req;
+
+		skb = alloc_skb(sizeof(*req), GFP_KERNEL);
+		if (!skb)
+			skb = adap->nofail_skb;
+		if (!skb)
+			goto alloc_skb_fail;
+
+		req = (struct cpl_rte_write_req *)__skb_put(skb, sizeof(*req));
+		memset(req, 0, sizeof(*req));
+		req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+		OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RTE_WRITE_REQ, i));
+		req->l2t_idx = htonl(V_L2T_W_IDX(i));
+		t3_mgmt_tx(adap, skb);
+		if (skb == adap->nofail_skb) {
+			await_mgmt_replies(adap, cnt, 16 + 2048 + i + 1);
+			adap->nofail_skb = alloc_skb(sizeof(*greq), GFP_KERNEL);
+			if (!adap->nofail_skb)
+				goto alloc_skb_fail;
+		}
+	}
+
+	skb = alloc_skb(sizeof(*greq), GFP_KERNEL);
+	if (!skb)
+		skb = adap->nofail_skb;
+	if (!skb)
+		goto alloc_skb_fail;
+
+	greq = (struct cpl_set_tcb_field *)__skb_put(skb, sizeof(*greq));
+	memset(greq, 0, sizeof(*greq));
+	greq->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+	OPCODE_TID(greq) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, 0));
+	greq->mask = cpu_to_be64(1);
+	t3_mgmt_tx(adap, skb);
+
+	i = await_mgmt_replies(adap, cnt, 16 + 2048 + 2048 + 1);
+	if (skb == adap->nofail_skb) {
+		i = await_mgmt_replies(adap, cnt, 16 + 2048 + 2048 + 1);
+		adap->nofail_skb = alloc_skb(sizeof(*greq), GFP_KERNEL);
+	}
+
+	t3_tp_set_offload_mode(adap, 0);
+	return i;
+
+alloc_skb_fail:
+	t3_tp_set_offload_mode(adap, 0);
+	return -ENOMEM;
+}
+
+/**
+ *	setup_rss - configure RSS
+ *	@adap: the adapter
+ *
+ *	Sets up RSS to distribute packets to multiple receive queues.  We
+ *	configure the RSS CPU lookup table to distribute to the number of HW
+ *	receive queues, and the response queue lookup table to narrow that
+ *	down to the response queues actually configured for each port.
+ *	We always configure the RSS mapping for two ports since the mapping
+ *	table has plenty of entries.
+ */
+static void setup_rss(struct adapter *adap)
+{
+	int i;
+	unsigned int nq0 = adap2pinfo(adap, 0)->nqsets;
+	unsigned int nq1 = adap->port[1] ? adap2pinfo(adap, 1)->nqsets : 1;
+	u8 cpus[SGE_QSETS + 1];
+	u16 rspq_map[RSS_TABLE_SIZE];
+
+	for (i = 0; i < SGE_QSETS; ++i)
+		cpus[i] = i;
+	cpus[SGE_QSETS] = 0xff;	/* terminator */
+
+	for (i = 0; i < RSS_TABLE_SIZE / 2; ++i) {
+		rspq_map[i] = i % nq0;
+		rspq_map[i + RSS_TABLE_SIZE / 2] = (i % nq1) + nq0;
+	}
+
+	t3_config_rss(adap, F_RQFEEDBACKENABLE | F_TNLLKPEN | F_TNLMAPEN |
+		      F_TNLPRTEN | F_TNL2TUPEN | F_TNL4TUPEN |
+		      V_RRCPLCPUSIZE(6) | F_HASHTOEPLITZ, cpus, rspq_map);
+}
+
+static void ring_dbs(struct adapter *adap)
+{
+	int i, j;
+
+	for (i = 0; i < SGE_QSETS; i++) {
+		struct sge_qset *qs = &adap->sge.qs[i];
+
+		if (qs->adap)
+			for (j = 0; j < SGE_TXQ_PER_SET; j++)
+				t3_write_reg(adap, A_SG_KDOORBELL, F_SELEGRCNTX | V_EGRCNTX(qs->txq[j].cntxt_id));
+	}
+}
+
+static void init_napi(struct adapter *adap)
+{
+	int i;
+
+	for (i = 0; i < SGE_QSETS; i++) {
+		struct sge_qset *qs = &adap->sge.qs[i];
+
+		if (qs->adap)
+			netif_napi_add(qs->netdev, &qs->napi, qs->napi.poll,
+				       64);
+	}
+
+	/*
+	 * netif_napi_add() can be called only once per napi_struct because it
+	 * adds each new napi_struct to a list.  Be careful not to call it a
+	 * second time, e.g., during EEH recovery, by making a note of it.
+	 */
+	adap->flags |= NAPI_INIT;
+}
+
+/*
+ * Wait until all NAPI handlers are descheduled.  This includes the handlers of
+ * both netdevices representing interfaces and the dummy ones for the extra
+ * queues.
+ */
+static void quiesce_rx(struct adapter *adap)
+{
+	int i;
+
+	for (i = 0; i < SGE_QSETS; i++)
+		if (adap->sge.qs[i].adap)
+			napi_disable(&adap->sge.qs[i].napi);
+}
+
+static void enable_all_napi(struct adapter *adap)
+{
+	int i;
+	for (i = 0; i < SGE_QSETS; i++)
+		if (adap->sge.qs[i].adap)
+			napi_enable(&adap->sge.qs[i].napi);
+}
+
+/**
+ *	setup_sge_qsets - configure SGE Tx/Rx/response queues
+ *	@adap: the adapter
+ *
+ *	Determines how many sets of SGE queues to use and initializes them.
+ *	We support multiple queue sets per port if we have MSI-X, otherwise
+ *	just one queue set per port.
+ */
+static int setup_sge_qsets(struct adapter *adap)
+{
+	int i, j, err, irq_idx = 0, qset_idx = 0;
+	unsigned int ntxq = SGE_TXQ_PER_SET;
+
+	if (adap->params.rev > 0 && !(adap->flags & USING_MSI))
+		irq_idx = -1;
+
+	for_each_port(adap, i) {
+		struct net_device *dev = adap->port[i];
+		struct port_info *pi = netdev_priv(dev);
+
+		pi->qs = &adap->sge.qs[pi->first_qset];
+		for (j = 0; j < pi->nqsets; ++j, ++qset_idx) {
+			err = t3_sge_alloc_qset(adap, qset_idx, 1,
+				(adap->flags & USING_MSIX) ? qset_idx + 1 :
+							     irq_idx,
+				&adap->params.sge.qset[qset_idx], ntxq, dev,
+				netdev_get_tx_queue(dev, j));
+			if (err) {
+				t3_free_sge_resources(adap);
+				return err;
+			}
+		}
+	}
+
+	return 0;
+}
+
+static ssize_t attr_show(struct device *d, char *buf,
+			 ssize_t(*format) (struct net_device *, char *))
+{
+	ssize_t len;
+
+	/* Synchronize with ioctls that may shut down the device */
+	rtnl_lock();
+	len = (*format) (to_net_dev(d), buf);
+	rtnl_unlock();
+	return len;
+}
+
+static ssize_t attr_store(struct device *d,
+			  const char *buf, size_t len,
+			  ssize_t(*set) (struct net_device *, unsigned int),
+			  unsigned int min_val, unsigned int max_val)
+{
+	char *endp;
+	ssize_t ret;
+	unsigned int val;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	val = simple_strtoul(buf, &endp, 0);
+	if (endp == buf || val < min_val || val > max_val)
+		return -EINVAL;
+
+	rtnl_lock();
+	ret = (*set) (to_net_dev(d), val);
+	if (!ret)
+		ret = len;
+	rtnl_unlock();
+	return ret;
+}
+
+#define CXGB3_SHOW(name, val_expr) \
+static ssize_t format_##name(struct net_device *dev, char *buf) \
+{ \
+	struct port_info *pi = netdev_priv(dev); \
+	struct adapter *adap = pi->adapter; \
+	return sprintf(buf, "%u\n", val_expr); \
+} \
+static ssize_t show_##name(struct device *d, struct device_attribute *attr, \
+			   char *buf) \
+{ \
+	return attr_show(d, buf, format_##name); \
+}
+
+static ssize_t set_nfilters(struct net_device *dev, unsigned int val)
+{
+	struct port_info *pi = netdev_priv(dev);
+	struct adapter *adap = pi->adapter;
+	int min_tids = is_offload(adap) ? MC5_MIN_TIDS : 0;
+
+	if (adap->flags & FULL_INIT_DONE)
+		return -EBUSY;
+	if (val && adap->params.rev == 0)
+		return -EINVAL;
+	if (val > t3_mc5_size(&adap->mc5) - adap->params.mc5.nservers -
+	    min_tids)
+		return -EINVAL;
+	adap->params.mc5.nfilters = val;
+	return 0;
+}
+
+static ssize_t store_nfilters(struct device *d, struct device_attribute *attr,
+			      const char *buf, size_t len)
+{
+	return attr_store(d, buf, len, set_nfilters, 0, ~0);
+}
+
+static ssize_t set_nservers(struct net_device *dev, unsigned int val)
+{
+	struct port_info *pi = netdev_priv(dev);
+	struct adapter *adap = pi->adapter;
+
+	if (adap->flags & FULL_INIT_DONE)
+		return -EBUSY;
+	if (val > t3_mc5_size(&adap->mc5) - adap->params.mc5.nfilters -
+	    MC5_MIN_TIDS)
+		return -EINVAL;
+	adap->params.mc5.nservers = val;
+	return 0;
+}
+
+static ssize_t store_nservers(struct device *d, struct device_attribute *attr,
+			      const char *buf, size_t len)
+{
+	return attr_store(d, buf, len, set_nservers, 0, ~0);
+}
+
+#define CXGB3_ATTR_R(name, val_expr) \
+CXGB3_SHOW(name, val_expr) \
+static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
+
+#define CXGB3_ATTR_RW(name, val_expr, store_method) \
+CXGB3_SHOW(name, val_expr) \
+static DEVICE_ATTR(name, S_IRUGO | S_IWUSR, show_##name, store_method)
+
+CXGB3_ATTR_R(cam_size, t3_mc5_size(&adap->mc5));
+CXGB3_ATTR_RW(nfilters, adap->params.mc5.nfilters, store_nfilters);
+CXGB3_ATTR_RW(nservers, adap->params.mc5.nservers, store_nservers);
+
+static struct attribute *cxgb3_attrs[] = {
+	&dev_attr_cam_size.attr,
+	&dev_attr_nfilters.attr,
+	&dev_attr_nservers.attr,
+	NULL
+};
+
+static struct attribute_group cxgb3_attr_group = {.attrs = cxgb3_attrs };
+
+static ssize_t tm_attr_show(struct device *d,
+			    char *buf, int sched)
+{
+	struct port_info *pi = netdev_priv(to_net_dev(d));
+	struct adapter *adap = pi->adapter;
+	unsigned int v, addr, bpt, cpt;
+	ssize_t len;
+
+	addr = A_TP_TX_MOD_Q1_Q0_RATE_LIMIT - sched / 2;
+	rtnl_lock();
+	t3_write_reg(adap, A_TP_TM_PIO_ADDR, addr);
+	v = t3_read_reg(adap, A_TP_TM_PIO_DATA);
+	if (sched & 1)
+		v >>= 16;
+	bpt = (v >> 8) & 0xff;
+	cpt = v & 0xff;
+	if (!cpt)
+		len = sprintf(buf, "disabled\n");
+	else {
+		v = (adap->params.vpd.cclk * 1000) / cpt;
+		len = sprintf(buf, "%u Kbps\n", (v * bpt) / 125);
+	}
+	rtnl_unlock();
+	return len;
+}
+
+static ssize_t tm_attr_store(struct device *d,
+			     const char *buf, size_t len, int sched)
+{
+	struct port_info *pi = netdev_priv(to_net_dev(d));
+	struct adapter *adap = pi->adapter;
+	unsigned int val;
+	char *endp;
+	ssize_t ret;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	val = simple_strtoul(buf, &endp, 0);
+	if (endp == buf || val > 10000000)
+		return -EINVAL;
+
+	rtnl_lock();
+	ret = t3_config_sched(adap, val, sched);
+	if (!ret)
+		ret = len;
+	rtnl_unlock();
+	return ret;
+}
+
+#define TM_ATTR(name, sched) \
+static ssize_t show_##name(struct device *d, struct device_attribute *attr, \
+			   char *buf) \
+{ \
+	return tm_attr_show(d, buf, sched); \
+} \
+static ssize_t store_##name(struct device *d, struct device_attribute *attr, \
+			    const char *buf, size_t len) \
+{ \
+	return tm_attr_store(d, buf, len, sched); \
+} \
+static DEVICE_ATTR(name, S_IRUGO | S_IWUSR, show_##name, store_##name)
+
+TM_ATTR(sched0, 0);
+TM_ATTR(sched1, 1);
+TM_ATTR(sched2, 2);
+TM_ATTR(sched3, 3);
+TM_ATTR(sched4, 4);
+TM_ATTR(sched5, 5);
+TM_ATTR(sched6, 6);
+TM_ATTR(sched7, 7);
+
+static struct attribute *offload_attrs[] = {
+	&dev_attr_sched0.attr,
+	&dev_attr_sched1.attr,
+	&dev_attr_sched2.attr,
+	&dev_attr_sched3.attr,
+	&dev_attr_sched4.attr,
+	&dev_attr_sched5.attr,
+	&dev_attr_sched6.attr,
+	&dev_attr_sched7.attr,
+	NULL
+};
+
+static struct attribute_group offload_attr_group = {.attrs = offload_attrs };
+
+/*
+ * Sends an sk_buff to an offload queue driver
+ * after dealing with any active network taps.
+ */
+static inline int offload_tx(struct t3cdev *tdev, struct sk_buff *skb)
+{
+	int ret;
+
+	local_bh_disable();
+	ret = t3_offload_tx(tdev, skb);
+	local_bh_enable();
+	return ret;
+}
+
+static int write_smt_entry(struct adapter *adapter, int idx)
+{
+	struct cpl_smt_write_req *req;
+	struct port_info *pi = netdev_priv(adapter->port[idx]);
+	struct sk_buff *skb = alloc_skb(sizeof(*req), GFP_KERNEL);
+
+	if (!skb)
+		return -ENOMEM;
+
+	req = (struct cpl_smt_write_req *)__skb_put(skb, sizeof(*req));
+	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SMT_WRITE_REQ, idx));
+	req->mtu_idx = NMTUS - 1;	/* should be 0 but there's a T3 bug */
+	req->iff = idx;
+	memcpy(req->src_mac0, adapter->port[idx]->dev_addr, ETH_ALEN);
+	memcpy(req->src_mac1, pi->iscsic.mac_addr, ETH_ALEN);
+	skb->priority = 1;
+	offload_tx(&adapter->tdev, skb);
+	return 0;
+}
+
+static int init_smt(struct adapter *adapter)
+{
+	int i;
+
+	for_each_port(adapter, i)
+	    write_smt_entry(adapter, i);
+	return 0;
+}
+
+static void init_port_mtus(struct adapter *adapter)
+{
+	unsigned int mtus = adapter->port[0]->mtu;
+
+	if (adapter->port[1])
+		mtus |= adapter->port[1]->mtu << 16;
+	t3_write_reg(adapter, A_TP_MTU_PORT_TABLE, mtus);
+}
+
+static int send_pktsched_cmd(struct adapter *adap, int sched, int qidx, int lo,
+			      int hi, int port)
+{
+	struct sk_buff *skb;
+	struct mngt_pktsched_wr *req;
+	int ret;
+
+	skb = alloc_skb(sizeof(*req), GFP_KERNEL);
+	if (!skb)
+		skb = adap->nofail_skb;
+	if (!skb)
+		return -ENOMEM;
+
+	req = (struct mngt_pktsched_wr *)skb_put(skb, sizeof(*req));
+	req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_MNGT));
+	req->mngt_opcode = FW_MNGTOPCODE_PKTSCHED_SET;
+	req->sched = sched;
+	req->idx = qidx;
+	req->min = lo;
+	req->max = hi;
+	req->binding = port;
+	ret = t3_mgmt_tx(adap, skb);
+	if (skb == adap->nofail_skb) {
+		adap->nofail_skb = alloc_skb(sizeof(struct cpl_set_tcb_field),
+					     GFP_KERNEL);
+		if (!adap->nofail_skb)
+			ret = -ENOMEM;
+	}
+
+	return ret;
+}
+
+static int bind_qsets(struct adapter *adap)
+{
+	int i, j, err = 0;
+
+	for_each_port(adap, i) {
+		const struct port_info *pi = adap2pinfo(adap, i);
+
+		for (j = 0; j < pi->nqsets; ++j) {
+			int ret = send_pktsched_cmd(adap, 1,
+						    pi->first_qset + j, -1,
+						    -1, i);
+			if (ret)
+				err = ret;
+		}
+	}
+
+	return err;
+}
+
+#define FW_VERSION __stringify(FW_VERSION_MAJOR) "."			\
+	__stringify(FW_VERSION_MINOR) "." __stringify(FW_VERSION_MICRO)
+#define FW_FNAME "cxgb3/t3fw-" FW_VERSION ".bin"
+#define TPSRAM_VERSION __stringify(TP_VERSION_MAJOR) "."		\
+	__stringify(TP_VERSION_MINOR) "." __stringify(TP_VERSION_MICRO)
+#define TPSRAM_NAME "cxgb3/t3%c_psram-" TPSRAM_VERSION ".bin"
+#define AEL2005_OPT_EDC_NAME "cxgb3/ael2005_opt_edc.bin"
+#define AEL2005_TWX_EDC_NAME "cxgb3/ael2005_twx_edc.bin"
+#define AEL2020_TWX_EDC_NAME "cxgb3/ael2020_twx_edc.bin"
+MODULE_FIRMWARE(FW_FNAME);
+MODULE_FIRMWARE("cxgb3/t3b_psram-" TPSRAM_VERSION ".bin");
+MODULE_FIRMWARE("cxgb3/t3c_psram-" TPSRAM_VERSION ".bin");
+MODULE_FIRMWARE(AEL2005_OPT_EDC_NAME);
+MODULE_FIRMWARE(AEL2005_TWX_EDC_NAME);
+MODULE_FIRMWARE(AEL2020_TWX_EDC_NAME);
+
+static inline const char *get_edc_fw_name(int edc_idx)
+{
+	const char *fw_name = NULL;
+
+	switch (edc_idx) {
+	case EDC_OPT_AEL2005:
+		fw_name = AEL2005_OPT_EDC_NAME;
+		break;
+	case EDC_TWX_AEL2005:
+		fw_name = AEL2005_TWX_EDC_NAME;
+		break;
+	case EDC_TWX_AEL2020:
+		fw_name = AEL2020_TWX_EDC_NAME;
+		break;
+	}
+	return fw_name;
+}
+
+int t3_get_edc_fw(struct cphy *phy, int edc_idx, int size)
+{
+	struct adapter *adapter = phy->adapter;
+	const struct firmware *fw;
+	char buf[64];
+	u32 csum;
+	const __be32 *p;
+	u16 *cache = phy->phy_cache;
+	int i, ret;
+
+	snprintf(buf, sizeof(buf), get_edc_fw_name(edc_idx));
+
+	ret = request_firmware(&fw, buf, &adapter->pdev->dev);
+	if (ret < 0) {
+		dev_err(&adapter->pdev->dev,
+			"could not upgrade firmware: unable to load %s\n",
+			buf);
+		return ret;
+	}
+
+	/* check size, take checksum in account */
+	if (fw->size > size + 4) {
+		CH_ERR(adapter, "firmware image too large %u, expected %d\n",
+		       (unsigned int)fw->size, size + 4);
+		ret = -EINVAL;
+	}
+
+	/* compute checksum */
+	p = (const __be32 *)fw->data;
+	for (csum = 0, i = 0; i < fw->size / sizeof(csum); i++)
+		csum += ntohl(p[i]);
+
+	if (csum != 0xffffffff) {
+		CH_ERR(adapter, "corrupted firmware image, checksum %u\n",
+		       csum);
+		ret = -EINVAL;
+	}
+
+	for (i = 0; i < size / 4 ; i++) {
+		*cache++ = (be32_to_cpu(p[i]) & 0xffff0000) >> 16;
+		*cache++ = be32_to_cpu(p[i]) & 0xffff;
+	}
+
+	release_firmware(fw);
+
+	return ret;
+}
+
+static int upgrade_fw(struct adapter *adap)
+{
+	int ret;
+	const struct firmware *fw;
+	struct device *dev = &adap->pdev->dev;
+
+	ret = request_firmware(&fw, FW_FNAME, dev);
+	if (ret < 0) {
+		dev_err(dev, "could not upgrade firmware: unable to load %s\n",
+			FW_FNAME);
+		return ret;
+	}
+	ret = t3_load_fw(adap, fw->data, fw->size);
+	release_firmware(fw);
+
+	if (ret == 0)
+		dev_info(dev, "successful upgrade to firmware %d.%d.%d\n",
+			 FW_VERSION_MAJOR, FW_VERSION_MINOR, FW_VERSION_MICRO);
+	else
+		dev_err(dev, "failed to upgrade to firmware %d.%d.%d\n",
+			FW_VERSION_MAJOR, FW_VERSION_MINOR, FW_VERSION_MICRO);
+
+	return ret;
+}
+
+static inline char t3rev2char(struct adapter *adapter)
+{
+	char rev = 0;
+
+	switch(adapter->params.rev) {
+	case T3_REV_B:
+	case T3_REV_B2:
+		rev = 'b';
+		break;
+	case T3_REV_C:
+		rev = 'c';
+		break;
+	}
+	return rev;
+}
+
+static int update_tpsram(struct adapter *adap)
+{
+	const struct firmware *tpsram;
+	char buf[64];
+	struct device *dev = &adap->pdev->dev;
+	int ret;
+	char rev;
+
+	rev = t3rev2char(adap);
+	if (!rev)
+		return 0;
+
+	snprintf(buf, sizeof(buf), TPSRAM_NAME, rev);
+
+	ret = request_firmware(&tpsram, buf, dev);
+	if (ret < 0) {
+		dev_err(dev, "could not load TP SRAM: unable to load %s\n",
+			buf);
+		return ret;
+	}
+
+	ret = t3_check_tpsram(adap, tpsram->data, tpsram->size);
+	if (ret)
+		goto release_tpsram;
+
+	ret = t3_set_proto_sram(adap, tpsram->data);
+	if (ret == 0)
+		dev_info(dev,
+			 "successful update of protocol engine "
+			 "to %d.%d.%d\n",
+			 TP_VERSION_MAJOR, TP_VERSION_MINOR, TP_VERSION_MICRO);
+	else
+		dev_err(dev, "failed to update of protocol engine %d.%d.%d\n",
+			TP_VERSION_MAJOR, TP_VERSION_MINOR, TP_VERSION_MICRO);
+	if (ret)
+		dev_err(dev, "loading protocol SRAM failed\n");
+
+release_tpsram:
+	release_firmware(tpsram);
+
+	return ret;
+}
+
+/**
+ * t3_synchronize_rx - wait for current Rx processing on a port to complete
+ * @adap: the adapter
+ * @p: the port
+ *
+ * Ensures that current Rx processing on any of the queues associated with
+ * the given port completes before returning.  We do this by acquiring and
+ * releasing the locks of the response queues associated with the port.
+ */
+static void t3_synchronize_rx(struct adapter *adap, const struct port_info *p)
+{
+	int i;
+
+	for (i = p->first_qset; i < p->first_qset + p->nqsets; i++) {
+		struct sge_rspq *q = &adap->sge.qs[i].rspq;
+
+		spin_lock_irq(&q->lock);
+		spin_unlock_irq(&q->lock);
+	}
+}
+
+static void cxgb_vlan_mode(struct net_device *dev, netdev_features_t features)
+{
+	struct port_info *pi = netdev_priv(dev);
+	struct adapter *adapter = pi->adapter;
+
+	if (adapter->params.rev > 0) {
+		t3_set_vlan_accel(adapter, 1 << pi->port_id,
+				  features & NETIF_F_HW_VLAN_CTAG_RX);
+	} else {
+		/* single control for all ports */
+		unsigned int i, have_vlans = features & NETIF_F_HW_VLAN_CTAG_RX;
+
+		for_each_port(adapter, i)
+			have_vlans |=
+				adapter->port[i]->features &
+				NETIF_F_HW_VLAN_CTAG_RX;
+
+		t3_set_vlan_accel(adapter, 1, have_vlans);
+	}
+	t3_synchronize_rx(adapter, pi);
+}
+
+/**
+ *	cxgb_up - enable the adapter
+ *	@adapter: adapter being enabled
+ *
+ *	Called when the first port is enabled, this function performs the
+ *	actions necessary to make an adapter operational, such as completing
+ *	the initialization of HW modules, and enabling interrupts.
+ *
+ *	Must be called with the rtnl lock held.
+ */
+static int cxgb_up(struct adapter *adap)
+{
+	int i, err;
+
+	if (!(adap->flags & FULL_INIT_DONE)) {
+		err = t3_check_fw_version(adap);
+		if (err == -EINVAL) {
+			err = upgrade_fw(adap);
+			CH_WARN(adap, "FW upgrade to %d.%d.%d %s\n",
+				FW_VERSION_MAJOR, FW_VERSION_MINOR,
+				FW_VERSION_MICRO, err ? "failed" : "succeeded");
+		}
+
+		err = t3_check_tpsram_version(adap);
+		if (err == -EINVAL) {
+			err = update_tpsram(adap);
+			CH_WARN(adap, "TP upgrade to %d.%d.%d %s\n",
+				TP_VERSION_MAJOR, TP_VERSION_MINOR,
+				TP_VERSION_MICRO, err ? "failed" : "succeeded");
+		}
+
+		/*
+		 * Clear interrupts now to catch errors if t3_init_hw fails.
+		 * We clear them again later as initialization may trigger
+		 * conditions that can interrupt.
+		 */
+		t3_intr_clear(adap);
+
+		err = t3_init_hw(adap, 0);
+		if (err)
+			goto out;
+
+		t3_set_reg_field(adap, A_TP_PARA_REG5, 0, F_RXDDPOFFINIT);
+		t3_write_reg(adap, A_ULPRX_TDDP_PSZ, V_HPZ0(PAGE_SHIFT - 12));
+
+		err = setup_sge_qsets(adap);
+		if (err)
+			goto out;
+
+		for_each_port(adap, i)
+			cxgb_vlan_mode(adap->port[i], adap->port[i]->features);
+
+		setup_rss(adap);
+		if (!(adap->flags & NAPI_INIT))
+			init_napi(adap);
+
+		t3_start_sge_timers(adap);
+		adap->flags |= FULL_INIT_DONE;
+	}
+
+	t3_intr_clear(adap);
+
+	if (adap->flags & USING_MSIX) {
+		name_msix_vecs(adap);
+		err = request_irq(adap->msix_info[0].vec,
+				  t3_async_intr_handler, 0,
+				  adap->msix_info[0].desc, adap);
+		if (err)
+			goto irq_err;
+
+		err = request_msix_data_irqs(adap);
+		if (err) {
+			free_irq(adap->msix_info[0].vec, adap);
+			goto irq_err;
+		}
+	} else if ((err = request_irq(adap->pdev->irq,
+				      t3_intr_handler(adap,
+						      adap->sge.qs[0].rspq.
+						      polling),
+				      (adap->flags & USING_MSI) ?
+				       0 : IRQF_SHARED,
+				      adap->name, adap)))
+		goto irq_err;
+
+	enable_all_napi(adap);
+	t3_sge_start(adap);
+	t3_intr_enable(adap);
+
+	if (adap->params.rev >= T3_REV_C && !(adap->flags & TP_PARITY_INIT) &&
+	    is_offload(adap) && init_tp_parity(adap) == 0)
+		adap->flags |= TP_PARITY_INIT;
+
+	if (adap->flags & TP_PARITY_INIT) {
+		t3_write_reg(adap, A_TP_INT_CAUSE,
+			     F_CMCACHEPERR | F_ARPLUTPERR);
+		t3_write_reg(adap, A_TP_INT_ENABLE, 0x7fbfffff);
+	}
+
+	if (!(adap->flags & QUEUES_BOUND)) {
+		int ret = bind_qsets(adap);
+
+		if (ret < 0) {
+			CH_ERR(adap, "failed to bind qsets, err %d\n", ret);
+			t3_intr_disable(adap);
+			free_irq_resources(adap);
+			err = ret;
+			goto out;
+		}
+		adap->flags |= QUEUES_BOUND;
+	}
+
+out:
+	return err;
+irq_err:
+	CH_ERR(adap, "request_irq failed, err %d\n", err);
+	goto out;
+}
+
+/*
+ * Release resources when all the ports and offloading have been stopped.
+ */
+static void cxgb_down(struct adapter *adapter, int on_wq)
+{
+	t3_sge_stop(adapter);
+	spin_lock_irq(&adapter->work_lock);	/* sync with PHY intr task */
+	t3_intr_disable(adapter);
+	spin_unlock_irq(&adapter->work_lock);
+
+	free_irq_resources(adapter);
+	quiesce_rx(adapter);
+	t3_sge_stop(adapter);
+	if (!on_wq)
+		flush_workqueue(cxgb3_wq);/* wait for external IRQ handler */
+}
+
+static void schedule_chk_task(struct adapter *adap)
+{
+	unsigned int timeo;
+
+	timeo = adap->params.linkpoll_period ?
+	    (HZ * adap->params.linkpoll_period) / 10 :
+	    adap->params.stats_update_period * HZ;
+	if (timeo)
+		queue_delayed_work(cxgb3_wq, &adap->adap_check_task, timeo);
+}
+
+static int offload_open(struct net_device *dev)
+{
+	struct port_info *pi = netdev_priv(dev);
+	struct adapter *adapter = pi->adapter;
+	struct t3cdev *tdev = dev2t3cdev(dev);
+	int adap_up = adapter->open_device_map & PORT_MASK;
+	int err;
+
+	if (test_and_set_bit(OFFLOAD_DEVMAP_BIT, &adapter->open_device_map))
+		return 0;
+
+	if (!adap_up && (err = cxgb_up(adapter)) < 0)
+		goto out;
+
+	t3_tp_set_offload_mode(adapter, 1);
+	tdev->lldev = adapter->port[0];
+	err = cxgb3_offload_activate(adapter);
+	if (err)
+		goto out;
+
+	init_port_mtus(adapter);
+	t3_load_mtus(adapter, adapter->params.mtus, adapter->params.a_wnd,
+		     adapter->params.b_wnd,
+		     adapter->params.rev == 0 ?
+		     adapter->port[0]->mtu : 0xffff);
+	init_smt(adapter);
+
+	if (sysfs_create_group(&tdev->lldev->dev.kobj, &offload_attr_group))
+		dev_dbg(&dev->dev, "cannot create sysfs group\n");
+
+	/* Call back all registered clients */
+	cxgb3_add_clients(tdev);
+
+out:
+	/* restore them in case the offload module has changed them */
+	if (err) {
+		t3_tp_set_offload_mode(adapter, 0);
+		clear_bit(OFFLOAD_DEVMAP_BIT, &adapter->open_device_map);
+		cxgb3_set_dummy_ops(tdev);
+	}
+	return err;
+}
+
+static int offload_close(struct t3cdev *tdev)
+{
+	struct adapter *adapter = tdev2adap(tdev);
+	struct t3c_data *td = T3C_DATA(tdev);
+
+	if (!test_bit(OFFLOAD_DEVMAP_BIT, &adapter->open_device_map))
+		return 0;
+
+	/* Call back all registered clients */
+	cxgb3_remove_clients(tdev);
+
+	sysfs_remove_group(&tdev->lldev->dev.kobj, &offload_attr_group);
+
+	/* Flush work scheduled while releasing TIDs */
+	flush_work(&td->tid_release_task);
+
+	tdev->lldev = NULL;
+	cxgb3_set_dummy_ops(tdev);
+	t3_tp_set_offload_mode(adapter, 0);
+	clear_bit(OFFLOAD_DEVMAP_BIT, &adapter->open_device_map);
+
+	if (!adapter->open_device_map)
+		cxgb_down(adapter, 0);
+
+	cxgb3_offload_deactivate(adapter);
+	return 0;
+}
+
+static int cxgb_open(struct net_device *dev)
+{
+	struct port_info *pi = netdev_priv(dev);
+	struct adapter *adapter = pi->adapter;
+	int other_ports = adapter->open_device_map & PORT_MASK;
+	int err;
+
+	if (!adapter->open_device_map && (err = cxgb_up(adapter)) < 0)
+		return err;
+
+	set_bit(pi->port_id, &adapter->open_device_map);
+	if (is_offload(adapter) && !ofld_disable) {
+		err = offload_open(dev);
+		if (err)
+			pr_warn("Could not initialize offload capabilities\n");
+	}
+
+	netif_set_real_num_tx_queues(dev, pi->nqsets);
+	err = netif_set_real_num_rx_queues(dev, pi->nqsets);
+	if (err)
+		return err;
+	link_start(dev);
+	t3_port_intr_enable(adapter, pi->port_id);
+	netif_tx_start_all_queues(dev);
+	if (!other_ports)
+		schedule_chk_task(adapter);
+
+	cxgb3_event_notify(&adapter->tdev, OFFLOAD_PORT_UP, pi->port_id);
+	return 0;
+}
+
+static int __cxgb_close(struct net_device *dev, int on_wq)
+{
+	struct port_info *pi = netdev_priv(dev);
+	struct adapter *adapter = pi->adapter;
+
+	
+	if (!adapter->open_device_map)
+		return 0;
+
+	/* Stop link fault interrupts */
+	t3_xgm_intr_disable(adapter, pi->port_id);
+	t3_read_reg(adapter, A_XGM_INT_STATUS + pi->mac.offset);
+
+	t3_port_intr_disable(adapter, pi->port_id);
+	netif_tx_stop_all_queues(dev);
+	pi->phy.ops->power_down(&pi->phy, 1);
+	netif_carrier_off(dev);
+	t3_mac_disable(&pi->mac, MAC_DIRECTION_TX | MAC_DIRECTION_RX);
+
+	spin_lock_irq(&adapter->work_lock);	/* sync with update task */
+	clear_bit(pi->port_id, &adapter->open_device_map);
+	spin_unlock_irq(&adapter->work_lock);
+
+	if (!(adapter->open_device_map & PORT_MASK))
+		cancel_delayed_work_sync(&adapter->adap_check_task);
+
+	if (!adapter->open_device_map)
+		cxgb_down(adapter, on_wq);
+
+	cxgb3_event_notify(&adapter->tdev, OFFLOAD_PORT_DOWN, pi->port_id);
+	return 0;
+}
+
+static int cxgb_close(struct net_device *dev)
+{
+	return __cxgb_close(dev, 0);
+}
+
+static struct net_device_stats *cxgb_get_stats(struct net_device *dev)
+{
+	struct port_info *pi = netdev_priv(dev);
+	struct adapter *adapter = pi->adapter;
+	struct net_device_stats *ns = &pi->netstats;
+	const struct mac_stats *pstats;
+
+	spin_lock(&adapter->stats_lock);
+	pstats = t3_mac_update_stats(&pi->mac);
+	spin_unlock(&adapter->stats_lock);
+
+	ns->tx_bytes = pstats->tx_octets;
+	ns->tx_packets = pstats->tx_frames;
+	ns->rx_bytes = pstats->rx_octets;
+	ns->rx_packets = pstats->rx_frames;
+	ns->multicast = pstats->rx_mcast_frames;
+
+	ns->tx_errors = pstats->tx_underrun;
+	ns->rx_errors = pstats->rx_symbol_errs + pstats->rx_fcs_errs +
+	    pstats->rx_too_long + pstats->rx_jabber + pstats->rx_short +
+	    pstats->rx_fifo_ovfl;
+
+	/* detailed rx_errors */
+	ns->rx_length_errors = pstats->rx_jabber + pstats->rx_too_long;
+	ns->rx_over_errors = 0;
+	ns->rx_crc_errors = pstats->rx_fcs_errs;
+	ns->rx_frame_errors = pstats->rx_symbol_errs;
+	ns->rx_fifo_errors = pstats->rx_fifo_ovfl;
+	ns->rx_missed_errors = pstats->rx_cong_drops;
+
+	/* detailed tx_errors */
+	ns->tx_aborted_errors = 0;
+	ns->tx_carrier_errors = 0;
+	ns->tx_fifo_errors = pstats->tx_underrun;
+	ns->tx_heartbeat_errors = 0;
+	ns->tx_window_errors = 0;
+	return ns;
+}
+
+static u32 get_msglevel(struct net_device *dev)
+{
+	struct port_info *pi = netdev_priv(dev);
+	struct adapter *adapter = pi->adapter;
+
+	return adapter->msg_enable;
+}
+
+static void set_msglevel(struct net_device *dev, u32 val)
+{
+	struct port_info *pi = netdev_priv(dev);
+	struct adapter *adapter = pi->adapter;
+
+	adapter->msg_enable = val;
+}
+
+static char stats_strings[][ETH_GSTRING_LEN] = {
+	"TxOctetsOK         ",
+	"TxFramesOK         ",
+	"TxMulticastFramesOK",
+	"TxBroadcastFramesOK",
+	"TxPauseFrames      ",
+	"TxUnderrun         ",
+	"TxExtUnderrun      ",
+
+	"TxFrames64         ",
+	"TxFrames65To127    ",
+	"TxFrames128To255   ",
+	"TxFrames256To511   ",
+	"TxFrames512To1023  ",
+	"TxFrames1024To1518 ",
+	"TxFrames1519ToMax  ",
+
+	"RxOctetsOK         ",
+	"RxFramesOK         ",
+	"RxMulticastFramesOK",
+	"RxBroadcastFramesOK",
+	"RxPauseFrames      ",
+	"RxFCSErrors        ",
+	"RxSymbolErrors     ",
+	"RxShortErrors      ",
+	"RxJabberErrors     ",
+	"RxLengthErrors     ",
+	"RxFIFOoverflow     ",
+
+	"RxFrames64         ",
+	"RxFrames65To127    ",
+	"RxFrames128To255   ",
+	"RxFrames256To511   ",
+	"RxFrames512To1023  ",
+	"RxFrames1024To1518 ",
+	"RxFrames1519ToMax  ",
+
+	"PhyFIFOErrors      ",
+	"TSO                ",
+	"VLANextractions    ",
+	"VLANinsertions     ",
+	"TxCsumOffload      ",
+	"RxCsumGood         ",
+	"LroAggregated      ",
+	"LroFlushed         ",
+	"LroNoDesc          ",
+	"RxDrops            ",
+
+	"CheckTXEnToggled   ",
+	"CheckResets        ",
+
+	"LinkFaults         ",
+};
+
+static int get_sset_count(struct net_device *dev, int sset)
+{
+	switch (sset) {
+	case ETH_SS_STATS:
+		return ARRAY_SIZE(stats_strings);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+#define T3_REGMAP_SIZE (3 * 1024)
+
+static int get_regs_len(struct net_device *dev)
+{
+	return T3_REGMAP_SIZE;
+}
+
+static int get_eeprom_len(struct net_device *dev)
+{
+	return EEPROMSIZE;
+}
+
+static void get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info)
+{
+	struct port_info *pi = netdev_priv(dev);
+	struct adapter *adapter = pi->adapter;
+	u32 fw_vers = 0;
+	u32 tp_vers = 0;
+
+	spin_lock(&adapter->stats_lock);
+	t3_get_fw_version(adapter, &fw_vers);
+	t3_get_tp_version(adapter, &tp_vers);
+	spin_unlock(&adapter->stats_lock);
+
+	strlcpy(info->driver, DRV_NAME, sizeof(info->driver));
+	strlcpy(info->version, DRV_VERSION, sizeof(info->version));
+	strlcpy(info->bus_info, pci_name(adapter->pdev),
+		sizeof(info->bus_info));
+	if (fw_vers)
+		snprintf(info->fw_version, sizeof(info->fw_version),
+			 "%s %u.%u.%u TP %u.%u.%u",
+			 G_FW_VERSION_TYPE(fw_vers) ? "T" : "N",
+			 G_FW_VERSION_MAJOR(fw_vers),
+			 G_FW_VERSION_MINOR(fw_vers),
+			 G_FW_VERSION_MICRO(fw_vers),
+			 G_TP_VERSION_MAJOR(tp_vers),
+			 G_TP_VERSION_MINOR(tp_vers),
+			 G_TP_VERSION_MICRO(tp_vers));
+}
+
+static void get_strings(struct net_device *dev, u32 stringset, u8 * data)
+{
+	if (stringset == ETH_SS_STATS)
+		memcpy(data, stats_strings, sizeof(stats_strings));
+}
+
+static unsigned long collect_sge_port_stats(struct adapter *adapter,
+					    struct port_info *p, int idx)
+{
+	int i;
+	unsigned long tot = 0;
+
+	for (i = p->first_qset; i < p->first_qset + p->nqsets; ++i)
+		tot += adapter->sge.qs[i].port_stats[idx];
+	return tot;
+}
+
+static void get_stats(struct net_device *dev, struct ethtool_stats *stats,
+		      u64 *data)
+{
+	struct port_info *pi = netdev_priv(dev);
+	struct adapter *adapter = pi->adapter;
+	const struct mac_stats *s;
+
+	spin_lock(&adapter->stats_lock);
+	s = t3_mac_update_stats(&pi->mac);
+	spin_unlock(&adapter->stats_lock);
+
+	*data++ = s->tx_octets;
+	*data++ = s->tx_frames;
+	*data++ = s->tx_mcast_frames;
+	*data++ = s->tx_bcast_frames;
+	*data++ = s->tx_pause;
+	*data++ = s->tx_underrun;
+	*data++ = s->tx_fifo_urun;
+
+	*data++ = s->tx_frames_64;
+	*data++ = s->tx_frames_65_127;
+	*data++ = s->tx_frames_128_255;
+	*data++ = s->tx_frames_256_511;
+	*data++ = s->tx_frames_512_1023;
+	*data++ = s->tx_frames_1024_1518;
+	*data++ = s->tx_frames_1519_max;
+
+	*data++ = s->rx_octets;
+	*data++ = s->rx_frames;
+	*data++ = s->rx_mcast_frames;
+	*data++ = s->rx_bcast_frames;
+	*data++ = s->rx_pause;
+	*data++ = s->rx_fcs_errs;
+	*data++ = s->rx_symbol_errs;
+	*data++ = s->rx_short;
+	*data++ = s->rx_jabber;
+	*data++ = s->rx_too_long;
+	*data++ = s->rx_fifo_ovfl;
+
+	*data++ = s->rx_frames_64;
+	*data++ = s->rx_frames_65_127;
+	*data++ = s->rx_frames_128_255;
+	*data++ = s->rx_frames_256_511;
+	*data++ = s->rx_frames_512_1023;
+	*data++ = s->rx_frames_1024_1518;
+	*data++ = s->rx_frames_1519_max;
+
+	*data++ = pi->phy.fifo_errors;
+
+	*data++ = collect_sge_port_stats(adapter, pi, SGE_PSTAT_TSO);
+	*data++ = collect_sge_port_stats(adapter, pi, SGE_PSTAT_VLANEX);
+	*data++ = collect_sge_port_stats(adapter, pi, SGE_PSTAT_VLANINS);
+	*data++ = collect_sge_port_stats(adapter, pi, SGE_PSTAT_TX_CSUM);
+	*data++ = collect_sge_port_stats(adapter, pi, SGE_PSTAT_RX_CSUM_GOOD);
+	*data++ = 0;
+	*data++ = 0;
+	*data++ = 0;
+	*data++ = s->rx_cong_drops;
+
+	*data++ = s->num_toggled;
+	*data++ = s->num_resets;
+
+	*data++ = s->link_faults;
+}
+
+static inline void reg_block_dump(struct adapter *ap, void *buf,
+				  unsigned int start, unsigned int end)
+{
+	u32 *p = buf + start;
+
+	for (; start <= end; start += sizeof(u32))
+		*p++ = t3_read_reg(ap, start);
+}
+
+static void get_regs(struct net_device *dev, struct ethtool_regs *regs,
+		     void *buf)
+{
+	struct port_info *pi = netdev_priv(dev);
+	struct adapter *ap = pi->adapter;
+
+	/*
+	 * Version scheme:
+	 * bits 0..9: chip version
+	 * bits 10..15: chip revision
+	 * bit 31: set for PCIe cards
+	 */
+	regs->version = 3 | (ap->params.rev << 10) | (is_pcie(ap) << 31);
+
+	/*
+	 * We skip the MAC statistics registers because they are clear-on-read.
+	 * Also reading multi-register stats would need to synchronize with the
+	 * periodic mac stats accumulation.  Hard to justify the complexity.
+	 */
+	memset(buf, 0, T3_REGMAP_SIZE);
+	reg_block_dump(ap, buf, 0, A_SG_RSPQ_CREDIT_RETURN);
+	reg_block_dump(ap, buf, A_SG_HI_DRB_HI_THRSH, A_ULPRX_PBL_ULIMIT);
+	reg_block_dump(ap, buf, A_ULPTX_CONFIG, A_MPS_INT_CAUSE);
+	reg_block_dump(ap, buf, A_CPL_SWITCH_CNTRL, A_CPL_MAP_TBL_DATA);
+	reg_block_dump(ap, buf, A_SMB_GLOBAL_TIME_CFG, A_XGM_SERDES_STAT3);
+	reg_block_dump(ap, buf, A_XGM_SERDES_STATUS0,
+		       XGM_REG(A_XGM_SERDES_STAT3, 1));
+	reg_block_dump(ap, buf, XGM_REG(A_XGM_SERDES_STATUS0, 1),
+		       XGM_REG(A_XGM_RX_SPI4_SOP_EOP_CNT, 1));
+}
+
+static int restart_autoneg(struct net_device *dev)
+{
+	struct port_info *p = netdev_priv(dev);
+
+	if (!netif_running(dev))
+		return -EAGAIN;
+	if (p->link_config.autoneg != AUTONEG_ENABLE)
+		return -EINVAL;
+	p->phy.ops->autoneg_restart(&p->phy);
+	return 0;
+}
+
+static int set_phys_id(struct net_device *dev,
+		       enum ethtool_phys_id_state state)
+{
+	struct port_info *pi = netdev_priv(dev);
+	struct adapter *adapter = pi->adapter;
+
+	switch (state) {
+	case ETHTOOL_ID_ACTIVE:
+		return 1;	/* cycle on/off once per second */
+
+	case ETHTOOL_ID_OFF:
+		t3_set_reg_field(adapter, A_T3DBG_GPIO_EN, F_GPIO0_OUT_VAL, 0);
+		break;
+
+	case ETHTOOL_ID_ON:
+	case ETHTOOL_ID_INACTIVE:
+		t3_set_reg_field(adapter, A_T3DBG_GPIO_EN, F_GPIO0_OUT_VAL,
+			 F_GPIO0_OUT_VAL);
+	}
+
+	return 0;
+}
+
+static int get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
+{
+	struct port_info *p = netdev_priv(dev);
+
+	cmd->supported = p->link_config.supported;
+	cmd->advertising = p->link_config.advertising;
+
+	if (netif_carrier_ok(dev)) {
+		ethtool_cmd_speed_set(cmd, p->link_config.speed);
+		cmd->duplex = p->link_config.duplex;
+	} else {
+		ethtool_cmd_speed_set(cmd, SPEED_UNKNOWN);
+		cmd->duplex = DUPLEX_UNKNOWN;
+	}
+
+	cmd->port = (cmd->supported & SUPPORTED_TP) ? PORT_TP : PORT_FIBRE;
+	cmd->phy_address = p->phy.mdio.prtad;
+	cmd->transceiver = XCVR_EXTERNAL;
+	cmd->autoneg = p->link_config.autoneg;
+	cmd->maxtxpkt = 0;
+	cmd->maxrxpkt = 0;
+	return 0;
+}
+
+static int speed_duplex_to_caps(int speed, int duplex)
+{
+	int cap = 0;
+
+	switch (speed) {
+	case SPEED_10:
+		if (duplex == DUPLEX_FULL)
+			cap = SUPPORTED_10baseT_Full;
+		else
+			cap = SUPPORTED_10baseT_Half;
+		break;
+	case SPEED_100:
+		if (duplex == DUPLEX_FULL)
+			cap = SUPPORTED_100baseT_Full;
+		else
+			cap = SUPPORTED_100baseT_Half;
+		break;
+	case SPEED_1000:
+		if (duplex == DUPLEX_FULL)
+			cap = SUPPORTED_1000baseT_Full;
+		else
+			cap = SUPPORTED_1000baseT_Half;
+		break;
+	case SPEED_10000:
+		if (duplex == DUPLEX_FULL)
+			cap = SUPPORTED_10000baseT_Full;
+	}
+	return cap;
+}
+
+#define ADVERTISED_MASK (ADVERTISED_10baseT_Half | ADVERTISED_10baseT_Full | \
+		      ADVERTISED_100baseT_Half | ADVERTISED_100baseT_Full | \
+		      ADVERTISED_1000baseT_Half | ADVERTISED_1000baseT_Full | \
+		      ADVERTISED_10000baseT_Full)
+
+static int set_settings(struct net_device *dev, struct ethtool_cmd *cmd)
+{
+	struct port_info *p = netdev_priv(dev);
+	struct link_config *lc = &p->link_config;
+
+	if (!(lc->supported & SUPPORTED_Autoneg)) {
+		/*
+		 * PHY offers a single speed/duplex.  See if that's what's
+		 * being requested.
+		 */
+		if (cmd->autoneg == AUTONEG_DISABLE) {
+			u32 speed = ethtool_cmd_speed(cmd);
+			int cap = speed_duplex_to_caps(speed, cmd->duplex);
+			if (lc->supported & cap)
+				return 0;
+		}
+		return -EINVAL;
+	}
+
+	if (cmd->autoneg == AUTONEG_DISABLE) {
+		u32 speed = ethtool_cmd_speed(cmd);
+		int cap = speed_duplex_to_caps(speed, cmd->duplex);
+
+		if (!(lc->supported & cap) || (speed == SPEED_1000))
+			return -EINVAL;
+		lc->requested_speed = speed;
+		lc->requested_duplex = cmd->duplex;
+		lc->advertising = 0;
+	} else {
+		cmd->advertising &= ADVERTISED_MASK;
+		cmd->advertising &= lc->supported;
+		if (!cmd->advertising)
+			return -EINVAL;
+		lc->requested_speed = SPEED_INVALID;
+		lc->requested_duplex = DUPLEX_INVALID;
+		lc->advertising = cmd->advertising | ADVERTISED_Autoneg;
+	}
+	lc->autoneg = cmd->autoneg;
+	if (netif_running(dev))
+		t3_link_start(&p->phy, &p->mac, lc);
+	return 0;
+}
+
+static void get_pauseparam(struct net_device *dev,
+			   struct ethtool_pauseparam *epause)
+{
+	struct port_info *p = netdev_priv(dev);
+
+	epause->autoneg = (p->link_config.requested_fc & PAUSE_AUTONEG) != 0;
+	epause->rx_pause = (p->link_config.fc & PAUSE_RX) != 0;
+	epause->tx_pause = (p->link_config.fc & PAUSE_TX) != 0;
+}
+
+static int set_pauseparam(struct net_device *dev,
+			  struct ethtool_pauseparam *epause)
+{
+	struct port_info *p = netdev_priv(dev);
+	struct link_config *lc = &p->link_config;
+
+	if (epause->autoneg == AUTONEG_DISABLE)
+		lc->requested_fc = 0;
+	else if (lc->supported & SUPPORTED_Autoneg)
+		lc->requested_fc = PAUSE_AUTONEG;
+	else
+		return -EINVAL;
+
+	if (epause->rx_pause)
+		lc->requested_fc |= PAUSE_RX;
+	if (epause->tx_pause)
+		lc->requested_fc |= PAUSE_TX;
+	if (lc->autoneg == AUTONEG_ENABLE) {
+		if (netif_running(dev))
+			t3_link_start(&p->phy, &p->mac, lc);
+	} else {
+		lc->fc = lc->requested_fc & (PAUSE_RX | PAUSE_TX);
+		if (netif_running(dev))
+			t3_mac_set_speed_duplex_fc(&p->mac, -1, -1, lc->fc);
+	}
+	return 0;
+}
+
+static void get_sge_param(struct net_device *dev, struct ethtool_ringparam *e)
+{
+	struct port_info *pi = netdev_priv(dev);
+	struct adapter *adapter = pi->adapter;
+	const struct qset_params *q = &adapter->params.sge.qset[pi->first_qset];
+
+	e->rx_max_pending = MAX_RX_BUFFERS;
+	e->rx_jumbo_max_pending = MAX_RX_JUMBO_BUFFERS;
+	e->tx_max_pending = MAX_TXQ_ENTRIES;
+
+	e->rx_pending = q->fl_size;
+	e->rx_mini_pending = q->rspq_size;
+	e->rx_jumbo_pending = q->jumbo_size;
+	e->tx_pending = q->txq_size[0];
+}
+
+static int set_sge_param(struct net_device *dev, struct ethtool_ringparam *e)
+{
+	struct port_info *pi = netdev_priv(dev);
+	struct adapter *adapter = pi->adapter;
+	struct qset_params *q;
+	int i;
+
+	if (e->rx_pending > MAX_RX_BUFFERS ||
+	    e->rx_jumbo_pending > MAX_RX_JUMBO_BUFFERS ||
+	    e->tx_pending > MAX_TXQ_ENTRIES ||
+	    e->rx_mini_pending > MAX_RSPQ_ENTRIES ||
+	    e->rx_mini_pending < MIN_RSPQ_ENTRIES ||
+	    e->rx_pending < MIN_FL_ENTRIES ||
+	    e->rx_jumbo_pending < MIN_FL_ENTRIES ||
+	    e->tx_pending < adapter->params.nports * MIN_TXQ_ENTRIES)
+		return -EINVAL;
+
+	if (adapter->flags & FULL_INIT_DONE)
+		return -EBUSY;
+
+	q = &adapter->params.sge.qset[pi->first_qset];
+	for (i = 0; i < pi->nqsets; ++i, ++q) {
+		q->rspq_size = e->rx_mini_pending;
+		q->fl_size = e->rx_pending;
+		q->jumbo_size = e->rx_jumbo_pending;
+		q->txq_size[0] = e->tx_pending;
+		q->txq_size[1] = e->tx_pending;
+		q->txq_size[2] = e->tx_pending;
+	}
+	return 0;
+}
+
+static int set_coalesce(struct net_device *dev, struct ethtool_coalesce *c)
+{
+	struct port_info *pi = netdev_priv(dev);
+	struct adapter *adapter = pi->adapter;
+	struct qset_params *qsp;
+	struct sge_qset *qs;
+	int i;
+
+	if (c->rx_coalesce_usecs * 10 > M_NEWTIMER)
+		return -EINVAL;
+
+	for (i = 0; i < pi->nqsets; i++) {
+		qsp = &adapter->params.sge.qset[i];
+		qs = &adapter->sge.qs[i];
+		qsp->coalesce_usecs = c->rx_coalesce_usecs;
+		t3_update_qset_coalesce(qs, qsp);
+	}
+
+	return 0;
+}
+
+static int get_coalesce(struct net_device *dev, struct ethtool_coalesce *c)
+{
+	struct port_info *pi = netdev_priv(dev);
+	struct adapter *adapter = pi->adapter;
+	struct qset_params *q = adapter->params.sge.qset;
+
+	c->rx_coalesce_usecs = q->coalesce_usecs;
+	return 0;
+}
+
+static int get_eeprom(struct net_device *dev, struct ethtool_eeprom *e,
+		      u8 * data)
+{
+	struct port_info *pi = netdev_priv(dev);
+	struct adapter *adapter = pi->adapter;
+	int i, err = 0;
+
+	u8 *buf = kmalloc(EEPROMSIZE, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	e->magic = EEPROM_MAGIC;
+	for (i = e->offset & ~3; !err && i < e->offset + e->len; i += 4)
+		err = t3_seeprom_read(adapter, i, (__le32 *) & buf[i]);
+
+	if (!err)
+		memcpy(data, buf + e->offset, e->len);
+	kfree(buf);
+	return err;
+}
+
+static int set_eeprom(struct net_device *dev, struct ethtool_eeprom *eeprom,
+		      u8 * data)
+{
+	struct port_info *pi = netdev_priv(dev);
+	struct adapter *adapter = pi->adapter;
+	u32 aligned_offset, aligned_len;
+	__le32 *p;
+	u8 *buf;
+	int err;
+
+	if (eeprom->magic != EEPROM_MAGIC)
+		return -EINVAL;
+
+	aligned_offset = eeprom->offset & ~3;
+	aligned_len = (eeprom->len + (eeprom->offset & 3) + 3) & ~3;
+
+	if (aligned_offset != eeprom->offset || aligned_len != eeprom->len) {
+		buf = kmalloc(aligned_len, GFP_KERNEL);
+		if (!buf)
+			return -ENOMEM;
+		err = t3_seeprom_read(adapter, aligned_offset, (__le32 *) buf);
+		if (!err && aligned_len > 4)
+			err = t3_seeprom_read(adapter,
+					      aligned_offset + aligned_len - 4,
+					      (__le32 *) & buf[aligned_len - 4]);
+		if (err)
+			goto out;
+		memcpy(buf + (eeprom->offset & 3), data, eeprom->len);
+	} else
+		buf = data;
+
+	err = t3_seeprom_wp(adapter, 0);
+	if (err)
+		goto out;
+
+	for (p = (__le32 *) buf; !err && aligned_len; aligned_len -= 4, p++) {
+		err = t3_seeprom_write(adapter, aligned_offset, *p);
+		aligned_offset += 4;
+	}
+
+	if (!err)
+		err = t3_seeprom_wp(adapter, 1);
+out:
+	if (buf != data)
+		kfree(buf);
+	return err;
+}
+
+static void get_wol(struct net_device *dev, struct ethtool_wolinfo *wol)
+{
+	wol->supported = 0;
+	wol->wolopts = 0;
+	memset(&wol->sopass, 0, sizeof(wol->sopass));
+}
+
+static const struct ethtool_ops cxgb_ethtool_ops = {
+	.get_settings = get_settings,
+	.set_settings = set_settings,
+	.get_drvinfo = get_drvinfo,
+	.get_msglevel = get_msglevel,
+	.set_msglevel = set_msglevel,
+	.get_ringparam = get_sge_param,
+	.set_ringparam = set_sge_param,
+	.get_coalesce = get_coalesce,
+	.set_coalesce = set_coalesce,
+	.get_eeprom_len = get_eeprom_len,
+	.get_eeprom = get_eeprom,
+	.set_eeprom = set_eeprom,
+	.get_pauseparam = get_pauseparam,
+	.set_pauseparam = set_pauseparam,
+	.get_link = ethtool_op_get_link,
+	.get_strings = get_strings,
+	.set_phys_id = set_phys_id,
+	.nway_reset = restart_autoneg,
+	.get_sset_count = get_sset_count,
+	.get_ethtool_stats = get_stats,
+	.get_regs_len = get_regs_len,
+	.get_regs = get_regs,
+	.get_wol = get_wol,
+};
+
+static int in_range(int val, int lo, int hi)
+{
+	return val < 0 || (val <= hi && val >= lo);
+}
+
+static int cxgb_extension_ioctl(struct net_device *dev, void __user *useraddr)
+{
+	struct port_info *pi = netdev_priv(dev);
+	struct adapter *adapter = pi->adapter;
+	u32 cmd;
+	int ret;
+
+	if (copy_from_user(&cmd, useraddr, sizeof(cmd)))
+		return -EFAULT;
+
+	switch (cmd) {
+	case CHELSIO_SET_QSET_PARAMS:{
+		int i;
+		struct qset_params *q;
+		struct ch_qset_params t;
+		int q1 = pi->first_qset;
+		int nqsets = pi->nqsets;
+
+		if (!capable(CAP_NET_ADMIN))
+			return -EPERM;
+		if (copy_from_user(&t, useraddr, sizeof(t)))
+			return -EFAULT;
+		if (t.qset_idx >= SGE_QSETS)
+			return -EINVAL;
+		if (!in_range(t.intr_lat, 0, M_NEWTIMER) ||
+		    !in_range(t.cong_thres, 0, 255) ||
+		    !in_range(t.txq_size[0], MIN_TXQ_ENTRIES,
+			      MAX_TXQ_ENTRIES) ||
+		    !in_range(t.txq_size[1], MIN_TXQ_ENTRIES,
+			      MAX_TXQ_ENTRIES) ||
+		    !in_range(t.txq_size[2], MIN_CTRL_TXQ_ENTRIES,
+			      MAX_CTRL_TXQ_ENTRIES) ||
+		    !in_range(t.fl_size[0], MIN_FL_ENTRIES,
+			      MAX_RX_BUFFERS) ||
+		    !in_range(t.fl_size[1], MIN_FL_ENTRIES,
+			      MAX_RX_JUMBO_BUFFERS) ||
+		    !in_range(t.rspq_size, MIN_RSPQ_ENTRIES,
+			      MAX_RSPQ_ENTRIES))
+			return -EINVAL;
+
+		if ((adapter->flags & FULL_INIT_DONE) &&
+			(t.rspq_size >= 0 || t.fl_size[0] >= 0 ||
+			t.fl_size[1] >= 0 || t.txq_size[0] >= 0 ||
+			t.txq_size[1] >= 0 || t.txq_size[2] >= 0 ||
+			t.polling >= 0 || t.cong_thres >= 0))
+			return -EBUSY;
+
+		/* Allow setting of any available qset when offload enabled */
+		if (test_bit(OFFLOAD_DEVMAP_BIT, &adapter->open_device_map)) {
+			q1 = 0;
+			for_each_port(adapter, i) {
+				pi = adap2pinfo(adapter, i);
+				nqsets += pi->first_qset + pi->nqsets;
+			}
+		}
+
+		if (t.qset_idx < q1)
+			return -EINVAL;
+		if (t.qset_idx > q1 + nqsets - 1)
+			return -EINVAL;
+
+		q = &adapter->params.sge.qset[t.qset_idx];
+
+		if (t.rspq_size >= 0)
+			q->rspq_size = t.rspq_size;
+		if (t.fl_size[0] >= 0)
+			q->fl_size = t.fl_size[0];
+		if (t.fl_size[1] >= 0)
+			q->jumbo_size = t.fl_size[1];
+		if (t.txq_size[0] >= 0)
+			q->txq_size[0] = t.txq_size[0];
+		if (t.txq_size[1] >= 0)
+			q->txq_size[1] = t.txq_size[1];
+		if (t.txq_size[2] >= 0)
+			q->txq_size[2] = t.txq_size[2];
+		if (t.cong_thres >= 0)
+			q->cong_thres = t.cong_thres;
+		if (t.intr_lat >= 0) {
+			struct sge_qset *qs =
+				&adapter->sge.qs[t.qset_idx];
+
+			q->coalesce_usecs = t.intr_lat;
+			t3_update_qset_coalesce(qs, q);
+		}
+		if (t.polling >= 0) {
+			if (adapter->flags & USING_MSIX)
+				q->polling = t.polling;
+			else {
+				/* No polling with INTx for T3A */
+				if (adapter->params.rev == 0 &&
+					!(adapter->flags & USING_MSI))
+					t.polling = 0;
+
+				for (i = 0; i < SGE_QSETS; i++) {
+					q = &adapter->params.sge.
+						qset[i];
+					q->polling = t.polling;
+				}
+			}
+		}
+
+		if (t.lro >= 0) {
+			if (t.lro)
+				dev->wanted_features |= NETIF_F_GRO;
+			else
+				dev->wanted_features &= ~NETIF_F_GRO;
+			netdev_update_features(dev);
+		}
+
+		break;
+	}
+	case CHELSIO_GET_QSET_PARAMS:{
+		struct qset_params *q;
+		struct ch_qset_params t;
+		int q1 = pi->first_qset;
+		int nqsets = pi->nqsets;
+		int i;
+
+		if (copy_from_user(&t, useraddr, sizeof(t)))
+			return -EFAULT;
+
+		/* Display qsets for all ports when offload enabled */
+		if (test_bit(OFFLOAD_DEVMAP_BIT, &adapter->open_device_map)) {
+			q1 = 0;
+			for_each_port(adapter, i) {
+				pi = adap2pinfo(adapter, i);
+				nqsets = pi->first_qset + pi->nqsets;
+			}
+		}
+
+		if (t.qset_idx >= nqsets)
+			return -EINVAL;
+
+		q = &adapter->params.sge.qset[q1 + t.qset_idx];
+		t.rspq_size = q->rspq_size;
+		t.txq_size[0] = q->txq_size[0];
+		t.txq_size[1] = q->txq_size[1];
+		t.txq_size[2] = q->txq_size[2];
+		t.fl_size[0] = q->fl_size;
+		t.fl_size[1] = q->jumbo_size;
+		t.polling = q->polling;
+		t.lro = !!(dev->features & NETIF_F_GRO);
+		t.intr_lat = q->coalesce_usecs;
+		t.cong_thres = q->cong_thres;
+		t.qnum = q1;
+
+		if (adapter->flags & USING_MSIX)
+			t.vector = adapter->msix_info[q1 + t.qset_idx + 1].vec;
+		else
+			t.vector = adapter->pdev->irq;
+
+		if (copy_to_user(useraddr, &t, sizeof(t)))
+			return -EFAULT;
+		break;
+	}
+	case CHELSIO_SET_QSET_NUM:{
+		struct ch_reg edata;
+		unsigned int i, first_qset = 0, other_qsets = 0;
+
+		if (!capable(CAP_NET_ADMIN))
+			return -EPERM;
+		if (adapter->flags & FULL_INIT_DONE)
+			return -EBUSY;
+		if (copy_from_user(&edata, useraddr, sizeof(edata)))
+			return -EFAULT;
+		if (edata.val < 1 ||
+			(edata.val > 1 && !(adapter->flags & USING_MSIX)))
+			return -EINVAL;
+
+		for_each_port(adapter, i)
+			if (adapter->port[i] && adapter->port[i] != dev)
+				other_qsets += adap2pinfo(adapter, i)->nqsets;
+
+		if (edata.val + other_qsets > SGE_QSETS)
+			return -EINVAL;
+
+		pi->nqsets = edata.val;
+
+		for_each_port(adapter, i)
+			if (adapter->port[i]) {
+				pi = adap2pinfo(adapter, i);
+				pi->first_qset = first_qset;
+				first_qset += pi->nqsets;
+			}
+		break;
+	}
+	case CHELSIO_GET_QSET_NUM:{
+		struct ch_reg edata;
+
+		memset(&edata, 0, sizeof(struct ch_reg));
+
+		edata.cmd = CHELSIO_GET_QSET_NUM;
+		edata.val = pi->nqsets;
+		if (copy_to_user(useraddr, &edata, sizeof(edata)))
+			return -EFAULT;
+		break;
+	}
+	case CHELSIO_LOAD_FW:{
+		u8 *fw_data;
+		struct ch_mem_range t;
+
+		if (!capable(CAP_SYS_RAWIO))
+			return -EPERM;
+		if (copy_from_user(&t, useraddr, sizeof(t)))
+			return -EFAULT;
+		/* Check t.len sanity ? */
+		fw_data = memdup_user(useraddr + sizeof(t), t.len);
+		if (IS_ERR(fw_data))
+			return PTR_ERR(fw_data);
+
+		ret = t3_load_fw(adapter, fw_data, t.len);
+		kfree(fw_data);
+		if (ret)
+			return ret;
+		break;
+	}
+	case CHELSIO_SETMTUTAB:{
+		struct ch_mtus m;
+		int i;
+
+		if (!is_offload(adapter))
+			return -EOPNOTSUPP;
+		if (!capable(CAP_NET_ADMIN))
+			return -EPERM;
+		if (offload_running(adapter))
+			return -EBUSY;
+		if (copy_from_user(&m, useraddr, sizeof(m)))
+			return -EFAULT;
+		if (m.nmtus != NMTUS)
+			return -EINVAL;
+		if (m.mtus[0] < 81)	/* accommodate SACK */
+			return -EINVAL;
+
+		/* MTUs must be in ascending order */
+		for (i = 1; i < NMTUS; ++i)
+			if (m.mtus[i] < m.mtus[i - 1])
+				return -EINVAL;
+
+		memcpy(adapter->params.mtus, m.mtus,
+			sizeof(adapter->params.mtus));
+		break;
+	}
+	case CHELSIO_GET_PM:{
+		struct tp_params *p = &adapter->params.tp;
+		struct ch_pm m = {.cmd = CHELSIO_GET_PM };
+
+		if (!is_offload(adapter))
+			return -EOPNOTSUPP;
+		m.tx_pg_sz = p->tx_pg_size;
+		m.tx_num_pg = p->tx_num_pgs;
+		m.rx_pg_sz = p->rx_pg_size;
+		m.rx_num_pg = p->rx_num_pgs;
+		m.pm_total = p->pmtx_size + p->chan_rx_size * p->nchan;
+		if (copy_to_user(useraddr, &m, sizeof(m)))
+			return -EFAULT;
+		break;
+	}
+	case CHELSIO_SET_PM:{
+		struct ch_pm m;
+		struct tp_params *p = &adapter->params.tp;
+
+		if (!is_offload(adapter))
+			return -EOPNOTSUPP;
+		if (!capable(CAP_NET_ADMIN))
+			return -EPERM;
+		if (adapter->flags & FULL_INIT_DONE)
+			return -EBUSY;
+		if (copy_from_user(&m, useraddr, sizeof(m)))
+			return -EFAULT;
+		if (!is_power_of_2(m.rx_pg_sz) ||
+			!is_power_of_2(m.tx_pg_sz))
+			return -EINVAL;	/* not power of 2 */
+		if (!(m.rx_pg_sz & 0x14000))
+			return -EINVAL;	/* not 16KB or 64KB */
+		if (!(m.tx_pg_sz & 0x1554000))
+			return -EINVAL;
+		if (m.tx_num_pg == -1)
+			m.tx_num_pg = p->tx_num_pgs;
+		if (m.rx_num_pg == -1)
+			m.rx_num_pg = p->rx_num_pgs;
+		if (m.tx_num_pg % 24 || m.rx_num_pg % 24)
+			return -EINVAL;
+		if (m.rx_num_pg * m.rx_pg_sz > p->chan_rx_size ||
+			m.tx_num_pg * m.tx_pg_sz > p->chan_tx_size)
+			return -EINVAL;
+		p->rx_pg_size = m.rx_pg_sz;
+		p->tx_pg_size = m.tx_pg_sz;
+		p->rx_num_pgs = m.rx_num_pg;
+		p->tx_num_pgs = m.tx_num_pg;
+		break;
+	}
+	case CHELSIO_GET_MEM:{
+		struct ch_mem_range t;
+		struct mc7 *mem;
+		u64 buf[32];
+
+		if (!is_offload(adapter))
+			return -EOPNOTSUPP;
+		if (!(adapter->flags & FULL_INIT_DONE))
+			return -EIO;	/* need the memory controllers */
+		if (copy_from_user(&t, useraddr, sizeof(t)))
+			return -EFAULT;
+		if ((t.addr & 7) || (t.len & 7))
+			return -EINVAL;
+		if (t.mem_id == MEM_CM)
+			mem = &adapter->cm;
+		else if (t.mem_id == MEM_PMRX)
+			mem = &adapter->pmrx;
+		else if (t.mem_id == MEM_PMTX)
+			mem = &adapter->pmtx;
+		else
+			return -EINVAL;
+
+		/*
+		 * Version scheme:
+		 * bits 0..9: chip version
+		 * bits 10..15: chip revision
+		 */
+		t.version = 3 | (adapter->params.rev << 10);
+		if (copy_to_user(useraddr, &t, sizeof(t)))
+			return -EFAULT;
+
+		/*
+		 * Read 256 bytes at a time as len can be large and we don't
+		 * want to use huge intermediate buffers.
+		 */
+		useraddr += sizeof(t);	/* advance to start of buffer */
+		while (t.len) {
+			unsigned int chunk =
+				min_t(unsigned int, t.len, sizeof(buf));
+
+			ret =
+				t3_mc7_bd_read(mem, t.addr / 8, chunk / 8,
+						buf);
+			if (ret)
+				return ret;
+			if (copy_to_user(useraddr, buf, chunk))
+				return -EFAULT;
+			useraddr += chunk;
+			t.addr += chunk;
+			t.len -= chunk;
+		}
+		break;
+	}
+	case CHELSIO_SET_TRACE_FILTER:{
+		struct ch_trace t;
+		const struct trace_params *tp;
+
+		if (!capable(CAP_NET_ADMIN))
+			return -EPERM;
+		if (!offload_running(adapter))
+			return -EAGAIN;
+		if (copy_from_user(&t, useraddr, sizeof(t)))
+			return -EFAULT;
+
+		tp = (const struct trace_params *)&t.sip;
+		if (t.config_tx)
+			t3_config_trace_filter(adapter, tp, 0,
+						t.invert_match,
+						t.trace_tx);
+		if (t.config_rx)
+			t3_config_trace_filter(adapter, tp, 1,
+						t.invert_match,
+						t.trace_rx);
+		break;
+	}
+	default:
+		return -EOPNOTSUPP;
+	}
+	return 0;
+}
+
+static int cxgb_ioctl(struct net_device *dev, struct ifreq *req, int cmd)
+{
+	struct mii_ioctl_data *data = if_mii(req);
+	struct port_info *pi = netdev_priv(dev);
+	struct adapter *adapter = pi->adapter;
+
+	switch (cmd) {
+	case SIOCGMIIREG:
+	case SIOCSMIIREG:
+		/* Convert phy_id from older PRTAD/DEVAD format */
+		if (is_10G(adapter) &&
+		    !mdio_phy_id_is_c45(data->phy_id) &&
+		    (data->phy_id & 0x1f00) &&
+		    !(data->phy_id & 0xe0e0))
+			data->phy_id = mdio_phy_id_c45(data->phy_id >> 8,
+						       data->phy_id & 0x1f);
+		/* FALLTHRU */
+	case SIOCGMIIPHY:
+		return mdio_mii_ioctl(&pi->phy.mdio, data, cmd);
+	case SIOCCHIOCTL:
+		return cxgb_extension_ioctl(dev, req->ifr_data);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static int cxgb_change_mtu(struct net_device *dev, int new_mtu)
+{
+	struct port_info *pi = netdev_priv(dev);
+	struct adapter *adapter = pi->adapter;
+	int ret;
+
+	if (new_mtu < 81)	/* accommodate SACK */
+		return -EINVAL;
+	if ((ret = t3_mac_set_mtu(&pi->mac, new_mtu)))
+		return ret;
+	dev->mtu = new_mtu;
+	init_port_mtus(adapter);
+	if (adapter->params.rev == 0 && offload_running(adapter))
+		t3_load_mtus(adapter, adapter->params.mtus,
+			     adapter->params.a_wnd, adapter->params.b_wnd,
+			     adapter->port[0]->mtu);
+	return 0;
+}
+
+static int cxgb_set_mac_addr(struct net_device *dev, void *p)
+{
+	struct port_info *pi = netdev_priv(dev);
+	struct adapter *adapter = pi->adapter;
+	struct sockaddr *addr = p;
+
+	if (!is_valid_ether_addr(addr->sa_data))
+		return -EADDRNOTAVAIL;
+
+	memcpy(dev->dev_addr, addr->sa_data, dev->addr_len);
+	t3_mac_set_address(&pi->mac, LAN_MAC_IDX, dev->dev_addr);
+	if (offload_running(adapter))
+		write_smt_entry(adapter, pi->port_id);
+	return 0;
+}
+
+static netdev_features_t cxgb_fix_features(struct net_device *dev,
+	netdev_features_t features)
+{
+	/*
+	 * Since there is no support for separate rx/tx vlan accel
+	 * enable/disable make sure tx flag is always in same state as rx.
+	 */
+	if (features & NETIF_F_HW_VLAN_CTAG_RX)
+		features |= NETIF_F_HW_VLAN_CTAG_TX;
+	else
+		features &= ~NETIF_F_HW_VLAN_CTAG_TX;
+
+	return features;
+}
+
+static int cxgb_set_features(struct net_device *dev, netdev_features_t features)
+{
+	netdev_features_t changed = dev->features ^ features;
+
+	if (changed & NETIF_F_HW_VLAN_CTAG_RX)
+		cxgb_vlan_mode(dev, features);
+
+	return 0;
+}
+
+#ifdef CONFIG_NET_POLL_CONTROLLER
+static void cxgb_netpoll(struct net_device *dev)
+{
+	struct port_info *pi = netdev_priv(dev);
+	struct adapter *adapter = pi->adapter;
+	int qidx;
+
+	for (qidx = pi->first_qset; qidx < pi->first_qset + pi->nqsets; qidx++) {
+		struct sge_qset *qs = &adapter->sge.qs[qidx];
+		void *source;
+
+		if (adapter->flags & USING_MSIX)
+			source = qs;
+		else
+			source = adapter;
+
+		t3_intr_handler(adapter, qs->rspq.polling) (0, source);
+	}
+}
+#endif
+
+/*
+ * Periodic accumulation of MAC statistics.
+ */
+static void mac_stats_update(struct adapter *adapter)
+{
+	int i;
+
+	for_each_port(adapter, i) {
+		struct net_device *dev = adapter->port[i];
+		struct port_info *p = netdev_priv(dev);
+
+		if (netif_running(dev)) {
+			spin_lock(&adapter->stats_lock);
+			t3_mac_update_stats(&p->mac);
+			spin_unlock(&adapter->stats_lock);
+		}
+	}
+}
+
+static void check_link_status(struct adapter *adapter)
+{
+	int i;
+
+	for_each_port(adapter, i) {
+		struct net_device *dev = adapter->port[i];
+		struct port_info *p = netdev_priv(dev);
+		int link_fault;
+
+		spin_lock_irq(&adapter->work_lock);
+		link_fault = p->link_fault;
+		spin_unlock_irq(&adapter->work_lock);
+
+		if (link_fault) {
+			t3_link_fault(adapter, i);
+			continue;
+		}
+
+		if (!(p->phy.caps & SUPPORTED_IRQ) && netif_running(dev)) {
+			t3_xgm_intr_disable(adapter, i);
+			t3_read_reg(adapter, A_XGM_INT_STATUS + p->mac.offset);
+
+			t3_link_changed(adapter, i);
+			t3_xgm_intr_enable(adapter, i);
+		}
+	}
+}
+
+static void check_t3b2_mac(struct adapter *adapter)
+{
+	int i;
+
+	if (!rtnl_trylock())	/* synchronize with ifdown */
+		return;
+
+	for_each_port(adapter, i) {
+		struct net_device *dev = adapter->port[i];
+		struct port_info *p = netdev_priv(dev);
+		int status;
+
+		if (!netif_running(dev))
+			continue;
+
+		status = 0;
+		if (netif_running(dev) && netif_carrier_ok(dev))
+			status = t3b2_mac_watchdog_task(&p->mac);
+		if (status == 1)
+			p->mac.stats.num_toggled++;
+		else if (status == 2) {
+			struct cmac *mac = &p->mac;
+
+			t3_mac_set_mtu(mac, dev->mtu);
+			t3_mac_set_address(mac, LAN_MAC_IDX, dev->dev_addr);
+			cxgb_set_rxmode(dev);
+			t3_link_start(&p->phy, mac, &p->link_config);
+			t3_mac_enable(mac, MAC_DIRECTION_RX | MAC_DIRECTION_TX);
+			t3_port_intr_enable(adapter, p->port_id);
+			p->mac.stats.num_resets++;
+		}
+	}
+	rtnl_unlock();
+}
+
+
+static void t3_adap_check_task(struct work_struct *work)
+{
+	struct adapter *adapter = container_of(work, struct adapter,
+					       adap_check_task.work);
+	const struct adapter_params *p = &adapter->params;
+	int port;
+	unsigned int v, status, reset;
+
+	adapter->check_task_cnt++;
+
+	check_link_status(adapter);
+
+	/* Accumulate MAC stats if needed */
+	if (!p->linkpoll_period ||
+	    (adapter->check_task_cnt * p->linkpoll_period) / 10 >=
+	    p->stats_update_period) {
+		mac_stats_update(adapter);
+		adapter->check_task_cnt = 0;
+	}
+
+	if (p->rev == T3_REV_B2)
+		check_t3b2_mac(adapter);
+
+	/*
+	 * Scan the XGMAC's to check for various conditions which we want to
+	 * monitor in a periodic polling manner rather than via an interrupt
+	 * condition.  This is used for conditions which would otherwise flood
+	 * the system with interrupts and we only really need to know that the
+	 * conditions are "happening" ...  For each condition we count the
+	 * detection of the condition and reset it for the next polling loop.
+	 */
+	for_each_port(adapter, port) {
+		struct cmac *mac =  &adap2pinfo(adapter, port)->mac;
+		u32 cause;
+
+		cause = t3_read_reg(adapter, A_XGM_INT_CAUSE + mac->offset);
+		reset = 0;
+		if (cause & F_RXFIFO_OVERFLOW) {
+			mac->stats.rx_fifo_ovfl++;
+			reset |= F_RXFIFO_OVERFLOW;
+		}
+
+		t3_write_reg(adapter, A_XGM_INT_CAUSE + mac->offset, reset);
+	}
+
+	/*
+	 * We do the same as above for FL_EMPTY interrupts.
+	 */
+	status = t3_read_reg(adapter, A_SG_INT_CAUSE);
+	reset = 0;
+
+	if (status & F_FLEMPTY) {
+		struct sge_qset *qs = &adapter->sge.qs[0];
+		int i = 0;
+
+		reset |= F_FLEMPTY;
+
+		v = (t3_read_reg(adapter, A_SG_RSPQ_FL_STATUS) >> S_FL0EMPTY) &
+		    0xffff;
+
+		while (v) {
+			qs->fl[i].empty += (v & 1);
+			if (i)
+				qs++;
+			i ^= 1;
+			v >>= 1;
+		}
+	}
+
+	t3_write_reg(adapter, A_SG_INT_CAUSE, reset);
+
+	/* Schedule the next check update if any port is active. */
+	spin_lock_irq(&adapter->work_lock);
+	if (adapter->open_device_map & PORT_MASK)
+		schedule_chk_task(adapter);
+	spin_unlock_irq(&adapter->work_lock);
+}
+
+static void db_full_task(struct work_struct *work)
+{
+	struct adapter *adapter = container_of(work, struct adapter,
+					       db_full_task);
+
+	cxgb3_event_notify(&adapter->tdev, OFFLOAD_DB_FULL, 0);
+}
+
+static void db_empty_task(struct work_struct *work)
+{
+	struct adapter *adapter = container_of(work, struct adapter,
+					       db_empty_task);
+
+	cxgb3_event_notify(&adapter->tdev, OFFLOAD_DB_EMPTY, 0);
+}
+
+static void db_drop_task(struct work_struct *work)
+{
+	struct adapter *adapter = container_of(work, struct adapter,
+					       db_drop_task);
+	unsigned long delay = 1000;
+	unsigned short r;
+
+	cxgb3_event_notify(&adapter->tdev, OFFLOAD_DB_DROP, 0);
+
+	/*
+	 * Sleep a while before ringing the driver qset dbs.
+	 * The delay is between 1000-2023 usecs.
+	 */
+	get_random_bytes(&r, 2);
+	delay += r & 1023;
+	set_current_state(TASK_UNINTERRUPTIBLE);
+	schedule_timeout(usecs_to_jiffies(delay));
+	ring_dbs(adapter);
+}
+
+/*
+ * Processes external (PHY) interrupts in process context.
+ */
+static void ext_intr_task(struct work_struct *work)
+{
+	struct adapter *adapter = container_of(work, struct adapter,
+					       ext_intr_handler_task);
+	int i;
+
+	/* Disable link fault interrupts */
+	for_each_port(adapter, i) {
+		struct net_device *dev = adapter->port[i];
+		struct port_info *p = netdev_priv(dev);
+
+		t3_xgm_intr_disable(adapter, i);
+		t3_read_reg(adapter, A_XGM_INT_STATUS + p->mac.offset);
+	}
+
+	/* Re-enable link fault interrupts */
+	t3_phy_intr_handler(adapter);
+
+	for_each_port(adapter, i)
+		t3_xgm_intr_enable(adapter, i);
+
+	/* Now reenable external interrupts */
+	spin_lock_irq(&adapter->work_lock);
+	if (adapter->slow_intr_mask) {
+		adapter->slow_intr_mask |= F_T3DBG;
+		t3_write_reg(adapter, A_PL_INT_CAUSE0, F_T3DBG);
+		t3_write_reg(adapter, A_PL_INT_ENABLE0,
+			     adapter->slow_intr_mask);
+	}
+	spin_unlock_irq(&adapter->work_lock);
+}
+
+/*
+ * Interrupt-context handler for external (PHY) interrupts.
+ */
+void t3_os_ext_intr_handler(struct adapter *adapter)
+{
+	/*
+	 * Schedule a task to handle external interrupts as they may be slow
+	 * and we use a mutex to protect MDIO registers.  We disable PHY
+	 * interrupts in the meantime and let the task reenable them when
+	 * it's done.
+	 */
+	spin_lock(&adapter->work_lock);
+	if (adapter->slow_intr_mask) {
+		adapter->slow_intr_mask &= ~F_T3DBG;
+		t3_write_reg(adapter, A_PL_INT_ENABLE0,
+			     adapter->slow_intr_mask);
+		queue_work(cxgb3_wq, &adapter->ext_intr_handler_task);
+	}
+	spin_unlock(&adapter->work_lock);
+}
+
+void t3_os_link_fault_handler(struct adapter *adapter, int port_id)
+{
+	struct net_device *netdev = adapter->port[port_id];
+	struct port_info *pi = netdev_priv(netdev);
+
+	spin_lock(&adapter->work_lock);
+	pi->link_fault = 1;
+	spin_unlock(&adapter->work_lock);
+}
+
+static int t3_adapter_error(struct adapter *adapter, int reset, int on_wq)
+{
+	int i, ret = 0;
+
+	if (is_offload(adapter) &&
+	    test_bit(OFFLOAD_DEVMAP_BIT, &adapter->open_device_map)) {
+		cxgb3_event_notify(&adapter->tdev, OFFLOAD_STATUS_DOWN, 0);
+		offload_close(&adapter->tdev);
+	}
+
+	/* Stop all ports */
+	for_each_port(adapter, i) {
+		struct net_device *netdev = adapter->port[i];
+
+		if (netif_running(netdev))
+			__cxgb_close(netdev, on_wq);
+	}
+
+	/* Stop SGE timers */
+	t3_stop_sge_timers(adapter);
+
+	adapter->flags &= ~FULL_INIT_DONE;
+
+	if (reset)
+		ret = t3_reset_adapter(adapter);
+
+	pci_disable_device(adapter->pdev);
+
+	return ret;
+}
+
+static int t3_reenable_adapter(struct adapter *adapter)
+{
+	if (pci_enable_device(adapter->pdev)) {
+		dev_err(&adapter->pdev->dev,
+			"Cannot re-enable PCI device after reset.\n");
+		goto err;
+	}
+	pci_set_master(adapter->pdev);
+	pci_restore_state(adapter->pdev);
+	pci_save_state(adapter->pdev);
+
+	/* Free sge resources */
+	t3_free_sge_resources(adapter);
+
+	if (t3_replay_prep_adapter(adapter))
+		goto err;
+
+	return 0;
+err:
+	return -1;
+}
+
+static void t3_resume_ports(struct adapter *adapter)
+{
+	int i;
+
+	/* Restart the ports */
+	for_each_port(adapter, i) {
+		struct net_device *netdev = adapter->port[i];
+
+		if (netif_running(netdev)) {
+			if (cxgb_open(netdev)) {
+				dev_err(&adapter->pdev->dev,
+					"can't bring device back up"
+					" after reset\n");
+				continue;
+			}
+		}
+	}
+
+	if (is_offload(adapter) && !ofld_disable)
+		cxgb3_event_notify(&adapter->tdev, OFFLOAD_STATUS_UP, 0);
+}
+
+/*
+ * processes a fatal error.
+ * Bring the ports down, reset the chip, bring the ports back up.
+ */
+static void fatal_error_task(struct work_struct *work)
+{
+	struct adapter *adapter = container_of(work, struct adapter,
+					       fatal_error_handler_task);
+	int err = 0;
+
+	rtnl_lock();
+	err = t3_adapter_error(adapter, 1, 1);
+	if (!err)
+		err = t3_reenable_adapter(adapter);
+	if (!err)
+		t3_resume_ports(adapter);
+
+	CH_ALERT(adapter, "adapter reset %s\n", err ? "failed" : "succeeded");
+	rtnl_unlock();
+}
+
+void t3_fatal_err(struct adapter *adapter)
+{
+	unsigned int fw_status[4];
+
+	if (adapter->flags & FULL_INIT_DONE) {
+		t3_sge_stop(adapter);
+		t3_write_reg(adapter, A_XGM_TX_CTRL, 0);
+		t3_write_reg(adapter, A_XGM_RX_CTRL, 0);
+		t3_write_reg(adapter, XGM_REG(A_XGM_TX_CTRL, 1), 0);
+		t3_write_reg(adapter, XGM_REG(A_XGM_RX_CTRL, 1), 0);
+
+		spin_lock(&adapter->work_lock);
+		t3_intr_disable(adapter);
+		queue_work(cxgb3_wq, &adapter->fatal_error_handler_task);
+		spin_unlock(&adapter->work_lock);
+	}
+	CH_ALERT(adapter, "encountered fatal error, operation suspended\n");
+	if (!t3_cim_ctl_blk_read(adapter, 0xa0, 4, fw_status))
+		CH_ALERT(adapter, "FW status: 0x%x, 0x%x, 0x%x, 0x%x\n",
+			 fw_status[0], fw_status[1],
+			 fw_status[2], fw_status[3]);
+}
+
+/**
+ * t3_io_error_detected - called when PCI error is detected
+ * @pdev: Pointer to PCI device
+ * @state: The current pci connection state
+ *
+ * This function is called after a PCI bus error affecting
+ * this device has been detected.
+ */
+static pci_ers_result_t t3_io_error_detected(struct pci_dev *pdev,
+					     pci_channel_state_t state)
+{
+	struct adapter *adapter = pci_get_drvdata(pdev);
+
+	if (state == pci_channel_io_perm_failure)
+		return PCI_ERS_RESULT_DISCONNECT;
+
+	t3_adapter_error(adapter, 0, 0);
+
+	/* Request a slot reset. */
+	return PCI_ERS_RESULT_NEED_RESET;
+}
+
+/**
+ * t3_io_slot_reset - called after the pci bus has been reset.
+ * @pdev: Pointer to PCI device
+ *
+ * Restart the card from scratch, as if from a cold-boot.
+ */
+static pci_ers_result_t t3_io_slot_reset(struct pci_dev *pdev)
+{
+	struct adapter *adapter = pci_get_drvdata(pdev);
+
+	if (!t3_reenable_adapter(adapter))
+		return PCI_ERS_RESULT_RECOVERED;
+
+	return PCI_ERS_RESULT_DISCONNECT;
+}
+
+/**
+ * t3_io_resume - called when traffic can start flowing again.
+ * @pdev: Pointer to PCI device
+ *
+ * This callback is called when the error recovery driver tells us that
+ * its OK to resume normal operation.
+ */
+static void t3_io_resume(struct pci_dev *pdev)
+{
+	struct adapter *adapter = pci_get_drvdata(pdev);
+
+	CH_ALERT(adapter, "adapter recovering, PEX ERR 0x%x\n",
+		 t3_read_reg(adapter, A_PCIE_PEX_ERR));
+
+	rtnl_lock();
+	t3_resume_ports(adapter);
+	rtnl_unlock();
+}
+
+static const struct pci_error_handlers t3_err_handler = {
+	.error_detected = t3_io_error_detected,
+	.slot_reset = t3_io_slot_reset,
+	.resume = t3_io_resume,
+};
+
+/*
+ * Set the number of qsets based on the number of CPUs and the number of ports,
+ * not to exceed the number of available qsets, assuming there are enough qsets
+ * per port in HW.
+ */
+static void set_nqsets(struct adapter *adap)
+{
+	int i, j = 0;
+	int num_cpus = netif_get_num_default_rss_queues();
+	int hwports = adap->params.nports;
+	int nqsets = adap->msix_nvectors - 1;
+
+	if (adap->params.rev > 0 && adap->flags & USING_MSIX) {
+		if (hwports == 2 &&
+		    (hwports * nqsets > SGE_QSETS ||
+		     num_cpus >= nqsets / hwports))
+			nqsets /= hwports;
+		if (nqsets > num_cpus)
+			nqsets = num_cpus;
+		if (nqsets < 1 || hwports == 4)
+			nqsets = 1;
+	} else
+		nqsets = 1;
+
+	for_each_port(adap, i) {
+		struct port_info *pi = adap2pinfo(adap, i);
+
+		pi->first_qset = j;
+		pi->nqsets = nqsets;
+		j = pi->first_qset + nqsets;
+
+		dev_info(&adap->pdev->dev,
+			 "Port %d using %d queue sets.\n", i, nqsets);
+	}
+}
+
+static int cxgb_enable_msix(struct adapter *adap)
+{
+	struct msix_entry entries[SGE_QSETS + 1];
+	int vectors;
+	int i;
+
+	vectors = ARRAY_SIZE(entries);
+	for (i = 0; i < vectors; ++i)
+		entries[i].entry = i;
+
+	vectors = pci_enable_msix_range(adap->pdev, entries,
+					adap->params.nports + 1, vectors);
+	if (vectors < 0)
+		return vectors;
+
+	for (i = 0; i < vectors; ++i)
+		adap->msix_info[i].vec = entries[i].vector;
+	adap->msix_nvectors = vectors;
+
+	return 0;
+}
+
+static void print_port_info(struct adapter *adap, const struct adapter_info *ai)
+{
+	static const char *pci_variant[] = {
+		"PCI", "PCI-X", "PCI-X ECC", "PCI-X 266", "PCI Express"
+	};
+
+	int i;
+	char buf[80];
+
+	if (is_pcie(adap))
+		snprintf(buf, sizeof(buf), "%s x%d",
+			 pci_variant[adap->params.pci.variant],
+			 adap->params.pci.width);
+	else
+		snprintf(buf, sizeof(buf), "%s %dMHz/%d-bit",
+			 pci_variant[adap->params.pci.variant],
+			 adap->params.pci.speed, adap->params.pci.width);
+
+	for_each_port(adap, i) {
+		struct net_device *dev = adap->port[i];
+		const struct port_info *pi = netdev_priv(dev);
+
+		if (!test_bit(i, &adap->registered_device_map))
+			continue;
+		netdev_info(dev, "%s %s %sNIC (rev %d) %s%s\n",
+			    ai->desc, pi->phy.desc,
+			    is_offload(adap) ? "R" : "", adap->params.rev, buf,
+			    (adap->flags & USING_MSIX) ? " MSI-X" :
+			    (adap->flags & USING_MSI) ? " MSI" : "");
+		if (adap->name == dev->name && adap->params.vpd.mclk)
+			pr_info("%s: %uMB CM, %uMB PMTX, %uMB PMRX, S/N: %s\n",
+			       adap->name, t3_mc7_size(&adap->cm) >> 20,
+			       t3_mc7_size(&adap->pmtx) >> 20,
+			       t3_mc7_size(&adap->pmrx) >> 20,
+			       adap->params.vpd.sn);
+	}
+}
+
+static const struct net_device_ops cxgb_netdev_ops = {
+	.ndo_open		= cxgb_open,
+	.ndo_stop		= cxgb_close,
+	.ndo_start_xmit		= t3_eth_xmit,
+	.ndo_get_stats		= cxgb_get_stats,
+	.ndo_validate_addr	= eth_validate_addr,
+	.ndo_set_rx_mode	= cxgb_set_rxmode,
+	.ndo_do_ioctl		= cxgb_ioctl,
+	.ndo_change_mtu		= cxgb_change_mtu,
+	.ndo_set_mac_address	= cxgb_set_mac_addr,
+	.ndo_fix_features	= cxgb_fix_features,
+	.ndo_set_features	= cxgb_set_features,
+#ifdef CONFIG_NET_POLL_CONTROLLER
+	.ndo_poll_controller	= cxgb_netpoll,
+#endif
+};
+
+static void cxgb3_init_iscsi_mac(struct net_device *dev)
+{
+	struct port_info *pi = netdev_priv(dev);
+
+	memcpy(pi->iscsic.mac_addr, dev->dev_addr, ETH_ALEN);
+	pi->iscsic.mac_addr[3] |= 0x80;
+}
+
+#define TSO_FLAGS (NETIF_F_TSO | NETIF_F_TSO6 | NETIF_F_TSO_ECN)
+#define VLAN_FEAT (NETIF_F_SG | NETIF_F_IP_CSUM | TSO_FLAGS | \
+			NETIF_F_IPV6_CSUM | NETIF_F_HIGHDMA)
+static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
+{
+	int i, err, pci_using_dac = 0;
+	resource_size_t mmio_start, mmio_len;
+	const struct adapter_info *ai;
+	struct adapter *adapter = NULL;
+	struct port_info *pi;
+
+	pr_info_once("%s - version %s\n", DRV_DESC, DRV_VERSION);
+
+	if (!cxgb3_wq) {
+		cxgb3_wq = create_singlethread_workqueue(DRV_NAME);
+		if (!cxgb3_wq) {
+			pr_err("cannot initialize work queue\n");
+			return -ENOMEM;
+		}
+	}
+
+	err = pci_enable_device(pdev);
+	if (err) {
+		dev_err(&pdev->dev, "cannot enable PCI device\n");
+		goto out;
+	}
+
+	err = pci_request_regions(pdev, DRV_NAME);
+	if (err) {
+		/* Just info, some other driver may have claimed the device. */
+		dev_info(&pdev->dev, "cannot obtain PCI resources\n");
+		goto out_disable_device;
+	}
+
+	if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(64))) {
+		pci_using_dac = 1;
+		err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
+		if (err) {
+			dev_err(&pdev->dev, "unable to obtain 64-bit DMA for "
+			       "coherent allocations\n");
+			goto out_release_regions;
+		}
+	} else if ((err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32))) != 0) {
+		dev_err(&pdev->dev, "no usable DMA configuration\n");
+		goto out_release_regions;
+	}
+
+	pci_set_master(pdev);
+	pci_save_state(pdev);
+
+	mmio_start = pci_resource_start(pdev, 0);
+	mmio_len = pci_resource_len(pdev, 0);
+	ai = t3_get_adapter_info(ent->driver_data);
+
+	adapter = kzalloc(sizeof(*adapter), GFP_KERNEL);
+	if (!adapter) {
+		err = -ENOMEM;
+		goto out_release_regions;
+	}
+
+	adapter->nofail_skb =
+		alloc_skb(sizeof(struct cpl_set_tcb_field), GFP_KERNEL);
+	if (!adapter->nofail_skb) {
+		dev_err(&pdev->dev, "cannot allocate nofail buffer\n");
+		err = -ENOMEM;
+		goto out_free_adapter;
+	}
+
+	adapter->regs = ioremap_nocache(mmio_start, mmio_len);
+	if (!adapter->regs) {
+		dev_err(&pdev->dev, "cannot map device registers\n");
+		err = -ENOMEM;
+		goto out_free_adapter;
+	}
+
+	adapter->pdev = pdev;
+	adapter->name = pci_name(pdev);
+	adapter->msg_enable = dflt_msg_enable;
+	adapter->mmio_len = mmio_len;
+
+	mutex_init(&adapter->mdio_lock);
+	spin_lock_init(&adapter->work_lock);
+	spin_lock_init(&adapter->stats_lock);
+
+	INIT_LIST_HEAD(&adapter->adapter_list);
+	INIT_WORK(&adapter->ext_intr_handler_task, ext_intr_task);
+	INIT_WORK(&adapter->fatal_error_handler_task, fatal_error_task);
+
+	INIT_WORK(&adapter->db_full_task, db_full_task);
+	INIT_WORK(&adapter->db_empty_task, db_empty_task);
+	INIT_WORK(&adapter->db_drop_task, db_drop_task);
+
+	INIT_DELAYED_WORK(&adapter->adap_check_task, t3_adap_check_task);
+
+	for (i = 0; i < ai->nports0 + ai->nports1; ++i) {
+		struct net_device *netdev;
+
+		netdev = alloc_etherdev_mq(sizeof(struct port_info), SGE_QSETS);
+		if (!netdev) {
+			err = -ENOMEM;
+			goto out_free_dev;
+		}
+
+		SET_NETDEV_DEV(netdev, &pdev->dev);
+
+		adapter->port[i] = netdev;
+		pi = netdev_priv(netdev);
+		pi->adapter = adapter;
+		pi->port_id = i;
+		netif_carrier_off(netdev);
+		netdev->irq = pdev->irq;
+		netdev->mem_start = mmio_start;
+		netdev->mem_end = mmio_start + mmio_len - 1;
+		netdev->hw_features = NETIF_F_SG | NETIF_F_IP_CSUM |
+			NETIF_F_TSO | NETIF_F_RXCSUM | NETIF_F_HW_VLAN_CTAG_RX;
+		netdev->features |= netdev->hw_features |
+				    NETIF_F_HW_VLAN_CTAG_TX;
+		netdev->vlan_features |= netdev->features & VLAN_FEAT;
+		if (pci_using_dac)
+			netdev->features |= NETIF_F_HIGHDMA;
+
+		netdev->netdev_ops = &cxgb_netdev_ops;
+		netdev->ethtool_ops = &cxgb_ethtool_ops;
+	}
+
+	pci_set_drvdata(pdev, adapter);
+	if (t3_prep_adapter(adapter, ai, 1) < 0) {
+		err = -ENODEV;
+		goto out_free_dev;
+	}
+
+	/*
+	 * The card is now ready to go.  If any errors occur during device
+	 * registration we do not fail the whole card but rather proceed only
+	 * with the ports we manage to register successfully.  However we must
+	 * register at least one net device.
+	 */
+	for_each_port(adapter, i) {
+		err = register_netdev(adapter->port[i]);
+		if (err)
+			dev_warn(&pdev->dev,
+				 "cannot register net device %s, skipping\n",
+				 adapter->port[i]->name);
+		else {
+			/*
+			 * Change the name we use for messages to the name of
+			 * the first successfully registered interface.
+			 */
+			if (!adapter->registered_device_map)
+				adapter->name = adapter->port[i]->name;
+
+			__set_bit(i, &adapter->registered_device_map);
+		}
+	}
+	if (!adapter->registered_device_map) {
+		dev_err(&pdev->dev, "could not register any net devices\n");
+		goto out_free_dev;
+	}
+
+	for_each_port(adapter, i)
+		cxgb3_init_iscsi_mac(adapter->port[i]);
+
+	/* Driver's ready. Reflect it on LEDs */
+	t3_led_ready(adapter);
+
+	if (is_offload(adapter)) {
+		__set_bit(OFFLOAD_DEVMAP_BIT, &adapter->registered_device_map);
+		cxgb3_adapter_ofld(adapter);
+	}
+
+	/* See what interrupts we'll be using */
+	if (msi > 1 && cxgb_enable_msix(adapter) == 0)
+		adapter->flags |= USING_MSIX;
+	else if (msi > 0 && pci_enable_msi(pdev) == 0)
+		adapter->flags |= USING_MSI;
+
+	set_nqsets(adapter);
+
+	err = sysfs_create_group(&adapter->port[0]->dev.kobj,
+				 &cxgb3_attr_group);
+
+	print_port_info(adapter, ai);
+	return 0;
+
+out_free_dev:
+	iounmap(adapter->regs);
+	for (i = ai->nports0 + ai->nports1 - 1; i >= 0; --i)
+		if (adapter->port[i])
+			free_netdev(adapter->port[i]);
+
+out_free_adapter:
+	kfree(adapter);
+
+out_release_regions:
+	pci_release_regions(pdev);
+out_disable_device:
+	pci_disable_device(pdev);
+out:
+	return err;
+}
+
+static void remove_one(struct pci_dev *pdev)
+{
+	struct adapter *adapter = pci_get_drvdata(pdev);
+
+	if (adapter) {
+		int i;
+
+		t3_sge_stop(adapter);
+		sysfs_remove_group(&adapter->port[0]->dev.kobj,
+				   &cxgb3_attr_group);
+
+		if (is_offload(adapter)) {
+			cxgb3_adapter_unofld(adapter);
+			if (test_bit(OFFLOAD_DEVMAP_BIT,
+				     &adapter->open_device_map))
+				offload_close(&adapter->tdev);
+		}
+
+		for_each_port(adapter, i)
+		    if (test_bit(i, &adapter->registered_device_map))
+			unregister_netdev(adapter->port[i]);
+
+		t3_stop_sge_timers(adapter);
+		t3_free_sge_resources(adapter);
+		cxgb_disable_msi(adapter);
+
+		for_each_port(adapter, i)
+			if (adapter->port[i])
+				free_netdev(adapter->port[i]);
+
+		iounmap(adapter->regs);
+		if (adapter->nofail_skb)
+			kfree_skb(adapter->nofail_skb);
+		kfree(adapter);
+		pci_release_regions(pdev);
+		pci_disable_device(pdev);
+	}
+}
+
+static struct pci_driver driver = {
+	.name = DRV_NAME,
+	.id_table = cxgb3_pci_tbl,
+	.probe = init_one,
+	.remove = remove_one,
+	.err_handler = &t3_err_handler,
+};
+
+static int __init cxgb3_init_module(void)
+{
+	int ret;
+
+	cxgb3_offload_init();
+
+	ret = pci_register_driver(&driver);
+	return ret;
+}
+
+static void __exit cxgb3_cleanup_module(void)
+{
+	pci_unregister_driver(&driver);
+	if (cxgb3_wq)
+		destroy_workqueue(cxgb3_wq);
+}
+
+module_init(cxgb3_init_module);
+module_exit(cxgb3_cleanup_module);
diff --git a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_offload.c b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_offload.c
new file mode 100644
index 0000000..b0cbb2b
--- /dev/null
+++ b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_offload.c
@@ -0,0 +1,1427 @@
+/*
+ * Copyright (c) 2006-2008 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <net/neighbour.h>
+#include <linux/notifier.h>
+#include <linux/atomic.h>
+#include <linux/proc_fs.h>
+#include <linux/if_vlan.h>
+#include <net/netevent.h>
+#include <linux/highmem.h>
+#include <linux/vmalloc.h>
+#include <linux/export.h>
+
+#include "common.h"
+#include "regs.h"
+#include "cxgb3_ioctl.h"
+#include "cxgb3_ctl_defs.h"
+#include "cxgb3_defs.h"
+#include "l2t.h"
+#include "firmware_exports.h"
+#include "cxgb3_offload.h"
+
+static LIST_HEAD(client_list);
+static LIST_HEAD(ofld_dev_list);
+static DEFINE_MUTEX(cxgb3_db_lock);
+
+static DEFINE_RWLOCK(adapter_list_lock);
+static LIST_HEAD(adapter_list);
+
+static const unsigned int MAX_ATIDS = 64 * 1024;
+static const unsigned int ATID_BASE = 0x10000;
+
+static void cxgb_neigh_update(struct neighbour *neigh);
+static void cxgb_redirect(struct dst_entry *old, struct dst_entry *new,
+			  struct neighbour *neigh, const void *daddr);
+
+static inline int offload_activated(struct t3cdev *tdev)
+{
+	const struct adapter *adapter = tdev2adap(tdev);
+
+	return test_bit(OFFLOAD_DEVMAP_BIT, &adapter->open_device_map);
+}
+
+/**
+ *	cxgb3_register_client - register an offload client
+ *	@client: the client
+ *
+ *	Add the client to the client list,
+ *	and call backs the client for each activated offload device
+ */
+void cxgb3_register_client(struct cxgb3_client *client)
+{
+	struct t3cdev *tdev;
+
+	mutex_lock(&cxgb3_db_lock);
+	list_add_tail(&client->client_list, &client_list);
+
+	if (client->add) {
+		list_for_each_entry(tdev, &ofld_dev_list, ofld_dev_list) {
+			if (offload_activated(tdev))
+				client->add(tdev);
+		}
+	}
+	mutex_unlock(&cxgb3_db_lock);
+}
+
+EXPORT_SYMBOL(cxgb3_register_client);
+
+/**
+ *	cxgb3_unregister_client - unregister an offload client
+ *	@client: the client
+ *
+ *	Remove the client to the client list,
+ *	and call backs the client for each activated offload device.
+ */
+void cxgb3_unregister_client(struct cxgb3_client *client)
+{
+	struct t3cdev *tdev;
+
+	mutex_lock(&cxgb3_db_lock);
+	list_del(&client->client_list);
+
+	if (client->remove) {
+		list_for_each_entry(tdev, &ofld_dev_list, ofld_dev_list) {
+			if (offload_activated(tdev))
+				client->remove(tdev);
+		}
+	}
+	mutex_unlock(&cxgb3_db_lock);
+}
+
+EXPORT_SYMBOL(cxgb3_unregister_client);
+
+/**
+ *	cxgb3_add_clients - activate registered clients for an offload device
+ *	@tdev: the offload device
+ *
+ *	Call backs all registered clients once a offload device is activated
+ */
+void cxgb3_add_clients(struct t3cdev *tdev)
+{
+	struct cxgb3_client *client;
+
+	mutex_lock(&cxgb3_db_lock);
+	list_for_each_entry(client, &client_list, client_list) {
+		if (client->add)
+			client->add(tdev);
+	}
+	mutex_unlock(&cxgb3_db_lock);
+}
+
+/**
+ *	cxgb3_remove_clients - deactivates registered clients
+ *			       for an offload device
+ *	@tdev: the offload device
+ *
+ *	Call backs all registered clients once a offload device is deactivated
+ */
+void cxgb3_remove_clients(struct t3cdev *tdev)
+{
+	struct cxgb3_client *client;
+
+	mutex_lock(&cxgb3_db_lock);
+	list_for_each_entry(client, &client_list, client_list) {
+		if (client->remove)
+			client->remove(tdev);
+	}
+	mutex_unlock(&cxgb3_db_lock);
+}
+
+void cxgb3_event_notify(struct t3cdev *tdev, u32 event, u32 port)
+{
+	struct cxgb3_client *client;
+
+	mutex_lock(&cxgb3_db_lock);
+	list_for_each_entry(client, &client_list, client_list) {
+		if (client->event_handler)
+			client->event_handler(tdev, event, port);
+	}
+	mutex_unlock(&cxgb3_db_lock);
+}
+
+static struct net_device *get_iff_from_mac(struct adapter *adapter,
+					   const unsigned char *mac,
+					   unsigned int vlan)
+{
+	int i;
+
+	for_each_port(adapter, i) {
+		struct net_device *dev = adapter->port[i];
+
+		if (ether_addr_equal(dev->dev_addr, mac)) {
+			rcu_read_lock();
+			if (vlan && vlan != VLAN_VID_MASK) {
+				dev = __vlan_find_dev_deep_rcu(dev, htons(ETH_P_8021Q), vlan);
+			} else if (netif_is_bond_slave(dev)) {
+				struct net_device *upper_dev;
+
+				while ((upper_dev =
+					netdev_master_upper_dev_get_rcu(dev)))
+					dev = upper_dev;
+			}
+			rcu_read_unlock();
+			return dev;
+		}
+	}
+	return NULL;
+}
+
+static int cxgb_ulp_iscsi_ctl(struct adapter *adapter, unsigned int req,
+			      void *data)
+{
+	int i;
+	int ret = 0;
+	unsigned int val = 0;
+	struct ulp_iscsi_info *uiip = data;
+
+	switch (req) {
+	case ULP_ISCSI_GET_PARAMS:
+		uiip->pdev = adapter->pdev;
+		uiip->llimit = t3_read_reg(adapter, A_ULPRX_ISCSI_LLIMIT);
+		uiip->ulimit = t3_read_reg(adapter, A_ULPRX_ISCSI_ULIMIT);
+		uiip->tagmask = t3_read_reg(adapter, A_ULPRX_ISCSI_TAGMASK);
+
+		val = t3_read_reg(adapter, A_ULPRX_ISCSI_PSZ);
+		for (i = 0; i < 4; i++, val >>= 8)
+			uiip->pgsz_factor[i] = val & 0xFF;
+
+		val = t3_read_reg(adapter, A_TP_PARA_REG7);
+		uiip->max_txsz =
+		uiip->max_rxsz = min((val >> S_PMMAXXFERLEN0)&M_PMMAXXFERLEN0,
+				     (val >> S_PMMAXXFERLEN1)&M_PMMAXXFERLEN1);
+		/*
+		 * On tx, the iscsi pdu has to be <= tx page size and has to
+		 * fit into the Tx PM FIFO.
+		 */
+		val = min(adapter->params.tp.tx_pg_size,
+			  t3_read_reg(adapter, A_PM1_TX_CFG) >> 17);
+		uiip->max_txsz = min(val, uiip->max_txsz);
+
+		/* set MaxRxData to 16224 */
+		val = t3_read_reg(adapter, A_TP_PARA_REG2);
+		if ((val >> S_MAXRXDATA) != 0x3f60) {
+			val &= (M_RXCOALESCESIZE << S_RXCOALESCESIZE);
+			val |= V_MAXRXDATA(0x3f60);
+			pr_info("%s, iscsi set MaxRxData to 16224 (0x%x)\n",
+				adapter->name, val);
+			t3_write_reg(adapter, A_TP_PARA_REG2, val);
+		}
+
+		/*
+		 * on rx, the iscsi pdu has to be < rx page size and the
+		 * the max rx data length programmed in TP
+		 */
+		val = min(adapter->params.tp.rx_pg_size,
+			  ((t3_read_reg(adapter, A_TP_PARA_REG2)) >>
+				S_MAXRXDATA) & M_MAXRXDATA);
+		uiip->max_rxsz = min(val, uiip->max_rxsz);
+		break;
+	case ULP_ISCSI_SET_PARAMS:
+		t3_write_reg(adapter, A_ULPRX_ISCSI_TAGMASK, uiip->tagmask);
+		/* program the ddp page sizes */
+		for (i = 0; i < 4; i++)
+			val |= (uiip->pgsz_factor[i] & 0xF) << (8 * i);
+		if (val && (val != t3_read_reg(adapter, A_ULPRX_ISCSI_PSZ))) {
+			pr_info("%s, setting iscsi pgsz 0x%x, %u,%u,%u,%u\n",
+				adapter->name, val, uiip->pgsz_factor[0],
+				uiip->pgsz_factor[1], uiip->pgsz_factor[2],
+				uiip->pgsz_factor[3]);
+			t3_write_reg(adapter, A_ULPRX_ISCSI_PSZ, val);
+		}
+		break;
+	default:
+		ret = -EOPNOTSUPP;
+	}
+	return ret;
+}
+
+/* Response queue used for RDMA events. */
+#define ASYNC_NOTIF_RSPQ 0
+
+static int cxgb_rdma_ctl(struct adapter *adapter, unsigned int req, void *data)
+{
+	int ret = 0;
+
+	switch (req) {
+	case RDMA_GET_PARAMS: {
+		struct rdma_info *rdma = data;
+		struct pci_dev *pdev = adapter->pdev;
+
+		rdma->udbell_physbase = pci_resource_start(pdev, 2);
+		rdma->udbell_len = pci_resource_len(pdev, 2);
+		rdma->tpt_base =
+			t3_read_reg(adapter, A_ULPTX_TPT_LLIMIT);
+		rdma->tpt_top = t3_read_reg(adapter, A_ULPTX_TPT_ULIMIT);
+		rdma->pbl_base =
+			t3_read_reg(adapter, A_ULPTX_PBL_LLIMIT);
+		rdma->pbl_top = t3_read_reg(adapter, A_ULPTX_PBL_ULIMIT);
+		rdma->rqt_base = t3_read_reg(adapter, A_ULPRX_RQ_LLIMIT);
+		rdma->rqt_top = t3_read_reg(adapter, A_ULPRX_RQ_ULIMIT);
+		rdma->kdb_addr = adapter->regs + A_SG_KDOORBELL;
+		rdma->pdev = pdev;
+		break;
+	}
+	case RDMA_CQ_OP:{
+		unsigned long flags;
+		struct rdma_cq_op *rdma = data;
+
+		/* may be called in any context */
+		spin_lock_irqsave(&adapter->sge.reg_lock, flags);
+		ret = t3_sge_cqcntxt_op(adapter, rdma->id, rdma->op,
+					rdma->credits);
+		spin_unlock_irqrestore(&adapter->sge.reg_lock, flags);
+		break;
+	}
+	case RDMA_GET_MEM:{
+		struct ch_mem_range *t = data;
+		struct mc7 *mem;
+
+		if ((t->addr & 7) || (t->len & 7))
+			return -EINVAL;
+		if (t->mem_id == MEM_CM)
+			mem = &adapter->cm;
+		else if (t->mem_id == MEM_PMRX)
+			mem = &adapter->pmrx;
+		else if (t->mem_id == MEM_PMTX)
+			mem = &adapter->pmtx;
+		else
+			return -EINVAL;
+
+		ret =
+			t3_mc7_bd_read(mem, t->addr / 8, t->len / 8,
+					(u64 *) t->buf);
+		if (ret)
+			return ret;
+		break;
+	}
+	case RDMA_CQ_SETUP:{
+		struct rdma_cq_setup *rdma = data;
+
+		spin_lock_irq(&adapter->sge.reg_lock);
+		ret =
+			t3_sge_init_cqcntxt(adapter, rdma->id,
+					rdma->base_addr, rdma->size,
+					ASYNC_NOTIF_RSPQ,
+					rdma->ovfl_mode, rdma->credits,
+					rdma->credit_thres);
+		spin_unlock_irq(&adapter->sge.reg_lock);
+		break;
+	}
+	case RDMA_CQ_DISABLE:
+		spin_lock_irq(&adapter->sge.reg_lock);
+		ret = t3_sge_disable_cqcntxt(adapter, *(unsigned int *)data);
+		spin_unlock_irq(&adapter->sge.reg_lock);
+		break;
+	case RDMA_CTRL_QP_SETUP:{
+		struct rdma_ctrlqp_setup *rdma = data;
+
+		spin_lock_irq(&adapter->sge.reg_lock);
+		ret = t3_sge_init_ecntxt(adapter, FW_RI_SGEEC_START, 0,
+						SGE_CNTXT_RDMA,
+						ASYNC_NOTIF_RSPQ,
+						rdma->base_addr, rdma->size,
+						FW_RI_TID_START, 1, 0);
+		spin_unlock_irq(&adapter->sge.reg_lock);
+		break;
+	}
+	case RDMA_GET_MIB: {
+		spin_lock(&adapter->stats_lock);
+		t3_tp_get_mib_stats(adapter, (struct tp_mib_stats *)data);
+		spin_unlock(&adapter->stats_lock);
+		break;
+	}
+	default:
+		ret = -EOPNOTSUPP;
+	}
+	return ret;
+}
+
+static int cxgb_offload_ctl(struct t3cdev *tdev, unsigned int req, void *data)
+{
+	struct adapter *adapter = tdev2adap(tdev);
+	struct tid_range *tid;
+	struct mtutab *mtup;
+	struct iff_mac *iffmacp;
+	struct ddp_params *ddpp;
+	struct adap_ports *ports;
+	struct ofld_page_info *rx_page_info;
+	struct tp_params *tp = &adapter->params.tp;
+	int i;
+
+	switch (req) {
+	case GET_MAX_OUTSTANDING_WR:
+		*(unsigned int *)data = FW_WR_NUM;
+		break;
+	case GET_WR_LEN:
+		*(unsigned int *)data = WR_FLITS;
+		break;
+	case GET_TX_MAX_CHUNK:
+		*(unsigned int *)data = 1 << 20;	/* 1MB */
+		break;
+	case GET_TID_RANGE:
+		tid = data;
+		tid->num = t3_mc5_size(&adapter->mc5) -
+		    adapter->params.mc5.nroutes -
+		    adapter->params.mc5.nfilters - adapter->params.mc5.nservers;
+		tid->base = 0;
+		break;
+	case GET_STID_RANGE:
+		tid = data;
+		tid->num = adapter->params.mc5.nservers;
+		tid->base = t3_mc5_size(&adapter->mc5) - tid->num -
+		    adapter->params.mc5.nfilters - adapter->params.mc5.nroutes;
+		break;
+	case GET_L2T_CAPACITY:
+		*(unsigned int *)data = 2048;
+		break;
+	case GET_MTUS:
+		mtup = data;
+		mtup->size = NMTUS;
+		mtup->mtus = adapter->params.mtus;
+		break;
+	case GET_IFF_FROM_MAC:
+		iffmacp = data;
+		iffmacp->dev = get_iff_from_mac(adapter, iffmacp->mac_addr,
+						iffmacp->vlan_tag &
+						VLAN_VID_MASK);
+		break;
+	case GET_DDP_PARAMS:
+		ddpp = data;
+		ddpp->llimit = t3_read_reg(adapter, A_ULPRX_TDDP_LLIMIT);
+		ddpp->ulimit = t3_read_reg(adapter, A_ULPRX_TDDP_ULIMIT);
+		ddpp->tag_mask = t3_read_reg(adapter, A_ULPRX_TDDP_TAGMASK);
+		break;
+	case GET_PORTS:
+		ports = data;
+		ports->nports = adapter->params.nports;
+		for_each_port(adapter, i)
+			ports->lldevs[i] = adapter->port[i];
+		break;
+	case ULP_ISCSI_GET_PARAMS:
+	case ULP_ISCSI_SET_PARAMS:
+		if (!offload_running(adapter))
+			return -EAGAIN;
+		return cxgb_ulp_iscsi_ctl(adapter, req, data);
+	case RDMA_GET_PARAMS:
+	case RDMA_CQ_OP:
+	case RDMA_CQ_SETUP:
+	case RDMA_CQ_DISABLE:
+	case RDMA_CTRL_QP_SETUP:
+	case RDMA_GET_MEM:
+	case RDMA_GET_MIB:
+		if (!offload_running(adapter))
+			return -EAGAIN;
+		return cxgb_rdma_ctl(adapter, req, data);
+	case GET_RX_PAGE_INFO:
+		rx_page_info = data;
+		rx_page_info->page_size = tp->rx_pg_size;
+		rx_page_info->num = tp->rx_num_pgs;
+		break;
+	case GET_ISCSI_IPV4ADDR: {
+		struct iscsi_ipv4addr *p = data;
+		struct port_info *pi = netdev_priv(p->dev);
+		p->ipv4addr = pi->iscsi_ipv4addr;
+		break;
+	}
+	case GET_EMBEDDED_INFO: {
+		struct ch_embedded_info *e = data;
+
+		spin_lock(&adapter->stats_lock);
+		t3_get_fw_version(adapter, &e->fw_vers);
+		t3_get_tp_version(adapter, &e->tp_vers);
+		spin_unlock(&adapter->stats_lock);
+		break;
+	}
+	default:
+		return -EOPNOTSUPP;
+	}
+	return 0;
+}
+
+/*
+ * Dummy handler for Rx offload packets in case we get an offload packet before
+ * proper processing is setup.  This complains and drops the packet as it isn't
+ * normal to get offload packets at this stage.
+ */
+static int rx_offload_blackhole(struct t3cdev *dev, struct sk_buff **skbs,
+				int n)
+{
+	while (n--)
+		dev_kfree_skb_any(skbs[n]);
+	return 0;
+}
+
+static void dummy_neigh_update(struct t3cdev *dev, struct neighbour *neigh)
+{
+}
+
+void cxgb3_set_dummy_ops(struct t3cdev *dev)
+{
+	dev->recv = rx_offload_blackhole;
+	dev->neigh_update = dummy_neigh_update;
+}
+
+/*
+ * Free an active-open TID.
+ */
+void *cxgb3_free_atid(struct t3cdev *tdev, int atid)
+{
+	struct tid_info *t = &(T3C_DATA(tdev))->tid_maps;
+	union active_open_entry *p = atid2entry(t, atid);
+	void *ctx = p->t3c_tid.ctx;
+
+	spin_lock_bh(&t->atid_lock);
+	p->next = t->afree;
+	t->afree = p;
+	t->atids_in_use--;
+	spin_unlock_bh(&t->atid_lock);
+
+	return ctx;
+}
+
+EXPORT_SYMBOL(cxgb3_free_atid);
+
+/*
+ * Free a server TID and return it to the free pool.
+ */
+void cxgb3_free_stid(struct t3cdev *tdev, int stid)
+{
+	struct tid_info *t = &(T3C_DATA(tdev))->tid_maps;
+	union listen_entry *p = stid2entry(t, stid);
+
+	spin_lock_bh(&t->stid_lock);
+	p->next = t->sfree;
+	t->sfree = p;
+	t->stids_in_use--;
+	spin_unlock_bh(&t->stid_lock);
+}
+
+EXPORT_SYMBOL(cxgb3_free_stid);
+
+void cxgb3_insert_tid(struct t3cdev *tdev, struct cxgb3_client *client,
+		      void *ctx, unsigned int tid)
+{
+	struct tid_info *t = &(T3C_DATA(tdev))->tid_maps;
+
+	t->tid_tab[tid].client = client;
+	t->tid_tab[tid].ctx = ctx;
+	atomic_inc(&t->tids_in_use);
+}
+
+EXPORT_SYMBOL(cxgb3_insert_tid);
+
+/*
+ * Populate a TID_RELEASE WR.  The skb must be already propely sized.
+ */
+static inline void mk_tid_release(struct sk_buff *skb, unsigned int tid)
+{
+	struct cpl_tid_release *req;
+
+	skb->priority = CPL_PRIORITY_SETUP;
+	req = (struct cpl_tid_release *)__skb_put(skb, sizeof(*req));
+	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid));
+}
+
+static void t3_process_tid_release_list(struct work_struct *work)
+{
+	struct t3c_data *td = container_of(work, struct t3c_data,
+					   tid_release_task);
+	struct sk_buff *skb;
+	struct t3cdev *tdev = td->dev;
+
+
+	spin_lock_bh(&td->tid_release_lock);
+	while (td->tid_release_list) {
+		struct t3c_tid_entry *p = td->tid_release_list;
+
+		td->tid_release_list = p->ctx;
+		spin_unlock_bh(&td->tid_release_lock);
+
+		skb = alloc_skb(sizeof(struct cpl_tid_release),
+				GFP_KERNEL);
+		if (!skb)
+			skb = td->nofail_skb;
+		if (!skb) {
+			spin_lock_bh(&td->tid_release_lock);
+			p->ctx = (void *)td->tid_release_list;
+			td->tid_release_list = p;
+			break;
+		}
+		mk_tid_release(skb, p - td->tid_maps.tid_tab);
+		cxgb3_ofld_send(tdev, skb);
+		p->ctx = NULL;
+		if (skb == td->nofail_skb)
+			td->nofail_skb =
+				alloc_skb(sizeof(struct cpl_tid_release),
+					GFP_KERNEL);
+		spin_lock_bh(&td->tid_release_lock);
+	}
+	td->release_list_incomplete = (td->tid_release_list == NULL) ? 0 : 1;
+	spin_unlock_bh(&td->tid_release_lock);
+
+	if (!td->nofail_skb)
+		td->nofail_skb =
+			alloc_skb(sizeof(struct cpl_tid_release),
+				GFP_KERNEL);
+}
+
+/* use ctx as a next pointer in the tid release list */
+void cxgb3_queue_tid_release(struct t3cdev *tdev, unsigned int tid)
+{
+	struct t3c_data *td = T3C_DATA(tdev);
+	struct t3c_tid_entry *p = &td->tid_maps.tid_tab[tid];
+
+	spin_lock_bh(&td->tid_release_lock);
+	p->ctx = (void *)td->tid_release_list;
+	p->client = NULL;
+	td->tid_release_list = p;
+	if (!p->ctx || td->release_list_incomplete)
+		schedule_work(&td->tid_release_task);
+	spin_unlock_bh(&td->tid_release_lock);
+}
+
+EXPORT_SYMBOL(cxgb3_queue_tid_release);
+
+/*
+ * Remove a tid from the TID table.  A client may defer processing its last
+ * CPL message if it is locked at the time it arrives, and while the message
+ * sits in the client's backlog the TID may be reused for another connection.
+ * To handle this we atomically switch the TID association if it still points
+ * to the original client context.
+ */
+void cxgb3_remove_tid(struct t3cdev *tdev, void *ctx, unsigned int tid)
+{
+	struct tid_info *t = &(T3C_DATA(tdev))->tid_maps;
+
+	BUG_ON(tid >= t->ntids);
+	if (tdev->type == T3A)
+		(void)cmpxchg(&t->tid_tab[tid].ctx, ctx, NULL);
+	else {
+		struct sk_buff *skb;
+
+		skb = alloc_skb(sizeof(struct cpl_tid_release), GFP_ATOMIC);
+		if (likely(skb)) {
+			mk_tid_release(skb, tid);
+			cxgb3_ofld_send(tdev, skb);
+			t->tid_tab[tid].ctx = NULL;
+		} else
+			cxgb3_queue_tid_release(tdev, tid);
+	}
+	atomic_dec(&t->tids_in_use);
+}
+
+EXPORT_SYMBOL(cxgb3_remove_tid);
+
+int cxgb3_alloc_atid(struct t3cdev *tdev, struct cxgb3_client *client,
+		     void *ctx)
+{
+	int atid = -1;
+	struct tid_info *t = &(T3C_DATA(tdev))->tid_maps;
+
+	spin_lock_bh(&t->atid_lock);
+	if (t->afree &&
+	    t->atids_in_use + atomic_read(&t->tids_in_use) + MC5_MIN_TIDS <=
+	    t->ntids) {
+		union active_open_entry *p = t->afree;
+
+		atid = (p - t->atid_tab) + t->atid_base;
+		t->afree = p->next;
+		p->t3c_tid.ctx = ctx;
+		p->t3c_tid.client = client;
+		t->atids_in_use++;
+	}
+	spin_unlock_bh(&t->atid_lock);
+	return atid;
+}
+
+EXPORT_SYMBOL(cxgb3_alloc_atid);
+
+int cxgb3_alloc_stid(struct t3cdev *tdev, struct cxgb3_client *client,
+		     void *ctx)
+{
+	int stid = -1;
+	struct tid_info *t = &(T3C_DATA(tdev))->tid_maps;
+
+	spin_lock_bh(&t->stid_lock);
+	if (t->sfree) {
+		union listen_entry *p = t->sfree;
+
+		stid = (p - t->stid_tab) + t->stid_base;
+		t->sfree = p->next;
+		p->t3c_tid.ctx = ctx;
+		p->t3c_tid.client = client;
+		t->stids_in_use++;
+	}
+	spin_unlock_bh(&t->stid_lock);
+	return stid;
+}
+
+EXPORT_SYMBOL(cxgb3_alloc_stid);
+
+/* Get the t3cdev associated with a net_device */
+struct t3cdev *dev2t3cdev(struct net_device *dev)
+{
+	const struct port_info *pi = netdev_priv(dev);
+
+	return (struct t3cdev *)pi->adapter;
+}
+
+EXPORT_SYMBOL(dev2t3cdev);
+
+static int do_smt_write_rpl(struct t3cdev *dev, struct sk_buff *skb)
+{
+	struct cpl_smt_write_rpl *rpl = cplhdr(skb);
+
+	if (rpl->status != CPL_ERR_NONE)
+		pr_err("Unexpected SMT_WRITE_RPL status %u for entry %u\n",
+		       rpl->status, GET_TID(rpl));
+
+	return CPL_RET_BUF_DONE;
+}
+
+static int do_l2t_write_rpl(struct t3cdev *dev, struct sk_buff *skb)
+{
+	struct cpl_l2t_write_rpl *rpl = cplhdr(skb);
+
+	if (rpl->status != CPL_ERR_NONE)
+		pr_err("Unexpected L2T_WRITE_RPL status %u for entry %u\n",
+		       rpl->status, GET_TID(rpl));
+
+	return CPL_RET_BUF_DONE;
+}
+
+static int do_rte_write_rpl(struct t3cdev *dev, struct sk_buff *skb)
+{
+	struct cpl_rte_write_rpl *rpl = cplhdr(skb);
+
+	if (rpl->status != CPL_ERR_NONE)
+		pr_err("Unexpected RTE_WRITE_RPL status %u for entry %u\n",
+		       rpl->status, GET_TID(rpl));
+
+	return CPL_RET_BUF_DONE;
+}
+
+static int do_act_open_rpl(struct t3cdev *dev, struct sk_buff *skb)
+{
+	struct cpl_act_open_rpl *rpl = cplhdr(skb);
+	unsigned int atid = G_TID(ntohl(rpl->atid));
+	struct t3c_tid_entry *t3c_tid;
+
+	t3c_tid = lookup_atid(&(T3C_DATA(dev))->tid_maps, atid);
+	if (t3c_tid && t3c_tid->ctx && t3c_tid->client &&
+	    t3c_tid->client->handlers &&
+	    t3c_tid->client->handlers[CPL_ACT_OPEN_RPL]) {
+		return t3c_tid->client->handlers[CPL_ACT_OPEN_RPL] (dev, skb,
+								    t3c_tid->
+								    ctx);
+	} else {
+		pr_err("%s: received clientless CPL command 0x%x\n",
+		       dev->name, CPL_ACT_OPEN_RPL);
+		return CPL_RET_BUF_DONE | CPL_RET_BAD_MSG;
+	}
+}
+
+static int do_stid_rpl(struct t3cdev *dev, struct sk_buff *skb)
+{
+	union opcode_tid *p = cplhdr(skb);
+	unsigned int stid = G_TID(ntohl(p->opcode_tid));
+	struct t3c_tid_entry *t3c_tid;
+
+	t3c_tid = lookup_stid(&(T3C_DATA(dev))->tid_maps, stid);
+	if (t3c_tid && t3c_tid->ctx && t3c_tid->client->handlers &&
+	    t3c_tid->client->handlers[p->opcode]) {
+		return t3c_tid->client->handlers[p->opcode] (dev, skb,
+							     t3c_tid->ctx);
+	} else {
+		pr_err("%s: received clientless CPL command 0x%x\n",
+		       dev->name, p->opcode);
+		return CPL_RET_BUF_DONE | CPL_RET_BAD_MSG;
+	}
+}
+
+static int do_hwtid_rpl(struct t3cdev *dev, struct sk_buff *skb)
+{
+	union opcode_tid *p = cplhdr(skb);
+	unsigned int hwtid = G_TID(ntohl(p->opcode_tid));
+	struct t3c_tid_entry *t3c_tid;
+
+	t3c_tid = lookup_tid(&(T3C_DATA(dev))->tid_maps, hwtid);
+	if (t3c_tid && t3c_tid->ctx && t3c_tid->client->handlers &&
+	    t3c_tid->client->handlers[p->opcode]) {
+		return t3c_tid->client->handlers[p->opcode]
+		    (dev, skb, t3c_tid->ctx);
+	} else {
+		pr_err("%s: received clientless CPL command 0x%x\n",
+		       dev->name, p->opcode);
+		return CPL_RET_BUF_DONE | CPL_RET_BAD_MSG;
+	}
+}
+
+static int do_cr(struct t3cdev *dev, struct sk_buff *skb)
+{
+	struct cpl_pass_accept_req *req = cplhdr(skb);
+	unsigned int stid = G_PASS_OPEN_TID(ntohl(req->tos_tid));
+	struct tid_info *t = &(T3C_DATA(dev))->tid_maps;
+	struct t3c_tid_entry *t3c_tid;
+	unsigned int tid = GET_TID(req);
+
+	if (unlikely(tid >= t->ntids)) {
+		printk("%s: passive open TID %u too large\n",
+		       dev->name, tid);
+		t3_fatal_err(tdev2adap(dev));
+		return CPL_RET_BUF_DONE;
+	}
+
+	t3c_tid = lookup_stid(t, stid);
+	if (t3c_tid && t3c_tid->ctx && t3c_tid->client->handlers &&
+	    t3c_tid->client->handlers[CPL_PASS_ACCEPT_REQ]) {
+		return t3c_tid->client->handlers[CPL_PASS_ACCEPT_REQ]
+		    (dev, skb, t3c_tid->ctx);
+	} else {
+		pr_err("%s: received clientless CPL command 0x%x\n",
+		       dev->name, CPL_PASS_ACCEPT_REQ);
+		return CPL_RET_BUF_DONE | CPL_RET_BAD_MSG;
+	}
+}
+
+/*
+ * Returns an sk_buff for a reply CPL message of size len.  If the input
+ * sk_buff has no other users it is trimmed and reused, otherwise a new buffer
+ * is allocated.  The input skb must be of size at least len.  Note that this
+ * operation does not destroy the original skb data even if it decides to reuse
+ * the buffer.
+ */
+static struct sk_buff *cxgb3_get_cpl_reply_skb(struct sk_buff *skb, size_t len,
+					       gfp_t gfp)
+{
+	if (likely(!skb_cloned(skb))) {
+		BUG_ON(skb->len < len);
+		__skb_trim(skb, len);
+		skb_get(skb);
+	} else {
+		skb = alloc_skb(len, gfp);
+		if (skb)
+			__skb_put(skb, len);
+	}
+	return skb;
+}
+
+static int do_abort_req_rss(struct t3cdev *dev, struct sk_buff *skb)
+{
+	union opcode_tid *p = cplhdr(skb);
+	unsigned int hwtid = G_TID(ntohl(p->opcode_tid));
+	struct t3c_tid_entry *t3c_tid;
+
+	t3c_tid = lookup_tid(&(T3C_DATA(dev))->tid_maps, hwtid);
+	if (t3c_tid && t3c_tid->ctx && t3c_tid->client->handlers &&
+	    t3c_tid->client->handlers[p->opcode]) {
+		return t3c_tid->client->handlers[p->opcode]
+		    (dev, skb, t3c_tid->ctx);
+	} else {
+		struct cpl_abort_req_rss *req = cplhdr(skb);
+		struct cpl_abort_rpl *rpl;
+		struct sk_buff *reply_skb;
+		unsigned int tid = GET_TID(req);
+		u8 cmd = req->status;
+
+		if (req->status == CPL_ERR_RTX_NEG_ADVICE ||
+		    req->status == CPL_ERR_PERSIST_NEG_ADVICE)
+			goto out;
+
+		reply_skb = cxgb3_get_cpl_reply_skb(skb,
+						    sizeof(struct
+							   cpl_abort_rpl),
+						    GFP_ATOMIC);
+
+		if (!reply_skb) {
+			printk("do_abort_req_rss: couldn't get skb!\n");
+			goto out;
+		}
+		reply_skb->priority = CPL_PRIORITY_DATA;
+		__skb_put(reply_skb, sizeof(struct cpl_abort_rpl));
+		rpl = cplhdr(reply_skb);
+		rpl->wr.wr_hi =
+		    htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL));
+		rpl->wr.wr_lo = htonl(V_WR_TID(tid));
+		OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid));
+		rpl->cmd = cmd;
+		cxgb3_ofld_send(dev, reply_skb);
+out:
+		return CPL_RET_BUF_DONE;
+	}
+}
+
+static int do_act_establish(struct t3cdev *dev, struct sk_buff *skb)
+{
+	struct cpl_act_establish *req = cplhdr(skb);
+	unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid));
+	struct tid_info *t = &(T3C_DATA(dev))->tid_maps;
+	struct t3c_tid_entry *t3c_tid;
+	unsigned int tid = GET_TID(req);
+
+	if (unlikely(tid >= t->ntids)) {
+		printk("%s: active establish TID %u too large\n",
+		       dev->name, tid);
+		t3_fatal_err(tdev2adap(dev));
+		return CPL_RET_BUF_DONE;
+	}
+
+	t3c_tid = lookup_atid(t, atid);
+	if (t3c_tid && t3c_tid->ctx && t3c_tid->client->handlers &&
+	    t3c_tid->client->handlers[CPL_ACT_ESTABLISH]) {
+		return t3c_tid->client->handlers[CPL_ACT_ESTABLISH]
+		    (dev, skb, t3c_tid->ctx);
+	} else {
+		pr_err("%s: received clientless CPL command 0x%x\n",
+		       dev->name, CPL_ACT_ESTABLISH);
+		return CPL_RET_BUF_DONE | CPL_RET_BAD_MSG;
+	}
+}
+
+static int do_trace(struct t3cdev *dev, struct sk_buff *skb)
+{
+	struct cpl_trace_pkt *p = cplhdr(skb);
+
+	skb->protocol = htons(0xffff);
+	skb->dev = dev->lldev;
+	skb_pull(skb, sizeof(*p));
+	skb_reset_mac_header(skb);
+	netif_receive_skb(skb);
+	return 0;
+}
+
+/*
+ * That skb would better have come from process_responses() where we abuse
+ * ->priority and ->csum to carry our data.  NB: if we get to per-arch
+ * ->csum, the things might get really interesting here.
+ */
+
+static inline u32 get_hwtid(struct sk_buff *skb)
+{
+	return ntohl((__force __be32)skb->priority) >> 8 & 0xfffff;
+}
+
+static inline u32 get_opcode(struct sk_buff *skb)
+{
+	return G_OPCODE(ntohl((__force __be32)skb->csum));
+}
+
+static int do_term(struct t3cdev *dev, struct sk_buff *skb)
+{
+	unsigned int hwtid = get_hwtid(skb);
+	unsigned int opcode = get_opcode(skb);
+	struct t3c_tid_entry *t3c_tid;
+
+	t3c_tid = lookup_tid(&(T3C_DATA(dev))->tid_maps, hwtid);
+	if (t3c_tid && t3c_tid->ctx && t3c_tid->client->handlers &&
+	    t3c_tid->client->handlers[opcode]) {
+		return t3c_tid->client->handlers[opcode] (dev, skb,
+							  t3c_tid->ctx);
+	} else {
+		pr_err("%s: received clientless CPL command 0x%x\n",
+		       dev->name, opcode);
+		return CPL_RET_BUF_DONE | CPL_RET_BAD_MSG;
+	}
+}
+
+static int nb_callback(struct notifier_block *self, unsigned long event,
+		       void *ctx)
+{
+	switch (event) {
+	case (NETEVENT_NEIGH_UPDATE):{
+		cxgb_neigh_update((struct neighbour *)ctx);
+		break;
+	}
+	case (NETEVENT_REDIRECT):{
+		struct netevent_redirect *nr = ctx;
+		cxgb_redirect(nr->old, nr->new, nr->neigh,
+			      nr->daddr);
+		cxgb_neigh_update(nr->neigh);
+		break;
+	}
+	default:
+		break;
+	}
+	return 0;
+}
+
+static struct notifier_block nb = {
+	.notifier_call = nb_callback
+};
+
+/*
+ * Process a received packet with an unknown/unexpected CPL opcode.
+ */
+static int do_bad_cpl(struct t3cdev *dev, struct sk_buff *skb)
+{
+	pr_err("%s: received bad CPL command 0x%x\n", dev->name, *skb->data);
+	return CPL_RET_BUF_DONE | CPL_RET_BAD_MSG;
+}
+
+/*
+ * Handlers for each CPL opcode
+ */
+static cpl_handler_func cpl_handlers[NUM_CPL_CMDS];
+
+/*
+ * Add a new handler to the CPL dispatch table.  A NULL handler may be supplied
+ * to unregister an existing handler.
+ */
+void t3_register_cpl_handler(unsigned int opcode, cpl_handler_func h)
+{
+	if (opcode < NUM_CPL_CMDS)
+		cpl_handlers[opcode] = h ? h : do_bad_cpl;
+	else
+		pr_err("T3C: handler registration for opcode %x failed\n",
+		       opcode);
+}
+
+EXPORT_SYMBOL(t3_register_cpl_handler);
+
+/*
+ * T3CDEV's receive method.
+ */
+static int process_rx(struct t3cdev *dev, struct sk_buff **skbs, int n)
+{
+	while (n--) {
+		struct sk_buff *skb = *skbs++;
+		unsigned int opcode = get_opcode(skb);
+		int ret = cpl_handlers[opcode] (dev, skb);
+
+#if VALIDATE_TID
+		if (ret & CPL_RET_UNKNOWN_TID) {
+			union opcode_tid *p = cplhdr(skb);
+
+			pr_err("%s: CPL message (opcode %u) had unknown TID %u\n",
+			       dev->name, opcode, G_TID(ntohl(p->opcode_tid)));
+		}
+#endif
+		if (ret & CPL_RET_BUF_DONE)
+			kfree_skb(skb);
+	}
+	return 0;
+}
+
+/*
+ * Sends an sk_buff to a T3C driver after dealing with any active network taps.
+ */
+int cxgb3_ofld_send(struct t3cdev *dev, struct sk_buff *skb)
+{
+	int r;
+
+	local_bh_disable();
+	r = dev->send(dev, skb);
+	local_bh_enable();
+	return r;
+}
+
+EXPORT_SYMBOL(cxgb3_ofld_send);
+
+static int is_offloading(struct net_device *dev)
+{
+	struct adapter *adapter;
+	int i;
+
+	read_lock_bh(&adapter_list_lock);
+	list_for_each_entry(adapter, &adapter_list, adapter_list) {
+		for_each_port(adapter, i) {
+			if (dev == adapter->port[i]) {
+				read_unlock_bh(&adapter_list_lock);
+				return 1;
+			}
+		}
+	}
+	read_unlock_bh(&adapter_list_lock);
+	return 0;
+}
+
+static void cxgb_neigh_update(struct neighbour *neigh)
+{
+	struct net_device *dev;
+
+	if (!neigh)
+		return;
+	dev = neigh->dev;
+	if (dev && (is_offloading(dev))) {
+		struct t3cdev *tdev = dev2t3cdev(dev);
+
+		BUG_ON(!tdev);
+		t3_l2t_update(tdev, neigh);
+	}
+}
+
+static void set_l2t_ix(struct t3cdev *tdev, u32 tid, struct l2t_entry *e)
+{
+	struct sk_buff *skb;
+	struct cpl_set_tcb_field *req;
+
+	skb = alloc_skb(sizeof(*req), GFP_ATOMIC);
+	if (!skb) {
+		pr_err("%s: cannot allocate skb!\n", __func__);
+		return;
+	}
+	skb->priority = CPL_PRIORITY_CONTROL;
+	req = (struct cpl_set_tcb_field *)skb_put(skb, sizeof(*req));
+	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tid));
+	req->reply = 0;
+	req->cpu_idx = 0;
+	req->word = htons(W_TCB_L2T_IX);
+	req->mask = cpu_to_be64(V_TCB_L2T_IX(M_TCB_L2T_IX));
+	req->val = cpu_to_be64(V_TCB_L2T_IX(e->idx));
+	tdev->send(tdev, skb);
+}
+
+static void cxgb_redirect(struct dst_entry *old, struct dst_entry *new,
+			  struct neighbour *neigh,
+			  const void *daddr)
+{
+	struct net_device *dev;
+	struct tid_info *ti;
+	struct t3cdev *tdev;
+	u32 tid;
+	int update_tcb;
+	struct l2t_entry *e;
+	struct t3c_tid_entry *te;
+
+	dev = neigh->dev;
+
+	if (!is_offloading(dev))
+		return;
+	tdev = dev2t3cdev(dev);
+	BUG_ON(!tdev);
+
+	/* Add new L2T entry */
+	e = t3_l2t_get(tdev, new, dev, daddr);
+	if (!e) {
+		pr_err("%s: couldn't allocate new l2t entry!\n", __func__);
+		return;
+	}
+
+	/* Walk tid table and notify clients of dst change. */
+	ti = &(T3C_DATA(tdev))->tid_maps;
+	for (tid = 0; tid < ti->ntids; tid++) {
+		te = lookup_tid(ti, tid);
+		BUG_ON(!te);
+		if (te && te->ctx && te->client && te->client->redirect) {
+			update_tcb = te->client->redirect(te->ctx, old, new, e);
+			if (update_tcb) {
+				rcu_read_lock();
+				l2t_hold(L2DATA(tdev), e);
+				rcu_read_unlock();
+				set_l2t_ix(tdev, tid, e);
+			}
+		}
+	}
+	l2t_release(tdev, e);
+}
+
+/*
+ * Allocate a chunk of memory using kmalloc or, if that fails, vmalloc.
+ * The allocated memory is cleared.
+ */
+void *cxgb_alloc_mem(unsigned long size)
+{
+	void *p = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
+
+	if (!p)
+		p = vzalloc(size);
+	return p;
+}
+
+/*
+ * Free memory allocated through t3_alloc_mem().
+ */
+void cxgb_free_mem(void *addr)
+{
+	if (is_vmalloc_addr(addr))
+		vfree(addr);
+	else
+		kfree(addr);
+}
+
+/*
+ * Allocate and initialize the TID tables.  Returns 0 on success.
+ */
+static int init_tid_tabs(struct tid_info *t, unsigned int ntids,
+			 unsigned int natids, unsigned int nstids,
+			 unsigned int atid_base, unsigned int stid_base)
+{
+	unsigned long size = ntids * sizeof(*t->tid_tab) +
+	    natids * sizeof(*t->atid_tab) + nstids * sizeof(*t->stid_tab);
+
+	t->tid_tab = cxgb_alloc_mem(size);
+	if (!t->tid_tab)
+		return -ENOMEM;
+
+	t->stid_tab = (union listen_entry *)&t->tid_tab[ntids];
+	t->atid_tab = (union active_open_entry *)&t->stid_tab[nstids];
+	t->ntids = ntids;
+	t->nstids = nstids;
+	t->stid_base = stid_base;
+	t->sfree = NULL;
+	t->natids = natids;
+	t->atid_base = atid_base;
+	t->afree = NULL;
+	t->stids_in_use = t->atids_in_use = 0;
+	atomic_set(&t->tids_in_use, 0);
+	spin_lock_init(&t->stid_lock);
+	spin_lock_init(&t->atid_lock);
+
+	/*
+	 * Setup the free lists for stid_tab and atid_tab.
+	 */
+	if (nstids) {
+		while (--nstids)
+			t->stid_tab[nstids - 1].next = &t->stid_tab[nstids];
+		t->sfree = t->stid_tab;
+	}
+	if (natids) {
+		while (--natids)
+			t->atid_tab[natids - 1].next = &t->atid_tab[natids];
+		t->afree = t->atid_tab;
+	}
+	return 0;
+}
+
+static void free_tid_maps(struct tid_info *t)
+{
+	cxgb_free_mem(t->tid_tab);
+}
+
+static inline void add_adapter(struct adapter *adap)
+{
+	write_lock_bh(&adapter_list_lock);
+	list_add_tail(&adap->adapter_list, &adapter_list);
+	write_unlock_bh(&adapter_list_lock);
+}
+
+static inline void remove_adapter(struct adapter *adap)
+{
+	write_lock_bh(&adapter_list_lock);
+	list_del(&adap->adapter_list);
+	write_unlock_bh(&adapter_list_lock);
+}
+
+int cxgb3_offload_activate(struct adapter *adapter)
+{
+	struct t3cdev *dev = &adapter->tdev;
+	int natids, err;
+	struct t3c_data *t;
+	struct tid_range stid_range, tid_range;
+	struct mtutab mtutab;
+	unsigned int l2t_capacity;
+	struct l2t_data *l2td;
+
+	t = kzalloc(sizeof(*t), GFP_KERNEL);
+	if (!t)
+		return -ENOMEM;
+
+	err = -EOPNOTSUPP;
+	if (dev->ctl(dev, GET_TX_MAX_CHUNK, &t->tx_max_chunk) < 0 ||
+	    dev->ctl(dev, GET_MAX_OUTSTANDING_WR, &t->max_wrs) < 0 ||
+	    dev->ctl(dev, GET_L2T_CAPACITY, &l2t_capacity) < 0 ||
+	    dev->ctl(dev, GET_MTUS, &mtutab) < 0 ||
+	    dev->ctl(dev, GET_TID_RANGE, &tid_range) < 0 ||
+	    dev->ctl(dev, GET_STID_RANGE, &stid_range) < 0)
+		goto out_free;
+
+	err = -ENOMEM;
+	l2td = t3_init_l2t(l2t_capacity);
+	if (!l2td)
+		goto out_free;
+
+	natids = min(tid_range.num / 2, MAX_ATIDS);
+	err = init_tid_tabs(&t->tid_maps, tid_range.num, natids,
+			    stid_range.num, ATID_BASE, stid_range.base);
+	if (err)
+		goto out_free_l2t;
+
+	t->mtus = mtutab.mtus;
+	t->nmtus = mtutab.size;
+
+	INIT_WORK(&t->tid_release_task, t3_process_tid_release_list);
+	spin_lock_init(&t->tid_release_lock);
+	INIT_LIST_HEAD(&t->list_node);
+	t->dev = dev;
+
+	RCU_INIT_POINTER(dev->l2opt, l2td);
+	T3C_DATA(dev) = t;
+	dev->recv = process_rx;
+	dev->neigh_update = t3_l2t_update;
+
+	/* Register netevent handler once */
+	if (list_empty(&adapter_list))
+		register_netevent_notifier(&nb);
+
+	t->nofail_skb = alloc_skb(sizeof(struct cpl_tid_release), GFP_KERNEL);
+	t->release_list_incomplete = 0;
+
+	add_adapter(adapter);
+	return 0;
+
+out_free_l2t:
+	t3_free_l2t(l2td);
+out_free:
+	kfree(t);
+	return err;
+}
+
+static void clean_l2_data(struct rcu_head *head)
+{
+	struct l2t_data *d = container_of(head, struct l2t_data, rcu_head);
+	t3_free_l2t(d);
+}
+
+
+void cxgb3_offload_deactivate(struct adapter *adapter)
+{
+	struct t3cdev *tdev = &adapter->tdev;
+	struct t3c_data *t = T3C_DATA(tdev);
+	struct l2t_data *d;
+
+	remove_adapter(adapter);
+	if (list_empty(&adapter_list))
+		unregister_netevent_notifier(&nb);
+
+	free_tid_maps(&t->tid_maps);
+	T3C_DATA(tdev) = NULL;
+	rcu_read_lock();
+	d = L2DATA(tdev);
+	rcu_read_unlock();
+	RCU_INIT_POINTER(tdev->l2opt, NULL);
+	call_rcu(&d->rcu_head, clean_l2_data);
+	if (t->nofail_skb)
+		kfree_skb(t->nofail_skb);
+	kfree(t);
+}
+
+static inline void register_tdev(struct t3cdev *tdev)
+{
+	static int unit;
+
+	mutex_lock(&cxgb3_db_lock);
+	snprintf(tdev->name, sizeof(tdev->name), "ofld_dev%d", unit++);
+	list_add_tail(&tdev->ofld_dev_list, &ofld_dev_list);
+	mutex_unlock(&cxgb3_db_lock);
+}
+
+static inline void unregister_tdev(struct t3cdev *tdev)
+{
+	mutex_lock(&cxgb3_db_lock);
+	list_del(&tdev->ofld_dev_list);
+	mutex_unlock(&cxgb3_db_lock);
+}
+
+static inline int adap2type(struct adapter *adapter)
+{
+	int type = 0;
+
+	switch (adapter->params.rev) {
+	case T3_REV_A:
+		type = T3A;
+		break;
+	case T3_REV_B:
+	case T3_REV_B2:
+		type = T3B;
+		break;
+	case T3_REV_C:
+		type = T3C;
+		break;
+	}
+	return type;
+}
+
+void cxgb3_adapter_ofld(struct adapter *adapter)
+{
+	struct t3cdev *tdev = &adapter->tdev;
+
+	INIT_LIST_HEAD(&tdev->ofld_dev_list);
+
+	cxgb3_set_dummy_ops(tdev);
+	tdev->send = t3_offload_tx;
+	tdev->ctl = cxgb_offload_ctl;
+	tdev->type = adap2type(adapter);
+
+	register_tdev(tdev);
+}
+
+void cxgb3_adapter_unofld(struct adapter *adapter)
+{
+	struct t3cdev *tdev = &adapter->tdev;
+
+	tdev->recv = NULL;
+	tdev->neigh_update = NULL;
+
+	unregister_tdev(tdev);
+}
+
+void __init cxgb3_offload_init(void)
+{
+	int i;
+
+	for (i = 0; i < NUM_CPL_CMDS; ++i)
+		cpl_handlers[i] = do_bad_cpl;
+
+	t3_register_cpl_handler(CPL_SMT_WRITE_RPL, do_smt_write_rpl);
+	t3_register_cpl_handler(CPL_L2T_WRITE_RPL, do_l2t_write_rpl);
+	t3_register_cpl_handler(CPL_RTE_WRITE_RPL, do_rte_write_rpl);
+	t3_register_cpl_handler(CPL_PASS_OPEN_RPL, do_stid_rpl);
+	t3_register_cpl_handler(CPL_CLOSE_LISTSRV_RPL, do_stid_rpl);
+	t3_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_cr);
+	t3_register_cpl_handler(CPL_PASS_ESTABLISH, do_hwtid_rpl);
+	t3_register_cpl_handler(CPL_ABORT_RPL_RSS, do_hwtid_rpl);
+	t3_register_cpl_handler(CPL_ABORT_RPL, do_hwtid_rpl);
+	t3_register_cpl_handler(CPL_RX_URG_NOTIFY, do_hwtid_rpl);
+	t3_register_cpl_handler(CPL_RX_DATA, do_hwtid_rpl);
+	t3_register_cpl_handler(CPL_TX_DATA_ACK, do_hwtid_rpl);
+	t3_register_cpl_handler(CPL_TX_DMA_ACK, do_hwtid_rpl);
+	t3_register_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl);
+	t3_register_cpl_handler(CPL_PEER_CLOSE, do_hwtid_rpl);
+	t3_register_cpl_handler(CPL_CLOSE_CON_RPL, do_hwtid_rpl);
+	t3_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req_rss);
+	t3_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish);
+	t3_register_cpl_handler(CPL_SET_TCB_RPL, do_hwtid_rpl);
+	t3_register_cpl_handler(CPL_GET_TCB_RPL, do_hwtid_rpl);
+	t3_register_cpl_handler(CPL_RDMA_TERMINATE, do_term);
+	t3_register_cpl_handler(CPL_RDMA_EC_STATUS, do_hwtid_rpl);
+	t3_register_cpl_handler(CPL_TRACE_PKT, do_trace);
+	t3_register_cpl_handler(CPL_RX_DATA_DDP, do_hwtid_rpl);
+	t3_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_hwtid_rpl);
+	t3_register_cpl_handler(CPL_ISCSI_HDR, do_hwtid_rpl);
+}
diff --git a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_offload.h b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_offload.h
new file mode 100644
index 0000000..929c298
--- /dev/null
+++ b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_offload.h
@@ -0,0 +1,209 @@
+/*
+ * Copyright (c) 2006-2008 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef _CXGB3_OFFLOAD_H
+#define _CXGB3_OFFLOAD_H
+
+#include <linux/list.h>
+#include <linux/skbuff.h>
+
+#include "l2t.h"
+
+#include "t3cdev.h"
+#include "t3_cpl.h"
+
+struct adapter;
+
+void cxgb3_offload_init(void);
+
+void cxgb3_adapter_ofld(struct adapter *adapter);
+void cxgb3_adapter_unofld(struct adapter *adapter);
+int cxgb3_offload_activate(struct adapter *adapter);
+void cxgb3_offload_deactivate(struct adapter *adapter);
+
+void cxgb3_set_dummy_ops(struct t3cdev *dev);
+
+struct t3cdev *dev2t3cdev(struct net_device *dev);
+
+/*
+ * Client registration.  Users of T3 driver must register themselves.
+ * The T3 driver will call the add function of every client for each T3
+ * adapter activated, passing up the t3cdev ptr.  Each client fills out an
+ * array of callback functions to process CPL messages.
+ */
+
+void cxgb3_register_client(struct cxgb3_client *client);
+void cxgb3_unregister_client(struct cxgb3_client *client);
+void cxgb3_add_clients(struct t3cdev *tdev);
+void cxgb3_remove_clients(struct t3cdev *tdev);
+void cxgb3_event_notify(struct t3cdev *tdev, u32 event, u32 port);
+
+typedef int (*cxgb3_cpl_handler_func)(struct t3cdev *dev,
+				      struct sk_buff *skb, void *ctx);
+
+enum {
+	OFFLOAD_STATUS_UP,
+	OFFLOAD_STATUS_DOWN,
+	OFFLOAD_PORT_DOWN,
+	OFFLOAD_PORT_UP,
+	OFFLOAD_DB_FULL,
+	OFFLOAD_DB_EMPTY,
+	OFFLOAD_DB_DROP
+};
+
+struct cxgb3_client {
+	char *name;
+	void (*add) (struct t3cdev *);
+	void (*remove) (struct t3cdev *);
+	cxgb3_cpl_handler_func *handlers;
+	int (*redirect)(void *ctx, struct dst_entry *old,
+			struct dst_entry *new, struct l2t_entry *l2t);
+	struct list_head client_list;
+	void (*event_handler)(struct t3cdev *tdev, u32 event, u32 port);
+};
+
+/*
+ * TID allocation services.
+ */
+int cxgb3_alloc_atid(struct t3cdev *dev, struct cxgb3_client *client,
+		     void *ctx);
+int cxgb3_alloc_stid(struct t3cdev *dev, struct cxgb3_client *client,
+		     void *ctx);
+void *cxgb3_free_atid(struct t3cdev *dev, int atid);
+void cxgb3_free_stid(struct t3cdev *dev, int stid);
+void cxgb3_insert_tid(struct t3cdev *dev, struct cxgb3_client *client,
+		      void *ctx, unsigned int tid);
+void cxgb3_queue_tid_release(struct t3cdev *dev, unsigned int tid);
+void cxgb3_remove_tid(struct t3cdev *dev, void *ctx, unsigned int tid);
+
+struct t3c_tid_entry {
+	struct cxgb3_client *client;
+	void *ctx;
+};
+
+/* CPL message priority levels */
+enum {
+	CPL_PRIORITY_DATA = 0,	/* data messages */
+	CPL_PRIORITY_SETUP = 1,	/* connection setup messages */
+	CPL_PRIORITY_TEARDOWN = 0,	/* connection teardown messages */
+	CPL_PRIORITY_LISTEN = 1,	/* listen start/stop messages */
+	CPL_PRIORITY_ACK = 1,	/* RX ACK messages */
+	CPL_PRIORITY_CONTROL = 1	/* offload control messages */
+};
+
+/* Flags for return value of CPL message handlers */
+enum {
+	CPL_RET_BUF_DONE = 1, /* buffer processing done, buffer may be freed */
+	CPL_RET_BAD_MSG = 2,  /* bad CPL message (e.g., unknown opcode) */
+	CPL_RET_UNKNOWN_TID = 4	/* unexpected unknown TID */
+};
+
+typedef int (*cpl_handler_func)(struct t3cdev *dev, struct sk_buff *skb);
+
+/*
+ * Returns a pointer to the first byte of the CPL header in an sk_buff that
+ * contains a CPL message.
+ */
+static inline void *cplhdr(struct sk_buff *skb)
+{
+	return skb->data;
+}
+
+void t3_register_cpl_handler(unsigned int opcode, cpl_handler_func h);
+
+union listen_entry {
+	struct t3c_tid_entry t3c_tid;
+	union listen_entry *next;
+};
+
+union active_open_entry {
+	struct t3c_tid_entry t3c_tid;
+	union active_open_entry *next;
+};
+
+/*
+ * Holds the size, base address, free list start, etc of the TID, server TID,
+ * and active-open TID tables for a offload device.
+ * The tables themselves are allocated dynamically.
+ */
+struct tid_info {
+	struct t3c_tid_entry *tid_tab;
+	unsigned int ntids;
+	atomic_t tids_in_use;
+
+	union listen_entry *stid_tab;
+	unsigned int nstids;
+	unsigned int stid_base;
+
+	union active_open_entry *atid_tab;
+	unsigned int natids;
+	unsigned int atid_base;
+
+	/*
+	 * The following members are accessed R/W so we put them in their own
+	 * cache lines.
+	 *
+	 * XXX We could combine the atid fields above with the lock here since
+	 * atids are use once (unlike other tids).  OTOH the above fields are
+	 * usually in cache due to tid_tab.
+	 */
+	spinlock_t atid_lock ____cacheline_aligned_in_smp;
+	union active_open_entry *afree;
+	unsigned int atids_in_use;
+
+	spinlock_t stid_lock ____cacheline_aligned;
+	union listen_entry *sfree;
+	unsigned int stids_in_use;
+};
+
+struct t3c_data {
+	struct list_head list_node;
+	struct t3cdev *dev;
+	unsigned int tx_max_chunk;	/* max payload for TX_DATA */
+	unsigned int max_wrs;	/* max in-flight WRs per connection */
+	unsigned int nmtus;
+	const unsigned short *mtus;
+	struct tid_info tid_maps;
+
+	struct t3c_tid_entry *tid_release_list;
+	spinlock_t tid_release_lock;
+	struct work_struct tid_release_task;
+
+	struct sk_buff *nofail_skb;
+	unsigned int release_list_incomplete;
+};
+
+/*
+ * t3cdev -> t3c_data accessor
+ */
+#define T3C_DATA(dev) (*(struct t3c_data **)&(dev)->l4opt)
+
+#endif
diff --git a/drivers/net/ethernet/chelsio/cxgb3/firmware_exports.h b/drivers/net/ethernet/chelsio/cxgb3/firmware_exports.h
new file mode 100644
index 0000000..0d9b0e6
--- /dev/null
+++ b/drivers/net/ethernet/chelsio/cxgb3/firmware_exports.h
@@ -0,0 +1,177 @@
+/*
+ * Copyright (c) 2004-2008 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef _FIRMWARE_EXPORTS_H_
+#define _FIRMWARE_EXPORTS_H_
+
+/* WR OPCODES supported by the firmware.
+ */
+#define	FW_WROPCODE_FORWARD			0x01
+#define FW_WROPCODE_BYPASS			0x05
+
+#define FW_WROPCODE_TUNNEL_TX_PKT		0x03
+
+#define FW_WROPOCDE_ULPTX_DATA_SGL		0x00
+#define FW_WROPCODE_ULPTX_MEM_READ		0x02
+#define FW_WROPCODE_ULPTX_PKT			0x04
+#define FW_WROPCODE_ULPTX_INVALIDATE		0x06
+
+#define FW_WROPCODE_TUNNEL_RX_PKT		0x07
+
+#define FW_WROPCODE_OFLD_GETTCB_RPL		0x08
+#define FW_WROPCODE_OFLD_CLOSE_CON		0x09
+#define FW_WROPCODE_OFLD_TP_ABORT_CON_REQ	0x0A
+#define FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL	0x0F
+#define FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ	0x0B
+#define FW_WROPCODE_OFLD_TP_ABORT_CON_RPL	0x0C
+#define FW_WROPCODE_OFLD_TX_DATA		0x0D
+#define FW_WROPCODE_OFLD_TX_DATA_ACK		0x0E
+
+#define FW_WROPCODE_RI_RDMA_INIT		0x10
+#define FW_WROPCODE_RI_RDMA_WRITE		0x11
+#define FW_WROPCODE_RI_RDMA_READ_REQ		0x12
+#define FW_WROPCODE_RI_RDMA_READ_RESP		0x13
+#define FW_WROPCODE_RI_SEND			0x14
+#define FW_WROPCODE_RI_TERMINATE		0x15
+#define FW_WROPCODE_RI_RDMA_READ		0x16
+#define FW_WROPCODE_RI_RECEIVE			0x17
+#define FW_WROPCODE_RI_BIND_MW			0x18
+#define FW_WROPCODE_RI_FASTREGISTER_MR		0x19
+#define FW_WROPCODE_RI_LOCAL_INV		0x1A
+#define FW_WROPCODE_RI_MODIFY_QP		0x1B
+#define FW_WROPCODE_RI_BYPASS			0x1C
+
+#define FW_WROPOCDE_RSVD			0x1E
+
+#define FW_WROPCODE_SGE_EGRESSCONTEXT_RR	0x1F
+
+#define FW_WROPCODE_MNGT			0x1D
+#define FW_MNGTOPCODE_PKTSCHED_SET		0x00
+
+/* Maximum size of a WR sent from the host, limited by the SGE.
+ *
+ * Note: WR coming from ULP or TP are only limited by CIM.
+ */
+#define FW_WR_SIZE			128
+
+/* Maximum number of outstanding WRs sent from the host. Value must be
+ * programmed in the CTRL/TUNNEL/QP SGE Egress Context and used by
+ * offload modules to limit the number of WRs per connection.
+ */
+#define FW_T3_WR_NUM			16
+#define FW_N3_WR_NUM			7
+
+#ifndef N3
+# define FW_WR_NUM			FW_T3_WR_NUM
+#else
+# define FW_WR_NUM			FW_N3_WR_NUM
+#endif
+
+/* FW_TUNNEL_NUM corresponds to the number of supported TUNNEL Queues. These
+ * queues must start at SGE Egress Context FW_TUNNEL_SGEEC_START and must
+ * start at 'TID' (or 'uP Token') FW_TUNNEL_TID_START.
+ *
+ * Ingress Traffic (e.g. DMA completion credit)  for TUNNEL Queue[i] is sent
+ * to RESP Queue[i].
+ */
+#define FW_TUNNEL_NUM			8
+#define FW_TUNNEL_SGEEC_START		8
+#define FW_TUNNEL_TID_START		65544
+
+/* FW_CTRL_NUM corresponds to the number of supported CTRL Queues. These queues
+ * must start at SGE Egress Context FW_CTRL_SGEEC_START and must start at 'TID'
+ * (or 'uP Token') FW_CTRL_TID_START.
+ *
+ * Ingress Traffic for CTRL Queue[i] is sent to RESP Queue[i].
+ */
+#define FW_CTRL_NUM			8
+#define FW_CTRL_SGEEC_START		65528
+#define FW_CTRL_TID_START		65536
+
+/* FW_OFLD_NUM corresponds to the number of supported OFFLOAD Queues. These
+ * queues must start at SGE Egress Context FW_OFLD_SGEEC_START.
+ *
+ * Note: the 'uP Token' in the SGE Egress Context fields is irrelevant for
+ * OFFLOAD Queues, as the host is responsible for providing the correct TID in
+ * every WR.
+ *
+ * Ingress Trafffic for OFFLOAD Queue[i] is sent to RESP Queue[i].
+ */
+#define FW_OFLD_NUM			8
+#define FW_OFLD_SGEEC_START		0
+
+/*
+ *
+ */
+#define FW_RI_NUM			1
+#define FW_RI_SGEEC_START		65527
+#define FW_RI_TID_START			65552
+
+/*
+ * The RX_PKT_TID
+ */
+#define FW_RX_PKT_NUM			1
+#define FW_RX_PKT_TID_START		65553
+
+/* FW_WRC_NUM corresponds to the number of Work Request Context that supported
+ * by the firmware.
+ */
+#define FW_WRC_NUM			\
+    (65536 + FW_TUNNEL_NUM + FW_CTRL_NUM + FW_RI_NUM + FW_RX_PKT_NUM)
+
+/*
+ * FW type and version.
+ */
+#define S_FW_VERSION_TYPE		28
+#define M_FW_VERSION_TYPE		0xF
+#define V_FW_VERSION_TYPE(x)		((x) << S_FW_VERSION_TYPE)
+#define G_FW_VERSION_TYPE(x)		\
+    (((x) >> S_FW_VERSION_TYPE) & M_FW_VERSION_TYPE)
+
+#define S_FW_VERSION_MAJOR		16
+#define M_FW_VERSION_MAJOR		0xFFF
+#define V_FW_VERSION_MAJOR(x)		((x) << S_FW_VERSION_MAJOR)
+#define G_FW_VERSION_MAJOR(x)		\
+    (((x) >> S_FW_VERSION_MAJOR) & M_FW_VERSION_MAJOR)
+
+#define S_FW_VERSION_MINOR		8
+#define M_FW_VERSION_MINOR		0xFF
+#define V_FW_VERSION_MINOR(x)		((x) << S_FW_VERSION_MINOR)
+#define G_FW_VERSION_MINOR(x)		\
+    (((x) >> S_FW_VERSION_MINOR) & M_FW_VERSION_MINOR)
+
+#define S_FW_VERSION_MICRO		0
+#define M_FW_VERSION_MICRO		0xFF
+#define V_FW_VERSION_MICRO(x)		((x) << S_FW_VERSION_MICRO)
+#define G_FW_VERSION_MICRO(x)		\
+    (((x) >> S_FW_VERSION_MICRO) & M_FW_VERSION_MICRO)
+
+#endif				/* _FIRMWARE_EXPORTS_H_ */
diff --git a/drivers/net/ethernet/chelsio/cxgb3/l2t.c b/drivers/net/ethernet/chelsio/cxgb3/l2t.c
new file mode 100644
index 0000000..5f226ed
--- /dev/null
+++ b/drivers/net/ethernet/chelsio/cxgb3/l2t.c
@@ -0,0 +1,470 @@
+/*
+ * Copyright (c) 2003-2008 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/if.h>
+#include <linux/if_vlan.h>
+#include <linux/jhash.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <net/neighbour.h>
+#include "common.h"
+#include "t3cdev.h"
+#include "cxgb3_defs.h"
+#include "l2t.h"
+#include "t3_cpl.h"
+#include "firmware_exports.h"
+
+#define VLAN_NONE 0xfff
+
+/*
+ * Module locking notes:  There is a RW lock protecting the L2 table as a
+ * whole plus a spinlock per L2T entry.  Entry lookups and allocations happen
+ * under the protection of the table lock, individual entry changes happen
+ * while holding that entry's spinlock.  The table lock nests outside the
+ * entry locks.  Allocations of new entries take the table lock as writers so
+ * no other lookups can happen while allocating new entries.  Entry updates
+ * take the table lock as readers so multiple entries can be updated in
+ * parallel.  An L2T entry can be dropped by decrementing its reference count
+ * and therefore can happen in parallel with entry allocation but no entry
+ * can change state or increment its ref count during allocation as both of
+ * these perform lookups.
+ */
+
+static inline unsigned int vlan_prio(const struct l2t_entry *e)
+{
+	return e->vlan >> 13;
+}
+
+static inline unsigned int arp_hash(u32 key, int ifindex,
+				    const struct l2t_data *d)
+{
+	return jhash_2words(key, ifindex, 0) & (d->nentries - 1);
+}
+
+static inline void neigh_replace(struct l2t_entry *e, struct neighbour *n)
+{
+	neigh_hold(n);
+	if (e->neigh)
+		neigh_release(e->neigh);
+	e->neigh = n;
+}
+
+/*
+ * Set up an L2T entry and send any packets waiting in the arp queue.  The
+ * supplied skb is used for the CPL_L2T_WRITE_REQ.  Must be called with the
+ * entry locked.
+ */
+static int setup_l2e_send_pending(struct t3cdev *dev, struct sk_buff *skb,
+				  struct l2t_entry *e)
+{
+	struct cpl_l2t_write_req *req;
+	struct sk_buff *tmp;
+
+	if (!skb) {
+		skb = alloc_skb(sizeof(*req), GFP_ATOMIC);
+		if (!skb)
+			return -ENOMEM;
+	}
+
+	req = (struct cpl_l2t_write_req *)__skb_put(skb, sizeof(*req));
+	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_L2T_WRITE_REQ, e->idx));
+	req->params = htonl(V_L2T_W_IDX(e->idx) | V_L2T_W_IFF(e->smt_idx) |
+			    V_L2T_W_VLAN(e->vlan & VLAN_VID_MASK) |
+			    V_L2T_W_PRIO(vlan_prio(e)));
+	memcpy(e->dmac, e->neigh->ha, sizeof(e->dmac));
+	memcpy(req->dst_mac, e->dmac, sizeof(req->dst_mac));
+	skb->priority = CPL_PRIORITY_CONTROL;
+	cxgb3_ofld_send(dev, skb);
+
+	skb_queue_walk_safe(&e->arpq, skb, tmp) {
+		__skb_unlink(skb, &e->arpq);
+		cxgb3_ofld_send(dev, skb);
+	}
+	e->state = L2T_STATE_VALID;
+
+	return 0;
+}
+
+/*
+ * Add a packet to the an L2T entry's queue of packets awaiting resolution.
+ * Must be called with the entry's lock held.
+ */
+static inline void arpq_enqueue(struct l2t_entry *e, struct sk_buff *skb)
+{
+	__skb_queue_tail(&e->arpq, skb);
+}
+
+int t3_l2t_send_slow(struct t3cdev *dev, struct sk_buff *skb,
+		     struct l2t_entry *e)
+{
+again:
+	switch (e->state) {
+	case L2T_STATE_STALE:	/* entry is stale, kick off revalidation */
+		neigh_event_send(e->neigh, NULL);
+		spin_lock_bh(&e->lock);
+		if (e->state == L2T_STATE_STALE)
+			e->state = L2T_STATE_VALID;
+		spin_unlock_bh(&e->lock);
+	case L2T_STATE_VALID:	/* fast-path, send the packet on */
+		return cxgb3_ofld_send(dev, skb);
+	case L2T_STATE_RESOLVING:
+		spin_lock_bh(&e->lock);
+		if (e->state != L2T_STATE_RESOLVING) {
+			/* ARP already completed */
+			spin_unlock_bh(&e->lock);
+			goto again;
+		}
+		arpq_enqueue(e, skb);
+		spin_unlock_bh(&e->lock);
+
+		/*
+		 * Only the first packet added to the arpq should kick off
+		 * resolution.  However, because the alloc_skb below can fail,
+		 * we allow each packet added to the arpq to retry resolution
+		 * as a way of recovering from transient memory exhaustion.
+		 * A better way would be to use a work request to retry L2T
+		 * entries when there's no memory.
+		 */
+		if (!neigh_event_send(e->neigh, NULL)) {
+			skb = alloc_skb(sizeof(struct cpl_l2t_write_req),
+					GFP_ATOMIC);
+			if (!skb)
+				break;
+
+			spin_lock_bh(&e->lock);
+			if (!skb_queue_empty(&e->arpq))
+				setup_l2e_send_pending(dev, skb, e);
+			else	/* we lost the race */
+				__kfree_skb(skb);
+			spin_unlock_bh(&e->lock);
+		}
+	}
+	return 0;
+}
+
+EXPORT_SYMBOL(t3_l2t_send_slow);
+
+void t3_l2t_send_event(struct t3cdev *dev, struct l2t_entry *e)
+{
+again:
+	switch (e->state) {
+	case L2T_STATE_STALE:	/* entry is stale, kick off revalidation */
+		neigh_event_send(e->neigh, NULL);
+		spin_lock_bh(&e->lock);
+		if (e->state == L2T_STATE_STALE) {
+			e->state = L2T_STATE_VALID;
+		}
+		spin_unlock_bh(&e->lock);
+		return;
+	case L2T_STATE_VALID:	/* fast-path, send the packet on */
+		return;
+	case L2T_STATE_RESOLVING:
+		spin_lock_bh(&e->lock);
+		if (e->state != L2T_STATE_RESOLVING) {
+			/* ARP already completed */
+			spin_unlock_bh(&e->lock);
+			goto again;
+		}
+		spin_unlock_bh(&e->lock);
+
+		/*
+		 * Only the first packet added to the arpq should kick off
+		 * resolution.  However, because the alloc_skb below can fail,
+		 * we allow each packet added to the arpq to retry resolution
+		 * as a way of recovering from transient memory exhaustion.
+		 * A better way would be to use a work request to retry L2T
+		 * entries when there's no memory.
+		 */
+		neigh_event_send(e->neigh, NULL);
+	}
+}
+
+EXPORT_SYMBOL(t3_l2t_send_event);
+
+/*
+ * Allocate a free L2T entry.  Must be called with l2t_data.lock held.
+ */
+static struct l2t_entry *alloc_l2e(struct l2t_data *d)
+{
+	struct l2t_entry *end, *e, **p;
+
+	if (!atomic_read(&d->nfree))
+		return NULL;
+
+	/* there's definitely a free entry */
+	for (e = d->rover, end = &d->l2tab[d->nentries]; e != end; ++e)
+		if (atomic_read(&e->refcnt) == 0)
+			goto found;
+
+	for (e = &d->l2tab[1]; atomic_read(&e->refcnt); ++e) ;
+found:
+	d->rover = e + 1;
+	atomic_dec(&d->nfree);
+
+	/*
+	 * The entry we found may be an inactive entry that is
+	 * presently in the hash table.  We need to remove it.
+	 */
+	if (e->state != L2T_STATE_UNUSED) {
+		int hash = arp_hash(e->addr, e->ifindex, d);
+
+		for (p = &d->l2tab[hash].first; *p; p = &(*p)->next)
+			if (*p == e) {
+				*p = e->next;
+				break;
+			}
+		e->state = L2T_STATE_UNUSED;
+	}
+	return e;
+}
+
+/*
+ * Called when an L2T entry has no more users.  The entry is left in the hash
+ * table since it is likely to be reused but we also bump nfree to indicate
+ * that the entry can be reallocated for a different neighbor.  We also drop
+ * the existing neighbor reference in case the neighbor is going away and is
+ * waiting on our reference.
+ *
+ * Because entries can be reallocated to other neighbors once their ref count
+ * drops to 0 we need to take the entry's lock to avoid races with a new
+ * incarnation.
+ */
+void t3_l2e_free(struct l2t_data *d, struct l2t_entry *e)
+{
+	spin_lock_bh(&e->lock);
+	if (atomic_read(&e->refcnt) == 0) {	/* hasn't been recycled */
+		if (e->neigh) {
+			neigh_release(e->neigh);
+			e->neigh = NULL;
+		}
+	}
+	spin_unlock_bh(&e->lock);
+	atomic_inc(&d->nfree);
+}
+
+EXPORT_SYMBOL(t3_l2e_free);
+
+/*
+ * Update an L2T entry that was previously used for the same next hop as neigh.
+ * Must be called with softirqs disabled.
+ */
+static inline void reuse_entry(struct l2t_entry *e, struct neighbour *neigh)
+{
+	unsigned int nud_state;
+
+	spin_lock(&e->lock);	/* avoid race with t3_l2t_free */
+
+	if (neigh != e->neigh)
+		neigh_replace(e, neigh);
+	nud_state = neigh->nud_state;
+	if (memcmp(e->dmac, neigh->ha, sizeof(e->dmac)) ||
+	    !(nud_state & NUD_VALID))
+		e->state = L2T_STATE_RESOLVING;
+	else if (nud_state & NUD_CONNECTED)
+		e->state = L2T_STATE_VALID;
+	else
+		e->state = L2T_STATE_STALE;
+	spin_unlock(&e->lock);
+}
+
+struct l2t_entry *t3_l2t_get(struct t3cdev *cdev, struct dst_entry *dst,
+			     struct net_device *dev, const void *daddr)
+{
+	struct l2t_entry *e = NULL;
+	struct neighbour *neigh;
+	struct port_info *p;
+	struct l2t_data *d;
+	int hash;
+	u32 addr;
+	int ifidx;
+	int smt_idx;
+
+	rcu_read_lock();
+	neigh = dst_neigh_lookup(dst, daddr);
+	if (!neigh)
+		goto done_rcu;
+
+	addr = *(u32 *) neigh->primary_key;
+	ifidx = neigh->dev->ifindex;
+
+	if (!dev)
+		dev = neigh->dev;
+	p = netdev_priv(dev);
+	smt_idx = p->port_id;
+
+	d = L2DATA(cdev);
+	if (!d)
+		goto done_rcu;
+
+	hash = arp_hash(addr, ifidx, d);
+
+	write_lock_bh(&d->lock);
+	for (e = d->l2tab[hash].first; e; e = e->next)
+		if (e->addr == addr && e->ifindex == ifidx &&
+		    e->smt_idx == smt_idx) {
+			l2t_hold(d, e);
+			if (atomic_read(&e->refcnt) == 1)
+				reuse_entry(e, neigh);
+			goto done_unlock;
+		}
+
+	/* Need to allocate a new entry */
+	e = alloc_l2e(d);
+	if (e) {
+		spin_lock(&e->lock);	/* avoid race with t3_l2t_free */
+		e->next = d->l2tab[hash].first;
+		d->l2tab[hash].first = e;
+		e->state = L2T_STATE_RESOLVING;
+		e->addr = addr;
+		e->ifindex = ifidx;
+		e->smt_idx = smt_idx;
+		atomic_set(&e->refcnt, 1);
+		neigh_replace(e, neigh);
+		if (neigh->dev->priv_flags & IFF_802_1Q_VLAN)
+			e->vlan = vlan_dev_vlan_id(neigh->dev);
+		else
+			e->vlan = VLAN_NONE;
+		spin_unlock(&e->lock);
+	}
+done_unlock:
+	write_unlock_bh(&d->lock);
+done_rcu:
+	if (neigh)
+		neigh_release(neigh);
+	rcu_read_unlock();
+	return e;
+}
+
+EXPORT_SYMBOL(t3_l2t_get);
+
+/*
+ * Called when address resolution fails for an L2T entry to handle packets
+ * on the arpq head.  If a packet specifies a failure handler it is invoked,
+ * otherwise the packets is sent to the offload device.
+ *
+ * XXX: maybe we should abandon the latter behavior and just require a failure
+ * handler.
+ */
+static void handle_failed_resolution(struct t3cdev *dev, struct sk_buff_head *arpq)
+{
+	struct sk_buff *skb, *tmp;
+
+	skb_queue_walk_safe(arpq, skb, tmp) {
+		struct l2t_skb_cb *cb = L2T_SKB_CB(skb);
+
+		__skb_unlink(skb, arpq);
+		if (cb->arp_failure_handler)
+			cb->arp_failure_handler(dev, skb);
+		else
+			cxgb3_ofld_send(dev, skb);
+	}
+}
+
+/*
+ * Called when the host's ARP layer makes a change to some entry that is
+ * loaded into the HW L2 table.
+ */
+void t3_l2t_update(struct t3cdev *dev, struct neighbour *neigh)
+{
+	struct sk_buff_head arpq;
+	struct l2t_entry *e;
+	struct l2t_data *d = L2DATA(dev);
+	u32 addr = *(u32 *) neigh->primary_key;
+	int ifidx = neigh->dev->ifindex;
+	int hash = arp_hash(addr, ifidx, d);
+
+	read_lock_bh(&d->lock);
+	for (e = d->l2tab[hash].first; e; e = e->next)
+		if (e->addr == addr && e->ifindex == ifidx) {
+			spin_lock(&e->lock);
+			goto found;
+		}
+	read_unlock_bh(&d->lock);
+	return;
+
+found:
+	__skb_queue_head_init(&arpq);
+
+	read_unlock(&d->lock);
+	if (atomic_read(&e->refcnt)) {
+		if (neigh != e->neigh)
+			neigh_replace(e, neigh);
+
+		if (e->state == L2T_STATE_RESOLVING) {
+			if (neigh->nud_state & NUD_FAILED) {
+				skb_queue_splice_init(&e->arpq, &arpq);
+			} else if (neigh->nud_state & (NUD_CONNECTED|NUD_STALE))
+				setup_l2e_send_pending(dev, NULL, e);
+		} else {
+			e->state = neigh->nud_state & NUD_CONNECTED ?
+			    L2T_STATE_VALID : L2T_STATE_STALE;
+			if (!ether_addr_equal(e->dmac, neigh->ha))
+				setup_l2e_send_pending(dev, NULL, e);
+		}
+	}
+	spin_unlock_bh(&e->lock);
+
+	if (!skb_queue_empty(&arpq))
+		handle_failed_resolution(dev, &arpq);
+}
+
+struct l2t_data *t3_init_l2t(unsigned int l2t_capacity)
+{
+	struct l2t_data *d;
+	int i, size = sizeof(*d) + l2t_capacity * sizeof(struct l2t_entry);
+
+	d = cxgb_alloc_mem(size);
+	if (!d)
+		return NULL;
+
+	d->nentries = l2t_capacity;
+	d->rover = &d->l2tab[1];	/* entry 0 is not used */
+	atomic_set(&d->nfree, l2t_capacity - 1);
+	rwlock_init(&d->lock);
+
+	for (i = 0; i < l2t_capacity; ++i) {
+		d->l2tab[i].idx = i;
+		d->l2tab[i].state = L2T_STATE_UNUSED;
+		__skb_queue_head_init(&d->l2tab[i].arpq);
+		spin_lock_init(&d->l2tab[i].lock);
+		atomic_set(&d->l2tab[i].refcnt, 0);
+	}
+	return d;
+}
+
+void t3_free_l2t(struct l2t_data *d)
+{
+	cxgb_free_mem(d);
+}
+
diff --git a/drivers/net/ethernet/chelsio/cxgb3/l2t.h b/drivers/net/ethernet/chelsio/cxgb3/l2t.h
new file mode 100644
index 0000000..8cffcdf
--- /dev/null
+++ b/drivers/net/ethernet/chelsio/cxgb3/l2t.h
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2003-2008 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef _CHELSIO_L2T_H
+#define _CHELSIO_L2T_H
+
+#include <linux/spinlock.h>
+#include "t3cdev.h"
+#include <linux/atomic.h>
+
+enum {
+	L2T_STATE_VALID,	/* entry is up to date */
+	L2T_STATE_STALE,	/* entry may be used but needs revalidation */
+	L2T_STATE_RESOLVING,	/* entry needs address resolution */
+	L2T_STATE_UNUSED	/* entry not in use */
+};
+
+struct neighbour;
+struct sk_buff;
+
+/*
+ * Each L2T entry plays multiple roles.  First of all, it keeps state for the
+ * corresponding entry of the HW L2 table and maintains a queue of offload
+ * packets awaiting address resolution.  Second, it is a node of a hash table
+ * chain, where the nodes of the chain are linked together through their next
+ * pointer.  Finally, each node is a bucket of a hash table, pointing to the
+ * first element in its chain through its first pointer.
+ */
+struct l2t_entry {
+	u16 state;		/* entry state */
+	u16 idx;		/* entry index */
+	u32 addr;		/* dest IP address */
+	int ifindex;		/* neighbor's net_device's ifindex */
+	u16 smt_idx;		/* SMT index */
+	u16 vlan;		/* VLAN TCI (id: bits 0-11, prio: 13-15 */
+	struct neighbour *neigh;	/* associated neighbour */
+	struct l2t_entry *first;	/* start of hash chain */
+	struct l2t_entry *next;	/* next l2t_entry on chain */
+	struct sk_buff_head arpq;	/* queue of packets awaiting resolution */
+	spinlock_t lock;
+	atomic_t refcnt;	/* entry reference count */
+	u8 dmac[6];		/* neighbour's MAC address */
+};
+
+struct l2t_data {
+	unsigned int nentries;	/* number of entries */
+	struct l2t_entry *rover;	/* starting point for next allocation */
+	atomic_t nfree;		/* number of free entries */
+	rwlock_t lock;
+	struct l2t_entry l2tab[0];
+	struct rcu_head rcu_head;	/* to handle rcu cleanup */
+};
+
+typedef void (*arp_failure_handler_func)(struct t3cdev * dev,
+					 struct sk_buff * skb);
+
+/*
+ * Callback stored in an skb to handle address resolution failure.
+ */
+struct l2t_skb_cb {
+	arp_failure_handler_func arp_failure_handler;
+};
+
+#define L2T_SKB_CB(skb) ((struct l2t_skb_cb *)(skb)->cb)
+
+static inline void set_arp_failure_handler(struct sk_buff *skb,
+					   arp_failure_handler_func hnd)
+{
+	L2T_SKB_CB(skb)->arp_failure_handler = hnd;
+}
+
+/*
+ * Getting to the L2 data from an offload device.
+ */
+#define L2DATA(cdev) (rcu_dereference((cdev)->l2opt))
+
+#define W_TCB_L2T_IX    0
+#define S_TCB_L2T_IX    7
+#define M_TCB_L2T_IX    0x7ffULL
+#define V_TCB_L2T_IX(x) ((x) << S_TCB_L2T_IX)
+
+void t3_l2e_free(struct l2t_data *d, struct l2t_entry *e);
+void t3_l2t_update(struct t3cdev *dev, struct neighbour *neigh);
+struct l2t_entry *t3_l2t_get(struct t3cdev *cdev, struct dst_entry *dst,
+			     struct net_device *dev, const void *daddr);
+int t3_l2t_send_slow(struct t3cdev *dev, struct sk_buff *skb,
+		     struct l2t_entry *e);
+void t3_l2t_send_event(struct t3cdev *dev, struct l2t_entry *e);
+struct l2t_data *t3_init_l2t(unsigned int l2t_capacity);
+void t3_free_l2t(struct l2t_data *d);
+
+int cxgb3_ofld_send(struct t3cdev *dev, struct sk_buff *skb);
+
+static inline int l2t_send(struct t3cdev *dev, struct sk_buff *skb,
+			   struct l2t_entry *e)
+{
+	if (likely(e->state == L2T_STATE_VALID))
+		return cxgb3_ofld_send(dev, skb);
+	return t3_l2t_send_slow(dev, skb, e);
+}
+
+static inline void l2t_release(struct t3cdev *t, struct l2t_entry *e)
+{
+	struct l2t_data *d;
+
+	rcu_read_lock();
+	d = L2DATA(t);
+
+	if (atomic_dec_and_test(&e->refcnt) && d)
+		t3_l2e_free(d, e);
+
+	rcu_read_unlock();
+}
+
+static inline void l2t_hold(struct l2t_data *d, struct l2t_entry *e)
+{
+	if (d && atomic_add_return(1, &e->refcnt) == 1)	/* 0 -> 1 transition */
+		atomic_dec(&d->nfree);
+}
+
+#endif
diff --git a/drivers/net/ethernet/chelsio/cxgb3/mc5.c b/drivers/net/ethernet/chelsio/cxgb3/mc5.c
new file mode 100644
index 0000000..e13b7fe
--- /dev/null
+++ b/drivers/net/ethernet/chelsio/cxgb3/mc5.c
@@ -0,0 +1,438 @@
+/*
+ * Copyright (c) 2003-2008 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "common.h"
+#include "regs.h"
+
+enum {
+	IDT75P52100 = 4,
+	IDT75N43102 = 5
+};
+
+/* DBGI command mode */
+enum {
+	DBGI_MODE_MBUS = 0,
+	DBGI_MODE_IDT52100 = 5
+};
+
+/* IDT 75P52100 commands */
+#define IDT_CMD_READ   0
+#define IDT_CMD_WRITE  1
+#define IDT_CMD_SEARCH 2
+#define IDT_CMD_LEARN  3
+
+/* IDT LAR register address and value for 144-bit mode (low 32 bits) */
+#define IDT_LAR_ADR0   	0x180006
+#define IDT_LAR_MODE144	0xffff0000
+
+/* IDT SCR and SSR addresses (low 32 bits) */
+#define IDT_SCR_ADR0  0x180000
+#define IDT_SSR0_ADR0 0x180002
+#define IDT_SSR1_ADR0 0x180004
+
+/* IDT GMR base address (low 32 bits) */
+#define IDT_GMR_BASE_ADR0 0x180020
+
+/* IDT data and mask array base addresses (low 32 bits) */
+#define IDT_DATARY_BASE_ADR0 0
+#define IDT_MSKARY_BASE_ADR0 0x80000
+
+/* IDT 75N43102 commands */
+#define IDT4_CMD_SEARCH144 3
+#define IDT4_CMD_WRITE     4
+#define IDT4_CMD_READ      5
+
+/* IDT 75N43102 SCR address (low 32 bits) */
+#define IDT4_SCR_ADR0  0x3
+
+/* IDT 75N43102 GMR base addresses (low 32 bits) */
+#define IDT4_GMR_BASE0 0x10
+#define IDT4_GMR_BASE1 0x20
+#define IDT4_GMR_BASE2 0x30
+
+/* IDT 75N43102 data and mask array base addresses (low 32 bits) */
+#define IDT4_DATARY_BASE_ADR0 0x1000000
+#define IDT4_MSKARY_BASE_ADR0 0x2000000
+
+#define MAX_WRITE_ATTEMPTS 5
+
+#define MAX_ROUTES 2048
+
+/*
+ * Issue a command to the TCAM and wait for its completion.  The address and
+ * any data required by the command must have been setup by the caller.
+ */
+static int mc5_cmd_write(struct adapter *adapter, u32 cmd)
+{
+	t3_write_reg(adapter, A_MC5_DB_DBGI_REQ_CMD, cmd);
+	return t3_wait_op_done(adapter, A_MC5_DB_DBGI_RSP_STATUS,
+			       F_DBGIRSPVALID, 1, MAX_WRITE_ATTEMPTS, 1);
+}
+
+static inline void dbgi_wr_addr3(struct adapter *adapter, u32 v1, u32 v2,
+				 u32 v3)
+{
+	t3_write_reg(adapter, A_MC5_DB_DBGI_REQ_ADDR0, v1);
+	t3_write_reg(adapter, A_MC5_DB_DBGI_REQ_ADDR1, v2);
+	t3_write_reg(adapter, A_MC5_DB_DBGI_REQ_ADDR2, v3);
+}
+
+static inline void dbgi_wr_data3(struct adapter *adapter, u32 v1, u32 v2,
+				 u32 v3)
+{
+	t3_write_reg(adapter, A_MC5_DB_DBGI_REQ_DATA0, v1);
+	t3_write_reg(adapter, A_MC5_DB_DBGI_REQ_DATA1, v2);
+	t3_write_reg(adapter, A_MC5_DB_DBGI_REQ_DATA2, v3);
+}
+
+static inline void dbgi_rd_rsp3(struct adapter *adapter, u32 *v1, u32 *v2,
+				u32 *v3)
+{
+	*v1 = t3_read_reg(adapter, A_MC5_DB_DBGI_RSP_DATA0);
+	*v2 = t3_read_reg(adapter, A_MC5_DB_DBGI_RSP_DATA1);
+	*v3 = t3_read_reg(adapter, A_MC5_DB_DBGI_RSP_DATA2);
+}
+
+/*
+ * Write data to the TCAM register at address (0, 0, addr_lo) using the TCAM
+ * command cmd.  The data to be written must have been set up by the caller.
+ * Returns -1 on failure, 0 on success.
+ */
+static int mc5_write(struct adapter *adapter, u32 addr_lo, u32 cmd)
+{
+	t3_write_reg(adapter, A_MC5_DB_DBGI_REQ_ADDR0, addr_lo);
+	if (mc5_cmd_write(adapter, cmd) == 0)
+		return 0;
+	CH_ERR(adapter, "MC5 timeout writing to TCAM address 0x%x\n",
+	       addr_lo);
+	return -1;
+}
+
+static int init_mask_data_array(struct mc5 *mc5, u32 mask_array_base,
+				u32 data_array_base, u32 write_cmd,
+				int addr_shift)
+{
+	unsigned int i;
+	struct adapter *adap = mc5->adapter;
+
+	/*
+	 * We need the size of the TCAM data and mask arrays in terms of
+	 * 72-bit entries.
+	 */
+	unsigned int size72 = mc5->tcam_size;
+	unsigned int server_base = t3_read_reg(adap, A_MC5_DB_SERVER_INDEX);
+
+	if (mc5->mode == MC5_MODE_144_BIT) {
+		size72 *= 2;	/* 1 144-bit entry is 2 72-bit entries */
+		server_base *= 2;
+	}
+
+	/* Clear the data array */
+	dbgi_wr_data3(adap, 0, 0, 0);
+	for (i = 0; i < size72; i++)
+		if (mc5_write(adap, data_array_base + (i << addr_shift),
+			      write_cmd))
+			return -1;
+
+	/* Initialize the mask array. */
+	dbgi_wr_data3(adap, 0xffffffff, 0xffffffff, 0xff);
+	for (i = 0; i < size72; i++) {
+		if (i == server_base)	/* entering server or routing region */
+			t3_write_reg(adap, A_MC5_DB_DBGI_REQ_DATA0,
+				     mc5->mode == MC5_MODE_144_BIT ?
+				     0xfffffff9 : 0xfffffffd);
+		if (mc5_write(adap, mask_array_base + (i << addr_shift),
+			      write_cmd))
+			return -1;
+	}
+	return 0;
+}
+
+static int init_idt52100(struct mc5 *mc5)
+{
+	int i;
+	struct adapter *adap = mc5->adapter;
+
+	t3_write_reg(adap, A_MC5_DB_RSP_LATENCY,
+		     V_RDLAT(0x15) | V_LRNLAT(0x15) | V_SRCHLAT(0x15));
+	t3_write_reg(adap, A_MC5_DB_PART_ID_INDEX, 2);
+
+	/*
+	 * Use GMRs 14-15 for ELOOKUP, GMRs 12-13 for SYN lookups, and
+	 * GMRs 8-9 for ACK- and AOPEN searches.
+	 */
+	t3_write_reg(adap, A_MC5_DB_POPEN_DATA_WR_CMD, IDT_CMD_WRITE);
+	t3_write_reg(adap, A_MC5_DB_POPEN_MASK_WR_CMD, IDT_CMD_WRITE);
+	t3_write_reg(adap, A_MC5_DB_AOPEN_SRCH_CMD, IDT_CMD_SEARCH);
+	t3_write_reg(adap, A_MC5_DB_AOPEN_LRN_CMD, IDT_CMD_LEARN);
+	t3_write_reg(adap, A_MC5_DB_SYN_SRCH_CMD, IDT_CMD_SEARCH | 0x6000);
+	t3_write_reg(adap, A_MC5_DB_SYN_LRN_CMD, IDT_CMD_LEARN);
+	t3_write_reg(adap, A_MC5_DB_ACK_SRCH_CMD, IDT_CMD_SEARCH);
+	t3_write_reg(adap, A_MC5_DB_ACK_LRN_CMD, IDT_CMD_LEARN);
+	t3_write_reg(adap, A_MC5_DB_ILOOKUP_CMD, IDT_CMD_SEARCH);
+	t3_write_reg(adap, A_MC5_DB_ELOOKUP_CMD, IDT_CMD_SEARCH | 0x7000);
+	t3_write_reg(adap, A_MC5_DB_DATA_WRITE_CMD, IDT_CMD_WRITE);
+	t3_write_reg(adap, A_MC5_DB_DATA_READ_CMD, IDT_CMD_READ);
+
+	/* Set DBGI command mode for IDT TCAM. */
+	t3_write_reg(adap, A_MC5_DB_DBGI_CONFIG, DBGI_MODE_IDT52100);
+
+	/* Set up LAR */
+	dbgi_wr_data3(adap, IDT_LAR_MODE144, 0, 0);
+	if (mc5_write(adap, IDT_LAR_ADR0, IDT_CMD_WRITE))
+		goto err;
+
+	/* Set up SSRs */
+	dbgi_wr_data3(adap, 0xffffffff, 0xffffffff, 0);
+	if (mc5_write(adap, IDT_SSR0_ADR0, IDT_CMD_WRITE) ||
+	    mc5_write(adap, IDT_SSR1_ADR0, IDT_CMD_WRITE))
+		goto err;
+
+	/* Set up GMRs */
+	for (i = 0; i < 32; ++i) {
+		if (i >= 12 && i < 15)
+			dbgi_wr_data3(adap, 0xfffffff9, 0xffffffff, 0xff);
+		else if (i == 15)
+			dbgi_wr_data3(adap, 0xfffffff9, 0xffff8007, 0xff);
+		else
+			dbgi_wr_data3(adap, 0xffffffff, 0xffffffff, 0xff);
+
+		if (mc5_write(adap, IDT_GMR_BASE_ADR0 + i, IDT_CMD_WRITE))
+			goto err;
+	}
+
+	/* Set up SCR */
+	dbgi_wr_data3(adap, 1, 0, 0);
+	if (mc5_write(adap, IDT_SCR_ADR0, IDT_CMD_WRITE))
+		goto err;
+
+	return init_mask_data_array(mc5, IDT_MSKARY_BASE_ADR0,
+				    IDT_DATARY_BASE_ADR0, IDT_CMD_WRITE, 0);
+err:
+	return -EIO;
+}
+
+static int init_idt43102(struct mc5 *mc5)
+{
+	int i;
+	struct adapter *adap = mc5->adapter;
+
+	t3_write_reg(adap, A_MC5_DB_RSP_LATENCY,
+		     adap->params.rev == 0 ? V_RDLAT(0xd) | V_SRCHLAT(0x11) :
+		     V_RDLAT(0xd) | V_SRCHLAT(0x12));
+
+	/*
+	 * Use GMRs 24-25 for ELOOKUP, GMRs 20-21 for SYN lookups, and no mask
+	 * for ACK- and AOPEN searches.
+	 */
+	t3_write_reg(adap, A_MC5_DB_POPEN_DATA_WR_CMD, IDT4_CMD_WRITE);
+	t3_write_reg(adap, A_MC5_DB_POPEN_MASK_WR_CMD, IDT4_CMD_WRITE);
+	t3_write_reg(adap, A_MC5_DB_AOPEN_SRCH_CMD,
+		     IDT4_CMD_SEARCH144 | 0x3800);
+	t3_write_reg(adap, A_MC5_DB_SYN_SRCH_CMD, IDT4_CMD_SEARCH144);
+	t3_write_reg(adap, A_MC5_DB_ACK_SRCH_CMD, IDT4_CMD_SEARCH144 | 0x3800);
+	t3_write_reg(adap, A_MC5_DB_ILOOKUP_CMD, IDT4_CMD_SEARCH144 | 0x3800);
+	t3_write_reg(adap, A_MC5_DB_ELOOKUP_CMD, IDT4_CMD_SEARCH144 | 0x800);
+	t3_write_reg(adap, A_MC5_DB_DATA_WRITE_CMD, IDT4_CMD_WRITE);
+	t3_write_reg(adap, A_MC5_DB_DATA_READ_CMD, IDT4_CMD_READ);
+
+	t3_write_reg(adap, A_MC5_DB_PART_ID_INDEX, 3);
+
+	/* Set DBGI command mode for IDT TCAM. */
+	t3_write_reg(adap, A_MC5_DB_DBGI_CONFIG, DBGI_MODE_IDT52100);
+
+	/* Set up GMRs */
+	dbgi_wr_data3(adap, 0xffffffff, 0xffffffff, 0xff);
+	for (i = 0; i < 7; ++i)
+		if (mc5_write(adap, IDT4_GMR_BASE0 + i, IDT4_CMD_WRITE))
+			goto err;
+
+	for (i = 0; i < 4; ++i)
+		if (mc5_write(adap, IDT4_GMR_BASE2 + i, IDT4_CMD_WRITE))
+			goto err;
+
+	dbgi_wr_data3(adap, 0xfffffff9, 0xffffffff, 0xff);
+	if (mc5_write(adap, IDT4_GMR_BASE1, IDT4_CMD_WRITE) ||
+	    mc5_write(adap, IDT4_GMR_BASE1 + 1, IDT4_CMD_WRITE) ||
+	    mc5_write(adap, IDT4_GMR_BASE1 + 4, IDT4_CMD_WRITE))
+		goto err;
+
+	dbgi_wr_data3(adap, 0xfffffff9, 0xffff8007, 0xff);
+	if (mc5_write(adap, IDT4_GMR_BASE1 + 5, IDT4_CMD_WRITE))
+		goto err;
+
+	/* Set up SCR */
+	dbgi_wr_data3(adap, 0xf0000000, 0, 0);
+	if (mc5_write(adap, IDT4_SCR_ADR0, IDT4_CMD_WRITE))
+		goto err;
+
+	return init_mask_data_array(mc5, IDT4_MSKARY_BASE_ADR0,
+				    IDT4_DATARY_BASE_ADR0, IDT4_CMD_WRITE, 1);
+err:
+	return -EIO;
+}
+
+/* Put MC5 in DBGI mode. */
+static inline void mc5_dbgi_mode_enable(const struct mc5 *mc5)
+{
+	t3_write_reg(mc5->adapter, A_MC5_DB_CONFIG,
+		     V_TMMODE(mc5->mode == MC5_MODE_72_BIT) | F_DBGIEN);
+}
+
+/* Put MC5 in M-Bus mode. */
+static void mc5_dbgi_mode_disable(const struct mc5 *mc5)
+{
+	t3_write_reg(mc5->adapter, A_MC5_DB_CONFIG,
+		     V_TMMODE(mc5->mode == MC5_MODE_72_BIT) |
+		     V_COMPEN(mc5->mode == MC5_MODE_72_BIT) |
+		     V_PRTYEN(mc5->parity_enabled) | F_MBUSEN);
+}
+
+/*
+ * Initialization that requires the OS and protocol layers to already
+ * be initialized goes here.
+ */
+int t3_mc5_init(struct mc5 *mc5, unsigned int nservers, unsigned int nfilters,
+		unsigned int nroutes)
+{
+	u32 cfg;
+	int err;
+	unsigned int tcam_size = mc5->tcam_size;
+	struct adapter *adap = mc5->adapter;
+
+	if (!tcam_size)
+		return 0;
+
+	if (nroutes > MAX_ROUTES || nroutes + nservers + nfilters > tcam_size)
+		return -EINVAL;
+
+	/* Reset the TCAM */
+	cfg = t3_read_reg(adap, A_MC5_DB_CONFIG) & ~F_TMMODE;
+	cfg |= V_TMMODE(mc5->mode == MC5_MODE_72_BIT) | F_TMRST;
+	t3_write_reg(adap, A_MC5_DB_CONFIG, cfg);
+	if (t3_wait_op_done(adap, A_MC5_DB_CONFIG, F_TMRDY, 1, 500, 0)) {
+		CH_ERR(adap, "TCAM reset timed out\n");
+		return -1;
+	}
+
+	t3_write_reg(adap, A_MC5_DB_ROUTING_TABLE_INDEX, tcam_size - nroutes);
+	t3_write_reg(adap, A_MC5_DB_FILTER_TABLE,
+		     tcam_size - nroutes - nfilters);
+	t3_write_reg(adap, A_MC5_DB_SERVER_INDEX,
+		     tcam_size - nroutes - nfilters - nservers);
+
+	mc5->parity_enabled = 1;
+
+	/* All the TCAM addresses we access have only the low 32 bits non 0 */
+	t3_write_reg(adap, A_MC5_DB_DBGI_REQ_ADDR1, 0);
+	t3_write_reg(adap, A_MC5_DB_DBGI_REQ_ADDR2, 0);
+
+	mc5_dbgi_mode_enable(mc5);
+
+	switch (mc5->part_type) {
+	case IDT75P52100:
+		err = init_idt52100(mc5);
+		break;
+	case IDT75N43102:
+		err = init_idt43102(mc5);
+		break;
+	default:
+		CH_ERR(adap, "Unsupported TCAM type %d\n", mc5->part_type);
+		err = -EINVAL;
+		break;
+	}
+
+	mc5_dbgi_mode_disable(mc5);
+	return err;
+}
+
+
+#define MC5_INT_FATAL (F_PARITYERR | F_REQQPARERR | F_DISPQPARERR)
+
+/*
+ * MC5 interrupt handler
+ */
+void t3_mc5_intr_handler(struct mc5 *mc5)
+{
+	struct adapter *adap = mc5->adapter;
+	u32 cause = t3_read_reg(adap, A_MC5_DB_INT_CAUSE);
+
+	if ((cause & F_PARITYERR) && mc5->parity_enabled) {
+		CH_ALERT(adap, "MC5 parity error\n");
+		mc5->stats.parity_err++;
+	}
+
+	if (cause & F_REQQPARERR) {
+		CH_ALERT(adap, "MC5 request queue parity error\n");
+		mc5->stats.reqq_parity_err++;
+	}
+
+	if (cause & F_DISPQPARERR) {
+		CH_ALERT(adap, "MC5 dispatch queue parity error\n");
+		mc5->stats.dispq_parity_err++;
+	}
+
+	if (cause & F_ACTRGNFULL)
+		mc5->stats.active_rgn_full++;
+	if (cause & F_NFASRCHFAIL)
+		mc5->stats.nfa_srch_err++;
+	if (cause & F_UNKNOWNCMD)
+		mc5->stats.unknown_cmd++;
+	if (cause & F_DELACTEMPTY)
+		mc5->stats.del_act_empty++;
+	if (cause & MC5_INT_FATAL)
+		t3_fatal_err(adap);
+
+	t3_write_reg(adap, A_MC5_DB_INT_CAUSE, cause);
+}
+
+void t3_mc5_prep(struct adapter *adapter, struct mc5 *mc5, int mode)
+{
+#define K * 1024
+
+	static unsigned int tcam_part_size[] = {	/* in K 72-bit entries */
+		64 K, 128 K, 256 K, 32 K
+	};
+
+#undef K
+
+	u32 cfg = t3_read_reg(adapter, A_MC5_DB_CONFIG);
+
+	mc5->adapter = adapter;
+	mc5->mode = (unsigned char)mode;
+	mc5->part_type = (unsigned char)G_TMTYPE(cfg);
+	if (cfg & F_TMTYPEHI)
+		mc5->part_type |= 4;
+
+	mc5->tcam_size = tcam_part_size[G_TMPARTSIZE(cfg)];
+	if (mode == MC5_MODE_144_BIT)
+		mc5->tcam_size /= 2;
+}
diff --git a/drivers/net/ethernet/chelsio/cxgb3/regs.h b/drivers/net/ethernet/chelsio/cxgb3/regs.h
new file mode 100644
index 0000000..81029b8
--- /dev/null
+++ b/drivers/net/ethernet/chelsio/cxgb3/regs.h
@@ -0,0 +1,2563 @@
+#define A_SG_CONTROL 0x0
+
+#define S_CONGMODE    29
+#define V_CONGMODE(x) ((x) << S_CONGMODE)
+#define F_CONGMODE    V_CONGMODE(1U)
+
+#define S_TNLFLMODE    28
+#define V_TNLFLMODE(x) ((x) << S_TNLFLMODE)
+#define F_TNLFLMODE    V_TNLFLMODE(1U)
+
+#define S_FATLPERREN    27
+#define V_FATLPERREN(x) ((x) << S_FATLPERREN)
+#define F_FATLPERREN    V_FATLPERREN(1U)
+
+#define S_DROPPKT    20
+#define V_DROPPKT(x) ((x) << S_DROPPKT)
+#define F_DROPPKT    V_DROPPKT(1U)
+
+#define S_EGRGENCTRL    19
+#define V_EGRGENCTRL(x) ((x) << S_EGRGENCTRL)
+#define F_EGRGENCTRL    V_EGRGENCTRL(1U)
+
+#define S_USERSPACESIZE    14
+#define M_USERSPACESIZE    0x1f
+#define V_USERSPACESIZE(x) ((x) << S_USERSPACESIZE)
+
+#define S_HOSTPAGESIZE    11
+#define M_HOSTPAGESIZE    0x7
+#define V_HOSTPAGESIZE(x) ((x) << S_HOSTPAGESIZE)
+
+#define S_FLMODE    9
+#define V_FLMODE(x) ((x) << S_FLMODE)
+#define F_FLMODE    V_FLMODE(1U)
+
+#define S_PKTSHIFT    6
+#define M_PKTSHIFT    0x7
+#define V_PKTSHIFT(x) ((x) << S_PKTSHIFT)
+
+#define S_ONEINTMULTQ    5
+#define V_ONEINTMULTQ(x) ((x) << S_ONEINTMULTQ)
+#define F_ONEINTMULTQ    V_ONEINTMULTQ(1U)
+
+#define S_BIGENDIANINGRESS    2
+#define V_BIGENDIANINGRESS(x) ((x) << S_BIGENDIANINGRESS)
+#define F_BIGENDIANINGRESS    V_BIGENDIANINGRESS(1U)
+
+#define S_ISCSICOALESCING    1
+#define V_ISCSICOALESCING(x) ((x) << S_ISCSICOALESCING)
+#define F_ISCSICOALESCING    V_ISCSICOALESCING(1U)
+
+#define S_GLOBALENABLE    0
+#define V_GLOBALENABLE(x) ((x) << S_GLOBALENABLE)
+#define F_GLOBALENABLE    V_GLOBALENABLE(1U)
+
+#define S_AVOIDCQOVFL    24
+#define V_AVOIDCQOVFL(x) ((x) << S_AVOIDCQOVFL)
+#define F_AVOIDCQOVFL    V_AVOIDCQOVFL(1U)
+
+#define S_OPTONEINTMULTQ    23
+#define V_OPTONEINTMULTQ(x) ((x) << S_OPTONEINTMULTQ)
+#define F_OPTONEINTMULTQ    V_OPTONEINTMULTQ(1U)
+
+#define S_CQCRDTCTRL    22
+#define V_CQCRDTCTRL(x) ((x) << S_CQCRDTCTRL)
+#define F_CQCRDTCTRL    V_CQCRDTCTRL(1U)
+
+#define A_SG_KDOORBELL 0x4
+
+#define S_SELEGRCNTX    31
+#define V_SELEGRCNTX(x) ((x) << S_SELEGRCNTX)
+#define F_SELEGRCNTX    V_SELEGRCNTX(1U)
+
+#define S_EGRCNTX    0
+#define M_EGRCNTX    0xffff
+#define V_EGRCNTX(x) ((x) << S_EGRCNTX)
+
+#define A_SG_GTS 0x8
+
+#define S_RSPQ    29
+#define M_RSPQ    0x7
+#define V_RSPQ(x) ((x) << S_RSPQ)
+#define G_RSPQ(x) (((x) >> S_RSPQ) & M_RSPQ)
+
+#define S_NEWTIMER    16
+#define M_NEWTIMER    0x1fff
+#define V_NEWTIMER(x) ((x) << S_NEWTIMER)
+
+#define S_NEWINDEX    0
+#define M_NEWINDEX    0xffff
+#define V_NEWINDEX(x) ((x) << S_NEWINDEX)
+
+#define A_SG_CONTEXT_CMD 0xc
+
+#define S_CONTEXT_CMD_OPCODE    28
+#define M_CONTEXT_CMD_OPCODE    0xf
+#define V_CONTEXT_CMD_OPCODE(x) ((x) << S_CONTEXT_CMD_OPCODE)
+
+#define S_CONTEXT_CMD_BUSY    27
+#define V_CONTEXT_CMD_BUSY(x) ((x) << S_CONTEXT_CMD_BUSY)
+#define F_CONTEXT_CMD_BUSY    V_CONTEXT_CMD_BUSY(1U)
+
+#define S_CQ_CREDIT    20
+
+#define M_CQ_CREDIT    0x7f
+
+#define V_CQ_CREDIT(x) ((x) << S_CQ_CREDIT)
+
+#define G_CQ_CREDIT(x) (((x) >> S_CQ_CREDIT) & M_CQ_CREDIT)
+
+#define S_CQ    19
+
+#define V_CQ(x) ((x) << S_CQ)
+#define F_CQ    V_CQ(1U)
+
+#define S_RESPONSEQ    18
+#define V_RESPONSEQ(x) ((x) << S_RESPONSEQ)
+#define F_RESPONSEQ    V_RESPONSEQ(1U)
+
+#define S_EGRESS    17
+#define V_EGRESS(x) ((x) << S_EGRESS)
+#define F_EGRESS    V_EGRESS(1U)
+
+#define S_FREELIST    16
+#define V_FREELIST(x) ((x) << S_FREELIST)
+#define F_FREELIST    V_FREELIST(1U)
+
+#define S_CONTEXT    0
+#define M_CONTEXT    0xffff
+#define V_CONTEXT(x) ((x) << S_CONTEXT)
+
+#define G_CONTEXT(x) (((x) >> S_CONTEXT) & M_CONTEXT)
+
+#define A_SG_CONTEXT_DATA0 0x10
+
+#define A_SG_CONTEXT_DATA1 0x14
+
+#define A_SG_CONTEXT_DATA2 0x18
+
+#define A_SG_CONTEXT_DATA3 0x1c
+
+#define A_SG_CONTEXT_MASK0 0x20
+
+#define A_SG_CONTEXT_MASK1 0x24
+
+#define A_SG_CONTEXT_MASK2 0x28
+
+#define A_SG_CONTEXT_MASK3 0x2c
+
+#define A_SG_RSPQ_CREDIT_RETURN 0x30
+
+#define S_CREDITS    0
+#define M_CREDITS    0xffff
+#define V_CREDITS(x) ((x) << S_CREDITS)
+
+#define A_SG_DATA_INTR 0x34
+
+#define S_ERRINTR    31
+#define V_ERRINTR(x) ((x) << S_ERRINTR)
+#define F_ERRINTR    V_ERRINTR(1U)
+
+#define A_SG_HI_DRB_HI_THRSH 0x38
+
+#define A_SG_HI_DRB_LO_THRSH 0x3c
+
+#define A_SG_LO_DRB_HI_THRSH 0x40
+
+#define A_SG_LO_DRB_LO_THRSH 0x44
+
+#define A_SG_RSPQ_FL_STATUS 0x4c
+
+#define S_RSPQ0DISABLED    8
+
+#define S_FL0EMPTY    16
+#define V_FL0EMPTY(x) ((x) << S_FL0EMPTY)
+#define F_FL0EMPTY    V_FL0EMPTY(1U)
+
+#define A_SG_EGR_RCQ_DRB_THRSH 0x54
+
+#define S_HIRCQDRBTHRSH    16
+#define M_HIRCQDRBTHRSH    0x7ff
+#define V_HIRCQDRBTHRSH(x) ((x) << S_HIRCQDRBTHRSH)
+
+#define S_LORCQDRBTHRSH    0
+#define M_LORCQDRBTHRSH    0x7ff
+#define V_LORCQDRBTHRSH(x) ((x) << S_LORCQDRBTHRSH)
+
+#define A_SG_EGR_CNTX_BADDR 0x58
+
+#define A_SG_INT_CAUSE 0x5c
+
+#define S_HIRCQPARITYERROR    31
+#define V_HIRCQPARITYERROR(x) ((x) << S_HIRCQPARITYERROR)
+#define F_HIRCQPARITYERROR    V_HIRCQPARITYERROR(1U)
+
+#define S_LORCQPARITYERROR    30
+#define V_LORCQPARITYERROR(x) ((x) << S_LORCQPARITYERROR)
+#define F_LORCQPARITYERROR    V_LORCQPARITYERROR(1U)
+
+#define S_HIDRBPARITYERROR    29
+#define V_HIDRBPARITYERROR(x) ((x) << S_HIDRBPARITYERROR)
+#define F_HIDRBPARITYERROR    V_HIDRBPARITYERROR(1U)
+
+#define S_LODRBPARITYERROR    28
+#define V_LODRBPARITYERROR(x) ((x) << S_LODRBPARITYERROR)
+#define F_LODRBPARITYERROR    V_LODRBPARITYERROR(1U)
+
+#define S_FLPARITYERROR    22
+#define M_FLPARITYERROR    0x3f
+#define V_FLPARITYERROR(x) ((x) << S_FLPARITYERROR)
+#define G_FLPARITYERROR(x) (((x) >> S_FLPARITYERROR) & M_FLPARITYERROR)
+
+#define S_ITPARITYERROR    20
+#define M_ITPARITYERROR    0x3
+#define V_ITPARITYERROR(x) ((x) << S_ITPARITYERROR)
+#define G_ITPARITYERROR(x) (((x) >> S_ITPARITYERROR) & M_ITPARITYERROR)
+
+#define S_IRPARITYERROR    19
+#define V_IRPARITYERROR(x) ((x) << S_IRPARITYERROR)
+#define F_IRPARITYERROR    V_IRPARITYERROR(1U)
+
+#define S_RCPARITYERROR    18
+#define V_RCPARITYERROR(x) ((x) << S_RCPARITYERROR)
+#define F_RCPARITYERROR    V_RCPARITYERROR(1U)
+
+#define S_OCPARITYERROR    17
+#define V_OCPARITYERROR(x) ((x) << S_OCPARITYERROR)
+#define F_OCPARITYERROR    V_OCPARITYERROR(1U)
+
+#define S_CPPARITYERROR    16
+#define V_CPPARITYERROR(x) ((x) << S_CPPARITYERROR)
+#define F_CPPARITYERROR    V_CPPARITYERROR(1U)
+
+#define S_R_REQ_FRAMINGERROR    15
+#define V_R_REQ_FRAMINGERROR(x) ((x) << S_R_REQ_FRAMINGERROR)
+#define F_R_REQ_FRAMINGERROR    V_R_REQ_FRAMINGERROR(1U)
+
+#define S_UC_REQ_FRAMINGERROR    14
+#define V_UC_REQ_FRAMINGERROR(x) ((x) << S_UC_REQ_FRAMINGERROR)
+#define F_UC_REQ_FRAMINGERROR    V_UC_REQ_FRAMINGERROR(1U)
+
+#define S_HICTLDRBDROPERR    13
+#define V_HICTLDRBDROPERR(x) ((x) << S_HICTLDRBDROPERR)
+#define F_HICTLDRBDROPERR    V_HICTLDRBDROPERR(1U)
+
+#define S_LOCTLDRBDROPERR    12
+#define V_LOCTLDRBDROPERR(x) ((x) << S_LOCTLDRBDROPERR)
+#define F_LOCTLDRBDROPERR    V_LOCTLDRBDROPERR(1U)
+
+#define S_HIPIODRBDROPERR    11
+#define V_HIPIODRBDROPERR(x) ((x) << S_HIPIODRBDROPERR)
+#define F_HIPIODRBDROPERR    V_HIPIODRBDROPERR(1U)
+
+#define S_LOPIODRBDROPERR    10
+#define V_LOPIODRBDROPERR(x) ((x) << S_LOPIODRBDROPERR)
+#define F_LOPIODRBDROPERR    V_LOPIODRBDROPERR(1U)
+
+#define S_HIPRIORITYDBFULL    7
+#define V_HIPRIORITYDBFULL(x) ((x) << S_HIPRIORITYDBFULL)
+#define F_HIPRIORITYDBFULL    V_HIPRIORITYDBFULL(1U)
+
+#define S_HIPRIORITYDBEMPTY   6
+#define V_HIPRIORITYDBEMPTY(x) ((x) << S_HIPRIORITYDBEMPTY)
+#define F_HIPRIORITYDBEMPTY    V_HIPRIORITYDBEMPTY(1U)
+
+#define S_LOPRIORITYDBFULL    5
+#define V_LOPRIORITYDBFULL(x) ((x) << S_LOPRIORITYDBFULL)
+#define F_LOPRIORITYDBFULL    V_LOPRIORITYDBFULL(1U)
+
+#define S_LOPRIORITYDBEMPTY   4
+#define V_LOPRIORITYDBEMPTY(x) ((x) << S_LOPRIORITYDBEMPTY)
+#define F_LOPRIORITYDBEMPTY    V_LOPRIORITYDBEMPTY(1U)
+
+#define S_RSPQDISABLED    3
+#define V_RSPQDISABLED(x) ((x) << S_RSPQDISABLED)
+#define F_RSPQDISABLED    V_RSPQDISABLED(1U)
+
+#define S_RSPQCREDITOVERFOW    2
+#define V_RSPQCREDITOVERFOW(x) ((x) << S_RSPQCREDITOVERFOW)
+#define F_RSPQCREDITOVERFOW    V_RSPQCREDITOVERFOW(1U)
+
+#define S_FLEMPTY    1
+#define V_FLEMPTY(x) ((x) << S_FLEMPTY)
+#define F_FLEMPTY    V_FLEMPTY(1U)
+
+#define A_SG_INT_ENABLE 0x60
+
+#define A_SG_CMDQ_CREDIT_TH 0x64
+
+#define S_TIMEOUT    8
+#define M_TIMEOUT    0xffffff
+#define V_TIMEOUT(x) ((x) << S_TIMEOUT)
+
+#define S_THRESHOLD    0
+#define M_THRESHOLD    0xff
+#define V_THRESHOLD(x) ((x) << S_THRESHOLD)
+
+#define A_SG_TIMER_TICK 0x68
+
+#define A_SG_CQ_CONTEXT_BADDR 0x6c
+
+#define A_SG_OCO_BASE 0x70
+
+#define S_BASE1    16
+#define M_BASE1    0xffff
+#define V_BASE1(x) ((x) << S_BASE1)
+
+#define A_SG_DRB_PRI_THRESH 0x74
+
+#define A_PCIX_INT_ENABLE 0x80
+
+#define S_MSIXPARERR    22
+#define M_MSIXPARERR    0x7
+
+#define V_MSIXPARERR(x) ((x) << S_MSIXPARERR)
+
+#define S_CFPARERR    18
+#define M_CFPARERR    0xf
+
+#define V_CFPARERR(x) ((x) << S_CFPARERR)
+
+#define S_RFPARERR    14
+#define M_RFPARERR    0xf
+
+#define V_RFPARERR(x) ((x) << S_RFPARERR)
+
+#define S_WFPARERR    12
+#define M_WFPARERR    0x3
+
+#define V_WFPARERR(x) ((x) << S_WFPARERR)
+
+#define S_PIOPARERR    11
+#define V_PIOPARERR(x) ((x) << S_PIOPARERR)
+#define F_PIOPARERR    V_PIOPARERR(1U)
+
+#define S_DETUNCECCERR    10
+#define V_DETUNCECCERR(x) ((x) << S_DETUNCECCERR)
+#define F_DETUNCECCERR    V_DETUNCECCERR(1U)
+
+#define S_DETCORECCERR    9
+#define V_DETCORECCERR(x) ((x) << S_DETCORECCERR)
+#define F_DETCORECCERR    V_DETCORECCERR(1U)
+
+#define S_RCVSPLCMPERR    8
+#define V_RCVSPLCMPERR(x) ((x) << S_RCVSPLCMPERR)
+#define F_RCVSPLCMPERR    V_RCVSPLCMPERR(1U)
+
+#define S_UNXSPLCMP    7
+#define V_UNXSPLCMP(x) ((x) << S_UNXSPLCMP)
+#define F_UNXSPLCMP    V_UNXSPLCMP(1U)
+
+#define S_SPLCMPDIS    6
+#define V_SPLCMPDIS(x) ((x) << S_SPLCMPDIS)
+#define F_SPLCMPDIS    V_SPLCMPDIS(1U)
+
+#define S_DETPARERR    5
+#define V_DETPARERR(x) ((x) << S_DETPARERR)
+#define F_DETPARERR    V_DETPARERR(1U)
+
+#define S_SIGSYSERR    4
+#define V_SIGSYSERR(x) ((x) << S_SIGSYSERR)
+#define F_SIGSYSERR    V_SIGSYSERR(1U)
+
+#define S_RCVMSTABT    3
+#define V_RCVMSTABT(x) ((x) << S_RCVMSTABT)
+#define F_RCVMSTABT    V_RCVMSTABT(1U)
+
+#define S_RCVTARABT    2
+#define V_RCVTARABT(x) ((x) << S_RCVTARABT)
+#define F_RCVTARABT    V_RCVTARABT(1U)
+
+#define S_SIGTARABT    1
+#define V_SIGTARABT(x) ((x) << S_SIGTARABT)
+#define F_SIGTARABT    V_SIGTARABT(1U)
+
+#define S_MSTDETPARERR    0
+#define V_MSTDETPARERR(x) ((x) << S_MSTDETPARERR)
+#define F_MSTDETPARERR    V_MSTDETPARERR(1U)
+
+#define A_PCIX_INT_CAUSE 0x84
+
+#define A_PCIX_CFG 0x88
+
+#define S_DMASTOPEN    19
+#define V_DMASTOPEN(x) ((x) << S_DMASTOPEN)
+#define F_DMASTOPEN    V_DMASTOPEN(1U)
+
+#define S_CLIDECEN    18
+#define V_CLIDECEN(x) ((x) << S_CLIDECEN)
+#define F_CLIDECEN    V_CLIDECEN(1U)
+
+#define A_PCIX_MODE 0x8c
+
+#define S_PCLKRANGE    6
+#define M_PCLKRANGE    0x3
+#define V_PCLKRANGE(x) ((x) << S_PCLKRANGE)
+#define G_PCLKRANGE(x) (((x) >> S_PCLKRANGE) & M_PCLKRANGE)
+
+#define S_PCIXINITPAT    2
+#define M_PCIXINITPAT    0xf
+#define V_PCIXINITPAT(x) ((x) << S_PCIXINITPAT)
+#define G_PCIXINITPAT(x) (((x) >> S_PCIXINITPAT) & M_PCIXINITPAT)
+
+#define S_64BIT    0
+#define V_64BIT(x) ((x) << S_64BIT)
+#define F_64BIT    V_64BIT(1U)
+
+#define A_PCIE_INT_ENABLE 0x80
+
+#define S_BISTERR    15
+#define M_BISTERR    0xff
+
+#define V_BISTERR(x) ((x) << S_BISTERR)
+
+#define S_TXPARERR    18
+#define V_TXPARERR(x) ((x) << S_TXPARERR)
+#define F_TXPARERR    V_TXPARERR(1U)
+
+#define S_RXPARERR    17
+#define V_RXPARERR(x) ((x) << S_RXPARERR)
+#define F_RXPARERR    V_RXPARERR(1U)
+
+#define S_RETRYLUTPARERR    16
+#define V_RETRYLUTPARERR(x) ((x) << S_RETRYLUTPARERR)
+#define F_RETRYLUTPARERR    V_RETRYLUTPARERR(1U)
+
+#define S_RETRYBUFPARERR    15
+#define V_RETRYBUFPARERR(x) ((x) << S_RETRYBUFPARERR)
+#define F_RETRYBUFPARERR    V_RETRYBUFPARERR(1U)
+
+#define S_PCIE_MSIXPARERR    12
+#define M_PCIE_MSIXPARERR    0x7
+
+#define V_PCIE_MSIXPARERR(x) ((x) << S_PCIE_MSIXPARERR)
+
+#define S_PCIE_CFPARERR    11
+#define V_PCIE_CFPARERR(x) ((x) << S_PCIE_CFPARERR)
+#define F_PCIE_CFPARERR    V_PCIE_CFPARERR(1U)
+
+#define S_PCIE_RFPARERR    10
+#define V_PCIE_RFPARERR(x) ((x) << S_PCIE_RFPARERR)
+#define F_PCIE_RFPARERR    V_PCIE_RFPARERR(1U)
+
+#define S_PCIE_WFPARERR    9
+#define V_PCIE_WFPARERR(x) ((x) << S_PCIE_WFPARERR)
+#define F_PCIE_WFPARERR    V_PCIE_WFPARERR(1U)
+
+#define S_PCIE_PIOPARERR    8
+#define V_PCIE_PIOPARERR(x) ((x) << S_PCIE_PIOPARERR)
+#define F_PCIE_PIOPARERR    V_PCIE_PIOPARERR(1U)
+
+#define S_UNXSPLCPLERRC    7
+#define V_UNXSPLCPLERRC(x) ((x) << S_UNXSPLCPLERRC)
+#define F_UNXSPLCPLERRC    V_UNXSPLCPLERRC(1U)
+
+#define S_UNXSPLCPLERRR    6
+#define V_UNXSPLCPLERRR(x) ((x) << S_UNXSPLCPLERRR)
+#define F_UNXSPLCPLERRR    V_UNXSPLCPLERRR(1U)
+
+#define S_PEXERR    0
+#define V_PEXERR(x) ((x) << S_PEXERR)
+#define F_PEXERR    V_PEXERR(1U)
+
+#define A_PCIE_INT_CAUSE 0x84
+
+#define S_PCIE_DMASTOPEN    24
+#define V_PCIE_DMASTOPEN(x) ((x) << S_PCIE_DMASTOPEN)
+#define F_PCIE_DMASTOPEN    V_PCIE_DMASTOPEN(1U)
+
+#define A_PCIE_CFG 0x88
+
+#define S_ENABLELINKDWNDRST    21
+#define V_ENABLELINKDWNDRST(x) ((x) << S_ENABLELINKDWNDRST)
+#define F_ENABLELINKDWNDRST    V_ENABLELINKDWNDRST(1U)
+
+#define S_ENABLELINKDOWNRST    20
+#define V_ENABLELINKDOWNRST(x) ((x) << S_ENABLELINKDOWNRST)
+#define F_ENABLELINKDOWNRST    V_ENABLELINKDOWNRST(1U)
+
+#define S_PCIE_CLIDECEN    16
+#define V_PCIE_CLIDECEN(x) ((x) << S_PCIE_CLIDECEN)
+#define F_PCIE_CLIDECEN    V_PCIE_CLIDECEN(1U)
+
+#define S_CRSTWRMMODE    0
+#define V_CRSTWRMMODE(x) ((x) << S_CRSTWRMMODE)
+#define F_CRSTWRMMODE    V_CRSTWRMMODE(1U)
+
+#define A_PCIE_MODE 0x8c
+
+#define S_NUMFSTTRNSEQRX    10
+#define M_NUMFSTTRNSEQRX    0xff
+#define V_NUMFSTTRNSEQRX(x) ((x) << S_NUMFSTTRNSEQRX)
+#define G_NUMFSTTRNSEQRX(x) (((x) >> S_NUMFSTTRNSEQRX) & M_NUMFSTTRNSEQRX)
+
+#define A_PCIE_PEX_CTRL0 0x98
+
+#define S_NUMFSTTRNSEQ    22
+#define M_NUMFSTTRNSEQ    0xff
+#define V_NUMFSTTRNSEQ(x) ((x) << S_NUMFSTTRNSEQ)
+#define G_NUMFSTTRNSEQ(x) (((x) >> S_NUMFSTTRNSEQ) & M_NUMFSTTRNSEQ)
+
+#define S_REPLAYLMT    2
+#define M_REPLAYLMT    0xfffff
+
+#define V_REPLAYLMT(x) ((x) << S_REPLAYLMT)
+
+#define A_PCIE_PEX_CTRL1 0x9c
+
+#define S_T3A_ACKLAT    0
+#define M_T3A_ACKLAT    0x7ff
+
+#define V_T3A_ACKLAT(x) ((x) << S_T3A_ACKLAT)
+
+#define S_ACKLAT    0
+#define M_ACKLAT    0x1fff
+
+#define V_ACKLAT(x) ((x) << S_ACKLAT)
+
+#define A_PCIE_PEX_ERR 0xa4
+
+#define A_T3DBG_GPIO_EN 0xd0
+
+#define S_GPIO11_OEN    27
+#define V_GPIO11_OEN(x) ((x) << S_GPIO11_OEN)
+#define F_GPIO11_OEN    V_GPIO11_OEN(1U)
+
+#define S_GPIO10_OEN    26
+#define V_GPIO10_OEN(x) ((x) << S_GPIO10_OEN)
+#define F_GPIO10_OEN    V_GPIO10_OEN(1U)
+
+#define S_GPIO7_OEN    23
+#define V_GPIO7_OEN(x) ((x) << S_GPIO7_OEN)
+#define F_GPIO7_OEN    V_GPIO7_OEN(1U)
+
+#define S_GPIO6_OEN    22
+#define V_GPIO6_OEN(x) ((x) << S_GPIO6_OEN)
+#define F_GPIO6_OEN    V_GPIO6_OEN(1U)
+
+#define S_GPIO5_OEN    21
+#define V_GPIO5_OEN(x) ((x) << S_GPIO5_OEN)
+#define F_GPIO5_OEN    V_GPIO5_OEN(1U)
+
+#define S_GPIO4_OEN    20
+#define V_GPIO4_OEN(x) ((x) << S_GPIO4_OEN)
+#define F_GPIO4_OEN    V_GPIO4_OEN(1U)
+
+#define S_GPIO2_OEN    18
+#define V_GPIO2_OEN(x) ((x) << S_GPIO2_OEN)
+#define F_GPIO2_OEN    V_GPIO2_OEN(1U)
+
+#define S_GPIO1_OEN    17
+#define V_GPIO1_OEN(x) ((x) << S_GPIO1_OEN)
+#define F_GPIO1_OEN    V_GPIO1_OEN(1U)
+
+#define S_GPIO0_OEN    16
+#define V_GPIO0_OEN(x) ((x) << S_GPIO0_OEN)
+#define F_GPIO0_OEN    V_GPIO0_OEN(1U)
+
+#define S_GPIO10_OUT_VAL    10
+#define V_GPIO10_OUT_VAL(x) ((x) << S_GPIO10_OUT_VAL)
+#define F_GPIO10_OUT_VAL    V_GPIO10_OUT_VAL(1U)
+
+#define S_GPIO7_OUT_VAL    7
+#define V_GPIO7_OUT_VAL(x) ((x) << S_GPIO7_OUT_VAL)
+#define F_GPIO7_OUT_VAL    V_GPIO7_OUT_VAL(1U)
+
+#define S_GPIO6_OUT_VAL    6
+#define V_GPIO6_OUT_VAL(x) ((x) << S_GPIO6_OUT_VAL)
+#define F_GPIO6_OUT_VAL    V_GPIO6_OUT_VAL(1U)
+
+#define S_GPIO5_OUT_VAL    5
+#define V_GPIO5_OUT_VAL(x) ((x) << S_GPIO5_OUT_VAL)
+#define F_GPIO5_OUT_VAL    V_GPIO5_OUT_VAL(1U)
+
+#define S_GPIO4_OUT_VAL    4
+#define V_GPIO4_OUT_VAL(x) ((x) << S_GPIO4_OUT_VAL)
+#define F_GPIO4_OUT_VAL    V_GPIO4_OUT_VAL(1U)
+
+#define S_GPIO2_OUT_VAL    2
+#define V_GPIO2_OUT_VAL(x) ((x) << S_GPIO2_OUT_VAL)
+#define F_GPIO2_OUT_VAL    V_GPIO2_OUT_VAL(1U)
+
+#define S_GPIO1_OUT_VAL    1
+#define V_GPIO1_OUT_VAL(x) ((x) << S_GPIO1_OUT_VAL)
+#define F_GPIO1_OUT_VAL    V_GPIO1_OUT_VAL(1U)
+
+#define S_GPIO0_OUT_VAL    0
+#define V_GPIO0_OUT_VAL(x) ((x) << S_GPIO0_OUT_VAL)
+#define F_GPIO0_OUT_VAL    V_GPIO0_OUT_VAL(1U)
+
+#define A_T3DBG_INT_ENABLE 0xd8
+
+#define S_GPIO11    11
+#define V_GPIO11(x) ((x) << S_GPIO11)
+#define F_GPIO11    V_GPIO11(1U)
+
+#define S_GPIO10    10
+#define V_GPIO10(x) ((x) << S_GPIO10)
+#define F_GPIO10    V_GPIO10(1U)
+
+#define S_GPIO9    9
+#define V_GPIO9(x) ((x) << S_GPIO9)
+#define F_GPIO9    V_GPIO9(1U)
+
+#define S_GPIO7    7
+#define V_GPIO7(x) ((x) << S_GPIO7)
+#define F_GPIO7    V_GPIO7(1U)
+
+#define S_GPIO6    6
+#define V_GPIO6(x) ((x) << S_GPIO6)
+#define F_GPIO6    V_GPIO6(1U)
+
+#define S_GPIO5    5
+#define V_GPIO5(x) ((x) << S_GPIO5)
+#define F_GPIO5    V_GPIO5(1U)
+
+#define S_GPIO4    4
+#define V_GPIO4(x) ((x) << S_GPIO4)
+#define F_GPIO4    V_GPIO4(1U)
+
+#define S_GPIO3    3
+#define V_GPIO3(x) ((x) << S_GPIO3)
+#define F_GPIO3    V_GPIO3(1U)
+
+#define S_GPIO2    2
+#define V_GPIO2(x) ((x) << S_GPIO2)
+#define F_GPIO2    V_GPIO2(1U)
+
+#define S_GPIO1    1
+#define V_GPIO1(x) ((x) << S_GPIO1)
+#define F_GPIO1    V_GPIO1(1U)
+
+#define S_GPIO0    0
+#define V_GPIO0(x) ((x) << S_GPIO0)
+#define F_GPIO0    V_GPIO0(1U)
+
+#define A_T3DBG_INT_CAUSE 0xdc
+
+#define A_T3DBG_GPIO_ACT_LOW 0xf0
+
+#define MC7_PMRX_BASE_ADDR 0x100
+
+#define A_MC7_CFG 0x100
+
+#define S_IFEN    13
+#define V_IFEN(x) ((x) << S_IFEN)
+#define F_IFEN    V_IFEN(1U)
+
+#define S_TERM150    11
+#define V_TERM150(x) ((x) << S_TERM150)
+#define F_TERM150    V_TERM150(1U)
+
+#define S_SLOW    10
+#define V_SLOW(x) ((x) << S_SLOW)
+#define F_SLOW    V_SLOW(1U)
+
+#define S_WIDTH    8
+#define M_WIDTH    0x3
+#define V_WIDTH(x) ((x) << S_WIDTH)
+#define G_WIDTH(x) (((x) >> S_WIDTH) & M_WIDTH)
+
+#define S_BKS    6
+#define V_BKS(x) ((x) << S_BKS)
+#define F_BKS    V_BKS(1U)
+
+#define S_ORG    5
+#define V_ORG(x) ((x) << S_ORG)
+#define F_ORG    V_ORG(1U)
+
+#define S_DEN    2
+#define M_DEN    0x7
+#define V_DEN(x) ((x) << S_DEN)
+#define G_DEN(x) (((x) >> S_DEN) & M_DEN)
+
+#define S_RDY    1
+#define V_RDY(x) ((x) << S_RDY)
+#define F_RDY    V_RDY(1U)
+
+#define S_CLKEN    0
+#define V_CLKEN(x) ((x) << S_CLKEN)
+#define F_CLKEN    V_CLKEN(1U)
+
+#define A_MC7_MODE 0x104
+
+#define S_BUSY    31
+#define V_BUSY(x) ((x) << S_BUSY)
+#define F_BUSY    V_BUSY(1U)
+
+#define A_MC7_EXT_MODE1 0x108
+
+#define A_MC7_EXT_MODE2 0x10c
+
+#define A_MC7_EXT_MODE3 0x110
+
+#define A_MC7_PRE 0x114
+
+#define A_MC7_REF 0x118
+
+#define S_PREREFDIV    1
+#define M_PREREFDIV    0x3fff
+#define V_PREREFDIV(x) ((x) << S_PREREFDIV)
+
+#define S_PERREFEN    0
+#define V_PERREFEN(x) ((x) << S_PERREFEN)
+#define F_PERREFEN    V_PERREFEN(1U)
+
+#define A_MC7_DLL 0x11c
+
+#define S_DLLENB    1
+#define V_DLLENB(x) ((x) << S_DLLENB)
+#define F_DLLENB    V_DLLENB(1U)
+
+#define S_DLLRST    0
+#define V_DLLRST(x) ((x) << S_DLLRST)
+#define F_DLLRST    V_DLLRST(1U)
+
+#define A_MC7_PARM 0x120
+
+#define S_ACTTOPREDLY    26
+#define M_ACTTOPREDLY    0xf
+#define V_ACTTOPREDLY(x) ((x) << S_ACTTOPREDLY)
+
+#define S_ACTTORDWRDLY    23
+#define M_ACTTORDWRDLY    0x7
+#define V_ACTTORDWRDLY(x) ((x) << S_ACTTORDWRDLY)
+
+#define S_PRECYC    20
+#define M_PRECYC    0x7
+#define V_PRECYC(x) ((x) << S_PRECYC)
+
+#define S_REFCYC    13
+#define M_REFCYC    0x7f
+#define V_REFCYC(x) ((x) << S_REFCYC)
+
+#define S_BKCYC    8
+#define M_BKCYC    0x1f
+#define V_BKCYC(x) ((x) << S_BKCYC)
+
+#define S_WRTORDDLY    4
+#define M_WRTORDDLY    0xf
+#define V_WRTORDDLY(x) ((x) << S_WRTORDDLY)
+
+#define S_RDTOWRDLY    0
+#define M_RDTOWRDLY    0xf
+#define V_RDTOWRDLY(x) ((x) << S_RDTOWRDLY)
+
+#define A_MC7_CAL 0x128
+
+#define S_CAL_FAULT    30
+#define V_CAL_FAULT(x) ((x) << S_CAL_FAULT)
+#define F_CAL_FAULT    V_CAL_FAULT(1U)
+
+#define S_SGL_CAL_EN    20
+#define V_SGL_CAL_EN(x) ((x) << S_SGL_CAL_EN)
+#define F_SGL_CAL_EN    V_SGL_CAL_EN(1U)
+
+#define A_MC7_ERR_ADDR 0x12c
+
+#define A_MC7_ECC 0x130
+
+#define S_ECCCHKEN    1
+#define V_ECCCHKEN(x) ((x) << S_ECCCHKEN)
+#define F_ECCCHKEN    V_ECCCHKEN(1U)
+
+#define S_ECCGENEN    0
+#define V_ECCGENEN(x) ((x) << S_ECCGENEN)
+#define F_ECCGENEN    V_ECCGENEN(1U)
+
+#define A_MC7_CE_ADDR 0x134
+
+#define A_MC7_CE_DATA0 0x138
+
+#define A_MC7_CE_DATA1 0x13c
+
+#define A_MC7_CE_DATA2 0x140
+
+#define S_DATA    0
+#define M_DATA    0xff
+
+#define G_DATA(x) (((x) >> S_DATA) & M_DATA)
+
+#define A_MC7_UE_ADDR 0x144
+
+#define A_MC7_UE_DATA0 0x148
+
+#define A_MC7_UE_DATA1 0x14c
+
+#define A_MC7_UE_DATA2 0x150
+
+#define A_MC7_BD_ADDR 0x154
+
+#define S_ADDR    3
+
+#define M_ADDR    0x1fffffff
+
+#define A_MC7_BD_DATA0 0x158
+
+#define A_MC7_BD_DATA1 0x15c
+
+#define A_MC7_BD_OP 0x164
+
+#define S_OP    0
+
+#define V_OP(x) ((x) << S_OP)
+#define F_OP    V_OP(1U)
+
+#define A_MC7_BIST_ADDR_BEG 0x168
+
+#define A_MC7_BIST_ADDR_END 0x16c
+
+#define A_MC7_BIST_DATA 0x170
+
+#define A_MC7_BIST_OP 0x174
+
+#define S_CONT    3
+#define V_CONT(x) ((x) << S_CONT)
+#define F_CONT    V_CONT(1U)
+
+#define A_MC7_INT_ENABLE 0x178
+
+#define S_AE    17
+#define V_AE(x) ((x) << S_AE)
+#define F_AE    V_AE(1U)
+
+#define S_PE    2
+#define M_PE    0x7fff
+
+#define V_PE(x) ((x) << S_PE)
+
+#define G_PE(x) (((x) >> S_PE) & M_PE)
+
+#define S_UE    1
+#define V_UE(x) ((x) << S_UE)
+#define F_UE    V_UE(1U)
+
+#define S_CE    0
+#define V_CE(x) ((x) << S_CE)
+#define F_CE    V_CE(1U)
+
+#define A_MC7_INT_CAUSE 0x17c
+
+#define MC7_PMTX_BASE_ADDR 0x180
+
+#define MC7_CM_BASE_ADDR 0x200
+
+#define A_CIM_BOOT_CFG 0x280
+
+#define S_BOOTADDR    2
+#define M_BOOTADDR    0x3fffffff
+#define V_BOOTADDR(x) ((x) << S_BOOTADDR)
+
+#define A_CIM_SDRAM_BASE_ADDR 0x28c
+
+#define A_CIM_SDRAM_ADDR_SIZE 0x290
+
+#define A_CIM_HOST_INT_ENABLE 0x298
+
+#define S_DTAGPARERR    28
+#define V_DTAGPARERR(x) ((x) << S_DTAGPARERR)
+#define F_DTAGPARERR    V_DTAGPARERR(1U)
+
+#define S_ITAGPARERR    27
+#define V_ITAGPARERR(x) ((x) << S_ITAGPARERR)
+#define F_ITAGPARERR    V_ITAGPARERR(1U)
+
+#define S_IBQTPPARERR    26
+#define V_IBQTPPARERR(x) ((x) << S_IBQTPPARERR)
+#define F_IBQTPPARERR    V_IBQTPPARERR(1U)
+
+#define S_IBQULPPARERR    25
+#define V_IBQULPPARERR(x) ((x) << S_IBQULPPARERR)
+#define F_IBQULPPARERR    V_IBQULPPARERR(1U)
+
+#define S_IBQSGEHIPARERR    24
+#define V_IBQSGEHIPARERR(x) ((x) << S_IBQSGEHIPARERR)
+#define F_IBQSGEHIPARERR    V_IBQSGEHIPARERR(1U)
+
+#define S_IBQSGELOPARERR    23
+#define V_IBQSGELOPARERR(x) ((x) << S_IBQSGELOPARERR)
+#define F_IBQSGELOPARERR    V_IBQSGELOPARERR(1U)
+
+#define S_OBQULPLOPARERR    22
+#define V_OBQULPLOPARERR(x) ((x) << S_OBQULPLOPARERR)
+#define F_OBQULPLOPARERR    V_OBQULPLOPARERR(1U)
+
+#define S_OBQULPHIPARERR    21
+#define V_OBQULPHIPARERR(x) ((x) << S_OBQULPHIPARERR)
+#define F_OBQULPHIPARERR    V_OBQULPHIPARERR(1U)
+
+#define S_OBQSGEPARERR    20
+#define V_OBQSGEPARERR(x) ((x) << S_OBQSGEPARERR)
+#define F_OBQSGEPARERR    V_OBQSGEPARERR(1U)
+
+#define S_DCACHEPARERR    19
+#define V_DCACHEPARERR(x) ((x) << S_DCACHEPARERR)
+#define F_DCACHEPARERR    V_DCACHEPARERR(1U)
+
+#define S_ICACHEPARERR    18
+#define V_ICACHEPARERR(x) ((x) << S_ICACHEPARERR)
+#define F_ICACHEPARERR    V_ICACHEPARERR(1U)
+
+#define S_DRAMPARERR    17
+#define V_DRAMPARERR(x) ((x) << S_DRAMPARERR)
+#define F_DRAMPARERR    V_DRAMPARERR(1U)
+
+#define A_CIM_HOST_INT_CAUSE 0x29c
+
+#define S_BLKWRPLINT    12
+#define V_BLKWRPLINT(x) ((x) << S_BLKWRPLINT)
+#define F_BLKWRPLINT    V_BLKWRPLINT(1U)
+
+#define S_BLKRDPLINT    11
+#define V_BLKRDPLINT(x) ((x) << S_BLKRDPLINT)
+#define F_BLKRDPLINT    V_BLKRDPLINT(1U)
+
+#define S_BLKWRCTLINT    10
+#define V_BLKWRCTLINT(x) ((x) << S_BLKWRCTLINT)
+#define F_BLKWRCTLINT    V_BLKWRCTLINT(1U)
+
+#define S_BLKRDCTLINT    9
+#define V_BLKRDCTLINT(x) ((x) << S_BLKRDCTLINT)
+#define F_BLKRDCTLINT    V_BLKRDCTLINT(1U)
+
+#define S_BLKWRFLASHINT    8
+#define V_BLKWRFLASHINT(x) ((x) << S_BLKWRFLASHINT)
+#define F_BLKWRFLASHINT    V_BLKWRFLASHINT(1U)
+
+#define S_BLKRDFLASHINT    7
+#define V_BLKRDFLASHINT(x) ((x) << S_BLKRDFLASHINT)
+#define F_BLKRDFLASHINT    V_BLKRDFLASHINT(1U)
+
+#define S_SGLWRFLASHINT    6
+#define V_SGLWRFLASHINT(x) ((x) << S_SGLWRFLASHINT)
+#define F_SGLWRFLASHINT    V_SGLWRFLASHINT(1U)
+
+#define S_WRBLKFLASHINT    5
+#define V_WRBLKFLASHINT(x) ((x) << S_WRBLKFLASHINT)
+#define F_WRBLKFLASHINT    V_WRBLKFLASHINT(1U)
+
+#define S_BLKWRBOOTINT    4
+#define V_BLKWRBOOTINT(x) ((x) << S_BLKWRBOOTINT)
+#define F_BLKWRBOOTINT    V_BLKWRBOOTINT(1U)
+
+#define S_FLASHRANGEINT    2
+#define V_FLASHRANGEINT(x) ((x) << S_FLASHRANGEINT)
+#define F_FLASHRANGEINT    V_FLASHRANGEINT(1U)
+
+#define S_SDRAMRANGEINT    1
+#define V_SDRAMRANGEINT(x) ((x) << S_SDRAMRANGEINT)
+#define F_SDRAMRANGEINT    V_SDRAMRANGEINT(1U)
+
+#define S_RSVDSPACEINT    0
+#define V_RSVDSPACEINT(x) ((x) << S_RSVDSPACEINT)
+#define F_RSVDSPACEINT    V_RSVDSPACEINT(1U)
+
+#define A_CIM_HOST_ACC_CTRL 0x2b0
+
+#define S_HOSTBUSY    17
+#define V_HOSTBUSY(x) ((x) << S_HOSTBUSY)
+#define F_HOSTBUSY    V_HOSTBUSY(1U)
+
+#define A_CIM_HOST_ACC_DATA 0x2b4
+
+#define A_CIM_IBQ_DBG_CFG 0x2c0
+
+#define S_IBQDBGADDR    16
+#define M_IBQDBGADDR    0x1ff
+#define V_IBQDBGADDR(x) ((x) << S_IBQDBGADDR)
+#define G_IBQDBGADDR(x) (((x) >> S_IBQDBGADDR) & M_IBQDBGADDR)
+
+#define S_IBQDBGQID    3
+#define M_IBQDBGQID    0x3
+#define V_IBQDBGQID(x) ((x) << S_IBQDBGQID)
+#define G_IBQDBGQID(x) (((x) >> S_IBQDBGQID) & M_IBQDBGQID)
+
+#define S_IBQDBGWR    2
+#define V_IBQDBGWR(x) ((x) << S_IBQDBGWR)
+#define F_IBQDBGWR    V_IBQDBGWR(1U)
+
+#define S_IBQDBGBUSY    1
+#define V_IBQDBGBUSY(x) ((x) << S_IBQDBGBUSY)
+#define F_IBQDBGBUSY    V_IBQDBGBUSY(1U)
+
+#define S_IBQDBGEN    0
+#define V_IBQDBGEN(x) ((x) << S_IBQDBGEN)
+#define F_IBQDBGEN    V_IBQDBGEN(1U)
+
+#define A_CIM_IBQ_DBG_DATA 0x2c8
+
+#define A_TP_IN_CONFIG 0x300
+
+#define S_RXFBARBPRIO    25
+#define V_RXFBARBPRIO(x) ((x) << S_RXFBARBPRIO)
+#define F_RXFBARBPRIO    V_RXFBARBPRIO(1U)
+
+#define S_TXFBARBPRIO    24
+#define V_TXFBARBPRIO(x) ((x) << S_TXFBARBPRIO)
+#define F_TXFBARBPRIO    V_TXFBARBPRIO(1U)
+
+#define S_NICMODE    14
+#define V_NICMODE(x) ((x) << S_NICMODE)
+#define F_NICMODE    V_NICMODE(1U)
+
+#define S_IPV6ENABLE    15
+#define V_IPV6ENABLE(x) ((x) << S_IPV6ENABLE)
+#define F_IPV6ENABLE    V_IPV6ENABLE(1U)
+
+#define A_TP_OUT_CONFIG 0x304
+
+#define S_VLANEXTRACTIONENABLE    12
+
+#define A_TP_GLOBAL_CONFIG 0x308
+
+#define S_TXPACINGENABLE    24
+#define V_TXPACINGENABLE(x) ((x) << S_TXPACINGENABLE)
+#define F_TXPACINGENABLE    V_TXPACINGENABLE(1U)
+
+#define S_PATHMTU    15
+#define V_PATHMTU(x) ((x) << S_PATHMTU)
+#define F_PATHMTU    V_PATHMTU(1U)
+
+#define S_IPCHECKSUMOFFLOAD    13
+#define V_IPCHECKSUMOFFLOAD(x) ((x) << S_IPCHECKSUMOFFLOAD)
+#define F_IPCHECKSUMOFFLOAD    V_IPCHECKSUMOFFLOAD(1U)
+
+#define S_UDPCHECKSUMOFFLOAD    12
+#define V_UDPCHECKSUMOFFLOAD(x) ((x) << S_UDPCHECKSUMOFFLOAD)
+#define F_UDPCHECKSUMOFFLOAD    V_UDPCHECKSUMOFFLOAD(1U)
+
+#define S_TCPCHECKSUMOFFLOAD    11
+#define V_TCPCHECKSUMOFFLOAD(x) ((x) << S_TCPCHECKSUMOFFLOAD)
+#define F_TCPCHECKSUMOFFLOAD    V_TCPCHECKSUMOFFLOAD(1U)
+
+#define S_IPTTL    0
+#define M_IPTTL    0xff
+#define V_IPTTL(x) ((x) << S_IPTTL)
+
+#define A_TP_CMM_MM_BASE 0x314
+
+#define A_TP_CMM_TIMER_BASE 0x318
+
+#define S_CMTIMERMAXNUM    28
+#define M_CMTIMERMAXNUM    0x3
+#define V_CMTIMERMAXNUM(x) ((x) << S_CMTIMERMAXNUM)
+
+#define A_TP_PMM_SIZE 0x31c
+
+#define A_TP_PMM_TX_BASE 0x320
+
+#define A_TP_PMM_RX_BASE 0x328
+
+#define A_TP_PMM_RX_PAGE_SIZE 0x32c
+
+#define A_TP_PMM_RX_MAX_PAGE 0x330
+
+#define A_TP_PMM_TX_PAGE_SIZE 0x334
+
+#define A_TP_PMM_TX_MAX_PAGE 0x338
+
+#define A_TP_TCP_OPTIONS 0x340
+
+#define S_MTUDEFAULT    16
+#define M_MTUDEFAULT    0xffff
+#define V_MTUDEFAULT(x) ((x) << S_MTUDEFAULT)
+
+#define S_MTUENABLE    10
+#define V_MTUENABLE(x) ((x) << S_MTUENABLE)
+#define F_MTUENABLE    V_MTUENABLE(1U)
+
+#define S_SACKRX    8
+#define V_SACKRX(x) ((x) << S_SACKRX)
+#define F_SACKRX    V_SACKRX(1U)
+
+#define S_SACKMODE    4
+
+#define M_SACKMODE    0x3
+
+#define V_SACKMODE(x) ((x) << S_SACKMODE)
+
+#define S_WINDOWSCALEMODE    2
+#define M_WINDOWSCALEMODE    0x3
+#define V_WINDOWSCALEMODE(x) ((x) << S_WINDOWSCALEMODE)
+
+#define S_TIMESTAMPSMODE    0
+
+#define M_TIMESTAMPSMODE    0x3
+
+#define V_TIMESTAMPSMODE(x) ((x) << S_TIMESTAMPSMODE)
+
+#define A_TP_DACK_CONFIG 0x344
+
+#define S_AUTOSTATE3    30
+#define M_AUTOSTATE3    0x3
+#define V_AUTOSTATE3(x) ((x) << S_AUTOSTATE3)
+
+#define S_AUTOSTATE2    28
+#define M_AUTOSTATE2    0x3
+#define V_AUTOSTATE2(x) ((x) << S_AUTOSTATE2)
+
+#define S_AUTOSTATE1    26
+#define M_AUTOSTATE1    0x3
+#define V_AUTOSTATE1(x) ((x) << S_AUTOSTATE1)
+
+#define S_BYTETHRESHOLD    5
+#define M_BYTETHRESHOLD    0xfffff
+#define V_BYTETHRESHOLD(x) ((x) << S_BYTETHRESHOLD)
+
+#define S_MSSTHRESHOLD    3
+#define M_MSSTHRESHOLD    0x3
+#define V_MSSTHRESHOLD(x) ((x) << S_MSSTHRESHOLD)
+
+#define S_AUTOCAREFUL    2
+#define V_AUTOCAREFUL(x) ((x) << S_AUTOCAREFUL)
+#define F_AUTOCAREFUL    V_AUTOCAREFUL(1U)
+
+#define S_AUTOENABLE    1
+#define V_AUTOENABLE(x) ((x) << S_AUTOENABLE)
+#define F_AUTOENABLE    V_AUTOENABLE(1U)
+
+#define S_DACK_MODE    0
+#define V_DACK_MODE(x) ((x) << S_DACK_MODE)
+#define F_DACK_MODE    V_DACK_MODE(1U)
+
+#define A_TP_PC_CONFIG 0x348
+
+#define S_TXTOSQUEUEMAPMODE    26
+#define V_TXTOSQUEUEMAPMODE(x) ((x) << S_TXTOSQUEUEMAPMODE)
+#define F_TXTOSQUEUEMAPMODE    V_TXTOSQUEUEMAPMODE(1U)
+
+#define S_ENABLEEPCMDAFULL    23
+#define V_ENABLEEPCMDAFULL(x) ((x) << S_ENABLEEPCMDAFULL)
+#define F_ENABLEEPCMDAFULL    V_ENABLEEPCMDAFULL(1U)
+
+#define S_MODULATEUNIONMODE    22
+#define V_MODULATEUNIONMODE(x) ((x) << S_MODULATEUNIONMODE)
+#define F_MODULATEUNIONMODE    V_MODULATEUNIONMODE(1U)
+
+#define S_TXDEFERENABLE    20
+#define V_TXDEFERENABLE(x) ((x) << S_TXDEFERENABLE)
+#define F_TXDEFERENABLE    V_TXDEFERENABLE(1U)
+
+#define S_RXCONGESTIONMODE    19
+#define V_RXCONGESTIONMODE(x) ((x) << S_RXCONGESTIONMODE)
+#define F_RXCONGESTIONMODE    V_RXCONGESTIONMODE(1U)
+
+#define S_HEARBEATDACK    16
+#define V_HEARBEATDACK(x) ((x) << S_HEARBEATDACK)
+#define F_HEARBEATDACK    V_HEARBEATDACK(1U)
+
+#define S_TXCONGESTIONMODE    15
+#define V_TXCONGESTIONMODE(x) ((x) << S_TXCONGESTIONMODE)
+#define F_TXCONGESTIONMODE    V_TXCONGESTIONMODE(1U)
+
+#define S_ENABLEOCSPIFULL    30
+#define V_ENABLEOCSPIFULL(x) ((x) << S_ENABLEOCSPIFULL)
+#define F_ENABLEOCSPIFULL    V_ENABLEOCSPIFULL(1U)
+
+#define S_LOCKTID    28
+#define V_LOCKTID(x) ((x) << S_LOCKTID)
+#define F_LOCKTID    V_LOCKTID(1U)
+
+#define S_TABLELATENCYDELTA    0
+#define M_TABLELATENCYDELTA    0xf
+#define V_TABLELATENCYDELTA(x) ((x) << S_TABLELATENCYDELTA)
+#define G_TABLELATENCYDELTA(x) \
+	(((x) >> S_TABLELATENCYDELTA) & M_TABLELATENCYDELTA)
+
+#define A_TP_PC_CONFIG2 0x34c
+
+#define S_DISBLEDAPARBIT0    15
+#define V_DISBLEDAPARBIT0(x) ((x) << S_DISBLEDAPARBIT0)
+#define F_DISBLEDAPARBIT0    V_DISBLEDAPARBIT0(1U)
+
+#define S_ENABLEARPMISS    13
+#define V_ENABLEARPMISS(x) ((x) << S_ENABLEARPMISS)
+#define F_ENABLEARPMISS    V_ENABLEARPMISS(1U)
+
+#define S_ENABLENONOFDTNLSYN    12
+#define V_ENABLENONOFDTNLSYN(x) ((x) << S_ENABLENONOFDTNLSYN)
+#define F_ENABLENONOFDTNLSYN    V_ENABLENONOFDTNLSYN(1U)
+
+#define S_ENABLEIPV6RSS    11
+#define V_ENABLEIPV6RSS(x) ((x) << S_ENABLEIPV6RSS)
+#define F_ENABLEIPV6RSS    V_ENABLEIPV6RSS(1U)
+
+#define S_CHDRAFULL    4
+#define V_CHDRAFULL(x) ((x) << S_CHDRAFULL)
+#define F_CHDRAFULL    V_CHDRAFULL(1U)
+
+#define A_TP_TCP_BACKOFF_REG0 0x350
+
+#define A_TP_TCP_BACKOFF_REG1 0x354
+
+#define A_TP_TCP_BACKOFF_REG2 0x358
+
+#define A_TP_TCP_BACKOFF_REG3 0x35c
+
+#define A_TP_PARA_REG2 0x368
+
+#define S_MAXRXDATA    16
+#define M_MAXRXDATA    0xffff
+#define V_MAXRXDATA(x) ((x) << S_MAXRXDATA)
+
+#define S_RXCOALESCESIZE    0
+#define M_RXCOALESCESIZE    0xffff
+#define V_RXCOALESCESIZE(x) ((x) << S_RXCOALESCESIZE)
+
+#define A_TP_PARA_REG3 0x36c
+
+#define S_TXDATAACKIDX    16
+#define M_TXDATAACKIDX    0xf
+
+#define V_TXDATAACKIDX(x) ((x) << S_TXDATAACKIDX)
+
+#define S_TXPACEAUTOSTRICT    10
+#define V_TXPACEAUTOSTRICT(x) ((x) << S_TXPACEAUTOSTRICT)
+#define F_TXPACEAUTOSTRICT    V_TXPACEAUTOSTRICT(1U)
+
+#define S_TXPACEFIXED    9
+#define V_TXPACEFIXED(x) ((x) << S_TXPACEFIXED)
+#define F_TXPACEFIXED    V_TXPACEFIXED(1U)
+
+#define S_TXPACEAUTO    8
+#define V_TXPACEAUTO(x) ((x) << S_TXPACEAUTO)
+#define F_TXPACEAUTO    V_TXPACEAUTO(1U)
+
+#define S_RXCOALESCEENABLE    1
+#define V_RXCOALESCEENABLE(x) ((x) << S_RXCOALESCEENABLE)
+#define F_RXCOALESCEENABLE    V_RXCOALESCEENABLE(1U)
+
+#define S_RXCOALESCEPSHEN    0
+#define V_RXCOALESCEPSHEN(x) ((x) << S_RXCOALESCEPSHEN)
+#define F_RXCOALESCEPSHEN    V_RXCOALESCEPSHEN(1U)
+
+#define A_TP_PARA_REG4 0x370
+
+#define A_TP_PARA_REG5 0x374
+
+#define S_RXDDPOFFINIT    3
+#define V_RXDDPOFFINIT(x) ((x) << S_RXDDPOFFINIT)
+#define F_RXDDPOFFINIT    V_RXDDPOFFINIT(1U)
+
+#define A_TP_PARA_REG6 0x378
+
+#define S_T3A_ENABLEESND    13
+#define V_T3A_ENABLEESND(x) ((x) << S_T3A_ENABLEESND)
+#define F_T3A_ENABLEESND    V_T3A_ENABLEESND(1U)
+
+#define S_ENABLEESND    11
+#define V_ENABLEESND(x) ((x) << S_ENABLEESND)
+#define F_ENABLEESND    V_ENABLEESND(1U)
+
+#define A_TP_PARA_REG7 0x37c
+
+#define S_PMMAXXFERLEN1    16
+#define M_PMMAXXFERLEN1    0xffff
+#define V_PMMAXXFERLEN1(x) ((x) << S_PMMAXXFERLEN1)
+
+#define S_PMMAXXFERLEN0    0
+#define M_PMMAXXFERLEN0    0xffff
+#define V_PMMAXXFERLEN0(x) ((x) << S_PMMAXXFERLEN0)
+
+#define A_TP_TIMER_RESOLUTION 0x390
+
+#define S_TIMERRESOLUTION    16
+#define M_TIMERRESOLUTION    0xff
+#define V_TIMERRESOLUTION(x) ((x) << S_TIMERRESOLUTION)
+
+#define S_TIMESTAMPRESOLUTION    8
+#define M_TIMESTAMPRESOLUTION    0xff
+#define V_TIMESTAMPRESOLUTION(x) ((x) << S_TIMESTAMPRESOLUTION)
+
+#define S_DELAYEDACKRESOLUTION    0
+#define M_DELAYEDACKRESOLUTION    0xff
+#define V_DELAYEDACKRESOLUTION(x) ((x) << S_DELAYEDACKRESOLUTION)
+
+#define A_TP_MSL 0x394
+
+#define A_TP_RXT_MIN 0x398
+
+#define A_TP_RXT_MAX 0x39c
+
+#define A_TP_PERS_MIN 0x3a0
+
+#define A_TP_PERS_MAX 0x3a4
+
+#define A_TP_KEEP_IDLE 0x3a8
+
+#define A_TP_KEEP_INTVL 0x3ac
+
+#define A_TP_INIT_SRTT 0x3b0
+
+#define A_TP_DACK_TIMER 0x3b4
+
+#define A_TP_FINWAIT2_TIMER 0x3b8
+
+#define A_TP_SHIFT_CNT 0x3c0
+
+#define S_SYNSHIFTMAX    24
+
+#define M_SYNSHIFTMAX    0xff
+
+#define V_SYNSHIFTMAX(x) ((x) << S_SYNSHIFTMAX)
+
+#define S_RXTSHIFTMAXR1    20
+
+#define M_RXTSHIFTMAXR1    0xf
+
+#define V_RXTSHIFTMAXR1(x) ((x) << S_RXTSHIFTMAXR1)
+
+#define S_RXTSHIFTMAXR2    16
+
+#define M_RXTSHIFTMAXR2    0xf
+
+#define V_RXTSHIFTMAXR2(x) ((x) << S_RXTSHIFTMAXR2)
+
+#define S_PERSHIFTBACKOFFMAX    12
+#define M_PERSHIFTBACKOFFMAX    0xf
+#define V_PERSHIFTBACKOFFMAX(x) ((x) << S_PERSHIFTBACKOFFMAX)
+
+#define S_PERSHIFTMAX    8
+#define M_PERSHIFTMAX    0xf
+#define V_PERSHIFTMAX(x) ((x) << S_PERSHIFTMAX)
+
+#define S_KEEPALIVEMAX    0
+
+#define M_KEEPALIVEMAX    0xff
+
+#define V_KEEPALIVEMAX(x) ((x) << S_KEEPALIVEMAX)
+
+#define A_TP_MTU_PORT_TABLE 0x3d0
+
+#define A_TP_CCTRL_TABLE 0x3dc
+
+#define A_TP_MTU_TABLE 0x3e4
+
+#define A_TP_RSS_MAP_TABLE 0x3e8
+
+#define A_TP_RSS_LKP_TABLE 0x3ec
+
+#define A_TP_RSS_CONFIG 0x3f0
+
+#define S_TNL4TUPEN    29
+#define V_TNL4TUPEN(x) ((x) << S_TNL4TUPEN)
+#define F_TNL4TUPEN    V_TNL4TUPEN(1U)
+
+#define S_TNL2TUPEN    28
+#define V_TNL2TUPEN(x) ((x) << S_TNL2TUPEN)
+#define F_TNL2TUPEN    V_TNL2TUPEN(1U)
+
+#define S_TNLPRTEN    26
+#define V_TNLPRTEN(x) ((x) << S_TNLPRTEN)
+#define F_TNLPRTEN    V_TNLPRTEN(1U)
+
+#define S_TNLMAPEN    25
+#define V_TNLMAPEN(x) ((x) << S_TNLMAPEN)
+#define F_TNLMAPEN    V_TNLMAPEN(1U)
+
+#define S_TNLLKPEN    24
+#define V_TNLLKPEN(x) ((x) << S_TNLLKPEN)
+#define F_TNLLKPEN    V_TNLLKPEN(1U)
+
+#define S_RRCPLMAPEN    7
+#define V_RRCPLMAPEN(x) ((x) << S_RRCPLMAPEN)
+#define F_RRCPLMAPEN    V_RRCPLMAPEN(1U)
+
+#define S_RRCPLCPUSIZE    4
+#define M_RRCPLCPUSIZE    0x7
+#define V_RRCPLCPUSIZE(x) ((x) << S_RRCPLCPUSIZE)
+
+#define S_RQFEEDBACKENABLE    3
+#define V_RQFEEDBACKENABLE(x) ((x) << S_RQFEEDBACKENABLE)
+#define F_RQFEEDBACKENABLE    V_RQFEEDBACKENABLE(1U)
+
+#define S_HASHTOEPLITZ    2
+#define V_HASHTOEPLITZ(x) ((x) << S_HASHTOEPLITZ)
+#define F_HASHTOEPLITZ    V_HASHTOEPLITZ(1U)
+
+#define S_DISABLE    0
+
+#define A_TP_TM_PIO_ADDR 0x418
+
+#define A_TP_TM_PIO_DATA 0x41c
+
+#define A_TP_TX_MOD_QUE_TABLE 0x420
+
+#define A_TP_TX_RESOURCE_LIMIT 0x424
+
+#define A_TP_TX_MOD_QUEUE_REQ_MAP 0x428
+
+#define S_TX_MOD_QUEUE_REQ_MAP    0
+#define M_TX_MOD_QUEUE_REQ_MAP    0xff
+#define V_TX_MOD_QUEUE_REQ_MAP(x) ((x) << S_TX_MOD_QUEUE_REQ_MAP)
+
+#define A_TP_TX_MOD_QUEUE_WEIGHT1 0x42c
+
+#define A_TP_TX_MOD_QUEUE_WEIGHT0 0x430
+
+#define A_TP_MOD_CHANNEL_WEIGHT 0x434
+
+#define A_TP_MOD_RATE_LIMIT 0x438
+
+#define A_TP_PIO_ADDR 0x440
+
+#define A_TP_PIO_DATA 0x444
+
+#define A_TP_RESET 0x44c
+
+#define S_FLSTINITENABLE    1
+#define V_FLSTINITENABLE(x) ((x) << S_FLSTINITENABLE)
+#define F_FLSTINITENABLE    V_FLSTINITENABLE(1U)
+
+#define S_TPRESET    0
+#define V_TPRESET(x) ((x) << S_TPRESET)
+#define F_TPRESET    V_TPRESET(1U)
+
+#define A_TP_CMM_MM_RX_FLST_BASE 0x460
+
+#define A_TP_CMM_MM_TX_FLST_BASE 0x464
+
+#define A_TP_CMM_MM_PS_FLST_BASE 0x468
+
+#define A_TP_MIB_INDEX 0x450
+
+#define A_TP_MIB_RDATA 0x454
+
+#define A_TP_CMM_MM_MAX_PSTRUCT 0x46c
+
+#define A_TP_INT_ENABLE 0x470
+
+#define S_FLMTXFLSTEMPTY    30
+#define V_FLMTXFLSTEMPTY(x) ((x) << S_FLMTXFLSTEMPTY)
+#define F_FLMTXFLSTEMPTY    V_FLMTXFLSTEMPTY(1U)
+
+#define S_FLMRXFLSTEMPTY    29
+#define V_FLMRXFLSTEMPTY(x) ((x) << S_FLMRXFLSTEMPTY)
+#define F_FLMRXFLSTEMPTY    V_FLMRXFLSTEMPTY(1U)
+
+#define S_ARPLUTPERR    26
+#define V_ARPLUTPERR(x) ((x) << S_ARPLUTPERR)
+#define F_ARPLUTPERR    V_ARPLUTPERR(1U)
+
+#define S_CMCACHEPERR    24
+#define V_CMCACHEPERR(x) ((x) << S_CMCACHEPERR)
+#define F_CMCACHEPERR    V_CMCACHEPERR(1U)
+
+#define A_TP_INT_CAUSE 0x474
+
+#define A_TP_TX_MOD_Q1_Q0_RATE_LIMIT 0x8
+
+#define A_TP_TX_DROP_CFG_CH0 0x12b
+
+#define A_TP_TX_DROP_MODE 0x12f
+
+#define A_TP_EGRESS_CONFIG 0x145
+
+#define S_REWRITEFORCETOSIZE    0
+#define V_REWRITEFORCETOSIZE(x) ((x) << S_REWRITEFORCETOSIZE)
+#define F_REWRITEFORCETOSIZE    V_REWRITEFORCETOSIZE(1U)
+
+#define A_TP_TX_TRC_KEY0 0x20
+
+#define A_TP_RX_TRC_KEY0 0x120
+
+#define A_TP_TX_DROP_CNT_CH0 0x12d
+
+#define S_TXDROPCNTCH0RCVD    0
+#define M_TXDROPCNTCH0RCVD    0xffff
+#define V_TXDROPCNTCH0RCVD(x) ((x) << S_TXDROPCNTCH0RCVD)
+#define G_TXDROPCNTCH0RCVD(x) (((x) >> S_TXDROPCNTCH0RCVD) & \
+			       M_TXDROPCNTCH0RCVD)
+
+#define A_TP_PROXY_FLOW_CNTL 0x4b0
+
+#define A_TP_EMBED_OP_FIELD0 0x4e8
+#define A_TP_EMBED_OP_FIELD1 0x4ec
+#define A_TP_EMBED_OP_FIELD2 0x4f0
+#define A_TP_EMBED_OP_FIELD3 0x4f4
+#define A_TP_EMBED_OP_FIELD4 0x4f8
+#define A_TP_EMBED_OP_FIELD5 0x4fc
+
+#define A_ULPRX_CTL 0x500
+
+#define S_ROUND_ROBIN    4
+#define V_ROUND_ROBIN(x) ((x) << S_ROUND_ROBIN)
+#define F_ROUND_ROBIN    V_ROUND_ROBIN(1U)
+
+#define A_ULPRX_INT_ENABLE 0x504
+
+#define S_DATASELFRAMEERR0    7
+#define V_DATASELFRAMEERR0(x) ((x) << S_DATASELFRAMEERR0)
+#define F_DATASELFRAMEERR0    V_DATASELFRAMEERR0(1U)
+
+#define S_DATASELFRAMEERR1    6
+#define V_DATASELFRAMEERR1(x) ((x) << S_DATASELFRAMEERR1)
+#define F_DATASELFRAMEERR1    V_DATASELFRAMEERR1(1U)
+
+#define S_PCMDMUXPERR    5
+#define V_PCMDMUXPERR(x) ((x) << S_PCMDMUXPERR)
+#define F_PCMDMUXPERR    V_PCMDMUXPERR(1U)
+
+#define S_ARBFPERR    4
+#define V_ARBFPERR(x) ((x) << S_ARBFPERR)
+#define F_ARBFPERR    V_ARBFPERR(1U)
+
+#define S_ARBPF0PERR    3
+#define V_ARBPF0PERR(x) ((x) << S_ARBPF0PERR)
+#define F_ARBPF0PERR    V_ARBPF0PERR(1U)
+
+#define S_ARBPF1PERR    2
+#define V_ARBPF1PERR(x) ((x) << S_ARBPF1PERR)
+#define F_ARBPF1PERR    V_ARBPF1PERR(1U)
+
+#define S_PARERRPCMD    1
+#define V_PARERRPCMD(x) ((x) << S_PARERRPCMD)
+#define F_PARERRPCMD    V_PARERRPCMD(1U)
+
+#define S_PARERRDATA    0
+#define V_PARERRDATA(x) ((x) << S_PARERRDATA)
+#define F_PARERRDATA    V_PARERRDATA(1U)
+
+#define A_ULPRX_INT_CAUSE 0x508
+
+#define A_ULPRX_ISCSI_LLIMIT 0x50c
+
+#define A_ULPRX_ISCSI_ULIMIT 0x510
+
+#define A_ULPRX_ISCSI_TAGMASK 0x514
+
+#define A_ULPRX_ISCSI_PSZ 0x518
+
+#define A_ULPRX_TDDP_LLIMIT 0x51c
+
+#define A_ULPRX_TDDP_ULIMIT 0x520
+#define A_ULPRX_TDDP_PSZ 0x528
+
+#define S_HPZ0    0
+#define M_HPZ0    0xf
+#define V_HPZ0(x) ((x) << S_HPZ0)
+#define G_HPZ0(x) (((x) >> S_HPZ0) & M_HPZ0)
+
+#define A_ULPRX_STAG_LLIMIT 0x52c
+
+#define A_ULPRX_STAG_ULIMIT 0x530
+
+#define A_ULPRX_RQ_LLIMIT 0x534
+
+#define A_ULPRX_RQ_ULIMIT 0x538
+
+#define A_ULPRX_PBL_LLIMIT 0x53c
+
+#define A_ULPRX_PBL_ULIMIT 0x540
+
+#define A_ULPRX_TDDP_TAGMASK 0x524
+
+#define A_ULPTX_CONFIG 0x580
+
+#define S_CFG_CQE_SOP_MASK    1
+#define V_CFG_CQE_SOP_MASK(x) ((x) << S_CFG_CQE_SOP_MASK)
+#define F_CFG_CQE_SOP_MASK    V_CFG_CQE_SOP_MASK(1U)
+
+#define S_CFG_RR_ARB    0
+#define V_CFG_RR_ARB(x) ((x) << S_CFG_RR_ARB)
+#define F_CFG_RR_ARB    V_CFG_RR_ARB(1U)
+
+#define A_ULPTX_INT_ENABLE 0x584
+
+#define S_PBL_BOUND_ERR_CH1    1
+#define V_PBL_BOUND_ERR_CH1(x) ((x) << S_PBL_BOUND_ERR_CH1)
+#define F_PBL_BOUND_ERR_CH1    V_PBL_BOUND_ERR_CH1(1U)
+
+#define S_PBL_BOUND_ERR_CH0    0
+#define V_PBL_BOUND_ERR_CH0(x) ((x) << S_PBL_BOUND_ERR_CH0)
+#define F_PBL_BOUND_ERR_CH0    V_PBL_BOUND_ERR_CH0(1U)
+
+#define A_ULPTX_INT_CAUSE 0x588
+
+#define A_ULPTX_TPT_LLIMIT 0x58c
+
+#define A_ULPTX_TPT_ULIMIT 0x590
+
+#define A_ULPTX_PBL_LLIMIT 0x594
+
+#define A_ULPTX_PBL_ULIMIT 0x598
+
+#define A_ULPTX_DMA_WEIGHT 0x5ac
+
+#define S_D1_WEIGHT    16
+#define M_D1_WEIGHT    0xffff
+#define V_D1_WEIGHT(x) ((x) << S_D1_WEIGHT)
+
+#define S_D0_WEIGHT    0
+#define M_D0_WEIGHT    0xffff
+#define V_D0_WEIGHT(x) ((x) << S_D0_WEIGHT)
+
+#define A_PM1_RX_CFG 0x5c0
+#define A_PM1_RX_MODE 0x5c4
+
+#define A_PM1_RX_INT_ENABLE 0x5d8
+
+#define S_ZERO_E_CMD_ERROR    18
+#define V_ZERO_E_CMD_ERROR(x) ((x) << S_ZERO_E_CMD_ERROR)
+#define F_ZERO_E_CMD_ERROR    V_ZERO_E_CMD_ERROR(1U)
+
+#define S_IESPI0_FIFO2X_RX_FRAMING_ERROR    17
+#define V_IESPI0_FIFO2X_RX_FRAMING_ERROR(x) ((x) << S_IESPI0_FIFO2X_RX_FRAMING_ERROR)
+#define F_IESPI0_FIFO2X_RX_FRAMING_ERROR    V_IESPI0_FIFO2X_RX_FRAMING_ERROR(1U)
+
+#define S_IESPI1_FIFO2X_RX_FRAMING_ERROR    16
+#define V_IESPI1_FIFO2X_RX_FRAMING_ERROR(x) ((x) << S_IESPI1_FIFO2X_RX_FRAMING_ERROR)
+#define F_IESPI1_FIFO2X_RX_FRAMING_ERROR    V_IESPI1_FIFO2X_RX_FRAMING_ERROR(1U)
+
+#define S_IESPI0_RX_FRAMING_ERROR    15
+#define V_IESPI0_RX_FRAMING_ERROR(x) ((x) << S_IESPI0_RX_FRAMING_ERROR)
+#define F_IESPI0_RX_FRAMING_ERROR    V_IESPI0_RX_FRAMING_ERROR(1U)
+
+#define S_IESPI1_RX_FRAMING_ERROR    14
+#define V_IESPI1_RX_FRAMING_ERROR(x) ((x) << S_IESPI1_RX_FRAMING_ERROR)
+#define F_IESPI1_RX_FRAMING_ERROR    V_IESPI1_RX_FRAMING_ERROR(1U)
+
+#define S_IESPI0_TX_FRAMING_ERROR    13
+#define V_IESPI0_TX_FRAMING_ERROR(x) ((x) << S_IESPI0_TX_FRAMING_ERROR)
+#define F_IESPI0_TX_FRAMING_ERROR    V_IESPI0_TX_FRAMING_ERROR(1U)
+
+#define S_IESPI1_TX_FRAMING_ERROR    12
+#define V_IESPI1_TX_FRAMING_ERROR(x) ((x) << S_IESPI1_TX_FRAMING_ERROR)
+#define F_IESPI1_TX_FRAMING_ERROR    V_IESPI1_TX_FRAMING_ERROR(1U)
+
+#define S_OCSPI0_RX_FRAMING_ERROR    11
+#define V_OCSPI0_RX_FRAMING_ERROR(x) ((x) << S_OCSPI0_RX_FRAMING_ERROR)
+#define F_OCSPI0_RX_FRAMING_ERROR    V_OCSPI0_RX_FRAMING_ERROR(1U)
+
+#define S_OCSPI1_RX_FRAMING_ERROR    10
+#define V_OCSPI1_RX_FRAMING_ERROR(x) ((x) << S_OCSPI1_RX_FRAMING_ERROR)
+#define F_OCSPI1_RX_FRAMING_ERROR    V_OCSPI1_RX_FRAMING_ERROR(1U)
+
+#define S_OCSPI0_TX_FRAMING_ERROR    9
+#define V_OCSPI0_TX_FRAMING_ERROR(x) ((x) << S_OCSPI0_TX_FRAMING_ERROR)
+#define F_OCSPI0_TX_FRAMING_ERROR    V_OCSPI0_TX_FRAMING_ERROR(1U)
+
+#define S_OCSPI1_TX_FRAMING_ERROR    8
+#define V_OCSPI1_TX_FRAMING_ERROR(x) ((x) << S_OCSPI1_TX_FRAMING_ERROR)
+#define F_OCSPI1_TX_FRAMING_ERROR    V_OCSPI1_TX_FRAMING_ERROR(1U)
+
+#define S_OCSPI0_OFIFO2X_TX_FRAMING_ERROR    7
+#define V_OCSPI0_OFIFO2X_TX_FRAMING_ERROR(x) ((x) << S_OCSPI0_OFIFO2X_TX_FRAMING_ERROR)
+#define F_OCSPI0_OFIFO2X_TX_FRAMING_ERROR    V_OCSPI0_OFIFO2X_TX_FRAMING_ERROR(1U)
+
+#define S_OCSPI1_OFIFO2X_TX_FRAMING_ERROR    6
+#define V_OCSPI1_OFIFO2X_TX_FRAMING_ERROR(x) ((x) << S_OCSPI1_OFIFO2X_TX_FRAMING_ERROR)
+#define F_OCSPI1_OFIFO2X_TX_FRAMING_ERROR    V_OCSPI1_OFIFO2X_TX_FRAMING_ERROR(1U)
+
+#define S_IESPI_PAR_ERROR    3
+#define M_IESPI_PAR_ERROR    0x7
+
+#define V_IESPI_PAR_ERROR(x) ((x) << S_IESPI_PAR_ERROR)
+
+#define S_OCSPI_PAR_ERROR    0
+#define M_OCSPI_PAR_ERROR    0x7
+
+#define V_OCSPI_PAR_ERROR(x) ((x) << S_OCSPI_PAR_ERROR)
+
+#define A_PM1_RX_INT_CAUSE 0x5dc
+
+#define A_PM1_TX_CFG 0x5e0
+#define A_PM1_TX_MODE 0x5e4
+
+#define A_PM1_TX_INT_ENABLE 0x5f8
+
+#define S_ZERO_C_CMD_ERROR    18
+#define V_ZERO_C_CMD_ERROR(x) ((x) << S_ZERO_C_CMD_ERROR)
+#define F_ZERO_C_CMD_ERROR    V_ZERO_C_CMD_ERROR(1U)
+
+#define S_ICSPI0_FIFO2X_RX_FRAMING_ERROR    17
+#define V_ICSPI0_FIFO2X_RX_FRAMING_ERROR(x) ((x) << S_ICSPI0_FIFO2X_RX_FRAMING_ERROR)
+#define F_ICSPI0_FIFO2X_RX_FRAMING_ERROR    V_ICSPI0_FIFO2X_RX_FRAMING_ERROR(1U)
+
+#define S_ICSPI1_FIFO2X_RX_FRAMING_ERROR    16
+#define V_ICSPI1_FIFO2X_RX_FRAMING_ERROR(x) ((x) << S_ICSPI1_FIFO2X_RX_FRAMING_ERROR)
+#define F_ICSPI1_FIFO2X_RX_FRAMING_ERROR    V_ICSPI1_FIFO2X_RX_FRAMING_ERROR(1U)
+
+#define S_ICSPI0_RX_FRAMING_ERROR    15
+#define V_ICSPI0_RX_FRAMING_ERROR(x) ((x) << S_ICSPI0_RX_FRAMING_ERROR)
+#define F_ICSPI0_RX_FRAMING_ERROR    V_ICSPI0_RX_FRAMING_ERROR(1U)
+
+#define S_ICSPI1_RX_FRAMING_ERROR    14
+#define V_ICSPI1_RX_FRAMING_ERROR(x) ((x) << S_ICSPI1_RX_FRAMING_ERROR)
+#define F_ICSPI1_RX_FRAMING_ERROR    V_ICSPI1_RX_FRAMING_ERROR(1U)
+
+#define S_ICSPI0_TX_FRAMING_ERROR    13
+#define V_ICSPI0_TX_FRAMING_ERROR(x) ((x) << S_ICSPI0_TX_FRAMING_ERROR)
+#define F_ICSPI0_TX_FRAMING_ERROR    V_ICSPI0_TX_FRAMING_ERROR(1U)
+
+#define S_ICSPI1_TX_FRAMING_ERROR    12
+#define V_ICSPI1_TX_FRAMING_ERROR(x) ((x) << S_ICSPI1_TX_FRAMING_ERROR)
+#define F_ICSPI1_TX_FRAMING_ERROR    V_ICSPI1_TX_FRAMING_ERROR(1U)
+
+#define S_OESPI0_RX_FRAMING_ERROR    11
+#define V_OESPI0_RX_FRAMING_ERROR(x) ((x) << S_OESPI0_RX_FRAMING_ERROR)
+#define F_OESPI0_RX_FRAMING_ERROR    V_OESPI0_RX_FRAMING_ERROR(1U)
+
+#define S_OESPI1_RX_FRAMING_ERROR    10
+#define V_OESPI1_RX_FRAMING_ERROR(x) ((x) << S_OESPI1_RX_FRAMING_ERROR)
+#define F_OESPI1_RX_FRAMING_ERROR    V_OESPI1_RX_FRAMING_ERROR(1U)
+
+#define S_OESPI0_TX_FRAMING_ERROR    9
+#define V_OESPI0_TX_FRAMING_ERROR(x) ((x) << S_OESPI0_TX_FRAMING_ERROR)
+#define F_OESPI0_TX_FRAMING_ERROR    V_OESPI0_TX_FRAMING_ERROR(1U)
+
+#define S_OESPI1_TX_FRAMING_ERROR    8
+#define V_OESPI1_TX_FRAMING_ERROR(x) ((x) << S_OESPI1_TX_FRAMING_ERROR)
+#define F_OESPI1_TX_FRAMING_ERROR    V_OESPI1_TX_FRAMING_ERROR(1U)
+
+#define S_OESPI0_OFIFO2X_TX_FRAMING_ERROR    7
+#define V_OESPI0_OFIFO2X_TX_FRAMING_ERROR(x) ((x) << S_OESPI0_OFIFO2X_TX_FRAMING_ERROR)
+#define F_OESPI0_OFIFO2X_TX_FRAMING_ERROR    V_OESPI0_OFIFO2X_TX_FRAMING_ERROR(1U)
+
+#define S_OESPI1_OFIFO2X_TX_FRAMING_ERROR    6
+#define V_OESPI1_OFIFO2X_TX_FRAMING_ERROR(x) ((x) << S_OESPI1_OFIFO2X_TX_FRAMING_ERROR)
+#define F_OESPI1_OFIFO2X_TX_FRAMING_ERROR    V_OESPI1_OFIFO2X_TX_FRAMING_ERROR(1U)
+
+#define S_ICSPI_PAR_ERROR    3
+#define M_ICSPI_PAR_ERROR    0x7
+
+#define V_ICSPI_PAR_ERROR(x) ((x) << S_ICSPI_PAR_ERROR)
+
+#define S_OESPI_PAR_ERROR    0
+#define M_OESPI_PAR_ERROR    0x7
+
+#define V_OESPI_PAR_ERROR(x) ((x) << S_OESPI_PAR_ERROR)
+
+#define A_PM1_TX_INT_CAUSE 0x5fc
+
+#define A_MPS_CFG 0x600
+
+#define S_TPRXPORTEN    4
+#define V_TPRXPORTEN(x) ((x) << S_TPRXPORTEN)
+#define F_TPRXPORTEN    V_TPRXPORTEN(1U)
+
+#define S_TPTXPORT1EN    3
+#define V_TPTXPORT1EN(x) ((x) << S_TPTXPORT1EN)
+#define F_TPTXPORT1EN    V_TPTXPORT1EN(1U)
+
+#define S_TPTXPORT0EN    2
+#define V_TPTXPORT0EN(x) ((x) << S_TPTXPORT0EN)
+#define F_TPTXPORT0EN    V_TPTXPORT0EN(1U)
+
+#define S_PORT1ACTIVE    1
+#define V_PORT1ACTIVE(x) ((x) << S_PORT1ACTIVE)
+#define F_PORT1ACTIVE    V_PORT1ACTIVE(1U)
+
+#define S_PORT0ACTIVE    0
+#define V_PORT0ACTIVE(x) ((x) << S_PORT0ACTIVE)
+#define F_PORT0ACTIVE    V_PORT0ACTIVE(1U)
+
+#define S_ENFORCEPKT    11
+#define V_ENFORCEPKT(x) ((x) << S_ENFORCEPKT)
+#define F_ENFORCEPKT    V_ENFORCEPKT(1U)
+
+#define A_MPS_INT_ENABLE 0x61c
+
+#define S_MCAPARERRENB    6
+#define M_MCAPARERRENB    0x7
+
+#define V_MCAPARERRENB(x) ((x) << S_MCAPARERRENB)
+
+#define S_RXTPPARERRENB    4
+#define M_RXTPPARERRENB    0x3
+
+#define V_RXTPPARERRENB(x) ((x) << S_RXTPPARERRENB)
+
+#define S_TX1TPPARERRENB    2
+#define M_TX1TPPARERRENB    0x3
+
+#define V_TX1TPPARERRENB(x) ((x) << S_TX1TPPARERRENB)
+
+#define S_TX0TPPARERRENB    0
+#define M_TX0TPPARERRENB    0x3
+
+#define V_TX0TPPARERRENB(x) ((x) << S_TX0TPPARERRENB)
+
+#define A_MPS_INT_CAUSE 0x620
+
+#define S_MCAPARERR    6
+#define M_MCAPARERR    0x7
+
+#define V_MCAPARERR(x) ((x) << S_MCAPARERR)
+
+#define S_RXTPPARERR    4
+#define M_RXTPPARERR    0x3
+
+#define V_RXTPPARERR(x) ((x) << S_RXTPPARERR)
+
+#define S_TX1TPPARERR    2
+#define M_TX1TPPARERR    0x3
+
+#define V_TX1TPPARERR(x) ((x) << S_TX1TPPARERR)
+
+#define S_TX0TPPARERR    0
+#define M_TX0TPPARERR    0x3
+
+#define V_TX0TPPARERR(x) ((x) << S_TX0TPPARERR)
+
+#define A_CPL_SWITCH_CNTRL 0x640
+
+#define A_CPL_INTR_ENABLE 0x650
+
+#define S_CIM_OP_MAP_PERR    5
+#define V_CIM_OP_MAP_PERR(x) ((x) << S_CIM_OP_MAP_PERR)
+#define F_CIM_OP_MAP_PERR    V_CIM_OP_MAP_PERR(1U)
+
+#define S_CIM_OVFL_ERROR    4
+#define V_CIM_OVFL_ERROR(x) ((x) << S_CIM_OVFL_ERROR)
+#define F_CIM_OVFL_ERROR    V_CIM_OVFL_ERROR(1U)
+
+#define S_TP_FRAMING_ERROR    3
+#define V_TP_FRAMING_ERROR(x) ((x) << S_TP_FRAMING_ERROR)
+#define F_TP_FRAMING_ERROR    V_TP_FRAMING_ERROR(1U)
+
+#define S_SGE_FRAMING_ERROR    2
+#define V_SGE_FRAMING_ERROR(x) ((x) << S_SGE_FRAMING_ERROR)
+#define F_SGE_FRAMING_ERROR    V_SGE_FRAMING_ERROR(1U)
+
+#define S_CIM_FRAMING_ERROR    1
+#define V_CIM_FRAMING_ERROR(x) ((x) << S_CIM_FRAMING_ERROR)
+#define F_CIM_FRAMING_ERROR    V_CIM_FRAMING_ERROR(1U)
+
+#define S_ZERO_SWITCH_ERROR    0
+#define V_ZERO_SWITCH_ERROR(x) ((x) << S_ZERO_SWITCH_ERROR)
+#define F_ZERO_SWITCH_ERROR    V_ZERO_SWITCH_ERROR(1U)
+
+#define A_CPL_INTR_CAUSE 0x654
+
+#define A_CPL_MAP_TBL_DATA 0x65c
+
+#define A_SMB_GLOBAL_TIME_CFG 0x660
+
+#define A_I2C_CFG 0x6a0
+
+#define S_I2C_CLKDIV    0
+#define M_I2C_CLKDIV    0xfff
+#define V_I2C_CLKDIV(x) ((x) << S_I2C_CLKDIV)
+
+#define A_MI1_CFG 0x6b0
+
+#define S_CLKDIV    5
+#define M_CLKDIV    0xff
+#define V_CLKDIV(x) ((x) << S_CLKDIV)
+
+#define S_ST    3
+
+#define M_ST    0x3
+
+#define V_ST(x) ((x) << S_ST)
+
+#define G_ST(x) (((x) >> S_ST) & M_ST)
+
+#define S_PREEN    2
+#define V_PREEN(x) ((x) << S_PREEN)
+#define F_PREEN    V_PREEN(1U)
+
+#define S_MDIINV    1
+#define V_MDIINV(x) ((x) << S_MDIINV)
+#define F_MDIINV    V_MDIINV(1U)
+
+#define S_MDIEN    0
+#define V_MDIEN(x) ((x) << S_MDIEN)
+#define F_MDIEN    V_MDIEN(1U)
+
+#define A_MI1_ADDR 0x6b4
+
+#define S_PHYADDR    5
+#define M_PHYADDR    0x1f
+#define V_PHYADDR(x) ((x) << S_PHYADDR)
+
+#define S_REGADDR    0
+#define M_REGADDR    0x1f
+#define V_REGADDR(x) ((x) << S_REGADDR)
+
+#define A_MI1_DATA 0x6b8
+
+#define A_MI1_OP 0x6bc
+
+#define S_MDI_OP    0
+#define M_MDI_OP    0x3
+#define V_MDI_OP(x) ((x) << S_MDI_OP)
+
+#define A_SF_DATA 0x6d8
+
+#define A_SF_OP 0x6dc
+
+#define S_BYTECNT    1
+#define M_BYTECNT    0x3
+#define V_BYTECNT(x) ((x) << S_BYTECNT)
+
+#define A_PL_INT_ENABLE0 0x6e0
+
+#define S_T3DBG    23
+#define V_T3DBG(x) ((x) << S_T3DBG)
+#define F_T3DBG    V_T3DBG(1U)
+
+#define S_XGMAC0_1    20
+#define V_XGMAC0_1(x) ((x) << S_XGMAC0_1)
+#define F_XGMAC0_1    V_XGMAC0_1(1U)
+
+#define S_XGMAC0_0    19
+#define V_XGMAC0_0(x) ((x) << S_XGMAC0_0)
+#define F_XGMAC0_0    V_XGMAC0_0(1U)
+
+#define S_MC5A    18
+#define V_MC5A(x) ((x) << S_MC5A)
+#define F_MC5A    V_MC5A(1U)
+
+#define S_CPL_SWITCH    12
+#define V_CPL_SWITCH(x) ((x) << S_CPL_SWITCH)
+#define F_CPL_SWITCH    V_CPL_SWITCH(1U)
+
+#define S_MPS0    11
+#define V_MPS0(x) ((x) << S_MPS0)
+#define F_MPS0    V_MPS0(1U)
+
+#define S_PM1_TX    10
+#define V_PM1_TX(x) ((x) << S_PM1_TX)
+#define F_PM1_TX    V_PM1_TX(1U)
+
+#define S_PM1_RX    9
+#define V_PM1_RX(x) ((x) << S_PM1_RX)
+#define F_PM1_RX    V_PM1_RX(1U)
+
+#define S_ULP2_TX    8
+#define V_ULP2_TX(x) ((x) << S_ULP2_TX)
+#define F_ULP2_TX    V_ULP2_TX(1U)
+
+#define S_ULP2_RX    7
+#define V_ULP2_RX(x) ((x) << S_ULP2_RX)
+#define F_ULP2_RX    V_ULP2_RX(1U)
+
+#define S_TP1    6
+#define V_TP1(x) ((x) << S_TP1)
+#define F_TP1    V_TP1(1U)
+
+#define S_CIM    5
+#define V_CIM(x) ((x) << S_CIM)
+#define F_CIM    V_CIM(1U)
+
+#define S_MC7_CM    4
+#define V_MC7_CM(x) ((x) << S_MC7_CM)
+#define F_MC7_CM    V_MC7_CM(1U)
+
+#define S_MC7_PMTX    3
+#define V_MC7_PMTX(x) ((x) << S_MC7_PMTX)
+#define F_MC7_PMTX    V_MC7_PMTX(1U)
+
+#define S_MC7_PMRX    2
+#define V_MC7_PMRX(x) ((x) << S_MC7_PMRX)
+#define F_MC7_PMRX    V_MC7_PMRX(1U)
+
+#define S_PCIM0    1
+#define V_PCIM0(x) ((x) << S_PCIM0)
+#define F_PCIM0    V_PCIM0(1U)
+
+#define S_SGE3    0
+#define V_SGE3(x) ((x) << S_SGE3)
+#define F_SGE3    V_SGE3(1U)
+
+#define A_PL_INT_CAUSE0 0x6e4
+
+#define A_PL_RST 0x6f0
+
+#define S_FATALPERREN    4
+#define V_FATALPERREN(x) ((x) << S_FATALPERREN)
+#define F_FATALPERREN    V_FATALPERREN(1U)
+
+#define S_CRSTWRM    1
+#define V_CRSTWRM(x) ((x) << S_CRSTWRM)
+#define F_CRSTWRM    V_CRSTWRM(1U)
+
+#define A_PL_REV 0x6f4
+
+#define A_PL_CLI 0x6f8
+
+#define A_MC5_DB_CONFIG 0x704
+
+#define S_TMTYPEHI    30
+#define V_TMTYPEHI(x) ((x) << S_TMTYPEHI)
+#define F_TMTYPEHI    V_TMTYPEHI(1U)
+
+#define S_TMPARTSIZE    28
+#define M_TMPARTSIZE    0x3
+#define V_TMPARTSIZE(x) ((x) << S_TMPARTSIZE)
+#define G_TMPARTSIZE(x) (((x) >> S_TMPARTSIZE) & M_TMPARTSIZE)
+
+#define S_TMTYPE    26
+#define M_TMTYPE    0x3
+#define V_TMTYPE(x) ((x) << S_TMTYPE)
+#define G_TMTYPE(x) (((x) >> S_TMTYPE) & M_TMTYPE)
+
+#define S_COMPEN    17
+#define V_COMPEN(x) ((x) << S_COMPEN)
+#define F_COMPEN    V_COMPEN(1U)
+
+#define S_PRTYEN    6
+#define V_PRTYEN(x) ((x) << S_PRTYEN)
+#define F_PRTYEN    V_PRTYEN(1U)
+
+#define S_MBUSEN    5
+#define V_MBUSEN(x) ((x) << S_MBUSEN)
+#define F_MBUSEN    V_MBUSEN(1U)
+
+#define S_DBGIEN    4
+#define V_DBGIEN(x) ((x) << S_DBGIEN)
+#define F_DBGIEN    V_DBGIEN(1U)
+
+#define S_TMRDY    2
+#define V_TMRDY(x) ((x) << S_TMRDY)
+#define F_TMRDY    V_TMRDY(1U)
+
+#define S_TMRST    1
+#define V_TMRST(x) ((x) << S_TMRST)
+#define F_TMRST    V_TMRST(1U)
+
+#define S_TMMODE    0
+#define V_TMMODE(x) ((x) << S_TMMODE)
+#define F_TMMODE    V_TMMODE(1U)
+
+#define A_MC5_DB_ROUTING_TABLE_INDEX 0x70c
+
+#define A_MC5_DB_FILTER_TABLE 0x710
+
+#define A_MC5_DB_SERVER_INDEX 0x714
+
+#define A_MC5_DB_RSP_LATENCY 0x720
+
+#define S_RDLAT    16
+#define M_RDLAT    0x1f
+#define V_RDLAT(x) ((x) << S_RDLAT)
+
+#define S_LRNLAT    8
+#define M_LRNLAT    0x1f
+#define V_LRNLAT(x) ((x) << S_LRNLAT)
+
+#define S_SRCHLAT    0
+#define M_SRCHLAT    0x1f
+#define V_SRCHLAT(x) ((x) << S_SRCHLAT)
+
+#define A_MC5_DB_PART_ID_INDEX 0x72c
+
+#define A_MC5_DB_INT_ENABLE 0x740
+
+#define S_DELACTEMPTY    18
+#define V_DELACTEMPTY(x) ((x) << S_DELACTEMPTY)
+#define F_DELACTEMPTY    V_DELACTEMPTY(1U)
+
+#define S_DISPQPARERR    17
+#define V_DISPQPARERR(x) ((x) << S_DISPQPARERR)
+#define F_DISPQPARERR    V_DISPQPARERR(1U)
+
+#define S_REQQPARERR    16
+#define V_REQQPARERR(x) ((x) << S_REQQPARERR)
+#define F_REQQPARERR    V_REQQPARERR(1U)
+
+#define S_UNKNOWNCMD    15
+#define V_UNKNOWNCMD(x) ((x) << S_UNKNOWNCMD)
+#define F_UNKNOWNCMD    V_UNKNOWNCMD(1U)
+
+#define S_NFASRCHFAIL    8
+#define V_NFASRCHFAIL(x) ((x) << S_NFASRCHFAIL)
+#define F_NFASRCHFAIL    V_NFASRCHFAIL(1U)
+
+#define S_ACTRGNFULL    7
+#define V_ACTRGNFULL(x) ((x) << S_ACTRGNFULL)
+#define F_ACTRGNFULL    V_ACTRGNFULL(1U)
+
+#define S_PARITYERR    6
+#define V_PARITYERR(x) ((x) << S_PARITYERR)
+#define F_PARITYERR    V_PARITYERR(1U)
+
+#define A_MC5_DB_INT_CAUSE 0x744
+
+#define A_MC5_DB_DBGI_CONFIG 0x774
+
+#define A_MC5_DB_DBGI_REQ_CMD 0x778
+
+#define A_MC5_DB_DBGI_REQ_ADDR0 0x77c
+
+#define A_MC5_DB_DBGI_REQ_ADDR1 0x780
+
+#define A_MC5_DB_DBGI_REQ_ADDR2 0x784
+
+#define A_MC5_DB_DBGI_REQ_DATA0 0x788
+
+#define A_MC5_DB_DBGI_REQ_DATA1 0x78c
+
+#define A_MC5_DB_DBGI_REQ_DATA2 0x790
+
+#define A_MC5_DB_DBGI_RSP_STATUS 0x7b0
+
+#define S_DBGIRSPVALID    0
+#define V_DBGIRSPVALID(x) ((x) << S_DBGIRSPVALID)
+#define F_DBGIRSPVALID    V_DBGIRSPVALID(1U)
+
+#define A_MC5_DB_DBGI_RSP_DATA0 0x7b4
+
+#define A_MC5_DB_DBGI_RSP_DATA1 0x7b8
+
+#define A_MC5_DB_DBGI_RSP_DATA2 0x7bc
+
+#define A_MC5_DB_POPEN_DATA_WR_CMD 0x7cc
+
+#define A_MC5_DB_POPEN_MASK_WR_CMD 0x7d0
+
+#define A_MC5_DB_AOPEN_SRCH_CMD 0x7d4
+
+#define A_MC5_DB_AOPEN_LRN_CMD 0x7d8
+
+#define A_MC5_DB_SYN_SRCH_CMD 0x7dc
+
+#define A_MC5_DB_SYN_LRN_CMD 0x7e0
+
+#define A_MC5_DB_ACK_SRCH_CMD 0x7e4
+
+#define A_MC5_DB_ACK_LRN_CMD 0x7e8
+
+#define A_MC5_DB_ILOOKUP_CMD 0x7ec
+
+#define A_MC5_DB_ELOOKUP_CMD 0x7f0
+
+#define A_MC5_DB_DATA_WRITE_CMD 0x7f4
+
+#define A_MC5_DB_DATA_READ_CMD 0x7f8
+
+#define XGMAC0_0_BASE_ADDR 0x800
+
+#define A_XGM_TX_CTRL 0x800
+
+#define S_TXEN    0
+#define V_TXEN(x) ((x) << S_TXEN)
+#define F_TXEN    V_TXEN(1U)
+
+#define A_XGM_TX_CFG 0x804
+
+#define S_TXPAUSEEN    0
+#define V_TXPAUSEEN(x) ((x) << S_TXPAUSEEN)
+#define F_TXPAUSEEN    V_TXPAUSEEN(1U)
+
+#define A_XGM_TX_PAUSE_QUANTA 0x808
+
+#define A_XGM_RX_CTRL 0x80c
+
+#define S_RXEN    0
+#define V_RXEN(x) ((x) << S_RXEN)
+#define F_RXEN    V_RXEN(1U)
+
+#define A_XGM_RX_CFG 0x810
+
+#define S_DISPAUSEFRAMES    9
+#define V_DISPAUSEFRAMES(x) ((x) << S_DISPAUSEFRAMES)
+#define F_DISPAUSEFRAMES    V_DISPAUSEFRAMES(1U)
+
+#define S_EN1536BFRAMES    8
+#define V_EN1536BFRAMES(x) ((x) << S_EN1536BFRAMES)
+#define F_EN1536BFRAMES    V_EN1536BFRAMES(1U)
+
+#define S_ENJUMBO    7
+#define V_ENJUMBO(x) ((x) << S_ENJUMBO)
+#define F_ENJUMBO    V_ENJUMBO(1U)
+
+#define S_RMFCS    6
+#define V_RMFCS(x) ((x) << S_RMFCS)
+#define F_RMFCS    V_RMFCS(1U)
+
+#define S_ENHASHMCAST    2
+#define V_ENHASHMCAST(x) ((x) << S_ENHASHMCAST)
+#define F_ENHASHMCAST    V_ENHASHMCAST(1U)
+
+#define S_COPYALLFRAMES    0
+#define V_COPYALLFRAMES(x) ((x) << S_COPYALLFRAMES)
+#define F_COPYALLFRAMES    V_COPYALLFRAMES(1U)
+
+#define S_DISBCAST    1
+#define V_DISBCAST(x) ((x) << S_DISBCAST)
+#define F_DISBCAST    V_DISBCAST(1U)
+
+#define A_XGM_RX_HASH_LOW 0x814
+
+#define A_XGM_RX_HASH_HIGH 0x818
+
+#define A_XGM_RX_EXACT_MATCH_LOW_1 0x81c
+
+#define A_XGM_RX_EXACT_MATCH_HIGH_1 0x820
+
+#define A_XGM_RX_EXACT_MATCH_LOW_2 0x824
+
+#define A_XGM_RX_EXACT_MATCH_LOW_3 0x82c
+
+#define A_XGM_RX_EXACT_MATCH_LOW_4 0x834
+
+#define A_XGM_RX_EXACT_MATCH_LOW_5 0x83c
+
+#define A_XGM_RX_EXACT_MATCH_LOW_6 0x844
+
+#define A_XGM_RX_EXACT_MATCH_LOW_7 0x84c
+
+#define A_XGM_RX_EXACT_MATCH_LOW_8 0x854
+
+#define A_XGM_INT_STATUS 0x86c
+
+#define S_LINKFAULTCHANGE    9
+#define V_LINKFAULTCHANGE(x) ((x) << S_LINKFAULTCHANGE)
+#define F_LINKFAULTCHANGE    V_LINKFAULTCHANGE(1U)
+
+#define A_XGM_XGM_INT_ENABLE 0x874
+#define A_XGM_XGM_INT_DISABLE 0x878
+
+#define A_XGM_STAT_CTRL 0x880
+
+#define S_CLRSTATS    2
+#define V_CLRSTATS(x) ((x) << S_CLRSTATS)
+#define F_CLRSTATS    V_CLRSTATS(1U)
+
+#define A_XGM_RXFIFO_CFG 0x884
+
+#define S_RXFIFO_EMPTY    31
+#define V_RXFIFO_EMPTY(x) ((x) << S_RXFIFO_EMPTY)
+#define F_RXFIFO_EMPTY    V_RXFIFO_EMPTY(1U)
+
+#define S_RXFIFOPAUSEHWM    17
+#define M_RXFIFOPAUSEHWM    0xfff
+
+#define V_RXFIFOPAUSEHWM(x) ((x) << S_RXFIFOPAUSEHWM)
+
+#define G_RXFIFOPAUSEHWM(x) (((x) >> S_RXFIFOPAUSEHWM) & M_RXFIFOPAUSEHWM)
+
+#define S_RXFIFOPAUSELWM    5
+#define M_RXFIFOPAUSELWM    0xfff
+
+#define V_RXFIFOPAUSELWM(x) ((x) << S_RXFIFOPAUSELWM)
+
+#define G_RXFIFOPAUSELWM(x) (((x) >> S_RXFIFOPAUSELWM) & M_RXFIFOPAUSELWM)
+
+#define S_RXSTRFRWRD    1
+#define V_RXSTRFRWRD(x) ((x) << S_RXSTRFRWRD)
+#define F_RXSTRFRWRD    V_RXSTRFRWRD(1U)
+
+#define S_DISERRFRAMES    0
+#define V_DISERRFRAMES(x) ((x) << S_DISERRFRAMES)
+#define F_DISERRFRAMES    V_DISERRFRAMES(1U)
+
+#define A_XGM_TXFIFO_CFG 0x888
+
+#define S_UNDERUNFIX    22
+#define V_UNDERUNFIX(x) ((x) << S_UNDERUNFIX)
+#define F_UNDERUNFIX    V_UNDERUNFIX(1U)
+
+#define S_TXIPG    13
+#define M_TXIPG    0xff
+#define V_TXIPG(x) ((x) << S_TXIPG)
+#define G_TXIPG(x) (((x) >> S_TXIPG) & M_TXIPG)
+
+#define S_TXFIFOTHRESH    4
+#define M_TXFIFOTHRESH    0x1ff
+
+#define V_TXFIFOTHRESH(x) ((x) << S_TXFIFOTHRESH)
+
+#define S_ENDROPPKT    21
+#define V_ENDROPPKT(x) ((x) << S_ENDROPPKT)
+#define F_ENDROPPKT    V_ENDROPPKT(1U)
+
+#define A_XGM_SERDES_CTRL 0x890
+#define A_XGM_SERDES_CTRL0 0x8e0
+
+#define S_SERDESRESET_    24
+#define V_SERDESRESET_(x) ((x) << S_SERDESRESET_)
+#define F_SERDESRESET_    V_SERDESRESET_(1U)
+
+#define S_RXENABLE    4
+#define V_RXENABLE(x) ((x) << S_RXENABLE)
+#define F_RXENABLE    V_RXENABLE(1U)
+
+#define S_TXENABLE    3
+#define V_TXENABLE(x) ((x) << S_TXENABLE)
+#define F_TXENABLE    V_TXENABLE(1U)
+
+#define A_XGM_PAUSE_TIMER 0x890
+
+#define A_XGM_RGMII_IMP 0x89c
+
+#define S_XGM_IMPSETUPDATE    6
+#define V_XGM_IMPSETUPDATE(x) ((x) << S_XGM_IMPSETUPDATE)
+#define F_XGM_IMPSETUPDATE    V_XGM_IMPSETUPDATE(1U)
+
+#define S_RGMIIIMPPD    3
+#define M_RGMIIIMPPD    0x7
+#define V_RGMIIIMPPD(x) ((x) << S_RGMIIIMPPD)
+
+#define S_RGMIIIMPPU    0
+#define M_RGMIIIMPPU    0x7
+#define V_RGMIIIMPPU(x) ((x) << S_RGMIIIMPPU)
+
+#define S_CALRESET    8
+#define V_CALRESET(x) ((x) << S_CALRESET)
+#define F_CALRESET    V_CALRESET(1U)
+
+#define S_CALUPDATE    7
+#define V_CALUPDATE(x) ((x) << S_CALUPDATE)
+#define F_CALUPDATE    V_CALUPDATE(1U)
+
+#define A_XGM_XAUI_IMP 0x8a0
+
+#define S_CALBUSY    31
+#define V_CALBUSY(x) ((x) << S_CALBUSY)
+#define F_CALBUSY    V_CALBUSY(1U)
+
+#define S_XGM_CALFAULT    29
+#define V_XGM_CALFAULT(x) ((x) << S_XGM_CALFAULT)
+#define F_XGM_CALFAULT    V_XGM_CALFAULT(1U)
+
+#define S_CALIMP    24
+#define M_CALIMP    0x1f
+#define V_CALIMP(x) ((x) << S_CALIMP)
+#define G_CALIMP(x) (((x) >> S_CALIMP) & M_CALIMP)
+
+#define S_XAUIIMP    0
+#define M_XAUIIMP    0x7
+#define V_XAUIIMP(x) ((x) << S_XAUIIMP)
+
+#define A_XGM_RX_MAX_PKT_SIZE 0x8a8
+
+#define S_RXMAXFRAMERSIZE    17
+#define M_RXMAXFRAMERSIZE    0x3fff
+#define V_RXMAXFRAMERSIZE(x) ((x) << S_RXMAXFRAMERSIZE)
+#define G_RXMAXFRAMERSIZE(x) (((x) >> S_RXMAXFRAMERSIZE) & M_RXMAXFRAMERSIZE)
+
+#define S_RXENFRAMER    14
+#define V_RXENFRAMER(x) ((x) << S_RXENFRAMER)
+#define F_RXENFRAMER    V_RXENFRAMER(1U)
+
+#define S_RXMAXPKTSIZE    0
+#define M_RXMAXPKTSIZE    0x3fff
+#define V_RXMAXPKTSIZE(x) ((x) << S_RXMAXPKTSIZE)
+#define G_RXMAXPKTSIZE(x) (((x) >> S_RXMAXPKTSIZE) & M_RXMAXPKTSIZE)
+
+#define A_XGM_RESET_CTRL 0x8ac
+
+#define S_XGMAC_STOP_EN    4
+#define V_XGMAC_STOP_EN(x) ((x) << S_XGMAC_STOP_EN)
+#define F_XGMAC_STOP_EN    V_XGMAC_STOP_EN(1U)
+
+#define S_XG2G_RESET_    3
+#define V_XG2G_RESET_(x) ((x) << S_XG2G_RESET_)
+#define F_XG2G_RESET_    V_XG2G_RESET_(1U)
+
+#define S_RGMII_RESET_    2
+#define V_RGMII_RESET_(x) ((x) << S_RGMII_RESET_)
+#define F_RGMII_RESET_    V_RGMII_RESET_(1U)
+
+#define S_PCS_RESET_    1
+#define V_PCS_RESET_(x) ((x) << S_PCS_RESET_)
+#define F_PCS_RESET_    V_PCS_RESET_(1U)
+
+#define S_MAC_RESET_    0
+#define V_MAC_RESET_(x) ((x) << S_MAC_RESET_)
+#define F_MAC_RESET_    V_MAC_RESET_(1U)
+
+#define A_XGM_PORT_CFG 0x8b8
+
+#define S_CLKDIVRESET_    3
+#define V_CLKDIVRESET_(x) ((x) << S_CLKDIVRESET_)
+#define F_CLKDIVRESET_    V_CLKDIVRESET_(1U)
+
+#define S_PORTSPEED    1
+#define M_PORTSPEED    0x3
+
+#define V_PORTSPEED(x) ((x) << S_PORTSPEED)
+
+#define S_ENRGMII    0
+#define V_ENRGMII(x) ((x) << S_ENRGMII)
+#define F_ENRGMII    V_ENRGMII(1U)
+
+#define A_XGM_INT_ENABLE 0x8d4
+
+#define S_TXFIFO_PRTY_ERR    17
+#define M_TXFIFO_PRTY_ERR    0x7
+
+#define V_TXFIFO_PRTY_ERR(x) ((x) << S_TXFIFO_PRTY_ERR)
+
+#define S_RXFIFO_PRTY_ERR    14
+#define M_RXFIFO_PRTY_ERR    0x7
+
+#define V_RXFIFO_PRTY_ERR(x) ((x) << S_RXFIFO_PRTY_ERR)
+
+#define S_TXFIFO_UNDERRUN    13
+#define V_TXFIFO_UNDERRUN(x) ((x) << S_TXFIFO_UNDERRUN)
+#define F_TXFIFO_UNDERRUN    V_TXFIFO_UNDERRUN(1U)
+
+#define S_RXFIFO_OVERFLOW    12
+#define V_RXFIFO_OVERFLOW(x) ((x) << S_RXFIFO_OVERFLOW)
+#define F_RXFIFO_OVERFLOW    V_RXFIFO_OVERFLOW(1U)
+
+#define S_SERDES_LOS    4
+#define M_SERDES_LOS    0xf
+
+#define V_SERDES_LOS(x) ((x) << S_SERDES_LOS)
+
+#define S_XAUIPCSCTCERR    3
+#define V_XAUIPCSCTCERR(x) ((x) << S_XAUIPCSCTCERR)
+#define F_XAUIPCSCTCERR    V_XAUIPCSCTCERR(1U)
+
+#define S_XAUIPCSALIGNCHANGE    2
+#define V_XAUIPCSALIGNCHANGE(x) ((x) << S_XAUIPCSALIGNCHANGE)
+#define F_XAUIPCSALIGNCHANGE    V_XAUIPCSALIGNCHANGE(1U)
+
+#define S_XGM_INT    0
+#define V_XGM_INT(x) ((x) << S_XGM_INT)
+#define F_XGM_INT    V_XGM_INT(1U)
+
+#define A_XGM_INT_CAUSE 0x8d8
+
+#define A_XGM_XAUI_ACT_CTRL 0x8dc
+
+#define S_TXACTENABLE    1
+#define V_TXACTENABLE(x) ((x) << S_TXACTENABLE)
+#define F_TXACTENABLE    V_TXACTENABLE(1U)
+
+#define S_RESET3    23
+#define V_RESET3(x) ((x) << S_RESET3)
+#define F_RESET3    V_RESET3(1U)
+
+#define S_RESET2    22
+#define V_RESET2(x) ((x) << S_RESET2)
+#define F_RESET2    V_RESET2(1U)
+
+#define S_RESET1    21
+#define V_RESET1(x) ((x) << S_RESET1)
+#define F_RESET1    V_RESET1(1U)
+
+#define S_RESET0    20
+#define V_RESET0(x) ((x) << S_RESET0)
+#define F_RESET0    V_RESET0(1U)
+
+#define S_PWRDN3    19
+#define V_PWRDN3(x) ((x) << S_PWRDN3)
+#define F_PWRDN3    V_PWRDN3(1U)
+
+#define S_PWRDN2    18
+#define V_PWRDN2(x) ((x) << S_PWRDN2)
+#define F_PWRDN2    V_PWRDN2(1U)
+
+#define S_PWRDN1    17
+#define V_PWRDN1(x) ((x) << S_PWRDN1)
+#define F_PWRDN1    V_PWRDN1(1U)
+
+#define S_PWRDN0    16
+#define V_PWRDN0(x) ((x) << S_PWRDN0)
+#define F_PWRDN0    V_PWRDN0(1U)
+
+#define S_RESETPLL23    15
+#define V_RESETPLL23(x) ((x) << S_RESETPLL23)
+#define F_RESETPLL23    V_RESETPLL23(1U)
+
+#define S_RESETPLL01    14
+#define V_RESETPLL01(x) ((x) << S_RESETPLL01)
+#define F_RESETPLL01    V_RESETPLL01(1U)
+
+#define A_XGM_SERDES_STAT0 0x8f0
+#define A_XGM_SERDES_STAT1 0x8f4
+#define A_XGM_SERDES_STAT2 0x8f8
+
+#define S_LOWSIG0    0
+#define V_LOWSIG0(x) ((x) << S_LOWSIG0)
+#define F_LOWSIG0    V_LOWSIG0(1U)
+
+#define A_XGM_SERDES_STAT3 0x8fc
+
+#define A_XGM_STAT_TX_BYTE_LOW 0x900
+
+#define A_XGM_STAT_TX_BYTE_HIGH 0x904
+
+#define A_XGM_STAT_TX_FRAME_LOW 0x908
+
+#define A_XGM_STAT_TX_FRAME_HIGH 0x90c
+
+#define A_XGM_STAT_TX_BCAST 0x910
+
+#define A_XGM_STAT_TX_MCAST 0x914
+
+#define A_XGM_STAT_TX_PAUSE 0x918
+
+#define A_XGM_STAT_TX_64B_FRAMES 0x91c
+
+#define A_XGM_STAT_TX_65_127B_FRAMES 0x920
+
+#define A_XGM_STAT_TX_128_255B_FRAMES 0x924
+
+#define A_XGM_STAT_TX_256_511B_FRAMES 0x928
+
+#define A_XGM_STAT_TX_512_1023B_FRAMES 0x92c
+
+#define A_XGM_STAT_TX_1024_1518B_FRAMES 0x930
+
+#define A_XGM_STAT_TX_1519_MAXB_FRAMES 0x934
+
+#define A_XGM_STAT_TX_ERR_FRAMES 0x938
+
+#define A_XGM_STAT_RX_BYTES_LOW 0x93c
+
+#define A_XGM_STAT_RX_BYTES_HIGH 0x940
+
+#define A_XGM_STAT_RX_FRAMES_LOW 0x944
+
+#define A_XGM_STAT_RX_FRAMES_HIGH 0x948
+
+#define A_XGM_STAT_RX_BCAST_FRAMES 0x94c
+
+#define A_XGM_STAT_RX_MCAST_FRAMES 0x950
+
+#define A_XGM_STAT_RX_PAUSE_FRAMES 0x954
+
+#define A_XGM_STAT_RX_64B_FRAMES 0x958
+
+#define A_XGM_STAT_RX_65_127B_FRAMES 0x95c
+
+#define A_XGM_STAT_RX_128_255B_FRAMES 0x960
+
+#define A_XGM_STAT_RX_256_511B_FRAMES 0x964
+
+#define A_XGM_STAT_RX_512_1023B_FRAMES 0x968
+
+#define A_XGM_STAT_RX_1024_1518B_FRAMES 0x96c
+
+#define A_XGM_STAT_RX_1519_MAXB_FRAMES 0x970
+
+#define A_XGM_STAT_RX_SHORT_FRAMES 0x974
+
+#define A_XGM_STAT_RX_OVERSIZE_FRAMES 0x978
+
+#define A_XGM_STAT_RX_JABBER_FRAMES 0x97c
+
+#define A_XGM_STAT_RX_CRC_ERR_FRAMES 0x980
+
+#define A_XGM_STAT_RX_LENGTH_ERR_FRAMES 0x984
+
+#define A_XGM_STAT_RX_SYM_CODE_ERR_FRAMES 0x988
+
+#define A_XGM_SERDES_STATUS0 0x98c
+
+#define A_XGM_SERDES_STATUS1 0x990
+
+#define S_CMULOCK    31
+#define V_CMULOCK(x) ((x) << S_CMULOCK)
+#define F_CMULOCK    V_CMULOCK(1U)
+
+#define A_XGM_RX_MAX_PKT_SIZE_ERR_CNT 0x9a4
+
+#define A_XGM_TX_SPI4_SOP_EOP_CNT 0x9a8
+
+#define S_TXSPI4SOPCNT    16
+#define M_TXSPI4SOPCNT    0xffff
+#define V_TXSPI4SOPCNT(x) ((x) << S_TXSPI4SOPCNT)
+#define G_TXSPI4SOPCNT(x) (((x) >> S_TXSPI4SOPCNT) & M_TXSPI4SOPCNT)
+
+#define A_XGM_RX_SPI4_SOP_EOP_CNT 0x9ac
+
+#define XGMAC0_1_BASE_ADDR 0xa00
diff --git a/drivers/net/ethernet/chelsio/cxgb3/sge.c b/drivers/net/ethernet/chelsio/cxgb3/sge.c
new file mode 100644
index 0000000..3dfcf60
--- /dev/null
+++ b/drivers/net/ethernet/chelsio/cxgb3/sge.c
@@ -0,0 +1,3305 @@
+/*
+ * Copyright (c) 2005-2008 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/if_vlan.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/dma-mapping.h>
+#include <linux/slab.h>
+#include <linux/prefetch.h>
+#include <net/arp.h>
+#include "common.h"
+#include "regs.h"
+#include "sge_defs.h"
+#include "t3_cpl.h"
+#include "firmware_exports.h"
+#include "cxgb3_offload.h"
+
+#define USE_GTS 0
+
+#define SGE_RX_SM_BUF_SIZE 1536
+
+#define SGE_RX_COPY_THRES  256
+#define SGE_RX_PULL_LEN    128
+
+#define SGE_PG_RSVD SMP_CACHE_BYTES
+/*
+ * Page chunk size for FL0 buffers if FL0 is to be populated with page chunks.
+ * It must be a divisor of PAGE_SIZE.  If set to 0 FL0 will use sk_buffs
+ * directly.
+ */
+#define FL0_PG_CHUNK_SIZE  2048
+#define FL0_PG_ORDER 0
+#define FL0_PG_ALLOC_SIZE (PAGE_SIZE << FL0_PG_ORDER)
+#define FL1_PG_CHUNK_SIZE (PAGE_SIZE > 8192 ? 16384 : 8192)
+#define FL1_PG_ORDER (PAGE_SIZE > 8192 ? 0 : 1)
+#define FL1_PG_ALLOC_SIZE (PAGE_SIZE << FL1_PG_ORDER)
+
+#define SGE_RX_DROP_THRES 16
+#define RX_RECLAIM_PERIOD (HZ/4)
+
+/*
+ * Max number of Rx buffers we replenish at a time.
+ */
+#define MAX_RX_REFILL 16U
+/*
+ * Period of the Tx buffer reclaim timer.  This timer does not need to run
+ * frequently as Tx buffers are usually reclaimed by new Tx packets.
+ */
+#define TX_RECLAIM_PERIOD (HZ / 4)
+#define TX_RECLAIM_TIMER_CHUNK 64U
+#define TX_RECLAIM_CHUNK 16U
+
+/* WR size in bytes */
+#define WR_LEN (WR_FLITS * 8)
+
+/*
+ * Types of Tx queues in each queue set.  Order here matters, do not change.
+ */
+enum { TXQ_ETH, TXQ_OFLD, TXQ_CTRL };
+
+/* Values for sge_txq.flags */
+enum {
+	TXQ_RUNNING = 1 << 0,	/* fetch engine is running */
+	TXQ_LAST_PKT_DB = 1 << 1,	/* last packet rang the doorbell */
+};
+
+struct tx_desc {
+	__be64 flit[TX_DESC_FLITS];
+};
+
+struct rx_desc {
+	__be32 addr_lo;
+	__be32 len_gen;
+	__be32 gen2;
+	__be32 addr_hi;
+};
+
+struct tx_sw_desc {		/* SW state per Tx descriptor */
+	struct sk_buff *skb;
+	u8 eop;       /* set if last descriptor for packet */
+	u8 addr_idx;  /* buffer index of first SGL entry in descriptor */
+	u8 fragidx;   /* first page fragment associated with descriptor */
+	s8 sflit;     /* start flit of first SGL entry in descriptor */
+};
+
+struct rx_sw_desc {                /* SW state per Rx descriptor */
+	union {
+		struct sk_buff *skb;
+		struct fl_pg_chunk pg_chunk;
+	};
+	DEFINE_DMA_UNMAP_ADDR(dma_addr);
+};
+
+struct rsp_desc {		/* response queue descriptor */
+	struct rss_header rss_hdr;
+	__be32 flags;
+	__be32 len_cq;
+	u8 imm_data[47];
+	u8 intr_gen;
+};
+
+/*
+ * Holds unmapping information for Tx packets that need deferred unmapping.
+ * This structure lives at skb->head and must be allocated by callers.
+ */
+struct deferred_unmap_info {
+	struct pci_dev *pdev;
+	dma_addr_t addr[MAX_SKB_FRAGS + 1];
+};
+
+/*
+ * Maps a number of flits to the number of Tx descriptors that can hold them.
+ * The formula is
+ *
+ * desc = 1 + (flits - 2) / (WR_FLITS - 1).
+ *
+ * HW allows up to 4 descriptors to be combined into a WR.
+ */
+static u8 flit_desc_map[] = {
+	0,
+#if SGE_NUM_GENBITS == 1
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
+#elif SGE_NUM_GENBITS == 2
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+	4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+#else
+# error "SGE_NUM_GENBITS must be 1 or 2"
+#endif
+};
+
+static inline struct sge_qset *fl_to_qset(const struct sge_fl *q, int qidx)
+{
+	return container_of(q, struct sge_qset, fl[qidx]);
+}
+
+static inline struct sge_qset *rspq_to_qset(const struct sge_rspq *q)
+{
+	return container_of(q, struct sge_qset, rspq);
+}
+
+static inline struct sge_qset *txq_to_qset(const struct sge_txq *q, int qidx)
+{
+	return container_of(q, struct sge_qset, txq[qidx]);
+}
+
+/**
+ *	refill_rspq - replenish an SGE response queue
+ *	@adapter: the adapter
+ *	@q: the response queue to replenish
+ *	@credits: how many new responses to make available
+ *
+ *	Replenishes a response queue by making the supplied number of responses
+ *	available to HW.
+ */
+static inline void refill_rspq(struct adapter *adapter,
+			       const struct sge_rspq *q, unsigned int credits)
+{
+	rmb();
+	t3_write_reg(adapter, A_SG_RSPQ_CREDIT_RETURN,
+		     V_RSPQ(q->cntxt_id) | V_CREDITS(credits));
+}
+
+/**
+ *	need_skb_unmap - does the platform need unmapping of sk_buffs?
+ *
+ *	Returns true if the platform needs sk_buff unmapping.  The compiler
+ *	optimizes away unnecessary code if this returns true.
+ */
+static inline int need_skb_unmap(void)
+{
+#ifdef CONFIG_NEED_DMA_MAP_STATE
+	return 1;
+#else
+	return 0;
+#endif
+}
+
+/**
+ *	unmap_skb - unmap a packet main body and its page fragments
+ *	@skb: the packet
+ *	@q: the Tx queue containing Tx descriptors for the packet
+ *	@cidx: index of Tx descriptor
+ *	@pdev: the PCI device
+ *
+ *	Unmap the main body of an sk_buff and its page fragments, if any.
+ *	Because of the fairly complicated structure of our SGLs and the desire
+ *	to conserve space for metadata, the information necessary to unmap an
+ *	sk_buff is spread across the sk_buff itself (buffer lengths), the HW Tx
+ *	descriptors (the physical addresses of the various data buffers), and
+ *	the SW descriptor state (assorted indices).  The send functions
+ *	initialize the indices for the first packet descriptor so we can unmap
+ *	the buffers held in the first Tx descriptor here, and we have enough
+ *	information at this point to set the state for the next Tx descriptor.
+ *
+ *	Note that it is possible to clean up the first descriptor of a packet
+ *	before the send routines have written the next descriptors, but this
+ *	race does not cause any problem.  We just end up writing the unmapping
+ *	info for the descriptor first.
+ */
+static inline void unmap_skb(struct sk_buff *skb, struct sge_txq *q,
+			     unsigned int cidx, struct pci_dev *pdev)
+{
+	const struct sg_ent *sgp;
+	struct tx_sw_desc *d = &q->sdesc[cidx];
+	int nfrags, frag_idx, curflit, j = d->addr_idx;
+
+	sgp = (struct sg_ent *)&q->desc[cidx].flit[d->sflit];
+	frag_idx = d->fragidx;
+
+	if (frag_idx == 0 && skb_headlen(skb)) {
+		pci_unmap_single(pdev, be64_to_cpu(sgp->addr[0]),
+				 skb_headlen(skb), PCI_DMA_TODEVICE);
+		j = 1;
+	}
+
+	curflit = d->sflit + 1 + j;
+	nfrags = skb_shinfo(skb)->nr_frags;
+
+	while (frag_idx < nfrags && curflit < WR_FLITS) {
+		pci_unmap_page(pdev, be64_to_cpu(sgp->addr[j]),
+			       skb_frag_size(&skb_shinfo(skb)->frags[frag_idx]),
+			       PCI_DMA_TODEVICE);
+		j ^= 1;
+		if (j == 0) {
+			sgp++;
+			curflit++;
+		}
+		curflit++;
+		frag_idx++;
+	}
+
+	if (frag_idx < nfrags) {   /* SGL continues into next Tx descriptor */
+		d = cidx + 1 == q->size ? q->sdesc : d + 1;
+		d->fragidx = frag_idx;
+		d->addr_idx = j;
+		d->sflit = curflit - WR_FLITS - j; /* sflit can be -1 */
+	}
+}
+
+/**
+ *	free_tx_desc - reclaims Tx descriptors and their buffers
+ *	@adapter: the adapter
+ *	@q: the Tx queue to reclaim descriptors from
+ *	@n: the number of descriptors to reclaim
+ *
+ *	Reclaims Tx descriptors from an SGE Tx queue and frees the associated
+ *	Tx buffers.  Called with the Tx queue lock held.
+ */
+static void free_tx_desc(struct adapter *adapter, struct sge_txq *q,
+			 unsigned int n)
+{
+	struct tx_sw_desc *d;
+	struct pci_dev *pdev = adapter->pdev;
+	unsigned int cidx = q->cidx;
+
+	const int need_unmap = need_skb_unmap() &&
+			       q->cntxt_id >= FW_TUNNEL_SGEEC_START;
+
+	d = &q->sdesc[cidx];
+	while (n--) {
+		if (d->skb) {	/* an SGL is present */
+			if (need_unmap)
+				unmap_skb(d->skb, q, cidx, pdev);
+			if (d->eop) {
+				dev_consume_skb_any(d->skb);
+				d->skb = NULL;
+			}
+		}
+		++d;
+		if (++cidx == q->size) {
+			cidx = 0;
+			d = q->sdesc;
+		}
+	}
+	q->cidx = cidx;
+}
+
+/**
+ *	reclaim_completed_tx - reclaims completed Tx descriptors
+ *	@adapter: the adapter
+ *	@q: the Tx queue to reclaim completed descriptors from
+ *	@chunk: maximum number of descriptors to reclaim
+ *
+ *	Reclaims Tx descriptors that the SGE has indicated it has processed,
+ *	and frees the associated buffers if possible.  Called with the Tx
+ *	queue's lock held.
+ */
+static inline unsigned int reclaim_completed_tx(struct adapter *adapter,
+						struct sge_txq *q,
+						unsigned int chunk)
+{
+	unsigned int reclaim = q->processed - q->cleaned;
+
+	reclaim = min(chunk, reclaim);
+	if (reclaim) {
+		free_tx_desc(adapter, q, reclaim);
+		q->cleaned += reclaim;
+		q->in_use -= reclaim;
+	}
+	return q->processed - q->cleaned;
+}
+
+/**
+ *	should_restart_tx - are there enough resources to restart a Tx queue?
+ *	@q: the Tx queue
+ *
+ *	Checks if there are enough descriptors to restart a suspended Tx queue.
+ */
+static inline int should_restart_tx(const struct sge_txq *q)
+{
+	unsigned int r = q->processed - q->cleaned;
+
+	return q->in_use - r < (q->size >> 1);
+}
+
+static void clear_rx_desc(struct pci_dev *pdev, const struct sge_fl *q,
+			  struct rx_sw_desc *d)
+{
+	if (q->use_pages && d->pg_chunk.page) {
+		(*d->pg_chunk.p_cnt)--;
+		if (!*d->pg_chunk.p_cnt)
+			pci_unmap_page(pdev,
+				       d->pg_chunk.mapping,
+				       q->alloc_size, PCI_DMA_FROMDEVICE);
+
+		put_page(d->pg_chunk.page);
+		d->pg_chunk.page = NULL;
+	} else {
+		pci_unmap_single(pdev, dma_unmap_addr(d, dma_addr),
+				 q->buf_size, PCI_DMA_FROMDEVICE);
+		kfree_skb(d->skb);
+		d->skb = NULL;
+	}
+}
+
+/**
+ *	free_rx_bufs - free the Rx buffers on an SGE free list
+ *	@pdev: the PCI device associated with the adapter
+ *	@rxq: the SGE free list to clean up
+ *
+ *	Release the buffers on an SGE free-buffer Rx queue.  HW fetching from
+ *	this queue should be stopped before calling this function.
+ */
+static void free_rx_bufs(struct pci_dev *pdev, struct sge_fl *q)
+{
+	unsigned int cidx = q->cidx;
+
+	while (q->credits--) {
+		struct rx_sw_desc *d = &q->sdesc[cidx];
+
+
+		clear_rx_desc(pdev, q, d);
+		if (++cidx == q->size)
+			cidx = 0;
+	}
+
+	if (q->pg_chunk.page) {
+		__free_pages(q->pg_chunk.page, q->order);
+		q->pg_chunk.page = NULL;
+	}
+}
+
+/**
+ *	add_one_rx_buf - add a packet buffer to a free-buffer list
+ *	@va:  buffer start VA
+ *	@len: the buffer length
+ *	@d: the HW Rx descriptor to write
+ *	@sd: the SW Rx descriptor to write
+ *	@gen: the generation bit value
+ *	@pdev: the PCI device associated with the adapter
+ *
+ *	Add a buffer of the given length to the supplied HW and SW Rx
+ *	descriptors.
+ */
+static inline int add_one_rx_buf(void *va, unsigned int len,
+				 struct rx_desc *d, struct rx_sw_desc *sd,
+				 unsigned int gen, struct pci_dev *pdev)
+{
+	dma_addr_t mapping;
+
+	mapping = pci_map_single(pdev, va, len, PCI_DMA_FROMDEVICE);
+	if (unlikely(pci_dma_mapping_error(pdev, mapping)))
+		return -ENOMEM;
+
+	dma_unmap_addr_set(sd, dma_addr, mapping);
+
+	d->addr_lo = cpu_to_be32(mapping);
+	d->addr_hi = cpu_to_be32((u64) mapping >> 32);
+	wmb();
+	d->len_gen = cpu_to_be32(V_FLD_GEN1(gen));
+	d->gen2 = cpu_to_be32(V_FLD_GEN2(gen));
+	return 0;
+}
+
+static inline int add_one_rx_chunk(dma_addr_t mapping, struct rx_desc *d,
+				   unsigned int gen)
+{
+	d->addr_lo = cpu_to_be32(mapping);
+	d->addr_hi = cpu_to_be32((u64) mapping >> 32);
+	wmb();
+	d->len_gen = cpu_to_be32(V_FLD_GEN1(gen));
+	d->gen2 = cpu_to_be32(V_FLD_GEN2(gen));
+	return 0;
+}
+
+static int alloc_pg_chunk(struct adapter *adapter, struct sge_fl *q,
+			  struct rx_sw_desc *sd, gfp_t gfp,
+			  unsigned int order)
+{
+	if (!q->pg_chunk.page) {
+		dma_addr_t mapping;
+
+		q->pg_chunk.page = alloc_pages(gfp, order);
+		if (unlikely(!q->pg_chunk.page))
+			return -ENOMEM;
+		q->pg_chunk.va = page_address(q->pg_chunk.page);
+		q->pg_chunk.p_cnt = q->pg_chunk.va + (PAGE_SIZE << order) -
+				    SGE_PG_RSVD;
+		q->pg_chunk.offset = 0;
+		mapping = pci_map_page(adapter->pdev, q->pg_chunk.page,
+				       0, q->alloc_size, PCI_DMA_FROMDEVICE);
+		q->pg_chunk.mapping = mapping;
+	}
+	sd->pg_chunk = q->pg_chunk;
+
+	prefetch(sd->pg_chunk.p_cnt);
+
+	q->pg_chunk.offset += q->buf_size;
+	if (q->pg_chunk.offset == (PAGE_SIZE << order))
+		q->pg_chunk.page = NULL;
+	else {
+		q->pg_chunk.va += q->buf_size;
+		get_page(q->pg_chunk.page);
+	}
+
+	if (sd->pg_chunk.offset == 0)
+		*sd->pg_chunk.p_cnt = 1;
+	else
+		*sd->pg_chunk.p_cnt += 1;
+
+	return 0;
+}
+
+static inline void ring_fl_db(struct adapter *adap, struct sge_fl *q)
+{
+	if (q->pend_cred >= q->credits / 4) {
+		q->pend_cred = 0;
+		wmb();
+		t3_write_reg(adap, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id));
+	}
+}
+
+/**
+ *	refill_fl - refill an SGE free-buffer list
+ *	@adapter: the adapter
+ *	@q: the free-list to refill
+ *	@n: the number of new buffers to allocate
+ *	@gfp: the gfp flags for allocating new buffers
+ *
+ *	(Re)populate an SGE free-buffer list with up to @n new packet buffers,
+ *	allocated with the supplied gfp flags.  The caller must assure that
+ *	@n does not exceed the queue's capacity.
+ */
+static int refill_fl(struct adapter *adap, struct sge_fl *q, int n, gfp_t gfp)
+{
+	struct rx_sw_desc *sd = &q->sdesc[q->pidx];
+	struct rx_desc *d = &q->desc[q->pidx];
+	unsigned int count = 0;
+
+	while (n--) {
+		dma_addr_t mapping;
+		int err;
+
+		if (q->use_pages) {
+			if (unlikely(alloc_pg_chunk(adap, q, sd, gfp,
+						    q->order))) {
+nomem:				q->alloc_failed++;
+				break;
+			}
+			mapping = sd->pg_chunk.mapping + sd->pg_chunk.offset;
+			dma_unmap_addr_set(sd, dma_addr, mapping);
+
+			add_one_rx_chunk(mapping, d, q->gen);
+			pci_dma_sync_single_for_device(adap->pdev, mapping,
+						q->buf_size - SGE_PG_RSVD,
+						PCI_DMA_FROMDEVICE);
+		} else {
+			void *buf_start;
+
+			struct sk_buff *skb = alloc_skb(q->buf_size, gfp);
+			if (!skb)
+				goto nomem;
+
+			sd->skb = skb;
+			buf_start = skb->data;
+			err = add_one_rx_buf(buf_start, q->buf_size, d, sd,
+					     q->gen, adap->pdev);
+			if (unlikely(err)) {
+				clear_rx_desc(adap->pdev, q, sd);
+				break;
+			}
+		}
+
+		d++;
+		sd++;
+		if (++q->pidx == q->size) {
+			q->pidx = 0;
+			q->gen ^= 1;
+			sd = q->sdesc;
+			d = q->desc;
+		}
+		count++;
+	}
+
+	q->credits += count;
+	q->pend_cred += count;
+	ring_fl_db(adap, q);
+
+	return count;
+}
+
+static inline void __refill_fl(struct adapter *adap, struct sge_fl *fl)
+{
+	refill_fl(adap, fl, min(MAX_RX_REFILL, fl->size - fl->credits),
+		  GFP_ATOMIC | __GFP_COMP);
+}
+
+/**
+ *	recycle_rx_buf - recycle a receive buffer
+ *	@adapter: the adapter
+ *	@q: the SGE free list
+ *	@idx: index of buffer to recycle
+ *
+ *	Recycles the specified buffer on the given free list by adding it at
+ *	the next available slot on the list.
+ */
+static void recycle_rx_buf(struct adapter *adap, struct sge_fl *q,
+			   unsigned int idx)
+{
+	struct rx_desc *from = &q->desc[idx];
+	struct rx_desc *to = &q->desc[q->pidx];
+
+	q->sdesc[q->pidx] = q->sdesc[idx];
+	to->addr_lo = from->addr_lo;	/* already big endian */
+	to->addr_hi = from->addr_hi;	/* likewise */
+	wmb();
+	to->len_gen = cpu_to_be32(V_FLD_GEN1(q->gen));
+	to->gen2 = cpu_to_be32(V_FLD_GEN2(q->gen));
+
+	if (++q->pidx == q->size) {
+		q->pidx = 0;
+		q->gen ^= 1;
+	}
+
+	q->credits++;
+	q->pend_cred++;
+	ring_fl_db(adap, q);
+}
+
+/**
+ *	alloc_ring - allocate resources for an SGE descriptor ring
+ *	@pdev: the PCI device
+ *	@nelem: the number of descriptors
+ *	@elem_size: the size of each descriptor
+ *	@sw_size: the size of the SW state associated with each ring element
+ *	@phys: the physical address of the allocated ring
+ *	@metadata: address of the array holding the SW state for the ring
+ *
+ *	Allocates resources for an SGE descriptor ring, such as Tx queues,
+ *	free buffer lists, or response queues.  Each SGE ring requires
+ *	space for its HW descriptors plus, optionally, space for the SW state
+ *	associated with each HW entry (the metadata).  The function returns
+ *	three values: the virtual address for the HW ring (the return value
+ *	of the function), the physical address of the HW ring, and the address
+ *	of the SW ring.
+ */
+static void *alloc_ring(struct pci_dev *pdev, size_t nelem, size_t elem_size,
+			size_t sw_size, dma_addr_t * phys, void *metadata)
+{
+	size_t len = nelem * elem_size;
+	void *s = NULL;
+	void *p = dma_alloc_coherent(&pdev->dev, len, phys, GFP_KERNEL);
+
+	if (!p)
+		return NULL;
+	if (sw_size && metadata) {
+		s = kcalloc(nelem, sw_size, GFP_KERNEL);
+
+		if (!s) {
+			dma_free_coherent(&pdev->dev, len, p, *phys);
+			return NULL;
+		}
+		*(void **)metadata = s;
+	}
+	memset(p, 0, len);
+	return p;
+}
+
+/**
+ *	t3_reset_qset - reset a sge qset
+ *	@q: the queue set
+ *
+ *	Reset the qset structure.
+ *	the NAPI structure is preserved in the event of
+ *	the qset's reincarnation, for example during EEH recovery.
+ */
+static void t3_reset_qset(struct sge_qset *q)
+{
+	if (q->adap &&
+	    !(q->adap->flags & NAPI_INIT)) {
+		memset(q, 0, sizeof(*q));
+		return;
+	}
+
+	q->adap = NULL;
+	memset(&q->rspq, 0, sizeof(q->rspq));
+	memset(q->fl, 0, sizeof(struct sge_fl) * SGE_RXQ_PER_SET);
+	memset(q->txq, 0, sizeof(struct sge_txq) * SGE_TXQ_PER_SET);
+	q->txq_stopped = 0;
+	q->tx_reclaim_timer.function = NULL; /* for t3_stop_sge_timers() */
+	q->rx_reclaim_timer.function = NULL;
+	q->nomem = 0;
+	napi_free_frags(&q->napi);
+}
+
+
+/**
+ *	free_qset - free the resources of an SGE queue set
+ *	@adapter: the adapter owning the queue set
+ *	@q: the queue set
+ *
+ *	Release the HW and SW resources associated with an SGE queue set, such
+ *	as HW contexts, packet buffers, and descriptor rings.  Traffic to the
+ *	queue set must be quiesced prior to calling this.
+ */
+static void t3_free_qset(struct adapter *adapter, struct sge_qset *q)
+{
+	int i;
+	struct pci_dev *pdev = adapter->pdev;
+
+	for (i = 0; i < SGE_RXQ_PER_SET; ++i)
+		if (q->fl[i].desc) {
+			spin_lock_irq(&adapter->sge.reg_lock);
+			t3_sge_disable_fl(adapter, q->fl[i].cntxt_id);
+			spin_unlock_irq(&adapter->sge.reg_lock);
+			free_rx_bufs(pdev, &q->fl[i]);
+			kfree(q->fl[i].sdesc);
+			dma_free_coherent(&pdev->dev,
+					  q->fl[i].size *
+					  sizeof(struct rx_desc), q->fl[i].desc,
+					  q->fl[i].phys_addr);
+		}
+
+	for (i = 0; i < SGE_TXQ_PER_SET; ++i)
+		if (q->txq[i].desc) {
+			spin_lock_irq(&adapter->sge.reg_lock);
+			t3_sge_enable_ecntxt(adapter, q->txq[i].cntxt_id, 0);
+			spin_unlock_irq(&adapter->sge.reg_lock);
+			if (q->txq[i].sdesc) {
+				free_tx_desc(adapter, &q->txq[i],
+					     q->txq[i].in_use);
+				kfree(q->txq[i].sdesc);
+			}
+			dma_free_coherent(&pdev->dev,
+					  q->txq[i].size *
+					  sizeof(struct tx_desc),
+					  q->txq[i].desc, q->txq[i].phys_addr);
+			__skb_queue_purge(&q->txq[i].sendq);
+		}
+
+	if (q->rspq.desc) {
+		spin_lock_irq(&adapter->sge.reg_lock);
+		t3_sge_disable_rspcntxt(adapter, q->rspq.cntxt_id);
+		spin_unlock_irq(&adapter->sge.reg_lock);
+		dma_free_coherent(&pdev->dev,
+				  q->rspq.size * sizeof(struct rsp_desc),
+				  q->rspq.desc, q->rspq.phys_addr);
+	}
+
+	t3_reset_qset(q);
+}
+
+/**
+ *	init_qset_cntxt - initialize an SGE queue set context info
+ *	@qs: the queue set
+ *	@id: the queue set id
+ *
+ *	Initializes the TIDs and context ids for the queues of a queue set.
+ */
+static void init_qset_cntxt(struct sge_qset *qs, unsigned int id)
+{
+	qs->rspq.cntxt_id = id;
+	qs->fl[0].cntxt_id = 2 * id;
+	qs->fl[1].cntxt_id = 2 * id + 1;
+	qs->txq[TXQ_ETH].cntxt_id = FW_TUNNEL_SGEEC_START + id;
+	qs->txq[TXQ_ETH].token = FW_TUNNEL_TID_START + id;
+	qs->txq[TXQ_OFLD].cntxt_id = FW_OFLD_SGEEC_START + id;
+	qs->txq[TXQ_CTRL].cntxt_id = FW_CTRL_SGEEC_START + id;
+	qs->txq[TXQ_CTRL].token = FW_CTRL_TID_START + id;
+}
+
+/**
+ *	sgl_len - calculates the size of an SGL of the given capacity
+ *	@n: the number of SGL entries
+ *
+ *	Calculates the number of flits needed for a scatter/gather list that
+ *	can hold the given number of entries.
+ */
+static inline unsigned int sgl_len(unsigned int n)
+{
+	/* alternatively: 3 * (n / 2) + 2 * (n & 1) */
+	return (3 * n) / 2 + (n & 1);
+}
+
+/**
+ *	flits_to_desc - returns the num of Tx descriptors for the given flits
+ *	@n: the number of flits
+ *
+ *	Calculates the number of Tx descriptors needed for the supplied number
+ *	of flits.
+ */
+static inline unsigned int flits_to_desc(unsigned int n)
+{
+	BUG_ON(n >= ARRAY_SIZE(flit_desc_map));
+	return flit_desc_map[n];
+}
+
+/**
+ *	get_packet - return the next ingress packet buffer from a free list
+ *	@adap: the adapter that received the packet
+ *	@fl: the SGE free list holding the packet
+ *	@len: the packet length including any SGE padding
+ *	@drop_thres: # of remaining buffers before we start dropping packets
+ *
+ *	Get the next packet from a free list and complete setup of the
+ *	sk_buff.  If the packet is small we make a copy and recycle the
+ *	original buffer, otherwise we use the original buffer itself.  If a
+ *	positive drop threshold is supplied packets are dropped and their
+ *	buffers recycled if (a) the number of remaining buffers is under the
+ *	threshold and the packet is too big to copy, or (b) the packet should
+ *	be copied but there is no memory for the copy.
+ */
+static struct sk_buff *get_packet(struct adapter *adap, struct sge_fl *fl,
+				  unsigned int len, unsigned int drop_thres)
+{
+	struct sk_buff *skb = NULL;
+	struct rx_sw_desc *sd = &fl->sdesc[fl->cidx];
+
+	prefetch(sd->skb->data);
+	fl->credits--;
+
+	if (len <= SGE_RX_COPY_THRES) {
+		skb = alloc_skb(len, GFP_ATOMIC);
+		if (likely(skb != NULL)) {
+			__skb_put(skb, len);
+			pci_dma_sync_single_for_cpu(adap->pdev,
+					    dma_unmap_addr(sd, dma_addr), len,
+					    PCI_DMA_FROMDEVICE);
+			memcpy(skb->data, sd->skb->data, len);
+			pci_dma_sync_single_for_device(adap->pdev,
+					    dma_unmap_addr(sd, dma_addr), len,
+					    PCI_DMA_FROMDEVICE);
+		} else if (!drop_thres)
+			goto use_orig_buf;
+recycle:
+		recycle_rx_buf(adap, fl, fl->cidx);
+		return skb;
+	}
+
+	if (unlikely(fl->credits < drop_thres) &&
+	    refill_fl(adap, fl, min(MAX_RX_REFILL, fl->size - fl->credits - 1),
+		      GFP_ATOMIC | __GFP_COMP) == 0)
+		goto recycle;
+
+use_orig_buf:
+	pci_unmap_single(adap->pdev, dma_unmap_addr(sd, dma_addr),
+			 fl->buf_size, PCI_DMA_FROMDEVICE);
+	skb = sd->skb;
+	skb_put(skb, len);
+	__refill_fl(adap, fl);
+	return skb;
+}
+
+/**
+ *	get_packet_pg - return the next ingress packet buffer from a free list
+ *	@adap: the adapter that received the packet
+ *	@fl: the SGE free list holding the packet
+ *	@len: the packet length including any SGE padding
+ *	@drop_thres: # of remaining buffers before we start dropping packets
+ *
+ *	Get the next packet from a free list populated with page chunks.
+ *	If the packet is small we make a copy and recycle the original buffer,
+ *	otherwise we attach the original buffer as a page fragment to a fresh
+ *	sk_buff.  If a positive drop threshold is supplied packets are dropped
+ *	and their buffers recycled if (a) the number of remaining buffers is
+ *	under the threshold and the packet is too big to copy, or (b) there's
+ *	no system memory.
+ *
+ * 	Note: this function is similar to @get_packet but deals with Rx buffers
+ * 	that are page chunks rather than sk_buffs.
+ */
+static struct sk_buff *get_packet_pg(struct adapter *adap, struct sge_fl *fl,
+				     struct sge_rspq *q, unsigned int len,
+				     unsigned int drop_thres)
+{
+	struct sk_buff *newskb, *skb;
+	struct rx_sw_desc *sd = &fl->sdesc[fl->cidx];
+
+	dma_addr_t dma_addr = dma_unmap_addr(sd, dma_addr);
+
+	newskb = skb = q->pg_skb;
+	if (!skb && (len <= SGE_RX_COPY_THRES)) {
+		newskb = alloc_skb(len, GFP_ATOMIC);
+		if (likely(newskb != NULL)) {
+			__skb_put(newskb, len);
+			pci_dma_sync_single_for_cpu(adap->pdev, dma_addr, len,
+					    PCI_DMA_FROMDEVICE);
+			memcpy(newskb->data, sd->pg_chunk.va, len);
+			pci_dma_sync_single_for_device(adap->pdev, dma_addr,
+						       len,
+						       PCI_DMA_FROMDEVICE);
+		} else if (!drop_thres)
+			return NULL;
+recycle:
+		fl->credits--;
+		recycle_rx_buf(adap, fl, fl->cidx);
+		q->rx_recycle_buf++;
+		return newskb;
+	}
+
+	if (unlikely(q->rx_recycle_buf || (!skb && fl->credits <= drop_thres)))
+		goto recycle;
+
+	prefetch(sd->pg_chunk.p_cnt);
+
+	if (!skb)
+		newskb = alloc_skb(SGE_RX_PULL_LEN, GFP_ATOMIC);
+
+	if (unlikely(!newskb)) {
+		if (!drop_thres)
+			return NULL;
+		goto recycle;
+	}
+
+	pci_dma_sync_single_for_cpu(adap->pdev, dma_addr, len,
+				    PCI_DMA_FROMDEVICE);
+	(*sd->pg_chunk.p_cnt)--;
+	if (!*sd->pg_chunk.p_cnt && sd->pg_chunk.page != fl->pg_chunk.page)
+		pci_unmap_page(adap->pdev,
+			       sd->pg_chunk.mapping,
+			       fl->alloc_size,
+			       PCI_DMA_FROMDEVICE);
+	if (!skb) {
+		__skb_put(newskb, SGE_RX_PULL_LEN);
+		memcpy(newskb->data, sd->pg_chunk.va, SGE_RX_PULL_LEN);
+		skb_fill_page_desc(newskb, 0, sd->pg_chunk.page,
+				   sd->pg_chunk.offset + SGE_RX_PULL_LEN,
+				   len - SGE_RX_PULL_LEN);
+		newskb->len = len;
+		newskb->data_len = len - SGE_RX_PULL_LEN;
+		newskb->truesize += newskb->data_len;
+	} else {
+		skb_fill_page_desc(newskb, skb_shinfo(newskb)->nr_frags,
+				   sd->pg_chunk.page,
+				   sd->pg_chunk.offset, len);
+		newskb->len += len;
+		newskb->data_len += len;
+		newskb->truesize += len;
+	}
+
+	fl->credits--;
+	/*
+	 * We do not refill FLs here, we let the caller do it to overlap a
+	 * prefetch.
+	 */
+	return newskb;
+}
+
+/**
+ *	get_imm_packet - return the next ingress packet buffer from a response
+ *	@resp: the response descriptor containing the packet data
+ *
+ *	Return a packet containing the immediate data of the given response.
+ */
+static inline struct sk_buff *get_imm_packet(const struct rsp_desc *resp)
+{
+	struct sk_buff *skb = alloc_skb(IMMED_PKT_SIZE, GFP_ATOMIC);
+
+	if (skb) {
+		__skb_put(skb, IMMED_PKT_SIZE);
+		skb_copy_to_linear_data(skb, resp->imm_data, IMMED_PKT_SIZE);
+	}
+	return skb;
+}
+
+/**
+ *	calc_tx_descs - calculate the number of Tx descriptors for a packet
+ *	@skb: the packet
+ *
+ * 	Returns the number of Tx descriptors needed for the given Ethernet
+ * 	packet.  Ethernet packets require addition of WR and CPL headers.
+ */
+static inline unsigned int calc_tx_descs(const struct sk_buff *skb)
+{
+	unsigned int flits;
+
+	if (skb->len <= WR_LEN - sizeof(struct cpl_tx_pkt))
+		return 1;
+
+	flits = sgl_len(skb_shinfo(skb)->nr_frags + 1) + 2;
+	if (skb_shinfo(skb)->gso_size)
+		flits++;
+	return flits_to_desc(flits);
+}
+
+/**
+ *	make_sgl - populate a scatter/gather list for a packet
+ *	@skb: the packet
+ *	@sgp: the SGL to populate
+ *	@start: start address of skb main body data to include in the SGL
+ *	@len: length of skb main body data to include in the SGL
+ *	@pdev: the PCI device
+ *
+ *	Generates a scatter/gather list for the buffers that make up a packet
+ *	and returns the SGL size in 8-byte words.  The caller must size the SGL
+ *	appropriately.
+ */
+static inline unsigned int make_sgl(const struct sk_buff *skb,
+				    struct sg_ent *sgp, unsigned char *start,
+				    unsigned int len, struct pci_dev *pdev)
+{
+	dma_addr_t mapping;
+	unsigned int i, j = 0, nfrags;
+
+	if (len) {
+		mapping = pci_map_single(pdev, start, len, PCI_DMA_TODEVICE);
+		sgp->len[0] = cpu_to_be32(len);
+		sgp->addr[0] = cpu_to_be64(mapping);
+		j = 1;
+	}
+
+	nfrags = skb_shinfo(skb)->nr_frags;
+	for (i = 0; i < nfrags; i++) {
+		const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+
+		mapping = skb_frag_dma_map(&pdev->dev, frag, 0, skb_frag_size(frag),
+					   DMA_TO_DEVICE);
+		sgp->len[j] = cpu_to_be32(skb_frag_size(frag));
+		sgp->addr[j] = cpu_to_be64(mapping);
+		j ^= 1;
+		if (j == 0)
+			++sgp;
+	}
+	if (j)
+		sgp->len[j] = 0;
+	return ((nfrags + (len != 0)) * 3) / 2 + j;
+}
+
+/**
+ *	check_ring_tx_db - check and potentially ring a Tx queue's doorbell
+ *	@adap: the adapter
+ *	@q: the Tx queue
+ *
+ *	Ring the doorbel if a Tx queue is asleep.  There is a natural race,
+ *	where the HW is going to sleep just after we checked, however,
+ *	then the interrupt handler will detect the outstanding TX packet
+ *	and ring the doorbell for us.
+ *
+ *	When GTS is disabled we unconditionally ring the doorbell.
+ */
+static inline void check_ring_tx_db(struct adapter *adap, struct sge_txq *q)
+{
+#if USE_GTS
+	clear_bit(TXQ_LAST_PKT_DB, &q->flags);
+	if (test_and_set_bit(TXQ_RUNNING, &q->flags) == 0) {
+		set_bit(TXQ_LAST_PKT_DB, &q->flags);
+		t3_write_reg(adap, A_SG_KDOORBELL,
+			     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
+	}
+#else
+	wmb();			/* write descriptors before telling HW */
+	t3_write_reg(adap, A_SG_KDOORBELL,
+		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
+#endif
+}
+
+static inline void wr_gen2(struct tx_desc *d, unsigned int gen)
+{
+#if SGE_NUM_GENBITS == 2
+	d->flit[TX_DESC_FLITS - 1] = cpu_to_be64(gen);
+#endif
+}
+
+/**
+ *	write_wr_hdr_sgl - write a WR header and, optionally, SGL
+ *	@ndesc: number of Tx descriptors spanned by the SGL
+ *	@skb: the packet corresponding to the WR
+ *	@d: first Tx descriptor to be written
+ *	@pidx: index of above descriptors
+ *	@q: the SGE Tx queue
+ *	@sgl: the SGL
+ *	@flits: number of flits to the start of the SGL in the first descriptor
+ *	@sgl_flits: the SGL size in flits
+ *	@gen: the Tx descriptor generation
+ *	@wr_hi: top 32 bits of WR header based on WR type (big endian)
+ *	@wr_lo: low 32 bits of WR header based on WR type (big endian)
+ *
+ *	Write a work request header and an associated SGL.  If the SGL is
+ *	small enough to fit into one Tx descriptor it has already been written
+ *	and we just need to write the WR header.  Otherwise we distribute the
+ *	SGL across the number of descriptors it spans.
+ */
+static void write_wr_hdr_sgl(unsigned int ndesc, struct sk_buff *skb,
+			     struct tx_desc *d, unsigned int pidx,
+			     const struct sge_txq *q,
+			     const struct sg_ent *sgl,
+			     unsigned int flits, unsigned int sgl_flits,
+			     unsigned int gen, __be32 wr_hi,
+			     __be32 wr_lo)
+{
+	struct work_request_hdr *wrp = (struct work_request_hdr *)d;
+	struct tx_sw_desc *sd = &q->sdesc[pidx];
+
+	sd->skb = skb;
+	if (need_skb_unmap()) {
+		sd->fragidx = 0;
+		sd->addr_idx = 0;
+		sd->sflit = flits;
+	}
+
+	if (likely(ndesc == 1)) {
+		sd->eop = 1;
+		wrp->wr_hi = htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) |
+				   V_WR_SGLSFLT(flits)) | wr_hi;
+		wmb();
+		wrp->wr_lo = htonl(V_WR_LEN(flits + sgl_flits) |
+				   V_WR_GEN(gen)) | wr_lo;
+		wr_gen2(d, gen);
+	} else {
+		unsigned int ogen = gen;
+		const u64 *fp = (const u64 *)sgl;
+		struct work_request_hdr *wp = wrp;
+
+		wrp->wr_hi = htonl(F_WR_SOP | V_WR_DATATYPE(1) |
+				   V_WR_SGLSFLT(flits)) | wr_hi;
+
+		while (sgl_flits) {
+			unsigned int avail = WR_FLITS - flits;
+
+			if (avail > sgl_flits)
+				avail = sgl_flits;
+			memcpy(&d->flit[flits], fp, avail * sizeof(*fp));
+			sgl_flits -= avail;
+			ndesc--;
+			if (!sgl_flits)
+				break;
+
+			fp += avail;
+			d++;
+			sd->eop = 0;
+			sd++;
+			if (++pidx == q->size) {
+				pidx = 0;
+				gen ^= 1;
+				d = q->desc;
+				sd = q->sdesc;
+			}
+
+			sd->skb = skb;
+			wrp = (struct work_request_hdr *)d;
+			wrp->wr_hi = htonl(V_WR_DATATYPE(1) |
+					   V_WR_SGLSFLT(1)) | wr_hi;
+			wrp->wr_lo = htonl(V_WR_LEN(min(WR_FLITS,
+							sgl_flits + 1)) |
+					   V_WR_GEN(gen)) | wr_lo;
+			wr_gen2(d, gen);
+			flits = 1;
+		}
+		sd->eop = 1;
+		wrp->wr_hi |= htonl(F_WR_EOP);
+		wmb();
+		wp->wr_lo = htonl(V_WR_LEN(WR_FLITS) | V_WR_GEN(ogen)) | wr_lo;
+		wr_gen2((struct tx_desc *)wp, ogen);
+		WARN_ON(ndesc != 0);
+	}
+}
+
+/**
+ *	write_tx_pkt_wr - write a TX_PKT work request
+ *	@adap: the adapter
+ *	@skb: the packet to send
+ *	@pi: the egress interface
+ *	@pidx: index of the first Tx descriptor to write
+ *	@gen: the generation value to use
+ *	@q: the Tx queue
+ *	@ndesc: number of descriptors the packet will occupy
+ *	@compl: the value of the COMPL bit to use
+ *
+ *	Generate a TX_PKT work request to send the supplied packet.
+ */
+static void write_tx_pkt_wr(struct adapter *adap, struct sk_buff *skb,
+			    const struct port_info *pi,
+			    unsigned int pidx, unsigned int gen,
+			    struct sge_txq *q, unsigned int ndesc,
+			    unsigned int compl)
+{
+	unsigned int flits, sgl_flits, cntrl, tso_info;
+	struct sg_ent *sgp, sgl[MAX_SKB_FRAGS / 2 + 1];
+	struct tx_desc *d = &q->desc[pidx];
+	struct cpl_tx_pkt *cpl = (struct cpl_tx_pkt *)d;
+
+	cpl->len = htonl(skb->len);
+	cntrl = V_TXPKT_INTF(pi->port_id);
+
+	if (vlan_tx_tag_present(skb))
+		cntrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(vlan_tx_tag_get(skb));
+
+	tso_info = V_LSO_MSS(skb_shinfo(skb)->gso_size);
+	if (tso_info) {
+		int eth_type;
+		struct cpl_tx_pkt_lso *hdr = (struct cpl_tx_pkt_lso *)cpl;
+
+		d->flit[2] = 0;
+		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT_LSO);
+		hdr->cntrl = htonl(cntrl);
+		eth_type = skb_network_offset(skb) == ETH_HLEN ?
+		    CPL_ETH_II : CPL_ETH_II_VLAN;
+		tso_info |= V_LSO_ETH_TYPE(eth_type) |
+		    V_LSO_IPHDR_WORDS(ip_hdr(skb)->ihl) |
+		    V_LSO_TCPHDR_WORDS(tcp_hdr(skb)->doff);
+		hdr->lso_info = htonl(tso_info);
+		flits = 3;
+	} else {
+		cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT);
+		cntrl |= F_TXPKT_IPCSUM_DIS;	/* SW calculates IP csum */
+		cntrl |= V_TXPKT_L4CSUM_DIS(skb->ip_summed != CHECKSUM_PARTIAL);
+		cpl->cntrl = htonl(cntrl);
+
+		if (skb->len <= WR_LEN - sizeof(*cpl)) {
+			q->sdesc[pidx].skb = NULL;
+			if (!skb->data_len)
+				skb_copy_from_linear_data(skb, &d->flit[2],
+							  skb->len);
+			else
+				skb_copy_bits(skb, 0, &d->flit[2], skb->len);
+
+			flits = (skb->len + 7) / 8 + 2;
+			cpl->wr.wr_hi = htonl(V_WR_BCNTLFLT(skb->len & 7) |
+					      V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT)
+					      | F_WR_SOP | F_WR_EOP | compl);
+			wmb();
+			cpl->wr.wr_lo = htonl(V_WR_LEN(flits) | V_WR_GEN(gen) |
+					      V_WR_TID(q->token));
+			wr_gen2(d, gen);
+			dev_consume_skb_any(skb);
+			return;
+		}
+
+		flits = 2;
+	}
+
+	sgp = ndesc == 1 ? (struct sg_ent *)&d->flit[flits] : sgl;
+	sgl_flits = make_sgl(skb, sgp, skb->data, skb_headlen(skb), adap->pdev);
+
+	write_wr_hdr_sgl(ndesc, skb, d, pidx, q, sgl, flits, sgl_flits, gen,
+			 htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | compl),
+			 htonl(V_WR_TID(q->token)));
+}
+
+static inline void t3_stop_tx_queue(struct netdev_queue *txq,
+				    struct sge_qset *qs, struct sge_txq *q)
+{
+	netif_tx_stop_queue(txq);
+	set_bit(TXQ_ETH, &qs->txq_stopped);
+	q->stops++;
+}
+
+/**
+ *	eth_xmit - add a packet to the Ethernet Tx queue
+ *	@skb: the packet
+ *	@dev: the egress net device
+ *
+ *	Add a packet to an SGE Tx queue.  Runs with softirqs disabled.
+ */
+netdev_tx_t t3_eth_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	int qidx;
+	unsigned int ndesc, pidx, credits, gen, compl;
+	const struct port_info *pi = netdev_priv(dev);
+	struct adapter *adap = pi->adapter;
+	struct netdev_queue *txq;
+	struct sge_qset *qs;
+	struct sge_txq *q;
+
+	/*
+	 * The chip min packet length is 9 octets but play safe and reject
+	 * anything shorter than an Ethernet header.
+	 */
+	if (unlikely(skb->len < ETH_HLEN)) {
+		dev_kfree_skb_any(skb);
+		return NETDEV_TX_OK;
+	}
+
+	qidx = skb_get_queue_mapping(skb);
+	qs = &pi->qs[qidx];
+	q = &qs->txq[TXQ_ETH];
+	txq = netdev_get_tx_queue(dev, qidx);
+
+	reclaim_completed_tx(adap, q, TX_RECLAIM_CHUNK);
+
+	credits = q->size - q->in_use;
+	ndesc = calc_tx_descs(skb);
+
+	if (unlikely(credits < ndesc)) {
+		t3_stop_tx_queue(txq, qs, q);
+		dev_err(&adap->pdev->dev,
+			"%s: Tx ring %u full while queue awake!\n",
+			dev->name, q->cntxt_id & 7);
+		return NETDEV_TX_BUSY;
+	}
+
+	q->in_use += ndesc;
+	if (unlikely(credits - ndesc < q->stop_thres)) {
+		t3_stop_tx_queue(txq, qs, q);
+
+		if (should_restart_tx(q) &&
+		    test_and_clear_bit(TXQ_ETH, &qs->txq_stopped)) {
+			q->restarts++;
+			netif_tx_start_queue(txq);
+		}
+	}
+
+	gen = q->gen;
+	q->unacked += ndesc;
+	compl = (q->unacked & 8) << (S_WR_COMPL - 3);
+	q->unacked &= 7;
+	pidx = q->pidx;
+	q->pidx += ndesc;
+	if (q->pidx >= q->size) {
+		q->pidx -= q->size;
+		q->gen ^= 1;
+	}
+
+	/* update port statistics */
+	if (skb->ip_summed == CHECKSUM_PARTIAL)
+		qs->port_stats[SGE_PSTAT_TX_CSUM]++;
+	if (skb_shinfo(skb)->gso_size)
+		qs->port_stats[SGE_PSTAT_TSO]++;
+	if (vlan_tx_tag_present(skb))
+		qs->port_stats[SGE_PSTAT_VLANINS]++;
+
+	/*
+	 * We do not use Tx completion interrupts to free DMAd Tx packets.
+	 * This is good for performance but means that we rely on new Tx
+	 * packets arriving to run the destructors of completed packets,
+	 * which open up space in their sockets' send queues.  Sometimes
+	 * we do not get such new packets causing Tx to stall.  A single
+	 * UDP transmitter is a good example of this situation.  We have
+	 * a clean up timer that periodically reclaims completed packets
+	 * but it doesn't run often enough (nor do we want it to) to prevent
+	 * lengthy stalls.  A solution to this problem is to run the
+	 * destructor early, after the packet is queued but before it's DMAd.
+	 * A cons is that we lie to socket memory accounting, but the amount
+	 * of extra memory is reasonable (limited by the number of Tx
+	 * descriptors), the packets do actually get freed quickly by new
+	 * packets almost always, and for protocols like TCP that wait for
+	 * acks to really free up the data the extra memory is even less.
+	 * On the positive side we run the destructors on the sending CPU
+	 * rather than on a potentially different completing CPU, usually a
+	 * good thing.  We also run them without holding our Tx queue lock,
+	 * unlike what reclaim_completed_tx() would otherwise do.
+	 *
+	 * Run the destructor before telling the DMA engine about the packet
+	 * to make sure it doesn't complete and get freed prematurely.
+	 */
+	if (likely(!skb_shared(skb)))
+		skb_orphan(skb);
+
+	write_tx_pkt_wr(adap, skb, pi, pidx, gen, q, ndesc, compl);
+	check_ring_tx_db(adap, q);
+	return NETDEV_TX_OK;
+}
+
+/**
+ *	write_imm - write a packet into a Tx descriptor as immediate data
+ *	@d: the Tx descriptor to write
+ *	@skb: the packet
+ *	@len: the length of packet data to write as immediate data
+ *	@gen: the generation bit value to write
+ *
+ *	Writes a packet as immediate data into a Tx descriptor.  The packet
+ *	contains a work request at its beginning.  We must write the packet
+ *	carefully so the SGE doesn't read it accidentally before it's written
+ *	in its entirety.
+ */
+static inline void write_imm(struct tx_desc *d, struct sk_buff *skb,
+			     unsigned int len, unsigned int gen)
+{
+	struct work_request_hdr *from = (struct work_request_hdr *)skb->data;
+	struct work_request_hdr *to = (struct work_request_hdr *)d;
+
+	if (likely(!skb->data_len))
+		memcpy(&to[1], &from[1], len - sizeof(*from));
+	else
+		skb_copy_bits(skb, sizeof(*from), &to[1], len - sizeof(*from));
+
+	to->wr_hi = from->wr_hi | htonl(F_WR_SOP | F_WR_EOP |
+					V_WR_BCNTLFLT(len & 7));
+	wmb();
+	to->wr_lo = from->wr_lo | htonl(V_WR_GEN(gen) |
+					V_WR_LEN((len + 7) / 8));
+	wr_gen2(d, gen);
+	kfree_skb(skb);
+}
+
+/**
+ *	check_desc_avail - check descriptor availability on a send queue
+ *	@adap: the adapter
+ *	@q: the send queue
+ *	@skb: the packet needing the descriptors
+ *	@ndesc: the number of Tx descriptors needed
+ *	@qid: the Tx queue number in its queue set (TXQ_OFLD or TXQ_CTRL)
+ *
+ *	Checks if the requested number of Tx descriptors is available on an
+ *	SGE send queue.  If the queue is already suspended or not enough
+ *	descriptors are available the packet is queued for later transmission.
+ *	Must be called with the Tx queue locked.
+ *
+ *	Returns 0 if enough descriptors are available, 1 if there aren't
+ *	enough descriptors and the packet has been queued, and 2 if the caller
+ *	needs to retry because there weren't enough descriptors at the
+ *	beginning of the call but some freed up in the mean time.
+ */
+static inline int check_desc_avail(struct adapter *adap, struct sge_txq *q,
+				   struct sk_buff *skb, unsigned int ndesc,
+				   unsigned int qid)
+{
+	if (unlikely(!skb_queue_empty(&q->sendq))) {
+	      addq_exit:__skb_queue_tail(&q->sendq, skb);
+		return 1;
+	}
+	if (unlikely(q->size - q->in_use < ndesc)) {
+		struct sge_qset *qs = txq_to_qset(q, qid);
+
+		set_bit(qid, &qs->txq_stopped);
+		smp_mb__after_atomic();
+
+		if (should_restart_tx(q) &&
+		    test_and_clear_bit(qid, &qs->txq_stopped))
+			return 2;
+
+		q->stops++;
+		goto addq_exit;
+	}
+	return 0;
+}
+
+/**
+ *	reclaim_completed_tx_imm - reclaim completed control-queue Tx descs
+ *	@q: the SGE control Tx queue
+ *
+ *	This is a variant of reclaim_completed_tx() that is used for Tx queues
+ *	that send only immediate data (presently just the control queues) and
+ *	thus do not have any sk_buffs to release.
+ */
+static inline void reclaim_completed_tx_imm(struct sge_txq *q)
+{
+	unsigned int reclaim = q->processed - q->cleaned;
+
+	q->in_use -= reclaim;
+	q->cleaned += reclaim;
+}
+
+static inline int immediate(const struct sk_buff *skb)
+{
+	return skb->len <= WR_LEN;
+}
+
+/**
+ *	ctrl_xmit - send a packet through an SGE control Tx queue
+ *	@adap: the adapter
+ *	@q: the control queue
+ *	@skb: the packet
+ *
+ *	Send a packet through an SGE control Tx queue.  Packets sent through
+ *	a control queue must fit entirely as immediate data in a single Tx
+ *	descriptor and have no page fragments.
+ */
+static int ctrl_xmit(struct adapter *adap, struct sge_txq *q,
+		     struct sk_buff *skb)
+{
+	int ret;
+	struct work_request_hdr *wrp = (struct work_request_hdr *)skb->data;
+
+	if (unlikely(!immediate(skb))) {
+		WARN_ON(1);
+		dev_kfree_skb(skb);
+		return NET_XMIT_SUCCESS;
+	}
+
+	wrp->wr_hi |= htonl(F_WR_SOP | F_WR_EOP);
+	wrp->wr_lo = htonl(V_WR_TID(q->token));
+
+	spin_lock(&q->lock);
+      again:reclaim_completed_tx_imm(q);
+
+	ret = check_desc_avail(adap, q, skb, 1, TXQ_CTRL);
+	if (unlikely(ret)) {
+		if (ret == 1) {
+			spin_unlock(&q->lock);
+			return NET_XMIT_CN;
+		}
+		goto again;
+	}
+
+	write_imm(&q->desc[q->pidx], skb, skb->len, q->gen);
+
+	q->in_use++;
+	if (++q->pidx >= q->size) {
+		q->pidx = 0;
+		q->gen ^= 1;
+	}
+	spin_unlock(&q->lock);
+	wmb();
+	t3_write_reg(adap, A_SG_KDOORBELL,
+		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
+	return NET_XMIT_SUCCESS;
+}
+
+/**
+ *	restart_ctrlq - restart a suspended control queue
+ *	@qs: the queue set cotaining the control queue
+ *
+ *	Resumes transmission on a suspended Tx control queue.
+ */
+static void restart_ctrlq(unsigned long data)
+{
+	struct sk_buff *skb;
+	struct sge_qset *qs = (struct sge_qset *)data;
+	struct sge_txq *q = &qs->txq[TXQ_CTRL];
+
+	spin_lock(&q->lock);
+      again:reclaim_completed_tx_imm(q);
+
+	while (q->in_use < q->size &&
+	       (skb = __skb_dequeue(&q->sendq)) != NULL) {
+
+		write_imm(&q->desc[q->pidx], skb, skb->len, q->gen);
+
+		if (++q->pidx >= q->size) {
+			q->pidx = 0;
+			q->gen ^= 1;
+		}
+		q->in_use++;
+	}
+
+	if (!skb_queue_empty(&q->sendq)) {
+		set_bit(TXQ_CTRL, &qs->txq_stopped);
+		smp_mb__after_atomic();
+
+		if (should_restart_tx(q) &&
+		    test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped))
+			goto again;
+		q->stops++;
+	}
+
+	spin_unlock(&q->lock);
+	wmb();
+	t3_write_reg(qs->adap, A_SG_KDOORBELL,
+		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
+}
+
+/*
+ * Send a management message through control queue 0
+ */
+int t3_mgmt_tx(struct adapter *adap, struct sk_buff *skb)
+{
+	int ret;
+	local_bh_disable();
+	ret = ctrl_xmit(adap, &adap->sge.qs[0].txq[TXQ_CTRL], skb);
+	local_bh_enable();
+
+	return ret;
+}
+
+/**
+ *	deferred_unmap_destructor - unmap a packet when it is freed
+ *	@skb: the packet
+ *
+ *	This is the packet destructor used for Tx packets that need to remain
+ *	mapped until they are freed rather than until their Tx descriptors are
+ *	freed.
+ */
+static void deferred_unmap_destructor(struct sk_buff *skb)
+{
+	int i;
+	const dma_addr_t *p;
+	const struct skb_shared_info *si;
+	const struct deferred_unmap_info *dui;
+
+	dui = (struct deferred_unmap_info *)skb->head;
+	p = dui->addr;
+
+	if (skb_tail_pointer(skb) - skb_transport_header(skb))
+		pci_unmap_single(dui->pdev, *p++, skb_tail_pointer(skb) -
+				 skb_transport_header(skb), PCI_DMA_TODEVICE);
+
+	si = skb_shinfo(skb);
+	for (i = 0; i < si->nr_frags; i++)
+		pci_unmap_page(dui->pdev, *p++, skb_frag_size(&si->frags[i]),
+			       PCI_DMA_TODEVICE);
+}
+
+static void setup_deferred_unmapping(struct sk_buff *skb, struct pci_dev *pdev,
+				     const struct sg_ent *sgl, int sgl_flits)
+{
+	dma_addr_t *p;
+	struct deferred_unmap_info *dui;
+
+	dui = (struct deferred_unmap_info *)skb->head;
+	dui->pdev = pdev;
+	for (p = dui->addr; sgl_flits >= 3; sgl++, sgl_flits -= 3) {
+		*p++ = be64_to_cpu(sgl->addr[0]);
+		*p++ = be64_to_cpu(sgl->addr[1]);
+	}
+	if (sgl_flits)
+		*p = be64_to_cpu(sgl->addr[0]);
+}
+
+/**
+ *	write_ofld_wr - write an offload work request
+ *	@adap: the adapter
+ *	@skb: the packet to send
+ *	@q: the Tx queue
+ *	@pidx: index of the first Tx descriptor to write
+ *	@gen: the generation value to use
+ *	@ndesc: number of descriptors the packet will occupy
+ *
+ *	Write an offload work request to send the supplied packet.  The packet
+ *	data already carry the work request with most fields populated.
+ */
+static void write_ofld_wr(struct adapter *adap, struct sk_buff *skb,
+			  struct sge_txq *q, unsigned int pidx,
+			  unsigned int gen, unsigned int ndesc)
+{
+	unsigned int sgl_flits, flits;
+	struct work_request_hdr *from;
+	struct sg_ent *sgp, sgl[MAX_SKB_FRAGS / 2 + 1];
+	struct tx_desc *d = &q->desc[pidx];
+
+	if (immediate(skb)) {
+		q->sdesc[pidx].skb = NULL;
+		write_imm(d, skb, skb->len, gen);
+		return;
+	}
+
+	/* Only TX_DATA builds SGLs */
+
+	from = (struct work_request_hdr *)skb->data;
+	memcpy(&d->flit[1], &from[1],
+	       skb_transport_offset(skb) - sizeof(*from));
+
+	flits = skb_transport_offset(skb) / 8;
+	sgp = ndesc == 1 ? (struct sg_ent *)&d->flit[flits] : sgl;
+	sgl_flits = make_sgl(skb, sgp, skb_transport_header(skb),
+			     skb_tail_pointer(skb) -
+			     skb_transport_header(skb),
+			     adap->pdev);
+	if (need_skb_unmap()) {
+		setup_deferred_unmapping(skb, adap->pdev, sgp, sgl_flits);
+		skb->destructor = deferred_unmap_destructor;
+	}
+
+	write_wr_hdr_sgl(ndesc, skb, d, pidx, q, sgl, flits, sgl_flits,
+			 gen, from->wr_hi, from->wr_lo);
+}
+
+/**
+ *	calc_tx_descs_ofld - calculate # of Tx descriptors for an offload packet
+ *	@skb: the packet
+ *
+ * 	Returns the number of Tx descriptors needed for the given offload
+ * 	packet.  These packets are already fully constructed.
+ */
+static inline unsigned int calc_tx_descs_ofld(const struct sk_buff *skb)
+{
+	unsigned int flits, cnt;
+
+	if (skb->len <= WR_LEN)
+		return 1;	/* packet fits as immediate data */
+
+	flits = skb_transport_offset(skb) / 8;	/* headers */
+	cnt = skb_shinfo(skb)->nr_frags;
+	if (skb_tail_pointer(skb) != skb_transport_header(skb))
+		cnt++;
+	return flits_to_desc(flits + sgl_len(cnt));
+}
+
+/**
+ *	ofld_xmit - send a packet through an offload queue
+ *	@adap: the adapter
+ *	@q: the Tx offload queue
+ *	@skb: the packet
+ *
+ *	Send an offload packet through an SGE offload queue.
+ */
+static int ofld_xmit(struct adapter *adap, struct sge_txq *q,
+		     struct sk_buff *skb)
+{
+	int ret;
+	unsigned int ndesc = calc_tx_descs_ofld(skb), pidx, gen;
+
+	spin_lock(&q->lock);
+again:	reclaim_completed_tx(adap, q, TX_RECLAIM_CHUNK);
+
+	ret = check_desc_avail(adap, q, skb, ndesc, TXQ_OFLD);
+	if (unlikely(ret)) {
+		if (ret == 1) {
+			skb->priority = ndesc;	/* save for restart */
+			spin_unlock(&q->lock);
+			return NET_XMIT_CN;
+		}
+		goto again;
+	}
+
+	gen = q->gen;
+	q->in_use += ndesc;
+	pidx = q->pidx;
+	q->pidx += ndesc;
+	if (q->pidx >= q->size) {
+		q->pidx -= q->size;
+		q->gen ^= 1;
+	}
+	spin_unlock(&q->lock);
+
+	write_ofld_wr(adap, skb, q, pidx, gen, ndesc);
+	check_ring_tx_db(adap, q);
+	return NET_XMIT_SUCCESS;
+}
+
+/**
+ *	restart_offloadq - restart a suspended offload queue
+ *	@qs: the queue set cotaining the offload queue
+ *
+ *	Resumes transmission on a suspended Tx offload queue.
+ */
+static void restart_offloadq(unsigned long data)
+{
+	struct sk_buff *skb;
+	struct sge_qset *qs = (struct sge_qset *)data;
+	struct sge_txq *q = &qs->txq[TXQ_OFLD];
+	const struct port_info *pi = netdev_priv(qs->netdev);
+	struct adapter *adap = pi->adapter;
+
+	spin_lock(&q->lock);
+again:	reclaim_completed_tx(adap, q, TX_RECLAIM_CHUNK);
+
+	while ((skb = skb_peek(&q->sendq)) != NULL) {
+		unsigned int gen, pidx;
+		unsigned int ndesc = skb->priority;
+
+		if (unlikely(q->size - q->in_use < ndesc)) {
+			set_bit(TXQ_OFLD, &qs->txq_stopped);
+			smp_mb__after_atomic();
+
+			if (should_restart_tx(q) &&
+			    test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped))
+				goto again;
+			q->stops++;
+			break;
+		}
+
+		gen = q->gen;
+		q->in_use += ndesc;
+		pidx = q->pidx;
+		q->pidx += ndesc;
+		if (q->pidx >= q->size) {
+			q->pidx -= q->size;
+			q->gen ^= 1;
+		}
+		__skb_unlink(skb, &q->sendq);
+		spin_unlock(&q->lock);
+
+		write_ofld_wr(adap, skb, q, pidx, gen, ndesc);
+		spin_lock(&q->lock);
+	}
+	spin_unlock(&q->lock);
+
+#if USE_GTS
+	set_bit(TXQ_RUNNING, &q->flags);
+	set_bit(TXQ_LAST_PKT_DB, &q->flags);
+#endif
+	wmb();
+	t3_write_reg(adap, A_SG_KDOORBELL,
+		     F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
+}
+
+/**
+ *	queue_set - return the queue set a packet should use
+ *	@skb: the packet
+ *
+ *	Maps a packet to the SGE queue set it should use.  The desired queue
+ *	set is carried in bits 1-3 in the packet's priority.
+ */
+static inline int queue_set(const struct sk_buff *skb)
+{
+	return skb->priority >> 1;
+}
+
+/**
+ *	is_ctrl_pkt - return whether an offload packet is a control packet
+ *	@skb: the packet
+ *
+ *	Determines whether an offload packet should use an OFLD or a CTRL
+ *	Tx queue.  This is indicated by bit 0 in the packet's priority.
+ */
+static inline int is_ctrl_pkt(const struct sk_buff *skb)
+{
+	return skb->priority & 1;
+}
+
+/**
+ *	t3_offload_tx - send an offload packet
+ *	@tdev: the offload device to send to
+ *	@skb: the packet
+ *
+ *	Sends an offload packet.  We use the packet priority to select the
+ *	appropriate Tx queue as follows: bit 0 indicates whether the packet
+ *	should be sent as regular or control, bits 1-3 select the queue set.
+ */
+int t3_offload_tx(struct t3cdev *tdev, struct sk_buff *skb)
+{
+	struct adapter *adap = tdev2adap(tdev);
+	struct sge_qset *qs = &adap->sge.qs[queue_set(skb)];
+
+	if (unlikely(is_ctrl_pkt(skb)))
+		return ctrl_xmit(adap, &qs->txq[TXQ_CTRL], skb);
+
+	return ofld_xmit(adap, &qs->txq[TXQ_OFLD], skb);
+}
+
+/**
+ *	offload_enqueue - add an offload packet to an SGE offload receive queue
+ *	@q: the SGE response queue
+ *	@skb: the packet
+ *
+ *	Add a new offload packet to an SGE response queue's offload packet
+ *	queue.  If the packet is the first on the queue it schedules the RX
+ *	softirq to process the queue.
+ */
+static inline void offload_enqueue(struct sge_rspq *q, struct sk_buff *skb)
+{
+	int was_empty = skb_queue_empty(&q->rx_queue);
+
+	__skb_queue_tail(&q->rx_queue, skb);
+
+	if (was_empty) {
+		struct sge_qset *qs = rspq_to_qset(q);
+
+		napi_schedule(&qs->napi);
+	}
+}
+
+/**
+ *	deliver_partial_bundle - deliver a (partial) bundle of Rx offload pkts
+ *	@tdev: the offload device that will be receiving the packets
+ *	@q: the SGE response queue that assembled the bundle
+ *	@skbs: the partial bundle
+ *	@n: the number of packets in the bundle
+ *
+ *	Delivers a (partial) bundle of Rx offload packets to an offload device.
+ */
+static inline void deliver_partial_bundle(struct t3cdev *tdev,
+					  struct sge_rspq *q,
+					  struct sk_buff *skbs[], int n)
+{
+	if (n) {
+		q->offload_bundles++;
+		tdev->recv(tdev, skbs, n);
+	}
+}
+
+/**
+ *	ofld_poll - NAPI handler for offload packets in interrupt mode
+ *	@dev: the network device doing the polling
+ *	@budget: polling budget
+ *
+ *	The NAPI handler for offload packets when a response queue is serviced
+ *	by the hard interrupt handler, i.e., when it's operating in non-polling
+ *	mode.  Creates small packet batches and sends them through the offload
+ *	receive handler.  Batches need to be of modest size as we do prefetches
+ *	on the packets in each.
+ */
+static int ofld_poll(struct napi_struct *napi, int budget)
+{
+	struct sge_qset *qs = container_of(napi, struct sge_qset, napi);
+	struct sge_rspq *q = &qs->rspq;
+	struct adapter *adapter = qs->adap;
+	int work_done = 0;
+
+	while (work_done < budget) {
+		struct sk_buff *skb, *tmp, *skbs[RX_BUNDLE_SIZE];
+		struct sk_buff_head queue;
+		int ngathered;
+
+		spin_lock_irq(&q->lock);
+		__skb_queue_head_init(&queue);
+		skb_queue_splice_init(&q->rx_queue, &queue);
+		if (skb_queue_empty(&queue)) {
+			napi_complete(napi);
+			spin_unlock_irq(&q->lock);
+			return work_done;
+		}
+		spin_unlock_irq(&q->lock);
+
+		ngathered = 0;
+		skb_queue_walk_safe(&queue, skb, tmp) {
+			if (work_done >= budget)
+				break;
+			work_done++;
+
+			__skb_unlink(skb, &queue);
+			prefetch(skb->data);
+			skbs[ngathered] = skb;
+			if (++ngathered == RX_BUNDLE_SIZE) {
+				q->offload_bundles++;
+				adapter->tdev.recv(&adapter->tdev, skbs,
+						   ngathered);
+				ngathered = 0;
+			}
+		}
+		if (!skb_queue_empty(&queue)) {
+			/* splice remaining packets back onto Rx queue */
+			spin_lock_irq(&q->lock);
+			skb_queue_splice(&queue, &q->rx_queue);
+			spin_unlock_irq(&q->lock);
+		}
+		deliver_partial_bundle(&adapter->tdev, q, skbs, ngathered);
+	}
+
+	return work_done;
+}
+
+/**
+ *	rx_offload - process a received offload packet
+ *	@tdev: the offload device receiving the packet
+ *	@rq: the response queue that received the packet
+ *	@skb: the packet
+ *	@rx_gather: a gather list of packets if we are building a bundle
+ *	@gather_idx: index of the next available slot in the bundle
+ *
+ *	Process an ingress offload pakcet and add it to the offload ingress
+ *	queue. 	Returns the index of the next available slot in the bundle.
+ */
+static inline int rx_offload(struct t3cdev *tdev, struct sge_rspq *rq,
+			     struct sk_buff *skb, struct sk_buff *rx_gather[],
+			     unsigned int gather_idx)
+{
+	skb_reset_mac_header(skb);
+	skb_reset_network_header(skb);
+	skb_reset_transport_header(skb);
+
+	if (rq->polling) {
+		rx_gather[gather_idx++] = skb;
+		if (gather_idx == RX_BUNDLE_SIZE) {
+			tdev->recv(tdev, rx_gather, RX_BUNDLE_SIZE);
+			gather_idx = 0;
+			rq->offload_bundles++;
+		}
+	} else
+		offload_enqueue(rq, skb);
+
+	return gather_idx;
+}
+
+/**
+ *	restart_tx - check whether to restart suspended Tx queues
+ *	@qs: the queue set to resume
+ *
+ *	Restarts suspended Tx queues of an SGE queue set if they have enough
+ *	free resources to resume operation.
+ */
+static void restart_tx(struct sge_qset *qs)
+{
+	if (test_bit(TXQ_ETH, &qs->txq_stopped) &&
+	    should_restart_tx(&qs->txq[TXQ_ETH]) &&
+	    test_and_clear_bit(TXQ_ETH, &qs->txq_stopped)) {
+		qs->txq[TXQ_ETH].restarts++;
+		if (netif_running(qs->netdev))
+			netif_tx_wake_queue(qs->tx_q);
+	}
+
+	if (test_bit(TXQ_OFLD, &qs->txq_stopped) &&
+	    should_restart_tx(&qs->txq[TXQ_OFLD]) &&
+	    test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped)) {
+		qs->txq[TXQ_OFLD].restarts++;
+		tasklet_schedule(&qs->txq[TXQ_OFLD].qresume_tsk);
+	}
+	if (test_bit(TXQ_CTRL, &qs->txq_stopped) &&
+	    should_restart_tx(&qs->txq[TXQ_CTRL]) &&
+	    test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped)) {
+		qs->txq[TXQ_CTRL].restarts++;
+		tasklet_schedule(&qs->txq[TXQ_CTRL].qresume_tsk);
+	}
+}
+
+/**
+ *	cxgb3_arp_process - process an ARP request probing a private IP address
+ *	@adapter: the adapter
+ *	@skb: the skbuff containing the ARP request
+ *
+ *	Check if the ARP request is probing the private IP address
+ *	dedicated to iSCSI, generate an ARP reply if so.
+ */
+static void cxgb3_arp_process(struct port_info *pi, struct sk_buff *skb)
+{
+	struct net_device *dev = skb->dev;
+	struct arphdr *arp;
+	unsigned char *arp_ptr;
+	unsigned char *sha;
+	__be32 sip, tip;
+
+	if (!dev)
+		return;
+
+	skb_reset_network_header(skb);
+	arp = arp_hdr(skb);
+
+	if (arp->ar_op != htons(ARPOP_REQUEST))
+		return;
+
+	arp_ptr = (unsigned char *)(arp + 1);
+	sha = arp_ptr;
+	arp_ptr += dev->addr_len;
+	memcpy(&sip, arp_ptr, sizeof(sip));
+	arp_ptr += sizeof(sip);
+	arp_ptr += dev->addr_len;
+	memcpy(&tip, arp_ptr, sizeof(tip));
+
+	if (tip != pi->iscsi_ipv4addr)
+		return;
+
+	arp_send(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha,
+		 pi->iscsic.mac_addr, sha);
+
+}
+
+static inline int is_arp(struct sk_buff *skb)
+{
+	return skb->protocol == htons(ETH_P_ARP);
+}
+
+static void cxgb3_process_iscsi_prov_pack(struct port_info *pi,
+					struct sk_buff *skb)
+{
+	if (is_arp(skb)) {
+		cxgb3_arp_process(pi, skb);
+		return;
+	}
+
+	if (pi->iscsic.recv)
+		pi->iscsic.recv(pi, skb);
+
+}
+
+/**
+ *	rx_eth - process an ingress ethernet packet
+ *	@adap: the adapter
+ *	@rq: the response queue that received the packet
+ *	@skb: the packet
+ *	@pad: amount of padding at the start of the buffer
+ *
+ *	Process an ingress ethernet pakcet and deliver it to the stack.
+ *	The padding is 2 if the packet was delivered in an Rx buffer and 0
+ *	if it was immediate data in a response.
+ */
+static void rx_eth(struct adapter *adap, struct sge_rspq *rq,
+		   struct sk_buff *skb, int pad, int lro)
+{
+	struct cpl_rx_pkt *p = (struct cpl_rx_pkt *)(skb->data + pad);
+	struct sge_qset *qs = rspq_to_qset(rq);
+	struct port_info *pi;
+
+	skb_pull(skb, sizeof(*p) + pad);
+	skb->protocol = eth_type_trans(skb, adap->port[p->iff]);
+	pi = netdev_priv(skb->dev);
+	if ((skb->dev->features & NETIF_F_RXCSUM) && p->csum_valid &&
+	    p->csum == htons(0xffff) && !p->fragment) {
+		qs->port_stats[SGE_PSTAT_RX_CSUM_GOOD]++;
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+	} else
+		skb_checksum_none_assert(skb);
+	skb_record_rx_queue(skb, qs - &adap->sge.qs[pi->first_qset]);
+
+	if (p->vlan_valid) {
+		qs->port_stats[SGE_PSTAT_VLANEX]++;
+		__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), ntohs(p->vlan));
+	}
+	if (rq->polling) {
+		if (lro)
+			napi_gro_receive(&qs->napi, skb);
+		else {
+			if (unlikely(pi->iscsic.flags))
+				cxgb3_process_iscsi_prov_pack(pi, skb);
+			netif_receive_skb(skb);
+		}
+	} else
+		netif_rx(skb);
+}
+
+static inline int is_eth_tcp(u32 rss)
+{
+	return G_HASHTYPE(ntohl(rss)) == RSS_HASH_4_TUPLE;
+}
+
+/**
+ *	lro_add_page - add a page chunk to an LRO session
+ *	@adap: the adapter
+ *	@qs: the associated queue set
+ *	@fl: the free list containing the page chunk to add
+ *	@len: packet length
+ *	@complete: Indicates the last fragment of a frame
+ *
+ *	Add a received packet contained in a page chunk to an existing LRO
+ *	session.
+ */
+static void lro_add_page(struct adapter *adap, struct sge_qset *qs,
+			 struct sge_fl *fl, int len, int complete)
+{
+	struct rx_sw_desc *sd = &fl->sdesc[fl->cidx];
+	struct port_info *pi = netdev_priv(qs->netdev);
+	struct sk_buff *skb = NULL;
+	struct cpl_rx_pkt *cpl;
+	struct skb_frag_struct *rx_frag;
+	int nr_frags;
+	int offset = 0;
+
+	if (!qs->nomem) {
+		skb = napi_get_frags(&qs->napi);
+		qs->nomem = !skb;
+	}
+
+	fl->credits--;
+
+	pci_dma_sync_single_for_cpu(adap->pdev,
+				    dma_unmap_addr(sd, dma_addr),
+				    fl->buf_size - SGE_PG_RSVD,
+				    PCI_DMA_FROMDEVICE);
+
+	(*sd->pg_chunk.p_cnt)--;
+	if (!*sd->pg_chunk.p_cnt && sd->pg_chunk.page != fl->pg_chunk.page)
+		pci_unmap_page(adap->pdev,
+			       sd->pg_chunk.mapping,
+			       fl->alloc_size,
+			       PCI_DMA_FROMDEVICE);
+
+	if (!skb) {
+		put_page(sd->pg_chunk.page);
+		if (complete)
+			qs->nomem = 0;
+		return;
+	}
+
+	rx_frag = skb_shinfo(skb)->frags;
+	nr_frags = skb_shinfo(skb)->nr_frags;
+
+	if (!nr_frags) {
+		offset = 2 + sizeof(struct cpl_rx_pkt);
+		cpl = qs->lro_va = sd->pg_chunk.va + 2;
+
+		if ((qs->netdev->features & NETIF_F_RXCSUM) &&
+		     cpl->csum_valid && cpl->csum == htons(0xffff)) {
+			skb->ip_summed = CHECKSUM_UNNECESSARY;
+			qs->port_stats[SGE_PSTAT_RX_CSUM_GOOD]++;
+		} else
+			skb->ip_summed = CHECKSUM_NONE;
+	} else
+		cpl = qs->lro_va;
+
+	len -= offset;
+
+	rx_frag += nr_frags;
+	__skb_frag_set_page(rx_frag, sd->pg_chunk.page);
+	rx_frag->page_offset = sd->pg_chunk.offset + offset;
+	skb_frag_size_set(rx_frag, len);
+
+	skb->len += len;
+	skb->data_len += len;
+	skb->truesize += len;
+	skb_shinfo(skb)->nr_frags++;
+
+	if (!complete)
+		return;
+
+	skb_record_rx_queue(skb, qs - &adap->sge.qs[pi->first_qset]);
+
+	if (cpl->vlan_valid) {
+		qs->port_stats[SGE_PSTAT_VLANEX]++;
+		__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), ntohs(cpl->vlan));
+	}
+	napi_gro_frags(&qs->napi);
+}
+
+/**
+ *	handle_rsp_cntrl_info - handles control information in a response
+ *	@qs: the queue set corresponding to the response
+ *	@flags: the response control flags
+ *
+ *	Handles the control information of an SGE response, such as GTS
+ *	indications and completion credits for the queue set's Tx queues.
+ *	HW coalesces credits, we don't do any extra SW coalescing.
+ */
+static inline void handle_rsp_cntrl_info(struct sge_qset *qs, u32 flags)
+{
+	unsigned int credits;
+
+#if USE_GTS
+	if (flags & F_RSPD_TXQ0_GTS)
+		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_ETH].flags);
+#endif
+
+	credits = G_RSPD_TXQ0_CR(flags);
+	if (credits)
+		qs->txq[TXQ_ETH].processed += credits;
+
+	credits = G_RSPD_TXQ2_CR(flags);
+	if (credits)
+		qs->txq[TXQ_CTRL].processed += credits;
+
+# if USE_GTS
+	if (flags & F_RSPD_TXQ1_GTS)
+		clear_bit(TXQ_RUNNING, &qs->txq[TXQ_OFLD].flags);
+# endif
+	credits = G_RSPD_TXQ1_CR(flags);
+	if (credits)
+		qs->txq[TXQ_OFLD].processed += credits;
+}
+
+/**
+ *	check_ring_db - check if we need to ring any doorbells
+ *	@adapter: the adapter
+ *	@qs: the queue set whose Tx queues are to be examined
+ *	@sleeping: indicates which Tx queue sent GTS
+ *
+ *	Checks if some of a queue set's Tx queues need to ring their doorbells
+ *	to resume transmission after idling while they still have unprocessed
+ *	descriptors.
+ */
+static void check_ring_db(struct adapter *adap, struct sge_qset *qs,
+			  unsigned int sleeping)
+{
+	if (sleeping & F_RSPD_TXQ0_GTS) {
+		struct sge_txq *txq = &qs->txq[TXQ_ETH];
+
+		if (txq->cleaned + txq->in_use != txq->processed &&
+		    !test_and_set_bit(TXQ_LAST_PKT_DB, &txq->flags)) {
+			set_bit(TXQ_RUNNING, &txq->flags);
+			t3_write_reg(adap, A_SG_KDOORBELL, F_SELEGRCNTX |
+				     V_EGRCNTX(txq->cntxt_id));
+		}
+	}
+
+	if (sleeping & F_RSPD_TXQ1_GTS) {
+		struct sge_txq *txq = &qs->txq[TXQ_OFLD];
+
+		if (txq->cleaned + txq->in_use != txq->processed &&
+		    !test_and_set_bit(TXQ_LAST_PKT_DB, &txq->flags)) {
+			set_bit(TXQ_RUNNING, &txq->flags);
+			t3_write_reg(adap, A_SG_KDOORBELL, F_SELEGRCNTX |
+				     V_EGRCNTX(txq->cntxt_id));
+		}
+	}
+}
+
+/**
+ *	is_new_response - check if a response is newly written
+ *	@r: the response descriptor
+ *	@q: the response queue
+ *
+ *	Returns true if a response descriptor contains a yet unprocessed
+ *	response.
+ */
+static inline int is_new_response(const struct rsp_desc *r,
+				  const struct sge_rspq *q)
+{
+	return (r->intr_gen & F_RSPD_GEN2) == q->gen;
+}
+
+static inline void clear_rspq_bufstate(struct sge_rspq * const q)
+{
+	q->pg_skb = NULL;
+	q->rx_recycle_buf = 0;
+}
+
+#define RSPD_GTS_MASK  (F_RSPD_TXQ0_GTS | F_RSPD_TXQ1_GTS)
+#define RSPD_CTRL_MASK (RSPD_GTS_MASK | \
+			V_RSPD_TXQ0_CR(M_RSPD_TXQ0_CR) | \
+			V_RSPD_TXQ1_CR(M_RSPD_TXQ1_CR) | \
+			V_RSPD_TXQ2_CR(M_RSPD_TXQ2_CR))
+
+/* How long to delay the next interrupt in case of memory shortage, in 0.1us. */
+#define NOMEM_INTR_DELAY 2500
+
+/**
+ *	process_responses - process responses from an SGE response queue
+ *	@adap: the adapter
+ *	@qs: the queue set to which the response queue belongs
+ *	@budget: how many responses can be processed in this round
+ *
+ *	Process responses from an SGE response queue up to the supplied budget.
+ *	Responses include received packets as well as credits and other events
+ *	for the queues that belong to the response queue's queue set.
+ *	A negative budget is effectively unlimited.
+ *
+ *	Additionally choose the interrupt holdoff time for the next interrupt
+ *	on this queue.  If the system is under memory shortage use a fairly
+ *	long delay to help recovery.
+ */
+static int process_responses(struct adapter *adap, struct sge_qset *qs,
+			     int budget)
+{
+	struct sge_rspq *q = &qs->rspq;
+	struct rsp_desc *r = &q->desc[q->cidx];
+	int budget_left = budget;
+	unsigned int sleeping = 0;
+	struct sk_buff *offload_skbs[RX_BUNDLE_SIZE];
+	int ngathered = 0;
+
+	q->next_holdoff = q->holdoff_tmr;
+
+	while (likely(budget_left && is_new_response(r, q))) {
+		int packet_complete, eth, ethpad = 2;
+		int lro = !!(qs->netdev->features & NETIF_F_GRO);
+		struct sk_buff *skb = NULL;
+		u32 len, flags;
+		__be32 rss_hi, rss_lo;
+
+		rmb();
+		eth = r->rss_hdr.opcode == CPL_RX_PKT;
+		rss_hi = *(const __be32 *)r;
+		rss_lo = r->rss_hdr.rss_hash_val;
+		flags = ntohl(r->flags);
+
+		if (unlikely(flags & F_RSPD_ASYNC_NOTIF)) {
+			skb = alloc_skb(AN_PKT_SIZE, GFP_ATOMIC);
+			if (!skb)
+				goto no_mem;
+
+			memcpy(__skb_put(skb, AN_PKT_SIZE), r, AN_PKT_SIZE);
+			skb->data[0] = CPL_ASYNC_NOTIF;
+			rss_hi = htonl(CPL_ASYNC_NOTIF << 24);
+			q->async_notif++;
+		} else if (flags & F_RSPD_IMM_DATA_VALID) {
+			skb = get_imm_packet(r);
+			if (unlikely(!skb)) {
+no_mem:
+				q->next_holdoff = NOMEM_INTR_DELAY;
+				q->nomem++;
+				/* consume one credit since we tried */
+				budget_left--;
+				break;
+			}
+			q->imm_data++;
+			ethpad = 0;
+		} else if ((len = ntohl(r->len_cq)) != 0) {
+			struct sge_fl *fl;
+
+			lro &= eth && is_eth_tcp(rss_hi);
+
+			fl = (len & F_RSPD_FLQ) ? &qs->fl[1] : &qs->fl[0];
+			if (fl->use_pages) {
+				void *addr = fl->sdesc[fl->cidx].pg_chunk.va;
+
+				prefetch(addr);
+#if L1_CACHE_BYTES < 128
+				prefetch(addr + L1_CACHE_BYTES);
+#endif
+				__refill_fl(adap, fl);
+				if (lro > 0) {
+					lro_add_page(adap, qs, fl,
+						     G_RSPD_LEN(len),
+						     flags & F_RSPD_EOP);
+					 goto next_fl;
+				}
+
+				skb = get_packet_pg(adap, fl, q,
+						    G_RSPD_LEN(len),
+						    eth ?
+						    SGE_RX_DROP_THRES : 0);
+				q->pg_skb = skb;
+			} else
+				skb = get_packet(adap, fl, G_RSPD_LEN(len),
+						 eth ? SGE_RX_DROP_THRES : 0);
+			if (unlikely(!skb)) {
+				if (!eth)
+					goto no_mem;
+				q->rx_drops++;
+			} else if (unlikely(r->rss_hdr.opcode == CPL_TRACE_PKT))
+				__skb_pull(skb, 2);
+next_fl:
+			if (++fl->cidx == fl->size)
+				fl->cidx = 0;
+		} else
+			q->pure_rsps++;
+
+		if (flags & RSPD_CTRL_MASK) {
+			sleeping |= flags & RSPD_GTS_MASK;
+			handle_rsp_cntrl_info(qs, flags);
+		}
+
+		r++;
+		if (unlikely(++q->cidx == q->size)) {
+			q->cidx = 0;
+			q->gen ^= 1;
+			r = q->desc;
+		}
+		prefetch(r);
+
+		if (++q->credits >= (q->size / 4)) {
+			refill_rspq(adap, q, q->credits);
+			q->credits = 0;
+		}
+
+		packet_complete = flags &
+				  (F_RSPD_EOP | F_RSPD_IMM_DATA_VALID |
+				   F_RSPD_ASYNC_NOTIF);
+
+		if (skb != NULL && packet_complete) {
+			if (eth)
+				rx_eth(adap, q, skb, ethpad, lro);
+			else {
+				q->offload_pkts++;
+				/* Preserve the RSS info in csum & priority */
+				skb->csum = rss_hi;
+				skb->priority = rss_lo;
+				ngathered = rx_offload(&adap->tdev, q, skb,
+						       offload_skbs,
+						       ngathered);
+			}
+
+			if (flags & F_RSPD_EOP)
+				clear_rspq_bufstate(q);
+		}
+		--budget_left;
+	}
+
+	deliver_partial_bundle(&adap->tdev, q, offload_skbs, ngathered);
+
+	if (sleeping)
+		check_ring_db(adap, qs, sleeping);
+
+	smp_mb();		/* commit Tx queue .processed updates */
+	if (unlikely(qs->txq_stopped != 0))
+		restart_tx(qs);
+
+	budget -= budget_left;
+	return budget;
+}
+
+static inline int is_pure_response(const struct rsp_desc *r)
+{
+	__be32 n = r->flags & htonl(F_RSPD_ASYNC_NOTIF | F_RSPD_IMM_DATA_VALID);
+
+	return (n | r->len_cq) == 0;
+}
+
+/**
+ *	napi_rx_handler - the NAPI handler for Rx processing
+ *	@napi: the napi instance
+ *	@budget: how many packets we can process in this round
+ *
+ *	Handler for new data events when using NAPI.
+ */
+static int napi_rx_handler(struct napi_struct *napi, int budget)
+{
+	struct sge_qset *qs = container_of(napi, struct sge_qset, napi);
+	struct adapter *adap = qs->adap;
+	int work_done = process_responses(adap, qs, budget);
+
+	if (likely(work_done < budget)) {
+		napi_complete(napi);
+
+		/*
+		 * Because we don't atomically flush the following
+		 * write it is possible that in very rare cases it can
+		 * reach the device in a way that races with a new
+		 * response being written plus an error interrupt
+		 * causing the NAPI interrupt handler below to return
+		 * unhandled status to the OS.  To protect against
+		 * this would require flushing the write and doing
+		 * both the write and the flush with interrupts off.
+		 * Way too expensive and unjustifiable given the
+		 * rarity of the race.
+		 *
+		 * The race cannot happen at all with MSI-X.
+		 */
+		t3_write_reg(adap, A_SG_GTS, V_RSPQ(qs->rspq.cntxt_id) |
+			     V_NEWTIMER(qs->rspq.next_holdoff) |
+			     V_NEWINDEX(qs->rspq.cidx));
+	}
+	return work_done;
+}
+
+/*
+ * Returns true if the device is already scheduled for polling.
+ */
+static inline int napi_is_scheduled(struct napi_struct *napi)
+{
+	return test_bit(NAPI_STATE_SCHED, &napi->state);
+}
+
+/**
+ *	process_pure_responses - process pure responses from a response queue
+ *	@adap: the adapter
+ *	@qs: the queue set owning the response queue
+ *	@r: the first pure response to process
+ *
+ *	A simpler version of process_responses() that handles only pure (i.e.,
+ *	non data-carrying) responses.  Such respones are too light-weight to
+ *	justify calling a softirq under NAPI, so we handle them specially in
+ *	the interrupt handler.  The function is called with a pointer to a
+ *	response, which the caller must ensure is a valid pure response.
+ *
+ *	Returns 1 if it encounters a valid data-carrying response, 0 otherwise.
+ */
+static int process_pure_responses(struct adapter *adap, struct sge_qset *qs,
+				  struct rsp_desc *r)
+{
+	struct sge_rspq *q = &qs->rspq;
+	unsigned int sleeping = 0;
+
+	do {
+		u32 flags = ntohl(r->flags);
+
+		r++;
+		if (unlikely(++q->cidx == q->size)) {
+			q->cidx = 0;
+			q->gen ^= 1;
+			r = q->desc;
+		}
+		prefetch(r);
+
+		if (flags & RSPD_CTRL_MASK) {
+			sleeping |= flags & RSPD_GTS_MASK;
+			handle_rsp_cntrl_info(qs, flags);
+		}
+
+		q->pure_rsps++;
+		if (++q->credits >= (q->size / 4)) {
+			refill_rspq(adap, q, q->credits);
+			q->credits = 0;
+		}
+		if (!is_new_response(r, q))
+			break;
+		rmb();
+	} while (is_pure_response(r));
+
+	if (sleeping)
+		check_ring_db(adap, qs, sleeping);
+
+	smp_mb();		/* commit Tx queue .processed updates */
+	if (unlikely(qs->txq_stopped != 0))
+		restart_tx(qs);
+
+	return is_new_response(r, q);
+}
+
+/**
+ *	handle_responses - decide what to do with new responses in NAPI mode
+ *	@adap: the adapter
+ *	@q: the response queue
+ *
+ *	This is used by the NAPI interrupt handlers to decide what to do with
+ *	new SGE responses.  If there are no new responses it returns -1.  If
+ *	there are new responses and they are pure (i.e., non-data carrying)
+ *	it handles them straight in hard interrupt context as they are very
+ *	cheap and don't deliver any packets.  Finally, if there are any data
+ *	signaling responses it schedules the NAPI handler.  Returns 1 if it
+ *	schedules NAPI, 0 if all new responses were pure.
+ *
+ *	The caller must ascertain NAPI is not already running.
+ */
+static inline int handle_responses(struct adapter *adap, struct sge_rspq *q)
+{
+	struct sge_qset *qs = rspq_to_qset(q);
+	struct rsp_desc *r = &q->desc[q->cidx];
+
+	if (!is_new_response(r, q))
+		return -1;
+	rmb();
+	if (is_pure_response(r) && process_pure_responses(adap, qs, r) == 0) {
+		t3_write_reg(adap, A_SG_GTS, V_RSPQ(q->cntxt_id) |
+			     V_NEWTIMER(q->holdoff_tmr) | V_NEWINDEX(q->cidx));
+		return 0;
+	}
+	napi_schedule(&qs->napi);
+	return 1;
+}
+
+/*
+ * The MSI-X interrupt handler for an SGE response queue for the non-NAPI case
+ * (i.e., response queue serviced in hard interrupt).
+ */
+static irqreturn_t t3_sge_intr_msix(int irq, void *cookie)
+{
+	struct sge_qset *qs = cookie;
+	struct adapter *adap = qs->adap;
+	struct sge_rspq *q = &qs->rspq;
+
+	spin_lock(&q->lock);
+	if (process_responses(adap, qs, -1) == 0)
+		q->unhandled_irqs++;
+	t3_write_reg(adap, A_SG_GTS, V_RSPQ(q->cntxt_id) |
+		     V_NEWTIMER(q->next_holdoff) | V_NEWINDEX(q->cidx));
+	spin_unlock(&q->lock);
+	return IRQ_HANDLED;
+}
+
+/*
+ * The MSI-X interrupt handler for an SGE response queue for the NAPI case
+ * (i.e., response queue serviced by NAPI polling).
+ */
+static irqreturn_t t3_sge_intr_msix_napi(int irq, void *cookie)
+{
+	struct sge_qset *qs = cookie;
+	struct sge_rspq *q = &qs->rspq;
+
+	spin_lock(&q->lock);
+
+	if (handle_responses(qs->adap, q) < 0)
+		q->unhandled_irqs++;
+	spin_unlock(&q->lock);
+	return IRQ_HANDLED;
+}
+
+/*
+ * The non-NAPI MSI interrupt handler.  This needs to handle data events from
+ * SGE response queues as well as error and other async events as they all use
+ * the same MSI vector.  We use one SGE response queue per port in this mode
+ * and protect all response queues with queue 0's lock.
+ */
+static irqreturn_t t3_intr_msi(int irq, void *cookie)
+{
+	int new_packets = 0;
+	struct adapter *adap = cookie;
+	struct sge_rspq *q = &adap->sge.qs[0].rspq;
+
+	spin_lock(&q->lock);
+
+	if (process_responses(adap, &adap->sge.qs[0], -1)) {
+		t3_write_reg(adap, A_SG_GTS, V_RSPQ(q->cntxt_id) |
+			     V_NEWTIMER(q->next_holdoff) | V_NEWINDEX(q->cidx));
+		new_packets = 1;
+	}
+
+	if (adap->params.nports == 2 &&
+	    process_responses(adap, &adap->sge.qs[1], -1)) {
+		struct sge_rspq *q1 = &adap->sge.qs[1].rspq;
+
+		t3_write_reg(adap, A_SG_GTS, V_RSPQ(q1->cntxt_id) |
+			     V_NEWTIMER(q1->next_holdoff) |
+			     V_NEWINDEX(q1->cidx));
+		new_packets = 1;
+	}
+
+	if (!new_packets && t3_slow_intr_handler(adap) == 0)
+		q->unhandled_irqs++;
+
+	spin_unlock(&q->lock);
+	return IRQ_HANDLED;
+}
+
+static int rspq_check_napi(struct sge_qset *qs)
+{
+	struct sge_rspq *q = &qs->rspq;
+
+	if (!napi_is_scheduled(&qs->napi) &&
+	    is_new_response(&q->desc[q->cidx], q)) {
+		napi_schedule(&qs->napi);
+		return 1;
+	}
+	return 0;
+}
+
+/*
+ * The MSI interrupt handler for the NAPI case (i.e., response queues serviced
+ * by NAPI polling).  Handles data events from SGE response queues as well as
+ * error and other async events as they all use the same MSI vector.  We use
+ * one SGE response queue per port in this mode and protect all response
+ * queues with queue 0's lock.
+ */
+static irqreturn_t t3_intr_msi_napi(int irq, void *cookie)
+{
+	int new_packets;
+	struct adapter *adap = cookie;
+	struct sge_rspq *q = &adap->sge.qs[0].rspq;
+
+	spin_lock(&q->lock);
+
+	new_packets = rspq_check_napi(&adap->sge.qs[0]);
+	if (adap->params.nports == 2)
+		new_packets += rspq_check_napi(&adap->sge.qs[1]);
+	if (!new_packets && t3_slow_intr_handler(adap) == 0)
+		q->unhandled_irqs++;
+
+	spin_unlock(&q->lock);
+	return IRQ_HANDLED;
+}
+
+/*
+ * A helper function that processes responses and issues GTS.
+ */
+static inline int process_responses_gts(struct adapter *adap,
+					struct sge_rspq *rq)
+{
+	int work;
+
+	work = process_responses(adap, rspq_to_qset(rq), -1);
+	t3_write_reg(adap, A_SG_GTS, V_RSPQ(rq->cntxt_id) |
+		     V_NEWTIMER(rq->next_holdoff) | V_NEWINDEX(rq->cidx));
+	return work;
+}
+
+/*
+ * The legacy INTx interrupt handler.  This needs to handle data events from
+ * SGE response queues as well as error and other async events as they all use
+ * the same interrupt pin.  We use one SGE response queue per port in this mode
+ * and protect all response queues with queue 0's lock.
+ */
+static irqreturn_t t3_intr(int irq, void *cookie)
+{
+	int work_done, w0, w1;
+	struct adapter *adap = cookie;
+	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
+	struct sge_rspq *q1 = &adap->sge.qs[1].rspq;
+
+	spin_lock(&q0->lock);
+
+	w0 = is_new_response(&q0->desc[q0->cidx], q0);
+	w1 = adap->params.nports == 2 &&
+	    is_new_response(&q1->desc[q1->cidx], q1);
+
+	if (likely(w0 | w1)) {
+		t3_write_reg(adap, A_PL_CLI, 0);
+		t3_read_reg(adap, A_PL_CLI);	/* flush */
+
+		if (likely(w0))
+			process_responses_gts(adap, q0);
+
+		if (w1)
+			process_responses_gts(adap, q1);
+
+		work_done = w0 | w1;
+	} else
+		work_done = t3_slow_intr_handler(adap);
+
+	spin_unlock(&q0->lock);
+	return IRQ_RETVAL(work_done != 0);
+}
+
+/*
+ * Interrupt handler for legacy INTx interrupts for T3B-based cards.
+ * Handles data events from SGE response queues as well as error and other
+ * async events as they all use the same interrupt pin.  We use one SGE
+ * response queue per port in this mode and protect all response queues with
+ * queue 0's lock.
+ */
+static irqreturn_t t3b_intr(int irq, void *cookie)
+{
+	u32 map;
+	struct adapter *adap = cookie;
+	struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
+
+	t3_write_reg(adap, A_PL_CLI, 0);
+	map = t3_read_reg(adap, A_SG_DATA_INTR);
+
+	if (unlikely(!map))	/* shared interrupt, most likely */
+		return IRQ_NONE;
+
+	spin_lock(&q0->lock);
+
+	if (unlikely(map & F_ERRINTR))
+		t3_slow_intr_handler(adap);
+
+	if (likely(map & 1))
+		process_responses_gts(adap, q0);
+
+	if (map & 2)
+		process_responses_gts(adap, &adap->sge.qs[1].rspq);
+
+	spin_unlock(&q0->lock);
+	return IRQ_HANDLED;
+}
+
+/*
+ * NAPI interrupt handler for legacy INTx interrupts for T3B-based cards.
+ * Handles data events from SGE response queues as well as error and other
+ * async events as they all use the same interrupt pin.  We use one SGE
+ * response queue per port in this mode and protect all response queues with
+ * queue 0's lock.
+ */
+static irqreturn_t t3b_intr_napi(int irq, void *cookie)
+{
+	u32 map;
+	struct adapter *adap = cookie;
+	struct sge_qset *qs0 = &adap->sge.qs[0];
+	struct sge_rspq *q0 = &qs0->rspq;
+
+	t3_write_reg(adap, A_PL_CLI, 0);
+	map = t3_read_reg(adap, A_SG_DATA_INTR);
+
+	if (unlikely(!map))	/* shared interrupt, most likely */
+		return IRQ_NONE;
+
+	spin_lock(&q0->lock);
+
+	if (unlikely(map & F_ERRINTR))
+		t3_slow_intr_handler(adap);
+
+	if (likely(map & 1))
+		napi_schedule(&qs0->napi);
+
+	if (map & 2)
+		napi_schedule(&adap->sge.qs[1].napi);
+
+	spin_unlock(&q0->lock);
+	return IRQ_HANDLED;
+}
+
+/**
+ *	t3_intr_handler - select the top-level interrupt handler
+ *	@adap: the adapter
+ *	@polling: whether using NAPI to service response queues
+ *
+ *	Selects the top-level interrupt handler based on the type of interrupts
+ *	(MSI-X, MSI, or legacy) and whether NAPI will be used to service the
+ *	response queues.
+ */
+irq_handler_t t3_intr_handler(struct adapter *adap, int polling)
+{
+	if (adap->flags & USING_MSIX)
+		return polling ? t3_sge_intr_msix_napi : t3_sge_intr_msix;
+	if (adap->flags & USING_MSI)
+		return polling ? t3_intr_msi_napi : t3_intr_msi;
+	if (adap->params.rev > 0)
+		return polling ? t3b_intr_napi : t3b_intr;
+	return t3_intr;
+}
+
+#define SGE_PARERR (F_CPPARITYERROR | F_OCPARITYERROR | F_RCPARITYERROR | \
+		    F_IRPARITYERROR | V_ITPARITYERROR(M_ITPARITYERROR) | \
+		    V_FLPARITYERROR(M_FLPARITYERROR) | F_LODRBPARITYERROR | \
+		    F_HIDRBPARITYERROR | F_LORCQPARITYERROR | \
+		    F_HIRCQPARITYERROR)
+#define SGE_FRAMINGERR (F_UC_REQ_FRAMINGERROR | F_R_REQ_FRAMINGERROR)
+#define SGE_FATALERR (SGE_PARERR | SGE_FRAMINGERR | F_RSPQCREDITOVERFOW | \
+		      F_RSPQDISABLED)
+
+/**
+ *	t3_sge_err_intr_handler - SGE async event interrupt handler
+ *	@adapter: the adapter
+ *
+ *	Interrupt handler for SGE asynchronous (non-data) events.
+ */
+void t3_sge_err_intr_handler(struct adapter *adapter)
+{
+	unsigned int v, status = t3_read_reg(adapter, A_SG_INT_CAUSE) &
+				 ~F_FLEMPTY;
+
+	if (status & SGE_PARERR)
+		CH_ALERT(adapter, "SGE parity error (0x%x)\n",
+			 status & SGE_PARERR);
+	if (status & SGE_FRAMINGERR)
+		CH_ALERT(adapter, "SGE framing error (0x%x)\n",
+			 status & SGE_FRAMINGERR);
+
+	if (status & F_RSPQCREDITOVERFOW)
+		CH_ALERT(adapter, "SGE response queue credit overflow\n");
+
+	if (status & F_RSPQDISABLED) {
+		v = t3_read_reg(adapter, A_SG_RSPQ_FL_STATUS);
+
+		CH_ALERT(adapter,
+			 "packet delivered to disabled response queue "
+			 "(0x%x)\n", (v >> S_RSPQ0DISABLED) & 0xff);
+	}
+
+	if (status & (F_HIPIODRBDROPERR | F_LOPIODRBDROPERR))
+		queue_work(cxgb3_wq, &adapter->db_drop_task);
+
+	if (status & (F_HIPRIORITYDBFULL | F_LOPRIORITYDBFULL))
+		queue_work(cxgb3_wq, &adapter->db_full_task);
+
+	if (status & (F_HIPRIORITYDBEMPTY | F_LOPRIORITYDBEMPTY))
+		queue_work(cxgb3_wq, &adapter->db_empty_task);
+
+	t3_write_reg(adapter, A_SG_INT_CAUSE, status);
+	if (status &  SGE_FATALERR)
+		t3_fatal_err(adapter);
+}
+
+/**
+ *	sge_timer_tx - perform periodic maintenance of an SGE qset
+ *	@data: the SGE queue set to maintain
+ *
+ *	Runs periodically from a timer to perform maintenance of an SGE queue
+ *	set.  It performs two tasks:
+ *
+ *	Cleans up any completed Tx descriptors that may still be pending.
+ *	Normal descriptor cleanup happens when new packets are added to a Tx
+ *	queue so this timer is relatively infrequent and does any cleanup only
+ *	if the Tx queue has not seen any new packets in a while.  We make a
+ *	best effort attempt to reclaim descriptors, in that we don't wait
+ *	around if we cannot get a queue's lock (which most likely is because
+ *	someone else is queueing new packets and so will also handle the clean
+ *	up).  Since control queues use immediate data exclusively we don't
+ *	bother cleaning them up here.
+ *
+ */
+static void sge_timer_tx(unsigned long data)
+{
+	struct sge_qset *qs = (struct sge_qset *)data;
+	struct port_info *pi = netdev_priv(qs->netdev);
+	struct adapter *adap = pi->adapter;
+	unsigned int tbd[SGE_TXQ_PER_SET] = {0, 0};
+	unsigned long next_period;
+
+	if (__netif_tx_trylock(qs->tx_q)) {
+                tbd[TXQ_ETH] = reclaim_completed_tx(adap, &qs->txq[TXQ_ETH],
+                                                     TX_RECLAIM_TIMER_CHUNK);
+		__netif_tx_unlock(qs->tx_q);
+	}
+
+	if (spin_trylock(&qs->txq[TXQ_OFLD].lock)) {
+		tbd[TXQ_OFLD] = reclaim_completed_tx(adap, &qs->txq[TXQ_OFLD],
+						     TX_RECLAIM_TIMER_CHUNK);
+		spin_unlock(&qs->txq[TXQ_OFLD].lock);
+	}
+
+	next_period = TX_RECLAIM_PERIOD >>
+                      (max(tbd[TXQ_ETH], tbd[TXQ_OFLD]) /
+                      TX_RECLAIM_TIMER_CHUNK);
+	mod_timer(&qs->tx_reclaim_timer, jiffies + next_period);
+}
+
+/**
+ *	sge_timer_rx - perform periodic maintenance of an SGE qset
+ *	@data: the SGE queue set to maintain
+ *
+ *	a) Replenishes Rx queues that have run out due to memory shortage.
+ *	Normally new Rx buffers are added when existing ones are consumed but
+ *	when out of memory a queue can become empty.  We try to add only a few
+ *	buffers here, the queue will be replenished fully as these new buffers
+ *	are used up if memory shortage has subsided.
+ *
+ *	b) Return coalesced response queue credits in case a response queue is
+ *	starved.
+ *
+ */
+static void sge_timer_rx(unsigned long data)
+{
+	spinlock_t *lock;
+	struct sge_qset *qs = (struct sge_qset *)data;
+	struct port_info *pi = netdev_priv(qs->netdev);
+	struct adapter *adap = pi->adapter;
+	u32 status;
+
+	lock = adap->params.rev > 0 ?
+	       &qs->rspq.lock : &adap->sge.qs[0].rspq.lock;
+
+	if (!spin_trylock_irq(lock))
+		goto out;
+
+	if (napi_is_scheduled(&qs->napi))
+		goto unlock;
+
+	if (adap->params.rev < 4) {
+		status = t3_read_reg(adap, A_SG_RSPQ_FL_STATUS);
+
+		if (status & (1 << qs->rspq.cntxt_id)) {
+			qs->rspq.starved++;
+			if (qs->rspq.credits) {
+				qs->rspq.credits--;
+				refill_rspq(adap, &qs->rspq, 1);
+				qs->rspq.restarted++;
+				t3_write_reg(adap, A_SG_RSPQ_FL_STATUS,
+					     1 << qs->rspq.cntxt_id);
+			}
+		}
+	}
+
+	if (qs->fl[0].credits < qs->fl[0].size)
+		__refill_fl(adap, &qs->fl[0]);
+	if (qs->fl[1].credits < qs->fl[1].size)
+		__refill_fl(adap, &qs->fl[1]);
+
+unlock:
+	spin_unlock_irq(lock);
+out:
+	mod_timer(&qs->rx_reclaim_timer, jiffies + RX_RECLAIM_PERIOD);
+}
+
+/**
+ *	t3_update_qset_coalesce - update coalescing settings for a queue set
+ *	@qs: the SGE queue set
+ *	@p: new queue set parameters
+ *
+ *	Update the coalescing settings for an SGE queue set.  Nothing is done
+ *	if the queue set is not initialized yet.
+ */
+void t3_update_qset_coalesce(struct sge_qset *qs, const struct qset_params *p)
+{
+	qs->rspq.holdoff_tmr = max(p->coalesce_usecs * 10, 1U);/* can't be 0 */
+	qs->rspq.polling = p->polling;
+	qs->napi.poll = p->polling ? napi_rx_handler : ofld_poll;
+}
+
+/**
+ *	t3_sge_alloc_qset - initialize an SGE queue set
+ *	@adapter: the adapter
+ *	@id: the queue set id
+ *	@nports: how many Ethernet ports will be using this queue set
+ *	@irq_vec_idx: the IRQ vector index for response queue interrupts
+ *	@p: configuration parameters for this queue set
+ *	@ntxq: number of Tx queues for the queue set
+ *	@netdev: net device associated with this queue set
+ *	@netdevq: net device TX queue associated with this queue set
+ *
+ *	Allocate resources and initialize an SGE queue set.  A queue set
+ *	comprises a response queue, two Rx free-buffer queues, and up to 3
+ *	Tx queues.  The Tx queues are assigned roles in the order Ethernet
+ *	queue, offload queue, and control queue.
+ */
+int t3_sge_alloc_qset(struct adapter *adapter, unsigned int id, int nports,
+		      int irq_vec_idx, const struct qset_params *p,
+		      int ntxq, struct net_device *dev,
+		      struct netdev_queue *netdevq)
+{
+	int i, avail, ret = -ENOMEM;
+	struct sge_qset *q = &adapter->sge.qs[id];
+
+	init_qset_cntxt(q, id);
+	setup_timer(&q->tx_reclaim_timer, sge_timer_tx, (unsigned long)q);
+	setup_timer(&q->rx_reclaim_timer, sge_timer_rx, (unsigned long)q);
+
+	q->fl[0].desc = alloc_ring(adapter->pdev, p->fl_size,
+				   sizeof(struct rx_desc),
+				   sizeof(struct rx_sw_desc),
+				   &q->fl[0].phys_addr, &q->fl[0].sdesc);
+	if (!q->fl[0].desc)
+		goto err;
+
+	q->fl[1].desc = alloc_ring(adapter->pdev, p->jumbo_size,
+				   sizeof(struct rx_desc),
+				   sizeof(struct rx_sw_desc),
+				   &q->fl[1].phys_addr, &q->fl[1].sdesc);
+	if (!q->fl[1].desc)
+		goto err;
+
+	q->rspq.desc = alloc_ring(adapter->pdev, p->rspq_size,
+				  sizeof(struct rsp_desc), 0,
+				  &q->rspq.phys_addr, NULL);
+	if (!q->rspq.desc)
+		goto err;
+
+	for (i = 0; i < ntxq; ++i) {
+		/*
+		 * The control queue always uses immediate data so does not
+		 * need to keep track of any sk_buffs.
+		 */
+		size_t sz = i == TXQ_CTRL ? 0 : sizeof(struct tx_sw_desc);
+
+		q->txq[i].desc = alloc_ring(adapter->pdev, p->txq_size[i],
+					    sizeof(struct tx_desc), sz,
+					    &q->txq[i].phys_addr,
+					    &q->txq[i].sdesc);
+		if (!q->txq[i].desc)
+			goto err;
+
+		q->txq[i].gen = 1;
+		q->txq[i].size = p->txq_size[i];
+		spin_lock_init(&q->txq[i].lock);
+		skb_queue_head_init(&q->txq[i].sendq);
+	}
+
+	tasklet_init(&q->txq[TXQ_OFLD].qresume_tsk, restart_offloadq,
+		     (unsigned long)q);
+	tasklet_init(&q->txq[TXQ_CTRL].qresume_tsk, restart_ctrlq,
+		     (unsigned long)q);
+
+	q->fl[0].gen = q->fl[1].gen = 1;
+	q->fl[0].size = p->fl_size;
+	q->fl[1].size = p->jumbo_size;
+
+	q->rspq.gen = 1;
+	q->rspq.size = p->rspq_size;
+	spin_lock_init(&q->rspq.lock);
+	skb_queue_head_init(&q->rspq.rx_queue);
+
+	q->txq[TXQ_ETH].stop_thres = nports *
+	    flits_to_desc(sgl_len(MAX_SKB_FRAGS + 1) + 3);
+
+#if FL0_PG_CHUNK_SIZE > 0
+	q->fl[0].buf_size = FL0_PG_CHUNK_SIZE;
+#else
+	q->fl[0].buf_size = SGE_RX_SM_BUF_SIZE + sizeof(struct cpl_rx_data);
+#endif
+#if FL1_PG_CHUNK_SIZE > 0
+	q->fl[1].buf_size = FL1_PG_CHUNK_SIZE;
+#else
+	q->fl[1].buf_size = is_offload(adapter) ?
+		(16 * 1024) - SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) :
+		MAX_FRAME_SIZE + 2 + sizeof(struct cpl_rx_pkt);
+#endif
+
+	q->fl[0].use_pages = FL0_PG_CHUNK_SIZE > 0;
+	q->fl[1].use_pages = FL1_PG_CHUNK_SIZE > 0;
+	q->fl[0].order = FL0_PG_ORDER;
+	q->fl[1].order = FL1_PG_ORDER;
+	q->fl[0].alloc_size = FL0_PG_ALLOC_SIZE;
+	q->fl[1].alloc_size = FL1_PG_ALLOC_SIZE;
+
+	spin_lock_irq(&adapter->sge.reg_lock);
+
+	/* FL threshold comparison uses < */
+	ret = t3_sge_init_rspcntxt(adapter, q->rspq.cntxt_id, irq_vec_idx,
+				   q->rspq.phys_addr, q->rspq.size,
+				   q->fl[0].buf_size - SGE_PG_RSVD, 1, 0);
+	if (ret)
+		goto err_unlock;
+
+	for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
+		ret = t3_sge_init_flcntxt(adapter, q->fl[i].cntxt_id, 0,
+					  q->fl[i].phys_addr, q->fl[i].size,
+					  q->fl[i].buf_size - SGE_PG_RSVD,
+					  p->cong_thres, 1, 0);
+		if (ret)
+			goto err_unlock;
+	}
+
+	ret = t3_sge_init_ecntxt(adapter, q->txq[TXQ_ETH].cntxt_id, USE_GTS,
+				 SGE_CNTXT_ETH, id, q->txq[TXQ_ETH].phys_addr,
+				 q->txq[TXQ_ETH].size, q->txq[TXQ_ETH].token,
+				 1, 0);
+	if (ret)
+		goto err_unlock;
+
+	if (ntxq > 1) {
+		ret = t3_sge_init_ecntxt(adapter, q->txq[TXQ_OFLD].cntxt_id,
+					 USE_GTS, SGE_CNTXT_OFLD, id,
+					 q->txq[TXQ_OFLD].phys_addr,
+					 q->txq[TXQ_OFLD].size, 0, 1, 0);
+		if (ret)
+			goto err_unlock;
+	}
+
+	if (ntxq > 2) {
+		ret = t3_sge_init_ecntxt(adapter, q->txq[TXQ_CTRL].cntxt_id, 0,
+					 SGE_CNTXT_CTRL, id,
+					 q->txq[TXQ_CTRL].phys_addr,
+					 q->txq[TXQ_CTRL].size,
+					 q->txq[TXQ_CTRL].token, 1, 0);
+		if (ret)
+			goto err_unlock;
+	}
+
+	spin_unlock_irq(&adapter->sge.reg_lock);
+
+	q->adap = adapter;
+	q->netdev = dev;
+	q->tx_q = netdevq;
+	t3_update_qset_coalesce(q, p);
+
+	avail = refill_fl(adapter, &q->fl[0], q->fl[0].size,
+			  GFP_KERNEL | __GFP_COMP);
+	if (!avail) {
+		CH_ALERT(adapter, "free list queue 0 initialization failed\n");
+		goto err;
+	}
+	if (avail < q->fl[0].size)
+		CH_WARN(adapter, "free list queue 0 enabled with %d credits\n",
+			avail);
+
+	avail = refill_fl(adapter, &q->fl[1], q->fl[1].size,
+			  GFP_KERNEL | __GFP_COMP);
+	if (avail < q->fl[1].size)
+		CH_WARN(adapter, "free list queue 1 enabled with %d credits\n",
+			avail);
+	refill_rspq(adapter, &q->rspq, q->rspq.size - 1);
+
+	t3_write_reg(adapter, A_SG_GTS, V_RSPQ(q->rspq.cntxt_id) |
+		     V_NEWTIMER(q->rspq.holdoff_tmr));
+
+	return 0;
+
+err_unlock:
+	spin_unlock_irq(&adapter->sge.reg_lock);
+err:
+	t3_free_qset(adapter, q);
+	return ret;
+}
+
+/**
+ *      t3_start_sge_timers - start SGE timer call backs
+ *      @adap: the adapter
+ *
+ *      Starts each SGE queue set's timer call back
+ */
+void t3_start_sge_timers(struct adapter *adap)
+{
+	int i;
+
+	for (i = 0; i < SGE_QSETS; ++i) {
+		struct sge_qset *q = &adap->sge.qs[i];
+
+	if (q->tx_reclaim_timer.function)
+		mod_timer(&q->tx_reclaim_timer, jiffies + TX_RECLAIM_PERIOD);
+
+	if (q->rx_reclaim_timer.function)
+		mod_timer(&q->rx_reclaim_timer, jiffies + RX_RECLAIM_PERIOD);
+	}
+}
+
+/**
+ *	t3_stop_sge_timers - stop SGE timer call backs
+ *	@adap: the adapter
+ *
+ *	Stops each SGE queue set's timer call back
+ */
+void t3_stop_sge_timers(struct adapter *adap)
+{
+	int i;
+
+	for (i = 0; i < SGE_QSETS; ++i) {
+		struct sge_qset *q = &adap->sge.qs[i];
+
+		if (q->tx_reclaim_timer.function)
+			del_timer_sync(&q->tx_reclaim_timer);
+		if (q->rx_reclaim_timer.function)
+			del_timer_sync(&q->rx_reclaim_timer);
+	}
+}
+
+/**
+ *	t3_free_sge_resources - free SGE resources
+ *	@adap: the adapter
+ *
+ *	Frees resources used by the SGE queue sets.
+ */
+void t3_free_sge_resources(struct adapter *adap)
+{
+	int i;
+
+	for (i = 0; i < SGE_QSETS; ++i)
+		t3_free_qset(adap, &adap->sge.qs[i]);
+}
+
+/**
+ *	t3_sge_start - enable SGE
+ *	@adap: the adapter
+ *
+ *	Enables the SGE for DMAs.  This is the last step in starting packet
+ *	transfers.
+ */
+void t3_sge_start(struct adapter *adap)
+{
+	t3_set_reg_field(adap, A_SG_CONTROL, F_GLOBALENABLE, F_GLOBALENABLE);
+}
+
+/**
+ *	t3_sge_stop - disable SGE operation
+ *	@adap: the adapter
+ *
+ *	Disables the DMA engine.  This can be called in emeregencies (e.g.,
+ *	from error interrupts) or from normal process context.  In the latter
+ *	case it also disables any pending queue restart tasklets.  Note that
+ *	if it is called in interrupt context it cannot disable the restart
+ *	tasklets as it cannot wait, however the tasklets will have no effect
+ *	since the doorbells are disabled and the driver will call this again
+ *	later from process context, at which time the tasklets will be stopped
+ *	if they are still running.
+ */
+void t3_sge_stop(struct adapter *adap)
+{
+	t3_set_reg_field(adap, A_SG_CONTROL, F_GLOBALENABLE, 0);
+	if (!in_interrupt()) {
+		int i;
+
+		for (i = 0; i < SGE_QSETS; ++i) {
+			struct sge_qset *qs = &adap->sge.qs[i];
+
+			tasklet_kill(&qs->txq[TXQ_OFLD].qresume_tsk);
+			tasklet_kill(&qs->txq[TXQ_CTRL].qresume_tsk);
+		}
+	}
+}
+
+/**
+ *	t3_sge_init - initialize SGE
+ *	@adap: the adapter
+ *	@p: the SGE parameters
+ *
+ *	Performs SGE initialization needed every time after a chip reset.
+ *	We do not initialize any of the queue sets here, instead the driver
+ *	top-level must request those individually.  We also do not enable DMA
+ *	here, that should be done after the queues have been set up.
+ */
+void t3_sge_init(struct adapter *adap, struct sge_params *p)
+{
+	unsigned int ctrl, ups = ffs(pci_resource_len(adap->pdev, 2) >> 12);
+
+	ctrl = F_DROPPKT | V_PKTSHIFT(2) | F_FLMODE | F_AVOIDCQOVFL |
+	    F_CQCRDTCTRL | F_CONGMODE | F_TNLFLMODE | F_FATLPERREN |
+	    V_HOSTPAGESIZE(PAGE_SHIFT - 11) | F_BIGENDIANINGRESS |
+	    V_USERSPACESIZE(ups ? ups - 1 : 0) | F_ISCSICOALESCING;
+#if SGE_NUM_GENBITS == 1
+	ctrl |= F_EGRGENCTRL;
+#endif
+	if (adap->params.rev > 0) {
+		if (!(adap->flags & (USING_MSIX | USING_MSI)))
+			ctrl |= F_ONEINTMULTQ | F_OPTONEINTMULTQ;
+	}
+	t3_write_reg(adap, A_SG_CONTROL, ctrl);
+	t3_write_reg(adap, A_SG_EGR_RCQ_DRB_THRSH, V_HIRCQDRBTHRSH(512) |
+		     V_LORCQDRBTHRSH(512));
+	t3_write_reg(adap, A_SG_TIMER_TICK, core_ticks_per_usec(adap) / 10);
+	t3_write_reg(adap, A_SG_CMDQ_CREDIT_TH, V_THRESHOLD(32) |
+		     V_TIMEOUT(200 * core_ticks_per_usec(adap)));
+	t3_write_reg(adap, A_SG_HI_DRB_HI_THRSH,
+		     adap->params.rev < T3_REV_C ? 1000 : 500);
+	t3_write_reg(adap, A_SG_HI_DRB_LO_THRSH, 256);
+	t3_write_reg(adap, A_SG_LO_DRB_HI_THRSH, 1000);
+	t3_write_reg(adap, A_SG_LO_DRB_LO_THRSH, 256);
+	t3_write_reg(adap, A_SG_OCO_BASE, V_BASE1(0xfff));
+	t3_write_reg(adap, A_SG_DRB_PRI_THRESH, 63 * 1024);
+}
+
+/**
+ *	t3_sge_prep - one-time SGE initialization
+ *	@adap: the associated adapter
+ *	@p: SGE parameters
+ *
+ *	Performs one-time initialization of SGE SW state.  Includes determining
+ *	defaults for the assorted SGE parameters, which admins can change until
+ *	they are used to initialize the SGE.
+ */
+void t3_sge_prep(struct adapter *adap, struct sge_params *p)
+{
+	int i;
+
+	p->max_pkt_size = (16 * 1024) - sizeof(struct cpl_rx_data) -
+	    SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+
+	for (i = 0; i < SGE_QSETS; ++i) {
+		struct qset_params *q = p->qset + i;
+
+		q->polling = adap->params.rev > 0;
+		q->coalesce_usecs = 5;
+		q->rspq_size = 1024;
+		q->fl_size = 1024;
+ 		q->jumbo_size = 512;
+		q->txq_size[TXQ_ETH] = 1024;
+		q->txq_size[TXQ_OFLD] = 1024;
+		q->txq_size[TXQ_CTRL] = 256;
+		q->cong_thres = 0;
+	}
+
+	spin_lock_init(&adap->sge.reg_lock);
+}
diff --git a/drivers/net/ethernet/chelsio/cxgb3/sge_defs.h b/drivers/net/ethernet/chelsio/cxgb3/sge_defs.h
new file mode 100644
index 0000000..29b6c80
--- /dev/null
+++ b/drivers/net/ethernet/chelsio/cxgb3/sge_defs.h
@@ -0,0 +1,255 @@
+/*
+ * This file is automatically generated --- any changes will be lost.
+ */
+
+#ifndef _SGE_DEFS_H
+#define _SGE_DEFS_H
+
+#define S_EC_CREDITS    0
+#define M_EC_CREDITS    0x7FFF
+#define V_EC_CREDITS(x) ((x) << S_EC_CREDITS)
+#define G_EC_CREDITS(x) (((x) >> S_EC_CREDITS) & M_EC_CREDITS)
+
+#define S_EC_GTS    15
+#define V_EC_GTS(x) ((x) << S_EC_GTS)
+#define F_EC_GTS    V_EC_GTS(1U)
+
+#define S_EC_INDEX    16
+#define M_EC_INDEX    0xFFFF
+#define V_EC_INDEX(x) ((x) << S_EC_INDEX)
+#define G_EC_INDEX(x) (((x) >> S_EC_INDEX) & M_EC_INDEX)
+
+#define S_EC_SIZE    0
+#define M_EC_SIZE    0xFFFF
+#define V_EC_SIZE(x) ((x) << S_EC_SIZE)
+#define G_EC_SIZE(x) (((x) >> S_EC_SIZE) & M_EC_SIZE)
+
+#define S_EC_BASE_LO    16
+#define M_EC_BASE_LO    0xFFFF
+#define V_EC_BASE_LO(x) ((x) << S_EC_BASE_LO)
+#define G_EC_BASE_LO(x) (((x) >> S_EC_BASE_LO) & M_EC_BASE_LO)
+
+#define S_EC_BASE_HI    0
+#define M_EC_BASE_HI    0xF
+#define V_EC_BASE_HI(x) ((x) << S_EC_BASE_HI)
+#define G_EC_BASE_HI(x) (((x) >> S_EC_BASE_HI) & M_EC_BASE_HI)
+
+#define S_EC_RESPQ    4
+#define M_EC_RESPQ    0x7
+#define V_EC_RESPQ(x) ((x) << S_EC_RESPQ)
+#define G_EC_RESPQ(x) (((x) >> S_EC_RESPQ) & M_EC_RESPQ)
+
+#define S_EC_TYPE    7
+#define M_EC_TYPE    0x7
+#define V_EC_TYPE(x) ((x) << S_EC_TYPE)
+#define G_EC_TYPE(x) (((x) >> S_EC_TYPE) & M_EC_TYPE)
+
+#define S_EC_GEN    10
+#define V_EC_GEN(x) ((x) << S_EC_GEN)
+#define F_EC_GEN    V_EC_GEN(1U)
+
+#define S_EC_UP_TOKEN    11
+#define M_EC_UP_TOKEN    0xFFFFF
+#define V_EC_UP_TOKEN(x) ((x) << S_EC_UP_TOKEN)
+#define G_EC_UP_TOKEN(x) (((x) >> S_EC_UP_TOKEN) & M_EC_UP_TOKEN)
+
+#define S_EC_VALID    31
+#define V_EC_VALID(x) ((x) << S_EC_VALID)
+#define F_EC_VALID    V_EC_VALID(1U)
+
+#define S_RQ_MSI_VEC    20
+#define M_RQ_MSI_VEC    0x3F
+#define V_RQ_MSI_VEC(x) ((x) << S_RQ_MSI_VEC)
+#define G_RQ_MSI_VEC(x) (((x) >> S_RQ_MSI_VEC) & M_RQ_MSI_VEC)
+
+#define S_RQ_INTR_EN    26
+#define V_RQ_INTR_EN(x) ((x) << S_RQ_INTR_EN)
+#define F_RQ_INTR_EN    V_RQ_INTR_EN(1U)
+
+#define S_RQ_GEN    28
+#define V_RQ_GEN(x) ((x) << S_RQ_GEN)
+#define F_RQ_GEN    V_RQ_GEN(1U)
+
+#define S_CQ_INDEX    0
+#define M_CQ_INDEX    0xFFFF
+#define V_CQ_INDEX(x) ((x) << S_CQ_INDEX)
+#define G_CQ_INDEX(x) (((x) >> S_CQ_INDEX) & M_CQ_INDEX)
+
+#define S_CQ_SIZE    16
+#define M_CQ_SIZE    0xFFFF
+#define V_CQ_SIZE(x) ((x) << S_CQ_SIZE)
+#define G_CQ_SIZE(x) (((x) >> S_CQ_SIZE) & M_CQ_SIZE)
+
+#define S_CQ_BASE_HI    0
+#define M_CQ_BASE_HI    0xFFFFF
+#define V_CQ_BASE_HI(x) ((x) << S_CQ_BASE_HI)
+#define G_CQ_BASE_HI(x) (((x) >> S_CQ_BASE_HI) & M_CQ_BASE_HI)
+
+#define S_CQ_RSPQ    20
+#define M_CQ_RSPQ    0x3F
+#define V_CQ_RSPQ(x) ((x) << S_CQ_RSPQ)
+#define G_CQ_RSPQ(x) (((x) >> S_CQ_RSPQ) & M_CQ_RSPQ)
+
+#define S_CQ_ASYNC_NOTIF    26
+#define V_CQ_ASYNC_NOTIF(x) ((x) << S_CQ_ASYNC_NOTIF)
+#define F_CQ_ASYNC_NOTIF    V_CQ_ASYNC_NOTIF(1U)
+
+#define S_CQ_ARMED    27
+#define V_CQ_ARMED(x) ((x) << S_CQ_ARMED)
+#define F_CQ_ARMED    V_CQ_ARMED(1U)
+
+#define S_CQ_ASYNC_NOTIF_SOL    28
+#define V_CQ_ASYNC_NOTIF_SOL(x) ((x) << S_CQ_ASYNC_NOTIF_SOL)
+#define F_CQ_ASYNC_NOTIF_SOL    V_CQ_ASYNC_NOTIF_SOL(1U)
+
+#define S_CQ_GEN    29
+#define V_CQ_GEN(x) ((x) << S_CQ_GEN)
+#define F_CQ_GEN    V_CQ_GEN(1U)
+
+#define S_CQ_ERR    30
+#define V_CQ_ERR(x) ((x) << S_CQ_ERR)
+#define F_CQ_ERR    V_CQ_ERR(1U)
+
+#define S_CQ_OVERFLOW_MODE    31
+#define V_CQ_OVERFLOW_MODE(x) ((x) << S_CQ_OVERFLOW_MODE)
+#define F_CQ_OVERFLOW_MODE    V_CQ_OVERFLOW_MODE(1U)
+
+#define S_CQ_CREDITS    0
+#define M_CQ_CREDITS    0xFFFF
+#define V_CQ_CREDITS(x) ((x) << S_CQ_CREDITS)
+#define G_CQ_CREDITS(x) (((x) >> S_CQ_CREDITS) & M_CQ_CREDITS)
+
+#define S_CQ_CREDIT_THRES    16
+#define M_CQ_CREDIT_THRES    0x1FFF
+#define V_CQ_CREDIT_THRES(x) ((x) << S_CQ_CREDIT_THRES)
+#define G_CQ_CREDIT_THRES(x) (((x) >> S_CQ_CREDIT_THRES) & M_CQ_CREDIT_THRES)
+
+#define S_FL_BASE_HI    0
+#define M_FL_BASE_HI    0xFFFFF
+#define V_FL_BASE_HI(x) ((x) << S_FL_BASE_HI)
+#define G_FL_BASE_HI(x) (((x) >> S_FL_BASE_HI) & M_FL_BASE_HI)
+
+#define S_FL_INDEX_LO    20
+#define M_FL_INDEX_LO    0xFFF
+#define V_FL_INDEX_LO(x) ((x) << S_FL_INDEX_LO)
+#define G_FL_INDEX_LO(x) (((x) >> S_FL_INDEX_LO) & M_FL_INDEX_LO)
+
+#define S_FL_INDEX_HI    0
+#define M_FL_INDEX_HI    0xF
+#define V_FL_INDEX_HI(x) ((x) << S_FL_INDEX_HI)
+#define G_FL_INDEX_HI(x) (((x) >> S_FL_INDEX_HI) & M_FL_INDEX_HI)
+
+#define S_FL_SIZE    4
+#define M_FL_SIZE    0xFFFF
+#define V_FL_SIZE(x) ((x) << S_FL_SIZE)
+#define G_FL_SIZE(x) (((x) >> S_FL_SIZE) & M_FL_SIZE)
+
+#define S_FL_GEN    20
+#define V_FL_GEN(x) ((x) << S_FL_GEN)
+#define F_FL_GEN    V_FL_GEN(1U)
+
+#define S_FL_ENTRY_SIZE_LO    21
+#define M_FL_ENTRY_SIZE_LO    0x7FF
+#define V_FL_ENTRY_SIZE_LO(x) ((x) << S_FL_ENTRY_SIZE_LO)
+#define G_FL_ENTRY_SIZE_LO(x) (((x) >> S_FL_ENTRY_SIZE_LO) & M_FL_ENTRY_SIZE_LO)
+
+#define S_FL_ENTRY_SIZE_HI    0
+#define M_FL_ENTRY_SIZE_HI    0x1FFFFF
+#define V_FL_ENTRY_SIZE_HI(x) ((x) << S_FL_ENTRY_SIZE_HI)
+#define G_FL_ENTRY_SIZE_HI(x) (((x) >> S_FL_ENTRY_SIZE_HI) & M_FL_ENTRY_SIZE_HI)
+
+#define S_FL_CONG_THRES    21
+#define M_FL_CONG_THRES    0x3FF
+#define V_FL_CONG_THRES(x) ((x) << S_FL_CONG_THRES)
+#define G_FL_CONG_THRES(x) (((x) >> S_FL_CONG_THRES) & M_FL_CONG_THRES)
+
+#define S_FL_GTS    31
+#define V_FL_GTS(x) ((x) << S_FL_GTS)
+#define F_FL_GTS    V_FL_GTS(1U)
+
+#define S_FLD_GEN1    31
+#define V_FLD_GEN1(x) ((x) << S_FLD_GEN1)
+#define F_FLD_GEN1    V_FLD_GEN1(1U)
+
+#define S_FLD_GEN2    0
+#define V_FLD_GEN2(x) ((x) << S_FLD_GEN2)
+#define F_FLD_GEN2    V_FLD_GEN2(1U)
+
+#define S_RSPD_TXQ1_CR    0
+#define M_RSPD_TXQ1_CR    0x7F
+#define V_RSPD_TXQ1_CR(x) ((x) << S_RSPD_TXQ1_CR)
+#define G_RSPD_TXQ1_CR(x) (((x) >> S_RSPD_TXQ1_CR) & M_RSPD_TXQ1_CR)
+
+#define S_RSPD_TXQ1_GTS    7
+#define V_RSPD_TXQ1_GTS(x) ((x) << S_RSPD_TXQ1_GTS)
+#define F_RSPD_TXQ1_GTS    V_RSPD_TXQ1_GTS(1U)
+
+#define S_RSPD_TXQ2_CR    8
+#define M_RSPD_TXQ2_CR    0x7F
+#define V_RSPD_TXQ2_CR(x) ((x) << S_RSPD_TXQ2_CR)
+#define G_RSPD_TXQ2_CR(x) (((x) >> S_RSPD_TXQ2_CR) & M_RSPD_TXQ2_CR)
+
+#define S_RSPD_TXQ2_GTS    15
+#define V_RSPD_TXQ2_GTS(x) ((x) << S_RSPD_TXQ2_GTS)
+#define F_RSPD_TXQ2_GTS    V_RSPD_TXQ2_GTS(1U)
+
+#define S_RSPD_TXQ0_CR    16
+#define M_RSPD_TXQ0_CR    0x7F
+#define V_RSPD_TXQ0_CR(x) ((x) << S_RSPD_TXQ0_CR)
+#define G_RSPD_TXQ0_CR(x) (((x) >> S_RSPD_TXQ0_CR) & M_RSPD_TXQ0_CR)
+
+#define S_RSPD_TXQ0_GTS    23
+#define V_RSPD_TXQ0_GTS(x) ((x) << S_RSPD_TXQ0_GTS)
+#define F_RSPD_TXQ0_GTS    V_RSPD_TXQ0_GTS(1U)
+
+#define S_RSPD_EOP    24
+#define V_RSPD_EOP(x) ((x) << S_RSPD_EOP)
+#define F_RSPD_EOP    V_RSPD_EOP(1U)
+
+#define S_RSPD_SOP    25
+#define V_RSPD_SOP(x) ((x) << S_RSPD_SOP)
+#define F_RSPD_SOP    V_RSPD_SOP(1U)
+
+#define S_RSPD_ASYNC_NOTIF    26
+#define V_RSPD_ASYNC_NOTIF(x) ((x) << S_RSPD_ASYNC_NOTIF)
+#define F_RSPD_ASYNC_NOTIF    V_RSPD_ASYNC_NOTIF(1U)
+
+#define S_RSPD_FL0_GTS    27
+#define V_RSPD_FL0_GTS(x) ((x) << S_RSPD_FL0_GTS)
+#define F_RSPD_FL0_GTS    V_RSPD_FL0_GTS(1U)
+
+#define S_RSPD_FL1_GTS    28
+#define V_RSPD_FL1_GTS(x) ((x) << S_RSPD_FL1_GTS)
+#define F_RSPD_FL1_GTS    V_RSPD_FL1_GTS(1U)
+
+#define S_RSPD_IMM_DATA_VALID    29
+#define V_RSPD_IMM_DATA_VALID(x) ((x) << S_RSPD_IMM_DATA_VALID)
+#define F_RSPD_IMM_DATA_VALID    V_RSPD_IMM_DATA_VALID(1U)
+
+#define S_RSPD_OFFLOAD    30
+#define V_RSPD_OFFLOAD(x) ((x) << S_RSPD_OFFLOAD)
+#define F_RSPD_OFFLOAD    V_RSPD_OFFLOAD(1U)
+
+#define S_RSPD_GEN1    31
+#define V_RSPD_GEN1(x) ((x) << S_RSPD_GEN1)
+#define F_RSPD_GEN1    V_RSPD_GEN1(1U)
+
+#define S_RSPD_LEN    0
+#define M_RSPD_LEN    0x7FFFFFFF
+#define V_RSPD_LEN(x) ((x) << S_RSPD_LEN)
+#define G_RSPD_LEN(x) (((x) >> S_RSPD_LEN) & M_RSPD_LEN)
+
+#define S_RSPD_FLQ    31
+#define V_RSPD_FLQ(x) ((x) << S_RSPD_FLQ)
+#define F_RSPD_FLQ    V_RSPD_FLQ(1U)
+
+#define S_RSPD_GEN2    0
+#define V_RSPD_GEN2(x) ((x) << S_RSPD_GEN2)
+#define F_RSPD_GEN2    V_RSPD_GEN2(1U)
+
+#define S_RSPD_INR_VEC    1
+#define M_RSPD_INR_VEC    0x7F
+#define V_RSPD_INR_VEC(x) ((x) << S_RSPD_INR_VEC)
+#define G_RSPD_INR_VEC(x) (((x) >> S_RSPD_INR_VEC) & M_RSPD_INR_VEC)
+
+#endif				/* _SGE_DEFS_H */
diff --git a/drivers/net/ethernet/chelsio/cxgb3/t3_cpl.h b/drivers/net/ethernet/chelsio/cxgb3/t3_cpl.h
new file mode 100644
index 0000000..852c399
--- /dev/null
+++ b/drivers/net/ethernet/chelsio/cxgb3/t3_cpl.h
@@ -0,0 +1,1495 @@
+/*
+ * Copyright (c) 2004-2008 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef T3_CPL_H
+#define T3_CPL_H
+
+#if !defined(__LITTLE_ENDIAN_BITFIELD) && !defined(__BIG_ENDIAN_BITFIELD)
+# include <asm/byteorder.h>
+#endif
+
+enum CPL_opcode {
+	CPL_PASS_OPEN_REQ = 0x1,
+	CPL_PASS_ACCEPT_RPL = 0x2,
+	CPL_ACT_OPEN_REQ = 0x3,
+	CPL_SET_TCB = 0x4,
+	CPL_SET_TCB_FIELD = 0x5,
+	CPL_GET_TCB = 0x6,
+	CPL_PCMD = 0x7,
+	CPL_CLOSE_CON_REQ = 0x8,
+	CPL_CLOSE_LISTSRV_REQ = 0x9,
+	CPL_ABORT_REQ = 0xA,
+	CPL_ABORT_RPL = 0xB,
+	CPL_TX_DATA = 0xC,
+	CPL_RX_DATA_ACK = 0xD,
+	CPL_TX_PKT = 0xE,
+	CPL_RTE_DELETE_REQ = 0xF,
+	CPL_RTE_WRITE_REQ = 0x10,
+	CPL_RTE_READ_REQ = 0x11,
+	CPL_L2T_WRITE_REQ = 0x12,
+	CPL_L2T_READ_REQ = 0x13,
+	CPL_SMT_WRITE_REQ = 0x14,
+	CPL_SMT_READ_REQ = 0x15,
+	CPL_TX_PKT_LSO = 0x16,
+	CPL_PCMD_READ = 0x17,
+	CPL_BARRIER = 0x18,
+	CPL_TID_RELEASE = 0x1A,
+
+	CPL_CLOSE_LISTSRV_RPL = 0x20,
+	CPL_ERROR = 0x21,
+	CPL_GET_TCB_RPL = 0x22,
+	CPL_L2T_WRITE_RPL = 0x23,
+	CPL_PCMD_READ_RPL = 0x24,
+	CPL_PCMD_RPL = 0x25,
+	CPL_PEER_CLOSE = 0x26,
+	CPL_RTE_DELETE_RPL = 0x27,
+	CPL_RTE_WRITE_RPL = 0x28,
+	CPL_RX_DDP_COMPLETE = 0x29,
+	CPL_RX_PHYS_ADDR = 0x2A,
+	CPL_RX_PKT = 0x2B,
+	CPL_RX_URG_NOTIFY = 0x2C,
+	CPL_SET_TCB_RPL = 0x2D,
+	CPL_SMT_WRITE_RPL = 0x2E,
+	CPL_TX_DATA_ACK = 0x2F,
+
+	CPL_ABORT_REQ_RSS = 0x30,
+	CPL_ABORT_RPL_RSS = 0x31,
+	CPL_CLOSE_CON_RPL = 0x32,
+	CPL_ISCSI_HDR = 0x33,
+	CPL_L2T_READ_RPL = 0x34,
+	CPL_RDMA_CQE = 0x35,
+	CPL_RDMA_CQE_READ_RSP = 0x36,
+	CPL_RDMA_CQE_ERR = 0x37,
+	CPL_RTE_READ_RPL = 0x38,
+	CPL_RX_DATA = 0x39,
+
+	CPL_ACT_OPEN_RPL = 0x40,
+	CPL_PASS_OPEN_RPL = 0x41,
+	CPL_RX_DATA_DDP = 0x42,
+	CPL_SMT_READ_RPL = 0x43,
+
+	CPL_ACT_ESTABLISH = 0x50,
+	CPL_PASS_ESTABLISH = 0x51,
+
+	CPL_PASS_ACCEPT_REQ = 0x70,
+
+	CPL_ASYNC_NOTIF = 0x80,	/* fake opcode for async notifications */
+
+	CPL_TX_DMA_ACK = 0xA0,
+	CPL_RDMA_READ_REQ = 0xA1,
+	CPL_RDMA_TERMINATE = 0xA2,
+	CPL_TRACE_PKT = 0xA3,
+	CPL_RDMA_EC_STATUS = 0xA5,
+
+	NUM_CPL_CMDS		/* must be last and previous entries must be sorted */
+};
+
+enum CPL_error {
+	CPL_ERR_NONE = 0,
+	CPL_ERR_TCAM_PARITY = 1,
+	CPL_ERR_TCAM_FULL = 3,
+	CPL_ERR_CONN_RESET = 20,
+	CPL_ERR_CONN_EXIST = 22,
+	CPL_ERR_ARP_MISS = 23,
+	CPL_ERR_BAD_SYN = 24,
+	CPL_ERR_CONN_TIMEDOUT = 30,
+	CPL_ERR_XMIT_TIMEDOUT = 31,
+	CPL_ERR_PERSIST_TIMEDOUT = 32,
+	CPL_ERR_FINWAIT2_TIMEDOUT = 33,
+	CPL_ERR_KEEPALIVE_TIMEDOUT = 34,
+	CPL_ERR_RTX_NEG_ADVICE = 35,
+	CPL_ERR_PERSIST_NEG_ADVICE = 36,
+	CPL_ERR_ABORT_FAILED = 42,
+	CPL_ERR_GENERAL = 99
+};
+
+enum {
+	CPL_CONN_POLICY_AUTO = 0,
+	CPL_CONN_POLICY_ASK = 1,
+	CPL_CONN_POLICY_DENY = 3
+};
+
+enum {
+	ULP_MODE_NONE = 0,
+	ULP_MODE_ISCSI = 2,
+	ULP_MODE_RDMA = 4,
+	ULP_MODE_TCPDDP = 5
+};
+
+enum {
+	ULP_CRC_HEADER = 1 << 0,
+	ULP_CRC_DATA = 1 << 1
+};
+
+enum {
+	CPL_PASS_OPEN_ACCEPT,
+	CPL_PASS_OPEN_REJECT
+};
+
+enum {
+	CPL_ABORT_SEND_RST = 0,
+	CPL_ABORT_NO_RST,
+	CPL_ABORT_POST_CLOSE_REQ = 2
+};
+
+enum {				/* TX_PKT_LSO ethernet types */
+	CPL_ETH_II,
+	CPL_ETH_II_VLAN,
+	CPL_ETH_802_3,
+	CPL_ETH_802_3_VLAN
+};
+
+enum {				/* TCP congestion control algorithms */
+	CONG_ALG_RENO,
+	CONG_ALG_TAHOE,
+	CONG_ALG_NEWRENO,
+	CONG_ALG_HIGHSPEED
+};
+
+enum {			/* RSS hash type */
+	RSS_HASH_NONE = 0,
+	RSS_HASH_2_TUPLE = 1,
+	RSS_HASH_4_TUPLE = 2,
+	RSS_HASH_TCPV6 = 3
+};
+
+union opcode_tid {
+	__be32 opcode_tid;
+	__u8 opcode;
+};
+
+#define S_OPCODE 24
+#define V_OPCODE(x) ((x) << S_OPCODE)
+#define G_OPCODE(x) (((x) >> S_OPCODE) & 0xFF)
+#define G_TID(x)    ((x) & 0xFFFFFF)
+
+#define S_QNUM 0
+#define G_QNUM(x) (((x) >> S_QNUM) & 0xFFFF)
+
+#define S_HASHTYPE 22
+#define M_HASHTYPE 0x3
+#define G_HASHTYPE(x) (((x) >> S_HASHTYPE) & M_HASHTYPE)
+
+/* tid is assumed to be 24-bits */
+#define MK_OPCODE_TID(opcode, tid) (V_OPCODE(opcode) | (tid))
+
+#define OPCODE_TID(cmd) ((cmd)->ot.opcode_tid)
+
+/* extract the TID from a CPL command */
+#define GET_TID(cmd) (G_TID(ntohl(OPCODE_TID(cmd))))
+
+struct tcp_options {
+	__be16 mss;
+	__u8 wsf;
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	 __u8:5;
+	__u8 ecn:1;
+	__u8 sack:1;
+	__u8 tstamp:1;
+#else
+	__u8 tstamp:1;
+	__u8 sack:1;
+	__u8 ecn:1;
+	 __u8:5;
+#endif
+};
+
+struct rss_header {
+	__u8 opcode;
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	__u8 cpu_idx:6;
+	__u8 hash_type:2;
+#else
+	__u8 hash_type:2;
+	__u8 cpu_idx:6;
+#endif
+	__be16 cq_idx;
+	__be32 rss_hash_val;
+};
+
+#ifndef CHELSIO_FW
+struct work_request_hdr {
+	__be32 wr_hi;
+	__be32 wr_lo;
+};
+
+/* wr_hi fields */
+#define S_WR_SGE_CREDITS    0
+#define M_WR_SGE_CREDITS    0xFF
+#define V_WR_SGE_CREDITS(x) ((x) << S_WR_SGE_CREDITS)
+#define G_WR_SGE_CREDITS(x) (((x) >> S_WR_SGE_CREDITS) & M_WR_SGE_CREDITS)
+
+#define S_WR_SGLSFLT    8
+#define M_WR_SGLSFLT    0xFF
+#define V_WR_SGLSFLT(x) ((x) << S_WR_SGLSFLT)
+#define G_WR_SGLSFLT(x) (((x) >> S_WR_SGLSFLT) & M_WR_SGLSFLT)
+
+#define S_WR_BCNTLFLT    16
+#define M_WR_BCNTLFLT    0xF
+#define V_WR_BCNTLFLT(x) ((x) << S_WR_BCNTLFLT)
+#define G_WR_BCNTLFLT(x) (((x) >> S_WR_BCNTLFLT) & M_WR_BCNTLFLT)
+
+#define S_WR_DATATYPE    20
+#define V_WR_DATATYPE(x) ((x) << S_WR_DATATYPE)
+#define F_WR_DATATYPE    V_WR_DATATYPE(1U)
+
+#define S_WR_COMPL    21
+#define V_WR_COMPL(x) ((x) << S_WR_COMPL)
+#define F_WR_COMPL    V_WR_COMPL(1U)
+
+#define S_WR_EOP    22
+#define V_WR_EOP(x) ((x) << S_WR_EOP)
+#define F_WR_EOP    V_WR_EOP(1U)
+
+#define S_WR_SOP    23
+#define V_WR_SOP(x) ((x) << S_WR_SOP)
+#define F_WR_SOP    V_WR_SOP(1U)
+
+#define S_WR_OP    24
+#define M_WR_OP    0xFF
+#define V_WR_OP(x) ((x) << S_WR_OP)
+#define G_WR_OP(x) (((x) >> S_WR_OP) & M_WR_OP)
+
+/* wr_lo fields */
+#define S_WR_LEN    0
+#define M_WR_LEN    0xFF
+#define V_WR_LEN(x) ((x) << S_WR_LEN)
+#define G_WR_LEN(x) (((x) >> S_WR_LEN) & M_WR_LEN)
+
+#define S_WR_TID    8
+#define M_WR_TID    0xFFFFF
+#define V_WR_TID(x) ((x) << S_WR_TID)
+#define G_WR_TID(x) (((x) >> S_WR_TID) & M_WR_TID)
+
+#define S_WR_CR_FLUSH    30
+#define V_WR_CR_FLUSH(x) ((x) << S_WR_CR_FLUSH)
+#define F_WR_CR_FLUSH    V_WR_CR_FLUSH(1U)
+
+#define S_WR_GEN    31
+#define V_WR_GEN(x) ((x) << S_WR_GEN)
+#define F_WR_GEN    V_WR_GEN(1U)
+
+# define WR_HDR struct work_request_hdr wr
+# define RSS_HDR
+#else
+# define WR_HDR
+# define RSS_HDR struct rss_header rss_hdr;
+#endif
+
+/* option 0 lower-half fields */
+#define S_CPL_STATUS    0
+#define M_CPL_STATUS    0xFF
+#define V_CPL_STATUS(x) ((x) << S_CPL_STATUS)
+#define G_CPL_STATUS(x) (((x) >> S_CPL_STATUS) & M_CPL_STATUS)
+
+#define S_INJECT_TIMER    6
+#define V_INJECT_TIMER(x) ((x) << S_INJECT_TIMER)
+#define F_INJECT_TIMER    V_INJECT_TIMER(1U)
+
+#define S_NO_OFFLOAD    7
+#define V_NO_OFFLOAD(x) ((x) << S_NO_OFFLOAD)
+#define F_NO_OFFLOAD    V_NO_OFFLOAD(1U)
+
+#define S_ULP_MODE    8
+#define M_ULP_MODE    0xF
+#define V_ULP_MODE(x) ((x) << S_ULP_MODE)
+#define G_ULP_MODE(x) (((x) >> S_ULP_MODE) & M_ULP_MODE)
+
+#define S_RCV_BUFSIZ    12
+#define M_RCV_BUFSIZ    0x3FFF
+#define V_RCV_BUFSIZ(x) ((x) << S_RCV_BUFSIZ)
+#define G_RCV_BUFSIZ(x) (((x) >> S_RCV_BUFSIZ) & M_RCV_BUFSIZ)
+
+#define S_TOS    26
+#define M_TOS    0x3F
+#define V_TOS(x) ((x) << S_TOS)
+#define G_TOS(x) (((x) >> S_TOS) & M_TOS)
+
+/* option 0 upper-half fields */
+#define S_DELACK    0
+#define V_DELACK(x) ((x) << S_DELACK)
+#define F_DELACK    V_DELACK(1U)
+
+#define S_NO_CONG    1
+#define V_NO_CONG(x) ((x) << S_NO_CONG)
+#define F_NO_CONG    V_NO_CONG(1U)
+
+#define S_SRC_MAC_SEL    2
+#define M_SRC_MAC_SEL    0x3
+#define V_SRC_MAC_SEL(x) ((x) << S_SRC_MAC_SEL)
+#define G_SRC_MAC_SEL(x) (((x) >> S_SRC_MAC_SEL) & M_SRC_MAC_SEL)
+
+#define S_L2T_IDX    4
+#define M_L2T_IDX    0x7FF
+#define V_L2T_IDX(x) ((x) << S_L2T_IDX)
+#define G_L2T_IDX(x) (((x) >> S_L2T_IDX) & M_L2T_IDX)
+
+#define S_TX_CHANNEL    15
+#define V_TX_CHANNEL(x) ((x) << S_TX_CHANNEL)
+#define F_TX_CHANNEL    V_TX_CHANNEL(1U)
+
+#define S_TCAM_BYPASS    16
+#define V_TCAM_BYPASS(x) ((x) << S_TCAM_BYPASS)
+#define F_TCAM_BYPASS    V_TCAM_BYPASS(1U)
+
+#define S_NAGLE    17
+#define V_NAGLE(x) ((x) << S_NAGLE)
+#define F_NAGLE    V_NAGLE(1U)
+
+#define S_WND_SCALE    18
+#define M_WND_SCALE    0xF
+#define V_WND_SCALE(x) ((x) << S_WND_SCALE)
+#define G_WND_SCALE(x) (((x) >> S_WND_SCALE) & M_WND_SCALE)
+
+#define S_KEEP_ALIVE    22
+#define V_KEEP_ALIVE(x) ((x) << S_KEEP_ALIVE)
+#define F_KEEP_ALIVE    V_KEEP_ALIVE(1U)
+
+#define S_MAX_RETRANS    23
+#define M_MAX_RETRANS    0xF
+#define V_MAX_RETRANS(x) ((x) << S_MAX_RETRANS)
+#define G_MAX_RETRANS(x) (((x) >> S_MAX_RETRANS) & M_MAX_RETRANS)
+
+#define S_MAX_RETRANS_OVERRIDE    27
+#define V_MAX_RETRANS_OVERRIDE(x) ((x) << S_MAX_RETRANS_OVERRIDE)
+#define F_MAX_RETRANS_OVERRIDE    V_MAX_RETRANS_OVERRIDE(1U)
+
+#define S_MSS_IDX    28
+#define M_MSS_IDX    0xF
+#define V_MSS_IDX(x) ((x) << S_MSS_IDX)
+#define G_MSS_IDX(x) (((x) >> S_MSS_IDX) & M_MSS_IDX)
+
+/* option 1 fields */
+#define S_RSS_ENABLE    0
+#define V_RSS_ENABLE(x) ((x) << S_RSS_ENABLE)
+#define F_RSS_ENABLE    V_RSS_ENABLE(1U)
+
+#define S_RSS_MASK_LEN    1
+#define M_RSS_MASK_LEN    0x7
+#define V_RSS_MASK_LEN(x) ((x) << S_RSS_MASK_LEN)
+#define G_RSS_MASK_LEN(x) (((x) >> S_RSS_MASK_LEN) & M_RSS_MASK_LEN)
+
+#define S_CPU_IDX    4
+#define M_CPU_IDX    0x3F
+#define V_CPU_IDX(x) ((x) << S_CPU_IDX)
+#define G_CPU_IDX(x) (((x) >> S_CPU_IDX) & M_CPU_IDX)
+
+#define S_MAC_MATCH_VALID    18
+#define V_MAC_MATCH_VALID(x) ((x) << S_MAC_MATCH_VALID)
+#define F_MAC_MATCH_VALID    V_MAC_MATCH_VALID(1U)
+
+#define S_CONN_POLICY    19
+#define M_CONN_POLICY    0x3
+#define V_CONN_POLICY(x) ((x) << S_CONN_POLICY)
+#define G_CONN_POLICY(x) (((x) >> S_CONN_POLICY) & M_CONN_POLICY)
+
+#define S_SYN_DEFENSE    21
+#define V_SYN_DEFENSE(x) ((x) << S_SYN_DEFENSE)
+#define F_SYN_DEFENSE    V_SYN_DEFENSE(1U)
+
+#define S_VLAN_PRI    22
+#define M_VLAN_PRI    0x3
+#define V_VLAN_PRI(x) ((x) << S_VLAN_PRI)
+#define G_VLAN_PRI(x) (((x) >> S_VLAN_PRI) & M_VLAN_PRI)
+
+#define S_VLAN_PRI_VALID    24
+#define V_VLAN_PRI_VALID(x) ((x) << S_VLAN_PRI_VALID)
+#define F_VLAN_PRI_VALID    V_VLAN_PRI_VALID(1U)
+
+#define S_PKT_TYPE    25
+#define M_PKT_TYPE    0x3
+#define V_PKT_TYPE(x) ((x) << S_PKT_TYPE)
+#define G_PKT_TYPE(x) (((x) >> S_PKT_TYPE) & M_PKT_TYPE)
+
+#define S_MAC_MATCH    27
+#define M_MAC_MATCH    0x1F
+#define V_MAC_MATCH(x) ((x) << S_MAC_MATCH)
+#define G_MAC_MATCH(x) (((x) >> S_MAC_MATCH) & M_MAC_MATCH)
+
+/* option 2 fields */
+#define S_CPU_INDEX    0
+#define M_CPU_INDEX    0x7F
+#define V_CPU_INDEX(x) ((x) << S_CPU_INDEX)
+#define G_CPU_INDEX(x) (((x) >> S_CPU_INDEX) & M_CPU_INDEX)
+
+#define S_CPU_INDEX_VALID    7
+#define V_CPU_INDEX_VALID(x) ((x) << S_CPU_INDEX_VALID)
+#define F_CPU_INDEX_VALID    V_CPU_INDEX_VALID(1U)
+
+#define S_RX_COALESCE    8
+#define M_RX_COALESCE    0x3
+#define V_RX_COALESCE(x) ((x) << S_RX_COALESCE)
+#define G_RX_COALESCE(x) (((x) >> S_RX_COALESCE) & M_RX_COALESCE)
+
+#define S_RX_COALESCE_VALID    10
+#define V_RX_COALESCE_VALID(x) ((x) << S_RX_COALESCE_VALID)
+#define F_RX_COALESCE_VALID    V_RX_COALESCE_VALID(1U)
+
+#define S_CONG_CONTROL_FLAVOR    11
+#define M_CONG_CONTROL_FLAVOR    0x3
+#define V_CONG_CONTROL_FLAVOR(x) ((x) << S_CONG_CONTROL_FLAVOR)
+#define G_CONG_CONTROL_FLAVOR(x) (((x) >> S_CONG_CONTROL_FLAVOR) & M_CONG_CONTROL_FLAVOR)
+
+#define S_PACING_FLAVOR    13
+#define M_PACING_FLAVOR    0x3
+#define V_PACING_FLAVOR(x) ((x) << S_PACING_FLAVOR)
+#define G_PACING_FLAVOR(x) (((x) >> S_PACING_FLAVOR) & M_PACING_FLAVOR)
+
+#define S_FLAVORS_VALID    15
+#define V_FLAVORS_VALID(x) ((x) << S_FLAVORS_VALID)
+#define F_FLAVORS_VALID    V_FLAVORS_VALID(1U)
+
+#define S_RX_FC_DISABLE    16
+#define V_RX_FC_DISABLE(x) ((x) << S_RX_FC_DISABLE)
+#define F_RX_FC_DISABLE    V_RX_FC_DISABLE(1U)
+
+#define S_RX_FC_VALID    17
+#define V_RX_FC_VALID(x) ((x) << S_RX_FC_VALID)
+#define F_RX_FC_VALID    V_RX_FC_VALID(1U)
+
+struct cpl_pass_open_req {
+	WR_HDR;
+	union opcode_tid ot;
+	__be16 local_port;
+	__be16 peer_port;
+	__be32 local_ip;
+	__be32 peer_ip;
+	__be32 opt0h;
+	__be32 opt0l;
+	__be32 peer_netmask;
+	__be32 opt1;
+};
+
+struct cpl_pass_open_rpl {
+	RSS_HDR union opcode_tid ot;
+	__be16 local_port;
+	__be16 peer_port;
+	__be32 local_ip;
+	__be32 peer_ip;
+	__u8 resvd[7];
+	__u8 status;
+};
+
+struct cpl_pass_establish {
+	RSS_HDR union opcode_tid ot;
+	__be16 local_port;
+	__be16 peer_port;
+	__be32 local_ip;
+	__be32 peer_ip;
+	__be32 tos_tid;
+	__be16 l2t_idx;
+	__be16 tcp_opt;
+	__be32 snd_isn;
+	__be32 rcv_isn;
+};
+
+/* cpl_pass_establish.tos_tid fields */
+#define S_PASS_OPEN_TID    0
+#define M_PASS_OPEN_TID    0xFFFFFF
+#define V_PASS_OPEN_TID(x) ((x) << S_PASS_OPEN_TID)
+#define G_PASS_OPEN_TID(x) (((x) >> S_PASS_OPEN_TID) & M_PASS_OPEN_TID)
+
+#define S_PASS_OPEN_TOS    24
+#define M_PASS_OPEN_TOS    0xFF
+#define V_PASS_OPEN_TOS(x) ((x) << S_PASS_OPEN_TOS)
+#define G_PASS_OPEN_TOS(x) (((x) >> S_PASS_OPEN_TOS) & M_PASS_OPEN_TOS)
+
+/* cpl_pass_establish.l2t_idx fields */
+#define S_L2T_IDX16    5
+#define M_L2T_IDX16    0x7FF
+#define V_L2T_IDX16(x) ((x) << S_L2T_IDX16)
+#define G_L2T_IDX16(x) (((x) >> S_L2T_IDX16) & M_L2T_IDX16)
+
+/* cpl_pass_establish.tcp_opt fields (also applies act_open_establish) */
+#define G_TCPOPT_WSCALE_OK(x)  (((x) >> 5) & 1)
+#define G_TCPOPT_SACK(x)       (((x) >> 6) & 1)
+#define G_TCPOPT_TSTAMP(x)     (((x) >> 7) & 1)
+#define G_TCPOPT_SND_WSCALE(x) (((x) >> 8) & 0xf)
+#define G_TCPOPT_MSS(x)        (((x) >> 12) & 0xf)
+
+struct cpl_pass_accept_req {
+	RSS_HDR union opcode_tid ot;
+	__be16 local_port;
+	__be16 peer_port;
+	__be32 local_ip;
+	__be32 peer_ip;
+	__be32 tos_tid;
+	struct tcp_options tcp_options;
+	__u8 dst_mac[6];
+	__be16 vlan_tag;
+	__u8 src_mac[6];
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	 __u8:3;
+	__u8 addr_idx:3;
+	__u8 port_idx:1;
+	__u8 exact_match:1;
+#else
+	__u8 exact_match:1;
+	__u8 port_idx:1;
+	__u8 addr_idx:3;
+	 __u8:3;
+#endif
+	__u8 rsvd;
+	__be32 rcv_isn;
+	__be32 rsvd2;
+};
+
+struct cpl_pass_accept_rpl {
+	WR_HDR;
+	union opcode_tid ot;
+	__be32 opt2;
+	__be32 rsvd;
+	__be32 peer_ip;
+	__be32 opt0h;
+	__be32 opt0l_status;
+};
+
+struct cpl_act_open_req {
+	WR_HDR;
+	union opcode_tid ot;
+	__be16 local_port;
+	__be16 peer_port;
+	__be32 local_ip;
+	__be32 peer_ip;
+	__be32 opt0h;
+	__be32 opt0l;
+	__be32 params;
+	__be32 opt2;
+};
+
+/* cpl_act_open_req.params fields */
+#define S_AOPEN_VLAN_PRI    9
+#define M_AOPEN_VLAN_PRI    0x3
+#define V_AOPEN_VLAN_PRI(x) ((x) << S_AOPEN_VLAN_PRI)
+#define G_AOPEN_VLAN_PRI(x) (((x) >> S_AOPEN_VLAN_PRI) & M_AOPEN_VLAN_PRI)
+
+#define S_AOPEN_VLAN_PRI_VALID    11
+#define V_AOPEN_VLAN_PRI_VALID(x) ((x) << S_AOPEN_VLAN_PRI_VALID)
+#define F_AOPEN_VLAN_PRI_VALID    V_AOPEN_VLAN_PRI_VALID(1U)
+
+#define S_AOPEN_PKT_TYPE    12
+#define M_AOPEN_PKT_TYPE    0x3
+#define V_AOPEN_PKT_TYPE(x) ((x) << S_AOPEN_PKT_TYPE)
+#define G_AOPEN_PKT_TYPE(x) (((x) >> S_AOPEN_PKT_TYPE) & M_AOPEN_PKT_TYPE)
+
+#define S_AOPEN_MAC_MATCH    14
+#define M_AOPEN_MAC_MATCH    0x1F
+#define V_AOPEN_MAC_MATCH(x) ((x) << S_AOPEN_MAC_MATCH)
+#define G_AOPEN_MAC_MATCH(x) (((x) >> S_AOPEN_MAC_MATCH) & M_AOPEN_MAC_MATCH)
+
+#define S_AOPEN_MAC_MATCH_VALID    19
+#define V_AOPEN_MAC_MATCH_VALID(x) ((x) << S_AOPEN_MAC_MATCH_VALID)
+#define F_AOPEN_MAC_MATCH_VALID    V_AOPEN_MAC_MATCH_VALID(1U)
+
+#define S_AOPEN_IFF_VLAN    20
+#define M_AOPEN_IFF_VLAN    0xFFF
+#define V_AOPEN_IFF_VLAN(x) ((x) << S_AOPEN_IFF_VLAN)
+#define G_AOPEN_IFF_VLAN(x) (((x) >> S_AOPEN_IFF_VLAN) & M_AOPEN_IFF_VLAN)
+
+struct cpl_act_open_rpl {
+	RSS_HDR union opcode_tid ot;
+	__be16 local_port;
+	__be16 peer_port;
+	__be32 local_ip;
+	__be32 peer_ip;
+	__be32 atid;
+	__u8 rsvd[3];
+	__u8 status;
+};
+
+struct cpl_act_establish {
+	RSS_HDR union opcode_tid ot;
+	__be16 local_port;
+	__be16 peer_port;
+	__be32 local_ip;
+	__be32 peer_ip;
+	__be32 tos_tid;
+	__be16 l2t_idx;
+	__be16 tcp_opt;
+	__be32 snd_isn;
+	__be32 rcv_isn;
+};
+
+struct cpl_get_tcb {
+	WR_HDR;
+	union opcode_tid ot;
+	__be16 cpuno;
+	__be16 rsvd;
+};
+
+struct cpl_get_tcb_rpl {
+	RSS_HDR union opcode_tid ot;
+	__u8 rsvd;
+	__u8 status;
+	__be16 len;
+};
+
+struct cpl_set_tcb {
+	WR_HDR;
+	union opcode_tid ot;
+	__u8 reply;
+	__u8 cpu_idx;
+	__be16 len;
+};
+
+/* cpl_set_tcb.reply fields */
+#define S_NO_REPLY    7
+#define V_NO_REPLY(x) ((x) << S_NO_REPLY)
+#define F_NO_REPLY    V_NO_REPLY(1U)
+
+struct cpl_set_tcb_field {
+	WR_HDR;
+	union opcode_tid ot;
+	__u8 reply;
+	__u8 cpu_idx;
+	__be16 word;
+	__be64 mask;
+	__be64 val;
+};
+
+struct cpl_set_tcb_rpl {
+	RSS_HDR union opcode_tid ot;
+	__u8 rsvd[3];
+	__u8 status;
+};
+
+struct cpl_pcmd {
+	WR_HDR;
+	union opcode_tid ot;
+	__u8 rsvd[3];
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	__u8 src:1;
+	__u8 bundle:1;
+	__u8 channel:1;
+	 __u8:5;
+#else
+	 __u8:5;
+	__u8 channel:1;
+	__u8 bundle:1;
+	__u8 src:1;
+#endif
+	__be32 pcmd_parm[2];
+};
+
+struct cpl_pcmd_reply {
+	RSS_HDR union opcode_tid ot;
+	__u8 status;
+	__u8 rsvd;
+	__be16 len;
+};
+
+struct cpl_close_con_req {
+	WR_HDR;
+	union opcode_tid ot;
+	__be32 rsvd;
+};
+
+struct cpl_close_con_rpl {
+	RSS_HDR union opcode_tid ot;
+	__u8 rsvd[3];
+	__u8 status;
+	__be32 snd_nxt;
+	__be32 rcv_nxt;
+};
+
+struct cpl_close_listserv_req {
+	WR_HDR;
+	union opcode_tid ot;
+	__u8 rsvd0;
+	__u8 cpu_idx;
+	__be16 rsvd1;
+};
+
+struct cpl_close_listserv_rpl {
+	RSS_HDR union opcode_tid ot;
+	__u8 rsvd[3];
+	__u8 status;
+};
+
+struct cpl_abort_req_rss {
+	RSS_HDR union opcode_tid ot;
+	__be32 rsvd0;
+	__u8 rsvd1;
+	__u8 status;
+	__u8 rsvd2[6];
+};
+
+struct cpl_abort_req {
+	WR_HDR;
+	union opcode_tid ot;
+	__be32 rsvd0;
+	__u8 rsvd1;
+	__u8 cmd;
+	__u8 rsvd2[6];
+};
+
+struct cpl_abort_rpl_rss {
+	RSS_HDR union opcode_tid ot;
+	__be32 rsvd0;
+	__u8 rsvd1;
+	__u8 status;
+	__u8 rsvd2[6];
+};
+
+struct cpl_abort_rpl {
+	WR_HDR;
+	union opcode_tid ot;
+	__be32 rsvd0;
+	__u8 rsvd1;
+	__u8 cmd;
+	__u8 rsvd2[6];
+};
+
+struct cpl_peer_close {
+	RSS_HDR union opcode_tid ot;
+	__be32 rcv_nxt;
+};
+
+struct tx_data_wr {
+	__be32 wr_hi;
+	__be32 wr_lo;
+	__be32 len;
+	__be32 flags;
+	__be32 sndseq;
+	__be32 param;
+};
+
+/* tx_data_wr.flags fields */
+#define S_TX_ACK_PAGES	21
+#define M_TX_ACK_PAGES	0x7
+#define V_TX_ACK_PAGES(x) ((x) << S_TX_ACK_PAGES)
+#define G_TX_ACK_PAGES(x) (((x) >> S_TX_ACK_PAGES) & M_TX_ACK_PAGES)
+
+/* tx_data_wr.param fields */
+#define S_TX_PORT    0
+#define M_TX_PORT    0x7
+#define V_TX_PORT(x) ((x) << S_TX_PORT)
+#define G_TX_PORT(x) (((x) >> S_TX_PORT) & M_TX_PORT)
+
+#define S_TX_MSS    4
+#define M_TX_MSS    0xF
+#define V_TX_MSS(x) ((x) << S_TX_MSS)
+#define G_TX_MSS(x) (((x) >> S_TX_MSS) & M_TX_MSS)
+
+#define S_TX_QOS    8
+#define M_TX_QOS    0xFF
+#define V_TX_QOS(x) ((x) << S_TX_QOS)
+#define G_TX_QOS(x) (((x) >> S_TX_QOS) & M_TX_QOS)
+
+#define S_TX_SNDBUF 16
+#define M_TX_SNDBUF 0xFFFF
+#define V_TX_SNDBUF(x) ((x) << S_TX_SNDBUF)
+#define G_TX_SNDBUF(x) (((x) >> S_TX_SNDBUF) & M_TX_SNDBUF)
+
+struct cpl_tx_data {
+	union opcode_tid ot;
+	__be32 len;
+	__be32 rsvd;
+	__be16 urg;
+	__be16 flags;
+};
+
+/* cpl_tx_data.flags fields */
+#define S_TX_ULP_SUBMODE    6
+#define M_TX_ULP_SUBMODE    0xF
+#define V_TX_ULP_SUBMODE(x) ((x) << S_TX_ULP_SUBMODE)
+#define G_TX_ULP_SUBMODE(x) (((x) >> S_TX_ULP_SUBMODE) & M_TX_ULP_SUBMODE)
+
+#define S_TX_ULP_MODE    10
+#define M_TX_ULP_MODE    0xF
+#define V_TX_ULP_MODE(x) ((x) << S_TX_ULP_MODE)
+#define G_TX_ULP_MODE(x) (((x) >> S_TX_ULP_MODE) & M_TX_ULP_MODE)
+
+#define S_TX_SHOVE    14
+#define V_TX_SHOVE(x) ((x) << S_TX_SHOVE)
+#define F_TX_SHOVE    V_TX_SHOVE(1U)
+
+#define S_TX_MORE    15
+#define V_TX_MORE(x) ((x) << S_TX_MORE)
+#define F_TX_MORE    V_TX_MORE(1U)
+
+/* additional tx_data_wr.flags fields */
+#define S_TX_CPU_IDX    0
+#define M_TX_CPU_IDX    0x3F
+#define V_TX_CPU_IDX(x) ((x) << S_TX_CPU_IDX)
+#define G_TX_CPU_IDX(x) (((x) >> S_TX_CPU_IDX) & M_TX_CPU_IDX)
+
+#define S_TX_URG    16
+#define V_TX_URG(x) ((x) << S_TX_URG)
+#define F_TX_URG    V_TX_URG(1U)
+
+#define S_TX_CLOSE    17
+#define V_TX_CLOSE(x) ((x) << S_TX_CLOSE)
+#define F_TX_CLOSE    V_TX_CLOSE(1U)
+
+#define S_TX_INIT    18
+#define V_TX_INIT(x) ((x) << S_TX_INIT)
+#define F_TX_INIT    V_TX_INIT(1U)
+
+#define S_TX_IMM_ACK    19
+#define V_TX_IMM_ACK(x) ((x) << S_TX_IMM_ACK)
+#define F_TX_IMM_ACK    V_TX_IMM_ACK(1U)
+
+#define S_TX_IMM_DMA    20
+#define V_TX_IMM_DMA(x) ((x) << S_TX_IMM_DMA)
+#define F_TX_IMM_DMA    V_TX_IMM_DMA(1U)
+
+struct cpl_tx_data_ack {
+	RSS_HDR union opcode_tid ot;
+	__be32 ack_seq;
+};
+
+struct cpl_wr_ack {
+	RSS_HDR union opcode_tid ot;
+	__be16 credits;
+	__be16 rsvd;
+	__be32 snd_nxt;
+	__be32 snd_una;
+};
+
+struct cpl_rdma_ec_status {
+	RSS_HDR union opcode_tid ot;
+	__u8 rsvd[3];
+	__u8 status;
+};
+
+struct mngt_pktsched_wr {
+	__be32 wr_hi;
+	__be32 wr_lo;
+	__u8 mngt_opcode;
+	__u8 rsvd[7];
+	__u8 sched;
+	__u8 idx;
+	__u8 min;
+	__u8 max;
+	__u8 binding;
+	__u8 rsvd1[3];
+};
+
+struct cpl_iscsi_hdr {
+	RSS_HDR union opcode_tid ot;
+	__be16 pdu_len_ddp;
+	__be16 len;
+	__be32 seq;
+	__be16 urg;
+	__u8 rsvd;
+	__u8 status;
+};
+
+/* cpl_iscsi_hdr.pdu_len_ddp fields */
+#define S_ISCSI_PDU_LEN    0
+#define M_ISCSI_PDU_LEN    0x7FFF
+#define V_ISCSI_PDU_LEN(x) ((x) << S_ISCSI_PDU_LEN)
+#define G_ISCSI_PDU_LEN(x) (((x) >> S_ISCSI_PDU_LEN) & M_ISCSI_PDU_LEN)
+
+#define S_ISCSI_DDP    15
+#define V_ISCSI_DDP(x) ((x) << S_ISCSI_DDP)
+#define F_ISCSI_DDP    V_ISCSI_DDP(1U)
+
+struct cpl_rx_data {
+	RSS_HDR union opcode_tid ot;
+	__be16 rsvd;
+	__be16 len;
+	__be32 seq;
+	__be16 urg;
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	__u8 dack_mode:2;
+	__u8 psh:1;
+	__u8 heartbeat:1;
+	 __u8:4;
+#else
+	 __u8:4;
+	__u8 heartbeat:1;
+	__u8 psh:1;
+	__u8 dack_mode:2;
+#endif
+	__u8 status;
+};
+
+struct cpl_rx_data_ack {
+	WR_HDR;
+	union opcode_tid ot;
+	__be32 credit_dack;
+};
+
+/* cpl_rx_data_ack.ack_seq fields */
+#define S_RX_CREDITS    0
+#define M_RX_CREDITS    0x7FFFFFF
+#define V_RX_CREDITS(x) ((x) << S_RX_CREDITS)
+#define G_RX_CREDITS(x) (((x) >> S_RX_CREDITS) & M_RX_CREDITS)
+
+#define S_RX_MODULATE    27
+#define V_RX_MODULATE(x) ((x) << S_RX_MODULATE)
+#define F_RX_MODULATE    V_RX_MODULATE(1U)
+
+#define S_RX_FORCE_ACK    28
+#define V_RX_FORCE_ACK(x) ((x) << S_RX_FORCE_ACK)
+#define F_RX_FORCE_ACK    V_RX_FORCE_ACK(1U)
+
+#define S_RX_DACK_MODE    29
+#define M_RX_DACK_MODE    0x3
+#define V_RX_DACK_MODE(x) ((x) << S_RX_DACK_MODE)
+#define G_RX_DACK_MODE(x) (((x) >> S_RX_DACK_MODE) & M_RX_DACK_MODE)
+
+#define S_RX_DACK_CHANGE    31
+#define V_RX_DACK_CHANGE(x) ((x) << S_RX_DACK_CHANGE)
+#define F_RX_DACK_CHANGE    V_RX_DACK_CHANGE(1U)
+
+struct cpl_rx_urg_notify {
+	RSS_HDR union opcode_tid ot;
+	__be32 seq;
+};
+
+struct cpl_rx_ddp_complete {
+	RSS_HDR union opcode_tid ot;
+	__be32 ddp_report;
+};
+
+struct cpl_rx_data_ddp {
+	RSS_HDR union opcode_tid ot;
+	__be16 urg;
+	__be16 len;
+	__be32 seq;
+	union {
+		__be32 nxt_seq;
+		__be32 ddp_report;
+	};
+	__be32 ulp_crc;
+	__be32 ddpvld_status;
+};
+
+/* cpl_rx_data_ddp.ddpvld_status fields */
+#define S_DDP_STATUS    0
+#define M_DDP_STATUS    0xFF
+#define V_DDP_STATUS(x) ((x) << S_DDP_STATUS)
+#define G_DDP_STATUS(x) (((x) >> S_DDP_STATUS) & M_DDP_STATUS)
+
+#define S_DDP_VALID    15
+#define M_DDP_VALID    0x1FFFF
+#define V_DDP_VALID(x) ((x) << S_DDP_VALID)
+#define G_DDP_VALID(x) (((x) >> S_DDP_VALID) & M_DDP_VALID)
+
+#define S_DDP_PPOD_MISMATCH    15
+#define V_DDP_PPOD_MISMATCH(x) ((x) << S_DDP_PPOD_MISMATCH)
+#define F_DDP_PPOD_MISMATCH    V_DDP_PPOD_MISMATCH(1U)
+
+#define S_DDP_PDU    16
+#define V_DDP_PDU(x) ((x) << S_DDP_PDU)
+#define F_DDP_PDU    V_DDP_PDU(1U)
+
+#define S_DDP_LLIMIT_ERR    17
+#define V_DDP_LLIMIT_ERR(x) ((x) << S_DDP_LLIMIT_ERR)
+#define F_DDP_LLIMIT_ERR    V_DDP_LLIMIT_ERR(1U)
+
+#define S_DDP_PPOD_PARITY_ERR    18
+#define V_DDP_PPOD_PARITY_ERR(x) ((x) << S_DDP_PPOD_PARITY_ERR)
+#define F_DDP_PPOD_PARITY_ERR    V_DDP_PPOD_PARITY_ERR(1U)
+
+#define S_DDP_PADDING_ERR    19
+#define V_DDP_PADDING_ERR(x) ((x) << S_DDP_PADDING_ERR)
+#define F_DDP_PADDING_ERR    V_DDP_PADDING_ERR(1U)
+
+#define S_DDP_HDRCRC_ERR    20
+#define V_DDP_HDRCRC_ERR(x) ((x) << S_DDP_HDRCRC_ERR)
+#define F_DDP_HDRCRC_ERR    V_DDP_HDRCRC_ERR(1U)
+
+#define S_DDP_DATACRC_ERR    21
+#define V_DDP_DATACRC_ERR(x) ((x) << S_DDP_DATACRC_ERR)
+#define F_DDP_DATACRC_ERR    V_DDP_DATACRC_ERR(1U)
+
+#define S_DDP_INVALID_TAG    22
+#define V_DDP_INVALID_TAG(x) ((x) << S_DDP_INVALID_TAG)
+#define F_DDP_INVALID_TAG    V_DDP_INVALID_TAG(1U)
+
+#define S_DDP_ULIMIT_ERR    23
+#define V_DDP_ULIMIT_ERR(x) ((x) << S_DDP_ULIMIT_ERR)
+#define F_DDP_ULIMIT_ERR    V_DDP_ULIMIT_ERR(1U)
+
+#define S_DDP_OFFSET_ERR    24
+#define V_DDP_OFFSET_ERR(x) ((x) << S_DDP_OFFSET_ERR)
+#define F_DDP_OFFSET_ERR    V_DDP_OFFSET_ERR(1U)
+
+#define S_DDP_COLOR_ERR    25
+#define V_DDP_COLOR_ERR(x) ((x) << S_DDP_COLOR_ERR)
+#define F_DDP_COLOR_ERR    V_DDP_COLOR_ERR(1U)
+
+#define S_DDP_TID_MISMATCH    26
+#define V_DDP_TID_MISMATCH(x) ((x) << S_DDP_TID_MISMATCH)
+#define F_DDP_TID_MISMATCH    V_DDP_TID_MISMATCH(1U)
+
+#define S_DDP_INVALID_PPOD    27
+#define V_DDP_INVALID_PPOD(x) ((x) << S_DDP_INVALID_PPOD)
+#define F_DDP_INVALID_PPOD    V_DDP_INVALID_PPOD(1U)
+
+#define S_DDP_ULP_MODE    28
+#define M_DDP_ULP_MODE    0xF
+#define V_DDP_ULP_MODE(x) ((x) << S_DDP_ULP_MODE)
+#define G_DDP_ULP_MODE(x) (((x) >> S_DDP_ULP_MODE) & M_DDP_ULP_MODE)
+
+/* cpl_rx_data_ddp.ddp_report fields */
+#define S_DDP_OFFSET    0
+#define M_DDP_OFFSET    0x3FFFFF
+#define V_DDP_OFFSET(x) ((x) << S_DDP_OFFSET)
+#define G_DDP_OFFSET(x) (((x) >> S_DDP_OFFSET) & M_DDP_OFFSET)
+
+#define S_DDP_URG    24
+#define V_DDP_URG(x) ((x) << S_DDP_URG)
+#define F_DDP_URG    V_DDP_URG(1U)
+
+#define S_DDP_PSH    25
+#define V_DDP_PSH(x) ((x) << S_DDP_PSH)
+#define F_DDP_PSH    V_DDP_PSH(1U)
+
+#define S_DDP_BUF_COMPLETE    26
+#define V_DDP_BUF_COMPLETE(x) ((x) << S_DDP_BUF_COMPLETE)
+#define F_DDP_BUF_COMPLETE    V_DDP_BUF_COMPLETE(1U)
+
+#define S_DDP_BUF_TIMED_OUT    27
+#define V_DDP_BUF_TIMED_OUT(x) ((x) << S_DDP_BUF_TIMED_OUT)
+#define F_DDP_BUF_TIMED_OUT    V_DDP_BUF_TIMED_OUT(1U)
+
+#define S_DDP_BUF_IDX    28
+#define V_DDP_BUF_IDX(x) ((x) << S_DDP_BUF_IDX)
+#define F_DDP_BUF_IDX    V_DDP_BUF_IDX(1U)
+
+struct cpl_tx_pkt {
+	WR_HDR;
+	__be32 cntrl;
+	__be32 len;
+};
+
+struct cpl_tx_pkt_lso {
+	WR_HDR;
+	__be32 cntrl;
+	__be32 len;
+
+	__be32 rsvd;
+	__be32 lso_info;
+};
+
+/* cpl_tx_pkt*.cntrl fields */
+#define S_TXPKT_VLAN    0
+#define M_TXPKT_VLAN    0xFFFF
+#define V_TXPKT_VLAN(x) ((x) << S_TXPKT_VLAN)
+#define G_TXPKT_VLAN(x) (((x) >> S_TXPKT_VLAN) & M_TXPKT_VLAN)
+
+#define S_TXPKT_INTF    16
+#define M_TXPKT_INTF    0xF
+#define V_TXPKT_INTF(x) ((x) << S_TXPKT_INTF)
+#define G_TXPKT_INTF(x) (((x) >> S_TXPKT_INTF) & M_TXPKT_INTF)
+
+#define S_TXPKT_IPCSUM_DIS    20
+#define V_TXPKT_IPCSUM_DIS(x) ((x) << S_TXPKT_IPCSUM_DIS)
+#define F_TXPKT_IPCSUM_DIS    V_TXPKT_IPCSUM_DIS(1U)
+
+#define S_TXPKT_L4CSUM_DIS    21
+#define V_TXPKT_L4CSUM_DIS(x) ((x) << S_TXPKT_L4CSUM_DIS)
+#define F_TXPKT_L4CSUM_DIS    V_TXPKT_L4CSUM_DIS(1U)
+
+#define S_TXPKT_VLAN_VLD    22
+#define V_TXPKT_VLAN_VLD(x) ((x) << S_TXPKT_VLAN_VLD)
+#define F_TXPKT_VLAN_VLD    V_TXPKT_VLAN_VLD(1U)
+
+#define S_TXPKT_LOOPBACK    23
+#define V_TXPKT_LOOPBACK(x) ((x) << S_TXPKT_LOOPBACK)
+#define F_TXPKT_LOOPBACK    V_TXPKT_LOOPBACK(1U)
+
+#define S_TXPKT_OPCODE    24
+#define M_TXPKT_OPCODE    0xFF
+#define V_TXPKT_OPCODE(x) ((x) << S_TXPKT_OPCODE)
+#define G_TXPKT_OPCODE(x) (((x) >> S_TXPKT_OPCODE) & M_TXPKT_OPCODE)
+
+/* cpl_tx_pkt_lso.lso_info fields */
+#define S_LSO_MSS    0
+#define M_LSO_MSS    0x3FFF
+#define V_LSO_MSS(x) ((x) << S_LSO_MSS)
+#define G_LSO_MSS(x) (((x) >> S_LSO_MSS) & M_LSO_MSS)
+
+#define S_LSO_ETH_TYPE    14
+#define M_LSO_ETH_TYPE    0x3
+#define V_LSO_ETH_TYPE(x) ((x) << S_LSO_ETH_TYPE)
+#define G_LSO_ETH_TYPE(x) (((x) >> S_LSO_ETH_TYPE) & M_LSO_ETH_TYPE)
+
+#define S_LSO_TCPHDR_WORDS    16
+#define M_LSO_TCPHDR_WORDS    0xF
+#define V_LSO_TCPHDR_WORDS(x) ((x) << S_LSO_TCPHDR_WORDS)
+#define G_LSO_TCPHDR_WORDS(x) (((x) >> S_LSO_TCPHDR_WORDS) & M_LSO_TCPHDR_WORDS)
+
+#define S_LSO_IPHDR_WORDS    20
+#define M_LSO_IPHDR_WORDS    0xF
+#define V_LSO_IPHDR_WORDS(x) ((x) << S_LSO_IPHDR_WORDS)
+#define G_LSO_IPHDR_WORDS(x) (((x) >> S_LSO_IPHDR_WORDS) & M_LSO_IPHDR_WORDS)
+
+#define S_LSO_IPV6    24
+#define V_LSO_IPV6(x) ((x) << S_LSO_IPV6)
+#define F_LSO_IPV6    V_LSO_IPV6(1U)
+
+struct cpl_trace_pkt {
+#ifdef CHELSIO_FW
+	__u8 rss_opcode;
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	__u8 err:1;
+	 __u8:7;
+#else
+	 __u8:7;
+	__u8 err:1;
+#endif
+	__u8 rsvd0;
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	__u8 qid:4;
+	 __u8:4;
+#else
+	 __u8:4;
+	__u8 qid:4;
+#endif
+	__be32 tstamp;
+#endif				/* CHELSIO_FW */
+
+	__u8 opcode;
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	__u8 iff:4;
+	 __u8:4;
+#else
+	 __u8:4;
+	__u8 iff:4;
+#endif
+	__u8 rsvd[4];
+	__be16 len;
+};
+
+struct cpl_rx_pkt {
+	RSS_HDR __u8 opcode;
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	__u8 iff:4;
+	__u8 csum_valid:1;
+	__u8 ipmi_pkt:1;
+	__u8 vlan_valid:1;
+	__u8 fragment:1;
+#else
+	__u8 fragment:1;
+	__u8 vlan_valid:1;
+	__u8 ipmi_pkt:1;
+	__u8 csum_valid:1;
+	__u8 iff:4;
+#endif
+	__be16 csum;
+	__be16 vlan;
+	__be16 len;
+};
+
+struct cpl_l2t_write_req {
+	WR_HDR;
+	union opcode_tid ot;
+	__be32 params;
+	__u8 rsvd[2];
+	__u8 dst_mac[6];
+};
+
+/* cpl_l2t_write_req.params fields */
+#define S_L2T_W_IDX    0
+#define M_L2T_W_IDX    0x7FF
+#define V_L2T_W_IDX(x) ((x) << S_L2T_W_IDX)
+#define G_L2T_W_IDX(x) (((x) >> S_L2T_W_IDX) & M_L2T_W_IDX)
+
+#define S_L2T_W_VLAN    11
+#define M_L2T_W_VLAN    0xFFF
+#define V_L2T_W_VLAN(x) ((x) << S_L2T_W_VLAN)
+#define G_L2T_W_VLAN(x) (((x) >> S_L2T_W_VLAN) & M_L2T_W_VLAN)
+
+#define S_L2T_W_IFF    23
+#define M_L2T_W_IFF    0xF
+#define V_L2T_W_IFF(x) ((x) << S_L2T_W_IFF)
+#define G_L2T_W_IFF(x) (((x) >> S_L2T_W_IFF) & M_L2T_W_IFF)
+
+#define S_L2T_W_PRIO    27
+#define M_L2T_W_PRIO    0x7
+#define V_L2T_W_PRIO(x) ((x) << S_L2T_W_PRIO)
+#define G_L2T_W_PRIO(x) (((x) >> S_L2T_W_PRIO) & M_L2T_W_PRIO)
+
+struct cpl_l2t_write_rpl {
+	RSS_HDR union opcode_tid ot;
+	__u8 status;
+	__u8 rsvd[3];
+};
+
+struct cpl_l2t_read_req {
+	WR_HDR;
+	union opcode_tid ot;
+	__be16 rsvd;
+	__be16 l2t_idx;
+};
+
+struct cpl_l2t_read_rpl {
+	RSS_HDR union opcode_tid ot;
+	__be32 params;
+	__u8 rsvd[2];
+	__u8 dst_mac[6];
+};
+
+/* cpl_l2t_read_rpl.params fields */
+#define S_L2T_R_PRIO    0
+#define M_L2T_R_PRIO    0x7
+#define V_L2T_R_PRIO(x) ((x) << S_L2T_R_PRIO)
+#define G_L2T_R_PRIO(x) (((x) >> S_L2T_R_PRIO) & M_L2T_R_PRIO)
+
+#define S_L2T_R_VLAN    8
+#define M_L2T_R_VLAN    0xFFF
+#define V_L2T_R_VLAN(x) ((x) << S_L2T_R_VLAN)
+#define G_L2T_R_VLAN(x) (((x) >> S_L2T_R_VLAN) & M_L2T_R_VLAN)
+
+#define S_L2T_R_IFF    20
+#define M_L2T_R_IFF    0xF
+#define V_L2T_R_IFF(x) ((x) << S_L2T_R_IFF)
+#define G_L2T_R_IFF(x) (((x) >> S_L2T_R_IFF) & M_L2T_R_IFF)
+
+#define S_L2T_STATUS    24
+#define M_L2T_STATUS    0xFF
+#define V_L2T_STATUS(x) ((x) << S_L2T_STATUS)
+#define G_L2T_STATUS(x) (((x) >> S_L2T_STATUS) & M_L2T_STATUS)
+
+struct cpl_smt_write_req {
+	WR_HDR;
+	union opcode_tid ot;
+	__u8 rsvd0;
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	__u8 mtu_idx:4;
+	__u8 iff:4;
+#else
+	__u8 iff:4;
+	__u8 mtu_idx:4;
+#endif
+	__be16 rsvd2;
+	__be16 rsvd3;
+	__u8 src_mac1[6];
+	__be16 rsvd4;
+	__u8 src_mac0[6];
+};
+
+struct cpl_smt_write_rpl {
+	RSS_HDR union opcode_tid ot;
+	__u8 status;
+	__u8 rsvd[3];
+};
+
+struct cpl_smt_read_req {
+	WR_HDR;
+	union opcode_tid ot;
+	__u8 rsvd0;
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	 __u8:4;
+	__u8 iff:4;
+#else
+	__u8 iff:4;
+	 __u8:4;
+#endif
+	__be16 rsvd2;
+};
+
+struct cpl_smt_read_rpl {
+	RSS_HDR union opcode_tid ot;
+	__u8 status;
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	__u8 mtu_idx:4;
+	 __u8:4;
+#else
+	 __u8:4;
+	__u8 mtu_idx:4;
+#endif
+	__be16 rsvd2;
+	__be16 rsvd3;
+	__u8 src_mac1[6];
+	__be16 rsvd4;
+	__u8 src_mac0[6];
+};
+
+struct cpl_rte_delete_req {
+	WR_HDR;
+	union opcode_tid ot;
+	__be32 params;
+};
+
+/* { cpl_rte_delete_req, cpl_rte_read_req }.params fields */
+#define S_RTE_REQ_LUT_IX    8
+#define M_RTE_REQ_LUT_IX    0x7FF
+#define V_RTE_REQ_LUT_IX(x) ((x) << S_RTE_REQ_LUT_IX)
+#define G_RTE_REQ_LUT_IX(x) (((x) >> S_RTE_REQ_LUT_IX) & M_RTE_REQ_LUT_IX)
+
+#define S_RTE_REQ_LUT_BASE    19
+#define M_RTE_REQ_LUT_BASE    0x7FF
+#define V_RTE_REQ_LUT_BASE(x) ((x) << S_RTE_REQ_LUT_BASE)
+#define G_RTE_REQ_LUT_BASE(x) (((x) >> S_RTE_REQ_LUT_BASE) & M_RTE_REQ_LUT_BASE)
+
+#define S_RTE_READ_REQ_SELECT    31
+#define V_RTE_READ_REQ_SELECT(x) ((x) << S_RTE_READ_REQ_SELECT)
+#define F_RTE_READ_REQ_SELECT    V_RTE_READ_REQ_SELECT(1U)
+
+struct cpl_rte_delete_rpl {
+	RSS_HDR union opcode_tid ot;
+	__u8 status;
+	__u8 rsvd[3];
+};
+
+struct cpl_rte_write_req {
+	WR_HDR;
+	union opcode_tid ot;
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	 __u8:6;
+	__u8 write_tcam:1;
+	__u8 write_l2t_lut:1;
+#else
+	__u8 write_l2t_lut:1;
+	__u8 write_tcam:1;
+	 __u8:6;
+#endif
+	__u8 rsvd[3];
+	__be32 lut_params;
+	__be16 rsvd2;
+	__be16 l2t_idx;
+	__be32 netmask;
+	__be32 faddr;
+};
+
+/* cpl_rte_write_req.lut_params fields */
+#define S_RTE_WRITE_REQ_LUT_IX    10
+#define M_RTE_WRITE_REQ_LUT_IX    0x7FF
+#define V_RTE_WRITE_REQ_LUT_IX(x) ((x) << S_RTE_WRITE_REQ_LUT_IX)
+#define G_RTE_WRITE_REQ_LUT_IX(x) (((x) >> S_RTE_WRITE_REQ_LUT_IX) & M_RTE_WRITE_REQ_LUT_IX)
+
+#define S_RTE_WRITE_REQ_LUT_BASE    21
+#define M_RTE_WRITE_REQ_LUT_BASE    0x7FF
+#define V_RTE_WRITE_REQ_LUT_BASE(x) ((x) << S_RTE_WRITE_REQ_LUT_BASE)
+#define G_RTE_WRITE_REQ_LUT_BASE(x) (((x) >> S_RTE_WRITE_REQ_LUT_BASE) & M_RTE_WRITE_REQ_LUT_BASE)
+
+struct cpl_rte_write_rpl {
+	RSS_HDR union opcode_tid ot;
+	__u8 status;
+	__u8 rsvd[3];
+};
+
+struct cpl_rte_read_req {
+	WR_HDR;
+	union opcode_tid ot;
+	__be32 params;
+};
+
+struct cpl_rte_read_rpl {
+	RSS_HDR union opcode_tid ot;
+	__u8 status;
+	__u8 rsvd0;
+	__be16 l2t_idx;
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	 __u8:7;
+	__u8 select:1;
+#else
+	__u8 select:1;
+	 __u8:7;
+#endif
+	__u8 rsvd2[3];
+	__be32 addr;
+};
+
+struct cpl_tid_release {
+	WR_HDR;
+	union opcode_tid ot;
+	__be32 rsvd;
+};
+
+struct cpl_barrier {
+	WR_HDR;
+	__u8 opcode;
+	__u8 rsvd[7];
+};
+
+struct cpl_rdma_read_req {
+	__u8 opcode;
+	__u8 rsvd[15];
+};
+
+struct cpl_rdma_terminate {
+#ifdef CHELSIO_FW
+	__u8 opcode;
+	__u8 rsvd[2];
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	__u8 rspq:3;
+	 __u8:5;
+#else
+	 __u8:5;
+	__u8 rspq:3;
+#endif
+	__be32 tid_len;
+#endif
+	__be32 msn;
+	__be32 mo;
+	__u8 data[0];
+};
+
+/* cpl_rdma_terminate.tid_len fields */
+#define S_FLIT_CNT    0
+#define M_FLIT_CNT    0xFF
+#define V_FLIT_CNT(x) ((x) << S_FLIT_CNT)
+#define G_FLIT_CNT(x) (((x) >> S_FLIT_CNT) & M_FLIT_CNT)
+
+#define S_TERM_TID    8
+#define M_TERM_TID    0xFFFFF
+#define V_TERM_TID(x) ((x) << S_TERM_TID)
+#define G_TERM_TID(x) (((x) >> S_TERM_TID) & M_TERM_TID)
+
+/* ULP_TX opcodes */
+enum { ULP_MEM_READ = 2, ULP_MEM_WRITE = 3, ULP_TXPKT = 4 };
+
+#define S_ULPTX_CMD	28
+#define M_ULPTX_CMD	0xF
+#define V_ULPTX_CMD(x)	((x) << S_ULPTX_CMD)
+
+#define S_ULPTX_NFLITS	0
+#define M_ULPTX_NFLITS	0xFF
+#define V_ULPTX_NFLITS(x) ((x) << S_ULPTX_NFLITS)
+
+struct ulp_mem_io {
+	WR_HDR;
+	__be32 cmd_lock_addr;
+	__be32 len;
+};
+
+/* ulp_mem_io.cmd_lock_addr fields */
+#define S_ULP_MEMIO_ADDR	0
+#define M_ULP_MEMIO_ADDR	0x7FFFFFF
+#define V_ULP_MEMIO_ADDR(x)	((x) << S_ULP_MEMIO_ADDR)
+#define S_ULP_MEMIO_LOCK	27
+#define V_ULP_MEMIO_LOCK(x)	((x) << S_ULP_MEMIO_LOCK)
+#define F_ULP_MEMIO_LOCK	V_ULP_MEMIO_LOCK(1U)
+
+/* ulp_mem_io.len fields */
+#define S_ULP_MEMIO_DATA_LEN	28
+#define M_ULP_MEMIO_DATA_LEN	0xF
+#define V_ULP_MEMIO_DATA_LEN(x)	((x) << S_ULP_MEMIO_DATA_LEN)
+
+#endif				/* T3_CPL_H */
diff --git a/drivers/net/ethernet/chelsio/cxgb3/t3_hw.c b/drivers/net/ethernet/chelsio/cxgb3/t3_hw.c
new file mode 100644
index 0000000..c74a898
--- /dev/null
+++ b/drivers/net/ethernet/chelsio/cxgb3/t3_hw.c
@@ -0,0 +1,3777 @@
+/*
+ * Copyright (c) 2003-2008 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "common.h"
+#include "regs.h"
+#include "sge_defs.h"
+#include "firmware_exports.h"
+
+static void t3_port_intr_clear(struct adapter *adapter, int idx);
+
+/**
+ *	t3_wait_op_done_val - wait until an operation is completed
+ *	@adapter: the adapter performing the operation
+ *	@reg: the register to check for completion
+ *	@mask: a single-bit field within @reg that indicates completion
+ *	@polarity: the value of the field when the operation is completed
+ *	@attempts: number of check iterations
+ *	@delay: delay in usecs between iterations
+ *	@valp: where to store the value of the register at completion time
+ *
+ *	Wait until an operation is completed by checking a bit in a register
+ *	up to @attempts times.  If @valp is not NULL the value of the register
+ *	at the time it indicated completion is stored there.  Returns 0 if the
+ *	operation completes and -EAGAIN otherwise.
+ */
+
+int t3_wait_op_done_val(struct adapter *adapter, int reg, u32 mask,
+			int polarity, int attempts, int delay, u32 *valp)
+{
+	while (1) {
+		u32 val = t3_read_reg(adapter, reg);
+
+		if (!!(val & mask) == polarity) {
+			if (valp)
+				*valp = val;
+			return 0;
+		}
+		if (--attempts == 0)
+			return -EAGAIN;
+		if (delay)
+			udelay(delay);
+	}
+}
+
+/**
+ *	t3_write_regs - write a bunch of registers
+ *	@adapter: the adapter to program
+ *	@p: an array of register address/register value pairs
+ *	@n: the number of address/value pairs
+ *	@offset: register address offset
+ *
+ *	Takes an array of register address/register value pairs and writes each
+ *	value to the corresponding register.  Register addresses are adjusted
+ *	by the supplied offset.
+ */
+void t3_write_regs(struct adapter *adapter, const struct addr_val_pair *p,
+		   int n, unsigned int offset)
+{
+	while (n--) {
+		t3_write_reg(adapter, p->reg_addr + offset, p->val);
+		p++;
+	}
+}
+
+/**
+ *	t3_set_reg_field - set a register field to a value
+ *	@adapter: the adapter to program
+ *	@addr: the register address
+ *	@mask: specifies the portion of the register to modify
+ *	@val: the new value for the register field
+ *
+ *	Sets a register field specified by the supplied mask to the
+ *	given value.
+ */
+void t3_set_reg_field(struct adapter *adapter, unsigned int addr, u32 mask,
+		      u32 val)
+{
+	u32 v = t3_read_reg(adapter, addr) & ~mask;
+
+	t3_write_reg(adapter, addr, v | val);
+	t3_read_reg(adapter, addr);	/* flush */
+}
+
+/**
+ *	t3_read_indirect - read indirectly addressed registers
+ *	@adap: the adapter
+ *	@addr_reg: register holding the indirect address
+ *	@data_reg: register holding the value of the indirect register
+ *	@vals: where the read register values are stored
+ *	@start_idx: index of first indirect register to read
+ *	@nregs: how many indirect registers to read
+ *
+ *	Reads registers that are accessed indirectly through an address/data
+ *	register pair.
+ */
+static void t3_read_indirect(struct adapter *adap, unsigned int addr_reg,
+			     unsigned int data_reg, u32 *vals,
+			     unsigned int nregs, unsigned int start_idx)
+{
+	while (nregs--) {
+		t3_write_reg(adap, addr_reg, start_idx);
+		*vals++ = t3_read_reg(adap, data_reg);
+		start_idx++;
+	}
+}
+
+/**
+ *	t3_mc7_bd_read - read from MC7 through backdoor accesses
+ *	@mc7: identifies MC7 to read from
+ *	@start: index of first 64-bit word to read
+ *	@n: number of 64-bit words to read
+ *	@buf: where to store the read result
+ *
+ *	Read n 64-bit words from MC7 starting at word start, using backdoor
+ *	accesses.
+ */
+int t3_mc7_bd_read(struct mc7 *mc7, unsigned int start, unsigned int n,
+		   u64 *buf)
+{
+	static const int shift[] = { 0, 0, 16, 24 };
+	static const int step[] = { 0, 32, 16, 8 };
+
+	unsigned int size64 = mc7->size / 8;	/* # of 64-bit words */
+	struct adapter *adap = mc7->adapter;
+
+	if (start >= size64 || start + n > size64)
+		return -EINVAL;
+
+	start *= (8 << mc7->width);
+	while (n--) {
+		int i;
+		u64 val64 = 0;
+
+		for (i = (1 << mc7->width) - 1; i >= 0; --i) {
+			int attempts = 10;
+			u32 val;
+
+			t3_write_reg(adap, mc7->offset + A_MC7_BD_ADDR, start);
+			t3_write_reg(adap, mc7->offset + A_MC7_BD_OP, 0);
+			val = t3_read_reg(adap, mc7->offset + A_MC7_BD_OP);
+			while ((val & F_BUSY) && attempts--)
+				val = t3_read_reg(adap,
+						  mc7->offset + A_MC7_BD_OP);
+			if (val & F_BUSY)
+				return -EIO;
+
+			val = t3_read_reg(adap, mc7->offset + A_MC7_BD_DATA1);
+			if (mc7->width == 0) {
+				val64 = t3_read_reg(adap,
+						    mc7->offset +
+						    A_MC7_BD_DATA0);
+				val64 |= (u64) val << 32;
+			} else {
+				if (mc7->width > 1)
+					val >>= shift[mc7->width];
+				val64 |= (u64) val << (step[mc7->width] * i);
+			}
+			start += 8;
+		}
+		*buf++ = val64;
+	}
+	return 0;
+}
+
+/*
+ * Initialize MI1.
+ */
+static void mi1_init(struct adapter *adap, const struct adapter_info *ai)
+{
+	u32 clkdiv = adap->params.vpd.cclk / (2 * adap->params.vpd.mdc) - 1;
+	u32 val = F_PREEN | V_CLKDIV(clkdiv);
+
+	t3_write_reg(adap, A_MI1_CFG, val);
+}
+
+#define MDIO_ATTEMPTS 20
+
+/*
+ * MI1 read/write operations for clause 22 PHYs.
+ */
+static int t3_mi1_read(struct net_device *dev, int phy_addr, int mmd_addr,
+		       u16 reg_addr)
+{
+	struct port_info *pi = netdev_priv(dev);
+	struct adapter *adapter = pi->adapter;
+	int ret;
+	u32 addr = V_REGADDR(reg_addr) | V_PHYADDR(phy_addr);
+
+	mutex_lock(&adapter->mdio_lock);
+	t3_set_reg_field(adapter, A_MI1_CFG, V_ST(M_ST), V_ST(1));
+	t3_write_reg(adapter, A_MI1_ADDR, addr);
+	t3_write_reg(adapter, A_MI1_OP, V_MDI_OP(2));
+	ret = t3_wait_op_done(adapter, A_MI1_OP, F_BUSY, 0, MDIO_ATTEMPTS, 10);
+	if (!ret)
+		ret = t3_read_reg(adapter, A_MI1_DATA);
+	mutex_unlock(&adapter->mdio_lock);
+	return ret;
+}
+
+static int t3_mi1_write(struct net_device *dev, int phy_addr, int mmd_addr,
+			u16 reg_addr, u16 val)
+{
+	struct port_info *pi = netdev_priv(dev);
+	struct adapter *adapter = pi->adapter;
+	int ret;
+	u32 addr = V_REGADDR(reg_addr) | V_PHYADDR(phy_addr);
+
+	mutex_lock(&adapter->mdio_lock);
+	t3_set_reg_field(adapter, A_MI1_CFG, V_ST(M_ST), V_ST(1));
+	t3_write_reg(adapter, A_MI1_ADDR, addr);
+	t3_write_reg(adapter, A_MI1_DATA, val);
+	t3_write_reg(adapter, A_MI1_OP, V_MDI_OP(1));
+	ret = t3_wait_op_done(adapter, A_MI1_OP, F_BUSY, 0, MDIO_ATTEMPTS, 10);
+	mutex_unlock(&adapter->mdio_lock);
+	return ret;
+}
+
+static const struct mdio_ops mi1_mdio_ops = {
+	.read = t3_mi1_read,
+	.write = t3_mi1_write,
+	.mode_support = MDIO_SUPPORTS_C22
+};
+
+/*
+ * Performs the address cycle for clause 45 PHYs.
+ * Must be called with the MDIO_LOCK held.
+ */
+static int mi1_wr_addr(struct adapter *adapter, int phy_addr, int mmd_addr,
+		       int reg_addr)
+{
+	u32 addr = V_REGADDR(mmd_addr) | V_PHYADDR(phy_addr);
+
+	t3_set_reg_field(adapter, A_MI1_CFG, V_ST(M_ST), 0);
+	t3_write_reg(adapter, A_MI1_ADDR, addr);
+	t3_write_reg(adapter, A_MI1_DATA, reg_addr);
+	t3_write_reg(adapter, A_MI1_OP, V_MDI_OP(0));
+	return t3_wait_op_done(adapter, A_MI1_OP, F_BUSY, 0,
+			       MDIO_ATTEMPTS, 10);
+}
+
+/*
+ * MI1 read/write operations for indirect-addressed PHYs.
+ */
+static int mi1_ext_read(struct net_device *dev, int phy_addr, int mmd_addr,
+			u16 reg_addr)
+{
+	struct port_info *pi = netdev_priv(dev);
+	struct adapter *adapter = pi->adapter;
+	int ret;
+
+	mutex_lock(&adapter->mdio_lock);
+	ret = mi1_wr_addr(adapter, phy_addr, mmd_addr, reg_addr);
+	if (!ret) {
+		t3_write_reg(adapter, A_MI1_OP, V_MDI_OP(3));
+		ret = t3_wait_op_done(adapter, A_MI1_OP, F_BUSY, 0,
+				      MDIO_ATTEMPTS, 10);
+		if (!ret)
+			ret = t3_read_reg(adapter, A_MI1_DATA);
+	}
+	mutex_unlock(&adapter->mdio_lock);
+	return ret;
+}
+
+static int mi1_ext_write(struct net_device *dev, int phy_addr, int mmd_addr,
+			 u16 reg_addr, u16 val)
+{
+	struct port_info *pi = netdev_priv(dev);
+	struct adapter *adapter = pi->adapter;
+	int ret;
+
+	mutex_lock(&adapter->mdio_lock);
+	ret = mi1_wr_addr(adapter, phy_addr, mmd_addr, reg_addr);
+	if (!ret) {
+		t3_write_reg(adapter, A_MI1_DATA, val);
+		t3_write_reg(adapter, A_MI1_OP, V_MDI_OP(1));
+		ret = t3_wait_op_done(adapter, A_MI1_OP, F_BUSY, 0,
+				      MDIO_ATTEMPTS, 10);
+	}
+	mutex_unlock(&adapter->mdio_lock);
+	return ret;
+}
+
+static const struct mdio_ops mi1_mdio_ext_ops = {
+	.read = mi1_ext_read,
+	.write = mi1_ext_write,
+	.mode_support = MDIO_SUPPORTS_C45 | MDIO_EMULATE_C22
+};
+
+/**
+ *	t3_mdio_change_bits - modify the value of a PHY register
+ *	@phy: the PHY to operate on
+ *	@mmd: the device address
+ *	@reg: the register address
+ *	@clear: what part of the register value to mask off
+ *	@set: what part of the register value to set
+ *
+ *	Changes the value of a PHY register by applying a mask to its current
+ *	value and ORing the result with a new value.
+ */
+int t3_mdio_change_bits(struct cphy *phy, int mmd, int reg, unsigned int clear,
+			unsigned int set)
+{
+	int ret;
+	unsigned int val;
+
+	ret = t3_mdio_read(phy, mmd, reg, &val);
+	if (!ret) {
+		val &= ~clear;
+		ret = t3_mdio_write(phy, mmd, reg, val | set);
+	}
+	return ret;
+}
+
+/**
+ *	t3_phy_reset - reset a PHY block
+ *	@phy: the PHY to operate on
+ *	@mmd: the device address of the PHY block to reset
+ *	@wait: how long to wait for the reset to complete in 1ms increments
+ *
+ *	Resets a PHY block and optionally waits for the reset to complete.
+ *	@mmd should be 0 for 10/100/1000 PHYs and the device address to reset
+ *	for 10G PHYs.
+ */
+int t3_phy_reset(struct cphy *phy, int mmd, int wait)
+{
+	int err;
+	unsigned int ctl;
+
+	err = t3_mdio_change_bits(phy, mmd, MDIO_CTRL1, MDIO_CTRL1_LPOWER,
+				  MDIO_CTRL1_RESET);
+	if (err || !wait)
+		return err;
+
+	do {
+		err = t3_mdio_read(phy, mmd, MDIO_CTRL1, &ctl);
+		if (err)
+			return err;
+		ctl &= MDIO_CTRL1_RESET;
+		if (ctl)
+			msleep(1);
+	} while (ctl && --wait);
+
+	return ctl ? -1 : 0;
+}
+
+/**
+ *	t3_phy_advertise - set the PHY advertisement registers for autoneg
+ *	@phy: the PHY to operate on
+ *	@advert: bitmap of capabilities the PHY should advertise
+ *
+ *	Sets a 10/100/1000 PHY's advertisement registers to advertise the
+ *	requested capabilities.
+ */
+int t3_phy_advertise(struct cphy *phy, unsigned int advert)
+{
+	int err;
+	unsigned int val = 0;
+
+	err = t3_mdio_read(phy, MDIO_DEVAD_NONE, MII_CTRL1000, &val);
+	if (err)
+		return err;
+
+	val &= ~(ADVERTISE_1000HALF | ADVERTISE_1000FULL);
+	if (advert & ADVERTISED_1000baseT_Half)
+		val |= ADVERTISE_1000HALF;
+	if (advert & ADVERTISED_1000baseT_Full)
+		val |= ADVERTISE_1000FULL;
+
+	err = t3_mdio_write(phy, MDIO_DEVAD_NONE, MII_CTRL1000, val);
+	if (err)
+		return err;
+
+	val = 1;
+	if (advert & ADVERTISED_10baseT_Half)
+		val |= ADVERTISE_10HALF;
+	if (advert & ADVERTISED_10baseT_Full)
+		val |= ADVERTISE_10FULL;
+	if (advert & ADVERTISED_100baseT_Half)
+		val |= ADVERTISE_100HALF;
+	if (advert & ADVERTISED_100baseT_Full)
+		val |= ADVERTISE_100FULL;
+	if (advert & ADVERTISED_Pause)
+		val |= ADVERTISE_PAUSE_CAP;
+	if (advert & ADVERTISED_Asym_Pause)
+		val |= ADVERTISE_PAUSE_ASYM;
+	return t3_mdio_write(phy, MDIO_DEVAD_NONE, MII_ADVERTISE, val);
+}
+
+/**
+ *	t3_phy_advertise_fiber - set fiber PHY advertisement register
+ *	@phy: the PHY to operate on
+ *	@advert: bitmap of capabilities the PHY should advertise
+ *
+ *	Sets a fiber PHY's advertisement register to advertise the
+ *	requested capabilities.
+ */
+int t3_phy_advertise_fiber(struct cphy *phy, unsigned int advert)
+{
+	unsigned int val = 0;
+
+	if (advert & ADVERTISED_1000baseT_Half)
+		val |= ADVERTISE_1000XHALF;
+	if (advert & ADVERTISED_1000baseT_Full)
+		val |= ADVERTISE_1000XFULL;
+	if (advert & ADVERTISED_Pause)
+		val |= ADVERTISE_1000XPAUSE;
+	if (advert & ADVERTISED_Asym_Pause)
+		val |= ADVERTISE_1000XPSE_ASYM;
+	return t3_mdio_write(phy, MDIO_DEVAD_NONE, MII_ADVERTISE, val);
+}
+
+/**
+ *	t3_set_phy_speed_duplex - force PHY speed and duplex
+ *	@phy: the PHY to operate on
+ *	@speed: requested PHY speed
+ *	@duplex: requested PHY duplex
+ *
+ *	Force a 10/100/1000 PHY's speed and duplex.  This also disables
+ *	auto-negotiation except for GigE, where auto-negotiation is mandatory.
+ */
+int t3_set_phy_speed_duplex(struct cphy *phy, int speed, int duplex)
+{
+	int err;
+	unsigned int ctl;
+
+	err = t3_mdio_read(phy, MDIO_DEVAD_NONE, MII_BMCR, &ctl);
+	if (err)
+		return err;
+
+	if (speed >= 0) {
+		ctl &= ~(BMCR_SPEED100 | BMCR_SPEED1000 | BMCR_ANENABLE);
+		if (speed == SPEED_100)
+			ctl |= BMCR_SPEED100;
+		else if (speed == SPEED_1000)
+			ctl |= BMCR_SPEED1000;
+	}
+	if (duplex >= 0) {
+		ctl &= ~(BMCR_FULLDPLX | BMCR_ANENABLE);
+		if (duplex == DUPLEX_FULL)
+			ctl |= BMCR_FULLDPLX;
+	}
+	if (ctl & BMCR_SPEED1000) /* auto-negotiation required for GigE */
+		ctl |= BMCR_ANENABLE;
+	return t3_mdio_write(phy, MDIO_DEVAD_NONE, MII_BMCR, ctl);
+}
+
+int t3_phy_lasi_intr_enable(struct cphy *phy)
+{
+	return t3_mdio_write(phy, MDIO_MMD_PMAPMD, MDIO_PMA_LASI_CTRL,
+			     MDIO_PMA_LASI_LSALARM);
+}
+
+int t3_phy_lasi_intr_disable(struct cphy *phy)
+{
+	return t3_mdio_write(phy, MDIO_MMD_PMAPMD, MDIO_PMA_LASI_CTRL, 0);
+}
+
+int t3_phy_lasi_intr_clear(struct cphy *phy)
+{
+	u32 val;
+
+	return t3_mdio_read(phy, MDIO_MMD_PMAPMD, MDIO_PMA_LASI_STAT, &val);
+}
+
+int t3_phy_lasi_intr_handler(struct cphy *phy)
+{
+	unsigned int status;
+	int err = t3_mdio_read(phy, MDIO_MMD_PMAPMD, MDIO_PMA_LASI_STAT,
+			       &status);
+
+	if (err)
+		return err;
+	return (status & MDIO_PMA_LASI_LSALARM) ? cphy_cause_link_change : 0;
+}
+
+static const struct adapter_info t3_adap_info[] = {
+	{1, 1, 0,
+	 F_GPIO2_OEN | F_GPIO4_OEN |
+	 F_GPIO2_OUT_VAL | F_GPIO4_OUT_VAL, { S_GPIO3, S_GPIO5 }, 0,
+	 &mi1_mdio_ops, "Chelsio PE9000"},
+	{1, 1, 0,
+	 F_GPIO2_OEN | F_GPIO4_OEN |
+	 F_GPIO2_OUT_VAL | F_GPIO4_OUT_VAL, { S_GPIO3, S_GPIO5 }, 0,
+	 &mi1_mdio_ops, "Chelsio T302"},
+	{1, 0, 0,
+	 F_GPIO1_OEN | F_GPIO6_OEN | F_GPIO7_OEN | F_GPIO10_OEN |
+	 F_GPIO11_OEN | F_GPIO1_OUT_VAL | F_GPIO6_OUT_VAL | F_GPIO10_OUT_VAL,
+	 { 0 }, SUPPORTED_10000baseT_Full | SUPPORTED_AUI,
+	 &mi1_mdio_ext_ops, "Chelsio T310"},
+	{1, 1, 0,
+	 F_GPIO1_OEN | F_GPIO2_OEN | F_GPIO4_OEN | F_GPIO5_OEN | F_GPIO6_OEN |
+	 F_GPIO7_OEN | F_GPIO10_OEN | F_GPIO11_OEN | F_GPIO1_OUT_VAL |
+	 F_GPIO5_OUT_VAL | F_GPIO6_OUT_VAL | F_GPIO10_OUT_VAL,
+	 { S_GPIO9, S_GPIO3 }, SUPPORTED_10000baseT_Full | SUPPORTED_AUI,
+	 &mi1_mdio_ext_ops, "Chelsio T320"},
+	{},
+	{},
+	{1, 0, 0,
+	 F_GPIO1_OEN | F_GPIO2_OEN | F_GPIO4_OEN | F_GPIO6_OEN | F_GPIO7_OEN |
+	 F_GPIO10_OEN | F_GPIO1_OUT_VAL | F_GPIO6_OUT_VAL | F_GPIO10_OUT_VAL,
+	 { S_GPIO9 }, SUPPORTED_10000baseT_Full | SUPPORTED_AUI,
+	 &mi1_mdio_ext_ops, "Chelsio T310" },
+	{1, 0, 0,
+	 F_GPIO1_OEN | F_GPIO6_OEN | F_GPIO7_OEN |
+	 F_GPIO1_OUT_VAL | F_GPIO6_OUT_VAL,
+	 { S_GPIO9 }, SUPPORTED_10000baseT_Full | SUPPORTED_AUI,
+	 &mi1_mdio_ext_ops, "Chelsio N320E-G2" },
+};
+
+/*
+ * Return the adapter_info structure with a given index.  Out-of-range indices
+ * return NULL.
+ */
+const struct adapter_info *t3_get_adapter_info(unsigned int id)
+{
+	return id < ARRAY_SIZE(t3_adap_info) ? &t3_adap_info[id] : NULL;
+}
+
+struct port_type_info {
+	int (*phy_prep)(struct cphy *phy, struct adapter *adapter,
+			int phy_addr, const struct mdio_ops *ops);
+};
+
+static const struct port_type_info port_types[] = {
+	{ NULL },
+	{ t3_ael1002_phy_prep },
+	{ t3_vsc8211_phy_prep },
+	{ NULL},
+	{ t3_xaui_direct_phy_prep },
+	{ t3_ael2005_phy_prep },
+	{ t3_qt2045_phy_prep },
+	{ t3_ael1006_phy_prep },
+	{ NULL },
+	{ t3_aq100x_phy_prep },
+	{ t3_ael2020_phy_prep },
+};
+
+#define VPD_ENTRY(name, len) \
+	u8 name##_kword[2]; u8 name##_len; u8 name##_data[len]
+
+/*
+ * Partial EEPROM Vital Product Data structure.  Includes only the ID and
+ * VPD-R sections.
+ */
+struct t3_vpd {
+	u8 id_tag;
+	u8 id_len[2];
+	u8 id_data[16];
+	u8 vpdr_tag;
+	u8 vpdr_len[2];
+	VPD_ENTRY(pn, 16);	/* part number */
+	VPD_ENTRY(ec, 16);	/* EC level */
+	VPD_ENTRY(sn, SERNUM_LEN); /* serial number */
+	VPD_ENTRY(na, 12);	/* MAC address base */
+	VPD_ENTRY(cclk, 6);	/* core clock */
+	VPD_ENTRY(mclk, 6);	/* mem clock */
+	VPD_ENTRY(uclk, 6);	/* uP clk */
+	VPD_ENTRY(mdc, 6);	/* MDIO clk */
+	VPD_ENTRY(mt, 2);	/* mem timing */
+	VPD_ENTRY(xaui0cfg, 6);	/* XAUI0 config */
+	VPD_ENTRY(xaui1cfg, 6);	/* XAUI1 config */
+	VPD_ENTRY(port0, 2);	/* PHY0 complex */
+	VPD_ENTRY(port1, 2);	/* PHY1 complex */
+	VPD_ENTRY(port2, 2);	/* PHY2 complex */
+	VPD_ENTRY(port3, 2);	/* PHY3 complex */
+	VPD_ENTRY(rv, 1);	/* csum */
+	u32 pad;		/* for multiple-of-4 sizing and alignment */
+};
+
+#define EEPROM_MAX_POLL   40
+#define EEPROM_STAT_ADDR  0x4000
+#define VPD_BASE          0xc00
+
+/**
+ *	t3_seeprom_read - read a VPD EEPROM location
+ *	@adapter: adapter to read
+ *	@addr: EEPROM address
+ *	@data: where to store the read data
+ *
+ *	Read a 32-bit word from a location in VPD EEPROM using the card's PCI
+ *	VPD ROM capability.  A zero is written to the flag bit when the
+ *	address is written to the control register.  The hardware device will
+ *	set the flag to 1 when 4 bytes have been read into the data register.
+ */
+int t3_seeprom_read(struct adapter *adapter, u32 addr, __le32 *data)
+{
+	u16 val;
+	int attempts = EEPROM_MAX_POLL;
+	u32 v;
+	unsigned int base = adapter->params.pci.vpd_cap_addr;
+
+	if ((addr >= EEPROMSIZE && addr != EEPROM_STAT_ADDR) || (addr & 3))
+		return -EINVAL;
+
+	pci_write_config_word(adapter->pdev, base + PCI_VPD_ADDR, addr);
+	do {
+		udelay(10);
+		pci_read_config_word(adapter->pdev, base + PCI_VPD_ADDR, &val);
+	} while (!(val & PCI_VPD_ADDR_F) && --attempts);
+
+	if (!(val & PCI_VPD_ADDR_F)) {
+		CH_ERR(adapter, "reading EEPROM address 0x%x failed\n", addr);
+		return -EIO;
+	}
+	pci_read_config_dword(adapter->pdev, base + PCI_VPD_DATA, &v);
+	*data = cpu_to_le32(v);
+	return 0;
+}
+
+/**
+ *	t3_seeprom_write - write a VPD EEPROM location
+ *	@adapter: adapter to write
+ *	@addr: EEPROM address
+ *	@data: value to write
+ *
+ *	Write a 32-bit word to a location in VPD EEPROM using the card's PCI
+ *	VPD ROM capability.
+ */
+int t3_seeprom_write(struct adapter *adapter, u32 addr, __le32 data)
+{
+	u16 val;
+	int attempts = EEPROM_MAX_POLL;
+	unsigned int base = adapter->params.pci.vpd_cap_addr;
+
+	if ((addr >= EEPROMSIZE && addr != EEPROM_STAT_ADDR) || (addr & 3))
+		return -EINVAL;
+
+	pci_write_config_dword(adapter->pdev, base + PCI_VPD_DATA,
+			       le32_to_cpu(data));
+	pci_write_config_word(adapter->pdev,base + PCI_VPD_ADDR,
+			      addr | PCI_VPD_ADDR_F);
+	do {
+		msleep(1);
+		pci_read_config_word(adapter->pdev, base + PCI_VPD_ADDR, &val);
+	} while ((val & PCI_VPD_ADDR_F) && --attempts);
+
+	if (val & PCI_VPD_ADDR_F) {
+		CH_ERR(adapter, "write to EEPROM address 0x%x failed\n", addr);
+		return -EIO;
+	}
+	return 0;
+}
+
+/**
+ *	t3_seeprom_wp - enable/disable EEPROM write protection
+ *	@adapter: the adapter
+ *	@enable: 1 to enable write protection, 0 to disable it
+ *
+ *	Enables or disables write protection on the serial EEPROM.
+ */
+int t3_seeprom_wp(struct adapter *adapter, int enable)
+{
+	return t3_seeprom_write(adapter, EEPROM_STAT_ADDR, enable ? 0xc : 0);
+}
+
+/**
+ *	get_vpd_params - read VPD parameters from VPD EEPROM
+ *	@adapter: adapter to read
+ *	@p: where to store the parameters
+ *
+ *	Reads card parameters stored in VPD EEPROM.
+ */
+static int get_vpd_params(struct adapter *adapter, struct vpd_params *p)
+{
+	int i, addr, ret;
+	struct t3_vpd vpd;
+
+	/*
+	 * Card information is normally at VPD_BASE but some early cards had
+	 * it at 0.
+	 */
+	ret = t3_seeprom_read(adapter, VPD_BASE, (__le32 *)&vpd);
+	if (ret)
+		return ret;
+	addr = vpd.id_tag == 0x82 ? VPD_BASE : 0;
+
+	for (i = 0; i < sizeof(vpd); i += 4) {
+		ret = t3_seeprom_read(adapter, addr + i,
+				      (__le32 *)((u8 *)&vpd + i));
+		if (ret)
+			return ret;
+	}
+
+	p->cclk = simple_strtoul(vpd.cclk_data, NULL, 10);
+	p->mclk = simple_strtoul(vpd.mclk_data, NULL, 10);
+	p->uclk = simple_strtoul(vpd.uclk_data, NULL, 10);
+	p->mdc = simple_strtoul(vpd.mdc_data, NULL, 10);
+	p->mem_timing = simple_strtoul(vpd.mt_data, NULL, 10);
+	memcpy(p->sn, vpd.sn_data, SERNUM_LEN);
+
+	/* Old eeproms didn't have port information */
+	if (adapter->params.rev == 0 && !vpd.port0_data[0]) {
+		p->port_type[0] = uses_xaui(adapter) ? 1 : 2;
+		p->port_type[1] = uses_xaui(adapter) ? 6 : 2;
+	} else {
+		p->port_type[0] = hex_to_bin(vpd.port0_data[0]);
+		p->port_type[1] = hex_to_bin(vpd.port1_data[0]);
+		p->xauicfg[0] = simple_strtoul(vpd.xaui0cfg_data, NULL, 16);
+		p->xauicfg[1] = simple_strtoul(vpd.xaui1cfg_data, NULL, 16);
+	}
+
+	for (i = 0; i < 6; i++)
+		p->eth_base[i] = hex_to_bin(vpd.na_data[2 * i]) * 16 +
+				 hex_to_bin(vpd.na_data[2 * i + 1]);
+	return 0;
+}
+
+/* serial flash and firmware constants */
+enum {
+	SF_ATTEMPTS = 5,	/* max retries for SF1 operations */
+	SF_SEC_SIZE = 64 * 1024,	/* serial flash sector size */
+	SF_SIZE = SF_SEC_SIZE * 8,	/* serial flash size */
+
+	/* flash command opcodes */
+	SF_PROG_PAGE = 2,	/* program page */
+	SF_WR_DISABLE = 4,	/* disable writes */
+	SF_RD_STATUS = 5,	/* read status register */
+	SF_WR_ENABLE = 6,	/* enable writes */
+	SF_RD_DATA_FAST = 0xb,	/* read flash */
+	SF_ERASE_SECTOR = 0xd8,	/* erase sector */
+
+	FW_FLASH_BOOT_ADDR = 0x70000,	/* start address of FW in flash */
+	FW_VERS_ADDR = 0x7fffc,    /* flash address holding FW version */
+	FW_MIN_SIZE = 8            /* at least version and csum */
+};
+
+/**
+ *	sf1_read - read data from the serial flash
+ *	@adapter: the adapter
+ *	@byte_cnt: number of bytes to read
+ *	@cont: whether another operation will be chained
+ *	@valp: where to store the read data
+ *
+ *	Reads up to 4 bytes of data from the serial flash.  The location of
+ *	the read needs to be specified prior to calling this by issuing the
+ *	appropriate commands to the serial flash.
+ */
+static int sf1_read(struct adapter *adapter, unsigned int byte_cnt, int cont,
+		    u32 *valp)
+{
+	int ret;
+
+	if (!byte_cnt || byte_cnt > 4)
+		return -EINVAL;
+	if (t3_read_reg(adapter, A_SF_OP) & F_BUSY)
+		return -EBUSY;
+	t3_write_reg(adapter, A_SF_OP, V_CONT(cont) | V_BYTECNT(byte_cnt - 1));
+	ret = t3_wait_op_done(adapter, A_SF_OP, F_BUSY, 0, SF_ATTEMPTS, 10);
+	if (!ret)
+		*valp = t3_read_reg(adapter, A_SF_DATA);
+	return ret;
+}
+
+/**
+ *	sf1_write - write data to the serial flash
+ *	@adapter: the adapter
+ *	@byte_cnt: number of bytes to write
+ *	@cont: whether another operation will be chained
+ *	@val: value to write
+ *
+ *	Writes up to 4 bytes of data to the serial flash.  The location of
+ *	the write needs to be specified prior to calling this by issuing the
+ *	appropriate commands to the serial flash.
+ */
+static int sf1_write(struct adapter *adapter, unsigned int byte_cnt, int cont,
+		     u32 val)
+{
+	if (!byte_cnt || byte_cnt > 4)
+		return -EINVAL;
+	if (t3_read_reg(adapter, A_SF_OP) & F_BUSY)
+		return -EBUSY;
+	t3_write_reg(adapter, A_SF_DATA, val);
+	t3_write_reg(adapter, A_SF_OP,
+		     V_CONT(cont) | V_BYTECNT(byte_cnt - 1) | V_OP(1));
+	return t3_wait_op_done(adapter, A_SF_OP, F_BUSY, 0, SF_ATTEMPTS, 10);
+}
+
+/**
+ *	flash_wait_op - wait for a flash operation to complete
+ *	@adapter: the adapter
+ *	@attempts: max number of polls of the status register
+ *	@delay: delay between polls in ms
+ *
+ *	Wait for a flash operation to complete by polling the status register.
+ */
+static int flash_wait_op(struct adapter *adapter, int attempts, int delay)
+{
+	int ret;
+	u32 status;
+
+	while (1) {
+		if ((ret = sf1_write(adapter, 1, 1, SF_RD_STATUS)) != 0 ||
+		    (ret = sf1_read(adapter, 1, 0, &status)) != 0)
+			return ret;
+		if (!(status & 1))
+			return 0;
+		if (--attempts == 0)
+			return -EAGAIN;
+		if (delay)
+			msleep(delay);
+	}
+}
+
+/**
+ *	t3_read_flash - read words from serial flash
+ *	@adapter: the adapter
+ *	@addr: the start address for the read
+ *	@nwords: how many 32-bit words to read
+ *	@data: where to store the read data
+ *	@byte_oriented: whether to store data as bytes or as words
+ *
+ *	Read the specified number of 32-bit words from the serial flash.
+ *	If @byte_oriented is set the read data is stored as a byte array
+ *	(i.e., big-endian), otherwise as 32-bit words in the platform's
+ *	natural endianess.
+ */
+static int t3_read_flash(struct adapter *adapter, unsigned int addr,
+			 unsigned int nwords, u32 *data, int byte_oriented)
+{
+	int ret;
+
+	if (addr + nwords * sizeof(u32) > SF_SIZE || (addr & 3))
+		return -EINVAL;
+
+	addr = swab32(addr) | SF_RD_DATA_FAST;
+
+	if ((ret = sf1_write(adapter, 4, 1, addr)) != 0 ||
+	    (ret = sf1_read(adapter, 1, 1, data)) != 0)
+		return ret;
+
+	for (; nwords; nwords--, data++) {
+		ret = sf1_read(adapter, 4, nwords > 1, data);
+		if (ret)
+			return ret;
+		if (byte_oriented)
+			*data = htonl(*data);
+	}
+	return 0;
+}
+
+/**
+ *	t3_write_flash - write up to a page of data to the serial flash
+ *	@adapter: the adapter
+ *	@addr: the start address to write
+ *	@n: length of data to write
+ *	@data: the data to write
+ *
+ *	Writes up to a page of data (256 bytes) to the serial flash starting
+ *	at the given address.
+ */
+static int t3_write_flash(struct adapter *adapter, unsigned int addr,
+			  unsigned int n, const u8 *data)
+{
+	int ret;
+	u32 buf[64];
+	unsigned int i, c, left, val, offset = addr & 0xff;
+
+	if (addr + n > SF_SIZE || offset + n > 256)
+		return -EINVAL;
+
+	val = swab32(addr) | SF_PROG_PAGE;
+
+	if ((ret = sf1_write(adapter, 1, 0, SF_WR_ENABLE)) != 0 ||
+	    (ret = sf1_write(adapter, 4, 1, val)) != 0)
+		return ret;
+
+	for (left = n; left; left -= c) {
+		c = min(left, 4U);
+		for (val = 0, i = 0; i < c; ++i)
+			val = (val << 8) + *data++;
+
+		ret = sf1_write(adapter, c, c != left, val);
+		if (ret)
+			return ret;
+	}
+	if ((ret = flash_wait_op(adapter, 5, 1)) != 0)
+		return ret;
+
+	/* Read the page to verify the write succeeded */
+	ret = t3_read_flash(adapter, addr & ~0xff, ARRAY_SIZE(buf), buf, 1);
+	if (ret)
+		return ret;
+
+	if (memcmp(data - n, (u8 *) buf + offset, n))
+		return -EIO;
+	return 0;
+}
+
+/**
+ *	t3_get_tp_version - read the tp sram version
+ *	@adapter: the adapter
+ *	@vers: where to place the version
+ *
+ *	Reads the protocol sram version from sram.
+ */
+int t3_get_tp_version(struct adapter *adapter, u32 *vers)
+{
+	int ret;
+
+	/* Get version loaded in SRAM */
+	t3_write_reg(adapter, A_TP_EMBED_OP_FIELD0, 0);
+	ret = t3_wait_op_done(adapter, A_TP_EMBED_OP_FIELD0,
+			      1, 1, 5, 1);
+	if (ret)
+		return ret;
+
+	*vers = t3_read_reg(adapter, A_TP_EMBED_OP_FIELD1);
+
+	return 0;
+}
+
+/**
+ *	t3_check_tpsram_version - read the tp sram version
+ *	@adapter: the adapter
+ *
+ *	Reads the protocol sram version from flash.
+ */
+int t3_check_tpsram_version(struct adapter *adapter)
+{
+	int ret;
+	u32 vers;
+	unsigned int major, minor;
+
+	if (adapter->params.rev == T3_REV_A)
+		return 0;
+
+
+	ret = t3_get_tp_version(adapter, &vers);
+	if (ret)
+		return ret;
+
+	major = G_TP_VERSION_MAJOR(vers);
+	minor = G_TP_VERSION_MINOR(vers);
+
+	if (major == TP_VERSION_MAJOR && minor == TP_VERSION_MINOR)
+		return 0;
+	else {
+		CH_ERR(adapter, "found wrong TP version (%u.%u), "
+		       "driver compiled for version %d.%d\n", major, minor,
+		       TP_VERSION_MAJOR, TP_VERSION_MINOR);
+	}
+	return -EINVAL;
+}
+
+/**
+ *	t3_check_tpsram - check if provided protocol SRAM
+ *			  is compatible with this driver
+ *	@adapter: the adapter
+ *	@tp_sram: the firmware image to write
+ *	@size: image size
+ *
+ *	Checks if an adapter's tp sram is compatible with the driver.
+ *	Returns 0 if the versions are compatible, a negative error otherwise.
+ */
+int t3_check_tpsram(struct adapter *adapter, const u8 *tp_sram,
+		    unsigned int size)
+{
+	u32 csum;
+	unsigned int i;
+	const __be32 *p = (const __be32 *)tp_sram;
+
+	/* Verify checksum */
+	for (csum = 0, i = 0; i < size / sizeof(csum); i++)
+		csum += ntohl(p[i]);
+	if (csum != 0xffffffff) {
+		CH_ERR(adapter, "corrupted protocol SRAM image, checksum %u\n",
+		       csum);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+enum fw_version_type {
+	FW_VERSION_N3,
+	FW_VERSION_T3
+};
+
+/**
+ *	t3_get_fw_version - read the firmware version
+ *	@adapter: the adapter
+ *	@vers: where to place the version
+ *
+ *	Reads the FW version from flash.
+ */
+int t3_get_fw_version(struct adapter *adapter, u32 *vers)
+{
+	return t3_read_flash(adapter, FW_VERS_ADDR, 1, vers, 0);
+}
+
+/**
+ *	t3_check_fw_version - check if the FW is compatible with this driver
+ *	@adapter: the adapter
+ *
+ *	Checks if an adapter's FW is compatible with the driver.  Returns 0
+ *	if the versions are compatible, a negative error otherwise.
+ */
+int t3_check_fw_version(struct adapter *adapter)
+{
+	int ret;
+	u32 vers;
+	unsigned int type, major, minor;
+
+	ret = t3_get_fw_version(adapter, &vers);
+	if (ret)
+		return ret;
+
+	type = G_FW_VERSION_TYPE(vers);
+	major = G_FW_VERSION_MAJOR(vers);
+	minor = G_FW_VERSION_MINOR(vers);
+
+	if (type == FW_VERSION_T3 && major == FW_VERSION_MAJOR &&
+	    minor == FW_VERSION_MINOR)
+		return 0;
+	else if (major != FW_VERSION_MAJOR || minor < FW_VERSION_MINOR)
+		CH_WARN(adapter, "found old FW minor version(%u.%u), "
+		        "driver compiled for version %u.%u\n", major, minor,
+			FW_VERSION_MAJOR, FW_VERSION_MINOR);
+	else {
+		CH_WARN(adapter, "found newer FW version(%u.%u), "
+		        "driver compiled for version %u.%u\n", major, minor,
+			FW_VERSION_MAJOR, FW_VERSION_MINOR);
+			return 0;
+	}
+	return -EINVAL;
+}
+
+/**
+ *	t3_flash_erase_sectors - erase a range of flash sectors
+ *	@adapter: the adapter
+ *	@start: the first sector to erase
+ *	@end: the last sector to erase
+ *
+ *	Erases the sectors in the given range.
+ */
+static int t3_flash_erase_sectors(struct adapter *adapter, int start, int end)
+{
+	while (start <= end) {
+		int ret;
+
+		if ((ret = sf1_write(adapter, 1, 0, SF_WR_ENABLE)) != 0 ||
+		    (ret = sf1_write(adapter, 4, 0,
+				     SF_ERASE_SECTOR | (start << 8))) != 0 ||
+		    (ret = flash_wait_op(adapter, 5, 500)) != 0)
+			return ret;
+		start++;
+	}
+	return 0;
+}
+
+/**
+ *	t3_load_fw - download firmware
+ *	@adapter: the adapter
+ *	@fw_data: the firmware image to write
+ *	@size: image size
+ *
+ *	Write the supplied firmware image to the card's serial flash.
+ *	The FW image has the following sections: @size - 8 bytes of code and
+ *	data, followed by 4 bytes of FW version, followed by the 32-bit
+ *	1's complement checksum of the whole image.
+ */
+int t3_load_fw(struct adapter *adapter, const u8 *fw_data, unsigned int size)
+{
+	u32 csum;
+	unsigned int i;
+	const __be32 *p = (const __be32 *)fw_data;
+	int ret, addr, fw_sector = FW_FLASH_BOOT_ADDR >> 16;
+
+	if ((size & 3) || size < FW_MIN_SIZE)
+		return -EINVAL;
+	if (size > FW_VERS_ADDR + 8 - FW_FLASH_BOOT_ADDR)
+		return -EFBIG;
+
+	for (csum = 0, i = 0; i < size / sizeof(csum); i++)
+		csum += ntohl(p[i]);
+	if (csum != 0xffffffff) {
+		CH_ERR(adapter, "corrupted firmware image, checksum %u\n",
+		       csum);
+		return -EINVAL;
+	}
+
+	ret = t3_flash_erase_sectors(adapter, fw_sector, fw_sector);
+	if (ret)
+		goto out;
+
+	size -= 8;		/* trim off version and checksum */
+	for (addr = FW_FLASH_BOOT_ADDR; size;) {
+		unsigned int chunk_size = min(size, 256U);
+
+		ret = t3_write_flash(adapter, addr, chunk_size, fw_data);
+		if (ret)
+			goto out;
+
+		addr += chunk_size;
+		fw_data += chunk_size;
+		size -= chunk_size;
+	}
+
+	ret = t3_write_flash(adapter, FW_VERS_ADDR, 4, fw_data);
+out:
+	if (ret)
+		CH_ERR(adapter, "firmware download failed, error %d\n", ret);
+	return ret;
+}
+
+#define CIM_CTL_BASE 0x2000
+
+/**
+ *      t3_cim_ctl_blk_read - read a block from CIM control region
+ *
+ *      @adap: the adapter
+ *      @addr: the start address within the CIM control region
+ *      @n: number of words to read
+ *      @valp: where to store the result
+ *
+ *      Reads a block of 4-byte words from the CIM control region.
+ */
+int t3_cim_ctl_blk_read(struct adapter *adap, unsigned int addr,
+			unsigned int n, unsigned int *valp)
+{
+	int ret = 0;
+
+	if (t3_read_reg(adap, A_CIM_HOST_ACC_CTRL) & F_HOSTBUSY)
+		return -EBUSY;
+
+	for ( ; !ret && n--; addr += 4) {
+		t3_write_reg(adap, A_CIM_HOST_ACC_CTRL, CIM_CTL_BASE + addr);
+		ret = t3_wait_op_done(adap, A_CIM_HOST_ACC_CTRL, F_HOSTBUSY,
+				      0, 5, 2);
+		if (!ret)
+			*valp++ = t3_read_reg(adap, A_CIM_HOST_ACC_DATA);
+	}
+	return ret;
+}
+
+static void t3_gate_rx_traffic(struct cmac *mac, u32 *rx_cfg,
+			       u32 *rx_hash_high, u32 *rx_hash_low)
+{
+	/* stop Rx unicast traffic */
+	t3_mac_disable_exact_filters(mac);
+
+	/* stop broadcast, multicast, promiscuous mode traffic */
+	*rx_cfg = t3_read_reg(mac->adapter, A_XGM_RX_CFG);
+	t3_set_reg_field(mac->adapter, A_XGM_RX_CFG,
+			 F_ENHASHMCAST | F_DISBCAST | F_COPYALLFRAMES,
+			 F_DISBCAST);
+
+	*rx_hash_high = t3_read_reg(mac->adapter, A_XGM_RX_HASH_HIGH);
+	t3_write_reg(mac->adapter, A_XGM_RX_HASH_HIGH, 0);
+
+	*rx_hash_low = t3_read_reg(mac->adapter, A_XGM_RX_HASH_LOW);
+	t3_write_reg(mac->adapter, A_XGM_RX_HASH_LOW, 0);
+
+	/* Leave time to drain max RX fifo */
+	msleep(1);
+}
+
+static void t3_open_rx_traffic(struct cmac *mac, u32 rx_cfg,
+			       u32 rx_hash_high, u32 rx_hash_low)
+{
+	t3_mac_enable_exact_filters(mac);
+	t3_set_reg_field(mac->adapter, A_XGM_RX_CFG,
+			 F_ENHASHMCAST | F_DISBCAST | F_COPYALLFRAMES,
+			 rx_cfg);
+	t3_write_reg(mac->adapter, A_XGM_RX_HASH_HIGH, rx_hash_high);
+	t3_write_reg(mac->adapter, A_XGM_RX_HASH_LOW, rx_hash_low);
+}
+
+/**
+ *	t3_link_changed - handle interface link changes
+ *	@adapter: the adapter
+ *	@port_id: the port index that changed link state
+ *
+ *	Called when a port's link settings change to propagate the new values
+ *	to the associated PHY and MAC.  After performing the common tasks it
+ *	invokes an OS-specific handler.
+ */
+void t3_link_changed(struct adapter *adapter, int port_id)
+{
+	int link_ok, speed, duplex, fc;
+	struct port_info *pi = adap2pinfo(adapter, port_id);
+	struct cphy *phy = &pi->phy;
+	struct cmac *mac = &pi->mac;
+	struct link_config *lc = &pi->link_config;
+
+	phy->ops->get_link_status(phy, &link_ok, &speed, &duplex, &fc);
+
+	if (!lc->link_ok && link_ok) {
+		u32 rx_cfg, rx_hash_high, rx_hash_low;
+		u32 status;
+
+		t3_xgm_intr_enable(adapter, port_id);
+		t3_gate_rx_traffic(mac, &rx_cfg, &rx_hash_high, &rx_hash_low);
+		t3_write_reg(adapter, A_XGM_RX_CTRL + mac->offset, 0);
+		t3_mac_enable(mac, MAC_DIRECTION_RX);
+
+		status = t3_read_reg(adapter, A_XGM_INT_STATUS + mac->offset);
+		if (status & F_LINKFAULTCHANGE) {
+			mac->stats.link_faults++;
+			pi->link_fault = 1;
+		}
+		t3_open_rx_traffic(mac, rx_cfg, rx_hash_high, rx_hash_low);
+	}
+
+	if (lc->requested_fc & PAUSE_AUTONEG)
+		fc &= lc->requested_fc;
+	else
+		fc = lc->requested_fc & (PAUSE_RX | PAUSE_TX);
+
+	if (link_ok == lc->link_ok && speed == lc->speed &&
+	    duplex == lc->duplex && fc == lc->fc)
+		return;                            /* nothing changed */
+
+	if (link_ok != lc->link_ok && adapter->params.rev > 0 &&
+	    uses_xaui(adapter)) {
+		if (link_ok)
+			t3b_pcs_reset(mac);
+		t3_write_reg(adapter, A_XGM_XAUI_ACT_CTRL + mac->offset,
+			     link_ok ? F_TXACTENABLE | F_RXEN : 0);
+	}
+	lc->link_ok = link_ok;
+	lc->speed = speed < 0 ? SPEED_INVALID : speed;
+	lc->duplex = duplex < 0 ? DUPLEX_INVALID : duplex;
+
+	if (link_ok && speed >= 0 && lc->autoneg == AUTONEG_ENABLE) {
+		/* Set MAC speed, duplex, and flow control to match PHY. */
+		t3_mac_set_speed_duplex_fc(mac, speed, duplex, fc);
+		lc->fc = fc;
+	}
+
+	t3_os_link_changed(adapter, port_id, link_ok && !pi->link_fault,
+			   speed, duplex, fc);
+}
+
+void t3_link_fault(struct adapter *adapter, int port_id)
+{
+	struct port_info *pi = adap2pinfo(adapter, port_id);
+	struct cmac *mac = &pi->mac;
+	struct cphy *phy = &pi->phy;
+	struct link_config *lc = &pi->link_config;
+	int link_ok, speed, duplex, fc, link_fault;
+	u32 rx_cfg, rx_hash_high, rx_hash_low;
+
+	t3_gate_rx_traffic(mac, &rx_cfg, &rx_hash_high, &rx_hash_low);
+
+	if (adapter->params.rev > 0 && uses_xaui(adapter))
+		t3_write_reg(adapter, A_XGM_XAUI_ACT_CTRL + mac->offset, 0);
+
+	t3_write_reg(adapter, A_XGM_RX_CTRL + mac->offset, 0);
+	t3_mac_enable(mac, MAC_DIRECTION_RX);
+
+	t3_open_rx_traffic(mac, rx_cfg, rx_hash_high, rx_hash_low);
+
+	link_fault = t3_read_reg(adapter,
+				 A_XGM_INT_STATUS + mac->offset);
+	link_fault &= F_LINKFAULTCHANGE;
+
+	link_ok = lc->link_ok;
+	speed = lc->speed;
+	duplex = lc->duplex;
+	fc = lc->fc;
+
+	phy->ops->get_link_status(phy, &link_ok, &speed, &duplex, &fc);
+
+	if (link_fault) {
+		lc->link_ok = 0;
+		lc->speed = SPEED_INVALID;
+		lc->duplex = DUPLEX_INVALID;
+
+		t3_os_link_fault(adapter, port_id, 0);
+
+		/* Account link faults only when the phy reports a link up */
+		if (link_ok)
+			mac->stats.link_faults++;
+	} else {
+		if (link_ok)
+			t3_write_reg(adapter, A_XGM_XAUI_ACT_CTRL + mac->offset,
+				     F_TXACTENABLE | F_RXEN);
+
+		pi->link_fault = 0;
+		lc->link_ok = (unsigned char)link_ok;
+		lc->speed = speed < 0 ? SPEED_INVALID : speed;
+		lc->duplex = duplex < 0 ? DUPLEX_INVALID : duplex;
+		t3_os_link_fault(adapter, port_id, link_ok);
+	}
+}
+
+/**
+ *	t3_link_start - apply link configuration to MAC/PHY
+ *	@phy: the PHY to setup
+ *	@mac: the MAC to setup
+ *	@lc: the requested link configuration
+ *
+ *	Set up a port's MAC and PHY according to a desired link configuration.
+ *	- If the PHY can auto-negotiate first decide what to advertise, then
+ *	  enable/disable auto-negotiation as desired, and reset.
+ *	- If the PHY does not auto-negotiate just reset it.
+ *	- If auto-negotiation is off set the MAC to the proper speed/duplex/FC,
+ *	  otherwise do it later based on the outcome of auto-negotiation.
+ */
+int t3_link_start(struct cphy *phy, struct cmac *mac, struct link_config *lc)
+{
+	unsigned int fc = lc->requested_fc & (PAUSE_RX | PAUSE_TX);
+
+	lc->link_ok = 0;
+	if (lc->supported & SUPPORTED_Autoneg) {
+		lc->advertising &= ~(ADVERTISED_Asym_Pause | ADVERTISED_Pause);
+		if (fc) {
+			lc->advertising |= ADVERTISED_Asym_Pause;
+			if (fc & PAUSE_RX)
+				lc->advertising |= ADVERTISED_Pause;
+		}
+		phy->ops->advertise(phy, lc->advertising);
+
+		if (lc->autoneg == AUTONEG_DISABLE) {
+			lc->speed = lc->requested_speed;
+			lc->duplex = lc->requested_duplex;
+			lc->fc = (unsigned char)fc;
+			t3_mac_set_speed_duplex_fc(mac, lc->speed, lc->duplex,
+						   fc);
+			/* Also disables autoneg */
+			phy->ops->set_speed_duplex(phy, lc->speed, lc->duplex);
+		} else
+			phy->ops->autoneg_enable(phy);
+	} else {
+		t3_mac_set_speed_duplex_fc(mac, -1, -1, fc);
+		lc->fc = (unsigned char)fc;
+		phy->ops->reset(phy, 0);
+	}
+	return 0;
+}
+
+/**
+ *	t3_set_vlan_accel - control HW VLAN extraction
+ *	@adapter: the adapter
+ *	@ports: bitmap of adapter ports to operate on
+ *	@on: enable (1) or disable (0) HW VLAN extraction
+ *
+ *	Enables or disables HW extraction of VLAN tags for the given port.
+ */
+void t3_set_vlan_accel(struct adapter *adapter, unsigned int ports, int on)
+{
+	t3_set_reg_field(adapter, A_TP_OUT_CONFIG,
+			 ports << S_VLANEXTRACTIONENABLE,
+			 on ? (ports << S_VLANEXTRACTIONENABLE) : 0);
+}
+
+struct intr_info {
+	unsigned int mask;	/* bits to check in interrupt status */
+	const char *msg;	/* message to print or NULL */
+	short stat_idx;		/* stat counter to increment or -1 */
+	unsigned short fatal;	/* whether the condition reported is fatal */
+};
+
+/**
+ *	t3_handle_intr_status - table driven interrupt handler
+ *	@adapter: the adapter that generated the interrupt
+ *	@reg: the interrupt status register to process
+ *	@mask: a mask to apply to the interrupt status
+ *	@acts: table of interrupt actions
+ *	@stats: statistics counters tracking interrupt occurrences
+ *
+ *	A table driven interrupt handler that applies a set of masks to an
+ *	interrupt status word and performs the corresponding actions if the
+ *	interrupts described by the mask have occurred.  The actions include
+ *	optionally printing a warning or alert message, and optionally
+ *	incrementing a stat counter.  The table is terminated by an entry
+ *	specifying mask 0.  Returns the number of fatal interrupt conditions.
+ */
+static int t3_handle_intr_status(struct adapter *adapter, unsigned int reg,
+				 unsigned int mask,
+				 const struct intr_info *acts,
+				 unsigned long *stats)
+{
+	int fatal = 0;
+	unsigned int status = t3_read_reg(adapter, reg) & mask;
+
+	for (; acts->mask; ++acts) {
+		if (!(status & acts->mask))
+			continue;
+		if (acts->fatal) {
+			fatal++;
+			CH_ALERT(adapter, "%s (0x%x)\n",
+				 acts->msg, status & acts->mask);
+			status &= ~acts->mask;
+		} else if (acts->msg)
+			CH_WARN(adapter, "%s (0x%x)\n",
+				acts->msg, status & acts->mask);
+		if (acts->stat_idx >= 0)
+			stats[acts->stat_idx]++;
+	}
+	if (status)		/* clear processed interrupts */
+		t3_write_reg(adapter, reg, status);
+	return fatal;
+}
+
+#define SGE_INTR_MASK (F_RSPQDISABLED | \
+		       F_UC_REQ_FRAMINGERROR | F_R_REQ_FRAMINGERROR | \
+		       F_CPPARITYERROR | F_OCPARITYERROR | F_RCPARITYERROR | \
+		       F_IRPARITYERROR | V_ITPARITYERROR(M_ITPARITYERROR) | \
+		       V_FLPARITYERROR(M_FLPARITYERROR) | F_LODRBPARITYERROR | \
+		       F_HIDRBPARITYERROR | F_LORCQPARITYERROR | \
+		       F_HIRCQPARITYERROR | F_LOPRIORITYDBFULL | \
+		       F_HIPRIORITYDBFULL | F_LOPRIORITYDBEMPTY | \
+		       F_HIPRIORITYDBEMPTY | F_HIPIODRBDROPERR | \
+		       F_LOPIODRBDROPERR)
+#define MC5_INTR_MASK (F_PARITYERR | F_ACTRGNFULL | F_UNKNOWNCMD | \
+		       F_REQQPARERR | F_DISPQPARERR | F_DELACTEMPTY | \
+		       F_NFASRCHFAIL)
+#define MC7_INTR_MASK (F_AE | F_UE | F_CE | V_PE(M_PE))
+#define XGM_INTR_MASK (V_TXFIFO_PRTY_ERR(M_TXFIFO_PRTY_ERR) | \
+		       V_RXFIFO_PRTY_ERR(M_RXFIFO_PRTY_ERR) | \
+		       F_TXFIFO_UNDERRUN)
+#define PCIX_INTR_MASK (F_MSTDETPARERR | F_SIGTARABT | F_RCVTARABT | \
+			F_RCVMSTABT | F_SIGSYSERR | F_DETPARERR | \
+			F_SPLCMPDIS | F_UNXSPLCMP | F_RCVSPLCMPERR | \
+			F_DETCORECCERR | F_DETUNCECCERR | F_PIOPARERR | \
+			V_WFPARERR(M_WFPARERR) | V_RFPARERR(M_RFPARERR) | \
+			V_CFPARERR(M_CFPARERR) /* | V_MSIXPARERR(M_MSIXPARERR) */)
+#define PCIE_INTR_MASK (F_UNXSPLCPLERRR | F_UNXSPLCPLERRC | F_PCIE_PIOPARERR |\
+			F_PCIE_WFPARERR | F_PCIE_RFPARERR | F_PCIE_CFPARERR | \
+			/* V_PCIE_MSIXPARERR(M_PCIE_MSIXPARERR) | */ \
+			F_RETRYBUFPARERR | F_RETRYLUTPARERR | F_RXPARERR | \
+			F_TXPARERR | V_BISTERR(M_BISTERR))
+#define ULPRX_INTR_MASK (F_PARERRDATA | F_PARERRPCMD | F_ARBPF1PERR | \
+			 F_ARBPF0PERR | F_ARBFPERR | F_PCMDMUXPERR | \
+			 F_DATASELFRAMEERR1 | F_DATASELFRAMEERR0)
+#define ULPTX_INTR_MASK 0xfc
+#define CPLSW_INTR_MASK (F_CIM_OP_MAP_PERR | F_TP_FRAMING_ERROR | \
+			 F_SGE_FRAMING_ERROR | F_CIM_FRAMING_ERROR | \
+			 F_ZERO_SWITCH_ERROR)
+#define CIM_INTR_MASK (F_BLKWRPLINT | F_BLKRDPLINT | F_BLKWRCTLINT | \
+		       F_BLKRDCTLINT | F_BLKWRFLASHINT | F_BLKRDFLASHINT | \
+		       F_SGLWRFLASHINT | F_WRBLKFLASHINT | F_BLKWRBOOTINT | \
+	 	       F_FLASHRANGEINT | F_SDRAMRANGEINT | F_RSVDSPACEINT | \
+		       F_DRAMPARERR | F_ICACHEPARERR | F_DCACHEPARERR | \
+		       F_OBQSGEPARERR | F_OBQULPHIPARERR | F_OBQULPLOPARERR | \
+		       F_IBQSGELOPARERR | F_IBQSGEHIPARERR | F_IBQULPPARERR | \
+		       F_IBQTPPARERR | F_ITAGPARERR | F_DTAGPARERR)
+#define PMTX_INTR_MASK (F_ZERO_C_CMD_ERROR | ICSPI_FRM_ERR | OESPI_FRM_ERR | \
+			V_ICSPI_PAR_ERROR(M_ICSPI_PAR_ERROR) | \
+			V_OESPI_PAR_ERROR(M_OESPI_PAR_ERROR))
+#define PMRX_INTR_MASK (F_ZERO_E_CMD_ERROR | IESPI_FRM_ERR | OCSPI_FRM_ERR | \
+			V_IESPI_PAR_ERROR(M_IESPI_PAR_ERROR) | \
+			V_OCSPI_PAR_ERROR(M_OCSPI_PAR_ERROR))
+#define MPS_INTR_MASK (V_TX0TPPARERRENB(M_TX0TPPARERRENB) | \
+		       V_TX1TPPARERRENB(M_TX1TPPARERRENB) | \
+		       V_RXTPPARERRENB(M_RXTPPARERRENB) | \
+		       V_MCAPARERRENB(M_MCAPARERRENB))
+#define XGM_EXTRA_INTR_MASK (F_LINKFAULTCHANGE)
+#define PL_INTR_MASK (F_T3DBG | F_XGMAC0_0 | F_XGMAC0_1 | F_MC5A | F_PM1_TX | \
+		      F_PM1_RX | F_ULP2_TX | F_ULP2_RX | F_TP1 | F_CIM | \
+		      F_MC7_CM | F_MC7_PMTX | F_MC7_PMRX | F_SGE3 | F_PCIM0 | \
+		      F_MPS0 | F_CPL_SWITCH)
+/*
+ * Interrupt handler for the PCIX1 module.
+ */
+static void pci_intr_handler(struct adapter *adapter)
+{
+	static const struct intr_info pcix1_intr_info[] = {
+		{F_MSTDETPARERR, "PCI master detected parity error", -1, 1},
+		{F_SIGTARABT, "PCI signaled target abort", -1, 1},
+		{F_RCVTARABT, "PCI received target abort", -1, 1},
+		{F_RCVMSTABT, "PCI received master abort", -1, 1},
+		{F_SIGSYSERR, "PCI signaled system error", -1, 1},
+		{F_DETPARERR, "PCI detected parity error", -1, 1},
+		{F_SPLCMPDIS, "PCI split completion discarded", -1, 1},
+		{F_UNXSPLCMP, "PCI unexpected split completion error", -1, 1},
+		{F_RCVSPLCMPERR, "PCI received split completion error", -1,
+		 1},
+		{F_DETCORECCERR, "PCI correctable ECC error",
+		 STAT_PCI_CORR_ECC, 0},
+		{F_DETUNCECCERR, "PCI uncorrectable ECC error", -1, 1},
+		{F_PIOPARERR, "PCI PIO FIFO parity error", -1, 1},
+		{V_WFPARERR(M_WFPARERR), "PCI write FIFO parity error", -1,
+		 1},
+		{V_RFPARERR(M_RFPARERR), "PCI read FIFO parity error", -1,
+		 1},
+		{V_CFPARERR(M_CFPARERR), "PCI command FIFO parity error", -1,
+		 1},
+		{V_MSIXPARERR(M_MSIXPARERR), "PCI MSI-X table/PBA parity "
+		 "error", -1, 1},
+		{0}
+	};
+
+	if (t3_handle_intr_status(adapter, A_PCIX_INT_CAUSE, PCIX_INTR_MASK,
+				  pcix1_intr_info, adapter->irq_stats))
+		t3_fatal_err(adapter);
+}
+
+/*
+ * Interrupt handler for the PCIE module.
+ */
+static void pcie_intr_handler(struct adapter *adapter)
+{
+	static const struct intr_info pcie_intr_info[] = {
+		{F_PEXERR, "PCI PEX error", -1, 1},
+		{F_UNXSPLCPLERRR,
+		 "PCI unexpected split completion DMA read error", -1, 1},
+		{F_UNXSPLCPLERRC,
+		 "PCI unexpected split completion DMA command error", -1, 1},
+		{F_PCIE_PIOPARERR, "PCI PIO FIFO parity error", -1, 1},
+		{F_PCIE_WFPARERR, "PCI write FIFO parity error", -1, 1},
+		{F_PCIE_RFPARERR, "PCI read FIFO parity error", -1, 1},
+		{F_PCIE_CFPARERR, "PCI command FIFO parity error", -1, 1},
+		{V_PCIE_MSIXPARERR(M_PCIE_MSIXPARERR),
+		 "PCI MSI-X table/PBA parity error", -1, 1},
+		{F_RETRYBUFPARERR, "PCI retry buffer parity error", -1, 1},
+		{F_RETRYLUTPARERR, "PCI retry LUT parity error", -1, 1},
+		{F_RXPARERR, "PCI Rx parity error", -1, 1},
+		{F_TXPARERR, "PCI Tx parity error", -1, 1},
+		{V_BISTERR(M_BISTERR), "PCI BIST error", -1, 1},
+		{0}
+	};
+
+	if (t3_read_reg(adapter, A_PCIE_INT_CAUSE) & F_PEXERR)
+		CH_ALERT(adapter, "PEX error code 0x%x\n",
+			 t3_read_reg(adapter, A_PCIE_PEX_ERR));
+
+	if (t3_handle_intr_status(adapter, A_PCIE_INT_CAUSE, PCIE_INTR_MASK,
+				  pcie_intr_info, adapter->irq_stats))
+		t3_fatal_err(adapter);
+}
+
+/*
+ * TP interrupt handler.
+ */
+static void tp_intr_handler(struct adapter *adapter)
+{
+	static const struct intr_info tp_intr_info[] = {
+		{0xffffff, "TP parity error", -1, 1},
+		{0x1000000, "TP out of Rx pages", -1, 1},
+		{0x2000000, "TP out of Tx pages", -1, 1},
+		{0}
+	};
+
+	static const struct intr_info tp_intr_info_t3c[] = {
+		{0x1fffffff, "TP parity error", -1, 1},
+		{F_FLMRXFLSTEMPTY, "TP out of Rx pages", -1, 1},
+		{F_FLMTXFLSTEMPTY, "TP out of Tx pages", -1, 1},
+		{0}
+	};
+
+	if (t3_handle_intr_status(adapter, A_TP_INT_CAUSE, 0xffffffff,
+				  adapter->params.rev < T3_REV_C ?
+				  tp_intr_info : tp_intr_info_t3c, NULL))
+		t3_fatal_err(adapter);
+}
+
+/*
+ * CIM interrupt handler.
+ */
+static void cim_intr_handler(struct adapter *adapter)
+{
+	static const struct intr_info cim_intr_info[] = {
+		{F_RSVDSPACEINT, "CIM reserved space write", -1, 1},
+		{F_SDRAMRANGEINT, "CIM SDRAM address out of range", -1, 1},
+		{F_FLASHRANGEINT, "CIM flash address out of range", -1, 1},
+		{F_BLKWRBOOTINT, "CIM block write to boot space", -1, 1},
+		{F_WRBLKFLASHINT, "CIM write to cached flash space", -1, 1},
+		{F_SGLWRFLASHINT, "CIM single write to flash space", -1, 1},
+		{F_BLKRDFLASHINT, "CIM block read from flash space", -1, 1},
+		{F_BLKWRFLASHINT, "CIM block write to flash space", -1, 1},
+		{F_BLKRDCTLINT, "CIM block read from CTL space", -1, 1},
+		{F_BLKWRCTLINT, "CIM block write to CTL space", -1, 1},
+		{F_BLKRDPLINT, "CIM block read from PL space", -1, 1},
+		{F_BLKWRPLINT, "CIM block write to PL space", -1, 1},
+		{F_DRAMPARERR, "CIM DRAM parity error", -1, 1},
+		{F_ICACHEPARERR, "CIM icache parity error", -1, 1},
+		{F_DCACHEPARERR, "CIM dcache parity error", -1, 1},
+		{F_OBQSGEPARERR, "CIM OBQ SGE parity error", -1, 1},
+		{F_OBQULPHIPARERR, "CIM OBQ ULPHI parity error", -1, 1},
+		{F_OBQULPLOPARERR, "CIM OBQ ULPLO parity error", -1, 1},
+		{F_IBQSGELOPARERR, "CIM IBQ SGELO parity error", -1, 1},
+		{F_IBQSGEHIPARERR, "CIM IBQ SGEHI parity error", -1, 1},
+		{F_IBQULPPARERR, "CIM IBQ ULP parity error", -1, 1},
+		{F_IBQTPPARERR, "CIM IBQ TP parity error", -1, 1},
+		{F_ITAGPARERR, "CIM itag parity error", -1, 1},
+		{F_DTAGPARERR, "CIM dtag parity error", -1, 1},
+		{0}
+	};
+
+	if (t3_handle_intr_status(adapter, A_CIM_HOST_INT_CAUSE, 0xffffffff,
+				  cim_intr_info, NULL))
+		t3_fatal_err(adapter);
+}
+
+/*
+ * ULP RX interrupt handler.
+ */
+static void ulprx_intr_handler(struct adapter *adapter)
+{
+	static const struct intr_info ulprx_intr_info[] = {
+		{F_PARERRDATA, "ULP RX data parity error", -1, 1},
+		{F_PARERRPCMD, "ULP RX command parity error", -1, 1},
+		{F_ARBPF1PERR, "ULP RX ArbPF1 parity error", -1, 1},
+		{F_ARBPF0PERR, "ULP RX ArbPF0 parity error", -1, 1},
+		{F_ARBFPERR, "ULP RX ArbF parity error", -1, 1},
+		{F_PCMDMUXPERR, "ULP RX PCMDMUX parity error", -1, 1},
+		{F_DATASELFRAMEERR1, "ULP RX frame error", -1, 1},
+		{F_DATASELFRAMEERR0, "ULP RX frame error", -1, 1},
+		{0}
+	};
+
+	if (t3_handle_intr_status(adapter, A_ULPRX_INT_CAUSE, 0xffffffff,
+				  ulprx_intr_info, NULL))
+		t3_fatal_err(adapter);
+}
+
+/*
+ * ULP TX interrupt handler.
+ */
+static void ulptx_intr_handler(struct adapter *adapter)
+{
+	static const struct intr_info ulptx_intr_info[] = {
+		{F_PBL_BOUND_ERR_CH0, "ULP TX channel 0 PBL out of bounds",
+		 STAT_ULP_CH0_PBL_OOB, 0},
+		{F_PBL_BOUND_ERR_CH1, "ULP TX channel 1 PBL out of bounds",
+		 STAT_ULP_CH1_PBL_OOB, 0},
+		{0xfc, "ULP TX parity error", -1, 1},
+		{0}
+	};
+
+	if (t3_handle_intr_status(adapter, A_ULPTX_INT_CAUSE, 0xffffffff,
+				  ulptx_intr_info, adapter->irq_stats))
+		t3_fatal_err(adapter);
+}
+
+#define ICSPI_FRM_ERR (F_ICSPI0_FIFO2X_RX_FRAMING_ERROR | \
+	F_ICSPI1_FIFO2X_RX_FRAMING_ERROR | F_ICSPI0_RX_FRAMING_ERROR | \
+	F_ICSPI1_RX_FRAMING_ERROR | F_ICSPI0_TX_FRAMING_ERROR | \
+	F_ICSPI1_TX_FRAMING_ERROR)
+#define OESPI_FRM_ERR (F_OESPI0_RX_FRAMING_ERROR | \
+	F_OESPI1_RX_FRAMING_ERROR | F_OESPI0_TX_FRAMING_ERROR | \
+	F_OESPI1_TX_FRAMING_ERROR | F_OESPI0_OFIFO2X_TX_FRAMING_ERROR | \
+	F_OESPI1_OFIFO2X_TX_FRAMING_ERROR)
+
+/*
+ * PM TX interrupt handler.
+ */
+static void pmtx_intr_handler(struct adapter *adapter)
+{
+	static const struct intr_info pmtx_intr_info[] = {
+		{F_ZERO_C_CMD_ERROR, "PMTX 0-length pcmd", -1, 1},
+		{ICSPI_FRM_ERR, "PMTX ispi framing error", -1, 1},
+		{OESPI_FRM_ERR, "PMTX ospi framing error", -1, 1},
+		{V_ICSPI_PAR_ERROR(M_ICSPI_PAR_ERROR),
+		 "PMTX ispi parity error", -1, 1},
+		{V_OESPI_PAR_ERROR(M_OESPI_PAR_ERROR),
+		 "PMTX ospi parity error", -1, 1},
+		{0}
+	};
+
+	if (t3_handle_intr_status(adapter, A_PM1_TX_INT_CAUSE, 0xffffffff,
+				  pmtx_intr_info, NULL))
+		t3_fatal_err(adapter);
+}
+
+#define IESPI_FRM_ERR (F_IESPI0_FIFO2X_RX_FRAMING_ERROR | \
+	F_IESPI1_FIFO2X_RX_FRAMING_ERROR | F_IESPI0_RX_FRAMING_ERROR | \
+	F_IESPI1_RX_FRAMING_ERROR | F_IESPI0_TX_FRAMING_ERROR | \
+	F_IESPI1_TX_FRAMING_ERROR)
+#define OCSPI_FRM_ERR (F_OCSPI0_RX_FRAMING_ERROR | \
+	F_OCSPI1_RX_FRAMING_ERROR | F_OCSPI0_TX_FRAMING_ERROR | \
+	F_OCSPI1_TX_FRAMING_ERROR | F_OCSPI0_OFIFO2X_TX_FRAMING_ERROR | \
+	F_OCSPI1_OFIFO2X_TX_FRAMING_ERROR)
+
+/*
+ * PM RX interrupt handler.
+ */
+static void pmrx_intr_handler(struct adapter *adapter)
+{
+	static const struct intr_info pmrx_intr_info[] = {
+		{F_ZERO_E_CMD_ERROR, "PMRX 0-length pcmd", -1, 1},
+		{IESPI_FRM_ERR, "PMRX ispi framing error", -1, 1},
+		{OCSPI_FRM_ERR, "PMRX ospi framing error", -1, 1},
+		{V_IESPI_PAR_ERROR(M_IESPI_PAR_ERROR),
+		 "PMRX ispi parity error", -1, 1},
+		{V_OCSPI_PAR_ERROR(M_OCSPI_PAR_ERROR),
+		 "PMRX ospi parity error", -1, 1},
+		{0}
+	};
+
+	if (t3_handle_intr_status(adapter, A_PM1_RX_INT_CAUSE, 0xffffffff,
+				  pmrx_intr_info, NULL))
+		t3_fatal_err(adapter);
+}
+
+/*
+ * CPL switch interrupt handler.
+ */
+static void cplsw_intr_handler(struct adapter *adapter)
+{
+	static const struct intr_info cplsw_intr_info[] = {
+		{F_CIM_OP_MAP_PERR, "CPL switch CIM parity error", -1, 1},
+		{F_CIM_OVFL_ERROR, "CPL switch CIM overflow", -1, 1},
+		{F_TP_FRAMING_ERROR, "CPL switch TP framing error", -1, 1},
+		{F_SGE_FRAMING_ERROR, "CPL switch SGE framing error", -1, 1},
+		{F_CIM_FRAMING_ERROR, "CPL switch CIM framing error", -1, 1},
+		{F_ZERO_SWITCH_ERROR, "CPL switch no-switch error", -1, 1},
+		{0}
+	};
+
+	if (t3_handle_intr_status(adapter, A_CPL_INTR_CAUSE, 0xffffffff,
+				  cplsw_intr_info, NULL))
+		t3_fatal_err(adapter);
+}
+
+/*
+ * MPS interrupt handler.
+ */
+static void mps_intr_handler(struct adapter *adapter)
+{
+	static const struct intr_info mps_intr_info[] = {
+		{0x1ff, "MPS parity error", -1, 1},
+		{0}
+	};
+
+	if (t3_handle_intr_status(adapter, A_MPS_INT_CAUSE, 0xffffffff,
+				  mps_intr_info, NULL))
+		t3_fatal_err(adapter);
+}
+
+#define MC7_INTR_FATAL (F_UE | V_PE(M_PE) | F_AE)
+
+/*
+ * MC7 interrupt handler.
+ */
+static void mc7_intr_handler(struct mc7 *mc7)
+{
+	struct adapter *adapter = mc7->adapter;
+	u32 cause = t3_read_reg(adapter, mc7->offset + A_MC7_INT_CAUSE);
+
+	if (cause & F_CE) {
+		mc7->stats.corr_err++;
+		CH_WARN(adapter, "%s MC7 correctable error at addr 0x%x, "
+			"data 0x%x 0x%x 0x%x\n", mc7->name,
+			t3_read_reg(adapter, mc7->offset + A_MC7_CE_ADDR),
+			t3_read_reg(adapter, mc7->offset + A_MC7_CE_DATA0),
+			t3_read_reg(adapter, mc7->offset + A_MC7_CE_DATA1),
+			t3_read_reg(adapter, mc7->offset + A_MC7_CE_DATA2));
+	}
+
+	if (cause & F_UE) {
+		mc7->stats.uncorr_err++;
+		CH_ALERT(adapter, "%s MC7 uncorrectable error at addr 0x%x, "
+			 "data 0x%x 0x%x 0x%x\n", mc7->name,
+			 t3_read_reg(adapter, mc7->offset + A_MC7_UE_ADDR),
+			 t3_read_reg(adapter, mc7->offset + A_MC7_UE_DATA0),
+			 t3_read_reg(adapter, mc7->offset + A_MC7_UE_DATA1),
+			 t3_read_reg(adapter, mc7->offset + A_MC7_UE_DATA2));
+	}
+
+	if (G_PE(cause)) {
+		mc7->stats.parity_err++;
+		CH_ALERT(adapter, "%s MC7 parity error 0x%x\n",
+			 mc7->name, G_PE(cause));
+	}
+
+	if (cause & F_AE) {
+		u32 addr = 0;
+
+		if (adapter->params.rev > 0)
+			addr = t3_read_reg(adapter,
+					   mc7->offset + A_MC7_ERR_ADDR);
+		mc7->stats.addr_err++;
+		CH_ALERT(adapter, "%s MC7 address error: 0x%x\n",
+			 mc7->name, addr);
+	}
+
+	if (cause & MC7_INTR_FATAL)
+		t3_fatal_err(adapter);
+
+	t3_write_reg(adapter, mc7->offset + A_MC7_INT_CAUSE, cause);
+}
+
+#define XGM_INTR_FATAL (V_TXFIFO_PRTY_ERR(M_TXFIFO_PRTY_ERR) | \
+			V_RXFIFO_PRTY_ERR(M_RXFIFO_PRTY_ERR))
+/*
+ * XGMAC interrupt handler.
+ */
+static int mac_intr_handler(struct adapter *adap, unsigned int idx)
+{
+	struct cmac *mac = &adap2pinfo(adap, idx)->mac;
+	/*
+	 * We mask out interrupt causes for which we're not taking interrupts.
+	 * This allows us to use polling logic to monitor some of the other
+	 * conditions when taking interrupts would impose too much load on the
+	 * system.
+	 */
+	u32 cause = t3_read_reg(adap, A_XGM_INT_CAUSE + mac->offset) &
+		    ~F_RXFIFO_OVERFLOW;
+
+	if (cause & V_TXFIFO_PRTY_ERR(M_TXFIFO_PRTY_ERR)) {
+		mac->stats.tx_fifo_parity_err++;
+		CH_ALERT(adap, "port%d: MAC TX FIFO parity error\n", idx);
+	}
+	if (cause & V_RXFIFO_PRTY_ERR(M_RXFIFO_PRTY_ERR)) {
+		mac->stats.rx_fifo_parity_err++;
+		CH_ALERT(adap, "port%d: MAC RX FIFO parity error\n", idx);
+	}
+	if (cause & F_TXFIFO_UNDERRUN)
+		mac->stats.tx_fifo_urun++;
+	if (cause & F_RXFIFO_OVERFLOW)
+		mac->stats.rx_fifo_ovfl++;
+	if (cause & V_SERDES_LOS(M_SERDES_LOS))
+		mac->stats.serdes_signal_loss++;
+	if (cause & F_XAUIPCSCTCERR)
+		mac->stats.xaui_pcs_ctc_err++;
+	if (cause & F_XAUIPCSALIGNCHANGE)
+		mac->stats.xaui_pcs_align_change++;
+	if (cause & F_XGM_INT) {
+		t3_set_reg_field(adap,
+				 A_XGM_INT_ENABLE + mac->offset,
+				 F_XGM_INT, 0);
+		mac->stats.link_faults++;
+
+		t3_os_link_fault_handler(adap, idx);
+	}
+
+	if (cause & XGM_INTR_FATAL)
+		t3_fatal_err(adap);
+
+	t3_write_reg(adap, A_XGM_INT_CAUSE + mac->offset, cause);
+	return cause != 0;
+}
+
+/*
+ * Interrupt handler for PHY events.
+ */
+int t3_phy_intr_handler(struct adapter *adapter)
+{
+	u32 i, cause = t3_read_reg(adapter, A_T3DBG_INT_CAUSE);
+
+	for_each_port(adapter, i) {
+		struct port_info *p = adap2pinfo(adapter, i);
+
+		if (!(p->phy.caps & SUPPORTED_IRQ))
+			continue;
+
+		if (cause & (1 << adapter_info(adapter)->gpio_intr[i])) {
+			int phy_cause = p->phy.ops->intr_handler(&p->phy);
+
+			if (phy_cause & cphy_cause_link_change)
+				t3_link_changed(adapter, i);
+			if (phy_cause & cphy_cause_fifo_error)
+				p->phy.fifo_errors++;
+			if (phy_cause & cphy_cause_module_change)
+				t3_os_phymod_changed(adapter, i);
+		}
+	}
+
+	t3_write_reg(adapter, A_T3DBG_INT_CAUSE, cause);
+	return 0;
+}
+
+/*
+ * T3 slow path (non-data) interrupt handler.
+ */
+int t3_slow_intr_handler(struct adapter *adapter)
+{
+	u32 cause = t3_read_reg(adapter, A_PL_INT_CAUSE0);
+
+	cause &= adapter->slow_intr_mask;
+	if (!cause)
+		return 0;
+	if (cause & F_PCIM0) {
+		if (is_pcie(adapter))
+			pcie_intr_handler(adapter);
+		else
+			pci_intr_handler(adapter);
+	}
+	if (cause & F_SGE3)
+		t3_sge_err_intr_handler(adapter);
+	if (cause & F_MC7_PMRX)
+		mc7_intr_handler(&adapter->pmrx);
+	if (cause & F_MC7_PMTX)
+		mc7_intr_handler(&adapter->pmtx);
+	if (cause & F_MC7_CM)
+		mc7_intr_handler(&adapter->cm);
+	if (cause & F_CIM)
+		cim_intr_handler(adapter);
+	if (cause & F_TP1)
+		tp_intr_handler(adapter);
+	if (cause & F_ULP2_RX)
+		ulprx_intr_handler(adapter);
+	if (cause & F_ULP2_TX)
+		ulptx_intr_handler(adapter);
+	if (cause & F_PM1_RX)
+		pmrx_intr_handler(adapter);
+	if (cause & F_PM1_TX)
+		pmtx_intr_handler(adapter);
+	if (cause & F_CPL_SWITCH)
+		cplsw_intr_handler(adapter);
+	if (cause & F_MPS0)
+		mps_intr_handler(adapter);
+	if (cause & F_MC5A)
+		t3_mc5_intr_handler(&adapter->mc5);
+	if (cause & F_XGMAC0_0)
+		mac_intr_handler(adapter, 0);
+	if (cause & F_XGMAC0_1)
+		mac_intr_handler(adapter, 1);
+	if (cause & F_T3DBG)
+		t3_os_ext_intr_handler(adapter);
+
+	/* Clear the interrupts just processed. */
+	t3_write_reg(adapter, A_PL_INT_CAUSE0, cause);
+	t3_read_reg(adapter, A_PL_INT_CAUSE0);	/* flush */
+	return 1;
+}
+
+static unsigned int calc_gpio_intr(struct adapter *adap)
+{
+	unsigned int i, gpi_intr = 0;
+
+	for_each_port(adap, i)
+		if ((adap2pinfo(adap, i)->phy.caps & SUPPORTED_IRQ) &&
+		    adapter_info(adap)->gpio_intr[i])
+			gpi_intr |= 1 << adapter_info(adap)->gpio_intr[i];
+	return gpi_intr;
+}
+
+/**
+ *	t3_intr_enable - enable interrupts
+ *	@adapter: the adapter whose interrupts should be enabled
+ *
+ *	Enable interrupts by setting the interrupt enable registers of the
+ *	various HW modules and then enabling the top-level interrupt
+ *	concentrator.
+ */
+void t3_intr_enable(struct adapter *adapter)
+{
+	static const struct addr_val_pair intr_en_avp[] = {
+		{A_SG_INT_ENABLE, SGE_INTR_MASK},
+		{A_MC7_INT_ENABLE, MC7_INTR_MASK},
+		{A_MC7_INT_ENABLE - MC7_PMRX_BASE_ADDR + MC7_PMTX_BASE_ADDR,
+		 MC7_INTR_MASK},
+		{A_MC7_INT_ENABLE - MC7_PMRX_BASE_ADDR + MC7_CM_BASE_ADDR,
+		 MC7_INTR_MASK},
+		{A_MC5_DB_INT_ENABLE, MC5_INTR_MASK},
+		{A_ULPRX_INT_ENABLE, ULPRX_INTR_MASK},
+		{A_PM1_TX_INT_ENABLE, PMTX_INTR_MASK},
+		{A_PM1_RX_INT_ENABLE, PMRX_INTR_MASK},
+		{A_CIM_HOST_INT_ENABLE, CIM_INTR_MASK},
+		{A_MPS_INT_ENABLE, MPS_INTR_MASK},
+	};
+
+	adapter->slow_intr_mask = PL_INTR_MASK;
+
+	t3_write_regs(adapter, intr_en_avp, ARRAY_SIZE(intr_en_avp), 0);
+	t3_write_reg(adapter, A_TP_INT_ENABLE,
+		     adapter->params.rev >= T3_REV_C ? 0x2bfffff : 0x3bfffff);
+
+	if (adapter->params.rev > 0) {
+		t3_write_reg(adapter, A_CPL_INTR_ENABLE,
+			     CPLSW_INTR_MASK | F_CIM_OVFL_ERROR);
+		t3_write_reg(adapter, A_ULPTX_INT_ENABLE,
+			     ULPTX_INTR_MASK | F_PBL_BOUND_ERR_CH0 |
+			     F_PBL_BOUND_ERR_CH1);
+	} else {
+		t3_write_reg(adapter, A_CPL_INTR_ENABLE, CPLSW_INTR_MASK);
+		t3_write_reg(adapter, A_ULPTX_INT_ENABLE, ULPTX_INTR_MASK);
+	}
+
+	t3_write_reg(adapter, A_T3DBG_INT_ENABLE, calc_gpio_intr(adapter));
+
+	if (is_pcie(adapter))
+		t3_write_reg(adapter, A_PCIE_INT_ENABLE, PCIE_INTR_MASK);
+	else
+		t3_write_reg(adapter, A_PCIX_INT_ENABLE, PCIX_INTR_MASK);
+	t3_write_reg(adapter, A_PL_INT_ENABLE0, adapter->slow_intr_mask);
+	t3_read_reg(adapter, A_PL_INT_ENABLE0);	/* flush */
+}
+
+/**
+ *	t3_intr_disable - disable a card's interrupts
+ *	@adapter: the adapter whose interrupts should be disabled
+ *
+ *	Disable interrupts.  We only disable the top-level interrupt
+ *	concentrator and the SGE data interrupts.
+ */
+void t3_intr_disable(struct adapter *adapter)
+{
+	t3_write_reg(adapter, A_PL_INT_ENABLE0, 0);
+	t3_read_reg(adapter, A_PL_INT_ENABLE0);	/* flush */
+	adapter->slow_intr_mask = 0;
+}
+
+/**
+ *	t3_intr_clear - clear all interrupts
+ *	@adapter: the adapter whose interrupts should be cleared
+ *
+ *	Clears all interrupts.
+ */
+void t3_intr_clear(struct adapter *adapter)
+{
+	static const unsigned int cause_reg_addr[] = {
+		A_SG_INT_CAUSE,
+		A_SG_RSPQ_FL_STATUS,
+		A_PCIX_INT_CAUSE,
+		A_MC7_INT_CAUSE,
+		A_MC7_INT_CAUSE - MC7_PMRX_BASE_ADDR + MC7_PMTX_BASE_ADDR,
+		A_MC7_INT_CAUSE - MC7_PMRX_BASE_ADDR + MC7_CM_BASE_ADDR,
+		A_CIM_HOST_INT_CAUSE,
+		A_TP_INT_CAUSE,
+		A_MC5_DB_INT_CAUSE,
+		A_ULPRX_INT_CAUSE,
+		A_ULPTX_INT_CAUSE,
+		A_CPL_INTR_CAUSE,
+		A_PM1_TX_INT_CAUSE,
+		A_PM1_RX_INT_CAUSE,
+		A_MPS_INT_CAUSE,
+		A_T3DBG_INT_CAUSE,
+	};
+	unsigned int i;
+
+	/* Clear PHY and MAC interrupts for each port. */
+	for_each_port(adapter, i)
+	    t3_port_intr_clear(adapter, i);
+
+	for (i = 0; i < ARRAY_SIZE(cause_reg_addr); ++i)
+		t3_write_reg(adapter, cause_reg_addr[i], 0xffffffff);
+
+	if (is_pcie(adapter))
+		t3_write_reg(adapter, A_PCIE_PEX_ERR, 0xffffffff);
+	t3_write_reg(adapter, A_PL_INT_CAUSE0, 0xffffffff);
+	t3_read_reg(adapter, A_PL_INT_CAUSE0);	/* flush */
+}
+
+void t3_xgm_intr_enable(struct adapter *adapter, int idx)
+{
+	struct port_info *pi = adap2pinfo(adapter, idx);
+
+	t3_write_reg(adapter, A_XGM_XGM_INT_ENABLE + pi->mac.offset,
+		     XGM_EXTRA_INTR_MASK);
+}
+
+void t3_xgm_intr_disable(struct adapter *adapter, int idx)
+{
+	struct port_info *pi = adap2pinfo(adapter, idx);
+
+	t3_write_reg(adapter, A_XGM_XGM_INT_DISABLE + pi->mac.offset,
+		     0x7ff);
+}
+
+/**
+ *	t3_port_intr_enable - enable port-specific interrupts
+ *	@adapter: associated adapter
+ *	@idx: index of port whose interrupts should be enabled
+ *
+ *	Enable port-specific (i.e., MAC and PHY) interrupts for the given
+ *	adapter port.
+ */
+void t3_port_intr_enable(struct adapter *adapter, int idx)
+{
+	struct cphy *phy = &adap2pinfo(adapter, idx)->phy;
+
+	t3_write_reg(adapter, XGM_REG(A_XGM_INT_ENABLE, idx), XGM_INTR_MASK);
+	t3_read_reg(adapter, XGM_REG(A_XGM_INT_ENABLE, idx)); /* flush */
+	phy->ops->intr_enable(phy);
+}
+
+/**
+ *	t3_port_intr_disable - disable port-specific interrupts
+ *	@adapter: associated adapter
+ *	@idx: index of port whose interrupts should be disabled
+ *
+ *	Disable port-specific (i.e., MAC and PHY) interrupts for the given
+ *	adapter port.
+ */
+void t3_port_intr_disable(struct adapter *adapter, int idx)
+{
+	struct cphy *phy = &adap2pinfo(adapter, idx)->phy;
+
+	t3_write_reg(adapter, XGM_REG(A_XGM_INT_ENABLE, idx), 0);
+	t3_read_reg(adapter, XGM_REG(A_XGM_INT_ENABLE, idx)); /* flush */
+	phy->ops->intr_disable(phy);
+}
+
+/**
+ *	t3_port_intr_clear - clear port-specific interrupts
+ *	@adapter: associated adapter
+ *	@idx: index of port whose interrupts to clear
+ *
+ *	Clear port-specific (i.e., MAC and PHY) interrupts for the given
+ *	adapter port.
+ */
+static void t3_port_intr_clear(struct adapter *adapter, int idx)
+{
+	struct cphy *phy = &adap2pinfo(adapter, idx)->phy;
+
+	t3_write_reg(adapter, XGM_REG(A_XGM_INT_CAUSE, idx), 0xffffffff);
+	t3_read_reg(adapter, XGM_REG(A_XGM_INT_CAUSE, idx)); /* flush */
+	phy->ops->intr_clear(phy);
+}
+
+#define SG_CONTEXT_CMD_ATTEMPTS 100
+
+/**
+ * 	t3_sge_write_context - write an SGE context
+ * 	@adapter: the adapter
+ * 	@id: the context id
+ * 	@type: the context type
+ *
+ * 	Program an SGE context with the values already loaded in the
+ * 	CONTEXT_DATA? registers.
+ */
+static int t3_sge_write_context(struct adapter *adapter, unsigned int id,
+				unsigned int type)
+{
+	if (type == F_RESPONSEQ) {
+		/*
+		 * Can't write the Response Queue Context bits for
+		 * Interrupt Armed or the Reserve bits after the chip
+		 * has been initialized out of reset.  Writing to these
+		 * bits can confuse the hardware.
+		 */
+		t3_write_reg(adapter, A_SG_CONTEXT_MASK0, 0xffffffff);
+		t3_write_reg(adapter, A_SG_CONTEXT_MASK1, 0xffffffff);
+		t3_write_reg(adapter, A_SG_CONTEXT_MASK2, 0x17ffffff);
+		t3_write_reg(adapter, A_SG_CONTEXT_MASK3, 0xffffffff);
+	} else {
+		t3_write_reg(adapter, A_SG_CONTEXT_MASK0, 0xffffffff);
+		t3_write_reg(adapter, A_SG_CONTEXT_MASK1, 0xffffffff);
+		t3_write_reg(adapter, A_SG_CONTEXT_MASK2, 0xffffffff);
+		t3_write_reg(adapter, A_SG_CONTEXT_MASK3, 0xffffffff);
+	}
+	t3_write_reg(adapter, A_SG_CONTEXT_CMD,
+		     V_CONTEXT_CMD_OPCODE(1) | type | V_CONTEXT(id));
+	return t3_wait_op_done(adapter, A_SG_CONTEXT_CMD, F_CONTEXT_CMD_BUSY,
+			       0, SG_CONTEXT_CMD_ATTEMPTS, 1);
+}
+
+/**
+ *	clear_sge_ctxt - completely clear an SGE context
+ *	@adapter: the adapter
+ *	@id: the context id
+ *	@type: the context type
+ *
+ *	Completely clear an SGE context.  Used predominantly at post-reset
+ *	initialization.  Note in particular that we don't skip writing to any
+ *	"sensitive bits" in the contexts the way that t3_sge_write_context()
+ *	does ...
+ */
+static int clear_sge_ctxt(struct adapter *adap, unsigned int id,
+			  unsigned int type)
+{
+	t3_write_reg(adap, A_SG_CONTEXT_DATA0, 0);
+	t3_write_reg(adap, A_SG_CONTEXT_DATA1, 0);
+	t3_write_reg(adap, A_SG_CONTEXT_DATA2, 0);
+	t3_write_reg(adap, A_SG_CONTEXT_DATA3, 0);
+	t3_write_reg(adap, A_SG_CONTEXT_MASK0, 0xffffffff);
+	t3_write_reg(adap, A_SG_CONTEXT_MASK1, 0xffffffff);
+	t3_write_reg(adap, A_SG_CONTEXT_MASK2, 0xffffffff);
+	t3_write_reg(adap, A_SG_CONTEXT_MASK3, 0xffffffff);
+	t3_write_reg(adap, A_SG_CONTEXT_CMD,
+		     V_CONTEXT_CMD_OPCODE(1) | type | V_CONTEXT(id));
+	return t3_wait_op_done(adap, A_SG_CONTEXT_CMD, F_CONTEXT_CMD_BUSY,
+			       0, SG_CONTEXT_CMD_ATTEMPTS, 1);
+}
+
+/**
+ *	t3_sge_init_ecntxt - initialize an SGE egress context
+ *	@adapter: the adapter to configure
+ *	@id: the context id
+ *	@gts_enable: whether to enable GTS for the context
+ *	@type: the egress context type
+ *	@respq: associated response queue
+ *	@base_addr: base address of queue
+ *	@size: number of queue entries
+ *	@token: uP token
+ *	@gen: initial generation value for the context
+ *	@cidx: consumer pointer
+ *
+ *	Initialize an SGE egress context and make it ready for use.  If the
+ *	platform allows concurrent context operations, the caller is
+ *	responsible for appropriate locking.
+ */
+int t3_sge_init_ecntxt(struct adapter *adapter, unsigned int id, int gts_enable,
+		       enum sge_context_type type, int respq, u64 base_addr,
+		       unsigned int size, unsigned int token, int gen,
+		       unsigned int cidx)
+{
+	unsigned int credits = type == SGE_CNTXT_OFLD ? 0 : FW_WR_NUM;
+
+	if (base_addr & 0xfff)	/* must be 4K aligned */
+		return -EINVAL;
+	if (t3_read_reg(adapter, A_SG_CONTEXT_CMD) & F_CONTEXT_CMD_BUSY)
+		return -EBUSY;
+
+	base_addr >>= 12;
+	t3_write_reg(adapter, A_SG_CONTEXT_DATA0, V_EC_INDEX(cidx) |
+		     V_EC_CREDITS(credits) | V_EC_GTS(gts_enable));
+	t3_write_reg(adapter, A_SG_CONTEXT_DATA1, V_EC_SIZE(size) |
+		     V_EC_BASE_LO(base_addr & 0xffff));
+	base_addr >>= 16;
+	t3_write_reg(adapter, A_SG_CONTEXT_DATA2, base_addr);
+	base_addr >>= 32;
+	t3_write_reg(adapter, A_SG_CONTEXT_DATA3,
+		     V_EC_BASE_HI(base_addr & 0xf) | V_EC_RESPQ(respq) |
+		     V_EC_TYPE(type) | V_EC_GEN(gen) | V_EC_UP_TOKEN(token) |
+		     F_EC_VALID);
+	return t3_sge_write_context(adapter, id, F_EGRESS);
+}
+
+/**
+ *	t3_sge_init_flcntxt - initialize an SGE free-buffer list context
+ *	@adapter: the adapter to configure
+ *	@id: the context id
+ *	@gts_enable: whether to enable GTS for the context
+ *	@base_addr: base address of queue
+ *	@size: number of queue entries
+ *	@bsize: size of each buffer for this queue
+ *	@cong_thres: threshold to signal congestion to upstream producers
+ *	@gen: initial generation value for the context
+ *	@cidx: consumer pointer
+ *
+ *	Initialize an SGE free list context and make it ready for use.  The
+ *	caller is responsible for ensuring only one context operation occurs
+ *	at a time.
+ */
+int t3_sge_init_flcntxt(struct adapter *adapter, unsigned int id,
+			int gts_enable, u64 base_addr, unsigned int size,
+			unsigned int bsize, unsigned int cong_thres, int gen,
+			unsigned int cidx)
+{
+	if (base_addr & 0xfff)	/* must be 4K aligned */
+		return -EINVAL;
+	if (t3_read_reg(adapter, A_SG_CONTEXT_CMD) & F_CONTEXT_CMD_BUSY)
+		return -EBUSY;
+
+	base_addr >>= 12;
+	t3_write_reg(adapter, A_SG_CONTEXT_DATA0, base_addr);
+	base_addr >>= 32;
+	t3_write_reg(adapter, A_SG_CONTEXT_DATA1,
+		     V_FL_BASE_HI((u32) base_addr) |
+		     V_FL_INDEX_LO(cidx & M_FL_INDEX_LO));
+	t3_write_reg(adapter, A_SG_CONTEXT_DATA2, V_FL_SIZE(size) |
+		     V_FL_GEN(gen) | V_FL_INDEX_HI(cidx >> 12) |
+		     V_FL_ENTRY_SIZE_LO(bsize & M_FL_ENTRY_SIZE_LO));
+	t3_write_reg(adapter, A_SG_CONTEXT_DATA3,
+		     V_FL_ENTRY_SIZE_HI(bsize >> (32 - S_FL_ENTRY_SIZE_LO)) |
+		     V_FL_CONG_THRES(cong_thres) | V_FL_GTS(gts_enable));
+	return t3_sge_write_context(adapter, id, F_FREELIST);
+}
+
+/**
+ *	t3_sge_init_rspcntxt - initialize an SGE response queue context
+ *	@adapter: the adapter to configure
+ *	@id: the context id
+ *	@irq_vec_idx: MSI-X interrupt vector index, 0 if no MSI-X, -1 if no IRQ
+ *	@base_addr: base address of queue
+ *	@size: number of queue entries
+ *	@fl_thres: threshold for selecting the normal or jumbo free list
+ *	@gen: initial generation value for the context
+ *	@cidx: consumer pointer
+ *
+ *	Initialize an SGE response queue context and make it ready for use.
+ *	The caller is responsible for ensuring only one context operation
+ *	occurs at a time.
+ */
+int t3_sge_init_rspcntxt(struct adapter *adapter, unsigned int id,
+			 int irq_vec_idx, u64 base_addr, unsigned int size,
+			 unsigned int fl_thres, int gen, unsigned int cidx)
+{
+	unsigned int intr = 0;
+
+	if (base_addr & 0xfff)	/* must be 4K aligned */
+		return -EINVAL;
+	if (t3_read_reg(adapter, A_SG_CONTEXT_CMD) & F_CONTEXT_CMD_BUSY)
+		return -EBUSY;
+
+	base_addr >>= 12;
+	t3_write_reg(adapter, A_SG_CONTEXT_DATA0, V_CQ_SIZE(size) |
+		     V_CQ_INDEX(cidx));
+	t3_write_reg(adapter, A_SG_CONTEXT_DATA1, base_addr);
+	base_addr >>= 32;
+	if (irq_vec_idx >= 0)
+		intr = V_RQ_MSI_VEC(irq_vec_idx) | F_RQ_INTR_EN;
+	t3_write_reg(adapter, A_SG_CONTEXT_DATA2,
+		     V_CQ_BASE_HI((u32) base_addr) | intr | V_RQ_GEN(gen));
+	t3_write_reg(adapter, A_SG_CONTEXT_DATA3, fl_thres);
+	return t3_sge_write_context(adapter, id, F_RESPONSEQ);
+}
+
+/**
+ *	t3_sge_init_cqcntxt - initialize an SGE completion queue context
+ *	@adapter: the adapter to configure
+ *	@id: the context id
+ *	@base_addr: base address of queue
+ *	@size: number of queue entries
+ *	@rspq: response queue for async notifications
+ *	@ovfl_mode: CQ overflow mode
+ *	@credits: completion queue credits
+ *	@credit_thres: the credit threshold
+ *
+ *	Initialize an SGE completion queue context and make it ready for use.
+ *	The caller is responsible for ensuring only one context operation
+ *	occurs at a time.
+ */
+int t3_sge_init_cqcntxt(struct adapter *adapter, unsigned int id, u64 base_addr,
+			unsigned int size, int rspq, int ovfl_mode,
+			unsigned int credits, unsigned int credit_thres)
+{
+	if (base_addr & 0xfff)	/* must be 4K aligned */
+		return -EINVAL;
+	if (t3_read_reg(adapter, A_SG_CONTEXT_CMD) & F_CONTEXT_CMD_BUSY)
+		return -EBUSY;
+
+	base_addr >>= 12;
+	t3_write_reg(adapter, A_SG_CONTEXT_DATA0, V_CQ_SIZE(size));
+	t3_write_reg(adapter, A_SG_CONTEXT_DATA1, base_addr);
+	base_addr >>= 32;
+	t3_write_reg(adapter, A_SG_CONTEXT_DATA2,
+		     V_CQ_BASE_HI((u32) base_addr) | V_CQ_RSPQ(rspq) |
+		     V_CQ_GEN(1) | V_CQ_OVERFLOW_MODE(ovfl_mode) |
+		     V_CQ_ERR(ovfl_mode));
+	t3_write_reg(adapter, A_SG_CONTEXT_DATA3, V_CQ_CREDITS(credits) |
+		     V_CQ_CREDIT_THRES(credit_thres));
+	return t3_sge_write_context(adapter, id, F_CQ);
+}
+
+/**
+ *	t3_sge_enable_ecntxt - enable/disable an SGE egress context
+ *	@adapter: the adapter
+ *	@id: the egress context id
+ *	@enable: enable (1) or disable (0) the context
+ *
+ *	Enable or disable an SGE egress context.  The caller is responsible for
+ *	ensuring only one context operation occurs at a time.
+ */
+int t3_sge_enable_ecntxt(struct adapter *adapter, unsigned int id, int enable)
+{
+	if (t3_read_reg(adapter, A_SG_CONTEXT_CMD) & F_CONTEXT_CMD_BUSY)
+		return -EBUSY;
+
+	t3_write_reg(adapter, A_SG_CONTEXT_MASK0, 0);
+	t3_write_reg(adapter, A_SG_CONTEXT_MASK1, 0);
+	t3_write_reg(adapter, A_SG_CONTEXT_MASK2, 0);
+	t3_write_reg(adapter, A_SG_CONTEXT_MASK3, F_EC_VALID);
+	t3_write_reg(adapter, A_SG_CONTEXT_DATA3, V_EC_VALID(enable));
+	t3_write_reg(adapter, A_SG_CONTEXT_CMD,
+		     V_CONTEXT_CMD_OPCODE(1) | F_EGRESS | V_CONTEXT(id));
+	return t3_wait_op_done(adapter, A_SG_CONTEXT_CMD, F_CONTEXT_CMD_BUSY,
+			       0, SG_CONTEXT_CMD_ATTEMPTS, 1);
+}
+
+/**
+ *	t3_sge_disable_fl - disable an SGE free-buffer list
+ *	@adapter: the adapter
+ *	@id: the free list context id
+ *
+ *	Disable an SGE free-buffer list.  The caller is responsible for
+ *	ensuring only one context operation occurs at a time.
+ */
+int t3_sge_disable_fl(struct adapter *adapter, unsigned int id)
+{
+	if (t3_read_reg(adapter, A_SG_CONTEXT_CMD) & F_CONTEXT_CMD_BUSY)
+		return -EBUSY;
+
+	t3_write_reg(adapter, A_SG_CONTEXT_MASK0, 0);
+	t3_write_reg(adapter, A_SG_CONTEXT_MASK1, 0);
+	t3_write_reg(adapter, A_SG_CONTEXT_MASK2, V_FL_SIZE(M_FL_SIZE));
+	t3_write_reg(adapter, A_SG_CONTEXT_MASK3, 0);
+	t3_write_reg(adapter, A_SG_CONTEXT_DATA2, 0);
+	t3_write_reg(adapter, A_SG_CONTEXT_CMD,
+		     V_CONTEXT_CMD_OPCODE(1) | F_FREELIST | V_CONTEXT(id));
+	return t3_wait_op_done(adapter, A_SG_CONTEXT_CMD, F_CONTEXT_CMD_BUSY,
+			       0, SG_CONTEXT_CMD_ATTEMPTS, 1);
+}
+
+/**
+ *	t3_sge_disable_rspcntxt - disable an SGE response queue
+ *	@adapter: the adapter
+ *	@id: the response queue context id
+ *
+ *	Disable an SGE response queue.  The caller is responsible for
+ *	ensuring only one context operation occurs at a time.
+ */
+int t3_sge_disable_rspcntxt(struct adapter *adapter, unsigned int id)
+{
+	if (t3_read_reg(adapter, A_SG_CONTEXT_CMD) & F_CONTEXT_CMD_BUSY)
+		return -EBUSY;
+
+	t3_write_reg(adapter, A_SG_CONTEXT_MASK0, V_CQ_SIZE(M_CQ_SIZE));
+	t3_write_reg(adapter, A_SG_CONTEXT_MASK1, 0);
+	t3_write_reg(adapter, A_SG_CONTEXT_MASK2, 0);
+	t3_write_reg(adapter, A_SG_CONTEXT_MASK3, 0);
+	t3_write_reg(adapter, A_SG_CONTEXT_DATA0, 0);
+	t3_write_reg(adapter, A_SG_CONTEXT_CMD,
+		     V_CONTEXT_CMD_OPCODE(1) | F_RESPONSEQ | V_CONTEXT(id));
+	return t3_wait_op_done(adapter, A_SG_CONTEXT_CMD, F_CONTEXT_CMD_BUSY,
+			       0, SG_CONTEXT_CMD_ATTEMPTS, 1);
+}
+
+/**
+ *	t3_sge_disable_cqcntxt - disable an SGE completion queue
+ *	@adapter: the adapter
+ *	@id: the completion queue context id
+ *
+ *	Disable an SGE completion queue.  The caller is responsible for
+ *	ensuring only one context operation occurs at a time.
+ */
+int t3_sge_disable_cqcntxt(struct adapter *adapter, unsigned int id)
+{
+	if (t3_read_reg(adapter, A_SG_CONTEXT_CMD) & F_CONTEXT_CMD_BUSY)
+		return -EBUSY;
+
+	t3_write_reg(adapter, A_SG_CONTEXT_MASK0, V_CQ_SIZE(M_CQ_SIZE));
+	t3_write_reg(adapter, A_SG_CONTEXT_MASK1, 0);
+	t3_write_reg(adapter, A_SG_CONTEXT_MASK2, 0);
+	t3_write_reg(adapter, A_SG_CONTEXT_MASK3, 0);
+	t3_write_reg(adapter, A_SG_CONTEXT_DATA0, 0);
+	t3_write_reg(adapter, A_SG_CONTEXT_CMD,
+		     V_CONTEXT_CMD_OPCODE(1) | F_CQ | V_CONTEXT(id));
+	return t3_wait_op_done(adapter, A_SG_CONTEXT_CMD, F_CONTEXT_CMD_BUSY,
+			       0, SG_CONTEXT_CMD_ATTEMPTS, 1);
+}
+
+/**
+ *	t3_sge_cqcntxt_op - perform an operation on a completion queue context
+ *	@adapter: the adapter
+ *	@id: the context id
+ *	@op: the operation to perform
+ *
+ *	Perform the selected operation on an SGE completion queue context.
+ *	The caller is responsible for ensuring only one context operation
+ *	occurs at a time.
+ */
+int t3_sge_cqcntxt_op(struct adapter *adapter, unsigned int id, unsigned int op,
+		      unsigned int credits)
+{
+	u32 val;
+
+	if (t3_read_reg(adapter, A_SG_CONTEXT_CMD) & F_CONTEXT_CMD_BUSY)
+		return -EBUSY;
+
+	t3_write_reg(adapter, A_SG_CONTEXT_DATA0, credits << 16);
+	t3_write_reg(adapter, A_SG_CONTEXT_CMD, V_CONTEXT_CMD_OPCODE(op) |
+		     V_CONTEXT(id) | F_CQ);
+	if (t3_wait_op_done_val(adapter, A_SG_CONTEXT_CMD, F_CONTEXT_CMD_BUSY,
+				0, SG_CONTEXT_CMD_ATTEMPTS, 1, &val))
+		return -EIO;
+
+	if (op >= 2 && op < 7) {
+		if (adapter->params.rev > 0)
+			return G_CQ_INDEX(val);
+
+		t3_write_reg(adapter, A_SG_CONTEXT_CMD,
+			     V_CONTEXT_CMD_OPCODE(0) | F_CQ | V_CONTEXT(id));
+		if (t3_wait_op_done(adapter, A_SG_CONTEXT_CMD,
+				    F_CONTEXT_CMD_BUSY, 0,
+				    SG_CONTEXT_CMD_ATTEMPTS, 1))
+			return -EIO;
+		return G_CQ_INDEX(t3_read_reg(adapter, A_SG_CONTEXT_DATA0));
+	}
+	return 0;
+}
+
+/**
+ *	t3_config_rss - configure Rx packet steering
+ *	@adapter: the adapter
+ *	@rss_config: RSS settings (written to TP_RSS_CONFIG)
+ *	@cpus: values for the CPU lookup table (0xff terminated)
+ *	@rspq: values for the response queue lookup table (0xffff terminated)
+ *
+ *	Programs the receive packet steering logic.  @cpus and @rspq provide
+ *	the values for the CPU and response queue lookup tables.  If they
+ *	provide fewer values than the size of the tables the supplied values
+ *	are used repeatedly until the tables are fully populated.
+ */
+void t3_config_rss(struct adapter *adapter, unsigned int rss_config,
+		   const u8 * cpus, const u16 *rspq)
+{
+	int i, j, cpu_idx = 0, q_idx = 0;
+
+	if (cpus)
+		for (i = 0; i < RSS_TABLE_SIZE; ++i) {
+			u32 val = i << 16;
+
+			for (j = 0; j < 2; ++j) {
+				val |= (cpus[cpu_idx++] & 0x3f) << (8 * j);
+				if (cpus[cpu_idx] == 0xff)
+					cpu_idx = 0;
+			}
+			t3_write_reg(adapter, A_TP_RSS_LKP_TABLE, val);
+		}
+
+	if (rspq)
+		for (i = 0; i < RSS_TABLE_SIZE; ++i) {
+			t3_write_reg(adapter, A_TP_RSS_MAP_TABLE,
+				     (i << 16) | rspq[q_idx++]);
+			if (rspq[q_idx] == 0xffff)
+				q_idx = 0;
+		}
+
+	t3_write_reg(adapter, A_TP_RSS_CONFIG, rss_config);
+}
+
+/**
+ *	t3_tp_set_offload_mode - put TP in NIC/offload mode
+ *	@adap: the adapter
+ *	@enable: 1 to select offload mode, 0 for regular NIC
+ *
+ *	Switches TP to NIC/offload mode.
+ */
+void t3_tp_set_offload_mode(struct adapter *adap, int enable)
+{
+	if (is_offload(adap) || !enable)
+		t3_set_reg_field(adap, A_TP_IN_CONFIG, F_NICMODE,
+				 V_NICMODE(!enable));
+}
+
+/**
+ *	pm_num_pages - calculate the number of pages of the payload memory
+ *	@mem_size: the size of the payload memory
+ *	@pg_size: the size of each payload memory page
+ *
+ *	Calculate the number of pages, each of the given size, that fit in a
+ *	memory of the specified size, respecting the HW requirement that the
+ *	number of pages must be a multiple of 24.
+ */
+static inline unsigned int pm_num_pages(unsigned int mem_size,
+					unsigned int pg_size)
+{
+	unsigned int n = mem_size / pg_size;
+
+	return n - n % 24;
+}
+
+#define mem_region(adap, start, size, reg) \
+	t3_write_reg((adap), A_ ## reg, (start)); \
+	start += size
+
+/**
+ *	partition_mem - partition memory and configure TP memory settings
+ *	@adap: the adapter
+ *	@p: the TP parameters
+ *
+ *	Partitions context and payload memory and configures TP's memory
+ *	registers.
+ */
+static void partition_mem(struct adapter *adap, const struct tp_params *p)
+{
+	unsigned int m, pstructs, tids = t3_mc5_size(&adap->mc5);
+	unsigned int timers = 0, timers_shift = 22;
+
+	if (adap->params.rev > 0) {
+		if (tids <= 16 * 1024) {
+			timers = 1;
+			timers_shift = 16;
+		} else if (tids <= 64 * 1024) {
+			timers = 2;
+			timers_shift = 18;
+		} else if (tids <= 256 * 1024) {
+			timers = 3;
+			timers_shift = 20;
+		}
+	}
+
+	t3_write_reg(adap, A_TP_PMM_SIZE,
+		     p->chan_rx_size | (p->chan_tx_size >> 16));
+
+	t3_write_reg(adap, A_TP_PMM_TX_BASE, 0);
+	t3_write_reg(adap, A_TP_PMM_TX_PAGE_SIZE, p->tx_pg_size);
+	t3_write_reg(adap, A_TP_PMM_TX_MAX_PAGE, p->tx_num_pgs);
+	t3_set_reg_field(adap, A_TP_PARA_REG3, V_TXDATAACKIDX(M_TXDATAACKIDX),
+			 V_TXDATAACKIDX(fls(p->tx_pg_size) - 12));
+
+	t3_write_reg(adap, A_TP_PMM_RX_BASE, 0);
+	t3_write_reg(adap, A_TP_PMM_RX_PAGE_SIZE, p->rx_pg_size);
+	t3_write_reg(adap, A_TP_PMM_RX_MAX_PAGE, p->rx_num_pgs);
+
+	pstructs = p->rx_num_pgs + p->tx_num_pgs;
+	/* Add a bit of headroom and make multiple of 24 */
+	pstructs += 48;
+	pstructs -= pstructs % 24;
+	t3_write_reg(adap, A_TP_CMM_MM_MAX_PSTRUCT, pstructs);
+
+	m = tids * TCB_SIZE;
+	mem_region(adap, m, (64 << 10) * 64, SG_EGR_CNTX_BADDR);
+	mem_region(adap, m, (64 << 10) * 64, SG_CQ_CONTEXT_BADDR);
+	t3_write_reg(adap, A_TP_CMM_TIMER_BASE, V_CMTIMERMAXNUM(timers) | m);
+	m += ((p->ntimer_qs - 1) << timers_shift) + (1 << 22);
+	mem_region(adap, m, pstructs * 64, TP_CMM_MM_BASE);
+	mem_region(adap, m, 64 * (pstructs / 24), TP_CMM_MM_PS_FLST_BASE);
+	mem_region(adap, m, 64 * (p->rx_num_pgs / 24), TP_CMM_MM_RX_FLST_BASE);
+	mem_region(adap, m, 64 * (p->tx_num_pgs / 24), TP_CMM_MM_TX_FLST_BASE);
+
+	m = (m + 4095) & ~0xfff;
+	t3_write_reg(adap, A_CIM_SDRAM_BASE_ADDR, m);
+	t3_write_reg(adap, A_CIM_SDRAM_ADDR_SIZE, p->cm_size - m);
+
+	tids = (p->cm_size - m - (3 << 20)) / 3072 - 32;
+	m = t3_mc5_size(&adap->mc5) - adap->params.mc5.nservers -
+	    adap->params.mc5.nfilters - adap->params.mc5.nroutes;
+	if (tids < m)
+		adap->params.mc5.nservers += m - tids;
+}
+
+static inline void tp_wr_indirect(struct adapter *adap, unsigned int addr,
+				  u32 val)
+{
+	t3_write_reg(adap, A_TP_PIO_ADDR, addr);
+	t3_write_reg(adap, A_TP_PIO_DATA, val);
+}
+
+static void tp_config(struct adapter *adap, const struct tp_params *p)
+{
+	t3_write_reg(adap, A_TP_GLOBAL_CONFIG, F_TXPACINGENABLE | F_PATHMTU |
+		     F_IPCHECKSUMOFFLOAD | F_UDPCHECKSUMOFFLOAD |
+		     F_TCPCHECKSUMOFFLOAD | V_IPTTL(64));
+	t3_write_reg(adap, A_TP_TCP_OPTIONS, V_MTUDEFAULT(576) |
+		     F_MTUENABLE | V_WINDOWSCALEMODE(1) |
+		     V_TIMESTAMPSMODE(1) | V_SACKMODE(1) | V_SACKRX(1));
+	t3_write_reg(adap, A_TP_DACK_CONFIG, V_AUTOSTATE3(1) |
+		     V_AUTOSTATE2(1) | V_AUTOSTATE1(0) |
+		     V_BYTETHRESHOLD(26880) | V_MSSTHRESHOLD(2) |
+		     F_AUTOCAREFUL | F_AUTOENABLE | V_DACK_MODE(1));
+	t3_set_reg_field(adap, A_TP_IN_CONFIG, F_RXFBARBPRIO | F_TXFBARBPRIO,
+			 F_IPV6ENABLE | F_NICMODE);
+	t3_write_reg(adap, A_TP_TX_RESOURCE_LIMIT, 0x18141814);
+	t3_write_reg(adap, A_TP_PARA_REG4, 0x5050105);
+	t3_set_reg_field(adap, A_TP_PARA_REG6, 0,
+			 adap->params.rev > 0 ? F_ENABLEESND :
+			 F_T3A_ENABLEESND);
+
+	t3_set_reg_field(adap, A_TP_PC_CONFIG,
+			 F_ENABLEEPCMDAFULL,
+			 F_ENABLEOCSPIFULL |F_TXDEFERENABLE | F_HEARBEATDACK |
+			 F_TXCONGESTIONMODE | F_RXCONGESTIONMODE);
+	t3_set_reg_field(adap, A_TP_PC_CONFIG2, F_CHDRAFULL,
+			 F_ENABLEIPV6RSS | F_ENABLENONOFDTNLSYN |
+			 F_ENABLEARPMISS | F_DISBLEDAPARBIT0);
+	t3_write_reg(adap, A_TP_PROXY_FLOW_CNTL, 1080);
+	t3_write_reg(adap, A_TP_PROXY_FLOW_CNTL, 1000);
+
+	if (adap->params.rev > 0) {
+		tp_wr_indirect(adap, A_TP_EGRESS_CONFIG, F_REWRITEFORCETOSIZE);
+		t3_set_reg_field(adap, A_TP_PARA_REG3, F_TXPACEAUTO,
+				 F_TXPACEAUTO);
+		t3_set_reg_field(adap, A_TP_PC_CONFIG, F_LOCKTID, F_LOCKTID);
+		t3_set_reg_field(adap, A_TP_PARA_REG3, 0, F_TXPACEAUTOSTRICT);
+	} else
+		t3_set_reg_field(adap, A_TP_PARA_REG3, 0, F_TXPACEFIXED);
+
+	if (adap->params.rev == T3_REV_C)
+		t3_set_reg_field(adap, A_TP_PC_CONFIG,
+				 V_TABLELATENCYDELTA(M_TABLELATENCYDELTA),
+				 V_TABLELATENCYDELTA(4));
+
+	t3_write_reg(adap, A_TP_TX_MOD_QUEUE_WEIGHT1, 0);
+	t3_write_reg(adap, A_TP_TX_MOD_QUEUE_WEIGHT0, 0);
+	t3_write_reg(adap, A_TP_MOD_CHANNEL_WEIGHT, 0);
+	t3_write_reg(adap, A_TP_MOD_RATE_LIMIT, 0xf2200000);
+}
+
+/* Desired TP timer resolution in usec */
+#define TP_TMR_RES 50
+
+/* TCP timer values in ms */
+#define TP_DACK_TIMER 50
+#define TP_RTO_MIN    250
+
+/**
+ *	tp_set_timers - set TP timing parameters
+ *	@adap: the adapter to set
+ *	@core_clk: the core clock frequency in Hz
+ *
+ *	Set TP's timing parameters, such as the various timer resolutions and
+ *	the TCP timer values.
+ */
+static void tp_set_timers(struct adapter *adap, unsigned int core_clk)
+{
+	unsigned int tre = fls(core_clk / (1000000 / TP_TMR_RES)) - 1;
+	unsigned int dack_re = fls(core_clk / 5000) - 1;	/* 200us */
+	unsigned int tstamp_re = fls(core_clk / 1000);	/* 1ms, at least */
+	unsigned int tps = core_clk >> tre;
+
+	t3_write_reg(adap, A_TP_TIMER_RESOLUTION, V_TIMERRESOLUTION(tre) |
+		     V_DELAYEDACKRESOLUTION(dack_re) |
+		     V_TIMESTAMPRESOLUTION(tstamp_re));
+	t3_write_reg(adap, A_TP_DACK_TIMER,
+		     (core_clk >> dack_re) / (1000 / TP_DACK_TIMER));
+	t3_write_reg(adap, A_TP_TCP_BACKOFF_REG0, 0x3020100);
+	t3_write_reg(adap, A_TP_TCP_BACKOFF_REG1, 0x7060504);
+	t3_write_reg(adap, A_TP_TCP_BACKOFF_REG2, 0xb0a0908);
+	t3_write_reg(adap, A_TP_TCP_BACKOFF_REG3, 0xf0e0d0c);
+	t3_write_reg(adap, A_TP_SHIFT_CNT, V_SYNSHIFTMAX(6) |
+		     V_RXTSHIFTMAXR1(4) | V_RXTSHIFTMAXR2(15) |
+		     V_PERSHIFTBACKOFFMAX(8) | V_PERSHIFTMAX(8) |
+		     V_KEEPALIVEMAX(9));
+
+#define SECONDS * tps
+
+	t3_write_reg(adap, A_TP_MSL, adap->params.rev > 0 ? 0 : 2 SECONDS);
+	t3_write_reg(adap, A_TP_RXT_MIN, tps / (1000 / TP_RTO_MIN));
+	t3_write_reg(adap, A_TP_RXT_MAX, 64 SECONDS);
+	t3_write_reg(adap, A_TP_PERS_MIN, 5 SECONDS);
+	t3_write_reg(adap, A_TP_PERS_MAX, 64 SECONDS);
+	t3_write_reg(adap, A_TP_KEEP_IDLE, 7200 SECONDS);
+	t3_write_reg(adap, A_TP_KEEP_INTVL, 75 SECONDS);
+	t3_write_reg(adap, A_TP_INIT_SRTT, 3 SECONDS);
+	t3_write_reg(adap, A_TP_FINWAIT2_TIMER, 600 SECONDS);
+
+#undef SECONDS
+}
+
+/**
+ *	t3_tp_set_coalescing_size - set receive coalescing size
+ *	@adap: the adapter
+ *	@size: the receive coalescing size
+ *	@psh: whether a set PSH bit should deliver coalesced data
+ *
+ *	Set the receive coalescing size and PSH bit handling.
+ */
+static int t3_tp_set_coalescing_size(struct adapter *adap,
+				     unsigned int size, int psh)
+{
+	u32 val;
+
+	if (size > MAX_RX_COALESCING_LEN)
+		return -EINVAL;
+
+	val = t3_read_reg(adap, A_TP_PARA_REG3);
+	val &= ~(F_RXCOALESCEENABLE | F_RXCOALESCEPSHEN);
+
+	if (size) {
+		val |= F_RXCOALESCEENABLE;
+		if (psh)
+			val |= F_RXCOALESCEPSHEN;
+		size = min(MAX_RX_COALESCING_LEN, size);
+		t3_write_reg(adap, A_TP_PARA_REG2, V_RXCOALESCESIZE(size) |
+			     V_MAXRXDATA(MAX_RX_COALESCING_LEN));
+	}
+	t3_write_reg(adap, A_TP_PARA_REG3, val);
+	return 0;
+}
+
+/**
+ *	t3_tp_set_max_rxsize - set the max receive size
+ *	@adap: the adapter
+ *	@size: the max receive size
+ *
+ *	Set TP's max receive size.  This is the limit that applies when
+ *	receive coalescing is disabled.
+ */
+static void t3_tp_set_max_rxsize(struct adapter *adap, unsigned int size)
+{
+	t3_write_reg(adap, A_TP_PARA_REG7,
+		     V_PMMAXXFERLEN0(size) | V_PMMAXXFERLEN1(size));
+}
+
+static void init_mtus(unsigned short mtus[])
+{
+	/*
+	 * See draft-mathis-plpmtud-00.txt for the values.  The min is 88 so
+	 * it can accommodate max size TCP/IP headers when SACK and timestamps
+	 * are enabled and still have at least 8 bytes of payload.
+	 */
+	mtus[0] = 88;
+	mtus[1] = 88;
+	mtus[2] = 256;
+	mtus[3] = 512;
+	mtus[4] = 576;
+	mtus[5] = 1024;
+	mtus[6] = 1280;
+	mtus[7] = 1492;
+	mtus[8] = 1500;
+	mtus[9] = 2002;
+	mtus[10] = 2048;
+	mtus[11] = 4096;
+	mtus[12] = 4352;
+	mtus[13] = 8192;
+	mtus[14] = 9000;
+	mtus[15] = 9600;
+}
+
+/*
+ * Initial congestion control parameters.
+ */
+static void init_cong_ctrl(unsigned short *a, unsigned short *b)
+{
+	a[0] = a[1] = a[2] = a[3] = a[4] = a[5] = a[6] = a[7] = a[8] = 1;
+	a[9] = 2;
+	a[10] = 3;
+	a[11] = 4;
+	a[12] = 5;
+	a[13] = 6;
+	a[14] = 7;
+	a[15] = 8;
+	a[16] = 9;
+	a[17] = 10;
+	a[18] = 14;
+	a[19] = 17;
+	a[20] = 21;
+	a[21] = 25;
+	a[22] = 30;
+	a[23] = 35;
+	a[24] = 45;
+	a[25] = 60;
+	a[26] = 80;
+	a[27] = 100;
+	a[28] = 200;
+	a[29] = 300;
+	a[30] = 400;
+	a[31] = 500;
+
+	b[0] = b[1] = b[2] = b[3] = b[4] = b[5] = b[6] = b[7] = b[8] = 0;
+	b[9] = b[10] = 1;
+	b[11] = b[12] = 2;
+	b[13] = b[14] = b[15] = b[16] = 3;
+	b[17] = b[18] = b[19] = b[20] = b[21] = 4;
+	b[22] = b[23] = b[24] = b[25] = b[26] = b[27] = 5;
+	b[28] = b[29] = 6;
+	b[30] = b[31] = 7;
+}
+
+/* The minimum additive increment value for the congestion control table */
+#define CC_MIN_INCR 2U
+
+/**
+ *	t3_load_mtus - write the MTU and congestion control HW tables
+ *	@adap: the adapter
+ *	@mtus: the unrestricted values for the MTU table
+ *	@alphs: the values for the congestion control alpha parameter
+ *	@beta: the values for the congestion control beta parameter
+ *	@mtu_cap: the maximum permitted effective MTU
+ *
+ *	Write the MTU table with the supplied MTUs capping each at &mtu_cap.
+ *	Update the high-speed congestion control table with the supplied alpha,
+ * 	beta, and MTUs.
+ */
+void t3_load_mtus(struct adapter *adap, unsigned short mtus[NMTUS],
+		  unsigned short alpha[NCCTRL_WIN],
+		  unsigned short beta[NCCTRL_WIN], unsigned short mtu_cap)
+{
+	static const unsigned int avg_pkts[NCCTRL_WIN] = {
+		2, 6, 10, 14, 20, 28, 40, 56, 80, 112, 160, 224, 320, 448, 640,
+		896, 1281, 1792, 2560, 3584, 5120, 7168, 10240, 14336, 20480,
+		28672, 40960, 57344, 81920, 114688, 163840, 229376
+	};
+
+	unsigned int i, w;
+
+	for (i = 0; i < NMTUS; ++i) {
+		unsigned int mtu = min(mtus[i], mtu_cap);
+		unsigned int log2 = fls(mtu);
+
+		if (!(mtu & ((1 << log2) >> 2)))	/* round */
+			log2--;
+		t3_write_reg(adap, A_TP_MTU_TABLE,
+			     (i << 24) | (log2 << 16) | mtu);
+
+		for (w = 0; w < NCCTRL_WIN; ++w) {
+			unsigned int inc;
+
+			inc = max(((mtu - 40) * alpha[w]) / avg_pkts[w],
+				  CC_MIN_INCR);
+
+			t3_write_reg(adap, A_TP_CCTRL_TABLE, (i << 21) |
+				     (w << 16) | (beta[w] << 13) | inc);
+		}
+	}
+}
+
+/**
+ *	t3_tp_get_mib_stats - read TP's MIB counters
+ *	@adap: the adapter
+ *	@tps: holds the returned counter values
+ *
+ *	Returns the values of TP's MIB counters.
+ */
+void t3_tp_get_mib_stats(struct adapter *adap, struct tp_mib_stats *tps)
+{
+	t3_read_indirect(adap, A_TP_MIB_INDEX, A_TP_MIB_RDATA, (u32 *) tps,
+			 sizeof(*tps) / sizeof(u32), 0);
+}
+
+#define ulp_region(adap, name, start, len) \
+	t3_write_reg((adap), A_ULPRX_ ## name ## _LLIMIT, (start)); \
+	t3_write_reg((adap), A_ULPRX_ ## name ## _ULIMIT, \
+		     (start) + (len) - 1); \
+	start += len
+
+#define ulptx_region(adap, name, start, len) \
+	t3_write_reg((adap), A_ULPTX_ ## name ## _LLIMIT, (start)); \
+	t3_write_reg((adap), A_ULPTX_ ## name ## _ULIMIT, \
+		     (start) + (len) - 1)
+
+static void ulp_config(struct adapter *adap, const struct tp_params *p)
+{
+	unsigned int m = p->chan_rx_size;
+
+	ulp_region(adap, ISCSI, m, p->chan_rx_size / 8);
+	ulp_region(adap, TDDP, m, p->chan_rx_size / 8);
+	ulptx_region(adap, TPT, m, p->chan_rx_size / 4);
+	ulp_region(adap, STAG, m, p->chan_rx_size / 4);
+	ulp_region(adap, RQ, m, p->chan_rx_size / 4);
+	ulptx_region(adap, PBL, m, p->chan_rx_size / 4);
+	ulp_region(adap, PBL, m, p->chan_rx_size / 4);
+	t3_write_reg(adap, A_ULPRX_TDDP_TAGMASK, 0xffffffff);
+}
+
+/**
+ *	t3_set_proto_sram - set the contents of the protocol sram
+ *	@adapter: the adapter
+ *	@data: the protocol image
+ *
+ *	Write the contents of the protocol SRAM.
+ */
+int t3_set_proto_sram(struct adapter *adap, const u8 *data)
+{
+	int i;
+	const __be32 *buf = (const __be32 *)data;
+
+	for (i = 0; i < PROTO_SRAM_LINES; i++) {
+		t3_write_reg(adap, A_TP_EMBED_OP_FIELD5, be32_to_cpu(*buf++));
+		t3_write_reg(adap, A_TP_EMBED_OP_FIELD4, be32_to_cpu(*buf++));
+		t3_write_reg(adap, A_TP_EMBED_OP_FIELD3, be32_to_cpu(*buf++));
+		t3_write_reg(adap, A_TP_EMBED_OP_FIELD2, be32_to_cpu(*buf++));
+		t3_write_reg(adap, A_TP_EMBED_OP_FIELD1, be32_to_cpu(*buf++));
+
+		t3_write_reg(adap, A_TP_EMBED_OP_FIELD0, i << 1 | 1 << 31);
+		if (t3_wait_op_done(adap, A_TP_EMBED_OP_FIELD0, 1, 1, 5, 1))
+			return -EIO;
+	}
+	t3_write_reg(adap, A_TP_EMBED_OP_FIELD0, 0);
+
+	return 0;
+}
+
+void t3_config_trace_filter(struct adapter *adapter,
+			    const struct trace_params *tp, int filter_index,
+			    int invert, int enable)
+{
+	u32 addr, key[4], mask[4];
+
+	key[0] = tp->sport | (tp->sip << 16);
+	key[1] = (tp->sip >> 16) | (tp->dport << 16);
+	key[2] = tp->dip;
+	key[3] = tp->proto | (tp->vlan << 8) | (tp->intf << 20);
+
+	mask[0] = tp->sport_mask | (tp->sip_mask << 16);
+	mask[1] = (tp->sip_mask >> 16) | (tp->dport_mask << 16);
+	mask[2] = tp->dip_mask;
+	mask[3] = tp->proto_mask | (tp->vlan_mask << 8) | (tp->intf_mask << 20);
+
+	if (invert)
+		key[3] |= (1 << 29);
+	if (enable)
+		key[3] |= (1 << 28);
+
+	addr = filter_index ? A_TP_RX_TRC_KEY0 : A_TP_TX_TRC_KEY0;
+	tp_wr_indirect(adapter, addr++, key[0]);
+	tp_wr_indirect(adapter, addr++, mask[0]);
+	tp_wr_indirect(adapter, addr++, key[1]);
+	tp_wr_indirect(adapter, addr++, mask[1]);
+	tp_wr_indirect(adapter, addr++, key[2]);
+	tp_wr_indirect(adapter, addr++, mask[2]);
+	tp_wr_indirect(adapter, addr++, key[3]);
+	tp_wr_indirect(adapter, addr, mask[3]);
+	t3_read_reg(adapter, A_TP_PIO_DATA);
+}
+
+/**
+ *	t3_config_sched - configure a HW traffic scheduler
+ *	@adap: the adapter
+ *	@kbps: target rate in Kbps
+ *	@sched: the scheduler index
+ *
+ *	Configure a HW scheduler for the target rate
+ */
+int t3_config_sched(struct adapter *adap, unsigned int kbps, int sched)
+{
+	unsigned int v, tps, cpt, bpt, delta, mindelta = ~0;
+	unsigned int clk = adap->params.vpd.cclk * 1000;
+	unsigned int selected_cpt = 0, selected_bpt = 0;
+
+	if (kbps > 0) {
+		kbps *= 125;	/* -> bytes */
+		for (cpt = 1; cpt <= 255; cpt++) {
+			tps = clk / cpt;
+			bpt = (kbps + tps / 2) / tps;
+			if (bpt > 0 && bpt <= 255) {
+				v = bpt * tps;
+				delta = v >= kbps ? v - kbps : kbps - v;
+				if (delta <= mindelta) {
+					mindelta = delta;
+					selected_cpt = cpt;
+					selected_bpt = bpt;
+				}
+			} else if (selected_cpt)
+				break;
+		}
+		if (!selected_cpt)
+			return -EINVAL;
+	}
+	t3_write_reg(adap, A_TP_TM_PIO_ADDR,
+		     A_TP_TX_MOD_Q1_Q0_RATE_LIMIT - sched / 2);
+	v = t3_read_reg(adap, A_TP_TM_PIO_DATA);
+	if (sched & 1)
+		v = (v & 0xffff) | (selected_cpt << 16) | (selected_bpt << 24);
+	else
+		v = (v & 0xffff0000) | selected_cpt | (selected_bpt << 8);
+	t3_write_reg(adap, A_TP_TM_PIO_DATA, v);
+	return 0;
+}
+
+static int tp_init(struct adapter *adap, const struct tp_params *p)
+{
+	int busy = 0;
+
+	tp_config(adap, p);
+	t3_set_vlan_accel(adap, 3, 0);
+
+	if (is_offload(adap)) {
+		tp_set_timers(adap, adap->params.vpd.cclk * 1000);
+		t3_write_reg(adap, A_TP_RESET, F_FLSTINITENABLE);
+		busy = t3_wait_op_done(adap, A_TP_RESET, F_FLSTINITENABLE,
+				       0, 1000, 5);
+		if (busy)
+			CH_ERR(adap, "TP initialization timed out\n");
+	}
+
+	if (!busy)
+		t3_write_reg(adap, A_TP_RESET, F_TPRESET);
+	return busy;
+}
+
+/*
+ * Perform the bits of HW initialization that are dependent on the Tx
+ * channels being used.
+ */
+static void chan_init_hw(struct adapter *adap, unsigned int chan_map)
+{
+	int i;
+
+	if (chan_map != 3) {                                 /* one channel */
+		t3_set_reg_field(adap, A_ULPRX_CTL, F_ROUND_ROBIN, 0);
+		t3_set_reg_field(adap, A_ULPTX_CONFIG, F_CFG_RR_ARB, 0);
+		t3_write_reg(adap, A_MPS_CFG, F_TPRXPORTEN | F_ENFORCEPKT |
+			     (chan_map == 1 ? F_TPTXPORT0EN | F_PORT0ACTIVE :
+					      F_TPTXPORT1EN | F_PORT1ACTIVE));
+		t3_write_reg(adap, A_PM1_TX_CFG,
+			     chan_map == 1 ? 0xffffffff : 0);
+	} else {                                             /* two channels */
+		t3_set_reg_field(adap, A_ULPRX_CTL, 0, F_ROUND_ROBIN);
+		t3_set_reg_field(adap, A_ULPTX_CONFIG, 0, F_CFG_RR_ARB);
+		t3_write_reg(adap, A_ULPTX_DMA_WEIGHT,
+			     V_D1_WEIGHT(16) | V_D0_WEIGHT(16));
+		t3_write_reg(adap, A_MPS_CFG, F_TPTXPORT0EN | F_TPTXPORT1EN |
+			     F_TPRXPORTEN | F_PORT0ACTIVE | F_PORT1ACTIVE |
+			     F_ENFORCEPKT);
+		t3_write_reg(adap, A_PM1_TX_CFG, 0x80008000);
+		t3_set_reg_field(adap, A_TP_PC_CONFIG, 0, F_TXTOSQUEUEMAPMODE);
+		t3_write_reg(adap, A_TP_TX_MOD_QUEUE_REQ_MAP,
+			     V_TX_MOD_QUEUE_REQ_MAP(0xaa));
+		for (i = 0; i < 16; i++)
+			t3_write_reg(adap, A_TP_TX_MOD_QUE_TABLE,
+				     (i << 16) | 0x1010);
+	}
+}
+
+static int calibrate_xgm(struct adapter *adapter)
+{
+	if (uses_xaui(adapter)) {
+		unsigned int v, i;
+
+		for (i = 0; i < 5; ++i) {
+			t3_write_reg(adapter, A_XGM_XAUI_IMP, 0);
+			t3_read_reg(adapter, A_XGM_XAUI_IMP);
+			msleep(1);
+			v = t3_read_reg(adapter, A_XGM_XAUI_IMP);
+			if (!(v & (F_XGM_CALFAULT | F_CALBUSY))) {
+				t3_write_reg(adapter, A_XGM_XAUI_IMP,
+					     V_XAUIIMP(G_CALIMP(v) >> 2));
+				return 0;
+			}
+		}
+		CH_ERR(adapter, "MAC calibration failed\n");
+		return -1;
+	} else {
+		t3_write_reg(adapter, A_XGM_RGMII_IMP,
+			     V_RGMIIIMPPD(2) | V_RGMIIIMPPU(3));
+		t3_set_reg_field(adapter, A_XGM_RGMII_IMP, F_XGM_IMPSETUPDATE,
+				 F_XGM_IMPSETUPDATE);
+	}
+	return 0;
+}
+
+static void calibrate_xgm_t3b(struct adapter *adapter)
+{
+	if (!uses_xaui(adapter)) {
+		t3_write_reg(adapter, A_XGM_RGMII_IMP, F_CALRESET |
+			     F_CALUPDATE | V_RGMIIIMPPD(2) | V_RGMIIIMPPU(3));
+		t3_set_reg_field(adapter, A_XGM_RGMII_IMP, F_CALRESET, 0);
+		t3_set_reg_field(adapter, A_XGM_RGMII_IMP, 0,
+				 F_XGM_IMPSETUPDATE);
+		t3_set_reg_field(adapter, A_XGM_RGMII_IMP, F_XGM_IMPSETUPDATE,
+				 0);
+		t3_set_reg_field(adapter, A_XGM_RGMII_IMP, F_CALUPDATE, 0);
+		t3_set_reg_field(adapter, A_XGM_RGMII_IMP, 0, F_CALUPDATE);
+	}
+}
+
+struct mc7_timing_params {
+	unsigned char ActToPreDly;
+	unsigned char ActToRdWrDly;
+	unsigned char PreCyc;
+	unsigned char RefCyc[5];
+	unsigned char BkCyc;
+	unsigned char WrToRdDly;
+	unsigned char RdToWrDly;
+};
+
+/*
+ * Write a value to a register and check that the write completed.  These
+ * writes normally complete in a cycle or two, so one read should suffice.
+ * The very first read exists to flush the posted write to the device.
+ */
+static int wrreg_wait(struct adapter *adapter, unsigned int addr, u32 val)
+{
+	t3_write_reg(adapter, addr, val);
+	t3_read_reg(adapter, addr);	/* flush */
+	if (!(t3_read_reg(adapter, addr) & F_BUSY))
+		return 0;
+	CH_ERR(adapter, "write to MC7 register 0x%x timed out\n", addr);
+	return -EIO;
+}
+
+static int mc7_init(struct mc7 *mc7, unsigned int mc7_clock, int mem_type)
+{
+	static const unsigned int mc7_mode[] = {
+		0x632, 0x642, 0x652, 0x432, 0x442
+	};
+	static const struct mc7_timing_params mc7_timings[] = {
+		{12, 3, 4, {20, 28, 34, 52, 0}, 15, 6, 4},
+		{12, 4, 5, {20, 28, 34, 52, 0}, 16, 7, 4},
+		{12, 5, 6, {20, 28, 34, 52, 0}, 17, 8, 4},
+		{9, 3, 4, {15, 21, 26, 39, 0}, 12, 6, 4},
+		{9, 4, 5, {15, 21, 26, 39, 0}, 13, 7, 4}
+	};
+
+	u32 val;
+	unsigned int width, density, slow, attempts;
+	struct adapter *adapter = mc7->adapter;
+	const struct mc7_timing_params *p = &mc7_timings[mem_type];
+
+	if (!mc7->size)
+		return 0;
+
+	val = t3_read_reg(adapter, mc7->offset + A_MC7_CFG);
+	slow = val & F_SLOW;
+	width = G_WIDTH(val);
+	density = G_DEN(val);
+
+	t3_write_reg(adapter, mc7->offset + A_MC7_CFG, val | F_IFEN);
+	val = t3_read_reg(adapter, mc7->offset + A_MC7_CFG);	/* flush */
+	msleep(1);
+
+	if (!slow) {
+		t3_write_reg(adapter, mc7->offset + A_MC7_CAL, F_SGL_CAL_EN);
+		t3_read_reg(adapter, mc7->offset + A_MC7_CAL);
+		msleep(1);
+		if (t3_read_reg(adapter, mc7->offset + A_MC7_CAL) &
+		    (F_BUSY | F_SGL_CAL_EN | F_CAL_FAULT)) {
+			CH_ERR(adapter, "%s MC7 calibration timed out\n",
+			       mc7->name);
+			goto out_fail;
+		}
+	}
+
+	t3_write_reg(adapter, mc7->offset + A_MC7_PARM,
+		     V_ACTTOPREDLY(p->ActToPreDly) |
+		     V_ACTTORDWRDLY(p->ActToRdWrDly) | V_PRECYC(p->PreCyc) |
+		     V_REFCYC(p->RefCyc[density]) | V_BKCYC(p->BkCyc) |
+		     V_WRTORDDLY(p->WrToRdDly) | V_RDTOWRDLY(p->RdToWrDly));
+
+	t3_write_reg(adapter, mc7->offset + A_MC7_CFG,
+		     val | F_CLKEN | F_TERM150);
+	t3_read_reg(adapter, mc7->offset + A_MC7_CFG);	/* flush */
+
+	if (!slow)
+		t3_set_reg_field(adapter, mc7->offset + A_MC7_DLL, F_DLLENB,
+				 F_DLLENB);
+	udelay(1);
+
+	val = slow ? 3 : 6;
+	if (wrreg_wait(adapter, mc7->offset + A_MC7_PRE, 0) ||
+	    wrreg_wait(adapter, mc7->offset + A_MC7_EXT_MODE2, 0) ||
+	    wrreg_wait(adapter, mc7->offset + A_MC7_EXT_MODE3, 0) ||
+	    wrreg_wait(adapter, mc7->offset + A_MC7_EXT_MODE1, val))
+		goto out_fail;
+
+	if (!slow) {
+		t3_write_reg(adapter, mc7->offset + A_MC7_MODE, 0x100);
+		t3_set_reg_field(adapter, mc7->offset + A_MC7_DLL, F_DLLRST, 0);
+		udelay(5);
+	}
+
+	if (wrreg_wait(adapter, mc7->offset + A_MC7_PRE, 0) ||
+	    wrreg_wait(adapter, mc7->offset + A_MC7_REF, 0) ||
+	    wrreg_wait(adapter, mc7->offset + A_MC7_REF, 0) ||
+	    wrreg_wait(adapter, mc7->offset + A_MC7_MODE,
+		       mc7_mode[mem_type]) ||
+	    wrreg_wait(adapter, mc7->offset + A_MC7_EXT_MODE1, val | 0x380) ||
+	    wrreg_wait(adapter, mc7->offset + A_MC7_EXT_MODE1, val))
+		goto out_fail;
+
+	/* clock value is in KHz */
+	mc7_clock = mc7_clock * 7812 + mc7_clock / 2;	/* ns */
+	mc7_clock /= 1000000;	/* KHz->MHz, ns->us */
+
+	t3_write_reg(adapter, mc7->offset + A_MC7_REF,
+		     F_PERREFEN | V_PREREFDIV(mc7_clock));
+	t3_read_reg(adapter, mc7->offset + A_MC7_REF);	/* flush */
+
+	t3_write_reg(adapter, mc7->offset + A_MC7_ECC, F_ECCGENEN | F_ECCCHKEN);
+	t3_write_reg(adapter, mc7->offset + A_MC7_BIST_DATA, 0);
+	t3_write_reg(adapter, mc7->offset + A_MC7_BIST_ADDR_BEG, 0);
+	t3_write_reg(adapter, mc7->offset + A_MC7_BIST_ADDR_END,
+		     (mc7->size << width) - 1);
+	t3_write_reg(adapter, mc7->offset + A_MC7_BIST_OP, V_OP(1));
+	t3_read_reg(adapter, mc7->offset + A_MC7_BIST_OP);	/* flush */
+
+	attempts = 50;
+	do {
+		msleep(250);
+		val = t3_read_reg(adapter, mc7->offset + A_MC7_BIST_OP);
+	} while ((val & F_BUSY) && --attempts);
+	if (val & F_BUSY) {
+		CH_ERR(adapter, "%s MC7 BIST timed out\n", mc7->name);
+		goto out_fail;
+	}
+
+	/* Enable normal memory accesses. */
+	t3_set_reg_field(adapter, mc7->offset + A_MC7_CFG, 0, F_RDY);
+	return 0;
+
+out_fail:
+	return -1;
+}
+
+static void config_pcie(struct adapter *adap)
+{
+	static const u16 ack_lat[4][6] = {
+		{237, 416, 559, 1071, 2095, 4143},
+		{128, 217, 289, 545, 1057, 2081},
+		{73, 118, 154, 282, 538, 1050},
+		{67, 107, 86, 150, 278, 534}
+	};
+	static const u16 rpl_tmr[4][6] = {
+		{711, 1248, 1677, 3213, 6285, 12429},
+		{384, 651, 867, 1635, 3171, 6243},
+		{219, 354, 462, 846, 1614, 3150},
+		{201, 321, 258, 450, 834, 1602}
+	};
+
+	u16 val, devid;
+	unsigned int log2_width, pldsize;
+	unsigned int fst_trn_rx, fst_trn_tx, acklat, rpllmt;
+
+	pcie_capability_read_word(adap->pdev, PCI_EXP_DEVCTL, &val);
+	pldsize = (val & PCI_EXP_DEVCTL_PAYLOAD) >> 5;
+
+	pci_read_config_word(adap->pdev, 0x2, &devid);
+	if (devid == 0x37) {
+		pcie_capability_write_word(adap->pdev, PCI_EXP_DEVCTL,
+					   val & ~PCI_EXP_DEVCTL_READRQ &
+					   ~PCI_EXP_DEVCTL_PAYLOAD);
+		pldsize = 0;
+	}
+
+	pcie_capability_read_word(adap->pdev, PCI_EXP_LNKCTL, &val);
+
+	fst_trn_tx = G_NUMFSTTRNSEQ(t3_read_reg(adap, A_PCIE_PEX_CTRL0));
+	fst_trn_rx = adap->params.rev == 0 ? fst_trn_tx :
+	    G_NUMFSTTRNSEQRX(t3_read_reg(adap, A_PCIE_MODE));
+	log2_width = fls(adap->params.pci.width) - 1;
+	acklat = ack_lat[log2_width][pldsize];
+	if (val & PCI_EXP_LNKCTL_ASPM_L0S)	/* check LOsEnable */
+		acklat += fst_trn_tx * 4;
+	rpllmt = rpl_tmr[log2_width][pldsize] + fst_trn_rx * 4;
+
+	if (adap->params.rev == 0)
+		t3_set_reg_field(adap, A_PCIE_PEX_CTRL1,
+				 V_T3A_ACKLAT(M_T3A_ACKLAT),
+				 V_T3A_ACKLAT(acklat));
+	else
+		t3_set_reg_field(adap, A_PCIE_PEX_CTRL1, V_ACKLAT(M_ACKLAT),
+				 V_ACKLAT(acklat));
+
+	t3_set_reg_field(adap, A_PCIE_PEX_CTRL0, V_REPLAYLMT(M_REPLAYLMT),
+			 V_REPLAYLMT(rpllmt));
+
+	t3_write_reg(adap, A_PCIE_PEX_ERR, 0xffffffff);
+	t3_set_reg_field(adap, A_PCIE_CFG, 0,
+			 F_ENABLELINKDWNDRST | F_ENABLELINKDOWNRST |
+			 F_PCIE_DMASTOPEN | F_PCIE_CLIDECEN);
+}
+
+/*
+ * Initialize and configure T3 HW modules.  This performs the
+ * initialization steps that need to be done once after a card is reset.
+ * MAC and PHY initialization is handled separarely whenever a port is enabled.
+ *
+ * fw_params are passed to FW and their value is platform dependent.  Only the
+ * top 8 bits are available for use, the rest must be 0.
+ */
+int t3_init_hw(struct adapter *adapter, u32 fw_params)
+{
+	int err = -EIO, attempts, i;
+	const struct vpd_params *vpd = &adapter->params.vpd;
+
+	if (adapter->params.rev > 0)
+		calibrate_xgm_t3b(adapter);
+	else if (calibrate_xgm(adapter))
+		goto out_err;
+
+	if (vpd->mclk) {
+		partition_mem(adapter, &adapter->params.tp);
+
+		if (mc7_init(&adapter->pmrx, vpd->mclk, vpd->mem_timing) ||
+		    mc7_init(&adapter->pmtx, vpd->mclk, vpd->mem_timing) ||
+		    mc7_init(&adapter->cm, vpd->mclk, vpd->mem_timing) ||
+		    t3_mc5_init(&adapter->mc5, adapter->params.mc5.nservers,
+				adapter->params.mc5.nfilters,
+				adapter->params.mc5.nroutes))
+			goto out_err;
+
+		for (i = 0; i < 32; i++)
+			if (clear_sge_ctxt(adapter, i, F_CQ))
+				goto out_err;
+	}
+
+	if (tp_init(adapter, &adapter->params.tp))
+		goto out_err;
+
+	t3_tp_set_coalescing_size(adapter,
+				  min(adapter->params.sge.max_pkt_size,
+				      MAX_RX_COALESCING_LEN), 1);
+	t3_tp_set_max_rxsize(adapter,
+			     min(adapter->params.sge.max_pkt_size, 16384U));
+	ulp_config(adapter, &adapter->params.tp);
+
+	if (is_pcie(adapter))
+		config_pcie(adapter);
+	else
+		t3_set_reg_field(adapter, A_PCIX_CFG, 0,
+				 F_DMASTOPEN | F_CLIDECEN);
+
+	if (adapter->params.rev == T3_REV_C)
+		t3_set_reg_field(adapter, A_ULPTX_CONFIG, 0,
+				 F_CFG_CQE_SOP_MASK);
+
+	t3_write_reg(adapter, A_PM1_RX_CFG, 0xffffffff);
+	t3_write_reg(adapter, A_PM1_RX_MODE, 0);
+	t3_write_reg(adapter, A_PM1_TX_MODE, 0);
+	chan_init_hw(adapter, adapter->params.chan_map);
+	t3_sge_init(adapter, &adapter->params.sge);
+	t3_set_reg_field(adapter, A_PL_RST, 0, F_FATALPERREN);
+
+	t3_write_reg(adapter, A_T3DBG_GPIO_ACT_LOW, calc_gpio_intr(adapter));
+
+	t3_write_reg(adapter, A_CIM_HOST_ACC_DATA, vpd->uclk | fw_params);
+	t3_write_reg(adapter, A_CIM_BOOT_CFG,
+		     V_BOOTADDR(FW_FLASH_BOOT_ADDR >> 2));
+	t3_read_reg(adapter, A_CIM_BOOT_CFG);	/* flush */
+
+	attempts = 100;
+	do {			/* wait for uP to initialize */
+		msleep(20);
+	} while (t3_read_reg(adapter, A_CIM_HOST_ACC_DATA) && --attempts);
+	if (!attempts) {
+		CH_ERR(adapter, "uP initialization timed out\n");
+		goto out_err;
+	}
+
+	err = 0;
+out_err:
+	return err;
+}
+
+/**
+ *	get_pci_mode - determine a card's PCI mode
+ *	@adapter: the adapter
+ *	@p: where to store the PCI settings
+ *
+ *	Determines a card's PCI mode and associated parameters, such as speed
+ *	and width.
+ */
+static void get_pci_mode(struct adapter *adapter, struct pci_params *p)
+{
+	static unsigned short speed_map[] = { 33, 66, 100, 133 };
+	u32 pci_mode;
+
+	if (pci_is_pcie(adapter->pdev)) {
+		u16 val;
+
+		p->variant = PCI_VARIANT_PCIE;
+		pcie_capability_read_word(adapter->pdev, PCI_EXP_LNKSTA, &val);
+		p->width = (val >> 4) & 0x3f;
+		return;
+	}
+
+	pci_mode = t3_read_reg(adapter, A_PCIX_MODE);
+	p->speed = speed_map[G_PCLKRANGE(pci_mode)];
+	p->width = (pci_mode & F_64BIT) ? 64 : 32;
+	pci_mode = G_PCIXINITPAT(pci_mode);
+	if (pci_mode == 0)
+		p->variant = PCI_VARIANT_PCI;
+	else if (pci_mode < 4)
+		p->variant = PCI_VARIANT_PCIX_MODE1_PARITY;
+	else if (pci_mode < 8)
+		p->variant = PCI_VARIANT_PCIX_MODE1_ECC;
+	else
+		p->variant = PCI_VARIANT_PCIX_266_MODE2;
+}
+
+/**
+ *	init_link_config - initialize a link's SW state
+ *	@lc: structure holding the link state
+ *	@ai: information about the current card
+ *
+ *	Initializes the SW state maintained for each link, including the link's
+ *	capabilities and default speed/duplex/flow-control/autonegotiation
+ *	settings.
+ */
+static void init_link_config(struct link_config *lc, unsigned int caps)
+{
+	lc->supported = caps;
+	lc->requested_speed = lc->speed = SPEED_INVALID;
+	lc->requested_duplex = lc->duplex = DUPLEX_INVALID;
+	lc->requested_fc = lc->fc = PAUSE_RX | PAUSE_TX;
+	if (lc->supported & SUPPORTED_Autoneg) {
+		lc->advertising = lc->supported;
+		lc->autoneg = AUTONEG_ENABLE;
+		lc->requested_fc |= PAUSE_AUTONEG;
+	} else {
+		lc->advertising = 0;
+		lc->autoneg = AUTONEG_DISABLE;
+	}
+}
+
+/**
+ *	mc7_calc_size - calculate MC7 memory size
+ *	@cfg: the MC7 configuration
+ *
+ *	Calculates the size of an MC7 memory in bytes from the value of its
+ *	configuration register.
+ */
+static unsigned int mc7_calc_size(u32 cfg)
+{
+	unsigned int width = G_WIDTH(cfg);
+	unsigned int banks = !!(cfg & F_BKS) + 1;
+	unsigned int org = !!(cfg & F_ORG) + 1;
+	unsigned int density = G_DEN(cfg);
+	unsigned int MBs = ((256 << density) * banks) / (org << width);
+
+	return MBs << 20;
+}
+
+static void mc7_prep(struct adapter *adapter, struct mc7 *mc7,
+		     unsigned int base_addr, const char *name)
+{
+	u32 cfg;
+
+	mc7->adapter = adapter;
+	mc7->name = name;
+	mc7->offset = base_addr - MC7_PMRX_BASE_ADDR;
+	cfg = t3_read_reg(adapter, mc7->offset + A_MC7_CFG);
+	mc7->size = G_DEN(cfg) == M_DEN ? 0 : mc7_calc_size(cfg);
+	mc7->width = G_WIDTH(cfg);
+}
+
+static void mac_prep(struct cmac *mac, struct adapter *adapter, int index)
+{
+	u16 devid;
+
+	mac->adapter = adapter;
+	pci_read_config_word(adapter->pdev, 0x2, &devid);
+
+	if (devid == 0x37 && !adapter->params.vpd.xauicfg[1])
+		index = 0;
+	mac->offset = (XGMAC0_1_BASE_ADDR - XGMAC0_0_BASE_ADDR) * index;
+	mac->nucast = 1;
+
+	if (adapter->params.rev == 0 && uses_xaui(adapter)) {
+		t3_write_reg(adapter, A_XGM_SERDES_CTRL + mac->offset,
+			     is_10G(adapter) ? 0x2901c04 : 0x2301c04);
+		t3_set_reg_field(adapter, A_XGM_PORT_CFG + mac->offset,
+				 F_ENRGMII, 0);
+	}
+}
+
+static void early_hw_init(struct adapter *adapter,
+			  const struct adapter_info *ai)
+{
+	u32 val = V_PORTSPEED(is_10G(adapter) ? 3 : 2);
+
+	mi1_init(adapter, ai);
+	t3_write_reg(adapter, A_I2C_CFG,	/* set for 80KHz */
+		     V_I2C_CLKDIV(adapter->params.vpd.cclk / 80 - 1));
+	t3_write_reg(adapter, A_T3DBG_GPIO_EN,
+		     ai->gpio_out | F_GPIO0_OEN | F_GPIO0_OUT_VAL);
+	t3_write_reg(adapter, A_MC5_DB_SERVER_INDEX, 0);
+	t3_write_reg(adapter, A_SG_OCO_BASE, V_BASE1(0xfff));
+
+	if (adapter->params.rev == 0 || !uses_xaui(adapter))
+		val |= F_ENRGMII;
+
+	/* Enable MAC clocks so we can access the registers */
+	t3_write_reg(adapter, A_XGM_PORT_CFG, val);
+	t3_read_reg(adapter, A_XGM_PORT_CFG);
+
+	val |= F_CLKDIVRESET_;
+	t3_write_reg(adapter, A_XGM_PORT_CFG, val);
+	t3_read_reg(adapter, A_XGM_PORT_CFG);
+	t3_write_reg(adapter, XGM_REG(A_XGM_PORT_CFG, 1), val);
+	t3_read_reg(adapter, A_XGM_PORT_CFG);
+}
+
+/*
+ * Reset the adapter.
+ * Older PCIe cards lose their config space during reset, PCI-X
+ * ones don't.
+ */
+int t3_reset_adapter(struct adapter *adapter)
+{
+	int i, save_and_restore_pcie =
+	    adapter->params.rev < T3_REV_B2 && is_pcie(adapter);
+	uint16_t devid = 0;
+
+	if (save_and_restore_pcie)
+		pci_save_state(adapter->pdev);
+	t3_write_reg(adapter, A_PL_RST, F_CRSTWRM | F_CRSTWRMMODE);
+
+	/*
+	 * Delay. Give Some time to device to reset fully.
+	 * XXX The delay time should be modified.
+	 */
+	for (i = 0; i < 10; i++) {
+		msleep(50);
+		pci_read_config_word(adapter->pdev, 0x00, &devid);
+		if (devid == 0x1425)
+			break;
+	}
+
+	if (devid != 0x1425)
+		return -1;
+
+	if (save_and_restore_pcie)
+		pci_restore_state(adapter->pdev);
+	return 0;
+}
+
+static int init_parity(struct adapter *adap)
+{
+		int i, err, addr;
+
+	if (t3_read_reg(adap, A_SG_CONTEXT_CMD) & F_CONTEXT_CMD_BUSY)
+		return -EBUSY;
+
+	for (err = i = 0; !err && i < 16; i++)
+		err = clear_sge_ctxt(adap, i, F_EGRESS);
+	for (i = 0xfff0; !err && i <= 0xffff; i++)
+		err = clear_sge_ctxt(adap, i, F_EGRESS);
+	for (i = 0; !err && i < SGE_QSETS; i++)
+		err = clear_sge_ctxt(adap, i, F_RESPONSEQ);
+	if (err)
+		return err;
+
+	t3_write_reg(adap, A_CIM_IBQ_DBG_DATA, 0);
+	for (i = 0; i < 4; i++)
+		for (addr = 0; addr <= M_IBQDBGADDR; addr++) {
+			t3_write_reg(adap, A_CIM_IBQ_DBG_CFG, F_IBQDBGEN |
+				     F_IBQDBGWR | V_IBQDBGQID(i) |
+				     V_IBQDBGADDR(addr));
+			err = t3_wait_op_done(adap, A_CIM_IBQ_DBG_CFG,
+					      F_IBQDBGBUSY, 0, 2, 1);
+			if (err)
+				return err;
+		}
+	return 0;
+}
+
+/*
+ * Initialize adapter SW state for the various HW modules, set initial values
+ * for some adapter tunables, take PHYs out of reset, and initialize the MDIO
+ * interface.
+ */
+int t3_prep_adapter(struct adapter *adapter, const struct adapter_info *ai,
+		    int reset)
+{
+	int ret;
+	unsigned int i, j = -1;
+
+	get_pci_mode(adapter, &adapter->params.pci);
+
+	adapter->params.info = ai;
+	adapter->params.nports = ai->nports0 + ai->nports1;
+	adapter->params.chan_map = (!!ai->nports0) | (!!ai->nports1 << 1);
+	adapter->params.rev = t3_read_reg(adapter, A_PL_REV);
+	/*
+	 * We used to only run the "adapter check task" once a second if
+	 * we had PHYs which didn't support interrupts (we would check
+	 * their link status once a second).  Now we check other conditions
+	 * in that routine which could potentially impose a very high
+	 * interrupt load on the system.  As such, we now always scan the
+	 * adapter state once a second ...
+	 */
+	adapter->params.linkpoll_period = 10;
+	adapter->params.stats_update_period = is_10G(adapter) ?
+	    MAC_STATS_ACCUM_SECS : (MAC_STATS_ACCUM_SECS * 10);
+	adapter->params.pci.vpd_cap_addr =
+	    pci_find_capability(adapter->pdev, PCI_CAP_ID_VPD);
+	ret = get_vpd_params(adapter, &adapter->params.vpd);
+	if (ret < 0)
+		return ret;
+
+	if (reset && t3_reset_adapter(adapter))
+		return -1;
+
+	t3_sge_prep(adapter, &adapter->params.sge);
+
+	if (adapter->params.vpd.mclk) {
+		struct tp_params *p = &adapter->params.tp;
+
+		mc7_prep(adapter, &adapter->pmrx, MC7_PMRX_BASE_ADDR, "PMRX");
+		mc7_prep(adapter, &adapter->pmtx, MC7_PMTX_BASE_ADDR, "PMTX");
+		mc7_prep(adapter, &adapter->cm, MC7_CM_BASE_ADDR, "CM");
+
+		p->nchan = adapter->params.chan_map == 3 ? 2 : 1;
+		p->pmrx_size = t3_mc7_size(&adapter->pmrx);
+		p->pmtx_size = t3_mc7_size(&adapter->pmtx);
+		p->cm_size = t3_mc7_size(&adapter->cm);
+		p->chan_rx_size = p->pmrx_size / 2;	/* only 1 Rx channel */
+		p->chan_tx_size = p->pmtx_size / p->nchan;
+		p->rx_pg_size = 64 * 1024;
+		p->tx_pg_size = is_10G(adapter) ? 64 * 1024 : 16 * 1024;
+		p->rx_num_pgs = pm_num_pages(p->chan_rx_size, p->rx_pg_size);
+		p->tx_num_pgs = pm_num_pages(p->chan_tx_size, p->tx_pg_size);
+		p->ntimer_qs = p->cm_size >= (128 << 20) ||
+		    adapter->params.rev > 0 ? 12 : 6;
+	}
+
+	adapter->params.offload = t3_mc7_size(&adapter->pmrx) &&
+				  t3_mc7_size(&adapter->pmtx) &&
+				  t3_mc7_size(&adapter->cm);
+
+	if (is_offload(adapter)) {
+		adapter->params.mc5.nservers = DEFAULT_NSERVERS;
+		adapter->params.mc5.nfilters = adapter->params.rev > 0 ?
+		    DEFAULT_NFILTERS : 0;
+		adapter->params.mc5.nroutes = 0;
+		t3_mc5_prep(adapter, &adapter->mc5, MC5_MODE_144_BIT);
+
+		init_mtus(adapter->params.mtus);
+		init_cong_ctrl(adapter->params.a_wnd, adapter->params.b_wnd);
+	}
+
+	early_hw_init(adapter, ai);
+	ret = init_parity(adapter);
+	if (ret)
+		return ret;
+
+	for_each_port(adapter, i) {
+		u8 hw_addr[6];
+		const struct port_type_info *pti;
+		struct port_info *p = adap2pinfo(adapter, i);
+
+		while (!adapter->params.vpd.port_type[++j])
+			;
+
+		pti = &port_types[adapter->params.vpd.port_type[j]];
+		if (!pti->phy_prep) {
+			CH_ALERT(adapter, "Invalid port type index %d\n",
+				 adapter->params.vpd.port_type[j]);
+			return -EINVAL;
+		}
+
+		p->phy.mdio.dev = adapter->port[i];
+		ret = pti->phy_prep(&p->phy, adapter, ai->phy_base_addr + j,
+				    ai->mdio_ops);
+		if (ret)
+			return ret;
+		mac_prep(&p->mac, adapter, j);
+
+		/*
+		 * The VPD EEPROM stores the base Ethernet address for the
+		 * card.  A port's address is derived from the base by adding
+		 * the port's index to the base's low octet.
+		 */
+		memcpy(hw_addr, adapter->params.vpd.eth_base, 5);
+		hw_addr[5] = adapter->params.vpd.eth_base[5] + i;
+
+		memcpy(adapter->port[i]->dev_addr, hw_addr,
+		       ETH_ALEN);
+		init_link_config(&p->link_config, p->phy.caps);
+		p->phy.ops->power_down(&p->phy, 1);
+
+		/*
+		 * If the PHY doesn't support interrupts for link status
+		 * changes, schedule a scan of the adapter links at least
+		 * once a second.
+		 */
+		if (!(p->phy.caps & SUPPORTED_IRQ) &&
+		    adapter->params.linkpoll_period > 10)
+			adapter->params.linkpoll_period = 10;
+	}
+
+	return 0;
+}
+
+void t3_led_ready(struct adapter *adapter)
+{
+	t3_set_reg_field(adapter, A_T3DBG_GPIO_EN, F_GPIO0_OUT_VAL,
+			 F_GPIO0_OUT_VAL);
+}
+
+int t3_replay_prep_adapter(struct adapter *adapter)
+{
+	const struct adapter_info *ai = adapter->params.info;
+	unsigned int i, j = -1;
+	int ret;
+
+	early_hw_init(adapter, ai);
+	ret = init_parity(adapter);
+	if (ret)
+		return ret;
+
+	for_each_port(adapter, i) {
+		const struct port_type_info *pti;
+		struct port_info *p = adap2pinfo(adapter, i);
+
+		while (!adapter->params.vpd.port_type[++j])
+			;
+
+		pti = &port_types[adapter->params.vpd.port_type[j]];
+		ret = pti->phy_prep(&p->phy, adapter, p->phy.mdio.prtad, NULL);
+		if (ret)
+			return ret;
+		p->phy.ops->power_down(&p->phy, 1);
+	}
+
+return 0;
+}
+
diff --git a/drivers/net/ethernet/chelsio/cxgb3/t3cdev.h b/drivers/net/ethernet/chelsio/cxgb3/t3cdev.h
new file mode 100644
index 0000000..705713b
--- /dev/null
+++ b/drivers/net/ethernet/chelsio/cxgb3/t3cdev.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (C) 2006-2008 Chelsio Communications.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef _T3CDEV_H_
+#define _T3CDEV_H_
+
+#include <linux/list.h>
+#include <linux/atomic.h>
+#include <linux/netdevice.h>
+#include <linux/proc_fs.h>
+#include <linux/skbuff.h>
+#include <net/neighbour.h>
+
+#define T3CNAMSIZ 16
+
+struct cxgb3_client;
+
+enum t3ctype {
+	T3A = 0,
+	T3B,
+	T3C,
+};
+
+struct t3cdev {
+	char name[T3CNAMSIZ];	/* T3C device name */
+	enum t3ctype type;
+	struct list_head ofld_dev_list;	/* for list linking */
+	struct net_device *lldev;	/* LL dev associated with T3C messages */
+	struct proc_dir_entry *proc_dir;	/* root of proc dir for this T3C */
+	int (*send)(struct t3cdev *dev, struct sk_buff *skb);
+	int (*recv)(struct t3cdev *dev, struct sk_buff **skb, int n);
+	int (*ctl)(struct t3cdev *dev, unsigned int req, void *data);
+	void (*neigh_update)(struct t3cdev *dev, struct neighbour *neigh);
+	void *priv;		/* driver private data */
+	void *l2opt;		/* optional layer 2 data */
+	void *l3opt;		/* optional layer 3 data */
+	void *l4opt;		/* optional layer 4 data */
+	void *ulp;		/* ulp stuff */
+	void *ulp_iscsi;	/* ulp iscsi */
+};
+
+#endif				/* _T3CDEV_H_ */
diff --git a/drivers/net/ethernet/chelsio/cxgb3/version.h b/drivers/net/ethernet/chelsio/cxgb3/version.h
new file mode 100644
index 0000000..165bfb9
--- /dev/null
+++ b/drivers/net/ethernet/chelsio/cxgb3/version.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2003-2008 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/* $Date: 2006/10/31 18:57:51 $ $RCSfile: version.h,v $ $Revision: 1.3 $ */
+#ifndef __CHELSIO_VERSION_H
+#define __CHELSIO_VERSION_H
+#define DRV_DESC "Chelsio T3 Network Driver"
+#define DRV_NAME "cxgb3"
+/* Driver version */
+#define DRV_VERSION "1.1.5-ko"
+
+/* Firmware version */
+#define FW_VERSION_MAJOR 7
+#define FW_VERSION_MINOR 12
+#define FW_VERSION_MICRO 0
+#endif				/* __CHELSIO_VERSION_H */
diff --git a/drivers/net/ethernet/chelsio/cxgb3/vsc8211.c b/drivers/net/ethernet/chelsio/cxgb3/vsc8211.c
new file mode 100644
index 0000000..4f9a1c2
--- /dev/null
+++ b/drivers/net/ethernet/chelsio/cxgb3/vsc8211.c
@@ -0,0 +1,416 @@
+/*
+ * Copyright (c) 2005-2008 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "common.h"
+
+/* VSC8211 PHY specific registers. */
+enum {
+	VSC8211_SIGDET_CTRL = 19,
+	VSC8211_EXT_CTRL = 23,
+	VSC8211_INTR_ENABLE = 25,
+	VSC8211_INTR_STATUS = 26,
+	VSC8211_LED_CTRL = 27,
+	VSC8211_AUX_CTRL_STAT = 28,
+	VSC8211_EXT_PAGE_AXS = 31,
+};
+
+enum {
+	VSC_INTR_RX_ERR = 1 << 0,
+	VSC_INTR_MS_ERR = 1 << 1,  /* master/slave resolution error */
+	VSC_INTR_CABLE = 1 << 2,  /* cable impairment */
+	VSC_INTR_FALSE_CARR = 1 << 3,  /* false carrier */
+	VSC_INTR_MEDIA_CHG = 1 << 4,  /* AMS media change */
+	VSC_INTR_RX_FIFO = 1 << 5,  /* Rx FIFO over/underflow */
+	VSC_INTR_TX_FIFO = 1 << 6,  /* Tx FIFO over/underflow */
+	VSC_INTR_DESCRAMBL = 1 << 7,  /* descrambler lock-lost */
+	VSC_INTR_SYMBOL_ERR = 1 << 8,  /* symbol error */
+	VSC_INTR_NEG_DONE = 1 << 10, /* autoneg done */
+	VSC_INTR_NEG_ERR = 1 << 11, /* autoneg error */
+	VSC_INTR_DPLX_CHG = 1 << 12, /* duplex change */
+	VSC_INTR_LINK_CHG = 1 << 13, /* link change */
+	VSC_INTR_SPD_CHG = 1 << 14, /* speed change */
+	VSC_INTR_ENABLE = 1 << 15, /* interrupt enable */
+};
+
+enum {
+	VSC_CTRL_CLAUSE37_VIEW = 1 << 4,   /* Switch to Clause 37 view */
+	VSC_CTRL_MEDIA_MODE_HI = 0xf000    /* High part of media mode select */
+};
+
+#define CFG_CHG_INTR_MASK (VSC_INTR_LINK_CHG | VSC_INTR_NEG_ERR | \
+			   VSC_INTR_DPLX_CHG | VSC_INTR_SPD_CHG | \
+	 		   VSC_INTR_NEG_DONE)
+#define INTR_MASK (CFG_CHG_INTR_MASK | VSC_INTR_TX_FIFO | VSC_INTR_RX_FIFO | \
+		   VSC_INTR_ENABLE)
+
+/* PHY specific auxiliary control & status register fields */
+#define S_ACSR_ACTIPHY_TMR    0
+#define M_ACSR_ACTIPHY_TMR    0x3
+#define V_ACSR_ACTIPHY_TMR(x) ((x) << S_ACSR_ACTIPHY_TMR)
+
+#define S_ACSR_SPEED    3
+#define M_ACSR_SPEED    0x3
+#define G_ACSR_SPEED(x) (((x) >> S_ACSR_SPEED) & M_ACSR_SPEED)
+
+#define S_ACSR_DUPLEX 5
+#define F_ACSR_DUPLEX (1 << S_ACSR_DUPLEX)
+
+#define S_ACSR_ACTIPHY 6
+#define F_ACSR_ACTIPHY (1 << S_ACSR_ACTIPHY)
+
+/*
+ * Reset the PHY.  This PHY completes reset immediately so we never wait.
+ */
+static int vsc8211_reset(struct cphy *cphy, int wait)
+{
+	return t3_phy_reset(cphy, MDIO_DEVAD_NONE, 0);
+}
+
+static int vsc8211_intr_enable(struct cphy *cphy)
+{
+	return t3_mdio_write(cphy, MDIO_DEVAD_NONE, VSC8211_INTR_ENABLE,
+			     INTR_MASK);
+}
+
+static int vsc8211_intr_disable(struct cphy *cphy)
+{
+	return t3_mdio_write(cphy, MDIO_DEVAD_NONE, VSC8211_INTR_ENABLE, 0);
+}
+
+static int vsc8211_intr_clear(struct cphy *cphy)
+{
+	u32 val;
+
+	/* Clear PHY interrupts by reading the register. */
+	return t3_mdio_read(cphy, MDIO_DEVAD_NONE, VSC8211_INTR_STATUS, &val);
+}
+
+static int vsc8211_autoneg_enable(struct cphy *cphy)
+{
+	return t3_mdio_change_bits(cphy, MDIO_DEVAD_NONE, MII_BMCR,
+				   BMCR_PDOWN | BMCR_ISOLATE,
+				   BMCR_ANENABLE | BMCR_ANRESTART);
+}
+
+static int vsc8211_autoneg_restart(struct cphy *cphy)
+{
+	return t3_mdio_change_bits(cphy, MDIO_DEVAD_NONE, MII_BMCR,
+				   BMCR_PDOWN | BMCR_ISOLATE,
+				   BMCR_ANRESTART);
+}
+
+static int vsc8211_get_link_status(struct cphy *cphy, int *link_ok,
+				   int *speed, int *duplex, int *fc)
+{
+	unsigned int bmcr, status, lpa, adv;
+	int err, sp = -1, dplx = -1, pause = 0;
+
+	err = t3_mdio_read(cphy, MDIO_DEVAD_NONE, MII_BMCR, &bmcr);
+	if (!err)
+		err = t3_mdio_read(cphy, MDIO_DEVAD_NONE, MII_BMSR, &status);
+	if (err)
+		return err;
+
+	if (link_ok) {
+		/*
+		 * BMSR_LSTATUS is latch-low, so if it is 0 we need to read it
+		 * once more to get the current link state.
+		 */
+		if (!(status & BMSR_LSTATUS))
+			err = t3_mdio_read(cphy, MDIO_DEVAD_NONE, MII_BMSR,
+					   &status);
+		if (err)
+			return err;
+		*link_ok = (status & BMSR_LSTATUS) != 0;
+	}
+	if (!(bmcr & BMCR_ANENABLE)) {
+		dplx = (bmcr & BMCR_FULLDPLX) ? DUPLEX_FULL : DUPLEX_HALF;
+		if (bmcr & BMCR_SPEED1000)
+			sp = SPEED_1000;
+		else if (bmcr & BMCR_SPEED100)
+			sp = SPEED_100;
+		else
+			sp = SPEED_10;
+	} else if (status & BMSR_ANEGCOMPLETE) {
+		err = t3_mdio_read(cphy, MDIO_DEVAD_NONE, VSC8211_AUX_CTRL_STAT,
+				   &status);
+		if (err)
+			return err;
+
+		dplx = (status & F_ACSR_DUPLEX) ? DUPLEX_FULL : DUPLEX_HALF;
+		sp = G_ACSR_SPEED(status);
+		if (sp == 0)
+			sp = SPEED_10;
+		else if (sp == 1)
+			sp = SPEED_100;
+		else
+			sp = SPEED_1000;
+
+		if (fc && dplx == DUPLEX_FULL) {
+			err = t3_mdio_read(cphy, MDIO_DEVAD_NONE, MII_LPA,
+					   &lpa);
+			if (!err)
+				err = t3_mdio_read(cphy, MDIO_DEVAD_NONE,
+						   MII_ADVERTISE, &adv);
+			if (err)
+				return err;
+
+			if (lpa & adv & ADVERTISE_PAUSE_CAP)
+				pause = PAUSE_RX | PAUSE_TX;
+			else if ((lpa & ADVERTISE_PAUSE_CAP) &&
+				 (lpa & ADVERTISE_PAUSE_ASYM) &&
+				 (adv & ADVERTISE_PAUSE_ASYM))
+				pause = PAUSE_TX;
+			else if ((lpa & ADVERTISE_PAUSE_ASYM) &&
+				 (adv & ADVERTISE_PAUSE_CAP))
+				pause = PAUSE_RX;
+		}
+	}
+	if (speed)
+		*speed = sp;
+	if (duplex)
+		*duplex = dplx;
+	if (fc)
+		*fc = pause;
+	return 0;
+}
+
+static int vsc8211_get_link_status_fiber(struct cphy *cphy, int *link_ok,
+					 int *speed, int *duplex, int *fc)
+{
+	unsigned int bmcr, status, lpa, adv;
+	int err, sp = -1, dplx = -1, pause = 0;
+
+	err = t3_mdio_read(cphy, MDIO_DEVAD_NONE, MII_BMCR, &bmcr);
+	if (!err)
+		err = t3_mdio_read(cphy, MDIO_DEVAD_NONE, MII_BMSR, &status);
+	if (err)
+		return err;
+
+	if (link_ok) {
+		/*
+		 * BMSR_LSTATUS is latch-low, so if it is 0 we need to read it
+		 * once more to get the current link state.
+		 */
+		if (!(status & BMSR_LSTATUS))
+			err = t3_mdio_read(cphy, MDIO_DEVAD_NONE, MII_BMSR,
+					   &status);
+		if (err)
+			return err;
+		*link_ok = (status & BMSR_LSTATUS) != 0;
+	}
+	if (!(bmcr & BMCR_ANENABLE)) {
+		dplx = (bmcr & BMCR_FULLDPLX) ? DUPLEX_FULL : DUPLEX_HALF;
+		if (bmcr & BMCR_SPEED1000)
+			sp = SPEED_1000;
+		else if (bmcr & BMCR_SPEED100)
+			sp = SPEED_100;
+		else
+			sp = SPEED_10;
+	} else if (status & BMSR_ANEGCOMPLETE) {
+		err = t3_mdio_read(cphy, MDIO_DEVAD_NONE, MII_LPA, &lpa);
+		if (!err)
+			err = t3_mdio_read(cphy, MDIO_DEVAD_NONE, MII_ADVERTISE,
+					   &adv);
+		if (err)
+			return err;
+
+		if (adv & lpa & ADVERTISE_1000XFULL) {
+			dplx = DUPLEX_FULL;
+			sp = SPEED_1000;
+		} else if (adv & lpa & ADVERTISE_1000XHALF) {
+			dplx = DUPLEX_HALF;
+			sp = SPEED_1000;
+		}
+
+		if (fc && dplx == DUPLEX_FULL) {
+			if (lpa & adv & ADVERTISE_1000XPAUSE)
+				pause = PAUSE_RX | PAUSE_TX;
+			else if ((lpa & ADVERTISE_1000XPAUSE) &&
+				 (adv & lpa & ADVERTISE_1000XPSE_ASYM))
+				pause = PAUSE_TX;
+			else if ((lpa & ADVERTISE_1000XPSE_ASYM) &&
+				 (adv & ADVERTISE_1000XPAUSE))
+				pause = PAUSE_RX;
+		}
+	}
+	if (speed)
+		*speed = sp;
+	if (duplex)
+		*duplex = dplx;
+	if (fc)
+		*fc = pause;
+	return 0;
+}
+
+#ifdef UNUSED
+/*
+ * Enable/disable auto MDI/MDI-X in forced link speed mode.
+ */
+static int vsc8211_set_automdi(struct cphy *phy, int enable)
+{
+	int err;
+
+	err = t3_mdio_write(phy, MDIO_DEVAD_NONE, VSC8211_EXT_PAGE_AXS, 0x52b5);
+	if (err)
+		return err;
+
+	err = t3_mdio_write(phy, MDIO_DEVAD_NONE, 18, 0x12);
+	if (err)
+		return err;
+
+	err = t3_mdio_write(phy, MDIO_DEVAD_NONE, 17, enable ? 0x2803 : 0x3003);
+	if (err)
+		return err;
+
+	err = t3_mdio_write(phy, MDIO_DEVAD_NONE, 16, 0x87fa);
+	if (err)
+		return err;
+
+	err = t3_mdio_write(phy, MDIO_DEVAD_NONE, VSC8211_EXT_PAGE_AXS, 0);
+	if (err)
+		return err;
+
+	return 0;
+}
+
+int vsc8211_set_speed_duplex(struct cphy *phy, int speed, int duplex)
+{
+	int err;
+
+	err = t3_set_phy_speed_duplex(phy, speed, duplex);
+	if (!err)
+		err = vsc8211_set_automdi(phy, 1);
+	return err;
+}
+#endif /* UNUSED */
+
+static int vsc8211_power_down(struct cphy *cphy, int enable)
+{
+	return t3_mdio_change_bits(cphy, 0, MII_BMCR, BMCR_PDOWN,
+				   enable ? BMCR_PDOWN : 0);
+}
+
+static int vsc8211_intr_handler(struct cphy *cphy)
+{
+	unsigned int cause;
+	int err, cphy_cause = 0;
+
+	err = t3_mdio_read(cphy, MDIO_DEVAD_NONE, VSC8211_INTR_STATUS, &cause);
+	if (err)
+		return err;
+
+	cause &= INTR_MASK;
+	if (cause & CFG_CHG_INTR_MASK)
+		cphy_cause |= cphy_cause_link_change;
+	if (cause & (VSC_INTR_RX_FIFO | VSC_INTR_TX_FIFO))
+		cphy_cause |= cphy_cause_fifo_error;
+	return cphy_cause;
+}
+
+static struct cphy_ops vsc8211_ops = {
+	.reset = vsc8211_reset,
+	.intr_enable = vsc8211_intr_enable,
+	.intr_disable = vsc8211_intr_disable,
+	.intr_clear = vsc8211_intr_clear,
+	.intr_handler = vsc8211_intr_handler,
+	.autoneg_enable = vsc8211_autoneg_enable,
+	.autoneg_restart = vsc8211_autoneg_restart,
+	.advertise = t3_phy_advertise,
+	.set_speed_duplex = t3_set_phy_speed_duplex,
+	.get_link_status = vsc8211_get_link_status,
+	.power_down = vsc8211_power_down,
+};
+
+static struct cphy_ops vsc8211_fiber_ops = {
+	.reset = vsc8211_reset,
+	.intr_enable = vsc8211_intr_enable,
+	.intr_disable = vsc8211_intr_disable,
+	.intr_clear = vsc8211_intr_clear,
+	.intr_handler = vsc8211_intr_handler,
+	.autoneg_enable = vsc8211_autoneg_enable,
+	.autoneg_restart = vsc8211_autoneg_restart,
+	.advertise = t3_phy_advertise_fiber,
+	.set_speed_duplex = t3_set_phy_speed_duplex,
+	.get_link_status = vsc8211_get_link_status_fiber,
+	.power_down = vsc8211_power_down,
+};
+
+int t3_vsc8211_phy_prep(struct cphy *phy, struct adapter *adapter,
+			int phy_addr, const struct mdio_ops *mdio_ops)
+{
+	int err;
+	unsigned int val;
+
+	cphy_init(phy, adapter, phy_addr, &vsc8211_ops, mdio_ops,
+		  SUPPORTED_10baseT_Full | SUPPORTED_100baseT_Full |
+		  SUPPORTED_1000baseT_Full | SUPPORTED_Autoneg | SUPPORTED_MII |
+		  SUPPORTED_TP | SUPPORTED_IRQ, "10/100/1000BASE-T");
+	msleep(20);       /* PHY needs ~10ms to start responding to MDIO */
+
+	err = t3_mdio_read(phy, MDIO_DEVAD_NONE, VSC8211_EXT_CTRL, &val);
+	if (err)
+		return err;
+	if (val & VSC_CTRL_MEDIA_MODE_HI) {
+		/* copper interface, just need to configure the LEDs */
+		return t3_mdio_write(phy, MDIO_DEVAD_NONE, VSC8211_LED_CTRL,
+				     0x100);
+	}
+
+	phy->caps = SUPPORTED_1000baseT_Full | SUPPORTED_Autoneg |
+		    SUPPORTED_MII | SUPPORTED_FIBRE | SUPPORTED_IRQ;
+	phy->desc = "1000BASE-X";
+	phy->ops = &vsc8211_fiber_ops;
+
+	err = t3_mdio_write(phy, MDIO_DEVAD_NONE, VSC8211_EXT_PAGE_AXS, 1);
+	if (err)
+		return err;
+
+	err = t3_mdio_write(phy, MDIO_DEVAD_NONE, VSC8211_SIGDET_CTRL, 1);
+	if (err)
+		return err;
+
+	err = t3_mdio_write(phy, MDIO_DEVAD_NONE, VSC8211_EXT_PAGE_AXS, 0);
+	if (err)
+		return err;
+
+	err = t3_mdio_write(phy, MDIO_DEVAD_NONE, VSC8211_EXT_CTRL,
+			    val | VSC_CTRL_CLAUSE37_VIEW);
+	if (err)
+		return err;
+
+	err = vsc8211_reset(phy, 0);
+	if (err)
+		return err;
+
+	udelay(5); /* delay after reset before next SMI */
+	return 0;
+}
diff --git a/drivers/net/ethernet/chelsio/cxgb3/xgmac.c b/drivers/net/ethernet/chelsio/cxgb3/xgmac.c
new file mode 100644
index 0000000..3af19a5
--- /dev/null
+++ b/drivers/net/ethernet/chelsio/cxgb3/xgmac.c
@@ -0,0 +1,657 @@
+/*
+ * Copyright (c) 2005-2008 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "common.h"
+#include "regs.h"
+
+/*
+ * # of exact address filters.  The first one is used for the station address,
+ * the rest are available for multicast addresses.
+ */
+#define EXACT_ADDR_FILTERS 8
+
+static inline int macidx(const struct cmac *mac)
+{
+	return mac->offset / (XGMAC0_1_BASE_ADDR - XGMAC0_0_BASE_ADDR);
+}
+
+static void xaui_serdes_reset(struct cmac *mac)
+{
+	static const unsigned int clear[] = {
+		F_PWRDN0 | F_PWRDN1, F_RESETPLL01, F_RESET0 | F_RESET1,
+		F_PWRDN2 | F_PWRDN3, F_RESETPLL23, F_RESET2 | F_RESET3
+	};
+
+	int i;
+	struct adapter *adap = mac->adapter;
+	u32 ctrl = A_XGM_SERDES_CTRL0 + mac->offset;
+
+	t3_write_reg(adap, ctrl, adap->params.vpd.xauicfg[macidx(mac)] |
+		     F_RESET3 | F_RESET2 | F_RESET1 | F_RESET0 |
+		     F_PWRDN3 | F_PWRDN2 | F_PWRDN1 | F_PWRDN0 |
+		     F_RESETPLL23 | F_RESETPLL01);
+	t3_read_reg(adap, ctrl);
+	udelay(15);
+
+	for (i = 0; i < ARRAY_SIZE(clear); i++) {
+		t3_set_reg_field(adap, ctrl, clear[i], 0);
+		udelay(15);
+	}
+}
+
+void t3b_pcs_reset(struct cmac *mac)
+{
+	t3_set_reg_field(mac->adapter, A_XGM_RESET_CTRL + mac->offset,
+			 F_PCS_RESET_, 0);
+	udelay(20);
+	t3_set_reg_field(mac->adapter, A_XGM_RESET_CTRL + mac->offset, 0,
+			 F_PCS_RESET_);
+}
+
+int t3_mac_reset(struct cmac *mac)
+{
+	static const struct addr_val_pair mac_reset_avp[] = {
+		{A_XGM_TX_CTRL, 0},
+		{A_XGM_RX_CTRL, 0},
+		{A_XGM_RX_CFG, F_DISPAUSEFRAMES | F_EN1536BFRAMES |
+		 F_RMFCS | F_ENJUMBO | F_ENHASHMCAST},
+		{A_XGM_RX_HASH_LOW, 0},
+		{A_XGM_RX_HASH_HIGH, 0},
+		{A_XGM_RX_EXACT_MATCH_LOW_1, 0},
+		{A_XGM_RX_EXACT_MATCH_LOW_2, 0},
+		{A_XGM_RX_EXACT_MATCH_LOW_3, 0},
+		{A_XGM_RX_EXACT_MATCH_LOW_4, 0},
+		{A_XGM_RX_EXACT_MATCH_LOW_5, 0},
+		{A_XGM_RX_EXACT_MATCH_LOW_6, 0},
+		{A_XGM_RX_EXACT_MATCH_LOW_7, 0},
+		{A_XGM_RX_EXACT_MATCH_LOW_8, 0},
+		{A_XGM_STAT_CTRL, F_CLRSTATS}
+	};
+	u32 val;
+	struct adapter *adap = mac->adapter;
+	unsigned int oft = mac->offset;
+
+	t3_write_reg(adap, A_XGM_RESET_CTRL + oft, F_MAC_RESET_);
+	t3_read_reg(adap, A_XGM_RESET_CTRL + oft);	/* flush */
+
+	t3_write_regs(adap, mac_reset_avp, ARRAY_SIZE(mac_reset_avp), oft);
+	t3_set_reg_field(adap, A_XGM_RXFIFO_CFG + oft,
+			 F_RXSTRFRWRD | F_DISERRFRAMES,
+			 uses_xaui(adap) ? 0 : F_RXSTRFRWRD);
+	t3_set_reg_field(adap, A_XGM_TXFIFO_CFG + oft, 0, F_UNDERUNFIX);
+
+	if (uses_xaui(adap)) {
+		if (adap->params.rev == 0) {
+			t3_set_reg_field(adap, A_XGM_SERDES_CTRL + oft, 0,
+					 F_RXENABLE | F_TXENABLE);
+			if (t3_wait_op_done(adap, A_XGM_SERDES_STATUS1 + oft,
+					    F_CMULOCK, 1, 5, 2)) {
+				CH_ERR(adap,
+				       "MAC %d XAUI SERDES CMU lock failed\n",
+				       macidx(mac));
+				return -1;
+			}
+			t3_set_reg_field(adap, A_XGM_SERDES_CTRL + oft, 0,
+					 F_SERDESRESET_);
+		} else
+			xaui_serdes_reset(mac);
+	}
+
+	t3_set_reg_field(adap, A_XGM_RX_MAX_PKT_SIZE + oft,
+			 V_RXMAXFRAMERSIZE(M_RXMAXFRAMERSIZE),
+			 V_RXMAXFRAMERSIZE(MAX_FRAME_SIZE) | F_RXENFRAMER);
+	val = F_MAC_RESET_ | F_XGMAC_STOP_EN;
+
+	if (is_10G(adap))
+		val |= F_PCS_RESET_;
+	else if (uses_xaui(adap))
+		val |= F_PCS_RESET_ | F_XG2G_RESET_;
+	else
+		val |= F_RGMII_RESET_ | F_XG2G_RESET_;
+	t3_write_reg(adap, A_XGM_RESET_CTRL + oft, val);
+	t3_read_reg(adap, A_XGM_RESET_CTRL + oft);	/* flush */
+	if ((val & F_PCS_RESET_) && adap->params.rev) {
+		msleep(1);
+		t3b_pcs_reset(mac);
+	}
+
+	memset(&mac->stats, 0, sizeof(mac->stats));
+	return 0;
+}
+
+static int t3b2_mac_reset(struct cmac *mac)
+{
+	struct adapter *adap = mac->adapter;
+	unsigned int oft = mac->offset, store;
+	int idx = macidx(mac);
+	u32 val;
+
+	if (!macidx(mac))
+		t3_set_reg_field(adap, A_MPS_CFG, F_PORT0ACTIVE, 0);
+	else
+		t3_set_reg_field(adap, A_MPS_CFG, F_PORT1ACTIVE, 0);
+
+	/* Stop NIC traffic to reduce the number of TXTOGGLES */
+	t3_set_reg_field(adap, A_MPS_CFG, F_ENFORCEPKT, 0);
+	/* Ensure TX drains */
+	t3_set_reg_field(adap, A_XGM_TX_CFG + oft, F_TXPAUSEEN, 0);
+
+	t3_write_reg(adap, A_XGM_RESET_CTRL + oft, F_MAC_RESET_);
+	t3_read_reg(adap, A_XGM_RESET_CTRL + oft);    /* flush */
+
+	/* Store A_TP_TX_DROP_CFG_CH0 */
+	t3_write_reg(adap, A_TP_PIO_ADDR, A_TP_TX_DROP_CFG_CH0 + idx);
+	store = t3_read_reg(adap, A_TP_TX_DROP_CFG_CH0 + idx);
+
+	msleep(10);
+
+	/* Change DROP_CFG to 0xc0000011 */
+	t3_write_reg(adap, A_TP_PIO_ADDR, A_TP_TX_DROP_CFG_CH0 + idx);
+	t3_write_reg(adap, A_TP_PIO_DATA, 0xc0000011);
+
+	/* Check for xgm Rx fifo empty */
+	/* Increased loop count to 1000 from 5 cover 1G and 100Mbps case */
+	if (t3_wait_op_done(adap, A_XGM_RX_MAX_PKT_SIZE_ERR_CNT + oft,
+			    0x80000000, 1, 1000, 2)) {
+		CH_ERR(adap, "MAC %d Rx fifo drain failed\n",
+		       macidx(mac));
+		return -1;
+	}
+
+	t3_write_reg(adap, A_XGM_RESET_CTRL + oft, 0);
+	t3_read_reg(adap, A_XGM_RESET_CTRL + oft);    /* flush */
+
+	val = F_MAC_RESET_;
+	if (is_10G(adap))
+		val |= F_PCS_RESET_;
+	else if (uses_xaui(adap))
+		val |= F_PCS_RESET_ | F_XG2G_RESET_;
+	else
+		val |= F_RGMII_RESET_ | F_XG2G_RESET_;
+	t3_write_reg(adap, A_XGM_RESET_CTRL + oft, val);
+	t3_read_reg(adap, A_XGM_RESET_CTRL + oft);  /* flush */
+	if ((val & F_PCS_RESET_) && adap->params.rev) {
+		msleep(1);
+		t3b_pcs_reset(mac);
+	}
+	t3_write_reg(adap, A_XGM_RX_CFG + oft,
+		     F_DISPAUSEFRAMES | F_EN1536BFRAMES |
+		     F_RMFCS | F_ENJUMBO | F_ENHASHMCAST);
+
+	/* Restore the DROP_CFG */
+	t3_write_reg(adap, A_TP_PIO_ADDR, A_TP_TX_DROP_CFG_CH0 + idx);
+	t3_write_reg(adap, A_TP_PIO_DATA, store);
+
+	if (!idx)
+		t3_set_reg_field(adap, A_MPS_CFG, 0, F_PORT0ACTIVE);
+	else
+		t3_set_reg_field(adap, A_MPS_CFG, 0, F_PORT1ACTIVE);
+
+	/* re-enable nic traffic */
+	t3_set_reg_field(adap, A_MPS_CFG, F_ENFORCEPKT, 1);
+
+	/*  Set: re-enable NIC traffic */
+	t3_set_reg_field(adap, A_MPS_CFG, F_ENFORCEPKT, 1);
+
+	return 0;
+}
+
+/*
+ * Set the exact match register 'idx' to recognize the given Ethernet address.
+ */
+static void set_addr_filter(struct cmac *mac, int idx, const u8 * addr)
+{
+	u32 addr_lo, addr_hi;
+	unsigned int oft = mac->offset + idx * 8;
+
+	addr_lo = (addr[3] << 24) | (addr[2] << 16) | (addr[1] << 8) | addr[0];
+	addr_hi = (addr[5] << 8) | addr[4];
+
+	t3_write_reg(mac->adapter, A_XGM_RX_EXACT_MATCH_LOW_1 + oft, addr_lo);
+	t3_write_reg(mac->adapter, A_XGM_RX_EXACT_MATCH_HIGH_1 + oft, addr_hi);
+}
+
+/* Set one of the station's unicast MAC addresses. */
+int t3_mac_set_address(struct cmac *mac, unsigned int idx, u8 addr[6])
+{
+	if (idx >= mac->nucast)
+		return -EINVAL;
+	set_addr_filter(mac, idx, addr);
+	return 0;
+}
+
+/*
+ * Specify the number of exact address filters that should be reserved for
+ * unicast addresses.  Caller should reload the unicast and multicast addresses
+ * after calling this.
+ */
+int t3_mac_set_num_ucast(struct cmac *mac, int n)
+{
+	if (n > EXACT_ADDR_FILTERS)
+		return -EINVAL;
+	mac->nucast = n;
+	return 0;
+}
+
+void t3_mac_disable_exact_filters(struct cmac *mac)
+{
+	unsigned int i, reg = mac->offset + A_XGM_RX_EXACT_MATCH_LOW_1;
+
+	for (i = 0; i < EXACT_ADDR_FILTERS; i++, reg += 8) {
+		u32 v = t3_read_reg(mac->adapter, reg);
+		t3_write_reg(mac->adapter, reg, v);
+	}
+	t3_read_reg(mac->adapter, A_XGM_RX_EXACT_MATCH_LOW_1);	/* flush */
+}
+
+void t3_mac_enable_exact_filters(struct cmac *mac)
+{
+	unsigned int i, reg = mac->offset + A_XGM_RX_EXACT_MATCH_HIGH_1;
+
+	for (i = 0; i < EXACT_ADDR_FILTERS; i++, reg += 8) {
+		u32 v = t3_read_reg(mac->adapter, reg);
+		t3_write_reg(mac->adapter, reg, v);
+	}
+	t3_read_reg(mac->adapter, A_XGM_RX_EXACT_MATCH_LOW_1);	/* flush */
+}
+
+/* Calculate the RX hash filter index of an Ethernet address */
+static int hash_hw_addr(const u8 * addr)
+{
+	int hash = 0, octet, bit, i = 0, c;
+
+	for (octet = 0; octet < 6; ++octet)
+		for (c = addr[octet], bit = 0; bit < 8; c >>= 1, ++bit) {
+			hash ^= (c & 1) << i;
+			if (++i == 6)
+				i = 0;
+		}
+	return hash;
+}
+
+int t3_mac_set_rx_mode(struct cmac *mac, struct net_device *dev)
+{
+	u32 val, hash_lo, hash_hi;
+	struct adapter *adap = mac->adapter;
+	unsigned int oft = mac->offset;
+
+	val = t3_read_reg(adap, A_XGM_RX_CFG + oft) & ~F_COPYALLFRAMES;
+	if (dev->flags & IFF_PROMISC)
+		val |= F_COPYALLFRAMES;
+	t3_write_reg(adap, A_XGM_RX_CFG + oft, val);
+
+	if (dev->flags & IFF_ALLMULTI)
+		hash_lo = hash_hi = 0xffffffff;
+	else {
+		struct netdev_hw_addr *ha;
+		int exact_addr_idx = mac->nucast;
+
+		hash_lo = hash_hi = 0;
+		netdev_for_each_mc_addr(ha, dev)
+			if (exact_addr_idx < EXACT_ADDR_FILTERS)
+				set_addr_filter(mac, exact_addr_idx++,
+						ha->addr);
+			else {
+				int hash = hash_hw_addr(ha->addr);
+
+				if (hash < 32)
+					hash_lo |= (1 << hash);
+				else
+					hash_hi |= (1 << (hash - 32));
+			}
+	}
+
+	t3_write_reg(adap, A_XGM_RX_HASH_LOW + oft, hash_lo);
+	t3_write_reg(adap, A_XGM_RX_HASH_HIGH + oft, hash_hi);
+	return 0;
+}
+
+static int rx_fifo_hwm(int mtu)
+{
+	int hwm;
+
+	hwm = max(MAC_RXFIFO_SIZE - 3 * mtu, (MAC_RXFIFO_SIZE * 38) / 100);
+	return min(hwm, MAC_RXFIFO_SIZE - 8192);
+}
+
+int t3_mac_set_mtu(struct cmac *mac, unsigned int mtu)
+{
+	int hwm, lwm, divisor;
+	int ipg;
+	unsigned int thres, v, reg;
+	struct adapter *adap = mac->adapter;
+
+	/*
+	 * MAX_FRAME_SIZE inludes header + FCS, mtu doesn't.  The HW max
+	 * packet size register includes header, but not FCS.
+	 */
+	mtu += 14;
+	if (mtu > 1536)
+		mtu += 4;
+
+	if (mtu > MAX_FRAME_SIZE - 4)
+		return -EINVAL;
+	t3_write_reg(adap, A_XGM_RX_MAX_PKT_SIZE + mac->offset, mtu);
+
+	if (adap->params.rev >= T3_REV_B2 &&
+	    (t3_read_reg(adap, A_XGM_RX_CTRL + mac->offset) & F_RXEN)) {
+		t3_mac_disable_exact_filters(mac);
+		v = t3_read_reg(adap, A_XGM_RX_CFG + mac->offset);
+		t3_set_reg_field(adap, A_XGM_RX_CFG + mac->offset,
+				 F_ENHASHMCAST | F_COPYALLFRAMES, F_DISBCAST);
+
+		reg = adap->params.rev == T3_REV_B2 ?
+			A_XGM_RX_MAX_PKT_SIZE_ERR_CNT : A_XGM_RXFIFO_CFG;
+
+		/* drain RX FIFO */
+		if (t3_wait_op_done(adap, reg + mac->offset,
+				    F_RXFIFO_EMPTY, 1, 20, 5)) {
+			t3_write_reg(adap, A_XGM_RX_CFG + mac->offset, v);
+			t3_mac_enable_exact_filters(mac);
+			return -EIO;
+		}
+		t3_set_reg_field(adap, A_XGM_RX_MAX_PKT_SIZE + mac->offset,
+				 V_RXMAXPKTSIZE(M_RXMAXPKTSIZE),
+				 V_RXMAXPKTSIZE(mtu));
+		t3_write_reg(adap, A_XGM_RX_CFG + mac->offset, v);
+		t3_mac_enable_exact_filters(mac);
+	} else
+		t3_set_reg_field(adap, A_XGM_RX_MAX_PKT_SIZE + mac->offset,
+				 V_RXMAXPKTSIZE(M_RXMAXPKTSIZE),
+				 V_RXMAXPKTSIZE(mtu));
+
+	/*
+	 * Adjust the PAUSE frame watermarks.  We always set the LWM, and the
+	 * HWM only if flow-control is enabled.
+	 */
+	hwm = rx_fifo_hwm(mtu);
+	lwm = min(3 * (int)mtu, MAC_RXFIFO_SIZE / 4);
+	v = t3_read_reg(adap, A_XGM_RXFIFO_CFG + mac->offset);
+	v &= ~V_RXFIFOPAUSELWM(M_RXFIFOPAUSELWM);
+	v |= V_RXFIFOPAUSELWM(lwm / 8);
+	if (G_RXFIFOPAUSEHWM(v))
+		v = (v & ~V_RXFIFOPAUSEHWM(M_RXFIFOPAUSEHWM)) |
+		    V_RXFIFOPAUSEHWM(hwm / 8);
+
+	t3_write_reg(adap, A_XGM_RXFIFO_CFG + mac->offset, v);
+
+	/* Adjust the TX FIFO threshold based on the MTU */
+	thres = (adap->params.vpd.cclk * 1000) / 15625;
+	thres = (thres * mtu) / 1000;
+	if (is_10G(adap))
+		thres /= 10;
+	thres = mtu > thres ? (mtu - thres + 7) / 8 : 0;
+	thres = max(thres, 8U);	/* need at least 8 */
+	ipg = (adap->params.rev == T3_REV_C) ? 0 : 1;
+	t3_set_reg_field(adap, A_XGM_TXFIFO_CFG + mac->offset,
+			 V_TXFIFOTHRESH(M_TXFIFOTHRESH) | V_TXIPG(M_TXIPG),
+			 V_TXFIFOTHRESH(thres) | V_TXIPG(ipg));
+
+	if (adap->params.rev > 0) {
+		divisor = (adap->params.rev == T3_REV_C) ? 64 : 8;
+		t3_write_reg(adap, A_XGM_PAUSE_TIMER + mac->offset,
+			     (hwm - lwm) * 4 / divisor);
+	}
+	t3_write_reg(adap, A_XGM_TX_PAUSE_QUANTA + mac->offset,
+		     MAC_RXFIFO_SIZE * 4 * 8 / 512);
+	return 0;
+}
+
+int t3_mac_set_speed_duplex_fc(struct cmac *mac, int speed, int duplex, int fc)
+{
+	u32 val;
+	struct adapter *adap = mac->adapter;
+	unsigned int oft = mac->offset;
+
+	if (duplex >= 0 && duplex != DUPLEX_FULL)
+		return -EINVAL;
+	if (speed >= 0) {
+		if (speed == SPEED_10)
+			val = V_PORTSPEED(0);
+		else if (speed == SPEED_100)
+			val = V_PORTSPEED(1);
+		else if (speed == SPEED_1000)
+			val = V_PORTSPEED(2);
+		else if (speed == SPEED_10000)
+			val = V_PORTSPEED(3);
+		else
+			return -EINVAL;
+
+		t3_set_reg_field(adap, A_XGM_PORT_CFG + oft,
+				 V_PORTSPEED(M_PORTSPEED), val);
+	}
+
+	val = t3_read_reg(adap, A_XGM_RXFIFO_CFG + oft);
+	val &= ~V_RXFIFOPAUSEHWM(M_RXFIFOPAUSEHWM);
+	if (fc & PAUSE_TX) {
+		u32 rx_max_pkt_size =
+		    G_RXMAXPKTSIZE(t3_read_reg(adap,
+					       A_XGM_RX_MAX_PKT_SIZE + oft));
+		val |= V_RXFIFOPAUSEHWM(rx_fifo_hwm(rx_max_pkt_size) / 8);
+	}
+	t3_write_reg(adap, A_XGM_RXFIFO_CFG + oft, val);
+
+	t3_set_reg_field(adap, A_XGM_TX_CFG + oft, F_TXPAUSEEN,
+			 (fc & PAUSE_RX) ? F_TXPAUSEEN : 0);
+	return 0;
+}
+
+int t3_mac_enable(struct cmac *mac, int which)
+{
+	int idx = macidx(mac);
+	struct adapter *adap = mac->adapter;
+	unsigned int oft = mac->offset;
+	struct mac_stats *s = &mac->stats;
+
+	if (which & MAC_DIRECTION_TX) {
+		t3_write_reg(adap, A_TP_PIO_ADDR, A_TP_TX_DROP_CFG_CH0 + idx);
+		t3_write_reg(adap, A_TP_PIO_DATA,
+			     adap->params.rev == T3_REV_C ?
+			     0xc4ffff01 : 0xc0ede401);
+		t3_write_reg(adap, A_TP_PIO_ADDR, A_TP_TX_DROP_MODE);
+		t3_set_reg_field(adap, A_TP_PIO_DATA, 1 << idx,
+				 adap->params.rev == T3_REV_C ? 0 : 1 << idx);
+
+		t3_write_reg(adap, A_XGM_TX_CTRL + oft, F_TXEN);
+
+		t3_write_reg(adap, A_TP_PIO_ADDR, A_TP_TX_DROP_CNT_CH0 + idx);
+		mac->tx_mcnt = s->tx_frames;
+		mac->tx_tcnt = (G_TXDROPCNTCH0RCVD(t3_read_reg(adap,
+							A_TP_PIO_DATA)));
+		mac->tx_xcnt = (G_TXSPI4SOPCNT(t3_read_reg(adap,
+						A_XGM_TX_SPI4_SOP_EOP_CNT +
+						oft)));
+		mac->rx_mcnt = s->rx_frames;
+		mac->rx_pause = s->rx_pause;
+		mac->rx_xcnt = (G_TXSPI4SOPCNT(t3_read_reg(adap,
+						A_XGM_RX_SPI4_SOP_EOP_CNT +
+						oft)));
+		mac->rx_ocnt = s->rx_fifo_ovfl;
+		mac->txen = F_TXEN;
+		mac->toggle_cnt = 0;
+	}
+	if (which & MAC_DIRECTION_RX)
+		t3_write_reg(adap, A_XGM_RX_CTRL + oft, F_RXEN);
+	return 0;
+}
+
+int t3_mac_disable(struct cmac *mac, int which)
+{
+	struct adapter *adap = mac->adapter;
+
+	if (which & MAC_DIRECTION_TX) {
+		t3_write_reg(adap, A_XGM_TX_CTRL + mac->offset, 0);
+		mac->txen = 0;
+	}
+	if (which & MAC_DIRECTION_RX) {
+		int val = F_MAC_RESET_;
+
+		t3_set_reg_field(mac->adapter, A_XGM_RESET_CTRL + mac->offset,
+				 F_PCS_RESET_, 0);
+		msleep(100);
+		t3_write_reg(adap, A_XGM_RX_CTRL + mac->offset, 0);
+		if (is_10G(adap))
+			val |= F_PCS_RESET_;
+		else if (uses_xaui(adap))
+			val |= F_PCS_RESET_ | F_XG2G_RESET_;
+		else
+			val |= F_RGMII_RESET_ | F_XG2G_RESET_;
+		t3_write_reg(mac->adapter, A_XGM_RESET_CTRL + mac->offset, val);
+	}
+	return 0;
+}
+
+int t3b2_mac_watchdog_task(struct cmac *mac)
+{
+	struct adapter *adap = mac->adapter;
+	struct mac_stats *s = &mac->stats;
+	unsigned int tx_tcnt, tx_xcnt;
+	u64 tx_mcnt = s->tx_frames;
+	int status;
+
+	status = 0;
+	tx_xcnt = 1;		/* By default tx_xcnt is making progress */
+	tx_tcnt = mac->tx_tcnt;	/* If tx_mcnt is progressing ignore tx_tcnt */
+	if (tx_mcnt == mac->tx_mcnt && mac->rx_pause == s->rx_pause) {
+		tx_xcnt = (G_TXSPI4SOPCNT(t3_read_reg(adap,
+						A_XGM_TX_SPI4_SOP_EOP_CNT +
+					       	mac->offset)));
+		if (tx_xcnt == 0) {
+			t3_write_reg(adap, A_TP_PIO_ADDR,
+				     A_TP_TX_DROP_CNT_CH0 + macidx(mac));
+			tx_tcnt = (G_TXDROPCNTCH0RCVD(t3_read_reg(adap,
+						      A_TP_PIO_DATA)));
+		} else {
+			goto out;
+		}
+	} else {
+		mac->toggle_cnt = 0;
+		goto out;
+	}
+
+	if ((tx_tcnt != mac->tx_tcnt) && (mac->tx_xcnt == 0)) {
+		if (mac->toggle_cnt > 4) {
+			status = 2;
+			goto out;
+		} else {
+			status = 1;
+			goto out;
+		}
+	} else {
+		mac->toggle_cnt = 0;
+		goto out;
+	}
+
+out:
+	mac->tx_tcnt = tx_tcnt;
+	mac->tx_xcnt = tx_xcnt;
+	mac->tx_mcnt = s->tx_frames;
+	mac->rx_pause = s->rx_pause;
+	if (status == 1) {
+		t3_write_reg(adap, A_XGM_TX_CTRL + mac->offset, 0);
+		t3_read_reg(adap, A_XGM_TX_CTRL + mac->offset);  /* flush */
+		t3_write_reg(adap, A_XGM_TX_CTRL + mac->offset, mac->txen);
+		t3_read_reg(adap, A_XGM_TX_CTRL + mac->offset);  /* flush */
+		mac->toggle_cnt++;
+	} else if (status == 2) {
+		t3b2_mac_reset(mac);
+		mac->toggle_cnt = 0;
+	}
+	return status;
+}
+
+/*
+ * This function is called periodically to accumulate the current values of the
+ * RMON counters into the port statistics.  Since the packet counters are only
+ * 32 bits they can overflow in ~286 secs at 10G, so the function should be
+ * called more frequently than that.  The byte counters are 45-bit wide, they
+ * would overflow in ~7.8 hours.
+ */
+const struct mac_stats *t3_mac_update_stats(struct cmac *mac)
+{
+#define RMON_READ(mac, addr) t3_read_reg(mac->adapter, addr + mac->offset)
+#define RMON_UPDATE(mac, name, reg) \
+	(mac)->stats.name += (u64)RMON_READ(mac, A_XGM_STAT_##reg)
+#define RMON_UPDATE64(mac, name, reg_lo, reg_hi) \
+	(mac)->stats.name += RMON_READ(mac, A_XGM_STAT_##reg_lo) + \
+			     ((u64)RMON_READ(mac, A_XGM_STAT_##reg_hi) << 32)
+
+	u32 v, lo;
+
+	RMON_UPDATE64(mac, rx_octets, RX_BYTES_LOW, RX_BYTES_HIGH);
+	RMON_UPDATE64(mac, rx_frames, RX_FRAMES_LOW, RX_FRAMES_HIGH);
+	RMON_UPDATE(mac, rx_mcast_frames, RX_MCAST_FRAMES);
+	RMON_UPDATE(mac, rx_bcast_frames, RX_BCAST_FRAMES);
+	RMON_UPDATE(mac, rx_fcs_errs, RX_CRC_ERR_FRAMES);
+	RMON_UPDATE(mac, rx_pause, RX_PAUSE_FRAMES);
+	RMON_UPDATE(mac, rx_jabber, RX_JABBER_FRAMES);
+	RMON_UPDATE(mac, rx_short, RX_SHORT_FRAMES);
+	RMON_UPDATE(mac, rx_symbol_errs, RX_SYM_CODE_ERR_FRAMES);
+
+	RMON_UPDATE(mac, rx_too_long, RX_OVERSIZE_FRAMES);
+
+	v = RMON_READ(mac, A_XGM_RX_MAX_PKT_SIZE_ERR_CNT);
+	if (mac->adapter->params.rev == T3_REV_B2)
+		v &= 0x7fffffff;
+	mac->stats.rx_too_long += v;
+
+	RMON_UPDATE(mac, rx_frames_64, RX_64B_FRAMES);
+	RMON_UPDATE(mac, rx_frames_65_127, RX_65_127B_FRAMES);
+	RMON_UPDATE(mac, rx_frames_128_255, RX_128_255B_FRAMES);
+	RMON_UPDATE(mac, rx_frames_256_511, RX_256_511B_FRAMES);
+	RMON_UPDATE(mac, rx_frames_512_1023, RX_512_1023B_FRAMES);
+	RMON_UPDATE(mac, rx_frames_1024_1518, RX_1024_1518B_FRAMES);
+	RMON_UPDATE(mac, rx_frames_1519_max, RX_1519_MAXB_FRAMES);
+
+	RMON_UPDATE64(mac, tx_octets, TX_BYTE_LOW, TX_BYTE_HIGH);
+	RMON_UPDATE64(mac, tx_frames, TX_FRAME_LOW, TX_FRAME_HIGH);
+	RMON_UPDATE(mac, tx_mcast_frames, TX_MCAST);
+	RMON_UPDATE(mac, tx_bcast_frames, TX_BCAST);
+	RMON_UPDATE(mac, tx_pause, TX_PAUSE);
+	/* This counts error frames in general (bad FCS, underrun, etc). */
+	RMON_UPDATE(mac, tx_underrun, TX_ERR_FRAMES);
+
+	RMON_UPDATE(mac, tx_frames_64, TX_64B_FRAMES);
+	RMON_UPDATE(mac, tx_frames_65_127, TX_65_127B_FRAMES);
+	RMON_UPDATE(mac, tx_frames_128_255, TX_128_255B_FRAMES);
+	RMON_UPDATE(mac, tx_frames_256_511, TX_256_511B_FRAMES);
+	RMON_UPDATE(mac, tx_frames_512_1023, TX_512_1023B_FRAMES);
+	RMON_UPDATE(mac, tx_frames_1024_1518, TX_1024_1518B_FRAMES);
+	RMON_UPDATE(mac, tx_frames_1519_max, TX_1519_MAXB_FRAMES);
+
+	/* The next stat isn't clear-on-read. */
+	t3_write_reg(mac->adapter, A_TP_MIB_INDEX, mac->offset ? 51 : 50);
+	v = t3_read_reg(mac->adapter, A_TP_MIB_RDATA);
+	lo = (u32) mac->stats.rx_cong_drops;
+	mac->stats.rx_cong_drops += (u64) (v - lo);
+
+	return &mac->stats;
+}
diff --git a/drivers/net/ethernet/chelsio/cxgb4/Makefile b/drivers/net/ethernet/chelsio/cxgb4/Makefile
new file mode 100644
index 0000000..1df65c9
--- /dev/null
+++ b/drivers/net/ethernet/chelsio/cxgb4/Makefile
@@ -0,0 +1,8 @@
+#
+# Chelsio T4 driver
+#
+
+obj-$(CONFIG_CHELSIO_T4) += cxgb4.o
+
+cxgb4-objs := cxgb4_main.o l2t.o t4_hw.o sge.o
+cxgb4-$(CONFIG_CHELSIO_T4_DCB) +=  cxgb4_dcb.o
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
new file mode 100644
index 0000000..3c481b2
--- /dev/null
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
@@ -0,0 +1,1088 @@
+/*
+ * This file is part of the Chelsio T4 Ethernet driver for Linux.
+ *
+ * Copyright (c) 2003-2014 Chelsio Communications, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __CXGB4_H__
+#define __CXGB4_H__
+
+#include "t4_hw.h"
+
+#include <linux/bitops.h>
+#include <linux/cache.h>
+#include <linux/interrupt.h>
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <linux/pci.h>
+#include <linux/spinlock.h>
+#include <linux/timer.h>
+#include <linux/vmalloc.h>
+#include <asm/io.h>
+#include "cxgb4_uld.h"
+
+#define T4FW_VERSION_MAJOR 0x01
+#define T4FW_VERSION_MINOR 0x0B
+#define T4FW_VERSION_MICRO 0x1B
+#define T4FW_VERSION_BUILD 0x00
+
+#define T5FW_VERSION_MAJOR 0x01
+#define T5FW_VERSION_MINOR 0x0B
+#define T5FW_VERSION_MICRO 0x1B
+#define T5FW_VERSION_BUILD 0x00
+
+#define CH_WARN(adap, fmt, ...) dev_warn(adap->pdev_dev, fmt, ## __VA_ARGS__)
+
+enum {
+	MAX_NPORTS = 4,     /* max # of ports */
+	SERNUM_LEN = 24,    /* Serial # length */
+	EC_LEN     = 16,    /* E/C length */
+	ID_LEN     = 16,    /* ID length */
+	PN_LEN     = 16,    /* Part Number length */
+};
+
+enum {
+	MEM_EDC0,
+	MEM_EDC1,
+	MEM_MC,
+	MEM_MC0 = MEM_MC,
+	MEM_MC1
+};
+
+enum {
+	MEMWIN0_APERTURE = 2048,
+	MEMWIN0_BASE     = 0x1b800,
+	MEMWIN1_APERTURE = 32768,
+	MEMWIN1_BASE     = 0x28000,
+	MEMWIN1_BASE_T5  = 0x52000,
+	MEMWIN2_APERTURE = 65536,
+	MEMWIN2_BASE     = 0x30000,
+	MEMWIN2_APERTURE_T5 = 131072,
+	MEMWIN2_BASE_T5  = 0x60000,
+};
+
+enum dev_master {
+	MASTER_CANT,
+	MASTER_MAY,
+	MASTER_MUST
+};
+
+enum dev_state {
+	DEV_STATE_UNINIT,
+	DEV_STATE_INIT,
+	DEV_STATE_ERR
+};
+
+enum {
+	PAUSE_RX      = 1 << 0,
+	PAUSE_TX      = 1 << 1,
+	PAUSE_AUTONEG = 1 << 2
+};
+
+struct port_stats {
+	u64 tx_octets;            /* total # of octets in good frames */
+	u64 tx_frames;            /* all good frames */
+	u64 tx_bcast_frames;      /* all broadcast frames */
+	u64 tx_mcast_frames;      /* all multicast frames */
+	u64 tx_ucast_frames;      /* all unicast frames */
+	u64 tx_error_frames;      /* all error frames */
+
+	u64 tx_frames_64;         /* # of Tx frames in a particular range */
+	u64 tx_frames_65_127;
+	u64 tx_frames_128_255;
+	u64 tx_frames_256_511;
+	u64 tx_frames_512_1023;
+	u64 tx_frames_1024_1518;
+	u64 tx_frames_1519_max;
+
+	u64 tx_drop;              /* # of dropped Tx frames */
+	u64 tx_pause;             /* # of transmitted pause frames */
+	u64 tx_ppp0;              /* # of transmitted PPP prio 0 frames */
+	u64 tx_ppp1;              /* # of transmitted PPP prio 1 frames */
+	u64 tx_ppp2;              /* # of transmitted PPP prio 2 frames */
+	u64 tx_ppp3;              /* # of transmitted PPP prio 3 frames */
+	u64 tx_ppp4;              /* # of transmitted PPP prio 4 frames */
+	u64 tx_ppp5;              /* # of transmitted PPP prio 5 frames */
+	u64 tx_ppp6;              /* # of transmitted PPP prio 6 frames */
+	u64 tx_ppp7;              /* # of transmitted PPP prio 7 frames */
+
+	u64 rx_octets;            /* total # of octets in good frames */
+	u64 rx_frames;            /* all good frames */
+	u64 rx_bcast_frames;      /* all broadcast frames */
+	u64 rx_mcast_frames;      /* all multicast frames */
+	u64 rx_ucast_frames;      /* all unicast frames */
+	u64 rx_too_long;          /* # of frames exceeding MTU */
+	u64 rx_jabber;            /* # of jabber frames */
+	u64 rx_fcs_err;           /* # of received frames with bad FCS */
+	u64 rx_len_err;           /* # of received frames with length error */
+	u64 rx_symbol_err;        /* symbol errors */
+	u64 rx_runt;              /* # of short frames */
+
+	u64 rx_frames_64;         /* # of Rx frames in a particular range */
+	u64 rx_frames_65_127;
+	u64 rx_frames_128_255;
+	u64 rx_frames_256_511;
+	u64 rx_frames_512_1023;
+	u64 rx_frames_1024_1518;
+	u64 rx_frames_1519_max;
+
+	u64 rx_pause;             /* # of received pause frames */
+	u64 rx_ppp0;              /* # of received PPP prio 0 frames */
+	u64 rx_ppp1;              /* # of received PPP prio 1 frames */
+	u64 rx_ppp2;              /* # of received PPP prio 2 frames */
+	u64 rx_ppp3;              /* # of received PPP prio 3 frames */
+	u64 rx_ppp4;              /* # of received PPP prio 4 frames */
+	u64 rx_ppp5;              /* # of received PPP prio 5 frames */
+	u64 rx_ppp6;              /* # of received PPP prio 6 frames */
+	u64 rx_ppp7;              /* # of received PPP prio 7 frames */
+
+	u64 rx_ovflow0;           /* drops due to buffer-group 0 overflows */
+	u64 rx_ovflow1;           /* drops due to buffer-group 1 overflows */
+	u64 rx_ovflow2;           /* drops due to buffer-group 2 overflows */
+	u64 rx_ovflow3;           /* drops due to buffer-group 3 overflows */
+	u64 rx_trunc0;            /* buffer-group 0 truncated packets */
+	u64 rx_trunc1;            /* buffer-group 1 truncated packets */
+	u64 rx_trunc2;            /* buffer-group 2 truncated packets */
+	u64 rx_trunc3;            /* buffer-group 3 truncated packets */
+};
+
+struct lb_port_stats {
+	u64 octets;
+	u64 frames;
+	u64 bcast_frames;
+	u64 mcast_frames;
+	u64 ucast_frames;
+	u64 error_frames;
+
+	u64 frames_64;
+	u64 frames_65_127;
+	u64 frames_128_255;
+	u64 frames_256_511;
+	u64 frames_512_1023;
+	u64 frames_1024_1518;
+	u64 frames_1519_max;
+
+	u64 drop;
+
+	u64 ovflow0;
+	u64 ovflow1;
+	u64 ovflow2;
+	u64 ovflow3;
+	u64 trunc0;
+	u64 trunc1;
+	u64 trunc2;
+	u64 trunc3;
+};
+
+struct tp_tcp_stats {
+	u32 tcpOutRsts;
+	u64 tcpInSegs;
+	u64 tcpOutSegs;
+	u64 tcpRetransSegs;
+};
+
+struct tp_err_stats {
+	u32 macInErrs[4];
+	u32 hdrInErrs[4];
+	u32 tcpInErrs[4];
+	u32 tnlCongDrops[4];
+	u32 ofldChanDrops[4];
+	u32 tnlTxDrops[4];
+	u32 ofldVlanDrops[4];
+	u32 tcp6InErrs[4];
+	u32 ofldNoNeigh;
+	u32 ofldCongDefer;
+};
+
+struct tp_params {
+	unsigned int ntxchan;        /* # of Tx channels */
+	unsigned int tre;            /* log2 of core clocks per TP tick */
+	unsigned short tx_modq_map;  /* TX modulation scheduler queue to */
+				     /* channel map */
+
+	uint32_t dack_re;            /* DACK timer resolution */
+	unsigned short tx_modq[NCHAN];	/* channel to modulation queue map */
+
+	u32 vlan_pri_map;               /* cached TP_VLAN_PRI_MAP */
+	u32 ingress_config;             /* cached TP_INGRESS_CONFIG */
+
+	/* TP_VLAN_PRI_MAP Compressed Filter Tuple field offsets.  This is a
+	 * subset of the set of fields which may be present in the Compressed
+	 * Filter Tuple portion of filters and TCP TCB connections.  The
+	 * fields which are present are controlled by the TP_VLAN_PRI_MAP.
+	 * Since a variable number of fields may or may not be present, their
+	 * shifted field positions within the Compressed Filter Tuple may
+	 * vary, or not even be present if the field isn't selected in
+	 * TP_VLAN_PRI_MAP.  Since some of these fields are needed in various
+	 * places we store their offsets here, or a -1 if the field isn't
+	 * present.
+	 */
+	int vlan_shift;
+	int vnic_shift;
+	int port_shift;
+	int protocol_shift;
+};
+
+struct vpd_params {
+	unsigned int cclk;
+	u8 ec[EC_LEN + 1];
+	u8 sn[SERNUM_LEN + 1];
+	u8 id[ID_LEN + 1];
+	u8 pn[PN_LEN + 1];
+};
+
+struct pci_params {
+	unsigned char speed;
+	unsigned char width;
+};
+
+#define CHELSIO_CHIP_CODE(version, revision) (((version) << 4) | (revision))
+#define CHELSIO_CHIP_FPGA          0x100
+#define CHELSIO_CHIP_VERSION(code) (((code) >> 4) & 0xf)
+#define CHELSIO_CHIP_RELEASE(code) ((code) & 0xf)
+
+#define CHELSIO_T4		0x4
+#define CHELSIO_T5		0x5
+
+enum chip_type {
+	T4_A1 = CHELSIO_CHIP_CODE(CHELSIO_T4, 1),
+	T4_A2 = CHELSIO_CHIP_CODE(CHELSIO_T4, 2),
+	T4_FIRST_REV	= T4_A1,
+	T4_LAST_REV	= T4_A2,
+
+	T5_A0 = CHELSIO_CHIP_CODE(CHELSIO_T5, 0),
+	T5_A1 = CHELSIO_CHIP_CODE(CHELSIO_T5, 1),
+	T5_FIRST_REV	= T5_A0,
+	T5_LAST_REV	= T5_A1,
+};
+
+struct adapter_params {
+	struct tp_params  tp;
+	struct vpd_params vpd;
+	struct pci_params pci;
+
+	unsigned int sf_size;             /* serial flash size in bytes */
+	unsigned int sf_nsec;             /* # of flash sectors */
+	unsigned int sf_fw_start;         /* start of FW image in flash */
+
+	unsigned int fw_vers;
+	unsigned int tp_vers;
+	u8 api_vers[7];
+
+	unsigned short mtus[NMTUS];
+	unsigned short a_wnd[NCCTRL_WIN];
+	unsigned short b_wnd[NCCTRL_WIN];
+
+	unsigned char nports;             /* # of ethernet ports */
+	unsigned char portvec;
+	enum chip_type chip;               /* chip code */
+	unsigned char offload;
+
+	unsigned char bypass;
+
+	unsigned int ofldq_wr_cred;
+	bool ulptx_memwrite_dsgl;          /* use of T5 DSGL allowed */
+
+	unsigned int max_ordird_qp;       /* Max read depth per RDMA QP */
+	unsigned int max_ird_adapter;     /* Max read depth per adapter */
+};
+
+#include "t4fw_api.h"
+
+#define FW_VERSION(chip) ( \
+		FW_HDR_FW_VER_MAJOR_GET(chip##FW_VERSION_MAJOR) | \
+		FW_HDR_FW_VER_MINOR_GET(chip##FW_VERSION_MINOR) | \
+		FW_HDR_FW_VER_MICRO_GET(chip##FW_VERSION_MICRO) | \
+		FW_HDR_FW_VER_BUILD_GET(chip##FW_VERSION_BUILD))
+#define FW_INTFVER(chip, intf) (FW_HDR_INTFVER_##intf)
+
+struct fw_info {
+	u8 chip;
+	char *fs_name;
+	char *fw_mod_name;
+	struct fw_hdr fw_hdr;
+};
+
+
+struct trace_params {
+	u32 data[TRACE_LEN / 4];
+	u32 mask[TRACE_LEN / 4];
+	unsigned short snap_len;
+	unsigned short min_len;
+	unsigned char skip_ofst;
+	unsigned char skip_len;
+	unsigned char invert;
+	unsigned char port;
+};
+
+struct link_config {
+	unsigned short supported;        /* link capabilities */
+	unsigned short advertising;      /* advertised capabilities */
+	unsigned short requested_speed;  /* speed user has requested */
+	unsigned short speed;            /* actual link speed */
+	unsigned char  requested_fc;     /* flow control user has requested */
+	unsigned char  fc;               /* actual link flow control */
+	unsigned char  autoneg;          /* autonegotiating? */
+	unsigned char  link_ok;          /* link up? */
+};
+
+#define FW_LEN16(fw_struct) FW_CMD_LEN16(sizeof(fw_struct) / 16)
+
+enum {
+	MAX_ETH_QSETS = 32,           /* # of Ethernet Tx/Rx queue sets */
+	MAX_OFLD_QSETS = 16,          /* # of offload Tx/Rx queue sets */
+	MAX_CTRL_QUEUES = NCHAN,      /* # of control Tx queues */
+	MAX_RDMA_QUEUES = NCHAN,      /* # of streaming RDMA Rx queues */
+	MAX_RDMA_CIQS = NCHAN,        /* # of  RDMA concentrator IQs */
+	MAX_ISCSI_QUEUES = NCHAN,     /* # of streaming iSCSI Rx queues */
+};
+
+enum {
+	INGQ_EXTRAS = 2,        /* firmware event queue and */
+				/*   forwarded interrupts */
+	MAX_EGRQ = MAX_ETH_QSETS*2 + MAX_OFLD_QSETS*2
+		   + MAX_CTRL_QUEUES + MAX_RDMA_QUEUES + MAX_ISCSI_QUEUES,
+	MAX_INGQ = MAX_ETH_QSETS + MAX_OFLD_QSETS + MAX_RDMA_QUEUES
+		   + MAX_RDMA_CIQS + MAX_ISCSI_QUEUES + INGQ_EXTRAS,
+};
+
+struct adapter;
+struct sge_rspq;
+
+#include "cxgb4_dcb.h"
+
+struct port_info {
+	struct adapter *adapter;
+	u16    viid;
+	s16    xact_addr_filt;        /* index of exact MAC address filter */
+	u16    rss_size;              /* size of VI's RSS table slice */
+	s8     mdio_addr;
+	u8     port_type;
+	u8     mod_type;
+	u8     port_id;
+	u8     tx_chan;
+	u8     lport;                 /* associated offload logical port */
+	u8     nqsets;                /* # of qsets */
+	u8     first_qset;            /* index of first qset */
+	u8     rss_mode;
+	struct link_config link_cfg;
+	u16   *rss;
+#ifdef CONFIG_CHELSIO_T4_DCB
+	struct port_dcb_info dcb;     /* Data Center Bridging support */
+#endif
+};
+
+struct dentry;
+struct work_struct;
+
+enum {                                 /* adapter flags */
+	FULL_INIT_DONE     = (1 << 0),
+	DEV_ENABLED        = (1 << 1),
+	USING_MSI          = (1 << 2),
+	USING_MSIX         = (1 << 3),
+	FW_OK              = (1 << 4),
+	RSS_TNLALLLOOKUP   = (1 << 5),
+	USING_SOFT_PARAMS  = (1 << 6),
+	MASTER_PF          = (1 << 7),
+	FW_OFLD_CONN       = (1 << 9),
+};
+
+struct rx_sw_desc;
+
+struct sge_fl {                     /* SGE free-buffer queue state */
+	unsigned int avail;         /* # of available Rx buffers */
+	unsigned int pend_cred;     /* new buffers since last FL DB ring */
+	unsigned int cidx;          /* consumer index */
+	unsigned int pidx;          /* producer index */
+	unsigned long alloc_failed; /* # of times buffer allocation failed */
+	unsigned long large_alloc_failed;
+	unsigned long starving;
+	/* RO fields */
+	unsigned int cntxt_id;      /* SGE context id for the free list */
+	unsigned int size;          /* capacity of free list */
+	struct rx_sw_desc *sdesc;   /* address of SW Rx descriptor ring */
+	__be64 *desc;               /* address of HW Rx descriptor ring */
+	dma_addr_t addr;            /* bus address of HW ring start */
+	u64 udb;                    /* BAR2 offset of User Doorbell area */
+};
+
+/* A packet gather list */
+struct pkt_gl {
+	struct page_frag frags[MAX_SKB_FRAGS];
+	void *va;                         /* virtual address of first byte */
+	unsigned int nfrags;              /* # of fragments */
+	unsigned int tot_len;             /* total length of fragments */
+};
+
+typedef int (*rspq_handler_t)(struct sge_rspq *q, const __be64 *rsp,
+			      const struct pkt_gl *gl);
+
+struct sge_rspq {                   /* state for an SGE response queue */
+	struct napi_struct napi;
+	const __be64 *cur_desc;     /* current descriptor in queue */
+	unsigned int cidx;          /* consumer index */
+	u8 gen;                     /* current generation bit */
+	u8 intr_params;             /* interrupt holdoff parameters */
+	u8 next_intr_params;        /* holdoff params for next interrupt */
+	u8 adaptive_rx;
+	u8 pktcnt_idx;              /* interrupt packet threshold */
+	u8 uld;                     /* ULD handling this queue */
+	u8 idx;                     /* queue index within its group */
+	int offset;                 /* offset into current Rx buffer */
+	u16 cntxt_id;               /* SGE context id for the response q */
+	u16 abs_id;                 /* absolute SGE id for the response q */
+	__be64 *desc;               /* address of HW response ring */
+	dma_addr_t phys_addr;       /* physical address of the ring */
+	u64 udb;                    /* BAR2 offset of User Doorbell area */
+	unsigned int iqe_len;       /* entry size */
+	unsigned int size;          /* capacity of response queue */
+	struct adapter *adap;
+	struct net_device *netdev;  /* associated net device */
+	rspq_handler_t handler;
+};
+
+struct sge_eth_stats {              /* Ethernet queue statistics */
+	unsigned long pkts;         /* # of ethernet packets */
+	unsigned long lro_pkts;     /* # of LRO super packets */
+	unsigned long lro_merged;   /* # of wire packets merged by LRO */
+	unsigned long rx_cso;       /* # of Rx checksum offloads */
+	unsigned long vlan_ex;      /* # of Rx VLAN extractions */
+	unsigned long rx_drops;     /* # of packets dropped due to no mem */
+};
+
+struct sge_eth_rxq {                /* SW Ethernet Rx queue */
+	struct sge_rspq rspq;
+	struct sge_fl fl;
+	struct sge_eth_stats stats;
+} ____cacheline_aligned_in_smp;
+
+struct sge_ofld_stats {             /* offload queue statistics */
+	unsigned long pkts;         /* # of packets */
+	unsigned long imm;          /* # of immediate-data packets */
+	unsigned long an;           /* # of asynchronous notifications */
+	unsigned long nomem;        /* # of responses deferred due to no mem */
+};
+
+struct sge_ofld_rxq {               /* SW offload Rx queue */
+	struct sge_rspq rspq;
+	struct sge_fl fl;
+	struct sge_ofld_stats stats;
+} ____cacheline_aligned_in_smp;
+
+struct tx_desc {
+	__be64 flit[8];
+};
+
+struct tx_sw_desc;
+
+struct sge_txq {
+	unsigned int  in_use;       /* # of in-use Tx descriptors */
+	unsigned int  size;         /* # of descriptors */
+	unsigned int  cidx;         /* SW consumer index */
+	unsigned int  pidx;         /* producer index */
+	unsigned long stops;        /* # of times q has been stopped */
+	unsigned long restarts;     /* # of queue restarts */
+	unsigned int  cntxt_id;     /* SGE context id for the Tx q */
+	struct tx_desc *desc;       /* address of HW Tx descriptor ring */
+	struct tx_sw_desc *sdesc;   /* address of SW Tx descriptor ring */
+	struct sge_qstat *stat;     /* queue status entry */
+	dma_addr_t    phys_addr;    /* physical address of the ring */
+	spinlock_t db_lock;
+	int db_disabled;
+	unsigned short db_pidx;
+	unsigned short db_pidx_inc;
+	u64 udb;                    /* BAR2 offset of User Doorbell area */
+};
+
+struct sge_eth_txq {                /* state for an SGE Ethernet Tx queue */
+	struct sge_txq q;
+	struct netdev_queue *txq;   /* associated netdev TX queue */
+#ifdef CONFIG_CHELSIO_T4_DCB
+	u8 dcb_prio;		    /* DCB Priority bound to queue */
+#endif
+	unsigned long tso;          /* # of TSO requests */
+	unsigned long tx_cso;       /* # of Tx checksum offloads */
+	unsigned long vlan_ins;     /* # of Tx VLAN insertions */
+	unsigned long mapping_err;  /* # of I/O MMU packet mapping errors */
+} ____cacheline_aligned_in_smp;
+
+struct sge_ofld_txq {               /* state for an SGE offload Tx queue */
+	struct sge_txq q;
+	struct adapter *adap;
+	struct sk_buff_head sendq;  /* list of backpressured packets */
+	struct tasklet_struct qresume_tsk; /* restarts the queue */
+	u8 full;                    /* the Tx ring is full */
+	unsigned long mapping_err;  /* # of I/O MMU packet mapping errors */
+} ____cacheline_aligned_in_smp;
+
+struct sge_ctrl_txq {               /* state for an SGE control Tx queue */
+	struct sge_txq q;
+	struct adapter *adap;
+	struct sk_buff_head sendq;  /* list of backpressured packets */
+	struct tasklet_struct qresume_tsk; /* restarts the queue */
+	u8 full;                    /* the Tx ring is full */
+} ____cacheline_aligned_in_smp;
+
+struct sge {
+	struct sge_eth_txq ethtxq[MAX_ETH_QSETS];
+	struct sge_ofld_txq ofldtxq[MAX_OFLD_QSETS];
+	struct sge_ctrl_txq ctrlq[MAX_CTRL_QUEUES];
+
+	struct sge_eth_rxq ethrxq[MAX_ETH_QSETS];
+	struct sge_ofld_rxq ofldrxq[MAX_OFLD_QSETS];
+	struct sge_ofld_rxq rdmarxq[MAX_RDMA_QUEUES];
+	struct sge_ofld_rxq rdmaciq[MAX_RDMA_CIQS];
+	struct sge_rspq fw_evtq ____cacheline_aligned_in_smp;
+
+	struct sge_rspq intrq ____cacheline_aligned_in_smp;
+	spinlock_t intrq_lock;
+
+	u16 max_ethqsets;           /* # of available Ethernet queue sets */
+	u16 ethqsets;               /* # of active Ethernet queue sets */
+	u16 ethtxq_rover;           /* Tx queue to clean up next */
+	u16 ofldqsets;              /* # of active offload queue sets */
+	u16 rdmaqs;                 /* # of available RDMA Rx queues */
+	u16 rdmaciqs;               /* # of available RDMA concentrator IQs */
+	u16 ofld_rxq[MAX_OFLD_QSETS];
+	u16 rdma_rxq[NCHAN];
+	u16 rdma_ciq[NCHAN];
+	u16 timer_val[SGE_NTIMERS];
+	u8 counter_val[SGE_NCOUNTERS];
+	u32 fl_pg_order;            /* large page allocation size */
+	u32 stat_len;               /* length of status page at ring end */
+	u32 pktshift;               /* padding between CPL & packet data */
+	u32 fl_align;               /* response queue message alignment */
+	u32 fl_starve_thres;        /* Free List starvation threshold */
+
+	/* State variables for detecting an SGE Ingress DMA hang */
+	unsigned int idma_1s_thresh;/* SGE same State Counter 1s threshold */
+	unsigned int idma_stalled[2];/* SGE synthesized stalled timers in HZ */
+	unsigned int idma_state[2]; /* SGE IDMA Hang detect state */
+	unsigned int idma_qid[2];   /* SGE IDMA Hung Ingress Queue ID */
+
+	unsigned int egr_start;
+	unsigned int ingr_start;
+	void *egr_map[MAX_EGRQ];    /* qid->queue egress queue map */
+	struct sge_rspq *ingr_map[MAX_INGQ]; /* qid->queue ingress queue map */
+	DECLARE_BITMAP(starving_fl, MAX_EGRQ);
+	DECLARE_BITMAP(txq_maperr, MAX_EGRQ);
+	struct timer_list rx_timer; /* refills starving FLs */
+	struct timer_list tx_timer; /* checks Tx queues */
+};
+
+#define for_each_ethrxq(sge, i) for (i = 0; i < (sge)->ethqsets; i++)
+#define for_each_ofldrxq(sge, i) for (i = 0; i < (sge)->ofldqsets; i++)
+#define for_each_rdmarxq(sge, i) for (i = 0; i < (sge)->rdmaqs; i++)
+#define for_each_rdmaciq(sge, i) for (i = 0; i < (sge)->rdmaciqs; i++)
+
+struct l2t_data;
+
+#ifdef CONFIG_PCI_IOV
+
+/* T4 supports SRIOV on PF0-3 and T5 on PF0-7.  However, the Serial
+ * Configuration initialization for T5 only has SR-IOV functionality enabled
+ * on PF0-3 in order to simplify everything.
+ */
+#define NUM_OF_PF_WITH_SRIOV 4
+
+#endif
+
+struct adapter {
+	void __iomem *regs;
+	void __iomem *bar2;
+	u32 t4_bar0;
+	struct pci_dev *pdev;
+	struct device *pdev_dev;
+	unsigned int mbox;
+	unsigned int fn;
+	unsigned int flags;
+	enum chip_type chip;
+
+	int msg_enable;
+
+	struct adapter_params params;
+	struct cxgb4_virt_res vres;
+	unsigned int swintr;
+
+	unsigned int wol;
+
+	struct {
+		unsigned short vec;
+		char desc[IFNAMSIZ + 10];
+	} msix_info[MAX_INGQ + 1];
+
+	struct sge sge;
+
+	struct net_device *port[MAX_NPORTS];
+	u8 chan_map[NCHAN];                   /* channel -> port map */
+
+	u32 filter_mode;
+	unsigned int l2t_start;
+	unsigned int l2t_end;
+	struct l2t_data *l2t;
+	void *uld_handle[CXGB4_ULD_MAX];
+	struct list_head list_node;
+	struct list_head rcu_node;
+
+	struct tid_info tids;
+	void **tid_release_head;
+	spinlock_t tid_release_lock;
+	struct workqueue_struct *workq;
+	struct work_struct tid_release_task;
+	struct work_struct db_full_task;
+	struct work_struct db_drop_task;
+	bool tid_release_task_busy;
+
+	struct dentry *debugfs_root;
+
+	spinlock_t stats_lock;
+	spinlock_t win0_lock ____cacheline_aligned_in_smp;
+};
+
+/* Defined bit width of user definable filter tuples
+ */
+#define ETHTYPE_BITWIDTH 16
+#define FRAG_BITWIDTH 1
+#define MACIDX_BITWIDTH 9
+#define FCOE_BITWIDTH 1
+#define IPORT_BITWIDTH 3
+#define MATCHTYPE_BITWIDTH 3
+#define PROTO_BITWIDTH 8
+#define TOS_BITWIDTH 8
+#define PF_BITWIDTH 8
+#define VF_BITWIDTH 8
+#define IVLAN_BITWIDTH 16
+#define OVLAN_BITWIDTH 16
+
+/* Filter matching rules.  These consist of a set of ingress packet field
+ * (value, mask) tuples.  The associated ingress packet field matches the
+ * tuple when ((field & mask) == value).  (Thus a wildcard "don't care" field
+ * rule can be constructed by specifying a tuple of (0, 0).)  A filter rule
+ * matches an ingress packet when all of the individual individual field
+ * matching rules are true.
+ *
+ * Partial field masks are always valid, however, while it may be easy to
+ * understand their meanings for some fields (e.g. IP address to match a
+ * subnet), for others making sensible partial masks is less intuitive (e.g.
+ * MPS match type) ...
+ *
+ * Most of the following data structures are modeled on T4 capabilities.
+ * Drivers for earlier chips use the subsets which make sense for those chips.
+ * We really need to come up with a hardware-independent mechanism to
+ * represent hardware filter capabilities ...
+ */
+struct ch_filter_tuple {
+	/* Compressed header matching field rules.  The TP_VLAN_PRI_MAP
+	 * register selects which of these fields will participate in the
+	 * filter match rules -- up to a maximum of 36 bits.  Because
+	 * TP_VLAN_PRI_MAP is a global register, all filters must use the same
+	 * set of fields.
+	 */
+	uint32_t ethtype:ETHTYPE_BITWIDTH;      /* Ethernet type */
+	uint32_t frag:FRAG_BITWIDTH;            /* IP fragmentation header */
+	uint32_t ivlan_vld:1;                   /* inner VLAN valid */
+	uint32_t ovlan_vld:1;                   /* outer VLAN valid */
+	uint32_t pfvf_vld:1;                    /* PF/VF valid */
+	uint32_t macidx:MACIDX_BITWIDTH;        /* exact match MAC index */
+	uint32_t fcoe:FCOE_BITWIDTH;            /* FCoE packet */
+	uint32_t iport:IPORT_BITWIDTH;          /* ingress port */
+	uint32_t matchtype:MATCHTYPE_BITWIDTH;  /* MPS match type */
+	uint32_t proto:PROTO_BITWIDTH;          /* protocol type */
+	uint32_t tos:TOS_BITWIDTH;              /* TOS/Traffic Type */
+	uint32_t pf:PF_BITWIDTH;                /* PCI-E PF ID */
+	uint32_t vf:VF_BITWIDTH;                /* PCI-E VF ID */
+	uint32_t ivlan:IVLAN_BITWIDTH;          /* inner VLAN */
+	uint32_t ovlan:OVLAN_BITWIDTH;          /* outer VLAN */
+
+	/* Uncompressed header matching field rules.  These are always
+	 * available for field rules.
+	 */
+	uint8_t lip[16];        /* local IP address (IPv4 in [3:0]) */
+	uint8_t fip[16];        /* foreign IP address (IPv4 in [3:0]) */
+	uint16_t lport;         /* local port */
+	uint16_t fport;         /* foreign port */
+};
+
+/* A filter ioctl command.
+ */
+struct ch_filter_specification {
+	/* Administrative fields for filter.
+	 */
+	uint32_t hitcnts:1;     /* count filter hits in TCB */
+	uint32_t prio:1;        /* filter has priority over active/server */
+
+	/* Fundamental filter typing.  This is the one element of filter
+	 * matching that doesn't exist as a (value, mask) tuple.
+	 */
+	uint32_t type:1;        /* 0 => IPv4, 1 => IPv6 */
+
+	/* Packet dispatch information.  Ingress packets which match the
+	 * filter rules will be dropped, passed to the host or switched back
+	 * out as egress packets.
+	 */
+	uint32_t action:2;      /* drop, pass, switch */
+
+	uint32_t rpttid:1;      /* report TID in RSS hash field */
+
+	uint32_t dirsteer:1;    /* 0 => RSS, 1 => steer to iq */
+	uint32_t iq:10;         /* ingress queue */
+
+	uint32_t maskhash:1;    /* dirsteer=0: store RSS hash in TCB */
+	uint32_t dirsteerhash:1;/* dirsteer=1: 0 => TCB contains RSS hash */
+				/*             1 => TCB contains IQ ID */
+
+	/* Switch proxy/rewrite fields.  An ingress packet which matches a
+	 * filter with "switch" set will be looped back out as an egress
+	 * packet -- potentially with some Ethernet header rewriting.
+	 */
+	uint32_t eport:2;       /* egress port to switch packet out */
+	uint32_t newdmac:1;     /* rewrite destination MAC address */
+	uint32_t newsmac:1;     /* rewrite source MAC address */
+	uint32_t newvlan:2;     /* rewrite VLAN Tag */
+	uint8_t dmac[ETH_ALEN]; /* new destination MAC address */
+	uint8_t smac[ETH_ALEN]; /* new source MAC address */
+	uint16_t vlan;          /* VLAN Tag to insert */
+
+	/* Filter rule value/mask pairs.
+	 */
+	struct ch_filter_tuple val;
+	struct ch_filter_tuple mask;
+};
+
+enum {
+	FILTER_PASS = 0,        /* default */
+	FILTER_DROP,
+	FILTER_SWITCH
+};
+
+enum {
+	VLAN_NOCHANGE = 0,      /* default */
+	VLAN_REMOVE,
+	VLAN_INSERT,
+	VLAN_REWRITE
+};
+
+static inline int is_t5(enum chip_type chip)
+{
+	return CHELSIO_CHIP_VERSION(chip) == CHELSIO_T5;
+}
+
+static inline int is_t4(enum chip_type chip)
+{
+	return CHELSIO_CHIP_VERSION(chip) == CHELSIO_T4;
+}
+
+static inline u32 t4_read_reg(struct adapter *adap, u32 reg_addr)
+{
+	return readl(adap->regs + reg_addr);
+}
+
+static inline void t4_write_reg(struct adapter *adap, u32 reg_addr, u32 val)
+{
+	writel(val, adap->regs + reg_addr);
+}
+
+#ifndef readq
+static inline u64 readq(const volatile void __iomem *addr)
+{
+	return readl(addr) + ((u64)readl(addr + 4) << 32);
+}
+
+static inline void writeq(u64 val, volatile void __iomem *addr)
+{
+	writel(val, addr);
+	writel(val >> 32, addr + 4);
+}
+#endif
+
+static inline u64 t4_read_reg64(struct adapter *adap, u32 reg_addr)
+{
+	return readq(adap->regs + reg_addr);
+}
+
+static inline void t4_write_reg64(struct adapter *adap, u32 reg_addr, u64 val)
+{
+	writeq(val, adap->regs + reg_addr);
+}
+
+/**
+ * netdev2pinfo - return the port_info structure associated with a net_device
+ * @dev: the netdev
+ *
+ * Return the struct port_info associated with a net_device
+ */
+static inline struct port_info *netdev2pinfo(const struct net_device *dev)
+{
+	return netdev_priv(dev);
+}
+
+/**
+ * adap2pinfo - return the port_info of a port
+ * @adap: the adapter
+ * @idx: the port index
+ *
+ * Return the port_info structure for the port of the given index.
+ */
+static inline struct port_info *adap2pinfo(struct adapter *adap, int idx)
+{
+	return netdev_priv(adap->port[idx]);
+}
+
+/**
+ * netdev2adap - return the adapter structure associated with a net_device
+ * @dev: the netdev
+ *
+ * Return the struct adapter associated with a net_device
+ */
+static inline struct adapter *netdev2adap(const struct net_device *dev)
+{
+	return netdev2pinfo(dev)->adapter;
+}
+
+void t4_os_portmod_changed(const struct adapter *adap, int port_id);
+void t4_os_link_changed(struct adapter *adap, int port_id, int link_stat);
+
+void *t4_alloc_mem(size_t size);
+
+void t4_free_sge_resources(struct adapter *adap);
+void t4_free_ofld_rxqs(struct adapter *adap, int n, struct sge_ofld_rxq *q);
+irq_handler_t t4_intr_handler(struct adapter *adap);
+netdev_tx_t t4_eth_xmit(struct sk_buff *skb, struct net_device *dev);
+int t4_ethrx_handler(struct sge_rspq *q, const __be64 *rsp,
+		     const struct pkt_gl *gl);
+int t4_mgmt_tx(struct adapter *adap, struct sk_buff *skb);
+int t4_ofld_send(struct adapter *adap, struct sk_buff *skb);
+int t4_sge_alloc_rxq(struct adapter *adap, struct sge_rspq *iq, bool fwevtq,
+		     struct net_device *dev, int intr_idx,
+		     struct sge_fl *fl, rspq_handler_t hnd);
+int t4_sge_alloc_eth_txq(struct adapter *adap, struct sge_eth_txq *txq,
+			 struct net_device *dev, struct netdev_queue *netdevq,
+			 unsigned int iqid);
+int t4_sge_alloc_ctrl_txq(struct adapter *adap, struct sge_ctrl_txq *txq,
+			  struct net_device *dev, unsigned int iqid,
+			  unsigned int cmplqid);
+int t4_sge_alloc_ofld_txq(struct adapter *adap, struct sge_ofld_txq *txq,
+			  struct net_device *dev, unsigned int iqid);
+irqreturn_t t4_sge_intr_msix(int irq, void *cookie);
+int t4_sge_init(struct adapter *adap);
+void t4_sge_start(struct adapter *adap);
+void t4_sge_stop(struct adapter *adap);
+extern int dbfifo_int_thresh;
+
+#define for_each_port(adapter, iter) \
+	for (iter = 0; iter < (adapter)->params.nports; ++iter)
+
+static inline int is_bypass(struct adapter *adap)
+{
+	return adap->params.bypass;
+}
+
+static inline int is_bypass_device(int device)
+{
+	/* this should be set based upon device capabilities */
+	switch (device) {
+	case 0x440b:
+	case 0x440c:
+		return 1;
+	default:
+		return 0;
+	}
+}
+
+static inline unsigned int core_ticks_per_usec(const struct adapter *adap)
+{
+	return adap->params.vpd.cclk / 1000;
+}
+
+static inline unsigned int us_to_core_ticks(const struct adapter *adap,
+					    unsigned int us)
+{
+	return (us * adap->params.vpd.cclk) / 1000;
+}
+
+static inline unsigned int core_ticks_to_us(const struct adapter *adapter,
+					    unsigned int ticks)
+{
+	/* add Core Clock / 2 to round ticks to nearest uS */
+	return ((ticks * 1000 + adapter->params.vpd.cclk/2) /
+		adapter->params.vpd.cclk);
+}
+
+void t4_set_reg_field(struct adapter *adap, unsigned int addr, u32 mask,
+		      u32 val);
+
+int t4_wr_mbox_meat(struct adapter *adap, int mbox, const void *cmd, int size,
+		    void *rpl, bool sleep_ok);
+
+static inline int t4_wr_mbox(struct adapter *adap, int mbox, const void *cmd,
+			     int size, void *rpl)
+{
+	return t4_wr_mbox_meat(adap, mbox, cmd, size, rpl, true);
+}
+
+static inline int t4_wr_mbox_ns(struct adapter *adap, int mbox, const void *cmd,
+				int size, void *rpl)
+{
+	return t4_wr_mbox_meat(adap, mbox, cmd, size, rpl, false);
+}
+
+void t4_write_indirect(struct adapter *adap, unsigned int addr_reg,
+		       unsigned int data_reg, const u32 *vals,
+		       unsigned int nregs, unsigned int start_idx);
+void t4_read_indirect(struct adapter *adap, unsigned int addr_reg,
+		      unsigned int data_reg, u32 *vals, unsigned int nregs,
+		      unsigned int start_idx);
+void t4_hw_pci_read_cfg4(struct adapter *adapter, int reg, u32 *val);
+
+struct fw_filter_wr;
+
+void t4_intr_enable(struct adapter *adapter);
+void t4_intr_disable(struct adapter *adapter);
+int t4_slow_intr_handler(struct adapter *adapter);
+
+int t4_wait_dev_ready(void __iomem *regs);
+int t4_link_start(struct adapter *adap, unsigned int mbox, unsigned int port,
+		  struct link_config *lc);
+int t4_restart_aneg(struct adapter *adap, unsigned int mbox, unsigned int port);
+
+#define T4_MEMORY_WRITE	0
+#define T4_MEMORY_READ	1
+int t4_memory_rw(struct adapter *adap, int win, int mtype, u32 addr, u32 len,
+		 __be32 *buf, int dir);
+static inline int t4_memory_write(struct adapter *adap, int mtype, u32 addr,
+				  u32 len, __be32 *buf)
+{
+	return t4_memory_rw(adap, 0, mtype, addr, len, buf, 0);
+}
+
+int t4_seeprom_wp(struct adapter *adapter, bool enable);
+int get_vpd_params(struct adapter *adapter, struct vpd_params *p);
+int t4_load_fw(struct adapter *adapter, const u8 *fw_data, unsigned int size);
+int t4_fw_upgrade(struct adapter *adap, unsigned int mbox,
+		  const u8 *fw_data, unsigned int size, int force);
+unsigned int t4_flash_cfg_addr(struct adapter *adapter);
+int t4_get_fw_version(struct adapter *adapter, u32 *vers);
+int t4_get_tp_version(struct adapter *adapter, u32 *vers);
+int t4_prep_fw(struct adapter *adap, struct fw_info *fw_info,
+	       const u8 *fw_data, unsigned int fw_size,
+	       struct fw_hdr *card_fw, enum dev_state state, int *reset);
+int t4_prep_adapter(struct adapter *adapter);
+int t4_init_tp_params(struct adapter *adap);
+int t4_filter_field_shift(const struct adapter *adap, int filter_sel);
+int t4_port_init(struct adapter *adap, int mbox, int pf, int vf);
+void t4_fatal_err(struct adapter *adapter);
+int t4_config_rss_range(struct adapter *adapter, int mbox, unsigned int viid,
+			int start, int n, const u16 *rspq, unsigned int nrspq);
+int t4_config_glbl_rss(struct adapter *adapter, int mbox, unsigned int mode,
+		       unsigned int flags);
+int t4_mc_read(struct adapter *adap, int idx, u32 addr, __be32 *data,
+	       u64 *parity);
+int t4_edc_read(struct adapter *adap, int idx, u32 addr, __be32 *data,
+		u64 *parity);
+const char *t4_get_port_type_description(enum fw_port_type port_type);
+void t4_get_port_stats(struct adapter *adap, int idx, struct port_stats *p);
+void t4_read_mtu_tbl(struct adapter *adap, u16 *mtus, u8 *mtu_log);
+void t4_tp_wr_bits_indirect(struct adapter *adap, unsigned int addr,
+			    unsigned int mask, unsigned int val);
+void t4_tp_get_tcp_stats(struct adapter *adap, struct tp_tcp_stats *v4,
+			 struct tp_tcp_stats *v6);
+void t4_load_mtus(struct adapter *adap, const unsigned short *mtus,
+		  const unsigned short *alpha, const unsigned short *beta);
+
+void t4_mk_filtdelwr(unsigned int ftid, struct fw_filter_wr *wr, int qid);
+
+void t4_wol_magic_enable(struct adapter *adap, unsigned int port,
+			 const u8 *addr);
+int t4_wol_pat_enable(struct adapter *adap, unsigned int port, unsigned int map,
+		      u64 mask0, u64 mask1, unsigned int crc, bool enable);
+
+int t4_fw_hello(struct adapter *adap, unsigned int mbox, unsigned int evt_mbox,
+		enum dev_master master, enum dev_state *state);
+int t4_fw_bye(struct adapter *adap, unsigned int mbox);
+int t4_early_init(struct adapter *adap, unsigned int mbox);
+int t4_fw_reset(struct adapter *adap, unsigned int mbox, int reset);
+int t4_fixup_host_params(struct adapter *adap, unsigned int page_size,
+			  unsigned int cache_line_size);
+int t4_fw_initialize(struct adapter *adap, unsigned int mbox);
+int t4_query_params(struct adapter *adap, unsigned int mbox, unsigned int pf,
+		    unsigned int vf, unsigned int nparams, const u32 *params,
+		    u32 *val);
+int t4_set_params(struct adapter *adap, unsigned int mbox, unsigned int pf,
+		  unsigned int vf, unsigned int nparams, const u32 *params,
+		  const u32 *val);
+int t4_set_params_nosleep(struct adapter *adap, unsigned int mbox,
+			  unsigned int pf, unsigned int vf,
+			  unsigned int nparams, const u32 *params,
+			  const u32 *val);
+int t4_cfg_pfvf(struct adapter *adap, unsigned int mbox, unsigned int pf,
+		unsigned int vf, unsigned int txq, unsigned int txq_eth_ctrl,
+		unsigned int rxqi, unsigned int rxq, unsigned int tc,
+		unsigned int vi, unsigned int cmask, unsigned int pmask,
+		unsigned int nexact, unsigned int rcaps, unsigned int wxcaps);
+int t4_alloc_vi(struct adapter *adap, unsigned int mbox, unsigned int port,
+		unsigned int pf, unsigned int vf, unsigned int nmac, u8 *mac,
+		unsigned int *rss_size);
+int t4_set_rxmode(struct adapter *adap, unsigned int mbox, unsigned int viid,
+		int mtu, int promisc, int all_multi, int bcast, int vlanex,
+		bool sleep_ok);
+int t4_alloc_mac_filt(struct adapter *adap, unsigned int mbox,
+		      unsigned int viid, bool free, unsigned int naddr,
+		      const u8 **addr, u16 *idx, u64 *hash, bool sleep_ok);
+int t4_change_mac(struct adapter *adap, unsigned int mbox, unsigned int viid,
+		  int idx, const u8 *addr, bool persist, bool add_smt);
+int t4_set_addr_hash(struct adapter *adap, unsigned int mbox, unsigned int viid,
+		     bool ucast, u64 vec, bool sleep_ok);
+int t4_enable_vi_params(struct adapter *adap, unsigned int mbox,
+			unsigned int viid, bool rx_en, bool tx_en, bool dcb_en);
+int t4_enable_vi(struct adapter *adap, unsigned int mbox, unsigned int viid,
+		 bool rx_en, bool tx_en);
+int t4_identify_port(struct adapter *adap, unsigned int mbox, unsigned int viid,
+		     unsigned int nblinks);
+int t4_mdio_rd(struct adapter *adap, unsigned int mbox, unsigned int phy_addr,
+	       unsigned int mmd, unsigned int reg, u16 *valp);
+int t4_mdio_wr(struct adapter *adap, unsigned int mbox, unsigned int phy_addr,
+	       unsigned int mmd, unsigned int reg, u16 val);
+int t4_iq_free(struct adapter *adap, unsigned int mbox, unsigned int pf,
+	       unsigned int vf, unsigned int iqtype, unsigned int iqid,
+	       unsigned int fl0id, unsigned int fl1id);
+int t4_eth_eq_free(struct adapter *adap, unsigned int mbox, unsigned int pf,
+		   unsigned int vf, unsigned int eqid);
+int t4_ctrl_eq_free(struct adapter *adap, unsigned int mbox, unsigned int pf,
+		    unsigned int vf, unsigned int eqid);
+int t4_ofld_eq_free(struct adapter *adap, unsigned int mbox, unsigned int pf,
+		    unsigned int vf, unsigned int eqid);
+int t4_handle_fw_rpl(struct adapter *adap, const __be64 *rpl);
+void t4_db_full(struct adapter *adapter);
+void t4_db_dropped(struct adapter *adapter);
+int t4_fwaddrspace_write(struct adapter *adap, unsigned int mbox,
+			 u32 addr, u32 val);
+void t4_sge_decode_idma_state(struct adapter *adapter, int state);
+#endif /* __CXGB4_H__ */
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_dcb.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_dcb.c
new file mode 100644
index 0000000..4fe3360
--- /dev/null
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_dcb.c
@@ -0,0 +1,1149 @@
+/*
+ *  Copyright (C) 2013-2014 Chelsio Communications.  All rights reserved.
+ *
+ *  Written by Anish Bhatt (anish@chelsio.com)
+ *	       Casey Leedom (leedom@chelsio.com)
+ *
+ *  This program is free software; you can redistribute it and/or modify it
+ *  under the terms and conditions of the GNU General Public License,
+ *  version 2, as published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ *  more details.
+ *
+ *  The full GNU General Public License is included in this distribution in
+ *  the file called "COPYING".
+ *
+ */
+
+#include "cxgb4.h"
+
+/* DCBx version control
+ */
+char *dcb_ver_array[] = {
+	"Unknown",
+	"DCBx-CIN",
+	"DCBx-CEE 1.01",
+	"DCBx-IEEE",
+	"", "", "",
+	"Auto Negotiated"
+};
+
+/* Initialize a port's Data Center Bridging state.  Typically used after a
+ * Link Down event.
+ */
+void cxgb4_dcb_state_init(struct net_device *dev)
+{
+	struct port_info *pi = netdev2pinfo(dev);
+	struct port_dcb_info *dcb = &pi->dcb;
+	int version_temp = dcb->dcb_version;
+
+	memset(dcb, 0, sizeof(struct port_dcb_info));
+	dcb->state = CXGB4_DCB_STATE_START;
+	if (version_temp)
+		dcb->dcb_version = version_temp;
+
+	netdev_dbg(dev, "%s: Initializing DCB state for port[%d]\n",
+		    __func__, pi->port_id);
+}
+
+void cxgb4_dcb_version_init(struct net_device *dev)
+{
+	struct port_info *pi = netdev2pinfo(dev);
+	struct port_dcb_info *dcb = &pi->dcb;
+
+	/* Any writes here are only done on kernels that exlicitly need
+	 * a specific version, say < 2.6.38 which only support CEE
+	 */
+	dcb->dcb_version = FW_PORT_DCB_VER_AUTO;
+}
+
+static void cxgb4_dcb_cleanup_apps(struct net_device *dev)
+{
+	struct port_info *pi = netdev2pinfo(dev);
+	struct adapter *adap = pi->adapter;
+	struct port_dcb_info *dcb = &pi->dcb;
+	struct dcb_app app;
+	int i, err;
+
+	/* zero priority implies remove */
+	app.priority = 0;
+
+	for (i = 0; i < CXGB4_MAX_DCBX_APP_SUPPORTED; i++) {
+		/* Check if app list is exhausted */
+		if (!dcb->app_priority[i].protocolid)
+			break;
+
+		app.protocol = dcb->app_priority[i].protocolid;
+
+		if (dcb->dcb_version == FW_PORT_DCB_VER_IEEE) {
+			app.priority = dcb->app_priority[i].user_prio_map;
+			app.selector = dcb->app_priority[i].sel_field + 1;
+			err = dcb_ieee_delapp(dev, &app);
+		} else {
+			app.selector = !!(dcb->app_priority[i].sel_field);
+			err = dcb_setapp(dev, &app);
+		}
+
+		if (err) {
+			dev_err(adap->pdev_dev,
+				"Failed DCB Clear %s Application Priority: sel=%d, prot=%d, , err=%d\n",
+				dcb_ver_array[dcb->dcb_version], app.selector,
+				app.protocol, -err);
+			break;
+		}
+	}
+}
+
+/* Finite State machine for Data Center Bridging.
+ */
+void cxgb4_dcb_state_fsm(struct net_device *dev,
+			 enum cxgb4_dcb_state_input transition_to)
+{
+	struct port_info *pi = netdev2pinfo(dev);
+	struct port_dcb_info *dcb = &pi->dcb;
+	struct adapter *adap = pi->adapter;
+	enum cxgb4_dcb_state current_state = dcb->state;
+
+	netdev_dbg(dev, "%s: State change from %d to %d for %s\n",
+		    __func__, dcb->state, transition_to, dev->name);
+
+	switch (current_state) {
+	case CXGB4_DCB_STATE_START: {
+		switch (transition_to) {
+		case CXGB4_DCB_INPUT_FW_DISABLED: {
+			/* we're going to use Host DCB */
+			dcb->state = CXGB4_DCB_STATE_HOST;
+			dcb->supported = CXGB4_DCBX_HOST_SUPPORT;
+			break;
+		}
+
+		case CXGB4_DCB_INPUT_FW_ENABLED: {
+			/* we're going to use Firmware DCB */
+			dcb->state = CXGB4_DCB_STATE_FW_INCOMPLETE;
+			dcb->supported = DCB_CAP_DCBX_LLD_MANAGED;
+			if (dcb->dcb_version == FW_PORT_DCB_VER_IEEE)
+				dcb->supported |= DCB_CAP_DCBX_VER_IEEE;
+			else
+				dcb->supported |= DCB_CAP_DCBX_VER_CEE;
+			break;
+		}
+
+		case CXGB4_DCB_INPUT_FW_INCOMPLETE: {
+			/* expected transition */
+			break;
+		}
+
+		case CXGB4_DCB_INPUT_FW_ALLSYNCED: {
+			dcb->state = CXGB4_DCB_STATE_FW_ALLSYNCED;
+			break;
+		}
+
+		default:
+			goto bad_state_input;
+		}
+		break;
+	}
+
+	case CXGB4_DCB_STATE_FW_INCOMPLETE: {
+		switch (transition_to) {
+		case CXGB4_DCB_INPUT_FW_ENABLED: {
+			/* we're alreaady in firmware DCB mode */
+			break;
+		}
+
+		case CXGB4_DCB_INPUT_FW_INCOMPLETE: {
+			/* we're already incomplete */
+			break;
+		}
+
+		case CXGB4_DCB_INPUT_FW_ALLSYNCED: {
+			dcb->state = CXGB4_DCB_STATE_FW_ALLSYNCED;
+			dcb->enabled = 1;
+			linkwatch_fire_event(dev);
+			break;
+		}
+
+		default:
+			goto bad_state_input;
+		}
+		break;
+	}
+
+	case CXGB4_DCB_STATE_FW_ALLSYNCED: {
+		switch (transition_to) {
+		case CXGB4_DCB_INPUT_FW_ENABLED: {
+			/* we're alreaady in firmware DCB mode */
+			break;
+		}
+
+		case CXGB4_DCB_INPUT_FW_INCOMPLETE: {
+			/* We were successfully running with firmware DCB but
+			 * now it's telling us that it's in an "incomplete
+			 * state.  We need to reset back to a ground state
+			 * of incomplete.
+			 */
+			cxgb4_dcb_cleanup_apps(dev);
+			cxgb4_dcb_state_init(dev);
+			dcb->state = CXGB4_DCB_STATE_FW_INCOMPLETE;
+			dcb->supported = CXGB4_DCBX_FW_SUPPORT;
+			linkwatch_fire_event(dev);
+			break;
+		}
+
+		case CXGB4_DCB_INPUT_FW_ALLSYNCED: {
+			/* we're already all sync'ed
+			 * this is only applicable for IEEE or
+			 * when another VI already completed negotiaton
+			 */
+			dcb->enabled = 1;
+			linkwatch_fire_event(dev);
+			break;
+		}
+
+		default:
+			goto bad_state_input;
+		}
+		break;
+	}
+
+	case CXGB4_DCB_STATE_HOST: {
+		switch (transition_to) {
+		case CXGB4_DCB_INPUT_FW_DISABLED: {
+			/* we're alreaady in Host DCB mode */
+			break;
+		}
+
+		default:
+			goto bad_state_input;
+		}
+		break;
+	}
+
+	default:
+		goto bad_state_transition;
+	}
+	return;
+
+bad_state_input:
+	dev_err(adap->pdev_dev, "cxgb4_dcb_state_fsm: illegal input symbol %d\n",
+		transition_to);
+	return;
+
+bad_state_transition:
+	dev_err(adap->pdev_dev, "cxgb4_dcb_state_fsm: bad state transition, state = %d, input = %d\n",
+		current_state, transition_to);
+}
+
+/* Handle a DCB/DCBX update message from the firmware.
+ */
+void cxgb4_dcb_handle_fw_update(struct adapter *adap,
+				const struct fw_port_cmd *pcmd)
+{
+	const union fw_port_dcb *fwdcb = &pcmd->u.dcb;
+	int port = FW_PORT_CMD_PORTID_GET(be32_to_cpu(pcmd->op_to_portid));
+	struct net_device *dev = adap->port[port];
+	struct port_info *pi = netdev_priv(dev);
+	struct port_dcb_info *dcb = &pi->dcb;
+	int dcb_type = pcmd->u.dcb.pgid.type;
+	int dcb_running_version;
+
+	/* Handle Firmware DCB Control messages separately since they drive
+	 * our state machine.
+	 */
+	if (dcb_type == FW_PORT_DCB_TYPE_CONTROL) {
+		enum cxgb4_dcb_state_input input =
+			((pcmd->u.dcb.control.all_syncd_pkd &
+			  FW_PORT_CMD_ALL_SYNCD)
+			 ? CXGB4_DCB_STATE_FW_ALLSYNCED
+			 : CXGB4_DCB_STATE_FW_INCOMPLETE);
+
+		if (dcb->dcb_version != FW_PORT_DCB_VER_UNKNOWN) {
+			dcb_running_version = FW_PORT_CMD_DCB_VERSION_GET(
+				be16_to_cpu(
+				pcmd->u.dcb.control.dcb_version_to_app_state));
+			if (dcb_running_version == FW_PORT_DCB_VER_CEE1D01 ||
+			    dcb_running_version == FW_PORT_DCB_VER_IEEE) {
+				dcb->dcb_version = dcb_running_version;
+				dev_warn(adap->pdev_dev, "Interface %s is running %s\n",
+					 dev->name,
+					 dcb_ver_array[dcb->dcb_version]);
+			} else {
+				dev_warn(adap->pdev_dev,
+					 "Something screwed up, requested firmware for %s, but firmware returned %s instead\n",
+					 dcb_ver_array[dcb->dcb_version],
+					 dcb_ver_array[dcb_running_version]);
+				dcb->dcb_version = FW_PORT_DCB_VER_UNKNOWN;
+			}
+		}
+
+		cxgb4_dcb_state_fsm(dev, input);
+		return;
+	}
+
+	/* It's weird, and almost certainly an error, to get Firmware DCB
+	 * messages when we either haven't been told whether we're going to be
+	 * doing Host or Firmware DCB; and even worse when we've been told
+	 * that we're doing Host DCB!
+	 */
+	if (dcb->state == CXGB4_DCB_STATE_START ||
+	    dcb->state == CXGB4_DCB_STATE_HOST) {
+		dev_err(adap->pdev_dev, "Receiving Firmware DCB messages in State %d\n",
+			dcb->state);
+		return;
+	}
+
+	/* Now handle the general Firmware DCB update messages ...
+	 */
+	switch (dcb_type) {
+	case FW_PORT_DCB_TYPE_PGID:
+		dcb->pgid = be32_to_cpu(fwdcb->pgid.pgid);
+		dcb->msgs |= CXGB4_DCB_FW_PGID;
+		break;
+
+	case FW_PORT_DCB_TYPE_PGRATE:
+		dcb->pg_num_tcs_supported = fwdcb->pgrate.num_tcs_supported;
+		memcpy(dcb->pgrate, &fwdcb->pgrate.pgrate,
+		       sizeof(dcb->pgrate));
+		memcpy(dcb->tsa, &fwdcb->pgrate.tsa,
+		       sizeof(dcb->tsa));
+		dcb->msgs |= CXGB4_DCB_FW_PGRATE;
+		if (dcb->msgs & CXGB4_DCB_FW_PGID)
+			IEEE_FAUX_SYNC(dev, dcb);
+		break;
+
+	case FW_PORT_DCB_TYPE_PRIORATE:
+		memcpy(dcb->priorate, &fwdcb->priorate.strict_priorate,
+		       sizeof(dcb->priorate));
+		dcb->msgs |= CXGB4_DCB_FW_PRIORATE;
+		break;
+
+	case FW_PORT_DCB_TYPE_PFC:
+		dcb->pfcen = fwdcb->pfc.pfcen;
+		dcb->pfc_num_tcs_supported = fwdcb->pfc.max_pfc_tcs;
+		dcb->msgs |= CXGB4_DCB_FW_PFC;
+		IEEE_FAUX_SYNC(dev, dcb);
+		break;
+
+	case FW_PORT_DCB_TYPE_APP_ID: {
+		const struct fw_port_app_priority *fwap = &fwdcb->app_priority;
+		int idx = fwap->idx;
+		struct app_priority *ap = &dcb->app_priority[idx];
+
+		struct dcb_app app = {
+			.protocol = be16_to_cpu(fwap->protocolid),
+		};
+		int err;
+
+		/* Convert from firmware format to relevant format
+		 * when using app selector
+		 */
+		if (dcb->dcb_version == FW_PORT_DCB_VER_IEEE) {
+			app.selector = (fwap->sel_field + 1);
+			app.priority = ffs(fwap->user_prio_map) - 1;
+			err = dcb_ieee_setapp(dev, &app);
+			IEEE_FAUX_SYNC(dev, dcb);
+		} else {
+			/* Default is CEE */
+			app.selector = !!(fwap->sel_field);
+			app.priority = fwap->user_prio_map;
+			err = dcb_setapp(dev, &app);
+		}
+
+		if (err)
+			dev_err(adap->pdev_dev,
+				"Failed DCB Set Application Priority: sel=%d, prot=%d, prio=%d, err=%d\n",
+				app.selector, app.protocol, app.priority, -err);
+
+		ap->user_prio_map = fwap->user_prio_map;
+		ap->sel_field = fwap->sel_field;
+		ap->protocolid = be16_to_cpu(fwap->protocolid);
+		dcb->msgs |= CXGB4_DCB_FW_APP_ID;
+		break;
+	}
+
+	default:
+		dev_err(adap->pdev_dev, "Unknown DCB update type received %x\n",
+			dcb_type);
+		break;
+	}
+}
+
+/* Data Center Bridging netlink operations.
+ */
+
+
+/* Get current DCB enabled/disabled state.
+ */
+static u8 cxgb4_getstate(struct net_device *dev)
+{
+	struct port_info *pi = netdev2pinfo(dev);
+
+	return pi->dcb.enabled;
+}
+
+/* Set DCB enabled/disabled.
+ */
+static u8 cxgb4_setstate(struct net_device *dev, u8 enabled)
+{
+	struct port_info *pi = netdev2pinfo(dev);
+
+	/* If DCBx is host-managed, dcb is enabled by outside lldp agents */
+	if (pi->dcb.state == CXGB4_DCB_STATE_HOST) {
+		pi->dcb.enabled = enabled;
+		return 0;
+	}
+
+	/* Firmware doesn't provide any mechanism to control the DCB state.
+	 */
+	if (enabled != (pi->dcb.state == CXGB4_DCB_STATE_FW_ALLSYNCED))
+		return 1;
+
+	return 0;
+}
+
+static void cxgb4_getpgtccfg(struct net_device *dev, int tc,
+			     u8 *prio_type, u8 *pgid, u8 *bw_per,
+			     u8 *up_tc_map, int local)
+{
+	struct fw_port_cmd pcmd;
+	struct port_info *pi = netdev2pinfo(dev);
+	struct adapter *adap = pi->adapter;
+	int err;
+
+	*prio_type = *pgid = *bw_per = *up_tc_map = 0;
+
+	if (local)
+		INIT_PORT_DCB_READ_LOCAL_CMD(pcmd, pi->port_id);
+	else
+		INIT_PORT_DCB_READ_PEER_CMD(pcmd, pi->port_id);
+
+	pcmd.u.dcb.pgid.type = FW_PORT_DCB_TYPE_PGID;
+	err = t4_wr_mbox(adap, adap->mbox, &pcmd, sizeof(pcmd), &pcmd);
+	if (err != FW_PORT_DCB_CFG_SUCCESS) {
+		dev_err(adap->pdev_dev, "DCB read PGID failed with %d\n", -err);
+		return;
+	}
+	*pgid = (be32_to_cpu(pcmd.u.dcb.pgid.pgid) >> (tc * 4)) & 0xf;
+
+	INIT_PORT_DCB_READ_PEER_CMD(pcmd, pi->port_id);
+	pcmd.u.dcb.pgrate.type = FW_PORT_DCB_TYPE_PGRATE;
+	err = t4_wr_mbox(adap, adap->mbox, &pcmd, sizeof(pcmd), &pcmd);
+	if (err != FW_PORT_DCB_CFG_SUCCESS) {
+		dev_err(adap->pdev_dev, "DCB read PGRATE failed with %d\n",
+			-err);
+		return;
+	}
+
+	*bw_per = pcmd.u.dcb.pgrate.pgrate[*pgid];
+	*up_tc_map = (1 << tc);
+
+	/* prio_type is link strict */
+	if (*pgid != 0xF)
+		*prio_type = 0x2;
+}
+
+static void cxgb4_getpgtccfg_tx(struct net_device *dev, int tc,
+				u8 *prio_type, u8 *pgid, u8 *bw_per,
+				u8 *up_tc_map)
+{
+	/* tc 0 is written at MSB position */
+	return cxgb4_getpgtccfg(dev, (7 - tc), prio_type, pgid, bw_per,
+				up_tc_map, 1);
+}
+
+
+static void cxgb4_getpgtccfg_rx(struct net_device *dev, int tc,
+				u8 *prio_type, u8 *pgid, u8 *bw_per,
+				u8 *up_tc_map)
+{
+	/* tc 0 is written at MSB position */
+	return cxgb4_getpgtccfg(dev, (7 - tc), prio_type, pgid, bw_per,
+				up_tc_map, 0);
+}
+
+static void cxgb4_setpgtccfg_tx(struct net_device *dev, int tc,
+				u8 prio_type, u8 pgid, u8 bw_per,
+				u8 up_tc_map)
+{
+	struct fw_port_cmd pcmd;
+	struct port_info *pi = netdev2pinfo(dev);
+	struct adapter *adap = pi->adapter;
+	int fw_tc = 7 - tc;
+	u32 _pgid;
+	int err;
+
+	if (pgid == DCB_ATTR_VALUE_UNDEFINED)
+		return;
+	if (bw_per == DCB_ATTR_VALUE_UNDEFINED)
+		return;
+
+	INIT_PORT_DCB_READ_LOCAL_CMD(pcmd, pi->port_id);
+	pcmd.u.dcb.pgid.type = FW_PORT_DCB_TYPE_PGID;
+
+	err = t4_wr_mbox(adap, adap->mbox, &pcmd, sizeof(pcmd), &pcmd);
+	if (err != FW_PORT_DCB_CFG_SUCCESS) {
+		dev_err(adap->pdev_dev, "DCB read PGID failed with %d\n", -err);
+		return;
+	}
+
+	_pgid = be32_to_cpu(pcmd.u.dcb.pgid.pgid);
+	_pgid &= ~(0xF << (fw_tc * 4));
+	_pgid |= pgid << (fw_tc * 4);
+	pcmd.u.dcb.pgid.pgid = cpu_to_be32(_pgid);
+
+	INIT_PORT_DCB_WRITE_CMD(pcmd, pi->port_id);
+
+	err = t4_wr_mbox(adap, adap->mbox, &pcmd, sizeof(pcmd), &pcmd);
+	if (err != FW_PORT_DCB_CFG_SUCCESS) {
+		dev_err(adap->pdev_dev, "DCB write PGID failed with %d\n",
+			-err);
+		return;
+	}
+
+	memset(&pcmd, 0, sizeof(struct fw_port_cmd));
+
+	INIT_PORT_DCB_READ_LOCAL_CMD(pcmd, pi->port_id);
+	pcmd.u.dcb.pgrate.type = FW_PORT_DCB_TYPE_PGRATE;
+
+	err = t4_wr_mbox(adap, adap->mbox, &pcmd, sizeof(pcmd), &pcmd);
+	if (err != FW_PORT_DCB_CFG_SUCCESS) {
+		dev_err(adap->pdev_dev, "DCB read PGRATE failed with %d\n",
+			-err);
+		return;
+	}
+
+	pcmd.u.dcb.pgrate.pgrate[pgid] = bw_per;
+
+	INIT_PORT_DCB_WRITE_CMD(pcmd, pi->port_id);
+	if (pi->dcb.state == CXGB4_DCB_STATE_HOST)
+		pcmd.op_to_portid |= cpu_to_be32(FW_PORT_CMD_APPLY);
+
+	err = t4_wr_mbox(adap, adap->mbox, &pcmd, sizeof(pcmd), &pcmd);
+	if (err != FW_PORT_DCB_CFG_SUCCESS)
+		dev_err(adap->pdev_dev, "DCB write PGRATE failed with %d\n",
+			-err);
+}
+
+static void cxgb4_getpgbwgcfg(struct net_device *dev, int pgid, u8 *bw_per,
+			      int local)
+{
+	struct fw_port_cmd pcmd;
+	struct port_info *pi = netdev2pinfo(dev);
+	struct adapter *adap = pi->adapter;
+	int err;
+
+	if (local)
+		INIT_PORT_DCB_READ_LOCAL_CMD(pcmd, pi->port_id);
+	else
+		INIT_PORT_DCB_READ_PEER_CMD(pcmd, pi->port_id);
+
+	pcmd.u.dcb.pgrate.type = FW_PORT_DCB_TYPE_PGRATE;
+	err = t4_wr_mbox(adap, adap->mbox, &pcmd, sizeof(pcmd), &pcmd);
+	if (err != FW_PORT_DCB_CFG_SUCCESS) {
+		dev_err(adap->pdev_dev, "DCB read PGRATE failed with %d\n",
+			-err);
+		return;
+	}
+
+	*bw_per = pcmd.u.dcb.pgrate.pgrate[pgid];
+}
+
+static void cxgb4_getpgbwgcfg_tx(struct net_device *dev, int pgid, u8 *bw_per)
+{
+	return cxgb4_getpgbwgcfg(dev, pgid, bw_per, 1);
+}
+
+static void cxgb4_getpgbwgcfg_rx(struct net_device *dev, int pgid, u8 *bw_per)
+{
+	return cxgb4_getpgbwgcfg(dev, pgid, bw_per, 0);
+}
+
+static void cxgb4_setpgbwgcfg_tx(struct net_device *dev, int pgid,
+				 u8 bw_per)
+{
+	struct fw_port_cmd pcmd;
+	struct port_info *pi = netdev2pinfo(dev);
+	struct adapter *adap = pi->adapter;
+	int err;
+
+	INIT_PORT_DCB_READ_LOCAL_CMD(pcmd, pi->port_id);
+	pcmd.u.dcb.pgrate.type = FW_PORT_DCB_TYPE_PGRATE;
+
+	err = t4_wr_mbox(adap, adap->mbox, &pcmd, sizeof(pcmd), &pcmd);
+	if (err != FW_PORT_DCB_CFG_SUCCESS) {
+		dev_err(adap->pdev_dev, "DCB read PGRATE failed with %d\n",
+			-err);
+		return;
+	}
+
+	pcmd.u.dcb.pgrate.pgrate[pgid] = bw_per;
+
+	INIT_PORT_DCB_WRITE_CMD(pcmd, pi->port_id);
+	if (pi->dcb.state == CXGB4_DCB_STATE_HOST)
+		pcmd.op_to_portid |= cpu_to_be32(FW_PORT_CMD_APPLY);
+
+	err = t4_wr_mbox(adap, adap->mbox, &pcmd, sizeof(pcmd), &pcmd);
+
+	if (err != FW_PORT_DCB_CFG_SUCCESS)
+		dev_err(adap->pdev_dev, "DCB write PGRATE failed with %d\n",
+			-err);
+}
+
+/* Return whether the specified Traffic Class Priority has Priority Pause
+ * Frames enabled.
+ */
+static void cxgb4_getpfccfg(struct net_device *dev, int priority, u8 *pfccfg)
+{
+	struct port_info *pi = netdev2pinfo(dev);
+	struct port_dcb_info *dcb = &pi->dcb;
+
+	if (dcb->state != CXGB4_DCB_STATE_FW_ALLSYNCED ||
+	    priority >= CXGB4_MAX_PRIORITY)
+		*pfccfg = 0;
+	else
+		*pfccfg = (pi->dcb.pfcen >> (7 - priority)) & 1;
+}
+
+/* Enable/disable Priority Pause Frames for the specified Traffic Class
+ * Priority.
+ */
+static void cxgb4_setpfccfg(struct net_device *dev, int priority, u8 pfccfg)
+{
+	struct fw_port_cmd pcmd;
+	struct port_info *pi = netdev2pinfo(dev);
+	struct adapter *adap = pi->adapter;
+	int err;
+
+	if (pi->dcb.state != CXGB4_DCB_STATE_FW_ALLSYNCED ||
+	    priority >= CXGB4_MAX_PRIORITY)
+		return;
+
+	INIT_PORT_DCB_WRITE_CMD(pcmd, pi->port_id);
+	if (pi->dcb.state == CXGB4_DCB_STATE_HOST)
+		pcmd.op_to_portid |= cpu_to_be32(FW_PORT_CMD_APPLY);
+
+	pcmd.u.dcb.pfc.type = FW_PORT_DCB_TYPE_PFC;
+	pcmd.u.dcb.pfc.pfcen = pi->dcb.pfcen;
+
+	if (pfccfg)
+		pcmd.u.dcb.pfc.pfcen |= (1 << (7 - priority));
+	else
+		pcmd.u.dcb.pfc.pfcen &= (~(1 << (7 - priority)));
+
+	err = t4_wr_mbox(adap, adap->mbox, &pcmd, sizeof(pcmd), &pcmd);
+	if (err != FW_PORT_DCB_CFG_SUCCESS) {
+		dev_err(adap->pdev_dev, "DCB PFC write failed with %d\n", -err);
+		return;
+	}
+
+	pi->dcb.pfcen = pcmd.u.dcb.pfc.pfcen;
+}
+
+static u8 cxgb4_setall(struct net_device *dev)
+{
+	return 0;
+}
+
+/* Return DCB capabilities.
+ */
+static u8 cxgb4_getcap(struct net_device *dev, int cap_id, u8 *caps)
+{
+	struct port_info *pi = netdev2pinfo(dev);
+
+	switch (cap_id) {
+	case DCB_CAP_ATTR_PG:
+	case DCB_CAP_ATTR_PFC:
+		*caps = true;
+		break;
+
+	case DCB_CAP_ATTR_PG_TCS:
+		/* 8 priorities for PG represented by bitmap */
+		*caps = 0x80;
+		break;
+
+	case DCB_CAP_ATTR_PFC_TCS:
+		/* 8 priorities for PFC represented by bitmap */
+		*caps = 0x80;
+		break;
+
+	case DCB_CAP_ATTR_GSP:
+		*caps = true;
+		break;
+
+	case DCB_CAP_ATTR_UP2TC:
+	case DCB_CAP_ATTR_BCN:
+		*caps = false;
+		break;
+
+	case DCB_CAP_ATTR_DCBX:
+		*caps = pi->dcb.supported;
+		break;
+
+	default:
+		*caps = false;
+	}
+
+	return 0;
+}
+
+/* Return the number of Traffic Classes for the indicated Traffic Class ID.
+ */
+static int cxgb4_getnumtcs(struct net_device *dev, int tcs_id, u8 *num)
+{
+	struct port_info *pi = netdev2pinfo(dev);
+
+	switch (tcs_id) {
+	case DCB_NUMTCS_ATTR_PG:
+		if (pi->dcb.msgs & CXGB4_DCB_FW_PGRATE)
+			*num = pi->dcb.pg_num_tcs_supported;
+		else
+			*num = 0x8;
+		break;
+
+	case DCB_NUMTCS_ATTR_PFC:
+		*num = 0x8;
+		break;
+
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/* Set the number of Traffic Classes supported for the indicated Traffic Class
+ * ID.
+ */
+static int cxgb4_setnumtcs(struct net_device *dev, int tcs_id, u8 num)
+{
+	/* Setting the number of Traffic Classes isn't supported.
+	 */
+	return -ENOSYS;
+}
+
+/* Return whether Priority Flow Control is enabled.  */
+static u8 cxgb4_getpfcstate(struct net_device *dev)
+{
+	struct port_info *pi = netdev2pinfo(dev);
+
+	if (pi->dcb.state != CXGB4_DCB_STATE_FW_ALLSYNCED)
+		return false;
+
+	return pi->dcb.pfcen != 0;
+}
+
+/* Enable/disable Priority Flow Control. */
+static void cxgb4_setpfcstate(struct net_device *dev, u8 state)
+{
+	/* We can't enable/disable Priority Flow Control but we also can't
+	 * return an error ...
+	 */
+}
+
+/* Return the Application User Priority Map associated with the specified
+ * Application ID.
+ */
+static int __cxgb4_getapp(struct net_device *dev, u8 app_idtype, u16 app_id,
+			  int peer)
+{
+	struct port_info *pi = netdev2pinfo(dev);
+	struct adapter *adap = pi->adapter;
+	int i;
+
+	if (pi->dcb.state != CXGB4_DCB_STATE_FW_ALLSYNCED)
+		return 0;
+
+	for (i = 0; i < CXGB4_MAX_DCBX_APP_SUPPORTED; i++) {
+		struct fw_port_cmd pcmd;
+		int err;
+
+		if (peer)
+			INIT_PORT_DCB_READ_PEER_CMD(pcmd, pi->port_id);
+		else
+			INIT_PORT_DCB_READ_LOCAL_CMD(pcmd, pi->port_id);
+
+		pcmd.u.dcb.app_priority.type = FW_PORT_DCB_TYPE_APP_ID;
+		pcmd.u.dcb.app_priority.idx = i;
+
+		err = t4_wr_mbox(adap, adap->mbox, &pcmd, sizeof(pcmd), &pcmd);
+		if (err != FW_PORT_DCB_CFG_SUCCESS) {
+			dev_err(adap->pdev_dev, "DCB APP read failed with %d\n",
+				-err);
+			return err;
+		}
+		if (be16_to_cpu(pcmd.u.dcb.app_priority.protocolid) == app_id)
+			if (pcmd.u.dcb.app_priority.sel_field == app_idtype)
+				return pcmd.u.dcb.app_priority.user_prio_map;
+
+		/* exhausted app list */
+		if (!pcmd.u.dcb.app_priority.protocolid)
+			break;
+	}
+
+	return -EEXIST;
+}
+
+/* Return the Application User Priority Map associated with the specified
+ * Application ID.
+ */
+static int cxgb4_getapp(struct net_device *dev, u8 app_idtype, u16 app_id)
+{
+	return __cxgb4_getapp(dev, app_idtype, app_id, 0);
+}
+
+/* Write a new Application User Priority Map for the specified Application ID
+ */
+static int __cxgb4_setapp(struct net_device *dev, u8 app_idtype, u16 app_id,
+			  u8 app_prio)
+{
+	struct fw_port_cmd pcmd;
+	struct port_info *pi = netdev2pinfo(dev);
+	struct adapter *adap = pi->adapter;
+	int i, err;
+
+
+	if (pi->dcb.state != CXGB4_DCB_STATE_FW_ALLSYNCED)
+		return -EINVAL;
+
+	/* DCB info gets thrown away on link up */
+	if (!netif_carrier_ok(dev))
+		return -ENOLINK;
+
+	for (i = 0; i < CXGB4_MAX_DCBX_APP_SUPPORTED; i++) {
+		INIT_PORT_DCB_READ_LOCAL_CMD(pcmd, pi->port_id);
+		pcmd.u.dcb.app_priority.type = FW_PORT_DCB_TYPE_APP_ID;
+		pcmd.u.dcb.app_priority.idx = i;
+		err = t4_wr_mbox(adap, adap->mbox, &pcmd, sizeof(pcmd), &pcmd);
+
+		if (err != FW_PORT_DCB_CFG_SUCCESS) {
+			dev_err(adap->pdev_dev, "DCB app table read failed with %d\n",
+				-err);
+			return err;
+		}
+		if (be16_to_cpu(pcmd.u.dcb.app_priority.protocolid) == app_id) {
+			/* overwrite existing app table */
+			pcmd.u.dcb.app_priority.protocolid = 0;
+			break;
+		}
+		/* find first empty slot */
+		if (!pcmd.u.dcb.app_priority.protocolid)
+			break;
+	}
+
+	if (i == CXGB4_MAX_DCBX_APP_SUPPORTED) {
+		/* no empty slots available */
+		dev_err(adap->pdev_dev, "DCB app table full\n");
+		return -EBUSY;
+	}
+
+	/* write out new app table entry */
+	INIT_PORT_DCB_WRITE_CMD(pcmd, pi->port_id);
+	if (pi->dcb.state == CXGB4_DCB_STATE_HOST)
+		pcmd.op_to_portid |= cpu_to_be32(FW_PORT_CMD_APPLY);
+
+	pcmd.u.dcb.app_priority.type = FW_PORT_DCB_TYPE_APP_ID;
+	pcmd.u.dcb.app_priority.protocolid = cpu_to_be16(app_id);
+	pcmd.u.dcb.app_priority.sel_field = app_idtype;
+	pcmd.u.dcb.app_priority.user_prio_map = app_prio;
+	pcmd.u.dcb.app_priority.idx = i;
+
+	err = t4_wr_mbox(adap, adap->mbox, &pcmd, sizeof(pcmd), &pcmd);
+	if (err != FW_PORT_DCB_CFG_SUCCESS) {
+		dev_err(adap->pdev_dev, "DCB app table write failed with %d\n",
+			-err);
+		return err;
+	}
+
+	return 0;
+}
+
+/* Priority for CEE inside dcb_app is bitmask, with 0 being an invalid value */
+static int cxgb4_setapp(struct net_device *dev, u8 app_idtype, u16 app_id,
+			u8 app_prio)
+{
+	int ret;
+	struct dcb_app app = {
+		.selector = app_idtype,
+		.protocol = app_id,
+		.priority = app_prio,
+	};
+
+	if (app_idtype != DCB_APP_IDTYPE_ETHTYPE &&
+	    app_idtype != DCB_APP_IDTYPE_PORTNUM)
+		return -EINVAL;
+
+	/* Convert app_idtype to a format that firmware understands */
+	ret = __cxgb4_setapp(dev, app_idtype == DCB_APP_IDTYPE_ETHTYPE ?
+			      app_idtype : 3, app_id, app_prio);
+	if (ret)
+		return ret;
+
+	return dcb_setapp(dev, &app);
+}
+
+/* Return whether IEEE Data Center Bridging has been negotiated.
+ */
+static inline int
+cxgb4_ieee_negotiation_complete(struct net_device *dev,
+				enum cxgb4_dcb_fw_msgs dcb_subtype)
+{
+	struct port_info *pi = netdev2pinfo(dev);
+	struct port_dcb_info *dcb = &pi->dcb;
+
+	if (dcb_subtype && !(dcb->msgs & dcb_subtype))
+		return 0;
+
+	return (dcb->state == CXGB4_DCB_STATE_FW_ALLSYNCED &&
+		(dcb->supported & DCB_CAP_DCBX_VER_IEEE));
+}
+
+/* Fill in the Application User Priority Map associated with the
+ * specified Application.
+ * Priority for IEEE dcb_app is an integer, with 0 being a valid value
+ */
+static int cxgb4_ieee_getapp(struct net_device *dev, struct dcb_app *app)
+{
+	int prio;
+
+	if (!cxgb4_ieee_negotiation_complete(dev, CXGB4_DCB_FW_APP_ID))
+		return -EINVAL;
+	if (!(app->selector && app->protocol))
+		return -EINVAL;
+
+	/* Try querying firmware first, use firmware format */
+	prio = __cxgb4_getapp(dev, app->selector - 1, app->protocol, 0);
+
+	if (prio < 0)
+		prio = dcb_ieee_getapp_mask(dev, app);
+
+	app->priority = ffs(prio) - 1;
+	return 0;
+}
+
+/* Write a new Application User Priority Map for the specified Application ID.
+ * Priority for IEEE dcb_app is an integer, with 0 being a valid value
+ */
+static int cxgb4_ieee_setapp(struct net_device *dev, struct dcb_app *app)
+{
+	int ret;
+
+	if (!cxgb4_ieee_negotiation_complete(dev, CXGB4_DCB_FW_APP_ID))
+		return -EINVAL;
+	if (!(app->selector && app->protocol))
+		return -EINVAL;
+
+	if (!(app->selector > IEEE_8021QAZ_APP_SEL_ETHERTYPE  &&
+	      app->selector < IEEE_8021QAZ_APP_SEL_ANY))
+		return -EINVAL;
+
+	/* change selector to a format that firmware understands */
+	ret = __cxgb4_setapp(dev, app->selector - 1, app->protocol,
+			     (1 << app->priority));
+	if (ret)
+		return ret;
+
+	return dcb_ieee_setapp(dev, app);
+}
+
+/* Return our DCBX parameters.
+ */
+static u8 cxgb4_getdcbx(struct net_device *dev)
+{
+	struct port_info *pi = netdev2pinfo(dev);
+
+	/* This is already set by cxgb4_set_dcb_caps, so just return it */
+	return pi->dcb.supported;
+}
+
+/* Set our DCBX parameters.
+ */
+static u8 cxgb4_setdcbx(struct net_device *dev, u8 dcb_request)
+{
+	struct port_info *pi = netdev2pinfo(dev);
+
+	/* Filter out requests which exceed our capabilities.
+	 */
+	if ((dcb_request & (CXGB4_DCBX_FW_SUPPORT | CXGB4_DCBX_HOST_SUPPORT))
+	    != dcb_request)
+		return 1;
+
+	/* Can't enable DCB if we haven't successfully negotiated it.
+	 */
+	if (pi->dcb.state != CXGB4_DCB_STATE_FW_ALLSYNCED)
+		return 1;
+
+	/* There's currently no mechanism to allow for the firmware DCBX
+	 * negotiation to be changed from the Host Driver.  If the caller
+	 * requests exactly the same parameters that we already have then
+	 * we'll allow them to be successfully "set" ...
+	 */
+	if (dcb_request != pi->dcb.supported)
+		return 1;
+
+	pi->dcb.supported = dcb_request;
+	return 0;
+}
+
+static int cxgb4_getpeer_app(struct net_device *dev,
+			     struct dcb_peer_app_info *info, u16 *app_count)
+{
+	struct fw_port_cmd pcmd;
+	struct port_info *pi = netdev2pinfo(dev);
+	struct adapter *adap = pi->adapter;
+	int i, err = 0;
+
+	if (pi->dcb.state != CXGB4_DCB_STATE_FW_ALLSYNCED)
+		return 1;
+
+	info->willing = 0;
+	info->error = 0;
+
+	*app_count = 0;
+	for (i = 0; i < CXGB4_MAX_DCBX_APP_SUPPORTED; i++) {
+		INIT_PORT_DCB_READ_PEER_CMD(pcmd, pi->port_id);
+		pcmd.u.dcb.app_priority.type = FW_PORT_DCB_TYPE_APP_ID;
+		pcmd.u.dcb.app_priority.idx = *app_count;
+		err = t4_wr_mbox(adap, adap->mbox, &pcmd, sizeof(pcmd), &pcmd);
+
+		if (err != FW_PORT_DCB_CFG_SUCCESS) {
+			dev_err(adap->pdev_dev, "DCB app table read failed with %d\n",
+				-err);
+			return err;
+		}
+
+		/* find first empty slot */
+		if (!pcmd.u.dcb.app_priority.protocolid)
+			break;
+	}
+	*app_count = i;
+	return err;
+}
+
+static int cxgb4_getpeerapp_tbl(struct net_device *dev, struct dcb_app *table)
+{
+	struct fw_port_cmd pcmd;
+	struct port_info *pi = netdev2pinfo(dev);
+	struct adapter *adap = pi->adapter;
+	int i, err = 0;
+
+	if (pi->dcb.state != CXGB4_DCB_STATE_FW_ALLSYNCED)
+		return 1;
+
+	for (i = 0; i < CXGB4_MAX_DCBX_APP_SUPPORTED; i++) {
+		INIT_PORT_DCB_READ_PEER_CMD(pcmd, pi->port_id);
+		pcmd.u.dcb.app_priority.type = FW_PORT_DCB_TYPE_APP_ID;
+		pcmd.u.dcb.app_priority.idx = i;
+		err = t4_wr_mbox(adap, adap->mbox, &pcmd, sizeof(pcmd), &pcmd);
+
+		if (err != FW_PORT_DCB_CFG_SUCCESS) {
+			dev_err(adap->pdev_dev, "DCB app table read failed with %d\n",
+				-err);
+			return err;
+		}
+
+		/* find first empty slot */
+		if (!pcmd.u.dcb.app_priority.protocolid)
+			break;
+
+		table[i].selector = pcmd.u.dcb.app_priority.sel_field;
+		table[i].protocol =
+			be16_to_cpu(pcmd.u.dcb.app_priority.protocolid);
+		table[i].priority =
+			ffs(pcmd.u.dcb.app_priority.user_prio_map) - 1;
+	}
+	return err;
+}
+
+/* Return Priority Group information.
+ */
+static int cxgb4_cee_peer_getpg(struct net_device *dev, struct cee_pg *pg)
+{
+	struct fw_port_cmd pcmd;
+	struct port_info *pi = netdev2pinfo(dev);
+	struct adapter *adap = pi->adapter;
+	u32 pgid;
+	int i, err;
+
+	/* We're always "willing" -- the Switch Fabric always dictates the
+	 * DCBX parameters to us.
+	 */
+	pg->willing = true;
+
+	INIT_PORT_DCB_READ_PEER_CMD(pcmd, pi->port_id);
+	pcmd.u.dcb.pgid.type = FW_PORT_DCB_TYPE_PGID;
+	err = t4_wr_mbox(adap, adap->mbox, &pcmd, sizeof(pcmd), &pcmd);
+	if (err != FW_PORT_DCB_CFG_SUCCESS) {
+		dev_err(adap->pdev_dev, "DCB read PGID failed with %d\n", -err);
+		return err;
+	}
+	pgid = be32_to_cpu(pcmd.u.dcb.pgid.pgid);
+
+	for (i = 0; i < CXGB4_MAX_PRIORITY; i++)
+		pg->prio_pg[7 - i] = (pgid >> (i * 4)) & 0xF;
+
+	INIT_PORT_DCB_READ_PEER_CMD(pcmd, pi->port_id);
+	pcmd.u.dcb.pgrate.type = FW_PORT_DCB_TYPE_PGRATE;
+	err = t4_wr_mbox(adap, adap->mbox, &pcmd, sizeof(pcmd), &pcmd);
+	if (err != FW_PORT_DCB_CFG_SUCCESS) {
+		dev_err(adap->pdev_dev, "DCB read PGRATE failed with %d\n",
+			-err);
+		return err;
+	}
+
+	for (i = 0; i < CXGB4_MAX_PRIORITY; i++)
+		pg->pg_bw[i] = pcmd.u.dcb.pgrate.pgrate[i];
+
+	return 0;
+}
+
+/* Return Priority Flow Control information.
+ */
+static int cxgb4_cee_peer_getpfc(struct net_device *dev, struct cee_pfc *pfc)
+{
+	struct port_info *pi = netdev2pinfo(dev);
+
+	cxgb4_getnumtcs(dev, DCB_NUMTCS_ATTR_PFC, &(pfc->tcs_supported));
+	pfc->pfc_en = pi->dcb.pfcen;
+
+	return 0;
+}
+
+const struct dcbnl_rtnl_ops cxgb4_dcb_ops = {
+	.ieee_getapp		= cxgb4_ieee_getapp,
+	.ieee_setapp		= cxgb4_ieee_setapp,
+
+	/* CEE std */
+	.getstate		= cxgb4_getstate,
+	.setstate		= cxgb4_setstate,
+	.getpgtccfgtx		= cxgb4_getpgtccfg_tx,
+	.getpgbwgcfgtx		= cxgb4_getpgbwgcfg_tx,
+	.getpgtccfgrx		= cxgb4_getpgtccfg_rx,
+	.getpgbwgcfgrx		= cxgb4_getpgbwgcfg_rx,
+	.setpgtccfgtx		= cxgb4_setpgtccfg_tx,
+	.setpgbwgcfgtx		= cxgb4_setpgbwgcfg_tx,
+	.setpfccfg		= cxgb4_setpfccfg,
+	.getpfccfg		= cxgb4_getpfccfg,
+	.setall			= cxgb4_setall,
+	.getcap			= cxgb4_getcap,
+	.getnumtcs		= cxgb4_getnumtcs,
+	.setnumtcs		= cxgb4_setnumtcs,
+	.getpfcstate		= cxgb4_getpfcstate,
+	.setpfcstate		= cxgb4_setpfcstate,
+	.getapp			= cxgb4_getapp,
+	.setapp			= cxgb4_setapp,
+
+	/* DCBX configuration */
+	.getdcbx		= cxgb4_getdcbx,
+	.setdcbx		= cxgb4_setdcbx,
+
+	/* peer apps */
+	.peer_getappinfo	= cxgb4_getpeer_app,
+	.peer_getapptable	= cxgb4_getpeerapp_tbl,
+
+	/* CEE peer */
+	.cee_peer_getpg		= cxgb4_cee_peer_getpg,
+	.cee_peer_getpfc	= cxgb4_cee_peer_getpfc,
+};
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_dcb.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_dcb.h
new file mode 100644
index 0000000..2a6aa88
--- /dev/null
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_dcb.h
@@ -0,0 +1,151 @@
+/*
+ *  Copyright (C) 2013-2014 Chelsio Communications.  All rights reserved.
+ *
+ *  Written by Anish Bhatt (anish@chelsio.com)
+ *
+ *  This program is free software; you can redistribute it and/or modify it
+ *  under the terms and conditions of the GNU General Public License,
+ *  version 2, as published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope it will be useful, but WITHOUT
+ *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ *  more details.
+ *
+ *  The full GNU General Public License is included in this distribution in
+ *  the file called "COPYING".
+ *
+ */
+
+#ifndef __CXGB4_DCB_H
+#define __CXGB4_DCB_H
+
+#include <linux/netdevice.h>
+#include <linux/dcbnl.h>
+#include <net/dcbnl.h>
+
+#ifdef CONFIG_CHELSIO_T4_DCB
+
+#define CXGB4_DCBX_FW_SUPPORT \
+	(DCB_CAP_DCBX_VER_CEE | \
+	 DCB_CAP_DCBX_VER_IEEE | \
+	 DCB_CAP_DCBX_LLD_MANAGED)
+#define CXGB4_DCBX_HOST_SUPPORT \
+	(DCB_CAP_DCBX_VER_CEE | \
+	 DCB_CAP_DCBX_VER_IEEE | \
+	 DCB_CAP_DCBX_HOST)
+
+#define CXGB4_MAX_PRIORITY      CXGB4_MAX_DCBX_APP_SUPPORTED
+#define CXGB4_MAX_TCS           CXGB4_MAX_DCBX_APP_SUPPORTED
+
+#define INIT_PORT_DCB_CMD(__pcmd, __port, __op, __action) \
+	do { \
+		memset(&(__pcmd), 0, sizeof(__pcmd)); \
+		(__pcmd).op_to_portid = \
+			cpu_to_be32(FW_CMD_OP(FW_PORT_CMD) | \
+				    FW_CMD_REQUEST | \
+				    FW_CMD_##__op | \
+				    FW_PORT_CMD_PORTID(__port)); \
+		(__pcmd).action_to_len16 = \
+			cpu_to_be32(FW_PORT_CMD_ACTION(__action) | \
+				    FW_LEN16(pcmd)); \
+	} while (0)
+
+#define INIT_PORT_DCB_READ_PEER_CMD(__pcmd, __port) \
+	INIT_PORT_DCB_CMD(__pcmd, __port, READ, FW_PORT_ACTION_DCB_READ_RECV)
+
+#define INIT_PORT_DCB_READ_LOCAL_CMD(__pcmd, __port) \
+	INIT_PORT_DCB_CMD(__pcmd, __port, READ, FW_PORT_ACTION_DCB_READ_TRANS)
+
+#define INIT_PORT_DCB_READ_SYNC_CMD(__pcmd, __port) \
+	INIT_PORT_DCB_CMD(__pcmd, __port, READ, FW_PORT_ACTION_DCB_READ_DET)
+
+#define INIT_PORT_DCB_WRITE_CMD(__pcmd, __port) \
+	INIT_PORT_DCB_CMD(__pcmd, __port, EXEC, FW_PORT_ACTION_L2_DCB_CFG)
+
+#define IEEE_FAUX_SYNC(__dev, __dcb) \
+	do { \
+		if ((__dcb)->dcb_version == FW_PORT_DCB_VER_IEEE) \
+			cxgb4_dcb_state_fsm((__dev), \
+					    CXGB4_DCB_STATE_FW_ALLSYNCED); \
+	} while (0)
+
+/* States we can be in for a port's Data Center Bridging.
+ */
+enum cxgb4_dcb_state {
+	CXGB4_DCB_STATE_START,		/* initial unknown state */
+	CXGB4_DCB_STATE_HOST,		/* we're using Host DCB (if at all) */
+	CXGB4_DCB_STATE_FW_INCOMPLETE,	/* using firmware DCB, incomplete */
+	CXGB4_DCB_STATE_FW_ALLSYNCED,	/* using firmware DCB, all sync'ed */
+};
+
+/* Data Center Bridging state input for the Finite State Machine.
+ */
+enum cxgb4_dcb_state_input {
+	/* Input from the firmware.
+	 */
+	CXGB4_DCB_INPUT_FW_DISABLED,	/* firmware DCB disabled */
+	CXGB4_DCB_INPUT_FW_ENABLED,	/* firmware DCB enabled */
+	CXGB4_DCB_INPUT_FW_INCOMPLETE,	/* firmware reports incomplete DCB */
+	CXGB4_DCB_INPUT_FW_ALLSYNCED,	/* firmware reports all sync'ed */
+
+};
+
+/* Firmware DCB messages that we've received so far ...
+ */
+enum cxgb4_dcb_fw_msgs {
+	CXGB4_DCB_FW_PGID	= 0x01,
+	CXGB4_DCB_FW_PGRATE	= 0x02,
+	CXGB4_DCB_FW_PRIORATE	= 0x04,
+	CXGB4_DCB_FW_PFC	= 0x08,
+	CXGB4_DCB_FW_APP_ID	= 0x10,
+};
+
+#define CXGB4_MAX_DCBX_APP_SUPPORTED 8
+
+/* Data Center Bridging support;
+ */
+struct port_dcb_info {
+	enum cxgb4_dcb_state state;	/* DCB State Machine */
+	enum cxgb4_dcb_fw_msgs msgs;	/* DCB Firmware messages received */
+	unsigned int supported;		/* OS DCB capabilities supported */
+	bool enabled;			/* OS Enabled state */
+
+	/* Cached copies of DCB information sent by the firmware (in Host
+	 * Native Endian format).
+	 */
+	u32	pgid;			/* Priority Group[0..7] */
+	u8	dcb_version;		/* Running DCBx version */
+	u8	pfcen;			/* Priority Flow Control[0..7] */
+	u8	pg_num_tcs_supported;	/* max PG Traffic Classes */
+	u8	pfc_num_tcs_supported;	/* max PFC Traffic Classes */
+	u8	pgrate[8];		/* Priority Group Rate[0..7] */
+	u8	priorate[8];		/* Priority Rate[0..7] */
+	u8	tsa[8];			/* TSA Algorithm[0..7] */
+	struct app_priority { /* Application Information */
+		u8	user_prio_map;	/* Priority Map bitfield */
+		u8	sel_field;	/* Protocol ID interpretation */
+		u16	protocolid;	/* Protocol ID */
+	} app_priority[CXGB4_MAX_DCBX_APP_SUPPORTED];
+};
+
+void cxgb4_dcb_state_init(struct net_device *);
+void cxgb4_dcb_version_init(struct net_device *);
+void cxgb4_dcb_state_fsm(struct net_device *, enum cxgb4_dcb_state_input);
+void cxgb4_dcb_handle_fw_update(struct adapter *, const struct fw_port_cmd *);
+void cxgb4_dcb_set_caps(struct adapter *, const struct fw_port_cmd *);
+extern const struct dcbnl_rtnl_ops cxgb4_dcb_ops;
+
+#define CXGB4_DCB_ENABLED true
+
+#else /* !CONFIG_CHELSIO_T4_DCB */
+
+static inline void cxgb4_dcb_state_init(struct net_device *dev)
+{
+}
+
+#define CXGB4_DCB_ENABLED false
+
+#endif /* !CONFIG_CHELSIO_T4_DCB */
+
+#endif /* __CXGB4_DCB_H */
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
new file mode 100644
index 0000000..279873c
--- /dev/null
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -0,0 +1,6902 @@
+/*
+ * This file is part of the Chelsio T4 Ethernet driver for Linux.
+ *
+ * Copyright (c) 2003-2014 Chelsio Communications, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/bitmap.h>
+#include <linux/crc32.h>
+#include <linux/ctype.h>
+#include <linux/debugfs.h>
+#include <linux/err.h>
+#include <linux/etherdevice.h>
+#include <linux/firmware.h>
+#include <linux/if.h>
+#include <linux/if_vlan.h>
+#include <linux/init.h>
+#include <linux/log2.h>
+#include <linux/mdio.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/mutex.h>
+#include <linux/netdevice.h>
+#include <linux/pci.h>
+#include <linux/aer.h>
+#include <linux/rtnetlink.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/sockios.h>
+#include <linux/vmalloc.h>
+#include <linux/workqueue.h>
+#include <net/neighbour.h>
+#include <net/netevent.h>
+#include <net/addrconf.h>
+#include <asm/uaccess.h>
+
+#include "cxgb4.h"
+#include "t4_regs.h"
+#include "t4_msg.h"
+#include "t4fw_api.h"
+#include "cxgb4_dcb.h"
+#include "l2t.h"
+
+#include <../drivers/net/bonding/bonding.h>
+
+#ifdef DRV_VERSION
+#undef DRV_VERSION
+#endif
+#define DRV_VERSION "2.0.0-ko"
+#define DRV_DESC "Chelsio T4/T5 Network Driver"
+
+/*
+ * Max interrupt hold-off timer value in us.  Queues fall back to this value
+ * under extreme memory pressure so it's largish to give the system time to
+ * recover.
+ */
+#define MAX_SGE_TIMERVAL 200U
+
+enum {
+	/*
+	 * Physical Function provisioning constants.
+	 */
+	PFRES_NVI = 4,			/* # of Virtual Interfaces */
+	PFRES_NETHCTRL = 128,		/* # of EQs used for ETH or CTRL Qs */
+	PFRES_NIQFLINT = 128,		/* # of ingress Qs/w Free List(s)/intr
+					 */
+	PFRES_NEQ = 256,		/* # of egress queues */
+	PFRES_NIQ = 0,			/* # of ingress queues */
+	PFRES_TC = 0,			/* PCI-E traffic class */
+	PFRES_NEXACTF = 128,		/* # of exact MPS filters */
+
+	PFRES_R_CAPS = FW_CMD_CAP_PF,
+	PFRES_WX_CAPS = FW_CMD_CAP_PF,
+
+#ifdef CONFIG_PCI_IOV
+	/*
+	 * Virtual Function provisioning constants.  We need two extra Ingress
+	 * Queues with Interrupt capability to serve as the VF's Firmware
+	 * Event Queue and Forwarded Interrupt Queue (when using MSI mode) --
+	 * neither will have Free Lists associated with them).  For each
+	 * Ethernet/Control Egress Queue and for each Free List, we need an
+	 * Egress Context.
+	 */
+	VFRES_NPORTS = 1,		/* # of "ports" per VF */
+	VFRES_NQSETS = 2,		/* # of "Queue Sets" per VF */
+
+	VFRES_NVI = VFRES_NPORTS,	/* # of Virtual Interfaces */
+	VFRES_NETHCTRL = VFRES_NQSETS,	/* # of EQs used for ETH or CTRL Qs */
+	VFRES_NIQFLINT = VFRES_NQSETS+2,/* # of ingress Qs/w Free List(s)/intr */
+	VFRES_NEQ = VFRES_NQSETS*2,	/* # of egress queues */
+	VFRES_NIQ = 0,			/* # of non-fl/int ingress queues */
+	VFRES_TC = 0,			/* PCI-E traffic class */
+	VFRES_NEXACTF = 16,		/* # of exact MPS filters */
+
+	VFRES_R_CAPS = FW_CMD_CAP_DMAQ|FW_CMD_CAP_VF|FW_CMD_CAP_PORT,
+	VFRES_WX_CAPS = FW_CMD_CAP_DMAQ|FW_CMD_CAP_VF,
+#endif
+};
+
+/*
+ * Provide a Port Access Rights Mask for the specified PF/VF.  This is very
+ * static and likely not to be useful in the long run.  We really need to
+ * implement some form of persistent configuration which the firmware
+ * controls.
+ */
+static unsigned int pfvfres_pmask(struct adapter *adapter,
+				  unsigned int pf, unsigned int vf)
+{
+	unsigned int portn, portvec;
+
+	/*
+	 * Give PF's access to all of the ports.
+	 */
+	if (vf == 0)
+		return FW_PFVF_CMD_PMASK_MASK;
+
+	/*
+	 * For VFs, we'll assign them access to the ports based purely on the
+	 * PF.  We assign active ports in order, wrapping around if there are
+	 * fewer active ports than PFs: e.g. active port[pf % nports].
+	 * Unfortunately the adapter's port_info structs haven't been
+	 * initialized yet so we have to compute this.
+	 */
+	if (adapter->params.nports == 0)
+		return 0;
+
+	portn = pf % adapter->params.nports;
+	portvec = adapter->params.portvec;
+	for (;;) {
+		/*
+		 * Isolate the lowest set bit in the port vector.  If we're at
+		 * the port number that we want, return that as the pmask.
+		 * otherwise mask that bit out of the port vector and
+		 * decrement our port number ...
+		 */
+		unsigned int pmask = portvec ^ (portvec & (portvec-1));
+		if (portn == 0)
+			return pmask;
+		portn--;
+		portvec &= ~pmask;
+	}
+	/*NOTREACHED*/
+}
+
+enum {
+	MAX_TXQ_ENTRIES      = 16384,
+	MAX_CTRL_TXQ_ENTRIES = 1024,
+	MAX_RSPQ_ENTRIES     = 16384,
+	MAX_RX_BUFFERS       = 16384,
+	MIN_TXQ_ENTRIES      = 32,
+	MIN_CTRL_TXQ_ENTRIES = 32,
+	MIN_RSPQ_ENTRIES     = 128,
+	MIN_FL_ENTRIES       = 16
+};
+
+/* Host shadow copy of ingress filter entry.  This is in host native format
+ * and doesn't match the ordering or bit order, etc. of the hardware of the
+ * firmware command.  The use of bit-field structure elements is purely to
+ * remind ourselves of the field size limitations and save memory in the case
+ * where the filter table is large.
+ */
+struct filter_entry {
+	/* Administrative fields for filter.
+	 */
+	u32 valid:1;            /* filter allocated and valid */
+	u32 locked:1;           /* filter is administratively locked */
+
+	u32 pending:1;          /* filter action is pending firmware reply */
+	u32 smtidx:8;           /* Source MAC Table index for smac */
+	struct l2t_entry *l2t;  /* Layer Two Table entry for dmac */
+
+	/* The filter itself.  Most of this is a straight copy of information
+	 * provided by the extended ioctl().  Some fields are translated to
+	 * internal forms -- for instance the Ingress Queue ID passed in from
+	 * the ioctl() is translated into the Absolute Ingress Queue ID.
+	 */
+	struct ch_filter_specification fs;
+};
+
+#define DFLT_MSG_ENABLE (NETIF_MSG_DRV | NETIF_MSG_PROBE | NETIF_MSG_LINK | \
+			 NETIF_MSG_TIMER | NETIF_MSG_IFDOWN | NETIF_MSG_IFUP |\
+			 NETIF_MSG_RX_ERR | NETIF_MSG_TX_ERR)
+
+#define CH_DEVICE(devid, data) { PCI_VDEVICE(CHELSIO, devid), (data) }
+
+static const struct pci_device_id cxgb4_pci_tbl[] = {
+	CH_DEVICE(0xa000, 0),  /* PE10K */
+	CH_DEVICE(0x4001, -1),
+	CH_DEVICE(0x4002, -1),
+	CH_DEVICE(0x4003, -1),
+	CH_DEVICE(0x4004, -1),
+	CH_DEVICE(0x4005, -1),
+	CH_DEVICE(0x4006, -1),
+	CH_DEVICE(0x4007, -1),
+	CH_DEVICE(0x4008, -1),
+	CH_DEVICE(0x4009, -1),
+	CH_DEVICE(0x400a, -1),
+	CH_DEVICE(0x400d, -1),
+	CH_DEVICE(0x400e, -1),
+	CH_DEVICE(0x4080, -1),
+	CH_DEVICE(0x4081, -1),
+	CH_DEVICE(0x4082, -1),
+	CH_DEVICE(0x4083, -1),
+	CH_DEVICE(0x4084, -1),
+	CH_DEVICE(0x4085, -1),
+	CH_DEVICE(0x4086, -1),
+	CH_DEVICE(0x4087, -1),
+	CH_DEVICE(0x4088, -1),
+	CH_DEVICE(0x4401, 4),
+	CH_DEVICE(0x4402, 4),
+	CH_DEVICE(0x4403, 4),
+	CH_DEVICE(0x4404, 4),
+	CH_DEVICE(0x4405, 4),
+	CH_DEVICE(0x4406, 4),
+	CH_DEVICE(0x4407, 4),
+	CH_DEVICE(0x4408, 4),
+	CH_DEVICE(0x4409, 4),
+	CH_DEVICE(0x440a, 4),
+	CH_DEVICE(0x440d, 4),
+	CH_DEVICE(0x440e, 4),
+	CH_DEVICE(0x4480, 4),
+	CH_DEVICE(0x4481, 4),
+	CH_DEVICE(0x4482, 4),
+	CH_DEVICE(0x4483, 4),
+	CH_DEVICE(0x4484, 4),
+	CH_DEVICE(0x4485, 4),
+	CH_DEVICE(0x4486, 4),
+	CH_DEVICE(0x4487, 4),
+	CH_DEVICE(0x4488, 4),
+	CH_DEVICE(0x5001, 4),
+	CH_DEVICE(0x5002, 4),
+	CH_DEVICE(0x5003, 4),
+	CH_DEVICE(0x5004, 4),
+	CH_DEVICE(0x5005, 4),
+	CH_DEVICE(0x5006, 4),
+	CH_DEVICE(0x5007, 4),
+	CH_DEVICE(0x5008, 4),
+	CH_DEVICE(0x5009, 4),
+	CH_DEVICE(0x500A, 4),
+	CH_DEVICE(0x500B, 4),
+	CH_DEVICE(0x500C, 4),
+	CH_DEVICE(0x500D, 4),
+	CH_DEVICE(0x500E, 4),
+	CH_DEVICE(0x500F, 4),
+	CH_DEVICE(0x5010, 4),
+	CH_DEVICE(0x5011, 4),
+	CH_DEVICE(0x5012, 4),
+	CH_DEVICE(0x5013, 4),
+	CH_DEVICE(0x5014, 4),
+	CH_DEVICE(0x5015, 4),
+	CH_DEVICE(0x5080, 4),
+	CH_DEVICE(0x5081, 4),
+	CH_DEVICE(0x5082, 4),
+	CH_DEVICE(0x5083, 4),
+	CH_DEVICE(0x5084, 4),
+	CH_DEVICE(0x5085, 4),
+	CH_DEVICE(0x5086, 4),
+	CH_DEVICE(0x5087, 4),
+	CH_DEVICE(0x5088, 4),
+	CH_DEVICE(0x5401, 4),
+	CH_DEVICE(0x5402, 4),
+	CH_DEVICE(0x5403, 4),
+	CH_DEVICE(0x5404, 4),
+	CH_DEVICE(0x5405, 4),
+	CH_DEVICE(0x5406, 4),
+	CH_DEVICE(0x5407, 4),
+	CH_DEVICE(0x5408, 4),
+	CH_DEVICE(0x5409, 4),
+	CH_DEVICE(0x540A, 4),
+	CH_DEVICE(0x540B, 4),
+	CH_DEVICE(0x540C, 4),
+	CH_DEVICE(0x540D, 4),
+	CH_DEVICE(0x540E, 4),
+	CH_DEVICE(0x540F, 4),
+	CH_DEVICE(0x5410, 4),
+	CH_DEVICE(0x5411, 4),
+	CH_DEVICE(0x5412, 4),
+	CH_DEVICE(0x5413, 4),
+	CH_DEVICE(0x5414, 4),
+	CH_DEVICE(0x5415, 4),
+	CH_DEVICE(0x5480, 4),
+	CH_DEVICE(0x5481, 4),
+	CH_DEVICE(0x5482, 4),
+	CH_DEVICE(0x5483, 4),
+	CH_DEVICE(0x5484, 4),
+	CH_DEVICE(0x5485, 4),
+	CH_DEVICE(0x5486, 4),
+	CH_DEVICE(0x5487, 4),
+	CH_DEVICE(0x5488, 4),
+	{ 0, }
+};
+
+#define FW4_FNAME "cxgb4/t4fw.bin"
+#define FW5_FNAME "cxgb4/t5fw.bin"
+#define FW4_CFNAME "cxgb4/t4-config.txt"
+#define FW5_CFNAME "cxgb4/t5-config.txt"
+
+MODULE_DESCRIPTION(DRV_DESC);
+MODULE_AUTHOR("Chelsio Communications");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_VERSION(DRV_VERSION);
+MODULE_DEVICE_TABLE(pci, cxgb4_pci_tbl);
+MODULE_FIRMWARE(FW4_FNAME);
+MODULE_FIRMWARE(FW5_FNAME);
+
+/*
+ * Normally we're willing to become the firmware's Master PF but will be happy
+ * if another PF has already become the Master and initialized the adapter.
+ * Setting "force_init" will cause this driver to forcibly establish itself as
+ * the Master PF and initialize the adapter.
+ */
+static uint force_init;
+
+module_param(force_init, uint, 0644);
+MODULE_PARM_DESC(force_init, "Forcibly become Master PF and initialize adapter");
+
+/*
+ * Normally if the firmware we connect to has Configuration File support, we
+ * use that and only fall back to the old Driver-based initialization if the
+ * Configuration File fails for some reason.  If force_old_init is set, then
+ * we'll always use the old Driver-based initialization sequence.
+ */
+static uint force_old_init;
+
+module_param(force_old_init, uint, 0644);
+MODULE_PARM_DESC(force_old_init, "Force old initialization sequence");
+
+static int dflt_msg_enable = DFLT_MSG_ENABLE;
+
+module_param(dflt_msg_enable, int, 0644);
+MODULE_PARM_DESC(dflt_msg_enable, "Chelsio T4 default message enable bitmap");
+
+/*
+ * The driver uses the best interrupt scheme available on a platform in the
+ * order MSI-X, MSI, legacy INTx interrupts.  This parameter determines which
+ * of these schemes the driver may consider as follows:
+ *
+ * msi = 2: choose from among all three options
+ * msi = 1: only consider MSI and INTx interrupts
+ * msi = 0: force INTx interrupts
+ */
+static int msi = 2;
+
+module_param(msi, int, 0644);
+MODULE_PARM_DESC(msi, "whether to use INTx (0), MSI (1) or MSI-X (2)");
+
+/*
+ * Queue interrupt hold-off timer values.  Queues default to the first of these
+ * upon creation.
+ */
+static unsigned int intr_holdoff[SGE_NTIMERS - 1] = { 5, 10, 20, 50, 100 };
+
+module_param_array(intr_holdoff, uint, NULL, 0644);
+MODULE_PARM_DESC(intr_holdoff, "values for queue interrupt hold-off timers "
+		 "0..4 in microseconds");
+
+static unsigned int intr_cnt[SGE_NCOUNTERS - 1] = { 4, 8, 16 };
+
+module_param_array(intr_cnt, uint, NULL, 0644);
+MODULE_PARM_DESC(intr_cnt,
+		 "thresholds 1..3 for queue interrupt packet counters");
+
+/*
+ * Normally we tell the chip to deliver Ingress Packets into our DMA buffers
+ * offset by 2 bytes in order to have the IP headers line up on 4-byte
+ * boundaries.  This is a requirement for many architectures which will throw
+ * a machine check fault if an attempt is made to access one of the 4-byte IP
+ * header fields on a non-4-byte boundary.  And it's a major performance issue
+ * even on some architectures which allow it like some implementations of the
+ * x86 ISA.  However, some architectures don't mind this and for some very
+ * edge-case performance sensitive applications (like forwarding large volumes
+ * of small packets), setting this DMA offset to 0 will decrease the number of
+ * PCI-E Bus transfers enough to measurably affect performance.
+ */
+static int rx_dma_offset = 2;
+
+static bool vf_acls;
+
+#ifdef CONFIG_PCI_IOV
+module_param(vf_acls, bool, 0644);
+MODULE_PARM_DESC(vf_acls, "if set enable virtualization L2 ACL enforcement");
+
+/* Configure the number of PCI-E Virtual Function which are to be instantiated
+ * on SR-IOV Capable Physical Functions.
+ */
+static unsigned int num_vf[NUM_OF_PF_WITH_SRIOV];
+
+module_param_array(num_vf, uint, NULL, 0644);
+MODULE_PARM_DESC(num_vf, "number of VFs for each of PFs 0-3");
+#endif
+
+/* TX Queue select used to determine what algorithm to use for selecting TX
+ * queue. Select between the kernel provided function (select_queue=0) or user
+ * cxgb_select_queue function (select_queue=1)
+ *
+ * Default: select_queue=0
+ */
+static int select_queue;
+module_param(select_queue, int, 0644);
+MODULE_PARM_DESC(select_queue,
+		 "Select between kernel provided method of selecting or driver method of selecting TX queue. Default is kernel method.");
+
+/*
+ * The filter TCAM has a fixed portion and a variable portion.  The fixed
+ * portion can match on source/destination IP IPv4/IPv6 addresses and TCP/UDP
+ * ports.  The variable portion is 36 bits which can include things like Exact
+ * Match MAC Index (9 bits), Ether Type (16 bits), IP Protocol (8 bits),
+ * [Inner] VLAN Tag (17 bits), etc. which, if all were somehow selected, would
+ * far exceed the 36-bit budget for this "compressed" header portion of the
+ * filter.  Thus, we have a scarce resource which must be carefully managed.
+ *
+ * By default we set this up to mostly match the set of filter matching
+ * capabilities of T3 but with accommodations for some of T4's more
+ * interesting features:
+ *
+ *   { IP Fragment (1), MPS Match Type (3), IP Protocol (8),
+ *     [Inner] VLAN (17), Port (3), FCoE (1) }
+ */
+enum {
+	TP_VLAN_PRI_MAP_DEFAULT = HW_TPL_FR_MT_PR_IV_P_FC,
+	TP_VLAN_PRI_MAP_FIRST = FCOE_SHIFT,
+	TP_VLAN_PRI_MAP_LAST = FRAGMENTATION_SHIFT,
+};
+
+static unsigned int tp_vlan_pri_map = TP_VLAN_PRI_MAP_DEFAULT;
+
+module_param(tp_vlan_pri_map, uint, 0644);
+MODULE_PARM_DESC(tp_vlan_pri_map, "global compressed filter configuration");
+
+static struct dentry *cxgb4_debugfs_root;
+
+static LIST_HEAD(adapter_list);
+static DEFINE_MUTEX(uld_mutex);
+/* Adapter list to be accessed from atomic context */
+static LIST_HEAD(adap_rcu_list);
+static DEFINE_SPINLOCK(adap_rcu_lock);
+static struct cxgb4_uld_info ulds[CXGB4_ULD_MAX];
+static const char *uld_str[] = { "RDMA", "iSCSI" };
+
+static void link_report(struct net_device *dev)
+{
+	if (!netif_carrier_ok(dev))
+		netdev_info(dev, "link down\n");
+	else {
+		static const char *fc[] = { "no", "Rx", "Tx", "Tx/Rx" };
+
+		const char *s = "10Mbps";
+		const struct port_info *p = netdev_priv(dev);
+
+		switch (p->link_cfg.speed) {
+		case 10000:
+			s = "10Gbps";
+			break;
+		case 1000:
+			s = "1000Mbps";
+			break;
+		case 100:
+			s = "100Mbps";
+			break;
+		case 40000:
+			s = "40Gbps";
+			break;
+		}
+
+		netdev_info(dev, "link up, %s, full-duplex, %s PAUSE\n", s,
+			    fc[p->link_cfg.fc]);
+	}
+}
+
+#ifdef CONFIG_CHELSIO_T4_DCB
+/* Set up/tear down Data Center Bridging Priority mapping for a net device. */
+static void dcb_tx_queue_prio_enable(struct net_device *dev, int enable)
+{
+	struct port_info *pi = netdev_priv(dev);
+	struct adapter *adap = pi->adapter;
+	struct sge_eth_txq *txq = &adap->sge.ethtxq[pi->first_qset];
+	int i;
+
+	/* We use a simple mapping of Port TX Queue Index to DCB
+	 * Priority when we're enabling DCB.
+	 */
+	for (i = 0; i < pi->nqsets; i++, txq++) {
+		u32 name, value;
+		int err;
+
+		name = (FW_PARAMS_MNEM(FW_PARAMS_MNEM_DMAQ) |
+			FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DMAQ_EQ_DCBPRIO_ETH) |
+			FW_PARAMS_PARAM_YZ(txq->q.cntxt_id));
+		value = enable ? i : 0xffffffff;
+
+		/* Since we can be called while atomic (from "interrupt
+		 * level") we need to issue the Set Parameters Commannd
+		 * without sleeping (timeout < 0).
+		 */
+		err = t4_set_params_nosleep(adap, adap->mbox, adap->fn, 0, 1,
+					    &name, &value);
+
+		if (err)
+			dev_err(adap->pdev_dev,
+				"Can't %s DCB Priority on port %d, TX Queue %d: err=%d\n",
+				enable ? "set" : "unset", pi->port_id, i, -err);
+		else
+			txq->dcb_prio = value;
+	}
+}
+#endif /* CONFIG_CHELSIO_T4_DCB */
+
+void t4_os_link_changed(struct adapter *adapter, int port_id, int link_stat)
+{
+	struct net_device *dev = adapter->port[port_id];
+
+	/* Skip changes from disabled ports. */
+	if (netif_running(dev) && link_stat != netif_carrier_ok(dev)) {
+		if (link_stat)
+			netif_carrier_on(dev);
+		else {
+#ifdef CONFIG_CHELSIO_T4_DCB
+			cxgb4_dcb_state_init(dev);
+			dcb_tx_queue_prio_enable(dev, false);
+#endif /* CONFIG_CHELSIO_T4_DCB */
+			netif_carrier_off(dev);
+		}
+
+		link_report(dev);
+	}
+}
+
+void t4_os_portmod_changed(const struct adapter *adap, int port_id)
+{
+	static const char *mod_str[] = {
+		NULL, "LR", "SR", "ER", "passive DA", "active DA", "LRM"
+	};
+
+	const struct net_device *dev = adap->port[port_id];
+	const struct port_info *pi = netdev_priv(dev);
+
+	if (pi->mod_type == FW_PORT_MOD_TYPE_NONE)
+		netdev_info(dev, "port module unplugged\n");
+	else if (pi->mod_type < ARRAY_SIZE(mod_str))
+		netdev_info(dev, "%s module inserted\n", mod_str[pi->mod_type]);
+}
+
+/*
+ * Configure the exact and hash address filters to handle a port's multicast
+ * and secondary unicast MAC addresses.
+ */
+static int set_addr_filters(const struct net_device *dev, bool sleep)
+{
+	u64 mhash = 0;
+	u64 uhash = 0;
+	bool free = true;
+	u16 filt_idx[7];
+	const u8 *addr[7];
+	int ret, naddr = 0;
+	const struct netdev_hw_addr *ha;
+	int uc_cnt = netdev_uc_count(dev);
+	int mc_cnt = netdev_mc_count(dev);
+	const struct port_info *pi = netdev_priv(dev);
+	unsigned int mb = pi->adapter->fn;
+
+	/* first do the secondary unicast addresses */
+	netdev_for_each_uc_addr(ha, dev) {
+		addr[naddr++] = ha->addr;
+		if (--uc_cnt == 0 || naddr >= ARRAY_SIZE(addr)) {
+			ret = t4_alloc_mac_filt(pi->adapter, mb, pi->viid, free,
+					naddr, addr, filt_idx, &uhash, sleep);
+			if (ret < 0)
+				return ret;
+
+			free = false;
+			naddr = 0;
+		}
+	}
+
+	/* next set up the multicast addresses */
+	netdev_for_each_mc_addr(ha, dev) {
+		addr[naddr++] = ha->addr;
+		if (--mc_cnt == 0 || naddr >= ARRAY_SIZE(addr)) {
+			ret = t4_alloc_mac_filt(pi->adapter, mb, pi->viid, free,
+					naddr, addr, filt_idx, &mhash, sleep);
+			if (ret < 0)
+				return ret;
+
+			free = false;
+			naddr = 0;
+		}
+	}
+
+	return t4_set_addr_hash(pi->adapter, mb, pi->viid, uhash != 0,
+				uhash | mhash, sleep);
+}
+
+int dbfifo_int_thresh = 10; /* 10 == 640 entry threshold */
+module_param(dbfifo_int_thresh, int, 0644);
+MODULE_PARM_DESC(dbfifo_int_thresh, "doorbell fifo interrupt threshold");
+
+/*
+ * usecs to sleep while draining the dbfifo
+ */
+static int dbfifo_drain_delay = 1000;
+module_param(dbfifo_drain_delay, int, 0644);
+MODULE_PARM_DESC(dbfifo_drain_delay,
+		 "usecs to sleep while draining the dbfifo");
+
+/*
+ * Set Rx properties of a port, such as promiscruity, address filters, and MTU.
+ * If @mtu is -1 it is left unchanged.
+ */
+static int set_rxmode(struct net_device *dev, int mtu, bool sleep_ok)
+{
+	int ret;
+	struct port_info *pi = netdev_priv(dev);
+
+	ret = set_addr_filters(dev, sleep_ok);
+	if (ret == 0)
+		ret = t4_set_rxmode(pi->adapter, pi->adapter->fn, pi->viid, mtu,
+				    (dev->flags & IFF_PROMISC) ? 1 : 0,
+				    (dev->flags & IFF_ALLMULTI) ? 1 : 0, 1, -1,
+				    sleep_ok);
+	return ret;
+}
+
+/**
+ *	link_start - enable a port
+ *	@dev: the port to enable
+ *
+ *	Performs the MAC and PHY actions needed to enable a port.
+ */
+static int link_start(struct net_device *dev)
+{
+	int ret;
+	struct port_info *pi = netdev_priv(dev);
+	unsigned int mb = pi->adapter->fn;
+
+	/*
+	 * We do not set address filters and promiscuity here, the stack does
+	 * that step explicitly.
+	 */
+	ret = t4_set_rxmode(pi->adapter, mb, pi->viid, dev->mtu, -1, -1, -1,
+			    !!(dev->features & NETIF_F_HW_VLAN_CTAG_RX), true);
+	if (ret == 0) {
+		ret = t4_change_mac(pi->adapter, mb, pi->viid,
+				    pi->xact_addr_filt, dev->dev_addr, true,
+				    true);
+		if (ret >= 0) {
+			pi->xact_addr_filt = ret;
+			ret = 0;
+		}
+	}
+	if (ret == 0)
+		ret = t4_link_start(pi->adapter, mb, pi->tx_chan,
+				    &pi->link_cfg);
+	if (ret == 0) {
+		local_bh_disable();
+		ret = t4_enable_vi_params(pi->adapter, mb, pi->viid, true,
+					  true, CXGB4_DCB_ENABLED);
+		local_bh_enable();
+	}
+
+	return ret;
+}
+
+int cxgb4_dcb_enabled(const struct net_device *dev)
+{
+#ifdef CONFIG_CHELSIO_T4_DCB
+	struct port_info *pi = netdev_priv(dev);
+
+	if (!pi->dcb.enabled)
+		return 0;
+
+	return ((pi->dcb.state == CXGB4_DCB_STATE_FW_ALLSYNCED) ||
+		(pi->dcb.state == CXGB4_DCB_STATE_HOST));
+#else
+	return 0;
+#endif
+}
+EXPORT_SYMBOL(cxgb4_dcb_enabled);
+
+#ifdef CONFIG_CHELSIO_T4_DCB
+/* Handle a Data Center Bridging update message from the firmware. */
+static void dcb_rpl(struct adapter *adap, const struct fw_port_cmd *pcmd)
+{
+	int port = FW_PORT_CMD_PORTID_GET(ntohl(pcmd->op_to_portid));
+	struct net_device *dev = adap->port[port];
+	int old_dcb_enabled = cxgb4_dcb_enabled(dev);
+	int new_dcb_enabled;
+
+	cxgb4_dcb_handle_fw_update(adap, pcmd);
+	new_dcb_enabled = cxgb4_dcb_enabled(dev);
+
+	/* If the DCB has become enabled or disabled on the port then we're
+	 * going to need to set up/tear down DCB Priority parameters for the
+	 * TX Queues associated with the port.
+	 */
+	if (new_dcb_enabled != old_dcb_enabled)
+		dcb_tx_queue_prio_enable(dev, new_dcb_enabled);
+}
+#endif /* CONFIG_CHELSIO_T4_DCB */
+
+/* Clear a filter and release any of its resources that we own.  This also
+ * clears the filter's "pending" status.
+ */
+static void clear_filter(struct adapter *adap, struct filter_entry *f)
+{
+	/* If the new or old filter have loopback rewriteing rules then we'll
+	 * need to free any existing Layer Two Table (L2T) entries of the old
+	 * filter rule.  The firmware will handle freeing up any Source MAC
+	 * Table (SMT) entries used for rewriting Source MAC Addresses in
+	 * loopback rules.
+	 */
+	if (f->l2t)
+		cxgb4_l2t_release(f->l2t);
+
+	/* The zeroing of the filter rule below clears the filter valid,
+	 * pending, locked flags, l2t pointer, etc. so it's all we need for
+	 * this operation.
+	 */
+	memset(f, 0, sizeof(*f));
+}
+
+/* Handle a filter write/deletion reply.
+ */
+static void filter_rpl(struct adapter *adap, const struct cpl_set_tcb_rpl *rpl)
+{
+	unsigned int idx = GET_TID(rpl);
+	unsigned int nidx = idx - adap->tids.ftid_base;
+	unsigned int ret;
+	struct filter_entry *f;
+
+	if (idx >= adap->tids.ftid_base && nidx <
+	   (adap->tids.nftids + adap->tids.nsftids)) {
+		idx = nidx;
+		ret = GET_TCB_COOKIE(rpl->cookie);
+		f = &adap->tids.ftid_tab[idx];
+
+		if (ret == FW_FILTER_WR_FLT_DELETED) {
+			/* Clear the filter when we get confirmation from the
+			 * hardware that the filter has been deleted.
+			 */
+			clear_filter(adap, f);
+		} else if (ret == FW_FILTER_WR_SMT_TBL_FULL) {
+			dev_err(adap->pdev_dev, "filter %u setup failed due to full SMT\n",
+				idx);
+			clear_filter(adap, f);
+		} else if (ret == FW_FILTER_WR_FLT_ADDED) {
+			f->smtidx = (be64_to_cpu(rpl->oldval) >> 24) & 0xff;
+			f->pending = 0;  /* asynchronous setup completed */
+			f->valid = 1;
+		} else {
+			/* Something went wrong.  Issue a warning about the
+			 * problem and clear everything out.
+			 */
+			dev_err(adap->pdev_dev, "filter %u setup failed with error %u\n",
+				idx, ret);
+			clear_filter(adap, f);
+		}
+	}
+}
+
+/* Response queue handler for the FW event queue.
+ */
+static int fwevtq_handler(struct sge_rspq *q, const __be64 *rsp,
+			  const struct pkt_gl *gl)
+{
+	u8 opcode = ((const struct rss_header *)rsp)->opcode;
+
+	rsp++;                                          /* skip RSS header */
+
+	/* FW can send EGR_UPDATEs encapsulated in a CPL_FW4_MSG.
+	 */
+	if (unlikely(opcode == CPL_FW4_MSG &&
+	   ((const struct cpl_fw4_msg *)rsp)->type == FW_TYPE_RSSCPL)) {
+		rsp++;
+		opcode = ((const struct rss_header *)rsp)->opcode;
+		rsp++;
+		if (opcode != CPL_SGE_EGR_UPDATE) {
+			dev_err(q->adap->pdev_dev, "unexpected FW4/CPL %#x on FW event queue\n"
+				, opcode);
+			goto out;
+		}
+	}
+
+	if (likely(opcode == CPL_SGE_EGR_UPDATE)) {
+		const struct cpl_sge_egr_update *p = (void *)rsp;
+		unsigned int qid = EGR_QID(ntohl(p->opcode_qid));
+		struct sge_txq *txq;
+
+		txq = q->adap->sge.egr_map[qid - q->adap->sge.egr_start];
+		txq->restarts++;
+		if ((u8 *)txq < (u8 *)q->adap->sge.ofldtxq) {
+			struct sge_eth_txq *eq;
+
+			eq = container_of(txq, struct sge_eth_txq, q);
+			netif_tx_wake_queue(eq->txq);
+		} else {
+			struct sge_ofld_txq *oq;
+
+			oq = container_of(txq, struct sge_ofld_txq, q);
+			tasklet_schedule(&oq->qresume_tsk);
+		}
+	} else if (opcode == CPL_FW6_MSG || opcode == CPL_FW4_MSG) {
+		const struct cpl_fw6_msg *p = (void *)rsp;
+
+#ifdef CONFIG_CHELSIO_T4_DCB
+		const struct fw_port_cmd *pcmd = (const void *)p->data;
+		unsigned int cmd = FW_CMD_OP_GET(ntohl(pcmd->op_to_portid));
+		unsigned int action =
+			FW_PORT_CMD_ACTION_GET(ntohl(pcmd->action_to_len16));
+
+		if (cmd == FW_PORT_CMD &&
+		    action == FW_PORT_ACTION_GET_PORT_INFO) {
+			int port = FW_PORT_CMD_PORTID_GET(
+					be32_to_cpu(pcmd->op_to_portid));
+			struct net_device *dev = q->adap->port[port];
+			int state_input = ((pcmd->u.info.dcbxdis_pkd &
+					    FW_PORT_CMD_DCBXDIS)
+					   ? CXGB4_DCB_INPUT_FW_DISABLED
+					   : CXGB4_DCB_INPUT_FW_ENABLED);
+
+			cxgb4_dcb_state_fsm(dev, state_input);
+		}
+
+		if (cmd == FW_PORT_CMD &&
+		    action == FW_PORT_ACTION_L2_DCB_CFG)
+			dcb_rpl(q->adap, pcmd);
+		else
+#endif
+			if (p->type == 0)
+				t4_handle_fw_rpl(q->adap, p->data);
+	} else if (opcode == CPL_L2T_WRITE_RPL) {
+		const struct cpl_l2t_write_rpl *p = (void *)rsp;
+
+		do_l2t_write_rpl(q->adap, p);
+	} else if (opcode == CPL_SET_TCB_RPL) {
+		const struct cpl_set_tcb_rpl *p = (void *)rsp;
+
+		filter_rpl(q->adap, p);
+	} else
+		dev_err(q->adap->pdev_dev,
+			"unexpected CPL %#x on FW event queue\n", opcode);
+out:
+	return 0;
+}
+
+/**
+ *	uldrx_handler - response queue handler for ULD queues
+ *	@q: the response queue that received the packet
+ *	@rsp: the response queue descriptor holding the offload message
+ *	@gl: the gather list of packet fragments
+ *
+ *	Deliver an ingress offload packet to a ULD.  All processing is done by
+ *	the ULD, we just maintain statistics.
+ */
+static int uldrx_handler(struct sge_rspq *q, const __be64 *rsp,
+			 const struct pkt_gl *gl)
+{
+	struct sge_ofld_rxq *rxq = container_of(q, struct sge_ofld_rxq, rspq);
+
+	/* FW can send CPLs encapsulated in a CPL_FW4_MSG.
+	 */
+	if (((const struct rss_header *)rsp)->opcode == CPL_FW4_MSG &&
+	    ((const struct cpl_fw4_msg *)(rsp + 1))->type == FW_TYPE_RSSCPL)
+		rsp += 2;
+
+	if (ulds[q->uld].rx_handler(q->adap->uld_handle[q->uld], rsp, gl)) {
+		rxq->stats.nomem++;
+		return -1;
+	}
+	if (gl == NULL)
+		rxq->stats.imm++;
+	else if (gl == CXGB4_MSG_AN)
+		rxq->stats.an++;
+	else
+		rxq->stats.pkts++;
+	return 0;
+}
+
+static void disable_msi(struct adapter *adapter)
+{
+	if (adapter->flags & USING_MSIX) {
+		pci_disable_msix(adapter->pdev);
+		adapter->flags &= ~USING_MSIX;
+	} else if (adapter->flags & USING_MSI) {
+		pci_disable_msi(adapter->pdev);
+		adapter->flags &= ~USING_MSI;
+	}
+}
+
+/*
+ * Interrupt handler for non-data events used with MSI-X.
+ */
+static irqreturn_t t4_nondata_intr(int irq, void *cookie)
+{
+	struct adapter *adap = cookie;
+
+	u32 v = t4_read_reg(adap, MYPF_REG(PL_PF_INT_CAUSE));
+	if (v & PFSW) {
+		adap->swintr = 1;
+		t4_write_reg(adap, MYPF_REG(PL_PF_INT_CAUSE), v);
+	}
+	t4_slow_intr_handler(adap);
+	return IRQ_HANDLED;
+}
+
+/*
+ * Name the MSI-X interrupts.
+ */
+static void name_msix_vecs(struct adapter *adap)
+{
+	int i, j, msi_idx = 2, n = sizeof(adap->msix_info[0].desc);
+
+	/* non-data interrupts */
+	snprintf(adap->msix_info[0].desc, n, "%s", adap->port[0]->name);
+
+	/* FW events */
+	snprintf(adap->msix_info[1].desc, n, "%s-FWeventq",
+		 adap->port[0]->name);
+
+	/* Ethernet queues */
+	for_each_port(adap, j) {
+		struct net_device *d = adap->port[j];
+		const struct port_info *pi = netdev_priv(d);
+
+		for (i = 0; i < pi->nqsets; i++, msi_idx++)
+			snprintf(adap->msix_info[msi_idx].desc, n, "%s-Rx%d",
+				 d->name, i);
+	}
+
+	/* offload queues */
+	for_each_ofldrxq(&adap->sge, i)
+		snprintf(adap->msix_info[msi_idx++].desc, n, "%s-ofld%d",
+			 adap->port[0]->name, i);
+
+	for_each_rdmarxq(&adap->sge, i)
+		snprintf(adap->msix_info[msi_idx++].desc, n, "%s-rdma%d",
+			 adap->port[0]->name, i);
+
+	for_each_rdmaciq(&adap->sge, i)
+		snprintf(adap->msix_info[msi_idx++].desc, n, "%s-rdma-ciq%d",
+			 adap->port[0]->name, i);
+}
+
+static int request_msix_queue_irqs(struct adapter *adap)
+{
+	struct sge *s = &adap->sge;
+	int err, ethqidx, ofldqidx = 0, rdmaqidx = 0, rdmaciqqidx = 0;
+	int msi_index = 2;
+
+	err = request_irq(adap->msix_info[1].vec, t4_sge_intr_msix, 0,
+			  adap->msix_info[1].desc, &s->fw_evtq);
+	if (err)
+		return err;
+
+	for_each_ethrxq(s, ethqidx) {
+		err = request_irq(adap->msix_info[msi_index].vec,
+				  t4_sge_intr_msix, 0,
+				  adap->msix_info[msi_index].desc,
+				  &s->ethrxq[ethqidx].rspq);
+		if (err)
+			goto unwind;
+		msi_index++;
+	}
+	for_each_ofldrxq(s, ofldqidx) {
+		err = request_irq(adap->msix_info[msi_index].vec,
+				  t4_sge_intr_msix, 0,
+				  adap->msix_info[msi_index].desc,
+				  &s->ofldrxq[ofldqidx].rspq);
+		if (err)
+			goto unwind;
+		msi_index++;
+	}
+	for_each_rdmarxq(s, rdmaqidx) {
+		err = request_irq(adap->msix_info[msi_index].vec,
+				  t4_sge_intr_msix, 0,
+				  adap->msix_info[msi_index].desc,
+				  &s->rdmarxq[rdmaqidx].rspq);
+		if (err)
+			goto unwind;
+		msi_index++;
+	}
+	for_each_rdmaciq(s, rdmaciqqidx) {
+		err = request_irq(adap->msix_info[msi_index].vec,
+				  t4_sge_intr_msix, 0,
+				  adap->msix_info[msi_index].desc,
+				  &s->rdmaciq[rdmaciqqidx].rspq);
+		if (err)
+			goto unwind;
+		msi_index++;
+	}
+	return 0;
+
+unwind:
+	while (--rdmaciqqidx >= 0)
+		free_irq(adap->msix_info[--msi_index].vec,
+			 &s->rdmaciq[rdmaciqqidx].rspq);
+	while (--rdmaqidx >= 0)
+		free_irq(adap->msix_info[--msi_index].vec,
+			 &s->rdmarxq[rdmaqidx].rspq);
+	while (--ofldqidx >= 0)
+		free_irq(adap->msix_info[--msi_index].vec,
+			 &s->ofldrxq[ofldqidx].rspq);
+	while (--ethqidx >= 0)
+		free_irq(adap->msix_info[--msi_index].vec,
+			 &s->ethrxq[ethqidx].rspq);
+	free_irq(adap->msix_info[1].vec, &s->fw_evtq);
+	return err;
+}
+
+static void free_msix_queue_irqs(struct adapter *adap)
+{
+	int i, msi_index = 2;
+	struct sge *s = &adap->sge;
+
+	free_irq(adap->msix_info[1].vec, &s->fw_evtq);
+	for_each_ethrxq(s, i)
+		free_irq(adap->msix_info[msi_index++].vec, &s->ethrxq[i].rspq);
+	for_each_ofldrxq(s, i)
+		free_irq(adap->msix_info[msi_index++].vec, &s->ofldrxq[i].rspq);
+	for_each_rdmarxq(s, i)
+		free_irq(adap->msix_info[msi_index++].vec, &s->rdmarxq[i].rspq);
+	for_each_rdmaciq(s, i)
+		free_irq(adap->msix_info[msi_index++].vec, &s->rdmaciq[i].rspq);
+}
+
+/**
+ *	write_rss - write the RSS table for a given port
+ *	@pi: the port
+ *	@queues: array of queue indices for RSS
+ *
+ *	Sets up the portion of the HW RSS table for the port's VI to distribute
+ *	packets to the Rx queues in @queues.
+ */
+static int write_rss(const struct port_info *pi, const u16 *queues)
+{
+	u16 *rss;
+	int i, err;
+	const struct sge_eth_rxq *q = &pi->adapter->sge.ethrxq[pi->first_qset];
+
+	rss = kmalloc(pi->rss_size * sizeof(u16), GFP_KERNEL);
+	if (!rss)
+		return -ENOMEM;
+
+	/* map the queue indices to queue ids */
+	for (i = 0; i < pi->rss_size; i++, queues++)
+		rss[i] = q[*queues].rspq.abs_id;
+
+	err = t4_config_rss_range(pi->adapter, pi->adapter->fn, pi->viid, 0,
+				  pi->rss_size, rss, pi->rss_size);
+	kfree(rss);
+	return err;
+}
+
+/**
+ *	setup_rss - configure RSS
+ *	@adap: the adapter
+ *
+ *	Sets up RSS for each port.
+ */
+static int setup_rss(struct adapter *adap)
+{
+	int i, err;
+
+	for_each_port(adap, i) {
+		const struct port_info *pi = adap2pinfo(adap, i);
+
+		err = write_rss(pi, pi->rss);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
+/*
+ * Return the channel of the ingress queue with the given qid.
+ */
+static unsigned int rxq_to_chan(const struct sge *p, unsigned int qid)
+{
+	qid -= p->ingr_start;
+	return netdev2pinfo(p->ingr_map[qid]->netdev)->tx_chan;
+}
+
+/*
+ * Wait until all NAPI handlers are descheduled.
+ */
+static void quiesce_rx(struct adapter *adap)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(adap->sge.ingr_map); i++) {
+		struct sge_rspq *q = adap->sge.ingr_map[i];
+
+		if (q && q->handler)
+			napi_disable(&q->napi);
+	}
+}
+
+/*
+ * Enable NAPI scheduling and interrupt generation for all Rx queues.
+ */
+static void enable_rx(struct adapter *adap)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(adap->sge.ingr_map); i++) {
+		struct sge_rspq *q = adap->sge.ingr_map[i];
+
+		if (!q)
+			continue;
+		if (q->handler)
+			napi_enable(&q->napi);
+		/* 0-increment GTS to start the timer and enable interrupts */
+		t4_write_reg(adap, MYPF_REG(SGE_PF_GTS),
+			     SEINTARM(q->intr_params) |
+			     INGRESSQID(q->cntxt_id));
+	}
+}
+
+/**
+ *	setup_sge_queues - configure SGE Tx/Rx/response queues
+ *	@adap: the adapter
+ *
+ *	Determines how many sets of SGE queues to use and initializes them.
+ *	We support multiple queue sets per port if we have MSI-X, otherwise
+ *	just one queue set per port.
+ */
+static int setup_sge_queues(struct adapter *adap)
+{
+	int err, msi_idx, i, j;
+	struct sge *s = &adap->sge;
+
+	bitmap_zero(s->starving_fl, MAX_EGRQ);
+	bitmap_zero(s->txq_maperr, MAX_EGRQ);
+
+	if (adap->flags & USING_MSIX)
+		msi_idx = 1;         /* vector 0 is for non-queue interrupts */
+	else {
+		err = t4_sge_alloc_rxq(adap, &s->intrq, false, adap->port[0], 0,
+				       NULL, NULL);
+		if (err)
+			return err;
+		msi_idx = -((int)s->intrq.abs_id + 1);
+	}
+
+	err = t4_sge_alloc_rxq(adap, &s->fw_evtq, true, adap->port[0],
+			       msi_idx, NULL, fwevtq_handler);
+	if (err) {
+freeout:	t4_free_sge_resources(adap);
+		return err;
+	}
+
+	for_each_port(adap, i) {
+		struct net_device *dev = adap->port[i];
+		struct port_info *pi = netdev_priv(dev);
+		struct sge_eth_rxq *q = &s->ethrxq[pi->first_qset];
+		struct sge_eth_txq *t = &s->ethtxq[pi->first_qset];
+
+		for (j = 0; j < pi->nqsets; j++, q++) {
+			if (msi_idx > 0)
+				msi_idx++;
+			err = t4_sge_alloc_rxq(adap, &q->rspq, false, dev,
+					       msi_idx, &q->fl,
+					       t4_ethrx_handler);
+			if (err)
+				goto freeout;
+			q->rspq.idx = j;
+			memset(&q->stats, 0, sizeof(q->stats));
+		}
+		for (j = 0; j < pi->nqsets; j++, t++) {
+			err = t4_sge_alloc_eth_txq(adap, t, dev,
+					netdev_get_tx_queue(dev, j),
+					s->fw_evtq.cntxt_id);
+			if (err)
+				goto freeout;
+		}
+	}
+
+	j = s->ofldqsets / adap->params.nports; /* ofld queues per channel */
+	for_each_ofldrxq(s, i) {
+		struct sge_ofld_rxq *q = &s->ofldrxq[i];
+		struct net_device *dev = adap->port[i / j];
+
+		if (msi_idx > 0)
+			msi_idx++;
+		err = t4_sge_alloc_rxq(adap, &q->rspq, false, dev, msi_idx,
+				       q->fl.size ? &q->fl : NULL,
+				       uldrx_handler);
+		if (err)
+			goto freeout;
+		memset(&q->stats, 0, sizeof(q->stats));
+		s->ofld_rxq[i] = q->rspq.abs_id;
+		err = t4_sge_alloc_ofld_txq(adap, &s->ofldtxq[i], dev,
+					    s->fw_evtq.cntxt_id);
+		if (err)
+			goto freeout;
+	}
+
+	for_each_rdmarxq(s, i) {
+		struct sge_ofld_rxq *q = &s->rdmarxq[i];
+
+		if (msi_idx > 0)
+			msi_idx++;
+		err = t4_sge_alloc_rxq(adap, &q->rspq, false, adap->port[i],
+				       msi_idx, q->fl.size ? &q->fl : NULL,
+				       uldrx_handler);
+		if (err)
+			goto freeout;
+		memset(&q->stats, 0, sizeof(q->stats));
+		s->rdma_rxq[i] = q->rspq.abs_id;
+	}
+
+	for_each_rdmaciq(s, i) {
+		struct sge_ofld_rxq *q = &s->rdmaciq[i];
+
+		if (msi_idx > 0)
+			msi_idx++;
+		err = t4_sge_alloc_rxq(adap, &q->rspq, false, adap->port[i],
+				       msi_idx, q->fl.size ? &q->fl : NULL,
+				       uldrx_handler);
+		if (err)
+			goto freeout;
+		memset(&q->stats, 0, sizeof(q->stats));
+		s->rdma_ciq[i] = q->rspq.abs_id;
+	}
+
+	for_each_port(adap, i) {
+		/*
+		 * Note that ->rdmarxq[i].rspq.cntxt_id below is 0 if we don't
+		 * have RDMA queues, and that's the right value.
+		 */
+		err = t4_sge_alloc_ctrl_txq(adap, &s->ctrlq[i], adap->port[i],
+					    s->fw_evtq.cntxt_id,
+					    s->rdmarxq[i].rspq.cntxt_id);
+		if (err)
+			goto freeout;
+	}
+
+	t4_write_reg(adap, is_t4(adap->params.chip) ?
+				MPS_TRC_RSS_CONTROL :
+				MPS_T5_TRC_RSS_CONTROL,
+		     RSSCONTROL(netdev2pinfo(adap->port[0])->tx_chan) |
+		     QUEUENUMBER(s->ethrxq[0].rspq.abs_id));
+	return 0;
+}
+
+/*
+ * Allocate a chunk of memory using kmalloc or, if that fails, vmalloc.
+ * The allocated memory is cleared.
+ */
+void *t4_alloc_mem(size_t size)
+{
+	void *p = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
+
+	if (!p)
+		p = vzalloc(size);
+	return p;
+}
+
+/*
+ * Free memory allocated through alloc_mem().
+ */
+static void t4_free_mem(void *addr)
+{
+	if (is_vmalloc_addr(addr))
+		vfree(addr);
+	else
+		kfree(addr);
+}
+
+/* Send a Work Request to write the filter at a specified index.  We construct
+ * a Firmware Filter Work Request to have the work done and put the indicated
+ * filter into "pending" mode which will prevent any further actions against
+ * it till we get a reply from the firmware on the completion status of the
+ * request.
+ */
+static int set_filter_wr(struct adapter *adapter, int fidx)
+{
+	struct filter_entry *f = &adapter->tids.ftid_tab[fidx];
+	struct sk_buff *skb;
+	struct fw_filter_wr *fwr;
+	unsigned int ftid;
+
+	/* If the new filter requires loopback Destination MAC and/or VLAN
+	 * rewriting then we need to allocate a Layer 2 Table (L2T) entry for
+	 * the filter.
+	 */
+	if (f->fs.newdmac || f->fs.newvlan) {
+		/* allocate L2T entry for new filter */
+		f->l2t = t4_l2t_alloc_switching(adapter->l2t);
+		if (f->l2t == NULL)
+			return -EAGAIN;
+		if (t4_l2t_set_switching(adapter, f->l2t, f->fs.vlan,
+					f->fs.eport, f->fs.dmac)) {
+			cxgb4_l2t_release(f->l2t);
+			f->l2t = NULL;
+			return -ENOMEM;
+		}
+	}
+
+	ftid = adapter->tids.ftid_base + fidx;
+
+	skb = alloc_skb(sizeof(*fwr), GFP_KERNEL | __GFP_NOFAIL);
+	fwr = (struct fw_filter_wr *)__skb_put(skb, sizeof(*fwr));
+	memset(fwr, 0, sizeof(*fwr));
+
+	/* It would be nice to put most of the following in t4_hw.c but most
+	 * of the work is translating the cxgbtool ch_filter_specification
+	 * into the Work Request and the definition of that structure is
+	 * currently in cxgbtool.h which isn't appropriate to pull into the
+	 * common code.  We may eventually try to come up with a more neutral
+	 * filter specification structure but for now it's easiest to simply
+	 * put this fairly direct code in line ...
+	 */
+	fwr->op_pkd = htonl(FW_WR_OP(FW_FILTER_WR));
+	fwr->len16_pkd = htonl(FW_WR_LEN16(sizeof(*fwr)/16));
+	fwr->tid_to_iq =
+		htonl(V_FW_FILTER_WR_TID(ftid) |
+		      V_FW_FILTER_WR_RQTYPE(f->fs.type) |
+		      V_FW_FILTER_WR_NOREPLY(0) |
+		      V_FW_FILTER_WR_IQ(f->fs.iq));
+	fwr->del_filter_to_l2tix =
+		htonl(V_FW_FILTER_WR_RPTTID(f->fs.rpttid) |
+		      V_FW_FILTER_WR_DROP(f->fs.action == FILTER_DROP) |
+		      V_FW_FILTER_WR_DIRSTEER(f->fs.dirsteer) |
+		      V_FW_FILTER_WR_MASKHASH(f->fs.maskhash) |
+		      V_FW_FILTER_WR_DIRSTEERHASH(f->fs.dirsteerhash) |
+		      V_FW_FILTER_WR_LPBK(f->fs.action == FILTER_SWITCH) |
+		      V_FW_FILTER_WR_DMAC(f->fs.newdmac) |
+		      V_FW_FILTER_WR_SMAC(f->fs.newsmac) |
+		      V_FW_FILTER_WR_INSVLAN(f->fs.newvlan == VLAN_INSERT ||
+					     f->fs.newvlan == VLAN_REWRITE) |
+		      V_FW_FILTER_WR_RMVLAN(f->fs.newvlan == VLAN_REMOVE ||
+					    f->fs.newvlan == VLAN_REWRITE) |
+		      V_FW_FILTER_WR_HITCNTS(f->fs.hitcnts) |
+		      V_FW_FILTER_WR_TXCHAN(f->fs.eport) |
+		      V_FW_FILTER_WR_PRIO(f->fs.prio) |
+		      V_FW_FILTER_WR_L2TIX(f->l2t ? f->l2t->idx : 0));
+	fwr->ethtype = htons(f->fs.val.ethtype);
+	fwr->ethtypem = htons(f->fs.mask.ethtype);
+	fwr->frag_to_ovlan_vldm =
+		(V_FW_FILTER_WR_FRAG(f->fs.val.frag) |
+		 V_FW_FILTER_WR_FRAGM(f->fs.mask.frag) |
+		 V_FW_FILTER_WR_IVLAN_VLD(f->fs.val.ivlan_vld) |
+		 V_FW_FILTER_WR_OVLAN_VLD(f->fs.val.ovlan_vld) |
+		 V_FW_FILTER_WR_IVLAN_VLDM(f->fs.mask.ivlan_vld) |
+		 V_FW_FILTER_WR_OVLAN_VLDM(f->fs.mask.ovlan_vld));
+	fwr->smac_sel = 0;
+	fwr->rx_chan_rx_rpl_iq =
+		htons(V_FW_FILTER_WR_RX_CHAN(0) |
+		      V_FW_FILTER_WR_RX_RPL_IQ(adapter->sge.fw_evtq.abs_id));
+	fwr->maci_to_matchtypem =
+		htonl(V_FW_FILTER_WR_MACI(f->fs.val.macidx) |
+		      V_FW_FILTER_WR_MACIM(f->fs.mask.macidx) |
+		      V_FW_FILTER_WR_FCOE(f->fs.val.fcoe) |
+		      V_FW_FILTER_WR_FCOEM(f->fs.mask.fcoe) |
+		      V_FW_FILTER_WR_PORT(f->fs.val.iport) |
+		      V_FW_FILTER_WR_PORTM(f->fs.mask.iport) |
+		      V_FW_FILTER_WR_MATCHTYPE(f->fs.val.matchtype) |
+		      V_FW_FILTER_WR_MATCHTYPEM(f->fs.mask.matchtype));
+	fwr->ptcl = f->fs.val.proto;
+	fwr->ptclm = f->fs.mask.proto;
+	fwr->ttyp = f->fs.val.tos;
+	fwr->ttypm = f->fs.mask.tos;
+	fwr->ivlan = htons(f->fs.val.ivlan);
+	fwr->ivlanm = htons(f->fs.mask.ivlan);
+	fwr->ovlan = htons(f->fs.val.ovlan);
+	fwr->ovlanm = htons(f->fs.mask.ovlan);
+	memcpy(fwr->lip, f->fs.val.lip, sizeof(fwr->lip));
+	memcpy(fwr->lipm, f->fs.mask.lip, sizeof(fwr->lipm));
+	memcpy(fwr->fip, f->fs.val.fip, sizeof(fwr->fip));
+	memcpy(fwr->fipm, f->fs.mask.fip, sizeof(fwr->fipm));
+	fwr->lp = htons(f->fs.val.lport);
+	fwr->lpm = htons(f->fs.mask.lport);
+	fwr->fp = htons(f->fs.val.fport);
+	fwr->fpm = htons(f->fs.mask.fport);
+	if (f->fs.newsmac)
+		memcpy(fwr->sma, f->fs.smac, sizeof(fwr->sma));
+
+	/* Mark the filter as "pending" and ship off the Filter Work Request.
+	 * When we get the Work Request Reply we'll clear the pending status.
+	 */
+	f->pending = 1;
+	set_wr_txq(skb, CPL_PRIORITY_CONTROL, f->fs.val.iport & 0x3);
+	t4_ofld_send(adapter, skb);
+	return 0;
+}
+
+/* Delete the filter at a specified index.
+ */
+static int del_filter_wr(struct adapter *adapter, int fidx)
+{
+	struct filter_entry *f = &adapter->tids.ftid_tab[fidx];
+	struct sk_buff *skb;
+	struct fw_filter_wr *fwr;
+	unsigned int len, ftid;
+
+	len = sizeof(*fwr);
+	ftid = adapter->tids.ftid_base + fidx;
+
+	skb = alloc_skb(len, GFP_KERNEL | __GFP_NOFAIL);
+	fwr = (struct fw_filter_wr *)__skb_put(skb, len);
+	t4_mk_filtdelwr(ftid, fwr, adapter->sge.fw_evtq.abs_id);
+
+	/* Mark the filter as "pending" and ship off the Filter Work Request.
+	 * When we get the Work Request Reply we'll clear the pending status.
+	 */
+	f->pending = 1;
+	t4_mgmt_tx(adapter, skb);
+	return 0;
+}
+
+static u16 cxgb_select_queue(struct net_device *dev, struct sk_buff *skb,
+			     void *accel_priv, select_queue_fallback_t fallback)
+{
+	int txq;
+
+#ifdef CONFIG_CHELSIO_T4_DCB
+	/* If a Data Center Bridging has been successfully negotiated on this
+	 * link then we'll use the skb's priority to map it to a TX Queue.
+	 * The skb's priority is determined via the VLAN Tag Priority Code
+	 * Point field.
+	 */
+	if (cxgb4_dcb_enabled(dev)) {
+		u16 vlan_tci;
+		int err;
+
+		err = vlan_get_tag(skb, &vlan_tci);
+		if (unlikely(err)) {
+			if (net_ratelimit())
+				netdev_warn(dev,
+					    "TX Packet without VLAN Tag on DCB Link\n");
+			txq = 0;
+		} else {
+			txq = (vlan_tci & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT;
+		}
+		return txq;
+	}
+#endif /* CONFIG_CHELSIO_T4_DCB */
+
+	if (select_queue) {
+		txq = (skb_rx_queue_recorded(skb)
+			? skb_get_rx_queue(skb)
+			: smp_processor_id());
+
+		while (unlikely(txq >= dev->real_num_tx_queues))
+			txq -= dev->real_num_tx_queues;
+
+		return txq;
+	}
+
+	return fallback(dev, skb) % dev->real_num_tx_queues;
+}
+
+static inline int is_offload(const struct adapter *adap)
+{
+	return adap->params.offload;
+}
+
+/*
+ * Implementation of ethtool operations.
+ */
+
+static u32 get_msglevel(struct net_device *dev)
+{
+	return netdev2adap(dev)->msg_enable;
+}
+
+static void set_msglevel(struct net_device *dev, u32 val)
+{
+	netdev2adap(dev)->msg_enable = val;
+}
+
+static char stats_strings[][ETH_GSTRING_LEN] = {
+	"TxOctetsOK         ",
+	"TxFramesOK         ",
+	"TxBroadcastFrames  ",
+	"TxMulticastFrames  ",
+	"TxUnicastFrames    ",
+	"TxErrorFrames      ",
+
+	"TxFrames64         ",
+	"TxFrames65To127    ",
+	"TxFrames128To255   ",
+	"TxFrames256To511   ",
+	"TxFrames512To1023  ",
+	"TxFrames1024To1518 ",
+	"TxFrames1519ToMax  ",
+
+	"TxFramesDropped    ",
+	"TxPauseFrames      ",
+	"TxPPP0Frames       ",
+	"TxPPP1Frames       ",
+	"TxPPP2Frames       ",
+	"TxPPP3Frames       ",
+	"TxPPP4Frames       ",
+	"TxPPP5Frames       ",
+	"TxPPP6Frames       ",
+	"TxPPP7Frames       ",
+
+	"RxOctetsOK         ",
+	"RxFramesOK         ",
+	"RxBroadcastFrames  ",
+	"RxMulticastFrames  ",
+	"RxUnicastFrames    ",
+
+	"RxFramesTooLong    ",
+	"RxJabberErrors     ",
+	"RxFCSErrors        ",
+	"RxLengthErrors     ",
+	"RxSymbolErrors     ",
+	"RxRuntFrames       ",
+
+	"RxFrames64         ",
+	"RxFrames65To127    ",
+	"RxFrames128To255   ",
+	"RxFrames256To511   ",
+	"RxFrames512To1023  ",
+	"RxFrames1024To1518 ",
+	"RxFrames1519ToMax  ",
+
+	"RxPauseFrames      ",
+	"RxPPP0Frames       ",
+	"RxPPP1Frames       ",
+	"RxPPP2Frames       ",
+	"RxPPP3Frames       ",
+	"RxPPP4Frames       ",
+	"RxPPP5Frames       ",
+	"RxPPP6Frames       ",
+	"RxPPP7Frames       ",
+
+	"RxBG0FramesDropped ",
+	"RxBG1FramesDropped ",
+	"RxBG2FramesDropped ",
+	"RxBG3FramesDropped ",
+	"RxBG0FramesTrunc   ",
+	"RxBG1FramesTrunc   ",
+	"RxBG2FramesTrunc   ",
+	"RxBG3FramesTrunc   ",
+
+	"TSO                ",
+	"TxCsumOffload      ",
+	"RxCsumGood         ",
+	"VLANextractions    ",
+	"VLANinsertions     ",
+	"GROpackets         ",
+	"GROmerged          ",
+	"WriteCoalSuccess   ",
+	"WriteCoalFail      ",
+};
+
+static int get_sset_count(struct net_device *dev, int sset)
+{
+	switch (sset) {
+	case ETH_SS_STATS:
+		return ARRAY_SIZE(stats_strings);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+#define T4_REGMAP_SIZE (160 * 1024)
+#define T5_REGMAP_SIZE (332 * 1024)
+
+static int get_regs_len(struct net_device *dev)
+{
+	struct adapter *adap = netdev2adap(dev);
+	if (is_t4(adap->params.chip))
+		return T4_REGMAP_SIZE;
+	else
+		return T5_REGMAP_SIZE;
+}
+
+static int get_eeprom_len(struct net_device *dev)
+{
+	return EEPROMSIZE;
+}
+
+static void get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info)
+{
+	struct adapter *adapter = netdev2adap(dev);
+
+	strlcpy(info->driver, KBUILD_MODNAME, sizeof(info->driver));
+	strlcpy(info->version, DRV_VERSION, sizeof(info->version));
+	strlcpy(info->bus_info, pci_name(adapter->pdev),
+		sizeof(info->bus_info));
+
+	if (adapter->params.fw_vers)
+		snprintf(info->fw_version, sizeof(info->fw_version),
+			"%u.%u.%u.%u, TP %u.%u.%u.%u",
+			FW_HDR_FW_VER_MAJOR_GET(adapter->params.fw_vers),
+			FW_HDR_FW_VER_MINOR_GET(adapter->params.fw_vers),
+			FW_HDR_FW_VER_MICRO_GET(adapter->params.fw_vers),
+			FW_HDR_FW_VER_BUILD_GET(adapter->params.fw_vers),
+			FW_HDR_FW_VER_MAJOR_GET(adapter->params.tp_vers),
+			FW_HDR_FW_VER_MINOR_GET(adapter->params.tp_vers),
+			FW_HDR_FW_VER_MICRO_GET(adapter->params.tp_vers),
+			FW_HDR_FW_VER_BUILD_GET(adapter->params.tp_vers));
+}
+
+static void get_strings(struct net_device *dev, u32 stringset, u8 *data)
+{
+	if (stringset == ETH_SS_STATS)
+		memcpy(data, stats_strings, sizeof(stats_strings));
+}
+
+/*
+ * port stats maintained per queue of the port.  They should be in the same
+ * order as in stats_strings above.
+ */
+struct queue_port_stats {
+	u64 tso;
+	u64 tx_csum;
+	u64 rx_csum;
+	u64 vlan_ex;
+	u64 vlan_ins;
+	u64 gro_pkts;
+	u64 gro_merged;
+};
+
+static void collect_sge_port_stats(const struct adapter *adap,
+		const struct port_info *p, struct queue_port_stats *s)
+{
+	int i;
+	const struct sge_eth_txq *tx = &adap->sge.ethtxq[p->first_qset];
+	const struct sge_eth_rxq *rx = &adap->sge.ethrxq[p->first_qset];
+
+	memset(s, 0, sizeof(*s));
+	for (i = 0; i < p->nqsets; i++, rx++, tx++) {
+		s->tso += tx->tso;
+		s->tx_csum += tx->tx_cso;
+		s->rx_csum += rx->stats.rx_cso;
+		s->vlan_ex += rx->stats.vlan_ex;
+		s->vlan_ins += tx->vlan_ins;
+		s->gro_pkts += rx->stats.lro_pkts;
+		s->gro_merged += rx->stats.lro_merged;
+	}
+}
+
+static void get_stats(struct net_device *dev, struct ethtool_stats *stats,
+		      u64 *data)
+{
+	struct port_info *pi = netdev_priv(dev);
+	struct adapter *adapter = pi->adapter;
+	u32 val1, val2;
+
+	t4_get_port_stats(adapter, pi->tx_chan, (struct port_stats *)data);
+
+	data += sizeof(struct port_stats) / sizeof(u64);
+	collect_sge_port_stats(adapter, pi, (struct queue_port_stats *)data);
+	data += sizeof(struct queue_port_stats) / sizeof(u64);
+	if (!is_t4(adapter->params.chip)) {
+		t4_write_reg(adapter, SGE_STAT_CFG, STATSOURCE_T5(7));
+		val1 = t4_read_reg(adapter, SGE_STAT_TOTAL);
+		val2 = t4_read_reg(adapter, SGE_STAT_MATCH);
+		*data = val1 - val2;
+		data++;
+		*data = val2;
+		data++;
+	} else {
+		memset(data, 0, 2 * sizeof(u64));
+		*data += 2;
+	}
+}
+
+/*
+ * Return a version number to identify the type of adapter.  The scheme is:
+ * - bits 0..9: chip version
+ * - bits 10..15: chip revision
+ * - bits 16..23: register dump version
+ */
+static inline unsigned int mk_adap_vers(const struct adapter *ap)
+{
+	return CHELSIO_CHIP_VERSION(ap->params.chip) |
+		(CHELSIO_CHIP_RELEASE(ap->params.chip) << 10) | (1 << 16);
+}
+
+static void reg_block_dump(struct adapter *ap, void *buf, unsigned int start,
+			   unsigned int end)
+{
+	u32 *p = buf + start;
+
+	for ( ; start <= end; start += sizeof(u32))
+		*p++ = t4_read_reg(ap, start);
+}
+
+static void get_regs(struct net_device *dev, struct ethtool_regs *regs,
+		     void *buf)
+{
+	static const unsigned int t4_reg_ranges[] = {
+		0x1008, 0x1108,
+		0x1180, 0x11b4,
+		0x11fc, 0x123c,
+		0x1300, 0x173c,
+		0x1800, 0x18fc,
+		0x3000, 0x30d8,
+		0x30e0, 0x5924,
+		0x5960, 0x59d4,
+		0x5a00, 0x5af8,
+		0x6000, 0x6098,
+		0x6100, 0x6150,
+		0x6200, 0x6208,
+		0x6240, 0x6248,
+		0x6280, 0x6338,
+		0x6370, 0x638c,
+		0x6400, 0x643c,
+		0x6500, 0x6524,
+		0x6a00, 0x6a38,
+		0x6a60, 0x6a78,
+		0x6b00, 0x6b84,
+		0x6bf0, 0x6c84,
+		0x6cf0, 0x6d84,
+		0x6df0, 0x6e84,
+		0x6ef0, 0x6f84,
+		0x6ff0, 0x7084,
+		0x70f0, 0x7184,
+		0x71f0, 0x7284,
+		0x72f0, 0x7384,
+		0x73f0, 0x7450,
+		0x7500, 0x7530,
+		0x7600, 0x761c,
+		0x7680, 0x76cc,
+		0x7700, 0x7798,
+		0x77c0, 0x77fc,
+		0x7900, 0x79fc,
+		0x7b00, 0x7c38,
+		0x7d00, 0x7efc,
+		0x8dc0, 0x8e1c,
+		0x8e30, 0x8e78,
+		0x8ea0, 0x8f6c,
+		0x8fc0, 0x9074,
+		0x90fc, 0x90fc,
+		0x9400, 0x9458,
+		0x9600, 0x96bc,
+		0x9800, 0x9808,
+		0x9820, 0x983c,
+		0x9850, 0x9864,
+		0x9c00, 0x9c6c,
+		0x9c80, 0x9cec,
+		0x9d00, 0x9d6c,
+		0x9d80, 0x9dec,
+		0x9e00, 0x9e6c,
+		0x9e80, 0x9eec,
+		0x9f00, 0x9f6c,
+		0x9f80, 0x9fec,
+		0xd004, 0xd03c,
+		0xdfc0, 0xdfe0,
+		0xe000, 0xea7c,
+		0xf000, 0x11110,
+		0x11118, 0x11190,
+		0x19040, 0x1906c,
+		0x19078, 0x19080,
+		0x1908c, 0x19124,
+		0x19150, 0x191b0,
+		0x191d0, 0x191e8,
+		0x19238, 0x1924c,
+		0x193f8, 0x19474,
+		0x19490, 0x194f8,
+		0x19800, 0x19f30,
+		0x1a000, 0x1a06c,
+		0x1a0b0, 0x1a120,
+		0x1a128, 0x1a138,
+		0x1a190, 0x1a1c4,
+		0x1a1fc, 0x1a1fc,
+		0x1e040, 0x1e04c,
+		0x1e284, 0x1e28c,
+		0x1e2c0, 0x1e2c0,
+		0x1e2e0, 0x1e2e0,
+		0x1e300, 0x1e384,
+		0x1e3c0, 0x1e3c8,
+		0x1e440, 0x1e44c,
+		0x1e684, 0x1e68c,
+		0x1e6c0, 0x1e6c0,
+		0x1e6e0, 0x1e6e0,
+		0x1e700, 0x1e784,
+		0x1e7c0, 0x1e7c8,
+		0x1e840, 0x1e84c,
+		0x1ea84, 0x1ea8c,
+		0x1eac0, 0x1eac0,
+		0x1eae0, 0x1eae0,
+		0x1eb00, 0x1eb84,
+		0x1ebc0, 0x1ebc8,
+		0x1ec40, 0x1ec4c,
+		0x1ee84, 0x1ee8c,
+		0x1eec0, 0x1eec0,
+		0x1eee0, 0x1eee0,
+		0x1ef00, 0x1ef84,
+		0x1efc0, 0x1efc8,
+		0x1f040, 0x1f04c,
+		0x1f284, 0x1f28c,
+		0x1f2c0, 0x1f2c0,
+		0x1f2e0, 0x1f2e0,
+		0x1f300, 0x1f384,
+		0x1f3c0, 0x1f3c8,
+		0x1f440, 0x1f44c,
+		0x1f684, 0x1f68c,
+		0x1f6c0, 0x1f6c0,
+		0x1f6e0, 0x1f6e0,
+		0x1f700, 0x1f784,
+		0x1f7c0, 0x1f7c8,
+		0x1f840, 0x1f84c,
+		0x1fa84, 0x1fa8c,
+		0x1fac0, 0x1fac0,
+		0x1fae0, 0x1fae0,
+		0x1fb00, 0x1fb84,
+		0x1fbc0, 0x1fbc8,
+		0x1fc40, 0x1fc4c,
+		0x1fe84, 0x1fe8c,
+		0x1fec0, 0x1fec0,
+		0x1fee0, 0x1fee0,
+		0x1ff00, 0x1ff84,
+		0x1ffc0, 0x1ffc8,
+		0x20000, 0x2002c,
+		0x20100, 0x2013c,
+		0x20190, 0x201c8,
+		0x20200, 0x20318,
+		0x20400, 0x20528,
+		0x20540, 0x20614,
+		0x21000, 0x21040,
+		0x2104c, 0x21060,
+		0x210c0, 0x210ec,
+		0x21200, 0x21268,
+		0x21270, 0x21284,
+		0x212fc, 0x21388,
+		0x21400, 0x21404,
+		0x21500, 0x21518,
+		0x2152c, 0x2153c,
+		0x21550, 0x21554,
+		0x21600, 0x21600,
+		0x21608, 0x21628,
+		0x21630, 0x2163c,
+		0x21700, 0x2171c,
+		0x21780, 0x2178c,
+		0x21800, 0x21c38,
+		0x21c80, 0x21d7c,
+		0x21e00, 0x21e04,
+		0x22000, 0x2202c,
+		0x22100, 0x2213c,
+		0x22190, 0x221c8,
+		0x22200, 0x22318,
+		0x22400, 0x22528,
+		0x22540, 0x22614,
+		0x23000, 0x23040,
+		0x2304c, 0x23060,
+		0x230c0, 0x230ec,
+		0x23200, 0x23268,
+		0x23270, 0x23284,
+		0x232fc, 0x23388,
+		0x23400, 0x23404,
+		0x23500, 0x23518,
+		0x2352c, 0x2353c,
+		0x23550, 0x23554,
+		0x23600, 0x23600,
+		0x23608, 0x23628,
+		0x23630, 0x2363c,
+		0x23700, 0x2371c,
+		0x23780, 0x2378c,
+		0x23800, 0x23c38,
+		0x23c80, 0x23d7c,
+		0x23e00, 0x23e04,
+		0x24000, 0x2402c,
+		0x24100, 0x2413c,
+		0x24190, 0x241c8,
+		0x24200, 0x24318,
+		0x24400, 0x24528,
+		0x24540, 0x24614,
+		0x25000, 0x25040,
+		0x2504c, 0x25060,
+		0x250c0, 0x250ec,
+		0x25200, 0x25268,
+		0x25270, 0x25284,
+		0x252fc, 0x25388,
+		0x25400, 0x25404,
+		0x25500, 0x25518,
+		0x2552c, 0x2553c,
+		0x25550, 0x25554,
+		0x25600, 0x25600,
+		0x25608, 0x25628,
+		0x25630, 0x2563c,
+		0x25700, 0x2571c,
+		0x25780, 0x2578c,
+		0x25800, 0x25c38,
+		0x25c80, 0x25d7c,
+		0x25e00, 0x25e04,
+		0x26000, 0x2602c,
+		0x26100, 0x2613c,
+		0x26190, 0x261c8,
+		0x26200, 0x26318,
+		0x26400, 0x26528,
+		0x26540, 0x26614,
+		0x27000, 0x27040,
+		0x2704c, 0x27060,
+		0x270c0, 0x270ec,
+		0x27200, 0x27268,
+		0x27270, 0x27284,
+		0x272fc, 0x27388,
+		0x27400, 0x27404,
+		0x27500, 0x27518,
+		0x2752c, 0x2753c,
+		0x27550, 0x27554,
+		0x27600, 0x27600,
+		0x27608, 0x27628,
+		0x27630, 0x2763c,
+		0x27700, 0x2771c,
+		0x27780, 0x2778c,
+		0x27800, 0x27c38,
+		0x27c80, 0x27d7c,
+		0x27e00, 0x27e04
+	};
+
+	static const unsigned int t5_reg_ranges[] = {
+		0x1008, 0x1148,
+		0x1180, 0x11b4,
+		0x11fc, 0x123c,
+		0x1280, 0x173c,
+		0x1800, 0x18fc,
+		0x3000, 0x3028,
+		0x3060, 0x30d8,
+		0x30e0, 0x30fc,
+		0x3140, 0x357c,
+		0x35a8, 0x35cc,
+		0x35ec, 0x35ec,
+		0x3600, 0x5624,
+		0x56cc, 0x575c,
+		0x580c, 0x5814,
+		0x5890, 0x58bc,
+		0x5940, 0x59dc,
+		0x59fc, 0x5a18,
+		0x5a60, 0x5a9c,
+		0x5b9c, 0x5bfc,
+		0x6000, 0x6040,
+		0x6058, 0x614c,
+		0x7700, 0x7798,
+		0x77c0, 0x78fc,
+		0x7b00, 0x7c54,
+		0x7d00, 0x7efc,
+		0x8dc0, 0x8de0,
+		0x8df8, 0x8e84,
+		0x8ea0, 0x8f84,
+		0x8fc0, 0x90f8,
+		0x9400, 0x9470,
+		0x9600, 0x96f4,
+		0x9800, 0x9808,
+		0x9820, 0x983c,
+		0x9850, 0x9864,
+		0x9c00, 0x9c6c,
+		0x9c80, 0x9cec,
+		0x9d00, 0x9d6c,
+		0x9d80, 0x9dec,
+		0x9e00, 0x9e6c,
+		0x9e80, 0x9eec,
+		0x9f00, 0x9f6c,
+		0x9f80, 0xa020,
+		0xd004, 0xd03c,
+		0xdfc0, 0xdfe0,
+		0xe000, 0x11088,
+		0x1109c, 0x11110,
+		0x11118, 0x1117c,
+		0x11190, 0x11204,
+		0x19040, 0x1906c,
+		0x19078, 0x19080,
+		0x1908c, 0x19124,
+		0x19150, 0x191b0,
+		0x191d0, 0x191e8,
+		0x19238, 0x19290,
+		0x193f8, 0x19474,
+		0x19490, 0x194cc,
+		0x194f0, 0x194f8,
+		0x19c00, 0x19c60,
+		0x19c94, 0x19e10,
+		0x19e50, 0x19f34,
+		0x19f40, 0x19f50,
+		0x19f90, 0x19fe4,
+		0x1a000, 0x1a06c,
+		0x1a0b0, 0x1a120,
+		0x1a128, 0x1a138,
+		0x1a190, 0x1a1c4,
+		0x1a1fc, 0x1a1fc,
+		0x1e008, 0x1e00c,
+		0x1e040, 0x1e04c,
+		0x1e284, 0x1e290,
+		0x1e2c0, 0x1e2c0,
+		0x1e2e0, 0x1e2e0,
+		0x1e300, 0x1e384,
+		0x1e3c0, 0x1e3c8,
+		0x1e408, 0x1e40c,
+		0x1e440, 0x1e44c,
+		0x1e684, 0x1e690,
+		0x1e6c0, 0x1e6c0,
+		0x1e6e0, 0x1e6e0,
+		0x1e700, 0x1e784,
+		0x1e7c0, 0x1e7c8,
+		0x1e808, 0x1e80c,
+		0x1e840, 0x1e84c,
+		0x1ea84, 0x1ea90,
+		0x1eac0, 0x1eac0,
+		0x1eae0, 0x1eae0,
+		0x1eb00, 0x1eb84,
+		0x1ebc0, 0x1ebc8,
+		0x1ec08, 0x1ec0c,
+		0x1ec40, 0x1ec4c,
+		0x1ee84, 0x1ee90,
+		0x1eec0, 0x1eec0,
+		0x1eee0, 0x1eee0,
+		0x1ef00, 0x1ef84,
+		0x1efc0, 0x1efc8,
+		0x1f008, 0x1f00c,
+		0x1f040, 0x1f04c,
+		0x1f284, 0x1f290,
+		0x1f2c0, 0x1f2c0,
+		0x1f2e0, 0x1f2e0,
+		0x1f300, 0x1f384,
+		0x1f3c0, 0x1f3c8,
+		0x1f408, 0x1f40c,
+		0x1f440, 0x1f44c,
+		0x1f684, 0x1f690,
+		0x1f6c0, 0x1f6c0,
+		0x1f6e0, 0x1f6e0,
+		0x1f700, 0x1f784,
+		0x1f7c0, 0x1f7c8,
+		0x1f808, 0x1f80c,
+		0x1f840, 0x1f84c,
+		0x1fa84, 0x1fa90,
+		0x1fac0, 0x1fac0,
+		0x1fae0, 0x1fae0,
+		0x1fb00, 0x1fb84,
+		0x1fbc0, 0x1fbc8,
+		0x1fc08, 0x1fc0c,
+		0x1fc40, 0x1fc4c,
+		0x1fe84, 0x1fe90,
+		0x1fec0, 0x1fec0,
+		0x1fee0, 0x1fee0,
+		0x1ff00, 0x1ff84,
+		0x1ffc0, 0x1ffc8,
+		0x30000, 0x30030,
+		0x30100, 0x30144,
+		0x30190, 0x301d0,
+		0x30200, 0x30318,
+		0x30400, 0x3052c,
+		0x30540, 0x3061c,
+		0x30800, 0x30834,
+		0x308c0, 0x30908,
+		0x30910, 0x309ac,
+		0x30a00, 0x30a04,
+		0x30a0c, 0x30a2c,
+		0x30a44, 0x30a50,
+		0x30a74, 0x30c24,
+		0x30d08, 0x30d14,
+		0x30d1c, 0x30d20,
+		0x30d3c, 0x30d50,
+		0x31200, 0x3120c,
+		0x31220, 0x31220,
+		0x31240, 0x31240,
+		0x31600, 0x31600,
+		0x31608, 0x3160c,
+		0x31a00, 0x31a1c,
+		0x31e04, 0x31e20,
+		0x31e38, 0x31e3c,
+		0x31e80, 0x31e80,
+		0x31e88, 0x31ea8,
+		0x31eb0, 0x31eb4,
+		0x31ec8, 0x31ed4,
+		0x31fb8, 0x32004,
+		0x32208, 0x3223c,
+		0x32600, 0x32630,
+		0x32a00, 0x32abc,
+		0x32b00, 0x32b70,
+		0x33000, 0x33048,
+		0x33060, 0x3309c,
+		0x330f0, 0x33148,
+		0x33160, 0x3319c,
+		0x331f0, 0x332e4,
+		0x332f8, 0x333e4,
+		0x333f8, 0x33448,
+		0x33460, 0x3349c,
+		0x334f0, 0x33548,
+		0x33560, 0x3359c,
+		0x335f0, 0x336e4,
+		0x336f8, 0x337e4,
+		0x337f8, 0x337fc,
+		0x33814, 0x33814,
+		0x3382c, 0x3382c,
+		0x33880, 0x3388c,
+		0x338e8, 0x338ec,
+		0x33900, 0x33948,
+		0x33960, 0x3399c,
+		0x339f0, 0x33ae4,
+		0x33af8, 0x33b10,
+		0x33b28, 0x33b28,
+		0x33b3c, 0x33b50,
+		0x33bf0, 0x33c10,
+		0x33c28, 0x33c28,
+		0x33c3c, 0x33c50,
+		0x33cf0, 0x33cfc,
+		0x34000, 0x34030,
+		0x34100, 0x34144,
+		0x34190, 0x341d0,
+		0x34200, 0x34318,
+		0x34400, 0x3452c,
+		0x34540, 0x3461c,
+		0x34800, 0x34834,
+		0x348c0, 0x34908,
+		0x34910, 0x349ac,
+		0x34a00, 0x34a04,
+		0x34a0c, 0x34a2c,
+		0x34a44, 0x34a50,
+		0x34a74, 0x34c24,
+		0x34d08, 0x34d14,
+		0x34d1c, 0x34d20,
+		0x34d3c, 0x34d50,
+		0x35200, 0x3520c,
+		0x35220, 0x35220,
+		0x35240, 0x35240,
+		0x35600, 0x35600,
+		0x35608, 0x3560c,
+		0x35a00, 0x35a1c,
+		0x35e04, 0x35e20,
+		0x35e38, 0x35e3c,
+		0x35e80, 0x35e80,
+		0x35e88, 0x35ea8,
+		0x35eb0, 0x35eb4,
+		0x35ec8, 0x35ed4,
+		0x35fb8, 0x36004,
+		0x36208, 0x3623c,
+		0x36600, 0x36630,
+		0x36a00, 0x36abc,
+		0x36b00, 0x36b70,
+		0x37000, 0x37048,
+		0x37060, 0x3709c,
+		0x370f0, 0x37148,
+		0x37160, 0x3719c,
+		0x371f0, 0x372e4,
+		0x372f8, 0x373e4,
+		0x373f8, 0x37448,
+		0x37460, 0x3749c,
+		0x374f0, 0x37548,
+		0x37560, 0x3759c,
+		0x375f0, 0x376e4,
+		0x376f8, 0x377e4,
+		0x377f8, 0x377fc,
+		0x37814, 0x37814,
+		0x3782c, 0x3782c,
+		0x37880, 0x3788c,
+		0x378e8, 0x378ec,
+		0x37900, 0x37948,
+		0x37960, 0x3799c,
+		0x379f0, 0x37ae4,
+		0x37af8, 0x37b10,
+		0x37b28, 0x37b28,
+		0x37b3c, 0x37b50,
+		0x37bf0, 0x37c10,
+		0x37c28, 0x37c28,
+		0x37c3c, 0x37c50,
+		0x37cf0, 0x37cfc,
+		0x38000, 0x38030,
+		0x38100, 0x38144,
+		0x38190, 0x381d0,
+		0x38200, 0x38318,
+		0x38400, 0x3852c,
+		0x38540, 0x3861c,
+		0x38800, 0x38834,
+		0x388c0, 0x38908,
+		0x38910, 0x389ac,
+		0x38a00, 0x38a04,
+		0x38a0c, 0x38a2c,
+		0x38a44, 0x38a50,
+		0x38a74, 0x38c24,
+		0x38d08, 0x38d14,
+		0x38d1c, 0x38d20,
+		0x38d3c, 0x38d50,
+		0x39200, 0x3920c,
+		0x39220, 0x39220,
+		0x39240, 0x39240,
+		0x39600, 0x39600,
+		0x39608, 0x3960c,
+		0x39a00, 0x39a1c,
+		0x39e04, 0x39e20,
+		0x39e38, 0x39e3c,
+		0x39e80, 0x39e80,
+		0x39e88, 0x39ea8,
+		0x39eb0, 0x39eb4,
+		0x39ec8, 0x39ed4,
+		0x39fb8, 0x3a004,
+		0x3a208, 0x3a23c,
+		0x3a600, 0x3a630,
+		0x3aa00, 0x3aabc,
+		0x3ab00, 0x3ab70,
+		0x3b000, 0x3b048,
+		0x3b060, 0x3b09c,
+		0x3b0f0, 0x3b148,
+		0x3b160, 0x3b19c,
+		0x3b1f0, 0x3b2e4,
+		0x3b2f8, 0x3b3e4,
+		0x3b3f8, 0x3b448,
+		0x3b460, 0x3b49c,
+		0x3b4f0, 0x3b548,
+		0x3b560, 0x3b59c,
+		0x3b5f0, 0x3b6e4,
+		0x3b6f8, 0x3b7e4,
+		0x3b7f8, 0x3b7fc,
+		0x3b814, 0x3b814,
+		0x3b82c, 0x3b82c,
+		0x3b880, 0x3b88c,
+		0x3b8e8, 0x3b8ec,
+		0x3b900, 0x3b948,
+		0x3b960, 0x3b99c,
+		0x3b9f0, 0x3bae4,
+		0x3baf8, 0x3bb10,
+		0x3bb28, 0x3bb28,
+		0x3bb3c, 0x3bb50,
+		0x3bbf0, 0x3bc10,
+		0x3bc28, 0x3bc28,
+		0x3bc3c, 0x3bc50,
+		0x3bcf0, 0x3bcfc,
+		0x3c000, 0x3c030,
+		0x3c100, 0x3c144,
+		0x3c190, 0x3c1d0,
+		0x3c200, 0x3c318,
+		0x3c400, 0x3c52c,
+		0x3c540, 0x3c61c,
+		0x3c800, 0x3c834,
+		0x3c8c0, 0x3c908,
+		0x3c910, 0x3c9ac,
+		0x3ca00, 0x3ca04,
+		0x3ca0c, 0x3ca2c,
+		0x3ca44, 0x3ca50,
+		0x3ca74, 0x3cc24,
+		0x3cd08, 0x3cd14,
+		0x3cd1c, 0x3cd20,
+		0x3cd3c, 0x3cd50,
+		0x3d200, 0x3d20c,
+		0x3d220, 0x3d220,
+		0x3d240, 0x3d240,
+		0x3d600, 0x3d600,
+		0x3d608, 0x3d60c,
+		0x3da00, 0x3da1c,
+		0x3de04, 0x3de20,
+		0x3de38, 0x3de3c,
+		0x3de80, 0x3de80,
+		0x3de88, 0x3dea8,
+		0x3deb0, 0x3deb4,
+		0x3dec8, 0x3ded4,
+		0x3dfb8, 0x3e004,
+		0x3e208, 0x3e23c,
+		0x3e600, 0x3e630,
+		0x3ea00, 0x3eabc,
+		0x3eb00, 0x3eb70,
+		0x3f000, 0x3f048,
+		0x3f060, 0x3f09c,
+		0x3f0f0, 0x3f148,
+		0x3f160, 0x3f19c,
+		0x3f1f0, 0x3f2e4,
+		0x3f2f8, 0x3f3e4,
+		0x3f3f8, 0x3f448,
+		0x3f460, 0x3f49c,
+		0x3f4f0, 0x3f548,
+		0x3f560, 0x3f59c,
+		0x3f5f0, 0x3f6e4,
+		0x3f6f8, 0x3f7e4,
+		0x3f7f8, 0x3f7fc,
+		0x3f814, 0x3f814,
+		0x3f82c, 0x3f82c,
+		0x3f880, 0x3f88c,
+		0x3f8e8, 0x3f8ec,
+		0x3f900, 0x3f948,
+		0x3f960, 0x3f99c,
+		0x3f9f0, 0x3fae4,
+		0x3faf8, 0x3fb10,
+		0x3fb28, 0x3fb28,
+		0x3fb3c, 0x3fb50,
+		0x3fbf0, 0x3fc10,
+		0x3fc28, 0x3fc28,
+		0x3fc3c, 0x3fc50,
+		0x3fcf0, 0x3fcfc,
+		0x40000, 0x4000c,
+		0x40040, 0x40068,
+		0x40080, 0x40144,
+		0x40180, 0x4018c,
+		0x40200, 0x40298,
+		0x402ac, 0x4033c,
+		0x403f8, 0x403fc,
+		0x41304, 0x413c4,
+		0x41400, 0x4141c,
+		0x41480, 0x414d0,
+		0x44000, 0x44078,
+		0x440c0, 0x44278,
+		0x442c0, 0x44478,
+		0x444c0, 0x44678,
+		0x446c0, 0x44878,
+		0x448c0, 0x449fc,
+		0x45000, 0x45068,
+		0x45080, 0x45084,
+		0x450a0, 0x450b0,
+		0x45200, 0x45268,
+		0x45280, 0x45284,
+		0x452a0, 0x452b0,
+		0x460c0, 0x460e4,
+		0x47000, 0x4708c,
+		0x47200, 0x47250,
+		0x47400, 0x47420,
+		0x47600, 0x47618,
+		0x47800, 0x47814,
+		0x48000, 0x4800c,
+		0x48040, 0x48068,
+		0x48080, 0x48144,
+		0x48180, 0x4818c,
+		0x48200, 0x48298,
+		0x482ac, 0x4833c,
+		0x483f8, 0x483fc,
+		0x49304, 0x493c4,
+		0x49400, 0x4941c,
+		0x49480, 0x494d0,
+		0x4c000, 0x4c078,
+		0x4c0c0, 0x4c278,
+		0x4c2c0, 0x4c478,
+		0x4c4c0, 0x4c678,
+		0x4c6c0, 0x4c878,
+		0x4c8c0, 0x4c9fc,
+		0x4d000, 0x4d068,
+		0x4d080, 0x4d084,
+		0x4d0a0, 0x4d0b0,
+		0x4d200, 0x4d268,
+		0x4d280, 0x4d284,
+		0x4d2a0, 0x4d2b0,
+		0x4e0c0, 0x4e0e4,
+		0x4f000, 0x4f08c,
+		0x4f200, 0x4f250,
+		0x4f400, 0x4f420,
+		0x4f600, 0x4f618,
+		0x4f800, 0x4f814,
+		0x50000, 0x500cc,
+		0x50400, 0x50400,
+		0x50800, 0x508cc,
+		0x50c00, 0x50c00,
+		0x51000, 0x5101c,
+		0x51300, 0x51308,
+	};
+
+	int i;
+	struct adapter *ap = netdev2adap(dev);
+	static const unsigned int *reg_ranges;
+	int arr_size = 0, buf_size = 0;
+
+	if (is_t4(ap->params.chip)) {
+		reg_ranges = &t4_reg_ranges[0];
+		arr_size = ARRAY_SIZE(t4_reg_ranges);
+		buf_size = T4_REGMAP_SIZE;
+	} else {
+		reg_ranges = &t5_reg_ranges[0];
+		arr_size = ARRAY_SIZE(t5_reg_ranges);
+		buf_size = T5_REGMAP_SIZE;
+	}
+
+	regs->version = mk_adap_vers(ap);
+
+	memset(buf, 0, buf_size);
+	for (i = 0; i < arr_size; i += 2)
+		reg_block_dump(ap, buf, reg_ranges[i], reg_ranges[i + 1]);
+}
+
+static int restart_autoneg(struct net_device *dev)
+{
+	struct port_info *p = netdev_priv(dev);
+
+	if (!netif_running(dev))
+		return -EAGAIN;
+	if (p->link_cfg.autoneg != AUTONEG_ENABLE)
+		return -EINVAL;
+	t4_restart_aneg(p->adapter, p->adapter->fn, p->tx_chan);
+	return 0;
+}
+
+static int identify_port(struct net_device *dev,
+			 enum ethtool_phys_id_state state)
+{
+	unsigned int val;
+	struct adapter *adap = netdev2adap(dev);
+
+	if (state == ETHTOOL_ID_ACTIVE)
+		val = 0xffff;
+	else if (state == ETHTOOL_ID_INACTIVE)
+		val = 0;
+	else
+		return -EINVAL;
+
+	return t4_identify_port(adap, adap->fn, netdev2pinfo(dev)->viid, val);
+}
+
+static unsigned int from_fw_linkcaps(unsigned int type, unsigned int caps)
+{
+	unsigned int v = 0;
+
+	if (type == FW_PORT_TYPE_BT_SGMII || type == FW_PORT_TYPE_BT_XFI ||
+	    type == FW_PORT_TYPE_BT_XAUI) {
+		v |= SUPPORTED_TP;
+		if (caps & FW_PORT_CAP_SPEED_100M)
+			v |= SUPPORTED_100baseT_Full;
+		if (caps & FW_PORT_CAP_SPEED_1G)
+			v |= SUPPORTED_1000baseT_Full;
+		if (caps & FW_PORT_CAP_SPEED_10G)
+			v |= SUPPORTED_10000baseT_Full;
+	} else if (type == FW_PORT_TYPE_KX4 || type == FW_PORT_TYPE_KX) {
+		v |= SUPPORTED_Backplane;
+		if (caps & FW_PORT_CAP_SPEED_1G)
+			v |= SUPPORTED_1000baseKX_Full;
+		if (caps & FW_PORT_CAP_SPEED_10G)
+			v |= SUPPORTED_10000baseKX4_Full;
+	} else if (type == FW_PORT_TYPE_KR)
+		v |= SUPPORTED_Backplane | SUPPORTED_10000baseKR_Full;
+	else if (type == FW_PORT_TYPE_BP_AP)
+		v |= SUPPORTED_Backplane | SUPPORTED_10000baseR_FEC |
+		     SUPPORTED_10000baseKR_Full | SUPPORTED_1000baseKX_Full;
+	else if (type == FW_PORT_TYPE_BP4_AP)
+		v |= SUPPORTED_Backplane | SUPPORTED_10000baseR_FEC |
+		     SUPPORTED_10000baseKR_Full | SUPPORTED_1000baseKX_Full |
+		     SUPPORTED_10000baseKX4_Full;
+	else if (type == FW_PORT_TYPE_FIBER_XFI ||
+		 type == FW_PORT_TYPE_FIBER_XAUI || type == FW_PORT_TYPE_SFP) {
+		v |= SUPPORTED_FIBRE;
+		if (caps & FW_PORT_CAP_SPEED_1G)
+			v |= SUPPORTED_1000baseT_Full;
+		if (caps & FW_PORT_CAP_SPEED_10G)
+			v |= SUPPORTED_10000baseT_Full;
+	} else if (type == FW_PORT_TYPE_BP40_BA)
+		v |= SUPPORTED_40000baseSR4_Full;
+
+	if (caps & FW_PORT_CAP_ANEG)
+		v |= SUPPORTED_Autoneg;
+	return v;
+}
+
+static unsigned int to_fw_linkcaps(unsigned int caps)
+{
+	unsigned int v = 0;
+
+	if (caps & ADVERTISED_100baseT_Full)
+		v |= FW_PORT_CAP_SPEED_100M;
+	if (caps & ADVERTISED_1000baseT_Full)
+		v |= FW_PORT_CAP_SPEED_1G;
+	if (caps & ADVERTISED_10000baseT_Full)
+		v |= FW_PORT_CAP_SPEED_10G;
+	if (caps & ADVERTISED_40000baseSR4_Full)
+		v |= FW_PORT_CAP_SPEED_40G;
+	return v;
+}
+
+static int get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
+{
+	const struct port_info *p = netdev_priv(dev);
+
+	if (p->port_type == FW_PORT_TYPE_BT_SGMII ||
+	    p->port_type == FW_PORT_TYPE_BT_XFI ||
+	    p->port_type == FW_PORT_TYPE_BT_XAUI)
+		cmd->port = PORT_TP;
+	else if (p->port_type == FW_PORT_TYPE_FIBER_XFI ||
+		 p->port_type == FW_PORT_TYPE_FIBER_XAUI)
+		cmd->port = PORT_FIBRE;
+	else if (p->port_type == FW_PORT_TYPE_SFP ||
+		 p->port_type == FW_PORT_TYPE_QSFP_10G ||
+		 p->port_type == FW_PORT_TYPE_QSFP) {
+		if (p->mod_type == FW_PORT_MOD_TYPE_LR ||
+		    p->mod_type == FW_PORT_MOD_TYPE_SR ||
+		    p->mod_type == FW_PORT_MOD_TYPE_ER ||
+		    p->mod_type == FW_PORT_MOD_TYPE_LRM)
+			cmd->port = PORT_FIBRE;
+		else if (p->mod_type == FW_PORT_MOD_TYPE_TWINAX_PASSIVE ||
+			 p->mod_type == FW_PORT_MOD_TYPE_TWINAX_ACTIVE)
+			cmd->port = PORT_DA;
+		else
+			cmd->port = PORT_OTHER;
+	} else
+		cmd->port = PORT_OTHER;
+
+	if (p->mdio_addr >= 0) {
+		cmd->phy_address = p->mdio_addr;
+		cmd->transceiver = XCVR_EXTERNAL;
+		cmd->mdio_support = p->port_type == FW_PORT_TYPE_BT_SGMII ?
+			MDIO_SUPPORTS_C22 : MDIO_SUPPORTS_C45;
+	} else {
+		cmd->phy_address = 0;  /* not really, but no better option */
+		cmd->transceiver = XCVR_INTERNAL;
+		cmd->mdio_support = 0;
+	}
+
+	cmd->supported = from_fw_linkcaps(p->port_type, p->link_cfg.supported);
+	cmd->advertising = from_fw_linkcaps(p->port_type,
+					    p->link_cfg.advertising);
+	ethtool_cmd_speed_set(cmd,
+			      netif_carrier_ok(dev) ? p->link_cfg.speed : 0);
+	cmd->duplex = DUPLEX_FULL;
+	cmd->autoneg = p->link_cfg.autoneg;
+	cmd->maxtxpkt = 0;
+	cmd->maxrxpkt = 0;
+	return 0;
+}
+
+static unsigned int speed_to_caps(int speed)
+{
+	if (speed == 100)
+		return FW_PORT_CAP_SPEED_100M;
+	if (speed == 1000)
+		return FW_PORT_CAP_SPEED_1G;
+	if (speed == 10000)
+		return FW_PORT_CAP_SPEED_10G;
+	if (speed == 40000)
+		return FW_PORT_CAP_SPEED_40G;
+	return 0;
+}
+
+static int set_settings(struct net_device *dev, struct ethtool_cmd *cmd)
+{
+	unsigned int cap;
+	struct port_info *p = netdev_priv(dev);
+	struct link_config *lc = &p->link_cfg;
+	u32 speed = ethtool_cmd_speed(cmd);
+
+	if (cmd->duplex != DUPLEX_FULL)     /* only full-duplex supported */
+		return -EINVAL;
+
+	if (!(lc->supported & FW_PORT_CAP_ANEG)) {
+		/*
+		 * PHY offers a single speed.  See if that's what's
+		 * being requested.
+		 */
+		if (cmd->autoneg == AUTONEG_DISABLE &&
+		    (lc->supported & speed_to_caps(speed)))
+			return 0;
+		return -EINVAL;
+	}
+
+	if (cmd->autoneg == AUTONEG_DISABLE) {
+		cap = speed_to_caps(speed);
+
+		if (!(lc->supported & cap) ||
+		    (speed == 1000) ||
+		    (speed == 10000) ||
+		    (speed == 40000))
+			return -EINVAL;
+		lc->requested_speed = cap;
+		lc->advertising = 0;
+	} else {
+		cap = to_fw_linkcaps(cmd->advertising);
+		if (!(lc->supported & cap))
+			return -EINVAL;
+		lc->requested_speed = 0;
+		lc->advertising = cap | FW_PORT_CAP_ANEG;
+	}
+	lc->autoneg = cmd->autoneg;
+
+	if (netif_running(dev))
+		return t4_link_start(p->adapter, p->adapter->fn, p->tx_chan,
+				     lc);
+	return 0;
+}
+
+static void get_pauseparam(struct net_device *dev,
+			   struct ethtool_pauseparam *epause)
+{
+	struct port_info *p = netdev_priv(dev);
+
+	epause->autoneg = (p->link_cfg.requested_fc & PAUSE_AUTONEG) != 0;
+	epause->rx_pause = (p->link_cfg.fc & PAUSE_RX) != 0;
+	epause->tx_pause = (p->link_cfg.fc & PAUSE_TX) != 0;
+}
+
+static int set_pauseparam(struct net_device *dev,
+			  struct ethtool_pauseparam *epause)
+{
+	struct port_info *p = netdev_priv(dev);
+	struct link_config *lc = &p->link_cfg;
+
+	if (epause->autoneg == AUTONEG_DISABLE)
+		lc->requested_fc = 0;
+	else if (lc->supported & FW_PORT_CAP_ANEG)
+		lc->requested_fc = PAUSE_AUTONEG;
+	else
+		return -EINVAL;
+
+	if (epause->rx_pause)
+		lc->requested_fc |= PAUSE_RX;
+	if (epause->tx_pause)
+		lc->requested_fc |= PAUSE_TX;
+	if (netif_running(dev))
+		return t4_link_start(p->adapter, p->adapter->fn, p->tx_chan,
+				     lc);
+	return 0;
+}
+
+static void get_sge_param(struct net_device *dev, struct ethtool_ringparam *e)
+{
+	const struct port_info *pi = netdev_priv(dev);
+	const struct sge *s = &pi->adapter->sge;
+
+	e->rx_max_pending = MAX_RX_BUFFERS;
+	e->rx_mini_max_pending = MAX_RSPQ_ENTRIES;
+	e->rx_jumbo_max_pending = 0;
+	e->tx_max_pending = MAX_TXQ_ENTRIES;
+
+	e->rx_pending = s->ethrxq[pi->first_qset].fl.size - 8;
+	e->rx_mini_pending = s->ethrxq[pi->first_qset].rspq.size;
+	e->rx_jumbo_pending = 0;
+	e->tx_pending = s->ethtxq[pi->first_qset].q.size;
+}
+
+static int set_sge_param(struct net_device *dev, struct ethtool_ringparam *e)
+{
+	int i;
+	const struct port_info *pi = netdev_priv(dev);
+	struct adapter *adapter = pi->adapter;
+	struct sge *s = &adapter->sge;
+
+	if (e->rx_pending > MAX_RX_BUFFERS || e->rx_jumbo_pending ||
+	    e->tx_pending > MAX_TXQ_ENTRIES ||
+	    e->rx_mini_pending > MAX_RSPQ_ENTRIES ||
+	    e->rx_mini_pending < MIN_RSPQ_ENTRIES ||
+	    e->rx_pending < MIN_FL_ENTRIES || e->tx_pending < MIN_TXQ_ENTRIES)
+		return -EINVAL;
+
+	if (adapter->flags & FULL_INIT_DONE)
+		return -EBUSY;
+
+	for (i = 0; i < pi->nqsets; ++i) {
+		s->ethtxq[pi->first_qset + i].q.size = e->tx_pending;
+		s->ethrxq[pi->first_qset + i].fl.size = e->rx_pending + 8;
+		s->ethrxq[pi->first_qset + i].rspq.size = e->rx_mini_pending;
+	}
+	return 0;
+}
+
+static int closest_timer(const struct sge *s, int time)
+{
+	int i, delta, match = 0, min_delta = INT_MAX;
+
+	for (i = 0; i < ARRAY_SIZE(s->timer_val); i++) {
+		delta = time - s->timer_val[i];
+		if (delta < 0)
+			delta = -delta;
+		if (delta < min_delta) {
+			min_delta = delta;
+			match = i;
+		}
+	}
+	return match;
+}
+
+static int closest_thres(const struct sge *s, int thres)
+{
+	int i, delta, match = 0, min_delta = INT_MAX;
+
+	for (i = 0; i < ARRAY_SIZE(s->counter_val); i++) {
+		delta = thres - s->counter_val[i];
+		if (delta < 0)
+			delta = -delta;
+		if (delta < min_delta) {
+			min_delta = delta;
+			match = i;
+		}
+	}
+	return match;
+}
+
+/*
+ * Return a queue's interrupt hold-off time in us.  0 means no timer.
+ */
+static unsigned int qtimer_val(const struct adapter *adap,
+			       const struct sge_rspq *q)
+{
+	unsigned int idx = q->intr_params >> 1;
+
+	return idx < SGE_NTIMERS ? adap->sge.timer_val[idx] : 0;
+}
+
+/**
+ *	set_rspq_intr_params - set a queue's interrupt holdoff parameters
+ *	@q: the Rx queue
+ *	@us: the hold-off time in us, or 0 to disable timer
+ *	@cnt: the hold-off packet count, or 0 to disable counter
+ *
+ *	Sets an Rx queue's interrupt hold-off time and packet count.  At least
+ *	one of the two needs to be enabled for the queue to generate interrupts.
+ */
+static int set_rspq_intr_params(struct sge_rspq *q,
+				unsigned int us, unsigned int cnt)
+{
+	struct adapter *adap = q->adap;
+
+	if ((us | cnt) == 0)
+		cnt = 1;
+
+	if (cnt) {
+		int err;
+		u32 v, new_idx;
+
+		new_idx = closest_thres(&adap->sge, cnt);
+		if (q->desc && q->pktcnt_idx != new_idx) {
+			/* the queue has already been created, update it */
+			v = FW_PARAMS_MNEM(FW_PARAMS_MNEM_DMAQ) |
+			    FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DMAQ_IQ_INTCNTTHRESH) |
+			    FW_PARAMS_PARAM_YZ(q->cntxt_id);
+			err = t4_set_params(adap, adap->fn, adap->fn, 0, 1, &v,
+					    &new_idx);
+			if (err)
+				return err;
+		}
+		q->pktcnt_idx = new_idx;
+	}
+
+	us = us == 0 ? 6 : closest_timer(&adap->sge, us);
+	q->intr_params = QINTR_TIMER_IDX(us) | (cnt > 0 ? QINTR_CNT_EN : 0);
+	return 0;
+}
+
+/**
+ * set_rx_intr_params - set a net devices's RX interrupt holdoff paramete!
+ * @dev: the network device
+ * @us: the hold-off time in us, or 0 to disable timer
+ * @cnt: the hold-off packet count, or 0 to disable counter
+ *
+ * Set the RX interrupt hold-off parameters for a network device.
+ */
+static int set_rx_intr_params(struct net_device *dev,
+			      unsigned int us, unsigned int cnt)
+{
+	int i, err;
+	struct port_info *pi = netdev_priv(dev);
+	struct adapter *adap = pi->adapter;
+	struct sge_eth_rxq *q = &adap->sge.ethrxq[pi->first_qset];
+
+	for (i = 0; i < pi->nqsets; i++, q++) {
+		err = set_rspq_intr_params(&q->rspq, us, cnt);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
+static int set_adaptive_rx_setting(struct net_device *dev, int adaptive_rx)
+{
+	int i;
+	struct port_info *pi = netdev_priv(dev);
+	struct adapter *adap = pi->adapter;
+	struct sge_eth_rxq *q = &adap->sge.ethrxq[pi->first_qset];
+
+	for (i = 0; i < pi->nqsets; i++, q++)
+		q->rspq.adaptive_rx = adaptive_rx;
+
+	return 0;
+}
+
+static int get_adaptive_rx_setting(struct net_device *dev)
+{
+	struct port_info *pi = netdev_priv(dev);
+	struct adapter *adap = pi->adapter;
+	struct sge_eth_rxq *q = &adap->sge.ethrxq[pi->first_qset];
+
+	return q->rspq.adaptive_rx;
+}
+
+static int set_coalesce(struct net_device *dev, struct ethtool_coalesce *c)
+{
+	set_adaptive_rx_setting(dev, c->use_adaptive_rx_coalesce);
+	return set_rx_intr_params(dev, c->rx_coalesce_usecs,
+				  c->rx_max_coalesced_frames);
+}
+
+static int get_coalesce(struct net_device *dev, struct ethtool_coalesce *c)
+{
+	const struct port_info *pi = netdev_priv(dev);
+	const struct adapter *adap = pi->adapter;
+	const struct sge_rspq *rq = &adap->sge.ethrxq[pi->first_qset].rspq;
+
+	c->rx_coalesce_usecs = qtimer_val(adap, rq);
+	c->rx_max_coalesced_frames = (rq->intr_params & QINTR_CNT_EN) ?
+		adap->sge.counter_val[rq->pktcnt_idx] : 0;
+	c->use_adaptive_rx_coalesce = get_adaptive_rx_setting(dev);
+	return 0;
+}
+
+/**
+ *	eeprom_ptov - translate a physical EEPROM address to virtual
+ *	@phys_addr: the physical EEPROM address
+ *	@fn: the PCI function number
+ *	@sz: size of function-specific area
+ *
+ *	Translate a physical EEPROM address to virtual.  The first 1K is
+ *	accessed through virtual addresses starting at 31K, the rest is
+ *	accessed through virtual addresses starting at 0.
+ *
+ *	The mapping is as follows:
+ *	[0..1K) -> [31K..32K)
+ *	[1K..1K+A) -> [31K-A..31K)
+ *	[1K+A..ES) -> [0..ES-A-1K)
+ *
+ *	where A = @fn * @sz, and ES = EEPROM size.
+ */
+static int eeprom_ptov(unsigned int phys_addr, unsigned int fn, unsigned int sz)
+{
+	fn *= sz;
+	if (phys_addr < 1024)
+		return phys_addr + (31 << 10);
+	if (phys_addr < 1024 + fn)
+		return 31744 - fn + phys_addr - 1024;
+	if (phys_addr < EEPROMSIZE)
+		return phys_addr - 1024 - fn;
+	return -EINVAL;
+}
+
+/*
+ * The next two routines implement eeprom read/write from physical addresses.
+ */
+static int eeprom_rd_phys(struct adapter *adap, unsigned int phys_addr, u32 *v)
+{
+	int vaddr = eeprom_ptov(phys_addr, adap->fn, EEPROMPFSIZE);
+
+	if (vaddr >= 0)
+		vaddr = pci_read_vpd(adap->pdev, vaddr, sizeof(u32), v);
+	return vaddr < 0 ? vaddr : 0;
+}
+
+static int eeprom_wr_phys(struct adapter *adap, unsigned int phys_addr, u32 v)
+{
+	int vaddr = eeprom_ptov(phys_addr, adap->fn, EEPROMPFSIZE);
+
+	if (vaddr >= 0)
+		vaddr = pci_write_vpd(adap->pdev, vaddr, sizeof(u32), &v);
+	return vaddr < 0 ? vaddr : 0;
+}
+
+#define EEPROM_MAGIC 0x38E2F10C
+
+static int get_eeprom(struct net_device *dev, struct ethtool_eeprom *e,
+		      u8 *data)
+{
+	int i, err = 0;
+	struct adapter *adapter = netdev2adap(dev);
+
+	u8 *buf = kmalloc(EEPROMSIZE, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	e->magic = EEPROM_MAGIC;
+	for (i = e->offset & ~3; !err && i < e->offset + e->len; i += 4)
+		err = eeprom_rd_phys(adapter, i, (u32 *)&buf[i]);
+
+	if (!err)
+		memcpy(data, buf + e->offset, e->len);
+	kfree(buf);
+	return err;
+}
+
+static int set_eeprom(struct net_device *dev, struct ethtool_eeprom *eeprom,
+		      u8 *data)
+{
+	u8 *buf;
+	int err = 0;
+	u32 aligned_offset, aligned_len, *p;
+	struct adapter *adapter = netdev2adap(dev);
+
+	if (eeprom->magic != EEPROM_MAGIC)
+		return -EINVAL;
+
+	aligned_offset = eeprom->offset & ~3;
+	aligned_len = (eeprom->len + (eeprom->offset & 3) + 3) & ~3;
+
+	if (adapter->fn > 0) {
+		u32 start = 1024 + adapter->fn * EEPROMPFSIZE;
+
+		if (aligned_offset < start ||
+		    aligned_offset + aligned_len > start + EEPROMPFSIZE)
+			return -EPERM;
+	}
+
+	if (aligned_offset != eeprom->offset || aligned_len != eeprom->len) {
+		/*
+		 * RMW possibly needed for first or last words.
+		 */
+		buf = kmalloc(aligned_len, GFP_KERNEL);
+		if (!buf)
+			return -ENOMEM;
+		err = eeprom_rd_phys(adapter, aligned_offset, (u32 *)buf);
+		if (!err && aligned_len > 4)
+			err = eeprom_rd_phys(adapter,
+					     aligned_offset + aligned_len - 4,
+					     (u32 *)&buf[aligned_len - 4]);
+		if (err)
+			goto out;
+		memcpy(buf + (eeprom->offset & 3), data, eeprom->len);
+	} else
+		buf = data;
+
+	err = t4_seeprom_wp(adapter, false);
+	if (err)
+		goto out;
+
+	for (p = (u32 *)buf; !err && aligned_len; aligned_len -= 4, p++) {
+		err = eeprom_wr_phys(adapter, aligned_offset, *p);
+		aligned_offset += 4;
+	}
+
+	if (!err)
+		err = t4_seeprom_wp(adapter, true);
+out:
+	if (buf != data)
+		kfree(buf);
+	return err;
+}
+
+static int set_flash(struct net_device *netdev, struct ethtool_flash *ef)
+{
+	int ret;
+	const struct firmware *fw;
+	struct adapter *adap = netdev2adap(netdev);
+	unsigned int mbox = FW_PCIE_FW_MASTER_MASK + 1;
+
+	ef->data[sizeof(ef->data) - 1] = '\0';
+	ret = request_firmware(&fw, ef->data, adap->pdev_dev);
+	if (ret < 0)
+		return ret;
+
+	/* If the adapter has been fully initialized then we'll go ahead and
+	 * try to get the firmware's cooperation in upgrading to the new
+	 * firmware image otherwise we'll try to do the entire job from the
+	 * host ... and we always "force" the operation in this path.
+	 */
+	if (adap->flags & FULL_INIT_DONE)
+		mbox = adap->mbox;
+
+	ret = t4_fw_upgrade(adap, mbox, fw->data, fw->size, 1);
+	release_firmware(fw);
+	if (!ret)
+		dev_info(adap->pdev_dev, "loaded firmware %s,"
+			 " reload cxgb4 driver\n", ef->data);
+	return ret;
+}
+
+#define WOL_SUPPORTED (WAKE_BCAST | WAKE_MAGIC)
+#define BCAST_CRC 0xa0ccc1a6
+
+static void get_wol(struct net_device *dev, struct ethtool_wolinfo *wol)
+{
+	wol->supported = WAKE_BCAST | WAKE_MAGIC;
+	wol->wolopts = netdev2adap(dev)->wol;
+	memset(&wol->sopass, 0, sizeof(wol->sopass));
+}
+
+static int set_wol(struct net_device *dev, struct ethtool_wolinfo *wol)
+{
+	int err = 0;
+	struct port_info *pi = netdev_priv(dev);
+
+	if (wol->wolopts & ~WOL_SUPPORTED)
+		return -EINVAL;
+	t4_wol_magic_enable(pi->adapter, pi->tx_chan,
+			    (wol->wolopts & WAKE_MAGIC) ? dev->dev_addr : NULL);
+	if (wol->wolopts & WAKE_BCAST) {
+		err = t4_wol_pat_enable(pi->adapter, pi->tx_chan, 0xfe, ~0ULL,
+					~0ULL, 0, false);
+		if (!err)
+			err = t4_wol_pat_enable(pi->adapter, pi->tx_chan, 1,
+						~6ULL, ~0ULL, BCAST_CRC, true);
+	} else
+		t4_wol_pat_enable(pi->adapter, pi->tx_chan, 0, 0, 0, 0, false);
+	return err;
+}
+
+static int cxgb_set_features(struct net_device *dev, netdev_features_t features)
+{
+	const struct port_info *pi = netdev_priv(dev);
+	netdev_features_t changed = dev->features ^ features;
+	int err;
+
+	if (!(changed & NETIF_F_HW_VLAN_CTAG_RX))
+		return 0;
+
+	err = t4_set_rxmode(pi->adapter, pi->adapter->fn, pi->viid, -1,
+			    -1, -1, -1,
+			    !!(features & NETIF_F_HW_VLAN_CTAG_RX), true);
+	if (unlikely(err))
+		dev->features = features ^ NETIF_F_HW_VLAN_CTAG_RX;
+	return err;
+}
+
+static u32 get_rss_table_size(struct net_device *dev)
+{
+	const struct port_info *pi = netdev_priv(dev);
+
+	return pi->rss_size;
+}
+
+static int get_rss_table(struct net_device *dev, u32 *p, u8 *key)
+{
+	const struct port_info *pi = netdev_priv(dev);
+	unsigned int n = pi->rss_size;
+
+	while (n--)
+		p[n] = pi->rss[n];
+	return 0;
+}
+
+static int set_rss_table(struct net_device *dev, const u32 *p, const u8 *key)
+{
+	unsigned int i;
+	struct port_info *pi = netdev_priv(dev);
+
+	for (i = 0; i < pi->rss_size; i++)
+		pi->rss[i] = p[i];
+	if (pi->adapter->flags & FULL_INIT_DONE)
+		return write_rss(pi, pi->rss);
+	return 0;
+}
+
+static int get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *info,
+		     u32 *rules)
+{
+	const struct port_info *pi = netdev_priv(dev);
+
+	switch (info->cmd) {
+	case ETHTOOL_GRXFH: {
+		unsigned int v = pi->rss_mode;
+
+		info->data = 0;
+		switch (info->flow_type) {
+		case TCP_V4_FLOW:
+			if (v & FW_RSS_VI_CONFIG_CMD_IP4FOURTUPEN)
+				info->data = RXH_IP_SRC | RXH_IP_DST |
+					     RXH_L4_B_0_1 | RXH_L4_B_2_3;
+			else if (v & FW_RSS_VI_CONFIG_CMD_IP4TWOTUPEN)
+				info->data = RXH_IP_SRC | RXH_IP_DST;
+			break;
+		case UDP_V4_FLOW:
+			if ((v & FW_RSS_VI_CONFIG_CMD_IP4FOURTUPEN) &&
+			    (v & FW_RSS_VI_CONFIG_CMD_UDPEN))
+				info->data = RXH_IP_SRC | RXH_IP_DST |
+					     RXH_L4_B_0_1 | RXH_L4_B_2_3;
+			else if (v & FW_RSS_VI_CONFIG_CMD_IP4TWOTUPEN)
+				info->data = RXH_IP_SRC | RXH_IP_DST;
+			break;
+		case SCTP_V4_FLOW:
+		case AH_ESP_V4_FLOW:
+		case IPV4_FLOW:
+			if (v & FW_RSS_VI_CONFIG_CMD_IP4TWOTUPEN)
+				info->data = RXH_IP_SRC | RXH_IP_DST;
+			break;
+		case TCP_V6_FLOW:
+			if (v & FW_RSS_VI_CONFIG_CMD_IP6FOURTUPEN)
+				info->data = RXH_IP_SRC | RXH_IP_DST |
+					     RXH_L4_B_0_1 | RXH_L4_B_2_3;
+			else if (v & FW_RSS_VI_CONFIG_CMD_IP6TWOTUPEN)
+				info->data = RXH_IP_SRC | RXH_IP_DST;
+			break;
+		case UDP_V6_FLOW:
+			if ((v & FW_RSS_VI_CONFIG_CMD_IP6FOURTUPEN) &&
+			    (v & FW_RSS_VI_CONFIG_CMD_UDPEN))
+				info->data = RXH_IP_SRC | RXH_IP_DST |
+					     RXH_L4_B_0_1 | RXH_L4_B_2_3;
+			else if (v & FW_RSS_VI_CONFIG_CMD_IP6TWOTUPEN)
+				info->data = RXH_IP_SRC | RXH_IP_DST;
+			break;
+		case SCTP_V6_FLOW:
+		case AH_ESP_V6_FLOW:
+		case IPV6_FLOW:
+			if (v & FW_RSS_VI_CONFIG_CMD_IP6TWOTUPEN)
+				info->data = RXH_IP_SRC | RXH_IP_DST;
+			break;
+		}
+		return 0;
+	}
+	case ETHTOOL_GRXRINGS:
+		info->data = pi->nqsets;
+		return 0;
+	}
+	return -EOPNOTSUPP;
+}
+
+static const struct ethtool_ops cxgb_ethtool_ops = {
+	.get_settings      = get_settings,
+	.set_settings      = set_settings,
+	.get_drvinfo       = get_drvinfo,
+	.get_msglevel      = get_msglevel,
+	.set_msglevel      = set_msglevel,
+	.get_ringparam     = get_sge_param,
+	.set_ringparam     = set_sge_param,
+	.get_coalesce      = get_coalesce,
+	.set_coalesce      = set_coalesce,
+	.get_eeprom_len    = get_eeprom_len,
+	.get_eeprom        = get_eeprom,
+	.set_eeprom        = set_eeprom,
+	.get_pauseparam    = get_pauseparam,
+	.set_pauseparam    = set_pauseparam,
+	.get_link          = ethtool_op_get_link,
+	.get_strings       = get_strings,
+	.set_phys_id       = identify_port,
+	.nway_reset        = restart_autoneg,
+	.get_sset_count    = get_sset_count,
+	.get_ethtool_stats = get_stats,
+	.get_regs_len      = get_regs_len,
+	.get_regs          = get_regs,
+	.get_wol           = get_wol,
+	.set_wol           = set_wol,
+	.get_rxnfc         = get_rxnfc,
+	.get_rxfh_indir_size = get_rss_table_size,
+	.get_rxfh	   = get_rss_table,
+	.set_rxfh	   = set_rss_table,
+	.flash_device      = set_flash,
+};
+
+/*
+ * debugfs support
+ */
+static ssize_t mem_read(struct file *file, char __user *buf, size_t count,
+			loff_t *ppos)
+{
+	loff_t pos = *ppos;
+	loff_t avail = file_inode(file)->i_size;
+	unsigned int mem = (uintptr_t)file->private_data & 3;
+	struct adapter *adap = file->private_data - mem;
+	__be32 *data;
+	int ret;
+
+	if (pos < 0)
+		return -EINVAL;
+	if (pos >= avail)
+		return 0;
+	if (count > avail - pos)
+		count = avail - pos;
+
+	data = t4_alloc_mem(count);
+	if (!data)
+		return -ENOMEM;
+
+	spin_lock(&adap->win0_lock);
+	ret = t4_memory_rw(adap, 0, mem, pos, count, data, T4_MEMORY_READ);
+	spin_unlock(&adap->win0_lock);
+	if (ret) {
+		t4_free_mem(data);
+		return ret;
+	}
+	ret = copy_to_user(buf, data, count);
+
+	t4_free_mem(data);
+	if (ret)
+		return -EFAULT;
+
+	*ppos = pos + count;
+	return count;
+}
+
+static const struct file_operations mem_debugfs_fops = {
+	.owner   = THIS_MODULE,
+	.open    = simple_open,
+	.read    = mem_read,
+	.llseek  = default_llseek,
+};
+
+static void add_debugfs_mem(struct adapter *adap, const char *name,
+			    unsigned int idx, unsigned int size_mb)
+{
+	struct dentry *de;
+
+	de = debugfs_create_file(name, S_IRUSR, adap->debugfs_root,
+				 (void *)adap + idx, &mem_debugfs_fops);
+	if (de && de->d_inode)
+		de->d_inode->i_size = size_mb << 20;
+}
+
+static int setup_debugfs(struct adapter *adap)
+{
+	int i;
+	u32 size;
+
+	if (IS_ERR_OR_NULL(adap->debugfs_root))
+		return -1;
+
+	i = t4_read_reg(adap, MA_TARGET_MEM_ENABLE);
+	if (i & EDRAM0_ENABLE) {
+		size = t4_read_reg(adap, MA_EDRAM0_BAR);
+		add_debugfs_mem(adap, "edc0", MEM_EDC0,	EDRAM_SIZE_GET(size));
+	}
+	if (i & EDRAM1_ENABLE) {
+		size = t4_read_reg(adap, MA_EDRAM1_BAR);
+		add_debugfs_mem(adap, "edc1", MEM_EDC1, EDRAM_SIZE_GET(size));
+	}
+	if (is_t4(adap->params.chip)) {
+		size = t4_read_reg(adap, MA_EXT_MEMORY_BAR);
+		if (i & EXT_MEM_ENABLE)
+			add_debugfs_mem(adap, "mc", MEM_MC,
+					EXT_MEM_SIZE_GET(size));
+	} else {
+		if (i & EXT_MEM_ENABLE) {
+			size = t4_read_reg(adap, MA_EXT_MEMORY_BAR);
+			add_debugfs_mem(adap, "mc0", MEM_MC0,
+					EXT_MEM_SIZE_GET(size));
+		}
+		if (i & EXT_MEM1_ENABLE) {
+			size = t4_read_reg(adap, MA_EXT_MEMORY1_BAR);
+			add_debugfs_mem(adap, "mc1", MEM_MC1,
+					EXT_MEM_SIZE_GET(size));
+		}
+	}
+	if (adap->l2t)
+		debugfs_create_file("l2t", S_IRUSR, adap->debugfs_root, adap,
+				    &t4_l2t_fops);
+	return 0;
+}
+
+/*
+ * upper-layer driver support
+ */
+
+/*
+ * Allocate an active-open TID and set it to the supplied value.
+ */
+int cxgb4_alloc_atid(struct tid_info *t, void *data)
+{
+	int atid = -1;
+
+	spin_lock_bh(&t->atid_lock);
+	if (t->afree) {
+		union aopen_entry *p = t->afree;
+
+		atid = (p - t->atid_tab) + t->atid_base;
+		t->afree = p->next;
+		p->data = data;
+		t->atids_in_use++;
+	}
+	spin_unlock_bh(&t->atid_lock);
+	return atid;
+}
+EXPORT_SYMBOL(cxgb4_alloc_atid);
+
+/*
+ * Release an active-open TID.
+ */
+void cxgb4_free_atid(struct tid_info *t, unsigned int atid)
+{
+	union aopen_entry *p = &t->atid_tab[atid - t->atid_base];
+
+	spin_lock_bh(&t->atid_lock);
+	p->next = t->afree;
+	t->afree = p;
+	t->atids_in_use--;
+	spin_unlock_bh(&t->atid_lock);
+}
+EXPORT_SYMBOL(cxgb4_free_atid);
+
+/*
+ * Allocate a server TID and set it to the supplied value.
+ */
+int cxgb4_alloc_stid(struct tid_info *t, int family, void *data)
+{
+	int stid;
+
+	spin_lock_bh(&t->stid_lock);
+	if (family == PF_INET) {
+		stid = find_first_zero_bit(t->stid_bmap, t->nstids);
+		if (stid < t->nstids)
+			__set_bit(stid, t->stid_bmap);
+		else
+			stid = -1;
+	} else {
+		stid = bitmap_find_free_region(t->stid_bmap, t->nstids, 2);
+		if (stid < 0)
+			stid = -1;
+	}
+	if (stid >= 0) {
+		t->stid_tab[stid].data = data;
+		stid += t->stid_base;
+		/* IPv6 requires max of 520 bits or 16 cells in TCAM
+		 * This is equivalent to 4 TIDs. With CLIP enabled it
+		 * needs 2 TIDs.
+		 */
+		if (family == PF_INET)
+			t->stids_in_use++;
+		else
+			t->stids_in_use += 4;
+	}
+	spin_unlock_bh(&t->stid_lock);
+	return stid;
+}
+EXPORT_SYMBOL(cxgb4_alloc_stid);
+
+/* Allocate a server filter TID and set it to the supplied value.
+ */
+int cxgb4_alloc_sftid(struct tid_info *t, int family, void *data)
+{
+	int stid;
+
+	spin_lock_bh(&t->stid_lock);
+	if (family == PF_INET) {
+		stid = find_next_zero_bit(t->stid_bmap,
+				t->nstids + t->nsftids, t->nstids);
+		if (stid < (t->nstids + t->nsftids))
+			__set_bit(stid, t->stid_bmap);
+		else
+			stid = -1;
+	} else {
+		stid = -1;
+	}
+	if (stid >= 0) {
+		t->stid_tab[stid].data = data;
+		stid -= t->nstids;
+		stid += t->sftid_base;
+		t->stids_in_use++;
+	}
+	spin_unlock_bh(&t->stid_lock);
+	return stid;
+}
+EXPORT_SYMBOL(cxgb4_alloc_sftid);
+
+/* Release a server TID.
+ */
+void cxgb4_free_stid(struct tid_info *t, unsigned int stid, int family)
+{
+	/* Is it a server filter TID? */
+	if (t->nsftids && (stid >= t->sftid_base)) {
+		stid -= t->sftid_base;
+		stid += t->nstids;
+	} else {
+		stid -= t->stid_base;
+	}
+
+	spin_lock_bh(&t->stid_lock);
+	if (family == PF_INET)
+		__clear_bit(stid, t->stid_bmap);
+	else
+		bitmap_release_region(t->stid_bmap, stid, 2);
+	t->stid_tab[stid].data = NULL;
+	if (family == PF_INET)
+		t->stids_in_use--;
+	else
+		t->stids_in_use -= 4;
+	spin_unlock_bh(&t->stid_lock);
+}
+EXPORT_SYMBOL(cxgb4_free_stid);
+
+/*
+ * Populate a TID_RELEASE WR.  Caller must properly size the skb.
+ */
+static void mk_tid_release(struct sk_buff *skb, unsigned int chan,
+			   unsigned int tid)
+{
+	struct cpl_tid_release *req;
+
+	set_wr_txq(skb, CPL_PRIORITY_SETUP, chan);
+	req = (struct cpl_tid_release *)__skb_put(skb, sizeof(*req));
+	INIT_TP_WR(req, tid);
+	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid));
+}
+
+/*
+ * Queue a TID release request and if necessary schedule a work queue to
+ * process it.
+ */
+static void cxgb4_queue_tid_release(struct tid_info *t, unsigned int chan,
+				    unsigned int tid)
+{
+	void **p = &t->tid_tab[tid];
+	struct adapter *adap = container_of(t, struct adapter, tids);
+
+	spin_lock_bh(&adap->tid_release_lock);
+	*p = adap->tid_release_head;
+	/* Low 2 bits encode the Tx channel number */
+	adap->tid_release_head = (void **)((uintptr_t)p | chan);
+	if (!adap->tid_release_task_busy) {
+		adap->tid_release_task_busy = true;
+		queue_work(adap->workq, &adap->tid_release_task);
+	}
+	spin_unlock_bh(&adap->tid_release_lock);
+}
+
+/*
+ * Process the list of pending TID release requests.
+ */
+static void process_tid_release_list(struct work_struct *work)
+{
+	struct sk_buff *skb;
+	struct adapter *adap;
+
+	adap = container_of(work, struct adapter, tid_release_task);
+
+	spin_lock_bh(&adap->tid_release_lock);
+	while (adap->tid_release_head) {
+		void **p = adap->tid_release_head;
+		unsigned int chan = (uintptr_t)p & 3;
+		p = (void *)p - chan;
+
+		adap->tid_release_head = *p;
+		*p = NULL;
+		spin_unlock_bh(&adap->tid_release_lock);
+
+		while (!(skb = alloc_skb(sizeof(struct cpl_tid_release),
+					 GFP_KERNEL)))
+			schedule_timeout_uninterruptible(1);
+
+		mk_tid_release(skb, chan, p - adap->tids.tid_tab);
+		t4_ofld_send(adap, skb);
+		spin_lock_bh(&adap->tid_release_lock);
+	}
+	adap->tid_release_task_busy = false;
+	spin_unlock_bh(&adap->tid_release_lock);
+}
+
+/*
+ * Release a TID and inform HW.  If we are unable to allocate the release
+ * message we defer to a work queue.
+ */
+void cxgb4_remove_tid(struct tid_info *t, unsigned int chan, unsigned int tid)
+{
+	void *old;
+	struct sk_buff *skb;
+	struct adapter *adap = container_of(t, struct adapter, tids);
+
+	old = t->tid_tab[tid];
+	skb = alloc_skb(sizeof(struct cpl_tid_release), GFP_ATOMIC);
+	if (likely(skb)) {
+		t->tid_tab[tid] = NULL;
+		mk_tid_release(skb, chan, tid);
+		t4_ofld_send(adap, skb);
+	} else
+		cxgb4_queue_tid_release(t, chan, tid);
+	if (old)
+		atomic_dec(&t->tids_in_use);
+}
+EXPORT_SYMBOL(cxgb4_remove_tid);
+
+/*
+ * Allocate and initialize the TID tables.  Returns 0 on success.
+ */
+static int tid_init(struct tid_info *t)
+{
+	size_t size;
+	unsigned int stid_bmap_size;
+	unsigned int natids = t->natids;
+	struct adapter *adap = container_of(t, struct adapter, tids);
+
+	stid_bmap_size = BITS_TO_LONGS(t->nstids + t->nsftids);
+	size = t->ntids * sizeof(*t->tid_tab) +
+	       natids * sizeof(*t->atid_tab) +
+	       t->nstids * sizeof(*t->stid_tab) +
+	       t->nsftids * sizeof(*t->stid_tab) +
+	       stid_bmap_size * sizeof(long) +
+	       t->nftids * sizeof(*t->ftid_tab) +
+	       t->nsftids * sizeof(*t->ftid_tab);
+
+	t->tid_tab = t4_alloc_mem(size);
+	if (!t->tid_tab)
+		return -ENOMEM;
+
+	t->atid_tab = (union aopen_entry *)&t->tid_tab[t->ntids];
+	t->stid_tab = (struct serv_entry *)&t->atid_tab[natids];
+	t->stid_bmap = (unsigned long *)&t->stid_tab[t->nstids + t->nsftids];
+	t->ftid_tab = (struct filter_entry *)&t->stid_bmap[stid_bmap_size];
+	spin_lock_init(&t->stid_lock);
+	spin_lock_init(&t->atid_lock);
+
+	t->stids_in_use = 0;
+	t->afree = NULL;
+	t->atids_in_use = 0;
+	atomic_set(&t->tids_in_use, 0);
+
+	/* Setup the free list for atid_tab and clear the stid bitmap. */
+	if (natids) {
+		while (--natids)
+			t->atid_tab[natids - 1].next = &t->atid_tab[natids];
+		t->afree = t->atid_tab;
+	}
+	bitmap_zero(t->stid_bmap, t->nstids + t->nsftids);
+	/* Reserve stid 0 for T4/T5 adapters */
+	if (!t->stid_base &&
+	    (is_t4(adap->params.chip) || is_t5(adap->params.chip)))
+		__set_bit(0, t->stid_bmap);
+
+	return 0;
+}
+
+int cxgb4_clip_get(const struct net_device *dev,
+		   const struct in6_addr *lip)
+{
+	struct adapter *adap;
+	struct fw_clip_cmd c;
+
+	adap = netdev2adap(dev);
+	memset(&c, 0, sizeof(c));
+	c.op_to_write = htonl(FW_CMD_OP(FW_CLIP_CMD) |
+			FW_CMD_REQUEST | FW_CMD_WRITE);
+	c.alloc_to_len16 = htonl(F_FW_CLIP_CMD_ALLOC | FW_LEN16(c));
+	c.ip_hi = *(__be64 *)(lip->s6_addr);
+	c.ip_lo = *(__be64 *)(lip->s6_addr + 8);
+	return t4_wr_mbox_meat(adap, adap->mbox, &c, sizeof(c), &c, false);
+}
+EXPORT_SYMBOL(cxgb4_clip_get);
+
+int cxgb4_clip_release(const struct net_device *dev,
+		       const struct in6_addr *lip)
+{
+	struct adapter *adap;
+	struct fw_clip_cmd c;
+
+	adap = netdev2adap(dev);
+	memset(&c, 0, sizeof(c));
+	c.op_to_write = htonl(FW_CMD_OP(FW_CLIP_CMD) |
+			FW_CMD_REQUEST | FW_CMD_READ);
+	c.alloc_to_len16 = htonl(F_FW_CLIP_CMD_FREE | FW_LEN16(c));
+	c.ip_hi = *(__be64 *)(lip->s6_addr);
+	c.ip_lo = *(__be64 *)(lip->s6_addr + 8);
+	return t4_wr_mbox_meat(adap, adap->mbox, &c, sizeof(c), &c, false);
+}
+EXPORT_SYMBOL(cxgb4_clip_release);
+
+/**
+ *	cxgb4_create_server - create an IP server
+ *	@dev: the device
+ *	@stid: the server TID
+ *	@sip: local IP address to bind server to
+ *	@sport: the server's TCP port
+ *	@queue: queue to direct messages from this server to
+ *
+ *	Create an IP server for the given port and address.
+ *	Returns <0 on error and one of the %NET_XMIT_* values on success.
+ */
+int cxgb4_create_server(const struct net_device *dev, unsigned int stid,
+			__be32 sip, __be16 sport, __be16 vlan,
+			unsigned int queue)
+{
+	unsigned int chan;
+	struct sk_buff *skb;
+	struct adapter *adap;
+	struct cpl_pass_open_req *req;
+	int ret;
+
+	skb = alloc_skb(sizeof(*req), GFP_KERNEL);
+	if (!skb)
+		return -ENOMEM;
+
+	adap = netdev2adap(dev);
+	req = (struct cpl_pass_open_req *)__skb_put(skb, sizeof(*req));
+	INIT_TP_WR(req, 0);
+	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_PASS_OPEN_REQ, stid));
+	req->local_port = sport;
+	req->peer_port = htons(0);
+	req->local_ip = sip;
+	req->peer_ip = htonl(0);
+	chan = rxq_to_chan(&adap->sge, queue);
+	req->opt0 = cpu_to_be64(TX_CHAN(chan));
+	req->opt1 = cpu_to_be64(CONN_POLICY_ASK |
+				SYN_RSS_ENABLE | SYN_RSS_QUEUE(queue));
+	ret = t4_mgmt_tx(adap, skb);
+	return net_xmit_eval(ret);
+}
+EXPORT_SYMBOL(cxgb4_create_server);
+
+/*	cxgb4_create_server6 - create an IPv6 server
+ *	@dev: the device
+ *	@stid: the server TID
+ *	@sip: local IPv6 address to bind server to
+ *	@sport: the server's TCP port
+ *	@queue: queue to direct messages from this server to
+ *
+ *	Create an IPv6 server for the given port and address.
+ *	Returns <0 on error and one of the %NET_XMIT_* values on success.
+ */
+int cxgb4_create_server6(const struct net_device *dev, unsigned int stid,
+			 const struct in6_addr *sip, __be16 sport,
+			 unsigned int queue)
+{
+	unsigned int chan;
+	struct sk_buff *skb;
+	struct adapter *adap;
+	struct cpl_pass_open_req6 *req;
+	int ret;
+
+	skb = alloc_skb(sizeof(*req), GFP_KERNEL);
+	if (!skb)
+		return -ENOMEM;
+
+	adap = netdev2adap(dev);
+	req = (struct cpl_pass_open_req6 *)__skb_put(skb, sizeof(*req));
+	INIT_TP_WR(req, 0);
+	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_PASS_OPEN_REQ6, stid));
+	req->local_port = sport;
+	req->peer_port = htons(0);
+	req->local_ip_hi = *(__be64 *)(sip->s6_addr);
+	req->local_ip_lo = *(__be64 *)(sip->s6_addr + 8);
+	req->peer_ip_hi = cpu_to_be64(0);
+	req->peer_ip_lo = cpu_to_be64(0);
+	chan = rxq_to_chan(&adap->sge, queue);
+	req->opt0 = cpu_to_be64(TX_CHAN(chan));
+	req->opt1 = cpu_to_be64(CONN_POLICY_ASK |
+				SYN_RSS_ENABLE | SYN_RSS_QUEUE(queue));
+	ret = t4_mgmt_tx(adap, skb);
+	return net_xmit_eval(ret);
+}
+EXPORT_SYMBOL(cxgb4_create_server6);
+
+int cxgb4_remove_server(const struct net_device *dev, unsigned int stid,
+			unsigned int queue, bool ipv6)
+{
+	struct sk_buff *skb;
+	struct adapter *adap;
+	struct cpl_close_listsvr_req *req;
+	int ret;
+
+	adap = netdev2adap(dev);
+
+	skb = alloc_skb(sizeof(*req), GFP_KERNEL);
+	if (!skb)
+		return -ENOMEM;
+
+	req = (struct cpl_close_listsvr_req *)__skb_put(skb, sizeof(*req));
+	INIT_TP_WR(req, 0);
+	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ, stid));
+	req->reply_ctrl = htons(NO_REPLY(0) | (ipv6 ? LISTSVR_IPV6(1) :
+				LISTSVR_IPV6(0)) | QUEUENO(queue));
+	ret = t4_mgmt_tx(adap, skb);
+	return net_xmit_eval(ret);
+}
+EXPORT_SYMBOL(cxgb4_remove_server);
+
+/**
+ *	cxgb4_best_mtu - find the entry in the MTU table closest to an MTU
+ *	@mtus: the HW MTU table
+ *	@mtu: the target MTU
+ *	@idx: index of selected entry in the MTU table
+ *
+ *	Returns the index and the value in the HW MTU table that is closest to
+ *	but does not exceed @mtu, unless @mtu is smaller than any value in the
+ *	table, in which case that smallest available value is selected.
+ */
+unsigned int cxgb4_best_mtu(const unsigned short *mtus, unsigned short mtu,
+			    unsigned int *idx)
+{
+	unsigned int i = 0;
+
+	while (i < NMTUS - 1 && mtus[i + 1] <= mtu)
+		++i;
+	if (idx)
+		*idx = i;
+	return mtus[i];
+}
+EXPORT_SYMBOL(cxgb4_best_mtu);
+
+/**
+ *     cxgb4_best_aligned_mtu - find best MTU, [hopefully] data size aligned
+ *     @mtus: the HW MTU table
+ *     @header_size: Header Size
+ *     @data_size_max: maximum Data Segment Size
+ *     @data_size_align: desired Data Segment Size Alignment (2^N)
+ *     @mtu_idxp: HW MTU Table Index return value pointer (possibly NULL)
+ *
+ *     Similar to cxgb4_best_mtu() but instead of searching the Hardware
+ *     MTU Table based solely on a Maximum MTU parameter, we break that
+ *     parameter up into a Header Size and Maximum Data Segment Size, and
+ *     provide a desired Data Segment Size Alignment.  If we find an MTU in
+ *     the Hardware MTU Table which will result in a Data Segment Size with
+ *     the requested alignment _and_ that MTU isn't "too far" from the
+ *     closest MTU, then we'll return that rather than the closest MTU.
+ */
+unsigned int cxgb4_best_aligned_mtu(const unsigned short *mtus,
+				    unsigned short header_size,
+				    unsigned short data_size_max,
+				    unsigned short data_size_align,
+				    unsigned int *mtu_idxp)
+{
+	unsigned short max_mtu = header_size + data_size_max;
+	unsigned short data_size_align_mask = data_size_align - 1;
+	int mtu_idx, aligned_mtu_idx;
+
+	/* Scan the MTU Table till we find an MTU which is larger than our
+	 * Maximum MTU or we reach the end of the table.  Along the way,
+	 * record the last MTU found, if any, which will result in a Data
+	 * Segment Length matching the requested alignment.
+	 */
+	for (mtu_idx = 0, aligned_mtu_idx = -1; mtu_idx < NMTUS; mtu_idx++) {
+		unsigned short data_size = mtus[mtu_idx] - header_size;
+
+		/* If this MTU minus the Header Size would result in a
+		 * Data Segment Size of the desired alignment, remember it.
+		 */
+		if ((data_size & data_size_align_mask) == 0)
+			aligned_mtu_idx = mtu_idx;
+
+		/* If we're not at the end of the Hardware MTU Table and the
+		 * next element is larger than our Maximum MTU, drop out of
+		 * the loop.
+		 */
+		if (mtu_idx+1 < NMTUS && mtus[mtu_idx+1] > max_mtu)
+			break;
+	}
+
+	/* If we fell out of the loop because we ran to the end of the table,
+	 * then we just have to use the last [largest] entry.
+	 */
+	if (mtu_idx == NMTUS)
+		mtu_idx--;
+
+	/* If we found an MTU which resulted in the requested Data Segment
+	 * Length alignment and that's "not far" from the largest MTU which is
+	 * less than or equal to the maximum MTU, then use that.
+	 */
+	if (aligned_mtu_idx >= 0 &&
+	    mtu_idx - aligned_mtu_idx <= 1)
+		mtu_idx = aligned_mtu_idx;
+
+	/* If the caller has passed in an MTU Index pointer, pass the
+	 * MTU Index back.  Return the MTU value.
+	 */
+	if (mtu_idxp)
+		*mtu_idxp = mtu_idx;
+	return mtus[mtu_idx];
+}
+EXPORT_SYMBOL(cxgb4_best_aligned_mtu);
+
+/**
+ *	cxgb4_port_chan - get the HW channel of a port
+ *	@dev: the net device for the port
+ *
+ *	Return the HW Tx channel of the given port.
+ */
+unsigned int cxgb4_port_chan(const struct net_device *dev)
+{
+	return netdev2pinfo(dev)->tx_chan;
+}
+EXPORT_SYMBOL(cxgb4_port_chan);
+
+unsigned int cxgb4_dbfifo_count(const struct net_device *dev, int lpfifo)
+{
+	struct adapter *adap = netdev2adap(dev);
+	u32 v1, v2, lp_count, hp_count;
+
+	v1 = t4_read_reg(adap, A_SGE_DBFIFO_STATUS);
+	v2 = t4_read_reg(adap, SGE_DBFIFO_STATUS2);
+	if (is_t4(adap->params.chip)) {
+		lp_count = G_LP_COUNT(v1);
+		hp_count = G_HP_COUNT(v1);
+	} else {
+		lp_count = G_LP_COUNT_T5(v1);
+		hp_count = G_HP_COUNT_T5(v2);
+	}
+	return lpfifo ? lp_count : hp_count;
+}
+EXPORT_SYMBOL(cxgb4_dbfifo_count);
+
+/**
+ *	cxgb4_port_viid - get the VI id of a port
+ *	@dev: the net device for the port
+ *
+ *	Return the VI id of the given port.
+ */
+unsigned int cxgb4_port_viid(const struct net_device *dev)
+{
+	return netdev2pinfo(dev)->viid;
+}
+EXPORT_SYMBOL(cxgb4_port_viid);
+
+/**
+ *	cxgb4_port_idx - get the index of a port
+ *	@dev: the net device for the port
+ *
+ *	Return the index of the given port.
+ */
+unsigned int cxgb4_port_idx(const struct net_device *dev)
+{
+	return netdev2pinfo(dev)->port_id;
+}
+EXPORT_SYMBOL(cxgb4_port_idx);
+
+void cxgb4_get_tcp_stats(struct pci_dev *pdev, struct tp_tcp_stats *v4,
+			 struct tp_tcp_stats *v6)
+{
+	struct adapter *adap = pci_get_drvdata(pdev);
+
+	spin_lock(&adap->stats_lock);
+	t4_tp_get_tcp_stats(adap, v4, v6);
+	spin_unlock(&adap->stats_lock);
+}
+EXPORT_SYMBOL(cxgb4_get_tcp_stats);
+
+void cxgb4_iscsi_init(struct net_device *dev, unsigned int tag_mask,
+		      const unsigned int *pgsz_order)
+{
+	struct adapter *adap = netdev2adap(dev);
+
+	t4_write_reg(adap, ULP_RX_ISCSI_TAGMASK, tag_mask);
+	t4_write_reg(adap, ULP_RX_ISCSI_PSZ, HPZ0(pgsz_order[0]) |
+		     HPZ1(pgsz_order[1]) | HPZ2(pgsz_order[2]) |
+		     HPZ3(pgsz_order[3]));
+}
+EXPORT_SYMBOL(cxgb4_iscsi_init);
+
+int cxgb4_flush_eq_cache(struct net_device *dev)
+{
+	struct adapter *adap = netdev2adap(dev);
+	int ret;
+
+	ret = t4_fwaddrspace_write(adap, adap->mbox,
+				   0xe1000000 + A_SGE_CTXT_CMD, 0x20000000);
+	return ret;
+}
+EXPORT_SYMBOL(cxgb4_flush_eq_cache);
+
+static int read_eq_indices(struct adapter *adap, u16 qid, u16 *pidx, u16 *cidx)
+{
+	u32 addr = t4_read_reg(adap, A_SGE_DBQ_CTXT_BADDR) + 24 * qid + 8;
+	__be64 indices;
+	int ret;
+
+	spin_lock(&adap->win0_lock);
+	ret = t4_memory_rw(adap, 0, MEM_EDC0, addr,
+			   sizeof(indices), (__be32 *)&indices,
+			   T4_MEMORY_READ);
+	spin_unlock(&adap->win0_lock);
+	if (!ret) {
+		*cidx = (be64_to_cpu(indices) >> 25) & 0xffff;
+		*pidx = (be64_to_cpu(indices) >> 9) & 0xffff;
+	}
+	return ret;
+}
+
+int cxgb4_sync_txq_pidx(struct net_device *dev, u16 qid, u16 pidx,
+			u16 size)
+{
+	struct adapter *adap = netdev2adap(dev);
+	u16 hw_pidx, hw_cidx;
+	int ret;
+
+	ret = read_eq_indices(adap, qid, &hw_pidx, &hw_cidx);
+	if (ret)
+		goto out;
+
+	if (pidx != hw_pidx) {
+		u16 delta;
+
+		if (pidx >= hw_pidx)
+			delta = pidx - hw_pidx;
+		else
+			delta = size - hw_pidx + pidx;
+		wmb();
+		t4_write_reg(adap, MYPF_REG(SGE_PF_KDOORBELL),
+			     QID(qid) | PIDX(delta));
+	}
+out:
+	return ret;
+}
+EXPORT_SYMBOL(cxgb4_sync_txq_pidx);
+
+void cxgb4_disable_db_coalescing(struct net_device *dev)
+{
+	struct adapter *adap;
+
+	adap = netdev2adap(dev);
+	t4_set_reg_field(adap, A_SGE_DOORBELL_CONTROL, F_NOCOALESCE,
+			 F_NOCOALESCE);
+}
+EXPORT_SYMBOL(cxgb4_disable_db_coalescing);
+
+void cxgb4_enable_db_coalescing(struct net_device *dev)
+{
+	struct adapter *adap;
+
+	adap = netdev2adap(dev);
+	t4_set_reg_field(adap, A_SGE_DOORBELL_CONTROL, F_NOCOALESCE, 0);
+}
+EXPORT_SYMBOL(cxgb4_enable_db_coalescing);
+
+int cxgb4_read_tpte(struct net_device *dev, u32 stag, __be32 *tpte)
+{
+	struct adapter *adap;
+	u32 offset, memtype, memaddr;
+	u32 edc0_size, edc1_size, mc0_size, mc1_size;
+	u32 edc0_end, edc1_end, mc0_end, mc1_end;
+	int ret;
+
+	adap = netdev2adap(dev);
+
+	offset = ((stag >> 8) * 32) + adap->vres.stag.start;
+
+	/* Figure out where the offset lands in the Memory Type/Address scheme.
+	 * This code assumes that the memory is laid out starting at offset 0
+	 * with no breaks as: EDC0, EDC1, MC0, MC1. All cards have both EDC0
+	 * and EDC1.  Some cards will have neither MC0 nor MC1, most cards have
+	 * MC0, and some have both MC0 and MC1.
+	 */
+	edc0_size = EDRAM_SIZE_GET(t4_read_reg(adap, MA_EDRAM0_BAR)) << 20;
+	edc1_size = EDRAM_SIZE_GET(t4_read_reg(adap, MA_EDRAM1_BAR)) << 20;
+	mc0_size = EXT_MEM_SIZE_GET(t4_read_reg(adap, MA_EXT_MEMORY_BAR)) << 20;
+
+	edc0_end = edc0_size;
+	edc1_end = edc0_end + edc1_size;
+	mc0_end = edc1_end + mc0_size;
+
+	if (offset < edc0_end) {
+		memtype = MEM_EDC0;
+		memaddr = offset;
+	} else if (offset < edc1_end) {
+		memtype = MEM_EDC1;
+		memaddr = offset - edc0_end;
+	} else {
+		if (offset < mc0_end) {
+			memtype = MEM_MC0;
+			memaddr = offset - edc1_end;
+		} else if (is_t4(adap->params.chip)) {
+			/* T4 only has a single memory channel */
+			goto err;
+		} else {
+			mc1_size = EXT_MEM_SIZE_GET(
+					t4_read_reg(adap,
+						    MA_EXT_MEMORY1_BAR)) << 20;
+			mc1_end = mc0_end + mc1_size;
+			if (offset < mc1_end) {
+				memtype = MEM_MC1;
+				memaddr = offset - mc0_end;
+			} else {
+				/* offset beyond the end of any memory */
+				goto err;
+			}
+		}
+	}
+
+	spin_lock(&adap->win0_lock);
+	ret = t4_memory_rw(adap, 0, memtype, memaddr, 32, tpte, T4_MEMORY_READ);
+	spin_unlock(&adap->win0_lock);
+	return ret;
+
+err:
+	dev_err(adap->pdev_dev, "stag %#x, offset %#x out of range\n",
+		stag, offset);
+	return -EINVAL;
+}
+EXPORT_SYMBOL(cxgb4_read_tpte);
+
+u64 cxgb4_read_sge_timestamp(struct net_device *dev)
+{
+	u32 hi, lo;
+	struct adapter *adap;
+
+	adap = netdev2adap(dev);
+	lo = t4_read_reg(adap, SGE_TIMESTAMP_LO);
+	hi = GET_TSVAL(t4_read_reg(adap, SGE_TIMESTAMP_HI));
+
+	return ((u64)hi << 32) | (u64)lo;
+}
+EXPORT_SYMBOL(cxgb4_read_sge_timestamp);
+
+static struct pci_driver cxgb4_driver;
+
+static void check_neigh_update(struct neighbour *neigh)
+{
+	const struct device *parent;
+	const struct net_device *netdev = neigh->dev;
+
+	if (netdev->priv_flags & IFF_802_1Q_VLAN)
+		netdev = vlan_dev_real_dev(netdev);
+	parent = netdev->dev.parent;
+	if (parent && parent->driver == &cxgb4_driver.driver)
+		t4_l2t_update(dev_get_drvdata(parent), neigh);
+}
+
+static int netevent_cb(struct notifier_block *nb, unsigned long event,
+		       void *data)
+{
+	switch (event) {
+	case NETEVENT_NEIGH_UPDATE:
+		check_neigh_update(data);
+		break;
+	case NETEVENT_REDIRECT:
+	default:
+		break;
+	}
+	return 0;
+}
+
+static bool netevent_registered;
+static struct notifier_block cxgb4_netevent_nb = {
+	.notifier_call = netevent_cb
+};
+
+static void drain_db_fifo(struct adapter *adap, int usecs)
+{
+	u32 v1, v2, lp_count, hp_count;
+
+	do {
+		v1 = t4_read_reg(adap, A_SGE_DBFIFO_STATUS);
+		v2 = t4_read_reg(adap, SGE_DBFIFO_STATUS2);
+		if (is_t4(adap->params.chip)) {
+			lp_count = G_LP_COUNT(v1);
+			hp_count = G_HP_COUNT(v1);
+		} else {
+			lp_count = G_LP_COUNT_T5(v1);
+			hp_count = G_HP_COUNT_T5(v2);
+		}
+
+		if (lp_count == 0 && hp_count == 0)
+			break;
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		schedule_timeout(usecs_to_jiffies(usecs));
+	} while (1);
+}
+
+static void disable_txq_db(struct sge_txq *q)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&q->db_lock, flags);
+	q->db_disabled = 1;
+	spin_unlock_irqrestore(&q->db_lock, flags);
+}
+
+static void enable_txq_db(struct adapter *adap, struct sge_txq *q)
+{
+	spin_lock_irq(&q->db_lock);
+	if (q->db_pidx_inc) {
+		/* Make sure that all writes to the TX descriptors
+		 * are committed before we tell HW about them.
+		 */
+		wmb();
+		t4_write_reg(adap, MYPF_REG(SGE_PF_KDOORBELL),
+			     QID(q->cntxt_id) | PIDX(q->db_pidx_inc));
+		q->db_pidx_inc = 0;
+	}
+	q->db_disabled = 0;
+	spin_unlock_irq(&q->db_lock);
+}
+
+static void disable_dbs(struct adapter *adap)
+{
+	int i;
+
+	for_each_ethrxq(&adap->sge, i)
+		disable_txq_db(&adap->sge.ethtxq[i].q);
+	for_each_ofldrxq(&adap->sge, i)
+		disable_txq_db(&adap->sge.ofldtxq[i].q);
+	for_each_port(adap, i)
+		disable_txq_db(&adap->sge.ctrlq[i].q);
+}
+
+static void enable_dbs(struct adapter *adap)
+{
+	int i;
+
+	for_each_ethrxq(&adap->sge, i)
+		enable_txq_db(adap, &adap->sge.ethtxq[i].q);
+	for_each_ofldrxq(&adap->sge, i)
+		enable_txq_db(adap, &adap->sge.ofldtxq[i].q);
+	for_each_port(adap, i)
+		enable_txq_db(adap, &adap->sge.ctrlq[i].q);
+}
+
+static void notify_rdma_uld(struct adapter *adap, enum cxgb4_control cmd)
+{
+	if (adap->uld_handle[CXGB4_ULD_RDMA])
+		ulds[CXGB4_ULD_RDMA].control(adap->uld_handle[CXGB4_ULD_RDMA],
+				cmd);
+}
+
+static void process_db_full(struct work_struct *work)
+{
+	struct adapter *adap;
+
+	adap = container_of(work, struct adapter, db_full_task);
+
+	drain_db_fifo(adap, dbfifo_drain_delay);
+	enable_dbs(adap);
+	notify_rdma_uld(adap, CXGB4_CONTROL_DB_EMPTY);
+	t4_set_reg_field(adap, SGE_INT_ENABLE3,
+			 DBFIFO_HP_INT | DBFIFO_LP_INT,
+			 DBFIFO_HP_INT | DBFIFO_LP_INT);
+}
+
+static void sync_txq_pidx(struct adapter *adap, struct sge_txq *q)
+{
+	u16 hw_pidx, hw_cidx;
+	int ret;
+
+	spin_lock_irq(&q->db_lock);
+	ret = read_eq_indices(adap, (u16)q->cntxt_id, &hw_pidx, &hw_cidx);
+	if (ret)
+		goto out;
+	if (q->db_pidx != hw_pidx) {
+		u16 delta;
+
+		if (q->db_pidx >= hw_pidx)
+			delta = q->db_pidx - hw_pidx;
+		else
+			delta = q->size - hw_pidx + q->db_pidx;
+		wmb();
+		t4_write_reg(adap, MYPF_REG(SGE_PF_KDOORBELL),
+			     QID(q->cntxt_id) | PIDX(delta));
+	}
+out:
+	q->db_disabled = 0;
+	q->db_pidx_inc = 0;
+	spin_unlock_irq(&q->db_lock);
+	if (ret)
+		CH_WARN(adap, "DB drop recovery failed.\n");
+}
+static void recover_all_queues(struct adapter *adap)
+{
+	int i;
+
+	for_each_ethrxq(&adap->sge, i)
+		sync_txq_pidx(adap, &adap->sge.ethtxq[i].q);
+	for_each_ofldrxq(&adap->sge, i)
+		sync_txq_pidx(adap, &adap->sge.ofldtxq[i].q);
+	for_each_port(adap, i)
+		sync_txq_pidx(adap, &adap->sge.ctrlq[i].q);
+}
+
+static void process_db_drop(struct work_struct *work)
+{
+	struct adapter *adap;
+
+	adap = container_of(work, struct adapter, db_drop_task);
+
+	if (is_t4(adap->params.chip)) {
+		drain_db_fifo(adap, dbfifo_drain_delay);
+		notify_rdma_uld(adap, CXGB4_CONTROL_DB_DROP);
+		drain_db_fifo(adap, dbfifo_drain_delay);
+		recover_all_queues(adap);
+		drain_db_fifo(adap, dbfifo_drain_delay);
+		enable_dbs(adap);
+		notify_rdma_uld(adap, CXGB4_CONTROL_DB_EMPTY);
+	} else {
+		u32 dropped_db = t4_read_reg(adap, 0x010ac);
+		u16 qid = (dropped_db >> 15) & 0x1ffff;
+		u16 pidx_inc = dropped_db & 0x1fff;
+		unsigned int s_qpp;
+		unsigned short udb_density;
+		unsigned long qpshift;
+		int page;
+		u32 udb;
+
+		dev_warn(adap->pdev_dev,
+			 "Dropped DB 0x%x qid %d bar2 %d coalesce %d pidx %d\n",
+			 dropped_db, qid,
+			 (dropped_db >> 14) & 1,
+			 (dropped_db >> 13) & 1,
+			 pidx_inc);
+
+		drain_db_fifo(adap, 1);
+
+		s_qpp = QUEUESPERPAGEPF1 * adap->fn;
+		udb_density = 1 << QUEUESPERPAGEPF0_GET(t4_read_reg(adap,
+				SGE_EGRESS_QUEUES_PER_PAGE_PF) >> s_qpp);
+		qpshift = PAGE_SHIFT - ilog2(udb_density);
+		udb = qid << qpshift;
+		udb &= PAGE_MASK;
+		page = udb / PAGE_SIZE;
+		udb += (qid - (page * udb_density)) * 128;
+
+		writel(PIDX(pidx_inc),  adap->bar2 + udb + 8);
+
+		/* Re-enable BAR2 WC */
+		t4_set_reg_field(adap, 0x10b0, 1<<15, 1<<15);
+	}
+
+	t4_set_reg_field(adap, A_SGE_DOORBELL_CONTROL, F_DROPPED_DB, 0);
+}
+
+void t4_db_full(struct adapter *adap)
+{
+	if (is_t4(adap->params.chip)) {
+		disable_dbs(adap);
+		notify_rdma_uld(adap, CXGB4_CONTROL_DB_FULL);
+		t4_set_reg_field(adap, SGE_INT_ENABLE3,
+				 DBFIFO_HP_INT | DBFIFO_LP_INT, 0);
+		queue_work(adap->workq, &adap->db_full_task);
+	}
+}
+
+void t4_db_dropped(struct adapter *adap)
+{
+	if (is_t4(adap->params.chip)) {
+		disable_dbs(adap);
+		notify_rdma_uld(adap, CXGB4_CONTROL_DB_FULL);
+	}
+	queue_work(adap->workq, &adap->db_drop_task);
+}
+
+static void uld_attach(struct adapter *adap, unsigned int uld)
+{
+	void *handle;
+	struct cxgb4_lld_info lli;
+	unsigned short i;
+
+	lli.pdev = adap->pdev;
+	lli.pf = adap->fn;
+	lli.l2t = adap->l2t;
+	lli.tids = &adap->tids;
+	lli.ports = adap->port;
+	lli.vr = &adap->vres;
+	lli.mtus = adap->params.mtus;
+	if (uld == CXGB4_ULD_RDMA) {
+		lli.rxq_ids = adap->sge.rdma_rxq;
+		lli.ciq_ids = adap->sge.rdma_ciq;
+		lli.nrxq = adap->sge.rdmaqs;
+		lli.nciq = adap->sge.rdmaciqs;
+	} else if (uld == CXGB4_ULD_ISCSI) {
+		lli.rxq_ids = adap->sge.ofld_rxq;
+		lli.nrxq = adap->sge.ofldqsets;
+	}
+	lli.ntxq = adap->sge.ofldqsets;
+	lli.nchan = adap->params.nports;
+	lli.nports = adap->params.nports;
+	lli.wr_cred = adap->params.ofldq_wr_cred;
+	lli.adapter_type = adap->params.chip;
+	lli.iscsi_iolen = MAXRXDATA_GET(t4_read_reg(adap, TP_PARA_REG2));
+	lli.cclk_ps = 1000000000 / adap->params.vpd.cclk;
+	lli.udb_density = 1 << QUEUESPERPAGEPF0_GET(
+			t4_read_reg(adap, SGE_EGRESS_QUEUES_PER_PAGE_PF) >>
+			(adap->fn * 4));
+	lli.ucq_density = 1 << QUEUESPERPAGEPF0_GET(
+			t4_read_reg(adap, SGE_INGRESS_QUEUES_PER_PAGE_PF) >>
+			(adap->fn * 4));
+	lli.filt_mode = adap->params.tp.vlan_pri_map;
+	/* MODQ_REQ_MAP sets queues 0-3 to chan 0-3 */
+	for (i = 0; i < NCHAN; i++)
+		lli.tx_modq[i] = i;
+	lli.gts_reg = adap->regs + MYPF_REG(SGE_PF_GTS);
+	lli.db_reg = adap->regs + MYPF_REG(SGE_PF_KDOORBELL);
+	lli.fw_vers = adap->params.fw_vers;
+	lli.dbfifo_int_thresh = dbfifo_int_thresh;
+	lli.sge_ingpadboundary = adap->sge.fl_align;
+	lli.sge_egrstatuspagesize = adap->sge.stat_len;
+	lli.sge_pktshift = adap->sge.pktshift;
+	lli.enable_fw_ofld_conn = adap->flags & FW_OFLD_CONN;
+	lli.max_ordird_qp = adap->params.max_ordird_qp;
+	lli.max_ird_adapter = adap->params.max_ird_adapter;
+	lli.ulptx_memwrite_dsgl = adap->params.ulptx_memwrite_dsgl;
+
+	handle = ulds[uld].add(&lli);
+	if (IS_ERR(handle)) {
+		dev_warn(adap->pdev_dev,
+			 "could not attach to the %s driver, error %ld\n",
+			 uld_str[uld], PTR_ERR(handle));
+		return;
+	}
+
+	adap->uld_handle[uld] = handle;
+
+	if (!netevent_registered) {
+		register_netevent_notifier(&cxgb4_netevent_nb);
+		netevent_registered = true;
+	}
+
+	if (adap->flags & FULL_INIT_DONE)
+		ulds[uld].state_change(handle, CXGB4_STATE_UP);
+}
+
+static void attach_ulds(struct adapter *adap)
+{
+	unsigned int i;
+
+	spin_lock(&adap_rcu_lock);
+	list_add_tail_rcu(&adap->rcu_node, &adap_rcu_list);
+	spin_unlock(&adap_rcu_lock);
+
+	mutex_lock(&uld_mutex);
+	list_add_tail(&adap->list_node, &adapter_list);
+	for (i = 0; i < CXGB4_ULD_MAX; i++)
+		if (ulds[i].add)
+			uld_attach(adap, i);
+	mutex_unlock(&uld_mutex);
+}
+
+static void detach_ulds(struct adapter *adap)
+{
+	unsigned int i;
+
+	mutex_lock(&uld_mutex);
+	list_del(&adap->list_node);
+	for (i = 0; i < CXGB4_ULD_MAX; i++)
+		if (adap->uld_handle[i]) {
+			ulds[i].state_change(adap->uld_handle[i],
+					     CXGB4_STATE_DETACH);
+			adap->uld_handle[i] = NULL;
+		}
+	if (netevent_registered && list_empty(&adapter_list)) {
+		unregister_netevent_notifier(&cxgb4_netevent_nb);
+		netevent_registered = false;
+	}
+	mutex_unlock(&uld_mutex);
+
+	spin_lock(&adap_rcu_lock);
+	list_del_rcu(&adap->rcu_node);
+	spin_unlock(&adap_rcu_lock);
+}
+
+static void notify_ulds(struct adapter *adap, enum cxgb4_state new_state)
+{
+	unsigned int i;
+
+	mutex_lock(&uld_mutex);
+	for (i = 0; i < CXGB4_ULD_MAX; i++)
+		if (adap->uld_handle[i])
+			ulds[i].state_change(adap->uld_handle[i], new_state);
+	mutex_unlock(&uld_mutex);
+}
+
+/**
+ *	cxgb4_register_uld - register an upper-layer driver
+ *	@type: the ULD type
+ *	@p: the ULD methods
+ *
+ *	Registers an upper-layer driver with this driver and notifies the ULD
+ *	about any presently available devices that support its type.  Returns
+ *	%-EBUSY if a ULD of the same type is already registered.
+ */
+int cxgb4_register_uld(enum cxgb4_uld type, const struct cxgb4_uld_info *p)
+{
+	int ret = 0;
+	struct adapter *adap;
+
+	if (type >= CXGB4_ULD_MAX)
+		return -EINVAL;
+	mutex_lock(&uld_mutex);
+	if (ulds[type].add) {
+		ret = -EBUSY;
+		goto out;
+	}
+	ulds[type] = *p;
+	list_for_each_entry(adap, &adapter_list, list_node)
+		uld_attach(adap, type);
+out:	mutex_unlock(&uld_mutex);
+	return ret;
+}
+EXPORT_SYMBOL(cxgb4_register_uld);
+
+/**
+ *	cxgb4_unregister_uld - unregister an upper-layer driver
+ *	@type: the ULD type
+ *
+ *	Unregisters an existing upper-layer driver.
+ */
+int cxgb4_unregister_uld(enum cxgb4_uld type)
+{
+	struct adapter *adap;
+
+	if (type >= CXGB4_ULD_MAX)
+		return -EINVAL;
+	mutex_lock(&uld_mutex);
+	list_for_each_entry(adap, &adapter_list, list_node)
+		adap->uld_handle[type] = NULL;
+	ulds[type].add = NULL;
+	mutex_unlock(&uld_mutex);
+	return 0;
+}
+EXPORT_SYMBOL(cxgb4_unregister_uld);
+
+/* Check if netdev on which event is occured belongs to us or not. Return
+ * success (true) if it belongs otherwise failure (false).
+ * Called with rcu_read_lock() held.
+ */
+#if IS_ENABLED(CONFIG_IPV6)
+static bool cxgb4_netdev(const struct net_device *netdev)
+{
+	struct adapter *adap;
+	int i;
+
+	list_for_each_entry_rcu(adap, &adap_rcu_list, rcu_node)
+		for (i = 0; i < MAX_NPORTS; i++)
+			if (adap->port[i] == netdev)
+				return true;
+	return false;
+}
+
+static int clip_add(struct net_device *event_dev, struct inet6_ifaddr *ifa,
+		    unsigned long event)
+{
+	int ret = NOTIFY_DONE;
+
+	rcu_read_lock();
+	if (cxgb4_netdev(event_dev)) {
+		switch (event) {
+		case NETDEV_UP:
+			ret = cxgb4_clip_get(event_dev,
+				(const struct in6_addr *)ifa->addr.s6_addr);
+			if (ret < 0) {
+				rcu_read_unlock();
+				return ret;
+			}
+			ret = NOTIFY_OK;
+			break;
+		case NETDEV_DOWN:
+			cxgb4_clip_release(event_dev,
+				(const struct in6_addr *)ifa->addr.s6_addr);
+			ret = NOTIFY_OK;
+			break;
+		default:
+			break;
+		}
+	}
+	rcu_read_unlock();
+	return ret;
+}
+
+static int cxgb4_inet6addr_handler(struct notifier_block *this,
+		unsigned long event, void *data)
+{
+	struct inet6_ifaddr *ifa = data;
+	struct net_device *event_dev;
+	int ret = NOTIFY_DONE;
+	struct bonding *bond = netdev_priv(ifa->idev->dev);
+	struct list_head *iter;
+	struct slave *slave;
+	struct pci_dev *first_pdev = NULL;
+
+	if (ifa->idev->dev->priv_flags & IFF_802_1Q_VLAN) {
+		event_dev = vlan_dev_real_dev(ifa->idev->dev);
+		ret = clip_add(event_dev, ifa, event);
+	} else if (ifa->idev->dev->flags & IFF_MASTER) {
+		/* It is possible that two different adapters are bonded in one
+		 * bond. We need to find such different adapters and add clip
+		 * in all of them only once.
+		 */
+		bond_for_each_slave(bond, slave, iter) {
+			if (!first_pdev) {
+				ret = clip_add(slave->dev, ifa, event);
+				/* If clip_add is success then only initialize
+				 * first_pdev since it means it is our device
+				 */
+				if (ret == NOTIFY_OK)
+					first_pdev = to_pci_dev(
+							slave->dev->dev.parent);
+			} else if (first_pdev !=
+				   to_pci_dev(slave->dev->dev.parent))
+					ret = clip_add(slave->dev, ifa, event);
+		}
+	} else
+		ret = clip_add(ifa->idev->dev, ifa, event);
+
+	return ret;
+}
+
+static struct notifier_block cxgb4_inet6addr_notifier = {
+	.notifier_call = cxgb4_inet6addr_handler
+};
+
+/* Retrieves IPv6 addresses from a root device (bond, vlan) associated with
+ * a physical device.
+ * The physical device reference is needed to send the actul CLIP command.
+ */
+static int update_dev_clip(struct net_device *root_dev, struct net_device *dev)
+{
+	struct inet6_dev *idev = NULL;
+	struct inet6_ifaddr *ifa;
+	int ret = 0;
+
+	idev = __in6_dev_get(root_dev);
+	if (!idev)
+		return ret;
+
+	read_lock_bh(&idev->lock);
+	list_for_each_entry(ifa, &idev->addr_list, if_list) {
+		ret = cxgb4_clip_get(dev,
+				(const struct in6_addr *)ifa->addr.s6_addr);
+		if (ret < 0)
+			break;
+	}
+	read_unlock_bh(&idev->lock);
+
+	return ret;
+}
+
+static int update_root_dev_clip(struct net_device *dev)
+{
+	struct net_device *root_dev = NULL;
+	int i, ret = 0;
+
+	/* First populate the real net device's IPv6 addresses */
+	ret = update_dev_clip(dev, dev);
+	if (ret)
+		return ret;
+
+	/* Parse all bond and vlan devices layered on top of the physical dev */
+	root_dev = netdev_master_upper_dev_get_rcu(dev);
+	if (root_dev) {
+		ret = update_dev_clip(root_dev, dev);
+		if (ret)
+			return ret;
+	}
+
+	for (i = 0; i < VLAN_N_VID; i++) {
+		root_dev = __vlan_find_dev_deep_rcu(dev, htons(ETH_P_8021Q), i);
+		if (!root_dev)
+			continue;
+
+		ret = update_dev_clip(root_dev, dev);
+		if (ret)
+			break;
+	}
+	return ret;
+}
+
+static void update_clip(const struct adapter *adap)
+{
+	int i;
+	struct net_device *dev;
+	int ret;
+
+	rcu_read_lock();
+
+	for (i = 0; i < MAX_NPORTS; i++) {
+		dev = adap->port[i];
+		ret = 0;
+
+		if (dev)
+			ret = update_root_dev_clip(dev);
+
+		if (ret < 0)
+			break;
+	}
+	rcu_read_unlock();
+}
+#endif /* IS_ENABLED(CONFIG_IPV6) */
+
+/**
+ *	cxgb_up - enable the adapter
+ *	@adap: adapter being enabled
+ *
+ *	Called when the first port is enabled, this function performs the
+ *	actions necessary to make an adapter operational, such as completing
+ *	the initialization of HW modules, and enabling interrupts.
+ *
+ *	Must be called with the rtnl lock held.
+ */
+static int cxgb_up(struct adapter *adap)
+{
+	int err;
+
+	err = setup_sge_queues(adap);
+	if (err)
+		goto out;
+	err = setup_rss(adap);
+	if (err)
+		goto freeq;
+
+	if (adap->flags & USING_MSIX) {
+		name_msix_vecs(adap);
+		err = request_irq(adap->msix_info[0].vec, t4_nondata_intr, 0,
+				  adap->msix_info[0].desc, adap);
+		if (err)
+			goto irq_err;
+
+		err = request_msix_queue_irqs(adap);
+		if (err) {
+			free_irq(adap->msix_info[0].vec, adap);
+			goto irq_err;
+		}
+	} else {
+		err = request_irq(adap->pdev->irq, t4_intr_handler(adap),
+				  (adap->flags & USING_MSI) ? 0 : IRQF_SHARED,
+				  adap->port[0]->name, adap);
+		if (err)
+			goto irq_err;
+	}
+	enable_rx(adap);
+	t4_sge_start(adap);
+	t4_intr_enable(adap);
+	adap->flags |= FULL_INIT_DONE;
+	notify_ulds(adap, CXGB4_STATE_UP);
+#if IS_ENABLED(CONFIG_IPV6)
+	update_clip(adap);
+#endif
+ out:
+	return err;
+ irq_err:
+	dev_err(adap->pdev_dev, "request_irq failed, err %d\n", err);
+ freeq:
+	t4_free_sge_resources(adap);
+	goto out;
+}
+
+static void cxgb_down(struct adapter *adapter)
+{
+	t4_intr_disable(adapter);
+	cancel_work_sync(&adapter->tid_release_task);
+	cancel_work_sync(&adapter->db_full_task);
+	cancel_work_sync(&adapter->db_drop_task);
+	adapter->tid_release_task_busy = false;
+	adapter->tid_release_head = NULL;
+
+	if (adapter->flags & USING_MSIX) {
+		free_msix_queue_irqs(adapter);
+		free_irq(adapter->msix_info[0].vec, adapter);
+	} else
+		free_irq(adapter->pdev->irq, adapter);
+	quiesce_rx(adapter);
+	t4_sge_stop(adapter);
+	t4_free_sge_resources(adapter);
+	adapter->flags &= ~FULL_INIT_DONE;
+}
+
+/*
+ * net_device operations
+ */
+static int cxgb_open(struct net_device *dev)
+{
+	int err;
+	struct port_info *pi = netdev_priv(dev);
+	struct adapter *adapter = pi->adapter;
+
+	netif_carrier_off(dev);
+
+	if (!(adapter->flags & FULL_INIT_DONE)) {
+		err = cxgb_up(adapter);
+		if (err < 0)
+			return err;
+	}
+
+	err = link_start(dev);
+	if (!err)
+		netif_tx_start_all_queues(dev);
+	return err;
+}
+
+static int cxgb_close(struct net_device *dev)
+{
+	struct port_info *pi = netdev_priv(dev);
+	struct adapter *adapter = pi->adapter;
+
+	netif_tx_stop_all_queues(dev);
+	netif_carrier_off(dev);
+	return t4_enable_vi(adapter, adapter->fn, pi->viid, false, false);
+}
+
+/* Return an error number if the indicated filter isn't writable ...
+ */
+static int writable_filter(struct filter_entry *f)
+{
+	if (f->locked)
+		return -EPERM;
+	if (f->pending)
+		return -EBUSY;
+
+	return 0;
+}
+
+/* Delete the filter at the specified index (if valid).  The checks for all
+ * the common problems with doing this like the filter being locked, currently
+ * pending in another operation, etc.
+ */
+static int delete_filter(struct adapter *adapter, unsigned int fidx)
+{
+	struct filter_entry *f;
+	int ret;
+
+	if (fidx >= adapter->tids.nftids + adapter->tids.nsftids)
+		return -EINVAL;
+
+	f = &adapter->tids.ftid_tab[fidx];
+	ret = writable_filter(f);
+	if (ret)
+		return ret;
+	if (f->valid)
+		return del_filter_wr(adapter, fidx);
+
+	return 0;
+}
+
+int cxgb4_create_server_filter(const struct net_device *dev, unsigned int stid,
+		__be32 sip, __be16 sport, __be16 vlan,
+		unsigned int queue, unsigned char port, unsigned char mask)
+{
+	int ret;
+	struct filter_entry *f;
+	struct adapter *adap;
+	int i;
+	u8 *val;
+
+	adap = netdev2adap(dev);
+
+	/* Adjust stid to correct filter index */
+	stid -= adap->tids.sftid_base;
+	stid += adap->tids.nftids;
+
+	/* Check to make sure the filter requested is writable ...
+	 */
+	f = &adap->tids.ftid_tab[stid];
+	ret = writable_filter(f);
+	if (ret)
+		return ret;
+
+	/* Clear out any old resources being used by the filter before
+	 * we start constructing the new filter.
+	 */
+	if (f->valid)
+		clear_filter(adap, f);
+
+	/* Clear out filter specifications */
+	memset(&f->fs, 0, sizeof(struct ch_filter_specification));
+	f->fs.val.lport = cpu_to_be16(sport);
+	f->fs.mask.lport  = ~0;
+	val = (u8 *)&sip;
+	if ((val[0] | val[1] | val[2] | val[3]) != 0) {
+		for (i = 0; i < 4; i++) {
+			f->fs.val.lip[i] = val[i];
+			f->fs.mask.lip[i] = ~0;
+		}
+		if (adap->params.tp.vlan_pri_map & F_PORT) {
+			f->fs.val.iport = port;
+			f->fs.mask.iport = mask;
+		}
+	}
+
+	if (adap->params.tp.vlan_pri_map & F_PROTOCOL) {
+		f->fs.val.proto = IPPROTO_TCP;
+		f->fs.mask.proto = ~0;
+	}
+
+	f->fs.dirsteer = 1;
+	f->fs.iq = queue;
+	/* Mark filter as locked */
+	f->locked = 1;
+	f->fs.rpttid = 1;
+
+	ret = set_filter_wr(adap, stid);
+	if (ret) {
+		clear_filter(adap, f);
+		return ret;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(cxgb4_create_server_filter);
+
+int cxgb4_remove_server_filter(const struct net_device *dev, unsigned int stid,
+		unsigned int queue, bool ipv6)
+{
+	int ret;
+	struct filter_entry *f;
+	struct adapter *adap;
+
+	adap = netdev2adap(dev);
+
+	/* Adjust stid to correct filter index */
+	stid -= adap->tids.sftid_base;
+	stid += adap->tids.nftids;
+
+	f = &adap->tids.ftid_tab[stid];
+	/* Unlock the filter */
+	f->locked = 0;
+
+	ret = delete_filter(adap, stid);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+EXPORT_SYMBOL(cxgb4_remove_server_filter);
+
+static struct rtnl_link_stats64 *cxgb_get_stats(struct net_device *dev,
+						struct rtnl_link_stats64 *ns)
+{
+	struct port_stats stats;
+	struct port_info *p = netdev_priv(dev);
+	struct adapter *adapter = p->adapter;
+
+	/* Block retrieving statistics during EEH error
+	 * recovery. Otherwise, the recovery might fail
+	 * and the PCI device will be removed permanently
+	 */
+	spin_lock(&adapter->stats_lock);
+	if (!netif_device_present(dev)) {
+		spin_unlock(&adapter->stats_lock);
+		return ns;
+	}
+	t4_get_port_stats(adapter, p->tx_chan, &stats);
+	spin_unlock(&adapter->stats_lock);
+
+	ns->tx_bytes   = stats.tx_octets;
+	ns->tx_packets = stats.tx_frames;
+	ns->rx_bytes   = stats.rx_octets;
+	ns->rx_packets = stats.rx_frames;
+	ns->multicast  = stats.rx_mcast_frames;
+
+	/* detailed rx_errors */
+	ns->rx_length_errors = stats.rx_jabber + stats.rx_too_long +
+			       stats.rx_runt;
+	ns->rx_over_errors   = 0;
+	ns->rx_crc_errors    = stats.rx_fcs_err;
+	ns->rx_frame_errors  = stats.rx_symbol_err;
+	ns->rx_fifo_errors   = stats.rx_ovflow0 + stats.rx_ovflow1 +
+			       stats.rx_ovflow2 + stats.rx_ovflow3 +
+			       stats.rx_trunc0 + stats.rx_trunc1 +
+			       stats.rx_trunc2 + stats.rx_trunc3;
+	ns->rx_missed_errors = 0;
+
+	/* detailed tx_errors */
+	ns->tx_aborted_errors   = 0;
+	ns->tx_carrier_errors   = 0;
+	ns->tx_fifo_errors      = 0;
+	ns->tx_heartbeat_errors = 0;
+	ns->tx_window_errors    = 0;
+
+	ns->tx_errors = stats.tx_error_frames;
+	ns->rx_errors = stats.rx_symbol_err + stats.rx_fcs_err +
+		ns->rx_length_errors + stats.rx_len_err + ns->rx_fifo_errors;
+	return ns;
+}
+
+static int cxgb_ioctl(struct net_device *dev, struct ifreq *req, int cmd)
+{
+	unsigned int mbox;
+	int ret = 0, prtad, devad;
+	struct port_info *pi = netdev_priv(dev);
+	struct mii_ioctl_data *data = (struct mii_ioctl_data *)&req->ifr_data;
+
+	switch (cmd) {
+	case SIOCGMIIPHY:
+		if (pi->mdio_addr < 0)
+			return -EOPNOTSUPP;
+		data->phy_id = pi->mdio_addr;
+		break;
+	case SIOCGMIIREG:
+	case SIOCSMIIREG:
+		if (mdio_phy_id_is_c45(data->phy_id)) {
+			prtad = mdio_phy_id_prtad(data->phy_id);
+			devad = mdio_phy_id_devad(data->phy_id);
+		} else if (data->phy_id < 32) {
+			prtad = data->phy_id;
+			devad = 0;
+			data->reg_num &= 0x1f;
+		} else
+			return -EINVAL;
+
+		mbox = pi->adapter->fn;
+		if (cmd == SIOCGMIIREG)
+			ret = t4_mdio_rd(pi->adapter, mbox, prtad, devad,
+					 data->reg_num, &data->val_out);
+		else
+			ret = t4_mdio_wr(pi->adapter, mbox, prtad, devad,
+					 data->reg_num, data->val_in);
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+	return ret;
+}
+
+static void cxgb_set_rxmode(struct net_device *dev)
+{
+	/* unfortunately we can't return errors to the stack */
+	set_rxmode(dev, -1, false);
+}
+
+static int cxgb_change_mtu(struct net_device *dev, int new_mtu)
+{
+	int ret;
+	struct port_info *pi = netdev_priv(dev);
+
+	if (new_mtu < 81 || new_mtu > MAX_MTU)         /* accommodate SACK */
+		return -EINVAL;
+	ret = t4_set_rxmode(pi->adapter, pi->adapter->fn, pi->viid, new_mtu, -1,
+			    -1, -1, -1, true);
+	if (!ret)
+		dev->mtu = new_mtu;
+	return ret;
+}
+
+static int cxgb_set_mac_addr(struct net_device *dev, void *p)
+{
+	int ret;
+	struct sockaddr *addr = p;
+	struct port_info *pi = netdev_priv(dev);
+
+	if (!is_valid_ether_addr(addr->sa_data))
+		return -EADDRNOTAVAIL;
+
+	ret = t4_change_mac(pi->adapter, pi->adapter->fn, pi->viid,
+			    pi->xact_addr_filt, addr->sa_data, true, true);
+	if (ret < 0)
+		return ret;
+
+	memcpy(dev->dev_addr, addr->sa_data, dev->addr_len);
+	pi->xact_addr_filt = ret;
+	return 0;
+}
+
+#ifdef CONFIG_NET_POLL_CONTROLLER
+static void cxgb_netpoll(struct net_device *dev)
+{
+	struct port_info *pi = netdev_priv(dev);
+	struct adapter *adap = pi->adapter;
+
+	if (adap->flags & USING_MSIX) {
+		int i;
+		struct sge_eth_rxq *rx = &adap->sge.ethrxq[pi->first_qset];
+
+		for (i = pi->nqsets; i; i--, rx++)
+			t4_sge_intr_msix(0, &rx->rspq);
+	} else
+		t4_intr_handler(adap)(0, adap);
+}
+#endif
+
+static const struct net_device_ops cxgb4_netdev_ops = {
+	.ndo_open             = cxgb_open,
+	.ndo_stop             = cxgb_close,
+	.ndo_start_xmit       = t4_eth_xmit,
+	.ndo_select_queue     =	cxgb_select_queue,
+	.ndo_get_stats64      = cxgb_get_stats,
+	.ndo_set_rx_mode      = cxgb_set_rxmode,
+	.ndo_set_mac_address  = cxgb_set_mac_addr,
+	.ndo_set_features     = cxgb_set_features,
+	.ndo_validate_addr    = eth_validate_addr,
+	.ndo_do_ioctl         = cxgb_ioctl,
+	.ndo_change_mtu       = cxgb_change_mtu,
+#ifdef CONFIG_NET_POLL_CONTROLLER
+	.ndo_poll_controller  = cxgb_netpoll,
+#endif
+};
+
+void t4_fatal_err(struct adapter *adap)
+{
+	t4_set_reg_field(adap, SGE_CONTROL, GLOBALENABLE, 0);
+	t4_intr_disable(adap);
+	dev_alert(adap->pdev_dev, "encountered fatal error, adapter stopped\n");
+}
+
+/* Return the specified PCI-E Configuration Space register from our Physical
+ * Function.  We try first via a Firmware LDST Command since we prefer to let
+ * the firmware own all of these registers, but if that fails we go for it
+ * directly ourselves.
+ */
+static u32 t4_read_pcie_cfg4(struct adapter *adap, int reg)
+{
+	struct fw_ldst_cmd ldst_cmd;
+	u32 val;
+	int ret;
+
+	/* Construct and send the Firmware LDST Command to retrieve the
+	 * specified PCI-E Configuration Space register.
+	 */
+	memset(&ldst_cmd, 0, sizeof(ldst_cmd));
+	ldst_cmd.op_to_addrspace =
+		htonl(FW_CMD_OP(FW_LDST_CMD) |
+		      FW_CMD_REQUEST |
+		      FW_CMD_READ |
+		      FW_LDST_CMD_ADDRSPACE(FW_LDST_ADDRSPC_FUNC_PCIE));
+	ldst_cmd.cycles_to_len16 = htonl(FW_LEN16(ldst_cmd));
+	ldst_cmd.u.pcie.select_naccess = FW_LDST_CMD_NACCESS(1);
+	ldst_cmd.u.pcie.ctrl_to_fn =
+		(FW_LDST_CMD_LC | FW_LDST_CMD_FN(adap->fn));
+	ldst_cmd.u.pcie.r = reg;
+	ret = t4_wr_mbox(adap, adap->mbox, &ldst_cmd, sizeof(ldst_cmd),
+			 &ldst_cmd);
+
+	/* If the LDST Command suucceeded, exctract the returned register
+	 * value.  Otherwise read it directly ourself.
+	 */
+	if (ret == 0)
+		val = ntohl(ldst_cmd.u.pcie.data[0]);
+	else
+		t4_hw_pci_read_cfg4(adap, reg, &val);
+
+	return val;
+}
+
+static void setup_memwin(struct adapter *adap)
+{
+	u32 mem_win0_base, mem_win1_base, mem_win2_base, mem_win2_aperture;
+
+	if (is_t4(adap->params.chip)) {
+		u32 bar0;
+
+		/* Truncation intentional: we only read the bottom 32-bits of
+		 * the 64-bit BAR0/BAR1 ...  We use the hardware backdoor
+		 * mechanism to read BAR0 instead of using
+		 * pci_resource_start() because we could be operating from
+		 * within a Virtual Machine which is trapping our accesses to
+		 * our Configuration Space and we need to set up the PCI-E
+		 * Memory Window decoders with the actual addresses which will
+		 * be coming across the PCI-E link.
+		 */
+		bar0 = t4_read_pcie_cfg4(adap, PCI_BASE_ADDRESS_0);
+		bar0 &= PCI_BASE_ADDRESS_MEM_MASK;
+		adap->t4_bar0 = bar0;
+
+		mem_win0_base = bar0 + MEMWIN0_BASE;
+		mem_win1_base = bar0 + MEMWIN1_BASE;
+		mem_win2_base = bar0 + MEMWIN2_BASE;
+		mem_win2_aperture = MEMWIN2_APERTURE;
+	} else {
+		/* For T5, only relative offset inside the PCIe BAR is passed */
+		mem_win0_base = MEMWIN0_BASE;
+		mem_win1_base = MEMWIN1_BASE;
+		mem_win2_base = MEMWIN2_BASE_T5;
+		mem_win2_aperture = MEMWIN2_APERTURE_T5;
+	}
+	t4_write_reg(adap, PCIE_MEM_ACCESS_REG(PCIE_MEM_ACCESS_BASE_WIN, 0),
+		     mem_win0_base | BIR(0) |
+		     WINDOW(ilog2(MEMWIN0_APERTURE) - 10));
+	t4_write_reg(adap, PCIE_MEM_ACCESS_REG(PCIE_MEM_ACCESS_BASE_WIN, 1),
+		     mem_win1_base | BIR(0) |
+		     WINDOW(ilog2(MEMWIN1_APERTURE) - 10));
+	t4_write_reg(adap, PCIE_MEM_ACCESS_REG(PCIE_MEM_ACCESS_BASE_WIN, 2),
+		     mem_win2_base | BIR(0) |
+		     WINDOW(ilog2(mem_win2_aperture) - 10));
+	t4_read_reg(adap, PCIE_MEM_ACCESS_REG(PCIE_MEM_ACCESS_BASE_WIN, 2));
+}
+
+static void setup_memwin_rdma(struct adapter *adap)
+{
+	if (adap->vres.ocq.size) {
+		u32 start;
+		unsigned int sz_kb;
+
+		start = t4_read_pcie_cfg4(adap, PCI_BASE_ADDRESS_2);
+		start &= PCI_BASE_ADDRESS_MEM_MASK;
+		start += OCQ_WIN_OFFSET(adap->pdev, &adap->vres);
+		sz_kb = roundup_pow_of_two(adap->vres.ocq.size) >> 10;
+		t4_write_reg(adap,
+			     PCIE_MEM_ACCESS_REG(PCIE_MEM_ACCESS_BASE_WIN, 3),
+			     start | BIR(1) | WINDOW(ilog2(sz_kb)));
+		t4_write_reg(adap,
+			     PCIE_MEM_ACCESS_REG(PCIE_MEM_ACCESS_OFFSET, 3),
+			     adap->vres.ocq.start);
+		t4_read_reg(adap,
+			    PCIE_MEM_ACCESS_REG(PCIE_MEM_ACCESS_OFFSET, 3));
+	}
+}
+
+static int adap_init1(struct adapter *adap, struct fw_caps_config_cmd *c)
+{
+	u32 v;
+	int ret;
+
+	/* get device capabilities */
+	memset(c, 0, sizeof(*c));
+	c->op_to_write = htonl(FW_CMD_OP(FW_CAPS_CONFIG_CMD) |
+			       FW_CMD_REQUEST | FW_CMD_READ);
+	c->cfvalid_to_len16 = htonl(FW_LEN16(*c));
+	ret = t4_wr_mbox(adap, adap->fn, c, sizeof(*c), c);
+	if (ret < 0)
+		return ret;
+
+	/* select capabilities we'll be using */
+	if (c->niccaps & htons(FW_CAPS_CONFIG_NIC_VM)) {
+		if (!vf_acls)
+			c->niccaps ^= htons(FW_CAPS_CONFIG_NIC_VM);
+		else
+			c->niccaps = htons(FW_CAPS_CONFIG_NIC_VM);
+	} else if (vf_acls) {
+		dev_err(adap->pdev_dev, "virtualization ACLs not supported");
+		return ret;
+	}
+	c->op_to_write = htonl(FW_CMD_OP(FW_CAPS_CONFIG_CMD) |
+			       FW_CMD_REQUEST | FW_CMD_WRITE);
+	ret = t4_wr_mbox(adap, adap->fn, c, sizeof(*c), NULL);
+	if (ret < 0)
+		return ret;
+
+	ret = t4_config_glbl_rss(adap, adap->fn,
+				 FW_RSS_GLB_CONFIG_CMD_MODE_BASICVIRTUAL,
+				 FW_RSS_GLB_CONFIG_CMD_TNLMAPEN |
+				 FW_RSS_GLB_CONFIG_CMD_TNLALLLKP);
+	if (ret < 0)
+		return ret;
+
+	ret = t4_cfg_pfvf(adap, adap->fn, adap->fn, 0, MAX_EGRQ, 64, MAX_INGQ,
+			  0, 0, 4, 0xf, 0xf, 16, FW_CMD_CAP_PF, FW_CMD_CAP_PF);
+	if (ret < 0)
+		return ret;
+
+	t4_sge_init(adap);
+
+	/* tweak some settings */
+	t4_write_reg(adap, TP_SHIFT_CNT, 0x64f8849);
+	t4_write_reg(adap, ULP_RX_TDDP_PSZ, HPZ0(PAGE_SHIFT - 12));
+	t4_write_reg(adap, TP_PIO_ADDR, TP_INGRESS_CONFIG);
+	v = t4_read_reg(adap, TP_PIO_DATA);
+	t4_write_reg(adap, TP_PIO_DATA, v & ~CSUM_HAS_PSEUDO_HDR);
+
+	/* first 4 Tx modulation queues point to consecutive Tx channels */
+	adap->params.tp.tx_modq_map = 0xE4;
+	t4_write_reg(adap, A_TP_TX_MOD_QUEUE_REQ_MAP,
+		     V_TX_MOD_QUEUE_REQ_MAP(adap->params.tp.tx_modq_map));
+
+	/* associate each Tx modulation queue with consecutive Tx channels */
+	v = 0x84218421;
+	t4_write_indirect(adap, TP_PIO_ADDR, TP_PIO_DATA,
+			  &v, 1, A_TP_TX_SCHED_HDR);
+	t4_write_indirect(adap, TP_PIO_ADDR, TP_PIO_DATA,
+			  &v, 1, A_TP_TX_SCHED_FIFO);
+	t4_write_indirect(adap, TP_PIO_ADDR, TP_PIO_DATA,
+			  &v, 1, A_TP_TX_SCHED_PCMD);
+
+#define T4_TX_MODQ_10G_WEIGHT_DEFAULT 16 /* in KB units */
+	if (is_offload(adap)) {
+		t4_write_reg(adap, A_TP_TX_MOD_QUEUE_WEIGHT0,
+			     V_TX_MODQ_WEIGHT0(T4_TX_MODQ_10G_WEIGHT_DEFAULT) |
+			     V_TX_MODQ_WEIGHT1(T4_TX_MODQ_10G_WEIGHT_DEFAULT) |
+			     V_TX_MODQ_WEIGHT2(T4_TX_MODQ_10G_WEIGHT_DEFAULT) |
+			     V_TX_MODQ_WEIGHT3(T4_TX_MODQ_10G_WEIGHT_DEFAULT));
+		t4_write_reg(adap, A_TP_TX_MOD_CHANNEL_WEIGHT,
+			     V_TX_MODQ_WEIGHT0(T4_TX_MODQ_10G_WEIGHT_DEFAULT) |
+			     V_TX_MODQ_WEIGHT1(T4_TX_MODQ_10G_WEIGHT_DEFAULT) |
+			     V_TX_MODQ_WEIGHT2(T4_TX_MODQ_10G_WEIGHT_DEFAULT) |
+			     V_TX_MODQ_WEIGHT3(T4_TX_MODQ_10G_WEIGHT_DEFAULT));
+	}
+
+	/* get basic stuff going */
+	return t4_early_init(adap, adap->fn);
+}
+
+/*
+ * Max # of ATIDs.  The absolute HW max is 16K but we keep it lower.
+ */
+#define MAX_ATIDS 8192U
+
+/*
+ * Phase 0 of initialization: contact FW, obtain config, perform basic init.
+ *
+ * If the firmware we're dealing with has Configuration File support, then
+ * we use that to perform all configuration
+ */
+
+/*
+ * Tweak configuration based on module parameters, etc.  Most of these have
+ * defaults assigned to them by Firmware Configuration Files (if we're using
+ * them) but need to be explicitly set if we're using hard-coded
+ * initialization.  But even in the case of using Firmware Configuration
+ * Files, we'd like to expose the ability to change these via module
+ * parameters so these are essentially common tweaks/settings for
+ * Configuration Files and hard-coded initialization ...
+ */
+static int adap_init0_tweaks(struct adapter *adapter)
+{
+	/*
+	 * Fix up various Host-Dependent Parameters like Page Size, Cache
+	 * Line Size, etc.  The firmware default is for a 4KB Page Size and
+	 * 64B Cache Line Size ...
+	 */
+	t4_fixup_host_params(adapter, PAGE_SIZE, L1_CACHE_BYTES);
+
+	/*
+	 * Process module parameters which affect early initialization.
+	 */
+	if (rx_dma_offset != 2 && rx_dma_offset != 0) {
+		dev_err(&adapter->pdev->dev,
+			"Ignoring illegal rx_dma_offset=%d, using 2\n",
+			rx_dma_offset);
+		rx_dma_offset = 2;
+	}
+	t4_set_reg_field(adapter, SGE_CONTROL,
+			 PKTSHIFT_MASK,
+			 PKTSHIFT(rx_dma_offset));
+
+	/*
+	 * Don't include the "IP Pseudo Header" in CPL_RX_PKT checksums: Linux
+	 * adds the pseudo header itself.
+	 */
+	t4_tp_wr_bits_indirect(adapter, TP_INGRESS_CONFIG,
+			       CSUM_HAS_PSEUDO_HDR, 0);
+
+	return 0;
+}
+
+/*
+ * Attempt to initialize the adapter via a Firmware Configuration File.
+ */
+static int adap_init0_config(struct adapter *adapter, int reset)
+{
+	struct fw_caps_config_cmd caps_cmd;
+	const struct firmware *cf;
+	unsigned long mtype = 0, maddr = 0;
+	u32 finiver, finicsum, cfcsum;
+	int ret;
+	int config_issued = 0;
+	char *fw_config_file, fw_config_file_path[256];
+	char *config_name = NULL;
+
+	/*
+	 * Reset device if necessary.
+	 */
+	if (reset) {
+		ret = t4_fw_reset(adapter, adapter->mbox,
+				  PIORSTMODE | PIORST);
+		if (ret < 0)
+			goto bye;
+	}
+
+	/*
+	 * If we have a T4 configuration file under /lib/firmware/cxgb4/,
+	 * then use that.  Otherwise, use the configuration file stored
+	 * in the adapter flash ...
+	 */
+	switch (CHELSIO_CHIP_VERSION(adapter->params.chip)) {
+	case CHELSIO_T4:
+		fw_config_file = FW4_CFNAME;
+		break;
+	case CHELSIO_T5:
+		fw_config_file = FW5_CFNAME;
+		break;
+	default:
+		dev_err(adapter->pdev_dev, "Device %d is not supported\n",
+		       adapter->pdev->device);
+		ret = -EINVAL;
+		goto bye;
+	}
+
+	ret = request_firmware(&cf, fw_config_file, adapter->pdev_dev);
+	if (ret < 0) {
+		config_name = "On FLASH";
+		mtype = FW_MEMTYPE_CF_FLASH;
+		maddr = t4_flash_cfg_addr(adapter);
+	} else {
+		u32 params[7], val[7];
+
+		sprintf(fw_config_file_path,
+			"/lib/firmware/%s", fw_config_file);
+		config_name = fw_config_file_path;
+
+		if (cf->size >= FLASH_CFG_MAX_SIZE)
+			ret = -ENOMEM;
+		else {
+			params[0] = (FW_PARAMS_MNEM(FW_PARAMS_MNEM_DEV) |
+			     FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DEV_CF));
+			ret = t4_query_params(adapter, adapter->mbox,
+					      adapter->fn, 0, 1, params, val);
+			if (ret == 0) {
+				/*
+				 * For t4_memory_rw() below addresses and
+				 * sizes have to be in terms of multiples of 4
+				 * bytes.  So, if the Configuration File isn't
+				 * a multiple of 4 bytes in length we'll have
+				 * to write that out separately since we can't
+				 * guarantee that the bytes following the
+				 * residual byte in the buffer returned by
+				 * request_firmware() are zeroed out ...
+				 */
+				size_t resid = cf->size & 0x3;
+				size_t size = cf->size & ~0x3;
+				__be32 *data = (__be32 *)cf->data;
+
+				mtype = FW_PARAMS_PARAM_Y_GET(val[0]);
+				maddr = FW_PARAMS_PARAM_Z_GET(val[0]) << 16;
+
+				spin_lock(&adapter->win0_lock);
+				ret = t4_memory_rw(adapter, 0, mtype, maddr,
+						   size, data, T4_MEMORY_WRITE);
+				if (ret == 0 && resid != 0) {
+					union {
+						__be32 word;
+						char buf[4];
+					} last;
+					int i;
+
+					last.word = data[size >> 2];
+					for (i = resid; i < 4; i++)
+						last.buf[i] = 0;
+					ret = t4_memory_rw(adapter, 0, mtype,
+							   maddr + size,
+							   4, &last.word,
+							   T4_MEMORY_WRITE);
+				}
+				spin_unlock(&adapter->win0_lock);
+			}
+		}
+
+		release_firmware(cf);
+		if (ret)
+			goto bye;
+	}
+
+	/*
+	 * Issue a Capability Configuration command to the firmware to get it
+	 * to parse the Configuration File.  We don't use t4_fw_config_file()
+	 * because we want the ability to modify various features after we've
+	 * processed the configuration file ...
+	 */
+	memset(&caps_cmd, 0, sizeof(caps_cmd));
+	caps_cmd.op_to_write =
+		htonl(FW_CMD_OP(FW_CAPS_CONFIG_CMD) |
+		      FW_CMD_REQUEST |
+		      FW_CMD_READ);
+	caps_cmd.cfvalid_to_len16 =
+		htonl(FW_CAPS_CONFIG_CMD_CFVALID |
+		      FW_CAPS_CONFIG_CMD_MEMTYPE_CF(mtype) |
+		      FW_CAPS_CONFIG_CMD_MEMADDR64K_CF(maddr >> 16) |
+		      FW_LEN16(caps_cmd));
+	ret = t4_wr_mbox(adapter, adapter->mbox, &caps_cmd, sizeof(caps_cmd),
+			 &caps_cmd);
+
+	/* If the CAPS_CONFIG failed with an ENOENT (for a Firmware
+	 * Configuration File in FLASH), our last gasp effort is to use the
+	 * Firmware Configuration File which is embedded in the firmware.  A
+	 * very few early versions of the firmware didn't have one embedded
+	 * but we can ignore those.
+	 */
+	if (ret == -ENOENT) {
+		memset(&caps_cmd, 0, sizeof(caps_cmd));
+		caps_cmd.op_to_write =
+			htonl(FW_CMD_OP(FW_CAPS_CONFIG_CMD) |
+					FW_CMD_REQUEST |
+					FW_CMD_READ);
+		caps_cmd.cfvalid_to_len16 = htonl(FW_LEN16(caps_cmd));
+		ret = t4_wr_mbox(adapter, adapter->mbox, &caps_cmd,
+				sizeof(caps_cmd), &caps_cmd);
+		config_name = "Firmware Default";
+	}
+
+	config_issued = 1;
+	if (ret < 0)
+		goto bye;
+
+	finiver = ntohl(caps_cmd.finiver);
+	finicsum = ntohl(caps_cmd.finicsum);
+	cfcsum = ntohl(caps_cmd.cfcsum);
+	if (finicsum != cfcsum)
+		dev_warn(adapter->pdev_dev, "Configuration File checksum "\
+			 "mismatch: [fini] csum=%#x, computed csum=%#x\n",
+			 finicsum, cfcsum);
+
+	/*
+	 * And now tell the firmware to use the configuration we just loaded.
+	 */
+	caps_cmd.op_to_write =
+		htonl(FW_CMD_OP(FW_CAPS_CONFIG_CMD) |
+		      FW_CMD_REQUEST |
+		      FW_CMD_WRITE);
+	caps_cmd.cfvalid_to_len16 = htonl(FW_LEN16(caps_cmd));
+	ret = t4_wr_mbox(adapter, adapter->mbox, &caps_cmd, sizeof(caps_cmd),
+			 NULL);
+	if (ret < 0)
+		goto bye;
+
+	/*
+	 * Tweak configuration based on system architecture, module
+	 * parameters, etc.
+	 */
+	ret = adap_init0_tweaks(adapter);
+	if (ret < 0)
+		goto bye;
+
+	/*
+	 * And finally tell the firmware to initialize itself using the
+	 * parameters from the Configuration File.
+	 */
+	ret = t4_fw_initialize(adapter, adapter->mbox);
+	if (ret < 0)
+		goto bye;
+
+	/*
+	 * Return successfully and note that we're operating with parameters
+	 * not supplied by the driver, rather than from hard-wired
+	 * initialization constants burried in the driver.
+	 */
+	adapter->flags |= USING_SOFT_PARAMS;
+	dev_info(adapter->pdev_dev, "Successfully configured using Firmware "\
+		 "Configuration File \"%s\", version %#x, computed checksum %#x\n",
+		 config_name, finiver, cfcsum);
+	return 0;
+
+	/*
+	 * Something bad happened.  Return the error ...  (If the "error"
+	 * is that there's no Configuration File on the adapter we don't
+	 * want to issue a warning since this is fairly common.)
+	 */
+bye:
+	if (config_issued && ret != -ENOENT)
+		dev_warn(adapter->pdev_dev, "\"%s\" configuration file error %d\n",
+			 config_name, -ret);
+	return ret;
+}
+
+/*
+ * Attempt to initialize the adapter via hard-coded, driver supplied
+ * parameters ...
+ */
+static int adap_init0_no_config(struct adapter *adapter, int reset)
+{
+	struct sge *s = &adapter->sge;
+	struct fw_caps_config_cmd caps_cmd;
+	u32 v;
+	int i, ret;
+
+	/*
+	 * Reset device if necessary
+	 */
+	if (reset) {
+		ret = t4_fw_reset(adapter, adapter->mbox,
+				  PIORSTMODE | PIORST);
+		if (ret < 0)
+			goto bye;
+	}
+
+	/*
+	 * Get device capabilities and select which we'll be using.
+	 */
+	memset(&caps_cmd, 0, sizeof(caps_cmd));
+	caps_cmd.op_to_write = htonl(FW_CMD_OP(FW_CAPS_CONFIG_CMD) |
+				     FW_CMD_REQUEST | FW_CMD_READ);
+	caps_cmd.cfvalid_to_len16 = htonl(FW_LEN16(caps_cmd));
+	ret = t4_wr_mbox(adapter, adapter->mbox, &caps_cmd, sizeof(caps_cmd),
+			 &caps_cmd);
+	if (ret < 0)
+		goto bye;
+
+	if (caps_cmd.niccaps & htons(FW_CAPS_CONFIG_NIC_VM)) {
+		if (!vf_acls)
+			caps_cmd.niccaps ^= htons(FW_CAPS_CONFIG_NIC_VM);
+		else
+			caps_cmd.niccaps = htons(FW_CAPS_CONFIG_NIC_VM);
+	} else if (vf_acls) {
+		dev_err(adapter->pdev_dev, "virtualization ACLs not supported");
+		goto bye;
+	}
+	caps_cmd.op_to_write = htonl(FW_CMD_OP(FW_CAPS_CONFIG_CMD) |
+			      FW_CMD_REQUEST | FW_CMD_WRITE);
+	ret = t4_wr_mbox(adapter, adapter->mbox, &caps_cmd, sizeof(caps_cmd),
+			 NULL);
+	if (ret < 0)
+		goto bye;
+
+	/*
+	 * Tweak configuration based on system architecture, module
+	 * parameters, etc.
+	 */
+	ret = adap_init0_tweaks(adapter);
+	if (ret < 0)
+		goto bye;
+
+	/*
+	 * Select RSS Global Mode we want to use.  We use "Basic Virtual"
+	 * mode which maps each Virtual Interface to its own section of
+	 * the RSS Table and we turn on all map and hash enables ...
+	 */
+	adapter->flags |= RSS_TNLALLLOOKUP;
+	ret = t4_config_glbl_rss(adapter, adapter->mbox,
+				 FW_RSS_GLB_CONFIG_CMD_MODE_BASICVIRTUAL,
+				 FW_RSS_GLB_CONFIG_CMD_TNLMAPEN |
+				 FW_RSS_GLB_CONFIG_CMD_HASHTOEPLITZ |
+				 ((adapter->flags & RSS_TNLALLLOOKUP) ?
+					FW_RSS_GLB_CONFIG_CMD_TNLALLLKP : 0));
+	if (ret < 0)
+		goto bye;
+
+	/*
+	 * Set up our own fundamental resource provisioning ...
+	 */
+	ret = t4_cfg_pfvf(adapter, adapter->mbox, adapter->fn, 0,
+			  PFRES_NEQ, PFRES_NETHCTRL,
+			  PFRES_NIQFLINT, PFRES_NIQ,
+			  PFRES_TC, PFRES_NVI,
+			  FW_PFVF_CMD_CMASK_MASK,
+			  pfvfres_pmask(adapter, adapter->fn, 0),
+			  PFRES_NEXACTF,
+			  PFRES_R_CAPS, PFRES_WX_CAPS);
+	if (ret < 0)
+		goto bye;
+
+	/*
+	 * Perform low level SGE initialization.  We need to do this before we
+	 * send the firmware the INITIALIZE command because that will cause
+	 * any other PF Drivers which are waiting for the Master
+	 * Initialization to proceed forward.
+	 */
+	for (i = 0; i < SGE_NTIMERS - 1; i++)
+		s->timer_val[i] = min(intr_holdoff[i], MAX_SGE_TIMERVAL);
+	s->timer_val[SGE_NTIMERS - 1] = MAX_SGE_TIMERVAL;
+	s->counter_val[0] = 1;
+	for (i = 1; i < SGE_NCOUNTERS; i++)
+		s->counter_val[i] = min(intr_cnt[i - 1],
+					THRESHOLD_0_GET(THRESHOLD_0_MASK));
+	t4_sge_init(adapter);
+
+#ifdef CONFIG_PCI_IOV
+	/*
+	 * Provision resource limits for Virtual Functions.  We currently
+	 * grant them all the same static resource limits except for the Port
+	 * Access Rights Mask which we're assigning based on the PF.  All of
+	 * the static provisioning stuff for both the PF and VF really needs
+	 * to be managed in a persistent manner for each device which the
+	 * firmware controls.
+	 */
+	{
+		int pf, vf;
+
+		for (pf = 0; pf < ARRAY_SIZE(num_vf); pf++) {
+			if (num_vf[pf] <= 0)
+				continue;
+
+			/* VF numbering starts at 1! */
+			for (vf = 1; vf <= num_vf[pf]; vf++) {
+				ret = t4_cfg_pfvf(adapter, adapter->mbox,
+						  pf, vf,
+						  VFRES_NEQ, VFRES_NETHCTRL,
+						  VFRES_NIQFLINT, VFRES_NIQ,
+						  VFRES_TC, VFRES_NVI,
+						  FW_PFVF_CMD_CMASK_MASK,
+						  pfvfres_pmask(
+						  adapter, pf, vf),
+						  VFRES_NEXACTF,
+						  VFRES_R_CAPS, VFRES_WX_CAPS);
+				if (ret < 0)
+					dev_warn(adapter->pdev_dev,
+						 "failed to "\
+						 "provision pf/vf=%d/%d; "
+						 "err=%d\n", pf, vf, ret);
+			}
+		}
+	}
+#endif
+
+	/*
+	 * Set up the default filter mode.  Later we'll want to implement this
+	 * via a firmware command, etc. ...  This needs to be done before the
+	 * firmare initialization command ...  If the selected set of fields
+	 * isn't equal to the default value, we'll need to make sure that the
+	 * field selections will fit in the 36-bit budget.
+	 */
+	if (tp_vlan_pri_map != TP_VLAN_PRI_MAP_DEFAULT) {
+		int j, bits = 0;
+
+		for (j = TP_VLAN_PRI_MAP_FIRST; j <= TP_VLAN_PRI_MAP_LAST; j++)
+			switch (tp_vlan_pri_map & (1 << j)) {
+			case 0:
+				/* compressed filter field not enabled */
+				break;
+			case FCOE_MASK:
+				bits +=  1;
+				break;
+			case PORT_MASK:
+				bits +=  3;
+				break;
+			case VNIC_ID_MASK:
+				bits += 17;
+				break;
+			case VLAN_MASK:
+				bits += 17;
+				break;
+			case TOS_MASK:
+				bits +=  8;
+				break;
+			case PROTOCOL_MASK:
+				bits +=  8;
+				break;
+			case ETHERTYPE_MASK:
+				bits += 16;
+				break;
+			case MACMATCH_MASK:
+				bits +=  9;
+				break;
+			case MPSHITTYPE_MASK:
+				bits +=  3;
+				break;
+			case FRAGMENTATION_MASK:
+				bits +=  1;
+				break;
+			}
+
+		if (bits > 36) {
+			dev_err(adapter->pdev_dev,
+				"tp_vlan_pri_map=%#x needs %d bits > 36;"\
+				" using %#x\n", tp_vlan_pri_map, bits,
+				TP_VLAN_PRI_MAP_DEFAULT);
+			tp_vlan_pri_map = TP_VLAN_PRI_MAP_DEFAULT;
+		}
+	}
+	v = tp_vlan_pri_map;
+	t4_write_indirect(adapter, TP_PIO_ADDR, TP_PIO_DATA,
+			  &v, 1, TP_VLAN_PRI_MAP);
+
+	/*
+	 * We need Five Tuple Lookup mode to be set in TP_GLOBAL_CONFIG order
+	 * to support any of the compressed filter fields above.  Newer
+	 * versions of the firmware do this automatically but it doesn't hurt
+	 * to set it here.  Meanwhile, we do _not_ need to set Lookup Every
+	 * Packet in TP_INGRESS_CONFIG to support matching non-TCP packets
+	 * since the firmware automatically turns this on and off when we have
+	 * a non-zero number of filters active (since it does have a
+	 * performance impact).
+	 */
+	if (tp_vlan_pri_map)
+		t4_set_reg_field(adapter, TP_GLOBAL_CONFIG,
+				 FIVETUPLELOOKUP_MASK,
+				 FIVETUPLELOOKUP_MASK);
+
+	/*
+	 * Tweak some settings.
+	 */
+	t4_write_reg(adapter, TP_SHIFT_CNT, SYNSHIFTMAX(6) |
+		     RXTSHIFTMAXR1(4) | RXTSHIFTMAXR2(15) |
+		     PERSHIFTBACKOFFMAX(8) | PERSHIFTMAX(8) |
+		     KEEPALIVEMAXR1(4) | KEEPALIVEMAXR2(9));
+
+	/*
+	 * Get basic stuff going by issuing the Firmware Initialize command.
+	 * Note that this _must_ be after all PFVF commands ...
+	 */
+	ret = t4_fw_initialize(adapter, adapter->mbox);
+	if (ret < 0)
+		goto bye;
+
+	/*
+	 * Return successfully!
+	 */
+	dev_info(adapter->pdev_dev, "Successfully configured using built-in "\
+		 "driver parameters\n");
+	return 0;
+
+	/*
+	 * Something bad happened.  Return the error ...
+	 */
+bye:
+	return ret;
+}
+
+static struct fw_info fw_info_array[] = {
+	{
+		.chip = CHELSIO_T4,
+		.fs_name = FW4_CFNAME,
+		.fw_mod_name = FW4_FNAME,
+		.fw_hdr = {
+			.chip = FW_HDR_CHIP_T4,
+			.fw_ver = __cpu_to_be32(FW_VERSION(T4)),
+			.intfver_nic = FW_INTFVER(T4, NIC),
+			.intfver_vnic = FW_INTFVER(T4, VNIC),
+			.intfver_ri = FW_INTFVER(T4, RI),
+			.intfver_iscsi = FW_INTFVER(T4, ISCSI),
+			.intfver_fcoe = FW_INTFVER(T4, FCOE),
+		},
+	}, {
+		.chip = CHELSIO_T5,
+		.fs_name = FW5_CFNAME,
+		.fw_mod_name = FW5_FNAME,
+		.fw_hdr = {
+			.chip = FW_HDR_CHIP_T5,
+			.fw_ver = __cpu_to_be32(FW_VERSION(T5)),
+			.intfver_nic = FW_INTFVER(T5, NIC),
+			.intfver_vnic = FW_INTFVER(T5, VNIC),
+			.intfver_ri = FW_INTFVER(T5, RI),
+			.intfver_iscsi = FW_INTFVER(T5, ISCSI),
+			.intfver_fcoe = FW_INTFVER(T5, FCOE),
+		},
+	}
+};
+
+static struct fw_info *find_fw_info(int chip)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(fw_info_array); i++) {
+		if (fw_info_array[i].chip == chip)
+			return &fw_info_array[i];
+	}
+	return NULL;
+}
+
+/*
+ * Phase 0 of initialization: contact FW, obtain config, perform basic init.
+ */
+static int adap_init0(struct adapter *adap)
+{
+	int ret;
+	u32 v, port_vec;
+	enum dev_state state;
+	u32 params[7], val[7];
+	struct fw_caps_config_cmd caps_cmd;
+	int reset = 1;
+
+	/*
+	 * Contact FW, advertising Master capability (and potentially forcing
+	 * ourselves as the Master PF if our module parameter force_init is
+	 * set).
+	 */
+	ret = t4_fw_hello(adap, adap->mbox, adap->fn,
+			  force_init ? MASTER_MUST : MASTER_MAY,
+			  &state);
+	if (ret < 0) {
+		dev_err(adap->pdev_dev, "could not connect to FW, error %d\n",
+			ret);
+		return ret;
+	}
+	if (ret == adap->mbox)
+		adap->flags |= MASTER_PF;
+	if (force_init && state == DEV_STATE_INIT)
+		state = DEV_STATE_UNINIT;
+
+	/*
+	 * If we're the Master PF Driver and the device is uninitialized,
+	 * then let's consider upgrading the firmware ...  (We always want
+	 * to check the firmware version number in order to A. get it for
+	 * later reporting and B. to warn if the currently loaded firmware
+	 * is excessively mismatched relative to the driver.)
+	 */
+	t4_get_fw_version(adap, &adap->params.fw_vers);
+	t4_get_tp_version(adap, &adap->params.tp_vers);
+	if ((adap->flags & MASTER_PF) && state != DEV_STATE_INIT) {
+		struct fw_info *fw_info;
+		struct fw_hdr *card_fw;
+		const struct firmware *fw;
+		const u8 *fw_data = NULL;
+		unsigned int fw_size = 0;
+
+		/* This is the firmware whose headers the driver was compiled
+		 * against
+		 */
+		fw_info = find_fw_info(CHELSIO_CHIP_VERSION(adap->params.chip));
+		if (fw_info == NULL) {
+			dev_err(adap->pdev_dev,
+				"unable to get firmware info for chip %d.\n",
+				CHELSIO_CHIP_VERSION(adap->params.chip));
+			return -EINVAL;
+		}
+
+		/* allocate memory to read the header of the firmware on the
+		 * card
+		 */
+		card_fw = t4_alloc_mem(sizeof(*card_fw));
+
+		/* Get FW from from /lib/firmware/ */
+		ret = request_firmware(&fw, fw_info->fw_mod_name,
+				       adap->pdev_dev);
+		if (ret < 0) {
+			dev_err(adap->pdev_dev,
+				"unable to load firmware image %s, error %d\n",
+				fw_info->fw_mod_name, ret);
+		} else {
+			fw_data = fw->data;
+			fw_size = fw->size;
+		}
+
+		/* upgrade FW logic */
+		ret = t4_prep_fw(adap, fw_info, fw_data, fw_size, card_fw,
+				 state, &reset);
+
+		/* Cleaning up */
+		if (fw != NULL)
+			release_firmware(fw);
+		t4_free_mem(card_fw);
+
+		if (ret < 0)
+			goto bye;
+	}
+
+	/*
+	 * Grab VPD parameters.  This should be done after we establish a
+	 * connection to the firmware since some of the VPD parameters
+	 * (notably the Core Clock frequency) are retrieved via requests to
+	 * the firmware.  On the other hand, we need these fairly early on
+	 * so we do this right after getting ahold of the firmware.
+	 */
+	ret = get_vpd_params(adap, &adap->params.vpd);
+	if (ret < 0)
+		goto bye;
+
+	/*
+	 * Find out what ports are available to us.  Note that we need to do
+	 * this before calling adap_init0_no_config() since it needs nports
+	 * and portvec ...
+	 */
+	v =
+	    FW_PARAMS_MNEM(FW_PARAMS_MNEM_DEV) |
+	    FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DEV_PORTVEC);
+	ret = t4_query_params(adap, adap->mbox, adap->fn, 0, 1, &v, &port_vec);
+	if (ret < 0)
+		goto bye;
+
+	adap->params.nports = hweight32(port_vec);
+	adap->params.portvec = port_vec;
+
+	/*
+	 * If the firmware is initialized already (and we're not forcing a
+	 * master initialization), note that we're living with existing
+	 * adapter parameters.  Otherwise, it's time to try initializing the
+	 * adapter ...
+	 */
+	if (state == DEV_STATE_INIT) {
+		dev_info(adap->pdev_dev, "Coming up as %s: "\
+			 "Adapter already initialized\n",
+			 adap->flags & MASTER_PF ? "MASTER" : "SLAVE");
+		adap->flags |= USING_SOFT_PARAMS;
+	} else {
+		dev_info(adap->pdev_dev, "Coming up as MASTER: "\
+			 "Initializing adapter\n");
+
+		/*
+		 * If the firmware doesn't support Configuration
+		 * Files warn user and exit,
+		 */
+		if (ret < 0)
+			dev_warn(adap->pdev_dev, "Firmware doesn't support "
+				 "configuration file.\n");
+		if (force_old_init)
+			ret = adap_init0_no_config(adap, reset);
+		else {
+			/*
+			 * Find out whether we're dealing with a version of
+			 * the firmware which has configuration file support.
+			 */
+			params[0] = (FW_PARAMS_MNEM(FW_PARAMS_MNEM_DEV) |
+				     FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DEV_CF));
+			ret = t4_query_params(adap, adap->mbox, adap->fn, 0, 1,
+					      params, val);
+
+			/*
+			 * If the firmware doesn't support Configuration
+			 * Files, use the old Driver-based, hard-wired
+			 * initialization.  Otherwise, try using the
+			 * Configuration File support and fall back to the
+			 * Driver-based initialization if there's no
+			 * Configuration File found.
+			 */
+			if (ret < 0)
+				ret = adap_init0_no_config(adap, reset);
+			else {
+				/*
+				 * The firmware provides us with a memory
+				 * buffer where we can load a Configuration
+				 * File from the host if we want to override
+				 * the Configuration File in flash.
+				 */
+
+				ret = adap_init0_config(adap, reset);
+				if (ret == -ENOENT) {
+					dev_info(adap->pdev_dev,
+					    "No Configuration File present "
+					    "on adapter. Using hard-wired "
+					    "configuration parameters.\n");
+					ret = adap_init0_no_config(adap, reset);
+				}
+			}
+		}
+		if (ret < 0) {
+			dev_err(adap->pdev_dev,
+				"could not initialize adapter, error %d\n",
+				-ret);
+			goto bye;
+		}
+	}
+
+	/*
+	 * If we're living with non-hard-coded parameters (either from a
+	 * Firmware Configuration File or values programmed by a different PF
+	 * Driver), give the SGE code a chance to pull in anything that it
+	 * needs ...  Note that this must be called after we retrieve our VPD
+	 * parameters in order to know how to convert core ticks to seconds.
+	 */
+	if (adap->flags & USING_SOFT_PARAMS) {
+		ret = t4_sge_init(adap);
+		if (ret < 0)
+			goto bye;
+	}
+
+	if (is_bypass_device(adap->pdev->device))
+		adap->params.bypass = 1;
+
+	/*
+	 * Grab some of our basic fundamental operating parameters.
+	 */
+#define FW_PARAM_DEV(param) \
+	(FW_PARAMS_MNEM(FW_PARAMS_MNEM_DEV) | \
+	FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DEV_##param))
+
+#define FW_PARAM_PFVF(param) \
+	FW_PARAMS_MNEM(FW_PARAMS_MNEM_PFVF) | \
+	FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_PFVF_##param)|  \
+	FW_PARAMS_PARAM_Y(0) | \
+	FW_PARAMS_PARAM_Z(0)
+
+	params[0] = FW_PARAM_PFVF(EQ_START);
+	params[1] = FW_PARAM_PFVF(L2T_START);
+	params[2] = FW_PARAM_PFVF(L2T_END);
+	params[3] = FW_PARAM_PFVF(FILTER_START);
+	params[4] = FW_PARAM_PFVF(FILTER_END);
+	params[5] = FW_PARAM_PFVF(IQFLINT_START);
+	ret = t4_query_params(adap, adap->mbox, adap->fn, 0, 6, params, val);
+	if (ret < 0)
+		goto bye;
+	adap->sge.egr_start = val[0];
+	adap->l2t_start = val[1];
+	adap->l2t_end = val[2];
+	adap->tids.ftid_base = val[3];
+	adap->tids.nftids = val[4] - val[3] + 1;
+	adap->sge.ingr_start = val[5];
+
+	/* query params related to active filter region */
+	params[0] = FW_PARAM_PFVF(ACTIVE_FILTER_START);
+	params[1] = FW_PARAM_PFVF(ACTIVE_FILTER_END);
+	ret = t4_query_params(adap, adap->mbox, adap->fn, 0, 2, params, val);
+	/* If Active filter size is set we enable establishing
+	 * offload connection through firmware work request
+	 */
+	if ((val[0] != val[1]) && (ret >= 0)) {
+		adap->flags |= FW_OFLD_CONN;
+		adap->tids.aftid_base = val[0];
+		adap->tids.aftid_end = val[1];
+	}
+
+	/* If we're running on newer firmware, let it know that we're
+	 * prepared to deal with encapsulated CPL messages.  Older
+	 * firmware won't understand this and we'll just get
+	 * unencapsulated messages ...
+	 */
+	params[0] = FW_PARAM_PFVF(CPLFW4MSG_ENCAP);
+	val[0] = 1;
+	(void) t4_set_params(adap, adap->mbox, adap->fn, 0, 1, params, val);
+
+	/*
+	 * Find out whether we're allowed to use the T5+ ULPTX MEMWRITE DSGL
+	 * capability.  Earlier versions of the firmware didn't have the
+	 * ULPTX_MEMWRITE_DSGL so we'll interpret a query failure as no
+	 * permission to use ULPTX MEMWRITE DSGL.
+	 */
+	if (is_t4(adap->params.chip)) {
+		adap->params.ulptx_memwrite_dsgl = false;
+	} else {
+		params[0] = FW_PARAM_DEV(ULPTX_MEMWRITE_DSGL);
+		ret = t4_query_params(adap, adap->mbox, adap->fn, 0,
+				      1, params, val);
+		adap->params.ulptx_memwrite_dsgl = (ret == 0 && val[0] != 0);
+	}
+
+	/*
+	 * Get device capabilities so we can determine what resources we need
+	 * to manage.
+	 */
+	memset(&caps_cmd, 0, sizeof(caps_cmd));
+	caps_cmd.op_to_write = htonl(FW_CMD_OP(FW_CAPS_CONFIG_CMD) |
+				     FW_CMD_REQUEST | FW_CMD_READ);
+	caps_cmd.cfvalid_to_len16 = htonl(FW_LEN16(caps_cmd));
+	ret = t4_wr_mbox(adap, adap->mbox, &caps_cmd, sizeof(caps_cmd),
+			 &caps_cmd);
+	if (ret < 0)
+		goto bye;
+
+	if (caps_cmd.ofldcaps) {
+		/* query offload-related parameters */
+		params[0] = FW_PARAM_DEV(NTID);
+		params[1] = FW_PARAM_PFVF(SERVER_START);
+		params[2] = FW_PARAM_PFVF(SERVER_END);
+		params[3] = FW_PARAM_PFVF(TDDP_START);
+		params[4] = FW_PARAM_PFVF(TDDP_END);
+		params[5] = FW_PARAM_DEV(FLOWC_BUFFIFO_SZ);
+		ret = t4_query_params(adap, adap->mbox, adap->fn, 0, 6,
+				      params, val);
+		if (ret < 0)
+			goto bye;
+		adap->tids.ntids = val[0];
+		adap->tids.natids = min(adap->tids.ntids / 2, MAX_ATIDS);
+		adap->tids.stid_base = val[1];
+		adap->tids.nstids = val[2] - val[1] + 1;
+		/*
+		 * Setup server filter region. Divide the availble filter
+		 * region into two parts. Regular filters get 1/3rd and server
+		 * filters get 2/3rd part. This is only enabled if workarond
+		 * path is enabled.
+		 * 1. For regular filters.
+		 * 2. Server filter: This are special filters which are used
+		 * to redirect SYN packets to offload queue.
+		 */
+		if (adap->flags & FW_OFLD_CONN && !is_bypass(adap)) {
+			adap->tids.sftid_base = adap->tids.ftid_base +
+					DIV_ROUND_UP(adap->tids.nftids, 3);
+			adap->tids.nsftids = adap->tids.nftids -
+					 DIV_ROUND_UP(adap->tids.nftids, 3);
+			adap->tids.nftids = adap->tids.sftid_base -
+						adap->tids.ftid_base;
+		}
+		adap->vres.ddp.start = val[3];
+		adap->vres.ddp.size = val[4] - val[3] + 1;
+		adap->params.ofldq_wr_cred = val[5];
+
+		adap->params.offload = 1;
+	}
+	if (caps_cmd.rdmacaps) {
+		params[0] = FW_PARAM_PFVF(STAG_START);
+		params[1] = FW_PARAM_PFVF(STAG_END);
+		params[2] = FW_PARAM_PFVF(RQ_START);
+		params[3] = FW_PARAM_PFVF(RQ_END);
+		params[4] = FW_PARAM_PFVF(PBL_START);
+		params[5] = FW_PARAM_PFVF(PBL_END);
+		ret = t4_query_params(adap, adap->mbox, adap->fn, 0, 6,
+				      params, val);
+		if (ret < 0)
+			goto bye;
+		adap->vres.stag.start = val[0];
+		adap->vres.stag.size = val[1] - val[0] + 1;
+		adap->vres.rq.start = val[2];
+		adap->vres.rq.size = val[3] - val[2] + 1;
+		adap->vres.pbl.start = val[4];
+		adap->vres.pbl.size = val[5] - val[4] + 1;
+
+		params[0] = FW_PARAM_PFVF(SQRQ_START);
+		params[1] = FW_PARAM_PFVF(SQRQ_END);
+		params[2] = FW_PARAM_PFVF(CQ_START);
+		params[3] = FW_PARAM_PFVF(CQ_END);
+		params[4] = FW_PARAM_PFVF(OCQ_START);
+		params[5] = FW_PARAM_PFVF(OCQ_END);
+		ret = t4_query_params(adap, adap->mbox, adap->fn, 0, 6, params,
+				      val);
+		if (ret < 0)
+			goto bye;
+		adap->vres.qp.start = val[0];
+		adap->vres.qp.size = val[1] - val[0] + 1;
+		adap->vres.cq.start = val[2];
+		adap->vres.cq.size = val[3] - val[2] + 1;
+		adap->vres.ocq.start = val[4];
+		adap->vres.ocq.size = val[5] - val[4] + 1;
+
+		params[0] = FW_PARAM_DEV(MAXORDIRD_QP);
+		params[1] = FW_PARAM_DEV(MAXIRD_ADAPTER);
+		ret = t4_query_params(adap, adap->mbox, adap->fn, 0, 2, params,
+				      val);
+		if (ret < 0) {
+			adap->params.max_ordird_qp = 8;
+			adap->params.max_ird_adapter = 32 * adap->tids.ntids;
+			ret = 0;
+		} else {
+			adap->params.max_ordird_qp = val[0];
+			adap->params.max_ird_adapter = val[1];
+		}
+		dev_info(adap->pdev_dev,
+			 "max_ordird_qp %d max_ird_adapter %d\n",
+			 adap->params.max_ordird_qp,
+			 adap->params.max_ird_adapter);
+	}
+	if (caps_cmd.iscsicaps) {
+		params[0] = FW_PARAM_PFVF(ISCSI_START);
+		params[1] = FW_PARAM_PFVF(ISCSI_END);
+		ret = t4_query_params(adap, adap->mbox, adap->fn, 0, 2,
+				      params, val);
+		if (ret < 0)
+			goto bye;
+		adap->vres.iscsi.start = val[0];
+		adap->vres.iscsi.size = val[1] - val[0] + 1;
+	}
+#undef FW_PARAM_PFVF
+#undef FW_PARAM_DEV
+
+	/* The MTU/MSS Table is initialized by now, so load their values.  If
+	 * we're initializing the adapter, then we'll make any modifications
+	 * we want to the MTU/MSS Table and also initialize the congestion
+	 * parameters.
+	 */
+	t4_read_mtu_tbl(adap, adap->params.mtus, NULL);
+	if (state != DEV_STATE_INIT) {
+		int i;
+
+		/* The default MTU Table contains values 1492 and 1500.
+		 * However, for TCP, it's better to have two values which are
+		 * a multiple of 8 +/- 4 bytes apart near this popular MTU.
+		 * This allows us to have a TCP Data Payload which is a
+		 * multiple of 8 regardless of what combination of TCP Options
+		 * are in use (always a multiple of 4 bytes) which is
+		 * important for performance reasons.  For instance, if no
+		 * options are in use, then we have a 20-byte IP header and a
+		 * 20-byte TCP header.  In this case, a 1500-byte MSS would
+		 * result in a TCP Data Payload of 1500 - 40 == 1460 bytes
+		 * which is not a multiple of 8.  So using an MSS of 1488 in
+		 * this case results in a TCP Data Payload of 1448 bytes which
+		 * is a multiple of 8.  On the other hand, if 12-byte TCP Time
+		 * Stamps have been negotiated, then an MTU of 1500 bytes
+		 * results in a TCP Data Payload of 1448 bytes which, as
+		 * above, is a multiple of 8 bytes ...
+		 */
+		for (i = 0; i < NMTUS; i++)
+			if (adap->params.mtus[i] == 1492) {
+				adap->params.mtus[i] = 1488;
+				break;
+			}
+
+		t4_load_mtus(adap, adap->params.mtus, adap->params.a_wnd,
+			     adap->params.b_wnd);
+	}
+	t4_init_tp_params(adap);
+	adap->flags |= FW_OK;
+	return 0;
+
+	/*
+	 * Something bad happened.  If a command timed out or failed with EIO
+	 * FW does not operate within its spec or something catastrophic
+	 * happened to HW/FW, stop issuing commands.
+	 */
+bye:
+	if (ret != -ETIMEDOUT && ret != -EIO)
+		t4_fw_bye(adap, adap->mbox);
+	return ret;
+}
+
+/* EEH callbacks */
+
+static pci_ers_result_t eeh_err_detected(struct pci_dev *pdev,
+					 pci_channel_state_t state)
+{
+	int i;
+	struct adapter *adap = pci_get_drvdata(pdev);
+
+	if (!adap)
+		goto out;
+
+	rtnl_lock();
+	adap->flags &= ~FW_OK;
+	notify_ulds(adap, CXGB4_STATE_START_RECOVERY);
+	spin_lock(&adap->stats_lock);
+	for_each_port(adap, i) {
+		struct net_device *dev = adap->port[i];
+
+		netif_device_detach(dev);
+		netif_carrier_off(dev);
+	}
+	spin_unlock(&adap->stats_lock);
+	if (adap->flags & FULL_INIT_DONE)
+		cxgb_down(adap);
+	rtnl_unlock();
+	if ((adap->flags & DEV_ENABLED)) {
+		pci_disable_device(pdev);
+		adap->flags &= ~DEV_ENABLED;
+	}
+out:	return state == pci_channel_io_perm_failure ?
+		PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_NEED_RESET;
+}
+
+static pci_ers_result_t eeh_slot_reset(struct pci_dev *pdev)
+{
+	int i, ret;
+	struct fw_caps_config_cmd c;
+	struct adapter *adap = pci_get_drvdata(pdev);
+
+	if (!adap) {
+		pci_restore_state(pdev);
+		pci_save_state(pdev);
+		return PCI_ERS_RESULT_RECOVERED;
+	}
+
+	if (!(adap->flags & DEV_ENABLED)) {
+		if (pci_enable_device(pdev)) {
+			dev_err(&pdev->dev, "Cannot reenable PCI "
+					    "device after reset\n");
+			return PCI_ERS_RESULT_DISCONNECT;
+		}
+		adap->flags |= DEV_ENABLED;
+	}
+
+	pci_set_master(pdev);
+	pci_restore_state(pdev);
+	pci_save_state(pdev);
+	pci_cleanup_aer_uncorrect_error_status(pdev);
+
+	if (t4_wait_dev_ready(adap->regs) < 0)
+		return PCI_ERS_RESULT_DISCONNECT;
+	if (t4_fw_hello(adap, adap->fn, adap->fn, MASTER_MUST, NULL) < 0)
+		return PCI_ERS_RESULT_DISCONNECT;
+	adap->flags |= FW_OK;
+	if (adap_init1(adap, &c))
+		return PCI_ERS_RESULT_DISCONNECT;
+
+	for_each_port(adap, i) {
+		struct port_info *p = adap2pinfo(adap, i);
+
+		ret = t4_alloc_vi(adap, adap->fn, p->tx_chan, adap->fn, 0, 1,
+				  NULL, NULL);
+		if (ret < 0)
+			return PCI_ERS_RESULT_DISCONNECT;
+		p->viid = ret;
+		p->xact_addr_filt = -1;
+	}
+
+	t4_load_mtus(adap, adap->params.mtus, adap->params.a_wnd,
+		     adap->params.b_wnd);
+	setup_memwin(adap);
+	if (cxgb_up(adap))
+		return PCI_ERS_RESULT_DISCONNECT;
+	return PCI_ERS_RESULT_RECOVERED;
+}
+
+static void eeh_resume(struct pci_dev *pdev)
+{
+	int i;
+	struct adapter *adap = pci_get_drvdata(pdev);
+
+	if (!adap)
+		return;
+
+	rtnl_lock();
+	for_each_port(adap, i) {
+		struct net_device *dev = adap->port[i];
+
+		if (netif_running(dev)) {
+			link_start(dev);
+			cxgb_set_rxmode(dev);
+		}
+		netif_device_attach(dev);
+	}
+	rtnl_unlock();
+}
+
+static const struct pci_error_handlers cxgb4_eeh = {
+	.error_detected = eeh_err_detected,
+	.slot_reset     = eeh_slot_reset,
+	.resume         = eeh_resume,
+};
+
+static inline bool is_x_10g_port(const struct link_config *lc)
+{
+	return (lc->supported & FW_PORT_CAP_SPEED_10G) != 0 ||
+	       (lc->supported & FW_PORT_CAP_SPEED_40G) != 0;
+}
+
+static inline void init_rspq(struct adapter *adap, struct sge_rspq *q,
+			     unsigned int us, unsigned int cnt,
+			     unsigned int size, unsigned int iqe_size)
+{
+	q->adap = adap;
+	set_rspq_intr_params(q, us, cnt);
+	q->iqe_len = iqe_size;
+	q->size = size;
+}
+
+/*
+ * Perform default configuration of DMA queues depending on the number and type
+ * of ports we found and the number of available CPUs.  Most settings can be
+ * modified by the admin prior to actual use.
+ */
+static void cfg_queues(struct adapter *adap)
+{
+	struct sge *s = &adap->sge;
+	int i, n10g = 0, qidx = 0;
+#ifndef CONFIG_CHELSIO_T4_DCB
+	int q10g = 0;
+#endif
+	int ciq_size;
+
+	for_each_port(adap, i)
+		n10g += is_x_10g_port(&adap2pinfo(adap, i)->link_cfg);
+#ifdef CONFIG_CHELSIO_T4_DCB
+	/* For Data Center Bridging support we need to be able to support up
+	 * to 8 Traffic Priorities; each of which will be assigned to its
+	 * own TX Queue in order to prevent Head-Of-Line Blocking.
+	 */
+	if (adap->params.nports * 8 > MAX_ETH_QSETS) {
+		dev_err(adap->pdev_dev, "MAX_ETH_QSETS=%d < %d!\n",
+			MAX_ETH_QSETS, adap->params.nports * 8);
+		BUG_ON(1);
+	}
+
+	for_each_port(adap, i) {
+		struct port_info *pi = adap2pinfo(adap, i);
+
+		pi->first_qset = qidx;
+		pi->nqsets = 8;
+		qidx += pi->nqsets;
+	}
+#else /* !CONFIG_CHELSIO_T4_DCB */
+	/*
+	 * We default to 1 queue per non-10G port and up to # of cores queues
+	 * per 10G port.
+	 */
+	if (n10g)
+		q10g = (MAX_ETH_QSETS - (adap->params.nports - n10g)) / n10g;
+	if (q10g > netif_get_num_default_rss_queues())
+		q10g = netif_get_num_default_rss_queues();
+
+	for_each_port(adap, i) {
+		struct port_info *pi = adap2pinfo(adap, i);
+
+		pi->first_qset = qidx;
+		pi->nqsets = is_x_10g_port(&pi->link_cfg) ? q10g : 1;
+		qidx += pi->nqsets;
+	}
+#endif /* !CONFIG_CHELSIO_T4_DCB */
+
+	s->ethqsets = qidx;
+	s->max_ethqsets = qidx;   /* MSI-X may lower it later */
+
+	if (is_offload(adap)) {
+		/*
+		 * For offload we use 1 queue/channel if all ports are up to 1G,
+		 * otherwise we divide all available queues amongst the channels
+		 * capped by the number of available cores.
+		 */
+		if (n10g) {
+			i = min_t(int, ARRAY_SIZE(s->ofldrxq),
+				  num_online_cpus());
+			s->ofldqsets = roundup(i, adap->params.nports);
+		} else
+			s->ofldqsets = adap->params.nports;
+		/* For RDMA one Rx queue per channel suffices */
+		s->rdmaqs = adap->params.nports;
+		s->rdmaciqs = adap->params.nports;
+	}
+
+	for (i = 0; i < ARRAY_SIZE(s->ethrxq); i++) {
+		struct sge_eth_rxq *r = &s->ethrxq[i];
+
+		init_rspq(adap, &r->rspq, 5, 10, 1024, 64);
+		r->fl.size = 72;
+	}
+
+	for (i = 0; i < ARRAY_SIZE(s->ethtxq); i++)
+		s->ethtxq[i].q.size = 1024;
+
+	for (i = 0; i < ARRAY_SIZE(s->ctrlq); i++)
+		s->ctrlq[i].q.size = 512;
+
+	for (i = 0; i < ARRAY_SIZE(s->ofldtxq); i++)
+		s->ofldtxq[i].q.size = 1024;
+
+	for (i = 0; i < ARRAY_SIZE(s->ofldrxq); i++) {
+		struct sge_ofld_rxq *r = &s->ofldrxq[i];
+
+		init_rspq(adap, &r->rspq, 5, 1, 1024, 64);
+		r->rspq.uld = CXGB4_ULD_ISCSI;
+		r->fl.size = 72;
+	}
+
+	for (i = 0; i < ARRAY_SIZE(s->rdmarxq); i++) {
+		struct sge_ofld_rxq *r = &s->rdmarxq[i];
+
+		init_rspq(adap, &r->rspq, 5, 1, 511, 64);
+		r->rspq.uld = CXGB4_ULD_RDMA;
+		r->fl.size = 72;
+	}
+
+	ciq_size = 64 + adap->vres.cq.size + adap->tids.nftids;
+	if (ciq_size > SGE_MAX_IQ_SIZE) {
+		CH_WARN(adap, "CIQ size too small for available IQs\n");
+		ciq_size = SGE_MAX_IQ_SIZE;
+	}
+
+	for (i = 0; i < ARRAY_SIZE(s->rdmaciq); i++) {
+		struct sge_ofld_rxq *r = &s->rdmaciq[i];
+
+		init_rspq(adap, &r->rspq, 5, 1, ciq_size, 64);
+		r->rspq.uld = CXGB4_ULD_RDMA;
+	}
+
+	init_rspq(adap, &s->fw_evtq, 0, 1, 1024, 64);
+	init_rspq(adap, &s->intrq, 0, 1, 2 * MAX_INGQ, 64);
+}
+
+/*
+ * Reduce the number of Ethernet queues across all ports to at most n.
+ * n provides at least one queue per port.
+ */
+static void reduce_ethqs(struct adapter *adap, int n)
+{
+	int i;
+	struct port_info *pi;
+
+	while (n < adap->sge.ethqsets)
+		for_each_port(adap, i) {
+			pi = adap2pinfo(adap, i);
+			if (pi->nqsets > 1) {
+				pi->nqsets--;
+				adap->sge.ethqsets--;
+				if (adap->sge.ethqsets <= n)
+					break;
+			}
+		}
+
+	n = 0;
+	for_each_port(adap, i) {
+		pi = adap2pinfo(adap, i);
+		pi->first_qset = n;
+		n += pi->nqsets;
+	}
+}
+
+/* 2 MSI-X vectors needed for the FW queue and non-data interrupts */
+#define EXTRA_VECS 2
+
+static int enable_msix(struct adapter *adap)
+{
+	int ofld_need = 0;
+	int i, want, need;
+	struct sge *s = &adap->sge;
+	unsigned int nchan = adap->params.nports;
+	struct msix_entry entries[MAX_INGQ + 1];
+
+	for (i = 0; i < ARRAY_SIZE(entries); ++i)
+		entries[i].entry = i;
+
+	want = s->max_ethqsets + EXTRA_VECS;
+	if (is_offload(adap)) {
+		want += s->rdmaqs + s->rdmaciqs + s->ofldqsets;
+		/* need nchan for each possible ULD */
+		ofld_need = 3 * nchan;
+	}
+#ifdef CONFIG_CHELSIO_T4_DCB
+	/* For Data Center Bridging we need 8 Ethernet TX Priority Queues for
+	 * each port.
+	 */
+	need = 8 * adap->params.nports + EXTRA_VECS + ofld_need;
+#else
+	need = adap->params.nports + EXTRA_VECS + ofld_need;
+#endif
+	want = pci_enable_msix_range(adap->pdev, entries, need, want);
+	if (want < 0)
+		return want;
+
+	/*
+	 * Distribute available vectors to the various queue groups.
+	 * Every group gets its minimum requirement and NIC gets top
+	 * priority for leftovers.
+	 */
+	i = want - EXTRA_VECS - ofld_need;
+	if (i < s->max_ethqsets) {
+		s->max_ethqsets = i;
+		if (i < s->ethqsets)
+			reduce_ethqs(adap, i);
+	}
+	if (is_offload(adap)) {
+		i = want - EXTRA_VECS - s->max_ethqsets;
+		i -= ofld_need - nchan;
+		s->ofldqsets = (i / nchan) * nchan;  /* round down */
+	}
+	for (i = 0; i < want; ++i)
+		adap->msix_info[i].vec = entries[i].vector;
+
+	return 0;
+}
+
+#undef EXTRA_VECS
+
+static int init_rss(struct adapter *adap)
+{
+	unsigned int i, j;
+
+	for_each_port(adap, i) {
+		struct port_info *pi = adap2pinfo(adap, i);
+
+		pi->rss = kcalloc(pi->rss_size, sizeof(u16), GFP_KERNEL);
+		if (!pi->rss)
+			return -ENOMEM;
+		for (j = 0; j < pi->rss_size; j++)
+			pi->rss[j] = ethtool_rxfh_indir_default(j, pi->nqsets);
+	}
+	return 0;
+}
+
+static void print_port_info(const struct net_device *dev)
+{
+	char buf[80];
+	char *bufp = buf;
+	const char *spd = "";
+	const struct port_info *pi = netdev_priv(dev);
+	const struct adapter *adap = pi->adapter;
+
+	if (adap->params.pci.speed == PCI_EXP_LNKSTA_CLS_2_5GB)
+		spd = " 2.5 GT/s";
+	else if (adap->params.pci.speed == PCI_EXP_LNKSTA_CLS_5_0GB)
+		spd = " 5 GT/s";
+	else if (adap->params.pci.speed == PCI_EXP_LNKSTA_CLS_8_0GB)
+		spd = " 8 GT/s";
+
+	if (pi->link_cfg.supported & FW_PORT_CAP_SPEED_100M)
+		bufp += sprintf(bufp, "100/");
+	if (pi->link_cfg.supported & FW_PORT_CAP_SPEED_1G)
+		bufp += sprintf(bufp, "1000/");
+	if (pi->link_cfg.supported & FW_PORT_CAP_SPEED_10G)
+		bufp += sprintf(bufp, "10G/");
+	if (pi->link_cfg.supported & FW_PORT_CAP_SPEED_40G)
+		bufp += sprintf(bufp, "40G/");
+	if (bufp != buf)
+		--bufp;
+	sprintf(bufp, "BASE-%s", t4_get_port_type_description(pi->port_type));
+
+	netdev_info(dev, "Chelsio %s rev %d %s %sNIC PCIe x%d%s%s\n",
+		    adap->params.vpd.id,
+		    CHELSIO_CHIP_RELEASE(adap->params.chip), buf,
+		    is_offload(adap) ? "R" : "", adap->params.pci.width, spd,
+		    (adap->flags & USING_MSIX) ? " MSI-X" :
+		    (adap->flags & USING_MSI) ? " MSI" : "");
+	netdev_info(dev, "S/N: %s, P/N: %s\n",
+		    adap->params.vpd.sn, adap->params.vpd.pn);
+}
+
+static void enable_pcie_relaxed_ordering(struct pci_dev *dev)
+{
+	pcie_capability_set_word(dev, PCI_EXP_DEVCTL, PCI_EXP_DEVCTL_RELAX_EN);
+}
+
+/*
+ * Free the following resources:
+ * - memory used for tables
+ * - MSI/MSI-X
+ * - net devices
+ * - resources FW is holding for us
+ */
+static void free_some_resources(struct adapter *adapter)
+{
+	unsigned int i;
+
+	t4_free_mem(adapter->l2t);
+	t4_free_mem(adapter->tids.tid_tab);
+	disable_msi(adapter);
+
+	for_each_port(adapter, i)
+		if (adapter->port[i]) {
+			kfree(adap2pinfo(adapter, i)->rss);
+			free_netdev(adapter->port[i]);
+		}
+	if (adapter->flags & FW_OK)
+		t4_fw_bye(adapter, adapter->fn);
+}
+
+#define TSO_FLAGS (NETIF_F_TSO | NETIF_F_TSO6 | NETIF_F_TSO_ECN)
+#define VLAN_FEAT (NETIF_F_SG | NETIF_F_IP_CSUM | TSO_FLAGS | \
+		   NETIF_F_IPV6_CSUM | NETIF_F_HIGHDMA)
+#define SEGMENT_SIZE 128
+
+static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
+{
+	int func, i, err, s_qpp, qpp, num_seg;
+	struct port_info *pi;
+	bool highdma = false;
+	struct adapter *adapter = NULL;
+	void __iomem *regs;
+
+	printk_once(KERN_INFO "%s - version %s\n", DRV_DESC, DRV_VERSION);
+
+	err = pci_request_regions(pdev, KBUILD_MODNAME);
+	if (err) {
+		/* Just info, some other driver may have claimed the device. */
+		dev_info(&pdev->dev, "cannot obtain PCI resources\n");
+		return err;
+	}
+
+	err = pci_enable_device(pdev);
+	if (err) {
+		dev_err(&pdev->dev, "cannot enable PCI device\n");
+		goto out_release_regions;
+	}
+
+	regs = pci_ioremap_bar(pdev, 0);
+	if (!regs) {
+		dev_err(&pdev->dev, "cannot map device registers\n");
+		err = -ENOMEM;
+		goto out_disable_device;
+	}
+
+	err = t4_wait_dev_ready(regs);
+	if (err < 0)
+		goto out_unmap_bar0;
+
+	/* We control everything through one PF */
+	func = SOURCEPF_GET(readl(regs + PL_WHOAMI));
+	if (func != ent->driver_data) {
+		iounmap(regs);
+		pci_disable_device(pdev);
+		pci_save_state(pdev);        /* to restore SR-IOV later */
+		goto sriov;
+	}
+
+	if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(64))) {
+		highdma = true;
+		err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
+		if (err) {
+			dev_err(&pdev->dev, "unable to obtain 64-bit DMA for "
+				"coherent allocations\n");
+			goto out_unmap_bar0;
+		}
+	} else {
+		err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
+		if (err) {
+			dev_err(&pdev->dev, "no usable DMA configuration\n");
+			goto out_unmap_bar0;
+		}
+	}
+
+	pci_enable_pcie_error_reporting(pdev);
+	enable_pcie_relaxed_ordering(pdev);
+	pci_set_master(pdev);
+	pci_save_state(pdev);
+
+	adapter = kzalloc(sizeof(*adapter), GFP_KERNEL);
+	if (!adapter) {
+		err = -ENOMEM;
+		goto out_unmap_bar0;
+	}
+
+	adapter->workq = create_singlethread_workqueue("cxgb4");
+	if (!adapter->workq) {
+		err = -ENOMEM;
+		goto out_free_adapter;
+	}
+
+	/* PCI device has been enabled */
+	adapter->flags |= DEV_ENABLED;
+
+	adapter->regs = regs;
+	adapter->pdev = pdev;
+	adapter->pdev_dev = &pdev->dev;
+	adapter->mbox = func;
+	adapter->fn = func;
+	adapter->msg_enable = dflt_msg_enable;
+	memset(adapter->chan_map, 0xff, sizeof(adapter->chan_map));
+
+	spin_lock_init(&adapter->stats_lock);
+	spin_lock_init(&adapter->tid_release_lock);
+	spin_lock_init(&adapter->win0_lock);
+
+	INIT_WORK(&adapter->tid_release_task, process_tid_release_list);
+	INIT_WORK(&adapter->db_full_task, process_db_full);
+	INIT_WORK(&adapter->db_drop_task, process_db_drop);
+
+	err = t4_prep_adapter(adapter);
+	if (err)
+		goto out_free_adapter;
+
+
+	if (!is_t4(adapter->params.chip)) {
+		s_qpp = QUEUESPERPAGEPF1 * adapter->fn;
+		qpp = 1 << QUEUESPERPAGEPF0_GET(t4_read_reg(adapter,
+		      SGE_EGRESS_QUEUES_PER_PAGE_PF) >> s_qpp);
+		num_seg = PAGE_SIZE / SEGMENT_SIZE;
+
+		/* Each segment size is 128B. Write coalescing is enabled only
+		 * when SGE_EGRESS_QUEUES_PER_PAGE_PF reg value for the
+		 * queue is less no of segments that can be accommodated in
+		 * a page size.
+		 */
+		if (qpp > num_seg) {
+			dev_err(&pdev->dev,
+				"Incorrect number of egress queues per page\n");
+			err = -EINVAL;
+			goto out_free_adapter;
+		}
+		adapter->bar2 = ioremap_wc(pci_resource_start(pdev, 2),
+		pci_resource_len(pdev, 2));
+		if (!adapter->bar2) {
+			dev_err(&pdev->dev, "cannot map device bar2 region\n");
+			err = -ENOMEM;
+			goto out_free_adapter;
+		}
+	}
+
+	setup_memwin(adapter);
+	err = adap_init0(adapter);
+	setup_memwin_rdma(adapter);
+	if (err)
+		goto out_unmap_bar;
+
+	for_each_port(adapter, i) {
+		struct net_device *netdev;
+
+		netdev = alloc_etherdev_mq(sizeof(struct port_info),
+					   MAX_ETH_QSETS);
+		if (!netdev) {
+			err = -ENOMEM;
+			goto out_free_dev;
+		}
+
+		SET_NETDEV_DEV(netdev, &pdev->dev);
+
+		adapter->port[i] = netdev;
+		pi = netdev_priv(netdev);
+		pi->adapter = adapter;
+		pi->xact_addr_filt = -1;
+		pi->port_id = i;
+		netdev->irq = pdev->irq;
+
+		netdev->hw_features = NETIF_F_SG | TSO_FLAGS |
+			NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM |
+			NETIF_F_RXCSUM | NETIF_F_RXHASH |
+			NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX;
+		if (highdma)
+			netdev->hw_features |= NETIF_F_HIGHDMA;
+		netdev->features |= netdev->hw_features;
+		netdev->vlan_features = netdev->features & VLAN_FEAT;
+
+		netdev->priv_flags |= IFF_UNICAST_FLT;
+
+		netdev->netdev_ops = &cxgb4_netdev_ops;
+#ifdef CONFIG_CHELSIO_T4_DCB
+		netdev->dcbnl_ops = &cxgb4_dcb_ops;
+		cxgb4_dcb_state_init(netdev);
+#endif
+		netdev->ethtool_ops = &cxgb_ethtool_ops;
+	}
+
+	pci_set_drvdata(pdev, adapter);
+
+	if (adapter->flags & FW_OK) {
+		err = t4_port_init(adapter, func, func, 0);
+		if (err)
+			goto out_free_dev;
+	}
+
+	/*
+	 * Configure queues and allocate tables now, they can be needed as
+	 * soon as the first register_netdev completes.
+	 */
+	cfg_queues(adapter);
+
+	adapter->l2t = t4_init_l2t();
+	if (!adapter->l2t) {
+		/* We tolerate a lack of L2T, giving up some functionality */
+		dev_warn(&pdev->dev, "could not allocate L2T, continuing\n");
+		adapter->params.offload = 0;
+	}
+
+	if (is_offload(adapter) && tid_init(&adapter->tids) < 0) {
+		dev_warn(&pdev->dev, "could not allocate TID table, "
+			 "continuing\n");
+		adapter->params.offload = 0;
+	}
+
+	/* See what interrupts we'll be using */
+	if (msi > 1 && enable_msix(adapter) == 0)
+		adapter->flags |= USING_MSIX;
+	else if (msi > 0 && pci_enable_msi(pdev) == 0)
+		adapter->flags |= USING_MSI;
+
+	err = init_rss(adapter);
+	if (err)
+		goto out_free_dev;
+
+	/*
+	 * The card is now ready to go.  If any errors occur during device
+	 * registration we do not fail the whole card but rather proceed only
+	 * with the ports we manage to register successfully.  However we must
+	 * register at least one net device.
+	 */
+	for_each_port(adapter, i) {
+		pi = adap2pinfo(adapter, i);
+		netif_set_real_num_tx_queues(adapter->port[i], pi->nqsets);
+		netif_set_real_num_rx_queues(adapter->port[i], pi->nqsets);
+
+		err = register_netdev(adapter->port[i]);
+		if (err)
+			break;
+		adapter->chan_map[pi->tx_chan] = i;
+		print_port_info(adapter->port[i]);
+	}
+	if (i == 0) {
+		dev_err(&pdev->dev, "could not register any net devices\n");
+		goto out_free_dev;
+	}
+	if (err) {
+		dev_warn(&pdev->dev, "only %d net devices registered\n", i);
+		err = 0;
+	}
+
+	if (cxgb4_debugfs_root) {
+		adapter->debugfs_root = debugfs_create_dir(pci_name(pdev),
+							   cxgb4_debugfs_root);
+		setup_debugfs(adapter);
+	}
+
+	/* PCIe EEH recovery on powerpc platforms needs fundamental reset */
+	pdev->needs_freset = 1;
+
+	if (is_offload(adapter))
+		attach_ulds(adapter);
+
+sriov:
+#ifdef CONFIG_PCI_IOV
+	if (func < ARRAY_SIZE(num_vf) && num_vf[func] > 0)
+		if (pci_enable_sriov(pdev, num_vf[func]) == 0)
+			dev_info(&pdev->dev,
+				 "instantiated %u virtual functions\n",
+				 num_vf[func]);
+#endif
+	return 0;
+
+ out_free_dev:
+	free_some_resources(adapter);
+ out_unmap_bar:
+	if (!is_t4(adapter->params.chip))
+		iounmap(adapter->bar2);
+ out_free_adapter:
+	if (adapter->workq)
+		destroy_workqueue(adapter->workq);
+
+	kfree(adapter);
+ out_unmap_bar0:
+	iounmap(regs);
+ out_disable_device:
+	pci_disable_pcie_error_reporting(pdev);
+	pci_disable_device(pdev);
+ out_release_regions:
+	pci_release_regions(pdev);
+	return err;
+}
+
+static void remove_one(struct pci_dev *pdev)
+{
+	struct adapter *adapter = pci_get_drvdata(pdev);
+
+#ifdef CONFIG_PCI_IOV
+	pci_disable_sriov(pdev);
+
+#endif
+
+	if (adapter) {
+		int i;
+
+		/* Tear down per-adapter Work Queue first since it can contain
+		 * references to our adapter data structure.
+		 */
+		destroy_workqueue(adapter->workq);
+
+		if (is_offload(adapter))
+			detach_ulds(adapter);
+
+		for_each_port(adapter, i)
+			if (adapter->port[i]->reg_state == NETREG_REGISTERED)
+				unregister_netdev(adapter->port[i]);
+
+		debugfs_remove_recursive(adapter->debugfs_root);
+
+		/* If we allocated filters, free up state associated with any
+		 * valid filters ...
+		 */
+		if (adapter->tids.ftid_tab) {
+			struct filter_entry *f = &adapter->tids.ftid_tab[0];
+			for (i = 0; i < (adapter->tids.nftids +
+					adapter->tids.nsftids); i++, f++)
+				if (f->valid)
+					clear_filter(adapter, f);
+		}
+
+		if (adapter->flags & FULL_INIT_DONE)
+			cxgb_down(adapter);
+
+		free_some_resources(adapter);
+		iounmap(adapter->regs);
+		if (!is_t4(adapter->params.chip))
+			iounmap(adapter->bar2);
+		pci_disable_pcie_error_reporting(pdev);
+		if ((adapter->flags & DEV_ENABLED)) {
+			pci_disable_device(pdev);
+			adapter->flags &= ~DEV_ENABLED;
+		}
+		pci_release_regions(pdev);
+		synchronize_rcu();
+		kfree(adapter);
+	} else
+		pci_release_regions(pdev);
+}
+
+static struct pci_driver cxgb4_driver = {
+	.name     = KBUILD_MODNAME,
+	.id_table = cxgb4_pci_tbl,
+	.probe    = init_one,
+	.remove   = remove_one,
+	.shutdown = remove_one,
+	.err_handler = &cxgb4_eeh,
+};
+
+static int __init cxgb4_init_module(void)
+{
+	int ret;
+
+	/* Debugfs support is optional, just warn if this fails */
+	cxgb4_debugfs_root = debugfs_create_dir(KBUILD_MODNAME, NULL);
+	if (!cxgb4_debugfs_root)
+		pr_warn("could not create debugfs entry, continuing\n");
+
+	ret = pci_register_driver(&cxgb4_driver);
+	if (ret < 0)
+		debugfs_remove(cxgb4_debugfs_root);
+
+#if IS_ENABLED(CONFIG_IPV6)
+	register_inet6addr_notifier(&cxgb4_inet6addr_notifier);
+#endif
+
+	return ret;
+}
+
+static void __exit cxgb4_cleanup_module(void)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+	unregister_inet6addr_notifier(&cxgb4_inet6addr_notifier);
+#endif
+	pci_unregister_driver(&cxgb4_driver);
+	debugfs_remove(cxgb4_debugfs_root);  /* NULL ok */
+}
+
+module_init(cxgb4_init_module);
+module_exit(cxgb4_cleanup_module);
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h
new file mode 100644
index 0000000..1366ba6
--- /dev/null
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h
@@ -0,0 +1,307 @@
+/*
+ * This file is part of the Chelsio T4 Ethernet driver for Linux.
+ *
+ * Copyright (c) 2003-2014 Chelsio Communications, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __CXGB4_OFLD_H
+#define __CXGB4_OFLD_H
+
+#include <linux/cache.h>
+#include <linux/spinlock.h>
+#include <linux/skbuff.h>
+#include <linux/inetdevice.h>
+#include <linux/atomic.h>
+
+/* CPL message priority levels */
+enum {
+	CPL_PRIORITY_DATA     = 0,  /* data messages */
+	CPL_PRIORITY_SETUP    = 1,  /* connection setup messages */
+	CPL_PRIORITY_TEARDOWN = 0,  /* connection teardown messages */
+	CPL_PRIORITY_LISTEN   = 1,  /* listen start/stop messages */
+	CPL_PRIORITY_ACK      = 1,  /* RX ACK messages */
+	CPL_PRIORITY_CONTROL  = 1   /* control messages */
+};
+
+#define INIT_TP_WR(w, tid) do { \
+	(w)->wr.wr_hi = htonl(FW_WR_OP(FW_TP_WR) | \
+			      FW_WR_IMMDLEN(sizeof(*w) - sizeof(w->wr))); \
+	(w)->wr.wr_mid = htonl(FW_WR_LEN16(DIV_ROUND_UP(sizeof(*w), 16)) | \
+			       FW_WR_FLOWID(tid)); \
+	(w)->wr.wr_lo = cpu_to_be64(0); \
+} while (0)
+
+#define INIT_TP_WR_CPL(w, cpl, tid) do { \
+	INIT_TP_WR(w, tid); \
+	OPCODE_TID(w) = htonl(MK_OPCODE_TID(cpl, tid)); \
+} while (0)
+
+#define INIT_ULPTX_WR(w, wrlen, atomic, tid) do { \
+	(w)->wr.wr_hi = htonl(FW_WR_OP(FW_ULPTX_WR) | FW_WR_ATOMIC(atomic)); \
+	(w)->wr.wr_mid = htonl(FW_WR_LEN16(DIV_ROUND_UP(wrlen, 16)) | \
+			       FW_WR_FLOWID(tid)); \
+	(w)->wr.wr_lo = cpu_to_be64(0); \
+} while (0)
+
+/* Special asynchronous notification message */
+#define CXGB4_MSG_AN ((void *)1)
+
+struct serv_entry {
+	void *data;
+};
+
+union aopen_entry {
+	void *data;
+	union aopen_entry *next;
+};
+
+/*
+ * Holds the size, base address, free list start, etc of the TID, server TID,
+ * and active-open TID tables.  The tables themselves are allocated dynamically.
+ */
+struct tid_info {
+	void **tid_tab;
+	unsigned int ntids;
+
+	struct serv_entry *stid_tab;
+	unsigned long *stid_bmap;
+	unsigned int nstids;
+	unsigned int stid_base;
+
+	union aopen_entry *atid_tab;
+	unsigned int natids;
+	unsigned int atid_base;
+
+	struct filter_entry *ftid_tab;
+	unsigned int nftids;
+	unsigned int ftid_base;
+	unsigned int aftid_base;
+	unsigned int aftid_end;
+	/* Server filter region */
+	unsigned int sftid_base;
+	unsigned int nsftids;
+
+	spinlock_t atid_lock ____cacheline_aligned_in_smp;
+	union aopen_entry *afree;
+	unsigned int atids_in_use;
+
+	spinlock_t stid_lock;
+	unsigned int stids_in_use;
+
+	atomic_t tids_in_use;
+};
+
+static inline void *lookup_tid(const struct tid_info *t, unsigned int tid)
+{
+	return tid < t->ntids ? t->tid_tab[tid] : NULL;
+}
+
+static inline void *lookup_atid(const struct tid_info *t, unsigned int atid)
+{
+	return atid < t->natids ? t->atid_tab[atid].data : NULL;
+}
+
+static inline void *lookup_stid(const struct tid_info *t, unsigned int stid)
+{
+	/* Is it a server filter TID? */
+	if (t->nsftids && (stid >= t->sftid_base)) {
+		stid -= t->sftid_base;
+		stid += t->nstids;
+	} else {
+		stid -= t->stid_base;
+	}
+
+	return stid < (t->nstids + t->nsftids) ? t->stid_tab[stid].data : NULL;
+}
+
+static inline void cxgb4_insert_tid(struct tid_info *t, void *data,
+				    unsigned int tid)
+{
+	t->tid_tab[tid] = data;
+	atomic_inc(&t->tids_in_use);
+}
+
+int cxgb4_alloc_atid(struct tid_info *t, void *data);
+int cxgb4_alloc_stid(struct tid_info *t, int family, void *data);
+int cxgb4_alloc_sftid(struct tid_info *t, int family, void *data);
+void cxgb4_free_atid(struct tid_info *t, unsigned int atid);
+void cxgb4_free_stid(struct tid_info *t, unsigned int stid, int family);
+void cxgb4_remove_tid(struct tid_info *t, unsigned int qid, unsigned int tid);
+
+struct in6_addr;
+
+int cxgb4_create_server(const struct net_device *dev, unsigned int stid,
+			__be32 sip, __be16 sport, __be16 vlan,
+			unsigned int queue);
+int cxgb4_create_server6(const struct net_device *dev, unsigned int stid,
+			 const struct in6_addr *sip, __be16 sport,
+			 unsigned int queue);
+int cxgb4_remove_server(const struct net_device *dev, unsigned int stid,
+			unsigned int queue, bool ipv6);
+int cxgb4_create_server_filter(const struct net_device *dev, unsigned int stid,
+			       __be32 sip, __be16 sport, __be16 vlan,
+			       unsigned int queue,
+			       unsigned char port, unsigned char mask);
+int cxgb4_remove_server_filter(const struct net_device *dev, unsigned int stid,
+			       unsigned int queue, bool ipv6);
+int cxgb4_clip_get(const struct net_device *dev, const struct in6_addr *lip);
+int cxgb4_clip_release(const struct net_device *dev,
+		       const struct in6_addr *lip);
+
+static inline void set_wr_txq(struct sk_buff *skb, int prio, int queue)
+{
+	skb_set_queue_mapping(skb, (queue << 1) | prio);
+}
+
+enum cxgb4_uld {
+	CXGB4_ULD_RDMA,
+	CXGB4_ULD_ISCSI,
+	CXGB4_ULD_MAX
+};
+
+enum cxgb4_state {
+	CXGB4_STATE_UP,
+	CXGB4_STATE_START_RECOVERY,
+	CXGB4_STATE_DOWN,
+	CXGB4_STATE_DETACH
+};
+
+enum cxgb4_control {
+	CXGB4_CONTROL_DB_FULL,
+	CXGB4_CONTROL_DB_EMPTY,
+	CXGB4_CONTROL_DB_DROP,
+};
+
+struct pci_dev;
+struct l2t_data;
+struct net_device;
+struct pkt_gl;
+struct tp_tcp_stats;
+
+struct cxgb4_range {
+	unsigned int start;
+	unsigned int size;
+};
+
+struct cxgb4_virt_res {                      /* virtualized HW resources */
+	struct cxgb4_range ddp;
+	struct cxgb4_range iscsi;
+	struct cxgb4_range stag;
+	struct cxgb4_range rq;
+	struct cxgb4_range pbl;
+	struct cxgb4_range qp;
+	struct cxgb4_range cq;
+	struct cxgb4_range ocq;
+};
+
+#define OCQ_WIN_OFFSET(pdev, vres) \
+	(pci_resource_len((pdev), 2) - roundup_pow_of_two((vres)->ocq.size))
+
+/*
+ * Block of information the LLD provides to ULDs attaching to a device.
+ */
+struct cxgb4_lld_info {
+	struct pci_dev *pdev;                /* associated PCI device */
+	struct l2t_data *l2t;                /* L2 table */
+	struct tid_info *tids;               /* TID table */
+	struct net_device **ports;           /* device ports */
+	const struct cxgb4_virt_res *vr;     /* assorted HW resources */
+	const unsigned short *mtus;          /* MTU table */
+	const unsigned short *rxq_ids;       /* the ULD's Rx queue ids */
+	const unsigned short *ciq_ids;       /* the ULD's concentrator IQ ids */
+	unsigned short nrxq;                 /* # of Rx queues */
+	unsigned short ntxq;                 /* # of Tx queues */
+	unsigned short nciq;		     /* # of concentrator IQ */
+	unsigned char nchan:4;               /* # of channels */
+	unsigned char nports:4;              /* # of ports */
+	unsigned char wr_cred;               /* WR 16-byte credits */
+	unsigned char adapter_type;          /* type of adapter */
+	unsigned char fw_api_ver;            /* FW API version */
+	unsigned int fw_vers;                /* FW version */
+	unsigned int iscsi_iolen;            /* iSCSI max I/O length */
+	unsigned int cclk_ps;                /* Core clock period in psec */
+	unsigned short udb_density;          /* # of user DB/page */
+	unsigned short ucq_density;          /* # of user CQs/page */
+	unsigned short filt_mode;            /* filter optional components */
+	unsigned short tx_modq[NCHAN];       /* maps each tx channel to a */
+					     /* scheduler queue */
+	void __iomem *gts_reg;               /* address of GTS register */
+	void __iomem *db_reg;                /* address of kernel doorbell */
+	int dbfifo_int_thresh;		     /* doorbell fifo int threshold */
+	unsigned int sge_ingpadboundary;     /* SGE ingress padding boundary */
+	unsigned int sge_egrstatuspagesize;  /* SGE egress status page size */
+	unsigned int sge_pktshift;           /* Padding between CPL and */
+					     /*	packet data */
+	unsigned int pf;		     /* Physical Function we're using */
+	bool enable_fw_ofld_conn;            /* Enable connection through fw */
+					     /* WR */
+	unsigned int max_ordird_qp;          /* Max ORD/IRD depth per RDMA QP */
+	unsigned int max_ird_adapter;        /* Max IRD memory per adapter */
+	bool ulptx_memwrite_dsgl;            /* use of T5 DSGL allowed */
+};
+
+struct cxgb4_uld_info {
+	const char *name;
+	void *(*add)(const struct cxgb4_lld_info *p);
+	int (*rx_handler)(void *handle, const __be64 *rsp,
+			  const struct pkt_gl *gl);
+	int (*state_change)(void *handle, enum cxgb4_state new_state);
+	int (*control)(void *handle, enum cxgb4_control control, ...);
+};
+
+int cxgb4_register_uld(enum cxgb4_uld type, const struct cxgb4_uld_info *p);
+int cxgb4_unregister_uld(enum cxgb4_uld type);
+int cxgb4_ofld_send(struct net_device *dev, struct sk_buff *skb);
+unsigned int cxgb4_dbfifo_count(const struct net_device *dev, int lpfifo);
+unsigned int cxgb4_port_chan(const struct net_device *dev);
+unsigned int cxgb4_port_viid(const struct net_device *dev);
+unsigned int cxgb4_port_idx(const struct net_device *dev);
+unsigned int cxgb4_best_mtu(const unsigned short *mtus, unsigned short mtu,
+			    unsigned int *idx);
+unsigned int cxgb4_best_aligned_mtu(const unsigned short *mtus,
+				    unsigned short header_size,
+				    unsigned short data_size_max,
+				    unsigned short data_size_align,
+				    unsigned int *mtu_idxp);
+void cxgb4_get_tcp_stats(struct pci_dev *pdev, struct tp_tcp_stats *v4,
+			 struct tp_tcp_stats *v6);
+void cxgb4_iscsi_init(struct net_device *dev, unsigned int tag_mask,
+		      const unsigned int *pgsz_order);
+struct sk_buff *cxgb4_pktgl_to_skb(const struct pkt_gl *gl,
+				   unsigned int skb_len, unsigned int pull_len);
+int cxgb4_sync_txq_pidx(struct net_device *dev, u16 qid, u16 pidx, u16 size);
+int cxgb4_flush_eq_cache(struct net_device *dev);
+void cxgb4_disable_db_coalescing(struct net_device *dev);
+void cxgb4_enable_db_coalescing(struct net_device *dev);
+int cxgb4_read_tpte(struct net_device *dev, u32 stag, __be32 *tpte);
+u64 cxgb4_read_sge_timestamp(struct net_device *dev);
+
+#endif  /* !__CXGB4_OFLD_H */
diff --git a/drivers/net/ethernet/chelsio/cxgb4/l2t.c b/drivers/net/ethernet/chelsio/cxgb4/l2t.c
new file mode 100644
index 0000000..9604139
--- /dev/null
+++ b/drivers/net/ethernet/chelsio/cxgb4/l2t.c
@@ -0,0 +1,665 @@
+/*
+ * This file is part of the Chelsio T4 Ethernet driver for Linux.
+ *
+ * Copyright (c) 2003-2014 Chelsio Communications, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/if.h>
+#include <linux/if_vlan.h>
+#include <linux/jhash.h>
+#include <linux/module.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <net/neighbour.h>
+#include "cxgb4.h"
+#include "l2t.h"
+#include "t4_msg.h"
+#include "t4fw_api.h"
+#include "t4_regs.h"
+
+#define VLAN_NONE 0xfff
+
+/* identifies sync vs async L2T_WRITE_REQs */
+#define F_SYNC_WR    (1 << 12)
+
+enum {
+	L2T_STATE_VALID,      /* entry is up to date */
+	L2T_STATE_STALE,      /* entry may be used but needs revalidation */
+	L2T_STATE_RESOLVING,  /* entry needs address resolution */
+	L2T_STATE_SYNC_WRITE, /* synchronous write of entry underway */
+
+	/* when state is one of the below the entry is not hashed */
+	L2T_STATE_SWITCHING,  /* entry is being used by a switching filter */
+	L2T_STATE_UNUSED      /* entry not in use */
+};
+
+struct l2t_data {
+	rwlock_t lock;
+	atomic_t nfree;             /* number of free entries */
+	struct l2t_entry *rover;    /* starting point for next allocation */
+	struct l2t_entry l2tab[L2T_SIZE];
+};
+
+static inline unsigned int vlan_prio(const struct l2t_entry *e)
+{
+	return e->vlan >> 13;
+}
+
+static inline void l2t_hold(struct l2t_data *d, struct l2t_entry *e)
+{
+	if (atomic_add_return(1, &e->refcnt) == 1)  /* 0 -> 1 transition */
+		atomic_dec(&d->nfree);
+}
+
+/*
+ * To avoid having to check address families we do not allow v4 and v6
+ * neighbors to be on the same hash chain.  We keep v4 entries in the first
+ * half of available hash buckets and v6 in the second.
+ */
+enum {
+	L2T_SZ_HALF = L2T_SIZE / 2,
+	L2T_HASH_MASK = L2T_SZ_HALF - 1
+};
+
+static inline unsigned int arp_hash(const u32 *key, int ifindex)
+{
+	return jhash_2words(*key, ifindex, 0) & L2T_HASH_MASK;
+}
+
+static inline unsigned int ipv6_hash(const u32 *key, int ifindex)
+{
+	u32 xor = key[0] ^ key[1] ^ key[2] ^ key[3];
+
+	return L2T_SZ_HALF + (jhash_2words(xor, ifindex, 0) & L2T_HASH_MASK);
+}
+
+static unsigned int addr_hash(const u32 *addr, int addr_len, int ifindex)
+{
+	return addr_len == 4 ? arp_hash(addr, ifindex) :
+			       ipv6_hash(addr, ifindex);
+}
+
+/*
+ * Checks if an L2T entry is for the given IP/IPv6 address.  It does not check
+ * whether the L2T entry and the address are of the same address family.
+ * Callers ensure an address is only checked against L2T entries of the same
+ * family, something made trivial by the separation of IP and IPv6 hash chains
+ * mentioned above.  Returns 0 if there's a match,
+ */
+static int addreq(const struct l2t_entry *e, const u32 *addr)
+{
+	if (e->v6)
+		return (e->addr[0] ^ addr[0]) | (e->addr[1] ^ addr[1]) |
+		       (e->addr[2] ^ addr[2]) | (e->addr[3] ^ addr[3]);
+	return e->addr[0] ^ addr[0];
+}
+
+static void neigh_replace(struct l2t_entry *e, struct neighbour *n)
+{
+	neigh_hold(n);
+	if (e->neigh)
+		neigh_release(e->neigh);
+	e->neigh = n;
+}
+
+/*
+ * Write an L2T entry.  Must be called with the entry locked.
+ * The write may be synchronous or asynchronous.
+ */
+static int write_l2e(struct adapter *adap, struct l2t_entry *e, int sync)
+{
+	struct sk_buff *skb;
+	struct cpl_l2t_write_req *req;
+
+	skb = alloc_skb(sizeof(*req), GFP_ATOMIC);
+	if (!skb)
+		return -ENOMEM;
+
+	req = (struct cpl_l2t_write_req *)__skb_put(skb, sizeof(*req));
+	INIT_TP_WR(req, 0);
+
+	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_L2T_WRITE_REQ,
+					e->idx | (sync ? F_SYNC_WR : 0) |
+					TID_QID(adap->sge.fw_evtq.abs_id)));
+	req->params = htons(L2T_W_PORT(e->lport) | L2T_W_NOREPLY(!sync));
+	req->l2t_idx = htons(e->idx);
+	req->vlan = htons(e->vlan);
+	if (e->neigh && !(e->neigh->dev->flags & IFF_LOOPBACK))
+		memcpy(e->dmac, e->neigh->ha, sizeof(e->dmac));
+	memcpy(req->dst_mac, e->dmac, sizeof(req->dst_mac));
+
+	set_wr_txq(skb, CPL_PRIORITY_CONTROL, 0);
+	t4_ofld_send(adap, skb);
+
+	if (sync && e->state != L2T_STATE_SWITCHING)
+		e->state = L2T_STATE_SYNC_WRITE;
+	return 0;
+}
+
+/*
+ * Send packets waiting in an L2T entry's ARP queue.  Must be called with the
+ * entry locked.
+ */
+static void send_pending(struct adapter *adap, struct l2t_entry *e)
+{
+	while (e->arpq_head) {
+		struct sk_buff *skb = e->arpq_head;
+
+		e->arpq_head = skb->next;
+		skb->next = NULL;
+		t4_ofld_send(adap, skb);
+	}
+	e->arpq_tail = NULL;
+}
+
+/*
+ * Process a CPL_L2T_WRITE_RPL.  Wake up the ARP queue if it completes a
+ * synchronous L2T_WRITE.  Note that the TID in the reply is really the L2T
+ * index it refers to.
+ */
+void do_l2t_write_rpl(struct adapter *adap, const struct cpl_l2t_write_rpl *rpl)
+{
+	unsigned int tid = GET_TID(rpl);
+	unsigned int idx = tid & (L2T_SIZE - 1);
+
+	if (unlikely(rpl->status != CPL_ERR_NONE)) {
+		dev_err(adap->pdev_dev,
+			"Unexpected L2T_WRITE_RPL status %u for entry %u\n",
+			rpl->status, idx);
+		return;
+	}
+
+	if (tid & F_SYNC_WR) {
+		struct l2t_entry *e = &adap->l2t->l2tab[idx];
+
+		spin_lock(&e->lock);
+		if (e->state != L2T_STATE_SWITCHING) {
+			send_pending(adap, e);
+			e->state = (e->neigh->nud_state & NUD_STALE) ?
+					L2T_STATE_STALE : L2T_STATE_VALID;
+		}
+		spin_unlock(&e->lock);
+	}
+}
+
+/*
+ * Add a packet to an L2T entry's queue of packets awaiting resolution.
+ * Must be called with the entry's lock held.
+ */
+static inline void arpq_enqueue(struct l2t_entry *e, struct sk_buff *skb)
+{
+	skb->next = NULL;
+	if (e->arpq_head)
+		e->arpq_tail->next = skb;
+	else
+		e->arpq_head = skb;
+	e->arpq_tail = skb;
+}
+
+int cxgb4_l2t_send(struct net_device *dev, struct sk_buff *skb,
+		   struct l2t_entry *e)
+{
+	struct adapter *adap = netdev2adap(dev);
+
+again:
+	switch (e->state) {
+	case L2T_STATE_STALE:     /* entry is stale, kick off revalidation */
+		neigh_event_send(e->neigh, NULL);
+		spin_lock_bh(&e->lock);
+		if (e->state == L2T_STATE_STALE)
+			e->state = L2T_STATE_VALID;
+		spin_unlock_bh(&e->lock);
+	case L2T_STATE_VALID:     /* fast-path, send the packet on */
+		return t4_ofld_send(adap, skb);
+	case L2T_STATE_RESOLVING:
+	case L2T_STATE_SYNC_WRITE:
+		spin_lock_bh(&e->lock);
+		if (e->state != L2T_STATE_SYNC_WRITE &&
+		    e->state != L2T_STATE_RESOLVING) {
+			spin_unlock_bh(&e->lock);
+			goto again;
+		}
+		arpq_enqueue(e, skb);
+		spin_unlock_bh(&e->lock);
+
+		if (e->state == L2T_STATE_RESOLVING &&
+		    !neigh_event_send(e->neigh, NULL)) {
+			spin_lock_bh(&e->lock);
+			if (e->state == L2T_STATE_RESOLVING && e->arpq_head)
+				write_l2e(adap, e, 1);
+			spin_unlock_bh(&e->lock);
+		}
+	}
+	return 0;
+}
+EXPORT_SYMBOL(cxgb4_l2t_send);
+
+/*
+ * Allocate a free L2T entry.  Must be called with l2t_data.lock held.
+ */
+static struct l2t_entry *alloc_l2e(struct l2t_data *d)
+{
+	struct l2t_entry *end, *e, **p;
+
+	if (!atomic_read(&d->nfree))
+		return NULL;
+
+	/* there's definitely a free entry */
+	for (e = d->rover, end = &d->l2tab[L2T_SIZE]; e != end; ++e)
+		if (atomic_read(&e->refcnt) == 0)
+			goto found;
+
+	for (e = d->l2tab; atomic_read(&e->refcnt); ++e)
+		;
+found:
+	d->rover = e + 1;
+	atomic_dec(&d->nfree);
+
+	/*
+	 * The entry we found may be an inactive entry that is
+	 * presently in the hash table.  We need to remove it.
+	 */
+	if (e->state < L2T_STATE_SWITCHING)
+		for (p = &d->l2tab[e->hash].first; *p; p = &(*p)->next)
+			if (*p == e) {
+				*p = e->next;
+				e->next = NULL;
+				break;
+			}
+
+	e->state = L2T_STATE_UNUSED;
+	return e;
+}
+
+/*
+ * Called when an L2T entry has no more users.
+ */
+static void t4_l2e_free(struct l2t_entry *e)
+{
+	struct l2t_data *d;
+
+	spin_lock_bh(&e->lock);
+	if (atomic_read(&e->refcnt) == 0) {  /* hasn't been recycled */
+		if (e->neigh) {
+			neigh_release(e->neigh);
+			e->neigh = NULL;
+		}
+		while (e->arpq_head) {
+			struct sk_buff *skb = e->arpq_head;
+
+			e->arpq_head = skb->next;
+			kfree_skb(skb);
+		}
+		e->arpq_tail = NULL;
+	}
+	spin_unlock_bh(&e->lock);
+
+	d = container_of(e, struct l2t_data, l2tab[e->idx]);
+	atomic_inc(&d->nfree);
+}
+
+void cxgb4_l2t_release(struct l2t_entry *e)
+{
+	if (atomic_dec_and_test(&e->refcnt))
+		t4_l2e_free(e);
+}
+EXPORT_SYMBOL(cxgb4_l2t_release);
+
+/*
+ * Update an L2T entry that was previously used for the same next hop as neigh.
+ * Must be called with softirqs disabled.
+ */
+static void reuse_entry(struct l2t_entry *e, struct neighbour *neigh)
+{
+	unsigned int nud_state;
+
+	spin_lock(&e->lock);                /* avoid race with t4_l2t_free */
+	if (neigh != e->neigh)
+		neigh_replace(e, neigh);
+	nud_state = neigh->nud_state;
+	if (memcmp(e->dmac, neigh->ha, sizeof(e->dmac)) ||
+	    !(nud_state & NUD_VALID))
+		e->state = L2T_STATE_RESOLVING;
+	else if (nud_state & NUD_CONNECTED)
+		e->state = L2T_STATE_VALID;
+	else
+		e->state = L2T_STATE_STALE;
+	spin_unlock(&e->lock);
+}
+
+struct l2t_entry *cxgb4_l2t_get(struct l2t_data *d, struct neighbour *neigh,
+				const struct net_device *physdev,
+				unsigned int priority)
+{
+	u8 lport;
+	u16 vlan;
+	struct l2t_entry *e;
+	int addr_len = neigh->tbl->key_len;
+	u32 *addr = (u32 *)neigh->primary_key;
+	int ifidx = neigh->dev->ifindex;
+	int hash = addr_hash(addr, addr_len, ifidx);
+
+	if (neigh->dev->flags & IFF_LOOPBACK)
+		lport = netdev2pinfo(physdev)->tx_chan + 4;
+	else
+		lport = netdev2pinfo(physdev)->lport;
+
+	if (neigh->dev->priv_flags & IFF_802_1Q_VLAN)
+		vlan = vlan_dev_vlan_id(neigh->dev);
+	else
+		vlan = VLAN_NONE;
+
+	write_lock_bh(&d->lock);
+	for (e = d->l2tab[hash].first; e; e = e->next)
+		if (!addreq(e, addr) && e->ifindex == ifidx &&
+		    e->vlan == vlan && e->lport == lport) {
+			l2t_hold(d, e);
+			if (atomic_read(&e->refcnt) == 1)
+				reuse_entry(e, neigh);
+			goto done;
+		}
+
+	/* Need to allocate a new entry */
+	e = alloc_l2e(d);
+	if (e) {
+		spin_lock(&e->lock);          /* avoid race with t4_l2t_free */
+		e->state = L2T_STATE_RESOLVING;
+		if (neigh->dev->flags & IFF_LOOPBACK)
+			memcpy(e->dmac, physdev->dev_addr, sizeof(e->dmac));
+		memcpy(e->addr, addr, addr_len);
+		e->ifindex = ifidx;
+		e->hash = hash;
+		e->lport = lport;
+		e->v6 = addr_len == 16;
+		atomic_set(&e->refcnt, 1);
+		neigh_replace(e, neigh);
+		e->vlan = vlan;
+		e->next = d->l2tab[hash].first;
+		d->l2tab[hash].first = e;
+		spin_unlock(&e->lock);
+	}
+done:
+	write_unlock_bh(&d->lock);
+	return e;
+}
+EXPORT_SYMBOL(cxgb4_l2t_get);
+
+u64 cxgb4_select_ntuple(struct net_device *dev,
+			const struct l2t_entry *l2t)
+{
+	struct adapter *adap = netdev2adap(dev);
+	struct tp_params *tp = &adap->params.tp;
+	u64 ntuple = 0;
+
+	/* Initialize each of the fields which we care about which are present
+	 * in the Compressed Filter Tuple.
+	 */
+	if (tp->vlan_shift >= 0 && l2t->vlan != VLAN_NONE)
+		ntuple |= (u64)(F_FT_VLAN_VLD | l2t->vlan) << tp->vlan_shift;
+
+	if (tp->port_shift >= 0)
+		ntuple |= (u64)l2t->lport << tp->port_shift;
+
+	if (tp->protocol_shift >= 0)
+		ntuple |= (u64)IPPROTO_TCP << tp->protocol_shift;
+
+	if (tp->vnic_shift >= 0) {
+		u32 viid = cxgb4_port_viid(dev);
+		u32 vf = FW_VIID_VIN_GET(viid);
+		u32 pf = FW_VIID_PFN_GET(viid);
+		u32 vld = FW_VIID_VIVLD_GET(viid);
+
+		ntuple |= (u64)(V_FT_VNID_ID_VF(vf) |
+				V_FT_VNID_ID_PF(pf) |
+				V_FT_VNID_ID_VLD(vld)) << tp->vnic_shift;
+	}
+
+	return ntuple;
+}
+EXPORT_SYMBOL(cxgb4_select_ntuple);
+
+/*
+ * Called when address resolution fails for an L2T entry to handle packets
+ * on the arpq head.  If a packet specifies a failure handler it is invoked,
+ * otherwise the packet is sent to the device.
+ */
+static void handle_failed_resolution(struct adapter *adap, struct sk_buff *arpq)
+{
+	while (arpq) {
+		struct sk_buff *skb = arpq;
+		const struct l2t_skb_cb *cb = L2T_SKB_CB(skb);
+
+		arpq = skb->next;
+		skb->next = NULL;
+		if (cb->arp_err_handler)
+			cb->arp_err_handler(cb->handle, skb);
+		else
+			t4_ofld_send(adap, skb);
+	}
+}
+
+/*
+ * Called when the host's neighbor layer makes a change to some entry that is
+ * loaded into the HW L2 table.
+ */
+void t4_l2t_update(struct adapter *adap, struct neighbour *neigh)
+{
+	struct l2t_entry *e;
+	struct sk_buff *arpq = NULL;
+	struct l2t_data *d = adap->l2t;
+	int addr_len = neigh->tbl->key_len;
+	u32 *addr = (u32 *) neigh->primary_key;
+	int ifidx = neigh->dev->ifindex;
+	int hash = addr_hash(addr, addr_len, ifidx);
+
+	read_lock_bh(&d->lock);
+	for (e = d->l2tab[hash].first; e; e = e->next)
+		if (!addreq(e, addr) && e->ifindex == ifidx) {
+			spin_lock(&e->lock);
+			if (atomic_read(&e->refcnt))
+				goto found;
+			spin_unlock(&e->lock);
+			break;
+		}
+	read_unlock_bh(&d->lock);
+	return;
+
+ found:
+	read_unlock(&d->lock);
+
+	if (neigh != e->neigh)
+		neigh_replace(e, neigh);
+
+	if (e->state == L2T_STATE_RESOLVING) {
+		if (neigh->nud_state & NUD_FAILED) {
+			arpq = e->arpq_head;
+			e->arpq_head = e->arpq_tail = NULL;
+		} else if ((neigh->nud_state & (NUD_CONNECTED | NUD_STALE)) &&
+			   e->arpq_head) {
+			write_l2e(adap, e, 1);
+		}
+	} else {
+		e->state = neigh->nud_state & NUD_CONNECTED ?
+			L2T_STATE_VALID : L2T_STATE_STALE;
+		if (memcmp(e->dmac, neigh->ha, sizeof(e->dmac)))
+			write_l2e(adap, e, 0);
+	}
+
+	spin_unlock_bh(&e->lock);
+
+	if (arpq)
+		handle_failed_resolution(adap, arpq);
+}
+
+/* Allocate an L2T entry for use by a switching rule.  Such need to be
+ * explicitly freed and while busy they are not on any hash chain, so normal
+ * address resolution updates do not see them.
+ */
+struct l2t_entry *t4_l2t_alloc_switching(struct l2t_data *d)
+{
+	struct l2t_entry *e;
+
+	write_lock_bh(&d->lock);
+	e = alloc_l2e(d);
+	if (e) {
+		spin_lock(&e->lock);          /* avoid race with t4_l2t_free */
+		e->state = L2T_STATE_SWITCHING;
+		atomic_set(&e->refcnt, 1);
+		spin_unlock(&e->lock);
+	}
+	write_unlock_bh(&d->lock);
+	return e;
+}
+
+/* Sets/updates the contents of a switching L2T entry that has been allocated
+ * with an earlier call to @t4_l2t_alloc_switching.
+ */
+int t4_l2t_set_switching(struct adapter *adap, struct l2t_entry *e, u16 vlan,
+		u8 port, u8 *eth_addr)
+{
+	e->vlan = vlan;
+	e->lport = port;
+	memcpy(e->dmac, eth_addr, ETH_ALEN);
+	return write_l2e(adap, e, 0);
+}
+
+struct l2t_data *t4_init_l2t(void)
+{
+	int i;
+	struct l2t_data *d;
+
+	d = t4_alloc_mem(sizeof(*d));
+	if (!d)
+		return NULL;
+
+	d->rover = d->l2tab;
+	atomic_set(&d->nfree, L2T_SIZE);
+	rwlock_init(&d->lock);
+
+	for (i = 0; i < L2T_SIZE; ++i) {
+		d->l2tab[i].idx = i;
+		d->l2tab[i].state = L2T_STATE_UNUSED;
+		spin_lock_init(&d->l2tab[i].lock);
+		atomic_set(&d->l2tab[i].refcnt, 0);
+	}
+	return d;
+}
+
+static inline void *l2t_get_idx(struct seq_file *seq, loff_t pos)
+{
+	struct l2t_entry *l2tab = seq->private;
+
+	return pos >= L2T_SIZE ? NULL : &l2tab[pos];
+}
+
+static void *l2t_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	return *pos ? l2t_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
+}
+
+static void *l2t_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	v = l2t_get_idx(seq, *pos);
+	if (v)
+		++*pos;
+	return v;
+}
+
+static void l2t_seq_stop(struct seq_file *seq, void *v)
+{
+}
+
+static char l2e_state(const struct l2t_entry *e)
+{
+	switch (e->state) {
+	case L2T_STATE_VALID: return 'V';
+	case L2T_STATE_STALE: return 'S';
+	case L2T_STATE_SYNC_WRITE: return 'W';
+	case L2T_STATE_RESOLVING: return e->arpq_head ? 'A' : 'R';
+	case L2T_STATE_SWITCHING: return 'X';
+	default:
+		return 'U';
+	}
+}
+
+static int l2t_seq_show(struct seq_file *seq, void *v)
+{
+	if (v == SEQ_START_TOKEN)
+		seq_puts(seq, " Idx IP address                "
+			 "Ethernet address  VLAN/P LP State Users Port\n");
+	else {
+		char ip[60];
+		struct l2t_entry *e = v;
+
+		spin_lock_bh(&e->lock);
+		if (e->state == L2T_STATE_SWITCHING)
+			ip[0] = '\0';
+		else
+			sprintf(ip, e->v6 ? "%pI6c" : "%pI4", e->addr);
+		seq_printf(seq, "%4u %-25s %17pM %4d %u %2u   %c   %5u %s\n",
+			   e->idx, ip, e->dmac,
+			   e->vlan & VLAN_VID_MASK, vlan_prio(e), e->lport,
+			   l2e_state(e), atomic_read(&e->refcnt),
+			   e->neigh ? e->neigh->dev->name : "");
+		spin_unlock_bh(&e->lock);
+	}
+	return 0;
+}
+
+static const struct seq_operations l2t_seq_ops = {
+	.start = l2t_seq_start,
+	.next = l2t_seq_next,
+	.stop = l2t_seq_stop,
+	.show = l2t_seq_show
+};
+
+static int l2t_seq_open(struct inode *inode, struct file *file)
+{
+	int rc = seq_open(file, &l2t_seq_ops);
+
+	if (!rc) {
+		struct adapter *adap = inode->i_private;
+		struct seq_file *seq = file->private_data;
+
+		seq->private = adap->l2t->l2tab;
+	}
+	return rc;
+}
+
+const struct file_operations t4_l2t_fops = {
+	.owner = THIS_MODULE,
+	.open = l2t_seq_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = seq_release,
+};
diff --git a/drivers/net/ethernet/chelsio/cxgb4/l2t.h b/drivers/net/ethernet/chelsio/cxgb4/l2t.h
new file mode 100644
index 0000000..a30126c
--- /dev/null
+++ b/drivers/net/ethernet/chelsio/cxgb4/l2t.h
@@ -0,0 +1,111 @@
+/*
+ * This file is part of the Chelsio T4 Ethernet driver for Linux.
+ *
+ * Copyright (c) 2003-2014 Chelsio Communications, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __CXGB4_L2T_H
+#define __CXGB4_L2T_H
+
+#include <linux/spinlock.h>
+#include <linux/if_ether.h>
+#include <linux/atomic.h>
+
+struct adapter;
+struct l2t_data;
+struct neighbour;
+struct net_device;
+struct file_operations;
+struct cpl_l2t_write_rpl;
+
+/*
+ * Each L2T entry plays multiple roles.  First of all, it keeps state for the
+ * corresponding entry of the HW L2 table and maintains a queue of offload
+ * packets awaiting address resolution.  Second, it is a node of a hash table
+ * chain, where the nodes of the chain are linked together through their next
+ * pointer.  Finally, each node is a bucket of a hash table, pointing to the
+ * first element in its chain through its first pointer.
+ */
+struct l2t_entry {
+	u16 state;                  /* entry state */
+	u16 idx;                    /* entry index */
+	u32 addr[4];                /* next hop IP or IPv6 address */
+	int ifindex;                /* neighbor's net_device's ifindex */
+	struct neighbour *neigh;    /* associated neighbour */
+	struct l2t_entry *first;    /* start of hash chain */
+	struct l2t_entry *next;     /* next l2t_entry on chain */
+	struct sk_buff *arpq_head;  /* queue of packets awaiting resolution */
+	struct sk_buff *arpq_tail;
+	spinlock_t lock;
+	atomic_t refcnt;            /* entry reference count */
+	u16 hash;                   /* hash bucket the entry is on */
+	u16 vlan;                   /* VLAN TCI (id: bits 0-11, prio: 13-15 */
+	u8 v6;                      /* whether entry is for IPv6 */
+	u8 lport;                   /* associated offload logical interface */
+	u8 dmac[ETH_ALEN];          /* neighbour's MAC address */
+};
+
+typedef void (*arp_err_handler_t)(void *handle, struct sk_buff *skb);
+
+/*
+ * Callback stored in an skb to handle address resolution failure.
+ */
+struct l2t_skb_cb {
+	void *handle;
+	arp_err_handler_t arp_err_handler;
+};
+
+#define L2T_SKB_CB(skb) ((struct l2t_skb_cb *)(skb)->cb)
+
+static inline void t4_set_arp_err_handler(struct sk_buff *skb, void *handle,
+					  arp_err_handler_t handler)
+{
+	L2T_SKB_CB(skb)->handle = handle;
+	L2T_SKB_CB(skb)->arp_err_handler = handler;
+}
+
+void cxgb4_l2t_release(struct l2t_entry *e);
+int cxgb4_l2t_send(struct net_device *dev, struct sk_buff *skb,
+		   struct l2t_entry *e);
+struct l2t_entry *cxgb4_l2t_get(struct l2t_data *d, struct neighbour *neigh,
+				const struct net_device *physdev,
+				unsigned int priority);
+u64 cxgb4_select_ntuple(struct net_device *dev,
+			const struct l2t_entry *l2t);
+void t4_l2t_update(struct adapter *adap, struct neighbour *neigh);
+struct l2t_entry *t4_l2t_alloc_switching(struct l2t_data *d);
+int t4_l2t_set_switching(struct adapter *adap, struct l2t_entry *e, u16 vlan,
+			 u8 port, u8 *eth_addr);
+struct l2t_data *t4_init_l2t(void);
+void do_l2t_write_rpl(struct adapter *p, const struct cpl_l2t_write_rpl *rpl);
+
+extern const struct file_operations t4_l2t_fops;
+#endif  /* __CXGB4_L2T_H */
diff --git a/drivers/net/ethernet/chelsio/cxgb4/sge.c b/drivers/net/ethernet/chelsio/cxgb4/sge.c
new file mode 100644
index 0000000..39f2b13
--- /dev/null
+++ b/drivers/net/ethernet/chelsio/cxgb4/sge.c
@@ -0,0 +1,2988 @@
+/*
+ * This file is part of the Chelsio T4 Ethernet driver for Linux.
+ *
+ * Copyright (c) 2003-2014 Chelsio Communications, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/if_vlan.h>
+#include <linux/ip.h>
+#include <linux/dma-mapping.h>
+#include <linux/jiffies.h>
+#include <linux/prefetch.h>
+#include <linux/export.h>
+#include <net/ipv6.h>
+#include <net/tcp.h>
+#include "cxgb4.h"
+#include "t4_regs.h"
+#include "t4_msg.h"
+#include "t4fw_api.h"
+
+/*
+ * Rx buffer size.  We use largish buffers if possible but settle for single
+ * pages under memory shortage.
+ */
+#if PAGE_SHIFT >= 16
+# define FL_PG_ORDER 0
+#else
+# define FL_PG_ORDER (16 - PAGE_SHIFT)
+#endif
+
+/* RX_PULL_LEN should be <= RX_COPY_THRES */
+#define RX_COPY_THRES    256
+#define RX_PULL_LEN      128
+
+/*
+ * Main body length for sk_buffs used for Rx Ethernet packets with fragments.
+ * Should be >= RX_PULL_LEN but possibly bigger to give pskb_may_pull some room.
+ */
+#define RX_PKT_SKB_LEN   512
+
+/*
+ * Max number of Tx descriptors we clean up at a time.  Should be modest as
+ * freeing skbs isn't cheap and it happens while holding locks.  We just need
+ * to free packets faster than they arrive, we eventually catch up and keep
+ * the amortized cost reasonable.  Must be >= 2 * TXQ_STOP_THRES.
+ */
+#define MAX_TX_RECLAIM 16
+
+/*
+ * Max number of Rx buffers we replenish at a time.  Again keep this modest,
+ * allocating buffers isn't cheap either.
+ */
+#define MAX_RX_REFILL 16U
+
+/*
+ * Period of the Rx queue check timer.  This timer is infrequent as it has
+ * something to do only when the system experiences severe memory shortage.
+ */
+#define RX_QCHECK_PERIOD (HZ / 2)
+
+/*
+ * Period of the Tx queue check timer.
+ */
+#define TX_QCHECK_PERIOD (HZ / 2)
+
+/* SGE Hung Ingress DMA Threshold Warning time (in Hz) and Warning Repeat Rate
+ * (in RX_QCHECK_PERIOD multiples).  If we find one of the SGE Ingress DMA
+ * State Machines in the same state for this amount of time (in HZ) then we'll
+ * issue a warning about a potential hang.  We'll repeat the warning as the
+ * SGE Ingress DMA Channel appears to be hung every N RX_QCHECK_PERIODs till
+ * the situation clears.  If the situation clears, we'll note that as well.
+ */
+#define SGE_IDMA_WARN_THRESH (1 * HZ)
+#define SGE_IDMA_WARN_REPEAT (20 * RX_QCHECK_PERIOD)
+
+/*
+ * Max number of Tx descriptors to be reclaimed by the Tx timer.
+ */
+#define MAX_TIMER_TX_RECLAIM 100
+
+/*
+ * Timer index used when backing off due to memory shortage.
+ */
+#define NOMEM_TMR_IDX (SGE_NTIMERS - 1)
+
+/*
+ * An FL with <= FL_STARVE_THRES buffers is starving and a periodic timer will
+ * attempt to refill it.
+ */
+#define FL_STARVE_THRES 4
+
+/*
+ * Suspend an Ethernet Tx queue with fewer available descriptors than this.
+ * This is the same as calc_tx_descs() for a TSO packet with
+ * nr_frags == MAX_SKB_FRAGS.
+ */
+#define ETHTXQ_STOP_THRES \
+	(1 + DIV_ROUND_UP((3 * MAX_SKB_FRAGS) / 2 + (MAX_SKB_FRAGS & 1), 8))
+
+/*
+ * Suspension threshold for non-Ethernet Tx queues.  We require enough room
+ * for a full sized WR.
+ */
+#define TXQ_STOP_THRES (SGE_MAX_WR_LEN / sizeof(struct tx_desc))
+
+/*
+ * Max Tx descriptor space we allow for an Ethernet packet to be inlined
+ * into a WR.
+ */
+#define MAX_IMM_TX_PKT_LEN 128
+
+/*
+ * Max size of a WR sent through a control Tx queue.
+ */
+#define MAX_CTRL_WR_LEN SGE_MAX_WR_LEN
+
+struct tx_sw_desc {                /* SW state per Tx descriptor */
+	struct sk_buff *skb;
+	struct ulptx_sgl *sgl;
+};
+
+struct rx_sw_desc {                /* SW state per Rx descriptor */
+	struct page *page;
+	dma_addr_t dma_addr;
+};
+
+/*
+ * Rx buffer sizes for "useskbs" Free List buffers (one ingress packet pe skb
+ * buffer).  We currently only support two sizes for 1500- and 9000-byte MTUs.
+ * We could easily support more but there doesn't seem to be much need for
+ * that ...
+ */
+#define FL_MTU_SMALL 1500
+#define FL_MTU_LARGE 9000
+
+static inline unsigned int fl_mtu_bufsize(struct adapter *adapter,
+					  unsigned int mtu)
+{
+	struct sge *s = &adapter->sge;
+
+	return ALIGN(s->pktshift + ETH_HLEN + VLAN_HLEN + mtu, s->fl_align);
+}
+
+#define FL_MTU_SMALL_BUFSIZE(adapter) fl_mtu_bufsize(adapter, FL_MTU_SMALL)
+#define FL_MTU_LARGE_BUFSIZE(adapter) fl_mtu_bufsize(adapter, FL_MTU_LARGE)
+
+/*
+ * Bits 0..3 of rx_sw_desc.dma_addr have special meaning.  The hardware uses
+ * these to specify the buffer size as an index into the SGE Free List Buffer
+ * Size register array.  We also use bit 4, when the buffer has been unmapped
+ * for DMA, but this is of course never sent to the hardware and is only used
+ * to prevent double unmappings.  All of the above requires that the Free List
+ * Buffers which we allocate have the bottom 5 bits free (0) -- i.e. are
+ * 32-byte or or a power of 2 greater in alignment.  Since the SGE's minimal
+ * Free List Buffer alignment is 32 bytes, this works out for us ...
+ */
+enum {
+	RX_BUF_FLAGS     = 0x1f,   /* bottom five bits are special */
+	RX_BUF_SIZE      = 0x0f,   /* bottom three bits are for buf sizes */
+	RX_UNMAPPED_BUF  = 0x10,   /* buffer is not mapped */
+
+	/*
+	 * XXX We shouldn't depend on being able to use these indices.
+	 * XXX Especially when some other Master PF has initialized the
+	 * XXX adapter or we use the Firmware Configuration File.  We
+	 * XXX should really search through the Host Buffer Size register
+	 * XXX array for the appropriately sized buffer indices.
+	 */
+	RX_SMALL_PG_BUF  = 0x0,   /* small (PAGE_SIZE) page buffer */
+	RX_LARGE_PG_BUF  = 0x1,   /* buffer large (FL_PG_ORDER) page buffer */
+
+	RX_SMALL_MTU_BUF = 0x2,   /* small MTU buffer */
+	RX_LARGE_MTU_BUF = 0x3,   /* large MTU buffer */
+};
+
+static int timer_pkt_quota[] = {1, 1, 2, 3, 4, 5};
+#define MIN_NAPI_WORK  1
+
+static inline dma_addr_t get_buf_addr(const struct rx_sw_desc *d)
+{
+	return d->dma_addr & ~(dma_addr_t)RX_BUF_FLAGS;
+}
+
+static inline bool is_buf_mapped(const struct rx_sw_desc *d)
+{
+	return !(d->dma_addr & RX_UNMAPPED_BUF);
+}
+
+/**
+ *	txq_avail - return the number of available slots in a Tx queue
+ *	@q: the Tx queue
+ *
+ *	Returns the number of descriptors in a Tx queue available to write new
+ *	packets.
+ */
+static inline unsigned int txq_avail(const struct sge_txq *q)
+{
+	return q->size - 1 - q->in_use;
+}
+
+/**
+ *	fl_cap - return the capacity of a free-buffer list
+ *	@fl: the FL
+ *
+ *	Returns the capacity of a free-buffer list.  The capacity is less than
+ *	the size because one descriptor needs to be left unpopulated, otherwise
+ *	HW will think the FL is empty.
+ */
+static inline unsigned int fl_cap(const struct sge_fl *fl)
+{
+	return fl->size - 8;   /* 1 descriptor = 8 buffers */
+}
+
+static inline bool fl_starving(const struct sge_fl *fl)
+{
+	return fl->avail - fl->pend_cred <= FL_STARVE_THRES;
+}
+
+static int map_skb(struct device *dev, const struct sk_buff *skb,
+		   dma_addr_t *addr)
+{
+	const skb_frag_t *fp, *end;
+	const struct skb_shared_info *si;
+
+	*addr = dma_map_single(dev, skb->data, skb_headlen(skb), DMA_TO_DEVICE);
+	if (dma_mapping_error(dev, *addr))
+		goto out_err;
+
+	si = skb_shinfo(skb);
+	end = &si->frags[si->nr_frags];
+
+	for (fp = si->frags; fp < end; fp++) {
+		*++addr = skb_frag_dma_map(dev, fp, 0, skb_frag_size(fp),
+					   DMA_TO_DEVICE);
+		if (dma_mapping_error(dev, *addr))
+			goto unwind;
+	}
+	return 0;
+
+unwind:
+	while (fp-- > si->frags)
+		dma_unmap_page(dev, *--addr, skb_frag_size(fp), DMA_TO_DEVICE);
+
+	dma_unmap_single(dev, addr[-1], skb_headlen(skb), DMA_TO_DEVICE);
+out_err:
+	return -ENOMEM;
+}
+
+#ifdef CONFIG_NEED_DMA_MAP_STATE
+static void unmap_skb(struct device *dev, const struct sk_buff *skb,
+		      const dma_addr_t *addr)
+{
+	const skb_frag_t *fp, *end;
+	const struct skb_shared_info *si;
+
+	dma_unmap_single(dev, *addr++, skb_headlen(skb), DMA_TO_DEVICE);
+
+	si = skb_shinfo(skb);
+	end = &si->frags[si->nr_frags];
+	for (fp = si->frags; fp < end; fp++)
+		dma_unmap_page(dev, *addr++, skb_frag_size(fp), DMA_TO_DEVICE);
+}
+
+/**
+ *	deferred_unmap_destructor - unmap a packet when it is freed
+ *	@skb: the packet
+ *
+ *	This is the packet destructor used for Tx packets that need to remain
+ *	mapped until they are freed rather than until their Tx descriptors are
+ *	freed.
+ */
+static void deferred_unmap_destructor(struct sk_buff *skb)
+{
+	unmap_skb(skb->dev->dev.parent, skb, (dma_addr_t *)skb->head);
+}
+#endif
+
+static void unmap_sgl(struct device *dev, const struct sk_buff *skb,
+		      const struct ulptx_sgl *sgl, const struct sge_txq *q)
+{
+	const struct ulptx_sge_pair *p;
+	unsigned int nfrags = skb_shinfo(skb)->nr_frags;
+
+	if (likely(skb_headlen(skb)))
+		dma_unmap_single(dev, be64_to_cpu(sgl->addr0), ntohl(sgl->len0),
+				 DMA_TO_DEVICE);
+	else {
+		dma_unmap_page(dev, be64_to_cpu(sgl->addr0), ntohl(sgl->len0),
+			       DMA_TO_DEVICE);
+		nfrags--;
+	}
+
+	/*
+	 * the complexity below is because of the possibility of a wrap-around
+	 * in the middle of an SGL
+	 */
+	for (p = sgl->sge; nfrags >= 2; nfrags -= 2) {
+		if (likely((u8 *)(p + 1) <= (u8 *)q->stat)) {
+unmap:			dma_unmap_page(dev, be64_to_cpu(p->addr[0]),
+				       ntohl(p->len[0]), DMA_TO_DEVICE);
+			dma_unmap_page(dev, be64_to_cpu(p->addr[1]),
+				       ntohl(p->len[1]), DMA_TO_DEVICE);
+			p++;
+		} else if ((u8 *)p == (u8 *)q->stat) {
+			p = (const struct ulptx_sge_pair *)q->desc;
+			goto unmap;
+		} else if ((u8 *)p + 8 == (u8 *)q->stat) {
+			const __be64 *addr = (const __be64 *)q->desc;
+
+			dma_unmap_page(dev, be64_to_cpu(addr[0]),
+				       ntohl(p->len[0]), DMA_TO_DEVICE);
+			dma_unmap_page(dev, be64_to_cpu(addr[1]),
+				       ntohl(p->len[1]), DMA_TO_DEVICE);
+			p = (const struct ulptx_sge_pair *)&addr[2];
+		} else {
+			const __be64 *addr = (const __be64 *)q->desc;
+
+			dma_unmap_page(dev, be64_to_cpu(p->addr[0]),
+				       ntohl(p->len[0]), DMA_TO_DEVICE);
+			dma_unmap_page(dev, be64_to_cpu(addr[0]),
+				       ntohl(p->len[1]), DMA_TO_DEVICE);
+			p = (const struct ulptx_sge_pair *)&addr[1];
+		}
+	}
+	if (nfrags) {
+		__be64 addr;
+
+		if ((u8 *)p == (u8 *)q->stat)
+			p = (const struct ulptx_sge_pair *)q->desc;
+		addr = (u8 *)p + 16 <= (u8 *)q->stat ? p->addr[0] :
+						       *(const __be64 *)q->desc;
+		dma_unmap_page(dev, be64_to_cpu(addr), ntohl(p->len[0]),
+			       DMA_TO_DEVICE);
+	}
+}
+
+/**
+ *	free_tx_desc - reclaims Tx descriptors and their buffers
+ *	@adapter: the adapter
+ *	@q: the Tx queue to reclaim descriptors from
+ *	@n: the number of descriptors to reclaim
+ *	@unmap: whether the buffers should be unmapped for DMA
+ *
+ *	Reclaims Tx descriptors from an SGE Tx queue and frees the associated
+ *	Tx buffers.  Called with the Tx queue lock held.
+ */
+static void free_tx_desc(struct adapter *adap, struct sge_txq *q,
+			 unsigned int n, bool unmap)
+{
+	struct tx_sw_desc *d;
+	unsigned int cidx = q->cidx;
+	struct device *dev = adap->pdev_dev;
+
+	d = &q->sdesc[cidx];
+	while (n--) {
+		if (d->skb) {                       /* an SGL is present */
+			if (unmap)
+				unmap_sgl(dev, d->skb, d->sgl, q);
+			dev_consume_skb_any(d->skb);
+			d->skb = NULL;
+		}
+		++d;
+		if (++cidx == q->size) {
+			cidx = 0;
+			d = q->sdesc;
+		}
+	}
+	q->cidx = cidx;
+}
+
+/*
+ * Return the number of reclaimable descriptors in a Tx queue.
+ */
+static inline int reclaimable(const struct sge_txq *q)
+{
+	int hw_cidx = ntohs(q->stat->cidx);
+	hw_cidx -= q->cidx;
+	return hw_cidx < 0 ? hw_cidx + q->size : hw_cidx;
+}
+
+/**
+ *	reclaim_completed_tx - reclaims completed Tx descriptors
+ *	@adap: the adapter
+ *	@q: the Tx queue to reclaim completed descriptors from
+ *	@unmap: whether the buffers should be unmapped for DMA
+ *
+ *	Reclaims Tx descriptors that the SGE has indicated it has processed,
+ *	and frees the associated buffers if possible.  Called with the Tx
+ *	queue locked.
+ */
+static inline void reclaim_completed_tx(struct adapter *adap, struct sge_txq *q,
+					bool unmap)
+{
+	int avail = reclaimable(q);
+
+	if (avail) {
+		/*
+		 * Limit the amount of clean up work we do at a time to keep
+		 * the Tx lock hold time O(1).
+		 */
+		if (avail > MAX_TX_RECLAIM)
+			avail = MAX_TX_RECLAIM;
+
+		free_tx_desc(adap, q, avail, unmap);
+		q->in_use -= avail;
+	}
+}
+
+static inline int get_buf_size(struct adapter *adapter,
+			       const struct rx_sw_desc *d)
+{
+	struct sge *s = &adapter->sge;
+	unsigned int rx_buf_size_idx = d->dma_addr & RX_BUF_SIZE;
+	int buf_size;
+
+	switch (rx_buf_size_idx) {
+	case RX_SMALL_PG_BUF:
+		buf_size = PAGE_SIZE;
+		break;
+
+	case RX_LARGE_PG_BUF:
+		buf_size = PAGE_SIZE << s->fl_pg_order;
+		break;
+
+	case RX_SMALL_MTU_BUF:
+		buf_size = FL_MTU_SMALL_BUFSIZE(adapter);
+		break;
+
+	case RX_LARGE_MTU_BUF:
+		buf_size = FL_MTU_LARGE_BUFSIZE(adapter);
+		break;
+
+	default:
+		BUG_ON(1);
+	}
+
+	return buf_size;
+}
+
+/**
+ *	free_rx_bufs - free the Rx buffers on an SGE free list
+ *	@adap: the adapter
+ *	@q: the SGE free list to free buffers from
+ *	@n: how many buffers to free
+ *
+ *	Release the next @n buffers on an SGE free-buffer Rx queue.   The
+ *	buffers must be made inaccessible to HW before calling this function.
+ */
+static void free_rx_bufs(struct adapter *adap, struct sge_fl *q, int n)
+{
+	while (n--) {
+		struct rx_sw_desc *d = &q->sdesc[q->cidx];
+
+		if (is_buf_mapped(d))
+			dma_unmap_page(adap->pdev_dev, get_buf_addr(d),
+				       get_buf_size(adap, d),
+				       PCI_DMA_FROMDEVICE);
+		put_page(d->page);
+		d->page = NULL;
+		if (++q->cidx == q->size)
+			q->cidx = 0;
+		q->avail--;
+	}
+}
+
+/**
+ *	unmap_rx_buf - unmap the current Rx buffer on an SGE free list
+ *	@adap: the adapter
+ *	@q: the SGE free list
+ *
+ *	Unmap the current buffer on an SGE free-buffer Rx queue.   The
+ *	buffer must be made inaccessible to HW before calling this function.
+ *
+ *	This is similar to @free_rx_bufs above but does not free the buffer.
+ *	Do note that the FL still loses any further access to the buffer.
+ */
+static void unmap_rx_buf(struct adapter *adap, struct sge_fl *q)
+{
+	struct rx_sw_desc *d = &q->sdesc[q->cidx];
+
+	if (is_buf_mapped(d))
+		dma_unmap_page(adap->pdev_dev, get_buf_addr(d),
+			       get_buf_size(adap, d), PCI_DMA_FROMDEVICE);
+	d->page = NULL;
+	if (++q->cidx == q->size)
+		q->cidx = 0;
+	q->avail--;
+}
+
+static inline void ring_fl_db(struct adapter *adap, struct sge_fl *q)
+{
+	u32 val;
+	if (q->pend_cred >= 8) {
+		val = PIDX(q->pend_cred / 8);
+		if (!is_t4(adap->params.chip))
+			val |= DBTYPE(1);
+		val |= DBPRIO(1);
+		wmb();
+
+		/* If we're on T4, use the old doorbell mechanism; otherwise
+		 * use the new BAR2 mechanism.
+		 */
+		if (is_t4(adap->params.chip)) {
+			t4_write_reg(adap, MYPF_REG(SGE_PF_KDOORBELL),
+				     val | QID(q->cntxt_id));
+		} else {
+			writel(val,  adap->bar2 + q->udb + SGE_UDB_KDOORBELL);
+
+			/* This Write memory Barrier will force the write to
+			 * the User Doorbell area to be flushed.
+			 */
+			wmb();
+		}
+		q->pend_cred &= 7;
+	}
+}
+
+static inline void set_rx_sw_desc(struct rx_sw_desc *sd, struct page *pg,
+				  dma_addr_t mapping)
+{
+	sd->page = pg;
+	sd->dma_addr = mapping;      /* includes size low bits */
+}
+
+/**
+ *	refill_fl - refill an SGE Rx buffer ring
+ *	@adap: the adapter
+ *	@q: the ring to refill
+ *	@n: the number of new buffers to allocate
+ *	@gfp: the gfp flags for the allocations
+ *
+ *	(Re)populate an SGE free-buffer queue with up to @n new packet buffers,
+ *	allocated with the supplied gfp flags.  The caller must assure that
+ *	@n does not exceed the queue's capacity.  If afterwards the queue is
+ *	found critically low mark it as starving in the bitmap of starving FLs.
+ *
+ *	Returns the number of buffers allocated.
+ */
+static unsigned int refill_fl(struct adapter *adap, struct sge_fl *q, int n,
+			      gfp_t gfp)
+{
+	struct sge *s = &adap->sge;
+	struct page *pg;
+	dma_addr_t mapping;
+	unsigned int cred = q->avail;
+	__be64 *d = &q->desc[q->pidx];
+	struct rx_sw_desc *sd = &q->sdesc[q->pidx];
+
+	gfp |= __GFP_NOWARN | __GFP_COLD;
+
+	if (s->fl_pg_order == 0)
+		goto alloc_small_pages;
+
+	/*
+	 * Prefer large buffers
+	 */
+	while (n) {
+		pg = alloc_pages(gfp | __GFP_COMP, s->fl_pg_order);
+		if (unlikely(!pg)) {
+			q->large_alloc_failed++;
+			break;       /* fall back to single pages */
+		}
+
+		mapping = dma_map_page(adap->pdev_dev, pg, 0,
+				       PAGE_SIZE << s->fl_pg_order,
+				       PCI_DMA_FROMDEVICE);
+		if (unlikely(dma_mapping_error(adap->pdev_dev, mapping))) {
+			__free_pages(pg, s->fl_pg_order);
+			goto out;   /* do not try small pages for this error */
+		}
+		mapping |= RX_LARGE_PG_BUF;
+		*d++ = cpu_to_be64(mapping);
+
+		set_rx_sw_desc(sd, pg, mapping);
+		sd++;
+
+		q->avail++;
+		if (++q->pidx == q->size) {
+			q->pidx = 0;
+			sd = q->sdesc;
+			d = q->desc;
+		}
+		n--;
+	}
+
+alloc_small_pages:
+	while (n--) {
+		pg = __skb_alloc_page(gfp, NULL);
+		if (unlikely(!pg)) {
+			q->alloc_failed++;
+			break;
+		}
+
+		mapping = dma_map_page(adap->pdev_dev, pg, 0, PAGE_SIZE,
+				       PCI_DMA_FROMDEVICE);
+		if (unlikely(dma_mapping_error(adap->pdev_dev, mapping))) {
+			put_page(pg);
+			goto out;
+		}
+		*d++ = cpu_to_be64(mapping);
+
+		set_rx_sw_desc(sd, pg, mapping);
+		sd++;
+
+		q->avail++;
+		if (++q->pidx == q->size) {
+			q->pidx = 0;
+			sd = q->sdesc;
+			d = q->desc;
+		}
+	}
+
+out:	cred = q->avail - cred;
+	q->pend_cred += cred;
+	ring_fl_db(adap, q);
+
+	if (unlikely(fl_starving(q))) {
+		smp_wmb();
+		set_bit(q->cntxt_id - adap->sge.egr_start,
+			adap->sge.starving_fl);
+	}
+
+	return cred;
+}
+
+static inline void __refill_fl(struct adapter *adap, struct sge_fl *fl)
+{
+	refill_fl(adap, fl, min(MAX_RX_REFILL, fl_cap(fl) - fl->avail),
+		  GFP_ATOMIC);
+}
+
+/**
+ *	alloc_ring - allocate resources for an SGE descriptor ring
+ *	@dev: the PCI device's core device
+ *	@nelem: the number of descriptors
+ *	@elem_size: the size of each descriptor
+ *	@sw_size: the size of the SW state associated with each ring element
+ *	@phys: the physical address of the allocated ring
+ *	@metadata: address of the array holding the SW state for the ring
+ *	@stat_size: extra space in HW ring for status information
+ *	@node: preferred node for memory allocations
+ *
+ *	Allocates resources for an SGE descriptor ring, such as Tx queues,
+ *	free buffer lists, or response queues.  Each SGE ring requires
+ *	space for its HW descriptors plus, optionally, space for the SW state
+ *	associated with each HW entry (the metadata).  The function returns
+ *	three values: the virtual address for the HW ring (the return value
+ *	of the function), the bus address of the HW ring, and the address
+ *	of the SW ring.
+ */
+static void *alloc_ring(struct device *dev, size_t nelem, size_t elem_size,
+			size_t sw_size, dma_addr_t *phys, void *metadata,
+			size_t stat_size, int node)
+{
+	size_t len = nelem * elem_size + stat_size;
+	void *s = NULL;
+	void *p = dma_alloc_coherent(dev, len, phys, GFP_KERNEL);
+
+	if (!p)
+		return NULL;
+	if (sw_size) {
+		s = kzalloc_node(nelem * sw_size, GFP_KERNEL, node);
+
+		if (!s) {
+			dma_free_coherent(dev, len, p, *phys);
+			return NULL;
+		}
+	}
+	if (metadata)
+		*(void **)metadata = s;
+	memset(p, 0, len);
+	return p;
+}
+
+/**
+ *	sgl_len - calculates the size of an SGL of the given capacity
+ *	@n: the number of SGL entries
+ *
+ *	Calculates the number of flits needed for a scatter/gather list that
+ *	can hold the given number of entries.
+ */
+static inline unsigned int sgl_len(unsigned int n)
+{
+	n--;
+	return (3 * n) / 2 + (n & 1) + 2;
+}
+
+/**
+ *	flits_to_desc - returns the num of Tx descriptors for the given flits
+ *	@n: the number of flits
+ *
+ *	Returns the number of Tx descriptors needed for the supplied number
+ *	of flits.
+ */
+static inline unsigned int flits_to_desc(unsigned int n)
+{
+	BUG_ON(n > SGE_MAX_WR_LEN / 8);
+	return DIV_ROUND_UP(n, 8);
+}
+
+/**
+ *	is_eth_imm - can an Ethernet packet be sent as immediate data?
+ *	@skb: the packet
+ *
+ *	Returns whether an Ethernet packet is small enough to fit as
+ *	immediate data. Return value corresponds to headroom required.
+ */
+static inline int is_eth_imm(const struct sk_buff *skb)
+{
+	int hdrlen = skb_shinfo(skb)->gso_size ?
+			sizeof(struct cpl_tx_pkt_lso_core) : 0;
+
+	hdrlen += sizeof(struct cpl_tx_pkt);
+	if (skb->len <= MAX_IMM_TX_PKT_LEN - hdrlen)
+		return hdrlen;
+	return 0;
+}
+
+/**
+ *	calc_tx_flits - calculate the number of flits for a packet Tx WR
+ *	@skb: the packet
+ *
+ *	Returns the number of flits needed for a Tx WR for the given Ethernet
+ *	packet, including the needed WR and CPL headers.
+ */
+static inline unsigned int calc_tx_flits(const struct sk_buff *skb)
+{
+	unsigned int flits;
+	int hdrlen = is_eth_imm(skb);
+
+	if (hdrlen)
+		return DIV_ROUND_UP(skb->len + hdrlen, sizeof(__be64));
+
+	flits = sgl_len(skb_shinfo(skb)->nr_frags + 1) + 4;
+	if (skb_shinfo(skb)->gso_size)
+		flits += 2;
+	return flits;
+}
+
+/**
+ *	calc_tx_descs - calculate the number of Tx descriptors for a packet
+ *	@skb: the packet
+ *
+ *	Returns the number of Tx descriptors needed for the given Ethernet
+ *	packet, including the needed WR and CPL headers.
+ */
+static inline unsigned int calc_tx_descs(const struct sk_buff *skb)
+{
+	return flits_to_desc(calc_tx_flits(skb));
+}
+
+/**
+ *	write_sgl - populate a scatter/gather list for a packet
+ *	@skb: the packet
+ *	@q: the Tx queue we are writing into
+ *	@sgl: starting location for writing the SGL
+ *	@end: points right after the end of the SGL
+ *	@start: start offset into skb main-body data to include in the SGL
+ *	@addr: the list of bus addresses for the SGL elements
+ *
+ *	Generates a gather list for the buffers that make up a packet.
+ *	The caller must provide adequate space for the SGL that will be written.
+ *	The SGL includes all of the packet's page fragments and the data in its
+ *	main body except for the first @start bytes.  @sgl must be 16-byte
+ *	aligned and within a Tx descriptor with available space.  @end points
+ *	right after the end of the SGL but does not account for any potential
+ *	wrap around, i.e., @end > @sgl.
+ */
+static void write_sgl(const struct sk_buff *skb, struct sge_txq *q,
+		      struct ulptx_sgl *sgl, u64 *end, unsigned int start,
+		      const dma_addr_t *addr)
+{
+	unsigned int i, len;
+	struct ulptx_sge_pair *to;
+	const struct skb_shared_info *si = skb_shinfo(skb);
+	unsigned int nfrags = si->nr_frags;
+	struct ulptx_sge_pair buf[MAX_SKB_FRAGS / 2 + 1];
+
+	len = skb_headlen(skb) - start;
+	if (likely(len)) {
+		sgl->len0 = htonl(len);
+		sgl->addr0 = cpu_to_be64(addr[0] + start);
+		nfrags++;
+	} else {
+		sgl->len0 = htonl(skb_frag_size(&si->frags[0]));
+		sgl->addr0 = cpu_to_be64(addr[1]);
+	}
+
+	sgl->cmd_nsge = htonl(ULPTX_CMD(ULP_TX_SC_DSGL) | ULPTX_NSGE(nfrags));
+	if (likely(--nfrags == 0))
+		return;
+	/*
+	 * Most of the complexity below deals with the possibility we hit the
+	 * end of the queue in the middle of writing the SGL.  For this case
+	 * only we create the SGL in a temporary buffer and then copy it.
+	 */
+	to = (u8 *)end > (u8 *)q->stat ? buf : sgl->sge;
+
+	for (i = (nfrags != si->nr_frags); nfrags >= 2; nfrags -= 2, to++) {
+		to->len[0] = cpu_to_be32(skb_frag_size(&si->frags[i]));
+		to->len[1] = cpu_to_be32(skb_frag_size(&si->frags[++i]));
+		to->addr[0] = cpu_to_be64(addr[i]);
+		to->addr[1] = cpu_to_be64(addr[++i]);
+	}
+	if (nfrags) {
+		to->len[0] = cpu_to_be32(skb_frag_size(&si->frags[i]));
+		to->len[1] = cpu_to_be32(0);
+		to->addr[0] = cpu_to_be64(addr[i + 1]);
+	}
+	if (unlikely((u8 *)end > (u8 *)q->stat)) {
+		unsigned int part0 = (u8 *)q->stat - (u8 *)sgl->sge, part1;
+
+		if (likely(part0))
+			memcpy(sgl->sge, buf, part0);
+		part1 = (u8 *)end - (u8 *)q->stat;
+		memcpy(q->desc, (u8 *)buf + part0, part1);
+		end = (void *)q->desc + part1;
+	}
+	if ((uintptr_t)end & 8)           /* 0-pad to multiple of 16 */
+		*end = 0;
+}
+
+/* This function copies a tx_desc struct to memory mapped BAR2 space(user space
+ * writes). For coalesced WR SGE, fetches data from the FIFO instead of from
+ * Host.
+ */
+static void cxgb_pio_copy(u64 __iomem *dst, struct tx_desc *desc)
+{
+	int count = sizeof(*desc) / sizeof(u64);
+	u64 *src = (u64 *)desc;
+
+	while (count) {
+		writeq(*src, dst);
+		src++;
+		dst++;
+		count--;
+	}
+}
+
+/**
+ *	ring_tx_db - check and potentially ring a Tx queue's doorbell
+ *	@adap: the adapter
+ *	@q: the Tx queue
+ *	@n: number of new descriptors to give to HW
+ *
+ *	Ring the doorbel for a Tx queue.
+ */
+static inline void ring_tx_db(struct adapter *adap, struct sge_txq *q, int n)
+{
+	wmb();            /* write descriptors before telling HW */
+
+	if (is_t4(adap->params.chip)) {
+		u32 val = PIDX(n);
+		unsigned long flags;
+
+		/* For T4 we need to participate in the Doorbell Recovery
+		 * mechanism.
+		 */
+		spin_lock_irqsave(&q->db_lock, flags);
+		if (!q->db_disabled)
+			t4_write_reg(adap, MYPF_REG(SGE_PF_KDOORBELL),
+				     QID(q->cntxt_id) | val);
+		else
+			q->db_pidx_inc += n;
+		q->db_pidx = q->pidx;
+		spin_unlock_irqrestore(&q->db_lock, flags);
+	} else {
+		u32 val = PIDX_T5(n);
+
+		/* T4 and later chips share the same PIDX field offset within
+		 * the doorbell, but T5 and later shrank the field in order to
+		 * gain a bit for Doorbell Priority.  The field was absurdly
+		 * large in the first place (14 bits) so we just use the T5
+		 * and later limits and warn if a Queue ID is too large.
+		 */
+		WARN_ON(val & DBPRIO(1));
+
+		/* For T5 and later we use the Write-Combine mapped BAR2 User
+		 * Doorbell mechanism.  If we're only writing a single TX
+		 * Descriptor and TX Write Combining hasn't been disabled, we
+		 * can use the Write Combining Gather Buffer; otherwise we use
+		 * the simple doorbell.
+		 */
+		if (n == 1) {
+			int index = (q->pidx
+				     ? (q->pidx - 1)
+				     : (q->size - 1));
+
+			cxgb_pio_copy(adap->bar2 + q->udb + SGE_UDB_WCDOORBELL,
+				      q->desc + index);
+		} else {
+			writel(val,  adap->bar2 + q->udb + SGE_UDB_KDOORBELL);
+		}
+
+		/* This Write Memory Barrier will force the write to the User
+		 * Doorbell area to be flushed.  This is needed to prevent
+		 * writes on different CPUs for the same queue from hitting
+		 * the adapter out of order.  This is required when some Work
+		 * Requests take the Write Combine Gather Buffer path (user
+		 * doorbell area offset [SGE_UDB_WCDOORBELL..+63]) and some
+		 * take the traditional path where we simply increment the
+		 * PIDX (User Doorbell area SGE_UDB_KDOORBELL) and have the
+		 * hardware DMA read the actual Work Request.
+		 */
+		wmb();
+	}
+}
+
+/**
+ *	inline_tx_skb - inline a packet's data into Tx descriptors
+ *	@skb: the packet
+ *	@q: the Tx queue where the packet will be inlined
+ *	@pos: starting position in the Tx queue where to inline the packet
+ *
+ *	Inline a packet's contents directly into Tx descriptors, starting at
+ *	the given position within the Tx DMA ring.
+ *	Most of the complexity of this operation is dealing with wrap arounds
+ *	in the middle of the packet we want to inline.
+ */
+static void inline_tx_skb(const struct sk_buff *skb, const struct sge_txq *q,
+			  void *pos)
+{
+	u64 *p;
+	int left = (void *)q->stat - pos;
+
+	if (likely(skb->len <= left)) {
+		if (likely(!skb->data_len))
+			skb_copy_from_linear_data(skb, pos, skb->len);
+		else
+			skb_copy_bits(skb, 0, pos, skb->len);
+		pos += skb->len;
+	} else {
+		skb_copy_bits(skb, 0, pos, left);
+		skb_copy_bits(skb, left, q->desc, skb->len - left);
+		pos = (void *)q->desc + (skb->len - left);
+	}
+
+	/* 0-pad to multiple of 16 */
+	p = PTR_ALIGN(pos, 8);
+	if ((uintptr_t)p & 8)
+		*p = 0;
+}
+
+/*
+ * Figure out what HW csum a packet wants and return the appropriate control
+ * bits.
+ */
+static u64 hwcsum(const struct sk_buff *skb)
+{
+	int csum_type;
+	const struct iphdr *iph = ip_hdr(skb);
+
+	if (iph->version == 4) {
+		if (iph->protocol == IPPROTO_TCP)
+			csum_type = TX_CSUM_TCPIP;
+		else if (iph->protocol == IPPROTO_UDP)
+			csum_type = TX_CSUM_UDPIP;
+		else {
+nocsum:			/*
+			 * unknown protocol, disable HW csum
+			 * and hope a bad packet is detected
+			 */
+			return TXPKT_L4CSUM_DIS;
+		}
+	} else {
+		/*
+		 * this doesn't work with extension headers
+		 */
+		const struct ipv6hdr *ip6h = (const struct ipv6hdr *)iph;
+
+		if (ip6h->nexthdr == IPPROTO_TCP)
+			csum_type = TX_CSUM_TCPIP6;
+		else if (ip6h->nexthdr == IPPROTO_UDP)
+			csum_type = TX_CSUM_UDPIP6;
+		else
+			goto nocsum;
+	}
+
+	if (likely(csum_type >= TX_CSUM_TCPIP))
+		return TXPKT_CSUM_TYPE(csum_type) |
+			TXPKT_IPHDR_LEN(skb_network_header_len(skb)) |
+			TXPKT_ETHHDR_LEN(skb_network_offset(skb) - ETH_HLEN);
+	else {
+		int start = skb_transport_offset(skb);
+
+		return TXPKT_CSUM_TYPE(csum_type) | TXPKT_CSUM_START(start) |
+			TXPKT_CSUM_LOC(start + skb->csum_offset);
+	}
+}
+
+static void eth_txq_stop(struct sge_eth_txq *q)
+{
+	netif_tx_stop_queue(q->txq);
+	q->q.stops++;
+}
+
+static inline void txq_advance(struct sge_txq *q, unsigned int n)
+{
+	q->in_use += n;
+	q->pidx += n;
+	if (q->pidx >= q->size)
+		q->pidx -= q->size;
+}
+
+/**
+ *	t4_eth_xmit - add a packet to an Ethernet Tx queue
+ *	@skb: the packet
+ *	@dev: the egress net device
+ *
+ *	Add a packet to an SGE Ethernet Tx queue.  Runs with softirqs disabled.
+ */
+netdev_tx_t t4_eth_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	int len;
+	u32 wr_mid;
+	u64 cntrl, *end;
+	int qidx, credits;
+	unsigned int flits, ndesc;
+	struct adapter *adap;
+	struct sge_eth_txq *q;
+	const struct port_info *pi;
+	struct fw_eth_tx_pkt_wr *wr;
+	struct cpl_tx_pkt_core *cpl;
+	const struct skb_shared_info *ssi;
+	dma_addr_t addr[MAX_SKB_FRAGS + 1];
+	bool immediate = false;
+
+	/*
+	 * The chip min packet length is 10 octets but play safe and reject
+	 * anything shorter than an Ethernet header.
+	 */
+	if (unlikely(skb->len < ETH_HLEN)) {
+out_free:	dev_kfree_skb_any(skb);
+		return NETDEV_TX_OK;
+	}
+
+	pi = netdev_priv(dev);
+	adap = pi->adapter;
+	qidx = skb_get_queue_mapping(skb);
+	q = &adap->sge.ethtxq[qidx + pi->first_qset];
+
+	reclaim_completed_tx(adap, &q->q, true);
+
+	flits = calc_tx_flits(skb);
+	ndesc = flits_to_desc(flits);
+	credits = txq_avail(&q->q) - ndesc;
+
+	if (unlikely(credits < 0)) {
+		eth_txq_stop(q);
+		dev_err(adap->pdev_dev,
+			"%s: Tx ring %u full while queue awake!\n",
+			dev->name, qidx);
+		return NETDEV_TX_BUSY;
+	}
+
+	if (is_eth_imm(skb))
+		immediate = true;
+
+	if (!immediate &&
+	    unlikely(map_skb(adap->pdev_dev, skb, addr) < 0)) {
+		q->mapping_err++;
+		goto out_free;
+	}
+
+	wr_mid = FW_WR_LEN16(DIV_ROUND_UP(flits, 2));
+	if (unlikely(credits < ETHTXQ_STOP_THRES)) {
+		eth_txq_stop(q);
+		wr_mid |= FW_WR_EQUEQ | FW_WR_EQUIQ;
+	}
+
+	wr = (void *)&q->q.desc[q->q.pidx];
+	wr->equiq_to_len16 = htonl(wr_mid);
+	wr->r3 = cpu_to_be64(0);
+	end = (u64 *)wr + flits;
+
+	len = immediate ? skb->len : 0;
+	ssi = skb_shinfo(skb);
+	if (ssi->gso_size) {
+		struct cpl_tx_pkt_lso *lso = (void *)wr;
+		bool v6 = (ssi->gso_type & SKB_GSO_TCPV6) != 0;
+		int l3hdr_len = skb_network_header_len(skb);
+		int eth_xtra_len = skb_network_offset(skb) - ETH_HLEN;
+
+		len += sizeof(*lso);
+		wr->op_immdlen = htonl(FW_WR_OP(FW_ETH_TX_PKT_WR) |
+				       FW_WR_IMMDLEN(len));
+		lso->c.lso_ctrl = htonl(LSO_OPCODE(CPL_TX_PKT_LSO) |
+					LSO_FIRST_SLICE | LSO_LAST_SLICE |
+					LSO_IPV6(v6) |
+					LSO_ETHHDR_LEN(eth_xtra_len / 4) |
+					LSO_IPHDR_LEN(l3hdr_len / 4) |
+					LSO_TCPHDR_LEN(tcp_hdr(skb)->doff));
+		lso->c.ipid_ofst = htons(0);
+		lso->c.mss = htons(ssi->gso_size);
+		lso->c.seqno_offset = htonl(0);
+		if (is_t4(adap->params.chip))
+			lso->c.len = htonl(skb->len);
+		else
+			lso->c.len = htonl(LSO_T5_XFER_SIZE(skb->len));
+		cpl = (void *)(lso + 1);
+		cntrl = TXPKT_CSUM_TYPE(v6 ? TX_CSUM_TCPIP6 : TX_CSUM_TCPIP) |
+			TXPKT_IPHDR_LEN(l3hdr_len) |
+			TXPKT_ETHHDR_LEN(eth_xtra_len);
+		q->tso++;
+		q->tx_cso += ssi->gso_segs;
+	} else {
+		len += sizeof(*cpl);
+		wr->op_immdlen = htonl(FW_WR_OP(FW_ETH_TX_PKT_WR) |
+				       FW_WR_IMMDLEN(len));
+		cpl = (void *)(wr + 1);
+		if (skb->ip_summed == CHECKSUM_PARTIAL) {
+			cntrl = hwcsum(skb) | TXPKT_IPCSUM_DIS;
+			q->tx_cso++;
+		} else
+			cntrl = TXPKT_L4CSUM_DIS | TXPKT_IPCSUM_DIS;
+	}
+
+	if (vlan_tx_tag_present(skb)) {
+		q->vlan_ins++;
+		cntrl |= TXPKT_VLAN_VLD | TXPKT_VLAN(vlan_tx_tag_get(skb));
+	}
+
+	cpl->ctrl0 = htonl(TXPKT_OPCODE(CPL_TX_PKT_XT) |
+			   TXPKT_INTF(pi->tx_chan) | TXPKT_PF(adap->fn));
+	cpl->pack = htons(0);
+	cpl->len = htons(skb->len);
+	cpl->ctrl1 = cpu_to_be64(cntrl);
+
+	if (immediate) {
+		inline_tx_skb(skb, &q->q, cpl + 1);
+		dev_consume_skb_any(skb);
+	} else {
+		int last_desc;
+
+		write_sgl(skb, &q->q, (struct ulptx_sgl *)(cpl + 1), end, 0,
+			  addr);
+		skb_orphan(skb);
+
+		last_desc = q->q.pidx + ndesc - 1;
+		if (last_desc >= q->q.size)
+			last_desc -= q->q.size;
+		q->q.sdesc[last_desc].skb = skb;
+		q->q.sdesc[last_desc].sgl = (struct ulptx_sgl *)(cpl + 1);
+	}
+
+	txq_advance(&q->q, ndesc);
+
+	ring_tx_db(adap, &q->q, ndesc);
+	return NETDEV_TX_OK;
+}
+
+/**
+ *	reclaim_completed_tx_imm - reclaim completed control-queue Tx descs
+ *	@q: the SGE control Tx queue
+ *
+ *	This is a variant of reclaim_completed_tx() that is used for Tx queues
+ *	that send only immediate data (presently just the control queues) and
+ *	thus do not have any sk_buffs to release.
+ */
+static inline void reclaim_completed_tx_imm(struct sge_txq *q)
+{
+	int hw_cidx = ntohs(q->stat->cidx);
+	int reclaim = hw_cidx - q->cidx;
+
+	if (reclaim < 0)
+		reclaim += q->size;
+
+	q->in_use -= reclaim;
+	q->cidx = hw_cidx;
+}
+
+/**
+ *	is_imm - check whether a packet can be sent as immediate data
+ *	@skb: the packet
+ *
+ *	Returns true if a packet can be sent as a WR with immediate data.
+ */
+static inline int is_imm(const struct sk_buff *skb)
+{
+	return skb->len <= MAX_CTRL_WR_LEN;
+}
+
+/**
+ *	ctrlq_check_stop - check if a control queue is full and should stop
+ *	@q: the queue
+ *	@wr: most recent WR written to the queue
+ *
+ *	Check if a control queue has become full and should be stopped.
+ *	We clean up control queue descriptors very lazily, only when we are out.
+ *	If the queue is still full after reclaiming any completed descriptors
+ *	we suspend it and have the last WR wake it up.
+ */
+static void ctrlq_check_stop(struct sge_ctrl_txq *q, struct fw_wr_hdr *wr)
+{
+	reclaim_completed_tx_imm(&q->q);
+	if (unlikely(txq_avail(&q->q) < TXQ_STOP_THRES)) {
+		wr->lo |= htonl(FW_WR_EQUEQ | FW_WR_EQUIQ);
+		q->q.stops++;
+		q->full = 1;
+	}
+}
+
+/**
+ *	ctrl_xmit - send a packet through an SGE control Tx queue
+ *	@q: the control queue
+ *	@skb: the packet
+ *
+ *	Send a packet through an SGE control Tx queue.  Packets sent through
+ *	a control queue must fit entirely as immediate data.
+ */
+static int ctrl_xmit(struct sge_ctrl_txq *q, struct sk_buff *skb)
+{
+	unsigned int ndesc;
+	struct fw_wr_hdr *wr;
+
+	if (unlikely(!is_imm(skb))) {
+		WARN_ON(1);
+		dev_kfree_skb(skb);
+		return NET_XMIT_DROP;
+	}
+
+	ndesc = DIV_ROUND_UP(skb->len, sizeof(struct tx_desc));
+	spin_lock(&q->sendq.lock);
+
+	if (unlikely(q->full)) {
+		skb->priority = ndesc;                  /* save for restart */
+		__skb_queue_tail(&q->sendq, skb);
+		spin_unlock(&q->sendq.lock);
+		return NET_XMIT_CN;
+	}
+
+	wr = (struct fw_wr_hdr *)&q->q.desc[q->q.pidx];
+	inline_tx_skb(skb, &q->q, wr);
+
+	txq_advance(&q->q, ndesc);
+	if (unlikely(txq_avail(&q->q) < TXQ_STOP_THRES))
+		ctrlq_check_stop(q, wr);
+
+	ring_tx_db(q->adap, &q->q, ndesc);
+	spin_unlock(&q->sendq.lock);
+
+	kfree_skb(skb);
+	return NET_XMIT_SUCCESS;
+}
+
+/**
+ *	restart_ctrlq - restart a suspended control queue
+ *	@data: the control queue to restart
+ *
+ *	Resumes transmission on a suspended Tx control queue.
+ */
+static void restart_ctrlq(unsigned long data)
+{
+	struct sk_buff *skb;
+	unsigned int written = 0;
+	struct sge_ctrl_txq *q = (struct sge_ctrl_txq *)data;
+
+	spin_lock(&q->sendq.lock);
+	reclaim_completed_tx_imm(&q->q);
+	BUG_ON(txq_avail(&q->q) < TXQ_STOP_THRES);  /* q should be empty */
+
+	while ((skb = __skb_dequeue(&q->sendq)) != NULL) {
+		struct fw_wr_hdr *wr;
+		unsigned int ndesc = skb->priority;     /* previously saved */
+
+		/*
+		 * Write descriptors and free skbs outside the lock to limit
+		 * wait times.  q->full is still set so new skbs will be queued.
+		 */
+		spin_unlock(&q->sendq.lock);
+
+		wr = (struct fw_wr_hdr *)&q->q.desc[q->q.pidx];
+		inline_tx_skb(skb, &q->q, wr);
+		kfree_skb(skb);
+
+		written += ndesc;
+		txq_advance(&q->q, ndesc);
+		if (unlikely(txq_avail(&q->q) < TXQ_STOP_THRES)) {
+			unsigned long old = q->q.stops;
+
+			ctrlq_check_stop(q, wr);
+			if (q->q.stops != old) {          /* suspended anew */
+				spin_lock(&q->sendq.lock);
+				goto ringdb;
+			}
+		}
+		if (written > 16) {
+			ring_tx_db(q->adap, &q->q, written);
+			written = 0;
+		}
+		spin_lock(&q->sendq.lock);
+	}
+	q->full = 0;
+ringdb: if (written)
+		ring_tx_db(q->adap, &q->q, written);
+	spin_unlock(&q->sendq.lock);
+}
+
+/**
+ *	t4_mgmt_tx - send a management message
+ *	@adap: the adapter
+ *	@skb: the packet containing the management message
+ *
+ *	Send a management message through control queue 0.
+ */
+int t4_mgmt_tx(struct adapter *adap, struct sk_buff *skb)
+{
+	int ret;
+
+	local_bh_disable();
+	ret = ctrl_xmit(&adap->sge.ctrlq[0], skb);
+	local_bh_enable();
+	return ret;
+}
+
+/**
+ *	is_ofld_imm - check whether a packet can be sent as immediate data
+ *	@skb: the packet
+ *
+ *	Returns true if a packet can be sent as an offload WR with immediate
+ *	data.  We currently use the same limit as for Ethernet packets.
+ */
+static inline int is_ofld_imm(const struct sk_buff *skb)
+{
+	return skb->len <= MAX_IMM_TX_PKT_LEN;
+}
+
+/**
+ *	calc_tx_flits_ofld - calculate # of flits for an offload packet
+ *	@skb: the packet
+ *
+ *	Returns the number of flits needed for the given offload packet.
+ *	These packets are already fully constructed and no additional headers
+ *	will be added.
+ */
+static inline unsigned int calc_tx_flits_ofld(const struct sk_buff *skb)
+{
+	unsigned int flits, cnt;
+
+	if (is_ofld_imm(skb))
+		return DIV_ROUND_UP(skb->len, 8);
+
+	flits = skb_transport_offset(skb) / 8U;   /* headers */
+	cnt = skb_shinfo(skb)->nr_frags;
+	if (skb_tail_pointer(skb) != skb_transport_header(skb))
+		cnt++;
+	return flits + sgl_len(cnt);
+}
+
+/**
+ *	txq_stop_maperr - stop a Tx queue due to I/O MMU exhaustion
+ *	@adap: the adapter
+ *	@q: the queue to stop
+ *
+ *	Mark a Tx queue stopped due to I/O MMU exhaustion and resulting
+ *	inability to map packets.  A periodic timer attempts to restart
+ *	queues so marked.
+ */
+static void txq_stop_maperr(struct sge_ofld_txq *q)
+{
+	q->mapping_err++;
+	q->q.stops++;
+	set_bit(q->q.cntxt_id - q->adap->sge.egr_start,
+		q->adap->sge.txq_maperr);
+}
+
+/**
+ *	ofldtxq_stop - stop an offload Tx queue that has become full
+ *	@q: the queue to stop
+ *	@skb: the packet causing the queue to become full
+ *
+ *	Stops an offload Tx queue that has become full and modifies the packet
+ *	being written to request a wakeup.
+ */
+static void ofldtxq_stop(struct sge_ofld_txq *q, struct sk_buff *skb)
+{
+	struct fw_wr_hdr *wr = (struct fw_wr_hdr *)skb->data;
+
+	wr->lo |= htonl(FW_WR_EQUEQ | FW_WR_EQUIQ);
+	q->q.stops++;
+	q->full = 1;
+}
+
+/**
+ *	service_ofldq - restart a suspended offload queue
+ *	@q: the offload queue
+ *
+ *	Services an offload Tx queue by moving packets from its packet queue
+ *	to the HW Tx ring.  The function starts and ends with the queue locked.
+ */
+static void service_ofldq(struct sge_ofld_txq *q)
+{
+	u64 *pos;
+	int credits;
+	struct sk_buff *skb;
+	unsigned int written = 0;
+	unsigned int flits, ndesc;
+
+	while ((skb = skb_peek(&q->sendq)) != NULL && !q->full) {
+		/*
+		 * We drop the lock but leave skb on sendq, thus retaining
+		 * exclusive access to the state of the queue.
+		 */
+		spin_unlock(&q->sendq.lock);
+
+		reclaim_completed_tx(q->adap, &q->q, false);
+
+		flits = skb->priority;                /* previously saved */
+		ndesc = flits_to_desc(flits);
+		credits = txq_avail(&q->q) - ndesc;
+		BUG_ON(credits < 0);
+		if (unlikely(credits < TXQ_STOP_THRES))
+			ofldtxq_stop(q, skb);
+
+		pos = (u64 *)&q->q.desc[q->q.pidx];
+		if (is_ofld_imm(skb))
+			inline_tx_skb(skb, &q->q, pos);
+		else if (map_skb(q->adap->pdev_dev, skb,
+				 (dma_addr_t *)skb->head)) {
+			txq_stop_maperr(q);
+			spin_lock(&q->sendq.lock);
+			break;
+		} else {
+			int last_desc, hdr_len = skb_transport_offset(skb);
+
+			memcpy(pos, skb->data, hdr_len);
+			write_sgl(skb, &q->q, (void *)pos + hdr_len,
+				  pos + flits, hdr_len,
+				  (dma_addr_t *)skb->head);
+#ifdef CONFIG_NEED_DMA_MAP_STATE
+			skb->dev = q->adap->port[0];
+			skb->destructor = deferred_unmap_destructor;
+#endif
+			last_desc = q->q.pidx + ndesc - 1;
+			if (last_desc >= q->q.size)
+				last_desc -= q->q.size;
+			q->q.sdesc[last_desc].skb = skb;
+		}
+
+		txq_advance(&q->q, ndesc);
+		written += ndesc;
+		if (unlikely(written > 32)) {
+			ring_tx_db(q->adap, &q->q, written);
+			written = 0;
+		}
+
+		spin_lock(&q->sendq.lock);
+		__skb_unlink(skb, &q->sendq);
+		if (is_ofld_imm(skb))
+			kfree_skb(skb);
+	}
+	if (likely(written))
+		ring_tx_db(q->adap, &q->q, written);
+}
+
+/**
+ *	ofld_xmit - send a packet through an offload queue
+ *	@q: the Tx offload queue
+ *	@skb: the packet
+ *
+ *	Send an offload packet through an SGE offload queue.
+ */
+static int ofld_xmit(struct sge_ofld_txq *q, struct sk_buff *skb)
+{
+	skb->priority = calc_tx_flits_ofld(skb);       /* save for restart */
+	spin_lock(&q->sendq.lock);
+	__skb_queue_tail(&q->sendq, skb);
+	if (q->sendq.qlen == 1)
+		service_ofldq(q);
+	spin_unlock(&q->sendq.lock);
+	return NET_XMIT_SUCCESS;
+}
+
+/**
+ *	restart_ofldq - restart a suspended offload queue
+ *	@data: the offload queue to restart
+ *
+ *	Resumes transmission on a suspended Tx offload queue.
+ */
+static void restart_ofldq(unsigned long data)
+{
+	struct sge_ofld_txq *q = (struct sge_ofld_txq *)data;
+
+	spin_lock(&q->sendq.lock);
+	q->full = 0;            /* the queue actually is completely empty now */
+	service_ofldq(q);
+	spin_unlock(&q->sendq.lock);
+}
+
+/**
+ *	skb_txq - return the Tx queue an offload packet should use
+ *	@skb: the packet
+ *
+ *	Returns the Tx queue an offload packet should use as indicated by bits
+ *	1-15 in the packet's queue_mapping.
+ */
+static inline unsigned int skb_txq(const struct sk_buff *skb)
+{
+	return skb->queue_mapping >> 1;
+}
+
+/**
+ *	is_ctrl_pkt - return whether an offload packet is a control packet
+ *	@skb: the packet
+ *
+ *	Returns whether an offload packet should use an OFLD or a CTRL
+ *	Tx queue as indicated by bit 0 in the packet's queue_mapping.
+ */
+static inline unsigned int is_ctrl_pkt(const struct sk_buff *skb)
+{
+	return skb->queue_mapping & 1;
+}
+
+static inline int ofld_send(struct adapter *adap, struct sk_buff *skb)
+{
+	unsigned int idx = skb_txq(skb);
+
+	if (unlikely(is_ctrl_pkt(skb))) {
+		/* Single ctrl queue is a requirement for LE workaround path */
+		if (adap->tids.nsftids)
+			idx = 0;
+		return ctrl_xmit(&adap->sge.ctrlq[idx], skb);
+	}
+	return ofld_xmit(&adap->sge.ofldtxq[idx], skb);
+}
+
+/**
+ *	t4_ofld_send - send an offload packet
+ *	@adap: the adapter
+ *	@skb: the packet
+ *
+ *	Sends an offload packet.  We use the packet queue_mapping to select the
+ *	appropriate Tx queue as follows: bit 0 indicates whether the packet
+ *	should be sent as regular or control, bits 1-15 select the queue.
+ */
+int t4_ofld_send(struct adapter *adap, struct sk_buff *skb)
+{
+	int ret;
+
+	local_bh_disable();
+	ret = ofld_send(adap, skb);
+	local_bh_enable();
+	return ret;
+}
+
+/**
+ *	cxgb4_ofld_send - send an offload packet
+ *	@dev: the net device
+ *	@skb: the packet
+ *
+ *	Sends an offload packet.  This is an exported version of @t4_ofld_send,
+ *	intended for ULDs.
+ */
+int cxgb4_ofld_send(struct net_device *dev, struct sk_buff *skb)
+{
+	return t4_ofld_send(netdev2adap(dev), skb);
+}
+EXPORT_SYMBOL(cxgb4_ofld_send);
+
+static inline void copy_frags(struct sk_buff *skb,
+			      const struct pkt_gl *gl, unsigned int offset)
+{
+	int i;
+
+	/* usually there's just one frag */
+	__skb_fill_page_desc(skb, 0, gl->frags[0].page,
+			     gl->frags[0].offset + offset,
+			     gl->frags[0].size - offset);
+	skb_shinfo(skb)->nr_frags = gl->nfrags;
+	for (i = 1; i < gl->nfrags; i++)
+		__skb_fill_page_desc(skb, i, gl->frags[i].page,
+				     gl->frags[i].offset,
+				     gl->frags[i].size);
+
+	/* get a reference to the last page, we don't own it */
+	get_page(gl->frags[gl->nfrags - 1].page);
+}
+
+/**
+ *	cxgb4_pktgl_to_skb - build an sk_buff from a packet gather list
+ *	@gl: the gather list
+ *	@skb_len: size of sk_buff main body if it carries fragments
+ *	@pull_len: amount of data to move to the sk_buff's main body
+ *
+ *	Builds an sk_buff from the given packet gather list.  Returns the
+ *	sk_buff or %NULL if sk_buff allocation failed.
+ */
+struct sk_buff *cxgb4_pktgl_to_skb(const struct pkt_gl *gl,
+				   unsigned int skb_len, unsigned int pull_len)
+{
+	struct sk_buff *skb;
+
+	/*
+	 * Below we rely on RX_COPY_THRES being less than the smallest Rx buffer
+	 * size, which is expected since buffers are at least PAGE_SIZEd.
+	 * In this case packets up to RX_COPY_THRES have only one fragment.
+	 */
+	if (gl->tot_len <= RX_COPY_THRES) {
+		skb = dev_alloc_skb(gl->tot_len);
+		if (unlikely(!skb))
+			goto out;
+		__skb_put(skb, gl->tot_len);
+		skb_copy_to_linear_data(skb, gl->va, gl->tot_len);
+	} else {
+		skb = dev_alloc_skb(skb_len);
+		if (unlikely(!skb))
+			goto out;
+		__skb_put(skb, pull_len);
+		skb_copy_to_linear_data(skb, gl->va, pull_len);
+
+		copy_frags(skb, gl, pull_len);
+		skb->len = gl->tot_len;
+		skb->data_len = skb->len - pull_len;
+		skb->truesize += skb->data_len;
+	}
+out:	return skb;
+}
+EXPORT_SYMBOL(cxgb4_pktgl_to_skb);
+
+/**
+ *	t4_pktgl_free - free a packet gather list
+ *	@gl: the gather list
+ *
+ *	Releases the pages of a packet gather list.  We do not own the last
+ *	page on the list and do not free it.
+ */
+static void t4_pktgl_free(const struct pkt_gl *gl)
+{
+	int n;
+	const struct page_frag *p;
+
+	for (p = gl->frags, n = gl->nfrags - 1; n--; p++)
+		put_page(p->page);
+}
+
+/*
+ * Process an MPS trace packet.  Give it an unused protocol number so it won't
+ * be delivered to anyone and send it to the stack for capture.
+ */
+static noinline int handle_trace_pkt(struct adapter *adap,
+				     const struct pkt_gl *gl)
+{
+	struct sk_buff *skb;
+
+	skb = cxgb4_pktgl_to_skb(gl, RX_PULL_LEN, RX_PULL_LEN);
+	if (unlikely(!skb)) {
+		t4_pktgl_free(gl);
+		return 0;
+	}
+
+	if (is_t4(adap->params.chip))
+		__skb_pull(skb, sizeof(struct cpl_trace_pkt));
+	else
+		__skb_pull(skb, sizeof(struct cpl_t5_trace_pkt));
+
+	skb_reset_mac_header(skb);
+	skb->protocol = htons(0xffff);
+	skb->dev = adap->port[0];
+	netif_receive_skb(skb);
+	return 0;
+}
+
+static void do_gro(struct sge_eth_rxq *rxq, const struct pkt_gl *gl,
+		   const struct cpl_rx_pkt *pkt)
+{
+	struct adapter *adapter = rxq->rspq.adap;
+	struct sge *s = &adapter->sge;
+	int ret;
+	struct sk_buff *skb;
+
+	skb = napi_get_frags(&rxq->rspq.napi);
+	if (unlikely(!skb)) {
+		t4_pktgl_free(gl);
+		rxq->stats.rx_drops++;
+		return;
+	}
+
+	copy_frags(skb, gl, s->pktshift);
+	skb->len = gl->tot_len - s->pktshift;
+	skb->data_len = skb->len;
+	skb->truesize += skb->data_len;
+	skb->ip_summed = CHECKSUM_UNNECESSARY;
+	skb_record_rx_queue(skb, rxq->rspq.idx);
+	if (rxq->rspq.netdev->features & NETIF_F_RXHASH)
+		skb_set_hash(skb, (__force u32)pkt->rsshdr.hash_val,
+			     PKT_HASH_TYPE_L3);
+
+	if (unlikely(pkt->vlan_ex)) {
+		__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), ntohs(pkt->vlan));
+		rxq->stats.vlan_ex++;
+	}
+	ret = napi_gro_frags(&rxq->rspq.napi);
+	if (ret == GRO_HELD)
+		rxq->stats.lro_pkts++;
+	else if (ret == GRO_MERGED || ret == GRO_MERGED_FREE)
+		rxq->stats.lro_merged++;
+	rxq->stats.pkts++;
+	rxq->stats.rx_cso++;
+}
+
+/**
+ *	t4_ethrx_handler - process an ingress ethernet packet
+ *	@q: the response queue that received the packet
+ *	@rsp: the response queue descriptor holding the RX_PKT message
+ *	@si: the gather list of packet fragments
+ *
+ *	Process an ingress ethernet packet and deliver it to the stack.
+ */
+int t4_ethrx_handler(struct sge_rspq *q, const __be64 *rsp,
+		     const struct pkt_gl *si)
+{
+	bool csum_ok;
+	struct sk_buff *skb;
+	const struct cpl_rx_pkt *pkt;
+	struct sge_eth_rxq *rxq = container_of(q, struct sge_eth_rxq, rspq);
+	struct sge *s = &q->adap->sge;
+	int cpl_trace_pkt = is_t4(q->adap->params.chip) ?
+			    CPL_TRACE_PKT : CPL_TRACE_PKT_T5;
+
+	if (unlikely(*(u8 *)rsp == cpl_trace_pkt))
+		return handle_trace_pkt(q->adap, si);
+
+	pkt = (const struct cpl_rx_pkt *)rsp;
+	csum_ok = pkt->csum_calc && !pkt->err_vec &&
+		  (q->netdev->features & NETIF_F_RXCSUM);
+	if ((pkt->l2info & htonl(RXF_TCP)) &&
+	    (q->netdev->features & NETIF_F_GRO) && csum_ok && !pkt->ip_frag) {
+		do_gro(rxq, si, pkt);
+		return 0;
+	}
+
+	skb = cxgb4_pktgl_to_skb(si, RX_PKT_SKB_LEN, RX_PULL_LEN);
+	if (unlikely(!skb)) {
+		t4_pktgl_free(si);
+		rxq->stats.rx_drops++;
+		return 0;
+	}
+
+	__skb_pull(skb, s->pktshift);      /* remove ethernet header padding */
+	skb->protocol = eth_type_trans(skb, q->netdev);
+	skb_record_rx_queue(skb, q->idx);
+	if (skb->dev->features & NETIF_F_RXHASH)
+		skb_set_hash(skb, (__force u32)pkt->rsshdr.hash_val,
+			     PKT_HASH_TYPE_L3);
+
+	rxq->stats.pkts++;
+
+	if (csum_ok && (pkt->l2info & htonl(RXF_UDP | RXF_TCP))) {
+		if (!pkt->ip_frag) {
+			skb->ip_summed = CHECKSUM_UNNECESSARY;
+			rxq->stats.rx_cso++;
+		} else if (pkt->l2info & htonl(RXF_IP)) {
+			__sum16 c = (__force __sum16)pkt->csum;
+			skb->csum = csum_unfold(c);
+			skb->ip_summed = CHECKSUM_COMPLETE;
+			rxq->stats.rx_cso++;
+		}
+	} else
+		skb_checksum_none_assert(skb);
+
+	if (unlikely(pkt->vlan_ex)) {
+		__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), ntohs(pkt->vlan));
+		rxq->stats.vlan_ex++;
+	}
+	netif_receive_skb(skb);
+	return 0;
+}
+
+/**
+ *	restore_rx_bufs - put back a packet's Rx buffers
+ *	@si: the packet gather list
+ *	@q: the SGE free list
+ *	@frags: number of FL buffers to restore
+ *
+ *	Puts back on an FL the Rx buffers associated with @si.  The buffers
+ *	have already been unmapped and are left unmapped, we mark them so to
+ *	prevent further unmapping attempts.
+ *
+ *	This function undoes a series of @unmap_rx_buf calls when we find out
+ *	that the current packet can't be processed right away afterall and we
+ *	need to come back to it later.  This is a very rare event and there's
+ *	no effort to make this particularly efficient.
+ */
+static void restore_rx_bufs(const struct pkt_gl *si, struct sge_fl *q,
+			    int frags)
+{
+	struct rx_sw_desc *d;
+
+	while (frags--) {
+		if (q->cidx == 0)
+			q->cidx = q->size - 1;
+		else
+			q->cidx--;
+		d = &q->sdesc[q->cidx];
+		d->page = si->frags[frags].page;
+		d->dma_addr |= RX_UNMAPPED_BUF;
+		q->avail++;
+	}
+}
+
+/**
+ *	is_new_response - check if a response is newly written
+ *	@r: the response descriptor
+ *	@q: the response queue
+ *
+ *	Returns true if a response descriptor contains a yet unprocessed
+ *	response.
+ */
+static inline bool is_new_response(const struct rsp_ctrl *r,
+				   const struct sge_rspq *q)
+{
+	return RSPD_GEN(r->type_gen) == q->gen;
+}
+
+/**
+ *	rspq_next - advance to the next entry in a response queue
+ *	@q: the queue
+ *
+ *	Updates the state of a response queue to advance it to the next entry.
+ */
+static inline void rspq_next(struct sge_rspq *q)
+{
+	q->cur_desc = (void *)q->cur_desc + q->iqe_len;
+	if (unlikely(++q->cidx == q->size)) {
+		q->cidx = 0;
+		q->gen ^= 1;
+		q->cur_desc = q->desc;
+	}
+}
+
+/**
+ *	process_responses - process responses from an SGE response queue
+ *	@q: the ingress queue to process
+ *	@budget: how many responses can be processed in this round
+ *
+ *	Process responses from an SGE response queue up to the supplied budget.
+ *	Responses include received packets as well as control messages from FW
+ *	or HW.
+ *
+ *	Additionally choose the interrupt holdoff time for the next interrupt
+ *	on this queue.  If the system is under memory shortage use a fairly
+ *	long delay to help recovery.
+ */
+static int process_responses(struct sge_rspq *q, int budget)
+{
+	int ret, rsp_type;
+	int budget_left = budget;
+	const struct rsp_ctrl *rc;
+	struct sge_eth_rxq *rxq = container_of(q, struct sge_eth_rxq, rspq);
+	struct adapter *adapter = q->adap;
+	struct sge *s = &adapter->sge;
+
+	while (likely(budget_left)) {
+		rc = (void *)q->cur_desc + (q->iqe_len - sizeof(*rc));
+		if (!is_new_response(rc, q))
+			break;
+
+		rmb();
+		rsp_type = RSPD_TYPE(rc->type_gen);
+		if (likely(rsp_type == RSP_TYPE_FLBUF)) {
+			struct page_frag *fp;
+			struct pkt_gl si;
+			const struct rx_sw_desc *rsd;
+			u32 len = ntohl(rc->pldbuflen_qid), bufsz, frags;
+
+			if (len & RSPD_NEWBUF) {
+				if (likely(q->offset > 0)) {
+					free_rx_bufs(q->adap, &rxq->fl, 1);
+					q->offset = 0;
+				}
+				len = RSPD_LEN(len);
+			}
+			si.tot_len = len;
+
+			/* gather packet fragments */
+			for (frags = 0, fp = si.frags; ; frags++, fp++) {
+				rsd = &rxq->fl.sdesc[rxq->fl.cidx];
+				bufsz = get_buf_size(adapter, rsd);
+				fp->page = rsd->page;
+				fp->offset = q->offset;
+				fp->size = min(bufsz, len);
+				len -= fp->size;
+				if (!len)
+					break;
+				unmap_rx_buf(q->adap, &rxq->fl);
+			}
+
+			/*
+			 * Last buffer remains mapped so explicitly make it
+			 * coherent for CPU access.
+			 */
+			dma_sync_single_for_cpu(q->adap->pdev_dev,
+						get_buf_addr(rsd),
+						fp->size, DMA_FROM_DEVICE);
+
+			si.va = page_address(si.frags[0].page) +
+				si.frags[0].offset;
+			prefetch(si.va);
+
+			si.nfrags = frags + 1;
+			ret = q->handler(q, q->cur_desc, &si);
+			if (likely(ret == 0))
+				q->offset += ALIGN(fp->size, s->fl_align);
+			else
+				restore_rx_bufs(&si, &rxq->fl, frags);
+		} else if (likely(rsp_type == RSP_TYPE_CPL)) {
+			ret = q->handler(q, q->cur_desc, NULL);
+		} else {
+			ret = q->handler(q, (const __be64 *)rc, CXGB4_MSG_AN);
+		}
+
+		if (unlikely(ret)) {
+			/* couldn't process descriptor, back off for recovery */
+			q->next_intr_params = QINTR_TIMER_IDX(NOMEM_TMR_IDX);
+			break;
+		}
+
+		rspq_next(q);
+		budget_left--;
+	}
+
+	if (q->offset >= 0 && rxq->fl.size - rxq->fl.avail >= 16)
+		__refill_fl(q->adap, &rxq->fl);
+	return budget - budget_left;
+}
+
+/**
+ *	napi_rx_handler - the NAPI handler for Rx processing
+ *	@napi: the napi instance
+ *	@budget: how many packets we can process in this round
+ *
+ *	Handler for new data events when using NAPI.  This does not need any
+ *	locking or protection from interrupts as data interrupts are off at
+ *	this point and other adapter interrupts do not interfere (the latter
+ *	in not a concern at all with MSI-X as non-data interrupts then have
+ *	a separate handler).
+ */
+static int napi_rx_handler(struct napi_struct *napi, int budget)
+{
+	unsigned int params;
+	struct sge_rspq *q = container_of(napi, struct sge_rspq, napi);
+	int work_done = process_responses(q, budget);
+	u32 val;
+
+	if (likely(work_done < budget)) {
+		int timer_index;
+
+		napi_complete(napi);
+		timer_index = QINTR_TIMER_IDX_GET(q->next_intr_params);
+
+		if (q->adaptive_rx) {
+			if (work_done > max(timer_pkt_quota[timer_index],
+					    MIN_NAPI_WORK))
+				timer_index = (timer_index + 1);
+			else
+				timer_index = timer_index - 1;
+
+			timer_index = clamp(timer_index, 0, SGE_TIMERREGS - 1);
+			q->next_intr_params = QINTR_TIMER_IDX(timer_index) |
+							      V_QINTR_CNT_EN;
+			params = q->next_intr_params;
+		} else {
+			params = q->next_intr_params;
+			q->next_intr_params = q->intr_params;
+		}
+	} else
+		params = QINTR_TIMER_IDX(7);
+
+	val = CIDXINC(work_done) | SEINTARM(params);
+	if (is_t4(q->adap->params.chip)) {
+		t4_write_reg(q->adap, MYPF_REG(SGE_PF_GTS),
+			     val | INGRESSQID((u32)q->cntxt_id));
+	} else {
+		writel(val, q->adap->bar2 + q->udb + SGE_UDB_GTS);
+		wmb();
+	}
+	return work_done;
+}
+
+/*
+ * The MSI-X interrupt handler for an SGE response queue.
+ */
+irqreturn_t t4_sge_intr_msix(int irq, void *cookie)
+{
+	struct sge_rspq *q = cookie;
+
+	napi_schedule(&q->napi);
+	return IRQ_HANDLED;
+}
+
+/*
+ * Process the indirect interrupt entries in the interrupt queue and kick off
+ * NAPI for each queue that has generated an entry.
+ */
+static unsigned int process_intrq(struct adapter *adap)
+{
+	unsigned int credits;
+	const struct rsp_ctrl *rc;
+	struct sge_rspq *q = &adap->sge.intrq;
+	u32 val;
+
+	spin_lock(&adap->sge.intrq_lock);
+	for (credits = 0; ; credits++) {
+		rc = (void *)q->cur_desc + (q->iqe_len - sizeof(*rc));
+		if (!is_new_response(rc, q))
+			break;
+
+		rmb();
+		if (RSPD_TYPE(rc->type_gen) == RSP_TYPE_INTR) {
+			unsigned int qid = ntohl(rc->pldbuflen_qid);
+
+			qid -= adap->sge.ingr_start;
+			napi_schedule(&adap->sge.ingr_map[qid]->napi);
+		}
+
+		rspq_next(q);
+	}
+
+	val =  CIDXINC(credits) | SEINTARM(q->intr_params);
+	if (is_t4(adap->params.chip)) {
+		t4_write_reg(adap, MYPF_REG(SGE_PF_GTS),
+			     val | INGRESSQID(q->cntxt_id));
+	} else {
+		writel(val, adap->bar2 + q->udb + SGE_UDB_GTS);
+		wmb();
+	}
+	spin_unlock(&adap->sge.intrq_lock);
+	return credits;
+}
+
+/*
+ * The MSI interrupt handler, which handles data events from SGE response queues
+ * as well as error and other async events as they all use the same MSI vector.
+ */
+static irqreturn_t t4_intr_msi(int irq, void *cookie)
+{
+	struct adapter *adap = cookie;
+
+	t4_slow_intr_handler(adap);
+	process_intrq(adap);
+	return IRQ_HANDLED;
+}
+
+/*
+ * Interrupt handler for legacy INTx interrupts.
+ * Handles data events from SGE response queues as well as error and other
+ * async events as they all use the same interrupt line.
+ */
+static irqreturn_t t4_intr_intx(int irq, void *cookie)
+{
+	struct adapter *adap = cookie;
+
+	t4_write_reg(adap, MYPF_REG(PCIE_PF_CLI), 0);
+	if (t4_slow_intr_handler(adap) | process_intrq(adap))
+		return IRQ_HANDLED;
+	return IRQ_NONE;             /* probably shared interrupt */
+}
+
+/**
+ *	t4_intr_handler - select the top-level interrupt handler
+ *	@adap: the adapter
+ *
+ *	Selects the top-level interrupt handler based on the type of interrupts
+ *	(MSI-X, MSI, or INTx).
+ */
+irq_handler_t t4_intr_handler(struct adapter *adap)
+{
+	if (adap->flags & USING_MSIX)
+		return t4_sge_intr_msix;
+	if (adap->flags & USING_MSI)
+		return t4_intr_msi;
+	return t4_intr_intx;
+}
+
+static void sge_rx_timer_cb(unsigned long data)
+{
+	unsigned long m;
+	unsigned int i, idma_same_state_cnt[2];
+	struct adapter *adap = (struct adapter *)data;
+	struct sge *s = &adap->sge;
+
+	for (i = 0; i < ARRAY_SIZE(s->starving_fl); i++)
+		for (m = s->starving_fl[i]; m; m &= m - 1) {
+			struct sge_eth_rxq *rxq;
+			unsigned int id = __ffs(m) + i * BITS_PER_LONG;
+			struct sge_fl *fl = s->egr_map[id];
+
+			clear_bit(id, s->starving_fl);
+			smp_mb__after_atomic();
+
+			if (fl_starving(fl)) {
+				rxq = container_of(fl, struct sge_eth_rxq, fl);
+				if (napi_reschedule(&rxq->rspq.napi))
+					fl->starving++;
+				else
+					set_bit(id, s->starving_fl);
+			}
+		}
+
+	t4_write_reg(adap, SGE_DEBUG_INDEX, 13);
+	idma_same_state_cnt[0] = t4_read_reg(adap, SGE_DEBUG_DATA_HIGH);
+	idma_same_state_cnt[1] = t4_read_reg(adap, SGE_DEBUG_DATA_LOW);
+
+	for (i = 0; i < 2; i++) {
+		u32 debug0, debug11;
+
+		/* If the Ingress DMA Same State Counter ("timer") is less
+		 * than 1s, then we can reset our synthesized Stall Timer and
+		 * continue.  If we have previously emitted warnings about a
+		 * potential stalled Ingress Queue, issue a note indicating
+		 * that the Ingress Queue has resumed forward progress.
+		 */
+		if (idma_same_state_cnt[i] < s->idma_1s_thresh) {
+			if (s->idma_stalled[i] >= SGE_IDMA_WARN_THRESH)
+				CH_WARN(adap, "SGE idma%d, queue%u,resumed after %d sec\n",
+					i, s->idma_qid[i],
+					s->idma_stalled[i]/HZ);
+			s->idma_stalled[i] = 0;
+			continue;
+		}
+
+		/* Synthesize an SGE Ingress DMA Same State Timer in the Hz
+		 * domain.  The first time we get here it'll be because we
+		 * passed the 1s Threshold; each additional time it'll be
+		 * because the RX Timer Callback is being fired on its regular
+		 * schedule.
+		 *
+		 * If the stall is below our Potential Hung Ingress Queue
+		 * Warning Threshold, continue.
+		 */
+		if (s->idma_stalled[i] == 0)
+			s->idma_stalled[i] = HZ;
+		else
+			s->idma_stalled[i] += RX_QCHECK_PERIOD;
+
+		if (s->idma_stalled[i] < SGE_IDMA_WARN_THRESH)
+			continue;
+
+		/* We'll issue a warning every SGE_IDMA_WARN_REPEAT Hz */
+		if (((s->idma_stalled[i] - HZ) % SGE_IDMA_WARN_REPEAT) != 0)
+			continue;
+
+		/* Read and save the SGE IDMA State and Queue ID information.
+		 * We do this every time in case it changes across time ...
+		 */
+		t4_write_reg(adap, SGE_DEBUG_INDEX, 0);
+		debug0 = t4_read_reg(adap, SGE_DEBUG_DATA_LOW);
+		s->idma_state[i] = (debug0 >> (i * 9)) & 0x3f;
+
+		t4_write_reg(adap, SGE_DEBUG_INDEX, 11);
+		debug11 = t4_read_reg(adap, SGE_DEBUG_DATA_LOW);
+		s->idma_qid[i] = (debug11 >> (i * 16)) & 0xffff;
+
+		CH_WARN(adap, "SGE idma%u, queue%u, maybe stuck state%u %dsecs (debug0=%#x, debug11=%#x)\n",
+			i, s->idma_qid[i], s->idma_state[i],
+			s->idma_stalled[i]/HZ, debug0, debug11);
+		t4_sge_decode_idma_state(adap, s->idma_state[i]);
+	}
+
+	mod_timer(&s->rx_timer, jiffies + RX_QCHECK_PERIOD);
+}
+
+static void sge_tx_timer_cb(unsigned long data)
+{
+	unsigned long m;
+	unsigned int i, budget;
+	struct adapter *adap = (struct adapter *)data;
+	struct sge *s = &adap->sge;
+
+	for (i = 0; i < ARRAY_SIZE(s->txq_maperr); i++)
+		for (m = s->txq_maperr[i]; m; m &= m - 1) {
+			unsigned long id = __ffs(m) + i * BITS_PER_LONG;
+			struct sge_ofld_txq *txq = s->egr_map[id];
+
+			clear_bit(id, s->txq_maperr);
+			tasklet_schedule(&txq->qresume_tsk);
+		}
+
+	budget = MAX_TIMER_TX_RECLAIM;
+	i = s->ethtxq_rover;
+	do {
+		struct sge_eth_txq *q = &s->ethtxq[i];
+
+		if (q->q.in_use &&
+		    time_after_eq(jiffies, q->txq->trans_start + HZ / 100) &&
+		    __netif_tx_trylock(q->txq)) {
+			int avail = reclaimable(&q->q);
+
+			if (avail) {
+				if (avail > budget)
+					avail = budget;
+
+				free_tx_desc(adap, &q->q, avail, true);
+				q->q.in_use -= avail;
+				budget -= avail;
+			}
+			__netif_tx_unlock(q->txq);
+		}
+
+		if (++i >= s->ethqsets)
+			i = 0;
+	} while (budget && i != s->ethtxq_rover);
+	s->ethtxq_rover = i;
+	mod_timer(&s->tx_timer, jiffies + (budget ? TX_QCHECK_PERIOD : 2));
+}
+
+/**
+ *      udb_address - return the BAR2 User Doorbell address for a Queue
+ *      @adap: the adapter
+ *      @cntxt_id: the Queue Context ID
+ *      @qpp: Queues Per Page (for all PFs)
+ *
+ *      Returns the BAR2 address of the user Doorbell associated with the
+ *      indicated Queue Context ID.  Note that this is only applicable
+ *      for T5 and later.
+ */
+static u64 udb_address(struct adapter *adap, unsigned int cntxt_id,
+		       unsigned int qpp)
+{
+	u64 udb;
+	unsigned int s_qpp;
+	unsigned short udb_density;
+	unsigned long qpshift;
+	int page;
+
+	BUG_ON(is_t4(adap->params.chip));
+
+	s_qpp = (QUEUESPERPAGEPF0 +
+		(QUEUESPERPAGEPF1 - QUEUESPERPAGEPF0) * adap->fn);
+	udb_density = 1 << ((qpp >> s_qpp) & QUEUESPERPAGEPF0_MASK);
+	qpshift = PAGE_SHIFT - ilog2(udb_density);
+	udb = (u64)cntxt_id << qpshift;
+	udb &= PAGE_MASK;
+	page = udb / PAGE_SIZE;
+	udb += (cntxt_id - (page * udb_density)) * SGE_UDB_SIZE;
+
+	return udb;
+}
+
+static u64 udb_address_eq(struct adapter *adap, unsigned int cntxt_id)
+{
+	return udb_address(adap, cntxt_id,
+			   t4_read_reg(adap, SGE_EGRESS_QUEUES_PER_PAGE_PF));
+}
+
+static u64 udb_address_iq(struct adapter *adap, unsigned int cntxt_id)
+{
+	return udb_address(adap, cntxt_id,
+			   t4_read_reg(adap, SGE_INGRESS_QUEUES_PER_PAGE_PF));
+}
+
+int t4_sge_alloc_rxq(struct adapter *adap, struct sge_rspq *iq, bool fwevtq,
+		     struct net_device *dev, int intr_idx,
+		     struct sge_fl *fl, rspq_handler_t hnd)
+{
+	int ret, flsz = 0;
+	struct fw_iq_cmd c;
+	struct sge *s = &adap->sge;
+	struct port_info *pi = netdev_priv(dev);
+
+	/* Size needs to be multiple of 16, including status entry. */
+	iq->size = roundup(iq->size, 16);
+
+	iq->desc = alloc_ring(adap->pdev_dev, iq->size, iq->iqe_len, 0,
+			      &iq->phys_addr, NULL, 0, NUMA_NO_NODE);
+	if (!iq->desc)
+		return -ENOMEM;
+
+	memset(&c, 0, sizeof(c));
+	c.op_to_vfn = htonl(FW_CMD_OP(FW_IQ_CMD) | FW_CMD_REQUEST |
+			    FW_CMD_WRITE | FW_CMD_EXEC |
+			    FW_IQ_CMD_PFN(adap->fn) | FW_IQ_CMD_VFN(0));
+	c.alloc_to_len16 = htonl(FW_IQ_CMD_ALLOC | FW_IQ_CMD_IQSTART(1) |
+				 FW_LEN16(c));
+	c.type_to_iqandstindex = htonl(FW_IQ_CMD_TYPE(FW_IQ_TYPE_FL_INT_CAP) |
+		FW_IQ_CMD_IQASYNCH(fwevtq) | FW_IQ_CMD_VIID(pi->viid) |
+		FW_IQ_CMD_IQANDST(intr_idx < 0) | FW_IQ_CMD_IQANUD(1) |
+		FW_IQ_CMD_IQANDSTINDEX(intr_idx >= 0 ? intr_idx :
+							-intr_idx - 1));
+	c.iqdroprss_to_iqesize = htons(FW_IQ_CMD_IQPCIECH(pi->tx_chan) |
+		FW_IQ_CMD_IQGTSMODE |
+		FW_IQ_CMD_IQINTCNTTHRESH(iq->pktcnt_idx) |
+		FW_IQ_CMD_IQESIZE(ilog2(iq->iqe_len) - 4));
+	c.iqsize = htons(iq->size);
+	c.iqaddr = cpu_to_be64(iq->phys_addr);
+
+	if (fl) {
+		fl->size = roundup(fl->size, 8);
+		fl->desc = alloc_ring(adap->pdev_dev, fl->size, sizeof(__be64),
+				      sizeof(struct rx_sw_desc), &fl->addr,
+				      &fl->sdesc, s->stat_len, NUMA_NO_NODE);
+		if (!fl->desc)
+			goto fl_nomem;
+
+		flsz = fl->size / 8 + s->stat_len / sizeof(struct tx_desc);
+		c.iqns_to_fl0congen = htonl(FW_IQ_CMD_FL0PACKEN(1) |
+					    FW_IQ_CMD_FL0FETCHRO(1) |
+					    FW_IQ_CMD_FL0DATARO(1) |
+					    FW_IQ_CMD_FL0PADEN(1));
+		c.fl0dcaen_to_fl0cidxfthresh = htons(FW_IQ_CMD_FL0FBMIN(2) |
+				FW_IQ_CMD_FL0FBMAX(3));
+		c.fl0size = htons(flsz);
+		c.fl0addr = cpu_to_be64(fl->addr);
+	}
+
+	ret = t4_wr_mbox(adap, adap->fn, &c, sizeof(c), &c);
+	if (ret)
+		goto err;
+
+	netif_napi_add(dev, &iq->napi, napi_rx_handler, 64);
+	iq->cur_desc = iq->desc;
+	iq->cidx = 0;
+	iq->gen = 1;
+	iq->next_intr_params = iq->intr_params;
+	iq->cntxt_id = ntohs(c.iqid);
+	iq->abs_id = ntohs(c.physiqid);
+	if (!is_t4(adap->params.chip))
+		iq->udb = udb_address_iq(adap, iq->cntxt_id);
+	iq->size--;                           /* subtract status entry */
+	iq->netdev = dev;
+	iq->handler = hnd;
+
+	/* set offset to -1 to distinguish ingress queues without FL */
+	iq->offset = fl ? 0 : -1;
+
+	adap->sge.ingr_map[iq->cntxt_id - adap->sge.ingr_start] = iq;
+
+	if (fl) {
+		fl->cntxt_id = ntohs(c.fl0id);
+		fl->avail = fl->pend_cred = 0;
+		fl->pidx = fl->cidx = 0;
+		fl->alloc_failed = fl->large_alloc_failed = fl->starving = 0;
+		adap->sge.egr_map[fl->cntxt_id - adap->sge.egr_start] = fl;
+
+		/* Note, we must initialize the Free List User Doorbell
+		 * address before refilling the Free List!
+		 */
+		if (!is_t4(adap->params.chip))
+			fl->udb = udb_address_eq(adap, fl->cntxt_id);
+		refill_fl(adap, fl, fl_cap(fl), GFP_KERNEL);
+	}
+	return 0;
+
+fl_nomem:
+	ret = -ENOMEM;
+err:
+	if (iq->desc) {
+		dma_free_coherent(adap->pdev_dev, iq->size * iq->iqe_len,
+				  iq->desc, iq->phys_addr);
+		iq->desc = NULL;
+	}
+	if (fl && fl->desc) {
+		kfree(fl->sdesc);
+		fl->sdesc = NULL;
+		dma_free_coherent(adap->pdev_dev, flsz * sizeof(struct tx_desc),
+				  fl->desc, fl->addr);
+		fl->desc = NULL;
+	}
+	return ret;
+}
+
+static void init_txq(struct adapter *adap, struct sge_txq *q, unsigned int id)
+{
+	q->cntxt_id = id;
+	if (!is_t4(adap->params.chip))
+		q->udb = udb_address_eq(adap, q->cntxt_id);
+
+	q->in_use = 0;
+	q->cidx = q->pidx = 0;
+	q->stops = q->restarts = 0;
+	q->stat = (void *)&q->desc[q->size];
+	spin_lock_init(&q->db_lock);
+	adap->sge.egr_map[id - adap->sge.egr_start] = q;
+}
+
+int t4_sge_alloc_eth_txq(struct adapter *adap, struct sge_eth_txq *txq,
+			 struct net_device *dev, struct netdev_queue *netdevq,
+			 unsigned int iqid)
+{
+	int ret, nentries;
+	struct fw_eq_eth_cmd c;
+	struct sge *s = &adap->sge;
+	struct port_info *pi = netdev_priv(dev);
+
+	/* Add status entries */
+	nentries = txq->q.size + s->stat_len / sizeof(struct tx_desc);
+
+	txq->q.desc = alloc_ring(adap->pdev_dev, txq->q.size,
+			sizeof(struct tx_desc), sizeof(struct tx_sw_desc),
+			&txq->q.phys_addr, &txq->q.sdesc, s->stat_len,
+			netdev_queue_numa_node_read(netdevq));
+	if (!txq->q.desc)
+		return -ENOMEM;
+
+	memset(&c, 0, sizeof(c));
+	c.op_to_vfn = htonl(FW_CMD_OP(FW_EQ_ETH_CMD) | FW_CMD_REQUEST |
+			    FW_CMD_WRITE | FW_CMD_EXEC |
+			    FW_EQ_ETH_CMD_PFN(adap->fn) | FW_EQ_ETH_CMD_VFN(0));
+	c.alloc_to_len16 = htonl(FW_EQ_ETH_CMD_ALLOC |
+				 FW_EQ_ETH_CMD_EQSTART | FW_LEN16(c));
+	c.viid_pkd = htonl(FW_EQ_ETH_CMD_AUTOEQUEQE |
+			   FW_EQ_ETH_CMD_VIID(pi->viid));
+	c.fetchszm_to_iqid = htonl(FW_EQ_ETH_CMD_HOSTFCMODE(2) |
+				   FW_EQ_ETH_CMD_PCIECHN(pi->tx_chan) |
+				   FW_EQ_ETH_CMD_FETCHRO(1) |
+				   FW_EQ_ETH_CMD_IQID(iqid));
+	c.dcaen_to_eqsize = htonl(FW_EQ_ETH_CMD_FBMIN(2) |
+				  FW_EQ_ETH_CMD_FBMAX(3) |
+				  FW_EQ_ETH_CMD_CIDXFTHRESH(5) |
+				  FW_EQ_ETH_CMD_EQSIZE(nentries));
+	c.eqaddr = cpu_to_be64(txq->q.phys_addr);
+
+	ret = t4_wr_mbox(adap, adap->fn, &c, sizeof(c), &c);
+	if (ret) {
+		kfree(txq->q.sdesc);
+		txq->q.sdesc = NULL;
+		dma_free_coherent(adap->pdev_dev,
+				  nentries * sizeof(struct tx_desc),
+				  txq->q.desc, txq->q.phys_addr);
+		txq->q.desc = NULL;
+		return ret;
+	}
+
+	init_txq(adap, &txq->q, FW_EQ_ETH_CMD_EQID_GET(ntohl(c.eqid_pkd)));
+	txq->txq = netdevq;
+	txq->tso = txq->tx_cso = txq->vlan_ins = 0;
+	txq->mapping_err = 0;
+	return 0;
+}
+
+int t4_sge_alloc_ctrl_txq(struct adapter *adap, struct sge_ctrl_txq *txq,
+			  struct net_device *dev, unsigned int iqid,
+			  unsigned int cmplqid)
+{
+	int ret, nentries;
+	struct fw_eq_ctrl_cmd c;
+	struct sge *s = &adap->sge;
+	struct port_info *pi = netdev_priv(dev);
+
+	/* Add status entries */
+	nentries = txq->q.size + s->stat_len / sizeof(struct tx_desc);
+
+	txq->q.desc = alloc_ring(adap->pdev_dev, nentries,
+				 sizeof(struct tx_desc), 0, &txq->q.phys_addr,
+				 NULL, 0, NUMA_NO_NODE);
+	if (!txq->q.desc)
+		return -ENOMEM;
+
+	c.op_to_vfn = htonl(FW_CMD_OP(FW_EQ_CTRL_CMD) | FW_CMD_REQUEST |
+			    FW_CMD_WRITE | FW_CMD_EXEC |
+			    FW_EQ_CTRL_CMD_PFN(adap->fn) |
+			    FW_EQ_CTRL_CMD_VFN(0));
+	c.alloc_to_len16 = htonl(FW_EQ_CTRL_CMD_ALLOC |
+				 FW_EQ_CTRL_CMD_EQSTART | FW_LEN16(c));
+	c.cmpliqid_eqid = htonl(FW_EQ_CTRL_CMD_CMPLIQID(cmplqid));
+	c.physeqid_pkd = htonl(0);
+	c.fetchszm_to_iqid = htonl(FW_EQ_CTRL_CMD_HOSTFCMODE(2) |
+				   FW_EQ_CTRL_CMD_PCIECHN(pi->tx_chan) |
+				   FW_EQ_CTRL_CMD_FETCHRO |
+				   FW_EQ_CTRL_CMD_IQID(iqid));
+	c.dcaen_to_eqsize = htonl(FW_EQ_CTRL_CMD_FBMIN(2) |
+				  FW_EQ_CTRL_CMD_FBMAX(3) |
+				  FW_EQ_CTRL_CMD_CIDXFTHRESH(5) |
+				  FW_EQ_CTRL_CMD_EQSIZE(nentries));
+	c.eqaddr = cpu_to_be64(txq->q.phys_addr);
+
+	ret = t4_wr_mbox(adap, adap->fn, &c, sizeof(c), &c);
+	if (ret) {
+		dma_free_coherent(adap->pdev_dev,
+				  nentries * sizeof(struct tx_desc),
+				  txq->q.desc, txq->q.phys_addr);
+		txq->q.desc = NULL;
+		return ret;
+	}
+
+	init_txq(adap, &txq->q, FW_EQ_CTRL_CMD_EQID_GET(ntohl(c.cmpliqid_eqid)));
+	txq->adap = adap;
+	skb_queue_head_init(&txq->sendq);
+	tasklet_init(&txq->qresume_tsk, restart_ctrlq, (unsigned long)txq);
+	txq->full = 0;
+	return 0;
+}
+
+int t4_sge_alloc_ofld_txq(struct adapter *adap, struct sge_ofld_txq *txq,
+			  struct net_device *dev, unsigned int iqid)
+{
+	int ret, nentries;
+	struct fw_eq_ofld_cmd c;
+	struct sge *s = &adap->sge;
+	struct port_info *pi = netdev_priv(dev);
+
+	/* Add status entries */
+	nentries = txq->q.size + s->stat_len / sizeof(struct tx_desc);
+
+	txq->q.desc = alloc_ring(adap->pdev_dev, txq->q.size,
+			sizeof(struct tx_desc), sizeof(struct tx_sw_desc),
+			&txq->q.phys_addr, &txq->q.sdesc, s->stat_len,
+			NUMA_NO_NODE);
+	if (!txq->q.desc)
+		return -ENOMEM;
+
+	memset(&c, 0, sizeof(c));
+	c.op_to_vfn = htonl(FW_CMD_OP(FW_EQ_OFLD_CMD) | FW_CMD_REQUEST |
+			    FW_CMD_WRITE | FW_CMD_EXEC |
+			    FW_EQ_OFLD_CMD_PFN(adap->fn) |
+			    FW_EQ_OFLD_CMD_VFN(0));
+	c.alloc_to_len16 = htonl(FW_EQ_OFLD_CMD_ALLOC |
+				 FW_EQ_OFLD_CMD_EQSTART | FW_LEN16(c));
+	c.fetchszm_to_iqid = htonl(FW_EQ_OFLD_CMD_HOSTFCMODE(2) |
+				   FW_EQ_OFLD_CMD_PCIECHN(pi->tx_chan) |
+				   FW_EQ_OFLD_CMD_FETCHRO(1) |
+				   FW_EQ_OFLD_CMD_IQID(iqid));
+	c.dcaen_to_eqsize = htonl(FW_EQ_OFLD_CMD_FBMIN(2) |
+				  FW_EQ_OFLD_CMD_FBMAX(3) |
+				  FW_EQ_OFLD_CMD_CIDXFTHRESH(5) |
+				  FW_EQ_OFLD_CMD_EQSIZE(nentries));
+	c.eqaddr = cpu_to_be64(txq->q.phys_addr);
+
+	ret = t4_wr_mbox(adap, adap->fn, &c, sizeof(c), &c);
+	if (ret) {
+		kfree(txq->q.sdesc);
+		txq->q.sdesc = NULL;
+		dma_free_coherent(adap->pdev_dev,
+				  nentries * sizeof(struct tx_desc),
+				  txq->q.desc, txq->q.phys_addr);
+		txq->q.desc = NULL;
+		return ret;
+	}
+
+	init_txq(adap, &txq->q, FW_EQ_OFLD_CMD_EQID_GET(ntohl(c.eqid_pkd)));
+	txq->adap = adap;
+	skb_queue_head_init(&txq->sendq);
+	tasklet_init(&txq->qresume_tsk, restart_ofldq, (unsigned long)txq);
+	txq->full = 0;
+	txq->mapping_err = 0;
+	return 0;
+}
+
+static void free_txq(struct adapter *adap, struct sge_txq *q)
+{
+	struct sge *s = &adap->sge;
+
+	dma_free_coherent(adap->pdev_dev,
+			  q->size * sizeof(struct tx_desc) + s->stat_len,
+			  q->desc, q->phys_addr);
+	q->cntxt_id = 0;
+	q->sdesc = NULL;
+	q->desc = NULL;
+}
+
+static void free_rspq_fl(struct adapter *adap, struct sge_rspq *rq,
+			 struct sge_fl *fl)
+{
+	struct sge *s = &adap->sge;
+	unsigned int fl_id = fl ? fl->cntxt_id : 0xffff;
+
+	adap->sge.ingr_map[rq->cntxt_id - adap->sge.ingr_start] = NULL;
+	t4_iq_free(adap, adap->fn, adap->fn, 0, FW_IQ_TYPE_FL_INT_CAP,
+		   rq->cntxt_id, fl_id, 0xffff);
+	dma_free_coherent(adap->pdev_dev, (rq->size + 1) * rq->iqe_len,
+			  rq->desc, rq->phys_addr);
+	netif_napi_del(&rq->napi);
+	rq->netdev = NULL;
+	rq->cntxt_id = rq->abs_id = 0;
+	rq->desc = NULL;
+
+	if (fl) {
+		free_rx_bufs(adap, fl, fl->avail);
+		dma_free_coherent(adap->pdev_dev, fl->size * 8 + s->stat_len,
+				  fl->desc, fl->addr);
+		kfree(fl->sdesc);
+		fl->sdesc = NULL;
+		fl->cntxt_id = 0;
+		fl->desc = NULL;
+	}
+}
+
+/**
+ *      t4_free_ofld_rxqs - free a block of consecutive Rx queues
+ *      @adap: the adapter
+ *      @n: number of queues
+ *      @q: pointer to first queue
+ *
+ *      Release the resources of a consecutive block of offload Rx queues.
+ */
+void t4_free_ofld_rxqs(struct adapter *adap, int n, struct sge_ofld_rxq *q)
+{
+	for ( ; n; n--, q++)
+		if (q->rspq.desc)
+			free_rspq_fl(adap, &q->rspq,
+				     q->fl.size ? &q->fl : NULL);
+}
+
+/**
+ *	t4_free_sge_resources - free SGE resources
+ *	@adap: the adapter
+ *
+ *	Frees resources used by the SGE queue sets.
+ */
+void t4_free_sge_resources(struct adapter *adap)
+{
+	int i;
+	struct sge_eth_rxq *eq = adap->sge.ethrxq;
+	struct sge_eth_txq *etq = adap->sge.ethtxq;
+
+	/* clean up Ethernet Tx/Rx queues */
+	for (i = 0; i < adap->sge.ethqsets; i++, eq++, etq++) {
+		if (eq->rspq.desc)
+			free_rspq_fl(adap, &eq->rspq,
+				     eq->fl.size ? &eq->fl : NULL);
+		if (etq->q.desc) {
+			t4_eth_eq_free(adap, adap->fn, adap->fn, 0,
+				       etq->q.cntxt_id);
+			free_tx_desc(adap, &etq->q, etq->q.in_use, true);
+			kfree(etq->q.sdesc);
+			free_txq(adap, &etq->q);
+		}
+	}
+
+	/* clean up RDMA and iSCSI Rx queues */
+	t4_free_ofld_rxqs(adap, adap->sge.ofldqsets, adap->sge.ofldrxq);
+	t4_free_ofld_rxqs(adap, adap->sge.rdmaqs, adap->sge.rdmarxq);
+	t4_free_ofld_rxqs(adap, adap->sge.rdmaciqs, adap->sge.rdmaciq);
+
+	/* clean up offload Tx queues */
+	for (i = 0; i < ARRAY_SIZE(adap->sge.ofldtxq); i++) {
+		struct sge_ofld_txq *q = &adap->sge.ofldtxq[i];
+
+		if (q->q.desc) {
+			tasklet_kill(&q->qresume_tsk);
+			t4_ofld_eq_free(adap, adap->fn, adap->fn, 0,
+					q->q.cntxt_id);
+			free_tx_desc(adap, &q->q, q->q.in_use, false);
+			kfree(q->q.sdesc);
+			__skb_queue_purge(&q->sendq);
+			free_txq(adap, &q->q);
+		}
+	}
+
+	/* clean up control Tx queues */
+	for (i = 0; i < ARRAY_SIZE(adap->sge.ctrlq); i++) {
+		struct sge_ctrl_txq *cq = &adap->sge.ctrlq[i];
+
+		if (cq->q.desc) {
+			tasklet_kill(&cq->qresume_tsk);
+			t4_ctrl_eq_free(adap, adap->fn, adap->fn, 0,
+					cq->q.cntxt_id);
+			__skb_queue_purge(&cq->sendq);
+			free_txq(adap, &cq->q);
+		}
+	}
+
+	if (adap->sge.fw_evtq.desc)
+		free_rspq_fl(adap, &adap->sge.fw_evtq, NULL);
+
+	if (adap->sge.intrq.desc)
+		free_rspq_fl(adap, &adap->sge.intrq, NULL);
+
+	/* clear the reverse egress queue map */
+	memset(adap->sge.egr_map, 0, sizeof(adap->sge.egr_map));
+}
+
+void t4_sge_start(struct adapter *adap)
+{
+	adap->sge.ethtxq_rover = 0;
+	mod_timer(&adap->sge.rx_timer, jiffies + RX_QCHECK_PERIOD);
+	mod_timer(&adap->sge.tx_timer, jiffies + TX_QCHECK_PERIOD);
+}
+
+/**
+ *	t4_sge_stop - disable SGE operation
+ *	@adap: the adapter
+ *
+ *	Stop tasklets and timers associated with the DMA engine.  Note that
+ *	this is effective only if measures have been taken to disable any HW
+ *	events that may restart them.
+ */
+void t4_sge_stop(struct adapter *adap)
+{
+	int i;
+	struct sge *s = &adap->sge;
+
+	if (in_interrupt())  /* actions below require waiting */
+		return;
+
+	if (s->rx_timer.function)
+		del_timer_sync(&s->rx_timer);
+	if (s->tx_timer.function)
+		del_timer_sync(&s->tx_timer);
+
+	for (i = 0; i < ARRAY_SIZE(s->ofldtxq); i++) {
+		struct sge_ofld_txq *q = &s->ofldtxq[i];
+
+		if (q->q.desc)
+			tasklet_kill(&q->qresume_tsk);
+	}
+	for (i = 0; i < ARRAY_SIZE(s->ctrlq); i++) {
+		struct sge_ctrl_txq *cq = &s->ctrlq[i];
+
+		if (cq->q.desc)
+			tasklet_kill(&cq->qresume_tsk);
+	}
+}
+
+/**
+ *	t4_sge_init - initialize SGE
+ *	@adap: the adapter
+ *
+ *	Performs SGE initialization needed every time after a chip reset.
+ *	We do not initialize any of the queues here, instead the driver
+ *	top-level must request them individually.
+ *
+ *	Called in two different modes:
+ *
+ *	 1. Perform actual hardware initialization and record hard-coded
+ *	    parameters which were used.  This gets used when we're the
+ *	    Master PF and the Firmware Configuration File support didn't
+ *	    work for some reason.
+ *
+ *	 2. We're not the Master PF or initialization was performed with
+ *	    a Firmware Configuration File.  In this case we need to grab
+ *	    any of the SGE operating parameters that we need to have in
+ *	    order to do our job and make sure we can live with them ...
+ */
+
+static int t4_sge_init_soft(struct adapter *adap)
+{
+	struct sge *s = &adap->sge;
+	u32 fl_small_pg, fl_large_pg, fl_small_mtu, fl_large_mtu;
+	u32 timer_value_0_and_1, timer_value_2_and_3, timer_value_4_and_5;
+	u32 ingress_rx_threshold;
+
+	/*
+	 * Verify that CPL messages are going to the Ingress Queue for
+	 * process_responses() and that only packet data is going to the
+	 * Free Lists.
+	 */
+	if ((t4_read_reg(adap, SGE_CONTROL) & RXPKTCPLMODE_MASK) !=
+	    RXPKTCPLMODE(X_RXPKTCPLMODE_SPLIT)) {
+		dev_err(adap->pdev_dev, "bad SGE CPL MODE\n");
+		return -EINVAL;
+	}
+
+	/*
+	 * Validate the Host Buffer Register Array indices that we want to
+	 * use ...
+	 *
+	 * XXX Note that we should really read through the Host Buffer Size
+	 * XXX register array and find the indices of the Buffer Sizes which
+	 * XXX meet our needs!
+	 */
+	#define READ_FL_BUF(x) \
+		t4_read_reg(adap, SGE_FL_BUFFER_SIZE0+(x)*sizeof(u32))
+
+	fl_small_pg = READ_FL_BUF(RX_SMALL_PG_BUF);
+	fl_large_pg = READ_FL_BUF(RX_LARGE_PG_BUF);
+	fl_small_mtu = READ_FL_BUF(RX_SMALL_MTU_BUF);
+	fl_large_mtu = READ_FL_BUF(RX_LARGE_MTU_BUF);
+
+	/* We only bother using the Large Page logic if the Large Page Buffer
+	 * is larger than our Page Size Buffer.
+	 */
+	if (fl_large_pg <= fl_small_pg)
+		fl_large_pg = 0;
+
+	#undef READ_FL_BUF
+
+	/* The Page Size Buffer must be exactly equal to our Page Size and the
+	 * Large Page Size Buffer should be 0 (per above) or a power of 2.
+	 */
+	if (fl_small_pg != PAGE_SIZE ||
+	    (fl_large_pg & (fl_large_pg-1)) != 0) {
+		dev_err(adap->pdev_dev, "bad SGE FL page buffer sizes [%d, %d]\n",
+			fl_small_pg, fl_large_pg);
+		return -EINVAL;
+	}
+	if (fl_large_pg)
+		s->fl_pg_order = ilog2(fl_large_pg) - PAGE_SHIFT;
+
+	if (fl_small_mtu < FL_MTU_SMALL_BUFSIZE(adap) ||
+	    fl_large_mtu < FL_MTU_LARGE_BUFSIZE(adap)) {
+		dev_err(adap->pdev_dev, "bad SGE FL MTU sizes [%d, %d]\n",
+			fl_small_mtu, fl_large_mtu);
+		return -EINVAL;
+	}
+
+	/*
+	 * Retrieve our RX interrupt holdoff timer values and counter
+	 * threshold values from the SGE parameters.
+	 */
+	timer_value_0_and_1 = t4_read_reg(adap, SGE_TIMER_VALUE_0_AND_1);
+	timer_value_2_and_3 = t4_read_reg(adap, SGE_TIMER_VALUE_2_AND_3);
+	timer_value_4_and_5 = t4_read_reg(adap, SGE_TIMER_VALUE_4_AND_5);
+	s->timer_val[0] = core_ticks_to_us(adap,
+		TIMERVALUE0_GET(timer_value_0_and_1));
+	s->timer_val[1] = core_ticks_to_us(adap,
+		TIMERVALUE1_GET(timer_value_0_and_1));
+	s->timer_val[2] = core_ticks_to_us(adap,
+		TIMERVALUE2_GET(timer_value_2_and_3));
+	s->timer_val[3] = core_ticks_to_us(adap,
+		TIMERVALUE3_GET(timer_value_2_and_3));
+	s->timer_val[4] = core_ticks_to_us(adap,
+		TIMERVALUE4_GET(timer_value_4_and_5));
+	s->timer_val[5] = core_ticks_to_us(adap,
+		TIMERVALUE5_GET(timer_value_4_and_5));
+
+	ingress_rx_threshold = t4_read_reg(adap, SGE_INGRESS_RX_THRESHOLD);
+	s->counter_val[0] = THRESHOLD_0_GET(ingress_rx_threshold);
+	s->counter_val[1] = THRESHOLD_1_GET(ingress_rx_threshold);
+	s->counter_val[2] = THRESHOLD_2_GET(ingress_rx_threshold);
+	s->counter_val[3] = THRESHOLD_3_GET(ingress_rx_threshold);
+
+	return 0;
+}
+
+static int t4_sge_init_hard(struct adapter *adap)
+{
+	struct sge *s = &adap->sge;
+
+	/*
+	 * Set up our basic SGE mode to deliver CPL messages to our Ingress
+	 * Queue and Packet Date to the Free List.
+	 */
+	t4_set_reg_field(adap, SGE_CONTROL, RXPKTCPLMODE_MASK,
+			 RXPKTCPLMODE_MASK);
+
+	/*
+	 * Set up to drop DOORBELL writes when the DOORBELL FIFO overflows
+	 * and generate an interrupt when this occurs so we can recover.
+	 */
+	if (is_t4(adap->params.chip)) {
+		t4_set_reg_field(adap, A_SGE_DBFIFO_STATUS,
+				 V_HP_INT_THRESH(M_HP_INT_THRESH) |
+				 V_LP_INT_THRESH(M_LP_INT_THRESH),
+				 V_HP_INT_THRESH(dbfifo_int_thresh) |
+				 V_LP_INT_THRESH(dbfifo_int_thresh));
+	} else {
+		t4_set_reg_field(adap, A_SGE_DBFIFO_STATUS,
+				 V_LP_INT_THRESH_T5(M_LP_INT_THRESH_T5),
+				 V_LP_INT_THRESH_T5(dbfifo_int_thresh));
+		t4_set_reg_field(adap, SGE_DBFIFO_STATUS2,
+				 V_HP_INT_THRESH_T5(M_HP_INT_THRESH_T5),
+				 V_HP_INT_THRESH_T5(dbfifo_int_thresh));
+	}
+	t4_set_reg_field(adap, A_SGE_DOORBELL_CONTROL, F_ENABLE_DROP,
+			F_ENABLE_DROP);
+
+	/*
+	 * SGE_FL_BUFFER_SIZE0 (RX_SMALL_PG_BUF) is set up by
+	 * t4_fixup_host_params().
+	 */
+	s->fl_pg_order = FL_PG_ORDER;
+	if (s->fl_pg_order)
+		t4_write_reg(adap,
+			     SGE_FL_BUFFER_SIZE0+RX_LARGE_PG_BUF*sizeof(u32),
+			     PAGE_SIZE << FL_PG_ORDER);
+	t4_write_reg(adap, SGE_FL_BUFFER_SIZE0+RX_SMALL_MTU_BUF*sizeof(u32),
+		     FL_MTU_SMALL_BUFSIZE(adap));
+	t4_write_reg(adap, SGE_FL_BUFFER_SIZE0+RX_LARGE_MTU_BUF*sizeof(u32),
+		     FL_MTU_LARGE_BUFSIZE(adap));
+
+	/*
+	 * Note that the SGE Ingress Packet Count Interrupt Threshold and
+	 * Timer Holdoff values must be supplied by our caller.
+	 */
+	t4_write_reg(adap, SGE_INGRESS_RX_THRESHOLD,
+		     THRESHOLD_0(s->counter_val[0]) |
+		     THRESHOLD_1(s->counter_val[1]) |
+		     THRESHOLD_2(s->counter_val[2]) |
+		     THRESHOLD_3(s->counter_val[3]));
+	t4_write_reg(adap, SGE_TIMER_VALUE_0_AND_1,
+		     TIMERVALUE0(us_to_core_ticks(adap, s->timer_val[0])) |
+		     TIMERVALUE1(us_to_core_ticks(adap, s->timer_val[1])));
+	t4_write_reg(adap, SGE_TIMER_VALUE_2_AND_3,
+		     TIMERVALUE2(us_to_core_ticks(adap, s->timer_val[2])) |
+		     TIMERVALUE3(us_to_core_ticks(adap, s->timer_val[3])));
+	t4_write_reg(adap, SGE_TIMER_VALUE_4_AND_5,
+		     TIMERVALUE4(us_to_core_ticks(adap, s->timer_val[4])) |
+		     TIMERVALUE5(us_to_core_ticks(adap, s->timer_val[5])));
+
+	return 0;
+}
+
+int t4_sge_init(struct adapter *adap)
+{
+	struct sge *s = &adap->sge;
+	u32 sge_control, sge_control2, sge_conm_ctrl;
+	unsigned int ingpadboundary, ingpackboundary;
+	int ret, egress_threshold;
+
+	/*
+	 * Ingress Padding Boundary and Egress Status Page Size are set up by
+	 * t4_fixup_host_params().
+	 */
+	sge_control = t4_read_reg(adap, SGE_CONTROL);
+	s->pktshift = PKTSHIFT_GET(sge_control);
+	s->stat_len = (sge_control & EGRSTATUSPAGESIZE_MASK) ? 128 : 64;
+
+	/* T4 uses a single control field to specify both the PCIe Padding and
+	 * Packing Boundary.  T5 introduced the ability to specify these
+	 * separately.  The actual Ingress Packet Data alignment boundary
+	 * within Packed Buffer Mode is the maximum of these two
+	 * specifications.
+	 */
+	ingpadboundary = 1 << (INGPADBOUNDARY_GET(sge_control) +
+			       X_INGPADBOUNDARY_SHIFT);
+	if (is_t4(adap->params.chip)) {
+		s->fl_align = ingpadboundary;
+	} else {
+		/* T5 has a different interpretation of one of the PCIe Packing
+		 * Boundary values.
+		 */
+		sge_control2 = t4_read_reg(adap, SGE_CONTROL2_A);
+		ingpackboundary = INGPACKBOUNDARY_G(sge_control2);
+		if (ingpackboundary == INGPACKBOUNDARY_16B_X)
+			ingpackboundary = 16;
+		else
+			ingpackboundary = 1 << (ingpackboundary +
+						INGPACKBOUNDARY_SHIFT_X);
+
+		s->fl_align = max(ingpadboundary, ingpackboundary);
+	}
+
+	if (adap->flags & USING_SOFT_PARAMS)
+		ret = t4_sge_init_soft(adap);
+	else
+		ret = t4_sge_init_hard(adap);
+	if (ret < 0)
+		return ret;
+
+	/*
+	 * A FL with <= fl_starve_thres buffers is starving and a periodic
+	 * timer will attempt to refill it.  This needs to be larger than the
+	 * SGE's Egress Congestion Threshold.  If it isn't, then we can get
+	 * stuck waiting for new packets while the SGE is waiting for us to
+	 * give it more Free List entries.  (Note that the SGE's Egress
+	 * Congestion Threshold is in units of 2 Free List pointers.) For T4,
+	 * there was only a single field to control this.  For T5 there's the
+	 * original field which now only applies to Unpacked Mode Free List
+	 * buffers and a new field which only applies to Packed Mode Free List
+	 * buffers.
+	 */
+	sge_conm_ctrl = t4_read_reg(adap, SGE_CONM_CTRL);
+	if (is_t4(adap->params.chip))
+		egress_threshold = EGRTHRESHOLD_GET(sge_conm_ctrl);
+	else
+		egress_threshold = EGRTHRESHOLDPACKING_GET(sge_conm_ctrl);
+	s->fl_starve_thres = 2*egress_threshold + 1;
+
+	setup_timer(&s->rx_timer, sge_rx_timer_cb, (unsigned long)adap);
+	setup_timer(&s->tx_timer, sge_tx_timer_cb, (unsigned long)adap);
+	s->idma_1s_thresh = core_ticks_per_usec(adap) * 1000000;  /* 1 s */
+	s->idma_stalled[0] = 0;
+	s->idma_stalled[1] = 0;
+	spin_lock_init(&s->intrq_lock);
+
+	return 0;
+}
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
new file mode 100644
index 0000000..163a2a1
--- /dev/null
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
@@ -0,0 +1,4164 @@
+/*
+ * This file is part of the Chelsio T4 Ethernet driver for Linux.
+ *
+ * Copyright (c) 2003-2014 Chelsio Communications, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/delay.h>
+#include "cxgb4.h"
+#include "t4_regs.h"
+#include "t4fw_api.h"
+
+/**
+ *	t4_wait_op_done_val - wait until an operation is completed
+ *	@adapter: the adapter performing the operation
+ *	@reg: the register to check for completion
+ *	@mask: a single-bit field within @reg that indicates completion
+ *	@polarity: the value of the field when the operation is completed
+ *	@attempts: number of check iterations
+ *	@delay: delay in usecs between iterations
+ *	@valp: where to store the value of the register at completion time
+ *
+ *	Wait until an operation is completed by checking a bit in a register
+ *	up to @attempts times.  If @valp is not NULL the value of the register
+ *	at the time it indicated completion is stored there.  Returns 0 if the
+ *	operation completes and	-EAGAIN	otherwise.
+ */
+static int t4_wait_op_done_val(struct adapter *adapter, int reg, u32 mask,
+			       int polarity, int attempts, int delay, u32 *valp)
+{
+	while (1) {
+		u32 val = t4_read_reg(adapter, reg);
+
+		if (!!(val & mask) == polarity) {
+			if (valp)
+				*valp = val;
+			return 0;
+		}
+		if (--attempts == 0)
+			return -EAGAIN;
+		if (delay)
+			udelay(delay);
+	}
+}
+
+static inline int t4_wait_op_done(struct adapter *adapter, int reg, u32 mask,
+				  int polarity, int attempts, int delay)
+{
+	return t4_wait_op_done_val(adapter, reg, mask, polarity, attempts,
+				   delay, NULL);
+}
+
+/**
+ *	t4_set_reg_field - set a register field to a value
+ *	@adapter: the adapter to program
+ *	@addr: the register address
+ *	@mask: specifies the portion of the register to modify
+ *	@val: the new value for the register field
+ *
+ *	Sets a register field specified by the supplied mask to the
+ *	given value.
+ */
+void t4_set_reg_field(struct adapter *adapter, unsigned int addr, u32 mask,
+		      u32 val)
+{
+	u32 v = t4_read_reg(adapter, addr) & ~mask;
+
+	t4_write_reg(adapter, addr, v | val);
+	(void) t4_read_reg(adapter, addr);      /* flush */
+}
+
+/**
+ *	t4_read_indirect - read indirectly addressed registers
+ *	@adap: the adapter
+ *	@addr_reg: register holding the indirect address
+ *	@data_reg: register holding the value of the indirect register
+ *	@vals: where the read register values are stored
+ *	@nregs: how many indirect registers to read
+ *	@start_idx: index of first indirect register to read
+ *
+ *	Reads registers that are accessed indirectly through an address/data
+ *	register pair.
+ */
+void t4_read_indirect(struct adapter *adap, unsigned int addr_reg,
+			     unsigned int data_reg, u32 *vals,
+			     unsigned int nregs, unsigned int start_idx)
+{
+	while (nregs--) {
+		t4_write_reg(adap, addr_reg, start_idx);
+		*vals++ = t4_read_reg(adap, data_reg);
+		start_idx++;
+	}
+}
+
+/**
+ *	t4_write_indirect - write indirectly addressed registers
+ *	@adap: the adapter
+ *	@addr_reg: register holding the indirect addresses
+ *	@data_reg: register holding the value for the indirect registers
+ *	@vals: values to write
+ *	@nregs: how many indirect registers to write
+ *	@start_idx: address of first indirect register to write
+ *
+ *	Writes a sequential block of registers that are accessed indirectly
+ *	through an address/data register pair.
+ */
+void t4_write_indirect(struct adapter *adap, unsigned int addr_reg,
+		       unsigned int data_reg, const u32 *vals,
+		       unsigned int nregs, unsigned int start_idx)
+{
+	while (nregs--) {
+		t4_write_reg(adap, addr_reg, start_idx++);
+		t4_write_reg(adap, data_reg, *vals++);
+	}
+}
+
+/*
+ * Read a 32-bit PCI Configuration Space register via the PCI-E backdoor
+ * mechanism.  This guarantees that we get the real value even if we're
+ * operating within a Virtual Machine and the Hypervisor is trapping our
+ * Configuration Space accesses.
+ */
+void t4_hw_pci_read_cfg4(struct adapter *adap, int reg, u32 *val)
+{
+	u32 req = ENABLE | FUNCTION(adap->fn) | reg;
+
+	if (is_t4(adap->params.chip))
+		req |= F_LOCALCFG;
+
+	t4_write_reg(adap, PCIE_CFG_SPACE_REQ, req);
+	*val = t4_read_reg(adap, PCIE_CFG_SPACE_DATA);
+
+	/* Reset ENABLE to 0 so reads of PCIE_CFG_SPACE_DATA won't cause a
+	 * Configuration Space read.  (None of the other fields matter when
+	 * ENABLE is 0 so a simple register write is easier than a
+	 * read-modify-write via t4_set_reg_field().)
+	 */
+	t4_write_reg(adap, PCIE_CFG_SPACE_REQ, 0);
+}
+
+/*
+ * t4_report_fw_error - report firmware error
+ * @adap: the adapter
+ *
+ * The adapter firmware can indicate error conditions to the host.
+ * If the firmware has indicated an error, print out the reason for
+ * the firmware error.
+ */
+static void t4_report_fw_error(struct adapter *adap)
+{
+	static const char *const reason[] = {
+		"Crash",                        /* PCIE_FW_EVAL_CRASH */
+		"During Device Preparation",    /* PCIE_FW_EVAL_PREP */
+		"During Device Configuration",  /* PCIE_FW_EVAL_CONF */
+		"During Device Initialization", /* PCIE_FW_EVAL_INIT */
+		"Unexpected Event",             /* PCIE_FW_EVAL_UNEXPECTEDEVENT */
+		"Insufficient Airflow",         /* PCIE_FW_EVAL_OVERHEAT */
+		"Device Shutdown",              /* PCIE_FW_EVAL_DEVICESHUTDOWN */
+		"Reserved",                     /* reserved */
+	};
+	u32 pcie_fw;
+
+	pcie_fw = t4_read_reg(adap, MA_PCIE_FW);
+	if (pcie_fw & FW_PCIE_FW_ERR)
+		dev_err(adap->pdev_dev, "Firmware reports adapter error: %s\n",
+			reason[FW_PCIE_FW_EVAL_GET(pcie_fw)]);
+}
+
+/*
+ * Get the reply to a mailbox command and store it in @rpl in big-endian order.
+ */
+static void get_mbox_rpl(struct adapter *adap, __be64 *rpl, int nflit,
+			 u32 mbox_addr)
+{
+	for ( ; nflit; nflit--, mbox_addr += 8)
+		*rpl++ = cpu_to_be64(t4_read_reg64(adap, mbox_addr));
+}
+
+/*
+ * Handle a FW assertion reported in a mailbox.
+ */
+static void fw_asrt(struct adapter *adap, u32 mbox_addr)
+{
+	struct fw_debug_cmd asrt;
+
+	get_mbox_rpl(adap, (__be64 *)&asrt, sizeof(asrt) / 8, mbox_addr);
+	dev_alert(adap->pdev_dev,
+		  "FW assertion at %.16s:%u, val0 %#x, val1 %#x\n",
+		  asrt.u.assert.filename_0_7, ntohl(asrt.u.assert.line),
+		  ntohl(asrt.u.assert.x), ntohl(asrt.u.assert.y));
+}
+
+static void dump_mbox(struct adapter *adap, int mbox, u32 data_reg)
+{
+	dev_err(adap->pdev_dev,
+		"mbox %d: %llx %llx %llx %llx %llx %llx %llx %llx\n", mbox,
+		(unsigned long long)t4_read_reg64(adap, data_reg),
+		(unsigned long long)t4_read_reg64(adap, data_reg + 8),
+		(unsigned long long)t4_read_reg64(adap, data_reg + 16),
+		(unsigned long long)t4_read_reg64(adap, data_reg + 24),
+		(unsigned long long)t4_read_reg64(adap, data_reg + 32),
+		(unsigned long long)t4_read_reg64(adap, data_reg + 40),
+		(unsigned long long)t4_read_reg64(adap, data_reg + 48),
+		(unsigned long long)t4_read_reg64(adap, data_reg + 56));
+}
+
+/**
+ *	t4_wr_mbox_meat - send a command to FW through the given mailbox
+ *	@adap: the adapter
+ *	@mbox: index of the mailbox to use
+ *	@cmd: the command to write
+ *	@size: command length in bytes
+ *	@rpl: where to optionally store the reply
+ *	@sleep_ok: if true we may sleep while awaiting command completion
+ *
+ *	Sends the given command to FW through the selected mailbox and waits
+ *	for the FW to execute the command.  If @rpl is not %NULL it is used to
+ *	store the FW's reply to the command.  The command and its optional
+ *	reply are of the same length.  FW can take up to %FW_CMD_MAX_TIMEOUT ms
+ *	to respond.  @sleep_ok determines whether we may sleep while awaiting
+ *	the response.  If sleeping is allowed we use progressive backoff
+ *	otherwise we spin.
+ *
+ *	The return value is 0 on success or a negative errno on failure.  A
+ *	failure can happen either because we are not able to execute the
+ *	command or FW executes it but signals an error.  In the latter case
+ *	the return value is the error code indicated by FW (negated).
+ */
+int t4_wr_mbox_meat(struct adapter *adap, int mbox, const void *cmd, int size,
+		    void *rpl, bool sleep_ok)
+{
+	static const int delay[] = {
+		1, 1, 3, 5, 10, 10, 20, 50, 100, 200
+	};
+
+	u32 v;
+	u64 res;
+	int i, ms, delay_idx;
+	const __be64 *p = cmd;
+	u32 data_reg = PF_REG(mbox, CIM_PF_MAILBOX_DATA);
+	u32 ctl_reg = PF_REG(mbox, CIM_PF_MAILBOX_CTRL);
+
+	if ((size & 15) || size > MBOX_LEN)
+		return -EINVAL;
+
+	/*
+	 * If the device is off-line, as in EEH, commands will time out.
+	 * Fail them early so we don't waste time waiting.
+	 */
+	if (adap->pdev->error_state != pci_channel_io_normal)
+		return -EIO;
+
+	v = MBOWNER_GET(t4_read_reg(adap, ctl_reg));
+	for (i = 0; v == MBOX_OWNER_NONE && i < 3; i++)
+		v = MBOWNER_GET(t4_read_reg(adap, ctl_reg));
+
+	if (v != MBOX_OWNER_DRV)
+		return v ? -EBUSY : -ETIMEDOUT;
+
+	for (i = 0; i < size; i += 8)
+		t4_write_reg64(adap, data_reg + i, be64_to_cpu(*p++));
+
+	t4_write_reg(adap, ctl_reg, MBMSGVALID | MBOWNER(MBOX_OWNER_FW));
+	t4_read_reg(adap, ctl_reg);          /* flush write */
+
+	delay_idx = 0;
+	ms = delay[0];
+
+	for (i = 0; i < FW_CMD_MAX_TIMEOUT; i += ms) {
+		if (sleep_ok) {
+			ms = delay[delay_idx];  /* last element may repeat */
+			if (delay_idx < ARRAY_SIZE(delay) - 1)
+				delay_idx++;
+			msleep(ms);
+		} else
+			mdelay(ms);
+
+		v = t4_read_reg(adap, ctl_reg);
+		if (MBOWNER_GET(v) == MBOX_OWNER_DRV) {
+			if (!(v & MBMSGVALID)) {
+				t4_write_reg(adap, ctl_reg, 0);
+				continue;
+			}
+
+			res = t4_read_reg64(adap, data_reg);
+			if (FW_CMD_OP_GET(res >> 32) == FW_DEBUG_CMD) {
+				fw_asrt(adap, data_reg);
+				res = FW_CMD_RETVAL(EIO);
+			} else if (rpl)
+				get_mbox_rpl(adap, rpl, size / 8, data_reg);
+
+			if (FW_CMD_RETVAL_GET((int)res))
+				dump_mbox(adap, mbox, data_reg);
+			t4_write_reg(adap, ctl_reg, 0);
+			return -FW_CMD_RETVAL_GET((int)res);
+		}
+	}
+
+	dump_mbox(adap, mbox, data_reg);
+	dev_err(adap->pdev_dev, "command %#x in mailbox %d timed out\n",
+		*(const u8 *)cmd, mbox);
+	t4_report_fw_error(adap);
+	return -ETIMEDOUT;
+}
+
+/**
+ *	t4_mc_read - read from MC through backdoor accesses
+ *	@adap: the adapter
+ *	@addr: address of first byte requested
+ *	@idx: which MC to access
+ *	@data: 64 bytes of data containing the requested address
+ *	@ecc: where to store the corresponding 64-bit ECC word
+ *
+ *	Read 64 bytes of data from MC starting at a 64-byte-aligned address
+ *	that covers the requested address @addr.  If @parity is not %NULL it
+ *	is assigned the 64-bit ECC word for the read data.
+ */
+int t4_mc_read(struct adapter *adap, int idx, u32 addr, __be32 *data, u64 *ecc)
+{
+	int i;
+	u32 mc_bist_cmd, mc_bist_cmd_addr, mc_bist_cmd_len;
+	u32 mc_bist_status_rdata, mc_bist_data_pattern;
+
+	if (is_t4(adap->params.chip)) {
+		mc_bist_cmd = MC_BIST_CMD;
+		mc_bist_cmd_addr = MC_BIST_CMD_ADDR;
+		mc_bist_cmd_len = MC_BIST_CMD_LEN;
+		mc_bist_status_rdata = MC_BIST_STATUS_RDATA;
+		mc_bist_data_pattern = MC_BIST_DATA_PATTERN;
+	} else {
+		mc_bist_cmd = MC_REG(MC_P_BIST_CMD, idx);
+		mc_bist_cmd_addr = MC_REG(MC_P_BIST_CMD_ADDR, idx);
+		mc_bist_cmd_len = MC_REG(MC_P_BIST_CMD_LEN, idx);
+		mc_bist_status_rdata = MC_REG(MC_P_BIST_STATUS_RDATA, idx);
+		mc_bist_data_pattern = MC_REG(MC_P_BIST_DATA_PATTERN, idx);
+	}
+
+	if (t4_read_reg(adap, mc_bist_cmd) & START_BIST)
+		return -EBUSY;
+	t4_write_reg(adap, mc_bist_cmd_addr, addr & ~0x3fU);
+	t4_write_reg(adap, mc_bist_cmd_len, 64);
+	t4_write_reg(adap, mc_bist_data_pattern, 0xc);
+	t4_write_reg(adap, mc_bist_cmd, BIST_OPCODE(1) | START_BIST |
+		     BIST_CMD_GAP(1));
+	i = t4_wait_op_done(adap, mc_bist_cmd, START_BIST, 0, 10, 1);
+	if (i)
+		return i;
+
+#define MC_DATA(i) MC_BIST_STATUS_REG(mc_bist_status_rdata, i)
+
+	for (i = 15; i >= 0; i--)
+		*data++ = htonl(t4_read_reg(adap, MC_DATA(i)));
+	if (ecc)
+		*ecc = t4_read_reg64(adap, MC_DATA(16));
+#undef MC_DATA
+	return 0;
+}
+
+/**
+ *	t4_edc_read - read from EDC through backdoor accesses
+ *	@adap: the adapter
+ *	@idx: which EDC to access
+ *	@addr: address of first byte requested
+ *	@data: 64 bytes of data containing the requested address
+ *	@ecc: where to store the corresponding 64-bit ECC word
+ *
+ *	Read 64 bytes of data from EDC starting at a 64-byte-aligned address
+ *	that covers the requested address @addr.  If @parity is not %NULL it
+ *	is assigned the 64-bit ECC word for the read data.
+ */
+int t4_edc_read(struct adapter *adap, int idx, u32 addr, __be32 *data, u64 *ecc)
+{
+	int i;
+	u32 edc_bist_cmd, edc_bist_cmd_addr, edc_bist_cmd_len;
+	u32 edc_bist_cmd_data_pattern, edc_bist_status_rdata;
+
+	if (is_t4(adap->params.chip)) {
+		edc_bist_cmd = EDC_REG(EDC_BIST_CMD, idx);
+		edc_bist_cmd_addr = EDC_REG(EDC_BIST_CMD_ADDR, idx);
+		edc_bist_cmd_len = EDC_REG(EDC_BIST_CMD_LEN, idx);
+		edc_bist_cmd_data_pattern = EDC_REG(EDC_BIST_DATA_PATTERN,
+						    idx);
+		edc_bist_status_rdata = EDC_REG(EDC_BIST_STATUS_RDATA,
+						    idx);
+	} else {
+		edc_bist_cmd = EDC_REG_T5(EDC_H_BIST_CMD, idx);
+		edc_bist_cmd_addr = EDC_REG_T5(EDC_H_BIST_CMD_ADDR, idx);
+		edc_bist_cmd_len = EDC_REG_T5(EDC_H_BIST_CMD_LEN, idx);
+		edc_bist_cmd_data_pattern =
+			EDC_REG_T5(EDC_H_BIST_DATA_PATTERN, idx);
+		edc_bist_status_rdata =
+			 EDC_REG_T5(EDC_H_BIST_STATUS_RDATA, idx);
+	}
+
+	if (t4_read_reg(adap, edc_bist_cmd) & START_BIST)
+		return -EBUSY;
+	t4_write_reg(adap, edc_bist_cmd_addr, addr & ~0x3fU);
+	t4_write_reg(adap, edc_bist_cmd_len, 64);
+	t4_write_reg(adap, edc_bist_cmd_data_pattern, 0xc);
+	t4_write_reg(adap, edc_bist_cmd,
+		     BIST_OPCODE(1) | BIST_CMD_GAP(1) | START_BIST);
+	i = t4_wait_op_done(adap, edc_bist_cmd, START_BIST, 0, 10, 1);
+	if (i)
+		return i;
+
+#define EDC_DATA(i) (EDC_BIST_STATUS_REG(edc_bist_status_rdata, i))
+
+	for (i = 15; i >= 0; i--)
+		*data++ = htonl(t4_read_reg(adap, EDC_DATA(i)));
+	if (ecc)
+		*ecc = t4_read_reg64(adap, EDC_DATA(16));
+#undef EDC_DATA
+	return 0;
+}
+
+/**
+ *	t4_memory_rw - read/write EDC 0, EDC 1 or MC via PCIE memory window
+ *	@adap: the adapter
+ *	@win: PCI-E Memory Window to use
+ *	@mtype: memory type: MEM_EDC0, MEM_EDC1 or MEM_MC
+ *	@addr: address within indicated memory type
+ *	@len: amount of memory to transfer
+ *	@buf: host memory buffer
+ *	@dir: direction of transfer T4_MEMORY_READ (1) or T4_MEMORY_WRITE (0)
+ *
+ *	Reads/writes an [almost] arbitrary memory region in the firmware: the
+ *	firmware memory address and host buffer must be aligned on 32-bit
+ *	boudaries; the length may be arbitrary.  The memory is transferred as
+ *	a raw byte sequence from/to the firmware's memory.  If this memory
+ *	contains data structures which contain multi-byte integers, it's the
+ *	caller's responsibility to perform appropriate byte order conversions.
+ */
+int t4_memory_rw(struct adapter *adap, int win, int mtype, u32 addr,
+		 u32 len, __be32 *buf, int dir)
+{
+	u32 pos, offset, resid, memoffset;
+	u32 edc_size, mc_size, win_pf, mem_reg, mem_aperture, mem_base;
+
+	/* Argument sanity checks ...
+	 */
+	if (addr & 0x3)
+		return -EINVAL;
+
+	/* It's convenient to be able to handle lengths which aren't a
+	 * multiple of 32-bits because we often end up transferring files to
+	 * the firmware.  So we'll handle that by normalizing the length here
+	 * and then handling any residual transfer at the end.
+	 */
+	resid = len & 0x3;
+	len -= resid;
+
+	/* Offset into the region of memory which is being accessed
+	 * MEM_EDC0 = 0
+	 * MEM_EDC1 = 1
+	 * MEM_MC   = 2 -- T4
+	 * MEM_MC0  = 2 -- For T5
+	 * MEM_MC1  = 3 -- For T5
+	 */
+	edc_size  = EDRAM_SIZE_GET(t4_read_reg(adap, MA_EDRAM0_BAR));
+	if (mtype != MEM_MC1)
+		memoffset = (mtype * (edc_size * 1024 * 1024));
+	else {
+		mc_size = EXT_MEM_SIZE_GET(t4_read_reg(adap,
+						       MA_EXT_MEMORY_BAR));
+		memoffset = (MEM_MC0 * edc_size + mc_size) * 1024 * 1024;
+	}
+
+	/* Determine the PCIE_MEM_ACCESS_OFFSET */
+	addr = addr + memoffset;
+
+	/* Each PCI-E Memory Window is programmed with a window size -- or
+	 * "aperture" -- which controls the granularity of its mapping onto
+	 * adapter memory.  We need to grab that aperture in order to know
+	 * how to use the specified window.  The window is also programmed
+	 * with the base address of the Memory Window in BAR0's address
+	 * space.  For T4 this is an absolute PCI-E Bus Address.  For T5
+	 * the address is relative to BAR0.
+	 */
+	mem_reg = t4_read_reg(adap,
+			      PCIE_MEM_ACCESS_REG(PCIE_MEM_ACCESS_BASE_WIN,
+						  win));
+	mem_aperture = 1 << (GET_WINDOW(mem_reg) + 10);
+	mem_base = GET_PCIEOFST(mem_reg) << 10;
+	if (is_t4(adap->params.chip))
+		mem_base -= adap->t4_bar0;
+	win_pf = is_t4(adap->params.chip) ? 0 : V_PFNUM(adap->fn);
+
+	/* Calculate our initial PCI-E Memory Window Position and Offset into
+	 * that Window.
+	 */
+	pos = addr & ~(mem_aperture-1);
+	offset = addr - pos;
+
+	/* Set up initial PCI-E Memory Window to cover the start of our
+	 * transfer.  (Read it back to ensure that changes propagate before we
+	 * attempt to use the new value.)
+	 */
+	t4_write_reg(adap,
+		     PCIE_MEM_ACCESS_REG(PCIE_MEM_ACCESS_OFFSET, win),
+		     pos | win_pf);
+	t4_read_reg(adap,
+		    PCIE_MEM_ACCESS_REG(PCIE_MEM_ACCESS_OFFSET, win));
+
+	/* Transfer data to/from the adapter as long as there's an integral
+	 * number of 32-bit transfers to complete.
+	 */
+	while (len > 0) {
+		if (dir == T4_MEMORY_READ)
+			*buf++ = (__force __be32) t4_read_reg(adap,
+							mem_base + offset);
+		else
+			t4_write_reg(adap, mem_base + offset,
+				     (__force u32) *buf++);
+		offset += sizeof(__be32);
+		len -= sizeof(__be32);
+
+		/* If we've reached the end of our current window aperture,
+		 * move the PCI-E Memory Window on to the next.  Note that
+		 * doing this here after "len" may be 0 allows us to set up
+		 * the PCI-E Memory Window for a possible final residual
+		 * transfer below ...
+		 */
+		if (offset == mem_aperture) {
+			pos += mem_aperture;
+			offset = 0;
+			t4_write_reg(adap,
+				     PCIE_MEM_ACCESS_REG(PCIE_MEM_ACCESS_OFFSET,
+							 win), pos | win_pf);
+			t4_read_reg(adap,
+				    PCIE_MEM_ACCESS_REG(PCIE_MEM_ACCESS_OFFSET,
+							win));
+		}
+	}
+
+	/* If the original transfer had a length which wasn't a multiple of
+	 * 32-bits, now's where we need to finish off the transfer of the
+	 * residual amount.  The PCI-E Memory Window has already been moved
+	 * above (if necessary) to cover this final transfer.
+	 */
+	if (resid) {
+		union {
+			__be32 word;
+			char byte[4];
+		} last;
+		unsigned char *bp;
+		int i;
+
+		if (dir == T4_MEMORY_READ) {
+			last.word = (__force __be32) t4_read_reg(adap,
+							mem_base + offset);
+			for (bp = (unsigned char *)buf, i = resid; i < 4; i++)
+				bp[i] = last.byte[i];
+		} else {
+			last.word = *buf;
+			for (i = resid; i < 4; i++)
+				last.byte[i] = 0;
+			t4_write_reg(adap, mem_base + offset,
+				     (__force u32) last.word);
+		}
+	}
+
+	return 0;
+}
+
+#define EEPROM_STAT_ADDR   0x7bfc
+#define VPD_BASE           0x400
+#define VPD_BASE_OLD       0
+#define VPD_LEN            1024
+#define CHELSIO_VPD_UNIQUE_ID 0x82
+
+/**
+ *	t4_seeprom_wp - enable/disable EEPROM write protection
+ *	@adapter: the adapter
+ *	@enable: whether to enable or disable write protection
+ *
+ *	Enables or disables write protection on the serial EEPROM.
+ */
+int t4_seeprom_wp(struct adapter *adapter, bool enable)
+{
+	unsigned int v = enable ? 0xc : 0;
+	int ret = pci_write_vpd(adapter->pdev, EEPROM_STAT_ADDR, 4, &v);
+	return ret < 0 ? ret : 0;
+}
+
+/**
+ *	get_vpd_params - read VPD parameters from VPD EEPROM
+ *	@adapter: adapter to read
+ *	@p: where to store the parameters
+ *
+ *	Reads card parameters stored in VPD EEPROM.
+ */
+int get_vpd_params(struct adapter *adapter, struct vpd_params *p)
+{
+	u32 cclk_param, cclk_val;
+	int i, ret, addr;
+	int ec, sn, pn;
+	u8 *vpd, csum;
+	unsigned int vpdr_len, kw_offset, id_len;
+
+	vpd = vmalloc(VPD_LEN);
+	if (!vpd)
+		return -ENOMEM;
+
+	ret = pci_read_vpd(adapter->pdev, VPD_BASE, sizeof(u32), vpd);
+	if (ret < 0)
+		goto out;
+
+	/* The VPD shall have a unique identifier specified by the PCI SIG.
+	 * For chelsio adapters, the identifier is 0x82. The first byte of a VPD
+	 * shall be CHELSIO_VPD_UNIQUE_ID (0x82). The VPD programming software
+	 * is expected to automatically put this entry at the
+	 * beginning of the VPD.
+	 */
+	addr = *vpd == CHELSIO_VPD_UNIQUE_ID ? VPD_BASE : VPD_BASE_OLD;
+
+	ret = pci_read_vpd(adapter->pdev, addr, VPD_LEN, vpd);
+	if (ret < 0)
+		goto out;
+
+	if (vpd[0] != PCI_VPD_LRDT_ID_STRING) {
+		dev_err(adapter->pdev_dev, "missing VPD ID string\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	id_len = pci_vpd_lrdt_size(vpd);
+	if (id_len > ID_LEN)
+		id_len = ID_LEN;
+
+	i = pci_vpd_find_tag(vpd, 0, VPD_LEN, PCI_VPD_LRDT_RO_DATA);
+	if (i < 0) {
+		dev_err(adapter->pdev_dev, "missing VPD-R section\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	vpdr_len = pci_vpd_lrdt_size(&vpd[i]);
+	kw_offset = i + PCI_VPD_LRDT_TAG_SIZE;
+	if (vpdr_len + kw_offset > VPD_LEN) {
+		dev_err(adapter->pdev_dev, "bad VPD-R length %u\n", vpdr_len);
+		ret = -EINVAL;
+		goto out;
+	}
+
+#define FIND_VPD_KW(var, name) do { \
+	var = pci_vpd_find_info_keyword(vpd, kw_offset, vpdr_len, name); \
+	if (var < 0) { \
+		dev_err(adapter->pdev_dev, "missing VPD keyword " name "\n"); \
+		ret = -EINVAL; \
+		goto out; \
+	} \
+	var += PCI_VPD_INFO_FLD_HDR_SIZE; \
+} while (0)
+
+	FIND_VPD_KW(i, "RV");
+	for (csum = 0; i >= 0; i--)
+		csum += vpd[i];
+
+	if (csum) {
+		dev_err(adapter->pdev_dev,
+			"corrupted VPD EEPROM, actual csum %u\n", csum);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	FIND_VPD_KW(ec, "EC");
+	FIND_VPD_KW(sn, "SN");
+	FIND_VPD_KW(pn, "PN");
+#undef FIND_VPD_KW
+
+	memcpy(p->id, vpd + PCI_VPD_LRDT_TAG_SIZE, id_len);
+	strim(p->id);
+	memcpy(p->ec, vpd + ec, EC_LEN);
+	strim(p->ec);
+	i = pci_vpd_info_field_size(vpd + sn - PCI_VPD_INFO_FLD_HDR_SIZE);
+	memcpy(p->sn, vpd + sn, min(i, SERNUM_LEN));
+	strim(p->sn);
+	i = pci_vpd_info_field_size(vpd + pn - PCI_VPD_INFO_FLD_HDR_SIZE);
+	memcpy(p->pn, vpd + pn, min(i, PN_LEN));
+	strim(p->pn);
+
+	/*
+	 * Ask firmware for the Core Clock since it knows how to translate the
+	 * Reference Clock ('V2') VPD field into a Core Clock value ...
+	 */
+	cclk_param = (FW_PARAMS_MNEM(FW_PARAMS_MNEM_DEV) |
+		      FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DEV_CCLK));
+	ret = t4_query_params(adapter, adapter->mbox, 0, 0,
+			      1, &cclk_param, &cclk_val);
+
+out:
+	vfree(vpd);
+	if (ret)
+		return ret;
+	p->cclk = cclk_val;
+
+	return 0;
+}
+
+/* serial flash and firmware constants */
+enum {
+	SF_ATTEMPTS = 10,             /* max retries for SF operations */
+
+	/* flash command opcodes */
+	SF_PROG_PAGE    = 2,          /* program page */
+	SF_WR_DISABLE   = 4,          /* disable writes */
+	SF_RD_STATUS    = 5,          /* read status register */
+	SF_WR_ENABLE    = 6,          /* enable writes */
+	SF_RD_DATA_FAST = 0xb,        /* read flash */
+	SF_RD_ID        = 0x9f,       /* read ID */
+	SF_ERASE_SECTOR = 0xd8,       /* erase sector */
+
+	FW_MAX_SIZE = 16 * SF_SEC_SIZE,
+};
+
+/**
+ *	sf1_read - read data from the serial flash
+ *	@adapter: the adapter
+ *	@byte_cnt: number of bytes to read
+ *	@cont: whether another operation will be chained
+ *	@lock: whether to lock SF for PL access only
+ *	@valp: where to store the read data
+ *
+ *	Reads up to 4 bytes of data from the serial flash.  The location of
+ *	the read needs to be specified prior to calling this by issuing the
+ *	appropriate commands to the serial flash.
+ */
+static int sf1_read(struct adapter *adapter, unsigned int byte_cnt, int cont,
+		    int lock, u32 *valp)
+{
+	int ret;
+
+	if (!byte_cnt || byte_cnt > 4)
+		return -EINVAL;
+	if (t4_read_reg(adapter, SF_OP) & SF_BUSY)
+		return -EBUSY;
+	cont = cont ? SF_CONT : 0;
+	lock = lock ? SF_LOCK : 0;
+	t4_write_reg(adapter, SF_OP, lock | cont | BYTECNT(byte_cnt - 1));
+	ret = t4_wait_op_done(adapter, SF_OP, SF_BUSY, 0, SF_ATTEMPTS, 5);
+	if (!ret)
+		*valp = t4_read_reg(adapter, SF_DATA);
+	return ret;
+}
+
+/**
+ *	sf1_write - write data to the serial flash
+ *	@adapter: the adapter
+ *	@byte_cnt: number of bytes to write
+ *	@cont: whether another operation will be chained
+ *	@lock: whether to lock SF for PL access only
+ *	@val: value to write
+ *
+ *	Writes up to 4 bytes of data to the serial flash.  The location of
+ *	the write needs to be specified prior to calling this by issuing the
+ *	appropriate commands to the serial flash.
+ */
+static int sf1_write(struct adapter *adapter, unsigned int byte_cnt, int cont,
+		     int lock, u32 val)
+{
+	if (!byte_cnt || byte_cnt > 4)
+		return -EINVAL;
+	if (t4_read_reg(adapter, SF_OP) & SF_BUSY)
+		return -EBUSY;
+	cont = cont ? SF_CONT : 0;
+	lock = lock ? SF_LOCK : 0;
+	t4_write_reg(adapter, SF_DATA, val);
+	t4_write_reg(adapter, SF_OP, lock |
+		     cont | BYTECNT(byte_cnt - 1) | OP_WR);
+	return t4_wait_op_done(adapter, SF_OP, SF_BUSY, 0, SF_ATTEMPTS, 5);
+}
+
+/**
+ *	flash_wait_op - wait for a flash operation to complete
+ *	@adapter: the adapter
+ *	@attempts: max number of polls of the status register
+ *	@delay: delay between polls in ms
+ *
+ *	Wait for a flash operation to complete by polling the status register.
+ */
+static int flash_wait_op(struct adapter *adapter, int attempts, int delay)
+{
+	int ret;
+	u32 status;
+
+	while (1) {
+		if ((ret = sf1_write(adapter, 1, 1, 1, SF_RD_STATUS)) != 0 ||
+		    (ret = sf1_read(adapter, 1, 0, 1, &status)) != 0)
+			return ret;
+		if (!(status & 1))
+			return 0;
+		if (--attempts == 0)
+			return -EAGAIN;
+		if (delay)
+			msleep(delay);
+	}
+}
+
+/**
+ *	t4_read_flash - read words from serial flash
+ *	@adapter: the adapter
+ *	@addr: the start address for the read
+ *	@nwords: how many 32-bit words to read
+ *	@data: where to store the read data
+ *	@byte_oriented: whether to store data as bytes or as words
+ *
+ *	Read the specified number of 32-bit words from the serial flash.
+ *	If @byte_oriented is set the read data is stored as a byte array
+ *	(i.e., big-endian), otherwise as 32-bit words in the platform's
+ *	natural endianess.
+ */
+static int t4_read_flash(struct adapter *adapter, unsigned int addr,
+			 unsigned int nwords, u32 *data, int byte_oriented)
+{
+	int ret;
+
+	if (addr + nwords * sizeof(u32) > adapter->params.sf_size || (addr & 3))
+		return -EINVAL;
+
+	addr = swab32(addr) | SF_RD_DATA_FAST;
+
+	if ((ret = sf1_write(adapter, 4, 1, 0, addr)) != 0 ||
+	    (ret = sf1_read(adapter, 1, 1, 0, data)) != 0)
+		return ret;
+
+	for ( ; nwords; nwords--, data++) {
+		ret = sf1_read(adapter, 4, nwords > 1, nwords == 1, data);
+		if (nwords == 1)
+			t4_write_reg(adapter, SF_OP, 0);    /* unlock SF */
+		if (ret)
+			return ret;
+		if (byte_oriented)
+			*data = (__force __u32) (htonl(*data));
+	}
+	return 0;
+}
+
+/**
+ *	t4_write_flash - write up to a page of data to the serial flash
+ *	@adapter: the adapter
+ *	@addr: the start address to write
+ *	@n: length of data to write in bytes
+ *	@data: the data to write
+ *
+ *	Writes up to a page of data (256 bytes) to the serial flash starting
+ *	at the given address.  All the data must be written to the same page.
+ */
+static int t4_write_flash(struct adapter *adapter, unsigned int addr,
+			  unsigned int n, const u8 *data)
+{
+	int ret;
+	u32 buf[64];
+	unsigned int i, c, left, val, offset = addr & 0xff;
+
+	if (addr >= adapter->params.sf_size || offset + n > SF_PAGE_SIZE)
+		return -EINVAL;
+
+	val = swab32(addr) | SF_PROG_PAGE;
+
+	if ((ret = sf1_write(adapter, 1, 0, 1, SF_WR_ENABLE)) != 0 ||
+	    (ret = sf1_write(adapter, 4, 1, 1, val)) != 0)
+		goto unlock;
+
+	for (left = n; left; left -= c) {
+		c = min(left, 4U);
+		for (val = 0, i = 0; i < c; ++i)
+			val = (val << 8) + *data++;
+
+		ret = sf1_write(adapter, c, c != left, 1, val);
+		if (ret)
+			goto unlock;
+	}
+	ret = flash_wait_op(adapter, 8, 1);
+	if (ret)
+		goto unlock;
+
+	t4_write_reg(adapter, SF_OP, 0);    /* unlock SF */
+
+	/* Read the page to verify the write succeeded */
+	ret = t4_read_flash(adapter, addr & ~0xff, ARRAY_SIZE(buf), buf, 1);
+	if (ret)
+		return ret;
+
+	if (memcmp(data - n, (u8 *)buf + offset, n)) {
+		dev_err(adapter->pdev_dev,
+			"failed to correctly write the flash page at %#x\n",
+			addr);
+		return -EIO;
+	}
+	return 0;
+
+unlock:
+	t4_write_reg(adapter, SF_OP, 0);    /* unlock SF */
+	return ret;
+}
+
+/**
+ *	t4_get_fw_version - read the firmware version
+ *	@adapter: the adapter
+ *	@vers: where to place the version
+ *
+ *	Reads the FW version from flash.
+ */
+int t4_get_fw_version(struct adapter *adapter, u32 *vers)
+{
+	return t4_read_flash(adapter, FLASH_FW_START +
+			     offsetof(struct fw_hdr, fw_ver), 1,
+			     vers, 0);
+}
+
+/**
+ *	t4_get_tp_version - read the TP microcode version
+ *	@adapter: the adapter
+ *	@vers: where to place the version
+ *
+ *	Reads the TP microcode version from flash.
+ */
+int t4_get_tp_version(struct adapter *adapter, u32 *vers)
+{
+	return t4_read_flash(adapter, FLASH_FW_START +
+			     offsetof(struct fw_hdr, tp_microcode_ver),
+			     1, vers, 0);
+}
+
+/* Is the given firmware API compatible with the one the driver was compiled
+ * with?
+ */
+static int fw_compatible(const struct fw_hdr *hdr1, const struct fw_hdr *hdr2)
+{
+
+	/* short circuit if it's the exact same firmware version */
+	if (hdr1->chip == hdr2->chip && hdr1->fw_ver == hdr2->fw_ver)
+		return 1;
+
+#define SAME_INTF(x) (hdr1->intfver_##x == hdr2->intfver_##x)
+	if (hdr1->chip == hdr2->chip && SAME_INTF(nic) && SAME_INTF(vnic) &&
+	    SAME_INTF(ri) && SAME_INTF(iscsi) && SAME_INTF(fcoe))
+		return 1;
+#undef SAME_INTF
+
+	return 0;
+}
+
+/* The firmware in the filesystem is usable, but should it be installed?
+ * This routine explains itself in detail if it indicates the filesystem
+ * firmware should be installed.
+ */
+static int should_install_fs_fw(struct adapter *adap, int card_fw_usable,
+				int k, int c)
+{
+	const char *reason;
+
+	if (!card_fw_usable) {
+		reason = "incompatible or unusable";
+		goto install;
+	}
+
+	if (k > c) {
+		reason = "older than the version supported with this driver";
+		goto install;
+	}
+
+	return 0;
+
+install:
+	dev_err(adap->pdev_dev, "firmware on card (%u.%u.%u.%u) is %s, "
+		"installing firmware %u.%u.%u.%u on card.\n",
+		FW_HDR_FW_VER_MAJOR_GET(c), FW_HDR_FW_VER_MINOR_GET(c),
+		FW_HDR_FW_VER_MICRO_GET(c), FW_HDR_FW_VER_BUILD_GET(c), reason,
+		FW_HDR_FW_VER_MAJOR_GET(k), FW_HDR_FW_VER_MINOR_GET(k),
+		FW_HDR_FW_VER_MICRO_GET(k), FW_HDR_FW_VER_BUILD_GET(k));
+
+	return 1;
+}
+
+int t4_prep_fw(struct adapter *adap, struct fw_info *fw_info,
+	       const u8 *fw_data, unsigned int fw_size,
+	       struct fw_hdr *card_fw, enum dev_state state,
+	       int *reset)
+{
+	int ret, card_fw_usable, fs_fw_usable;
+	const struct fw_hdr *fs_fw;
+	const struct fw_hdr *drv_fw;
+
+	drv_fw = &fw_info->fw_hdr;
+
+	/* Read the header of the firmware on the card */
+	ret = -t4_read_flash(adap, FLASH_FW_START,
+			    sizeof(*card_fw) / sizeof(uint32_t),
+			    (uint32_t *)card_fw, 1);
+	if (ret == 0) {
+		card_fw_usable = fw_compatible(drv_fw, (const void *)card_fw);
+	} else {
+		dev_err(adap->pdev_dev,
+			"Unable to read card's firmware header: %d\n", ret);
+		card_fw_usable = 0;
+	}
+
+	if (fw_data != NULL) {
+		fs_fw = (const void *)fw_data;
+		fs_fw_usable = fw_compatible(drv_fw, fs_fw);
+	} else {
+		fs_fw = NULL;
+		fs_fw_usable = 0;
+	}
+
+	if (card_fw_usable && card_fw->fw_ver == drv_fw->fw_ver &&
+	    (!fs_fw_usable || fs_fw->fw_ver == drv_fw->fw_ver)) {
+		/* Common case: the firmware on the card is an exact match and
+		 * the filesystem one is an exact match too, or the filesystem
+		 * one is absent/incompatible.
+		 */
+	} else if (fs_fw_usable && state == DEV_STATE_UNINIT &&
+		   should_install_fs_fw(adap, card_fw_usable,
+					be32_to_cpu(fs_fw->fw_ver),
+					be32_to_cpu(card_fw->fw_ver))) {
+		ret = -t4_fw_upgrade(adap, adap->mbox, fw_data,
+				     fw_size, 0);
+		if (ret != 0) {
+			dev_err(adap->pdev_dev,
+				"failed to install firmware: %d\n", ret);
+			goto bye;
+		}
+
+		/* Installed successfully, update the cached header too. */
+		memcpy(card_fw, fs_fw, sizeof(*card_fw));
+		card_fw_usable = 1;
+		*reset = 0;	/* already reset as part of load_fw */
+	}
+
+	if (!card_fw_usable) {
+		uint32_t d, c, k;
+
+		d = be32_to_cpu(drv_fw->fw_ver);
+		c = be32_to_cpu(card_fw->fw_ver);
+		k = fs_fw ? be32_to_cpu(fs_fw->fw_ver) : 0;
+
+		dev_err(adap->pdev_dev, "Cannot find a usable firmware: "
+			"chip state %d, "
+			"driver compiled with %d.%d.%d.%d, "
+			"card has %d.%d.%d.%d, filesystem has %d.%d.%d.%d\n",
+			state,
+			FW_HDR_FW_VER_MAJOR_GET(d), FW_HDR_FW_VER_MINOR_GET(d),
+			FW_HDR_FW_VER_MICRO_GET(d), FW_HDR_FW_VER_BUILD_GET(d),
+			FW_HDR_FW_VER_MAJOR_GET(c), FW_HDR_FW_VER_MINOR_GET(c),
+			FW_HDR_FW_VER_MICRO_GET(c), FW_HDR_FW_VER_BUILD_GET(c),
+			FW_HDR_FW_VER_MAJOR_GET(k), FW_HDR_FW_VER_MINOR_GET(k),
+			FW_HDR_FW_VER_MICRO_GET(k), FW_HDR_FW_VER_BUILD_GET(k));
+		ret = EINVAL;
+		goto bye;
+	}
+
+	/* We're using whatever's on the card and it's known to be good. */
+	adap->params.fw_vers = be32_to_cpu(card_fw->fw_ver);
+	adap->params.tp_vers = be32_to_cpu(card_fw->tp_microcode_ver);
+
+bye:
+	return ret;
+}
+
+/**
+ *	t4_flash_erase_sectors - erase a range of flash sectors
+ *	@adapter: the adapter
+ *	@start: the first sector to erase
+ *	@end: the last sector to erase
+ *
+ *	Erases the sectors in the given inclusive range.
+ */
+static int t4_flash_erase_sectors(struct adapter *adapter, int start, int end)
+{
+	int ret = 0;
+
+	if (end >= adapter->params.sf_nsec)
+		return -EINVAL;
+
+	while (start <= end) {
+		if ((ret = sf1_write(adapter, 1, 0, 1, SF_WR_ENABLE)) != 0 ||
+		    (ret = sf1_write(adapter, 4, 0, 1,
+				     SF_ERASE_SECTOR | (start << 8))) != 0 ||
+		    (ret = flash_wait_op(adapter, 14, 500)) != 0) {
+			dev_err(adapter->pdev_dev,
+				"erase of flash sector %d failed, error %d\n",
+				start, ret);
+			break;
+		}
+		start++;
+	}
+	t4_write_reg(adapter, SF_OP, 0);    /* unlock SF */
+	return ret;
+}
+
+/**
+ *	t4_flash_cfg_addr - return the address of the flash configuration file
+ *	@adapter: the adapter
+ *
+ *	Return the address within the flash where the Firmware Configuration
+ *	File is stored.
+ */
+unsigned int t4_flash_cfg_addr(struct adapter *adapter)
+{
+	if (adapter->params.sf_size == 0x100000)
+		return FLASH_FPGA_CFG_START;
+	else
+		return FLASH_CFG_START;
+}
+
+/**
+ *	t4_load_fw - download firmware
+ *	@adap: the adapter
+ *	@fw_data: the firmware image to write
+ *	@size: image size
+ *
+ *	Write the supplied firmware image to the card's serial flash.
+ */
+int t4_load_fw(struct adapter *adap, const u8 *fw_data, unsigned int size)
+{
+	u32 csum;
+	int ret, addr;
+	unsigned int i;
+	u8 first_page[SF_PAGE_SIZE];
+	const __be32 *p = (const __be32 *)fw_data;
+	const struct fw_hdr *hdr = (const struct fw_hdr *)fw_data;
+	unsigned int sf_sec_size = adap->params.sf_size / adap->params.sf_nsec;
+	unsigned int fw_img_start = adap->params.sf_fw_start;
+	unsigned int fw_start_sec = fw_img_start / sf_sec_size;
+
+	if (!size) {
+		dev_err(adap->pdev_dev, "FW image has no data\n");
+		return -EINVAL;
+	}
+	if (size & 511) {
+		dev_err(adap->pdev_dev,
+			"FW image size not multiple of 512 bytes\n");
+		return -EINVAL;
+	}
+	if (ntohs(hdr->len512) * 512 != size) {
+		dev_err(adap->pdev_dev,
+			"FW image size differs from size in FW header\n");
+		return -EINVAL;
+	}
+	if (size > FW_MAX_SIZE) {
+		dev_err(adap->pdev_dev, "FW image too large, max is %u bytes\n",
+			FW_MAX_SIZE);
+		return -EFBIG;
+	}
+
+	for (csum = 0, i = 0; i < size / sizeof(csum); i++)
+		csum += ntohl(p[i]);
+
+	if (csum != 0xffffffff) {
+		dev_err(adap->pdev_dev,
+			"corrupted firmware image, checksum %#x\n", csum);
+		return -EINVAL;
+	}
+
+	i = DIV_ROUND_UP(size, sf_sec_size);        /* # of sectors spanned */
+	ret = t4_flash_erase_sectors(adap, fw_start_sec, fw_start_sec + i - 1);
+	if (ret)
+		goto out;
+
+	/*
+	 * We write the correct version at the end so the driver can see a bad
+	 * version if the FW write fails.  Start by writing a copy of the
+	 * first page with a bad version.
+	 */
+	memcpy(first_page, fw_data, SF_PAGE_SIZE);
+	((struct fw_hdr *)first_page)->fw_ver = htonl(0xffffffff);
+	ret = t4_write_flash(adap, fw_img_start, SF_PAGE_SIZE, first_page);
+	if (ret)
+		goto out;
+
+	addr = fw_img_start;
+	for (size -= SF_PAGE_SIZE; size; size -= SF_PAGE_SIZE) {
+		addr += SF_PAGE_SIZE;
+		fw_data += SF_PAGE_SIZE;
+		ret = t4_write_flash(adap, addr, SF_PAGE_SIZE, fw_data);
+		if (ret)
+			goto out;
+	}
+
+	ret = t4_write_flash(adap,
+			     fw_img_start + offsetof(struct fw_hdr, fw_ver),
+			     sizeof(hdr->fw_ver), (const u8 *)&hdr->fw_ver);
+out:
+	if (ret)
+		dev_err(adap->pdev_dev, "firmware download failed, error %d\n",
+			ret);
+	return ret;
+}
+
+#define ADVERT_MASK (FW_PORT_CAP_SPEED_100M | FW_PORT_CAP_SPEED_1G |\
+		     FW_PORT_CAP_SPEED_10G | FW_PORT_CAP_SPEED_40G | \
+		     FW_PORT_CAP_ANEG)
+
+/**
+ *	t4_link_start - apply link configuration to MAC/PHY
+ *	@phy: the PHY to setup
+ *	@mac: the MAC to setup
+ *	@lc: the requested link configuration
+ *
+ *	Set up a port's MAC and PHY according to a desired link configuration.
+ *	- If the PHY can auto-negotiate first decide what to advertise, then
+ *	  enable/disable auto-negotiation as desired, and reset.
+ *	- If the PHY does not auto-negotiate just reset it.
+ *	- If auto-negotiation is off set the MAC to the proper speed/duplex/FC,
+ *	  otherwise do it later based on the outcome of auto-negotiation.
+ */
+int t4_link_start(struct adapter *adap, unsigned int mbox, unsigned int port,
+		  struct link_config *lc)
+{
+	struct fw_port_cmd c;
+	unsigned int fc = 0, mdi = FW_PORT_MDI(FW_PORT_MDI_AUTO);
+
+	lc->link_ok = 0;
+	if (lc->requested_fc & PAUSE_RX)
+		fc |= FW_PORT_CAP_FC_RX;
+	if (lc->requested_fc & PAUSE_TX)
+		fc |= FW_PORT_CAP_FC_TX;
+
+	memset(&c, 0, sizeof(c));
+	c.op_to_portid = htonl(FW_CMD_OP(FW_PORT_CMD) | FW_CMD_REQUEST |
+			       FW_CMD_EXEC | FW_PORT_CMD_PORTID(port));
+	c.action_to_len16 = htonl(FW_PORT_CMD_ACTION(FW_PORT_ACTION_L1_CFG) |
+				  FW_LEN16(c));
+
+	if (!(lc->supported & FW_PORT_CAP_ANEG)) {
+		c.u.l1cfg.rcap = htonl((lc->supported & ADVERT_MASK) | fc);
+		lc->fc = lc->requested_fc & (PAUSE_RX | PAUSE_TX);
+	} else if (lc->autoneg == AUTONEG_DISABLE) {
+		c.u.l1cfg.rcap = htonl(lc->requested_speed | fc | mdi);
+		lc->fc = lc->requested_fc & (PAUSE_RX | PAUSE_TX);
+	} else
+		c.u.l1cfg.rcap = htonl(lc->advertising | fc | mdi);
+
+	return t4_wr_mbox(adap, mbox, &c, sizeof(c), NULL);
+}
+
+/**
+ *	t4_restart_aneg - restart autonegotiation
+ *	@adap: the adapter
+ *	@mbox: mbox to use for the FW command
+ *	@port: the port id
+ *
+ *	Restarts autonegotiation for the selected port.
+ */
+int t4_restart_aneg(struct adapter *adap, unsigned int mbox, unsigned int port)
+{
+	struct fw_port_cmd c;
+
+	memset(&c, 0, sizeof(c));
+	c.op_to_portid = htonl(FW_CMD_OP(FW_PORT_CMD) | FW_CMD_REQUEST |
+			       FW_CMD_EXEC | FW_PORT_CMD_PORTID(port));
+	c.action_to_len16 = htonl(FW_PORT_CMD_ACTION(FW_PORT_ACTION_L1_CFG) |
+				  FW_LEN16(c));
+	c.u.l1cfg.rcap = htonl(FW_PORT_CAP_ANEG);
+	return t4_wr_mbox(adap, mbox, &c, sizeof(c), NULL);
+}
+
+typedef void (*int_handler_t)(struct adapter *adap);
+
+struct intr_info {
+	unsigned int mask;       /* bits to check in interrupt status */
+	const char *msg;         /* message to print or NULL */
+	short stat_idx;          /* stat counter to increment or -1 */
+	unsigned short fatal;    /* whether the condition reported is fatal */
+	int_handler_t int_handler; /* platform-specific int handler */
+};
+
+/**
+ *	t4_handle_intr_status - table driven interrupt handler
+ *	@adapter: the adapter that generated the interrupt
+ *	@reg: the interrupt status register to process
+ *	@acts: table of interrupt actions
+ *
+ *	A table driven interrupt handler that applies a set of masks to an
+ *	interrupt status word and performs the corresponding actions if the
+ *	interrupts described by the mask have occurred.  The actions include
+ *	optionally emitting a warning or alert message.  The table is terminated
+ *	by an entry specifying mask 0.  Returns the number of fatal interrupt
+ *	conditions.
+ */
+static int t4_handle_intr_status(struct adapter *adapter, unsigned int reg,
+				 const struct intr_info *acts)
+{
+	int fatal = 0;
+	unsigned int mask = 0;
+	unsigned int status = t4_read_reg(adapter, reg);
+
+	for ( ; acts->mask; ++acts) {
+		if (!(status & acts->mask))
+			continue;
+		if (acts->fatal) {
+			fatal++;
+			dev_alert(adapter->pdev_dev, "%s (0x%x)\n", acts->msg,
+				  status & acts->mask);
+		} else if (acts->msg && printk_ratelimit())
+			dev_warn(adapter->pdev_dev, "%s (0x%x)\n", acts->msg,
+				 status & acts->mask);
+		if (acts->int_handler)
+			acts->int_handler(adapter);
+		mask |= acts->mask;
+	}
+	status &= mask;
+	if (status)                           /* clear processed interrupts */
+		t4_write_reg(adapter, reg, status);
+	return fatal;
+}
+
+/*
+ * Interrupt handler for the PCIE module.
+ */
+static void pcie_intr_handler(struct adapter *adapter)
+{
+	static const struct intr_info sysbus_intr_info[] = {
+		{ RNPP, "RXNP array parity error", -1, 1 },
+		{ RPCP, "RXPC array parity error", -1, 1 },
+		{ RCIP, "RXCIF array parity error", -1, 1 },
+		{ RCCP, "Rx completions control array parity error", -1, 1 },
+		{ RFTP, "RXFT array parity error", -1, 1 },
+		{ 0 }
+	};
+	static const struct intr_info pcie_port_intr_info[] = {
+		{ TPCP, "TXPC array parity error", -1, 1 },
+		{ TNPP, "TXNP array parity error", -1, 1 },
+		{ TFTP, "TXFT array parity error", -1, 1 },
+		{ TCAP, "TXCA array parity error", -1, 1 },
+		{ TCIP, "TXCIF array parity error", -1, 1 },
+		{ RCAP, "RXCA array parity error", -1, 1 },
+		{ OTDD, "outbound request TLP discarded", -1, 1 },
+		{ RDPE, "Rx data parity error", -1, 1 },
+		{ TDUE, "Tx uncorrectable data error", -1, 1 },
+		{ 0 }
+	};
+	static const struct intr_info pcie_intr_info[] = {
+		{ MSIADDRLPERR, "MSI AddrL parity error", -1, 1 },
+		{ MSIADDRHPERR, "MSI AddrH parity error", -1, 1 },
+		{ MSIDATAPERR, "MSI data parity error", -1, 1 },
+		{ MSIXADDRLPERR, "MSI-X AddrL parity error", -1, 1 },
+		{ MSIXADDRHPERR, "MSI-X AddrH parity error", -1, 1 },
+		{ MSIXDATAPERR, "MSI-X data parity error", -1, 1 },
+		{ MSIXDIPERR, "MSI-X DI parity error", -1, 1 },
+		{ PIOCPLPERR, "PCI PIO completion FIFO parity error", -1, 1 },
+		{ PIOREQPERR, "PCI PIO request FIFO parity error", -1, 1 },
+		{ TARTAGPERR, "PCI PCI target tag FIFO parity error", -1, 1 },
+		{ CCNTPERR, "PCI CMD channel count parity error", -1, 1 },
+		{ CREQPERR, "PCI CMD channel request parity error", -1, 1 },
+		{ CRSPPERR, "PCI CMD channel response parity error", -1, 1 },
+		{ DCNTPERR, "PCI DMA channel count parity error", -1, 1 },
+		{ DREQPERR, "PCI DMA channel request parity error", -1, 1 },
+		{ DRSPPERR, "PCI DMA channel response parity error", -1, 1 },
+		{ HCNTPERR, "PCI HMA channel count parity error", -1, 1 },
+		{ HREQPERR, "PCI HMA channel request parity error", -1, 1 },
+		{ HRSPPERR, "PCI HMA channel response parity error", -1, 1 },
+		{ CFGSNPPERR, "PCI config snoop FIFO parity error", -1, 1 },
+		{ FIDPERR, "PCI FID parity error", -1, 1 },
+		{ INTXCLRPERR, "PCI INTx clear parity error", -1, 1 },
+		{ MATAGPERR, "PCI MA tag parity error", -1, 1 },
+		{ PIOTAGPERR, "PCI PIO tag parity error", -1, 1 },
+		{ RXCPLPERR, "PCI Rx completion parity error", -1, 1 },
+		{ RXWRPERR, "PCI Rx write parity error", -1, 1 },
+		{ RPLPERR, "PCI replay buffer parity error", -1, 1 },
+		{ PCIESINT, "PCI core secondary fault", -1, 1 },
+		{ PCIEPINT, "PCI core primary fault", -1, 1 },
+		{ UNXSPLCPLERR, "PCI unexpected split completion error", -1, 0 },
+		{ 0 }
+	};
+
+	static struct intr_info t5_pcie_intr_info[] = {
+		{ MSTGRPPERR, "Master Response Read Queue parity error",
+		  -1, 1 },
+		{ MSTTIMEOUTPERR, "Master Timeout FIFO parity error", -1, 1 },
+		{ MSIXSTIPERR, "MSI-X STI SRAM parity error", -1, 1 },
+		{ MSIXADDRLPERR, "MSI-X AddrL parity error", -1, 1 },
+		{ MSIXADDRHPERR, "MSI-X AddrH parity error", -1, 1 },
+		{ MSIXDATAPERR, "MSI-X data parity error", -1, 1 },
+		{ MSIXDIPERR, "MSI-X DI parity error", -1, 1 },
+		{ PIOCPLGRPPERR, "PCI PIO completion Group FIFO parity error",
+		  -1, 1 },
+		{ PIOREQGRPPERR, "PCI PIO request Group FIFO parity error",
+		  -1, 1 },
+		{ TARTAGPERR, "PCI PCI target tag FIFO parity error", -1, 1 },
+		{ MSTTAGQPERR, "PCI master tag queue parity error", -1, 1 },
+		{ CREQPERR, "PCI CMD channel request parity error", -1, 1 },
+		{ CRSPPERR, "PCI CMD channel response parity error", -1, 1 },
+		{ DREQWRPERR, "PCI DMA channel write request parity error",
+		  -1, 1 },
+		{ DREQPERR, "PCI DMA channel request parity error", -1, 1 },
+		{ DRSPPERR, "PCI DMA channel response parity error", -1, 1 },
+		{ HREQWRPERR, "PCI HMA channel count parity error", -1, 1 },
+		{ HREQPERR, "PCI HMA channel request parity error", -1, 1 },
+		{ HRSPPERR, "PCI HMA channel response parity error", -1, 1 },
+		{ CFGSNPPERR, "PCI config snoop FIFO parity error", -1, 1 },
+		{ FIDPERR, "PCI FID parity error", -1, 1 },
+		{ VFIDPERR, "PCI INTx clear parity error", -1, 1 },
+		{ MAGRPPERR, "PCI MA group FIFO parity error", -1, 1 },
+		{ PIOTAGPERR, "PCI PIO tag parity error", -1, 1 },
+		{ IPRXHDRGRPPERR, "PCI IP Rx header group parity error",
+		  -1, 1 },
+		{ IPRXDATAGRPPERR, "PCI IP Rx data group parity error", -1, 1 },
+		{ RPLPERR, "PCI IP replay buffer parity error", -1, 1 },
+		{ IPSOTPERR, "PCI IP SOT buffer parity error", -1, 1 },
+		{ TRGT1GRPPERR, "PCI TRGT1 group FIFOs parity error", -1, 1 },
+		{ READRSPERR, "Outbound read error", -1, 0 },
+		{ 0 }
+	};
+
+	int fat;
+
+	if (is_t4(adapter->params.chip))
+		fat = t4_handle_intr_status(adapter,
+					    PCIE_CORE_UTL_SYSTEM_BUS_AGENT_STATUS,
+					    sysbus_intr_info) +
+			t4_handle_intr_status(adapter,
+					      PCIE_CORE_UTL_PCI_EXPRESS_PORT_STATUS,
+					      pcie_port_intr_info) +
+			t4_handle_intr_status(adapter, PCIE_INT_CAUSE,
+					      pcie_intr_info);
+	else
+		fat = t4_handle_intr_status(adapter, PCIE_INT_CAUSE,
+					    t5_pcie_intr_info);
+
+	if (fat)
+		t4_fatal_err(adapter);
+}
+
+/*
+ * TP interrupt handler.
+ */
+static void tp_intr_handler(struct adapter *adapter)
+{
+	static const struct intr_info tp_intr_info[] = {
+		{ 0x3fffffff, "TP parity error", -1, 1 },
+		{ FLMTXFLSTEMPTY, "TP out of Tx pages", -1, 1 },
+		{ 0 }
+	};
+
+	if (t4_handle_intr_status(adapter, TP_INT_CAUSE, tp_intr_info))
+		t4_fatal_err(adapter);
+}
+
+/*
+ * SGE interrupt handler.
+ */
+static void sge_intr_handler(struct adapter *adapter)
+{
+	u64 v;
+
+	static const struct intr_info sge_intr_info[] = {
+		{ ERR_CPL_EXCEED_IQE_SIZE,
+		  "SGE received CPL exceeding IQE size", -1, 1 },
+		{ ERR_INVALID_CIDX_INC,
+		  "SGE GTS CIDX increment too large", -1, 0 },
+		{ ERR_CPL_OPCODE_0, "SGE received 0-length CPL", -1, 0 },
+		{ DBFIFO_LP_INT, NULL, -1, 0, t4_db_full },
+		{ DBFIFO_HP_INT, NULL, -1, 0, t4_db_full },
+		{ ERR_DROPPED_DB, NULL, -1, 0, t4_db_dropped },
+		{ ERR_DATA_CPL_ON_HIGH_QID1 | ERR_DATA_CPL_ON_HIGH_QID0,
+		  "SGE IQID > 1023 received CPL for FL", -1, 0 },
+		{ ERR_BAD_DB_PIDX3, "SGE DBP 3 pidx increment too large", -1,
+		  0 },
+		{ ERR_BAD_DB_PIDX2, "SGE DBP 2 pidx increment too large", -1,
+		  0 },
+		{ ERR_BAD_DB_PIDX1, "SGE DBP 1 pidx increment too large", -1,
+		  0 },
+		{ ERR_BAD_DB_PIDX0, "SGE DBP 0 pidx increment too large", -1,
+		  0 },
+		{ ERR_ING_CTXT_PRIO,
+		  "SGE too many priority ingress contexts", -1, 0 },
+		{ ERR_EGR_CTXT_PRIO,
+		  "SGE too many priority egress contexts", -1, 0 },
+		{ INGRESS_SIZE_ERR, "SGE illegal ingress QID", -1, 0 },
+		{ EGRESS_SIZE_ERR, "SGE illegal egress QID", -1, 0 },
+		{ 0 }
+	};
+
+	v = (u64)t4_read_reg(adapter, SGE_INT_CAUSE1) |
+		((u64)t4_read_reg(adapter, SGE_INT_CAUSE2) << 32);
+	if (v) {
+		dev_alert(adapter->pdev_dev, "SGE parity error (%#llx)\n",
+				(unsigned long long)v);
+		t4_write_reg(adapter, SGE_INT_CAUSE1, v);
+		t4_write_reg(adapter, SGE_INT_CAUSE2, v >> 32);
+	}
+
+	if (t4_handle_intr_status(adapter, SGE_INT_CAUSE3, sge_intr_info) ||
+	    v != 0)
+		t4_fatal_err(adapter);
+}
+
+/*
+ * CIM interrupt handler.
+ */
+static void cim_intr_handler(struct adapter *adapter)
+{
+	static const struct intr_info cim_intr_info[] = {
+		{ PREFDROPINT, "CIM control register prefetch drop", -1, 1 },
+		{ OBQPARERR, "CIM OBQ parity error", -1, 1 },
+		{ IBQPARERR, "CIM IBQ parity error", -1, 1 },
+		{ MBUPPARERR, "CIM mailbox uP parity error", -1, 1 },
+		{ MBHOSTPARERR, "CIM mailbox host parity error", -1, 1 },
+		{ TIEQINPARERRINT, "CIM TIEQ outgoing parity error", -1, 1 },
+		{ TIEQOUTPARERRINT, "CIM TIEQ incoming parity error", -1, 1 },
+		{ 0 }
+	};
+	static const struct intr_info cim_upintr_info[] = {
+		{ RSVDSPACEINT, "CIM reserved space access", -1, 1 },
+		{ ILLTRANSINT, "CIM illegal transaction", -1, 1 },
+		{ ILLWRINT, "CIM illegal write", -1, 1 },
+		{ ILLRDINT, "CIM illegal read", -1, 1 },
+		{ ILLRDBEINT, "CIM illegal read BE", -1, 1 },
+		{ ILLWRBEINT, "CIM illegal write BE", -1, 1 },
+		{ SGLRDBOOTINT, "CIM single read from boot space", -1, 1 },
+		{ SGLWRBOOTINT, "CIM single write to boot space", -1, 1 },
+		{ BLKWRBOOTINT, "CIM block write to boot space", -1, 1 },
+		{ SGLRDFLASHINT, "CIM single read from flash space", -1, 1 },
+		{ SGLWRFLASHINT, "CIM single write to flash space", -1, 1 },
+		{ BLKWRFLASHINT, "CIM block write to flash space", -1, 1 },
+		{ SGLRDEEPROMINT, "CIM single EEPROM read", -1, 1 },
+		{ SGLWREEPROMINT, "CIM single EEPROM write", -1, 1 },
+		{ BLKRDEEPROMINT, "CIM block EEPROM read", -1, 1 },
+		{ BLKWREEPROMINT, "CIM block EEPROM write", -1, 1 },
+		{ SGLRDCTLINT , "CIM single read from CTL space", -1, 1 },
+		{ SGLWRCTLINT , "CIM single write to CTL space", -1, 1 },
+		{ BLKRDCTLINT , "CIM block read from CTL space", -1, 1 },
+		{ BLKWRCTLINT , "CIM block write to CTL space", -1, 1 },
+		{ SGLRDPLINT , "CIM single read from PL space", -1, 1 },
+		{ SGLWRPLINT , "CIM single write to PL space", -1, 1 },
+		{ BLKRDPLINT , "CIM block read from PL space", -1, 1 },
+		{ BLKWRPLINT , "CIM block write to PL space", -1, 1 },
+		{ REQOVRLOOKUPINT , "CIM request FIFO overwrite", -1, 1 },
+		{ RSPOVRLOOKUPINT , "CIM response FIFO overwrite", -1, 1 },
+		{ TIMEOUTINT , "CIM PIF timeout", -1, 1 },
+		{ TIMEOUTMAINT , "CIM PIF MA timeout", -1, 1 },
+		{ 0 }
+	};
+
+	int fat;
+
+	if (t4_read_reg(adapter, MA_PCIE_FW) & FW_PCIE_FW_ERR)
+		t4_report_fw_error(adapter);
+
+	fat = t4_handle_intr_status(adapter, CIM_HOST_INT_CAUSE,
+				    cim_intr_info) +
+	      t4_handle_intr_status(adapter, CIM_HOST_UPACC_INT_CAUSE,
+				    cim_upintr_info);
+	if (fat)
+		t4_fatal_err(adapter);
+}
+
+/*
+ * ULP RX interrupt handler.
+ */
+static void ulprx_intr_handler(struct adapter *adapter)
+{
+	static const struct intr_info ulprx_intr_info[] = {
+		{ 0x1800000, "ULPRX context error", -1, 1 },
+		{ 0x7fffff, "ULPRX parity error", -1, 1 },
+		{ 0 }
+	};
+
+	if (t4_handle_intr_status(adapter, ULP_RX_INT_CAUSE, ulprx_intr_info))
+		t4_fatal_err(adapter);
+}
+
+/*
+ * ULP TX interrupt handler.
+ */
+static void ulptx_intr_handler(struct adapter *adapter)
+{
+	static const struct intr_info ulptx_intr_info[] = {
+		{ PBL_BOUND_ERR_CH3, "ULPTX channel 3 PBL out of bounds", -1,
+		  0 },
+		{ PBL_BOUND_ERR_CH2, "ULPTX channel 2 PBL out of bounds", -1,
+		  0 },
+		{ PBL_BOUND_ERR_CH1, "ULPTX channel 1 PBL out of bounds", -1,
+		  0 },
+		{ PBL_BOUND_ERR_CH0, "ULPTX channel 0 PBL out of bounds", -1,
+		  0 },
+		{ 0xfffffff, "ULPTX parity error", -1, 1 },
+		{ 0 }
+	};
+
+	if (t4_handle_intr_status(adapter, ULP_TX_INT_CAUSE, ulptx_intr_info))
+		t4_fatal_err(adapter);
+}
+
+/*
+ * PM TX interrupt handler.
+ */
+static void pmtx_intr_handler(struct adapter *adapter)
+{
+	static const struct intr_info pmtx_intr_info[] = {
+		{ PCMD_LEN_OVFL0, "PMTX channel 0 pcmd too large", -1, 1 },
+		{ PCMD_LEN_OVFL1, "PMTX channel 1 pcmd too large", -1, 1 },
+		{ PCMD_LEN_OVFL2, "PMTX channel 2 pcmd too large", -1, 1 },
+		{ ZERO_C_CMD_ERROR, "PMTX 0-length pcmd", -1, 1 },
+		{ PMTX_FRAMING_ERROR, "PMTX framing error", -1, 1 },
+		{ OESPI_PAR_ERROR, "PMTX oespi parity error", -1, 1 },
+		{ DB_OPTIONS_PAR_ERROR, "PMTX db_options parity error", -1, 1 },
+		{ ICSPI_PAR_ERROR, "PMTX icspi parity error", -1, 1 },
+		{ C_PCMD_PAR_ERROR, "PMTX c_pcmd parity error", -1, 1},
+		{ 0 }
+	};
+
+	if (t4_handle_intr_status(adapter, PM_TX_INT_CAUSE, pmtx_intr_info))
+		t4_fatal_err(adapter);
+}
+
+/*
+ * PM RX interrupt handler.
+ */
+static void pmrx_intr_handler(struct adapter *adapter)
+{
+	static const struct intr_info pmrx_intr_info[] = {
+		{ ZERO_E_CMD_ERROR, "PMRX 0-length pcmd", -1, 1 },
+		{ PMRX_FRAMING_ERROR, "PMRX framing error", -1, 1 },
+		{ OCSPI_PAR_ERROR, "PMRX ocspi parity error", -1, 1 },
+		{ DB_OPTIONS_PAR_ERROR, "PMRX db_options parity error", -1, 1 },
+		{ IESPI_PAR_ERROR, "PMRX iespi parity error", -1, 1 },
+		{ E_PCMD_PAR_ERROR, "PMRX e_pcmd parity error", -1, 1},
+		{ 0 }
+	};
+
+	if (t4_handle_intr_status(adapter, PM_RX_INT_CAUSE, pmrx_intr_info))
+		t4_fatal_err(adapter);
+}
+
+/*
+ * CPL switch interrupt handler.
+ */
+static void cplsw_intr_handler(struct adapter *adapter)
+{
+	static const struct intr_info cplsw_intr_info[] = {
+		{ CIM_OP_MAP_PERR, "CPLSW CIM op_map parity error", -1, 1 },
+		{ CIM_OVFL_ERROR, "CPLSW CIM overflow", -1, 1 },
+		{ TP_FRAMING_ERROR, "CPLSW TP framing error", -1, 1 },
+		{ SGE_FRAMING_ERROR, "CPLSW SGE framing error", -1, 1 },
+		{ CIM_FRAMING_ERROR, "CPLSW CIM framing error", -1, 1 },
+		{ ZERO_SWITCH_ERROR, "CPLSW no-switch error", -1, 1 },
+		{ 0 }
+	};
+
+	if (t4_handle_intr_status(adapter, CPL_INTR_CAUSE, cplsw_intr_info))
+		t4_fatal_err(adapter);
+}
+
+/*
+ * LE interrupt handler.
+ */
+static void le_intr_handler(struct adapter *adap)
+{
+	static const struct intr_info le_intr_info[] = {
+		{ LIPMISS, "LE LIP miss", -1, 0 },
+		{ LIP0, "LE 0 LIP error", -1, 0 },
+		{ PARITYERR, "LE parity error", -1, 1 },
+		{ UNKNOWNCMD, "LE unknown command", -1, 1 },
+		{ REQQPARERR, "LE request queue parity error", -1, 1 },
+		{ 0 }
+	};
+
+	if (t4_handle_intr_status(adap, LE_DB_INT_CAUSE, le_intr_info))
+		t4_fatal_err(adap);
+}
+
+/*
+ * MPS interrupt handler.
+ */
+static void mps_intr_handler(struct adapter *adapter)
+{
+	static const struct intr_info mps_rx_intr_info[] = {
+		{ 0xffffff, "MPS Rx parity error", -1, 1 },
+		{ 0 }
+	};
+	static const struct intr_info mps_tx_intr_info[] = {
+		{ TPFIFO, "MPS Tx TP FIFO parity error", -1, 1 },
+		{ NCSIFIFO, "MPS Tx NC-SI FIFO parity error", -1, 1 },
+		{ TXDATAFIFO, "MPS Tx data FIFO parity error", -1, 1 },
+		{ TXDESCFIFO, "MPS Tx desc FIFO parity error", -1, 1 },
+		{ BUBBLE, "MPS Tx underflow", -1, 1 },
+		{ SECNTERR, "MPS Tx SOP/EOP error", -1, 1 },
+		{ FRMERR, "MPS Tx framing error", -1, 1 },
+		{ 0 }
+	};
+	static const struct intr_info mps_trc_intr_info[] = {
+		{ FILTMEM, "MPS TRC filter parity error", -1, 1 },
+		{ PKTFIFO, "MPS TRC packet FIFO parity error", -1, 1 },
+		{ MISCPERR, "MPS TRC misc parity error", -1, 1 },
+		{ 0 }
+	};
+	static const struct intr_info mps_stat_sram_intr_info[] = {
+		{ 0x1fffff, "MPS statistics SRAM parity error", -1, 1 },
+		{ 0 }
+	};
+	static const struct intr_info mps_stat_tx_intr_info[] = {
+		{ 0xfffff, "MPS statistics Tx FIFO parity error", -1, 1 },
+		{ 0 }
+	};
+	static const struct intr_info mps_stat_rx_intr_info[] = {
+		{ 0xffffff, "MPS statistics Rx FIFO parity error", -1, 1 },
+		{ 0 }
+	};
+	static const struct intr_info mps_cls_intr_info[] = {
+		{ MATCHSRAM, "MPS match SRAM parity error", -1, 1 },
+		{ MATCHTCAM, "MPS match TCAM parity error", -1, 1 },
+		{ HASHSRAM, "MPS hash SRAM parity error", -1, 1 },
+		{ 0 }
+	};
+
+	int fat;
+
+	fat = t4_handle_intr_status(adapter, MPS_RX_PERR_INT_CAUSE,
+				    mps_rx_intr_info) +
+	      t4_handle_intr_status(adapter, MPS_TX_INT_CAUSE,
+				    mps_tx_intr_info) +
+	      t4_handle_intr_status(adapter, MPS_TRC_INT_CAUSE,
+				    mps_trc_intr_info) +
+	      t4_handle_intr_status(adapter, MPS_STAT_PERR_INT_CAUSE_SRAM,
+				    mps_stat_sram_intr_info) +
+	      t4_handle_intr_status(adapter, MPS_STAT_PERR_INT_CAUSE_TX_FIFO,
+				    mps_stat_tx_intr_info) +
+	      t4_handle_intr_status(adapter, MPS_STAT_PERR_INT_CAUSE_RX_FIFO,
+				    mps_stat_rx_intr_info) +
+	      t4_handle_intr_status(adapter, MPS_CLS_INT_CAUSE,
+				    mps_cls_intr_info);
+
+	t4_write_reg(adapter, MPS_INT_CAUSE, CLSINT | TRCINT |
+		     RXINT | TXINT | STATINT);
+	t4_read_reg(adapter, MPS_INT_CAUSE);                    /* flush */
+	if (fat)
+		t4_fatal_err(adapter);
+}
+
+#define MEM_INT_MASK (PERR_INT_CAUSE | ECC_CE_INT_CAUSE | ECC_UE_INT_CAUSE)
+
+/*
+ * EDC/MC interrupt handler.
+ */
+static void mem_intr_handler(struct adapter *adapter, int idx)
+{
+	static const char name[4][7] = { "EDC0", "EDC1", "MC/MC0", "MC1" };
+
+	unsigned int addr, cnt_addr, v;
+
+	if (idx <= MEM_EDC1) {
+		addr = EDC_REG(EDC_INT_CAUSE, idx);
+		cnt_addr = EDC_REG(EDC_ECC_STATUS, idx);
+	} else if (idx == MEM_MC) {
+		if (is_t4(adapter->params.chip)) {
+			addr = MC_INT_CAUSE;
+			cnt_addr = MC_ECC_STATUS;
+		} else {
+			addr = MC_P_INT_CAUSE;
+			cnt_addr = MC_P_ECC_STATUS;
+		}
+	} else {
+		addr = MC_REG(MC_P_INT_CAUSE, 1);
+		cnt_addr = MC_REG(MC_P_ECC_STATUS, 1);
+	}
+
+	v = t4_read_reg(adapter, addr) & MEM_INT_MASK;
+	if (v & PERR_INT_CAUSE)
+		dev_alert(adapter->pdev_dev, "%s FIFO parity error\n",
+			  name[idx]);
+	if (v & ECC_CE_INT_CAUSE) {
+		u32 cnt = ECC_CECNT_GET(t4_read_reg(adapter, cnt_addr));
+
+		t4_write_reg(adapter, cnt_addr, ECC_CECNT_MASK);
+		if (printk_ratelimit())
+			dev_warn(adapter->pdev_dev,
+				 "%u %s correctable ECC data error%s\n",
+				 cnt, name[idx], cnt > 1 ? "s" : "");
+	}
+	if (v & ECC_UE_INT_CAUSE)
+		dev_alert(adapter->pdev_dev,
+			  "%s uncorrectable ECC data error\n", name[idx]);
+
+	t4_write_reg(adapter, addr, v);
+	if (v & (PERR_INT_CAUSE | ECC_UE_INT_CAUSE))
+		t4_fatal_err(adapter);
+}
+
+/*
+ * MA interrupt handler.
+ */
+static void ma_intr_handler(struct adapter *adap)
+{
+	u32 v, status = t4_read_reg(adap, MA_INT_CAUSE);
+
+	if (status & MEM_PERR_INT_CAUSE) {
+		dev_alert(adap->pdev_dev,
+			  "MA parity error, parity status %#x\n",
+			  t4_read_reg(adap, MA_PARITY_ERROR_STATUS));
+		if (is_t5(adap->params.chip))
+			dev_alert(adap->pdev_dev,
+				  "MA parity error, parity status %#x\n",
+				  t4_read_reg(adap,
+					      MA_PARITY_ERROR_STATUS2));
+	}
+	if (status & MEM_WRAP_INT_CAUSE) {
+		v = t4_read_reg(adap, MA_INT_WRAP_STATUS);
+		dev_alert(adap->pdev_dev, "MA address wrap-around error by "
+			  "client %u to address %#x\n",
+			  MEM_WRAP_CLIENT_NUM_GET(v),
+			  MEM_WRAP_ADDRESS_GET(v) << 4);
+	}
+	t4_write_reg(adap, MA_INT_CAUSE, status);
+	t4_fatal_err(adap);
+}
+
+/*
+ * SMB interrupt handler.
+ */
+static void smb_intr_handler(struct adapter *adap)
+{
+	static const struct intr_info smb_intr_info[] = {
+		{ MSTTXFIFOPARINT, "SMB master Tx FIFO parity error", -1, 1 },
+		{ MSTRXFIFOPARINT, "SMB master Rx FIFO parity error", -1, 1 },
+		{ SLVFIFOPARINT, "SMB slave FIFO parity error", -1, 1 },
+		{ 0 }
+	};
+
+	if (t4_handle_intr_status(adap, SMB_INT_CAUSE, smb_intr_info))
+		t4_fatal_err(adap);
+}
+
+/*
+ * NC-SI interrupt handler.
+ */
+static void ncsi_intr_handler(struct adapter *adap)
+{
+	static const struct intr_info ncsi_intr_info[] = {
+		{ CIM_DM_PRTY_ERR, "NC-SI CIM parity error", -1, 1 },
+		{ MPS_DM_PRTY_ERR, "NC-SI MPS parity error", -1, 1 },
+		{ TXFIFO_PRTY_ERR, "NC-SI Tx FIFO parity error", -1, 1 },
+		{ RXFIFO_PRTY_ERR, "NC-SI Rx FIFO parity error", -1, 1 },
+		{ 0 }
+	};
+
+	if (t4_handle_intr_status(adap, NCSI_INT_CAUSE, ncsi_intr_info))
+		t4_fatal_err(adap);
+}
+
+/*
+ * XGMAC interrupt handler.
+ */
+static void xgmac_intr_handler(struct adapter *adap, int port)
+{
+	u32 v, int_cause_reg;
+
+	if (is_t4(adap->params.chip))
+		int_cause_reg = PORT_REG(port, XGMAC_PORT_INT_CAUSE);
+	else
+		int_cause_reg = T5_PORT_REG(port, MAC_PORT_INT_CAUSE);
+
+	v = t4_read_reg(adap, int_cause_reg);
+
+	v &= TXFIFO_PRTY_ERR | RXFIFO_PRTY_ERR;
+	if (!v)
+		return;
+
+	if (v & TXFIFO_PRTY_ERR)
+		dev_alert(adap->pdev_dev, "XGMAC %d Tx FIFO parity error\n",
+			  port);
+	if (v & RXFIFO_PRTY_ERR)
+		dev_alert(adap->pdev_dev, "XGMAC %d Rx FIFO parity error\n",
+			  port);
+	t4_write_reg(adap, PORT_REG(port, XGMAC_PORT_INT_CAUSE), v);
+	t4_fatal_err(adap);
+}
+
+/*
+ * PL interrupt handler.
+ */
+static void pl_intr_handler(struct adapter *adap)
+{
+	static const struct intr_info pl_intr_info[] = {
+		{ FATALPERR, "T4 fatal parity error", -1, 1 },
+		{ PERRVFID, "PL VFID_MAP parity error", -1, 1 },
+		{ 0 }
+	};
+
+	if (t4_handle_intr_status(adap, PL_PL_INT_CAUSE, pl_intr_info))
+		t4_fatal_err(adap);
+}
+
+#define PF_INTR_MASK (PFSW)
+#define GLBL_INTR_MASK (CIM | MPS | PL | PCIE | MC | EDC0 | \
+		EDC1 | LE | TP | MA | PM_TX | PM_RX | ULP_RX | \
+		CPL_SWITCH | SGE | ULP_TX)
+
+/**
+ *	t4_slow_intr_handler - control path interrupt handler
+ *	@adapter: the adapter
+ *
+ *	T4 interrupt handler for non-data global interrupt events, e.g., errors.
+ *	The designation 'slow' is because it involves register reads, while
+ *	data interrupts typically don't involve any MMIOs.
+ */
+int t4_slow_intr_handler(struct adapter *adapter)
+{
+	u32 cause = t4_read_reg(adapter, PL_INT_CAUSE);
+
+	if (!(cause & GLBL_INTR_MASK))
+		return 0;
+	if (cause & CIM)
+		cim_intr_handler(adapter);
+	if (cause & MPS)
+		mps_intr_handler(adapter);
+	if (cause & NCSI)
+		ncsi_intr_handler(adapter);
+	if (cause & PL)
+		pl_intr_handler(adapter);
+	if (cause & SMB)
+		smb_intr_handler(adapter);
+	if (cause & XGMAC0)
+		xgmac_intr_handler(adapter, 0);
+	if (cause & XGMAC1)
+		xgmac_intr_handler(adapter, 1);
+	if (cause & XGMAC_KR0)
+		xgmac_intr_handler(adapter, 2);
+	if (cause & XGMAC_KR1)
+		xgmac_intr_handler(adapter, 3);
+	if (cause & PCIE)
+		pcie_intr_handler(adapter);
+	if (cause & MC)
+		mem_intr_handler(adapter, MEM_MC);
+	if (!is_t4(adapter->params.chip) && (cause & MC1))
+		mem_intr_handler(adapter, MEM_MC1);
+	if (cause & EDC0)
+		mem_intr_handler(adapter, MEM_EDC0);
+	if (cause & EDC1)
+		mem_intr_handler(adapter, MEM_EDC1);
+	if (cause & LE)
+		le_intr_handler(adapter);
+	if (cause & TP)
+		tp_intr_handler(adapter);
+	if (cause & MA)
+		ma_intr_handler(adapter);
+	if (cause & PM_TX)
+		pmtx_intr_handler(adapter);
+	if (cause & PM_RX)
+		pmrx_intr_handler(adapter);
+	if (cause & ULP_RX)
+		ulprx_intr_handler(adapter);
+	if (cause & CPL_SWITCH)
+		cplsw_intr_handler(adapter);
+	if (cause & SGE)
+		sge_intr_handler(adapter);
+	if (cause & ULP_TX)
+		ulptx_intr_handler(adapter);
+
+	/* Clear the interrupts just processed for which we are the master. */
+	t4_write_reg(adapter, PL_INT_CAUSE, cause & GLBL_INTR_MASK);
+	(void) t4_read_reg(adapter, PL_INT_CAUSE); /* flush */
+	return 1;
+}
+
+/**
+ *	t4_intr_enable - enable interrupts
+ *	@adapter: the adapter whose interrupts should be enabled
+ *
+ *	Enable PF-specific interrupts for the calling function and the top-level
+ *	interrupt concentrator for global interrupts.  Interrupts are already
+ *	enabled at each module,	here we just enable the roots of the interrupt
+ *	hierarchies.
+ *
+ *	Note: this function should be called only when the driver manages
+ *	non PF-specific interrupts from the various HW modules.  Only one PCI
+ *	function at a time should be doing this.
+ */
+void t4_intr_enable(struct adapter *adapter)
+{
+	u32 pf = SOURCEPF_GET(t4_read_reg(adapter, PL_WHOAMI));
+
+	t4_write_reg(adapter, SGE_INT_ENABLE3, ERR_CPL_EXCEED_IQE_SIZE |
+		     ERR_INVALID_CIDX_INC | ERR_CPL_OPCODE_0 |
+		     ERR_DROPPED_DB | ERR_DATA_CPL_ON_HIGH_QID1 |
+		     ERR_DATA_CPL_ON_HIGH_QID0 | ERR_BAD_DB_PIDX3 |
+		     ERR_BAD_DB_PIDX2 | ERR_BAD_DB_PIDX1 |
+		     ERR_BAD_DB_PIDX0 | ERR_ING_CTXT_PRIO |
+		     ERR_EGR_CTXT_PRIO | INGRESS_SIZE_ERR |
+		     DBFIFO_HP_INT | DBFIFO_LP_INT |
+		     EGRESS_SIZE_ERR);
+	t4_write_reg(adapter, MYPF_REG(PL_PF_INT_ENABLE), PF_INTR_MASK);
+	t4_set_reg_field(adapter, PL_INT_MAP0, 0, 1 << pf);
+}
+
+/**
+ *	t4_intr_disable - disable interrupts
+ *	@adapter: the adapter whose interrupts should be disabled
+ *
+ *	Disable interrupts.  We only disable the top-level interrupt
+ *	concentrators.  The caller must be a PCI function managing global
+ *	interrupts.
+ */
+void t4_intr_disable(struct adapter *adapter)
+{
+	u32 pf = SOURCEPF_GET(t4_read_reg(adapter, PL_WHOAMI));
+
+	t4_write_reg(adapter, MYPF_REG(PL_PF_INT_ENABLE), 0);
+	t4_set_reg_field(adapter, PL_INT_MAP0, 1 << pf, 0);
+}
+
+/**
+ *	hash_mac_addr - return the hash value of a MAC address
+ *	@addr: the 48-bit Ethernet MAC address
+ *
+ *	Hashes a MAC address according to the hash function used by HW inexact
+ *	(hash) address matching.
+ */
+static int hash_mac_addr(const u8 *addr)
+{
+	u32 a = ((u32)addr[0] << 16) | ((u32)addr[1] << 8) | addr[2];
+	u32 b = ((u32)addr[3] << 16) | ((u32)addr[4] << 8) | addr[5];
+	a ^= b;
+	a ^= (a >> 12);
+	a ^= (a >> 6);
+	return a & 0x3f;
+}
+
+/**
+ *	t4_config_rss_range - configure a portion of the RSS mapping table
+ *	@adapter: the adapter
+ *	@mbox: mbox to use for the FW command
+ *	@viid: virtual interface whose RSS subtable is to be written
+ *	@start: start entry in the table to write
+ *	@n: how many table entries to write
+ *	@rspq: values for the response queue lookup table
+ *	@nrspq: number of values in @rspq
+ *
+ *	Programs the selected part of the VI's RSS mapping table with the
+ *	provided values.  If @nrspq < @n the supplied values are used repeatedly
+ *	until the full table range is populated.
+ *
+ *	The caller must ensure the values in @rspq are in the range allowed for
+ *	@viid.
+ */
+int t4_config_rss_range(struct adapter *adapter, int mbox, unsigned int viid,
+			int start, int n, const u16 *rspq, unsigned int nrspq)
+{
+	int ret;
+	const u16 *rsp = rspq;
+	const u16 *rsp_end = rspq + nrspq;
+	struct fw_rss_ind_tbl_cmd cmd;
+
+	memset(&cmd, 0, sizeof(cmd));
+	cmd.op_to_viid = htonl(FW_CMD_OP(FW_RSS_IND_TBL_CMD) |
+			       FW_CMD_REQUEST | FW_CMD_WRITE |
+			       FW_RSS_IND_TBL_CMD_VIID(viid));
+	cmd.retval_len16 = htonl(FW_LEN16(cmd));
+
+	/* each fw_rss_ind_tbl_cmd takes up to 32 entries */
+	while (n > 0) {
+		int nq = min(n, 32);
+		__be32 *qp = &cmd.iq0_to_iq2;
+
+		cmd.niqid = htons(nq);
+		cmd.startidx = htons(start);
+
+		start += nq;
+		n -= nq;
+
+		while (nq > 0) {
+			unsigned int v;
+
+			v = FW_RSS_IND_TBL_CMD_IQ0(*rsp);
+			if (++rsp >= rsp_end)
+				rsp = rspq;
+			v |= FW_RSS_IND_TBL_CMD_IQ1(*rsp);
+			if (++rsp >= rsp_end)
+				rsp = rspq;
+			v |= FW_RSS_IND_TBL_CMD_IQ2(*rsp);
+			if (++rsp >= rsp_end)
+				rsp = rspq;
+
+			*qp++ = htonl(v);
+			nq -= 3;
+		}
+
+		ret = t4_wr_mbox(adapter, mbox, &cmd, sizeof(cmd), NULL);
+		if (ret)
+			return ret;
+	}
+	return 0;
+}
+
+/**
+ *	t4_config_glbl_rss - configure the global RSS mode
+ *	@adapter: the adapter
+ *	@mbox: mbox to use for the FW command
+ *	@mode: global RSS mode
+ *	@flags: mode-specific flags
+ *
+ *	Sets the global RSS mode.
+ */
+int t4_config_glbl_rss(struct adapter *adapter, int mbox, unsigned int mode,
+		       unsigned int flags)
+{
+	struct fw_rss_glb_config_cmd c;
+
+	memset(&c, 0, sizeof(c));
+	c.op_to_write = htonl(FW_CMD_OP(FW_RSS_GLB_CONFIG_CMD) |
+			      FW_CMD_REQUEST | FW_CMD_WRITE);
+	c.retval_len16 = htonl(FW_LEN16(c));
+	if (mode == FW_RSS_GLB_CONFIG_CMD_MODE_MANUAL) {
+		c.u.manual.mode_pkd = htonl(FW_RSS_GLB_CONFIG_CMD_MODE(mode));
+	} else if (mode == FW_RSS_GLB_CONFIG_CMD_MODE_BASICVIRTUAL) {
+		c.u.basicvirtual.mode_pkd =
+			htonl(FW_RSS_GLB_CONFIG_CMD_MODE(mode));
+		c.u.basicvirtual.synmapen_to_hashtoeplitz = htonl(flags);
+	} else
+		return -EINVAL;
+	return t4_wr_mbox(adapter, mbox, &c, sizeof(c), NULL);
+}
+
+/**
+ *	t4_tp_get_tcp_stats - read TP's TCP MIB counters
+ *	@adap: the adapter
+ *	@v4: holds the TCP/IP counter values
+ *	@v6: holds the TCP/IPv6 counter values
+ *
+ *	Returns the values of TP's TCP/IP and TCP/IPv6 MIB counters.
+ *	Either @v4 or @v6 may be %NULL to skip the corresponding stats.
+ */
+void t4_tp_get_tcp_stats(struct adapter *adap, struct tp_tcp_stats *v4,
+			 struct tp_tcp_stats *v6)
+{
+	u32 val[TP_MIB_TCP_RXT_SEG_LO - TP_MIB_TCP_OUT_RST + 1];
+
+#define STAT_IDX(x) ((TP_MIB_TCP_##x) - TP_MIB_TCP_OUT_RST)
+#define STAT(x)     val[STAT_IDX(x)]
+#define STAT64(x)   (((u64)STAT(x##_HI) << 32) | STAT(x##_LO))
+
+	if (v4) {
+		t4_read_indirect(adap, TP_MIB_INDEX, TP_MIB_DATA, val,
+				 ARRAY_SIZE(val), TP_MIB_TCP_OUT_RST);
+		v4->tcpOutRsts = STAT(OUT_RST);
+		v4->tcpInSegs  = STAT64(IN_SEG);
+		v4->tcpOutSegs = STAT64(OUT_SEG);
+		v4->tcpRetransSegs = STAT64(RXT_SEG);
+	}
+	if (v6) {
+		t4_read_indirect(adap, TP_MIB_INDEX, TP_MIB_DATA, val,
+				 ARRAY_SIZE(val), TP_MIB_TCP_V6OUT_RST);
+		v6->tcpOutRsts = STAT(OUT_RST);
+		v6->tcpInSegs  = STAT64(IN_SEG);
+		v6->tcpOutSegs = STAT64(OUT_SEG);
+		v6->tcpRetransSegs = STAT64(RXT_SEG);
+	}
+#undef STAT64
+#undef STAT
+#undef STAT_IDX
+}
+
+/**
+ *	t4_read_mtu_tbl - returns the values in the HW path MTU table
+ *	@adap: the adapter
+ *	@mtus: where to store the MTU values
+ *	@mtu_log: where to store the MTU base-2 log (may be %NULL)
+ *
+ *	Reads the HW path MTU table.
+ */
+void t4_read_mtu_tbl(struct adapter *adap, u16 *mtus, u8 *mtu_log)
+{
+	u32 v;
+	int i;
+
+	for (i = 0; i < NMTUS; ++i) {
+		t4_write_reg(adap, TP_MTU_TABLE,
+			     MTUINDEX(0xff) | MTUVALUE(i));
+		v = t4_read_reg(adap, TP_MTU_TABLE);
+		mtus[i] = MTUVALUE_GET(v);
+		if (mtu_log)
+			mtu_log[i] = MTUWIDTH_GET(v);
+	}
+}
+
+/**
+ *	t4_tp_wr_bits_indirect - set/clear bits in an indirect TP register
+ *	@adap: the adapter
+ *	@addr: the indirect TP register address
+ *	@mask: specifies the field within the register to modify
+ *	@val: new value for the field
+ *
+ *	Sets a field of an indirect TP register to the given value.
+ */
+void t4_tp_wr_bits_indirect(struct adapter *adap, unsigned int addr,
+			    unsigned int mask, unsigned int val)
+{
+	t4_write_reg(adap, TP_PIO_ADDR, addr);
+	val |= t4_read_reg(adap, TP_PIO_DATA) & ~mask;
+	t4_write_reg(adap, TP_PIO_DATA, val);
+}
+
+/**
+ *	init_cong_ctrl - initialize congestion control parameters
+ *	@a: the alpha values for congestion control
+ *	@b: the beta values for congestion control
+ *
+ *	Initialize the congestion control parameters.
+ */
+static void init_cong_ctrl(unsigned short *a, unsigned short *b)
+{
+	a[0] = a[1] = a[2] = a[3] = a[4] = a[5] = a[6] = a[7] = a[8] = 1;
+	a[9] = 2;
+	a[10] = 3;
+	a[11] = 4;
+	a[12] = 5;
+	a[13] = 6;
+	a[14] = 7;
+	a[15] = 8;
+	a[16] = 9;
+	a[17] = 10;
+	a[18] = 14;
+	a[19] = 17;
+	a[20] = 21;
+	a[21] = 25;
+	a[22] = 30;
+	a[23] = 35;
+	a[24] = 45;
+	a[25] = 60;
+	a[26] = 80;
+	a[27] = 100;
+	a[28] = 200;
+	a[29] = 300;
+	a[30] = 400;
+	a[31] = 500;
+
+	b[0] = b[1] = b[2] = b[3] = b[4] = b[5] = b[6] = b[7] = b[8] = 0;
+	b[9] = b[10] = 1;
+	b[11] = b[12] = 2;
+	b[13] = b[14] = b[15] = b[16] = 3;
+	b[17] = b[18] = b[19] = b[20] = b[21] = 4;
+	b[22] = b[23] = b[24] = b[25] = b[26] = b[27] = 5;
+	b[28] = b[29] = 6;
+	b[30] = b[31] = 7;
+}
+
+/* The minimum additive increment value for the congestion control table */
+#define CC_MIN_INCR 2U
+
+/**
+ *	t4_load_mtus - write the MTU and congestion control HW tables
+ *	@adap: the adapter
+ *	@mtus: the values for the MTU table
+ *	@alpha: the values for the congestion control alpha parameter
+ *	@beta: the values for the congestion control beta parameter
+ *
+ *	Write the HW MTU table with the supplied MTUs and the high-speed
+ *	congestion control table with the supplied alpha, beta, and MTUs.
+ *	We write the two tables together because the additive increments
+ *	depend on the MTUs.
+ */
+void t4_load_mtus(struct adapter *adap, const unsigned short *mtus,
+		  const unsigned short *alpha, const unsigned short *beta)
+{
+	static const unsigned int avg_pkts[NCCTRL_WIN] = {
+		2, 6, 10, 14, 20, 28, 40, 56, 80, 112, 160, 224, 320, 448, 640,
+		896, 1281, 1792, 2560, 3584, 5120, 7168, 10240, 14336, 20480,
+		28672, 40960, 57344, 81920, 114688, 163840, 229376
+	};
+
+	unsigned int i, w;
+
+	for (i = 0; i < NMTUS; ++i) {
+		unsigned int mtu = mtus[i];
+		unsigned int log2 = fls(mtu);
+
+		if (!(mtu & ((1 << log2) >> 2)))     /* round */
+			log2--;
+		t4_write_reg(adap, TP_MTU_TABLE, MTUINDEX(i) |
+			     MTUWIDTH(log2) | MTUVALUE(mtu));
+
+		for (w = 0; w < NCCTRL_WIN; ++w) {
+			unsigned int inc;
+
+			inc = max(((mtu - 40) * alpha[w]) / avg_pkts[w],
+				  CC_MIN_INCR);
+
+			t4_write_reg(adap, TP_CCTRL_TABLE, (i << 21) |
+				     (w << 16) | (beta[w] << 13) | inc);
+		}
+	}
+}
+
+/**
+ *	get_mps_bg_map - return the buffer groups associated with a port
+ *	@adap: the adapter
+ *	@idx: the port index
+ *
+ *	Returns a bitmap indicating which MPS buffer groups are associated
+ *	with the given port.  Bit i is set if buffer group i is used by the
+ *	port.
+ */
+static unsigned int get_mps_bg_map(struct adapter *adap, int idx)
+{
+	u32 n = NUMPORTS_GET(t4_read_reg(adap, MPS_CMN_CTL));
+
+	if (n == 0)
+		return idx == 0 ? 0xf : 0;
+	if (n == 1)
+		return idx < 2 ? (3 << (2 * idx)) : 0;
+	return 1 << idx;
+}
+
+/**
+ *      t4_get_port_type_description - return Port Type string description
+ *      @port_type: firmware Port Type enumeration
+ */
+const char *t4_get_port_type_description(enum fw_port_type port_type)
+{
+	static const char *const port_type_description[] = {
+		"R XFI",
+		"R XAUI",
+		"T SGMII",
+		"T XFI",
+		"T XAUI",
+		"KX4",
+		"CX4",
+		"KX",
+		"KR",
+		"R SFP+",
+		"KR/KX",
+		"KR/KX/KX4",
+		"R QSFP_10G",
+		"",
+		"R QSFP",
+		"R BP40_BA",
+	};
+
+	if (port_type < ARRAY_SIZE(port_type_description))
+		return port_type_description[port_type];
+	return "UNKNOWN";
+}
+
+/**
+ *	t4_get_port_stats - collect port statistics
+ *	@adap: the adapter
+ *	@idx: the port index
+ *	@p: the stats structure to fill
+ *
+ *	Collect statistics related to the given port from HW.
+ */
+void t4_get_port_stats(struct adapter *adap, int idx, struct port_stats *p)
+{
+	u32 bgmap = get_mps_bg_map(adap, idx);
+
+#define GET_STAT(name) \
+	t4_read_reg64(adap, \
+	(is_t4(adap->params.chip) ? PORT_REG(idx, MPS_PORT_STAT_##name##_L) : \
+	T5_PORT_REG(idx, MPS_PORT_STAT_##name##_L)))
+#define GET_STAT_COM(name) t4_read_reg64(adap, MPS_STAT_##name##_L)
+
+	p->tx_octets           = GET_STAT(TX_PORT_BYTES);
+	p->tx_frames           = GET_STAT(TX_PORT_FRAMES);
+	p->tx_bcast_frames     = GET_STAT(TX_PORT_BCAST);
+	p->tx_mcast_frames     = GET_STAT(TX_PORT_MCAST);
+	p->tx_ucast_frames     = GET_STAT(TX_PORT_UCAST);
+	p->tx_error_frames     = GET_STAT(TX_PORT_ERROR);
+	p->tx_frames_64        = GET_STAT(TX_PORT_64B);
+	p->tx_frames_65_127    = GET_STAT(TX_PORT_65B_127B);
+	p->tx_frames_128_255   = GET_STAT(TX_PORT_128B_255B);
+	p->tx_frames_256_511   = GET_STAT(TX_PORT_256B_511B);
+	p->tx_frames_512_1023  = GET_STAT(TX_PORT_512B_1023B);
+	p->tx_frames_1024_1518 = GET_STAT(TX_PORT_1024B_1518B);
+	p->tx_frames_1519_max  = GET_STAT(TX_PORT_1519B_MAX);
+	p->tx_drop             = GET_STAT(TX_PORT_DROP);
+	p->tx_pause            = GET_STAT(TX_PORT_PAUSE);
+	p->tx_ppp0             = GET_STAT(TX_PORT_PPP0);
+	p->tx_ppp1             = GET_STAT(TX_PORT_PPP1);
+	p->tx_ppp2             = GET_STAT(TX_PORT_PPP2);
+	p->tx_ppp3             = GET_STAT(TX_PORT_PPP3);
+	p->tx_ppp4             = GET_STAT(TX_PORT_PPP4);
+	p->tx_ppp5             = GET_STAT(TX_PORT_PPP5);
+	p->tx_ppp6             = GET_STAT(TX_PORT_PPP6);
+	p->tx_ppp7             = GET_STAT(TX_PORT_PPP7);
+
+	p->rx_octets           = GET_STAT(RX_PORT_BYTES);
+	p->rx_frames           = GET_STAT(RX_PORT_FRAMES);
+	p->rx_bcast_frames     = GET_STAT(RX_PORT_BCAST);
+	p->rx_mcast_frames     = GET_STAT(RX_PORT_MCAST);
+	p->rx_ucast_frames     = GET_STAT(RX_PORT_UCAST);
+	p->rx_too_long         = GET_STAT(RX_PORT_MTU_ERROR);
+	p->rx_jabber           = GET_STAT(RX_PORT_MTU_CRC_ERROR);
+	p->rx_fcs_err          = GET_STAT(RX_PORT_CRC_ERROR);
+	p->rx_len_err          = GET_STAT(RX_PORT_LEN_ERROR);
+	p->rx_symbol_err       = GET_STAT(RX_PORT_SYM_ERROR);
+	p->rx_runt             = GET_STAT(RX_PORT_LESS_64B);
+	p->rx_frames_64        = GET_STAT(RX_PORT_64B);
+	p->rx_frames_65_127    = GET_STAT(RX_PORT_65B_127B);
+	p->rx_frames_128_255   = GET_STAT(RX_PORT_128B_255B);
+	p->rx_frames_256_511   = GET_STAT(RX_PORT_256B_511B);
+	p->rx_frames_512_1023  = GET_STAT(RX_PORT_512B_1023B);
+	p->rx_frames_1024_1518 = GET_STAT(RX_PORT_1024B_1518B);
+	p->rx_frames_1519_max  = GET_STAT(RX_PORT_1519B_MAX);
+	p->rx_pause            = GET_STAT(RX_PORT_PAUSE);
+	p->rx_ppp0             = GET_STAT(RX_PORT_PPP0);
+	p->rx_ppp1             = GET_STAT(RX_PORT_PPP1);
+	p->rx_ppp2             = GET_STAT(RX_PORT_PPP2);
+	p->rx_ppp3             = GET_STAT(RX_PORT_PPP3);
+	p->rx_ppp4             = GET_STAT(RX_PORT_PPP4);
+	p->rx_ppp5             = GET_STAT(RX_PORT_PPP5);
+	p->rx_ppp6             = GET_STAT(RX_PORT_PPP6);
+	p->rx_ppp7             = GET_STAT(RX_PORT_PPP7);
+
+	p->rx_ovflow0 = (bgmap & 1) ? GET_STAT_COM(RX_BG_0_MAC_DROP_FRAME) : 0;
+	p->rx_ovflow1 = (bgmap & 2) ? GET_STAT_COM(RX_BG_1_MAC_DROP_FRAME) : 0;
+	p->rx_ovflow2 = (bgmap & 4) ? GET_STAT_COM(RX_BG_2_MAC_DROP_FRAME) : 0;
+	p->rx_ovflow3 = (bgmap & 8) ? GET_STAT_COM(RX_BG_3_MAC_DROP_FRAME) : 0;
+	p->rx_trunc0 = (bgmap & 1) ? GET_STAT_COM(RX_BG_0_MAC_TRUNC_FRAME) : 0;
+	p->rx_trunc1 = (bgmap & 2) ? GET_STAT_COM(RX_BG_1_MAC_TRUNC_FRAME) : 0;
+	p->rx_trunc2 = (bgmap & 4) ? GET_STAT_COM(RX_BG_2_MAC_TRUNC_FRAME) : 0;
+	p->rx_trunc3 = (bgmap & 8) ? GET_STAT_COM(RX_BG_3_MAC_TRUNC_FRAME) : 0;
+
+#undef GET_STAT
+#undef GET_STAT_COM
+}
+
+/**
+ *	t4_wol_magic_enable - enable/disable magic packet WoL
+ *	@adap: the adapter
+ *	@port: the physical port index
+ *	@addr: MAC address expected in magic packets, %NULL to disable
+ *
+ *	Enables/disables magic packet wake-on-LAN for the selected port.
+ */
+void t4_wol_magic_enable(struct adapter *adap, unsigned int port,
+			 const u8 *addr)
+{
+	u32 mag_id_reg_l, mag_id_reg_h, port_cfg_reg;
+
+	if (is_t4(adap->params.chip)) {
+		mag_id_reg_l = PORT_REG(port, XGMAC_PORT_MAGIC_MACID_LO);
+		mag_id_reg_h = PORT_REG(port, XGMAC_PORT_MAGIC_MACID_HI);
+		port_cfg_reg = PORT_REG(port, XGMAC_PORT_CFG2);
+	} else {
+		mag_id_reg_l = T5_PORT_REG(port, MAC_PORT_MAGIC_MACID_LO);
+		mag_id_reg_h = T5_PORT_REG(port, MAC_PORT_MAGIC_MACID_HI);
+		port_cfg_reg = T5_PORT_REG(port, MAC_PORT_CFG2);
+	}
+
+	if (addr) {
+		t4_write_reg(adap, mag_id_reg_l,
+			     (addr[2] << 24) | (addr[3] << 16) |
+			     (addr[4] << 8) | addr[5]);
+		t4_write_reg(adap, mag_id_reg_h,
+			     (addr[0] << 8) | addr[1]);
+	}
+	t4_set_reg_field(adap, port_cfg_reg, MAGICEN,
+			 addr ? MAGICEN : 0);
+}
+
+/**
+ *	t4_wol_pat_enable - enable/disable pattern-based WoL
+ *	@adap: the adapter
+ *	@port: the physical port index
+ *	@map: bitmap of which HW pattern filters to set
+ *	@mask0: byte mask for bytes 0-63 of a packet
+ *	@mask1: byte mask for bytes 64-127 of a packet
+ *	@crc: Ethernet CRC for selected bytes
+ *	@enable: enable/disable switch
+ *
+ *	Sets the pattern filters indicated in @map to mask out the bytes
+ *	specified in @mask0/@mask1 in received packets and compare the CRC of
+ *	the resulting packet against @crc.  If @enable is %true pattern-based
+ *	WoL is enabled, otherwise disabled.
+ */
+int t4_wol_pat_enable(struct adapter *adap, unsigned int port, unsigned int map,
+		      u64 mask0, u64 mask1, unsigned int crc, bool enable)
+{
+	int i;
+	u32 port_cfg_reg;
+
+	if (is_t4(adap->params.chip))
+		port_cfg_reg = PORT_REG(port, XGMAC_PORT_CFG2);
+	else
+		port_cfg_reg = T5_PORT_REG(port, MAC_PORT_CFG2);
+
+	if (!enable) {
+		t4_set_reg_field(adap, port_cfg_reg, PATEN, 0);
+		return 0;
+	}
+	if (map > 0xff)
+		return -EINVAL;
+
+#define EPIO_REG(name) \
+	(is_t4(adap->params.chip) ? PORT_REG(port, XGMAC_PORT_EPIO_##name) : \
+	T5_PORT_REG(port, MAC_PORT_EPIO_##name))
+
+	t4_write_reg(adap, EPIO_REG(DATA1), mask0 >> 32);
+	t4_write_reg(adap, EPIO_REG(DATA2), mask1);
+	t4_write_reg(adap, EPIO_REG(DATA3), mask1 >> 32);
+
+	for (i = 0; i < NWOL_PAT; i++, map >>= 1) {
+		if (!(map & 1))
+			continue;
+
+		/* write byte masks */
+		t4_write_reg(adap, EPIO_REG(DATA0), mask0);
+		t4_write_reg(adap, EPIO_REG(OP), ADDRESS(i) | EPIOWR);
+		t4_read_reg(adap, EPIO_REG(OP));                /* flush */
+		if (t4_read_reg(adap, EPIO_REG(OP)) & SF_BUSY)
+			return -ETIMEDOUT;
+
+		/* write CRC */
+		t4_write_reg(adap, EPIO_REG(DATA0), crc);
+		t4_write_reg(adap, EPIO_REG(OP), ADDRESS(i + 32) | EPIOWR);
+		t4_read_reg(adap, EPIO_REG(OP));                /* flush */
+		if (t4_read_reg(adap, EPIO_REG(OP)) & SF_BUSY)
+			return -ETIMEDOUT;
+	}
+#undef EPIO_REG
+
+	t4_set_reg_field(adap, PORT_REG(port, XGMAC_PORT_CFG2), 0, PATEN);
+	return 0;
+}
+
+/*     t4_mk_filtdelwr - create a delete filter WR
+ *     @ftid: the filter ID
+ *     @wr: the filter work request to populate
+ *     @qid: ingress queue to receive the delete notification
+ *
+ *     Creates a filter work request to delete the supplied filter.  If @qid is
+ *     negative the delete notification is suppressed.
+ */
+void t4_mk_filtdelwr(unsigned int ftid, struct fw_filter_wr *wr, int qid)
+{
+	memset(wr, 0, sizeof(*wr));
+	wr->op_pkd = htonl(FW_WR_OP(FW_FILTER_WR));
+	wr->len16_pkd = htonl(FW_WR_LEN16(sizeof(*wr) / 16));
+	wr->tid_to_iq = htonl(V_FW_FILTER_WR_TID(ftid) |
+			V_FW_FILTER_WR_NOREPLY(qid < 0));
+	wr->del_filter_to_l2tix = htonl(F_FW_FILTER_WR_DEL_FILTER);
+	if (qid >= 0)
+		wr->rx_chan_rx_rpl_iq = htons(V_FW_FILTER_WR_RX_RPL_IQ(qid));
+}
+
+#define INIT_CMD(var, cmd, rd_wr) do { \
+	(var).op_to_write = htonl(FW_CMD_OP(FW_##cmd##_CMD) | \
+				  FW_CMD_REQUEST | FW_CMD_##rd_wr); \
+	(var).retval_len16 = htonl(FW_LEN16(var)); \
+} while (0)
+
+int t4_fwaddrspace_write(struct adapter *adap, unsigned int mbox,
+			  u32 addr, u32 val)
+{
+	struct fw_ldst_cmd c;
+
+	memset(&c, 0, sizeof(c));
+	c.op_to_addrspace = htonl(FW_CMD_OP(FW_LDST_CMD) | FW_CMD_REQUEST |
+			    FW_CMD_WRITE |
+			    FW_LDST_CMD_ADDRSPACE(FW_LDST_ADDRSPC_FIRMWARE));
+	c.cycles_to_len16 = htonl(FW_LEN16(c));
+	c.u.addrval.addr = htonl(addr);
+	c.u.addrval.val = htonl(val);
+
+	return t4_wr_mbox(adap, mbox, &c, sizeof(c), NULL);
+}
+
+/**
+ *	t4_mdio_rd - read a PHY register through MDIO
+ *	@adap: the adapter
+ *	@mbox: mailbox to use for the FW command
+ *	@phy_addr: the PHY address
+ *	@mmd: the PHY MMD to access (0 for clause 22 PHYs)
+ *	@reg: the register to read
+ *	@valp: where to store the value
+ *
+ *	Issues a FW command through the given mailbox to read a PHY register.
+ */
+int t4_mdio_rd(struct adapter *adap, unsigned int mbox, unsigned int phy_addr,
+	       unsigned int mmd, unsigned int reg, u16 *valp)
+{
+	int ret;
+	struct fw_ldst_cmd c;
+
+	memset(&c, 0, sizeof(c));
+	c.op_to_addrspace = htonl(FW_CMD_OP(FW_LDST_CMD) | FW_CMD_REQUEST |
+		FW_CMD_READ | FW_LDST_CMD_ADDRSPACE(FW_LDST_ADDRSPC_MDIO));
+	c.cycles_to_len16 = htonl(FW_LEN16(c));
+	c.u.mdio.paddr_mmd = htons(FW_LDST_CMD_PADDR(phy_addr) |
+				   FW_LDST_CMD_MMD(mmd));
+	c.u.mdio.raddr = htons(reg);
+
+	ret = t4_wr_mbox(adap, mbox, &c, sizeof(c), &c);
+	if (ret == 0)
+		*valp = ntohs(c.u.mdio.rval);
+	return ret;
+}
+
+/**
+ *	t4_mdio_wr - write a PHY register through MDIO
+ *	@adap: the adapter
+ *	@mbox: mailbox to use for the FW command
+ *	@phy_addr: the PHY address
+ *	@mmd: the PHY MMD to access (0 for clause 22 PHYs)
+ *	@reg: the register to write
+ *	@valp: value to write
+ *
+ *	Issues a FW command through the given mailbox to write a PHY register.
+ */
+int t4_mdio_wr(struct adapter *adap, unsigned int mbox, unsigned int phy_addr,
+	       unsigned int mmd, unsigned int reg, u16 val)
+{
+	struct fw_ldst_cmd c;
+
+	memset(&c, 0, sizeof(c));
+	c.op_to_addrspace = htonl(FW_CMD_OP(FW_LDST_CMD) | FW_CMD_REQUEST |
+		FW_CMD_WRITE | FW_LDST_CMD_ADDRSPACE(FW_LDST_ADDRSPC_MDIO));
+	c.cycles_to_len16 = htonl(FW_LEN16(c));
+	c.u.mdio.paddr_mmd = htons(FW_LDST_CMD_PADDR(phy_addr) |
+				   FW_LDST_CMD_MMD(mmd));
+	c.u.mdio.raddr = htons(reg);
+	c.u.mdio.rval = htons(val);
+
+	return t4_wr_mbox(adap, mbox, &c, sizeof(c), NULL);
+}
+
+/**
+ *	t4_sge_decode_idma_state - decode the idma state
+ *	@adap: the adapter
+ *	@state: the state idma is stuck in
+ */
+void t4_sge_decode_idma_state(struct adapter *adapter, int state)
+{
+	static const char * const t4_decode[] = {
+		"IDMA_IDLE",
+		"IDMA_PUSH_MORE_CPL_FIFO",
+		"IDMA_PUSH_CPL_MSG_HEADER_TO_FIFO",
+		"Not used",
+		"IDMA_PHYSADDR_SEND_PCIEHDR",
+		"IDMA_PHYSADDR_SEND_PAYLOAD_FIRST",
+		"IDMA_PHYSADDR_SEND_PAYLOAD",
+		"IDMA_SEND_FIFO_TO_IMSG",
+		"IDMA_FL_REQ_DATA_FL_PREP",
+		"IDMA_FL_REQ_DATA_FL",
+		"IDMA_FL_DROP",
+		"IDMA_FL_H_REQ_HEADER_FL",
+		"IDMA_FL_H_SEND_PCIEHDR",
+		"IDMA_FL_H_PUSH_CPL_FIFO",
+		"IDMA_FL_H_SEND_CPL",
+		"IDMA_FL_H_SEND_IP_HDR_FIRST",
+		"IDMA_FL_H_SEND_IP_HDR",
+		"IDMA_FL_H_REQ_NEXT_HEADER_FL",
+		"IDMA_FL_H_SEND_NEXT_PCIEHDR",
+		"IDMA_FL_H_SEND_IP_HDR_PADDING",
+		"IDMA_FL_D_SEND_PCIEHDR",
+		"IDMA_FL_D_SEND_CPL_AND_IP_HDR",
+		"IDMA_FL_D_REQ_NEXT_DATA_FL",
+		"IDMA_FL_SEND_PCIEHDR",
+		"IDMA_FL_PUSH_CPL_FIFO",
+		"IDMA_FL_SEND_CPL",
+		"IDMA_FL_SEND_PAYLOAD_FIRST",
+		"IDMA_FL_SEND_PAYLOAD",
+		"IDMA_FL_REQ_NEXT_DATA_FL",
+		"IDMA_FL_SEND_NEXT_PCIEHDR",
+		"IDMA_FL_SEND_PADDING",
+		"IDMA_FL_SEND_COMPLETION_TO_IMSG",
+		"IDMA_FL_SEND_FIFO_TO_IMSG",
+		"IDMA_FL_REQ_DATAFL_DONE",
+		"IDMA_FL_REQ_HEADERFL_DONE",
+	};
+	static const char * const t5_decode[] = {
+		"IDMA_IDLE",
+		"IDMA_ALMOST_IDLE",
+		"IDMA_PUSH_MORE_CPL_FIFO",
+		"IDMA_PUSH_CPL_MSG_HEADER_TO_FIFO",
+		"IDMA_SGEFLRFLUSH_SEND_PCIEHDR",
+		"IDMA_PHYSADDR_SEND_PCIEHDR",
+		"IDMA_PHYSADDR_SEND_PAYLOAD_FIRST",
+		"IDMA_PHYSADDR_SEND_PAYLOAD",
+		"IDMA_SEND_FIFO_TO_IMSG",
+		"IDMA_FL_REQ_DATA_FL",
+		"IDMA_FL_DROP",
+		"IDMA_FL_DROP_SEND_INC",
+		"IDMA_FL_H_REQ_HEADER_FL",
+		"IDMA_FL_H_SEND_PCIEHDR",
+		"IDMA_FL_H_PUSH_CPL_FIFO",
+		"IDMA_FL_H_SEND_CPL",
+		"IDMA_FL_H_SEND_IP_HDR_FIRST",
+		"IDMA_FL_H_SEND_IP_HDR",
+		"IDMA_FL_H_REQ_NEXT_HEADER_FL",
+		"IDMA_FL_H_SEND_NEXT_PCIEHDR",
+		"IDMA_FL_H_SEND_IP_HDR_PADDING",
+		"IDMA_FL_D_SEND_PCIEHDR",
+		"IDMA_FL_D_SEND_CPL_AND_IP_HDR",
+		"IDMA_FL_D_REQ_NEXT_DATA_FL",
+		"IDMA_FL_SEND_PCIEHDR",
+		"IDMA_FL_PUSH_CPL_FIFO",
+		"IDMA_FL_SEND_CPL",
+		"IDMA_FL_SEND_PAYLOAD_FIRST",
+		"IDMA_FL_SEND_PAYLOAD",
+		"IDMA_FL_REQ_NEXT_DATA_FL",
+		"IDMA_FL_SEND_NEXT_PCIEHDR",
+		"IDMA_FL_SEND_PADDING",
+		"IDMA_FL_SEND_COMPLETION_TO_IMSG",
+	};
+	static const u32 sge_regs[] = {
+		SGE_DEBUG_DATA_LOW_INDEX_2,
+		SGE_DEBUG_DATA_LOW_INDEX_3,
+		SGE_DEBUG_DATA_HIGH_INDEX_10,
+	};
+	const char **sge_idma_decode;
+	int sge_idma_decode_nstates;
+	int i;
+
+	if (is_t4(adapter->params.chip)) {
+		sge_idma_decode = (const char **)t4_decode;
+		sge_idma_decode_nstates = ARRAY_SIZE(t4_decode);
+	} else {
+		sge_idma_decode = (const char **)t5_decode;
+		sge_idma_decode_nstates = ARRAY_SIZE(t5_decode);
+	}
+
+	if (state < sge_idma_decode_nstates)
+		CH_WARN(adapter, "idma state %s\n", sge_idma_decode[state]);
+	else
+		CH_WARN(adapter, "idma state %d unknown\n", state);
+
+	for (i = 0; i < ARRAY_SIZE(sge_regs); i++)
+		CH_WARN(adapter, "SGE register %#x value %#x\n",
+			sge_regs[i], t4_read_reg(adapter, sge_regs[i]));
+}
+
+/**
+ *      t4_fw_hello - establish communication with FW
+ *      @adap: the adapter
+ *      @mbox: mailbox to use for the FW command
+ *      @evt_mbox: mailbox to receive async FW events
+ *      @master: specifies the caller's willingness to be the device master
+ *	@state: returns the current device state (if non-NULL)
+ *
+ *	Issues a command to establish communication with FW.  Returns either
+ *	an error (negative integer) or the mailbox of the Master PF.
+ */
+int t4_fw_hello(struct adapter *adap, unsigned int mbox, unsigned int evt_mbox,
+		enum dev_master master, enum dev_state *state)
+{
+	int ret;
+	struct fw_hello_cmd c;
+	u32 v;
+	unsigned int master_mbox;
+	int retries = FW_CMD_HELLO_RETRIES;
+
+retry:
+	memset(&c, 0, sizeof(c));
+	INIT_CMD(c, HELLO, WRITE);
+	c.err_to_clearinit = htonl(
+		FW_HELLO_CMD_MASTERDIS(master == MASTER_CANT) |
+		FW_HELLO_CMD_MASTERFORCE(master == MASTER_MUST) |
+		FW_HELLO_CMD_MBMASTER(master == MASTER_MUST ? mbox :
+				      FW_HELLO_CMD_MBMASTER_MASK) |
+		FW_HELLO_CMD_MBASYNCNOT(evt_mbox) |
+		FW_HELLO_CMD_STAGE(fw_hello_cmd_stage_os) |
+		FW_HELLO_CMD_CLEARINIT);
+
+	/*
+	 * Issue the HELLO command to the firmware.  If it's not successful
+	 * but indicates that we got a "busy" or "timeout" condition, retry
+	 * the HELLO until we exhaust our retry limit.  If we do exceed our
+	 * retry limit, check to see if the firmware left us any error
+	 * information and report that if so.
+	 */
+	ret = t4_wr_mbox(adap, mbox, &c, sizeof(c), &c);
+	if (ret < 0) {
+		if ((ret == -EBUSY || ret == -ETIMEDOUT) && retries-- > 0)
+			goto retry;
+		if (t4_read_reg(adap, MA_PCIE_FW) & FW_PCIE_FW_ERR)
+			t4_report_fw_error(adap);
+		return ret;
+	}
+
+	v = ntohl(c.err_to_clearinit);
+	master_mbox = FW_HELLO_CMD_MBMASTER_GET(v);
+	if (state) {
+		if (v & FW_HELLO_CMD_ERR)
+			*state = DEV_STATE_ERR;
+		else if (v & FW_HELLO_CMD_INIT)
+			*state = DEV_STATE_INIT;
+		else
+			*state = DEV_STATE_UNINIT;
+	}
+
+	/*
+	 * If we're not the Master PF then we need to wait around for the
+	 * Master PF Driver to finish setting up the adapter.
+	 *
+	 * Note that we also do this wait if we're a non-Master-capable PF and
+	 * there is no current Master PF; a Master PF may show up momentarily
+	 * and we wouldn't want to fail pointlessly.  (This can happen when an
+	 * OS loads lots of different drivers rapidly at the same time).  In
+	 * this case, the Master PF returned by the firmware will be
+	 * FW_PCIE_FW_MASTER_MASK so the test below will work ...
+	 */
+	if ((v & (FW_HELLO_CMD_ERR|FW_HELLO_CMD_INIT)) == 0 &&
+	    master_mbox != mbox) {
+		int waiting = FW_CMD_HELLO_TIMEOUT;
+
+		/*
+		 * Wait for the firmware to either indicate an error or
+		 * initialized state.  If we see either of these we bail out
+		 * and report the issue to the caller.  If we exhaust the
+		 * "hello timeout" and we haven't exhausted our retries, try
+		 * again.  Otherwise bail with a timeout error.
+		 */
+		for (;;) {
+			u32 pcie_fw;
+
+			msleep(50);
+			waiting -= 50;
+
+			/*
+			 * If neither Error nor Initialialized are indicated
+			 * by the firmware keep waiting till we exaust our
+			 * timeout ... and then retry if we haven't exhausted
+			 * our retries ...
+			 */
+			pcie_fw = t4_read_reg(adap, MA_PCIE_FW);
+			if (!(pcie_fw & (FW_PCIE_FW_ERR|FW_PCIE_FW_INIT))) {
+				if (waiting <= 0) {
+					if (retries-- > 0)
+						goto retry;
+
+					return -ETIMEDOUT;
+				}
+				continue;
+			}
+
+			/*
+			 * We either have an Error or Initialized condition
+			 * report errors preferentially.
+			 */
+			if (state) {
+				if (pcie_fw & FW_PCIE_FW_ERR)
+					*state = DEV_STATE_ERR;
+				else if (pcie_fw & FW_PCIE_FW_INIT)
+					*state = DEV_STATE_INIT;
+			}
+
+			/*
+			 * If we arrived before a Master PF was selected and
+			 * there's not a valid Master PF, grab its identity
+			 * for our caller.
+			 */
+			if (master_mbox == FW_PCIE_FW_MASTER_MASK &&
+			    (pcie_fw & FW_PCIE_FW_MASTER_VLD))
+				master_mbox = FW_PCIE_FW_MASTER_GET(pcie_fw);
+			break;
+		}
+	}
+
+	return master_mbox;
+}
+
+/**
+ *	t4_fw_bye - end communication with FW
+ *	@adap: the adapter
+ *	@mbox: mailbox to use for the FW command
+ *
+ *	Issues a command to terminate communication with FW.
+ */
+int t4_fw_bye(struct adapter *adap, unsigned int mbox)
+{
+	struct fw_bye_cmd c;
+
+	memset(&c, 0, sizeof(c));
+	INIT_CMD(c, BYE, WRITE);
+	return t4_wr_mbox(adap, mbox, &c, sizeof(c), NULL);
+}
+
+/**
+ *	t4_init_cmd - ask FW to initialize the device
+ *	@adap: the adapter
+ *	@mbox: mailbox to use for the FW command
+ *
+ *	Issues a command to FW to partially initialize the device.  This
+ *	performs initialization that generally doesn't depend on user input.
+ */
+int t4_early_init(struct adapter *adap, unsigned int mbox)
+{
+	struct fw_initialize_cmd c;
+
+	memset(&c, 0, sizeof(c));
+	INIT_CMD(c, INITIALIZE, WRITE);
+	return t4_wr_mbox(adap, mbox, &c, sizeof(c), NULL);
+}
+
+/**
+ *	t4_fw_reset - issue a reset to FW
+ *	@adap: the adapter
+ *	@mbox: mailbox to use for the FW command
+ *	@reset: specifies the type of reset to perform
+ *
+ *	Issues a reset command of the specified type to FW.
+ */
+int t4_fw_reset(struct adapter *adap, unsigned int mbox, int reset)
+{
+	struct fw_reset_cmd c;
+
+	memset(&c, 0, sizeof(c));
+	INIT_CMD(c, RESET, WRITE);
+	c.val = htonl(reset);
+	return t4_wr_mbox(adap, mbox, &c, sizeof(c), NULL);
+}
+
+/**
+ *	t4_fw_halt - issue a reset/halt to FW and put uP into RESET
+ *	@adap: the adapter
+ *	@mbox: mailbox to use for the FW RESET command (if desired)
+ *	@force: force uP into RESET even if FW RESET command fails
+ *
+ *	Issues a RESET command to firmware (if desired) with a HALT indication
+ *	and then puts the microprocessor into RESET state.  The RESET command
+ *	will only be issued if a legitimate mailbox is provided (mbox <=
+ *	FW_PCIE_FW_MASTER_MASK).
+ *
+ *	This is generally used in order for the host to safely manipulate the
+ *	adapter without fear of conflicting with whatever the firmware might
+ *	be doing.  The only way out of this state is to RESTART the firmware
+ *	...
+ */
+static int t4_fw_halt(struct adapter *adap, unsigned int mbox, int force)
+{
+	int ret = 0;
+
+	/*
+	 * If a legitimate mailbox is provided, issue a RESET command
+	 * with a HALT indication.
+	 */
+	if (mbox <= FW_PCIE_FW_MASTER_MASK) {
+		struct fw_reset_cmd c;
+
+		memset(&c, 0, sizeof(c));
+		INIT_CMD(c, RESET, WRITE);
+		c.val = htonl(PIORST | PIORSTMODE);
+		c.halt_pkd = htonl(FW_RESET_CMD_HALT(1U));
+		ret = t4_wr_mbox(adap, mbox, &c, sizeof(c), NULL);
+	}
+
+	/*
+	 * Normally we won't complete the operation if the firmware RESET
+	 * command fails but if our caller insists we'll go ahead and put the
+	 * uP into RESET.  This can be useful if the firmware is hung or even
+	 * missing ...  We'll have to take the risk of putting the uP into
+	 * RESET without the cooperation of firmware in that case.
+	 *
+	 * We also force the firmware's HALT flag to be on in case we bypassed
+	 * the firmware RESET command above or we're dealing with old firmware
+	 * which doesn't have the HALT capability.  This will serve as a flag
+	 * for the incoming firmware to know that it's coming out of a HALT
+	 * rather than a RESET ... if it's new enough to understand that ...
+	 */
+	if (ret == 0 || force) {
+		t4_set_reg_field(adap, CIM_BOOT_CFG, UPCRST, UPCRST);
+		t4_set_reg_field(adap, PCIE_FW, FW_PCIE_FW_HALT,
+				 FW_PCIE_FW_HALT);
+	}
+
+	/*
+	 * And we always return the result of the firmware RESET command
+	 * even when we force the uP into RESET ...
+	 */
+	return ret;
+}
+
+/**
+ *	t4_fw_restart - restart the firmware by taking the uP out of RESET
+ *	@adap: the adapter
+ *	@reset: if we want to do a RESET to restart things
+ *
+ *	Restart firmware previously halted by t4_fw_halt().  On successful
+ *	return the previous PF Master remains as the new PF Master and there
+ *	is no need to issue a new HELLO command, etc.
+ *
+ *	We do this in two ways:
+ *
+ *	 1. If we're dealing with newer firmware we'll simply want to take
+ *	    the chip's microprocessor out of RESET.  This will cause the
+ *	    firmware to start up from its start vector.  And then we'll loop
+ *	    until the firmware indicates it's started again (PCIE_FW.HALT
+ *	    reset to 0) or we timeout.
+ *
+ *	 2. If we're dealing with older firmware then we'll need to RESET
+ *	    the chip since older firmware won't recognize the PCIE_FW.HALT
+ *	    flag and automatically RESET itself on startup.
+ */
+static int t4_fw_restart(struct adapter *adap, unsigned int mbox, int reset)
+{
+	if (reset) {
+		/*
+		 * Since we're directing the RESET instead of the firmware
+		 * doing it automatically, we need to clear the PCIE_FW.HALT
+		 * bit.
+		 */
+		t4_set_reg_field(adap, PCIE_FW, FW_PCIE_FW_HALT, 0);
+
+		/*
+		 * If we've been given a valid mailbox, first try to get the
+		 * firmware to do the RESET.  If that works, great and we can
+		 * return success.  Otherwise, if we haven't been given a
+		 * valid mailbox or the RESET command failed, fall back to
+		 * hitting the chip with a hammer.
+		 */
+		if (mbox <= FW_PCIE_FW_MASTER_MASK) {
+			t4_set_reg_field(adap, CIM_BOOT_CFG, UPCRST, 0);
+			msleep(100);
+			if (t4_fw_reset(adap, mbox,
+					PIORST | PIORSTMODE) == 0)
+				return 0;
+		}
+
+		t4_write_reg(adap, PL_RST, PIORST | PIORSTMODE);
+		msleep(2000);
+	} else {
+		int ms;
+
+		t4_set_reg_field(adap, CIM_BOOT_CFG, UPCRST, 0);
+		for (ms = 0; ms < FW_CMD_MAX_TIMEOUT; ) {
+			if (!(t4_read_reg(adap, PCIE_FW) & FW_PCIE_FW_HALT))
+				return 0;
+			msleep(100);
+			ms += 100;
+		}
+		return -ETIMEDOUT;
+	}
+	return 0;
+}
+
+/**
+ *	t4_fw_upgrade - perform all of the steps necessary to upgrade FW
+ *	@adap: the adapter
+ *	@mbox: mailbox to use for the FW RESET command (if desired)
+ *	@fw_data: the firmware image to write
+ *	@size: image size
+ *	@force: force upgrade even if firmware doesn't cooperate
+ *
+ *	Perform all of the steps necessary for upgrading an adapter's
+ *	firmware image.  Normally this requires the cooperation of the
+ *	existing firmware in order to halt all existing activities
+ *	but if an invalid mailbox token is passed in we skip that step
+ *	(though we'll still put the adapter microprocessor into RESET in
+ *	that case).
+ *
+ *	On successful return the new firmware will have been loaded and
+ *	the adapter will have been fully RESET losing all previous setup
+ *	state.  On unsuccessful return the adapter may be completely hosed ...
+ *	positive errno indicates that the adapter is ~probably~ intact, a
+ *	negative errno indicates that things are looking bad ...
+ */
+int t4_fw_upgrade(struct adapter *adap, unsigned int mbox,
+		  const u8 *fw_data, unsigned int size, int force)
+{
+	const struct fw_hdr *fw_hdr = (const struct fw_hdr *)fw_data;
+	int reset, ret;
+
+	ret = t4_fw_halt(adap, mbox, force);
+	if (ret < 0 && !force)
+		return ret;
+
+	ret = t4_load_fw(adap, fw_data, size);
+	if (ret < 0)
+		return ret;
+
+	/*
+	 * Older versions of the firmware don't understand the new
+	 * PCIE_FW.HALT flag and so won't know to perform a RESET when they
+	 * restart.  So for newly loaded older firmware we'll have to do the
+	 * RESET for it so it starts up on a clean slate.  We can tell if
+	 * the newly loaded firmware will handle this right by checking
+	 * its header flags to see if it advertises the capability.
+	 */
+	reset = ((ntohl(fw_hdr->flags) & FW_HDR_FLAGS_RESET_HALT) == 0);
+	return t4_fw_restart(adap, mbox, reset);
+}
+
+/**
+ *	t4_fixup_host_params - fix up host-dependent parameters
+ *	@adap: the adapter
+ *	@page_size: the host's Base Page Size
+ *	@cache_line_size: the host's Cache Line Size
+ *
+ *	Various registers in T4 contain values which are dependent on the
+ *	host's Base Page and Cache Line Sizes.  This function will fix all of
+ *	those registers with the appropriate values as passed in ...
+ */
+int t4_fixup_host_params(struct adapter *adap, unsigned int page_size,
+			 unsigned int cache_line_size)
+{
+	unsigned int page_shift = fls(page_size) - 1;
+	unsigned int sge_hps = page_shift - 10;
+	unsigned int stat_len = cache_line_size > 64 ? 128 : 64;
+	unsigned int fl_align = cache_line_size < 32 ? 32 : cache_line_size;
+	unsigned int fl_align_log = fls(fl_align) - 1;
+
+	t4_write_reg(adap, SGE_HOST_PAGE_SIZE,
+		     HOSTPAGESIZEPF0(sge_hps) |
+		     HOSTPAGESIZEPF1(sge_hps) |
+		     HOSTPAGESIZEPF2(sge_hps) |
+		     HOSTPAGESIZEPF3(sge_hps) |
+		     HOSTPAGESIZEPF4(sge_hps) |
+		     HOSTPAGESIZEPF5(sge_hps) |
+		     HOSTPAGESIZEPF6(sge_hps) |
+		     HOSTPAGESIZEPF7(sge_hps));
+
+	if (is_t4(adap->params.chip)) {
+		t4_set_reg_field(adap, SGE_CONTROL,
+				 INGPADBOUNDARY_MASK |
+				 EGRSTATUSPAGESIZE_MASK,
+				 INGPADBOUNDARY(fl_align_log - 5) |
+				 EGRSTATUSPAGESIZE(stat_len != 64));
+	} else {
+		/* T5 introduced the separation of the Free List Padding and
+		 * Packing Boundaries.  Thus, we can select a smaller Padding
+		 * Boundary to avoid uselessly chewing up PCIe Link and Memory
+		 * Bandwidth, and use a Packing Boundary which is large enough
+		 * to avoid false sharing between CPUs, etc.
+		 *
+		 * For the PCI Link, the smaller the Padding Boundary the
+		 * better.  For the Memory Controller, a smaller Padding
+		 * Boundary is better until we cross under the Memory Line
+		 * Size (the minimum unit of transfer to/from Memory).  If we
+		 * have a Padding Boundary which is smaller than the Memory
+		 * Line Size, that'll involve a Read-Modify-Write cycle on the
+		 * Memory Controller which is never good.  For T5 the smallest
+		 * Padding Boundary which we can select is 32 bytes which is
+		 * larger than any known Memory Controller Line Size so we'll
+		 * use that.
+		 *
+		 * T5 has a different interpretation of the "0" value for the
+		 * Packing Boundary.  This corresponds to 16 bytes instead of
+		 * the expected 32 bytes.  We never have a Packing Boundary
+		 * less than 32 bytes so we can't use that special value but
+		 * on the other hand, if we wanted 32 bytes, the best we can
+		 * really do is 64 bytes.
+		*/
+		if (fl_align <= 32) {
+			fl_align = 64;
+			fl_align_log = 6;
+		}
+		t4_set_reg_field(adap, SGE_CONTROL,
+				 INGPADBOUNDARY_MASK |
+				 EGRSTATUSPAGESIZE_MASK,
+				 INGPADBOUNDARY(INGPCIEBOUNDARY_32B_X) |
+				 EGRSTATUSPAGESIZE(stat_len != 64));
+		t4_set_reg_field(adap, SGE_CONTROL2_A,
+				 INGPACKBOUNDARY_V(INGPACKBOUNDARY_M),
+				 INGPACKBOUNDARY_V(fl_align_log -
+						 INGPACKBOUNDARY_SHIFT_X));
+	}
+	/*
+	 * Adjust various SGE Free List Host Buffer Sizes.
+	 *
+	 * This is something of a crock since we're using fixed indices into
+	 * the array which are also known by the sge.c code and the T4
+	 * Firmware Configuration File.  We need to come up with a much better
+	 * approach to managing this array.  For now, the first four entries
+	 * are:
+	 *
+	 *   0: Host Page Size
+	 *   1: 64KB
+	 *   2: Buffer size corresponding to 1500 byte MTU (unpacked mode)
+	 *   3: Buffer size corresponding to 9000 byte MTU (unpacked mode)
+	 *
+	 * For the single-MTU buffers in unpacked mode we need to include
+	 * space for the SGE Control Packet Shift, 14 byte Ethernet header,
+	 * possible 4 byte VLAN tag, all rounded up to the next Ingress Packet
+	 * Padding boundry.  All of these are accommodated in the Factory
+	 * Default Firmware Configuration File but we need to adjust it for
+	 * this host's cache line size.
+	 */
+	t4_write_reg(adap, SGE_FL_BUFFER_SIZE0, page_size);
+	t4_write_reg(adap, SGE_FL_BUFFER_SIZE2,
+		     (t4_read_reg(adap, SGE_FL_BUFFER_SIZE2) + fl_align-1)
+		     & ~(fl_align-1));
+	t4_write_reg(adap, SGE_FL_BUFFER_SIZE3,
+		     (t4_read_reg(adap, SGE_FL_BUFFER_SIZE3) + fl_align-1)
+		     & ~(fl_align-1));
+
+	t4_write_reg(adap, ULP_RX_TDDP_PSZ, HPZ0(page_shift - 12));
+
+	return 0;
+}
+
+/**
+ *	t4_fw_initialize - ask FW to initialize the device
+ *	@adap: the adapter
+ *	@mbox: mailbox to use for the FW command
+ *
+ *	Issues a command to FW to partially initialize the device.  This
+ *	performs initialization that generally doesn't depend on user input.
+ */
+int t4_fw_initialize(struct adapter *adap, unsigned int mbox)
+{
+	struct fw_initialize_cmd c;
+
+	memset(&c, 0, sizeof(c));
+	INIT_CMD(c, INITIALIZE, WRITE);
+	return t4_wr_mbox(adap, mbox, &c, sizeof(c), NULL);
+}
+
+/**
+ *	t4_query_params - query FW or device parameters
+ *	@adap: the adapter
+ *	@mbox: mailbox to use for the FW command
+ *	@pf: the PF
+ *	@vf: the VF
+ *	@nparams: the number of parameters
+ *	@params: the parameter names
+ *	@val: the parameter values
+ *
+ *	Reads the value of FW or device parameters.  Up to 7 parameters can be
+ *	queried at once.
+ */
+int t4_query_params(struct adapter *adap, unsigned int mbox, unsigned int pf,
+		    unsigned int vf, unsigned int nparams, const u32 *params,
+		    u32 *val)
+{
+	int i, ret;
+	struct fw_params_cmd c;
+	__be32 *p = &c.param[0].mnem;
+
+	if (nparams > 7)
+		return -EINVAL;
+
+	memset(&c, 0, sizeof(c));
+	c.op_to_vfn = htonl(FW_CMD_OP(FW_PARAMS_CMD) | FW_CMD_REQUEST |
+			    FW_CMD_READ | FW_PARAMS_CMD_PFN(pf) |
+			    FW_PARAMS_CMD_VFN(vf));
+	c.retval_len16 = htonl(FW_LEN16(c));
+	for (i = 0; i < nparams; i++, p += 2)
+		*p = htonl(*params++);
+
+	ret = t4_wr_mbox(adap, mbox, &c, sizeof(c), &c);
+	if (ret == 0)
+		for (i = 0, p = &c.param[0].val; i < nparams; i++, p += 2)
+			*val++ = ntohl(*p);
+	return ret;
+}
+
+/**
+ *      t4_set_params_nosleep - sets FW or device parameters
+ *      @adap: the adapter
+ *      @mbox: mailbox to use for the FW command
+ *      @pf: the PF
+ *      @vf: the VF
+ *      @nparams: the number of parameters
+ *      @params: the parameter names
+ *      @val: the parameter values
+ *
+ *	 Does not ever sleep
+ *      Sets the value of FW or device parameters.  Up to 7 parameters can be
+ *      specified at once.
+ */
+int t4_set_params_nosleep(struct adapter *adap, unsigned int mbox,
+			  unsigned int pf, unsigned int vf,
+			  unsigned int nparams, const u32 *params,
+			  const u32 *val)
+{
+	struct fw_params_cmd c;
+	__be32 *p = &c.param[0].mnem;
+
+	if (nparams > 7)
+		return -EINVAL;
+
+	memset(&c, 0, sizeof(c));
+	c.op_to_vfn = cpu_to_be32(FW_CMD_OP(FW_PARAMS_CMD) |
+				FW_CMD_REQUEST | FW_CMD_WRITE |
+				FW_PARAMS_CMD_PFN(pf) |
+				FW_PARAMS_CMD_VFN(vf));
+	c.retval_len16 = cpu_to_be32(FW_LEN16(c));
+
+	while (nparams--) {
+		*p++ = cpu_to_be32(*params++);
+		*p++ = cpu_to_be32(*val++);
+	}
+
+	return t4_wr_mbox_ns(adap, mbox, &c, sizeof(c), NULL);
+}
+
+/**
+ *	t4_set_params - sets FW or device parameters
+ *	@adap: the adapter
+ *	@mbox: mailbox to use for the FW command
+ *	@pf: the PF
+ *	@vf: the VF
+ *	@nparams: the number of parameters
+ *	@params: the parameter names
+ *	@val: the parameter values
+ *
+ *	Sets the value of FW or device parameters.  Up to 7 parameters can be
+ *	specified at once.
+ */
+int t4_set_params(struct adapter *adap, unsigned int mbox, unsigned int pf,
+		  unsigned int vf, unsigned int nparams, const u32 *params,
+		  const u32 *val)
+{
+	struct fw_params_cmd c;
+	__be32 *p = &c.param[0].mnem;
+
+	if (nparams > 7)
+		return -EINVAL;
+
+	memset(&c, 0, sizeof(c));
+	c.op_to_vfn = htonl(FW_CMD_OP(FW_PARAMS_CMD) | FW_CMD_REQUEST |
+			    FW_CMD_WRITE | FW_PARAMS_CMD_PFN(pf) |
+			    FW_PARAMS_CMD_VFN(vf));
+	c.retval_len16 = htonl(FW_LEN16(c));
+	while (nparams--) {
+		*p++ = htonl(*params++);
+		*p++ = htonl(*val++);
+	}
+
+	return t4_wr_mbox(adap, mbox, &c, sizeof(c), NULL);
+}
+
+/**
+ *	t4_cfg_pfvf - configure PF/VF resource limits
+ *	@adap: the adapter
+ *	@mbox: mailbox to use for the FW command
+ *	@pf: the PF being configured
+ *	@vf: the VF being configured
+ *	@txq: the max number of egress queues
+ *	@txq_eth_ctrl: the max number of egress Ethernet or control queues
+ *	@rxqi: the max number of interrupt-capable ingress queues
+ *	@rxq: the max number of interruptless ingress queues
+ *	@tc: the PCI traffic class
+ *	@vi: the max number of virtual interfaces
+ *	@cmask: the channel access rights mask for the PF/VF
+ *	@pmask: the port access rights mask for the PF/VF
+ *	@nexact: the maximum number of exact MPS filters
+ *	@rcaps: read capabilities
+ *	@wxcaps: write/execute capabilities
+ *
+ *	Configures resource limits and capabilities for a physical or virtual
+ *	function.
+ */
+int t4_cfg_pfvf(struct adapter *adap, unsigned int mbox, unsigned int pf,
+		unsigned int vf, unsigned int txq, unsigned int txq_eth_ctrl,
+		unsigned int rxqi, unsigned int rxq, unsigned int tc,
+		unsigned int vi, unsigned int cmask, unsigned int pmask,
+		unsigned int nexact, unsigned int rcaps, unsigned int wxcaps)
+{
+	struct fw_pfvf_cmd c;
+
+	memset(&c, 0, sizeof(c));
+	c.op_to_vfn = htonl(FW_CMD_OP(FW_PFVF_CMD) | FW_CMD_REQUEST |
+			    FW_CMD_WRITE | FW_PFVF_CMD_PFN(pf) |
+			    FW_PFVF_CMD_VFN(vf));
+	c.retval_len16 = htonl(FW_LEN16(c));
+	c.niqflint_niq = htonl(FW_PFVF_CMD_NIQFLINT(rxqi) |
+			       FW_PFVF_CMD_NIQ(rxq));
+	c.type_to_neq = htonl(FW_PFVF_CMD_CMASK(cmask) |
+			       FW_PFVF_CMD_PMASK(pmask) |
+			       FW_PFVF_CMD_NEQ(txq));
+	c.tc_to_nexactf = htonl(FW_PFVF_CMD_TC(tc) | FW_PFVF_CMD_NVI(vi) |
+				FW_PFVF_CMD_NEXACTF(nexact));
+	c.r_caps_to_nethctrl = htonl(FW_PFVF_CMD_R_CAPS(rcaps) |
+				     FW_PFVF_CMD_WX_CAPS(wxcaps) |
+				     FW_PFVF_CMD_NETHCTRL(txq_eth_ctrl));
+	return t4_wr_mbox(adap, mbox, &c, sizeof(c), NULL);
+}
+
+/**
+ *	t4_alloc_vi - allocate a virtual interface
+ *	@adap: the adapter
+ *	@mbox: mailbox to use for the FW command
+ *	@port: physical port associated with the VI
+ *	@pf: the PF owning the VI
+ *	@vf: the VF owning the VI
+ *	@nmac: number of MAC addresses needed (1 to 5)
+ *	@mac: the MAC addresses of the VI
+ *	@rss_size: size of RSS table slice associated with this VI
+ *
+ *	Allocates a virtual interface for the given physical port.  If @mac is
+ *	not %NULL it contains the MAC addresses of the VI as assigned by FW.
+ *	@mac should be large enough to hold @nmac Ethernet addresses, they are
+ *	stored consecutively so the space needed is @nmac * 6 bytes.
+ *	Returns a negative error number or the non-negative VI id.
+ */
+int t4_alloc_vi(struct adapter *adap, unsigned int mbox, unsigned int port,
+		unsigned int pf, unsigned int vf, unsigned int nmac, u8 *mac,
+		unsigned int *rss_size)
+{
+	int ret;
+	struct fw_vi_cmd c;
+
+	memset(&c, 0, sizeof(c));
+	c.op_to_vfn = htonl(FW_CMD_OP(FW_VI_CMD) | FW_CMD_REQUEST |
+			    FW_CMD_WRITE | FW_CMD_EXEC |
+			    FW_VI_CMD_PFN(pf) | FW_VI_CMD_VFN(vf));
+	c.alloc_to_len16 = htonl(FW_VI_CMD_ALLOC | FW_LEN16(c));
+	c.portid_pkd = FW_VI_CMD_PORTID(port);
+	c.nmac = nmac - 1;
+
+	ret = t4_wr_mbox(adap, mbox, &c, sizeof(c), &c);
+	if (ret)
+		return ret;
+
+	if (mac) {
+		memcpy(mac, c.mac, sizeof(c.mac));
+		switch (nmac) {
+		case 5:
+			memcpy(mac + 24, c.nmac3, sizeof(c.nmac3));
+		case 4:
+			memcpy(mac + 18, c.nmac2, sizeof(c.nmac2));
+		case 3:
+			memcpy(mac + 12, c.nmac1, sizeof(c.nmac1));
+		case 2:
+			memcpy(mac + 6,  c.nmac0, sizeof(c.nmac0));
+		}
+	}
+	if (rss_size)
+		*rss_size = FW_VI_CMD_RSSSIZE_GET(ntohs(c.rsssize_pkd));
+	return FW_VI_CMD_VIID_GET(ntohs(c.type_viid));
+}
+
+/**
+ *	t4_set_rxmode - set Rx properties of a virtual interface
+ *	@adap: the adapter
+ *	@mbox: mailbox to use for the FW command
+ *	@viid: the VI id
+ *	@mtu: the new MTU or -1
+ *	@promisc: 1 to enable promiscuous mode, 0 to disable it, -1 no change
+ *	@all_multi: 1 to enable all-multi mode, 0 to disable it, -1 no change
+ *	@bcast: 1 to enable broadcast Rx, 0 to disable it, -1 no change
+ *	@vlanex: 1 to enable HW VLAN extraction, 0 to disable it, -1 no change
+ *	@sleep_ok: if true we may sleep while awaiting command completion
+ *
+ *	Sets Rx properties of a virtual interface.
+ */
+int t4_set_rxmode(struct adapter *adap, unsigned int mbox, unsigned int viid,
+		  int mtu, int promisc, int all_multi, int bcast, int vlanex,
+		  bool sleep_ok)
+{
+	struct fw_vi_rxmode_cmd c;
+
+	/* convert to FW values */
+	if (mtu < 0)
+		mtu = FW_RXMODE_MTU_NO_CHG;
+	if (promisc < 0)
+		promisc = FW_VI_RXMODE_CMD_PROMISCEN_MASK;
+	if (all_multi < 0)
+		all_multi = FW_VI_RXMODE_CMD_ALLMULTIEN_MASK;
+	if (bcast < 0)
+		bcast = FW_VI_RXMODE_CMD_BROADCASTEN_MASK;
+	if (vlanex < 0)
+		vlanex = FW_VI_RXMODE_CMD_VLANEXEN_MASK;
+
+	memset(&c, 0, sizeof(c));
+	c.op_to_viid = htonl(FW_CMD_OP(FW_VI_RXMODE_CMD) | FW_CMD_REQUEST |
+			     FW_CMD_WRITE | FW_VI_RXMODE_CMD_VIID(viid));
+	c.retval_len16 = htonl(FW_LEN16(c));
+	c.mtu_to_vlanexen = htonl(FW_VI_RXMODE_CMD_MTU(mtu) |
+				  FW_VI_RXMODE_CMD_PROMISCEN(promisc) |
+				  FW_VI_RXMODE_CMD_ALLMULTIEN(all_multi) |
+				  FW_VI_RXMODE_CMD_BROADCASTEN(bcast) |
+				  FW_VI_RXMODE_CMD_VLANEXEN(vlanex));
+	return t4_wr_mbox_meat(adap, mbox, &c, sizeof(c), NULL, sleep_ok);
+}
+
+/**
+ *	t4_alloc_mac_filt - allocates exact-match filters for MAC addresses
+ *	@adap: the adapter
+ *	@mbox: mailbox to use for the FW command
+ *	@viid: the VI id
+ *	@free: if true any existing filters for this VI id are first removed
+ *	@naddr: the number of MAC addresses to allocate filters for (up to 7)
+ *	@addr: the MAC address(es)
+ *	@idx: where to store the index of each allocated filter
+ *	@hash: pointer to hash address filter bitmap
+ *	@sleep_ok: call is allowed to sleep
+ *
+ *	Allocates an exact-match filter for each of the supplied addresses and
+ *	sets it to the corresponding address.  If @idx is not %NULL it should
+ *	have at least @naddr entries, each of which will be set to the index of
+ *	the filter allocated for the corresponding MAC address.  If a filter
+ *	could not be allocated for an address its index is set to 0xffff.
+ *	If @hash is not %NULL addresses that fail to allocate an exact filter
+ *	are hashed and update the hash filter bitmap pointed at by @hash.
+ *
+ *	Returns a negative error number or the number of filters allocated.
+ */
+int t4_alloc_mac_filt(struct adapter *adap, unsigned int mbox,
+		      unsigned int viid, bool free, unsigned int naddr,
+		      const u8 **addr, u16 *idx, u64 *hash, bool sleep_ok)
+{
+	int i, ret;
+	struct fw_vi_mac_cmd c;
+	struct fw_vi_mac_exact *p;
+	unsigned int max_naddr = is_t4(adap->params.chip) ?
+				       NUM_MPS_CLS_SRAM_L_INSTANCES :
+				       NUM_MPS_T5_CLS_SRAM_L_INSTANCES;
+
+	if (naddr > 7)
+		return -EINVAL;
+
+	memset(&c, 0, sizeof(c));
+	c.op_to_viid = htonl(FW_CMD_OP(FW_VI_MAC_CMD) | FW_CMD_REQUEST |
+			     FW_CMD_WRITE | (free ? FW_CMD_EXEC : 0) |
+			     FW_VI_MAC_CMD_VIID(viid));
+	c.freemacs_to_len16 = htonl(FW_VI_MAC_CMD_FREEMACS(free) |
+				    FW_CMD_LEN16((naddr + 2) / 2));
+
+	for (i = 0, p = c.u.exact; i < naddr; i++, p++) {
+		p->valid_to_idx = htons(FW_VI_MAC_CMD_VALID |
+				      FW_VI_MAC_CMD_IDX(FW_VI_MAC_ADD_MAC));
+		memcpy(p->macaddr, addr[i], sizeof(p->macaddr));
+	}
+
+	ret = t4_wr_mbox_meat(adap, mbox, &c, sizeof(c), &c, sleep_ok);
+	if (ret)
+		return ret;
+
+	for (i = 0, p = c.u.exact; i < naddr; i++, p++) {
+		u16 index = FW_VI_MAC_CMD_IDX_GET(ntohs(p->valid_to_idx));
+
+		if (idx)
+			idx[i] = index >= max_naddr ? 0xffff : index;
+		if (index < max_naddr)
+			ret++;
+		else if (hash)
+			*hash |= (1ULL << hash_mac_addr(addr[i]));
+	}
+	return ret;
+}
+
+/**
+ *	t4_change_mac - modifies the exact-match filter for a MAC address
+ *	@adap: the adapter
+ *	@mbox: mailbox to use for the FW command
+ *	@viid: the VI id
+ *	@idx: index of existing filter for old value of MAC address, or -1
+ *	@addr: the new MAC address value
+ *	@persist: whether a new MAC allocation should be persistent
+ *	@add_smt: if true also add the address to the HW SMT
+ *
+ *	Modifies an exact-match filter and sets it to the new MAC address.
+ *	Note that in general it is not possible to modify the value of a given
+ *	filter so the generic way to modify an address filter is to free the one
+ *	being used by the old address value and allocate a new filter for the
+ *	new address value.  @idx can be -1 if the address is a new addition.
+ *
+ *	Returns a negative error number or the index of the filter with the new
+ *	MAC value.
+ */
+int t4_change_mac(struct adapter *adap, unsigned int mbox, unsigned int viid,
+		  int idx, const u8 *addr, bool persist, bool add_smt)
+{
+	int ret, mode;
+	struct fw_vi_mac_cmd c;
+	struct fw_vi_mac_exact *p = c.u.exact;
+	unsigned int max_mac_addr = is_t4(adap->params.chip) ?
+				    NUM_MPS_CLS_SRAM_L_INSTANCES :
+				    NUM_MPS_T5_CLS_SRAM_L_INSTANCES;
+
+	if (idx < 0)                             /* new allocation */
+		idx = persist ? FW_VI_MAC_ADD_PERSIST_MAC : FW_VI_MAC_ADD_MAC;
+	mode = add_smt ? FW_VI_MAC_SMT_AND_MPSTCAM : FW_VI_MAC_MPS_TCAM_ENTRY;
+
+	memset(&c, 0, sizeof(c));
+	c.op_to_viid = htonl(FW_CMD_OP(FW_VI_MAC_CMD) | FW_CMD_REQUEST |
+			     FW_CMD_WRITE | FW_VI_MAC_CMD_VIID(viid));
+	c.freemacs_to_len16 = htonl(FW_CMD_LEN16(1));
+	p->valid_to_idx = htons(FW_VI_MAC_CMD_VALID |
+				FW_VI_MAC_CMD_SMAC_RESULT(mode) |
+				FW_VI_MAC_CMD_IDX(idx));
+	memcpy(p->macaddr, addr, sizeof(p->macaddr));
+
+	ret = t4_wr_mbox(adap, mbox, &c, sizeof(c), &c);
+	if (ret == 0) {
+		ret = FW_VI_MAC_CMD_IDX_GET(ntohs(p->valid_to_idx));
+		if (ret >= max_mac_addr)
+			ret = -ENOMEM;
+	}
+	return ret;
+}
+
+/**
+ *	t4_set_addr_hash - program the MAC inexact-match hash filter
+ *	@adap: the adapter
+ *	@mbox: mailbox to use for the FW command
+ *	@viid: the VI id
+ *	@ucast: whether the hash filter should also match unicast addresses
+ *	@vec: the value to be written to the hash filter
+ *	@sleep_ok: call is allowed to sleep
+ *
+ *	Sets the 64-bit inexact-match hash filter for a virtual interface.
+ */
+int t4_set_addr_hash(struct adapter *adap, unsigned int mbox, unsigned int viid,
+		     bool ucast, u64 vec, bool sleep_ok)
+{
+	struct fw_vi_mac_cmd c;
+
+	memset(&c, 0, sizeof(c));
+	c.op_to_viid = htonl(FW_CMD_OP(FW_VI_MAC_CMD) | FW_CMD_REQUEST |
+			     FW_CMD_WRITE | FW_VI_ENABLE_CMD_VIID(viid));
+	c.freemacs_to_len16 = htonl(FW_VI_MAC_CMD_HASHVECEN |
+				    FW_VI_MAC_CMD_HASHUNIEN(ucast) |
+				    FW_CMD_LEN16(1));
+	c.u.hash.hashvec = cpu_to_be64(vec);
+	return t4_wr_mbox_meat(adap, mbox, &c, sizeof(c), NULL, sleep_ok);
+}
+
+/**
+ *      t4_enable_vi_params - enable/disable a virtual interface
+ *      @adap: the adapter
+ *      @mbox: mailbox to use for the FW command
+ *      @viid: the VI id
+ *      @rx_en: 1=enable Rx, 0=disable Rx
+ *      @tx_en: 1=enable Tx, 0=disable Tx
+ *      @dcb_en: 1=enable delivery of Data Center Bridging messages.
+ *
+ *      Enables/disables a virtual interface.  Note that setting DCB Enable
+ *      only makes sense when enabling a Virtual Interface ...
+ */
+int t4_enable_vi_params(struct adapter *adap, unsigned int mbox,
+			unsigned int viid, bool rx_en, bool tx_en, bool dcb_en)
+{
+	struct fw_vi_enable_cmd c;
+
+	memset(&c, 0, sizeof(c));
+	c.op_to_viid = htonl(FW_CMD_OP(FW_VI_ENABLE_CMD) | FW_CMD_REQUEST |
+			     FW_CMD_EXEC | FW_VI_ENABLE_CMD_VIID(viid));
+
+	c.ien_to_len16 = htonl(FW_VI_ENABLE_CMD_IEN(rx_en) |
+			       FW_VI_ENABLE_CMD_EEN(tx_en) | FW_LEN16(c) |
+			       FW_VI_ENABLE_CMD_DCB_INFO(dcb_en));
+	return t4_wr_mbox_ns(adap, mbox, &c, sizeof(c), NULL);
+}
+
+/**
+ *	t4_enable_vi - enable/disable a virtual interface
+ *	@adap: the adapter
+ *	@mbox: mailbox to use for the FW command
+ *	@viid: the VI id
+ *	@rx_en: 1=enable Rx, 0=disable Rx
+ *	@tx_en: 1=enable Tx, 0=disable Tx
+ *
+ *	Enables/disables a virtual interface.
+ */
+int t4_enable_vi(struct adapter *adap, unsigned int mbox, unsigned int viid,
+		 bool rx_en, bool tx_en)
+{
+	return t4_enable_vi_params(adap, mbox, viid, rx_en, tx_en, 0);
+}
+
+/**
+ *	t4_identify_port - identify a VI's port by blinking its LED
+ *	@adap: the adapter
+ *	@mbox: mailbox to use for the FW command
+ *	@viid: the VI id
+ *	@nblinks: how many times to blink LED at 2.5 Hz
+ *
+ *	Identifies a VI's port by blinking its LED.
+ */
+int t4_identify_port(struct adapter *adap, unsigned int mbox, unsigned int viid,
+		     unsigned int nblinks)
+{
+	struct fw_vi_enable_cmd c;
+
+	memset(&c, 0, sizeof(c));
+	c.op_to_viid = htonl(FW_CMD_OP(FW_VI_ENABLE_CMD) | FW_CMD_REQUEST |
+			     FW_CMD_EXEC | FW_VI_ENABLE_CMD_VIID(viid));
+	c.ien_to_len16 = htonl(FW_VI_ENABLE_CMD_LED | FW_LEN16(c));
+	c.blinkdur = htons(nblinks);
+	return t4_wr_mbox(adap, mbox, &c, sizeof(c), NULL);
+}
+
+/**
+ *	t4_iq_free - free an ingress queue and its FLs
+ *	@adap: the adapter
+ *	@mbox: mailbox to use for the FW command
+ *	@pf: the PF owning the queues
+ *	@vf: the VF owning the queues
+ *	@iqtype: the ingress queue type
+ *	@iqid: ingress queue id
+ *	@fl0id: FL0 queue id or 0xffff if no attached FL0
+ *	@fl1id: FL1 queue id or 0xffff if no attached FL1
+ *
+ *	Frees an ingress queue and its associated FLs, if any.
+ */
+int t4_iq_free(struct adapter *adap, unsigned int mbox, unsigned int pf,
+	       unsigned int vf, unsigned int iqtype, unsigned int iqid,
+	       unsigned int fl0id, unsigned int fl1id)
+{
+	struct fw_iq_cmd c;
+
+	memset(&c, 0, sizeof(c));
+	c.op_to_vfn = htonl(FW_CMD_OP(FW_IQ_CMD) | FW_CMD_REQUEST |
+			    FW_CMD_EXEC | FW_IQ_CMD_PFN(pf) |
+			    FW_IQ_CMD_VFN(vf));
+	c.alloc_to_len16 = htonl(FW_IQ_CMD_FREE | FW_LEN16(c));
+	c.type_to_iqandstindex = htonl(FW_IQ_CMD_TYPE(iqtype));
+	c.iqid = htons(iqid);
+	c.fl0id = htons(fl0id);
+	c.fl1id = htons(fl1id);
+	return t4_wr_mbox(adap, mbox, &c, sizeof(c), NULL);
+}
+
+/**
+ *	t4_eth_eq_free - free an Ethernet egress queue
+ *	@adap: the adapter
+ *	@mbox: mailbox to use for the FW command
+ *	@pf: the PF owning the queue
+ *	@vf: the VF owning the queue
+ *	@eqid: egress queue id
+ *
+ *	Frees an Ethernet egress queue.
+ */
+int t4_eth_eq_free(struct adapter *adap, unsigned int mbox, unsigned int pf,
+		   unsigned int vf, unsigned int eqid)
+{
+	struct fw_eq_eth_cmd c;
+
+	memset(&c, 0, sizeof(c));
+	c.op_to_vfn = htonl(FW_CMD_OP(FW_EQ_ETH_CMD) | FW_CMD_REQUEST |
+			    FW_CMD_EXEC | FW_EQ_ETH_CMD_PFN(pf) |
+			    FW_EQ_ETH_CMD_VFN(vf));
+	c.alloc_to_len16 = htonl(FW_EQ_ETH_CMD_FREE | FW_LEN16(c));
+	c.eqid_pkd = htonl(FW_EQ_ETH_CMD_EQID(eqid));
+	return t4_wr_mbox(adap, mbox, &c, sizeof(c), NULL);
+}
+
+/**
+ *	t4_ctrl_eq_free - free a control egress queue
+ *	@adap: the adapter
+ *	@mbox: mailbox to use for the FW command
+ *	@pf: the PF owning the queue
+ *	@vf: the VF owning the queue
+ *	@eqid: egress queue id
+ *
+ *	Frees a control egress queue.
+ */
+int t4_ctrl_eq_free(struct adapter *adap, unsigned int mbox, unsigned int pf,
+		    unsigned int vf, unsigned int eqid)
+{
+	struct fw_eq_ctrl_cmd c;
+
+	memset(&c, 0, sizeof(c));
+	c.op_to_vfn = htonl(FW_CMD_OP(FW_EQ_CTRL_CMD) | FW_CMD_REQUEST |
+			    FW_CMD_EXEC | FW_EQ_CTRL_CMD_PFN(pf) |
+			    FW_EQ_CTRL_CMD_VFN(vf));
+	c.alloc_to_len16 = htonl(FW_EQ_CTRL_CMD_FREE | FW_LEN16(c));
+	c.cmpliqid_eqid = htonl(FW_EQ_CTRL_CMD_EQID(eqid));
+	return t4_wr_mbox(adap, mbox, &c, sizeof(c), NULL);
+}
+
+/**
+ *	t4_ofld_eq_free - free an offload egress queue
+ *	@adap: the adapter
+ *	@mbox: mailbox to use for the FW command
+ *	@pf: the PF owning the queue
+ *	@vf: the VF owning the queue
+ *	@eqid: egress queue id
+ *
+ *	Frees a control egress queue.
+ */
+int t4_ofld_eq_free(struct adapter *adap, unsigned int mbox, unsigned int pf,
+		    unsigned int vf, unsigned int eqid)
+{
+	struct fw_eq_ofld_cmd c;
+
+	memset(&c, 0, sizeof(c));
+	c.op_to_vfn = htonl(FW_CMD_OP(FW_EQ_OFLD_CMD) | FW_CMD_REQUEST |
+			    FW_CMD_EXEC | FW_EQ_OFLD_CMD_PFN(pf) |
+			    FW_EQ_OFLD_CMD_VFN(vf));
+	c.alloc_to_len16 = htonl(FW_EQ_OFLD_CMD_FREE | FW_LEN16(c));
+	c.eqid_pkd = htonl(FW_EQ_OFLD_CMD_EQID(eqid));
+	return t4_wr_mbox(adap, mbox, &c, sizeof(c), NULL);
+}
+
+/**
+ *	t4_handle_fw_rpl - process a FW reply message
+ *	@adap: the adapter
+ *	@rpl: start of the FW message
+ *
+ *	Processes a FW message, such as link state change messages.
+ */
+int t4_handle_fw_rpl(struct adapter *adap, const __be64 *rpl)
+{
+	u8 opcode = *(const u8 *)rpl;
+
+	if (opcode == FW_PORT_CMD) {    /* link/module state change message */
+		int speed = 0, fc = 0;
+		const struct fw_port_cmd *p = (void *)rpl;
+		int chan = FW_PORT_CMD_PORTID_GET(ntohl(p->op_to_portid));
+		int port = adap->chan_map[chan];
+		struct port_info *pi = adap2pinfo(adap, port);
+		struct link_config *lc = &pi->link_cfg;
+		u32 stat = ntohl(p->u.info.lstatus_to_modtype);
+		int link_ok = (stat & FW_PORT_CMD_LSTATUS) != 0;
+		u32 mod = FW_PORT_CMD_MODTYPE_GET(stat);
+
+		if (stat & FW_PORT_CMD_RXPAUSE)
+			fc |= PAUSE_RX;
+		if (stat & FW_PORT_CMD_TXPAUSE)
+			fc |= PAUSE_TX;
+		if (stat & FW_PORT_CMD_LSPEED(FW_PORT_CAP_SPEED_100M))
+			speed = 100;
+		else if (stat & FW_PORT_CMD_LSPEED(FW_PORT_CAP_SPEED_1G))
+			speed = 1000;
+		else if (stat & FW_PORT_CMD_LSPEED(FW_PORT_CAP_SPEED_10G))
+			speed = 10000;
+		else if (stat & FW_PORT_CMD_LSPEED(FW_PORT_CAP_SPEED_40G))
+			speed = 40000;
+
+		if (link_ok != lc->link_ok || speed != lc->speed ||
+		    fc != lc->fc) {                    /* something changed */
+			lc->link_ok = link_ok;
+			lc->speed = speed;
+			lc->fc = fc;
+			lc->supported = be16_to_cpu(p->u.info.pcap);
+			t4_os_link_changed(adap, port, link_ok);
+		}
+		if (mod != pi->mod_type) {
+			pi->mod_type = mod;
+			t4_os_portmod_changed(adap, port);
+		}
+	}
+	return 0;
+}
+
+static void get_pci_mode(struct adapter *adapter, struct pci_params *p)
+{
+	u16 val;
+
+	if (pci_is_pcie(adapter->pdev)) {
+		pcie_capability_read_word(adapter->pdev, PCI_EXP_LNKSTA, &val);
+		p->speed = val & PCI_EXP_LNKSTA_CLS;
+		p->width = (val & PCI_EXP_LNKSTA_NLW) >> 4;
+	}
+}
+
+/**
+ *	init_link_config - initialize a link's SW state
+ *	@lc: structure holding the link state
+ *	@caps: link capabilities
+ *
+ *	Initializes the SW state maintained for each link, including the link's
+ *	capabilities and default speed/flow-control/autonegotiation settings.
+ */
+static void init_link_config(struct link_config *lc, unsigned int caps)
+{
+	lc->supported = caps;
+	lc->requested_speed = 0;
+	lc->speed = 0;
+	lc->requested_fc = lc->fc = PAUSE_RX | PAUSE_TX;
+	if (lc->supported & FW_PORT_CAP_ANEG) {
+		lc->advertising = lc->supported & ADVERT_MASK;
+		lc->autoneg = AUTONEG_ENABLE;
+		lc->requested_fc |= PAUSE_AUTONEG;
+	} else {
+		lc->advertising = 0;
+		lc->autoneg = AUTONEG_DISABLE;
+	}
+}
+
+#define CIM_PF_NOACCESS 0xeeeeeeee
+
+int t4_wait_dev_ready(void __iomem *regs)
+{
+	u32 whoami;
+
+	whoami = readl(regs + PL_WHOAMI);
+	if (whoami != 0xffffffff && whoami != CIM_PF_NOACCESS)
+		return 0;
+
+	msleep(500);
+	whoami = readl(regs + PL_WHOAMI);
+	return (whoami != 0xffffffff && whoami != CIM_PF_NOACCESS ? 0 : -EIO);
+}
+
+struct flash_desc {
+	u32 vendor_and_model_id;
+	u32 size_mb;
+};
+
+static int get_flash_params(struct adapter *adap)
+{
+	/* Table for non-Numonix supported flash parts.  Numonix parts are left
+	 * to the preexisting code.  All flash parts have 64KB sectors.
+	 */
+	static struct flash_desc supported_flash[] = {
+		{ 0x150201, 4 << 20 },       /* Spansion 4MB S25FL032P */
+	};
+
+	int ret;
+	u32 info;
+
+	ret = sf1_write(adap, 1, 1, 0, SF_RD_ID);
+	if (!ret)
+		ret = sf1_read(adap, 3, 0, 1, &info);
+	t4_write_reg(adap, SF_OP, 0);                    /* unlock SF */
+	if (ret)
+		return ret;
+
+	for (ret = 0; ret < ARRAY_SIZE(supported_flash); ++ret)
+		if (supported_flash[ret].vendor_and_model_id == info) {
+			adap->params.sf_size = supported_flash[ret].size_mb;
+			adap->params.sf_nsec =
+				adap->params.sf_size / SF_SEC_SIZE;
+			return 0;
+		}
+
+	if ((info & 0xff) != 0x20)             /* not a Numonix flash */
+		return -EINVAL;
+	info >>= 16;                           /* log2 of size */
+	if (info >= 0x14 && info < 0x18)
+		adap->params.sf_nsec = 1 << (info - 16);
+	else if (info == 0x18)
+		adap->params.sf_nsec = 64;
+	else
+		return -EINVAL;
+	adap->params.sf_size = 1 << info;
+	adap->params.sf_fw_start =
+		t4_read_reg(adap, CIM_BOOT_CFG) & BOOTADDR_MASK;
+
+	if (adap->params.sf_size < FLASH_MIN_SIZE)
+		dev_warn(adap->pdev_dev, "WARNING!!! FLASH size %#x < %#x!!!\n",
+			 adap->params.sf_size, FLASH_MIN_SIZE);
+	return 0;
+}
+
+/**
+ *	t4_prep_adapter - prepare SW and HW for operation
+ *	@adapter: the adapter
+ *	@reset: if true perform a HW reset
+ *
+ *	Initialize adapter SW state for the various HW modules, set initial
+ *	values for some adapter tunables, take PHYs out of reset, and
+ *	initialize the MDIO interface.
+ */
+int t4_prep_adapter(struct adapter *adapter)
+{
+	int ret, ver;
+	uint16_t device_id;
+	u32 pl_rev;
+
+	get_pci_mode(adapter, &adapter->params.pci);
+	pl_rev = G_REV(t4_read_reg(adapter, PL_REV));
+
+	ret = get_flash_params(adapter);
+	if (ret < 0) {
+		dev_err(adapter->pdev_dev, "error %d identifying flash\n", ret);
+		return ret;
+	}
+
+	/* Retrieve adapter's device ID
+	 */
+	pci_read_config_word(adapter->pdev, PCI_DEVICE_ID, &device_id);
+	ver = device_id >> 12;
+	adapter->params.chip = 0;
+	switch (ver) {
+	case CHELSIO_T4:
+		adapter->params.chip |= CHELSIO_CHIP_CODE(CHELSIO_T4, pl_rev);
+		break;
+	case CHELSIO_T5:
+		adapter->params.chip |= CHELSIO_CHIP_CODE(CHELSIO_T5, pl_rev);
+		break;
+	default:
+		dev_err(adapter->pdev_dev, "Device %d is not supported\n",
+			device_id);
+		return -EINVAL;
+	}
+
+	init_cong_ctrl(adapter->params.a_wnd, adapter->params.b_wnd);
+
+	/*
+	 * Default port for debugging in case we can't reach FW.
+	 */
+	adapter->params.nports = 1;
+	adapter->params.portvec = 1;
+	adapter->params.vpd.cclk = 50000;
+	return 0;
+}
+
+/**
+ *      t4_init_tp_params - initialize adap->params.tp
+ *      @adap: the adapter
+ *
+ *      Initialize various fields of the adapter's TP Parameters structure.
+ */
+int t4_init_tp_params(struct adapter *adap)
+{
+	int chan;
+	u32 v;
+
+	v = t4_read_reg(adap, TP_TIMER_RESOLUTION);
+	adap->params.tp.tre = TIMERRESOLUTION_GET(v);
+	adap->params.tp.dack_re = DELAYEDACKRESOLUTION_GET(v);
+
+	/* MODQ_REQ_MAP defaults to setting queues 0-3 to chan 0-3 */
+	for (chan = 0; chan < NCHAN; chan++)
+		adap->params.tp.tx_modq[chan] = chan;
+
+	/* Cache the adapter's Compressed Filter Mode and global Incress
+	 * Configuration.
+	 */
+	t4_read_indirect(adap, TP_PIO_ADDR, TP_PIO_DATA,
+			 &adap->params.tp.vlan_pri_map, 1,
+			 TP_VLAN_PRI_MAP);
+	t4_read_indirect(adap, TP_PIO_ADDR, TP_PIO_DATA,
+			 &adap->params.tp.ingress_config, 1,
+			 TP_INGRESS_CONFIG);
+
+	/* Now that we have TP_VLAN_PRI_MAP cached, we can calculate the field
+	 * shift positions of several elements of the Compressed Filter Tuple
+	 * for this adapter which we need frequently ...
+	 */
+	adap->params.tp.vlan_shift = t4_filter_field_shift(adap, F_VLAN);
+	adap->params.tp.vnic_shift = t4_filter_field_shift(adap, F_VNIC_ID);
+	adap->params.tp.port_shift = t4_filter_field_shift(adap, F_PORT);
+	adap->params.tp.protocol_shift = t4_filter_field_shift(adap,
+							       F_PROTOCOL);
+
+	/* If TP_INGRESS_CONFIG.VNID == 0, then TP_VLAN_PRI_MAP.VNIC_ID
+	 * represents the presense of an Outer VLAN instead of a VNIC ID.
+	 */
+	if ((adap->params.tp.ingress_config & F_VNIC) == 0)
+		adap->params.tp.vnic_shift = -1;
+
+	return 0;
+}
+
+/**
+ *      t4_filter_field_shift - calculate filter field shift
+ *      @adap: the adapter
+ *      @filter_sel: the desired field (from TP_VLAN_PRI_MAP bits)
+ *
+ *      Return the shift position of a filter field within the Compressed
+ *      Filter Tuple.  The filter field is specified via its selection bit
+ *      within TP_VLAN_PRI_MAL (filter mode).  E.g. F_VLAN.
+ */
+int t4_filter_field_shift(const struct adapter *adap, int filter_sel)
+{
+	unsigned int filter_mode = adap->params.tp.vlan_pri_map;
+	unsigned int sel;
+	int field_shift;
+
+	if ((filter_mode & filter_sel) == 0)
+		return -1;
+
+	for (sel = 1, field_shift = 0; sel < filter_sel; sel <<= 1) {
+		switch (filter_mode & sel) {
+		case F_FCOE:
+			field_shift += W_FT_FCOE;
+			break;
+		case F_PORT:
+			field_shift += W_FT_PORT;
+			break;
+		case F_VNIC_ID:
+			field_shift += W_FT_VNIC_ID;
+			break;
+		case F_VLAN:
+			field_shift += W_FT_VLAN;
+			break;
+		case F_TOS:
+			field_shift += W_FT_TOS;
+			break;
+		case F_PROTOCOL:
+			field_shift += W_FT_PROTOCOL;
+			break;
+		case F_ETHERTYPE:
+			field_shift += W_FT_ETHERTYPE;
+			break;
+		case F_MACMATCH:
+			field_shift += W_FT_MACMATCH;
+			break;
+		case F_MPSHITTYPE:
+			field_shift += W_FT_MPSHITTYPE;
+			break;
+		case F_FRAGMENTATION:
+			field_shift += W_FT_FRAGMENTATION;
+			break;
+		}
+	}
+	return field_shift;
+}
+
+int t4_port_init(struct adapter *adap, int mbox, int pf, int vf)
+{
+	u8 addr[6];
+	int ret, i, j = 0;
+	struct fw_port_cmd c;
+	struct fw_rss_vi_config_cmd rvc;
+
+	memset(&c, 0, sizeof(c));
+	memset(&rvc, 0, sizeof(rvc));
+
+	for_each_port(adap, i) {
+		unsigned int rss_size;
+		struct port_info *p = adap2pinfo(adap, i);
+
+		while ((adap->params.portvec & (1 << j)) == 0)
+			j++;
+
+		c.op_to_portid = htonl(FW_CMD_OP(FW_PORT_CMD) |
+				       FW_CMD_REQUEST | FW_CMD_READ |
+				       FW_PORT_CMD_PORTID(j));
+		c.action_to_len16 = htonl(
+			FW_PORT_CMD_ACTION(FW_PORT_ACTION_GET_PORT_INFO) |
+			FW_LEN16(c));
+		ret = t4_wr_mbox(adap, mbox, &c, sizeof(c), &c);
+		if (ret)
+			return ret;
+
+		ret = t4_alloc_vi(adap, mbox, j, pf, vf, 1, addr, &rss_size);
+		if (ret < 0)
+			return ret;
+
+		p->viid = ret;
+		p->tx_chan = j;
+		p->lport = j;
+		p->rss_size = rss_size;
+		memcpy(adap->port[i]->dev_addr, addr, ETH_ALEN);
+		adap->port[i]->dev_port = j;
+
+		ret = ntohl(c.u.info.lstatus_to_modtype);
+		p->mdio_addr = (ret & FW_PORT_CMD_MDIOCAP) ?
+			FW_PORT_CMD_MDIOADDR_GET(ret) : -1;
+		p->port_type = FW_PORT_CMD_PTYPE_GET(ret);
+		p->mod_type = FW_PORT_MOD_TYPE_NA;
+
+		rvc.op_to_viid = htonl(FW_CMD_OP(FW_RSS_VI_CONFIG_CMD) |
+				       FW_CMD_REQUEST | FW_CMD_READ |
+				       FW_RSS_VI_CONFIG_CMD_VIID(p->viid));
+		rvc.retval_len16 = htonl(FW_LEN16(rvc));
+		ret = t4_wr_mbox(adap, mbox, &rvc, sizeof(rvc), &rvc);
+		if (ret)
+			return ret;
+		p->rss_mode = ntohl(rvc.u.basicvirtual.defaultq_to_udpen);
+
+		init_link_config(&p->link_cfg, ntohs(c.u.info.pcap));
+		j++;
+	}
+	return 0;
+}
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.h b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.h
new file mode 100644
index 0000000..c19a90e
--- /dev/null
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.h
@@ -0,0 +1,227 @@
+/*
+ * This file is part of the Chelsio T4 Ethernet driver for Linux.
+ *
+ * Copyright (c) 2003-2014 Chelsio Communications, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __T4_HW_H
+#define __T4_HW_H
+
+#include <linux/types.h>
+
+enum {
+	NCHAN          = 4,     /* # of HW channels */
+	MAX_MTU        = 9600,  /* max MAC MTU, excluding header + FCS */
+	EEPROMSIZE     = 17408, /* Serial EEPROM physical size */
+	EEPROMVSIZE    = 32768, /* Serial EEPROM virtual address space size */
+	EEPROMPFSIZE   = 1024,  /* EEPROM writable area size for PFn, n>0 */
+	RSS_NENTRIES   = 2048,  /* # of entries in RSS mapping table */
+	TCB_SIZE       = 128,   /* TCB size */
+	NMTUS          = 16,    /* size of MTU table */
+	NCCTRL_WIN     = 32,    /* # of congestion control windows */
+	L2T_SIZE       = 4096,  /* # of L2T entries */
+	MBOX_LEN       = 64,    /* mailbox size in bytes */
+	TRACE_LEN      = 112,   /* length of trace data and mask */
+	FILTER_OPT_LEN = 36,    /* filter tuple width for optional components */
+	NWOL_PAT       = 8,     /* # of WoL patterns */
+	WOL_PAT_LEN    = 128,   /* length of WoL patterns */
+};
+
+enum {
+	SF_PAGE_SIZE = 256,           /* serial flash page size */
+	SF_SEC_SIZE = 64 * 1024,      /* serial flash sector size */
+};
+
+enum { RSP_TYPE_FLBUF, RSP_TYPE_CPL, RSP_TYPE_INTR }; /* response entry types */
+
+enum { MBOX_OWNER_NONE, MBOX_OWNER_FW, MBOX_OWNER_DRV };    /* mailbox owners */
+
+enum {
+	SGE_MAX_WR_LEN = 512,     /* max WR size in bytes */
+	SGE_NTIMERS = 6,          /* # of interrupt holdoff timer values */
+	SGE_NCOUNTERS = 4,        /* # of interrupt packet counter values */
+	SGE_MAX_IQ_SIZE = 65520,
+
+	SGE_TIMER_RSTRT_CNTR = 6, /* restart RX packet threshold counter */
+	SGE_TIMER_UPD_CIDX = 7,   /* update cidx only */
+
+	SGE_EQ_IDXSIZE = 64,      /* egress queue pidx/cidx unit size */
+
+	SGE_INTRDST_PCI = 0,      /* interrupt destination is PCI-E */
+	SGE_INTRDST_IQ = 1,       /*   destination is an ingress queue */
+
+	SGE_UPDATEDEL_NONE = 0,   /* ingress queue pidx update delivery */
+	SGE_UPDATEDEL_INTR = 1,   /*   interrupt */
+	SGE_UPDATEDEL_STPG = 2,   /*   status page */
+	SGE_UPDATEDEL_BOTH = 3,   /*   interrupt and status page */
+
+	SGE_HOSTFCMODE_NONE = 0,  /* egress queue cidx updates */
+	SGE_HOSTFCMODE_IQ = 1,    /*   sent to ingress queue */
+	SGE_HOSTFCMODE_STPG = 2,  /*   sent to status page */
+	SGE_HOSTFCMODE_BOTH = 3,  /*   ingress queue and status page */
+
+	SGE_FETCHBURSTMIN_16B = 0,/* egress queue descriptor fetch minimum */
+	SGE_FETCHBURSTMIN_32B = 1,
+	SGE_FETCHBURSTMIN_64B = 2,
+	SGE_FETCHBURSTMIN_128B = 3,
+
+	SGE_FETCHBURSTMAX_64B = 0,/* egress queue descriptor fetch maximum */
+	SGE_FETCHBURSTMAX_128B = 1,
+	SGE_FETCHBURSTMAX_256B = 2,
+	SGE_FETCHBURSTMAX_512B = 3,
+
+	SGE_CIDXFLUSHTHRESH_1 = 0,/* egress queue cidx flush threshold */
+	SGE_CIDXFLUSHTHRESH_2 = 1,
+	SGE_CIDXFLUSHTHRESH_4 = 2,
+	SGE_CIDXFLUSHTHRESH_8 = 3,
+	SGE_CIDXFLUSHTHRESH_16 = 4,
+	SGE_CIDXFLUSHTHRESH_32 = 5,
+	SGE_CIDXFLUSHTHRESH_64 = 6,
+	SGE_CIDXFLUSHTHRESH_128 = 7,
+
+	SGE_INGPADBOUNDARY_SHIFT = 5,/* ingress queue pad boundary */
+};
+
+struct sge_qstat {                /* data written to SGE queue status entries */
+	__be32 qid;
+	__be16 cidx;
+	__be16 pidx;
+};
+
+/*
+ * Structure for last 128 bits of response descriptors
+ */
+struct rsp_ctrl {
+	__be32 hdrbuflen_pidx;
+	__be32 pldbuflen_qid;
+	union {
+		u8 type_gen;
+		__be64 last_flit;
+	};
+};
+
+#define RSPD_NEWBUF 0x80000000U
+#define RSPD_LEN(x) (((x) >> 0) & 0x7fffffffU)
+#define RSPD_QID(x) RSPD_LEN(x)
+
+#define RSPD_GEN(x)  ((x) >> 7)
+#define RSPD_TYPE(x) (((x) >> 4) & 3)
+
+#define V_QINTR_CNT_EN	   0x0
+#define QINTR_CNT_EN       0x1
+#define QINTR_TIMER_IDX(x) ((x) << 1)
+#define QINTR_TIMER_IDX_GET(x) (((x) >> 1) & 0x7)
+
+/*
+ * Flash layout.
+ */
+#define FLASH_START(start)	((start) * SF_SEC_SIZE)
+#define FLASH_MAX_SIZE(nsecs)	((nsecs) * SF_SEC_SIZE)
+
+enum {
+	/*
+	 * Various Expansion-ROM boot images, etc.
+	 */
+	FLASH_EXP_ROM_START_SEC = 0,
+	FLASH_EXP_ROM_NSECS = 6,
+	FLASH_EXP_ROM_START = FLASH_START(FLASH_EXP_ROM_START_SEC),
+	FLASH_EXP_ROM_MAX_SIZE = FLASH_MAX_SIZE(FLASH_EXP_ROM_NSECS),
+
+	/*
+	 * iSCSI Boot Firmware Table (iBFT) and other driver-related
+	 * parameters ...
+	 */
+	FLASH_IBFT_START_SEC = 6,
+	FLASH_IBFT_NSECS = 1,
+	FLASH_IBFT_START = FLASH_START(FLASH_IBFT_START_SEC),
+	FLASH_IBFT_MAX_SIZE = FLASH_MAX_SIZE(FLASH_IBFT_NSECS),
+
+	/*
+	 * Boot configuration data.
+	 */
+	FLASH_BOOTCFG_START_SEC = 7,
+	FLASH_BOOTCFG_NSECS = 1,
+	FLASH_BOOTCFG_START = FLASH_START(FLASH_BOOTCFG_START_SEC),
+	FLASH_BOOTCFG_MAX_SIZE = FLASH_MAX_SIZE(FLASH_BOOTCFG_NSECS),
+
+	/*
+	 * Location of firmware image in FLASH.
+	 */
+	FLASH_FW_START_SEC = 8,
+	FLASH_FW_NSECS = 16,
+	FLASH_FW_START = FLASH_START(FLASH_FW_START_SEC),
+	FLASH_FW_MAX_SIZE = FLASH_MAX_SIZE(FLASH_FW_NSECS),
+
+	/*
+	 * iSCSI persistent/crash information.
+	 */
+	FLASH_ISCSI_CRASH_START_SEC = 29,
+	FLASH_ISCSI_CRASH_NSECS = 1,
+	FLASH_ISCSI_CRASH_START = FLASH_START(FLASH_ISCSI_CRASH_START_SEC),
+	FLASH_ISCSI_CRASH_MAX_SIZE = FLASH_MAX_SIZE(FLASH_ISCSI_CRASH_NSECS),
+
+	/*
+	 * FCoE persistent/crash information.
+	 */
+	FLASH_FCOE_CRASH_START_SEC = 30,
+	FLASH_FCOE_CRASH_NSECS = 1,
+	FLASH_FCOE_CRASH_START = FLASH_START(FLASH_FCOE_CRASH_START_SEC),
+	FLASH_FCOE_CRASH_MAX_SIZE = FLASH_MAX_SIZE(FLASH_FCOE_CRASH_NSECS),
+
+	/*
+	 * Location of Firmware Configuration File in FLASH.  Since the FPGA
+	 * "FLASH" is smaller we need to store the Configuration File in a
+	 * different location -- which will overlap the end of the firmware
+	 * image if firmware ever gets that large ...
+	 */
+	FLASH_CFG_START_SEC = 31,
+	FLASH_CFG_NSECS = 1,
+	FLASH_CFG_START = FLASH_START(FLASH_CFG_START_SEC),
+	FLASH_CFG_MAX_SIZE = FLASH_MAX_SIZE(FLASH_CFG_NSECS),
+
+	/* We don't support FLASH devices which can't support the full
+	 * standard set of sections which we need for normal
+	 * operations.
+	 */
+	FLASH_MIN_SIZE = FLASH_CFG_START + FLASH_CFG_MAX_SIZE,
+
+	FLASH_FPGA_CFG_START_SEC = 15,
+	FLASH_FPGA_CFG_START = FLASH_START(FLASH_FPGA_CFG_START_SEC),
+
+	/*
+	 * Sectors 32-63 are reserved for FLASH failover.
+	 */
+};
+
+#undef FLASH_START
+#undef FLASH_MAX_SIZE
+
+#endif /* __T4_HW_H */
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_msg.h b/drivers/net/ethernet/chelsio/cxgb4/t4_msg.h
new file mode 100644
index 0000000..5f4db23
--- /dev/null
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4_msg.h
@@ -0,0 +1,841 @@
+/*
+ * This file is part of the Chelsio T4 Ethernet driver for Linux.
+ *
+ * Copyright (c) 2003-2014 Chelsio Communications, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __T4_MSG_H
+#define __T4_MSG_H
+
+#include <linux/types.h>
+
+enum {
+	CPL_PASS_OPEN_REQ     = 0x1,
+	CPL_PASS_ACCEPT_RPL   = 0x2,
+	CPL_ACT_OPEN_REQ      = 0x3,
+	CPL_SET_TCB_FIELD     = 0x5,
+	CPL_GET_TCB           = 0x6,
+	CPL_CLOSE_CON_REQ     = 0x8,
+	CPL_CLOSE_LISTSRV_REQ = 0x9,
+	CPL_ABORT_REQ         = 0xA,
+	CPL_ABORT_RPL         = 0xB,
+	CPL_RX_DATA_ACK       = 0xD,
+	CPL_TX_PKT            = 0xE,
+	CPL_L2T_WRITE_REQ     = 0x12,
+	CPL_TID_RELEASE       = 0x1A,
+
+	CPL_CLOSE_LISTSRV_RPL = 0x20,
+	CPL_L2T_WRITE_RPL     = 0x23,
+	CPL_PASS_OPEN_RPL     = 0x24,
+	CPL_ACT_OPEN_RPL      = 0x25,
+	CPL_PEER_CLOSE        = 0x26,
+	CPL_ABORT_REQ_RSS     = 0x2B,
+	CPL_ABORT_RPL_RSS     = 0x2D,
+
+	CPL_CLOSE_CON_RPL     = 0x32,
+	CPL_ISCSI_HDR         = 0x33,
+	CPL_RDMA_CQE          = 0x35,
+	CPL_RDMA_CQE_READ_RSP = 0x36,
+	CPL_RDMA_CQE_ERR      = 0x37,
+	CPL_RX_DATA           = 0x39,
+	CPL_SET_TCB_RPL       = 0x3A,
+	CPL_RX_PKT            = 0x3B,
+	CPL_RX_DDP_COMPLETE   = 0x3F,
+
+	CPL_ACT_ESTABLISH     = 0x40,
+	CPL_PASS_ESTABLISH    = 0x41,
+	CPL_RX_DATA_DDP       = 0x42,
+	CPL_PASS_ACCEPT_REQ   = 0x44,
+	CPL_TRACE_PKT_T5      = 0x48,
+	CPL_RX_ISCSI_DDP      = 0x49,
+
+	CPL_RDMA_READ_REQ     = 0x60,
+
+	CPL_PASS_OPEN_REQ6    = 0x81,
+	CPL_ACT_OPEN_REQ6     = 0x83,
+
+	CPL_RDMA_TERMINATE    = 0xA2,
+	CPL_RDMA_WRITE        = 0xA4,
+	CPL_SGE_EGR_UPDATE    = 0xA5,
+
+	CPL_TRACE_PKT         = 0xB0,
+	CPL_ISCSI_DATA	      = 0xB2,
+
+	CPL_FW4_MSG           = 0xC0,
+	CPL_FW4_PLD           = 0xC1,
+	CPL_FW4_ACK           = 0xC3,
+
+	CPL_FW6_MSG           = 0xE0,
+	CPL_FW6_PLD           = 0xE1,
+	CPL_TX_PKT_LSO        = 0xED,
+	CPL_TX_PKT_XT         = 0xEE,
+
+	NUM_CPL_CMDS
+};
+
+enum CPL_error {
+	CPL_ERR_NONE               = 0,
+	CPL_ERR_TCAM_FULL          = 3,
+	CPL_ERR_BAD_LENGTH         = 15,
+	CPL_ERR_BAD_ROUTE          = 18,
+	CPL_ERR_CONN_RESET         = 20,
+	CPL_ERR_CONN_EXIST_SYNRECV = 21,
+	CPL_ERR_CONN_EXIST         = 22,
+	CPL_ERR_ARP_MISS           = 23,
+	CPL_ERR_BAD_SYN            = 24,
+	CPL_ERR_CONN_TIMEDOUT      = 30,
+	CPL_ERR_XMIT_TIMEDOUT      = 31,
+	CPL_ERR_PERSIST_TIMEDOUT   = 32,
+	CPL_ERR_FINWAIT2_TIMEDOUT  = 33,
+	CPL_ERR_KEEPALIVE_TIMEDOUT = 34,
+	CPL_ERR_RTX_NEG_ADVICE     = 35,
+	CPL_ERR_PERSIST_NEG_ADVICE = 36,
+	CPL_ERR_KEEPALV_NEG_ADVICE = 37,
+	CPL_ERR_ABORT_FAILED       = 42,
+	CPL_ERR_IWARP_FLM          = 50,
+};
+
+enum {
+	ULP_MODE_NONE          = 0,
+	ULP_MODE_ISCSI         = 2,
+	ULP_MODE_RDMA          = 4,
+	ULP_MODE_TCPDDP	       = 5,
+	ULP_MODE_FCOE          = 6,
+};
+
+enum {
+	ULP_CRC_HEADER = 1 << 0,
+	ULP_CRC_DATA   = 1 << 1
+};
+
+enum {
+	CPL_ABORT_SEND_RST = 0,
+	CPL_ABORT_NO_RST,
+};
+
+enum {                     /* TX_PKT_XT checksum types */
+	TX_CSUM_TCP    = 0,
+	TX_CSUM_UDP    = 1,
+	TX_CSUM_CRC16  = 4,
+	TX_CSUM_CRC32  = 5,
+	TX_CSUM_CRC32C = 6,
+	TX_CSUM_FCOE   = 7,
+	TX_CSUM_TCPIP  = 8,
+	TX_CSUM_UDPIP  = 9,
+	TX_CSUM_TCPIP6 = 10,
+	TX_CSUM_UDPIP6 = 11,
+	TX_CSUM_IP     = 12,
+};
+
+union opcode_tid {
+	__be32 opcode_tid;
+	u8 opcode;
+};
+
+#define CPL_OPCODE(x) ((x) << 24)
+#define G_CPL_OPCODE(x) (((x) >> 24) & 0xFF)
+#define MK_OPCODE_TID(opcode, tid) (CPL_OPCODE(opcode) | (tid))
+#define OPCODE_TID(cmd) ((cmd)->ot.opcode_tid)
+#define GET_TID(cmd) (ntohl(OPCODE_TID(cmd)) & 0xFFFFFF)
+
+/* partitioning of TID fields that also carry a queue id */
+#define GET_TID_TID(x) ((x) & 0x3fff)
+#define GET_TID_QID(x) (((x) >> 14) & 0x3ff)
+#define TID_QID(x)     ((x) << 14)
+
+struct rss_header {
+	u8 opcode;
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	u8 channel:2;
+	u8 filter_hit:1;
+	u8 filter_tid:1;
+	u8 hash_type:2;
+	u8 ipv6:1;
+	u8 send2fw:1;
+#else
+	u8 send2fw:1;
+	u8 ipv6:1;
+	u8 hash_type:2;
+	u8 filter_tid:1;
+	u8 filter_hit:1;
+	u8 channel:2;
+#endif
+	__be16 qid;
+	__be32 hash_val;
+};
+
+struct work_request_hdr {
+	__be32 wr_hi;
+	__be32 wr_mid;
+	__be64 wr_lo;
+};
+
+/* wr_hi fields */
+#define S_WR_OP    24
+#define V_WR_OP(x) ((__u64)(x) << S_WR_OP)
+
+#define WR_HDR struct work_request_hdr wr
+
+/* option 0 fields */
+#define S_MSS_IDX    60
+#define M_MSS_IDX    0xF
+#define V_MSS_IDX(x) ((__u64)(x) << S_MSS_IDX)
+#define G_MSS_IDX(x) (((x) >> S_MSS_IDX) & M_MSS_IDX)
+
+/* option 2 fields */
+#define S_RSS_QUEUE    0
+#define M_RSS_QUEUE    0x3FF
+#define V_RSS_QUEUE(x) ((x) << S_RSS_QUEUE)
+#define G_RSS_QUEUE(x) (((x) >> S_RSS_QUEUE) & M_RSS_QUEUE)
+
+struct cpl_pass_open_req {
+	WR_HDR;
+	union opcode_tid ot;
+	__be16 local_port;
+	__be16 peer_port;
+	__be32 local_ip;
+	__be32 peer_ip;
+	__be64 opt0;
+#define TX_CHAN(x)    ((x) << 2)
+#define NO_CONG(x)    ((x) << 4)
+#define DELACK(x)     ((x) << 5)
+#define ULP_MODE(x)   ((x) << 8)
+#define RCV_BUFSIZ(x) ((x) << 12)
+#define RCV_BUFSIZ_MASK 0x3FFU
+#define DSCP(x)       ((x) << 22)
+#define SMAC_SEL(x)   ((u64)(x) << 28)
+#define L2T_IDX(x)    ((u64)(x) << 36)
+#define TCAM_BYPASS(x) ((u64)(x) << 48)
+#define NAGLE(x)      ((u64)(x) << 49)
+#define WND_SCALE(x)  ((u64)(x) << 50)
+#define KEEP_ALIVE(x) ((u64)(x) << 54)
+#define MSS_IDX(x)    ((u64)(x) << 60)
+	__be64 opt1;
+#define SYN_RSS_ENABLE   (1 << 0)
+#define SYN_RSS_QUEUE(x) ((x) << 2)
+#define CONN_POLICY_ASK  (1 << 22)
+};
+
+struct cpl_pass_open_req6 {
+	WR_HDR;
+	union opcode_tid ot;
+	__be16 local_port;
+	__be16 peer_port;
+	__be64 local_ip_hi;
+	__be64 local_ip_lo;
+	__be64 peer_ip_hi;
+	__be64 peer_ip_lo;
+	__be64 opt0;
+	__be64 opt1;
+};
+
+struct cpl_pass_open_rpl {
+	union opcode_tid ot;
+	u8 rsvd[3];
+	u8 status;
+};
+
+struct cpl_pass_accept_rpl {
+	WR_HDR;
+	union opcode_tid ot;
+	__be32 opt2;
+#define RSS_QUEUE(x)         ((x) << 0)
+#define RSS_QUEUE_VALID      (1 << 10)
+#define RX_COALESCE_VALID(x) ((x) << 11)
+#define RX_COALESCE(x)       ((x) << 12)
+#define PACE(x)	      ((x) << 16)
+#define RX_FC_VALID	     ((1U) << 19)
+#define RX_FC_DISABLE	     ((1U) << 20)
+#define TX_QUEUE(x)          ((x) << 23)
+#define RX_CHANNEL(x)        ((x) << 26)
+#define CCTRL_ECN(x)         ((x) << 27)
+#define WND_SCALE_EN(x)      ((x) << 28)
+#define TSTAMPS_EN(x)        ((x) << 29)
+#define SACK_EN(x)           ((x) << 30)
+#define T5_OPT_2_VALID	     ((1U) << 31)
+	__be64 opt0;
+};
+
+struct cpl_t5_pass_accept_rpl {
+	WR_HDR;
+	union opcode_tid ot;
+	__be32 opt2;
+	__be64 opt0;
+	__be32 iss;
+	__be32 rsvd;
+};
+
+struct cpl_act_open_req {
+	WR_HDR;
+	union opcode_tid ot;
+	__be16 local_port;
+	__be16 peer_port;
+	__be32 local_ip;
+	__be32 peer_ip;
+	__be64 opt0;
+	__be32 params;
+	__be32 opt2;
+};
+
+#define S_FILTER_TUPLE  24
+#define M_FILTER_TUPLE  0xFFFFFFFFFF
+#define V_FILTER_TUPLE(x) ((x) << S_FILTER_TUPLE)
+#define G_FILTER_TUPLE(x) (((x) >> S_FILTER_TUPLE) & M_FILTER_TUPLE)
+struct cpl_t5_act_open_req {
+	WR_HDR;
+	union opcode_tid ot;
+	__be16 local_port;
+	__be16 peer_port;
+	__be32 local_ip;
+	__be32 peer_ip;
+	__be64 opt0;
+	__be32 rsvd;
+	__be32 opt2;
+	__be64 params;
+};
+
+struct cpl_act_open_req6 {
+	WR_HDR;
+	union opcode_tid ot;
+	__be16 local_port;
+	__be16 peer_port;
+	__be64 local_ip_hi;
+	__be64 local_ip_lo;
+	__be64 peer_ip_hi;
+	__be64 peer_ip_lo;
+	__be64 opt0;
+	__be32 params;
+	__be32 opt2;
+};
+
+struct cpl_t5_act_open_req6 {
+	WR_HDR;
+	union opcode_tid ot;
+	__be16 local_port;
+	__be16 peer_port;
+	__be64 local_ip_hi;
+	__be64 local_ip_lo;
+	__be64 peer_ip_hi;
+	__be64 peer_ip_lo;
+	__be64 opt0;
+	__be32 rsvd;
+	__be32 opt2;
+	__be64 params;
+};
+
+struct cpl_act_open_rpl {
+	union opcode_tid ot;
+	__be32 atid_status;
+#define GET_AOPEN_STATUS(x) ((x) & 0xff)
+#define GET_AOPEN_ATID(x)   (((x) >> 8) & 0xffffff)
+};
+
+struct cpl_pass_establish {
+	union opcode_tid ot;
+	__be32 rsvd;
+	__be32 tos_stid;
+#define PASS_OPEN_TID(x) ((x) << 0)
+#define PASS_OPEN_TOS(x) ((x) << 24)
+#define GET_PASS_OPEN_TID(x)	(((x) >> 0) & 0xFFFFFF)
+#define GET_POPEN_TID(x) ((x) & 0xffffff)
+#define GET_POPEN_TOS(x) (((x) >> 24) & 0xff)
+	__be16 mac_idx;
+	__be16 tcp_opt;
+#define GET_TCPOPT_WSCALE_OK(x)  (((x) >> 5) & 1)
+#define GET_TCPOPT_SACK(x)       (((x) >> 6) & 1)
+#define GET_TCPOPT_TSTAMP(x)     (((x) >> 7) & 1)
+#define GET_TCPOPT_SND_WSCALE(x) (((x) >> 8) & 0xf)
+#define GET_TCPOPT_MSS(x)        (((x) >> 12) & 0xf)
+	__be32 snd_isn;
+	__be32 rcv_isn;
+};
+
+struct cpl_act_establish {
+	union opcode_tid ot;
+	__be32 rsvd;
+	__be32 tos_atid;
+	__be16 mac_idx;
+	__be16 tcp_opt;
+	__be32 snd_isn;
+	__be32 rcv_isn;
+};
+
+struct cpl_get_tcb {
+	WR_HDR;
+	union opcode_tid ot;
+	__be16 reply_ctrl;
+#define QUEUENO(x)    ((x) << 0)
+#define REPLY_CHAN(x) ((x) << 14)
+#define NO_REPLY(x)   ((x) << 15)
+	__be16 cookie;
+};
+
+struct cpl_set_tcb_field {
+	WR_HDR;
+	union opcode_tid ot;
+	__be16 reply_ctrl;
+	__be16 word_cookie;
+#define TCB_WORD(x)   ((x) << 0)
+#define TCB_COOKIE(x) ((x) << 5)
+#define GET_TCB_COOKIE(x) (((x) >> 5) & 7)
+	__be64 mask;
+	__be64 val;
+};
+
+struct cpl_set_tcb_rpl {
+	union opcode_tid ot;
+	__be16 rsvd;
+	u8 cookie;
+	u8 status;
+	__be64 oldval;
+};
+
+struct cpl_close_con_req {
+	WR_HDR;
+	union opcode_tid ot;
+	__be32 rsvd;
+};
+
+struct cpl_close_con_rpl {
+	union opcode_tid ot;
+	u8 rsvd[3];
+	u8 status;
+	__be32 snd_nxt;
+	__be32 rcv_nxt;
+};
+
+struct cpl_close_listsvr_req {
+	WR_HDR;
+	union opcode_tid ot;
+	__be16 reply_ctrl;
+#define LISTSVR_IPV6(x) ((x) << 14)
+	__be16 rsvd;
+};
+
+struct cpl_close_listsvr_rpl {
+	union opcode_tid ot;
+	u8 rsvd[3];
+	u8 status;
+};
+
+struct cpl_abort_req_rss {
+	union opcode_tid ot;
+	u8 rsvd[3];
+	u8 status;
+};
+
+struct cpl_abort_req {
+	WR_HDR;
+	union opcode_tid ot;
+	__be32 rsvd0;
+	u8 rsvd1;
+	u8 cmd;
+	u8 rsvd2[6];
+};
+
+struct cpl_abort_rpl_rss {
+	union opcode_tid ot;
+	u8 rsvd[3];
+	u8 status;
+};
+
+struct cpl_abort_rpl {
+	WR_HDR;
+	union opcode_tid ot;
+	__be32 rsvd0;
+	u8 rsvd1;
+	u8 cmd;
+	u8 rsvd2[6];
+};
+
+struct cpl_peer_close {
+	union opcode_tid ot;
+	__be32 rcv_nxt;
+};
+
+struct cpl_tid_release {
+	WR_HDR;
+	union opcode_tid ot;
+	__be32 rsvd;
+};
+
+struct cpl_tx_pkt_core {
+	__be32 ctrl0;
+#define TXPKT_VF(x)        ((x) << 0)
+#define TXPKT_PF(x)        ((x) << 8)
+#define TXPKT_VF_VLD       (1 << 11)
+#define TXPKT_OVLAN_IDX(x) ((x) << 12)
+#define TXPKT_INTF(x)      ((x) << 16)
+#define TXPKT_INS_OVLAN    (1 << 21)
+#define TXPKT_OPCODE(x)    ((x) << 24)
+	__be16 pack;
+	__be16 len;
+	__be64 ctrl1;
+#define TXPKT_CSUM_END(x)   ((x) << 12)
+#define TXPKT_CSUM_START(x) ((x) << 20)
+#define TXPKT_IPHDR_LEN(x)  ((u64)(x) << 20)
+#define TXPKT_CSUM_LOC(x)   ((u64)(x) << 30)
+#define TXPKT_ETHHDR_LEN(x) ((u64)(x) << 34)
+#define TXPKT_CSUM_TYPE(x)  ((u64)(x) << 40)
+#define TXPKT_VLAN(x)       ((u64)(x) << 44)
+#define TXPKT_VLAN_VLD      (1ULL << 60)
+#define TXPKT_IPCSUM_DIS    (1ULL << 62)
+#define TXPKT_L4CSUM_DIS    (1ULL << 63)
+};
+
+struct cpl_tx_pkt {
+	WR_HDR;
+	struct cpl_tx_pkt_core c;
+};
+
+#define cpl_tx_pkt_xt cpl_tx_pkt
+
+struct cpl_tx_pkt_lso_core {
+	__be32 lso_ctrl;
+#define LSO_TCPHDR_LEN(x) ((x) << 0)
+#define LSO_IPHDR_LEN(x)  ((x) << 4)
+#define LSO_ETHHDR_LEN(x) ((x) << 16)
+#define LSO_IPV6(x)       ((x) << 20)
+#define LSO_LAST_SLICE    (1 << 22)
+#define LSO_FIRST_SLICE   (1 << 23)
+#define LSO_OPCODE(x)     ((x) << 24)
+#define LSO_T5_XFER_SIZE(x) ((x) << 0)
+	__be16 ipid_ofst;
+	__be16 mss;
+	__be32 seqno_offset;
+	__be32 len;
+	/* encapsulated CPL (TX_PKT, TX_PKT_XT or TX_DATA) follows here */
+};
+
+struct cpl_tx_pkt_lso {
+	WR_HDR;
+	struct cpl_tx_pkt_lso_core c;
+	/* encapsulated CPL (TX_PKT, TX_PKT_XT or TX_DATA) follows here */
+};
+
+struct cpl_iscsi_hdr {
+	union opcode_tid ot;
+	__be16 pdu_len_ddp;
+#define ISCSI_PDU_LEN(x) ((x) & 0x7FFF)
+#define ISCSI_DDP        (1 << 15)
+	__be16 len;
+	__be32 seq;
+	__be16 urg;
+	u8 rsvd;
+	u8 status;
+};
+
+struct cpl_rx_data {
+	union opcode_tid ot;
+	__be16 rsvd;
+	__be16 len;
+	__be32 seq;
+	__be16 urg;
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	u8 dack_mode:2;
+	u8 psh:1;
+	u8 heartbeat:1;
+	u8 ddp_off:1;
+	u8 :3;
+#else
+	u8 :3;
+	u8 ddp_off:1;
+	u8 heartbeat:1;
+	u8 psh:1;
+	u8 dack_mode:2;
+#endif
+	u8 status;
+};
+
+struct cpl_rx_data_ack {
+	WR_HDR;
+	union opcode_tid ot;
+	__be32 credit_dack;
+#define RX_CREDITS(x)   ((x) << 0)
+#define RX_FORCE_ACK(x) ((x) << 28)
+};
+
+struct cpl_rx_pkt {
+	struct rss_header rsshdr;
+	u8 opcode;
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	u8 iff:4;
+	u8 csum_calc:1;
+	u8 ipmi_pkt:1;
+	u8 vlan_ex:1;
+	u8 ip_frag:1;
+#else
+	u8 ip_frag:1;
+	u8 vlan_ex:1;
+	u8 ipmi_pkt:1;
+	u8 csum_calc:1;
+	u8 iff:4;
+#endif
+	__be16 csum;
+	__be16 vlan;
+	__be16 len;
+	__be32 l2info;
+#define RXF_UDP (1 << 22)
+#define RXF_TCP (1 << 23)
+#define RXF_IP  (1 << 24)
+#define RXF_IP6 (1 << 25)
+	__be16 hdr_len;
+	__be16 err_vec;
+};
+
+/* rx_pkt.l2info fields */
+#define S_RX_ETHHDR_LEN    0
+#define M_RX_ETHHDR_LEN    0x1F
+#define V_RX_ETHHDR_LEN(x) ((x) << S_RX_ETHHDR_LEN)
+#define G_RX_ETHHDR_LEN(x) (((x) >> S_RX_ETHHDR_LEN) & M_RX_ETHHDR_LEN)
+
+#define S_RX_T5_ETHHDR_LEN    0
+#define M_RX_T5_ETHHDR_LEN    0x3F
+#define V_RX_T5_ETHHDR_LEN(x) ((x) << S_RX_T5_ETHHDR_LEN)
+#define G_RX_T5_ETHHDR_LEN(x) (((x) >> S_RX_T5_ETHHDR_LEN) & M_RX_T5_ETHHDR_LEN)
+
+#define S_RX_MACIDX    8
+#define M_RX_MACIDX    0x1FF
+#define V_RX_MACIDX(x) ((x) << S_RX_MACIDX)
+#define G_RX_MACIDX(x) (((x) >> S_RX_MACIDX) & M_RX_MACIDX)
+
+#define S_RXF_SYN    21
+#define V_RXF_SYN(x) ((x) << S_RXF_SYN)
+#define F_RXF_SYN    V_RXF_SYN(1U)
+
+#define S_RX_CHAN    28
+#define M_RX_CHAN    0xF
+#define V_RX_CHAN(x) ((x) << S_RX_CHAN)
+#define G_RX_CHAN(x) (((x) >> S_RX_CHAN) & M_RX_CHAN)
+
+/* rx_pkt.hdr_len fields */
+#define S_RX_TCPHDR_LEN    0
+#define M_RX_TCPHDR_LEN    0x3F
+#define V_RX_TCPHDR_LEN(x) ((x) << S_RX_TCPHDR_LEN)
+#define G_RX_TCPHDR_LEN(x) (((x) >> S_RX_TCPHDR_LEN) & M_RX_TCPHDR_LEN)
+
+#define S_RX_IPHDR_LEN    6
+#define M_RX_IPHDR_LEN    0x3FF
+#define V_RX_IPHDR_LEN(x) ((x) << S_RX_IPHDR_LEN)
+#define G_RX_IPHDR_LEN(x) (((x) >> S_RX_IPHDR_LEN) & M_RX_IPHDR_LEN)
+
+struct cpl_trace_pkt {
+	u8 opcode;
+	u8 intf;
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	u8 runt:4;
+	u8 filter_hit:4;
+	u8 :6;
+	u8 err:1;
+	u8 trunc:1;
+#else
+	u8 filter_hit:4;
+	u8 runt:4;
+	u8 trunc:1;
+	u8 err:1;
+	u8 :6;
+#endif
+	__be16 rsvd;
+	__be16 len;
+	__be64 tstamp;
+};
+
+struct cpl_t5_trace_pkt {
+	__u8 opcode;
+	__u8 intf;
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	__u8 runt:4;
+	__u8 filter_hit:4;
+	__u8:6;
+	__u8 err:1;
+	__u8 trunc:1;
+#else
+	__u8 filter_hit:4;
+	__u8 runt:4;
+	__u8 trunc:1;
+	__u8 err:1;
+	__u8:6;
+#endif
+	__be16 rsvd;
+	__be16 len;
+	__be64 tstamp;
+	__be64 rsvd1;
+};
+
+struct cpl_l2t_write_req {
+	WR_HDR;
+	union opcode_tid ot;
+	__be16 params;
+#define L2T_W_INFO(x)    ((x) << 2)
+#define L2T_W_PORT(x)    ((x) << 8)
+#define L2T_W_NOREPLY(x) ((x) << 15)
+	__be16 l2t_idx;
+	__be16 vlan;
+	u8 dst_mac[6];
+};
+
+struct cpl_l2t_write_rpl {
+	union opcode_tid ot;
+	u8 status;
+	u8 rsvd[3];
+};
+
+struct cpl_rdma_terminate {
+	union opcode_tid ot;
+	__be16 rsvd;
+	__be16 len;
+};
+
+struct cpl_sge_egr_update {
+	__be32 opcode_qid;
+#define EGR_QID(x) ((x) & 0x1FFFF)
+	__be16 cidx;
+	__be16 pidx;
+};
+
+/* cpl_fw*.type values */
+enum {
+	FW_TYPE_CMD_RPL = 0,
+	FW_TYPE_WR_RPL = 1,
+	FW_TYPE_CQE = 2,
+	FW_TYPE_OFLD_CONNECTION_WR_RPL = 3,
+	FW_TYPE_RSSCPL = 4,
+};
+
+struct cpl_fw4_pld {
+	u8 opcode;
+	u8 rsvd0[3];
+	u8 type;
+	u8 rsvd1;
+	__be16 len;
+	__be64 data;
+	__be64 rsvd2;
+};
+
+struct cpl_fw6_pld {
+	u8 opcode;
+	u8 rsvd[5];
+	__be16 len;
+	__be64 data[4];
+};
+
+struct cpl_fw4_msg {
+	u8 opcode;
+	u8 type;
+	__be16 rsvd0;
+	__be32 rsvd1;
+	__be64 data[2];
+};
+
+struct cpl_fw4_ack {
+	union opcode_tid ot;
+	u8 credits;
+	u8 rsvd0[2];
+	u8 seq_vld;
+	__be32 snd_nxt;
+	__be32 snd_una;
+	__be64 rsvd1;
+};
+
+struct cpl_fw6_msg {
+	u8 opcode;
+	u8 type;
+	__be16 rsvd0;
+	__be32 rsvd1;
+	__be64 data[4];
+};
+
+/* cpl_fw6_msg.type values */
+enum {
+	FW6_TYPE_CMD_RPL = 0,
+	FW6_TYPE_WR_RPL = 1,
+	FW6_TYPE_CQE = 2,
+	FW6_TYPE_OFLD_CONNECTION_WR_RPL = 3,
+	FW6_TYPE_RSSCPL = FW_TYPE_RSSCPL,
+};
+
+struct cpl_fw6_msg_ofld_connection_wr_rpl {
+	__u64   cookie;
+	__be32  tid;    /* or atid in case of active failure */
+	__u8    t_state;
+	__u8    retval;
+	__u8    rsvd[2];
+};
+
+enum {
+	ULP_TX_MEM_READ = 2,
+	ULP_TX_MEM_WRITE = 3,
+	ULP_TX_PKT = 4
+};
+
+enum {
+	ULP_TX_SC_NOOP = 0x80,
+	ULP_TX_SC_IMM  = 0x81,
+	ULP_TX_SC_DSGL = 0x82,
+	ULP_TX_SC_ISGL = 0x83
+};
+
+struct ulptx_sge_pair {
+	__be32 len[2];
+	__be64 addr[2];
+};
+
+struct ulptx_sgl {
+	__be32 cmd_nsge;
+#define ULPTX_CMD(x) ((x) << 24)
+#define ULPTX_NSGE(x) ((x) << 0)
+#define ULPTX_MORE (1U << 23)
+	__be32 len0;
+	__be64 addr0;
+	struct ulptx_sge_pair sge[0];
+};
+
+struct ulp_mem_io {
+	WR_HDR;
+	__be32 cmd;
+#define ULP_MEMIO_ORDER(x) ((x) << 23)
+	__be32 len16;             /* command length */
+	__be32 dlen;              /* data length in 32-byte units */
+#define ULP_MEMIO_DATA_LEN(x) ((x) << 0)
+	__be32 lock_addr;
+#define ULP_MEMIO_ADDR(x) ((x) << 0)
+#define ULP_MEMIO_LOCK(x) ((x) << 31)
+};
+
+#define S_T5_ULP_MEMIO_IMM    23
+#define V_T5_ULP_MEMIO_IMM(x) ((x) << S_T5_ULP_MEMIO_IMM)
+#define F_T5_ULP_MEMIO_IMM    V_T5_ULP_MEMIO_IMM(1U)
+
+#define S_T5_ULP_MEMIO_ORDER    22
+#define V_T5_ULP_MEMIO_ORDER(x) ((x) << S_T5_ULP_MEMIO_ORDER)
+#define F_T5_ULP_MEMIO_ORDER    V_T5_ULP_MEMIO_ORDER(1U)
+
+#endif  /* __T4_MSG_H */
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_regs.h b/drivers/net/ethernet/chelsio/cxgb4/t4_regs.h
new file mode 100644
index 0000000..8d2de10
--- /dev/null
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4_regs.h
@@ -0,0 +1,1344 @@
+/*
+ * This file is part of the Chelsio T4 Ethernet driver for Linux.
+ *
+ * Copyright (c) 2003-2014 Chelsio Communications, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __T4_REGS_H
+#define __T4_REGS_H
+
+#define MYPF_BASE 0x1b000
+#define MYPF_REG(reg_addr) (MYPF_BASE + (reg_addr))
+
+#define PF0_BASE 0x1e000
+#define PF0_REG(reg_addr) (PF0_BASE + (reg_addr))
+
+#define PF_STRIDE 0x400
+#define PF_BASE(idx) (PF0_BASE + (idx) * PF_STRIDE)
+#define PF_REG(idx, reg) (PF_BASE(idx) + (reg))
+
+#define MYPORT_BASE 0x1c000
+#define MYPORT_REG(reg_addr) (MYPORT_BASE + (reg_addr))
+
+#define PORT0_BASE 0x20000
+#define PORT0_REG(reg_addr) (PORT0_BASE + (reg_addr))
+
+#define PORT_STRIDE 0x2000
+#define PORT_BASE(idx) (PORT0_BASE + (idx) * PORT_STRIDE)
+#define PORT_REG(idx, reg) (PORT_BASE(idx) + (reg))
+
+#define EDC_STRIDE (EDC_1_BASE_ADDR - EDC_0_BASE_ADDR)
+#define EDC_REG(reg, idx) (reg + EDC_STRIDE * idx)
+
+#define PCIE_MEM_ACCESS_REG(reg_addr, idx) ((reg_addr) + (idx) * 8)
+#define PCIE_MAILBOX_REG(reg_addr, idx) ((reg_addr) + (idx) * 8)
+#define MC_BIST_STATUS_REG(reg_addr, idx) ((reg_addr) + (idx) * 4)
+#define EDC_BIST_STATUS_REG(reg_addr, idx) ((reg_addr) + (idx) * 4)
+
+#define SGE_PF_KDOORBELL 0x0
+#define  QID_MASK    0xffff8000U
+#define  QID_SHIFT   15
+#define  QID(x)      ((x) << QID_SHIFT)
+#define  DBPRIO(x)   ((x) << 14)
+#define  DBTYPE(x)   ((x) << 13)
+#define  PIDX_MASK   0x00003fffU
+#define  PIDX_SHIFT  0
+#define  PIDX(x)     ((x) << PIDX_SHIFT)
+#define  PIDX_SHIFT_T5   0
+#define  PIDX_T5(x)  ((x) << PIDX_SHIFT_T5)
+
+
+#define SGE_TIMERREGS	6
+#define SGE_PF_GTS 0x4
+#define  INGRESSQID_MASK   0xffff0000U
+#define  INGRESSQID_SHIFT  16
+#define  INGRESSQID(x)     ((x) << INGRESSQID_SHIFT)
+#define  TIMERREG_MASK     0x0000e000U
+#define  TIMERREG_SHIFT    13
+#define  TIMERREG(x)       ((x) << TIMERREG_SHIFT)
+#define  SEINTARM_MASK     0x00001000U
+#define  SEINTARM_SHIFT    12
+#define  SEINTARM(x)       ((x) << SEINTARM_SHIFT)
+#define  CIDXINC_MASK      0x00000fffU
+#define  CIDXINC_SHIFT     0
+#define  CIDXINC(x)        ((x) << CIDXINC_SHIFT)
+
+#define X_RXPKTCPLMODE_SPLIT     1
+#define X_INGPADBOUNDARY_SHIFT 5
+
+#define SGE_CONTROL 0x1008
+#define SGE_CONTROL2_A		0x1124
+#define  DCASYSTYPE             0x00080000U
+#define  RXPKTCPLMODE_MASK      0x00040000U
+#define  RXPKTCPLMODE_SHIFT     18
+#define  RXPKTCPLMODE(x)        ((x) << RXPKTCPLMODE_SHIFT)
+#define  EGRSTATUSPAGESIZE_MASK  0x00020000U
+#define  EGRSTATUSPAGESIZE_SHIFT 17
+#define  EGRSTATUSPAGESIZE(x)    ((x) << EGRSTATUSPAGESIZE_SHIFT)
+#define  PKTSHIFT_MASK          0x00001c00U
+#define  PKTSHIFT_SHIFT         10
+#define  PKTSHIFT(x)            ((x) << PKTSHIFT_SHIFT)
+#define  PKTSHIFT_GET(x)	(((x) & PKTSHIFT_MASK) >> PKTSHIFT_SHIFT)
+#define  INGPCIEBOUNDARY_32B_X	0
+#define  INGPCIEBOUNDARY_MASK   0x00000380U
+#define  INGPCIEBOUNDARY_SHIFT  7
+#define  INGPCIEBOUNDARY(x)     ((x) << INGPCIEBOUNDARY_SHIFT)
+#define  INGPADBOUNDARY_MASK    0x00000070U
+#define  INGPADBOUNDARY_SHIFT   4
+#define  INGPADBOUNDARY(x)      ((x) << INGPADBOUNDARY_SHIFT)
+#define  INGPADBOUNDARY_GET(x)	(((x) & INGPADBOUNDARY_MASK) \
+				 >> INGPADBOUNDARY_SHIFT)
+#define  INGPACKBOUNDARY_16B_X	0
+#define  INGPACKBOUNDARY_SHIFT_X 5
+
+#define  INGPACKBOUNDARY_S	16
+#define  INGPACKBOUNDARY_M	0x7U
+#define  INGPACKBOUNDARY_V(x)	((x) << INGPACKBOUNDARY_S)
+#define  INGPACKBOUNDARY_G(x)	(((x) >> INGPACKBOUNDARY_S) \
+				 & INGPACKBOUNDARY_M)
+#define  EGRPCIEBOUNDARY_MASK   0x0000000eU
+#define  EGRPCIEBOUNDARY_SHIFT  1
+#define  EGRPCIEBOUNDARY(x)     ((x) << EGRPCIEBOUNDARY_SHIFT)
+#define  GLOBALENABLE           0x00000001U
+
+#define SGE_HOST_PAGE_SIZE 0x100c
+
+#define  HOSTPAGESIZEPF7_MASK   0x0000000fU
+#define  HOSTPAGESIZEPF7_SHIFT  28
+#define  HOSTPAGESIZEPF7(x)     ((x) << HOSTPAGESIZEPF7_SHIFT)
+
+#define  HOSTPAGESIZEPF6_MASK   0x0000000fU
+#define  HOSTPAGESIZEPF6_SHIFT  24
+#define  HOSTPAGESIZEPF6(x)     ((x) << HOSTPAGESIZEPF6_SHIFT)
+
+#define  HOSTPAGESIZEPF5_MASK   0x0000000fU
+#define  HOSTPAGESIZEPF5_SHIFT  20
+#define  HOSTPAGESIZEPF5(x)     ((x) << HOSTPAGESIZEPF5_SHIFT)
+
+#define  HOSTPAGESIZEPF4_MASK   0x0000000fU
+#define  HOSTPAGESIZEPF4_SHIFT  16
+#define  HOSTPAGESIZEPF4(x)     ((x) << HOSTPAGESIZEPF4_SHIFT)
+
+#define  HOSTPAGESIZEPF3_MASK   0x0000000fU
+#define  HOSTPAGESIZEPF3_SHIFT  12
+#define  HOSTPAGESIZEPF3(x)     ((x) << HOSTPAGESIZEPF3_SHIFT)
+
+#define  HOSTPAGESIZEPF2_MASK   0x0000000fU
+#define  HOSTPAGESIZEPF2_SHIFT  8
+#define  HOSTPAGESIZEPF2(x)     ((x) << HOSTPAGESIZEPF2_SHIFT)
+
+#define  HOSTPAGESIZEPF1_MASK   0x0000000fU
+#define  HOSTPAGESIZEPF1_SHIFT  4
+#define  HOSTPAGESIZEPF1(x)     ((x) << HOSTPAGESIZEPF1_SHIFT)
+
+#define  HOSTPAGESIZEPF0_MASK   0x0000000fU
+#define  HOSTPAGESIZEPF0_SHIFT  0
+#define  HOSTPAGESIZEPF0(x)     ((x) << HOSTPAGESIZEPF0_SHIFT)
+
+#define SGE_EGRESS_QUEUES_PER_PAGE_PF 0x1010
+#define  QUEUESPERPAGEPF0_MASK   0x0000000fU
+#define  QUEUESPERPAGEPF0_GET(x) ((x) & QUEUESPERPAGEPF0_MASK)
+
+#define QUEUESPERPAGEPF0    0
+#define QUEUESPERPAGEPF1    4
+
+/* T5 and later support a new BAR2-based doorbell mechanism for Egress Queues.
+ * The User Doorbells are each 128 bytes in length with a Simple Doorbell at
+ * offsets 8x and a Write Combining single 64-byte Egress Queue Unit
+ * (X_IDXSIZE_UNIT) Gather Buffer interface at offset 64.  For Ingress Queues,
+ * we have a Going To Sleep register at offsets 8x+4.
+ *
+ * As noted above, we have many instances of the Simple Doorbell and Going To
+ * Sleep registers at offsets 8x and 8x+4, respectively.  We want to use a
+ * non-64-byte aligned offset for the Simple Doorbell in order to attempt to
+ * avoid buffering of the writes to the Simple Doorbell and we want to use a
+ * non-contiguous offset for the Going To Sleep writes in order to avoid
+ * possible combining between them.
+ */
+#define SGE_UDB_SIZE            128
+#define SGE_UDB_KDOORBELL       8
+#define SGE_UDB_GTS             20
+#define SGE_UDB_WCDOORBELL      64
+
+#define SGE_INT_CAUSE1 0x1024
+#define SGE_INT_CAUSE2 0x1030
+#define SGE_INT_CAUSE3 0x103c
+#define  ERR_FLM_DBP               0x80000000U
+#define  ERR_FLM_IDMA1             0x40000000U
+#define  ERR_FLM_IDMA0             0x20000000U
+#define  ERR_FLM_HINT              0x10000000U
+#define  ERR_PCIE_ERROR3           0x08000000U
+#define  ERR_PCIE_ERROR2           0x04000000U
+#define  ERR_PCIE_ERROR1           0x02000000U
+#define  ERR_PCIE_ERROR0           0x01000000U
+#define  ERR_TIMER_ABOVE_MAX_QID   0x00800000U
+#define  ERR_CPL_EXCEED_IQE_SIZE   0x00400000U
+#define  ERR_INVALID_CIDX_INC      0x00200000U
+#define  ERR_ITP_TIME_PAUSED       0x00100000U
+#define  ERR_CPL_OPCODE_0          0x00080000U
+#define  ERR_DROPPED_DB            0x00040000U
+#define  ERR_DATA_CPL_ON_HIGH_QID1 0x00020000U
+#define  ERR_DATA_CPL_ON_HIGH_QID0 0x00010000U
+#define  ERR_BAD_DB_PIDX3          0x00008000U
+#define  ERR_BAD_DB_PIDX2          0x00004000U
+#define  ERR_BAD_DB_PIDX1          0x00002000U
+#define  ERR_BAD_DB_PIDX0          0x00001000U
+#define  ERR_ING_PCIE_CHAN         0x00000800U
+#define  ERR_ING_CTXT_PRIO         0x00000400U
+#define  ERR_EGR_CTXT_PRIO         0x00000200U
+#define  DBFIFO_HP_INT             0x00000100U
+#define  DBFIFO_LP_INT             0x00000080U
+#define  REG_ADDRESS_ERR           0x00000040U
+#define  INGRESS_SIZE_ERR          0x00000020U
+#define  EGRESS_SIZE_ERR           0x00000010U
+#define  ERR_INV_CTXT3             0x00000008U
+#define  ERR_INV_CTXT2             0x00000004U
+#define  ERR_INV_CTXT1             0x00000002U
+#define  ERR_INV_CTXT0             0x00000001U
+
+#define SGE_INT_ENABLE3 0x1040
+#define SGE_FL_BUFFER_SIZE0 0x1044
+#define SGE_FL_BUFFER_SIZE1 0x1048
+#define SGE_FL_BUFFER_SIZE2 0x104c
+#define SGE_FL_BUFFER_SIZE3 0x1050
+#define SGE_FL_BUFFER_SIZE4 0x1054
+#define SGE_FL_BUFFER_SIZE5 0x1058
+#define SGE_FL_BUFFER_SIZE6 0x105c
+#define SGE_FL_BUFFER_SIZE7 0x1060
+#define SGE_FL_BUFFER_SIZE8 0x1064
+
+#define SGE_INGRESS_RX_THRESHOLD 0x10a0
+#define  THRESHOLD_0_MASK   0x3f000000U
+#define  THRESHOLD_0_SHIFT  24
+#define  THRESHOLD_0(x)     ((x) << THRESHOLD_0_SHIFT)
+#define  THRESHOLD_0_GET(x) (((x) & THRESHOLD_0_MASK) >> THRESHOLD_0_SHIFT)
+#define  THRESHOLD_1_MASK   0x003f0000U
+#define  THRESHOLD_1_SHIFT  16
+#define  THRESHOLD_1(x)     ((x) << THRESHOLD_1_SHIFT)
+#define  THRESHOLD_1_GET(x) (((x) & THRESHOLD_1_MASK) >> THRESHOLD_1_SHIFT)
+#define  THRESHOLD_2_MASK   0x00003f00U
+#define  THRESHOLD_2_SHIFT  8
+#define  THRESHOLD_2(x)     ((x) << THRESHOLD_2_SHIFT)
+#define  THRESHOLD_2_GET(x) (((x) & THRESHOLD_2_MASK) >> THRESHOLD_2_SHIFT)
+#define  THRESHOLD_3_MASK   0x0000003fU
+#define  THRESHOLD_3_SHIFT  0
+#define  THRESHOLD_3(x)     ((x) << THRESHOLD_3_SHIFT)
+#define  THRESHOLD_3_GET(x) (((x) & THRESHOLD_3_MASK) >> THRESHOLD_3_SHIFT)
+
+#define SGE_CONM_CTRL 0x1094
+#define  EGRTHRESHOLD_MASK   0x00003f00U
+#define  EGRTHRESHOLDshift   8
+#define  EGRTHRESHOLD(x)     ((x) << EGRTHRESHOLDshift)
+#define  EGRTHRESHOLD_GET(x) (((x) & EGRTHRESHOLD_MASK) >> EGRTHRESHOLDshift)
+
+#define EGRTHRESHOLDPACKING_MASK	0x3fU
+#define EGRTHRESHOLDPACKING_SHIFT	14
+#define EGRTHRESHOLDPACKING(x)		((x) << EGRTHRESHOLDPACKING_SHIFT)
+#define EGRTHRESHOLDPACKING_GET(x)	(((x) >> EGRTHRESHOLDPACKING_SHIFT) & \
+					  EGRTHRESHOLDPACKING_MASK)
+
+#define SGE_DBFIFO_STATUS 0x10a4
+#define  HP_INT_THRESH_SHIFT 28
+#define  HP_INT_THRESH_MASK  0xfU
+#define  HP_INT_THRESH(x)    ((x) << HP_INT_THRESH_SHIFT)
+#define  LP_INT_THRESH_SHIFT 12
+#define  LP_INT_THRESH_MASK  0xfU
+#define  LP_INT_THRESH(x)    ((x) << LP_INT_THRESH_SHIFT)
+
+#define SGE_DOORBELL_CONTROL 0x10a8
+#define  ENABLE_DROP        (1 << 13)
+
+#define S_NOCOALESCE    26
+#define V_NOCOALESCE(x) ((x) << S_NOCOALESCE)
+#define F_NOCOALESCE    V_NOCOALESCE(1U)
+
+#define SGE_TIMESTAMP_LO 0x1098
+#define SGE_TIMESTAMP_HI 0x109c
+#define S_TSVAL    0
+#define M_TSVAL    0xfffffffU
+#define GET_TSVAL(x) (((x) >> S_TSVAL) & M_TSVAL)
+
+#define SGE_TIMER_VALUE_0_AND_1 0x10b8
+#define  TIMERVALUE0_MASK   0xffff0000U
+#define  TIMERVALUE0_SHIFT  16
+#define  TIMERVALUE0(x)     ((x) << TIMERVALUE0_SHIFT)
+#define  TIMERVALUE0_GET(x) (((x) & TIMERVALUE0_MASK) >> TIMERVALUE0_SHIFT)
+#define  TIMERVALUE1_MASK   0x0000ffffU
+#define  TIMERVALUE1_SHIFT  0
+#define  TIMERVALUE1(x)     ((x) << TIMERVALUE1_SHIFT)
+#define  TIMERVALUE1_GET(x) (((x) & TIMERVALUE1_MASK) >> TIMERVALUE1_SHIFT)
+
+#define SGE_TIMER_VALUE_2_AND_3 0x10bc
+#define  TIMERVALUE2_MASK   0xffff0000U
+#define  TIMERVALUE2_SHIFT  16
+#define  TIMERVALUE2(x)     ((x) << TIMERVALUE2_SHIFT)
+#define  TIMERVALUE2_GET(x) (((x) & TIMERVALUE2_MASK) >> TIMERVALUE2_SHIFT)
+#define  TIMERVALUE3_MASK   0x0000ffffU
+#define  TIMERVALUE3_SHIFT  0
+#define  TIMERVALUE3(x)     ((x) << TIMERVALUE3_SHIFT)
+#define  TIMERVALUE3_GET(x) (((x) & TIMERVALUE3_MASK) >> TIMERVALUE3_SHIFT)
+
+#define SGE_TIMER_VALUE_4_AND_5 0x10c0
+#define  TIMERVALUE4_MASK   0xffff0000U
+#define  TIMERVALUE4_SHIFT  16
+#define  TIMERVALUE4(x)     ((x) << TIMERVALUE4_SHIFT)
+#define  TIMERVALUE4_GET(x) (((x) & TIMERVALUE4_MASK) >> TIMERVALUE4_SHIFT)
+#define  TIMERVALUE5_MASK   0x0000ffffU
+#define  TIMERVALUE5_SHIFT  0
+#define  TIMERVALUE5(x)     ((x) << TIMERVALUE5_SHIFT)
+#define  TIMERVALUE5_GET(x) (((x) & TIMERVALUE5_MASK) >> TIMERVALUE5_SHIFT)
+
+#define SGE_DEBUG_INDEX 0x10cc
+#define SGE_DEBUG_DATA_HIGH 0x10d0
+#define SGE_DEBUG_DATA_LOW 0x10d4
+#define SGE_DEBUG_DATA_LOW_INDEX_2	0x12c8
+#define SGE_DEBUG_DATA_LOW_INDEX_3	0x12cc
+#define SGE_DEBUG_DATA_HIGH_INDEX_10	0x12a8
+#define SGE_INGRESS_QUEUES_PER_PAGE_PF 0x10f4
+
+#define S_HP_INT_THRESH    28
+#define M_HP_INT_THRESH 0xfU
+#define V_HP_INT_THRESH(x) ((x) << S_HP_INT_THRESH)
+#define S_LP_INT_THRESH_T5    18
+#define V_LP_INT_THRESH_T5(x) ((x) << S_LP_INT_THRESH_T5)
+#define M_LP_COUNT_T5    0x3ffffU
+#define G_LP_COUNT_T5(x) (((x) >> S_LP_COUNT) & M_LP_COUNT_T5)
+#define M_HP_COUNT 0x7ffU
+#define S_HP_COUNT 16
+#define G_HP_COUNT(x) (((x) >> S_HP_COUNT) & M_HP_COUNT)
+#define S_LP_INT_THRESH    12
+#define M_LP_INT_THRESH 0xfU
+#define M_LP_INT_THRESH_T5    0xfffU
+#define V_LP_INT_THRESH(x) ((x) << S_LP_INT_THRESH)
+#define M_LP_COUNT 0x7ffU
+#define S_LP_COUNT 0
+#define G_LP_COUNT(x) (((x) >> S_LP_COUNT) & M_LP_COUNT)
+#define A_SGE_DBFIFO_STATUS 0x10a4
+
+#define SGE_STAT_TOTAL 0x10e4
+#define SGE_STAT_MATCH 0x10e8
+
+#define SGE_STAT_CFG   0x10ec
+#define S_STATSOURCE_T5    9
+#define STATSOURCE_T5(x) ((x) << S_STATSOURCE_T5)
+
+#define SGE_DBFIFO_STATUS2 0x1118
+#define M_HP_COUNT_T5    0x3ffU
+#define G_HP_COUNT_T5(x) ((x)  & M_HP_COUNT_T5)
+#define S_HP_INT_THRESH_T5    10
+#define M_HP_INT_THRESH_T5    0xfU
+#define V_HP_INT_THRESH_T5(x) ((x) << S_HP_INT_THRESH_T5)
+
+#define S_ENABLE_DROP    13
+#define V_ENABLE_DROP(x) ((x) << S_ENABLE_DROP)
+#define F_ENABLE_DROP    V_ENABLE_DROP(1U)
+#define S_DROPPED_DB 0
+#define V_DROPPED_DB(x) ((x) << S_DROPPED_DB)
+#define F_DROPPED_DB V_DROPPED_DB(1U)
+#define A_SGE_DOORBELL_CONTROL 0x10a8
+
+#define A_SGE_CTXT_CMD 0x11fc
+#define A_SGE_DBQ_CTXT_BADDR 0x1084
+
+#define PCIE_PF_CFG 0x40
+#define  AIVEC(x)	((x) << 4)
+#define  AIVEC_MASK	0x3ffU
+
+#define PCIE_PF_CLI 0x44
+#define PCIE_INT_CAUSE 0x3004
+#define  UNXSPLCPLERR  0x20000000U
+#define  PCIEPINT      0x10000000U
+#define  PCIESINT      0x08000000U
+#define  RPLPERR       0x04000000U
+#define  RXWRPERR      0x02000000U
+#define  RXCPLPERR     0x01000000U
+#define  PIOTAGPERR    0x00800000U
+#define  MATAGPERR     0x00400000U
+#define  INTXCLRPERR   0x00200000U
+#define  FIDPERR       0x00100000U
+#define  CFGSNPPERR    0x00080000U
+#define  HRSPPERR      0x00040000U
+#define  HREQPERR      0x00020000U
+#define  HCNTPERR      0x00010000U
+#define  DRSPPERR      0x00008000U
+#define  DREQPERR      0x00004000U
+#define  DCNTPERR      0x00002000U
+#define  CRSPPERR      0x00001000U
+#define  CREQPERR      0x00000800U
+#define  CCNTPERR      0x00000400U
+#define  TARTAGPERR    0x00000200U
+#define  PIOREQPERR    0x00000100U
+#define  PIOCPLPERR    0x00000080U
+#define  MSIXDIPERR    0x00000040U
+#define  MSIXDATAPERR  0x00000020U
+#define  MSIXADDRHPERR 0x00000010U
+#define  MSIXADDRLPERR 0x00000008U
+#define  MSIDATAPERR   0x00000004U
+#define  MSIADDRHPERR  0x00000002U
+#define  MSIADDRLPERR  0x00000001U
+
+#define  READRSPERR      0x20000000U
+#define  TRGT1GRPPERR    0x10000000U
+#define  IPSOTPERR       0x08000000U
+#define  IPRXDATAGRPPERR 0x02000000U
+#define  IPRXHDRGRPPERR  0x01000000U
+#define  MAGRPPERR       0x00400000U
+#define  VFIDPERR        0x00200000U
+#define  HREQWRPERR      0x00010000U
+#define  DREQWRPERR      0x00002000U
+#define  MSTTAGQPERR     0x00000400U
+#define  PIOREQGRPPERR   0x00000100U
+#define  PIOCPLGRPPERR   0x00000080U
+#define  MSIXSTIPERR     0x00000004U
+#define  MSTTIMEOUTPERR  0x00000002U
+#define  MSTGRPPERR      0x00000001U
+
+#define PCIE_NONFAT_ERR 0x3010
+#define PCIE_CFG_SPACE_REQ 0x3060
+#define PCIE_CFG_SPACE_DATA 0x3064
+#define PCIE_MEM_ACCESS_BASE_WIN 0x3068
+#define S_PCIEOFST       10
+#define M_PCIEOFST       0x3fffffU
+#define GET_PCIEOFST(x)  (((x) >> S_PCIEOFST) & M_PCIEOFST)
+#define  PCIEOFST_MASK   0xfffffc00U
+#define  BIR_MASK        0x00000300U
+#define  BIR_SHIFT       8
+#define  BIR(x)          ((x) << BIR_SHIFT)
+#define  WINDOW_MASK     0x000000ffU
+#define  WINDOW_SHIFT    0
+#define  WINDOW(x)       ((x) << WINDOW_SHIFT)
+#define  GET_WINDOW(x)	 (((x) >> WINDOW_SHIFT) & WINDOW_MASK)
+#define PCIE_MEM_ACCESS_OFFSET 0x306c
+#define ENABLE	(1U << 30)
+#define FUNCTION(x) ((x) << 12)
+#define F_LOCALCFG    (1U << 28)
+
+#define S_PFNUM    0
+#define V_PFNUM(x) ((x) << S_PFNUM)
+
+#define PCIE_FW 0x30b8
+#define  PCIE_FW_ERR		0x80000000U
+#define  PCIE_FW_INIT		0x40000000U
+#define  PCIE_FW_HALT		0x20000000U
+#define  PCIE_FW_MASTER_VLD	0x00008000U
+#define  PCIE_FW_MASTER(x)	((x) << 12)
+#define  PCIE_FW_MASTER_MASK	0x7
+#define  PCIE_FW_MASTER_GET(x)	(((x) >> 12) & PCIE_FW_MASTER_MASK)
+
+#define PCIE_CORE_UTL_SYSTEM_BUS_AGENT_STATUS 0x5908
+#define  RNPP 0x80000000U
+#define  RPCP 0x20000000U
+#define  RCIP 0x08000000U
+#define  RCCP 0x04000000U
+#define  RFTP 0x00800000U
+#define  PTRP 0x00100000U
+
+#define PCIE_CORE_UTL_PCI_EXPRESS_PORT_STATUS 0x59a4
+#define  TPCP 0x40000000U
+#define  TNPP 0x20000000U
+#define  TFTP 0x10000000U
+#define  TCAP 0x08000000U
+#define  TCIP 0x04000000U
+#define  RCAP 0x02000000U
+#define  PLUP 0x00800000U
+#define  PLDN 0x00400000U
+#define  OTDD 0x00200000U
+#define  GTRP 0x00100000U
+#define  RDPE 0x00040000U
+#define  TDCE 0x00020000U
+#define  TDUE 0x00010000U
+
+#define MC_INT_CAUSE 0x7518
+#define MC_P_INT_CAUSE 0x41318
+#define  ECC_UE_INT_CAUSE 0x00000004U
+#define  ECC_CE_INT_CAUSE 0x00000002U
+#define  PERR_INT_CAUSE   0x00000001U
+
+#define MC_ECC_STATUS 0x751c
+#define MC_P_ECC_STATUS 0x4131c
+#define  ECC_CECNT_MASK   0xffff0000U
+#define  ECC_CECNT_SHIFT  16
+#define  ECC_CECNT(x)     ((x) << ECC_CECNT_SHIFT)
+#define  ECC_CECNT_GET(x) (((x) & ECC_CECNT_MASK) >> ECC_CECNT_SHIFT)
+#define  ECC_UECNT_MASK   0x0000ffffU
+#define  ECC_UECNT_SHIFT  0
+#define  ECC_UECNT(x)     ((x) << ECC_UECNT_SHIFT)
+#define  ECC_UECNT_GET(x) (((x) & ECC_UECNT_MASK) >> ECC_UECNT_SHIFT)
+
+#define MC_BIST_CMD 0x7600
+#define  START_BIST          0x80000000U
+#define  BIST_CMD_GAP_MASK   0x0000ff00U
+#define  BIST_CMD_GAP_SHIFT  8
+#define  BIST_CMD_GAP(x)     ((x) << BIST_CMD_GAP_SHIFT)
+#define  BIST_OPCODE_MASK    0x00000003U
+#define  BIST_OPCODE_SHIFT   0
+#define  BIST_OPCODE(x)      ((x) << BIST_OPCODE_SHIFT)
+
+#define MC_BIST_CMD_ADDR 0x7604
+#define MC_BIST_CMD_LEN 0x7608
+#define MC_BIST_DATA_PATTERN 0x760c
+#define  BIST_DATA_TYPE_MASK   0x0000000fU
+#define  BIST_DATA_TYPE_SHIFT  0
+#define  BIST_DATA_TYPE(x)     ((x) << BIST_DATA_TYPE_SHIFT)
+
+#define MC_BIST_STATUS_RDATA 0x7688
+
+#define MA_EDRAM0_BAR 0x77c0
+#define MA_EDRAM1_BAR 0x77c4
+#define EDRAM_SIZE_MASK   0xfffU
+#define EDRAM_SIZE_GET(x) ((x) & EDRAM_SIZE_MASK)
+
+#define MA_EXT_MEMORY_BAR 0x77c8
+#define  EXT_MEM_SIZE_MASK   0x00000fffU
+#define  EXT_MEM_SIZE_SHIFT  0
+#define  EXT_MEM_SIZE_GET(x) (((x) & EXT_MEM_SIZE_MASK) >> EXT_MEM_SIZE_SHIFT)
+
+#define MA_TARGET_MEM_ENABLE 0x77d8
+#define  EXT_MEM1_ENABLE 0x00000010U
+#define  EXT_MEM_ENABLE 0x00000004U
+#define  EDRAM1_ENABLE  0x00000002U
+#define  EDRAM0_ENABLE  0x00000001U
+
+#define MA_INT_CAUSE 0x77e0
+#define  MEM_PERR_INT_CAUSE 0x00000002U
+#define  MEM_WRAP_INT_CAUSE 0x00000001U
+
+#define MA_INT_WRAP_STATUS 0x77e4
+#define  MEM_WRAP_ADDRESS_MASK   0xfffffff0U
+#define  MEM_WRAP_ADDRESS_SHIFT  4
+#define  MEM_WRAP_ADDRESS_GET(x) (((x) & MEM_WRAP_ADDRESS_MASK) >> MEM_WRAP_ADDRESS_SHIFT)
+#define  MEM_WRAP_CLIENT_NUM_MASK   0x0000000fU
+#define  MEM_WRAP_CLIENT_NUM_SHIFT  0
+#define  MEM_WRAP_CLIENT_NUM_GET(x) (((x) & MEM_WRAP_CLIENT_NUM_MASK) >> MEM_WRAP_CLIENT_NUM_SHIFT)
+#define MA_PCIE_FW 0x30b8
+#define MA_PARITY_ERROR_STATUS 0x77f4
+#define MA_PARITY_ERROR_STATUS2 0x7804
+
+#define MA_EXT_MEMORY1_BAR 0x7808
+#define EDC_0_BASE_ADDR 0x7900
+
+#define EDC_BIST_CMD 0x7904
+#define EDC_BIST_CMD_ADDR 0x7908
+#define EDC_BIST_CMD_LEN 0x790c
+#define EDC_BIST_DATA_PATTERN 0x7910
+#define EDC_BIST_STATUS_RDATA 0x7928
+#define EDC_INT_CAUSE 0x7978
+#define  ECC_UE_PAR     0x00000020U
+#define  ECC_CE_PAR     0x00000010U
+#define  PERR_PAR_CAUSE 0x00000008U
+
+#define EDC_ECC_STATUS 0x797c
+
+#define EDC_1_BASE_ADDR 0x7980
+
+#define CIM_BOOT_CFG 0x7b00
+#define  BOOTADDR_MASK 0xffffff00U
+#define  UPCRST        0x1U
+
+#define CIM_PF_MAILBOX_DATA 0x240
+#define CIM_PF_MAILBOX_CTRL 0x280
+#define  MBMSGVALID     0x00000008U
+#define  MBINTREQ       0x00000004U
+#define  MBOWNER_MASK   0x00000003U
+#define  MBOWNER_SHIFT  0
+#define  MBOWNER(x)     ((x) << MBOWNER_SHIFT)
+#define  MBOWNER_GET(x) (((x) & MBOWNER_MASK) >> MBOWNER_SHIFT)
+
+#define CIM_PF_HOST_INT_ENABLE 0x288
+#define  MBMSGRDYINTEN(x) ((x) << 19)
+
+#define CIM_PF_HOST_INT_CAUSE 0x28c
+#define  MBMSGRDYINT 0x00080000U
+
+#define CIM_HOST_INT_CAUSE 0x7b2c
+#define  TIEQOUTPARERRINT  0x00100000U
+#define  TIEQINPARERRINT   0x00080000U
+#define  MBHOSTPARERR      0x00040000U
+#define  MBUPPARERR        0x00020000U
+#define  IBQPARERR         0x0001f800U
+#define  IBQTP0PARERR      0x00010000U
+#define  IBQTP1PARERR      0x00008000U
+#define  IBQULPPARERR      0x00004000U
+#define  IBQSGELOPARERR    0x00002000U
+#define  IBQSGEHIPARERR    0x00001000U
+#define  IBQNCSIPARERR     0x00000800U
+#define  OBQPARERR         0x000007e0U
+#define  OBQULP0PARERR     0x00000400U
+#define  OBQULP1PARERR     0x00000200U
+#define  OBQULP2PARERR     0x00000100U
+#define  OBQULP3PARERR     0x00000080U
+#define  OBQSGEPARERR      0x00000040U
+#define  OBQNCSIPARERR     0x00000020U
+#define  PREFDROPINT       0x00000002U
+#define  UPACCNONZERO      0x00000001U
+
+#define CIM_HOST_UPACC_INT_CAUSE 0x7b34
+#define  EEPROMWRINT      0x40000000U
+#define  TIMEOUTMAINT     0x20000000U
+#define  TIMEOUTINT       0x10000000U
+#define  RSPOVRLOOKUPINT  0x08000000U
+#define  REQOVRLOOKUPINT  0x04000000U
+#define  BLKWRPLINT       0x02000000U
+#define  BLKRDPLINT       0x01000000U
+#define  SGLWRPLINT       0x00800000U
+#define  SGLRDPLINT       0x00400000U
+#define  BLKWRCTLINT      0x00200000U
+#define  BLKRDCTLINT      0x00100000U
+#define  SGLWRCTLINT      0x00080000U
+#define  SGLRDCTLINT      0x00040000U
+#define  BLKWREEPROMINT   0x00020000U
+#define  BLKRDEEPROMINT   0x00010000U
+#define  SGLWREEPROMINT   0x00008000U
+#define  SGLRDEEPROMINT   0x00004000U
+#define  BLKWRFLASHINT    0x00002000U
+#define  BLKRDFLASHINT    0x00001000U
+#define  SGLWRFLASHINT    0x00000800U
+#define  SGLRDFLASHINT    0x00000400U
+#define  BLKWRBOOTINT     0x00000200U
+#define  BLKRDBOOTINT     0x00000100U
+#define  SGLWRBOOTINT     0x00000080U
+#define  SGLRDBOOTINT     0x00000040U
+#define  ILLWRBEINT       0x00000020U
+#define  ILLRDBEINT       0x00000010U
+#define  ILLRDINT         0x00000008U
+#define  ILLWRINT         0x00000004U
+#define  ILLTRANSINT      0x00000002U
+#define  RSVDSPACEINT     0x00000001U
+
+#define TP_OUT_CONFIG 0x7d04
+#define  VLANEXTENABLE_MASK  0x0000f000U
+#define  VLANEXTENABLE_SHIFT 12
+
+#define TP_GLOBAL_CONFIG 0x7d08
+#define  FIVETUPLELOOKUP_SHIFT  17
+#define  FIVETUPLELOOKUP_MASK   0x00060000U
+#define  FIVETUPLELOOKUP(x)     ((x) << FIVETUPLELOOKUP_SHIFT)
+#define  FIVETUPLELOOKUP_GET(x) (((x) & FIVETUPLELOOKUP_MASK) >> \
+				FIVETUPLELOOKUP_SHIFT)
+
+#define TP_PARA_REG2 0x7d68
+#define  MAXRXDATA_MASK    0xffff0000U
+#define  MAXRXDATA_SHIFT   16
+#define  MAXRXDATA_GET(x) (((x) & MAXRXDATA_MASK) >> MAXRXDATA_SHIFT)
+
+#define TP_TIMER_RESOLUTION 0x7d90
+#define  TIMERRESOLUTION_MASK   0x00ff0000U
+#define  TIMERRESOLUTION_SHIFT  16
+#define  TIMERRESOLUTION_GET(x) (((x) & TIMERRESOLUTION_MASK) >> TIMERRESOLUTION_SHIFT)
+#define  DELAYEDACKRESOLUTION_MASK 0x000000ffU
+#define  DELAYEDACKRESOLUTION_SHIFT     0
+#define  DELAYEDACKRESOLUTION_GET(x) \
+	(((x) & DELAYEDACKRESOLUTION_MASK) >> DELAYEDACKRESOLUTION_SHIFT)
+
+#define TP_SHIFT_CNT 0x7dc0
+#define  SYNSHIFTMAX_SHIFT         24
+#define  SYNSHIFTMAX_MASK          0xff000000U
+#define  SYNSHIFTMAX(x)            ((x) << SYNSHIFTMAX_SHIFT)
+#define  SYNSHIFTMAX_GET(x)        (((x) & SYNSHIFTMAX_MASK) >> \
+				   SYNSHIFTMAX_SHIFT)
+#define  RXTSHIFTMAXR1_SHIFT       20
+#define  RXTSHIFTMAXR1_MASK        0x00f00000U
+#define  RXTSHIFTMAXR1(x)          ((x) << RXTSHIFTMAXR1_SHIFT)
+#define  RXTSHIFTMAXR1_GET(x)      (((x) & RXTSHIFTMAXR1_MASK) >> \
+				   RXTSHIFTMAXR1_SHIFT)
+#define  RXTSHIFTMAXR2_SHIFT       16
+#define  RXTSHIFTMAXR2_MASK        0x000f0000U
+#define  RXTSHIFTMAXR2(x)          ((x) << RXTSHIFTMAXR2_SHIFT)
+#define  RXTSHIFTMAXR2_GET(x)      (((x) & RXTSHIFTMAXR2_MASK) >> \
+				   RXTSHIFTMAXR2_SHIFT)
+#define  PERSHIFTBACKOFFMAX_SHIFT  12
+#define  PERSHIFTBACKOFFMAX_MASK   0x0000f000U
+#define  PERSHIFTBACKOFFMAX(x)     ((x) << PERSHIFTBACKOFFMAX_SHIFT)
+#define  PERSHIFTBACKOFFMAX_GET(x) (((x) & PERSHIFTBACKOFFMAX_MASK) >> \
+				   PERSHIFTBACKOFFMAX_SHIFT)
+#define  PERSHIFTMAX_SHIFT         8
+#define  PERSHIFTMAX_MASK          0x00000f00U
+#define  PERSHIFTMAX(x)            ((x) << PERSHIFTMAX_SHIFT)
+#define  PERSHIFTMAX_GET(x)        (((x) & PERSHIFTMAX_MASK) >> \
+				   PERSHIFTMAX_SHIFT)
+#define  KEEPALIVEMAXR1_SHIFT      4
+#define  KEEPALIVEMAXR1_MASK       0x000000f0U
+#define  KEEPALIVEMAXR1(x)         ((x) << KEEPALIVEMAXR1_SHIFT)
+#define  KEEPALIVEMAXR1_GET(x)     (((x) & KEEPALIVEMAXR1_MASK) >> \
+				   KEEPALIVEMAXR1_SHIFT)
+#define KEEPALIVEMAXR2_SHIFT       0
+#define KEEPALIVEMAXR2_MASK        0x0000000fU
+#define KEEPALIVEMAXR2(x)          ((x) << KEEPALIVEMAXR2_SHIFT)
+#define KEEPALIVEMAXR2_GET(x)      (((x) & KEEPALIVEMAXR2_MASK) >> \
+				   KEEPALIVEMAXR2_SHIFT)
+
+#define TP_CCTRL_TABLE 0x7ddc
+#define TP_MTU_TABLE 0x7de4
+#define  MTUINDEX_MASK   0xff000000U
+#define  MTUINDEX_SHIFT  24
+#define  MTUINDEX(x)     ((x) << MTUINDEX_SHIFT)
+#define  MTUWIDTH_MASK   0x000f0000U
+#define  MTUWIDTH_SHIFT  16
+#define  MTUWIDTH(x)     ((x) << MTUWIDTH_SHIFT)
+#define  MTUWIDTH_GET(x) (((x) & MTUWIDTH_MASK) >> MTUWIDTH_SHIFT)
+#define  MTUVALUE_MASK   0x00003fffU
+#define  MTUVALUE_SHIFT  0
+#define  MTUVALUE(x)     ((x) << MTUVALUE_SHIFT)
+#define  MTUVALUE_GET(x) (((x) & MTUVALUE_MASK) >> MTUVALUE_SHIFT)
+
+#define TP_RSS_LKP_TABLE 0x7dec
+#define  LKPTBLROWVLD        0x80000000U
+#define  LKPTBLQUEUE1_MASK   0x000ffc00U
+#define  LKPTBLQUEUE1_SHIFT  10
+#define  LKPTBLQUEUE1(x)     ((x) << LKPTBLQUEUE1_SHIFT)
+#define  LKPTBLQUEUE1_GET(x) (((x) & LKPTBLQUEUE1_MASK) >> LKPTBLQUEUE1_SHIFT)
+#define  LKPTBLQUEUE0_MASK   0x000003ffU
+#define  LKPTBLQUEUE0_SHIFT  0
+#define  LKPTBLQUEUE0(x)     ((x) << LKPTBLQUEUE0_SHIFT)
+#define  LKPTBLQUEUE0_GET(x) (((x) & LKPTBLQUEUE0_MASK) >> LKPTBLQUEUE0_SHIFT)
+
+#define TP_PIO_ADDR 0x7e40
+#define TP_PIO_DATA 0x7e44
+#define TP_MIB_INDEX 0x7e50
+#define TP_MIB_DATA 0x7e54
+#define TP_INT_CAUSE 0x7e74
+#define  FLMTXFLSTEMPTY 0x40000000U
+
+#define TP_VLAN_PRI_MAP 0x140
+#define  FRAGMENTATION_SHIFT 9
+#define  FRAGMENTATION_MASK  0x00000200U
+#define  MPSHITTYPE_MASK     0x00000100U
+#define  MACMATCH_MASK       0x00000080U
+#define  ETHERTYPE_MASK      0x00000040U
+#define  PROTOCOL_MASK       0x00000020U
+#define  TOS_MASK            0x00000010U
+#define  VLAN_MASK           0x00000008U
+#define  VNIC_ID_MASK        0x00000004U
+#define  PORT_MASK           0x00000002U
+#define  FCOE_SHIFT          0
+#define  FCOE_MASK           0x00000001U
+
+#define TP_INGRESS_CONFIG 0x141
+#define  VNIC                0x00000800U
+#define  CSUM_HAS_PSEUDO_HDR 0x00000400U
+#define  RM_OVLAN            0x00000200U
+#define  LOOKUPEVERYPKT      0x00000100U
+
+#define TP_MIB_MAC_IN_ERR_0 0x0
+#define TP_MIB_TCP_OUT_RST 0xc
+#define TP_MIB_TCP_IN_SEG_HI 0x10
+#define TP_MIB_TCP_IN_SEG_LO 0x11
+#define TP_MIB_TCP_OUT_SEG_HI 0x12
+#define TP_MIB_TCP_OUT_SEG_LO 0x13
+#define TP_MIB_TCP_RXT_SEG_HI 0x14
+#define TP_MIB_TCP_RXT_SEG_LO 0x15
+#define TP_MIB_TNL_CNG_DROP_0 0x18
+#define TP_MIB_TCP_V6IN_ERR_0 0x28
+#define TP_MIB_TCP_V6OUT_RST 0x2c
+#define TP_MIB_OFD_ARP_DROP 0x36
+#define TP_MIB_TNL_DROP_0 0x44
+#define TP_MIB_OFD_VLN_DROP_0 0x58
+
+#define ULP_TX_INT_CAUSE 0x8dcc
+#define  PBL_BOUND_ERR_CH3 0x80000000U
+#define  PBL_BOUND_ERR_CH2 0x40000000U
+#define  PBL_BOUND_ERR_CH1 0x20000000U
+#define  PBL_BOUND_ERR_CH0 0x10000000U
+
+#define PM_RX_INT_CAUSE 0x8fdc
+#define  ZERO_E_CMD_ERROR     0x00400000U
+#define  PMRX_FRAMING_ERROR   0x003ffff0U
+#define  OCSPI_PAR_ERROR      0x00000008U
+#define  DB_OPTIONS_PAR_ERROR 0x00000004U
+#define  IESPI_PAR_ERROR      0x00000002U
+#define  E_PCMD_PAR_ERROR     0x00000001U
+
+#define PM_TX_INT_CAUSE 0x8ffc
+#define  PCMD_LEN_OVFL0     0x80000000U
+#define  PCMD_LEN_OVFL1     0x40000000U
+#define  PCMD_LEN_OVFL2     0x20000000U
+#define  ZERO_C_CMD_ERROR   0x10000000U
+#define  PMTX_FRAMING_ERROR 0x0ffffff0U
+#define  OESPI_PAR_ERROR    0x00000008U
+#define  ICSPI_PAR_ERROR    0x00000002U
+#define  C_PCMD_PAR_ERROR   0x00000001U
+
+#define MPS_PORT_STAT_TX_PORT_BYTES_L 0x400
+#define MPS_PORT_STAT_TX_PORT_BYTES_H 0x404
+#define MPS_PORT_STAT_TX_PORT_FRAMES_L 0x408
+#define MPS_PORT_STAT_TX_PORT_FRAMES_H 0x40c
+#define MPS_PORT_STAT_TX_PORT_BCAST_L 0x410
+#define MPS_PORT_STAT_TX_PORT_BCAST_H 0x414
+#define MPS_PORT_STAT_TX_PORT_MCAST_L 0x418
+#define MPS_PORT_STAT_TX_PORT_MCAST_H 0x41c
+#define MPS_PORT_STAT_TX_PORT_UCAST_L 0x420
+#define MPS_PORT_STAT_TX_PORT_UCAST_H 0x424
+#define MPS_PORT_STAT_TX_PORT_ERROR_L 0x428
+#define MPS_PORT_STAT_TX_PORT_ERROR_H 0x42c
+#define MPS_PORT_STAT_TX_PORT_64B_L 0x430
+#define MPS_PORT_STAT_TX_PORT_64B_H 0x434
+#define MPS_PORT_STAT_TX_PORT_65B_127B_L 0x438
+#define MPS_PORT_STAT_TX_PORT_65B_127B_H 0x43c
+#define MPS_PORT_STAT_TX_PORT_128B_255B_L 0x440
+#define MPS_PORT_STAT_TX_PORT_128B_255B_H 0x444
+#define MPS_PORT_STAT_TX_PORT_256B_511B_L 0x448
+#define MPS_PORT_STAT_TX_PORT_256B_511B_H 0x44c
+#define MPS_PORT_STAT_TX_PORT_512B_1023B_L 0x450
+#define MPS_PORT_STAT_TX_PORT_512B_1023B_H 0x454
+#define MPS_PORT_STAT_TX_PORT_1024B_1518B_L 0x458
+#define MPS_PORT_STAT_TX_PORT_1024B_1518B_H 0x45c
+#define MPS_PORT_STAT_TX_PORT_1519B_MAX_L 0x460
+#define MPS_PORT_STAT_TX_PORT_1519B_MAX_H 0x464
+#define MPS_PORT_STAT_TX_PORT_DROP_L 0x468
+#define MPS_PORT_STAT_TX_PORT_DROP_H 0x46c
+#define MPS_PORT_STAT_TX_PORT_PAUSE_L 0x470
+#define MPS_PORT_STAT_TX_PORT_PAUSE_H 0x474
+#define MPS_PORT_STAT_TX_PORT_PPP0_L 0x478
+#define MPS_PORT_STAT_TX_PORT_PPP0_H 0x47c
+#define MPS_PORT_STAT_TX_PORT_PPP1_L 0x480
+#define MPS_PORT_STAT_TX_PORT_PPP1_H 0x484
+#define MPS_PORT_STAT_TX_PORT_PPP2_L 0x488
+#define MPS_PORT_STAT_TX_PORT_PPP2_H 0x48c
+#define MPS_PORT_STAT_TX_PORT_PPP3_L 0x490
+#define MPS_PORT_STAT_TX_PORT_PPP3_H 0x494
+#define MPS_PORT_STAT_TX_PORT_PPP4_L 0x498
+#define MPS_PORT_STAT_TX_PORT_PPP4_H 0x49c
+#define MPS_PORT_STAT_TX_PORT_PPP5_L 0x4a0
+#define MPS_PORT_STAT_TX_PORT_PPP5_H 0x4a4
+#define MPS_PORT_STAT_TX_PORT_PPP6_L 0x4a8
+#define MPS_PORT_STAT_TX_PORT_PPP6_H 0x4ac
+#define MPS_PORT_STAT_TX_PORT_PPP7_L 0x4b0
+#define MPS_PORT_STAT_TX_PORT_PPP7_H 0x4b4
+#define MPS_PORT_STAT_LB_PORT_BYTES_L 0x4c0
+#define MPS_PORT_STAT_LB_PORT_BYTES_H 0x4c4
+#define MPS_PORT_STAT_LB_PORT_FRAMES_L 0x4c8
+#define MPS_PORT_STAT_LB_PORT_FRAMES_H 0x4cc
+#define MPS_PORT_STAT_LB_PORT_BCAST_L 0x4d0
+#define MPS_PORT_STAT_LB_PORT_BCAST_H 0x4d4
+#define MPS_PORT_STAT_LB_PORT_MCAST_L 0x4d8
+#define MPS_PORT_STAT_LB_PORT_MCAST_H 0x4dc
+#define MPS_PORT_STAT_LB_PORT_UCAST_L 0x4e0
+#define MPS_PORT_STAT_LB_PORT_UCAST_H 0x4e4
+#define MPS_PORT_STAT_LB_PORT_ERROR_L 0x4e8
+#define MPS_PORT_STAT_LB_PORT_ERROR_H 0x4ec
+#define MPS_PORT_STAT_LB_PORT_64B_L 0x4f0
+#define MPS_PORT_STAT_LB_PORT_64B_H 0x4f4
+#define MPS_PORT_STAT_LB_PORT_65B_127B_L 0x4f8
+#define MPS_PORT_STAT_LB_PORT_65B_127B_H 0x4fc
+#define MPS_PORT_STAT_LB_PORT_128B_255B_L 0x500
+#define MPS_PORT_STAT_LB_PORT_128B_255B_H 0x504
+#define MPS_PORT_STAT_LB_PORT_256B_511B_L 0x508
+#define MPS_PORT_STAT_LB_PORT_256B_511B_H 0x50c
+#define MPS_PORT_STAT_LB_PORT_512B_1023B_L 0x510
+#define MPS_PORT_STAT_LB_PORT_512B_1023B_H 0x514
+#define MPS_PORT_STAT_LB_PORT_1024B_1518B_L 0x518
+#define MPS_PORT_STAT_LB_PORT_1024B_1518B_H 0x51c
+#define MPS_PORT_STAT_LB_PORT_1519B_MAX_L 0x520
+#define MPS_PORT_STAT_LB_PORT_1519B_MAX_H 0x524
+#define MPS_PORT_STAT_LB_PORT_DROP_FRAMES 0x528
+#define MPS_PORT_STAT_RX_PORT_BYTES_L 0x540
+#define MPS_PORT_STAT_RX_PORT_BYTES_H 0x544
+#define MPS_PORT_STAT_RX_PORT_FRAMES_L 0x548
+#define MPS_PORT_STAT_RX_PORT_FRAMES_H 0x54c
+#define MPS_PORT_STAT_RX_PORT_BCAST_L 0x550
+#define MPS_PORT_STAT_RX_PORT_BCAST_H 0x554
+#define MPS_PORT_STAT_RX_PORT_MCAST_L 0x558
+#define MPS_PORT_STAT_RX_PORT_MCAST_H 0x55c
+#define MPS_PORT_STAT_RX_PORT_UCAST_L 0x560
+#define MPS_PORT_STAT_RX_PORT_UCAST_H 0x564
+#define MPS_PORT_STAT_RX_PORT_MTU_ERROR_L 0x568
+#define MPS_PORT_STAT_RX_PORT_MTU_ERROR_H 0x56c
+#define MPS_PORT_STAT_RX_PORT_MTU_CRC_ERROR_L 0x570
+#define MPS_PORT_STAT_RX_PORT_MTU_CRC_ERROR_H 0x574
+#define MPS_PORT_STAT_RX_PORT_CRC_ERROR_L 0x578
+#define MPS_PORT_STAT_RX_PORT_CRC_ERROR_H 0x57c
+#define MPS_PORT_STAT_RX_PORT_LEN_ERROR_L 0x580
+#define MPS_PORT_STAT_RX_PORT_LEN_ERROR_H 0x584
+#define MPS_PORT_STAT_RX_PORT_SYM_ERROR_L 0x588
+#define MPS_PORT_STAT_RX_PORT_SYM_ERROR_H 0x58c
+#define MPS_PORT_STAT_RX_PORT_64B_L 0x590
+#define MPS_PORT_STAT_RX_PORT_64B_H 0x594
+#define MPS_PORT_STAT_RX_PORT_65B_127B_L 0x598
+#define MPS_PORT_STAT_RX_PORT_65B_127B_H 0x59c
+#define MPS_PORT_STAT_RX_PORT_128B_255B_L 0x5a0
+#define MPS_PORT_STAT_RX_PORT_128B_255B_H 0x5a4
+#define MPS_PORT_STAT_RX_PORT_256B_511B_L 0x5a8
+#define MPS_PORT_STAT_RX_PORT_256B_511B_H 0x5ac
+#define MPS_PORT_STAT_RX_PORT_512B_1023B_L 0x5b0
+#define MPS_PORT_STAT_RX_PORT_512B_1023B_H 0x5b4
+#define MPS_PORT_STAT_RX_PORT_1024B_1518B_L 0x5b8
+#define MPS_PORT_STAT_RX_PORT_1024B_1518B_H 0x5bc
+#define MPS_PORT_STAT_RX_PORT_1519B_MAX_L 0x5c0
+#define MPS_PORT_STAT_RX_PORT_1519B_MAX_H 0x5c4
+#define MPS_PORT_STAT_RX_PORT_PAUSE_L 0x5c8
+#define MPS_PORT_STAT_RX_PORT_PAUSE_H 0x5cc
+#define MPS_PORT_STAT_RX_PORT_PPP0_L 0x5d0
+#define MPS_PORT_STAT_RX_PORT_PPP0_H 0x5d4
+#define MPS_PORT_STAT_RX_PORT_PPP1_L 0x5d8
+#define MPS_PORT_STAT_RX_PORT_PPP1_H 0x5dc
+#define MPS_PORT_STAT_RX_PORT_PPP2_L 0x5e0
+#define MPS_PORT_STAT_RX_PORT_PPP2_H 0x5e4
+#define MPS_PORT_STAT_RX_PORT_PPP3_L 0x5e8
+#define MPS_PORT_STAT_RX_PORT_PPP3_H 0x5ec
+#define MPS_PORT_STAT_RX_PORT_PPP4_L 0x5f0
+#define MPS_PORT_STAT_RX_PORT_PPP4_H 0x5f4
+#define MPS_PORT_STAT_RX_PORT_PPP5_L 0x5f8
+#define MPS_PORT_STAT_RX_PORT_PPP5_H 0x5fc
+#define MPS_PORT_STAT_RX_PORT_PPP6_L 0x600
+#define MPS_PORT_STAT_RX_PORT_PPP6_H 0x604
+#define MPS_PORT_STAT_RX_PORT_PPP7_L 0x608
+#define MPS_PORT_STAT_RX_PORT_PPP7_H 0x60c
+#define MPS_PORT_STAT_RX_PORT_LESS_64B_L 0x610
+#define MPS_PORT_STAT_RX_PORT_LESS_64B_H 0x614
+#define MAC_PORT_CFG2 0x818
+#define MAC_PORT_MAGIC_MACID_LO 0x824
+#define MAC_PORT_MAGIC_MACID_HI 0x828
+#define MAC_PORT_EPIO_DATA0 0x8c0
+#define MAC_PORT_EPIO_DATA1 0x8c4
+#define MAC_PORT_EPIO_DATA2 0x8c8
+#define MAC_PORT_EPIO_DATA3 0x8cc
+#define MAC_PORT_EPIO_OP 0x8d0
+
+#define MPS_CMN_CTL 0x9000
+#define  NUMPORTS_MASK   0x00000003U
+#define  NUMPORTS_SHIFT  0
+#define  NUMPORTS_GET(x) (((x) & NUMPORTS_MASK) >> NUMPORTS_SHIFT)
+
+#define MPS_INT_CAUSE 0x9008
+#define  STATINT 0x00000020U
+#define  TXINT   0x00000010U
+#define  RXINT   0x00000008U
+#define  TRCINT  0x00000004U
+#define  CLSINT  0x00000002U
+#define  PLINT   0x00000001U
+
+#define MPS_TX_INT_CAUSE 0x9408
+#define  PORTERR    0x00010000U
+#define  FRMERR     0x00008000U
+#define  SECNTERR   0x00004000U
+#define  BUBBLE     0x00002000U
+#define  TXDESCFIFO 0x00001e00U
+#define  TXDATAFIFO 0x000001e0U
+#define  NCSIFIFO   0x00000010U
+#define  TPFIFO     0x0000000fU
+
+#define MPS_STAT_PERR_INT_CAUSE_SRAM 0x9614
+#define MPS_STAT_PERR_INT_CAUSE_TX_FIFO 0x9620
+#define MPS_STAT_PERR_INT_CAUSE_RX_FIFO 0x962c
+
+#define MPS_STAT_RX_BG_0_MAC_DROP_FRAME_L 0x9640
+#define MPS_STAT_RX_BG_0_MAC_DROP_FRAME_H 0x9644
+#define MPS_STAT_RX_BG_1_MAC_DROP_FRAME_L 0x9648
+#define MPS_STAT_RX_BG_1_MAC_DROP_FRAME_H 0x964c
+#define MPS_STAT_RX_BG_2_MAC_DROP_FRAME_L 0x9650
+#define MPS_STAT_RX_BG_2_MAC_DROP_FRAME_H 0x9654
+#define MPS_STAT_RX_BG_3_MAC_DROP_FRAME_L 0x9658
+#define MPS_STAT_RX_BG_3_MAC_DROP_FRAME_H 0x965c
+#define MPS_STAT_RX_BG_0_LB_DROP_FRAME_L 0x9660
+#define MPS_STAT_RX_BG_0_LB_DROP_FRAME_H 0x9664
+#define MPS_STAT_RX_BG_1_LB_DROP_FRAME_L 0x9668
+#define MPS_STAT_RX_BG_1_LB_DROP_FRAME_H 0x966c
+#define MPS_STAT_RX_BG_2_LB_DROP_FRAME_L 0x9670
+#define MPS_STAT_RX_BG_2_LB_DROP_FRAME_H 0x9674
+#define MPS_STAT_RX_BG_3_LB_DROP_FRAME_L 0x9678
+#define MPS_STAT_RX_BG_3_LB_DROP_FRAME_H 0x967c
+#define MPS_STAT_RX_BG_0_MAC_TRUNC_FRAME_L 0x9680
+#define MPS_STAT_RX_BG_0_MAC_TRUNC_FRAME_H 0x9684
+#define MPS_STAT_RX_BG_1_MAC_TRUNC_FRAME_L 0x9688
+#define MPS_STAT_RX_BG_1_MAC_TRUNC_FRAME_H 0x968c
+#define MPS_STAT_RX_BG_2_MAC_TRUNC_FRAME_L 0x9690
+#define MPS_STAT_RX_BG_2_MAC_TRUNC_FRAME_H 0x9694
+#define MPS_STAT_RX_BG_3_MAC_TRUNC_FRAME_L 0x9698
+#define MPS_STAT_RX_BG_3_MAC_TRUNC_FRAME_H 0x969c
+#define MPS_STAT_RX_BG_0_LB_TRUNC_FRAME_L 0x96a0
+#define MPS_STAT_RX_BG_0_LB_TRUNC_FRAME_H 0x96a4
+#define MPS_STAT_RX_BG_1_LB_TRUNC_FRAME_L 0x96a8
+#define MPS_STAT_RX_BG_1_LB_TRUNC_FRAME_H 0x96ac
+#define MPS_STAT_RX_BG_2_LB_TRUNC_FRAME_L 0x96b0
+#define MPS_STAT_RX_BG_2_LB_TRUNC_FRAME_H 0x96b4
+#define MPS_STAT_RX_BG_3_LB_TRUNC_FRAME_L 0x96b8
+#define MPS_STAT_RX_BG_3_LB_TRUNC_FRAME_H 0x96bc
+#define MPS_TRC_CFG 0x9800
+#define  TRCFIFOEMPTY       0x00000010U
+#define  TRCIGNOREDROPINPUT 0x00000008U
+#define  TRCKEEPDUPLICATES  0x00000004U
+#define  TRCEN              0x00000002U
+#define  TRCMULTIFILTER     0x00000001U
+
+#define MPS_TRC_RSS_CONTROL 0x9808
+#define MPS_T5_TRC_RSS_CONTROL 0xa00c
+#define  RSSCONTROL_MASK    0x00ff0000U
+#define  RSSCONTROL_SHIFT   16
+#define  RSSCONTROL(x)      ((x) << RSSCONTROL_SHIFT)
+#define  QUEUENUMBER_MASK   0x0000ffffU
+#define  QUEUENUMBER_SHIFT  0
+#define  QUEUENUMBER(x)     ((x) << QUEUENUMBER_SHIFT)
+
+#define MPS_TRC_FILTER_MATCH_CTL_A 0x9810
+#define  TFINVERTMATCH   0x01000000U
+#define  TFPKTTOOLARGE   0x00800000U
+#define  TFEN            0x00400000U
+#define  TFPORT_MASK     0x003c0000U
+#define  TFPORT_SHIFT    18
+#define  TFPORT(x)       ((x) << TFPORT_SHIFT)
+#define  TFPORT_GET(x)   (((x) & TFPORT_MASK) >> TFPORT_SHIFT)
+#define  TFDROP          0x00020000U
+#define  TFSOPEOPERR     0x00010000U
+#define  TFLENGTH_MASK   0x00001f00U
+#define  TFLENGTH_SHIFT  8
+#define  TFLENGTH(x)     ((x) << TFLENGTH_SHIFT)
+#define  TFLENGTH_GET(x) (((x) & TFLENGTH_MASK) >> TFLENGTH_SHIFT)
+#define  TFOFFSET_MASK   0x0000001fU
+#define  TFOFFSET_SHIFT  0
+#define  TFOFFSET(x)     ((x) << TFOFFSET_SHIFT)
+#define  TFOFFSET_GET(x) (((x) & TFOFFSET_MASK) >> TFOFFSET_SHIFT)
+
+#define MPS_TRC_FILTER_MATCH_CTL_B 0x9820
+#define  TFMINPKTSIZE_MASK   0x01ff0000U
+#define  TFMINPKTSIZE_SHIFT  16
+#define  TFMINPKTSIZE(x)     ((x) << TFMINPKTSIZE_SHIFT)
+#define  TFMINPKTSIZE_GET(x) (((x) & TFMINPKTSIZE_MASK) >> TFMINPKTSIZE_SHIFT)
+#define  TFCAPTUREMAX_MASK   0x00003fffU
+#define  TFCAPTUREMAX_SHIFT  0
+#define  TFCAPTUREMAX(x)     ((x) << TFCAPTUREMAX_SHIFT)
+#define  TFCAPTUREMAX_GET(x) (((x) & TFCAPTUREMAX_MASK) >> TFCAPTUREMAX_SHIFT)
+
+#define MPS_TRC_INT_CAUSE 0x985c
+#define  MISCPERR 0x00000100U
+#define  PKTFIFO  0x000000f0U
+#define  FILTMEM  0x0000000fU
+
+#define MPS_TRC_FILTER0_MATCH 0x9c00
+#define MPS_TRC_FILTER0_DONT_CARE 0x9c80
+#define MPS_TRC_FILTER1_MATCH 0x9d00
+#define MPS_CLS_INT_CAUSE 0xd028
+#define  PLERRENB  0x00000008U
+#define  HASHSRAM  0x00000004U
+#define  MATCHTCAM 0x00000002U
+#define  MATCHSRAM 0x00000001U
+
+#define MPS_RX_PERR_INT_CAUSE 0x11074
+
+#define CPL_INTR_CAUSE 0x19054
+#define  CIM_OP_MAP_PERR   0x00000020U
+#define  CIM_OVFL_ERROR    0x00000010U
+#define  TP_FRAMING_ERROR  0x00000008U
+#define  SGE_FRAMING_ERROR 0x00000004U
+#define  CIM_FRAMING_ERROR 0x00000002U
+#define  ZERO_SWITCH_ERROR 0x00000001U
+
+#define SMB_INT_CAUSE 0x19090
+#define  MSTTXFIFOPARINT 0x00200000U
+#define  MSTRXFIFOPARINT 0x00100000U
+#define  SLVFIFOPARINT   0x00080000U
+
+#define ULP_RX_INT_CAUSE 0x19158
+#define ULP_RX_ISCSI_TAGMASK 0x19164
+#define ULP_RX_ISCSI_PSZ 0x19168
+#define  HPZ3_MASK   0x0f000000U
+#define  HPZ3_SHIFT  24
+#define  HPZ3(x)     ((x) << HPZ3_SHIFT)
+#define  HPZ2_MASK   0x000f0000U
+#define  HPZ2_SHIFT  16
+#define  HPZ2(x)     ((x) << HPZ2_SHIFT)
+#define  HPZ1_MASK   0x00000f00U
+#define  HPZ1_SHIFT  8
+#define  HPZ1(x)     ((x) << HPZ1_SHIFT)
+#define  HPZ0_MASK   0x0000000fU
+#define  HPZ0_SHIFT  0
+#define  HPZ0(x)     ((x) << HPZ0_SHIFT)
+
+#define ULP_RX_TDDP_PSZ 0x19178
+
+#define SF_DATA 0x193f8
+#define SF_OP 0x193fc
+#define  SF_BUSY       0x80000000U
+#define  SF_LOCK       0x00000010U
+#define  SF_CONT       0x00000008U
+#define  BYTECNT_MASK  0x00000006U
+#define  BYTECNT_SHIFT 1
+#define  BYTECNT(x)    ((x) << BYTECNT_SHIFT)
+#define  OP_WR         0x00000001U
+
+#define PL_PF_INT_CAUSE 0x3c0
+#define  PFSW  0x00000008U
+#define  PFSGE 0x00000004U
+#define  PFCIM 0x00000002U
+#define  PFMPS 0x00000001U
+
+#define PL_PF_INT_ENABLE 0x3c4
+#define PL_PF_CTL 0x3c8
+#define  SWINT 0x00000001U
+
+#define PL_WHOAMI 0x19400
+#define  SOURCEPF_MASK   0x00000700U
+#define  SOURCEPF_SHIFT  8
+#define  SOURCEPF(x)     ((x) << SOURCEPF_SHIFT)
+#define  SOURCEPF_GET(x) (((x) & SOURCEPF_MASK) >> SOURCEPF_SHIFT)
+#define  ISVF            0x00000080U
+#define  VFID_MASK       0x0000007fU
+#define  VFID_SHIFT      0
+#define  VFID(x)         ((x) << VFID_SHIFT)
+#define  VFID_GET(x)     (((x) & VFID_MASK) >> VFID_SHIFT)
+
+#define PL_INT_CAUSE 0x1940c
+#define  ULP_TX     0x08000000U
+#define  SGE        0x04000000U
+#define  HMA        0x02000000U
+#define  CPL_SWITCH 0x01000000U
+#define  ULP_RX     0x00800000U
+#define  PM_RX      0x00400000U
+#define  PM_TX      0x00200000U
+#define  MA         0x00100000U
+#define  TP         0x00080000U
+#define  LE         0x00040000U
+#define  EDC1       0x00020000U
+#define  EDC0       0x00010000U
+#define  MC         0x00008000U
+#define  PCIE       0x00004000U
+#define  PMU        0x00002000U
+#define  XGMAC_KR1  0x00001000U
+#define  XGMAC_KR0  0x00000800U
+#define  XGMAC1     0x00000400U
+#define  XGMAC0     0x00000200U
+#define  SMB        0x00000100U
+#define  SF         0x00000080U
+#define  PL         0x00000040U
+#define  NCSI       0x00000020U
+#define  MPS        0x00000010U
+#define  MI         0x00000008U
+#define  DBG        0x00000004U
+#define  I2CM       0x00000002U
+#define  CIM        0x00000001U
+
+#define MC1 0x31
+#define PL_INT_ENABLE 0x19410
+#define PL_INT_MAP0 0x19414
+#define PL_RST 0x19428
+#define  PIORST     0x00000002U
+#define  PIORSTMODE 0x00000001U
+
+#define PL_PL_INT_CAUSE 0x19430
+#define  FATALPERR 0x00000010U
+#define  PERRVFID  0x00000001U
+
+#define PL_REV 0x1943c
+
+#define S_REV    0
+#define M_REV    0xfU
+#define V_REV(x) ((x) << S_REV)
+#define G_REV(x) (((x) >> S_REV) & M_REV)
+
+#define LE_DB_CONFIG 0x19c04
+#define  HASHEN 0x00100000U
+
+#define LE_DB_SERVER_INDEX 0x19c18
+#define LE_DB_ACT_CNT_IPV4 0x19c20
+#define LE_DB_ACT_CNT_IPV6 0x19c24
+
+#define LE_DB_INT_CAUSE 0x19c3c
+#define  REQQPARERR 0x00010000U
+#define  UNKNOWNCMD 0x00008000U
+#define  PARITYERR  0x00000040U
+#define  LIPMISS    0x00000020U
+#define  LIP0       0x00000010U
+
+#define LE_DB_TID_HASHBASE 0x19df8
+
+#define NCSI_INT_CAUSE 0x1a0d8
+#define  CIM_DM_PRTY_ERR 0x00000100U
+#define  MPS_DM_PRTY_ERR 0x00000080U
+#define  TXFIFO_PRTY_ERR 0x00000002U
+#define  RXFIFO_PRTY_ERR 0x00000001U
+
+#define XGMAC_PORT_CFG2 0x1018
+#define  PATEN   0x00040000U
+#define  MAGICEN 0x00020000U
+
+#define XGMAC_PORT_MAGIC_MACID_LO 0x1024
+#define XGMAC_PORT_MAGIC_MACID_HI 0x1028
+
+#define XGMAC_PORT_EPIO_DATA0 0x10c0
+#define XGMAC_PORT_EPIO_DATA1 0x10c4
+#define XGMAC_PORT_EPIO_DATA2 0x10c8
+#define XGMAC_PORT_EPIO_DATA3 0x10cc
+#define XGMAC_PORT_EPIO_OP 0x10d0
+#define  EPIOWR         0x00000100U
+#define  ADDRESS_MASK   0x000000ffU
+#define  ADDRESS_SHIFT  0
+#define  ADDRESS(x)     ((x) << ADDRESS_SHIFT)
+
+#define MAC_PORT_INT_CAUSE 0x8dc
+#define XGMAC_PORT_INT_CAUSE 0x10dc
+
+#define A_TP_TX_MOD_QUEUE_REQ_MAP 0x7e28
+
+#define A_TP_TX_MOD_CHANNEL_WEIGHT 0x7e34
+
+#define S_TX_MOD_QUEUE_REQ_MAP    0
+#define M_TX_MOD_QUEUE_REQ_MAP    0xffffU
+#define V_TX_MOD_QUEUE_REQ_MAP(x) ((x) << S_TX_MOD_QUEUE_REQ_MAP)
+
+#define A_TP_TX_MOD_QUEUE_WEIGHT0 0x7e30
+
+#define S_TX_MODQ_WEIGHT3    24
+#define M_TX_MODQ_WEIGHT3    0xffU
+#define V_TX_MODQ_WEIGHT3(x) ((x) << S_TX_MODQ_WEIGHT3)
+
+#define S_TX_MODQ_WEIGHT2    16
+#define M_TX_MODQ_WEIGHT2    0xffU
+#define V_TX_MODQ_WEIGHT2(x) ((x) << S_TX_MODQ_WEIGHT2)
+
+#define S_TX_MODQ_WEIGHT1    8
+#define M_TX_MODQ_WEIGHT1    0xffU
+#define V_TX_MODQ_WEIGHT1(x) ((x) << S_TX_MODQ_WEIGHT1)
+
+#define S_TX_MODQ_WEIGHT0    0
+#define M_TX_MODQ_WEIGHT0    0xffU
+#define V_TX_MODQ_WEIGHT0(x) ((x) << S_TX_MODQ_WEIGHT0)
+
+#define A_TP_TX_SCHED_HDR 0x23
+
+#define A_TP_TX_SCHED_FIFO 0x24
+
+#define A_TP_TX_SCHED_PCMD 0x25
+
+#define S_VNIC    11
+#define V_VNIC(x) ((x) << S_VNIC)
+#define F_VNIC    V_VNIC(1U)
+
+#define S_FRAGMENTATION    9
+#define V_FRAGMENTATION(x) ((x) << S_FRAGMENTATION)
+#define F_FRAGMENTATION    V_FRAGMENTATION(1U)
+
+#define S_MPSHITTYPE    8
+#define V_MPSHITTYPE(x) ((x) << S_MPSHITTYPE)
+#define F_MPSHITTYPE    V_MPSHITTYPE(1U)
+
+#define S_MACMATCH    7
+#define V_MACMATCH(x) ((x) << S_MACMATCH)
+#define F_MACMATCH    V_MACMATCH(1U)
+
+#define S_ETHERTYPE    6
+#define V_ETHERTYPE(x) ((x) << S_ETHERTYPE)
+#define F_ETHERTYPE    V_ETHERTYPE(1U)
+
+#define S_PROTOCOL    5
+#define V_PROTOCOL(x) ((x) << S_PROTOCOL)
+#define F_PROTOCOL    V_PROTOCOL(1U)
+
+#define S_TOS    4
+#define V_TOS(x) ((x) << S_TOS)
+#define F_TOS    V_TOS(1U)
+
+#define S_VLAN    3
+#define V_VLAN(x) ((x) << S_VLAN)
+#define F_VLAN    V_VLAN(1U)
+
+#define S_VNIC_ID    2
+#define V_VNIC_ID(x) ((x) << S_VNIC_ID)
+#define F_VNIC_ID    V_VNIC_ID(1U)
+
+#define S_PORT    1
+#define V_PORT(x) ((x) << S_PORT)
+#define F_PORT    V_PORT(1U)
+
+#define S_FCOE    0
+#define V_FCOE(x) ((x) << S_FCOE)
+#define F_FCOE    V_FCOE(1U)
+
+#define NUM_MPS_CLS_SRAM_L_INSTANCES 336
+#define NUM_MPS_T5_CLS_SRAM_L_INSTANCES 512
+
+#define T5_PORT0_BASE 0x30000
+#define T5_PORT_STRIDE 0x4000
+#define T5_PORT_BASE(idx) (T5_PORT0_BASE + (idx) * T5_PORT_STRIDE)
+#define T5_PORT_REG(idx, reg) (T5_PORT_BASE(idx) + (reg))
+
+#define MC_0_BASE_ADDR 0x40000
+#define MC_1_BASE_ADDR 0x48000
+#define MC_STRIDE (MC_1_BASE_ADDR - MC_0_BASE_ADDR)
+#define MC_REG(reg, idx) (reg + MC_STRIDE * idx)
+
+#define MC_P_BIST_CMD 0x41400
+#define MC_P_BIST_CMD_ADDR 0x41404
+#define MC_P_BIST_CMD_LEN 0x41408
+#define MC_P_BIST_DATA_PATTERN 0x4140c
+#define MC_P_BIST_STATUS_RDATA 0x41488
+#define EDC_T50_BASE_ADDR 0x50000
+#define EDC_H_BIST_CMD 0x50004
+#define EDC_H_BIST_CMD_ADDR 0x50008
+#define EDC_H_BIST_CMD_LEN 0x5000c
+#define EDC_H_BIST_DATA_PATTERN 0x50010
+#define EDC_H_BIST_STATUS_RDATA 0x50028
+
+#define EDC_T51_BASE_ADDR 0x50800
+#define EDC_STRIDE_T5 (EDC_T51_BASE_ADDR - EDC_T50_BASE_ADDR)
+#define EDC_REG_T5(reg, idx) (reg + EDC_STRIDE_T5 * idx)
+
+#define A_PL_VF_REV 0x4
+#define A_PL_VF_WHOAMI 0x0
+#define A_PL_VF_REVISION 0x8
+
+#define S_CHIPID    4
+#define M_CHIPID    0xfU
+#define V_CHIPID(x) ((x) << S_CHIPID)
+#define G_CHIPID(x) (((x) >> S_CHIPID) & M_CHIPID)
+
+/* TP_VLAN_PRI_MAP controls which subset of fields will be present in the
+ * Compressed Filter Tuple for LE filters.  Each bit set in TP_VLAN_PRI_MAP
+ * selects for a particular field being present.  These fields, when present
+ * in the Compressed Filter Tuple, have the following widths in bits.
+ */
+#define W_FT_FCOE                       1
+#define W_FT_PORT                       3
+#define W_FT_VNIC_ID                    17
+#define W_FT_VLAN                       17
+#define W_FT_TOS                        8
+#define W_FT_PROTOCOL                   8
+#define W_FT_ETHERTYPE                  16
+#define W_FT_MACMATCH                   9
+#define W_FT_MPSHITTYPE                 3
+#define W_FT_FRAGMENTATION              1
+
+/* Some of the Compressed Filter Tuple fields have internal structure.  These
+ * bit shifts/masks describe those structures.  All shifts are relative to the
+ * base position of the fields within the Compressed Filter Tuple
+ */
+#define S_FT_VLAN_VLD                   16
+#define V_FT_VLAN_VLD(x)                ((x) << S_FT_VLAN_VLD)
+#define F_FT_VLAN_VLD                   V_FT_VLAN_VLD(1U)
+
+#define S_FT_VNID_ID_VF                 0
+#define V_FT_VNID_ID_VF(x)              ((x) << S_FT_VNID_ID_VF)
+
+#define S_FT_VNID_ID_PF                 7
+#define V_FT_VNID_ID_PF(x)              ((x) << S_FT_VNID_ID_PF)
+
+#define S_FT_VNID_ID_VLD                16
+#define V_FT_VNID_ID_VLD(x)             ((x) << S_FT_VNID_ID_VLD)
+
+#endif /* __T4_REGS_H */
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h b/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
new file mode 100644
index 0000000..3409756
--- /dev/null
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
@@ -0,0 +1,2282 @@
+/*
+ * This file is part of the Chelsio T4 Ethernet driver for Linux.
+ *
+ * Copyright (c) 2009-2014 Chelsio Communications, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _T4FW_INTERFACE_H_
+#define _T4FW_INTERFACE_H_
+
+enum fw_retval {
+	FW_SUCCESS		= 0,	/* completed sucessfully */
+	FW_EPERM		= 1,	/* operation not permitted */
+	FW_ENOENT		= 2,	/* no such file or directory */
+	FW_EIO			= 5,	/* input/output error; hw bad */
+	FW_ENOEXEC		= 8,	/* exec format error; inv microcode */
+	FW_EAGAIN		= 11,	/* try again */
+	FW_ENOMEM		= 12,	/* out of memory */
+	FW_EFAULT		= 14,	/* bad address; fw bad */
+	FW_EBUSY		= 16,	/* resource busy */
+	FW_EEXIST		= 17,	/* file exists */
+	FW_ENODEV		= 19,	/* no such device */
+	FW_EINVAL		= 22,	/* invalid argument */
+	FW_ENOSPC		= 28,	/* no space left on device */
+	FW_ENOSYS		= 38,	/* functionality not implemented */
+	FW_ENODATA		= 61,	/* no data available */
+	FW_EPROTO		= 71,	/* protocol error */
+	FW_EADDRINUSE		= 98,	/* address already in use */
+	FW_EADDRNOTAVAIL	= 99,	/* cannot assigned requested address */
+	FW_ENETDOWN		= 100,	/* network is down */
+	FW_ENETUNREACH		= 101,	/* network is unreachable */
+	FW_ENOBUFS		= 105,	/* no buffer space available */
+	FW_ETIMEDOUT		= 110,	/* timeout */
+	FW_EINPROGRESS		= 115,	/* fw internal */
+	FW_SCSI_ABORT_REQUESTED	= 128,	/* */
+	FW_SCSI_ABORT_TIMEDOUT	= 129,	/* */
+	FW_SCSI_ABORTED		= 130,	/* */
+	FW_SCSI_CLOSE_REQUESTED	= 131,	/* */
+	FW_ERR_LINK_DOWN	= 132,	/* */
+	FW_RDEV_NOT_READY	= 133,	/* */
+	FW_ERR_RDEV_LOST	= 134,	/* */
+	FW_ERR_RDEV_LOGO	= 135,	/* */
+	FW_FCOE_NO_XCHG		= 136,	/* */
+	FW_SCSI_RSP_ERR		= 137,	/* */
+	FW_ERR_RDEV_IMPL_LOGO	= 138,	/* */
+	FW_SCSI_UNDER_FLOW_ERR  = 139,	/* */
+	FW_SCSI_OVER_FLOW_ERR   = 140,	/* */
+	FW_SCSI_DDP_ERR		= 141,	/* DDP error*/
+	FW_SCSI_TASK_ERR	= 142,	/* No SCSI tasks available */
+};
+
+#define FW_T4VF_SGE_BASE_ADDR      0x0000
+#define FW_T4VF_MPS_BASE_ADDR      0x0100
+#define FW_T4VF_PL_BASE_ADDR       0x0200
+#define FW_T4VF_MBDATA_BASE_ADDR   0x0240
+#define FW_T4VF_CIM_BASE_ADDR      0x0300
+
+enum fw_wr_opcodes {
+	FW_FILTER_WR                   = 0x02,
+	FW_ULPTX_WR                    = 0x04,
+	FW_TP_WR                       = 0x05,
+	FW_ETH_TX_PKT_WR               = 0x08,
+	FW_OFLD_CONNECTION_WR          = 0x2f,
+	FW_FLOWC_WR                    = 0x0a,
+	FW_OFLD_TX_DATA_WR             = 0x0b,
+	FW_CMD_WR                      = 0x10,
+	FW_ETH_TX_PKT_VM_WR            = 0x11,
+	FW_RI_RES_WR                   = 0x0c,
+	FW_RI_INIT_WR                  = 0x0d,
+	FW_RI_RDMA_WRITE_WR            = 0x14,
+	FW_RI_SEND_WR                  = 0x15,
+	FW_RI_RDMA_READ_WR             = 0x16,
+	FW_RI_RECV_WR                  = 0x17,
+	FW_RI_BIND_MW_WR               = 0x18,
+	FW_RI_FR_NSMR_WR               = 0x19,
+	FW_RI_INV_LSTAG_WR             = 0x1a,
+	FW_LASTC2E_WR                  = 0x40
+};
+
+struct fw_wr_hdr {
+	__be32 hi;
+	__be32 lo;
+};
+
+#define FW_WR_OP(x)	 ((x) << 24)
+#define FW_WR_OP_GET(x)	 (((x) >> 24) & 0xff)
+#define FW_WR_ATOMIC(x)	 ((x) << 23)
+#define FW_WR_FLUSH(x)   ((x) << 22)
+#define FW_WR_COMPL(x)   ((x) << 21)
+#define FW_WR_IMMDLEN_MASK 0xff
+#define FW_WR_IMMDLEN(x) ((x) << 0)
+
+#define FW_WR_EQUIQ	(1U << 31)
+#define FW_WR_EQUEQ	(1U << 30)
+#define FW_WR_FLOWID(x)	((x) << 8)
+#define FW_WR_LEN16(x)	((x) << 0)
+
+#define HW_TPL_FR_MT_PR_IV_P_FC         0X32B
+#define HW_TPL_FR_MT_PR_OV_P_FC         0X327
+
+/* filter wr reply code in cookie in CPL_SET_TCB_RPL */
+enum fw_filter_wr_cookie {
+	FW_FILTER_WR_SUCCESS,
+	FW_FILTER_WR_FLT_ADDED,
+	FW_FILTER_WR_FLT_DELETED,
+	FW_FILTER_WR_SMT_TBL_FULL,
+	FW_FILTER_WR_EINVAL,
+};
+
+struct fw_filter_wr {
+	__be32 op_pkd;
+	__be32 len16_pkd;
+	__be64 r3;
+	__be32 tid_to_iq;
+	__be32 del_filter_to_l2tix;
+	__be16 ethtype;
+	__be16 ethtypem;
+	__u8   frag_to_ovlan_vldm;
+	__u8   smac_sel;
+	__be16 rx_chan_rx_rpl_iq;
+	__be32 maci_to_matchtypem;
+	__u8   ptcl;
+	__u8   ptclm;
+	__u8   ttyp;
+	__u8   ttypm;
+	__be16 ivlan;
+	__be16 ivlanm;
+	__be16 ovlan;
+	__be16 ovlanm;
+	__u8   lip[16];
+	__u8   lipm[16];
+	__u8   fip[16];
+	__u8   fipm[16];
+	__be16 lp;
+	__be16 lpm;
+	__be16 fp;
+	__be16 fpm;
+	__be16 r7;
+	__u8   sma[6];
+};
+
+#define S_FW_FILTER_WR_TID      12
+#define M_FW_FILTER_WR_TID      0xfffff
+#define V_FW_FILTER_WR_TID(x)   ((x) << S_FW_FILTER_WR_TID)
+#define G_FW_FILTER_WR_TID(x)   \
+	(((x) >> S_FW_FILTER_WR_TID) & M_FW_FILTER_WR_TID)
+
+#define S_FW_FILTER_WR_RQTYPE           11
+#define M_FW_FILTER_WR_RQTYPE           0x1
+#define V_FW_FILTER_WR_RQTYPE(x)        ((x) << S_FW_FILTER_WR_RQTYPE)
+#define G_FW_FILTER_WR_RQTYPE(x)        \
+	(((x) >> S_FW_FILTER_WR_RQTYPE) & M_FW_FILTER_WR_RQTYPE)
+#define F_FW_FILTER_WR_RQTYPE   V_FW_FILTER_WR_RQTYPE(1U)
+
+#define S_FW_FILTER_WR_NOREPLY          10
+#define M_FW_FILTER_WR_NOREPLY          0x1
+#define V_FW_FILTER_WR_NOREPLY(x)       ((x) << S_FW_FILTER_WR_NOREPLY)
+#define G_FW_FILTER_WR_NOREPLY(x)       \
+	(((x) >> S_FW_FILTER_WR_NOREPLY) & M_FW_FILTER_WR_NOREPLY)
+#define F_FW_FILTER_WR_NOREPLY  V_FW_FILTER_WR_NOREPLY(1U)
+
+#define S_FW_FILTER_WR_IQ       0
+#define M_FW_FILTER_WR_IQ       0x3ff
+#define V_FW_FILTER_WR_IQ(x)    ((x) << S_FW_FILTER_WR_IQ)
+#define G_FW_FILTER_WR_IQ(x)    \
+	(((x) >> S_FW_FILTER_WR_IQ) & M_FW_FILTER_WR_IQ)
+
+#define S_FW_FILTER_WR_DEL_FILTER       31
+#define M_FW_FILTER_WR_DEL_FILTER       0x1
+#define V_FW_FILTER_WR_DEL_FILTER(x)    ((x) << S_FW_FILTER_WR_DEL_FILTER)
+#define G_FW_FILTER_WR_DEL_FILTER(x)    \
+	(((x) >> S_FW_FILTER_WR_DEL_FILTER) & M_FW_FILTER_WR_DEL_FILTER)
+#define F_FW_FILTER_WR_DEL_FILTER       V_FW_FILTER_WR_DEL_FILTER(1U)
+
+#define S_FW_FILTER_WR_RPTTID           25
+#define M_FW_FILTER_WR_RPTTID           0x1
+#define V_FW_FILTER_WR_RPTTID(x)        ((x) << S_FW_FILTER_WR_RPTTID)
+#define G_FW_FILTER_WR_RPTTID(x)        \
+	(((x) >> S_FW_FILTER_WR_RPTTID) & M_FW_FILTER_WR_RPTTID)
+#define F_FW_FILTER_WR_RPTTID   V_FW_FILTER_WR_RPTTID(1U)
+
+#define S_FW_FILTER_WR_DROP     24
+#define M_FW_FILTER_WR_DROP     0x1
+#define V_FW_FILTER_WR_DROP(x)  ((x) << S_FW_FILTER_WR_DROP)
+#define G_FW_FILTER_WR_DROP(x)  \
+	(((x) >> S_FW_FILTER_WR_DROP) & M_FW_FILTER_WR_DROP)
+#define F_FW_FILTER_WR_DROP     V_FW_FILTER_WR_DROP(1U)
+
+#define S_FW_FILTER_WR_DIRSTEER         23
+#define M_FW_FILTER_WR_DIRSTEER         0x1
+#define V_FW_FILTER_WR_DIRSTEER(x)      ((x) << S_FW_FILTER_WR_DIRSTEER)
+#define G_FW_FILTER_WR_DIRSTEER(x)      \
+	(((x) >> S_FW_FILTER_WR_DIRSTEER) & M_FW_FILTER_WR_DIRSTEER)
+#define F_FW_FILTER_WR_DIRSTEER V_FW_FILTER_WR_DIRSTEER(1U)
+
+#define S_FW_FILTER_WR_MASKHASH         22
+#define M_FW_FILTER_WR_MASKHASH         0x1
+#define V_FW_FILTER_WR_MASKHASH(x)      ((x) << S_FW_FILTER_WR_MASKHASH)
+#define G_FW_FILTER_WR_MASKHASH(x)      \
+	(((x) >> S_FW_FILTER_WR_MASKHASH) & M_FW_FILTER_WR_MASKHASH)
+#define F_FW_FILTER_WR_MASKHASH V_FW_FILTER_WR_MASKHASH(1U)
+
+#define S_FW_FILTER_WR_DIRSTEERHASH     21
+#define M_FW_FILTER_WR_DIRSTEERHASH     0x1
+#define V_FW_FILTER_WR_DIRSTEERHASH(x)  ((x) << S_FW_FILTER_WR_DIRSTEERHASH)
+#define G_FW_FILTER_WR_DIRSTEERHASH(x)  \
+	(((x) >> S_FW_FILTER_WR_DIRSTEERHASH) & M_FW_FILTER_WR_DIRSTEERHASH)
+#define F_FW_FILTER_WR_DIRSTEERHASH     V_FW_FILTER_WR_DIRSTEERHASH(1U)
+
+#define S_FW_FILTER_WR_LPBK     20
+#define M_FW_FILTER_WR_LPBK     0x1
+#define V_FW_FILTER_WR_LPBK(x)  ((x) << S_FW_FILTER_WR_LPBK)
+#define G_FW_FILTER_WR_LPBK(x)  \
+	(((x) >> S_FW_FILTER_WR_LPBK) & M_FW_FILTER_WR_LPBK)
+#define F_FW_FILTER_WR_LPBK     V_FW_FILTER_WR_LPBK(1U)
+
+#define S_FW_FILTER_WR_DMAC     19
+#define M_FW_FILTER_WR_DMAC     0x1
+#define V_FW_FILTER_WR_DMAC(x)  ((x) << S_FW_FILTER_WR_DMAC)
+#define G_FW_FILTER_WR_DMAC(x)  \
+	(((x) >> S_FW_FILTER_WR_DMAC) & M_FW_FILTER_WR_DMAC)
+#define F_FW_FILTER_WR_DMAC     V_FW_FILTER_WR_DMAC(1U)
+
+#define S_FW_FILTER_WR_SMAC     18
+#define M_FW_FILTER_WR_SMAC     0x1
+#define V_FW_FILTER_WR_SMAC(x)  ((x) << S_FW_FILTER_WR_SMAC)
+#define G_FW_FILTER_WR_SMAC(x)  \
+	(((x) >> S_FW_FILTER_WR_SMAC) & M_FW_FILTER_WR_SMAC)
+#define F_FW_FILTER_WR_SMAC     V_FW_FILTER_WR_SMAC(1U)
+
+#define S_FW_FILTER_WR_INSVLAN          17
+#define M_FW_FILTER_WR_INSVLAN          0x1
+#define V_FW_FILTER_WR_INSVLAN(x)       ((x) << S_FW_FILTER_WR_INSVLAN)
+#define G_FW_FILTER_WR_INSVLAN(x)       \
+	(((x) >> S_FW_FILTER_WR_INSVLAN) & M_FW_FILTER_WR_INSVLAN)
+#define F_FW_FILTER_WR_INSVLAN  V_FW_FILTER_WR_INSVLAN(1U)
+
+#define S_FW_FILTER_WR_RMVLAN           16
+#define M_FW_FILTER_WR_RMVLAN           0x1
+#define V_FW_FILTER_WR_RMVLAN(x)        ((x) << S_FW_FILTER_WR_RMVLAN)
+#define G_FW_FILTER_WR_RMVLAN(x)        \
+	(((x) >> S_FW_FILTER_WR_RMVLAN) & M_FW_FILTER_WR_RMVLAN)
+#define F_FW_FILTER_WR_RMVLAN   V_FW_FILTER_WR_RMVLAN(1U)
+
+#define S_FW_FILTER_WR_HITCNTS          15
+#define M_FW_FILTER_WR_HITCNTS          0x1
+#define V_FW_FILTER_WR_HITCNTS(x)       ((x) << S_FW_FILTER_WR_HITCNTS)
+#define G_FW_FILTER_WR_HITCNTS(x)       \
+	(((x) >> S_FW_FILTER_WR_HITCNTS) & M_FW_FILTER_WR_HITCNTS)
+#define F_FW_FILTER_WR_HITCNTS  V_FW_FILTER_WR_HITCNTS(1U)
+
+#define S_FW_FILTER_WR_TXCHAN           13
+#define M_FW_FILTER_WR_TXCHAN           0x3
+#define V_FW_FILTER_WR_TXCHAN(x)        ((x) << S_FW_FILTER_WR_TXCHAN)
+#define G_FW_FILTER_WR_TXCHAN(x)        \
+	(((x) >> S_FW_FILTER_WR_TXCHAN) & M_FW_FILTER_WR_TXCHAN)
+
+#define S_FW_FILTER_WR_PRIO     12
+#define M_FW_FILTER_WR_PRIO     0x1
+#define V_FW_FILTER_WR_PRIO(x)  ((x) << S_FW_FILTER_WR_PRIO)
+#define G_FW_FILTER_WR_PRIO(x)  \
+	(((x) >> S_FW_FILTER_WR_PRIO) & M_FW_FILTER_WR_PRIO)
+#define F_FW_FILTER_WR_PRIO     V_FW_FILTER_WR_PRIO(1U)
+
+#define S_FW_FILTER_WR_L2TIX    0
+#define M_FW_FILTER_WR_L2TIX    0xfff
+#define V_FW_FILTER_WR_L2TIX(x) ((x) << S_FW_FILTER_WR_L2TIX)
+#define G_FW_FILTER_WR_L2TIX(x) \
+	(((x) >> S_FW_FILTER_WR_L2TIX) & M_FW_FILTER_WR_L2TIX)
+
+#define S_FW_FILTER_WR_FRAG     7
+#define M_FW_FILTER_WR_FRAG     0x1
+#define V_FW_FILTER_WR_FRAG(x)  ((x) << S_FW_FILTER_WR_FRAG)
+#define G_FW_FILTER_WR_FRAG(x)  \
+	(((x) >> S_FW_FILTER_WR_FRAG) & M_FW_FILTER_WR_FRAG)
+#define F_FW_FILTER_WR_FRAG     V_FW_FILTER_WR_FRAG(1U)
+
+#define S_FW_FILTER_WR_FRAGM    6
+#define M_FW_FILTER_WR_FRAGM    0x1
+#define V_FW_FILTER_WR_FRAGM(x) ((x) << S_FW_FILTER_WR_FRAGM)
+#define G_FW_FILTER_WR_FRAGM(x) \
+	(((x) >> S_FW_FILTER_WR_FRAGM) & M_FW_FILTER_WR_FRAGM)
+#define F_FW_FILTER_WR_FRAGM    V_FW_FILTER_WR_FRAGM(1U)
+
+#define S_FW_FILTER_WR_IVLAN_VLD        5
+#define M_FW_FILTER_WR_IVLAN_VLD        0x1
+#define V_FW_FILTER_WR_IVLAN_VLD(x)     ((x) << S_FW_FILTER_WR_IVLAN_VLD)
+#define G_FW_FILTER_WR_IVLAN_VLD(x)     \
+	(((x) >> S_FW_FILTER_WR_IVLAN_VLD) & M_FW_FILTER_WR_IVLAN_VLD)
+#define F_FW_FILTER_WR_IVLAN_VLD        V_FW_FILTER_WR_IVLAN_VLD(1U)
+
+#define S_FW_FILTER_WR_OVLAN_VLD        4
+#define M_FW_FILTER_WR_OVLAN_VLD        0x1
+#define V_FW_FILTER_WR_OVLAN_VLD(x)     ((x) << S_FW_FILTER_WR_OVLAN_VLD)
+#define G_FW_FILTER_WR_OVLAN_VLD(x)     \
+	(((x) >> S_FW_FILTER_WR_OVLAN_VLD) & M_FW_FILTER_WR_OVLAN_VLD)
+#define F_FW_FILTER_WR_OVLAN_VLD        V_FW_FILTER_WR_OVLAN_VLD(1U)
+
+#define S_FW_FILTER_WR_IVLAN_VLDM       3
+#define M_FW_FILTER_WR_IVLAN_VLDM       0x1
+#define V_FW_FILTER_WR_IVLAN_VLDM(x)    ((x) << S_FW_FILTER_WR_IVLAN_VLDM)
+#define G_FW_FILTER_WR_IVLAN_VLDM(x)    \
+	(((x) >> S_FW_FILTER_WR_IVLAN_VLDM) & M_FW_FILTER_WR_IVLAN_VLDM)
+#define F_FW_FILTER_WR_IVLAN_VLDM       V_FW_FILTER_WR_IVLAN_VLDM(1U)
+
+#define S_FW_FILTER_WR_OVLAN_VLDM       2
+#define M_FW_FILTER_WR_OVLAN_VLDM       0x1
+#define V_FW_FILTER_WR_OVLAN_VLDM(x)    ((x) << S_FW_FILTER_WR_OVLAN_VLDM)
+#define G_FW_FILTER_WR_OVLAN_VLDM(x)    \
+	(((x) >> S_FW_FILTER_WR_OVLAN_VLDM) & M_FW_FILTER_WR_OVLAN_VLDM)
+#define F_FW_FILTER_WR_OVLAN_VLDM       V_FW_FILTER_WR_OVLAN_VLDM(1U)
+
+#define S_FW_FILTER_WR_RX_CHAN          15
+#define M_FW_FILTER_WR_RX_CHAN          0x1
+#define V_FW_FILTER_WR_RX_CHAN(x)       ((x) << S_FW_FILTER_WR_RX_CHAN)
+#define G_FW_FILTER_WR_RX_CHAN(x)       \
+	(((x) >> S_FW_FILTER_WR_RX_CHAN) & M_FW_FILTER_WR_RX_CHAN)
+#define F_FW_FILTER_WR_RX_CHAN  V_FW_FILTER_WR_RX_CHAN(1U)
+
+#define S_FW_FILTER_WR_RX_RPL_IQ        0
+#define M_FW_FILTER_WR_RX_RPL_IQ        0x3ff
+#define V_FW_FILTER_WR_RX_RPL_IQ(x)     ((x) << S_FW_FILTER_WR_RX_RPL_IQ)
+#define G_FW_FILTER_WR_RX_RPL_IQ(x)     \
+	(((x) >> S_FW_FILTER_WR_RX_RPL_IQ) & M_FW_FILTER_WR_RX_RPL_IQ)
+
+#define S_FW_FILTER_WR_MACI     23
+#define M_FW_FILTER_WR_MACI     0x1ff
+#define V_FW_FILTER_WR_MACI(x)  ((x) << S_FW_FILTER_WR_MACI)
+#define G_FW_FILTER_WR_MACI(x)  \
+	(((x) >> S_FW_FILTER_WR_MACI) & M_FW_FILTER_WR_MACI)
+
+#define S_FW_FILTER_WR_MACIM    14
+#define M_FW_FILTER_WR_MACIM    0x1ff
+#define V_FW_FILTER_WR_MACIM(x) ((x) << S_FW_FILTER_WR_MACIM)
+#define G_FW_FILTER_WR_MACIM(x) \
+	(((x) >> S_FW_FILTER_WR_MACIM) & M_FW_FILTER_WR_MACIM)
+
+#define S_FW_FILTER_WR_FCOE     13
+#define M_FW_FILTER_WR_FCOE     0x1
+#define V_FW_FILTER_WR_FCOE(x)  ((x) << S_FW_FILTER_WR_FCOE)
+#define G_FW_FILTER_WR_FCOE(x)  \
+	(((x) >> S_FW_FILTER_WR_FCOE) & M_FW_FILTER_WR_FCOE)
+#define F_FW_FILTER_WR_FCOE     V_FW_FILTER_WR_FCOE(1U)
+
+#define S_FW_FILTER_WR_FCOEM    12
+#define M_FW_FILTER_WR_FCOEM    0x1
+#define V_FW_FILTER_WR_FCOEM(x) ((x) << S_FW_FILTER_WR_FCOEM)
+#define G_FW_FILTER_WR_FCOEM(x) \
+	(((x) >> S_FW_FILTER_WR_FCOEM) & M_FW_FILTER_WR_FCOEM)
+#define F_FW_FILTER_WR_FCOEM    V_FW_FILTER_WR_FCOEM(1U)
+
+#define S_FW_FILTER_WR_PORT     9
+#define M_FW_FILTER_WR_PORT     0x7
+#define V_FW_FILTER_WR_PORT(x)  ((x) << S_FW_FILTER_WR_PORT)
+#define G_FW_FILTER_WR_PORT(x)  \
+	(((x) >> S_FW_FILTER_WR_PORT) & M_FW_FILTER_WR_PORT)
+
+#define S_FW_FILTER_WR_PORTM    6
+#define M_FW_FILTER_WR_PORTM    0x7
+#define V_FW_FILTER_WR_PORTM(x) ((x) << S_FW_FILTER_WR_PORTM)
+#define G_FW_FILTER_WR_PORTM(x) \
+	(((x) >> S_FW_FILTER_WR_PORTM) & M_FW_FILTER_WR_PORTM)
+
+#define S_FW_FILTER_WR_MATCHTYPE        3
+#define M_FW_FILTER_WR_MATCHTYPE        0x7
+#define V_FW_FILTER_WR_MATCHTYPE(x)     ((x) << S_FW_FILTER_WR_MATCHTYPE)
+#define G_FW_FILTER_WR_MATCHTYPE(x)     \
+	(((x) >> S_FW_FILTER_WR_MATCHTYPE) & M_FW_FILTER_WR_MATCHTYPE)
+
+#define S_FW_FILTER_WR_MATCHTYPEM       0
+#define M_FW_FILTER_WR_MATCHTYPEM       0x7
+#define V_FW_FILTER_WR_MATCHTYPEM(x)    ((x) << S_FW_FILTER_WR_MATCHTYPEM)
+#define G_FW_FILTER_WR_MATCHTYPEM(x)    \
+	(((x) >> S_FW_FILTER_WR_MATCHTYPEM) & M_FW_FILTER_WR_MATCHTYPEM)
+
+struct fw_ulptx_wr {
+	__be32 op_to_compl;
+	__be32 flowid_len16;
+	u64 cookie;
+};
+
+struct fw_tp_wr {
+	__be32 op_to_immdlen;
+	__be32 flowid_len16;
+	u64 cookie;
+};
+
+struct fw_eth_tx_pkt_wr {
+	__be32 op_immdlen;
+	__be32 equiq_to_len16;
+	__be64 r3;
+};
+
+struct fw_ofld_connection_wr {
+	__be32 op_compl;
+	__be32 len16_pkd;
+	__u64  cookie;
+	__be64 r2;
+	__be64 r3;
+	struct fw_ofld_connection_le {
+		__be32 version_cpl;
+		__be32 filter;
+		__be32 r1;
+		__be16 lport;
+		__be16 pport;
+		union fw_ofld_connection_leip {
+			struct fw_ofld_connection_le_ipv4 {
+				__be32 pip;
+				__be32 lip;
+				__be64 r0;
+				__be64 r1;
+				__be64 r2;
+			} ipv4;
+			struct fw_ofld_connection_le_ipv6 {
+				__be64 pip_hi;
+				__be64 pip_lo;
+				__be64 lip_hi;
+				__be64 lip_lo;
+			} ipv6;
+		} u;
+	} le;
+	struct fw_ofld_connection_tcb {
+		__be32 t_state_to_astid;
+		__be16 cplrxdataack_cplpassacceptrpl;
+		__be16 rcv_adv;
+		__be32 rcv_nxt;
+		__be32 tx_max;
+		__be64 opt0;
+		__be32 opt2;
+		__be32 r1;
+		__be64 r2;
+		__be64 r3;
+	} tcb;
+};
+
+#define S_FW_OFLD_CONNECTION_WR_VERSION                31
+#define M_FW_OFLD_CONNECTION_WR_VERSION                0x1
+#define V_FW_OFLD_CONNECTION_WR_VERSION(x)     \
+	((x) << S_FW_OFLD_CONNECTION_WR_VERSION)
+#define G_FW_OFLD_CONNECTION_WR_VERSION(x)     \
+	(((x) >> S_FW_OFLD_CONNECTION_WR_VERSION) & \
+	M_FW_OFLD_CONNECTION_WR_VERSION)
+#define F_FW_OFLD_CONNECTION_WR_VERSION        \
+	V_FW_OFLD_CONNECTION_WR_VERSION(1U)
+
+#define S_FW_OFLD_CONNECTION_WR_CPL    30
+#define M_FW_OFLD_CONNECTION_WR_CPL    0x1
+#define V_FW_OFLD_CONNECTION_WR_CPL(x) ((x) << S_FW_OFLD_CONNECTION_WR_CPL)
+#define G_FW_OFLD_CONNECTION_WR_CPL(x) \
+	(((x) >> S_FW_OFLD_CONNECTION_WR_CPL) & M_FW_OFLD_CONNECTION_WR_CPL)
+#define F_FW_OFLD_CONNECTION_WR_CPL    V_FW_OFLD_CONNECTION_WR_CPL(1U)
+
+#define S_FW_OFLD_CONNECTION_WR_T_STATE                28
+#define M_FW_OFLD_CONNECTION_WR_T_STATE                0xf
+#define V_FW_OFLD_CONNECTION_WR_T_STATE(x)     \
+	((x) << S_FW_OFLD_CONNECTION_WR_T_STATE)
+#define G_FW_OFLD_CONNECTION_WR_T_STATE(x)     \
+	(((x) >> S_FW_OFLD_CONNECTION_WR_T_STATE) & \
+	M_FW_OFLD_CONNECTION_WR_T_STATE)
+
+#define S_FW_OFLD_CONNECTION_WR_RCV_SCALE      24
+#define M_FW_OFLD_CONNECTION_WR_RCV_SCALE      0xf
+#define V_FW_OFLD_CONNECTION_WR_RCV_SCALE(x)   \
+	((x) << S_FW_OFLD_CONNECTION_WR_RCV_SCALE)
+#define G_FW_OFLD_CONNECTION_WR_RCV_SCALE(x)   \
+	(((x) >> S_FW_OFLD_CONNECTION_WR_RCV_SCALE) & \
+	M_FW_OFLD_CONNECTION_WR_RCV_SCALE)
+
+#define S_FW_OFLD_CONNECTION_WR_ASTID          0
+#define M_FW_OFLD_CONNECTION_WR_ASTID          0xffffff
+#define V_FW_OFLD_CONNECTION_WR_ASTID(x)       \
+	((x) << S_FW_OFLD_CONNECTION_WR_ASTID)
+#define G_FW_OFLD_CONNECTION_WR_ASTID(x)       \
+	(((x) >> S_FW_OFLD_CONNECTION_WR_ASTID) & M_FW_OFLD_CONNECTION_WR_ASTID)
+
+#define S_FW_OFLD_CONNECTION_WR_CPLRXDATAACK   15
+#define M_FW_OFLD_CONNECTION_WR_CPLRXDATAACK   0x1
+#define V_FW_OFLD_CONNECTION_WR_CPLRXDATAACK(x)        \
+	((x) << S_FW_OFLD_CONNECTION_WR_CPLRXDATAACK)
+#define G_FW_OFLD_CONNECTION_WR_CPLRXDATAACK(x)        \
+	(((x) >> S_FW_OFLD_CONNECTION_WR_CPLRXDATAACK) & \
+	M_FW_OFLD_CONNECTION_WR_CPLRXDATAACK)
+#define F_FW_OFLD_CONNECTION_WR_CPLRXDATAACK   \
+	V_FW_OFLD_CONNECTION_WR_CPLRXDATAACK(1U)
+
+#define S_FW_OFLD_CONNECTION_WR_CPLPASSACCEPTRPL       14
+#define M_FW_OFLD_CONNECTION_WR_CPLPASSACCEPTRPL       0x1
+#define V_FW_OFLD_CONNECTION_WR_CPLPASSACCEPTRPL(x)    \
+	((x) << S_FW_OFLD_CONNECTION_WR_CPLPASSACCEPTRPL)
+#define G_FW_OFLD_CONNECTION_WR_CPLPASSACCEPTRPL(x)    \
+	(((x) >> S_FW_OFLD_CONNECTION_WR_CPLPASSACCEPTRPL) & \
+	M_FW_OFLD_CONNECTION_WR_CPLPASSACCEPTRPL)
+#define F_FW_OFLD_CONNECTION_WR_CPLPASSACCEPTRPL       \
+	V_FW_OFLD_CONNECTION_WR_CPLPASSACCEPTRPL(1U)
+
+enum fw_flowc_mnem {
+	FW_FLOWC_MNEM_PFNVFN,		/* PFN [15:8] VFN [7:0] */
+	FW_FLOWC_MNEM_CH,
+	FW_FLOWC_MNEM_PORT,
+	FW_FLOWC_MNEM_IQID,
+	FW_FLOWC_MNEM_SNDNXT,
+	FW_FLOWC_MNEM_RCVNXT,
+	FW_FLOWC_MNEM_SNDBUF,
+	FW_FLOWC_MNEM_MSS,
+};
+
+struct fw_flowc_mnemval {
+	u8 mnemonic;
+	u8 r4[3];
+	__be32 val;
+};
+
+struct fw_flowc_wr {
+	__be32 op_to_nparams;
+#define FW_FLOWC_WR_NPARAMS(x)	((x) << 0)
+	__be32 flowid_len16;
+	struct fw_flowc_mnemval mnemval[0];
+};
+
+struct fw_ofld_tx_data_wr {
+	__be32 op_to_immdlen;
+	__be32 flowid_len16;
+	__be32 plen;
+	__be32 tunnel_to_proxy;
+#define FW_OFLD_TX_DATA_WR_TUNNEL(x)	 ((x) << 19)
+#define FW_OFLD_TX_DATA_WR_SAVE(x)	 ((x) << 18)
+#define FW_OFLD_TX_DATA_WR_FLUSH(x)	 ((x) << 17)
+#define FW_OFLD_TX_DATA_WR_URGENT(x)	 ((x) << 16)
+#define FW_OFLD_TX_DATA_WR_MORE(x)	 ((x) << 15)
+#define FW_OFLD_TX_DATA_WR_SHOVE(x)	 ((x) << 14)
+#define FW_OFLD_TX_DATA_WR_ULPMODE(x)	 ((x) << 10)
+#define FW_OFLD_TX_DATA_WR_ULPSUBMODE(x) ((x) << 6)
+};
+
+struct fw_cmd_wr {
+	__be32 op_dma;
+#define FW_CMD_WR_DMA (1U << 17)
+	__be32 len16_pkd;
+	__be64 cookie_daddr;
+};
+
+struct fw_eth_tx_pkt_vm_wr {
+	__be32 op_immdlen;
+	__be32 equiq_to_len16;
+	__be32 r3[2];
+	u8 ethmacdst[6];
+	u8 ethmacsrc[6];
+	__be16 ethtype;
+	__be16 vlantci;
+};
+
+#define FW_CMD_MAX_TIMEOUT 10000
+
+/*
+ * If a host driver does a HELLO and discovers that there's already a MASTER
+ * selected, we may have to wait for that MASTER to finish issuing RESET,
+ * configuration and INITIALIZE commands.  Also, there's a possibility that
+ * our own HELLO may get lost if it happens right as the MASTER is issuign a
+ * RESET command, so we need to be willing to make a few retries of our HELLO.
+ */
+#define FW_CMD_HELLO_TIMEOUT	(3 * FW_CMD_MAX_TIMEOUT)
+#define FW_CMD_HELLO_RETRIES	3
+
+
+enum fw_cmd_opcodes {
+	FW_LDST_CMD                    = 0x01,
+	FW_RESET_CMD                   = 0x03,
+	FW_HELLO_CMD                   = 0x04,
+	FW_BYE_CMD                     = 0x05,
+	FW_INITIALIZE_CMD              = 0x06,
+	FW_CAPS_CONFIG_CMD             = 0x07,
+	FW_PARAMS_CMD                  = 0x08,
+	FW_PFVF_CMD                    = 0x09,
+	FW_IQ_CMD                      = 0x10,
+	FW_EQ_MNGT_CMD                 = 0x11,
+	FW_EQ_ETH_CMD                  = 0x12,
+	FW_EQ_CTRL_CMD                 = 0x13,
+	FW_EQ_OFLD_CMD                 = 0x21,
+	FW_VI_CMD                      = 0x14,
+	FW_VI_MAC_CMD                  = 0x15,
+	FW_VI_RXMODE_CMD               = 0x16,
+	FW_VI_ENABLE_CMD               = 0x17,
+	FW_ACL_MAC_CMD                 = 0x18,
+	FW_ACL_VLAN_CMD                = 0x19,
+	FW_VI_STATS_CMD                = 0x1a,
+	FW_PORT_CMD                    = 0x1b,
+	FW_PORT_STATS_CMD              = 0x1c,
+	FW_PORT_LB_STATS_CMD           = 0x1d,
+	FW_PORT_TRACE_CMD              = 0x1e,
+	FW_PORT_TRACE_MMAP_CMD         = 0x1f,
+	FW_RSS_IND_TBL_CMD             = 0x20,
+	FW_RSS_GLB_CONFIG_CMD          = 0x22,
+	FW_RSS_VI_CONFIG_CMD           = 0x23,
+	FW_CLIP_CMD                    = 0x28,
+	FW_LASTC2E_CMD                 = 0x40,
+	FW_ERROR_CMD                   = 0x80,
+	FW_DEBUG_CMD                   = 0x81,
+};
+
+enum fw_cmd_cap {
+	FW_CMD_CAP_PF                  = 0x01,
+	FW_CMD_CAP_DMAQ                = 0x02,
+	FW_CMD_CAP_PORT                = 0x04,
+	FW_CMD_CAP_PORTPROMISC         = 0x08,
+	FW_CMD_CAP_PORTSTATS           = 0x10,
+	FW_CMD_CAP_VF                  = 0x80,
+};
+
+/*
+ * Generic command header flit0
+ */
+struct fw_cmd_hdr {
+	__be32 hi;
+	__be32 lo;
+};
+
+#define FW_CMD_OP(x)		((x) << 24)
+#define FW_CMD_OP_GET(x)        (((x) >> 24) & 0xff)
+#define FW_CMD_REQUEST          (1U << 23)
+#define FW_CMD_REQUEST_GET(x)   (((x) >> 23) & 0x1)
+#define FW_CMD_READ		(1U << 22)
+#define FW_CMD_WRITE		(1U << 21)
+#define FW_CMD_EXEC		(1U << 20)
+#define FW_CMD_RAMASK(x)	((x) << 20)
+#define FW_CMD_RETVAL(x)	((x) << 8)
+#define FW_CMD_RETVAL_GET(x)	(((x) >> 8) & 0xff)
+#define FW_CMD_LEN16(x)         ((x) << 0)
+#define FW_LEN16(fw_struct)	FW_CMD_LEN16(sizeof(fw_struct) / 16)
+
+enum fw_ldst_addrspc {
+	FW_LDST_ADDRSPC_FIRMWARE  = 0x0001,
+	FW_LDST_ADDRSPC_SGE_EGRC  = 0x0008,
+	FW_LDST_ADDRSPC_SGE_INGC  = 0x0009,
+	FW_LDST_ADDRSPC_SGE_FLMC  = 0x000a,
+	FW_LDST_ADDRSPC_SGE_CONMC = 0x000b,
+	FW_LDST_ADDRSPC_TP_PIO    = 0x0010,
+	FW_LDST_ADDRSPC_TP_TM_PIO = 0x0011,
+	FW_LDST_ADDRSPC_TP_MIB    = 0x0012,
+	FW_LDST_ADDRSPC_MDIO      = 0x0018,
+	FW_LDST_ADDRSPC_MPS       = 0x0020,
+	FW_LDST_ADDRSPC_FUNC      = 0x0028,
+	FW_LDST_ADDRSPC_FUNC_PCIE = 0x0029,
+};
+
+enum fw_ldst_mps_fid {
+	FW_LDST_MPS_ATRB,
+	FW_LDST_MPS_RPLC
+};
+
+enum fw_ldst_func_access_ctl {
+	FW_LDST_FUNC_ACC_CTL_VIID,
+	FW_LDST_FUNC_ACC_CTL_FID
+};
+
+enum fw_ldst_func_mod_index {
+	FW_LDST_FUNC_MPS
+};
+
+struct fw_ldst_cmd {
+	__be32 op_to_addrspace;
+#define FW_LDST_CMD_ADDRSPACE(x) ((x) << 0)
+	__be32 cycles_to_len16;
+	union fw_ldst {
+		struct fw_ldst_addrval {
+			__be32 addr;
+			__be32 val;
+		} addrval;
+		struct fw_ldst_idctxt {
+			__be32 physid;
+			__be32 msg_pkd;
+			__be32 ctxt_data7;
+			__be32 ctxt_data6;
+			__be32 ctxt_data5;
+			__be32 ctxt_data4;
+			__be32 ctxt_data3;
+			__be32 ctxt_data2;
+			__be32 ctxt_data1;
+			__be32 ctxt_data0;
+		} idctxt;
+		struct fw_ldst_mdio {
+			__be16 paddr_mmd;
+			__be16 raddr;
+			__be16 vctl;
+			__be16 rval;
+		} mdio;
+		struct fw_ldst_mps {
+			__be16 fid_ctl;
+			__be16 rplcpf_pkd;
+			__be32 rplc127_96;
+			__be32 rplc95_64;
+			__be32 rplc63_32;
+			__be32 rplc31_0;
+			__be32 atrb;
+			__be16 vlan[16];
+		} mps;
+		struct fw_ldst_func {
+			u8 access_ctl;
+			u8 mod_index;
+			__be16 ctl_id;
+			__be32 offset;
+			__be64 data0;
+			__be64 data1;
+		} func;
+		struct fw_ldst_pcie {
+			u8 ctrl_to_fn;
+			u8 bnum;
+			u8 r;
+			u8 ext_r;
+			u8 select_naccess;
+			u8 pcie_fn;
+			__be16 nset_pkd;
+			__be32 data[12];
+		} pcie;
+	} u;
+};
+
+#define FW_LDST_CMD_MSG(x)	((x) << 31)
+#define FW_LDST_CMD_PADDR(x)	((x) << 8)
+#define FW_LDST_CMD_MMD(x)	((x) << 0)
+#define FW_LDST_CMD_FID(x)	((x) << 15)
+#define FW_LDST_CMD_CTL(x)	((x) << 0)
+#define FW_LDST_CMD_RPLCPF(x)	((x) << 0)
+#define FW_LDST_CMD_LC		(1U << 4)
+#define FW_LDST_CMD_NACCESS(x)	((x) << 0)
+#define FW_LDST_CMD_FN(x)	((x) << 0)
+
+struct fw_reset_cmd {
+	__be32 op_to_write;
+	__be32 retval_len16;
+	__be32 val;
+	__be32 halt_pkd;
+};
+
+#define FW_RESET_CMD_HALT_SHIFT    31
+#define FW_RESET_CMD_HALT_MASK     0x1
+#define FW_RESET_CMD_HALT(x)       ((x) << FW_RESET_CMD_HALT_SHIFT)
+#define FW_RESET_CMD_HALT_GET(x)  \
+	(((x) >> FW_RESET_CMD_HALT_SHIFT) & FW_RESET_CMD_HALT_MASK)
+
+enum fw_hellow_cmd {
+	fw_hello_cmd_stage_os		= 0x0
+};
+
+struct fw_hello_cmd {
+	__be32 op_to_write;
+	__be32 retval_len16;
+	__be32 err_to_clearinit;
+#define FW_HELLO_CMD_ERR	    (1U << 31)
+#define FW_HELLO_CMD_INIT	    (1U << 30)
+#define FW_HELLO_CMD_MASTERDIS(x)   ((x) << 29)
+#define FW_HELLO_CMD_MASTERFORCE(x) ((x) << 28)
+#define FW_HELLO_CMD_MBMASTER_MASK   0xfU
+#define FW_HELLO_CMD_MBMASTER_SHIFT  24
+#define FW_HELLO_CMD_MBMASTER(x)     ((x) << FW_HELLO_CMD_MBMASTER_SHIFT)
+#define FW_HELLO_CMD_MBMASTER_GET(x) \
+	(((x) >> FW_HELLO_CMD_MBMASTER_SHIFT) & FW_HELLO_CMD_MBMASTER_MASK)
+#define FW_HELLO_CMD_MBASYNCNOTINT(x)	((x) << 23)
+#define FW_HELLO_CMD_MBASYNCNOT(x)  ((x) << 20)
+#define FW_HELLO_CMD_STAGE(x)       ((x) << 17)
+#define FW_HELLO_CMD_CLEARINIT      (1U << 16)
+	__be32 fwrev;
+};
+
+struct fw_bye_cmd {
+	__be32 op_to_write;
+	__be32 retval_len16;
+	__be64 r3;
+};
+
+struct fw_initialize_cmd {
+	__be32 op_to_write;
+	__be32 retval_len16;
+	__be64 r3;
+};
+
+enum fw_caps_config_hm {
+	FW_CAPS_CONFIG_HM_PCIE		= 0x00000001,
+	FW_CAPS_CONFIG_HM_PL		= 0x00000002,
+	FW_CAPS_CONFIG_HM_SGE		= 0x00000004,
+	FW_CAPS_CONFIG_HM_CIM		= 0x00000008,
+	FW_CAPS_CONFIG_HM_ULPTX		= 0x00000010,
+	FW_CAPS_CONFIG_HM_TP		= 0x00000020,
+	FW_CAPS_CONFIG_HM_ULPRX		= 0x00000040,
+	FW_CAPS_CONFIG_HM_PMRX		= 0x00000080,
+	FW_CAPS_CONFIG_HM_PMTX		= 0x00000100,
+	FW_CAPS_CONFIG_HM_MC		= 0x00000200,
+	FW_CAPS_CONFIG_HM_LE		= 0x00000400,
+	FW_CAPS_CONFIG_HM_MPS		= 0x00000800,
+	FW_CAPS_CONFIG_HM_XGMAC		= 0x00001000,
+	FW_CAPS_CONFIG_HM_CPLSWITCH	= 0x00002000,
+	FW_CAPS_CONFIG_HM_T4DBG		= 0x00004000,
+	FW_CAPS_CONFIG_HM_MI		= 0x00008000,
+	FW_CAPS_CONFIG_HM_I2CM		= 0x00010000,
+	FW_CAPS_CONFIG_HM_NCSI		= 0x00020000,
+	FW_CAPS_CONFIG_HM_SMB		= 0x00040000,
+	FW_CAPS_CONFIG_HM_MA		= 0x00080000,
+	FW_CAPS_CONFIG_HM_EDRAM		= 0x00100000,
+	FW_CAPS_CONFIG_HM_PMU		= 0x00200000,
+	FW_CAPS_CONFIG_HM_UART		= 0x00400000,
+	FW_CAPS_CONFIG_HM_SF		= 0x00800000,
+};
+
+enum fw_caps_config_nbm {
+	FW_CAPS_CONFIG_NBM_IPMI		= 0x00000001,
+	FW_CAPS_CONFIG_NBM_NCSI		= 0x00000002,
+};
+
+enum fw_caps_config_link {
+	FW_CAPS_CONFIG_LINK_PPP		= 0x00000001,
+	FW_CAPS_CONFIG_LINK_QFC		= 0x00000002,
+	FW_CAPS_CONFIG_LINK_DCBX	= 0x00000004,
+};
+
+enum fw_caps_config_switch {
+	FW_CAPS_CONFIG_SWITCH_INGRESS	= 0x00000001,
+	FW_CAPS_CONFIG_SWITCH_EGRESS	= 0x00000002,
+};
+
+enum fw_caps_config_nic {
+	FW_CAPS_CONFIG_NIC		= 0x00000001,
+	FW_CAPS_CONFIG_NIC_VM		= 0x00000002,
+};
+
+enum fw_caps_config_ofld {
+	FW_CAPS_CONFIG_OFLD		= 0x00000001,
+};
+
+enum fw_caps_config_rdma {
+	FW_CAPS_CONFIG_RDMA_RDDP	= 0x00000001,
+	FW_CAPS_CONFIG_RDMA_RDMAC	= 0x00000002,
+};
+
+enum fw_caps_config_iscsi {
+	FW_CAPS_CONFIG_ISCSI_INITIATOR_PDU = 0x00000001,
+	FW_CAPS_CONFIG_ISCSI_TARGET_PDU = 0x00000002,
+	FW_CAPS_CONFIG_ISCSI_INITIATOR_CNXOFLD = 0x00000004,
+	FW_CAPS_CONFIG_ISCSI_TARGET_CNXOFLD = 0x00000008,
+};
+
+enum fw_caps_config_fcoe {
+	FW_CAPS_CONFIG_FCOE_INITIATOR	= 0x00000001,
+	FW_CAPS_CONFIG_FCOE_TARGET	= 0x00000002,
+	FW_CAPS_CONFIG_FCOE_CTRL_OFLD	= 0x00000004,
+};
+
+enum fw_memtype_cf {
+	FW_MEMTYPE_CF_EDC0		= 0x0,
+	FW_MEMTYPE_CF_EDC1		= 0x1,
+	FW_MEMTYPE_CF_EXTMEM		= 0x2,
+	FW_MEMTYPE_CF_FLASH		= 0x4,
+	FW_MEMTYPE_CF_INTERNAL		= 0x5,
+};
+
+struct fw_caps_config_cmd {
+	__be32 op_to_write;
+	__be32 cfvalid_to_len16;
+	__be32 r2;
+	__be32 hwmbitmap;
+	__be16 nbmcaps;
+	__be16 linkcaps;
+	__be16 switchcaps;
+	__be16 r3;
+	__be16 niccaps;
+	__be16 ofldcaps;
+	__be16 rdmacaps;
+	__be16 r4;
+	__be16 iscsicaps;
+	__be16 fcoecaps;
+	__be32 cfcsum;
+	__be32 finiver;
+	__be32 finicsum;
+};
+
+#define FW_CAPS_CONFIG_CMD_CFVALID          (1U << 27)
+#define FW_CAPS_CONFIG_CMD_MEMTYPE_CF(x)    ((x) << 24)
+#define FW_CAPS_CONFIG_CMD_MEMADDR64K_CF(x) ((x) << 16)
+
+/*
+ * params command mnemonics
+ */
+enum fw_params_mnem {
+	FW_PARAMS_MNEM_DEV		= 1,	/* device params */
+	FW_PARAMS_MNEM_PFVF		= 2,	/* function params */
+	FW_PARAMS_MNEM_REG		= 3,	/* limited register access */
+	FW_PARAMS_MNEM_DMAQ		= 4,	/* dma queue params */
+	FW_PARAMS_MNEM_LAST
+};
+
+/*
+ * device parameters
+ */
+enum fw_params_param_dev {
+	FW_PARAMS_PARAM_DEV_CCLK	= 0x00, /* chip core clock in khz */
+	FW_PARAMS_PARAM_DEV_PORTVEC	= 0x01, /* the port vector */
+	FW_PARAMS_PARAM_DEV_NTID	= 0x02, /* reads the number of TIDs
+						 * allocated by the device's
+						 * Lookup Engine
+						 */
+	FW_PARAMS_PARAM_DEV_FLOWC_BUFFIFO_SZ = 0x03,
+	FW_PARAMS_PARAM_DEV_INTVER_NIC	= 0x04,
+	FW_PARAMS_PARAM_DEV_INTVER_VNIC = 0x05,
+	FW_PARAMS_PARAM_DEV_INTVER_OFLD = 0x06,
+	FW_PARAMS_PARAM_DEV_INTVER_RI	= 0x07,
+	FW_PARAMS_PARAM_DEV_INTVER_ISCSIPDU = 0x08,
+	FW_PARAMS_PARAM_DEV_INTVER_ISCSI = 0x09,
+	FW_PARAMS_PARAM_DEV_INTVER_FCOE = 0x0A,
+	FW_PARAMS_PARAM_DEV_FWREV = 0x0B,
+	FW_PARAMS_PARAM_DEV_TPREV = 0x0C,
+	FW_PARAMS_PARAM_DEV_CF = 0x0D,
+	FW_PARAMS_PARAM_DEV_MAXORDIRD_QP = 0x13, /* max supported QP IRD/ORD */
+	FW_PARAMS_PARAM_DEV_MAXIRD_ADAPTER = 0x14, /* max supported adap IRD */
+	FW_PARAMS_PARAM_DEV_ULPTX_MEMWRITE_DSGL = 0x17,
+};
+
+/*
+ * physical and virtual function parameters
+ */
+enum fw_params_param_pfvf {
+	FW_PARAMS_PARAM_PFVF_RWXCAPS	= 0x00,
+	FW_PARAMS_PARAM_PFVF_ROUTE_START = 0x01,
+	FW_PARAMS_PARAM_PFVF_ROUTE_END = 0x02,
+	FW_PARAMS_PARAM_PFVF_CLIP_START = 0x03,
+	FW_PARAMS_PARAM_PFVF_CLIP_END = 0x04,
+	FW_PARAMS_PARAM_PFVF_FILTER_START = 0x05,
+	FW_PARAMS_PARAM_PFVF_FILTER_END = 0x06,
+	FW_PARAMS_PARAM_PFVF_SERVER_START = 0x07,
+	FW_PARAMS_PARAM_PFVF_SERVER_END = 0x08,
+	FW_PARAMS_PARAM_PFVF_TDDP_START = 0x09,
+	FW_PARAMS_PARAM_PFVF_TDDP_END = 0x0A,
+	FW_PARAMS_PARAM_PFVF_ISCSI_START = 0x0B,
+	FW_PARAMS_PARAM_PFVF_ISCSI_END = 0x0C,
+	FW_PARAMS_PARAM_PFVF_STAG_START = 0x0D,
+	FW_PARAMS_PARAM_PFVF_STAG_END = 0x0E,
+	FW_PARAMS_PARAM_PFVF_RQ_START = 0x1F,
+	FW_PARAMS_PARAM_PFVF_RQ_END	= 0x10,
+	FW_PARAMS_PARAM_PFVF_PBL_START = 0x11,
+	FW_PARAMS_PARAM_PFVF_PBL_END	= 0x12,
+	FW_PARAMS_PARAM_PFVF_L2T_START = 0x13,
+	FW_PARAMS_PARAM_PFVF_L2T_END = 0x14,
+	FW_PARAMS_PARAM_PFVF_SQRQ_START = 0x15,
+	FW_PARAMS_PARAM_PFVF_SQRQ_END	= 0x16,
+	FW_PARAMS_PARAM_PFVF_CQ_START	= 0x17,
+	FW_PARAMS_PARAM_PFVF_CQ_END	= 0x18,
+	FW_PARAMS_PARAM_PFVF_SCHEDCLASS_ETH = 0x20,
+	FW_PARAMS_PARAM_PFVF_VIID       = 0x24,
+	FW_PARAMS_PARAM_PFVF_CPMASK     = 0x25,
+	FW_PARAMS_PARAM_PFVF_OCQ_START  = 0x26,
+	FW_PARAMS_PARAM_PFVF_OCQ_END    = 0x27,
+	FW_PARAMS_PARAM_PFVF_CONM_MAP   = 0x28,
+	FW_PARAMS_PARAM_PFVF_IQFLINT_START = 0x29,
+	FW_PARAMS_PARAM_PFVF_IQFLINT_END = 0x2A,
+	FW_PARAMS_PARAM_PFVF_EQ_START	= 0x2B,
+	FW_PARAMS_PARAM_PFVF_EQ_END	= 0x2C,
+	FW_PARAMS_PARAM_PFVF_ACTIVE_FILTER_START = 0x2D,
+	FW_PARAMS_PARAM_PFVF_ACTIVE_FILTER_END = 0x2E,
+	FW_PARAMS_PARAM_PFVF_ETHOFLD_END = 0x30,
+	FW_PARAMS_PARAM_PFVF_CPLFW4MSG_ENCAP = 0x31
+};
+
+/*
+ * dma queue parameters
+ */
+enum fw_params_param_dmaq {
+	FW_PARAMS_PARAM_DMAQ_IQ_DCAEN_DCACPU = 0x00,
+	FW_PARAMS_PARAM_DMAQ_IQ_INTCNTTHRESH = 0x01,
+	FW_PARAMS_PARAM_DMAQ_EQ_CMPLIQID_MNGT = 0x10,
+	FW_PARAMS_PARAM_DMAQ_EQ_CMPLIQID_CTRL = 0x11,
+	FW_PARAMS_PARAM_DMAQ_EQ_SCHEDCLASS_ETH = 0x12,
+	FW_PARAMS_PARAM_DMAQ_EQ_DCBPRIO_ETH = 0x13,
+};
+
+#define FW_PARAMS_MNEM(x)      ((x) << 24)
+#define FW_PARAMS_PARAM_X(x)   ((x) << 16)
+#define FW_PARAMS_PARAM_Y_SHIFT  8
+#define FW_PARAMS_PARAM_Y_MASK   0xffU
+#define FW_PARAMS_PARAM_Y(x)     ((x) << FW_PARAMS_PARAM_Y_SHIFT)
+#define FW_PARAMS_PARAM_Y_GET(x) (((x) >> FW_PARAMS_PARAM_Y_SHIFT) &\
+		FW_PARAMS_PARAM_Y_MASK)
+#define FW_PARAMS_PARAM_Z_SHIFT  0
+#define FW_PARAMS_PARAM_Z_MASK   0xffu
+#define FW_PARAMS_PARAM_Z(x)     ((x) << FW_PARAMS_PARAM_Z_SHIFT)
+#define FW_PARAMS_PARAM_Z_GET(x) (((x) >> FW_PARAMS_PARAM_Z_SHIFT) &\
+		FW_PARAMS_PARAM_Z_MASK)
+#define FW_PARAMS_PARAM_XYZ(x) ((x) << 0)
+#define FW_PARAMS_PARAM_YZ(x)  ((x) << 0)
+
+struct fw_params_cmd {
+	__be32 op_to_vfn;
+	__be32 retval_len16;
+	struct fw_params_param {
+		__be32 mnem;
+		__be32 val;
+	} param[7];
+};
+
+#define FW_PARAMS_CMD_PFN(x) ((x) << 8)
+#define FW_PARAMS_CMD_VFN(x) ((x) << 0)
+
+struct fw_pfvf_cmd {
+	__be32 op_to_vfn;
+	__be32 retval_len16;
+	__be32 niqflint_niq;
+	__be32 type_to_neq;
+	__be32 tc_to_nexactf;
+	__be32 r_caps_to_nethctrl;
+	__be16 nricq;
+	__be16 nriqp;
+	__be32 r4;
+};
+
+#define FW_PFVF_CMD_PFN(x) ((x) << 8)
+#define FW_PFVF_CMD_VFN(x) ((x) << 0)
+
+#define FW_PFVF_CMD_NIQFLINT(x) ((x) << 20)
+#define FW_PFVF_CMD_NIQFLINT_GET(x) (((x) >> 20) & 0xfff)
+
+#define FW_PFVF_CMD_NIQ(x) ((x) << 0)
+#define FW_PFVF_CMD_NIQ_GET(x) (((x) >> 0) & 0xfffff)
+
+#define FW_PFVF_CMD_TYPE (1 << 31)
+#define FW_PFVF_CMD_TYPE_GET(x) (((x) >> 31) & 0x1)
+
+#define FW_PFVF_CMD_CMASK(x) ((x) << 24)
+#define FW_PFVF_CMD_CMASK_MASK 0xf
+#define FW_PFVF_CMD_CMASK_GET(x) (((x) >> 24) & FW_PFVF_CMD_CMASK_MASK)
+
+#define FW_PFVF_CMD_PMASK(x) ((x) << 20)
+#define FW_PFVF_CMD_PMASK_MASK 0xf
+#define FW_PFVF_CMD_PMASK_GET(x) (((x) >> 20) & FW_PFVF_CMD_PMASK_MASK)
+
+#define FW_PFVF_CMD_NEQ(x) ((x) << 0)
+#define FW_PFVF_CMD_NEQ_GET(x) (((x) >> 0) & 0xfffff)
+
+#define FW_PFVF_CMD_TC(x) ((x) << 24)
+#define FW_PFVF_CMD_TC_GET(x) (((x) >> 24) & 0xff)
+
+#define FW_PFVF_CMD_NVI(x) ((x) << 16)
+#define FW_PFVF_CMD_NVI_GET(x) (((x) >> 16) & 0xff)
+
+#define FW_PFVF_CMD_NEXACTF(x) ((x) << 0)
+#define FW_PFVF_CMD_NEXACTF_GET(x) (((x) >> 0) & 0xffff)
+
+#define FW_PFVF_CMD_R_CAPS(x) ((x) << 24)
+#define FW_PFVF_CMD_R_CAPS_GET(x) (((x) >> 24) & 0xff)
+
+#define FW_PFVF_CMD_WX_CAPS(x) ((x) << 16)
+#define FW_PFVF_CMD_WX_CAPS_GET(x) (((x) >> 16) & 0xff)
+
+#define FW_PFVF_CMD_NETHCTRL(x) ((x) << 0)
+#define FW_PFVF_CMD_NETHCTRL_GET(x) (((x) >> 0) & 0xffff)
+
+enum fw_iq_type {
+	FW_IQ_TYPE_FL_INT_CAP,
+	FW_IQ_TYPE_NO_FL_INT_CAP
+};
+
+struct fw_iq_cmd {
+	__be32 op_to_vfn;
+	__be32 alloc_to_len16;
+	__be16 physiqid;
+	__be16 iqid;
+	__be16 fl0id;
+	__be16 fl1id;
+	__be32 type_to_iqandstindex;
+	__be16 iqdroprss_to_iqesize;
+	__be16 iqsize;
+	__be64 iqaddr;
+	__be32 iqns_to_fl0congen;
+	__be16 fl0dcaen_to_fl0cidxfthresh;
+	__be16 fl0size;
+	__be64 fl0addr;
+	__be32 fl1cngchmap_to_fl1congen;
+	__be16 fl1dcaen_to_fl1cidxfthresh;
+	__be16 fl1size;
+	__be64 fl1addr;
+};
+
+#define FW_IQ_CMD_PFN(x) ((x) << 8)
+#define FW_IQ_CMD_VFN(x) ((x) << 0)
+
+#define FW_IQ_CMD_ALLOC (1U << 31)
+#define FW_IQ_CMD_FREE (1U << 30)
+#define FW_IQ_CMD_MODIFY (1U << 29)
+#define FW_IQ_CMD_IQSTART(x) ((x) << 28)
+#define FW_IQ_CMD_IQSTOP(x) ((x) << 27)
+
+#define FW_IQ_CMD_TYPE(x) ((x) << 29)
+#define FW_IQ_CMD_IQASYNCH(x) ((x) << 28)
+#define FW_IQ_CMD_VIID(x) ((x) << 16)
+#define FW_IQ_CMD_IQANDST(x) ((x) << 15)
+#define FW_IQ_CMD_IQANUS(x) ((x) << 14)
+#define FW_IQ_CMD_IQANUD(x) ((x) << 12)
+#define FW_IQ_CMD_IQANDSTINDEX(x) ((x) << 0)
+
+#define FW_IQ_CMD_IQDROPRSS (1U << 15)
+#define FW_IQ_CMD_IQGTSMODE (1U << 14)
+#define FW_IQ_CMD_IQPCIECH(x) ((x) << 12)
+#define FW_IQ_CMD_IQDCAEN(x) ((x) << 11)
+#define FW_IQ_CMD_IQDCACPU(x) ((x) << 6)
+#define FW_IQ_CMD_IQINTCNTTHRESH(x) ((x) << 4)
+#define FW_IQ_CMD_IQO (1U << 3)
+#define FW_IQ_CMD_IQCPRIO(x) ((x) << 2)
+#define FW_IQ_CMD_IQESIZE(x) ((x) << 0)
+
+#define FW_IQ_CMD_IQNS(x) ((x) << 31)
+#define FW_IQ_CMD_IQRO(x) ((x) << 30)
+#define FW_IQ_CMD_IQFLINTIQHSEN(x) ((x) << 28)
+#define FW_IQ_CMD_IQFLINTCONGEN(x) ((x) << 27)
+#define FW_IQ_CMD_IQFLINTISCSIC(x) ((x) << 26)
+#define FW_IQ_CMD_FL0CNGCHMAP(x) ((x) << 20)
+#define FW_IQ_CMD_FL0CACHELOCK(x) ((x) << 15)
+#define FW_IQ_CMD_FL0DBP(x) ((x) << 14)
+#define FW_IQ_CMD_FL0DATANS(x) ((x) << 13)
+#define FW_IQ_CMD_FL0DATARO(x) ((x) << 12)
+#define FW_IQ_CMD_FL0CONGCIF(x) ((x) << 11)
+#define FW_IQ_CMD_FL0ONCHIP(x) ((x) << 10)
+#define FW_IQ_CMD_FL0STATUSPGNS(x) ((x) << 9)
+#define FW_IQ_CMD_FL0STATUSPGRO(x) ((x) << 8)
+#define FW_IQ_CMD_FL0FETCHNS(x) ((x) << 7)
+#define FW_IQ_CMD_FL0FETCHRO(x) ((x) << 6)
+#define FW_IQ_CMD_FL0HOSTFCMODE(x) ((x) << 4)
+#define FW_IQ_CMD_FL0CPRIO(x) ((x) << 3)
+#define FW_IQ_CMD_FL0PADEN(x) ((x) << 2)
+#define FW_IQ_CMD_FL0PACKEN(x) ((x) << 1)
+#define FW_IQ_CMD_FL0CONGEN (1U << 0)
+
+#define FW_IQ_CMD_FL0DCAEN(x) ((x) << 15)
+#define FW_IQ_CMD_FL0DCACPU(x) ((x) << 10)
+#define FW_IQ_CMD_FL0FBMIN(x) ((x) << 7)
+#define FW_IQ_CMD_FL0FBMAX(x) ((x) << 4)
+#define FW_IQ_CMD_FL0CIDXFTHRESHO (1U << 3)
+#define FW_IQ_CMD_FL0CIDXFTHRESH(x) ((x) << 0)
+
+#define FW_IQ_CMD_FL1CNGCHMAP(x) ((x) << 20)
+#define FW_IQ_CMD_FL1CACHELOCK(x) ((x) << 15)
+#define FW_IQ_CMD_FL1DBP(x) ((x) << 14)
+#define FW_IQ_CMD_FL1DATANS(x) ((x) << 13)
+#define FW_IQ_CMD_FL1DATARO(x) ((x) << 12)
+#define FW_IQ_CMD_FL1CONGCIF(x) ((x) << 11)
+#define FW_IQ_CMD_FL1ONCHIP(x) ((x) << 10)
+#define FW_IQ_CMD_FL1STATUSPGNS(x) ((x) << 9)
+#define FW_IQ_CMD_FL1STATUSPGRO(x) ((x) << 8)
+#define FW_IQ_CMD_FL1FETCHNS(x) ((x) << 7)
+#define FW_IQ_CMD_FL1FETCHRO(x) ((x) << 6)
+#define FW_IQ_CMD_FL1HOSTFCMODE(x) ((x) << 4)
+#define FW_IQ_CMD_FL1CPRIO(x) ((x) << 3)
+#define FW_IQ_CMD_FL1PADEN (1U << 2)
+#define FW_IQ_CMD_FL1PACKEN (1U << 1)
+#define FW_IQ_CMD_FL1CONGEN (1U << 0)
+
+#define FW_IQ_CMD_FL1DCAEN(x) ((x) << 15)
+#define FW_IQ_CMD_FL1DCACPU(x) ((x) << 10)
+#define FW_IQ_CMD_FL1FBMIN(x) ((x) << 7)
+#define FW_IQ_CMD_FL1FBMAX(x) ((x) << 4)
+#define FW_IQ_CMD_FL1CIDXFTHRESHO (1U << 3)
+#define FW_IQ_CMD_FL1CIDXFTHRESH(x) ((x) << 0)
+
+struct fw_eq_eth_cmd {
+	__be32 op_to_vfn;
+	__be32 alloc_to_len16;
+	__be32 eqid_pkd;
+	__be32 physeqid_pkd;
+	__be32 fetchszm_to_iqid;
+	__be32 dcaen_to_eqsize;
+	__be64 eqaddr;
+	__be32 viid_pkd;
+	__be32 r8_lo;
+	__be64 r9;
+};
+
+#define FW_EQ_ETH_CMD_PFN(x) ((x) << 8)
+#define FW_EQ_ETH_CMD_VFN(x) ((x) << 0)
+#define FW_EQ_ETH_CMD_ALLOC (1U << 31)
+#define FW_EQ_ETH_CMD_FREE (1U << 30)
+#define FW_EQ_ETH_CMD_MODIFY (1U << 29)
+#define FW_EQ_ETH_CMD_EQSTART (1U << 28)
+#define FW_EQ_ETH_CMD_EQSTOP (1U << 27)
+
+#define FW_EQ_ETH_CMD_EQID(x) ((x) << 0)
+#define FW_EQ_ETH_CMD_EQID_GET(x) (((x) >> 0) & 0xfffff)
+#define FW_EQ_ETH_CMD_PHYSEQID(x) ((x) << 0)
+#define FW_EQ_ETH_CMD_PHYSEQID_GET(x) (((x) >> 0) & 0xfffff)
+
+#define FW_EQ_ETH_CMD_FETCHSZM(x) ((x) << 26)
+#define FW_EQ_ETH_CMD_STATUSPGNS(x) ((x) << 25)
+#define FW_EQ_ETH_CMD_STATUSPGRO(x) ((x) << 24)
+#define FW_EQ_ETH_CMD_FETCHNS(x) ((x) << 23)
+#define FW_EQ_ETH_CMD_FETCHRO(x) ((x) << 22)
+#define FW_EQ_ETH_CMD_HOSTFCMODE(x) ((x) << 20)
+#define FW_EQ_ETH_CMD_CPRIO(x) ((x) << 19)
+#define FW_EQ_ETH_CMD_ONCHIP(x) ((x) << 18)
+#define FW_EQ_ETH_CMD_PCIECHN(x) ((x) << 16)
+#define FW_EQ_ETH_CMD_IQID(x) ((x) << 0)
+
+#define FW_EQ_ETH_CMD_DCAEN(x) ((x) << 31)
+#define FW_EQ_ETH_CMD_DCACPU(x) ((x) << 26)
+#define FW_EQ_ETH_CMD_FBMIN(x) ((x) << 23)
+#define FW_EQ_ETH_CMD_FBMAX(x) ((x) << 20)
+#define FW_EQ_ETH_CMD_CIDXFTHRESHO(x) ((x) << 19)
+#define FW_EQ_ETH_CMD_CIDXFTHRESH(x) ((x) << 16)
+#define FW_EQ_ETH_CMD_EQSIZE(x) ((x) << 0)
+
+#define FW_EQ_ETH_CMD_AUTOEQUEQE (1U << 30)
+#define FW_EQ_ETH_CMD_VIID(x) ((x) << 16)
+
+struct fw_eq_ctrl_cmd {
+	__be32 op_to_vfn;
+	__be32 alloc_to_len16;
+	__be32 cmpliqid_eqid;
+	__be32 physeqid_pkd;
+	__be32 fetchszm_to_iqid;
+	__be32 dcaen_to_eqsize;
+	__be64 eqaddr;
+};
+
+#define FW_EQ_CTRL_CMD_PFN(x) ((x) << 8)
+#define FW_EQ_CTRL_CMD_VFN(x) ((x) << 0)
+
+#define FW_EQ_CTRL_CMD_ALLOC (1U << 31)
+#define FW_EQ_CTRL_CMD_FREE (1U << 30)
+#define FW_EQ_CTRL_CMD_MODIFY (1U << 29)
+#define FW_EQ_CTRL_CMD_EQSTART (1U << 28)
+#define FW_EQ_CTRL_CMD_EQSTOP (1U << 27)
+
+#define FW_EQ_CTRL_CMD_CMPLIQID(x) ((x) << 20)
+#define FW_EQ_CTRL_CMD_EQID(x) ((x) << 0)
+#define FW_EQ_CTRL_CMD_EQID_GET(x) (((x) >> 0) & 0xfffff)
+#define FW_EQ_CTRL_CMD_PHYSEQID_GET(x) (((x) >> 0) & 0xfffff)
+
+#define FW_EQ_CTRL_CMD_FETCHSZM (1U << 26)
+#define FW_EQ_CTRL_CMD_STATUSPGNS (1U << 25)
+#define FW_EQ_CTRL_CMD_STATUSPGRO (1U << 24)
+#define FW_EQ_CTRL_CMD_FETCHNS (1U << 23)
+#define FW_EQ_CTRL_CMD_FETCHRO (1U << 22)
+#define FW_EQ_CTRL_CMD_HOSTFCMODE(x) ((x) << 20)
+#define FW_EQ_CTRL_CMD_CPRIO(x) ((x) << 19)
+#define FW_EQ_CTRL_CMD_ONCHIP(x) ((x) << 18)
+#define FW_EQ_CTRL_CMD_PCIECHN(x) ((x) << 16)
+#define FW_EQ_CTRL_CMD_IQID(x) ((x) << 0)
+
+#define FW_EQ_CTRL_CMD_DCAEN(x) ((x) << 31)
+#define FW_EQ_CTRL_CMD_DCACPU(x) ((x) << 26)
+#define FW_EQ_CTRL_CMD_FBMIN(x) ((x) << 23)
+#define FW_EQ_CTRL_CMD_FBMAX(x) ((x) << 20)
+#define FW_EQ_CTRL_CMD_CIDXFTHRESHO(x) ((x) << 19)
+#define FW_EQ_CTRL_CMD_CIDXFTHRESH(x) ((x) << 16)
+#define FW_EQ_CTRL_CMD_EQSIZE(x) ((x) << 0)
+
+struct fw_eq_ofld_cmd {
+	__be32 op_to_vfn;
+	__be32 alloc_to_len16;
+	__be32 eqid_pkd;
+	__be32 physeqid_pkd;
+	__be32 fetchszm_to_iqid;
+	__be32 dcaen_to_eqsize;
+	__be64 eqaddr;
+};
+
+#define FW_EQ_OFLD_CMD_PFN(x) ((x) << 8)
+#define FW_EQ_OFLD_CMD_VFN(x) ((x) << 0)
+
+#define FW_EQ_OFLD_CMD_ALLOC (1U << 31)
+#define FW_EQ_OFLD_CMD_FREE (1U << 30)
+#define FW_EQ_OFLD_CMD_MODIFY (1U << 29)
+#define FW_EQ_OFLD_CMD_EQSTART (1U << 28)
+#define FW_EQ_OFLD_CMD_EQSTOP (1U << 27)
+
+#define FW_EQ_OFLD_CMD_EQID(x) ((x) << 0)
+#define FW_EQ_OFLD_CMD_EQID_GET(x) (((x) >> 0) & 0xfffff)
+#define FW_EQ_OFLD_CMD_PHYSEQID_GET(x) (((x) >> 0) & 0xfffff)
+
+#define FW_EQ_OFLD_CMD_FETCHSZM(x) ((x) << 26)
+#define FW_EQ_OFLD_CMD_STATUSPGNS(x) ((x) << 25)
+#define FW_EQ_OFLD_CMD_STATUSPGRO(x) ((x) << 24)
+#define FW_EQ_OFLD_CMD_FETCHNS(x) ((x) << 23)
+#define FW_EQ_OFLD_CMD_FETCHRO(x) ((x) << 22)
+#define FW_EQ_OFLD_CMD_HOSTFCMODE(x) ((x) << 20)
+#define FW_EQ_OFLD_CMD_CPRIO(x) ((x) << 19)
+#define FW_EQ_OFLD_CMD_ONCHIP(x) ((x) << 18)
+#define FW_EQ_OFLD_CMD_PCIECHN(x) ((x) << 16)
+#define FW_EQ_OFLD_CMD_IQID(x) ((x) << 0)
+
+#define FW_EQ_OFLD_CMD_DCAEN(x) ((x) << 31)
+#define FW_EQ_OFLD_CMD_DCACPU(x) ((x) << 26)
+#define FW_EQ_OFLD_CMD_FBMIN(x) ((x) << 23)
+#define FW_EQ_OFLD_CMD_FBMAX(x) ((x) << 20)
+#define FW_EQ_OFLD_CMD_CIDXFTHRESHO(x) ((x) << 19)
+#define FW_EQ_OFLD_CMD_CIDXFTHRESH(x) ((x) << 16)
+#define FW_EQ_OFLD_CMD_EQSIZE(x) ((x) << 0)
+
+/*
+ * Macros for VIID parsing:
+ * VIID - [10:8] PFN, [7] VI Valid, [6:0] VI number
+ */
+#define FW_VIID_PFN_GET(x) (((x) >> 8) & 0x7)
+#define FW_VIID_VIVLD_GET(x) (((x) >> 7) & 0x1)
+#define FW_VIID_VIN_GET(x) (((x) >> 0) & 0x7F)
+
+struct fw_vi_cmd {
+	__be32 op_to_vfn;
+	__be32 alloc_to_len16;
+	__be16 type_viid;
+	u8 mac[6];
+	u8 portid_pkd;
+	u8 nmac;
+	u8 nmac0[6];
+	__be16 rsssize_pkd;
+	u8 nmac1[6];
+	__be16 idsiiq_pkd;
+	u8 nmac2[6];
+	__be16 idseiq_pkd;
+	u8 nmac3[6];
+	__be64 r9;
+	__be64 r10;
+};
+
+#define FW_VI_CMD_PFN(x) ((x) << 8)
+#define FW_VI_CMD_VFN(x) ((x) << 0)
+#define FW_VI_CMD_ALLOC (1U << 31)
+#define FW_VI_CMD_FREE (1U << 30)
+#define FW_VI_CMD_VIID(x) ((x) << 0)
+#define FW_VI_CMD_VIID_GET(x) ((x) & 0xfff)
+#define FW_VI_CMD_PORTID(x) ((x) << 4)
+#define FW_VI_CMD_PORTID_GET(x) (((x) >> 4) & 0xf)
+#define FW_VI_CMD_RSSSIZE_GET(x) (((x) >> 0) & 0x7ff)
+
+/* Special VI_MAC command index ids */
+#define FW_VI_MAC_ADD_MAC		0x3FF
+#define FW_VI_MAC_ADD_PERSIST_MAC	0x3FE
+#define FW_VI_MAC_MAC_BASED_FREE	0x3FD
+#define FW_CLS_TCAM_NUM_ENTRIES		336
+
+enum fw_vi_mac_smac {
+	FW_VI_MAC_MPS_TCAM_ENTRY,
+	FW_VI_MAC_MPS_TCAM_ONLY,
+	FW_VI_MAC_SMT_ONLY,
+	FW_VI_MAC_SMT_AND_MPSTCAM
+};
+
+enum fw_vi_mac_result {
+	FW_VI_MAC_R_SUCCESS,
+	FW_VI_MAC_R_F_NONEXISTENT_NOMEM,
+	FW_VI_MAC_R_SMAC_FAIL,
+	FW_VI_MAC_R_F_ACL_CHECK
+};
+
+struct fw_vi_mac_cmd {
+	__be32 op_to_viid;
+	__be32 freemacs_to_len16;
+	union fw_vi_mac {
+		struct fw_vi_mac_exact {
+			__be16 valid_to_idx;
+			u8 macaddr[6];
+		} exact[7];
+		struct fw_vi_mac_hash {
+			__be64 hashvec;
+		} hash;
+	} u;
+};
+
+#define FW_VI_MAC_CMD_VIID(x) ((x) << 0)
+#define FW_VI_MAC_CMD_FREEMACS(x) ((x) << 31)
+#define FW_VI_MAC_CMD_HASHVECEN (1U << 23)
+#define FW_VI_MAC_CMD_HASHUNIEN(x) ((x) << 22)
+#define FW_VI_MAC_CMD_VALID (1U << 15)
+#define FW_VI_MAC_CMD_PRIO(x) ((x) << 12)
+#define FW_VI_MAC_CMD_SMAC_RESULT(x) ((x) << 10)
+#define FW_VI_MAC_CMD_SMAC_RESULT_GET(x) (((x) >> 10) & 0x3)
+#define FW_VI_MAC_CMD_IDX(x) ((x) << 0)
+#define FW_VI_MAC_CMD_IDX_GET(x) (((x) >> 0) & 0x3ff)
+
+#define FW_RXMODE_MTU_NO_CHG	65535
+
+struct fw_vi_rxmode_cmd {
+	__be32 op_to_viid;
+	__be32 retval_len16;
+	__be32 mtu_to_vlanexen;
+	__be32 r4_lo;
+};
+
+#define FW_VI_RXMODE_CMD_VIID(x) ((x) << 0)
+#define FW_VI_RXMODE_CMD_MTU_MASK 0xffff
+#define FW_VI_RXMODE_CMD_MTU(x) ((x) << 16)
+#define FW_VI_RXMODE_CMD_PROMISCEN_MASK 0x3
+#define FW_VI_RXMODE_CMD_PROMISCEN(x) ((x) << 14)
+#define FW_VI_RXMODE_CMD_ALLMULTIEN_MASK 0x3
+#define FW_VI_RXMODE_CMD_ALLMULTIEN(x) ((x) << 12)
+#define FW_VI_RXMODE_CMD_BROADCASTEN_MASK 0x3
+#define FW_VI_RXMODE_CMD_BROADCASTEN(x) ((x) << 10)
+#define FW_VI_RXMODE_CMD_VLANEXEN_MASK 0x3
+#define FW_VI_RXMODE_CMD_VLANEXEN(x) ((x) << 8)
+
+struct fw_vi_enable_cmd {
+	__be32 op_to_viid;
+	__be32 ien_to_len16;
+	__be16 blinkdur;
+	__be16 r3;
+	__be32 r4;
+};
+
+#define FW_VI_ENABLE_CMD_VIID(x) ((x) << 0)
+#define FW_VI_ENABLE_CMD_IEN(x) ((x) << 31)
+#define FW_VI_ENABLE_CMD_EEN(x) ((x) << 30)
+#define FW_VI_ENABLE_CMD_DCB_INFO(x) ((x) << 28)
+#define FW_VI_ENABLE_CMD_LED (1U << 29)
+
+/* VI VF stats offset definitions */
+#define VI_VF_NUM_STATS	16
+enum fw_vi_stats_vf_index {
+	FW_VI_VF_STAT_TX_BCAST_BYTES_IX,
+	FW_VI_VF_STAT_TX_BCAST_FRAMES_IX,
+	FW_VI_VF_STAT_TX_MCAST_BYTES_IX,
+	FW_VI_VF_STAT_TX_MCAST_FRAMES_IX,
+	FW_VI_VF_STAT_TX_UCAST_BYTES_IX,
+	FW_VI_VF_STAT_TX_UCAST_FRAMES_IX,
+	FW_VI_VF_STAT_TX_DROP_FRAMES_IX,
+	FW_VI_VF_STAT_TX_OFLD_BYTES_IX,
+	FW_VI_VF_STAT_TX_OFLD_FRAMES_IX,
+	FW_VI_VF_STAT_RX_BCAST_BYTES_IX,
+	FW_VI_VF_STAT_RX_BCAST_FRAMES_IX,
+	FW_VI_VF_STAT_RX_MCAST_BYTES_IX,
+	FW_VI_VF_STAT_RX_MCAST_FRAMES_IX,
+	FW_VI_VF_STAT_RX_UCAST_BYTES_IX,
+	FW_VI_VF_STAT_RX_UCAST_FRAMES_IX,
+	FW_VI_VF_STAT_RX_ERR_FRAMES_IX
+};
+
+/* VI PF stats offset definitions */
+#define VI_PF_NUM_STATS	17
+enum fw_vi_stats_pf_index {
+	FW_VI_PF_STAT_TX_BCAST_BYTES_IX,
+	FW_VI_PF_STAT_TX_BCAST_FRAMES_IX,
+	FW_VI_PF_STAT_TX_MCAST_BYTES_IX,
+	FW_VI_PF_STAT_TX_MCAST_FRAMES_IX,
+	FW_VI_PF_STAT_TX_UCAST_BYTES_IX,
+	FW_VI_PF_STAT_TX_UCAST_FRAMES_IX,
+	FW_VI_PF_STAT_TX_OFLD_BYTES_IX,
+	FW_VI_PF_STAT_TX_OFLD_FRAMES_IX,
+	FW_VI_PF_STAT_RX_BYTES_IX,
+	FW_VI_PF_STAT_RX_FRAMES_IX,
+	FW_VI_PF_STAT_RX_BCAST_BYTES_IX,
+	FW_VI_PF_STAT_RX_BCAST_FRAMES_IX,
+	FW_VI_PF_STAT_RX_MCAST_BYTES_IX,
+	FW_VI_PF_STAT_RX_MCAST_FRAMES_IX,
+	FW_VI_PF_STAT_RX_UCAST_BYTES_IX,
+	FW_VI_PF_STAT_RX_UCAST_FRAMES_IX,
+	FW_VI_PF_STAT_RX_ERR_FRAMES_IX
+};
+
+struct fw_vi_stats_cmd {
+	__be32 op_to_viid;
+	__be32 retval_len16;
+	union fw_vi_stats {
+		struct fw_vi_stats_ctl {
+			__be16 nstats_ix;
+			__be16 r6;
+			__be32 r7;
+			__be64 stat0;
+			__be64 stat1;
+			__be64 stat2;
+			__be64 stat3;
+			__be64 stat4;
+			__be64 stat5;
+		} ctl;
+		struct fw_vi_stats_pf {
+			__be64 tx_bcast_bytes;
+			__be64 tx_bcast_frames;
+			__be64 tx_mcast_bytes;
+			__be64 tx_mcast_frames;
+			__be64 tx_ucast_bytes;
+			__be64 tx_ucast_frames;
+			__be64 tx_offload_bytes;
+			__be64 tx_offload_frames;
+			__be64 rx_pf_bytes;
+			__be64 rx_pf_frames;
+			__be64 rx_bcast_bytes;
+			__be64 rx_bcast_frames;
+			__be64 rx_mcast_bytes;
+			__be64 rx_mcast_frames;
+			__be64 rx_ucast_bytes;
+			__be64 rx_ucast_frames;
+			__be64 rx_err_frames;
+		} pf;
+		struct fw_vi_stats_vf {
+			__be64 tx_bcast_bytes;
+			__be64 tx_bcast_frames;
+			__be64 tx_mcast_bytes;
+			__be64 tx_mcast_frames;
+			__be64 tx_ucast_bytes;
+			__be64 tx_ucast_frames;
+			__be64 tx_drop_frames;
+			__be64 tx_offload_bytes;
+			__be64 tx_offload_frames;
+			__be64 rx_bcast_bytes;
+			__be64 rx_bcast_frames;
+			__be64 rx_mcast_bytes;
+			__be64 rx_mcast_frames;
+			__be64 rx_ucast_bytes;
+			__be64 rx_ucast_frames;
+			__be64 rx_err_frames;
+		} vf;
+	} u;
+};
+
+#define FW_VI_STATS_CMD_VIID(x) ((x) << 0)
+#define FW_VI_STATS_CMD_NSTATS(x) ((x) << 12)
+#define FW_VI_STATS_CMD_IX(x) ((x) << 0)
+
+struct fw_acl_mac_cmd {
+	__be32 op_to_vfn;
+	__be32 en_to_len16;
+	u8 nmac;
+	u8 r3[7];
+	__be16 r4;
+	u8 macaddr0[6];
+	__be16 r5;
+	u8 macaddr1[6];
+	__be16 r6;
+	u8 macaddr2[6];
+	__be16 r7;
+	u8 macaddr3[6];
+};
+
+#define FW_ACL_MAC_CMD_PFN(x) ((x) << 8)
+#define FW_ACL_MAC_CMD_VFN(x) ((x) << 0)
+#define FW_ACL_MAC_CMD_EN(x) ((x) << 31)
+
+struct fw_acl_vlan_cmd {
+	__be32 op_to_vfn;
+	__be32 en_to_len16;
+	u8 nvlan;
+	u8 dropnovlan_fm;
+	u8 r3_lo[6];
+	__be16 vlanid[16];
+};
+
+#define FW_ACL_VLAN_CMD_PFN(x) ((x) << 8)
+#define FW_ACL_VLAN_CMD_VFN(x) ((x) << 0)
+#define FW_ACL_VLAN_CMD_EN(x) ((x) << 31)
+#define FW_ACL_VLAN_CMD_DROPNOVLAN(x) ((x) << 7)
+#define FW_ACL_VLAN_CMD_FM(x) ((x) << 6)
+
+enum fw_port_cap {
+	FW_PORT_CAP_SPEED_100M		= 0x0001,
+	FW_PORT_CAP_SPEED_1G		= 0x0002,
+	FW_PORT_CAP_SPEED_2_5G		= 0x0004,
+	FW_PORT_CAP_SPEED_10G		= 0x0008,
+	FW_PORT_CAP_SPEED_40G		= 0x0010,
+	FW_PORT_CAP_SPEED_100G		= 0x0020,
+	FW_PORT_CAP_FC_RX		= 0x0040,
+	FW_PORT_CAP_FC_TX		= 0x0080,
+	FW_PORT_CAP_ANEG		= 0x0100,
+	FW_PORT_CAP_MDI_0		= 0x0200,
+	FW_PORT_CAP_MDI_1		= 0x0400,
+	FW_PORT_CAP_BEAN		= 0x0800,
+	FW_PORT_CAP_PMA_LPBK		= 0x1000,
+	FW_PORT_CAP_PCS_LPBK		= 0x2000,
+	FW_PORT_CAP_PHYXS_LPBK		= 0x4000,
+	FW_PORT_CAP_FAR_END_LPBK	= 0x8000,
+};
+
+enum fw_port_mdi {
+	FW_PORT_MDI_UNCHANGED,
+	FW_PORT_MDI_AUTO,
+	FW_PORT_MDI_F_STRAIGHT,
+	FW_PORT_MDI_F_CROSSOVER
+};
+
+#define FW_PORT_MDI(x) ((x) << 9)
+
+enum fw_port_action {
+	FW_PORT_ACTION_L1_CFG		= 0x0001,
+	FW_PORT_ACTION_L2_CFG		= 0x0002,
+	FW_PORT_ACTION_GET_PORT_INFO	= 0x0003,
+	FW_PORT_ACTION_L2_PPP_CFG	= 0x0004,
+	FW_PORT_ACTION_L2_DCB_CFG	= 0x0005,
+	FW_PORT_ACTION_DCB_READ_TRANS	= 0x0006,
+	FW_PORT_ACTION_DCB_READ_RECV	= 0x0007,
+	FW_PORT_ACTION_DCB_READ_DET	= 0x0008,
+	FW_PORT_ACTION_LOW_PWR_TO_NORMAL = 0x0010,
+	FW_PORT_ACTION_L1_LOW_PWR_EN	= 0x0011,
+	FW_PORT_ACTION_L2_WOL_MODE_EN	= 0x0012,
+	FW_PORT_ACTION_LPBK_TO_NORMAL	= 0x0020,
+	FW_PORT_ACTION_L1_LPBK		= 0x0021,
+	FW_PORT_ACTION_L1_PMA_LPBK	= 0x0022,
+	FW_PORT_ACTION_L1_PCS_LPBK	= 0x0023,
+	FW_PORT_ACTION_L1_PHYXS_CSIDE_LPBK = 0x0024,
+	FW_PORT_ACTION_L1_PHYXS_ESIDE_LPBK = 0x0025,
+	FW_PORT_ACTION_PHY_RESET	= 0x0040,
+	FW_PORT_ACTION_PMA_RESET	= 0x0041,
+	FW_PORT_ACTION_PCS_RESET	= 0x0042,
+	FW_PORT_ACTION_PHYXS_RESET	= 0x0043,
+	FW_PORT_ACTION_DTEXS_REEST	= 0x0044,
+	FW_PORT_ACTION_AN_RESET		= 0x0045
+};
+
+enum fw_port_l2cfg_ctlbf {
+	FW_PORT_L2_CTLBF_OVLAN0	= 0x01,
+	FW_PORT_L2_CTLBF_OVLAN1	= 0x02,
+	FW_PORT_L2_CTLBF_OVLAN2	= 0x04,
+	FW_PORT_L2_CTLBF_OVLAN3	= 0x08,
+	FW_PORT_L2_CTLBF_IVLAN	= 0x10,
+	FW_PORT_L2_CTLBF_TXIPG	= 0x20
+};
+
+enum fw_port_dcb_versions {
+	FW_PORT_DCB_VER_UNKNOWN,
+	FW_PORT_DCB_VER_CEE1D0,
+	FW_PORT_DCB_VER_CEE1D01,
+	FW_PORT_DCB_VER_IEEE,
+	FW_PORT_DCB_VER_AUTO = 7
+};
+
+enum fw_port_dcb_cfg {
+	FW_PORT_DCB_CFG_PG	= 0x01,
+	FW_PORT_DCB_CFG_PFC	= 0x02,
+	FW_PORT_DCB_CFG_APPL	= 0x04
+};
+
+enum fw_port_dcb_cfg_rc {
+	FW_PORT_DCB_CFG_SUCCESS	= 0x0,
+	FW_PORT_DCB_CFG_ERROR	= 0x1
+};
+
+enum fw_port_dcb_type {
+	FW_PORT_DCB_TYPE_PGID		= 0x00,
+	FW_PORT_DCB_TYPE_PGRATE		= 0x01,
+	FW_PORT_DCB_TYPE_PRIORATE	= 0x02,
+	FW_PORT_DCB_TYPE_PFC		= 0x03,
+	FW_PORT_DCB_TYPE_APP_ID		= 0x04,
+	FW_PORT_DCB_TYPE_CONTROL	= 0x05,
+};
+
+enum fw_port_dcb_feature_state {
+	FW_PORT_DCB_FEATURE_STATE_PENDING = 0x0,
+	FW_PORT_DCB_FEATURE_STATE_SUCCESS = 0x1,
+	FW_PORT_DCB_FEATURE_STATE_ERROR	= 0x2,
+	FW_PORT_DCB_FEATURE_STATE_TIMEOUT = 0x3,
+};
+
+struct fw_port_cmd {
+	__be32 op_to_portid;
+	__be32 action_to_len16;
+	union fw_port {
+		struct fw_port_l1cfg {
+			__be32 rcap;
+			__be32 r;
+		} l1cfg;
+		struct fw_port_l2cfg {
+			__u8   ctlbf;
+			__u8   ovlan3_to_ivlan0;
+			__be16 ivlantype;
+			__be16 txipg_force_pinfo;
+			__be16 mtu;
+			__be16 ovlan0mask;
+			__be16 ovlan0type;
+			__be16 ovlan1mask;
+			__be16 ovlan1type;
+			__be16 ovlan2mask;
+			__be16 ovlan2type;
+			__be16 ovlan3mask;
+			__be16 ovlan3type;
+		} l2cfg;
+		struct fw_port_info {
+			__be32 lstatus_to_modtype;
+			__be16 pcap;
+			__be16 acap;
+			__be16 mtu;
+			__u8   cbllen;
+			__u8   auxlinfo;
+			__u8   dcbxdis_pkd;
+			__u8   r8_lo[3];
+			__be64 r9;
+		} info;
+		struct fw_port_diags {
+			__u8   diagop;
+			__u8   r[3];
+			__be32 diagval;
+		} diags;
+		union fw_port_dcb {
+			struct fw_port_dcb_pgid {
+				__u8   type;
+				__u8   apply_pkd;
+				__u8   r10_lo[2];
+				__be32 pgid;
+				__be64 r11;
+			} pgid;
+			struct fw_port_dcb_pgrate {
+				__u8   type;
+				__u8   apply_pkd;
+				__u8   r10_lo[5];
+				__u8   num_tcs_supported;
+				__u8   pgrate[8];
+				__u8   tsa[8];
+			} pgrate;
+			struct fw_port_dcb_priorate {
+				__u8   type;
+				__u8   apply_pkd;
+				__u8   r10_lo[6];
+				__u8   strict_priorate[8];
+			} priorate;
+			struct fw_port_dcb_pfc {
+				__u8   type;
+				__u8   pfcen;
+				__u8   r10[5];
+				__u8   max_pfc_tcs;
+				__be64 r11;
+			} pfc;
+			struct fw_port_app_priority {
+				__u8   type;
+				__u8   r10[2];
+				__u8   idx;
+				__u8   user_prio_map;
+				__u8   sel_field;
+				__be16 protocolid;
+				__be64 r12;
+			} app_priority;
+			struct fw_port_dcb_control {
+				__u8   type;
+				__u8   all_syncd_pkd;
+				__be16 dcb_version_to_app_state;
+				__be32 r11;
+				__be64 r12;
+			} control;
+		} dcb;
+	} u;
+};
+
+#define FW_PORT_CMD_READ (1U << 22)
+
+#define FW_PORT_CMD_PORTID(x) ((x) << 0)
+#define FW_PORT_CMD_PORTID_GET(x) (((x) >> 0) & 0xf)
+
+#define FW_PORT_CMD_ACTION(x) ((x) << 16)
+#define FW_PORT_CMD_ACTION_GET(x) (((x) >> 16) & 0xffff)
+
+#define FW_PORT_CMD_CTLBF(x) ((x) << 10)
+#define FW_PORT_CMD_OVLAN3(x) ((x) << 7)
+#define FW_PORT_CMD_OVLAN2(x) ((x) << 6)
+#define FW_PORT_CMD_OVLAN1(x) ((x) << 5)
+#define FW_PORT_CMD_OVLAN0(x) ((x) << 4)
+#define FW_PORT_CMD_IVLAN0(x) ((x) << 3)
+
+#define FW_PORT_CMD_TXIPG(x) ((x) << 19)
+
+#define FW_PORT_CMD_LSTATUS (1U << 31)
+#define FW_PORT_CMD_LSTATUS_GET(x) (((x) >> 31) & 0x1)
+#define FW_PORT_CMD_LSPEED(x) ((x) << 24)
+#define FW_PORT_CMD_LSPEED_GET(x) (((x) >> 24) & 0x3f)
+#define FW_PORT_CMD_TXPAUSE (1U << 23)
+#define FW_PORT_CMD_RXPAUSE (1U << 22)
+#define FW_PORT_CMD_MDIOCAP (1U << 21)
+#define FW_PORT_CMD_MDIOADDR_GET(x) (((x) >> 16) & 0x1f)
+#define FW_PORT_CMD_LPTXPAUSE (1U << 15)
+#define FW_PORT_CMD_LPRXPAUSE (1U << 14)
+#define FW_PORT_CMD_PTYPE_MASK 0x1f
+#define FW_PORT_CMD_PTYPE_GET(x) (((x) >> 8) & FW_PORT_CMD_PTYPE_MASK)
+#define FW_PORT_CMD_MODTYPE_MASK 0x1f
+#define FW_PORT_CMD_MODTYPE_GET(x) (((x) >> 0) & FW_PORT_CMD_MODTYPE_MASK)
+
+#define FW_PORT_CMD_DCBXDIS (1U << 7)
+#define FW_PORT_CMD_APPLY (1U <<  7)
+#define FW_PORT_CMD_ALL_SYNCD (1U << 7)
+#define FW_PORT_CMD_DCB_VERSION_GET(x) (((x) >> 8) & 0xf)
+
+#define FW_PORT_CMD_PPPEN(x) ((x) << 31)
+#define FW_PORT_CMD_TPSRC(x) ((x) << 28)
+#define FW_PORT_CMD_NCSISRC(x) ((x) << 24)
+
+#define FW_PORT_CMD_CH0(x) ((x) << 20)
+#define FW_PORT_CMD_CH1(x) ((x) << 16)
+#define FW_PORT_CMD_CH2(x) ((x) << 12)
+#define FW_PORT_CMD_CH3(x) ((x) << 8)
+#define FW_PORT_CMD_NCSICH(x) ((x) << 4)
+
+enum fw_port_type {
+	FW_PORT_TYPE_FIBER_XFI,
+	FW_PORT_TYPE_FIBER_XAUI,
+	FW_PORT_TYPE_BT_SGMII,
+	FW_PORT_TYPE_BT_XFI,
+	FW_PORT_TYPE_BT_XAUI,
+	FW_PORT_TYPE_KX4,
+	FW_PORT_TYPE_CX4,
+	FW_PORT_TYPE_KX,
+	FW_PORT_TYPE_KR,
+	FW_PORT_TYPE_SFP,
+	FW_PORT_TYPE_BP_AP,
+	FW_PORT_TYPE_BP4_AP,
+	FW_PORT_TYPE_QSFP_10G,
+	FW_PORT_TYPE_QSFP,
+	FW_PORT_TYPE_BP40_BA,
+
+	FW_PORT_TYPE_NONE = FW_PORT_CMD_PTYPE_MASK
+};
+
+enum fw_port_module_type {
+	FW_PORT_MOD_TYPE_NA,
+	FW_PORT_MOD_TYPE_LR,
+	FW_PORT_MOD_TYPE_SR,
+	FW_PORT_MOD_TYPE_ER,
+	FW_PORT_MOD_TYPE_TWINAX_PASSIVE,
+	FW_PORT_MOD_TYPE_TWINAX_ACTIVE,
+	FW_PORT_MOD_TYPE_LRM,
+	FW_PORT_MOD_TYPE_ERROR		= FW_PORT_CMD_MODTYPE_MASK - 3,
+	FW_PORT_MOD_TYPE_UNKNOWN	= FW_PORT_CMD_MODTYPE_MASK - 2,
+	FW_PORT_MOD_TYPE_NOTSUPPORTED	= FW_PORT_CMD_MODTYPE_MASK - 1,
+
+	FW_PORT_MOD_TYPE_NONE = FW_PORT_CMD_MODTYPE_MASK
+};
+
+enum fw_port_mod_sub_type {
+	FW_PORT_MOD_SUB_TYPE_NA,
+	FW_PORT_MOD_SUB_TYPE_MV88E114X = 0x1,
+	FW_PORT_MOD_SUB_TYPE_TN8022 = 0x2,
+	FW_PORT_MOD_SUB_TYPE_AQ1202 = 0x3,
+	FW_PORT_MOD_SUB_TYPE_88x3120 = 0x4,
+	FW_PORT_MOD_SUB_TYPE_BCM84834 = 0x5,
+	FW_PORT_MOD_SUB_TYPE_BT_VSC8634 = 0x8,
+
+	/* The following will never been in the VPD.  They are TWINAX cable
+	 * lengths decoded from SFP+ module i2c PROMs.  These should
+	 * almost certainly go somewhere else ...
+	 */
+	FW_PORT_MOD_SUB_TYPE_TWINAX_1 = 0x9,
+	FW_PORT_MOD_SUB_TYPE_TWINAX_3 = 0xA,
+	FW_PORT_MOD_SUB_TYPE_TWINAX_5 = 0xB,
+	FW_PORT_MOD_SUB_TYPE_TWINAX_7 = 0xC,
+};
+
+/* port stats */
+#define FW_NUM_PORT_STATS 50
+#define FW_NUM_PORT_TX_STATS 23
+#define FW_NUM_PORT_RX_STATS 27
+
+enum fw_port_stats_tx_index {
+	FW_STAT_TX_PORT_BYTES_IX,
+	FW_STAT_TX_PORT_FRAMES_IX,
+	FW_STAT_TX_PORT_BCAST_IX,
+	FW_STAT_TX_PORT_MCAST_IX,
+	FW_STAT_TX_PORT_UCAST_IX,
+	FW_STAT_TX_PORT_ERROR_IX,
+	FW_STAT_TX_PORT_64B_IX,
+	FW_STAT_TX_PORT_65B_127B_IX,
+	FW_STAT_TX_PORT_128B_255B_IX,
+	FW_STAT_TX_PORT_256B_511B_IX,
+	FW_STAT_TX_PORT_512B_1023B_IX,
+	FW_STAT_TX_PORT_1024B_1518B_IX,
+	FW_STAT_TX_PORT_1519B_MAX_IX,
+	FW_STAT_TX_PORT_DROP_IX,
+	FW_STAT_TX_PORT_PAUSE_IX,
+	FW_STAT_TX_PORT_PPP0_IX,
+	FW_STAT_TX_PORT_PPP1_IX,
+	FW_STAT_TX_PORT_PPP2_IX,
+	FW_STAT_TX_PORT_PPP3_IX,
+	FW_STAT_TX_PORT_PPP4_IX,
+	FW_STAT_TX_PORT_PPP5_IX,
+	FW_STAT_TX_PORT_PPP6_IX,
+	FW_STAT_TX_PORT_PPP7_IX
+};
+
+enum fw_port_stat_rx_index {
+	FW_STAT_RX_PORT_BYTES_IX,
+	FW_STAT_RX_PORT_FRAMES_IX,
+	FW_STAT_RX_PORT_BCAST_IX,
+	FW_STAT_RX_PORT_MCAST_IX,
+	FW_STAT_RX_PORT_UCAST_IX,
+	FW_STAT_RX_PORT_MTU_ERROR_IX,
+	FW_STAT_RX_PORT_MTU_CRC_ERROR_IX,
+	FW_STAT_RX_PORT_CRC_ERROR_IX,
+	FW_STAT_RX_PORT_LEN_ERROR_IX,
+	FW_STAT_RX_PORT_SYM_ERROR_IX,
+	FW_STAT_RX_PORT_64B_IX,
+	FW_STAT_RX_PORT_65B_127B_IX,
+	FW_STAT_RX_PORT_128B_255B_IX,
+	FW_STAT_RX_PORT_256B_511B_IX,
+	FW_STAT_RX_PORT_512B_1023B_IX,
+	FW_STAT_RX_PORT_1024B_1518B_IX,
+	FW_STAT_RX_PORT_1519B_MAX_IX,
+	FW_STAT_RX_PORT_PAUSE_IX,
+	FW_STAT_RX_PORT_PPP0_IX,
+	FW_STAT_RX_PORT_PPP1_IX,
+	FW_STAT_RX_PORT_PPP2_IX,
+	FW_STAT_RX_PORT_PPP3_IX,
+	FW_STAT_RX_PORT_PPP4_IX,
+	FW_STAT_RX_PORT_PPP5_IX,
+	FW_STAT_RX_PORT_PPP6_IX,
+	FW_STAT_RX_PORT_PPP7_IX,
+	FW_STAT_RX_PORT_LESS_64B_IX
+};
+
+struct fw_port_stats_cmd {
+	__be32 op_to_portid;
+	__be32 retval_len16;
+	union fw_port_stats {
+		struct fw_port_stats_ctl {
+			u8 nstats_bg_bm;
+			u8 tx_ix;
+			__be16 r6;
+			__be32 r7;
+			__be64 stat0;
+			__be64 stat1;
+			__be64 stat2;
+			__be64 stat3;
+			__be64 stat4;
+			__be64 stat5;
+		} ctl;
+		struct fw_port_stats_all {
+			__be64 tx_bytes;
+			__be64 tx_frames;
+			__be64 tx_bcast;
+			__be64 tx_mcast;
+			__be64 tx_ucast;
+			__be64 tx_error;
+			__be64 tx_64b;
+			__be64 tx_65b_127b;
+			__be64 tx_128b_255b;
+			__be64 tx_256b_511b;
+			__be64 tx_512b_1023b;
+			__be64 tx_1024b_1518b;
+			__be64 tx_1519b_max;
+			__be64 tx_drop;
+			__be64 tx_pause;
+			__be64 tx_ppp0;
+			__be64 tx_ppp1;
+			__be64 tx_ppp2;
+			__be64 tx_ppp3;
+			__be64 tx_ppp4;
+			__be64 tx_ppp5;
+			__be64 tx_ppp6;
+			__be64 tx_ppp7;
+			__be64 rx_bytes;
+			__be64 rx_frames;
+			__be64 rx_bcast;
+			__be64 rx_mcast;
+			__be64 rx_ucast;
+			__be64 rx_mtu_error;
+			__be64 rx_mtu_crc_error;
+			__be64 rx_crc_error;
+			__be64 rx_len_error;
+			__be64 rx_sym_error;
+			__be64 rx_64b;
+			__be64 rx_65b_127b;
+			__be64 rx_128b_255b;
+			__be64 rx_256b_511b;
+			__be64 rx_512b_1023b;
+			__be64 rx_1024b_1518b;
+			__be64 rx_1519b_max;
+			__be64 rx_pause;
+			__be64 rx_ppp0;
+			__be64 rx_ppp1;
+			__be64 rx_ppp2;
+			__be64 rx_ppp3;
+			__be64 rx_ppp4;
+			__be64 rx_ppp5;
+			__be64 rx_ppp6;
+			__be64 rx_ppp7;
+			__be64 rx_less_64b;
+			__be64 rx_bg_drop;
+			__be64 rx_bg_trunc;
+		} all;
+	} u;
+};
+
+#define FW_PORT_STATS_CMD_NSTATS(x) ((x) << 4)
+#define FW_PORT_STATS_CMD_BG_BM(x) ((x) << 0)
+#define FW_PORT_STATS_CMD_TX(x) ((x) << 7)
+#define FW_PORT_STATS_CMD_IX(x) ((x) << 0)
+
+/* port loopback stats */
+#define FW_NUM_LB_STATS 16
+enum fw_port_lb_stats_index {
+	FW_STAT_LB_PORT_BYTES_IX,
+	FW_STAT_LB_PORT_FRAMES_IX,
+	FW_STAT_LB_PORT_BCAST_IX,
+	FW_STAT_LB_PORT_MCAST_IX,
+	FW_STAT_LB_PORT_UCAST_IX,
+	FW_STAT_LB_PORT_ERROR_IX,
+	FW_STAT_LB_PORT_64B_IX,
+	FW_STAT_LB_PORT_65B_127B_IX,
+	FW_STAT_LB_PORT_128B_255B_IX,
+	FW_STAT_LB_PORT_256B_511B_IX,
+	FW_STAT_LB_PORT_512B_1023B_IX,
+	FW_STAT_LB_PORT_1024B_1518B_IX,
+	FW_STAT_LB_PORT_1519B_MAX_IX,
+	FW_STAT_LB_PORT_DROP_FRAMES_IX
+};
+
+struct fw_port_lb_stats_cmd {
+	__be32 op_to_lbport;
+	__be32 retval_len16;
+	union fw_port_lb_stats {
+		struct fw_port_lb_stats_ctl {
+			u8 nstats_bg_bm;
+			u8 ix_pkd;
+			__be16 r6;
+			__be32 r7;
+			__be64 stat0;
+			__be64 stat1;
+			__be64 stat2;
+			__be64 stat3;
+			__be64 stat4;
+			__be64 stat5;
+		} ctl;
+		struct fw_port_lb_stats_all {
+			__be64 tx_bytes;
+			__be64 tx_frames;
+			__be64 tx_bcast;
+			__be64 tx_mcast;
+			__be64 tx_ucast;
+			__be64 tx_error;
+			__be64 tx_64b;
+			__be64 tx_65b_127b;
+			__be64 tx_128b_255b;
+			__be64 tx_256b_511b;
+			__be64 tx_512b_1023b;
+			__be64 tx_1024b_1518b;
+			__be64 tx_1519b_max;
+			__be64 rx_lb_drop;
+			__be64 rx_lb_trunc;
+		} all;
+	} u;
+};
+
+#define FW_PORT_LB_STATS_CMD_LBPORT(x) ((x) << 0)
+#define FW_PORT_LB_STATS_CMD_NSTATS(x) ((x) << 4)
+#define FW_PORT_LB_STATS_CMD_BG_BM(x) ((x) << 0)
+#define FW_PORT_LB_STATS_CMD_IX(x) ((x) << 0)
+
+struct fw_rss_ind_tbl_cmd {
+	__be32 op_to_viid;
+#define FW_RSS_IND_TBL_CMD_VIID(x) ((x) << 0)
+	__be32 retval_len16;
+	__be16 niqid;
+	__be16 startidx;
+	__be32 r3;
+	__be32 iq0_to_iq2;
+#define FW_RSS_IND_TBL_CMD_IQ0(x) ((x) << 20)
+#define FW_RSS_IND_TBL_CMD_IQ1(x) ((x) << 10)
+#define FW_RSS_IND_TBL_CMD_IQ2(x) ((x) << 0)
+	__be32 iq3_to_iq5;
+	__be32 iq6_to_iq8;
+	__be32 iq9_to_iq11;
+	__be32 iq12_to_iq14;
+	__be32 iq15_to_iq17;
+	__be32 iq18_to_iq20;
+	__be32 iq21_to_iq23;
+	__be32 iq24_to_iq26;
+	__be32 iq27_to_iq29;
+	__be32 iq30_iq31;
+	__be32 r15_lo;
+};
+
+struct fw_rss_glb_config_cmd {
+	__be32 op_to_write;
+	__be32 retval_len16;
+	union fw_rss_glb_config {
+		struct fw_rss_glb_config_manual {
+			__be32 mode_pkd;
+			__be32 r3;
+			__be64 r4;
+			__be64 r5;
+		} manual;
+		struct fw_rss_glb_config_basicvirtual {
+			__be32 mode_pkd;
+			__be32 synmapen_to_hashtoeplitz;
+#define FW_RSS_GLB_CONFIG_CMD_SYNMAPEN      (1U << 8)
+#define FW_RSS_GLB_CONFIG_CMD_SYN4TUPENIPV6 (1U << 7)
+#define FW_RSS_GLB_CONFIG_CMD_SYN2TUPENIPV6 (1U << 6)
+#define FW_RSS_GLB_CONFIG_CMD_SYN4TUPENIPV4 (1U << 5)
+#define FW_RSS_GLB_CONFIG_CMD_SYN2TUPENIPV4 (1U << 4)
+#define FW_RSS_GLB_CONFIG_CMD_OFDMAPEN      (1U << 3)
+#define FW_RSS_GLB_CONFIG_CMD_TNLMAPEN      (1U << 2)
+#define FW_RSS_GLB_CONFIG_CMD_TNLALLLKP     (1U << 1)
+#define FW_RSS_GLB_CONFIG_CMD_HASHTOEPLITZ  (1U << 0)
+			__be64 r8;
+			__be64 r9;
+		} basicvirtual;
+	} u;
+};
+
+#define FW_RSS_GLB_CONFIG_CMD_MODE(x)	((x) << 28)
+#define FW_RSS_GLB_CONFIG_CMD_MODE_GET(x) (((x) >> 28) & 0xf)
+
+#define FW_RSS_GLB_CONFIG_CMD_MODE_MANUAL	0
+#define FW_RSS_GLB_CONFIG_CMD_MODE_BASICVIRTUAL	1
+
+struct fw_rss_vi_config_cmd {
+	__be32 op_to_viid;
+#define FW_RSS_VI_CONFIG_CMD_VIID(x) ((x) << 0)
+	__be32 retval_len16;
+	union fw_rss_vi_config {
+		struct fw_rss_vi_config_manual {
+			__be64 r3;
+			__be64 r4;
+			__be64 r5;
+		} manual;
+		struct fw_rss_vi_config_basicvirtual {
+			__be32 r6;
+			__be32 defaultq_to_udpen;
+#define FW_RSS_VI_CONFIG_CMD_DEFAULTQ(x)  ((x) << 16)
+#define FW_RSS_VI_CONFIG_CMD_DEFAULTQ_GET(x) (((x) >> 16) & 0x3ff)
+#define FW_RSS_VI_CONFIG_CMD_IP6FOURTUPEN (1U << 4)
+#define FW_RSS_VI_CONFIG_CMD_IP6TWOTUPEN  (1U << 3)
+#define FW_RSS_VI_CONFIG_CMD_IP4FOURTUPEN (1U << 2)
+#define FW_RSS_VI_CONFIG_CMD_IP4TWOTUPEN  (1U << 1)
+#define FW_RSS_VI_CONFIG_CMD_UDPEN        (1U << 0)
+			__be64 r9;
+			__be64 r10;
+		} basicvirtual;
+	} u;
+};
+
+struct fw_clip_cmd {
+	__be32 op_to_write;
+	__be32 alloc_to_len16;
+	__be64 ip_hi;
+	__be64 ip_lo;
+	__be32 r4[2];
+};
+
+#define S_FW_CLIP_CMD_ALLOC     31
+#define M_FW_CLIP_CMD_ALLOC     0x1
+#define V_FW_CLIP_CMD_ALLOC(x)  ((x) << S_FW_CLIP_CMD_ALLOC)
+#define G_FW_CLIP_CMD_ALLOC(x)  \
+	(((x) >> S_FW_CLIP_CMD_ALLOC) & M_FW_CLIP_CMD_ALLOC)
+#define F_FW_CLIP_CMD_ALLOC     V_FW_CLIP_CMD_ALLOC(1U)
+
+#define S_FW_CLIP_CMD_FREE      30
+#define M_FW_CLIP_CMD_FREE      0x1
+#define V_FW_CLIP_CMD_FREE(x)   ((x) << S_FW_CLIP_CMD_FREE)
+#define G_FW_CLIP_CMD_FREE(x)   \
+	(((x) >> S_FW_CLIP_CMD_FREE) & M_FW_CLIP_CMD_FREE)
+#define F_FW_CLIP_CMD_FREE      V_FW_CLIP_CMD_FREE(1U)
+
+enum fw_error_type {
+	FW_ERROR_TYPE_EXCEPTION		= 0x0,
+	FW_ERROR_TYPE_HWMODULE		= 0x1,
+	FW_ERROR_TYPE_WR		= 0x2,
+	FW_ERROR_TYPE_ACL		= 0x3,
+};
+
+struct fw_error_cmd {
+	__be32 op_to_type;
+	__be32 len16_pkd;
+	union fw_error {
+		struct fw_error_exception {
+			__be32 info[6];
+		} exception;
+		struct fw_error_hwmodule {
+			__be32 regaddr;
+			__be32 regval;
+		} hwmodule;
+		struct fw_error_wr {
+			__be16 cidx;
+			__be16 pfn_vfn;
+			__be32 eqid;
+			u8 wrhdr[16];
+		} wr;
+		struct fw_error_acl {
+			__be16 cidx;
+			__be16 pfn_vfn;
+			__be32 eqid;
+			__be16 mv_pkd;
+			u8 val[6];
+			__be64 r4;
+		} acl;
+	} u;
+};
+
+struct fw_debug_cmd {
+	__be32 op_type;
+#define FW_DEBUG_CMD_TYPE_GET(x) ((x) & 0xff)
+	__be32 len16_pkd;
+	union fw_debug {
+		struct fw_debug_assert {
+			__be32 fcid;
+			__be32 line;
+			__be32 x;
+			__be32 y;
+			u8 filename_0_7[8];
+			u8 filename_8_15[8];
+			__be64 r3;
+		} assert;
+		struct fw_debug_prt {
+			__be16 dprtstridx;
+			__be16 r3[3];
+			__be32 dprtstrparam0;
+			__be32 dprtstrparam1;
+			__be32 dprtstrparam2;
+			__be32 dprtstrparam3;
+		} prt;
+	} u;
+};
+
+#define FW_PCIE_FW_ERR           (1U << 31)
+#define FW_PCIE_FW_INIT          (1U << 30)
+#define FW_PCIE_FW_HALT          (1U << 29)
+#define FW_PCIE_FW_MASTER_VLD    (1U << 15)
+#define FW_PCIE_FW_MASTER_MASK   0x7
+#define FW_PCIE_FW_MASTER_SHIFT  12
+#define FW_PCIE_FW_MASTER(x)     ((x) << FW_PCIE_FW_MASTER_SHIFT)
+#define FW_PCIE_FW_MASTER_GET(x) (((x) >> FW_PCIE_FW_MASTER_SHIFT) & \
+				 FW_PCIE_FW_MASTER_MASK)
+#define FW_PCIE_FW_EVAL_MASK   0x7
+#define FW_PCIE_FW_EVAL_SHIFT  24
+#define FW_PCIE_FW_EVAL_GET(x) (((x) >> FW_PCIE_FW_EVAL_SHIFT) & \
+				 FW_PCIE_FW_EVAL_MASK)
+
+struct fw_hdr {
+	u8 ver;
+	u8 chip;			/* terminator chip type */
+	__be16	len512;			/* bin length in units of 512-bytes */
+	__be32	fw_ver;			/* firmware version */
+	__be32	tp_microcode_ver;
+	u8 intfver_nic;
+	u8 intfver_vnic;
+	u8 intfver_ofld;
+	u8 intfver_ri;
+	u8 intfver_iscsipdu;
+	u8 intfver_iscsi;
+	u8 intfver_fcoepdu;
+	u8 intfver_fcoe;
+	__u32   reserved2;
+	__u32   reserved3;
+	__u32   reserved4;
+	__be32  flags;
+	__be32  reserved6[23];
+};
+
+enum fw_hdr_chip {
+	FW_HDR_CHIP_T4,
+	FW_HDR_CHIP_T5
+};
+
+#define FW_HDR_FW_VER_MAJOR_GET(x) (((x) >> 24) & 0xff)
+#define FW_HDR_FW_VER_MINOR_GET(x) (((x) >> 16) & 0xff)
+#define FW_HDR_FW_VER_MICRO_GET(x) (((x) >> 8) & 0xff)
+#define FW_HDR_FW_VER_BUILD_GET(x) (((x) >> 0) & 0xff)
+
+enum fw_hdr_intfver {
+	FW_HDR_INTFVER_NIC      = 0x00,
+	FW_HDR_INTFVER_VNIC     = 0x00,
+	FW_HDR_INTFVER_OFLD     = 0x00,
+	FW_HDR_INTFVER_RI       = 0x00,
+	FW_HDR_INTFVER_ISCSIPDU = 0x00,
+	FW_HDR_INTFVER_ISCSI    = 0x00,
+	FW_HDR_INTFVER_FCOEPDU  = 0x00,
+	FW_HDR_INTFVER_FCOE     = 0x00,
+};
+
+enum fw_hdr_flags {
+	FW_HDR_FLAGS_RESET_HALT = 0x00000001,
+};
+
+#endif /* _T4FW_INTERFACE_H_ */
diff --git a/drivers/net/ethernet/emulex/benet/Kconfig b/drivers/net/ethernet/emulex/benet/Kconfig
new file mode 100644
index 0000000..ea94a8e
--- /dev/null
+++ b/drivers/net/ethernet/emulex/benet/Kconfig
@@ -0,0 +1,14 @@
+config BE2NET
+	tristate "ServerEngines' 10Gbps NIC - BladeEngine"
+	depends on PCI
+	---help---
+	  This driver implements the NIC functionality for ServerEngines'
+	  10Gbps network adapter - BladeEngine.
+
+config BE2NET_VXLAN
+        bool "VXLAN offload support on be2net driver"
+        default y
+        depends on BE2NET && VXLAN && !(BE2NET=y && VXLAN=m)
+        ---help---
+	  Say Y here if you want to enable VXLAN offload support on
+	  be2net driver.
diff --git a/drivers/net/ethernet/emulex/benet/Makefile b/drivers/net/ethernet/emulex/benet/Makefile
new file mode 100644
index 0000000..1a91b27
--- /dev/null
+++ b/drivers/net/ethernet/emulex/benet/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile to build the network driver for ServerEngine's BladeEngine.
+#
+
+obj-$(CONFIG_BE2NET) += be2net.o
+
+be2net-y :=  be_main.o be_cmds.o be_ethtool.o be_roce.o
diff --git a/drivers/net/ethernet/emulex/benet/be.h b/drivers/net/ethernet/emulex/benet/be.h
new file mode 100644
index 0000000..9a2d752
--- /dev/null
+++ b/drivers/net/ethernet/emulex/benet/be.h
@@ -0,0 +1,926 @@
+/*
+ * Copyright (C) 2005 - 2014 Emulex
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.  The full GNU General
+ * Public License is included in this distribution in the file called COPYING.
+ *
+ * Contact Information:
+ * linux-drivers@emulex.com
+ *
+ * Emulex
+ * 3333 Susan Street
+ * Costa Mesa, CA 92626
+ */
+
+#ifndef BE_H
+#define BE_H
+
+#include <linux/pci.h>
+#include <linux/etherdevice.h>
+#include <linux/delay.h>
+#include <net/tcp.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <linux/if_vlan.h>
+#include <linux/workqueue.h>
+#include <linux/interrupt.h>
+#include <linux/firmware.h>
+#include <linux/slab.h>
+#include <linux/u64_stats_sync.h>
+
+#include "be_hw.h"
+#include "be_roce.h"
+
+#define DRV_VER			"10.4u"
+#define DRV_NAME		"be2net"
+#define BE_NAME			"Emulex BladeEngine2"
+#define BE3_NAME		"Emulex BladeEngine3"
+#define OC_NAME			"Emulex OneConnect"
+#define OC_NAME_BE		OC_NAME	"(be3)"
+#define OC_NAME_LANCER		OC_NAME "(Lancer)"
+#define OC_NAME_SH		OC_NAME "(Skyhawk)"
+#define DRV_DESC		"Emulex OneConnect NIC Driver"
+
+#define BE_VENDOR_ID 		0x19a2
+#define EMULEX_VENDOR_ID	0x10df
+#define BE_DEVICE_ID1		0x211
+#define BE_DEVICE_ID2		0x221
+#define OC_DEVICE_ID1		0x700	/* Device Id for BE2 cards */
+#define OC_DEVICE_ID2		0x710	/* Device Id for BE3 cards */
+#define OC_DEVICE_ID3		0xe220	/* Device id for Lancer cards */
+#define OC_DEVICE_ID4           0xe228   /* Device id for VF in Lancer */
+#define OC_DEVICE_ID5		0x720	/* Device Id for Skyhawk cards */
+#define OC_DEVICE_ID6		0x728   /* Device id for VF in SkyHawk */
+#define OC_SUBSYS_DEVICE_ID1	0xE602
+#define OC_SUBSYS_DEVICE_ID2	0xE642
+#define OC_SUBSYS_DEVICE_ID3	0xE612
+#define OC_SUBSYS_DEVICE_ID4	0xE652
+
+static inline char *nic_name(struct pci_dev *pdev)
+{
+	switch (pdev->device) {
+	case OC_DEVICE_ID1:
+		return OC_NAME;
+	case OC_DEVICE_ID2:
+		return OC_NAME_BE;
+	case OC_DEVICE_ID3:
+	case OC_DEVICE_ID4:
+		return OC_NAME_LANCER;
+	case BE_DEVICE_ID2:
+		return BE3_NAME;
+	case OC_DEVICE_ID5:
+	case OC_DEVICE_ID6:
+		return OC_NAME_SH;
+	default:
+		return BE_NAME;
+	}
+}
+
+/* Number of bytes of an RX frame that are copied to skb->data */
+#define BE_HDR_LEN		((u16) 64)
+/* allocate extra space to allow tunneling decapsulation without head reallocation */
+#define BE_RX_SKB_ALLOC_SIZE (BE_HDR_LEN + 64)
+
+#define BE_MAX_JUMBO_FRAME_SIZE	9018
+#define BE_MIN_MTU		256
+#define BE_MAX_MTU              (BE_MAX_JUMBO_FRAME_SIZE -	\
+				 (ETH_HLEN + ETH_FCS_LEN))
+
+#define BE_NUM_VLANS_SUPPORTED	64
+#define BE_MAX_EQD		128u
+#define	BE_MAX_TX_FRAG_COUNT	30
+
+#define EVNT_Q_LEN		1024
+#define TX_Q_LEN		2048
+#define TX_CQ_LEN		1024
+#define RX_Q_LEN		1024	/* Does not support any other value */
+#define RX_CQ_LEN		1024
+#define MCC_Q_LEN		128	/* total size not to exceed 8 pages */
+#define MCC_CQ_LEN		256
+
+#define BE2_MAX_RSS_QS		4
+#define BE3_MAX_RSS_QS		16
+#define BE3_MAX_TX_QS		16
+#define BE3_MAX_EVT_QS		16
+#define BE3_SRIOV_MAX_EVT_QS	8
+
+#define MAX_RX_QS		32
+#define MAX_EVT_QS		32
+#define MAX_TX_QS		32
+
+#define MAX_ROCE_EQS		5
+#define MAX_MSIX_VECTORS	32
+#define MIN_MSIX_VECTORS	1
+#define BE_NAPI_WEIGHT		64
+#define MAX_RX_POST		BE_NAPI_WEIGHT /* Frags posted at a time */
+#define RX_FRAGS_REFILL_WM	(RX_Q_LEN - MAX_RX_POST)
+
+#define MAX_VFS			30 /* Max VFs supported by BE3 FW */
+#define FW_VER_LEN		32
+
+#define	RSS_INDIR_TABLE_LEN	128
+#define RSS_HASH_KEY_LEN	40
+
+struct be_dma_mem {
+	void *va;
+	dma_addr_t dma;
+	u32 size;
+};
+
+struct be_queue_info {
+	struct be_dma_mem dma_mem;
+	u16 len;
+	u16 entry_size;	/* Size of an element in the queue */
+	u16 id;
+	u16 tail, head;
+	bool created;
+	atomic_t used;	/* Number of valid elements in the queue */
+};
+
+static inline u32 MODULO(u16 val, u16 limit)
+{
+	BUG_ON(limit & (limit - 1));
+	return val & (limit - 1);
+}
+
+static inline void index_adv(u16 *index, u16 val, u16 limit)
+{
+	*index = MODULO((*index + val), limit);
+}
+
+static inline void index_inc(u16 *index, u16 limit)
+{
+	*index = MODULO((*index + 1), limit);
+}
+
+static inline void *queue_head_node(struct be_queue_info *q)
+{
+	return q->dma_mem.va + q->head * q->entry_size;
+}
+
+static inline void *queue_tail_node(struct be_queue_info *q)
+{
+	return q->dma_mem.va + q->tail * q->entry_size;
+}
+
+static inline void *queue_index_node(struct be_queue_info *q, u16 index)
+{
+	return q->dma_mem.va + index * q->entry_size;
+}
+
+static inline void queue_head_inc(struct be_queue_info *q)
+{
+	index_inc(&q->head, q->len);
+}
+
+static inline void index_dec(u16 *index, u16 limit)
+{
+	*index = MODULO((*index - 1), limit);
+}
+
+static inline void queue_tail_inc(struct be_queue_info *q)
+{
+	index_inc(&q->tail, q->len);
+}
+
+struct be_eq_obj {
+	struct be_queue_info q;
+	char desc[32];
+
+	/* Adaptive interrupt coalescing (AIC) info */
+	bool enable_aic;
+	u32 min_eqd;		/* in usecs */
+	u32 max_eqd;		/* in usecs */
+	u32 eqd;		/* configured val when aic is off */
+	u32 cur_eqd;		/* in usecs */
+
+	u8 idx;			/* array index */
+	u8 msix_idx;
+	u16 spurious_intr;
+	struct napi_struct napi;
+	struct be_adapter *adapter;
+
+#ifdef CONFIG_NET_RX_BUSY_POLL
+#define BE_EQ_IDLE		0
+#define BE_EQ_NAPI		1	/* napi owns this EQ */
+#define BE_EQ_POLL		2	/* poll owns this EQ */
+#define BE_EQ_LOCKED		(BE_EQ_NAPI | BE_EQ_POLL)
+#define BE_EQ_NAPI_YIELD	4	/* napi yielded this EQ */
+#define BE_EQ_POLL_YIELD	8	/* poll yielded this EQ */
+#define BE_EQ_YIELD		(BE_EQ_NAPI_YIELD | BE_EQ_POLL_YIELD)
+#define BE_EQ_USER_PEND		(BE_EQ_POLL | BE_EQ_POLL_YIELD)
+	unsigned int state;
+	spinlock_t lock;	/* lock to serialize napi and busy-poll */
+#endif  /* CONFIG_NET_RX_BUSY_POLL */
+} ____cacheline_aligned_in_smp;
+
+struct be_aic_obj {		/* Adaptive interrupt coalescing (AIC) info */
+	bool enable;
+	u32 min_eqd;		/* in usecs */
+	u32 max_eqd;		/* in usecs */
+	u32 prev_eqd;		/* in usecs */
+	u32 et_eqd;		/* configured val when aic is off */
+	ulong jiffies;
+	u64 rx_pkts_prev;	/* Used to calculate RX pps */
+	u64 tx_reqs_prev;	/* Used to calculate TX pps */
+};
+
+enum {
+	NAPI_POLLING,
+	BUSY_POLLING
+};
+
+struct be_mcc_obj {
+	struct be_queue_info q;
+	struct be_queue_info cq;
+	bool rearm_cq;
+};
+
+struct be_tx_stats {
+	u64 tx_bytes;
+	u64 tx_pkts;
+	u64 tx_reqs;
+	u64 tx_wrbs;
+	u64 tx_compl;
+	ulong tx_jiffies;
+	u32 tx_stops;
+	u32 tx_drv_drops;	/* pkts dropped by driver */
+	/* the error counters are described in be_ethtool.c */
+	u32 tx_hdr_parse_err;
+	u32 tx_dma_err;
+	u32 tx_tso_err;
+	u32 tx_spoof_check_err;
+	u32 tx_qinq_err;
+	u32 tx_internal_parity_err;
+	struct u64_stats_sync sync;
+	struct u64_stats_sync sync_compl;
+};
+
+struct be_tx_obj {
+	u32 db_offset;
+	struct be_queue_info q;
+	struct be_queue_info cq;
+	/* Remember the skbs that were transmitted */
+	struct sk_buff *sent_skb_list[TX_Q_LEN];
+	struct be_tx_stats stats;
+} ____cacheline_aligned_in_smp;
+
+/* Struct to remember the pages posted for rx frags */
+struct be_rx_page_info {
+	struct page *page;
+	/* set to page-addr for last frag of the page & frag-addr otherwise */
+	DEFINE_DMA_UNMAP_ADDR(bus);
+	u16 page_offset;
+	bool last_frag;		/* last frag of the page */
+};
+
+struct be_rx_stats {
+	u64 rx_bytes;
+	u64 rx_pkts;
+	u32 rx_drops_no_skbs;	/* skb allocation errors */
+	u32 rx_drops_no_frags;	/* HW has no fetched frags */
+	u32 rx_post_fail;	/* page post alloc failures */
+	u32 rx_compl;
+	u32 rx_mcast_pkts;
+	u32 rx_compl_err;	/* completions with err set */
+	struct u64_stats_sync sync;
+};
+
+struct be_rx_compl_info {
+	u32 rss_hash;
+	u16 vlan_tag;
+	u16 pkt_size;
+	u16 port;
+	u8 vlanf;
+	u8 num_rcvd;
+	u8 err;
+	u8 ipf;
+	u8 tcpf;
+	u8 udpf;
+	u8 ip_csum;
+	u8 l4_csum;
+	u8 ipv6;
+	u8 qnq;
+	u8 pkt_type;
+	u8 ip_frag;
+	u8 tunneled;
+};
+
+struct be_rx_obj {
+	struct be_adapter *adapter;
+	struct be_queue_info q;
+	struct be_queue_info cq;
+	struct be_rx_compl_info rxcp;
+	struct be_rx_page_info page_info_tbl[RX_Q_LEN];
+	struct be_rx_stats stats;
+	u8 rss_id;
+	bool rx_post_starved;	/* Zero rx frags have been posted to BE */
+} ____cacheline_aligned_in_smp;
+
+struct be_drv_stats {
+	u32 be_on_die_temperature;
+	u32 eth_red_drops;
+	u32 dma_map_errors;
+	u32 rx_drops_no_pbuf;
+	u32 rx_drops_no_txpb;
+	u32 rx_drops_no_erx_descr;
+	u32 rx_drops_no_tpre_descr;
+	u32 rx_drops_too_many_frags;
+	u32 forwarded_packets;
+	u32 rx_drops_mtu;
+	u32 rx_crc_errors;
+	u32 rx_alignment_symbol_errors;
+	u32 rx_pause_frames;
+	u32 rx_priority_pause_frames;
+	u32 rx_control_frames;
+	u32 rx_in_range_errors;
+	u32 rx_out_range_errors;
+	u32 rx_frame_too_long;
+	u32 rx_address_filtered;
+	u32 rx_dropped_too_small;
+	u32 rx_dropped_too_short;
+	u32 rx_dropped_header_too_small;
+	u32 rx_dropped_tcp_length;
+	u32 rx_dropped_runt;
+	u32 rx_ip_checksum_errs;
+	u32 rx_tcp_checksum_errs;
+	u32 rx_udp_checksum_errs;
+	u32 tx_pauseframes;
+	u32 tx_priority_pauseframes;
+	u32 tx_controlframes;
+	u32 rxpp_fifo_overflow_drop;
+	u32 rx_input_fifo_overflow_drop;
+	u32 pmem_fifo_overflow_drop;
+	u32 jabber_events;
+	u32 rx_roce_bytes_lsd;
+	u32 rx_roce_bytes_msd;
+	u32 rx_roce_frames;
+	u32 roce_drops_payload_len;
+	u32 roce_drops_crc;
+};
+
+/* A vlan-id of 0xFFFF must be used to clear transparent vlan-tagging */
+#define BE_RESET_VLAN_TAG_ID	0xFFFF
+
+struct be_vf_cfg {
+	unsigned char mac_addr[ETH_ALEN];
+	int if_handle;
+	int pmac_id;
+	u16 vlan_tag;
+	u32 tx_rate;
+	u32 plink_tracking;
+};
+
+enum vf_state {
+	ENABLED = 0,
+	ASSIGNED = 1
+};
+
+#define BE_FLAGS_LINK_STATUS_INIT		1
+#define BE_FLAGS_SRIOV_ENABLED			(1 << 2)
+#define BE_FLAGS_WORKER_SCHEDULED		(1 << 3)
+#define BE_FLAGS_VLAN_PROMISC			(1 << 4)
+#define BE_FLAGS_MCAST_PROMISC			(1 << 5)
+#define BE_FLAGS_NAPI_ENABLED			(1 << 9)
+#define BE_FLAGS_QNQ_ASYNC_EVT_RCVD		(1 << 11)
+#define BE_FLAGS_VXLAN_OFFLOADS			(1 << 12)
+#define BE_FLAGS_SETUP_DONE			(1 << 13)
+
+#define BE_UC_PMAC_COUNT			30
+#define BE_VF_UC_PMAC_COUNT			2
+
+/* Ethtool set_dump flags */
+#define LANCER_INITIATE_FW_DUMP			0x1
+#define LANCER_DELETE_FW_DUMP			0x2
+
+struct phy_info {
+	u8 transceiver;
+	u8 autoneg;
+	u8 fc_autoneg;
+	u8 port_type;
+	u16 phy_type;
+	u16 interface_type;
+	u32 misc_params;
+	u16 auto_speeds_supported;
+	u16 fixed_speeds_supported;
+	int link_speed;
+	u32 advertising;
+	u32 supported;
+	u8 cable_type;
+};
+
+struct be_resources {
+	u16 max_vfs;		/* Total VFs "really" supported by FW/HW */
+	u16 max_mcast_mac;
+	u16 max_tx_qs;
+	u16 max_rss_qs;
+	u16 max_rx_qs;
+	u16 max_uc_mac;		/* Max UC MACs programmable */
+	u16 max_vlans;		/* Number of vlans supported */
+	u16 max_evt_qs;
+	u32 if_cap_flags;
+	u32 vf_if_cap_flags;	/* VF if capability flags */
+};
+
+struct rss_info {
+	u64 rss_flags;
+	u8 rsstable[RSS_INDIR_TABLE_LEN];
+	u8 rss_queue[RSS_INDIR_TABLE_LEN];
+	u8 rss_hkey[RSS_HASH_KEY_LEN];
+};
+
+struct be_adapter {
+	struct pci_dev *pdev;
+	struct net_device *netdev;
+
+	u8 __iomem *csr;	/* CSR BAR used only for BE2/3 */
+	u8 __iomem *db;		/* Door Bell */
+
+	struct mutex mbox_lock; /* For serializing mbox cmds to BE card */
+	struct be_dma_mem mbox_mem;
+	/* Mbox mem is adjusted to align to 16 bytes. The allocated addr
+	 * is stored for freeing purpose */
+	struct be_dma_mem mbox_mem_alloced;
+
+	struct be_mcc_obj mcc_obj;
+	spinlock_t mcc_lock;	/* For serializing mcc cmds to BE card */
+	spinlock_t mcc_cq_lock;
+
+	u16 cfg_num_qs;		/* configured via set-channels */
+	u16 num_evt_qs;
+	u16 num_msix_vec;
+	struct be_eq_obj eq_obj[MAX_EVT_QS];
+	struct msix_entry msix_entries[MAX_MSIX_VECTORS];
+	bool isr_registered;
+
+	/* TX Rings */
+	u16 num_tx_qs;
+	struct be_tx_obj tx_obj[MAX_TX_QS];
+
+	/* Rx rings */
+	u16 num_rx_qs;
+	struct be_rx_obj rx_obj[MAX_RX_QS];
+	u32 big_page_size;	/* Compounded page size shared by rx wrbs */
+
+	struct be_drv_stats drv_stats;
+	struct be_aic_obj aic_obj[MAX_EVT_QS];
+	u16 vlans_added;
+	unsigned long vids[BITS_TO_LONGS(VLAN_N_VID)];
+	u8 vlan_prio_bmap;	/* Available Priority BitMap */
+	u16 recommended_prio;	/* Recommended Priority */
+	struct be_dma_mem rx_filter; /* Cmd DMA mem for rx-filter */
+
+	struct be_dma_mem stats_cmd;
+	/* Work queue used to perform periodic tasks like getting statistics */
+	struct delayed_work work;
+	u16 work_counter;
+
+	struct delayed_work func_recovery_work;
+	u32 flags;
+	u32 cmd_privileges;
+	/* Ethtool knobs and info */
+	char fw_ver[FW_VER_LEN];
+	char fw_on_flash[FW_VER_LEN];
+	int if_handle;		/* Used to configure filtering */
+	u32 *pmac_id;		/* MAC addr handle used by BE card */
+	u32 beacon_state;	/* for set_phys_id */
+
+	bool eeh_error;
+	bool fw_timeout;
+	bool hw_error;
+
+	u32 port_num;
+	bool promiscuous;
+	u8 mc_type;
+	u32 function_mode;
+	u32 function_caps;
+	u32 rx_fc;		/* Rx flow control */
+	u32 tx_fc;		/* Tx flow control */
+	bool stats_cmd_sent;
+	struct {
+		u32 size;
+		u32 total_size;
+		u64 io_addr;
+	} roce_db;
+	u32 num_msix_roce_vec;
+	struct ocrdma_dev *ocrdma_dev;
+	struct list_head entry;
+
+	u32 flash_status;
+	struct completion et_cmd_compl;
+
+	struct be_resources pool_res;	/* resources available for the port */
+	struct be_resources res;	/* resources available for the func */
+	u16 num_vfs;			/* Number of VFs provisioned by PF */
+	u8 virtfn;
+	struct be_vf_cfg *vf_cfg;
+	bool be3_native;
+	u32 sli_family;
+	u8 hba_port_num;
+	u16 pvid;
+	__be16 vxlan_port;
+	struct phy_info phy;
+	u8 wol_cap;
+	bool wol_en;
+	u32 uc_macs;		/* Count of secondary UC MAC programmed */
+	u16 asic_rev;
+	u16 qnq_vid;
+	u32 msg_enable;
+	int be_get_temp_freq;
+	u8 pf_number;
+	struct rss_info rss_info;
+};
+
+#define be_physfn(adapter)		(!adapter->virtfn)
+#define be_virtfn(adapter)		(adapter->virtfn)
+#define sriov_enabled(adapter)		(adapter->flags &	\
+					 BE_FLAGS_SRIOV_ENABLED)
+
+#define for_all_vfs(adapter, vf_cfg, i)					\
+	for (i = 0, vf_cfg = &adapter->vf_cfg[i]; i < adapter->num_vfs;	\
+		i++, vf_cfg++)
+
+#define ON				1
+#define OFF				0
+
+#define be_max_vlans(adapter)		(adapter->res.max_vlans)
+#define be_max_uc(adapter)		(adapter->res.max_uc_mac)
+#define be_max_mc(adapter)		(adapter->res.max_mcast_mac)
+#define be_max_vfs(adapter)		(adapter->pool_res.max_vfs)
+#define be_max_rss(adapter)		(adapter->res.max_rss_qs)
+#define be_max_txqs(adapter)		(adapter->res.max_tx_qs)
+#define be_max_prio_txqs(adapter)	(adapter->res.max_prio_tx_qs)
+#define be_max_rxqs(adapter)		(adapter->res.max_rx_qs)
+#define be_max_eqs(adapter)		(adapter->res.max_evt_qs)
+#define be_if_cap_flags(adapter)	(adapter->res.if_cap_flags)
+
+static inline u16 be_max_qs(struct be_adapter *adapter)
+{
+	/* If no RSS, need atleast the one def RXQ */
+	u16 num = max_t(u16, be_max_rss(adapter), 1);
+
+	num = min(num, be_max_eqs(adapter));
+	return min_t(u16, num, num_online_cpus());
+}
+
+/* Is BE in pvid_tagging mode */
+#define be_pvid_tagging_enabled(adapter)	(adapter->pvid)
+
+/* Is BE in QNQ multi-channel mode */
+#define be_is_qnq_mode(adapter)		(adapter->function_mode & QNQ_MODE)
+
+#define lancer_chip(adapter)	(adapter->pdev->device == OC_DEVICE_ID3 || \
+				 adapter->pdev->device == OC_DEVICE_ID4)
+
+#define skyhawk_chip(adapter)	(adapter->pdev->device == OC_DEVICE_ID5 || \
+				 adapter->pdev->device == OC_DEVICE_ID6)
+
+#define BE3_chip(adapter)	(adapter->pdev->device == BE_DEVICE_ID2 || \
+				 adapter->pdev->device == OC_DEVICE_ID2)
+
+#define BE2_chip(adapter)	(adapter->pdev->device == BE_DEVICE_ID1 || \
+				 adapter->pdev->device == OC_DEVICE_ID1)
+
+#define BEx_chip(adapter)	(BE3_chip(adapter) || BE2_chip(adapter))
+
+#define be_roce_supported(adapter)	(skyhawk_chip(adapter) && \
+					(adapter->function_mode & RDMA_ENABLED))
+
+extern const struct ethtool_ops be_ethtool_ops;
+
+#define msix_enabled(adapter)		(adapter->num_msix_vec > 0)
+#define num_irqs(adapter)		(msix_enabled(adapter) ?	\
+						adapter->num_msix_vec : 1)
+#define tx_stats(txo)			(&(txo)->stats)
+#define rx_stats(rxo)			(&(rxo)->stats)
+
+/* The default RXQ is the last RXQ */
+#define default_rxo(adpt)		(&adpt->rx_obj[adpt->num_rx_qs - 1])
+
+#define for_all_rx_queues(adapter, rxo, i)				\
+	for (i = 0, rxo = &adapter->rx_obj[i]; i < adapter->num_rx_qs;	\
+		i++, rxo++)
+
+/* Skip the default non-rss queue (last one)*/
+#define for_all_rss_queues(adapter, rxo, i)				\
+	for (i = 0, rxo = &adapter->rx_obj[i]; i < (adapter->num_rx_qs - 1);\
+		i++, rxo++)
+
+#define for_all_tx_queues(adapter, txo, i)				\
+	for (i = 0, txo = &adapter->tx_obj[i]; i < adapter->num_tx_qs;	\
+		i++, txo++)
+
+#define for_all_evt_queues(adapter, eqo, i)				\
+	for (i = 0, eqo = &adapter->eq_obj[i]; i < adapter->num_evt_qs; \
+		i++, eqo++)
+
+#define for_all_rx_queues_on_eq(adapter, eqo, rxo, i)			\
+	for (i = eqo->idx, rxo = &adapter->rx_obj[i]; i < adapter->num_rx_qs;\
+		 i += adapter->num_evt_qs, rxo += adapter->num_evt_qs)
+
+#define for_all_tx_queues_on_eq(adapter, eqo, txo, i)			\
+	for (i = eqo->idx, txo = &adapter->tx_obj[i]; i < adapter->num_tx_qs;\
+		i += adapter->num_evt_qs, txo += adapter->num_evt_qs)
+
+#define is_mcc_eqo(eqo)			(eqo->idx == 0)
+#define mcc_eqo(adapter)		(&adapter->eq_obj[0])
+
+#define PAGE_SHIFT_4K		12
+#define PAGE_SIZE_4K		(1 << PAGE_SHIFT_4K)
+
+/* Returns number of pages spanned by the data starting at the given addr */
+#define PAGES_4K_SPANNED(_address, size) 				\
+		((u32)((((size_t)(_address) & (PAGE_SIZE_4K - 1)) + 	\
+			(size) + (PAGE_SIZE_4K - 1)) >> PAGE_SHIFT_4K))
+
+/* Returns bit offset within a DWORD of a bitfield */
+#define AMAP_BIT_OFFSET(_struct, field)  				\
+		(((size_t)&(((_struct *)0)->field))%32)
+
+/* Returns the bit mask of the field that is NOT shifted into location. */
+static inline u32 amap_mask(u32 bitsize)
+{
+	return (bitsize == 32 ? 0xFFFFFFFF : (1 << bitsize) - 1);
+}
+
+static inline void
+amap_set(void *ptr, u32 dw_offset, u32 mask, u32 offset, u32 value)
+{
+	u32 *dw = (u32 *) ptr + dw_offset;
+	*dw &= ~(mask << offset);
+	*dw |= (mask & value) << offset;
+}
+
+#define AMAP_SET_BITS(_struct, field, ptr, val)				\
+		amap_set(ptr,						\
+			offsetof(_struct, field)/32,			\
+			amap_mask(sizeof(((_struct *)0)->field)),	\
+			AMAP_BIT_OFFSET(_struct, field),		\
+			val)
+
+static inline u32 amap_get(void *ptr, u32 dw_offset, u32 mask, u32 offset)
+{
+	u32 *dw = (u32 *) ptr;
+	return mask & (*(dw + dw_offset) >> offset);
+}
+
+#define AMAP_GET_BITS(_struct, field, ptr)				\
+		amap_get(ptr,						\
+			offsetof(_struct, field)/32,			\
+			amap_mask(sizeof(((_struct *)0)->field)),	\
+			AMAP_BIT_OFFSET(_struct, field))
+
+#define GET_RX_COMPL_V0_BITS(field, ptr)				\
+		AMAP_GET_BITS(struct amap_eth_rx_compl_v0, field, ptr)
+
+#define GET_RX_COMPL_V1_BITS(field, ptr)				\
+		AMAP_GET_BITS(struct amap_eth_rx_compl_v1, field, ptr)
+
+#define GET_TX_COMPL_BITS(field, ptr)					\
+		AMAP_GET_BITS(struct amap_eth_tx_compl, field, ptr)
+
+#define SET_TX_WRB_HDR_BITS(field, ptr, val)				\
+		AMAP_SET_BITS(struct amap_eth_hdr_wrb, field, ptr, val)
+
+#define be_dws_cpu_to_le(wrb, len)	swap_dws(wrb, len)
+#define be_dws_le_to_cpu(wrb, len)	swap_dws(wrb, len)
+static inline void swap_dws(void *wrb, int len)
+{
+#ifdef __BIG_ENDIAN
+	u32 *dw = wrb;
+	BUG_ON(len % 4);
+	do {
+		*dw = cpu_to_le32(*dw);
+		dw++;
+		len -= 4;
+	} while (len);
+#endif				/* __BIG_ENDIAN */
+}
+
+#define be_cmd_status(status)		(status > 0 ? -EIO : status)
+
+static inline u8 is_tcp_pkt(struct sk_buff *skb)
+{
+	u8 val = 0;
+
+	if (ip_hdr(skb)->version == 4)
+		val = (ip_hdr(skb)->protocol == IPPROTO_TCP);
+	else if (ip_hdr(skb)->version == 6)
+		val = (ipv6_hdr(skb)->nexthdr == NEXTHDR_TCP);
+
+	return val;
+}
+
+static inline u8 is_udp_pkt(struct sk_buff *skb)
+{
+	u8 val = 0;
+
+	if (ip_hdr(skb)->version == 4)
+		val = (ip_hdr(skb)->protocol == IPPROTO_UDP);
+	else if (ip_hdr(skb)->version == 6)
+		val = (ipv6_hdr(skb)->nexthdr == NEXTHDR_UDP);
+
+	return val;
+}
+
+static inline bool is_ipv4_pkt(struct sk_buff *skb)
+{
+	return skb->protocol == htons(ETH_P_IP) && ip_hdr(skb)->version == 4;
+}
+
+static inline void be_vf_eth_addr_generate(struct be_adapter *adapter, u8 *mac)
+{
+	u32 addr;
+
+	addr = jhash(adapter->netdev->dev_addr, ETH_ALEN, 0);
+
+	mac[5] = (u8)(addr & 0xFF);
+	mac[4] = (u8)((addr >> 8) & 0xFF);
+	mac[3] = (u8)((addr >> 16) & 0xFF);
+	/* Use the OUI from the current MAC address */
+	memcpy(mac, adapter->netdev->dev_addr, 3);
+}
+
+static inline bool be_multi_rxq(const struct be_adapter *adapter)
+{
+	return adapter->num_rx_qs > 1;
+}
+
+static inline bool be_error(struct be_adapter *adapter)
+{
+	return adapter->eeh_error || adapter->hw_error || adapter->fw_timeout;
+}
+
+static inline bool be_hw_error(struct be_adapter *adapter)
+{
+	return adapter->eeh_error || adapter->hw_error;
+}
+
+static inline void  be_clear_all_error(struct be_adapter *adapter)
+{
+	adapter->eeh_error = false;
+	adapter->hw_error = false;
+	adapter->fw_timeout = false;
+}
+
+static inline bool be_is_wol_excluded(struct be_adapter *adapter)
+{
+	struct pci_dev *pdev = adapter->pdev;
+
+	if (!be_physfn(adapter))
+		return true;
+
+	switch (pdev->subsystem_device) {
+	case OC_SUBSYS_DEVICE_ID1:
+	case OC_SUBSYS_DEVICE_ID2:
+	case OC_SUBSYS_DEVICE_ID3:
+	case OC_SUBSYS_DEVICE_ID4:
+		return true;
+	default:
+		return false;
+	}
+}
+
+static inline int qnq_async_evt_rcvd(struct be_adapter *adapter)
+{
+	return adapter->flags & BE_FLAGS_QNQ_ASYNC_EVT_RCVD;
+}
+
+#ifdef CONFIG_NET_RX_BUSY_POLL
+static inline bool be_lock_napi(struct be_eq_obj *eqo)
+{
+	bool status = true;
+
+	spin_lock(&eqo->lock); /* BH is already disabled */
+	if (eqo->state & BE_EQ_LOCKED) {
+		WARN_ON(eqo->state & BE_EQ_NAPI);
+		eqo->state |= BE_EQ_NAPI_YIELD;
+		status = false;
+	} else {
+		eqo->state = BE_EQ_NAPI;
+	}
+	spin_unlock(&eqo->lock);
+	return status;
+}
+
+static inline void be_unlock_napi(struct be_eq_obj *eqo)
+{
+	spin_lock(&eqo->lock); /* BH is already disabled */
+
+	WARN_ON(eqo->state & (BE_EQ_POLL | BE_EQ_NAPI_YIELD));
+	eqo->state = BE_EQ_IDLE;
+
+	spin_unlock(&eqo->lock);
+}
+
+static inline bool be_lock_busy_poll(struct be_eq_obj *eqo)
+{
+	bool status = true;
+
+	spin_lock_bh(&eqo->lock);
+	if (eqo->state & BE_EQ_LOCKED) {
+		eqo->state |= BE_EQ_POLL_YIELD;
+		status = false;
+	} else {
+		eqo->state |= BE_EQ_POLL;
+	}
+	spin_unlock_bh(&eqo->lock);
+	return status;
+}
+
+static inline void be_unlock_busy_poll(struct be_eq_obj *eqo)
+{
+	spin_lock_bh(&eqo->lock);
+
+	WARN_ON(eqo->state & (BE_EQ_NAPI));
+	eqo->state = BE_EQ_IDLE;
+
+	spin_unlock_bh(&eqo->lock);
+}
+
+static inline void be_enable_busy_poll(struct be_eq_obj *eqo)
+{
+	spin_lock_init(&eqo->lock);
+	eqo->state = BE_EQ_IDLE;
+}
+
+static inline void be_disable_busy_poll(struct be_eq_obj *eqo)
+{
+	local_bh_disable();
+
+	/* It's enough to just acquire napi lock on the eqo to stop
+	 * be_busy_poll() from processing any queueus.
+	 */
+	while (!be_lock_napi(eqo))
+		mdelay(1);
+
+	local_bh_enable();
+}
+
+#else /* CONFIG_NET_RX_BUSY_POLL */
+
+static inline bool be_lock_napi(struct be_eq_obj *eqo)
+{
+	return true;
+}
+
+static inline void be_unlock_napi(struct be_eq_obj *eqo)
+{
+}
+
+static inline bool be_lock_busy_poll(struct be_eq_obj *eqo)
+{
+	return false;
+}
+
+static inline void be_unlock_busy_poll(struct be_eq_obj *eqo)
+{
+}
+
+static inline void be_enable_busy_poll(struct be_eq_obj *eqo)
+{
+}
+
+static inline void be_disable_busy_poll(struct be_eq_obj *eqo)
+{
+}
+#endif /* CONFIG_NET_RX_BUSY_POLL */
+
+void be_cq_notify(struct be_adapter *adapter, u16 qid, bool arm,
+		  u16 num_popped);
+void be_link_status_update(struct be_adapter *adapter, u8 link_status);
+void be_parse_stats(struct be_adapter *adapter);
+int be_load_fw(struct be_adapter *adapter, u8 *func);
+bool be_is_wol_supported(struct be_adapter *adapter);
+bool be_pause_supported(struct be_adapter *adapter);
+u32 be_get_fw_log_level(struct be_adapter *adapter);
+
+static inline int fw_major_num(const char *fw_ver)
+{
+	int fw_major = 0;
+
+	sscanf(fw_ver, "%d.", &fw_major);
+
+	return fw_major;
+}
+
+int be_update_queues(struct be_adapter *adapter);
+int be_poll(struct napi_struct *napi, int budget);
+
+/*
+ * internal function to initialize-cleanup roce device.
+ */
+void be_roce_dev_add(struct be_adapter *);
+void be_roce_dev_remove(struct be_adapter *);
+
+/*
+ * internal function to open-close roce device during ifup-ifdown.
+ */
+void be_roce_dev_open(struct be_adapter *);
+void be_roce_dev_close(struct be_adapter *);
+void be_roce_dev_shutdown(struct be_adapter *);
+
+#endif				/* BE_H */
diff --git a/drivers/net/ethernet/emulex/benet/be_cmds.c b/drivers/net/ethernet/emulex/benet/be_cmds.c
new file mode 100644
index 0000000..fead5c6
--- /dev/null
+++ b/drivers/net/ethernet/emulex/benet/be_cmds.c
@@ -0,0 +1,4162 @@
+/*
+ * Copyright (C) 2005 - 2014 Emulex
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.  The full GNU General
+ * Public License is included in this distribution in the file called COPYING.
+ *
+ * Contact Information:
+ * linux-drivers@emulex.com
+ *
+ * Emulex
+ * 3333 Susan Street
+ * Costa Mesa, CA 92626
+ */
+
+#include <linux/module.h>
+#include "be.h"
+#include "be_cmds.h"
+
+static struct be_cmd_priv_map cmd_priv_map[] = {
+	{
+		OPCODE_ETH_ACPI_WOL_MAGIC_CONFIG,
+		CMD_SUBSYSTEM_ETH,
+		BE_PRIV_LNKMGMT | BE_PRIV_VHADM |
+		BE_PRIV_DEVCFG | BE_PRIV_DEVSEC
+	},
+	{
+		OPCODE_COMMON_GET_FLOW_CONTROL,
+		CMD_SUBSYSTEM_COMMON,
+		BE_PRIV_LNKQUERY | BE_PRIV_VHADM |
+		BE_PRIV_DEVCFG | BE_PRIV_DEVSEC
+	},
+	{
+		OPCODE_COMMON_SET_FLOW_CONTROL,
+		CMD_SUBSYSTEM_COMMON,
+		BE_PRIV_LNKMGMT | BE_PRIV_VHADM |
+		BE_PRIV_DEVCFG | BE_PRIV_DEVSEC
+	},
+	{
+		OPCODE_ETH_GET_PPORT_STATS,
+		CMD_SUBSYSTEM_ETH,
+		BE_PRIV_LNKMGMT | BE_PRIV_VHADM |
+		BE_PRIV_DEVCFG | BE_PRIV_DEVSEC
+	},
+	{
+		OPCODE_COMMON_GET_PHY_DETAILS,
+		CMD_SUBSYSTEM_COMMON,
+		BE_PRIV_LNKMGMT | BE_PRIV_VHADM |
+		BE_PRIV_DEVCFG | BE_PRIV_DEVSEC
+	}
+};
+
+static bool be_cmd_allowed(struct be_adapter *adapter, u8 opcode, u8 subsystem)
+{
+	int i;
+	int num_entries = sizeof(cmd_priv_map)/sizeof(struct be_cmd_priv_map);
+	u32 cmd_privileges = adapter->cmd_privileges;
+
+	for (i = 0; i < num_entries; i++)
+		if (opcode == cmd_priv_map[i].opcode &&
+		    subsystem == cmd_priv_map[i].subsystem)
+			if (!(cmd_privileges & cmd_priv_map[i].priv_mask))
+				return false;
+
+	return true;
+}
+
+static inline void *embedded_payload(struct be_mcc_wrb *wrb)
+{
+	return wrb->payload.embedded_payload;
+}
+
+static void be_mcc_notify(struct be_adapter *adapter)
+{
+	struct be_queue_info *mccq = &adapter->mcc_obj.q;
+	u32 val = 0;
+
+	if (be_error(adapter))
+		return;
+
+	val |= mccq->id & DB_MCCQ_RING_ID_MASK;
+	val |= 1 << DB_MCCQ_NUM_POSTED_SHIFT;
+
+	wmb();
+	iowrite32(val, adapter->db + DB_MCCQ_OFFSET);
+}
+
+/* To check if valid bit is set, check the entire word as we don't know
+ * the endianness of the data (old entry is host endian while a new entry is
+ * little endian) */
+static inline bool be_mcc_compl_is_new(struct be_mcc_compl *compl)
+{
+	u32 flags;
+
+	if (compl->flags != 0) {
+		flags = le32_to_cpu(compl->flags);
+		if (flags & CQE_FLAGS_VALID_MASK) {
+			compl->flags = flags;
+			return true;
+		}
+	}
+	return false;
+}
+
+/* Need to reset the entire word that houses the valid bit */
+static inline void be_mcc_compl_use(struct be_mcc_compl *compl)
+{
+	compl->flags = 0;
+}
+
+static struct be_cmd_resp_hdr *be_decode_resp_hdr(u32 tag0, u32 tag1)
+{
+	unsigned long addr;
+
+	addr = tag1;
+	addr = ((addr << 16) << 16) | tag0;
+	return (void *)addr;
+}
+
+static bool be_skip_err_log(u8 opcode, u16 base_status, u16 addl_status)
+{
+	if (base_status == MCC_STATUS_NOT_SUPPORTED ||
+	    base_status == MCC_STATUS_ILLEGAL_REQUEST ||
+	    addl_status == MCC_ADDL_STATUS_TOO_MANY_INTERFACES ||
+	    (opcode == OPCODE_COMMON_WRITE_FLASHROM &&
+	    (base_status == MCC_STATUS_ILLEGAL_FIELD ||
+	     addl_status == MCC_ADDL_STATUS_FLASH_IMAGE_CRC_MISMATCH)))
+		return true;
+	else
+		return false;
+}
+
+/* Place holder for all the async MCC cmds wherein the caller is not in a busy
+ * loop (has not issued be_mcc_notify_wait())
+ */
+static void be_async_cmd_process(struct be_adapter *adapter,
+				 struct be_mcc_compl *compl,
+				 struct be_cmd_resp_hdr *resp_hdr)
+{
+	enum mcc_base_status base_status = base_status(compl->status);
+	u8 opcode = 0, subsystem = 0;
+
+	if (resp_hdr) {
+		opcode = resp_hdr->opcode;
+		subsystem = resp_hdr->subsystem;
+	}
+
+	if (opcode == OPCODE_LOWLEVEL_LOOPBACK_TEST &&
+	    subsystem == CMD_SUBSYSTEM_LOWLEVEL) {
+		complete(&adapter->et_cmd_compl);
+		return;
+	}
+
+	if ((opcode == OPCODE_COMMON_WRITE_FLASHROM ||
+	     opcode == OPCODE_COMMON_WRITE_OBJECT) &&
+	    subsystem == CMD_SUBSYSTEM_COMMON) {
+		adapter->flash_status = compl->status;
+		complete(&adapter->et_cmd_compl);
+		return;
+	}
+
+	if ((opcode == OPCODE_ETH_GET_STATISTICS ||
+	     opcode == OPCODE_ETH_GET_PPORT_STATS) &&
+	    subsystem == CMD_SUBSYSTEM_ETH &&
+	    base_status == MCC_STATUS_SUCCESS) {
+		be_parse_stats(adapter);
+		adapter->stats_cmd_sent = false;
+		return;
+	}
+
+	if (opcode == OPCODE_COMMON_GET_CNTL_ADDITIONAL_ATTRIBUTES &&
+	    subsystem == CMD_SUBSYSTEM_COMMON) {
+		if (base_status == MCC_STATUS_SUCCESS) {
+			struct be_cmd_resp_get_cntl_addnl_attribs *resp =
+							(void *)resp_hdr;
+			adapter->drv_stats.be_on_die_temperature =
+						resp->on_die_temperature;
+		} else {
+			adapter->be_get_temp_freq = 0;
+		}
+		return;
+	}
+}
+
+static int be_mcc_compl_process(struct be_adapter *adapter,
+				struct be_mcc_compl *compl)
+{
+	enum mcc_base_status base_status;
+	enum mcc_addl_status addl_status;
+	struct be_cmd_resp_hdr *resp_hdr;
+	u8 opcode = 0, subsystem = 0;
+
+	/* Just swap the status to host endian; mcc tag is opaquely copied
+	 * from mcc_wrb */
+	be_dws_le_to_cpu(compl, 4);
+
+	base_status = base_status(compl->status);
+	addl_status = addl_status(compl->status);
+
+	resp_hdr = be_decode_resp_hdr(compl->tag0, compl->tag1);
+	if (resp_hdr) {
+		opcode = resp_hdr->opcode;
+		subsystem = resp_hdr->subsystem;
+	}
+
+	be_async_cmd_process(adapter, compl, resp_hdr);
+
+	if (base_status != MCC_STATUS_SUCCESS &&
+	    !be_skip_err_log(opcode, base_status, addl_status)) {
+		if (base_status == MCC_STATUS_UNAUTHORIZED_REQUEST) {
+			dev_warn(&adapter->pdev->dev,
+				 "VF is not privileged to issue opcode %d-%d\n",
+				 opcode, subsystem);
+		} else {
+			dev_err(&adapter->pdev->dev,
+				"opcode %d-%d failed:status %d-%d\n",
+				opcode, subsystem, base_status, addl_status);
+		}
+	}
+	return compl->status;
+}
+
+/* Link state evt is a string of bytes; no need for endian swapping */
+static void be_async_link_state_process(struct be_adapter *adapter,
+					struct be_mcc_compl *compl)
+{
+	struct be_async_event_link_state *evt =
+			(struct be_async_event_link_state *)compl;
+
+	/* When link status changes, link speed must be re-queried from FW */
+	adapter->phy.link_speed = -1;
+
+	/* On BEx the FW does not send a separate link status
+	 * notification for physical and logical link.
+	 * On other chips just process the logical link
+	 * status notification
+	 */
+	if (!BEx_chip(adapter) &&
+	    !(evt->port_link_status & LOGICAL_LINK_STATUS_MASK))
+		return;
+
+	/* For the initial link status do not rely on the ASYNC event as
+	 * it may not be received in some cases.
+	 */
+	if (adapter->flags & BE_FLAGS_LINK_STATUS_INIT)
+		be_link_status_update(adapter,
+				      evt->port_link_status & LINK_STATUS_MASK);
+}
+
+/* Grp5 CoS Priority evt */
+static void be_async_grp5_cos_priority_process(struct be_adapter *adapter,
+					       struct be_mcc_compl *compl)
+{
+	struct be_async_event_grp5_cos_priority *evt =
+			(struct be_async_event_grp5_cos_priority *)compl;
+
+	if (evt->valid) {
+		adapter->vlan_prio_bmap = evt->available_priority_bmap;
+		adapter->recommended_prio &= ~VLAN_PRIO_MASK;
+		adapter->recommended_prio =
+			evt->reco_default_priority << VLAN_PRIO_SHIFT;
+	}
+}
+
+/* Grp5 QOS Speed evt: qos_link_speed is in units of 10 Mbps */
+static void be_async_grp5_qos_speed_process(struct be_adapter *adapter,
+					    struct be_mcc_compl *compl)
+{
+	struct be_async_event_grp5_qos_link_speed *evt =
+			(struct be_async_event_grp5_qos_link_speed *)compl;
+
+	if (adapter->phy.link_speed >= 0 &&
+	    evt->physical_port == adapter->port_num)
+		adapter->phy.link_speed = le16_to_cpu(evt->qos_link_speed) * 10;
+}
+
+/*Grp5 PVID evt*/
+static void be_async_grp5_pvid_state_process(struct be_adapter *adapter,
+					     struct be_mcc_compl *compl)
+{
+	struct be_async_event_grp5_pvid_state *evt =
+			(struct be_async_event_grp5_pvid_state *)compl;
+
+	if (evt->enabled) {
+		adapter->pvid = le16_to_cpu(evt->tag) & VLAN_VID_MASK;
+		dev_info(&adapter->pdev->dev, "LPVID: %d\n", adapter->pvid);
+	} else {
+		adapter->pvid = 0;
+	}
+}
+
+static void be_async_grp5_evt_process(struct be_adapter *adapter,
+				      struct be_mcc_compl *compl)
+{
+	u8 event_type = (compl->flags >> ASYNC_EVENT_TYPE_SHIFT) &
+				ASYNC_EVENT_TYPE_MASK;
+
+	switch (event_type) {
+	case ASYNC_EVENT_COS_PRIORITY:
+		be_async_grp5_cos_priority_process(adapter, compl);
+		break;
+	case ASYNC_EVENT_QOS_SPEED:
+		be_async_grp5_qos_speed_process(adapter, compl);
+		break;
+	case ASYNC_EVENT_PVID_STATE:
+		be_async_grp5_pvid_state_process(adapter, compl);
+		break;
+	default:
+		break;
+	}
+}
+
+static void be_async_dbg_evt_process(struct be_adapter *adapter,
+				     struct be_mcc_compl *cmp)
+{
+	u8 event_type = 0;
+	struct be_async_event_qnq *evt = (struct be_async_event_qnq *)cmp;
+
+	event_type = (cmp->flags >> ASYNC_EVENT_TYPE_SHIFT) &
+			ASYNC_EVENT_TYPE_MASK;
+
+	switch (event_type) {
+	case ASYNC_DEBUG_EVENT_TYPE_QNQ:
+		if (evt->valid)
+			adapter->qnq_vid = le16_to_cpu(evt->vlan_tag);
+		adapter->flags |= BE_FLAGS_QNQ_ASYNC_EVT_RCVD;
+	break;
+	default:
+		dev_warn(&adapter->pdev->dev, "Unknown debug event 0x%x!\n",
+			 event_type);
+	break;
+	}
+}
+
+static inline bool is_link_state_evt(u32 flags)
+{
+	return ((flags >> ASYNC_EVENT_CODE_SHIFT) & ASYNC_EVENT_CODE_MASK) ==
+			ASYNC_EVENT_CODE_LINK_STATE;
+}
+
+static inline bool is_grp5_evt(u32 flags)
+{
+	return ((flags >> ASYNC_EVENT_CODE_SHIFT) & ASYNC_EVENT_CODE_MASK) ==
+			ASYNC_EVENT_CODE_GRP_5;
+}
+
+static inline bool is_dbg_evt(u32 flags)
+{
+	return ((flags >> ASYNC_EVENT_CODE_SHIFT) & ASYNC_EVENT_CODE_MASK) ==
+			ASYNC_EVENT_CODE_QNQ;
+}
+
+static void be_mcc_event_process(struct be_adapter *adapter,
+				 struct be_mcc_compl *compl)
+{
+	if (is_link_state_evt(compl->flags))
+		be_async_link_state_process(adapter, compl);
+	else if (is_grp5_evt(compl->flags))
+		be_async_grp5_evt_process(adapter, compl);
+	else if (is_dbg_evt(compl->flags))
+		be_async_dbg_evt_process(adapter, compl);
+}
+
+static struct be_mcc_compl *be_mcc_compl_get(struct be_adapter *adapter)
+{
+	struct be_queue_info *mcc_cq = &adapter->mcc_obj.cq;
+	struct be_mcc_compl *compl = queue_tail_node(mcc_cq);
+
+	if (be_mcc_compl_is_new(compl)) {
+		queue_tail_inc(mcc_cq);
+		return compl;
+	}
+	return NULL;
+}
+
+void be_async_mcc_enable(struct be_adapter *adapter)
+{
+	spin_lock_bh(&adapter->mcc_cq_lock);
+
+	be_cq_notify(adapter, adapter->mcc_obj.cq.id, true, 0);
+	adapter->mcc_obj.rearm_cq = true;
+
+	spin_unlock_bh(&adapter->mcc_cq_lock);
+}
+
+void be_async_mcc_disable(struct be_adapter *adapter)
+{
+	spin_lock_bh(&adapter->mcc_cq_lock);
+
+	adapter->mcc_obj.rearm_cq = false;
+	be_cq_notify(adapter, adapter->mcc_obj.cq.id, false, 0);
+
+	spin_unlock_bh(&adapter->mcc_cq_lock);
+}
+
+int be_process_mcc(struct be_adapter *adapter)
+{
+	struct be_mcc_compl *compl;
+	int num = 0, status = 0;
+	struct be_mcc_obj *mcc_obj = &adapter->mcc_obj;
+
+	spin_lock(&adapter->mcc_cq_lock);
+
+	while ((compl = be_mcc_compl_get(adapter))) {
+		if (compl->flags & CQE_FLAGS_ASYNC_MASK) {
+			be_mcc_event_process(adapter, compl);
+		} else if (compl->flags & CQE_FLAGS_COMPLETED_MASK) {
+			status = be_mcc_compl_process(adapter, compl);
+			atomic_dec(&mcc_obj->q.used);
+		}
+		be_mcc_compl_use(compl);
+		num++;
+	}
+
+	if (num)
+		be_cq_notify(adapter, mcc_obj->cq.id, mcc_obj->rearm_cq, num);
+
+	spin_unlock(&adapter->mcc_cq_lock);
+	return status;
+}
+
+/* Wait till no more pending mcc requests are present */
+static int be_mcc_wait_compl(struct be_adapter *adapter)
+{
+#define mcc_timeout		120000 /* 12s timeout */
+	int i, status = 0;
+	struct be_mcc_obj *mcc_obj = &adapter->mcc_obj;
+
+	for (i = 0; i < mcc_timeout; i++) {
+		if (be_error(adapter))
+			return -EIO;
+
+		local_bh_disable();
+		status = be_process_mcc(adapter);
+		local_bh_enable();
+
+		if (atomic_read(&mcc_obj->q.used) == 0)
+			break;
+		udelay(100);
+	}
+	if (i == mcc_timeout) {
+		dev_err(&adapter->pdev->dev, "FW not responding\n");
+		adapter->fw_timeout = true;
+		return -EIO;
+	}
+	return status;
+}
+
+/* Notify MCC requests and wait for completion */
+static int be_mcc_notify_wait(struct be_adapter *adapter)
+{
+	int status;
+	struct be_mcc_wrb *wrb;
+	struct be_mcc_obj *mcc_obj = &adapter->mcc_obj;
+	u16 index = mcc_obj->q.head;
+	struct be_cmd_resp_hdr *resp;
+
+	index_dec(&index, mcc_obj->q.len);
+	wrb = queue_index_node(&mcc_obj->q, index);
+
+	resp = be_decode_resp_hdr(wrb->tag0, wrb->tag1);
+
+	be_mcc_notify(adapter);
+
+	status = be_mcc_wait_compl(adapter);
+	if (status == -EIO)
+		goto out;
+
+	status = (resp->base_status |
+		  ((resp->addl_status & CQE_ADDL_STATUS_MASK) <<
+		   CQE_ADDL_STATUS_SHIFT));
+out:
+	return status;
+}
+
+static int be_mbox_db_ready_wait(struct be_adapter *adapter, void __iomem *db)
+{
+	int msecs = 0;
+	u32 ready;
+
+	do {
+		if (be_error(adapter))
+			return -EIO;
+
+		ready = ioread32(db);
+		if (ready == 0xffffffff)
+			return -1;
+
+		ready &= MPU_MAILBOX_DB_RDY_MASK;
+		if (ready)
+			break;
+
+		if (msecs > 4000) {
+			dev_err(&adapter->pdev->dev, "FW not responding\n");
+			adapter->fw_timeout = true;
+			be_detect_error(adapter);
+			return -1;
+		}
+
+		msleep(1);
+		msecs++;
+	} while (true);
+
+	return 0;
+}
+
+/*
+ * Insert the mailbox address into the doorbell in two steps
+ * Polls on the mbox doorbell till a command completion (or a timeout) occurs
+ */
+static int be_mbox_notify_wait(struct be_adapter *adapter)
+{
+	int status;
+	u32 val = 0;
+	void __iomem *db = adapter->db + MPU_MAILBOX_DB_OFFSET;
+	struct be_dma_mem *mbox_mem = &adapter->mbox_mem;
+	struct be_mcc_mailbox *mbox = mbox_mem->va;
+	struct be_mcc_compl *compl = &mbox->compl;
+
+	/* wait for ready to be set */
+	status = be_mbox_db_ready_wait(adapter, db);
+	if (status != 0)
+		return status;
+
+	val |= MPU_MAILBOX_DB_HI_MASK;
+	/* at bits 2 - 31 place mbox dma addr msb bits 34 - 63 */
+	val |= (upper_32_bits(mbox_mem->dma) >> 2) << 2;
+	iowrite32(val, db);
+
+	/* wait for ready to be set */
+	status = be_mbox_db_ready_wait(adapter, db);
+	if (status != 0)
+		return status;
+
+	val = 0;
+	/* at bits 2 - 31 place mbox dma addr lsb bits 4 - 33 */
+	val |= (u32)(mbox_mem->dma >> 4) << 2;
+	iowrite32(val, db);
+
+	status = be_mbox_db_ready_wait(adapter, db);
+	if (status != 0)
+		return status;
+
+	/* A cq entry has been made now */
+	if (be_mcc_compl_is_new(compl)) {
+		status = be_mcc_compl_process(adapter, &mbox->compl);
+		be_mcc_compl_use(compl);
+		if (status)
+			return status;
+	} else {
+		dev_err(&adapter->pdev->dev, "invalid mailbox completion\n");
+		return -1;
+	}
+	return 0;
+}
+
+static u16 be_POST_stage_get(struct be_adapter *adapter)
+{
+	u32 sem;
+
+	if (BEx_chip(adapter))
+		sem  = ioread32(adapter->csr + SLIPORT_SEMAPHORE_OFFSET_BEx);
+	else
+		pci_read_config_dword(adapter->pdev,
+				      SLIPORT_SEMAPHORE_OFFSET_SH, &sem);
+
+	return sem & POST_STAGE_MASK;
+}
+
+static int lancer_wait_ready(struct be_adapter *adapter)
+{
+#define SLIPORT_READY_TIMEOUT 30
+	u32 sliport_status;
+	int status = 0, i;
+
+	for (i = 0; i < SLIPORT_READY_TIMEOUT; i++) {
+		sliport_status = ioread32(adapter->db + SLIPORT_STATUS_OFFSET);
+		if (sliport_status & SLIPORT_STATUS_RDY_MASK)
+			break;
+
+		msleep(1000);
+	}
+
+	if (i == SLIPORT_READY_TIMEOUT)
+		status = -1;
+
+	return status;
+}
+
+static bool lancer_provisioning_error(struct be_adapter *adapter)
+{
+	u32 sliport_status = 0, sliport_err1 = 0, sliport_err2 = 0;
+
+	sliport_status = ioread32(adapter->db + SLIPORT_STATUS_OFFSET);
+	if (sliport_status & SLIPORT_STATUS_ERR_MASK) {
+		sliport_err1 = ioread32(adapter->db + SLIPORT_ERROR1_OFFSET);
+		sliport_err2 = ioread32(adapter->db + SLIPORT_ERROR2_OFFSET);
+
+		if (sliport_err1 == SLIPORT_ERROR_NO_RESOURCE1 &&
+		    sliport_err2 == SLIPORT_ERROR_NO_RESOURCE2)
+			return true;
+	}
+	return false;
+}
+
+int lancer_test_and_set_rdy_state(struct be_adapter *adapter)
+{
+	int status;
+	u32 sliport_status, err, reset_needed;
+	bool resource_error;
+
+	resource_error = lancer_provisioning_error(adapter);
+	if (resource_error)
+		return -EAGAIN;
+
+	status = lancer_wait_ready(adapter);
+	if (!status) {
+		sliport_status = ioread32(adapter->db + SLIPORT_STATUS_OFFSET);
+		err = sliport_status & SLIPORT_STATUS_ERR_MASK;
+		reset_needed = sliport_status & SLIPORT_STATUS_RN_MASK;
+		if (err && reset_needed) {
+			iowrite32(SLI_PORT_CONTROL_IP_MASK,
+				  adapter->db + SLIPORT_CONTROL_OFFSET);
+
+			/* check adapter has corrected the error */
+			status = lancer_wait_ready(adapter);
+			sliport_status = ioread32(adapter->db +
+						  SLIPORT_STATUS_OFFSET);
+			sliport_status &= (SLIPORT_STATUS_ERR_MASK |
+						SLIPORT_STATUS_RN_MASK);
+			if (status || sliport_status)
+				status = -1;
+		} else if (err || reset_needed) {
+			status = -1;
+		}
+	}
+	/* Stop error recovery if error is not recoverable.
+	 * No resource error is temporary errors and will go away
+	 * when PF provisions resources.
+	 */
+	resource_error = lancer_provisioning_error(adapter);
+	if (resource_error)
+		status = -EAGAIN;
+
+	return status;
+}
+
+int be_fw_wait_ready(struct be_adapter *adapter)
+{
+	u16 stage;
+	int status, timeout = 0;
+	struct device *dev = &adapter->pdev->dev;
+
+	if (lancer_chip(adapter)) {
+		status = lancer_wait_ready(adapter);
+		return status;
+	}
+
+	do {
+		stage = be_POST_stage_get(adapter);
+		if (stage == POST_STAGE_ARMFW_RDY)
+			return 0;
+
+		dev_info(dev, "Waiting for POST, %ds elapsed\n", timeout);
+		if (msleep_interruptible(2000)) {
+			dev_err(dev, "Waiting for POST aborted\n");
+			return -EINTR;
+		}
+		timeout += 2;
+	} while (timeout < 60);
+
+	dev_err(dev, "POST timeout; stage=0x%x\n", stage);
+	return -1;
+}
+
+static inline struct be_sge *nonembedded_sgl(struct be_mcc_wrb *wrb)
+{
+	return &wrb->payload.sgl[0];
+}
+
+static inline void fill_wrb_tags(struct be_mcc_wrb *wrb, unsigned long addr)
+{
+	wrb->tag0 = addr & 0xFFFFFFFF;
+	wrb->tag1 = upper_32_bits(addr);
+}
+
+/* Don't touch the hdr after it's prepared */
+/* mem will be NULL for embedded commands */
+static void be_wrb_cmd_hdr_prepare(struct be_cmd_req_hdr *req_hdr,
+				   u8 subsystem, u8 opcode, int cmd_len,
+				   struct be_mcc_wrb *wrb,
+				   struct be_dma_mem *mem)
+{
+	struct be_sge *sge;
+
+	req_hdr->opcode = opcode;
+	req_hdr->subsystem = subsystem;
+	req_hdr->request_length = cpu_to_le32(cmd_len - sizeof(*req_hdr));
+	req_hdr->version = 0;
+	fill_wrb_tags(wrb, (ulong) req_hdr);
+	wrb->payload_length = cmd_len;
+	if (mem) {
+		wrb->embedded |= (1 & MCC_WRB_SGE_CNT_MASK) <<
+			MCC_WRB_SGE_CNT_SHIFT;
+		sge = nonembedded_sgl(wrb);
+		sge->pa_hi = cpu_to_le32(upper_32_bits(mem->dma));
+		sge->pa_lo = cpu_to_le32(mem->dma & 0xFFFFFFFF);
+		sge->len = cpu_to_le32(mem->size);
+	} else
+		wrb->embedded |= MCC_WRB_EMBEDDED_MASK;
+	be_dws_cpu_to_le(wrb, 8);
+}
+
+static void be_cmd_page_addrs_prepare(struct phys_addr *pages, u32 max_pages,
+				      struct be_dma_mem *mem)
+{
+	int i, buf_pages = min(PAGES_4K_SPANNED(mem->va, mem->size), max_pages);
+	u64 dma = (u64)mem->dma;
+
+	for (i = 0; i < buf_pages; i++) {
+		pages[i].lo = cpu_to_le32(dma & 0xFFFFFFFF);
+		pages[i].hi = cpu_to_le32(upper_32_bits(dma));
+		dma += PAGE_SIZE_4K;
+	}
+}
+
+static inline struct be_mcc_wrb *wrb_from_mbox(struct be_adapter *adapter)
+{
+	struct be_dma_mem *mbox_mem = &adapter->mbox_mem;
+	struct be_mcc_wrb *wrb
+		= &((struct be_mcc_mailbox *)(mbox_mem->va))->wrb;
+	memset(wrb, 0, sizeof(*wrb));
+	return wrb;
+}
+
+static struct be_mcc_wrb *wrb_from_mccq(struct be_adapter *adapter)
+{
+	struct be_queue_info *mccq = &adapter->mcc_obj.q;
+	struct be_mcc_wrb *wrb;
+
+	if (!mccq->created)
+		return NULL;
+
+	if (atomic_read(&mccq->used) >= mccq->len)
+		return NULL;
+
+	wrb = queue_head_node(mccq);
+	queue_head_inc(mccq);
+	atomic_inc(&mccq->used);
+	memset(wrb, 0, sizeof(*wrb));
+	return wrb;
+}
+
+static bool use_mcc(struct be_adapter *adapter)
+{
+	return adapter->mcc_obj.q.created;
+}
+
+/* Must be used only in process context */
+static int be_cmd_lock(struct be_adapter *adapter)
+{
+	if (use_mcc(adapter)) {
+		spin_lock_bh(&adapter->mcc_lock);
+		return 0;
+	} else {
+		return mutex_lock_interruptible(&adapter->mbox_lock);
+	}
+}
+
+/* Must be used only in process context */
+static void be_cmd_unlock(struct be_adapter *adapter)
+{
+	if (use_mcc(adapter))
+		spin_unlock_bh(&adapter->mcc_lock);
+	else
+		return mutex_unlock(&adapter->mbox_lock);
+}
+
+static struct be_mcc_wrb *be_cmd_copy(struct be_adapter *adapter,
+				      struct be_mcc_wrb *wrb)
+{
+	struct be_mcc_wrb *dest_wrb;
+
+	if (use_mcc(adapter)) {
+		dest_wrb = wrb_from_mccq(adapter);
+		if (!dest_wrb)
+			return NULL;
+	} else {
+		dest_wrb = wrb_from_mbox(adapter);
+	}
+
+	memcpy(dest_wrb, wrb, sizeof(*wrb));
+	if (wrb->embedded & cpu_to_le32(MCC_WRB_EMBEDDED_MASK))
+		fill_wrb_tags(dest_wrb, (ulong) embedded_payload(wrb));
+
+	return dest_wrb;
+}
+
+/* Must be used only in process context */
+static int be_cmd_notify_wait(struct be_adapter *adapter,
+			      struct be_mcc_wrb *wrb)
+{
+	struct be_mcc_wrb *dest_wrb;
+	int status;
+
+	status = be_cmd_lock(adapter);
+	if (status)
+		return status;
+
+	dest_wrb = be_cmd_copy(adapter, wrb);
+	if (!dest_wrb)
+		return -EBUSY;
+
+	if (use_mcc(adapter))
+		status = be_mcc_notify_wait(adapter);
+	else
+		status = be_mbox_notify_wait(adapter);
+
+	if (!status)
+		memcpy(wrb, dest_wrb, sizeof(*wrb));
+
+	be_cmd_unlock(adapter);
+	return status;
+}
+
+/* Tell fw we're about to start firing cmds by writing a
+ * special pattern across the wrb hdr; uses mbox
+ */
+int be_cmd_fw_init(struct be_adapter *adapter)
+{
+	u8 *wrb;
+	int status;
+
+	if (lancer_chip(adapter))
+		return 0;
+
+	if (mutex_lock_interruptible(&adapter->mbox_lock))
+		return -1;
+
+	wrb = (u8 *)wrb_from_mbox(adapter);
+	*wrb++ = 0xFF;
+	*wrb++ = 0x12;
+	*wrb++ = 0x34;
+	*wrb++ = 0xFF;
+	*wrb++ = 0xFF;
+	*wrb++ = 0x56;
+	*wrb++ = 0x78;
+	*wrb = 0xFF;
+
+	status = be_mbox_notify_wait(adapter);
+
+	mutex_unlock(&adapter->mbox_lock);
+	return status;
+}
+
+/* Tell fw we're done with firing cmds by writing a
+ * special pattern across the wrb hdr; uses mbox
+ */
+int be_cmd_fw_clean(struct be_adapter *adapter)
+{
+	u8 *wrb;
+	int status;
+
+	if (lancer_chip(adapter))
+		return 0;
+
+	if (mutex_lock_interruptible(&adapter->mbox_lock))
+		return -1;
+
+	wrb = (u8 *)wrb_from_mbox(adapter);
+	*wrb++ = 0xFF;
+	*wrb++ = 0xAA;
+	*wrb++ = 0xBB;
+	*wrb++ = 0xFF;
+	*wrb++ = 0xFF;
+	*wrb++ = 0xCC;
+	*wrb++ = 0xDD;
+	*wrb = 0xFF;
+
+	status = be_mbox_notify_wait(adapter);
+
+	mutex_unlock(&adapter->mbox_lock);
+	return status;
+}
+
+int be_cmd_eq_create(struct be_adapter *adapter, struct be_eq_obj *eqo)
+{
+	struct be_mcc_wrb *wrb;
+	struct be_cmd_req_eq_create *req;
+	struct be_dma_mem *q_mem = &eqo->q.dma_mem;
+	int status, ver = 0;
+
+	if (mutex_lock_interruptible(&adapter->mbox_lock))
+		return -1;
+
+	wrb = wrb_from_mbox(adapter);
+	req = embedded_payload(wrb);
+
+	be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON,
+			       OPCODE_COMMON_EQ_CREATE, sizeof(*req), wrb,
+			       NULL);
+
+	/* Support for EQ_CREATEv2 available only SH-R onwards */
+	if (!(BEx_chip(adapter) || lancer_chip(adapter)))
+		ver = 2;
+
+	req->hdr.version = ver;
+	req->num_pages =  cpu_to_le16(PAGES_4K_SPANNED(q_mem->va, q_mem->size));
+
+	AMAP_SET_BITS(struct amap_eq_context, valid, req->context, 1);
+	/* 4byte eqe*/
+	AMAP_SET_BITS(struct amap_eq_context, size, req->context, 0);
+	AMAP_SET_BITS(struct amap_eq_context, count, req->context,
+		      __ilog2_u32(eqo->q.len / 256));
+	be_dws_cpu_to_le(req->context, sizeof(req->context));
+
+	be_cmd_page_addrs_prepare(req->pages, ARRAY_SIZE(req->pages), q_mem);
+
+	status = be_mbox_notify_wait(adapter);
+	if (!status) {
+		struct be_cmd_resp_eq_create *resp = embedded_payload(wrb);
+
+		eqo->q.id = le16_to_cpu(resp->eq_id);
+		eqo->msix_idx =
+			(ver == 2) ? le16_to_cpu(resp->msix_idx) : eqo->idx;
+		eqo->q.created = true;
+	}
+
+	mutex_unlock(&adapter->mbox_lock);
+	return status;
+}
+
+/* Use MCC */
+int be_cmd_mac_addr_query(struct be_adapter *adapter, u8 *mac_addr,
+			  bool permanent, u32 if_handle, u32 pmac_id)
+{
+	struct be_mcc_wrb *wrb;
+	struct be_cmd_req_mac_query *req;
+	int status;
+
+	spin_lock_bh(&adapter->mcc_lock);
+
+	wrb = wrb_from_mccq(adapter);
+	if (!wrb) {
+		status = -EBUSY;
+		goto err;
+	}
+	req = embedded_payload(wrb);
+
+	be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON,
+			       OPCODE_COMMON_NTWK_MAC_QUERY, sizeof(*req), wrb,
+			       NULL);
+	req->type = MAC_ADDRESS_TYPE_NETWORK;
+	if (permanent) {
+		req->permanent = 1;
+	} else {
+		req->if_id = cpu_to_le16((u16)if_handle);
+		req->pmac_id = cpu_to_le32(pmac_id);
+		req->permanent = 0;
+	}
+
+	status = be_mcc_notify_wait(adapter);
+	if (!status) {
+		struct be_cmd_resp_mac_query *resp = embedded_payload(wrb);
+
+		memcpy(mac_addr, resp->mac.addr, ETH_ALEN);
+	}
+
+err:
+	spin_unlock_bh(&adapter->mcc_lock);
+	return status;
+}
+
+/* Uses synchronous MCCQ */
+int be_cmd_pmac_add(struct be_adapter *adapter, u8 *mac_addr,
+		    u32 if_id, u32 *pmac_id, u32 domain)
+{
+	struct be_mcc_wrb *wrb;
+	struct be_cmd_req_pmac_add *req;
+	int status;
+
+	spin_lock_bh(&adapter->mcc_lock);
+
+	wrb = wrb_from_mccq(adapter);
+	if (!wrb) {
+		status = -EBUSY;
+		goto err;
+	}
+	req = embedded_payload(wrb);
+
+	be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON,
+			       OPCODE_COMMON_NTWK_PMAC_ADD, sizeof(*req), wrb,
+			       NULL);
+
+	req->hdr.domain = domain;
+	req->if_id = cpu_to_le32(if_id);
+	memcpy(req->mac_address, mac_addr, ETH_ALEN);
+
+	status = be_mcc_notify_wait(adapter);
+	if (!status) {
+		struct be_cmd_resp_pmac_add *resp = embedded_payload(wrb);
+
+		*pmac_id = le32_to_cpu(resp->pmac_id);
+	}
+
+err:
+	spin_unlock_bh(&adapter->mcc_lock);
+
+	 if (status == MCC_STATUS_UNAUTHORIZED_REQUEST)
+		status = -EPERM;
+
+	return status;
+}
+
+/* Uses synchronous MCCQ */
+int be_cmd_pmac_del(struct be_adapter *adapter, u32 if_id, int pmac_id, u32 dom)
+{
+	struct be_mcc_wrb *wrb;
+	struct be_cmd_req_pmac_del *req;
+	int status;
+
+	if (pmac_id == -1)
+		return 0;
+
+	spin_lock_bh(&adapter->mcc_lock);
+
+	wrb = wrb_from_mccq(adapter);
+	if (!wrb) {
+		status = -EBUSY;
+		goto err;
+	}
+	req = embedded_payload(wrb);
+
+	be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON,
+			       OPCODE_COMMON_NTWK_PMAC_DEL, sizeof(*req),
+			       wrb, NULL);
+
+	req->hdr.domain = dom;
+	req->if_id = cpu_to_le32(if_id);
+	req->pmac_id = cpu_to_le32(pmac_id);
+
+	status = be_mcc_notify_wait(adapter);
+
+err:
+	spin_unlock_bh(&adapter->mcc_lock);
+	return status;
+}
+
+/* Uses Mbox */
+int be_cmd_cq_create(struct be_adapter *adapter, struct be_queue_info *cq,
+		     struct be_queue_info *eq, bool no_delay, int coalesce_wm)
+{
+	struct be_mcc_wrb *wrb;
+	struct be_cmd_req_cq_create *req;
+	struct be_dma_mem *q_mem = &cq->dma_mem;
+	void *ctxt;
+	int status;
+
+	if (mutex_lock_interruptible(&adapter->mbox_lock))
+		return -1;
+
+	wrb = wrb_from_mbox(adapter);
+	req = embedded_payload(wrb);
+	ctxt = &req->context;
+
+	be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON,
+			       OPCODE_COMMON_CQ_CREATE, sizeof(*req), wrb,
+			       NULL);
+
+	req->num_pages =  cpu_to_le16(PAGES_4K_SPANNED(q_mem->va, q_mem->size));
+
+	if (BEx_chip(adapter)) {
+		AMAP_SET_BITS(struct amap_cq_context_be, coalescwm, ctxt,
+			      coalesce_wm);
+		AMAP_SET_BITS(struct amap_cq_context_be, nodelay,
+			      ctxt, no_delay);
+		AMAP_SET_BITS(struct amap_cq_context_be, count, ctxt,
+			      __ilog2_u32(cq->len / 256));
+		AMAP_SET_BITS(struct amap_cq_context_be, valid, ctxt, 1);
+		AMAP_SET_BITS(struct amap_cq_context_be, eventable, ctxt, 1);
+		AMAP_SET_BITS(struct amap_cq_context_be, eqid, ctxt, eq->id);
+	} else {
+		req->hdr.version = 2;
+		req->page_size = 1; /* 1 for 4K */
+
+		/* coalesce-wm field in this cmd is not relevant to Lancer.
+		 * Lancer uses COMMON_MODIFY_CQ to set this field
+		 */
+		if (!lancer_chip(adapter))
+			AMAP_SET_BITS(struct amap_cq_context_v2, coalescwm,
+				      ctxt, coalesce_wm);
+		AMAP_SET_BITS(struct amap_cq_context_v2, nodelay, ctxt,
+			      no_delay);
+		AMAP_SET_BITS(struct amap_cq_context_v2, count, ctxt,
+			      __ilog2_u32(cq->len / 256));
+		AMAP_SET_BITS(struct amap_cq_context_v2, valid, ctxt, 1);
+		AMAP_SET_BITS(struct amap_cq_context_v2, eventable, ctxt, 1);
+		AMAP_SET_BITS(struct amap_cq_context_v2, eqid, ctxt, eq->id);
+	}
+
+	be_dws_cpu_to_le(ctxt, sizeof(req->context));
+
+	be_cmd_page_addrs_prepare(req->pages, ARRAY_SIZE(req->pages), q_mem);
+
+	status = be_mbox_notify_wait(adapter);
+	if (!status) {
+		struct be_cmd_resp_cq_create *resp = embedded_payload(wrb);
+
+		cq->id = le16_to_cpu(resp->cq_id);
+		cq->created = true;
+	}
+
+	mutex_unlock(&adapter->mbox_lock);
+
+	return status;
+}
+
+static u32 be_encoded_q_len(int q_len)
+{
+	u32 len_encoded = fls(q_len); /* log2(len) + 1 */
+
+	if (len_encoded == 16)
+		len_encoded = 0;
+	return len_encoded;
+}
+
+static int be_cmd_mccq_ext_create(struct be_adapter *adapter,
+				  struct be_queue_info *mccq,
+				  struct be_queue_info *cq)
+{
+	struct be_mcc_wrb *wrb;
+	struct be_cmd_req_mcc_ext_create *req;
+	struct be_dma_mem *q_mem = &mccq->dma_mem;
+	void *ctxt;
+	int status;
+
+	if (mutex_lock_interruptible(&adapter->mbox_lock))
+		return -1;
+
+	wrb = wrb_from_mbox(adapter);
+	req = embedded_payload(wrb);
+	ctxt = &req->context;
+
+	be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON,
+			       OPCODE_COMMON_MCC_CREATE_EXT, sizeof(*req), wrb,
+			       NULL);
+
+	req->num_pages = cpu_to_le16(PAGES_4K_SPANNED(q_mem->va, q_mem->size));
+	if (BEx_chip(adapter)) {
+		AMAP_SET_BITS(struct amap_mcc_context_be, valid, ctxt, 1);
+		AMAP_SET_BITS(struct amap_mcc_context_be, ring_size, ctxt,
+			      be_encoded_q_len(mccq->len));
+		AMAP_SET_BITS(struct amap_mcc_context_be, cq_id, ctxt, cq->id);
+	} else {
+		req->hdr.version = 1;
+		req->cq_id = cpu_to_le16(cq->id);
+
+		AMAP_SET_BITS(struct amap_mcc_context_v1, ring_size, ctxt,
+			      be_encoded_q_len(mccq->len));
+		AMAP_SET_BITS(struct amap_mcc_context_v1, valid, ctxt, 1);
+		AMAP_SET_BITS(struct amap_mcc_context_v1, async_cq_id,
+			      ctxt, cq->id);
+		AMAP_SET_BITS(struct amap_mcc_context_v1, async_cq_valid,
+			      ctxt, 1);
+	}
+
+	/* Subscribe to Link State and Group 5 Events(bits 1 and 5 set) */
+	req->async_event_bitmap[0] = cpu_to_le32(0x00000022);
+	req->async_event_bitmap[0] |= cpu_to_le32(1 << ASYNC_EVENT_CODE_QNQ);
+	be_dws_cpu_to_le(ctxt, sizeof(req->context));
+
+	be_cmd_page_addrs_prepare(req->pages, ARRAY_SIZE(req->pages), q_mem);
+
+	status = be_mbox_notify_wait(adapter);
+	if (!status) {
+		struct be_cmd_resp_mcc_create *resp = embedded_payload(wrb);
+
+		mccq->id = le16_to_cpu(resp->id);
+		mccq->created = true;
+	}
+	mutex_unlock(&adapter->mbox_lock);
+
+	return status;
+}
+
+static int be_cmd_mccq_org_create(struct be_adapter *adapter,
+				  struct be_queue_info *mccq,
+				  struct be_queue_info *cq)
+{
+	struct be_mcc_wrb *wrb;
+	struct be_cmd_req_mcc_create *req;
+	struct be_dma_mem *q_mem = &mccq->dma_mem;
+	void *ctxt;
+	int status;
+
+	if (mutex_lock_interruptible(&adapter->mbox_lock))
+		return -1;
+
+	wrb = wrb_from_mbox(adapter);
+	req = embedded_payload(wrb);
+	ctxt = &req->context;
+
+	be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON,
+			       OPCODE_COMMON_MCC_CREATE, sizeof(*req), wrb,
+			       NULL);
+
+	req->num_pages = cpu_to_le16(PAGES_4K_SPANNED(q_mem->va, q_mem->size));
+
+	AMAP_SET_BITS(struct amap_mcc_context_be, valid, ctxt, 1);
+	AMAP_SET_BITS(struct amap_mcc_context_be, ring_size, ctxt,
+		      be_encoded_q_len(mccq->len));
+	AMAP_SET_BITS(struct amap_mcc_context_be, cq_id, ctxt, cq->id);
+
+	be_dws_cpu_to_le(ctxt, sizeof(req->context));
+
+	be_cmd_page_addrs_prepare(req->pages, ARRAY_SIZE(req->pages), q_mem);
+
+	status = be_mbox_notify_wait(adapter);
+	if (!status) {
+		struct be_cmd_resp_mcc_create *resp = embedded_payload(wrb);
+
+		mccq->id = le16_to_cpu(resp->id);
+		mccq->created = true;
+	}
+
+	mutex_unlock(&adapter->mbox_lock);
+	return status;
+}
+
+int be_cmd_mccq_create(struct be_adapter *adapter,
+		       struct be_queue_info *mccq, struct be_queue_info *cq)
+{
+	int status;
+
+	status = be_cmd_mccq_ext_create(adapter, mccq, cq);
+	if (status && BEx_chip(adapter)) {
+		dev_warn(&adapter->pdev->dev, "Upgrade to F/W ver 2.102.235.0 "
+			"or newer to avoid conflicting priorities between NIC "
+			"and FCoE traffic");
+		status = be_cmd_mccq_org_create(adapter, mccq, cq);
+	}
+	return status;
+}
+
+int be_cmd_txq_create(struct be_adapter *adapter, struct be_tx_obj *txo)
+{
+	struct be_mcc_wrb wrb = {0};
+	struct be_cmd_req_eth_tx_create *req;
+	struct be_queue_info *txq = &txo->q;
+	struct be_queue_info *cq = &txo->cq;
+	struct be_dma_mem *q_mem = &txq->dma_mem;
+	int status, ver = 0;
+
+	req = embedded_payload(&wrb);
+	be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_ETH,
+			       OPCODE_ETH_TX_CREATE, sizeof(*req), &wrb, NULL);
+
+	if (lancer_chip(adapter)) {
+		req->hdr.version = 1;
+	} else if (BEx_chip(adapter)) {
+		if (adapter->function_caps & BE_FUNCTION_CAPS_SUPER_NIC)
+			req->hdr.version = 2;
+	} else { /* For SH */
+		req->hdr.version = 2;
+	}
+
+	if (req->hdr.version > 0)
+		req->if_id = cpu_to_le16(adapter->if_handle);
+	req->num_pages = PAGES_4K_SPANNED(q_mem->va, q_mem->size);
+	req->ulp_num = BE_ULP1_NUM;
+	req->type = BE_ETH_TX_RING_TYPE_STANDARD;
+	req->cq_id = cpu_to_le16(cq->id);
+	req->queue_size = be_encoded_q_len(txq->len);
+	be_cmd_page_addrs_prepare(req->pages, ARRAY_SIZE(req->pages), q_mem);
+	ver = req->hdr.version;
+
+	status = be_cmd_notify_wait(adapter, &wrb);
+	if (!status) {
+		struct be_cmd_resp_eth_tx_create *resp = embedded_payload(&wrb);
+
+		txq->id = le16_to_cpu(resp->cid);
+		if (ver == 2)
+			txo->db_offset = le32_to_cpu(resp->db_offset);
+		else
+			txo->db_offset = DB_TXULP1_OFFSET;
+		txq->created = true;
+	}
+
+	return status;
+}
+
+/* Uses MCC */
+int be_cmd_rxq_create(struct be_adapter *adapter,
+		      struct be_queue_info *rxq, u16 cq_id, u16 frag_size,
+		      u32 if_id, u32 rss, u8 *rss_id)
+{
+	struct be_mcc_wrb *wrb;
+	struct be_cmd_req_eth_rx_create *req;
+	struct be_dma_mem *q_mem = &rxq->dma_mem;
+	int status;
+
+	spin_lock_bh(&adapter->mcc_lock);
+
+	wrb = wrb_from_mccq(adapter);
+	if (!wrb) {
+		status = -EBUSY;
+		goto err;
+	}
+	req = embedded_payload(wrb);
+
+	be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_ETH,
+			       OPCODE_ETH_RX_CREATE, sizeof(*req), wrb, NULL);
+
+	req->cq_id = cpu_to_le16(cq_id);
+	req->frag_size = fls(frag_size) - 1;
+	req->num_pages = 2;
+	be_cmd_page_addrs_prepare(req->pages, ARRAY_SIZE(req->pages), q_mem);
+	req->interface_id = cpu_to_le32(if_id);
+	req->max_frame_size = cpu_to_le16(BE_MAX_JUMBO_FRAME_SIZE);
+	req->rss_queue = cpu_to_le32(rss);
+
+	status = be_mcc_notify_wait(adapter);
+	if (!status) {
+		struct be_cmd_resp_eth_rx_create *resp = embedded_payload(wrb);
+
+		rxq->id = le16_to_cpu(resp->id);
+		rxq->created = true;
+		*rss_id = resp->rss_id;
+	}
+
+err:
+	spin_unlock_bh(&adapter->mcc_lock);
+	return status;
+}
+
+/* Generic destroyer function for all types of queues
+ * Uses Mbox
+ */
+int be_cmd_q_destroy(struct be_adapter *adapter, struct be_queue_info *q,
+		     int queue_type)
+{
+	struct be_mcc_wrb *wrb;
+	struct be_cmd_req_q_destroy *req;
+	u8 subsys = 0, opcode = 0;
+	int status;
+
+	if (mutex_lock_interruptible(&adapter->mbox_lock))
+		return -1;
+
+	wrb = wrb_from_mbox(adapter);
+	req = embedded_payload(wrb);
+
+	switch (queue_type) {
+	case QTYPE_EQ:
+		subsys = CMD_SUBSYSTEM_COMMON;
+		opcode = OPCODE_COMMON_EQ_DESTROY;
+		break;
+	case QTYPE_CQ:
+		subsys = CMD_SUBSYSTEM_COMMON;
+		opcode = OPCODE_COMMON_CQ_DESTROY;
+		break;
+	case QTYPE_TXQ:
+		subsys = CMD_SUBSYSTEM_ETH;
+		opcode = OPCODE_ETH_TX_DESTROY;
+		break;
+	case QTYPE_RXQ:
+		subsys = CMD_SUBSYSTEM_ETH;
+		opcode = OPCODE_ETH_RX_DESTROY;
+		break;
+	case QTYPE_MCCQ:
+		subsys = CMD_SUBSYSTEM_COMMON;
+		opcode = OPCODE_COMMON_MCC_DESTROY;
+		break;
+	default:
+		BUG();
+	}
+
+	be_wrb_cmd_hdr_prepare(&req->hdr, subsys, opcode, sizeof(*req), wrb,
+			       NULL);
+	req->id = cpu_to_le16(q->id);
+
+	status = be_mbox_notify_wait(adapter);
+	q->created = false;
+
+	mutex_unlock(&adapter->mbox_lock);
+	return status;
+}
+
+/* Uses MCC */
+int be_cmd_rxq_destroy(struct be_adapter *adapter, struct be_queue_info *q)
+{
+	struct be_mcc_wrb *wrb;
+	struct be_cmd_req_q_destroy *req;
+	int status;
+
+	spin_lock_bh(&adapter->mcc_lock);
+
+	wrb = wrb_from_mccq(adapter);
+	if (!wrb) {
+		status = -EBUSY;
+		goto err;
+	}
+	req = embedded_payload(wrb);
+
+	be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_ETH,
+			       OPCODE_ETH_RX_DESTROY, sizeof(*req), wrb, NULL);
+	req->id = cpu_to_le16(q->id);
+
+	status = be_mcc_notify_wait(adapter);
+	q->created = false;
+
+err:
+	spin_unlock_bh(&adapter->mcc_lock);
+	return status;
+}
+
+/* Create an rx filtering policy configuration on an i/f
+ * Will use MBOX only if MCCQ has not been created.
+ */
+int be_cmd_if_create(struct be_adapter *adapter, u32 cap_flags, u32 en_flags,
+		     u32 *if_handle, u32 domain)
+{
+	struct be_mcc_wrb wrb = {0};
+	struct be_cmd_req_if_create *req;
+	int status;
+
+	req = embedded_payload(&wrb);
+	be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON,
+			       OPCODE_COMMON_NTWK_INTERFACE_CREATE,
+			       sizeof(*req), &wrb, NULL);
+	req->hdr.domain = domain;
+	req->capability_flags = cpu_to_le32(cap_flags);
+	req->enable_flags = cpu_to_le32(en_flags);
+	req->pmac_invalid = true;
+
+	status = be_cmd_notify_wait(adapter, &wrb);
+	if (!status) {
+		struct be_cmd_resp_if_create *resp = embedded_payload(&wrb);
+
+		*if_handle = le32_to_cpu(resp->interface_id);
+
+		/* Hack to retrieve VF's pmac-id on BE3 */
+		if (BE3_chip(adapter) && !be_physfn(adapter))
+			adapter->pmac_id[0] = le32_to_cpu(resp->pmac_id);
+	}
+	return status;
+}
+
+/* Uses MCCQ */
+int be_cmd_if_destroy(struct be_adapter *adapter, int interface_id, u32 domain)
+{
+	struct be_mcc_wrb *wrb;
+	struct be_cmd_req_if_destroy *req;
+	int status;
+
+	if (interface_id == -1)
+		return 0;
+
+	spin_lock_bh(&adapter->mcc_lock);
+
+	wrb = wrb_from_mccq(adapter);
+	if (!wrb) {
+		status = -EBUSY;
+		goto err;
+	}
+	req = embedded_payload(wrb);
+
+	be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON,
+			       OPCODE_COMMON_NTWK_INTERFACE_DESTROY,
+			       sizeof(*req), wrb, NULL);
+	req->hdr.domain = domain;
+	req->interface_id = cpu_to_le32(interface_id);
+
+	status = be_mcc_notify_wait(adapter);
+err:
+	spin_unlock_bh(&adapter->mcc_lock);
+	return status;
+}
+
+/* Get stats is a non embedded command: the request is not embedded inside
+ * WRB but is a separate dma memory block
+ * Uses asynchronous MCC
+ */
+int be_cmd_get_stats(struct be_adapter *adapter, struct be_dma_mem *nonemb_cmd)
+{
+	struct be_mcc_wrb *wrb;
+	struct be_cmd_req_hdr *hdr;
+	int status = 0;
+
+	spin_lock_bh(&adapter->mcc_lock);
+
+	wrb = wrb_from_mccq(adapter);
+	if (!wrb) {
+		status = -EBUSY;
+		goto err;
+	}
+	hdr = nonemb_cmd->va;
+
+	be_wrb_cmd_hdr_prepare(hdr, CMD_SUBSYSTEM_ETH,
+			       OPCODE_ETH_GET_STATISTICS, nonemb_cmd->size, wrb,
+			       nonemb_cmd);
+
+	/* version 1 of the cmd is not supported only by BE2 */
+	if (BE2_chip(adapter))
+		hdr->version = 0;
+	if (BE3_chip(adapter) || lancer_chip(adapter))
+		hdr->version = 1;
+	else
+		hdr->version = 2;
+
+	be_mcc_notify(adapter);
+	adapter->stats_cmd_sent = true;
+
+err:
+	spin_unlock_bh(&adapter->mcc_lock);
+	return status;
+}
+
+/* Lancer Stats */
+int lancer_cmd_get_pport_stats(struct be_adapter *adapter,
+			       struct be_dma_mem *nonemb_cmd)
+{
+	struct be_mcc_wrb *wrb;
+	struct lancer_cmd_req_pport_stats *req;
+	int status = 0;
+
+	if (!be_cmd_allowed(adapter, OPCODE_ETH_GET_PPORT_STATS,
+			    CMD_SUBSYSTEM_ETH))
+		return -EPERM;
+
+	spin_lock_bh(&adapter->mcc_lock);
+
+	wrb = wrb_from_mccq(adapter);
+	if (!wrb) {
+		status = -EBUSY;
+		goto err;
+	}
+	req = nonemb_cmd->va;
+
+	be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_ETH,
+			       OPCODE_ETH_GET_PPORT_STATS, nonemb_cmd->size,
+			       wrb, nonemb_cmd);
+
+	req->cmd_params.params.pport_num = cpu_to_le16(adapter->hba_port_num);
+	req->cmd_params.params.reset_stats = 0;
+
+	be_mcc_notify(adapter);
+	adapter->stats_cmd_sent = true;
+
+err:
+	spin_unlock_bh(&adapter->mcc_lock);
+	return status;
+}
+
+static int be_mac_to_link_speed(int mac_speed)
+{
+	switch (mac_speed) {
+	case PHY_LINK_SPEED_ZERO:
+		return 0;
+	case PHY_LINK_SPEED_10MBPS:
+		return 10;
+	case PHY_LINK_SPEED_100MBPS:
+		return 100;
+	case PHY_LINK_SPEED_1GBPS:
+		return 1000;
+	case PHY_LINK_SPEED_10GBPS:
+		return 10000;
+	case PHY_LINK_SPEED_20GBPS:
+		return 20000;
+	case PHY_LINK_SPEED_25GBPS:
+		return 25000;
+	case PHY_LINK_SPEED_40GBPS:
+		return 40000;
+	}
+	return 0;
+}
+
+/* Uses synchronous mcc
+ * Returns link_speed in Mbps
+ */
+int be_cmd_link_status_query(struct be_adapter *adapter, u16 *link_speed,
+			     u8 *link_status, u32 dom)
+{
+	struct be_mcc_wrb *wrb;
+	struct be_cmd_req_link_status *req;
+	int status;
+
+	spin_lock_bh(&adapter->mcc_lock);
+
+	if (link_status)
+		*link_status = LINK_DOWN;
+
+	wrb = wrb_from_mccq(adapter);
+	if (!wrb) {
+		status = -EBUSY;
+		goto err;
+	}
+	req = embedded_payload(wrb);
+
+	be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON,
+			       OPCODE_COMMON_NTWK_LINK_STATUS_QUERY,
+			       sizeof(*req), wrb, NULL);
+
+	/* version 1 of the cmd is not supported only by BE2 */
+	if (!BE2_chip(adapter))
+		req->hdr.version = 1;
+
+	req->hdr.domain = dom;
+
+	status = be_mcc_notify_wait(adapter);
+	if (!status) {
+		struct be_cmd_resp_link_status *resp = embedded_payload(wrb);
+
+		if (link_speed) {
+			*link_speed = resp->link_speed ?
+				      le16_to_cpu(resp->link_speed) * 10 :
+				      be_mac_to_link_speed(resp->mac_speed);
+
+			if (!resp->logical_link_status)
+				*link_speed = 0;
+		}
+		if (link_status)
+			*link_status = resp->logical_link_status;
+	}
+
+err:
+	spin_unlock_bh(&adapter->mcc_lock);
+	return status;
+}
+
+/* Uses synchronous mcc */
+int be_cmd_get_die_temperature(struct be_adapter *adapter)
+{
+	struct be_mcc_wrb *wrb;
+	struct be_cmd_req_get_cntl_addnl_attribs *req;
+	int status = 0;
+
+	spin_lock_bh(&adapter->mcc_lock);
+
+	wrb = wrb_from_mccq(adapter);
+	if (!wrb) {
+		status = -EBUSY;
+		goto err;
+	}
+	req = embedded_payload(wrb);
+
+	be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON,
+			       OPCODE_COMMON_GET_CNTL_ADDITIONAL_ATTRIBUTES,
+			       sizeof(*req), wrb, NULL);
+
+	be_mcc_notify(adapter);
+
+err:
+	spin_unlock_bh(&adapter->mcc_lock);
+	return status;
+}
+
+/* Uses synchronous mcc */
+int be_cmd_get_reg_len(struct be_adapter *adapter, u32 *log_size)
+{
+	struct be_mcc_wrb *wrb;
+	struct be_cmd_req_get_fat *req;
+	int status;
+
+	spin_lock_bh(&adapter->mcc_lock);
+
+	wrb = wrb_from_mccq(adapter);
+	if (!wrb) {
+		status = -EBUSY;
+		goto err;
+	}
+	req = embedded_payload(wrb);
+
+	be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON,
+			       OPCODE_COMMON_MANAGE_FAT, sizeof(*req), wrb,
+			       NULL);
+	req->fat_operation = cpu_to_le32(QUERY_FAT);
+	status = be_mcc_notify_wait(adapter);
+	if (!status) {
+		struct be_cmd_resp_get_fat *resp = embedded_payload(wrb);
+
+		if (log_size && resp->log_size)
+			*log_size = le32_to_cpu(resp->log_size) -
+					sizeof(u32);
+	}
+err:
+	spin_unlock_bh(&adapter->mcc_lock);
+	return status;
+}
+
+int be_cmd_get_regs(struct be_adapter *adapter, u32 buf_len, void *buf)
+{
+	struct be_dma_mem get_fat_cmd;
+	struct be_mcc_wrb *wrb;
+	struct be_cmd_req_get_fat *req;
+	u32 offset = 0, total_size, buf_size,
+				log_offset = sizeof(u32), payload_len;
+	int status = 0;
+
+	if (buf_len == 0)
+		return -EIO;
+
+	total_size = buf_len;
+
+	get_fat_cmd.size = sizeof(struct be_cmd_req_get_fat) + 60*1024;
+	get_fat_cmd.va = pci_alloc_consistent(adapter->pdev,
+					      get_fat_cmd.size,
+					      &get_fat_cmd.dma);
+	if (!get_fat_cmd.va) {
+		dev_err(&adapter->pdev->dev,
+			"Memory allocation failure while reading FAT data\n");
+		return -ENOMEM;
+	}
+
+	spin_lock_bh(&adapter->mcc_lock);
+
+	while (total_size) {
+		buf_size = min(total_size, (u32)60*1024);
+		total_size -= buf_size;
+
+		wrb = wrb_from_mccq(adapter);
+		if (!wrb) {
+			status = -EBUSY;
+			goto err;
+		}
+		req = get_fat_cmd.va;
+
+		payload_len = sizeof(struct be_cmd_req_get_fat) + buf_size;
+		be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON,
+				       OPCODE_COMMON_MANAGE_FAT, payload_len,
+				       wrb, &get_fat_cmd);
+
+		req->fat_operation = cpu_to_le32(RETRIEVE_FAT);
+		req->read_log_offset = cpu_to_le32(log_offset);
+		req->read_log_length = cpu_to_le32(buf_size);
+		req->data_buffer_size = cpu_to_le32(buf_size);
+
+		status = be_mcc_notify_wait(adapter);
+		if (!status) {
+			struct be_cmd_resp_get_fat *resp = get_fat_cmd.va;
+
+			memcpy(buf + offset,
+			       resp->data_buffer,
+			       le32_to_cpu(resp->read_log_length));
+		} else {
+			dev_err(&adapter->pdev->dev, "FAT Table Retrieve error\n");
+			goto err;
+		}
+		offset += buf_size;
+		log_offset += buf_size;
+	}
+err:
+	pci_free_consistent(adapter->pdev, get_fat_cmd.size,
+			    get_fat_cmd.va, get_fat_cmd.dma);
+	spin_unlock_bh(&adapter->mcc_lock);
+	return status;
+}
+
+/* Uses synchronous mcc */
+int be_cmd_get_fw_ver(struct be_adapter *adapter)
+{
+	struct be_mcc_wrb *wrb;
+	struct be_cmd_req_get_fw_version *req;
+	int status;
+
+	spin_lock_bh(&adapter->mcc_lock);
+
+	wrb = wrb_from_mccq(adapter);
+	if (!wrb) {
+		status = -EBUSY;
+		goto err;
+	}
+
+	req = embedded_payload(wrb);
+
+	be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON,
+			       OPCODE_COMMON_GET_FW_VERSION, sizeof(*req), wrb,
+			       NULL);
+	status = be_mcc_notify_wait(adapter);
+	if (!status) {
+		struct be_cmd_resp_get_fw_version *resp = embedded_payload(wrb);
+
+		strlcpy(adapter->fw_ver, resp->firmware_version_string,
+			sizeof(adapter->fw_ver));
+		strlcpy(adapter->fw_on_flash, resp->fw_on_flash_version_string,
+			sizeof(adapter->fw_on_flash));
+	}
+err:
+	spin_unlock_bh(&adapter->mcc_lock);
+	return status;
+}
+
+/* set the EQ delay interval of an EQ to specified value
+ * Uses async mcc
+ */
+static int __be_cmd_modify_eqd(struct be_adapter *adapter,
+			       struct be_set_eqd *set_eqd, int num)
+{
+	struct be_mcc_wrb *wrb;
+	struct be_cmd_req_modify_eq_delay *req;
+	int status = 0, i;
+
+	spin_lock_bh(&adapter->mcc_lock);
+
+	wrb = wrb_from_mccq(adapter);
+	if (!wrb) {
+		status = -EBUSY;
+		goto err;
+	}
+	req = embedded_payload(wrb);
+
+	be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON,
+			       OPCODE_COMMON_MODIFY_EQ_DELAY, sizeof(*req), wrb,
+			       NULL);
+
+	req->num_eq = cpu_to_le32(num);
+	for (i = 0; i < num; i++) {
+		req->set_eqd[i].eq_id = cpu_to_le32(set_eqd[i].eq_id);
+		req->set_eqd[i].phase = 0;
+		req->set_eqd[i].delay_multiplier =
+				cpu_to_le32(set_eqd[i].delay_multiplier);
+	}
+
+	be_mcc_notify(adapter);
+err:
+	spin_unlock_bh(&adapter->mcc_lock);
+	return status;
+}
+
+int be_cmd_modify_eqd(struct be_adapter *adapter, struct be_set_eqd *set_eqd,
+		      int num)
+{
+	int num_eqs, i = 0;
+
+	if (lancer_chip(adapter) && num > 8) {
+		while (num) {
+			num_eqs = min(num, 8);
+			__be_cmd_modify_eqd(adapter, &set_eqd[i], num_eqs);
+			i += num_eqs;
+			num -= num_eqs;
+		}
+	} else {
+		__be_cmd_modify_eqd(adapter, set_eqd, num);
+	}
+
+	return 0;
+}
+
+/* Uses sycnhronous mcc */
+int be_cmd_vlan_config(struct be_adapter *adapter, u32 if_id, u16 *vtag_array,
+		       u32 num)
+{
+	struct be_mcc_wrb *wrb;
+	struct be_cmd_req_vlan_config *req;
+	int status;
+
+	spin_lock_bh(&adapter->mcc_lock);
+
+	wrb = wrb_from_mccq(adapter);
+	if (!wrb) {
+		status = -EBUSY;
+		goto err;
+	}
+	req = embedded_payload(wrb);
+
+	be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON,
+			       OPCODE_COMMON_NTWK_VLAN_CONFIG, sizeof(*req),
+			       wrb, NULL);
+
+	req->interface_id = if_id;
+	req->untagged = BE_IF_FLAGS_UNTAGGED & be_if_cap_flags(adapter) ? 1 : 0;
+	req->num_vlan = num;
+	memcpy(req->normal_vlan, vtag_array,
+	       req->num_vlan * sizeof(vtag_array[0]));
+
+	status = be_mcc_notify_wait(adapter);
+err:
+	spin_unlock_bh(&adapter->mcc_lock);
+	return status;
+}
+
+int be_cmd_rx_filter(struct be_adapter *adapter, u32 flags, u32 value)
+{
+	struct be_mcc_wrb *wrb;
+	struct be_dma_mem *mem = &adapter->rx_filter;
+	struct be_cmd_req_rx_filter *req = mem->va;
+	int status;
+
+	spin_lock_bh(&adapter->mcc_lock);
+
+	wrb = wrb_from_mccq(adapter);
+	if (!wrb) {
+		status = -EBUSY;
+		goto err;
+	}
+	memset(req, 0, sizeof(*req));
+	be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON,
+			       OPCODE_COMMON_NTWK_RX_FILTER, sizeof(*req),
+			       wrb, mem);
+
+	req->if_id = cpu_to_le32(adapter->if_handle);
+	if (flags & IFF_PROMISC) {
+		req->if_flags_mask = cpu_to_le32(BE_IF_FLAGS_PROMISCUOUS |
+						 BE_IF_FLAGS_VLAN_PROMISCUOUS |
+						 BE_IF_FLAGS_MCAST_PROMISCUOUS);
+		if (value == ON)
+			req->if_flags =
+				cpu_to_le32(BE_IF_FLAGS_PROMISCUOUS |
+					    BE_IF_FLAGS_VLAN_PROMISCUOUS |
+					    BE_IF_FLAGS_MCAST_PROMISCUOUS);
+	} else if (flags & IFF_ALLMULTI) {
+		req->if_flags_mask = cpu_to_le32(BE_IF_FLAGS_MCAST_PROMISCUOUS);
+		req->if_flags =	cpu_to_le32(BE_IF_FLAGS_MCAST_PROMISCUOUS);
+	} else if (flags & BE_FLAGS_VLAN_PROMISC) {
+		req->if_flags_mask = cpu_to_le32(BE_IF_FLAGS_VLAN_PROMISCUOUS);
+
+		if (value == ON)
+			req->if_flags =
+				cpu_to_le32(BE_IF_FLAGS_VLAN_PROMISCUOUS);
+	} else {
+		struct netdev_hw_addr *ha;
+		int i = 0;
+
+		req->if_flags_mask = cpu_to_le32(BE_IF_FLAGS_MULTICAST);
+		req->if_flags =	cpu_to_le32(BE_IF_FLAGS_MULTICAST);
+
+		/* Reset mcast promisc mode if already set by setting mask
+		 * and not setting flags field
+		 */
+		req->if_flags_mask |=
+			cpu_to_le32(BE_IF_FLAGS_MCAST_PROMISCUOUS &
+				    be_if_cap_flags(adapter));
+		req->mcast_num = cpu_to_le32(netdev_mc_count(adapter->netdev));
+		netdev_for_each_mc_addr(ha, adapter->netdev)
+			memcpy(req->mcast_mac[i++].byte, ha->addr, ETH_ALEN);
+	}
+
+	if ((req->if_flags_mask & cpu_to_le32(be_if_cap_flags(adapter))) !=
+	    req->if_flags_mask) {
+		dev_warn(&adapter->pdev->dev,
+			 "Cannot set rx filter flags 0x%x\n",
+			 req->if_flags_mask);
+		dev_warn(&adapter->pdev->dev,
+			 "Interface is capable of 0x%x flags only\n",
+			 be_if_cap_flags(adapter));
+	}
+	req->if_flags_mask &= cpu_to_le32(be_if_cap_flags(adapter));
+
+	status = be_mcc_notify_wait(adapter);
+
+err:
+	spin_unlock_bh(&adapter->mcc_lock);
+	return status;
+}
+
+/* Uses synchrounous mcc */
+int be_cmd_set_flow_control(struct be_adapter *adapter, u32 tx_fc, u32 rx_fc)
+{
+	struct be_mcc_wrb *wrb;
+	struct be_cmd_req_set_flow_control *req;
+	int status;
+
+	if (!be_cmd_allowed(adapter, OPCODE_COMMON_SET_FLOW_CONTROL,
+			    CMD_SUBSYSTEM_COMMON))
+		return -EPERM;
+
+	spin_lock_bh(&adapter->mcc_lock);
+
+	wrb = wrb_from_mccq(adapter);
+	if (!wrb) {
+		status = -EBUSY;
+		goto err;
+	}
+	req = embedded_payload(wrb);
+
+	be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON,
+			       OPCODE_COMMON_SET_FLOW_CONTROL, sizeof(*req),
+			       wrb, NULL);
+
+	req->hdr.version = 1;
+	req->tx_flow_control = cpu_to_le16((u16)tx_fc);
+	req->rx_flow_control = cpu_to_le16((u16)rx_fc);
+
+	status = be_mcc_notify_wait(adapter);
+
+err:
+	spin_unlock_bh(&adapter->mcc_lock);
+
+	if (base_status(status) == MCC_STATUS_FEATURE_NOT_SUPPORTED)
+		return  -EOPNOTSUPP;
+
+	return status;
+}
+
+/* Uses sycn mcc */
+int be_cmd_get_flow_control(struct be_adapter *adapter, u32 *tx_fc, u32 *rx_fc)
+{
+	struct be_mcc_wrb *wrb;
+	struct be_cmd_req_get_flow_control *req;
+	int status;
+
+	if (!be_cmd_allowed(adapter, OPCODE_COMMON_GET_FLOW_CONTROL,
+			    CMD_SUBSYSTEM_COMMON))
+		return -EPERM;
+
+	spin_lock_bh(&adapter->mcc_lock);
+
+	wrb = wrb_from_mccq(adapter);
+	if (!wrb) {
+		status = -EBUSY;
+		goto err;
+	}
+	req = embedded_payload(wrb);
+
+	be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON,
+			       OPCODE_COMMON_GET_FLOW_CONTROL, sizeof(*req),
+			       wrb, NULL);
+
+	status = be_mcc_notify_wait(adapter);
+	if (!status) {
+		struct be_cmd_resp_get_flow_control *resp =
+						embedded_payload(wrb);
+
+		*tx_fc = le16_to_cpu(resp->tx_flow_control);
+		*rx_fc = le16_to_cpu(resp->rx_flow_control);
+	}
+
+err:
+	spin_unlock_bh(&adapter->mcc_lock);
+	return status;
+}
+
+/* Uses mbox */
+int be_cmd_query_fw_cfg(struct be_adapter *adapter)
+{
+	struct be_mcc_wrb *wrb;
+	struct be_cmd_req_query_fw_cfg *req;
+	int status;
+
+	if (mutex_lock_interruptible(&adapter->mbox_lock))
+		return -1;
+
+	wrb = wrb_from_mbox(adapter);
+	req = embedded_payload(wrb);
+
+	be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON,
+			       OPCODE_COMMON_QUERY_FIRMWARE_CONFIG,
+			       sizeof(*req), wrb, NULL);
+
+	status = be_mbox_notify_wait(adapter);
+	if (!status) {
+		struct be_cmd_resp_query_fw_cfg *resp = embedded_payload(wrb);
+
+		adapter->port_num = le32_to_cpu(resp->phys_port);
+		adapter->function_mode = le32_to_cpu(resp->function_mode);
+		adapter->function_caps = le32_to_cpu(resp->function_caps);
+		adapter->asic_rev = le32_to_cpu(resp->asic_revision) & 0xFF;
+		dev_info(&adapter->pdev->dev,
+			 "FW config: function_mode=0x%x, function_caps=0x%x\n",
+			 adapter->function_mode, adapter->function_caps);
+	}
+
+	mutex_unlock(&adapter->mbox_lock);
+	return status;
+}
+
+/* Uses mbox */
+int be_cmd_reset_function(struct be_adapter *adapter)
+{
+	struct be_mcc_wrb *wrb;
+	struct be_cmd_req_hdr *req;
+	int status;
+
+	if (lancer_chip(adapter)) {
+		status = lancer_wait_ready(adapter);
+		if (!status) {
+			iowrite32(SLI_PORT_CONTROL_IP_MASK,
+				  adapter->db + SLIPORT_CONTROL_OFFSET);
+			status = lancer_test_and_set_rdy_state(adapter);
+		}
+		if (status) {
+			dev_err(&adapter->pdev->dev,
+				"Adapter in non recoverable error\n");
+		}
+		return status;
+	}
+
+	if (mutex_lock_interruptible(&adapter->mbox_lock))
+		return -1;
+
+	wrb = wrb_from_mbox(adapter);
+	req = embedded_payload(wrb);
+
+	be_wrb_cmd_hdr_prepare(req, CMD_SUBSYSTEM_COMMON,
+			       OPCODE_COMMON_FUNCTION_RESET, sizeof(*req), wrb,
+			       NULL);
+
+	status = be_mbox_notify_wait(adapter);
+
+	mutex_unlock(&adapter->mbox_lock);
+	return status;
+}
+
+int be_cmd_rss_config(struct be_adapter *adapter, u8 *rsstable,
+		      u32 rss_hash_opts, u16 table_size, const u8 *rss_hkey)
+{
+	struct be_mcc_wrb *wrb;
+	struct be_cmd_req_rss_config *req;
+	int status;
+
+	if (!(be_if_cap_flags(adapter) & BE_IF_FLAGS_RSS))
+		return 0;
+
+	spin_lock_bh(&adapter->mcc_lock);
+
+	wrb = wrb_from_mccq(adapter);
+	if (!wrb) {
+		status = -EBUSY;
+		goto err;
+	}
+	req = embedded_payload(wrb);
+
+	be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_ETH,
+			       OPCODE_ETH_RSS_CONFIG, sizeof(*req), wrb, NULL);
+
+	req->if_id = cpu_to_le32(adapter->if_handle);
+	req->enable_rss = cpu_to_le16(rss_hash_opts);
+	req->cpu_table_size_log2 = cpu_to_le16(fls(table_size) - 1);
+
+	if (!BEx_chip(adapter))
+		req->hdr.version = 1;
+
+	memcpy(req->cpu_table, rsstable, table_size);
+	memcpy(req->hash, rss_hkey, RSS_HASH_KEY_LEN);
+	be_dws_cpu_to_le(req->hash, sizeof(req->hash));
+
+	status = be_mcc_notify_wait(adapter);
+err:
+	spin_unlock_bh(&adapter->mcc_lock);
+	return status;
+}
+
+/* Uses sync mcc */
+int be_cmd_set_beacon_state(struct be_adapter *adapter, u8 port_num,
+			    u8 bcn, u8 sts, u8 state)
+{
+	struct be_mcc_wrb *wrb;
+	struct be_cmd_req_enable_disable_beacon *req;
+	int status;
+
+	spin_lock_bh(&adapter->mcc_lock);
+
+	wrb = wrb_from_mccq(adapter);
+	if (!wrb) {
+		status = -EBUSY;
+		goto err;
+	}
+	req = embedded_payload(wrb);
+
+	be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON,
+			       OPCODE_COMMON_ENABLE_DISABLE_BEACON,
+			       sizeof(*req), wrb, NULL);
+
+	req->port_num = port_num;
+	req->beacon_state = state;
+	req->beacon_duration = bcn;
+	req->status_duration = sts;
+
+	status = be_mcc_notify_wait(adapter);
+
+err:
+	spin_unlock_bh(&adapter->mcc_lock);
+	return status;
+}
+
+/* Uses sync mcc */
+int be_cmd_get_beacon_state(struct be_adapter *adapter, u8 port_num, u32 *state)
+{
+	struct be_mcc_wrb *wrb;
+	struct be_cmd_req_get_beacon_state *req;
+	int status;
+
+	spin_lock_bh(&adapter->mcc_lock);
+
+	wrb = wrb_from_mccq(adapter);
+	if (!wrb) {
+		status = -EBUSY;
+		goto err;
+	}
+	req = embedded_payload(wrb);
+
+	be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON,
+			       OPCODE_COMMON_GET_BEACON_STATE, sizeof(*req),
+			       wrb, NULL);
+
+	req->port_num = port_num;
+
+	status = be_mcc_notify_wait(adapter);
+	if (!status) {
+		struct be_cmd_resp_get_beacon_state *resp =
+						embedded_payload(wrb);
+
+		*state = resp->beacon_state;
+	}
+
+err:
+	spin_unlock_bh(&adapter->mcc_lock);
+	return status;
+}
+
+/* Uses sync mcc */
+int be_cmd_read_port_transceiver_data(struct be_adapter *adapter,
+				      u8 page_num, u8 *data)
+{
+	struct be_dma_mem cmd;
+	struct be_mcc_wrb *wrb;
+	struct be_cmd_req_port_type *req;
+	int status;
+
+	if (page_num > TR_PAGE_A2)
+		return -EINVAL;
+
+	cmd.size = sizeof(struct be_cmd_resp_port_type);
+	cmd.va = pci_alloc_consistent(adapter->pdev, cmd.size, &cmd.dma);
+	if (!cmd.va) {
+		dev_err(&adapter->pdev->dev, "Memory allocation failed\n");
+		return -ENOMEM;
+	}
+	memset(cmd.va, 0, cmd.size);
+
+	spin_lock_bh(&adapter->mcc_lock);
+
+	wrb = wrb_from_mccq(adapter);
+	if (!wrb) {
+		status = -EBUSY;
+		goto err;
+	}
+	req = cmd.va;
+
+	be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON,
+			       OPCODE_COMMON_READ_TRANSRECV_DATA,
+			       cmd.size, wrb, &cmd);
+
+	req->port = cpu_to_le32(adapter->hba_port_num);
+	req->page_num = cpu_to_le32(page_num);
+	status = be_mcc_notify_wait(adapter);
+	if (!status) {
+		struct be_cmd_resp_port_type *resp = cmd.va;
+
+		memcpy(data, resp->page_data, PAGE_DATA_LEN);
+	}
+err:
+	spin_unlock_bh(&adapter->mcc_lock);
+	pci_free_consistent(adapter->pdev, cmd.size, cmd.va, cmd.dma);
+	return status;
+}
+
+int lancer_cmd_write_object(struct be_adapter *adapter, struct be_dma_mem *cmd,
+			    u32 data_size, u32 data_offset,
+			    const char *obj_name, u32 *data_written,
+			    u8 *change_status, u8 *addn_status)
+{
+	struct be_mcc_wrb *wrb;
+	struct lancer_cmd_req_write_object *req;
+	struct lancer_cmd_resp_write_object *resp;
+	void *ctxt = NULL;
+	int status;
+
+	spin_lock_bh(&adapter->mcc_lock);
+	adapter->flash_status = 0;
+
+	wrb = wrb_from_mccq(adapter);
+	if (!wrb) {
+		status = -EBUSY;
+		goto err_unlock;
+	}
+
+	req = embedded_payload(wrb);
+
+	be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON,
+			       OPCODE_COMMON_WRITE_OBJECT,
+			       sizeof(struct lancer_cmd_req_write_object), wrb,
+			       NULL);
+
+	ctxt = &req->context;
+	AMAP_SET_BITS(struct amap_lancer_write_obj_context,
+		      write_length, ctxt, data_size);
+
+	if (data_size == 0)
+		AMAP_SET_BITS(struct amap_lancer_write_obj_context,
+			      eof, ctxt, 1);
+	else
+		AMAP_SET_BITS(struct amap_lancer_write_obj_context,
+			      eof, ctxt, 0);
+
+	be_dws_cpu_to_le(ctxt, sizeof(req->context));
+	req->write_offset = cpu_to_le32(data_offset);
+	strlcpy(req->object_name, obj_name, sizeof(req->object_name));
+	req->descriptor_count = cpu_to_le32(1);
+	req->buf_len = cpu_to_le32(data_size);
+	req->addr_low = cpu_to_le32((cmd->dma +
+				     sizeof(struct lancer_cmd_req_write_object))
+				    & 0xFFFFFFFF);
+	req->addr_high = cpu_to_le32(upper_32_bits(cmd->dma +
+				sizeof(struct lancer_cmd_req_write_object)));
+
+	be_mcc_notify(adapter);
+	spin_unlock_bh(&adapter->mcc_lock);
+
+	if (!wait_for_completion_timeout(&adapter->et_cmd_compl,
+					 msecs_to_jiffies(60000)))
+		status = -ETIMEDOUT;
+	else
+		status = adapter->flash_status;
+
+	resp = embedded_payload(wrb);
+	if (!status) {
+		*data_written = le32_to_cpu(resp->actual_write_len);
+		*change_status = resp->change_status;
+	} else {
+		*addn_status = resp->additional_status;
+	}
+
+	return status;
+
+err_unlock:
+	spin_unlock_bh(&adapter->mcc_lock);
+	return status;
+}
+
+int be_cmd_query_cable_type(struct be_adapter *adapter)
+{
+	u8 page_data[PAGE_DATA_LEN];
+	int status;
+
+	status = be_cmd_read_port_transceiver_data(adapter, TR_PAGE_A0,
+						   page_data);
+	if (!status) {
+		switch (adapter->phy.interface_type) {
+		case PHY_TYPE_QSFP:
+			adapter->phy.cable_type =
+				page_data[QSFP_PLUS_CABLE_TYPE_OFFSET];
+			break;
+		case PHY_TYPE_SFP_PLUS_10GB:
+			adapter->phy.cable_type =
+				page_data[SFP_PLUS_CABLE_TYPE_OFFSET];
+			break;
+		default:
+			adapter->phy.cable_type = 0;
+			break;
+		}
+	}
+	return status;
+}
+
+int lancer_cmd_delete_object(struct be_adapter *adapter, const char *obj_name)
+{
+	struct lancer_cmd_req_delete_object *req;
+	struct be_mcc_wrb *wrb;
+	int status;
+
+	spin_lock_bh(&adapter->mcc_lock);
+
+	wrb = wrb_from_mccq(adapter);
+	if (!wrb) {
+		status = -EBUSY;
+		goto err;
+	}
+
+	req = embedded_payload(wrb);
+
+	be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON,
+			       OPCODE_COMMON_DELETE_OBJECT,
+			       sizeof(*req), wrb, NULL);
+
+	strlcpy(req->object_name, obj_name, sizeof(req->object_name));
+
+	status = be_mcc_notify_wait(adapter);
+err:
+	spin_unlock_bh(&adapter->mcc_lock);
+	return status;
+}
+
+int lancer_cmd_read_object(struct be_adapter *adapter, struct be_dma_mem *cmd,
+			   u32 data_size, u32 data_offset, const char *obj_name,
+			   u32 *data_read, u32 *eof, u8 *addn_status)
+{
+	struct be_mcc_wrb *wrb;
+	struct lancer_cmd_req_read_object *req;
+	struct lancer_cmd_resp_read_object *resp;
+	int status;
+
+	spin_lock_bh(&adapter->mcc_lock);
+
+	wrb = wrb_from_mccq(adapter);
+	if (!wrb) {
+		status = -EBUSY;
+		goto err_unlock;
+	}
+
+	req = embedded_payload(wrb);
+
+	be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON,
+			       OPCODE_COMMON_READ_OBJECT,
+			       sizeof(struct lancer_cmd_req_read_object), wrb,
+			       NULL);
+
+	req->desired_read_len = cpu_to_le32(data_size);
+	req->read_offset = cpu_to_le32(data_offset);
+	strcpy(req->object_name, obj_name);
+	req->descriptor_count = cpu_to_le32(1);
+	req->buf_len = cpu_to_le32(data_size);
+	req->addr_low = cpu_to_le32((cmd->dma & 0xFFFFFFFF));
+	req->addr_high = cpu_to_le32(upper_32_bits(cmd->dma));
+
+	status = be_mcc_notify_wait(adapter);
+
+	resp = embedded_payload(wrb);
+	if (!status) {
+		*data_read = le32_to_cpu(resp->actual_read_len);
+		*eof = le32_to_cpu(resp->eof);
+	} else {
+		*addn_status = resp->additional_status;
+	}
+
+err_unlock:
+	spin_unlock_bh(&adapter->mcc_lock);
+	return status;
+}
+
+int be_cmd_write_flashrom(struct be_adapter *adapter, struct be_dma_mem *cmd,
+			  u32 flash_type, u32 flash_opcode, u32 buf_size)
+{
+	struct be_mcc_wrb *wrb;
+	struct be_cmd_write_flashrom *req;
+	int status;
+
+	spin_lock_bh(&adapter->mcc_lock);
+	adapter->flash_status = 0;
+
+	wrb = wrb_from_mccq(adapter);
+	if (!wrb) {
+		status = -EBUSY;
+		goto err_unlock;
+	}
+	req = cmd->va;
+
+	be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON,
+			       OPCODE_COMMON_WRITE_FLASHROM, cmd->size, wrb,
+			       cmd);
+
+	req->params.op_type = cpu_to_le32(flash_type);
+	req->params.op_code = cpu_to_le32(flash_opcode);
+	req->params.data_buf_size = cpu_to_le32(buf_size);
+
+	be_mcc_notify(adapter);
+	spin_unlock_bh(&adapter->mcc_lock);
+
+	if (!wait_for_completion_timeout(&adapter->et_cmd_compl,
+					 msecs_to_jiffies(40000)))
+		status = -ETIMEDOUT;
+	else
+		status = adapter->flash_status;
+
+	return status;
+
+err_unlock:
+	spin_unlock_bh(&adapter->mcc_lock);
+	return status;
+}
+
+int be_cmd_get_flash_crc(struct be_adapter *adapter, u8 *flashed_crc,
+			 u16 optype, int offset)
+{
+	struct be_mcc_wrb *wrb;
+	struct be_cmd_read_flash_crc *req;
+	int status;
+
+	spin_lock_bh(&adapter->mcc_lock);
+
+	wrb = wrb_from_mccq(adapter);
+	if (!wrb) {
+		status = -EBUSY;
+		goto err;
+	}
+	req = embedded_payload(wrb);
+
+	be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON,
+			       OPCODE_COMMON_READ_FLASHROM, sizeof(*req),
+			       wrb, NULL);
+
+	req->params.op_type = cpu_to_le32(optype);
+	req->params.op_code = cpu_to_le32(FLASHROM_OPER_REPORT);
+	req->params.offset = cpu_to_le32(offset);
+	req->params.data_buf_size = cpu_to_le32(0x4);
+
+	status = be_mcc_notify_wait(adapter);
+	if (!status)
+		memcpy(flashed_crc, req->crc, 4);
+
+err:
+	spin_unlock_bh(&adapter->mcc_lock);
+	return status;
+}
+
+int be_cmd_enable_magic_wol(struct be_adapter *adapter, u8 *mac,
+			    struct be_dma_mem *nonemb_cmd)
+{
+	struct be_mcc_wrb *wrb;
+	struct be_cmd_req_acpi_wol_magic_config *req;
+	int status;
+
+	spin_lock_bh(&adapter->mcc_lock);
+
+	wrb = wrb_from_mccq(adapter);
+	if (!wrb) {
+		status = -EBUSY;
+		goto err;
+	}
+	req = nonemb_cmd->va;
+
+	be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_ETH,
+			       OPCODE_ETH_ACPI_WOL_MAGIC_CONFIG, sizeof(*req),
+			       wrb, nonemb_cmd);
+	memcpy(req->magic_mac, mac, ETH_ALEN);
+
+	status = be_mcc_notify_wait(adapter);
+
+err:
+	spin_unlock_bh(&adapter->mcc_lock);
+	return status;
+}
+
+int be_cmd_set_loopback(struct be_adapter *adapter, u8 port_num,
+			u8 loopback_type, u8 enable)
+{
+	struct be_mcc_wrb *wrb;
+	struct be_cmd_req_set_lmode *req;
+	int status;
+
+	spin_lock_bh(&adapter->mcc_lock);
+
+	wrb = wrb_from_mccq(adapter);
+	if (!wrb) {
+		status = -EBUSY;
+		goto err;
+	}
+
+	req = embedded_payload(wrb);
+
+	be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_LOWLEVEL,
+			       OPCODE_LOWLEVEL_SET_LOOPBACK_MODE, sizeof(*req),
+			       wrb, NULL);
+
+	req->src_port = port_num;
+	req->dest_port = port_num;
+	req->loopback_type = loopback_type;
+	req->loopback_state = enable;
+
+	status = be_mcc_notify_wait(adapter);
+err:
+	spin_unlock_bh(&adapter->mcc_lock);
+	return status;
+}
+
+int be_cmd_loopback_test(struct be_adapter *adapter, u32 port_num,
+			 u32 loopback_type, u32 pkt_size, u32 num_pkts,
+			 u64 pattern)
+{
+	struct be_mcc_wrb *wrb;
+	struct be_cmd_req_loopback_test *req;
+	struct be_cmd_resp_loopback_test *resp;
+	int status;
+
+	spin_lock_bh(&adapter->mcc_lock);
+
+	wrb = wrb_from_mccq(adapter);
+	if (!wrb) {
+		status = -EBUSY;
+		goto err;
+	}
+
+	req = embedded_payload(wrb);
+
+	be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_LOWLEVEL,
+			       OPCODE_LOWLEVEL_LOOPBACK_TEST, sizeof(*req), wrb,
+			       NULL);
+
+	req->hdr.timeout = cpu_to_le32(15);
+	req->pattern = cpu_to_le64(pattern);
+	req->src_port = cpu_to_le32(port_num);
+	req->dest_port = cpu_to_le32(port_num);
+	req->pkt_size = cpu_to_le32(pkt_size);
+	req->num_pkts = cpu_to_le32(num_pkts);
+	req->loopback_type = cpu_to_le32(loopback_type);
+
+	be_mcc_notify(adapter);
+
+	spin_unlock_bh(&adapter->mcc_lock);
+
+	wait_for_completion(&adapter->et_cmd_compl);
+	resp = embedded_payload(wrb);
+	status = le32_to_cpu(resp->status);
+
+	return status;
+err:
+	spin_unlock_bh(&adapter->mcc_lock);
+	return status;
+}
+
+int be_cmd_ddr_dma_test(struct be_adapter *adapter, u64 pattern,
+			u32 byte_cnt, struct be_dma_mem *cmd)
+{
+	struct be_mcc_wrb *wrb;
+	struct be_cmd_req_ddrdma_test *req;
+	int status;
+	int i, j = 0;
+
+	spin_lock_bh(&adapter->mcc_lock);
+
+	wrb = wrb_from_mccq(adapter);
+	if (!wrb) {
+		status = -EBUSY;
+		goto err;
+	}
+	req = cmd->va;
+	be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_LOWLEVEL,
+			       OPCODE_LOWLEVEL_HOST_DDR_DMA, cmd->size, wrb,
+			       cmd);
+
+	req->pattern = cpu_to_le64(pattern);
+	req->byte_count = cpu_to_le32(byte_cnt);
+	for (i = 0; i < byte_cnt; i++) {
+		req->snd_buff[i] = (u8)(pattern >> (j*8));
+		j++;
+		if (j > 7)
+			j = 0;
+	}
+
+	status = be_mcc_notify_wait(adapter);
+
+	if (!status) {
+		struct be_cmd_resp_ddrdma_test *resp;
+
+		resp = cmd->va;
+		if ((memcmp(resp->rcv_buff, req->snd_buff, byte_cnt) != 0) ||
+		    resp->snd_err) {
+			status = -1;
+		}
+	}
+
+err:
+	spin_unlock_bh(&adapter->mcc_lock);
+	return status;
+}
+
+int be_cmd_get_seeprom_data(struct be_adapter *adapter,
+			    struct be_dma_mem *nonemb_cmd)
+{
+	struct be_mcc_wrb *wrb;
+	struct be_cmd_req_seeprom_read *req;
+	int status;
+
+	spin_lock_bh(&adapter->mcc_lock);
+
+	wrb = wrb_from_mccq(adapter);
+	if (!wrb) {
+		status = -EBUSY;
+		goto err;
+	}
+	req = nonemb_cmd->va;
+
+	be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON,
+			       OPCODE_COMMON_SEEPROM_READ, sizeof(*req), wrb,
+			       nonemb_cmd);
+
+	status = be_mcc_notify_wait(adapter);
+
+err:
+	spin_unlock_bh(&adapter->mcc_lock);
+	return status;
+}
+
+int be_cmd_get_phy_info(struct be_adapter *adapter)
+{
+	struct be_mcc_wrb *wrb;
+	struct be_cmd_req_get_phy_info *req;
+	struct be_dma_mem cmd;
+	int status;
+
+	if (!be_cmd_allowed(adapter, OPCODE_COMMON_GET_PHY_DETAILS,
+			    CMD_SUBSYSTEM_COMMON))
+		return -EPERM;
+
+	spin_lock_bh(&adapter->mcc_lock);
+
+	wrb = wrb_from_mccq(adapter);
+	if (!wrb) {
+		status = -EBUSY;
+		goto err;
+	}
+	cmd.size = sizeof(struct be_cmd_req_get_phy_info);
+	cmd.va = pci_alloc_consistent(adapter->pdev, cmd.size, &cmd.dma);
+	if (!cmd.va) {
+		dev_err(&adapter->pdev->dev, "Memory alloc failure\n");
+		status = -ENOMEM;
+		goto err;
+	}
+
+	req = cmd.va;
+
+	be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON,
+			       OPCODE_COMMON_GET_PHY_DETAILS, sizeof(*req),
+			       wrb, &cmd);
+
+	status = be_mcc_notify_wait(adapter);
+	if (!status) {
+		struct be_phy_info *resp_phy_info =
+				cmd.va + sizeof(struct be_cmd_req_hdr);
+
+		adapter->phy.phy_type = le16_to_cpu(resp_phy_info->phy_type);
+		adapter->phy.interface_type =
+			le16_to_cpu(resp_phy_info->interface_type);
+		adapter->phy.auto_speeds_supported =
+			le16_to_cpu(resp_phy_info->auto_speeds_supported);
+		adapter->phy.fixed_speeds_supported =
+			le16_to_cpu(resp_phy_info->fixed_speeds_supported);
+		adapter->phy.misc_params =
+			le32_to_cpu(resp_phy_info->misc_params);
+
+		if (BE2_chip(adapter)) {
+			adapter->phy.fixed_speeds_supported =
+				BE_SUPPORTED_SPEED_10GBPS |
+				BE_SUPPORTED_SPEED_1GBPS;
+		}
+	}
+	pci_free_consistent(adapter->pdev, cmd.size, cmd.va, cmd.dma);
+err:
+	spin_unlock_bh(&adapter->mcc_lock);
+	return status;
+}
+
+int be_cmd_set_qos(struct be_adapter *adapter, u32 bps, u32 domain)
+{
+	struct be_mcc_wrb *wrb;
+	struct be_cmd_req_set_qos *req;
+	int status;
+
+	spin_lock_bh(&adapter->mcc_lock);
+
+	wrb = wrb_from_mccq(adapter);
+	if (!wrb) {
+		status = -EBUSY;
+		goto err;
+	}
+
+	req = embedded_payload(wrb);
+
+	be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON,
+			       OPCODE_COMMON_SET_QOS, sizeof(*req), wrb, NULL);
+
+	req->hdr.domain = domain;
+	req->valid_bits = cpu_to_le32(BE_QOS_BITS_NIC);
+	req->max_bps_nic = cpu_to_le32(bps);
+
+	status = be_mcc_notify_wait(adapter);
+
+err:
+	spin_unlock_bh(&adapter->mcc_lock);
+	return status;
+}
+
+int be_cmd_get_cntl_attributes(struct be_adapter *adapter)
+{
+	struct be_mcc_wrb *wrb;
+	struct be_cmd_req_cntl_attribs *req;
+	struct be_cmd_resp_cntl_attribs *resp;
+	int status;
+	int payload_len = max(sizeof(*req), sizeof(*resp));
+	struct mgmt_controller_attrib *attribs;
+	struct be_dma_mem attribs_cmd;
+
+	if (mutex_lock_interruptible(&adapter->mbox_lock))
+		return -1;
+
+	memset(&attribs_cmd, 0, sizeof(struct be_dma_mem));
+	attribs_cmd.size = sizeof(struct be_cmd_resp_cntl_attribs);
+	attribs_cmd.va = pci_alloc_consistent(adapter->pdev, attribs_cmd.size,
+					      &attribs_cmd.dma);
+	if (!attribs_cmd.va) {
+		dev_err(&adapter->pdev->dev, "Memory allocation failure\n");
+		status = -ENOMEM;
+		goto err;
+	}
+
+	wrb = wrb_from_mbox(adapter);
+	if (!wrb) {
+		status = -EBUSY;
+		goto err;
+	}
+	req = attribs_cmd.va;
+
+	be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON,
+			       OPCODE_COMMON_GET_CNTL_ATTRIBUTES, payload_len,
+			       wrb, &attribs_cmd);
+
+	status = be_mbox_notify_wait(adapter);
+	if (!status) {
+		attribs = attribs_cmd.va + sizeof(struct be_cmd_resp_hdr);
+		adapter->hba_port_num = attribs->hba_attribs.phy_port;
+	}
+
+err:
+	mutex_unlock(&adapter->mbox_lock);
+	if (attribs_cmd.va)
+		pci_free_consistent(adapter->pdev, attribs_cmd.size,
+				    attribs_cmd.va, attribs_cmd.dma);
+	return status;
+}
+
+/* Uses mbox */
+int be_cmd_req_native_mode(struct be_adapter *adapter)
+{
+	struct be_mcc_wrb *wrb;
+	struct be_cmd_req_set_func_cap *req;
+	int status;
+
+	if (mutex_lock_interruptible(&adapter->mbox_lock))
+		return -1;
+
+	wrb = wrb_from_mbox(adapter);
+	if (!wrb) {
+		status = -EBUSY;
+		goto err;
+	}
+
+	req = embedded_payload(wrb);
+
+	be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON,
+			       OPCODE_COMMON_SET_DRIVER_FUNCTION_CAP,
+			       sizeof(*req), wrb, NULL);
+
+	req->valid_cap_flags = cpu_to_le32(CAPABILITY_SW_TIMESTAMPS |
+				CAPABILITY_BE3_NATIVE_ERX_API);
+	req->cap_flags = cpu_to_le32(CAPABILITY_BE3_NATIVE_ERX_API);
+
+	status = be_mbox_notify_wait(adapter);
+	if (!status) {
+		struct be_cmd_resp_set_func_cap *resp = embedded_payload(wrb);
+
+		adapter->be3_native = le32_to_cpu(resp->cap_flags) &
+					CAPABILITY_BE3_NATIVE_ERX_API;
+		if (!adapter->be3_native)
+			dev_warn(&adapter->pdev->dev,
+				 "adapter not in advanced mode\n");
+	}
+err:
+	mutex_unlock(&adapter->mbox_lock);
+	return status;
+}
+
+/* Get privilege(s) for a function */
+int be_cmd_get_fn_privileges(struct be_adapter *adapter, u32 *privilege,
+			     u32 domain)
+{
+	struct be_mcc_wrb *wrb;
+	struct be_cmd_req_get_fn_privileges *req;
+	int status;
+
+	spin_lock_bh(&adapter->mcc_lock);
+
+	wrb = wrb_from_mccq(adapter);
+	if (!wrb) {
+		status = -EBUSY;
+		goto err;
+	}
+
+	req = embedded_payload(wrb);
+
+	be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON,
+			       OPCODE_COMMON_GET_FN_PRIVILEGES, sizeof(*req),
+			       wrb, NULL);
+
+	req->hdr.domain = domain;
+
+	status = be_mcc_notify_wait(adapter);
+	if (!status) {
+		struct be_cmd_resp_get_fn_privileges *resp =
+						embedded_payload(wrb);
+
+		*privilege = le32_to_cpu(resp->privilege_mask);
+
+		/* In UMC mode FW does not return right privileges.
+		 * Override with correct privilege equivalent to PF.
+		 */
+		if (BEx_chip(adapter) && be_is_mc(adapter) &&
+		    be_physfn(adapter))
+			*privilege = MAX_PRIVILEGES;
+	}
+
+err:
+	spin_unlock_bh(&adapter->mcc_lock);
+	return status;
+}
+
+/* Set privilege(s) for a function */
+int be_cmd_set_fn_privileges(struct be_adapter *adapter, u32 privileges,
+			     u32 domain)
+{
+	struct be_mcc_wrb *wrb;
+	struct be_cmd_req_set_fn_privileges *req;
+	int status;
+
+	spin_lock_bh(&adapter->mcc_lock);
+
+	wrb = wrb_from_mccq(adapter);
+	if (!wrb) {
+		status = -EBUSY;
+		goto err;
+	}
+
+	req = embedded_payload(wrb);
+	be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON,
+			       OPCODE_COMMON_SET_FN_PRIVILEGES, sizeof(*req),
+			       wrb, NULL);
+	req->hdr.domain = domain;
+	if (lancer_chip(adapter))
+		req->privileges_lancer = cpu_to_le32(privileges);
+	else
+		req->privileges = cpu_to_le32(privileges);
+
+	status = be_mcc_notify_wait(adapter);
+err:
+	spin_unlock_bh(&adapter->mcc_lock);
+	return status;
+}
+
+/* pmac_id_valid: true => pmac_id is supplied and MAC address is requested.
+ * pmac_id_valid: false => pmac_id or MAC address is requested.
+ *		  If pmac_id is returned, pmac_id_valid is returned as true
+ */
+int be_cmd_get_mac_from_list(struct be_adapter *adapter, u8 *mac,
+			     bool *pmac_id_valid, u32 *pmac_id, u32 if_handle,
+			     u8 domain)
+{
+	struct be_mcc_wrb *wrb;
+	struct be_cmd_req_get_mac_list *req;
+	int status;
+	int mac_count;
+	struct be_dma_mem get_mac_list_cmd;
+	int i;
+
+	memset(&get_mac_list_cmd, 0, sizeof(struct be_dma_mem));
+	get_mac_list_cmd.size = sizeof(struct be_cmd_resp_get_mac_list);
+	get_mac_list_cmd.va = pci_alloc_consistent(adapter->pdev,
+						   get_mac_list_cmd.size,
+						   &get_mac_list_cmd.dma);
+
+	if (!get_mac_list_cmd.va) {
+		dev_err(&adapter->pdev->dev,
+			"Memory allocation failure during GET_MAC_LIST\n");
+		return -ENOMEM;
+	}
+
+	spin_lock_bh(&adapter->mcc_lock);
+
+	wrb = wrb_from_mccq(adapter);
+	if (!wrb) {
+		status = -EBUSY;
+		goto out;
+	}
+
+	req = get_mac_list_cmd.va;
+
+	be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON,
+			       OPCODE_COMMON_GET_MAC_LIST,
+			       get_mac_list_cmd.size, wrb, &get_mac_list_cmd);
+	req->hdr.domain = domain;
+	req->mac_type = MAC_ADDRESS_TYPE_NETWORK;
+	if (*pmac_id_valid) {
+		req->mac_id = cpu_to_le32(*pmac_id);
+		req->iface_id = cpu_to_le16(if_handle);
+		req->perm_override = 0;
+	} else {
+		req->perm_override = 1;
+	}
+
+	status = be_mcc_notify_wait(adapter);
+	if (!status) {
+		struct be_cmd_resp_get_mac_list *resp =
+						get_mac_list_cmd.va;
+
+		if (*pmac_id_valid) {
+			memcpy(mac, resp->macid_macaddr.mac_addr_id.macaddr,
+			       ETH_ALEN);
+			goto out;
+		}
+
+		mac_count = resp->true_mac_count + resp->pseudo_mac_count;
+		/* Mac list returned could contain one or more active mac_ids
+		 * or one or more true or pseudo permanant mac addresses.
+		 * If an active mac_id is present, return first active mac_id
+		 * found.
+		 */
+		for (i = 0; i < mac_count; i++) {
+			struct get_list_macaddr *mac_entry;
+			u16 mac_addr_size;
+			u32 mac_id;
+
+			mac_entry = &resp->macaddr_list[i];
+			mac_addr_size = le16_to_cpu(mac_entry->mac_addr_size);
+			/* mac_id is a 32 bit value and mac_addr size
+			 * is 6 bytes
+			 */
+			if (mac_addr_size == sizeof(u32)) {
+				*pmac_id_valid = true;
+				mac_id = mac_entry->mac_addr_id.s_mac_id.mac_id;
+				*pmac_id = le32_to_cpu(mac_id);
+				goto out;
+			}
+		}
+		/* If no active mac_id found, return first mac addr */
+		*pmac_id_valid = false;
+		memcpy(mac, resp->macaddr_list[0].mac_addr_id.macaddr,
+		       ETH_ALEN);
+	}
+
+out:
+	spin_unlock_bh(&adapter->mcc_lock);
+	pci_free_consistent(adapter->pdev, get_mac_list_cmd.size,
+			    get_mac_list_cmd.va, get_mac_list_cmd.dma);
+	return status;
+}
+
+int be_cmd_get_active_mac(struct be_adapter *adapter, u32 curr_pmac_id,
+			  u8 *mac, u32 if_handle, bool active, u32 domain)
+{
+	if (!active)
+		be_cmd_get_mac_from_list(adapter, mac, &active, &curr_pmac_id,
+					 if_handle, domain);
+	if (BEx_chip(adapter))
+		return be_cmd_mac_addr_query(adapter, mac, false,
+					     if_handle, curr_pmac_id);
+	else
+		/* Fetch the MAC address using pmac_id */
+		return be_cmd_get_mac_from_list(adapter, mac, &active,
+						&curr_pmac_id,
+						if_handle, domain);
+}
+
+int be_cmd_get_perm_mac(struct be_adapter *adapter, u8 *mac)
+{
+	int status;
+	bool pmac_valid = false;
+
+	memset(mac, 0, ETH_ALEN);
+
+	if (BEx_chip(adapter)) {
+		if (be_physfn(adapter))
+			status = be_cmd_mac_addr_query(adapter, mac, true, 0,
+						       0);
+		else
+			status = be_cmd_mac_addr_query(adapter, mac, false,
+						       adapter->if_handle, 0);
+	} else {
+		status = be_cmd_get_mac_from_list(adapter, mac, &pmac_valid,
+						  NULL, adapter->if_handle, 0);
+	}
+
+	return status;
+}
+
+/* Uses synchronous MCCQ */
+int be_cmd_set_mac_list(struct be_adapter *adapter, u8 *mac_array,
+			u8 mac_count, u32 domain)
+{
+	struct be_mcc_wrb *wrb;
+	struct be_cmd_req_set_mac_list *req;
+	int status;
+	struct be_dma_mem cmd;
+
+	memset(&cmd, 0, sizeof(struct be_dma_mem));
+	cmd.size = sizeof(struct be_cmd_req_set_mac_list);
+	cmd.va = dma_alloc_coherent(&adapter->pdev->dev, cmd.size,
+				    &cmd.dma, GFP_KERNEL);
+	if (!cmd.va)
+		return -ENOMEM;
+
+	spin_lock_bh(&adapter->mcc_lock);
+
+	wrb = wrb_from_mccq(adapter);
+	if (!wrb) {
+		status = -EBUSY;
+		goto err;
+	}
+
+	req = cmd.va;
+	be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON,
+			       OPCODE_COMMON_SET_MAC_LIST, sizeof(*req),
+			       wrb, &cmd);
+
+	req->hdr.domain = domain;
+	req->mac_count = mac_count;
+	if (mac_count)
+		memcpy(req->mac, mac_array, ETH_ALEN*mac_count);
+
+	status = be_mcc_notify_wait(adapter);
+
+err:
+	dma_free_coherent(&adapter->pdev->dev, cmd.size, cmd.va, cmd.dma);
+	spin_unlock_bh(&adapter->mcc_lock);
+	return status;
+}
+
+/* Wrapper to delete any active MACs and provision the new mac.
+ * Changes to MAC_LIST are allowed iff none of the MAC addresses in the
+ * current list are active.
+ */
+int be_cmd_set_mac(struct be_adapter *adapter, u8 *mac, int if_id, u32 dom)
+{
+	bool active_mac = false;
+	u8 old_mac[ETH_ALEN];
+	u32 pmac_id;
+	int status;
+
+	status = be_cmd_get_mac_from_list(adapter, old_mac, &active_mac,
+					  &pmac_id, if_id, dom);
+
+	if (!status && active_mac)
+		be_cmd_pmac_del(adapter, if_id, pmac_id, dom);
+
+	return be_cmd_set_mac_list(adapter, mac, mac ? 1 : 0, dom);
+}
+
+int be_cmd_set_hsw_config(struct be_adapter *adapter, u16 pvid,
+			  u32 domain, u16 intf_id, u16 hsw_mode)
+{
+	struct be_mcc_wrb *wrb;
+	struct be_cmd_req_set_hsw_config *req;
+	void *ctxt;
+	int status;
+
+	spin_lock_bh(&adapter->mcc_lock);
+
+	wrb = wrb_from_mccq(adapter);
+	if (!wrb) {
+		status = -EBUSY;
+		goto err;
+	}
+
+	req = embedded_payload(wrb);
+	ctxt = &req->context;
+
+	be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON,
+			       OPCODE_COMMON_SET_HSW_CONFIG, sizeof(*req), wrb,
+			       NULL);
+
+	req->hdr.domain = domain;
+	AMAP_SET_BITS(struct amap_set_hsw_context, interface_id, ctxt, intf_id);
+	if (pvid) {
+		AMAP_SET_BITS(struct amap_set_hsw_context, pvid_valid, ctxt, 1);
+		AMAP_SET_BITS(struct amap_set_hsw_context, pvid, ctxt, pvid);
+	}
+	if (!BEx_chip(adapter) && hsw_mode) {
+		AMAP_SET_BITS(struct amap_set_hsw_context, interface_id,
+			      ctxt, adapter->hba_port_num);
+		AMAP_SET_BITS(struct amap_set_hsw_context, pport, ctxt, 1);
+		AMAP_SET_BITS(struct amap_set_hsw_context, port_fwd_type,
+			      ctxt, hsw_mode);
+	}
+
+	be_dws_cpu_to_le(req->context, sizeof(req->context));
+	status = be_mcc_notify_wait(adapter);
+
+err:
+	spin_unlock_bh(&adapter->mcc_lock);
+	return status;
+}
+
+/* Get Hyper switch config */
+int be_cmd_get_hsw_config(struct be_adapter *adapter, u16 *pvid,
+			  u32 domain, u16 intf_id, u8 *mode)
+{
+	struct be_mcc_wrb *wrb;
+	struct be_cmd_req_get_hsw_config *req;
+	void *ctxt;
+	int status;
+	u16 vid;
+
+	spin_lock_bh(&adapter->mcc_lock);
+
+	wrb = wrb_from_mccq(adapter);
+	if (!wrb) {
+		status = -EBUSY;
+		goto err;
+	}
+
+	req = embedded_payload(wrb);
+	ctxt = &req->context;
+
+	be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON,
+			       OPCODE_COMMON_GET_HSW_CONFIG, sizeof(*req), wrb,
+			       NULL);
+
+	req->hdr.domain = domain;
+	AMAP_SET_BITS(struct amap_get_hsw_req_context, interface_id,
+		      ctxt, intf_id);
+	AMAP_SET_BITS(struct amap_get_hsw_req_context, pvid_valid, ctxt, 1);
+
+	if (!BEx_chip(adapter) && mode) {
+		AMAP_SET_BITS(struct amap_get_hsw_req_context, interface_id,
+			      ctxt, adapter->hba_port_num);
+		AMAP_SET_BITS(struct amap_get_hsw_req_context, pport, ctxt, 1);
+	}
+	be_dws_cpu_to_le(req->context, sizeof(req->context));
+
+	status = be_mcc_notify_wait(adapter);
+	if (!status) {
+		struct be_cmd_resp_get_hsw_config *resp =
+						embedded_payload(wrb);
+
+		be_dws_le_to_cpu(&resp->context, sizeof(resp->context));
+		vid = AMAP_GET_BITS(struct amap_get_hsw_resp_context,
+				    pvid, &resp->context);
+		if (pvid)
+			*pvid = le16_to_cpu(vid);
+		if (mode)
+			*mode = AMAP_GET_BITS(struct amap_get_hsw_resp_context,
+					      port_fwd_type, &resp->context);
+	}
+
+err:
+	spin_unlock_bh(&adapter->mcc_lock);
+	return status;
+}
+
+int be_cmd_get_acpi_wol_cap(struct be_adapter *adapter)
+{
+	struct be_mcc_wrb *wrb;
+	struct be_cmd_req_acpi_wol_magic_config_v1 *req;
+	int status = 0;
+	struct be_dma_mem cmd;
+
+	if (!be_cmd_allowed(adapter, OPCODE_ETH_ACPI_WOL_MAGIC_CONFIG,
+			    CMD_SUBSYSTEM_ETH))
+		return -EPERM;
+
+	if (be_is_wol_excluded(adapter))
+		return status;
+
+	if (mutex_lock_interruptible(&adapter->mbox_lock))
+		return -1;
+
+	memset(&cmd, 0, sizeof(struct be_dma_mem));
+	cmd.size = sizeof(struct be_cmd_resp_acpi_wol_magic_config_v1);
+	cmd.va = pci_alloc_consistent(adapter->pdev, cmd.size, &cmd.dma);
+	if (!cmd.va) {
+		dev_err(&adapter->pdev->dev, "Memory allocation failure\n");
+		status = -ENOMEM;
+		goto err;
+	}
+
+	wrb = wrb_from_mbox(adapter);
+	if (!wrb) {
+		status = -EBUSY;
+		goto err;
+	}
+
+	req = cmd.va;
+
+	be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_ETH,
+			       OPCODE_ETH_ACPI_WOL_MAGIC_CONFIG,
+			       sizeof(*req), wrb, &cmd);
+
+	req->hdr.version = 1;
+	req->query_options = BE_GET_WOL_CAP;
+
+	status = be_mbox_notify_wait(adapter);
+	if (!status) {
+		struct be_cmd_resp_acpi_wol_magic_config_v1 *resp;
+
+		resp = (struct be_cmd_resp_acpi_wol_magic_config_v1 *)cmd.va;
+
+		adapter->wol_cap = resp->wol_settings;
+		if (adapter->wol_cap & BE_WOL_CAP)
+			adapter->wol_en = true;
+	}
+err:
+	mutex_unlock(&adapter->mbox_lock);
+	if (cmd.va)
+		pci_free_consistent(adapter->pdev, cmd.size, cmd.va, cmd.dma);
+	return status;
+
+}
+
+int be_cmd_set_fw_log_level(struct be_adapter *adapter, u32 level)
+{
+	struct be_dma_mem extfat_cmd;
+	struct be_fat_conf_params *cfgs;
+	int status;
+	int i, j;
+
+	memset(&extfat_cmd, 0, sizeof(struct be_dma_mem));
+	extfat_cmd.size = sizeof(struct be_cmd_resp_get_ext_fat_caps);
+	extfat_cmd.va = pci_alloc_consistent(adapter->pdev, extfat_cmd.size,
+					     &extfat_cmd.dma);
+	if (!extfat_cmd.va)
+		return -ENOMEM;
+
+	status = be_cmd_get_ext_fat_capabilites(adapter, &extfat_cmd);
+	if (status)
+		goto err;
+
+	cfgs = (struct be_fat_conf_params *)
+			(extfat_cmd.va + sizeof(struct be_cmd_resp_hdr));
+	for (i = 0; i < le32_to_cpu(cfgs->num_modules); i++) {
+		u32 num_modes = le32_to_cpu(cfgs->module[i].num_modes);
+
+		for (j = 0; j < num_modes; j++) {
+			if (cfgs->module[i].trace_lvl[j].mode == MODE_UART)
+				cfgs->module[i].trace_lvl[j].dbg_lvl =
+							cpu_to_le32(level);
+		}
+	}
+
+	status = be_cmd_set_ext_fat_capabilites(adapter, &extfat_cmd, cfgs);
+err:
+	pci_free_consistent(adapter->pdev, extfat_cmd.size, extfat_cmd.va,
+			    extfat_cmd.dma);
+	return status;
+}
+
+int be_cmd_get_fw_log_level(struct be_adapter *adapter)
+{
+	struct be_dma_mem extfat_cmd;
+	struct be_fat_conf_params *cfgs;
+	int status, j;
+	int level = 0;
+
+	memset(&extfat_cmd, 0, sizeof(struct be_dma_mem));
+	extfat_cmd.size = sizeof(struct be_cmd_resp_get_ext_fat_caps);
+	extfat_cmd.va = pci_alloc_consistent(adapter->pdev, extfat_cmd.size,
+					     &extfat_cmd.dma);
+
+	if (!extfat_cmd.va) {
+		dev_err(&adapter->pdev->dev, "%s: Memory allocation failure\n",
+			__func__);
+		goto err;
+	}
+
+	status = be_cmd_get_ext_fat_capabilites(adapter, &extfat_cmd);
+	if (!status) {
+		cfgs = (struct be_fat_conf_params *)(extfat_cmd.va +
+						sizeof(struct be_cmd_resp_hdr));
+
+		for (j = 0; j < le32_to_cpu(cfgs->module[0].num_modes); j++) {
+			if (cfgs->module[0].trace_lvl[j].mode == MODE_UART)
+				level = cfgs->module[0].trace_lvl[j].dbg_lvl;
+		}
+	}
+	pci_free_consistent(adapter->pdev, extfat_cmd.size, extfat_cmd.va,
+			    extfat_cmd.dma);
+err:
+	return level;
+}
+
+int be_cmd_get_ext_fat_capabilites(struct be_adapter *adapter,
+				   struct be_dma_mem *cmd)
+{
+	struct be_mcc_wrb *wrb;
+	struct be_cmd_req_get_ext_fat_caps *req;
+	int status;
+
+	if (mutex_lock_interruptible(&adapter->mbox_lock))
+		return -1;
+
+	wrb = wrb_from_mbox(adapter);
+	if (!wrb) {
+		status = -EBUSY;
+		goto err;
+	}
+
+	req = cmd->va;
+	be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON,
+			       OPCODE_COMMON_GET_EXT_FAT_CAPABILITES,
+			       cmd->size, wrb, cmd);
+	req->parameter_type = cpu_to_le32(1);
+
+	status = be_mbox_notify_wait(adapter);
+err:
+	mutex_unlock(&adapter->mbox_lock);
+	return status;
+}
+
+int be_cmd_set_ext_fat_capabilites(struct be_adapter *adapter,
+				   struct be_dma_mem *cmd,
+				   struct be_fat_conf_params *configs)
+{
+	struct be_mcc_wrb *wrb;
+	struct be_cmd_req_set_ext_fat_caps *req;
+	int status;
+
+	spin_lock_bh(&adapter->mcc_lock);
+
+	wrb = wrb_from_mccq(adapter);
+	if (!wrb) {
+		status = -EBUSY;
+		goto err;
+	}
+
+	req = cmd->va;
+	memcpy(&req->set_params, configs, sizeof(struct be_fat_conf_params));
+	be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON,
+			       OPCODE_COMMON_SET_EXT_FAT_CAPABILITES,
+			       cmd->size, wrb, cmd);
+
+	status = be_mcc_notify_wait(adapter);
+err:
+	spin_unlock_bh(&adapter->mcc_lock);
+	return status;
+}
+
+int be_cmd_query_port_name(struct be_adapter *adapter, u8 *port_name)
+{
+	struct be_mcc_wrb *wrb;
+	struct be_cmd_req_get_port_name *req;
+	int status;
+
+	if (!lancer_chip(adapter)) {
+		*port_name = adapter->hba_port_num + '0';
+		return 0;
+	}
+
+	spin_lock_bh(&adapter->mcc_lock);
+
+	wrb = wrb_from_mccq(adapter);
+	if (!wrb) {
+		status = -EBUSY;
+		goto err;
+	}
+
+	req = embedded_payload(wrb);
+
+	be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON,
+			       OPCODE_COMMON_GET_PORT_NAME, sizeof(*req), wrb,
+			       NULL);
+	req->hdr.version = 1;
+
+	status = be_mcc_notify_wait(adapter);
+	if (!status) {
+		struct be_cmd_resp_get_port_name *resp = embedded_payload(wrb);
+
+		*port_name = resp->port_name[adapter->hba_port_num];
+	} else {
+		*port_name = adapter->hba_port_num + '0';
+	}
+err:
+	spin_unlock_bh(&adapter->mcc_lock);
+	return status;
+}
+
+/* Descriptor type */
+enum {
+	FUNC_DESC = 1,
+	VFT_DESC = 2
+};
+
+static struct be_nic_res_desc *be_get_nic_desc(u8 *buf, u32 desc_count,
+					       int desc_type)
+{
+	struct be_res_desc_hdr *hdr = (struct be_res_desc_hdr *)buf;
+	struct be_nic_res_desc *nic;
+	int i;
+
+	for (i = 0; i < desc_count; i++) {
+		if (hdr->desc_type == NIC_RESOURCE_DESC_TYPE_V0 ||
+		    hdr->desc_type == NIC_RESOURCE_DESC_TYPE_V1) {
+			nic = (struct be_nic_res_desc *)hdr;
+			if (desc_type == FUNC_DESC ||
+			    (desc_type == VFT_DESC &&
+			     nic->flags & (1 << VFT_SHIFT)))
+				return nic;
+		}
+
+		hdr->desc_len = hdr->desc_len ? : RESOURCE_DESC_SIZE_V0;
+		hdr = (void *)hdr + hdr->desc_len;
+	}
+	return NULL;
+}
+
+static struct be_nic_res_desc *be_get_vft_desc(u8 *buf, u32 desc_count)
+{
+	return be_get_nic_desc(buf, desc_count, VFT_DESC);
+}
+
+static struct be_nic_res_desc *be_get_func_nic_desc(u8 *buf, u32 desc_count)
+{
+	return be_get_nic_desc(buf, desc_count, FUNC_DESC);
+}
+
+static struct be_pcie_res_desc *be_get_pcie_desc(u8 devfn, u8 *buf,
+						 u32 desc_count)
+{
+	struct be_res_desc_hdr *hdr = (struct be_res_desc_hdr *)buf;
+	struct be_pcie_res_desc *pcie;
+	int i;
+
+	for (i = 0; i < desc_count; i++) {
+		if ((hdr->desc_type == PCIE_RESOURCE_DESC_TYPE_V0 ||
+		     hdr->desc_type == PCIE_RESOURCE_DESC_TYPE_V1)) {
+			pcie = (struct be_pcie_res_desc	*)hdr;
+			if (pcie->pf_num == devfn)
+				return pcie;
+		}
+
+		hdr->desc_len = hdr->desc_len ? : RESOURCE_DESC_SIZE_V0;
+		hdr = (void *)hdr + hdr->desc_len;
+	}
+	return NULL;
+}
+
+static struct be_port_res_desc *be_get_port_desc(u8 *buf, u32 desc_count)
+{
+	struct be_res_desc_hdr *hdr = (struct be_res_desc_hdr *)buf;
+	int i;
+
+	for (i = 0; i < desc_count; i++) {
+		if (hdr->desc_type == PORT_RESOURCE_DESC_TYPE_V1)
+			return (struct be_port_res_desc *)hdr;
+
+		hdr->desc_len = hdr->desc_len ? : RESOURCE_DESC_SIZE_V0;
+		hdr = (void *)hdr + hdr->desc_len;
+	}
+	return NULL;
+}
+
+static void be_copy_nic_desc(struct be_resources *res,
+			     struct be_nic_res_desc *desc)
+{
+	res->max_uc_mac = le16_to_cpu(desc->unicast_mac_count);
+	res->max_vlans = le16_to_cpu(desc->vlan_count);
+	res->max_mcast_mac = le16_to_cpu(desc->mcast_mac_count);
+	res->max_tx_qs = le16_to_cpu(desc->txq_count);
+	res->max_rss_qs = le16_to_cpu(desc->rssq_count);
+	res->max_rx_qs = le16_to_cpu(desc->rq_count);
+	res->max_evt_qs = le16_to_cpu(desc->eq_count);
+	/* Clear flags that driver is not interested in */
+	res->if_cap_flags = le32_to_cpu(desc->cap_flags) &
+				BE_IF_CAP_FLAGS_WANT;
+	/* Need 1 RXQ as the default RXQ */
+	if (res->max_rss_qs && res->max_rss_qs == res->max_rx_qs)
+		res->max_rss_qs -= 1;
+}
+
+/* Uses Mbox */
+int be_cmd_get_func_config(struct be_adapter *adapter, struct be_resources *res)
+{
+	struct be_mcc_wrb *wrb;
+	struct be_cmd_req_get_func_config *req;
+	int status;
+	struct be_dma_mem cmd;
+
+	if (mutex_lock_interruptible(&adapter->mbox_lock))
+		return -1;
+
+	memset(&cmd, 0, sizeof(struct be_dma_mem));
+	cmd.size = sizeof(struct be_cmd_resp_get_func_config);
+	cmd.va = pci_alloc_consistent(adapter->pdev, cmd.size, &cmd.dma);
+	if (!cmd.va) {
+		dev_err(&adapter->pdev->dev, "Memory alloc failure\n");
+		status = -ENOMEM;
+		goto err;
+	}
+
+	wrb = wrb_from_mbox(adapter);
+	if (!wrb) {
+		status = -EBUSY;
+		goto err;
+	}
+
+	req = cmd.va;
+
+	be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON,
+			       OPCODE_COMMON_GET_FUNC_CONFIG,
+			       cmd.size, wrb, &cmd);
+
+	if (skyhawk_chip(adapter))
+		req->hdr.version = 1;
+
+	status = be_mbox_notify_wait(adapter);
+	if (!status) {
+		struct be_cmd_resp_get_func_config *resp = cmd.va;
+		u32 desc_count = le32_to_cpu(resp->desc_count);
+		struct be_nic_res_desc *desc;
+
+		desc = be_get_func_nic_desc(resp->func_param, desc_count);
+		if (!desc) {
+			status = -EINVAL;
+			goto err;
+		}
+
+		adapter->pf_number = desc->pf_num;
+		be_copy_nic_desc(res, desc);
+	}
+err:
+	mutex_unlock(&adapter->mbox_lock);
+	if (cmd.va)
+		pci_free_consistent(adapter->pdev, cmd.size, cmd.va, cmd.dma);
+	return status;
+}
+
+/* Will use MBOX only if MCCQ has not been created */
+int be_cmd_get_profile_config(struct be_adapter *adapter,
+			      struct be_resources *res, u8 domain)
+{
+	struct be_cmd_resp_get_profile_config *resp;
+	struct be_cmd_req_get_profile_config *req;
+	struct be_nic_res_desc *vf_res;
+	struct be_pcie_res_desc *pcie;
+	struct be_port_res_desc *port;
+	struct be_nic_res_desc *nic;
+	struct be_mcc_wrb wrb = {0};
+	struct be_dma_mem cmd;
+	u32 desc_count;
+	int status;
+
+	memset(&cmd, 0, sizeof(struct be_dma_mem));
+	cmd.size = sizeof(struct be_cmd_resp_get_profile_config);
+	cmd.va = pci_alloc_consistent(adapter->pdev, cmd.size, &cmd.dma);
+	if (!cmd.va)
+		return -ENOMEM;
+
+	req = cmd.va;
+	be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON,
+			       OPCODE_COMMON_GET_PROFILE_CONFIG,
+			       cmd.size, &wrb, &cmd);
+
+	req->hdr.domain = domain;
+	if (!lancer_chip(adapter))
+		req->hdr.version = 1;
+	req->type = ACTIVE_PROFILE_TYPE;
+
+	status = be_cmd_notify_wait(adapter, &wrb);
+	if (status)
+		goto err;
+
+	resp = cmd.va;
+	desc_count = le32_to_cpu(resp->desc_count);
+
+	pcie = be_get_pcie_desc(adapter->pdev->devfn, resp->func_param,
+				desc_count);
+	if (pcie)
+		res->max_vfs = le16_to_cpu(pcie->num_vfs);
+
+	port = be_get_port_desc(resp->func_param, desc_count);
+	if (port)
+		adapter->mc_type = port->mc_type;
+
+	nic = be_get_func_nic_desc(resp->func_param, desc_count);
+	if (nic)
+		be_copy_nic_desc(res, nic);
+
+	vf_res = be_get_vft_desc(resp->func_param, desc_count);
+	if (vf_res)
+		res->vf_if_cap_flags = vf_res->cap_flags;
+err:
+	if (cmd.va)
+		pci_free_consistent(adapter->pdev, cmd.size, cmd.va, cmd.dma);
+	return status;
+}
+
+/* Will use MBOX only if MCCQ has not been created */
+static int be_cmd_set_profile_config(struct be_adapter *adapter, void *desc,
+				     int size, int count, u8 version, u8 domain)
+{
+	struct be_cmd_req_set_profile_config *req;
+	struct be_mcc_wrb wrb = {0};
+	struct be_dma_mem cmd;
+	int status;
+
+	memset(&cmd, 0, sizeof(struct be_dma_mem));
+	cmd.size = sizeof(struct be_cmd_req_set_profile_config);
+	cmd.va = pci_alloc_consistent(adapter->pdev, cmd.size, &cmd.dma);
+	if (!cmd.va)
+		return -ENOMEM;
+
+	req = cmd.va;
+	be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON,
+			       OPCODE_COMMON_SET_PROFILE_CONFIG, cmd.size,
+			       &wrb, &cmd);
+	req->hdr.version = version;
+	req->hdr.domain = domain;
+	req->desc_count = cpu_to_le32(count);
+	memcpy(req->desc, desc, size);
+
+	status = be_cmd_notify_wait(adapter, &wrb);
+
+	if (cmd.va)
+		pci_free_consistent(adapter->pdev, cmd.size, cmd.va, cmd.dma);
+	return status;
+}
+
+/* Mark all fields invalid */
+static void be_reset_nic_desc(struct be_nic_res_desc *nic)
+{
+	memset(nic, 0, sizeof(*nic));
+	nic->unicast_mac_count = 0xFFFF;
+	nic->mcc_count = 0xFFFF;
+	nic->vlan_count = 0xFFFF;
+	nic->mcast_mac_count = 0xFFFF;
+	nic->txq_count = 0xFFFF;
+	nic->rq_count = 0xFFFF;
+	nic->rssq_count = 0xFFFF;
+	nic->lro_count = 0xFFFF;
+	nic->cq_count = 0xFFFF;
+	nic->toe_conn_count = 0xFFFF;
+	nic->eq_count = 0xFFFF;
+	nic->iface_count = 0xFFFF;
+	nic->link_param = 0xFF;
+	nic->channel_id_param = cpu_to_le16(0xF000);
+	nic->acpi_params = 0xFF;
+	nic->wol_param = 0x0F;
+	nic->tunnel_iface_count = 0xFFFF;
+	nic->direct_tenant_iface_count = 0xFFFF;
+	nic->bw_min = 0xFFFFFFFF;
+	nic->bw_max = 0xFFFFFFFF;
+}
+
+/* Mark all fields invalid */
+static void be_reset_pcie_desc(struct be_pcie_res_desc *pcie)
+{
+	memset(pcie, 0, sizeof(*pcie));
+	pcie->sriov_state = 0xFF;
+	pcie->pf_state = 0xFF;
+	pcie->pf_type = 0xFF;
+	pcie->num_vfs = 0xFFFF;
+}
+
+int be_cmd_config_qos(struct be_adapter *adapter, u32 max_rate, u16 link_speed,
+		      u8 domain)
+{
+	struct be_nic_res_desc nic_desc;
+	u32 bw_percent;
+	u16 version = 0;
+
+	if (BE3_chip(adapter))
+		return be_cmd_set_qos(adapter, max_rate / 10, domain);
+
+	be_reset_nic_desc(&nic_desc);
+	nic_desc.pf_num = adapter->pf_number;
+	nic_desc.vf_num = domain;
+	if (lancer_chip(adapter)) {
+		nic_desc.hdr.desc_type = NIC_RESOURCE_DESC_TYPE_V0;
+		nic_desc.hdr.desc_len = RESOURCE_DESC_SIZE_V0;
+		nic_desc.flags = (1 << QUN_SHIFT) | (1 << IMM_SHIFT) |
+					(1 << NOSV_SHIFT);
+		nic_desc.bw_max = cpu_to_le32(max_rate / 10);
+	} else {
+		version = 1;
+		nic_desc.hdr.desc_type = NIC_RESOURCE_DESC_TYPE_V1;
+		nic_desc.hdr.desc_len = RESOURCE_DESC_SIZE_V1;
+		nic_desc.flags = (1 << IMM_SHIFT) | (1 << NOSV_SHIFT);
+		bw_percent = max_rate ? (max_rate * 100) / link_speed : 100;
+		nic_desc.bw_max = cpu_to_le32(bw_percent);
+	}
+
+	return be_cmd_set_profile_config(adapter, &nic_desc,
+					 nic_desc.hdr.desc_len,
+					 1, version, domain);
+}
+
+int be_cmd_set_sriov_config(struct be_adapter *adapter,
+			    struct be_resources res, u16 num_vfs)
+{
+	struct {
+		struct be_pcie_res_desc pcie;
+		struct be_nic_res_desc nic_vft;
+	} __packed desc;
+	u16 vf_q_count;
+
+	if (BEx_chip(adapter) || lancer_chip(adapter))
+		return 0;
+
+	/* PF PCIE descriptor */
+	be_reset_pcie_desc(&desc.pcie);
+	desc.pcie.hdr.desc_type = PCIE_RESOURCE_DESC_TYPE_V1;
+	desc.pcie.hdr.desc_len = RESOURCE_DESC_SIZE_V1;
+	desc.pcie.flags = (1 << IMM_SHIFT) | (1 << NOSV_SHIFT);
+	desc.pcie.pf_num = adapter->pdev->devfn;
+	desc.pcie.sriov_state = num_vfs ? 1 : 0;
+	desc.pcie.num_vfs = cpu_to_le16(num_vfs);
+
+	/* VF NIC Template descriptor */
+	be_reset_nic_desc(&desc.nic_vft);
+	desc.nic_vft.hdr.desc_type = NIC_RESOURCE_DESC_TYPE_V1;
+	desc.nic_vft.hdr.desc_len = RESOURCE_DESC_SIZE_V1;
+	desc.nic_vft.flags = (1 << VFT_SHIFT) | (1 << IMM_SHIFT) |
+				(1 << NOSV_SHIFT);
+	desc.nic_vft.pf_num = adapter->pdev->devfn;
+	desc.nic_vft.vf_num = 0;
+
+	if (num_vfs && res.vf_if_cap_flags & BE_IF_FLAGS_RSS) {
+		/* If number of VFs requested is 8 less than max supported,
+		 * assign 8 queue pairs to the PF and divide the remaining
+		 * resources evenly among the VFs
+		 */
+		if (num_vfs < (be_max_vfs(adapter) - 8))
+			vf_q_count = (res.max_rss_qs - 8) / num_vfs;
+		else
+			vf_q_count = res.max_rss_qs / num_vfs;
+
+		desc.nic_vft.rq_count = cpu_to_le16(vf_q_count);
+		desc.nic_vft.txq_count = cpu_to_le16(vf_q_count);
+		desc.nic_vft.rssq_count = cpu_to_le16(vf_q_count - 1);
+		desc.nic_vft.cq_count = cpu_to_le16(3 * vf_q_count);
+	} else {
+		desc.nic_vft.txq_count = cpu_to_le16(1);
+		desc.nic_vft.rq_count = cpu_to_le16(1);
+		desc.nic_vft.rssq_count = cpu_to_le16(0);
+		/* One CQ for each TX, RX and MCCQ */
+		desc.nic_vft.cq_count = cpu_to_le16(3);
+	}
+
+	return be_cmd_set_profile_config(adapter, &desc,
+					 2 * RESOURCE_DESC_SIZE_V1, 2, 1, 0);
+}
+
+int be_cmd_manage_iface(struct be_adapter *adapter, u32 iface, u8 op)
+{
+	struct be_mcc_wrb *wrb;
+	struct be_cmd_req_manage_iface_filters *req;
+	int status;
+
+	if (iface == 0xFFFFFFFF)
+		return -1;
+
+	spin_lock_bh(&adapter->mcc_lock);
+
+	wrb = wrb_from_mccq(adapter);
+	if (!wrb) {
+		status = -EBUSY;
+		goto err;
+	}
+	req = embedded_payload(wrb);
+
+	be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON,
+			       OPCODE_COMMON_MANAGE_IFACE_FILTERS, sizeof(*req),
+			       wrb, NULL);
+	req->op = op;
+	req->target_iface_id = cpu_to_le32(iface);
+
+	status = be_mcc_notify_wait(adapter);
+err:
+	spin_unlock_bh(&adapter->mcc_lock);
+	return status;
+}
+
+int be_cmd_set_vxlan_port(struct be_adapter *adapter, __be16 port)
+{
+	struct be_port_res_desc port_desc;
+
+	memset(&port_desc, 0, sizeof(port_desc));
+	port_desc.hdr.desc_type = PORT_RESOURCE_DESC_TYPE_V1;
+	port_desc.hdr.desc_len = RESOURCE_DESC_SIZE_V1;
+	port_desc.flags = (1 << IMM_SHIFT) | (1 << NOSV_SHIFT);
+	port_desc.link_num = adapter->hba_port_num;
+	if (port) {
+		port_desc.nv_flags = NV_TYPE_VXLAN | (1 << SOCVID_SHIFT) |
+					(1 << RCVID_SHIFT);
+		port_desc.nv_port = swab16(port);
+	} else {
+		port_desc.nv_flags = NV_TYPE_DISABLED;
+		port_desc.nv_port = 0;
+	}
+
+	return be_cmd_set_profile_config(adapter, &port_desc,
+					 RESOURCE_DESC_SIZE_V1, 1, 1, 0);
+}
+
+int be_cmd_get_if_id(struct be_adapter *adapter, struct be_vf_cfg *vf_cfg,
+		     int vf_num)
+{
+	struct be_mcc_wrb *wrb;
+	struct be_cmd_req_get_iface_list *req;
+	struct be_cmd_resp_get_iface_list *resp;
+	int status;
+
+	spin_lock_bh(&adapter->mcc_lock);
+
+	wrb = wrb_from_mccq(adapter);
+	if (!wrb) {
+		status = -EBUSY;
+		goto err;
+	}
+	req = embedded_payload(wrb);
+
+	be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON,
+			       OPCODE_COMMON_GET_IFACE_LIST, sizeof(*resp),
+			       wrb, NULL);
+	req->hdr.domain = vf_num + 1;
+
+	status = be_mcc_notify_wait(adapter);
+	if (!status) {
+		resp = (struct be_cmd_resp_get_iface_list *)req;
+		vf_cfg->if_handle = le32_to_cpu(resp->if_desc.if_id);
+	}
+
+err:
+	spin_unlock_bh(&adapter->mcc_lock);
+	return status;
+}
+
+static int lancer_wait_idle(struct be_adapter *adapter)
+{
+#define SLIPORT_IDLE_TIMEOUT 30
+	u32 reg_val;
+	int status = 0, i;
+
+	for (i = 0; i < SLIPORT_IDLE_TIMEOUT; i++) {
+		reg_val = ioread32(adapter->db + PHYSDEV_CONTROL_OFFSET);
+		if ((reg_val & PHYSDEV_CONTROL_INP_MASK) == 0)
+			break;
+
+		ssleep(1);
+	}
+
+	if (i == SLIPORT_IDLE_TIMEOUT)
+		status = -1;
+
+	return status;
+}
+
+int lancer_physdev_ctrl(struct be_adapter *adapter, u32 mask)
+{
+	int status = 0;
+
+	status = lancer_wait_idle(adapter);
+	if (status)
+		return status;
+
+	iowrite32(mask, adapter->db + PHYSDEV_CONTROL_OFFSET);
+
+	return status;
+}
+
+/* Routine to check whether dump image is present or not */
+bool dump_present(struct be_adapter *adapter)
+{
+	u32 sliport_status = 0;
+
+	sliport_status = ioread32(adapter->db + SLIPORT_STATUS_OFFSET);
+	return !!(sliport_status & SLIPORT_STATUS_DIP_MASK);
+}
+
+int lancer_initiate_dump(struct be_adapter *adapter)
+{
+	struct device *dev = &adapter->pdev->dev;
+	int status;
+
+	if (dump_present(adapter)) {
+		dev_info(dev, "Previous dump not cleared, not forcing dump\n");
+		return -EEXIST;
+	}
+
+	/* give firmware reset and diagnostic dump */
+	status = lancer_physdev_ctrl(adapter, PHYSDEV_CONTROL_FW_RESET_MASK |
+				     PHYSDEV_CONTROL_DD_MASK);
+	if (status < 0) {
+		dev_err(dev, "FW reset failed\n");
+		return status;
+	}
+
+	status = lancer_wait_idle(adapter);
+	if (status)
+		return status;
+
+	if (!dump_present(adapter)) {
+		dev_err(dev, "FW dump not generated\n");
+		return -EIO;
+	}
+
+	return 0;
+}
+
+int lancer_delete_dump(struct be_adapter *adapter)
+{
+	int status;
+
+	status = lancer_cmd_delete_object(adapter, LANCER_FW_DUMP_FILE);
+	return be_cmd_status(status);
+}
+
+/* Uses sync mcc */
+int be_cmd_enable_vf(struct be_adapter *adapter, u8 domain)
+{
+	struct be_mcc_wrb *wrb;
+	struct be_cmd_enable_disable_vf *req;
+	int status;
+
+	if (BEx_chip(adapter))
+		return 0;
+
+	spin_lock_bh(&adapter->mcc_lock);
+
+	wrb = wrb_from_mccq(adapter);
+	if (!wrb) {
+		status = -EBUSY;
+		goto err;
+	}
+
+	req = embedded_payload(wrb);
+
+	be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON,
+			       OPCODE_COMMON_ENABLE_DISABLE_VF, sizeof(*req),
+			       wrb, NULL);
+
+	req->hdr.domain = domain;
+	req->enable = 1;
+	status = be_mcc_notify_wait(adapter);
+err:
+	spin_unlock_bh(&adapter->mcc_lock);
+	return status;
+}
+
+int be_cmd_intr_set(struct be_adapter *adapter, bool intr_enable)
+{
+	struct be_mcc_wrb *wrb;
+	struct be_cmd_req_intr_set *req;
+	int status;
+
+	if (mutex_lock_interruptible(&adapter->mbox_lock))
+		return -1;
+
+	wrb = wrb_from_mbox(adapter);
+
+	req = embedded_payload(wrb);
+
+	be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON,
+			       OPCODE_COMMON_SET_INTERRUPT_ENABLE, sizeof(*req),
+			       wrb, NULL);
+
+	req->intr_enabled = intr_enable;
+
+	status = be_mbox_notify_wait(adapter);
+
+	mutex_unlock(&adapter->mbox_lock);
+	return status;
+}
+
+/* Uses MBOX */
+int be_cmd_get_active_profile(struct be_adapter *adapter, u16 *profile_id)
+{
+	struct be_cmd_req_get_active_profile *req;
+	struct be_mcc_wrb *wrb;
+	int status;
+
+	if (mutex_lock_interruptible(&adapter->mbox_lock))
+		return -1;
+
+	wrb = wrb_from_mbox(adapter);
+	if (!wrb) {
+		status = -EBUSY;
+		goto err;
+	}
+
+	req = embedded_payload(wrb);
+
+	be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON,
+			       OPCODE_COMMON_GET_ACTIVE_PROFILE, sizeof(*req),
+			       wrb, NULL);
+
+	status = be_mbox_notify_wait(adapter);
+	if (!status) {
+		struct be_cmd_resp_get_active_profile *resp =
+							embedded_payload(wrb);
+
+		*profile_id = le16_to_cpu(resp->active_profile_id);
+	}
+
+err:
+	mutex_unlock(&adapter->mbox_lock);
+	return status;
+}
+
+int be_cmd_set_logical_link_config(struct be_adapter *adapter,
+				   int link_state, u8 domain)
+{
+	struct be_mcc_wrb *wrb;
+	struct be_cmd_req_set_ll_link *req;
+	int status;
+
+	if (BEx_chip(adapter) || lancer_chip(adapter))
+		return 0;
+
+	spin_lock_bh(&adapter->mcc_lock);
+
+	wrb = wrb_from_mccq(adapter);
+	if (!wrb) {
+		status = -EBUSY;
+		goto err;
+	}
+
+	req = embedded_payload(wrb);
+
+	be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON,
+			       OPCODE_COMMON_SET_LOGICAL_LINK_CONFIG,
+			       sizeof(*req), wrb, NULL);
+
+	req->hdr.version = 1;
+	req->hdr.domain = domain;
+
+	if (link_state == IFLA_VF_LINK_STATE_ENABLE)
+		req->link_config |= 1;
+
+	if (link_state == IFLA_VF_LINK_STATE_AUTO)
+		req->link_config |= 1 << PLINK_TRACK_SHIFT;
+
+	status = be_mcc_notify_wait(adapter);
+err:
+	spin_unlock_bh(&adapter->mcc_lock);
+	return status;
+}
+
+int be_roce_mcc_cmd(void *netdev_handle, void *wrb_payload,
+		    int wrb_payload_size, u16 *cmd_status, u16 *ext_status)
+{
+	struct be_adapter *adapter = netdev_priv(netdev_handle);
+	struct be_mcc_wrb *wrb;
+	struct be_cmd_req_hdr *hdr = (struct be_cmd_req_hdr *)wrb_payload;
+	struct be_cmd_req_hdr *req;
+	struct be_cmd_resp_hdr *resp;
+	int status;
+
+	spin_lock_bh(&adapter->mcc_lock);
+
+	wrb = wrb_from_mccq(adapter);
+	if (!wrb) {
+		status = -EBUSY;
+		goto err;
+	}
+	req = embedded_payload(wrb);
+	resp = embedded_payload(wrb);
+
+	be_wrb_cmd_hdr_prepare(req, hdr->subsystem,
+			       hdr->opcode, wrb_payload_size, wrb, NULL);
+	memcpy(req, wrb_payload, wrb_payload_size);
+	be_dws_cpu_to_le(req, wrb_payload_size);
+
+	status = be_mcc_notify_wait(adapter);
+	if (cmd_status)
+		*cmd_status = (status & 0xffff);
+	if (ext_status)
+		*ext_status = 0;
+	memcpy(wrb_payload, resp, sizeof(*resp) + resp->response_length);
+	be_dws_le_to_cpu(wrb_payload, sizeof(*resp) + resp->response_length);
+err:
+	spin_unlock_bh(&adapter->mcc_lock);
+	return status;
+}
+EXPORT_SYMBOL(be_roce_mcc_cmd);
diff --git a/drivers/net/ethernet/emulex/benet/be_cmds.h b/drivers/net/ethernet/emulex/benet/be_cmds.h
new file mode 100644
index 0000000..eb5085d
--- /dev/null
+++ b/drivers/net/ethernet/emulex/benet/be_cmds.h
@@ -0,0 +1,2154 @@
+/*
+ * Copyright (C) 2005 - 2014 Emulex
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.  The full GNU General
+ * Public License is included in this distribution in the file called COPYING.
+ *
+ * Contact Information:
+ * linux-drivers@emulex.com
+ *
+ * Emulex
+ * 3333 Susan Street
+ * Costa Mesa, CA 92626
+ */
+
+/*
+ * The driver sends configuration and managements command requests to the
+ * firmware in the BE. These requests are communicated to the processor
+ * using Work Request Blocks (WRBs) submitted to the MCC-WRB ring or via one
+ * WRB inside a MAILBOX.
+ * The commands are serviced by the ARM processor in the BladeEngine's MPU.
+ */
+
+struct be_sge {
+	u32 pa_lo;
+	u32 pa_hi;
+	u32 len;
+};
+
+#define MCC_WRB_EMBEDDED_MASK	1 	/* bit 0 of dword 0*/
+#define MCC_WRB_SGE_CNT_SHIFT	3	/* bits 3 - 7 of dword 0 */
+#define MCC_WRB_SGE_CNT_MASK	0x1F	/* bits 3 - 7 of dword 0 */
+struct be_mcc_wrb {
+	u32 embedded;		/* dword 0 */
+	u32 payload_length;	/* dword 1 */
+	u32 tag0;		/* dword 2 */
+	u32 tag1;		/* dword 3 */
+	u32 rsvd;		/* dword 4 */
+	union {
+		u8 embedded_payload[236]; /* used by embedded cmds */
+		struct be_sge sgl[19];    /* used by non-embedded cmds */
+	} payload;
+};
+
+#define CQE_FLAGS_VALID_MASK 		(1 << 31)
+#define CQE_FLAGS_ASYNC_MASK 		(1 << 30)
+#define CQE_FLAGS_COMPLETED_MASK 	(1 << 28)
+#define CQE_FLAGS_CONSUMED_MASK 	(1 << 27)
+
+/* Completion Status */
+enum mcc_base_status {
+	MCC_STATUS_SUCCESS = 0,
+	MCC_STATUS_FAILED = 1,
+	MCC_STATUS_ILLEGAL_REQUEST = 2,
+	MCC_STATUS_ILLEGAL_FIELD = 3,
+	MCC_STATUS_INSUFFICIENT_BUFFER = 4,
+	MCC_STATUS_UNAUTHORIZED_REQUEST = 5,
+	MCC_STATUS_NOT_SUPPORTED = 66,
+	MCC_STATUS_FEATURE_NOT_SUPPORTED = 68
+};
+
+/* Additional status */
+enum mcc_addl_status {
+	MCC_ADDL_STATUS_INSUFFICIENT_RESOURCES = 0x16,
+	MCC_ADDL_STATUS_FLASH_IMAGE_CRC_MISMATCH = 0x4d,
+	MCC_ADDL_STATUS_TOO_MANY_INTERFACES = 0x4a
+};
+
+#define CQE_BASE_STATUS_MASK		0xFFFF
+#define CQE_BASE_STATUS_SHIFT		0	/* bits 0 - 15 */
+#define CQE_ADDL_STATUS_MASK		0xFF
+#define CQE_ADDL_STATUS_SHIFT		16	/* bits 16 - 31 */
+
+#define base_status(status)		\
+		((enum mcc_base_status)	\
+			(status > 0 ? (status & CQE_BASE_STATUS_MASK) : 0))
+#define addl_status(status)		\
+		((enum mcc_addl_status)	\
+			(status > 0 ? (status >> CQE_ADDL_STATUS_SHIFT) & \
+					CQE_ADDL_STATUS_MASK : 0))
+
+struct be_mcc_compl {
+	u32 status;		/* dword 0 */
+	u32 tag0;		/* dword 1 */
+	u32 tag1;		/* dword 2 */
+	u32 flags;		/* dword 3 */
+};
+
+/* When the async bit of mcc_compl flags is set, flags
+ * is interpreted as follows:
+ */
+#define ASYNC_EVENT_CODE_SHIFT		8	/* bits 8 - 15 */
+#define ASYNC_EVENT_CODE_MASK		0xFF
+#define ASYNC_EVENT_TYPE_SHIFT		16
+#define ASYNC_EVENT_TYPE_MASK		0xFF
+#define ASYNC_EVENT_CODE_LINK_STATE	0x1
+#define ASYNC_EVENT_CODE_GRP_5		0x5
+#define ASYNC_EVENT_QOS_SPEED		0x1
+#define ASYNC_EVENT_COS_PRIORITY	0x2
+#define ASYNC_EVENT_PVID_STATE		0x3
+#define ASYNC_EVENT_CODE_QNQ		0x6
+#define ASYNC_DEBUG_EVENT_TYPE_QNQ	1
+
+enum {
+	LINK_DOWN	= 0x0,
+	LINK_UP		= 0x1
+};
+#define LINK_STATUS_MASK			0x1
+#define LOGICAL_LINK_STATUS_MASK		0x2
+
+/* When the event code of compl->flags is link-state, the mcc_compl
+ * must be interpreted as follows
+ */
+struct be_async_event_link_state {
+	u8 physical_port;
+	u8 port_link_status;
+	u8 port_duplex;
+	u8 port_speed;
+	u8 port_fault;
+	u8 rsvd0[7];
+	u32 flags;
+} __packed;
+
+/* When the event code of compl->flags is GRP-5 and event_type is QOS_SPEED
+ * the mcc_compl must be interpreted as follows
+ */
+struct be_async_event_grp5_qos_link_speed {
+	u8 physical_port;
+	u8 rsvd[5];
+	u16 qos_link_speed;
+	u32 event_tag;
+	u32 flags;
+} __packed;
+
+/* When the event code of compl->flags is GRP5 and event type is
+ * CoS-Priority, the mcc_compl must be interpreted as follows
+ */
+struct be_async_event_grp5_cos_priority {
+	u8 physical_port;
+	u8 available_priority_bmap;
+	u8 reco_default_priority;
+	u8 valid;
+	u8 rsvd0;
+	u8 event_tag;
+	u32 flags;
+} __packed;
+
+/* When the event code of compl->flags is GRP5 and event type is
+ * PVID state, the mcc_compl must be interpreted as follows
+ */
+struct be_async_event_grp5_pvid_state {
+	u8 enabled;
+	u8 rsvd0;
+	u16 tag;
+	u32 event_tag;
+	u32 rsvd1;
+	u32 flags;
+} __packed;
+
+/* async event indicating outer VLAN tag in QnQ */
+struct be_async_event_qnq {
+	u8 valid;	/* Indicates if outer VLAN is valid */
+	u8 rsvd0;
+	u16 vlan_tag;
+	u32 event_tag;
+	u8 rsvd1[4];
+	u32 flags;
+} __packed;
+
+struct be_mcc_mailbox {
+	struct be_mcc_wrb wrb;
+	struct be_mcc_compl compl;
+};
+
+#define CMD_SUBSYSTEM_COMMON	0x1
+#define CMD_SUBSYSTEM_ETH 	0x3
+#define CMD_SUBSYSTEM_LOWLEVEL  0xb
+
+#define OPCODE_COMMON_NTWK_MAC_QUERY			1
+#define OPCODE_COMMON_NTWK_MAC_SET			2
+#define OPCODE_COMMON_NTWK_MULTICAST_SET		3
+#define OPCODE_COMMON_NTWK_VLAN_CONFIG  		4
+#define OPCODE_COMMON_NTWK_LINK_STATUS_QUERY		5
+#define OPCODE_COMMON_READ_FLASHROM			6
+#define OPCODE_COMMON_WRITE_FLASHROM			7
+#define OPCODE_COMMON_CQ_CREATE				12
+#define OPCODE_COMMON_EQ_CREATE				13
+#define OPCODE_COMMON_MCC_CREATE			21
+#define OPCODE_COMMON_SET_QOS				28
+#define OPCODE_COMMON_MCC_CREATE_EXT			90
+#define OPCODE_COMMON_SEEPROM_READ			30
+#define OPCODE_COMMON_GET_CNTL_ATTRIBUTES               32
+#define OPCODE_COMMON_NTWK_RX_FILTER    		34
+#define OPCODE_COMMON_GET_FW_VERSION			35
+#define OPCODE_COMMON_SET_FLOW_CONTROL			36
+#define OPCODE_COMMON_GET_FLOW_CONTROL			37
+#define OPCODE_COMMON_SET_FRAME_SIZE			39
+#define OPCODE_COMMON_MODIFY_EQ_DELAY			41
+#define OPCODE_COMMON_FIRMWARE_CONFIG			42
+#define OPCODE_COMMON_NTWK_INTERFACE_CREATE 		50
+#define OPCODE_COMMON_NTWK_INTERFACE_DESTROY 		51
+#define OPCODE_COMMON_MCC_DESTROY        		53
+#define OPCODE_COMMON_CQ_DESTROY        		54
+#define OPCODE_COMMON_EQ_DESTROY        		55
+#define OPCODE_COMMON_QUERY_FIRMWARE_CONFIG		58
+#define OPCODE_COMMON_NTWK_PMAC_ADD			59
+#define OPCODE_COMMON_NTWK_PMAC_DEL			60
+#define OPCODE_COMMON_FUNCTION_RESET			61
+#define OPCODE_COMMON_MANAGE_FAT			68
+#define OPCODE_COMMON_ENABLE_DISABLE_BEACON		69
+#define OPCODE_COMMON_GET_BEACON_STATE			70
+#define OPCODE_COMMON_READ_TRANSRECV_DATA		73
+#define OPCODE_COMMON_GET_PORT_NAME			77
+#define OPCODE_COMMON_SET_LOGICAL_LINK_CONFIG		80
+#define OPCODE_COMMON_SET_INTERRUPT_ENABLE		89
+#define OPCODE_COMMON_SET_FN_PRIVILEGES			100
+#define OPCODE_COMMON_GET_PHY_DETAILS			102
+#define OPCODE_COMMON_SET_DRIVER_FUNCTION_CAP		103
+#define OPCODE_COMMON_GET_CNTL_ADDITIONAL_ATTRIBUTES	121
+#define OPCODE_COMMON_GET_EXT_FAT_CAPABILITES		125
+#define OPCODE_COMMON_SET_EXT_FAT_CAPABILITES		126
+#define OPCODE_COMMON_GET_MAC_LIST			147
+#define OPCODE_COMMON_SET_MAC_LIST			148
+#define OPCODE_COMMON_GET_HSW_CONFIG			152
+#define OPCODE_COMMON_GET_FUNC_CONFIG			160
+#define OPCODE_COMMON_GET_PROFILE_CONFIG		164
+#define OPCODE_COMMON_SET_PROFILE_CONFIG		165
+#define OPCODE_COMMON_GET_ACTIVE_PROFILE		167
+#define OPCODE_COMMON_SET_HSW_CONFIG			153
+#define OPCODE_COMMON_GET_FN_PRIVILEGES			170
+#define OPCODE_COMMON_READ_OBJECT			171
+#define OPCODE_COMMON_WRITE_OBJECT			172
+#define OPCODE_COMMON_DELETE_OBJECT			174
+#define OPCODE_COMMON_MANAGE_IFACE_FILTERS		193
+#define OPCODE_COMMON_GET_IFACE_LIST			194
+#define OPCODE_COMMON_ENABLE_DISABLE_VF			196
+
+#define OPCODE_ETH_RSS_CONFIG				1
+#define OPCODE_ETH_ACPI_CONFIG				2
+#define OPCODE_ETH_PROMISCUOUS				3
+#define OPCODE_ETH_GET_STATISTICS			4
+#define OPCODE_ETH_TX_CREATE				7
+#define OPCODE_ETH_RX_CREATE            		8
+#define OPCODE_ETH_TX_DESTROY           		9
+#define OPCODE_ETH_RX_DESTROY           		10
+#define OPCODE_ETH_ACPI_WOL_MAGIC_CONFIG		12
+#define OPCODE_ETH_GET_PPORT_STATS			18
+
+#define OPCODE_LOWLEVEL_HOST_DDR_DMA                    17
+#define OPCODE_LOWLEVEL_LOOPBACK_TEST                   18
+#define OPCODE_LOWLEVEL_SET_LOOPBACK_MODE		19
+
+struct be_cmd_req_hdr {
+	u8 opcode;		/* dword 0 */
+	u8 subsystem;		/* dword 0 */
+	u8 port_number;		/* dword 0 */
+	u8 domain;		/* dword 0 */
+	u32 timeout;		/* dword 1 */
+	u32 request_length;	/* dword 2 */
+	u8 version;		/* dword 3 */
+	u8 rsvd[3];		/* dword 3 */
+};
+
+#define RESP_HDR_INFO_OPCODE_SHIFT	0	/* bits 0 - 7 */
+#define RESP_HDR_INFO_SUBSYS_SHIFT	8 	/* bits 8 - 15 */
+struct be_cmd_resp_hdr {
+	u8 opcode;		/* dword 0 */
+	u8 subsystem;		/* dword 0 */
+	u8 rsvd[2];		/* dword 0 */
+	u8 base_status;		/* dword 1 */
+	u8 addl_status;		/* dword 1 */
+	u8 rsvd1[2];		/* dword 1 */
+	u32 response_length;	/* dword 2 */
+	u32 actual_resp_len;	/* dword 3 */
+};
+
+struct phys_addr {
+	u32 lo;
+	u32 hi;
+};
+
+/**************************
+ * BE Command definitions *
+ **************************/
+
+/* Pseudo amap definition in which each bit of the actual structure is defined
+ * as a byte: used to calculate offset/shift/mask of each field */
+struct amap_eq_context {
+	u8 cidx[13];		/* dword 0*/
+	u8 rsvd0[3];		/* dword 0*/
+	u8 epidx[13];		/* dword 0*/
+	u8 valid;		/* dword 0*/
+	u8 rsvd1;		/* dword 0*/
+	u8 size;		/* dword 0*/
+	u8 pidx[13];		/* dword 1*/
+	u8 rsvd2[3];		/* dword 1*/
+	u8 pd[10];		/* dword 1*/
+	u8 count[3];		/* dword 1*/
+	u8 solevent;		/* dword 1*/
+	u8 stalled;		/* dword 1*/
+	u8 armed;		/* dword 1*/
+	u8 rsvd3[4];		/* dword 2*/
+	u8 func[8];		/* dword 2*/
+	u8 rsvd4;		/* dword 2*/
+	u8 delaymult[10];	/* dword 2*/
+	u8 rsvd5[2];		/* dword 2*/
+	u8 phase[2];		/* dword 2*/
+	u8 nodelay;		/* dword 2*/
+	u8 rsvd6[4];		/* dword 2*/
+	u8 rsvd7[32];		/* dword 3*/
+} __packed;
+
+struct be_cmd_req_eq_create {
+	struct be_cmd_req_hdr hdr;
+	u16 num_pages;		/* sword */
+	u16 rsvd0;		/* sword */
+	u8 context[sizeof(struct amap_eq_context) / 8];
+	struct phys_addr pages[8];
+} __packed;
+
+struct be_cmd_resp_eq_create {
+	struct be_cmd_resp_hdr resp_hdr;
+	u16 eq_id;		/* sword */
+	u16 msix_idx;		/* available only in v2 */
+} __packed;
+
+/******************** Mac query ***************************/
+enum {
+	MAC_ADDRESS_TYPE_STORAGE = 0x0,
+	MAC_ADDRESS_TYPE_NETWORK = 0x1,
+	MAC_ADDRESS_TYPE_PD = 0x2,
+	MAC_ADDRESS_TYPE_MANAGEMENT = 0x3
+};
+
+struct mac_addr {
+	u16 size_of_struct;
+	u8 addr[ETH_ALEN];
+} __packed;
+
+struct be_cmd_req_mac_query {
+	struct be_cmd_req_hdr hdr;
+	u8 type;
+	u8 permanent;
+	u16 if_id;
+	u32 pmac_id;
+} __packed;
+
+struct be_cmd_resp_mac_query {
+	struct be_cmd_resp_hdr hdr;
+	struct mac_addr mac;
+};
+
+/******************** PMac Add ***************************/
+struct be_cmd_req_pmac_add {
+	struct be_cmd_req_hdr hdr;
+	u32 if_id;
+	u8 mac_address[ETH_ALEN];
+	u8 rsvd0[2];
+} __packed;
+
+struct be_cmd_resp_pmac_add {
+	struct be_cmd_resp_hdr hdr;
+	u32 pmac_id;
+};
+
+/******************** PMac Del ***************************/
+struct be_cmd_req_pmac_del {
+	struct be_cmd_req_hdr hdr;
+	u32 if_id;
+	u32 pmac_id;
+};
+
+/******************** Create CQ ***************************/
+/* Pseudo amap definition in which each bit of the actual structure is defined
+ * as a byte: used to calculate offset/shift/mask of each field */
+struct amap_cq_context_be {
+	u8 cidx[11];		/* dword 0*/
+	u8 rsvd0;		/* dword 0*/
+	u8 coalescwm[2];	/* dword 0*/
+	u8 nodelay;		/* dword 0*/
+	u8 epidx[11];		/* dword 0*/
+	u8 rsvd1;		/* dword 0*/
+	u8 count[2];		/* dword 0*/
+	u8 valid;		/* dword 0*/
+	u8 solevent;		/* dword 0*/
+	u8 eventable;		/* dword 0*/
+	u8 pidx[11];		/* dword 1*/
+	u8 rsvd2;		/* dword 1*/
+	u8 pd[10];		/* dword 1*/
+	u8 eqid[8];		/* dword 1*/
+	u8 stalled;		/* dword 1*/
+	u8 armed;		/* dword 1*/
+	u8 rsvd3[4];		/* dword 2*/
+	u8 func[8];		/* dword 2*/
+	u8 rsvd4[20];		/* dword 2*/
+	u8 rsvd5[32];		/* dword 3*/
+} __packed;
+
+struct amap_cq_context_v2 {
+	u8 rsvd0[12];		/* dword 0*/
+	u8 coalescwm[2];	/* dword 0*/
+	u8 nodelay;		/* dword 0*/
+	u8 rsvd1[12];		/* dword 0*/
+	u8 count[2];		/* dword 0*/
+	u8 valid;		/* dword 0*/
+	u8 rsvd2;		/* dword 0*/
+	u8 eventable;		/* dword 0*/
+	u8 eqid[16];		/* dword 1*/
+	u8 rsvd3[15];		/* dword 1*/
+	u8 armed;		/* dword 1*/
+	u8 rsvd4[32];		/* dword 2*/
+	u8 rsvd5[32];		/* dword 3*/
+} __packed;
+
+struct be_cmd_req_cq_create {
+	struct be_cmd_req_hdr hdr;
+	u16 num_pages;
+	u8 page_size;
+	u8 rsvd0;
+	u8 context[sizeof(struct amap_cq_context_be) / 8];
+	struct phys_addr pages[8];
+} __packed;
+
+
+struct be_cmd_resp_cq_create {
+	struct be_cmd_resp_hdr hdr;
+	u16 cq_id;
+	u16 rsvd0;
+} __packed;
+
+struct be_cmd_req_get_fat {
+	struct be_cmd_req_hdr hdr;
+	u32 fat_operation;
+	u32 read_log_offset;
+	u32 read_log_length;
+	u32 data_buffer_size;
+	u32 data_buffer[1];
+} __packed;
+
+struct be_cmd_resp_get_fat {
+	struct be_cmd_resp_hdr hdr;
+	u32 log_size;
+	u32 read_log_length;
+	u32 rsvd[2];
+	u32 data_buffer[1];
+} __packed;
+
+
+/******************** Create MCCQ ***************************/
+/* Pseudo amap definition in which each bit of the actual structure is defined
+ * as a byte: used to calculate offset/shift/mask of each field */
+struct amap_mcc_context_be {
+	u8 con_index[14];
+	u8 rsvd0[2];
+	u8 ring_size[4];
+	u8 fetch_wrb;
+	u8 fetch_r2t;
+	u8 cq_id[10];
+	u8 prod_index[14];
+	u8 fid[8];
+	u8 pdid[9];
+	u8 valid;
+	u8 rsvd1[32];
+	u8 rsvd2[32];
+} __packed;
+
+struct amap_mcc_context_v1 {
+	u8 async_cq_id[16];
+	u8 ring_size[4];
+	u8 rsvd0[12];
+	u8 rsvd1[31];
+	u8 valid;
+	u8 async_cq_valid[1];
+	u8 rsvd2[31];
+	u8 rsvd3[32];
+} __packed;
+
+struct be_cmd_req_mcc_create {
+	struct be_cmd_req_hdr hdr;
+	u16 num_pages;
+	u16 cq_id;
+	u8 context[sizeof(struct amap_mcc_context_be) / 8];
+	struct phys_addr pages[8];
+} __packed;
+
+struct be_cmd_req_mcc_ext_create {
+	struct be_cmd_req_hdr hdr;
+	u16 num_pages;
+	u16 cq_id;
+	u32 async_event_bitmap[1];
+	u8 context[sizeof(struct amap_mcc_context_v1) / 8];
+	struct phys_addr pages[8];
+} __packed;
+
+struct be_cmd_resp_mcc_create {
+	struct be_cmd_resp_hdr hdr;
+	u16 id;
+	u16 rsvd0;
+} __packed;
+
+/******************** Create TxQ ***************************/
+#define BE_ETH_TX_RING_TYPE_STANDARD    	2
+#define BE_ULP1_NUM				1
+
+struct be_cmd_req_eth_tx_create {
+	struct be_cmd_req_hdr hdr;
+	u8 num_pages;
+	u8 ulp_num;
+	u16 type;
+	u16 if_id;
+	u8 queue_size;
+	u8 rsvd0;
+	u32 rsvd1;
+	u16 cq_id;
+	u16 rsvd2;
+	u32 rsvd3[13];
+	struct phys_addr pages[8];
+} __packed;
+
+struct be_cmd_resp_eth_tx_create {
+	struct be_cmd_resp_hdr hdr;
+	u16 cid;
+	u16 rid;
+	u32 db_offset;
+	u32 rsvd0[4];
+} __packed;
+
+/******************** Create RxQ ***************************/
+struct be_cmd_req_eth_rx_create {
+	struct be_cmd_req_hdr hdr;
+	u16 cq_id;
+	u8 frag_size;
+	u8 num_pages;
+	struct phys_addr pages[2];
+	u32 interface_id;
+	u16 max_frame_size;
+	u16 rsvd0;
+	u32 rss_queue;
+} __packed;
+
+struct be_cmd_resp_eth_rx_create {
+	struct be_cmd_resp_hdr hdr;
+	u16 id;
+	u8 rss_id;
+	u8 rsvd0;
+} __packed;
+
+/******************** Q Destroy  ***************************/
+/* Type of Queue to be destroyed */
+enum {
+	QTYPE_EQ = 1,
+	QTYPE_CQ,
+	QTYPE_TXQ,
+	QTYPE_RXQ,
+	QTYPE_MCCQ
+};
+
+struct be_cmd_req_q_destroy {
+	struct be_cmd_req_hdr hdr;
+	u16 id;
+	u16 bypass_flush;	/* valid only for rx q destroy */
+} __packed;
+
+/************ I/f Create (it's actually I/f Config Create)**********/
+
+/* Capability flags for the i/f */
+enum be_if_flags {
+	BE_IF_FLAGS_RSS = 0x4,
+	BE_IF_FLAGS_PROMISCUOUS = 0x8,
+	BE_IF_FLAGS_BROADCAST = 0x10,
+	BE_IF_FLAGS_UNTAGGED = 0x20,
+	BE_IF_FLAGS_ULP = 0x40,
+	BE_IF_FLAGS_VLAN_PROMISCUOUS = 0x80,
+	BE_IF_FLAGS_VLAN = 0x100,
+	BE_IF_FLAGS_MCAST_PROMISCUOUS = 0x200,
+	BE_IF_FLAGS_PASS_L2_ERRORS = 0x400,
+	BE_IF_FLAGS_PASS_L3L4_ERRORS = 0x800,
+	BE_IF_FLAGS_MULTICAST = 0x1000
+};
+
+#define BE_IF_CAP_FLAGS_WANT (BE_IF_FLAGS_RSS | BE_IF_FLAGS_PROMISCUOUS |\
+			 BE_IF_FLAGS_BROADCAST | BE_IF_FLAGS_VLAN_PROMISCUOUS |\
+			 BE_IF_FLAGS_VLAN | BE_IF_FLAGS_MCAST_PROMISCUOUS |\
+			 BE_IF_FLAGS_PASS_L3L4_ERRORS | BE_IF_FLAGS_MULTICAST |\
+			 BE_IF_FLAGS_UNTAGGED)
+
+/* An RX interface is an object with one or more MAC addresses and
+ * filtering capabilities. */
+struct be_cmd_req_if_create {
+	struct be_cmd_req_hdr hdr;
+	u32 version;		/* ignore currently */
+	u32 capability_flags;
+	u32 enable_flags;
+	u8 mac_addr[ETH_ALEN];
+	u8 rsvd0;
+	u8 pmac_invalid; /* if set, don't attach the mac addr to the i/f */
+	u32 vlan_tag;	 /* not used currently */
+} __packed;
+
+struct be_cmd_resp_if_create {
+	struct be_cmd_resp_hdr hdr;
+	u32 interface_id;
+	u32 pmac_id;
+};
+
+/****** I/f Destroy(it's actually I/f Config Destroy )**********/
+struct be_cmd_req_if_destroy {
+	struct be_cmd_req_hdr hdr;
+	u32 interface_id;
+};
+
+/*************** HW Stats Get **********************************/
+struct be_port_rxf_stats_v0 {
+	u32 rx_bytes_lsd;	/* dword 0*/
+	u32 rx_bytes_msd;	/* dword 1*/
+	u32 rx_total_frames;	/* dword 2*/
+	u32 rx_unicast_frames;	/* dword 3*/
+	u32 rx_multicast_frames;	/* dword 4*/
+	u32 rx_broadcast_frames;	/* dword 5*/
+	u32 rx_crc_errors;	/* dword 6*/
+	u32 rx_alignment_symbol_errors;	/* dword 7*/
+	u32 rx_pause_frames;	/* dword 8*/
+	u32 rx_control_frames;	/* dword 9*/
+	u32 rx_in_range_errors;	/* dword 10*/
+	u32 rx_out_range_errors;	/* dword 11*/
+	u32 rx_frame_too_long;	/* dword 12*/
+	u32 rx_address_filtered;	/* dword 13*/
+	u32 rx_vlan_filtered;	/* dword 14*/
+	u32 rx_dropped_too_small;	/* dword 15*/
+	u32 rx_dropped_too_short;	/* dword 16*/
+	u32 rx_dropped_header_too_small;	/* dword 17*/
+	u32 rx_dropped_tcp_length;	/* dword 18*/
+	u32 rx_dropped_runt;	/* dword 19*/
+	u32 rx_64_byte_packets;	/* dword 20*/
+	u32 rx_65_127_byte_packets;	/* dword 21*/
+	u32 rx_128_256_byte_packets;	/* dword 22*/
+	u32 rx_256_511_byte_packets;	/* dword 23*/
+	u32 rx_512_1023_byte_packets;	/* dword 24*/
+	u32 rx_1024_1518_byte_packets;	/* dword 25*/
+	u32 rx_1519_2047_byte_packets;	/* dword 26*/
+	u32 rx_2048_4095_byte_packets;	/* dword 27*/
+	u32 rx_4096_8191_byte_packets;	/* dword 28*/
+	u32 rx_8192_9216_byte_packets;	/* dword 29*/
+	u32 rx_ip_checksum_errs;	/* dword 30*/
+	u32 rx_tcp_checksum_errs;	/* dword 31*/
+	u32 rx_udp_checksum_errs;	/* dword 32*/
+	u32 rx_non_rss_packets;	/* dword 33*/
+	u32 rx_ipv4_packets;	/* dword 34*/
+	u32 rx_ipv6_packets;	/* dword 35*/
+	u32 rx_ipv4_bytes_lsd;	/* dword 36*/
+	u32 rx_ipv4_bytes_msd;	/* dword 37*/
+	u32 rx_ipv6_bytes_lsd;	/* dword 38*/
+	u32 rx_ipv6_bytes_msd;	/* dword 39*/
+	u32 rx_chute1_packets;	/* dword 40*/
+	u32 rx_chute2_packets;	/* dword 41*/
+	u32 rx_chute3_packets;	/* dword 42*/
+	u32 rx_management_packets;	/* dword 43*/
+	u32 rx_switched_unicast_packets;	/* dword 44*/
+	u32 rx_switched_multicast_packets;	/* dword 45*/
+	u32 rx_switched_broadcast_packets;	/* dword 46*/
+	u32 tx_bytes_lsd;	/* dword 47*/
+	u32 tx_bytes_msd;	/* dword 48*/
+	u32 tx_unicastframes;	/* dword 49*/
+	u32 tx_multicastframes;	/* dword 50*/
+	u32 tx_broadcastframes;	/* dword 51*/
+	u32 tx_pauseframes;	/* dword 52*/
+	u32 tx_controlframes;	/* dword 53*/
+	u32 tx_64_byte_packets;	/* dword 54*/
+	u32 tx_65_127_byte_packets;	/* dword 55*/
+	u32 tx_128_256_byte_packets;	/* dword 56*/
+	u32 tx_256_511_byte_packets;	/* dword 57*/
+	u32 tx_512_1023_byte_packets;	/* dword 58*/
+	u32 tx_1024_1518_byte_packets;	/* dword 59*/
+	u32 tx_1519_2047_byte_packets;	/* dword 60*/
+	u32 tx_2048_4095_byte_packets;	/* dword 61*/
+	u32 tx_4096_8191_byte_packets;	/* dword 62*/
+	u32 tx_8192_9216_byte_packets;	/* dword 63*/
+	u32 rx_fifo_overflow;	/* dword 64*/
+	u32 rx_input_fifo_overflow;	/* dword 65*/
+};
+
+struct be_rxf_stats_v0 {
+	struct be_port_rxf_stats_v0 port[2];
+	u32 rx_drops_no_pbuf;	/* dword 132*/
+	u32 rx_drops_no_txpb;	/* dword 133*/
+	u32 rx_drops_no_erx_descr;	/* dword 134*/
+	u32 rx_drops_no_tpre_descr;	/* dword 135*/
+	u32 management_rx_port_packets;	/* dword 136*/
+	u32 management_rx_port_bytes;	/* dword 137*/
+	u32 management_rx_port_pause_frames;	/* dword 138*/
+	u32 management_rx_port_errors;	/* dword 139*/
+	u32 management_tx_port_packets;	/* dword 140*/
+	u32 management_tx_port_bytes;	/* dword 141*/
+	u32 management_tx_port_pause;	/* dword 142*/
+	u32 management_rx_port_rxfifo_overflow;	/* dword 143*/
+	u32 rx_drops_too_many_frags;	/* dword 144*/
+	u32 rx_drops_invalid_ring;	/* dword 145*/
+	u32 forwarded_packets;	/* dword 146*/
+	u32 rx_drops_mtu;	/* dword 147*/
+	u32 rsvd0[7];
+	u32 port0_jabber_events;
+	u32 port1_jabber_events;
+	u32 rsvd1[6];
+};
+
+struct be_erx_stats_v0 {
+	u32 rx_drops_no_fragments[44];     /* dwordS 0 to 43*/
+	u32 rsvd[4];
+};
+
+struct be_pmem_stats {
+	u32 eth_red_drops;
+	u32 rsvd[5];
+};
+
+struct be_hw_stats_v0 {
+	struct be_rxf_stats_v0 rxf;
+	u32 rsvd[48];
+	struct be_erx_stats_v0 erx;
+	struct be_pmem_stats pmem;
+};
+
+struct be_cmd_req_get_stats_v0 {
+	struct be_cmd_req_hdr hdr;
+	u8 rsvd[sizeof(struct be_hw_stats_v0)];
+};
+
+struct be_cmd_resp_get_stats_v0 {
+	struct be_cmd_resp_hdr hdr;
+	struct be_hw_stats_v0 hw_stats;
+};
+
+struct lancer_pport_stats {
+	u32 tx_packets_lo;
+	u32 tx_packets_hi;
+	u32 tx_unicast_packets_lo;
+	u32 tx_unicast_packets_hi;
+	u32 tx_multicast_packets_lo;
+	u32 tx_multicast_packets_hi;
+	u32 tx_broadcast_packets_lo;
+	u32 tx_broadcast_packets_hi;
+	u32 tx_bytes_lo;
+	u32 tx_bytes_hi;
+	u32 tx_unicast_bytes_lo;
+	u32 tx_unicast_bytes_hi;
+	u32 tx_multicast_bytes_lo;
+	u32 tx_multicast_bytes_hi;
+	u32 tx_broadcast_bytes_lo;
+	u32 tx_broadcast_bytes_hi;
+	u32 tx_discards_lo;
+	u32 tx_discards_hi;
+	u32 tx_errors_lo;
+	u32 tx_errors_hi;
+	u32 tx_pause_frames_lo;
+	u32 tx_pause_frames_hi;
+	u32 tx_pause_on_frames_lo;
+	u32 tx_pause_on_frames_hi;
+	u32 tx_pause_off_frames_lo;
+	u32 tx_pause_off_frames_hi;
+	u32 tx_internal_mac_errors_lo;
+	u32 tx_internal_mac_errors_hi;
+	u32 tx_control_frames_lo;
+	u32 tx_control_frames_hi;
+	u32 tx_packets_64_bytes_lo;
+	u32 tx_packets_64_bytes_hi;
+	u32 tx_packets_65_to_127_bytes_lo;
+	u32 tx_packets_65_to_127_bytes_hi;
+	u32 tx_packets_128_to_255_bytes_lo;
+	u32 tx_packets_128_to_255_bytes_hi;
+	u32 tx_packets_256_to_511_bytes_lo;
+	u32 tx_packets_256_to_511_bytes_hi;
+	u32 tx_packets_512_to_1023_bytes_lo;
+	u32 tx_packets_512_to_1023_bytes_hi;
+	u32 tx_packets_1024_to_1518_bytes_lo;
+	u32 tx_packets_1024_to_1518_bytes_hi;
+	u32 tx_packets_1519_to_2047_bytes_lo;
+	u32 tx_packets_1519_to_2047_bytes_hi;
+	u32 tx_packets_2048_to_4095_bytes_lo;
+	u32 tx_packets_2048_to_4095_bytes_hi;
+	u32 tx_packets_4096_to_8191_bytes_lo;
+	u32 tx_packets_4096_to_8191_bytes_hi;
+	u32 tx_packets_8192_to_9216_bytes_lo;
+	u32 tx_packets_8192_to_9216_bytes_hi;
+	u32 tx_lso_packets_lo;
+	u32 tx_lso_packets_hi;
+	u32 rx_packets_lo;
+	u32 rx_packets_hi;
+	u32 rx_unicast_packets_lo;
+	u32 rx_unicast_packets_hi;
+	u32 rx_multicast_packets_lo;
+	u32 rx_multicast_packets_hi;
+	u32 rx_broadcast_packets_lo;
+	u32 rx_broadcast_packets_hi;
+	u32 rx_bytes_lo;
+	u32 rx_bytes_hi;
+	u32 rx_unicast_bytes_lo;
+	u32 rx_unicast_bytes_hi;
+	u32 rx_multicast_bytes_lo;
+	u32 rx_multicast_bytes_hi;
+	u32 rx_broadcast_bytes_lo;
+	u32 rx_broadcast_bytes_hi;
+	u32 rx_unknown_protos;
+	u32 rsvd_69; /* Word 69 is reserved */
+	u32 rx_discards_lo;
+	u32 rx_discards_hi;
+	u32 rx_errors_lo;
+	u32 rx_errors_hi;
+	u32 rx_crc_errors_lo;
+	u32 rx_crc_errors_hi;
+	u32 rx_alignment_errors_lo;
+	u32 rx_alignment_errors_hi;
+	u32 rx_symbol_errors_lo;
+	u32 rx_symbol_errors_hi;
+	u32 rx_pause_frames_lo;
+	u32 rx_pause_frames_hi;
+	u32 rx_pause_on_frames_lo;
+	u32 rx_pause_on_frames_hi;
+	u32 rx_pause_off_frames_lo;
+	u32 rx_pause_off_frames_hi;
+	u32 rx_frames_too_long_lo;
+	u32 rx_frames_too_long_hi;
+	u32 rx_internal_mac_errors_lo;
+	u32 rx_internal_mac_errors_hi;
+	u32 rx_undersize_packets;
+	u32 rx_oversize_packets;
+	u32 rx_fragment_packets;
+	u32 rx_jabbers;
+	u32 rx_control_frames_lo;
+	u32 rx_control_frames_hi;
+	u32 rx_control_frames_unknown_opcode_lo;
+	u32 rx_control_frames_unknown_opcode_hi;
+	u32 rx_in_range_errors;
+	u32 rx_out_of_range_errors;
+	u32 rx_address_filtered;
+	u32 rx_vlan_filtered;
+	u32 rx_dropped_too_small;
+	u32 rx_dropped_too_short;
+	u32 rx_dropped_header_too_small;
+	u32 rx_dropped_invalid_tcp_length;
+	u32 rx_dropped_runt;
+	u32 rx_ip_checksum_errors;
+	u32 rx_tcp_checksum_errors;
+	u32 rx_udp_checksum_errors;
+	u32 rx_non_rss_packets;
+	u32 rsvd_111;
+	u32 rx_ipv4_packets_lo;
+	u32 rx_ipv4_packets_hi;
+	u32 rx_ipv6_packets_lo;
+	u32 rx_ipv6_packets_hi;
+	u32 rx_ipv4_bytes_lo;
+	u32 rx_ipv4_bytes_hi;
+	u32 rx_ipv6_bytes_lo;
+	u32 rx_ipv6_bytes_hi;
+	u32 rx_nic_packets_lo;
+	u32 rx_nic_packets_hi;
+	u32 rx_tcp_packets_lo;
+	u32 rx_tcp_packets_hi;
+	u32 rx_iscsi_packets_lo;
+	u32 rx_iscsi_packets_hi;
+	u32 rx_management_packets_lo;
+	u32 rx_management_packets_hi;
+	u32 rx_switched_unicast_packets_lo;
+	u32 rx_switched_unicast_packets_hi;
+	u32 rx_switched_multicast_packets_lo;
+	u32 rx_switched_multicast_packets_hi;
+	u32 rx_switched_broadcast_packets_lo;
+	u32 rx_switched_broadcast_packets_hi;
+	u32 num_forwards_lo;
+	u32 num_forwards_hi;
+	u32 rx_fifo_overflow;
+	u32 rx_input_fifo_overflow;
+	u32 rx_drops_too_many_frags_lo;
+	u32 rx_drops_too_many_frags_hi;
+	u32 rx_drops_invalid_queue;
+	u32 rsvd_141;
+	u32 rx_drops_mtu_lo;
+	u32 rx_drops_mtu_hi;
+	u32 rx_packets_64_bytes_lo;
+	u32 rx_packets_64_bytes_hi;
+	u32 rx_packets_65_to_127_bytes_lo;
+	u32 rx_packets_65_to_127_bytes_hi;
+	u32 rx_packets_128_to_255_bytes_lo;
+	u32 rx_packets_128_to_255_bytes_hi;
+	u32 rx_packets_256_to_511_bytes_lo;
+	u32 rx_packets_256_to_511_bytes_hi;
+	u32 rx_packets_512_to_1023_bytes_lo;
+	u32 rx_packets_512_to_1023_bytes_hi;
+	u32 rx_packets_1024_to_1518_bytes_lo;
+	u32 rx_packets_1024_to_1518_bytes_hi;
+	u32 rx_packets_1519_to_2047_bytes_lo;
+	u32 rx_packets_1519_to_2047_bytes_hi;
+	u32 rx_packets_2048_to_4095_bytes_lo;
+	u32 rx_packets_2048_to_4095_bytes_hi;
+	u32 rx_packets_4096_to_8191_bytes_lo;
+	u32 rx_packets_4096_to_8191_bytes_hi;
+	u32 rx_packets_8192_to_9216_bytes_lo;
+	u32 rx_packets_8192_to_9216_bytes_hi;
+};
+
+struct pport_stats_params {
+	u16 pport_num;
+	u8 rsvd;
+	u8 reset_stats;
+};
+
+struct lancer_cmd_req_pport_stats {
+	struct be_cmd_req_hdr hdr;
+	union {
+		struct pport_stats_params params;
+		u8 rsvd[sizeof(struct lancer_pport_stats)];
+	} cmd_params;
+};
+
+struct lancer_cmd_resp_pport_stats {
+	struct be_cmd_resp_hdr hdr;
+	struct lancer_pport_stats pport_stats;
+};
+
+static inline struct lancer_pport_stats*
+	pport_stats_from_cmd(struct be_adapter *adapter)
+{
+	struct lancer_cmd_resp_pport_stats *cmd = adapter->stats_cmd.va;
+	return &cmd->pport_stats;
+}
+
+struct be_cmd_req_get_cntl_addnl_attribs {
+	struct be_cmd_req_hdr hdr;
+	u8 rsvd[8];
+};
+
+struct be_cmd_resp_get_cntl_addnl_attribs {
+	struct be_cmd_resp_hdr hdr;
+	u16 ipl_file_number;
+	u8 ipl_file_version;
+	u8 rsvd0;
+	u8 on_die_temperature; /* in degrees centigrade*/
+	u8 rsvd1[3];
+};
+
+struct be_cmd_req_vlan_config {
+	struct be_cmd_req_hdr hdr;
+	u8 interface_id;
+	u8 promiscuous;
+	u8 untagged;
+	u8 num_vlan;
+	u16 normal_vlan[64];
+} __packed;
+
+/******************* RX FILTER ******************************/
+#define BE_MAX_MC		64 /* set mcast promisc if > 64 */
+struct macaddr {
+	u8 byte[ETH_ALEN];
+};
+
+struct be_cmd_req_rx_filter {
+	struct be_cmd_req_hdr hdr;
+	u32 global_flags_mask;
+	u32 global_flags;
+	u32 if_flags_mask;
+	u32 if_flags;
+	u32 if_id;
+	u32 mcast_num;
+	struct macaddr mcast_mac[BE_MAX_MC];
+};
+
+/******************** Link Status Query *******************/
+struct be_cmd_req_link_status {
+	struct be_cmd_req_hdr hdr;
+	u32 rsvd;
+};
+
+enum {
+	PHY_LINK_DUPLEX_NONE = 0x0,
+	PHY_LINK_DUPLEX_HALF = 0x1,
+	PHY_LINK_DUPLEX_FULL = 0x2
+};
+
+enum {
+	PHY_LINK_SPEED_ZERO = 0x0, 	/* => No link */
+	PHY_LINK_SPEED_10MBPS = 0x1,
+	PHY_LINK_SPEED_100MBPS = 0x2,
+	PHY_LINK_SPEED_1GBPS = 0x3,
+	PHY_LINK_SPEED_10GBPS = 0x4,
+	PHY_LINK_SPEED_20GBPS = 0x5,
+	PHY_LINK_SPEED_25GBPS = 0x6,
+	PHY_LINK_SPEED_40GBPS = 0x7
+};
+
+struct be_cmd_resp_link_status {
+	struct be_cmd_resp_hdr hdr;
+	u8 physical_port;
+	u8 mac_duplex;
+	u8 mac_speed;
+	u8 mac_fault;
+	u8 mgmt_mac_duplex;
+	u8 mgmt_mac_speed;
+	u16 link_speed;
+	u8 logical_link_status;
+	u8 rsvd1[3];
+} __packed;
+
+/******************** Port Identification ***************************/
+/*    Identifies the type of port attached to NIC     */
+struct be_cmd_req_port_type {
+	struct be_cmd_req_hdr hdr;
+	__le32 page_num;
+	__le32 port;
+};
+
+enum {
+	TR_PAGE_A0 = 0xa0,
+	TR_PAGE_A2 = 0xa2
+};
+
+/* From SFF-8436 QSFP+ spec */
+#define	QSFP_PLUS_CABLE_TYPE_OFFSET	0x83
+#define	QSFP_PLUS_CR4_CABLE		0x8
+#define	QSFP_PLUS_SR4_CABLE		0x4
+#define	QSFP_PLUS_LR4_CABLE		0x2
+
+/* From SFF-8472 spec */
+#define	SFP_PLUS_SFF_8472_COMP		0x5E
+#define	SFP_PLUS_CABLE_TYPE_OFFSET	0x8
+#define	SFP_PLUS_COPPER_CABLE		0x4
+
+#define PAGE_DATA_LEN   256
+struct be_cmd_resp_port_type {
+	struct be_cmd_resp_hdr hdr;
+	u32 page_num;
+	u32 port;
+	u8  page_data[PAGE_DATA_LEN];
+};
+
+/******************** Get FW Version *******************/
+struct be_cmd_req_get_fw_version {
+	struct be_cmd_req_hdr hdr;
+	u8 rsvd0[FW_VER_LEN];
+	u8 rsvd1[FW_VER_LEN];
+} __packed;
+
+struct be_cmd_resp_get_fw_version {
+	struct be_cmd_resp_hdr hdr;
+	u8 firmware_version_string[FW_VER_LEN];
+	u8 fw_on_flash_version_string[FW_VER_LEN];
+} __packed;
+
+/******************** Set Flow Contrl *******************/
+struct be_cmd_req_set_flow_control {
+	struct be_cmd_req_hdr hdr;
+	u16 tx_flow_control;
+	u16 rx_flow_control;
+} __packed;
+
+/******************** Get Flow Contrl *******************/
+struct be_cmd_req_get_flow_control {
+	struct be_cmd_req_hdr hdr;
+	u32 rsvd;
+};
+
+struct be_cmd_resp_get_flow_control {
+	struct be_cmd_resp_hdr hdr;
+	u16 tx_flow_control;
+	u16 rx_flow_control;
+} __packed;
+
+/******************** Modify EQ Delay *******************/
+struct be_set_eqd {
+	u32 eq_id;
+	u32 phase;
+	u32 delay_multiplier;
+};
+
+struct be_cmd_req_modify_eq_delay {
+	struct be_cmd_req_hdr hdr;
+	u32 num_eq;
+	struct be_set_eqd set_eqd[MAX_EVT_QS];
+} __packed;
+
+/******************** Get FW Config *******************/
+/* The HW can come up in either of the following multi-channel modes
+ * based on the skew/IPL.
+ */
+#define RDMA_ENABLED				0x4
+#define QNQ_MODE				0x400
+#define VNIC_MODE				0x20000
+#define UMC_ENABLED				0x1000000
+struct be_cmd_req_query_fw_cfg {
+	struct be_cmd_req_hdr hdr;
+	u32 rsvd[31];
+};
+
+struct be_cmd_resp_query_fw_cfg {
+	struct be_cmd_resp_hdr hdr;
+	u32 be_config_number;
+	u32 asic_revision;
+	u32 phys_port;
+	u32 function_mode;
+	u32 rsvd[26];
+	u32 function_caps;
+};
+
+/******************** RSS Config ****************************************/
+/* RSS type		Input parameters used to compute RX hash
+ * RSS_ENABLE_IPV4	SRC IPv4, DST IPv4
+ * RSS_ENABLE_TCP_IPV4	SRC IPv4, DST IPv4, TCP SRC PORT, TCP DST PORT
+ * RSS_ENABLE_IPV6	SRC IPv6, DST IPv6
+ * RSS_ENABLE_TCP_IPV6	SRC IPv6, DST IPv6, TCP SRC PORT, TCP DST PORT
+ * RSS_ENABLE_UDP_IPV4	SRC IPv4, DST IPv4, UDP SRC PORT, UDP DST PORT
+ * RSS_ENABLE_UDP_IPV6	SRC IPv6, DST IPv6, UDP SRC PORT, UDP DST PORT
+ *
+ * When multiple RSS types are enabled, HW picks the best hash policy
+ * based on the type of the received packet.
+ */
+#define RSS_ENABLE_NONE				0x0
+#define RSS_ENABLE_IPV4				0x1
+#define RSS_ENABLE_TCP_IPV4			0x2
+#define RSS_ENABLE_IPV6				0x4
+#define RSS_ENABLE_TCP_IPV6			0x8
+#define RSS_ENABLE_UDP_IPV4			0x10
+#define RSS_ENABLE_UDP_IPV6			0x20
+
+#define L3_RSS_FLAGS				(RXH_IP_DST | RXH_IP_SRC)
+#define L4_RSS_FLAGS				(RXH_L4_B_0_1 | RXH_L4_B_2_3)
+
+struct be_cmd_req_rss_config {
+	struct be_cmd_req_hdr hdr;
+	u32 if_id;
+	u16 enable_rss;
+	u16 cpu_table_size_log2;
+	u32 hash[10];
+	u8 cpu_table[128];
+	u8 flush;
+	u8 rsvd0[3];
+};
+
+/******************** Port Beacon ***************************/
+
+#define BEACON_STATE_ENABLED		0x1
+#define BEACON_STATE_DISABLED		0x0
+
+struct be_cmd_req_enable_disable_beacon {
+	struct be_cmd_req_hdr hdr;
+	u8  port_num;
+	u8  beacon_state;
+	u8  beacon_duration;
+	u8  status_duration;
+} __packed;
+
+struct be_cmd_req_get_beacon_state {
+	struct be_cmd_req_hdr hdr;
+	u8  port_num;
+	u8  rsvd0;
+	u16 rsvd1;
+} __packed;
+
+struct be_cmd_resp_get_beacon_state {
+	struct be_cmd_resp_hdr resp_hdr;
+	u8 beacon_state;
+	u8 rsvd0[3];
+} __packed;
+
+/****************** Firmware Flash ******************/
+struct flashrom_params {
+	u32 op_code;
+	u32 op_type;
+	u32 data_buf_size;
+	u32 offset;
+};
+
+struct be_cmd_write_flashrom {
+	struct be_cmd_req_hdr hdr;
+	struct flashrom_params params;
+	u8 data_buf[32768];
+	u8 rsvd[4];
+} __packed;
+
+/* cmd to read flash crc */
+struct be_cmd_read_flash_crc {
+	struct be_cmd_req_hdr hdr;
+	struct flashrom_params params;
+	u8 crc[4];
+	u8 rsvd[4];
+} __packed;
+
+/**************** Lancer Firmware Flash ************/
+struct amap_lancer_write_obj_context {
+	u8 write_length[24];
+	u8 reserved1[7];
+	u8 eof;
+} __packed;
+
+struct lancer_cmd_req_write_object {
+	struct be_cmd_req_hdr hdr;
+	u8 context[sizeof(struct amap_lancer_write_obj_context) / 8];
+	u32 write_offset;
+	u8 object_name[104];
+	u32 descriptor_count;
+	u32 buf_len;
+	u32 addr_low;
+	u32 addr_high;
+};
+
+#define LANCER_NO_RESET_NEEDED		0x00
+#define LANCER_FW_RESET_NEEDED		0x02
+struct lancer_cmd_resp_write_object {
+	u8 opcode;
+	u8 subsystem;
+	u8 rsvd1[2];
+	u8 status;
+	u8 additional_status;
+	u8 rsvd2[2];
+	u32 resp_len;
+	u32 actual_resp_len;
+	u32 actual_write_len;
+	u8 change_status;
+	u8 rsvd3[3];
+};
+
+/************************ Lancer Read FW info **************/
+#define LANCER_READ_FILE_CHUNK			(32*1024)
+#define LANCER_READ_FILE_EOF_MASK		0x80000000
+
+#define LANCER_FW_DUMP_FILE			"/dbg/dump.bin"
+#define LANCER_VPD_PF_FILE			"/vpd/ntr_pf.vpd"
+#define LANCER_VPD_VF_FILE			"/vpd/ntr_vf.vpd"
+
+struct lancer_cmd_req_read_object {
+	struct be_cmd_req_hdr hdr;
+	u32 desired_read_len;
+	u32 read_offset;
+	u8 object_name[104];
+	u32 descriptor_count;
+	u32 buf_len;
+	u32 addr_low;
+	u32 addr_high;
+};
+
+struct lancer_cmd_resp_read_object {
+	u8 opcode;
+	u8 subsystem;
+	u8 rsvd1[2];
+	u8 status;
+	u8 additional_status;
+	u8 rsvd2[2];
+	u32 resp_len;
+	u32 actual_resp_len;
+	u32 actual_read_len;
+	u32 eof;
+};
+
+struct lancer_cmd_req_delete_object {
+	struct be_cmd_req_hdr hdr;
+	u32 rsvd1;
+	u32 rsvd2;
+	u8 object_name[104];
+};
+
+/************************ WOL *******************************/
+struct be_cmd_req_acpi_wol_magic_config{
+	struct be_cmd_req_hdr hdr;
+	u32 rsvd0[145];
+	u8 magic_mac[6];
+	u8 rsvd2[2];
+} __packed;
+
+struct be_cmd_req_acpi_wol_magic_config_v1 {
+	struct be_cmd_req_hdr hdr;
+	u8 rsvd0[2];
+	u8 query_options;
+	u8 rsvd1[5];
+	u32 rsvd2[288];
+	u8 magic_mac[6];
+	u8 rsvd3[22];
+} __packed;
+
+struct be_cmd_resp_acpi_wol_magic_config_v1 {
+	struct be_cmd_resp_hdr hdr;
+	u8 rsvd0[2];
+	u8 wol_settings;
+	u8 rsvd1[5];
+	u32 rsvd2[295];
+} __packed;
+
+#define BE_GET_WOL_CAP			2
+
+#define BE_WOL_CAP			0x1
+#define BE_PME_D0_CAP			0x8
+#define BE_PME_D1_CAP			0x10
+#define BE_PME_D2_CAP			0x20
+#define BE_PME_D3HOT_CAP		0x40
+#define BE_PME_D3COLD_CAP		0x80
+
+/********************** LoopBack test *********************/
+struct be_cmd_req_loopback_test {
+	struct be_cmd_req_hdr hdr;
+	u32 loopback_type;
+	u32 num_pkts;
+	u64 pattern;
+	u32 src_port;
+	u32 dest_port;
+	u32 pkt_size;
+};
+
+struct be_cmd_resp_loopback_test {
+	struct be_cmd_resp_hdr resp_hdr;
+	u32    status;
+	u32    num_txfer;
+	u32    num_rx;
+	u32    miscomp_off;
+	u32    ticks_compl;
+};
+
+struct be_cmd_req_set_lmode {
+	struct be_cmd_req_hdr hdr;
+	u8 src_port;
+	u8 dest_port;
+	u8 loopback_type;
+	u8 loopback_state;
+};
+
+/********************** DDR DMA test *********************/
+struct be_cmd_req_ddrdma_test {
+	struct be_cmd_req_hdr hdr;
+	u64 pattern;
+	u32 byte_count;
+	u32 rsvd0;
+	u8  snd_buff[4096];
+	u8  rsvd1[4096];
+};
+
+struct be_cmd_resp_ddrdma_test {
+	struct be_cmd_resp_hdr hdr;
+	u64 pattern;
+	u32 byte_cnt;
+	u32 snd_err;
+	u8  rsvd0[4096];
+	u8  rcv_buff[4096];
+};
+
+/*********************** SEEPROM Read ***********************/
+
+#define BE_READ_SEEPROM_LEN 1024
+struct be_cmd_req_seeprom_read {
+	struct be_cmd_req_hdr hdr;
+	u8 rsvd0[BE_READ_SEEPROM_LEN];
+};
+
+struct be_cmd_resp_seeprom_read {
+	struct be_cmd_req_hdr hdr;
+	u8 seeprom_data[BE_READ_SEEPROM_LEN];
+};
+
+enum {
+	PHY_TYPE_CX4_10GB = 0,
+	PHY_TYPE_XFP_10GB,
+	PHY_TYPE_SFP_1GB,
+	PHY_TYPE_SFP_PLUS_10GB,
+	PHY_TYPE_KR_10GB,
+	PHY_TYPE_KX4_10GB,
+	PHY_TYPE_BASET_10GB,
+	PHY_TYPE_BASET_1GB,
+	PHY_TYPE_BASEX_1GB,
+	PHY_TYPE_SGMII,
+	PHY_TYPE_QSFP,
+	PHY_TYPE_KR4_40GB,
+	PHY_TYPE_KR2_20GB,
+	PHY_TYPE_DISABLED = 255
+};
+
+#define BE_SUPPORTED_SPEED_NONE		0
+#define BE_SUPPORTED_SPEED_10MBPS	1
+#define BE_SUPPORTED_SPEED_100MBPS	2
+#define BE_SUPPORTED_SPEED_1GBPS	4
+#define BE_SUPPORTED_SPEED_10GBPS	8
+#define BE_SUPPORTED_SPEED_20GBPS	0x10
+#define BE_SUPPORTED_SPEED_40GBPS	0x20
+
+#define BE_AN_EN			0x2
+#define BE_PAUSE_SYM_EN			0x80
+
+/* MAC speed valid values */
+#define SPEED_DEFAULT  0x0
+#define SPEED_FORCED_10GB  0x1
+#define SPEED_FORCED_1GB  0x2
+#define SPEED_AUTONEG_10GB  0x3
+#define SPEED_AUTONEG_1GB  0x4
+#define SPEED_AUTONEG_100MB  0x5
+#define SPEED_AUTONEG_10GB_1GB 0x6
+#define SPEED_AUTONEG_10GB_1GB_100MB 0x7
+#define SPEED_AUTONEG_1GB_100MB  0x8
+#define SPEED_AUTONEG_10MB  0x9
+#define SPEED_AUTONEG_1GB_100MB_10MB 0xa
+#define SPEED_AUTONEG_100MB_10MB 0xb
+#define SPEED_FORCED_100MB  0xc
+#define SPEED_FORCED_10MB  0xd
+
+struct be_cmd_req_get_phy_info {
+	struct be_cmd_req_hdr hdr;
+	u8 rsvd0[24];
+};
+
+struct be_phy_info {
+	u16 phy_type;
+	u16 interface_type;
+	u32 misc_params;
+	u16 ext_phy_details;
+	u16 rsvd;
+	u16 auto_speeds_supported;
+	u16 fixed_speeds_supported;
+	u32 future_use[2];
+};
+
+struct be_cmd_resp_get_phy_info {
+	struct be_cmd_req_hdr hdr;
+	struct be_phy_info phy_info;
+};
+
+/*********************** Set QOS ***********************/
+
+#define BE_QOS_BITS_NIC				1
+
+struct be_cmd_req_set_qos {
+	struct be_cmd_req_hdr hdr;
+	u32 valid_bits;
+	u32 max_bps_nic;
+	u32 rsvd[7];
+};
+
+/*********************** Controller Attributes ***********************/
+struct be_cmd_req_cntl_attribs {
+	struct be_cmd_req_hdr hdr;
+};
+
+struct be_cmd_resp_cntl_attribs {
+	struct be_cmd_resp_hdr hdr;
+	struct mgmt_controller_attrib attribs;
+};
+
+/*********************** Set driver function ***********************/
+#define CAPABILITY_SW_TIMESTAMPS	2
+#define CAPABILITY_BE3_NATIVE_ERX_API	4
+
+struct be_cmd_req_set_func_cap {
+	struct be_cmd_req_hdr hdr;
+	u32 valid_cap_flags;
+	u32 cap_flags;
+	u8 rsvd[212];
+};
+
+struct be_cmd_resp_set_func_cap {
+	struct be_cmd_resp_hdr hdr;
+	u32 valid_cap_flags;
+	u32 cap_flags;
+	u8 rsvd[212];
+};
+
+/*********************** Function Privileges ***********************/
+enum {
+	BE_PRIV_DEFAULT = 0x1,
+	BE_PRIV_LNKQUERY = 0x2,
+	BE_PRIV_LNKSTATS = 0x4,
+	BE_PRIV_LNKMGMT = 0x8,
+	BE_PRIV_LNKDIAG = 0x10,
+	BE_PRIV_UTILQUERY = 0x20,
+	BE_PRIV_FILTMGMT = 0x40,
+	BE_PRIV_IFACEMGMT = 0x80,
+	BE_PRIV_VHADM = 0x100,
+	BE_PRIV_DEVCFG = 0x200,
+	BE_PRIV_DEVSEC = 0x400
+};
+#define MAX_PRIVILEGES		(BE_PRIV_VHADM | BE_PRIV_DEVCFG | \
+				 BE_PRIV_DEVSEC)
+#define MIN_PRIVILEGES		BE_PRIV_DEFAULT
+
+struct be_cmd_priv_map {
+	u8 opcode;
+	u8 subsystem;
+	u32 priv_mask;
+};
+
+struct be_cmd_req_get_fn_privileges {
+	struct be_cmd_req_hdr hdr;
+	u32 rsvd;
+};
+
+struct be_cmd_resp_get_fn_privileges {
+	struct be_cmd_resp_hdr hdr;
+	u32 privilege_mask;
+};
+
+struct be_cmd_req_set_fn_privileges {
+	struct be_cmd_req_hdr hdr;
+	u32 privileges;		/* Used by BE3, SH-R */
+	u32 privileges_lancer;	/* Used by Lancer */
+};
+
+/******************** GET/SET_MACLIST  **************************/
+#define BE_MAX_MAC			64
+struct be_cmd_req_get_mac_list {
+	struct be_cmd_req_hdr hdr;
+	u8 mac_type;
+	u8 perm_override;
+	u16 iface_id;
+	u32 mac_id;
+	u32 rsvd[3];
+} __packed;
+
+struct get_list_macaddr {
+	u16 mac_addr_size;
+	union {
+		u8 macaddr[6];
+		struct {
+			u8 rsvd[2];
+			u32 mac_id;
+		} __packed s_mac_id;
+	} __packed mac_addr_id;
+} __packed;
+
+struct be_cmd_resp_get_mac_list {
+	struct be_cmd_resp_hdr hdr;
+	struct get_list_macaddr fd_macaddr; /* Factory default mac */
+	struct get_list_macaddr macid_macaddr; /* soft mac */
+	u8 true_mac_count;
+	u8 pseudo_mac_count;
+	u8 mac_list_size;
+	u8 rsvd;
+	/* perm override mac */
+	struct get_list_macaddr macaddr_list[BE_MAX_MAC];
+} __packed;
+
+struct be_cmd_req_set_mac_list {
+	struct be_cmd_req_hdr hdr;
+	u8 mac_count;
+	u8 rsvd1;
+	u16 rsvd2;
+	struct macaddr mac[BE_MAX_MAC];
+} __packed;
+
+/*********************** HSW Config ***********************/
+#define PORT_FWD_TYPE_VEPA		0x3
+#define PORT_FWD_TYPE_VEB		0x2
+
+struct amap_set_hsw_context {
+	u8 interface_id[16];
+	u8 rsvd0[14];
+	u8 pvid_valid;
+	u8 pport;
+	u8 rsvd1[6];
+	u8 port_fwd_type[3];
+	u8 rsvd2[7];
+	u8 pvid[16];
+	u8 rsvd3[32];
+	u8 rsvd4[32];
+	u8 rsvd5[32];
+} __packed;
+
+struct be_cmd_req_set_hsw_config {
+	struct be_cmd_req_hdr hdr;
+	u8 context[sizeof(struct amap_set_hsw_context) / 8];
+} __packed;
+
+struct amap_get_hsw_req_context {
+	u8 interface_id[16];
+	u8 rsvd0[14];
+	u8 pvid_valid;
+	u8 pport;
+} __packed;
+
+struct amap_get_hsw_resp_context {
+	u8 rsvd0[6];
+	u8 port_fwd_type[3];
+	u8 rsvd1[7];
+	u8 pvid[16];
+	u8 rsvd2[32];
+	u8 rsvd3[32];
+	u8 rsvd4[32];
+} __packed;
+
+struct be_cmd_req_get_hsw_config {
+	struct be_cmd_req_hdr hdr;
+	u8 context[sizeof(struct amap_get_hsw_req_context) / 8];
+} __packed;
+
+struct be_cmd_resp_get_hsw_config {
+	struct be_cmd_resp_hdr hdr;
+	u8 context[sizeof(struct amap_get_hsw_resp_context) / 8];
+	u32 rsvd;
+};
+
+/******************* get port names ***************/
+struct be_cmd_req_get_port_name {
+	struct be_cmd_req_hdr hdr;
+	u32 rsvd0;
+};
+
+struct be_cmd_resp_get_port_name {
+	struct be_cmd_req_hdr hdr;
+	u8 port_name[4];
+};
+
+/*************** HW Stats Get v1 **********************************/
+#define BE_TXP_SW_SZ			48
+struct be_port_rxf_stats_v1 {
+	u32 rsvd0[12];
+	u32 rx_crc_errors;
+	u32 rx_alignment_symbol_errors;
+	u32 rx_pause_frames;
+	u32 rx_priority_pause_frames;
+	u32 rx_control_frames;
+	u32 rx_in_range_errors;
+	u32 rx_out_range_errors;
+	u32 rx_frame_too_long;
+	u32 rx_address_filtered;
+	u32 rx_dropped_too_small;
+	u32 rx_dropped_too_short;
+	u32 rx_dropped_header_too_small;
+	u32 rx_dropped_tcp_length;
+	u32 rx_dropped_runt;
+	u32 rsvd1[10];
+	u32 rx_ip_checksum_errs;
+	u32 rx_tcp_checksum_errs;
+	u32 rx_udp_checksum_errs;
+	u32 rsvd2[7];
+	u32 rx_switched_unicast_packets;
+	u32 rx_switched_multicast_packets;
+	u32 rx_switched_broadcast_packets;
+	u32 rsvd3[3];
+	u32 tx_pauseframes;
+	u32 tx_priority_pauseframes;
+	u32 tx_controlframes;
+	u32 rsvd4[10];
+	u32 rxpp_fifo_overflow_drop;
+	u32 rx_input_fifo_overflow_drop;
+	u32 pmem_fifo_overflow_drop;
+	u32 jabber_events;
+	u32 rsvd5[3];
+};
+
+
+struct be_rxf_stats_v1 {
+	struct be_port_rxf_stats_v1 port[4];
+	u32 rsvd0[2];
+	u32 rx_drops_no_pbuf;
+	u32 rx_drops_no_txpb;
+	u32 rx_drops_no_erx_descr;
+	u32 rx_drops_no_tpre_descr;
+	u32 rsvd1[6];
+	u32 rx_drops_too_many_frags;
+	u32 rx_drops_invalid_ring;
+	u32 forwarded_packets;
+	u32 rx_drops_mtu;
+	u32 rsvd2[14];
+};
+
+struct be_erx_stats_v1 {
+	u32 rx_drops_no_fragments[68];     /* dwordS 0 to 67*/
+	u32 rsvd[4];
+};
+
+struct be_port_rxf_stats_v2 {
+	u32 rsvd0[10];
+	u32 roce_bytes_received_lsd;
+	u32 roce_bytes_received_msd;
+	u32 rsvd1[5];
+	u32 roce_frames_received;
+	u32 rx_crc_errors;
+	u32 rx_alignment_symbol_errors;
+	u32 rx_pause_frames;
+	u32 rx_priority_pause_frames;
+	u32 rx_control_frames;
+	u32 rx_in_range_errors;
+	u32 rx_out_range_errors;
+	u32 rx_frame_too_long;
+	u32 rx_address_filtered;
+	u32 rx_dropped_too_small;
+	u32 rx_dropped_too_short;
+	u32 rx_dropped_header_too_small;
+	u32 rx_dropped_tcp_length;
+	u32 rx_dropped_runt;
+	u32 rsvd2[10];
+	u32 rx_ip_checksum_errs;
+	u32 rx_tcp_checksum_errs;
+	u32 rx_udp_checksum_errs;
+	u32 rsvd3[7];
+	u32 rx_switched_unicast_packets;
+	u32 rx_switched_multicast_packets;
+	u32 rx_switched_broadcast_packets;
+	u32 rsvd4[3];
+	u32 tx_pauseframes;
+	u32 tx_priority_pauseframes;
+	u32 tx_controlframes;
+	u32 rsvd5[10];
+	u32 rxpp_fifo_overflow_drop;
+	u32 rx_input_fifo_overflow_drop;
+	u32 pmem_fifo_overflow_drop;
+	u32 jabber_events;
+	u32 rsvd6[3];
+	u32 rx_drops_payload_size;
+	u32 rx_drops_clipped_header;
+	u32 rx_drops_crc;
+	u32 roce_drops_payload_len;
+	u32 roce_drops_crc;
+	u32 rsvd7[19];
+};
+
+struct be_rxf_stats_v2 {
+	struct be_port_rxf_stats_v2 port[4];
+	u32 rsvd0[2];
+	u32 rx_drops_no_pbuf;
+	u32 rx_drops_no_txpb;
+	u32 rx_drops_no_erx_descr;
+	u32 rx_drops_no_tpre_descr;
+	u32 rsvd1[6];
+	u32 rx_drops_too_many_frags;
+	u32 rx_drops_invalid_ring;
+	u32 forwarded_packets;
+	u32 rx_drops_mtu;
+	u32 rsvd2[35];
+};
+
+struct be_hw_stats_v1 {
+	struct be_rxf_stats_v1 rxf;
+	u32 rsvd0[BE_TXP_SW_SZ];
+	struct be_erx_stats_v1 erx;
+	struct be_pmem_stats pmem;
+	u32 rsvd1[18];
+};
+
+struct be_cmd_req_get_stats_v1 {
+	struct be_cmd_req_hdr hdr;
+	u8 rsvd[sizeof(struct be_hw_stats_v1)];
+};
+
+struct be_cmd_resp_get_stats_v1 {
+	struct be_cmd_resp_hdr hdr;
+	struct be_hw_stats_v1 hw_stats;
+};
+
+struct be_erx_stats_v2 {
+	u32 rx_drops_no_fragments[136];     /* dwordS 0 to 135*/
+	u32 rsvd[3];
+};
+
+struct be_hw_stats_v2 {
+	struct be_rxf_stats_v2 rxf;
+	u32 rsvd0[BE_TXP_SW_SZ];
+	struct be_erx_stats_v2 erx;
+	struct be_pmem_stats pmem;
+	u32 rsvd1[18];
+};
+
+struct be_cmd_req_get_stats_v2 {
+	struct be_cmd_req_hdr hdr;
+	u8 rsvd[sizeof(struct be_hw_stats_v2)];
+};
+
+struct be_cmd_resp_get_stats_v2 {
+	struct be_cmd_resp_hdr hdr;
+	struct be_hw_stats_v2 hw_stats;
+};
+
+/************** get fat capabilites *******************/
+#define MAX_MODULES 27
+#define MAX_MODES 4
+#define MODE_UART 0
+#define FW_LOG_LEVEL_DEFAULT 48
+#define FW_LOG_LEVEL_FATAL 64
+
+struct ext_fat_mode {
+	u8 mode;
+	u8 rsvd0;
+	u16 port_mask;
+	u32 dbg_lvl;
+	u64 fun_mask;
+} __packed;
+
+struct ext_fat_modules {
+	u8 modules_str[32];
+	u32 modules_id;
+	u32 num_modes;
+	struct ext_fat_mode trace_lvl[MAX_MODES];
+} __packed;
+
+struct be_fat_conf_params {
+	u32 max_log_entries;
+	u32 log_entry_size;
+	u8 log_type;
+	u8 max_log_funs;
+	u8 max_log_ports;
+	u8 rsvd0;
+	u32 supp_modes;
+	u32 num_modules;
+	struct ext_fat_modules module[MAX_MODULES];
+} __packed;
+
+struct be_cmd_req_get_ext_fat_caps {
+	struct be_cmd_req_hdr hdr;
+	u32 parameter_type;
+};
+
+struct be_cmd_resp_get_ext_fat_caps {
+	struct be_cmd_resp_hdr hdr;
+	struct be_fat_conf_params get_params;
+};
+
+struct be_cmd_req_set_ext_fat_caps {
+	struct be_cmd_req_hdr hdr;
+	struct be_fat_conf_params set_params;
+};
+
+#define RESOURCE_DESC_SIZE_V0			72
+#define RESOURCE_DESC_SIZE_V1			88
+#define PCIE_RESOURCE_DESC_TYPE_V0		0x40
+#define NIC_RESOURCE_DESC_TYPE_V0		0x41
+#define PCIE_RESOURCE_DESC_TYPE_V1		0x50
+#define NIC_RESOURCE_DESC_TYPE_V1		0x51
+#define PORT_RESOURCE_DESC_TYPE_V1		0x55
+#define MAX_RESOURCE_DESC			264
+
+#define VFT_SHIFT				3	/* VF template */
+#define IMM_SHIFT				6	/* Immediate */
+#define NOSV_SHIFT				7	/* No save */
+
+struct be_res_desc_hdr {
+	u8 desc_type;
+	u8 desc_len;
+} __packed;
+
+struct be_port_res_desc {
+	struct be_res_desc_hdr hdr;
+	u8 rsvd0;
+	u8 flags;
+	u8 link_num;
+	u8 mc_type;
+	u16 rsvd1;
+
+#define NV_TYPE_MASK				0x3	/* bits 0-1 */
+#define NV_TYPE_DISABLED			1
+#define NV_TYPE_VXLAN				3
+#define SOCVID_SHIFT				2	/* Strip outer vlan */
+#define RCVID_SHIFT				4	/* Report vlan */
+	u8 nv_flags;
+	u8 rsvd2;
+	__le16 nv_port;					/* vxlan/gre port */
+	u32 rsvd3[19];
+} __packed;
+
+struct be_pcie_res_desc {
+	struct be_res_desc_hdr hdr;
+	u8 rsvd0;
+	u8 flags;
+	u16 rsvd1;
+	u8 pf_num;
+	u8 rsvd2;
+	u32 rsvd3;
+	u8 sriov_state;
+	u8 pf_state;
+	u8 pf_type;
+	u8 rsvd4;
+	u16 num_vfs;
+	u16 rsvd5;
+	u32 rsvd6[17];
+} __packed;
+
+struct be_nic_res_desc {
+	struct be_res_desc_hdr hdr;
+	u8 rsvd1;
+
+#define QUN_SHIFT				4 /* QoS is in absolute units */
+	u8 flags;
+	u8 vf_num;
+	u8 rsvd2;
+	u8 pf_num;
+	u8 rsvd3;
+	u16 unicast_mac_count;
+	u8 rsvd4[6];
+	u16 mcc_count;
+	u16 vlan_count;
+	u16 mcast_mac_count;
+	u16 txq_count;
+	u16 rq_count;
+	u16 rssq_count;
+	u16 lro_count;
+	u16 cq_count;
+	u16 toe_conn_count;
+	u16 eq_count;
+	u16 vlan_id;
+	u16 iface_count;
+	u32 cap_flags;
+	u8 link_param;
+	u8 rsvd6;
+	u16 channel_id_param;
+	u32 bw_min;
+	u32 bw_max;
+	u8 acpi_params;
+	u8 wol_param;
+	u16 rsvd7;
+	u16 tunnel_iface_count;
+	u16 direct_tenant_iface_count;
+	u32 rsvd8[6];
+} __packed;
+
+/************ Multi-Channel type ***********/
+enum mc_type {
+	MC_NONE = 0x01,
+	UMC = 0x02,
+	FLEX10 = 0x03,
+	vNIC1 = 0x04,
+	nPAR = 0x05,
+	UFP = 0x06,
+	vNIC2 = 0x07
+};
+
+/* Is BE in a multi-channel mode */
+static inline bool be_is_mc(struct be_adapter *adapter)
+{
+	return adapter->mc_type > MC_NONE;
+}
+
+struct be_cmd_req_get_func_config {
+	struct be_cmd_req_hdr hdr;
+};
+
+struct be_cmd_resp_get_func_config {
+	struct be_cmd_resp_hdr hdr;
+	u32 desc_count;
+	u8 func_param[MAX_RESOURCE_DESC * RESOURCE_DESC_SIZE_V1];
+};
+
+#define ACTIVE_PROFILE_TYPE			0x2
+struct be_cmd_req_get_profile_config {
+	struct be_cmd_req_hdr hdr;
+	u8 rsvd;
+	u8 type;
+	u16 rsvd1;
+};
+
+struct be_cmd_resp_get_profile_config {
+	struct be_cmd_resp_hdr hdr;
+	u32 desc_count;
+	u8 func_param[MAX_RESOURCE_DESC * RESOURCE_DESC_SIZE_V1];
+};
+
+struct be_cmd_req_set_profile_config {
+	struct be_cmd_req_hdr hdr;
+	u32 rsvd;
+	u32 desc_count;
+	u8 desc[2 * RESOURCE_DESC_SIZE_V1];
+} __packed;
+
+struct be_cmd_req_get_active_profile {
+	struct be_cmd_req_hdr hdr;
+	u32 rsvd;
+} __packed;
+
+struct be_cmd_resp_get_active_profile {
+	struct be_cmd_resp_hdr hdr;
+	u16 active_profile_id;
+	u16 next_profile_id;
+} __packed;
+
+struct be_cmd_enable_disable_vf {
+	struct be_cmd_req_hdr hdr;
+	u8 enable;
+	u8 rsvd[3];
+};
+
+struct be_cmd_req_intr_set {
+	struct be_cmd_req_hdr hdr;
+	u8 intr_enabled;
+	u8 rsvd[3];
+};
+
+static inline bool check_privilege(struct be_adapter *adapter, u32 flags)
+{
+	return flags & adapter->cmd_privileges ? true : false;
+}
+
+/************** Get IFACE LIST *******************/
+struct be_if_desc {
+	u32 if_id;
+	u32 cap_flags;
+	u32 en_flags;
+};
+
+struct be_cmd_req_get_iface_list {
+	struct be_cmd_req_hdr hdr;
+};
+
+struct be_cmd_resp_get_iface_list {
+	struct be_cmd_req_hdr hdr;
+	u32 if_cnt;
+	struct be_if_desc if_desc;
+};
+
+/*************** Set logical link ********************/
+#define PLINK_TRACK_SHIFT	8
+struct be_cmd_req_set_ll_link {
+	struct be_cmd_req_hdr hdr;
+	u32 link_config; /* Bit 0: UP_DOWN, Bit 9: PLINK */
+};
+
+/************** Manage IFACE Filters *******************/
+#define OP_CONVERT_NORMAL_TO_TUNNEL		0
+#define OP_CONVERT_TUNNEL_TO_NORMAL		1
+
+struct be_cmd_req_manage_iface_filters {
+	struct be_cmd_req_hdr hdr;
+	u8  op;
+	u8  rsvd0;
+	u8  flags;
+	u8  rsvd1;
+	u32 tunnel_iface_id;
+	u32 target_iface_id;
+	u8  mac[6];
+	u16 vlan_tag;
+	u32 tenant_id;
+	u32 filter_id;
+	u32 cap_flags;
+	u32 cap_control_flags;
+} __packed;
+
+int be_pci_fnum_get(struct be_adapter *adapter);
+int be_fw_wait_ready(struct be_adapter *adapter);
+int be_cmd_mac_addr_query(struct be_adapter *adapter, u8 *mac_addr,
+			  bool permanent, u32 if_handle, u32 pmac_id);
+int be_cmd_pmac_add(struct be_adapter *adapter, u8 *mac_addr, u32 if_id,
+		    u32 *pmac_id, u32 domain);
+int be_cmd_pmac_del(struct be_adapter *adapter, u32 if_id, int pmac_id,
+		    u32 domain);
+int be_cmd_if_create(struct be_adapter *adapter, u32 cap_flags, u32 en_flags,
+		     u32 *if_handle, u32 domain);
+int be_cmd_if_destroy(struct be_adapter *adapter, int if_handle, u32 domain);
+int be_cmd_eq_create(struct be_adapter *adapter, struct be_eq_obj *eqo);
+int be_cmd_cq_create(struct be_adapter *adapter, struct be_queue_info *cq,
+		     struct be_queue_info *eq, bool no_delay,
+		     int num_cqe_dma_coalesce);
+int be_cmd_mccq_create(struct be_adapter *adapter, struct be_queue_info *mccq,
+		       struct be_queue_info *cq);
+int be_cmd_txq_create(struct be_adapter *adapter, struct be_tx_obj *txo);
+int be_cmd_rxq_create(struct be_adapter *adapter, struct be_queue_info *rxq,
+		      u16 cq_id, u16 frag_size, u32 if_id, u32 rss, u8 *rss_id);
+int be_cmd_q_destroy(struct be_adapter *adapter, struct be_queue_info *q,
+		     int type);
+int be_cmd_rxq_destroy(struct be_adapter *adapter, struct be_queue_info *q);
+int be_cmd_link_status_query(struct be_adapter *adapter, u16 *link_speed,
+			     u8 *link_status, u32 dom);
+int be_cmd_reset(struct be_adapter *adapter);
+int be_cmd_get_stats(struct be_adapter *adapter, struct be_dma_mem *nonemb_cmd);
+int lancer_cmd_get_pport_stats(struct be_adapter *adapter,
+			       struct be_dma_mem *nonemb_cmd);
+int be_cmd_get_fw_ver(struct be_adapter *adapter);
+int be_cmd_modify_eqd(struct be_adapter *adapter, struct be_set_eqd *, int num);
+int be_cmd_vlan_config(struct be_adapter *adapter, u32 if_id, u16 *vtag_array,
+		       u32 num);
+int be_cmd_rx_filter(struct be_adapter *adapter, u32 flags, u32 status);
+int be_cmd_set_flow_control(struct be_adapter *adapter, u32 tx_fc, u32 rx_fc);
+int be_cmd_get_flow_control(struct be_adapter *adapter, u32 *tx_fc, u32 *rx_fc);
+int be_cmd_query_fw_cfg(struct be_adapter *adapter);
+int be_cmd_reset_function(struct be_adapter *adapter);
+int be_cmd_rss_config(struct be_adapter *adapter, u8 *rsstable,
+		      u32 rss_hash_opts, u16 table_size, const u8 *rss_hkey);
+int be_process_mcc(struct be_adapter *adapter);
+int be_cmd_set_beacon_state(struct be_adapter *adapter, u8 port_num, u8 beacon,
+			    u8 status, u8 state);
+int be_cmd_get_beacon_state(struct be_adapter *adapter, u8 port_num,
+			    u32 *state);
+int be_cmd_read_port_transceiver_data(struct be_adapter *adapter,
+				      u8 page_num, u8 *data);
+int be_cmd_query_cable_type(struct be_adapter *adapter);
+int be_cmd_write_flashrom(struct be_adapter *adapter, struct be_dma_mem *cmd,
+			  u32 flash_oper, u32 flash_opcode, u32 buf_size);
+int lancer_cmd_write_object(struct be_adapter *adapter, struct be_dma_mem *cmd,
+			    u32 data_size, u32 data_offset,
+			    const char *obj_name, u32 *data_written,
+			    u8 *change_status, u8 *addn_status);
+int lancer_cmd_read_object(struct be_adapter *adapter, struct be_dma_mem *cmd,
+			   u32 data_size, u32 data_offset, const char *obj_name,
+			   u32 *data_read, u32 *eof, u8 *addn_status);
+int lancer_cmd_delete_object(struct be_adapter *adapter, const char *obj_name);
+int be_cmd_get_flash_crc(struct be_adapter *adapter, u8 *flashed_crc,
+			  u16 optype, int offset);
+int be_cmd_enable_magic_wol(struct be_adapter *adapter, u8 *mac,
+			    struct be_dma_mem *nonemb_cmd);
+int be_cmd_fw_init(struct be_adapter *adapter);
+int be_cmd_fw_clean(struct be_adapter *adapter);
+void be_async_mcc_enable(struct be_adapter *adapter);
+void be_async_mcc_disable(struct be_adapter *adapter);
+int be_cmd_loopback_test(struct be_adapter *adapter, u32 port_num,
+			 u32 loopback_type, u32 pkt_size, u32 num_pkts,
+			 u64 pattern);
+int be_cmd_ddr_dma_test(struct be_adapter *adapter, u64 pattern, u32 byte_cnt,
+			struct be_dma_mem *cmd);
+int be_cmd_get_seeprom_data(struct be_adapter *adapter,
+			    struct be_dma_mem *nonemb_cmd);
+int be_cmd_set_loopback(struct be_adapter *adapter, u8 port_num,
+			u8 loopback_type, u8 enable);
+int be_cmd_get_phy_info(struct be_adapter *adapter);
+int be_cmd_config_qos(struct be_adapter *adapter, u32 max_rate,
+		      u16 link_speed, u8 domain);
+void be_detect_error(struct be_adapter *adapter);
+int be_cmd_get_die_temperature(struct be_adapter *adapter);
+int be_cmd_get_cntl_attributes(struct be_adapter *adapter);
+int be_cmd_req_native_mode(struct be_adapter *adapter);
+int be_cmd_get_reg_len(struct be_adapter *adapter, u32 *log_size);
+int be_cmd_get_regs(struct be_adapter *adapter, u32 buf_len, void *buf);
+int be_cmd_get_fn_privileges(struct be_adapter *adapter, u32 *privilege,
+			     u32 domain);
+int be_cmd_set_fn_privileges(struct be_adapter *adapter, u32 privileges,
+			     u32 vf_num);
+int be_cmd_get_mac_from_list(struct be_adapter *adapter, u8 *mac,
+			     bool *pmac_id_active, u32 *pmac_id,
+			     u32 if_handle, u8 domain);
+int be_cmd_get_active_mac(struct be_adapter *adapter, u32 pmac_id, u8 *mac,
+			  u32 if_handle, bool active, u32 domain);
+int be_cmd_get_perm_mac(struct be_adapter *adapter, u8 *mac);
+int be_cmd_set_mac_list(struct be_adapter *adapter, u8 *mac_array, u8 mac_count,
+			u32 domain);
+int be_cmd_set_mac(struct be_adapter *adapter, u8 *mac, int if_id, u32 dom);
+int be_cmd_set_hsw_config(struct be_adapter *adapter, u16 pvid, u32 domain,
+			  u16 intf_id, u16 hsw_mode);
+int be_cmd_get_hsw_config(struct be_adapter *adapter, u16 *pvid, u32 domain,
+			  u16 intf_id, u8 *mode);
+int be_cmd_get_acpi_wol_cap(struct be_adapter *adapter);
+int be_cmd_set_fw_log_level(struct be_adapter *adapter, u32 level);
+int be_cmd_get_fw_log_level(struct be_adapter *adapter);
+int be_cmd_get_ext_fat_capabilites(struct be_adapter *adapter,
+				   struct be_dma_mem *cmd);
+int be_cmd_set_ext_fat_capabilites(struct be_adapter *adapter,
+				   struct be_dma_mem *cmd,
+				   struct be_fat_conf_params *cfgs);
+int lancer_physdev_ctrl(struct be_adapter *adapter, u32 mask);
+int lancer_initiate_dump(struct be_adapter *adapter);
+int lancer_delete_dump(struct be_adapter *adapter);
+bool dump_present(struct be_adapter *adapter);
+int lancer_test_and_set_rdy_state(struct be_adapter *adapter);
+int be_cmd_query_port_name(struct be_adapter *adapter, u8 *port_name);
+int be_cmd_get_func_config(struct be_adapter *adapter,
+			   struct be_resources *res);
+int be_cmd_get_profile_config(struct be_adapter *adapter,
+			      struct be_resources *res, u8 domain);
+int be_cmd_get_active_profile(struct be_adapter *adapter, u16 *profile);
+int be_cmd_get_if_id(struct be_adapter *adapter, struct be_vf_cfg *vf_cfg,
+		     int vf_num);
+int be_cmd_enable_vf(struct be_adapter *adapter, u8 domain);
+int be_cmd_intr_set(struct be_adapter *adapter, bool intr_enable);
+int be_cmd_set_logical_link_config(struct be_adapter *adapter,
+					  int link_state, u8 domain);
+int be_cmd_set_vxlan_port(struct be_adapter *adapter, __be16 port);
+int be_cmd_manage_iface(struct be_adapter *adapter, u32 iface, u8 op);
+int be_cmd_set_sriov_config(struct be_adapter *adapter,
+			    struct be_resources res, u16 num_vfs);
diff --git a/drivers/net/ethernet/emulex/benet/be_ethtool.c b/drivers/net/ethernet/emulex/benet/be_ethtool.c
new file mode 100644
index 0000000..e42a791
--- /dev/null
+++ b/drivers/net/ethernet/emulex/benet/be_ethtool.c
@@ -0,0 +1,1314 @@
+/*
+ * Copyright (C) 2005 - 2014 Emulex
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.  The full GNU General
+ * Public License is included in this distribution in the file called COPYING.
+ *
+ * Contact Information:
+ * linux-drivers@emulex.com
+ *
+ * Emulex
+ * 3333 Susan Street
+ * Costa Mesa, CA 92626
+ */
+
+#include "be.h"
+#include "be_cmds.h"
+#include <linux/ethtool.h>
+
+struct be_ethtool_stat {
+	char desc[ETH_GSTRING_LEN];
+	int type;
+	int size;
+	int offset;
+};
+
+enum {DRVSTAT_TX, DRVSTAT_RX, DRVSTAT};
+#define FIELDINFO(_struct, field) FIELD_SIZEOF(_struct, field), \
+					offsetof(_struct, field)
+#define DRVSTAT_TX_INFO(field)	#field, DRVSTAT_TX,\
+					FIELDINFO(struct be_tx_stats, field)
+#define DRVSTAT_RX_INFO(field)	#field, DRVSTAT_RX,\
+					FIELDINFO(struct be_rx_stats, field)
+#define	DRVSTAT_INFO(field)	#field, DRVSTAT,\
+					FIELDINFO(struct be_drv_stats, field)
+
+static const struct be_ethtool_stat et_stats[] = {
+	{DRVSTAT_INFO(rx_crc_errors)},
+	{DRVSTAT_INFO(rx_alignment_symbol_errors)},
+	{DRVSTAT_INFO(rx_pause_frames)},
+	{DRVSTAT_INFO(rx_control_frames)},
+	/* Received packets dropped when the Ethernet length field
+	 * is not equal to the actual Ethernet data length.
+	 */
+	{DRVSTAT_INFO(rx_in_range_errors)},
+	/* Received packets dropped when their length field is >= 1501 bytes
+	 * and <= 1535 bytes.
+	 */
+	{DRVSTAT_INFO(rx_out_range_errors)},
+	/* Received packets dropped when they are longer than 9216 bytes */
+	{DRVSTAT_INFO(rx_frame_too_long)},
+	/* Received packets dropped when they don't pass the unicast or
+	 * multicast address filtering.
+	 */
+	{DRVSTAT_INFO(rx_address_filtered)},
+	/* Received packets dropped when IP packet length field is less than
+	 * the IP header length field.
+	 */
+	{DRVSTAT_INFO(rx_dropped_too_small)},
+	/* Received packets dropped when IP length field is greater than
+	 * the actual packet length.
+	 */
+	{DRVSTAT_INFO(rx_dropped_too_short)},
+	/* Received packets dropped when the IP header length field is less
+	 * than 5.
+	 */
+	{DRVSTAT_INFO(rx_dropped_header_too_small)},
+	/* Received packets dropped when the TCP header length field is less
+	 * than 5 or the TCP header length + IP header length is more
+	 * than IP packet length.
+	 */
+	{DRVSTAT_INFO(rx_dropped_tcp_length)},
+	{DRVSTAT_INFO(rx_dropped_runt)},
+	/* Number of received packets dropped when a fifo for descriptors going
+	 * into the packet demux block overflows. In normal operation, this
+	 * fifo must never overflow.
+	 */
+	{DRVSTAT_INFO(rxpp_fifo_overflow_drop)},
+	/* Received packets dropped when the RX block runs out of space in
+	 * one of its input FIFOs. This could happen due a long burst of
+	 * minimum-sized (64b) frames in the receive path.
+	 * This counter may also be erroneously incremented rarely.
+	 */
+	{DRVSTAT_INFO(rx_input_fifo_overflow_drop)},
+	{DRVSTAT_INFO(rx_ip_checksum_errs)},
+	{DRVSTAT_INFO(rx_tcp_checksum_errs)},
+	{DRVSTAT_INFO(rx_udp_checksum_errs)},
+	{DRVSTAT_INFO(tx_pauseframes)},
+	{DRVSTAT_INFO(tx_controlframes)},
+	{DRVSTAT_INFO(rx_priority_pause_frames)},
+	{DRVSTAT_INFO(tx_priority_pauseframes)},
+	/* Received packets dropped when an internal fifo going into
+	 * main packet buffer tank (PMEM) overflows.
+	 */
+	{DRVSTAT_INFO(pmem_fifo_overflow_drop)},
+	{DRVSTAT_INFO(jabber_events)},
+	/* Received packets dropped due to lack of available HW packet buffers
+	 * used to temporarily hold the received packets.
+	 */
+	{DRVSTAT_INFO(rx_drops_no_pbuf)},
+	/* Received packets dropped due to input receive buffer
+	 * descriptor fifo overflowing.
+	 */
+	{DRVSTAT_INFO(rx_drops_no_erx_descr)},
+	/* Packets dropped because the internal FIFO to the offloaded TCP
+	 * receive processing block is full. This could happen only for
+	 * offloaded iSCSI or FCoE trarffic.
+	 */
+	{DRVSTAT_INFO(rx_drops_no_tpre_descr)},
+	/* Received packets dropped when they need more than 8
+	 * receive buffers. This cannot happen as the driver configures
+	 * 2048 byte receive buffers.
+	 */
+	{DRVSTAT_INFO(rx_drops_too_many_frags)},
+	{DRVSTAT_INFO(forwarded_packets)},
+	/* Received packets dropped when the frame length
+	 * is more than 9018 bytes
+	 */
+	{DRVSTAT_INFO(rx_drops_mtu)},
+	/* Number of dma mapping errors */
+	{DRVSTAT_INFO(dma_map_errors)},
+	/* Number of packets dropped due to random early drop function */
+	{DRVSTAT_INFO(eth_red_drops)},
+	{DRVSTAT_INFO(be_on_die_temperature)},
+	{DRVSTAT_INFO(rx_roce_bytes_lsd)},
+	{DRVSTAT_INFO(rx_roce_bytes_msd)},
+	{DRVSTAT_INFO(rx_roce_frames)},
+	{DRVSTAT_INFO(roce_drops_payload_len)},
+	{DRVSTAT_INFO(roce_drops_crc)}
+};
+
+#define ETHTOOL_STATS_NUM ARRAY_SIZE(et_stats)
+
+/* Stats related to multi RX queues: get_stats routine assumes bytes, pkts
+ * are first and second members respectively.
+ */
+static const struct be_ethtool_stat et_rx_stats[] = {
+	{DRVSTAT_RX_INFO(rx_bytes)},/* If moving this member see above note */
+	{DRVSTAT_RX_INFO(rx_pkts)}, /* If moving this member see above note */
+	{DRVSTAT_RX_INFO(rx_compl)},
+	{DRVSTAT_RX_INFO(rx_compl_err)},
+	{DRVSTAT_RX_INFO(rx_mcast_pkts)},
+	/* Number of page allocation failures while posting receive buffers
+	 * to HW.
+	 */
+	{DRVSTAT_RX_INFO(rx_post_fail)},
+	/* Recevied packets dropped due to skb allocation failure */
+	{DRVSTAT_RX_INFO(rx_drops_no_skbs)},
+	/* Received packets dropped due to lack of available fetched buffers
+	 * posted by the driver.
+	 */
+	{DRVSTAT_RX_INFO(rx_drops_no_frags)}
+};
+
+#define ETHTOOL_RXSTATS_NUM (ARRAY_SIZE(et_rx_stats))
+
+/* Stats related to multi TX queues: get_stats routine assumes compl is the
+ * first member
+ */
+static const struct be_ethtool_stat et_tx_stats[] = {
+	{DRVSTAT_TX_INFO(tx_compl)}, /* If moving this member see above note */
+	/* This counter is incremented when the HW encounters an error while
+	 * parsing the packet header of an outgoing TX request. This counter is
+	 * applicable only for BE2, BE3 and Skyhawk based adapters.
+	 */
+	{DRVSTAT_TX_INFO(tx_hdr_parse_err)},
+	/* This counter is incremented when an error occurs in the DMA
+	 * operation associated with the TX request from the host to the device.
+	 */
+	{DRVSTAT_TX_INFO(tx_dma_err)},
+	/* This counter is incremented when MAC or VLAN spoof checking is
+	 * enabled on the interface and the TX request fails the spoof check
+	 * in HW.
+	 */
+	{DRVSTAT_TX_INFO(tx_spoof_check_err)},
+	/* This counter is incremented when the HW encounters an error while
+	 * performing TSO offload. This counter is applicable only for Lancer
+	 * adapters.
+	 */
+	{DRVSTAT_TX_INFO(tx_tso_err)},
+	/* This counter is incremented when the HW detects Q-in-Q style VLAN
+	 * tagging in a packet and such tagging is not expected on the outgoing
+	 * interface. This counter is applicable only for Lancer adapters.
+	 */
+	{DRVSTAT_TX_INFO(tx_qinq_err)},
+	/* This counter is incremented when the HW detects parity errors in the
+	 * packet data. This counter is applicable only for Lancer adapters.
+	 */
+	{DRVSTAT_TX_INFO(tx_internal_parity_err)},
+	{DRVSTAT_TX_INFO(tx_bytes)},
+	{DRVSTAT_TX_INFO(tx_pkts)},
+	/* Number of skbs queued for trasmission by the driver */
+	{DRVSTAT_TX_INFO(tx_reqs)},
+	/* Number of TX work request blocks DMAed to HW */
+	{DRVSTAT_TX_INFO(tx_wrbs)},
+	/* Number of times the TX queue was stopped due to lack
+	 * of spaces in the TXQ.
+	 */
+	{DRVSTAT_TX_INFO(tx_stops)},
+	/* Pkts dropped in the driver's transmit path */
+	{DRVSTAT_TX_INFO(tx_drv_drops)}
+};
+
+#define ETHTOOL_TXSTATS_NUM (ARRAY_SIZE(et_tx_stats))
+
+static const char et_self_tests[][ETH_GSTRING_LEN] = {
+	"MAC Loopback test",
+	"PHY Loopback test",
+	"External Loopback test",
+	"DDR DMA test",
+	"Link test"
+};
+
+#define ETHTOOL_TESTS_NUM ARRAY_SIZE(et_self_tests)
+#define BE_MAC_LOOPBACK 0x0
+#define BE_PHY_LOOPBACK 0x1
+#define BE_ONE_PORT_EXT_LOOPBACK 0x2
+#define BE_NO_LOOPBACK 0xff
+
+static void be_get_drvinfo(struct net_device *netdev,
+			   struct ethtool_drvinfo *drvinfo)
+{
+	struct be_adapter *adapter = netdev_priv(netdev);
+
+	strlcpy(drvinfo->driver, DRV_NAME, sizeof(drvinfo->driver));
+	strlcpy(drvinfo->version, DRV_VER, sizeof(drvinfo->version));
+	if (!memcmp(adapter->fw_ver, adapter->fw_on_flash, FW_VER_LEN))
+		strlcpy(drvinfo->fw_version, adapter->fw_ver,
+			sizeof(drvinfo->fw_version));
+	else
+		snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version),
+			 "%s [%s]", adapter->fw_ver, adapter->fw_on_flash);
+
+	strlcpy(drvinfo->bus_info, pci_name(adapter->pdev),
+		sizeof(drvinfo->bus_info));
+	drvinfo->testinfo_len = 0;
+	drvinfo->regdump_len = 0;
+	drvinfo->eedump_len = 0;
+}
+
+static u32 lancer_cmd_get_file_len(struct be_adapter *adapter, u8 *file_name)
+{
+	u32 data_read = 0, eof;
+	u8 addn_status;
+	struct be_dma_mem data_len_cmd;
+	int status;
+
+	memset(&data_len_cmd, 0, sizeof(data_len_cmd));
+	/* data_offset and data_size should be 0 to get reg len */
+	status = lancer_cmd_read_object(adapter, &data_len_cmd, 0, 0,
+					file_name, &data_read, &eof,
+					&addn_status);
+
+	return data_read;
+}
+
+static int lancer_cmd_read_file(struct be_adapter *adapter, u8 *file_name,
+				u32 buf_len, void *buf)
+{
+	struct be_dma_mem read_cmd;
+	u32 read_len = 0, total_read_len = 0, chunk_size;
+	u32 eof = 0;
+	u8 addn_status;
+	int status = 0;
+
+	read_cmd.size = LANCER_READ_FILE_CHUNK;
+	read_cmd.va = pci_alloc_consistent(adapter->pdev, read_cmd.size,
+					   &read_cmd.dma);
+
+	if (!read_cmd.va) {
+		dev_err(&adapter->pdev->dev,
+			"Memory allocation failure while reading dump\n");
+		return -ENOMEM;
+	}
+
+	while ((total_read_len < buf_len) && !eof) {
+		chunk_size = min_t(u32, (buf_len - total_read_len),
+				   LANCER_READ_FILE_CHUNK);
+		chunk_size = ALIGN(chunk_size, 4);
+		status = lancer_cmd_read_object(adapter, &read_cmd, chunk_size,
+						total_read_len, file_name,
+						&read_len, &eof, &addn_status);
+		if (!status) {
+			memcpy(buf + total_read_len, read_cmd.va, read_len);
+			total_read_len += read_len;
+			eof &= LANCER_READ_FILE_EOF_MASK;
+		} else {
+			status = -EIO;
+			break;
+		}
+	}
+	pci_free_consistent(adapter->pdev, read_cmd.size, read_cmd.va,
+			    read_cmd.dma);
+
+	return status;
+}
+
+static int be_get_reg_len(struct net_device *netdev)
+{
+	struct be_adapter *adapter = netdev_priv(netdev);
+	u32 log_size = 0;
+
+	if (!check_privilege(adapter, MAX_PRIVILEGES))
+		return 0;
+
+	if (be_physfn(adapter)) {
+		if (lancer_chip(adapter))
+			log_size = lancer_cmd_get_file_len(adapter,
+							   LANCER_FW_DUMP_FILE);
+		else
+			be_cmd_get_reg_len(adapter, &log_size);
+	}
+	return log_size;
+}
+
+static void
+be_get_regs(struct net_device *netdev, struct ethtool_regs *regs, void *buf)
+{
+	struct be_adapter *adapter = netdev_priv(netdev);
+
+	if (be_physfn(adapter)) {
+		memset(buf, 0, regs->len);
+		if (lancer_chip(adapter))
+			lancer_cmd_read_file(adapter, LANCER_FW_DUMP_FILE,
+					     regs->len, buf);
+		else
+			be_cmd_get_regs(adapter, regs->len, buf);
+	}
+}
+
+static int be_get_coalesce(struct net_device *netdev,
+			   struct ethtool_coalesce *et)
+{
+	struct be_adapter *adapter = netdev_priv(netdev);
+	struct be_aic_obj *aic = &adapter->aic_obj[0];
+
+	et->rx_coalesce_usecs = aic->prev_eqd;
+	et->rx_coalesce_usecs_high = aic->max_eqd;
+	et->rx_coalesce_usecs_low = aic->min_eqd;
+
+	et->tx_coalesce_usecs = aic->prev_eqd;
+	et->tx_coalesce_usecs_high = aic->max_eqd;
+	et->tx_coalesce_usecs_low = aic->min_eqd;
+
+	et->use_adaptive_rx_coalesce = aic->enable;
+	et->use_adaptive_tx_coalesce = aic->enable;
+
+	return 0;
+}
+
+/* TX attributes are ignored. Only RX attributes are considered
+ * eqd cmd is issued in the worker thread.
+ */
+static int be_set_coalesce(struct net_device *netdev,
+			   struct ethtool_coalesce *et)
+{
+	struct be_adapter *adapter = netdev_priv(netdev);
+	struct be_aic_obj *aic = &adapter->aic_obj[0];
+	struct be_eq_obj *eqo;
+	int i;
+
+	for_all_evt_queues(adapter, eqo, i) {
+		aic->enable = et->use_adaptive_rx_coalesce;
+		aic->max_eqd = min(et->rx_coalesce_usecs_high, BE_MAX_EQD);
+		aic->min_eqd = min(et->rx_coalesce_usecs_low, aic->max_eqd);
+		aic->et_eqd = min(et->rx_coalesce_usecs, aic->max_eqd);
+		aic->et_eqd = max(aic->et_eqd, aic->min_eqd);
+		aic++;
+	}
+
+	return 0;
+}
+
+static void be_get_ethtool_stats(struct net_device *netdev,
+				 struct ethtool_stats *stats, uint64_t *data)
+{
+	struct be_adapter *adapter = netdev_priv(netdev);
+	struct be_rx_obj *rxo;
+	struct be_tx_obj *txo;
+	void *p;
+	unsigned int i, j, base = 0, start;
+
+	for (i = 0; i < ETHTOOL_STATS_NUM; i++) {
+		p = (u8 *)&adapter->drv_stats + et_stats[i].offset;
+		data[i] = *(u32 *)p;
+	}
+	base += ETHTOOL_STATS_NUM;
+
+	for_all_rx_queues(adapter, rxo, j) {
+		struct be_rx_stats *stats = rx_stats(rxo);
+
+		do {
+			start = u64_stats_fetch_begin_irq(&stats->sync);
+			data[base] = stats->rx_bytes;
+			data[base + 1] = stats->rx_pkts;
+		} while (u64_stats_fetch_retry_irq(&stats->sync, start));
+
+		for (i = 2; i < ETHTOOL_RXSTATS_NUM; i++) {
+			p = (u8 *)stats + et_rx_stats[i].offset;
+			data[base + i] = *(u32 *)p;
+		}
+		base += ETHTOOL_RXSTATS_NUM;
+	}
+
+	for_all_tx_queues(adapter, txo, j) {
+		struct be_tx_stats *stats = tx_stats(txo);
+
+		do {
+			start = u64_stats_fetch_begin_irq(&stats->sync_compl);
+			data[base] = stats->tx_compl;
+		} while (u64_stats_fetch_retry_irq(&stats->sync_compl, start));
+
+		do {
+			start = u64_stats_fetch_begin_irq(&stats->sync);
+			for (i = 1; i < ETHTOOL_TXSTATS_NUM; i++) {
+				p = (u8 *)stats + et_tx_stats[i].offset;
+				data[base + i] =
+					(et_tx_stats[i].size == sizeof(u64)) ?
+						*(u64 *)p : *(u32 *)p;
+			}
+		} while (u64_stats_fetch_retry_irq(&stats->sync, start));
+		base += ETHTOOL_TXSTATS_NUM;
+	}
+}
+
+static void be_get_stat_strings(struct net_device *netdev, uint32_t stringset,
+				uint8_t *data)
+{
+	struct be_adapter *adapter = netdev_priv(netdev);
+	int i, j;
+
+	switch (stringset) {
+	case ETH_SS_STATS:
+		for (i = 0; i < ETHTOOL_STATS_NUM; i++) {
+			memcpy(data, et_stats[i].desc, ETH_GSTRING_LEN);
+			data += ETH_GSTRING_LEN;
+		}
+		for (i = 0; i < adapter->num_rx_qs; i++) {
+			for (j = 0; j < ETHTOOL_RXSTATS_NUM; j++) {
+				sprintf(data, "rxq%d: %s", i,
+					et_rx_stats[j].desc);
+				data += ETH_GSTRING_LEN;
+			}
+		}
+		for (i = 0; i < adapter->num_tx_qs; i++) {
+			for (j = 0; j < ETHTOOL_TXSTATS_NUM; j++) {
+				sprintf(data, "txq%d: %s", i,
+					et_tx_stats[j].desc);
+				data += ETH_GSTRING_LEN;
+			}
+		}
+		break;
+	case ETH_SS_TEST:
+		for (i = 0; i < ETHTOOL_TESTS_NUM; i++) {
+			memcpy(data, et_self_tests[i], ETH_GSTRING_LEN);
+			data += ETH_GSTRING_LEN;
+		}
+		break;
+	}
+}
+
+static int be_get_sset_count(struct net_device *netdev, int stringset)
+{
+	struct be_adapter *adapter = netdev_priv(netdev);
+
+	switch (stringset) {
+	case ETH_SS_TEST:
+		return ETHTOOL_TESTS_NUM;
+	case ETH_SS_STATS:
+		return ETHTOOL_STATS_NUM +
+			adapter->num_rx_qs * ETHTOOL_RXSTATS_NUM +
+			adapter->num_tx_qs * ETHTOOL_TXSTATS_NUM;
+	default:
+		return -EINVAL;
+	}
+}
+
+static u32 be_get_port_type(struct be_adapter *adapter)
+{
+	u32 port;
+
+	switch (adapter->phy.interface_type) {
+	case PHY_TYPE_BASET_1GB:
+	case PHY_TYPE_BASEX_1GB:
+	case PHY_TYPE_SGMII:
+		port = PORT_TP;
+		break;
+	case PHY_TYPE_SFP_PLUS_10GB:
+		if (adapter->phy.cable_type & SFP_PLUS_COPPER_CABLE)
+			port = PORT_DA;
+		else
+			port = PORT_FIBRE;
+		break;
+	case PHY_TYPE_QSFP:
+		if (adapter->phy.cable_type & QSFP_PLUS_CR4_CABLE)
+			port = PORT_DA;
+		else
+			port = PORT_FIBRE;
+		break;
+	case PHY_TYPE_XFP_10GB:
+	case PHY_TYPE_SFP_1GB:
+		port = PORT_FIBRE;
+		break;
+	case PHY_TYPE_BASET_10GB:
+		port = PORT_TP;
+		break;
+	default:
+		port = PORT_OTHER;
+	}
+
+	return port;
+}
+
+static u32 convert_to_et_setting(struct be_adapter *adapter, u32 if_speeds)
+{
+	u32 val = 0;
+
+	switch (adapter->phy.interface_type) {
+	case PHY_TYPE_BASET_1GB:
+	case PHY_TYPE_BASEX_1GB:
+	case PHY_TYPE_SGMII:
+		val |= SUPPORTED_TP;
+		if (if_speeds & BE_SUPPORTED_SPEED_1GBPS)
+			val |= SUPPORTED_1000baseT_Full;
+		if (if_speeds & BE_SUPPORTED_SPEED_100MBPS)
+			val |= SUPPORTED_100baseT_Full;
+		if (if_speeds & BE_SUPPORTED_SPEED_10MBPS)
+			val |= SUPPORTED_10baseT_Full;
+		break;
+	case PHY_TYPE_KX4_10GB:
+		val |= SUPPORTED_Backplane;
+		if (if_speeds & BE_SUPPORTED_SPEED_1GBPS)
+			val |= SUPPORTED_1000baseKX_Full;
+		if (if_speeds & BE_SUPPORTED_SPEED_10GBPS)
+			val |= SUPPORTED_10000baseKX4_Full;
+		break;
+	case PHY_TYPE_KR2_20GB:
+		val |= SUPPORTED_Backplane;
+		if (if_speeds & BE_SUPPORTED_SPEED_10GBPS)
+			val |= SUPPORTED_10000baseKR_Full;
+		if (if_speeds & BE_SUPPORTED_SPEED_20GBPS)
+			val |= SUPPORTED_20000baseKR2_Full;
+		break;
+	case PHY_TYPE_KR_10GB:
+		val |= SUPPORTED_Backplane |
+				SUPPORTED_10000baseKR_Full;
+		break;
+	case PHY_TYPE_KR4_40GB:
+		val |= SUPPORTED_Backplane;
+		if (if_speeds & BE_SUPPORTED_SPEED_10GBPS)
+			val |= SUPPORTED_10000baseKR_Full;
+		if (if_speeds & BE_SUPPORTED_SPEED_40GBPS)
+			val |= SUPPORTED_40000baseKR4_Full;
+		break;
+	case PHY_TYPE_QSFP:
+		if (if_speeds & BE_SUPPORTED_SPEED_40GBPS) {
+			switch (adapter->phy.cable_type) {
+			case QSFP_PLUS_CR4_CABLE:
+				val |= SUPPORTED_40000baseCR4_Full;
+				break;
+			case QSFP_PLUS_LR4_CABLE:
+				val |= SUPPORTED_40000baseLR4_Full;
+				break;
+			default:
+				val |= SUPPORTED_40000baseSR4_Full;
+				break;
+			}
+		}
+	case PHY_TYPE_SFP_PLUS_10GB:
+	case PHY_TYPE_XFP_10GB:
+	case PHY_TYPE_SFP_1GB:
+		val |= SUPPORTED_FIBRE;
+		if (if_speeds & BE_SUPPORTED_SPEED_10GBPS)
+			val |= SUPPORTED_10000baseT_Full;
+		if (if_speeds & BE_SUPPORTED_SPEED_1GBPS)
+			val |= SUPPORTED_1000baseT_Full;
+		break;
+	case PHY_TYPE_BASET_10GB:
+		val |= SUPPORTED_TP;
+		if (if_speeds & BE_SUPPORTED_SPEED_10GBPS)
+			val |= SUPPORTED_10000baseT_Full;
+		if (if_speeds & BE_SUPPORTED_SPEED_1GBPS)
+			val |= SUPPORTED_1000baseT_Full;
+		if (if_speeds & BE_SUPPORTED_SPEED_100MBPS)
+			val |= SUPPORTED_100baseT_Full;
+		break;
+	default:
+		val |= SUPPORTED_TP;
+	}
+
+	return val;
+}
+
+bool be_pause_supported(struct be_adapter *adapter)
+{
+	return (adapter->phy.interface_type == PHY_TYPE_SFP_PLUS_10GB ||
+		adapter->phy.interface_type == PHY_TYPE_XFP_10GB) ?
+		false : true;
+}
+
+static int be_get_settings(struct net_device *netdev, struct ethtool_cmd *ecmd)
+{
+	struct be_adapter *adapter = netdev_priv(netdev);
+	u8 link_status;
+	u16 link_speed = 0;
+	int status;
+	u32 auto_speeds;
+	u32 fixed_speeds;
+
+	if (adapter->phy.link_speed < 0) {
+		status = be_cmd_link_status_query(adapter, &link_speed,
+						  &link_status, 0);
+		if (!status)
+			be_link_status_update(adapter, link_status);
+		ethtool_cmd_speed_set(ecmd, link_speed);
+
+		status = be_cmd_get_phy_info(adapter);
+		if (!status) {
+			auto_speeds = adapter->phy.auto_speeds_supported;
+			fixed_speeds = adapter->phy.fixed_speeds_supported;
+
+			be_cmd_query_cable_type(adapter);
+
+			ecmd->supported =
+				convert_to_et_setting(adapter,
+						      auto_speeds |
+						      fixed_speeds);
+			ecmd->advertising =
+				convert_to_et_setting(adapter, auto_speeds);
+
+			ecmd->port = be_get_port_type(adapter);
+
+			if (adapter->phy.auto_speeds_supported) {
+				ecmd->supported |= SUPPORTED_Autoneg;
+				ecmd->autoneg = AUTONEG_ENABLE;
+				ecmd->advertising |= ADVERTISED_Autoneg;
+			}
+
+			ecmd->supported |= SUPPORTED_Pause;
+			if (be_pause_supported(adapter))
+				ecmd->advertising |= ADVERTISED_Pause;
+
+			switch (adapter->phy.interface_type) {
+			case PHY_TYPE_KR_10GB:
+			case PHY_TYPE_KX4_10GB:
+				ecmd->transceiver = XCVR_INTERNAL;
+				break;
+			default:
+				ecmd->transceiver = XCVR_EXTERNAL;
+				break;
+			}
+		} else {
+			ecmd->port = PORT_OTHER;
+			ecmd->autoneg = AUTONEG_DISABLE;
+			ecmd->transceiver = XCVR_DUMMY1;
+		}
+
+		/* Save for future use */
+		adapter->phy.link_speed = ethtool_cmd_speed(ecmd);
+		adapter->phy.port_type = ecmd->port;
+		adapter->phy.transceiver = ecmd->transceiver;
+		adapter->phy.autoneg = ecmd->autoneg;
+		adapter->phy.advertising = ecmd->advertising;
+		adapter->phy.supported = ecmd->supported;
+	} else {
+		ethtool_cmd_speed_set(ecmd, adapter->phy.link_speed);
+		ecmd->port = adapter->phy.port_type;
+		ecmd->transceiver = adapter->phy.transceiver;
+		ecmd->autoneg = adapter->phy.autoneg;
+		ecmd->advertising = adapter->phy.advertising;
+		ecmd->supported = adapter->phy.supported;
+	}
+
+	ecmd->duplex = netif_carrier_ok(netdev) ? DUPLEX_FULL : DUPLEX_UNKNOWN;
+	ecmd->phy_address = adapter->port_num;
+
+	return 0;
+}
+
+static void be_get_ringparam(struct net_device *netdev,
+			     struct ethtool_ringparam *ring)
+{
+	struct be_adapter *adapter = netdev_priv(netdev);
+
+	ring->rx_max_pending = adapter->rx_obj[0].q.len;
+	ring->rx_pending = adapter->rx_obj[0].q.len;
+	ring->tx_max_pending = adapter->tx_obj[0].q.len;
+	ring->tx_pending = adapter->tx_obj[0].q.len;
+}
+
+static void
+be_get_pauseparam(struct net_device *netdev, struct ethtool_pauseparam *ecmd)
+{
+	struct be_adapter *adapter = netdev_priv(netdev);
+
+	be_cmd_get_flow_control(adapter, &ecmd->tx_pause, &ecmd->rx_pause);
+	ecmd->autoneg = adapter->phy.fc_autoneg;
+}
+
+static int
+be_set_pauseparam(struct net_device *netdev, struct ethtool_pauseparam *ecmd)
+{
+	struct be_adapter *adapter = netdev_priv(netdev);
+	int status;
+
+	if (ecmd->autoneg != adapter->phy.fc_autoneg)
+		return -EINVAL;
+	adapter->tx_fc = ecmd->tx_pause;
+	adapter->rx_fc = ecmd->rx_pause;
+
+	status = be_cmd_set_flow_control(adapter,
+					 adapter->tx_fc, adapter->rx_fc);
+	if (status)
+		dev_warn(&adapter->pdev->dev, "Pause param set failed\n");
+
+	return be_cmd_status(status);
+}
+
+static int be_set_phys_id(struct net_device *netdev,
+			  enum ethtool_phys_id_state state)
+{
+	struct be_adapter *adapter = netdev_priv(netdev);
+
+	switch (state) {
+	case ETHTOOL_ID_ACTIVE:
+		be_cmd_get_beacon_state(adapter, adapter->hba_port_num,
+					&adapter->beacon_state);
+		return 1;	/* cycle on/off once per second */
+
+	case ETHTOOL_ID_ON:
+		be_cmd_set_beacon_state(adapter, adapter->hba_port_num, 0, 0,
+					BEACON_STATE_ENABLED);
+		break;
+
+	case ETHTOOL_ID_OFF:
+		be_cmd_set_beacon_state(adapter, adapter->hba_port_num, 0, 0,
+					BEACON_STATE_DISABLED);
+		break;
+
+	case ETHTOOL_ID_INACTIVE:
+		be_cmd_set_beacon_state(adapter, adapter->hba_port_num, 0, 0,
+					adapter->beacon_state);
+	}
+
+	return 0;
+}
+
+static int be_set_dump(struct net_device *netdev, struct ethtool_dump *dump)
+{
+	struct be_adapter *adapter = netdev_priv(netdev);
+	struct device *dev = &adapter->pdev->dev;
+	int status;
+
+	if (!lancer_chip(adapter) ||
+	    !check_privilege(adapter, MAX_PRIVILEGES))
+		return -EOPNOTSUPP;
+
+	switch (dump->flag) {
+	case LANCER_INITIATE_FW_DUMP:
+		status = lancer_initiate_dump(adapter);
+		if (!status)
+			dev_info(dev, "FW dump initiated successfully\n");
+		break;
+	case LANCER_DELETE_FW_DUMP:
+		status = lancer_delete_dump(adapter);
+		if (!status)
+			dev_info(dev, "FW dump deleted successfully\n");
+	break;
+	default:
+		dev_err(dev, "Invalid dump level: 0x%x\n", dump->flag);
+		return -EINVAL;
+	}
+	return status;
+}
+
+static void be_get_wol(struct net_device *netdev, struct ethtool_wolinfo *wol)
+{
+	struct be_adapter *adapter = netdev_priv(netdev);
+
+	if (adapter->wol_cap & BE_WOL_CAP) {
+		wol->supported |= WAKE_MAGIC;
+		if (adapter->wol_en)
+			wol->wolopts |= WAKE_MAGIC;
+	} else {
+		wol->wolopts = 0;
+	}
+	memset(&wol->sopass, 0, sizeof(wol->sopass));
+}
+
+static int be_set_wol(struct net_device *netdev, struct ethtool_wolinfo *wol)
+{
+	struct be_adapter *adapter = netdev_priv(netdev);
+
+	if (wol->wolopts & ~WAKE_MAGIC)
+		return -EOPNOTSUPP;
+
+	if (!(adapter->wol_cap & BE_WOL_CAP)) {
+		dev_warn(&adapter->pdev->dev, "WOL not supported\n");
+		return -EOPNOTSUPP;
+	}
+
+	if (wol->wolopts & WAKE_MAGIC)
+		adapter->wol_en = true;
+	else
+		adapter->wol_en = false;
+
+	return 0;
+}
+
+static int be_test_ddr_dma(struct be_adapter *adapter)
+{
+	int ret, i;
+	struct be_dma_mem ddrdma_cmd;
+	static const u64 pattern[2] = {
+		0x5a5a5a5a5a5a5a5aULL, 0xa5a5a5a5a5a5a5a5ULL
+	};
+
+	ddrdma_cmd.size = sizeof(struct be_cmd_req_ddrdma_test);
+	ddrdma_cmd.va = dma_alloc_coherent(&adapter->pdev->dev, ddrdma_cmd.size,
+					   &ddrdma_cmd.dma, GFP_KERNEL);
+	if (!ddrdma_cmd.va)
+		return -ENOMEM;
+
+	for (i = 0; i < 2; i++) {
+		ret = be_cmd_ddr_dma_test(adapter, pattern[i],
+					  4096, &ddrdma_cmd);
+		if (ret != 0)
+			goto err;
+	}
+
+err:
+	dma_free_coherent(&adapter->pdev->dev, ddrdma_cmd.size, ddrdma_cmd.va,
+			  ddrdma_cmd.dma);
+	return be_cmd_status(ret);
+}
+
+static u64 be_loopback_test(struct be_adapter *adapter, u8 loopback_type,
+			    u64 *status)
+{
+	be_cmd_set_loopback(adapter, adapter->hba_port_num, loopback_type, 1);
+	*status = be_cmd_loopback_test(adapter, adapter->hba_port_num,
+				       loopback_type, 1500, 2, 0xabc);
+	be_cmd_set_loopback(adapter, adapter->hba_port_num, BE_NO_LOOPBACK, 1);
+	return *status;
+}
+
+static void be_self_test(struct net_device *netdev, struct ethtool_test *test,
+			 u64 *data)
+{
+	struct be_adapter *adapter = netdev_priv(netdev);
+	int status;
+	u8 link_status = 0;
+
+	if (adapter->function_caps & BE_FUNCTION_CAPS_SUPER_NIC) {
+		dev_err(&adapter->pdev->dev, "Self test not supported\n");
+		test->flags |= ETH_TEST_FL_FAILED;
+		return;
+	}
+
+	memset(data, 0, sizeof(u64) * ETHTOOL_TESTS_NUM);
+
+	if (test->flags & ETH_TEST_FL_OFFLINE) {
+		if (be_loopback_test(adapter, BE_MAC_LOOPBACK, &data[0]) != 0)
+			test->flags |= ETH_TEST_FL_FAILED;
+
+		if (be_loopback_test(adapter, BE_PHY_LOOPBACK, &data[1]) != 0)
+			test->flags |= ETH_TEST_FL_FAILED;
+
+		if (test->flags & ETH_TEST_FL_EXTERNAL_LB) {
+			if (be_loopback_test(adapter, BE_ONE_PORT_EXT_LOOPBACK,
+					     &data[2]) != 0)
+				test->flags |= ETH_TEST_FL_FAILED;
+			test->flags |= ETH_TEST_FL_EXTERNAL_LB_DONE;
+		}
+	}
+
+	if (!lancer_chip(adapter) && be_test_ddr_dma(adapter) != 0) {
+		data[3] = 1;
+		test->flags |= ETH_TEST_FL_FAILED;
+	}
+
+	status = be_cmd_link_status_query(adapter, NULL, &link_status, 0);
+	if (status) {
+		test->flags |= ETH_TEST_FL_FAILED;
+		data[4] = -1;
+	} else if (!link_status) {
+		test->flags |= ETH_TEST_FL_FAILED;
+		data[4] = 1;
+	}
+}
+
+static int be_do_flash(struct net_device *netdev, struct ethtool_flash *efl)
+{
+	struct be_adapter *adapter = netdev_priv(netdev);
+
+	return be_load_fw(adapter, efl->data);
+}
+
+static int be_get_eeprom_len(struct net_device *netdev)
+{
+	struct be_adapter *adapter = netdev_priv(netdev);
+
+	if (!check_privilege(adapter, MAX_PRIVILEGES))
+		return 0;
+
+	if (lancer_chip(adapter)) {
+		if (be_physfn(adapter))
+			return lancer_cmd_get_file_len(adapter,
+						       LANCER_VPD_PF_FILE);
+		else
+			return lancer_cmd_get_file_len(adapter,
+						       LANCER_VPD_VF_FILE);
+	} else {
+		return BE_READ_SEEPROM_LEN;
+	}
+}
+
+static int be_read_eeprom(struct net_device *netdev,
+			  struct ethtool_eeprom *eeprom, uint8_t *data)
+{
+	struct be_adapter *adapter = netdev_priv(netdev);
+	struct be_dma_mem eeprom_cmd;
+	struct be_cmd_resp_seeprom_read *resp;
+	int status;
+
+	if (!eeprom->len)
+		return -EINVAL;
+
+	if (lancer_chip(adapter)) {
+		if (be_physfn(adapter))
+			return lancer_cmd_read_file(adapter, LANCER_VPD_PF_FILE,
+						    eeprom->len, data);
+		else
+			return lancer_cmd_read_file(adapter, LANCER_VPD_VF_FILE,
+						    eeprom->len, data);
+	}
+
+	eeprom->magic = BE_VENDOR_ID | (adapter->pdev->device<<16);
+
+	memset(&eeprom_cmd, 0, sizeof(struct be_dma_mem));
+	eeprom_cmd.size = sizeof(struct be_cmd_req_seeprom_read);
+	eeprom_cmd.va = dma_alloc_coherent(&adapter->pdev->dev, eeprom_cmd.size,
+					   &eeprom_cmd.dma, GFP_KERNEL);
+
+	if (!eeprom_cmd.va)
+		return -ENOMEM;
+
+	status = be_cmd_get_seeprom_data(adapter, &eeprom_cmd);
+
+	if (!status) {
+		resp = eeprom_cmd.va;
+		memcpy(data, resp->seeprom_data + eeprom->offset, eeprom->len);
+	}
+	dma_free_coherent(&adapter->pdev->dev, eeprom_cmd.size, eeprom_cmd.va,
+			  eeprom_cmd.dma);
+
+	return be_cmd_status(status);
+}
+
+static u32 be_get_msg_level(struct net_device *netdev)
+{
+	struct be_adapter *adapter = netdev_priv(netdev);
+
+	return adapter->msg_enable;
+}
+
+static void be_set_msg_level(struct net_device *netdev, u32 level)
+{
+	struct be_adapter *adapter = netdev_priv(netdev);
+
+	if (adapter->msg_enable == level)
+		return;
+
+	if ((level & NETIF_MSG_HW) != (adapter->msg_enable & NETIF_MSG_HW))
+		if (BEx_chip(adapter))
+			be_cmd_set_fw_log_level(adapter, level & NETIF_MSG_HW ?
+						FW_LOG_LEVEL_DEFAULT :
+						FW_LOG_LEVEL_FATAL);
+	adapter->msg_enable = level;
+}
+
+static u64 be_get_rss_hash_opts(struct be_adapter *adapter, u64 flow_type)
+{
+	u64 data = 0;
+
+	switch (flow_type) {
+	case TCP_V4_FLOW:
+		if (adapter->rss_info.rss_flags & RSS_ENABLE_IPV4)
+			data |= RXH_IP_DST | RXH_IP_SRC;
+		if (adapter->rss_info.rss_flags & RSS_ENABLE_TCP_IPV4)
+			data |= RXH_L4_B_0_1 | RXH_L4_B_2_3;
+		break;
+	case UDP_V4_FLOW:
+		if (adapter->rss_info.rss_flags & RSS_ENABLE_IPV4)
+			data |= RXH_IP_DST | RXH_IP_SRC;
+		if (adapter->rss_info.rss_flags & RSS_ENABLE_UDP_IPV4)
+			data |= RXH_L4_B_0_1 | RXH_L4_B_2_3;
+		break;
+	case TCP_V6_FLOW:
+		if (adapter->rss_info.rss_flags & RSS_ENABLE_IPV6)
+			data |= RXH_IP_DST | RXH_IP_SRC;
+		if (adapter->rss_info.rss_flags & RSS_ENABLE_TCP_IPV6)
+			data |= RXH_L4_B_0_1 | RXH_L4_B_2_3;
+		break;
+	case UDP_V6_FLOW:
+		if (adapter->rss_info.rss_flags & RSS_ENABLE_IPV6)
+			data |= RXH_IP_DST | RXH_IP_SRC;
+		if (adapter->rss_info.rss_flags & RSS_ENABLE_UDP_IPV6)
+			data |= RXH_L4_B_0_1 | RXH_L4_B_2_3;
+		break;
+	}
+
+	return data;
+}
+
+static int be_get_rxnfc(struct net_device *netdev, struct ethtool_rxnfc *cmd,
+			u32 *rule_locs)
+{
+	struct be_adapter *adapter = netdev_priv(netdev);
+
+	if (!be_multi_rxq(adapter)) {
+		dev_info(&adapter->pdev->dev,
+			 "ethtool::get_rxnfc: RX flow hashing is disabled\n");
+		return -EINVAL;
+	}
+
+	switch (cmd->cmd) {
+	case ETHTOOL_GRXFH:
+		cmd->data = be_get_rss_hash_opts(adapter, cmd->flow_type);
+		break;
+	case ETHTOOL_GRXRINGS:
+		cmd->data = adapter->num_rx_qs - 1;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int be_set_rss_hash_opts(struct be_adapter *adapter,
+				struct ethtool_rxnfc *cmd)
+{
+	struct be_rx_obj *rxo;
+	int status = 0, i, j;
+	u8 rsstable[128];
+	u32 rss_flags = adapter->rss_info.rss_flags;
+
+	if (cmd->data != L3_RSS_FLAGS &&
+	    cmd->data != (L3_RSS_FLAGS | L4_RSS_FLAGS))
+		return -EINVAL;
+
+	switch (cmd->flow_type) {
+	case TCP_V4_FLOW:
+		if (cmd->data == L3_RSS_FLAGS)
+			rss_flags &= ~RSS_ENABLE_TCP_IPV4;
+		else if (cmd->data == (L3_RSS_FLAGS | L4_RSS_FLAGS))
+			rss_flags |= RSS_ENABLE_IPV4 |
+					RSS_ENABLE_TCP_IPV4;
+		break;
+	case TCP_V6_FLOW:
+		if (cmd->data == L3_RSS_FLAGS)
+			rss_flags &= ~RSS_ENABLE_TCP_IPV6;
+		else if (cmd->data == (L3_RSS_FLAGS | L4_RSS_FLAGS))
+			rss_flags |= RSS_ENABLE_IPV6 |
+					RSS_ENABLE_TCP_IPV6;
+		break;
+	case UDP_V4_FLOW:
+		if ((cmd->data == (L3_RSS_FLAGS | L4_RSS_FLAGS)) &&
+		    BEx_chip(adapter))
+			return -EINVAL;
+
+		if (cmd->data == L3_RSS_FLAGS)
+			rss_flags &= ~RSS_ENABLE_UDP_IPV4;
+		else if (cmd->data == (L3_RSS_FLAGS | L4_RSS_FLAGS))
+			rss_flags |= RSS_ENABLE_IPV4 |
+					RSS_ENABLE_UDP_IPV4;
+		break;
+	case UDP_V6_FLOW:
+		if ((cmd->data == (L3_RSS_FLAGS | L4_RSS_FLAGS)) &&
+		    BEx_chip(adapter))
+			return -EINVAL;
+
+		if (cmd->data == L3_RSS_FLAGS)
+			rss_flags &= ~RSS_ENABLE_UDP_IPV6;
+		else if (cmd->data == (L3_RSS_FLAGS | L4_RSS_FLAGS))
+			rss_flags |= RSS_ENABLE_IPV6 |
+					RSS_ENABLE_UDP_IPV6;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (rss_flags == adapter->rss_info.rss_flags)
+		return status;
+
+	if (be_multi_rxq(adapter)) {
+		for (j = 0; j < 128; j += adapter->num_rx_qs - 1) {
+			for_all_rss_queues(adapter, rxo, i) {
+				if ((j + i) >= 128)
+					break;
+				rsstable[j + i] = rxo->rss_id;
+			}
+		}
+	}
+
+	status = be_cmd_rss_config(adapter, adapter->rss_info.rsstable,
+				   rss_flags, 128, adapter->rss_info.rss_hkey);
+	if (!status)
+		adapter->rss_info.rss_flags = rss_flags;
+
+	return be_cmd_status(status);
+}
+
+static int be_set_rxnfc(struct net_device *netdev, struct ethtool_rxnfc *cmd)
+{
+	struct be_adapter *adapter = netdev_priv(netdev);
+	int status = 0;
+
+	if (!be_multi_rxq(adapter)) {
+		dev_err(&adapter->pdev->dev,
+			"ethtool::set_rxnfc: RX flow hashing is disabled\n");
+		return -EINVAL;
+	}
+
+	switch (cmd->cmd) {
+	case ETHTOOL_SRXFH:
+		status = be_set_rss_hash_opts(adapter, cmd);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return status;
+}
+
+static void be_get_channels(struct net_device *netdev,
+			    struct ethtool_channels *ch)
+{
+	struct be_adapter *adapter = netdev_priv(netdev);
+
+	ch->combined_count = adapter->num_evt_qs;
+	ch->max_combined = be_max_qs(adapter);
+}
+
+static int be_set_channels(struct net_device  *netdev,
+			   struct ethtool_channels *ch)
+{
+	struct be_adapter *adapter = netdev_priv(netdev);
+	int status;
+
+	if (ch->rx_count || ch->tx_count || ch->other_count ||
+	    !ch->combined_count || ch->combined_count > be_max_qs(adapter))
+		return -EINVAL;
+
+	adapter->cfg_num_qs = ch->combined_count;
+
+	status = be_update_queues(adapter);
+	return be_cmd_status(status);
+}
+
+static u32 be_get_rxfh_indir_size(struct net_device *netdev)
+{
+	return RSS_INDIR_TABLE_LEN;
+}
+
+static u32 be_get_rxfh_key_size(struct net_device *netdev)
+{
+	return RSS_HASH_KEY_LEN;
+}
+
+static int be_get_rxfh(struct net_device *netdev, u32 *indir, u8 *hkey)
+{
+	struct be_adapter *adapter = netdev_priv(netdev);
+	int i;
+	struct rss_info *rss = &adapter->rss_info;
+
+	if (indir) {
+		for (i = 0; i < RSS_INDIR_TABLE_LEN; i++)
+			indir[i] = rss->rss_queue[i];
+	}
+
+	if (hkey)
+		memcpy(hkey, rss->rss_hkey, RSS_HASH_KEY_LEN);
+
+	return 0;
+}
+
+static int be_set_rxfh(struct net_device *netdev, const u32 *indir,
+		       const u8 *hkey)
+{
+	int rc = 0, i, j;
+	struct be_adapter *adapter = netdev_priv(netdev);
+	u8 rsstable[RSS_INDIR_TABLE_LEN];
+
+	if (indir) {
+		struct be_rx_obj *rxo;
+
+		for (i = 0; i < RSS_INDIR_TABLE_LEN; i++) {
+			j = indir[i];
+			rxo = &adapter->rx_obj[j];
+			rsstable[i] = rxo->rss_id;
+			adapter->rss_info.rss_queue[i] = j;
+		}
+	} else {
+		memcpy(rsstable, adapter->rss_info.rsstable,
+		       RSS_INDIR_TABLE_LEN);
+	}
+
+	if (!hkey)
+		hkey =  adapter->rss_info.rss_hkey;
+
+	rc = be_cmd_rss_config(adapter, rsstable,
+			       adapter->rss_info.rss_flags,
+			       RSS_INDIR_TABLE_LEN, hkey);
+	if (rc) {
+		adapter->rss_info.rss_flags = RSS_ENABLE_NONE;
+		return -EIO;
+	}
+	memcpy(adapter->rss_info.rss_hkey, hkey, RSS_HASH_KEY_LEN);
+	memcpy(adapter->rss_info.rsstable, rsstable,
+	       RSS_INDIR_TABLE_LEN);
+	return 0;
+}
+
+static int be_get_module_info(struct net_device *netdev,
+			      struct ethtool_modinfo *modinfo)
+{
+	struct be_adapter *adapter = netdev_priv(netdev);
+	u8 page_data[PAGE_DATA_LEN];
+	int status;
+
+	if (!check_privilege(adapter, MAX_PRIVILEGES))
+		return -EOPNOTSUPP;
+
+	status = be_cmd_read_port_transceiver_data(adapter, TR_PAGE_A0,
+						   page_data);
+	if (!status) {
+		if (!page_data[SFP_PLUS_SFF_8472_COMP]) {
+			modinfo->type = ETH_MODULE_SFF_8079;
+			modinfo->eeprom_len = PAGE_DATA_LEN;
+		} else {
+			modinfo->type = ETH_MODULE_SFF_8472;
+			modinfo->eeprom_len = 2 * PAGE_DATA_LEN;
+		}
+	}
+	return be_cmd_status(status);
+}
+
+static int be_get_module_eeprom(struct net_device *netdev,
+				struct ethtool_eeprom *eeprom, u8 *data)
+{
+	struct be_adapter *adapter = netdev_priv(netdev);
+	int status;
+
+	if (!check_privilege(adapter, MAX_PRIVILEGES))
+		return -EOPNOTSUPP;
+
+	status = be_cmd_read_port_transceiver_data(adapter, TR_PAGE_A0,
+						   data);
+	if (status)
+		goto err;
+
+	if (eeprom->offset + eeprom->len > PAGE_DATA_LEN) {
+		status = be_cmd_read_port_transceiver_data(adapter,
+							   TR_PAGE_A2,
+							   data +
+							   PAGE_DATA_LEN);
+		if (status)
+			goto err;
+	}
+	if (eeprom->offset)
+		memcpy(data, data + eeprom->offset, eeprom->len);
+err:
+	return be_cmd_status(status);
+}
+
+const struct ethtool_ops be_ethtool_ops = {
+	.get_settings = be_get_settings,
+	.get_drvinfo = be_get_drvinfo,
+	.get_wol = be_get_wol,
+	.set_wol = be_set_wol,
+	.get_link = ethtool_op_get_link,
+	.get_eeprom_len = be_get_eeprom_len,
+	.get_eeprom = be_read_eeprom,
+	.get_coalesce = be_get_coalesce,
+	.set_coalesce = be_set_coalesce,
+	.get_ringparam = be_get_ringparam,
+	.get_pauseparam = be_get_pauseparam,
+	.set_pauseparam = be_set_pauseparam,
+	.get_strings = be_get_stat_strings,
+	.set_phys_id = be_set_phys_id,
+	.set_dump = be_set_dump,
+	.get_msglevel = be_get_msg_level,
+	.set_msglevel = be_set_msg_level,
+	.get_sset_count = be_get_sset_count,
+	.get_ethtool_stats = be_get_ethtool_stats,
+	.get_regs_len = be_get_reg_len,
+	.get_regs = be_get_regs,
+	.flash_device = be_do_flash,
+	.self_test = be_self_test,
+	.get_rxnfc = be_get_rxnfc,
+	.set_rxnfc = be_set_rxnfc,
+	.get_rxfh_indir_size = be_get_rxfh_indir_size,
+	.get_rxfh_key_size = be_get_rxfh_key_size,
+	.get_rxfh = be_get_rxfh,
+	.set_rxfh = be_set_rxfh,
+	.get_channels = be_get_channels,
+	.set_channels = be_set_channels,
+	.get_module_info = be_get_module_info,
+	.get_module_eeprom = be_get_module_eeprom
+};
diff --git a/drivers/net/ethernet/emulex/benet/be_hw.h b/drivers/net/ethernet/emulex/benet/be_hw.h
new file mode 100644
index 0000000..295ee08
--- /dev/null
+++ b/drivers/net/ethernet/emulex/benet/be_hw.h
@@ -0,0 +1,572 @@
+/*
+ * Copyright (C) 2005 - 2014 Emulex
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.  The full GNU General
+ * Public License is included in this distribution in the file called COPYING.
+ *
+ * Contact Information:
+ * linux-drivers@emulex.com
+ *
+ * Emulex
+ * 3333 Susan Street
+ * Costa Mesa, CA 92626
+ */
+
+/********* Mailbox door bell *************/
+/* Used for driver communication with the FW.
+ * The software must write this register twice to post any command. First,
+ * it writes the register with hi=1 and the upper bits of the physical address
+ * for the MAILBOX structure. Software must poll the ready bit until this
+ * is acknowledged. Then, sotware writes the register with hi=0 with the lower
+ * bits in the address. It must poll the ready bit until the command is
+ * complete. Upon completion, the MAILBOX will contain a valid completion
+ * queue entry.
+ */
+#define MPU_MAILBOX_DB_OFFSET	0x160
+#define MPU_MAILBOX_DB_RDY_MASK	0x1 	/* bit 0 */
+#define MPU_MAILBOX_DB_HI_MASK	0x2	/* bit 1 */
+
+#define MPU_EP_CONTROL 		0
+
+/********** MPU semphore: used for SH & BE  *************/
+#define SLIPORT_SEMAPHORE_OFFSET_BEx		0xac  /* CSR BAR offset */
+#define SLIPORT_SEMAPHORE_OFFSET_SH		0x94  /* PCI-CFG offset */
+#define POST_STAGE_MASK				0x0000FFFF
+#define POST_ERR_MASK				0x1
+#define POST_ERR_SHIFT				31
+
+/* MPU semphore POST stage values */
+#define POST_STAGE_AWAITING_HOST_RDY 	0x1 /* FW awaiting goahead from host */
+#define POST_STAGE_HOST_RDY 		0x2 /* Host has given go-ahed to FW */
+#define POST_STAGE_BE_RESET		0x3 /* Host wants to reset chip */
+#define POST_STAGE_ARMFW_RDY		0xc000	/* FW is done with POST */
+
+
+/* Lancer SLIPORT registers */
+#define SLIPORT_STATUS_OFFSET		0x404
+#define SLIPORT_CONTROL_OFFSET		0x408
+#define SLIPORT_ERROR1_OFFSET		0x40C
+#define SLIPORT_ERROR2_OFFSET		0x410
+#define PHYSDEV_CONTROL_OFFSET		0x414
+
+#define SLIPORT_STATUS_ERR_MASK		0x80000000
+#define SLIPORT_STATUS_DIP_MASK		0x02000000
+#define SLIPORT_STATUS_RN_MASK		0x01000000
+#define SLIPORT_STATUS_RDY_MASK		0x00800000
+#define SLI_PORT_CONTROL_IP_MASK	0x08000000
+#define PHYSDEV_CONTROL_FW_RESET_MASK	0x00000002
+#define PHYSDEV_CONTROL_DD_MASK		0x00000004
+#define PHYSDEV_CONTROL_INP_MASK	0x40000000
+
+#define SLIPORT_ERROR_NO_RESOURCE1	0x2
+#define SLIPORT_ERROR_NO_RESOURCE2	0x9
+
+#define SLIPORT_ERROR_FW_RESET1		0x2
+#define SLIPORT_ERROR_FW_RESET2		0x0
+
+/********* Memory BAR register ************/
+#define PCICFG_MEMBAR_CTRL_INT_CTRL_OFFSET 	0xfc
+/* Host Interrupt Enable, if set interrupts are enabled although "PCI Interrupt
+ * Disable" may still globally block interrupts in addition to individual
+ * interrupt masks; a mechanism for the device driver to block all interrupts
+ * atomically without having to arbitrate for the PCI Interrupt Disable bit
+ * with the OS.
+ */
+#define MEMBAR_CTRL_INT_CTRL_HOSTINTR_MASK	(1 << 29) /* bit 29 */
+
+/********* PCI Function Capability *********/
+#define BE_FUNCTION_CAPS_RSS			0x2
+#define BE_FUNCTION_CAPS_SUPER_NIC		0x40
+
+/********* Power management (WOL) **********/
+#define PCICFG_PM_CONTROL_OFFSET		0x44
+#define PCICFG_PM_CONTROL_MASK			0x108	/* bits 3 & 8 */
+
+/********* Online Control Registers *******/
+#define PCICFG_ONLINE0				0xB0
+#define PCICFG_ONLINE1				0xB4
+
+/********* UE Status and Mask Registers ***/
+#define PCICFG_UE_STATUS_LOW			0xA0
+#define PCICFG_UE_STATUS_HIGH			0xA4
+#define PCICFG_UE_STATUS_LOW_MASK		0xA8
+#define PCICFG_UE_STATUS_HI_MASK		0xAC
+
+/******** SLI_INTF ***********************/
+#define SLI_INTF_REG_OFFSET			0x58
+#define SLI_INTF_VALID_MASK			0xE0000000
+#define SLI_INTF_VALID				0xC0000000
+#define SLI_INTF_HINT2_MASK			0x1F000000
+#define SLI_INTF_HINT2_SHIFT			24
+#define SLI_INTF_HINT1_MASK			0x00FF0000
+#define SLI_INTF_HINT1_SHIFT			16
+#define SLI_INTF_FAMILY_MASK			0x00000F00
+#define SLI_INTF_FAMILY_SHIFT			8
+#define SLI_INTF_IF_TYPE_MASK			0x0000F000
+#define SLI_INTF_IF_TYPE_SHIFT			12
+#define SLI_INTF_REV_MASK			0x000000F0
+#define SLI_INTF_REV_SHIFT			4
+#define SLI_INTF_FT_MASK			0x00000001
+
+#define SLI_INTF_TYPE_2		2
+#define SLI_INTF_TYPE_3		3
+
+/********* ISR0 Register offset **********/
+#define CEV_ISR0_OFFSET 			0xC18
+#define CEV_ISR_SIZE				4
+
+/********* Event Q door bell *************/
+#define DB_EQ_OFFSET			DB_CQ_OFFSET
+#define DB_EQ_RING_ID_MASK		0x1FF	/* bits 0 - 8 */
+#define DB_EQ_RING_ID_EXT_MASK		0x3e00  /* bits 9-13 */
+#define DB_EQ_RING_ID_EXT_MASK_SHIFT	(2) /* qid bits 9-13 placing at 11-15 */
+
+/* Clear the interrupt for this eq */
+#define DB_EQ_CLR_SHIFT			(9)	/* bit 9 */
+/* Must be 1 */
+#define DB_EQ_EVNT_SHIFT		(10)	/* bit 10 */
+/* Number of event entries processed */
+#define DB_EQ_NUM_POPPED_SHIFT		(16)	/* bits 16 - 28 */
+/* Rearm bit */
+#define DB_EQ_REARM_SHIFT		(29)	/* bit 29 */
+
+/********* Compl Q door bell *************/
+#define DB_CQ_OFFSET 			0x120
+#define DB_CQ_RING_ID_MASK		0x3FF	/* bits 0 - 9 */
+#define DB_CQ_RING_ID_EXT_MASK		0x7C00	/* bits 10-14 */
+#define DB_CQ_RING_ID_EXT_MASK_SHIFT	(1)	/* qid bits 10-14
+						 placing at 11-15 */
+
+/* Number of event entries processed */
+#define DB_CQ_NUM_POPPED_SHIFT		(16) 	/* bits 16 - 28 */
+/* Rearm bit */
+#define DB_CQ_REARM_SHIFT		(29) 	/* bit 29 */
+
+/********** TX ULP door bell *************/
+#define DB_TXULP1_OFFSET		0x60
+#define DB_TXULP_RING_ID_MASK		0x7FF	/* bits 0 - 10 */
+/* Number of tx entries posted */
+#define DB_TXULP_NUM_POSTED_SHIFT	(16)	/* bits 16 - 29 */
+#define DB_TXULP_NUM_POSTED_MASK	0x3FFF	/* bits 16 - 29 */
+
+/********** RQ(erx) door bell ************/
+#define DB_RQ_OFFSET 			0x100
+#define DB_RQ_RING_ID_MASK		0x3FF	/* bits 0 - 9 */
+/* Number of rx frags posted */
+#define DB_RQ_NUM_POSTED_SHIFT		(24)	/* bits 24 - 31 */
+
+/********** MCC door bell ************/
+#define DB_MCCQ_OFFSET 			0x140
+#define DB_MCCQ_RING_ID_MASK		0x7FF	/* bits 0 - 10 */
+/* Number of entries posted */
+#define DB_MCCQ_NUM_POSTED_SHIFT	(16)	/* bits 16 - 29 */
+
+/********** SRIOV VF PCICFG OFFSET ********/
+#define SRIOV_VF_PCICFG_OFFSET		(4096)
+
+/********** FAT TABLE  ********/
+#define RETRIEVE_FAT	0
+#define QUERY_FAT	1
+
+/* Flashrom related descriptors */
+#define MAX_FLASH_COMP			32
+#define IMAGE_TYPE_FIRMWARE		160
+#define IMAGE_TYPE_BOOTCODE		224
+#define IMAGE_TYPE_OPTIONROM		32
+
+#define NUM_FLASHDIR_ENTRIES		32
+
+#define OPTYPE_ISCSI_ACTIVE		0
+#define OPTYPE_REDBOOT			1
+#define OPTYPE_BIOS			2
+#define OPTYPE_PXE_BIOS			3
+#define OPTYPE_FCOE_BIOS		8
+#define OPTYPE_ISCSI_BACKUP		9
+#define OPTYPE_FCOE_FW_ACTIVE		10
+#define OPTYPE_FCOE_FW_BACKUP		11
+#define OPTYPE_NCSI_FW			13
+#define OPTYPE_REDBOOT_DIR		18
+#define OPTYPE_REDBOOT_CONFIG		19
+#define OPTYPE_SH_PHY_FW		21
+#define OPTYPE_FLASHISM_JUMPVECTOR	22
+#define OPTYPE_UFI_DIR			23
+#define OPTYPE_PHY_FW			99
+#define TN_8022				13
+
+#define FLASHROM_OPER_PHY_FLASH		9
+#define FLASHROM_OPER_PHY_SAVE		10
+#define FLASHROM_OPER_FLASH		1
+#define FLASHROM_OPER_SAVE		2
+#define FLASHROM_OPER_REPORT		4
+
+#define FLASH_IMAGE_MAX_SIZE_g2		(1310720) /* Max firmware image size */
+#define FLASH_BIOS_IMAGE_MAX_SIZE_g2	(262144)  /* Max OPTION ROM image sz */
+#define FLASH_REDBOOT_IMAGE_MAX_SIZE_g2	(262144)  /* Max Redboot image sz    */
+#define FLASH_IMAGE_MAX_SIZE_g3		(2097152) /* Max firmware image size */
+#define FLASH_BIOS_IMAGE_MAX_SIZE_g3	(524288)  /* Max OPTION ROM image sz */
+#define FLASH_REDBOOT_IMAGE_MAX_SIZE_g3	(1048576)  /* Max Redboot image sz    */
+#define FLASH_NCSI_IMAGE_MAX_SIZE_g3	(262144)
+#define FLASH_PHY_FW_IMAGE_MAX_SIZE_g3	262144
+
+#define FLASH_NCSI_MAGIC		(0x16032009)
+#define FLASH_NCSI_DISABLED		(0)
+#define FLASH_NCSI_ENABLED		(1)
+
+#define FLASH_NCSI_BITFILE_HDR_OFFSET	(0x600000)
+
+/* Offsets for components on Flash. */
+#define FLASH_iSCSI_PRIMARY_IMAGE_START_g2 (1048576)
+#define FLASH_iSCSI_BACKUP_IMAGE_START_g2  (2359296)
+#define FLASH_FCoE_PRIMARY_IMAGE_START_g2  (3670016)
+#define FLASH_FCoE_BACKUP_IMAGE_START_g2   (4980736)
+#define FLASH_iSCSI_BIOS_START_g2          (7340032)
+#define FLASH_PXE_BIOS_START_g2            (7864320)
+#define FLASH_FCoE_BIOS_START_g2           (524288)
+#define FLASH_REDBOOT_START_g2		  (0)
+
+#define FLASH_NCSI_START_g3		   (15990784)
+#define FLASH_iSCSI_PRIMARY_IMAGE_START_g3 (2097152)
+#define FLASH_iSCSI_BACKUP_IMAGE_START_g3  (4194304)
+#define FLASH_FCoE_PRIMARY_IMAGE_START_g3  (6291456)
+#define FLASH_FCoE_BACKUP_IMAGE_START_g3   (8388608)
+#define FLASH_iSCSI_BIOS_START_g3          (12582912)
+#define FLASH_PXE_BIOS_START_g3            (13107200)
+#define FLASH_FCoE_BIOS_START_g3           (13631488)
+#define FLASH_REDBOOT_START_g3             (262144)
+#define FLASH_PHY_FW_START_g3		   1310720
+
+#define IMAGE_NCSI			16
+#define IMAGE_OPTION_ROM_PXE		32
+#define IMAGE_OPTION_ROM_FCoE		33
+#define IMAGE_OPTION_ROM_ISCSI		34
+#define IMAGE_FLASHISM_JUMPVECTOR	48
+#define IMAGE_FLASH_ISM			49
+#define IMAGE_JUMP_VECTOR		50
+#define IMAGE_FIRMWARE_iSCSI		160
+#define IMAGE_FIRMWARE_COMP_iSCSI	161
+#define IMAGE_FIRMWARE_FCoE		162
+#define IMAGE_FIRMWARE_COMP_FCoE	163
+#define IMAGE_FIRMWARE_BACKUP_iSCSI	176
+#define IMAGE_FIRMWARE_BACKUP_COMP_iSCSI 177
+#define IMAGE_FIRMWARE_BACKUP_FCoE	178
+#define IMAGE_FIRMWARE_BACKUP_COMP_FCoE 179
+#define IMAGE_FIRMWARE_PHY		192
+#define IMAGE_REDBOOT_DIR		208
+#define IMAGE_REDBOOT_CONFIG		209
+#define IMAGE_UFI_DIR			210
+#define IMAGE_BOOT_CODE			224
+
+/************* Rx Packet Type Encoding **************/
+#define BE_UNICAST_PACKET		0
+#define BE_MULTICAST_PACKET		1
+#define BE_BROADCAST_PACKET		2
+#define BE_RSVD_PACKET			3
+
+/*
+ * BE descriptors: host memory data structures whose formats
+ * are hardwired in BE silicon.
+ */
+/* Event Queue Descriptor */
+#define EQ_ENTRY_VALID_MASK 		0x1	/* bit 0 */
+#define EQ_ENTRY_RES_ID_MASK 		0xFFFF	/* bits 16 - 31 */
+#define EQ_ENTRY_RES_ID_SHIFT 		16
+
+struct be_eq_entry {
+	u32 evt;
+};
+
+/* TX Queue Descriptor */
+#define ETH_WRB_FRAG_LEN_MASK		0xFFFF
+struct be_eth_wrb {
+	u32 frag_pa_hi;		/* dword 0 */
+	u32 frag_pa_lo;		/* dword 1 */
+	u32 rsvd0;		/* dword 2 */
+	u32 frag_len;		/* dword 3: bits 0 - 15 */
+} __packed;
+
+/* Pseudo amap definition for eth_hdr_wrb in which each bit of the
+ * actual structure is defined as a byte : used to calculate
+ * offset/shift/mask of each field */
+struct amap_eth_hdr_wrb {
+	u8 rsvd0[32];		/* dword 0 */
+	u8 rsvd1[32];		/* dword 1 */
+	u8 complete;		/* dword 2 */
+	u8 event;
+	u8 crc;
+	u8 forward;
+	u8 lso6;
+	u8 mgmt;
+	u8 ipcs;
+	u8 udpcs;
+	u8 tcpcs;
+	u8 lso;
+	u8 vlan;
+	u8 gso[2];
+	u8 num_wrb[5];
+	u8 lso_mss[14];
+	u8 len[16];		/* dword 3 */
+	u8 vlan_tag[16];
+} __packed;
+
+struct be_eth_hdr_wrb {
+	u32 dw[4];
+};
+
+/********* Tx Compl Status Encoding *********/
+#define BE_TX_COMP_HDR_PARSE_ERR	0x2
+#define BE_TX_COMP_NDMA_ERR		0x3
+#define BE_TX_COMP_ACL_ERR		0x5
+
+#define LANCER_TX_COMP_LSO_ERR			0x1
+#define LANCER_TX_COMP_HSW_DROP_MAC_ERR		0x3
+#define LANCER_TX_COMP_HSW_DROP_VLAN_ERR	0x5
+#define LANCER_TX_COMP_QINQ_ERR			0x7
+#define LANCER_TX_COMP_PARITY_ERR		0xb
+#define LANCER_TX_COMP_DMA_ERR			0xd
+
+/* TX Compl Queue Descriptor */
+
+/* Pseudo amap definition for eth_tx_compl in which each bit of the
+ * actual structure is defined as a byte: used to calculate
+ * offset/shift/mask of each field */
+struct amap_eth_tx_compl {
+	u8 wrb_index[16];	/* dword 0 */
+	u8 ct[2]; 		/* dword 0 */
+	u8 port[2];		/* dword 0 */
+	u8 rsvd0[8];		/* dword 0 */
+	u8 status[4];		/* dword 0 */
+	u8 user_bytes[16];	/* dword 1 */
+	u8 nwh_bytes[8];	/* dword 1 */
+	u8 lso;			/* dword 1 */
+	u8 cast_enc[2];		/* dword 1 */
+	u8 rsvd1[5];		/* dword 1 */
+	u8 rsvd2[32];		/* dword 2 */
+	u8 pkts[16];		/* dword 3 */
+	u8 ringid[11];		/* dword 3 */
+	u8 hash_val[4];		/* dword 3 */
+	u8 valid;		/* dword 3 */
+} __packed;
+
+struct be_eth_tx_compl {
+	u32 dw[4];
+};
+
+/* RX Queue Descriptor */
+struct be_eth_rx_d {
+	u32 fragpa_hi;
+	u32 fragpa_lo;
+};
+
+/* RX Compl Queue Descriptor */
+
+/* Pseudo amap definition for BE2 and BE3 legacy mode eth_rx_compl in which
+ * each bit of the actual structure is defined as a byte: used to calculate
+ * offset/shift/mask of each field */
+struct amap_eth_rx_compl_v0 {
+	u8 vlan_tag[16];	/* dword 0 */
+	u8 pktsize[14];		/* dword 0 */
+	u8 port;		/* dword 0 */
+	u8 ip_opt;		/* dword 0 */
+	u8 err;			/* dword 1 */
+	u8 rsshp;		/* dword 1 */
+	u8 ipf;			/* dword 1 */
+	u8 tcpf;		/* dword 1 */
+	u8 udpf;		/* dword 1 */
+	u8 ipcksm;		/* dword 1 */
+	u8 l4_cksm;		/* dword 1 */
+	u8 ip_version;		/* dword 1 */
+	u8 macdst[6];		/* dword 1 */
+	u8 vtp;			/* dword 1 */
+	u8 ip_frag;		/* dword 1 */
+	u8 fragndx[10];		/* dword 1 */
+	u8 ct[2];		/* dword 1 */
+	u8 sw;			/* dword 1 */
+	u8 numfrags[3];		/* dword 1 */
+	u8 rss_flush;		/* dword 2 */
+	u8 cast_enc[2];		/* dword 2 */
+	u8 qnq;			/* dword 2 */
+	u8 rss_bank;		/* dword 2 */
+	u8 rsvd1[23];		/* dword 2 */
+	u8 lro_pkt;		/* dword 2 */
+	u8 rsvd2[2];		/* dword 2 */
+	u8 valid;		/* dword 2 */
+	u8 rsshash[32];		/* dword 3 */
+} __packed;
+
+/* Pseudo amap definition for BE3 native mode eth_rx_compl in which
+ * each bit of the actual structure is defined as a byte: used to calculate
+ * offset/shift/mask of each field */
+struct amap_eth_rx_compl_v1 {
+	u8 vlan_tag[16];	/* dword 0 */
+	u8 pktsize[14];		/* dword 0 */
+	u8 vtp;			/* dword 0 */
+	u8 ip_opt;		/* dword 0 */
+	u8 err;			/* dword 1 */
+	u8 rsshp;		/* dword 1 */
+	u8 ipf;			/* dword 1 */
+	u8 tcpf;		/* dword 1 */
+	u8 udpf;		/* dword 1 */
+	u8 ipcksm;		/* dword 1 */
+	u8 l4_cksm;		/* dword 1 */
+	u8 ip_version;		/* dword 1 */
+	u8 macdst[7];		/* dword 1 */
+	u8 rsvd0;		/* dword 1 */
+	u8 fragndx[10];		/* dword 1 */
+	u8 ct[2];		/* dword 1 */
+	u8 sw;			/* dword 1 */
+	u8 numfrags[3];		/* dword 1 */
+	u8 rss_flush;		/* dword 2 */
+	u8 cast_enc[2];		/* dword 2 */
+	u8 qnq;			/* dword 2 */
+	u8 rss_bank;		/* dword 2 */
+	u8 port[2];		/* dword 2 */
+	u8 vntagp;		/* dword 2 */
+	u8 header_len[8];	/* dword 2 */
+	u8 header_split[2];	/* dword 2 */
+	u8 rsvd1[12];		/* dword 2 */
+	u8 tunneled;
+	u8 valid;		/* dword 2 */
+	u8 rsshash[32];		/* dword 3 */
+} __packed;
+
+struct be_eth_rx_compl {
+	u32 dw[4];
+};
+
+struct mgmt_hba_attribs {
+	u8 flashrom_version_string[32];
+	u8 manufacturer_name[32];
+	u32 supported_modes;
+	u32 rsvd0[3];
+	u8 ncsi_ver_string[12];
+	u32 default_extended_timeout;
+	u8 controller_model_number[32];
+	u8 controller_description[64];
+	u8 controller_serial_number[32];
+	u8 ip_version_string[32];
+	u8 firmware_version_string[32];
+	u8 bios_version_string[32];
+	u8 redboot_version_string[32];
+	u8 driver_version_string[32];
+	u8 fw_on_flash_version_string[32];
+	u32 functionalities_supported;
+	u16 max_cdblength;
+	u8 asic_revision;
+	u8 generational_guid[16];
+	u8 hba_port_count;
+	u16 default_link_down_timeout;
+	u8 iscsi_ver_min_max;
+	u8 multifunction_device;
+	u8 cache_valid;
+	u8 hba_status;
+	u8 max_domains_supported;
+	u8 phy_port;
+	u32 firmware_post_status;
+	u32 hba_mtu[8];
+	u32 rsvd1[4];
+};
+
+struct mgmt_controller_attrib {
+	struct mgmt_hba_attribs hba_attribs;
+	u16 pci_vendor_id;
+	u16 pci_device_id;
+	u16 pci_sub_vendor_id;
+	u16 pci_sub_system_id;
+	u8 pci_bus_number;
+	u8 pci_device_number;
+	u8 pci_function_number;
+	u8 interface_type;
+	u64 unique_identifier;
+	u32 rsvd0[5];
+};
+
+struct controller_id {
+	u32 vendor;
+	u32 device;
+	u32 subvendor;
+	u32 subdevice;
+};
+
+struct flash_comp {
+	unsigned long offset;
+	int optype;
+	int size;
+	int img_type;
+};
+
+struct image_hdr {
+	u32 imageid;
+	u32 imageoffset;
+	u32 imagelength;
+	u32 image_checksum;
+	u8 image_version[32];
+};
+struct flash_file_hdr_g2 {
+	u8 sign[32];
+	u32 cksum;
+	u32 antidote;
+	struct controller_id cont_id;
+	u32 file_len;
+	u32 chunk_num;
+	u32 total_chunks;
+	u32 num_imgs;
+	u8 build[24];
+};
+
+struct flash_file_hdr_g3 {
+	u8 sign[52];
+	u8 ufi_version[4];
+	u32 file_len;
+	u32 cksum;
+	u32 antidote;
+	u32 num_imgs;
+	u8 build[24];
+	u8 asic_type_rev;
+	u8 rsvd[31];
+};
+
+struct flash_section_hdr {
+	u32 format_rev;
+	u32 cksum;
+	u32 antidote;
+	u32 num_images;
+	u8 id_string[128];
+	u32 rsvd[4];
+} __packed;
+
+struct flash_section_hdr_g2 {
+	u32 format_rev;
+	u32 cksum;
+	u32 antidote;
+	u32 build_num;
+	u8 id_string[128];
+	u32 rsvd[8];
+} __packed;
+
+struct flash_section_entry {
+	u32 type;
+	u32 offset;
+	u32 pad_size;
+	u32 image_size;
+	u32 cksum;
+	u32 entry_point;
+	u16 optype;
+	u16 rsvd0;
+	u32 rsvd1;
+	u8 ver_data[32];
+} __packed;
+
+struct flash_section_info {
+	u8 cookie[32];
+	struct flash_section_hdr fsec_hdr;
+	struct flash_section_entry fsec_entry[32];
+} __packed;
+
+struct flash_section_info_g2 {
+	u8 cookie[32];
+	struct flash_section_hdr_g2 fsec_hdr;
+	struct flash_section_entry fsec_entry[32];
+} __packed;
diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c
new file mode 100644
index 0000000..597c463
--- /dev/null
+++ b/drivers/net/ethernet/emulex/benet/be_main.c
@@ -0,0 +1,5211 @@
+/*
+ * Copyright (C) 2005 - 2014 Emulex
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.  The full GNU General
+ * Public License is included in this distribution in the file called COPYING.
+ *
+ * Contact Information:
+ * linux-drivers@emulex.com
+ *
+ * Emulex
+ * 3333 Susan Street
+ * Costa Mesa, CA 92626
+ */
+
+#include <linux/prefetch.h>
+#include <linux/module.h>
+#include "be.h"
+#include "be_cmds.h"
+#include <asm/div64.h>
+#include <linux/aer.h>
+#include <linux/if_bridge.h>
+#include <net/busy_poll.h>
+#include <net/vxlan.h>
+
+MODULE_VERSION(DRV_VER);
+MODULE_DEVICE_TABLE(pci, be_dev_ids);
+MODULE_DESCRIPTION(DRV_DESC " " DRV_VER);
+MODULE_AUTHOR("Emulex Corporation");
+MODULE_LICENSE("GPL");
+
+static unsigned int num_vfs;
+module_param(num_vfs, uint, S_IRUGO);
+MODULE_PARM_DESC(num_vfs, "Number of PCI VFs to initialize");
+
+static ushort rx_frag_size = 2048;
+module_param(rx_frag_size, ushort, S_IRUGO);
+MODULE_PARM_DESC(rx_frag_size, "Size of a fragment that holds rcvd data.");
+
+static const struct pci_device_id be_dev_ids[] = {
+	{ PCI_DEVICE(BE_VENDOR_ID, BE_DEVICE_ID1) },
+	{ PCI_DEVICE(BE_VENDOR_ID, BE_DEVICE_ID2) },
+	{ PCI_DEVICE(BE_VENDOR_ID, OC_DEVICE_ID1) },
+	{ PCI_DEVICE(BE_VENDOR_ID, OC_DEVICE_ID2) },
+	{ PCI_DEVICE(EMULEX_VENDOR_ID, OC_DEVICE_ID3)},
+	{ PCI_DEVICE(EMULEX_VENDOR_ID, OC_DEVICE_ID4)},
+	{ PCI_DEVICE(EMULEX_VENDOR_ID, OC_DEVICE_ID5)},
+	{ PCI_DEVICE(EMULEX_VENDOR_ID, OC_DEVICE_ID6)},
+	{ 0 }
+};
+MODULE_DEVICE_TABLE(pci, be_dev_ids);
+/* UE Status Low CSR */
+static const char * const ue_status_low_desc[] = {
+	"CEV",
+	"CTX",
+	"DBUF",
+	"ERX",
+	"Host",
+	"MPU",
+	"NDMA",
+	"PTC ",
+	"RDMA ",
+	"RXF ",
+	"RXIPS ",
+	"RXULP0 ",
+	"RXULP1 ",
+	"RXULP2 ",
+	"TIM ",
+	"TPOST ",
+	"TPRE ",
+	"TXIPS ",
+	"TXULP0 ",
+	"TXULP1 ",
+	"UC ",
+	"WDMA ",
+	"TXULP2 ",
+	"HOST1 ",
+	"P0_OB_LINK ",
+	"P1_OB_LINK ",
+	"HOST_GPIO ",
+	"MBOX ",
+	"ERX2 ",
+	"SPARE ",
+	"JTAG ",
+	"MPU_INTPEND "
+};
+
+/* UE Status High CSR */
+static const char * const ue_status_hi_desc[] = {
+	"LPCMEMHOST",
+	"MGMT_MAC",
+	"PCS0ONLINE",
+	"MPU_IRAM",
+	"PCS1ONLINE",
+	"PCTL0",
+	"PCTL1",
+	"PMEM",
+	"RR",
+	"TXPB",
+	"RXPP",
+	"XAUI",
+	"TXP",
+	"ARM",
+	"IPC",
+	"HOST2",
+	"HOST3",
+	"HOST4",
+	"HOST5",
+	"HOST6",
+	"HOST7",
+	"ECRC",
+	"Poison TLP",
+	"NETC",
+	"PERIPH",
+	"LLTXULP",
+	"D2P",
+	"RCON",
+	"LDMA",
+	"LLTXP",
+	"LLTXPB",
+	"Unknown"
+};
+
+static void be_queue_free(struct be_adapter *adapter, struct be_queue_info *q)
+{
+	struct be_dma_mem *mem = &q->dma_mem;
+
+	if (mem->va) {
+		dma_free_coherent(&adapter->pdev->dev, mem->size, mem->va,
+				  mem->dma);
+		mem->va = NULL;
+	}
+}
+
+static int be_queue_alloc(struct be_adapter *adapter, struct be_queue_info *q,
+			  u16 len, u16 entry_size)
+{
+	struct be_dma_mem *mem = &q->dma_mem;
+
+	memset(q, 0, sizeof(*q));
+	q->len = len;
+	q->entry_size = entry_size;
+	mem->size = len * entry_size;
+	mem->va = dma_zalloc_coherent(&adapter->pdev->dev, mem->size, &mem->dma,
+				      GFP_KERNEL);
+	if (!mem->va)
+		return -ENOMEM;
+	return 0;
+}
+
+static void be_reg_intr_set(struct be_adapter *adapter, bool enable)
+{
+	u32 reg, enabled;
+
+	pci_read_config_dword(adapter->pdev, PCICFG_MEMBAR_CTRL_INT_CTRL_OFFSET,
+			      &reg);
+	enabled = reg & MEMBAR_CTRL_INT_CTRL_HOSTINTR_MASK;
+
+	if (!enabled && enable)
+		reg |= MEMBAR_CTRL_INT_CTRL_HOSTINTR_MASK;
+	else if (enabled && !enable)
+		reg &= ~MEMBAR_CTRL_INT_CTRL_HOSTINTR_MASK;
+	else
+		return;
+
+	pci_write_config_dword(adapter->pdev,
+			       PCICFG_MEMBAR_CTRL_INT_CTRL_OFFSET, reg);
+}
+
+static void be_intr_set(struct be_adapter *adapter, bool enable)
+{
+	int status = 0;
+
+	/* On lancer interrupts can't be controlled via this register */
+	if (lancer_chip(adapter))
+		return;
+
+	if (adapter->eeh_error)
+		return;
+
+	status = be_cmd_intr_set(adapter, enable);
+	if (status)
+		be_reg_intr_set(adapter, enable);
+}
+
+static void be_rxq_notify(struct be_adapter *adapter, u16 qid, u16 posted)
+{
+	u32 val = 0;
+
+	val |= qid & DB_RQ_RING_ID_MASK;
+	val |= posted << DB_RQ_NUM_POSTED_SHIFT;
+
+	wmb();
+	iowrite32(val, adapter->db + DB_RQ_OFFSET);
+}
+
+static void be_txq_notify(struct be_adapter *adapter, struct be_tx_obj *txo,
+			  u16 posted)
+{
+	u32 val = 0;
+
+	val |= txo->q.id & DB_TXULP_RING_ID_MASK;
+	val |= (posted & DB_TXULP_NUM_POSTED_MASK) << DB_TXULP_NUM_POSTED_SHIFT;
+
+	wmb();
+	iowrite32(val, adapter->db + txo->db_offset);
+}
+
+static void be_eq_notify(struct be_adapter *adapter, u16 qid,
+			 bool arm, bool clear_int, u16 num_popped)
+{
+	u32 val = 0;
+
+	val |= qid & DB_EQ_RING_ID_MASK;
+	val |= ((qid & DB_EQ_RING_ID_EXT_MASK) << DB_EQ_RING_ID_EXT_MASK_SHIFT);
+
+	if (adapter->eeh_error)
+		return;
+
+	if (arm)
+		val |= 1 << DB_EQ_REARM_SHIFT;
+	if (clear_int)
+		val |= 1 << DB_EQ_CLR_SHIFT;
+	val |= 1 << DB_EQ_EVNT_SHIFT;
+	val |= num_popped << DB_EQ_NUM_POPPED_SHIFT;
+	iowrite32(val, adapter->db + DB_EQ_OFFSET);
+}
+
+void be_cq_notify(struct be_adapter *adapter, u16 qid, bool arm, u16 num_popped)
+{
+	u32 val = 0;
+
+	val |= qid & DB_CQ_RING_ID_MASK;
+	val |= ((qid & DB_CQ_RING_ID_EXT_MASK) <<
+			DB_CQ_RING_ID_EXT_MASK_SHIFT);
+
+	if (adapter->eeh_error)
+		return;
+
+	if (arm)
+		val |= 1 << DB_CQ_REARM_SHIFT;
+	val |= num_popped << DB_CQ_NUM_POPPED_SHIFT;
+	iowrite32(val, adapter->db + DB_CQ_OFFSET);
+}
+
+static int be_mac_addr_set(struct net_device *netdev, void *p)
+{
+	struct be_adapter *adapter = netdev_priv(netdev);
+	struct device *dev = &adapter->pdev->dev;
+	struct sockaddr *addr = p;
+	int status;
+	u8 mac[ETH_ALEN];
+	u32 old_pmac_id = adapter->pmac_id[0], curr_pmac_id = 0;
+
+	if (!is_valid_ether_addr(addr->sa_data))
+		return -EADDRNOTAVAIL;
+
+	/* Proceed further only if, User provided MAC is different
+	 * from active MAC
+	 */
+	if (ether_addr_equal(addr->sa_data, netdev->dev_addr))
+		return 0;
+
+	/* The PMAC_ADD cmd may fail if the VF doesn't have FILTMGMT
+	 * privilege or if PF did not provision the new MAC address.
+	 * On BE3, this cmd will always fail if the VF doesn't have the
+	 * FILTMGMT privilege. This failure is OK, only if the PF programmed
+	 * the MAC for the VF.
+	 */
+	status = be_cmd_pmac_add(adapter, (u8 *)addr->sa_data,
+				 adapter->if_handle, &adapter->pmac_id[0], 0);
+	if (!status) {
+		curr_pmac_id = adapter->pmac_id[0];
+
+		/* Delete the old programmed MAC. This call may fail if the
+		 * old MAC was already deleted by the PF driver.
+		 */
+		if (adapter->pmac_id[0] != old_pmac_id)
+			be_cmd_pmac_del(adapter, adapter->if_handle,
+					old_pmac_id, 0);
+	}
+
+	/* Decide if the new MAC is successfully activated only after
+	 * querying the FW
+	 */
+	status = be_cmd_get_active_mac(adapter, curr_pmac_id, mac,
+				       adapter->if_handle, true, 0);
+	if (status)
+		goto err;
+
+	/* The MAC change did not happen, either due to lack of privilege
+	 * or PF didn't pre-provision.
+	 */
+	if (!ether_addr_equal(addr->sa_data, mac)) {
+		status = -EPERM;
+		goto err;
+	}
+
+	memcpy(netdev->dev_addr, addr->sa_data, netdev->addr_len);
+	dev_info(dev, "MAC address changed to %pM\n", mac);
+	return 0;
+err:
+	dev_warn(dev, "MAC address change to %pM failed\n", addr->sa_data);
+	return status;
+}
+
+/* BE2 supports only v0 cmd */
+static void *hw_stats_from_cmd(struct be_adapter *adapter)
+{
+	if (BE2_chip(adapter)) {
+		struct be_cmd_resp_get_stats_v0 *cmd = adapter->stats_cmd.va;
+
+		return &cmd->hw_stats;
+	} else if (BE3_chip(adapter)) {
+		struct be_cmd_resp_get_stats_v1 *cmd = adapter->stats_cmd.va;
+
+		return &cmd->hw_stats;
+	} else {
+		struct be_cmd_resp_get_stats_v2 *cmd = adapter->stats_cmd.va;
+
+		return &cmd->hw_stats;
+	}
+}
+
+/* BE2 supports only v0 cmd */
+static void *be_erx_stats_from_cmd(struct be_adapter *adapter)
+{
+	if (BE2_chip(adapter)) {
+		struct be_hw_stats_v0 *hw_stats = hw_stats_from_cmd(adapter);
+
+		return &hw_stats->erx;
+	} else if (BE3_chip(adapter)) {
+		struct be_hw_stats_v1 *hw_stats = hw_stats_from_cmd(adapter);
+
+		return &hw_stats->erx;
+	} else {
+		struct be_hw_stats_v2 *hw_stats = hw_stats_from_cmd(adapter);
+
+		return &hw_stats->erx;
+	}
+}
+
+static void populate_be_v0_stats(struct be_adapter *adapter)
+{
+	struct be_hw_stats_v0 *hw_stats = hw_stats_from_cmd(adapter);
+	struct be_pmem_stats *pmem_sts = &hw_stats->pmem;
+	struct be_rxf_stats_v0 *rxf_stats = &hw_stats->rxf;
+	struct be_port_rxf_stats_v0 *port_stats =
+					&rxf_stats->port[adapter->port_num];
+	struct be_drv_stats *drvs = &adapter->drv_stats;
+
+	be_dws_le_to_cpu(hw_stats, sizeof(*hw_stats));
+	drvs->rx_pause_frames = port_stats->rx_pause_frames;
+	drvs->rx_crc_errors = port_stats->rx_crc_errors;
+	drvs->rx_control_frames = port_stats->rx_control_frames;
+	drvs->rx_in_range_errors = port_stats->rx_in_range_errors;
+	drvs->rx_frame_too_long = port_stats->rx_frame_too_long;
+	drvs->rx_dropped_runt = port_stats->rx_dropped_runt;
+	drvs->rx_ip_checksum_errs = port_stats->rx_ip_checksum_errs;
+	drvs->rx_tcp_checksum_errs = port_stats->rx_tcp_checksum_errs;
+	drvs->rx_udp_checksum_errs = port_stats->rx_udp_checksum_errs;
+	drvs->rxpp_fifo_overflow_drop = port_stats->rx_fifo_overflow;
+	drvs->rx_dropped_tcp_length = port_stats->rx_dropped_tcp_length;
+	drvs->rx_dropped_too_small = port_stats->rx_dropped_too_small;
+	drvs->rx_dropped_too_short = port_stats->rx_dropped_too_short;
+	drvs->rx_out_range_errors = port_stats->rx_out_range_errors;
+	drvs->rx_input_fifo_overflow_drop = port_stats->rx_input_fifo_overflow;
+	drvs->rx_dropped_header_too_small =
+		port_stats->rx_dropped_header_too_small;
+	drvs->rx_address_filtered =
+					port_stats->rx_address_filtered +
+					port_stats->rx_vlan_filtered;
+	drvs->rx_alignment_symbol_errors =
+		port_stats->rx_alignment_symbol_errors;
+
+	drvs->tx_pauseframes = port_stats->tx_pauseframes;
+	drvs->tx_controlframes = port_stats->tx_controlframes;
+
+	if (adapter->port_num)
+		drvs->jabber_events = rxf_stats->port1_jabber_events;
+	else
+		drvs->jabber_events = rxf_stats->port0_jabber_events;
+	drvs->rx_drops_no_pbuf = rxf_stats->rx_drops_no_pbuf;
+	drvs->rx_drops_no_erx_descr = rxf_stats->rx_drops_no_erx_descr;
+	drvs->forwarded_packets = rxf_stats->forwarded_packets;
+	drvs->rx_drops_mtu = rxf_stats->rx_drops_mtu;
+	drvs->rx_drops_no_tpre_descr = rxf_stats->rx_drops_no_tpre_descr;
+	drvs->rx_drops_too_many_frags = rxf_stats->rx_drops_too_many_frags;
+	adapter->drv_stats.eth_red_drops = pmem_sts->eth_red_drops;
+}
+
+static void populate_be_v1_stats(struct be_adapter *adapter)
+{
+	struct be_hw_stats_v1 *hw_stats = hw_stats_from_cmd(adapter);
+	struct be_pmem_stats *pmem_sts = &hw_stats->pmem;
+	struct be_rxf_stats_v1 *rxf_stats = &hw_stats->rxf;
+	struct be_port_rxf_stats_v1 *port_stats =
+					&rxf_stats->port[adapter->port_num];
+	struct be_drv_stats *drvs = &adapter->drv_stats;
+
+	be_dws_le_to_cpu(hw_stats, sizeof(*hw_stats));
+	drvs->pmem_fifo_overflow_drop = port_stats->pmem_fifo_overflow_drop;
+	drvs->rx_priority_pause_frames = port_stats->rx_priority_pause_frames;
+	drvs->rx_pause_frames = port_stats->rx_pause_frames;
+	drvs->rx_crc_errors = port_stats->rx_crc_errors;
+	drvs->rx_control_frames = port_stats->rx_control_frames;
+	drvs->rx_in_range_errors = port_stats->rx_in_range_errors;
+	drvs->rx_frame_too_long = port_stats->rx_frame_too_long;
+	drvs->rx_dropped_runt = port_stats->rx_dropped_runt;
+	drvs->rx_ip_checksum_errs = port_stats->rx_ip_checksum_errs;
+	drvs->rx_tcp_checksum_errs = port_stats->rx_tcp_checksum_errs;
+	drvs->rx_udp_checksum_errs = port_stats->rx_udp_checksum_errs;
+	drvs->rx_dropped_tcp_length = port_stats->rx_dropped_tcp_length;
+	drvs->rx_dropped_too_small = port_stats->rx_dropped_too_small;
+	drvs->rx_dropped_too_short = port_stats->rx_dropped_too_short;
+	drvs->rx_out_range_errors = port_stats->rx_out_range_errors;
+	drvs->rx_dropped_header_too_small =
+		port_stats->rx_dropped_header_too_small;
+	drvs->rx_input_fifo_overflow_drop =
+		port_stats->rx_input_fifo_overflow_drop;
+	drvs->rx_address_filtered = port_stats->rx_address_filtered;
+	drvs->rx_alignment_symbol_errors =
+		port_stats->rx_alignment_symbol_errors;
+	drvs->rxpp_fifo_overflow_drop = port_stats->rxpp_fifo_overflow_drop;
+	drvs->tx_pauseframes = port_stats->tx_pauseframes;
+	drvs->tx_controlframes = port_stats->tx_controlframes;
+	drvs->tx_priority_pauseframes = port_stats->tx_priority_pauseframes;
+	drvs->jabber_events = port_stats->jabber_events;
+	drvs->rx_drops_no_pbuf = rxf_stats->rx_drops_no_pbuf;
+	drvs->rx_drops_no_erx_descr = rxf_stats->rx_drops_no_erx_descr;
+	drvs->forwarded_packets = rxf_stats->forwarded_packets;
+	drvs->rx_drops_mtu = rxf_stats->rx_drops_mtu;
+	drvs->rx_drops_no_tpre_descr = rxf_stats->rx_drops_no_tpre_descr;
+	drvs->rx_drops_too_many_frags = rxf_stats->rx_drops_too_many_frags;
+	adapter->drv_stats.eth_red_drops = pmem_sts->eth_red_drops;
+}
+
+static void populate_be_v2_stats(struct be_adapter *adapter)
+{
+	struct be_hw_stats_v2 *hw_stats = hw_stats_from_cmd(adapter);
+	struct be_pmem_stats *pmem_sts = &hw_stats->pmem;
+	struct be_rxf_stats_v2 *rxf_stats = &hw_stats->rxf;
+	struct be_port_rxf_stats_v2 *port_stats =
+					&rxf_stats->port[adapter->port_num];
+	struct be_drv_stats *drvs = &adapter->drv_stats;
+
+	be_dws_le_to_cpu(hw_stats, sizeof(*hw_stats));
+	drvs->pmem_fifo_overflow_drop = port_stats->pmem_fifo_overflow_drop;
+	drvs->rx_priority_pause_frames = port_stats->rx_priority_pause_frames;
+	drvs->rx_pause_frames = port_stats->rx_pause_frames;
+	drvs->rx_crc_errors = port_stats->rx_crc_errors;
+	drvs->rx_control_frames = port_stats->rx_control_frames;
+	drvs->rx_in_range_errors = port_stats->rx_in_range_errors;
+	drvs->rx_frame_too_long = port_stats->rx_frame_too_long;
+	drvs->rx_dropped_runt = port_stats->rx_dropped_runt;
+	drvs->rx_ip_checksum_errs = port_stats->rx_ip_checksum_errs;
+	drvs->rx_tcp_checksum_errs = port_stats->rx_tcp_checksum_errs;
+	drvs->rx_udp_checksum_errs = port_stats->rx_udp_checksum_errs;
+	drvs->rx_dropped_tcp_length = port_stats->rx_dropped_tcp_length;
+	drvs->rx_dropped_too_small = port_stats->rx_dropped_too_small;
+	drvs->rx_dropped_too_short = port_stats->rx_dropped_too_short;
+	drvs->rx_out_range_errors = port_stats->rx_out_range_errors;
+	drvs->rx_dropped_header_too_small =
+		port_stats->rx_dropped_header_too_small;
+	drvs->rx_input_fifo_overflow_drop =
+		port_stats->rx_input_fifo_overflow_drop;
+	drvs->rx_address_filtered = port_stats->rx_address_filtered;
+	drvs->rx_alignment_symbol_errors =
+		port_stats->rx_alignment_symbol_errors;
+	drvs->rxpp_fifo_overflow_drop = port_stats->rxpp_fifo_overflow_drop;
+	drvs->tx_pauseframes = port_stats->tx_pauseframes;
+	drvs->tx_controlframes = port_stats->tx_controlframes;
+	drvs->tx_priority_pauseframes = port_stats->tx_priority_pauseframes;
+	drvs->jabber_events = port_stats->jabber_events;
+	drvs->rx_drops_no_pbuf = rxf_stats->rx_drops_no_pbuf;
+	drvs->rx_drops_no_erx_descr = rxf_stats->rx_drops_no_erx_descr;
+	drvs->forwarded_packets = rxf_stats->forwarded_packets;
+	drvs->rx_drops_mtu = rxf_stats->rx_drops_mtu;
+	drvs->rx_drops_no_tpre_descr = rxf_stats->rx_drops_no_tpre_descr;
+	drvs->rx_drops_too_many_frags = rxf_stats->rx_drops_too_many_frags;
+	adapter->drv_stats.eth_red_drops = pmem_sts->eth_red_drops;
+	if (be_roce_supported(adapter)) {
+		drvs->rx_roce_bytes_lsd = port_stats->roce_bytes_received_lsd;
+		drvs->rx_roce_bytes_msd = port_stats->roce_bytes_received_msd;
+		drvs->rx_roce_frames = port_stats->roce_frames_received;
+		drvs->roce_drops_crc = port_stats->roce_drops_crc;
+		drvs->roce_drops_payload_len =
+			port_stats->roce_drops_payload_len;
+	}
+}
+
+static void populate_lancer_stats(struct be_adapter *adapter)
+{
+	struct be_drv_stats *drvs = &adapter->drv_stats;
+	struct lancer_pport_stats *pport_stats = pport_stats_from_cmd(adapter);
+
+	be_dws_le_to_cpu(pport_stats, sizeof(*pport_stats));
+	drvs->rx_pause_frames = pport_stats->rx_pause_frames_lo;
+	drvs->rx_crc_errors = pport_stats->rx_crc_errors_lo;
+	drvs->rx_control_frames = pport_stats->rx_control_frames_lo;
+	drvs->rx_in_range_errors = pport_stats->rx_in_range_errors;
+	drvs->rx_frame_too_long = pport_stats->rx_frames_too_long_lo;
+	drvs->rx_dropped_runt = pport_stats->rx_dropped_runt;
+	drvs->rx_ip_checksum_errs = pport_stats->rx_ip_checksum_errors;
+	drvs->rx_tcp_checksum_errs = pport_stats->rx_tcp_checksum_errors;
+	drvs->rx_udp_checksum_errs = pport_stats->rx_udp_checksum_errors;
+	drvs->rx_dropped_tcp_length =
+				pport_stats->rx_dropped_invalid_tcp_length;
+	drvs->rx_dropped_too_small = pport_stats->rx_dropped_too_small;
+	drvs->rx_dropped_too_short = pport_stats->rx_dropped_too_short;
+	drvs->rx_out_range_errors = pport_stats->rx_out_of_range_errors;
+	drvs->rx_dropped_header_too_small =
+				pport_stats->rx_dropped_header_too_small;
+	drvs->rx_input_fifo_overflow_drop = pport_stats->rx_fifo_overflow;
+	drvs->rx_address_filtered =
+					pport_stats->rx_address_filtered +
+					pport_stats->rx_vlan_filtered;
+	drvs->rx_alignment_symbol_errors = pport_stats->rx_symbol_errors_lo;
+	drvs->rxpp_fifo_overflow_drop = pport_stats->rx_fifo_overflow;
+	drvs->tx_pauseframes = pport_stats->tx_pause_frames_lo;
+	drvs->tx_controlframes = pport_stats->tx_control_frames_lo;
+	drvs->jabber_events = pport_stats->rx_jabbers;
+	drvs->forwarded_packets = pport_stats->num_forwards_lo;
+	drvs->rx_drops_mtu = pport_stats->rx_drops_mtu_lo;
+	drvs->rx_drops_too_many_frags =
+				pport_stats->rx_drops_too_many_frags_lo;
+}
+
+static void accumulate_16bit_val(u32 *acc, u16 val)
+{
+#define lo(x)			(x & 0xFFFF)
+#define hi(x)			(x & 0xFFFF0000)
+	bool wrapped = val < lo(*acc);
+	u32 newacc = hi(*acc) + val;
+
+	if (wrapped)
+		newacc += 65536;
+	ACCESS_ONCE(*acc) = newacc;
+}
+
+static void populate_erx_stats(struct be_adapter *adapter,
+			       struct be_rx_obj *rxo, u32 erx_stat)
+{
+	if (!BEx_chip(adapter))
+		rx_stats(rxo)->rx_drops_no_frags = erx_stat;
+	else
+		/* below erx HW counter can actually wrap around after
+		 * 65535. Driver accumulates a 32-bit value
+		 */
+		accumulate_16bit_val(&rx_stats(rxo)->rx_drops_no_frags,
+				     (u16)erx_stat);
+}
+
+void be_parse_stats(struct be_adapter *adapter)
+{
+	struct be_erx_stats_v2 *erx = be_erx_stats_from_cmd(adapter);
+	struct be_rx_obj *rxo;
+	int i;
+	u32 erx_stat;
+
+	if (lancer_chip(adapter)) {
+		populate_lancer_stats(adapter);
+	} else {
+		if (BE2_chip(adapter))
+			populate_be_v0_stats(adapter);
+		else if (BE3_chip(adapter))
+			/* for BE3 */
+			populate_be_v1_stats(adapter);
+		else
+			populate_be_v2_stats(adapter);
+
+		/* erx_v2 is longer than v0, v1. use v2 for v0, v1 access */
+		for_all_rx_queues(adapter, rxo, i) {
+			erx_stat = erx->rx_drops_no_fragments[rxo->q.id];
+			populate_erx_stats(adapter, rxo, erx_stat);
+		}
+	}
+}
+
+static struct rtnl_link_stats64 *be_get_stats64(struct net_device *netdev,
+						struct rtnl_link_stats64 *stats)
+{
+	struct be_adapter *adapter = netdev_priv(netdev);
+	struct be_drv_stats *drvs = &adapter->drv_stats;
+	struct be_rx_obj *rxo;
+	struct be_tx_obj *txo;
+	u64 pkts, bytes;
+	unsigned int start;
+	int i;
+
+	for_all_rx_queues(adapter, rxo, i) {
+		const struct be_rx_stats *rx_stats = rx_stats(rxo);
+
+		do {
+			start = u64_stats_fetch_begin_irq(&rx_stats->sync);
+			pkts = rx_stats(rxo)->rx_pkts;
+			bytes = rx_stats(rxo)->rx_bytes;
+		} while (u64_stats_fetch_retry_irq(&rx_stats->sync, start));
+		stats->rx_packets += pkts;
+		stats->rx_bytes += bytes;
+		stats->multicast += rx_stats(rxo)->rx_mcast_pkts;
+		stats->rx_dropped += rx_stats(rxo)->rx_drops_no_skbs +
+					rx_stats(rxo)->rx_drops_no_frags;
+	}
+
+	for_all_tx_queues(adapter, txo, i) {
+		const struct be_tx_stats *tx_stats = tx_stats(txo);
+
+		do {
+			start = u64_stats_fetch_begin_irq(&tx_stats->sync);
+			pkts = tx_stats(txo)->tx_pkts;
+			bytes = tx_stats(txo)->tx_bytes;
+		} while (u64_stats_fetch_retry_irq(&tx_stats->sync, start));
+		stats->tx_packets += pkts;
+		stats->tx_bytes += bytes;
+	}
+
+	/* bad pkts received */
+	stats->rx_errors = drvs->rx_crc_errors +
+		drvs->rx_alignment_symbol_errors +
+		drvs->rx_in_range_errors +
+		drvs->rx_out_range_errors +
+		drvs->rx_frame_too_long +
+		drvs->rx_dropped_too_small +
+		drvs->rx_dropped_too_short +
+		drvs->rx_dropped_header_too_small +
+		drvs->rx_dropped_tcp_length +
+		drvs->rx_dropped_runt;
+
+	/* detailed rx errors */
+	stats->rx_length_errors = drvs->rx_in_range_errors +
+		drvs->rx_out_range_errors +
+		drvs->rx_frame_too_long;
+
+	stats->rx_crc_errors = drvs->rx_crc_errors;
+
+	/* frame alignment errors */
+	stats->rx_frame_errors = drvs->rx_alignment_symbol_errors;
+
+	/* receiver fifo overrun */
+	/* drops_no_pbuf is no per i/f, it's per BE card */
+	stats->rx_fifo_errors = drvs->rxpp_fifo_overflow_drop +
+				drvs->rx_input_fifo_overflow_drop +
+				drvs->rx_drops_no_pbuf;
+	return stats;
+}
+
+void be_link_status_update(struct be_adapter *adapter, u8 link_status)
+{
+	struct net_device *netdev = adapter->netdev;
+
+	if (!(adapter->flags & BE_FLAGS_LINK_STATUS_INIT)) {
+		netif_carrier_off(netdev);
+		adapter->flags |= BE_FLAGS_LINK_STATUS_INIT;
+	}
+
+	if (link_status)
+		netif_carrier_on(netdev);
+	else
+		netif_carrier_off(netdev);
+}
+
+static void be_tx_stats_update(struct be_tx_obj *txo,
+			       u32 wrb_cnt, u32 copied, u32 gso_segs,
+			       bool stopped)
+{
+	struct be_tx_stats *stats = tx_stats(txo);
+
+	u64_stats_update_begin(&stats->sync);
+	stats->tx_reqs++;
+	stats->tx_wrbs += wrb_cnt;
+	stats->tx_bytes += copied;
+	stats->tx_pkts += (gso_segs ? gso_segs : 1);
+	if (stopped)
+		stats->tx_stops++;
+	u64_stats_update_end(&stats->sync);
+}
+
+/* Determine number of WRB entries needed to xmit data in an skb */
+static u32 wrb_cnt_for_skb(struct be_adapter *adapter, struct sk_buff *skb,
+			   bool *dummy)
+{
+	int cnt = (skb->len > skb->data_len);
+
+	cnt += skb_shinfo(skb)->nr_frags;
+
+	/* to account for hdr wrb */
+	cnt++;
+	if (lancer_chip(adapter) || !(cnt & 1)) {
+		*dummy = false;
+	} else {
+		/* add a dummy to make it an even num */
+		cnt++;
+		*dummy = true;
+	}
+	BUG_ON(cnt > BE_MAX_TX_FRAG_COUNT);
+	return cnt;
+}
+
+static inline void wrb_fill(struct be_eth_wrb *wrb, u64 addr, int len)
+{
+	wrb->frag_pa_hi = upper_32_bits(addr);
+	wrb->frag_pa_lo = addr & 0xFFFFFFFF;
+	wrb->frag_len = len & ETH_WRB_FRAG_LEN_MASK;
+	wrb->rsvd0 = 0;
+}
+
+static inline u16 be_get_tx_vlan_tag(struct be_adapter *adapter,
+				     struct sk_buff *skb)
+{
+	u8 vlan_prio;
+	u16 vlan_tag;
+
+	vlan_tag = vlan_tx_tag_get(skb);
+	vlan_prio = (vlan_tag & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT;
+	/* If vlan priority provided by OS is NOT in available bmap */
+	if (!(adapter->vlan_prio_bmap & (1 << vlan_prio)))
+		vlan_tag = (vlan_tag & ~VLAN_PRIO_MASK) |
+				adapter->recommended_prio;
+
+	return vlan_tag;
+}
+
+/* Used only for IP tunnel packets */
+static u16 skb_inner_ip_proto(struct sk_buff *skb)
+{
+	return (inner_ip_hdr(skb)->version == 4) ?
+		inner_ip_hdr(skb)->protocol : inner_ipv6_hdr(skb)->nexthdr;
+}
+
+static u16 skb_ip_proto(struct sk_buff *skb)
+{
+	return (ip_hdr(skb)->version == 4) ?
+		ip_hdr(skb)->protocol : ipv6_hdr(skb)->nexthdr;
+}
+
+static void wrb_fill_hdr(struct be_adapter *adapter, struct be_eth_hdr_wrb *hdr,
+			 struct sk_buff *skb, u32 wrb_cnt, u32 len,
+			 bool skip_hw_vlan)
+{
+	u16 vlan_tag, proto;
+
+	memset(hdr, 0, sizeof(*hdr));
+
+	SET_TX_WRB_HDR_BITS(crc, hdr, 1);
+
+	if (skb_is_gso(skb)) {
+		SET_TX_WRB_HDR_BITS(lso, hdr, 1);
+		SET_TX_WRB_HDR_BITS(lso_mss, hdr, skb_shinfo(skb)->gso_size);
+		if (skb_is_gso_v6(skb) && !lancer_chip(adapter))
+			SET_TX_WRB_HDR_BITS(lso6, hdr, 1);
+	} else if (skb->ip_summed == CHECKSUM_PARTIAL) {
+		if (skb->encapsulation) {
+			SET_TX_WRB_HDR_BITS(ipcs, hdr, 1);
+			proto = skb_inner_ip_proto(skb);
+		} else {
+			proto = skb_ip_proto(skb);
+		}
+		if (proto == IPPROTO_TCP)
+			SET_TX_WRB_HDR_BITS(tcpcs, hdr, 1);
+		else if (proto == IPPROTO_UDP)
+			SET_TX_WRB_HDR_BITS(udpcs, hdr, 1);
+	}
+
+	if (vlan_tx_tag_present(skb)) {
+		SET_TX_WRB_HDR_BITS(vlan, hdr, 1);
+		vlan_tag = be_get_tx_vlan_tag(adapter, skb);
+		SET_TX_WRB_HDR_BITS(vlan_tag, hdr, vlan_tag);
+	}
+
+	/* To skip HW VLAN tagging: evt = 1, compl = 0 */
+	SET_TX_WRB_HDR_BITS(complete, hdr, !skip_hw_vlan);
+	SET_TX_WRB_HDR_BITS(event, hdr, 1);
+	SET_TX_WRB_HDR_BITS(num_wrb, hdr, wrb_cnt);
+	SET_TX_WRB_HDR_BITS(len, hdr, len);
+}
+
+static void unmap_tx_frag(struct device *dev, struct be_eth_wrb *wrb,
+			  bool unmap_single)
+{
+	dma_addr_t dma;
+
+	be_dws_le_to_cpu(wrb, sizeof(*wrb));
+
+	dma = (u64)wrb->frag_pa_hi << 32 | (u64)wrb->frag_pa_lo;
+	if (wrb->frag_len) {
+		if (unmap_single)
+			dma_unmap_single(dev, dma, wrb->frag_len,
+					 DMA_TO_DEVICE);
+		else
+			dma_unmap_page(dev, dma, wrb->frag_len, DMA_TO_DEVICE);
+	}
+}
+
+static int make_tx_wrbs(struct be_adapter *adapter, struct be_queue_info *txq,
+			struct sk_buff *skb, u32 wrb_cnt, bool dummy_wrb,
+			bool skip_hw_vlan)
+{
+	dma_addr_t busaddr;
+	int i, copied = 0;
+	struct device *dev = &adapter->pdev->dev;
+	struct sk_buff *first_skb = skb;
+	struct be_eth_wrb *wrb;
+	struct be_eth_hdr_wrb *hdr;
+	bool map_single = false;
+	u16 map_head;
+
+	hdr = queue_head_node(txq);
+	queue_head_inc(txq);
+	map_head = txq->head;
+
+	if (skb->len > skb->data_len) {
+		int len = skb_headlen(skb);
+
+		busaddr = dma_map_single(dev, skb->data, len, DMA_TO_DEVICE);
+		if (dma_mapping_error(dev, busaddr))
+			goto dma_err;
+		map_single = true;
+		wrb = queue_head_node(txq);
+		wrb_fill(wrb, busaddr, len);
+		be_dws_cpu_to_le(wrb, sizeof(*wrb));
+		queue_head_inc(txq);
+		copied += len;
+	}
+
+	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+		const struct skb_frag_struct *frag = &skb_shinfo(skb)->frags[i];
+
+		busaddr = skb_frag_dma_map(dev, frag, 0,
+					   skb_frag_size(frag), DMA_TO_DEVICE);
+		if (dma_mapping_error(dev, busaddr))
+			goto dma_err;
+		wrb = queue_head_node(txq);
+		wrb_fill(wrb, busaddr, skb_frag_size(frag));
+		be_dws_cpu_to_le(wrb, sizeof(*wrb));
+		queue_head_inc(txq);
+		copied += skb_frag_size(frag);
+	}
+
+	if (dummy_wrb) {
+		wrb = queue_head_node(txq);
+		wrb_fill(wrb, 0, 0);
+		be_dws_cpu_to_le(wrb, sizeof(*wrb));
+		queue_head_inc(txq);
+	}
+
+	wrb_fill_hdr(adapter, hdr, first_skb, wrb_cnt, copied, skip_hw_vlan);
+	be_dws_cpu_to_le(hdr, sizeof(*hdr));
+
+	return copied;
+dma_err:
+	txq->head = map_head;
+	while (copied) {
+		wrb = queue_head_node(txq);
+		unmap_tx_frag(dev, wrb, map_single);
+		map_single = false;
+		copied -= wrb->frag_len;
+		adapter->drv_stats.dma_map_errors++;
+		queue_head_inc(txq);
+	}
+	return 0;
+}
+
+static struct sk_buff *be_insert_vlan_in_pkt(struct be_adapter *adapter,
+					     struct sk_buff *skb,
+					     bool *skip_hw_vlan)
+{
+	u16 vlan_tag = 0;
+
+	skb = skb_share_check(skb, GFP_ATOMIC);
+	if (unlikely(!skb))
+		return skb;
+
+	if (vlan_tx_tag_present(skb))
+		vlan_tag = be_get_tx_vlan_tag(adapter, skb);
+
+	if (qnq_async_evt_rcvd(adapter) && adapter->pvid) {
+		if (!vlan_tag)
+			vlan_tag = adapter->pvid;
+		/* f/w workaround to set skip_hw_vlan = 1, informs the F/W to
+		 * skip VLAN insertion
+		 */
+		if (skip_hw_vlan)
+			*skip_hw_vlan = true;
+	}
+
+	if (vlan_tag) {
+		skb = __vlan_put_tag(skb, htons(ETH_P_8021Q), vlan_tag);
+		if (unlikely(!skb))
+			return skb;
+		skb->vlan_tci = 0;
+	}
+
+	/* Insert the outer VLAN, if any */
+	if (adapter->qnq_vid) {
+		vlan_tag = adapter->qnq_vid;
+		skb = __vlan_put_tag(skb, htons(ETH_P_8021Q), vlan_tag);
+		if (unlikely(!skb))
+			return skb;
+		if (skip_hw_vlan)
+			*skip_hw_vlan = true;
+	}
+
+	return skb;
+}
+
+static bool be_ipv6_exthdr_check(struct sk_buff *skb)
+{
+	struct ethhdr *eh = (struct ethhdr *)skb->data;
+	u16 offset = ETH_HLEN;
+
+	if (eh->h_proto == htons(ETH_P_IPV6)) {
+		struct ipv6hdr *ip6h = (struct ipv6hdr *)(skb->data + offset);
+
+		offset += sizeof(struct ipv6hdr);
+		if (ip6h->nexthdr != NEXTHDR_TCP &&
+		    ip6h->nexthdr != NEXTHDR_UDP) {
+			struct ipv6_opt_hdr *ehdr =
+				(struct ipv6_opt_hdr *)(skb->data + offset);
+
+			/* offending pkt: 2nd byte following IPv6 hdr is 0xff */
+			if (ehdr->hdrlen == 0xff)
+				return true;
+		}
+	}
+	return false;
+}
+
+static int be_vlan_tag_tx_chk(struct be_adapter *adapter, struct sk_buff *skb)
+{
+	return vlan_tx_tag_present(skb) || adapter->pvid || adapter->qnq_vid;
+}
+
+static int be_ipv6_tx_stall_chk(struct be_adapter *adapter, struct sk_buff *skb)
+{
+	return BE3_chip(adapter) && be_ipv6_exthdr_check(skb);
+}
+
+static struct sk_buff *be_lancer_xmit_workarounds(struct be_adapter *adapter,
+						  struct sk_buff *skb,
+						  bool *skip_hw_vlan)
+{
+	struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
+	unsigned int eth_hdr_len;
+	struct iphdr *ip;
+
+	/* For padded packets, BE HW modifies tot_len field in IP header
+	 * incorrecly when VLAN tag is inserted by HW.
+	 * For padded packets, Lancer computes incorrect checksum.
+	 */
+	eth_hdr_len = ntohs(skb->protocol) == ETH_P_8021Q ?
+						VLAN_ETH_HLEN : ETH_HLEN;
+	if (skb->len <= 60 &&
+	    (lancer_chip(adapter) || vlan_tx_tag_present(skb)) &&
+	    is_ipv4_pkt(skb)) {
+		ip = (struct iphdr *)ip_hdr(skb);
+		pskb_trim(skb, eth_hdr_len + ntohs(ip->tot_len));
+	}
+
+	/* If vlan tag is already inlined in the packet, skip HW VLAN
+	 * tagging in pvid-tagging mode
+	 */
+	if (be_pvid_tagging_enabled(adapter) &&
+	    veh->h_vlan_proto == htons(ETH_P_8021Q))
+		*skip_hw_vlan = true;
+
+	/* HW has a bug wherein it will calculate CSUM for VLAN
+	 * pkts even though it is disabled.
+	 * Manually insert VLAN in pkt.
+	 */
+	if (skb->ip_summed != CHECKSUM_PARTIAL &&
+	    vlan_tx_tag_present(skb)) {
+		skb = be_insert_vlan_in_pkt(adapter, skb, skip_hw_vlan);
+		if (unlikely(!skb))
+			goto err;
+	}
+
+	/* HW may lockup when VLAN HW tagging is requested on
+	 * certain ipv6 packets. Drop such pkts if the HW workaround to
+	 * skip HW tagging is not enabled by FW.
+	 */
+	if (unlikely(be_ipv6_tx_stall_chk(adapter, skb) &&
+		     (adapter->pvid || adapter->qnq_vid) &&
+		     !qnq_async_evt_rcvd(adapter)))
+		goto tx_drop;
+
+	/* Manual VLAN tag insertion to prevent:
+	 * ASIC lockup when the ASIC inserts VLAN tag into
+	 * certain ipv6 packets. Insert VLAN tags in driver,
+	 * and set event, completion, vlan bits accordingly
+	 * in the Tx WRB.
+	 */
+	if (be_ipv6_tx_stall_chk(adapter, skb) &&
+	    be_vlan_tag_tx_chk(adapter, skb)) {
+		skb = be_insert_vlan_in_pkt(adapter, skb, skip_hw_vlan);
+		if (unlikely(!skb))
+			goto err;
+	}
+
+	return skb;
+tx_drop:
+	dev_kfree_skb_any(skb);
+err:
+	return NULL;
+}
+
+static struct sk_buff *be_xmit_workarounds(struct be_adapter *adapter,
+					   struct sk_buff *skb,
+					   bool *skip_hw_vlan)
+{
+	/* Lancer, SH-R ASICs have a bug wherein Packets that are 32 bytes or
+	 * less may cause a transmit stall on that port. So the work-around is
+	 * to pad short packets (<= 32 bytes) to a 36-byte length.
+	 */
+	if (unlikely(!BEx_chip(adapter) && skb->len <= 32)) {
+		if (skb_padto(skb, 36))
+			return NULL;
+		skb->len = 36;
+	}
+
+	if (BEx_chip(adapter) || lancer_chip(adapter)) {
+		skb = be_lancer_xmit_workarounds(adapter, skb, skip_hw_vlan);
+		if (!skb)
+			return NULL;
+	}
+
+	return skb;
+}
+
+static netdev_tx_t be_xmit(struct sk_buff *skb, struct net_device *netdev)
+{
+	struct be_adapter *adapter = netdev_priv(netdev);
+	struct be_tx_obj *txo = &adapter->tx_obj[skb_get_queue_mapping(skb)];
+	struct be_queue_info *txq = &txo->q;
+	bool dummy_wrb, stopped = false;
+	u32 wrb_cnt = 0, copied = 0;
+	bool skip_hw_vlan = false;
+	u32 start = txq->head;
+
+	skb = be_xmit_workarounds(adapter, skb, &skip_hw_vlan);
+	if (!skb) {
+		tx_stats(txo)->tx_drv_drops++;
+		return NETDEV_TX_OK;
+	}
+
+	wrb_cnt = wrb_cnt_for_skb(adapter, skb, &dummy_wrb);
+
+	copied = make_tx_wrbs(adapter, txq, skb, wrb_cnt, dummy_wrb,
+			      skip_hw_vlan);
+	if (copied) {
+		int gso_segs = skb_shinfo(skb)->gso_segs;
+
+		/* record the sent skb in the sent_skb table */
+		BUG_ON(txo->sent_skb_list[start]);
+		txo->sent_skb_list[start] = skb;
+
+		/* Ensure txq has space for the next skb; Else stop the queue
+		 * *BEFORE* ringing the tx doorbell, so that we serialze the
+		 * tx compls of the current transmit which'll wake up the queue
+		 */
+		atomic_add(wrb_cnt, &txq->used);
+		if ((BE_MAX_TX_FRAG_COUNT + atomic_read(&txq->used)) >=
+								txq->len) {
+			netif_stop_subqueue(netdev, skb_get_queue_mapping(skb));
+			stopped = true;
+		}
+
+		be_txq_notify(adapter, txo, wrb_cnt);
+
+		be_tx_stats_update(txo, wrb_cnt, copied, gso_segs, stopped);
+	} else {
+		txq->head = start;
+		tx_stats(txo)->tx_drv_drops++;
+		dev_kfree_skb_any(skb);
+	}
+	return NETDEV_TX_OK;
+}
+
+static int be_change_mtu(struct net_device *netdev, int new_mtu)
+{
+	struct be_adapter *adapter = netdev_priv(netdev);
+	struct device *dev = &adapter->pdev->dev;
+
+	if (new_mtu < BE_MIN_MTU || new_mtu > BE_MAX_MTU) {
+		dev_info(dev, "MTU must be between %d and %d bytes\n",
+			 BE_MIN_MTU, BE_MAX_MTU);
+		return -EINVAL;
+	}
+
+	dev_info(dev, "MTU changed from %d to %d bytes\n",
+		 netdev->mtu, new_mtu);
+	netdev->mtu = new_mtu;
+	return 0;
+}
+
+/*
+ * A max of 64 (BE_NUM_VLANS_SUPPORTED) vlans can be configured in BE.
+ * If the user configures more, place BE in vlan promiscuous mode.
+ */
+static int be_vid_config(struct be_adapter *adapter)
+{
+	struct device *dev = &adapter->pdev->dev;
+	u16 vids[BE_NUM_VLANS_SUPPORTED];
+	u16 num = 0, i = 0;
+	int status = 0;
+
+	/* No need to further configure vids if in promiscuous mode */
+	if (adapter->promiscuous)
+		return 0;
+
+	if (adapter->vlans_added > be_max_vlans(adapter))
+		goto set_vlan_promisc;
+
+	/* Construct VLAN Table to give to HW */
+	for_each_set_bit(i, adapter->vids, VLAN_N_VID)
+		vids[num++] = cpu_to_le16(i);
+
+	status = be_cmd_vlan_config(adapter, adapter->if_handle, vids, num);
+	if (status) {
+		/* Set to VLAN promisc mode as setting VLAN filter failed */
+		if (addl_status(status) ==
+				MCC_ADDL_STATUS_INSUFFICIENT_RESOURCES)
+			goto set_vlan_promisc;
+		dev_err(dev, "Setting HW VLAN filtering failed\n");
+	} else {
+		if (adapter->flags & BE_FLAGS_VLAN_PROMISC) {
+			/* hw VLAN filtering re-enabled. */
+			status = be_cmd_rx_filter(adapter,
+						  BE_FLAGS_VLAN_PROMISC, OFF);
+			if (!status) {
+				dev_info(dev,
+					 "Disabling VLAN Promiscuous mode\n");
+				adapter->flags &= ~BE_FLAGS_VLAN_PROMISC;
+			}
+		}
+	}
+
+	return status;
+
+set_vlan_promisc:
+	if (adapter->flags & BE_FLAGS_VLAN_PROMISC)
+		return 0;
+
+	status = be_cmd_rx_filter(adapter, BE_FLAGS_VLAN_PROMISC, ON);
+	if (!status) {
+		dev_info(dev, "Enable VLAN Promiscuous mode\n");
+		adapter->flags |= BE_FLAGS_VLAN_PROMISC;
+	} else
+		dev_err(dev, "Failed to enable VLAN Promiscuous mode\n");
+	return status;
+}
+
+static int be_vlan_add_vid(struct net_device *netdev, __be16 proto, u16 vid)
+{
+	struct be_adapter *adapter = netdev_priv(netdev);
+	int status = 0;
+
+	/* Packets with VID 0 are always received by Lancer by default */
+	if (lancer_chip(adapter) && vid == 0)
+		return status;
+
+	if (test_bit(vid, adapter->vids))
+		return status;
+
+	set_bit(vid, adapter->vids);
+	adapter->vlans_added++;
+
+	status = be_vid_config(adapter);
+	if (status) {
+		adapter->vlans_added--;
+		clear_bit(vid, adapter->vids);
+	}
+
+	return status;
+}
+
+static int be_vlan_rem_vid(struct net_device *netdev, __be16 proto, u16 vid)
+{
+	struct be_adapter *adapter = netdev_priv(netdev);
+
+	/* Packets with VID 0 are always received by Lancer by default */
+	if (lancer_chip(adapter) && vid == 0)
+		return 0;
+
+	clear_bit(vid, adapter->vids);
+	adapter->vlans_added--;
+
+	return be_vid_config(adapter);
+}
+
+static void be_clear_promisc(struct be_adapter *adapter)
+{
+	adapter->promiscuous = false;
+	adapter->flags &= ~(BE_FLAGS_VLAN_PROMISC | BE_FLAGS_MCAST_PROMISC);
+
+	be_cmd_rx_filter(adapter, IFF_PROMISC, OFF);
+}
+
+static void be_set_rx_mode(struct net_device *netdev)
+{
+	struct be_adapter *adapter = netdev_priv(netdev);
+	int status;
+
+	if (netdev->flags & IFF_PROMISC) {
+		be_cmd_rx_filter(adapter, IFF_PROMISC, ON);
+		adapter->promiscuous = true;
+		goto done;
+	}
+
+	/* BE was previously in promiscuous mode; disable it */
+	if (adapter->promiscuous) {
+		be_clear_promisc(adapter);
+		if (adapter->vlans_added)
+			be_vid_config(adapter);
+	}
+
+	/* Enable multicast promisc if num configured exceeds what we support */
+	if (netdev->flags & IFF_ALLMULTI ||
+	    netdev_mc_count(netdev) > be_max_mc(adapter))
+		goto set_mcast_promisc;
+
+	if (netdev_uc_count(netdev) != adapter->uc_macs) {
+		struct netdev_hw_addr *ha;
+		int i = 1; /* First slot is claimed by the Primary MAC */
+
+		for (; adapter->uc_macs > 0; adapter->uc_macs--, i++) {
+			be_cmd_pmac_del(adapter, adapter->if_handle,
+					adapter->pmac_id[i], 0);
+		}
+
+		if (netdev_uc_count(netdev) > be_max_uc(adapter)) {
+			be_cmd_rx_filter(adapter, IFF_PROMISC, ON);
+			adapter->promiscuous = true;
+			goto done;
+		}
+
+		netdev_for_each_uc_addr(ha, adapter->netdev) {
+			adapter->uc_macs++; /* First slot is for Primary MAC */
+			be_cmd_pmac_add(adapter, (u8 *)ha->addr,
+					adapter->if_handle,
+					&adapter->pmac_id[adapter->uc_macs], 0);
+		}
+	}
+
+	status = be_cmd_rx_filter(adapter, IFF_MULTICAST, ON);
+	if (!status) {
+		if (adapter->flags & BE_FLAGS_MCAST_PROMISC)
+			adapter->flags &= ~BE_FLAGS_MCAST_PROMISC;
+		goto done;
+	}
+
+set_mcast_promisc:
+	if (adapter->flags & BE_FLAGS_MCAST_PROMISC)
+		return;
+
+	/* Set to MCAST promisc mode if setting MULTICAST address fails
+	 * or if num configured exceeds what we support
+	 */
+	status = be_cmd_rx_filter(adapter, IFF_ALLMULTI, ON);
+	if (!status)
+		adapter->flags |= BE_FLAGS_MCAST_PROMISC;
+done:
+	return;
+}
+
+static int be_set_vf_mac(struct net_device *netdev, int vf, u8 *mac)
+{
+	struct be_adapter *adapter = netdev_priv(netdev);
+	struct be_vf_cfg *vf_cfg = &adapter->vf_cfg[vf];
+	int status;
+
+	if (!sriov_enabled(adapter))
+		return -EPERM;
+
+	if (!is_valid_ether_addr(mac) || vf >= adapter->num_vfs)
+		return -EINVAL;
+
+	/* Proceed further only if user provided MAC is different
+	 * from active MAC
+	 */
+	if (ether_addr_equal(mac, vf_cfg->mac_addr))
+		return 0;
+
+	if (BEx_chip(adapter)) {
+		be_cmd_pmac_del(adapter, vf_cfg->if_handle, vf_cfg->pmac_id,
+				vf + 1);
+
+		status = be_cmd_pmac_add(adapter, mac, vf_cfg->if_handle,
+					 &vf_cfg->pmac_id, vf + 1);
+	} else {
+		status = be_cmd_set_mac(adapter, mac, vf_cfg->if_handle,
+					vf + 1);
+	}
+
+	if (status) {
+		dev_err(&adapter->pdev->dev, "MAC %pM set on VF %d Failed: %#x",
+			mac, vf, status);
+		return be_cmd_status(status);
+	}
+
+	ether_addr_copy(vf_cfg->mac_addr, mac);
+
+	return 0;
+}
+
+static int be_get_vf_config(struct net_device *netdev, int vf,
+			    struct ifla_vf_info *vi)
+{
+	struct be_adapter *adapter = netdev_priv(netdev);
+	struct be_vf_cfg *vf_cfg = &adapter->vf_cfg[vf];
+
+	if (!sriov_enabled(adapter))
+		return -EPERM;
+
+	if (vf >= adapter->num_vfs)
+		return -EINVAL;
+
+	vi->vf = vf;
+	vi->max_tx_rate = vf_cfg->tx_rate;
+	vi->min_tx_rate = 0;
+	vi->vlan = vf_cfg->vlan_tag & VLAN_VID_MASK;
+	vi->qos = vf_cfg->vlan_tag >> VLAN_PRIO_SHIFT;
+	memcpy(&vi->mac, vf_cfg->mac_addr, ETH_ALEN);
+	vi->linkstate = adapter->vf_cfg[vf].plink_tracking;
+
+	return 0;
+}
+
+static int be_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan, u8 qos)
+{
+	struct be_adapter *adapter = netdev_priv(netdev);
+	struct be_vf_cfg *vf_cfg = &adapter->vf_cfg[vf];
+	int status = 0;
+
+	if (!sriov_enabled(adapter))
+		return -EPERM;
+
+	if (vf >= adapter->num_vfs || vlan > 4095 || qos > 7)
+		return -EINVAL;
+
+	if (vlan || qos) {
+		vlan |= qos << VLAN_PRIO_SHIFT;
+		if (vf_cfg->vlan_tag != vlan)
+			status = be_cmd_set_hsw_config(adapter, vlan, vf + 1,
+						       vf_cfg->if_handle, 0);
+	} else {
+		/* Reset Transparent Vlan Tagging. */
+		status = be_cmd_set_hsw_config(adapter, BE_RESET_VLAN_TAG_ID,
+					       vf + 1, vf_cfg->if_handle, 0);
+	}
+
+	if (status) {
+		dev_err(&adapter->pdev->dev,
+			"VLAN %d config on VF %d failed : %#x\n", vlan,
+			vf, status);
+		return be_cmd_status(status);
+	}
+
+	vf_cfg->vlan_tag = vlan;
+
+	return 0;
+}
+
+static int be_set_vf_tx_rate(struct net_device *netdev, int vf,
+			     int min_tx_rate, int max_tx_rate)
+{
+	struct be_adapter *adapter = netdev_priv(netdev);
+	struct device *dev = &adapter->pdev->dev;
+	int percent_rate, status = 0;
+	u16 link_speed = 0;
+	u8 link_status;
+
+	if (!sriov_enabled(adapter))
+		return -EPERM;
+
+	if (vf >= adapter->num_vfs)
+		return -EINVAL;
+
+	if (min_tx_rate)
+		return -EINVAL;
+
+	if (!max_tx_rate)
+		goto config_qos;
+
+	status = be_cmd_link_status_query(adapter, &link_speed,
+					  &link_status, 0);
+	if (status)
+		goto err;
+
+	if (!link_status) {
+		dev_err(dev, "TX-rate setting not allowed when link is down\n");
+		status = -ENETDOWN;
+		goto err;
+	}
+
+	if (max_tx_rate < 100 || max_tx_rate > link_speed) {
+		dev_err(dev, "TX-rate must be between 100 and %d Mbps\n",
+			link_speed);
+		status = -EINVAL;
+		goto err;
+	}
+
+	/* On Skyhawk the QOS setting must be done only as a % value */
+	percent_rate = link_speed / 100;
+	if (skyhawk_chip(adapter) && (max_tx_rate % percent_rate)) {
+		dev_err(dev, "TX-rate must be a multiple of %d Mbps\n",
+			percent_rate);
+		status = -EINVAL;
+		goto err;
+	}
+
+config_qos:
+	status = be_cmd_config_qos(adapter, max_tx_rate, link_speed, vf + 1);
+	if (status)
+		goto err;
+
+	adapter->vf_cfg[vf].tx_rate = max_tx_rate;
+	return 0;
+
+err:
+	dev_err(dev, "TX-rate setting of %dMbps on VF%d failed\n",
+		max_tx_rate, vf);
+	return be_cmd_status(status);
+}
+
+static int be_set_vf_link_state(struct net_device *netdev, int vf,
+				int link_state)
+{
+	struct be_adapter *adapter = netdev_priv(netdev);
+	int status;
+
+	if (!sriov_enabled(adapter))
+		return -EPERM;
+
+	if (vf >= adapter->num_vfs)
+		return -EINVAL;
+
+	status = be_cmd_set_logical_link_config(adapter, link_state, vf+1);
+	if (status) {
+		dev_err(&adapter->pdev->dev,
+			"Link state change on VF %d failed: %#x\n", vf, status);
+		return be_cmd_status(status);
+	}
+
+	adapter->vf_cfg[vf].plink_tracking = link_state;
+
+	return 0;
+}
+
+static void be_aic_update(struct be_aic_obj *aic, u64 rx_pkts, u64 tx_pkts,
+			  ulong now)
+{
+	aic->rx_pkts_prev = rx_pkts;
+	aic->tx_reqs_prev = tx_pkts;
+	aic->jiffies = now;
+}
+
+static void be_eqd_update(struct be_adapter *adapter)
+{
+	struct be_set_eqd set_eqd[MAX_EVT_QS];
+	int eqd, i, num = 0, start;
+	struct be_aic_obj *aic;
+	struct be_eq_obj *eqo;
+	struct be_rx_obj *rxo;
+	struct be_tx_obj *txo;
+	u64 rx_pkts, tx_pkts;
+	ulong now;
+	u32 pps, delta;
+
+	for_all_evt_queues(adapter, eqo, i) {
+		aic = &adapter->aic_obj[eqo->idx];
+		if (!aic->enable) {
+			if (aic->jiffies)
+				aic->jiffies = 0;
+			eqd = aic->et_eqd;
+			goto modify_eqd;
+		}
+
+		rxo = &adapter->rx_obj[eqo->idx];
+		do {
+			start = u64_stats_fetch_begin_irq(&rxo->stats.sync);
+			rx_pkts = rxo->stats.rx_pkts;
+		} while (u64_stats_fetch_retry_irq(&rxo->stats.sync, start));
+
+		txo = &adapter->tx_obj[eqo->idx];
+		do {
+			start = u64_stats_fetch_begin_irq(&txo->stats.sync);
+			tx_pkts = txo->stats.tx_reqs;
+		} while (u64_stats_fetch_retry_irq(&txo->stats.sync, start));
+
+		/* Skip, if wrapped around or first calculation */
+		now = jiffies;
+		if (!aic->jiffies || time_before(now, aic->jiffies) ||
+		    rx_pkts < aic->rx_pkts_prev ||
+		    tx_pkts < aic->tx_reqs_prev) {
+			be_aic_update(aic, rx_pkts, tx_pkts, now);
+			continue;
+		}
+
+		delta = jiffies_to_msecs(now - aic->jiffies);
+		pps = (((u32)(rx_pkts - aic->rx_pkts_prev) * 1000) / delta) +
+			(((u32)(tx_pkts - aic->tx_reqs_prev) * 1000) / delta);
+		eqd = (pps / 15000) << 2;
+
+		if (eqd < 8)
+			eqd = 0;
+		eqd = min_t(u32, eqd, aic->max_eqd);
+		eqd = max_t(u32, eqd, aic->min_eqd);
+
+		be_aic_update(aic, rx_pkts, tx_pkts, now);
+modify_eqd:
+		if (eqd != aic->prev_eqd) {
+			set_eqd[num].delay_multiplier = (eqd * 65)/100;
+			set_eqd[num].eq_id = eqo->q.id;
+			aic->prev_eqd = eqd;
+			num++;
+		}
+	}
+
+	if (num)
+		be_cmd_modify_eqd(adapter, set_eqd, num);
+}
+
+static void be_rx_stats_update(struct be_rx_obj *rxo,
+			       struct be_rx_compl_info *rxcp)
+{
+	struct be_rx_stats *stats = rx_stats(rxo);
+
+	u64_stats_update_begin(&stats->sync);
+	stats->rx_compl++;
+	stats->rx_bytes += rxcp->pkt_size;
+	stats->rx_pkts++;
+	if (rxcp->pkt_type == BE_MULTICAST_PACKET)
+		stats->rx_mcast_pkts++;
+	if (rxcp->err)
+		stats->rx_compl_err++;
+	u64_stats_update_end(&stats->sync);
+}
+
+static inline bool csum_passed(struct be_rx_compl_info *rxcp)
+{
+	/* L4 checksum is not reliable for non TCP/UDP packets.
+	 * Also ignore ipcksm for ipv6 pkts
+	 */
+	return (rxcp->tcpf || rxcp->udpf) && rxcp->l4_csum &&
+		(rxcp->ip_csum || rxcp->ipv6) && !rxcp->err;
+}
+
+static struct be_rx_page_info *get_rx_page_info(struct be_rx_obj *rxo)
+{
+	struct be_adapter *adapter = rxo->adapter;
+	struct be_rx_page_info *rx_page_info;
+	struct be_queue_info *rxq = &rxo->q;
+	u16 frag_idx = rxq->tail;
+
+	rx_page_info = &rxo->page_info_tbl[frag_idx];
+	BUG_ON(!rx_page_info->page);
+
+	if (rx_page_info->last_frag) {
+		dma_unmap_page(&adapter->pdev->dev,
+			       dma_unmap_addr(rx_page_info, bus),
+			       adapter->big_page_size, DMA_FROM_DEVICE);
+		rx_page_info->last_frag = false;
+	} else {
+		dma_sync_single_for_cpu(&adapter->pdev->dev,
+					dma_unmap_addr(rx_page_info, bus),
+					rx_frag_size, DMA_FROM_DEVICE);
+	}
+
+	queue_tail_inc(rxq);
+	atomic_dec(&rxq->used);
+	return rx_page_info;
+}
+
+/* Throwaway the data in the Rx completion */
+static void be_rx_compl_discard(struct be_rx_obj *rxo,
+				struct be_rx_compl_info *rxcp)
+{
+	struct be_rx_page_info *page_info;
+	u16 i, num_rcvd = rxcp->num_rcvd;
+
+	for (i = 0; i < num_rcvd; i++) {
+		page_info = get_rx_page_info(rxo);
+		put_page(page_info->page);
+		memset(page_info, 0, sizeof(*page_info));
+	}
+}
+
+/*
+ * skb_fill_rx_data forms a complete skb for an ether frame
+ * indicated by rxcp.
+ */
+static void skb_fill_rx_data(struct be_rx_obj *rxo, struct sk_buff *skb,
+			     struct be_rx_compl_info *rxcp)
+{
+	struct be_rx_page_info *page_info;
+	u16 i, j;
+	u16 hdr_len, curr_frag_len, remaining;
+	u8 *start;
+
+	page_info = get_rx_page_info(rxo);
+	start = page_address(page_info->page) + page_info->page_offset;
+	prefetch(start);
+
+	/* Copy data in the first descriptor of this completion */
+	curr_frag_len = min(rxcp->pkt_size, rx_frag_size);
+
+	skb->len = curr_frag_len;
+	if (curr_frag_len <= BE_HDR_LEN) { /* tiny packet */
+		memcpy(skb->data, start, curr_frag_len);
+		/* Complete packet has now been moved to data */
+		put_page(page_info->page);
+		skb->data_len = 0;
+		skb->tail += curr_frag_len;
+	} else {
+		hdr_len = ETH_HLEN;
+		memcpy(skb->data, start, hdr_len);
+		skb_shinfo(skb)->nr_frags = 1;
+		skb_frag_set_page(skb, 0, page_info->page);
+		skb_shinfo(skb)->frags[0].page_offset =
+					page_info->page_offset + hdr_len;
+		skb_frag_size_set(&skb_shinfo(skb)->frags[0],
+				  curr_frag_len - hdr_len);
+		skb->data_len = curr_frag_len - hdr_len;
+		skb->truesize += rx_frag_size;
+		skb->tail += hdr_len;
+	}
+	page_info->page = NULL;
+
+	if (rxcp->pkt_size <= rx_frag_size) {
+		BUG_ON(rxcp->num_rcvd != 1);
+		return;
+	}
+
+	/* More frags present for this completion */
+	remaining = rxcp->pkt_size - curr_frag_len;
+	for (i = 1, j = 0; i < rxcp->num_rcvd; i++) {
+		page_info = get_rx_page_info(rxo);
+		curr_frag_len = min(remaining, rx_frag_size);
+
+		/* Coalesce all frags from the same physical page in one slot */
+		if (page_info->page_offset == 0) {
+			/* Fresh page */
+			j++;
+			skb_frag_set_page(skb, j, page_info->page);
+			skb_shinfo(skb)->frags[j].page_offset =
+							page_info->page_offset;
+			skb_frag_size_set(&skb_shinfo(skb)->frags[j], 0);
+			skb_shinfo(skb)->nr_frags++;
+		} else {
+			put_page(page_info->page);
+		}
+
+		skb_frag_size_add(&skb_shinfo(skb)->frags[j], curr_frag_len);
+		skb->len += curr_frag_len;
+		skb->data_len += curr_frag_len;
+		skb->truesize += rx_frag_size;
+		remaining -= curr_frag_len;
+		page_info->page = NULL;
+	}
+	BUG_ON(j > MAX_SKB_FRAGS);
+}
+
+/* Process the RX completion indicated by rxcp when GRO is disabled */
+static void be_rx_compl_process(struct be_rx_obj *rxo, struct napi_struct *napi,
+				struct be_rx_compl_info *rxcp)
+{
+	struct be_adapter *adapter = rxo->adapter;
+	struct net_device *netdev = adapter->netdev;
+	struct sk_buff *skb;
+
+	skb = netdev_alloc_skb_ip_align(netdev, BE_RX_SKB_ALLOC_SIZE);
+	if (unlikely(!skb)) {
+		rx_stats(rxo)->rx_drops_no_skbs++;
+		be_rx_compl_discard(rxo, rxcp);
+		return;
+	}
+
+	skb_fill_rx_data(rxo, skb, rxcp);
+
+	if (likely((netdev->features & NETIF_F_RXCSUM) && csum_passed(rxcp)))
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+	else
+		skb_checksum_none_assert(skb);
+
+	skb->protocol = eth_type_trans(skb, netdev);
+	skb_record_rx_queue(skb, rxo - &adapter->rx_obj[0]);
+	if (netdev->features & NETIF_F_RXHASH)
+		skb_set_hash(skb, rxcp->rss_hash, PKT_HASH_TYPE_L3);
+
+	skb->csum_level = rxcp->tunneled;
+	skb_mark_napi_id(skb, napi);
+
+	if (rxcp->vlanf)
+		__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), rxcp->vlan_tag);
+
+	netif_receive_skb(skb);
+}
+
+/* Process the RX completion indicated by rxcp when GRO is enabled */
+static void be_rx_compl_process_gro(struct be_rx_obj *rxo,
+				    struct napi_struct *napi,
+				    struct be_rx_compl_info *rxcp)
+{
+	struct be_adapter *adapter = rxo->adapter;
+	struct be_rx_page_info *page_info;
+	struct sk_buff *skb = NULL;
+	u16 remaining, curr_frag_len;
+	u16 i, j;
+
+	skb = napi_get_frags(napi);
+	if (!skb) {
+		be_rx_compl_discard(rxo, rxcp);
+		return;
+	}
+
+	remaining = rxcp->pkt_size;
+	for (i = 0, j = -1; i < rxcp->num_rcvd; i++) {
+		page_info = get_rx_page_info(rxo);
+
+		curr_frag_len = min(remaining, rx_frag_size);
+
+		/* Coalesce all frags from the same physical page in one slot */
+		if (i == 0 || page_info->page_offset == 0) {
+			/* First frag or Fresh page */
+			j++;
+			skb_frag_set_page(skb, j, page_info->page);
+			skb_shinfo(skb)->frags[j].page_offset =
+							page_info->page_offset;
+			skb_frag_size_set(&skb_shinfo(skb)->frags[j], 0);
+		} else {
+			put_page(page_info->page);
+		}
+		skb_frag_size_add(&skb_shinfo(skb)->frags[j], curr_frag_len);
+		skb->truesize += rx_frag_size;
+		remaining -= curr_frag_len;
+		memset(page_info, 0, sizeof(*page_info));
+	}
+	BUG_ON(j > MAX_SKB_FRAGS);
+
+	skb_shinfo(skb)->nr_frags = j + 1;
+	skb->len = rxcp->pkt_size;
+	skb->data_len = rxcp->pkt_size;
+	skb->ip_summed = CHECKSUM_UNNECESSARY;
+	skb_record_rx_queue(skb, rxo - &adapter->rx_obj[0]);
+	if (adapter->netdev->features & NETIF_F_RXHASH)
+		skb_set_hash(skb, rxcp->rss_hash, PKT_HASH_TYPE_L3);
+
+	skb->csum_level = rxcp->tunneled;
+	skb_mark_napi_id(skb, napi);
+
+	if (rxcp->vlanf)
+		__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), rxcp->vlan_tag);
+
+	napi_gro_frags(napi);
+}
+
+static void be_parse_rx_compl_v1(struct be_eth_rx_compl *compl,
+				 struct be_rx_compl_info *rxcp)
+{
+	rxcp->pkt_size = GET_RX_COMPL_V1_BITS(pktsize, compl);
+	rxcp->vlanf = GET_RX_COMPL_V1_BITS(vtp, compl);
+	rxcp->err = GET_RX_COMPL_V1_BITS(err, compl);
+	rxcp->tcpf = GET_RX_COMPL_V1_BITS(tcpf, compl);
+	rxcp->udpf = GET_RX_COMPL_V1_BITS(udpf, compl);
+	rxcp->ip_csum = GET_RX_COMPL_V1_BITS(ipcksm, compl);
+	rxcp->l4_csum = GET_RX_COMPL_V1_BITS(l4_cksm, compl);
+	rxcp->ipv6 = GET_RX_COMPL_V1_BITS(ip_version, compl);
+	rxcp->num_rcvd = GET_RX_COMPL_V1_BITS(numfrags, compl);
+	rxcp->pkt_type = GET_RX_COMPL_V1_BITS(cast_enc, compl);
+	rxcp->rss_hash = GET_RX_COMPL_V1_BITS(rsshash, compl);
+	if (rxcp->vlanf) {
+		rxcp->qnq = GET_RX_COMPL_V1_BITS(qnq, compl);
+		rxcp->vlan_tag = GET_RX_COMPL_V1_BITS(vlan_tag, compl);
+	}
+	rxcp->port = GET_RX_COMPL_V1_BITS(port, compl);
+	rxcp->tunneled =
+		GET_RX_COMPL_V1_BITS(tunneled, compl);
+}
+
+static void be_parse_rx_compl_v0(struct be_eth_rx_compl *compl,
+				 struct be_rx_compl_info *rxcp)
+{
+	rxcp->pkt_size = GET_RX_COMPL_V0_BITS(pktsize, compl);
+	rxcp->vlanf = GET_RX_COMPL_V0_BITS(vtp, compl);
+	rxcp->err = GET_RX_COMPL_V0_BITS(err, compl);
+	rxcp->tcpf = GET_RX_COMPL_V0_BITS(tcpf, compl);
+	rxcp->udpf = GET_RX_COMPL_V0_BITS(udpf, compl);
+	rxcp->ip_csum = GET_RX_COMPL_V0_BITS(ipcksm, compl);
+	rxcp->l4_csum = GET_RX_COMPL_V0_BITS(l4_cksm, compl);
+	rxcp->ipv6 = GET_RX_COMPL_V0_BITS(ip_version, compl);
+	rxcp->num_rcvd = GET_RX_COMPL_V0_BITS(numfrags, compl);
+	rxcp->pkt_type = GET_RX_COMPL_V0_BITS(cast_enc, compl);
+	rxcp->rss_hash = GET_RX_COMPL_V0_BITS(rsshash, compl);
+	if (rxcp->vlanf) {
+		rxcp->qnq = GET_RX_COMPL_V0_BITS(qnq, compl);
+		rxcp->vlan_tag = GET_RX_COMPL_V0_BITS(vlan_tag, compl);
+	}
+	rxcp->port = GET_RX_COMPL_V0_BITS(port, compl);
+	rxcp->ip_frag = GET_RX_COMPL_V0_BITS(ip_frag, compl);
+}
+
+static struct be_rx_compl_info *be_rx_compl_get(struct be_rx_obj *rxo)
+{
+	struct be_eth_rx_compl *compl = queue_tail_node(&rxo->cq);
+	struct be_rx_compl_info *rxcp = &rxo->rxcp;
+	struct be_adapter *adapter = rxo->adapter;
+
+	/* For checking the valid bit it is Ok to use either definition as the
+	 * valid bit is at the same position in both v0 and v1 Rx compl */
+	if (compl->dw[offsetof(struct amap_eth_rx_compl_v1, valid) / 32] == 0)
+		return NULL;
+
+	rmb();
+	be_dws_le_to_cpu(compl, sizeof(*compl));
+
+	if (adapter->be3_native)
+		be_parse_rx_compl_v1(compl, rxcp);
+	else
+		be_parse_rx_compl_v0(compl, rxcp);
+
+	if (rxcp->ip_frag)
+		rxcp->l4_csum = 0;
+
+	if (rxcp->vlanf) {
+		/* In QNQ modes, if qnq bit is not set, then the packet was
+		 * tagged only with the transparent outer vlan-tag and must
+		 * not be treated as a vlan packet by host
+		 */
+		if (be_is_qnq_mode(adapter) && !rxcp->qnq)
+			rxcp->vlanf = 0;
+
+		if (!lancer_chip(adapter))
+			rxcp->vlan_tag = swab16(rxcp->vlan_tag);
+
+		if (adapter->pvid == (rxcp->vlan_tag & VLAN_VID_MASK) &&
+		    !test_bit(rxcp->vlan_tag, adapter->vids))
+			rxcp->vlanf = 0;
+	}
+
+	/* As the compl has been parsed, reset it; we wont touch it again */
+	compl->dw[offsetof(struct amap_eth_rx_compl_v1, valid) / 32] = 0;
+
+	queue_tail_inc(&rxo->cq);
+	return rxcp;
+}
+
+static inline struct page *be_alloc_pages(u32 size, gfp_t gfp)
+{
+	u32 order = get_order(size);
+
+	if (order > 0)
+		gfp |= __GFP_COMP;
+	return  alloc_pages(gfp, order);
+}
+
+/*
+ * Allocate a page, split it to fragments of size rx_frag_size and post as
+ * receive buffers to BE
+ */
+static void be_post_rx_frags(struct be_rx_obj *rxo, gfp_t gfp, u32 frags_needed)
+{
+	struct be_adapter *adapter = rxo->adapter;
+	struct be_rx_page_info *page_info = NULL, *prev_page_info = NULL;
+	struct be_queue_info *rxq = &rxo->q;
+	struct page *pagep = NULL;
+	struct device *dev = &adapter->pdev->dev;
+	struct be_eth_rx_d *rxd;
+	u64 page_dmaaddr = 0, frag_dmaaddr;
+	u32 posted, page_offset = 0, notify = 0;
+
+	page_info = &rxo->page_info_tbl[rxq->head];
+	for (posted = 0; posted < frags_needed && !page_info->page; posted++) {
+		if (!pagep) {
+			pagep = be_alloc_pages(adapter->big_page_size, gfp);
+			if (unlikely(!pagep)) {
+				rx_stats(rxo)->rx_post_fail++;
+				break;
+			}
+			page_dmaaddr = dma_map_page(dev, pagep, 0,
+						    adapter->big_page_size,
+						    DMA_FROM_DEVICE);
+			if (dma_mapping_error(dev, page_dmaaddr)) {
+				put_page(pagep);
+				pagep = NULL;
+				adapter->drv_stats.dma_map_errors++;
+				break;
+			}
+			page_offset = 0;
+		} else {
+			get_page(pagep);
+			page_offset += rx_frag_size;
+		}
+		page_info->page_offset = page_offset;
+		page_info->page = pagep;
+
+		rxd = queue_head_node(rxq);
+		frag_dmaaddr = page_dmaaddr + page_info->page_offset;
+		rxd->fragpa_lo = cpu_to_le32(frag_dmaaddr & 0xFFFFFFFF);
+		rxd->fragpa_hi = cpu_to_le32(upper_32_bits(frag_dmaaddr));
+
+		/* Any space left in the current big page for another frag? */
+		if ((page_offset + rx_frag_size + rx_frag_size) >
+					adapter->big_page_size) {
+			pagep = NULL;
+			page_info->last_frag = true;
+			dma_unmap_addr_set(page_info, bus, page_dmaaddr);
+		} else {
+			dma_unmap_addr_set(page_info, bus, frag_dmaaddr);
+		}
+
+		prev_page_info = page_info;
+		queue_head_inc(rxq);
+		page_info = &rxo->page_info_tbl[rxq->head];
+	}
+
+	/* Mark the last frag of a page when we break out of the above loop
+	 * with no more slots available in the RXQ
+	 */
+	if (pagep) {
+		prev_page_info->last_frag = true;
+		dma_unmap_addr_set(prev_page_info, bus, page_dmaaddr);
+	}
+
+	if (posted) {
+		atomic_add(posted, &rxq->used);
+		if (rxo->rx_post_starved)
+			rxo->rx_post_starved = false;
+		do {
+			notify = min(256u, posted);
+			be_rxq_notify(adapter, rxq->id, notify);
+			posted -= notify;
+		} while (posted);
+	} else if (atomic_read(&rxq->used) == 0) {
+		/* Let be_worker replenish when memory is available */
+		rxo->rx_post_starved = true;
+	}
+}
+
+static struct be_eth_tx_compl *be_tx_compl_get(struct be_queue_info *tx_cq)
+{
+	struct be_eth_tx_compl *txcp = queue_tail_node(tx_cq);
+
+	if (txcp->dw[offsetof(struct amap_eth_tx_compl, valid) / 32] == 0)
+		return NULL;
+
+	rmb();
+	be_dws_le_to_cpu(txcp, sizeof(*txcp));
+
+	txcp->dw[offsetof(struct amap_eth_tx_compl, valid) / 32] = 0;
+
+	queue_tail_inc(tx_cq);
+	return txcp;
+}
+
+static u16 be_tx_compl_process(struct be_adapter *adapter,
+			       struct be_tx_obj *txo, u16 last_index)
+{
+	struct be_queue_info *txq = &txo->q;
+	struct be_eth_wrb *wrb;
+	struct sk_buff **sent_skbs = txo->sent_skb_list;
+	struct sk_buff *sent_skb;
+	u16 cur_index, num_wrbs = 1; /* account for hdr wrb */
+	bool unmap_skb_hdr = true;
+
+	sent_skb = sent_skbs[txq->tail];
+	BUG_ON(!sent_skb);
+	sent_skbs[txq->tail] = NULL;
+
+	/* skip header wrb */
+	queue_tail_inc(txq);
+
+	do {
+		cur_index = txq->tail;
+		wrb = queue_tail_node(txq);
+		unmap_tx_frag(&adapter->pdev->dev, wrb,
+			      (unmap_skb_hdr && skb_headlen(sent_skb)));
+		unmap_skb_hdr = false;
+
+		num_wrbs++;
+		queue_tail_inc(txq);
+	} while (cur_index != last_index);
+
+	dev_consume_skb_any(sent_skb);
+	return num_wrbs;
+}
+
+/* Return the number of events in the event queue */
+static inline int events_get(struct be_eq_obj *eqo)
+{
+	struct be_eq_entry *eqe;
+	int num = 0;
+
+	do {
+		eqe = queue_tail_node(&eqo->q);
+		if (eqe->evt == 0)
+			break;
+
+		rmb();
+		eqe->evt = 0;
+		num++;
+		queue_tail_inc(&eqo->q);
+	} while (true);
+
+	return num;
+}
+
+/* Leaves the EQ is disarmed state */
+static void be_eq_clean(struct be_eq_obj *eqo)
+{
+	int num = events_get(eqo);
+
+	be_eq_notify(eqo->adapter, eqo->q.id, false, true, num);
+}
+
+static void be_rx_cq_clean(struct be_rx_obj *rxo)
+{
+	struct be_rx_page_info *page_info;
+	struct be_queue_info *rxq = &rxo->q;
+	struct be_queue_info *rx_cq = &rxo->cq;
+	struct be_rx_compl_info *rxcp;
+	struct be_adapter *adapter = rxo->adapter;
+	int flush_wait = 0;
+
+	/* Consume pending rx completions.
+	 * Wait for the flush completion (identified by zero num_rcvd)
+	 * to arrive. Notify CQ even when there are no more CQ entries
+	 * for HW to flush partially coalesced CQ entries.
+	 * In Lancer, there is no need to wait for flush compl.
+	 */
+	for (;;) {
+		rxcp = be_rx_compl_get(rxo);
+		if (!rxcp) {
+			if (lancer_chip(adapter))
+				break;
+
+			if (flush_wait++ > 10 || be_hw_error(adapter)) {
+				dev_warn(&adapter->pdev->dev,
+					 "did not receive flush compl\n");
+				break;
+			}
+			be_cq_notify(adapter, rx_cq->id, true, 0);
+			mdelay(1);
+		} else {
+			be_rx_compl_discard(rxo, rxcp);
+			be_cq_notify(adapter, rx_cq->id, false, 1);
+			if (rxcp->num_rcvd == 0)
+				break;
+		}
+	}
+
+	/* After cleanup, leave the CQ in unarmed state */
+	be_cq_notify(adapter, rx_cq->id, false, 0);
+
+	/* Then free posted rx buffers that were not used */
+	while (atomic_read(&rxq->used) > 0) {
+		page_info = get_rx_page_info(rxo);
+		put_page(page_info->page);
+		memset(page_info, 0, sizeof(*page_info));
+	}
+	BUG_ON(atomic_read(&rxq->used));
+	rxq->tail = 0;
+	rxq->head = 0;
+}
+
+static void be_tx_compl_clean(struct be_adapter *adapter)
+{
+	struct be_tx_obj *txo;
+	struct be_queue_info *txq;
+	struct be_eth_tx_compl *txcp;
+	u16 end_idx, cmpl = 0, timeo = 0, num_wrbs = 0;
+	struct sk_buff *sent_skb;
+	bool dummy_wrb;
+	int i, pending_txqs;
+
+	/* Stop polling for compls when HW has been silent for 10ms */
+	do {
+		pending_txqs = adapter->num_tx_qs;
+
+		for_all_tx_queues(adapter, txo, i) {
+			cmpl = 0;
+			num_wrbs = 0;
+			txq = &txo->q;
+			while ((txcp = be_tx_compl_get(&txo->cq))) {
+				end_idx = GET_TX_COMPL_BITS(wrb_index, txcp);
+				num_wrbs += be_tx_compl_process(adapter, txo,
+								end_idx);
+				cmpl++;
+			}
+			if (cmpl) {
+				be_cq_notify(adapter, txo->cq.id, false, cmpl);
+				atomic_sub(num_wrbs, &txq->used);
+				timeo = 0;
+			}
+			if (atomic_read(&txq->used) == 0)
+				pending_txqs--;
+		}
+
+		if (pending_txqs == 0 || ++timeo > 10 || be_hw_error(adapter))
+			break;
+
+		mdelay(1);
+	} while (true);
+
+	for_all_tx_queues(adapter, txo, i) {
+		txq = &txo->q;
+		if (atomic_read(&txq->used))
+			dev_err(&adapter->pdev->dev, "%d pending tx-compls\n",
+				atomic_read(&txq->used));
+
+		/* free posted tx for which compls will never arrive */
+		while (atomic_read(&txq->used)) {
+			sent_skb = txo->sent_skb_list[txq->tail];
+			end_idx = txq->tail;
+			num_wrbs = wrb_cnt_for_skb(adapter, sent_skb,
+						   &dummy_wrb);
+			index_adv(&end_idx, num_wrbs - 1, txq->len);
+			num_wrbs = be_tx_compl_process(adapter, txo, end_idx);
+			atomic_sub(num_wrbs, &txq->used);
+		}
+	}
+}
+
+static void be_evt_queues_destroy(struct be_adapter *adapter)
+{
+	struct be_eq_obj *eqo;
+	int i;
+
+	for_all_evt_queues(adapter, eqo, i) {
+		if (eqo->q.created) {
+			be_eq_clean(eqo);
+			be_cmd_q_destroy(adapter, &eqo->q, QTYPE_EQ);
+			napi_hash_del(&eqo->napi);
+			netif_napi_del(&eqo->napi);
+		}
+		be_queue_free(adapter, &eqo->q);
+	}
+}
+
+static int be_evt_queues_create(struct be_adapter *adapter)
+{
+	struct be_queue_info *eq;
+	struct be_eq_obj *eqo;
+	struct be_aic_obj *aic;
+	int i, rc;
+
+	adapter->num_evt_qs = min_t(u16, num_irqs(adapter),
+				    adapter->cfg_num_qs);
+
+	for_all_evt_queues(adapter, eqo, i) {
+		netif_napi_add(adapter->netdev, &eqo->napi, be_poll,
+			       BE_NAPI_WEIGHT);
+		napi_hash_add(&eqo->napi);
+		aic = &adapter->aic_obj[i];
+		eqo->adapter = adapter;
+		eqo->idx = i;
+		aic->max_eqd = BE_MAX_EQD;
+		aic->enable = true;
+
+		eq = &eqo->q;
+		rc = be_queue_alloc(adapter, eq, EVNT_Q_LEN,
+				    sizeof(struct be_eq_entry));
+		if (rc)
+			return rc;
+
+		rc = be_cmd_eq_create(adapter, eqo);
+		if (rc)
+			return rc;
+	}
+	return 0;
+}
+
+static void be_mcc_queues_destroy(struct be_adapter *adapter)
+{
+	struct be_queue_info *q;
+
+	q = &adapter->mcc_obj.q;
+	if (q->created)
+		be_cmd_q_destroy(adapter, q, QTYPE_MCCQ);
+	be_queue_free(adapter, q);
+
+	q = &adapter->mcc_obj.cq;
+	if (q->created)
+		be_cmd_q_destroy(adapter, q, QTYPE_CQ);
+	be_queue_free(adapter, q);
+}
+
+/* Must be called only after TX qs are created as MCC shares TX EQ */
+static int be_mcc_queues_create(struct be_adapter *adapter)
+{
+	struct be_queue_info *q, *cq;
+
+	cq = &adapter->mcc_obj.cq;
+	if (be_queue_alloc(adapter, cq, MCC_CQ_LEN,
+			   sizeof(struct be_mcc_compl)))
+		goto err;
+
+	/* Use the default EQ for MCC completions */
+	if (be_cmd_cq_create(adapter, cq, &mcc_eqo(adapter)->q, true, 0))
+		goto mcc_cq_free;
+
+	q = &adapter->mcc_obj.q;
+	if (be_queue_alloc(adapter, q, MCC_Q_LEN, sizeof(struct be_mcc_wrb)))
+		goto mcc_cq_destroy;
+
+	if (be_cmd_mccq_create(adapter, q, cq))
+		goto mcc_q_free;
+
+	return 0;
+
+mcc_q_free:
+	be_queue_free(adapter, q);
+mcc_cq_destroy:
+	be_cmd_q_destroy(adapter, cq, QTYPE_CQ);
+mcc_cq_free:
+	be_queue_free(adapter, cq);
+err:
+	return -1;
+}
+
+static void be_tx_queues_destroy(struct be_adapter *adapter)
+{
+	struct be_queue_info *q;
+	struct be_tx_obj *txo;
+	u8 i;
+
+	for_all_tx_queues(adapter, txo, i) {
+		q = &txo->q;
+		if (q->created)
+			be_cmd_q_destroy(adapter, q, QTYPE_TXQ);
+		be_queue_free(adapter, q);
+
+		q = &txo->cq;
+		if (q->created)
+			be_cmd_q_destroy(adapter, q, QTYPE_CQ);
+		be_queue_free(adapter, q);
+	}
+}
+
+static int be_tx_qs_create(struct be_adapter *adapter)
+{
+	struct be_queue_info *cq, *eq;
+	struct be_tx_obj *txo;
+	int status, i;
+
+	adapter->num_tx_qs = min(adapter->num_evt_qs, be_max_txqs(adapter));
+
+	for_all_tx_queues(adapter, txo, i) {
+		cq = &txo->cq;
+		status = be_queue_alloc(adapter, cq, TX_CQ_LEN,
+					sizeof(struct be_eth_tx_compl));
+		if (status)
+			return status;
+
+		u64_stats_init(&txo->stats.sync);
+		u64_stats_init(&txo->stats.sync_compl);
+
+		/* If num_evt_qs is less than num_tx_qs, then more than
+		 * one txq share an eq
+		 */
+		eq = &adapter->eq_obj[i % adapter->num_evt_qs].q;
+		status = be_cmd_cq_create(adapter, cq, eq, false, 3);
+		if (status)
+			return status;
+
+		status = be_queue_alloc(adapter, &txo->q, TX_Q_LEN,
+					sizeof(struct be_eth_wrb));
+		if (status)
+			return status;
+
+		status = be_cmd_txq_create(adapter, txo);
+		if (status)
+			return status;
+	}
+
+	dev_info(&adapter->pdev->dev, "created %d TX queue(s)\n",
+		 adapter->num_tx_qs);
+	return 0;
+}
+
+static void be_rx_cqs_destroy(struct be_adapter *adapter)
+{
+	struct be_queue_info *q;
+	struct be_rx_obj *rxo;
+	int i;
+
+	for_all_rx_queues(adapter, rxo, i) {
+		q = &rxo->cq;
+		if (q->created)
+			be_cmd_q_destroy(adapter, q, QTYPE_CQ);
+		be_queue_free(adapter, q);
+	}
+}
+
+static int be_rx_cqs_create(struct be_adapter *adapter)
+{
+	struct be_queue_info *eq, *cq;
+	struct be_rx_obj *rxo;
+	int rc, i;
+
+	/* We can create as many RSS rings as there are EQs. */
+	adapter->num_rx_qs = adapter->num_evt_qs;
+
+	/* We'll use RSS only if atleast 2 RSS rings are supported.
+	 * When RSS is used, we'll need a default RXQ for non-IP traffic.
+	 */
+	if (adapter->num_rx_qs > 1)
+		adapter->num_rx_qs++;
+
+	adapter->big_page_size = (1 << get_order(rx_frag_size)) * PAGE_SIZE;
+	for_all_rx_queues(adapter, rxo, i) {
+		rxo->adapter = adapter;
+		cq = &rxo->cq;
+		rc = be_queue_alloc(adapter, cq, RX_CQ_LEN,
+				    sizeof(struct be_eth_rx_compl));
+		if (rc)
+			return rc;
+
+		u64_stats_init(&rxo->stats.sync);
+		eq = &adapter->eq_obj[i % adapter->num_evt_qs].q;
+		rc = be_cmd_cq_create(adapter, cq, eq, false, 3);
+		if (rc)
+			return rc;
+	}
+
+	dev_info(&adapter->pdev->dev,
+		 "created %d RSS queue(s) and 1 default RX queue\n",
+		 adapter->num_rx_qs - 1);
+	return 0;
+}
+
+static irqreturn_t be_intx(int irq, void *dev)
+{
+	struct be_eq_obj *eqo = dev;
+	struct be_adapter *adapter = eqo->adapter;
+	int num_evts = 0;
+
+	/* IRQ is not expected when NAPI is scheduled as the EQ
+	 * will not be armed.
+	 * But, this can happen on Lancer INTx where it takes
+	 * a while to de-assert INTx or in BE2 where occasionaly
+	 * an interrupt may be raised even when EQ is unarmed.
+	 * If NAPI is already scheduled, then counting & notifying
+	 * events will orphan them.
+	 */
+	if (napi_schedule_prep(&eqo->napi)) {
+		num_evts = events_get(eqo);
+		__napi_schedule(&eqo->napi);
+		if (num_evts)
+			eqo->spurious_intr = 0;
+	}
+	be_eq_notify(adapter, eqo->q.id, false, true, num_evts);
+
+	/* Return IRQ_HANDLED only for the the first spurious intr
+	 * after a valid intr to stop the kernel from branding
+	 * this irq as a bad one!
+	 */
+	if (num_evts || eqo->spurious_intr++ == 0)
+		return IRQ_HANDLED;
+	else
+		return IRQ_NONE;
+}
+
+static irqreturn_t be_msix(int irq, void *dev)
+{
+	struct be_eq_obj *eqo = dev;
+
+	be_eq_notify(eqo->adapter, eqo->q.id, false, true, 0);
+	napi_schedule(&eqo->napi);
+	return IRQ_HANDLED;
+}
+
+static inline bool do_gro(struct be_rx_compl_info *rxcp)
+{
+	return (rxcp->tcpf && !rxcp->err && rxcp->l4_csum) ? true : false;
+}
+
+static int be_process_rx(struct be_rx_obj *rxo, struct napi_struct *napi,
+			 int budget, int polling)
+{
+	struct be_adapter *adapter = rxo->adapter;
+	struct be_queue_info *rx_cq = &rxo->cq;
+	struct be_rx_compl_info *rxcp;
+	u32 work_done;
+	u32 frags_consumed = 0;
+
+	for (work_done = 0; work_done < budget; work_done++) {
+		rxcp = be_rx_compl_get(rxo);
+		if (!rxcp)
+			break;
+
+		/* Is it a flush compl that has no data */
+		if (unlikely(rxcp->num_rcvd == 0))
+			goto loop_continue;
+
+		/* Discard compl with partial DMA Lancer B0 */
+		if (unlikely(!rxcp->pkt_size)) {
+			be_rx_compl_discard(rxo, rxcp);
+			goto loop_continue;
+		}
+
+		/* On BE drop pkts that arrive due to imperfect filtering in
+		 * promiscuous mode on some skews
+		 */
+		if (unlikely(rxcp->port != adapter->port_num &&
+			     !lancer_chip(adapter))) {
+			be_rx_compl_discard(rxo, rxcp);
+			goto loop_continue;
+		}
+
+		/* Don't do gro when we're busy_polling */
+		if (do_gro(rxcp) && polling != BUSY_POLLING)
+			be_rx_compl_process_gro(rxo, napi, rxcp);
+		else
+			be_rx_compl_process(rxo, napi, rxcp);
+
+loop_continue:
+		frags_consumed += rxcp->num_rcvd;
+		be_rx_stats_update(rxo, rxcp);
+	}
+
+	if (work_done) {
+		be_cq_notify(adapter, rx_cq->id, true, work_done);
+
+		/* When an rx-obj gets into post_starved state, just
+		 * let be_worker do the posting.
+		 */
+		if (atomic_read(&rxo->q.used) < RX_FRAGS_REFILL_WM &&
+		    !rxo->rx_post_starved)
+			be_post_rx_frags(rxo, GFP_ATOMIC,
+					 max_t(u32, MAX_RX_POST,
+					       frags_consumed));
+	}
+
+	return work_done;
+}
+
+static inline void be_update_tx_err(struct be_tx_obj *txo, u32 status)
+{
+	switch (status) {
+	case BE_TX_COMP_HDR_PARSE_ERR:
+		tx_stats(txo)->tx_hdr_parse_err++;
+		break;
+	case BE_TX_COMP_NDMA_ERR:
+		tx_stats(txo)->tx_dma_err++;
+		break;
+	case BE_TX_COMP_ACL_ERR:
+		tx_stats(txo)->tx_spoof_check_err++;
+		break;
+	}
+}
+
+static inline void lancer_update_tx_err(struct be_tx_obj *txo, u32 status)
+{
+	switch (status) {
+	case LANCER_TX_COMP_LSO_ERR:
+		tx_stats(txo)->tx_tso_err++;
+		break;
+	case LANCER_TX_COMP_HSW_DROP_MAC_ERR:
+	case LANCER_TX_COMP_HSW_DROP_VLAN_ERR:
+		tx_stats(txo)->tx_spoof_check_err++;
+		break;
+	case LANCER_TX_COMP_QINQ_ERR:
+		tx_stats(txo)->tx_qinq_err++;
+		break;
+	case LANCER_TX_COMP_PARITY_ERR:
+		tx_stats(txo)->tx_internal_parity_err++;
+		break;
+	case LANCER_TX_COMP_DMA_ERR:
+		tx_stats(txo)->tx_dma_err++;
+		break;
+	}
+}
+
+static void be_process_tx(struct be_adapter *adapter, struct be_tx_obj *txo,
+			  int idx)
+{
+	struct be_eth_tx_compl *txcp;
+	int num_wrbs = 0, work_done = 0;
+	u32 compl_status;
+	u16 last_idx;
+
+	while ((txcp = be_tx_compl_get(&txo->cq))) {
+		last_idx = GET_TX_COMPL_BITS(wrb_index, txcp);
+		num_wrbs += be_tx_compl_process(adapter, txo, last_idx);
+		work_done++;
+
+		compl_status = GET_TX_COMPL_BITS(status, txcp);
+		if (compl_status) {
+			if (lancer_chip(adapter))
+				lancer_update_tx_err(txo, compl_status);
+			else
+				be_update_tx_err(txo, compl_status);
+		}
+	}
+
+	if (work_done) {
+		be_cq_notify(adapter, txo->cq.id, true, work_done);
+		atomic_sub(num_wrbs, &txo->q.used);
+
+		/* As Tx wrbs have been freed up, wake up netdev queue
+		 * if it was stopped due to lack of tx wrbs.  */
+		if (__netif_subqueue_stopped(adapter->netdev, idx) &&
+		    atomic_read(&txo->q.used) < txo->q.len / 2) {
+			netif_wake_subqueue(adapter->netdev, idx);
+		}
+
+		u64_stats_update_begin(&tx_stats(txo)->sync_compl);
+		tx_stats(txo)->tx_compl += work_done;
+		u64_stats_update_end(&tx_stats(txo)->sync_compl);
+	}
+}
+
+int be_poll(struct napi_struct *napi, int budget)
+{
+	struct be_eq_obj *eqo = container_of(napi, struct be_eq_obj, napi);
+	struct be_adapter *adapter = eqo->adapter;
+	int max_work = 0, work, i, num_evts;
+	struct be_rx_obj *rxo;
+	struct be_tx_obj *txo;
+
+	num_evts = events_get(eqo);
+
+	for_all_tx_queues_on_eq(adapter, eqo, txo, i)
+		be_process_tx(adapter, txo, i);
+
+	if (be_lock_napi(eqo)) {
+		/* This loop will iterate twice for EQ0 in which
+		 * completions of the last RXQ (default one) are also processed
+		 * For other EQs the loop iterates only once
+		 */
+		for_all_rx_queues_on_eq(adapter, eqo, rxo, i) {
+			work = be_process_rx(rxo, napi, budget, NAPI_POLLING);
+			max_work = max(work, max_work);
+		}
+		be_unlock_napi(eqo);
+	} else {
+		max_work = budget;
+	}
+
+	if (is_mcc_eqo(eqo))
+		be_process_mcc(adapter);
+
+	if (max_work < budget) {
+		napi_complete(napi);
+		be_eq_notify(adapter, eqo->q.id, true, false, num_evts);
+	} else {
+		/* As we'll continue in polling mode, count and clear events */
+		be_eq_notify(adapter, eqo->q.id, false, false, num_evts);
+	}
+	return max_work;
+}
+
+#ifdef CONFIG_NET_RX_BUSY_POLL
+static int be_busy_poll(struct napi_struct *napi)
+{
+	struct be_eq_obj *eqo = container_of(napi, struct be_eq_obj, napi);
+	struct be_adapter *adapter = eqo->adapter;
+	struct be_rx_obj *rxo;
+	int i, work = 0;
+
+	if (!be_lock_busy_poll(eqo))
+		return LL_FLUSH_BUSY;
+
+	for_all_rx_queues_on_eq(adapter, eqo, rxo, i) {
+		work = be_process_rx(rxo, napi, 4, BUSY_POLLING);
+		if (work)
+			break;
+	}
+
+	be_unlock_busy_poll(eqo);
+	return work;
+}
+#endif
+
+void be_detect_error(struct be_adapter *adapter)
+{
+	u32 ue_lo = 0, ue_hi = 0, ue_lo_mask = 0, ue_hi_mask = 0;
+	u32 sliport_status = 0, sliport_err1 = 0, sliport_err2 = 0;
+	u32 i;
+	bool error_detected = false;
+	struct device *dev = &adapter->pdev->dev;
+	struct net_device *netdev = adapter->netdev;
+
+	if (be_hw_error(adapter))
+		return;
+
+	if (lancer_chip(adapter)) {
+		sliport_status = ioread32(adapter->db + SLIPORT_STATUS_OFFSET);
+		if (sliport_status & SLIPORT_STATUS_ERR_MASK) {
+			sliport_err1 = ioread32(adapter->db +
+						SLIPORT_ERROR1_OFFSET);
+			sliport_err2 = ioread32(adapter->db +
+						SLIPORT_ERROR2_OFFSET);
+			adapter->hw_error = true;
+			/* Do not log error messages if its a FW reset */
+			if (sliport_err1 == SLIPORT_ERROR_FW_RESET1 &&
+			    sliport_err2 == SLIPORT_ERROR_FW_RESET2) {
+				dev_info(dev, "Firmware update in progress\n");
+			} else {
+				error_detected = true;
+				dev_err(dev, "Error detected in the card\n");
+				dev_err(dev, "ERR: sliport status 0x%x\n",
+					sliport_status);
+				dev_err(dev, "ERR: sliport error1 0x%x\n",
+					sliport_err1);
+				dev_err(dev, "ERR: sliport error2 0x%x\n",
+					sliport_err2);
+			}
+		}
+	} else {
+		pci_read_config_dword(adapter->pdev,
+				      PCICFG_UE_STATUS_LOW, &ue_lo);
+		pci_read_config_dword(adapter->pdev,
+				      PCICFG_UE_STATUS_HIGH, &ue_hi);
+		pci_read_config_dword(adapter->pdev,
+				      PCICFG_UE_STATUS_LOW_MASK, &ue_lo_mask);
+		pci_read_config_dword(adapter->pdev,
+				      PCICFG_UE_STATUS_HI_MASK, &ue_hi_mask);
+
+		ue_lo = (ue_lo & ~ue_lo_mask);
+		ue_hi = (ue_hi & ~ue_hi_mask);
+
+		/* On certain platforms BE hardware can indicate spurious UEs.
+		 * Allow HW to stop working completely in case of a real UE.
+		 * Hence not setting the hw_error for UE detection.
+		 */
+
+		if (ue_lo || ue_hi) {
+			error_detected = true;
+			dev_err(dev,
+				"Unrecoverable Error detected in the adapter");
+			dev_err(dev, "Please reboot server to recover");
+			if (skyhawk_chip(adapter))
+				adapter->hw_error = true;
+			for (i = 0; ue_lo; ue_lo >>= 1, i++) {
+				if (ue_lo & 1)
+					dev_err(dev, "UE: %s bit set\n",
+						ue_status_low_desc[i]);
+			}
+			for (i = 0; ue_hi; ue_hi >>= 1, i++) {
+				if (ue_hi & 1)
+					dev_err(dev, "UE: %s bit set\n",
+						ue_status_hi_desc[i]);
+			}
+		}
+	}
+	if (error_detected)
+		netif_carrier_off(netdev);
+}
+
+static void be_msix_disable(struct be_adapter *adapter)
+{
+	if (msix_enabled(adapter)) {
+		pci_disable_msix(adapter->pdev);
+		adapter->num_msix_vec = 0;
+		adapter->num_msix_roce_vec = 0;
+	}
+}
+
+static int be_msix_enable(struct be_adapter *adapter)
+{
+	int i, num_vec;
+	struct device *dev = &adapter->pdev->dev;
+
+	/* If RoCE is supported, program the max number of NIC vectors that
+	 * may be configured via set-channels, along with vectors needed for
+	 * RoCe. Else, just program the number we'll use initially.
+	 */
+	if (be_roce_supported(adapter))
+		num_vec = min_t(int, 2 * be_max_eqs(adapter),
+				2 * num_online_cpus());
+	else
+		num_vec = adapter->cfg_num_qs;
+
+	for (i = 0; i < num_vec; i++)
+		adapter->msix_entries[i].entry = i;
+
+	num_vec = pci_enable_msix_range(adapter->pdev, adapter->msix_entries,
+					MIN_MSIX_VECTORS, num_vec);
+	if (num_vec < 0)
+		goto fail;
+
+	if (be_roce_supported(adapter) && num_vec > MIN_MSIX_VECTORS) {
+		adapter->num_msix_roce_vec = num_vec / 2;
+		dev_info(dev, "enabled %d MSI-x vector(s) for RoCE\n",
+			 adapter->num_msix_roce_vec);
+	}
+
+	adapter->num_msix_vec = num_vec - adapter->num_msix_roce_vec;
+
+	dev_info(dev, "enabled %d MSI-x vector(s) for NIC\n",
+		 adapter->num_msix_vec);
+	return 0;
+
+fail:
+	dev_warn(dev, "MSIx enable failed\n");
+
+	/* INTx is not supported in VFs, so fail probe if enable_msix fails */
+	if (!be_physfn(adapter))
+		return num_vec;
+	return 0;
+}
+
+static inline int be_msix_vec_get(struct be_adapter *adapter,
+				  struct be_eq_obj *eqo)
+{
+	return adapter->msix_entries[eqo->msix_idx].vector;
+}
+
+static int be_msix_register(struct be_adapter *adapter)
+{
+	struct net_device *netdev = adapter->netdev;
+	struct be_eq_obj *eqo;
+	int status, i, vec;
+
+	for_all_evt_queues(adapter, eqo, i) {
+		sprintf(eqo->desc, "%s-q%d", netdev->name, i);
+		vec = be_msix_vec_get(adapter, eqo);
+		status = request_irq(vec, be_msix, 0, eqo->desc, eqo);
+		if (status)
+			goto err_msix;
+	}
+
+	return 0;
+err_msix:
+	for (i--, eqo = &adapter->eq_obj[i]; i >= 0; i--, eqo--)
+		free_irq(be_msix_vec_get(adapter, eqo), eqo);
+	dev_warn(&adapter->pdev->dev, "MSIX Request IRQ failed - err %d\n",
+		 status);
+	be_msix_disable(adapter);
+	return status;
+}
+
+static int be_irq_register(struct be_adapter *adapter)
+{
+	struct net_device *netdev = adapter->netdev;
+	int status;
+
+	if (msix_enabled(adapter)) {
+		status = be_msix_register(adapter);
+		if (status == 0)
+			goto done;
+		/* INTx is not supported for VF */
+		if (!be_physfn(adapter))
+			return status;
+	}
+
+	/* INTx: only the first EQ is used */
+	netdev->irq = adapter->pdev->irq;
+	status = request_irq(netdev->irq, be_intx, IRQF_SHARED, netdev->name,
+			     &adapter->eq_obj[0]);
+	if (status) {
+		dev_err(&adapter->pdev->dev,
+			"INTx request IRQ failed - err %d\n", status);
+		return status;
+	}
+done:
+	adapter->isr_registered = true;
+	return 0;
+}
+
+static void be_irq_unregister(struct be_adapter *adapter)
+{
+	struct net_device *netdev = adapter->netdev;
+	struct be_eq_obj *eqo;
+	int i;
+
+	if (!adapter->isr_registered)
+		return;
+
+	/* INTx */
+	if (!msix_enabled(adapter)) {
+		free_irq(netdev->irq, &adapter->eq_obj[0]);
+		goto done;
+	}
+
+	/* MSIx */
+	for_all_evt_queues(adapter, eqo, i)
+		free_irq(be_msix_vec_get(adapter, eqo), eqo);
+
+done:
+	adapter->isr_registered = false;
+}
+
+static void be_rx_qs_destroy(struct be_adapter *adapter)
+{
+	struct be_queue_info *q;
+	struct be_rx_obj *rxo;
+	int i;
+
+	for_all_rx_queues(adapter, rxo, i) {
+		q = &rxo->q;
+		if (q->created) {
+			be_cmd_rxq_destroy(adapter, q);
+			be_rx_cq_clean(rxo);
+		}
+		be_queue_free(adapter, q);
+	}
+}
+
+static int be_close(struct net_device *netdev)
+{
+	struct be_adapter *adapter = netdev_priv(netdev);
+	struct be_eq_obj *eqo;
+	int i;
+
+	/* This protection is needed as be_close() may be called even when the
+	 * adapter is in cleared state (after eeh perm failure)
+	 */
+	if (!(adapter->flags & BE_FLAGS_SETUP_DONE))
+		return 0;
+
+	be_roce_dev_close(adapter);
+
+	if (adapter->flags & BE_FLAGS_NAPI_ENABLED) {
+		for_all_evt_queues(adapter, eqo, i) {
+			napi_disable(&eqo->napi);
+			be_disable_busy_poll(eqo);
+		}
+		adapter->flags &= ~BE_FLAGS_NAPI_ENABLED;
+	}
+
+	be_async_mcc_disable(adapter);
+
+	/* Wait for all pending tx completions to arrive so that
+	 * all tx skbs are freed.
+	 */
+	netif_tx_disable(netdev);
+	be_tx_compl_clean(adapter);
+
+	be_rx_qs_destroy(adapter);
+
+	for (i = 1; i < (adapter->uc_macs + 1); i++)
+		be_cmd_pmac_del(adapter, adapter->if_handle,
+				adapter->pmac_id[i], 0);
+	adapter->uc_macs = 0;
+
+	for_all_evt_queues(adapter, eqo, i) {
+		if (msix_enabled(adapter))
+			synchronize_irq(be_msix_vec_get(adapter, eqo));
+		else
+			synchronize_irq(netdev->irq);
+		be_eq_clean(eqo);
+	}
+
+	be_irq_unregister(adapter);
+
+	return 0;
+}
+
+static int be_rx_qs_create(struct be_adapter *adapter)
+{
+	struct be_rx_obj *rxo;
+	int rc, i, j;
+	u8 rss_hkey[RSS_HASH_KEY_LEN];
+	struct rss_info *rss = &adapter->rss_info;
+
+	for_all_rx_queues(adapter, rxo, i) {
+		rc = be_queue_alloc(adapter, &rxo->q, RX_Q_LEN,
+				    sizeof(struct be_eth_rx_d));
+		if (rc)
+			return rc;
+	}
+
+	/* The FW would like the default RXQ to be created first */
+	rxo = default_rxo(adapter);
+	rc = be_cmd_rxq_create(adapter, &rxo->q, rxo->cq.id, rx_frag_size,
+			       adapter->if_handle, false, &rxo->rss_id);
+	if (rc)
+		return rc;
+
+	for_all_rss_queues(adapter, rxo, i) {
+		rc = be_cmd_rxq_create(adapter, &rxo->q, rxo->cq.id,
+				       rx_frag_size, adapter->if_handle,
+				       true, &rxo->rss_id);
+		if (rc)
+			return rc;
+	}
+
+	if (be_multi_rxq(adapter)) {
+		for (j = 0; j < RSS_INDIR_TABLE_LEN;
+			j += adapter->num_rx_qs - 1) {
+			for_all_rss_queues(adapter, rxo, i) {
+				if ((j + i) >= RSS_INDIR_TABLE_LEN)
+					break;
+				rss->rsstable[j + i] = rxo->rss_id;
+				rss->rss_queue[j + i] = i;
+			}
+		}
+		rss->rss_flags = RSS_ENABLE_TCP_IPV4 | RSS_ENABLE_IPV4 |
+			RSS_ENABLE_TCP_IPV6 | RSS_ENABLE_IPV6;
+
+		if (!BEx_chip(adapter))
+			rss->rss_flags |= RSS_ENABLE_UDP_IPV4 |
+				RSS_ENABLE_UDP_IPV6;
+	} else {
+		/* Disable RSS, if only default RX Q is created */
+		rss->rss_flags = RSS_ENABLE_NONE;
+	}
+
+	get_random_bytes(rss_hkey, RSS_HASH_KEY_LEN);
+	rc = be_cmd_rss_config(adapter, rss->rsstable, rss->rss_flags,
+			       128, rss_hkey);
+	if (rc) {
+		rss->rss_flags = RSS_ENABLE_NONE;
+		return rc;
+	}
+
+	memcpy(rss->rss_hkey, rss_hkey, RSS_HASH_KEY_LEN);
+
+	/* First time posting */
+	for_all_rx_queues(adapter, rxo, i)
+		be_post_rx_frags(rxo, GFP_KERNEL, MAX_RX_POST);
+	return 0;
+}
+
+static int be_open(struct net_device *netdev)
+{
+	struct be_adapter *adapter = netdev_priv(netdev);
+	struct be_eq_obj *eqo;
+	struct be_rx_obj *rxo;
+	struct be_tx_obj *txo;
+	u8 link_status;
+	int status, i;
+
+	status = be_rx_qs_create(adapter);
+	if (status)
+		goto err;
+
+	status = be_irq_register(adapter);
+	if (status)
+		goto err;
+
+	for_all_rx_queues(adapter, rxo, i)
+		be_cq_notify(adapter, rxo->cq.id, true, 0);
+
+	for_all_tx_queues(adapter, txo, i)
+		be_cq_notify(adapter, txo->cq.id, true, 0);
+
+	be_async_mcc_enable(adapter);
+
+	for_all_evt_queues(adapter, eqo, i) {
+		napi_enable(&eqo->napi);
+		be_enable_busy_poll(eqo);
+		be_eq_notify(adapter, eqo->q.id, true, true, 0);
+	}
+	adapter->flags |= BE_FLAGS_NAPI_ENABLED;
+
+	status = be_cmd_link_status_query(adapter, NULL, &link_status, 0);
+	if (!status)
+		be_link_status_update(adapter, link_status);
+
+	netif_tx_start_all_queues(netdev);
+	be_roce_dev_open(adapter);
+
+#ifdef CONFIG_BE2NET_VXLAN
+	if (skyhawk_chip(adapter))
+		vxlan_get_rx_port(netdev);
+#endif
+
+	return 0;
+err:
+	be_close(adapter->netdev);
+	return -EIO;
+}
+
+static int be_setup_wol(struct be_adapter *adapter, bool enable)
+{
+	struct be_dma_mem cmd;
+	int status = 0;
+	u8 mac[ETH_ALEN];
+
+	memset(mac, 0, ETH_ALEN);
+
+	cmd.size = sizeof(struct be_cmd_req_acpi_wol_magic_config);
+	cmd.va = dma_zalloc_coherent(&adapter->pdev->dev, cmd.size, &cmd.dma,
+				     GFP_KERNEL);
+	if (!cmd.va)
+		return -ENOMEM;
+
+	if (enable) {
+		status = pci_write_config_dword(adapter->pdev,
+						PCICFG_PM_CONTROL_OFFSET,
+						PCICFG_PM_CONTROL_MASK);
+		if (status) {
+			dev_err(&adapter->pdev->dev,
+				"Could not enable Wake-on-lan\n");
+			dma_free_coherent(&adapter->pdev->dev, cmd.size, cmd.va,
+					  cmd.dma);
+			return status;
+		}
+		status = be_cmd_enable_magic_wol(adapter,
+						 adapter->netdev->dev_addr,
+						 &cmd);
+		pci_enable_wake(adapter->pdev, PCI_D3hot, 1);
+		pci_enable_wake(adapter->pdev, PCI_D3cold, 1);
+	} else {
+		status = be_cmd_enable_magic_wol(adapter, mac, &cmd);
+		pci_enable_wake(adapter->pdev, PCI_D3hot, 0);
+		pci_enable_wake(adapter->pdev, PCI_D3cold, 0);
+	}
+
+	dma_free_coherent(&adapter->pdev->dev, cmd.size, cmd.va, cmd.dma);
+	return status;
+}
+
+/*
+ * Generate a seed MAC address from the PF MAC Address using jhash.
+ * MAC Address for VFs are assigned incrementally starting from the seed.
+ * These addresses are programmed in the ASIC by the PF and the VF driver
+ * queries for the MAC address during its probe.
+ */
+static int be_vf_eth_addr_config(struct be_adapter *adapter)
+{
+	u32 vf;
+	int status = 0;
+	u8 mac[ETH_ALEN];
+	struct be_vf_cfg *vf_cfg;
+
+	be_vf_eth_addr_generate(adapter, mac);
+
+	for_all_vfs(adapter, vf_cfg, vf) {
+		if (BEx_chip(adapter))
+			status = be_cmd_pmac_add(adapter, mac,
+						 vf_cfg->if_handle,
+						 &vf_cfg->pmac_id, vf + 1);
+		else
+			status = be_cmd_set_mac(adapter, mac, vf_cfg->if_handle,
+						vf + 1);
+
+		if (status)
+			dev_err(&adapter->pdev->dev,
+				"Mac address assignment failed for VF %d\n",
+				vf);
+		else
+			memcpy(vf_cfg->mac_addr, mac, ETH_ALEN);
+
+		mac[5] += 1;
+	}
+	return status;
+}
+
+static int be_vfs_mac_query(struct be_adapter *adapter)
+{
+	int status, vf;
+	u8 mac[ETH_ALEN];
+	struct be_vf_cfg *vf_cfg;
+
+	for_all_vfs(adapter, vf_cfg, vf) {
+		status = be_cmd_get_active_mac(adapter, vf_cfg->pmac_id,
+					       mac, vf_cfg->if_handle,
+					       false, vf+1);
+		if (status)
+			return status;
+		memcpy(vf_cfg->mac_addr, mac, ETH_ALEN);
+	}
+	return 0;
+}
+
+static void be_vf_clear(struct be_adapter *adapter)
+{
+	struct be_vf_cfg *vf_cfg;
+	u32 vf;
+
+	if (pci_vfs_assigned(adapter->pdev)) {
+		dev_warn(&adapter->pdev->dev,
+			 "VFs are assigned to VMs: not disabling VFs\n");
+		goto done;
+	}
+
+	pci_disable_sriov(adapter->pdev);
+
+	for_all_vfs(adapter, vf_cfg, vf) {
+		if (BEx_chip(adapter))
+			be_cmd_pmac_del(adapter, vf_cfg->if_handle,
+					vf_cfg->pmac_id, vf + 1);
+		else
+			be_cmd_set_mac(adapter, NULL, vf_cfg->if_handle,
+				       vf + 1);
+
+		be_cmd_if_destroy(adapter, vf_cfg->if_handle, vf + 1);
+	}
+done:
+	kfree(adapter->vf_cfg);
+	adapter->num_vfs = 0;
+	adapter->flags &= ~BE_FLAGS_SRIOV_ENABLED;
+}
+
+static void be_clear_queues(struct be_adapter *adapter)
+{
+	be_mcc_queues_destroy(adapter);
+	be_rx_cqs_destroy(adapter);
+	be_tx_queues_destroy(adapter);
+	be_evt_queues_destroy(adapter);
+}
+
+static void be_cancel_worker(struct be_adapter *adapter)
+{
+	if (adapter->flags & BE_FLAGS_WORKER_SCHEDULED) {
+		cancel_delayed_work_sync(&adapter->work);
+		adapter->flags &= ~BE_FLAGS_WORKER_SCHEDULED;
+	}
+}
+
+static void be_mac_clear(struct be_adapter *adapter)
+{
+	int i;
+
+	if (adapter->pmac_id) {
+		for (i = 0; i < (adapter->uc_macs + 1); i++)
+			be_cmd_pmac_del(adapter, adapter->if_handle,
+					adapter->pmac_id[i], 0);
+		adapter->uc_macs = 0;
+
+		kfree(adapter->pmac_id);
+		adapter->pmac_id = NULL;
+	}
+}
+
+#ifdef CONFIG_BE2NET_VXLAN
+static void be_disable_vxlan_offloads(struct be_adapter *adapter)
+{
+	if (adapter->flags & BE_FLAGS_VXLAN_OFFLOADS)
+		be_cmd_manage_iface(adapter, adapter->if_handle,
+				    OP_CONVERT_TUNNEL_TO_NORMAL);
+
+	if (adapter->vxlan_port)
+		be_cmd_set_vxlan_port(adapter, 0);
+
+	adapter->flags &= ~BE_FLAGS_VXLAN_OFFLOADS;
+	adapter->vxlan_port = 0;
+}
+#endif
+
+static int be_clear(struct be_adapter *adapter)
+{
+	be_cancel_worker(adapter);
+
+	if (sriov_enabled(adapter))
+		be_vf_clear(adapter);
+
+	/* Re-configure FW to distribute resources evenly across max-supported
+	 * number of VFs, only when VFs are not already enabled.
+	 */
+	if (be_physfn(adapter) && !pci_vfs_assigned(adapter->pdev))
+		be_cmd_set_sriov_config(adapter, adapter->pool_res,
+					pci_sriov_get_totalvfs(adapter->pdev));
+
+#ifdef CONFIG_BE2NET_VXLAN
+	be_disable_vxlan_offloads(adapter);
+#endif
+	/* delete the primary mac along with the uc-mac list */
+	be_mac_clear(adapter);
+
+	be_cmd_if_destroy(adapter, adapter->if_handle,  0);
+
+	be_clear_queues(adapter);
+
+	be_msix_disable(adapter);
+	adapter->flags &= ~BE_FLAGS_SETUP_DONE;
+	return 0;
+}
+
+static int be_vfs_if_create(struct be_adapter *adapter)
+{
+	struct be_resources res = {0};
+	struct be_vf_cfg *vf_cfg;
+	u32 cap_flags, en_flags, vf;
+	int status = 0;
+
+	cap_flags = BE_IF_FLAGS_UNTAGGED | BE_IF_FLAGS_BROADCAST |
+		    BE_IF_FLAGS_MULTICAST;
+
+	for_all_vfs(adapter, vf_cfg, vf) {
+		if (!BE3_chip(adapter)) {
+			status = be_cmd_get_profile_config(adapter, &res,
+							   vf + 1);
+			if (!status)
+				cap_flags = res.if_cap_flags;
+		}
+
+		/* If a FW profile exists, then cap_flags are updated */
+		en_flags = cap_flags & (BE_IF_FLAGS_UNTAGGED |
+					BE_IF_FLAGS_BROADCAST |
+					BE_IF_FLAGS_MULTICAST);
+		status =
+		    be_cmd_if_create(adapter, cap_flags, en_flags,
+				     &vf_cfg->if_handle, vf + 1);
+		if (status)
+			goto err;
+	}
+err:
+	return status;
+}
+
+static int be_vf_setup_init(struct be_adapter *adapter)
+{
+	struct be_vf_cfg *vf_cfg;
+	int vf;
+
+	adapter->vf_cfg = kcalloc(adapter->num_vfs, sizeof(*vf_cfg),
+				  GFP_KERNEL);
+	if (!adapter->vf_cfg)
+		return -ENOMEM;
+
+	for_all_vfs(adapter, vf_cfg, vf) {
+		vf_cfg->if_handle = -1;
+		vf_cfg->pmac_id = -1;
+	}
+	return 0;
+}
+
+static int be_vf_setup(struct be_adapter *adapter)
+{
+	struct device *dev = &adapter->pdev->dev;
+	struct be_vf_cfg *vf_cfg;
+	int status, old_vfs, vf;
+	u32 privileges;
+
+	old_vfs = pci_num_vf(adapter->pdev);
+
+	status = be_vf_setup_init(adapter);
+	if (status)
+		goto err;
+
+	if (old_vfs) {
+		for_all_vfs(adapter, vf_cfg, vf) {
+			status = be_cmd_get_if_id(adapter, vf_cfg, vf);
+			if (status)
+				goto err;
+		}
+
+		status = be_vfs_mac_query(adapter);
+		if (status)
+			goto err;
+	} else {
+		status = be_vfs_if_create(adapter);
+		if (status)
+			goto err;
+
+		status = be_vf_eth_addr_config(adapter);
+		if (status)
+			goto err;
+	}
+
+	for_all_vfs(adapter, vf_cfg, vf) {
+		/* Allow VFs to programs MAC/VLAN filters */
+		status = be_cmd_get_fn_privileges(adapter, &privileges, vf + 1);
+		if (!status && !(privileges & BE_PRIV_FILTMGMT)) {
+			status = be_cmd_set_fn_privileges(adapter,
+							  privileges |
+							  BE_PRIV_FILTMGMT,
+							  vf + 1);
+			if (!status)
+				dev_info(dev, "VF%d has FILTMGMT privilege\n",
+					 vf);
+		}
+
+		/* Allow full available bandwidth */
+		if (!old_vfs)
+			be_cmd_config_qos(adapter, 0, 0, vf + 1);
+
+		if (!old_vfs) {
+			be_cmd_enable_vf(adapter, vf + 1);
+			be_cmd_set_logical_link_config(adapter,
+						       IFLA_VF_LINK_STATE_AUTO,
+						       vf+1);
+		}
+	}
+
+	if (!old_vfs) {
+		status = pci_enable_sriov(adapter->pdev, adapter->num_vfs);
+		if (status) {
+			dev_err(dev, "SRIOV enable failed\n");
+			adapter->num_vfs = 0;
+			goto err;
+		}
+	}
+
+	adapter->flags |= BE_FLAGS_SRIOV_ENABLED;
+	return 0;
+err:
+	dev_err(dev, "VF setup failed\n");
+	be_vf_clear(adapter);
+	return status;
+}
+
+/* Converting function_mode bits on BE3 to SH mc_type enums */
+
+static u8 be_convert_mc_type(u32 function_mode)
+{
+	if (function_mode & VNIC_MODE && function_mode & QNQ_MODE)
+		return vNIC1;
+	else if (function_mode & QNQ_MODE)
+		return FLEX10;
+	else if (function_mode & VNIC_MODE)
+		return vNIC2;
+	else if (function_mode & UMC_ENABLED)
+		return UMC;
+	else
+		return MC_NONE;
+}
+
+/* On BE2/BE3 FW does not suggest the supported limits */
+static void BEx_get_resources(struct be_adapter *adapter,
+			      struct be_resources *res)
+{
+	bool use_sriov = adapter->num_vfs ? 1 : 0;
+
+	if (be_physfn(adapter))
+		res->max_uc_mac = BE_UC_PMAC_COUNT;
+	else
+		res->max_uc_mac = BE_VF_UC_PMAC_COUNT;
+
+	adapter->mc_type = be_convert_mc_type(adapter->function_mode);
+
+	if (be_is_mc(adapter)) {
+		/* Assuming that there are 4 channels per port,
+		 * when multi-channel is enabled
+		 */
+		if (be_is_qnq_mode(adapter))
+			res->max_vlans = BE_NUM_VLANS_SUPPORTED/8;
+		else
+			/* In a non-qnq multichannel mode, the pvid
+			 * takes up one vlan entry
+			 */
+			res->max_vlans = (BE_NUM_VLANS_SUPPORTED / 4) - 1;
+	} else {
+		res->max_vlans = BE_NUM_VLANS_SUPPORTED;
+	}
+
+	res->max_mcast_mac = BE_MAX_MC;
+
+	/* 1) For BE3 1Gb ports, FW does not support multiple TXQs
+	 * 2) Create multiple TX rings on a BE3-R multi-channel interface
+	 *    *only* if it is RSS-capable.
+	 */
+	if (BE2_chip(adapter) || use_sriov ||  (adapter->port_num > 1) ||
+	    !be_physfn(adapter) || (be_is_mc(adapter) &&
+	    !(adapter->function_caps & BE_FUNCTION_CAPS_RSS))) {
+		res->max_tx_qs = 1;
+	} else if (adapter->function_caps & BE_FUNCTION_CAPS_SUPER_NIC) {
+		struct be_resources super_nic_res = {0};
+
+		/* On a SuperNIC profile, the driver needs to use the
+		 * GET_PROFILE_CONFIG cmd to query the per-function TXQ limits
+		 */
+		be_cmd_get_profile_config(adapter, &super_nic_res, 0);
+		/* Some old versions of BE3 FW don't report max_tx_qs value */
+		res->max_tx_qs = super_nic_res.max_tx_qs ? : BE3_MAX_TX_QS;
+	} else {
+		res->max_tx_qs = BE3_MAX_TX_QS;
+	}
+
+	if ((adapter->function_caps & BE_FUNCTION_CAPS_RSS) &&
+	    !use_sriov && be_physfn(adapter))
+		res->max_rss_qs = (adapter->be3_native) ?
+					   BE3_MAX_RSS_QS : BE2_MAX_RSS_QS;
+	res->max_rx_qs = res->max_rss_qs + 1;
+
+	if (be_physfn(adapter))
+		res->max_evt_qs = (be_max_vfs(adapter) > 0) ?
+					BE3_SRIOV_MAX_EVT_QS : BE3_MAX_EVT_QS;
+	else
+		res->max_evt_qs = 1;
+
+	res->if_cap_flags = BE_IF_CAP_FLAGS_WANT;
+	if (!(adapter->function_caps & BE_FUNCTION_CAPS_RSS))
+		res->if_cap_flags &= ~BE_IF_FLAGS_RSS;
+}
+
+static void be_setup_init(struct be_adapter *adapter)
+{
+	adapter->vlan_prio_bmap = 0xff;
+	adapter->phy.link_speed = -1;
+	adapter->if_handle = -1;
+	adapter->be3_native = false;
+	adapter->promiscuous = false;
+	if (be_physfn(adapter))
+		adapter->cmd_privileges = MAX_PRIVILEGES;
+	else
+		adapter->cmd_privileges = MIN_PRIVILEGES;
+}
+
+static int be_get_sriov_config(struct be_adapter *adapter)
+{
+	struct device *dev = &adapter->pdev->dev;
+	struct be_resources res = {0};
+	int max_vfs, old_vfs;
+
+	/* Some old versions of BE3 FW don't report max_vfs value */
+	be_cmd_get_profile_config(adapter, &res, 0);
+
+	if (BE3_chip(adapter) && !res.max_vfs) {
+		max_vfs = pci_sriov_get_totalvfs(adapter->pdev);
+		res.max_vfs = max_vfs > 0 ? min(MAX_VFS, max_vfs) : 0;
+	}
+
+	adapter->pool_res = res;
+
+	if (!be_max_vfs(adapter)) {
+		if (num_vfs)
+			dev_warn(dev, "SRIOV is disabled. Ignoring num_vfs\n");
+		adapter->num_vfs = 0;
+		return 0;
+	}
+
+	pci_sriov_set_totalvfs(adapter->pdev, be_max_vfs(adapter));
+
+	/* validate num_vfs module param */
+	old_vfs = pci_num_vf(adapter->pdev);
+	if (old_vfs) {
+		dev_info(dev, "%d VFs are already enabled\n", old_vfs);
+		if (old_vfs != num_vfs)
+			dev_warn(dev, "Ignoring num_vfs=%d setting\n", num_vfs);
+		adapter->num_vfs = old_vfs;
+	} else {
+		if (num_vfs > be_max_vfs(adapter)) {
+			dev_info(dev, "Resources unavailable to init %d VFs\n",
+				 num_vfs);
+			dev_info(dev, "Limiting to %d VFs\n",
+				 be_max_vfs(adapter));
+		}
+		adapter->num_vfs = min_t(u16, num_vfs, be_max_vfs(adapter));
+	}
+
+	return 0;
+}
+
+static int be_get_resources(struct be_adapter *adapter)
+{
+	struct device *dev = &adapter->pdev->dev;
+	struct be_resources res = {0};
+	int status;
+
+	if (BEx_chip(adapter)) {
+		BEx_get_resources(adapter, &res);
+		adapter->res = res;
+	}
+
+	/* For Lancer, SH etc read per-function resource limits from FW.
+	 * GET_FUNC_CONFIG returns per function guaranteed limits.
+	 * GET_PROFILE_CONFIG returns PCI-E related limits PF-pool limits
+	 */
+	if (!BEx_chip(adapter)) {
+		status = be_cmd_get_func_config(adapter, &res);
+		if (status)
+			return status;
+
+		/* If RoCE may be enabled stash away half the EQs for RoCE */
+		if (be_roce_supported(adapter))
+			res.max_evt_qs /= 2;
+		adapter->res = res;
+	}
+
+	dev_info(dev, "Max: txqs %d, rxqs %d, rss %d, eqs %d, vfs %d\n",
+		 be_max_txqs(adapter), be_max_rxqs(adapter),
+		 be_max_rss(adapter), be_max_eqs(adapter),
+		 be_max_vfs(adapter));
+	dev_info(dev, "Max: uc-macs %d, mc-macs %d, vlans %d\n",
+		 be_max_uc(adapter), be_max_mc(adapter),
+		 be_max_vlans(adapter));
+
+	return 0;
+}
+
+static void be_sriov_config(struct be_adapter *adapter)
+{
+	struct device *dev = &adapter->pdev->dev;
+	int status;
+
+	status = be_get_sriov_config(adapter);
+	if (status) {
+		dev_err(dev, "Failed to query SR-IOV configuration\n");
+		dev_err(dev, "SR-IOV cannot be enabled\n");
+		return;
+	}
+
+	/* When the HW is in SRIOV capable configuration, the PF-pool
+	 * resources are equally distributed across the max-number of
+	 * VFs. The user may request only a subset of the max-vfs to be
+	 * enabled. Based on num_vfs, redistribute the resources across
+	 * num_vfs so that each VF will have access to more number of
+	 * resources. This facility is not available in BE3 FW.
+	 * Also, this is done by FW in Lancer chip.
+	 */
+	if (be_max_vfs(adapter) && !pci_num_vf(adapter->pdev)) {
+		status = be_cmd_set_sriov_config(adapter,
+						 adapter->pool_res,
+						 adapter->num_vfs);
+		if (status)
+			dev_err(dev, "Failed to optimize SR-IOV resources\n");
+	}
+}
+
+static int be_get_config(struct be_adapter *adapter)
+{
+	u16 profile_id;
+	int status;
+
+	status = be_cmd_query_fw_cfg(adapter);
+	if (status)
+		return status;
+
+	 if (be_physfn(adapter)) {
+		status = be_cmd_get_active_profile(adapter, &profile_id);
+		if (!status)
+			dev_info(&adapter->pdev->dev,
+				 "Using profile 0x%x\n", profile_id);
+	}
+
+	if (!BE2_chip(adapter) && be_physfn(adapter))
+		be_sriov_config(adapter);
+
+	status = be_get_resources(adapter);
+	if (status)
+		return status;
+
+	adapter->pmac_id = kcalloc(be_max_uc(adapter),
+				   sizeof(*adapter->pmac_id), GFP_KERNEL);
+	if (!adapter->pmac_id)
+		return -ENOMEM;
+
+	/* Sanitize cfg_num_qs based on HW and platform limits */
+	adapter->cfg_num_qs = min(adapter->cfg_num_qs, be_max_qs(adapter));
+
+	return 0;
+}
+
+static int be_mac_setup(struct be_adapter *adapter)
+{
+	u8 mac[ETH_ALEN];
+	int status;
+
+	if (is_zero_ether_addr(adapter->netdev->dev_addr)) {
+		status = be_cmd_get_perm_mac(adapter, mac);
+		if (status)
+			return status;
+
+		memcpy(adapter->netdev->dev_addr, mac, ETH_ALEN);
+		memcpy(adapter->netdev->perm_addr, mac, ETH_ALEN);
+	} else {
+		/* Maybe the HW was reset; dev_addr must be re-programmed */
+		memcpy(mac, adapter->netdev->dev_addr, ETH_ALEN);
+	}
+
+	/* For BE3-R VFs, the PF programs the initial MAC address */
+	if (!(BEx_chip(adapter) && be_virtfn(adapter)))
+		be_cmd_pmac_add(adapter, mac, adapter->if_handle,
+				&adapter->pmac_id[0], 0);
+	return 0;
+}
+
+static void be_schedule_worker(struct be_adapter *adapter)
+{
+	schedule_delayed_work(&adapter->work, msecs_to_jiffies(1000));
+	adapter->flags |= BE_FLAGS_WORKER_SCHEDULED;
+}
+
+static int be_setup_queues(struct be_adapter *adapter)
+{
+	struct net_device *netdev = adapter->netdev;
+	int status;
+
+	status = be_evt_queues_create(adapter);
+	if (status)
+		goto err;
+
+	status = be_tx_qs_create(adapter);
+	if (status)
+		goto err;
+
+	status = be_rx_cqs_create(adapter);
+	if (status)
+		goto err;
+
+	status = be_mcc_queues_create(adapter);
+	if (status)
+		goto err;
+
+	status = netif_set_real_num_rx_queues(netdev, adapter->num_rx_qs);
+	if (status)
+		goto err;
+
+	status = netif_set_real_num_tx_queues(netdev, adapter->num_tx_qs);
+	if (status)
+		goto err;
+
+	return 0;
+err:
+	dev_err(&adapter->pdev->dev, "queue_setup failed\n");
+	return status;
+}
+
+int be_update_queues(struct be_adapter *adapter)
+{
+	struct net_device *netdev = adapter->netdev;
+	int status;
+
+	if (netif_running(netdev))
+		be_close(netdev);
+
+	be_cancel_worker(adapter);
+
+	/* If any vectors have been shared with RoCE we cannot re-program
+	 * the MSIx table.
+	 */
+	if (!adapter->num_msix_roce_vec)
+		be_msix_disable(adapter);
+
+	be_clear_queues(adapter);
+
+	if (!msix_enabled(adapter)) {
+		status = be_msix_enable(adapter);
+		if (status)
+			return status;
+	}
+
+	status = be_setup_queues(adapter);
+	if (status)
+		return status;
+
+	be_schedule_worker(adapter);
+
+	if (netif_running(netdev))
+		status = be_open(netdev);
+
+	return status;
+}
+
+static int be_setup(struct be_adapter *adapter)
+{
+	struct device *dev = &adapter->pdev->dev;
+	u32 tx_fc, rx_fc, en_flags;
+	int status;
+
+	be_setup_init(adapter);
+
+	if (!lancer_chip(adapter))
+		be_cmd_req_native_mode(adapter);
+
+	status = be_get_config(adapter);
+	if (status)
+		goto err;
+
+	status = be_msix_enable(adapter);
+	if (status)
+		goto err;
+
+	en_flags = BE_IF_FLAGS_UNTAGGED | BE_IF_FLAGS_BROADCAST |
+		   BE_IF_FLAGS_MULTICAST | BE_IF_FLAGS_PASS_L3L4_ERRORS;
+	if (adapter->function_caps & BE_FUNCTION_CAPS_RSS)
+		en_flags |= BE_IF_FLAGS_RSS;
+	en_flags = en_flags & be_if_cap_flags(adapter);
+	status = be_cmd_if_create(adapter, be_if_cap_flags(adapter), en_flags,
+				  &adapter->if_handle, 0);
+	if (status)
+		goto err;
+
+	/* Updating real_num_tx/rx_queues() requires rtnl_lock() */
+	rtnl_lock();
+	status = be_setup_queues(adapter);
+	rtnl_unlock();
+	if (status)
+		goto err;
+
+	be_cmd_get_fn_privileges(adapter, &adapter->cmd_privileges, 0);
+
+	status = be_mac_setup(adapter);
+	if (status)
+		goto err;
+
+	be_cmd_get_fw_ver(adapter);
+	dev_info(dev, "FW version is %s\n", adapter->fw_ver);
+
+	if (BE2_chip(adapter) && fw_major_num(adapter->fw_ver) < 4) {
+		dev_err(dev, "Firmware on card is old(%s), IRQs may not work",
+			adapter->fw_ver);
+		dev_err(dev, "Please upgrade firmware to version >= 4.0\n");
+	}
+
+	if (adapter->vlans_added)
+		be_vid_config(adapter);
+
+	be_set_rx_mode(adapter->netdev);
+
+	be_cmd_get_acpi_wol_cap(adapter);
+
+	be_cmd_get_flow_control(adapter, &tx_fc, &rx_fc);
+
+	if (rx_fc != adapter->rx_fc || tx_fc != adapter->tx_fc)
+		be_cmd_set_flow_control(adapter, adapter->tx_fc,
+					adapter->rx_fc);
+
+	if (be_physfn(adapter))
+		be_cmd_set_logical_link_config(adapter,
+					       IFLA_VF_LINK_STATE_AUTO, 0);
+
+	if (adapter->num_vfs)
+		be_vf_setup(adapter);
+
+	status = be_cmd_get_phy_info(adapter);
+	if (!status && be_pause_supported(adapter))
+		adapter->phy.fc_autoneg = 1;
+
+	be_schedule_worker(adapter);
+	adapter->flags |= BE_FLAGS_SETUP_DONE;
+	return 0;
+err:
+	be_clear(adapter);
+	return status;
+}
+
+#ifdef CONFIG_NET_POLL_CONTROLLER
+static void be_netpoll(struct net_device *netdev)
+{
+	struct be_adapter *adapter = netdev_priv(netdev);
+	struct be_eq_obj *eqo;
+	int i;
+
+	for_all_evt_queues(adapter, eqo, i) {
+		be_eq_notify(eqo->adapter, eqo->q.id, false, true, 0);
+		napi_schedule(&eqo->napi);
+	}
+}
+#endif
+
+static char flash_cookie[2][16] = {"*** SE FLAS", "H DIRECTORY *** "};
+
+static bool phy_flashing_required(struct be_adapter *adapter)
+{
+	return (adapter->phy.phy_type == TN_8022 &&
+		adapter->phy.interface_type == PHY_TYPE_BASET_10GB);
+}
+
+static bool is_comp_in_ufi(struct be_adapter *adapter,
+			   struct flash_section_info *fsec, int type)
+{
+	int i = 0, img_type = 0;
+	struct flash_section_info_g2 *fsec_g2 = NULL;
+
+	if (BE2_chip(adapter))
+		fsec_g2 = (struct flash_section_info_g2 *)fsec;
+
+	for (i = 0; i < MAX_FLASH_COMP; i++) {
+		if (fsec_g2)
+			img_type = le32_to_cpu(fsec_g2->fsec_entry[i].type);
+		else
+			img_type = le32_to_cpu(fsec->fsec_entry[i].type);
+
+		if (img_type == type)
+			return true;
+	}
+	return false;
+
+}
+
+static struct flash_section_info *get_fsec_info(struct be_adapter *adapter,
+						int header_size,
+						const struct firmware *fw)
+{
+	struct flash_section_info *fsec = NULL;
+	const u8 *p = fw->data;
+
+	p += header_size;
+	while (p < (fw->data + fw->size)) {
+		fsec = (struct flash_section_info *)p;
+		if (!memcmp(flash_cookie, fsec->cookie, sizeof(flash_cookie)))
+			return fsec;
+		p += 32;
+	}
+	return NULL;
+}
+
+static int be_check_flash_crc(struct be_adapter *adapter, const u8 *p,
+			      u32 img_offset, u32 img_size, int hdr_size,
+			      u16 img_optype, bool *crc_match)
+{
+	u32 crc_offset;
+	int status;
+	u8 crc[4];
+
+	status = be_cmd_get_flash_crc(adapter, crc, img_optype, img_size - 4);
+	if (status)
+		return status;
+
+	crc_offset = hdr_size + img_offset + img_size - 4;
+
+	/* Skip flashing, if crc of flashed region matches */
+	if (!memcmp(crc, p + crc_offset, 4))
+		*crc_match = true;
+	else
+		*crc_match = false;
+
+	return status;
+}
+
+static int be_flash(struct be_adapter *adapter, const u8 *img,
+		    struct be_dma_mem *flash_cmd, int optype, int img_size)
+{
+	struct be_cmd_write_flashrom *req = flash_cmd->va;
+	u32 total_bytes, flash_op, num_bytes;
+	int status;
+
+	total_bytes = img_size;
+	while (total_bytes) {
+		num_bytes = min_t(u32, 32*1024, total_bytes);
+
+		total_bytes -= num_bytes;
+
+		if (!total_bytes) {
+			if (optype == OPTYPE_PHY_FW)
+				flash_op = FLASHROM_OPER_PHY_FLASH;
+			else
+				flash_op = FLASHROM_OPER_FLASH;
+		} else {
+			if (optype == OPTYPE_PHY_FW)
+				flash_op = FLASHROM_OPER_PHY_SAVE;
+			else
+				flash_op = FLASHROM_OPER_SAVE;
+		}
+
+		memcpy(req->data_buf, img, num_bytes);
+		img += num_bytes;
+		status = be_cmd_write_flashrom(adapter, flash_cmd, optype,
+					       flash_op, num_bytes);
+		if (base_status(status) == MCC_STATUS_ILLEGAL_REQUEST &&
+		    optype == OPTYPE_PHY_FW)
+			break;
+		else if (status)
+			return status;
+	}
+	return 0;
+}
+
+/* For BE2, BE3 and BE3-R */
+static int be_flash_BEx(struct be_adapter *adapter,
+			const struct firmware *fw,
+			struct be_dma_mem *flash_cmd, int num_of_images)
+{
+	int img_hdrs_size = (num_of_images * sizeof(struct image_hdr));
+	struct device *dev = &adapter->pdev->dev;
+	struct flash_section_info *fsec = NULL;
+	int status, i, filehdr_size, num_comp;
+	const struct flash_comp *pflashcomp;
+	bool crc_match;
+	const u8 *p;
+
+	struct flash_comp gen3_flash_types[] = {
+		{ FLASH_iSCSI_PRIMARY_IMAGE_START_g3, OPTYPE_ISCSI_ACTIVE,
+			FLASH_IMAGE_MAX_SIZE_g3, IMAGE_FIRMWARE_iSCSI},
+		{ FLASH_REDBOOT_START_g3, OPTYPE_REDBOOT,
+			FLASH_REDBOOT_IMAGE_MAX_SIZE_g3, IMAGE_BOOT_CODE},
+		{ FLASH_iSCSI_BIOS_START_g3, OPTYPE_BIOS,
+			FLASH_BIOS_IMAGE_MAX_SIZE_g3, IMAGE_OPTION_ROM_ISCSI},
+		{ FLASH_PXE_BIOS_START_g3, OPTYPE_PXE_BIOS,
+			FLASH_BIOS_IMAGE_MAX_SIZE_g3, IMAGE_OPTION_ROM_PXE},
+		{ FLASH_FCoE_BIOS_START_g3, OPTYPE_FCOE_BIOS,
+			FLASH_BIOS_IMAGE_MAX_SIZE_g3, IMAGE_OPTION_ROM_FCoE},
+		{ FLASH_iSCSI_BACKUP_IMAGE_START_g3, OPTYPE_ISCSI_BACKUP,
+			FLASH_IMAGE_MAX_SIZE_g3, IMAGE_FIRMWARE_BACKUP_iSCSI},
+		{ FLASH_FCoE_PRIMARY_IMAGE_START_g3, OPTYPE_FCOE_FW_ACTIVE,
+			FLASH_IMAGE_MAX_SIZE_g3, IMAGE_FIRMWARE_FCoE},
+		{ FLASH_FCoE_BACKUP_IMAGE_START_g3, OPTYPE_FCOE_FW_BACKUP,
+			FLASH_IMAGE_MAX_SIZE_g3, IMAGE_FIRMWARE_BACKUP_FCoE},
+		{ FLASH_NCSI_START_g3, OPTYPE_NCSI_FW,
+			FLASH_NCSI_IMAGE_MAX_SIZE_g3, IMAGE_NCSI},
+		{ FLASH_PHY_FW_START_g3, OPTYPE_PHY_FW,
+			FLASH_PHY_FW_IMAGE_MAX_SIZE_g3, IMAGE_FIRMWARE_PHY}
+	};
+
+	struct flash_comp gen2_flash_types[] = {
+		{ FLASH_iSCSI_PRIMARY_IMAGE_START_g2, OPTYPE_ISCSI_ACTIVE,
+			FLASH_IMAGE_MAX_SIZE_g2, IMAGE_FIRMWARE_iSCSI},
+		{ FLASH_REDBOOT_START_g2, OPTYPE_REDBOOT,
+			FLASH_REDBOOT_IMAGE_MAX_SIZE_g2, IMAGE_BOOT_CODE},
+		{ FLASH_iSCSI_BIOS_START_g2, OPTYPE_BIOS,
+			FLASH_BIOS_IMAGE_MAX_SIZE_g2, IMAGE_OPTION_ROM_ISCSI},
+		{ FLASH_PXE_BIOS_START_g2, OPTYPE_PXE_BIOS,
+			FLASH_BIOS_IMAGE_MAX_SIZE_g2, IMAGE_OPTION_ROM_PXE},
+		{ FLASH_FCoE_BIOS_START_g2, OPTYPE_FCOE_BIOS,
+			FLASH_BIOS_IMAGE_MAX_SIZE_g2, IMAGE_OPTION_ROM_FCoE},
+		{ FLASH_iSCSI_BACKUP_IMAGE_START_g2, OPTYPE_ISCSI_BACKUP,
+			FLASH_IMAGE_MAX_SIZE_g2, IMAGE_FIRMWARE_BACKUP_iSCSI},
+		{ FLASH_FCoE_PRIMARY_IMAGE_START_g2, OPTYPE_FCOE_FW_ACTIVE,
+			FLASH_IMAGE_MAX_SIZE_g2, IMAGE_FIRMWARE_FCoE},
+		{ FLASH_FCoE_BACKUP_IMAGE_START_g2, OPTYPE_FCOE_FW_BACKUP,
+			 FLASH_IMAGE_MAX_SIZE_g2, IMAGE_FIRMWARE_BACKUP_FCoE}
+	};
+
+	if (BE3_chip(adapter)) {
+		pflashcomp = gen3_flash_types;
+		filehdr_size = sizeof(struct flash_file_hdr_g3);
+		num_comp = ARRAY_SIZE(gen3_flash_types);
+	} else {
+		pflashcomp = gen2_flash_types;
+		filehdr_size = sizeof(struct flash_file_hdr_g2);
+		num_comp = ARRAY_SIZE(gen2_flash_types);
+	}
+
+	/* Get flash section info*/
+	fsec = get_fsec_info(adapter, filehdr_size + img_hdrs_size, fw);
+	if (!fsec) {
+		dev_err(dev, "Invalid Cookie. FW image may be corrupted\n");
+		return -1;
+	}
+	for (i = 0; i < num_comp; i++) {
+		if (!is_comp_in_ufi(adapter, fsec, pflashcomp[i].img_type))
+			continue;
+
+		if ((pflashcomp[i].optype == OPTYPE_NCSI_FW) &&
+		    memcmp(adapter->fw_ver, "3.102.148.0", 11) < 0)
+			continue;
+
+		if (pflashcomp[i].optype == OPTYPE_PHY_FW  &&
+		    !phy_flashing_required(adapter))
+				continue;
+
+		if (pflashcomp[i].optype == OPTYPE_REDBOOT) {
+			status = be_check_flash_crc(adapter, fw->data,
+						    pflashcomp[i].offset,
+						    pflashcomp[i].size,
+						    filehdr_size +
+						    img_hdrs_size,
+						    OPTYPE_REDBOOT, &crc_match);
+			if (status) {
+				dev_err(dev,
+					"Could not get CRC for 0x%x region\n",
+					pflashcomp[i].optype);
+				continue;
+			}
+
+			if (crc_match)
+				continue;
+		}
+
+		p = fw->data + filehdr_size + pflashcomp[i].offset +
+			img_hdrs_size;
+		if (p + pflashcomp[i].size > fw->data + fw->size)
+			return -1;
+
+		status = be_flash(adapter, p, flash_cmd, pflashcomp[i].optype,
+				  pflashcomp[i].size);
+		if (status) {
+			dev_err(dev, "Flashing section type 0x%x failed\n",
+				pflashcomp[i].img_type);
+			return status;
+		}
+	}
+	return 0;
+}
+
+static u16 be_get_img_optype(struct flash_section_entry fsec_entry)
+{
+	u32 img_type = le32_to_cpu(fsec_entry.type);
+	u16 img_optype = le16_to_cpu(fsec_entry.optype);
+
+	if (img_optype != 0xFFFF)
+		return img_optype;
+
+	switch (img_type) {
+	case IMAGE_FIRMWARE_iSCSI:
+		img_optype = OPTYPE_ISCSI_ACTIVE;
+		break;
+	case IMAGE_BOOT_CODE:
+		img_optype = OPTYPE_REDBOOT;
+		break;
+	case IMAGE_OPTION_ROM_ISCSI:
+		img_optype = OPTYPE_BIOS;
+		break;
+	case IMAGE_OPTION_ROM_PXE:
+		img_optype = OPTYPE_PXE_BIOS;
+		break;
+	case IMAGE_OPTION_ROM_FCoE:
+		img_optype = OPTYPE_FCOE_BIOS;
+		break;
+	case IMAGE_FIRMWARE_BACKUP_iSCSI:
+		img_optype = OPTYPE_ISCSI_BACKUP;
+		break;
+	case IMAGE_NCSI:
+		img_optype = OPTYPE_NCSI_FW;
+		break;
+	case IMAGE_FLASHISM_JUMPVECTOR:
+		img_optype = OPTYPE_FLASHISM_JUMPVECTOR;
+		break;
+	case IMAGE_FIRMWARE_PHY:
+		img_optype = OPTYPE_SH_PHY_FW;
+		break;
+	case IMAGE_REDBOOT_DIR:
+		img_optype = OPTYPE_REDBOOT_DIR;
+		break;
+	case IMAGE_REDBOOT_CONFIG:
+		img_optype = OPTYPE_REDBOOT_CONFIG;
+		break;
+	case IMAGE_UFI_DIR:
+		img_optype = OPTYPE_UFI_DIR;
+		break;
+	default:
+		break;
+	}
+
+	return img_optype;
+}
+
+static int be_flash_skyhawk(struct be_adapter *adapter,
+			    const struct firmware *fw,
+			    struct be_dma_mem *flash_cmd, int num_of_images)
+{
+	int img_hdrs_size = num_of_images * sizeof(struct image_hdr);
+	struct device *dev = &adapter->pdev->dev;
+	struct flash_section_info *fsec = NULL;
+	u32 img_offset, img_size, img_type;
+	int status, i, filehdr_size;
+	bool crc_match, old_fw_img;
+	u16 img_optype;
+	const u8 *p;
+
+	filehdr_size = sizeof(struct flash_file_hdr_g3);
+	fsec = get_fsec_info(adapter, filehdr_size + img_hdrs_size, fw);
+	if (!fsec) {
+		dev_err(dev, "Invalid Cookie. FW image may be corrupted\n");
+		return -EINVAL;
+	}
+
+	for (i = 0; i < le32_to_cpu(fsec->fsec_hdr.num_images); i++) {
+		img_offset = le32_to_cpu(fsec->fsec_entry[i].offset);
+		img_size   = le32_to_cpu(fsec->fsec_entry[i].pad_size);
+		img_type   = le32_to_cpu(fsec->fsec_entry[i].type);
+		img_optype = be_get_img_optype(fsec->fsec_entry[i]);
+		old_fw_img = fsec->fsec_entry[i].optype == 0xFFFF;
+
+		if (img_optype == 0xFFFF)
+			continue;
+		/* Don't bother verifying CRC if an old FW image is being
+		 * flashed
+		 */
+		if (old_fw_img)
+			goto flash;
+
+		status = be_check_flash_crc(adapter, fw->data, img_offset,
+					    img_size, filehdr_size +
+					    img_hdrs_size, img_optype,
+					    &crc_match);
+		/* The current FW image on the card does not recognize the new
+		 * FLASH op_type. The FW download is partially complete.
+		 * Reboot the server now to enable FW image to recognize the
+		 * new FLASH op_type. To complete the remaining process,
+		 * download the same FW again after the reboot.
+		 */
+		if (base_status(status) == MCC_STATUS_ILLEGAL_REQUEST ||
+		    base_status(status) == MCC_STATUS_ILLEGAL_FIELD) {
+			dev_err(dev, "Flash incomplete. Reset the server\n");
+			dev_err(dev, "Download FW image again after reset\n");
+			return -EAGAIN;
+		} else if (status) {
+			dev_err(dev, "Could not get CRC for 0x%x region\n",
+				img_optype);
+			return -EFAULT;
+		}
+
+		if (crc_match)
+			continue;
+
+flash:
+		p = fw->data + filehdr_size + img_offset + img_hdrs_size;
+		if (p + img_size > fw->data + fw->size)
+			return -1;
+
+		status = be_flash(adapter, p, flash_cmd, img_optype, img_size);
+		/* For old FW images ignore ILLEGAL_FIELD error or errors on
+		 * UFI_DIR region
+		 */
+		if (old_fw_img &&
+		    (base_status(status) == MCC_STATUS_ILLEGAL_FIELD ||
+		     (img_optype == OPTYPE_UFI_DIR &&
+		      base_status(status) == MCC_STATUS_FAILED))) {
+			continue;
+		} else if (status) {
+			dev_err(dev, "Flashing section type 0x%x failed\n",
+				img_type);
+			return -EFAULT;
+		}
+	}
+	return 0;
+}
+
+static int lancer_fw_download(struct be_adapter *adapter,
+			      const struct firmware *fw)
+{
+#define LANCER_FW_DOWNLOAD_CHUNK      (32 * 1024)
+#define LANCER_FW_DOWNLOAD_LOCATION   "/prg"
+	struct device *dev = &adapter->pdev->dev;
+	struct be_dma_mem flash_cmd;
+	const u8 *data_ptr = NULL;
+	u8 *dest_image_ptr = NULL;
+	size_t image_size = 0;
+	u32 chunk_size = 0;
+	u32 data_written = 0;
+	u32 offset = 0;
+	int status = 0;
+	u8 add_status = 0;
+	u8 change_status;
+
+	if (!IS_ALIGNED(fw->size, sizeof(u32))) {
+		dev_err(dev, "FW image size should be multiple of 4\n");
+		return -EINVAL;
+	}
+
+	flash_cmd.size = sizeof(struct lancer_cmd_req_write_object)
+				+ LANCER_FW_DOWNLOAD_CHUNK;
+	flash_cmd.va = dma_alloc_coherent(dev, flash_cmd.size,
+					  &flash_cmd.dma, GFP_KERNEL);
+	if (!flash_cmd.va)
+		return -ENOMEM;
+
+	dest_image_ptr = flash_cmd.va +
+				sizeof(struct lancer_cmd_req_write_object);
+	image_size = fw->size;
+	data_ptr = fw->data;
+
+	while (image_size) {
+		chunk_size = min_t(u32, image_size, LANCER_FW_DOWNLOAD_CHUNK);
+
+		/* Copy the image chunk content. */
+		memcpy(dest_image_ptr, data_ptr, chunk_size);
+
+		status = lancer_cmd_write_object(adapter, &flash_cmd,
+						 chunk_size, offset,
+						 LANCER_FW_DOWNLOAD_LOCATION,
+						 &data_written, &change_status,
+						 &add_status);
+		if (status)
+			break;
+
+		offset += data_written;
+		data_ptr += data_written;
+		image_size -= data_written;
+	}
+
+	if (!status) {
+		/* Commit the FW written */
+		status = lancer_cmd_write_object(adapter, &flash_cmd,
+						 0, offset,
+						 LANCER_FW_DOWNLOAD_LOCATION,
+						 &data_written, &change_status,
+						 &add_status);
+	}
+
+	dma_free_coherent(dev, flash_cmd.size, flash_cmd.va, flash_cmd.dma);
+	if (status) {
+		dev_err(dev, "Firmware load error\n");
+		return be_cmd_status(status);
+	}
+
+	dev_info(dev, "Firmware flashed successfully\n");
+
+	if (change_status == LANCER_FW_RESET_NEEDED) {
+		dev_info(dev, "Resetting adapter to activate new FW\n");
+		status = lancer_physdev_ctrl(adapter,
+					     PHYSDEV_CONTROL_FW_RESET_MASK);
+		if (status) {
+			dev_err(dev, "Adapter busy, could not reset FW\n");
+			dev_err(dev, "Reboot server to activate new FW\n");
+		}
+	} else if (change_status != LANCER_NO_RESET_NEEDED) {
+		dev_info(dev, "Reboot server to activate new FW\n");
+	}
+
+	return 0;
+}
+
+#define UFI_TYPE2		2
+#define UFI_TYPE3		3
+#define UFI_TYPE3R		10
+#define UFI_TYPE4		4
+static int be_get_ufi_type(struct be_adapter *adapter,
+			   struct flash_file_hdr_g3 *fhdr)
+{
+	if (!fhdr)
+		goto be_get_ufi_exit;
+
+	if (skyhawk_chip(adapter) && fhdr->build[0] == '4')
+		return UFI_TYPE4;
+	else if (BE3_chip(adapter) && fhdr->build[0] == '3') {
+		if (fhdr->asic_type_rev == 0x10)
+			return UFI_TYPE3R;
+		else
+			return UFI_TYPE3;
+	} else if (BE2_chip(adapter) && fhdr->build[0] == '2')
+		return UFI_TYPE2;
+
+be_get_ufi_exit:
+	dev_err(&adapter->pdev->dev,
+		"UFI and Interface are not compatible for flashing\n");
+	return -1;
+}
+
+static int be_fw_download(struct be_adapter *adapter, const struct firmware* fw)
+{
+	struct flash_file_hdr_g3 *fhdr3;
+	struct image_hdr *img_hdr_ptr = NULL;
+	struct be_dma_mem flash_cmd;
+	const u8 *p;
+	int status = 0, i = 0, num_imgs = 0, ufi_type = 0;
+
+	flash_cmd.size = sizeof(struct be_cmd_write_flashrom);
+	flash_cmd.va = dma_alloc_coherent(&adapter->pdev->dev, flash_cmd.size,
+					  &flash_cmd.dma, GFP_KERNEL);
+	if (!flash_cmd.va) {
+		status = -ENOMEM;
+		goto be_fw_exit;
+	}
+
+	p = fw->data;
+	fhdr3 = (struct flash_file_hdr_g3 *)p;
+
+	ufi_type = be_get_ufi_type(adapter, fhdr3);
+
+	num_imgs = le32_to_cpu(fhdr3->num_imgs);
+	for (i = 0; i < num_imgs; i++) {
+		img_hdr_ptr = (struct image_hdr *)(fw->data +
+				(sizeof(struct flash_file_hdr_g3) +
+				 i * sizeof(struct image_hdr)));
+		if (le32_to_cpu(img_hdr_ptr->imageid) == 1) {
+			switch (ufi_type) {
+			case UFI_TYPE4:
+				status = be_flash_skyhawk(adapter, fw,
+							  &flash_cmd, num_imgs);
+				break;
+			case UFI_TYPE3R:
+				status = be_flash_BEx(adapter, fw, &flash_cmd,
+						      num_imgs);
+				break;
+			case UFI_TYPE3:
+				/* Do not flash this ufi on BE3-R cards */
+				if (adapter->asic_rev < 0x10)
+					status = be_flash_BEx(adapter, fw,
+							      &flash_cmd,
+							      num_imgs);
+				else {
+					status = -EINVAL;
+					dev_err(&adapter->pdev->dev,
+						"Can't load BE3 UFI on BE3R\n");
+				}
+			}
+		}
+	}
+
+	if (ufi_type == UFI_TYPE2)
+		status = be_flash_BEx(adapter, fw, &flash_cmd, 0);
+	else if (ufi_type == -1)
+		status = -EINVAL;
+
+	dma_free_coherent(&adapter->pdev->dev, flash_cmd.size, flash_cmd.va,
+			  flash_cmd.dma);
+	if (status) {
+		dev_err(&adapter->pdev->dev, "Firmware load error\n");
+		goto be_fw_exit;
+	}
+
+	dev_info(&adapter->pdev->dev, "Firmware flashed successfully\n");
+
+be_fw_exit:
+	return status;
+}
+
+int be_load_fw(struct be_adapter *adapter, u8 *fw_file)
+{
+	const struct firmware *fw;
+	int status;
+
+	if (!netif_running(adapter->netdev)) {
+		dev_err(&adapter->pdev->dev,
+			"Firmware load not allowed (interface is down)\n");
+		return -ENETDOWN;
+	}
+
+	status = request_firmware(&fw, fw_file, &adapter->pdev->dev);
+	if (status)
+		goto fw_exit;
+
+	dev_info(&adapter->pdev->dev, "Flashing firmware file %s\n", fw_file);
+
+	if (lancer_chip(adapter))
+		status = lancer_fw_download(adapter, fw);
+	else
+		status = be_fw_download(adapter, fw);
+
+	if (!status)
+		be_cmd_get_fw_ver(adapter);
+
+fw_exit:
+	release_firmware(fw);
+	return status;
+}
+
+static int be_ndo_bridge_setlink(struct net_device *dev, struct nlmsghdr *nlh)
+{
+	struct be_adapter *adapter = netdev_priv(dev);
+	struct nlattr *attr, *br_spec;
+	int rem;
+	int status = 0;
+	u16 mode = 0;
+
+	if (!sriov_enabled(adapter))
+		return -EOPNOTSUPP;
+
+	br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC);
+	if (!br_spec)
+		return -EINVAL;
+
+	nla_for_each_nested(attr, br_spec, rem) {
+		if (nla_type(attr) != IFLA_BRIDGE_MODE)
+			continue;
+
+		if (nla_len(attr) < sizeof(mode))
+			return -EINVAL;
+
+		mode = nla_get_u16(attr);
+		if (mode != BRIDGE_MODE_VEPA && mode != BRIDGE_MODE_VEB)
+			return -EINVAL;
+
+		status = be_cmd_set_hsw_config(adapter, 0, 0,
+					       adapter->if_handle,
+					       mode == BRIDGE_MODE_VEPA ?
+					       PORT_FWD_TYPE_VEPA :
+					       PORT_FWD_TYPE_VEB);
+		if (status)
+			goto err;
+
+		dev_info(&adapter->pdev->dev, "enabled switch mode: %s\n",
+			 mode == BRIDGE_MODE_VEPA ? "VEPA" : "VEB");
+
+		return status;
+	}
+err:
+	dev_err(&adapter->pdev->dev, "Failed to set switch mode %s\n",
+		mode == BRIDGE_MODE_VEPA ? "VEPA" : "VEB");
+
+	return status;
+}
+
+static int be_ndo_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
+				 struct net_device *dev, u32 filter_mask)
+{
+	struct be_adapter *adapter = netdev_priv(dev);
+	int status = 0;
+	u8 hsw_mode;
+
+	if (!sriov_enabled(adapter))
+		return 0;
+
+	/* BE and Lancer chips support VEB mode only */
+	if (BEx_chip(adapter) || lancer_chip(adapter)) {
+		hsw_mode = PORT_FWD_TYPE_VEB;
+	} else {
+		status = be_cmd_get_hsw_config(adapter, NULL, 0,
+					       adapter->if_handle, &hsw_mode);
+		if (status)
+			return 0;
+	}
+
+	return ndo_dflt_bridge_getlink(skb, pid, seq, dev,
+				       hsw_mode == PORT_FWD_TYPE_VEPA ?
+				       BRIDGE_MODE_VEPA : BRIDGE_MODE_VEB);
+}
+
+#ifdef CONFIG_BE2NET_VXLAN
+static void be_add_vxlan_port(struct net_device *netdev, sa_family_t sa_family,
+			      __be16 port)
+{
+	struct be_adapter *adapter = netdev_priv(netdev);
+	struct device *dev = &adapter->pdev->dev;
+	int status;
+
+	if (lancer_chip(adapter) || BEx_chip(adapter))
+		return;
+
+	if (adapter->flags & BE_FLAGS_VXLAN_OFFLOADS) {
+		dev_warn(dev, "Cannot add UDP port %d for VxLAN offloads\n",
+			 be16_to_cpu(port));
+		dev_info(dev,
+			 "Only one UDP port supported for VxLAN offloads\n");
+		return;
+	}
+
+	status = be_cmd_manage_iface(adapter, adapter->if_handle,
+				     OP_CONVERT_NORMAL_TO_TUNNEL);
+	if (status) {
+		dev_warn(dev, "Failed to convert normal interface to tunnel\n");
+		goto err;
+	}
+
+	status = be_cmd_set_vxlan_port(adapter, port);
+	if (status) {
+		dev_warn(dev, "Failed to add VxLAN port\n");
+		goto err;
+	}
+	adapter->flags |= BE_FLAGS_VXLAN_OFFLOADS;
+	adapter->vxlan_port = port;
+
+	dev_info(dev, "Enabled VxLAN offloads for UDP port %d\n",
+		 be16_to_cpu(port));
+	return;
+err:
+	be_disable_vxlan_offloads(adapter);
+}
+
+static void be_del_vxlan_port(struct net_device *netdev, sa_family_t sa_family,
+			      __be16 port)
+{
+	struct be_adapter *adapter = netdev_priv(netdev);
+
+	if (lancer_chip(adapter) || BEx_chip(adapter))
+		return;
+
+	if (adapter->vxlan_port != port)
+		return;
+
+	be_disable_vxlan_offloads(adapter);
+
+	dev_info(&adapter->pdev->dev,
+		 "Disabled VxLAN offloads for UDP port %d\n",
+		 be16_to_cpu(port));
+}
+
+static bool be_gso_check(struct sk_buff *skb, struct net_device *dev)
+{
+	return vxlan_gso_check(skb);
+}
+#endif
+
+static const struct net_device_ops be_netdev_ops = {
+	.ndo_open		= be_open,
+	.ndo_stop		= be_close,
+	.ndo_start_xmit		= be_xmit,
+	.ndo_set_rx_mode	= be_set_rx_mode,
+	.ndo_set_mac_address	= be_mac_addr_set,
+	.ndo_change_mtu		= be_change_mtu,
+	.ndo_get_stats64	= be_get_stats64,
+	.ndo_validate_addr	= eth_validate_addr,
+	.ndo_vlan_rx_add_vid	= be_vlan_add_vid,
+	.ndo_vlan_rx_kill_vid	= be_vlan_rem_vid,
+	.ndo_set_vf_mac		= be_set_vf_mac,
+	.ndo_set_vf_vlan	= be_set_vf_vlan,
+	.ndo_set_vf_rate	= be_set_vf_tx_rate,
+	.ndo_get_vf_config	= be_get_vf_config,
+	.ndo_set_vf_link_state  = be_set_vf_link_state,
+#ifdef CONFIG_NET_POLL_CONTROLLER
+	.ndo_poll_controller	= be_netpoll,
+#endif
+	.ndo_bridge_setlink	= be_ndo_bridge_setlink,
+	.ndo_bridge_getlink	= be_ndo_bridge_getlink,
+#ifdef CONFIG_NET_RX_BUSY_POLL
+	.ndo_busy_poll		= be_busy_poll,
+#endif
+#ifdef CONFIG_BE2NET_VXLAN
+	.ndo_add_vxlan_port	= be_add_vxlan_port,
+	.ndo_del_vxlan_port	= be_del_vxlan_port,
+	.ndo_gso_check		= be_gso_check,
+#endif
+};
+
+static void be_netdev_init(struct net_device *netdev)
+{
+	struct be_adapter *adapter = netdev_priv(netdev);
+
+	if (skyhawk_chip(adapter)) {
+		netdev->hw_enc_features |= NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM |
+					   NETIF_F_TSO | NETIF_F_TSO6 |
+					   NETIF_F_GSO_UDP_TUNNEL;
+		netdev->hw_features |= NETIF_F_GSO_UDP_TUNNEL;
+	}
+	netdev->hw_features |= NETIF_F_SG | NETIF_F_TSO | NETIF_F_TSO6 |
+		NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | NETIF_F_RXCSUM |
+		NETIF_F_HW_VLAN_CTAG_TX;
+	if (be_multi_rxq(adapter))
+		netdev->hw_features |= NETIF_F_RXHASH;
+
+	netdev->features |= netdev->hw_features |
+		NETIF_F_HW_VLAN_CTAG_RX | NETIF_F_HW_VLAN_CTAG_FILTER;
+
+	netdev->vlan_features |= NETIF_F_SG | NETIF_F_TSO | NETIF_F_TSO6 |
+		NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;
+
+	netdev->priv_flags |= IFF_UNICAST_FLT;
+
+	netdev->flags |= IFF_MULTICAST;
+
+	netif_set_gso_max_size(netdev, 65535 - ETH_HLEN);
+
+	netdev->netdev_ops = &be_netdev_ops;
+
+	netdev->ethtool_ops = &be_ethtool_ops;
+}
+
+static void be_unmap_pci_bars(struct be_adapter *adapter)
+{
+	if (adapter->csr)
+		pci_iounmap(adapter->pdev, adapter->csr);
+	if (adapter->db)
+		pci_iounmap(adapter->pdev, adapter->db);
+}
+
+static int db_bar(struct be_adapter *adapter)
+{
+	if (lancer_chip(adapter) || !be_physfn(adapter))
+		return 0;
+	else
+		return 4;
+}
+
+static int be_roce_map_pci_bars(struct be_adapter *adapter)
+{
+	if (skyhawk_chip(adapter)) {
+		adapter->roce_db.size = 4096;
+		adapter->roce_db.io_addr = pci_resource_start(adapter->pdev,
+							      db_bar(adapter));
+		adapter->roce_db.total_size = pci_resource_len(adapter->pdev,
+							       db_bar(adapter));
+	}
+	return 0;
+}
+
+static int be_map_pci_bars(struct be_adapter *adapter)
+{
+	u8 __iomem *addr;
+
+	if (BEx_chip(adapter) && be_physfn(adapter)) {
+		adapter->csr = pci_iomap(adapter->pdev, 2, 0);
+		if (!adapter->csr)
+			return -ENOMEM;
+	}
+
+	addr = pci_iomap(adapter->pdev, db_bar(adapter), 0);
+	if (!addr)
+		goto pci_map_err;
+	adapter->db = addr;
+
+	be_roce_map_pci_bars(adapter);
+	return 0;
+
+pci_map_err:
+	dev_err(&adapter->pdev->dev, "Error in mapping PCI BARs\n");
+	be_unmap_pci_bars(adapter);
+	return -ENOMEM;
+}
+
+static void be_ctrl_cleanup(struct be_adapter *adapter)
+{
+	struct be_dma_mem *mem = &adapter->mbox_mem_alloced;
+
+	be_unmap_pci_bars(adapter);
+
+	if (mem->va)
+		dma_free_coherent(&adapter->pdev->dev, mem->size, mem->va,
+				  mem->dma);
+
+	mem = &adapter->rx_filter;
+	if (mem->va)
+		dma_free_coherent(&adapter->pdev->dev, mem->size, mem->va,
+				  mem->dma);
+}
+
+static int be_ctrl_init(struct be_adapter *adapter)
+{
+	struct be_dma_mem *mbox_mem_alloc = &adapter->mbox_mem_alloced;
+	struct be_dma_mem *mbox_mem_align = &adapter->mbox_mem;
+	struct be_dma_mem *rx_filter = &adapter->rx_filter;
+	u32 sli_intf;
+	int status;
+
+	pci_read_config_dword(adapter->pdev, SLI_INTF_REG_OFFSET, &sli_intf);
+	adapter->sli_family = (sli_intf & SLI_INTF_FAMILY_MASK) >>
+				 SLI_INTF_FAMILY_SHIFT;
+	adapter->virtfn = (sli_intf & SLI_INTF_FT_MASK) ? 1 : 0;
+
+	status = be_map_pci_bars(adapter);
+	if (status)
+		goto done;
+
+	mbox_mem_alloc->size = sizeof(struct be_mcc_mailbox) + 16;
+	mbox_mem_alloc->va = dma_alloc_coherent(&adapter->pdev->dev,
+						mbox_mem_alloc->size,
+						&mbox_mem_alloc->dma,
+						GFP_KERNEL);
+	if (!mbox_mem_alloc->va) {
+		status = -ENOMEM;
+		goto unmap_pci_bars;
+	}
+	mbox_mem_align->size = sizeof(struct be_mcc_mailbox);
+	mbox_mem_align->va = PTR_ALIGN(mbox_mem_alloc->va, 16);
+	mbox_mem_align->dma = PTR_ALIGN(mbox_mem_alloc->dma, 16);
+	memset(mbox_mem_align->va, 0, sizeof(struct be_mcc_mailbox));
+
+	rx_filter->size = sizeof(struct be_cmd_req_rx_filter);
+	rx_filter->va = dma_zalloc_coherent(&adapter->pdev->dev,
+					    rx_filter->size, &rx_filter->dma,
+					    GFP_KERNEL);
+	if (!rx_filter->va) {
+		status = -ENOMEM;
+		goto free_mbox;
+	}
+
+	mutex_init(&adapter->mbox_lock);
+	spin_lock_init(&adapter->mcc_lock);
+	spin_lock_init(&adapter->mcc_cq_lock);
+
+	init_completion(&adapter->et_cmd_compl);
+	pci_save_state(adapter->pdev);
+	return 0;
+
+free_mbox:
+	dma_free_coherent(&adapter->pdev->dev, mbox_mem_alloc->size,
+			  mbox_mem_alloc->va, mbox_mem_alloc->dma);
+
+unmap_pci_bars:
+	be_unmap_pci_bars(adapter);
+
+done:
+	return status;
+}
+
+static void be_stats_cleanup(struct be_adapter *adapter)
+{
+	struct be_dma_mem *cmd = &adapter->stats_cmd;
+
+	if (cmd->va)
+		dma_free_coherent(&adapter->pdev->dev, cmd->size,
+				  cmd->va, cmd->dma);
+}
+
+static int be_stats_init(struct be_adapter *adapter)
+{
+	struct be_dma_mem *cmd = &adapter->stats_cmd;
+
+	if (lancer_chip(adapter))
+		cmd->size = sizeof(struct lancer_cmd_req_pport_stats);
+	else if (BE2_chip(adapter))
+		cmd->size = sizeof(struct be_cmd_req_get_stats_v0);
+	else if (BE3_chip(adapter))
+		cmd->size = sizeof(struct be_cmd_req_get_stats_v1);
+	else
+		/* ALL non-BE ASICs */
+		cmd->size = sizeof(struct be_cmd_req_get_stats_v2);
+
+	cmd->va = dma_zalloc_coherent(&adapter->pdev->dev, cmd->size, &cmd->dma,
+				      GFP_KERNEL);
+	if (!cmd->va)
+		return -ENOMEM;
+	return 0;
+}
+
+static void be_remove(struct pci_dev *pdev)
+{
+	struct be_adapter *adapter = pci_get_drvdata(pdev);
+
+	if (!adapter)
+		return;
+
+	be_roce_dev_remove(adapter);
+	be_intr_set(adapter, false);
+
+	cancel_delayed_work_sync(&adapter->func_recovery_work);
+
+	unregister_netdev(adapter->netdev);
+
+	be_clear(adapter);
+
+	/* tell fw we're done with firing cmds */
+	be_cmd_fw_clean(adapter);
+
+	be_stats_cleanup(adapter);
+
+	be_ctrl_cleanup(adapter);
+
+	pci_disable_pcie_error_reporting(pdev);
+
+	pci_release_regions(pdev);
+	pci_disable_device(pdev);
+
+	free_netdev(adapter->netdev);
+}
+
+static int be_get_initial_config(struct be_adapter *adapter)
+{
+	int status, level;
+
+	status = be_cmd_get_cntl_attributes(adapter);
+	if (status)
+		return status;
+
+	/* Must be a power of 2 or else MODULO will BUG_ON */
+	adapter->be_get_temp_freq = 64;
+
+	if (BEx_chip(adapter)) {
+		level = be_cmd_get_fw_log_level(adapter);
+		adapter->msg_enable =
+			level <= FW_LOG_LEVEL_DEFAULT ? NETIF_MSG_HW : 0;
+	}
+
+	adapter->cfg_num_qs = netif_get_num_default_rss_queues();
+	return 0;
+}
+
+static int lancer_recover_func(struct be_adapter *adapter)
+{
+	struct device *dev = &adapter->pdev->dev;
+	int status;
+
+	status = lancer_test_and_set_rdy_state(adapter);
+	if (status)
+		goto err;
+
+	if (netif_running(adapter->netdev))
+		be_close(adapter->netdev);
+
+	be_clear(adapter);
+
+	be_clear_all_error(adapter);
+
+	status = be_setup(adapter);
+	if (status)
+		goto err;
+
+	if (netif_running(adapter->netdev)) {
+		status = be_open(adapter->netdev);
+		if (status)
+			goto err;
+	}
+
+	dev_err(dev, "Adapter recovery successful\n");
+	return 0;
+err:
+	if (status == -EAGAIN)
+		dev_err(dev, "Waiting for resource provisioning\n");
+	else
+		dev_err(dev, "Adapter recovery failed\n");
+
+	return status;
+}
+
+static void be_func_recovery_task(struct work_struct *work)
+{
+	struct be_adapter *adapter =
+		container_of(work, struct be_adapter,  func_recovery_work.work);
+	int status = 0;
+
+	be_detect_error(adapter);
+
+	if (adapter->hw_error && lancer_chip(adapter)) {
+		rtnl_lock();
+		netif_device_detach(adapter->netdev);
+		rtnl_unlock();
+
+		status = lancer_recover_func(adapter);
+		if (!status)
+			netif_device_attach(adapter->netdev);
+	}
+
+	/* In Lancer, for all errors other than provisioning error (-EAGAIN),
+	 * no need to attempt further recovery.
+	 */
+	if (!status || status == -EAGAIN)
+		schedule_delayed_work(&adapter->func_recovery_work,
+				      msecs_to_jiffies(1000));
+}
+
+static void be_worker(struct work_struct *work)
+{
+	struct be_adapter *adapter =
+		container_of(work, struct be_adapter, work.work);
+	struct be_rx_obj *rxo;
+	int i;
+
+	/* when interrupts are not yet enabled, just reap any pending
+	* mcc completions */
+	if (!netif_running(adapter->netdev)) {
+		local_bh_disable();
+		be_process_mcc(adapter);
+		local_bh_enable();
+		goto reschedule;
+	}
+
+	if (!adapter->stats_cmd_sent) {
+		if (lancer_chip(adapter))
+			lancer_cmd_get_pport_stats(adapter,
+						   &adapter->stats_cmd);
+		else
+			be_cmd_get_stats(adapter, &adapter->stats_cmd);
+	}
+
+	if (be_physfn(adapter) &&
+	    MODULO(adapter->work_counter, adapter->be_get_temp_freq) == 0)
+		be_cmd_get_die_temperature(adapter);
+
+	for_all_rx_queues(adapter, rxo, i) {
+		/* Replenish RX-queues starved due to memory
+		 * allocation failures.
+		 */
+		if (rxo->rx_post_starved)
+			be_post_rx_frags(rxo, GFP_KERNEL, MAX_RX_POST);
+	}
+
+	be_eqd_update(adapter);
+
+reschedule:
+	adapter->work_counter++;
+	schedule_delayed_work(&adapter->work, msecs_to_jiffies(1000));
+}
+
+/* If any VFs are already enabled don't FLR the PF */
+static bool be_reset_required(struct be_adapter *adapter)
+{
+	return pci_num_vf(adapter->pdev) ? false : true;
+}
+
+static char *mc_name(struct be_adapter *adapter)
+{
+	char *str = "";	/* default */
+
+	switch (adapter->mc_type) {
+	case UMC:
+		str = "UMC";
+		break;
+	case FLEX10:
+		str = "FLEX10";
+		break;
+	case vNIC1:
+		str = "vNIC-1";
+		break;
+	case nPAR:
+		str = "nPAR";
+		break;
+	case UFP:
+		str = "UFP";
+		break;
+	case vNIC2:
+		str = "vNIC-2";
+		break;
+	default:
+		str = "";
+	}
+
+	return str;
+}
+
+static inline char *func_name(struct be_adapter *adapter)
+{
+	return be_physfn(adapter) ? "PF" : "VF";
+}
+
+static int be_probe(struct pci_dev *pdev, const struct pci_device_id *pdev_id)
+{
+	int status = 0;
+	struct be_adapter *adapter;
+	struct net_device *netdev;
+	char port_name;
+
+	dev_info(&pdev->dev, "%s version is %s\n", DRV_NAME, DRV_VER);
+
+	status = pci_enable_device(pdev);
+	if (status)
+		goto do_none;
+
+	status = pci_request_regions(pdev, DRV_NAME);
+	if (status)
+		goto disable_dev;
+	pci_set_master(pdev);
+
+	netdev = alloc_etherdev_mqs(sizeof(*adapter), MAX_TX_QS, MAX_RX_QS);
+	if (!netdev) {
+		status = -ENOMEM;
+		goto rel_reg;
+	}
+	adapter = netdev_priv(netdev);
+	adapter->pdev = pdev;
+	pci_set_drvdata(pdev, adapter);
+	adapter->netdev = netdev;
+	SET_NETDEV_DEV(netdev, &pdev->dev);
+
+	status = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));
+	if (!status) {
+		netdev->features |= NETIF_F_HIGHDMA;
+	} else {
+		status = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32));
+		if (status) {
+			dev_err(&pdev->dev, "Could not set PCI DMA Mask\n");
+			goto free_netdev;
+		}
+	}
+
+	status = pci_enable_pcie_error_reporting(pdev);
+	if (!status)
+		dev_info(&pdev->dev, "PCIe error reporting enabled\n");
+
+	status = be_ctrl_init(adapter);
+	if (status)
+		goto free_netdev;
+
+	/* sync up with fw's ready state */
+	if (be_physfn(adapter)) {
+		status = be_fw_wait_ready(adapter);
+		if (status)
+			goto ctrl_clean;
+	}
+
+	if (be_reset_required(adapter)) {
+		status = be_cmd_reset_function(adapter);
+		if (status)
+			goto ctrl_clean;
+
+		/* Wait for interrupts to quiesce after an FLR */
+		msleep(100);
+	}
+
+	/* Allow interrupts for other ULPs running on NIC function */
+	be_intr_set(adapter, true);
+
+	/* tell fw we're ready to fire cmds */
+	status = be_cmd_fw_init(adapter);
+	if (status)
+		goto ctrl_clean;
+
+	status = be_stats_init(adapter);
+	if (status)
+		goto ctrl_clean;
+
+	status = be_get_initial_config(adapter);
+	if (status)
+		goto stats_clean;
+
+	INIT_DELAYED_WORK(&adapter->work, be_worker);
+	INIT_DELAYED_WORK(&adapter->func_recovery_work, be_func_recovery_task);
+	adapter->rx_fc = true;
+	adapter->tx_fc = true;
+
+	status = be_setup(adapter);
+	if (status)
+		goto stats_clean;
+
+	be_netdev_init(netdev);
+	status = register_netdev(netdev);
+	if (status != 0)
+		goto unsetup;
+
+	be_roce_dev_add(adapter);
+
+	schedule_delayed_work(&adapter->func_recovery_work,
+			      msecs_to_jiffies(1000));
+
+	be_cmd_query_port_name(adapter, &port_name);
+
+	dev_info(&pdev->dev, "%s: %s %s port %c\n", nic_name(pdev),
+		 func_name(adapter), mc_name(adapter), port_name);
+
+	return 0;
+
+unsetup:
+	be_clear(adapter);
+stats_clean:
+	be_stats_cleanup(adapter);
+ctrl_clean:
+	be_ctrl_cleanup(adapter);
+free_netdev:
+	free_netdev(netdev);
+rel_reg:
+	pci_release_regions(pdev);
+disable_dev:
+	pci_disable_device(pdev);
+do_none:
+	dev_err(&pdev->dev, "%s initialization failed\n", nic_name(pdev));
+	return status;
+}
+
+static int be_suspend(struct pci_dev *pdev, pm_message_t state)
+{
+	struct be_adapter *adapter = pci_get_drvdata(pdev);
+	struct net_device *netdev =  adapter->netdev;
+
+	if (adapter->wol_en)
+		be_setup_wol(adapter, true);
+
+	be_intr_set(adapter, false);
+	cancel_delayed_work_sync(&adapter->func_recovery_work);
+
+	netif_device_detach(netdev);
+	if (netif_running(netdev)) {
+		rtnl_lock();
+		be_close(netdev);
+		rtnl_unlock();
+	}
+	be_clear(adapter);
+
+	pci_save_state(pdev);
+	pci_disable_device(pdev);
+	pci_set_power_state(pdev, pci_choose_state(pdev, state));
+	return 0;
+}
+
+static int be_resume(struct pci_dev *pdev)
+{
+	int status = 0;
+	struct be_adapter *adapter = pci_get_drvdata(pdev);
+	struct net_device *netdev =  adapter->netdev;
+
+	netif_device_detach(netdev);
+
+	status = pci_enable_device(pdev);
+	if (status)
+		return status;
+
+	pci_set_power_state(pdev, PCI_D0);
+	pci_restore_state(pdev);
+
+	status = be_fw_wait_ready(adapter);
+	if (status)
+		return status;
+
+	be_intr_set(adapter, true);
+	/* tell fw we're ready to fire cmds */
+	status = be_cmd_fw_init(adapter);
+	if (status)
+		return status;
+
+	be_setup(adapter);
+	if (netif_running(netdev)) {
+		rtnl_lock();
+		be_open(netdev);
+		rtnl_unlock();
+	}
+
+	schedule_delayed_work(&adapter->func_recovery_work,
+			      msecs_to_jiffies(1000));
+	netif_device_attach(netdev);
+
+	if (adapter->wol_en)
+		be_setup_wol(adapter, false);
+
+	return 0;
+}
+
+/*
+ * An FLR will stop BE from DMAing any data.
+ */
+static void be_shutdown(struct pci_dev *pdev)
+{
+	struct be_adapter *adapter = pci_get_drvdata(pdev);
+
+	if (!adapter)
+		return;
+
+	be_roce_dev_shutdown(adapter);
+	cancel_delayed_work_sync(&adapter->work);
+	cancel_delayed_work_sync(&adapter->func_recovery_work);
+
+	netif_device_detach(adapter->netdev);
+
+	be_cmd_reset_function(adapter);
+
+	pci_disable_device(pdev);
+}
+
+static pci_ers_result_t be_eeh_err_detected(struct pci_dev *pdev,
+					    pci_channel_state_t state)
+{
+	struct be_adapter *adapter = pci_get_drvdata(pdev);
+	struct net_device *netdev =  adapter->netdev;
+
+	dev_err(&adapter->pdev->dev, "EEH error detected\n");
+
+	if (!adapter->eeh_error) {
+		adapter->eeh_error = true;
+
+		cancel_delayed_work_sync(&adapter->func_recovery_work);
+
+		rtnl_lock();
+		netif_device_detach(netdev);
+		if (netif_running(netdev))
+			be_close(netdev);
+		rtnl_unlock();
+
+		be_clear(adapter);
+	}
+
+	if (state == pci_channel_io_perm_failure)
+		return PCI_ERS_RESULT_DISCONNECT;
+
+	pci_disable_device(pdev);
+
+	/* The error could cause the FW to trigger a flash debug dump.
+	 * Resetting the card while flash dump is in progress
+	 * can cause it not to recover; wait for it to finish.
+	 * Wait only for first function as it is needed only once per
+	 * adapter.
+	 */
+	if (pdev->devfn == 0)
+		ssleep(30);
+
+	return PCI_ERS_RESULT_NEED_RESET;
+}
+
+static pci_ers_result_t be_eeh_reset(struct pci_dev *pdev)
+{
+	struct be_adapter *adapter = pci_get_drvdata(pdev);
+	int status;
+
+	dev_info(&adapter->pdev->dev, "EEH reset\n");
+
+	status = pci_enable_device(pdev);
+	if (status)
+		return PCI_ERS_RESULT_DISCONNECT;
+
+	pci_set_master(pdev);
+	pci_set_power_state(pdev, PCI_D0);
+	pci_restore_state(pdev);
+
+	/* Check if card is ok and fw is ready */
+	dev_info(&adapter->pdev->dev,
+		 "Waiting for FW to be ready after EEH reset\n");
+	status = be_fw_wait_ready(adapter);
+	if (status)
+		return PCI_ERS_RESULT_DISCONNECT;
+
+	pci_cleanup_aer_uncorrect_error_status(pdev);
+	be_clear_all_error(adapter);
+	return PCI_ERS_RESULT_RECOVERED;
+}
+
+static void be_eeh_resume(struct pci_dev *pdev)
+{
+	int status = 0;
+	struct be_adapter *adapter = pci_get_drvdata(pdev);
+	struct net_device *netdev =  adapter->netdev;
+
+	dev_info(&adapter->pdev->dev, "EEH resume\n");
+
+	pci_save_state(pdev);
+
+	status = be_cmd_reset_function(adapter);
+	if (status)
+		goto err;
+
+	/* On some BE3 FW versions, after a HW reset,
+	 * interrupts will remain disabled for each function.
+	 * So, explicitly enable interrupts
+	 */
+	be_intr_set(adapter, true);
+
+	/* tell fw we're ready to fire cmds */
+	status = be_cmd_fw_init(adapter);
+	if (status)
+		goto err;
+
+	status = be_setup(adapter);
+	if (status)
+		goto err;
+
+	if (netif_running(netdev)) {
+		status = be_open(netdev);
+		if (status)
+			goto err;
+	}
+
+	schedule_delayed_work(&adapter->func_recovery_work,
+			      msecs_to_jiffies(1000));
+	netif_device_attach(netdev);
+	return;
+err:
+	dev_err(&adapter->pdev->dev, "EEH resume failed\n");
+}
+
+static const struct pci_error_handlers be_eeh_handlers = {
+	.error_detected = be_eeh_err_detected,
+	.slot_reset = be_eeh_reset,
+	.resume = be_eeh_resume,
+};
+
+static struct pci_driver be_driver = {
+	.name = DRV_NAME,
+	.id_table = be_dev_ids,
+	.probe = be_probe,
+	.remove = be_remove,
+	.suspend = be_suspend,
+	.resume = be_resume,
+	.shutdown = be_shutdown,
+	.err_handler = &be_eeh_handlers
+};
+
+static int __init be_init_module(void)
+{
+	if (rx_frag_size != 8192 && rx_frag_size != 4096 &&
+	    rx_frag_size != 2048) {
+		printk(KERN_WARNING DRV_NAME
+			" : Module param rx_frag_size must be 2048/4096/8192."
+			" Using 2048\n");
+		rx_frag_size = 2048;
+	}
+
+	return pci_register_driver(&be_driver);
+}
+module_init(be_init_module);
+
+static void __exit be_exit_module(void)
+{
+	pci_unregister_driver(&be_driver);
+}
+module_exit(be_exit_module);
diff --git a/drivers/net/ethernet/emulex/benet/be_roce.c b/drivers/net/ethernet/emulex/benet/be_roce.c
new file mode 100644
index 0000000..1328664
--- /dev/null
+++ b/drivers/net/ethernet/emulex/benet/be_roce.c
@@ -0,0 +1,200 @@
+/*
+ * Copyright (C) 2005 - 2014 Emulex
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation. The full GNU General
+ * Public License is included in this distribution in the file called COPYING.
+ *
+ * Contact Information:
+ * linux-drivers@emulex.com
+ *
+ * Emulex
+ * 3333 Susan Street
+ * Costa Mesa, CA 92626
+ */
+
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <linux/module.h>
+
+#include "be.h"
+#include "be_cmds.h"
+
+static struct ocrdma_driver *ocrdma_drv;
+static LIST_HEAD(be_adapter_list);
+static DEFINE_MUTEX(be_adapter_list_lock);
+
+static void _be_roce_dev_add(struct be_adapter *adapter)
+{
+	struct be_dev_info dev_info;
+	int i, num_vec;
+	struct pci_dev *pdev = adapter->pdev;
+
+	if (!ocrdma_drv)
+		return;
+
+	if (ocrdma_drv->be_abi_version != BE_ROCE_ABI_VERSION) {
+		dev_warn(&pdev->dev, "Cannot initialize RoCE due to ocrdma ABI mismatch\n");
+		return;
+	}
+
+	if (pdev->device == OC_DEVICE_ID5) {
+		/* only msix is supported on these devices */
+		if (!msix_enabled(adapter))
+			return;
+		/* DPP region address and length */
+		dev_info.dpp_unmapped_addr = pci_resource_start(pdev, 2);
+		dev_info.dpp_unmapped_len = pci_resource_len(pdev, 2);
+	} else {
+		dev_info.dpp_unmapped_addr = 0;
+		dev_info.dpp_unmapped_len = 0;
+	}
+	dev_info.pdev = adapter->pdev;
+	dev_info.db = adapter->db;
+	dev_info.unmapped_db = adapter->roce_db.io_addr;
+	dev_info.db_page_size = adapter->roce_db.size;
+	dev_info.db_total_size = adapter->roce_db.total_size;
+	dev_info.netdev = adapter->netdev;
+	memcpy(dev_info.mac_addr, adapter->netdev->dev_addr, ETH_ALEN);
+	dev_info.dev_family = adapter->sli_family;
+	if (msix_enabled(adapter)) {
+		/* provide all the vectors, so that EQ creation response
+		 * can decide which one to use.
+		 */
+		num_vec = adapter->num_msix_vec + adapter->num_msix_roce_vec;
+		dev_info.intr_mode = BE_INTERRUPT_MODE_MSIX;
+		dev_info.msix.num_vectors = min(num_vec, MAX_MSIX_VECTORS);
+		/* provide start index of the vector,
+		 * so in case of linear usage,
+		 * it can use the base as starting point.
+		 */
+		dev_info.msix.start_vector = adapter->num_evt_qs;
+		for (i = 0; i < dev_info.msix.num_vectors; i++) {
+			dev_info.msix.vector_list[i] =
+			    adapter->msix_entries[i].vector;
+		}
+	} else {
+		dev_info.msix.num_vectors = 0;
+		dev_info.intr_mode = BE_INTERRUPT_MODE_INTX;
+	}
+	adapter->ocrdma_dev = ocrdma_drv->add(&dev_info);
+}
+
+void be_roce_dev_add(struct be_adapter *adapter)
+{
+	if (be_roce_supported(adapter)) {
+		INIT_LIST_HEAD(&adapter->entry);
+		mutex_lock(&be_adapter_list_lock);
+		list_add_tail(&adapter->entry, &be_adapter_list);
+
+		/* invoke add() routine of roce driver only if
+		 * valid driver registered with add method and add() is not yet
+		 * invoked on a given adapter.
+		 */
+		_be_roce_dev_add(adapter);
+		mutex_unlock(&be_adapter_list_lock);
+	}
+}
+
+static void _be_roce_dev_remove(struct be_adapter *adapter)
+{
+	if (ocrdma_drv && ocrdma_drv->remove && adapter->ocrdma_dev)
+		ocrdma_drv->remove(adapter->ocrdma_dev);
+	adapter->ocrdma_dev = NULL;
+}
+
+void be_roce_dev_remove(struct be_adapter *adapter)
+{
+	if (be_roce_supported(adapter)) {
+		mutex_lock(&be_adapter_list_lock);
+		_be_roce_dev_remove(adapter);
+		list_del(&adapter->entry);
+		mutex_unlock(&be_adapter_list_lock);
+	}
+}
+
+static void _be_roce_dev_open(struct be_adapter *adapter)
+{
+	if (ocrdma_drv && adapter->ocrdma_dev &&
+	    ocrdma_drv->state_change_handler)
+		ocrdma_drv->state_change_handler(adapter->ocrdma_dev,
+						 BE_DEV_UP);
+}
+
+void be_roce_dev_open(struct be_adapter *adapter)
+{
+	if (be_roce_supported(adapter)) {
+		mutex_lock(&be_adapter_list_lock);
+		_be_roce_dev_open(adapter);
+		mutex_unlock(&be_adapter_list_lock);
+	}
+}
+
+static void _be_roce_dev_close(struct be_adapter *adapter)
+{
+	if (ocrdma_drv && adapter->ocrdma_dev &&
+	    ocrdma_drv->state_change_handler)
+		ocrdma_drv->state_change_handler(adapter->ocrdma_dev,
+						 BE_DEV_DOWN);
+}
+
+void be_roce_dev_close(struct be_adapter *adapter)
+{
+	if (be_roce_supported(adapter)) {
+		mutex_lock(&be_adapter_list_lock);
+		_be_roce_dev_close(adapter);
+		mutex_unlock(&be_adapter_list_lock);
+	}
+}
+
+void be_roce_dev_shutdown(struct be_adapter *adapter)
+{
+	if (be_roce_supported(adapter)) {
+		mutex_lock(&be_adapter_list_lock);
+		if (ocrdma_drv && adapter->ocrdma_dev &&
+		    ocrdma_drv->state_change_handler)
+			ocrdma_drv->state_change_handler(adapter->ocrdma_dev,
+							 BE_DEV_SHUTDOWN);
+		mutex_unlock(&be_adapter_list_lock);
+	}
+}
+
+int be_roce_register_driver(struct ocrdma_driver *drv)
+{
+	struct be_adapter *dev;
+
+	mutex_lock(&be_adapter_list_lock);
+	if (ocrdma_drv) {
+		mutex_unlock(&be_adapter_list_lock);
+		return -EINVAL;
+	}
+	ocrdma_drv = drv;
+	list_for_each_entry(dev, &be_adapter_list, entry) {
+		struct net_device *netdev;
+
+		_be_roce_dev_add(dev);
+		netdev = dev->netdev;
+		if (netif_running(netdev) && netif_oper_up(netdev))
+			_be_roce_dev_open(dev);
+	}
+	mutex_unlock(&be_adapter_list_lock);
+	return 0;
+}
+EXPORT_SYMBOL(be_roce_register_driver);
+
+void be_roce_unregister_driver(struct ocrdma_driver *drv)
+{
+	struct be_adapter *dev;
+
+	mutex_lock(&be_adapter_list_lock);
+	list_for_each_entry(dev, &be_adapter_list, entry) {
+		if (dev->ocrdma_dev)
+			_be_roce_dev_remove(dev);
+	}
+	ocrdma_drv = NULL;
+	mutex_unlock(&be_adapter_list_lock);
+}
+EXPORT_SYMBOL(be_roce_unregister_driver);
diff --git a/drivers/net/ethernet/emulex/benet/be_roce.h b/drivers/net/ethernet/emulex/benet/be_roce.h
new file mode 100644
index 0000000..e6f7eb1
--- /dev/null
+++ b/drivers/net/ethernet/emulex/benet/be_roce.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (C) 2005 - 2014 Emulex
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation. The full GNU General
+ * Public License is included in this distribution in the file called COPYING.
+ *
+ * Contact Information:
+ * linux-drivers@emulex.com
+ *
+ * Emulex
+ * 3333 Susan Street
+ * Costa Mesa, CA 92626
+ */
+
+#ifndef BE_ROCE_H
+#define BE_ROCE_H
+
+#include <linux/pci.h>
+#include <linux/netdevice.h>
+
+#define BE_ROCE_ABI_VERSION	1
+
+struct ocrdma_dev;
+
+enum be_interrupt_mode {
+	BE_INTERRUPT_MODE_MSIX	= 0,
+	BE_INTERRUPT_MODE_INTX	= 1,
+	BE_INTERRUPT_MODE_MSI	= 2,
+};
+
+#define MAX_MSIX_VECTORS		32
+struct be_dev_info {
+	u8 __iomem *db;
+	u64 unmapped_db;
+	u32 db_page_size;
+	u32 db_total_size;
+	u64 dpp_unmapped_addr;
+	u32 dpp_unmapped_len;
+	struct pci_dev *pdev;
+	struct net_device *netdev;
+	u8 mac_addr[ETH_ALEN];
+	u32 dev_family;
+	enum be_interrupt_mode intr_mode;
+	struct {
+		int num_vectors;
+		int start_vector;
+		u32 vector_list[MAX_MSIX_VECTORS];
+	} msix;
+};
+
+/* ocrdma driver register's the callback functions with nic driver. */
+struct ocrdma_driver {
+	unsigned char name[32];
+	u32 be_abi_version;
+	struct ocrdma_dev *(*add) (struct be_dev_info *dev_info);
+	void (*remove) (struct ocrdma_dev *);
+	void (*state_change_handler) (struct ocrdma_dev *, u32 new_state);
+};
+
+enum {
+	BE_DEV_UP	= 0,
+	BE_DEV_DOWN	= 1,
+	BE_DEV_SHUTDOWN = 2
+};
+
+/* APIs for RoCE driver to register callback handlers,
+ * which will be invoked when device is added, removed, ifup, ifdown
+ */
+int be_roce_register_driver(struct ocrdma_driver *drv);
+void be_roce_unregister_driver(struct ocrdma_driver *drv);
+
+/* API for RoCE driver to issue mailbox commands */
+int be_roce_mcc_cmd(void *netdev_handle, void *wrb_payload,
+		    int wrb_payload_size, u16 *cmd_status, u16 *ext_status);
+
+#endif /* BE_ROCE_H */
diff --git a/drivers/net/ethernet/mellanox/Kconfig b/drivers/net/ethernet/mellanox/Kconfig
new file mode 100644
index 0000000..8cf7563
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/Kconfig
@@ -0,0 +1,24 @@
+#
+# Mellanox driver configuration
+#
+
+config NET_VENDOR_MELLANOX
+	bool "Mellanox devices"
+	default y
+	depends on PCI
+	---help---
+	  If you have a network (Ethernet) card belonging to this class, say Y
+	  and read the Ethernet-HOWTO, available from
+	  <http://www.tldp.org/docs.html#howto>.
+
+	  Note that the answer to this question doesn't directly affect the
+	  kernel: saying N will just cause the configurator to skip all
+	  the questions about Mellanox cards. If you say Y, you will be asked
+	  for your specific card in the following questions.
+
+if NET_VENDOR_MELLANOX
+
+source "drivers/net/ethernet/mellanox/mlx4/Kconfig"
+source "drivers/net/ethernet/mellanox/mlx5/core/Kconfig"
+
+endif # NET_VENDOR_MELLANOX
diff --git a/drivers/net/ethernet/mellanox/Makefile b/drivers/net/ethernet/mellanox/Makefile
new file mode 100644
index 0000000..38fe32e
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/Makefile
@@ -0,0 +1,6 @@
+#
+# Makefile for the Mellanox device drivers.
+#
+
+obj-$(CONFIG_MLX4_CORE) += mlx4/
+obj-$(CONFIG_MLX5_CORE) += mlx5/core/
diff --git a/drivers/net/ethernet/mellanox/mlx4/Kconfig b/drivers/net/ethernet/mellanox/mlx4/Kconfig
new file mode 100644
index 0000000..1486ce9
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/Kconfig
@@ -0,0 +1,46 @@
+#
+# Mellanox driver configuration
+#
+
+config MLX4_EN
+	tristate "Mellanox Technologies 1/10/40Gbit Ethernet support"
+	depends on PCI
+	select MLX4_CORE
+	select PTP_1588_CLOCK
+	---help---
+	  This driver supports Mellanox Technologies ConnectX Ethernet
+	  devices.
+
+config MLX4_EN_DCB
+	bool "Data Center Bridging (DCB) Support"
+	default y
+	depends on MLX4_EN && DCB
+	---help---
+	  Say Y here if you want to use Data Center Bridging (DCB) in the
+	  driver.
+	  If set to N, will not be able to configure QoS and ratelimit attributes.
+	  This flag is depended on the kernel's DCB support.
+
+	  If unsure, set to Y
+
+config MLX4_EN_VXLAN
+	bool "VXLAN offloads Support"
+	default y
+	depends on MLX4_EN && VXLAN && !(MLX4_EN=y && VXLAN=m)
+	---help---
+	  Say Y here if you want to use VXLAN offloads in the driver.
+
+config MLX4_CORE
+	tristate
+	depends on PCI
+	default n
+
+config MLX4_DEBUG
+	bool "Verbose debugging output" if (MLX4_CORE && EXPERT)
+	depends on MLX4_CORE
+	default y
+	---help---
+	  This option causes debugging code to be compiled into the
+	  mlx4_core driver.  The output can be turned on via the
+	  debug_level module parameter (which can also be set after
+	  the driver is loaded through sysfs).
diff --git a/drivers/net/ethernet/mellanox/mlx4/Makefile b/drivers/net/ethernet/mellanox/mlx4/Makefile
new file mode 100644
index 0000000..3e9c70f
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/Makefile
@@ -0,0 +1,10 @@
+obj-$(CONFIG_MLX4_CORE)		+= mlx4_core.o
+
+mlx4_core-y :=	alloc.o catas.o cmd.o cq.o eq.o fw.o icm.o intf.o main.o mcg.o \
+		mr.o pd.o port.o profile.o qp.o reset.o sense.o srq.o resource_tracker.o
+
+obj-$(CONFIG_MLX4_EN)               += mlx4_en.o
+
+mlx4_en-y := 	en_main.o en_tx.o en_rx.o en_ethtool.o en_port.o en_cq.o \
+		en_resources.o en_netdev.o en_selftest.o en_clock.o
+mlx4_en-$(CONFIG_MLX4_EN_DCB) += en_dcb_nl.o
diff --git a/drivers/net/ethernet/mellanox/mlx4/alloc.c b/drivers/net/ethernet/mellanox/mlx4/alloc.c
new file mode 100644
index 0000000..b0297da
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/alloc.c
@@ -0,0 +1,419 @@
+/*
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc.  All rights reserved.
+ * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/export.h>
+#include <linux/bitmap.h>
+#include <linux/dma-mapping.h>
+#include <linux/vmalloc.h>
+
+#include "mlx4.h"
+
+u32 mlx4_bitmap_alloc(struct mlx4_bitmap *bitmap)
+{
+	u32 obj;
+
+	spin_lock(&bitmap->lock);
+
+	obj = find_next_zero_bit(bitmap->table, bitmap->max, bitmap->last);
+	if (obj >= bitmap->max) {
+		bitmap->top = (bitmap->top + bitmap->max + bitmap->reserved_top)
+				& bitmap->mask;
+		obj = find_first_zero_bit(bitmap->table, bitmap->max);
+	}
+
+	if (obj < bitmap->max) {
+		set_bit(obj, bitmap->table);
+		bitmap->last = (obj + 1);
+		if (bitmap->last == bitmap->max)
+			bitmap->last = 0;
+		obj |= bitmap->top;
+	} else
+		obj = -1;
+
+	if (obj != -1)
+		--bitmap->avail;
+
+	spin_unlock(&bitmap->lock);
+
+	return obj;
+}
+
+void mlx4_bitmap_free(struct mlx4_bitmap *bitmap, u32 obj, int use_rr)
+{
+	mlx4_bitmap_free_range(bitmap, obj, 1, use_rr);
+}
+
+u32 mlx4_bitmap_alloc_range(struct mlx4_bitmap *bitmap, int cnt, int align)
+{
+	u32 obj;
+
+	if (likely(cnt == 1 && align == 1))
+		return mlx4_bitmap_alloc(bitmap);
+
+	spin_lock(&bitmap->lock);
+
+	obj = bitmap_find_next_zero_area(bitmap->table, bitmap->max,
+				bitmap->last, cnt, align - 1);
+	if (obj >= bitmap->max) {
+		bitmap->top = (bitmap->top + bitmap->max + bitmap->reserved_top)
+				& bitmap->mask;
+		obj = bitmap_find_next_zero_area(bitmap->table, bitmap->max,
+						0, cnt, align - 1);
+	}
+
+	if (obj < bitmap->max) {
+		bitmap_set(bitmap->table, obj, cnt);
+		if (obj == bitmap->last) {
+			bitmap->last = (obj + cnt);
+			if (bitmap->last >= bitmap->max)
+				bitmap->last = 0;
+		}
+		obj |= bitmap->top;
+	} else
+		obj = -1;
+
+	if (obj != -1)
+		bitmap->avail -= cnt;
+
+	spin_unlock(&bitmap->lock);
+
+	return obj;
+}
+
+u32 mlx4_bitmap_avail(struct mlx4_bitmap *bitmap)
+{
+	return bitmap->avail;
+}
+
+void mlx4_bitmap_free_range(struct mlx4_bitmap *bitmap, u32 obj, int cnt,
+			    int use_rr)
+{
+	obj &= bitmap->max + bitmap->reserved_top - 1;
+
+	spin_lock(&bitmap->lock);
+	if (!use_rr) {
+		bitmap->last = min(bitmap->last, obj);
+		bitmap->top = (bitmap->top + bitmap->max + bitmap->reserved_top)
+				& bitmap->mask;
+	}
+	bitmap_clear(bitmap->table, obj, cnt);
+	bitmap->avail += cnt;
+	spin_unlock(&bitmap->lock);
+}
+
+int mlx4_bitmap_init(struct mlx4_bitmap *bitmap, u32 num, u32 mask,
+		     u32 reserved_bot, u32 reserved_top)
+{
+	/* num must be a power of 2 */
+	if (num != roundup_pow_of_two(num))
+		return -EINVAL;
+
+	bitmap->last = 0;
+	bitmap->top  = 0;
+	bitmap->max  = num - reserved_top;
+	bitmap->mask = mask;
+	bitmap->reserved_top = reserved_top;
+	bitmap->avail = num - reserved_top - reserved_bot;
+	spin_lock_init(&bitmap->lock);
+	bitmap->table = kzalloc(BITS_TO_LONGS(bitmap->max) *
+				sizeof (long), GFP_KERNEL);
+	if (!bitmap->table)
+		return -ENOMEM;
+
+	bitmap_set(bitmap->table, 0, reserved_bot);
+
+	return 0;
+}
+
+void mlx4_bitmap_cleanup(struct mlx4_bitmap *bitmap)
+{
+	kfree(bitmap->table);
+}
+
+/*
+ * Handling for queue buffers -- we allocate a bunch of memory and
+ * register it in a memory region at HCA virtual address 0.  If the
+ * requested size is > max_direct, we split the allocation into
+ * multiple pages, so we don't require too much contiguous memory.
+ */
+
+int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct,
+		   struct mlx4_buf *buf, gfp_t gfp)
+{
+	dma_addr_t t;
+
+	if (size <= max_direct) {
+		buf->nbufs        = 1;
+		buf->npages       = 1;
+		buf->page_shift   = get_order(size) + PAGE_SHIFT;
+		buf->direct.buf   = dma_alloc_coherent(&dev->pdev->dev,
+						       size, &t, gfp);
+		if (!buf->direct.buf)
+			return -ENOMEM;
+
+		buf->direct.map = t;
+
+		while (t & ((1 << buf->page_shift) - 1)) {
+			--buf->page_shift;
+			buf->npages *= 2;
+		}
+
+		memset(buf->direct.buf, 0, size);
+	} else {
+		int i;
+
+		buf->direct.buf  = NULL;
+		buf->nbufs       = (size + PAGE_SIZE - 1) / PAGE_SIZE;
+		buf->npages      = buf->nbufs;
+		buf->page_shift  = PAGE_SHIFT;
+		buf->page_list   = kcalloc(buf->nbufs, sizeof(*buf->page_list),
+					   gfp);
+		if (!buf->page_list)
+			return -ENOMEM;
+
+		for (i = 0; i < buf->nbufs; ++i) {
+			buf->page_list[i].buf =
+				dma_alloc_coherent(&dev->pdev->dev, PAGE_SIZE,
+						   &t, gfp);
+			if (!buf->page_list[i].buf)
+				goto err_free;
+
+			buf->page_list[i].map = t;
+
+			memset(buf->page_list[i].buf, 0, PAGE_SIZE);
+		}
+
+		if (BITS_PER_LONG == 64) {
+			struct page **pages;
+			pages = kmalloc(sizeof *pages * buf->nbufs, gfp);
+			if (!pages)
+				goto err_free;
+			for (i = 0; i < buf->nbufs; ++i)
+				pages[i] = virt_to_page(buf->page_list[i].buf);
+			buf->direct.buf = vmap(pages, buf->nbufs, VM_MAP, PAGE_KERNEL);
+			kfree(pages);
+			if (!buf->direct.buf)
+				goto err_free;
+		}
+	}
+
+	return 0;
+
+err_free:
+	mlx4_buf_free(dev, size, buf);
+
+	return -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(mlx4_buf_alloc);
+
+void mlx4_buf_free(struct mlx4_dev *dev, int size, struct mlx4_buf *buf)
+{
+	int i;
+
+	if (buf->nbufs == 1)
+		dma_free_coherent(&dev->pdev->dev, size, buf->direct.buf,
+				  buf->direct.map);
+	else {
+		if (BITS_PER_LONG == 64 && buf->direct.buf)
+			vunmap(buf->direct.buf);
+
+		for (i = 0; i < buf->nbufs; ++i)
+			if (buf->page_list[i].buf)
+				dma_free_coherent(&dev->pdev->dev, PAGE_SIZE,
+						  buf->page_list[i].buf,
+						  buf->page_list[i].map);
+		kfree(buf->page_list);
+	}
+}
+EXPORT_SYMBOL_GPL(mlx4_buf_free);
+
+static struct mlx4_db_pgdir *mlx4_alloc_db_pgdir(struct device *dma_device,
+						 gfp_t gfp)
+{
+	struct mlx4_db_pgdir *pgdir;
+
+	pgdir = kzalloc(sizeof *pgdir, gfp);
+	if (!pgdir)
+		return NULL;
+
+	bitmap_fill(pgdir->order1, MLX4_DB_PER_PAGE / 2);
+	pgdir->bits[0] = pgdir->order0;
+	pgdir->bits[1] = pgdir->order1;
+	pgdir->db_page = dma_alloc_coherent(dma_device, PAGE_SIZE,
+					    &pgdir->db_dma, gfp);
+	if (!pgdir->db_page) {
+		kfree(pgdir);
+		return NULL;
+	}
+
+	return pgdir;
+}
+
+static int mlx4_alloc_db_from_pgdir(struct mlx4_db_pgdir *pgdir,
+				    struct mlx4_db *db, int order)
+{
+	int o;
+	int i;
+
+	for (o = order; o <= 1; ++o) {
+		i = find_first_bit(pgdir->bits[o], MLX4_DB_PER_PAGE >> o);
+		if (i < MLX4_DB_PER_PAGE >> o)
+			goto found;
+	}
+
+	return -ENOMEM;
+
+found:
+	clear_bit(i, pgdir->bits[o]);
+
+	i <<= o;
+
+	if (o > order)
+		set_bit(i ^ 1, pgdir->bits[order]);
+
+	db->u.pgdir = pgdir;
+	db->index   = i;
+	db->db      = pgdir->db_page + db->index;
+	db->dma     = pgdir->db_dma  + db->index * 4;
+	db->order   = order;
+
+	return 0;
+}
+
+int mlx4_db_alloc(struct mlx4_dev *dev, struct mlx4_db *db, int order, gfp_t gfp)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_db_pgdir *pgdir;
+	int ret = 0;
+
+	mutex_lock(&priv->pgdir_mutex);
+
+	list_for_each_entry(pgdir, &priv->pgdir_list, list)
+		if (!mlx4_alloc_db_from_pgdir(pgdir, db, order))
+			goto out;
+
+	pgdir = mlx4_alloc_db_pgdir(&(dev->pdev->dev), gfp);
+	if (!pgdir) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	list_add(&pgdir->list, &priv->pgdir_list);
+
+	/* This should never fail -- we just allocated an empty page: */
+	WARN_ON(mlx4_alloc_db_from_pgdir(pgdir, db, order));
+
+out:
+	mutex_unlock(&priv->pgdir_mutex);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(mlx4_db_alloc);
+
+void mlx4_db_free(struct mlx4_dev *dev, struct mlx4_db *db)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int o;
+	int i;
+
+	mutex_lock(&priv->pgdir_mutex);
+
+	o = db->order;
+	i = db->index;
+
+	if (db->order == 0 && test_bit(i ^ 1, db->u.pgdir->order0)) {
+		clear_bit(i ^ 1, db->u.pgdir->order0);
+		++o;
+	}
+	i >>= o;
+	set_bit(i, db->u.pgdir->bits[o]);
+
+	if (bitmap_full(db->u.pgdir->order1, MLX4_DB_PER_PAGE / 2)) {
+		dma_free_coherent(&(dev->pdev->dev), PAGE_SIZE,
+				  db->u.pgdir->db_page, db->u.pgdir->db_dma);
+		list_del(&db->u.pgdir->list);
+		kfree(db->u.pgdir);
+	}
+
+	mutex_unlock(&priv->pgdir_mutex);
+}
+EXPORT_SYMBOL_GPL(mlx4_db_free);
+
+int mlx4_alloc_hwq_res(struct mlx4_dev *dev, struct mlx4_hwq_resources *wqres,
+		       int size, int max_direct)
+{
+	int err;
+
+	err = mlx4_db_alloc(dev, &wqres->db, 1, GFP_KERNEL);
+	if (err)
+		return err;
+
+	*wqres->db.db = 0;
+
+	err = mlx4_buf_alloc(dev, size, max_direct, &wqres->buf, GFP_KERNEL);
+	if (err)
+		goto err_db;
+
+	err = mlx4_mtt_init(dev, wqres->buf.npages, wqres->buf.page_shift,
+			    &wqres->mtt);
+	if (err)
+		goto err_buf;
+
+	err = mlx4_buf_write_mtt(dev, &wqres->mtt, &wqres->buf, GFP_KERNEL);
+	if (err)
+		goto err_mtt;
+
+	return 0;
+
+err_mtt:
+	mlx4_mtt_cleanup(dev, &wqres->mtt);
+err_buf:
+	mlx4_buf_free(dev, size, &wqres->buf);
+err_db:
+	mlx4_db_free(dev, &wqres->db);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_alloc_hwq_res);
+
+void mlx4_free_hwq_res(struct mlx4_dev *dev, struct mlx4_hwq_resources *wqres,
+		       int size)
+{
+	mlx4_mtt_cleanup(dev, &wqres->mtt);
+	mlx4_buf_free(dev, size, &wqres->buf);
+	mlx4_db_free(dev, &wqres->db);
+}
+EXPORT_SYMBOL_GPL(mlx4_free_hwq_res);
diff --git a/drivers/net/ethernet/mellanox/mlx4/catas.c b/drivers/net/ethernet/mellanox/mlx4/catas.c
new file mode 100644
index 0000000..9c656fe
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/catas.c
@@ -0,0 +1,171 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/workqueue.h>
+#include <linux/module.h>
+
+#include "mlx4.h"
+
+enum {
+	MLX4_CATAS_POLL_INTERVAL	= 5 * HZ,
+};
+
+static DEFINE_SPINLOCK(catas_lock);
+
+static LIST_HEAD(catas_list);
+static struct work_struct catas_work;
+
+static int internal_err_reset = 1;
+module_param(internal_err_reset, int, 0644);
+MODULE_PARM_DESC(internal_err_reset,
+		 "Reset device on internal errors if non-zero"
+		 " (default 1, in SRIOV mode default is 0)");
+
+static void dump_err_buf(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	int i;
+
+	mlx4_err(dev, "Internal error detected:\n");
+	for (i = 0; i < priv->fw.catas_size; ++i)
+		mlx4_err(dev, "  buf[%02x]: %08x\n",
+			 i, swab32(readl(priv->catas_err.map + i)));
+}
+
+static void poll_catas(unsigned long dev_ptr)
+{
+	struct mlx4_dev *dev = (struct mlx4_dev *) dev_ptr;
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	if (readl(priv->catas_err.map)) {
+		/* If the device is off-line, we cannot try to recover it */
+		if (pci_channel_offline(dev->pdev))
+			mod_timer(&priv->catas_err.timer,
+				  round_jiffies(jiffies + MLX4_CATAS_POLL_INTERVAL));
+		else {
+			dump_err_buf(dev);
+			mlx4_dispatch_event(dev, MLX4_DEV_EVENT_CATASTROPHIC_ERROR, 0);
+
+			if (internal_err_reset) {
+				spin_lock(&catas_lock);
+				list_add(&priv->catas_err.list, &catas_list);
+				spin_unlock(&catas_lock);
+
+				queue_work(mlx4_wq, &catas_work);
+			}
+		}
+	} else
+		mod_timer(&priv->catas_err.timer,
+			  round_jiffies(jiffies + MLX4_CATAS_POLL_INTERVAL));
+}
+
+static void catas_reset(struct work_struct *work)
+{
+	struct mlx4_priv *priv, *tmppriv;
+	struct mlx4_dev *dev;
+
+	LIST_HEAD(tlist);
+	int ret;
+
+	spin_lock_irq(&catas_lock);
+	list_splice_init(&catas_list, &tlist);
+	spin_unlock_irq(&catas_lock);
+
+	list_for_each_entry_safe(priv, tmppriv, &tlist, catas_err.list) {
+		struct pci_dev *pdev = priv->dev.pdev;
+
+		/* If the device is off-line, we cannot reset it */
+		if (pci_channel_offline(pdev))
+			continue;
+
+		ret = mlx4_restart_one(priv->dev.pdev);
+		/* 'priv' now is not valid */
+		if (ret)
+			pr_err("mlx4 %s: Reset failed (%d)\n",
+			       pci_name(pdev), ret);
+		else {
+			dev  = pci_get_drvdata(pdev);
+			mlx4_dbg(dev, "Reset succeeded\n");
+		}
+	}
+}
+
+void mlx4_start_catas_poll(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	phys_addr_t addr;
+
+	/*If we are in SRIOV the default of the module param must be 0*/
+	if (mlx4_is_mfunc(dev))
+		internal_err_reset = 0;
+
+	INIT_LIST_HEAD(&priv->catas_err.list);
+	init_timer(&priv->catas_err.timer);
+	priv->catas_err.map = NULL;
+
+	addr = pci_resource_start(dev->pdev, priv->fw.catas_bar) +
+		priv->fw.catas_offset;
+
+	priv->catas_err.map = ioremap(addr, priv->fw.catas_size * 4);
+	if (!priv->catas_err.map) {
+		mlx4_warn(dev, "Failed to map internal error buffer at 0x%llx\n",
+			  (unsigned long long) addr);
+		return;
+	}
+
+	priv->catas_err.timer.data     = (unsigned long) dev;
+	priv->catas_err.timer.function = poll_catas;
+	priv->catas_err.timer.expires  =
+		round_jiffies(jiffies + MLX4_CATAS_POLL_INTERVAL);
+	add_timer(&priv->catas_err.timer);
+}
+
+void mlx4_stop_catas_poll(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	del_timer_sync(&priv->catas_err.timer);
+
+	if (priv->catas_err.map)
+		iounmap(priv->catas_err.map);
+
+	spin_lock_irq(&catas_lock);
+	list_del(&priv->catas_err.list);
+	spin_unlock_irq(&catas_lock);
+}
+
+void  __init mlx4_catas_init(void)
+{
+	INIT_WORK(&catas_work, catas_reset);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c b/drivers/net/ethernet/mellanox/mlx4/cmd.c
new file mode 100644
index 0000000..b16e1b9
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c
@@ -0,0 +1,2650 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <linux/pci.h>
+#include <linux/errno.h>
+
+#include <linux/mlx4/cmd.h>
+#include <linux/mlx4/device.h>
+#include <linux/semaphore.h>
+#include <rdma/ib_smi.h>
+
+#include <asm/io.h>
+
+#include "mlx4.h"
+#include "fw.h"
+
+#define CMD_POLL_TOKEN 0xffff
+#define INBOX_MASK	0xffffffffffffff00ULL
+
+#define CMD_CHAN_VER 1
+#define CMD_CHAN_IF_REV 1
+
+enum {
+	/* command completed successfully: */
+	CMD_STAT_OK		= 0x00,
+	/* Internal error (such as a bus error) occurred while processing command: */
+	CMD_STAT_INTERNAL_ERR	= 0x01,
+	/* Operation/command not supported or opcode modifier not supported: */
+	CMD_STAT_BAD_OP		= 0x02,
+	/* Parameter not supported or parameter out of range: */
+	CMD_STAT_BAD_PARAM	= 0x03,
+	/* System not enabled or bad system state: */
+	CMD_STAT_BAD_SYS_STATE	= 0x04,
+	/* Attempt to access reserved or unallocaterd resource: */
+	CMD_STAT_BAD_RESOURCE	= 0x05,
+	/* Requested resource is currently executing a command, or is otherwise busy: */
+	CMD_STAT_RESOURCE_BUSY	= 0x06,
+	/* Required capability exceeds device limits: */
+	CMD_STAT_EXCEED_LIM	= 0x08,
+	/* Resource is not in the appropriate state or ownership: */
+	CMD_STAT_BAD_RES_STATE	= 0x09,
+	/* Index out of range: */
+	CMD_STAT_BAD_INDEX	= 0x0a,
+	/* FW image corrupted: */
+	CMD_STAT_BAD_NVMEM	= 0x0b,
+	/* Error in ICM mapping (e.g. not enough auxiliary ICM pages to execute command): */
+	CMD_STAT_ICM_ERROR	= 0x0c,
+	/* Attempt to modify a QP/EE which is not in the presumed state: */
+	CMD_STAT_BAD_QP_STATE   = 0x10,
+	/* Bad segment parameters (Address/Size): */
+	CMD_STAT_BAD_SEG_PARAM	= 0x20,
+	/* Memory Region has Memory Windows bound to: */
+	CMD_STAT_REG_BOUND	= 0x21,
+	/* HCA local attached memory not present: */
+	CMD_STAT_LAM_NOT_PRE	= 0x22,
+	/* Bad management packet (silently discarded): */
+	CMD_STAT_BAD_PKT	= 0x30,
+	/* More outstanding CQEs in CQ than new CQ size: */
+	CMD_STAT_BAD_SIZE	= 0x40,
+	/* Multi Function device support required: */
+	CMD_STAT_MULTI_FUNC_REQ	= 0x50,
+};
+
+enum {
+	HCR_IN_PARAM_OFFSET	= 0x00,
+	HCR_IN_MODIFIER_OFFSET	= 0x08,
+	HCR_OUT_PARAM_OFFSET	= 0x0c,
+	HCR_TOKEN_OFFSET	= 0x14,
+	HCR_STATUS_OFFSET	= 0x18,
+
+	HCR_OPMOD_SHIFT		= 12,
+	HCR_T_BIT		= 21,
+	HCR_E_BIT		= 22,
+	HCR_GO_BIT		= 23
+};
+
+enum {
+	GO_BIT_TIMEOUT_MSECS	= 10000
+};
+
+enum mlx4_vlan_transition {
+	MLX4_VLAN_TRANSITION_VST_VST = 0,
+	MLX4_VLAN_TRANSITION_VST_VGT = 1,
+	MLX4_VLAN_TRANSITION_VGT_VST = 2,
+	MLX4_VLAN_TRANSITION_VGT_VGT = 3,
+};
+
+
+struct mlx4_cmd_context {
+	struct completion	done;
+	int			result;
+	int			next;
+	u64			out_param;
+	u16			token;
+	u8			fw_status;
+};
+
+static int mlx4_master_process_vhcr(struct mlx4_dev *dev, int slave,
+				    struct mlx4_vhcr_cmd *in_vhcr);
+
+static int mlx4_status_to_errno(u8 status)
+{
+	static const int trans_table[] = {
+		[CMD_STAT_INTERNAL_ERR]	  = -EIO,
+		[CMD_STAT_BAD_OP]	  = -EPERM,
+		[CMD_STAT_BAD_PARAM]	  = -EINVAL,
+		[CMD_STAT_BAD_SYS_STATE]  = -ENXIO,
+		[CMD_STAT_BAD_RESOURCE]	  = -EBADF,
+		[CMD_STAT_RESOURCE_BUSY]  = -EBUSY,
+		[CMD_STAT_EXCEED_LIM]	  = -ENOMEM,
+		[CMD_STAT_BAD_RES_STATE]  = -EBADF,
+		[CMD_STAT_BAD_INDEX]	  = -EBADF,
+		[CMD_STAT_BAD_NVMEM]	  = -EFAULT,
+		[CMD_STAT_ICM_ERROR]	  = -ENFILE,
+		[CMD_STAT_BAD_QP_STATE]   = -EINVAL,
+		[CMD_STAT_BAD_SEG_PARAM]  = -EFAULT,
+		[CMD_STAT_REG_BOUND]	  = -EBUSY,
+		[CMD_STAT_LAM_NOT_PRE]	  = -EAGAIN,
+		[CMD_STAT_BAD_PKT]	  = -EINVAL,
+		[CMD_STAT_BAD_SIZE]	  = -ENOMEM,
+		[CMD_STAT_MULTI_FUNC_REQ] = -EACCES,
+	};
+
+	if (status >= ARRAY_SIZE(trans_table) ||
+	    (status != CMD_STAT_OK && trans_table[status] == 0))
+		return -EIO;
+
+	return trans_table[status];
+}
+
+static u8 mlx4_errno_to_status(int errno)
+{
+	switch (errno) {
+	case -EPERM:
+		return CMD_STAT_BAD_OP;
+	case -EINVAL:
+		return CMD_STAT_BAD_PARAM;
+	case -ENXIO:
+		return CMD_STAT_BAD_SYS_STATE;
+	case -EBUSY:
+		return CMD_STAT_RESOURCE_BUSY;
+	case -ENOMEM:
+		return CMD_STAT_EXCEED_LIM;
+	case -ENFILE:
+		return CMD_STAT_ICM_ERROR;
+	default:
+		return CMD_STAT_INTERNAL_ERR;
+	}
+}
+
+static int comm_pending(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	u32 status = readl(&priv->mfunc.comm->slave_read);
+
+	return (swab32(status) >> 31) != priv->cmd.comm_toggle;
+}
+
+static void mlx4_comm_cmd_post(struct mlx4_dev *dev, u8 cmd, u16 param)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	u32 val;
+
+	priv->cmd.comm_toggle ^= 1;
+	val = param | (cmd << 16) | (priv->cmd.comm_toggle << 31);
+	__raw_writel((__force u32) cpu_to_be32(val),
+		     &priv->mfunc.comm->slave_write);
+	mmiowb();
+}
+
+static int mlx4_comm_cmd_poll(struct mlx4_dev *dev, u8 cmd, u16 param,
+		       unsigned long timeout)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	unsigned long end;
+	int err = 0;
+	int ret_from_pending = 0;
+
+	/* First, verify that the master reports correct status */
+	if (comm_pending(dev)) {
+		mlx4_warn(dev, "Communication channel is not idle - my toggle is %d (cmd:0x%x)\n",
+			  priv->cmd.comm_toggle, cmd);
+		return -EAGAIN;
+	}
+
+	/* Write command */
+	down(&priv->cmd.poll_sem);
+	mlx4_comm_cmd_post(dev, cmd, param);
+
+	end = msecs_to_jiffies(timeout) + jiffies;
+	while (comm_pending(dev) && time_before(jiffies, end))
+		cond_resched();
+	ret_from_pending = comm_pending(dev);
+	if (ret_from_pending) {
+		/* check if the slave is trying to boot in the middle of
+		 * FLR process. The only non-zero result in the RESET command
+		 * is MLX4_DELAY_RESET_SLAVE*/
+		if ((MLX4_COMM_CMD_RESET == cmd)) {
+			err = MLX4_DELAY_RESET_SLAVE;
+		} else {
+			mlx4_warn(dev, "Communication channel timed out\n");
+			err = -ETIMEDOUT;
+		}
+	}
+
+	up(&priv->cmd.poll_sem);
+	return err;
+}
+
+static int mlx4_comm_cmd_wait(struct mlx4_dev *dev, u8 op,
+			      u16 param, unsigned long timeout)
+{
+	struct mlx4_cmd *cmd = &mlx4_priv(dev)->cmd;
+	struct mlx4_cmd_context *context;
+	unsigned long end;
+	int err = 0;
+
+	down(&cmd->event_sem);
+
+	spin_lock(&cmd->context_lock);
+	BUG_ON(cmd->free_head < 0);
+	context = &cmd->context[cmd->free_head];
+	context->token += cmd->token_mask + 1;
+	cmd->free_head = context->next;
+	spin_unlock(&cmd->context_lock);
+
+	init_completion(&context->done);
+
+	mlx4_comm_cmd_post(dev, op, param);
+
+	if (!wait_for_completion_timeout(&context->done,
+					 msecs_to_jiffies(timeout))) {
+		mlx4_warn(dev, "communication channel command 0x%x timed out\n",
+			  op);
+		err = -EBUSY;
+		goto out;
+	}
+
+	err = context->result;
+	if (err && context->fw_status != CMD_STAT_MULTI_FUNC_REQ) {
+		mlx4_err(dev, "command 0x%x failed: fw status = 0x%x\n",
+			 op, context->fw_status);
+		goto out;
+	}
+
+out:
+	/* wait for comm channel ready
+	 * this is necessary for prevention the race
+	 * when switching between event to polling mode
+	 */
+	end = msecs_to_jiffies(timeout) + jiffies;
+	while (comm_pending(dev) && time_before(jiffies, end))
+		cond_resched();
+
+	spin_lock(&cmd->context_lock);
+	context->next = cmd->free_head;
+	cmd->free_head = context - cmd->context;
+	spin_unlock(&cmd->context_lock);
+
+	up(&cmd->event_sem);
+	return err;
+}
+
+int mlx4_comm_cmd(struct mlx4_dev *dev, u8 cmd, u16 param,
+		  unsigned long timeout)
+{
+	if (mlx4_priv(dev)->cmd.use_events)
+		return mlx4_comm_cmd_wait(dev, cmd, param, timeout);
+	return mlx4_comm_cmd_poll(dev, cmd, param, timeout);
+}
+
+static int cmd_pending(struct mlx4_dev *dev)
+{
+	u32 status;
+
+	if (pci_channel_offline(dev->pdev))
+		return -EIO;
+
+	status = readl(mlx4_priv(dev)->cmd.hcr + HCR_STATUS_OFFSET);
+
+	return (status & swab32(1 << HCR_GO_BIT)) ||
+		(mlx4_priv(dev)->cmd.toggle ==
+		 !!(status & swab32(1 << HCR_T_BIT)));
+}
+
+static int mlx4_cmd_post(struct mlx4_dev *dev, u64 in_param, u64 out_param,
+			 u32 in_modifier, u8 op_modifier, u16 op, u16 token,
+			 int event)
+{
+	struct mlx4_cmd *cmd = &mlx4_priv(dev)->cmd;
+	u32 __iomem *hcr = cmd->hcr;
+	int ret = -EAGAIN;
+	unsigned long end;
+
+	mutex_lock(&cmd->hcr_mutex);
+
+	if (pci_channel_offline(dev->pdev)) {
+		/*
+		 * Device is going through error recovery
+		 * and cannot accept commands.
+		 */
+		ret = -EIO;
+		goto out;
+	}
+
+	end = jiffies;
+	if (event)
+		end += msecs_to_jiffies(GO_BIT_TIMEOUT_MSECS);
+
+	while (cmd_pending(dev)) {
+		if (pci_channel_offline(dev->pdev)) {
+			/*
+			 * Device is going through error recovery
+			 * and cannot accept commands.
+			 */
+			ret = -EIO;
+			goto out;
+		}
+
+		if (time_after_eq(jiffies, end)) {
+			mlx4_err(dev, "%s:cmd_pending failed\n", __func__);
+			goto out;
+		}
+		cond_resched();
+	}
+
+	/*
+	 * We use writel (instead of something like memcpy_toio)
+	 * because writes of less than 32 bits to the HCR don't work
+	 * (and some architectures such as ia64 implement memcpy_toio
+	 * in terms of writeb).
+	 */
+	__raw_writel((__force u32) cpu_to_be32(in_param >> 32),		  hcr + 0);
+	__raw_writel((__force u32) cpu_to_be32(in_param & 0xfffffffful),  hcr + 1);
+	__raw_writel((__force u32) cpu_to_be32(in_modifier),		  hcr + 2);
+	__raw_writel((__force u32) cpu_to_be32(out_param >> 32),	  hcr + 3);
+	__raw_writel((__force u32) cpu_to_be32(out_param & 0xfffffffful), hcr + 4);
+	__raw_writel((__force u32) cpu_to_be32(token << 16),		  hcr + 5);
+
+	/* __raw_writel may not order writes. */
+	wmb();
+
+	__raw_writel((__force u32) cpu_to_be32((1 << HCR_GO_BIT)		|
+					       (cmd->toggle << HCR_T_BIT)	|
+					       (event ? (1 << HCR_E_BIT) : 0)	|
+					       (op_modifier << HCR_OPMOD_SHIFT) |
+					       op), hcr + 6);
+
+	/*
+	 * Make sure that our HCR writes don't get mixed in with
+	 * writes from another CPU starting a FW command.
+	 */
+	mmiowb();
+
+	cmd->toggle = cmd->toggle ^ 1;
+
+	ret = 0;
+
+out:
+	mutex_unlock(&cmd->hcr_mutex);
+	return ret;
+}
+
+static int mlx4_slave_cmd(struct mlx4_dev *dev, u64 in_param, u64 *out_param,
+			  int out_is_imm, u32 in_modifier, u8 op_modifier,
+			  u16 op, unsigned long timeout)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_vhcr_cmd *vhcr = priv->mfunc.vhcr;
+	int ret;
+
+	mutex_lock(&priv->cmd.slave_cmd_mutex);
+
+	vhcr->in_param = cpu_to_be64(in_param);
+	vhcr->out_param = out_param ? cpu_to_be64(*out_param) : 0;
+	vhcr->in_modifier = cpu_to_be32(in_modifier);
+	vhcr->opcode = cpu_to_be16((((u16) op_modifier) << 12) | (op & 0xfff));
+	vhcr->token = cpu_to_be16(CMD_POLL_TOKEN);
+	vhcr->status = 0;
+	vhcr->flags = !!(priv->cmd.use_events) << 6;
+
+	if (mlx4_is_master(dev)) {
+		ret = mlx4_master_process_vhcr(dev, dev->caps.function, vhcr);
+		if (!ret) {
+			if (out_is_imm) {
+				if (out_param)
+					*out_param =
+						be64_to_cpu(vhcr->out_param);
+				else {
+					mlx4_err(dev, "response expected while output mailbox is NULL for command 0x%x\n",
+						 op);
+					vhcr->status = CMD_STAT_BAD_PARAM;
+				}
+			}
+			ret = mlx4_status_to_errno(vhcr->status);
+		}
+	} else {
+		ret = mlx4_comm_cmd(dev, MLX4_COMM_CMD_VHCR_POST, 0,
+				    MLX4_COMM_TIME + timeout);
+		if (!ret) {
+			if (out_is_imm) {
+				if (out_param)
+					*out_param =
+						be64_to_cpu(vhcr->out_param);
+				else {
+					mlx4_err(dev, "response expected while output mailbox is NULL for command 0x%x\n",
+						 op);
+					vhcr->status = CMD_STAT_BAD_PARAM;
+				}
+			}
+			ret = mlx4_status_to_errno(vhcr->status);
+		} else
+			mlx4_err(dev, "failed execution of VHCR_POST command opcode 0x%x\n",
+				 op);
+	}
+
+	mutex_unlock(&priv->cmd.slave_cmd_mutex);
+	return ret;
+}
+
+static int mlx4_cmd_poll(struct mlx4_dev *dev, u64 in_param, u64 *out_param,
+			 int out_is_imm, u32 in_modifier, u8 op_modifier,
+			 u16 op, unsigned long timeout)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	void __iomem *hcr = priv->cmd.hcr;
+	int err = 0;
+	unsigned long end;
+	u32 stat;
+
+	down(&priv->cmd.poll_sem);
+
+	if (pci_channel_offline(dev->pdev)) {
+		/*
+		 * Device is going through error recovery
+		 * and cannot accept commands.
+		 */
+		err = -EIO;
+		goto out;
+	}
+
+	if (out_is_imm && !out_param) {
+		mlx4_err(dev, "response expected while output mailbox is NULL for command 0x%x\n",
+			 op);
+		err = -EINVAL;
+		goto out;
+	}
+
+	err = mlx4_cmd_post(dev, in_param, out_param ? *out_param : 0,
+			    in_modifier, op_modifier, op, CMD_POLL_TOKEN, 0);
+	if (err)
+		goto out;
+
+	end = msecs_to_jiffies(timeout) + jiffies;
+	while (cmd_pending(dev) && time_before(jiffies, end)) {
+		if (pci_channel_offline(dev->pdev)) {
+			/*
+			 * Device is going through error recovery
+			 * and cannot accept commands.
+			 */
+			err = -EIO;
+			goto out;
+		}
+
+		cond_resched();
+	}
+
+	if (cmd_pending(dev)) {
+		mlx4_warn(dev, "command 0x%x timed out (go bit not cleared)\n",
+			  op);
+		err = -ETIMEDOUT;
+		goto out;
+	}
+
+	if (out_is_imm)
+		*out_param =
+			(u64) be32_to_cpu((__force __be32)
+					  __raw_readl(hcr + HCR_OUT_PARAM_OFFSET)) << 32 |
+			(u64) be32_to_cpu((__force __be32)
+					  __raw_readl(hcr + HCR_OUT_PARAM_OFFSET + 4));
+	stat = be32_to_cpu((__force __be32)
+			   __raw_readl(hcr + HCR_STATUS_OFFSET)) >> 24;
+	err = mlx4_status_to_errno(stat);
+	if (err)
+		mlx4_err(dev, "command 0x%x failed: fw status = 0x%x\n",
+			 op, stat);
+
+out:
+	up(&priv->cmd.poll_sem);
+	return err;
+}
+
+void mlx4_cmd_event(struct mlx4_dev *dev, u16 token, u8 status, u64 out_param)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_cmd_context *context =
+		&priv->cmd.context[token & priv->cmd.token_mask];
+
+	/* previously timed out command completing at long last */
+	if (token != context->token)
+		return;
+
+	context->fw_status = status;
+	context->result    = mlx4_status_to_errno(status);
+	context->out_param = out_param;
+
+	complete(&context->done);
+}
+
+static int mlx4_cmd_wait(struct mlx4_dev *dev, u64 in_param, u64 *out_param,
+			 int out_is_imm, u32 in_modifier, u8 op_modifier,
+			 u16 op, unsigned long timeout)
+{
+	struct mlx4_cmd *cmd = &mlx4_priv(dev)->cmd;
+	struct mlx4_cmd_context *context;
+	int err = 0;
+
+	down(&cmd->event_sem);
+
+	spin_lock(&cmd->context_lock);
+	BUG_ON(cmd->free_head < 0);
+	context = &cmd->context[cmd->free_head];
+	context->token += cmd->token_mask + 1;
+	cmd->free_head = context->next;
+	spin_unlock(&cmd->context_lock);
+
+	if (out_is_imm && !out_param) {
+		mlx4_err(dev, "response expected while output mailbox is NULL for command 0x%x\n",
+			 op);
+		err = -EINVAL;
+		goto out;
+	}
+
+	init_completion(&context->done);
+
+	mlx4_cmd_post(dev, in_param, out_param ? *out_param : 0,
+		      in_modifier, op_modifier, op, context->token, 1);
+
+	if (!wait_for_completion_timeout(&context->done,
+					 msecs_to_jiffies(timeout))) {
+		mlx4_warn(dev, "command 0x%x timed out (go bit not cleared)\n",
+			  op);
+		err = -EBUSY;
+		goto out;
+	}
+
+	err = context->result;
+	if (err) {
+		/* Since we do not want to have this error message always
+		 * displayed at driver start when there are ConnectX2 HCAs
+		 * on the host, we deprecate the error message for this
+		 * specific command/input_mod/opcode_mod/fw-status to be debug.
+		 */
+		if (op == MLX4_CMD_SET_PORT && in_modifier == 1 &&
+		    op_modifier == 0 && context->fw_status == CMD_STAT_BAD_SIZE)
+			mlx4_dbg(dev, "command 0x%x failed: fw status = 0x%x\n",
+				 op, context->fw_status);
+		else
+			mlx4_err(dev, "command 0x%x failed: fw status = 0x%x\n",
+				 op, context->fw_status);
+		goto out;
+	}
+
+	if (out_is_imm)
+		*out_param = context->out_param;
+
+out:
+	spin_lock(&cmd->context_lock);
+	context->next = cmd->free_head;
+	cmd->free_head = context - cmd->context;
+	spin_unlock(&cmd->context_lock);
+
+	up(&cmd->event_sem);
+	return err;
+}
+
+int __mlx4_cmd(struct mlx4_dev *dev, u64 in_param, u64 *out_param,
+	       int out_is_imm, u32 in_modifier, u8 op_modifier,
+	       u16 op, unsigned long timeout, int native)
+{
+	if (pci_channel_offline(dev->pdev))
+		return -EIO;
+
+	if (!mlx4_is_mfunc(dev) || (native && mlx4_is_master(dev))) {
+		if (mlx4_priv(dev)->cmd.use_events)
+			return mlx4_cmd_wait(dev, in_param, out_param,
+					     out_is_imm, in_modifier,
+					     op_modifier, op, timeout);
+		else
+			return mlx4_cmd_poll(dev, in_param, out_param,
+					     out_is_imm, in_modifier,
+					     op_modifier, op, timeout);
+	}
+	return mlx4_slave_cmd(dev, in_param, out_param, out_is_imm,
+			      in_modifier, op_modifier, op, timeout);
+}
+EXPORT_SYMBOL_GPL(__mlx4_cmd);
+
+
+static int mlx4_ARM_COMM_CHANNEL(struct mlx4_dev *dev)
+{
+	return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_ARM_COMM_CHANNEL,
+			MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE);
+}
+
+static int mlx4_ACCESS_MEM(struct mlx4_dev *dev, u64 master_addr,
+			   int slave, u64 slave_addr,
+			   int size, int is_read)
+{
+	u64 in_param;
+	u64 out_param;
+
+	if ((slave_addr & 0xfff) | (master_addr & 0xfff) |
+	    (slave & ~0x7f) | (size & 0xff)) {
+		mlx4_err(dev, "Bad access mem params - slave_addr:0x%llx master_addr:0x%llx slave_id:%d size:%d\n",
+			 slave_addr, master_addr, slave, size);
+		return -EINVAL;
+	}
+
+	if (is_read) {
+		in_param = (u64) slave | slave_addr;
+		out_param = (u64) dev->caps.function | master_addr;
+	} else {
+		in_param = (u64) dev->caps.function | master_addr;
+		out_param = (u64) slave | slave_addr;
+	}
+
+	return mlx4_cmd_imm(dev, in_param, &out_param, size, 0,
+			    MLX4_CMD_ACCESS_MEM,
+			    MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
+}
+
+static int query_pkey_block(struct mlx4_dev *dev, u8 port, u16 index, u16 *pkey,
+			       struct mlx4_cmd_mailbox *inbox,
+			       struct mlx4_cmd_mailbox *outbox)
+{
+	struct ib_smp *in_mad = (struct ib_smp *)(inbox->buf);
+	struct ib_smp *out_mad = (struct ib_smp *)(outbox->buf);
+	int err;
+	int i;
+
+	if (index & 0x1f)
+		return -EINVAL;
+
+	in_mad->attr_mod = cpu_to_be32(index / 32);
+
+	err = mlx4_cmd_box(dev, inbox->dma, outbox->dma, port, 3,
+			   MLX4_CMD_MAD_IFC, MLX4_CMD_TIME_CLASS_C,
+			   MLX4_CMD_NATIVE);
+	if (err)
+		return err;
+
+	for (i = 0; i < 32; ++i)
+		pkey[i] = be16_to_cpu(((__be16 *) out_mad->data)[i]);
+
+	return err;
+}
+
+static int get_full_pkey_table(struct mlx4_dev *dev, u8 port, u16 *table,
+			       struct mlx4_cmd_mailbox *inbox,
+			       struct mlx4_cmd_mailbox *outbox)
+{
+	int i;
+	int err;
+
+	for (i = 0; i < dev->caps.pkey_table_len[port]; i += 32) {
+		err = query_pkey_block(dev, port, i, table + i, inbox, outbox);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+#define PORT_CAPABILITY_LOCATION_IN_SMP 20
+#define PORT_STATE_OFFSET 32
+
+static enum ib_port_state vf_port_state(struct mlx4_dev *dev, int port, int vf)
+{
+	if (mlx4_get_slave_port_state(dev, vf, port) == SLAVE_PORT_UP)
+		return IB_PORT_ACTIVE;
+	else
+		return IB_PORT_DOWN;
+}
+
+static int mlx4_MAD_IFC_wrapper(struct mlx4_dev *dev, int slave,
+				struct mlx4_vhcr *vhcr,
+				struct mlx4_cmd_mailbox *inbox,
+				struct mlx4_cmd_mailbox *outbox,
+				struct mlx4_cmd_info *cmd)
+{
+	struct ib_smp *smp = inbox->buf;
+	u32 index;
+	u8 port;
+	u8 opcode_modifier;
+	u16 *table;
+	int err;
+	int vidx, pidx;
+	int network_view;
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct ib_smp *outsmp = outbox->buf;
+	__be16 *outtab = (__be16 *)(outsmp->data);
+	__be32 slave_cap_mask;
+	__be64 slave_node_guid;
+
+	port = vhcr->in_modifier;
+
+	/* network-view bit is for driver use only, and should not be passed to FW */
+	opcode_modifier = vhcr->op_modifier & ~0x8; /* clear netw view bit */
+	network_view = !!(vhcr->op_modifier & 0x8);
+
+	if (smp->base_version == 1 &&
+	    smp->mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED &&
+	    smp->class_version == 1) {
+		/* host view is paravirtualized */
+		if (!network_view && smp->method == IB_MGMT_METHOD_GET) {
+			if (smp->attr_id == IB_SMP_ATTR_PKEY_TABLE) {
+				index = be32_to_cpu(smp->attr_mod);
+				if (port < 1 || port > dev->caps.num_ports)
+					return -EINVAL;
+				table = kcalloc(dev->caps.pkey_table_len[port], sizeof *table, GFP_KERNEL);
+				if (!table)
+					return -ENOMEM;
+				/* need to get the full pkey table because the paravirtualized
+				 * pkeys may be scattered among several pkey blocks.
+				 */
+				err = get_full_pkey_table(dev, port, table, inbox, outbox);
+				if (!err) {
+					for (vidx = index * 32; vidx < (index + 1) * 32; ++vidx) {
+						pidx = priv->virt2phys_pkey[slave][port - 1][vidx];
+						outtab[vidx % 32] = cpu_to_be16(table[pidx]);
+					}
+				}
+				kfree(table);
+				return err;
+			}
+			if (smp->attr_id == IB_SMP_ATTR_PORT_INFO) {
+				/*get the slave specific caps:*/
+				/*do the command */
+				err = mlx4_cmd_box(dev, inbox->dma, outbox->dma,
+					    vhcr->in_modifier, opcode_modifier,
+					    vhcr->op, MLX4_CMD_TIME_CLASS_C, MLX4_CMD_NATIVE);
+				/* modify the response for slaves */
+				if (!err && slave != mlx4_master_func_num(dev)) {
+					u8 *state = outsmp->data + PORT_STATE_OFFSET;
+
+					*state = (*state & 0xf0) | vf_port_state(dev, port, slave);
+					slave_cap_mask = priv->mfunc.master.slave_state[slave].ib_cap_mask[port];
+					memcpy(outsmp->data + PORT_CAPABILITY_LOCATION_IN_SMP, &slave_cap_mask, 4);
+				}
+				return err;
+			}
+			if (smp->attr_id == IB_SMP_ATTR_GUID_INFO) {
+				/* compute slave's gid block */
+				smp->attr_mod = cpu_to_be32(slave / 8);
+				/* execute cmd */
+				err = mlx4_cmd_box(dev, inbox->dma, outbox->dma,
+					     vhcr->in_modifier, opcode_modifier,
+					     vhcr->op, MLX4_CMD_TIME_CLASS_C, MLX4_CMD_NATIVE);
+				if (!err) {
+					/* if needed, move slave gid to index 0 */
+					if (slave % 8)
+						memcpy(outsmp->data,
+						       outsmp->data + (slave % 8) * 8, 8);
+					/* delete all other gids */
+					memset(outsmp->data + 8, 0, 56);
+				}
+				return err;
+			}
+			if (smp->attr_id == IB_SMP_ATTR_NODE_INFO) {
+				err = mlx4_cmd_box(dev, inbox->dma, outbox->dma,
+					     vhcr->in_modifier, opcode_modifier,
+					     vhcr->op, MLX4_CMD_TIME_CLASS_C, MLX4_CMD_NATIVE);
+				if (!err) {
+					slave_node_guid =  mlx4_get_slave_node_guid(dev, slave);
+					memcpy(outsmp->data + 12, &slave_node_guid, 8);
+				}
+				return err;
+			}
+		}
+	}
+
+	/* Non-privileged VFs are only allowed "host" view LID-routed 'Get' MADs.
+	 * These are the MADs used by ib verbs (such as ib_query_gids).
+	 */
+	if (slave != mlx4_master_func_num(dev) &&
+	    !mlx4_vf_smi_enabled(dev, slave, port)) {
+		if (!(smp->mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED &&
+		      smp->method == IB_MGMT_METHOD_GET) || network_view) {
+			mlx4_err(dev, "Unprivileged slave %d is trying to execute a Subnet MGMT MAD, class 0x%x, method 0x%x, view=%s for attr 0x%x. Rejecting\n",
+				 slave, smp->method, smp->mgmt_class,
+				 network_view ? "Network" : "Host",
+				 be16_to_cpu(smp->attr_id));
+			return -EPERM;
+		}
+	}
+
+	return mlx4_cmd_box(dev, inbox->dma, outbox->dma,
+				    vhcr->in_modifier, opcode_modifier,
+				    vhcr->op, MLX4_CMD_TIME_CLASS_C, MLX4_CMD_NATIVE);
+}
+
+static int mlx4_CMD_EPERM_wrapper(struct mlx4_dev *dev, int slave,
+		     struct mlx4_vhcr *vhcr,
+		     struct mlx4_cmd_mailbox *inbox,
+		     struct mlx4_cmd_mailbox *outbox,
+		     struct mlx4_cmd_info *cmd)
+{
+	return -EPERM;
+}
+
+int mlx4_DMA_wrapper(struct mlx4_dev *dev, int slave,
+		     struct mlx4_vhcr *vhcr,
+		     struct mlx4_cmd_mailbox *inbox,
+		     struct mlx4_cmd_mailbox *outbox,
+		     struct mlx4_cmd_info *cmd)
+{
+	u64 in_param;
+	u64 out_param;
+	int err;
+
+	in_param = cmd->has_inbox ? (u64) inbox->dma : vhcr->in_param;
+	out_param = cmd->has_outbox ? (u64) outbox->dma : vhcr->out_param;
+	if (cmd->encode_slave_id) {
+		in_param &= 0xffffffffffffff00ll;
+		in_param |= slave;
+	}
+
+	err = __mlx4_cmd(dev, in_param, &out_param, cmd->out_is_imm,
+			 vhcr->in_modifier, vhcr->op_modifier, vhcr->op,
+			 MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
+
+	if (cmd->out_is_imm)
+		vhcr->out_param = out_param;
+
+	return err;
+}
+
+static struct mlx4_cmd_info cmd_info[] = {
+	{
+		.opcode = MLX4_CMD_QUERY_FW,
+		.has_inbox = false,
+		.has_outbox = true,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_QUERY_FW_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_QUERY_HCA,
+		.has_inbox = false,
+		.has_outbox = true,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = NULL
+	},
+	{
+		.opcode = MLX4_CMD_QUERY_DEV_CAP,
+		.has_inbox = false,
+		.has_outbox = true,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_QUERY_DEV_CAP_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_QUERY_FUNC_CAP,
+		.has_inbox = false,
+		.has_outbox = true,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_QUERY_FUNC_CAP_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_QUERY_ADAPTER,
+		.has_inbox = false,
+		.has_outbox = true,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = NULL
+	},
+	{
+		.opcode = MLX4_CMD_INIT_PORT,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_INIT_PORT_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_CLOSE_PORT,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm  = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_CLOSE_PORT_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_QUERY_PORT,
+		.has_inbox = false,
+		.has_outbox = true,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_QUERY_PORT_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_SET_PORT,
+		.has_inbox = true,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_SET_PORT_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_MAP_EQ,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_MAP_EQ_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_SW2HW_EQ,
+		.has_inbox = true,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = true,
+		.verify = NULL,
+		.wrapper = mlx4_SW2HW_EQ_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_HW_HEALTH_CHECK,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = NULL
+	},
+	{
+		.opcode = MLX4_CMD_NOP,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = NULL
+	},
+	{
+		.opcode = MLX4_CMD_CONFIG_DEV,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_CMD_EPERM_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_ALLOC_RES,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = true,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_ALLOC_RES_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_FREE_RES,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_FREE_RES_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_SW2HW_MPT,
+		.has_inbox = true,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = true,
+		.verify = NULL,
+		.wrapper = mlx4_SW2HW_MPT_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_QUERY_MPT,
+		.has_inbox = false,
+		.has_outbox = true,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_QUERY_MPT_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_HW2SW_MPT,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_HW2SW_MPT_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_READ_MTT,
+		.has_inbox = false,
+		.has_outbox = true,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = NULL
+	},
+	{
+		.opcode = MLX4_CMD_WRITE_MTT,
+		.has_inbox = true,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_WRITE_MTT_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_SYNC_TPT,
+		.has_inbox = true,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = NULL
+	},
+	{
+		.opcode = MLX4_CMD_HW2SW_EQ,
+		.has_inbox = false,
+		.has_outbox = true,
+		.out_is_imm = false,
+		.encode_slave_id = true,
+		.verify = NULL,
+		.wrapper = mlx4_HW2SW_EQ_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_QUERY_EQ,
+		.has_inbox = false,
+		.has_outbox = true,
+		.out_is_imm = false,
+		.encode_slave_id = true,
+		.verify = NULL,
+		.wrapper = mlx4_QUERY_EQ_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_SW2HW_CQ,
+		.has_inbox = true,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = true,
+		.verify = NULL,
+		.wrapper = mlx4_SW2HW_CQ_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_HW2SW_CQ,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_HW2SW_CQ_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_QUERY_CQ,
+		.has_inbox = false,
+		.has_outbox = true,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_QUERY_CQ_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_MODIFY_CQ,
+		.has_inbox = true,
+		.has_outbox = false,
+		.out_is_imm = true,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_MODIFY_CQ_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_SW2HW_SRQ,
+		.has_inbox = true,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = true,
+		.verify = NULL,
+		.wrapper = mlx4_SW2HW_SRQ_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_HW2SW_SRQ,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_HW2SW_SRQ_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_QUERY_SRQ,
+		.has_inbox = false,
+		.has_outbox = true,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_QUERY_SRQ_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_ARM_SRQ,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_ARM_SRQ_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_RST2INIT_QP,
+		.has_inbox = true,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = true,
+		.verify = NULL,
+		.wrapper = mlx4_RST2INIT_QP_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_INIT2INIT_QP,
+		.has_inbox = true,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_INIT2INIT_QP_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_INIT2RTR_QP,
+		.has_inbox = true,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_INIT2RTR_QP_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_RTR2RTS_QP,
+		.has_inbox = true,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_RTR2RTS_QP_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_RTS2RTS_QP,
+		.has_inbox = true,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_RTS2RTS_QP_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_SQERR2RTS_QP,
+		.has_inbox = true,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_SQERR2RTS_QP_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_2ERR_QP,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_GEN_QP_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_RTS2SQD_QP,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_GEN_QP_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_SQD2SQD_QP,
+		.has_inbox = true,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_SQD2SQD_QP_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_SQD2RTS_QP,
+		.has_inbox = true,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_SQD2RTS_QP_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_2RST_QP,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_2RST_QP_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_QUERY_QP,
+		.has_inbox = false,
+		.has_outbox = true,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_GEN_QP_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_SUSPEND_QP,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_GEN_QP_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_UNSUSPEND_QP,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_GEN_QP_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_UPDATE_QP,
+		.has_inbox = true,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_UPDATE_QP_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_GET_OP_REQ,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_CMD_EPERM_wrapper,
+	},
+	{
+		.opcode = MLX4_CMD_CONF_SPECIAL_QP,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL, /* XXX verify: only demux can do this */
+		.wrapper = NULL
+	},
+	{
+		.opcode = MLX4_CMD_MAD_IFC,
+		.has_inbox = true,
+		.has_outbox = true,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_MAD_IFC_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_MAD_DEMUX,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_CMD_EPERM_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_QUERY_IF_STAT,
+		.has_inbox = false,
+		.has_outbox = true,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_QUERY_IF_STAT_wrapper
+	},
+	/* Native multicast commands are not available for guests */
+	{
+		.opcode = MLX4_CMD_QP_ATTACH,
+		.has_inbox = true,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_QP_ATTACH_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_PROMISC,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_PROMISC_wrapper
+	},
+	/* Ethernet specific commands */
+	{
+		.opcode = MLX4_CMD_SET_VLAN_FLTR,
+		.has_inbox = true,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_SET_VLAN_FLTR_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_SET_MCAST_FLTR,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_SET_MCAST_FLTR_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_DUMP_ETH_STATS,
+		.has_inbox = false,
+		.has_outbox = true,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_DUMP_ETH_STATS_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_INFORM_FLR_DONE,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = NULL
+	},
+	/* flow steering commands */
+	{
+		.opcode = MLX4_QP_FLOW_STEERING_ATTACH,
+		.has_inbox = true,
+		.has_outbox = false,
+		.out_is_imm = true,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_QP_FLOW_STEERING_ATTACH_wrapper
+	},
+	{
+		.opcode = MLX4_QP_FLOW_STEERING_DETACH,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_QP_FLOW_STEERING_DETACH_wrapper
+	},
+	{
+		.opcode = MLX4_FLOW_STEERING_IB_UC_QP_RANGE,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_CMD_EPERM_wrapper
+	},
+};
+
+static int mlx4_master_process_vhcr(struct mlx4_dev *dev, int slave,
+				    struct mlx4_vhcr_cmd *in_vhcr)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_cmd_info *cmd = NULL;
+	struct mlx4_vhcr_cmd *vhcr_cmd = in_vhcr ? in_vhcr : priv->mfunc.vhcr;
+	struct mlx4_vhcr *vhcr;
+	struct mlx4_cmd_mailbox *inbox = NULL;
+	struct mlx4_cmd_mailbox *outbox = NULL;
+	u64 in_param;
+	u64 out_param;
+	int ret = 0;
+	int i;
+	int err = 0;
+
+	/* Create sw representation of Virtual HCR */
+	vhcr = kzalloc(sizeof(struct mlx4_vhcr), GFP_KERNEL);
+	if (!vhcr)
+		return -ENOMEM;
+
+	/* DMA in the vHCR */
+	if (!in_vhcr) {
+		ret = mlx4_ACCESS_MEM(dev, priv->mfunc.vhcr_dma, slave,
+				      priv->mfunc.master.slave_state[slave].vhcr_dma,
+				      ALIGN(sizeof(struct mlx4_vhcr_cmd),
+					    MLX4_ACCESS_MEM_ALIGN), 1);
+		if (ret) {
+			mlx4_err(dev, "%s: Failed reading vhcr ret: 0x%x\n",
+				 __func__, ret);
+			kfree(vhcr);
+			return ret;
+		}
+	}
+
+	/* Fill SW VHCR fields */
+	vhcr->in_param = be64_to_cpu(vhcr_cmd->in_param);
+	vhcr->out_param = be64_to_cpu(vhcr_cmd->out_param);
+	vhcr->in_modifier = be32_to_cpu(vhcr_cmd->in_modifier);
+	vhcr->token = be16_to_cpu(vhcr_cmd->token);
+	vhcr->op = be16_to_cpu(vhcr_cmd->opcode) & 0xfff;
+	vhcr->op_modifier = (u8) (be16_to_cpu(vhcr_cmd->opcode) >> 12);
+	vhcr->e_bit = vhcr_cmd->flags & (1 << 6);
+
+	/* Lookup command */
+	for (i = 0; i < ARRAY_SIZE(cmd_info); ++i) {
+		if (vhcr->op == cmd_info[i].opcode) {
+			cmd = &cmd_info[i];
+			break;
+		}
+	}
+	if (!cmd) {
+		mlx4_err(dev, "Unknown command:0x%x accepted from slave:%d\n",
+			 vhcr->op, slave);
+		vhcr_cmd->status = CMD_STAT_BAD_PARAM;
+		goto out_status;
+	}
+
+	/* Read inbox */
+	if (cmd->has_inbox) {
+		vhcr->in_param &= INBOX_MASK;
+		inbox = mlx4_alloc_cmd_mailbox(dev);
+		if (IS_ERR(inbox)) {
+			vhcr_cmd->status = CMD_STAT_BAD_SIZE;
+			inbox = NULL;
+			goto out_status;
+		}
+
+		if (mlx4_ACCESS_MEM(dev, inbox->dma, slave,
+				    vhcr->in_param,
+				    MLX4_MAILBOX_SIZE, 1)) {
+			mlx4_err(dev, "%s: Failed reading inbox (cmd:0x%x)\n",
+				 __func__, cmd->opcode);
+			vhcr_cmd->status = CMD_STAT_INTERNAL_ERR;
+			goto out_status;
+		}
+	}
+
+	/* Apply permission and bound checks if applicable */
+	if (cmd->verify && cmd->verify(dev, slave, vhcr, inbox)) {
+		mlx4_warn(dev, "Command:0x%x from slave: %d failed protection checks for resource_id:%d\n",
+			  vhcr->op, slave, vhcr->in_modifier);
+		vhcr_cmd->status = CMD_STAT_BAD_OP;
+		goto out_status;
+	}
+
+	/* Allocate outbox */
+	if (cmd->has_outbox) {
+		outbox = mlx4_alloc_cmd_mailbox(dev);
+		if (IS_ERR(outbox)) {
+			vhcr_cmd->status = CMD_STAT_BAD_SIZE;
+			outbox = NULL;
+			goto out_status;
+		}
+	}
+
+	/* Execute the command! */
+	if (cmd->wrapper) {
+		err = cmd->wrapper(dev, slave, vhcr, inbox, outbox,
+				   cmd);
+		if (cmd->out_is_imm)
+			vhcr_cmd->out_param = cpu_to_be64(vhcr->out_param);
+	} else {
+		in_param = cmd->has_inbox ? (u64) inbox->dma :
+			vhcr->in_param;
+		out_param = cmd->has_outbox ? (u64) outbox->dma :
+			vhcr->out_param;
+		err = __mlx4_cmd(dev, in_param, &out_param,
+				 cmd->out_is_imm, vhcr->in_modifier,
+				 vhcr->op_modifier, vhcr->op,
+				 MLX4_CMD_TIME_CLASS_A,
+				 MLX4_CMD_NATIVE);
+
+		if (cmd->out_is_imm) {
+			vhcr->out_param = out_param;
+			vhcr_cmd->out_param = cpu_to_be64(vhcr->out_param);
+		}
+	}
+
+	if (err) {
+		mlx4_warn(dev, "vhcr command:0x%x slave:%d failed with error:%d, status %d\n",
+			  vhcr->op, slave, vhcr->errno, err);
+		vhcr_cmd->status = mlx4_errno_to_status(err);
+		goto out_status;
+	}
+
+
+	/* Write outbox if command completed successfully */
+	if (cmd->has_outbox && !vhcr_cmd->status) {
+		ret = mlx4_ACCESS_MEM(dev, outbox->dma, slave,
+				      vhcr->out_param,
+				      MLX4_MAILBOX_SIZE, MLX4_CMD_WRAPPED);
+		if (ret) {
+			/* If we failed to write back the outbox after the
+			 *command was successfully executed, we must fail this
+			 * slave, as it is now in undefined state */
+			mlx4_err(dev, "%s:Failed writing outbox\n", __func__);
+			goto out;
+		}
+	}
+
+out_status:
+	/* DMA back vhcr result */
+	if (!in_vhcr) {
+		ret = mlx4_ACCESS_MEM(dev, priv->mfunc.vhcr_dma, slave,
+				      priv->mfunc.master.slave_state[slave].vhcr_dma,
+				      ALIGN(sizeof(struct mlx4_vhcr),
+					    MLX4_ACCESS_MEM_ALIGN),
+				      MLX4_CMD_WRAPPED);
+		if (ret)
+			mlx4_err(dev, "%s:Failed writing vhcr result\n",
+				 __func__);
+		else if (vhcr->e_bit &&
+			 mlx4_GEN_EQE(dev, slave, &priv->mfunc.master.cmd_eqe))
+				mlx4_warn(dev, "Failed to generate command completion eqe for slave %d\n",
+					  slave);
+	}
+
+out:
+	kfree(vhcr);
+	mlx4_free_cmd_mailbox(dev, inbox);
+	mlx4_free_cmd_mailbox(dev, outbox);
+	return ret;
+}
+
+static int mlx4_master_immediate_activate_vlan_qos(struct mlx4_priv *priv,
+					    int slave, int port)
+{
+	struct mlx4_vport_oper_state *vp_oper;
+	struct mlx4_vport_state *vp_admin;
+	struct mlx4_vf_immed_vlan_work *work;
+	struct mlx4_dev *dev = &(priv->dev);
+	int err;
+	int admin_vlan_ix = NO_INDX;
+
+	vp_oper = &priv->mfunc.master.vf_oper[slave].vport[port];
+	vp_admin = &priv->mfunc.master.vf_admin[slave].vport[port];
+
+	if (vp_oper->state.default_vlan == vp_admin->default_vlan &&
+	    vp_oper->state.default_qos == vp_admin->default_qos &&
+	    vp_oper->state.link_state == vp_admin->link_state)
+		return 0;
+
+	if (!(priv->mfunc.master.slave_state[slave].active &&
+	      dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_UPDATE_QP)) {
+		/* even if the UPDATE_QP command isn't supported, we still want
+		 * to set this VF link according to the admin directive
+		 */
+		vp_oper->state.link_state = vp_admin->link_state;
+		return -1;
+	}
+
+	mlx4_dbg(dev, "updating immediately admin params slave %d port %d\n",
+		 slave, port);
+	mlx4_dbg(dev, "vlan %d QoS %d link down %d\n",
+		 vp_admin->default_vlan, vp_admin->default_qos,
+		 vp_admin->link_state);
+
+	work = kzalloc(sizeof(*work), GFP_KERNEL);
+	if (!work)
+		return -ENOMEM;
+
+	if (vp_oper->state.default_vlan != vp_admin->default_vlan) {
+		if (MLX4_VGT != vp_admin->default_vlan) {
+			err = __mlx4_register_vlan(&priv->dev, port,
+						   vp_admin->default_vlan,
+						   &admin_vlan_ix);
+			if (err) {
+				kfree(work);
+				mlx4_warn(&priv->dev,
+					  "No vlan resources slave %d, port %d\n",
+					  slave, port);
+				return err;
+			}
+		} else {
+			admin_vlan_ix = NO_INDX;
+		}
+		work->flags |= MLX4_VF_IMMED_VLAN_FLAG_VLAN;
+		mlx4_dbg(&priv->dev,
+			 "alloc vlan %d idx  %d slave %d port %d\n",
+			 (int)(vp_admin->default_vlan),
+			 admin_vlan_ix, slave, port);
+	}
+
+	/* save original vlan ix and vlan id */
+	work->orig_vlan_id = vp_oper->state.default_vlan;
+	work->orig_vlan_ix = vp_oper->vlan_idx;
+
+	/* handle new qos */
+	if (vp_oper->state.default_qos != vp_admin->default_qos)
+		work->flags |= MLX4_VF_IMMED_VLAN_FLAG_QOS;
+
+	if (work->flags & MLX4_VF_IMMED_VLAN_FLAG_VLAN)
+		vp_oper->vlan_idx = admin_vlan_ix;
+
+	vp_oper->state.default_vlan = vp_admin->default_vlan;
+	vp_oper->state.default_qos = vp_admin->default_qos;
+	vp_oper->state.link_state = vp_admin->link_state;
+
+	if (vp_admin->link_state == IFLA_VF_LINK_STATE_DISABLE)
+		work->flags |= MLX4_VF_IMMED_VLAN_FLAG_LINK_DISABLE;
+
+	/* iterate over QPs owned by this slave, using UPDATE_QP */
+	work->port = port;
+	work->slave = slave;
+	work->qos = vp_oper->state.default_qos;
+	work->vlan_id = vp_oper->state.default_vlan;
+	work->vlan_ix = vp_oper->vlan_idx;
+	work->priv = priv;
+	INIT_WORK(&work->work, mlx4_vf_immed_vlan_work_handler);
+	queue_work(priv->mfunc.master.comm_wq, &work->work);
+
+	return 0;
+}
+
+
+static int mlx4_master_activate_admin_state(struct mlx4_priv *priv, int slave)
+{
+	int port, err;
+	struct mlx4_vport_state *vp_admin;
+	struct mlx4_vport_oper_state *vp_oper;
+	struct mlx4_active_ports actv_ports = mlx4_get_active_ports(
+			&priv->dev, slave);
+	int min_port = find_first_bit(actv_ports.ports,
+				      priv->dev.caps.num_ports) + 1;
+	int max_port = min_port - 1 +
+		bitmap_weight(actv_ports.ports, priv->dev.caps.num_ports);
+
+	for (port = min_port; port <= max_port; port++) {
+		if (!test_bit(port - 1, actv_ports.ports))
+			continue;
+		priv->mfunc.master.vf_oper[slave].smi_enabled[port] =
+			priv->mfunc.master.vf_admin[slave].enable_smi[port];
+		vp_oper = &priv->mfunc.master.vf_oper[slave].vport[port];
+		vp_admin = &priv->mfunc.master.vf_admin[slave].vport[port];
+		vp_oper->state = *vp_admin;
+		if (MLX4_VGT != vp_admin->default_vlan) {
+			err = __mlx4_register_vlan(&priv->dev, port,
+						   vp_admin->default_vlan, &(vp_oper->vlan_idx));
+			if (err) {
+				vp_oper->vlan_idx = NO_INDX;
+				mlx4_warn(&priv->dev,
+					  "No vlan resources slave %d, port %d\n",
+					  slave, port);
+				return err;
+			}
+			mlx4_dbg(&priv->dev, "alloc vlan %d idx  %d slave %d port %d\n",
+				 (int)(vp_oper->state.default_vlan),
+				 vp_oper->vlan_idx, slave, port);
+		}
+		if (vp_admin->spoofchk) {
+			vp_oper->mac_idx = __mlx4_register_mac(&priv->dev,
+							       port,
+							       vp_admin->mac);
+			if (0 > vp_oper->mac_idx) {
+				err = vp_oper->mac_idx;
+				vp_oper->mac_idx = NO_INDX;
+				mlx4_warn(&priv->dev,
+					  "No mac resources slave %d, port %d\n",
+					  slave, port);
+				return err;
+			}
+			mlx4_dbg(&priv->dev, "alloc mac %llx idx  %d slave %d port %d\n",
+				 vp_oper->state.mac, vp_oper->mac_idx, slave, port);
+		}
+	}
+	return 0;
+}
+
+static void mlx4_master_deactivate_admin_state(struct mlx4_priv *priv, int slave)
+{
+	int port;
+	struct mlx4_vport_oper_state *vp_oper;
+	struct mlx4_active_ports actv_ports = mlx4_get_active_ports(
+			&priv->dev, slave);
+	int min_port = find_first_bit(actv_ports.ports,
+				      priv->dev.caps.num_ports) + 1;
+	int max_port = min_port - 1 +
+		bitmap_weight(actv_ports.ports, priv->dev.caps.num_ports);
+
+
+	for (port = min_port; port <= max_port; port++) {
+		if (!test_bit(port - 1, actv_ports.ports))
+			continue;
+		priv->mfunc.master.vf_oper[slave].smi_enabled[port] =
+			MLX4_VF_SMI_DISABLED;
+		vp_oper = &priv->mfunc.master.vf_oper[slave].vport[port];
+		if (NO_INDX != vp_oper->vlan_idx) {
+			__mlx4_unregister_vlan(&priv->dev,
+					       port, vp_oper->state.default_vlan);
+			vp_oper->vlan_idx = NO_INDX;
+		}
+		if (NO_INDX != vp_oper->mac_idx) {
+			__mlx4_unregister_mac(&priv->dev, port, vp_oper->state.mac);
+			vp_oper->mac_idx = NO_INDX;
+		}
+	}
+	return;
+}
+
+static void mlx4_master_do_cmd(struct mlx4_dev *dev, int slave, u8 cmd,
+			       u16 param, u8 toggle)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_slave_state *slave_state = priv->mfunc.master.slave_state;
+	u32 reply;
+	u8 is_going_down = 0;
+	int i;
+	unsigned long flags;
+
+	slave_state[slave].comm_toggle ^= 1;
+	reply = (u32) slave_state[slave].comm_toggle << 31;
+	if (toggle != slave_state[slave].comm_toggle) {
+		mlx4_warn(dev, "Incorrect toggle %d from slave %d. *** MASTER STATE COMPROMISED ***\n",
+			  toggle, slave);
+		goto reset_slave;
+	}
+	if (cmd == MLX4_COMM_CMD_RESET) {
+		mlx4_warn(dev, "Received reset from slave:%d\n", slave);
+		slave_state[slave].active = false;
+		slave_state[slave].old_vlan_api = false;
+		mlx4_master_deactivate_admin_state(priv, slave);
+		for (i = 0; i < MLX4_EVENT_TYPES_NUM; ++i) {
+				slave_state[slave].event_eq[i].eqn = -1;
+				slave_state[slave].event_eq[i].token = 0;
+		}
+		/*check if we are in the middle of FLR process,
+		if so return "retry" status to the slave*/
+		if (MLX4_COMM_CMD_FLR == slave_state[slave].last_cmd)
+			goto inform_slave_state;
+
+		mlx4_dispatch_event(dev, MLX4_DEV_EVENT_SLAVE_SHUTDOWN, slave);
+
+		/* write the version in the event field */
+		reply |= mlx4_comm_get_version();
+
+		goto reset_slave;
+	}
+	/*command from slave in the middle of FLR*/
+	if (cmd != MLX4_COMM_CMD_RESET &&
+	    MLX4_COMM_CMD_FLR == slave_state[slave].last_cmd) {
+		mlx4_warn(dev, "slave:%d is Trying to run cmd(0x%x) in the middle of FLR\n",
+			  slave, cmd);
+		return;
+	}
+
+	switch (cmd) {
+	case MLX4_COMM_CMD_VHCR0:
+		if (slave_state[slave].last_cmd != MLX4_COMM_CMD_RESET)
+			goto reset_slave;
+		slave_state[slave].vhcr_dma = ((u64) param) << 48;
+		priv->mfunc.master.slave_state[slave].cookie = 0;
+		mutex_init(&priv->mfunc.master.gen_eqe_mutex[slave]);
+		break;
+	case MLX4_COMM_CMD_VHCR1:
+		if (slave_state[slave].last_cmd != MLX4_COMM_CMD_VHCR0)
+			goto reset_slave;
+		slave_state[slave].vhcr_dma |= ((u64) param) << 32;
+		break;
+	case MLX4_COMM_CMD_VHCR2:
+		if (slave_state[slave].last_cmd != MLX4_COMM_CMD_VHCR1)
+			goto reset_slave;
+		slave_state[slave].vhcr_dma |= ((u64) param) << 16;
+		break;
+	case MLX4_COMM_CMD_VHCR_EN:
+		if (slave_state[slave].last_cmd != MLX4_COMM_CMD_VHCR2)
+			goto reset_slave;
+		slave_state[slave].vhcr_dma |= param;
+		if (mlx4_master_activate_admin_state(priv, slave))
+				goto reset_slave;
+		slave_state[slave].active = true;
+		mlx4_dispatch_event(dev, MLX4_DEV_EVENT_SLAVE_INIT, slave);
+		break;
+	case MLX4_COMM_CMD_VHCR_POST:
+		if ((slave_state[slave].last_cmd != MLX4_COMM_CMD_VHCR_EN) &&
+		    (slave_state[slave].last_cmd != MLX4_COMM_CMD_VHCR_POST))
+			goto reset_slave;
+
+		mutex_lock(&priv->cmd.slave_cmd_mutex);
+		if (mlx4_master_process_vhcr(dev, slave, NULL)) {
+			mlx4_err(dev, "Failed processing vhcr for slave:%d, resetting slave\n",
+				 slave);
+			mutex_unlock(&priv->cmd.slave_cmd_mutex);
+			goto reset_slave;
+		}
+		mutex_unlock(&priv->cmd.slave_cmd_mutex);
+		break;
+	default:
+		mlx4_warn(dev, "Bad comm cmd:%d from slave:%d\n", cmd, slave);
+		goto reset_slave;
+	}
+	spin_lock_irqsave(&priv->mfunc.master.slave_state_lock, flags);
+	if (!slave_state[slave].is_slave_going_down)
+		slave_state[slave].last_cmd = cmd;
+	else
+		is_going_down = 1;
+	spin_unlock_irqrestore(&priv->mfunc.master.slave_state_lock, flags);
+	if (is_going_down) {
+		mlx4_warn(dev, "Slave is going down aborting command(%d) executing from slave:%d\n",
+			  cmd, slave);
+		return;
+	}
+	__raw_writel((__force u32) cpu_to_be32(reply),
+		     &priv->mfunc.comm[slave].slave_read);
+	mmiowb();
+
+	return;
+
+reset_slave:
+	/* cleanup any slave resources */
+	mlx4_delete_all_resources_for_slave(dev, slave);
+	spin_lock_irqsave(&priv->mfunc.master.slave_state_lock, flags);
+	if (!slave_state[slave].is_slave_going_down)
+		slave_state[slave].last_cmd = MLX4_COMM_CMD_RESET;
+	spin_unlock_irqrestore(&priv->mfunc.master.slave_state_lock, flags);
+	/*with slave in the middle of flr, no need to clean resources again.*/
+inform_slave_state:
+	memset(&slave_state[slave].event_eq, 0,
+	       sizeof(struct mlx4_slave_event_eq_info));
+	__raw_writel((__force u32) cpu_to_be32(reply),
+		     &priv->mfunc.comm[slave].slave_read);
+	wmb();
+}
+
+/* master command processing */
+void mlx4_master_comm_channel(struct work_struct *work)
+{
+	struct mlx4_mfunc_master_ctx *master =
+		container_of(work,
+			     struct mlx4_mfunc_master_ctx,
+			     comm_work);
+	struct mlx4_mfunc *mfunc =
+		container_of(master, struct mlx4_mfunc, master);
+	struct mlx4_priv *priv =
+		container_of(mfunc, struct mlx4_priv, mfunc);
+	struct mlx4_dev *dev = &priv->dev;
+	__be32 *bit_vec;
+	u32 comm_cmd;
+	u32 vec;
+	int i, j, slave;
+	int toggle;
+	int served = 0;
+	int reported = 0;
+	u32 slt;
+
+	bit_vec = master->comm_arm_bit_vector;
+	for (i = 0; i < COMM_CHANNEL_BIT_ARRAY_SIZE; i++) {
+		vec = be32_to_cpu(bit_vec[i]);
+		for (j = 0; j < 32; j++) {
+			if (!(vec & (1 << j)))
+				continue;
+			++reported;
+			slave = (i * 32) + j;
+			comm_cmd = swab32(readl(
+					  &mfunc->comm[slave].slave_write));
+			slt = swab32(readl(&mfunc->comm[slave].slave_read))
+				     >> 31;
+			toggle = comm_cmd >> 31;
+			if (toggle != slt) {
+				if (master->slave_state[slave].comm_toggle
+				    != slt) {
+					pr_info("slave %d out of sync. read toggle %d, state toggle %d. Resynching.\n",
+						slave, slt,
+						master->slave_state[slave].comm_toggle);
+					master->slave_state[slave].comm_toggle =
+						slt;
+				}
+				mlx4_master_do_cmd(dev, slave,
+						   comm_cmd >> 16 & 0xff,
+						   comm_cmd & 0xffff, toggle);
+				++served;
+			}
+		}
+	}
+
+	if (reported && reported != served)
+		mlx4_warn(dev, "Got command event with bitmask from %d slaves but %d were served\n",
+			  reported, served);
+
+	if (mlx4_ARM_COMM_CHANNEL(dev))
+		mlx4_warn(dev, "Failed to arm comm channel events\n");
+}
+
+static int sync_toggles(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int wr_toggle;
+	int rd_toggle;
+	unsigned long end;
+
+	wr_toggle = swab32(readl(&priv->mfunc.comm->slave_write)) >> 31;
+	end = jiffies + msecs_to_jiffies(5000);
+
+	while (time_before(jiffies, end)) {
+		rd_toggle = swab32(readl(&priv->mfunc.comm->slave_read)) >> 31;
+		if (rd_toggle == wr_toggle) {
+			priv->cmd.comm_toggle = rd_toggle;
+			return 0;
+		}
+
+		cond_resched();
+	}
+
+	/*
+	 * we could reach here if for example the previous VM using this
+	 * function misbehaved and left the channel with unsynced state. We
+	 * should fix this here and give this VM a chance to use a properly
+	 * synced channel
+	 */
+	mlx4_warn(dev, "recovering from previously mis-behaved VM\n");
+	__raw_writel((__force u32) 0, &priv->mfunc.comm->slave_read);
+	__raw_writel((__force u32) 0, &priv->mfunc.comm->slave_write);
+	priv->cmd.comm_toggle = 0;
+
+	return 0;
+}
+
+int mlx4_multi_func_init(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_slave_state *s_state;
+	int i, j, err, port;
+
+	if (mlx4_is_master(dev))
+		priv->mfunc.comm =
+		ioremap(pci_resource_start(dev->pdev, priv->fw.comm_bar) +
+			priv->fw.comm_base, MLX4_COMM_PAGESIZE);
+	else
+		priv->mfunc.comm =
+		ioremap(pci_resource_start(dev->pdev, 2) +
+			MLX4_SLAVE_COMM_BASE, MLX4_COMM_PAGESIZE);
+	if (!priv->mfunc.comm) {
+		mlx4_err(dev, "Couldn't map communication vector\n");
+		goto err_vhcr;
+	}
+
+	if (mlx4_is_master(dev)) {
+		priv->mfunc.master.slave_state =
+			kzalloc(dev->num_slaves *
+				sizeof(struct mlx4_slave_state), GFP_KERNEL);
+		if (!priv->mfunc.master.slave_state)
+			goto err_comm;
+
+		priv->mfunc.master.vf_admin =
+			kzalloc(dev->num_slaves *
+				sizeof(struct mlx4_vf_admin_state), GFP_KERNEL);
+		if (!priv->mfunc.master.vf_admin)
+			goto err_comm_admin;
+
+		priv->mfunc.master.vf_oper =
+			kzalloc(dev->num_slaves *
+				sizeof(struct mlx4_vf_oper_state), GFP_KERNEL);
+		if (!priv->mfunc.master.vf_oper)
+			goto err_comm_oper;
+
+		for (i = 0; i < dev->num_slaves; ++i) {
+			s_state = &priv->mfunc.master.slave_state[i];
+			s_state->last_cmd = MLX4_COMM_CMD_RESET;
+			for (j = 0; j < MLX4_EVENT_TYPES_NUM; ++j)
+				s_state->event_eq[j].eqn = -1;
+			__raw_writel((__force u32) 0,
+				     &priv->mfunc.comm[i].slave_write);
+			__raw_writel((__force u32) 0,
+				     &priv->mfunc.comm[i].slave_read);
+			mmiowb();
+			for (port = 1; port <= MLX4_MAX_PORTS; port++) {
+				s_state->vlan_filter[port] =
+					kzalloc(sizeof(struct mlx4_vlan_fltr),
+						GFP_KERNEL);
+				if (!s_state->vlan_filter[port]) {
+					if (--port)
+						kfree(s_state->vlan_filter[port]);
+					goto err_slaves;
+				}
+				INIT_LIST_HEAD(&s_state->mcast_filters[port]);
+				priv->mfunc.master.vf_admin[i].vport[port].default_vlan = MLX4_VGT;
+				priv->mfunc.master.vf_oper[i].vport[port].state.default_vlan = MLX4_VGT;
+				priv->mfunc.master.vf_oper[i].vport[port].vlan_idx = NO_INDX;
+				priv->mfunc.master.vf_oper[i].vport[port].mac_idx = NO_INDX;
+			}
+			spin_lock_init(&s_state->lock);
+		}
+
+		memset(&priv->mfunc.master.cmd_eqe, 0, dev->caps.eqe_size);
+		priv->mfunc.master.cmd_eqe.type = MLX4_EVENT_TYPE_CMD;
+		INIT_WORK(&priv->mfunc.master.comm_work,
+			  mlx4_master_comm_channel);
+		INIT_WORK(&priv->mfunc.master.slave_event_work,
+			  mlx4_gen_slave_eqe);
+		INIT_WORK(&priv->mfunc.master.slave_flr_event_work,
+			  mlx4_master_handle_slave_flr);
+		spin_lock_init(&priv->mfunc.master.slave_state_lock);
+		spin_lock_init(&priv->mfunc.master.slave_eq.event_lock);
+		priv->mfunc.master.comm_wq =
+			create_singlethread_workqueue("mlx4_comm");
+		if (!priv->mfunc.master.comm_wq)
+			goto err_slaves;
+
+		if (mlx4_init_resource_tracker(dev))
+			goto err_thread;
+
+		err = mlx4_ARM_COMM_CHANNEL(dev);
+		if (err) {
+			mlx4_err(dev, " Failed to arm comm channel eq: %x\n",
+				 err);
+			goto err_resource;
+		}
+
+	} else {
+		err = sync_toggles(dev);
+		if (err) {
+			mlx4_err(dev, "Couldn't sync toggles\n");
+			goto err_comm;
+		}
+	}
+	return 0;
+
+err_resource:
+	mlx4_free_resource_tracker(dev, RES_TR_FREE_ALL);
+err_thread:
+	flush_workqueue(priv->mfunc.master.comm_wq);
+	destroy_workqueue(priv->mfunc.master.comm_wq);
+err_slaves:
+	while (--i) {
+		for (port = 1; port <= MLX4_MAX_PORTS; port++)
+			kfree(priv->mfunc.master.slave_state[i].vlan_filter[port]);
+	}
+	kfree(priv->mfunc.master.vf_oper);
+err_comm_oper:
+	kfree(priv->mfunc.master.vf_admin);
+err_comm_admin:
+	kfree(priv->mfunc.master.slave_state);
+err_comm:
+	iounmap(priv->mfunc.comm);
+err_vhcr:
+	dma_free_coherent(&(dev->pdev->dev), PAGE_SIZE,
+					     priv->mfunc.vhcr,
+					     priv->mfunc.vhcr_dma);
+	priv->mfunc.vhcr = NULL;
+	return -ENOMEM;
+}
+
+int mlx4_cmd_init(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	mutex_init(&priv->cmd.hcr_mutex);
+	mutex_init(&priv->cmd.slave_cmd_mutex);
+	sema_init(&priv->cmd.poll_sem, 1);
+	priv->cmd.use_events = 0;
+	priv->cmd.toggle     = 1;
+
+	priv->cmd.hcr = NULL;
+	priv->mfunc.vhcr = NULL;
+
+	if (!mlx4_is_slave(dev)) {
+		priv->cmd.hcr = ioremap(pci_resource_start(dev->pdev, 0) +
+					MLX4_HCR_BASE, MLX4_HCR_SIZE);
+		if (!priv->cmd.hcr) {
+			mlx4_err(dev, "Couldn't map command register\n");
+			return -ENOMEM;
+		}
+	}
+
+	if (mlx4_is_mfunc(dev)) {
+		priv->mfunc.vhcr = dma_alloc_coherent(&(dev->pdev->dev), PAGE_SIZE,
+						      &priv->mfunc.vhcr_dma,
+						      GFP_KERNEL);
+		if (!priv->mfunc.vhcr)
+			goto err_hcr;
+	}
+
+	priv->cmd.pool = pci_pool_create("mlx4_cmd", dev->pdev,
+					 MLX4_MAILBOX_SIZE,
+					 MLX4_MAILBOX_SIZE, 0);
+	if (!priv->cmd.pool)
+		goto err_vhcr;
+
+	return 0;
+
+err_vhcr:
+	if (mlx4_is_mfunc(dev))
+		dma_free_coherent(&(dev->pdev->dev), PAGE_SIZE,
+				  priv->mfunc.vhcr, priv->mfunc.vhcr_dma);
+	priv->mfunc.vhcr = NULL;
+
+err_hcr:
+	if (!mlx4_is_slave(dev))
+		iounmap(priv->cmd.hcr);
+	return -ENOMEM;
+}
+
+void mlx4_multi_func_cleanup(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int i, port;
+
+	if (mlx4_is_master(dev)) {
+		flush_workqueue(priv->mfunc.master.comm_wq);
+		destroy_workqueue(priv->mfunc.master.comm_wq);
+		for (i = 0; i < dev->num_slaves; i++) {
+			for (port = 1; port <= MLX4_MAX_PORTS; port++)
+				kfree(priv->mfunc.master.slave_state[i].vlan_filter[port]);
+		}
+		kfree(priv->mfunc.master.slave_state);
+		kfree(priv->mfunc.master.vf_admin);
+		kfree(priv->mfunc.master.vf_oper);
+	}
+
+	iounmap(priv->mfunc.comm);
+}
+
+void mlx4_cmd_cleanup(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	pci_pool_destroy(priv->cmd.pool);
+
+	if (!mlx4_is_slave(dev))
+		iounmap(priv->cmd.hcr);
+	if (mlx4_is_mfunc(dev))
+		dma_free_coherent(&(dev->pdev->dev), PAGE_SIZE,
+				  priv->mfunc.vhcr, priv->mfunc.vhcr_dma);
+	priv->mfunc.vhcr = NULL;
+}
+
+/*
+ * Switch to using events to issue FW commands (can only be called
+ * after event queue for command events has been initialized).
+ */
+int mlx4_cmd_use_events(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int i;
+	int err = 0;
+
+	priv->cmd.context = kmalloc(priv->cmd.max_cmds *
+				   sizeof (struct mlx4_cmd_context),
+				   GFP_KERNEL);
+	if (!priv->cmd.context)
+		return -ENOMEM;
+
+	for (i = 0; i < priv->cmd.max_cmds; ++i) {
+		priv->cmd.context[i].token = i;
+		priv->cmd.context[i].next  = i + 1;
+	}
+
+	priv->cmd.context[priv->cmd.max_cmds - 1].next = -1;
+	priv->cmd.free_head = 0;
+
+	sema_init(&priv->cmd.event_sem, priv->cmd.max_cmds);
+	spin_lock_init(&priv->cmd.context_lock);
+
+	for (priv->cmd.token_mask = 1;
+	     priv->cmd.token_mask < priv->cmd.max_cmds;
+	     priv->cmd.token_mask <<= 1)
+		; /* nothing */
+	--priv->cmd.token_mask;
+
+	down(&priv->cmd.poll_sem);
+	priv->cmd.use_events = 1;
+
+	return err;
+}
+
+/*
+ * Switch back to polling (used when shutting down the device)
+ */
+void mlx4_cmd_use_polling(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int i;
+
+	priv->cmd.use_events = 0;
+
+	for (i = 0; i < priv->cmd.max_cmds; ++i)
+		down(&priv->cmd.event_sem);
+
+	kfree(priv->cmd.context);
+
+	up(&priv->cmd.poll_sem);
+}
+
+struct mlx4_cmd_mailbox *mlx4_alloc_cmd_mailbox(struct mlx4_dev *dev)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+
+	mailbox = kmalloc(sizeof *mailbox, GFP_KERNEL);
+	if (!mailbox)
+		return ERR_PTR(-ENOMEM);
+
+	mailbox->buf = pci_pool_alloc(mlx4_priv(dev)->cmd.pool, GFP_KERNEL,
+				      &mailbox->dma);
+	if (!mailbox->buf) {
+		kfree(mailbox);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	memset(mailbox->buf, 0, MLX4_MAILBOX_SIZE);
+
+	return mailbox;
+}
+EXPORT_SYMBOL_GPL(mlx4_alloc_cmd_mailbox);
+
+void mlx4_free_cmd_mailbox(struct mlx4_dev *dev,
+			   struct mlx4_cmd_mailbox *mailbox)
+{
+	if (!mailbox)
+		return;
+
+	pci_pool_free(mlx4_priv(dev)->cmd.pool, mailbox->buf, mailbox->dma);
+	kfree(mailbox);
+}
+EXPORT_SYMBOL_GPL(mlx4_free_cmd_mailbox);
+
+u32 mlx4_comm_get_version(void)
+{
+	 return ((u32) CMD_CHAN_IF_REV << 8) | (u32) CMD_CHAN_VER;
+}
+
+static int mlx4_get_slave_indx(struct mlx4_dev *dev, int vf)
+{
+	if ((vf < 0) || (vf >= dev->num_vfs)) {
+		mlx4_err(dev, "Bad vf number:%d (number of activated vf: %d)\n", vf, dev->num_vfs);
+		return -EINVAL;
+	}
+
+	return vf+1;
+}
+
+int mlx4_get_vf_indx(struct mlx4_dev *dev, int slave)
+{
+	if (slave < 1 || slave > dev->num_vfs) {
+		mlx4_err(dev,
+			 "Bad slave number:%d (number of activated slaves: %lu)\n",
+			 slave, dev->num_slaves);
+		return -EINVAL;
+	}
+	return slave - 1;
+}
+
+struct mlx4_active_ports mlx4_get_active_ports(struct mlx4_dev *dev, int slave)
+{
+	struct mlx4_active_ports actv_ports;
+	int vf;
+
+	bitmap_zero(actv_ports.ports, MLX4_MAX_PORTS);
+
+	if (slave == 0) {
+		bitmap_fill(actv_ports.ports, dev->caps.num_ports);
+		return actv_ports;
+	}
+
+	vf = mlx4_get_vf_indx(dev, slave);
+	if (vf < 0)
+		return actv_ports;
+
+	bitmap_set(actv_ports.ports, dev->dev_vfs[vf].min_port - 1,
+		   min((int)dev->dev_vfs[mlx4_get_vf_indx(dev, slave)].n_ports,
+		   dev->caps.num_ports));
+
+	return actv_ports;
+}
+EXPORT_SYMBOL_GPL(mlx4_get_active_ports);
+
+int mlx4_slave_convert_port(struct mlx4_dev *dev, int slave, int port)
+{
+	unsigned n;
+	struct mlx4_active_ports actv_ports = mlx4_get_active_ports(dev, slave);
+	unsigned m = bitmap_weight(actv_ports.ports, dev->caps.num_ports);
+
+	if (port <= 0 || port > m)
+		return -EINVAL;
+
+	n = find_first_bit(actv_ports.ports, dev->caps.num_ports);
+	if (port <= n)
+		port = n + 1;
+
+	return port;
+}
+EXPORT_SYMBOL_GPL(mlx4_slave_convert_port);
+
+int mlx4_phys_to_slave_port(struct mlx4_dev *dev, int slave, int port)
+{
+	struct mlx4_active_ports actv_ports = mlx4_get_active_ports(dev, slave);
+	if (test_bit(port - 1, actv_ports.ports))
+		return port -
+			find_first_bit(actv_ports.ports, dev->caps.num_ports);
+
+	return -1;
+}
+EXPORT_SYMBOL_GPL(mlx4_phys_to_slave_port);
+
+struct mlx4_slaves_pport mlx4_phys_to_slaves_pport(struct mlx4_dev *dev,
+						   int port)
+{
+	unsigned i;
+	struct mlx4_slaves_pport slaves_pport;
+
+	bitmap_zero(slaves_pport.slaves, MLX4_MFUNC_MAX);
+
+	if (port <= 0 || port > dev->caps.num_ports)
+		return slaves_pport;
+
+	for (i = 0; i < dev->num_vfs + 1; i++) {
+		struct mlx4_active_ports actv_ports =
+			mlx4_get_active_ports(dev, i);
+		if (test_bit(port - 1, actv_ports.ports))
+			set_bit(i, slaves_pport.slaves);
+	}
+
+	return slaves_pport;
+}
+EXPORT_SYMBOL_GPL(mlx4_phys_to_slaves_pport);
+
+struct mlx4_slaves_pport mlx4_phys_to_slaves_pport_actv(
+		struct mlx4_dev *dev,
+		const struct mlx4_active_ports *crit_ports)
+{
+	unsigned i;
+	struct mlx4_slaves_pport slaves_pport;
+
+	bitmap_zero(slaves_pport.slaves, MLX4_MFUNC_MAX);
+
+	for (i = 0; i < dev->num_vfs + 1; i++) {
+		struct mlx4_active_ports actv_ports =
+			mlx4_get_active_ports(dev, i);
+		if (bitmap_equal(crit_ports->ports, actv_ports.ports,
+				 dev->caps.num_ports))
+			set_bit(i, slaves_pport.slaves);
+	}
+
+	return slaves_pport;
+}
+EXPORT_SYMBOL_GPL(mlx4_phys_to_slaves_pport_actv);
+
+static int mlx4_slaves_closest_port(struct mlx4_dev *dev, int slave, int port)
+{
+	struct mlx4_active_ports actv_ports = mlx4_get_active_ports(dev, slave);
+	int min_port = find_first_bit(actv_ports.ports, dev->caps.num_ports)
+			+ 1;
+	int max_port = min_port +
+		bitmap_weight(actv_ports.ports, dev->caps.num_ports);
+
+	if (port < min_port)
+		port = min_port;
+	else if (port >= max_port)
+		port = max_port - 1;
+
+	return port;
+}
+
+int mlx4_set_vf_mac(struct mlx4_dev *dev, int port, int vf, u64 mac)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_vport_state *s_info;
+	int slave;
+
+	if (!mlx4_is_master(dev))
+		return -EPROTONOSUPPORT;
+
+	slave = mlx4_get_slave_indx(dev, vf);
+	if (slave < 0)
+		return -EINVAL;
+
+	port = mlx4_slaves_closest_port(dev, slave, port);
+	s_info = &priv->mfunc.master.vf_admin[slave].vport[port];
+	s_info->mac = mac;
+	mlx4_info(dev, "default mac on vf %d port %d to %llX will take afect only after vf restart\n",
+		  vf, port, s_info->mac);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_set_vf_mac);
+
+
+int mlx4_set_vf_vlan(struct mlx4_dev *dev, int port, int vf, u16 vlan, u8 qos)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_vport_state *vf_admin;
+	int slave;
+
+	if ((!mlx4_is_master(dev)) ||
+	    !(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_VLAN_CONTROL))
+		return -EPROTONOSUPPORT;
+
+	if ((vlan > 4095) || (qos > 7))
+		return -EINVAL;
+
+	slave = mlx4_get_slave_indx(dev, vf);
+	if (slave < 0)
+		return -EINVAL;
+
+	port = mlx4_slaves_closest_port(dev, slave, port);
+	vf_admin = &priv->mfunc.master.vf_admin[slave].vport[port];
+
+	if ((0 == vlan) && (0 == qos))
+		vf_admin->default_vlan = MLX4_VGT;
+	else
+		vf_admin->default_vlan = vlan;
+	vf_admin->default_qos = qos;
+
+	if (mlx4_master_immediate_activate_vlan_qos(priv, slave, port))
+		mlx4_info(dev,
+			  "updating vf %d port %d config will take effect on next VF restart\n",
+			  vf, port);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_set_vf_vlan);
+
+ /* mlx4_get_slave_default_vlan -
+ * return true if VST ( default vlan)
+ * if VST, will return vlan & qos (if not NULL)
+ */
+bool mlx4_get_slave_default_vlan(struct mlx4_dev *dev, int port, int slave,
+				 u16 *vlan, u8 *qos)
+{
+	struct mlx4_vport_oper_state *vp_oper;
+	struct mlx4_priv *priv;
+
+	priv = mlx4_priv(dev);
+	port = mlx4_slaves_closest_port(dev, slave, port);
+	vp_oper = &priv->mfunc.master.vf_oper[slave].vport[port];
+
+	if (MLX4_VGT != vp_oper->state.default_vlan) {
+		if (vlan)
+			*vlan = vp_oper->state.default_vlan;
+		if (qos)
+			*qos = vp_oper->state.default_qos;
+		return true;
+	}
+	return false;
+}
+EXPORT_SYMBOL_GPL(mlx4_get_slave_default_vlan);
+
+int mlx4_set_vf_spoofchk(struct mlx4_dev *dev, int port, int vf, bool setting)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_vport_state *s_info;
+	int slave;
+
+	if ((!mlx4_is_master(dev)) ||
+	    !(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_FSM))
+		return -EPROTONOSUPPORT;
+
+	slave = mlx4_get_slave_indx(dev, vf);
+	if (slave < 0)
+		return -EINVAL;
+
+	port = mlx4_slaves_closest_port(dev, slave, port);
+	s_info = &priv->mfunc.master.vf_admin[slave].vport[port];
+	s_info->spoofchk = setting;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_set_vf_spoofchk);
+
+int mlx4_get_vf_config(struct mlx4_dev *dev, int port, int vf, struct ifla_vf_info *ivf)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_vport_state *s_info;
+	int slave;
+
+	if (!mlx4_is_master(dev))
+		return -EPROTONOSUPPORT;
+
+	slave = mlx4_get_slave_indx(dev, vf);
+	if (slave < 0)
+		return -EINVAL;
+
+	s_info = &priv->mfunc.master.vf_admin[slave].vport[port];
+	ivf->vf = vf;
+
+	/* need to convert it to a func */
+	ivf->mac[0] = ((s_info->mac >> (5*8)) & 0xff);
+	ivf->mac[1] = ((s_info->mac >> (4*8)) & 0xff);
+	ivf->mac[2] = ((s_info->mac >> (3*8)) & 0xff);
+	ivf->mac[3] = ((s_info->mac >> (2*8)) & 0xff);
+	ivf->mac[4] = ((s_info->mac >> (1*8)) & 0xff);
+	ivf->mac[5] = ((s_info->mac)  & 0xff);
+
+	ivf->vlan		= s_info->default_vlan;
+	ivf->qos		= s_info->default_qos;
+	ivf->max_tx_rate	= s_info->tx_rate;
+	ivf->min_tx_rate	= 0;
+	ivf->spoofchk		= s_info->spoofchk;
+	ivf->linkstate		= s_info->link_state;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_get_vf_config);
+
+int mlx4_set_vf_link_state(struct mlx4_dev *dev, int port, int vf, int link_state)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_vport_state *s_info;
+	int slave;
+	u8 link_stat_event;
+
+	slave = mlx4_get_slave_indx(dev, vf);
+	if (slave < 0)
+		return -EINVAL;
+
+	port = mlx4_slaves_closest_port(dev, slave, port);
+	switch (link_state) {
+	case IFLA_VF_LINK_STATE_AUTO:
+		/* get current link state */
+		if (!priv->sense.do_sense_port[port])
+			link_stat_event = MLX4_PORT_CHANGE_SUBTYPE_ACTIVE;
+		else
+			link_stat_event = MLX4_PORT_CHANGE_SUBTYPE_DOWN;
+	    break;
+
+	case IFLA_VF_LINK_STATE_ENABLE:
+		link_stat_event = MLX4_PORT_CHANGE_SUBTYPE_ACTIVE;
+	    break;
+
+	case IFLA_VF_LINK_STATE_DISABLE:
+		link_stat_event = MLX4_PORT_CHANGE_SUBTYPE_DOWN;
+	    break;
+
+	default:
+		mlx4_warn(dev, "unknown value for link_state %02x on slave %d port %d\n",
+			  link_state, slave, port);
+		return -EINVAL;
+	};
+	s_info = &priv->mfunc.master.vf_admin[slave].vport[port];
+	s_info->link_state = link_state;
+
+	/* send event */
+	mlx4_gen_port_state_change_eqe(dev, slave, port, link_stat_event);
+
+	if (mlx4_master_immediate_activate_vlan_qos(priv, slave, port))
+		mlx4_dbg(dev,
+			 "updating vf %d port %d no link state HW enforcment\n",
+			 vf, port);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_set_vf_link_state);
+
+int mlx4_vf_smi_enabled(struct mlx4_dev *dev, int slave, int port)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	if (slave < 1 || slave >= dev->num_slaves ||
+	    port < 1 || port > MLX4_MAX_PORTS)
+		return 0;
+
+	return priv->mfunc.master.vf_oper[slave].smi_enabled[port] ==
+		MLX4_VF_SMI_ENABLED;
+}
+EXPORT_SYMBOL_GPL(mlx4_vf_smi_enabled);
+
+int mlx4_vf_get_enable_smi_admin(struct mlx4_dev *dev, int slave, int port)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	if (slave == mlx4_master_func_num(dev))
+		return 1;
+
+	if (slave < 1 || slave >= dev->num_slaves ||
+	    port < 1 || port > MLX4_MAX_PORTS)
+		return 0;
+
+	return priv->mfunc.master.vf_admin[slave].enable_smi[port] ==
+		MLX4_VF_SMI_ENABLED;
+}
+EXPORT_SYMBOL_GPL(mlx4_vf_get_enable_smi_admin);
+
+int mlx4_vf_set_enable_smi_admin(struct mlx4_dev *dev, int slave, int port,
+				 int enabled)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	if (slave == mlx4_master_func_num(dev))
+		return 0;
+
+	if (slave < 1 || slave >= dev->num_slaves ||
+	    port < 1 || port > MLX4_MAX_PORTS ||
+	    enabled < 0 || enabled > 1)
+		return -EINVAL;
+
+	priv->mfunc.master.vf_admin[slave].enable_smi[port] = enabled;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_vf_set_enable_smi_admin);
diff --git a/drivers/net/ethernet/mellanox/mlx4/cq.c b/drivers/net/ethernet/mellanox/mlx4/cq.c
new file mode 100644
index 0000000..56022d6
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/cq.c
@@ -0,0 +1,359 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/hardirq.h>
+#include <linux/export.h>
+
+#include <linux/mlx4/cmd.h>
+#include <linux/mlx4/cq.h>
+
+#include "mlx4.h"
+#include "icm.h"
+
+#define MLX4_CQ_STATUS_OK		( 0 << 28)
+#define MLX4_CQ_STATUS_OVERFLOW		( 9 << 28)
+#define MLX4_CQ_STATUS_WRITE_FAIL	(10 << 28)
+#define MLX4_CQ_FLAG_CC			( 1 << 18)
+#define MLX4_CQ_FLAG_OI			( 1 << 17)
+#define MLX4_CQ_STATE_ARMED		( 9 <<  8)
+#define MLX4_CQ_STATE_ARMED_SOL		( 6 <<  8)
+#define MLX4_EQ_STATE_FIRED		(10 <<  8)
+
+void mlx4_cq_completion(struct mlx4_dev *dev, u32 cqn)
+{
+	struct mlx4_cq *cq;
+
+	cq = radix_tree_lookup(&mlx4_priv(dev)->cq_table.tree,
+			       cqn & (dev->caps.num_cqs - 1));
+	if (!cq) {
+		mlx4_dbg(dev, "Completion event for bogus CQ %08x\n", cqn);
+		return;
+	}
+
+	++cq->arm_sn;
+
+	cq->comp(cq);
+}
+
+void mlx4_cq_event(struct mlx4_dev *dev, u32 cqn, int event_type)
+{
+	struct mlx4_cq_table *cq_table = &mlx4_priv(dev)->cq_table;
+	struct mlx4_cq *cq;
+
+	spin_lock(&cq_table->lock);
+
+	cq = radix_tree_lookup(&cq_table->tree, cqn & (dev->caps.num_cqs - 1));
+	if (cq)
+		atomic_inc(&cq->refcount);
+
+	spin_unlock(&cq_table->lock);
+
+	if (!cq) {
+		mlx4_warn(dev, "Async event for bogus CQ %08x\n", cqn);
+		return;
+	}
+
+	cq->event(cq, event_type);
+
+	if (atomic_dec_and_test(&cq->refcount))
+		complete(&cq->free);
+}
+
+static int mlx4_SW2HW_CQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox,
+			 int cq_num)
+{
+	return mlx4_cmd(dev, mailbox->dma, cq_num, 0,
+			MLX4_CMD_SW2HW_CQ, MLX4_CMD_TIME_CLASS_A,
+			MLX4_CMD_WRAPPED);
+}
+
+static int mlx4_MODIFY_CQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox,
+			 int cq_num, u32 opmod)
+{
+	return mlx4_cmd(dev, mailbox->dma, cq_num, opmod, MLX4_CMD_MODIFY_CQ,
+			MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED);
+}
+
+static int mlx4_HW2SW_CQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox,
+			 int cq_num)
+{
+	return mlx4_cmd_box(dev, 0, mailbox ? mailbox->dma : 0,
+			    cq_num, mailbox ? 0 : 1, MLX4_CMD_HW2SW_CQ,
+			    MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED);
+}
+
+int mlx4_cq_modify(struct mlx4_dev *dev, struct mlx4_cq *cq,
+		   u16 count, u16 period)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_cq_context *cq_context;
+	int err;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	cq_context = mailbox->buf;
+	cq_context->cq_max_count = cpu_to_be16(count);
+	cq_context->cq_period    = cpu_to_be16(period);
+
+	err = mlx4_MODIFY_CQ(dev, mailbox, cq->cqn, 1);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_cq_modify);
+
+int mlx4_cq_resize(struct mlx4_dev *dev, struct mlx4_cq *cq,
+		   int entries, struct mlx4_mtt *mtt)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_cq_context *cq_context;
+	u64 mtt_addr;
+	int err;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	cq_context = mailbox->buf;
+	cq_context->logsize_usrpage = cpu_to_be32(ilog2(entries) << 24);
+	cq_context->log_page_size   = mtt->page_shift - 12;
+	mtt_addr = mlx4_mtt_addr(dev, mtt);
+	cq_context->mtt_base_addr_h = mtt_addr >> 32;
+	cq_context->mtt_base_addr_l = cpu_to_be32(mtt_addr & 0xffffffff);
+
+	err = mlx4_MODIFY_CQ(dev, mailbox, cq->cqn, 0);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_cq_resize);
+
+int __mlx4_cq_alloc_icm(struct mlx4_dev *dev, int *cqn)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_cq_table *cq_table = &priv->cq_table;
+	int err;
+
+	*cqn = mlx4_bitmap_alloc(&cq_table->bitmap);
+	if (*cqn == -1)
+		return -ENOMEM;
+
+	err = mlx4_table_get(dev, &cq_table->table, *cqn, GFP_KERNEL);
+	if (err)
+		goto err_out;
+
+	err = mlx4_table_get(dev, &cq_table->cmpt_table, *cqn, GFP_KERNEL);
+	if (err)
+		goto err_put;
+	return 0;
+
+err_put:
+	mlx4_table_put(dev, &cq_table->table, *cqn);
+
+err_out:
+	mlx4_bitmap_free(&cq_table->bitmap, *cqn, MLX4_NO_RR);
+	return err;
+}
+
+static int mlx4_cq_alloc_icm(struct mlx4_dev *dev, int *cqn)
+{
+	u64 out_param;
+	int err;
+
+	if (mlx4_is_mfunc(dev)) {
+		err = mlx4_cmd_imm(dev, 0, &out_param, RES_CQ,
+				   RES_OP_RESERVE_AND_MAP, MLX4_CMD_ALLOC_RES,
+				   MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED);
+		if (err)
+			return err;
+		else {
+			*cqn = get_param_l(&out_param);
+			return 0;
+		}
+	}
+	return __mlx4_cq_alloc_icm(dev, cqn);
+}
+
+void __mlx4_cq_free_icm(struct mlx4_dev *dev, int cqn)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_cq_table *cq_table = &priv->cq_table;
+
+	mlx4_table_put(dev, &cq_table->cmpt_table, cqn);
+	mlx4_table_put(dev, &cq_table->table, cqn);
+	mlx4_bitmap_free(&cq_table->bitmap, cqn, MLX4_NO_RR);
+}
+
+static void mlx4_cq_free_icm(struct mlx4_dev *dev, int cqn)
+{
+	u64 in_param = 0;
+	int err;
+
+	if (mlx4_is_mfunc(dev)) {
+		set_param_l(&in_param, cqn);
+		err = mlx4_cmd(dev, in_param, RES_CQ, RES_OP_RESERVE_AND_MAP,
+			       MLX4_CMD_FREE_RES,
+			       MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED);
+		if (err)
+			mlx4_warn(dev, "Failed freeing cq:%d\n", cqn);
+	} else
+		__mlx4_cq_free_icm(dev, cqn);
+}
+
+int mlx4_cq_alloc(struct mlx4_dev *dev, int nent,
+		  struct mlx4_mtt *mtt, struct mlx4_uar *uar, u64 db_rec,
+		  struct mlx4_cq *cq, unsigned vector, int collapsed,
+		  int timestamp_en)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_cq_table *cq_table = &priv->cq_table;
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_cq_context *cq_context;
+	u64 mtt_addr;
+	int err;
+
+	if (vector > dev->caps.num_comp_vectors + dev->caps.comp_pool)
+		return -EINVAL;
+
+	cq->vector = vector;
+
+	err = mlx4_cq_alloc_icm(dev, &cq->cqn);
+	if (err)
+		return err;
+
+	spin_lock_irq(&cq_table->lock);
+	err = radix_tree_insert(&cq_table->tree, cq->cqn, cq);
+	spin_unlock_irq(&cq_table->lock);
+	if (err)
+		goto err_icm;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox)) {
+		err = PTR_ERR(mailbox);
+		goto err_radix;
+	}
+
+	cq_context = mailbox->buf;
+	cq_context->flags	    = cpu_to_be32(!!collapsed << 18);
+	if (timestamp_en)
+		cq_context->flags  |= cpu_to_be32(1 << 19);
+
+	cq_context->logsize_usrpage = cpu_to_be32((ilog2(nent) << 24) | uar->index);
+	cq_context->comp_eqn	    = priv->eq_table.eq[vector].eqn;
+	cq_context->log_page_size   = mtt->page_shift - MLX4_ICM_PAGE_SHIFT;
+
+	mtt_addr = mlx4_mtt_addr(dev, mtt);
+	cq_context->mtt_base_addr_h = mtt_addr >> 32;
+	cq_context->mtt_base_addr_l = cpu_to_be32(mtt_addr & 0xffffffff);
+	cq_context->db_rec_addr     = cpu_to_be64(db_rec);
+
+	err = mlx4_SW2HW_CQ(dev, mailbox, cq->cqn);
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	if (err)
+		goto err_radix;
+
+	cq->cons_index = 0;
+	cq->arm_sn     = 1;
+	cq->uar        = uar;
+	atomic_set(&cq->refcount, 1);
+	init_completion(&cq->free);
+
+	cq->irq = priv->eq_table.eq[cq->vector].irq;
+	return 0;
+
+err_radix:
+	spin_lock_irq(&cq_table->lock);
+	radix_tree_delete(&cq_table->tree, cq->cqn);
+	spin_unlock_irq(&cq_table->lock);
+
+err_icm:
+	mlx4_cq_free_icm(dev, cq->cqn);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_cq_alloc);
+
+void mlx4_cq_free(struct mlx4_dev *dev, struct mlx4_cq *cq)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_cq_table *cq_table = &priv->cq_table;
+	int err;
+
+	err = mlx4_HW2SW_CQ(dev, NULL, cq->cqn);
+	if (err)
+		mlx4_warn(dev, "HW2SW_CQ failed (%d) for CQN %06x\n", err, cq->cqn);
+
+	synchronize_irq(priv->eq_table.eq[cq->vector].irq);
+
+	spin_lock_irq(&cq_table->lock);
+	radix_tree_delete(&cq_table->tree, cq->cqn);
+	spin_unlock_irq(&cq_table->lock);
+
+	if (atomic_dec_and_test(&cq->refcount))
+		complete(&cq->free);
+	wait_for_completion(&cq->free);
+
+	mlx4_cq_free_icm(dev, cq->cqn);
+}
+EXPORT_SYMBOL_GPL(mlx4_cq_free);
+
+int mlx4_init_cq_table(struct mlx4_dev *dev)
+{
+	struct mlx4_cq_table *cq_table = &mlx4_priv(dev)->cq_table;
+	int err;
+
+	spin_lock_init(&cq_table->lock);
+	INIT_RADIX_TREE(&cq_table->tree, GFP_ATOMIC);
+	if (mlx4_is_slave(dev))
+		return 0;
+
+	err = mlx4_bitmap_init(&cq_table->bitmap, dev->caps.num_cqs,
+			       dev->caps.num_cqs - 1, dev->caps.reserved_cqs, 0);
+	if (err)
+		return err;
+
+	return 0;
+}
+
+void mlx4_cleanup_cq_table(struct mlx4_dev *dev)
+{
+	if (mlx4_is_slave(dev))
+		return;
+	/* Nothing to do to clean up radix_tree */
+	mlx4_bitmap_cleanup(&mlx4_priv(dev)->cq_table.bitmap);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_clock.c b/drivers/net/ethernet/mellanox/mlx4/en_clock.c
new file mode 100644
index 0000000..57dda95
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/en_clock.c
@@ -0,0 +1,334 @@
+/*
+ * Copyright (c) 2012 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/mlx4/device.h>
+
+#include "mlx4_en.h"
+
+int mlx4_en_timestamp_config(struct net_device *dev, int tx_type, int rx_filter)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	int port_up = 0;
+	int err = 0;
+
+	if (priv->hwtstamp_config.tx_type == tx_type &&
+	    priv->hwtstamp_config.rx_filter == rx_filter)
+		return 0;
+
+	mutex_lock(&mdev->state_lock);
+	if (priv->port_up) {
+		port_up = 1;
+		mlx4_en_stop_port(dev, 1);
+	}
+
+	mlx4_en_free_resources(priv);
+
+	en_warn(priv, "Changing Time Stamp configuration\n");
+
+	priv->hwtstamp_config.tx_type = tx_type;
+	priv->hwtstamp_config.rx_filter = rx_filter;
+
+	if (rx_filter != HWTSTAMP_FILTER_NONE)
+		dev->features &= ~NETIF_F_HW_VLAN_CTAG_RX;
+	else
+		dev->features |= NETIF_F_HW_VLAN_CTAG_RX;
+
+	err = mlx4_en_alloc_resources(priv);
+	if (err) {
+		en_err(priv, "Failed reallocating port resources\n");
+		goto out;
+	}
+	if (port_up) {
+		err = mlx4_en_start_port(dev);
+		if (err)
+			en_err(priv, "Failed starting port\n");
+	}
+
+out:
+	mutex_unlock(&mdev->state_lock);
+	netdev_features_change(dev);
+	return err;
+}
+
+/* mlx4_en_read_clock - read raw cycle counter (to be used by time counter)
+ */
+static cycle_t mlx4_en_read_clock(const struct cyclecounter *tc)
+{
+	struct mlx4_en_dev *mdev =
+		container_of(tc, struct mlx4_en_dev, cycles);
+	struct mlx4_dev *dev = mdev->dev;
+
+	return mlx4_read_clock(dev) & tc->mask;
+}
+
+u64 mlx4_en_get_cqe_ts(struct mlx4_cqe *cqe)
+{
+	u64 hi, lo;
+	struct mlx4_ts_cqe *ts_cqe = (struct mlx4_ts_cqe *)cqe;
+
+	lo = (u64)be16_to_cpu(ts_cqe->timestamp_lo);
+	hi = ((u64)be32_to_cpu(ts_cqe->timestamp_hi) + !lo) << 16;
+
+	return hi | lo;
+}
+
+void mlx4_en_fill_hwtstamps(struct mlx4_en_dev *mdev,
+			    struct skb_shared_hwtstamps *hwts,
+			    u64 timestamp)
+{
+	unsigned long flags;
+	u64 nsec;
+
+	read_lock_irqsave(&mdev->clock_lock, flags);
+	nsec = timecounter_cyc2time(&mdev->clock, timestamp);
+	read_unlock_irqrestore(&mdev->clock_lock, flags);
+
+	memset(hwts, 0, sizeof(struct skb_shared_hwtstamps));
+	hwts->hwtstamp = ns_to_ktime(nsec);
+}
+
+/**
+ * mlx4_en_remove_timestamp - disable PTP device
+ * @mdev: board private structure
+ *
+ * Stop the PTP support.
+ **/
+void mlx4_en_remove_timestamp(struct mlx4_en_dev *mdev)
+{
+	if (mdev->ptp_clock) {
+		ptp_clock_unregister(mdev->ptp_clock);
+		mdev->ptp_clock = NULL;
+		mlx4_info(mdev, "removed PHC\n");
+	}
+}
+
+void mlx4_en_ptp_overflow_check(struct mlx4_en_dev *mdev)
+{
+	bool timeout = time_is_before_jiffies(mdev->last_overflow_check +
+					      mdev->overflow_period);
+	unsigned long flags;
+
+	if (timeout) {
+		write_lock_irqsave(&mdev->clock_lock, flags);
+		timecounter_read(&mdev->clock);
+		write_unlock_irqrestore(&mdev->clock_lock, flags);
+		mdev->last_overflow_check = jiffies;
+	}
+}
+
+/**
+ * mlx4_en_phc_adjfreq - adjust the frequency of the hardware clock
+ * @ptp: ptp clock structure
+ * @delta: Desired frequency change in parts per billion
+ *
+ * Adjust the frequency of the PHC cycle counter by the indicated delta from
+ * the base frequency.
+ **/
+static int mlx4_en_phc_adjfreq(struct ptp_clock_info *ptp, s32 delta)
+{
+	u64 adj;
+	u32 diff, mult;
+	int neg_adj = 0;
+	unsigned long flags;
+	struct mlx4_en_dev *mdev = container_of(ptp, struct mlx4_en_dev,
+						ptp_clock_info);
+
+	if (delta < 0) {
+		neg_adj = 1;
+		delta = -delta;
+	}
+	mult = mdev->nominal_c_mult;
+	adj = mult;
+	adj *= delta;
+	diff = div_u64(adj, 1000000000ULL);
+
+	write_lock_irqsave(&mdev->clock_lock, flags);
+	timecounter_read(&mdev->clock);
+	mdev->cycles.mult = neg_adj ? mult - diff : mult + diff;
+	write_unlock_irqrestore(&mdev->clock_lock, flags);
+
+	return 0;
+}
+
+/**
+ * mlx4_en_phc_adjtime - Shift the time of the hardware clock
+ * @ptp: ptp clock structure
+ * @delta: Desired change in nanoseconds
+ *
+ * Adjust the timer by resetting the timecounter structure.
+ **/
+static int mlx4_en_phc_adjtime(struct ptp_clock_info *ptp, s64 delta)
+{
+	struct mlx4_en_dev *mdev = container_of(ptp, struct mlx4_en_dev,
+						ptp_clock_info);
+	unsigned long flags;
+	s64 now;
+
+	write_lock_irqsave(&mdev->clock_lock, flags);
+	now = timecounter_read(&mdev->clock);
+	now += delta;
+	timecounter_init(&mdev->clock, &mdev->cycles, now);
+	write_unlock_irqrestore(&mdev->clock_lock, flags);
+
+	return 0;
+}
+
+/**
+ * mlx4_en_phc_gettime - Reads the current time from the hardware clock
+ * @ptp: ptp clock structure
+ * @ts: timespec structure to hold the current time value
+ *
+ * Read the timecounter and return the correct value in ns after converting
+ * it into a struct timespec.
+ **/
+static int mlx4_en_phc_gettime(struct ptp_clock_info *ptp, struct timespec *ts)
+{
+	struct mlx4_en_dev *mdev = container_of(ptp, struct mlx4_en_dev,
+						ptp_clock_info);
+	unsigned long flags;
+	u32 remainder;
+	u64 ns;
+
+	write_lock_irqsave(&mdev->clock_lock, flags);
+	ns = timecounter_read(&mdev->clock);
+	write_unlock_irqrestore(&mdev->clock_lock, flags);
+
+	ts->tv_sec = div_u64_rem(ns, NSEC_PER_SEC, &remainder);
+	ts->tv_nsec = remainder;
+
+	return 0;
+}
+
+/**
+ * mlx4_en_phc_settime - Set the current time on the hardware clock
+ * @ptp: ptp clock structure
+ * @ts: timespec containing the new time for the cycle counter
+ *
+ * Reset the timecounter to use a new base value instead of the kernel
+ * wall timer value.
+ **/
+static int mlx4_en_phc_settime(struct ptp_clock_info *ptp,
+			       const struct timespec *ts)
+{
+	struct mlx4_en_dev *mdev = container_of(ptp, struct mlx4_en_dev,
+						ptp_clock_info);
+	u64 ns = timespec_to_ns(ts);
+	unsigned long flags;
+
+	/* reset the timecounter */
+	write_lock_irqsave(&mdev->clock_lock, flags);
+	timecounter_init(&mdev->clock, &mdev->cycles, ns);
+	write_unlock_irqrestore(&mdev->clock_lock, flags);
+
+	return 0;
+}
+
+/**
+ * mlx4_en_phc_enable - enable or disable an ancillary feature
+ * @ptp: ptp clock structure
+ * @request: Desired resource to enable or disable
+ * @on: Caller passes one to enable or zero to disable
+ *
+ * Enable (or disable) ancillary features of the PHC subsystem.
+ * Currently, no ancillary features are supported.
+ **/
+static int mlx4_en_phc_enable(struct ptp_clock_info __always_unused *ptp,
+			      struct ptp_clock_request __always_unused *request,
+			      int __always_unused on)
+{
+	return -EOPNOTSUPP;
+}
+
+static const struct ptp_clock_info mlx4_en_ptp_clock_info = {
+	.owner		= THIS_MODULE,
+	.max_adj	= 100000000,
+	.n_alarm	= 0,
+	.n_ext_ts	= 0,
+	.n_per_out	= 0,
+	.n_pins		= 0,
+	.pps		= 0,
+	.adjfreq	= mlx4_en_phc_adjfreq,
+	.adjtime	= mlx4_en_phc_adjtime,
+	.gettime	= mlx4_en_phc_gettime,
+	.settime	= mlx4_en_phc_settime,
+	.enable		= mlx4_en_phc_enable,
+};
+
+void mlx4_en_init_timestamp(struct mlx4_en_dev *mdev)
+{
+	struct mlx4_dev *dev = mdev->dev;
+	unsigned long flags;
+	u64 ns;
+
+	rwlock_init(&mdev->clock_lock);
+
+	memset(&mdev->cycles, 0, sizeof(mdev->cycles));
+	mdev->cycles.read = mlx4_en_read_clock;
+	mdev->cycles.mask = CLOCKSOURCE_MASK(48);
+	/* Using shift to make calculation more accurate. Since current HW
+	 * clock frequency is 427 MHz, and cycles are given using a 48 bits
+	 * register, the biggest shift when calculating using u64, is 14
+	 * (max_cycles * multiplier < 2^64)
+	 */
+	mdev->cycles.shift = 14;
+	mdev->cycles.mult =
+		clocksource_khz2mult(1000 * dev->caps.hca_core_clock, mdev->cycles.shift);
+	mdev->nominal_c_mult = mdev->cycles.mult;
+
+	write_lock_irqsave(&mdev->clock_lock, flags);
+	timecounter_init(&mdev->clock, &mdev->cycles,
+			 ktime_to_ns(ktime_get_real()));
+	write_unlock_irqrestore(&mdev->clock_lock, flags);
+
+	/* Calculate period in seconds to call the overflow watchdog - to make
+	 * sure counter is checked at least once every wrap around.
+	 */
+	ns = cyclecounter_cyc2ns(&mdev->cycles, mdev->cycles.mask);
+	do_div(ns, NSEC_PER_SEC / 2 / HZ);
+	mdev->overflow_period = ns;
+
+	/* Configure the PHC */
+	mdev->ptp_clock_info = mlx4_en_ptp_clock_info;
+	snprintf(mdev->ptp_clock_info.name, 16, "mlx4 ptp");
+
+	mdev->ptp_clock = ptp_clock_register(&mdev->ptp_clock_info,
+					     &mdev->pdev->dev);
+	if (IS_ERR(mdev->ptp_clock)) {
+		mdev->ptp_clock = NULL;
+		mlx4_err(mdev, "ptp_clock_register failed\n");
+	} else {
+		mlx4_info(mdev, "registered PHC clock\n");
+	}
+
+}
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_cq.c b/drivers/net/ethernet/mellanox/mlx4/en_cq.c
new file mode 100644
index 0000000..82322b1
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/en_cq.c
@@ -0,0 +1,232 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/mlx4/cq.h>
+#include <linux/mlx4/qp.h>
+#include <linux/mlx4/cmd.h>
+
+#include "mlx4_en.h"
+
+static void mlx4_en_cq_event(struct mlx4_cq *cq, enum mlx4_event event)
+{
+	return;
+}
+
+
+int mlx4_en_create_cq(struct mlx4_en_priv *priv,
+		      struct mlx4_en_cq **pcq,
+		      int entries, int ring, enum cq_type mode,
+		      int node)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_en_cq *cq;
+	int err;
+
+	cq = kzalloc_node(sizeof(*cq), GFP_KERNEL, node);
+	if (!cq) {
+		cq = kzalloc(sizeof(*cq), GFP_KERNEL);
+		if (!cq) {
+			en_err(priv, "Failed to allocate CQ structure\n");
+			return -ENOMEM;
+		}
+	}
+
+	cq->size = entries;
+	cq->buf_size = cq->size * mdev->dev->caps.cqe_size;
+
+	cq->ring = ring;
+	cq->is_tx = mode;
+
+	/* Allocate HW buffers on provided NUMA node.
+	 * dev->numa_node is used in mtt range allocation flow.
+	 */
+	set_dev_node(&mdev->dev->pdev->dev, node);
+	err = mlx4_alloc_hwq_res(mdev->dev, &cq->wqres,
+				cq->buf_size, 2 * PAGE_SIZE);
+	set_dev_node(&mdev->dev->pdev->dev, mdev->dev->numa_node);
+	if (err)
+		goto err_cq;
+
+	err = mlx4_en_map_buffer(&cq->wqres.buf);
+	if (err)
+		goto err_res;
+
+	cq->buf = (struct mlx4_cqe *)cq->wqres.buf.direct.buf;
+	*pcq = cq;
+
+	return 0;
+
+err_res:
+	mlx4_free_hwq_res(mdev->dev, &cq->wqres, cq->buf_size);
+err_cq:
+	kfree(cq);
+	*pcq = NULL;
+	return err;
+}
+
+int mlx4_en_activate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq,
+			int cq_idx)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+	int err = 0;
+	char name[25];
+	int timestamp_en = 0;
+	struct cpu_rmap *rmap =
+#ifdef CONFIG_RFS_ACCEL
+		priv->dev->rx_cpu_rmap;
+#else
+		NULL;
+#endif
+
+	cq->dev = mdev->pndev[priv->port];
+	cq->mcq.set_ci_db  = cq->wqres.db.db;
+	cq->mcq.arm_db     = cq->wqres.db.db + 1;
+	*cq->mcq.set_ci_db = 0;
+	*cq->mcq.arm_db    = 0;
+	memset(cq->buf, 0, cq->buf_size);
+
+	if (cq->is_tx == RX) {
+		if (mdev->dev->caps.comp_pool) {
+			if (!cq->vector) {
+				sprintf(name, "%s-%d", priv->dev->name,
+					cq->ring);
+				/* Set IRQ for specific name (per ring) */
+				if (mlx4_assign_eq(mdev->dev, name, rmap,
+						   &cq->vector)) {
+					cq->vector = (cq->ring + 1 + priv->port)
+					    % mdev->dev->caps.num_comp_vectors;
+					mlx4_warn(mdev, "Failed assigning an EQ to %s, falling back to legacy EQ's\n",
+						  name);
+				}
+
+			}
+		} else {
+			cq->vector = (cq->ring + 1 + priv->port) %
+				mdev->dev->caps.num_comp_vectors;
+		}
+
+		cq->irq_desc =
+			irq_to_desc(mlx4_eq_get_irq(mdev->dev,
+						    cq->vector));
+	} else {
+		/* For TX we use the same irq per
+		ring we assigned for the RX    */
+		struct mlx4_en_cq *rx_cq;
+
+		cq_idx = cq_idx % priv->rx_ring_num;
+		rx_cq = priv->rx_cq[cq_idx];
+		cq->vector = rx_cq->vector;
+	}
+
+	if (!cq->is_tx)
+		cq->size = priv->rx_ring[cq->ring]->actual_size;
+
+	if ((cq->is_tx && priv->hwtstamp_config.tx_type) ||
+	    (!cq->is_tx && priv->hwtstamp_config.rx_filter))
+		timestamp_en = 1;
+
+	err = mlx4_cq_alloc(mdev->dev, cq->size, &cq->wqres.mtt,
+			    &mdev->priv_uar, cq->wqres.db.dma, &cq->mcq,
+			    cq->vector, 0, timestamp_en);
+	if (err)
+		return err;
+
+	cq->mcq.comp  = cq->is_tx ? mlx4_en_tx_irq : mlx4_en_rx_irq;
+	cq->mcq.event = mlx4_en_cq_event;
+
+	if (cq->is_tx) {
+		netif_napi_add(cq->dev, &cq->napi, mlx4_en_poll_tx_cq,
+			       NAPI_POLL_WEIGHT);
+	} else {
+		struct mlx4_en_rx_ring *ring = priv->rx_ring[cq->ring];
+
+		err = irq_set_affinity_hint(cq->mcq.irq,
+					    ring->affinity_mask);
+		if (err)
+			mlx4_warn(mdev, "Failed setting affinity hint\n");
+
+		netif_napi_add(cq->dev, &cq->napi, mlx4_en_poll_rx_cq, 64);
+		napi_hash_add(&cq->napi);
+	}
+
+	napi_enable(&cq->napi);
+
+	return 0;
+}
+
+void mlx4_en_destroy_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq **pcq)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_en_cq *cq = *pcq;
+
+	mlx4_en_unmap_buffer(&cq->wqres.buf);
+	mlx4_free_hwq_res(mdev->dev, &cq->wqres, cq->buf_size);
+	if (priv->mdev->dev->caps.comp_pool && cq->vector) {
+		mlx4_release_eq(priv->mdev->dev, cq->vector);
+	}
+	cq->vector = 0;
+	cq->buf_size = 0;
+	cq->buf = NULL;
+	kfree(cq);
+	*pcq = NULL;
+}
+
+void mlx4_en_deactivate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq)
+{
+	napi_disable(&cq->napi);
+	if (!cq->is_tx) {
+		napi_hash_del(&cq->napi);
+		synchronize_rcu();
+		irq_set_affinity_hint(cq->mcq.irq, NULL);
+	}
+	netif_napi_del(&cq->napi);
+
+	mlx4_cq_free(priv->mdev->dev, &cq->mcq);
+}
+
+/* Set rx cq moderation parameters */
+int mlx4_en_set_cq_moder(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq)
+{
+	return mlx4_cq_modify(priv->mdev->dev, &cq->mcq,
+			      cq->moder_cnt, cq->moder_time);
+}
+
+int mlx4_en_arm_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq)
+{
+	mlx4_cq_arm(&cq->mcq, MLX4_CQ_DB_REQ_NOT, priv->mdev->uar_map,
+		    &priv->mdev->uar_lock);
+
+	return 0;
+}
+
+
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_dcb_nl.c b/drivers/net/ethernet/mellanox/mlx4/en_dcb_nl.c
new file mode 100644
index 0000000..c95ca25
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/en_dcb_nl.c
@@ -0,0 +1,263 @@
+/*
+ * Copyright (c) 2011 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/dcbnl.h>
+#include <linux/math64.h>
+
+#include "mlx4_en.h"
+
+static int mlx4_en_dcbnl_ieee_getets(struct net_device *dev,
+				   struct ieee_ets *ets)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct ieee_ets *my_ets = &priv->ets;
+
+	/* No IEEE PFC settings available */
+	if (!my_ets)
+		return -EINVAL;
+
+	ets->ets_cap = IEEE_8021QAZ_MAX_TCS;
+	ets->cbs = my_ets->cbs;
+	memcpy(ets->tc_tx_bw, my_ets->tc_tx_bw, sizeof(ets->tc_tx_bw));
+	memcpy(ets->tc_tsa, my_ets->tc_tsa, sizeof(ets->tc_tsa));
+	memcpy(ets->prio_tc, my_ets->prio_tc, sizeof(ets->prio_tc));
+
+	return 0;
+}
+
+static int mlx4_en_ets_validate(struct mlx4_en_priv *priv, struct ieee_ets *ets)
+{
+	int i;
+	int total_ets_bw = 0;
+	int has_ets_tc = 0;
+
+	for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) {
+		if (ets->prio_tc[i] >= MLX4_EN_NUM_UP) {
+			en_err(priv, "Bad priority in UP <=> TC mapping. TC: %d, UP: %d\n",
+					i, ets->prio_tc[i]);
+			return -EINVAL;
+		}
+
+		switch (ets->tc_tsa[i]) {
+		case IEEE_8021QAZ_TSA_STRICT:
+			break;
+		case IEEE_8021QAZ_TSA_ETS:
+			has_ets_tc = 1;
+			total_ets_bw += ets->tc_tx_bw[i];
+			break;
+		default:
+			en_err(priv, "TC[%d]: Not supported TSA: %d\n",
+					i, ets->tc_tsa[i]);
+			return -ENOTSUPP;
+		}
+	}
+
+	if (has_ets_tc && total_ets_bw != MLX4_EN_BW_MAX) {
+		en_err(priv, "Bad ETS BW sum: %d. Should be exactly 100%%\n",
+				total_ets_bw);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int mlx4_en_config_port_scheduler(struct mlx4_en_priv *priv,
+		struct ieee_ets *ets, u16 *ratelimit)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+	int num_strict = 0;
+	int i;
+	__u8 tc_tx_bw[IEEE_8021QAZ_MAX_TCS] = { 0 };
+	__u8 pg[IEEE_8021QAZ_MAX_TCS] = { 0 };
+
+	ets = ets ?: &priv->ets;
+	ratelimit = ratelimit ?: priv->maxrate;
+
+	/* higher TC means higher priority => lower pg */
+	for (i = IEEE_8021QAZ_MAX_TCS - 1; i >= 0; i--) {
+		switch (ets->tc_tsa[i]) {
+		case IEEE_8021QAZ_TSA_STRICT:
+			pg[i] = num_strict++;
+			tc_tx_bw[i] = MLX4_EN_BW_MAX;
+			break;
+		case IEEE_8021QAZ_TSA_ETS:
+			pg[i] = MLX4_EN_TC_ETS;
+			tc_tx_bw[i] = ets->tc_tx_bw[i] ?: MLX4_EN_BW_MIN;
+			break;
+		}
+	}
+
+	return mlx4_SET_PORT_SCHEDULER(mdev->dev, priv->port, tc_tx_bw, pg,
+			ratelimit);
+}
+
+static int
+mlx4_en_dcbnl_ieee_setets(struct net_device *dev, struct ieee_ets *ets)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	int err;
+
+	err = mlx4_en_ets_validate(priv, ets);
+	if (err)
+		return err;
+
+	err = mlx4_SET_PORT_PRIO2TC(mdev->dev, priv->port, ets->prio_tc);
+	if (err)
+		return err;
+
+	err = mlx4_en_config_port_scheduler(priv, ets, NULL);
+	if (err)
+		return err;
+
+	memcpy(&priv->ets, ets, sizeof(priv->ets));
+
+	return 0;
+}
+
+static int mlx4_en_dcbnl_ieee_getpfc(struct net_device *dev,
+		struct ieee_pfc *pfc)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+
+	pfc->pfc_cap = IEEE_8021QAZ_MAX_TCS;
+	pfc->pfc_en = priv->prof->tx_ppp;
+
+	return 0;
+}
+
+static int mlx4_en_dcbnl_ieee_setpfc(struct net_device *dev,
+		struct ieee_pfc *pfc)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_port_profile *prof = priv->prof;
+	struct mlx4_en_dev *mdev = priv->mdev;
+	int err;
+
+	en_dbg(DRV, priv, "cap: 0x%x en: 0x%x mbc: 0x%x delay: %d\n",
+			pfc->pfc_cap,
+			pfc->pfc_en,
+			pfc->mbc,
+			pfc->delay);
+
+	prof->rx_pause = !pfc->pfc_en;
+	prof->tx_pause = !pfc->pfc_en;
+	prof->rx_ppp = pfc->pfc_en;
+	prof->tx_ppp = pfc->pfc_en;
+
+	err = mlx4_SET_PORT_general(mdev->dev, priv->port,
+				    priv->rx_skb_size + ETH_FCS_LEN,
+				    prof->tx_pause,
+				    prof->tx_ppp,
+				    prof->rx_pause,
+				    prof->rx_ppp);
+	if (err)
+		en_err(priv, "Failed setting pause params\n");
+
+	return err;
+}
+
+static u8 mlx4_en_dcbnl_getdcbx(struct net_device *dev)
+{
+	return DCB_CAP_DCBX_HOST | DCB_CAP_DCBX_VER_IEEE;
+}
+
+static u8 mlx4_en_dcbnl_setdcbx(struct net_device *dev, u8 mode)
+{
+	if ((mode & DCB_CAP_DCBX_LLD_MANAGED) ||
+	    (mode & DCB_CAP_DCBX_VER_CEE) ||
+	    !(mode & DCB_CAP_DCBX_VER_IEEE) ||
+	    !(mode & DCB_CAP_DCBX_HOST))
+		return 1;
+
+	return 0;
+}
+
+#define MLX4_RATELIMIT_UNITS_IN_KB 100000 /* rate-limit HW unit in Kbps */
+static int mlx4_en_dcbnl_ieee_getmaxrate(struct net_device *dev,
+				   struct ieee_maxrate *maxrate)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	int i;
+
+	for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++)
+		maxrate->tc_maxrate[i] =
+			priv->maxrate[i] * MLX4_RATELIMIT_UNITS_IN_KB;
+
+	return 0;
+}
+
+static int mlx4_en_dcbnl_ieee_setmaxrate(struct net_device *dev,
+		struct ieee_maxrate *maxrate)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	u16 tmp[IEEE_8021QAZ_MAX_TCS];
+	int i, err;
+
+	for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) {
+		/* Convert from Kbps into HW units, rounding result up.
+		 * Setting to 0, means unlimited BW.
+		 */
+		tmp[i] = div_u64(maxrate->tc_maxrate[i] +
+				 MLX4_RATELIMIT_UNITS_IN_KB - 1,
+				 MLX4_RATELIMIT_UNITS_IN_KB);
+	}
+
+	err = mlx4_en_config_port_scheduler(priv, NULL, tmp);
+	if (err)
+		return err;
+
+	memcpy(priv->maxrate, tmp, sizeof(priv->maxrate));
+
+	return 0;
+}
+
+const struct dcbnl_rtnl_ops mlx4_en_dcbnl_ops = {
+	.ieee_getets	= mlx4_en_dcbnl_ieee_getets,
+	.ieee_setets	= mlx4_en_dcbnl_ieee_setets,
+	.ieee_getmaxrate = mlx4_en_dcbnl_ieee_getmaxrate,
+	.ieee_setmaxrate = mlx4_en_dcbnl_ieee_setmaxrate,
+	.ieee_getpfc	= mlx4_en_dcbnl_ieee_getpfc,
+	.ieee_setpfc	= mlx4_en_dcbnl_ieee_setpfc,
+
+	.getdcbx	= mlx4_en_dcbnl_getdcbx,
+	.setdcbx	= mlx4_en_dcbnl_setdcbx,
+};
+
+const struct dcbnl_rtnl_ops mlx4_en_dcbnl_pfc_ops = {
+	.ieee_getpfc	= mlx4_en_dcbnl_ieee_getpfc,
+	.ieee_setpfc	= mlx4_en_dcbnl_ieee_setpfc,
+
+	.getdcbx	= mlx4_en_dcbnl_getdcbx,
+	.setdcbx	= mlx4_en_dcbnl_setdcbx,
+};
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
new file mode 100644
index 0000000..ae83da9
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
@@ -0,0 +1,1349 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/ethtool.h>
+#include <linux/netdevice.h>
+#include <linux/mlx4/driver.h>
+#include <linux/in.h>
+#include <net/ip.h>
+
+#include "mlx4_en.h"
+#include "en_port.h"
+
+#define EN_ETHTOOL_QP_ATTACH (1ull << 63)
+#define EN_ETHTOOL_SHORT_MASK cpu_to_be16(0xffff)
+#define EN_ETHTOOL_WORD_MASK  cpu_to_be32(0xffffffff)
+
+static int mlx4_en_moderation_update(struct mlx4_en_priv *priv)
+{
+	int i;
+	int err = 0;
+
+	for (i = 0; i < priv->tx_ring_num; i++) {
+		priv->tx_cq[i]->moder_cnt = priv->tx_frames;
+		priv->tx_cq[i]->moder_time = priv->tx_usecs;
+		if (priv->port_up) {
+			err = mlx4_en_set_cq_moder(priv, priv->tx_cq[i]);
+			if (err)
+				return err;
+		}
+	}
+
+	if (priv->adaptive_rx_coal)
+		return 0;
+
+	for (i = 0; i < priv->rx_ring_num; i++) {
+		priv->rx_cq[i]->moder_cnt = priv->rx_frames;
+		priv->rx_cq[i]->moder_time = priv->rx_usecs;
+		priv->last_moder_time[i] = MLX4_EN_AUTO_CONF;
+		if (priv->port_up) {
+			err = mlx4_en_set_cq_moder(priv, priv->rx_cq[i]);
+			if (err)
+				return err;
+		}
+	}
+
+	return err;
+}
+
+static void
+mlx4_en_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *drvinfo)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+
+	strlcpy(drvinfo->driver, DRV_NAME, sizeof(drvinfo->driver));
+	strlcpy(drvinfo->version, DRV_VERSION " (" DRV_RELDATE ")",
+		sizeof(drvinfo->version));
+	snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version),
+		"%d.%d.%d",
+		(u16) (mdev->dev->caps.fw_ver >> 32),
+		(u16) ((mdev->dev->caps.fw_ver >> 16) & 0xffff),
+		(u16) (mdev->dev->caps.fw_ver & 0xffff));
+	strlcpy(drvinfo->bus_info, pci_name(mdev->dev->pdev),
+		sizeof(drvinfo->bus_info));
+	drvinfo->n_stats = 0;
+	drvinfo->regdump_len = 0;
+	drvinfo->eedump_len = 0;
+}
+
+static const char mlx4_en_priv_flags[][ETH_GSTRING_LEN] = {
+	"blueflame",
+};
+
+static const char main_strings[][ETH_GSTRING_LEN] = {
+	"rx_packets", "tx_packets", "rx_bytes", "tx_bytes", "rx_errors",
+	"tx_errors", "rx_dropped", "tx_dropped", "multicast", "collisions",
+	"rx_length_errors", "rx_over_errors", "rx_crc_errors",
+	"rx_frame_errors", "rx_fifo_errors", "rx_missed_errors",
+	"tx_aborted_errors", "tx_carrier_errors", "tx_fifo_errors",
+	"tx_heartbeat_errors", "tx_window_errors",
+
+	/* port statistics */
+	"tso_packets",
+	"xmit_more",
+	"queue_stopped", "wake_queue", "tx_timeout", "rx_alloc_failed",
+	"rx_csum_good", "rx_csum_none", "tx_chksum_offload",
+
+	/* packet statistics */
+	"broadcast", "rx_prio_0", "rx_prio_1", "rx_prio_2", "rx_prio_3",
+	"rx_prio_4", "rx_prio_5", "rx_prio_6", "rx_prio_7", "tx_prio_0",
+	"tx_prio_1", "tx_prio_2", "tx_prio_3", "tx_prio_4", "tx_prio_5",
+	"tx_prio_6", "tx_prio_7",
+};
+#define NUM_MAIN_STATS	21
+#define NUM_ALL_STATS	(NUM_MAIN_STATS + NUM_PORT_STATS + NUM_PKT_STATS + NUM_PERF_STATS)
+
+static const char mlx4_en_test_names[][ETH_GSTRING_LEN]= {
+	"Interrupt Test",
+	"Link Test",
+	"Speed Test",
+	"Register Test",
+	"Loopback Test",
+};
+
+static u32 mlx4_en_get_msglevel(struct net_device *dev)
+{
+	return ((struct mlx4_en_priv *) netdev_priv(dev))->msg_enable;
+}
+
+static void mlx4_en_set_msglevel(struct net_device *dev, u32 val)
+{
+	((struct mlx4_en_priv *) netdev_priv(dev))->msg_enable = val;
+}
+
+static void mlx4_en_get_wol(struct net_device *netdev,
+			    struct ethtool_wolinfo *wol)
+{
+	struct mlx4_en_priv *priv = netdev_priv(netdev);
+	int err = 0;
+	u64 config = 0;
+	u64 mask;
+
+	if ((priv->port < 1) || (priv->port > 2)) {
+		en_err(priv, "Failed to get WoL information\n");
+		return;
+	}
+
+	mask = (priv->port == 1) ? MLX4_DEV_CAP_FLAG_WOL_PORT1 :
+		MLX4_DEV_CAP_FLAG_WOL_PORT2;
+
+	if (!(priv->mdev->dev->caps.flags & mask)) {
+		wol->supported = 0;
+		wol->wolopts = 0;
+		return;
+	}
+
+	err = mlx4_wol_read(priv->mdev->dev, &config, priv->port);
+	if (err) {
+		en_err(priv, "Failed to get WoL information\n");
+		return;
+	}
+
+	if (config & MLX4_EN_WOL_MAGIC)
+		wol->supported = WAKE_MAGIC;
+	else
+		wol->supported = 0;
+
+	if (config & MLX4_EN_WOL_ENABLED)
+		wol->wolopts = WAKE_MAGIC;
+	else
+		wol->wolopts = 0;
+}
+
+static int mlx4_en_set_wol(struct net_device *netdev,
+			    struct ethtool_wolinfo *wol)
+{
+	struct mlx4_en_priv *priv = netdev_priv(netdev);
+	u64 config = 0;
+	int err = 0;
+	u64 mask;
+
+	if ((priv->port < 1) || (priv->port > 2))
+		return -EOPNOTSUPP;
+
+	mask = (priv->port == 1) ? MLX4_DEV_CAP_FLAG_WOL_PORT1 :
+		MLX4_DEV_CAP_FLAG_WOL_PORT2;
+
+	if (!(priv->mdev->dev->caps.flags & mask))
+		return -EOPNOTSUPP;
+
+	if (wol->supported & ~WAKE_MAGIC)
+		return -EINVAL;
+
+	err = mlx4_wol_read(priv->mdev->dev, &config, priv->port);
+	if (err) {
+		en_err(priv, "Failed to get WoL info, unable to modify\n");
+		return err;
+	}
+
+	if (wol->wolopts & WAKE_MAGIC) {
+		config |= MLX4_EN_WOL_DO_MODIFY | MLX4_EN_WOL_ENABLED |
+				MLX4_EN_WOL_MAGIC;
+	} else {
+		config &= ~(MLX4_EN_WOL_ENABLED | MLX4_EN_WOL_MAGIC);
+		config |= MLX4_EN_WOL_DO_MODIFY;
+	}
+
+	err = mlx4_wol_write(priv->mdev->dev, config, priv->port);
+	if (err)
+		en_err(priv, "Failed to set WoL information\n");
+
+	return err;
+}
+
+static int mlx4_en_get_sset_count(struct net_device *dev, int sset)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	int bit_count = hweight64(priv->stats_bitmap);
+
+	switch (sset) {
+	case ETH_SS_STATS:
+		return (priv->stats_bitmap ? bit_count : NUM_ALL_STATS) +
+			(priv->tx_ring_num * 2) +
+#ifdef CONFIG_NET_RX_BUSY_POLL
+			(priv->rx_ring_num * 5);
+#else
+			(priv->rx_ring_num * 2);
+#endif
+	case ETH_SS_TEST:
+		return MLX4_EN_NUM_SELF_TEST - !(priv->mdev->dev->caps.flags
+					& MLX4_DEV_CAP_FLAG_UC_LOOPBACK) * 2;
+	case ETH_SS_PRIV_FLAGS:
+		return ARRAY_SIZE(mlx4_en_priv_flags);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static void mlx4_en_get_ethtool_stats(struct net_device *dev,
+		struct ethtool_stats *stats, uint64_t *data)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	int index = 0;
+	int i, j = 0;
+
+	spin_lock_bh(&priv->stats_lock);
+
+	if (!(priv->stats_bitmap)) {
+		for (i = 0; i < NUM_MAIN_STATS; i++)
+			data[index++] =
+				((unsigned long *) &priv->stats)[i];
+		for (i = 0; i < NUM_PORT_STATS; i++)
+			data[index++] =
+				((unsigned long *) &priv->port_stats)[i];
+		for (i = 0; i < NUM_PKT_STATS; i++)
+			data[index++] =
+				((unsigned long *) &priv->pkstats)[i];
+	} else {
+		for (i = 0; i < NUM_MAIN_STATS; i++) {
+			if ((priv->stats_bitmap >> j) & 1)
+				data[index++] =
+				((unsigned long *) &priv->stats)[i];
+			j++;
+		}
+		for (i = 0; i < NUM_PORT_STATS; i++) {
+			if ((priv->stats_bitmap >> j) & 1)
+				data[index++] =
+				((unsigned long *) &priv->port_stats)[i];
+			j++;
+		}
+	}
+	for (i = 0; i < priv->tx_ring_num; i++) {
+		data[index++] = priv->tx_ring[i]->packets;
+		data[index++] = priv->tx_ring[i]->bytes;
+	}
+	for (i = 0; i < priv->rx_ring_num; i++) {
+		data[index++] = priv->rx_ring[i]->packets;
+		data[index++] = priv->rx_ring[i]->bytes;
+#ifdef CONFIG_NET_RX_BUSY_POLL
+		data[index++] = priv->rx_ring[i]->yields;
+		data[index++] = priv->rx_ring[i]->misses;
+		data[index++] = priv->rx_ring[i]->cleaned;
+#endif
+	}
+	spin_unlock_bh(&priv->stats_lock);
+
+}
+
+static void mlx4_en_self_test(struct net_device *dev,
+			      struct ethtool_test *etest, u64 *buf)
+{
+	mlx4_en_ex_selftest(dev, &etest->flags, buf);
+}
+
+static void mlx4_en_get_strings(struct net_device *dev,
+				uint32_t stringset, uint8_t *data)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	int index = 0;
+	int i;
+
+	switch (stringset) {
+	case ETH_SS_TEST:
+		for (i = 0; i < MLX4_EN_NUM_SELF_TEST - 2; i++)
+			strcpy(data + i * ETH_GSTRING_LEN, mlx4_en_test_names[i]);
+		if (priv->mdev->dev->caps.flags & MLX4_DEV_CAP_FLAG_UC_LOOPBACK)
+			for (; i < MLX4_EN_NUM_SELF_TEST; i++)
+				strcpy(data + i * ETH_GSTRING_LEN, mlx4_en_test_names[i]);
+		break;
+
+	case ETH_SS_STATS:
+		/* Add main counters */
+		if (!priv->stats_bitmap) {
+			for (i = 0; i < NUM_MAIN_STATS; i++)
+				strcpy(data + (index++) * ETH_GSTRING_LEN,
+					main_strings[i]);
+			for (i = 0; i < NUM_PORT_STATS; i++)
+				strcpy(data + (index++) * ETH_GSTRING_LEN,
+					main_strings[i +
+					NUM_MAIN_STATS]);
+			for (i = 0; i < NUM_PKT_STATS; i++)
+				strcpy(data + (index++) * ETH_GSTRING_LEN,
+					main_strings[i +
+					NUM_MAIN_STATS +
+					NUM_PORT_STATS]);
+		} else
+			for (i = 0; i < NUM_MAIN_STATS + NUM_PORT_STATS; i++) {
+				if ((priv->stats_bitmap >> i) & 1) {
+					strcpy(data +
+					       (index++) * ETH_GSTRING_LEN,
+					       main_strings[i]);
+				}
+				if (!(priv->stats_bitmap >> i))
+					break;
+			}
+		for (i = 0; i < priv->tx_ring_num; i++) {
+			sprintf(data + (index++) * ETH_GSTRING_LEN,
+				"tx%d_packets", i);
+			sprintf(data + (index++) * ETH_GSTRING_LEN,
+				"tx%d_bytes", i);
+		}
+		for (i = 0; i < priv->rx_ring_num; i++) {
+			sprintf(data + (index++) * ETH_GSTRING_LEN,
+				"rx%d_packets", i);
+			sprintf(data + (index++) * ETH_GSTRING_LEN,
+				"rx%d_bytes", i);
+#ifdef CONFIG_NET_RX_BUSY_POLL
+			sprintf(data + (index++) * ETH_GSTRING_LEN,
+				"rx%d_napi_yield", i);
+			sprintf(data + (index++) * ETH_GSTRING_LEN,
+				"rx%d_misses", i);
+			sprintf(data + (index++) * ETH_GSTRING_LEN,
+				"rx%d_cleaned", i);
+#endif
+		}
+		break;
+	case ETH_SS_PRIV_FLAGS:
+		for (i = 0; i < ARRAY_SIZE(mlx4_en_priv_flags); i++)
+			strcpy(data + i * ETH_GSTRING_LEN,
+			       mlx4_en_priv_flags[i]);
+		break;
+
+	}
+}
+
+static int mlx4_en_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	int trans_type;
+
+	cmd->autoneg = AUTONEG_DISABLE;
+	cmd->supported = SUPPORTED_10000baseT_Full;
+	cmd->advertising = ADVERTISED_10000baseT_Full;
+
+	if (mlx4_en_QUERY_PORT(priv->mdev, priv->port))
+		return -ENOMEM;
+
+	trans_type = priv->port_state.transciver;
+	if (netif_carrier_ok(dev)) {
+		ethtool_cmd_speed_set(cmd, priv->port_state.link_speed);
+		cmd->duplex = DUPLEX_FULL;
+	} else {
+		ethtool_cmd_speed_set(cmd, SPEED_UNKNOWN);
+		cmd->duplex = DUPLEX_UNKNOWN;
+	}
+
+	if (trans_type > 0 && trans_type <= 0xC) {
+		cmd->port = PORT_FIBRE;
+		cmd->transceiver = XCVR_EXTERNAL;
+		cmd->supported |= SUPPORTED_FIBRE;
+		cmd->advertising |= ADVERTISED_FIBRE;
+	} else if (trans_type == 0x80 || trans_type == 0) {
+		cmd->port = PORT_TP;
+		cmd->transceiver = XCVR_INTERNAL;
+		cmd->supported |= SUPPORTED_TP;
+		cmd->advertising |= ADVERTISED_TP;
+	} else  {
+		cmd->port = -1;
+		cmd->transceiver = -1;
+	}
+	return 0;
+}
+
+static int mlx4_en_set_settings(struct net_device *dev, struct ethtool_cmd *cmd)
+{
+	if ((cmd->autoneg == AUTONEG_ENABLE) ||
+	    (ethtool_cmd_speed(cmd) != SPEED_10000) ||
+	    (cmd->duplex != DUPLEX_FULL))
+		return -EINVAL;
+
+	/* Nothing to change */
+	return 0;
+}
+
+static int mlx4_en_get_coalesce(struct net_device *dev,
+			      struct ethtool_coalesce *coal)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+
+	coal->tx_coalesce_usecs = priv->tx_usecs;
+	coal->tx_max_coalesced_frames = priv->tx_frames;
+	coal->tx_max_coalesced_frames_irq = priv->tx_work_limit;
+
+	coal->rx_coalesce_usecs = priv->rx_usecs;
+	coal->rx_max_coalesced_frames = priv->rx_frames;
+
+	coal->pkt_rate_low = priv->pkt_rate_low;
+	coal->rx_coalesce_usecs_low = priv->rx_usecs_low;
+	coal->pkt_rate_high = priv->pkt_rate_high;
+	coal->rx_coalesce_usecs_high = priv->rx_usecs_high;
+	coal->rate_sample_interval = priv->sample_interval;
+	coal->use_adaptive_rx_coalesce = priv->adaptive_rx_coal;
+
+	return 0;
+}
+
+static int mlx4_en_set_coalesce(struct net_device *dev,
+			      struct ethtool_coalesce *coal)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+
+	if (!coal->tx_max_coalesced_frames_irq)
+		return -EINVAL;
+
+	priv->rx_frames = (coal->rx_max_coalesced_frames ==
+			   MLX4_EN_AUTO_CONF) ?
+				MLX4_EN_RX_COAL_TARGET :
+				coal->rx_max_coalesced_frames;
+	priv->rx_usecs = (coal->rx_coalesce_usecs ==
+			  MLX4_EN_AUTO_CONF) ?
+				MLX4_EN_RX_COAL_TIME :
+				coal->rx_coalesce_usecs;
+
+	/* Setting TX coalescing parameters */
+	if (coal->tx_coalesce_usecs != priv->tx_usecs ||
+	    coal->tx_max_coalesced_frames != priv->tx_frames) {
+		priv->tx_usecs = coal->tx_coalesce_usecs;
+		priv->tx_frames = coal->tx_max_coalesced_frames;
+	}
+
+	/* Set adaptive coalescing params */
+	priv->pkt_rate_low = coal->pkt_rate_low;
+	priv->rx_usecs_low = coal->rx_coalesce_usecs_low;
+	priv->pkt_rate_high = coal->pkt_rate_high;
+	priv->rx_usecs_high = coal->rx_coalesce_usecs_high;
+	priv->sample_interval = coal->rate_sample_interval;
+	priv->adaptive_rx_coal = coal->use_adaptive_rx_coalesce;
+	priv->tx_work_limit = coal->tx_max_coalesced_frames_irq;
+
+	return mlx4_en_moderation_update(priv);
+}
+
+static int mlx4_en_set_pauseparam(struct net_device *dev,
+				struct ethtool_pauseparam *pause)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	int err;
+
+	if (pause->autoneg)
+		return -EINVAL;
+
+	priv->prof->tx_pause = pause->tx_pause != 0;
+	priv->prof->rx_pause = pause->rx_pause != 0;
+	err = mlx4_SET_PORT_general(mdev->dev, priv->port,
+				    priv->rx_skb_size + ETH_FCS_LEN,
+				    priv->prof->tx_pause,
+				    priv->prof->tx_ppp,
+				    priv->prof->rx_pause,
+				    priv->prof->rx_ppp);
+	if (err)
+		en_err(priv, "Failed setting pause params\n");
+
+	return err;
+}
+
+static void mlx4_en_get_pauseparam(struct net_device *dev,
+				 struct ethtool_pauseparam *pause)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+
+	pause->tx_pause = priv->prof->tx_pause;
+	pause->rx_pause = priv->prof->rx_pause;
+}
+
+static int mlx4_en_set_ringparam(struct net_device *dev,
+				 struct ethtool_ringparam *param)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	u32 rx_size, tx_size;
+	int port_up = 0;
+	int err = 0;
+
+	if (param->rx_jumbo_pending || param->rx_mini_pending)
+		return -EINVAL;
+
+	rx_size = roundup_pow_of_two(param->rx_pending);
+	rx_size = max_t(u32, rx_size, MLX4_EN_MIN_RX_SIZE);
+	rx_size = min_t(u32, rx_size, MLX4_EN_MAX_RX_SIZE);
+	tx_size = roundup_pow_of_two(param->tx_pending);
+	tx_size = max_t(u32, tx_size, MLX4_EN_MIN_TX_SIZE);
+	tx_size = min_t(u32, tx_size, MLX4_EN_MAX_TX_SIZE);
+
+	if (rx_size == (priv->port_up ? priv->rx_ring[0]->actual_size :
+					priv->rx_ring[0]->size) &&
+	    tx_size == priv->tx_ring[0]->size)
+		return 0;
+
+	mutex_lock(&mdev->state_lock);
+	if (priv->port_up) {
+		port_up = 1;
+		mlx4_en_stop_port(dev, 1);
+	}
+
+	mlx4_en_free_resources(priv);
+
+	priv->prof->tx_ring_size = tx_size;
+	priv->prof->rx_ring_size = rx_size;
+
+	err = mlx4_en_alloc_resources(priv);
+	if (err) {
+		en_err(priv, "Failed reallocating port resources\n");
+		goto out;
+	}
+	if (port_up) {
+		err = mlx4_en_start_port(dev);
+		if (err)
+			en_err(priv, "Failed starting port\n");
+	}
+
+	err = mlx4_en_moderation_update(priv);
+
+out:
+	mutex_unlock(&mdev->state_lock);
+	return err;
+}
+
+static void mlx4_en_get_ringparam(struct net_device *dev,
+				  struct ethtool_ringparam *param)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+
+	memset(param, 0, sizeof(*param));
+	param->rx_max_pending = MLX4_EN_MAX_RX_SIZE;
+	param->tx_max_pending = MLX4_EN_MAX_TX_SIZE;
+	param->rx_pending = priv->port_up ?
+		priv->rx_ring[0]->actual_size : priv->rx_ring[0]->size;
+	param->tx_pending = priv->tx_ring[0]->size;
+}
+
+static u32 mlx4_en_get_rxfh_indir_size(struct net_device *dev)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+
+	return priv->rx_ring_num;
+}
+
+static int mlx4_en_get_rxfh(struct net_device *dev, u32 *ring_index, u8 *key)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_rss_map *rss_map = &priv->rss_map;
+	int rss_rings;
+	size_t n = priv->rx_ring_num;
+	int err = 0;
+
+	rss_rings = priv->prof->rss_rings ?: priv->rx_ring_num;
+
+	while (n--) {
+		ring_index[n] = rss_map->qps[n % rss_rings].qpn -
+			rss_map->base_qpn;
+	}
+
+	return err;
+}
+
+static int mlx4_en_set_rxfh(struct net_device *dev, const u32 *ring_index,
+			    const u8 *key)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	int port_up = 0;
+	int err = 0;
+	int i;
+	int rss_rings = 0;
+
+	/* Calculate RSS table size and make sure flows are spread evenly
+	 * between rings
+	 */
+	for (i = 0; i < priv->rx_ring_num; i++) {
+		if (i > 0 && !ring_index[i] && !rss_rings)
+			rss_rings = i;
+
+		if (ring_index[i] != (i % (rss_rings ?: priv->rx_ring_num)))
+			return -EINVAL;
+	}
+
+	if (!rss_rings)
+		rss_rings = priv->rx_ring_num;
+
+	/* RSS table size must be an order of 2 */
+	if (!is_power_of_2(rss_rings))
+		return -EINVAL;
+
+	mutex_lock(&mdev->state_lock);
+	if (priv->port_up) {
+		port_up = 1;
+		mlx4_en_stop_port(dev, 1);
+	}
+
+	priv->prof->rss_rings = rss_rings;
+
+	if (port_up) {
+		err = mlx4_en_start_port(dev);
+		if (err)
+			en_err(priv, "Failed starting port\n");
+	}
+
+	mutex_unlock(&mdev->state_lock);
+	return err;
+}
+
+#define all_zeros_or_all_ones(field)		\
+	((field) == 0 || (field) == (__force typeof(field))-1)
+
+static int mlx4_en_validate_flow(struct net_device *dev,
+				 struct ethtool_rxnfc *cmd)
+{
+	struct ethtool_usrip4_spec *l3_mask;
+	struct ethtool_tcpip4_spec *l4_mask;
+	struct ethhdr *eth_mask;
+
+	if (cmd->fs.location >= MAX_NUM_OF_FS_RULES)
+		return -EINVAL;
+
+	if (cmd->fs.flow_type & FLOW_MAC_EXT) {
+		/* dest mac mask must be ff:ff:ff:ff:ff:ff */
+		if (!is_broadcast_ether_addr(cmd->fs.m_ext.h_dest))
+			return -EINVAL;
+	}
+
+	switch (cmd->fs.flow_type & ~(FLOW_EXT | FLOW_MAC_EXT)) {
+	case TCP_V4_FLOW:
+	case UDP_V4_FLOW:
+		if (cmd->fs.m_u.tcp_ip4_spec.tos)
+			return -EINVAL;
+		l4_mask = &cmd->fs.m_u.tcp_ip4_spec;
+		/* don't allow mask which isn't all 0 or 1 */
+		if (!all_zeros_or_all_ones(l4_mask->ip4src) ||
+		    !all_zeros_or_all_ones(l4_mask->ip4dst) ||
+		    !all_zeros_or_all_ones(l4_mask->psrc) ||
+		    !all_zeros_or_all_ones(l4_mask->pdst))
+			return -EINVAL;
+		break;
+	case IP_USER_FLOW:
+		l3_mask = &cmd->fs.m_u.usr_ip4_spec;
+		if (l3_mask->l4_4_bytes || l3_mask->tos || l3_mask->proto ||
+		    cmd->fs.h_u.usr_ip4_spec.ip_ver != ETH_RX_NFC_IP4 ||
+		    (!l3_mask->ip4src && !l3_mask->ip4dst) ||
+		    !all_zeros_or_all_ones(l3_mask->ip4src) ||
+		    !all_zeros_or_all_ones(l3_mask->ip4dst))
+			return -EINVAL;
+		break;
+	case ETHER_FLOW:
+		eth_mask = &cmd->fs.m_u.ether_spec;
+		/* source mac mask must not be set */
+		if (!is_zero_ether_addr(eth_mask->h_source))
+			return -EINVAL;
+
+		/* dest mac mask must be ff:ff:ff:ff:ff:ff */
+		if (!is_broadcast_ether_addr(eth_mask->h_dest))
+			return -EINVAL;
+
+		if (!all_zeros_or_all_ones(eth_mask->h_proto))
+			return -EINVAL;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if ((cmd->fs.flow_type & FLOW_EXT)) {
+		if (cmd->fs.m_ext.vlan_etype ||
+		    !((cmd->fs.m_ext.vlan_tci & cpu_to_be16(VLAN_VID_MASK)) ==
+		      0 ||
+		      (cmd->fs.m_ext.vlan_tci & cpu_to_be16(VLAN_VID_MASK)) ==
+		      cpu_to_be16(VLAN_VID_MASK)))
+			return -EINVAL;
+
+		if (cmd->fs.m_ext.vlan_tci) {
+			if (be16_to_cpu(cmd->fs.h_ext.vlan_tci) >= VLAN_N_VID)
+				return -EINVAL;
+
+		}
+	}
+
+	return 0;
+}
+
+static int mlx4_en_ethtool_add_mac_rule(struct ethtool_rxnfc *cmd,
+					struct list_head *rule_list_h,
+					struct mlx4_spec_list *spec_l2,
+					unsigned char *mac)
+{
+	int err = 0;
+	__be64 mac_msk = cpu_to_be64(MLX4_MAC_MASK << 16);
+
+	spec_l2->id = MLX4_NET_TRANS_RULE_ID_ETH;
+	memcpy(spec_l2->eth.dst_mac_msk, &mac_msk, ETH_ALEN);
+	memcpy(spec_l2->eth.dst_mac, mac, ETH_ALEN);
+
+	if ((cmd->fs.flow_type & FLOW_EXT) &&
+	    (cmd->fs.m_ext.vlan_tci & cpu_to_be16(VLAN_VID_MASK))) {
+		spec_l2->eth.vlan_id = cmd->fs.h_ext.vlan_tci;
+		spec_l2->eth.vlan_id_msk = cpu_to_be16(VLAN_VID_MASK);
+	}
+
+	list_add_tail(&spec_l2->list, rule_list_h);
+
+	return err;
+}
+
+static int mlx4_en_ethtool_add_mac_rule_by_ipv4(struct mlx4_en_priv *priv,
+						struct ethtool_rxnfc *cmd,
+						struct list_head *rule_list_h,
+						struct mlx4_spec_list *spec_l2,
+						__be32 ipv4_dst)
+{
+#ifdef CONFIG_INET
+	unsigned char mac[ETH_ALEN];
+
+	if (!ipv4_is_multicast(ipv4_dst)) {
+		if (cmd->fs.flow_type & FLOW_MAC_EXT)
+			memcpy(&mac, cmd->fs.h_ext.h_dest, ETH_ALEN);
+		else
+			memcpy(&mac, priv->dev->dev_addr, ETH_ALEN);
+	} else {
+		ip_eth_mc_map(ipv4_dst, mac);
+	}
+
+	return mlx4_en_ethtool_add_mac_rule(cmd, rule_list_h, spec_l2, &mac[0]);
+#else
+	return -EINVAL;
+#endif
+}
+
+static int add_ip_rule(struct mlx4_en_priv *priv,
+		       struct ethtool_rxnfc *cmd,
+		       struct list_head *list_h)
+{
+	int err;
+	struct mlx4_spec_list *spec_l2 = NULL;
+	struct mlx4_spec_list *spec_l3 = NULL;
+	struct ethtool_usrip4_spec *l3_mask = &cmd->fs.m_u.usr_ip4_spec;
+
+	spec_l3 = kzalloc(sizeof(*spec_l3), GFP_KERNEL);
+	spec_l2 = kzalloc(sizeof(*spec_l2), GFP_KERNEL);
+	if (!spec_l2 || !spec_l3) {
+		err = -ENOMEM;
+		goto free_spec;
+	}
+
+	err = mlx4_en_ethtool_add_mac_rule_by_ipv4(priv, cmd, list_h, spec_l2,
+						   cmd->fs.h_u.
+						   usr_ip4_spec.ip4dst);
+	if (err)
+		goto free_spec;
+	spec_l3->id = MLX4_NET_TRANS_RULE_ID_IPV4;
+	spec_l3->ipv4.src_ip = cmd->fs.h_u.usr_ip4_spec.ip4src;
+	if (l3_mask->ip4src)
+		spec_l3->ipv4.src_ip_msk = EN_ETHTOOL_WORD_MASK;
+	spec_l3->ipv4.dst_ip = cmd->fs.h_u.usr_ip4_spec.ip4dst;
+	if (l3_mask->ip4dst)
+		spec_l3->ipv4.dst_ip_msk = EN_ETHTOOL_WORD_MASK;
+	list_add_tail(&spec_l3->list, list_h);
+
+	return 0;
+
+free_spec:
+	kfree(spec_l2);
+	kfree(spec_l3);
+	return err;
+}
+
+static int add_tcp_udp_rule(struct mlx4_en_priv *priv,
+			     struct ethtool_rxnfc *cmd,
+			     struct list_head *list_h, int proto)
+{
+	int err;
+	struct mlx4_spec_list *spec_l2 = NULL;
+	struct mlx4_spec_list *spec_l3 = NULL;
+	struct mlx4_spec_list *spec_l4 = NULL;
+	struct ethtool_tcpip4_spec *l4_mask = &cmd->fs.m_u.tcp_ip4_spec;
+
+	spec_l2 = kzalloc(sizeof(*spec_l2), GFP_KERNEL);
+	spec_l3 = kzalloc(sizeof(*spec_l3), GFP_KERNEL);
+	spec_l4 = kzalloc(sizeof(*spec_l4), GFP_KERNEL);
+	if (!spec_l2 || !spec_l3 || !spec_l4) {
+		err = -ENOMEM;
+		goto free_spec;
+	}
+
+	spec_l3->id = MLX4_NET_TRANS_RULE_ID_IPV4;
+
+	if (proto == TCP_V4_FLOW) {
+		err = mlx4_en_ethtool_add_mac_rule_by_ipv4(priv, cmd, list_h,
+							   spec_l2,
+							   cmd->fs.h_u.
+							   tcp_ip4_spec.ip4dst);
+		if (err)
+			goto free_spec;
+		spec_l4->id = MLX4_NET_TRANS_RULE_ID_TCP;
+		spec_l3->ipv4.src_ip = cmd->fs.h_u.tcp_ip4_spec.ip4src;
+		spec_l3->ipv4.dst_ip = cmd->fs.h_u.tcp_ip4_spec.ip4dst;
+		spec_l4->tcp_udp.src_port = cmd->fs.h_u.tcp_ip4_spec.psrc;
+		spec_l4->tcp_udp.dst_port = cmd->fs.h_u.tcp_ip4_spec.pdst;
+	} else {
+		err = mlx4_en_ethtool_add_mac_rule_by_ipv4(priv, cmd, list_h,
+							   spec_l2,
+							   cmd->fs.h_u.
+							   udp_ip4_spec.ip4dst);
+		if (err)
+			goto free_spec;
+		spec_l4->id = MLX4_NET_TRANS_RULE_ID_UDP;
+		spec_l3->ipv4.src_ip = cmd->fs.h_u.udp_ip4_spec.ip4src;
+		spec_l3->ipv4.dst_ip = cmd->fs.h_u.udp_ip4_spec.ip4dst;
+		spec_l4->tcp_udp.src_port = cmd->fs.h_u.udp_ip4_spec.psrc;
+		spec_l4->tcp_udp.dst_port = cmd->fs.h_u.udp_ip4_spec.pdst;
+	}
+
+	if (l4_mask->ip4src)
+		spec_l3->ipv4.src_ip_msk = EN_ETHTOOL_WORD_MASK;
+	if (l4_mask->ip4dst)
+		spec_l3->ipv4.dst_ip_msk = EN_ETHTOOL_WORD_MASK;
+
+	if (l4_mask->psrc)
+		spec_l4->tcp_udp.src_port_msk = EN_ETHTOOL_SHORT_MASK;
+	if (l4_mask->pdst)
+		spec_l4->tcp_udp.dst_port_msk = EN_ETHTOOL_SHORT_MASK;
+
+	list_add_tail(&spec_l3->list, list_h);
+	list_add_tail(&spec_l4->list, list_h);
+
+	return 0;
+
+free_spec:
+	kfree(spec_l2);
+	kfree(spec_l3);
+	kfree(spec_l4);
+	return err;
+}
+
+static int mlx4_en_ethtool_to_net_trans_rule(struct net_device *dev,
+					     struct ethtool_rxnfc *cmd,
+					     struct list_head *rule_list_h)
+{
+	int err;
+	struct ethhdr *eth_spec;
+	struct mlx4_spec_list *spec_l2;
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+
+	err = mlx4_en_validate_flow(dev, cmd);
+	if (err)
+		return err;
+
+	switch (cmd->fs.flow_type & ~(FLOW_EXT | FLOW_MAC_EXT)) {
+	case ETHER_FLOW:
+		spec_l2 = kzalloc(sizeof(*spec_l2), GFP_KERNEL);
+		if (!spec_l2)
+			return -ENOMEM;
+
+		eth_spec = &cmd->fs.h_u.ether_spec;
+		mlx4_en_ethtool_add_mac_rule(cmd, rule_list_h, spec_l2,
+					     &eth_spec->h_dest[0]);
+		spec_l2->eth.ether_type = eth_spec->h_proto;
+		if (eth_spec->h_proto)
+			spec_l2->eth.ether_type_enable = 1;
+		break;
+	case IP_USER_FLOW:
+		err = add_ip_rule(priv, cmd, rule_list_h);
+		break;
+	case TCP_V4_FLOW:
+		err = add_tcp_udp_rule(priv, cmd, rule_list_h, TCP_V4_FLOW);
+		break;
+	case UDP_V4_FLOW:
+		err = add_tcp_udp_rule(priv, cmd, rule_list_h, UDP_V4_FLOW);
+		break;
+	}
+
+	return err;
+}
+
+static int mlx4_en_flow_replace(struct net_device *dev,
+				struct ethtool_rxnfc *cmd)
+{
+	int err;
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct ethtool_flow_id *loc_rule;
+	struct mlx4_spec_list *spec, *tmp_spec;
+	u32 qpn;
+	u64 reg_id;
+
+	struct mlx4_net_trans_rule rule = {
+		.queue_mode = MLX4_NET_TRANS_Q_FIFO,
+		.exclusive = 0,
+		.allow_loopback = 1,
+		.promisc_mode = MLX4_FS_REGULAR,
+	};
+
+	rule.port = priv->port;
+	rule.priority = MLX4_DOMAIN_ETHTOOL | cmd->fs.location;
+	INIT_LIST_HEAD(&rule.list);
+
+	/* Allow direct QP attaches if the EN_ETHTOOL_QP_ATTACH flag is set */
+	if (cmd->fs.ring_cookie == RX_CLS_FLOW_DISC)
+		qpn = priv->drop_qp.qpn;
+	else if (cmd->fs.ring_cookie & EN_ETHTOOL_QP_ATTACH) {
+		qpn = cmd->fs.ring_cookie & (EN_ETHTOOL_QP_ATTACH - 1);
+	} else {
+		if (cmd->fs.ring_cookie >= priv->rx_ring_num) {
+			en_warn(priv, "rxnfc: RX ring (%llu) doesn't exist\n",
+				cmd->fs.ring_cookie);
+			return -EINVAL;
+		}
+		qpn = priv->rss_map.qps[cmd->fs.ring_cookie].qpn;
+		if (!qpn) {
+			en_warn(priv, "rxnfc: RX ring (%llu) is inactive\n",
+				cmd->fs.ring_cookie);
+			return -EINVAL;
+		}
+	}
+	rule.qpn = qpn;
+	err = mlx4_en_ethtool_to_net_trans_rule(dev, cmd, &rule.list);
+	if (err)
+		goto out_free_list;
+
+	loc_rule = &priv->ethtool_rules[cmd->fs.location];
+	if (loc_rule->id) {
+		err = mlx4_flow_detach(priv->mdev->dev, loc_rule->id);
+		if (err) {
+			en_err(priv, "Fail to detach network rule at location %d. registration id = %llx\n",
+			       cmd->fs.location, loc_rule->id);
+			goto out_free_list;
+		}
+		loc_rule->id = 0;
+		memset(&loc_rule->flow_spec, 0,
+		       sizeof(struct ethtool_rx_flow_spec));
+		list_del(&loc_rule->list);
+	}
+	err = mlx4_flow_attach(priv->mdev->dev, &rule, &reg_id);
+	if (err) {
+		en_err(priv, "Fail to attach network rule at location %d\n",
+		       cmd->fs.location);
+		goto out_free_list;
+	}
+	loc_rule->id = reg_id;
+	memcpy(&loc_rule->flow_spec, &cmd->fs,
+	       sizeof(struct ethtool_rx_flow_spec));
+	list_add_tail(&loc_rule->list, &priv->ethtool_list);
+
+out_free_list:
+	list_for_each_entry_safe(spec, tmp_spec, &rule.list, list) {
+		list_del(&spec->list);
+		kfree(spec);
+	}
+	return err;
+}
+
+static int mlx4_en_flow_detach(struct net_device *dev,
+			       struct ethtool_rxnfc *cmd)
+{
+	int err = 0;
+	struct ethtool_flow_id *rule;
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+
+	if (cmd->fs.location >= MAX_NUM_OF_FS_RULES)
+		return -EINVAL;
+
+	rule = &priv->ethtool_rules[cmd->fs.location];
+	if (!rule->id) {
+		err =  -ENOENT;
+		goto out;
+	}
+
+	err = mlx4_flow_detach(priv->mdev->dev, rule->id);
+	if (err) {
+		en_err(priv, "Fail to detach network rule at location %d. registration id = 0x%llx\n",
+		       cmd->fs.location, rule->id);
+		goto out;
+	}
+	rule->id = 0;
+	memset(&rule->flow_spec, 0, sizeof(struct ethtool_rx_flow_spec));
+	list_del(&rule->list);
+out:
+	return err;
+
+}
+
+static int mlx4_en_get_flow(struct net_device *dev, struct ethtool_rxnfc *cmd,
+			    int loc)
+{
+	int err = 0;
+	struct ethtool_flow_id *rule;
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+
+	if (loc < 0 || loc >= MAX_NUM_OF_FS_RULES)
+		return -EINVAL;
+
+	rule = &priv->ethtool_rules[loc];
+	if (rule->id)
+		memcpy(&cmd->fs, &rule->flow_spec,
+		       sizeof(struct ethtool_rx_flow_spec));
+	else
+		err = -ENOENT;
+
+	return err;
+}
+
+static int mlx4_en_get_num_flows(struct mlx4_en_priv *priv)
+{
+
+	int i, res = 0;
+	for (i = 0; i < MAX_NUM_OF_FS_RULES; i++) {
+		if (priv->ethtool_rules[i].id)
+			res++;
+	}
+	return res;
+
+}
+
+static int mlx4_en_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd,
+			     u32 *rule_locs)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	int err = 0;
+	int i = 0, priority = 0;
+
+	if ((cmd->cmd == ETHTOOL_GRXCLSRLCNT ||
+	     cmd->cmd == ETHTOOL_GRXCLSRULE ||
+	     cmd->cmd == ETHTOOL_GRXCLSRLALL) &&
+	    (mdev->dev->caps.steering_mode !=
+	     MLX4_STEERING_MODE_DEVICE_MANAGED || !priv->port_up))
+		return -EINVAL;
+
+	switch (cmd->cmd) {
+	case ETHTOOL_GRXRINGS:
+		cmd->data = priv->rx_ring_num;
+		break;
+	case ETHTOOL_GRXCLSRLCNT:
+		cmd->rule_cnt = mlx4_en_get_num_flows(priv);
+		break;
+	case ETHTOOL_GRXCLSRULE:
+		err = mlx4_en_get_flow(dev, cmd, cmd->fs.location);
+		break;
+	case ETHTOOL_GRXCLSRLALL:
+		while ((!err || err == -ENOENT) && priority < cmd->rule_cnt) {
+			err = mlx4_en_get_flow(dev, cmd, i);
+			if (!err)
+				rule_locs[priority++] = i;
+			i++;
+		}
+		err = 0;
+		break;
+	default:
+		err = -EOPNOTSUPP;
+		break;
+	}
+
+	return err;
+}
+
+static int mlx4_en_set_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd)
+{
+	int err = 0;
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+
+	if (mdev->dev->caps.steering_mode !=
+	    MLX4_STEERING_MODE_DEVICE_MANAGED || !priv->port_up)
+		return -EINVAL;
+
+	switch (cmd->cmd) {
+	case ETHTOOL_SRXCLSRLINS:
+		err = mlx4_en_flow_replace(dev, cmd);
+		break;
+	case ETHTOOL_SRXCLSRLDEL:
+		err = mlx4_en_flow_detach(dev, cmd);
+		break;
+	default:
+		en_warn(priv, "Unsupported ethtool command. (%d)\n", cmd->cmd);
+		return -EINVAL;
+	}
+
+	return err;
+}
+
+static void mlx4_en_get_channels(struct net_device *dev,
+				 struct ethtool_channels *channel)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+
+	memset(channel, 0, sizeof(*channel));
+
+	channel->max_rx = MAX_RX_RINGS;
+	channel->max_tx = MLX4_EN_MAX_TX_RING_P_UP;
+
+	channel->rx_count = priv->rx_ring_num;
+	channel->tx_count = priv->tx_ring_num / MLX4_EN_NUM_UP;
+}
+
+static int mlx4_en_set_channels(struct net_device *dev,
+				struct ethtool_channels *channel)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	int port_up = 0;
+	int err = 0;
+
+	if (channel->other_count || channel->combined_count ||
+	    channel->tx_count > MLX4_EN_MAX_TX_RING_P_UP ||
+	    channel->rx_count > MAX_RX_RINGS ||
+	    !channel->tx_count || !channel->rx_count)
+		return -EINVAL;
+
+	mutex_lock(&mdev->state_lock);
+	if (priv->port_up) {
+		port_up = 1;
+		mlx4_en_stop_port(dev, 1);
+	}
+
+	mlx4_en_free_resources(priv);
+
+	priv->num_tx_rings_p_up = channel->tx_count;
+	priv->tx_ring_num = channel->tx_count * MLX4_EN_NUM_UP;
+	priv->rx_ring_num = channel->rx_count;
+
+	err = mlx4_en_alloc_resources(priv);
+	if (err) {
+		en_err(priv, "Failed reallocating port resources\n");
+		goto out;
+	}
+
+	netif_set_real_num_tx_queues(dev, priv->tx_ring_num);
+	netif_set_real_num_rx_queues(dev, priv->rx_ring_num);
+
+	if (dev->num_tc)
+		mlx4_en_setup_tc(dev, MLX4_EN_NUM_UP);
+
+	en_warn(priv, "Using %d TX rings\n", priv->tx_ring_num);
+	en_warn(priv, "Using %d RX rings\n", priv->rx_ring_num);
+
+	if (port_up) {
+		err = mlx4_en_start_port(dev);
+		if (err)
+			en_err(priv, "Failed starting port\n");
+	}
+
+	err = mlx4_en_moderation_update(priv);
+
+out:
+	mutex_unlock(&mdev->state_lock);
+	return err;
+}
+
+static int mlx4_en_get_ts_info(struct net_device *dev,
+			       struct ethtool_ts_info *info)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	int ret;
+
+	ret = ethtool_op_get_ts_info(dev, info);
+	if (ret)
+		return ret;
+
+	if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_TS) {
+		info->so_timestamping |=
+			SOF_TIMESTAMPING_TX_HARDWARE |
+			SOF_TIMESTAMPING_RX_HARDWARE |
+			SOF_TIMESTAMPING_RAW_HARDWARE;
+
+		info->tx_types =
+			(1 << HWTSTAMP_TX_OFF) |
+			(1 << HWTSTAMP_TX_ON);
+
+		info->rx_filters =
+			(1 << HWTSTAMP_FILTER_NONE) |
+			(1 << HWTSTAMP_FILTER_ALL);
+
+		if (mdev->ptp_clock)
+			info->phc_index = ptp_clock_index(mdev->ptp_clock);
+	}
+
+	return ret;
+}
+
+static int mlx4_en_set_priv_flags(struct net_device *dev, u32 flags)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	bool bf_enabled_new = !!(flags & MLX4_EN_PRIV_FLAGS_BLUEFLAME);
+	bool bf_enabled_old = !!(priv->pflags & MLX4_EN_PRIV_FLAGS_BLUEFLAME);
+	int i;
+
+	if (bf_enabled_new == bf_enabled_old)
+		return 0; /* Nothing to do */
+
+	if (bf_enabled_new) {
+		bool bf_supported = true;
+
+		for (i = 0; i < priv->tx_ring_num; i++)
+			bf_supported &= priv->tx_ring[i]->bf_alloced;
+
+		if (!bf_supported) {
+			en_err(priv, "BlueFlame is not supported\n");
+			return -EINVAL;
+		}
+
+		priv->pflags |= MLX4_EN_PRIV_FLAGS_BLUEFLAME;
+	} else {
+		priv->pflags &= ~MLX4_EN_PRIV_FLAGS_BLUEFLAME;
+	}
+
+	for (i = 0; i < priv->tx_ring_num; i++)
+		priv->tx_ring[i]->bf_enabled = bf_enabled_new;
+
+	en_info(priv, "BlueFlame %s\n",
+		bf_enabled_new ?  "Enabled" : "Disabled");
+
+	return 0;
+}
+
+static u32 mlx4_en_get_priv_flags(struct net_device *dev)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+
+	return priv->pflags;
+}
+
+static int mlx4_en_get_tunable(struct net_device *dev,
+			       const struct ethtool_tunable *tuna,
+			       void *data)
+{
+	const struct mlx4_en_priv *priv = netdev_priv(dev);
+	int ret = 0;
+
+	switch (tuna->id) {
+	case ETHTOOL_TX_COPYBREAK:
+		*(u32 *)data = priv->prof->inline_thold;
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	return ret;
+}
+
+static int mlx4_en_set_tunable(struct net_device *dev,
+			       const struct ethtool_tunable *tuna,
+			       const void *data)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	int val, ret = 0;
+
+	switch (tuna->id) {
+	case ETHTOOL_TX_COPYBREAK:
+		val = *(u32 *)data;
+		if (val < MIN_PKT_LEN || val > MAX_INLINE)
+			ret = -EINVAL;
+		else
+			priv->prof->inline_thold = val;
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	return ret;
+}
+
+
+const struct ethtool_ops mlx4_en_ethtool_ops = {
+	.get_drvinfo = mlx4_en_get_drvinfo,
+	.get_settings = mlx4_en_get_settings,
+	.set_settings = mlx4_en_set_settings,
+	.get_link = ethtool_op_get_link,
+	.get_strings = mlx4_en_get_strings,
+	.get_sset_count = mlx4_en_get_sset_count,
+	.get_ethtool_stats = mlx4_en_get_ethtool_stats,
+	.self_test = mlx4_en_self_test,
+	.get_wol = mlx4_en_get_wol,
+	.set_wol = mlx4_en_set_wol,
+	.get_msglevel = mlx4_en_get_msglevel,
+	.set_msglevel = mlx4_en_set_msglevel,
+	.get_coalesce = mlx4_en_get_coalesce,
+	.set_coalesce = mlx4_en_set_coalesce,
+	.get_pauseparam = mlx4_en_get_pauseparam,
+	.set_pauseparam = mlx4_en_set_pauseparam,
+	.get_ringparam = mlx4_en_get_ringparam,
+	.set_ringparam = mlx4_en_set_ringparam,
+	.get_rxnfc = mlx4_en_get_rxnfc,
+	.set_rxnfc = mlx4_en_set_rxnfc,
+	.get_rxfh_indir_size = mlx4_en_get_rxfh_indir_size,
+	.get_rxfh = mlx4_en_get_rxfh,
+	.set_rxfh = mlx4_en_set_rxfh,
+	.get_channels = mlx4_en_get_channels,
+	.set_channels = mlx4_en_set_channels,
+	.get_ts_info = mlx4_en_get_ts_info,
+	.set_priv_flags = mlx4_en_set_priv_flags,
+	.get_priv_flags = mlx4_en_get_priv_flags,
+	.get_tunable		= mlx4_en_get_tunable,
+	.set_tunable		= mlx4_en_set_tunable,
+};
+
+
+
+
+
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_main.c b/drivers/net/ethernet/mellanox/mlx4/en_main.c
new file mode 100644
index 0000000..2091ae8
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/en_main.c
@@ -0,0 +1,368 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/cpumask.h>
+#include <linux/module.h>
+#include <linux/delay.h>
+#include <linux/netdevice.h>
+#include <linux/slab.h>
+
+#include <linux/mlx4/driver.h>
+#include <linux/mlx4/device.h>
+#include <linux/mlx4/cmd.h>
+
+#include "mlx4_en.h"
+
+MODULE_AUTHOR("Liran Liss, Yevgeny Petrilin");
+MODULE_DESCRIPTION("Mellanox ConnectX HCA Ethernet driver");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_VERSION(DRV_VERSION " ("DRV_RELDATE")");
+
+static const char mlx4_en_version[] =
+	DRV_NAME ": Mellanox ConnectX HCA Ethernet driver v"
+	DRV_VERSION " (" DRV_RELDATE ")\n";
+
+#define MLX4_EN_PARM_INT(X, def_val, desc) \
+	static unsigned int X = def_val;\
+	module_param(X , uint, 0444); \
+	MODULE_PARM_DESC(X, desc);
+
+
+/*
+ * Device scope module parameters
+ */
+
+/* Enable RSS UDP traffic */
+MLX4_EN_PARM_INT(udp_rss, 1,
+		 "Enable RSS for incoming UDP traffic or disabled (0)");
+
+/* Priority pausing */
+MLX4_EN_PARM_INT(pfctx, 0, "Priority based Flow Control policy on TX[7:0]."
+			   " Per priority bit mask");
+MLX4_EN_PARM_INT(pfcrx, 0, "Priority based Flow Control policy on RX[7:0]."
+			   " Per priority bit mask");
+
+MLX4_EN_PARM_INT(inline_thold, MAX_INLINE,
+		 "Threshold for using inline data (range: 17-104, default: 104)");
+
+#define MAX_PFC_TX     0xff
+#define MAX_PFC_RX     0xff
+
+void en_print(const char *level, const struct mlx4_en_priv *priv,
+	      const char *format, ...)
+{
+	va_list args;
+	struct va_format vaf;
+
+	va_start(args, format);
+
+	vaf.fmt = format;
+	vaf.va = &args;
+	if (priv->registered)
+		printk("%s%s: %s: %pV",
+		       level, DRV_NAME, priv->dev->name, &vaf);
+	else
+		printk("%s%s: %s: Port %d: %pV",
+		       level, DRV_NAME, dev_name(&priv->mdev->pdev->dev),
+		       priv->port, &vaf);
+	va_end(args);
+}
+
+void mlx4_en_update_loopback_state(struct net_device *dev,
+				   netdev_features_t features)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+
+	priv->flags &= ~(MLX4_EN_FLAG_RX_FILTER_NEEDED|
+			MLX4_EN_FLAG_ENABLE_HW_LOOPBACK);
+
+	/* Drop the packet if SRIOV is not enabled
+	 * and not performing the selftest or flb disabled
+	 */
+	if (mlx4_is_mfunc(priv->mdev->dev) &&
+	    !(features & NETIF_F_LOOPBACK) && !priv->validate_loopback)
+		priv->flags |= MLX4_EN_FLAG_RX_FILTER_NEEDED;
+
+	/* Set dmac in Tx WQE if we are in SRIOV mode or if loopback selftest
+	 * is requested
+	 */
+	if (mlx4_is_mfunc(priv->mdev->dev) || priv->validate_loopback)
+		priv->flags |= MLX4_EN_FLAG_ENABLE_HW_LOOPBACK;
+}
+
+static int mlx4_en_get_profile(struct mlx4_en_dev *mdev)
+{
+	struct mlx4_en_profile *params = &mdev->profile;
+	int i;
+
+	params->udp_rss = udp_rss;
+	params->num_tx_rings_p_up = mlx4_low_memory_profile() ?
+		MLX4_EN_MIN_TX_RING_P_UP :
+		min_t(int, num_online_cpus(), MLX4_EN_MAX_TX_RING_P_UP);
+
+	if (params->udp_rss && !(mdev->dev->caps.flags
+					& MLX4_DEV_CAP_FLAG_UDP_RSS)) {
+		mlx4_warn(mdev, "UDP RSS is not supported on this device\n");
+		params->udp_rss = 0;
+	}
+	for (i = 1; i <= MLX4_MAX_PORTS; i++) {
+		params->prof[i].rx_pause = 1;
+		params->prof[i].rx_ppp = pfcrx;
+		params->prof[i].tx_pause = 1;
+		params->prof[i].tx_ppp = pfctx;
+		params->prof[i].tx_ring_size = MLX4_EN_DEF_TX_RING_SIZE;
+		params->prof[i].rx_ring_size = MLX4_EN_DEF_RX_RING_SIZE;
+		params->prof[i].tx_ring_num = params->num_tx_rings_p_up *
+			MLX4_EN_NUM_UP;
+		params->prof[i].rss_rings = 0;
+		params->prof[i].inline_thold = inline_thold;
+	}
+
+	return 0;
+}
+
+static void *mlx4_en_get_netdev(struct mlx4_dev *dev, void *ctx, u8 port)
+{
+	struct mlx4_en_dev *endev = ctx;
+
+	return endev->pndev[port];
+}
+
+static void mlx4_en_event(struct mlx4_dev *dev, void *endev_ptr,
+			  enum mlx4_dev_event event, unsigned long port)
+{
+	struct mlx4_en_dev *mdev = (struct mlx4_en_dev *) endev_ptr;
+	struct mlx4_en_priv *priv;
+
+	switch (event) {
+	case MLX4_DEV_EVENT_PORT_UP:
+	case MLX4_DEV_EVENT_PORT_DOWN:
+		if (!mdev->pndev[port])
+			return;
+		priv = netdev_priv(mdev->pndev[port]);
+		/* To prevent races, we poll the link state in a separate
+		  task rather than changing it here */
+		priv->link_state = event;
+		queue_work(mdev->workqueue, &priv->linkstate_task);
+		break;
+
+	case MLX4_DEV_EVENT_CATASTROPHIC_ERROR:
+		mlx4_err(mdev, "Internal error detected, restarting device\n");
+		break;
+
+	case MLX4_DEV_EVENT_SLAVE_INIT:
+	case MLX4_DEV_EVENT_SLAVE_SHUTDOWN:
+		break;
+	default:
+		if (port < 1 || port > dev->caps.num_ports ||
+		    !mdev->pndev[port])
+			return;
+		mlx4_warn(mdev, "Unhandled event %d for port %d\n", event,
+			  (int) port);
+	}
+}
+
+static void mlx4_en_remove(struct mlx4_dev *dev, void *endev_ptr)
+{
+	struct mlx4_en_dev *mdev = endev_ptr;
+	int i;
+
+	mutex_lock(&mdev->state_lock);
+	mdev->device_up = false;
+	mutex_unlock(&mdev->state_lock);
+
+	mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH)
+		if (mdev->pndev[i])
+			mlx4_en_destroy_netdev(mdev->pndev[i]);
+
+	if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_TS)
+		mlx4_en_remove_timestamp(mdev);
+
+	flush_workqueue(mdev->workqueue);
+	destroy_workqueue(mdev->workqueue);
+	(void) mlx4_mr_free(dev, &mdev->mr);
+	iounmap(mdev->uar_map);
+	mlx4_uar_free(dev, &mdev->priv_uar);
+	mlx4_pd_free(dev, mdev->priv_pdn);
+	kfree(mdev);
+}
+
+static void *mlx4_en_add(struct mlx4_dev *dev)
+{
+	struct mlx4_en_dev *mdev;
+	int i;
+	int err;
+
+	printk_once(KERN_INFO "%s", mlx4_en_version);
+
+	mdev = kzalloc(sizeof(*mdev), GFP_KERNEL);
+	if (!mdev) {
+		err = -ENOMEM;
+		goto err_free_res;
+	}
+
+	if (mlx4_pd_alloc(dev, &mdev->priv_pdn))
+		goto err_free_dev;
+
+	if (mlx4_uar_alloc(dev, &mdev->priv_uar))
+		goto err_pd;
+
+	mdev->uar_map = ioremap((phys_addr_t) mdev->priv_uar.pfn << PAGE_SHIFT,
+				PAGE_SIZE);
+	if (!mdev->uar_map)
+		goto err_uar;
+	spin_lock_init(&mdev->uar_lock);
+
+	mdev->dev = dev;
+	mdev->dma_device = &(dev->pdev->dev);
+	mdev->pdev = dev->pdev;
+	mdev->device_up = false;
+
+	mdev->LSO_support = !!(dev->caps.flags & (1 << 15));
+	if (!mdev->LSO_support)
+		mlx4_warn(mdev, "LSO not supported, please upgrade to later FW version to enable LSO\n");
+
+	if (mlx4_mr_alloc(mdev->dev, mdev->priv_pdn, 0, ~0ull,
+			 MLX4_PERM_LOCAL_WRITE |  MLX4_PERM_LOCAL_READ,
+			 0, 0, &mdev->mr)) {
+		mlx4_err(mdev, "Failed allocating memory region\n");
+		goto err_map;
+	}
+	if (mlx4_mr_enable(mdev->dev, &mdev->mr)) {
+		mlx4_err(mdev, "Failed enabling memory region\n");
+		goto err_mr;
+	}
+
+	/* Build device profile according to supplied module parameters */
+	err = mlx4_en_get_profile(mdev);
+	if (err) {
+		mlx4_err(mdev, "Bad module parameters, aborting\n");
+		goto err_mr;
+	}
+
+	/* Configure which ports to start according to module parameters */
+	mdev->port_cnt = 0;
+	mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH)
+		mdev->port_cnt++;
+
+	/* Initialize time stamp mechanism */
+	if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_TS)
+		mlx4_en_init_timestamp(mdev);
+
+	/* Set default number of RX rings*/
+	mlx4_en_set_num_rx_rings(mdev);
+
+	/* Create our own workqueue for reset/multicast tasks
+	 * Note: we cannot use the shared workqueue because of deadlocks caused
+	 *       by the rtnl lock */
+	mdev->workqueue = create_singlethread_workqueue("mlx4_en");
+	if (!mdev->workqueue) {
+		err = -ENOMEM;
+		goto err_mr;
+	}
+
+	/* At this stage all non-port specific tasks are complete:
+	 * mark the card state as up */
+	mutex_init(&mdev->state_lock);
+	mdev->device_up = true;
+
+	/* Setup ports */
+
+	/* Create a netdev for each port */
+	mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) {
+		mlx4_info(mdev, "Activating port:%d\n", i);
+		if (mlx4_en_init_netdev(mdev, i, &mdev->profile.prof[i]))
+			mdev->pndev[i] = NULL;
+	}
+
+	return mdev;
+
+err_mr:
+	(void) mlx4_mr_free(dev, &mdev->mr);
+err_map:
+	if (mdev->uar_map)
+		iounmap(mdev->uar_map);
+err_uar:
+	mlx4_uar_free(dev, &mdev->priv_uar);
+err_pd:
+	mlx4_pd_free(dev, mdev->priv_pdn);
+err_free_dev:
+	kfree(mdev);
+err_free_res:
+	return NULL;
+}
+
+static struct mlx4_interface mlx4_en_interface = {
+	.add		= mlx4_en_add,
+	.remove		= mlx4_en_remove,
+	.event		= mlx4_en_event,
+	.get_dev	= mlx4_en_get_netdev,
+	.protocol	= MLX4_PROT_ETH,
+};
+
+static void mlx4_en_verify_params(void)
+{
+	if (pfctx > MAX_PFC_TX) {
+		pr_warn("mlx4_en: WARNING: illegal module parameter pfctx 0x%x - should be in range 0-0x%x, will be changed to default (0)\n",
+			pfctx, MAX_PFC_TX);
+		pfctx = 0;
+	}
+
+	if (pfcrx > MAX_PFC_RX) {
+		pr_warn("mlx4_en: WARNING: illegal module parameter pfcrx 0x%x - should be in range 0-0x%x, will be changed to default (0)\n",
+			pfcrx, MAX_PFC_RX);
+		pfcrx = 0;
+	}
+
+	if (inline_thold < MIN_PKT_LEN || inline_thold > MAX_INLINE) {
+		pr_warn("mlx4_en: WARNING: illegal module parameter inline_thold %d - should be in range %d-%d, will be changed to default (%d)\n",
+			inline_thold, MIN_PKT_LEN, MAX_INLINE, MAX_INLINE);
+		inline_thold = MAX_INLINE;
+	}
+}
+
+static int __init mlx4_en_init(void)
+{
+	mlx4_en_verify_params();
+
+	return mlx4_register_interface(&mlx4_en_interface);
+}
+
+static void __exit mlx4_en_cleanup(void)
+{
+	mlx4_unregister_interface(&mlx4_en_interface);
+}
+
+module_init(mlx4_en_init);
+module_exit(mlx4_en_cleanup);
+
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
new file mode 100644
index 0000000..4d69e38
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -0,0 +1,2652 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/etherdevice.h>
+#include <linux/tcp.h>
+#include <linux/if_vlan.h>
+#include <linux/delay.h>
+#include <linux/slab.h>
+#include <linux/hash.h>
+#include <net/ip.h>
+#include <net/busy_poll.h>
+#include <net/vxlan.h>
+
+#include <linux/mlx4/driver.h>
+#include <linux/mlx4/device.h>
+#include <linux/mlx4/cmd.h>
+#include <linux/mlx4/cq.h>
+
+#include "mlx4_en.h"
+#include "en_port.h"
+
+int mlx4_en_setup_tc(struct net_device *dev, u8 up)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	int i;
+	unsigned int offset = 0;
+
+	if (up && up != MLX4_EN_NUM_UP)
+		return -EINVAL;
+
+	netdev_set_num_tc(dev, up);
+
+	/* Partition Tx queues evenly amongst UP's */
+	for (i = 0; i < up; i++) {
+		netdev_set_tc_queue(dev, i, priv->num_tx_rings_p_up, offset);
+		offset += priv->num_tx_rings_p_up;
+	}
+
+	return 0;
+}
+
+#ifdef CONFIG_NET_RX_BUSY_POLL
+/* must be called with local_bh_disable()d */
+static int mlx4_en_low_latency_recv(struct napi_struct *napi)
+{
+	struct mlx4_en_cq *cq = container_of(napi, struct mlx4_en_cq, napi);
+	struct net_device *dev = cq->dev;
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_rx_ring *rx_ring = priv->rx_ring[cq->ring];
+	int done;
+
+	if (!priv->port_up)
+		return LL_FLUSH_FAILED;
+
+	if (!mlx4_en_cq_lock_poll(cq))
+		return LL_FLUSH_BUSY;
+
+	done = mlx4_en_process_rx_cq(dev, cq, 4);
+	if (likely(done))
+		rx_ring->cleaned += done;
+	else
+		rx_ring->misses++;
+
+	mlx4_en_cq_unlock_poll(cq);
+
+	return done;
+}
+#endif	/* CONFIG_NET_RX_BUSY_POLL */
+
+#ifdef CONFIG_RFS_ACCEL
+
+struct mlx4_en_filter {
+	struct list_head next;
+	struct work_struct work;
+
+	u8     ip_proto;
+	__be32 src_ip;
+	__be32 dst_ip;
+	__be16 src_port;
+	__be16 dst_port;
+
+	int rxq_index;
+	struct mlx4_en_priv *priv;
+	u32 flow_id;			/* RFS infrastructure id */
+	int id;				/* mlx4_en driver id */
+	u64 reg_id;			/* Flow steering API id */
+	u8 activated;			/* Used to prevent expiry before filter
+					 * is attached
+					 */
+	struct hlist_node filter_chain;
+};
+
+static void mlx4_en_filter_rfs_expire(struct mlx4_en_priv *priv);
+
+static enum mlx4_net_trans_rule_id mlx4_ip_proto_to_trans_rule_id(u8 ip_proto)
+{
+	switch (ip_proto) {
+	case IPPROTO_UDP:
+		return MLX4_NET_TRANS_RULE_ID_UDP;
+	case IPPROTO_TCP:
+		return MLX4_NET_TRANS_RULE_ID_TCP;
+	default:
+		return MLX4_NET_TRANS_RULE_NUM;
+	}
+};
+
+static void mlx4_en_filter_work(struct work_struct *work)
+{
+	struct mlx4_en_filter *filter = container_of(work,
+						     struct mlx4_en_filter,
+						     work);
+	struct mlx4_en_priv *priv = filter->priv;
+	struct mlx4_spec_list spec_tcp_udp = {
+		.id = mlx4_ip_proto_to_trans_rule_id(filter->ip_proto),
+		{
+			.tcp_udp = {
+				.dst_port = filter->dst_port,
+				.dst_port_msk = (__force __be16)-1,
+				.src_port = filter->src_port,
+				.src_port_msk = (__force __be16)-1,
+			},
+		},
+	};
+	struct mlx4_spec_list spec_ip = {
+		.id = MLX4_NET_TRANS_RULE_ID_IPV4,
+		{
+			.ipv4 = {
+				.dst_ip = filter->dst_ip,
+				.dst_ip_msk = (__force __be32)-1,
+				.src_ip = filter->src_ip,
+				.src_ip_msk = (__force __be32)-1,
+			},
+		},
+	};
+	struct mlx4_spec_list spec_eth = {
+		.id = MLX4_NET_TRANS_RULE_ID_ETH,
+	};
+	struct mlx4_net_trans_rule rule = {
+		.list = LIST_HEAD_INIT(rule.list),
+		.queue_mode = MLX4_NET_TRANS_Q_LIFO,
+		.exclusive = 1,
+		.allow_loopback = 1,
+		.promisc_mode = MLX4_FS_REGULAR,
+		.port = priv->port,
+		.priority = MLX4_DOMAIN_RFS,
+	};
+	int rc;
+	__be64 mac_mask = cpu_to_be64(MLX4_MAC_MASK << 16);
+
+	if (spec_tcp_udp.id >= MLX4_NET_TRANS_RULE_NUM) {
+		en_warn(priv, "RFS: ignoring unsupported ip protocol (%d)\n",
+			filter->ip_proto);
+		goto ignore;
+	}
+	list_add_tail(&spec_eth.list, &rule.list);
+	list_add_tail(&spec_ip.list, &rule.list);
+	list_add_tail(&spec_tcp_udp.list, &rule.list);
+
+	rule.qpn = priv->rss_map.qps[filter->rxq_index].qpn;
+	memcpy(spec_eth.eth.dst_mac, priv->dev->dev_addr, ETH_ALEN);
+	memcpy(spec_eth.eth.dst_mac_msk, &mac_mask, ETH_ALEN);
+
+	filter->activated = 0;
+
+	if (filter->reg_id) {
+		rc = mlx4_flow_detach(priv->mdev->dev, filter->reg_id);
+		if (rc && rc != -ENOENT)
+			en_err(priv, "Error detaching flow. rc = %d\n", rc);
+	}
+
+	rc = mlx4_flow_attach(priv->mdev->dev, &rule, &filter->reg_id);
+	if (rc)
+		en_err(priv, "Error attaching flow. err = %d\n", rc);
+
+ignore:
+	mlx4_en_filter_rfs_expire(priv);
+
+	filter->activated = 1;
+}
+
+static inline struct hlist_head *
+filter_hash_bucket(struct mlx4_en_priv *priv, __be32 src_ip, __be32 dst_ip,
+		   __be16 src_port, __be16 dst_port)
+{
+	unsigned long l;
+	int bucket_idx;
+
+	l = (__force unsigned long)src_port |
+	    ((__force unsigned long)dst_port << 2);
+	l ^= (__force unsigned long)(src_ip ^ dst_ip);
+
+	bucket_idx = hash_long(l, MLX4_EN_FILTER_HASH_SHIFT);
+
+	return &priv->filter_hash[bucket_idx];
+}
+
+static struct mlx4_en_filter *
+mlx4_en_filter_alloc(struct mlx4_en_priv *priv, int rxq_index, __be32 src_ip,
+		     __be32 dst_ip, u8 ip_proto, __be16 src_port,
+		     __be16 dst_port, u32 flow_id)
+{
+	struct mlx4_en_filter *filter = NULL;
+
+	filter = kzalloc(sizeof(struct mlx4_en_filter), GFP_ATOMIC);
+	if (!filter)
+		return NULL;
+
+	filter->priv = priv;
+	filter->rxq_index = rxq_index;
+	INIT_WORK(&filter->work, mlx4_en_filter_work);
+
+	filter->src_ip = src_ip;
+	filter->dst_ip = dst_ip;
+	filter->ip_proto = ip_proto;
+	filter->src_port = src_port;
+	filter->dst_port = dst_port;
+
+	filter->flow_id = flow_id;
+
+	filter->id = priv->last_filter_id++ % RPS_NO_FILTER;
+
+	list_add_tail(&filter->next, &priv->filters);
+	hlist_add_head(&filter->filter_chain,
+		       filter_hash_bucket(priv, src_ip, dst_ip, src_port,
+					  dst_port));
+
+	return filter;
+}
+
+static void mlx4_en_filter_free(struct mlx4_en_filter *filter)
+{
+	struct mlx4_en_priv *priv = filter->priv;
+	int rc;
+
+	list_del(&filter->next);
+
+	rc = mlx4_flow_detach(priv->mdev->dev, filter->reg_id);
+	if (rc && rc != -ENOENT)
+		en_err(priv, "Error detaching flow. rc = %d\n", rc);
+
+	kfree(filter);
+}
+
+static inline struct mlx4_en_filter *
+mlx4_en_filter_find(struct mlx4_en_priv *priv, __be32 src_ip, __be32 dst_ip,
+		    u8 ip_proto, __be16 src_port, __be16 dst_port)
+{
+	struct mlx4_en_filter *filter;
+	struct mlx4_en_filter *ret = NULL;
+
+	hlist_for_each_entry(filter,
+			     filter_hash_bucket(priv, src_ip, dst_ip,
+						src_port, dst_port),
+			     filter_chain) {
+		if (filter->src_ip == src_ip &&
+		    filter->dst_ip == dst_ip &&
+		    filter->ip_proto == ip_proto &&
+		    filter->src_port == src_port &&
+		    filter->dst_port == dst_port) {
+			ret = filter;
+			break;
+		}
+	}
+
+	return ret;
+}
+
+static int
+mlx4_en_filter_rfs(struct net_device *net_dev, const struct sk_buff *skb,
+		   u16 rxq_index, u32 flow_id)
+{
+	struct mlx4_en_priv *priv = netdev_priv(net_dev);
+	struct mlx4_en_filter *filter;
+	const struct iphdr *ip;
+	const __be16 *ports;
+	u8 ip_proto;
+	__be32 src_ip;
+	__be32 dst_ip;
+	__be16 src_port;
+	__be16 dst_port;
+	int nhoff = skb_network_offset(skb);
+	int ret = 0;
+
+	if (skb->protocol != htons(ETH_P_IP))
+		return -EPROTONOSUPPORT;
+
+	ip = (const struct iphdr *)(skb->data + nhoff);
+	if (ip_is_fragment(ip))
+		return -EPROTONOSUPPORT;
+
+	if ((ip->protocol != IPPROTO_TCP) && (ip->protocol != IPPROTO_UDP))
+		return -EPROTONOSUPPORT;
+	ports = (const __be16 *)(skb->data + nhoff + 4 * ip->ihl);
+
+	ip_proto = ip->protocol;
+	src_ip = ip->saddr;
+	dst_ip = ip->daddr;
+	src_port = ports[0];
+	dst_port = ports[1];
+
+	spin_lock_bh(&priv->filters_lock);
+	filter = mlx4_en_filter_find(priv, src_ip, dst_ip, ip_proto,
+				     src_port, dst_port);
+	if (filter) {
+		if (filter->rxq_index == rxq_index)
+			goto out;
+
+		filter->rxq_index = rxq_index;
+	} else {
+		filter = mlx4_en_filter_alloc(priv, rxq_index,
+					      src_ip, dst_ip, ip_proto,
+					      src_port, dst_port, flow_id);
+		if (!filter) {
+			ret = -ENOMEM;
+			goto err;
+		}
+	}
+
+	queue_work(priv->mdev->workqueue, &filter->work);
+
+out:
+	ret = filter->id;
+err:
+	spin_unlock_bh(&priv->filters_lock);
+
+	return ret;
+}
+
+void mlx4_en_cleanup_filters(struct mlx4_en_priv *priv)
+{
+	struct mlx4_en_filter *filter, *tmp;
+	LIST_HEAD(del_list);
+
+	spin_lock_bh(&priv->filters_lock);
+	list_for_each_entry_safe(filter, tmp, &priv->filters, next) {
+		list_move(&filter->next, &del_list);
+		hlist_del(&filter->filter_chain);
+	}
+	spin_unlock_bh(&priv->filters_lock);
+
+	list_for_each_entry_safe(filter, tmp, &del_list, next) {
+		cancel_work_sync(&filter->work);
+		mlx4_en_filter_free(filter);
+	}
+}
+
+static void mlx4_en_filter_rfs_expire(struct mlx4_en_priv *priv)
+{
+	struct mlx4_en_filter *filter = NULL, *tmp, *last_filter = NULL;
+	LIST_HEAD(del_list);
+	int i = 0;
+
+	spin_lock_bh(&priv->filters_lock);
+	list_for_each_entry_safe(filter, tmp, &priv->filters, next) {
+		if (i > MLX4_EN_FILTER_EXPIRY_QUOTA)
+			break;
+
+		if (filter->activated &&
+		    !work_pending(&filter->work) &&
+		    rps_may_expire_flow(priv->dev,
+					filter->rxq_index, filter->flow_id,
+					filter->id)) {
+			list_move(&filter->next, &del_list);
+			hlist_del(&filter->filter_chain);
+		} else
+			last_filter = filter;
+
+		i++;
+	}
+
+	if (last_filter && (&last_filter->next != priv->filters.next))
+		list_move(&priv->filters, &last_filter->next);
+
+	spin_unlock_bh(&priv->filters_lock);
+
+	list_for_each_entry_safe(filter, tmp, &del_list, next)
+		mlx4_en_filter_free(filter);
+}
+#endif
+
+static int mlx4_en_vlan_rx_add_vid(struct net_device *dev,
+				   __be16 proto, u16 vid)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	int err;
+	int idx;
+
+	en_dbg(HW, priv, "adding VLAN:%d\n", vid);
+
+	set_bit(vid, priv->active_vlans);
+
+	/* Add VID to port VLAN filter */
+	mutex_lock(&mdev->state_lock);
+	if (mdev->device_up && priv->port_up) {
+		err = mlx4_SET_VLAN_FLTR(mdev->dev, priv);
+		if (err)
+			en_err(priv, "Failed configuring VLAN filter\n");
+	}
+	if (mlx4_register_vlan(mdev->dev, priv->port, vid, &idx))
+		en_dbg(HW, priv, "failed adding vlan %d\n", vid);
+	mutex_unlock(&mdev->state_lock);
+
+	return 0;
+}
+
+static int mlx4_en_vlan_rx_kill_vid(struct net_device *dev,
+				    __be16 proto, u16 vid)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	int err;
+
+	en_dbg(HW, priv, "Killing VID:%d\n", vid);
+
+	clear_bit(vid, priv->active_vlans);
+
+	/* Remove VID from port VLAN filter */
+	mutex_lock(&mdev->state_lock);
+	mlx4_unregister_vlan(mdev->dev, priv->port, vid);
+
+	if (mdev->device_up && priv->port_up) {
+		err = mlx4_SET_VLAN_FLTR(mdev->dev, priv);
+		if (err)
+			en_err(priv, "Failed configuring VLAN filter\n");
+	}
+	mutex_unlock(&mdev->state_lock);
+
+	return 0;
+}
+
+static void mlx4_en_u64_to_mac(unsigned char dst_mac[ETH_ALEN + 2], u64 src_mac)
+{
+	int i;
+	for (i = ETH_ALEN - 1; i >= 0; --i) {
+		dst_mac[i] = src_mac & 0xff;
+		src_mac >>= 8;
+	}
+	memset(&dst_mac[ETH_ALEN], 0, 2);
+}
+
+
+static int mlx4_en_tunnel_steer_add(struct mlx4_en_priv *priv, unsigned char *addr,
+				    int qpn, u64 *reg_id)
+{
+	int err;
+
+	if (priv->mdev->dev->caps.tunnel_offload_mode != MLX4_TUNNEL_OFFLOAD_MODE_VXLAN)
+		return 0; /* do nothing */
+
+	err = mlx4_tunnel_steer_add(priv->mdev->dev, addr, priv->port, qpn,
+				    MLX4_DOMAIN_NIC, reg_id);
+	if (err) {
+		en_err(priv, "failed to add vxlan steering rule, err %d\n", err);
+		return err;
+	}
+	en_dbg(DRV, priv, "added vxlan steering rule, mac %pM reg_id %llx\n", addr, *reg_id);
+	return 0;
+}
+
+
+static int mlx4_en_uc_steer_add(struct mlx4_en_priv *priv,
+				unsigned char *mac, int *qpn, u64 *reg_id)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_dev *dev = mdev->dev;
+	int err;
+
+	switch (dev->caps.steering_mode) {
+	case MLX4_STEERING_MODE_B0: {
+		struct mlx4_qp qp;
+		u8 gid[16] = {0};
+
+		qp.qpn = *qpn;
+		memcpy(&gid[10], mac, ETH_ALEN);
+		gid[5] = priv->port;
+
+		err = mlx4_unicast_attach(dev, &qp, gid, 0, MLX4_PROT_ETH);
+		break;
+	}
+	case MLX4_STEERING_MODE_DEVICE_MANAGED: {
+		struct mlx4_spec_list spec_eth = { {NULL} };
+		__be64 mac_mask = cpu_to_be64(MLX4_MAC_MASK << 16);
+
+		struct mlx4_net_trans_rule rule = {
+			.queue_mode = MLX4_NET_TRANS_Q_FIFO,
+			.exclusive = 0,
+			.allow_loopback = 1,
+			.promisc_mode = MLX4_FS_REGULAR,
+			.priority = MLX4_DOMAIN_NIC,
+		};
+
+		rule.port = priv->port;
+		rule.qpn = *qpn;
+		INIT_LIST_HEAD(&rule.list);
+
+		spec_eth.id = MLX4_NET_TRANS_RULE_ID_ETH;
+		memcpy(spec_eth.eth.dst_mac, mac, ETH_ALEN);
+		memcpy(spec_eth.eth.dst_mac_msk, &mac_mask, ETH_ALEN);
+		list_add_tail(&spec_eth.list, &rule.list);
+
+		err = mlx4_flow_attach(dev, &rule, reg_id);
+		break;
+	}
+	default:
+		return -EINVAL;
+	}
+	if (err)
+		en_warn(priv, "Failed Attaching Unicast\n");
+
+	return err;
+}
+
+static void mlx4_en_uc_steer_release(struct mlx4_en_priv *priv,
+				     unsigned char *mac, int qpn, u64 reg_id)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_dev *dev = mdev->dev;
+
+	switch (dev->caps.steering_mode) {
+	case MLX4_STEERING_MODE_B0: {
+		struct mlx4_qp qp;
+		u8 gid[16] = {0};
+
+		qp.qpn = qpn;
+		memcpy(&gid[10], mac, ETH_ALEN);
+		gid[5] = priv->port;
+
+		mlx4_unicast_detach(dev, &qp, gid, MLX4_PROT_ETH);
+		break;
+	}
+	case MLX4_STEERING_MODE_DEVICE_MANAGED: {
+		mlx4_flow_detach(dev, reg_id);
+		break;
+	}
+	default:
+		en_err(priv, "Invalid steering mode.\n");
+	}
+}
+
+static int mlx4_en_get_qp(struct mlx4_en_priv *priv)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_dev *dev = mdev->dev;
+	struct mlx4_mac_entry *entry;
+	int index = 0;
+	int err = 0;
+	u64 reg_id;
+	int *qpn = &priv->base_qpn;
+	u64 mac = mlx4_mac_to_u64(priv->dev->dev_addr);
+
+	en_dbg(DRV, priv, "Registering MAC: %pM for adding\n",
+	       priv->dev->dev_addr);
+	index = mlx4_register_mac(dev, priv->port, mac);
+	if (index < 0) {
+		err = index;
+		en_err(priv, "Failed adding MAC: %pM\n",
+		       priv->dev->dev_addr);
+		return err;
+	}
+
+	if (dev->caps.steering_mode == MLX4_STEERING_MODE_A0) {
+		int base_qpn = mlx4_get_base_qpn(dev, priv->port);
+		*qpn = base_qpn + index;
+		return 0;
+	}
+
+	err = mlx4_qp_reserve_range(dev, 1, 1, qpn);
+	en_dbg(DRV, priv, "Reserved qp %d\n", *qpn);
+	if (err) {
+		en_err(priv, "Failed to reserve qp for mac registration\n");
+		goto qp_err;
+	}
+
+	err = mlx4_en_uc_steer_add(priv, priv->dev->dev_addr, qpn, &reg_id);
+	if (err)
+		goto steer_err;
+
+	err = mlx4_en_tunnel_steer_add(priv, priv->dev->dev_addr, *qpn,
+				       &priv->tunnel_reg_id);
+	if (err)
+		goto tunnel_err;
+
+	entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+	if (!entry) {
+		err = -ENOMEM;
+		goto alloc_err;
+	}
+	memcpy(entry->mac, priv->dev->dev_addr, sizeof(entry->mac));
+	memcpy(priv->current_mac, entry->mac, sizeof(priv->current_mac));
+	entry->reg_id = reg_id;
+
+	hlist_add_head_rcu(&entry->hlist,
+			   &priv->mac_hash[entry->mac[MLX4_EN_MAC_HASH_IDX]]);
+
+	return 0;
+
+alloc_err:
+	if (priv->tunnel_reg_id)
+		mlx4_flow_detach(priv->mdev->dev, priv->tunnel_reg_id);
+tunnel_err:
+	mlx4_en_uc_steer_release(priv, priv->dev->dev_addr, *qpn, reg_id);
+
+steer_err:
+	mlx4_qp_release_range(dev, *qpn, 1);
+
+qp_err:
+	mlx4_unregister_mac(dev, priv->port, mac);
+	return err;
+}
+
+static void mlx4_en_put_qp(struct mlx4_en_priv *priv)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_dev *dev = mdev->dev;
+	int qpn = priv->base_qpn;
+	u64 mac;
+
+	if (dev->caps.steering_mode == MLX4_STEERING_MODE_A0) {
+		mac = mlx4_mac_to_u64(priv->dev->dev_addr);
+		en_dbg(DRV, priv, "Registering MAC: %pM for deleting\n",
+		       priv->dev->dev_addr);
+		mlx4_unregister_mac(dev, priv->port, mac);
+	} else {
+		struct mlx4_mac_entry *entry;
+		struct hlist_node *tmp;
+		struct hlist_head *bucket;
+		unsigned int i;
+
+		for (i = 0; i < MLX4_EN_MAC_HASH_SIZE; ++i) {
+			bucket = &priv->mac_hash[i];
+			hlist_for_each_entry_safe(entry, tmp, bucket, hlist) {
+				mac = mlx4_mac_to_u64(entry->mac);
+				en_dbg(DRV, priv, "Registering MAC: %pM for deleting\n",
+				       entry->mac);
+				mlx4_en_uc_steer_release(priv, entry->mac,
+							 qpn, entry->reg_id);
+
+				mlx4_unregister_mac(dev, priv->port, mac);
+				hlist_del_rcu(&entry->hlist);
+				kfree_rcu(entry, rcu);
+			}
+		}
+
+		if (priv->tunnel_reg_id) {
+			mlx4_flow_detach(priv->mdev->dev, priv->tunnel_reg_id);
+			priv->tunnel_reg_id = 0;
+		}
+
+		en_dbg(DRV, priv, "Releasing qp: port %d, qpn %d\n",
+		       priv->port, qpn);
+		mlx4_qp_release_range(dev, qpn, 1);
+		priv->flags &= ~MLX4_EN_FLAG_FORCE_PROMISC;
+	}
+}
+
+static int mlx4_en_replace_mac(struct mlx4_en_priv *priv, int qpn,
+			       unsigned char *new_mac, unsigned char *prev_mac)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_dev *dev = mdev->dev;
+	int err = 0;
+	u64 new_mac_u64 = mlx4_mac_to_u64(new_mac);
+
+	if (dev->caps.steering_mode != MLX4_STEERING_MODE_A0) {
+		struct hlist_head *bucket;
+		unsigned int mac_hash;
+		struct mlx4_mac_entry *entry;
+		struct hlist_node *tmp;
+		u64 prev_mac_u64 = mlx4_mac_to_u64(prev_mac);
+
+		bucket = &priv->mac_hash[prev_mac[MLX4_EN_MAC_HASH_IDX]];
+		hlist_for_each_entry_safe(entry, tmp, bucket, hlist) {
+			if (ether_addr_equal_64bits(entry->mac, prev_mac)) {
+				mlx4_en_uc_steer_release(priv, entry->mac,
+							 qpn, entry->reg_id);
+				mlx4_unregister_mac(dev, priv->port,
+						    prev_mac_u64);
+				hlist_del_rcu(&entry->hlist);
+				synchronize_rcu();
+				memcpy(entry->mac, new_mac, ETH_ALEN);
+				entry->reg_id = 0;
+				mac_hash = new_mac[MLX4_EN_MAC_HASH_IDX];
+				hlist_add_head_rcu(&entry->hlist,
+						   &priv->mac_hash[mac_hash]);
+				mlx4_register_mac(dev, priv->port, new_mac_u64);
+				err = mlx4_en_uc_steer_add(priv, new_mac,
+							   &qpn,
+							   &entry->reg_id);
+				if (err)
+					return err;
+				if (priv->tunnel_reg_id) {
+					mlx4_flow_detach(priv->mdev->dev, priv->tunnel_reg_id);
+					priv->tunnel_reg_id = 0;
+				}
+				err = mlx4_en_tunnel_steer_add(priv, new_mac, qpn,
+							       &priv->tunnel_reg_id);
+				return err;
+			}
+		}
+		return -EINVAL;
+	}
+
+	return __mlx4_replace_mac(dev, priv->port, qpn, new_mac_u64);
+}
+
+static int mlx4_en_do_set_mac(struct mlx4_en_priv *priv,
+			      unsigned char new_mac[ETH_ALEN + 2])
+{
+	int err = 0;
+
+	if (priv->port_up) {
+		/* Remove old MAC and insert the new one */
+		err = mlx4_en_replace_mac(priv, priv->base_qpn,
+					  new_mac, priv->current_mac);
+		if (err)
+			en_err(priv, "Failed changing HW MAC address\n");
+	} else
+		en_dbg(HW, priv, "Port is down while registering mac, exiting...\n");
+
+	if (!err)
+		memcpy(priv->current_mac, new_mac, sizeof(priv->current_mac));
+
+	return err;
+}
+
+static int mlx4_en_set_mac(struct net_device *dev, void *addr)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct sockaddr *saddr = addr;
+	unsigned char new_mac[ETH_ALEN + 2];
+	int err;
+
+	if (!is_valid_ether_addr(saddr->sa_data))
+		return -EADDRNOTAVAIL;
+
+	mutex_lock(&mdev->state_lock);
+	memcpy(new_mac, saddr->sa_data, ETH_ALEN);
+	err = mlx4_en_do_set_mac(priv, new_mac);
+	if (!err)
+		memcpy(dev->dev_addr, saddr->sa_data, ETH_ALEN);
+	mutex_unlock(&mdev->state_lock);
+
+	return err;
+}
+
+static void mlx4_en_clear_list(struct net_device *dev)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_mc_list *tmp, *mc_to_del;
+
+	list_for_each_entry_safe(mc_to_del, tmp, &priv->mc_list, list) {
+		list_del(&mc_to_del->list);
+		kfree(mc_to_del);
+	}
+}
+
+static void mlx4_en_cache_mclist(struct net_device *dev)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct netdev_hw_addr *ha;
+	struct mlx4_en_mc_list *tmp;
+
+	mlx4_en_clear_list(dev);
+	netdev_for_each_mc_addr(ha, dev) {
+		tmp = kzalloc(sizeof(struct mlx4_en_mc_list), GFP_ATOMIC);
+		if (!tmp) {
+			mlx4_en_clear_list(dev);
+			return;
+		}
+		memcpy(tmp->addr, ha->addr, ETH_ALEN);
+		list_add_tail(&tmp->list, &priv->mc_list);
+	}
+}
+
+static void update_mclist_flags(struct mlx4_en_priv *priv,
+				struct list_head *dst,
+				struct list_head *src)
+{
+	struct mlx4_en_mc_list *dst_tmp, *src_tmp, *new_mc;
+	bool found;
+
+	/* Find all the entries that should be removed from dst,
+	 * These are the entries that are not found in src
+	 */
+	list_for_each_entry(dst_tmp, dst, list) {
+		found = false;
+		list_for_each_entry(src_tmp, src, list) {
+			if (ether_addr_equal(dst_tmp->addr, src_tmp->addr)) {
+				found = true;
+				break;
+			}
+		}
+		if (!found)
+			dst_tmp->action = MCLIST_REM;
+	}
+
+	/* Add entries that exist in src but not in dst
+	 * mark them as need to add
+	 */
+	list_for_each_entry(src_tmp, src, list) {
+		found = false;
+		list_for_each_entry(dst_tmp, dst, list) {
+			if (ether_addr_equal(dst_tmp->addr, src_tmp->addr)) {
+				dst_tmp->action = MCLIST_NONE;
+				found = true;
+				break;
+			}
+		}
+		if (!found) {
+			new_mc = kmemdup(src_tmp,
+					 sizeof(struct mlx4_en_mc_list),
+					 GFP_KERNEL);
+			if (!new_mc)
+				return;
+
+			new_mc->action = MCLIST_ADD;
+			list_add_tail(&new_mc->list, dst);
+		}
+	}
+}
+
+static void mlx4_en_set_rx_mode(struct net_device *dev)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+
+	if (!priv->port_up)
+		return;
+
+	queue_work(priv->mdev->workqueue, &priv->rx_mode_task);
+}
+
+static void mlx4_en_set_promisc_mode(struct mlx4_en_priv *priv,
+				     struct mlx4_en_dev *mdev)
+{
+	int err = 0;
+
+	if (!(priv->flags & MLX4_EN_FLAG_PROMISC)) {
+		if (netif_msg_rx_status(priv))
+			en_warn(priv, "Entering promiscuous mode\n");
+		priv->flags |= MLX4_EN_FLAG_PROMISC;
+
+		/* Enable promiscouos mode */
+		switch (mdev->dev->caps.steering_mode) {
+		case MLX4_STEERING_MODE_DEVICE_MANAGED:
+			err = mlx4_flow_steer_promisc_add(mdev->dev,
+							  priv->port,
+							  priv->base_qpn,
+							  MLX4_FS_ALL_DEFAULT);
+			if (err)
+				en_err(priv, "Failed enabling promiscuous mode\n");
+			priv->flags |= MLX4_EN_FLAG_MC_PROMISC;
+			break;
+
+		case MLX4_STEERING_MODE_B0:
+			err = mlx4_unicast_promisc_add(mdev->dev,
+						       priv->base_qpn,
+						       priv->port);
+			if (err)
+				en_err(priv, "Failed enabling unicast promiscuous mode\n");
+
+			/* Add the default qp number as multicast
+			 * promisc
+			 */
+			if (!(priv->flags & MLX4_EN_FLAG_MC_PROMISC)) {
+				err = mlx4_multicast_promisc_add(mdev->dev,
+								 priv->base_qpn,
+								 priv->port);
+				if (err)
+					en_err(priv, "Failed enabling multicast promiscuous mode\n");
+				priv->flags |= MLX4_EN_FLAG_MC_PROMISC;
+			}
+			break;
+
+		case MLX4_STEERING_MODE_A0:
+			err = mlx4_SET_PORT_qpn_calc(mdev->dev,
+						     priv->port,
+						     priv->base_qpn,
+						     1);
+			if (err)
+				en_err(priv, "Failed enabling promiscuous mode\n");
+			break;
+		}
+
+		/* Disable port multicast filter (unconditionally) */
+		err = mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, 0,
+					  0, MLX4_MCAST_DISABLE);
+		if (err)
+			en_err(priv, "Failed disabling multicast filter\n");
+	}
+}
+
+static void mlx4_en_clear_promisc_mode(struct mlx4_en_priv *priv,
+				       struct mlx4_en_dev *mdev)
+{
+	int err = 0;
+
+	if (netif_msg_rx_status(priv))
+		en_warn(priv, "Leaving promiscuous mode\n");
+	priv->flags &= ~MLX4_EN_FLAG_PROMISC;
+
+	/* Disable promiscouos mode */
+	switch (mdev->dev->caps.steering_mode) {
+	case MLX4_STEERING_MODE_DEVICE_MANAGED:
+		err = mlx4_flow_steer_promisc_remove(mdev->dev,
+						     priv->port,
+						     MLX4_FS_ALL_DEFAULT);
+		if (err)
+			en_err(priv, "Failed disabling promiscuous mode\n");
+		priv->flags &= ~MLX4_EN_FLAG_MC_PROMISC;
+		break;
+
+	case MLX4_STEERING_MODE_B0:
+		err = mlx4_unicast_promisc_remove(mdev->dev,
+						  priv->base_qpn,
+						  priv->port);
+		if (err)
+			en_err(priv, "Failed disabling unicast promiscuous mode\n");
+		/* Disable Multicast promisc */
+		if (priv->flags & MLX4_EN_FLAG_MC_PROMISC) {
+			err = mlx4_multicast_promisc_remove(mdev->dev,
+							    priv->base_qpn,
+							    priv->port);
+			if (err)
+				en_err(priv, "Failed disabling multicast promiscuous mode\n");
+			priv->flags &= ~MLX4_EN_FLAG_MC_PROMISC;
+		}
+		break;
+
+	case MLX4_STEERING_MODE_A0:
+		err = mlx4_SET_PORT_qpn_calc(mdev->dev,
+					     priv->port,
+					     priv->base_qpn, 0);
+		if (err)
+			en_err(priv, "Failed disabling promiscuous mode\n");
+		break;
+	}
+}
+
+static void mlx4_en_do_multicast(struct mlx4_en_priv *priv,
+				 struct net_device *dev,
+				 struct mlx4_en_dev *mdev)
+{
+	struct mlx4_en_mc_list *mclist, *tmp;
+	u64 mcast_addr = 0;
+	u8 mc_list[16] = {0};
+	int err = 0;
+
+	/* Enable/disable the multicast filter according to IFF_ALLMULTI */
+	if (dev->flags & IFF_ALLMULTI) {
+		err = mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, 0,
+					  0, MLX4_MCAST_DISABLE);
+		if (err)
+			en_err(priv, "Failed disabling multicast filter\n");
+
+		/* Add the default qp number as multicast promisc */
+		if (!(priv->flags & MLX4_EN_FLAG_MC_PROMISC)) {
+			switch (mdev->dev->caps.steering_mode) {
+			case MLX4_STEERING_MODE_DEVICE_MANAGED:
+				err = mlx4_flow_steer_promisc_add(mdev->dev,
+								  priv->port,
+								  priv->base_qpn,
+								  MLX4_FS_MC_DEFAULT);
+				break;
+
+			case MLX4_STEERING_MODE_B0:
+				err = mlx4_multicast_promisc_add(mdev->dev,
+								 priv->base_qpn,
+								 priv->port);
+				break;
+
+			case MLX4_STEERING_MODE_A0:
+				break;
+			}
+			if (err)
+				en_err(priv, "Failed entering multicast promisc mode\n");
+			priv->flags |= MLX4_EN_FLAG_MC_PROMISC;
+		}
+	} else {
+		/* Disable Multicast promisc */
+		if (priv->flags & MLX4_EN_FLAG_MC_PROMISC) {
+			switch (mdev->dev->caps.steering_mode) {
+			case MLX4_STEERING_MODE_DEVICE_MANAGED:
+				err = mlx4_flow_steer_promisc_remove(mdev->dev,
+								     priv->port,
+								     MLX4_FS_MC_DEFAULT);
+				break;
+
+			case MLX4_STEERING_MODE_B0:
+				err = mlx4_multicast_promisc_remove(mdev->dev,
+								    priv->base_qpn,
+								    priv->port);
+				break;
+
+			case MLX4_STEERING_MODE_A0:
+				break;
+			}
+			if (err)
+				en_err(priv, "Failed disabling multicast promiscuous mode\n");
+			priv->flags &= ~MLX4_EN_FLAG_MC_PROMISC;
+		}
+
+		err = mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, 0,
+					  0, MLX4_MCAST_DISABLE);
+		if (err)
+			en_err(priv, "Failed disabling multicast filter\n");
+
+		/* Flush mcast filter and init it with broadcast address */
+		mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, ETH_BCAST,
+				    1, MLX4_MCAST_CONFIG);
+
+		/* Update multicast list - we cache all addresses so they won't
+		 * change while HW is updated holding the command semaphor */
+		netif_addr_lock_bh(dev);
+		mlx4_en_cache_mclist(dev);
+		netif_addr_unlock_bh(dev);
+		list_for_each_entry(mclist, &priv->mc_list, list) {
+			mcast_addr = mlx4_mac_to_u64(mclist->addr);
+			mlx4_SET_MCAST_FLTR(mdev->dev, priv->port,
+					    mcast_addr, 0, MLX4_MCAST_CONFIG);
+		}
+		err = mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, 0,
+					  0, MLX4_MCAST_ENABLE);
+		if (err)
+			en_err(priv, "Failed enabling multicast filter\n");
+
+		update_mclist_flags(priv, &priv->curr_list, &priv->mc_list);
+		list_for_each_entry_safe(mclist, tmp, &priv->curr_list, list) {
+			if (mclist->action == MCLIST_REM) {
+				/* detach this address and delete from list */
+				memcpy(&mc_list[10], mclist->addr, ETH_ALEN);
+				mc_list[5] = priv->port;
+				err = mlx4_multicast_detach(mdev->dev,
+							    &priv->rss_map.indir_qp,
+							    mc_list,
+							    MLX4_PROT_ETH,
+							    mclist->reg_id);
+				if (err)
+					en_err(priv, "Fail to detach multicast address\n");
+
+				if (mclist->tunnel_reg_id) {
+					err = mlx4_flow_detach(priv->mdev->dev, mclist->tunnel_reg_id);
+					if (err)
+						en_err(priv, "Failed to detach multicast address\n");
+				}
+
+				/* remove from list */
+				list_del(&mclist->list);
+				kfree(mclist);
+			} else if (mclist->action == MCLIST_ADD) {
+				/* attach the address */
+				memcpy(&mc_list[10], mclist->addr, ETH_ALEN);
+				/* needed for B0 steering support */
+				mc_list[5] = priv->port;
+				err = mlx4_multicast_attach(mdev->dev,
+							    &priv->rss_map.indir_qp,
+							    mc_list,
+							    priv->port, 0,
+							    MLX4_PROT_ETH,
+							    &mclist->reg_id);
+				if (err)
+					en_err(priv, "Fail to attach multicast address\n");
+
+				err = mlx4_en_tunnel_steer_add(priv, &mc_list[10], priv->base_qpn,
+							       &mclist->tunnel_reg_id);
+				if (err)
+					en_err(priv, "Failed to attach multicast address\n");
+			}
+		}
+	}
+}
+
+static void mlx4_en_do_uc_filter(struct mlx4_en_priv *priv,
+				 struct net_device *dev,
+				 struct mlx4_en_dev *mdev)
+{
+	struct netdev_hw_addr *ha;
+	struct mlx4_mac_entry *entry;
+	struct hlist_node *tmp;
+	bool found;
+	u64 mac;
+	int err = 0;
+	struct hlist_head *bucket;
+	unsigned int i;
+	int removed = 0;
+	u32 prev_flags;
+
+	/* Note that we do not need to protect our mac_hash traversal with rcu,
+	 * since all modification code is protected by mdev->state_lock
+	 */
+
+	/* find what to remove */
+	for (i = 0; i < MLX4_EN_MAC_HASH_SIZE; ++i) {
+		bucket = &priv->mac_hash[i];
+		hlist_for_each_entry_safe(entry, tmp, bucket, hlist) {
+			found = false;
+			netdev_for_each_uc_addr(ha, dev) {
+				if (ether_addr_equal_64bits(entry->mac,
+							    ha->addr)) {
+					found = true;
+					break;
+				}
+			}
+
+			/* MAC address of the port is not in uc list */
+			if (ether_addr_equal_64bits(entry->mac,
+						    priv->current_mac))
+				found = true;
+
+			if (!found) {
+				mac = mlx4_mac_to_u64(entry->mac);
+				mlx4_en_uc_steer_release(priv, entry->mac,
+							 priv->base_qpn,
+							 entry->reg_id);
+				mlx4_unregister_mac(mdev->dev, priv->port, mac);
+
+				hlist_del_rcu(&entry->hlist);
+				kfree_rcu(entry, rcu);
+				en_dbg(DRV, priv, "Removed MAC %pM on port:%d\n",
+				       entry->mac, priv->port);
+				++removed;
+			}
+		}
+	}
+
+	/* if we didn't remove anything, there is no use in trying to add
+	 * again once we are in a forced promisc mode state
+	 */
+	if ((priv->flags & MLX4_EN_FLAG_FORCE_PROMISC) && 0 == removed)
+		return;
+
+	prev_flags = priv->flags;
+	priv->flags &= ~MLX4_EN_FLAG_FORCE_PROMISC;
+
+	/* find what to add */
+	netdev_for_each_uc_addr(ha, dev) {
+		found = false;
+		bucket = &priv->mac_hash[ha->addr[MLX4_EN_MAC_HASH_IDX]];
+		hlist_for_each_entry(entry, bucket, hlist) {
+			if (ether_addr_equal_64bits(entry->mac, ha->addr)) {
+				found = true;
+				break;
+			}
+		}
+
+		if (!found) {
+			entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+			if (!entry) {
+				en_err(priv, "Failed adding MAC %pM on port:%d (out of memory)\n",
+				       ha->addr, priv->port);
+				priv->flags |= MLX4_EN_FLAG_FORCE_PROMISC;
+				break;
+			}
+			mac = mlx4_mac_to_u64(ha->addr);
+			memcpy(entry->mac, ha->addr, ETH_ALEN);
+			err = mlx4_register_mac(mdev->dev, priv->port, mac);
+			if (err < 0) {
+				en_err(priv, "Failed registering MAC %pM on port %d: %d\n",
+				       ha->addr, priv->port, err);
+				kfree(entry);
+				priv->flags |= MLX4_EN_FLAG_FORCE_PROMISC;
+				break;
+			}
+			err = mlx4_en_uc_steer_add(priv, ha->addr,
+						   &priv->base_qpn,
+						   &entry->reg_id);
+			if (err) {
+				en_err(priv, "Failed adding MAC %pM on port %d: %d\n",
+				       ha->addr, priv->port, err);
+				mlx4_unregister_mac(mdev->dev, priv->port, mac);
+				kfree(entry);
+				priv->flags |= MLX4_EN_FLAG_FORCE_PROMISC;
+				break;
+			} else {
+				unsigned int mac_hash;
+				en_dbg(DRV, priv, "Added MAC %pM on port:%d\n",
+				       ha->addr, priv->port);
+				mac_hash = ha->addr[MLX4_EN_MAC_HASH_IDX];
+				bucket = &priv->mac_hash[mac_hash];
+				hlist_add_head_rcu(&entry->hlist, bucket);
+			}
+		}
+	}
+
+	if (priv->flags & MLX4_EN_FLAG_FORCE_PROMISC) {
+		en_warn(priv, "Forcing promiscuous mode on port:%d\n",
+			priv->port);
+	} else if (prev_flags & MLX4_EN_FLAG_FORCE_PROMISC) {
+		en_warn(priv, "Stop forcing promiscuous mode on port:%d\n",
+			priv->port);
+	}
+}
+
+static void mlx4_en_do_set_rx_mode(struct work_struct *work)
+{
+	struct mlx4_en_priv *priv = container_of(work, struct mlx4_en_priv,
+						 rx_mode_task);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct net_device *dev = priv->dev;
+
+	mutex_lock(&mdev->state_lock);
+	if (!mdev->device_up) {
+		en_dbg(HW, priv, "Card is not up, ignoring rx mode change.\n");
+		goto out;
+	}
+	if (!priv->port_up) {
+		en_dbg(HW, priv, "Port is down, ignoring rx mode change.\n");
+		goto out;
+	}
+
+	if (!netif_carrier_ok(dev)) {
+		if (!mlx4_en_QUERY_PORT(mdev, priv->port)) {
+			if (priv->port_state.link_state) {
+				priv->last_link_state = MLX4_DEV_EVENT_PORT_UP;
+				netif_carrier_on(dev);
+				en_dbg(LINK, priv, "Link Up\n");
+			}
+		}
+	}
+
+	if (dev->priv_flags & IFF_UNICAST_FLT)
+		mlx4_en_do_uc_filter(priv, dev, mdev);
+
+	/* Promsicuous mode: disable all filters */
+	if ((dev->flags & IFF_PROMISC) ||
+	    (priv->flags & MLX4_EN_FLAG_FORCE_PROMISC)) {
+		mlx4_en_set_promisc_mode(priv, mdev);
+		goto out;
+	}
+
+	/* Not in promiscuous mode */
+	if (priv->flags & MLX4_EN_FLAG_PROMISC)
+		mlx4_en_clear_promisc_mode(priv, mdev);
+
+	mlx4_en_do_multicast(priv, dev, mdev);
+out:
+	mutex_unlock(&mdev->state_lock);
+}
+
+#ifdef CONFIG_NET_POLL_CONTROLLER
+static void mlx4_en_netpoll(struct net_device *dev)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_cq *cq;
+	int i;
+
+	for (i = 0; i < priv->rx_ring_num; i++) {
+		cq = priv->rx_cq[i];
+		napi_schedule(&cq->napi);
+	}
+}
+#endif
+
+static void mlx4_en_tx_timeout(struct net_device *dev)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	int i;
+
+	if (netif_msg_timer(priv))
+		en_warn(priv, "Tx timeout called on port:%d\n", priv->port);
+
+	for (i = 0; i < priv->tx_ring_num; i++) {
+		if (!netif_tx_queue_stopped(netdev_get_tx_queue(dev, i)))
+			continue;
+		en_warn(priv, "TX timeout on queue: %d, QP: 0x%x, CQ: 0x%x, Cons: 0x%x, Prod: 0x%x\n",
+			i, priv->tx_ring[i]->qpn, priv->tx_ring[i]->cqn,
+			priv->tx_ring[i]->cons, priv->tx_ring[i]->prod);
+	}
+
+	priv->port_stats.tx_timeout++;
+	en_dbg(DRV, priv, "Scheduling watchdog\n");
+	queue_work(mdev->workqueue, &priv->watchdog_task);
+}
+
+
+static struct net_device_stats *mlx4_en_get_stats(struct net_device *dev)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+
+	spin_lock_bh(&priv->stats_lock);
+	memcpy(&priv->ret_stats, &priv->stats, sizeof(priv->stats));
+	spin_unlock_bh(&priv->stats_lock);
+
+	return &priv->ret_stats;
+}
+
+static void mlx4_en_set_default_moderation(struct mlx4_en_priv *priv)
+{
+	struct mlx4_en_cq *cq;
+	int i;
+
+	/* If we haven't received a specific coalescing setting
+	 * (module param), we set the moderation parameters as follows:
+	 * - moder_cnt is set to the number of mtu sized packets to
+	 *   satisfy our coalescing target.
+	 * - moder_time is set to a fixed value.
+	 */
+	priv->rx_frames = MLX4_EN_RX_COAL_TARGET;
+	priv->rx_usecs = MLX4_EN_RX_COAL_TIME;
+	priv->tx_frames = MLX4_EN_TX_COAL_PKTS;
+	priv->tx_usecs = MLX4_EN_TX_COAL_TIME;
+	en_dbg(INTR, priv, "Default coalesing params for mtu:%d - rx_frames:%d rx_usecs:%d\n",
+	       priv->dev->mtu, priv->rx_frames, priv->rx_usecs);
+
+	/* Setup cq moderation params */
+	for (i = 0; i < priv->rx_ring_num; i++) {
+		cq = priv->rx_cq[i];
+		cq->moder_cnt = priv->rx_frames;
+		cq->moder_time = priv->rx_usecs;
+		priv->last_moder_time[i] = MLX4_EN_AUTO_CONF;
+		priv->last_moder_packets[i] = 0;
+		priv->last_moder_bytes[i] = 0;
+	}
+
+	for (i = 0; i < priv->tx_ring_num; i++) {
+		cq = priv->tx_cq[i];
+		cq->moder_cnt = priv->tx_frames;
+		cq->moder_time = priv->tx_usecs;
+	}
+
+	/* Reset auto-moderation params */
+	priv->pkt_rate_low = MLX4_EN_RX_RATE_LOW;
+	priv->rx_usecs_low = MLX4_EN_RX_COAL_TIME_LOW;
+	priv->pkt_rate_high = MLX4_EN_RX_RATE_HIGH;
+	priv->rx_usecs_high = MLX4_EN_RX_COAL_TIME_HIGH;
+	priv->sample_interval = MLX4_EN_SAMPLE_INTERVAL;
+	priv->adaptive_rx_coal = 1;
+	priv->last_moder_jiffies = 0;
+	priv->last_moder_tx_packets = 0;
+}
+
+static void mlx4_en_auto_moderation(struct mlx4_en_priv *priv)
+{
+	unsigned long period = (unsigned long) (jiffies - priv->last_moder_jiffies);
+	struct mlx4_en_cq *cq;
+	unsigned long packets;
+	unsigned long rate;
+	unsigned long avg_pkt_size;
+	unsigned long rx_packets;
+	unsigned long rx_bytes;
+	unsigned long rx_pkt_diff;
+	int moder_time;
+	int ring, err;
+
+	if (!priv->adaptive_rx_coal || period < priv->sample_interval * HZ)
+		return;
+
+	for (ring = 0; ring < priv->rx_ring_num; ring++) {
+		spin_lock_bh(&priv->stats_lock);
+		rx_packets = priv->rx_ring[ring]->packets;
+		rx_bytes = priv->rx_ring[ring]->bytes;
+		spin_unlock_bh(&priv->stats_lock);
+
+		rx_pkt_diff = ((unsigned long) (rx_packets -
+				priv->last_moder_packets[ring]));
+		packets = rx_pkt_diff;
+		rate = packets * HZ / period;
+		avg_pkt_size = packets ? ((unsigned long) (rx_bytes -
+				priv->last_moder_bytes[ring])) / packets : 0;
+
+		/* Apply auto-moderation only when packet rate
+		 * exceeds a rate that it matters */
+		if (rate > (MLX4_EN_RX_RATE_THRESH / priv->rx_ring_num) &&
+		    avg_pkt_size > MLX4_EN_AVG_PKT_SMALL) {
+			if (rate < priv->pkt_rate_low)
+				moder_time = priv->rx_usecs_low;
+			else if (rate > priv->pkt_rate_high)
+				moder_time = priv->rx_usecs_high;
+			else
+				moder_time = (rate - priv->pkt_rate_low) *
+					(priv->rx_usecs_high - priv->rx_usecs_low) /
+					(priv->pkt_rate_high - priv->pkt_rate_low) +
+					priv->rx_usecs_low;
+		} else {
+			moder_time = priv->rx_usecs_low;
+		}
+
+		if (moder_time != priv->last_moder_time[ring]) {
+			priv->last_moder_time[ring] = moder_time;
+			cq = priv->rx_cq[ring];
+			cq->moder_time = moder_time;
+			cq->moder_cnt = priv->rx_frames;
+			err = mlx4_en_set_cq_moder(priv, cq);
+			if (err)
+				en_err(priv, "Failed modifying moderation for cq:%d\n",
+				       ring);
+		}
+		priv->last_moder_packets[ring] = rx_packets;
+		priv->last_moder_bytes[ring] = rx_bytes;
+	}
+
+	priv->last_moder_jiffies = jiffies;
+}
+
+static void mlx4_en_do_get_stats(struct work_struct *work)
+{
+	struct delayed_work *delay = to_delayed_work(work);
+	struct mlx4_en_priv *priv = container_of(delay, struct mlx4_en_priv,
+						 stats_task);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	int err;
+
+	mutex_lock(&mdev->state_lock);
+	if (mdev->device_up) {
+		if (priv->port_up) {
+			err = mlx4_en_DUMP_ETH_STATS(mdev, priv->port, 0);
+			if (err)
+				en_dbg(HW, priv, "Could not update stats\n");
+
+			mlx4_en_auto_moderation(priv);
+		}
+
+		queue_delayed_work(mdev->workqueue, &priv->stats_task, STATS_DELAY);
+	}
+	if (mdev->mac_removed[MLX4_MAX_PORTS + 1 - priv->port]) {
+		mlx4_en_do_set_mac(priv, priv->current_mac);
+		mdev->mac_removed[MLX4_MAX_PORTS + 1 - priv->port] = 0;
+	}
+	mutex_unlock(&mdev->state_lock);
+}
+
+/* mlx4_en_service_task - Run service task for tasks that needed to be done
+ * periodically
+ */
+static void mlx4_en_service_task(struct work_struct *work)
+{
+	struct delayed_work *delay = to_delayed_work(work);
+	struct mlx4_en_priv *priv = container_of(delay, struct mlx4_en_priv,
+						 service_task);
+	struct mlx4_en_dev *mdev = priv->mdev;
+
+	mutex_lock(&mdev->state_lock);
+	if (mdev->device_up) {
+		if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_TS)
+			mlx4_en_ptp_overflow_check(mdev);
+
+		queue_delayed_work(mdev->workqueue, &priv->service_task,
+				   SERVICE_TASK_DELAY);
+	}
+	mutex_unlock(&mdev->state_lock);
+}
+
+static void mlx4_en_linkstate(struct work_struct *work)
+{
+	struct mlx4_en_priv *priv = container_of(work, struct mlx4_en_priv,
+						 linkstate_task);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	int linkstate = priv->link_state;
+
+	mutex_lock(&mdev->state_lock);
+	/* If observable port state changed set carrier state and
+	 * report to system log */
+	if (priv->last_link_state != linkstate) {
+		if (linkstate == MLX4_DEV_EVENT_PORT_DOWN) {
+			en_info(priv, "Link Down\n");
+			netif_carrier_off(priv->dev);
+		} else {
+			en_info(priv, "Link Up\n");
+			netif_carrier_on(priv->dev);
+		}
+	}
+	priv->last_link_state = linkstate;
+	mutex_unlock(&mdev->state_lock);
+}
+
+static int mlx4_en_init_affinity_hint(struct mlx4_en_priv *priv, int ring_idx)
+{
+	struct mlx4_en_rx_ring *ring = priv->rx_ring[ring_idx];
+	int numa_node = priv->mdev->dev->numa_node;
+	int ret = 0;
+
+	if (!zalloc_cpumask_var(&ring->affinity_mask, GFP_KERNEL))
+		return -ENOMEM;
+
+	ret = cpumask_set_cpu_local_first(ring_idx, numa_node,
+					  ring->affinity_mask);
+	if (ret)
+		free_cpumask_var(ring->affinity_mask);
+
+	return ret;
+}
+
+static void mlx4_en_free_affinity_hint(struct mlx4_en_priv *priv, int ring_idx)
+{
+	free_cpumask_var(priv->rx_ring[ring_idx]->affinity_mask);
+}
+
+int mlx4_en_start_port(struct net_device *dev)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_en_cq *cq;
+	struct mlx4_en_tx_ring *tx_ring;
+	int rx_index = 0;
+	int tx_index = 0;
+	int err = 0;
+	int i;
+	int j;
+	u8 mc_list[16] = {0};
+
+	if (priv->port_up) {
+		en_dbg(DRV, priv, "start port called while port already up\n");
+		return 0;
+	}
+
+	INIT_LIST_HEAD(&priv->mc_list);
+	INIT_LIST_HEAD(&priv->curr_list);
+	INIT_LIST_HEAD(&priv->ethtool_list);
+	memset(&priv->ethtool_rules[0], 0,
+	       sizeof(struct ethtool_flow_id) * MAX_NUM_OF_FS_RULES);
+
+	/* Calculate Rx buf size */
+	dev->mtu = min(dev->mtu, priv->max_mtu);
+	mlx4_en_calc_rx_buf(dev);
+	en_dbg(DRV, priv, "Rx buf size:%d\n", priv->rx_skb_size);
+
+	/* Configure rx cq's and rings */
+	err = mlx4_en_activate_rx_rings(priv);
+	if (err) {
+		en_err(priv, "Failed to activate RX rings\n");
+		return err;
+	}
+	for (i = 0; i < priv->rx_ring_num; i++) {
+		cq = priv->rx_cq[i];
+
+		mlx4_en_cq_init_lock(cq);
+
+		err = mlx4_en_init_affinity_hint(priv, i);
+		if (err) {
+			en_err(priv, "Failed preparing IRQ affinity hint\n");
+			goto cq_err;
+		}
+
+		err = mlx4_en_activate_cq(priv, cq, i);
+		if (err) {
+			en_err(priv, "Failed activating Rx CQ\n");
+			mlx4_en_free_affinity_hint(priv, i);
+			goto cq_err;
+		}
+		for (j = 0; j < cq->size; j++)
+			cq->buf[j].owner_sr_opcode = MLX4_CQE_OWNER_MASK;
+		err = mlx4_en_set_cq_moder(priv, cq);
+		if (err) {
+			en_err(priv, "Failed setting cq moderation parameters\n");
+			mlx4_en_deactivate_cq(priv, cq);
+			mlx4_en_free_affinity_hint(priv, i);
+			goto cq_err;
+		}
+		mlx4_en_arm_cq(priv, cq);
+		priv->rx_ring[i]->cqn = cq->mcq.cqn;
+		++rx_index;
+	}
+
+	/* Set qp number */
+	en_dbg(DRV, priv, "Getting qp number for port %d\n", priv->port);
+	err = mlx4_en_get_qp(priv);
+	if (err) {
+		en_err(priv, "Failed getting eth qp\n");
+		goto cq_err;
+	}
+	mdev->mac_removed[priv->port] = 0;
+
+	err = mlx4_en_config_rss_steer(priv);
+	if (err) {
+		en_err(priv, "Failed configuring rss steering\n");
+		goto mac_err;
+	}
+
+	err = mlx4_en_create_drop_qp(priv);
+	if (err)
+		goto rss_err;
+
+	/* Configure tx cq's and rings */
+	for (i = 0; i < priv->tx_ring_num; i++) {
+		/* Configure cq */
+		cq = priv->tx_cq[i];
+		err = mlx4_en_activate_cq(priv, cq, i);
+		if (err) {
+			en_err(priv, "Failed allocating Tx CQ\n");
+			goto tx_err;
+		}
+		err = mlx4_en_set_cq_moder(priv, cq);
+		if (err) {
+			en_err(priv, "Failed setting cq moderation parameters\n");
+			mlx4_en_deactivate_cq(priv, cq);
+			goto tx_err;
+		}
+		en_dbg(DRV, priv, "Resetting index of collapsed CQ:%d to -1\n", i);
+		cq->buf->wqe_index = cpu_to_be16(0xffff);
+
+		/* Configure ring */
+		tx_ring = priv->tx_ring[i];
+		err = mlx4_en_activate_tx_ring(priv, tx_ring, cq->mcq.cqn,
+			i / priv->num_tx_rings_p_up);
+		if (err) {
+			en_err(priv, "Failed allocating Tx ring\n");
+			mlx4_en_deactivate_cq(priv, cq);
+			goto tx_err;
+		}
+		tx_ring->tx_queue = netdev_get_tx_queue(dev, i);
+
+		/* Arm CQ for TX completions */
+		mlx4_en_arm_cq(priv, cq);
+
+		/* Set initial ownership of all Tx TXBBs to SW (1) */
+		for (j = 0; j < tx_ring->buf_size; j += STAMP_STRIDE)
+			*((u32 *) (tx_ring->buf + j)) = 0xffffffff;
+		++tx_index;
+	}
+
+	/* Configure port */
+	err = mlx4_SET_PORT_general(mdev->dev, priv->port,
+				    priv->rx_skb_size + ETH_FCS_LEN,
+				    priv->prof->tx_pause,
+				    priv->prof->tx_ppp,
+				    priv->prof->rx_pause,
+				    priv->prof->rx_ppp);
+	if (err) {
+		en_err(priv, "Failed setting port general configurations for port %d, with error %d\n",
+		       priv->port, err);
+		goto tx_err;
+	}
+	/* Set default qp number */
+	err = mlx4_SET_PORT_qpn_calc(mdev->dev, priv->port, priv->base_qpn, 0);
+	if (err) {
+		en_err(priv, "Failed setting default qp numbers\n");
+		goto tx_err;
+	}
+
+	if (mdev->dev->caps.tunnel_offload_mode == MLX4_TUNNEL_OFFLOAD_MODE_VXLAN) {
+		err = mlx4_SET_PORT_VXLAN(mdev->dev, priv->port, VXLAN_STEER_BY_OUTER_MAC, 1);
+		if (err) {
+			en_err(priv, "Failed setting port L2 tunnel configuration, err %d\n",
+			       err);
+			goto tx_err;
+		}
+	}
+
+	/* Init port */
+	en_dbg(HW, priv, "Initializing port\n");
+	err = mlx4_INIT_PORT(mdev->dev, priv->port);
+	if (err) {
+		en_err(priv, "Failed Initializing port\n");
+		goto tx_err;
+	}
+
+	/* Attach rx QP to bradcast address */
+	memset(&mc_list[10], 0xff, ETH_ALEN);
+	mc_list[5] = priv->port; /* needed for B0 steering support */
+	if (mlx4_multicast_attach(mdev->dev, &priv->rss_map.indir_qp, mc_list,
+				  priv->port, 0, MLX4_PROT_ETH,
+				  &priv->broadcast_id))
+		mlx4_warn(mdev, "Failed Attaching Broadcast\n");
+
+	/* Must redo promiscuous mode setup. */
+	priv->flags &= ~(MLX4_EN_FLAG_PROMISC | MLX4_EN_FLAG_MC_PROMISC);
+
+	/* Schedule multicast task to populate multicast list */
+	queue_work(mdev->workqueue, &priv->rx_mode_task);
+
+	mlx4_set_stats_bitmap(mdev->dev, &priv->stats_bitmap);
+
+#ifdef CONFIG_MLX4_EN_VXLAN
+	if (priv->mdev->dev->caps.tunnel_offload_mode == MLX4_TUNNEL_OFFLOAD_MODE_VXLAN)
+		vxlan_get_rx_port(dev);
+#endif
+	priv->port_up = true;
+	netif_tx_start_all_queues(dev);
+	netif_device_attach(dev);
+
+	return 0;
+
+tx_err:
+	while (tx_index--) {
+		mlx4_en_deactivate_tx_ring(priv, priv->tx_ring[tx_index]);
+		mlx4_en_deactivate_cq(priv, priv->tx_cq[tx_index]);
+	}
+	mlx4_en_destroy_drop_qp(priv);
+rss_err:
+	mlx4_en_release_rss_steer(priv);
+mac_err:
+	mlx4_en_put_qp(priv);
+cq_err:
+	while (rx_index--) {
+		mlx4_en_deactivate_cq(priv, priv->rx_cq[rx_index]);
+		mlx4_en_free_affinity_hint(priv, i);
+	}
+	for (i = 0; i < priv->rx_ring_num; i++)
+		mlx4_en_deactivate_rx_ring(priv, priv->rx_ring[i]);
+
+	return err; /* need to close devices */
+}
+
+
+void mlx4_en_stop_port(struct net_device *dev, int detach)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_en_mc_list *mclist, *tmp;
+	struct ethtool_flow_id *flow, *tmp_flow;
+	int i;
+	u8 mc_list[16] = {0};
+
+	if (!priv->port_up) {
+		en_dbg(DRV, priv, "stop port called while port already down\n");
+		return;
+	}
+
+	/* close port*/
+	mlx4_CLOSE_PORT(mdev->dev, priv->port);
+
+	/* Synchronize with tx routine */
+	netif_tx_lock_bh(dev);
+	if (detach)
+		netif_device_detach(dev);
+	netif_tx_stop_all_queues(dev);
+	netif_tx_unlock_bh(dev);
+
+	netif_tx_disable(dev);
+
+	/* Set port as not active */
+	priv->port_up = false;
+
+	/* Promsicuous mode */
+	if (mdev->dev->caps.steering_mode ==
+	    MLX4_STEERING_MODE_DEVICE_MANAGED) {
+		priv->flags &= ~(MLX4_EN_FLAG_PROMISC |
+				 MLX4_EN_FLAG_MC_PROMISC);
+		mlx4_flow_steer_promisc_remove(mdev->dev,
+					       priv->port,
+					       MLX4_FS_ALL_DEFAULT);
+		mlx4_flow_steer_promisc_remove(mdev->dev,
+					       priv->port,
+					       MLX4_FS_MC_DEFAULT);
+	} else if (priv->flags & MLX4_EN_FLAG_PROMISC) {
+		priv->flags &= ~MLX4_EN_FLAG_PROMISC;
+
+		/* Disable promiscouos mode */
+		mlx4_unicast_promisc_remove(mdev->dev, priv->base_qpn,
+					    priv->port);
+
+		/* Disable Multicast promisc */
+		if (priv->flags & MLX4_EN_FLAG_MC_PROMISC) {
+			mlx4_multicast_promisc_remove(mdev->dev, priv->base_qpn,
+						      priv->port);
+			priv->flags &= ~MLX4_EN_FLAG_MC_PROMISC;
+		}
+	}
+
+	/* Detach All multicasts */
+	memset(&mc_list[10], 0xff, ETH_ALEN);
+	mc_list[5] = priv->port; /* needed for B0 steering support */
+	mlx4_multicast_detach(mdev->dev, &priv->rss_map.indir_qp, mc_list,
+			      MLX4_PROT_ETH, priv->broadcast_id);
+	list_for_each_entry(mclist, &priv->curr_list, list) {
+		memcpy(&mc_list[10], mclist->addr, ETH_ALEN);
+		mc_list[5] = priv->port;
+		mlx4_multicast_detach(mdev->dev, &priv->rss_map.indir_qp,
+				      mc_list, MLX4_PROT_ETH, mclist->reg_id);
+		if (mclist->tunnel_reg_id)
+			mlx4_flow_detach(mdev->dev, mclist->tunnel_reg_id);
+	}
+	mlx4_en_clear_list(dev);
+	list_for_each_entry_safe(mclist, tmp, &priv->curr_list, list) {
+		list_del(&mclist->list);
+		kfree(mclist);
+	}
+
+	/* Flush multicast filter */
+	mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, 0, 1, MLX4_MCAST_CONFIG);
+
+	/* Remove flow steering rules for the port*/
+	if (mdev->dev->caps.steering_mode ==
+	    MLX4_STEERING_MODE_DEVICE_MANAGED) {
+		ASSERT_RTNL();
+		list_for_each_entry_safe(flow, tmp_flow,
+					 &priv->ethtool_list, list) {
+			mlx4_flow_detach(mdev->dev, flow->id);
+			list_del(&flow->list);
+		}
+	}
+
+	mlx4_en_destroy_drop_qp(priv);
+
+	/* Free TX Rings */
+	for (i = 0; i < priv->tx_ring_num; i++) {
+		mlx4_en_deactivate_tx_ring(priv, priv->tx_ring[i]);
+		mlx4_en_deactivate_cq(priv, priv->tx_cq[i]);
+	}
+	msleep(10);
+
+	for (i = 0; i < priv->tx_ring_num; i++)
+		mlx4_en_free_tx_buf(dev, priv->tx_ring[i]);
+
+	/* Free RSS qps */
+	mlx4_en_release_rss_steer(priv);
+
+	/* Unregister Mac address for the port */
+	mlx4_en_put_qp(priv);
+	if (!(mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_REASSIGN_MAC_EN))
+		mdev->mac_removed[priv->port] = 1;
+
+	/* Free RX Rings */
+	for (i = 0; i < priv->rx_ring_num; i++) {
+		struct mlx4_en_cq *cq = priv->rx_cq[i];
+
+		local_bh_disable();
+		while (!mlx4_en_cq_lock_napi(cq)) {
+			pr_info("CQ %d locked\n", i);
+			mdelay(1);
+		}
+		local_bh_enable();
+
+		while (test_bit(NAPI_STATE_SCHED, &cq->napi.state))
+			msleep(1);
+		mlx4_en_deactivate_rx_ring(priv, priv->rx_ring[i]);
+		mlx4_en_deactivate_cq(priv, cq);
+
+		mlx4_en_free_affinity_hint(priv, i);
+	}
+}
+
+static void mlx4_en_restart(struct work_struct *work)
+{
+	struct mlx4_en_priv *priv = container_of(work, struct mlx4_en_priv,
+						 watchdog_task);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct net_device *dev = priv->dev;
+
+	en_dbg(DRV, priv, "Watchdog task called for port %d\n", priv->port);
+
+	mutex_lock(&mdev->state_lock);
+	if (priv->port_up) {
+		mlx4_en_stop_port(dev, 1);
+		if (mlx4_en_start_port(dev))
+			en_err(priv, "Failed restarting port %d\n", priv->port);
+	}
+	mutex_unlock(&mdev->state_lock);
+}
+
+static void mlx4_en_clear_stats(struct net_device *dev)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	int i;
+
+	if (mlx4_en_DUMP_ETH_STATS(mdev, priv->port, 1))
+		en_dbg(HW, priv, "Failed dumping statistics\n");
+
+	memset(&priv->stats, 0, sizeof(priv->stats));
+	memset(&priv->pstats, 0, sizeof(priv->pstats));
+	memset(&priv->pkstats, 0, sizeof(priv->pkstats));
+	memset(&priv->port_stats, 0, sizeof(priv->port_stats));
+
+	for (i = 0; i < priv->tx_ring_num; i++) {
+		priv->tx_ring[i]->bytes = 0;
+		priv->tx_ring[i]->packets = 0;
+		priv->tx_ring[i]->tx_csum = 0;
+	}
+	for (i = 0; i < priv->rx_ring_num; i++) {
+		priv->rx_ring[i]->bytes = 0;
+		priv->rx_ring[i]->packets = 0;
+		priv->rx_ring[i]->csum_ok = 0;
+		priv->rx_ring[i]->csum_none = 0;
+	}
+}
+
+static int mlx4_en_open(struct net_device *dev)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	int err = 0;
+
+	mutex_lock(&mdev->state_lock);
+
+	if (!mdev->device_up) {
+		en_err(priv, "Cannot open - device down/disabled\n");
+		err = -EBUSY;
+		goto out;
+	}
+
+	/* Reset HW statistics and SW counters */
+	mlx4_en_clear_stats(dev);
+
+	err = mlx4_en_start_port(dev);
+	if (err)
+		en_err(priv, "Failed starting port:%d\n", priv->port);
+
+out:
+	mutex_unlock(&mdev->state_lock);
+	return err;
+}
+
+
+static int mlx4_en_close(struct net_device *dev)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+
+	en_dbg(IFDOWN, priv, "Close port called\n");
+
+	mutex_lock(&mdev->state_lock);
+
+	mlx4_en_stop_port(dev, 0);
+	netif_carrier_off(dev);
+
+	mutex_unlock(&mdev->state_lock);
+	return 0;
+}
+
+void mlx4_en_free_resources(struct mlx4_en_priv *priv)
+{
+	int i;
+
+#ifdef CONFIG_RFS_ACCEL
+	free_irq_cpu_rmap(priv->dev->rx_cpu_rmap);
+	priv->dev->rx_cpu_rmap = NULL;
+#endif
+
+	for (i = 0; i < priv->tx_ring_num; i++) {
+		if (priv->tx_ring && priv->tx_ring[i])
+			mlx4_en_destroy_tx_ring(priv, &priv->tx_ring[i]);
+		if (priv->tx_cq && priv->tx_cq[i])
+			mlx4_en_destroy_cq(priv, &priv->tx_cq[i]);
+	}
+
+	for (i = 0; i < priv->rx_ring_num; i++) {
+		if (priv->rx_ring[i])
+			mlx4_en_destroy_rx_ring(priv, &priv->rx_ring[i],
+				priv->prof->rx_ring_size, priv->stride);
+		if (priv->rx_cq[i])
+			mlx4_en_destroy_cq(priv, &priv->rx_cq[i]);
+	}
+
+	if (priv->base_tx_qpn) {
+		mlx4_qp_release_range(priv->mdev->dev, priv->base_tx_qpn, priv->tx_ring_num);
+		priv->base_tx_qpn = 0;
+	}
+}
+
+int mlx4_en_alloc_resources(struct mlx4_en_priv *priv)
+{
+	struct mlx4_en_port_profile *prof = priv->prof;
+	int i;
+	int err;
+	int node;
+
+	err = mlx4_qp_reserve_range(priv->mdev->dev, priv->tx_ring_num, 256, &priv->base_tx_qpn);
+	if (err) {
+		en_err(priv, "failed reserving range for TX rings\n");
+		return err;
+	}
+
+	/* Create tx Rings */
+	for (i = 0; i < priv->tx_ring_num; i++) {
+		node = cpu_to_node(i % num_online_cpus());
+		if (mlx4_en_create_cq(priv, &priv->tx_cq[i],
+				      prof->tx_ring_size, i, TX, node))
+			goto err;
+
+		if (mlx4_en_create_tx_ring(priv, &priv->tx_ring[i],
+					   priv->base_tx_qpn + i,
+					   prof->tx_ring_size, TXBB_SIZE,
+					   node, i))
+			goto err;
+	}
+
+	/* Create rx Rings */
+	for (i = 0; i < priv->rx_ring_num; i++) {
+		node = cpu_to_node(i % num_online_cpus());
+		if (mlx4_en_create_cq(priv, &priv->rx_cq[i],
+				      prof->rx_ring_size, i, RX, node))
+			goto err;
+
+		if (mlx4_en_create_rx_ring(priv, &priv->rx_ring[i],
+					   prof->rx_ring_size, priv->stride,
+					   node))
+			goto err;
+	}
+
+#ifdef CONFIG_RFS_ACCEL
+	if (priv->mdev->dev->caps.comp_pool) {
+		priv->dev->rx_cpu_rmap = alloc_irq_cpu_rmap(priv->mdev->dev->caps.comp_pool);
+		if (!priv->dev->rx_cpu_rmap)
+			goto err;
+	}
+#endif
+
+	return 0;
+
+err:
+	en_err(priv, "Failed to allocate NIC resources\n");
+	for (i = 0; i < priv->rx_ring_num; i++) {
+		if (priv->rx_ring[i])
+			mlx4_en_destroy_rx_ring(priv, &priv->rx_ring[i],
+						prof->rx_ring_size,
+						priv->stride);
+		if (priv->rx_cq[i])
+			mlx4_en_destroy_cq(priv, &priv->rx_cq[i]);
+	}
+	for (i = 0; i < priv->tx_ring_num; i++) {
+		if (priv->tx_ring[i])
+			mlx4_en_destroy_tx_ring(priv, &priv->tx_ring[i]);
+		if (priv->tx_cq[i])
+			mlx4_en_destroy_cq(priv, &priv->tx_cq[i]);
+	}
+	return -ENOMEM;
+}
+
+
+void mlx4_en_destroy_netdev(struct net_device *dev)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+
+	en_dbg(DRV, priv, "Destroying netdev on port:%d\n", priv->port);
+
+	/* Unregister device - this will close the port if it was up */
+	if (priv->registered)
+		unregister_netdev(dev);
+
+	if (priv->allocated)
+		mlx4_free_hwq_res(mdev->dev, &priv->res, MLX4_EN_PAGE_SIZE);
+
+	cancel_delayed_work(&priv->stats_task);
+	cancel_delayed_work(&priv->service_task);
+	/* flush any pending task for this netdev */
+	flush_workqueue(mdev->workqueue);
+
+	/* Detach the netdev so tasks would not attempt to access it */
+	mutex_lock(&mdev->state_lock);
+	mdev->pndev[priv->port] = NULL;
+	mutex_unlock(&mdev->state_lock);
+
+	mlx4_en_free_resources(priv);
+
+	kfree(priv->tx_ring);
+	kfree(priv->tx_cq);
+
+	free_netdev(dev);
+}
+
+static int mlx4_en_change_mtu(struct net_device *dev, int new_mtu)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	int err = 0;
+
+	en_dbg(DRV, priv, "Change MTU called - current:%d new:%d\n",
+		 dev->mtu, new_mtu);
+
+	if ((new_mtu < MLX4_EN_MIN_MTU) || (new_mtu > priv->max_mtu)) {
+		en_err(priv, "Bad MTU size:%d.\n", new_mtu);
+		return -EPERM;
+	}
+	dev->mtu = new_mtu;
+
+	if (netif_running(dev)) {
+		mutex_lock(&mdev->state_lock);
+		if (!mdev->device_up) {
+			/* NIC is probably restarting - let watchdog task reset
+			 * the port */
+			en_dbg(DRV, priv, "Change MTU called with card down!?\n");
+		} else {
+			mlx4_en_stop_port(dev, 1);
+			err = mlx4_en_start_port(dev);
+			if (err) {
+				en_err(priv, "Failed restarting port:%d\n",
+					 priv->port);
+				queue_work(mdev->workqueue, &priv->watchdog_task);
+			}
+		}
+		mutex_unlock(&mdev->state_lock);
+	}
+	return 0;
+}
+
+static int mlx4_en_hwtstamp_set(struct net_device *dev, struct ifreq *ifr)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct hwtstamp_config config;
+
+	if (copy_from_user(&config, ifr->ifr_data, sizeof(config)))
+		return -EFAULT;
+
+	/* reserved for future extensions */
+	if (config.flags)
+		return -EINVAL;
+
+	/* device doesn't support time stamping */
+	if (!(mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_TS))
+		return -EINVAL;
+
+	/* TX HW timestamp */
+	switch (config.tx_type) {
+	case HWTSTAMP_TX_OFF:
+	case HWTSTAMP_TX_ON:
+		break;
+	default:
+		return -ERANGE;
+	}
+
+	/* RX HW timestamp */
+	switch (config.rx_filter) {
+	case HWTSTAMP_FILTER_NONE:
+		break;
+	case HWTSTAMP_FILTER_ALL:
+	case HWTSTAMP_FILTER_SOME:
+	case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
+	case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
+	case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
+	case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
+	case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
+	case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
+	case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
+	case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
+	case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
+	case HWTSTAMP_FILTER_PTP_V2_EVENT:
+	case HWTSTAMP_FILTER_PTP_V2_SYNC:
+	case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
+		config.rx_filter = HWTSTAMP_FILTER_ALL;
+		break;
+	default:
+		return -ERANGE;
+	}
+
+	if (mlx4_en_timestamp_config(dev, config.tx_type, config.rx_filter)) {
+		config.tx_type = HWTSTAMP_TX_OFF;
+		config.rx_filter = HWTSTAMP_FILTER_NONE;
+	}
+
+	return copy_to_user(ifr->ifr_data, &config,
+			    sizeof(config)) ? -EFAULT : 0;
+}
+
+static int mlx4_en_hwtstamp_get(struct net_device *dev, struct ifreq *ifr)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+
+	return copy_to_user(ifr->ifr_data, &priv->hwtstamp_config,
+			    sizeof(priv->hwtstamp_config)) ? -EFAULT : 0;
+}
+
+static int mlx4_en_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+{
+	switch (cmd) {
+	case SIOCSHWTSTAMP:
+		return mlx4_en_hwtstamp_set(dev, ifr);
+	case SIOCGHWTSTAMP:
+		return mlx4_en_hwtstamp_get(dev, ifr);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static int mlx4_en_set_features(struct net_device *netdev,
+		netdev_features_t features)
+{
+	struct mlx4_en_priv *priv = netdev_priv(netdev);
+
+	if (features & NETIF_F_LOOPBACK)
+		priv->ctrl_flags |= cpu_to_be32(MLX4_WQE_CTRL_FORCE_LOOPBACK);
+	else
+		priv->ctrl_flags &=
+			cpu_to_be32(~MLX4_WQE_CTRL_FORCE_LOOPBACK);
+
+	mlx4_en_update_loopback_state(netdev, features);
+
+	return 0;
+
+}
+
+static int mlx4_en_set_vf_mac(struct net_device *dev, int queue, u8 *mac)
+{
+	struct mlx4_en_priv *en_priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = en_priv->mdev;
+	u64 mac_u64 = mlx4_mac_to_u64(mac);
+
+	if (!is_valid_ether_addr(mac))
+		return -EINVAL;
+
+	return mlx4_set_vf_mac(mdev->dev, en_priv->port, queue, mac_u64);
+}
+
+static int mlx4_en_set_vf_vlan(struct net_device *dev, int vf, u16 vlan, u8 qos)
+{
+	struct mlx4_en_priv *en_priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = en_priv->mdev;
+
+	return mlx4_set_vf_vlan(mdev->dev, en_priv->port, vf, vlan, qos);
+}
+
+static int mlx4_en_set_vf_spoofchk(struct net_device *dev, int vf, bool setting)
+{
+	struct mlx4_en_priv *en_priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = en_priv->mdev;
+
+	return mlx4_set_vf_spoofchk(mdev->dev, en_priv->port, vf, setting);
+}
+
+static int mlx4_en_get_vf_config(struct net_device *dev, int vf, struct ifla_vf_info *ivf)
+{
+	struct mlx4_en_priv *en_priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = en_priv->mdev;
+
+	return mlx4_get_vf_config(mdev->dev, en_priv->port, vf, ivf);
+}
+
+static int mlx4_en_set_vf_link_state(struct net_device *dev, int vf, int link_state)
+{
+	struct mlx4_en_priv *en_priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = en_priv->mdev;
+
+	return mlx4_set_vf_link_state(mdev->dev, en_priv->port, vf, link_state);
+}
+
+#define PORT_ID_BYTE_LEN 8
+static int mlx4_en_get_phys_port_id(struct net_device *dev,
+				    struct netdev_phys_port_id *ppid)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_dev *mdev = priv->mdev->dev;
+	int i;
+	u64 phys_port_id = mdev->caps.phys_port_id[priv->port];
+
+	if (!phys_port_id)
+		return -EOPNOTSUPP;
+
+	ppid->id_len = sizeof(phys_port_id);
+	for (i = PORT_ID_BYTE_LEN - 1; i >= 0; --i) {
+		ppid->id[i] =  phys_port_id & 0xff;
+		phys_port_id >>= 8;
+	}
+	return 0;
+}
+
+#ifdef CONFIG_MLX4_EN_VXLAN
+static void mlx4_en_add_vxlan_offloads(struct work_struct *work)
+{
+	int ret;
+	struct mlx4_en_priv *priv = container_of(work, struct mlx4_en_priv,
+						 vxlan_add_task);
+
+	ret = mlx4_config_vxlan_port(priv->mdev->dev, priv->vxlan_port);
+	if (ret)
+		goto out;
+
+	ret = mlx4_SET_PORT_VXLAN(priv->mdev->dev, priv->port,
+				  VXLAN_STEER_BY_OUTER_MAC, 1);
+out:
+	if (ret) {
+		en_err(priv, "failed setting L2 tunnel configuration ret %d\n", ret);
+		return;
+	}
+
+	/* set offloads */
+	priv->dev->hw_enc_features |= NETIF_F_IP_CSUM | NETIF_F_RXCSUM |
+				      NETIF_F_TSO | NETIF_F_GSO_UDP_TUNNEL;
+	priv->dev->hw_features |= NETIF_F_GSO_UDP_TUNNEL;
+	priv->dev->features    |= NETIF_F_GSO_UDP_TUNNEL;
+}
+
+static void mlx4_en_del_vxlan_offloads(struct work_struct *work)
+{
+	int ret;
+	struct mlx4_en_priv *priv = container_of(work, struct mlx4_en_priv,
+						 vxlan_del_task);
+	/* unset offloads */
+	priv->dev->hw_enc_features &= ~(NETIF_F_IP_CSUM | NETIF_F_RXCSUM |
+				      NETIF_F_TSO | NETIF_F_GSO_UDP_TUNNEL);
+	priv->dev->hw_features &= ~NETIF_F_GSO_UDP_TUNNEL;
+	priv->dev->features    &= ~NETIF_F_GSO_UDP_TUNNEL;
+
+	ret = mlx4_SET_PORT_VXLAN(priv->mdev->dev, priv->port,
+				  VXLAN_STEER_BY_OUTER_MAC, 0);
+	if (ret)
+		en_err(priv, "failed setting L2 tunnel configuration ret %d\n", ret);
+
+	priv->vxlan_port = 0;
+}
+
+static void mlx4_en_add_vxlan_port(struct  net_device *dev,
+				   sa_family_t sa_family, __be16 port)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	__be16 current_port;
+
+	if (priv->mdev->dev->caps.tunnel_offload_mode != MLX4_TUNNEL_OFFLOAD_MODE_VXLAN)
+		return;
+
+	if (sa_family == AF_INET6)
+		return;
+
+	current_port = priv->vxlan_port;
+	if (current_port && current_port != port) {
+		en_warn(priv, "vxlan port %d configured, can't add port %d\n",
+			ntohs(current_port), ntohs(port));
+		return;
+	}
+
+	priv->vxlan_port = port;
+	queue_work(priv->mdev->workqueue, &priv->vxlan_add_task);
+}
+
+static void mlx4_en_del_vxlan_port(struct  net_device *dev,
+				   sa_family_t sa_family, __be16 port)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	__be16 current_port;
+
+	if (priv->mdev->dev->caps.tunnel_offload_mode != MLX4_TUNNEL_OFFLOAD_MODE_VXLAN)
+		return;
+
+	if (sa_family == AF_INET6)
+		return;
+
+	current_port = priv->vxlan_port;
+	if (current_port != port) {
+		en_dbg(DRV, priv, "vxlan port %d isn't configured, ignoring\n", ntohs(port));
+		return;
+	}
+
+	queue_work(priv->mdev->workqueue, &priv->vxlan_del_task);
+}
+
+static bool mlx4_en_gso_check(struct sk_buff *skb, struct net_device *dev)
+{
+	return vxlan_gso_check(skb);
+}
+#endif
+
+static const struct net_device_ops mlx4_netdev_ops = {
+	.ndo_open		= mlx4_en_open,
+	.ndo_stop		= mlx4_en_close,
+	.ndo_start_xmit		= mlx4_en_xmit,
+	.ndo_select_queue	= mlx4_en_select_queue,
+	.ndo_get_stats		= mlx4_en_get_stats,
+	.ndo_set_rx_mode	= mlx4_en_set_rx_mode,
+	.ndo_set_mac_address	= mlx4_en_set_mac,
+	.ndo_validate_addr	= eth_validate_addr,
+	.ndo_change_mtu		= mlx4_en_change_mtu,
+	.ndo_do_ioctl		= mlx4_en_ioctl,
+	.ndo_tx_timeout		= mlx4_en_tx_timeout,
+	.ndo_vlan_rx_add_vid	= mlx4_en_vlan_rx_add_vid,
+	.ndo_vlan_rx_kill_vid	= mlx4_en_vlan_rx_kill_vid,
+#ifdef CONFIG_NET_POLL_CONTROLLER
+	.ndo_poll_controller	= mlx4_en_netpoll,
+#endif
+	.ndo_set_features	= mlx4_en_set_features,
+	.ndo_setup_tc		= mlx4_en_setup_tc,
+#ifdef CONFIG_RFS_ACCEL
+	.ndo_rx_flow_steer	= mlx4_en_filter_rfs,
+#endif
+#ifdef CONFIG_NET_RX_BUSY_POLL
+	.ndo_busy_poll		= mlx4_en_low_latency_recv,
+#endif
+	.ndo_get_phys_port_id	= mlx4_en_get_phys_port_id,
+#ifdef CONFIG_MLX4_EN_VXLAN
+	.ndo_add_vxlan_port	= mlx4_en_add_vxlan_port,
+	.ndo_del_vxlan_port	= mlx4_en_del_vxlan_port,
+	.ndo_gso_check		= mlx4_en_gso_check,
+#endif
+};
+
+static const struct net_device_ops mlx4_netdev_ops_master = {
+	.ndo_open		= mlx4_en_open,
+	.ndo_stop		= mlx4_en_close,
+	.ndo_start_xmit		= mlx4_en_xmit,
+	.ndo_select_queue	= mlx4_en_select_queue,
+	.ndo_get_stats		= mlx4_en_get_stats,
+	.ndo_set_rx_mode	= mlx4_en_set_rx_mode,
+	.ndo_set_mac_address	= mlx4_en_set_mac,
+	.ndo_validate_addr	= eth_validate_addr,
+	.ndo_change_mtu		= mlx4_en_change_mtu,
+	.ndo_tx_timeout		= mlx4_en_tx_timeout,
+	.ndo_vlan_rx_add_vid	= mlx4_en_vlan_rx_add_vid,
+	.ndo_vlan_rx_kill_vid	= mlx4_en_vlan_rx_kill_vid,
+	.ndo_set_vf_mac		= mlx4_en_set_vf_mac,
+	.ndo_set_vf_vlan	= mlx4_en_set_vf_vlan,
+	.ndo_set_vf_spoofchk	= mlx4_en_set_vf_spoofchk,
+	.ndo_set_vf_link_state	= mlx4_en_set_vf_link_state,
+	.ndo_get_vf_config	= mlx4_en_get_vf_config,
+#ifdef CONFIG_NET_POLL_CONTROLLER
+	.ndo_poll_controller	= mlx4_en_netpoll,
+#endif
+	.ndo_set_features	= mlx4_en_set_features,
+	.ndo_setup_tc		= mlx4_en_setup_tc,
+#ifdef CONFIG_RFS_ACCEL
+	.ndo_rx_flow_steer	= mlx4_en_filter_rfs,
+#endif
+	.ndo_get_phys_port_id	= mlx4_en_get_phys_port_id,
+#ifdef CONFIG_MLX4_EN_VXLAN
+	.ndo_add_vxlan_port	= mlx4_en_add_vxlan_port,
+	.ndo_del_vxlan_port	= mlx4_en_del_vxlan_port,
+	.ndo_gso_check		= mlx4_en_gso_check,
+#endif
+};
+
+int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port,
+			struct mlx4_en_port_profile *prof)
+{
+	struct net_device *dev;
+	struct mlx4_en_priv *priv;
+	int i;
+	int err;
+	u64 mac_u64;
+
+	dev = alloc_etherdev_mqs(sizeof(struct mlx4_en_priv),
+				 MAX_TX_RINGS, MAX_RX_RINGS);
+	if (dev == NULL)
+		return -ENOMEM;
+
+	netif_set_real_num_tx_queues(dev, prof->tx_ring_num);
+	netif_set_real_num_rx_queues(dev, prof->rx_ring_num);
+
+	SET_NETDEV_DEV(dev, &mdev->dev->pdev->dev);
+	dev->dev_port = port - 1;
+
+	/*
+	 * Initialize driver private data
+	 */
+
+	priv = netdev_priv(dev);
+	memset(priv, 0, sizeof(struct mlx4_en_priv));
+	priv->dev = dev;
+	priv->mdev = mdev;
+	priv->ddev = &mdev->pdev->dev;
+	priv->prof = prof;
+	priv->port = port;
+	priv->port_up = false;
+	priv->flags = prof->flags;
+	priv->pflags = MLX4_EN_PRIV_FLAGS_BLUEFLAME;
+	priv->ctrl_flags = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE |
+			MLX4_WQE_CTRL_SOLICITED);
+	priv->num_tx_rings_p_up = mdev->profile.num_tx_rings_p_up;
+	priv->tx_ring_num = prof->tx_ring_num;
+	priv->tx_work_limit = MLX4_EN_DEFAULT_TX_WORK;
+
+	priv->tx_ring = kzalloc(sizeof(struct mlx4_en_tx_ring *) * MAX_TX_RINGS,
+				GFP_KERNEL);
+	if (!priv->tx_ring) {
+		err = -ENOMEM;
+		goto out;
+	}
+	priv->tx_cq = kzalloc(sizeof(struct mlx4_en_cq *) * MAX_TX_RINGS,
+			      GFP_KERNEL);
+	if (!priv->tx_cq) {
+		err = -ENOMEM;
+		goto out;
+	}
+	priv->rx_ring_num = prof->rx_ring_num;
+	priv->cqe_factor = (mdev->dev->caps.cqe_size == 64) ? 1 : 0;
+	priv->cqe_size = mdev->dev->caps.cqe_size;
+	priv->mac_index = -1;
+	priv->msg_enable = MLX4_EN_MSG_LEVEL;
+	spin_lock_init(&priv->stats_lock);
+	INIT_WORK(&priv->rx_mode_task, mlx4_en_do_set_rx_mode);
+	INIT_WORK(&priv->watchdog_task, mlx4_en_restart);
+	INIT_WORK(&priv->linkstate_task, mlx4_en_linkstate);
+	INIT_DELAYED_WORK(&priv->stats_task, mlx4_en_do_get_stats);
+	INIT_DELAYED_WORK(&priv->service_task, mlx4_en_service_task);
+#ifdef CONFIG_MLX4_EN_VXLAN
+	INIT_WORK(&priv->vxlan_add_task, mlx4_en_add_vxlan_offloads);
+	INIT_WORK(&priv->vxlan_del_task, mlx4_en_del_vxlan_offloads);
+#endif
+#ifdef CONFIG_MLX4_EN_DCB
+	if (!mlx4_is_slave(priv->mdev->dev)) {
+		if (mdev->dev->caps.flags & MLX4_DEV_CAP_FLAG_SET_ETH_SCHED) {
+			dev->dcbnl_ops = &mlx4_en_dcbnl_ops;
+		} else {
+			en_info(priv, "enabling only PFC DCB ops\n");
+			dev->dcbnl_ops = &mlx4_en_dcbnl_pfc_ops;
+		}
+	}
+#endif
+
+	for (i = 0; i < MLX4_EN_MAC_HASH_SIZE; ++i)
+		INIT_HLIST_HEAD(&priv->mac_hash[i]);
+
+	/* Query for default mac and max mtu */
+	priv->max_mtu = mdev->dev->caps.eth_mtu_cap[priv->port];
+
+	/* Set default MAC */
+	dev->addr_len = ETH_ALEN;
+	mlx4_en_u64_to_mac(dev->dev_addr, mdev->dev->caps.def_mac[priv->port]);
+	if (!is_valid_ether_addr(dev->dev_addr)) {
+		if (mlx4_is_slave(priv->mdev->dev)) {
+			eth_hw_addr_random(dev);
+			en_warn(priv, "Assigned random MAC address %pM\n", dev->dev_addr);
+			mac_u64 = mlx4_mac_to_u64(dev->dev_addr);
+			mdev->dev->caps.def_mac[priv->port] = mac_u64;
+		} else {
+			en_err(priv, "Port: %d, invalid mac burned: %pM, quiting\n",
+			       priv->port, dev->dev_addr);
+			err = -EINVAL;
+			goto out;
+		}
+	}
+
+	memcpy(priv->current_mac, dev->dev_addr, sizeof(priv->current_mac));
+
+	priv->stride = roundup_pow_of_two(sizeof(struct mlx4_en_rx_desc) +
+					  DS_SIZE * MLX4_EN_MAX_RX_FRAGS);
+	err = mlx4_en_alloc_resources(priv);
+	if (err)
+		goto out;
+
+#ifdef CONFIG_RFS_ACCEL
+	INIT_LIST_HEAD(&priv->filters);
+	spin_lock_init(&priv->filters_lock);
+#endif
+
+	/* Initialize time stamping config */
+	priv->hwtstamp_config.flags = 0;
+	priv->hwtstamp_config.tx_type = HWTSTAMP_TX_OFF;
+	priv->hwtstamp_config.rx_filter = HWTSTAMP_FILTER_NONE;
+
+	/* Allocate page for receive rings */
+	err = mlx4_alloc_hwq_res(mdev->dev, &priv->res,
+				MLX4_EN_PAGE_SIZE, MLX4_EN_PAGE_SIZE);
+	if (err) {
+		en_err(priv, "Failed to allocate page for rx qps\n");
+		goto out;
+	}
+	priv->allocated = 1;
+
+	/*
+	 * Initialize netdev entry points
+	 */
+	if (mlx4_is_master(priv->mdev->dev))
+		dev->netdev_ops = &mlx4_netdev_ops_master;
+	else
+		dev->netdev_ops = &mlx4_netdev_ops;
+	dev->watchdog_timeo = MLX4_EN_WATCHDOG_TIMEOUT;
+	netif_set_real_num_tx_queues(dev, priv->tx_ring_num);
+	netif_set_real_num_rx_queues(dev, priv->rx_ring_num);
+
+	dev->ethtool_ops = &mlx4_en_ethtool_ops;
+
+	/*
+	 * Set driver features
+	 */
+	dev->hw_features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;
+	if (mdev->LSO_support)
+		dev->hw_features |= NETIF_F_TSO | NETIF_F_TSO6;
+
+	dev->vlan_features = dev->hw_features;
+
+	dev->hw_features |= NETIF_F_RXCSUM | NETIF_F_RXHASH;
+	dev->features = dev->hw_features | NETIF_F_HIGHDMA |
+			NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX |
+			NETIF_F_HW_VLAN_CTAG_FILTER;
+	dev->hw_features |= NETIF_F_LOOPBACK;
+
+	if (mdev->dev->caps.steering_mode ==
+	    MLX4_STEERING_MODE_DEVICE_MANAGED)
+		dev->hw_features |= NETIF_F_NTUPLE;
+
+	if (mdev->dev->caps.steering_mode != MLX4_STEERING_MODE_A0)
+		dev->priv_flags |= IFF_UNICAST_FLT;
+
+	mdev->pndev[port] = dev;
+
+	netif_carrier_off(dev);
+	mlx4_en_set_default_moderation(priv);
+
+	err = register_netdev(dev);
+	if (err) {
+		en_err(priv, "Netdev registration failed for port %d\n", port);
+		goto out;
+	}
+	priv->registered = 1;
+
+	en_warn(priv, "Using %d TX rings\n", prof->tx_ring_num);
+	en_warn(priv, "Using %d RX rings\n", prof->rx_ring_num);
+
+	mlx4_en_update_loopback_state(priv->dev, priv->dev->features);
+
+	/* Configure port */
+	mlx4_en_calc_rx_buf(dev);
+	err = mlx4_SET_PORT_general(mdev->dev, priv->port,
+				    priv->rx_skb_size + ETH_FCS_LEN,
+				    prof->tx_pause, prof->tx_ppp,
+				    prof->rx_pause, prof->rx_ppp);
+	if (err) {
+		en_err(priv, "Failed setting port general configurations for port %d, with error %d\n",
+		       priv->port, err);
+		goto out;
+	}
+
+	if (mdev->dev->caps.tunnel_offload_mode == MLX4_TUNNEL_OFFLOAD_MODE_VXLAN) {
+		err = mlx4_SET_PORT_VXLAN(mdev->dev, priv->port, VXLAN_STEER_BY_OUTER_MAC, 1);
+		if (err) {
+			en_err(priv, "Failed setting port L2 tunnel configuration, err %d\n",
+			       err);
+			goto out;
+		}
+	}
+
+	/* Init port */
+	en_warn(priv, "Initializing port\n");
+	err = mlx4_INIT_PORT(mdev->dev, priv->port);
+	if (err) {
+		en_err(priv, "Failed Initializing port\n");
+		goto out;
+	}
+	queue_delayed_work(mdev->workqueue, &priv->stats_task, STATS_DELAY);
+
+	if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_TS)
+		queue_delayed_work(mdev->workqueue, &priv->service_task,
+				   SERVICE_TASK_DELAY);
+
+	return 0;
+
+out:
+	mlx4_en_destroy_netdev(dev);
+	return err;
+}
+
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_port.c b/drivers/net/ethernet/mellanox/mlx4/en_port.c
new file mode 100644
index 0000000..0a0261d
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/en_port.c
@@ -0,0 +1,228 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+
+#include <linux/if_vlan.h>
+
+#include <linux/mlx4/device.h>
+#include <linux/mlx4/cmd.h>
+
+#include "en_port.h"
+#include "mlx4_en.h"
+
+
+int mlx4_SET_VLAN_FLTR(struct mlx4_dev *dev, struct mlx4_en_priv *priv)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_set_vlan_fltr_mbox *filter;
+	int i;
+	int j;
+	int index = 0;
+	u32 entry;
+	int err = 0;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	filter = mailbox->buf;
+	for (i = VLAN_FLTR_SIZE - 1; i >= 0; i--) {
+		entry = 0;
+		for (j = 0; j < 32; j++)
+			if (test_bit(index++, priv->active_vlans))
+				entry |= 1 << j;
+		filter->entry[i] = cpu_to_be32(entry);
+	}
+	err = mlx4_cmd(dev, mailbox->dma, priv->port, 0, MLX4_CMD_SET_VLAN_FLTR,
+		       MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED);
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+
+int mlx4_en_QUERY_PORT(struct mlx4_en_dev *mdev, u8 port)
+{
+	struct mlx4_en_query_port_context *qport_context;
+	struct mlx4_en_priv *priv = netdev_priv(mdev->pndev[port]);
+	struct mlx4_en_port_state *state = &priv->port_state;
+	struct mlx4_cmd_mailbox *mailbox;
+	int err;
+
+	mailbox = mlx4_alloc_cmd_mailbox(mdev->dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	err = mlx4_cmd_box(mdev->dev, 0, mailbox->dma, port, 0,
+			   MLX4_CMD_QUERY_PORT, MLX4_CMD_TIME_CLASS_B,
+			   MLX4_CMD_WRAPPED);
+	if (err)
+		goto out;
+	qport_context = mailbox->buf;
+
+	/* This command is always accessed from Ethtool context
+	 * already synchronized, no need in locking */
+	state->link_state = !!(qport_context->link_up & MLX4_EN_LINK_UP_MASK);
+	switch (qport_context->link_speed & MLX4_EN_SPEED_MASK) {
+	case MLX4_EN_1G_SPEED:
+		state->link_speed = 1000;
+		break;
+	case MLX4_EN_10G_SPEED_XAUI:
+	case MLX4_EN_10G_SPEED_XFI:
+		state->link_speed = 10000;
+		break;
+	case MLX4_EN_40G_SPEED:
+		state->link_speed = 40000;
+		break;
+	default:
+		state->link_speed = -1;
+		break;
+	}
+	state->transciver = qport_context->transceiver;
+
+out:
+	mlx4_free_cmd_mailbox(mdev->dev, mailbox);
+	return err;
+}
+
+int mlx4_en_DUMP_ETH_STATS(struct mlx4_en_dev *mdev, u8 port, u8 reset)
+{
+	struct mlx4_en_stat_out_mbox *mlx4_en_stats;
+	struct mlx4_en_priv *priv = netdev_priv(mdev->pndev[port]);
+	struct net_device_stats *stats = &priv->stats;
+	struct mlx4_cmd_mailbox *mailbox;
+	u64 in_mod = reset << 8 | port;
+	int err;
+	int i;
+
+	mailbox = mlx4_alloc_cmd_mailbox(mdev->dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	err = mlx4_cmd_box(mdev->dev, 0, mailbox->dma, in_mod, 0,
+			   MLX4_CMD_DUMP_ETH_STATS, MLX4_CMD_TIME_CLASS_B,
+			   MLX4_CMD_WRAPPED);
+	if (err)
+		goto out;
+
+	mlx4_en_stats = mailbox->buf;
+
+	spin_lock_bh(&priv->stats_lock);
+
+	stats->rx_packets = 0;
+	stats->rx_bytes = 0;
+	priv->port_stats.rx_chksum_good = 0;
+	priv->port_stats.rx_chksum_none = 0;
+	for (i = 0; i < priv->rx_ring_num; i++) {
+		stats->rx_packets += priv->rx_ring[i]->packets;
+		stats->rx_bytes += priv->rx_ring[i]->bytes;
+		priv->port_stats.rx_chksum_good += priv->rx_ring[i]->csum_ok;
+		priv->port_stats.rx_chksum_none += priv->rx_ring[i]->csum_none;
+	}
+	stats->tx_packets = 0;
+	stats->tx_bytes = 0;
+	priv->port_stats.tx_chksum_offload = 0;
+	priv->port_stats.queue_stopped = 0;
+	priv->port_stats.wake_queue = 0;
+	priv->port_stats.tso_packets = 0;
+	priv->port_stats.xmit_more = 0;
+
+	for (i = 0; i < priv->tx_ring_num; i++) {
+		const struct mlx4_en_tx_ring *ring = priv->tx_ring[i];
+
+		stats->tx_packets += ring->packets;
+		stats->tx_bytes += ring->bytes;
+		priv->port_stats.tx_chksum_offload += ring->tx_csum;
+		priv->port_stats.queue_stopped     += ring->queue_stopped;
+		priv->port_stats.wake_queue        += ring->wake_queue;
+		priv->port_stats.tso_packets       += ring->tso_packets;
+		priv->port_stats.xmit_more         += ring->xmit_more;
+	}
+
+	stats->rx_errors = be64_to_cpu(mlx4_en_stats->PCS) +
+			   be32_to_cpu(mlx4_en_stats->RdropLength) +
+			   be32_to_cpu(mlx4_en_stats->RJBBR) +
+			   be32_to_cpu(mlx4_en_stats->RCRC) +
+			   be32_to_cpu(mlx4_en_stats->RRUNT);
+	stats->tx_errors = be32_to_cpu(mlx4_en_stats->TDROP);
+	stats->multicast = be64_to_cpu(mlx4_en_stats->MCAST_prio_0) +
+			   be64_to_cpu(mlx4_en_stats->MCAST_prio_1) +
+			   be64_to_cpu(mlx4_en_stats->MCAST_prio_2) +
+			   be64_to_cpu(mlx4_en_stats->MCAST_prio_3) +
+			   be64_to_cpu(mlx4_en_stats->MCAST_prio_4) +
+			   be64_to_cpu(mlx4_en_stats->MCAST_prio_5) +
+			   be64_to_cpu(mlx4_en_stats->MCAST_prio_6) +
+			   be64_to_cpu(mlx4_en_stats->MCAST_prio_7) +
+			   be64_to_cpu(mlx4_en_stats->MCAST_novlan);
+	stats->collisions = 0;
+	stats->rx_length_errors = be32_to_cpu(mlx4_en_stats->RdropLength);
+	stats->rx_over_errors = be32_to_cpu(mlx4_en_stats->RdropOvflw);
+	stats->rx_crc_errors = be32_to_cpu(mlx4_en_stats->RCRC);
+	stats->rx_frame_errors = 0;
+	stats->rx_fifo_errors = be32_to_cpu(mlx4_en_stats->RdropOvflw);
+	stats->rx_missed_errors = be32_to_cpu(mlx4_en_stats->RdropOvflw);
+	stats->tx_aborted_errors = 0;
+	stats->tx_carrier_errors = 0;
+	stats->tx_fifo_errors = 0;
+	stats->tx_heartbeat_errors = 0;
+	stats->tx_window_errors = 0;
+
+	priv->pkstats.broadcast =
+				be64_to_cpu(mlx4_en_stats->RBCAST_prio_0) +
+				be64_to_cpu(mlx4_en_stats->RBCAST_prio_1) +
+				be64_to_cpu(mlx4_en_stats->RBCAST_prio_2) +
+				be64_to_cpu(mlx4_en_stats->RBCAST_prio_3) +
+				be64_to_cpu(mlx4_en_stats->RBCAST_prio_4) +
+				be64_to_cpu(mlx4_en_stats->RBCAST_prio_5) +
+				be64_to_cpu(mlx4_en_stats->RBCAST_prio_6) +
+				be64_to_cpu(mlx4_en_stats->RBCAST_prio_7) +
+				be64_to_cpu(mlx4_en_stats->RBCAST_novlan);
+	priv->pkstats.rx_prio[0] = be64_to_cpu(mlx4_en_stats->RTOT_prio_0);
+	priv->pkstats.rx_prio[1] = be64_to_cpu(mlx4_en_stats->RTOT_prio_1);
+	priv->pkstats.rx_prio[2] = be64_to_cpu(mlx4_en_stats->RTOT_prio_2);
+	priv->pkstats.rx_prio[3] = be64_to_cpu(mlx4_en_stats->RTOT_prio_3);
+	priv->pkstats.rx_prio[4] = be64_to_cpu(mlx4_en_stats->RTOT_prio_4);
+	priv->pkstats.rx_prio[5] = be64_to_cpu(mlx4_en_stats->RTOT_prio_5);
+	priv->pkstats.rx_prio[6] = be64_to_cpu(mlx4_en_stats->RTOT_prio_6);
+	priv->pkstats.rx_prio[7] = be64_to_cpu(mlx4_en_stats->RTOT_prio_7);
+	priv->pkstats.tx_prio[0] = be64_to_cpu(mlx4_en_stats->TTOT_prio_0);
+	priv->pkstats.tx_prio[1] = be64_to_cpu(mlx4_en_stats->TTOT_prio_1);
+	priv->pkstats.tx_prio[2] = be64_to_cpu(mlx4_en_stats->TTOT_prio_2);
+	priv->pkstats.tx_prio[3] = be64_to_cpu(mlx4_en_stats->TTOT_prio_3);
+	priv->pkstats.tx_prio[4] = be64_to_cpu(mlx4_en_stats->TTOT_prio_4);
+	priv->pkstats.tx_prio[5] = be64_to_cpu(mlx4_en_stats->TTOT_prio_5);
+	priv->pkstats.tx_prio[6] = be64_to_cpu(mlx4_en_stats->TTOT_prio_6);
+	priv->pkstats.tx_prio[7] = be64_to_cpu(mlx4_en_stats->TTOT_prio_7);
+	spin_unlock_bh(&priv->stats_lock);
+
+out:
+	mlx4_free_cmd_mailbox(mdev->dev, mailbox);
+	return err;
+}
+
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_port.h b/drivers/net/ethernet/mellanox/mlx4/en_port.h
new file mode 100644
index 0000000..745090b
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/en_port.h
@@ -0,0 +1,560 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef _MLX4_EN_PORT_H_
+#define _MLX4_EN_PORT_H_
+
+
+#define SET_PORT_GEN_ALL_VALID	0x7
+#define SET_PORT_PROMISC_SHIFT	31
+#define SET_PORT_MC_PROMISC_SHIFT	30
+
+#define MLX4_EN_NUM_TC		8
+
+#define VLAN_FLTR_SIZE	128
+struct mlx4_set_vlan_fltr_mbox {
+	__be32 entry[VLAN_FLTR_SIZE];
+};
+
+
+enum {
+	MLX4_MCAST_CONFIG       = 0,
+	MLX4_MCAST_DISABLE      = 1,
+	MLX4_MCAST_ENABLE       = 2,
+};
+
+enum {
+	MLX4_EN_1G_SPEED	= 0x02,
+	MLX4_EN_10G_SPEED_XFI	= 0x01,
+	MLX4_EN_10G_SPEED_XAUI	= 0x00,
+	MLX4_EN_40G_SPEED	= 0x40,
+	MLX4_EN_OTHER_SPEED	= 0x0f,
+};
+
+struct mlx4_en_query_port_context {
+	u8 link_up;
+#define MLX4_EN_LINK_UP_MASK	0x80
+	u8 reserved;
+	__be16 mtu;
+	u8 reserved2;
+	u8 link_speed;
+#define MLX4_EN_SPEED_MASK	0x43
+	u16 reserved3[5];
+	__be64 mac;
+	u8 transceiver;
+};
+
+
+struct mlx4_en_stat_out_mbox {
+	/* Received frames with a length of 64 octets */
+	__be64 R64_prio_0;
+	__be64 R64_prio_1;
+	__be64 R64_prio_2;
+	__be64 R64_prio_3;
+	__be64 R64_prio_4;
+	__be64 R64_prio_5;
+	__be64 R64_prio_6;
+	__be64 R64_prio_7;
+	__be64 R64_novlan;
+	/* Received frames with a length of 127 octets */
+	__be64 R127_prio_0;
+	__be64 R127_prio_1;
+	__be64 R127_prio_2;
+	__be64 R127_prio_3;
+	__be64 R127_prio_4;
+	__be64 R127_prio_5;
+	__be64 R127_prio_6;
+	__be64 R127_prio_7;
+	__be64 R127_novlan;
+	/* Received frames with a length of 255 octets */
+	__be64 R255_prio_0;
+	__be64 R255_prio_1;
+	__be64 R255_prio_2;
+	__be64 R255_prio_3;
+	__be64 R255_prio_4;
+	__be64 R255_prio_5;
+	__be64 R255_prio_6;
+	__be64 R255_prio_7;
+	__be64 R255_novlan;
+	/* Received frames with a length of 511 octets */
+	__be64 R511_prio_0;
+	__be64 R511_prio_1;
+	__be64 R511_prio_2;
+	__be64 R511_prio_3;
+	__be64 R511_prio_4;
+	__be64 R511_prio_5;
+	__be64 R511_prio_6;
+	__be64 R511_prio_7;
+	__be64 R511_novlan;
+	/* Received frames with a length of 1023 octets */
+	__be64 R1023_prio_0;
+	__be64 R1023_prio_1;
+	__be64 R1023_prio_2;
+	__be64 R1023_prio_3;
+	__be64 R1023_prio_4;
+	__be64 R1023_prio_5;
+	__be64 R1023_prio_6;
+	__be64 R1023_prio_7;
+	__be64 R1023_novlan;
+	/* Received frames with a length of 1518 octets */
+	__be64 R1518_prio_0;
+	__be64 R1518_prio_1;
+	__be64 R1518_prio_2;
+	__be64 R1518_prio_3;
+	__be64 R1518_prio_4;
+	__be64 R1518_prio_5;
+	__be64 R1518_prio_6;
+	__be64 R1518_prio_7;
+	__be64 R1518_novlan;
+	/* Received frames with a length of 1522 octets */
+	__be64 R1522_prio_0;
+	__be64 R1522_prio_1;
+	__be64 R1522_prio_2;
+	__be64 R1522_prio_3;
+	__be64 R1522_prio_4;
+	__be64 R1522_prio_5;
+	__be64 R1522_prio_6;
+	__be64 R1522_prio_7;
+	__be64 R1522_novlan;
+	/* Received frames with a length of 1548 octets */
+	__be64 R1548_prio_0;
+	__be64 R1548_prio_1;
+	__be64 R1548_prio_2;
+	__be64 R1548_prio_3;
+	__be64 R1548_prio_4;
+	__be64 R1548_prio_5;
+	__be64 R1548_prio_6;
+	__be64 R1548_prio_7;
+	__be64 R1548_novlan;
+	/* Received frames with a length of 1548 < octets < MTU */
+	__be64 R2MTU_prio_0;
+	__be64 R2MTU_prio_1;
+	__be64 R2MTU_prio_2;
+	__be64 R2MTU_prio_3;
+	__be64 R2MTU_prio_4;
+	__be64 R2MTU_prio_5;
+	__be64 R2MTU_prio_6;
+	__be64 R2MTU_prio_7;
+	__be64 R2MTU_novlan;
+	/* Received frames with a length of MTU< octets and good CRC */
+	__be64 RGIANT_prio_0;
+	__be64 RGIANT_prio_1;
+	__be64 RGIANT_prio_2;
+	__be64 RGIANT_prio_3;
+	__be64 RGIANT_prio_4;
+	__be64 RGIANT_prio_5;
+	__be64 RGIANT_prio_6;
+	__be64 RGIANT_prio_7;
+	__be64 RGIANT_novlan;
+	/* Received broadcast frames with good CRC */
+	__be64 RBCAST_prio_0;
+	__be64 RBCAST_prio_1;
+	__be64 RBCAST_prio_2;
+	__be64 RBCAST_prio_3;
+	__be64 RBCAST_prio_4;
+	__be64 RBCAST_prio_5;
+	__be64 RBCAST_prio_6;
+	__be64 RBCAST_prio_7;
+	__be64 RBCAST_novlan;
+	/* Received multicast frames with good CRC */
+	__be64 MCAST_prio_0;
+	__be64 MCAST_prio_1;
+	__be64 MCAST_prio_2;
+	__be64 MCAST_prio_3;
+	__be64 MCAST_prio_4;
+	__be64 MCAST_prio_5;
+	__be64 MCAST_prio_6;
+	__be64 MCAST_prio_7;
+	__be64 MCAST_novlan;
+	/* Received unicast not short or GIANT frames with good CRC */
+	__be64 RTOTG_prio_0;
+	__be64 RTOTG_prio_1;
+	__be64 RTOTG_prio_2;
+	__be64 RTOTG_prio_3;
+	__be64 RTOTG_prio_4;
+	__be64 RTOTG_prio_5;
+	__be64 RTOTG_prio_6;
+	__be64 RTOTG_prio_7;
+	__be64 RTOTG_novlan;
+
+	/* Count of total octets of received frames, includes framing characters */
+	__be64 RTTLOCT_prio_0;
+	/* Count of total octets of received frames, not including framing
+	   characters */
+	__be64 RTTLOCT_NOFRM_prio_0;
+	/* Count of Total number of octets received
+	   (only for frames without errors) */
+	__be64 ROCT_prio_0;
+
+	__be64 RTTLOCT_prio_1;
+	__be64 RTTLOCT_NOFRM_prio_1;
+	__be64 ROCT_prio_1;
+
+	__be64 RTTLOCT_prio_2;
+	__be64 RTTLOCT_NOFRM_prio_2;
+	__be64 ROCT_prio_2;
+
+	__be64 RTTLOCT_prio_3;
+	__be64 RTTLOCT_NOFRM_prio_3;
+	__be64 ROCT_prio_3;
+
+	__be64 RTTLOCT_prio_4;
+	__be64 RTTLOCT_NOFRM_prio_4;
+	__be64 ROCT_prio_4;
+
+	__be64 RTTLOCT_prio_5;
+	__be64 RTTLOCT_NOFRM_prio_5;
+	__be64 ROCT_prio_5;
+
+	__be64 RTTLOCT_prio_6;
+	__be64 RTTLOCT_NOFRM_prio_6;
+	__be64 ROCT_prio_6;
+
+	__be64 RTTLOCT_prio_7;
+	__be64 RTTLOCT_NOFRM_prio_7;
+	__be64 ROCT_prio_7;
+
+	__be64 RTTLOCT_novlan;
+	__be64 RTTLOCT_NOFRM_novlan;
+	__be64 ROCT_novlan;
+
+	/* Count of Total received frames including bad frames */
+	__be64 RTOT_prio_0;
+	/* Count of  Total number of received frames with 802.1Q encapsulation */
+	__be64 R1Q_prio_0;
+	__be64 reserved1;
+
+	__be64 RTOT_prio_1;
+	__be64 R1Q_prio_1;
+	__be64 reserved2;
+
+	__be64 RTOT_prio_2;
+	__be64 R1Q_prio_2;
+	__be64 reserved3;
+
+	__be64 RTOT_prio_3;
+	__be64 R1Q_prio_3;
+	__be64 reserved4;
+
+	__be64 RTOT_prio_4;
+	__be64 R1Q_prio_4;
+	__be64 reserved5;
+
+	__be64 RTOT_prio_5;
+	__be64 R1Q_prio_5;
+	__be64 reserved6;
+
+	__be64 RTOT_prio_6;
+	__be64 R1Q_prio_6;
+	__be64 reserved7;
+
+	__be64 RTOT_prio_7;
+	__be64 R1Q_prio_7;
+	__be64 reserved8;
+
+	__be64 RTOT_novlan;
+	__be64 R1Q_novlan;
+	__be64 reserved9;
+
+	/* Total number of Successfully Received Control Frames */
+	__be64 RCNTL;
+	__be64 reserved10;
+	__be64 reserved11;
+	__be64 reserved12;
+	/* Count of received frames with a length/type field  value between 46
+	   (42 for VLANtagged frames) and 1500 (also 1500 for VLAN-tagged frames),
+	   inclusive */
+	__be64 RInRangeLengthErr;
+	/* Count of received frames with length/type field between 1501 and 1535
+	   decimal, inclusive */
+	__be64 ROutRangeLengthErr;
+	/* Count of received frames that are longer than max allowed size for
+	   802.3 frames (1518/1522) */
+	__be64 RFrmTooLong;
+	/* Count frames received with PCS error */
+	__be64 PCS;
+
+	/* Transmit frames with a length of 64 octets */
+	__be64 T64_prio_0;
+	__be64 T64_prio_1;
+	__be64 T64_prio_2;
+	__be64 T64_prio_3;
+	__be64 T64_prio_4;
+	__be64 T64_prio_5;
+	__be64 T64_prio_6;
+	__be64 T64_prio_7;
+	__be64 T64_novlan;
+	__be64 T64_loopbk;
+	/* Transmit frames with a length of 65 to 127 octets. */
+	__be64 T127_prio_0;
+	__be64 T127_prio_1;
+	__be64 T127_prio_2;
+	__be64 T127_prio_3;
+	__be64 T127_prio_4;
+	__be64 T127_prio_5;
+	__be64 T127_prio_6;
+	__be64 T127_prio_7;
+	__be64 T127_novlan;
+	__be64 T127_loopbk;
+	/* Transmit frames with a length of 128 to 255 octets */
+	__be64 T255_prio_0;
+	__be64 T255_prio_1;
+	__be64 T255_prio_2;
+	__be64 T255_prio_3;
+	__be64 T255_prio_4;
+	__be64 T255_prio_5;
+	__be64 T255_prio_6;
+	__be64 T255_prio_7;
+	__be64 T255_novlan;
+	__be64 T255_loopbk;
+	/* Transmit frames with a length of 256 to 511 octets */
+	__be64 T511_prio_0;
+	__be64 T511_prio_1;
+	__be64 T511_prio_2;
+	__be64 T511_prio_3;
+	__be64 T511_prio_4;
+	__be64 T511_prio_5;
+	__be64 T511_prio_6;
+	__be64 T511_prio_7;
+	__be64 T511_novlan;
+	__be64 T511_loopbk;
+	/* Transmit frames with a length of 512 to 1023 octets */
+	__be64 T1023_prio_0;
+	__be64 T1023_prio_1;
+	__be64 T1023_prio_2;
+	__be64 T1023_prio_3;
+	__be64 T1023_prio_4;
+	__be64 T1023_prio_5;
+	__be64 T1023_prio_6;
+	__be64 T1023_prio_7;
+	__be64 T1023_novlan;
+	__be64 T1023_loopbk;
+	/* Transmit frames with a length of 1024 to 1518 octets */
+	__be64 T1518_prio_0;
+	__be64 T1518_prio_1;
+	__be64 T1518_prio_2;
+	__be64 T1518_prio_3;
+	__be64 T1518_prio_4;
+	__be64 T1518_prio_5;
+	__be64 T1518_prio_6;
+	__be64 T1518_prio_7;
+	__be64 T1518_novlan;
+	__be64 T1518_loopbk;
+	/* Counts transmit frames with a length of 1519 to 1522 bytes */
+	__be64 T1522_prio_0;
+	__be64 T1522_prio_1;
+	__be64 T1522_prio_2;
+	__be64 T1522_prio_3;
+	__be64 T1522_prio_4;
+	__be64 T1522_prio_5;
+	__be64 T1522_prio_6;
+	__be64 T1522_prio_7;
+	__be64 T1522_novlan;
+	__be64 T1522_loopbk;
+	/* Transmit frames with a length of 1523 to 1548 octets */
+	__be64 T1548_prio_0;
+	__be64 T1548_prio_1;
+	__be64 T1548_prio_2;
+	__be64 T1548_prio_3;
+	__be64 T1548_prio_4;
+	__be64 T1548_prio_5;
+	__be64 T1548_prio_6;
+	__be64 T1548_prio_7;
+	__be64 T1548_novlan;
+	__be64 T1548_loopbk;
+	/* Counts transmit frames with a length of 1549 to MTU bytes */
+	__be64 T2MTU_prio_0;
+	__be64 T2MTU_prio_1;
+	__be64 T2MTU_prio_2;
+	__be64 T2MTU_prio_3;
+	__be64 T2MTU_prio_4;
+	__be64 T2MTU_prio_5;
+	__be64 T2MTU_prio_6;
+	__be64 T2MTU_prio_7;
+	__be64 T2MTU_novlan;
+	__be64 T2MTU_loopbk;
+	/* Transmit frames with a length greater than MTU octets and a good CRC. */
+	__be64 TGIANT_prio_0;
+	__be64 TGIANT_prio_1;
+	__be64 TGIANT_prio_2;
+	__be64 TGIANT_prio_3;
+	__be64 TGIANT_prio_4;
+	__be64 TGIANT_prio_5;
+	__be64 TGIANT_prio_6;
+	__be64 TGIANT_prio_7;
+	__be64 TGIANT_novlan;
+	__be64 TGIANT_loopbk;
+	/* Transmit broadcast frames with a good CRC */
+	__be64 TBCAST_prio_0;
+	__be64 TBCAST_prio_1;
+	__be64 TBCAST_prio_2;
+	__be64 TBCAST_prio_3;
+	__be64 TBCAST_prio_4;
+	__be64 TBCAST_prio_5;
+	__be64 TBCAST_prio_6;
+	__be64 TBCAST_prio_7;
+	__be64 TBCAST_novlan;
+	__be64 TBCAST_loopbk;
+	/* Transmit multicast frames with a good CRC */
+	__be64 TMCAST_prio_0;
+	__be64 TMCAST_prio_1;
+	__be64 TMCAST_prio_2;
+	__be64 TMCAST_prio_3;
+	__be64 TMCAST_prio_4;
+	__be64 TMCAST_prio_5;
+	__be64 TMCAST_prio_6;
+	__be64 TMCAST_prio_7;
+	__be64 TMCAST_novlan;
+	__be64 TMCAST_loopbk;
+	/* Transmit good frames that are neither broadcast nor multicast */
+	__be64 TTOTG_prio_0;
+	__be64 TTOTG_prio_1;
+	__be64 TTOTG_prio_2;
+	__be64 TTOTG_prio_3;
+	__be64 TTOTG_prio_4;
+	__be64 TTOTG_prio_5;
+	__be64 TTOTG_prio_6;
+	__be64 TTOTG_prio_7;
+	__be64 TTOTG_novlan;
+	__be64 TTOTG_loopbk;
+
+	/* total octets of transmitted frames, including framing characters */
+	__be64 TTTLOCT_prio_0;
+	/* total octets of transmitted frames, not including framing characters */
+	__be64 TTTLOCT_NOFRM_prio_0;
+	/* ifOutOctets */
+	__be64 TOCT_prio_0;
+
+	__be64 TTTLOCT_prio_1;
+	__be64 TTTLOCT_NOFRM_prio_1;
+	__be64 TOCT_prio_1;
+
+	__be64 TTTLOCT_prio_2;
+	__be64 TTTLOCT_NOFRM_prio_2;
+	__be64 TOCT_prio_2;
+
+	__be64 TTTLOCT_prio_3;
+	__be64 TTTLOCT_NOFRM_prio_3;
+	__be64 TOCT_prio_3;
+
+	__be64 TTTLOCT_prio_4;
+	__be64 TTTLOCT_NOFRM_prio_4;
+	__be64 TOCT_prio_4;
+
+	__be64 TTTLOCT_prio_5;
+	__be64 TTTLOCT_NOFRM_prio_5;
+	__be64 TOCT_prio_5;
+
+	__be64 TTTLOCT_prio_6;
+	__be64 TTTLOCT_NOFRM_prio_6;
+	__be64 TOCT_prio_6;
+
+	__be64 TTTLOCT_prio_7;
+	__be64 TTTLOCT_NOFRM_prio_7;
+	__be64 TOCT_prio_7;
+
+	__be64 TTTLOCT_novlan;
+	__be64 TTTLOCT_NOFRM_novlan;
+	__be64 TOCT_novlan;
+
+	__be64 TTTLOCT_loopbk;
+	__be64 TTTLOCT_NOFRM_loopbk;
+	__be64 TOCT_loopbk;
+
+	/* Total frames transmitted with a good CRC that are not aborted  */
+	__be64 TTOT_prio_0;
+	/* Total number of frames transmitted with 802.1Q encapsulation */
+	__be64 T1Q_prio_0;
+	__be64 reserved13;
+
+	__be64 TTOT_prio_1;
+	__be64 T1Q_prio_1;
+	__be64 reserved14;
+
+	__be64 TTOT_prio_2;
+	__be64 T1Q_prio_2;
+	__be64 reserved15;
+
+	__be64 TTOT_prio_3;
+	__be64 T1Q_prio_3;
+	__be64 reserved16;
+
+	__be64 TTOT_prio_4;
+	__be64 T1Q_prio_4;
+	__be64 reserved17;
+
+	__be64 TTOT_prio_5;
+	__be64 T1Q_prio_5;
+	__be64 reserved18;
+
+	__be64 TTOT_prio_6;
+	__be64 T1Q_prio_6;
+	__be64 reserved19;
+
+	__be64 TTOT_prio_7;
+	__be64 T1Q_prio_7;
+	__be64 reserved20;
+
+	__be64 TTOT_novlan;
+	__be64 T1Q_novlan;
+	__be64 reserved21;
+
+	__be64 TTOT_loopbk;
+	__be64 T1Q_loopbk;
+	__be64 reserved22;
+
+	/* Received frames with a length greater than MTU octets and a bad CRC */
+	__be32 RJBBR;
+	/* Received frames with a bad CRC that are not runts, jabbers,
+	   or alignment errors */
+	__be32 RCRC;
+	/* Received frames with SFD with a length of less than 64 octets and a
+	   bad CRC */
+	__be32 RRUNT;
+	/* Received frames with a length less than 64 octets and a good CRC */
+	__be32 RSHORT;
+	/* Total Number of Received Packets Dropped */
+	__be32 RDROP;
+	/* Drop due to overflow  */
+	__be32 RdropOvflw;
+	/* Drop due to overflow */
+	__be32 RdropLength;
+	/* Total of good frames. Does not include frames received with
+	   frame-too-long, FCS, or length errors */
+	__be32 RTOTFRMS;
+	/* Total dropped Xmited packets */
+	__be32 TDROP;
+};
+
+
+#endif
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_resources.c b/drivers/net/ethernet/mellanox/mlx4/en_resources.c
new file mode 100644
index 0000000..f1a5500
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/en_resources.c
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/mlx4/qp.h>
+
+#include "mlx4_en.h"
+
+void mlx4_en_fill_qp_context(struct mlx4_en_priv *priv, int size, int stride,
+			     int is_tx, int rss, int qpn, int cqn,
+			     int user_prio, struct mlx4_qp_context *context)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct net_device *dev = priv->dev;
+
+	memset(context, 0, sizeof *context);
+	context->flags = cpu_to_be32(7 << 16 | rss << MLX4_RSS_QPC_FLAG_OFFSET);
+	context->pd = cpu_to_be32(mdev->priv_pdn);
+	context->mtu_msgmax = 0xff;
+	if (!is_tx && !rss)
+		context->rq_size_stride = ilog2(size) << 3 | (ilog2(stride) - 4);
+	if (is_tx)
+		context->sq_size_stride = ilog2(size) << 3 | (ilog2(stride) - 4);
+	else
+		context->sq_size_stride = ilog2(TXBB_SIZE) - 4;
+	context->usr_page = cpu_to_be32(mdev->priv_uar.index);
+	context->local_qpn = cpu_to_be32(qpn);
+	context->pri_path.ackto = 1 & 0x07;
+	context->pri_path.sched_queue = 0x83 | (priv->port - 1) << 6;
+	if (user_prio >= 0) {
+		context->pri_path.sched_queue |= user_prio << 3;
+		context->pri_path.feup = MLX4_FEUP_FORCE_ETH_UP;
+	}
+	context->pri_path.counter_index = 0xff;
+	context->cqn_send = cpu_to_be32(cqn);
+	context->cqn_recv = cpu_to_be32(cqn);
+	context->db_rec_addr = cpu_to_be64(priv->res.db.dma << 2);
+	if (!(dev->features & NETIF_F_HW_VLAN_CTAG_RX))
+		context->param3 |= cpu_to_be32(1 << 30);
+
+	if (!is_tx && !rss &&
+	    (mdev->dev->caps.tunnel_offload_mode ==  MLX4_TUNNEL_OFFLOAD_MODE_VXLAN)) {
+		en_dbg(HW, priv, "Setting RX qp %x tunnel mode to RX tunneled & non-tunneled\n", qpn);
+		context->srqn = cpu_to_be32(7 << 28); /* this fills bits 30:28 */
+	}
+}
+
+
+int mlx4_en_map_buffer(struct mlx4_buf *buf)
+{
+	struct page **pages;
+	int i;
+
+	if (BITS_PER_LONG == 64 || buf->nbufs == 1)
+		return 0;
+
+	pages = kmalloc(sizeof *pages * buf->nbufs, GFP_KERNEL);
+	if (!pages)
+		return -ENOMEM;
+
+	for (i = 0; i < buf->nbufs; ++i)
+		pages[i] = virt_to_page(buf->page_list[i].buf);
+
+	buf->direct.buf = vmap(pages, buf->nbufs, VM_MAP, PAGE_KERNEL);
+	kfree(pages);
+	if (!buf->direct.buf)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void mlx4_en_unmap_buffer(struct mlx4_buf *buf)
+{
+	if (BITS_PER_LONG == 64 || buf->nbufs == 1)
+		return;
+
+	vunmap(buf->direct.buf);
+}
+
+void mlx4_en_sqp_event(struct mlx4_qp *qp, enum mlx4_event event)
+{
+    return;
+}
+
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
new file mode 100644
index 0000000..01660c5
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -0,0 +1,1169 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <net/busy_poll.h>
+#include <linux/mlx4/cq.h>
+#include <linux/slab.h>
+#include <linux/mlx4/qp.h>
+#include <linux/skbuff.h>
+#include <linux/rculist.h>
+#include <linux/if_ether.h>
+#include <linux/if_vlan.h>
+#include <linux/vmalloc.h>
+#include <linux/irq.h>
+
+#include "mlx4_en.h"
+
+static int mlx4_alloc_pages(struct mlx4_en_priv *priv,
+			    struct mlx4_en_rx_alloc *page_alloc,
+			    const struct mlx4_en_frag_info *frag_info,
+			    gfp_t _gfp)
+{
+	int order;
+	struct page *page;
+	dma_addr_t dma;
+
+	for (order = MLX4_EN_ALLOC_PREFER_ORDER; ;) {
+		gfp_t gfp = _gfp;
+
+		if (order)
+			gfp |= __GFP_COMP | __GFP_NOWARN;
+		page = alloc_pages(gfp, order);
+		if (likely(page))
+			break;
+		if (--order < 0 ||
+		    ((PAGE_SIZE << order) < frag_info->frag_size))
+			return -ENOMEM;
+	}
+	dma = dma_map_page(priv->ddev, page, 0, PAGE_SIZE << order,
+			   PCI_DMA_FROMDEVICE);
+	if (dma_mapping_error(priv->ddev, dma)) {
+		put_page(page);
+		return -ENOMEM;
+	}
+	page_alloc->page_size = PAGE_SIZE << order;
+	page_alloc->page = page;
+	page_alloc->dma = dma;
+	page_alloc->page_offset = frag_info->frag_align;
+	/* Not doing get_page() for each frag is a big win
+	 * on asymetric workloads. Note we can not use atomic_set().
+	 */
+	atomic_add(page_alloc->page_size / frag_info->frag_stride - 1,
+		   &page->_count);
+	return 0;
+}
+
+static int mlx4_en_alloc_frags(struct mlx4_en_priv *priv,
+			       struct mlx4_en_rx_desc *rx_desc,
+			       struct mlx4_en_rx_alloc *frags,
+			       struct mlx4_en_rx_alloc *ring_alloc,
+			       gfp_t gfp)
+{
+	struct mlx4_en_rx_alloc page_alloc[MLX4_EN_MAX_RX_FRAGS];
+	const struct mlx4_en_frag_info *frag_info;
+	struct page *page;
+	dma_addr_t dma;
+	int i;
+
+	for (i = 0; i < priv->num_frags; i++) {
+		frag_info = &priv->frag_info[i];
+		page_alloc[i] = ring_alloc[i];
+		page_alloc[i].page_offset += frag_info->frag_stride;
+
+		if (page_alloc[i].page_offset + frag_info->frag_stride <=
+		    ring_alloc[i].page_size)
+			continue;
+
+		if (mlx4_alloc_pages(priv, &page_alloc[i], frag_info, gfp))
+			goto out;
+	}
+
+	for (i = 0; i < priv->num_frags; i++) {
+		frags[i] = ring_alloc[i];
+		dma = ring_alloc[i].dma + ring_alloc[i].page_offset;
+		ring_alloc[i] = page_alloc[i];
+		rx_desc->data[i].addr = cpu_to_be64(dma);
+	}
+
+	return 0;
+
+out:
+	while (i--) {
+		frag_info = &priv->frag_info[i];
+		if (page_alloc[i].page != ring_alloc[i].page) {
+			dma_unmap_page(priv->ddev, page_alloc[i].dma,
+				page_alloc[i].page_size, PCI_DMA_FROMDEVICE);
+			page = page_alloc[i].page;
+			atomic_set(&page->_count, 1);
+			put_page(page);
+		}
+	}
+	return -ENOMEM;
+}
+
+static void mlx4_en_free_frag(struct mlx4_en_priv *priv,
+			      struct mlx4_en_rx_alloc *frags,
+			      int i)
+{
+	const struct mlx4_en_frag_info *frag_info = &priv->frag_info[i];
+	u32 next_frag_end = frags[i].page_offset + 2 * frag_info->frag_stride;
+
+
+	if (next_frag_end > frags[i].page_size)
+		dma_unmap_page(priv->ddev, frags[i].dma, frags[i].page_size,
+			       PCI_DMA_FROMDEVICE);
+
+	if (frags[i].page)
+		put_page(frags[i].page);
+}
+
+static int mlx4_en_init_allocator(struct mlx4_en_priv *priv,
+				  struct mlx4_en_rx_ring *ring)
+{
+	int i;
+	struct mlx4_en_rx_alloc *page_alloc;
+
+	for (i = 0; i < priv->num_frags; i++) {
+		const struct mlx4_en_frag_info *frag_info = &priv->frag_info[i];
+
+		if (mlx4_alloc_pages(priv, &ring->page_alloc[i],
+				     frag_info, GFP_KERNEL))
+			goto out;
+	}
+	return 0;
+
+out:
+	while (i--) {
+		struct page *page;
+
+		page_alloc = &ring->page_alloc[i];
+		dma_unmap_page(priv->ddev, page_alloc->dma,
+			       page_alloc->page_size, PCI_DMA_FROMDEVICE);
+		page = page_alloc->page;
+		atomic_set(&page->_count, 1);
+		put_page(page);
+		page_alloc->page = NULL;
+	}
+	return -ENOMEM;
+}
+
+static void mlx4_en_destroy_allocator(struct mlx4_en_priv *priv,
+				      struct mlx4_en_rx_ring *ring)
+{
+	struct mlx4_en_rx_alloc *page_alloc;
+	int i;
+
+	for (i = 0; i < priv->num_frags; i++) {
+		const struct mlx4_en_frag_info *frag_info = &priv->frag_info[i];
+
+		page_alloc = &ring->page_alloc[i];
+		en_dbg(DRV, priv, "Freeing allocator:%d count:%d\n",
+		       i, page_count(page_alloc->page));
+
+		dma_unmap_page(priv->ddev, page_alloc->dma,
+				page_alloc->page_size, PCI_DMA_FROMDEVICE);
+		while (page_alloc->page_offset + frag_info->frag_stride <
+		       page_alloc->page_size) {
+			put_page(page_alloc->page);
+			page_alloc->page_offset += frag_info->frag_stride;
+		}
+		page_alloc->page = NULL;
+	}
+}
+
+static void mlx4_en_init_rx_desc(struct mlx4_en_priv *priv,
+				 struct mlx4_en_rx_ring *ring, int index)
+{
+	struct mlx4_en_rx_desc *rx_desc = ring->buf + ring->stride * index;
+	int possible_frags;
+	int i;
+
+	/* Set size and memtype fields */
+	for (i = 0; i < priv->num_frags; i++) {
+		rx_desc->data[i].byte_count =
+			cpu_to_be32(priv->frag_info[i].frag_size);
+		rx_desc->data[i].lkey = cpu_to_be32(priv->mdev->mr.key);
+	}
+
+	/* If the number of used fragments does not fill up the ring stride,
+	 * remaining (unused) fragments must be padded with null address/size
+	 * and a special memory key */
+	possible_frags = (ring->stride - sizeof(struct mlx4_en_rx_desc)) / DS_SIZE;
+	for (i = priv->num_frags; i < possible_frags; i++) {
+		rx_desc->data[i].byte_count = 0;
+		rx_desc->data[i].lkey = cpu_to_be32(MLX4_EN_MEMTYPE_PAD);
+		rx_desc->data[i].addr = 0;
+	}
+}
+
+static int mlx4_en_prepare_rx_desc(struct mlx4_en_priv *priv,
+				   struct mlx4_en_rx_ring *ring, int index,
+				   gfp_t gfp)
+{
+	struct mlx4_en_rx_desc *rx_desc = ring->buf + (index * ring->stride);
+	struct mlx4_en_rx_alloc *frags = ring->rx_info +
+					(index << priv->log_rx_info);
+
+	return mlx4_en_alloc_frags(priv, rx_desc, frags, ring->page_alloc, gfp);
+}
+
+static inline void mlx4_en_update_rx_prod_db(struct mlx4_en_rx_ring *ring)
+{
+	*ring->wqres.db.db = cpu_to_be32(ring->prod & 0xffff);
+}
+
+static void mlx4_en_free_rx_desc(struct mlx4_en_priv *priv,
+				 struct mlx4_en_rx_ring *ring,
+				 int index)
+{
+	struct mlx4_en_rx_alloc *frags;
+	int nr;
+
+	frags = ring->rx_info + (index << priv->log_rx_info);
+	for (nr = 0; nr < priv->num_frags; nr++) {
+		en_dbg(DRV, priv, "Freeing fragment:%d\n", nr);
+		mlx4_en_free_frag(priv, frags, nr);
+	}
+}
+
+static int mlx4_en_fill_rx_buffers(struct mlx4_en_priv *priv)
+{
+	struct mlx4_en_rx_ring *ring;
+	int ring_ind;
+	int buf_ind;
+	int new_size;
+
+	for (buf_ind = 0; buf_ind < priv->prof->rx_ring_size; buf_ind++) {
+		for (ring_ind = 0; ring_ind < priv->rx_ring_num; ring_ind++) {
+			ring = priv->rx_ring[ring_ind];
+
+			if (mlx4_en_prepare_rx_desc(priv, ring,
+						    ring->actual_size,
+						    GFP_KERNEL)) {
+				if (ring->actual_size < MLX4_EN_MIN_RX_SIZE) {
+					en_err(priv, "Failed to allocate enough rx buffers\n");
+					return -ENOMEM;
+				} else {
+					new_size = rounddown_pow_of_two(ring->actual_size);
+					en_warn(priv, "Only %d buffers allocated reducing ring size to %d\n",
+						ring->actual_size, new_size);
+					goto reduce_rings;
+				}
+			}
+			ring->actual_size++;
+			ring->prod++;
+		}
+	}
+	return 0;
+
+reduce_rings:
+	for (ring_ind = 0; ring_ind < priv->rx_ring_num; ring_ind++) {
+		ring = priv->rx_ring[ring_ind];
+		while (ring->actual_size > new_size) {
+			ring->actual_size--;
+			ring->prod--;
+			mlx4_en_free_rx_desc(priv, ring, ring->actual_size);
+		}
+	}
+
+	return 0;
+}
+
+static void mlx4_en_free_rx_buf(struct mlx4_en_priv *priv,
+				struct mlx4_en_rx_ring *ring)
+{
+	int index;
+
+	en_dbg(DRV, priv, "Freeing Rx buf - cons:%d prod:%d\n",
+	       ring->cons, ring->prod);
+
+	/* Unmap and free Rx buffers */
+	BUG_ON((u32) (ring->prod - ring->cons) > ring->actual_size);
+	while (ring->cons != ring->prod) {
+		index = ring->cons & ring->size_mask;
+		en_dbg(DRV, priv, "Processing descriptor:%d\n", index);
+		mlx4_en_free_rx_desc(priv, ring, index);
+		++ring->cons;
+	}
+}
+
+void mlx4_en_set_num_rx_rings(struct mlx4_en_dev *mdev)
+{
+	int i;
+	int num_of_eqs;
+	int num_rx_rings;
+	struct mlx4_dev *dev = mdev->dev;
+
+	mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) {
+		if (!dev->caps.comp_pool)
+			num_of_eqs = max_t(int, MIN_RX_RINGS,
+					   min_t(int,
+						 dev->caps.num_comp_vectors,
+						 DEF_RX_RINGS));
+		else
+			num_of_eqs = min_t(int, MAX_MSIX_P_PORT,
+					   dev->caps.comp_pool/
+					   dev->caps.num_ports) - 1;
+
+		num_rx_rings = mlx4_low_memory_profile() ? MIN_RX_RINGS :
+			min_t(int, num_of_eqs,
+			      netif_get_num_default_rss_queues());
+		mdev->profile.prof[i].rx_ring_num =
+			rounddown_pow_of_two(num_rx_rings);
+	}
+}
+
+int mlx4_en_create_rx_ring(struct mlx4_en_priv *priv,
+			   struct mlx4_en_rx_ring **pring,
+			   u32 size, u16 stride, int node)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_en_rx_ring *ring;
+	int err = -ENOMEM;
+	int tmp;
+
+	ring = kzalloc_node(sizeof(*ring), GFP_KERNEL, node);
+	if (!ring) {
+		ring = kzalloc(sizeof(*ring), GFP_KERNEL);
+		if (!ring) {
+			en_err(priv, "Failed to allocate RX ring structure\n");
+			return -ENOMEM;
+		}
+	}
+
+	ring->prod = 0;
+	ring->cons = 0;
+	ring->size = size;
+	ring->size_mask = size - 1;
+	ring->stride = stride;
+	ring->log_stride = ffs(ring->stride) - 1;
+	ring->buf_size = ring->size * ring->stride + TXBB_SIZE;
+
+	tmp = size * roundup_pow_of_two(MLX4_EN_MAX_RX_FRAGS *
+					sizeof(struct mlx4_en_rx_alloc));
+	ring->rx_info = vmalloc_node(tmp, node);
+	if (!ring->rx_info) {
+		ring->rx_info = vmalloc(tmp);
+		if (!ring->rx_info) {
+			err = -ENOMEM;
+			goto err_ring;
+		}
+	}
+
+	en_dbg(DRV, priv, "Allocated rx_info ring at addr:%p size:%d\n",
+		 ring->rx_info, tmp);
+
+	/* Allocate HW buffers on provided NUMA node */
+	set_dev_node(&mdev->dev->pdev->dev, node);
+	err = mlx4_alloc_hwq_res(mdev->dev, &ring->wqres,
+				 ring->buf_size, 2 * PAGE_SIZE);
+	set_dev_node(&mdev->dev->pdev->dev, mdev->dev->numa_node);
+	if (err)
+		goto err_info;
+
+	err = mlx4_en_map_buffer(&ring->wqres.buf);
+	if (err) {
+		en_err(priv, "Failed to map RX buffer\n");
+		goto err_hwq;
+	}
+	ring->buf = ring->wqres.buf.direct.buf;
+
+	ring->hwtstamp_rx_filter = priv->hwtstamp_config.rx_filter;
+
+	*pring = ring;
+	return 0;
+
+err_hwq:
+	mlx4_free_hwq_res(mdev->dev, &ring->wqres, ring->buf_size);
+err_info:
+	vfree(ring->rx_info);
+	ring->rx_info = NULL;
+err_ring:
+	kfree(ring);
+	*pring = NULL;
+
+	return err;
+}
+
+int mlx4_en_activate_rx_rings(struct mlx4_en_priv *priv)
+{
+	struct mlx4_en_rx_ring *ring;
+	int i;
+	int ring_ind;
+	int err;
+	int stride = roundup_pow_of_two(sizeof(struct mlx4_en_rx_desc) +
+					DS_SIZE * priv->num_frags);
+
+	for (ring_ind = 0; ring_ind < priv->rx_ring_num; ring_ind++) {
+		ring = priv->rx_ring[ring_ind];
+
+		ring->prod = 0;
+		ring->cons = 0;
+		ring->actual_size = 0;
+		ring->cqn = priv->rx_cq[ring_ind]->mcq.cqn;
+
+		ring->stride = stride;
+		if (ring->stride <= TXBB_SIZE)
+			ring->buf += TXBB_SIZE;
+
+		ring->log_stride = ffs(ring->stride) - 1;
+		ring->buf_size = ring->size * ring->stride;
+
+		memset(ring->buf, 0, ring->buf_size);
+		mlx4_en_update_rx_prod_db(ring);
+
+		/* Initialize all descriptors */
+		for (i = 0; i < ring->size; i++)
+			mlx4_en_init_rx_desc(priv, ring, i);
+
+		/* Initialize page allocators */
+		err = mlx4_en_init_allocator(priv, ring);
+		if (err) {
+			en_err(priv, "Failed initializing ring allocator\n");
+			if (ring->stride <= TXBB_SIZE)
+				ring->buf -= TXBB_SIZE;
+			ring_ind--;
+			goto err_allocator;
+		}
+	}
+	err = mlx4_en_fill_rx_buffers(priv);
+	if (err)
+		goto err_buffers;
+
+	for (ring_ind = 0; ring_ind < priv->rx_ring_num; ring_ind++) {
+		ring = priv->rx_ring[ring_ind];
+
+		ring->size_mask = ring->actual_size - 1;
+		mlx4_en_update_rx_prod_db(ring);
+	}
+
+	return 0;
+
+err_buffers:
+	for (ring_ind = 0; ring_ind < priv->rx_ring_num; ring_ind++)
+		mlx4_en_free_rx_buf(priv, priv->rx_ring[ring_ind]);
+
+	ring_ind = priv->rx_ring_num - 1;
+err_allocator:
+	while (ring_ind >= 0) {
+		if (priv->rx_ring[ring_ind]->stride <= TXBB_SIZE)
+			priv->rx_ring[ring_ind]->buf -= TXBB_SIZE;
+		mlx4_en_destroy_allocator(priv, priv->rx_ring[ring_ind]);
+		ring_ind--;
+	}
+	return err;
+}
+
+void mlx4_en_destroy_rx_ring(struct mlx4_en_priv *priv,
+			     struct mlx4_en_rx_ring **pring,
+			     u32 size, u16 stride)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_en_rx_ring *ring = *pring;
+
+	mlx4_en_unmap_buffer(&ring->wqres.buf);
+	mlx4_free_hwq_res(mdev->dev, &ring->wqres, size * stride + TXBB_SIZE);
+	vfree(ring->rx_info);
+	ring->rx_info = NULL;
+	kfree(ring);
+	*pring = NULL;
+#ifdef CONFIG_RFS_ACCEL
+	mlx4_en_cleanup_filters(priv);
+#endif
+}
+
+void mlx4_en_deactivate_rx_ring(struct mlx4_en_priv *priv,
+				struct mlx4_en_rx_ring *ring)
+{
+	mlx4_en_free_rx_buf(priv, ring);
+	if (ring->stride <= TXBB_SIZE)
+		ring->buf -= TXBB_SIZE;
+	mlx4_en_destroy_allocator(priv, ring);
+}
+
+
+static int mlx4_en_complete_rx_desc(struct mlx4_en_priv *priv,
+				    struct mlx4_en_rx_desc *rx_desc,
+				    struct mlx4_en_rx_alloc *frags,
+				    struct sk_buff *skb,
+				    int length)
+{
+	struct skb_frag_struct *skb_frags_rx = skb_shinfo(skb)->frags;
+	struct mlx4_en_frag_info *frag_info;
+	int nr;
+	dma_addr_t dma;
+
+	/* Collect used fragments while replacing them in the HW descriptors */
+	for (nr = 0; nr < priv->num_frags; nr++) {
+		frag_info = &priv->frag_info[nr];
+		if (length <= frag_info->frag_prefix_size)
+			break;
+		if (!frags[nr].page)
+			goto fail;
+
+		dma = be64_to_cpu(rx_desc->data[nr].addr);
+		dma_sync_single_for_cpu(priv->ddev, dma, frag_info->frag_size,
+					DMA_FROM_DEVICE);
+
+		/* Save page reference in skb */
+		__skb_frag_set_page(&skb_frags_rx[nr], frags[nr].page);
+		skb_frag_size_set(&skb_frags_rx[nr], frag_info->frag_size);
+		skb_frags_rx[nr].page_offset = frags[nr].page_offset;
+		skb->truesize += frag_info->frag_stride;
+		frags[nr].page = NULL;
+	}
+	/* Adjust size of last fragment to match actual length */
+	if (nr > 0)
+		skb_frag_size_set(&skb_frags_rx[nr - 1],
+			length - priv->frag_info[nr - 1].frag_prefix_size);
+	return nr;
+
+fail:
+	while (nr > 0) {
+		nr--;
+		__skb_frag_unref(&skb_frags_rx[nr]);
+	}
+	return 0;
+}
+
+
+static struct sk_buff *mlx4_en_rx_skb(struct mlx4_en_priv *priv,
+				      struct mlx4_en_rx_desc *rx_desc,
+				      struct mlx4_en_rx_alloc *frags,
+				      unsigned int length)
+{
+	struct sk_buff *skb;
+	void *va;
+	int used_frags;
+	dma_addr_t dma;
+
+	skb = netdev_alloc_skb(priv->dev, SMALL_PACKET_SIZE + NET_IP_ALIGN);
+	if (!skb) {
+		en_dbg(RX_ERR, priv, "Failed allocating skb\n");
+		return NULL;
+	}
+	skb_reserve(skb, NET_IP_ALIGN);
+	skb->len = length;
+
+	/* Get pointer to first fragment so we could copy the headers into the
+	 * (linear part of the) skb */
+	va = page_address(frags[0].page) + frags[0].page_offset;
+
+	if (length <= SMALL_PACKET_SIZE) {
+		/* We are copying all relevant data to the skb - temporarily
+		 * sync buffers for the copy */
+		dma = be64_to_cpu(rx_desc->data[0].addr);
+		dma_sync_single_for_cpu(priv->ddev, dma, length,
+					DMA_FROM_DEVICE);
+		skb_copy_to_linear_data(skb, va, length);
+		skb->tail += length;
+	} else {
+		unsigned int pull_len;
+
+		/* Move relevant fragments to skb */
+		used_frags = mlx4_en_complete_rx_desc(priv, rx_desc, frags,
+							skb, length);
+		if (unlikely(!used_frags)) {
+			kfree_skb(skb);
+			return NULL;
+		}
+		skb_shinfo(skb)->nr_frags = used_frags;
+
+		pull_len = eth_get_headlen(va, SMALL_PACKET_SIZE);
+		/* Copy headers into the skb linear buffer */
+		memcpy(skb->data, va, pull_len);
+		skb->tail += pull_len;
+
+		/* Skip headers in first fragment */
+		skb_shinfo(skb)->frags[0].page_offset += pull_len;
+
+		/* Adjust size of first fragment */
+		skb_frag_size_sub(&skb_shinfo(skb)->frags[0], pull_len);
+		skb->data_len = length - pull_len;
+	}
+	return skb;
+}
+
+static void validate_loopback(struct mlx4_en_priv *priv, struct sk_buff *skb)
+{
+	int i;
+	int offset = ETH_HLEN;
+
+	for (i = 0; i < MLX4_LOOPBACK_TEST_PAYLOAD; i++, offset++) {
+		if (*(skb->data + offset) != (unsigned char) (i & 0xff))
+			goto out_loopback;
+	}
+	/* Loopback found */
+	priv->loopback_ok = 1;
+
+out_loopback:
+	dev_kfree_skb_any(skb);
+}
+
+static void mlx4_en_refill_rx_buffers(struct mlx4_en_priv *priv,
+				     struct mlx4_en_rx_ring *ring)
+{
+	int index = ring->prod & ring->size_mask;
+
+	while ((u32) (ring->prod - ring->cons) < ring->actual_size) {
+		if (mlx4_en_prepare_rx_desc(priv, ring, index, GFP_ATOMIC))
+			break;
+		ring->prod++;
+		index = ring->prod & ring->size_mask;
+	}
+}
+
+int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int budget)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_cqe *cqe;
+	struct mlx4_en_rx_ring *ring = priv->rx_ring[cq->ring];
+	struct mlx4_en_rx_alloc *frags;
+	struct mlx4_en_rx_desc *rx_desc;
+	struct sk_buff *skb;
+	int index;
+	int nr;
+	unsigned int length;
+	int polled = 0;
+	int ip_summed;
+	int factor = priv->cqe_factor;
+	u64 timestamp;
+	bool l2_tunnel;
+
+	if (!priv->port_up)
+		return 0;
+
+	if (budget <= 0)
+		return polled;
+
+	/* We assume a 1:1 mapping between CQEs and Rx descriptors, so Rx
+	 * descriptor offset can be deduced from the CQE index instead of
+	 * reading 'cqe->index' */
+	index = cq->mcq.cons_index & ring->size_mask;
+	cqe = mlx4_en_get_cqe(cq->buf, index, priv->cqe_size) + factor;
+
+	/* Process all completed CQEs */
+	while (XNOR(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK,
+		    cq->mcq.cons_index & cq->size)) {
+
+		frags = ring->rx_info + (index << priv->log_rx_info);
+		rx_desc = ring->buf + (index << ring->log_stride);
+
+		/*
+		 * make sure we read the CQE after we read the ownership bit
+		 */
+		rmb();
+
+		/* Drop packet on bad receive or bad checksum */
+		if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
+						MLX4_CQE_OPCODE_ERROR)) {
+			en_err(priv, "CQE completed in error - vendor syndrom:%d syndrom:%d\n",
+			       ((struct mlx4_err_cqe *)cqe)->vendor_err_syndrome,
+			       ((struct mlx4_err_cqe *)cqe)->syndrome);
+			goto next;
+		}
+		if (unlikely(cqe->badfcs_enc & MLX4_CQE_BAD_FCS)) {
+			en_dbg(RX_ERR, priv, "Accepted frame with bad FCS\n");
+			goto next;
+		}
+
+		/* Check if we need to drop the packet if SRIOV is not enabled
+		 * and not performing the selftest or flb disabled
+		 */
+		if (priv->flags & MLX4_EN_FLAG_RX_FILTER_NEEDED) {
+			struct ethhdr *ethh;
+			dma_addr_t dma;
+			/* Get pointer to first fragment since we haven't
+			 * skb yet and cast it to ethhdr struct
+			 */
+			dma = be64_to_cpu(rx_desc->data[0].addr);
+			dma_sync_single_for_cpu(priv->ddev, dma, sizeof(*ethh),
+						DMA_FROM_DEVICE);
+			ethh = (struct ethhdr *)(page_address(frags[0].page) +
+						 frags[0].page_offset);
+
+			if (is_multicast_ether_addr(ethh->h_dest)) {
+				struct mlx4_mac_entry *entry;
+				struct hlist_head *bucket;
+				unsigned int mac_hash;
+
+				/* Drop the packet, since HW loopback-ed it */
+				mac_hash = ethh->h_source[MLX4_EN_MAC_HASH_IDX];
+				bucket = &priv->mac_hash[mac_hash];
+				rcu_read_lock();
+				hlist_for_each_entry_rcu(entry, bucket, hlist) {
+					if (ether_addr_equal_64bits(entry->mac,
+								    ethh->h_source)) {
+						rcu_read_unlock();
+						goto next;
+					}
+				}
+				rcu_read_unlock();
+			}
+		}
+
+		/*
+		 * Packet is OK - process it.
+		 */
+		length = be32_to_cpu(cqe->byte_cnt);
+		length -= ring->fcs_del;
+		ring->bytes += length;
+		ring->packets++;
+		l2_tunnel = (dev->hw_enc_features & NETIF_F_RXCSUM) &&
+			(cqe->vlan_my_qpn & cpu_to_be32(MLX4_CQE_L2_TUNNEL));
+
+		if (likely(dev->features & NETIF_F_RXCSUM)) {
+			if ((cqe->status & cpu_to_be16(MLX4_CQE_STATUS_IPOK)) &&
+			    (cqe->checksum == cpu_to_be16(0xffff))) {
+				ring->csum_ok++;
+				/* This packet is eligible for GRO if it is:
+				 * - DIX Ethernet (type interpretation)
+				 * - TCP/IP (v4)
+				 * - without IP options
+				 * - not an IP fragment
+				 * - no LLS polling in progress
+				 */
+				if (!mlx4_en_cq_busy_polling(cq) &&
+				    (dev->features & NETIF_F_GRO)) {
+					struct sk_buff *gro_skb = napi_get_frags(&cq->napi);
+					if (!gro_skb)
+						goto next;
+
+					nr = mlx4_en_complete_rx_desc(priv,
+						rx_desc, frags, gro_skb,
+						length);
+					if (!nr)
+						goto next;
+
+					skb_shinfo(gro_skb)->nr_frags = nr;
+					gro_skb->len = length;
+					gro_skb->data_len = length;
+					gro_skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+					if (l2_tunnel)
+						gro_skb->csum_level = 1;
+					if ((cqe->vlan_my_qpn &
+					    cpu_to_be32(MLX4_CQE_VLAN_PRESENT_MASK)) &&
+					    (dev->features & NETIF_F_HW_VLAN_CTAG_RX)) {
+						u16 vid = be16_to_cpu(cqe->sl_vid);
+
+						__vlan_hwaccel_put_tag(gro_skb, htons(ETH_P_8021Q), vid);
+					}
+
+					if (dev->features & NETIF_F_RXHASH)
+						skb_set_hash(gro_skb,
+							     be32_to_cpu(cqe->immed_rss_invalid),
+							     PKT_HASH_TYPE_L3);
+
+					skb_record_rx_queue(gro_skb, cq->ring);
+					skb_mark_napi_id(gro_skb, &cq->napi);
+
+					if (ring->hwtstamp_rx_filter == HWTSTAMP_FILTER_ALL) {
+						timestamp = mlx4_en_get_cqe_ts(cqe);
+						mlx4_en_fill_hwtstamps(mdev,
+								       skb_hwtstamps(gro_skb),
+								       timestamp);
+					}
+
+					napi_gro_frags(&cq->napi);
+					goto next;
+				}
+
+				/* GRO not possible, complete processing here */
+				ip_summed = CHECKSUM_UNNECESSARY;
+			} else {
+				ip_summed = CHECKSUM_NONE;
+				ring->csum_none++;
+			}
+		} else {
+			ip_summed = CHECKSUM_NONE;
+			ring->csum_none++;
+		}
+
+		skb = mlx4_en_rx_skb(priv, rx_desc, frags, length);
+		if (!skb) {
+			priv->stats.rx_dropped++;
+			goto next;
+		}
+
+                if (unlikely(priv->validate_loopback)) {
+			validate_loopback(priv, skb);
+			goto next;
+		}
+
+		skb->ip_summed = ip_summed;
+		skb->protocol = eth_type_trans(skb, dev);
+		skb_record_rx_queue(skb, cq->ring);
+
+		if (l2_tunnel && ip_summed == CHECKSUM_UNNECESSARY)
+			skb->csum_level = 1;
+
+		if (dev->features & NETIF_F_RXHASH)
+			skb_set_hash(skb,
+				     be32_to_cpu(cqe->immed_rss_invalid),
+				     PKT_HASH_TYPE_L3);
+
+		if ((be32_to_cpu(cqe->vlan_my_qpn) &
+		    MLX4_CQE_VLAN_PRESENT_MASK) &&
+		    (dev->features & NETIF_F_HW_VLAN_CTAG_RX))
+			__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), be16_to_cpu(cqe->sl_vid));
+
+		if (ring->hwtstamp_rx_filter == HWTSTAMP_FILTER_ALL) {
+			timestamp = mlx4_en_get_cqe_ts(cqe);
+			mlx4_en_fill_hwtstamps(mdev, skb_hwtstamps(skb),
+					       timestamp);
+		}
+
+		skb_mark_napi_id(skb, &cq->napi);
+
+		if (!mlx4_en_cq_busy_polling(cq))
+			napi_gro_receive(&cq->napi, skb);
+		else
+			netif_receive_skb(skb);
+
+next:
+		for (nr = 0; nr < priv->num_frags; nr++)
+			mlx4_en_free_frag(priv, frags, nr);
+
+		++cq->mcq.cons_index;
+		index = (cq->mcq.cons_index) & ring->size_mask;
+		cqe = mlx4_en_get_cqe(cq->buf, index, priv->cqe_size) + factor;
+		if (++polled == budget)
+			goto out;
+	}
+
+out:
+	AVG_PERF_COUNTER(priv->pstats.rx_coal_avg, polled);
+	mlx4_cq_set_ci(&cq->mcq);
+	wmb(); /* ensure HW sees CQ consumer before we post new buffers */
+	ring->cons = cq->mcq.cons_index;
+	mlx4_en_refill_rx_buffers(priv, ring);
+	mlx4_en_update_rx_prod_db(ring);
+	return polled;
+}
+
+
+void mlx4_en_rx_irq(struct mlx4_cq *mcq)
+{
+	struct mlx4_en_cq *cq = container_of(mcq, struct mlx4_en_cq, mcq);
+	struct mlx4_en_priv *priv = netdev_priv(cq->dev);
+
+	if (priv->port_up)
+		napi_schedule(&cq->napi);
+	else
+		mlx4_en_arm_cq(priv, cq);
+}
+
+/* Rx CQ polling - called by NAPI */
+int mlx4_en_poll_rx_cq(struct napi_struct *napi, int budget)
+{
+	struct mlx4_en_cq *cq = container_of(napi, struct mlx4_en_cq, napi);
+	struct net_device *dev = cq->dev;
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	int done;
+
+	if (!mlx4_en_cq_lock_napi(cq))
+		return budget;
+
+	done = mlx4_en_process_rx_cq(dev, cq, budget);
+
+	mlx4_en_cq_unlock_napi(cq);
+
+	/* If we used up all the quota - we're probably not done yet... */
+	if (done == budget) {
+		int cpu_curr;
+		const struct cpumask *aff;
+
+		INC_PERF_COUNTER(priv->pstats.napi_quota);
+
+		cpu_curr = smp_processor_id();
+		aff = irq_desc_get_irq_data(cq->irq_desc)->affinity;
+
+		if (unlikely(!cpumask_test_cpu(cpu_curr, aff))) {
+			/* Current cpu is not according to smp_irq_affinity -
+			 * probably affinity changed. need to stop this NAPI
+			 * poll, and restart it on the right CPU
+			 */
+			napi_complete(napi);
+			mlx4_en_arm_cq(priv, cq);
+			return 0;
+		}
+	} else {
+		/* Done for now */
+		napi_complete(napi);
+		mlx4_en_arm_cq(priv, cq);
+	}
+	return done;
+}
+
+static const int frag_sizes[] = {
+	FRAG_SZ0,
+	FRAG_SZ1,
+	FRAG_SZ2,
+	FRAG_SZ3
+};
+
+void mlx4_en_calc_rx_buf(struct net_device *dev)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	int eff_mtu = dev->mtu + ETH_HLEN + VLAN_HLEN;
+	int buf_size = 0;
+	int i = 0;
+
+	while (buf_size < eff_mtu) {
+		priv->frag_info[i].frag_size =
+			(eff_mtu > buf_size + frag_sizes[i]) ?
+				frag_sizes[i] : eff_mtu - buf_size;
+		priv->frag_info[i].frag_prefix_size = buf_size;
+		if (!i)	{
+			priv->frag_info[i].frag_align = NET_IP_ALIGN;
+			priv->frag_info[i].frag_stride =
+				ALIGN(frag_sizes[i] + NET_IP_ALIGN, SMP_CACHE_BYTES);
+		} else {
+			priv->frag_info[i].frag_align = 0;
+			priv->frag_info[i].frag_stride =
+				ALIGN(frag_sizes[i], SMP_CACHE_BYTES);
+		}
+		buf_size += priv->frag_info[i].frag_size;
+		i++;
+	}
+
+	priv->num_frags = i;
+	priv->rx_skb_size = eff_mtu;
+	priv->log_rx_info = ROUNDUP_LOG2(i * sizeof(struct mlx4_en_rx_alloc));
+
+	en_dbg(DRV, priv, "Rx buffer scatter-list (effective-mtu:%d num_frags:%d):\n",
+	       eff_mtu, priv->num_frags);
+	for (i = 0; i < priv->num_frags; i++) {
+		en_err(priv,
+		       "  frag:%d - size:%d prefix:%d align:%d stride:%d\n",
+		       i,
+		       priv->frag_info[i].frag_size,
+		       priv->frag_info[i].frag_prefix_size,
+		       priv->frag_info[i].frag_align,
+		       priv->frag_info[i].frag_stride);
+	}
+}
+
+/* RSS related functions */
+
+static int mlx4_en_config_rss_qp(struct mlx4_en_priv *priv, int qpn,
+				 struct mlx4_en_rx_ring *ring,
+				 enum mlx4_qp_state *state,
+				 struct mlx4_qp *qp)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_qp_context *context;
+	int err = 0;
+
+	context = kmalloc(sizeof(*context), GFP_KERNEL);
+	if (!context)
+		return -ENOMEM;
+
+	err = mlx4_qp_alloc(mdev->dev, qpn, qp, GFP_KERNEL);
+	if (err) {
+		en_err(priv, "Failed to allocate qp #%x\n", qpn);
+		goto out;
+	}
+	qp->event = mlx4_en_sqp_event;
+
+	memset(context, 0, sizeof *context);
+	mlx4_en_fill_qp_context(priv, ring->actual_size, ring->stride, 0, 0,
+				qpn, ring->cqn, -1, context);
+	context->db_rec_addr = cpu_to_be64(ring->wqres.db.dma);
+
+	/* Cancel FCS removal if FW allows */
+	if (mdev->dev->caps.flags & MLX4_DEV_CAP_FLAG_FCS_KEEP) {
+		context->param3 |= cpu_to_be32(1 << 29);
+		ring->fcs_del = ETH_FCS_LEN;
+	} else
+		ring->fcs_del = 0;
+
+	err = mlx4_qp_to_ready(mdev->dev, &ring->wqres.mtt, context, qp, state);
+	if (err) {
+		mlx4_qp_remove(mdev->dev, qp);
+		mlx4_qp_free(mdev->dev, qp);
+	}
+	mlx4_en_update_rx_prod_db(ring);
+out:
+	kfree(context);
+	return err;
+}
+
+int mlx4_en_create_drop_qp(struct mlx4_en_priv *priv)
+{
+	int err;
+	u32 qpn;
+
+	err = mlx4_qp_reserve_range(priv->mdev->dev, 1, 1, &qpn);
+	if (err) {
+		en_err(priv, "Failed reserving drop qpn\n");
+		return err;
+	}
+	err = mlx4_qp_alloc(priv->mdev->dev, qpn, &priv->drop_qp, GFP_KERNEL);
+	if (err) {
+		en_err(priv, "Failed allocating drop qp\n");
+		mlx4_qp_release_range(priv->mdev->dev, qpn, 1);
+		return err;
+	}
+
+	return 0;
+}
+
+void mlx4_en_destroy_drop_qp(struct mlx4_en_priv *priv)
+{
+	u32 qpn;
+
+	qpn = priv->drop_qp.qpn;
+	mlx4_qp_remove(priv->mdev->dev, &priv->drop_qp);
+	mlx4_qp_free(priv->mdev->dev, &priv->drop_qp);
+	mlx4_qp_release_range(priv->mdev->dev, qpn, 1);
+}
+
+/* Allocate rx qp's and configure them according to rss map */
+int mlx4_en_config_rss_steer(struct mlx4_en_priv *priv)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_en_rss_map *rss_map = &priv->rss_map;
+	struct mlx4_qp_context context;
+	struct mlx4_rss_context *rss_context;
+	int rss_rings;
+	void *ptr;
+	u8 rss_mask = (MLX4_RSS_IPV4 | MLX4_RSS_TCP_IPV4 | MLX4_RSS_IPV6 |
+			MLX4_RSS_TCP_IPV6);
+	int i, qpn;
+	int err = 0;
+	int good_qps = 0;
+	static const u32 rsskey[10] = { 0xD181C62C, 0xF7F4DB5B, 0x1983A2FC,
+				0x943E1ADB, 0xD9389E6B, 0xD1039C2C, 0xA74499AD,
+				0x593D56D9, 0xF3253C06, 0x2ADC1FFC};
+
+	en_dbg(DRV, priv, "Configuring rss steering\n");
+	err = mlx4_qp_reserve_range(mdev->dev, priv->rx_ring_num,
+				    priv->rx_ring_num,
+				    &rss_map->base_qpn);
+	if (err) {
+		en_err(priv, "Failed reserving %d qps\n", priv->rx_ring_num);
+		return err;
+	}
+
+	for (i = 0; i < priv->rx_ring_num; i++) {
+		qpn = rss_map->base_qpn + i;
+		err = mlx4_en_config_rss_qp(priv, qpn, priv->rx_ring[i],
+					    &rss_map->state[i],
+					    &rss_map->qps[i]);
+		if (err)
+			goto rss_err;
+
+		++good_qps;
+	}
+
+	/* Configure RSS indirection qp */
+	err = mlx4_qp_alloc(mdev->dev, priv->base_qpn, &rss_map->indir_qp, GFP_KERNEL);
+	if (err) {
+		en_err(priv, "Failed to allocate RSS indirection QP\n");
+		goto rss_err;
+	}
+	rss_map->indir_qp.event = mlx4_en_sqp_event;
+	mlx4_en_fill_qp_context(priv, 0, 0, 0, 1, priv->base_qpn,
+				priv->rx_ring[0]->cqn, -1, &context);
+
+	if (!priv->prof->rss_rings || priv->prof->rss_rings > priv->rx_ring_num)
+		rss_rings = priv->rx_ring_num;
+	else
+		rss_rings = priv->prof->rss_rings;
+
+	ptr = ((void *) &context) + offsetof(struct mlx4_qp_context, pri_path)
+					+ MLX4_RSS_OFFSET_IN_QPC_PRI_PATH;
+	rss_context = ptr;
+	rss_context->base_qpn = cpu_to_be32(ilog2(rss_rings) << 24 |
+					    (rss_map->base_qpn));
+	rss_context->default_qpn = cpu_to_be32(rss_map->base_qpn);
+	if (priv->mdev->profile.udp_rss) {
+		rss_mask |=  MLX4_RSS_UDP_IPV4 | MLX4_RSS_UDP_IPV6;
+		rss_context->base_qpn_udp = rss_context->default_qpn;
+	}
+
+	if (mdev->dev->caps.tunnel_offload_mode == MLX4_TUNNEL_OFFLOAD_MODE_VXLAN) {
+		en_info(priv, "Setting RSS context tunnel type to RSS on inner headers\n");
+		rss_mask |= MLX4_RSS_BY_INNER_HEADERS;
+	}
+
+	rss_context->flags = rss_mask;
+	rss_context->hash_fn = MLX4_RSS_HASH_TOP;
+	for (i = 0; i < 10; i++)
+		rss_context->rss_key[i] = cpu_to_be32(rsskey[i]);
+
+	err = mlx4_qp_to_ready(mdev->dev, &priv->res.mtt, &context,
+			       &rss_map->indir_qp, &rss_map->indir_state);
+	if (err)
+		goto indir_err;
+
+	return 0;
+
+indir_err:
+	mlx4_qp_modify(mdev->dev, NULL, rss_map->indir_state,
+		       MLX4_QP_STATE_RST, NULL, 0, 0, &rss_map->indir_qp);
+	mlx4_qp_remove(mdev->dev, &rss_map->indir_qp);
+	mlx4_qp_free(mdev->dev, &rss_map->indir_qp);
+rss_err:
+	for (i = 0; i < good_qps; i++) {
+		mlx4_qp_modify(mdev->dev, NULL, rss_map->state[i],
+			       MLX4_QP_STATE_RST, NULL, 0, 0, &rss_map->qps[i]);
+		mlx4_qp_remove(mdev->dev, &rss_map->qps[i]);
+		mlx4_qp_free(mdev->dev, &rss_map->qps[i]);
+	}
+	mlx4_qp_release_range(mdev->dev, rss_map->base_qpn, priv->rx_ring_num);
+	return err;
+}
+
+void mlx4_en_release_rss_steer(struct mlx4_en_priv *priv)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_en_rss_map *rss_map = &priv->rss_map;
+	int i;
+
+	mlx4_qp_modify(mdev->dev, NULL, rss_map->indir_state,
+		       MLX4_QP_STATE_RST, NULL, 0, 0, &rss_map->indir_qp);
+	mlx4_qp_remove(mdev->dev, &rss_map->indir_qp);
+	mlx4_qp_free(mdev->dev, &rss_map->indir_qp);
+
+	for (i = 0; i < priv->rx_ring_num; i++) {
+		mlx4_qp_modify(mdev->dev, NULL, rss_map->state[i],
+			       MLX4_QP_STATE_RST, NULL, 0, 0, &rss_map->qps[i]);
+		mlx4_qp_remove(mdev->dev, &rss_map->qps[i]);
+		mlx4_qp_free(mdev->dev, &rss_map->qps[i]);
+	}
+	mlx4_qp_release_range(mdev->dev, rss_map->base_qpn, priv->rx_ring_num);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_selftest.c b/drivers/net/ethernet/mellanox/mlx4/en_selftest.c
new file mode 100644
index 0000000..49d5afc
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/en_selftest.c
@@ -0,0 +1,178 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/ethtool.h>
+#include <linux/netdevice.h>
+#include <linux/delay.h>
+#include <linux/mlx4/driver.h>
+
+#include "mlx4_en.h"
+
+
+static int mlx4_en_test_registers(struct mlx4_en_priv *priv)
+{
+	return mlx4_cmd(priv->mdev->dev, 0, 0, 0, MLX4_CMD_HW_HEALTH_CHECK,
+			MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED);
+}
+
+static int mlx4_en_test_loopback_xmit(struct mlx4_en_priv *priv)
+{
+	struct sk_buff *skb;
+	struct ethhdr *ethh;
+	unsigned char *packet;
+	unsigned int packet_size = MLX4_LOOPBACK_TEST_PAYLOAD;
+	unsigned int i;
+	int err;
+
+
+	/* build the pkt before xmit */
+	skb = netdev_alloc_skb(priv->dev, MLX4_LOOPBACK_TEST_PAYLOAD + ETH_HLEN + NET_IP_ALIGN);
+	if (!skb)
+		return -ENOMEM;
+
+	skb_reserve(skb, NET_IP_ALIGN);
+
+	ethh = (struct ethhdr *)skb_put(skb, sizeof(struct ethhdr));
+	packet	= (unsigned char *)skb_put(skb, packet_size);
+	memcpy(ethh->h_dest, priv->dev->dev_addr, ETH_ALEN);
+	memset(ethh->h_source, 0, ETH_ALEN);
+	ethh->h_proto = htons(ETH_P_ARP);
+	skb_set_mac_header(skb, 0);
+	for (i = 0; i < packet_size; ++i)	/* fill our packet */
+		packet[i] = (unsigned char)(i & 0xff);
+
+	/* xmit the pkt */
+	err = mlx4_en_xmit(skb, priv->dev);
+	return err;
+}
+
+static int mlx4_en_test_loopback(struct mlx4_en_priv *priv)
+{
+	u32 loopback_ok = 0;
+	int i;
+
+
+        priv->loopback_ok = 0;
+	priv->validate_loopback = 1;
+
+	mlx4_en_update_loopback_state(priv->dev, priv->dev->features);
+
+	/* xmit */
+	if (mlx4_en_test_loopback_xmit(priv)) {
+		en_err(priv, "Transmitting loopback packet failed\n");
+		goto mlx4_en_test_loopback_exit;
+	}
+
+	/* polling for result */
+	for (i = 0; i < MLX4_EN_LOOPBACK_RETRIES; ++i) {
+		msleep(MLX4_EN_LOOPBACK_TIMEOUT);
+		if (priv->loopback_ok) {
+			loopback_ok = 1;
+			break;
+		}
+	}
+	if (!loopback_ok)
+		en_err(priv, "Loopback packet didn't arrive\n");
+
+mlx4_en_test_loopback_exit:
+
+	priv->validate_loopback = 0;
+	mlx4_en_update_loopback_state(priv->dev, priv->dev->features);
+	return !loopback_ok;
+}
+
+
+static int mlx4_en_test_link(struct mlx4_en_priv *priv)
+{
+	if (mlx4_en_QUERY_PORT(priv->mdev, priv->port))
+		return -ENOMEM;
+	if (priv->port_state.link_state == 1)
+		return 0;
+	else
+		return 1;
+}
+
+static int mlx4_en_test_speed(struct mlx4_en_priv *priv)
+{
+
+	if (mlx4_en_QUERY_PORT(priv->mdev, priv->port))
+		return -ENOMEM;
+
+	/* The device supports 1G, 10G and 40G speeds */
+	if (priv->port_state.link_speed != 1000 &&
+	    priv->port_state.link_speed != 10000 &&
+	    priv->port_state.link_speed != 40000)
+		return priv->port_state.link_speed;
+	return 0;
+}
+
+
+void mlx4_en_ex_selftest(struct net_device *dev, u32 *flags, u64 *buf)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	int i, carrier_ok;
+
+	memset(buf, 0, sizeof(u64) * MLX4_EN_NUM_SELF_TEST);
+
+	if (*flags & ETH_TEST_FL_OFFLINE) {
+		/* disable the interface */
+		carrier_ok = netif_carrier_ok(dev);
+
+		netif_carrier_off(dev);
+		/* Wait until all tx queues are empty.
+		 * there should not be any additional incoming traffic
+		 * since we turned the carrier off */
+		msleep(200);
+
+		if (priv->mdev->dev->caps.flags &
+					MLX4_DEV_CAP_FLAG_UC_LOOPBACK) {
+			buf[3] = mlx4_en_test_registers(priv);
+			if (priv->port_up)
+				buf[4] = mlx4_en_test_loopback(priv);
+		}
+
+		if (carrier_ok)
+			netif_carrier_on(dev);
+
+	}
+	buf[0] = mlx4_test_interrupts(mdev->dev);
+	buf[1] = mlx4_en_test_link(priv);
+	buf[2] = mlx4_en_test_speed(priv);
+
+	for (i = 0; i < MLX4_EN_NUM_SELF_TEST; i++) {
+		if (buf[i])
+			*flags |= ETH_TEST_FL_FAILED;
+	}
+}
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
new file mode 100644
index 0000000..454d9fe
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
@@ -0,0 +1,997 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <asm/page.h>
+#include <linux/mlx4/cq.h>
+#include <linux/slab.h>
+#include <linux/mlx4/qp.h>
+#include <linux/skbuff.h>
+#include <linux/if_vlan.h>
+#include <linux/prefetch.h>
+#include <linux/vmalloc.h>
+#include <linux/tcp.h>
+#include <linux/ip.h>
+#include <linux/moduleparam.h>
+
+#include "mlx4_en.h"
+
+int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv,
+			   struct mlx4_en_tx_ring **pring, int qpn, u32 size,
+			   u16 stride, int node, int queue_index)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_en_tx_ring *ring;
+	int tmp;
+	int err;
+
+	ring = kzalloc_node(sizeof(*ring), GFP_KERNEL, node);
+	if (!ring) {
+		ring = kzalloc(sizeof(*ring), GFP_KERNEL);
+		if (!ring) {
+			en_err(priv, "Failed allocating TX ring\n");
+			return -ENOMEM;
+		}
+	}
+
+	ring->size = size;
+	ring->size_mask = size - 1;
+	ring->stride = stride;
+
+	tmp = size * sizeof(struct mlx4_en_tx_info);
+	ring->tx_info = kmalloc_node(tmp, GFP_KERNEL | __GFP_NOWARN, node);
+	if (!ring->tx_info) {
+		ring->tx_info = vmalloc(tmp);
+		if (!ring->tx_info) {
+			err = -ENOMEM;
+			goto err_ring;
+		}
+	}
+
+	en_dbg(DRV, priv, "Allocated tx_info ring at addr:%p size:%d\n",
+		 ring->tx_info, tmp);
+
+	ring->bounce_buf = kmalloc_node(MAX_DESC_SIZE, GFP_KERNEL, node);
+	if (!ring->bounce_buf) {
+		ring->bounce_buf = kmalloc(MAX_DESC_SIZE, GFP_KERNEL);
+		if (!ring->bounce_buf) {
+			err = -ENOMEM;
+			goto err_info;
+		}
+	}
+	ring->buf_size = ALIGN(size * ring->stride, MLX4_EN_PAGE_SIZE);
+
+	/* Allocate HW buffers on provided NUMA node */
+	set_dev_node(&mdev->dev->pdev->dev, node);
+	err = mlx4_alloc_hwq_res(mdev->dev, &ring->wqres, ring->buf_size,
+				 2 * PAGE_SIZE);
+	set_dev_node(&mdev->dev->pdev->dev, mdev->dev->numa_node);
+	if (err) {
+		en_err(priv, "Failed allocating hwq resources\n");
+		goto err_bounce;
+	}
+
+	err = mlx4_en_map_buffer(&ring->wqres.buf);
+	if (err) {
+		en_err(priv, "Failed to map TX buffer\n");
+		goto err_hwq_res;
+	}
+
+	ring->buf = ring->wqres.buf.direct.buf;
+
+	en_dbg(DRV, priv, "Allocated TX ring (addr:%p) - buf:%p size:%d buf_size:%d dma:%llx\n",
+	       ring, ring->buf, ring->size, ring->buf_size,
+	       (unsigned long long) ring->wqres.buf.direct.map);
+
+	ring->qpn = qpn;
+	err = mlx4_qp_alloc(mdev->dev, ring->qpn, &ring->qp, GFP_KERNEL);
+	if (err) {
+		en_err(priv, "Failed allocating qp %d\n", ring->qpn);
+		goto err_map;
+	}
+	ring->qp.event = mlx4_en_sqp_event;
+
+	err = mlx4_bf_alloc(mdev->dev, &ring->bf, node);
+	if (err) {
+		en_dbg(DRV, priv, "working without blueflame (%d)\n", err);
+		ring->bf.uar = &mdev->priv_uar;
+		ring->bf.uar->map = mdev->uar_map;
+		ring->bf_enabled = false;
+		ring->bf_alloced = false;
+		priv->pflags &= ~MLX4_EN_PRIV_FLAGS_BLUEFLAME;
+	} else {
+		ring->bf_alloced = true;
+		ring->bf_enabled = !!(priv->pflags &
+				      MLX4_EN_PRIV_FLAGS_BLUEFLAME);
+	}
+
+	ring->hwtstamp_tx_type = priv->hwtstamp_config.tx_type;
+	ring->queue_index = queue_index;
+
+	if (queue_index < priv->num_tx_rings_p_up && cpu_online(queue_index))
+		cpumask_set_cpu(queue_index, &ring->affinity_mask);
+
+	*pring = ring;
+	return 0;
+
+err_map:
+	mlx4_en_unmap_buffer(&ring->wqres.buf);
+err_hwq_res:
+	mlx4_free_hwq_res(mdev->dev, &ring->wqres, ring->buf_size);
+err_bounce:
+	kfree(ring->bounce_buf);
+	ring->bounce_buf = NULL;
+err_info:
+	kvfree(ring->tx_info);
+	ring->tx_info = NULL;
+err_ring:
+	kfree(ring);
+	*pring = NULL;
+	return err;
+}
+
+void mlx4_en_destroy_tx_ring(struct mlx4_en_priv *priv,
+			     struct mlx4_en_tx_ring **pring)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_en_tx_ring *ring = *pring;
+	en_dbg(DRV, priv, "Destroying tx ring, qpn: %d\n", ring->qpn);
+
+	if (ring->bf_alloced)
+		mlx4_bf_free(mdev->dev, &ring->bf);
+	mlx4_qp_remove(mdev->dev, &ring->qp);
+	mlx4_qp_free(mdev->dev, &ring->qp);
+	mlx4_en_unmap_buffer(&ring->wqres.buf);
+	mlx4_free_hwq_res(mdev->dev, &ring->wqres, ring->buf_size);
+	kfree(ring->bounce_buf);
+	ring->bounce_buf = NULL;
+	kvfree(ring->tx_info);
+	ring->tx_info = NULL;
+	kfree(ring);
+	*pring = NULL;
+}
+
+int mlx4_en_activate_tx_ring(struct mlx4_en_priv *priv,
+			     struct mlx4_en_tx_ring *ring,
+			     int cq, int user_prio)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+	int err;
+
+	ring->cqn = cq;
+	ring->prod = 0;
+	ring->cons = 0xffffffff;
+	ring->last_nr_txbb = 1;
+	memset(ring->tx_info, 0, ring->size * sizeof(struct mlx4_en_tx_info));
+	memset(ring->buf, 0, ring->buf_size);
+
+	ring->qp_state = MLX4_QP_STATE_RST;
+	ring->doorbell_qpn = cpu_to_be32(ring->qp.qpn << 8);
+	ring->mr_key = cpu_to_be32(mdev->mr.key);
+
+	mlx4_en_fill_qp_context(priv, ring->size, ring->stride, 1, 0, ring->qpn,
+				ring->cqn, user_prio, &ring->context);
+	if (ring->bf_alloced)
+		ring->context.usr_page = cpu_to_be32(ring->bf.uar->index);
+
+	err = mlx4_qp_to_ready(mdev->dev, &ring->wqres.mtt, &ring->context,
+			       &ring->qp, &ring->qp_state);
+	if (!user_prio && cpu_online(ring->queue_index))
+		netif_set_xps_queue(priv->dev, &ring->affinity_mask,
+				    ring->queue_index);
+
+	return err;
+}
+
+void mlx4_en_deactivate_tx_ring(struct mlx4_en_priv *priv,
+				struct mlx4_en_tx_ring *ring)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+
+	mlx4_qp_modify(mdev->dev, NULL, ring->qp_state,
+		       MLX4_QP_STATE_RST, NULL, 0, 0, &ring->qp);
+}
+
+static void mlx4_en_stamp_wqe(struct mlx4_en_priv *priv,
+			      struct mlx4_en_tx_ring *ring, int index,
+			      u8 owner)
+{
+	__be32 stamp = cpu_to_be32(STAMP_VAL | (!!owner << STAMP_SHIFT));
+	struct mlx4_en_tx_desc *tx_desc = ring->buf + index * TXBB_SIZE;
+	struct mlx4_en_tx_info *tx_info = &ring->tx_info[index];
+	void *end = ring->buf + ring->buf_size;
+	__be32 *ptr = (__be32 *)tx_desc;
+	int i;
+
+	/* Optimize the common case when there are no wraparounds */
+	if (likely((void *)tx_desc + tx_info->nr_txbb * TXBB_SIZE <= end)) {
+		/* Stamp the freed descriptor */
+		for (i = 0; i < tx_info->nr_txbb * TXBB_SIZE;
+		     i += STAMP_STRIDE) {
+			*ptr = stamp;
+			ptr += STAMP_DWORDS;
+		}
+	} else {
+		/* Stamp the freed descriptor */
+		for (i = 0; i < tx_info->nr_txbb * TXBB_SIZE;
+		     i += STAMP_STRIDE) {
+			*ptr = stamp;
+			ptr += STAMP_DWORDS;
+			if ((void *)ptr >= end) {
+				ptr = ring->buf;
+				stamp ^= cpu_to_be32(0x80000000);
+			}
+		}
+	}
+}
+
+
+static u32 mlx4_en_free_tx_desc(struct mlx4_en_priv *priv,
+				struct mlx4_en_tx_ring *ring,
+				int index, u8 owner, u64 timestamp)
+{
+	struct mlx4_en_tx_info *tx_info = &ring->tx_info[index];
+	struct mlx4_en_tx_desc *tx_desc = ring->buf + index * TXBB_SIZE;
+	struct mlx4_wqe_data_seg *data = (void *) tx_desc + tx_info->data_offset;
+	void *end = ring->buf + ring->buf_size;
+	struct sk_buff *skb = tx_info->skb;
+	int nr_maps = tx_info->nr_maps;
+	int i;
+
+	/* We do not touch skb here, so prefetch skb->users location
+	 * to speedup consume_skb()
+	 */
+	prefetchw(&skb->users);
+
+	if (unlikely(timestamp)) {
+		struct skb_shared_hwtstamps hwts;
+
+		mlx4_en_fill_hwtstamps(priv->mdev, &hwts, timestamp);
+		skb_tstamp_tx(skb, &hwts);
+	}
+
+	/* Optimize the common case when there are no wraparounds */
+	if (likely((void *) tx_desc + tx_info->nr_txbb * TXBB_SIZE <= end)) {
+		if (!tx_info->inl) {
+			if (tx_info->linear)
+				dma_unmap_single(priv->ddev,
+						tx_info->map0_dma,
+						tx_info->map0_byte_count,
+						PCI_DMA_TODEVICE);
+			else
+				dma_unmap_page(priv->ddev,
+					       tx_info->map0_dma,
+					       tx_info->map0_byte_count,
+					       PCI_DMA_TODEVICE);
+			for (i = 1; i < nr_maps; i++) {
+				data++;
+				dma_unmap_page(priv->ddev,
+					(dma_addr_t)be64_to_cpu(data->addr),
+					be32_to_cpu(data->byte_count),
+					PCI_DMA_TODEVICE);
+			}
+		}
+	} else {
+		if (!tx_info->inl) {
+			if ((void *) data >= end) {
+				data = ring->buf + ((void *)data - end);
+			}
+
+			if (tx_info->linear)
+				dma_unmap_single(priv->ddev,
+						tx_info->map0_dma,
+						tx_info->map0_byte_count,
+						PCI_DMA_TODEVICE);
+			else
+				dma_unmap_page(priv->ddev,
+					       tx_info->map0_dma,
+					       tx_info->map0_byte_count,
+					       PCI_DMA_TODEVICE);
+			for (i = 1; i < nr_maps; i++) {
+				data++;
+				/* Check for wraparound before unmapping */
+				if ((void *) data >= end)
+					data = ring->buf;
+				dma_unmap_page(priv->ddev,
+					(dma_addr_t)be64_to_cpu(data->addr),
+					be32_to_cpu(data->byte_count),
+					PCI_DMA_TODEVICE);
+			}
+		}
+	}
+	dev_consume_skb_any(skb);
+	return tx_info->nr_txbb;
+}
+
+
+int mlx4_en_free_tx_buf(struct net_device *dev, struct mlx4_en_tx_ring *ring)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	int cnt = 0;
+
+	/* Skip last polled descriptor */
+	ring->cons += ring->last_nr_txbb;
+	en_dbg(DRV, priv, "Freeing Tx buf - cons:0x%x prod:0x%x\n",
+		 ring->cons, ring->prod);
+
+	if ((u32) (ring->prod - ring->cons) > ring->size) {
+		if (netif_msg_tx_err(priv))
+			en_warn(priv, "Tx consumer passed producer!\n");
+		return 0;
+	}
+
+	while (ring->cons != ring->prod) {
+		ring->last_nr_txbb = mlx4_en_free_tx_desc(priv, ring,
+						ring->cons & ring->size_mask,
+						!!(ring->cons & ring->size), 0);
+		ring->cons += ring->last_nr_txbb;
+		cnt++;
+	}
+
+	netdev_tx_reset_queue(ring->tx_queue);
+
+	if (cnt)
+		en_dbg(DRV, priv, "Freed %d uncompleted tx descriptors\n", cnt);
+
+	return cnt;
+}
+
+static bool mlx4_en_process_tx_cq(struct net_device *dev,
+				 struct mlx4_en_cq *cq)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_cq *mcq = &cq->mcq;
+	struct mlx4_en_tx_ring *ring = priv->tx_ring[cq->ring];
+	struct mlx4_cqe *cqe;
+	u16 index;
+	u16 new_index, ring_index, stamp_index;
+	u32 txbbs_skipped = 0;
+	u32 txbbs_stamp = 0;
+	u32 cons_index = mcq->cons_index;
+	int size = cq->size;
+	u32 size_mask = ring->size_mask;
+	struct mlx4_cqe *buf = cq->buf;
+	u32 packets = 0;
+	u32 bytes = 0;
+	int factor = priv->cqe_factor;
+	u64 timestamp = 0;
+	int done = 0;
+	int budget = priv->tx_work_limit;
+	u32 last_nr_txbb;
+	u32 ring_cons;
+
+	if (!priv->port_up)
+		return true;
+
+	netdev_txq_bql_complete_prefetchw(ring->tx_queue);
+
+	index = cons_index & size_mask;
+	cqe = mlx4_en_get_cqe(buf, index, priv->cqe_size) + factor;
+	last_nr_txbb = ACCESS_ONCE(ring->last_nr_txbb);
+	ring_cons = ACCESS_ONCE(ring->cons);
+	ring_index = ring_cons & size_mask;
+	stamp_index = ring_index;
+
+	/* Process all completed CQEs */
+	while (XNOR(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK,
+			cons_index & size) && (done < budget)) {
+		/*
+		 * make sure we read the CQE after we read the
+		 * ownership bit
+		 */
+		rmb();
+
+		if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
+			     MLX4_CQE_OPCODE_ERROR)) {
+			struct mlx4_err_cqe *cqe_err = (struct mlx4_err_cqe *)cqe;
+
+			en_err(priv, "CQE error - vendor syndrome: 0x%x syndrome: 0x%x\n",
+			       cqe_err->vendor_err_syndrome,
+			       cqe_err->syndrome);
+		}
+
+		/* Skip over last polled CQE */
+		new_index = be16_to_cpu(cqe->wqe_index) & size_mask;
+
+		do {
+			txbbs_skipped += last_nr_txbb;
+			ring_index = (ring_index + last_nr_txbb) & size_mask;
+			if (ring->tx_info[ring_index].ts_requested)
+				timestamp = mlx4_en_get_cqe_ts(cqe);
+
+			/* free next descriptor */
+			last_nr_txbb = mlx4_en_free_tx_desc(
+					priv, ring, ring_index,
+					!!((ring_cons + txbbs_skipped) &
+					ring->size), timestamp);
+
+			mlx4_en_stamp_wqe(priv, ring, stamp_index,
+					  !!((ring_cons + txbbs_stamp) &
+						ring->size));
+			stamp_index = ring_index;
+			txbbs_stamp = txbbs_skipped;
+			packets++;
+			bytes += ring->tx_info[ring_index].nr_bytes;
+		} while ((++done < budget) && (ring_index != new_index));
+
+		++cons_index;
+		index = cons_index & size_mask;
+		cqe = mlx4_en_get_cqe(buf, index, priv->cqe_size) + factor;
+	}
+
+
+	/*
+	 * To prevent CQ overflow we first update CQ consumer and only then
+	 * the ring consumer.
+	 */
+	mcq->cons_index = cons_index;
+	mlx4_cq_set_ci(mcq);
+	wmb();
+
+	/* we want to dirty this cache line once */
+	ACCESS_ONCE(ring->last_nr_txbb) = last_nr_txbb;
+	ACCESS_ONCE(ring->cons) = ring_cons + txbbs_skipped;
+
+	netdev_tx_completed_queue(ring->tx_queue, packets, bytes);
+
+	/*
+	 * Wakeup Tx queue if this stopped, and at least 1 packet
+	 * was completed
+	 */
+	if (netif_tx_queue_stopped(ring->tx_queue) && txbbs_skipped > 0) {
+		netif_tx_wake_queue(ring->tx_queue);
+		ring->wake_queue++;
+	}
+	return done < budget;
+}
+
+void mlx4_en_tx_irq(struct mlx4_cq *mcq)
+{
+	struct mlx4_en_cq *cq = container_of(mcq, struct mlx4_en_cq, mcq);
+	struct mlx4_en_priv *priv = netdev_priv(cq->dev);
+
+	if (priv->port_up)
+		napi_schedule(&cq->napi);
+	else
+		mlx4_en_arm_cq(priv, cq);
+}
+
+/* TX CQ polling - called by NAPI */
+int mlx4_en_poll_tx_cq(struct napi_struct *napi, int budget)
+{
+	struct mlx4_en_cq *cq = container_of(napi, struct mlx4_en_cq, napi);
+	struct net_device *dev = cq->dev;
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	int clean_complete;
+
+	clean_complete = mlx4_en_process_tx_cq(dev, cq);
+	if (!clean_complete)
+		return budget;
+
+	napi_complete(napi);
+	mlx4_en_arm_cq(priv, cq);
+
+	return 0;
+}
+
+static struct mlx4_en_tx_desc *mlx4_en_bounce_to_desc(struct mlx4_en_priv *priv,
+						      struct mlx4_en_tx_ring *ring,
+						      u32 index,
+						      unsigned int desc_size)
+{
+	u32 copy = (ring->size - index) * TXBB_SIZE;
+	int i;
+
+	for (i = desc_size - copy - 4; i >= 0; i -= 4) {
+		if ((i & (TXBB_SIZE - 1)) == 0)
+			wmb();
+
+		*((u32 *) (ring->buf + i)) =
+			*((u32 *) (ring->bounce_buf + copy + i));
+	}
+
+	for (i = copy - 4; i >= 4 ; i -= 4) {
+		if ((i & (TXBB_SIZE - 1)) == 0)
+			wmb();
+
+		*((u32 *) (ring->buf + index * TXBB_SIZE + i)) =
+			*((u32 *) (ring->bounce_buf + i));
+	}
+
+	/* Return real descriptor location */
+	return ring->buf + index * TXBB_SIZE;
+}
+
+/* Decide if skb can be inlined in tx descriptor to avoid dma mapping
+ *
+ * It seems strange we do not simply use skb_copy_bits().
+ * This would allow to inline all skbs iff skb->len <= inline_thold
+ *
+ * Note that caller already checked skb was not a gso packet
+ */
+static bool is_inline(int inline_thold, const struct sk_buff *skb,
+		      const struct skb_shared_info *shinfo,
+		      void **pfrag)
+{
+	void *ptr;
+
+	if (skb->len > inline_thold || !inline_thold)
+		return false;
+
+	if (shinfo->nr_frags == 1) {
+		ptr = skb_frag_address_safe(&shinfo->frags[0]);
+		if (unlikely(!ptr))
+			return false;
+		*pfrag = ptr;
+		return true;
+	}
+	if (shinfo->nr_frags)
+		return false;
+	return true;
+}
+
+static int inline_size(const struct sk_buff *skb)
+{
+	if (skb->len + CTRL_SIZE + sizeof(struct mlx4_wqe_inline_seg)
+	    <= MLX4_INLINE_ALIGN)
+		return ALIGN(skb->len + CTRL_SIZE +
+			     sizeof(struct mlx4_wqe_inline_seg), 16);
+	else
+		return ALIGN(skb->len + CTRL_SIZE + 2 *
+			     sizeof(struct mlx4_wqe_inline_seg), 16);
+}
+
+static int get_real_size(const struct sk_buff *skb,
+			 const struct skb_shared_info *shinfo,
+			 struct net_device *dev,
+			 int *lso_header_size,
+			 bool *inline_ok,
+			 void **pfrag)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	int real_size;
+
+	if (shinfo->gso_size) {
+		*inline_ok = false;
+		if (skb->encapsulation)
+			*lso_header_size = (skb_inner_transport_header(skb) - skb->data) + inner_tcp_hdrlen(skb);
+		else
+			*lso_header_size = skb_transport_offset(skb) + tcp_hdrlen(skb);
+		real_size = CTRL_SIZE + shinfo->nr_frags * DS_SIZE +
+			ALIGN(*lso_header_size + 4, DS_SIZE);
+		if (unlikely(*lso_header_size != skb_headlen(skb))) {
+			/* We add a segment for the skb linear buffer only if
+			 * it contains data */
+			if (*lso_header_size < skb_headlen(skb))
+				real_size += DS_SIZE;
+			else {
+				if (netif_msg_tx_err(priv))
+					en_warn(priv, "Non-linear headers\n");
+				return 0;
+			}
+		}
+	} else {
+		*lso_header_size = 0;
+		*inline_ok = is_inline(priv->prof->inline_thold, skb,
+				       shinfo, pfrag);
+
+		if (*inline_ok)
+			real_size = inline_size(skb);
+		else
+			real_size = CTRL_SIZE +
+				    (shinfo->nr_frags + 1) * DS_SIZE;
+	}
+
+	return real_size;
+}
+
+static void build_inline_wqe(struct mlx4_en_tx_desc *tx_desc,
+			     const struct sk_buff *skb,
+			     const struct skb_shared_info *shinfo,
+			     int real_size, u16 *vlan_tag,
+			     int tx_ind, void *fragptr)
+{
+	struct mlx4_wqe_inline_seg *inl = &tx_desc->inl;
+	int spc = MLX4_INLINE_ALIGN - CTRL_SIZE - sizeof *inl;
+	unsigned int hlen = skb_headlen(skb);
+
+	if (skb->len <= spc) {
+		if (likely(skb->len >= MIN_PKT_LEN)) {
+			inl->byte_count = cpu_to_be32(1 << 31 | skb->len);
+		} else {
+			inl->byte_count = cpu_to_be32(1 << 31 | MIN_PKT_LEN);
+			memset(((void *)(inl + 1)) + skb->len, 0,
+			       MIN_PKT_LEN - skb->len);
+		}
+		skb_copy_from_linear_data(skb, inl + 1, hlen);
+		if (shinfo->nr_frags)
+			memcpy(((void *)(inl + 1)) + hlen, fragptr,
+			       skb_frag_size(&shinfo->frags[0]));
+
+	} else {
+		inl->byte_count = cpu_to_be32(1 << 31 | spc);
+		if (hlen <= spc) {
+			skb_copy_from_linear_data(skb, inl + 1, hlen);
+			if (hlen < spc) {
+				memcpy(((void *)(inl + 1)) + hlen,
+				       fragptr, spc - hlen);
+				fragptr +=  spc - hlen;
+			}
+			inl = (void *) (inl + 1) + spc;
+			memcpy(((void *)(inl + 1)), fragptr, skb->len - spc);
+		} else {
+			skb_copy_from_linear_data(skb, inl + 1, spc);
+			inl = (void *) (inl + 1) + spc;
+			skb_copy_from_linear_data_offset(skb, spc, inl + 1,
+							 hlen - spc);
+			if (shinfo->nr_frags)
+				memcpy(((void *)(inl + 1)) + hlen - spc,
+				       fragptr,
+				       skb_frag_size(&shinfo->frags[0]));
+		}
+
+		wmb();
+		inl->byte_count = cpu_to_be32(1 << 31 | (skb->len - spc));
+	}
+}
+
+u16 mlx4_en_select_queue(struct net_device *dev, struct sk_buff *skb,
+			 void *accel_priv, select_queue_fallback_t fallback)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	u16 rings_p_up = priv->num_tx_rings_p_up;
+	u8 up = 0;
+
+	if (dev->num_tc)
+		return skb_tx_hash(dev, skb);
+
+	if (vlan_tx_tag_present(skb))
+		up = vlan_tx_tag_get(skb) >> VLAN_PRIO_SHIFT;
+
+	return fallback(dev, skb) % rings_p_up + up * rings_p_up;
+}
+
+static void mlx4_bf_copy(void __iomem *dst, const void *src,
+			 unsigned int bytecnt)
+{
+	__iowrite64_copy(dst, src, bytecnt / 8);
+}
+
+netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct skb_shared_info *shinfo = skb_shinfo(skb);
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct device *ddev = priv->ddev;
+	struct mlx4_en_tx_ring *ring;
+	struct mlx4_en_tx_desc *tx_desc;
+	struct mlx4_wqe_data_seg *data;
+	struct mlx4_en_tx_info *tx_info;
+	int tx_ind = 0;
+	int nr_txbb;
+	int desc_size;
+	int real_size;
+	u32 index, bf_index;
+	__be32 op_own;
+	u16 vlan_tag = 0;
+	int i_frag;
+	int lso_header_size;
+	void *fragptr = NULL;
+	bool bounce = false;
+	bool send_doorbell;
+	bool stop_queue;
+	bool inline_ok;
+	u32 ring_cons;
+
+	if (!priv->port_up)
+		goto tx_drop;
+
+	tx_ind = skb_get_queue_mapping(skb);
+	ring = priv->tx_ring[tx_ind];
+
+	/* fetch ring->cons far ahead before needing it to avoid stall */
+	ring_cons = ACCESS_ONCE(ring->cons);
+
+	real_size = get_real_size(skb, shinfo, dev, &lso_header_size,
+				  &inline_ok, &fragptr);
+	if (unlikely(!real_size))
+		goto tx_drop;
+
+	/* Align descriptor to TXBB size */
+	desc_size = ALIGN(real_size, TXBB_SIZE);
+	nr_txbb = desc_size / TXBB_SIZE;
+	if (unlikely(nr_txbb > MAX_DESC_TXBBS)) {
+		if (netif_msg_tx_err(priv))
+			en_warn(priv, "Oversized header or SG list\n");
+		goto tx_drop;
+	}
+
+	if (vlan_tx_tag_present(skb))
+		vlan_tag = vlan_tx_tag_get(skb);
+
+
+	netdev_txq_bql_enqueue_prefetchw(ring->tx_queue);
+
+	/* Track current inflight packets for performance analysis */
+	AVG_PERF_COUNTER(priv->pstats.inflight_avg,
+			 (u32)(ring->prod - ring_cons - 1));
+
+	/* Packet is good - grab an index and transmit it */
+	index = ring->prod & ring->size_mask;
+	bf_index = ring->prod;
+
+	/* See if we have enough space for whole descriptor TXBB for setting
+	 * SW ownership on next descriptor; if not, use a bounce buffer. */
+	if (likely(index + nr_txbb <= ring->size))
+		tx_desc = ring->buf + index * TXBB_SIZE;
+	else {
+		tx_desc = (struct mlx4_en_tx_desc *) ring->bounce_buf;
+		bounce = true;
+	}
+
+	/* Save skb in tx_info ring */
+	tx_info = &ring->tx_info[index];
+	tx_info->skb = skb;
+	tx_info->nr_txbb = nr_txbb;
+
+	data = &tx_desc->data;
+	if (lso_header_size)
+		data = ((void *)&tx_desc->lso + ALIGN(lso_header_size + 4,
+						      DS_SIZE));
+
+	/* valid only for none inline segments */
+	tx_info->data_offset = (void *)data - (void *)tx_desc;
+
+	tx_info->inl = inline_ok;
+
+	tx_info->linear = (lso_header_size < skb_headlen(skb) &&
+			   !inline_ok) ? 1 : 0;
+
+	tx_info->nr_maps = shinfo->nr_frags + tx_info->linear;
+	data += tx_info->nr_maps - 1;
+
+	if (!tx_info->inl) {
+		dma_addr_t dma = 0;
+		u32 byte_count = 0;
+
+		/* Map fragments if any */
+		for (i_frag = shinfo->nr_frags - 1; i_frag >= 0; i_frag--) {
+			const struct skb_frag_struct *frag;
+
+			frag = &shinfo->frags[i_frag];
+			byte_count = skb_frag_size(frag);
+			dma = skb_frag_dma_map(ddev, frag,
+					       0, byte_count,
+					       DMA_TO_DEVICE);
+			if (dma_mapping_error(ddev, dma))
+				goto tx_drop_unmap;
+
+			data->addr = cpu_to_be64(dma);
+			data->lkey = ring->mr_key;
+			wmb();
+			data->byte_count = cpu_to_be32(byte_count);
+			--data;
+		}
+
+		/* Map linear part if needed */
+		if (tx_info->linear) {
+			byte_count = skb_headlen(skb) - lso_header_size;
+
+			dma = dma_map_single(ddev, skb->data +
+					     lso_header_size, byte_count,
+					     PCI_DMA_TODEVICE);
+			if (dma_mapping_error(ddev, dma))
+				goto tx_drop_unmap;
+
+			data->addr = cpu_to_be64(dma);
+			data->lkey = ring->mr_key;
+			wmb();
+			data->byte_count = cpu_to_be32(byte_count);
+		}
+		/* tx completion can avoid cache line miss for common cases */
+		tx_info->map0_dma = dma;
+		tx_info->map0_byte_count = byte_count;
+	}
+
+	/*
+	 * For timestamping add flag to skb_shinfo and
+	 * set flag for further reference
+	 */
+	tx_info->ts_requested = 0;
+	if (unlikely(ring->hwtstamp_tx_type == HWTSTAMP_TX_ON &&
+		     shinfo->tx_flags & SKBTX_HW_TSTAMP)) {
+		shinfo->tx_flags |= SKBTX_IN_PROGRESS;
+		tx_info->ts_requested = 1;
+	}
+
+	/* Prepare ctrl segement apart opcode+ownership, which depends on
+	 * whether LSO is used */
+	tx_desc->ctrl.srcrb_flags = priv->ctrl_flags;
+	if (likely(skb->ip_summed == CHECKSUM_PARTIAL)) {
+		if (!skb->encapsulation)
+			tx_desc->ctrl.srcrb_flags |= cpu_to_be32(MLX4_WQE_CTRL_IP_CSUM |
+								 MLX4_WQE_CTRL_TCP_UDP_CSUM);
+		else
+			tx_desc->ctrl.srcrb_flags |= cpu_to_be32(MLX4_WQE_CTRL_IP_CSUM);
+		ring->tx_csum++;
+	}
+
+	if (priv->flags & MLX4_EN_FLAG_ENABLE_HW_LOOPBACK) {
+		struct ethhdr *ethh;
+
+		/* Copy dst mac address to wqe. This allows loopback in eSwitch,
+		 * so that VFs and PF can communicate with each other
+		 */
+		ethh = (struct ethhdr *)skb->data;
+		tx_desc->ctrl.srcrb_flags16[0] = get_unaligned((__be16 *)ethh->h_dest);
+		tx_desc->ctrl.imm = get_unaligned((__be32 *)(ethh->h_dest + 2));
+	}
+
+	/* Handle LSO (TSO) packets */
+	if (lso_header_size) {
+		int i;
+
+		/* Mark opcode as LSO */
+		op_own = cpu_to_be32(MLX4_OPCODE_LSO | (1 << 6)) |
+			((ring->prod & ring->size) ?
+				cpu_to_be32(MLX4_EN_BIT_DESC_OWN) : 0);
+
+		/* Fill in the LSO prefix */
+		tx_desc->lso.mss_hdr_size = cpu_to_be32(
+			shinfo->gso_size << 16 | lso_header_size);
+
+		/* Copy headers;
+		 * note that we already verified that it is linear */
+		memcpy(tx_desc->lso.header, skb->data, lso_header_size);
+
+		ring->tso_packets++;
+
+		i = ((skb->len - lso_header_size) / shinfo->gso_size) +
+			!!((skb->len - lso_header_size) % shinfo->gso_size);
+		tx_info->nr_bytes = skb->len + (i - 1) * lso_header_size;
+		ring->packets += i;
+	} else {
+		/* Normal (Non LSO) packet */
+		op_own = cpu_to_be32(MLX4_OPCODE_SEND) |
+			((ring->prod & ring->size) ?
+			 cpu_to_be32(MLX4_EN_BIT_DESC_OWN) : 0);
+		tx_info->nr_bytes = max_t(unsigned int, skb->len, ETH_ZLEN);
+		ring->packets++;
+	}
+	ring->bytes += tx_info->nr_bytes;
+	netdev_tx_sent_queue(ring->tx_queue, tx_info->nr_bytes);
+	AVG_PERF_COUNTER(priv->pstats.tx_pktsz_avg, skb->len);
+
+	if (tx_info->inl)
+		build_inline_wqe(tx_desc, skb, shinfo, real_size, &vlan_tag,
+				 tx_ind, fragptr);
+
+	if (skb->encapsulation) {
+		struct iphdr *ipv4 = (struct iphdr *)skb_inner_network_header(skb);
+		if (ipv4->protocol == IPPROTO_TCP || ipv4->protocol == IPPROTO_UDP)
+			op_own |= cpu_to_be32(MLX4_WQE_CTRL_IIP | MLX4_WQE_CTRL_ILP);
+		else
+			op_own |= cpu_to_be32(MLX4_WQE_CTRL_IIP);
+	}
+
+	ring->prod += nr_txbb;
+
+	/* If we used a bounce buffer then copy descriptor back into place */
+	if (unlikely(bounce))
+		tx_desc = mlx4_en_bounce_to_desc(priv, ring, index, desc_size);
+
+	skb_tx_timestamp(skb);
+
+	/* Check available TXBBs And 2K spare for prefetch */
+	stop_queue = (int)(ring->prod - ring_cons) >
+		      ring->size - HEADROOM - MAX_DESC_TXBBS;
+	if (unlikely(stop_queue)) {
+		netif_tx_stop_queue(ring->tx_queue);
+		ring->queue_stopped++;
+	}
+	send_doorbell = !skb->xmit_more || netif_xmit_stopped(ring->tx_queue);
+
+	real_size = (real_size / 16) & 0x3f;
+
+	if (ring->bf_enabled && desc_size <= MAX_BF && !bounce &&
+	    !vlan_tx_tag_present(skb) && send_doorbell) {
+		tx_desc->ctrl.bf_qpn = ring->doorbell_qpn |
+				       cpu_to_be32(real_size);
+
+		op_own |= htonl((bf_index & 0xffff) << 8);
+		/* Ensure new descriptor hits memory
+		 * before setting ownership of this descriptor to HW
+		 */
+		wmb();
+		tx_desc->ctrl.owner_opcode = op_own;
+
+		wmb();
+
+		mlx4_bf_copy(ring->bf.reg + ring->bf.offset, &tx_desc->ctrl,
+			     desc_size);
+
+		wmb();
+
+		ring->bf.offset ^= ring->bf.buf_size;
+	} else {
+		tx_desc->ctrl.vlan_tag = cpu_to_be16(vlan_tag);
+		tx_desc->ctrl.ins_vlan = MLX4_WQE_CTRL_INS_VLAN *
+			!!vlan_tx_tag_present(skb);
+		tx_desc->ctrl.fence_size = real_size;
+
+		/* Ensure new descriptor hits memory
+		 * before setting ownership of this descriptor to HW
+		 */
+		wmb();
+		tx_desc->ctrl.owner_opcode = op_own;
+		if (send_doorbell) {
+			wmb();
+			iowrite32(ring->doorbell_qpn,
+				  ring->bf.uar->map + MLX4_SEND_DOORBELL);
+		} else {
+			ring->xmit_more++;
+		}
+	}
+
+	if (unlikely(stop_queue)) {
+		/* If queue was emptied after the if (stop_queue) , and before
+		 * the netif_tx_stop_queue() - need to wake the queue,
+		 * or else it will remain stopped forever.
+		 * Need a memory barrier to make sure ring->cons was not
+		 * updated before queue was stopped.
+		 */
+		smp_rmb();
+
+		ring_cons = ACCESS_ONCE(ring->cons);
+		if (unlikely(((int)(ring->prod - ring_cons)) <=
+			     ring->size - HEADROOM - MAX_DESC_TXBBS)) {
+			netif_tx_wake_queue(ring->tx_queue);
+			ring->wake_queue++;
+		}
+	}
+	return NETDEV_TX_OK;
+
+tx_drop_unmap:
+	en_err(priv, "DMA mapping error\n");
+
+	while (++i_frag < shinfo->nr_frags) {
+		++data;
+		dma_unmap_page(ddev, (dma_addr_t) be64_to_cpu(data->addr),
+			       be32_to_cpu(data->byte_count),
+			       PCI_DMA_TODEVICE);
+	}
+
+tx_drop:
+	dev_kfree_skb_any(skb);
+	priv->stats.tx_dropped++;
+	return NETDEV_TX_OK;
+}
+
diff --git a/drivers/net/ethernet/mellanox/mlx4/eq.c b/drivers/net/ethernet/mellanox/mlx4/eq.c
new file mode 100644
index 0000000..49290a4
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/eq.c
@@ -0,0 +1,1409 @@
+/*
+ * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *	- Redistributions of source code must retain the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer.
+ *
+ *	- Redistributions in binary form must reproduce the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer in the documentation and/or other materials
+ *	  provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <linux/mm.h>
+#include <linux/dma-mapping.h>
+
+#include <linux/mlx4/cmd.h>
+#include <linux/cpu_rmap.h>
+
+#include "mlx4.h"
+#include "fw.h"
+
+enum {
+	MLX4_IRQNAME_SIZE	= 32
+};
+
+enum {
+	MLX4_NUM_ASYNC_EQE	= 0x100,
+	MLX4_NUM_SPARE_EQE	= 0x80,
+	MLX4_EQ_ENTRY_SIZE	= 0x20
+};
+
+#define MLX4_EQ_STATUS_OK	   ( 0 << 28)
+#define MLX4_EQ_STATUS_WRITE_FAIL  (10 << 28)
+#define MLX4_EQ_OWNER_SW	   ( 0 << 24)
+#define MLX4_EQ_OWNER_HW	   ( 1 << 24)
+#define MLX4_EQ_FLAG_EC		   ( 1 << 18)
+#define MLX4_EQ_FLAG_OI		   ( 1 << 17)
+#define MLX4_EQ_STATE_ARMED	   ( 9 <<  8)
+#define MLX4_EQ_STATE_FIRED	   (10 <<  8)
+#define MLX4_EQ_STATE_ALWAYS_ARMED (11 <<  8)
+
+#define MLX4_ASYNC_EVENT_MASK ((1ull << MLX4_EVENT_TYPE_PATH_MIG)	    | \
+			       (1ull << MLX4_EVENT_TYPE_COMM_EST)	    | \
+			       (1ull << MLX4_EVENT_TYPE_SQ_DRAINED)	    | \
+			       (1ull << MLX4_EVENT_TYPE_CQ_ERROR)	    | \
+			       (1ull << MLX4_EVENT_TYPE_WQ_CATAS_ERROR)	    | \
+			       (1ull << MLX4_EVENT_TYPE_EEC_CATAS_ERROR)    | \
+			       (1ull << MLX4_EVENT_TYPE_PATH_MIG_FAILED)    | \
+			       (1ull << MLX4_EVENT_TYPE_WQ_INVAL_REQ_ERROR) | \
+			       (1ull << MLX4_EVENT_TYPE_WQ_ACCESS_ERROR)    | \
+			       (1ull << MLX4_EVENT_TYPE_PORT_CHANGE)	    | \
+			       (1ull << MLX4_EVENT_TYPE_ECC_DETECT)	    | \
+			       (1ull << MLX4_EVENT_TYPE_SRQ_CATAS_ERROR)    | \
+			       (1ull << MLX4_EVENT_TYPE_SRQ_QP_LAST_WQE)    | \
+			       (1ull << MLX4_EVENT_TYPE_SRQ_LIMIT)	    | \
+			       (1ull << MLX4_EVENT_TYPE_CMD)		    | \
+			       (1ull << MLX4_EVENT_TYPE_OP_REQUIRED)	    | \
+			       (1ull << MLX4_EVENT_TYPE_COMM_CHANNEL)       | \
+			       (1ull << MLX4_EVENT_TYPE_FLR_EVENT)	    | \
+			       (1ull << MLX4_EVENT_TYPE_FATAL_WARNING))
+
+static u64 get_async_ev_mask(struct mlx4_dev *dev)
+{
+	u64 async_ev_mask = MLX4_ASYNC_EVENT_MASK;
+	if (dev->caps.flags & MLX4_DEV_CAP_FLAG_PORT_MNG_CHG_EV)
+		async_ev_mask |= (1ull << MLX4_EVENT_TYPE_PORT_MNG_CHG_EVENT);
+
+	return async_ev_mask;
+}
+
+static void eq_set_ci(struct mlx4_eq *eq, int req_not)
+{
+	__raw_writel((__force u32) cpu_to_be32((eq->cons_index & 0xffffff) |
+					       req_not << 31),
+		     eq->doorbell);
+	/* We still want ordering, just not swabbing, so add a barrier */
+	mb();
+}
+
+static struct mlx4_eqe *get_eqe(struct mlx4_eq *eq, u32 entry, u8 eqe_factor,
+				u8 eqe_size)
+{
+	/* (entry & (eq->nent - 1)) gives us a cyclic array */
+	unsigned long offset = (entry & (eq->nent - 1)) * eqe_size;
+	/* CX3 is capable of extending the EQE from 32 to 64 bytes with
+	 * strides of 64B,128B and 256B.
+	 * When 64B EQE is used, the first (in the lower addresses)
+	 * 32 bytes in the 64 byte EQE are reserved and the next 32 bytes
+	 * contain the legacy EQE information.
+	 * In all other cases, the first 32B contains the legacy EQE info.
+	 */
+	return eq->page_list[offset / PAGE_SIZE].buf + (offset + (eqe_factor ? MLX4_EQ_ENTRY_SIZE : 0)) % PAGE_SIZE;
+}
+
+static struct mlx4_eqe *next_eqe_sw(struct mlx4_eq *eq, u8 eqe_factor, u8 size)
+{
+	struct mlx4_eqe *eqe = get_eqe(eq, eq->cons_index, eqe_factor, size);
+	return !!(eqe->owner & 0x80) ^ !!(eq->cons_index & eq->nent) ? NULL : eqe;
+}
+
+static struct mlx4_eqe *next_slave_event_eqe(struct mlx4_slave_event_eq *slave_eq)
+{
+	struct mlx4_eqe *eqe =
+		&slave_eq->event_eqe[slave_eq->cons & (SLAVE_EVENT_EQ_SIZE - 1)];
+	return (!!(eqe->owner & 0x80) ^
+		!!(slave_eq->cons & SLAVE_EVENT_EQ_SIZE)) ?
+		eqe : NULL;
+}
+
+void mlx4_gen_slave_eqe(struct work_struct *work)
+{
+	struct mlx4_mfunc_master_ctx *master =
+		container_of(work, struct mlx4_mfunc_master_ctx,
+			     slave_event_work);
+	struct mlx4_mfunc *mfunc =
+		container_of(master, struct mlx4_mfunc, master);
+	struct mlx4_priv *priv = container_of(mfunc, struct mlx4_priv, mfunc);
+	struct mlx4_dev *dev = &priv->dev;
+	struct mlx4_slave_event_eq *slave_eq = &mfunc->master.slave_eq;
+	struct mlx4_eqe *eqe;
+	u8 slave;
+	int i;
+
+	for (eqe = next_slave_event_eqe(slave_eq); eqe;
+	      eqe = next_slave_event_eqe(slave_eq)) {
+		slave = eqe->slave_id;
+
+		/* All active slaves need to receive the event */
+		if (slave == ALL_SLAVES) {
+			for (i = 0; i < dev->num_slaves; i++) {
+				if (i != dev->caps.function &&
+				    master->slave_state[i].active)
+					if (mlx4_GEN_EQE(dev, i, eqe))
+						mlx4_warn(dev, "Failed to generate event for slave %d\n",
+							  i);
+			}
+		} else {
+			if (mlx4_GEN_EQE(dev, slave, eqe))
+				mlx4_warn(dev, "Failed to generate event for slave %d\n",
+					  slave);
+		}
+		++slave_eq->cons;
+	}
+}
+
+
+static void slave_event(struct mlx4_dev *dev, u8 slave, struct mlx4_eqe *eqe)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_slave_event_eq *slave_eq = &priv->mfunc.master.slave_eq;
+	struct mlx4_eqe *s_eqe;
+	unsigned long flags;
+
+	spin_lock_irqsave(&slave_eq->event_lock, flags);
+	s_eqe = &slave_eq->event_eqe[slave_eq->prod & (SLAVE_EVENT_EQ_SIZE - 1)];
+	if ((!!(s_eqe->owner & 0x80)) ^
+	    (!!(slave_eq->prod & SLAVE_EVENT_EQ_SIZE))) {
+		mlx4_warn(dev, "Master failed to generate an EQE for slave: %d. No free EQE on slave events queue\n",
+			  slave);
+		spin_unlock_irqrestore(&slave_eq->event_lock, flags);
+		return;
+	}
+
+	memcpy(s_eqe, eqe, dev->caps.eqe_size - 1);
+	s_eqe->slave_id = slave;
+	/* ensure all information is written before setting the ownersip bit */
+	wmb();
+	s_eqe->owner = !!(slave_eq->prod & SLAVE_EVENT_EQ_SIZE) ? 0x0 : 0x80;
+	++slave_eq->prod;
+
+	queue_work(priv->mfunc.master.comm_wq,
+		   &priv->mfunc.master.slave_event_work);
+	spin_unlock_irqrestore(&slave_eq->event_lock, flags);
+}
+
+static void mlx4_slave_event(struct mlx4_dev *dev, int slave,
+			     struct mlx4_eqe *eqe)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_slave_state *s_slave =
+		&priv->mfunc.master.slave_state[slave];
+
+	if (!s_slave->active) {
+		/*mlx4_warn(dev, "Trying to pass event to inactive slave\n");*/
+		return;
+	}
+
+	slave_event(dev, slave, eqe);
+}
+
+int mlx4_gen_pkey_eqe(struct mlx4_dev *dev, int slave, u8 port)
+{
+	struct mlx4_eqe eqe;
+
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_slave_state *s_slave = &priv->mfunc.master.slave_state[slave];
+
+	if (!s_slave->active)
+		return 0;
+
+	memset(&eqe, 0, sizeof eqe);
+
+	eqe.type = MLX4_EVENT_TYPE_PORT_MNG_CHG_EVENT;
+	eqe.subtype = MLX4_DEV_PMC_SUBTYPE_PKEY_TABLE;
+	eqe.event.port_mgmt_change.port = port;
+
+	return mlx4_GEN_EQE(dev, slave, &eqe);
+}
+EXPORT_SYMBOL(mlx4_gen_pkey_eqe);
+
+int mlx4_gen_guid_change_eqe(struct mlx4_dev *dev, int slave, u8 port)
+{
+	struct mlx4_eqe eqe;
+
+	/*don't send if we don't have the that slave */
+	if (dev->num_vfs < slave)
+		return 0;
+	memset(&eqe, 0, sizeof eqe);
+
+	eqe.type = MLX4_EVENT_TYPE_PORT_MNG_CHG_EVENT;
+	eqe.subtype = MLX4_DEV_PMC_SUBTYPE_GUID_INFO;
+	eqe.event.port_mgmt_change.port = port;
+
+	return mlx4_GEN_EQE(dev, slave, &eqe);
+}
+EXPORT_SYMBOL(mlx4_gen_guid_change_eqe);
+
+int mlx4_gen_port_state_change_eqe(struct mlx4_dev *dev, int slave, u8 port,
+				   u8 port_subtype_change)
+{
+	struct mlx4_eqe eqe;
+
+	/*don't send if we don't have the that slave */
+	if (dev->num_vfs < slave)
+		return 0;
+	memset(&eqe, 0, sizeof eqe);
+
+	eqe.type = MLX4_EVENT_TYPE_PORT_CHANGE;
+	eqe.subtype = port_subtype_change;
+	eqe.event.port_change.port = cpu_to_be32(port << 28);
+
+	mlx4_dbg(dev, "%s: sending: %d to slave: %d on port: %d\n", __func__,
+		 port_subtype_change, slave, port);
+	return mlx4_GEN_EQE(dev, slave, &eqe);
+}
+EXPORT_SYMBOL(mlx4_gen_port_state_change_eqe);
+
+enum slave_port_state mlx4_get_slave_port_state(struct mlx4_dev *dev, int slave, u8 port)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_slave_state *s_state = priv->mfunc.master.slave_state;
+	struct mlx4_active_ports actv_ports = mlx4_get_active_ports(dev, slave);
+
+	if (slave >= dev->num_slaves || port > dev->caps.num_ports ||
+	    port <= 0 || !test_bit(port - 1, actv_ports.ports)) {
+		pr_err("%s: Error: asking for slave:%d, port:%d\n",
+		       __func__, slave, port);
+		return SLAVE_PORT_DOWN;
+	}
+	return s_state[slave].port_state[port];
+}
+EXPORT_SYMBOL(mlx4_get_slave_port_state);
+
+static int mlx4_set_slave_port_state(struct mlx4_dev *dev, int slave, u8 port,
+				     enum slave_port_state state)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_slave_state *s_state = priv->mfunc.master.slave_state;
+	struct mlx4_active_ports actv_ports = mlx4_get_active_ports(dev, slave);
+
+	if (slave >= dev->num_slaves || port > dev->caps.num_ports ||
+	    port <= 0 || !test_bit(port - 1, actv_ports.ports)) {
+		pr_err("%s: Error: asking for slave:%d, port:%d\n",
+		       __func__, slave, port);
+		return -1;
+	}
+	s_state[slave].port_state[port] = state;
+
+	return 0;
+}
+
+static void set_all_slave_state(struct mlx4_dev *dev, u8 port, int event)
+{
+	int i;
+	enum slave_port_gen_event gen_event;
+	struct mlx4_slaves_pport slaves_pport = mlx4_phys_to_slaves_pport(dev,
+									  port);
+
+	for (i = 0; i < dev->num_vfs + 1; i++)
+		if (test_bit(i, slaves_pport.slaves))
+			set_and_calc_slave_port_state(dev, i, port,
+						      event, &gen_event);
+}
+/**************************************************************************
+	The function get as input the new event to that port,
+	and according to the prev state change the slave's port state.
+	The events are:
+		MLX4_PORT_STATE_DEV_EVENT_PORT_DOWN,
+		MLX4_PORT_STATE_DEV_EVENT_PORT_UP
+		MLX4_PORT_STATE_IB_EVENT_GID_VALID
+		MLX4_PORT_STATE_IB_EVENT_GID_INVALID
+***************************************************************************/
+int set_and_calc_slave_port_state(struct mlx4_dev *dev, int slave,
+				  u8 port, int event,
+				  enum slave_port_gen_event *gen_event)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_slave_state *ctx = NULL;
+	unsigned long flags;
+	int ret = -1;
+	struct mlx4_active_ports actv_ports = mlx4_get_active_ports(dev, slave);
+	enum slave_port_state cur_state =
+		mlx4_get_slave_port_state(dev, slave, port);
+
+	*gen_event = SLAVE_PORT_GEN_EVENT_NONE;
+
+	if (slave >= dev->num_slaves || port > dev->caps.num_ports ||
+	    port <= 0 || !test_bit(port - 1, actv_ports.ports)) {
+		pr_err("%s: Error: asking for slave:%d, port:%d\n",
+		       __func__, slave, port);
+		return ret;
+	}
+
+	ctx = &priv->mfunc.master.slave_state[slave];
+	spin_lock_irqsave(&ctx->lock, flags);
+
+	switch (cur_state) {
+	case SLAVE_PORT_DOWN:
+		if (MLX4_PORT_STATE_DEV_EVENT_PORT_UP == event)
+			mlx4_set_slave_port_state(dev, slave, port,
+						  SLAVE_PENDING_UP);
+		break;
+	case SLAVE_PENDING_UP:
+		if (MLX4_PORT_STATE_DEV_EVENT_PORT_DOWN == event)
+			mlx4_set_slave_port_state(dev, slave, port,
+						  SLAVE_PORT_DOWN);
+		else if (MLX4_PORT_STATE_IB_PORT_STATE_EVENT_GID_VALID == event) {
+			mlx4_set_slave_port_state(dev, slave, port,
+						  SLAVE_PORT_UP);
+			*gen_event = SLAVE_PORT_GEN_EVENT_UP;
+		}
+		break;
+	case SLAVE_PORT_UP:
+		if (MLX4_PORT_STATE_DEV_EVENT_PORT_DOWN == event) {
+			mlx4_set_slave_port_state(dev, slave, port,
+						  SLAVE_PORT_DOWN);
+			*gen_event = SLAVE_PORT_GEN_EVENT_DOWN;
+		} else if (MLX4_PORT_STATE_IB_EVENT_GID_INVALID ==
+				event) {
+			mlx4_set_slave_port_state(dev, slave, port,
+						  SLAVE_PENDING_UP);
+			*gen_event = SLAVE_PORT_GEN_EVENT_DOWN;
+		}
+		break;
+	default:
+		pr_err("%s: BUG!!! UNKNOWN state: slave:%d, port:%d\n",
+		       __func__, slave, port);
+		goto out;
+	}
+	ret = mlx4_get_slave_port_state(dev, slave, port);
+
+out:
+	spin_unlock_irqrestore(&ctx->lock, flags);
+	return ret;
+}
+
+EXPORT_SYMBOL(set_and_calc_slave_port_state);
+
+int mlx4_gen_slaves_port_mgt_ev(struct mlx4_dev *dev, u8 port, int attr)
+{
+	struct mlx4_eqe eqe;
+
+	memset(&eqe, 0, sizeof eqe);
+
+	eqe.type = MLX4_EVENT_TYPE_PORT_MNG_CHG_EVENT;
+	eqe.subtype = MLX4_DEV_PMC_SUBTYPE_PORT_INFO;
+	eqe.event.port_mgmt_change.port = port;
+	eqe.event.port_mgmt_change.params.port_info.changed_attr =
+		cpu_to_be32((u32) attr);
+
+	slave_event(dev, ALL_SLAVES, &eqe);
+	return 0;
+}
+EXPORT_SYMBOL(mlx4_gen_slaves_port_mgt_ev);
+
+void mlx4_master_handle_slave_flr(struct work_struct *work)
+{
+	struct mlx4_mfunc_master_ctx *master =
+		container_of(work, struct mlx4_mfunc_master_ctx,
+			     slave_flr_event_work);
+	struct mlx4_mfunc *mfunc =
+		container_of(master, struct mlx4_mfunc, master);
+	struct mlx4_priv *priv =
+		container_of(mfunc, struct mlx4_priv, mfunc);
+	struct mlx4_dev *dev = &priv->dev;
+	struct mlx4_slave_state *slave_state = priv->mfunc.master.slave_state;
+	int i;
+	int err;
+	unsigned long flags;
+
+	mlx4_dbg(dev, "mlx4_handle_slave_flr\n");
+
+	for (i = 0 ; i < dev->num_slaves; i++) {
+
+		if (MLX4_COMM_CMD_FLR == slave_state[i].last_cmd) {
+			mlx4_dbg(dev, "mlx4_handle_slave_flr: clean slave: %d\n",
+				 i);
+
+			mlx4_delete_all_resources_for_slave(dev, i);
+			/*return the slave to running mode*/
+			spin_lock_irqsave(&priv->mfunc.master.slave_state_lock, flags);
+			slave_state[i].last_cmd = MLX4_COMM_CMD_RESET;
+			slave_state[i].is_slave_going_down = 0;
+			spin_unlock_irqrestore(&priv->mfunc.master.slave_state_lock, flags);
+			/*notify the FW:*/
+			err = mlx4_cmd(dev, 0, i, 0, MLX4_CMD_INFORM_FLR_DONE,
+				       MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED);
+			if (err)
+				mlx4_warn(dev, "Failed to notify FW on FLR done (slave:%d)\n",
+					  i);
+		}
+	}
+}
+
+static int mlx4_eq_int(struct mlx4_dev *dev, struct mlx4_eq *eq)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_eqe *eqe;
+	int cqn;
+	int eqes_found = 0;
+	int set_ci = 0;
+	int port;
+	int slave = 0;
+	int ret;
+	u32 flr_slave;
+	u8 update_slave_state;
+	int i;
+	enum slave_port_gen_event gen_event;
+	unsigned long flags;
+	struct mlx4_vport_state *s_info;
+	int eqe_size = dev->caps.eqe_size;
+
+	while ((eqe = next_eqe_sw(eq, dev->caps.eqe_factor, eqe_size))) {
+		/*
+		 * Make sure we read EQ entry contents after we've
+		 * checked the ownership bit.
+		 */
+		rmb();
+
+		switch (eqe->type) {
+		case MLX4_EVENT_TYPE_COMP:
+			cqn = be32_to_cpu(eqe->event.comp.cqn) & 0xffffff;
+			mlx4_cq_completion(dev, cqn);
+			break;
+
+		case MLX4_EVENT_TYPE_PATH_MIG:
+		case MLX4_EVENT_TYPE_COMM_EST:
+		case MLX4_EVENT_TYPE_SQ_DRAINED:
+		case MLX4_EVENT_TYPE_SRQ_QP_LAST_WQE:
+		case MLX4_EVENT_TYPE_WQ_CATAS_ERROR:
+		case MLX4_EVENT_TYPE_PATH_MIG_FAILED:
+		case MLX4_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
+		case MLX4_EVENT_TYPE_WQ_ACCESS_ERROR:
+			mlx4_dbg(dev, "event %d arrived\n", eqe->type);
+			if (mlx4_is_master(dev)) {
+				/* forward only to slave owning the QP */
+				ret = mlx4_get_slave_from_resource_id(dev,
+						RES_QP,
+						be32_to_cpu(eqe->event.qp.qpn)
+						& 0xffffff, &slave);
+				if (ret && ret != -ENOENT) {
+					mlx4_dbg(dev, "QP event %02x(%02x) on EQ %d at index %u: could not get slave id (%d)\n",
+						 eqe->type, eqe->subtype,
+						 eq->eqn, eq->cons_index, ret);
+					break;
+				}
+
+				if (!ret && slave != dev->caps.function) {
+					mlx4_slave_event(dev, slave, eqe);
+					break;
+				}
+
+			}
+			mlx4_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) &
+				      0xffffff, eqe->type);
+			break;
+
+		case MLX4_EVENT_TYPE_SRQ_LIMIT:
+			mlx4_dbg(dev, "%s: MLX4_EVENT_TYPE_SRQ_LIMIT\n",
+				 __func__);
+		case MLX4_EVENT_TYPE_SRQ_CATAS_ERROR:
+			if (mlx4_is_master(dev)) {
+				/* forward only to slave owning the SRQ */
+				ret = mlx4_get_slave_from_resource_id(dev,
+						RES_SRQ,
+						be32_to_cpu(eqe->event.srq.srqn)
+						& 0xffffff,
+						&slave);
+				if (ret && ret != -ENOENT) {
+					mlx4_warn(dev, "SRQ event %02x(%02x) on EQ %d at index %u: could not get slave id (%d)\n",
+						  eqe->type, eqe->subtype,
+						  eq->eqn, eq->cons_index, ret);
+					break;
+				}
+				mlx4_warn(dev, "%s: slave:%d, srq_no:0x%x, event: %02x(%02x)\n",
+					  __func__, slave,
+					  be32_to_cpu(eqe->event.srq.srqn),
+					  eqe->type, eqe->subtype);
+
+				if (!ret && slave != dev->caps.function) {
+					mlx4_warn(dev, "%s: sending event %02x(%02x) to slave:%d\n",
+						  __func__, eqe->type,
+						  eqe->subtype, slave);
+					mlx4_slave_event(dev, slave, eqe);
+					break;
+				}
+			}
+			mlx4_srq_event(dev, be32_to_cpu(eqe->event.srq.srqn) &
+				       0xffffff, eqe->type);
+			break;
+
+		case MLX4_EVENT_TYPE_CMD:
+			mlx4_cmd_event(dev,
+				       be16_to_cpu(eqe->event.cmd.token),
+				       eqe->event.cmd.status,
+				       be64_to_cpu(eqe->event.cmd.out_param));
+			break;
+
+		case MLX4_EVENT_TYPE_PORT_CHANGE: {
+			struct mlx4_slaves_pport slaves_port;
+			port = be32_to_cpu(eqe->event.port_change.port) >> 28;
+			slaves_port = mlx4_phys_to_slaves_pport(dev, port);
+			if (eqe->subtype == MLX4_PORT_CHANGE_SUBTYPE_DOWN) {
+				mlx4_dispatch_event(dev, MLX4_DEV_EVENT_PORT_DOWN,
+						    port);
+				mlx4_priv(dev)->sense.do_sense_port[port] = 1;
+				if (!mlx4_is_master(dev))
+					break;
+				for (i = 0; i < dev->num_vfs + 1; i++) {
+					if (!test_bit(i, slaves_port.slaves))
+						continue;
+					if (dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH) {
+						if (i == mlx4_master_func_num(dev))
+							continue;
+						mlx4_dbg(dev, "%s: Sending MLX4_PORT_CHANGE_SUBTYPE_DOWN to slave: %d, port:%d\n",
+							 __func__, i, port);
+						s_info = &priv->mfunc.master.vf_oper[slave].vport[port].state;
+						if (IFLA_VF_LINK_STATE_AUTO == s_info->link_state) {
+							eqe->event.port_change.port =
+								cpu_to_be32(
+								(be32_to_cpu(eqe->event.port_change.port) & 0xFFFFFFF)
+								| (mlx4_phys_to_slave_port(dev, i, port) << 28));
+							mlx4_slave_event(dev, i, eqe);
+						}
+					} else {  /* IB port */
+						set_and_calc_slave_port_state(dev, i, port,
+									      MLX4_PORT_STATE_DEV_EVENT_PORT_DOWN,
+									      &gen_event);
+						/*we can be in pending state, then do not send port_down event*/
+						if (SLAVE_PORT_GEN_EVENT_DOWN ==  gen_event) {
+							if (i == mlx4_master_func_num(dev))
+								continue;
+							mlx4_slave_event(dev, i, eqe);
+						}
+					}
+				}
+			} else {
+				mlx4_dispatch_event(dev, MLX4_DEV_EVENT_PORT_UP, port);
+
+				mlx4_priv(dev)->sense.do_sense_port[port] = 0;
+
+				if (!mlx4_is_master(dev))
+					break;
+				if (dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH)
+					for (i = 0; i < dev->num_vfs + 1; i++) {
+						if (!test_bit(i, slaves_port.slaves))
+							continue;
+						if (i == mlx4_master_func_num(dev))
+							continue;
+						s_info = &priv->mfunc.master.vf_oper[slave].vport[port].state;
+						if (IFLA_VF_LINK_STATE_AUTO == s_info->link_state) {
+							eqe->event.port_change.port =
+								cpu_to_be32(
+								(be32_to_cpu(eqe->event.port_change.port) & 0xFFFFFFF)
+								| (mlx4_phys_to_slave_port(dev, i, port) << 28));
+							mlx4_slave_event(dev, i, eqe);
+						}
+					}
+				else /* IB port */
+					/* port-up event will be sent to a slave when the
+					 * slave's alias-guid is set. This is done in alias_GUID.c
+					 */
+					set_all_slave_state(dev, port, MLX4_DEV_EVENT_PORT_UP);
+			}
+			break;
+		}
+
+		case MLX4_EVENT_TYPE_CQ_ERROR:
+			mlx4_warn(dev, "CQ %s on CQN %06x\n",
+				  eqe->event.cq_err.syndrome == 1 ?
+				  "overrun" : "access violation",
+				  be32_to_cpu(eqe->event.cq_err.cqn) & 0xffffff);
+			if (mlx4_is_master(dev)) {
+				ret = mlx4_get_slave_from_resource_id(dev,
+					RES_CQ,
+					be32_to_cpu(eqe->event.cq_err.cqn)
+					& 0xffffff, &slave);
+				if (ret && ret != -ENOENT) {
+					mlx4_dbg(dev, "CQ event %02x(%02x) on EQ %d at index %u: could not get slave id (%d)\n",
+						 eqe->type, eqe->subtype,
+						 eq->eqn, eq->cons_index, ret);
+					break;
+				}
+
+				if (!ret && slave != dev->caps.function) {
+					mlx4_slave_event(dev, slave, eqe);
+					break;
+				}
+			}
+			mlx4_cq_event(dev,
+				      be32_to_cpu(eqe->event.cq_err.cqn)
+				      & 0xffffff,
+				      eqe->type);
+			break;
+
+		case MLX4_EVENT_TYPE_EQ_OVERFLOW:
+			mlx4_warn(dev, "EQ overrun on EQN %d\n", eq->eqn);
+			break;
+
+		case MLX4_EVENT_TYPE_OP_REQUIRED:
+			atomic_inc(&priv->opreq_count);
+			/* FW commands can't be executed from interrupt context
+			 * working in deferred task
+			 */
+			queue_work(mlx4_wq, &priv->opreq_task);
+			break;
+
+		case MLX4_EVENT_TYPE_COMM_CHANNEL:
+			if (!mlx4_is_master(dev)) {
+				mlx4_warn(dev, "Received comm channel event for non master device\n");
+				break;
+			}
+			memcpy(&priv->mfunc.master.comm_arm_bit_vector,
+			       eqe->event.comm_channel_arm.bit_vec,
+			       sizeof eqe->event.comm_channel_arm.bit_vec);
+			queue_work(priv->mfunc.master.comm_wq,
+				   &priv->mfunc.master.comm_work);
+			break;
+
+		case MLX4_EVENT_TYPE_FLR_EVENT:
+			flr_slave = be32_to_cpu(eqe->event.flr_event.slave_id);
+			if (!mlx4_is_master(dev)) {
+				mlx4_warn(dev, "Non-master function received FLR event\n");
+				break;
+			}
+
+			mlx4_dbg(dev, "FLR event for slave: %d\n", flr_slave);
+
+			if (flr_slave >= dev->num_slaves) {
+				mlx4_warn(dev,
+					  "Got FLR for unknown function: %d\n",
+					  flr_slave);
+				update_slave_state = 0;
+			} else
+				update_slave_state = 1;
+
+			spin_lock_irqsave(&priv->mfunc.master.slave_state_lock, flags);
+			if (update_slave_state) {
+				priv->mfunc.master.slave_state[flr_slave].active = false;
+				priv->mfunc.master.slave_state[flr_slave].last_cmd = MLX4_COMM_CMD_FLR;
+				priv->mfunc.master.slave_state[flr_slave].is_slave_going_down = 1;
+			}
+			spin_unlock_irqrestore(&priv->mfunc.master.slave_state_lock, flags);
+			queue_work(priv->mfunc.master.comm_wq,
+				   &priv->mfunc.master.slave_flr_event_work);
+			break;
+
+		case MLX4_EVENT_TYPE_FATAL_WARNING:
+			if (eqe->subtype == MLX4_FATAL_WARNING_SUBTYPE_WARMING) {
+				if (mlx4_is_master(dev))
+					for (i = 0; i < dev->num_slaves; i++) {
+						mlx4_dbg(dev, "%s: Sending MLX4_FATAL_WARNING_SUBTYPE_WARMING to slave: %d\n",
+							 __func__, i);
+						if (i == dev->caps.function)
+							continue;
+						mlx4_slave_event(dev, i, eqe);
+					}
+				mlx4_err(dev, "Temperature Threshold was reached! Threshold: %d celsius degrees; Current Temperature: %d\n",
+					 be16_to_cpu(eqe->event.warming.warning_threshold),
+					 be16_to_cpu(eqe->event.warming.current_temperature));
+			} else
+				mlx4_warn(dev, "Unhandled event FATAL WARNING (%02x), subtype %02x on EQ %d at index %u. owner=%x, nent=0x%x, slave=%x, ownership=%s\n",
+					  eqe->type, eqe->subtype, eq->eqn,
+					  eq->cons_index, eqe->owner, eq->nent,
+					  eqe->slave_id,
+					  !!(eqe->owner & 0x80) ^
+					  !!(eq->cons_index & eq->nent) ? "HW" : "SW");
+
+			break;
+
+		case MLX4_EVENT_TYPE_PORT_MNG_CHG_EVENT:
+			mlx4_dispatch_event(dev, MLX4_DEV_EVENT_PORT_MGMT_CHANGE,
+					    (unsigned long) eqe);
+			break;
+
+		case MLX4_EVENT_TYPE_EEC_CATAS_ERROR:
+		case MLX4_EVENT_TYPE_ECC_DETECT:
+		default:
+			mlx4_warn(dev, "Unhandled event %02x(%02x) on EQ %d at index %u. owner=%x, nent=0x%x, slave=%x, ownership=%s\n",
+				  eqe->type, eqe->subtype, eq->eqn,
+				  eq->cons_index, eqe->owner, eq->nent,
+				  eqe->slave_id,
+				  !!(eqe->owner & 0x80) ^
+				  !!(eq->cons_index & eq->nent) ? "HW" : "SW");
+			break;
+		};
+
+		++eq->cons_index;
+		eqes_found = 1;
+		++set_ci;
+
+		/*
+		 * The HCA will think the queue has overflowed if we
+		 * don't tell it we've been processing events.  We
+		 * create our EQs with MLX4_NUM_SPARE_EQE extra
+		 * entries, so we must update our consumer index at
+		 * least that often.
+		 */
+		if (unlikely(set_ci >= MLX4_NUM_SPARE_EQE)) {
+			eq_set_ci(eq, 0);
+			set_ci = 0;
+		}
+	}
+
+	eq_set_ci(eq, 1);
+
+	return eqes_found;
+}
+
+static irqreturn_t mlx4_interrupt(int irq, void *dev_ptr)
+{
+	struct mlx4_dev *dev = dev_ptr;
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int work = 0;
+	int i;
+
+	writel(priv->eq_table.clr_mask, priv->eq_table.clr_int);
+
+	for (i = 0; i < dev->caps.num_comp_vectors + 1; ++i)
+		work |= mlx4_eq_int(dev, &priv->eq_table.eq[i]);
+
+	return IRQ_RETVAL(work);
+}
+
+static irqreturn_t mlx4_msi_x_interrupt(int irq, void *eq_ptr)
+{
+	struct mlx4_eq  *eq  = eq_ptr;
+	struct mlx4_dev *dev = eq->dev;
+
+	mlx4_eq_int(dev, eq);
+
+	/* MSI-X vectors always belong to us */
+	return IRQ_HANDLED;
+}
+
+int mlx4_MAP_EQ_wrapper(struct mlx4_dev *dev, int slave,
+			struct mlx4_vhcr *vhcr,
+			struct mlx4_cmd_mailbox *inbox,
+			struct mlx4_cmd_mailbox *outbox,
+			struct mlx4_cmd_info *cmd)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_slave_event_eq_info *event_eq =
+		priv->mfunc.master.slave_state[slave].event_eq;
+	u32 in_modifier = vhcr->in_modifier;
+	u32 eqn = in_modifier & 0x3FF;
+	u64 in_param =  vhcr->in_param;
+	int err = 0;
+	int i;
+
+	if (slave == dev->caps.function)
+		err = mlx4_cmd(dev, in_param, (in_modifier & 0x80000000) | eqn,
+			       0, MLX4_CMD_MAP_EQ, MLX4_CMD_TIME_CLASS_B,
+			       MLX4_CMD_NATIVE);
+	if (!err)
+		for (i = 0; i < MLX4_EVENT_TYPES_NUM; ++i)
+			if (in_param & (1LL << i))
+				event_eq[i].eqn = in_modifier >> 31 ? -1 : eqn;
+
+	return err;
+}
+
+static int mlx4_MAP_EQ(struct mlx4_dev *dev, u64 event_mask, int unmap,
+			int eq_num)
+{
+	return mlx4_cmd(dev, event_mask, (unmap << 31) | eq_num,
+			0, MLX4_CMD_MAP_EQ, MLX4_CMD_TIME_CLASS_B,
+			MLX4_CMD_WRAPPED);
+}
+
+static int mlx4_SW2HW_EQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox,
+			 int eq_num)
+{
+	return mlx4_cmd(dev, mailbox->dma, eq_num, 0,
+			MLX4_CMD_SW2HW_EQ, MLX4_CMD_TIME_CLASS_A,
+			MLX4_CMD_WRAPPED);
+}
+
+static int mlx4_HW2SW_EQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox,
+			 int eq_num)
+{
+	return mlx4_cmd_box(dev, 0, mailbox->dma, eq_num,
+			    0, MLX4_CMD_HW2SW_EQ, MLX4_CMD_TIME_CLASS_A,
+			    MLX4_CMD_WRAPPED);
+}
+
+static int mlx4_num_eq_uar(struct mlx4_dev *dev)
+{
+	/*
+	 * Each UAR holds 4 EQ doorbells.  To figure out how many UARs
+	 * we need to map, take the difference of highest index and
+	 * the lowest index we'll use and add 1.
+	 */
+	return (dev->caps.num_comp_vectors + 1 + dev->caps.reserved_eqs +
+		 dev->caps.comp_pool)/4 - dev->caps.reserved_eqs/4 + 1;
+}
+
+static void __iomem *mlx4_get_eq_uar(struct mlx4_dev *dev, struct mlx4_eq *eq)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int index;
+
+	index = eq->eqn / 4 - dev->caps.reserved_eqs / 4;
+
+	if (!priv->eq_table.uar_map[index]) {
+		priv->eq_table.uar_map[index] =
+			ioremap(pci_resource_start(dev->pdev, 2) +
+				((eq->eqn / 4) << PAGE_SHIFT),
+				PAGE_SIZE);
+		if (!priv->eq_table.uar_map[index]) {
+			mlx4_err(dev, "Couldn't map EQ doorbell for EQN 0x%06x\n",
+				 eq->eqn);
+			return NULL;
+		}
+	}
+
+	return priv->eq_table.uar_map[index] + 0x800 + 8 * (eq->eqn % 4);
+}
+
+static void mlx4_unmap_uar(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int i;
+
+	for (i = 0; i < mlx4_num_eq_uar(dev); ++i)
+		if (priv->eq_table.uar_map[i]) {
+			iounmap(priv->eq_table.uar_map[i]);
+			priv->eq_table.uar_map[i] = NULL;
+		}
+}
+
+static int mlx4_create_eq(struct mlx4_dev *dev, int nent,
+			  u8 intr, struct mlx4_eq *eq)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_eq_context *eq_context;
+	int npages;
+	u64 *dma_list = NULL;
+	dma_addr_t t;
+	u64 mtt_addr;
+	int err = -ENOMEM;
+	int i;
+
+	eq->dev   = dev;
+	eq->nent  = roundup_pow_of_two(max(nent, 2));
+	/* CX3 is capable of extending the CQE/EQE from 32 to 64 bytes, with
+	 * strides of 64B,128B and 256B.
+	 */
+	npages = PAGE_ALIGN(eq->nent * dev->caps.eqe_size) / PAGE_SIZE;
+
+	eq->page_list = kmalloc(npages * sizeof *eq->page_list,
+				GFP_KERNEL);
+	if (!eq->page_list)
+		goto err_out;
+
+	for (i = 0; i < npages; ++i)
+		eq->page_list[i].buf = NULL;
+
+	dma_list = kmalloc(npages * sizeof *dma_list, GFP_KERNEL);
+	if (!dma_list)
+		goto err_out_free;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		goto err_out_free;
+	eq_context = mailbox->buf;
+
+	for (i = 0; i < npages; ++i) {
+		eq->page_list[i].buf = dma_alloc_coherent(&dev->pdev->dev,
+							  PAGE_SIZE, &t, GFP_KERNEL);
+		if (!eq->page_list[i].buf)
+			goto err_out_free_pages;
+
+		dma_list[i] = t;
+		eq->page_list[i].map = t;
+
+		memset(eq->page_list[i].buf, 0, PAGE_SIZE);
+	}
+
+	eq->eqn = mlx4_bitmap_alloc(&priv->eq_table.bitmap);
+	if (eq->eqn == -1)
+		goto err_out_free_pages;
+
+	eq->doorbell = mlx4_get_eq_uar(dev, eq);
+	if (!eq->doorbell) {
+		err = -ENOMEM;
+		goto err_out_free_eq;
+	}
+
+	err = mlx4_mtt_init(dev, npages, PAGE_SHIFT, &eq->mtt);
+	if (err)
+		goto err_out_free_eq;
+
+	err = mlx4_write_mtt(dev, &eq->mtt, 0, npages, dma_list);
+	if (err)
+		goto err_out_free_mtt;
+
+	eq_context->flags	  = cpu_to_be32(MLX4_EQ_STATUS_OK   |
+						MLX4_EQ_STATE_ARMED);
+	eq_context->log_eq_size	  = ilog2(eq->nent);
+	eq_context->intr	  = intr;
+	eq_context->log_page_size = PAGE_SHIFT - MLX4_ICM_PAGE_SHIFT;
+
+	mtt_addr = mlx4_mtt_addr(dev, &eq->mtt);
+	eq_context->mtt_base_addr_h = mtt_addr >> 32;
+	eq_context->mtt_base_addr_l = cpu_to_be32(mtt_addr & 0xffffffff);
+
+	err = mlx4_SW2HW_EQ(dev, mailbox, eq->eqn);
+	if (err) {
+		mlx4_warn(dev, "SW2HW_EQ failed (%d)\n", err);
+		goto err_out_free_mtt;
+	}
+
+	kfree(dma_list);
+	mlx4_free_cmd_mailbox(dev, mailbox);
+
+	eq->cons_index = 0;
+
+	return err;
+
+err_out_free_mtt:
+	mlx4_mtt_cleanup(dev, &eq->mtt);
+
+err_out_free_eq:
+	mlx4_bitmap_free(&priv->eq_table.bitmap, eq->eqn, MLX4_USE_RR);
+
+err_out_free_pages:
+	for (i = 0; i < npages; ++i)
+		if (eq->page_list[i].buf)
+			dma_free_coherent(&dev->pdev->dev, PAGE_SIZE,
+					  eq->page_list[i].buf,
+					  eq->page_list[i].map);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+
+err_out_free:
+	kfree(eq->page_list);
+	kfree(dma_list);
+
+err_out:
+	return err;
+}
+
+static void mlx4_free_eq(struct mlx4_dev *dev,
+			 struct mlx4_eq *eq)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_cmd_mailbox *mailbox;
+	int err;
+	int i;
+	/* CX3 is capable of extending the CQE/EQE from 32 to 64 bytes, with
+	 * strides of 64B,128B and 256B
+	 */
+	int npages = PAGE_ALIGN(dev->caps.eqe_size  * eq->nent) / PAGE_SIZE;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return;
+
+	err = mlx4_HW2SW_EQ(dev, mailbox, eq->eqn);
+	if (err)
+		mlx4_warn(dev, "HW2SW_EQ failed (%d)\n", err);
+
+	if (0) {
+		mlx4_dbg(dev, "Dumping EQ context %02x:\n", eq->eqn);
+		for (i = 0; i < sizeof (struct mlx4_eq_context) / 4; ++i) {
+			if (i % 4 == 0)
+				pr_cont("[%02x] ", i * 4);
+			pr_cont(" %08x", be32_to_cpup(mailbox->buf + i * 4));
+			if ((i + 1) % 4 == 0)
+				pr_cont("\n");
+		}
+	}
+	synchronize_irq(eq->irq);
+
+	mlx4_mtt_cleanup(dev, &eq->mtt);
+	for (i = 0; i < npages; ++i)
+		dma_free_coherent(&dev->pdev->dev, PAGE_SIZE,
+				    eq->page_list[i].buf,
+				    eq->page_list[i].map);
+
+	kfree(eq->page_list);
+	mlx4_bitmap_free(&priv->eq_table.bitmap, eq->eqn, MLX4_USE_RR);
+	mlx4_free_cmd_mailbox(dev, mailbox);
+}
+
+static void mlx4_free_irqs(struct mlx4_dev *dev)
+{
+	struct mlx4_eq_table *eq_table = &mlx4_priv(dev)->eq_table;
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int	i, vec;
+
+	if (eq_table->have_irq)
+		free_irq(dev->pdev->irq, dev);
+
+	for (i = 0; i < dev->caps.num_comp_vectors + 1; ++i)
+		if (eq_table->eq[i].have_irq) {
+			free_irq(eq_table->eq[i].irq, eq_table->eq + i);
+			eq_table->eq[i].have_irq = 0;
+		}
+
+	for (i = 0; i < dev->caps.comp_pool; i++) {
+		/*
+		 * Freeing the assigned irq's
+		 * all bits should be 0, but we need to validate
+		 */
+		if (priv->msix_ctl.pool_bm & 1ULL << i) {
+			/* NO need protecting*/
+			vec = dev->caps.num_comp_vectors + 1 + i;
+			free_irq(priv->eq_table.eq[vec].irq,
+				 &priv->eq_table.eq[vec]);
+		}
+	}
+
+
+	kfree(eq_table->irq_names);
+}
+
+static int mlx4_map_clr_int(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	priv->clr_base = ioremap(pci_resource_start(dev->pdev, priv->fw.clr_int_bar) +
+				 priv->fw.clr_int_base, MLX4_CLR_INT_SIZE);
+	if (!priv->clr_base) {
+		mlx4_err(dev, "Couldn't map interrupt clear register, aborting\n");
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static void mlx4_unmap_clr_int(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	iounmap(priv->clr_base);
+}
+
+int mlx4_alloc_eq_table(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	priv->eq_table.eq = kcalloc(dev->caps.num_eqs - dev->caps.reserved_eqs,
+				    sizeof *priv->eq_table.eq, GFP_KERNEL);
+	if (!priv->eq_table.eq)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void mlx4_free_eq_table(struct mlx4_dev *dev)
+{
+	kfree(mlx4_priv(dev)->eq_table.eq);
+}
+
+int mlx4_init_eq_table(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int err;
+	int i;
+
+	priv->eq_table.uar_map = kcalloc(mlx4_num_eq_uar(dev),
+					 sizeof *priv->eq_table.uar_map,
+					 GFP_KERNEL);
+	if (!priv->eq_table.uar_map) {
+		err = -ENOMEM;
+		goto err_out_free;
+	}
+
+	err = mlx4_bitmap_init(&priv->eq_table.bitmap, dev->caps.num_eqs,
+			       dev->caps.num_eqs - 1, dev->caps.reserved_eqs, 0);
+	if (err)
+		goto err_out_free;
+
+	for (i = 0; i < mlx4_num_eq_uar(dev); ++i)
+		priv->eq_table.uar_map[i] = NULL;
+
+	if (!mlx4_is_slave(dev)) {
+		err = mlx4_map_clr_int(dev);
+		if (err)
+			goto err_out_bitmap;
+
+		priv->eq_table.clr_mask =
+			swab32(1 << (priv->eq_table.inta_pin & 31));
+		priv->eq_table.clr_int  = priv->clr_base +
+			(priv->eq_table.inta_pin < 32 ? 4 : 0);
+	}
+
+	priv->eq_table.irq_names =
+		kmalloc(MLX4_IRQNAME_SIZE * (dev->caps.num_comp_vectors + 1 +
+					     dev->caps.comp_pool),
+			GFP_KERNEL);
+	if (!priv->eq_table.irq_names) {
+		err = -ENOMEM;
+		goto err_out_bitmap;
+	}
+
+	for (i = 0; i < dev->caps.num_comp_vectors; ++i) {
+		err = mlx4_create_eq(dev, dev->caps.num_cqs -
+					  dev->caps.reserved_cqs +
+					  MLX4_NUM_SPARE_EQE,
+				     (dev->flags & MLX4_FLAG_MSI_X) ? i : 0,
+				     &priv->eq_table.eq[i]);
+		if (err) {
+			--i;
+			goto err_out_unmap;
+		}
+	}
+
+	err = mlx4_create_eq(dev, MLX4_NUM_ASYNC_EQE + MLX4_NUM_SPARE_EQE,
+			     (dev->flags & MLX4_FLAG_MSI_X) ? dev->caps.num_comp_vectors : 0,
+			     &priv->eq_table.eq[dev->caps.num_comp_vectors]);
+	if (err)
+		goto err_out_comp;
+
+	/*if additional completion vectors poolsize is 0 this loop will not run*/
+	for (i = dev->caps.num_comp_vectors + 1;
+	      i < dev->caps.num_comp_vectors + dev->caps.comp_pool + 1; ++i) {
+
+		err = mlx4_create_eq(dev, dev->caps.num_cqs -
+					  dev->caps.reserved_cqs +
+					  MLX4_NUM_SPARE_EQE,
+				     (dev->flags & MLX4_FLAG_MSI_X) ? i : 0,
+				     &priv->eq_table.eq[i]);
+		if (err) {
+			--i;
+			goto err_out_unmap;
+		}
+	}
+
+
+	if (dev->flags & MLX4_FLAG_MSI_X) {
+		const char *eq_name;
+
+		for (i = 0; i < dev->caps.num_comp_vectors + 1; ++i) {
+			if (i < dev->caps.num_comp_vectors) {
+				snprintf(priv->eq_table.irq_names +
+					 i * MLX4_IRQNAME_SIZE,
+					 MLX4_IRQNAME_SIZE,
+					 "mlx4-comp-%d@pci:%s", i,
+					 pci_name(dev->pdev));
+			} else {
+				snprintf(priv->eq_table.irq_names +
+					 i * MLX4_IRQNAME_SIZE,
+					 MLX4_IRQNAME_SIZE,
+					 "mlx4-async@pci:%s",
+					 pci_name(dev->pdev));
+			}
+
+			eq_name = priv->eq_table.irq_names +
+				  i * MLX4_IRQNAME_SIZE;
+			err = request_irq(priv->eq_table.eq[i].irq,
+					  mlx4_msi_x_interrupt, 0, eq_name,
+					  priv->eq_table.eq + i);
+			if (err)
+				goto err_out_async;
+
+			priv->eq_table.eq[i].have_irq = 1;
+		}
+	} else {
+		snprintf(priv->eq_table.irq_names,
+			 MLX4_IRQNAME_SIZE,
+			 DRV_NAME "@pci:%s",
+			 pci_name(dev->pdev));
+		err = request_irq(dev->pdev->irq, mlx4_interrupt,
+				  IRQF_SHARED, priv->eq_table.irq_names, dev);
+		if (err)
+			goto err_out_async;
+
+		priv->eq_table.have_irq = 1;
+	}
+
+	err = mlx4_MAP_EQ(dev, get_async_ev_mask(dev), 0,
+			  priv->eq_table.eq[dev->caps.num_comp_vectors].eqn);
+	if (err)
+		mlx4_warn(dev, "MAP_EQ for async EQ %d failed (%d)\n",
+			   priv->eq_table.eq[dev->caps.num_comp_vectors].eqn, err);
+
+	for (i = 0; i < dev->caps.num_comp_vectors + 1; ++i)
+		eq_set_ci(&priv->eq_table.eq[i], 1);
+
+	return 0;
+
+err_out_async:
+	mlx4_free_eq(dev, &priv->eq_table.eq[dev->caps.num_comp_vectors]);
+
+err_out_comp:
+	i = dev->caps.num_comp_vectors - 1;
+
+err_out_unmap:
+	while (i >= 0) {
+		mlx4_free_eq(dev, &priv->eq_table.eq[i]);
+		--i;
+	}
+	if (!mlx4_is_slave(dev))
+		mlx4_unmap_clr_int(dev);
+	mlx4_free_irqs(dev);
+
+err_out_bitmap:
+	mlx4_unmap_uar(dev);
+	mlx4_bitmap_cleanup(&priv->eq_table.bitmap);
+
+err_out_free:
+	kfree(priv->eq_table.uar_map);
+
+	return err;
+}
+
+void mlx4_cleanup_eq_table(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int i;
+
+	mlx4_MAP_EQ(dev, get_async_ev_mask(dev), 1,
+		    priv->eq_table.eq[dev->caps.num_comp_vectors].eqn);
+
+	mlx4_free_irqs(dev);
+
+	for (i = 0; i < dev->caps.num_comp_vectors + dev->caps.comp_pool + 1; ++i)
+		mlx4_free_eq(dev, &priv->eq_table.eq[i]);
+
+	if (!mlx4_is_slave(dev))
+		mlx4_unmap_clr_int(dev);
+
+	mlx4_unmap_uar(dev);
+	mlx4_bitmap_cleanup(&priv->eq_table.bitmap);
+
+	kfree(priv->eq_table.uar_map);
+}
+
+/* A test that verifies that we can accept interrupts on all
+ * the irq vectors of the device.
+ * Interrupts are checked using the NOP command.
+ */
+int mlx4_test_interrupts(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int i;
+	int err;
+
+	err = mlx4_NOP(dev);
+	/* When not in MSI_X, there is only one irq to check */
+	if (!(dev->flags & MLX4_FLAG_MSI_X) || mlx4_is_slave(dev))
+		return err;
+
+	/* A loop over all completion vectors, for each vector we will check
+	 * whether it works by mapping command completions to that vector
+	 * and performing a NOP command
+	 */
+	for(i = 0; !err && (i < dev->caps.num_comp_vectors); ++i) {
+		/* Temporary use polling for command completions */
+		mlx4_cmd_use_polling(dev);
+
+		/* Map the new eq to handle all asynchronous events */
+		err = mlx4_MAP_EQ(dev, get_async_ev_mask(dev), 0,
+				  priv->eq_table.eq[i].eqn);
+		if (err) {
+			mlx4_warn(dev, "Failed mapping eq for interrupt test\n");
+			mlx4_cmd_use_events(dev);
+			break;
+		}
+
+		/* Go back to using events */
+		mlx4_cmd_use_events(dev);
+		err = mlx4_NOP(dev);
+	}
+
+	/* Return to default */
+	mlx4_MAP_EQ(dev, get_async_ev_mask(dev), 0,
+		    priv->eq_table.eq[dev->caps.num_comp_vectors].eqn);
+	return err;
+}
+EXPORT_SYMBOL(mlx4_test_interrupts);
+
+int mlx4_assign_eq(struct mlx4_dev *dev, char *name, struct cpu_rmap *rmap,
+		   int *vector)
+{
+
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int vec = 0, err = 0, i;
+
+	mutex_lock(&priv->msix_ctl.pool_lock);
+	for (i = 0; !vec && i < dev->caps.comp_pool; i++) {
+		if (~priv->msix_ctl.pool_bm & 1ULL << i) {
+			priv->msix_ctl.pool_bm |= 1ULL << i;
+			vec = dev->caps.num_comp_vectors + 1 + i;
+			snprintf(priv->eq_table.irq_names +
+					vec * MLX4_IRQNAME_SIZE,
+					MLX4_IRQNAME_SIZE, "%s", name);
+#ifdef CONFIG_RFS_ACCEL
+			if (rmap) {
+				err = irq_cpu_rmap_add(rmap,
+						       priv->eq_table.eq[vec].irq);
+				if (err)
+					mlx4_warn(dev, "Failed adding irq rmap\n");
+			}
+#endif
+			err = request_irq(priv->eq_table.eq[vec].irq,
+					  mlx4_msi_x_interrupt, 0,
+					  &priv->eq_table.irq_names[vec<<5],
+					  priv->eq_table.eq + vec);
+			if (err) {
+				/*zero out bit by fliping it*/
+				priv->msix_ctl.pool_bm ^= 1 << i;
+				vec = 0;
+				continue;
+				/*we dont want to break here*/
+			}
+
+			eq_set_ci(&priv->eq_table.eq[vec], 1);
+		}
+	}
+	mutex_unlock(&priv->msix_ctl.pool_lock);
+
+	if (vec) {
+		*vector = vec;
+	} else {
+		*vector = 0;
+		err = (i == dev->caps.comp_pool) ? -ENOSPC : err;
+	}
+	return err;
+}
+EXPORT_SYMBOL(mlx4_assign_eq);
+
+int mlx4_eq_get_irq(struct mlx4_dev *dev, int vec)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	return priv->eq_table.eq[vec].irq;
+}
+EXPORT_SYMBOL(mlx4_eq_get_irq);
+
+void mlx4_release_eq(struct mlx4_dev *dev, int vec)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	/*bm index*/
+	int i = vec - dev->caps.num_comp_vectors - 1;
+
+	if (likely(i >= 0)) {
+		/*sanity check , making sure were not trying to free irq's
+		  Belonging to a legacy EQ*/
+		mutex_lock(&priv->msix_ctl.pool_lock);
+		if (priv->msix_ctl.pool_bm & 1ULL << i) {
+			free_irq(priv->eq_table.eq[vec].irq,
+				 &priv->eq_table.eq[vec]);
+			priv->msix_ctl.pool_bm &= ~(1ULL << i);
+		}
+		mutex_unlock(&priv->msix_ctl.pool_lock);
+	}
+
+}
+EXPORT_SYMBOL(mlx4_release_eq);
+
diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c
new file mode 100644
index 0000000..2e88a23
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/fw.c
@@ -0,0 +1,2146 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/etherdevice.h>
+#include <linux/mlx4/cmd.h>
+#include <linux/module.h>
+#include <linux/cache.h>
+
+#include "fw.h"
+#include "icm.h"
+
+enum {
+	MLX4_COMMAND_INTERFACE_MIN_REV		= 2,
+	MLX4_COMMAND_INTERFACE_MAX_REV		= 3,
+	MLX4_COMMAND_INTERFACE_NEW_PORT_CMDS	= 3,
+};
+
+extern void __buggy_use_of_MLX4_GET(void);
+extern void __buggy_use_of_MLX4_PUT(void);
+
+static bool enable_qos;
+module_param(enable_qos, bool, 0444);
+MODULE_PARM_DESC(enable_qos, "Enable Quality of Service support in the HCA (default: off)");
+
+#define MLX4_GET(dest, source, offset)				      \
+	do {							      \
+		void *__p = (char *) (source) + (offset);	      \
+		switch (sizeof (dest)) {			      \
+		case 1: (dest) = *(u8 *) __p;	    break;	      \
+		case 2: (dest) = be16_to_cpup(__p); break;	      \
+		case 4: (dest) = be32_to_cpup(__p); break;	      \
+		case 8: (dest) = be64_to_cpup(__p); break;	      \
+		default: __buggy_use_of_MLX4_GET();		      \
+		}						      \
+	} while (0)
+
+#define MLX4_PUT(dest, source, offset)				      \
+	do {							      \
+		void *__d = ((char *) (dest) + (offset));	      \
+		switch (sizeof(source)) {			      \
+		case 1: *(u8 *) __d = (source);		       break; \
+		case 2:	*(__be16 *) __d = cpu_to_be16(source); break; \
+		case 4:	*(__be32 *) __d = cpu_to_be32(source); break; \
+		case 8:	*(__be64 *) __d = cpu_to_be64(source); break; \
+		default: __buggy_use_of_MLX4_PUT();		      \
+		}						      \
+	} while (0)
+
+static void dump_dev_cap_flags(struct mlx4_dev *dev, u64 flags)
+{
+	static const char *fname[] = {
+		[ 0] = "RC transport",
+		[ 1] = "UC transport",
+		[ 2] = "UD transport",
+		[ 3] = "XRC transport",
+		[ 4] = "reliable multicast",
+		[ 5] = "FCoIB support",
+		[ 6] = "SRQ support",
+		[ 7] = "IPoIB checksum offload",
+		[ 8] = "P_Key violation counter",
+		[ 9] = "Q_Key violation counter",
+		[10] = "VMM",
+		[12] = "Dual Port Different Protocol (DPDP) support",
+		[15] = "Big LSO headers",
+		[16] = "MW support",
+		[17] = "APM support",
+		[18] = "Atomic ops support",
+		[19] = "Raw multicast support",
+		[20] = "Address vector port checking support",
+		[21] = "UD multicast support",
+		[24] = "Demand paging support",
+		[25] = "Router support",
+		[30] = "IBoE support",
+		[32] = "Unicast loopback support",
+		[34] = "FCS header control",
+		[38] = "Wake On LAN support",
+		[40] = "UDP RSS support",
+		[41] = "Unicast VEP steering support",
+		[42] = "Multicast VEP steering support",
+		[48] = "Counters support",
+		[53] = "Port ETS Scheduler support",
+		[55] = "Port link type sensing support",
+		[59] = "Port management change event support",
+		[61] = "64 byte EQE support",
+		[62] = "64 byte CQE support",
+	};
+	int i;
+
+	mlx4_dbg(dev, "DEV_CAP flags:\n");
+	for (i = 0; i < ARRAY_SIZE(fname); ++i)
+		if (fname[i] && (flags & (1LL << i)))
+			mlx4_dbg(dev, "    %s\n", fname[i]);
+}
+
+static void dump_dev_cap_flags2(struct mlx4_dev *dev, u64 flags)
+{
+	static const char * const fname[] = {
+		[0] = "RSS support",
+		[1] = "RSS Toeplitz Hash Function support",
+		[2] = "RSS XOR Hash Function support",
+		[3] = "Device managed flow steering support",
+		[4] = "Automatic MAC reassignment support",
+		[5] = "Time stamping support",
+		[6] = "VST (control vlan insertion/stripping) support",
+		[7] = "FSM (MAC anti-spoofing) support",
+		[8] = "Dynamic QP updates support",
+		[9] = "Device managed flow steering IPoIB support",
+		[10] = "TCP/IP offloads/flow-steering for VXLAN support",
+		[11] = "MAD DEMUX (Secure-Host) support",
+		[12] = "Large cache line (>64B) CQE stride support",
+		[13] = "Large cache line (>64B) EQE stride support"
+	};
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(fname); ++i)
+		if (fname[i] && (flags & (1LL << i)))
+			mlx4_dbg(dev, "    %s\n", fname[i]);
+}
+
+int mlx4_MOD_STAT_CFG(struct mlx4_dev *dev, struct mlx4_mod_stat_cfg *cfg)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	u32 *inbox;
+	int err = 0;
+
+#define MOD_STAT_CFG_IN_SIZE		0x100
+
+#define MOD_STAT_CFG_PG_SZ_M_OFFSET	0x002
+#define MOD_STAT_CFG_PG_SZ_OFFSET	0x003
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	inbox = mailbox->buf;
+
+	MLX4_PUT(inbox, cfg->log_pg_sz, MOD_STAT_CFG_PG_SZ_OFFSET);
+	MLX4_PUT(inbox, cfg->log_pg_sz_m, MOD_STAT_CFG_PG_SZ_M_OFFSET);
+
+	err = mlx4_cmd(dev, mailbox->dma, 0, 0, MLX4_CMD_MOD_STAT_CFG,
+			MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+
+int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave,
+				struct mlx4_vhcr *vhcr,
+				struct mlx4_cmd_mailbox *inbox,
+				struct mlx4_cmd_mailbox *outbox,
+				struct mlx4_cmd_info *cmd)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	u8	field, port;
+	u32	size, proxy_qp, qkey;
+	int	err = 0;
+
+#define QUERY_FUNC_CAP_FLAGS_OFFSET		0x0
+#define QUERY_FUNC_CAP_NUM_PORTS_OFFSET		0x1
+#define QUERY_FUNC_CAP_PF_BHVR_OFFSET		0x4
+#define QUERY_FUNC_CAP_FMR_OFFSET		0x8
+#define QUERY_FUNC_CAP_QP_QUOTA_OFFSET_DEP	0x10
+#define QUERY_FUNC_CAP_CQ_QUOTA_OFFSET_DEP	0x14
+#define QUERY_FUNC_CAP_SRQ_QUOTA_OFFSET_DEP	0x18
+#define QUERY_FUNC_CAP_MPT_QUOTA_OFFSET_DEP	0x20
+#define QUERY_FUNC_CAP_MTT_QUOTA_OFFSET_DEP	0x24
+#define QUERY_FUNC_CAP_MCG_QUOTA_OFFSET_DEP	0x28
+#define QUERY_FUNC_CAP_MAX_EQ_OFFSET		0x2c
+#define QUERY_FUNC_CAP_RESERVED_EQ_OFFSET	0x30
+
+#define QUERY_FUNC_CAP_QP_QUOTA_OFFSET		0x50
+#define QUERY_FUNC_CAP_CQ_QUOTA_OFFSET		0x54
+#define QUERY_FUNC_CAP_SRQ_QUOTA_OFFSET		0x58
+#define QUERY_FUNC_CAP_MPT_QUOTA_OFFSET		0x60
+#define QUERY_FUNC_CAP_MTT_QUOTA_OFFSET		0x64
+#define QUERY_FUNC_CAP_MCG_QUOTA_OFFSET		0x68
+
+#define QUERY_FUNC_CAP_FMR_FLAG			0x80
+#define QUERY_FUNC_CAP_FLAG_RDMA		0x40
+#define QUERY_FUNC_CAP_FLAG_ETH			0x80
+#define QUERY_FUNC_CAP_FLAG_QUOTAS		0x10
+
+/* when opcode modifier = 1 */
+#define QUERY_FUNC_CAP_PHYS_PORT_OFFSET		0x3
+#define QUERY_FUNC_CAP_PRIV_VF_QKEY_OFFSET	0x4
+#define QUERY_FUNC_CAP_FLAGS0_OFFSET		0x8
+#define QUERY_FUNC_CAP_FLAGS1_OFFSET		0xc
+
+#define QUERY_FUNC_CAP_QP0_TUNNEL		0x10
+#define QUERY_FUNC_CAP_QP0_PROXY		0x14
+#define QUERY_FUNC_CAP_QP1_TUNNEL		0x18
+#define QUERY_FUNC_CAP_QP1_PROXY		0x1c
+#define QUERY_FUNC_CAP_PHYS_PORT_ID		0x28
+
+#define QUERY_FUNC_CAP_FLAGS1_FORCE_MAC		0x40
+#define QUERY_FUNC_CAP_FLAGS1_FORCE_VLAN	0x80
+#define QUERY_FUNC_CAP_FLAGS1_NIC_INFO			0x10
+#define QUERY_FUNC_CAP_VF_ENABLE_QP0		0x08
+
+#define QUERY_FUNC_CAP_FLAGS0_FORCE_PHY_WQE_GID 0x80
+
+	if (vhcr->op_modifier == 1) {
+		struct mlx4_active_ports actv_ports =
+			mlx4_get_active_ports(dev, slave);
+		int converted_port = mlx4_slave_convert_port(
+				dev, slave, vhcr->in_modifier);
+
+		if (converted_port < 0)
+			return -EINVAL;
+
+		vhcr->in_modifier = converted_port;
+		/* phys-port = logical-port */
+		field = vhcr->in_modifier -
+			find_first_bit(actv_ports.ports, dev->caps.num_ports);
+		MLX4_PUT(outbox->buf, field, QUERY_FUNC_CAP_PHYS_PORT_OFFSET);
+
+		port = vhcr->in_modifier;
+		proxy_qp = dev->phys_caps.base_proxy_sqpn + 8 * slave + port - 1;
+
+		/* Set nic_info bit to mark new fields support */
+		field  = QUERY_FUNC_CAP_FLAGS1_NIC_INFO;
+
+		if (mlx4_vf_smi_enabled(dev, slave, port) &&
+		    !mlx4_get_parav_qkey(dev, proxy_qp, &qkey)) {
+			field |= QUERY_FUNC_CAP_VF_ENABLE_QP0;
+			MLX4_PUT(outbox->buf, qkey,
+				 QUERY_FUNC_CAP_PRIV_VF_QKEY_OFFSET);
+		}
+		MLX4_PUT(outbox->buf, field, QUERY_FUNC_CAP_FLAGS1_OFFSET);
+
+		/* size is now the QP number */
+		size = dev->phys_caps.base_tunnel_sqpn + 8 * slave + port - 1;
+		MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_QP0_TUNNEL);
+
+		size += 2;
+		MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_QP1_TUNNEL);
+
+		MLX4_PUT(outbox->buf, proxy_qp, QUERY_FUNC_CAP_QP0_PROXY);
+		proxy_qp += 2;
+		MLX4_PUT(outbox->buf, proxy_qp, QUERY_FUNC_CAP_QP1_PROXY);
+
+		MLX4_PUT(outbox->buf, dev->caps.phys_port_id[vhcr->in_modifier],
+			 QUERY_FUNC_CAP_PHYS_PORT_ID);
+
+	} else if (vhcr->op_modifier == 0) {
+		struct mlx4_active_ports actv_ports =
+			mlx4_get_active_ports(dev, slave);
+		/* enable rdma and ethernet interfaces, and new quota locations */
+		field = (QUERY_FUNC_CAP_FLAG_ETH | QUERY_FUNC_CAP_FLAG_RDMA |
+			 QUERY_FUNC_CAP_FLAG_QUOTAS);
+		MLX4_PUT(outbox->buf, field, QUERY_FUNC_CAP_FLAGS_OFFSET);
+
+		field = min(
+			bitmap_weight(actv_ports.ports, dev->caps.num_ports),
+			dev->caps.num_ports);
+		MLX4_PUT(outbox->buf, field, QUERY_FUNC_CAP_NUM_PORTS_OFFSET);
+
+		size = dev->caps.function_caps; /* set PF behaviours */
+		MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_PF_BHVR_OFFSET);
+
+		field = 0; /* protected FMR support not available as yet */
+		MLX4_PUT(outbox->buf, field, QUERY_FUNC_CAP_FMR_OFFSET);
+
+		size = priv->mfunc.master.res_tracker.res_alloc[RES_QP].quota[slave];
+		MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_QP_QUOTA_OFFSET);
+		size = dev->caps.num_qps;
+		MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_QP_QUOTA_OFFSET_DEP);
+
+		size = priv->mfunc.master.res_tracker.res_alloc[RES_SRQ].quota[slave];
+		MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_SRQ_QUOTA_OFFSET);
+		size = dev->caps.num_srqs;
+		MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_SRQ_QUOTA_OFFSET_DEP);
+
+		size = priv->mfunc.master.res_tracker.res_alloc[RES_CQ].quota[slave];
+		MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_CQ_QUOTA_OFFSET);
+		size = dev->caps.num_cqs;
+		MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_CQ_QUOTA_OFFSET_DEP);
+
+		size = dev->caps.num_eqs;
+		MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_MAX_EQ_OFFSET);
+
+		size = dev->caps.reserved_eqs;
+		MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_RESERVED_EQ_OFFSET);
+
+		size = priv->mfunc.master.res_tracker.res_alloc[RES_MPT].quota[slave];
+		MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_MPT_QUOTA_OFFSET);
+		size = dev->caps.num_mpts;
+		MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_MPT_QUOTA_OFFSET_DEP);
+
+		size = priv->mfunc.master.res_tracker.res_alloc[RES_MTT].quota[slave];
+		MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_MTT_QUOTA_OFFSET);
+		size = dev->caps.num_mtts;
+		MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_MTT_QUOTA_OFFSET_DEP);
+
+		size = dev->caps.num_mgms + dev->caps.num_amgms;
+		MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_MCG_QUOTA_OFFSET);
+		MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_MCG_QUOTA_OFFSET_DEP);
+
+	} else
+		err = -EINVAL;
+
+	return err;
+}
+
+int mlx4_QUERY_FUNC_CAP(struct mlx4_dev *dev, u32 gen_or_port,
+			struct mlx4_func_cap *func_cap)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	u32			*outbox;
+	u8			field, op_modifier;
+	u32			size, qkey;
+	int			err = 0, quotas = 0;
+
+	op_modifier = !!gen_or_port; /* 0 = general, 1 = logical port */
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	err = mlx4_cmd_box(dev, 0, mailbox->dma, gen_or_port, op_modifier,
+			   MLX4_CMD_QUERY_FUNC_CAP,
+			   MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED);
+	if (err)
+		goto out;
+
+	outbox = mailbox->buf;
+
+	if (!op_modifier) {
+		MLX4_GET(field, outbox, QUERY_FUNC_CAP_FLAGS_OFFSET);
+		if (!(field & (QUERY_FUNC_CAP_FLAG_ETH | QUERY_FUNC_CAP_FLAG_RDMA))) {
+			mlx4_err(dev, "The host supports neither eth nor rdma interfaces\n");
+			err = -EPROTONOSUPPORT;
+			goto out;
+		}
+		func_cap->flags = field;
+		quotas = !!(func_cap->flags & QUERY_FUNC_CAP_FLAG_QUOTAS);
+
+		MLX4_GET(field, outbox, QUERY_FUNC_CAP_NUM_PORTS_OFFSET);
+		func_cap->num_ports = field;
+
+		MLX4_GET(size, outbox, QUERY_FUNC_CAP_PF_BHVR_OFFSET);
+		func_cap->pf_context_behaviour = size;
+
+		if (quotas) {
+			MLX4_GET(size, outbox, QUERY_FUNC_CAP_QP_QUOTA_OFFSET);
+			func_cap->qp_quota = size & 0xFFFFFF;
+
+			MLX4_GET(size, outbox, QUERY_FUNC_CAP_SRQ_QUOTA_OFFSET);
+			func_cap->srq_quota = size & 0xFFFFFF;
+
+			MLX4_GET(size, outbox, QUERY_FUNC_CAP_CQ_QUOTA_OFFSET);
+			func_cap->cq_quota = size & 0xFFFFFF;
+
+			MLX4_GET(size, outbox, QUERY_FUNC_CAP_MPT_QUOTA_OFFSET);
+			func_cap->mpt_quota = size & 0xFFFFFF;
+
+			MLX4_GET(size, outbox, QUERY_FUNC_CAP_MTT_QUOTA_OFFSET);
+			func_cap->mtt_quota = size & 0xFFFFFF;
+
+			MLX4_GET(size, outbox, QUERY_FUNC_CAP_MCG_QUOTA_OFFSET);
+			func_cap->mcg_quota = size & 0xFFFFFF;
+
+		} else {
+			MLX4_GET(size, outbox, QUERY_FUNC_CAP_QP_QUOTA_OFFSET_DEP);
+			func_cap->qp_quota = size & 0xFFFFFF;
+
+			MLX4_GET(size, outbox, QUERY_FUNC_CAP_SRQ_QUOTA_OFFSET_DEP);
+			func_cap->srq_quota = size & 0xFFFFFF;
+
+			MLX4_GET(size, outbox, QUERY_FUNC_CAP_CQ_QUOTA_OFFSET_DEP);
+			func_cap->cq_quota = size & 0xFFFFFF;
+
+			MLX4_GET(size, outbox, QUERY_FUNC_CAP_MPT_QUOTA_OFFSET_DEP);
+			func_cap->mpt_quota = size & 0xFFFFFF;
+
+			MLX4_GET(size, outbox, QUERY_FUNC_CAP_MTT_QUOTA_OFFSET_DEP);
+			func_cap->mtt_quota = size & 0xFFFFFF;
+
+			MLX4_GET(size, outbox, QUERY_FUNC_CAP_MCG_QUOTA_OFFSET_DEP);
+			func_cap->mcg_quota = size & 0xFFFFFF;
+		}
+		MLX4_GET(size, outbox, QUERY_FUNC_CAP_MAX_EQ_OFFSET);
+		func_cap->max_eq = size & 0xFFFFFF;
+
+		MLX4_GET(size, outbox, QUERY_FUNC_CAP_RESERVED_EQ_OFFSET);
+		func_cap->reserved_eq = size & 0xFFFFFF;
+
+		goto out;
+	}
+
+	/* logical port query */
+	if (gen_or_port > dev->caps.num_ports) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	MLX4_GET(func_cap->flags1, outbox, QUERY_FUNC_CAP_FLAGS1_OFFSET);
+	if (dev->caps.port_type[gen_or_port] == MLX4_PORT_TYPE_ETH) {
+		if (func_cap->flags1 & QUERY_FUNC_CAP_FLAGS1_FORCE_VLAN) {
+			mlx4_err(dev, "VLAN is enforced on this port\n");
+			err = -EPROTONOSUPPORT;
+			goto out;
+		}
+
+		if (func_cap->flags1 & QUERY_FUNC_CAP_FLAGS1_FORCE_MAC) {
+			mlx4_err(dev, "Force mac is enabled on this port\n");
+			err = -EPROTONOSUPPORT;
+			goto out;
+		}
+	} else if (dev->caps.port_type[gen_or_port] == MLX4_PORT_TYPE_IB) {
+		MLX4_GET(field, outbox, QUERY_FUNC_CAP_FLAGS0_OFFSET);
+		if (field & QUERY_FUNC_CAP_FLAGS0_FORCE_PHY_WQE_GID) {
+			mlx4_err(dev, "phy_wqe_gid is enforced on this ib port\n");
+			err = -EPROTONOSUPPORT;
+			goto out;
+		}
+	}
+
+	MLX4_GET(field, outbox, QUERY_FUNC_CAP_PHYS_PORT_OFFSET);
+	func_cap->physical_port = field;
+	if (func_cap->physical_port != gen_or_port) {
+		err = -ENOSYS;
+		goto out;
+	}
+
+	if (func_cap->flags1 & QUERY_FUNC_CAP_VF_ENABLE_QP0) {
+		MLX4_GET(qkey, outbox, QUERY_FUNC_CAP_PRIV_VF_QKEY_OFFSET);
+		func_cap->qp0_qkey = qkey;
+	} else {
+		func_cap->qp0_qkey = 0;
+	}
+
+	MLX4_GET(size, outbox, QUERY_FUNC_CAP_QP0_TUNNEL);
+	func_cap->qp0_tunnel_qpn = size & 0xFFFFFF;
+
+	MLX4_GET(size, outbox, QUERY_FUNC_CAP_QP0_PROXY);
+	func_cap->qp0_proxy_qpn = size & 0xFFFFFF;
+
+	MLX4_GET(size, outbox, QUERY_FUNC_CAP_QP1_TUNNEL);
+	func_cap->qp1_tunnel_qpn = size & 0xFFFFFF;
+
+	MLX4_GET(size, outbox, QUERY_FUNC_CAP_QP1_PROXY);
+	func_cap->qp1_proxy_qpn = size & 0xFFFFFF;
+
+	if (func_cap->flags1 & QUERY_FUNC_CAP_FLAGS1_NIC_INFO)
+		MLX4_GET(func_cap->phys_port_id, outbox,
+			 QUERY_FUNC_CAP_PHYS_PORT_ID);
+
+	/* All other resources are allocated by the master, but we still report
+	 * 'num' and 'reserved' capabilities as follows:
+	 * - num remains the maximum resource index
+	 * - 'num - reserved' is the total available objects of a resource, but
+	 *   resource indices may be less than 'reserved'
+	 * TODO: set per-resource quotas */
+
+out:
+	mlx4_free_cmd_mailbox(dev, mailbox);
+
+	return err;
+}
+
+int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	u32 *outbox;
+	u8 field;
+	u32 field32, flags, ext_flags;
+	u16 size;
+	u16 stat_rate;
+	int err;
+	int i;
+
+#define QUERY_DEV_CAP_OUT_SIZE		       0x100
+#define QUERY_DEV_CAP_MAX_SRQ_SZ_OFFSET		0x10
+#define QUERY_DEV_CAP_MAX_QP_SZ_OFFSET		0x11
+#define QUERY_DEV_CAP_RSVD_QP_OFFSET		0x12
+#define QUERY_DEV_CAP_MAX_QP_OFFSET		0x13
+#define QUERY_DEV_CAP_RSVD_SRQ_OFFSET		0x14
+#define QUERY_DEV_CAP_MAX_SRQ_OFFSET		0x15
+#define QUERY_DEV_CAP_RSVD_EEC_OFFSET		0x16
+#define QUERY_DEV_CAP_MAX_EEC_OFFSET		0x17
+#define QUERY_DEV_CAP_MAX_CQ_SZ_OFFSET		0x19
+#define QUERY_DEV_CAP_RSVD_CQ_OFFSET		0x1a
+#define QUERY_DEV_CAP_MAX_CQ_OFFSET		0x1b
+#define QUERY_DEV_CAP_MAX_MPT_OFFSET		0x1d
+#define QUERY_DEV_CAP_RSVD_EQ_OFFSET		0x1e
+#define QUERY_DEV_CAP_MAX_EQ_OFFSET		0x1f
+#define QUERY_DEV_CAP_RSVD_MTT_OFFSET		0x20
+#define QUERY_DEV_CAP_MAX_MRW_SZ_OFFSET		0x21
+#define QUERY_DEV_CAP_RSVD_MRW_OFFSET		0x22
+#define QUERY_DEV_CAP_MAX_MTT_SEG_OFFSET	0x23
+#define QUERY_DEV_CAP_MAX_AV_OFFSET		0x27
+#define QUERY_DEV_CAP_MAX_REQ_QP_OFFSET		0x29
+#define QUERY_DEV_CAP_MAX_RES_QP_OFFSET		0x2b
+#define QUERY_DEV_CAP_MAX_GSO_OFFSET		0x2d
+#define QUERY_DEV_CAP_RSS_OFFSET		0x2e
+#define QUERY_DEV_CAP_MAX_RDMA_OFFSET		0x2f
+#define QUERY_DEV_CAP_RSZ_SRQ_OFFSET		0x33
+#define QUERY_DEV_CAP_ACK_DELAY_OFFSET		0x35
+#define QUERY_DEV_CAP_MTU_WIDTH_OFFSET		0x36
+#define QUERY_DEV_CAP_VL_PORT_OFFSET		0x37
+#define QUERY_DEV_CAP_MAX_MSG_SZ_OFFSET		0x38
+#define QUERY_DEV_CAP_MAX_GID_OFFSET		0x3b
+#define QUERY_DEV_CAP_RATE_SUPPORT_OFFSET	0x3c
+#define QUERY_DEV_CAP_CQ_TS_SUPPORT_OFFSET	0x3e
+#define QUERY_DEV_CAP_MAX_PKEY_OFFSET		0x3f
+#define QUERY_DEV_CAP_EXT_FLAGS_OFFSET		0x40
+#define QUERY_DEV_CAP_FLAGS_OFFSET		0x44
+#define QUERY_DEV_CAP_RSVD_UAR_OFFSET		0x48
+#define QUERY_DEV_CAP_UAR_SZ_OFFSET		0x49
+#define QUERY_DEV_CAP_PAGE_SZ_OFFSET		0x4b
+#define QUERY_DEV_CAP_BF_OFFSET			0x4c
+#define QUERY_DEV_CAP_LOG_BF_REG_SZ_OFFSET	0x4d
+#define QUERY_DEV_CAP_LOG_MAX_BF_REGS_PER_PAGE_OFFSET	0x4e
+#define QUERY_DEV_CAP_LOG_MAX_BF_PAGES_OFFSET	0x4f
+#define QUERY_DEV_CAP_MAX_SG_SQ_OFFSET		0x51
+#define QUERY_DEV_CAP_MAX_DESC_SZ_SQ_OFFSET	0x52
+#define QUERY_DEV_CAP_MAX_SG_RQ_OFFSET		0x55
+#define QUERY_DEV_CAP_MAX_DESC_SZ_RQ_OFFSET	0x56
+#define QUERY_DEV_CAP_MAX_QP_MCG_OFFSET		0x61
+#define QUERY_DEV_CAP_RSVD_MCG_OFFSET		0x62
+#define QUERY_DEV_CAP_MAX_MCG_OFFSET		0x63
+#define QUERY_DEV_CAP_RSVD_PD_OFFSET		0x64
+#define QUERY_DEV_CAP_MAX_PD_OFFSET		0x65
+#define QUERY_DEV_CAP_RSVD_XRC_OFFSET		0x66
+#define QUERY_DEV_CAP_MAX_XRC_OFFSET		0x67
+#define QUERY_DEV_CAP_MAX_COUNTERS_OFFSET	0x68
+#define QUERY_DEV_CAP_EXT_2_FLAGS_OFFSET	0x70
+#define QUERY_DEV_CAP_FLOW_STEERING_IPOIB_OFFSET	0x74
+#define QUERY_DEV_CAP_FLOW_STEERING_RANGE_EN_OFFSET	0x76
+#define QUERY_DEV_CAP_FLOW_STEERING_MAX_QP_OFFSET	0x77
+#define QUERY_DEV_CAP_CQ_EQ_CACHE_LINE_STRIDE	0x7a
+#define QUERY_DEV_CAP_RDMARC_ENTRY_SZ_OFFSET	0x80
+#define QUERY_DEV_CAP_QPC_ENTRY_SZ_OFFSET	0x82
+#define QUERY_DEV_CAP_AUX_ENTRY_SZ_OFFSET	0x84
+#define QUERY_DEV_CAP_ALTC_ENTRY_SZ_OFFSET	0x86
+#define QUERY_DEV_CAP_EQC_ENTRY_SZ_OFFSET	0x88
+#define QUERY_DEV_CAP_CQC_ENTRY_SZ_OFFSET	0x8a
+#define QUERY_DEV_CAP_SRQ_ENTRY_SZ_OFFSET	0x8c
+#define QUERY_DEV_CAP_C_MPT_ENTRY_SZ_OFFSET	0x8e
+#define QUERY_DEV_CAP_MTT_ENTRY_SZ_OFFSET	0x90
+#define QUERY_DEV_CAP_D_MPT_ENTRY_SZ_OFFSET	0x92
+#define QUERY_DEV_CAP_BMME_FLAGS_OFFSET		0x94
+#define QUERY_DEV_CAP_RSVD_LKEY_OFFSET		0x98
+#define QUERY_DEV_CAP_MAX_ICM_SZ_OFFSET		0xa0
+#define QUERY_DEV_CAP_FW_REASSIGN_MAC		0x9d
+#define QUERY_DEV_CAP_VXLAN			0x9e
+#define QUERY_DEV_CAP_MAD_DEMUX_OFFSET		0xb0
+
+	dev_cap->flags2 = 0;
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	outbox = mailbox->buf;
+
+	err = mlx4_cmd_box(dev, 0, mailbox->dma, 0, 0, MLX4_CMD_QUERY_DEV_CAP,
+			   MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
+	if (err)
+		goto out;
+
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_QP_OFFSET);
+	dev_cap->reserved_qps = 1 << (field & 0xf);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_QP_OFFSET);
+	dev_cap->max_qps = 1 << (field & 0x1f);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_SRQ_OFFSET);
+	dev_cap->reserved_srqs = 1 << (field >> 4);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_SRQ_OFFSET);
+	dev_cap->max_srqs = 1 << (field & 0x1f);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_CQ_SZ_OFFSET);
+	dev_cap->max_cq_sz = 1 << field;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_CQ_OFFSET);
+	dev_cap->reserved_cqs = 1 << (field & 0xf);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_CQ_OFFSET);
+	dev_cap->max_cqs = 1 << (field & 0x1f);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_MPT_OFFSET);
+	dev_cap->max_mpts = 1 << (field & 0x3f);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_EQ_OFFSET);
+	dev_cap->reserved_eqs = field & 0xf;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_EQ_OFFSET);
+	dev_cap->max_eqs = 1 << (field & 0xf);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_MTT_OFFSET);
+	dev_cap->reserved_mtts = 1 << (field >> 4);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_MRW_SZ_OFFSET);
+	dev_cap->max_mrw_sz = 1 << field;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_MRW_OFFSET);
+	dev_cap->reserved_mrws = 1 << (field & 0xf);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_MTT_SEG_OFFSET);
+	dev_cap->max_mtt_seg = 1 << (field & 0x3f);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_REQ_QP_OFFSET);
+	dev_cap->max_requester_per_qp = 1 << (field & 0x3f);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_RES_QP_OFFSET);
+	dev_cap->max_responder_per_qp = 1 << (field & 0x3f);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_GSO_OFFSET);
+	field &= 0x1f;
+	if (!field)
+		dev_cap->max_gso_sz = 0;
+	else
+		dev_cap->max_gso_sz = 1 << field;
+
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_RSS_OFFSET);
+	if (field & 0x20)
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_RSS_XOR;
+	if (field & 0x10)
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_RSS_TOP;
+	field &= 0xf;
+	if (field) {
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_RSS;
+		dev_cap->max_rss_tbl_sz = 1 << field;
+	} else
+		dev_cap->max_rss_tbl_sz = 0;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_RDMA_OFFSET);
+	dev_cap->max_rdma_global = 1 << (field & 0x3f);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_ACK_DELAY_OFFSET);
+	dev_cap->local_ca_ack_delay = field & 0x1f;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_VL_PORT_OFFSET);
+	dev_cap->num_ports = field & 0xf;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_MSG_SZ_OFFSET);
+	dev_cap->max_msg_sz = 1 << (field & 0x1f);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_FLOW_STEERING_RANGE_EN_OFFSET);
+	if (field & 0x80)
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_FS_EN;
+	dev_cap->fs_log_max_ucast_qp_range_size = field & 0x1f;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_FLOW_STEERING_IPOIB_OFFSET);
+	if (field & 0x80)
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_DMFS_IPOIB;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_FLOW_STEERING_MAX_QP_OFFSET);
+	dev_cap->fs_max_num_qp_per_entry = field;
+	MLX4_GET(stat_rate, outbox, QUERY_DEV_CAP_RATE_SUPPORT_OFFSET);
+	dev_cap->stat_rate_support = stat_rate;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_CQ_TS_SUPPORT_OFFSET);
+	if (field & 0x80)
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_TS;
+	MLX4_GET(ext_flags, outbox, QUERY_DEV_CAP_EXT_FLAGS_OFFSET);
+	MLX4_GET(flags, outbox, QUERY_DEV_CAP_FLAGS_OFFSET);
+	dev_cap->flags = flags | (u64)ext_flags << 32;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_UAR_OFFSET);
+	dev_cap->reserved_uars = field >> 4;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_UAR_SZ_OFFSET);
+	dev_cap->uar_size = 1 << ((field & 0x3f) + 20);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_PAGE_SZ_OFFSET);
+	dev_cap->min_page_sz = 1 << field;
+
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_BF_OFFSET);
+	if (field & 0x80) {
+		MLX4_GET(field, outbox, QUERY_DEV_CAP_LOG_BF_REG_SZ_OFFSET);
+		dev_cap->bf_reg_size = 1 << (field & 0x1f);
+		MLX4_GET(field, outbox, QUERY_DEV_CAP_LOG_MAX_BF_REGS_PER_PAGE_OFFSET);
+		if ((1 << (field & 0x3f)) > (PAGE_SIZE / dev_cap->bf_reg_size))
+			field = 3;
+		dev_cap->bf_regs_per_page = 1 << (field & 0x3f);
+		mlx4_dbg(dev, "BlueFlame available (reg size %d, regs/page %d)\n",
+			 dev_cap->bf_reg_size, dev_cap->bf_regs_per_page);
+	} else {
+		dev_cap->bf_reg_size = 0;
+		mlx4_dbg(dev, "BlueFlame not available\n");
+	}
+
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_SG_SQ_OFFSET);
+	dev_cap->max_sq_sg = field;
+	MLX4_GET(size, outbox, QUERY_DEV_CAP_MAX_DESC_SZ_SQ_OFFSET);
+	dev_cap->max_sq_desc_sz = size;
+
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_QP_MCG_OFFSET);
+	dev_cap->max_qp_per_mcg = 1 << field;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_MCG_OFFSET);
+	dev_cap->reserved_mgms = field & 0xf;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_MCG_OFFSET);
+	dev_cap->max_mcgs = 1 << field;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_PD_OFFSET);
+	dev_cap->reserved_pds = field >> 4;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_PD_OFFSET);
+	dev_cap->max_pds = 1 << (field & 0x3f);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_XRC_OFFSET);
+	dev_cap->reserved_xrcds = field >> 4;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_XRC_OFFSET);
+	dev_cap->max_xrcds = 1 << (field & 0x1f);
+
+	MLX4_GET(size, outbox, QUERY_DEV_CAP_RDMARC_ENTRY_SZ_OFFSET);
+	dev_cap->rdmarc_entry_sz = size;
+	MLX4_GET(size, outbox, QUERY_DEV_CAP_QPC_ENTRY_SZ_OFFSET);
+	dev_cap->qpc_entry_sz = size;
+	MLX4_GET(size, outbox, QUERY_DEV_CAP_AUX_ENTRY_SZ_OFFSET);
+	dev_cap->aux_entry_sz = size;
+	MLX4_GET(size, outbox, QUERY_DEV_CAP_ALTC_ENTRY_SZ_OFFSET);
+	dev_cap->altc_entry_sz = size;
+	MLX4_GET(size, outbox, QUERY_DEV_CAP_EQC_ENTRY_SZ_OFFSET);
+	dev_cap->eqc_entry_sz = size;
+	MLX4_GET(size, outbox, QUERY_DEV_CAP_CQC_ENTRY_SZ_OFFSET);
+	dev_cap->cqc_entry_sz = size;
+	MLX4_GET(size, outbox, QUERY_DEV_CAP_SRQ_ENTRY_SZ_OFFSET);
+	dev_cap->srq_entry_sz = size;
+	MLX4_GET(size, outbox, QUERY_DEV_CAP_C_MPT_ENTRY_SZ_OFFSET);
+	dev_cap->cmpt_entry_sz = size;
+	MLX4_GET(size, outbox, QUERY_DEV_CAP_MTT_ENTRY_SZ_OFFSET);
+	dev_cap->mtt_entry_sz = size;
+	MLX4_GET(size, outbox, QUERY_DEV_CAP_D_MPT_ENTRY_SZ_OFFSET);
+	dev_cap->dmpt_entry_sz = size;
+
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_SRQ_SZ_OFFSET);
+	dev_cap->max_srq_sz = 1 << field;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_QP_SZ_OFFSET);
+	dev_cap->max_qp_sz = 1 << field;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_RSZ_SRQ_OFFSET);
+	dev_cap->resize_srq = field & 1;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_SG_RQ_OFFSET);
+	dev_cap->max_rq_sg = field;
+	MLX4_GET(size, outbox, QUERY_DEV_CAP_MAX_DESC_SZ_RQ_OFFSET);
+	dev_cap->max_rq_desc_sz = size;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_CQ_EQ_CACHE_LINE_STRIDE);
+	if (field & (1 << 6))
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_CQE_STRIDE;
+	if (field & (1 << 7))
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_EQE_STRIDE;
+
+	MLX4_GET(dev_cap->bmme_flags, outbox,
+		 QUERY_DEV_CAP_BMME_FLAGS_OFFSET);
+	MLX4_GET(dev_cap->reserved_lkey, outbox,
+		 QUERY_DEV_CAP_RSVD_LKEY_OFFSET);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_FW_REASSIGN_MAC);
+	if (field & 1<<6)
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_REASSIGN_MAC_EN;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_VXLAN);
+	if (field & 1<<3)
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_VXLAN_OFFLOADS;
+	MLX4_GET(dev_cap->max_icm_sz, outbox,
+		 QUERY_DEV_CAP_MAX_ICM_SZ_OFFSET);
+	if (dev_cap->flags & MLX4_DEV_CAP_FLAG_COUNTERS)
+		MLX4_GET(dev_cap->max_counters, outbox,
+			 QUERY_DEV_CAP_MAX_COUNTERS_OFFSET);
+
+	MLX4_GET(field32, outbox,
+		 QUERY_DEV_CAP_MAD_DEMUX_OFFSET);
+	if (field32 & (1 << 0))
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_MAD_DEMUX;
+
+	MLX4_GET(field32, outbox, QUERY_DEV_CAP_EXT_2_FLAGS_OFFSET);
+	if (field32 & (1 << 16))
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_UPDATE_QP;
+	if (field32 & (1 << 26))
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_VLAN_CONTROL;
+	if (field32 & (1 << 20))
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_FSM;
+
+	if (dev->flags & MLX4_FLAG_OLD_PORT_CMDS) {
+		for (i = 1; i <= dev_cap->num_ports; ++i) {
+			MLX4_GET(field, outbox, QUERY_DEV_CAP_VL_PORT_OFFSET);
+			dev_cap->max_vl[i]	   = field >> 4;
+			MLX4_GET(field, outbox, QUERY_DEV_CAP_MTU_WIDTH_OFFSET);
+			dev_cap->ib_mtu[i]	   = field >> 4;
+			dev_cap->max_port_width[i] = field & 0xf;
+			MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_GID_OFFSET);
+			dev_cap->max_gids[i]	   = 1 << (field & 0xf);
+			MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_PKEY_OFFSET);
+			dev_cap->max_pkeys[i]	   = 1 << (field & 0xf);
+		}
+	} else {
+#define QUERY_PORT_SUPPORTED_TYPE_OFFSET	0x00
+#define QUERY_PORT_MTU_OFFSET			0x01
+#define QUERY_PORT_ETH_MTU_OFFSET		0x02
+#define QUERY_PORT_WIDTH_OFFSET			0x06
+#define QUERY_PORT_MAX_GID_PKEY_OFFSET		0x07
+#define QUERY_PORT_MAX_MACVLAN_OFFSET		0x0a
+#define QUERY_PORT_MAX_VL_OFFSET		0x0b
+#define QUERY_PORT_MAC_OFFSET			0x10
+#define QUERY_PORT_TRANS_VENDOR_OFFSET		0x18
+#define QUERY_PORT_WAVELENGTH_OFFSET		0x1c
+#define QUERY_PORT_TRANS_CODE_OFFSET		0x20
+
+		for (i = 1; i <= dev_cap->num_ports; ++i) {
+			err = mlx4_cmd_box(dev, 0, mailbox->dma, i, 0, MLX4_CMD_QUERY_PORT,
+					   MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE);
+			if (err)
+				goto out;
+
+			MLX4_GET(field, outbox, QUERY_PORT_SUPPORTED_TYPE_OFFSET);
+			dev_cap->supported_port_types[i] = field & 3;
+			dev_cap->suggested_type[i] = (field >> 3) & 1;
+			dev_cap->default_sense[i] = (field >> 4) & 1;
+			MLX4_GET(field, outbox, QUERY_PORT_MTU_OFFSET);
+			dev_cap->ib_mtu[i]	   = field & 0xf;
+			MLX4_GET(field, outbox, QUERY_PORT_WIDTH_OFFSET);
+			dev_cap->max_port_width[i] = field & 0xf;
+			MLX4_GET(field, outbox, QUERY_PORT_MAX_GID_PKEY_OFFSET);
+			dev_cap->max_gids[i]	   = 1 << (field >> 4);
+			dev_cap->max_pkeys[i]	   = 1 << (field & 0xf);
+			MLX4_GET(field, outbox, QUERY_PORT_MAX_VL_OFFSET);
+			dev_cap->max_vl[i]	   = field & 0xf;
+			MLX4_GET(field, outbox, QUERY_PORT_MAX_MACVLAN_OFFSET);
+			dev_cap->log_max_macs[i]  = field & 0xf;
+			dev_cap->log_max_vlans[i] = field >> 4;
+			MLX4_GET(dev_cap->eth_mtu[i], outbox, QUERY_PORT_ETH_MTU_OFFSET);
+			MLX4_GET(dev_cap->def_mac[i], outbox, QUERY_PORT_MAC_OFFSET);
+			MLX4_GET(field32, outbox, QUERY_PORT_TRANS_VENDOR_OFFSET);
+			dev_cap->trans_type[i] = field32 >> 24;
+			dev_cap->vendor_oui[i] = field32 & 0xffffff;
+			MLX4_GET(dev_cap->wavelength[i], outbox, QUERY_PORT_WAVELENGTH_OFFSET);
+			MLX4_GET(dev_cap->trans_code[i], outbox, QUERY_PORT_TRANS_CODE_OFFSET);
+		}
+	}
+
+	mlx4_dbg(dev, "Base MM extensions: flags %08x, rsvd L_Key %08x\n",
+		 dev_cap->bmme_flags, dev_cap->reserved_lkey);
+
+	/*
+	 * Each UAR has 4 EQ doorbells; so if a UAR is reserved, then
+	 * we can't use any EQs whose doorbell falls on that page,
+	 * even if the EQ itself isn't reserved.
+	 */
+	dev_cap->reserved_eqs = max(dev_cap->reserved_uars * 4,
+				    dev_cap->reserved_eqs);
+
+	mlx4_dbg(dev, "Max ICM size %lld MB\n",
+		 (unsigned long long) dev_cap->max_icm_sz >> 20);
+	mlx4_dbg(dev, "Max QPs: %d, reserved QPs: %d, entry size: %d\n",
+		 dev_cap->max_qps, dev_cap->reserved_qps, dev_cap->qpc_entry_sz);
+	mlx4_dbg(dev, "Max SRQs: %d, reserved SRQs: %d, entry size: %d\n",
+		 dev_cap->max_srqs, dev_cap->reserved_srqs, dev_cap->srq_entry_sz);
+	mlx4_dbg(dev, "Max CQs: %d, reserved CQs: %d, entry size: %d\n",
+		 dev_cap->max_cqs, dev_cap->reserved_cqs, dev_cap->cqc_entry_sz);
+	mlx4_dbg(dev, "Max EQs: %d, reserved EQs: %d, entry size: %d\n",
+		 dev_cap->max_eqs, dev_cap->reserved_eqs, dev_cap->eqc_entry_sz);
+	mlx4_dbg(dev, "reserved MPTs: %d, reserved MTTs: %d\n",
+		 dev_cap->reserved_mrws, dev_cap->reserved_mtts);
+	mlx4_dbg(dev, "Max PDs: %d, reserved PDs: %d, reserved UARs: %d\n",
+		 dev_cap->max_pds, dev_cap->reserved_pds, dev_cap->reserved_uars);
+	mlx4_dbg(dev, "Max QP/MCG: %d, reserved MGMs: %d\n",
+		 dev_cap->max_pds, dev_cap->reserved_mgms);
+	mlx4_dbg(dev, "Max CQEs: %d, max WQEs: %d, max SRQ WQEs: %d\n",
+		 dev_cap->max_cq_sz, dev_cap->max_qp_sz, dev_cap->max_srq_sz);
+	mlx4_dbg(dev, "Local CA ACK delay: %d, max MTU: %d, port width cap: %d\n",
+		 dev_cap->local_ca_ack_delay, 128 << dev_cap->ib_mtu[1],
+		 dev_cap->max_port_width[1]);
+	mlx4_dbg(dev, "Max SQ desc size: %d, max SQ S/G: %d\n",
+		 dev_cap->max_sq_desc_sz, dev_cap->max_sq_sg);
+	mlx4_dbg(dev, "Max RQ desc size: %d, max RQ S/G: %d\n",
+		 dev_cap->max_rq_desc_sz, dev_cap->max_rq_sg);
+	mlx4_dbg(dev, "Max GSO size: %d\n", dev_cap->max_gso_sz);
+	mlx4_dbg(dev, "Max counters: %d\n", dev_cap->max_counters);
+	mlx4_dbg(dev, "Max RSS Table size: %d\n", dev_cap->max_rss_tbl_sz);
+
+	dump_dev_cap_flags(dev, dev_cap->flags);
+	dump_dev_cap_flags2(dev, dev_cap->flags2);
+
+out:
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+
+int mlx4_QUERY_DEV_CAP_wrapper(struct mlx4_dev *dev, int slave,
+			       struct mlx4_vhcr *vhcr,
+			       struct mlx4_cmd_mailbox *inbox,
+			       struct mlx4_cmd_mailbox *outbox,
+			       struct mlx4_cmd_info *cmd)
+{
+	u64	flags;
+	int	err = 0;
+	u8	field;
+	u32	bmme_flags;
+	int	real_port;
+	int	slave_port;
+	int	first_port;
+	struct mlx4_active_ports actv_ports;
+
+	err = mlx4_cmd_box(dev, 0, outbox->dma, 0, 0, MLX4_CMD_QUERY_DEV_CAP,
+			   MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
+	if (err)
+		return err;
+
+	/* add port mng change event capability and disable mw type 1
+	 * unconditionally to slaves
+	 */
+	MLX4_GET(flags, outbox->buf, QUERY_DEV_CAP_EXT_FLAGS_OFFSET);
+	flags |= MLX4_DEV_CAP_FLAG_PORT_MNG_CHG_EV;
+	flags &= ~MLX4_DEV_CAP_FLAG_MEM_WINDOW;
+	actv_ports = mlx4_get_active_ports(dev, slave);
+	first_port = find_first_bit(actv_ports.ports, dev->caps.num_ports);
+	for (slave_port = 0, real_port = first_port;
+	     real_port < first_port +
+	     bitmap_weight(actv_ports.ports, dev->caps.num_ports);
+	     ++real_port, ++slave_port) {
+		if (flags & (MLX4_DEV_CAP_FLAG_WOL_PORT1 << real_port))
+			flags |= MLX4_DEV_CAP_FLAG_WOL_PORT1 << slave_port;
+		else
+			flags &= ~(MLX4_DEV_CAP_FLAG_WOL_PORT1 << slave_port);
+	}
+	for (; slave_port < dev->caps.num_ports; ++slave_port)
+		flags &= ~(MLX4_DEV_CAP_FLAG_WOL_PORT1 << slave_port);
+	MLX4_PUT(outbox->buf, flags, QUERY_DEV_CAP_EXT_FLAGS_OFFSET);
+
+	MLX4_GET(field, outbox->buf, QUERY_DEV_CAP_VL_PORT_OFFSET);
+	field &= ~0x0F;
+	field |= bitmap_weight(actv_ports.ports, dev->caps.num_ports) & 0x0F;
+	MLX4_PUT(outbox->buf, field, QUERY_DEV_CAP_VL_PORT_OFFSET);
+
+	/* For guests, disable timestamp */
+	MLX4_GET(field, outbox->buf, QUERY_DEV_CAP_CQ_TS_SUPPORT_OFFSET);
+	field &= 0x7f;
+	MLX4_PUT(outbox->buf, field, QUERY_DEV_CAP_CQ_TS_SUPPORT_OFFSET);
+
+	/* For guests, disable vxlan tunneling */
+	MLX4_GET(field, outbox->buf, QUERY_DEV_CAP_VXLAN);
+	field &= 0xf7;
+	MLX4_PUT(outbox->buf, field, QUERY_DEV_CAP_VXLAN);
+
+	/* For guests, report Blueflame disabled */
+	MLX4_GET(field, outbox->buf, QUERY_DEV_CAP_BF_OFFSET);
+	field &= 0x7f;
+	MLX4_PUT(outbox->buf, field, QUERY_DEV_CAP_BF_OFFSET);
+
+	/* For guests, disable mw type 2 */
+	MLX4_GET(bmme_flags, outbox->buf, QUERY_DEV_CAP_BMME_FLAGS_OFFSET);
+	bmme_flags &= ~MLX4_BMME_FLAG_TYPE_2_WIN;
+	MLX4_PUT(outbox->buf, bmme_flags, QUERY_DEV_CAP_BMME_FLAGS_OFFSET);
+
+	/* turn off device-managed steering capability if not enabled */
+	if (dev->caps.steering_mode != MLX4_STEERING_MODE_DEVICE_MANAGED) {
+		MLX4_GET(field, outbox->buf,
+			 QUERY_DEV_CAP_FLOW_STEERING_RANGE_EN_OFFSET);
+		field &= 0x7f;
+		MLX4_PUT(outbox->buf, field,
+			 QUERY_DEV_CAP_FLOW_STEERING_RANGE_EN_OFFSET);
+	}
+
+	/* turn off ipoib managed steering for guests */
+	MLX4_GET(field, outbox->buf, QUERY_DEV_CAP_FLOW_STEERING_IPOIB_OFFSET);
+	field &= ~0x80;
+	MLX4_PUT(outbox->buf, field, QUERY_DEV_CAP_FLOW_STEERING_IPOIB_OFFSET);
+
+	return 0;
+}
+
+int mlx4_QUERY_PORT_wrapper(struct mlx4_dev *dev, int slave,
+			    struct mlx4_vhcr *vhcr,
+			    struct mlx4_cmd_mailbox *inbox,
+			    struct mlx4_cmd_mailbox *outbox,
+			    struct mlx4_cmd_info *cmd)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	u64 def_mac;
+	u8 port_type;
+	u16 short_field;
+	int err;
+	int admin_link_state;
+	int port = mlx4_slave_convert_port(dev, slave,
+					   vhcr->in_modifier & 0xFF);
+
+#define MLX4_VF_PORT_NO_LINK_SENSE_MASK	0xE0
+#define MLX4_PORT_LINK_UP_MASK		0x80
+#define QUERY_PORT_CUR_MAX_PKEY_OFFSET	0x0c
+#define QUERY_PORT_CUR_MAX_GID_OFFSET	0x0e
+
+	if (port < 0)
+		return -EINVAL;
+
+	/* Protect against untrusted guests: enforce that this is the
+	 * QUERY_PORT general query.
+	 */
+	if (vhcr->op_modifier || vhcr->in_modifier & ~0xFF)
+		return -EINVAL;
+
+	vhcr->in_modifier = port;
+
+	err = mlx4_cmd_box(dev, 0, outbox->dma, vhcr->in_modifier, 0,
+			   MLX4_CMD_QUERY_PORT, MLX4_CMD_TIME_CLASS_B,
+			   MLX4_CMD_NATIVE);
+
+	if (!err && dev->caps.function != slave) {
+		def_mac = priv->mfunc.master.vf_oper[slave].vport[vhcr->in_modifier].state.mac;
+		MLX4_PUT(outbox->buf, def_mac, QUERY_PORT_MAC_OFFSET);
+
+		/* get port type - currently only eth is enabled */
+		MLX4_GET(port_type, outbox->buf,
+			 QUERY_PORT_SUPPORTED_TYPE_OFFSET);
+
+		/* No link sensing allowed */
+		port_type &= MLX4_VF_PORT_NO_LINK_SENSE_MASK;
+		/* set port type to currently operating port type */
+		port_type |= (dev->caps.port_type[vhcr->in_modifier] & 0x3);
+
+		admin_link_state = priv->mfunc.master.vf_oper[slave].vport[vhcr->in_modifier].state.link_state;
+		if (IFLA_VF_LINK_STATE_ENABLE == admin_link_state)
+			port_type |= MLX4_PORT_LINK_UP_MASK;
+		else if (IFLA_VF_LINK_STATE_DISABLE == admin_link_state)
+			port_type &= ~MLX4_PORT_LINK_UP_MASK;
+
+		MLX4_PUT(outbox->buf, port_type,
+			 QUERY_PORT_SUPPORTED_TYPE_OFFSET);
+
+		if (dev->caps.port_type[vhcr->in_modifier] == MLX4_PORT_TYPE_ETH)
+			short_field = mlx4_get_slave_num_gids(dev, slave, port);
+		else
+			short_field = 1; /* slave max gids */
+		MLX4_PUT(outbox->buf, short_field,
+			 QUERY_PORT_CUR_MAX_GID_OFFSET);
+
+		short_field = dev->caps.pkey_table_len[vhcr->in_modifier];
+		MLX4_PUT(outbox->buf, short_field,
+			 QUERY_PORT_CUR_MAX_PKEY_OFFSET);
+	}
+
+	return err;
+}
+
+int mlx4_get_slave_pkey_gid_tbl_len(struct mlx4_dev *dev, u8 port,
+				    int *gid_tbl_len, int *pkey_tbl_len)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	u32			*outbox;
+	u16			field;
+	int			err;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	err =  mlx4_cmd_box(dev, 0, mailbox->dma, port, 0,
+			    MLX4_CMD_QUERY_PORT, MLX4_CMD_TIME_CLASS_B,
+			    MLX4_CMD_WRAPPED);
+	if (err)
+		goto out;
+
+	outbox = mailbox->buf;
+
+	MLX4_GET(field, outbox, QUERY_PORT_CUR_MAX_GID_OFFSET);
+	*gid_tbl_len = field;
+
+	MLX4_GET(field, outbox, QUERY_PORT_CUR_MAX_PKEY_OFFSET);
+	*pkey_tbl_len = field;
+
+out:
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+EXPORT_SYMBOL(mlx4_get_slave_pkey_gid_tbl_len);
+
+int mlx4_map_cmd(struct mlx4_dev *dev, u16 op, struct mlx4_icm *icm, u64 virt)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_icm_iter iter;
+	__be64 *pages;
+	int lg;
+	int nent = 0;
+	int i;
+	int err = 0;
+	int ts = 0, tc = 0;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	pages = mailbox->buf;
+
+	for (mlx4_icm_first(icm, &iter);
+	     !mlx4_icm_last(&iter);
+	     mlx4_icm_next(&iter)) {
+		/*
+		 * We have to pass pages that are aligned to their
+		 * size, so find the least significant 1 in the
+		 * address or size and use that as our log2 size.
+		 */
+		lg = ffs(mlx4_icm_addr(&iter) | mlx4_icm_size(&iter)) - 1;
+		if (lg < MLX4_ICM_PAGE_SHIFT) {
+			mlx4_warn(dev, "Got FW area not aligned to %d (%llx/%lx)\n",
+				  MLX4_ICM_PAGE_SIZE,
+				  (unsigned long long) mlx4_icm_addr(&iter),
+				  mlx4_icm_size(&iter));
+			err = -EINVAL;
+			goto out;
+		}
+
+		for (i = 0; i < mlx4_icm_size(&iter) >> lg; ++i) {
+			if (virt != -1) {
+				pages[nent * 2] = cpu_to_be64(virt);
+				virt += 1 << lg;
+			}
+
+			pages[nent * 2 + 1] =
+				cpu_to_be64((mlx4_icm_addr(&iter) + (i << lg)) |
+					    (lg - MLX4_ICM_PAGE_SHIFT));
+			ts += 1 << (lg - 10);
+			++tc;
+
+			if (++nent == MLX4_MAILBOX_SIZE / 16) {
+				err = mlx4_cmd(dev, mailbox->dma, nent, 0, op,
+						MLX4_CMD_TIME_CLASS_B,
+						MLX4_CMD_NATIVE);
+				if (err)
+					goto out;
+				nent = 0;
+			}
+		}
+	}
+
+	if (nent)
+		err = mlx4_cmd(dev, mailbox->dma, nent, 0, op,
+			       MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE);
+	if (err)
+		goto out;
+
+	switch (op) {
+	case MLX4_CMD_MAP_FA:
+		mlx4_dbg(dev, "Mapped %d chunks/%d KB for FW\n", tc, ts);
+		break;
+	case MLX4_CMD_MAP_ICM_AUX:
+		mlx4_dbg(dev, "Mapped %d chunks/%d KB for ICM aux\n", tc, ts);
+		break;
+	case MLX4_CMD_MAP_ICM:
+		mlx4_dbg(dev, "Mapped %d chunks/%d KB at %llx for ICM\n",
+			 tc, ts, (unsigned long long) virt - (ts << 10));
+		break;
+	}
+
+out:
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+
+int mlx4_MAP_FA(struct mlx4_dev *dev, struct mlx4_icm *icm)
+{
+	return mlx4_map_cmd(dev, MLX4_CMD_MAP_FA, icm, -1);
+}
+
+int mlx4_UNMAP_FA(struct mlx4_dev *dev)
+{
+	return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_UNMAP_FA,
+			MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE);
+}
+
+
+int mlx4_RUN_FW(struct mlx4_dev *dev)
+{
+	return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_RUN_FW,
+			MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
+}
+
+int mlx4_QUERY_FW(struct mlx4_dev *dev)
+{
+	struct mlx4_fw  *fw  = &mlx4_priv(dev)->fw;
+	struct mlx4_cmd *cmd = &mlx4_priv(dev)->cmd;
+	struct mlx4_cmd_mailbox *mailbox;
+	u32 *outbox;
+	int err = 0;
+	u64 fw_ver;
+	u16 cmd_if_rev;
+	u8 lg;
+
+#define QUERY_FW_OUT_SIZE             0x100
+#define QUERY_FW_VER_OFFSET            0x00
+#define QUERY_FW_PPF_ID		       0x09
+#define QUERY_FW_CMD_IF_REV_OFFSET     0x0a
+#define QUERY_FW_MAX_CMD_OFFSET        0x0f
+#define QUERY_FW_ERR_START_OFFSET      0x30
+#define QUERY_FW_ERR_SIZE_OFFSET       0x38
+#define QUERY_FW_ERR_BAR_OFFSET        0x3c
+
+#define QUERY_FW_SIZE_OFFSET           0x00
+#define QUERY_FW_CLR_INT_BASE_OFFSET   0x20
+#define QUERY_FW_CLR_INT_BAR_OFFSET    0x28
+
+#define QUERY_FW_COMM_BASE_OFFSET      0x40
+#define QUERY_FW_COMM_BAR_OFFSET       0x48
+
+#define QUERY_FW_CLOCK_OFFSET	       0x50
+#define QUERY_FW_CLOCK_BAR	       0x58
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	outbox = mailbox->buf;
+
+	err = mlx4_cmd_box(dev, 0, mailbox->dma, 0, 0, MLX4_CMD_QUERY_FW,
+			    MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
+	if (err)
+		goto out;
+
+	MLX4_GET(fw_ver, outbox, QUERY_FW_VER_OFFSET);
+	/*
+	 * FW subminor version is at more significant bits than minor
+	 * version, so swap here.
+	 */
+	dev->caps.fw_ver = (fw_ver & 0xffff00000000ull) |
+		((fw_ver & 0xffff0000ull) >> 16) |
+		((fw_ver & 0x0000ffffull) << 16);
+
+	MLX4_GET(lg, outbox, QUERY_FW_PPF_ID);
+	dev->caps.function = lg;
+
+	if (mlx4_is_slave(dev))
+		goto out;
+
+
+	MLX4_GET(cmd_if_rev, outbox, QUERY_FW_CMD_IF_REV_OFFSET);
+	if (cmd_if_rev < MLX4_COMMAND_INTERFACE_MIN_REV ||
+	    cmd_if_rev > MLX4_COMMAND_INTERFACE_MAX_REV) {
+		mlx4_err(dev, "Installed FW has unsupported command interface revision %d\n",
+			 cmd_if_rev);
+		mlx4_err(dev, "(Installed FW version is %d.%d.%03d)\n",
+			 (int) (dev->caps.fw_ver >> 32),
+			 (int) (dev->caps.fw_ver >> 16) & 0xffff,
+			 (int) dev->caps.fw_ver & 0xffff);
+		mlx4_err(dev, "This driver version supports only revisions %d to %d\n",
+			 MLX4_COMMAND_INTERFACE_MIN_REV, MLX4_COMMAND_INTERFACE_MAX_REV);
+		err = -ENODEV;
+		goto out;
+	}
+
+	if (cmd_if_rev < MLX4_COMMAND_INTERFACE_NEW_PORT_CMDS)
+		dev->flags |= MLX4_FLAG_OLD_PORT_CMDS;
+
+	MLX4_GET(lg, outbox, QUERY_FW_MAX_CMD_OFFSET);
+	cmd->max_cmds = 1 << lg;
+
+	mlx4_dbg(dev, "FW version %d.%d.%03d (cmd intf rev %d), max commands %d\n",
+		 (int) (dev->caps.fw_ver >> 32),
+		 (int) (dev->caps.fw_ver >> 16) & 0xffff,
+		 (int) dev->caps.fw_ver & 0xffff,
+		 cmd_if_rev, cmd->max_cmds);
+
+	MLX4_GET(fw->catas_offset, outbox, QUERY_FW_ERR_START_OFFSET);
+	MLX4_GET(fw->catas_size,   outbox, QUERY_FW_ERR_SIZE_OFFSET);
+	MLX4_GET(fw->catas_bar,    outbox, QUERY_FW_ERR_BAR_OFFSET);
+	fw->catas_bar = (fw->catas_bar >> 6) * 2;
+
+	mlx4_dbg(dev, "Catastrophic error buffer at 0x%llx, size 0x%x, BAR %d\n",
+		 (unsigned long long) fw->catas_offset, fw->catas_size, fw->catas_bar);
+
+	MLX4_GET(fw->fw_pages,     outbox, QUERY_FW_SIZE_OFFSET);
+	MLX4_GET(fw->clr_int_base, outbox, QUERY_FW_CLR_INT_BASE_OFFSET);
+	MLX4_GET(fw->clr_int_bar,  outbox, QUERY_FW_CLR_INT_BAR_OFFSET);
+	fw->clr_int_bar = (fw->clr_int_bar >> 6) * 2;
+
+	MLX4_GET(fw->comm_base, outbox, QUERY_FW_COMM_BASE_OFFSET);
+	MLX4_GET(fw->comm_bar,  outbox, QUERY_FW_COMM_BAR_OFFSET);
+	fw->comm_bar = (fw->comm_bar >> 6) * 2;
+	mlx4_dbg(dev, "Communication vector bar:%d offset:0x%llx\n",
+		 fw->comm_bar, fw->comm_base);
+	mlx4_dbg(dev, "FW size %d KB\n", fw->fw_pages >> 2);
+
+	MLX4_GET(fw->clock_offset, outbox, QUERY_FW_CLOCK_OFFSET);
+	MLX4_GET(fw->clock_bar,    outbox, QUERY_FW_CLOCK_BAR);
+	fw->clock_bar = (fw->clock_bar >> 6) * 2;
+	mlx4_dbg(dev, "Internal clock bar:%d offset:0x%llx\n",
+		 fw->clock_bar, fw->clock_offset);
+
+	/*
+	 * Round up number of system pages needed in case
+	 * MLX4_ICM_PAGE_SIZE < PAGE_SIZE.
+	 */
+	fw->fw_pages =
+		ALIGN(fw->fw_pages, PAGE_SIZE / MLX4_ICM_PAGE_SIZE) >>
+		(PAGE_SHIFT - MLX4_ICM_PAGE_SHIFT);
+
+	mlx4_dbg(dev, "Clear int @ %llx, BAR %d\n",
+		 (unsigned long long) fw->clr_int_base, fw->clr_int_bar);
+
+out:
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+
+int mlx4_QUERY_FW_wrapper(struct mlx4_dev *dev, int slave,
+			  struct mlx4_vhcr *vhcr,
+			  struct mlx4_cmd_mailbox *inbox,
+			  struct mlx4_cmd_mailbox *outbox,
+			  struct mlx4_cmd_info *cmd)
+{
+	u8 *outbuf;
+	int err;
+
+	outbuf = outbox->buf;
+	err = mlx4_cmd_box(dev, 0, outbox->dma, 0, 0, MLX4_CMD_QUERY_FW,
+			    MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
+	if (err)
+		return err;
+
+	/* for slaves, set pci PPF ID to invalid and zero out everything
+	 * else except FW version */
+	outbuf[0] = outbuf[1] = 0;
+	memset(&outbuf[8], 0, QUERY_FW_OUT_SIZE - 8);
+	outbuf[QUERY_FW_PPF_ID] = MLX4_INVALID_SLAVE_ID;
+
+	return 0;
+}
+
+static void get_board_id(void *vsd, char *board_id)
+{
+	int i;
+
+#define VSD_OFFSET_SIG1		0x00
+#define VSD_OFFSET_SIG2		0xde
+#define VSD_OFFSET_MLX_BOARD_ID	0xd0
+#define VSD_OFFSET_TS_BOARD_ID	0x20
+
+#define VSD_SIGNATURE_TOPSPIN	0x5ad
+
+	memset(board_id, 0, MLX4_BOARD_ID_LEN);
+
+	if (be16_to_cpup(vsd + VSD_OFFSET_SIG1) == VSD_SIGNATURE_TOPSPIN &&
+	    be16_to_cpup(vsd + VSD_OFFSET_SIG2) == VSD_SIGNATURE_TOPSPIN) {
+		strlcpy(board_id, vsd + VSD_OFFSET_TS_BOARD_ID, MLX4_BOARD_ID_LEN);
+	} else {
+		/*
+		 * The board ID is a string but the firmware byte
+		 * swaps each 4-byte word before passing it back to
+		 * us.  Therefore we need to swab it before printing.
+		 */
+		for (i = 0; i < 4; ++i)
+			((u32 *) board_id)[i] =
+				swab32(*(u32 *) (vsd + VSD_OFFSET_MLX_BOARD_ID + i * 4));
+	}
+}
+
+int mlx4_QUERY_ADAPTER(struct mlx4_dev *dev, struct mlx4_adapter *adapter)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	u32 *outbox;
+	int err;
+
+#define QUERY_ADAPTER_OUT_SIZE             0x100
+#define QUERY_ADAPTER_INTA_PIN_OFFSET      0x10
+#define QUERY_ADAPTER_VSD_OFFSET           0x20
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	outbox = mailbox->buf;
+
+	err = mlx4_cmd_box(dev, 0, mailbox->dma, 0, 0, MLX4_CMD_QUERY_ADAPTER,
+			   MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
+	if (err)
+		goto out;
+
+	MLX4_GET(adapter->inta_pin, outbox,    QUERY_ADAPTER_INTA_PIN_OFFSET);
+
+	get_board_id(outbox + QUERY_ADAPTER_VSD_OFFSET / 4,
+		     adapter->board_id);
+
+out:
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+
+int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	__be32 *inbox;
+	int err;
+
+#define INIT_HCA_IN_SIZE		 0x200
+#define INIT_HCA_VERSION_OFFSET		 0x000
+#define	 INIT_HCA_VERSION		 2
+#define INIT_HCA_VXLAN_OFFSET		 0x0c
+#define INIT_HCA_CACHELINE_SZ_OFFSET	 0x0e
+#define INIT_HCA_FLAGS_OFFSET		 0x014
+#define INIT_HCA_QPC_OFFSET		 0x020
+#define	 INIT_HCA_QPC_BASE_OFFSET	 (INIT_HCA_QPC_OFFSET + 0x10)
+#define	 INIT_HCA_LOG_QP_OFFSET		 (INIT_HCA_QPC_OFFSET + 0x17)
+#define	 INIT_HCA_SRQC_BASE_OFFSET	 (INIT_HCA_QPC_OFFSET + 0x28)
+#define	 INIT_HCA_LOG_SRQ_OFFSET	 (INIT_HCA_QPC_OFFSET + 0x2f)
+#define	 INIT_HCA_CQC_BASE_OFFSET	 (INIT_HCA_QPC_OFFSET + 0x30)
+#define	 INIT_HCA_LOG_CQ_OFFSET		 (INIT_HCA_QPC_OFFSET + 0x37)
+#define	 INIT_HCA_EQE_CQE_OFFSETS	 (INIT_HCA_QPC_OFFSET + 0x38)
+#define	 INIT_HCA_EQE_CQE_STRIDE_OFFSET  (INIT_HCA_QPC_OFFSET + 0x3b)
+#define	 INIT_HCA_ALTC_BASE_OFFSET	 (INIT_HCA_QPC_OFFSET + 0x40)
+#define	 INIT_HCA_AUXC_BASE_OFFSET	 (INIT_HCA_QPC_OFFSET + 0x50)
+#define	 INIT_HCA_EQC_BASE_OFFSET	 (INIT_HCA_QPC_OFFSET + 0x60)
+#define	 INIT_HCA_LOG_EQ_OFFSET		 (INIT_HCA_QPC_OFFSET + 0x67)
+#define	 INIT_HCA_RDMARC_BASE_OFFSET	 (INIT_HCA_QPC_OFFSET + 0x70)
+#define	 INIT_HCA_LOG_RD_OFFSET		 (INIT_HCA_QPC_OFFSET + 0x77)
+#define INIT_HCA_MCAST_OFFSET		 0x0c0
+#define	 INIT_HCA_MC_BASE_OFFSET	 (INIT_HCA_MCAST_OFFSET + 0x00)
+#define	 INIT_HCA_LOG_MC_ENTRY_SZ_OFFSET (INIT_HCA_MCAST_OFFSET + 0x12)
+#define	 INIT_HCA_LOG_MC_HASH_SZ_OFFSET	 (INIT_HCA_MCAST_OFFSET + 0x16)
+#define  INIT_HCA_UC_STEERING_OFFSET	 (INIT_HCA_MCAST_OFFSET + 0x18)
+#define	 INIT_HCA_LOG_MC_TABLE_SZ_OFFSET (INIT_HCA_MCAST_OFFSET + 0x1b)
+#define  INIT_HCA_DEVICE_MANAGED_FLOW_STEERING_EN	0x6
+#define  INIT_HCA_FS_PARAM_OFFSET         0x1d0
+#define  INIT_HCA_FS_BASE_OFFSET          (INIT_HCA_FS_PARAM_OFFSET + 0x00)
+#define  INIT_HCA_FS_LOG_ENTRY_SZ_OFFSET  (INIT_HCA_FS_PARAM_OFFSET + 0x12)
+#define  INIT_HCA_FS_LOG_TABLE_SZ_OFFSET  (INIT_HCA_FS_PARAM_OFFSET + 0x1b)
+#define  INIT_HCA_FS_ETH_BITS_OFFSET      (INIT_HCA_FS_PARAM_OFFSET + 0x21)
+#define  INIT_HCA_FS_ETH_NUM_ADDRS_OFFSET (INIT_HCA_FS_PARAM_OFFSET + 0x22)
+#define  INIT_HCA_FS_IB_BITS_OFFSET       (INIT_HCA_FS_PARAM_OFFSET + 0x25)
+#define  INIT_HCA_FS_IB_NUM_ADDRS_OFFSET  (INIT_HCA_FS_PARAM_OFFSET + 0x26)
+#define INIT_HCA_TPT_OFFSET		 0x0f0
+#define	 INIT_HCA_DMPT_BASE_OFFSET	 (INIT_HCA_TPT_OFFSET + 0x00)
+#define  INIT_HCA_TPT_MW_OFFSET		 (INIT_HCA_TPT_OFFSET + 0x08)
+#define	 INIT_HCA_LOG_MPT_SZ_OFFSET	 (INIT_HCA_TPT_OFFSET + 0x0b)
+#define	 INIT_HCA_MTT_BASE_OFFSET	 (INIT_HCA_TPT_OFFSET + 0x10)
+#define	 INIT_HCA_CMPT_BASE_OFFSET	 (INIT_HCA_TPT_OFFSET + 0x18)
+#define INIT_HCA_UAR_OFFSET		 0x120
+#define	 INIT_HCA_LOG_UAR_SZ_OFFSET	 (INIT_HCA_UAR_OFFSET + 0x0a)
+#define  INIT_HCA_UAR_PAGE_SZ_OFFSET     (INIT_HCA_UAR_OFFSET + 0x0b)
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	inbox = mailbox->buf;
+
+	*((u8 *) mailbox->buf + INIT_HCA_VERSION_OFFSET) = INIT_HCA_VERSION;
+
+	*((u8 *) mailbox->buf + INIT_HCA_CACHELINE_SZ_OFFSET) =
+		(ilog2(cache_line_size()) - 4) << 5;
+
+#if defined(__LITTLE_ENDIAN)
+	*(inbox + INIT_HCA_FLAGS_OFFSET / 4) &= ~cpu_to_be32(1 << 1);
+#elif defined(__BIG_ENDIAN)
+	*(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= cpu_to_be32(1 << 1);
+#else
+#error Host endianness not defined
+#endif
+	/* Check port for UD address vector: */
+	*(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= cpu_to_be32(1);
+
+	/* Enable IPoIB checksumming if we can: */
+	if (dev->caps.flags & MLX4_DEV_CAP_FLAG_IPOIB_CSUM)
+		*(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= cpu_to_be32(1 << 3);
+
+	/* Enable QoS support if module parameter set */
+	if (enable_qos)
+		*(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= cpu_to_be32(1 << 2);
+
+	/* enable counters */
+	if (dev->caps.flags & MLX4_DEV_CAP_FLAG_COUNTERS)
+		*(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= cpu_to_be32(1 << 4);
+
+	/* CX3 is capable of extending CQEs/EQEs from 32 to 64 bytes */
+	if (dev->caps.flags & MLX4_DEV_CAP_FLAG_64B_EQE) {
+		*(inbox + INIT_HCA_EQE_CQE_OFFSETS / 4) |= cpu_to_be32(1 << 29);
+		dev->caps.eqe_size   = 64;
+		dev->caps.eqe_factor = 1;
+	} else {
+		dev->caps.eqe_size   = 32;
+		dev->caps.eqe_factor = 0;
+	}
+
+	if (dev->caps.flags & MLX4_DEV_CAP_FLAG_64B_CQE) {
+		*(inbox + INIT_HCA_EQE_CQE_OFFSETS / 4) |= cpu_to_be32(1 << 30);
+		dev->caps.cqe_size   = 64;
+		dev->caps.userspace_caps |= MLX4_USER_DEV_CAP_LARGE_CQE;
+	} else {
+		dev->caps.cqe_size   = 32;
+	}
+
+	/* CX3 is capable of extending CQEs\EQEs to strides larger than 64B */
+	if ((dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_EQE_STRIDE) &&
+	    (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_CQE_STRIDE)) {
+		dev->caps.eqe_size = cache_line_size();
+		dev->caps.cqe_size = cache_line_size();
+		dev->caps.eqe_factor = 0;
+		MLX4_PUT(inbox, (u8)((ilog2(dev->caps.eqe_size) - 5) << 4 |
+				      (ilog2(dev->caps.eqe_size) - 5)),
+			 INIT_HCA_EQE_CQE_STRIDE_OFFSET);
+
+		/* User still need to know to support CQE > 32B */
+		dev->caps.userspace_caps |= MLX4_USER_DEV_CAP_LARGE_CQE;
+	}
+
+	/* QPC/EEC/CQC/EQC/RDMARC attributes */
+
+	MLX4_PUT(inbox, param->qpc_base,      INIT_HCA_QPC_BASE_OFFSET);
+	MLX4_PUT(inbox, param->log_num_qps,   INIT_HCA_LOG_QP_OFFSET);
+	MLX4_PUT(inbox, param->srqc_base,     INIT_HCA_SRQC_BASE_OFFSET);
+	MLX4_PUT(inbox, param->log_num_srqs,  INIT_HCA_LOG_SRQ_OFFSET);
+	MLX4_PUT(inbox, param->cqc_base,      INIT_HCA_CQC_BASE_OFFSET);
+	MLX4_PUT(inbox, param->log_num_cqs,   INIT_HCA_LOG_CQ_OFFSET);
+	MLX4_PUT(inbox, param->altc_base,     INIT_HCA_ALTC_BASE_OFFSET);
+	MLX4_PUT(inbox, param->auxc_base,     INIT_HCA_AUXC_BASE_OFFSET);
+	MLX4_PUT(inbox, param->eqc_base,      INIT_HCA_EQC_BASE_OFFSET);
+	MLX4_PUT(inbox, param->log_num_eqs,   INIT_HCA_LOG_EQ_OFFSET);
+	MLX4_PUT(inbox, param->rdmarc_base,   INIT_HCA_RDMARC_BASE_OFFSET);
+	MLX4_PUT(inbox, param->log_rd_per_qp, INIT_HCA_LOG_RD_OFFSET);
+
+	/* steering attributes */
+	if (dev->caps.steering_mode ==
+	    MLX4_STEERING_MODE_DEVICE_MANAGED) {
+		*(inbox + INIT_HCA_FLAGS_OFFSET / 4) |=
+			cpu_to_be32(1 <<
+				    INIT_HCA_DEVICE_MANAGED_FLOW_STEERING_EN);
+
+		MLX4_PUT(inbox, param->mc_base, INIT_HCA_FS_BASE_OFFSET);
+		MLX4_PUT(inbox, param->log_mc_entry_sz,
+			 INIT_HCA_FS_LOG_ENTRY_SZ_OFFSET);
+		MLX4_PUT(inbox, param->log_mc_table_sz,
+			 INIT_HCA_FS_LOG_TABLE_SZ_OFFSET);
+		/* Enable Ethernet flow steering
+		 * with udp unicast and tcp unicast
+		 */
+		MLX4_PUT(inbox, (u8) (MLX4_FS_UDP_UC_EN | MLX4_FS_TCP_UC_EN),
+			 INIT_HCA_FS_ETH_BITS_OFFSET);
+		MLX4_PUT(inbox, (u16) MLX4_FS_NUM_OF_L2_ADDR,
+			 INIT_HCA_FS_ETH_NUM_ADDRS_OFFSET);
+		/* Enable IPoIB flow steering
+		 * with udp unicast and tcp unicast
+		 */
+		MLX4_PUT(inbox, (u8) (MLX4_FS_UDP_UC_EN | MLX4_FS_TCP_UC_EN),
+			 INIT_HCA_FS_IB_BITS_OFFSET);
+		MLX4_PUT(inbox, (u16) MLX4_FS_NUM_OF_L2_ADDR,
+			 INIT_HCA_FS_IB_NUM_ADDRS_OFFSET);
+	} else {
+		MLX4_PUT(inbox, param->mc_base,	INIT_HCA_MC_BASE_OFFSET);
+		MLX4_PUT(inbox, param->log_mc_entry_sz,
+			 INIT_HCA_LOG_MC_ENTRY_SZ_OFFSET);
+		MLX4_PUT(inbox, param->log_mc_hash_sz,
+			 INIT_HCA_LOG_MC_HASH_SZ_OFFSET);
+		MLX4_PUT(inbox, param->log_mc_table_sz,
+			 INIT_HCA_LOG_MC_TABLE_SZ_OFFSET);
+		if (dev->caps.steering_mode == MLX4_STEERING_MODE_B0)
+			MLX4_PUT(inbox, (u8) (1 << 3),
+				 INIT_HCA_UC_STEERING_OFFSET);
+	}
+
+	/* TPT attributes */
+
+	MLX4_PUT(inbox, param->dmpt_base,  INIT_HCA_DMPT_BASE_OFFSET);
+	MLX4_PUT(inbox, param->mw_enabled, INIT_HCA_TPT_MW_OFFSET);
+	MLX4_PUT(inbox, param->log_mpt_sz, INIT_HCA_LOG_MPT_SZ_OFFSET);
+	MLX4_PUT(inbox, param->mtt_base,   INIT_HCA_MTT_BASE_OFFSET);
+	MLX4_PUT(inbox, param->cmpt_base,  INIT_HCA_CMPT_BASE_OFFSET);
+
+	/* UAR attributes */
+
+	MLX4_PUT(inbox, param->uar_page_sz,	INIT_HCA_UAR_PAGE_SZ_OFFSET);
+	MLX4_PUT(inbox, param->log_uar_sz,      INIT_HCA_LOG_UAR_SZ_OFFSET);
+
+	/* set parser VXLAN attributes */
+	if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_VXLAN_OFFLOADS) {
+		u8 parser_params = 0;
+		MLX4_PUT(inbox, parser_params,	INIT_HCA_VXLAN_OFFSET);
+	}
+
+	err = mlx4_cmd(dev, mailbox->dma, 0, 0, MLX4_CMD_INIT_HCA, 10000,
+		       MLX4_CMD_NATIVE);
+
+	if (err)
+		mlx4_err(dev, "INIT_HCA returns %d\n", err);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+
+int mlx4_QUERY_HCA(struct mlx4_dev *dev,
+		   struct mlx4_init_hca_param *param)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	__be32 *outbox;
+	u32 dword_field;
+	int err;
+	u8 byte_field;
+
+#define QUERY_HCA_GLOBAL_CAPS_OFFSET	0x04
+#define QUERY_HCA_CORE_CLOCK_OFFSET	0x0c
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	outbox = mailbox->buf;
+
+	err = mlx4_cmd_box(dev, 0, mailbox->dma, 0, 0,
+			   MLX4_CMD_QUERY_HCA,
+			   MLX4_CMD_TIME_CLASS_B,
+			   !mlx4_is_slave(dev));
+	if (err)
+		goto out;
+
+	MLX4_GET(param->global_caps, outbox, QUERY_HCA_GLOBAL_CAPS_OFFSET);
+	MLX4_GET(param->hca_core_clock, outbox, QUERY_HCA_CORE_CLOCK_OFFSET);
+
+	/* QPC/EEC/CQC/EQC/RDMARC attributes */
+
+	MLX4_GET(param->qpc_base,      outbox, INIT_HCA_QPC_BASE_OFFSET);
+	MLX4_GET(param->log_num_qps,   outbox, INIT_HCA_LOG_QP_OFFSET);
+	MLX4_GET(param->srqc_base,     outbox, INIT_HCA_SRQC_BASE_OFFSET);
+	MLX4_GET(param->log_num_srqs,  outbox, INIT_HCA_LOG_SRQ_OFFSET);
+	MLX4_GET(param->cqc_base,      outbox, INIT_HCA_CQC_BASE_OFFSET);
+	MLX4_GET(param->log_num_cqs,   outbox, INIT_HCA_LOG_CQ_OFFSET);
+	MLX4_GET(param->altc_base,     outbox, INIT_HCA_ALTC_BASE_OFFSET);
+	MLX4_GET(param->auxc_base,     outbox, INIT_HCA_AUXC_BASE_OFFSET);
+	MLX4_GET(param->eqc_base,      outbox, INIT_HCA_EQC_BASE_OFFSET);
+	MLX4_GET(param->log_num_eqs,   outbox, INIT_HCA_LOG_EQ_OFFSET);
+	MLX4_GET(param->rdmarc_base,   outbox, INIT_HCA_RDMARC_BASE_OFFSET);
+	MLX4_GET(param->log_rd_per_qp, outbox, INIT_HCA_LOG_RD_OFFSET);
+
+	MLX4_GET(dword_field, outbox, INIT_HCA_FLAGS_OFFSET);
+	if (dword_field & (1 << INIT_HCA_DEVICE_MANAGED_FLOW_STEERING_EN)) {
+		param->steering_mode = MLX4_STEERING_MODE_DEVICE_MANAGED;
+	} else {
+		MLX4_GET(byte_field, outbox, INIT_HCA_UC_STEERING_OFFSET);
+		if (byte_field & 0x8)
+			param->steering_mode = MLX4_STEERING_MODE_B0;
+		else
+			param->steering_mode = MLX4_STEERING_MODE_A0;
+	}
+	/* steering attributes */
+	if (param->steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) {
+		MLX4_GET(param->mc_base, outbox, INIT_HCA_FS_BASE_OFFSET);
+		MLX4_GET(param->log_mc_entry_sz, outbox,
+			 INIT_HCA_FS_LOG_ENTRY_SZ_OFFSET);
+		MLX4_GET(param->log_mc_table_sz, outbox,
+			 INIT_HCA_FS_LOG_TABLE_SZ_OFFSET);
+	} else {
+		MLX4_GET(param->mc_base, outbox, INIT_HCA_MC_BASE_OFFSET);
+		MLX4_GET(param->log_mc_entry_sz, outbox,
+			 INIT_HCA_LOG_MC_ENTRY_SZ_OFFSET);
+		MLX4_GET(param->log_mc_hash_sz,  outbox,
+			 INIT_HCA_LOG_MC_HASH_SZ_OFFSET);
+		MLX4_GET(param->log_mc_table_sz, outbox,
+			 INIT_HCA_LOG_MC_TABLE_SZ_OFFSET);
+	}
+
+	/* CX3 is capable of extending CQEs/EQEs from 32 to 64 bytes */
+	MLX4_GET(byte_field, outbox, INIT_HCA_EQE_CQE_OFFSETS);
+	if (byte_field & 0x20) /* 64-bytes eqe enabled */
+		param->dev_cap_enabled |= MLX4_DEV_CAP_64B_EQE_ENABLED;
+	if (byte_field & 0x40) /* 64-bytes cqe enabled */
+		param->dev_cap_enabled |= MLX4_DEV_CAP_64B_CQE_ENABLED;
+
+	/* CX3 is capable of extending CQEs\EQEs to strides larger than 64B */
+	MLX4_GET(byte_field, outbox, INIT_HCA_EQE_CQE_STRIDE_OFFSET);
+	if (byte_field) {
+		param->dev_cap_enabled |= MLX4_DEV_CAP_64B_EQE_ENABLED;
+		param->dev_cap_enabled |= MLX4_DEV_CAP_64B_CQE_ENABLED;
+		param->cqe_size = 1 << ((byte_field &
+					 MLX4_CQE_SIZE_MASK_STRIDE) + 5);
+		param->eqe_size = 1 << (((byte_field &
+					  MLX4_EQE_SIZE_MASK_STRIDE) >> 4) + 5);
+	}
+
+	/* TPT attributes */
+
+	MLX4_GET(param->dmpt_base,  outbox, INIT_HCA_DMPT_BASE_OFFSET);
+	MLX4_GET(param->mw_enabled, outbox, INIT_HCA_TPT_MW_OFFSET);
+	MLX4_GET(param->log_mpt_sz, outbox, INIT_HCA_LOG_MPT_SZ_OFFSET);
+	MLX4_GET(param->mtt_base,   outbox, INIT_HCA_MTT_BASE_OFFSET);
+	MLX4_GET(param->cmpt_base,  outbox, INIT_HCA_CMPT_BASE_OFFSET);
+
+	/* UAR attributes */
+
+	MLX4_GET(param->uar_page_sz, outbox, INIT_HCA_UAR_PAGE_SZ_OFFSET);
+	MLX4_GET(param->log_uar_sz, outbox, INIT_HCA_LOG_UAR_SZ_OFFSET);
+
+out:
+	mlx4_free_cmd_mailbox(dev, mailbox);
+
+	return err;
+}
+
+/* for IB-type ports only in SRIOV mode. Checks that both proxy QP0
+ * and real QP0 are active, so that the paravirtualized QP0 is ready
+ * to operate */
+static int check_qp0_state(struct mlx4_dev *dev, int function, int port)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	/* irrelevant if not infiniband */
+	if (priv->mfunc.master.qp0_state[port].proxy_qp0_active &&
+	    priv->mfunc.master.qp0_state[port].qp0_active)
+		return 1;
+	return 0;
+}
+
+int mlx4_INIT_PORT_wrapper(struct mlx4_dev *dev, int slave,
+			   struct mlx4_vhcr *vhcr,
+			   struct mlx4_cmd_mailbox *inbox,
+			   struct mlx4_cmd_mailbox *outbox,
+			   struct mlx4_cmd_info *cmd)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int port = mlx4_slave_convert_port(dev, slave, vhcr->in_modifier);
+	int err;
+
+	if (port < 0)
+		return -EINVAL;
+
+	if (priv->mfunc.master.slave_state[slave].init_port_mask & (1 << port))
+		return 0;
+
+	if (dev->caps.port_mask[port] != MLX4_PORT_TYPE_IB) {
+		/* Enable port only if it was previously disabled */
+		if (!priv->mfunc.master.init_port_ref[port]) {
+			err = mlx4_cmd(dev, 0, port, 0, MLX4_CMD_INIT_PORT,
+				       MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
+			if (err)
+				return err;
+		}
+		priv->mfunc.master.slave_state[slave].init_port_mask |= (1 << port);
+	} else {
+		if (slave == mlx4_master_func_num(dev)) {
+			if (check_qp0_state(dev, slave, port) &&
+			    !priv->mfunc.master.qp0_state[port].port_active) {
+				err = mlx4_cmd(dev, 0, port, 0, MLX4_CMD_INIT_PORT,
+					       MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
+				if (err)
+					return err;
+				priv->mfunc.master.qp0_state[port].port_active = 1;
+				priv->mfunc.master.slave_state[slave].init_port_mask |= (1 << port);
+			}
+		} else
+			priv->mfunc.master.slave_state[slave].init_port_mask |= (1 << port);
+	}
+	++priv->mfunc.master.init_port_ref[port];
+	return 0;
+}
+
+int mlx4_INIT_PORT(struct mlx4_dev *dev, int port)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	u32 *inbox;
+	int err;
+	u32 flags;
+	u16 field;
+
+	if (dev->flags & MLX4_FLAG_OLD_PORT_CMDS) {
+#define INIT_PORT_IN_SIZE          256
+#define INIT_PORT_FLAGS_OFFSET     0x00
+#define INIT_PORT_FLAG_SIG         (1 << 18)
+#define INIT_PORT_FLAG_NG          (1 << 17)
+#define INIT_PORT_FLAG_G0          (1 << 16)
+#define INIT_PORT_VL_SHIFT         4
+#define INIT_PORT_PORT_WIDTH_SHIFT 8
+#define INIT_PORT_MTU_OFFSET       0x04
+#define INIT_PORT_MAX_GID_OFFSET   0x06
+#define INIT_PORT_MAX_PKEY_OFFSET  0x0a
+#define INIT_PORT_GUID0_OFFSET     0x10
+#define INIT_PORT_NODE_GUID_OFFSET 0x18
+#define INIT_PORT_SI_GUID_OFFSET   0x20
+
+		mailbox = mlx4_alloc_cmd_mailbox(dev);
+		if (IS_ERR(mailbox))
+			return PTR_ERR(mailbox);
+		inbox = mailbox->buf;
+
+		flags = 0;
+		flags |= (dev->caps.vl_cap[port] & 0xf) << INIT_PORT_VL_SHIFT;
+		flags |= (dev->caps.port_width_cap[port] & 0xf) << INIT_PORT_PORT_WIDTH_SHIFT;
+		MLX4_PUT(inbox, flags,		  INIT_PORT_FLAGS_OFFSET);
+
+		field = 128 << dev->caps.ib_mtu_cap[port];
+		MLX4_PUT(inbox, field, INIT_PORT_MTU_OFFSET);
+		field = dev->caps.gid_table_len[port];
+		MLX4_PUT(inbox, field, INIT_PORT_MAX_GID_OFFSET);
+		field = dev->caps.pkey_table_len[port];
+		MLX4_PUT(inbox, field, INIT_PORT_MAX_PKEY_OFFSET);
+
+		err = mlx4_cmd(dev, mailbox->dma, port, 0, MLX4_CMD_INIT_PORT,
+			       MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
+
+		mlx4_free_cmd_mailbox(dev, mailbox);
+	} else
+		err = mlx4_cmd(dev, 0, port, 0, MLX4_CMD_INIT_PORT,
+			       MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_INIT_PORT);
+
+int mlx4_CLOSE_PORT_wrapper(struct mlx4_dev *dev, int slave,
+			    struct mlx4_vhcr *vhcr,
+			    struct mlx4_cmd_mailbox *inbox,
+			    struct mlx4_cmd_mailbox *outbox,
+			    struct mlx4_cmd_info *cmd)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int port = mlx4_slave_convert_port(dev, slave, vhcr->in_modifier);
+	int err;
+
+	if (port < 0)
+		return -EINVAL;
+
+	if (!(priv->mfunc.master.slave_state[slave].init_port_mask &
+	    (1 << port)))
+		return 0;
+
+	if (dev->caps.port_mask[port] != MLX4_PORT_TYPE_IB) {
+		if (priv->mfunc.master.init_port_ref[port] == 1) {
+			err = mlx4_cmd(dev, 0, port, 0, MLX4_CMD_CLOSE_PORT,
+				       1000, MLX4_CMD_NATIVE);
+			if (err)
+				return err;
+		}
+		priv->mfunc.master.slave_state[slave].init_port_mask &= ~(1 << port);
+	} else {
+		/* infiniband port */
+		if (slave == mlx4_master_func_num(dev)) {
+			if (!priv->mfunc.master.qp0_state[port].qp0_active &&
+			    priv->mfunc.master.qp0_state[port].port_active) {
+				err = mlx4_cmd(dev, 0, port, 0, MLX4_CMD_CLOSE_PORT,
+					       1000, MLX4_CMD_NATIVE);
+				if (err)
+					return err;
+				priv->mfunc.master.slave_state[slave].init_port_mask &= ~(1 << port);
+				priv->mfunc.master.qp0_state[port].port_active = 0;
+			}
+		} else
+			priv->mfunc.master.slave_state[slave].init_port_mask &= ~(1 << port);
+	}
+	--priv->mfunc.master.init_port_ref[port];
+	return 0;
+}
+
+int mlx4_CLOSE_PORT(struct mlx4_dev *dev, int port)
+{
+	return mlx4_cmd(dev, 0, port, 0, MLX4_CMD_CLOSE_PORT, 1000,
+			MLX4_CMD_WRAPPED);
+}
+EXPORT_SYMBOL_GPL(mlx4_CLOSE_PORT);
+
+int mlx4_CLOSE_HCA(struct mlx4_dev *dev, int panic)
+{
+	return mlx4_cmd(dev, 0, 0, panic, MLX4_CMD_CLOSE_HCA, 1000,
+			MLX4_CMD_NATIVE);
+}
+
+struct mlx4_config_dev {
+	__be32	update_flags;
+	__be32	rsdv1[3];
+	__be16	vxlan_udp_dport;
+	__be16	rsvd2;
+};
+
+#define MLX4_VXLAN_UDP_DPORT (1 << 0)
+
+static int mlx4_CONFIG_DEV(struct mlx4_dev *dev, struct mlx4_config_dev *config_dev)
+{
+	int err;
+	struct mlx4_cmd_mailbox *mailbox;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	memcpy(mailbox->buf, config_dev, sizeof(*config_dev));
+
+	err = mlx4_cmd(dev, mailbox->dma, 0, 0, MLX4_CMD_CONFIG_DEV,
+		       MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+
+int mlx4_config_vxlan_port(struct mlx4_dev *dev, __be16 udp_port)
+{
+	struct mlx4_config_dev config_dev;
+
+	memset(&config_dev, 0, sizeof(config_dev));
+	config_dev.update_flags    = cpu_to_be32(MLX4_VXLAN_UDP_DPORT);
+	config_dev.vxlan_udp_dport = udp_port;
+
+	return mlx4_CONFIG_DEV(dev, &config_dev);
+}
+EXPORT_SYMBOL_GPL(mlx4_config_vxlan_port);
+
+
+int mlx4_SET_ICM_SIZE(struct mlx4_dev *dev, u64 icm_size, u64 *aux_pages)
+{
+	int ret = mlx4_cmd_imm(dev, icm_size, aux_pages, 0, 0,
+			       MLX4_CMD_SET_ICM_SIZE,
+			       MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
+	if (ret)
+		return ret;
+
+	/*
+	 * Round up number of system pages needed in case
+	 * MLX4_ICM_PAGE_SIZE < PAGE_SIZE.
+	 */
+	*aux_pages = ALIGN(*aux_pages, PAGE_SIZE / MLX4_ICM_PAGE_SIZE) >>
+		(PAGE_SHIFT - MLX4_ICM_PAGE_SHIFT);
+
+	return 0;
+}
+
+int mlx4_NOP(struct mlx4_dev *dev)
+{
+	/* Input modifier of 0x1f means "finish as soon as possible." */
+	return mlx4_cmd(dev, 0, 0x1f, 0, MLX4_CMD_NOP, 100, MLX4_CMD_NATIVE);
+}
+
+int mlx4_get_phys_port_id(struct mlx4_dev *dev)
+{
+	u8 port;
+	u32 *outbox;
+	struct mlx4_cmd_mailbox *mailbox;
+	u32 in_mod;
+	u32 guid_hi, guid_lo;
+	int err, ret = 0;
+#define MOD_STAT_CFG_PORT_OFFSET 8
+#define MOD_STAT_CFG_GUID_H	 0X14
+#define MOD_STAT_CFG_GUID_L	 0X1c
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	outbox = mailbox->buf;
+
+	for (port = 1; port <= dev->caps.num_ports; port++) {
+		in_mod = port << MOD_STAT_CFG_PORT_OFFSET;
+		err = mlx4_cmd_box(dev, 0, mailbox->dma, in_mod, 0x2,
+				   MLX4_CMD_MOD_STAT_CFG, MLX4_CMD_TIME_CLASS_A,
+				   MLX4_CMD_NATIVE);
+		if (err) {
+			mlx4_err(dev, "Fail to get port %d uplink guid\n",
+				 port);
+			ret = err;
+		} else {
+			MLX4_GET(guid_hi, outbox, MOD_STAT_CFG_GUID_H);
+			MLX4_GET(guid_lo, outbox, MOD_STAT_CFG_GUID_L);
+			dev->caps.phys_port_id[port] = (u64)guid_lo |
+						       (u64)guid_hi << 32;
+		}
+	}
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return ret;
+}
+
+#define MLX4_WOL_SETUP_MODE (5 << 28)
+int mlx4_wol_read(struct mlx4_dev *dev, u64 *config, int port)
+{
+	u32 in_mod = MLX4_WOL_SETUP_MODE | port << 8;
+
+	return mlx4_cmd_imm(dev, 0, config, in_mod, 0x3,
+			    MLX4_CMD_MOD_STAT_CFG, MLX4_CMD_TIME_CLASS_A,
+			    MLX4_CMD_NATIVE);
+}
+EXPORT_SYMBOL_GPL(mlx4_wol_read);
+
+int mlx4_wol_write(struct mlx4_dev *dev, u64 config, int port)
+{
+	u32 in_mod = MLX4_WOL_SETUP_MODE | port << 8;
+
+	return mlx4_cmd(dev, config, in_mod, 0x1, MLX4_CMD_MOD_STAT_CFG,
+			MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
+}
+EXPORT_SYMBOL_GPL(mlx4_wol_write);
+
+enum {
+	ADD_TO_MCG = 0x26,
+};
+
+
+void mlx4_opreq_action(struct work_struct *work)
+{
+	struct mlx4_priv *priv = container_of(work, struct mlx4_priv,
+					      opreq_task);
+	struct mlx4_dev *dev = &priv->dev;
+	int num_tasks = atomic_read(&priv->opreq_count);
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_mgm *mgm;
+	u32 *outbox;
+	u32 modifier;
+	u16 token;
+	u16 type;
+	int err;
+	u32 num_qps;
+	struct mlx4_qp qp;
+	int i;
+	u8 rem_mcg;
+	u8 prot;
+
+#define GET_OP_REQ_MODIFIER_OFFSET	0x08
+#define GET_OP_REQ_TOKEN_OFFSET		0x14
+#define GET_OP_REQ_TYPE_OFFSET		0x1a
+#define GET_OP_REQ_DATA_OFFSET		0x20
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox)) {
+		mlx4_err(dev, "Failed to allocate mailbox for GET_OP_REQ\n");
+		return;
+	}
+	outbox = mailbox->buf;
+
+	while (num_tasks) {
+		err = mlx4_cmd_box(dev, 0, mailbox->dma, 0, 0,
+				   MLX4_CMD_GET_OP_REQ, MLX4_CMD_TIME_CLASS_A,
+				   MLX4_CMD_NATIVE);
+		if (err) {
+			mlx4_err(dev, "Failed to retrieve required operation: %d\n",
+				 err);
+			return;
+		}
+		MLX4_GET(modifier, outbox, GET_OP_REQ_MODIFIER_OFFSET);
+		MLX4_GET(token, outbox, GET_OP_REQ_TOKEN_OFFSET);
+		MLX4_GET(type, outbox, GET_OP_REQ_TYPE_OFFSET);
+		type &= 0xfff;
+
+		switch (type) {
+		case ADD_TO_MCG:
+			if (dev->caps.steering_mode ==
+			    MLX4_STEERING_MODE_DEVICE_MANAGED) {
+				mlx4_warn(dev, "ADD MCG operation is not supported in DEVICE_MANAGED steering mode\n");
+				err = EPERM;
+				break;
+			}
+			mgm = (struct mlx4_mgm *)((u8 *)(outbox) +
+						  GET_OP_REQ_DATA_OFFSET);
+			num_qps = be32_to_cpu(mgm->members_count) &
+				  MGM_QPN_MASK;
+			rem_mcg = ((u8 *)(&mgm->members_count))[0] & 1;
+			prot = ((u8 *)(&mgm->members_count))[0] >> 6;
+
+			for (i = 0; i < num_qps; i++) {
+				qp.qpn = be32_to_cpu(mgm->qp[i]);
+				if (rem_mcg)
+					err = mlx4_multicast_detach(dev, &qp,
+								    mgm->gid,
+								    prot, 0);
+				else
+					err = mlx4_multicast_attach(dev, &qp,
+								    mgm->gid,
+								    mgm->gid[5]
+								    , 0, prot,
+								    NULL);
+				if (err)
+					break;
+			}
+			break;
+		default:
+			mlx4_warn(dev, "Bad type for required operation\n");
+			err = EINVAL;
+			break;
+		}
+		err = mlx4_cmd(dev, 0, ((u32) err |
+					(__force u32)cpu_to_be32(token) << 16),
+			       1, MLX4_CMD_GET_OP_REQ, MLX4_CMD_TIME_CLASS_A,
+			       MLX4_CMD_NATIVE);
+		if (err) {
+			mlx4_err(dev, "Failed to acknowledge required request: %d\n",
+				 err);
+			goto out;
+		}
+		memset(outbox, 0, 0xffc);
+		num_tasks = atomic_dec_return(&priv->opreq_count);
+	}
+
+out:
+	mlx4_free_cmd_mailbox(dev, mailbox);
+}
+
+static int mlx4_check_smp_firewall_active(struct mlx4_dev *dev,
+					  struct mlx4_cmd_mailbox *mailbox)
+{
+#define MLX4_CMD_MAD_DEMUX_SET_ATTR_OFFSET		0x10
+#define MLX4_CMD_MAD_DEMUX_GETRESP_ATTR_OFFSET		0x20
+#define MLX4_CMD_MAD_DEMUX_TRAP_ATTR_OFFSET		0x40
+#define MLX4_CMD_MAD_DEMUX_TRAP_REPRESS_ATTR_OFFSET	0x70
+
+	u32 set_attr_mask, getresp_attr_mask;
+	u32 trap_attr_mask, traprepress_attr_mask;
+
+	MLX4_GET(set_attr_mask, mailbox->buf,
+		 MLX4_CMD_MAD_DEMUX_SET_ATTR_OFFSET);
+	mlx4_dbg(dev, "SMP firewall set_attribute_mask = 0x%x\n",
+		 set_attr_mask);
+
+	MLX4_GET(getresp_attr_mask, mailbox->buf,
+		 MLX4_CMD_MAD_DEMUX_GETRESP_ATTR_OFFSET);
+	mlx4_dbg(dev, "SMP firewall getresp_attribute_mask = 0x%x\n",
+		 getresp_attr_mask);
+
+	MLX4_GET(trap_attr_mask, mailbox->buf,
+		 MLX4_CMD_MAD_DEMUX_TRAP_ATTR_OFFSET);
+	mlx4_dbg(dev, "SMP firewall trap_attribute_mask = 0x%x\n",
+		 trap_attr_mask);
+
+	MLX4_GET(traprepress_attr_mask, mailbox->buf,
+		 MLX4_CMD_MAD_DEMUX_TRAP_REPRESS_ATTR_OFFSET);
+	mlx4_dbg(dev, "SMP firewall traprepress_attribute_mask = 0x%x\n",
+		 traprepress_attr_mask);
+
+	if (set_attr_mask && getresp_attr_mask && trap_attr_mask &&
+	    traprepress_attr_mask)
+		return 1;
+
+	return 0;
+}
+
+int mlx4_config_mad_demux(struct mlx4_dev *dev)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	int secure_host_active;
+	int err;
+
+	/* Check if mad_demux is supported */
+	if (!(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_MAD_DEMUX))
+		return 0;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox)) {
+		mlx4_warn(dev, "Failed to allocate mailbox for cmd MAD_DEMUX");
+		return -ENOMEM;
+	}
+
+	/* Query mad_demux to find out which MADs are handled by internal sma */
+	err = mlx4_cmd_box(dev, 0, mailbox->dma, 0x01 /* subn mgmt class */,
+			   MLX4_CMD_MAD_DEMUX_QUERY_RESTR, MLX4_CMD_MAD_DEMUX,
+			   MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE);
+	if (err) {
+		mlx4_warn(dev, "MLX4_CMD_MAD_DEMUX: query restrictions failed (%d)\n",
+			  err);
+		goto out;
+	}
+
+	secure_host_active = mlx4_check_smp_firewall_active(dev, mailbox);
+
+	/* Config mad_demux to handle all MADs returned by the query above */
+	err = mlx4_cmd(dev, mailbox->dma, 0x01 /* subn mgmt class */,
+		       MLX4_CMD_MAD_DEMUX_CONFIG, MLX4_CMD_MAD_DEMUX,
+		       MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE);
+	if (err) {
+		mlx4_warn(dev, "MLX4_CMD_MAD_DEMUX: configure failed (%d)\n", err);
+		goto out;
+	}
+
+	if (secure_host_active)
+		mlx4_warn(dev, "HCA operating in secure-host mode. SMP firewall activated.\n");
+out:
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.h b/drivers/net/ethernet/mellanox/mlx4/fw.h
new file mode 100644
index 0000000..9b835ae
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/fw.h
@@ -0,0 +1,230 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2006, 2007 Cisco Systems.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX4_FW_H
+#define MLX4_FW_H
+
+#include "mlx4.h"
+#include "icm.h"
+
+struct mlx4_mod_stat_cfg {
+	u8 log_pg_sz;
+	u8 log_pg_sz_m;
+};
+
+struct mlx4_dev_cap {
+	int max_srq_sz;
+	int max_qp_sz;
+	int reserved_qps;
+	int max_qps;
+	int reserved_srqs;
+	int max_srqs;
+	int max_cq_sz;
+	int reserved_cqs;
+	int max_cqs;
+	int max_mpts;
+	int reserved_eqs;
+	int max_eqs;
+	int reserved_mtts;
+	int max_mrw_sz;
+	int reserved_mrws;
+	int max_mtt_seg;
+	int max_requester_per_qp;
+	int max_responder_per_qp;
+	int max_rdma_global;
+	int local_ca_ack_delay;
+	int num_ports;
+	u32 max_msg_sz;
+	int ib_mtu[MLX4_MAX_PORTS + 1];
+	int max_port_width[MLX4_MAX_PORTS + 1];
+	int max_vl[MLX4_MAX_PORTS + 1];
+	int max_gids[MLX4_MAX_PORTS + 1];
+	int max_pkeys[MLX4_MAX_PORTS + 1];
+	u64 def_mac[MLX4_MAX_PORTS + 1];
+	u16 eth_mtu[MLX4_MAX_PORTS + 1];
+	int trans_type[MLX4_MAX_PORTS + 1];
+	int vendor_oui[MLX4_MAX_PORTS + 1];
+	u16 wavelength[MLX4_MAX_PORTS + 1];
+	u64 trans_code[MLX4_MAX_PORTS + 1];
+	u16 stat_rate_support;
+	int fs_log_max_ucast_qp_range_size;
+	int fs_max_num_qp_per_entry;
+	u64 flags;
+	u64 flags2;
+	int reserved_uars;
+	int uar_size;
+	int min_page_sz;
+	int bf_reg_size;
+	int bf_regs_per_page;
+	int max_sq_sg;
+	int max_sq_desc_sz;
+	int max_rq_sg;
+	int max_rq_desc_sz;
+	int max_qp_per_mcg;
+	int reserved_mgms;
+	int max_mcgs;
+	int reserved_pds;
+	int max_pds;
+	int reserved_xrcds;
+	int max_xrcds;
+	int qpc_entry_sz;
+	int rdmarc_entry_sz;
+	int altc_entry_sz;
+	int aux_entry_sz;
+	int srq_entry_sz;
+	int cqc_entry_sz;
+	int eqc_entry_sz;
+	int dmpt_entry_sz;
+	int cmpt_entry_sz;
+	int mtt_entry_sz;
+	int resize_srq;
+	u32 bmme_flags;
+	u32 reserved_lkey;
+	u64 max_icm_sz;
+	int max_gso_sz;
+	int max_rss_tbl_sz;
+	u8  supported_port_types[MLX4_MAX_PORTS + 1];
+	u8  suggested_type[MLX4_MAX_PORTS + 1];
+	u8  default_sense[MLX4_MAX_PORTS + 1];
+	u8  log_max_macs[MLX4_MAX_PORTS + 1];
+	u8  log_max_vlans[MLX4_MAX_PORTS + 1];
+	u32 max_counters;
+};
+
+struct mlx4_func_cap {
+	u8	num_ports;
+	u8	flags;
+	u32	pf_context_behaviour;
+	int	qp_quota;
+	int	cq_quota;
+	int	srq_quota;
+	int	mpt_quota;
+	int	mtt_quota;
+	int	max_eq;
+	int	reserved_eq;
+	int	mcg_quota;
+	u32	qp0_qkey;
+	u32	qp0_tunnel_qpn;
+	u32	qp0_proxy_qpn;
+	u32	qp1_tunnel_qpn;
+	u32	qp1_proxy_qpn;
+	u8	physical_port;
+	u8	port_flags;
+	u8	flags1;
+	u64	phys_port_id;
+};
+
+struct mlx4_adapter {
+	char board_id[MLX4_BOARD_ID_LEN];
+	u8   inta_pin;
+};
+
+struct mlx4_init_hca_param {
+	u64 qpc_base;
+	u64 rdmarc_base;
+	u64 auxc_base;
+	u64 altc_base;
+	u64 srqc_base;
+	u64 cqc_base;
+	u64 eqc_base;
+	u64 mc_base;
+	u64 dmpt_base;
+	u64 cmpt_base;
+	u64 mtt_base;
+	u64 global_caps;
+	u16 log_mc_entry_sz;
+	u16 log_mc_hash_sz;
+	u16 hca_core_clock; /* Internal Clock Frequency (in MHz) */
+	u8  log_num_qps;
+	u8  log_num_srqs;
+	u8  log_num_cqs;
+	u8  log_num_eqs;
+	u8  log_rd_per_qp;
+	u8  log_mc_table_sz;
+	u8  log_mpt_sz;
+	u8  log_uar_sz;
+	u8  mw_enabled;  /* Enable memory windows */
+	u8  uar_page_sz; /* log pg sz in 4k chunks */
+	u8  steering_mode; /* for QUERY_HCA */
+	u64 dev_cap_enabled;
+	u16 cqe_size; /* For use only when CQE stride feature enabled */
+	u16 eqe_size; /* For use only when EQE stride feature enabled */
+};
+
+struct mlx4_init_ib_param {
+	int port_width;
+	int vl_cap;
+	int mtu_cap;
+	u16 gid_cap;
+	u16 pkey_cap;
+	int set_guid0;
+	u64 guid0;
+	int set_node_guid;
+	u64 node_guid;
+	int set_si_guid;
+	u64 si_guid;
+};
+
+struct mlx4_set_ib_param {
+	int set_si_guid;
+	int reset_qkey_viol;
+	u64 si_guid;
+	u32 cap_mask;
+};
+
+int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap);
+int mlx4_QUERY_FUNC_CAP(struct mlx4_dev *dev, u32 gen_or_port,
+			struct mlx4_func_cap *func_cap);
+int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave,
+				struct mlx4_vhcr *vhcr,
+				struct mlx4_cmd_mailbox *inbox,
+				struct mlx4_cmd_mailbox *outbox,
+				struct mlx4_cmd_info *cmd);
+int mlx4_MAP_FA(struct mlx4_dev *dev, struct mlx4_icm *icm);
+int mlx4_UNMAP_FA(struct mlx4_dev *dev);
+int mlx4_RUN_FW(struct mlx4_dev *dev);
+int mlx4_QUERY_FW(struct mlx4_dev *dev);
+int mlx4_QUERY_ADAPTER(struct mlx4_dev *dev, struct mlx4_adapter *adapter);
+int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param);
+int mlx4_QUERY_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param);
+int mlx4_CLOSE_HCA(struct mlx4_dev *dev, int panic);
+int mlx4_map_cmd(struct mlx4_dev *dev, u16 op, struct mlx4_icm *icm, u64 virt);
+int mlx4_SET_ICM_SIZE(struct mlx4_dev *dev, u64 icm_size, u64 *aux_pages);
+int mlx4_MAP_ICM_AUX(struct mlx4_dev *dev, struct mlx4_icm *icm);
+int mlx4_UNMAP_ICM_AUX(struct mlx4_dev *dev);
+int mlx4_NOP(struct mlx4_dev *dev);
+int mlx4_MOD_STAT_CFG(struct mlx4_dev *dev, struct mlx4_mod_stat_cfg *cfg);
+void mlx4_opreq_action(struct work_struct *work);
+
+#endif /* MLX4_FW_H */
diff --git a/drivers/net/ethernet/mellanox/mlx4/icm.c b/drivers/net/ethernet/mellanox/mlx4/icm.c
new file mode 100644
index 0000000..97c9b1d
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/icm.c
@@ -0,0 +1,461 @@
+/*
+ * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/scatterlist.h>
+#include <linux/slab.h>
+
+#include <linux/mlx4/cmd.h>
+
+#include "mlx4.h"
+#include "icm.h"
+#include "fw.h"
+
+/*
+ * We allocate in as big chunks as we can, up to a maximum of 256 KB
+ * per chunk.
+ */
+enum {
+	MLX4_ICM_ALLOC_SIZE	= 1 << 18,
+	MLX4_TABLE_CHUNK_SIZE	= 1 << 18
+};
+
+static void mlx4_free_icm_pages(struct mlx4_dev *dev, struct mlx4_icm_chunk *chunk)
+{
+	int i;
+
+	if (chunk->nsg > 0)
+		pci_unmap_sg(dev->pdev, chunk->mem, chunk->npages,
+			     PCI_DMA_BIDIRECTIONAL);
+
+	for (i = 0; i < chunk->npages; ++i)
+		__free_pages(sg_page(&chunk->mem[i]),
+			     get_order(chunk->mem[i].length));
+}
+
+static void mlx4_free_icm_coherent(struct mlx4_dev *dev, struct mlx4_icm_chunk *chunk)
+{
+	int i;
+
+	for (i = 0; i < chunk->npages; ++i)
+		dma_free_coherent(&dev->pdev->dev, chunk->mem[i].length,
+				  lowmem_page_address(sg_page(&chunk->mem[i])),
+				  sg_dma_address(&chunk->mem[i]));
+}
+
+void mlx4_free_icm(struct mlx4_dev *dev, struct mlx4_icm *icm, int coherent)
+{
+	struct mlx4_icm_chunk *chunk, *tmp;
+
+	if (!icm)
+		return;
+
+	list_for_each_entry_safe(chunk, tmp, &icm->chunk_list, list) {
+		if (coherent)
+			mlx4_free_icm_coherent(dev, chunk);
+		else
+			mlx4_free_icm_pages(dev, chunk);
+
+		kfree(chunk);
+	}
+
+	kfree(icm);
+}
+
+static int mlx4_alloc_icm_pages(struct scatterlist *mem, int order,
+				gfp_t gfp_mask, int node)
+{
+	struct page *page;
+
+	page = alloc_pages_node(node, gfp_mask, order);
+	if (!page) {
+		page = alloc_pages(gfp_mask, order);
+		if (!page)
+			return -ENOMEM;
+	}
+
+	sg_set_page(mem, page, PAGE_SIZE << order, 0);
+	return 0;
+}
+
+static int mlx4_alloc_icm_coherent(struct device *dev, struct scatterlist *mem,
+				    int order, gfp_t gfp_mask)
+{
+	void *buf = dma_alloc_coherent(dev, PAGE_SIZE << order,
+				       &sg_dma_address(mem), gfp_mask);
+	if (!buf)
+		return -ENOMEM;
+
+	sg_set_buf(mem, buf, PAGE_SIZE << order);
+	BUG_ON(mem->offset);
+	sg_dma_len(mem) = PAGE_SIZE << order;
+	return 0;
+}
+
+struct mlx4_icm *mlx4_alloc_icm(struct mlx4_dev *dev, int npages,
+				gfp_t gfp_mask, int coherent)
+{
+	struct mlx4_icm *icm;
+	struct mlx4_icm_chunk *chunk = NULL;
+	int cur_order;
+	int ret;
+
+	/* We use sg_set_buf for coherent allocs, which assumes low memory */
+	BUG_ON(coherent && (gfp_mask & __GFP_HIGHMEM));
+
+	icm = kmalloc_node(sizeof(*icm),
+			   gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN),
+			   dev->numa_node);
+	if (!icm) {
+		icm = kmalloc(sizeof(*icm),
+			      gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN));
+		if (!icm)
+			return NULL;
+	}
+
+	icm->refcount = 0;
+	INIT_LIST_HEAD(&icm->chunk_list);
+
+	cur_order = get_order(MLX4_ICM_ALLOC_SIZE);
+
+	while (npages > 0) {
+		if (!chunk) {
+			chunk = kmalloc_node(sizeof(*chunk),
+					     gfp_mask & ~(__GFP_HIGHMEM |
+							  __GFP_NOWARN),
+					     dev->numa_node);
+			if (!chunk) {
+				chunk = kmalloc(sizeof(*chunk),
+						gfp_mask & ~(__GFP_HIGHMEM |
+							     __GFP_NOWARN));
+				if (!chunk)
+					goto fail;
+			}
+
+			sg_init_table(chunk->mem, MLX4_ICM_CHUNK_LEN);
+			chunk->npages = 0;
+			chunk->nsg    = 0;
+			list_add_tail(&chunk->list, &icm->chunk_list);
+		}
+
+		while (1 << cur_order > npages)
+			--cur_order;
+
+		if (coherent)
+			ret = mlx4_alloc_icm_coherent(&dev->pdev->dev,
+						      &chunk->mem[chunk->npages],
+						      cur_order, gfp_mask);
+		else
+			ret = mlx4_alloc_icm_pages(&chunk->mem[chunk->npages],
+						   cur_order, gfp_mask,
+						   dev->numa_node);
+
+		if (ret) {
+			if (--cur_order < 0)
+				goto fail;
+			else
+				continue;
+		}
+
+		++chunk->npages;
+
+		if (coherent)
+			++chunk->nsg;
+		else if (chunk->npages == MLX4_ICM_CHUNK_LEN) {
+			chunk->nsg = pci_map_sg(dev->pdev, chunk->mem,
+						chunk->npages,
+						PCI_DMA_BIDIRECTIONAL);
+
+			if (chunk->nsg <= 0)
+				goto fail;
+		}
+
+		if (chunk->npages == MLX4_ICM_CHUNK_LEN)
+			chunk = NULL;
+
+		npages -= 1 << cur_order;
+	}
+
+	if (!coherent && chunk) {
+		chunk->nsg = pci_map_sg(dev->pdev, chunk->mem,
+					chunk->npages,
+					PCI_DMA_BIDIRECTIONAL);
+
+		if (chunk->nsg <= 0)
+			goto fail;
+	}
+
+	return icm;
+
+fail:
+	mlx4_free_icm(dev, icm, coherent);
+	return NULL;
+}
+
+static int mlx4_MAP_ICM(struct mlx4_dev *dev, struct mlx4_icm *icm, u64 virt)
+{
+	return mlx4_map_cmd(dev, MLX4_CMD_MAP_ICM, icm, virt);
+}
+
+static int mlx4_UNMAP_ICM(struct mlx4_dev *dev, u64 virt, u32 page_count)
+{
+	return mlx4_cmd(dev, virt, page_count, 0, MLX4_CMD_UNMAP_ICM,
+			MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE);
+}
+
+int mlx4_MAP_ICM_AUX(struct mlx4_dev *dev, struct mlx4_icm *icm)
+{
+	return mlx4_map_cmd(dev, MLX4_CMD_MAP_ICM_AUX, icm, -1);
+}
+
+int mlx4_UNMAP_ICM_AUX(struct mlx4_dev *dev)
+{
+	return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_UNMAP_ICM_AUX,
+			MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE);
+}
+
+int mlx4_table_get(struct mlx4_dev *dev, struct mlx4_icm_table *table, u32 obj,
+		   gfp_t gfp)
+{
+	u32 i = (obj & (table->num_obj - 1)) /
+			(MLX4_TABLE_CHUNK_SIZE / table->obj_size);
+	int ret = 0;
+
+	mutex_lock(&table->mutex);
+
+	if (table->icm[i]) {
+		++table->icm[i]->refcount;
+		goto out;
+	}
+
+	table->icm[i] = mlx4_alloc_icm(dev, MLX4_TABLE_CHUNK_SIZE >> PAGE_SHIFT,
+				       (table->lowmem ? gfp : GFP_HIGHUSER) |
+				       __GFP_NOWARN, table->coherent);
+	if (!table->icm[i]) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	if (mlx4_MAP_ICM(dev, table->icm[i], table->virt +
+			 (u64) i * MLX4_TABLE_CHUNK_SIZE)) {
+		mlx4_free_icm(dev, table->icm[i], table->coherent);
+		table->icm[i] = NULL;
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	++table->icm[i]->refcount;
+
+out:
+	mutex_unlock(&table->mutex);
+	return ret;
+}
+
+void mlx4_table_put(struct mlx4_dev *dev, struct mlx4_icm_table *table, u32 obj)
+{
+	u32 i;
+	u64 offset;
+
+	i = (obj & (table->num_obj - 1)) / (MLX4_TABLE_CHUNK_SIZE / table->obj_size);
+
+	mutex_lock(&table->mutex);
+
+	if (--table->icm[i]->refcount == 0) {
+		offset = (u64) i * MLX4_TABLE_CHUNK_SIZE;
+		mlx4_UNMAP_ICM(dev, table->virt + offset,
+			       MLX4_TABLE_CHUNK_SIZE / MLX4_ICM_PAGE_SIZE);
+		mlx4_free_icm(dev, table->icm[i], table->coherent);
+		table->icm[i] = NULL;
+	}
+
+	mutex_unlock(&table->mutex);
+}
+
+void *mlx4_table_find(struct mlx4_icm_table *table, u32 obj,
+			dma_addr_t *dma_handle)
+{
+	int offset, dma_offset, i;
+	u64 idx;
+	struct mlx4_icm_chunk *chunk;
+	struct mlx4_icm *icm;
+	struct page *page = NULL;
+
+	if (!table->lowmem)
+		return NULL;
+
+	mutex_lock(&table->mutex);
+
+	idx = (u64) (obj & (table->num_obj - 1)) * table->obj_size;
+	icm = table->icm[idx / MLX4_TABLE_CHUNK_SIZE];
+	dma_offset = offset = idx % MLX4_TABLE_CHUNK_SIZE;
+
+	if (!icm)
+		goto out;
+
+	list_for_each_entry(chunk, &icm->chunk_list, list) {
+		for (i = 0; i < chunk->npages; ++i) {
+			if (dma_handle && dma_offset >= 0) {
+				if (sg_dma_len(&chunk->mem[i]) > dma_offset)
+					*dma_handle = sg_dma_address(&chunk->mem[i]) +
+						dma_offset;
+				dma_offset -= sg_dma_len(&chunk->mem[i]);
+			}
+			/*
+			 * DMA mapping can merge pages but not split them,
+			 * so if we found the page, dma_handle has already
+			 * been assigned to.
+			 */
+			if (chunk->mem[i].length > offset) {
+				page = sg_page(&chunk->mem[i]);
+				goto out;
+			}
+			offset -= chunk->mem[i].length;
+		}
+	}
+
+out:
+	mutex_unlock(&table->mutex);
+	return page ? lowmem_page_address(page) + offset : NULL;
+}
+
+int mlx4_table_get_range(struct mlx4_dev *dev, struct mlx4_icm_table *table,
+			 u32 start, u32 end)
+{
+	int inc = MLX4_TABLE_CHUNK_SIZE / table->obj_size;
+	int err;
+	u32 i;
+
+	for (i = start; i <= end; i += inc) {
+		err = mlx4_table_get(dev, table, i, GFP_KERNEL);
+		if (err)
+			goto fail;
+	}
+
+	return 0;
+
+fail:
+	while (i > start) {
+		i -= inc;
+		mlx4_table_put(dev, table, i);
+	}
+
+	return err;
+}
+
+void mlx4_table_put_range(struct mlx4_dev *dev, struct mlx4_icm_table *table,
+			  u32 start, u32 end)
+{
+	u32 i;
+
+	for (i = start; i <= end; i += MLX4_TABLE_CHUNK_SIZE / table->obj_size)
+		mlx4_table_put(dev, table, i);
+}
+
+int mlx4_init_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table,
+			u64 virt, int obj_size,	u32 nobj, int reserved,
+			int use_lowmem, int use_coherent)
+{
+	int obj_per_chunk;
+	int num_icm;
+	unsigned chunk_size;
+	int i;
+	u64 size;
+
+	obj_per_chunk = MLX4_TABLE_CHUNK_SIZE / obj_size;
+	num_icm = (nobj + obj_per_chunk - 1) / obj_per_chunk;
+
+	table->icm      = kcalloc(num_icm, sizeof *table->icm, GFP_KERNEL);
+	if (!table->icm)
+		return -ENOMEM;
+	table->virt     = virt;
+	table->num_icm  = num_icm;
+	table->num_obj  = nobj;
+	table->obj_size = obj_size;
+	table->lowmem   = use_lowmem;
+	table->coherent = use_coherent;
+	mutex_init(&table->mutex);
+
+	size = (u64) nobj * obj_size;
+	for (i = 0; i * MLX4_TABLE_CHUNK_SIZE < reserved * obj_size; ++i) {
+		chunk_size = MLX4_TABLE_CHUNK_SIZE;
+		if ((i + 1) * MLX4_TABLE_CHUNK_SIZE > size)
+			chunk_size = PAGE_ALIGN(size -
+					i * MLX4_TABLE_CHUNK_SIZE);
+
+		table->icm[i] = mlx4_alloc_icm(dev, chunk_size >> PAGE_SHIFT,
+					       (use_lowmem ? GFP_KERNEL : GFP_HIGHUSER) |
+					       __GFP_NOWARN, use_coherent);
+		if (!table->icm[i])
+			goto err;
+		if (mlx4_MAP_ICM(dev, table->icm[i], virt + i * MLX4_TABLE_CHUNK_SIZE)) {
+			mlx4_free_icm(dev, table->icm[i], use_coherent);
+			table->icm[i] = NULL;
+			goto err;
+		}
+
+		/*
+		 * Add a reference to this ICM chunk so that it never
+		 * gets freed (since it contains reserved firmware objects).
+		 */
+		++table->icm[i]->refcount;
+	}
+
+	return 0;
+
+err:
+	for (i = 0; i < num_icm; ++i)
+		if (table->icm[i]) {
+			mlx4_UNMAP_ICM(dev, virt + i * MLX4_TABLE_CHUNK_SIZE,
+				       MLX4_TABLE_CHUNK_SIZE / MLX4_ICM_PAGE_SIZE);
+			mlx4_free_icm(dev, table->icm[i], use_coherent);
+		}
+
+	kfree(table->icm);
+
+	return -ENOMEM;
+}
+
+void mlx4_cleanup_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table)
+{
+	int i;
+
+	for (i = 0; i < table->num_icm; ++i)
+		if (table->icm[i]) {
+			mlx4_UNMAP_ICM(dev, table->virt + i * MLX4_TABLE_CHUNK_SIZE,
+				       MLX4_TABLE_CHUNK_SIZE / MLX4_ICM_PAGE_SIZE);
+			mlx4_free_icm(dev, table->icm[i], table->coherent);
+		}
+
+	kfree(table->icm);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx4/icm.h b/drivers/net/ethernet/mellanox/mlx4/icm.h
new file mode 100644
index 0000000..0c73645
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/icm.h
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX4_ICM_H
+#define MLX4_ICM_H
+
+#include <linux/list.h>
+#include <linux/pci.h>
+#include <linux/mutex.h>
+
+#define MLX4_ICM_CHUNK_LEN						\
+	((256 - sizeof (struct list_head) - 2 * sizeof (int)) /		\
+	 (sizeof (struct scatterlist)))
+
+enum {
+	MLX4_ICM_PAGE_SHIFT	= 12,
+	MLX4_ICM_PAGE_SIZE	= 1 << MLX4_ICM_PAGE_SHIFT,
+};
+
+struct mlx4_icm_chunk {
+	struct list_head	list;
+	int			npages;
+	int			nsg;
+	struct scatterlist	mem[MLX4_ICM_CHUNK_LEN];
+};
+
+struct mlx4_icm {
+	struct list_head	chunk_list;
+	int			refcount;
+};
+
+struct mlx4_icm_iter {
+	struct mlx4_icm	       *icm;
+	struct mlx4_icm_chunk  *chunk;
+	int			page_idx;
+};
+
+struct mlx4_dev;
+
+struct mlx4_icm *mlx4_alloc_icm(struct mlx4_dev *dev, int npages,
+				gfp_t gfp_mask, int coherent);
+void mlx4_free_icm(struct mlx4_dev *dev, struct mlx4_icm *icm, int coherent);
+
+int mlx4_table_get(struct mlx4_dev *dev, struct mlx4_icm_table *table, u32 obj,
+		   gfp_t gfp);
+void mlx4_table_put(struct mlx4_dev *dev, struct mlx4_icm_table *table, u32 obj);
+int mlx4_table_get_range(struct mlx4_dev *dev, struct mlx4_icm_table *table,
+			 u32 start, u32 end);
+void mlx4_table_put_range(struct mlx4_dev *dev, struct mlx4_icm_table *table,
+			  u32 start, u32 end);
+int mlx4_init_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table,
+			u64 virt, int obj_size,	u32 nobj, int reserved,
+			int use_lowmem, int use_coherent);
+void mlx4_cleanup_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table);
+void *mlx4_table_find(struct mlx4_icm_table *table, u32 obj, dma_addr_t *dma_handle);
+
+static inline void mlx4_icm_first(struct mlx4_icm *icm,
+				  struct mlx4_icm_iter *iter)
+{
+	iter->icm      = icm;
+	iter->chunk    = list_empty(&icm->chunk_list) ?
+		NULL : list_entry(icm->chunk_list.next,
+				  struct mlx4_icm_chunk, list);
+	iter->page_idx = 0;
+}
+
+static inline int mlx4_icm_last(struct mlx4_icm_iter *iter)
+{
+	return !iter->chunk;
+}
+
+static inline void mlx4_icm_next(struct mlx4_icm_iter *iter)
+{
+	if (++iter->page_idx >= iter->chunk->nsg) {
+		if (iter->chunk->list.next == &iter->icm->chunk_list) {
+			iter->chunk = NULL;
+			return;
+		}
+
+		iter->chunk = list_entry(iter->chunk->list.next,
+					 struct mlx4_icm_chunk, list);
+		iter->page_idx = 0;
+	}
+}
+
+static inline dma_addr_t mlx4_icm_addr(struct mlx4_icm_iter *iter)
+{
+	return sg_dma_address(&iter->chunk->mem[iter->page_idx]);
+}
+
+static inline unsigned long mlx4_icm_size(struct mlx4_icm_iter *iter)
+{
+	return sg_dma_len(&iter->chunk->mem[iter->page_idx]);
+}
+
+int mlx4_MAP_ICM_AUX(struct mlx4_dev *dev, struct mlx4_icm *icm);
+int mlx4_UNMAP_ICM_AUX(struct mlx4_dev *dev);
+
+#endif /* MLX4_ICM_H */
diff --git a/drivers/net/ethernet/mellanox/mlx4/intf.c b/drivers/net/ethernet/mellanox/mlx4/intf.c
new file mode 100644
index 0000000..116895a
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/intf.c
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/slab.h>
+#include <linux/export.h>
+
+#include "mlx4.h"
+
+struct mlx4_device_context {
+	struct list_head	list;
+	struct mlx4_interface  *intf;
+	void		       *context;
+};
+
+static LIST_HEAD(intf_list);
+static LIST_HEAD(dev_list);
+static DEFINE_MUTEX(intf_mutex);
+
+static void mlx4_add_device(struct mlx4_interface *intf, struct mlx4_priv *priv)
+{
+	struct mlx4_device_context *dev_ctx;
+
+	dev_ctx = kmalloc(sizeof *dev_ctx, GFP_KERNEL);
+	if (!dev_ctx)
+		return;
+
+	dev_ctx->intf    = intf;
+	dev_ctx->context = intf->add(&priv->dev);
+
+	if (dev_ctx->context) {
+		spin_lock_irq(&priv->ctx_lock);
+		list_add_tail(&dev_ctx->list, &priv->ctx_list);
+		spin_unlock_irq(&priv->ctx_lock);
+	} else
+		kfree(dev_ctx);
+}
+
+static void mlx4_remove_device(struct mlx4_interface *intf, struct mlx4_priv *priv)
+{
+	struct mlx4_device_context *dev_ctx;
+
+	list_for_each_entry(dev_ctx, &priv->ctx_list, list)
+		if (dev_ctx->intf == intf) {
+			spin_lock_irq(&priv->ctx_lock);
+			list_del(&dev_ctx->list);
+			spin_unlock_irq(&priv->ctx_lock);
+
+			intf->remove(&priv->dev, dev_ctx->context);
+			kfree(dev_ctx);
+			return;
+		}
+}
+
+int mlx4_register_interface(struct mlx4_interface *intf)
+{
+	struct mlx4_priv *priv;
+
+	if (!intf->add || !intf->remove)
+		return -EINVAL;
+
+	mutex_lock(&intf_mutex);
+
+	list_add_tail(&intf->list, &intf_list);
+	list_for_each_entry(priv, &dev_list, dev_list)
+		mlx4_add_device(intf, priv);
+
+	mutex_unlock(&intf_mutex);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_register_interface);
+
+void mlx4_unregister_interface(struct mlx4_interface *intf)
+{
+	struct mlx4_priv *priv;
+
+	mutex_lock(&intf_mutex);
+
+	list_for_each_entry(priv, &dev_list, dev_list)
+		mlx4_remove_device(intf, priv);
+
+	list_del(&intf->list);
+
+	mutex_unlock(&intf_mutex);
+}
+EXPORT_SYMBOL_GPL(mlx4_unregister_interface);
+
+void mlx4_dispatch_event(struct mlx4_dev *dev, enum mlx4_dev_event type,
+			 unsigned long param)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_device_context *dev_ctx;
+	unsigned long flags;
+
+	spin_lock_irqsave(&priv->ctx_lock, flags);
+
+	list_for_each_entry(dev_ctx, &priv->ctx_list, list)
+		if (dev_ctx->intf->event)
+			dev_ctx->intf->event(dev, dev_ctx->context, type, param);
+
+	spin_unlock_irqrestore(&priv->ctx_lock, flags);
+}
+
+int mlx4_register_device(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_interface *intf;
+
+	mutex_lock(&intf_mutex);
+
+	list_add_tail(&priv->dev_list, &dev_list);
+	list_for_each_entry(intf, &intf_list, list)
+		mlx4_add_device(intf, priv);
+
+	mutex_unlock(&intf_mutex);
+	if (!mlx4_is_slave(dev))
+		mlx4_start_catas_poll(dev);
+
+	return 0;
+}
+
+void mlx4_unregister_device(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_interface *intf;
+
+	if (!mlx4_is_slave(dev))
+		mlx4_stop_catas_poll(dev);
+	mutex_lock(&intf_mutex);
+
+	list_for_each_entry(intf, &intf_list, list)
+		mlx4_remove_device(intf, priv);
+
+	list_del(&priv->dev_list);
+
+	mutex_unlock(&intf_mutex);
+}
+
+void *mlx4_get_protocol_dev(struct mlx4_dev *dev, enum mlx4_protocol proto, int port)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_device_context *dev_ctx;
+	unsigned long flags;
+	void *result = NULL;
+
+	spin_lock_irqsave(&priv->ctx_lock, flags);
+
+	list_for_each_entry(dev_ctx, &priv->ctx_list, list)
+		if (dev_ctx->intf->protocol == proto && dev_ctx->intf->get_dev) {
+			result = dev_ctx->intf->get_dev(dev, dev_ctx->context, port);
+			break;
+		}
+
+	spin_unlock_irqrestore(&priv->ctx_lock, flags);
+
+	return result;
+}
+EXPORT_SYMBOL_GPL(mlx4_get_protocol_dev);
diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c
new file mode 100644
index 0000000..90de6e1
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/main.c
@@ -0,0 +1,2998 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <linux/slab.h>
+#include <linux/io-mapping.h>
+#include <linux/delay.h>
+#include <linux/kmod.h>
+
+#include <linux/mlx4/device.h>
+#include <linux/mlx4/doorbell.h>
+
+#include "mlx4.h"
+#include "fw.h"
+#include "icm.h"
+
+MODULE_AUTHOR("Roland Dreier");
+MODULE_DESCRIPTION("Mellanox ConnectX HCA low-level driver");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_VERSION(DRV_VERSION);
+
+struct workqueue_struct *mlx4_wq;
+
+#ifdef CONFIG_MLX4_DEBUG
+
+int mlx4_debug_level = 0;
+module_param_named(debug_level, mlx4_debug_level, int, 0644);
+MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0");
+
+#endif /* CONFIG_MLX4_DEBUG */
+
+#ifdef CONFIG_PCI_MSI
+
+static int msi_x = 1;
+module_param(msi_x, int, 0444);
+MODULE_PARM_DESC(msi_x, "attempt to use MSI-X if nonzero");
+
+#else /* CONFIG_PCI_MSI */
+
+#define msi_x (0)
+
+#endif /* CONFIG_PCI_MSI */
+
+static uint8_t num_vfs[3] = {0, 0, 0};
+static int num_vfs_argc;
+module_param_array(num_vfs, byte , &num_vfs_argc, 0444);
+MODULE_PARM_DESC(num_vfs, "enable #num_vfs functions if num_vfs > 0\n"
+			  "num_vfs=port1,port2,port1+2");
+
+static uint8_t probe_vf[3] = {0, 0, 0};
+static int probe_vfs_argc;
+module_param_array(probe_vf, byte, &probe_vfs_argc, 0444);
+MODULE_PARM_DESC(probe_vf, "number of vfs to probe by pf driver (num_vfs > 0)\n"
+			   "probe_vf=port1,port2,port1+2");
+
+int mlx4_log_num_mgm_entry_size = MLX4_DEFAULT_MGM_LOG_ENTRY_SIZE;
+module_param_named(log_num_mgm_entry_size,
+			mlx4_log_num_mgm_entry_size, int, 0444);
+MODULE_PARM_DESC(log_num_mgm_entry_size, "log mgm size, that defines the num"
+					 " of qp per mcg, for example:"
+					 " 10 gives 248.range: 7 <="
+					 " log_num_mgm_entry_size <= 12."
+					 " To activate device managed"
+					 " flow steering when available, set to -1");
+
+static bool enable_64b_cqe_eqe = true;
+module_param(enable_64b_cqe_eqe, bool, 0444);
+MODULE_PARM_DESC(enable_64b_cqe_eqe,
+		 "Enable 64 byte CQEs/EQEs when the FW supports this (default: True)");
+
+#define PF_CONTEXT_BEHAVIOUR_MASK	(MLX4_FUNC_CAP_64B_EQE_CQE | \
+					 MLX4_FUNC_CAP_EQE_CQE_STRIDE)
+
+static char mlx4_version[] =
+	DRV_NAME ": Mellanox ConnectX core driver v"
+	DRV_VERSION " (" DRV_RELDATE ")\n";
+
+static struct mlx4_profile default_profile = {
+	.num_qp		= 1 << 18,
+	.num_srq	= 1 << 16,
+	.rdmarc_per_qp	= 1 << 4,
+	.num_cq		= 1 << 16,
+	.num_mcg	= 1 << 13,
+	.num_mpt	= 1 << 19,
+	.num_mtt	= 1 << 20, /* It is really num mtt segements */
+};
+
+static struct mlx4_profile low_mem_profile = {
+	.num_qp		= 1 << 17,
+	.num_srq	= 1 << 6,
+	.rdmarc_per_qp	= 1 << 4,
+	.num_cq		= 1 << 8,
+	.num_mcg	= 1 << 8,
+	.num_mpt	= 1 << 9,
+	.num_mtt	= 1 << 7,
+};
+
+static int log_num_mac = 7;
+module_param_named(log_num_mac, log_num_mac, int, 0444);
+MODULE_PARM_DESC(log_num_mac, "Log2 max number of MACs per ETH port (1-7)");
+
+static int log_num_vlan;
+module_param_named(log_num_vlan, log_num_vlan, int, 0444);
+MODULE_PARM_DESC(log_num_vlan, "Log2 max number of VLANs per ETH port (0-7)");
+/* Log2 max number of VLANs per ETH port (0-7) */
+#define MLX4_LOG_NUM_VLANS 7
+#define MLX4_MIN_LOG_NUM_VLANS 0
+#define MLX4_MIN_LOG_NUM_MAC 1
+
+static bool use_prio;
+module_param_named(use_prio, use_prio, bool, 0444);
+MODULE_PARM_DESC(use_prio, "Enable steering by VLAN priority on ETH ports (deprecated)");
+
+int log_mtts_per_seg = ilog2(MLX4_MTT_ENTRY_PER_SEG);
+module_param_named(log_mtts_per_seg, log_mtts_per_seg, int, 0444);
+MODULE_PARM_DESC(log_mtts_per_seg, "Log2 number of MTT entries per segment (1-7)");
+
+static int port_type_array[2] = {MLX4_PORT_TYPE_NONE, MLX4_PORT_TYPE_NONE};
+static int arr_argc = 2;
+module_param_array(port_type_array, int, &arr_argc, 0444);
+MODULE_PARM_DESC(port_type_array, "Array of port types: HW_DEFAULT (0) is default "
+				"1 for IB, 2 for Ethernet");
+
+struct mlx4_port_config {
+	struct list_head list;
+	enum mlx4_port_type port_type[MLX4_MAX_PORTS + 1];
+	struct pci_dev *pdev;
+};
+
+static atomic_t pf_loading = ATOMIC_INIT(0);
+
+int mlx4_check_port_params(struct mlx4_dev *dev,
+			   enum mlx4_port_type *port_type)
+{
+	int i;
+
+	for (i = 0; i < dev->caps.num_ports - 1; i++) {
+		if (port_type[i] != port_type[i + 1]) {
+			if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_DPDP)) {
+				mlx4_err(dev, "Only same port types supported on this HCA, aborting\n");
+				return -EINVAL;
+			}
+		}
+	}
+
+	for (i = 0; i < dev->caps.num_ports; i++) {
+		if (!(port_type[i] & dev->caps.supported_type[i+1])) {
+			mlx4_err(dev, "Requested port type for port %d is not supported on this HCA\n",
+				 i + 1);
+			return -EINVAL;
+		}
+	}
+	return 0;
+}
+
+static void mlx4_set_port_mask(struct mlx4_dev *dev)
+{
+	int i;
+
+	for (i = 1; i <= dev->caps.num_ports; ++i)
+		dev->caps.port_mask[i] = dev->caps.port_type[i];
+}
+
+static void mlx4_enable_cqe_eqe_stride(struct mlx4_dev *dev)
+{
+	struct mlx4_caps *dev_cap = &dev->caps;
+
+	/* FW not supporting or cancelled by user */
+	if (!(dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_EQE_STRIDE) ||
+	    !(dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_CQE_STRIDE))
+		return;
+
+	/* Must have 64B CQE_EQE enabled by FW to use bigger stride
+	 * When FW has NCSI it may decide not to report 64B CQE/EQEs
+	 */
+	if (!(dev_cap->flags & MLX4_DEV_CAP_FLAG_64B_EQE) ||
+	    !(dev_cap->flags & MLX4_DEV_CAP_FLAG_64B_CQE)) {
+		dev_cap->flags2 &= ~MLX4_DEV_CAP_FLAG2_CQE_STRIDE;
+		dev_cap->flags2 &= ~MLX4_DEV_CAP_FLAG2_EQE_STRIDE;
+		return;
+	}
+
+	if (cache_line_size() == 128 || cache_line_size() == 256) {
+		mlx4_dbg(dev, "Enabling CQE stride cacheLine supported\n");
+		/* Changing the real data inside CQE size to 32B */
+		dev_cap->flags &= ~MLX4_DEV_CAP_FLAG_64B_CQE;
+		dev_cap->flags &= ~MLX4_DEV_CAP_FLAG_64B_EQE;
+
+		if (mlx4_is_master(dev))
+			dev_cap->function_caps |= MLX4_FUNC_CAP_EQE_CQE_STRIDE;
+	} else {
+		mlx4_dbg(dev, "Disabling CQE stride cacheLine unsupported\n");
+		dev_cap->flags2 &= ~MLX4_DEV_CAP_FLAG2_CQE_STRIDE;
+		dev_cap->flags2 &= ~MLX4_DEV_CAP_FLAG2_EQE_STRIDE;
+	}
+}
+
+static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
+{
+	int err;
+	int i;
+
+	err = mlx4_QUERY_DEV_CAP(dev, dev_cap);
+	if (err) {
+		mlx4_err(dev, "QUERY_DEV_CAP command failed, aborting\n");
+		return err;
+	}
+
+	if (dev_cap->min_page_sz > PAGE_SIZE) {
+		mlx4_err(dev, "HCA minimum page size of %d bigger than kernel PAGE_SIZE of %ld, aborting\n",
+			 dev_cap->min_page_sz, PAGE_SIZE);
+		return -ENODEV;
+	}
+	if (dev_cap->num_ports > MLX4_MAX_PORTS) {
+		mlx4_err(dev, "HCA has %d ports, but we only support %d, aborting\n",
+			 dev_cap->num_ports, MLX4_MAX_PORTS);
+		return -ENODEV;
+	}
+
+	if (dev_cap->uar_size > pci_resource_len(dev->pdev, 2)) {
+		mlx4_err(dev, "HCA reported UAR size of 0x%x bigger than PCI resource 2 size of 0x%llx, aborting\n",
+			 dev_cap->uar_size,
+			 (unsigned long long) pci_resource_len(dev->pdev, 2));
+		return -ENODEV;
+	}
+
+	dev->caps.num_ports	     = dev_cap->num_ports;
+	dev->phys_caps.num_phys_eqs  = MLX4_MAX_EQ_NUM;
+	for (i = 1; i <= dev->caps.num_ports; ++i) {
+		dev->caps.vl_cap[i]	    = dev_cap->max_vl[i];
+		dev->caps.ib_mtu_cap[i]	    = dev_cap->ib_mtu[i];
+		dev->phys_caps.gid_phys_table_len[i]  = dev_cap->max_gids[i];
+		dev->phys_caps.pkey_phys_table_len[i] = dev_cap->max_pkeys[i];
+		/* set gid and pkey table operating lengths by default
+		 * to non-sriov values */
+		dev->caps.gid_table_len[i]  = dev_cap->max_gids[i];
+		dev->caps.pkey_table_len[i] = dev_cap->max_pkeys[i];
+		dev->caps.port_width_cap[i] = dev_cap->max_port_width[i];
+		dev->caps.eth_mtu_cap[i]    = dev_cap->eth_mtu[i];
+		dev->caps.def_mac[i]        = dev_cap->def_mac[i];
+		dev->caps.supported_type[i] = dev_cap->supported_port_types[i];
+		dev->caps.suggested_type[i] = dev_cap->suggested_type[i];
+		dev->caps.default_sense[i] = dev_cap->default_sense[i];
+		dev->caps.trans_type[i]	    = dev_cap->trans_type[i];
+		dev->caps.vendor_oui[i]     = dev_cap->vendor_oui[i];
+		dev->caps.wavelength[i]     = dev_cap->wavelength[i];
+		dev->caps.trans_code[i]     = dev_cap->trans_code[i];
+	}
+
+	dev->caps.uar_page_size	     = PAGE_SIZE;
+	dev->caps.num_uars	     = dev_cap->uar_size / PAGE_SIZE;
+	dev->caps.local_ca_ack_delay = dev_cap->local_ca_ack_delay;
+	dev->caps.bf_reg_size	     = dev_cap->bf_reg_size;
+	dev->caps.bf_regs_per_page   = dev_cap->bf_regs_per_page;
+	dev->caps.max_sq_sg	     = dev_cap->max_sq_sg;
+	dev->caps.max_rq_sg	     = dev_cap->max_rq_sg;
+	dev->caps.max_wqes	     = dev_cap->max_qp_sz;
+	dev->caps.max_qp_init_rdma   = dev_cap->max_requester_per_qp;
+	dev->caps.max_srq_wqes	     = dev_cap->max_srq_sz;
+	dev->caps.max_srq_sge	     = dev_cap->max_rq_sg - 1;
+	dev->caps.reserved_srqs	     = dev_cap->reserved_srqs;
+	dev->caps.max_sq_desc_sz     = dev_cap->max_sq_desc_sz;
+	dev->caps.max_rq_desc_sz     = dev_cap->max_rq_desc_sz;
+	/*
+	 * Subtract 1 from the limit because we need to allocate a
+	 * spare CQE so the HCA HW can tell the difference between an
+	 * empty CQ and a full CQ.
+	 */
+	dev->caps.max_cqes	     = dev_cap->max_cq_sz - 1;
+	dev->caps.reserved_cqs	     = dev_cap->reserved_cqs;
+	dev->caps.reserved_eqs	     = dev_cap->reserved_eqs;
+	dev->caps.reserved_mtts      = dev_cap->reserved_mtts;
+	dev->caps.reserved_mrws	     = dev_cap->reserved_mrws;
+
+	/* The first 128 UARs are used for EQ doorbells */
+	dev->caps.reserved_uars	     = max_t(int, 128, dev_cap->reserved_uars);
+	dev->caps.reserved_pds	     = dev_cap->reserved_pds;
+	dev->caps.reserved_xrcds     = (dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) ?
+					dev_cap->reserved_xrcds : 0;
+	dev->caps.max_xrcds          = (dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) ?
+					dev_cap->max_xrcds : 0;
+	dev->caps.mtt_entry_sz       = dev_cap->mtt_entry_sz;
+
+	dev->caps.max_msg_sz         = dev_cap->max_msg_sz;
+	dev->caps.page_size_cap	     = ~(u32) (dev_cap->min_page_sz - 1);
+	dev->caps.flags		     = dev_cap->flags;
+	dev->caps.flags2	     = dev_cap->flags2;
+	dev->caps.bmme_flags	     = dev_cap->bmme_flags;
+	dev->caps.reserved_lkey	     = dev_cap->reserved_lkey;
+	dev->caps.stat_rate_support  = dev_cap->stat_rate_support;
+	dev->caps.max_gso_sz	     = dev_cap->max_gso_sz;
+	dev->caps.max_rss_tbl_sz     = dev_cap->max_rss_tbl_sz;
+
+	/* Sense port always allowed on supported devices for ConnectX-1 and -2 */
+	if (mlx4_priv(dev)->pci_dev_data & MLX4_PCI_DEV_FORCE_SENSE_PORT)
+		dev->caps.flags |= MLX4_DEV_CAP_FLAG_SENSE_SUPPORT;
+	/* Don't do sense port on multifunction devices (for now at least) */
+	if (mlx4_is_mfunc(dev))
+		dev->caps.flags &= ~MLX4_DEV_CAP_FLAG_SENSE_SUPPORT;
+
+	if (mlx4_low_memory_profile()) {
+		dev->caps.log_num_macs  = MLX4_MIN_LOG_NUM_MAC;
+		dev->caps.log_num_vlans = MLX4_MIN_LOG_NUM_VLANS;
+	} else {
+		dev->caps.log_num_macs  = log_num_mac;
+		dev->caps.log_num_vlans = MLX4_LOG_NUM_VLANS;
+	}
+
+	for (i = 1; i <= dev->caps.num_ports; ++i) {
+		dev->caps.port_type[i] = MLX4_PORT_TYPE_NONE;
+		if (dev->caps.supported_type[i]) {
+			/* if only ETH is supported - assign ETH */
+			if (dev->caps.supported_type[i] == MLX4_PORT_TYPE_ETH)
+				dev->caps.port_type[i] = MLX4_PORT_TYPE_ETH;
+			/* if only IB is supported, assign IB */
+			else if (dev->caps.supported_type[i] ==
+				 MLX4_PORT_TYPE_IB)
+				dev->caps.port_type[i] = MLX4_PORT_TYPE_IB;
+			else {
+				/* if IB and ETH are supported, we set the port
+				 * type according to user selection of port type;
+				 * if user selected none, take the FW hint */
+				if (port_type_array[i - 1] == MLX4_PORT_TYPE_NONE)
+					dev->caps.port_type[i] = dev->caps.suggested_type[i] ?
+						MLX4_PORT_TYPE_ETH : MLX4_PORT_TYPE_IB;
+				else
+					dev->caps.port_type[i] = port_type_array[i - 1];
+			}
+		}
+		/*
+		 * Link sensing is allowed on the port if 3 conditions are true:
+		 * 1. Both protocols are supported on the port.
+		 * 2. Different types are supported on the port
+		 * 3. FW declared that it supports link sensing
+		 */
+		mlx4_priv(dev)->sense.sense_allowed[i] =
+			((dev->caps.supported_type[i] == MLX4_PORT_TYPE_AUTO) &&
+			 (dev->caps.flags & MLX4_DEV_CAP_FLAG_DPDP) &&
+			 (dev->caps.flags & MLX4_DEV_CAP_FLAG_SENSE_SUPPORT));
+
+		/*
+		 * If "default_sense" bit is set, we move the port to "AUTO" mode
+		 * and perform sense_port FW command to try and set the correct
+		 * port type from beginning
+		 */
+		if (mlx4_priv(dev)->sense.sense_allowed[i] && dev->caps.default_sense[i]) {
+			enum mlx4_port_type sensed_port = MLX4_PORT_TYPE_NONE;
+			dev->caps.possible_type[i] = MLX4_PORT_TYPE_AUTO;
+			mlx4_SENSE_PORT(dev, i, &sensed_port);
+			if (sensed_port != MLX4_PORT_TYPE_NONE)
+				dev->caps.port_type[i] = sensed_port;
+		} else {
+			dev->caps.possible_type[i] = dev->caps.port_type[i];
+		}
+
+		if (dev->caps.log_num_macs > dev_cap->log_max_macs[i]) {
+			dev->caps.log_num_macs = dev_cap->log_max_macs[i];
+			mlx4_warn(dev, "Requested number of MACs is too much for port %d, reducing to %d\n",
+				  i, 1 << dev->caps.log_num_macs);
+		}
+		if (dev->caps.log_num_vlans > dev_cap->log_max_vlans[i]) {
+			dev->caps.log_num_vlans = dev_cap->log_max_vlans[i];
+			mlx4_warn(dev, "Requested number of VLANs is too much for port %d, reducing to %d\n",
+				  i, 1 << dev->caps.log_num_vlans);
+		}
+	}
+
+	dev->caps.max_counters = 1 << ilog2(dev_cap->max_counters);
+
+	dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW] = dev_cap->reserved_qps;
+	dev->caps.reserved_qps_cnt[MLX4_QP_REGION_ETH_ADDR] =
+		dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FC_ADDR] =
+		(1 << dev->caps.log_num_macs) *
+		(1 << dev->caps.log_num_vlans) *
+		dev->caps.num_ports;
+	dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FC_EXCH] = MLX4_NUM_FEXCH;
+
+	dev->caps.reserved_qps = dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW] +
+		dev->caps.reserved_qps_cnt[MLX4_QP_REGION_ETH_ADDR] +
+		dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FC_ADDR] +
+		dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FC_EXCH];
+
+	dev->caps.sqp_demux = (mlx4_is_master(dev)) ? MLX4_MAX_NUM_SLAVES : 0;
+
+	if (!enable_64b_cqe_eqe && !mlx4_is_slave(dev)) {
+		if (dev_cap->flags &
+		    (MLX4_DEV_CAP_FLAG_64B_CQE | MLX4_DEV_CAP_FLAG_64B_EQE)) {
+			mlx4_warn(dev, "64B EQEs/CQEs supported by the device but not enabled\n");
+			dev->caps.flags &= ~MLX4_DEV_CAP_FLAG_64B_CQE;
+			dev->caps.flags &= ~MLX4_DEV_CAP_FLAG_64B_EQE;
+		}
+
+		if (dev_cap->flags2 &
+		    (MLX4_DEV_CAP_FLAG2_CQE_STRIDE |
+		     MLX4_DEV_CAP_FLAG2_EQE_STRIDE)) {
+			mlx4_warn(dev, "Disabling EQE/CQE stride per user request\n");
+			dev_cap->flags2 &= ~MLX4_DEV_CAP_FLAG2_CQE_STRIDE;
+			dev_cap->flags2 &= ~MLX4_DEV_CAP_FLAG2_EQE_STRIDE;
+		}
+	}
+
+	if ((dev->caps.flags &
+	    (MLX4_DEV_CAP_FLAG_64B_CQE | MLX4_DEV_CAP_FLAG_64B_EQE)) &&
+	    mlx4_is_master(dev))
+		dev->caps.function_caps |= MLX4_FUNC_CAP_64B_EQE_CQE;
+
+	if (!mlx4_is_slave(dev))
+		mlx4_enable_cqe_eqe_stride(dev);
+
+	return 0;
+}
+
+static int mlx4_get_pcie_dev_link_caps(struct mlx4_dev *dev,
+				       enum pci_bus_speed *speed,
+				       enum pcie_link_width *width)
+{
+	u32 lnkcap1, lnkcap2;
+	int err1, err2;
+
+#define  PCIE_MLW_CAP_SHIFT 4	/* start of MLW mask in link capabilities */
+
+	*speed = PCI_SPEED_UNKNOWN;
+	*width = PCIE_LNK_WIDTH_UNKNOWN;
+
+	err1 = pcie_capability_read_dword(dev->pdev, PCI_EXP_LNKCAP, &lnkcap1);
+	err2 = pcie_capability_read_dword(dev->pdev, PCI_EXP_LNKCAP2, &lnkcap2);
+	if (!err2 && lnkcap2) { /* PCIe r3.0-compliant */
+		if (lnkcap2 & PCI_EXP_LNKCAP2_SLS_8_0GB)
+			*speed = PCIE_SPEED_8_0GT;
+		else if (lnkcap2 & PCI_EXP_LNKCAP2_SLS_5_0GB)
+			*speed = PCIE_SPEED_5_0GT;
+		else if (lnkcap2 & PCI_EXP_LNKCAP2_SLS_2_5GB)
+			*speed = PCIE_SPEED_2_5GT;
+	}
+	if (!err1) {
+		*width = (lnkcap1 & PCI_EXP_LNKCAP_MLW) >> PCIE_MLW_CAP_SHIFT;
+		if (!lnkcap2) { /* pre-r3.0 */
+			if (lnkcap1 & PCI_EXP_LNKCAP_SLS_5_0GB)
+				*speed = PCIE_SPEED_5_0GT;
+			else if (lnkcap1 & PCI_EXP_LNKCAP_SLS_2_5GB)
+				*speed = PCIE_SPEED_2_5GT;
+		}
+	}
+
+	if (*speed == PCI_SPEED_UNKNOWN || *width == PCIE_LNK_WIDTH_UNKNOWN) {
+		return err1 ? err1 :
+			err2 ? err2 : -EINVAL;
+	}
+	return 0;
+}
+
+static void mlx4_check_pcie_caps(struct mlx4_dev *dev)
+{
+	enum pcie_link_width width, width_cap;
+	enum pci_bus_speed speed, speed_cap;
+	int err;
+
+#define PCIE_SPEED_STR(speed) \
+	(speed == PCIE_SPEED_8_0GT ? "8.0GT/s" : \
+	 speed == PCIE_SPEED_5_0GT ? "5.0GT/s" : \
+	 speed == PCIE_SPEED_2_5GT ? "2.5GT/s" : \
+	 "Unknown")
+
+	err = mlx4_get_pcie_dev_link_caps(dev, &speed_cap, &width_cap);
+	if (err) {
+		mlx4_warn(dev,
+			  "Unable to determine PCIe device BW capabilities\n");
+		return;
+	}
+
+	err = pcie_get_minimum_link(dev->pdev, &speed, &width);
+	if (err || speed == PCI_SPEED_UNKNOWN ||
+	    width == PCIE_LNK_WIDTH_UNKNOWN) {
+		mlx4_warn(dev,
+			  "Unable to determine PCI device chain minimum BW\n");
+		return;
+	}
+
+	if (width != width_cap || speed != speed_cap)
+		mlx4_warn(dev,
+			  "PCIe BW is different than device's capability\n");
+
+	mlx4_info(dev, "PCIe link speed is %s, device supports %s\n",
+		  PCIE_SPEED_STR(speed), PCIE_SPEED_STR(speed_cap));
+	mlx4_info(dev, "PCIe link width is x%d, device supports x%d\n",
+		  width, width_cap);
+	return;
+}
+
+/*The function checks if there are live vf, return the num of them*/
+static int mlx4_how_many_lives_vf(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_slave_state *s_state;
+	int i;
+	int ret = 0;
+
+	for (i = 1/*the ppf is 0*/; i < dev->num_slaves; ++i) {
+		s_state = &priv->mfunc.master.slave_state[i];
+		if (s_state->active && s_state->last_cmd !=
+		    MLX4_COMM_CMD_RESET) {
+			mlx4_warn(dev, "%s: slave: %d is still active\n",
+				  __func__, i);
+			ret++;
+		}
+	}
+	return ret;
+}
+
+int mlx4_get_parav_qkey(struct mlx4_dev *dev, u32 qpn, u32 *qkey)
+{
+	u32 qk = MLX4_RESERVED_QKEY_BASE;
+
+	if (qpn >= dev->phys_caps.base_tunnel_sqpn + 8 * MLX4_MFUNC_MAX ||
+	    qpn < dev->phys_caps.base_proxy_sqpn)
+		return -EINVAL;
+
+	if (qpn >= dev->phys_caps.base_tunnel_sqpn)
+		/* tunnel qp */
+		qk += qpn - dev->phys_caps.base_tunnel_sqpn;
+	else
+		qk += qpn - dev->phys_caps.base_proxy_sqpn;
+	*qkey = qk;
+	return 0;
+}
+EXPORT_SYMBOL(mlx4_get_parav_qkey);
+
+void mlx4_sync_pkey_table(struct mlx4_dev *dev, int slave, int port, int i, int val)
+{
+	struct mlx4_priv *priv = container_of(dev, struct mlx4_priv, dev);
+
+	if (!mlx4_is_master(dev))
+		return;
+
+	priv->virt2phys_pkey[slave][port - 1][i] = val;
+}
+EXPORT_SYMBOL(mlx4_sync_pkey_table);
+
+void mlx4_put_slave_node_guid(struct mlx4_dev *dev, int slave, __be64 guid)
+{
+	struct mlx4_priv *priv = container_of(dev, struct mlx4_priv, dev);
+
+	if (!mlx4_is_master(dev))
+		return;
+
+	priv->slave_node_guids[slave] = guid;
+}
+EXPORT_SYMBOL(mlx4_put_slave_node_guid);
+
+__be64 mlx4_get_slave_node_guid(struct mlx4_dev *dev, int slave)
+{
+	struct mlx4_priv *priv = container_of(dev, struct mlx4_priv, dev);
+
+	if (!mlx4_is_master(dev))
+		return 0;
+
+	return priv->slave_node_guids[slave];
+}
+EXPORT_SYMBOL(mlx4_get_slave_node_guid);
+
+int mlx4_is_slave_active(struct mlx4_dev *dev, int slave)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_slave_state *s_slave;
+
+	if (!mlx4_is_master(dev))
+		return 0;
+
+	s_slave = &priv->mfunc.master.slave_state[slave];
+	return !!s_slave->active;
+}
+EXPORT_SYMBOL(mlx4_is_slave_active);
+
+static void slave_adjust_steering_mode(struct mlx4_dev *dev,
+				       struct mlx4_dev_cap *dev_cap,
+				       struct mlx4_init_hca_param *hca_param)
+{
+	dev->caps.steering_mode = hca_param->steering_mode;
+	if (dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) {
+		dev->caps.num_qp_per_mgm = dev_cap->fs_max_num_qp_per_entry;
+		dev->caps.fs_log_max_ucast_qp_range_size =
+			dev_cap->fs_log_max_ucast_qp_range_size;
+	} else
+		dev->caps.num_qp_per_mgm =
+			4 * ((1 << hca_param->log_mc_entry_sz)/16 - 2);
+
+	mlx4_dbg(dev, "Steering mode is: %s\n",
+		 mlx4_steering_mode_str(dev->caps.steering_mode));
+}
+
+static int mlx4_slave_cap(struct mlx4_dev *dev)
+{
+	int			   err;
+	u32			   page_size;
+	struct mlx4_dev_cap	   dev_cap;
+	struct mlx4_func_cap	   func_cap;
+	struct mlx4_init_hca_param hca_param;
+	int			   i;
+
+	memset(&hca_param, 0, sizeof(hca_param));
+	err = mlx4_QUERY_HCA(dev, &hca_param);
+	if (err) {
+		mlx4_err(dev, "QUERY_HCA command failed, aborting\n");
+		return err;
+	}
+
+	/* fail if the hca has an unknown global capability
+	 * at this time global_caps should be always zeroed
+	 */
+	if (hca_param.global_caps) {
+		mlx4_err(dev, "Unknown hca global capabilities\n");
+		return -ENOSYS;
+	}
+
+	mlx4_log_num_mgm_entry_size = hca_param.log_mc_entry_sz;
+
+	dev->caps.hca_core_clock = hca_param.hca_core_clock;
+
+	memset(&dev_cap, 0, sizeof(dev_cap));
+	dev->caps.max_qp_dest_rdma = 1 << hca_param.log_rd_per_qp;
+	err = mlx4_dev_cap(dev, &dev_cap);
+	if (err) {
+		mlx4_err(dev, "QUERY_DEV_CAP command failed, aborting\n");
+		return err;
+	}
+
+	err = mlx4_QUERY_FW(dev);
+	if (err)
+		mlx4_err(dev, "QUERY_FW command failed: could not get FW version\n");
+
+	page_size = ~dev->caps.page_size_cap + 1;
+	mlx4_warn(dev, "HCA minimum page size:%d\n", page_size);
+	if (page_size > PAGE_SIZE) {
+		mlx4_err(dev, "HCA minimum page size of %d bigger than kernel PAGE_SIZE of %ld, aborting\n",
+			 page_size, PAGE_SIZE);
+		return -ENODEV;
+	}
+
+	/* slave gets uar page size from QUERY_HCA fw command */
+	dev->caps.uar_page_size = 1 << (hca_param.uar_page_sz + 12);
+
+	/* TODO: relax this assumption */
+	if (dev->caps.uar_page_size != PAGE_SIZE) {
+		mlx4_err(dev, "UAR size:%d != kernel PAGE_SIZE of %ld\n",
+			 dev->caps.uar_page_size, PAGE_SIZE);
+		return -ENODEV;
+	}
+
+	memset(&func_cap, 0, sizeof(func_cap));
+	err = mlx4_QUERY_FUNC_CAP(dev, 0, &func_cap);
+	if (err) {
+		mlx4_err(dev, "QUERY_FUNC_CAP general command failed, aborting (%d)\n",
+			 err);
+		return err;
+	}
+
+	if ((func_cap.pf_context_behaviour | PF_CONTEXT_BEHAVIOUR_MASK) !=
+	    PF_CONTEXT_BEHAVIOUR_MASK) {
+		mlx4_err(dev, "Unknown pf context behaviour\n");
+		return -ENOSYS;
+	}
+
+	dev->caps.num_ports		= func_cap.num_ports;
+	dev->quotas.qp			= func_cap.qp_quota;
+	dev->quotas.srq			= func_cap.srq_quota;
+	dev->quotas.cq			= func_cap.cq_quota;
+	dev->quotas.mpt			= func_cap.mpt_quota;
+	dev->quotas.mtt			= func_cap.mtt_quota;
+	dev->caps.num_qps		= 1 << hca_param.log_num_qps;
+	dev->caps.num_srqs		= 1 << hca_param.log_num_srqs;
+	dev->caps.num_cqs		= 1 << hca_param.log_num_cqs;
+	dev->caps.num_mpts		= 1 << hca_param.log_mpt_sz;
+	dev->caps.num_eqs		= func_cap.max_eq;
+	dev->caps.reserved_eqs		= func_cap.reserved_eq;
+	dev->caps.num_pds               = MLX4_NUM_PDS;
+	dev->caps.num_mgms              = 0;
+	dev->caps.num_amgms             = 0;
+
+	if (dev->caps.num_ports > MLX4_MAX_PORTS) {
+		mlx4_err(dev, "HCA has %d ports, but we only support %d, aborting\n",
+			 dev->caps.num_ports, MLX4_MAX_PORTS);
+		return -ENODEV;
+	}
+
+	dev->caps.qp0_qkey = kcalloc(dev->caps.num_ports, sizeof(u32), GFP_KERNEL);
+	dev->caps.qp0_tunnel = kcalloc(dev->caps.num_ports, sizeof (u32), GFP_KERNEL);
+	dev->caps.qp0_proxy = kcalloc(dev->caps.num_ports, sizeof (u32), GFP_KERNEL);
+	dev->caps.qp1_tunnel = kcalloc(dev->caps.num_ports, sizeof (u32), GFP_KERNEL);
+	dev->caps.qp1_proxy = kcalloc(dev->caps.num_ports, sizeof (u32), GFP_KERNEL);
+
+	if (!dev->caps.qp0_tunnel || !dev->caps.qp0_proxy ||
+	    !dev->caps.qp1_tunnel || !dev->caps.qp1_proxy ||
+	    !dev->caps.qp0_qkey) {
+		err = -ENOMEM;
+		goto err_mem;
+	}
+
+	for (i = 1; i <= dev->caps.num_ports; ++i) {
+		err = mlx4_QUERY_FUNC_CAP(dev, (u32) i, &func_cap);
+		if (err) {
+			mlx4_err(dev, "QUERY_FUNC_CAP port command failed for port %d, aborting (%d)\n",
+				 i, err);
+			goto err_mem;
+		}
+		dev->caps.qp0_qkey[i - 1] = func_cap.qp0_qkey;
+		dev->caps.qp0_tunnel[i - 1] = func_cap.qp0_tunnel_qpn;
+		dev->caps.qp0_proxy[i - 1] = func_cap.qp0_proxy_qpn;
+		dev->caps.qp1_tunnel[i - 1] = func_cap.qp1_tunnel_qpn;
+		dev->caps.qp1_proxy[i - 1] = func_cap.qp1_proxy_qpn;
+		dev->caps.port_mask[i] = dev->caps.port_type[i];
+		dev->caps.phys_port_id[i] = func_cap.phys_port_id;
+		if (mlx4_get_slave_pkey_gid_tbl_len(dev, i,
+						    &dev->caps.gid_table_len[i],
+						    &dev->caps.pkey_table_len[i]))
+			goto err_mem;
+	}
+
+	if (dev->caps.uar_page_size * (dev->caps.num_uars -
+				       dev->caps.reserved_uars) >
+				       pci_resource_len(dev->pdev, 2)) {
+		mlx4_err(dev, "HCA reported UAR region size of 0x%x bigger than PCI resource 2 size of 0x%llx, aborting\n",
+			 dev->caps.uar_page_size * dev->caps.num_uars,
+			 (unsigned long long) pci_resource_len(dev->pdev, 2));
+		goto err_mem;
+	}
+
+	if (hca_param.dev_cap_enabled & MLX4_DEV_CAP_64B_EQE_ENABLED) {
+		dev->caps.eqe_size   = 64;
+		dev->caps.eqe_factor = 1;
+	} else {
+		dev->caps.eqe_size   = 32;
+		dev->caps.eqe_factor = 0;
+	}
+
+	if (hca_param.dev_cap_enabled & MLX4_DEV_CAP_64B_CQE_ENABLED) {
+		dev->caps.cqe_size   = 64;
+		dev->caps.userspace_caps |= MLX4_USER_DEV_CAP_LARGE_CQE;
+	} else {
+		dev->caps.cqe_size   = 32;
+	}
+
+	if (hca_param.dev_cap_enabled & MLX4_DEV_CAP_EQE_STRIDE_ENABLED) {
+		dev->caps.eqe_size = hca_param.eqe_size;
+		dev->caps.eqe_factor = 0;
+	}
+
+	if (hca_param.dev_cap_enabled & MLX4_DEV_CAP_CQE_STRIDE_ENABLED) {
+		dev->caps.cqe_size = hca_param.cqe_size;
+		/* User still need to know when CQE > 32B */
+		dev->caps.userspace_caps |= MLX4_USER_DEV_CAP_LARGE_CQE;
+	}
+
+	dev->caps.flags2 &= ~MLX4_DEV_CAP_FLAG2_TS;
+	mlx4_warn(dev, "Timestamping is not supported in slave mode\n");
+
+	slave_adjust_steering_mode(dev, &dev_cap, &hca_param);
+
+	return 0;
+
+err_mem:
+	kfree(dev->caps.qp0_qkey);
+	kfree(dev->caps.qp0_tunnel);
+	kfree(dev->caps.qp0_proxy);
+	kfree(dev->caps.qp1_tunnel);
+	kfree(dev->caps.qp1_proxy);
+	dev->caps.qp0_qkey = NULL;
+	dev->caps.qp0_tunnel = NULL;
+	dev->caps.qp0_proxy = NULL;
+	dev->caps.qp1_tunnel = NULL;
+	dev->caps.qp1_proxy = NULL;
+
+	return err;
+}
+
+static void mlx4_request_modules(struct mlx4_dev *dev)
+{
+	int port;
+	int has_ib_port = false;
+	int has_eth_port = false;
+#define EN_DRV_NAME	"mlx4_en"
+#define IB_DRV_NAME	"mlx4_ib"
+
+	for (port = 1; port <= dev->caps.num_ports; port++) {
+		if (dev->caps.port_type[port] == MLX4_PORT_TYPE_IB)
+			has_ib_port = true;
+		else if (dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH)
+			has_eth_port = true;
+	}
+
+	if (has_eth_port)
+		request_module_nowait(EN_DRV_NAME);
+	if (has_ib_port || (dev->caps.flags & MLX4_DEV_CAP_FLAG_IBOE))
+		request_module_nowait(IB_DRV_NAME);
+}
+
+/*
+ * Change the port configuration of the device.
+ * Every user of this function must hold the port mutex.
+ */
+int mlx4_change_port_types(struct mlx4_dev *dev,
+			   enum mlx4_port_type *port_types)
+{
+	int err = 0;
+	int change = 0;
+	int port;
+
+	for (port = 0; port <  dev->caps.num_ports; port++) {
+		/* Change the port type only if the new type is different
+		 * from the current, and not set to Auto */
+		if (port_types[port] != dev->caps.port_type[port + 1])
+			change = 1;
+	}
+	if (change) {
+		mlx4_unregister_device(dev);
+		for (port = 1; port <= dev->caps.num_ports; port++) {
+			mlx4_CLOSE_PORT(dev, port);
+			dev->caps.port_type[port] = port_types[port - 1];
+			err = mlx4_SET_PORT(dev, port, -1);
+			if (err) {
+				mlx4_err(dev, "Failed to set port %d, aborting\n",
+					 port);
+				goto out;
+			}
+		}
+		mlx4_set_port_mask(dev);
+		err = mlx4_register_device(dev);
+		if (err) {
+			mlx4_err(dev, "Failed to register device\n");
+			goto out;
+		}
+		mlx4_request_modules(dev);
+	}
+
+out:
+	return err;
+}
+
+static ssize_t show_port_type(struct device *dev,
+			      struct device_attribute *attr,
+			      char *buf)
+{
+	struct mlx4_port_info *info = container_of(attr, struct mlx4_port_info,
+						   port_attr);
+	struct mlx4_dev *mdev = info->dev;
+	char type[8];
+
+	sprintf(type, "%s",
+		(mdev->caps.port_type[info->port] == MLX4_PORT_TYPE_IB) ?
+		"ib" : "eth");
+	if (mdev->caps.possible_type[info->port] == MLX4_PORT_TYPE_AUTO)
+		sprintf(buf, "auto (%s)\n", type);
+	else
+		sprintf(buf, "%s\n", type);
+
+	return strlen(buf);
+}
+
+static ssize_t set_port_type(struct device *dev,
+			     struct device_attribute *attr,
+			     const char *buf, size_t count)
+{
+	struct mlx4_port_info *info = container_of(attr, struct mlx4_port_info,
+						   port_attr);
+	struct mlx4_dev *mdev = info->dev;
+	struct mlx4_priv *priv = mlx4_priv(mdev);
+	enum mlx4_port_type types[MLX4_MAX_PORTS];
+	enum mlx4_port_type new_types[MLX4_MAX_PORTS];
+	int i;
+	int err = 0;
+
+	if (!strcmp(buf, "ib\n"))
+		info->tmp_type = MLX4_PORT_TYPE_IB;
+	else if (!strcmp(buf, "eth\n"))
+		info->tmp_type = MLX4_PORT_TYPE_ETH;
+	else if (!strcmp(buf, "auto\n"))
+		info->tmp_type = MLX4_PORT_TYPE_AUTO;
+	else {
+		mlx4_err(mdev, "%s is not supported port type\n", buf);
+		return -EINVAL;
+	}
+
+	mlx4_stop_sense(mdev);
+	mutex_lock(&priv->port_mutex);
+	/* Possible type is always the one that was delivered */
+	mdev->caps.possible_type[info->port] = info->tmp_type;
+
+	for (i = 0; i < mdev->caps.num_ports; i++) {
+		types[i] = priv->port[i+1].tmp_type ? priv->port[i+1].tmp_type :
+					mdev->caps.possible_type[i+1];
+		if (types[i] == MLX4_PORT_TYPE_AUTO)
+			types[i] = mdev->caps.port_type[i+1];
+	}
+
+	if (!(mdev->caps.flags & MLX4_DEV_CAP_FLAG_DPDP) &&
+	    !(mdev->caps.flags & MLX4_DEV_CAP_FLAG_SENSE_SUPPORT)) {
+		for (i = 1; i <= mdev->caps.num_ports; i++) {
+			if (mdev->caps.possible_type[i] == MLX4_PORT_TYPE_AUTO) {
+				mdev->caps.possible_type[i] = mdev->caps.port_type[i];
+				err = -EINVAL;
+			}
+		}
+	}
+	if (err) {
+		mlx4_err(mdev, "Auto sensing is not supported on this HCA. Set only 'eth' or 'ib' for both ports (should be the same)\n");
+		goto out;
+	}
+
+	mlx4_do_sense_ports(mdev, new_types, types);
+
+	err = mlx4_check_port_params(mdev, new_types);
+	if (err)
+		goto out;
+
+	/* We are about to apply the changes after the configuration
+	 * was verified, no need to remember the temporary types
+	 * any more */
+	for (i = 0; i < mdev->caps.num_ports; i++)
+		priv->port[i + 1].tmp_type = 0;
+
+	err = mlx4_change_port_types(mdev, new_types);
+
+out:
+	mlx4_start_sense(mdev);
+	mutex_unlock(&priv->port_mutex);
+	return err ? err : count;
+}
+
+enum ibta_mtu {
+	IB_MTU_256  = 1,
+	IB_MTU_512  = 2,
+	IB_MTU_1024 = 3,
+	IB_MTU_2048 = 4,
+	IB_MTU_4096 = 5
+};
+
+static inline int int_to_ibta_mtu(int mtu)
+{
+	switch (mtu) {
+	case 256:  return IB_MTU_256;
+	case 512:  return IB_MTU_512;
+	case 1024: return IB_MTU_1024;
+	case 2048: return IB_MTU_2048;
+	case 4096: return IB_MTU_4096;
+	default: return -1;
+	}
+}
+
+static inline int ibta_mtu_to_int(enum ibta_mtu mtu)
+{
+	switch (mtu) {
+	case IB_MTU_256:  return  256;
+	case IB_MTU_512:  return  512;
+	case IB_MTU_1024: return 1024;
+	case IB_MTU_2048: return 2048;
+	case IB_MTU_4096: return 4096;
+	default: return -1;
+	}
+}
+
+static ssize_t show_port_ib_mtu(struct device *dev,
+			     struct device_attribute *attr,
+			     char *buf)
+{
+	struct mlx4_port_info *info = container_of(attr, struct mlx4_port_info,
+						   port_mtu_attr);
+	struct mlx4_dev *mdev = info->dev;
+
+	if (mdev->caps.port_type[info->port] == MLX4_PORT_TYPE_ETH)
+		mlx4_warn(mdev, "port level mtu is only used for IB ports\n");
+
+	sprintf(buf, "%d\n",
+			ibta_mtu_to_int(mdev->caps.port_ib_mtu[info->port]));
+	return strlen(buf);
+}
+
+static ssize_t set_port_ib_mtu(struct device *dev,
+			     struct device_attribute *attr,
+			     const char *buf, size_t count)
+{
+	struct mlx4_port_info *info = container_of(attr, struct mlx4_port_info,
+						   port_mtu_attr);
+	struct mlx4_dev *mdev = info->dev;
+	struct mlx4_priv *priv = mlx4_priv(mdev);
+	int err, port, mtu, ibta_mtu = -1;
+
+	if (mdev->caps.port_type[info->port] == MLX4_PORT_TYPE_ETH) {
+		mlx4_warn(mdev, "port level mtu is only used for IB ports\n");
+		return -EINVAL;
+	}
+
+	err = kstrtoint(buf, 0, &mtu);
+	if (!err)
+		ibta_mtu = int_to_ibta_mtu(mtu);
+
+	if (err || ibta_mtu < 0) {
+		mlx4_err(mdev, "%s is invalid IBTA mtu\n", buf);
+		return -EINVAL;
+	}
+
+	mdev->caps.port_ib_mtu[info->port] = ibta_mtu;
+
+	mlx4_stop_sense(mdev);
+	mutex_lock(&priv->port_mutex);
+	mlx4_unregister_device(mdev);
+	for (port = 1; port <= mdev->caps.num_ports; port++) {
+		mlx4_CLOSE_PORT(mdev, port);
+		err = mlx4_SET_PORT(mdev, port, -1);
+		if (err) {
+			mlx4_err(mdev, "Failed to set port %d, aborting\n",
+				 port);
+			goto err_set_port;
+		}
+	}
+	err = mlx4_register_device(mdev);
+err_set_port:
+	mutex_unlock(&priv->port_mutex);
+	mlx4_start_sense(mdev);
+	return err ? err : count;
+}
+
+static int mlx4_load_fw(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int err;
+
+	priv->fw.fw_icm = mlx4_alloc_icm(dev, priv->fw.fw_pages,
+					 GFP_HIGHUSER | __GFP_NOWARN, 0);
+	if (!priv->fw.fw_icm) {
+		mlx4_err(dev, "Couldn't allocate FW area, aborting\n");
+		return -ENOMEM;
+	}
+
+	err = mlx4_MAP_FA(dev, priv->fw.fw_icm);
+	if (err) {
+		mlx4_err(dev, "MAP_FA command failed, aborting\n");
+		goto err_free;
+	}
+
+	err = mlx4_RUN_FW(dev);
+	if (err) {
+		mlx4_err(dev, "RUN_FW command failed, aborting\n");
+		goto err_unmap_fa;
+	}
+
+	return 0;
+
+err_unmap_fa:
+	mlx4_UNMAP_FA(dev);
+
+err_free:
+	mlx4_free_icm(dev, priv->fw.fw_icm, 0);
+	return err;
+}
+
+static int mlx4_init_cmpt_table(struct mlx4_dev *dev, u64 cmpt_base,
+				int cmpt_entry_sz)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int err;
+	int num_eqs;
+
+	err = mlx4_init_icm_table(dev, &priv->qp_table.cmpt_table,
+				  cmpt_base +
+				  ((u64) (MLX4_CMPT_TYPE_QP *
+					  cmpt_entry_sz) << MLX4_CMPT_SHIFT),
+				  cmpt_entry_sz, dev->caps.num_qps,
+				  dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW],
+				  0, 0);
+	if (err)
+		goto err;
+
+	err = mlx4_init_icm_table(dev, &priv->srq_table.cmpt_table,
+				  cmpt_base +
+				  ((u64) (MLX4_CMPT_TYPE_SRQ *
+					  cmpt_entry_sz) << MLX4_CMPT_SHIFT),
+				  cmpt_entry_sz, dev->caps.num_srqs,
+				  dev->caps.reserved_srqs, 0, 0);
+	if (err)
+		goto err_qp;
+
+	err = mlx4_init_icm_table(dev, &priv->cq_table.cmpt_table,
+				  cmpt_base +
+				  ((u64) (MLX4_CMPT_TYPE_CQ *
+					  cmpt_entry_sz) << MLX4_CMPT_SHIFT),
+				  cmpt_entry_sz, dev->caps.num_cqs,
+				  dev->caps.reserved_cqs, 0, 0);
+	if (err)
+		goto err_srq;
+
+	num_eqs = (mlx4_is_master(dev)) ? dev->phys_caps.num_phys_eqs :
+		  dev->caps.num_eqs;
+	err = mlx4_init_icm_table(dev, &priv->eq_table.cmpt_table,
+				  cmpt_base +
+				  ((u64) (MLX4_CMPT_TYPE_EQ *
+					  cmpt_entry_sz) << MLX4_CMPT_SHIFT),
+				  cmpt_entry_sz, num_eqs, num_eqs, 0, 0);
+	if (err)
+		goto err_cq;
+
+	return 0;
+
+err_cq:
+	mlx4_cleanup_icm_table(dev, &priv->cq_table.cmpt_table);
+
+err_srq:
+	mlx4_cleanup_icm_table(dev, &priv->srq_table.cmpt_table);
+
+err_qp:
+	mlx4_cleanup_icm_table(dev, &priv->qp_table.cmpt_table);
+
+err:
+	return err;
+}
+
+static int mlx4_init_icm(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap,
+			 struct mlx4_init_hca_param *init_hca, u64 icm_size)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	u64 aux_pages;
+	int num_eqs;
+	int err;
+
+	err = mlx4_SET_ICM_SIZE(dev, icm_size, &aux_pages);
+	if (err) {
+		mlx4_err(dev, "SET_ICM_SIZE command failed, aborting\n");
+		return err;
+	}
+
+	mlx4_dbg(dev, "%lld KB of HCA context requires %lld KB aux memory\n",
+		 (unsigned long long) icm_size >> 10,
+		 (unsigned long long) aux_pages << 2);
+
+	priv->fw.aux_icm = mlx4_alloc_icm(dev, aux_pages,
+					  GFP_HIGHUSER | __GFP_NOWARN, 0);
+	if (!priv->fw.aux_icm) {
+		mlx4_err(dev, "Couldn't allocate aux memory, aborting\n");
+		return -ENOMEM;
+	}
+
+	err = mlx4_MAP_ICM_AUX(dev, priv->fw.aux_icm);
+	if (err) {
+		mlx4_err(dev, "MAP_ICM_AUX command failed, aborting\n");
+		goto err_free_aux;
+	}
+
+	err = mlx4_init_cmpt_table(dev, init_hca->cmpt_base, dev_cap->cmpt_entry_sz);
+	if (err) {
+		mlx4_err(dev, "Failed to map cMPT context memory, aborting\n");
+		goto err_unmap_aux;
+	}
+
+
+	num_eqs = (mlx4_is_master(dev)) ? dev->phys_caps.num_phys_eqs :
+		   dev->caps.num_eqs;
+	err = mlx4_init_icm_table(dev, &priv->eq_table.table,
+				  init_hca->eqc_base, dev_cap->eqc_entry_sz,
+				  num_eqs, num_eqs, 0, 0);
+	if (err) {
+		mlx4_err(dev, "Failed to map EQ context memory, aborting\n");
+		goto err_unmap_cmpt;
+	}
+
+	/*
+	 * Reserved MTT entries must be aligned up to a cacheline
+	 * boundary, since the FW will write to them, while the driver
+	 * writes to all other MTT entries. (The variable
+	 * dev->caps.mtt_entry_sz below is really the MTT segment
+	 * size, not the raw entry size)
+	 */
+	dev->caps.reserved_mtts =
+		ALIGN(dev->caps.reserved_mtts * dev->caps.mtt_entry_sz,
+		      dma_get_cache_alignment()) / dev->caps.mtt_entry_sz;
+
+	err = mlx4_init_icm_table(dev, &priv->mr_table.mtt_table,
+				  init_hca->mtt_base,
+				  dev->caps.mtt_entry_sz,
+				  dev->caps.num_mtts,
+				  dev->caps.reserved_mtts, 1, 0);
+	if (err) {
+		mlx4_err(dev, "Failed to map MTT context memory, aborting\n");
+		goto err_unmap_eq;
+	}
+
+	err = mlx4_init_icm_table(dev, &priv->mr_table.dmpt_table,
+				  init_hca->dmpt_base,
+				  dev_cap->dmpt_entry_sz,
+				  dev->caps.num_mpts,
+				  dev->caps.reserved_mrws, 1, 1);
+	if (err) {
+		mlx4_err(dev, "Failed to map dMPT context memory, aborting\n");
+		goto err_unmap_mtt;
+	}
+
+	err = mlx4_init_icm_table(dev, &priv->qp_table.qp_table,
+				  init_hca->qpc_base,
+				  dev_cap->qpc_entry_sz,
+				  dev->caps.num_qps,
+				  dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW],
+				  0, 0);
+	if (err) {
+		mlx4_err(dev, "Failed to map QP context memory, aborting\n");
+		goto err_unmap_dmpt;
+	}
+
+	err = mlx4_init_icm_table(dev, &priv->qp_table.auxc_table,
+				  init_hca->auxc_base,
+				  dev_cap->aux_entry_sz,
+				  dev->caps.num_qps,
+				  dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW],
+				  0, 0);
+	if (err) {
+		mlx4_err(dev, "Failed to map AUXC context memory, aborting\n");
+		goto err_unmap_qp;
+	}
+
+	err = mlx4_init_icm_table(dev, &priv->qp_table.altc_table,
+				  init_hca->altc_base,
+				  dev_cap->altc_entry_sz,
+				  dev->caps.num_qps,
+				  dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW],
+				  0, 0);
+	if (err) {
+		mlx4_err(dev, "Failed to map ALTC context memory, aborting\n");
+		goto err_unmap_auxc;
+	}
+
+	err = mlx4_init_icm_table(dev, &priv->qp_table.rdmarc_table,
+				  init_hca->rdmarc_base,
+				  dev_cap->rdmarc_entry_sz << priv->qp_table.rdmarc_shift,
+				  dev->caps.num_qps,
+				  dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW],
+				  0, 0);
+	if (err) {
+		mlx4_err(dev, "Failed to map RDMARC context memory, aborting\n");
+		goto err_unmap_altc;
+	}
+
+	err = mlx4_init_icm_table(dev, &priv->cq_table.table,
+				  init_hca->cqc_base,
+				  dev_cap->cqc_entry_sz,
+				  dev->caps.num_cqs,
+				  dev->caps.reserved_cqs, 0, 0);
+	if (err) {
+		mlx4_err(dev, "Failed to map CQ context memory, aborting\n");
+		goto err_unmap_rdmarc;
+	}
+
+	err = mlx4_init_icm_table(dev, &priv->srq_table.table,
+				  init_hca->srqc_base,
+				  dev_cap->srq_entry_sz,
+				  dev->caps.num_srqs,
+				  dev->caps.reserved_srqs, 0, 0);
+	if (err) {
+		mlx4_err(dev, "Failed to map SRQ context memory, aborting\n");
+		goto err_unmap_cq;
+	}
+
+	/*
+	 * For flow steering device managed mode it is required to use
+	 * mlx4_init_icm_table. For B0 steering mode it's not strictly
+	 * required, but for simplicity just map the whole multicast
+	 * group table now.  The table isn't very big and it's a lot
+	 * easier than trying to track ref counts.
+	 */
+	err = mlx4_init_icm_table(dev, &priv->mcg_table.table,
+				  init_hca->mc_base,
+				  mlx4_get_mgm_entry_size(dev),
+				  dev->caps.num_mgms + dev->caps.num_amgms,
+				  dev->caps.num_mgms + dev->caps.num_amgms,
+				  0, 0);
+	if (err) {
+		mlx4_err(dev, "Failed to map MCG context memory, aborting\n");
+		goto err_unmap_srq;
+	}
+
+	return 0;
+
+err_unmap_srq:
+	mlx4_cleanup_icm_table(dev, &priv->srq_table.table);
+
+err_unmap_cq:
+	mlx4_cleanup_icm_table(dev, &priv->cq_table.table);
+
+err_unmap_rdmarc:
+	mlx4_cleanup_icm_table(dev, &priv->qp_table.rdmarc_table);
+
+err_unmap_altc:
+	mlx4_cleanup_icm_table(dev, &priv->qp_table.altc_table);
+
+err_unmap_auxc:
+	mlx4_cleanup_icm_table(dev, &priv->qp_table.auxc_table);
+
+err_unmap_qp:
+	mlx4_cleanup_icm_table(dev, &priv->qp_table.qp_table);
+
+err_unmap_dmpt:
+	mlx4_cleanup_icm_table(dev, &priv->mr_table.dmpt_table);
+
+err_unmap_mtt:
+	mlx4_cleanup_icm_table(dev, &priv->mr_table.mtt_table);
+
+err_unmap_eq:
+	mlx4_cleanup_icm_table(dev, &priv->eq_table.table);
+
+err_unmap_cmpt:
+	mlx4_cleanup_icm_table(dev, &priv->eq_table.cmpt_table);
+	mlx4_cleanup_icm_table(dev, &priv->cq_table.cmpt_table);
+	mlx4_cleanup_icm_table(dev, &priv->srq_table.cmpt_table);
+	mlx4_cleanup_icm_table(dev, &priv->qp_table.cmpt_table);
+
+err_unmap_aux:
+	mlx4_UNMAP_ICM_AUX(dev);
+
+err_free_aux:
+	mlx4_free_icm(dev, priv->fw.aux_icm, 0);
+
+	return err;
+}
+
+static void mlx4_free_icms(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	mlx4_cleanup_icm_table(dev, &priv->mcg_table.table);
+	mlx4_cleanup_icm_table(dev, &priv->srq_table.table);
+	mlx4_cleanup_icm_table(dev, &priv->cq_table.table);
+	mlx4_cleanup_icm_table(dev, &priv->qp_table.rdmarc_table);
+	mlx4_cleanup_icm_table(dev, &priv->qp_table.altc_table);
+	mlx4_cleanup_icm_table(dev, &priv->qp_table.auxc_table);
+	mlx4_cleanup_icm_table(dev, &priv->qp_table.qp_table);
+	mlx4_cleanup_icm_table(dev, &priv->mr_table.dmpt_table);
+	mlx4_cleanup_icm_table(dev, &priv->mr_table.mtt_table);
+	mlx4_cleanup_icm_table(dev, &priv->eq_table.table);
+	mlx4_cleanup_icm_table(dev, &priv->eq_table.cmpt_table);
+	mlx4_cleanup_icm_table(dev, &priv->cq_table.cmpt_table);
+	mlx4_cleanup_icm_table(dev, &priv->srq_table.cmpt_table);
+	mlx4_cleanup_icm_table(dev, &priv->qp_table.cmpt_table);
+
+	mlx4_UNMAP_ICM_AUX(dev);
+	mlx4_free_icm(dev, priv->fw.aux_icm, 0);
+}
+
+static void mlx4_slave_exit(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	mutex_lock(&priv->cmd.slave_cmd_mutex);
+	if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_RESET, 0, MLX4_COMM_TIME))
+		mlx4_warn(dev, "Failed to close slave function\n");
+	mutex_unlock(&priv->cmd.slave_cmd_mutex);
+}
+
+static int map_bf_area(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	resource_size_t bf_start;
+	resource_size_t bf_len;
+	int err = 0;
+
+	if (!dev->caps.bf_reg_size)
+		return -ENXIO;
+
+	bf_start = pci_resource_start(dev->pdev, 2) +
+			(dev->caps.num_uars << PAGE_SHIFT);
+	bf_len = pci_resource_len(dev->pdev, 2) -
+			(dev->caps.num_uars << PAGE_SHIFT);
+	priv->bf_mapping = io_mapping_create_wc(bf_start, bf_len);
+	if (!priv->bf_mapping)
+		err = -ENOMEM;
+
+	return err;
+}
+
+static void unmap_bf_area(struct mlx4_dev *dev)
+{
+	if (mlx4_priv(dev)->bf_mapping)
+		io_mapping_free(mlx4_priv(dev)->bf_mapping);
+}
+
+cycle_t mlx4_read_clock(struct mlx4_dev *dev)
+{
+	u32 clockhi, clocklo, clockhi1;
+	cycle_t cycles;
+	int i;
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	for (i = 0; i < 10; i++) {
+		clockhi = swab32(readl(priv->clock_mapping));
+		clocklo = swab32(readl(priv->clock_mapping + 4));
+		clockhi1 = swab32(readl(priv->clock_mapping));
+		if (clockhi == clockhi1)
+			break;
+	}
+
+	cycles = (u64) clockhi << 32 | (u64) clocklo;
+
+	return cycles;
+}
+EXPORT_SYMBOL_GPL(mlx4_read_clock);
+
+
+static int map_internal_clock(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	priv->clock_mapping =
+		ioremap(pci_resource_start(dev->pdev, priv->fw.clock_bar) +
+			priv->fw.clock_offset, MLX4_CLOCK_SIZE);
+
+	if (!priv->clock_mapping)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static void unmap_internal_clock(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	if (priv->clock_mapping)
+		iounmap(priv->clock_mapping);
+}
+
+static void mlx4_close_hca(struct mlx4_dev *dev)
+{
+	unmap_internal_clock(dev);
+	unmap_bf_area(dev);
+	if (mlx4_is_slave(dev))
+		mlx4_slave_exit(dev);
+	else {
+		mlx4_CLOSE_HCA(dev, 0);
+		mlx4_free_icms(dev);
+		mlx4_UNMAP_FA(dev);
+		mlx4_free_icm(dev, mlx4_priv(dev)->fw.fw_icm, 0);
+	}
+}
+
+static int mlx4_init_slave(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	u64 dma = (u64) priv->mfunc.vhcr_dma;
+	int ret_from_reset = 0;
+	u32 slave_read;
+	u32 cmd_channel_ver;
+
+	if (atomic_read(&pf_loading)) {
+		mlx4_warn(dev, "PF is not ready - Deferring probe\n");
+		return -EPROBE_DEFER;
+	}
+
+	mutex_lock(&priv->cmd.slave_cmd_mutex);
+	priv->cmd.max_cmds = 1;
+	mlx4_warn(dev, "Sending reset\n");
+	ret_from_reset = mlx4_comm_cmd(dev, MLX4_COMM_CMD_RESET, 0,
+				       MLX4_COMM_TIME);
+	/* if we are in the middle of flr the slave will try
+	 * NUM_OF_RESET_RETRIES times before leaving.*/
+	if (ret_from_reset) {
+		if (MLX4_DELAY_RESET_SLAVE == ret_from_reset) {
+			mlx4_warn(dev, "slave is currently in the middle of FLR - Deferring probe\n");
+			mutex_unlock(&priv->cmd.slave_cmd_mutex);
+			return -EPROBE_DEFER;
+		} else
+			goto err;
+	}
+
+	/* check the driver version - the slave I/F revision
+	 * must match the master's */
+	slave_read = swab32(readl(&priv->mfunc.comm->slave_read));
+	cmd_channel_ver = mlx4_comm_get_version();
+
+	if (MLX4_COMM_GET_IF_REV(cmd_channel_ver) !=
+		MLX4_COMM_GET_IF_REV(slave_read)) {
+		mlx4_err(dev, "slave driver version is not supported by the master\n");
+		goto err;
+	}
+
+	mlx4_warn(dev, "Sending vhcr0\n");
+	if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_VHCR0, dma >> 48,
+						    MLX4_COMM_TIME))
+		goto err;
+	if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_VHCR1, dma >> 32,
+						    MLX4_COMM_TIME))
+		goto err;
+	if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_VHCR2, dma >> 16,
+						    MLX4_COMM_TIME))
+		goto err;
+	if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_VHCR_EN, dma, MLX4_COMM_TIME))
+		goto err;
+
+	mutex_unlock(&priv->cmd.slave_cmd_mutex);
+	return 0;
+
+err:
+	mlx4_comm_cmd(dev, MLX4_COMM_CMD_RESET, 0, 0);
+	mutex_unlock(&priv->cmd.slave_cmd_mutex);
+	return -EIO;
+}
+
+static void mlx4_parav_master_pf_caps(struct mlx4_dev *dev)
+{
+	int i;
+
+	for (i = 1; i <= dev->caps.num_ports; i++) {
+		if (dev->caps.port_type[i] == MLX4_PORT_TYPE_ETH)
+			dev->caps.gid_table_len[i] =
+				mlx4_get_slave_num_gids(dev, 0, i);
+		else
+			dev->caps.gid_table_len[i] = 1;
+		dev->caps.pkey_table_len[i] =
+			dev->phys_caps.pkey_phys_table_len[i] - 1;
+	}
+}
+
+static int choose_log_fs_mgm_entry_size(int qp_per_entry)
+{
+	int i = MLX4_MIN_MGM_LOG_ENTRY_SIZE;
+
+	for (i = MLX4_MIN_MGM_LOG_ENTRY_SIZE; i <= MLX4_MAX_MGM_LOG_ENTRY_SIZE;
+	      i++) {
+		if (qp_per_entry <= 4 * ((1 << i) / 16 - 2))
+			break;
+	}
+
+	return (i <= MLX4_MAX_MGM_LOG_ENTRY_SIZE) ? i : -1;
+}
+
+static void choose_steering_mode(struct mlx4_dev *dev,
+				 struct mlx4_dev_cap *dev_cap)
+{
+	if (mlx4_log_num_mgm_entry_size == -1 &&
+	    dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_FS_EN &&
+	    (!mlx4_is_mfunc(dev) ||
+	     (dev_cap->fs_max_num_qp_per_entry >= (dev->num_vfs + 1))) &&
+	    choose_log_fs_mgm_entry_size(dev_cap->fs_max_num_qp_per_entry) >=
+		MLX4_MIN_MGM_LOG_ENTRY_SIZE) {
+		dev->oper_log_mgm_entry_size =
+			choose_log_fs_mgm_entry_size(dev_cap->fs_max_num_qp_per_entry);
+		dev->caps.steering_mode = MLX4_STEERING_MODE_DEVICE_MANAGED;
+		dev->caps.num_qp_per_mgm = dev_cap->fs_max_num_qp_per_entry;
+		dev->caps.fs_log_max_ucast_qp_range_size =
+			dev_cap->fs_log_max_ucast_qp_range_size;
+	} else {
+		if (dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_UC_STEER &&
+		    dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_MC_STEER)
+			dev->caps.steering_mode = MLX4_STEERING_MODE_B0;
+		else {
+			dev->caps.steering_mode = MLX4_STEERING_MODE_A0;
+
+			if (dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_UC_STEER ||
+			    dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_MC_STEER)
+				mlx4_warn(dev, "Must have both UC_STEER and MC_STEER flags set to use B0 steering - falling back to A0 steering mode\n");
+		}
+		dev->oper_log_mgm_entry_size =
+			mlx4_log_num_mgm_entry_size > 0 ?
+			mlx4_log_num_mgm_entry_size :
+			MLX4_DEFAULT_MGM_LOG_ENTRY_SIZE;
+		dev->caps.num_qp_per_mgm = mlx4_get_qp_per_mgm(dev);
+	}
+	mlx4_dbg(dev, "Steering mode is: %s, oper_log_mgm_entry_size = %d, modparam log_num_mgm_entry_size = %d\n",
+		 mlx4_steering_mode_str(dev->caps.steering_mode),
+		 dev->oper_log_mgm_entry_size,
+		 mlx4_log_num_mgm_entry_size);
+}
+
+static void choose_tunnel_offload_mode(struct mlx4_dev *dev,
+				       struct mlx4_dev_cap *dev_cap)
+{
+	if (dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED &&
+	    dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_VXLAN_OFFLOADS)
+		dev->caps.tunnel_offload_mode = MLX4_TUNNEL_OFFLOAD_MODE_VXLAN;
+	else
+		dev->caps.tunnel_offload_mode = MLX4_TUNNEL_OFFLOAD_MODE_NONE;
+
+	mlx4_dbg(dev, "Tunneling offload mode is: %s\n",  (dev->caps.tunnel_offload_mode
+		 == MLX4_TUNNEL_OFFLOAD_MODE_VXLAN) ? "vxlan" : "none");
+}
+
+static int mlx4_init_hca(struct mlx4_dev *dev)
+{
+	struct mlx4_priv	  *priv = mlx4_priv(dev);
+	struct mlx4_adapter	   adapter;
+	struct mlx4_dev_cap	   dev_cap;
+	struct mlx4_mod_stat_cfg   mlx4_cfg;
+	struct mlx4_profile	   profile;
+	struct mlx4_init_hca_param init_hca;
+	u64 icm_size;
+	int err;
+
+	if (!mlx4_is_slave(dev)) {
+		err = mlx4_QUERY_FW(dev);
+		if (err) {
+			if (err == -EACCES)
+				mlx4_info(dev, "non-primary physical function, skipping\n");
+			else
+				mlx4_err(dev, "QUERY_FW command failed, aborting\n");
+			return err;
+		}
+
+		err = mlx4_load_fw(dev);
+		if (err) {
+			mlx4_err(dev, "Failed to start FW, aborting\n");
+			return err;
+		}
+
+		mlx4_cfg.log_pg_sz_m = 1;
+		mlx4_cfg.log_pg_sz = 0;
+		err = mlx4_MOD_STAT_CFG(dev, &mlx4_cfg);
+		if (err)
+			mlx4_warn(dev, "Failed to override log_pg_sz parameter\n");
+
+		err = mlx4_dev_cap(dev, &dev_cap);
+		if (err) {
+			mlx4_err(dev, "QUERY_DEV_CAP command failed, aborting\n");
+			goto err_stop_fw;
+		}
+
+		choose_steering_mode(dev, &dev_cap);
+		choose_tunnel_offload_mode(dev, &dev_cap);
+
+		err = mlx4_get_phys_port_id(dev);
+		if (err)
+			mlx4_err(dev, "Fail to get physical port id\n");
+
+		if (mlx4_is_master(dev))
+			mlx4_parav_master_pf_caps(dev);
+
+		if (mlx4_low_memory_profile()) {
+			mlx4_info(dev, "Running from within kdump kernel. Using low memory profile\n");
+			profile = low_mem_profile;
+		} else {
+			profile = default_profile;
+		}
+		if (dev->caps.steering_mode ==
+		    MLX4_STEERING_MODE_DEVICE_MANAGED)
+			profile.num_mcg = MLX4_FS_NUM_MCG;
+
+		icm_size = mlx4_make_profile(dev, &profile, &dev_cap,
+					     &init_hca);
+		if ((long long) icm_size < 0) {
+			err = icm_size;
+			goto err_stop_fw;
+		}
+
+		dev->caps.max_fmr_maps = (1 << (32 - ilog2(dev->caps.num_mpts))) - 1;
+
+		init_hca.log_uar_sz = ilog2(dev->caps.num_uars);
+		init_hca.uar_page_sz = PAGE_SHIFT - 12;
+		init_hca.mw_enabled = 0;
+		if (dev->caps.flags & MLX4_DEV_CAP_FLAG_MEM_WINDOW ||
+		    dev->caps.bmme_flags & MLX4_BMME_FLAG_TYPE_2_WIN)
+			init_hca.mw_enabled = INIT_HCA_TPT_MW_ENABLE;
+
+		err = mlx4_init_icm(dev, &dev_cap, &init_hca, icm_size);
+		if (err)
+			goto err_stop_fw;
+
+		err = mlx4_INIT_HCA(dev, &init_hca);
+		if (err) {
+			mlx4_err(dev, "INIT_HCA command failed, aborting\n");
+			goto err_free_icm;
+		}
+		/*
+		 * If TS is supported by FW
+		 * read HCA frequency by QUERY_HCA command
+		 */
+		if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_TS) {
+			memset(&init_hca, 0, sizeof(init_hca));
+			err = mlx4_QUERY_HCA(dev, &init_hca);
+			if (err) {
+				mlx4_err(dev, "QUERY_HCA command failed, disable timestamp\n");
+				dev->caps.flags2 &= ~MLX4_DEV_CAP_FLAG2_TS;
+			} else {
+				dev->caps.hca_core_clock =
+					init_hca.hca_core_clock;
+			}
+
+			/* In case we got HCA frequency 0 - disable timestamping
+			 * to avoid dividing by zero
+			 */
+			if (!dev->caps.hca_core_clock) {
+				dev->caps.flags2 &= ~MLX4_DEV_CAP_FLAG2_TS;
+				mlx4_err(dev,
+					 "HCA frequency is 0 - timestamping is not supported\n");
+			} else if (map_internal_clock(dev)) {
+				/*
+				 * Map internal clock,
+				 * in case of failure disable timestamping
+				 */
+				dev->caps.flags2 &= ~MLX4_DEV_CAP_FLAG2_TS;
+				mlx4_err(dev, "Failed to map internal clock. Timestamping is not supported\n");
+			}
+		}
+	} else {
+		err = mlx4_init_slave(dev);
+		if (err) {
+			if (err != -EPROBE_DEFER)
+				mlx4_err(dev, "Failed to initialize slave\n");
+			return err;
+		}
+
+		err = mlx4_slave_cap(dev);
+		if (err) {
+			mlx4_err(dev, "Failed to obtain slave caps\n");
+			goto err_close;
+		}
+	}
+
+	if (map_bf_area(dev))
+		mlx4_dbg(dev, "Failed to map blue flame area\n");
+
+	/*Only the master set the ports, all the rest got it from it.*/
+	if (!mlx4_is_slave(dev))
+		mlx4_set_port_mask(dev);
+
+	err = mlx4_QUERY_ADAPTER(dev, &adapter);
+	if (err) {
+		mlx4_err(dev, "QUERY_ADAPTER command failed, aborting\n");
+		goto unmap_bf;
+	}
+
+	priv->eq_table.inta_pin = adapter.inta_pin;
+	memcpy(dev->board_id, adapter.board_id, sizeof dev->board_id);
+
+	return 0;
+
+unmap_bf:
+	unmap_internal_clock(dev);
+	unmap_bf_area(dev);
+
+	if (mlx4_is_slave(dev)) {
+		kfree(dev->caps.qp0_qkey);
+		kfree(dev->caps.qp0_tunnel);
+		kfree(dev->caps.qp0_proxy);
+		kfree(dev->caps.qp1_tunnel);
+		kfree(dev->caps.qp1_proxy);
+	}
+
+err_close:
+	if (mlx4_is_slave(dev))
+		mlx4_slave_exit(dev);
+	else
+		mlx4_CLOSE_HCA(dev, 0);
+
+err_free_icm:
+	if (!mlx4_is_slave(dev))
+		mlx4_free_icms(dev);
+
+err_stop_fw:
+	if (!mlx4_is_slave(dev)) {
+		mlx4_UNMAP_FA(dev);
+		mlx4_free_icm(dev, priv->fw.fw_icm, 0);
+	}
+	return err;
+}
+
+static int mlx4_init_counters_table(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int nent;
+
+	if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_COUNTERS))
+		return -ENOENT;
+
+	nent = dev->caps.max_counters;
+	return mlx4_bitmap_init(&priv->counters_bitmap, nent, nent - 1, 0, 0);
+}
+
+static void mlx4_cleanup_counters_table(struct mlx4_dev *dev)
+{
+	mlx4_bitmap_cleanup(&mlx4_priv(dev)->counters_bitmap);
+}
+
+int __mlx4_counter_alloc(struct mlx4_dev *dev, u32 *idx)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_COUNTERS))
+		return -ENOENT;
+
+	*idx = mlx4_bitmap_alloc(&priv->counters_bitmap);
+	if (*idx == -1)
+		return -ENOMEM;
+
+	return 0;
+}
+
+int mlx4_counter_alloc(struct mlx4_dev *dev, u32 *idx)
+{
+	u64 out_param;
+	int err;
+
+	if (mlx4_is_mfunc(dev)) {
+		err = mlx4_cmd_imm(dev, 0, &out_param, RES_COUNTER,
+				   RES_OP_RESERVE, MLX4_CMD_ALLOC_RES,
+				   MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED);
+		if (!err)
+			*idx = get_param_l(&out_param);
+
+		return err;
+	}
+	return __mlx4_counter_alloc(dev, idx);
+}
+EXPORT_SYMBOL_GPL(mlx4_counter_alloc);
+
+void __mlx4_counter_free(struct mlx4_dev *dev, u32 idx)
+{
+	mlx4_bitmap_free(&mlx4_priv(dev)->counters_bitmap, idx, MLX4_USE_RR);
+	return;
+}
+
+void mlx4_counter_free(struct mlx4_dev *dev, u32 idx)
+{
+	u64 in_param = 0;
+
+	if (mlx4_is_mfunc(dev)) {
+		set_param_l(&in_param, idx);
+		mlx4_cmd(dev, in_param, RES_COUNTER, RES_OP_RESERVE,
+			 MLX4_CMD_FREE_RES, MLX4_CMD_TIME_CLASS_A,
+			 MLX4_CMD_WRAPPED);
+		return;
+	}
+	__mlx4_counter_free(dev, idx);
+}
+EXPORT_SYMBOL_GPL(mlx4_counter_free);
+
+static int mlx4_setup_hca(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int err;
+	int port;
+	__be32 ib_port_default_caps;
+
+	err = mlx4_init_uar_table(dev);
+	if (err) {
+		mlx4_err(dev, "Failed to initialize user access region table, aborting\n");
+		 return err;
+	}
+
+	err = mlx4_uar_alloc(dev, &priv->driver_uar);
+	if (err) {
+		mlx4_err(dev, "Failed to allocate driver access region, aborting\n");
+		goto err_uar_table_free;
+	}
+
+	priv->kar = ioremap((phys_addr_t) priv->driver_uar.pfn << PAGE_SHIFT, PAGE_SIZE);
+	if (!priv->kar) {
+		mlx4_err(dev, "Couldn't map kernel access region, aborting\n");
+		err = -ENOMEM;
+		goto err_uar_free;
+	}
+
+	err = mlx4_init_pd_table(dev);
+	if (err) {
+		mlx4_err(dev, "Failed to initialize protection domain table, aborting\n");
+		goto err_kar_unmap;
+	}
+
+	err = mlx4_init_xrcd_table(dev);
+	if (err) {
+		mlx4_err(dev, "Failed to initialize reliable connection domain table, aborting\n");
+		goto err_pd_table_free;
+	}
+
+	err = mlx4_init_mr_table(dev);
+	if (err) {
+		mlx4_err(dev, "Failed to initialize memory region table, aborting\n");
+		goto err_xrcd_table_free;
+	}
+
+	if (!mlx4_is_slave(dev)) {
+		err = mlx4_init_mcg_table(dev);
+		if (err) {
+			mlx4_err(dev, "Failed to initialize multicast group table, aborting\n");
+			goto err_mr_table_free;
+		}
+		err = mlx4_config_mad_demux(dev);
+		if (err) {
+			mlx4_err(dev, "Failed in config_mad_demux, aborting\n");
+			goto err_mcg_table_free;
+		}
+	}
+
+	err = mlx4_init_eq_table(dev);
+	if (err) {
+		mlx4_err(dev, "Failed to initialize event queue table, aborting\n");
+		goto err_mcg_table_free;
+	}
+
+	err = mlx4_cmd_use_events(dev);
+	if (err) {
+		mlx4_err(dev, "Failed to switch to event-driven firmware commands, aborting\n");
+		goto err_eq_table_free;
+	}
+
+	err = mlx4_NOP(dev);
+	if (err) {
+		if (dev->flags & MLX4_FLAG_MSI_X) {
+			mlx4_warn(dev, "NOP command failed to generate MSI-X interrupt IRQ %d)\n",
+				  priv->eq_table.eq[dev->caps.num_comp_vectors].irq);
+			mlx4_warn(dev, "Trying again without MSI-X\n");
+		} else {
+			mlx4_err(dev, "NOP command failed to generate interrupt (IRQ %d), aborting\n",
+				 priv->eq_table.eq[dev->caps.num_comp_vectors].irq);
+			mlx4_err(dev, "BIOS or ACPI interrupt routing problem?\n");
+		}
+
+		goto err_cmd_poll;
+	}
+
+	mlx4_dbg(dev, "NOP command IRQ test passed\n");
+
+	err = mlx4_init_cq_table(dev);
+	if (err) {
+		mlx4_err(dev, "Failed to initialize completion queue table, aborting\n");
+		goto err_cmd_poll;
+	}
+
+	err = mlx4_init_srq_table(dev);
+	if (err) {
+		mlx4_err(dev, "Failed to initialize shared receive queue table, aborting\n");
+		goto err_cq_table_free;
+	}
+
+	err = mlx4_init_qp_table(dev);
+	if (err) {
+		mlx4_err(dev, "Failed to initialize queue pair table, aborting\n");
+		goto err_srq_table_free;
+	}
+
+	err = mlx4_init_counters_table(dev);
+	if (err && err != -ENOENT) {
+		mlx4_err(dev, "Failed to initialize counters table, aborting\n");
+		goto err_qp_table_free;
+	}
+
+	if (!mlx4_is_slave(dev)) {
+		for (port = 1; port <= dev->caps.num_ports; port++) {
+			ib_port_default_caps = 0;
+			err = mlx4_get_port_ib_caps(dev, port,
+						    &ib_port_default_caps);
+			if (err)
+				mlx4_warn(dev, "failed to get port %d default ib capabilities (%d). Continuing with caps = 0\n",
+					  port, err);
+			dev->caps.ib_port_def_cap[port] = ib_port_default_caps;
+
+			/* initialize per-slave default ib port capabilities */
+			if (mlx4_is_master(dev)) {
+				int i;
+				for (i = 0; i < dev->num_slaves; i++) {
+					if (i == mlx4_master_func_num(dev))
+						continue;
+					priv->mfunc.master.slave_state[i].ib_cap_mask[port] =
+						ib_port_default_caps;
+				}
+			}
+
+			if (mlx4_is_mfunc(dev))
+				dev->caps.port_ib_mtu[port] = IB_MTU_2048;
+			else
+				dev->caps.port_ib_mtu[port] = IB_MTU_4096;
+
+			err = mlx4_SET_PORT(dev, port, mlx4_is_master(dev) ?
+					    dev->caps.pkey_table_len[port] : -1);
+			if (err) {
+				mlx4_err(dev, "Failed to set port %d, aborting\n",
+					 port);
+				goto err_counters_table_free;
+			}
+		}
+	}
+
+	return 0;
+
+err_counters_table_free:
+	mlx4_cleanup_counters_table(dev);
+
+err_qp_table_free:
+	mlx4_cleanup_qp_table(dev);
+
+err_srq_table_free:
+	mlx4_cleanup_srq_table(dev);
+
+err_cq_table_free:
+	mlx4_cleanup_cq_table(dev);
+
+err_cmd_poll:
+	mlx4_cmd_use_polling(dev);
+
+err_eq_table_free:
+	mlx4_cleanup_eq_table(dev);
+
+err_mcg_table_free:
+	if (!mlx4_is_slave(dev))
+		mlx4_cleanup_mcg_table(dev);
+
+err_mr_table_free:
+	mlx4_cleanup_mr_table(dev);
+
+err_xrcd_table_free:
+	mlx4_cleanup_xrcd_table(dev);
+
+err_pd_table_free:
+	mlx4_cleanup_pd_table(dev);
+
+err_kar_unmap:
+	iounmap(priv->kar);
+
+err_uar_free:
+	mlx4_uar_free(dev, &priv->driver_uar);
+
+err_uar_table_free:
+	mlx4_cleanup_uar_table(dev);
+	return err;
+}
+
+static void mlx4_enable_msi_x(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct msix_entry *entries;
+	int nreq = min_t(int, dev->caps.num_ports *
+			 min_t(int, num_online_cpus() + 1,
+			       MAX_MSIX_P_PORT) + MSIX_LEGACY_SZ, MAX_MSIX);
+	int i;
+
+	if (msi_x) {
+		nreq = min_t(int, dev->caps.num_eqs - dev->caps.reserved_eqs,
+			     nreq);
+
+		entries = kcalloc(nreq, sizeof *entries, GFP_KERNEL);
+		if (!entries)
+			goto no_msi;
+
+		for (i = 0; i < nreq; ++i)
+			entries[i].entry = i;
+
+		nreq = pci_enable_msix_range(dev->pdev, entries, 2, nreq);
+
+		if (nreq < 0) {
+			kfree(entries);
+			goto no_msi;
+		} else if (nreq < MSIX_LEGACY_SZ +
+			   dev->caps.num_ports * MIN_MSIX_P_PORT) {
+			/*Working in legacy mode , all EQ's shared*/
+			dev->caps.comp_pool           = 0;
+			dev->caps.num_comp_vectors = nreq - 1;
+		} else {
+			dev->caps.comp_pool           = nreq - MSIX_LEGACY_SZ;
+			dev->caps.num_comp_vectors = MSIX_LEGACY_SZ - 1;
+		}
+		for (i = 0; i < nreq; ++i)
+			priv->eq_table.eq[i].irq = entries[i].vector;
+
+		dev->flags |= MLX4_FLAG_MSI_X;
+
+		kfree(entries);
+		return;
+	}
+
+no_msi:
+	dev->caps.num_comp_vectors = 1;
+	dev->caps.comp_pool	   = 0;
+
+	for (i = 0; i < 2; ++i)
+		priv->eq_table.eq[i].irq = dev->pdev->irq;
+}
+
+static int mlx4_init_port_info(struct mlx4_dev *dev, int port)
+{
+	struct mlx4_port_info *info = &mlx4_priv(dev)->port[port];
+	int err = 0;
+
+	info->dev = dev;
+	info->port = port;
+	if (!mlx4_is_slave(dev)) {
+		mlx4_init_mac_table(dev, &info->mac_table);
+		mlx4_init_vlan_table(dev, &info->vlan_table);
+		mlx4_init_roce_gid_table(dev, &info->gid_table);
+		info->base_qpn = mlx4_get_base_qpn(dev, port);
+	}
+
+	sprintf(info->dev_name, "mlx4_port%d", port);
+	info->port_attr.attr.name = info->dev_name;
+	if (mlx4_is_mfunc(dev))
+		info->port_attr.attr.mode = S_IRUGO;
+	else {
+		info->port_attr.attr.mode = S_IRUGO | S_IWUSR;
+		info->port_attr.store     = set_port_type;
+	}
+	info->port_attr.show      = show_port_type;
+	sysfs_attr_init(&info->port_attr.attr);
+
+	err = device_create_file(&dev->pdev->dev, &info->port_attr);
+	if (err) {
+		mlx4_err(dev, "Failed to create file for port %d\n", port);
+		info->port = -1;
+	}
+
+	sprintf(info->dev_mtu_name, "mlx4_port%d_mtu", port);
+	info->port_mtu_attr.attr.name = info->dev_mtu_name;
+	if (mlx4_is_mfunc(dev))
+		info->port_mtu_attr.attr.mode = S_IRUGO;
+	else {
+		info->port_mtu_attr.attr.mode = S_IRUGO | S_IWUSR;
+		info->port_mtu_attr.store     = set_port_ib_mtu;
+	}
+	info->port_mtu_attr.show      = show_port_ib_mtu;
+	sysfs_attr_init(&info->port_mtu_attr.attr);
+
+	err = device_create_file(&dev->pdev->dev, &info->port_mtu_attr);
+	if (err) {
+		mlx4_err(dev, "Failed to create mtu file for port %d\n", port);
+		device_remove_file(&info->dev->pdev->dev, &info->port_attr);
+		info->port = -1;
+	}
+
+	return err;
+}
+
+static void mlx4_cleanup_port_info(struct mlx4_port_info *info)
+{
+	if (info->port < 0)
+		return;
+
+	device_remove_file(&info->dev->pdev->dev, &info->port_attr);
+	device_remove_file(&info->dev->pdev->dev, &info->port_mtu_attr);
+}
+
+static int mlx4_init_steering(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int num_entries = dev->caps.num_ports;
+	int i, j;
+
+	priv->steer = kzalloc(sizeof(struct mlx4_steer) * num_entries, GFP_KERNEL);
+	if (!priv->steer)
+		return -ENOMEM;
+
+	for (i = 0; i < num_entries; i++)
+		for (j = 0; j < MLX4_NUM_STEERS; j++) {
+			INIT_LIST_HEAD(&priv->steer[i].promisc_qps[j]);
+			INIT_LIST_HEAD(&priv->steer[i].steer_entries[j]);
+		}
+	return 0;
+}
+
+static void mlx4_clear_steering(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_steer_index *entry, *tmp_entry;
+	struct mlx4_promisc_qp *pqp, *tmp_pqp;
+	int num_entries = dev->caps.num_ports;
+	int i, j;
+
+	for (i = 0; i < num_entries; i++) {
+		for (j = 0; j < MLX4_NUM_STEERS; j++) {
+			list_for_each_entry_safe(pqp, tmp_pqp,
+						 &priv->steer[i].promisc_qps[j],
+						 list) {
+				list_del(&pqp->list);
+				kfree(pqp);
+			}
+			list_for_each_entry_safe(entry, tmp_entry,
+						 &priv->steer[i].steer_entries[j],
+						 list) {
+				list_del(&entry->list);
+				list_for_each_entry_safe(pqp, tmp_pqp,
+							 &entry->duplicates,
+							 list) {
+					list_del(&pqp->list);
+					kfree(pqp);
+				}
+				kfree(entry);
+			}
+		}
+	}
+	kfree(priv->steer);
+}
+
+static int extended_func_num(struct pci_dev *pdev)
+{
+	return PCI_SLOT(pdev->devfn) * 8 + PCI_FUNC(pdev->devfn);
+}
+
+#define MLX4_OWNER_BASE	0x8069c
+#define MLX4_OWNER_SIZE	4
+
+static int mlx4_get_ownership(struct mlx4_dev *dev)
+{
+	void __iomem *owner;
+	u32 ret;
+
+	if (pci_channel_offline(dev->pdev))
+		return -EIO;
+
+	owner = ioremap(pci_resource_start(dev->pdev, 0) + MLX4_OWNER_BASE,
+			MLX4_OWNER_SIZE);
+	if (!owner) {
+		mlx4_err(dev, "Failed to obtain ownership bit\n");
+		return -ENOMEM;
+	}
+
+	ret = readl(owner);
+	iounmap(owner);
+	return (int) !!ret;
+}
+
+static void mlx4_free_ownership(struct mlx4_dev *dev)
+{
+	void __iomem *owner;
+
+	if (pci_channel_offline(dev->pdev))
+		return;
+
+	owner = ioremap(pci_resource_start(dev->pdev, 0) + MLX4_OWNER_BASE,
+			MLX4_OWNER_SIZE);
+	if (!owner) {
+		mlx4_err(dev, "Failed to obtain ownership bit\n");
+		return;
+	}
+	writel(0, owner);
+	msleep(1000);
+	iounmap(owner);
+}
+
+static int mlx4_load_one(struct pci_dev *pdev, int pci_dev_data,
+			 int total_vfs, int *nvfs, struct mlx4_priv *priv)
+{
+	struct mlx4_dev *dev;
+	unsigned sum = 0;
+	int err;
+	int port;
+	int i;
+	int existing_vfs = 0;
+
+	dev = &priv->dev;
+
+	INIT_LIST_HEAD(&priv->ctx_list);
+	spin_lock_init(&priv->ctx_lock);
+
+	mutex_init(&priv->port_mutex);
+
+	INIT_LIST_HEAD(&priv->pgdir_list);
+	mutex_init(&priv->pgdir_mutex);
+
+	INIT_LIST_HEAD(&priv->bf_list);
+	mutex_init(&priv->bf_mutex);
+
+	dev->rev_id = pdev->revision;
+	dev->numa_node = dev_to_node(&pdev->dev);
+
+	/* Detect if this device is a virtual function */
+	if (pci_dev_data & MLX4_PCI_DEV_IS_VF) {
+		mlx4_warn(dev, "Detected virtual function - running in slave mode\n");
+		dev->flags |= MLX4_FLAG_SLAVE;
+	} else {
+		/* We reset the device and enable SRIOV only for physical
+		 * devices.  Try to claim ownership on the device;
+		 * if already taken, skip -- do not allow multiple PFs */
+		err = mlx4_get_ownership(dev);
+		if (err) {
+			if (err < 0)
+				return err;
+			else {
+				mlx4_warn(dev, "Multiple PFs not yet supported - Skipping PF\n");
+				return -EINVAL;
+			}
+		}
+
+		if (total_vfs) {
+			mlx4_warn(dev, "Enabling SR-IOV with %d VFs\n",
+				  total_vfs);
+			dev->dev_vfs = kzalloc(
+				total_vfs * sizeof(*dev->dev_vfs),
+				GFP_KERNEL);
+			if (NULL == dev->dev_vfs) {
+				mlx4_err(dev, "Failed to allocate memory for VFs\n");
+				err = -ENOMEM;
+				goto err_free_own;
+			} else {
+				atomic_inc(&pf_loading);
+				existing_vfs = pci_num_vf(pdev);
+				if (existing_vfs) {
+					err = 0;
+					if (existing_vfs != total_vfs)
+						mlx4_err(dev, "SR-IOV was already enabled, but with num_vfs (%d) different than requested (%d)\n",
+							 existing_vfs, total_vfs);
+				} else {
+					err = pci_enable_sriov(pdev, total_vfs);
+				}
+				if (err) {
+					mlx4_err(dev, "Failed to enable SR-IOV, continuing without SR-IOV (err = %d)\n",
+						 err);
+					atomic_dec(&pf_loading);
+				} else {
+					mlx4_warn(dev, "Running in master mode\n");
+					dev->flags |= MLX4_FLAG_SRIOV |
+						MLX4_FLAG_MASTER;
+					dev->num_vfs = total_vfs;
+				}
+			}
+		}
+
+		atomic_set(&priv->opreq_count, 0);
+		INIT_WORK(&priv->opreq_task, mlx4_opreq_action);
+
+		/*
+		 * Now reset the HCA before we touch the PCI capabilities or
+		 * attempt a firmware command, since a boot ROM may have left
+		 * the HCA in an undefined state.
+		 */
+		err = mlx4_reset(dev);
+		if (err) {
+			mlx4_err(dev, "Failed to reset HCA, aborting\n");
+			goto err_sriov;
+		}
+	}
+
+slave_start:
+	err = mlx4_cmd_init(dev);
+	if (err) {
+		mlx4_err(dev, "Failed to init command interface, aborting\n");
+		goto err_sriov;
+	}
+
+	/* In slave functions, the communication channel must be initialized
+	 * before posting commands. Also, init num_slaves before calling
+	 * mlx4_init_hca */
+	if (mlx4_is_mfunc(dev)) {
+		if (mlx4_is_master(dev))
+			dev->num_slaves = MLX4_MAX_NUM_SLAVES;
+		else {
+			dev->num_slaves = 0;
+			err = mlx4_multi_func_init(dev);
+			if (err) {
+				mlx4_err(dev, "Failed to init slave mfunc interface, aborting\n");
+				goto err_cmd;
+			}
+		}
+	}
+
+	err = mlx4_init_hca(dev);
+	if (err) {
+		if (err == -EACCES) {
+			/* Not primary Physical function
+			 * Running in slave mode */
+			mlx4_cmd_cleanup(dev);
+			dev->flags |= MLX4_FLAG_SLAVE;
+			dev->flags &= ~MLX4_FLAG_MASTER;
+			goto slave_start;
+		} else
+			goto err_mfunc;
+	}
+
+	/* check if the device is functioning at its maximum possible speed.
+	 * No return code for this call, just warn the user in case of PCI
+	 * express device capabilities are under-satisfied by the bus.
+	 */
+	if (!mlx4_is_slave(dev))
+		mlx4_check_pcie_caps(dev);
+
+	/* In master functions, the communication channel must be initialized
+	 * after obtaining its address from fw */
+	if (mlx4_is_master(dev)) {
+		int ib_ports = 0;
+
+		mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB)
+			ib_ports++;
+
+		if (ib_ports &&
+		    (num_vfs_argc > 1 || probe_vfs_argc > 1)) {
+			mlx4_err(dev,
+				 "Invalid syntax of num_vfs/probe_vfs with IB port - single port VFs syntax is only supported when all ports are configured as ethernet\n");
+			err = -EINVAL;
+			goto err_close;
+		}
+		if (dev->caps.num_ports < 2 &&
+		    num_vfs_argc > 1) {
+			err = -EINVAL;
+			mlx4_err(dev,
+				 "Error: Trying to configure VFs on port 2, but HCA has only %d physical ports\n",
+				 dev->caps.num_ports);
+			goto err_close;
+		}
+		memcpy(dev->nvfs, nvfs, sizeof(dev->nvfs));
+
+		for (i = 0; i < sizeof(dev->nvfs)/sizeof(dev->nvfs[0]); i++) {
+			unsigned j;
+
+			for (j = 0; j < dev->nvfs[i]; ++sum, ++j) {
+				dev->dev_vfs[sum].min_port = i < 2 ? i + 1 : 1;
+				dev->dev_vfs[sum].n_ports = i < 2 ? 1 :
+					dev->caps.num_ports;
+			}
+		}
+
+		/* In master functions, the communication channel
+		 * must be initialized after obtaining its address from fw
+		 */
+		err = mlx4_multi_func_init(dev);
+		if (err) {
+			mlx4_err(dev, "Failed to init master mfunc interface, aborting.\n");
+			goto err_close;
+		}
+	}
+
+	err = mlx4_alloc_eq_table(dev);
+	if (err)
+		goto err_master_mfunc;
+
+	priv->msix_ctl.pool_bm = 0;
+	mutex_init(&priv->msix_ctl.pool_lock);
+
+	mlx4_enable_msi_x(dev);
+	if ((mlx4_is_mfunc(dev)) &&
+	    !(dev->flags & MLX4_FLAG_MSI_X)) {
+		err = -ENOSYS;
+		mlx4_err(dev, "INTx is not supported in multi-function mode, aborting\n");
+		goto err_free_eq;
+	}
+
+	if (!mlx4_is_slave(dev)) {
+		err = mlx4_init_steering(dev);
+		if (err)
+			goto err_disable_msix;
+	}
+
+	err = mlx4_setup_hca(dev);
+	if (err == -EBUSY && (dev->flags & MLX4_FLAG_MSI_X) &&
+	    !mlx4_is_mfunc(dev)) {
+		dev->flags &= ~MLX4_FLAG_MSI_X;
+		dev->caps.num_comp_vectors = 1;
+		dev->caps.comp_pool	   = 0;
+		pci_disable_msix(pdev);
+		err = mlx4_setup_hca(dev);
+	}
+
+	if (err)
+		goto err_steer;
+
+	mlx4_init_quotas(dev);
+
+	for (port = 1; port <= dev->caps.num_ports; port++) {
+		err = mlx4_init_port_info(dev, port);
+		if (err)
+			goto err_port;
+	}
+
+	err = mlx4_register_device(dev);
+	if (err)
+		goto err_port;
+
+	mlx4_request_modules(dev);
+
+	mlx4_sense_init(dev);
+	mlx4_start_sense(dev);
+
+	priv->removed = 0;
+
+	if (mlx4_is_master(dev) && dev->num_vfs)
+		atomic_dec(&pf_loading);
+
+	return 0;
+
+err_port:
+	for (--port; port >= 1; --port)
+		mlx4_cleanup_port_info(&priv->port[port]);
+
+	mlx4_cleanup_counters_table(dev);
+	mlx4_cleanup_qp_table(dev);
+	mlx4_cleanup_srq_table(dev);
+	mlx4_cleanup_cq_table(dev);
+	mlx4_cmd_use_polling(dev);
+	mlx4_cleanup_eq_table(dev);
+	mlx4_cleanup_mcg_table(dev);
+	mlx4_cleanup_mr_table(dev);
+	mlx4_cleanup_xrcd_table(dev);
+	mlx4_cleanup_pd_table(dev);
+	mlx4_cleanup_uar_table(dev);
+
+err_steer:
+	if (!mlx4_is_slave(dev))
+		mlx4_clear_steering(dev);
+
+err_disable_msix:
+	if (dev->flags & MLX4_FLAG_MSI_X)
+		pci_disable_msix(pdev);
+
+err_free_eq:
+	mlx4_free_eq_table(dev);
+
+err_master_mfunc:
+	if (mlx4_is_master(dev))
+		mlx4_multi_func_cleanup(dev);
+
+	if (mlx4_is_slave(dev)) {
+		kfree(dev->caps.qp0_qkey);
+		kfree(dev->caps.qp0_tunnel);
+		kfree(dev->caps.qp0_proxy);
+		kfree(dev->caps.qp1_tunnel);
+		kfree(dev->caps.qp1_proxy);
+	}
+
+err_close:
+	mlx4_close_hca(dev);
+
+err_mfunc:
+	if (mlx4_is_slave(dev))
+		mlx4_multi_func_cleanup(dev);
+
+err_cmd:
+	mlx4_cmd_cleanup(dev);
+
+err_sriov:
+	if (dev->flags & MLX4_FLAG_SRIOV && !existing_vfs)
+		pci_disable_sriov(pdev);
+
+	if (mlx4_is_master(dev) && dev->num_vfs)
+		atomic_dec(&pf_loading);
+
+	kfree(priv->dev.dev_vfs);
+
+err_free_own:
+	if (!mlx4_is_slave(dev))
+		mlx4_free_ownership(dev);
+
+	return err;
+}
+
+static int __mlx4_init_one(struct pci_dev *pdev, int pci_dev_data,
+			   struct mlx4_priv *priv)
+{
+	int err;
+	int nvfs[MLX4_MAX_PORTS + 1] = {0, 0, 0};
+	int prb_vf[MLX4_MAX_PORTS + 1] = {0, 0, 0};
+	const int param_map[MLX4_MAX_PORTS + 1][MLX4_MAX_PORTS + 1] = {
+		{2, 0, 0}, {0, 1, 2}, {0, 1, 2} };
+	unsigned total_vfs = 0;
+	unsigned int i;
+
+	pr_info(DRV_NAME ": Initializing %s\n", pci_name(pdev));
+
+	err = pci_enable_device(pdev);
+	if (err) {
+		dev_err(&pdev->dev, "Cannot enable PCI device, aborting\n");
+		return err;
+	}
+
+	/* Due to requirement that all VFs and the PF are *guaranteed* 2 MACS
+	 * per port, we must limit the number of VFs to 63 (since their are
+	 * 128 MACs)
+	 */
+	for (i = 0; i < sizeof(nvfs)/sizeof(nvfs[0]) && i < num_vfs_argc;
+	     total_vfs += nvfs[param_map[num_vfs_argc - 1][i]], i++) {
+		nvfs[param_map[num_vfs_argc - 1][i]] = num_vfs[i];
+		if (nvfs[i] < 0) {
+			dev_err(&pdev->dev, "num_vfs module parameter cannot be negative\n");
+			err = -EINVAL;
+			goto err_disable_pdev;
+		}
+	}
+	for (i = 0; i < sizeof(prb_vf)/sizeof(prb_vf[0]) && i < probe_vfs_argc;
+	     i++) {
+		prb_vf[param_map[probe_vfs_argc - 1][i]] = probe_vf[i];
+		if (prb_vf[i] < 0 || prb_vf[i] > nvfs[i]) {
+			dev_err(&pdev->dev, "probe_vf module parameter cannot be negative or greater than num_vfs\n");
+			err = -EINVAL;
+			goto err_disable_pdev;
+		}
+	}
+	if (total_vfs >= MLX4_MAX_NUM_VF) {
+		dev_err(&pdev->dev,
+			"Requested more VF's (%d) than allowed (%d)\n",
+			total_vfs, MLX4_MAX_NUM_VF - 1);
+		err = -EINVAL;
+		goto err_disable_pdev;
+	}
+
+	for (i = 0; i < MLX4_MAX_PORTS; i++) {
+		if (nvfs[i] + nvfs[2] >= MLX4_MAX_NUM_VF_P_PORT) {
+			dev_err(&pdev->dev,
+				"Requested more VF's (%d) for port (%d) than allowed (%d)\n",
+				nvfs[i] + nvfs[2], i + 1,
+				MLX4_MAX_NUM_VF_P_PORT - 1);
+			err = -EINVAL;
+			goto err_disable_pdev;
+		}
+	}
+
+	/* Check for BARs. */
+	if (!(pci_dev_data & MLX4_PCI_DEV_IS_VF) &&
+	    !(pci_resource_flags(pdev, 0) & IORESOURCE_MEM)) {
+		dev_err(&pdev->dev, "Missing DCS, aborting (driver_data: 0x%x, pci_resource_flags(pdev, 0):0x%lx)\n",
+			pci_dev_data, pci_resource_flags(pdev, 0));
+		err = -ENODEV;
+		goto err_disable_pdev;
+	}
+	if (!(pci_resource_flags(pdev, 2) & IORESOURCE_MEM)) {
+		dev_err(&pdev->dev, "Missing UAR, aborting\n");
+		err = -ENODEV;
+		goto err_disable_pdev;
+	}
+
+	err = pci_request_regions(pdev, DRV_NAME);
+	if (err) {
+		dev_err(&pdev->dev, "Couldn't get PCI resources, aborting\n");
+		goto err_disable_pdev;
+	}
+
+	pci_set_master(pdev);
+
+	err = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
+	if (err) {
+		dev_warn(&pdev->dev, "Warning: couldn't set 64-bit PCI DMA mask\n");
+		err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
+		if (err) {
+			dev_err(&pdev->dev, "Can't set PCI DMA mask, aborting\n");
+			goto err_release_regions;
+		}
+	}
+	err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
+	if (err) {
+		dev_warn(&pdev->dev, "Warning: couldn't set 64-bit consistent PCI DMA mask\n");
+		err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32));
+		if (err) {
+			dev_err(&pdev->dev, "Can't set consistent PCI DMA mask, aborting\n");
+			goto err_release_regions;
+		}
+	}
+
+	/* Allow large DMA segments, up to the firmware limit of 1 GB */
+	dma_set_max_seg_size(&pdev->dev, 1024 * 1024 * 1024);
+	/* Detect if this device is a virtual function */
+	if (pci_dev_data & MLX4_PCI_DEV_IS_VF) {
+		/* When acting as pf, we normally skip vfs unless explicitly
+		 * requested to probe them.
+		 */
+		if (total_vfs) {
+			unsigned vfs_offset = 0;
+
+			for (i = 0; i < sizeof(nvfs)/sizeof(nvfs[0]) &&
+			     vfs_offset + nvfs[i] < extended_func_num(pdev);
+			     vfs_offset += nvfs[i], i++)
+				;
+			if (i == sizeof(nvfs)/sizeof(nvfs[0])) {
+				err = -ENODEV;
+				goto err_release_regions;
+			}
+			if ((extended_func_num(pdev) - vfs_offset)
+			    > prb_vf[i]) {
+				dev_warn(&pdev->dev, "Skipping virtual function:%d\n",
+					 extended_func_num(pdev));
+				err = -ENODEV;
+				goto err_release_regions;
+			}
+		}
+	}
+
+	err = mlx4_load_one(pdev, pci_dev_data, total_vfs, nvfs, priv);
+	if (err)
+		goto err_release_regions;
+	return 0;
+
+err_release_regions:
+	pci_release_regions(pdev);
+
+err_disable_pdev:
+	pci_disable_device(pdev);
+	pci_set_drvdata(pdev, NULL);
+	return err;
+}
+
+static int mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+	struct mlx4_priv *priv;
+	struct mlx4_dev *dev;
+	int ret;
+
+	printk_once(KERN_INFO "%s", mlx4_version);
+
+	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+	if (!priv)
+		return -ENOMEM;
+
+	dev       = &priv->dev;
+	dev->pdev = pdev;
+	pci_set_drvdata(pdev, dev);
+	priv->pci_dev_data = id->driver_data;
+
+	ret =  __mlx4_init_one(pdev, id->driver_data, priv);
+	if (ret)
+		kfree(priv);
+
+	return ret;
+}
+
+static void mlx4_unload_one(struct pci_dev *pdev)
+{
+	struct mlx4_dev  *dev  = pci_get_drvdata(pdev);
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int               pci_dev_data;
+	int p;
+	int active_vfs = 0;
+
+	if (priv->removed)
+		return;
+
+	pci_dev_data = priv->pci_dev_data;
+
+	/* Disabling SR-IOV is not allowed while there are active vf's */
+	if (mlx4_is_master(dev)) {
+		active_vfs = mlx4_how_many_lives_vf(dev);
+		if (active_vfs) {
+			pr_warn("Removing PF when there are active VF's !!\n");
+			pr_warn("Will not disable SR-IOV.\n");
+		}
+	}
+	mlx4_stop_sense(dev);
+	mlx4_unregister_device(dev);
+
+	for (p = 1; p <= dev->caps.num_ports; p++) {
+		mlx4_cleanup_port_info(&priv->port[p]);
+		mlx4_CLOSE_PORT(dev, p);
+	}
+
+	if (mlx4_is_master(dev))
+		mlx4_free_resource_tracker(dev,
+					   RES_TR_FREE_SLAVES_ONLY);
+
+	mlx4_cleanup_counters_table(dev);
+	mlx4_cleanup_qp_table(dev);
+	mlx4_cleanup_srq_table(dev);
+	mlx4_cleanup_cq_table(dev);
+	mlx4_cmd_use_polling(dev);
+	mlx4_cleanup_eq_table(dev);
+	mlx4_cleanup_mcg_table(dev);
+	mlx4_cleanup_mr_table(dev);
+	mlx4_cleanup_xrcd_table(dev);
+	mlx4_cleanup_pd_table(dev);
+
+	if (mlx4_is_master(dev))
+		mlx4_free_resource_tracker(dev,
+					   RES_TR_FREE_STRUCTS_ONLY);
+
+	iounmap(priv->kar);
+	mlx4_uar_free(dev, &priv->driver_uar);
+	mlx4_cleanup_uar_table(dev);
+	if (!mlx4_is_slave(dev))
+		mlx4_clear_steering(dev);
+	mlx4_free_eq_table(dev);
+	if (mlx4_is_master(dev))
+		mlx4_multi_func_cleanup(dev);
+	mlx4_close_hca(dev);
+	if (mlx4_is_slave(dev))
+		mlx4_multi_func_cleanup(dev);
+	mlx4_cmd_cleanup(dev);
+
+	if (dev->flags & MLX4_FLAG_MSI_X)
+		pci_disable_msix(pdev);
+	if (dev->flags & MLX4_FLAG_SRIOV && !active_vfs) {
+		mlx4_warn(dev, "Disabling SR-IOV\n");
+		pci_disable_sriov(pdev);
+		dev->num_vfs = 0;
+	}
+
+	if (!mlx4_is_slave(dev))
+		mlx4_free_ownership(dev);
+
+	kfree(dev->caps.qp0_qkey);
+	kfree(dev->caps.qp0_tunnel);
+	kfree(dev->caps.qp0_proxy);
+	kfree(dev->caps.qp1_tunnel);
+	kfree(dev->caps.qp1_proxy);
+	kfree(dev->dev_vfs);
+
+	memset(priv, 0, sizeof(*priv));
+	priv->pci_dev_data = pci_dev_data;
+	priv->removed = 1;
+}
+
+static void mlx4_remove_one(struct pci_dev *pdev)
+{
+	struct mlx4_dev  *dev  = pci_get_drvdata(pdev);
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	mlx4_unload_one(pdev);
+	pci_release_regions(pdev);
+	pci_disable_device(pdev);
+	kfree(priv);
+	pci_set_drvdata(pdev, NULL);
+}
+
+int mlx4_restart_one(struct pci_dev *pdev)
+{
+	struct mlx4_dev	 *dev  = pci_get_drvdata(pdev);
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int nvfs[MLX4_MAX_PORTS + 1] = {0, 0, 0};
+	int pci_dev_data, err, total_vfs;
+
+	pci_dev_data = priv->pci_dev_data;
+	total_vfs = dev->num_vfs;
+	memcpy(nvfs, dev->nvfs, sizeof(dev->nvfs));
+
+	mlx4_unload_one(pdev);
+	err = mlx4_load_one(pdev, pci_dev_data, total_vfs, nvfs, priv);
+	if (err) {
+		mlx4_err(dev, "%s: ERROR: mlx4_load_one failed, pci_name=%s, err=%d\n",
+			 __func__, pci_name(pdev), err);
+		return err;
+	}
+
+	return err;
+}
+
+static const struct pci_device_id mlx4_pci_table[] = {
+	/* MT25408 "Hermon" SDR */
+	{ PCI_VDEVICE(MELLANOX, 0x6340), MLX4_PCI_DEV_FORCE_SENSE_PORT },
+	/* MT25408 "Hermon" DDR */
+	{ PCI_VDEVICE(MELLANOX, 0x634a), MLX4_PCI_DEV_FORCE_SENSE_PORT },
+	/* MT25408 "Hermon" QDR */
+	{ PCI_VDEVICE(MELLANOX, 0x6354), MLX4_PCI_DEV_FORCE_SENSE_PORT },
+	/* MT25408 "Hermon" DDR PCIe gen2 */
+	{ PCI_VDEVICE(MELLANOX, 0x6732), MLX4_PCI_DEV_FORCE_SENSE_PORT },
+	/* MT25408 "Hermon" QDR PCIe gen2 */
+	{ PCI_VDEVICE(MELLANOX, 0x673c), MLX4_PCI_DEV_FORCE_SENSE_PORT },
+	/* MT25408 "Hermon" EN 10GigE */
+	{ PCI_VDEVICE(MELLANOX, 0x6368), MLX4_PCI_DEV_FORCE_SENSE_PORT },
+	/* MT25408 "Hermon" EN 10GigE PCIe gen2 */
+	{ PCI_VDEVICE(MELLANOX, 0x6750), MLX4_PCI_DEV_FORCE_SENSE_PORT },
+	/* MT25458 ConnectX EN 10GBASE-T 10GigE */
+	{ PCI_VDEVICE(MELLANOX, 0x6372), MLX4_PCI_DEV_FORCE_SENSE_PORT },
+	/* MT25458 ConnectX EN 10GBASE-T+Gen2 10GigE */
+	{ PCI_VDEVICE(MELLANOX, 0x675a), MLX4_PCI_DEV_FORCE_SENSE_PORT },
+	/* MT26468 ConnectX EN 10GigE PCIe gen2*/
+	{ PCI_VDEVICE(MELLANOX, 0x6764), MLX4_PCI_DEV_FORCE_SENSE_PORT },
+	/* MT26438 ConnectX EN 40GigE PCIe gen2 5GT/s */
+	{ PCI_VDEVICE(MELLANOX, 0x6746), MLX4_PCI_DEV_FORCE_SENSE_PORT },
+	/* MT26478 ConnectX2 40GigE PCIe gen2 */
+	{ PCI_VDEVICE(MELLANOX, 0x676e), MLX4_PCI_DEV_FORCE_SENSE_PORT },
+	/* MT25400 Family [ConnectX-2 Virtual Function] */
+	{ PCI_VDEVICE(MELLANOX, 0x1002), MLX4_PCI_DEV_IS_VF },
+	/* MT27500 Family [ConnectX-3] */
+	{ PCI_VDEVICE(MELLANOX, 0x1003), 0 },
+	/* MT27500 Family [ConnectX-3 Virtual Function] */
+	{ PCI_VDEVICE(MELLANOX, 0x1004), MLX4_PCI_DEV_IS_VF },
+	{ PCI_VDEVICE(MELLANOX, 0x1005), 0 }, /* MT27510 Family */
+	{ PCI_VDEVICE(MELLANOX, 0x1006), 0 }, /* MT27511 Family */
+	{ PCI_VDEVICE(MELLANOX, 0x1007), 0 }, /* MT27520 Family */
+	{ PCI_VDEVICE(MELLANOX, 0x1008), 0 }, /* MT27521 Family */
+	{ PCI_VDEVICE(MELLANOX, 0x1009), 0 }, /* MT27530 Family */
+	{ PCI_VDEVICE(MELLANOX, 0x100a), 0 }, /* MT27531 Family */
+	{ PCI_VDEVICE(MELLANOX, 0x100b), 0 }, /* MT27540 Family */
+	{ PCI_VDEVICE(MELLANOX, 0x100c), 0 }, /* MT27541 Family */
+	{ PCI_VDEVICE(MELLANOX, 0x100d), 0 }, /* MT27550 Family */
+	{ PCI_VDEVICE(MELLANOX, 0x100e), 0 }, /* MT27551 Family */
+	{ PCI_VDEVICE(MELLANOX, 0x100f), 0 }, /* MT27560 Family */
+	{ PCI_VDEVICE(MELLANOX, 0x1010), 0 }, /* MT27561 Family */
+	{ 0, }
+};
+
+MODULE_DEVICE_TABLE(pci, mlx4_pci_table);
+
+static pci_ers_result_t mlx4_pci_err_detected(struct pci_dev *pdev,
+					      pci_channel_state_t state)
+{
+	mlx4_unload_one(pdev);
+
+	return state == pci_channel_io_perm_failure ?
+		PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_NEED_RESET;
+}
+
+static pci_ers_result_t mlx4_pci_slot_reset(struct pci_dev *pdev)
+{
+	struct mlx4_dev	 *dev  = pci_get_drvdata(pdev);
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int               ret;
+
+	ret = __mlx4_init_one(pdev, priv->pci_dev_data, priv);
+
+	return ret ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
+}
+
+static const struct pci_error_handlers mlx4_err_handler = {
+	.error_detected = mlx4_pci_err_detected,
+	.slot_reset     = mlx4_pci_slot_reset,
+};
+
+static struct pci_driver mlx4_driver = {
+	.name		= DRV_NAME,
+	.id_table	= mlx4_pci_table,
+	.probe		= mlx4_init_one,
+	.shutdown	= mlx4_unload_one,
+	.remove		= mlx4_remove_one,
+	.err_handler    = &mlx4_err_handler,
+};
+
+static int __init mlx4_verify_params(void)
+{
+	if ((log_num_mac < 0) || (log_num_mac > 7)) {
+		pr_warn("mlx4_core: bad num_mac: %d\n", log_num_mac);
+		return -1;
+	}
+
+	if (log_num_vlan != 0)
+		pr_warn("mlx4_core: log_num_vlan - obsolete module param, using %d\n",
+			MLX4_LOG_NUM_VLANS);
+
+	if (use_prio != 0)
+		pr_warn("mlx4_core: use_prio - obsolete module param, ignored\n");
+
+	if ((log_mtts_per_seg < 1) || (log_mtts_per_seg > 7)) {
+		pr_warn("mlx4_core: bad log_mtts_per_seg: %d\n",
+			log_mtts_per_seg);
+		return -1;
+	}
+
+	/* Check if module param for ports type has legal combination */
+	if (port_type_array[0] == false && port_type_array[1] == true) {
+		pr_warn("Module parameter configuration ETH/IB is not supported. Switching to default configuration IB/IB\n");
+		port_type_array[0] = true;
+	}
+
+	if (mlx4_log_num_mgm_entry_size != -1 &&
+	    (mlx4_log_num_mgm_entry_size < MLX4_MIN_MGM_LOG_ENTRY_SIZE ||
+	     mlx4_log_num_mgm_entry_size > MLX4_MAX_MGM_LOG_ENTRY_SIZE)) {
+		pr_warn("mlx4_core: mlx4_log_num_mgm_entry_size (%d) not in legal range (-1 or %d..%d)\n",
+			mlx4_log_num_mgm_entry_size,
+			MLX4_MIN_MGM_LOG_ENTRY_SIZE,
+			MLX4_MAX_MGM_LOG_ENTRY_SIZE);
+		return -1;
+	}
+
+	return 0;
+}
+
+static int __init mlx4_init(void)
+{
+	int ret;
+
+	if (mlx4_verify_params())
+		return -EINVAL;
+
+	mlx4_catas_init();
+
+	mlx4_wq = create_singlethread_workqueue("mlx4");
+	if (!mlx4_wq)
+		return -ENOMEM;
+
+	ret = pci_register_driver(&mlx4_driver);
+	if (ret < 0)
+		destroy_workqueue(mlx4_wq);
+	return ret < 0 ? ret : 0;
+}
+
+static void __exit mlx4_cleanup(void)
+{
+	pci_unregister_driver(&mlx4_driver);
+	destroy_workqueue(mlx4_wq);
+}
+
+module_init(mlx4_init);
+module_exit(mlx4_cleanup);
diff --git a/drivers/net/ethernet/mellanox/mlx4/mcg.c b/drivers/net/ethernet/mellanox/mlx4/mcg.c
new file mode 100644
index 0000000..8728431
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/mcg.c
@@ -0,0 +1,1613 @@
+/*
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc.  All rights reserved.
+ * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/string.h>
+#include <linux/etherdevice.h>
+
+#include <linux/mlx4/cmd.h>
+#include <linux/export.h>
+
+#include "mlx4.h"
+
+static const u8 zero_gid[16];	/* automatically initialized to 0 */
+
+int mlx4_get_mgm_entry_size(struct mlx4_dev *dev)
+{
+	return 1 << dev->oper_log_mgm_entry_size;
+}
+
+int mlx4_get_qp_per_mgm(struct mlx4_dev *dev)
+{
+	return 4 * (mlx4_get_mgm_entry_size(dev) / 16 - 2);
+}
+
+static int mlx4_QP_FLOW_STEERING_ATTACH(struct mlx4_dev *dev,
+					struct mlx4_cmd_mailbox *mailbox,
+					u32 size,
+					u64 *reg_id)
+{
+	u64 imm;
+	int err = 0;
+
+	err = mlx4_cmd_imm(dev, mailbox->dma, &imm, size, 0,
+			   MLX4_QP_FLOW_STEERING_ATTACH, MLX4_CMD_TIME_CLASS_A,
+			   MLX4_CMD_NATIVE);
+	if (err)
+		return err;
+	*reg_id = imm;
+
+	return err;
+}
+
+static int mlx4_QP_FLOW_STEERING_DETACH(struct mlx4_dev *dev, u64 regid)
+{
+	int err = 0;
+
+	err = mlx4_cmd(dev, regid, 0, 0,
+		       MLX4_QP_FLOW_STEERING_DETACH, MLX4_CMD_TIME_CLASS_A,
+		       MLX4_CMD_NATIVE);
+
+	return err;
+}
+
+static int mlx4_READ_ENTRY(struct mlx4_dev *dev, int index,
+			   struct mlx4_cmd_mailbox *mailbox)
+{
+	return mlx4_cmd_box(dev, 0, mailbox->dma, index, 0, MLX4_CMD_READ_MCG,
+			    MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
+}
+
+static int mlx4_WRITE_ENTRY(struct mlx4_dev *dev, int index,
+			    struct mlx4_cmd_mailbox *mailbox)
+{
+	return mlx4_cmd(dev, mailbox->dma, index, 0, MLX4_CMD_WRITE_MCG,
+			MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
+}
+
+static int mlx4_WRITE_PROMISC(struct mlx4_dev *dev, u8 port, u8 steer,
+			      struct mlx4_cmd_mailbox *mailbox)
+{
+	u32 in_mod;
+
+	in_mod = (u32) port << 16 | steer << 1;
+	return mlx4_cmd(dev, mailbox->dma, in_mod, 0x1,
+			MLX4_CMD_WRITE_MCG, MLX4_CMD_TIME_CLASS_A,
+			MLX4_CMD_NATIVE);
+}
+
+static int mlx4_GID_HASH(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox,
+			 u16 *hash, u8 op_mod)
+{
+	u64 imm;
+	int err;
+
+	err = mlx4_cmd_imm(dev, mailbox->dma, &imm, 0, op_mod,
+			   MLX4_CMD_MGID_HASH, MLX4_CMD_TIME_CLASS_A,
+			   MLX4_CMD_NATIVE);
+
+	if (!err)
+		*hash = imm;
+
+	return err;
+}
+
+static struct mlx4_promisc_qp *get_promisc_qp(struct mlx4_dev *dev, u8 port,
+					      enum mlx4_steer_type steer,
+					      u32 qpn)
+{
+	struct mlx4_steer *s_steer;
+	struct mlx4_promisc_qp *pqp;
+
+	if (port < 1 || port > dev->caps.num_ports)
+		return NULL;
+
+	s_steer = &mlx4_priv(dev)->steer[port - 1];
+
+	list_for_each_entry(pqp, &s_steer->promisc_qps[steer], list) {
+		if (pqp->qpn == qpn)
+			return pqp;
+	}
+	/* not found */
+	return NULL;
+}
+
+/*
+ * Add new entry to steering data structure.
+ * All promisc QPs should be added as well
+ */
+static int new_steering_entry(struct mlx4_dev *dev, u8 port,
+			      enum mlx4_steer_type steer,
+			      unsigned int index, u32 qpn)
+{
+	struct mlx4_steer *s_steer;
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_mgm *mgm;
+	u32 members_count;
+	struct mlx4_steer_index *new_entry;
+	struct mlx4_promisc_qp *pqp;
+	struct mlx4_promisc_qp *dqp = NULL;
+	u32 prot;
+	int err;
+
+	if (port < 1 || port > dev->caps.num_ports)
+		return -EINVAL;
+
+	s_steer = &mlx4_priv(dev)->steer[port - 1];
+	new_entry = kzalloc(sizeof *new_entry, GFP_KERNEL);
+	if (!new_entry)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&new_entry->duplicates);
+	new_entry->index = index;
+	list_add_tail(&new_entry->list, &s_steer->steer_entries[steer]);
+
+	/* If the given qpn is also a promisc qp,
+	 * it should be inserted to duplicates list
+	 */
+	pqp = get_promisc_qp(dev, port, steer, qpn);
+	if (pqp) {
+		dqp = kmalloc(sizeof *dqp, GFP_KERNEL);
+		if (!dqp) {
+			err = -ENOMEM;
+			goto out_alloc;
+		}
+		dqp->qpn = qpn;
+		list_add_tail(&dqp->list, &new_entry->duplicates);
+	}
+
+	/* if no promisc qps for this vep, we are done */
+	if (list_empty(&s_steer->promisc_qps[steer]))
+		return 0;
+
+	/* now need to add all the promisc qps to the new
+	 * steering entry, as they should also receive the packets
+	 * destined to this address */
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox)) {
+		err = -ENOMEM;
+		goto out_alloc;
+	}
+	mgm = mailbox->buf;
+
+	err = mlx4_READ_ENTRY(dev, index, mailbox);
+	if (err)
+		goto out_mailbox;
+
+	members_count = be32_to_cpu(mgm->members_count) & 0xffffff;
+	prot = be32_to_cpu(mgm->members_count) >> 30;
+	list_for_each_entry(pqp, &s_steer->promisc_qps[steer], list) {
+		/* don't add already existing qpn */
+		if (pqp->qpn == qpn)
+			continue;
+		if (members_count == dev->caps.num_qp_per_mgm) {
+			/* out of space */
+			err = -ENOMEM;
+			goto out_mailbox;
+		}
+
+		/* add the qpn */
+		mgm->qp[members_count++] = cpu_to_be32(pqp->qpn & MGM_QPN_MASK);
+	}
+	/* update the qps count and update the entry with all the promisc qps*/
+	mgm->members_count = cpu_to_be32(members_count | (prot << 30));
+	err = mlx4_WRITE_ENTRY(dev, index, mailbox);
+
+out_mailbox:
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	if (!err)
+		return 0;
+out_alloc:
+	if (dqp) {
+		list_del(&dqp->list);
+		kfree(dqp);
+	}
+	list_del(&new_entry->list);
+	kfree(new_entry);
+	return err;
+}
+
+/* update the data structures with existing steering entry */
+static int existing_steering_entry(struct mlx4_dev *dev, u8 port,
+				   enum mlx4_steer_type steer,
+				   unsigned int index, u32 qpn)
+{
+	struct mlx4_steer *s_steer;
+	struct mlx4_steer_index *tmp_entry, *entry = NULL;
+	struct mlx4_promisc_qp *pqp;
+	struct mlx4_promisc_qp *dqp;
+
+	if (port < 1 || port > dev->caps.num_ports)
+		return -EINVAL;
+
+	s_steer = &mlx4_priv(dev)->steer[port - 1];
+
+	pqp = get_promisc_qp(dev, port, steer, qpn);
+	if (!pqp)
+		return 0; /* nothing to do */
+
+	list_for_each_entry(tmp_entry, &s_steer->steer_entries[steer], list) {
+		if (tmp_entry->index == index) {
+			entry = tmp_entry;
+			break;
+		}
+	}
+	if (unlikely(!entry)) {
+		mlx4_warn(dev, "Steering entry at index %x is not registered\n", index);
+		return -EINVAL;
+	}
+
+	/* the given qpn is listed as a promisc qpn
+	 * we need to add it as a duplicate to this entry
+	 * for future references */
+	list_for_each_entry(dqp, &entry->duplicates, list) {
+		if (qpn == dqp->qpn)
+			return 0; /* qp is already duplicated */
+	}
+
+	/* add the qp as a duplicate on this index */
+	dqp = kmalloc(sizeof *dqp, GFP_KERNEL);
+	if (!dqp)
+		return -ENOMEM;
+	dqp->qpn = qpn;
+	list_add_tail(&dqp->list, &entry->duplicates);
+
+	return 0;
+}
+
+/* Check whether a qpn is a duplicate on steering entry
+ * If so, it should not be removed from mgm */
+static bool check_duplicate_entry(struct mlx4_dev *dev, u8 port,
+				  enum mlx4_steer_type steer,
+				  unsigned int index, u32 qpn)
+{
+	struct mlx4_steer *s_steer;
+	struct mlx4_steer_index *tmp_entry, *entry = NULL;
+	struct mlx4_promisc_qp *dqp, *tmp_dqp;
+
+	if (port < 1 || port > dev->caps.num_ports)
+		return NULL;
+
+	s_steer = &mlx4_priv(dev)->steer[port - 1];
+
+	/* if qp is not promisc, it cannot be duplicated */
+	if (!get_promisc_qp(dev, port, steer, qpn))
+		return false;
+
+	/* The qp is promisc qp so it is a duplicate on this index
+	 * Find the index entry, and remove the duplicate */
+	list_for_each_entry(tmp_entry, &s_steer->steer_entries[steer], list) {
+		if (tmp_entry->index == index) {
+			entry = tmp_entry;
+			break;
+		}
+	}
+	if (unlikely(!entry)) {
+		mlx4_warn(dev, "Steering entry for index %x is not registered\n", index);
+		return false;
+	}
+	list_for_each_entry_safe(dqp, tmp_dqp, &entry->duplicates, list) {
+		if (dqp->qpn == qpn) {
+			list_del(&dqp->list);
+			kfree(dqp);
+		}
+	}
+	return true;
+}
+
+/* Returns true if all the QPs != tqpn contained in this entry
+ * are Promisc QPs. Returns false otherwise.
+ */
+static bool promisc_steering_entry(struct mlx4_dev *dev, u8 port,
+				   enum mlx4_steer_type steer,
+				   unsigned int index, u32 tqpn,
+				   u32 *members_count)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_mgm *mgm;
+	u32 m_count;
+	bool ret = false;
+	int i;
+
+	if (port < 1 || port > dev->caps.num_ports)
+		return false;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return false;
+	mgm = mailbox->buf;
+
+	if (mlx4_READ_ENTRY(dev, index, mailbox))
+		goto out;
+	m_count = be32_to_cpu(mgm->members_count) & 0xffffff;
+	if (members_count)
+		*members_count = m_count;
+
+	for (i = 0;  i < m_count; i++) {
+		u32 qpn = be32_to_cpu(mgm->qp[i]) & MGM_QPN_MASK;
+		if (!get_promisc_qp(dev, port, steer, qpn) && qpn != tqpn) {
+			/* the qp is not promisc, the entry can't be removed */
+			goto out;
+		}
+	}
+	ret = true;
+out:
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return ret;
+}
+
+/* IF a steering entry contains only promisc QPs, it can be removed. */
+static bool can_remove_steering_entry(struct mlx4_dev *dev, u8 port,
+				      enum mlx4_steer_type steer,
+				      unsigned int index, u32 tqpn)
+{
+	struct mlx4_steer *s_steer;
+	struct mlx4_steer_index *entry = NULL, *tmp_entry;
+	u32 members_count;
+	bool ret = false;
+
+	if (port < 1 || port > dev->caps.num_ports)
+		return NULL;
+
+	s_steer = &mlx4_priv(dev)->steer[port - 1];
+
+	if (!promisc_steering_entry(dev, port, steer, index,
+				    tqpn, &members_count))
+		goto out;
+
+	/* All the qps currently registered for this entry are promiscuous,
+	  * Checking for duplicates */
+	ret = true;
+	list_for_each_entry_safe(entry, tmp_entry, &s_steer->steer_entries[steer], list) {
+		if (entry->index == index) {
+			if (list_empty(&entry->duplicates) ||
+			    members_count == 1) {
+				struct mlx4_promisc_qp *pqp, *tmp_pqp;
+				/* If there is only 1 entry in duplicates then
+				 * this is the QP we want to delete, going over
+				 * the list and deleting the entry.
+				 */
+				list_del(&entry->list);
+				list_for_each_entry_safe(pqp, tmp_pqp,
+							 &entry->duplicates,
+							 list) {
+					list_del(&pqp->list);
+					kfree(pqp);
+				}
+				kfree(entry);
+			} else {
+				/* This entry contains duplicates so it shouldn't be removed */
+				ret = false;
+				goto out;
+			}
+		}
+	}
+
+out:
+	return ret;
+}
+
+static int add_promisc_qp(struct mlx4_dev *dev, u8 port,
+			  enum mlx4_steer_type steer, u32 qpn)
+{
+	struct mlx4_steer *s_steer;
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_mgm *mgm;
+	struct mlx4_steer_index *entry;
+	struct mlx4_promisc_qp *pqp;
+	struct mlx4_promisc_qp *dqp;
+	u32 members_count;
+	u32 prot;
+	int i;
+	bool found;
+	int err;
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	if (port < 1 || port > dev->caps.num_ports)
+		return -EINVAL;
+
+	s_steer = &mlx4_priv(dev)->steer[port - 1];
+
+	mutex_lock(&priv->mcg_table.mutex);
+
+	if (get_promisc_qp(dev, port, steer, qpn)) {
+		err = 0;  /* Noting to do, already exists */
+		goto out_mutex;
+	}
+
+	pqp = kmalloc(sizeof *pqp, GFP_KERNEL);
+	if (!pqp) {
+		err = -ENOMEM;
+		goto out_mutex;
+	}
+	pqp->qpn = qpn;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox)) {
+		err = -ENOMEM;
+		goto out_alloc;
+	}
+	mgm = mailbox->buf;
+
+	if (!(mlx4_is_mfunc(dev) && steer == MLX4_UC_STEER)) {
+		/* The promisc QP needs to be added for each one of the steering
+		 * entries. If it already exists, needs to be added as
+		 * a duplicate for this entry.
+		 */
+		list_for_each_entry(entry,
+				    &s_steer->steer_entries[steer],
+				    list) {
+			err = mlx4_READ_ENTRY(dev, entry->index, mailbox);
+			if (err)
+				goto out_mailbox;
+
+			members_count = be32_to_cpu(mgm->members_count) &
+					0xffffff;
+			prot = be32_to_cpu(mgm->members_count) >> 30;
+			found = false;
+			for (i = 0; i < members_count; i++) {
+				if ((be32_to_cpu(mgm->qp[i]) &
+				     MGM_QPN_MASK) == qpn) {
+					/* Entry already exists.
+					 * Add to duplicates.
+					 */
+					dqp = kmalloc(sizeof(*dqp), GFP_KERNEL);
+					if (!dqp) {
+						err = -ENOMEM;
+						goto out_mailbox;
+					}
+					dqp->qpn = qpn;
+					list_add_tail(&dqp->list,
+						      &entry->duplicates);
+					found = true;
+				}
+			}
+			if (!found) {
+				/* Need to add the qpn to mgm */
+				if (members_count ==
+				    dev->caps.num_qp_per_mgm) {
+					/* entry is full */
+					err = -ENOMEM;
+					goto out_mailbox;
+				}
+				mgm->qp[members_count++] =
+					cpu_to_be32(qpn & MGM_QPN_MASK);
+				mgm->members_count =
+					cpu_to_be32(members_count |
+						    (prot << 30));
+				err = mlx4_WRITE_ENTRY(dev, entry->index,
+						       mailbox);
+				if (err)
+					goto out_mailbox;
+			}
+		}
+	}
+
+	/* add the new qpn to list of promisc qps */
+	list_add_tail(&pqp->list, &s_steer->promisc_qps[steer]);
+	/* now need to add all the promisc qps to default entry */
+	memset(mgm, 0, sizeof *mgm);
+	members_count = 0;
+	list_for_each_entry(dqp, &s_steer->promisc_qps[steer], list) {
+		if (members_count == dev->caps.num_qp_per_mgm) {
+			/* entry is full */
+			err = -ENOMEM;
+			goto out_list;
+		}
+		mgm->qp[members_count++] = cpu_to_be32(dqp->qpn & MGM_QPN_MASK);
+	}
+	mgm->members_count = cpu_to_be32(members_count | MLX4_PROT_ETH << 30);
+
+	err = mlx4_WRITE_PROMISC(dev, port, steer, mailbox);
+	if (err)
+		goto out_list;
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	mutex_unlock(&priv->mcg_table.mutex);
+	return 0;
+
+out_list:
+	list_del(&pqp->list);
+out_mailbox:
+	mlx4_free_cmd_mailbox(dev, mailbox);
+out_alloc:
+	kfree(pqp);
+out_mutex:
+	mutex_unlock(&priv->mcg_table.mutex);
+	return err;
+}
+
+static int remove_promisc_qp(struct mlx4_dev *dev, u8 port,
+			     enum mlx4_steer_type steer, u32 qpn)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_steer *s_steer;
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_mgm *mgm;
+	struct mlx4_steer_index *entry, *tmp_entry;
+	struct mlx4_promisc_qp *pqp;
+	struct mlx4_promisc_qp *dqp;
+	u32 members_count;
+	bool found;
+	bool back_to_list = false;
+	int i;
+	int err;
+
+	if (port < 1 || port > dev->caps.num_ports)
+		return -EINVAL;
+
+	s_steer = &mlx4_priv(dev)->steer[port - 1];
+	mutex_lock(&priv->mcg_table.mutex);
+
+	pqp = get_promisc_qp(dev, port, steer, qpn);
+	if (unlikely(!pqp)) {
+		mlx4_warn(dev, "QP %x is not promiscuous QP\n", qpn);
+		/* nothing to do */
+		err = 0;
+		goto out_mutex;
+	}
+
+	/*remove from list of promisc qps */
+	list_del(&pqp->list);
+
+	/* set the default entry not to include the removed one */
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox)) {
+		err = -ENOMEM;
+		back_to_list = true;
+		goto out_list;
+	}
+	mgm = mailbox->buf;
+	members_count = 0;
+	list_for_each_entry(dqp, &s_steer->promisc_qps[steer], list)
+		mgm->qp[members_count++] = cpu_to_be32(dqp->qpn & MGM_QPN_MASK);
+	mgm->members_count = cpu_to_be32(members_count | MLX4_PROT_ETH << 30);
+
+	err = mlx4_WRITE_PROMISC(dev, port, steer, mailbox);
+	if (err)
+		goto out_mailbox;
+
+	if (!(mlx4_is_mfunc(dev) && steer == MLX4_UC_STEER)) {
+		/* Remove the QP from all the steering entries */
+		list_for_each_entry_safe(entry, tmp_entry,
+					 &s_steer->steer_entries[steer],
+					 list) {
+			found = false;
+			list_for_each_entry(dqp, &entry->duplicates, list) {
+				if (dqp->qpn == qpn) {
+					found = true;
+					break;
+				}
+			}
+			if (found) {
+				/* A duplicate, no need to change the MGM,
+				 * only update the duplicates list
+				 */
+				list_del(&dqp->list);
+				kfree(dqp);
+			} else {
+				int loc = -1;
+
+				err = mlx4_READ_ENTRY(dev,
+						      entry->index,
+						      mailbox);
+					if (err)
+						goto out_mailbox;
+				members_count =
+					be32_to_cpu(mgm->members_count) &
+					0xffffff;
+				if (!members_count) {
+					mlx4_warn(dev, "QP %06x wasn't found in entry %x mcount=0. deleting entry...\n",
+						  qpn, entry->index);
+					list_del(&entry->list);
+					kfree(entry);
+					continue;
+				}
+
+				for (i = 0; i < members_count; ++i)
+					if ((be32_to_cpu(mgm->qp[i]) &
+					     MGM_QPN_MASK) == qpn) {
+						loc = i;
+						break;
+					}
+
+				if (loc < 0) {
+					mlx4_err(dev, "QP %06x wasn't found in entry %d\n",
+						 qpn, entry->index);
+					err = -EINVAL;
+					goto out_mailbox;
+				}
+
+				/* Copy the last QP in this MGM
+				 * over removed QP
+				 */
+				mgm->qp[loc] = mgm->qp[members_count - 1];
+				mgm->qp[members_count - 1] = 0;
+				mgm->members_count =
+					cpu_to_be32(--members_count |
+						    (MLX4_PROT_ETH << 30));
+
+				err = mlx4_WRITE_ENTRY(dev,
+						       entry->index,
+						       mailbox);
+					if (err)
+						goto out_mailbox;
+			}
+		}
+	}
+
+out_mailbox:
+	mlx4_free_cmd_mailbox(dev, mailbox);
+out_list:
+	if (back_to_list)
+		list_add_tail(&pqp->list, &s_steer->promisc_qps[steer]);
+	else
+		kfree(pqp);
+out_mutex:
+	mutex_unlock(&priv->mcg_table.mutex);
+	return err;
+}
+
+/*
+ * Caller must hold MCG table semaphore.  gid and mgm parameters must
+ * be properly aligned for command interface.
+ *
+ *  Returns 0 unless a firmware command error occurs.
+ *
+ * If GID is found in MGM or MGM is empty, *index = *hash, *prev = -1
+ * and *mgm holds MGM entry.
+ *
+ * if GID is found in AMGM, *index = index in AMGM, *prev = index of
+ * previous entry in hash chain and *mgm holds AMGM entry.
+ *
+ * If no AMGM exists for given gid, *index = -1, *prev = index of last
+ * entry in hash chain and *mgm holds end of hash chain.
+ */
+static int find_entry(struct mlx4_dev *dev, u8 port,
+		      u8 *gid, enum mlx4_protocol prot,
+		      struct mlx4_cmd_mailbox *mgm_mailbox,
+		      int *prev, int *index)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_mgm *mgm = mgm_mailbox->buf;
+	u8 *mgid;
+	int err;
+	u16 hash;
+	u8 op_mod = (prot == MLX4_PROT_ETH) ?
+		!!(dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_MC_STEER) : 0;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return -ENOMEM;
+	mgid = mailbox->buf;
+
+	memcpy(mgid, gid, 16);
+
+	err = mlx4_GID_HASH(dev, mailbox, &hash, op_mod);
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	if (err)
+		return err;
+
+	if (0)
+		mlx4_dbg(dev, "Hash for %pI6 is %04x\n", gid, hash);
+
+	*index = hash;
+	*prev  = -1;
+
+	do {
+		err = mlx4_READ_ENTRY(dev, *index, mgm_mailbox);
+		if (err)
+			return err;
+
+		if (!(be32_to_cpu(mgm->members_count) & 0xffffff)) {
+			if (*index != hash) {
+				mlx4_err(dev, "Found zero MGID in AMGM\n");
+				err = -EINVAL;
+			}
+			return err;
+		}
+
+		if (!memcmp(mgm->gid, gid, 16) &&
+		    be32_to_cpu(mgm->members_count) >> 30 == prot)
+			return err;
+
+		*prev = *index;
+		*index = be32_to_cpu(mgm->next_gid_index) >> 6;
+	} while (*index);
+
+	*index = -1;
+	return err;
+}
+
+static const u8 __promisc_mode[] = {
+	[MLX4_FS_REGULAR]   = 0x0,
+	[MLX4_FS_ALL_DEFAULT] = 0x1,
+	[MLX4_FS_MC_DEFAULT] = 0x3,
+	[MLX4_FS_UC_SNIFFER] = 0x4,
+	[MLX4_FS_MC_SNIFFER] = 0x5,
+};
+
+int mlx4_map_sw_to_hw_steering_mode(struct mlx4_dev *dev,
+				    enum mlx4_net_trans_promisc_mode flow_type)
+{
+	if (flow_type >= MLX4_FS_MODE_NUM) {
+		mlx4_err(dev, "Invalid flow type. type = %d\n", flow_type);
+		return -EINVAL;
+	}
+	return __promisc_mode[flow_type];
+}
+EXPORT_SYMBOL_GPL(mlx4_map_sw_to_hw_steering_mode);
+
+static void trans_rule_ctrl_to_hw(struct mlx4_net_trans_rule *ctrl,
+				  struct mlx4_net_trans_rule_hw_ctrl *hw)
+{
+	u8 flags = 0;
+
+	flags = ctrl->queue_mode == MLX4_NET_TRANS_Q_LIFO ? 1 : 0;
+	flags |= ctrl->exclusive ? (1 << 2) : 0;
+	flags |= ctrl->allow_loopback ? (1 << 3) : 0;
+
+	hw->flags = flags;
+	hw->type = __promisc_mode[ctrl->promisc_mode];
+	hw->prio = cpu_to_be16(ctrl->priority);
+	hw->port = ctrl->port;
+	hw->qpn = cpu_to_be32(ctrl->qpn);
+}
+
+const u16 __sw_id_hw[] = {
+	[MLX4_NET_TRANS_RULE_ID_ETH]     = 0xE001,
+	[MLX4_NET_TRANS_RULE_ID_IB]      = 0xE005,
+	[MLX4_NET_TRANS_RULE_ID_IPV6]    = 0xE003,
+	[MLX4_NET_TRANS_RULE_ID_IPV4]    = 0xE002,
+	[MLX4_NET_TRANS_RULE_ID_TCP]     = 0xE004,
+	[MLX4_NET_TRANS_RULE_ID_UDP]     = 0xE006,
+	[MLX4_NET_TRANS_RULE_ID_VXLAN]	 = 0xE008
+};
+
+int mlx4_map_sw_to_hw_steering_id(struct mlx4_dev *dev,
+				  enum mlx4_net_trans_rule_id id)
+{
+	if (id >= MLX4_NET_TRANS_RULE_NUM) {
+		mlx4_err(dev, "Invalid network rule id. id = %d\n", id);
+		return -EINVAL;
+	}
+	return __sw_id_hw[id];
+}
+EXPORT_SYMBOL_GPL(mlx4_map_sw_to_hw_steering_id);
+
+static const int __rule_hw_sz[] = {
+	[MLX4_NET_TRANS_RULE_ID_ETH] =
+		sizeof(struct mlx4_net_trans_rule_hw_eth),
+	[MLX4_NET_TRANS_RULE_ID_IB] =
+		sizeof(struct mlx4_net_trans_rule_hw_ib),
+	[MLX4_NET_TRANS_RULE_ID_IPV6] = 0,
+	[MLX4_NET_TRANS_RULE_ID_IPV4] =
+		sizeof(struct mlx4_net_trans_rule_hw_ipv4),
+	[MLX4_NET_TRANS_RULE_ID_TCP] =
+		sizeof(struct mlx4_net_trans_rule_hw_tcp_udp),
+	[MLX4_NET_TRANS_RULE_ID_UDP] =
+		sizeof(struct mlx4_net_trans_rule_hw_tcp_udp),
+	[MLX4_NET_TRANS_RULE_ID_VXLAN] =
+		sizeof(struct mlx4_net_trans_rule_hw_vxlan)
+};
+
+int mlx4_hw_rule_sz(struct mlx4_dev *dev,
+	       enum mlx4_net_trans_rule_id id)
+{
+	if (id >= MLX4_NET_TRANS_RULE_NUM) {
+		mlx4_err(dev, "Invalid network rule id. id = %d\n", id);
+		return -EINVAL;
+	}
+
+	return __rule_hw_sz[id];
+}
+EXPORT_SYMBOL_GPL(mlx4_hw_rule_sz);
+
+static int parse_trans_rule(struct mlx4_dev *dev, struct mlx4_spec_list *spec,
+			    struct _rule_hw *rule_hw)
+{
+	if (mlx4_hw_rule_sz(dev, spec->id) < 0)
+		return -EINVAL;
+	memset(rule_hw, 0, mlx4_hw_rule_sz(dev, spec->id));
+	rule_hw->id = cpu_to_be16(__sw_id_hw[spec->id]);
+	rule_hw->size = mlx4_hw_rule_sz(dev, spec->id) >> 2;
+
+	switch (spec->id) {
+	case MLX4_NET_TRANS_RULE_ID_ETH:
+		memcpy(rule_hw->eth.dst_mac, spec->eth.dst_mac, ETH_ALEN);
+		memcpy(rule_hw->eth.dst_mac_msk, spec->eth.dst_mac_msk,
+		       ETH_ALEN);
+		memcpy(rule_hw->eth.src_mac, spec->eth.src_mac, ETH_ALEN);
+		memcpy(rule_hw->eth.src_mac_msk, spec->eth.src_mac_msk,
+		       ETH_ALEN);
+		if (spec->eth.ether_type_enable) {
+			rule_hw->eth.ether_type_enable = 1;
+			rule_hw->eth.ether_type = spec->eth.ether_type;
+		}
+		rule_hw->eth.vlan_tag = spec->eth.vlan_id;
+		rule_hw->eth.vlan_tag_msk = spec->eth.vlan_id_msk;
+		break;
+
+	case MLX4_NET_TRANS_RULE_ID_IB:
+		rule_hw->ib.l3_qpn = spec->ib.l3_qpn;
+		rule_hw->ib.qpn_mask = spec->ib.qpn_msk;
+		memcpy(&rule_hw->ib.dst_gid, &spec->ib.dst_gid, 16);
+		memcpy(&rule_hw->ib.dst_gid_msk, &spec->ib.dst_gid_msk, 16);
+		break;
+
+	case MLX4_NET_TRANS_RULE_ID_IPV6:
+		return -EOPNOTSUPP;
+
+	case MLX4_NET_TRANS_RULE_ID_IPV4:
+		rule_hw->ipv4.src_ip = spec->ipv4.src_ip;
+		rule_hw->ipv4.src_ip_msk = spec->ipv4.src_ip_msk;
+		rule_hw->ipv4.dst_ip = spec->ipv4.dst_ip;
+		rule_hw->ipv4.dst_ip_msk = spec->ipv4.dst_ip_msk;
+		break;
+
+	case MLX4_NET_TRANS_RULE_ID_TCP:
+	case MLX4_NET_TRANS_RULE_ID_UDP:
+		rule_hw->tcp_udp.dst_port = spec->tcp_udp.dst_port;
+		rule_hw->tcp_udp.dst_port_msk = spec->tcp_udp.dst_port_msk;
+		rule_hw->tcp_udp.src_port = spec->tcp_udp.src_port;
+		rule_hw->tcp_udp.src_port_msk = spec->tcp_udp.src_port_msk;
+		break;
+
+	case MLX4_NET_TRANS_RULE_ID_VXLAN:
+		rule_hw->vxlan.vni =
+			cpu_to_be32(be32_to_cpu(spec->vxlan.vni) << 8);
+		rule_hw->vxlan.vni_mask =
+			cpu_to_be32(be32_to_cpu(spec->vxlan.vni_mask) << 8);
+		break;
+
+	default:
+		return -EINVAL;
+	}
+
+	return __rule_hw_sz[spec->id];
+}
+
+static void mlx4_err_rule(struct mlx4_dev *dev, char *str,
+			  struct mlx4_net_trans_rule *rule)
+{
+#define BUF_SIZE 256
+	struct mlx4_spec_list *cur;
+	char buf[BUF_SIZE];
+	int len = 0;
+
+	mlx4_err(dev, "%s", str);
+	len += snprintf(buf + len, BUF_SIZE - len,
+			"port = %d prio = 0x%x qp = 0x%x ",
+			rule->port, rule->priority, rule->qpn);
+
+	list_for_each_entry(cur, &rule->list, list) {
+		switch (cur->id) {
+		case MLX4_NET_TRANS_RULE_ID_ETH:
+			len += snprintf(buf + len, BUF_SIZE - len,
+					"dmac = %pM ", &cur->eth.dst_mac);
+			if (cur->eth.ether_type)
+				len += snprintf(buf + len, BUF_SIZE - len,
+						"ethertype = 0x%x ",
+						be16_to_cpu(cur->eth.ether_type));
+			if (cur->eth.vlan_id)
+				len += snprintf(buf + len, BUF_SIZE - len,
+						"vlan-id = %d ",
+						be16_to_cpu(cur->eth.vlan_id));
+			break;
+
+		case MLX4_NET_TRANS_RULE_ID_IPV4:
+			if (cur->ipv4.src_ip)
+				len += snprintf(buf + len, BUF_SIZE - len,
+						"src-ip = %pI4 ",
+						&cur->ipv4.src_ip);
+			if (cur->ipv4.dst_ip)
+				len += snprintf(buf + len, BUF_SIZE - len,
+						"dst-ip = %pI4 ",
+						&cur->ipv4.dst_ip);
+			break;
+
+		case MLX4_NET_TRANS_RULE_ID_TCP:
+		case MLX4_NET_TRANS_RULE_ID_UDP:
+			if (cur->tcp_udp.src_port)
+				len += snprintf(buf + len, BUF_SIZE - len,
+						"src-port = %d ",
+						be16_to_cpu(cur->tcp_udp.src_port));
+			if (cur->tcp_udp.dst_port)
+				len += snprintf(buf + len, BUF_SIZE - len,
+						"dst-port = %d ",
+						be16_to_cpu(cur->tcp_udp.dst_port));
+			break;
+
+		case MLX4_NET_TRANS_RULE_ID_IB:
+			len += snprintf(buf + len, BUF_SIZE - len,
+					"dst-gid = %pI6\n", cur->ib.dst_gid);
+			len += snprintf(buf + len, BUF_SIZE - len,
+					"dst-gid-mask = %pI6\n",
+					cur->ib.dst_gid_msk);
+			break;
+
+		case MLX4_NET_TRANS_RULE_ID_VXLAN:
+			len += snprintf(buf + len, BUF_SIZE - len,
+					"VNID = %d ", be32_to_cpu(cur->vxlan.vni));
+			break;
+		case MLX4_NET_TRANS_RULE_ID_IPV6:
+			break;
+
+		default:
+			break;
+		}
+	}
+	len += snprintf(buf + len, BUF_SIZE - len, "\n");
+	mlx4_err(dev, "%s", buf);
+
+	if (len >= BUF_SIZE)
+		mlx4_err(dev, "Network rule error message was truncated, print buffer is too small\n");
+}
+
+int mlx4_flow_attach(struct mlx4_dev *dev,
+		     struct mlx4_net_trans_rule *rule, u64 *reg_id)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_spec_list *cur;
+	u32 size = 0;
+	int ret;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	trans_rule_ctrl_to_hw(rule, mailbox->buf);
+
+	size += sizeof(struct mlx4_net_trans_rule_hw_ctrl);
+
+	list_for_each_entry(cur, &rule->list, list) {
+		ret = parse_trans_rule(dev, cur, mailbox->buf + size);
+		if (ret < 0) {
+			mlx4_free_cmd_mailbox(dev, mailbox);
+			return ret;
+		}
+		size += ret;
+	}
+
+	ret = mlx4_QP_FLOW_STEERING_ATTACH(dev, mailbox, size >> 2, reg_id);
+	if (ret == -ENOMEM)
+		mlx4_err_rule(dev,
+			      "mcg table is full. Fail to register network rule\n",
+			      rule);
+	else if (ret)
+		mlx4_err_rule(dev, "Fail to register network rule\n", rule);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(mlx4_flow_attach);
+
+int mlx4_flow_detach(struct mlx4_dev *dev, u64 reg_id)
+{
+	int err;
+
+	err = mlx4_QP_FLOW_STEERING_DETACH(dev, reg_id);
+	if (err)
+		mlx4_err(dev, "Fail to detach network rule. registration id = 0x%llx\n",
+			 reg_id);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_flow_detach);
+
+int mlx4_tunnel_steer_add(struct mlx4_dev *dev, unsigned char *addr,
+			  int port, int qpn, u16 prio, u64 *reg_id)
+{
+	int err;
+	struct mlx4_spec_list spec_eth_outer = { {NULL} };
+	struct mlx4_spec_list spec_vxlan     = { {NULL} };
+	struct mlx4_spec_list spec_eth_inner = { {NULL} };
+
+	struct mlx4_net_trans_rule rule = {
+		.queue_mode = MLX4_NET_TRANS_Q_FIFO,
+		.exclusive = 0,
+		.allow_loopback = 1,
+		.promisc_mode = MLX4_FS_REGULAR,
+	};
+
+	__be64 mac_mask = cpu_to_be64(MLX4_MAC_MASK << 16);
+
+	rule.port = port;
+	rule.qpn = qpn;
+	rule.priority = prio;
+	INIT_LIST_HEAD(&rule.list);
+
+	spec_eth_outer.id = MLX4_NET_TRANS_RULE_ID_ETH;
+	memcpy(spec_eth_outer.eth.dst_mac, addr, ETH_ALEN);
+	memcpy(spec_eth_outer.eth.dst_mac_msk, &mac_mask, ETH_ALEN);
+
+	spec_vxlan.id = MLX4_NET_TRANS_RULE_ID_VXLAN;    /* any vxlan header */
+	spec_eth_inner.id = MLX4_NET_TRANS_RULE_ID_ETH;	 /* any inner eth header */
+
+	list_add_tail(&spec_eth_outer.list, &rule.list);
+	list_add_tail(&spec_vxlan.list,     &rule.list);
+	list_add_tail(&spec_eth_inner.list, &rule.list);
+
+	err = mlx4_flow_attach(dev, &rule, reg_id);
+	return err;
+}
+EXPORT_SYMBOL(mlx4_tunnel_steer_add);
+
+int mlx4_FLOW_STEERING_IB_UC_QP_RANGE(struct mlx4_dev *dev, u32 min_range_qpn,
+				      u32 max_range_qpn)
+{
+	int err;
+	u64 in_param;
+
+	in_param = ((u64) min_range_qpn) << 32;
+	in_param |= ((u64) max_range_qpn) & 0xFFFFFFFF;
+
+	err = mlx4_cmd(dev, in_param, 0, 0,
+			MLX4_FLOW_STEERING_IB_UC_QP_RANGE,
+			MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_FLOW_STEERING_IB_UC_QP_RANGE);
+
+int mlx4_qp_attach_common(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16],
+			  int block_mcast_loopback, enum mlx4_protocol prot,
+			  enum mlx4_steer_type steer)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_mgm *mgm;
+	u32 members_count;
+	int index, prev;
+	int link = 0;
+	int i;
+	int err;
+	u8 port = gid[5];
+	u8 new_entry = 0;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	mgm = mailbox->buf;
+
+	mutex_lock(&priv->mcg_table.mutex);
+	err = find_entry(dev, port, gid, prot,
+			 mailbox, &prev, &index);
+	if (err)
+		goto out;
+
+	if (index != -1) {
+		if (!(be32_to_cpu(mgm->members_count) & 0xffffff)) {
+			new_entry = 1;
+			memcpy(mgm->gid, gid, 16);
+		}
+	} else {
+		link = 1;
+
+		index = mlx4_bitmap_alloc(&priv->mcg_table.bitmap);
+		if (index == -1) {
+			mlx4_err(dev, "No AMGM entries left\n");
+			err = -ENOMEM;
+			goto out;
+		}
+		index += dev->caps.num_mgms;
+
+		new_entry = 1;
+		memset(mgm, 0, sizeof *mgm);
+		memcpy(mgm->gid, gid, 16);
+	}
+
+	members_count = be32_to_cpu(mgm->members_count) & 0xffffff;
+	if (members_count == dev->caps.num_qp_per_mgm) {
+		mlx4_err(dev, "MGM at index %x is full\n", index);
+		err = -ENOMEM;
+		goto out;
+	}
+
+	for (i = 0; i < members_count; ++i)
+		if ((be32_to_cpu(mgm->qp[i]) & MGM_QPN_MASK) == qp->qpn) {
+			mlx4_dbg(dev, "QP %06x already a member of MGM\n", qp->qpn);
+			err = 0;
+			goto out;
+		}
+
+	if (block_mcast_loopback)
+		mgm->qp[members_count++] = cpu_to_be32((qp->qpn & MGM_QPN_MASK) |
+						       (1U << MGM_BLCK_LB_BIT));
+	else
+		mgm->qp[members_count++] = cpu_to_be32(qp->qpn & MGM_QPN_MASK);
+
+	mgm->members_count = cpu_to_be32(members_count | (u32) prot << 30);
+
+	err = mlx4_WRITE_ENTRY(dev, index, mailbox);
+	if (err)
+		goto out;
+
+	if (!link)
+		goto out;
+
+	err = mlx4_READ_ENTRY(dev, prev, mailbox);
+	if (err)
+		goto out;
+
+	mgm->next_gid_index = cpu_to_be32(index << 6);
+
+	err = mlx4_WRITE_ENTRY(dev, prev, mailbox);
+	if (err)
+		goto out;
+
+out:
+	if (prot == MLX4_PROT_ETH) {
+		/* manage the steering entry for promisc mode */
+		if (new_entry)
+			new_steering_entry(dev, port, steer, index, qp->qpn);
+		else
+			existing_steering_entry(dev, port, steer,
+						index, qp->qpn);
+	}
+	if (err && link && index != -1) {
+		if (index < dev->caps.num_mgms)
+			mlx4_warn(dev, "Got AMGM index %d < %d\n",
+				  index, dev->caps.num_mgms);
+		else
+			mlx4_bitmap_free(&priv->mcg_table.bitmap,
+					 index - dev->caps.num_mgms, MLX4_USE_RR);
+	}
+	mutex_unlock(&priv->mcg_table.mutex);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+
+int mlx4_qp_detach_common(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16],
+			  enum mlx4_protocol prot, enum mlx4_steer_type steer)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_mgm *mgm;
+	u32 members_count;
+	int prev, index;
+	int i, loc = -1;
+	int err;
+	u8 port = gid[5];
+	bool removed_entry = false;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	mgm = mailbox->buf;
+
+	mutex_lock(&priv->mcg_table.mutex);
+
+	err = find_entry(dev, port, gid, prot,
+			 mailbox, &prev, &index);
+	if (err)
+		goto out;
+
+	if (index == -1) {
+		mlx4_err(dev, "MGID %pI6 not found\n", gid);
+		err = -EINVAL;
+		goto out;
+	}
+
+	/* If this QP is also a promisc QP, it shouldn't be removed only if
+	 * at least one none promisc QP is also attached to this MCG
+	 */
+	if (prot == MLX4_PROT_ETH &&
+	    check_duplicate_entry(dev, port, steer, index, qp->qpn) &&
+	    !promisc_steering_entry(dev, port, steer, index, qp->qpn, NULL))
+			goto out;
+
+	members_count = be32_to_cpu(mgm->members_count) & 0xffffff;
+	for (i = 0; i < members_count; ++i)
+		if ((be32_to_cpu(mgm->qp[i]) & MGM_QPN_MASK) == qp->qpn) {
+			loc = i;
+			break;
+		}
+
+	if (loc == -1) {
+		mlx4_err(dev, "QP %06x not found in MGM\n", qp->qpn);
+		err = -EINVAL;
+		goto out;
+	}
+
+	/* copy the last QP in this MGM over removed QP */
+	mgm->qp[loc] = mgm->qp[members_count - 1];
+	mgm->qp[members_count - 1] = 0;
+	mgm->members_count = cpu_to_be32(--members_count | (u32) prot << 30);
+
+	if (prot == MLX4_PROT_ETH)
+		removed_entry = can_remove_steering_entry(dev, port, steer,
+								index, qp->qpn);
+	if (members_count && (prot != MLX4_PROT_ETH || !removed_entry)) {
+		err = mlx4_WRITE_ENTRY(dev, index, mailbox);
+		goto out;
+	}
+
+	/* We are going to delete the entry, members count should be 0 */
+	mgm->members_count = cpu_to_be32((u32) prot << 30);
+
+	if (prev == -1) {
+		/* Remove entry from MGM */
+		int amgm_index = be32_to_cpu(mgm->next_gid_index) >> 6;
+		if (amgm_index) {
+			err = mlx4_READ_ENTRY(dev, amgm_index, mailbox);
+			if (err)
+				goto out;
+		} else
+			memset(mgm->gid, 0, 16);
+
+		err = mlx4_WRITE_ENTRY(dev, index, mailbox);
+		if (err)
+			goto out;
+
+		if (amgm_index) {
+			if (amgm_index < dev->caps.num_mgms)
+				mlx4_warn(dev, "MGM entry %d had AMGM index %d < %d\n",
+					  index, amgm_index, dev->caps.num_mgms);
+			else
+				mlx4_bitmap_free(&priv->mcg_table.bitmap,
+						 amgm_index - dev->caps.num_mgms, MLX4_USE_RR);
+		}
+	} else {
+		/* Remove entry from AMGM */
+		int cur_next_index = be32_to_cpu(mgm->next_gid_index) >> 6;
+		err = mlx4_READ_ENTRY(dev, prev, mailbox);
+		if (err)
+			goto out;
+
+		mgm->next_gid_index = cpu_to_be32(cur_next_index << 6);
+
+		err = mlx4_WRITE_ENTRY(dev, prev, mailbox);
+		if (err)
+			goto out;
+
+		if (index < dev->caps.num_mgms)
+			mlx4_warn(dev, "entry %d had next AMGM index %d < %d\n",
+				  prev, index, dev->caps.num_mgms);
+		else
+			mlx4_bitmap_free(&priv->mcg_table.bitmap,
+					 index - dev->caps.num_mgms, MLX4_USE_RR);
+	}
+
+out:
+	mutex_unlock(&priv->mcg_table.mutex);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+
+static int mlx4_QP_ATTACH(struct mlx4_dev *dev, struct mlx4_qp *qp,
+			  u8 gid[16], u8 attach, u8 block_loopback,
+			  enum mlx4_protocol prot)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	int err = 0;
+	int qpn;
+
+	if (!mlx4_is_mfunc(dev))
+		return -EBADF;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	memcpy(mailbox->buf, gid, 16);
+	qpn = qp->qpn;
+	qpn |= (prot << 28);
+	if (attach && block_loopback)
+		qpn |= (1 << 31);
+
+	err = mlx4_cmd(dev, mailbox->dma, qpn, attach,
+		       MLX4_CMD_QP_ATTACH, MLX4_CMD_TIME_CLASS_A,
+		       MLX4_CMD_WRAPPED);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+
+int mlx4_trans_to_dmfs_attach(struct mlx4_dev *dev, struct mlx4_qp *qp,
+			      u8 gid[16], u8 port,
+			      int block_mcast_loopback,
+			      enum mlx4_protocol prot, u64 *reg_id)
+{
+		struct mlx4_spec_list spec = { {NULL} };
+		__be64 mac_mask = cpu_to_be64(MLX4_MAC_MASK << 16);
+
+		struct mlx4_net_trans_rule rule = {
+			.queue_mode = MLX4_NET_TRANS_Q_FIFO,
+			.exclusive = 0,
+			.promisc_mode = MLX4_FS_REGULAR,
+			.priority = MLX4_DOMAIN_NIC,
+		};
+
+		rule.allow_loopback = !block_mcast_loopback;
+		rule.port = port;
+		rule.qpn = qp->qpn;
+		INIT_LIST_HEAD(&rule.list);
+
+		switch (prot) {
+		case MLX4_PROT_ETH:
+			spec.id = MLX4_NET_TRANS_RULE_ID_ETH;
+			memcpy(spec.eth.dst_mac, &gid[10], ETH_ALEN);
+			memcpy(spec.eth.dst_mac_msk, &mac_mask, ETH_ALEN);
+			break;
+
+		case MLX4_PROT_IB_IPV6:
+			spec.id = MLX4_NET_TRANS_RULE_ID_IB;
+			memcpy(spec.ib.dst_gid, gid, 16);
+			memset(&spec.ib.dst_gid_msk, 0xff, 16);
+			break;
+		default:
+			return -EINVAL;
+		}
+		list_add_tail(&spec.list, &rule.list);
+
+		return mlx4_flow_attach(dev, &rule, reg_id);
+}
+
+int mlx4_multicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16],
+			  u8 port, int block_mcast_loopback,
+			  enum mlx4_protocol prot, u64 *reg_id)
+{
+	switch (dev->caps.steering_mode) {
+	case MLX4_STEERING_MODE_A0:
+		if (prot == MLX4_PROT_ETH)
+			return 0;
+
+	case MLX4_STEERING_MODE_B0:
+		if (prot == MLX4_PROT_ETH)
+			gid[7] |= (MLX4_MC_STEER << 1);
+
+		if (mlx4_is_mfunc(dev))
+			return mlx4_QP_ATTACH(dev, qp, gid, 1,
+					      block_mcast_loopback, prot);
+		return mlx4_qp_attach_common(dev, qp, gid,
+					     block_mcast_loopback, prot,
+					     MLX4_MC_STEER);
+
+	case MLX4_STEERING_MODE_DEVICE_MANAGED:
+		return mlx4_trans_to_dmfs_attach(dev, qp, gid, port,
+						 block_mcast_loopback,
+						 prot, reg_id);
+	default:
+		return -EINVAL;
+	}
+}
+EXPORT_SYMBOL_GPL(mlx4_multicast_attach);
+
+int mlx4_multicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16],
+			  enum mlx4_protocol prot, u64 reg_id)
+{
+	switch (dev->caps.steering_mode) {
+	case MLX4_STEERING_MODE_A0:
+		if (prot == MLX4_PROT_ETH)
+			return 0;
+
+	case MLX4_STEERING_MODE_B0:
+		if (prot == MLX4_PROT_ETH)
+			gid[7] |= (MLX4_MC_STEER << 1);
+
+		if (mlx4_is_mfunc(dev))
+			return mlx4_QP_ATTACH(dev, qp, gid, 0, 0, prot);
+
+		return mlx4_qp_detach_common(dev, qp, gid, prot,
+					     MLX4_MC_STEER);
+
+	case MLX4_STEERING_MODE_DEVICE_MANAGED:
+		return mlx4_flow_detach(dev, reg_id);
+
+	default:
+		return -EINVAL;
+	}
+}
+EXPORT_SYMBOL_GPL(mlx4_multicast_detach);
+
+int mlx4_flow_steer_promisc_add(struct mlx4_dev *dev, u8 port,
+				u32 qpn, enum mlx4_net_trans_promisc_mode mode)
+{
+	struct mlx4_net_trans_rule rule;
+	u64 *regid_p;
+
+	switch (mode) {
+	case MLX4_FS_ALL_DEFAULT:
+		regid_p = &dev->regid_promisc_array[port];
+		break;
+	case MLX4_FS_MC_DEFAULT:
+		regid_p = &dev->regid_allmulti_array[port];
+		break;
+	default:
+		return -1;
+	}
+
+	if (*regid_p != 0)
+		return -1;
+
+	rule.promisc_mode = mode;
+	rule.port = port;
+	rule.qpn = qpn;
+	INIT_LIST_HEAD(&rule.list);
+	mlx4_err(dev, "going promisc on %x\n", port);
+
+	return  mlx4_flow_attach(dev, &rule, regid_p);
+}
+EXPORT_SYMBOL_GPL(mlx4_flow_steer_promisc_add);
+
+int mlx4_flow_steer_promisc_remove(struct mlx4_dev *dev, u8 port,
+				   enum mlx4_net_trans_promisc_mode mode)
+{
+	int ret;
+	u64 *regid_p;
+
+	switch (mode) {
+	case MLX4_FS_ALL_DEFAULT:
+		regid_p = &dev->regid_promisc_array[port];
+		break;
+	case MLX4_FS_MC_DEFAULT:
+		regid_p = &dev->regid_allmulti_array[port];
+		break;
+	default:
+		return -1;
+	}
+
+	if (*regid_p == 0)
+		return -1;
+
+	ret =  mlx4_flow_detach(dev, *regid_p);
+	if (ret == 0)
+		*regid_p = 0;
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(mlx4_flow_steer_promisc_remove);
+
+int mlx4_unicast_attach(struct mlx4_dev *dev,
+			struct mlx4_qp *qp, u8 gid[16],
+			int block_mcast_loopback, enum mlx4_protocol prot)
+{
+	if (prot == MLX4_PROT_ETH)
+		gid[7] |= (MLX4_UC_STEER << 1);
+
+	if (mlx4_is_mfunc(dev))
+		return mlx4_QP_ATTACH(dev, qp, gid, 1,
+					block_mcast_loopback, prot);
+
+	return mlx4_qp_attach_common(dev, qp, gid, block_mcast_loopback,
+					prot, MLX4_UC_STEER);
+}
+EXPORT_SYMBOL_GPL(mlx4_unicast_attach);
+
+int mlx4_unicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp,
+			       u8 gid[16], enum mlx4_protocol prot)
+{
+	if (prot == MLX4_PROT_ETH)
+		gid[7] |= (MLX4_UC_STEER << 1);
+
+	if (mlx4_is_mfunc(dev))
+		return mlx4_QP_ATTACH(dev, qp, gid, 0, 0, prot);
+
+	return mlx4_qp_detach_common(dev, qp, gid, prot, MLX4_UC_STEER);
+}
+EXPORT_SYMBOL_GPL(mlx4_unicast_detach);
+
+int mlx4_PROMISC_wrapper(struct mlx4_dev *dev, int slave,
+			 struct mlx4_vhcr *vhcr,
+			 struct mlx4_cmd_mailbox *inbox,
+			 struct mlx4_cmd_mailbox *outbox,
+			 struct mlx4_cmd_info *cmd)
+{
+	u32 qpn = (u32) vhcr->in_param & 0xffffffff;
+	int port = mlx4_slave_convert_port(dev, slave, vhcr->in_param >> 62);
+	enum mlx4_steer_type steer = vhcr->in_modifier;
+
+	if (port < 0)
+		return -EINVAL;
+
+	/* Promiscuous unicast is not allowed in mfunc */
+	if (mlx4_is_mfunc(dev) && steer == MLX4_UC_STEER)
+		return 0;
+
+	if (vhcr->op_modifier)
+		return add_promisc_qp(dev, port, steer, qpn);
+	else
+		return remove_promisc_qp(dev, port, steer, qpn);
+}
+
+static int mlx4_PROMISC(struct mlx4_dev *dev, u32 qpn,
+			enum mlx4_steer_type steer, u8 add, u8 port)
+{
+	return mlx4_cmd(dev, (u64) qpn | (u64) port << 62, (u32) steer, add,
+			MLX4_CMD_PROMISC, MLX4_CMD_TIME_CLASS_A,
+			MLX4_CMD_WRAPPED);
+}
+
+int mlx4_multicast_promisc_add(struct mlx4_dev *dev, u32 qpn, u8 port)
+{
+	if (mlx4_is_mfunc(dev))
+		return mlx4_PROMISC(dev, qpn, MLX4_MC_STEER, 1, port);
+
+	return add_promisc_qp(dev, port, MLX4_MC_STEER, qpn);
+}
+EXPORT_SYMBOL_GPL(mlx4_multicast_promisc_add);
+
+int mlx4_multicast_promisc_remove(struct mlx4_dev *dev, u32 qpn, u8 port)
+{
+	if (mlx4_is_mfunc(dev))
+		return mlx4_PROMISC(dev, qpn, MLX4_MC_STEER, 0, port);
+
+	return remove_promisc_qp(dev, port, MLX4_MC_STEER, qpn);
+}
+EXPORT_SYMBOL_GPL(mlx4_multicast_promisc_remove);
+
+int mlx4_unicast_promisc_add(struct mlx4_dev *dev, u32 qpn, u8 port)
+{
+	if (mlx4_is_mfunc(dev))
+		return mlx4_PROMISC(dev, qpn, MLX4_UC_STEER, 1, port);
+
+	return add_promisc_qp(dev, port, MLX4_UC_STEER, qpn);
+}
+EXPORT_SYMBOL_GPL(mlx4_unicast_promisc_add);
+
+int mlx4_unicast_promisc_remove(struct mlx4_dev *dev, u32 qpn, u8 port)
+{
+	if (mlx4_is_mfunc(dev))
+		return mlx4_PROMISC(dev, qpn, MLX4_UC_STEER, 0, port);
+
+	return remove_promisc_qp(dev, port, MLX4_UC_STEER, qpn);
+}
+EXPORT_SYMBOL_GPL(mlx4_unicast_promisc_remove);
+
+int mlx4_init_mcg_table(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int err;
+
+	/* No need for mcg_table when fw managed the mcg table*/
+	if (dev->caps.steering_mode ==
+	    MLX4_STEERING_MODE_DEVICE_MANAGED)
+		return 0;
+	err = mlx4_bitmap_init(&priv->mcg_table.bitmap, dev->caps.num_amgms,
+			       dev->caps.num_amgms - 1, 0, 0);
+	if (err)
+		return err;
+
+	mutex_init(&priv->mcg_table.mutex);
+
+	return 0;
+}
+
+void mlx4_cleanup_mcg_table(struct mlx4_dev *dev)
+{
+	if (dev->caps.steering_mode !=
+	    MLX4_STEERING_MODE_DEVICE_MANAGED)
+		mlx4_bitmap_cleanup(&mlx4_priv(dev)->mcg_table.bitmap);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
new file mode 100644
index 0000000..de10dbb
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
@@ -0,0 +1,1316 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007 Cisco Systems.  All rights reserved.
+ * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX4_H
+#define MLX4_H
+
+#include <linux/mutex.h>
+#include <linux/radix-tree.h>
+#include <linux/rbtree.h>
+#include <linux/timer.h>
+#include <linux/semaphore.h>
+#include <linux/workqueue.h>
+
+#include <linux/mlx4/device.h>
+#include <linux/mlx4/driver.h>
+#include <linux/mlx4/doorbell.h>
+#include <linux/mlx4/cmd.h>
+
+#define DRV_NAME	"mlx4_core"
+#define PFX		DRV_NAME ": "
+#define DRV_VERSION	"2.2-1"
+#define DRV_RELDATE	"Feb, 2014"
+
+#define MLX4_FS_UDP_UC_EN		(1 << 1)
+#define MLX4_FS_TCP_UC_EN		(1 << 2)
+#define MLX4_FS_NUM_OF_L2_ADDR		8
+#define MLX4_FS_MGM_LOG_ENTRY_SIZE	7
+#define MLX4_FS_NUM_MCG			(1 << 17)
+
+#define INIT_HCA_TPT_MW_ENABLE          (1 << 7)
+
+struct mlx4_set_port_prio2tc_context {
+	u8 prio2tc[4];
+};
+
+struct mlx4_port_scheduler_tc_cfg_be {
+	__be16 pg;
+	__be16 bw_precentage;
+	__be16 max_bw_units; /* 3-100Mbps, 4-1Gbps, other values - reserved */
+	__be16 max_bw_value;
+};
+
+struct mlx4_set_port_scheduler_context {
+	struct mlx4_port_scheduler_tc_cfg_be tc[MLX4_NUM_TC];
+};
+
+enum {
+	MLX4_HCR_BASE		= 0x80680,
+	MLX4_HCR_SIZE		= 0x0001c,
+	MLX4_CLR_INT_SIZE	= 0x00008,
+	MLX4_SLAVE_COMM_BASE	= 0x0,
+	MLX4_COMM_PAGESIZE	= 0x1000,
+	MLX4_CLOCK_SIZE		= 0x00008
+};
+
+enum {
+	MLX4_DEFAULT_MGM_LOG_ENTRY_SIZE = 10,
+	MLX4_MIN_MGM_LOG_ENTRY_SIZE = 7,
+	MLX4_MAX_MGM_LOG_ENTRY_SIZE = 12,
+	MLX4_MAX_QP_PER_MGM = 4 * ((1 << MLX4_MAX_MGM_LOG_ENTRY_SIZE) / 16 - 2),
+	MLX4_MTT_ENTRY_PER_SEG	= 8,
+};
+
+enum {
+	MLX4_NUM_PDS		= 1 << 15
+};
+
+enum {
+	MLX4_CMPT_TYPE_QP	= 0,
+	MLX4_CMPT_TYPE_SRQ	= 1,
+	MLX4_CMPT_TYPE_CQ	= 2,
+	MLX4_CMPT_TYPE_EQ	= 3,
+	MLX4_CMPT_NUM_TYPE
+};
+
+enum {
+	MLX4_CMPT_SHIFT		= 24,
+	MLX4_NUM_CMPTS		= MLX4_CMPT_NUM_TYPE << MLX4_CMPT_SHIFT
+};
+
+enum mlx4_mpt_state {
+	MLX4_MPT_DISABLED = 0,
+	MLX4_MPT_EN_HW,
+	MLX4_MPT_EN_SW
+};
+
+#define MLX4_COMM_TIME		10000
+enum {
+	MLX4_COMM_CMD_RESET,
+	MLX4_COMM_CMD_VHCR0,
+	MLX4_COMM_CMD_VHCR1,
+	MLX4_COMM_CMD_VHCR2,
+	MLX4_COMM_CMD_VHCR_EN,
+	MLX4_COMM_CMD_VHCR_POST,
+	MLX4_COMM_CMD_FLR = 254
+};
+
+enum {
+	MLX4_VF_SMI_DISABLED,
+	MLX4_VF_SMI_ENABLED
+};
+
+/*The flag indicates that the slave should delay the RESET cmd*/
+#define MLX4_DELAY_RESET_SLAVE 0xbbbbbbb
+/*indicates how many retries will be done if we are in the middle of FLR*/
+#define NUM_OF_RESET_RETRIES	10
+#define SLEEP_TIME_IN_RESET	(2 * 1000)
+enum mlx4_resource {
+	RES_QP,
+	RES_CQ,
+	RES_SRQ,
+	RES_XRCD,
+	RES_MPT,
+	RES_MTT,
+	RES_MAC,
+	RES_VLAN,
+	RES_EQ,
+	RES_COUNTER,
+	RES_FS_RULE,
+	MLX4_NUM_OF_RESOURCE_TYPE
+};
+
+enum mlx4_alloc_mode {
+	RES_OP_RESERVE,
+	RES_OP_RESERVE_AND_MAP,
+	RES_OP_MAP_ICM,
+};
+
+enum mlx4_res_tracker_free_type {
+	RES_TR_FREE_ALL,
+	RES_TR_FREE_SLAVES_ONLY,
+	RES_TR_FREE_STRUCTS_ONLY,
+};
+
+/*
+ *Virtual HCR structures.
+ * mlx4_vhcr is the sw representation, in machine endianess
+ *
+ * mlx4_vhcr_cmd is the formalized structure, the one that is passed
+ * to FW to go through communication channel.
+ * It is big endian, and has the same structure as the physical HCR
+ * used by command interface
+ */
+struct mlx4_vhcr {
+	u64	in_param;
+	u64	out_param;
+	u32	in_modifier;
+	u32	errno;
+	u16	op;
+	u16	token;
+	u8	op_modifier;
+	u8	e_bit;
+};
+
+struct mlx4_vhcr_cmd {
+	__be64 in_param;
+	__be32 in_modifier;
+	__be64 out_param;
+	__be16 token;
+	u16 reserved;
+	u8 status;
+	u8 flags;
+	__be16 opcode;
+};
+
+struct mlx4_cmd_info {
+	u16 opcode;
+	bool has_inbox;
+	bool has_outbox;
+	bool out_is_imm;
+	bool encode_slave_id;
+	int (*verify)(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr,
+		      struct mlx4_cmd_mailbox *inbox);
+	int (*wrapper)(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr,
+		       struct mlx4_cmd_mailbox *inbox,
+		       struct mlx4_cmd_mailbox *outbox,
+		       struct mlx4_cmd_info *cmd);
+};
+
+#ifdef CONFIG_MLX4_DEBUG
+extern int mlx4_debug_level;
+#else /* CONFIG_MLX4_DEBUG */
+#define mlx4_debug_level	(0)
+#endif /* CONFIG_MLX4_DEBUG */
+
+#define mlx4_dbg(mdev, format, ...)					\
+do {									\
+	if (mlx4_debug_level)						\
+		dev_printk(KERN_DEBUG, &(mdev)->pdev->dev, format,	\
+			   ##__VA_ARGS__);				\
+} while (0)
+
+#define mlx4_err(mdev, format, ...)					\
+	dev_err(&(mdev)->pdev->dev, format, ##__VA_ARGS__)
+#define mlx4_info(mdev, format, ...)					\
+	dev_info(&(mdev)->pdev->dev, format, ##__VA_ARGS__)
+#define mlx4_warn(mdev, format, ...)					\
+	dev_warn(&(mdev)->pdev->dev, format, ##__VA_ARGS__)
+
+extern int mlx4_log_num_mgm_entry_size;
+extern int log_mtts_per_seg;
+
+#define MLX4_MAX_NUM_SLAVES	(MLX4_MAX_NUM_PF + MLX4_MAX_NUM_VF)
+#define ALL_SLAVES 0xff
+
+struct mlx4_bitmap {
+	u32			last;
+	u32			top;
+	u32			max;
+	u32                     reserved_top;
+	u32			mask;
+	u32			avail;
+	spinlock_t		lock;
+	unsigned long	       *table;
+};
+
+struct mlx4_buddy {
+	unsigned long	      **bits;
+	unsigned int	       *num_free;
+	u32			max_order;
+	spinlock_t		lock;
+};
+
+struct mlx4_icm;
+
+struct mlx4_icm_table {
+	u64			virt;
+	int			num_icm;
+	u32			num_obj;
+	int			obj_size;
+	int			lowmem;
+	int			coherent;
+	struct mutex		mutex;
+	struct mlx4_icm	      **icm;
+};
+
+#define MLX4_MPT_FLAG_SW_OWNS	    (0xfUL << 28)
+#define MLX4_MPT_FLAG_FREE	    (0x3UL << 28)
+#define MLX4_MPT_FLAG_MIO	    (1 << 17)
+#define MLX4_MPT_FLAG_BIND_ENABLE   (1 << 15)
+#define MLX4_MPT_FLAG_PHYSICAL	    (1 <<  9)
+#define MLX4_MPT_FLAG_REGION	    (1 <<  8)
+
+#define MLX4_MPT_PD_MASK	    (0x1FFFFUL)
+#define MLX4_MPT_PD_VF_MASK	    (0xFE0000UL)
+#define MLX4_MPT_PD_FLAG_FAST_REG   (1 << 27)
+#define MLX4_MPT_PD_FLAG_RAE	    (1 << 28)
+#define MLX4_MPT_PD_FLAG_EN_INV	    (3 << 24)
+
+#define MLX4_MPT_QP_FLAG_BOUND_QP   (1 << 7)
+
+#define MLX4_MPT_STATUS_SW		0xF0
+#define MLX4_MPT_STATUS_HW		0x00
+
+#define MLX4_CQE_SIZE_MASK_STRIDE	0x3
+#define MLX4_EQE_SIZE_MASK_STRIDE	0x30
+
+/*
+ * Must be packed because mtt_seg is 64 bits but only aligned to 32 bits.
+ */
+struct mlx4_mpt_entry {
+	__be32 flags;
+	__be32 qpn;
+	__be32 key;
+	__be32 pd_flags;
+	__be64 start;
+	__be64 length;
+	__be32 lkey;
+	__be32 win_cnt;
+	u8	reserved1[3];
+	u8	mtt_rep;
+	__be64 mtt_addr;
+	__be32 mtt_sz;
+	__be32 entity_size;
+	__be32 first_byte_offset;
+} __packed;
+
+/*
+ * Must be packed because start is 64 bits but only aligned to 32 bits.
+ */
+struct mlx4_eq_context {
+	__be32			flags;
+	u16			reserved1[3];
+	__be16			page_offset;
+	u8			log_eq_size;
+	u8			reserved2[4];
+	u8			eq_period;
+	u8			reserved3;
+	u8			eq_max_count;
+	u8			reserved4[3];
+	u8			intr;
+	u8			log_page_size;
+	u8			reserved5[2];
+	u8			mtt_base_addr_h;
+	__be32			mtt_base_addr_l;
+	u32			reserved6[2];
+	__be32			consumer_index;
+	__be32			producer_index;
+	u32			reserved7[4];
+};
+
+struct mlx4_cq_context {
+	__be32			flags;
+	u16			reserved1[3];
+	__be16			page_offset;
+	__be32			logsize_usrpage;
+	__be16			cq_period;
+	__be16			cq_max_count;
+	u8			reserved2[3];
+	u8			comp_eqn;
+	u8			log_page_size;
+	u8			reserved3[2];
+	u8			mtt_base_addr_h;
+	__be32			mtt_base_addr_l;
+	__be32			last_notified_index;
+	__be32			solicit_producer_index;
+	__be32			consumer_index;
+	__be32			producer_index;
+	u32			reserved4[2];
+	__be64			db_rec_addr;
+};
+
+struct mlx4_srq_context {
+	__be32			state_logsize_srqn;
+	u8			logstride;
+	u8			reserved1;
+	__be16			xrcd;
+	__be32			pg_offset_cqn;
+	u32			reserved2;
+	u8			log_page_size;
+	u8			reserved3[2];
+	u8			mtt_base_addr_h;
+	__be32			mtt_base_addr_l;
+	__be32			pd;
+	__be16			limit_watermark;
+	__be16			wqe_cnt;
+	u16			reserved4;
+	__be16			wqe_counter;
+	u32			reserved5;
+	__be64			db_rec_addr;
+};
+
+struct mlx4_eq {
+	struct mlx4_dev	       *dev;
+	void __iomem	       *doorbell;
+	int			eqn;
+	u32			cons_index;
+	u16			irq;
+	u16			have_irq;
+	int			nent;
+	struct mlx4_buf_list   *page_list;
+	struct mlx4_mtt		mtt;
+};
+
+struct mlx4_slave_eqe {
+	u8 type;
+	u8 port;
+	u32 param;
+};
+
+struct mlx4_slave_event_eq_info {
+	int eqn;
+	u16 token;
+};
+
+struct mlx4_profile {
+	int			num_qp;
+	int			rdmarc_per_qp;
+	int			num_srq;
+	int			num_cq;
+	int			num_mcg;
+	int			num_mpt;
+	unsigned		num_mtt;
+};
+
+struct mlx4_fw {
+	u64			clr_int_base;
+	u64			catas_offset;
+	u64			comm_base;
+	u64			clock_offset;
+	struct mlx4_icm	       *fw_icm;
+	struct mlx4_icm	       *aux_icm;
+	u32			catas_size;
+	u16			fw_pages;
+	u8			clr_int_bar;
+	u8			catas_bar;
+	u8			comm_bar;
+	u8			clock_bar;
+};
+
+struct mlx4_comm {
+	u32			slave_write;
+	u32			slave_read;
+};
+
+enum {
+	MLX4_MCAST_CONFIG       = 0,
+	MLX4_MCAST_DISABLE      = 1,
+	MLX4_MCAST_ENABLE       = 2,
+};
+
+#define VLAN_FLTR_SIZE	128
+
+struct mlx4_vlan_fltr {
+	__be32 entry[VLAN_FLTR_SIZE];
+};
+
+struct mlx4_mcast_entry {
+	struct list_head list;
+	u64 addr;
+};
+
+struct mlx4_promisc_qp {
+	struct list_head list;
+	u32 qpn;
+};
+
+struct mlx4_steer_index {
+	struct list_head list;
+	unsigned int index;
+	struct list_head duplicates;
+};
+
+#define MLX4_EVENT_TYPES_NUM 64
+
+struct mlx4_slave_state {
+	u8 comm_toggle;
+	u8 last_cmd;
+	u8 init_port_mask;
+	bool active;
+	bool old_vlan_api;
+	u8 function;
+	dma_addr_t vhcr_dma;
+	u16 mtu[MLX4_MAX_PORTS + 1];
+	__be32 ib_cap_mask[MLX4_MAX_PORTS + 1];
+	struct mlx4_slave_eqe eq[MLX4_MFUNC_MAX_EQES];
+	struct list_head mcast_filters[MLX4_MAX_PORTS + 1];
+	struct mlx4_vlan_fltr *vlan_filter[MLX4_MAX_PORTS + 1];
+	/* event type to eq number lookup */
+	struct mlx4_slave_event_eq_info event_eq[MLX4_EVENT_TYPES_NUM];
+	u16 eq_pi;
+	u16 eq_ci;
+	spinlock_t lock;
+	/*initialized via the kzalloc*/
+	u8 is_slave_going_down;
+	u32 cookie;
+	enum slave_port_state port_state[MLX4_MAX_PORTS + 1];
+};
+
+#define MLX4_VGT 4095
+#define NO_INDX  (-1)
+
+struct mlx4_vport_state {
+	u64 mac;
+	u16 default_vlan;
+	u8  default_qos;
+	u32 tx_rate;
+	bool spoofchk;
+	u32 link_state;
+};
+
+struct mlx4_vf_admin_state {
+	struct mlx4_vport_state vport[MLX4_MAX_PORTS + 1];
+	u8 enable_smi[MLX4_MAX_PORTS + 1];
+};
+
+struct mlx4_vport_oper_state {
+	struct mlx4_vport_state state;
+	int mac_idx;
+	int vlan_idx;
+};
+
+struct mlx4_vf_oper_state {
+	struct mlx4_vport_oper_state vport[MLX4_MAX_PORTS + 1];
+	u8 smi_enabled[MLX4_MAX_PORTS + 1];
+};
+
+struct slave_list {
+	struct mutex mutex;
+	struct list_head res_list[MLX4_NUM_OF_RESOURCE_TYPE];
+};
+
+struct resource_allocator {
+	spinlock_t alloc_lock; /* protect quotas */
+	union {
+		int res_reserved;
+		int res_port_rsvd[MLX4_MAX_PORTS];
+	};
+	union {
+		int res_free;
+		int res_port_free[MLX4_MAX_PORTS];
+	};
+	int *quota;
+	int *allocated;
+	int *guaranteed;
+};
+
+struct mlx4_resource_tracker {
+	spinlock_t lock;
+	/* tree for each resources */
+	struct rb_root res_tree[MLX4_NUM_OF_RESOURCE_TYPE];
+	/* num_of_slave's lists, one per slave */
+	struct slave_list *slave_list;
+	struct resource_allocator res_alloc[MLX4_NUM_OF_RESOURCE_TYPE];
+};
+
+#define SLAVE_EVENT_EQ_SIZE	128
+struct mlx4_slave_event_eq {
+	u32 eqn;
+	u32 cons;
+	u32 prod;
+	spinlock_t event_lock;
+	struct mlx4_eqe event_eqe[SLAVE_EVENT_EQ_SIZE];
+};
+
+struct mlx4_master_qp0_state {
+	int proxy_qp0_active;
+	int qp0_active;
+	int port_active;
+};
+
+struct mlx4_mfunc_master_ctx {
+	struct mlx4_slave_state *slave_state;
+	struct mlx4_vf_admin_state *vf_admin;
+	struct mlx4_vf_oper_state *vf_oper;
+	struct mlx4_master_qp0_state qp0_state[MLX4_MAX_PORTS + 1];
+	int			init_port_ref[MLX4_MAX_PORTS + 1];
+	u16			max_mtu[MLX4_MAX_PORTS + 1];
+	int			disable_mcast_ref[MLX4_MAX_PORTS + 1];
+	struct mlx4_resource_tracker res_tracker;
+	struct workqueue_struct *comm_wq;
+	struct work_struct	comm_work;
+	struct work_struct	slave_event_work;
+	struct work_struct	slave_flr_event_work;
+	spinlock_t		slave_state_lock;
+	__be32			comm_arm_bit_vector[4];
+	struct mlx4_eqe		cmd_eqe;
+	struct mlx4_slave_event_eq slave_eq;
+	struct mutex		gen_eqe_mutex[MLX4_MFUNC_MAX];
+};
+
+struct mlx4_mfunc {
+	struct mlx4_comm __iomem       *comm;
+	struct mlx4_vhcr_cmd	       *vhcr;
+	dma_addr_t			vhcr_dma;
+
+	struct mlx4_mfunc_master_ctx	master;
+};
+
+#define MGM_QPN_MASK       0x00FFFFFF
+#define MGM_BLCK_LB_BIT    30
+
+struct mlx4_mgm {
+	__be32			next_gid_index;
+	__be32			members_count;
+	u32			reserved[2];
+	u8			gid[16];
+	__be32			qp[MLX4_MAX_QP_PER_MGM];
+};
+
+struct mlx4_cmd {
+	struct pci_pool	       *pool;
+	void __iomem	       *hcr;
+	struct mutex		hcr_mutex;
+	struct mutex		slave_cmd_mutex;
+	struct semaphore	poll_sem;
+	struct semaphore	event_sem;
+	int			max_cmds;
+	spinlock_t		context_lock;
+	int			free_head;
+	struct mlx4_cmd_context *context;
+	u16			token_mask;
+	u8			use_events;
+	u8			toggle;
+	u8			comm_toggle;
+};
+
+enum {
+	MLX4_VF_IMMED_VLAN_FLAG_VLAN = 1 << 0,
+	MLX4_VF_IMMED_VLAN_FLAG_QOS = 1 << 1,
+	MLX4_VF_IMMED_VLAN_FLAG_LINK_DISABLE = 1 << 2,
+};
+struct mlx4_vf_immed_vlan_work {
+	struct work_struct	work;
+	struct mlx4_priv	*priv;
+	int			flags;
+	int			slave;
+	int			vlan_ix;
+	int			orig_vlan_ix;
+	u8			port;
+	u8			qos;
+	u16			vlan_id;
+	u16			orig_vlan_id;
+};
+
+
+struct mlx4_uar_table {
+	struct mlx4_bitmap	bitmap;
+};
+
+struct mlx4_mr_table {
+	struct mlx4_bitmap	mpt_bitmap;
+	struct mlx4_buddy	mtt_buddy;
+	u64			mtt_base;
+	u64			mpt_base;
+	struct mlx4_icm_table	mtt_table;
+	struct mlx4_icm_table	dmpt_table;
+};
+
+struct mlx4_cq_table {
+	struct mlx4_bitmap	bitmap;
+	spinlock_t		lock;
+	struct radix_tree_root	tree;
+	struct mlx4_icm_table	table;
+	struct mlx4_icm_table	cmpt_table;
+};
+
+struct mlx4_eq_table {
+	struct mlx4_bitmap	bitmap;
+	char		       *irq_names;
+	void __iomem	       *clr_int;
+	void __iomem	      **uar_map;
+	u32			clr_mask;
+	struct mlx4_eq	       *eq;
+	struct mlx4_icm_table	table;
+	struct mlx4_icm_table	cmpt_table;
+	int			have_irq;
+	u8			inta_pin;
+};
+
+struct mlx4_srq_table {
+	struct mlx4_bitmap	bitmap;
+	spinlock_t		lock;
+	struct radix_tree_root	tree;
+	struct mlx4_icm_table	table;
+	struct mlx4_icm_table	cmpt_table;
+};
+
+struct mlx4_qp_table {
+	struct mlx4_bitmap	bitmap;
+	u32			rdmarc_base;
+	int			rdmarc_shift;
+	spinlock_t		lock;
+	struct mlx4_icm_table	qp_table;
+	struct mlx4_icm_table	auxc_table;
+	struct mlx4_icm_table	altc_table;
+	struct mlx4_icm_table	rdmarc_table;
+	struct mlx4_icm_table	cmpt_table;
+};
+
+struct mlx4_mcg_table {
+	struct mutex		mutex;
+	struct mlx4_bitmap	bitmap;
+	struct mlx4_icm_table	table;
+};
+
+struct mlx4_catas_err {
+	u32 __iomem	       *map;
+	struct timer_list	timer;
+	struct list_head	list;
+};
+
+#define MLX4_MAX_MAC_NUM	128
+#define MLX4_MAC_TABLE_SIZE	(MLX4_MAX_MAC_NUM << 3)
+
+struct mlx4_mac_table {
+	__be64			entries[MLX4_MAX_MAC_NUM];
+	int			refs[MLX4_MAX_MAC_NUM];
+	struct mutex		mutex;
+	int			total;
+	int			max;
+};
+
+#define MLX4_ROCE_GID_ENTRY_SIZE	16
+
+struct mlx4_roce_gid_entry {
+	u8 raw[MLX4_ROCE_GID_ENTRY_SIZE];
+};
+
+struct mlx4_roce_gid_table {
+	struct mlx4_roce_gid_entry	roce_gids[MLX4_ROCE_MAX_GIDS];
+	struct mutex			mutex;
+};
+
+#define MLX4_MAX_VLAN_NUM	128
+#define MLX4_VLAN_TABLE_SIZE	(MLX4_MAX_VLAN_NUM << 2)
+
+struct mlx4_vlan_table {
+	__be32			entries[MLX4_MAX_VLAN_NUM];
+	int			refs[MLX4_MAX_VLAN_NUM];
+	struct mutex		mutex;
+	int			total;
+	int			max;
+};
+
+#define SET_PORT_GEN_ALL_VALID		0x7
+#define SET_PORT_PROMISC_SHIFT		31
+#define SET_PORT_MC_PROMISC_SHIFT	30
+
+enum {
+	MCAST_DIRECT_ONLY	= 0,
+	MCAST_DIRECT		= 1,
+	MCAST_DEFAULT		= 2
+};
+
+
+struct mlx4_set_port_general_context {
+	u8 reserved[3];
+	u8 flags;
+	u16 reserved2;
+	__be16 mtu;
+	u8 pptx;
+	u8 pfctx;
+	u16 reserved3;
+	u8 pprx;
+	u8 pfcrx;
+	u16 reserved4;
+};
+
+struct mlx4_set_port_rqp_calc_context {
+	__be32 base_qpn;
+	u8 rererved;
+	u8 n_mac;
+	u8 n_vlan;
+	u8 n_prio;
+	u8 reserved2[3];
+	u8 mac_miss;
+	u8 intra_no_vlan;
+	u8 no_vlan;
+	u8 intra_vlan_miss;
+	u8 vlan_miss;
+	u8 reserved3[3];
+	u8 no_vlan_prio;
+	__be32 promisc;
+	__be32 mcast;
+};
+
+struct mlx4_port_info {
+	struct mlx4_dev	       *dev;
+	int			port;
+	char			dev_name[16];
+	struct device_attribute port_attr;
+	enum mlx4_port_type	tmp_type;
+	char			dev_mtu_name[16];
+	struct device_attribute port_mtu_attr;
+	struct mlx4_mac_table	mac_table;
+	struct mlx4_vlan_table	vlan_table;
+	struct mlx4_roce_gid_table gid_table;
+	int			base_qpn;
+};
+
+struct mlx4_sense {
+	struct mlx4_dev		*dev;
+	u8			do_sense_port[MLX4_MAX_PORTS + 1];
+	u8			sense_allowed[MLX4_MAX_PORTS + 1];
+	struct delayed_work	sense_poll;
+};
+
+struct mlx4_msix_ctl {
+	u64		pool_bm;
+	struct mutex	pool_lock;
+};
+
+struct mlx4_steer {
+	struct list_head promisc_qps[MLX4_NUM_STEERS];
+	struct list_head steer_entries[MLX4_NUM_STEERS];
+};
+
+enum {
+	MLX4_PCI_DEV_IS_VF		= 1 << 0,
+	MLX4_PCI_DEV_FORCE_SENSE_PORT	= 1 << 1,
+};
+
+enum {
+	MLX4_NO_RR	= 0,
+	MLX4_USE_RR	= 1,
+};
+
+struct mlx4_priv {
+	struct mlx4_dev		dev;
+
+	struct list_head	dev_list;
+	struct list_head	ctx_list;
+	spinlock_t		ctx_lock;
+
+	int			pci_dev_data;
+	int                     removed;
+
+	struct list_head        pgdir_list;
+	struct mutex            pgdir_mutex;
+
+	struct mlx4_fw		fw;
+	struct mlx4_cmd		cmd;
+	struct mlx4_mfunc	mfunc;
+
+	struct mlx4_bitmap	pd_bitmap;
+	struct mlx4_bitmap	xrcd_bitmap;
+	struct mlx4_uar_table	uar_table;
+	struct mlx4_mr_table	mr_table;
+	struct mlx4_cq_table	cq_table;
+	struct mlx4_eq_table	eq_table;
+	struct mlx4_srq_table	srq_table;
+	struct mlx4_qp_table	qp_table;
+	struct mlx4_mcg_table	mcg_table;
+	struct mlx4_bitmap	counters_bitmap;
+
+	struct mlx4_catas_err	catas_err;
+
+	void __iomem	       *clr_base;
+
+	struct mlx4_uar		driver_uar;
+	void __iomem	       *kar;
+	struct mlx4_port_info	port[MLX4_MAX_PORTS + 1];
+	struct mlx4_sense       sense;
+	struct mutex		port_mutex;
+	struct mlx4_msix_ctl	msix_ctl;
+	struct mlx4_steer	*steer;
+	struct list_head	bf_list;
+	struct mutex		bf_mutex;
+	struct io_mapping	*bf_mapping;
+	void __iomem            *clock_mapping;
+	int			reserved_mtts;
+	int			fs_hash_mode;
+	u8 virt2phys_pkey[MLX4_MFUNC_MAX][MLX4_MAX_PORTS][MLX4_MAX_PORT_PKEYS];
+	__be64			slave_node_guids[MLX4_MFUNC_MAX];
+
+	atomic_t		opreq_count;
+	struct work_struct	opreq_task;
+};
+
+static inline struct mlx4_priv *mlx4_priv(struct mlx4_dev *dev)
+{
+	return container_of(dev, struct mlx4_priv, dev);
+}
+
+#define MLX4_SENSE_RANGE	(HZ * 3)
+
+extern struct workqueue_struct *mlx4_wq;
+
+u32 mlx4_bitmap_alloc(struct mlx4_bitmap *bitmap);
+void mlx4_bitmap_free(struct mlx4_bitmap *bitmap, u32 obj, int use_rr);
+u32 mlx4_bitmap_alloc_range(struct mlx4_bitmap *bitmap, int cnt, int align);
+void mlx4_bitmap_free_range(struct mlx4_bitmap *bitmap, u32 obj, int cnt,
+			    int use_rr);
+u32 mlx4_bitmap_avail(struct mlx4_bitmap *bitmap);
+int mlx4_bitmap_init(struct mlx4_bitmap *bitmap, u32 num, u32 mask,
+		     u32 reserved_bot, u32 resetrved_top);
+void mlx4_bitmap_cleanup(struct mlx4_bitmap *bitmap);
+
+int mlx4_reset(struct mlx4_dev *dev);
+
+int mlx4_alloc_eq_table(struct mlx4_dev *dev);
+void mlx4_free_eq_table(struct mlx4_dev *dev);
+
+int mlx4_init_pd_table(struct mlx4_dev *dev);
+int mlx4_init_xrcd_table(struct mlx4_dev *dev);
+int mlx4_init_uar_table(struct mlx4_dev *dev);
+int mlx4_init_mr_table(struct mlx4_dev *dev);
+int mlx4_init_eq_table(struct mlx4_dev *dev);
+int mlx4_init_cq_table(struct mlx4_dev *dev);
+int mlx4_init_qp_table(struct mlx4_dev *dev);
+int mlx4_init_srq_table(struct mlx4_dev *dev);
+int mlx4_init_mcg_table(struct mlx4_dev *dev);
+
+void mlx4_cleanup_pd_table(struct mlx4_dev *dev);
+void mlx4_cleanup_xrcd_table(struct mlx4_dev *dev);
+void mlx4_cleanup_uar_table(struct mlx4_dev *dev);
+void mlx4_cleanup_mr_table(struct mlx4_dev *dev);
+void mlx4_cleanup_eq_table(struct mlx4_dev *dev);
+void mlx4_cleanup_cq_table(struct mlx4_dev *dev);
+void mlx4_cleanup_qp_table(struct mlx4_dev *dev);
+void mlx4_cleanup_srq_table(struct mlx4_dev *dev);
+void mlx4_cleanup_mcg_table(struct mlx4_dev *dev);
+int __mlx4_qp_alloc_icm(struct mlx4_dev *dev, int qpn, gfp_t gfp);
+void __mlx4_qp_free_icm(struct mlx4_dev *dev, int qpn);
+int __mlx4_cq_alloc_icm(struct mlx4_dev *dev, int *cqn);
+void __mlx4_cq_free_icm(struct mlx4_dev *dev, int cqn);
+int __mlx4_srq_alloc_icm(struct mlx4_dev *dev, int *srqn);
+void __mlx4_srq_free_icm(struct mlx4_dev *dev, int srqn);
+int __mlx4_mpt_reserve(struct mlx4_dev *dev);
+void __mlx4_mpt_release(struct mlx4_dev *dev, u32 index);
+int __mlx4_mpt_alloc_icm(struct mlx4_dev *dev, u32 index, gfp_t gfp);
+void __mlx4_mpt_free_icm(struct mlx4_dev *dev, u32 index);
+u32 __mlx4_alloc_mtt_range(struct mlx4_dev *dev, int order);
+void __mlx4_free_mtt_range(struct mlx4_dev *dev, u32 first_seg, int order);
+
+int mlx4_WRITE_MTT_wrapper(struct mlx4_dev *dev, int slave,
+			   struct mlx4_vhcr *vhcr,
+			   struct mlx4_cmd_mailbox *inbox,
+			   struct mlx4_cmd_mailbox *outbox,
+			   struct mlx4_cmd_info *cmd);
+int mlx4_SYNC_TPT_wrapper(struct mlx4_dev *dev, int slave,
+			   struct mlx4_vhcr *vhcr,
+			   struct mlx4_cmd_mailbox *inbox,
+			   struct mlx4_cmd_mailbox *outbox,
+			   struct mlx4_cmd_info *cmd);
+int mlx4_SW2HW_MPT_wrapper(struct mlx4_dev *dev, int slave,
+			   struct mlx4_vhcr *vhcr,
+			   struct mlx4_cmd_mailbox *inbox,
+			   struct mlx4_cmd_mailbox *outbox,
+			   struct mlx4_cmd_info *cmd);
+int mlx4_HW2SW_MPT_wrapper(struct mlx4_dev *dev, int slave,
+			   struct mlx4_vhcr *vhcr,
+			   struct mlx4_cmd_mailbox *inbox,
+			   struct mlx4_cmd_mailbox *outbox,
+			   struct mlx4_cmd_info *cmd);
+int mlx4_QUERY_MPT_wrapper(struct mlx4_dev *dev, int slave,
+			   struct mlx4_vhcr *vhcr,
+			   struct mlx4_cmd_mailbox *inbox,
+			   struct mlx4_cmd_mailbox *outbox,
+			   struct mlx4_cmd_info *cmd);
+int mlx4_SW2HW_EQ_wrapper(struct mlx4_dev *dev, int slave,
+			  struct mlx4_vhcr *vhcr,
+			  struct mlx4_cmd_mailbox *inbox,
+			  struct mlx4_cmd_mailbox *outbox,
+			  struct mlx4_cmd_info *cmd);
+int mlx4_DMA_wrapper(struct mlx4_dev *dev, int slave,
+		     struct mlx4_vhcr *vhcr,
+		     struct mlx4_cmd_mailbox *inbox,
+		     struct mlx4_cmd_mailbox *outbox,
+		     struct mlx4_cmd_info *cmd);
+int __mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align,
+			    int *base);
+void __mlx4_qp_release_range(struct mlx4_dev *dev, int base_qpn, int cnt);
+int __mlx4_register_mac(struct mlx4_dev *dev, u8 port, u64 mac);
+void __mlx4_unregister_mac(struct mlx4_dev *dev, u8 port, u64 mac);
+int __mlx4_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
+		     int start_index, int npages, u64 *page_list);
+int __mlx4_counter_alloc(struct mlx4_dev *dev, u32 *idx);
+void __mlx4_counter_free(struct mlx4_dev *dev, u32 idx);
+int __mlx4_xrcd_alloc(struct mlx4_dev *dev, u32 *xrcdn);
+void __mlx4_xrcd_free(struct mlx4_dev *dev, u32 xrcdn);
+
+void mlx4_start_catas_poll(struct mlx4_dev *dev);
+void mlx4_stop_catas_poll(struct mlx4_dev *dev);
+void mlx4_catas_init(void);
+int mlx4_restart_one(struct pci_dev *pdev);
+int mlx4_register_device(struct mlx4_dev *dev);
+void mlx4_unregister_device(struct mlx4_dev *dev);
+void mlx4_dispatch_event(struct mlx4_dev *dev, enum mlx4_dev_event type,
+			 unsigned long param);
+
+struct mlx4_dev_cap;
+struct mlx4_init_hca_param;
+
+u64 mlx4_make_profile(struct mlx4_dev *dev,
+		      struct mlx4_profile *request,
+		      struct mlx4_dev_cap *dev_cap,
+		      struct mlx4_init_hca_param *init_hca);
+void mlx4_master_comm_channel(struct work_struct *work);
+void mlx4_gen_slave_eqe(struct work_struct *work);
+void mlx4_master_handle_slave_flr(struct work_struct *work);
+
+int mlx4_ALLOC_RES_wrapper(struct mlx4_dev *dev, int slave,
+			   struct mlx4_vhcr *vhcr,
+			   struct mlx4_cmd_mailbox *inbox,
+			   struct mlx4_cmd_mailbox *outbox,
+			   struct mlx4_cmd_info *cmd);
+int mlx4_FREE_RES_wrapper(struct mlx4_dev *dev, int slave,
+			  struct mlx4_vhcr *vhcr,
+			  struct mlx4_cmd_mailbox *inbox,
+			  struct mlx4_cmd_mailbox *outbox,
+			  struct mlx4_cmd_info *cmd);
+int mlx4_MAP_EQ_wrapper(struct mlx4_dev *dev, int slave,
+			struct mlx4_vhcr *vhcr, struct mlx4_cmd_mailbox *inbox,
+			struct mlx4_cmd_mailbox *outbox,
+			struct mlx4_cmd_info *cmd);
+int mlx4_COMM_INT_wrapper(struct mlx4_dev *dev, int slave,
+			  struct mlx4_vhcr *vhcr,
+			  struct mlx4_cmd_mailbox *inbox,
+			  struct mlx4_cmd_mailbox *outbox,
+			  struct mlx4_cmd_info *cmd);
+int mlx4_HW2SW_EQ_wrapper(struct mlx4_dev *dev, int slave,
+			    struct mlx4_vhcr *vhcr,
+			    struct mlx4_cmd_mailbox *inbox,
+			    struct mlx4_cmd_mailbox *outbox,
+			  struct mlx4_cmd_info *cmd);
+int mlx4_QUERY_EQ_wrapper(struct mlx4_dev *dev, int slave,
+			  struct mlx4_vhcr *vhcr,
+			  struct mlx4_cmd_mailbox *inbox,
+			  struct mlx4_cmd_mailbox *outbox,
+			  struct mlx4_cmd_info *cmd);
+int mlx4_SW2HW_CQ_wrapper(struct mlx4_dev *dev, int slave,
+			  struct mlx4_vhcr *vhcr,
+			  struct mlx4_cmd_mailbox *inbox,
+			  struct mlx4_cmd_mailbox *outbox,
+			  struct mlx4_cmd_info *cmd);
+int mlx4_HW2SW_CQ_wrapper(struct mlx4_dev *dev, int slave,
+			  struct mlx4_vhcr *vhcr,
+			  struct mlx4_cmd_mailbox *inbox,
+			  struct mlx4_cmd_mailbox *outbox,
+			  struct mlx4_cmd_info *cmd);
+int mlx4_QUERY_CQ_wrapper(struct mlx4_dev *dev, int slave,
+			  struct mlx4_vhcr *vhcr,
+			  struct mlx4_cmd_mailbox *inbox,
+			  struct mlx4_cmd_mailbox *outbox,
+			  struct mlx4_cmd_info *cmd);
+int mlx4_MODIFY_CQ_wrapper(struct mlx4_dev *dev, int slave,
+			  struct mlx4_vhcr *vhcr,
+			  struct mlx4_cmd_mailbox *inbox,
+			  struct mlx4_cmd_mailbox *outbox,
+			   struct mlx4_cmd_info *cmd);
+int mlx4_SW2HW_SRQ_wrapper(struct mlx4_dev *dev, int slave,
+			   struct mlx4_vhcr *vhcr,
+			   struct mlx4_cmd_mailbox *inbox,
+			   struct mlx4_cmd_mailbox *outbox,
+			   struct mlx4_cmd_info *cmd);
+int mlx4_HW2SW_SRQ_wrapper(struct mlx4_dev *dev, int slave,
+			   struct mlx4_vhcr *vhcr,
+			   struct mlx4_cmd_mailbox *inbox,
+			   struct mlx4_cmd_mailbox *outbox,
+			   struct mlx4_cmd_info *cmd);
+int mlx4_QUERY_SRQ_wrapper(struct mlx4_dev *dev, int slave,
+			   struct mlx4_vhcr *vhcr,
+			   struct mlx4_cmd_mailbox *inbox,
+			   struct mlx4_cmd_mailbox *outbox,
+			   struct mlx4_cmd_info *cmd);
+int mlx4_ARM_SRQ_wrapper(struct mlx4_dev *dev, int slave,
+			 struct mlx4_vhcr *vhcr,
+			 struct mlx4_cmd_mailbox *inbox,
+			 struct mlx4_cmd_mailbox *outbox,
+			 struct mlx4_cmd_info *cmd);
+int mlx4_GEN_QP_wrapper(struct mlx4_dev *dev, int slave,
+			struct mlx4_vhcr *vhcr,
+			struct mlx4_cmd_mailbox *inbox,
+			struct mlx4_cmd_mailbox *outbox,
+			struct mlx4_cmd_info *cmd);
+int mlx4_RST2INIT_QP_wrapper(struct mlx4_dev *dev, int slave,
+			     struct mlx4_vhcr *vhcr,
+			     struct mlx4_cmd_mailbox *inbox,
+			     struct mlx4_cmd_mailbox *outbox,
+			     struct mlx4_cmd_info *cmd);
+int mlx4_INIT2INIT_QP_wrapper(struct mlx4_dev *dev, int slave,
+			      struct mlx4_vhcr *vhcr,
+			      struct mlx4_cmd_mailbox *inbox,
+			      struct mlx4_cmd_mailbox *outbox,
+			      struct mlx4_cmd_info *cmd);
+int mlx4_INIT2RTR_QP_wrapper(struct mlx4_dev *dev, int slave,
+			     struct mlx4_vhcr *vhcr,
+			     struct mlx4_cmd_mailbox *inbox,
+			     struct mlx4_cmd_mailbox *outbox,
+			     struct mlx4_cmd_info *cmd);
+int mlx4_RTR2RTS_QP_wrapper(struct mlx4_dev *dev, int slave,
+			    struct mlx4_vhcr *vhcr,
+			    struct mlx4_cmd_mailbox *inbox,
+			    struct mlx4_cmd_mailbox *outbox,
+			    struct mlx4_cmd_info *cmd);
+int mlx4_RTS2RTS_QP_wrapper(struct mlx4_dev *dev, int slave,
+			    struct mlx4_vhcr *vhcr,
+			    struct mlx4_cmd_mailbox *inbox,
+			    struct mlx4_cmd_mailbox *outbox,
+			    struct mlx4_cmd_info *cmd);
+int mlx4_SQERR2RTS_QP_wrapper(struct mlx4_dev *dev, int slave,
+			      struct mlx4_vhcr *vhcr,
+			      struct mlx4_cmd_mailbox *inbox,
+			      struct mlx4_cmd_mailbox *outbox,
+			      struct mlx4_cmd_info *cmd);
+int mlx4_2ERR_QP_wrapper(struct mlx4_dev *dev, int slave,
+			 struct mlx4_vhcr *vhcr,
+			 struct mlx4_cmd_mailbox *inbox,
+			 struct mlx4_cmd_mailbox *outbox,
+			 struct mlx4_cmd_info *cmd);
+int mlx4_RTS2SQD_QP_wrapper(struct mlx4_dev *dev, int slave,
+			    struct mlx4_vhcr *vhcr,
+			    struct mlx4_cmd_mailbox *inbox,
+			    struct mlx4_cmd_mailbox *outbox,
+			    struct mlx4_cmd_info *cmd);
+int mlx4_SQD2SQD_QP_wrapper(struct mlx4_dev *dev, int slave,
+			    struct mlx4_vhcr *vhcr,
+			    struct mlx4_cmd_mailbox *inbox,
+			    struct mlx4_cmd_mailbox *outbox,
+			    struct mlx4_cmd_info *cmd);
+int mlx4_SQD2RTS_QP_wrapper(struct mlx4_dev *dev, int slave,
+			    struct mlx4_vhcr *vhcr,
+			    struct mlx4_cmd_mailbox *inbox,
+			    struct mlx4_cmd_mailbox *outbox,
+			    struct mlx4_cmd_info *cmd);
+int mlx4_2RST_QP_wrapper(struct mlx4_dev *dev, int slave,
+			 struct mlx4_vhcr *vhcr,
+			 struct mlx4_cmd_mailbox *inbox,
+			 struct mlx4_cmd_mailbox *outbox,
+			 struct mlx4_cmd_info *cmd);
+int mlx4_QUERY_QP_wrapper(struct mlx4_dev *dev, int slave,
+			  struct mlx4_vhcr *vhcr,
+			  struct mlx4_cmd_mailbox *inbox,
+			  struct mlx4_cmd_mailbox *outbox,
+			  struct mlx4_cmd_info *cmd);
+
+int mlx4_GEN_EQE(struct mlx4_dev *dev, int slave, struct mlx4_eqe *eqe);
+
+int mlx4_cmd_init(struct mlx4_dev *dev);
+void mlx4_cmd_cleanup(struct mlx4_dev *dev);
+int mlx4_multi_func_init(struct mlx4_dev *dev);
+void mlx4_multi_func_cleanup(struct mlx4_dev *dev);
+void mlx4_cmd_event(struct mlx4_dev *dev, u16 token, u8 status, u64 out_param);
+int mlx4_cmd_use_events(struct mlx4_dev *dev);
+void mlx4_cmd_use_polling(struct mlx4_dev *dev);
+
+int mlx4_comm_cmd(struct mlx4_dev *dev, u8 cmd, u16 param,
+		  unsigned long timeout);
+
+void mlx4_cq_completion(struct mlx4_dev *dev, u32 cqn);
+void mlx4_cq_event(struct mlx4_dev *dev, u32 cqn, int event_type);
+
+void mlx4_qp_event(struct mlx4_dev *dev, u32 qpn, int event_type);
+
+void mlx4_srq_event(struct mlx4_dev *dev, u32 srqn, int event_type);
+
+void mlx4_handle_catas_err(struct mlx4_dev *dev);
+
+int mlx4_SENSE_PORT(struct mlx4_dev *dev, int port,
+		    enum mlx4_port_type *type);
+void mlx4_do_sense_ports(struct mlx4_dev *dev,
+			 enum mlx4_port_type *stype,
+			 enum mlx4_port_type *defaults);
+void mlx4_start_sense(struct mlx4_dev *dev);
+void mlx4_stop_sense(struct mlx4_dev *dev);
+void mlx4_sense_init(struct mlx4_dev *dev);
+int mlx4_check_port_params(struct mlx4_dev *dev,
+			   enum mlx4_port_type *port_type);
+int mlx4_change_port_types(struct mlx4_dev *dev,
+			   enum mlx4_port_type *port_types);
+
+void mlx4_init_mac_table(struct mlx4_dev *dev, struct mlx4_mac_table *table);
+void mlx4_init_vlan_table(struct mlx4_dev *dev, struct mlx4_vlan_table *table);
+void mlx4_init_roce_gid_table(struct mlx4_dev *dev,
+			      struct mlx4_roce_gid_table *table);
+void __mlx4_unregister_vlan(struct mlx4_dev *dev, u8 port, u16 vlan);
+int __mlx4_register_vlan(struct mlx4_dev *dev, u8 port, u16 vlan, int *index);
+
+int mlx4_SET_PORT(struct mlx4_dev *dev, u8 port, int pkey_tbl_sz);
+/* resource tracker functions*/
+int mlx4_get_slave_from_resource_id(struct mlx4_dev *dev,
+				    enum mlx4_resource resource_type,
+				    u64 resource_id, int *slave);
+void mlx4_delete_all_resources_for_slave(struct mlx4_dev *dev, int slave_id);
+void mlx4_reset_roce_gids(struct mlx4_dev *dev, int slave);
+int mlx4_init_resource_tracker(struct mlx4_dev *dev);
+
+void mlx4_free_resource_tracker(struct mlx4_dev *dev,
+				enum mlx4_res_tracker_free_type type);
+
+int mlx4_QUERY_FW_wrapper(struct mlx4_dev *dev, int slave,
+			  struct mlx4_vhcr *vhcr,
+			  struct mlx4_cmd_mailbox *inbox,
+			  struct mlx4_cmd_mailbox *outbox,
+			  struct mlx4_cmd_info *cmd);
+int mlx4_SET_PORT_wrapper(struct mlx4_dev *dev, int slave,
+			  struct mlx4_vhcr *vhcr,
+			  struct mlx4_cmd_mailbox *inbox,
+			  struct mlx4_cmd_mailbox *outbox,
+			  struct mlx4_cmd_info *cmd);
+int mlx4_INIT_PORT_wrapper(struct mlx4_dev *dev, int slave,
+			   struct mlx4_vhcr *vhcr,
+			   struct mlx4_cmd_mailbox *inbox,
+			   struct mlx4_cmd_mailbox *outbox,
+			   struct mlx4_cmd_info *cmd);
+int mlx4_CLOSE_PORT_wrapper(struct mlx4_dev *dev, int slave,
+			    struct mlx4_vhcr *vhcr,
+			    struct mlx4_cmd_mailbox *inbox,
+			    struct mlx4_cmd_mailbox *outbox,
+			    struct mlx4_cmd_info *cmd);
+int mlx4_QUERY_DEV_CAP_wrapper(struct mlx4_dev *dev, int slave,
+			       struct mlx4_vhcr *vhcr,
+			       struct mlx4_cmd_mailbox *inbox,
+			       struct mlx4_cmd_mailbox *outbox,
+			       struct mlx4_cmd_info *cmd);
+int mlx4_QUERY_PORT_wrapper(struct mlx4_dev *dev, int slave,
+			    struct mlx4_vhcr *vhcr,
+			    struct mlx4_cmd_mailbox *inbox,
+			    struct mlx4_cmd_mailbox *outbox,
+			    struct mlx4_cmd_info *cmd);
+int mlx4_get_port_ib_caps(struct mlx4_dev *dev, u8 port, __be32 *caps);
+
+int mlx4_get_slave_pkey_gid_tbl_len(struct mlx4_dev *dev, u8 port,
+				    int *gid_tbl_len, int *pkey_tbl_len);
+
+int mlx4_QP_ATTACH_wrapper(struct mlx4_dev *dev, int slave,
+			   struct mlx4_vhcr *vhcr,
+			   struct mlx4_cmd_mailbox *inbox,
+			   struct mlx4_cmd_mailbox *outbox,
+			   struct mlx4_cmd_info *cmd);
+
+int mlx4_UPDATE_QP_wrapper(struct mlx4_dev *dev, int slave,
+			   struct mlx4_vhcr *vhcr,
+			   struct mlx4_cmd_mailbox *inbox,
+			   struct mlx4_cmd_mailbox *outbox,
+			   struct mlx4_cmd_info *cmd);
+
+int mlx4_PROMISC_wrapper(struct mlx4_dev *dev, int slave,
+			 struct mlx4_vhcr *vhcr,
+			 struct mlx4_cmd_mailbox *inbox,
+			 struct mlx4_cmd_mailbox *outbox,
+			 struct mlx4_cmd_info *cmd);
+int mlx4_qp_detach_common(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16],
+			  enum mlx4_protocol prot, enum mlx4_steer_type steer);
+int mlx4_qp_attach_common(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16],
+			  int block_mcast_loopback, enum mlx4_protocol prot,
+			  enum mlx4_steer_type steer);
+int mlx4_trans_to_dmfs_attach(struct mlx4_dev *dev, struct mlx4_qp *qp,
+			      u8 gid[16], u8 port,
+			      int block_mcast_loopback,
+			      enum mlx4_protocol prot, u64 *reg_id);
+int mlx4_SET_MCAST_FLTR_wrapper(struct mlx4_dev *dev, int slave,
+				struct mlx4_vhcr *vhcr,
+				struct mlx4_cmd_mailbox *inbox,
+				struct mlx4_cmd_mailbox *outbox,
+				struct mlx4_cmd_info *cmd);
+int mlx4_SET_VLAN_FLTR_wrapper(struct mlx4_dev *dev, int slave,
+			       struct mlx4_vhcr *vhcr,
+			       struct mlx4_cmd_mailbox *inbox,
+			       struct mlx4_cmd_mailbox *outbox,
+			       struct mlx4_cmd_info *cmd);
+int mlx4_common_set_vlan_fltr(struct mlx4_dev *dev, int function,
+				     int port, void *buf);
+int mlx4_common_dump_eth_stats(struct mlx4_dev *dev, int slave, u32 in_mod,
+				struct mlx4_cmd_mailbox *outbox);
+int mlx4_DUMP_ETH_STATS_wrapper(struct mlx4_dev *dev, int slave,
+				   struct mlx4_vhcr *vhcr,
+				   struct mlx4_cmd_mailbox *inbox,
+				   struct mlx4_cmd_mailbox *outbox,
+				struct mlx4_cmd_info *cmd);
+int mlx4_PKEY_TABLE_wrapper(struct mlx4_dev *dev, int slave,
+			    struct mlx4_vhcr *vhcr,
+			    struct mlx4_cmd_mailbox *inbox,
+			    struct mlx4_cmd_mailbox *outbox,
+			    struct mlx4_cmd_info *cmd);
+int mlx4_QUERY_IF_STAT_wrapper(struct mlx4_dev *dev, int slave,
+			       struct mlx4_vhcr *vhcr,
+			       struct mlx4_cmd_mailbox *inbox,
+			       struct mlx4_cmd_mailbox *outbox,
+			       struct mlx4_cmd_info *cmd);
+int mlx4_QP_FLOW_STEERING_ATTACH_wrapper(struct mlx4_dev *dev, int slave,
+					 struct mlx4_vhcr *vhcr,
+					 struct mlx4_cmd_mailbox *inbox,
+					 struct mlx4_cmd_mailbox *outbox,
+					 struct mlx4_cmd_info *cmd);
+int mlx4_QP_FLOW_STEERING_DETACH_wrapper(struct mlx4_dev *dev, int slave,
+					 struct mlx4_vhcr *vhcr,
+					 struct mlx4_cmd_mailbox *inbox,
+					 struct mlx4_cmd_mailbox *outbox,
+					 struct mlx4_cmd_info *cmd);
+
+int mlx4_get_mgm_entry_size(struct mlx4_dev *dev);
+int mlx4_get_qp_per_mgm(struct mlx4_dev *dev);
+
+static inline void set_param_l(u64 *arg, u32 val)
+{
+	*arg = (*arg & 0xffffffff00000000ULL) | (u64) val;
+}
+
+static inline void set_param_h(u64 *arg, u32 val)
+{
+	*arg = (*arg & 0xffffffff) | ((u64) val << 32);
+}
+
+static inline u32 get_param_l(u64 *arg)
+{
+	return (u32) (*arg & 0xffffffff);
+}
+
+static inline u32 get_param_h(u64 *arg)
+{
+	return (u32)(*arg >> 32);
+}
+
+static inline spinlock_t *mlx4_tlock(struct mlx4_dev *dev)
+{
+	return &mlx4_priv(dev)->mfunc.master.res_tracker.lock;
+}
+
+#define NOT_MASKED_PD_BITS 17
+
+void mlx4_vf_immed_vlan_work_handler(struct work_struct *_work);
+
+void mlx4_init_quotas(struct mlx4_dev *dev);
+
+int mlx4_get_slave_num_gids(struct mlx4_dev *dev, int slave, int port);
+/* Returns the VF index of slave */
+int mlx4_get_vf_indx(struct mlx4_dev *dev, int slave);
+int mlx4_config_mad_demux(struct mlx4_dev *dev);
+
+#endif /* MLX4_H */
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
new file mode 100644
index 0000000..8fef658
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
@@ -0,0 +1,881 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef _MLX4_EN_H_
+#define _MLX4_EN_H_
+
+#include <linux/bitops.h>
+#include <linux/compiler.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/netdevice.h>
+#include <linux/if_vlan.h>
+#include <linux/net_tstamp.h>
+#ifdef CONFIG_MLX4_EN_DCB
+#include <linux/dcbnl.h>
+#endif
+#include <linux/cpu_rmap.h>
+#include <linux/ptp_clock_kernel.h>
+
+#include <linux/mlx4/device.h>
+#include <linux/mlx4/qp.h>
+#include <linux/mlx4/cq.h>
+#include <linux/mlx4/srq.h>
+#include <linux/mlx4/doorbell.h>
+#include <linux/mlx4/cmd.h>
+
+#include "en_port.h"
+
+#define DRV_NAME	"mlx4_en"
+#define DRV_VERSION	"2.2-1"
+#define DRV_RELDATE	"Feb 2014"
+
+#define MLX4_EN_MSG_LEVEL	(NETIF_MSG_LINK | NETIF_MSG_IFDOWN)
+
+/*
+ * Device constants
+ */
+
+
+#define MLX4_EN_PAGE_SHIFT	12
+#define MLX4_EN_PAGE_SIZE	(1 << MLX4_EN_PAGE_SHIFT)
+#define DEF_RX_RINGS		16
+#define MAX_RX_RINGS		128
+#define MIN_RX_RINGS		4
+#define TXBB_SIZE		64
+#define HEADROOM		(2048 / TXBB_SIZE + 1)
+#define STAMP_STRIDE		64
+#define STAMP_DWORDS		(STAMP_STRIDE / 4)
+#define STAMP_SHIFT		31
+#define STAMP_VAL		0x7fffffff
+#define STATS_DELAY		(HZ / 4)
+#define SERVICE_TASK_DELAY	(HZ / 4)
+#define MAX_NUM_OF_FS_RULES	256
+
+#define MLX4_EN_FILTER_HASH_SHIFT 4
+#define MLX4_EN_FILTER_EXPIRY_QUOTA 60
+
+/* Typical TSO descriptor with 16 gather entries is 352 bytes... */
+#define MAX_DESC_SIZE		512
+#define MAX_DESC_TXBBS		(MAX_DESC_SIZE / TXBB_SIZE)
+
+/*
+ * OS related constants and tunables
+ */
+
+#define MLX4_EN_PRIV_FLAGS_BLUEFLAME 1
+
+#define MLX4_EN_WATCHDOG_TIMEOUT	(15 * HZ)
+
+/* Use the maximum between 16384 and a single page */
+#define MLX4_EN_ALLOC_SIZE	PAGE_ALIGN(16384)
+
+#define MLX4_EN_ALLOC_PREFER_ORDER	PAGE_ALLOC_COSTLY_ORDER
+
+/* Receive fragment sizes; we use at most 3 fragments (for 9600 byte MTU
+ * and 4K allocations) */
+enum {
+	FRAG_SZ0 = 1536 - NET_IP_ALIGN,
+	FRAG_SZ1 = 4096,
+	FRAG_SZ2 = 4096,
+	FRAG_SZ3 = MLX4_EN_ALLOC_SIZE
+};
+#define MLX4_EN_MAX_RX_FRAGS	4
+
+/* Maximum ring sizes */
+#define MLX4_EN_MAX_TX_SIZE	8192
+#define MLX4_EN_MAX_RX_SIZE	8192
+
+/* Minimum ring size for our page-allocation scheme to work */
+#define MLX4_EN_MIN_RX_SIZE	(MLX4_EN_ALLOC_SIZE / SMP_CACHE_BYTES)
+#define MLX4_EN_MIN_TX_SIZE	(4096 / TXBB_SIZE)
+
+#define MLX4_EN_SMALL_PKT_SIZE		64
+#define MLX4_EN_MIN_TX_RING_P_UP	1
+#define MLX4_EN_MAX_TX_RING_P_UP	32
+#define MLX4_EN_NUM_UP			8
+#define MLX4_EN_DEF_TX_RING_SIZE	512
+#define MLX4_EN_DEF_RX_RING_SIZE  	1024
+#define MAX_TX_RINGS			(MLX4_EN_MAX_TX_RING_P_UP * \
+					 MLX4_EN_NUM_UP)
+
+#define MLX4_EN_DEFAULT_TX_WORK		256
+
+/* Target number of packets to coalesce with interrupt moderation */
+#define MLX4_EN_RX_COAL_TARGET	44
+#define MLX4_EN_RX_COAL_TIME	0x10
+
+#define MLX4_EN_TX_COAL_PKTS	16
+#define MLX4_EN_TX_COAL_TIME	0x10
+
+#define MLX4_EN_RX_RATE_LOW		400000
+#define MLX4_EN_RX_COAL_TIME_LOW	0
+#define MLX4_EN_RX_RATE_HIGH		450000
+#define MLX4_EN_RX_COAL_TIME_HIGH	128
+#define MLX4_EN_RX_SIZE_THRESH		1024
+#define MLX4_EN_RX_RATE_THRESH		(1000000 / MLX4_EN_RX_COAL_TIME_HIGH)
+#define MLX4_EN_SAMPLE_INTERVAL		0
+#define MLX4_EN_AVG_PKT_SMALL		256
+
+#define MLX4_EN_AUTO_CONF	0xffff
+
+#define MLX4_EN_DEF_RX_PAUSE	1
+#define MLX4_EN_DEF_TX_PAUSE	1
+
+/* Interval between successive polls in the Tx routine when polling is used
+   instead of interrupts (in per-core Tx rings) - should be power of 2 */
+#define MLX4_EN_TX_POLL_MODER	16
+#define MLX4_EN_TX_POLL_TIMEOUT	(HZ / 4)
+
+#define SMALL_PACKET_SIZE      (256 - NET_IP_ALIGN)
+#define HEADER_COPY_SIZE       (128 - NET_IP_ALIGN)
+#define MLX4_LOOPBACK_TEST_PAYLOAD (HEADER_COPY_SIZE - ETH_HLEN)
+
+#define MLX4_EN_MIN_MTU		46
+#define ETH_BCAST		0xffffffffffffULL
+
+#define MLX4_EN_LOOPBACK_RETRIES	5
+#define MLX4_EN_LOOPBACK_TIMEOUT	100
+
+#ifdef MLX4_EN_PERF_STAT
+/* Number of samples to 'average' */
+#define AVG_SIZE			128
+#define AVG_FACTOR			1024
+#define NUM_PERF_STATS			NUM_PERF_COUNTERS
+
+#define INC_PERF_COUNTER(cnt)		(++(cnt))
+#define ADD_PERF_COUNTER(cnt, add)	((cnt) += (add))
+#define AVG_PERF_COUNTER(cnt, sample) \
+	((cnt) = ((cnt) * (AVG_SIZE - 1) + (sample) * AVG_FACTOR) / AVG_SIZE)
+#define GET_PERF_COUNTER(cnt)		(cnt)
+#define GET_AVG_PERF_COUNTER(cnt)	((cnt) / AVG_FACTOR)
+
+#else
+
+#define NUM_PERF_STATS			0
+#define INC_PERF_COUNTER(cnt)		do {} while (0)
+#define ADD_PERF_COUNTER(cnt, add)	do {} while (0)
+#define AVG_PERF_COUNTER(cnt, sample)	do {} while (0)
+#define GET_PERF_COUNTER(cnt)		(0)
+#define GET_AVG_PERF_COUNTER(cnt)	(0)
+#endif /* MLX4_EN_PERF_STAT */
+
+/* Constants for TX flow */
+enum {
+	MAX_INLINE = 104, /* 128 - 16 - 4 - 4 */
+	MAX_BF = 256,
+	MIN_PKT_LEN = 17,
+};
+
+/*
+ * Configurables
+ */
+
+enum cq_type {
+	RX = 0,
+	TX = 1,
+};
+
+
+/*
+ * Useful macros
+ */
+#define ROUNDUP_LOG2(x)		ilog2(roundup_pow_of_two(x))
+#define XNOR(x, y)		(!(x) == !(y))
+
+
+struct mlx4_en_tx_info {
+	struct sk_buff *skb;
+	dma_addr_t	map0_dma;
+	u32		map0_byte_count;
+	u32		nr_txbb;
+	u32		nr_bytes;
+	u8		linear;
+	u8		data_offset;
+	u8		inl;
+	u8		ts_requested;
+	u8		nr_maps;
+} ____cacheline_aligned_in_smp;
+
+
+#define MLX4_EN_BIT_DESC_OWN	0x80000000
+#define CTRL_SIZE	sizeof(struct mlx4_wqe_ctrl_seg)
+#define MLX4_EN_MEMTYPE_PAD	0x100
+#define DS_SIZE		sizeof(struct mlx4_wqe_data_seg)
+
+
+struct mlx4_en_tx_desc {
+	struct mlx4_wqe_ctrl_seg ctrl;
+	union {
+		struct mlx4_wqe_data_seg data; /* at least one data segment */
+		struct mlx4_wqe_lso_seg lso;
+		struct mlx4_wqe_inline_seg inl;
+	};
+};
+
+#define MLX4_EN_USE_SRQ		0x01000000
+
+#define MLX4_EN_CX3_LOW_ID	0x1000
+#define MLX4_EN_CX3_HIGH_ID	0x1005
+
+struct mlx4_en_rx_alloc {
+	struct page	*page;
+	dma_addr_t	dma;
+	u32		page_offset;
+	u32		page_size;
+};
+
+struct mlx4_en_tx_ring {
+	/* cache line used and dirtied in tx completion
+	 * (mlx4_en_free_tx_buf())
+	 */
+	u32			last_nr_txbb;
+	u32			cons;
+	unsigned long		wake_queue;
+
+	/* cache line used and dirtied in mlx4_en_xmit() */
+	u32			prod ____cacheline_aligned_in_smp;
+	unsigned long		bytes;
+	unsigned long		packets;
+	unsigned long		tx_csum;
+	unsigned long		tso_packets;
+	unsigned long		xmit_more;
+	struct mlx4_bf		bf;
+	unsigned long		queue_stopped;
+
+	/* Following part should be mostly read */
+	cpumask_t		affinity_mask;
+	struct mlx4_qp		qp;
+	struct mlx4_hwq_resources wqres;
+	u32			size; /* number of TXBBs */
+	u32			size_mask;
+	u16			stride;
+	u16			cqn;	/* index of port CQ associated with this ring */
+	u32			buf_size;
+	__be32			doorbell_qpn;
+	__be32			mr_key;
+	void			*buf;
+	struct mlx4_en_tx_info	*tx_info;
+	u8			*bounce_buf;
+	struct mlx4_qp_context	context;
+	int			qpn;
+	enum mlx4_qp_state	qp_state;
+	u8			queue_index;
+	bool			bf_enabled;
+	bool			bf_alloced;
+	struct netdev_queue	*tx_queue;
+	int			hwtstamp_tx_type;
+} ____cacheline_aligned_in_smp;
+
+struct mlx4_en_rx_desc {
+	/* actual number of entries depends on rx ring stride */
+	struct mlx4_wqe_data_seg data[0];
+};
+
+struct mlx4_en_rx_ring {
+	struct mlx4_hwq_resources wqres;
+	struct mlx4_en_rx_alloc page_alloc[MLX4_EN_MAX_RX_FRAGS];
+	u32 size ;	/* number of Rx descs*/
+	u32 actual_size;
+	u32 size_mask;
+	u16 stride;
+	u16 log_stride;
+	u16 cqn;	/* index of port CQ associated with this ring */
+	u32 prod;
+	u32 cons;
+	u32 buf_size;
+	u8  fcs_del;
+	void *buf;
+	void *rx_info;
+	unsigned long bytes;
+	unsigned long packets;
+#ifdef CONFIG_NET_RX_BUSY_POLL
+	unsigned long yields;
+	unsigned long misses;
+	unsigned long cleaned;
+#endif
+	unsigned long csum_ok;
+	unsigned long csum_none;
+	int hwtstamp_rx_filter;
+	cpumask_var_t affinity_mask;
+};
+
+struct mlx4_en_cq {
+	struct mlx4_cq          mcq;
+	struct mlx4_hwq_resources wqres;
+	int                     ring;
+	struct net_device      *dev;
+	struct napi_struct	napi;
+	int size;
+	int buf_size;
+	unsigned vector;
+	enum cq_type is_tx;
+	u16 moder_time;
+	u16 moder_cnt;
+	struct mlx4_cqe *buf;
+#define MLX4_EN_OPCODE_ERROR	0x1e
+
+#ifdef CONFIG_NET_RX_BUSY_POLL
+	unsigned int state;
+#define MLX4_EN_CQ_STATE_IDLE        0
+#define MLX4_EN_CQ_STATE_NAPI     1    /* NAPI owns this CQ */
+#define MLX4_EN_CQ_STATE_POLL     2    /* poll owns this CQ */
+#define MLX4_CQ_LOCKED (MLX4_EN_CQ_STATE_NAPI | MLX4_EN_CQ_STATE_POLL)
+#define MLX4_EN_CQ_STATE_NAPI_YIELD  4    /* NAPI yielded this CQ */
+#define MLX4_EN_CQ_STATE_POLL_YIELD  8    /* poll yielded this CQ */
+#define CQ_YIELD (MLX4_EN_CQ_STATE_NAPI_YIELD | MLX4_EN_CQ_STATE_POLL_YIELD)
+#define CQ_USER_PEND (MLX4_EN_CQ_STATE_POLL | MLX4_EN_CQ_STATE_POLL_YIELD)
+	spinlock_t poll_lock; /* protects from LLS/napi conflicts */
+#endif  /* CONFIG_NET_RX_BUSY_POLL */
+	struct irq_desc *irq_desc;
+};
+
+struct mlx4_en_port_profile {
+	u32 flags;
+	u32 tx_ring_num;
+	u32 rx_ring_num;
+	u32 tx_ring_size;
+	u32 rx_ring_size;
+	u8 rx_pause;
+	u8 rx_ppp;
+	u8 tx_pause;
+	u8 tx_ppp;
+	int rss_rings;
+	int inline_thold;
+};
+
+struct mlx4_en_profile {
+	int rss_xor;
+	int udp_rss;
+	u8 rss_mask;
+	u32 active_ports;
+	u32 small_pkt_int;
+	u8 no_reset;
+	u8 num_tx_rings_p_up;
+	struct mlx4_en_port_profile prof[MLX4_MAX_PORTS + 1];
+};
+
+struct mlx4_en_dev {
+	struct mlx4_dev         *dev;
+	struct pci_dev		*pdev;
+	struct mutex		state_lock;
+	struct net_device       *pndev[MLX4_MAX_PORTS + 1];
+	u32                     port_cnt;
+	bool			device_up;
+	struct mlx4_en_profile  profile;
+	u32			LSO_support;
+	struct workqueue_struct *workqueue;
+	struct device           *dma_device;
+	void __iomem            *uar_map;
+	struct mlx4_uar         priv_uar;
+	struct mlx4_mr		mr;
+	u32                     priv_pdn;
+	spinlock_t              uar_lock;
+	u8			mac_removed[MLX4_MAX_PORTS + 1];
+	rwlock_t		clock_lock;
+	u32			nominal_c_mult;
+	struct cyclecounter	cycles;
+	struct timecounter	clock;
+	unsigned long		last_overflow_check;
+	unsigned long		overflow_period;
+	struct ptp_clock	*ptp_clock;
+	struct ptp_clock_info	ptp_clock_info;
+};
+
+
+struct mlx4_en_rss_map {
+	int base_qpn;
+	struct mlx4_qp qps[MAX_RX_RINGS];
+	enum mlx4_qp_state state[MAX_RX_RINGS];
+	struct mlx4_qp indir_qp;
+	enum mlx4_qp_state indir_state;
+};
+
+struct mlx4_en_port_state {
+	int link_state;
+	int link_speed;
+	int transciver;
+};
+
+struct mlx4_en_pkt_stats {
+	unsigned long broadcast;
+	unsigned long rx_prio[8];
+	unsigned long tx_prio[8];
+#define NUM_PKT_STATS		17
+};
+
+struct mlx4_en_port_stats {
+	unsigned long tso_packets;
+	unsigned long xmit_more;
+	unsigned long queue_stopped;
+	unsigned long wake_queue;
+	unsigned long tx_timeout;
+	unsigned long rx_alloc_failed;
+	unsigned long rx_chksum_good;
+	unsigned long rx_chksum_none;
+	unsigned long tx_chksum_offload;
+#define NUM_PORT_STATS		9
+};
+
+struct mlx4_en_perf_stats {
+	u32 tx_poll;
+	u64 tx_pktsz_avg;
+	u32 inflight_avg;
+	u16 tx_coal_avg;
+	u16 rx_coal_avg;
+	u32 napi_quota;
+#define NUM_PERF_COUNTERS		6
+};
+
+enum mlx4_en_mclist_act {
+	MCLIST_NONE,
+	MCLIST_REM,
+	MCLIST_ADD,
+};
+
+struct mlx4_en_mc_list {
+	struct list_head	list;
+	enum mlx4_en_mclist_act	action;
+	u8			addr[ETH_ALEN];
+	u64			reg_id;
+	u64			tunnel_reg_id;
+};
+
+struct mlx4_en_frag_info {
+	u16 frag_size;
+	u16 frag_prefix_size;
+	u16 frag_stride;
+	u16 frag_align;
+};
+
+#ifdef CONFIG_MLX4_EN_DCB
+/* Minimal TC BW - setting to 0 will block traffic */
+#define MLX4_EN_BW_MIN 1
+#define MLX4_EN_BW_MAX 100 /* Utilize 100% of the line */
+
+#define MLX4_EN_TC_ETS 7
+
+#endif
+
+struct ethtool_flow_id {
+	struct list_head list;
+	struct ethtool_rx_flow_spec flow_spec;
+	u64 id;
+};
+
+enum {
+	MLX4_EN_FLAG_PROMISC		= (1 << 0),
+	MLX4_EN_FLAG_MC_PROMISC		= (1 << 1),
+	/* whether we need to enable hardware loopback by putting dmac
+	 * in Tx WQE
+	 */
+	MLX4_EN_FLAG_ENABLE_HW_LOOPBACK	= (1 << 2),
+	/* whether we need to drop packets that hardware loopback-ed */
+	MLX4_EN_FLAG_RX_FILTER_NEEDED	= (1 << 3),
+	MLX4_EN_FLAG_FORCE_PROMISC	= (1 << 4)
+};
+
+#define MLX4_EN_MAC_HASH_SIZE (1 << BITS_PER_BYTE)
+#define MLX4_EN_MAC_HASH_IDX 5
+
+struct mlx4_en_priv {
+	struct mlx4_en_dev *mdev;
+	struct mlx4_en_port_profile *prof;
+	struct net_device *dev;
+	unsigned long active_vlans[BITS_TO_LONGS(VLAN_N_VID)];
+	struct net_device_stats stats;
+	struct net_device_stats ret_stats;
+	struct mlx4_en_port_state port_state;
+	spinlock_t stats_lock;
+	struct ethtool_flow_id ethtool_rules[MAX_NUM_OF_FS_RULES];
+	/* To allow rules removal while port is going down */
+	struct list_head ethtool_list;
+
+	unsigned long last_moder_packets[MAX_RX_RINGS];
+	unsigned long last_moder_tx_packets;
+	unsigned long last_moder_bytes[MAX_RX_RINGS];
+	unsigned long last_moder_jiffies;
+	int last_moder_time[MAX_RX_RINGS];
+	u16 rx_usecs;
+	u16 rx_frames;
+	u16 tx_usecs;
+	u16 tx_frames;
+	u32 pkt_rate_low;
+	u16 rx_usecs_low;
+	u32 pkt_rate_high;
+	u16 rx_usecs_high;
+	u16 sample_interval;
+	u16 adaptive_rx_coal;
+	u32 msg_enable;
+	u32 loopback_ok;
+	u32 validate_loopback;
+
+	struct mlx4_hwq_resources res;
+	int link_state;
+	int last_link_state;
+	bool port_up;
+	int port;
+	int registered;
+	int allocated;
+	int stride;
+	unsigned char current_mac[ETH_ALEN + 2];
+	int mac_index;
+	unsigned max_mtu;
+	int base_qpn;
+	int cqe_factor;
+	int cqe_size;
+
+	struct mlx4_en_rss_map rss_map;
+	__be32 ctrl_flags;
+	u32 flags;
+	u8 num_tx_rings_p_up;
+	u32 tx_work_limit;
+	u32 tx_ring_num;
+	u32 rx_ring_num;
+	u32 rx_skb_size;
+	struct mlx4_en_frag_info frag_info[MLX4_EN_MAX_RX_FRAGS];
+	u16 num_frags;
+	u16 log_rx_info;
+
+	struct mlx4_en_tx_ring **tx_ring;
+	struct mlx4_en_rx_ring *rx_ring[MAX_RX_RINGS];
+	struct mlx4_en_cq **tx_cq;
+	struct mlx4_en_cq *rx_cq[MAX_RX_RINGS];
+	struct mlx4_qp drop_qp;
+	struct work_struct rx_mode_task;
+	struct work_struct watchdog_task;
+	struct work_struct linkstate_task;
+	struct delayed_work stats_task;
+	struct delayed_work service_task;
+#ifdef CONFIG_MLX4_EN_VXLAN
+	struct work_struct vxlan_add_task;
+	struct work_struct vxlan_del_task;
+#endif
+	struct mlx4_en_perf_stats pstats;
+	struct mlx4_en_pkt_stats pkstats;
+	struct mlx4_en_port_stats port_stats;
+	u64 stats_bitmap;
+	struct list_head mc_list;
+	struct list_head curr_list;
+	u64 broadcast_id;
+	struct mlx4_en_stat_out_mbox hw_stats;
+	int vids[128];
+	bool wol;
+	struct device *ddev;
+	int base_tx_qpn;
+	struct hlist_head mac_hash[MLX4_EN_MAC_HASH_SIZE];
+	struct hwtstamp_config hwtstamp_config;
+
+#ifdef CONFIG_MLX4_EN_DCB
+	struct ieee_ets ets;
+	u16 maxrate[IEEE_8021QAZ_MAX_TCS];
+#endif
+#ifdef CONFIG_RFS_ACCEL
+	spinlock_t filters_lock;
+	int last_filter_id;
+	struct list_head filters;
+	struct hlist_head filter_hash[1 << MLX4_EN_FILTER_HASH_SHIFT];
+#endif
+	u64 tunnel_reg_id;
+	__be16 vxlan_port;
+
+	u32 pflags;
+};
+
+enum mlx4_en_wol {
+	MLX4_EN_WOL_MAGIC = (1ULL << 61),
+	MLX4_EN_WOL_ENABLED = (1ULL << 62),
+};
+
+struct mlx4_mac_entry {
+	struct hlist_node hlist;
+	unsigned char mac[ETH_ALEN + 2];
+	u64 reg_id;
+	struct rcu_head rcu;
+};
+
+static inline struct mlx4_cqe *mlx4_en_get_cqe(void *buf, int idx, int cqe_sz)
+{
+	return buf + idx * cqe_sz;
+}
+
+#ifdef CONFIG_NET_RX_BUSY_POLL
+static inline void mlx4_en_cq_init_lock(struct mlx4_en_cq *cq)
+{
+	spin_lock_init(&cq->poll_lock);
+	cq->state = MLX4_EN_CQ_STATE_IDLE;
+}
+
+/* called from the device poll rutine to get ownership of a cq */
+static inline bool mlx4_en_cq_lock_napi(struct mlx4_en_cq *cq)
+{
+	int rc = true;
+	spin_lock(&cq->poll_lock);
+	if (cq->state & MLX4_CQ_LOCKED) {
+		WARN_ON(cq->state & MLX4_EN_CQ_STATE_NAPI);
+		cq->state |= MLX4_EN_CQ_STATE_NAPI_YIELD;
+		rc = false;
+	} else
+		/* we don't care if someone yielded */
+		cq->state = MLX4_EN_CQ_STATE_NAPI;
+	spin_unlock(&cq->poll_lock);
+	return rc;
+}
+
+/* returns true is someone tried to get the cq while napi had it */
+static inline bool mlx4_en_cq_unlock_napi(struct mlx4_en_cq *cq)
+{
+	int rc = false;
+	spin_lock(&cq->poll_lock);
+	WARN_ON(cq->state & (MLX4_EN_CQ_STATE_POLL |
+			       MLX4_EN_CQ_STATE_NAPI_YIELD));
+
+	if (cq->state & MLX4_EN_CQ_STATE_POLL_YIELD)
+		rc = true;
+	cq->state = MLX4_EN_CQ_STATE_IDLE;
+	spin_unlock(&cq->poll_lock);
+	return rc;
+}
+
+/* called from mlx4_en_low_latency_poll() */
+static inline bool mlx4_en_cq_lock_poll(struct mlx4_en_cq *cq)
+{
+	int rc = true;
+	spin_lock_bh(&cq->poll_lock);
+	if ((cq->state & MLX4_CQ_LOCKED)) {
+		struct net_device *dev = cq->dev;
+		struct mlx4_en_priv *priv = netdev_priv(dev);
+		struct mlx4_en_rx_ring *rx_ring = priv->rx_ring[cq->ring];
+
+		cq->state |= MLX4_EN_CQ_STATE_POLL_YIELD;
+		rc = false;
+		rx_ring->yields++;
+	} else
+		/* preserve yield marks */
+		cq->state |= MLX4_EN_CQ_STATE_POLL;
+	spin_unlock_bh(&cq->poll_lock);
+	return rc;
+}
+
+/* returns true if someone tried to get the cq while it was locked */
+static inline bool mlx4_en_cq_unlock_poll(struct mlx4_en_cq *cq)
+{
+	int rc = false;
+	spin_lock_bh(&cq->poll_lock);
+	WARN_ON(cq->state & (MLX4_EN_CQ_STATE_NAPI));
+
+	if (cq->state & MLX4_EN_CQ_STATE_POLL_YIELD)
+		rc = true;
+	cq->state = MLX4_EN_CQ_STATE_IDLE;
+	spin_unlock_bh(&cq->poll_lock);
+	return rc;
+}
+
+/* true if a socket is polling, even if it did not get the lock */
+static inline bool mlx4_en_cq_busy_polling(struct mlx4_en_cq *cq)
+{
+	WARN_ON(!(cq->state & MLX4_CQ_LOCKED));
+	return cq->state & CQ_USER_PEND;
+}
+#else
+static inline void mlx4_en_cq_init_lock(struct mlx4_en_cq *cq)
+{
+}
+
+static inline bool mlx4_en_cq_lock_napi(struct mlx4_en_cq *cq)
+{
+	return true;
+}
+
+static inline bool mlx4_en_cq_unlock_napi(struct mlx4_en_cq *cq)
+{
+	return false;
+}
+
+static inline bool mlx4_en_cq_lock_poll(struct mlx4_en_cq *cq)
+{
+	return false;
+}
+
+static inline bool mlx4_en_cq_unlock_poll(struct mlx4_en_cq *cq)
+{
+	return false;
+}
+
+static inline bool mlx4_en_cq_busy_polling(struct mlx4_en_cq *cq)
+{
+	return false;
+}
+#endif /* CONFIG_NET_RX_BUSY_POLL */
+
+#define MLX4_EN_WOL_DO_MODIFY (1ULL << 63)
+
+void mlx4_en_update_loopback_state(struct net_device *dev,
+				   netdev_features_t features);
+
+void mlx4_en_destroy_netdev(struct net_device *dev);
+int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port,
+			struct mlx4_en_port_profile *prof);
+
+int mlx4_en_start_port(struct net_device *dev);
+void mlx4_en_stop_port(struct net_device *dev, int detach);
+
+void mlx4_en_free_resources(struct mlx4_en_priv *priv);
+int mlx4_en_alloc_resources(struct mlx4_en_priv *priv);
+
+int mlx4_en_create_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq **pcq,
+		      int entries, int ring, enum cq_type mode, int node);
+void mlx4_en_destroy_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq **pcq);
+int mlx4_en_activate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq,
+			int cq_idx);
+void mlx4_en_deactivate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq);
+int mlx4_en_set_cq_moder(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq);
+int mlx4_en_arm_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq);
+
+void mlx4_en_tx_irq(struct mlx4_cq *mcq);
+u16 mlx4_en_select_queue(struct net_device *dev, struct sk_buff *skb,
+			 void *accel_priv, select_queue_fallback_t fallback);
+netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev);
+
+int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv,
+			   struct mlx4_en_tx_ring **pring,
+			   int qpn, u32 size, u16 stride,
+			   int node, int queue_index);
+void mlx4_en_destroy_tx_ring(struct mlx4_en_priv *priv,
+			     struct mlx4_en_tx_ring **pring);
+int mlx4_en_activate_tx_ring(struct mlx4_en_priv *priv,
+			     struct mlx4_en_tx_ring *ring,
+			     int cq, int user_prio);
+void mlx4_en_deactivate_tx_ring(struct mlx4_en_priv *priv,
+				struct mlx4_en_tx_ring *ring);
+void mlx4_en_set_num_rx_rings(struct mlx4_en_dev *mdev);
+int mlx4_en_create_rx_ring(struct mlx4_en_priv *priv,
+			   struct mlx4_en_rx_ring **pring,
+			   u32 size, u16 stride, int node);
+void mlx4_en_destroy_rx_ring(struct mlx4_en_priv *priv,
+			     struct mlx4_en_rx_ring **pring,
+			     u32 size, u16 stride);
+int mlx4_en_activate_rx_rings(struct mlx4_en_priv *priv);
+void mlx4_en_deactivate_rx_ring(struct mlx4_en_priv *priv,
+				struct mlx4_en_rx_ring *ring);
+int mlx4_en_process_rx_cq(struct net_device *dev,
+			  struct mlx4_en_cq *cq,
+			  int budget);
+int mlx4_en_poll_rx_cq(struct napi_struct *napi, int budget);
+int mlx4_en_poll_tx_cq(struct napi_struct *napi, int budget);
+void mlx4_en_fill_qp_context(struct mlx4_en_priv *priv, int size, int stride,
+		int is_tx, int rss, int qpn, int cqn, int user_prio,
+		struct mlx4_qp_context *context);
+void mlx4_en_sqp_event(struct mlx4_qp *qp, enum mlx4_event event);
+int mlx4_en_map_buffer(struct mlx4_buf *buf);
+void mlx4_en_unmap_buffer(struct mlx4_buf *buf);
+
+void mlx4_en_calc_rx_buf(struct net_device *dev);
+int mlx4_en_config_rss_steer(struct mlx4_en_priv *priv);
+void mlx4_en_release_rss_steer(struct mlx4_en_priv *priv);
+int mlx4_en_create_drop_qp(struct mlx4_en_priv *priv);
+void mlx4_en_destroy_drop_qp(struct mlx4_en_priv *priv);
+int mlx4_en_free_tx_buf(struct net_device *dev, struct mlx4_en_tx_ring *ring);
+void mlx4_en_rx_irq(struct mlx4_cq *mcq);
+
+int mlx4_SET_MCAST_FLTR(struct mlx4_dev *dev, u8 port, u64 mac, u64 clear, u8 mode);
+int mlx4_SET_VLAN_FLTR(struct mlx4_dev *dev, struct mlx4_en_priv *priv);
+
+int mlx4_en_DUMP_ETH_STATS(struct mlx4_en_dev *mdev, u8 port, u8 reset);
+int mlx4_en_QUERY_PORT(struct mlx4_en_dev *mdev, u8 port);
+
+#ifdef CONFIG_MLX4_EN_DCB
+extern const struct dcbnl_rtnl_ops mlx4_en_dcbnl_ops;
+extern const struct dcbnl_rtnl_ops mlx4_en_dcbnl_pfc_ops;
+#endif
+
+int mlx4_en_setup_tc(struct net_device *dev, u8 up);
+
+#ifdef CONFIG_RFS_ACCEL
+void mlx4_en_cleanup_filters(struct mlx4_en_priv *priv);
+#endif
+
+#define MLX4_EN_NUM_SELF_TEST	5
+void mlx4_en_ex_selftest(struct net_device *dev, u32 *flags, u64 *buf);
+void mlx4_en_ptp_overflow_check(struct mlx4_en_dev *mdev);
+
+/*
+ * Functions for time stamping
+ */
+u64 mlx4_en_get_cqe_ts(struct mlx4_cqe *cqe);
+void mlx4_en_fill_hwtstamps(struct mlx4_en_dev *mdev,
+			    struct skb_shared_hwtstamps *hwts,
+			    u64 timestamp);
+void mlx4_en_init_timestamp(struct mlx4_en_dev *mdev);
+void mlx4_en_remove_timestamp(struct mlx4_en_dev *mdev);
+int mlx4_en_timestamp_config(struct net_device *dev,
+			     int tx_type,
+			     int rx_filter);
+
+/* Globals
+ */
+extern const struct ethtool_ops mlx4_en_ethtool_ops;
+
+
+
+/*
+ * printk / logging functions
+ */
+
+__printf(3, 4)
+void en_print(const char *level, const struct mlx4_en_priv *priv,
+	      const char *format, ...);
+
+#define en_dbg(mlevel, priv, format, ...)				\
+do {									\
+	if (NETIF_MSG_##mlevel & (priv)->msg_enable)			\
+		en_print(KERN_DEBUG, priv, format, ##__VA_ARGS__);	\
+} while (0)
+#define en_warn(priv, format, ...)					\
+	en_print(KERN_WARNING, priv, format, ##__VA_ARGS__)
+#define en_err(priv, format, ...)					\
+	en_print(KERN_ERR, priv, format, ##__VA_ARGS__)
+#define en_info(priv, format, ...)					\
+	en_print(KERN_INFO, priv, format, ##__VA_ARGS__)
+
+#define mlx4_err(mdev, format, ...)					\
+	pr_err(DRV_NAME " %s: " format,					\
+	       dev_name(&(mdev)->pdev->dev), ##__VA_ARGS__)
+#define mlx4_info(mdev, format, ...)					\
+	pr_info(DRV_NAME " %s: " format,				\
+		dev_name(&(mdev)->pdev->dev), ##__VA_ARGS__)
+#define mlx4_warn(mdev, format, ...)					\
+	pr_warn(DRV_NAME " %s: " format,				\
+		dev_name(&(mdev)->pdev->dev), ##__VA_ARGS__)
+
+#endif
diff --git a/drivers/net/ethernet/mellanox/mlx4/mr.c b/drivers/net/ethernet/mellanox/mlx4/mr.c
new file mode 100644
index 0000000..193a6ad
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/mr.c
@@ -0,0 +1,1166 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/export.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/vmalloc.h>
+
+#include <linux/mlx4/cmd.h>
+
+#include "mlx4.h"
+#include "icm.h"
+
+static u32 mlx4_buddy_alloc(struct mlx4_buddy *buddy, int order)
+{
+	int o;
+	int m;
+	u32 seg;
+
+	spin_lock(&buddy->lock);
+
+	for (o = order; o <= buddy->max_order; ++o)
+		if (buddy->num_free[o]) {
+			m = 1 << (buddy->max_order - o);
+			seg = find_first_bit(buddy->bits[o], m);
+			if (seg < m)
+				goto found;
+		}
+
+	spin_unlock(&buddy->lock);
+	return -1;
+
+ found:
+	clear_bit(seg, buddy->bits[o]);
+	--buddy->num_free[o];
+
+	while (o > order) {
+		--o;
+		seg <<= 1;
+		set_bit(seg ^ 1, buddy->bits[o]);
+		++buddy->num_free[o];
+	}
+
+	spin_unlock(&buddy->lock);
+
+	seg <<= order;
+
+	return seg;
+}
+
+static void mlx4_buddy_free(struct mlx4_buddy *buddy, u32 seg, int order)
+{
+	seg >>= order;
+
+	spin_lock(&buddy->lock);
+
+	while (test_bit(seg ^ 1, buddy->bits[order])) {
+		clear_bit(seg ^ 1, buddy->bits[order]);
+		--buddy->num_free[order];
+		seg >>= 1;
+		++order;
+	}
+
+	set_bit(seg, buddy->bits[order]);
+	++buddy->num_free[order];
+
+	spin_unlock(&buddy->lock);
+}
+
+static int mlx4_buddy_init(struct mlx4_buddy *buddy, int max_order)
+{
+	int i, s;
+
+	buddy->max_order = max_order;
+	spin_lock_init(&buddy->lock);
+
+	buddy->bits = kcalloc(buddy->max_order + 1, sizeof (long *),
+			      GFP_KERNEL);
+	buddy->num_free = kcalloc((buddy->max_order + 1), sizeof *buddy->num_free,
+				  GFP_KERNEL);
+	if (!buddy->bits || !buddy->num_free)
+		goto err_out;
+
+	for (i = 0; i <= buddy->max_order; ++i) {
+		s = BITS_TO_LONGS(1 << (buddy->max_order - i));
+		buddy->bits[i] = kcalloc(s, sizeof (long), GFP_KERNEL | __GFP_NOWARN);
+		if (!buddy->bits[i]) {
+			buddy->bits[i] = vzalloc(s * sizeof(long));
+			if (!buddy->bits[i])
+				goto err_out_free;
+		}
+	}
+
+	set_bit(0, buddy->bits[buddy->max_order]);
+	buddy->num_free[buddy->max_order] = 1;
+
+	return 0;
+
+err_out_free:
+	for (i = 0; i <= buddy->max_order; ++i)
+		if (buddy->bits[i] && is_vmalloc_addr(buddy->bits[i]))
+			vfree(buddy->bits[i]);
+		else
+			kfree(buddy->bits[i]);
+
+err_out:
+	kfree(buddy->bits);
+	kfree(buddy->num_free);
+
+	return -ENOMEM;
+}
+
+static void mlx4_buddy_cleanup(struct mlx4_buddy *buddy)
+{
+	int i;
+
+	for (i = 0; i <= buddy->max_order; ++i)
+		if (is_vmalloc_addr(buddy->bits[i]))
+			vfree(buddy->bits[i]);
+		else
+			kfree(buddy->bits[i]);
+
+	kfree(buddy->bits);
+	kfree(buddy->num_free);
+}
+
+u32 __mlx4_alloc_mtt_range(struct mlx4_dev *dev, int order)
+{
+	struct mlx4_mr_table *mr_table = &mlx4_priv(dev)->mr_table;
+	u32 seg;
+	int seg_order;
+	u32 offset;
+
+	seg_order = max_t(int, order - log_mtts_per_seg, 0);
+
+	seg = mlx4_buddy_alloc(&mr_table->mtt_buddy, seg_order);
+	if (seg == -1)
+		return -1;
+
+	offset = seg * (1 << log_mtts_per_seg);
+
+	if (mlx4_table_get_range(dev, &mr_table->mtt_table, offset,
+				 offset + (1 << order) - 1)) {
+		mlx4_buddy_free(&mr_table->mtt_buddy, seg, seg_order);
+		return -1;
+	}
+
+	return offset;
+}
+
+static u32 mlx4_alloc_mtt_range(struct mlx4_dev *dev, int order)
+{
+	u64 in_param = 0;
+	u64 out_param;
+	int err;
+
+	if (mlx4_is_mfunc(dev)) {
+		set_param_l(&in_param, order);
+		err = mlx4_cmd_imm(dev, in_param, &out_param, RES_MTT,
+						       RES_OP_RESERVE_AND_MAP,
+						       MLX4_CMD_ALLOC_RES,
+						       MLX4_CMD_TIME_CLASS_A,
+						       MLX4_CMD_WRAPPED);
+		if (err)
+			return -1;
+		return get_param_l(&out_param);
+	}
+	return __mlx4_alloc_mtt_range(dev, order);
+}
+
+int mlx4_mtt_init(struct mlx4_dev *dev, int npages, int page_shift,
+		  struct mlx4_mtt *mtt)
+{
+	int i;
+
+	if (!npages) {
+		mtt->order      = -1;
+		mtt->page_shift = MLX4_ICM_PAGE_SHIFT;
+		return 0;
+	} else
+		mtt->page_shift = page_shift;
+
+	for (mtt->order = 0, i = 1; i < npages; i <<= 1)
+		++mtt->order;
+
+	mtt->offset = mlx4_alloc_mtt_range(dev, mtt->order);
+	if (mtt->offset == -1)
+		return -ENOMEM;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_mtt_init);
+
+void __mlx4_free_mtt_range(struct mlx4_dev *dev, u32 offset, int order)
+{
+	u32 first_seg;
+	int seg_order;
+	struct mlx4_mr_table *mr_table = &mlx4_priv(dev)->mr_table;
+
+	seg_order = max_t(int, order - log_mtts_per_seg, 0);
+	first_seg = offset / (1 << log_mtts_per_seg);
+
+	mlx4_buddy_free(&mr_table->mtt_buddy, first_seg, seg_order);
+	mlx4_table_put_range(dev, &mr_table->mtt_table, offset,
+			     offset + (1 << order) - 1);
+}
+
+static void mlx4_free_mtt_range(struct mlx4_dev *dev, u32 offset, int order)
+{
+	u64 in_param = 0;
+	int err;
+
+	if (mlx4_is_mfunc(dev)) {
+		set_param_l(&in_param, offset);
+		set_param_h(&in_param, order);
+		err = mlx4_cmd(dev, in_param, RES_MTT, RES_OP_RESERVE_AND_MAP,
+						       MLX4_CMD_FREE_RES,
+						       MLX4_CMD_TIME_CLASS_A,
+						       MLX4_CMD_WRAPPED);
+		if (err)
+			mlx4_warn(dev, "Failed to free mtt range at:%d order:%d\n",
+				  offset, order);
+		return;
+	}
+	 __mlx4_free_mtt_range(dev, offset, order);
+}
+
+void mlx4_mtt_cleanup(struct mlx4_dev *dev, struct mlx4_mtt *mtt)
+{
+	if (mtt->order < 0)
+		return;
+
+	mlx4_free_mtt_range(dev, mtt->offset, mtt->order);
+}
+EXPORT_SYMBOL_GPL(mlx4_mtt_cleanup);
+
+u64 mlx4_mtt_addr(struct mlx4_dev *dev, struct mlx4_mtt *mtt)
+{
+	return (u64) mtt->offset * dev->caps.mtt_entry_sz;
+}
+EXPORT_SYMBOL_GPL(mlx4_mtt_addr);
+
+static u32 hw_index_to_key(u32 ind)
+{
+	return (ind >> 24) | (ind << 8);
+}
+
+static u32 key_to_hw_index(u32 key)
+{
+	return (key << 24) | (key >> 8);
+}
+
+static int mlx4_SW2HW_MPT(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox,
+			  int mpt_index)
+{
+	return mlx4_cmd(dev, mailbox->dma, mpt_index,
+			0, MLX4_CMD_SW2HW_MPT, MLX4_CMD_TIME_CLASS_B,
+			MLX4_CMD_WRAPPED);
+}
+
+static int mlx4_HW2SW_MPT(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox,
+			  int mpt_index)
+{
+	return mlx4_cmd_box(dev, 0, mailbox ? mailbox->dma : 0, mpt_index,
+			    !mailbox, MLX4_CMD_HW2SW_MPT,
+			    MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED);
+}
+
+/* Must protect against concurrent access */
+int mlx4_mr_hw_get_mpt(struct mlx4_dev *dev, struct mlx4_mr *mmr,
+		       struct mlx4_mpt_entry ***mpt_entry)
+{
+	int err;
+	int key = key_to_hw_index(mmr->key) & (dev->caps.num_mpts - 1);
+	struct mlx4_cmd_mailbox *mailbox = NULL;
+
+	if (mmr->enabled != MLX4_MPT_EN_HW)
+		return -EINVAL;
+
+	err = mlx4_HW2SW_MPT(dev, NULL, key);
+	if (err) {
+		mlx4_warn(dev, "HW2SW_MPT failed (%d).", err);
+		mlx4_warn(dev, "Most likely the MR has MWs bound to it.\n");
+		return err;
+	}
+
+	mmr->enabled = MLX4_MPT_EN_SW;
+
+	if (!mlx4_is_mfunc(dev)) {
+		**mpt_entry = mlx4_table_find(
+				&mlx4_priv(dev)->mr_table.dmpt_table,
+				key, NULL);
+	} else {
+		mailbox = mlx4_alloc_cmd_mailbox(dev);
+		if (IS_ERR_OR_NULL(mailbox))
+			return PTR_ERR(mailbox);
+
+		err = mlx4_cmd_box(dev, 0, mailbox->dma, key,
+				   0, MLX4_CMD_QUERY_MPT,
+				   MLX4_CMD_TIME_CLASS_B,
+				   MLX4_CMD_WRAPPED);
+		if (err)
+			goto free_mailbox;
+
+		*mpt_entry = (struct mlx4_mpt_entry **)&mailbox->buf;
+	}
+
+	if (!(*mpt_entry) || !(**mpt_entry)) {
+		err = -ENOMEM;
+		goto free_mailbox;
+	}
+
+	return 0;
+
+free_mailbox:
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_mr_hw_get_mpt);
+
+int mlx4_mr_hw_write_mpt(struct mlx4_dev *dev, struct mlx4_mr *mmr,
+			 struct mlx4_mpt_entry **mpt_entry)
+{
+	int err;
+
+	if (!mlx4_is_mfunc(dev)) {
+		/* Make sure any changes to this entry are flushed */
+		wmb();
+
+		*(u8 *)(*mpt_entry) = MLX4_MPT_STATUS_HW;
+
+		/* Make sure the new status is written */
+		wmb();
+
+		err = mlx4_SYNC_TPT(dev);
+	} else {
+		int key = key_to_hw_index(mmr->key) & (dev->caps.num_mpts - 1);
+
+		struct mlx4_cmd_mailbox *mailbox =
+			container_of((void *)mpt_entry, struct mlx4_cmd_mailbox,
+				     buf);
+
+		err = mlx4_SW2HW_MPT(dev, mailbox, key);
+	}
+
+	if (!err) {
+		mmr->pd = be32_to_cpu((*mpt_entry)->pd_flags) & MLX4_MPT_PD_MASK;
+		mmr->enabled = MLX4_MPT_EN_HW;
+	}
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_mr_hw_write_mpt);
+
+void mlx4_mr_hw_put_mpt(struct mlx4_dev *dev,
+			struct mlx4_mpt_entry **mpt_entry)
+{
+	if (mlx4_is_mfunc(dev)) {
+		struct mlx4_cmd_mailbox *mailbox =
+			container_of((void *)mpt_entry, struct mlx4_cmd_mailbox,
+				     buf);
+		mlx4_free_cmd_mailbox(dev, mailbox);
+	}
+}
+EXPORT_SYMBOL_GPL(mlx4_mr_hw_put_mpt);
+
+int mlx4_mr_hw_change_pd(struct mlx4_dev *dev, struct mlx4_mpt_entry *mpt_entry,
+			 u32 pdn)
+{
+	u32 pd_flags = be32_to_cpu(mpt_entry->pd_flags) & ~MLX4_MPT_PD_MASK;
+	/* The wrapper function will put the slave's id here */
+	if (mlx4_is_mfunc(dev))
+		pd_flags &= ~MLX4_MPT_PD_VF_MASK;
+
+	mpt_entry->pd_flags = cpu_to_be32(pd_flags |
+					  (pdn & MLX4_MPT_PD_MASK)
+					  | MLX4_MPT_PD_FLAG_EN_INV);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_mr_hw_change_pd);
+
+int mlx4_mr_hw_change_access(struct mlx4_dev *dev,
+			     struct mlx4_mpt_entry *mpt_entry,
+			     u32 access)
+{
+	u32 flags = (be32_to_cpu(mpt_entry->flags) & ~MLX4_PERM_MASK) |
+		    (access & MLX4_PERM_MASK);
+
+	mpt_entry->flags = cpu_to_be32(flags);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_mr_hw_change_access);
+
+static int mlx4_mr_alloc_reserved(struct mlx4_dev *dev, u32 mridx, u32 pd,
+			   u64 iova, u64 size, u32 access, int npages,
+			   int page_shift, struct mlx4_mr *mr)
+{
+	mr->iova       = iova;
+	mr->size       = size;
+	mr->pd	       = pd;
+	mr->access     = access;
+	mr->enabled    = MLX4_MPT_DISABLED;
+	mr->key	       = hw_index_to_key(mridx);
+
+	return mlx4_mtt_init(dev, npages, page_shift, &mr->mtt);
+}
+
+static int mlx4_WRITE_MTT(struct mlx4_dev *dev,
+			  struct mlx4_cmd_mailbox *mailbox,
+			  int num_entries)
+{
+	return mlx4_cmd(dev, mailbox->dma, num_entries, 0, MLX4_CMD_WRITE_MTT,
+			MLX4_CMD_TIME_CLASS_A,  MLX4_CMD_WRAPPED);
+}
+
+int __mlx4_mpt_reserve(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	return mlx4_bitmap_alloc(&priv->mr_table.mpt_bitmap);
+}
+
+static int mlx4_mpt_reserve(struct mlx4_dev *dev)
+{
+	u64 out_param;
+
+	if (mlx4_is_mfunc(dev)) {
+		if (mlx4_cmd_imm(dev, 0, &out_param, RES_MPT, RES_OP_RESERVE,
+				   MLX4_CMD_ALLOC_RES,
+				   MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED))
+			return -1;
+		return get_param_l(&out_param);
+	}
+	return  __mlx4_mpt_reserve(dev);
+}
+
+void __mlx4_mpt_release(struct mlx4_dev *dev, u32 index)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	mlx4_bitmap_free(&priv->mr_table.mpt_bitmap, index, MLX4_NO_RR);
+}
+
+static void mlx4_mpt_release(struct mlx4_dev *dev, u32 index)
+{
+	u64 in_param = 0;
+
+	if (mlx4_is_mfunc(dev)) {
+		set_param_l(&in_param, index);
+		if (mlx4_cmd(dev, in_param, RES_MPT, RES_OP_RESERVE,
+			       MLX4_CMD_FREE_RES,
+			       MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED))
+			mlx4_warn(dev, "Failed to release mr index:%d\n",
+				  index);
+		return;
+	}
+	__mlx4_mpt_release(dev, index);
+}
+
+int __mlx4_mpt_alloc_icm(struct mlx4_dev *dev, u32 index, gfp_t gfp)
+{
+	struct mlx4_mr_table *mr_table = &mlx4_priv(dev)->mr_table;
+
+	return mlx4_table_get(dev, &mr_table->dmpt_table, index, gfp);
+}
+
+static int mlx4_mpt_alloc_icm(struct mlx4_dev *dev, u32 index, gfp_t gfp)
+{
+	u64 param = 0;
+
+	if (mlx4_is_mfunc(dev)) {
+		set_param_l(&param, index);
+		return mlx4_cmd_imm(dev, param, &param, RES_MPT, RES_OP_MAP_ICM,
+							MLX4_CMD_ALLOC_RES,
+							MLX4_CMD_TIME_CLASS_A,
+							MLX4_CMD_WRAPPED);
+	}
+	return __mlx4_mpt_alloc_icm(dev, index, gfp);
+}
+
+void __mlx4_mpt_free_icm(struct mlx4_dev *dev, u32 index)
+{
+	struct mlx4_mr_table *mr_table = &mlx4_priv(dev)->mr_table;
+
+	mlx4_table_put(dev, &mr_table->dmpt_table, index);
+}
+
+static void mlx4_mpt_free_icm(struct mlx4_dev *dev, u32 index)
+{
+	u64 in_param = 0;
+
+	if (mlx4_is_mfunc(dev)) {
+		set_param_l(&in_param, index);
+		if (mlx4_cmd(dev, in_param, RES_MPT, RES_OP_MAP_ICM,
+			     MLX4_CMD_FREE_RES, MLX4_CMD_TIME_CLASS_A,
+			     MLX4_CMD_WRAPPED))
+			mlx4_warn(dev, "Failed to free icm of mr index:%d\n",
+				  index);
+		return;
+	}
+	return __mlx4_mpt_free_icm(dev, index);
+}
+
+int mlx4_mr_alloc(struct mlx4_dev *dev, u32 pd, u64 iova, u64 size, u32 access,
+		  int npages, int page_shift, struct mlx4_mr *mr)
+{
+	u32 index;
+	int err;
+
+	index = mlx4_mpt_reserve(dev);
+	if (index == -1)
+		return -ENOMEM;
+
+	err = mlx4_mr_alloc_reserved(dev, index, pd, iova, size,
+				     access, npages, page_shift, mr);
+	if (err)
+		mlx4_mpt_release(dev, index);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_mr_alloc);
+
+static int mlx4_mr_free_reserved(struct mlx4_dev *dev, struct mlx4_mr *mr)
+{
+	int err;
+
+	if (mr->enabled == MLX4_MPT_EN_HW) {
+		err = mlx4_HW2SW_MPT(dev, NULL,
+				     key_to_hw_index(mr->key) &
+				     (dev->caps.num_mpts - 1));
+		if (err) {
+			mlx4_warn(dev, "HW2SW_MPT failed (%d), MR has MWs bound to it\n",
+				  err);
+			return err;
+		}
+
+		mr->enabled = MLX4_MPT_EN_SW;
+	}
+	mlx4_mtt_cleanup(dev, &mr->mtt);
+
+	return 0;
+}
+
+int mlx4_mr_free(struct mlx4_dev *dev, struct mlx4_mr *mr)
+{
+	int ret;
+
+	ret = mlx4_mr_free_reserved(dev, mr);
+	if (ret)
+		return ret;
+	if (mr->enabled)
+		mlx4_mpt_free_icm(dev, key_to_hw_index(mr->key));
+	mlx4_mpt_release(dev, key_to_hw_index(mr->key));
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_mr_free);
+
+void mlx4_mr_rereg_mem_cleanup(struct mlx4_dev *dev, struct mlx4_mr *mr)
+{
+	mlx4_mtt_cleanup(dev, &mr->mtt);
+}
+EXPORT_SYMBOL_GPL(mlx4_mr_rereg_mem_cleanup);
+
+int mlx4_mr_rereg_mem_write(struct mlx4_dev *dev, struct mlx4_mr *mr,
+			    u64 iova, u64 size, int npages,
+			    int page_shift, struct mlx4_mpt_entry *mpt_entry)
+{
+	int err;
+
+	mpt_entry->start       = cpu_to_be64(iova);
+	mpt_entry->length      = cpu_to_be64(size);
+	mpt_entry->entity_size = cpu_to_be32(page_shift);
+
+	err = mlx4_mtt_init(dev, npages, page_shift, &mr->mtt);
+	if (err)
+		return err;
+
+	mpt_entry->pd_flags &= cpu_to_be32(MLX4_MPT_PD_MASK |
+					   MLX4_MPT_PD_FLAG_EN_INV);
+	mpt_entry->flags    &= cpu_to_be32(MLX4_MPT_FLAG_FREE |
+					   MLX4_MPT_FLAG_SW_OWNS);
+	if (mr->mtt.order < 0) {
+		mpt_entry->flags |= cpu_to_be32(MLX4_MPT_FLAG_PHYSICAL);
+		mpt_entry->mtt_addr = 0;
+	} else {
+		mpt_entry->mtt_addr = cpu_to_be64(mlx4_mtt_addr(dev,
+						  &mr->mtt));
+		if (mr->mtt.page_shift == 0)
+			mpt_entry->mtt_sz    = cpu_to_be32(1 << mr->mtt.order);
+	}
+	if (mr->mtt.order >= 0 && mr->mtt.page_shift == 0) {
+		/* fast register MR in free state */
+		mpt_entry->flags    |= cpu_to_be32(MLX4_MPT_FLAG_FREE);
+		mpt_entry->pd_flags |= cpu_to_be32(MLX4_MPT_PD_FLAG_FAST_REG |
+						   MLX4_MPT_PD_FLAG_RAE);
+	} else {
+		mpt_entry->flags    |= cpu_to_be32(MLX4_MPT_FLAG_SW_OWNS);
+	}
+	mr->enabled = MLX4_MPT_EN_SW;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_mr_rereg_mem_write);
+
+int mlx4_mr_enable(struct mlx4_dev *dev, struct mlx4_mr *mr)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_mpt_entry *mpt_entry;
+	int err;
+
+	err = mlx4_mpt_alloc_icm(dev, key_to_hw_index(mr->key), GFP_KERNEL);
+	if (err)
+		return err;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox)) {
+		err = PTR_ERR(mailbox);
+		goto err_table;
+	}
+	mpt_entry = mailbox->buf;
+	mpt_entry->flags = cpu_to_be32(MLX4_MPT_FLAG_MIO	 |
+				       MLX4_MPT_FLAG_REGION	 |
+				       mr->access);
+
+	mpt_entry->key	       = cpu_to_be32(key_to_hw_index(mr->key));
+	mpt_entry->pd_flags    = cpu_to_be32(mr->pd | MLX4_MPT_PD_FLAG_EN_INV);
+	mpt_entry->start       = cpu_to_be64(mr->iova);
+	mpt_entry->length      = cpu_to_be64(mr->size);
+	mpt_entry->entity_size = cpu_to_be32(mr->mtt.page_shift);
+
+	if (mr->mtt.order < 0) {
+		mpt_entry->flags |= cpu_to_be32(MLX4_MPT_FLAG_PHYSICAL);
+		mpt_entry->mtt_addr = 0;
+	} else {
+		mpt_entry->mtt_addr = cpu_to_be64(mlx4_mtt_addr(dev,
+						  &mr->mtt));
+	}
+
+	if (mr->mtt.order >= 0 && mr->mtt.page_shift == 0) {
+		/* fast register MR in free state */
+		mpt_entry->flags    |= cpu_to_be32(MLX4_MPT_FLAG_FREE);
+		mpt_entry->pd_flags |= cpu_to_be32(MLX4_MPT_PD_FLAG_FAST_REG |
+						   MLX4_MPT_PD_FLAG_RAE);
+		mpt_entry->mtt_sz    = cpu_to_be32(1 << mr->mtt.order);
+	} else {
+		mpt_entry->flags    |= cpu_to_be32(MLX4_MPT_FLAG_SW_OWNS);
+	}
+
+	err = mlx4_SW2HW_MPT(dev, mailbox,
+			     key_to_hw_index(mr->key) & (dev->caps.num_mpts - 1));
+	if (err) {
+		mlx4_warn(dev, "SW2HW_MPT failed (%d)\n", err);
+		goto err_cmd;
+	}
+	mr->enabled = MLX4_MPT_EN_HW;
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+
+	return 0;
+
+err_cmd:
+	mlx4_free_cmd_mailbox(dev, mailbox);
+
+err_table:
+	mlx4_mpt_free_icm(dev, key_to_hw_index(mr->key));
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_mr_enable);
+
+static int mlx4_write_mtt_chunk(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
+				int start_index, int npages, u64 *page_list)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	__be64 *mtts;
+	dma_addr_t dma_handle;
+	int i;
+
+	mtts = mlx4_table_find(&priv->mr_table.mtt_table, mtt->offset +
+			       start_index, &dma_handle);
+
+	if (!mtts)
+		return -ENOMEM;
+
+	dma_sync_single_for_cpu(&dev->pdev->dev, dma_handle,
+				npages * sizeof (u64), DMA_TO_DEVICE);
+
+	for (i = 0; i < npages; ++i)
+		mtts[i] = cpu_to_be64(page_list[i] | MLX4_MTT_FLAG_PRESENT);
+
+	dma_sync_single_for_device(&dev->pdev->dev, dma_handle,
+				   npages * sizeof (u64), DMA_TO_DEVICE);
+
+	return 0;
+}
+
+int __mlx4_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
+		     int start_index, int npages, u64 *page_list)
+{
+	int err = 0;
+	int chunk;
+	int mtts_per_page;
+	int max_mtts_first_page;
+
+	/* compute how may mtts fit in the first page */
+	mtts_per_page = PAGE_SIZE / sizeof(u64);
+	max_mtts_first_page = mtts_per_page - (mtt->offset + start_index)
+			      % mtts_per_page;
+
+	chunk = min_t(int, max_mtts_first_page, npages);
+
+	while (npages > 0) {
+		err = mlx4_write_mtt_chunk(dev, mtt, start_index, chunk, page_list);
+		if (err)
+			return err;
+		npages      -= chunk;
+		start_index += chunk;
+		page_list   += chunk;
+
+		chunk = min_t(int, mtts_per_page, npages);
+	}
+	return err;
+}
+
+int mlx4_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
+		   int start_index, int npages, u64 *page_list)
+{
+	struct mlx4_cmd_mailbox *mailbox = NULL;
+	__be64 *inbox = NULL;
+	int chunk;
+	int err = 0;
+	int i;
+
+	if (mtt->order < 0)
+		return -EINVAL;
+
+	if (mlx4_is_mfunc(dev)) {
+		mailbox = mlx4_alloc_cmd_mailbox(dev);
+		if (IS_ERR(mailbox))
+			return PTR_ERR(mailbox);
+		inbox = mailbox->buf;
+
+		while (npages > 0) {
+			chunk = min_t(int, MLX4_MAILBOX_SIZE / sizeof(u64) - 2,
+				      npages);
+			inbox[0] = cpu_to_be64(mtt->offset + start_index);
+			inbox[1] = 0;
+			for (i = 0; i < chunk; ++i)
+				inbox[i + 2] = cpu_to_be64(page_list[i] |
+					       MLX4_MTT_FLAG_PRESENT);
+			err = mlx4_WRITE_MTT(dev, mailbox, chunk);
+			if (err) {
+				mlx4_free_cmd_mailbox(dev, mailbox);
+				return err;
+			}
+
+			npages      -= chunk;
+			start_index += chunk;
+			page_list   += chunk;
+		}
+		mlx4_free_cmd_mailbox(dev, mailbox);
+		return err;
+	}
+
+	return __mlx4_write_mtt(dev, mtt, start_index, npages, page_list);
+}
+EXPORT_SYMBOL_GPL(mlx4_write_mtt);
+
+int mlx4_buf_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
+		       struct mlx4_buf *buf, gfp_t gfp)
+{
+	u64 *page_list;
+	int err;
+	int i;
+
+	page_list = kmalloc(buf->npages * sizeof *page_list,
+			    gfp);
+	if (!page_list)
+		return -ENOMEM;
+
+	for (i = 0; i < buf->npages; ++i)
+		if (buf->nbufs == 1)
+			page_list[i] = buf->direct.map + (i << buf->page_shift);
+		else
+			page_list[i] = buf->page_list[i].map;
+
+	err = mlx4_write_mtt(dev, mtt, 0, buf->npages, page_list);
+
+	kfree(page_list);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_buf_write_mtt);
+
+int mlx4_mw_alloc(struct mlx4_dev *dev, u32 pd, enum mlx4_mw_type type,
+		  struct mlx4_mw *mw)
+{
+	u32 index;
+
+	if ((type == MLX4_MW_TYPE_1 &&
+	     !(dev->caps.flags & MLX4_DEV_CAP_FLAG_MEM_WINDOW)) ||
+	     (type == MLX4_MW_TYPE_2 &&
+	     !(dev->caps.bmme_flags & MLX4_BMME_FLAG_TYPE_2_WIN)))
+		return -ENOTSUPP;
+
+	index = mlx4_mpt_reserve(dev);
+	if (index == -1)
+		return -ENOMEM;
+
+	mw->key	    = hw_index_to_key(index);
+	mw->pd      = pd;
+	mw->type    = type;
+	mw->enabled = MLX4_MPT_DISABLED;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_mw_alloc);
+
+int mlx4_mw_enable(struct mlx4_dev *dev, struct mlx4_mw *mw)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_mpt_entry *mpt_entry;
+	int err;
+
+	err = mlx4_mpt_alloc_icm(dev, key_to_hw_index(mw->key), GFP_KERNEL);
+	if (err)
+		return err;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox)) {
+		err = PTR_ERR(mailbox);
+		goto err_table;
+	}
+	mpt_entry = mailbox->buf;
+
+	/* Note that the MLX4_MPT_FLAG_REGION bit in mpt_entry->flags is turned
+	 * off, thus creating a memory window and not a memory region.
+	 */
+	mpt_entry->key	       = cpu_to_be32(key_to_hw_index(mw->key));
+	mpt_entry->pd_flags    = cpu_to_be32(mw->pd);
+	if (mw->type == MLX4_MW_TYPE_2) {
+		mpt_entry->flags    |= cpu_to_be32(MLX4_MPT_FLAG_FREE);
+		mpt_entry->qpn       = cpu_to_be32(MLX4_MPT_QP_FLAG_BOUND_QP);
+		mpt_entry->pd_flags |= cpu_to_be32(MLX4_MPT_PD_FLAG_EN_INV);
+	}
+
+	err = mlx4_SW2HW_MPT(dev, mailbox,
+			     key_to_hw_index(mw->key) &
+			     (dev->caps.num_mpts - 1));
+	if (err) {
+		mlx4_warn(dev, "SW2HW_MPT failed (%d)\n", err);
+		goto err_cmd;
+	}
+	mw->enabled = MLX4_MPT_EN_HW;
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+
+	return 0;
+
+err_cmd:
+	mlx4_free_cmd_mailbox(dev, mailbox);
+
+err_table:
+	mlx4_mpt_free_icm(dev, key_to_hw_index(mw->key));
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_mw_enable);
+
+void mlx4_mw_free(struct mlx4_dev *dev, struct mlx4_mw *mw)
+{
+	int err;
+
+	if (mw->enabled == MLX4_MPT_EN_HW) {
+		err = mlx4_HW2SW_MPT(dev, NULL,
+				     key_to_hw_index(mw->key) &
+				     (dev->caps.num_mpts - 1));
+		if (err)
+			mlx4_warn(dev, "xxx HW2SW_MPT failed (%d)\n", err);
+
+		mw->enabled = MLX4_MPT_EN_SW;
+	}
+	if (mw->enabled)
+		mlx4_mpt_free_icm(dev, key_to_hw_index(mw->key));
+	mlx4_mpt_release(dev, key_to_hw_index(mw->key));
+}
+EXPORT_SYMBOL_GPL(mlx4_mw_free);
+
+int mlx4_init_mr_table(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_mr_table *mr_table = &priv->mr_table;
+	int err;
+
+	/* Nothing to do for slaves - all MR handling is forwarded
+	* to the master */
+	if (mlx4_is_slave(dev))
+		return 0;
+
+	if (!is_power_of_2(dev->caps.num_mpts))
+		return -EINVAL;
+
+	err = mlx4_bitmap_init(&mr_table->mpt_bitmap, dev->caps.num_mpts,
+			       ~0, dev->caps.reserved_mrws, 0);
+	if (err)
+		return err;
+
+	err = mlx4_buddy_init(&mr_table->mtt_buddy,
+			      ilog2((u32)dev->caps.num_mtts /
+			      (1 << log_mtts_per_seg)));
+	if (err)
+		goto err_buddy;
+
+	if (dev->caps.reserved_mtts) {
+		priv->reserved_mtts =
+			mlx4_alloc_mtt_range(dev,
+					     fls(dev->caps.reserved_mtts - 1));
+		if (priv->reserved_mtts < 0) {
+			mlx4_warn(dev, "MTT table of order %u is too small\n",
+				  mr_table->mtt_buddy.max_order);
+			err = -ENOMEM;
+			goto err_reserve_mtts;
+		}
+	}
+
+	return 0;
+
+err_reserve_mtts:
+	mlx4_buddy_cleanup(&mr_table->mtt_buddy);
+
+err_buddy:
+	mlx4_bitmap_cleanup(&mr_table->mpt_bitmap);
+
+	return err;
+}
+
+void mlx4_cleanup_mr_table(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_mr_table *mr_table = &priv->mr_table;
+
+	if (mlx4_is_slave(dev))
+		return;
+	if (priv->reserved_mtts >= 0)
+		mlx4_free_mtt_range(dev, priv->reserved_mtts,
+				    fls(dev->caps.reserved_mtts - 1));
+	mlx4_buddy_cleanup(&mr_table->mtt_buddy);
+	mlx4_bitmap_cleanup(&mr_table->mpt_bitmap);
+}
+
+static inline int mlx4_check_fmr(struct mlx4_fmr *fmr, u64 *page_list,
+				  int npages, u64 iova)
+{
+	int i, page_mask;
+
+	if (npages > fmr->max_pages)
+		return -EINVAL;
+
+	page_mask = (1 << fmr->page_shift) - 1;
+
+	/* We are getting page lists, so va must be page aligned. */
+	if (iova & page_mask)
+		return -EINVAL;
+
+	/* Trust the user not to pass misaligned data in page_list */
+	if (0)
+		for (i = 0; i < npages; ++i) {
+			if (page_list[i] & ~page_mask)
+				return -EINVAL;
+		}
+
+	if (fmr->maps >= fmr->max_maps)
+		return -EINVAL;
+
+	return 0;
+}
+
+int mlx4_map_phys_fmr(struct mlx4_dev *dev, struct mlx4_fmr *fmr, u64 *page_list,
+		      int npages, u64 iova, u32 *lkey, u32 *rkey)
+{
+	u32 key;
+	int i, err;
+
+	err = mlx4_check_fmr(fmr, page_list, npages, iova);
+	if (err)
+		return err;
+
+	++fmr->maps;
+
+	key = key_to_hw_index(fmr->mr.key);
+	key += dev->caps.num_mpts;
+	*lkey = *rkey = fmr->mr.key = hw_index_to_key(key);
+
+	*(u8 *) fmr->mpt = MLX4_MPT_STATUS_SW;
+
+	/* Make sure MPT status is visible before writing MTT entries */
+	wmb();
+
+	dma_sync_single_for_cpu(&dev->pdev->dev, fmr->dma_handle,
+				npages * sizeof(u64), DMA_TO_DEVICE);
+
+	for (i = 0; i < npages; ++i)
+		fmr->mtts[i] = cpu_to_be64(page_list[i] | MLX4_MTT_FLAG_PRESENT);
+
+	dma_sync_single_for_device(&dev->pdev->dev, fmr->dma_handle,
+				   npages * sizeof(u64), DMA_TO_DEVICE);
+
+	fmr->mpt->key    = cpu_to_be32(key);
+	fmr->mpt->lkey   = cpu_to_be32(key);
+	fmr->mpt->length = cpu_to_be64(npages * (1ull << fmr->page_shift));
+	fmr->mpt->start  = cpu_to_be64(iova);
+
+	/* Make MTT entries are visible before setting MPT status */
+	wmb();
+
+	*(u8 *) fmr->mpt = MLX4_MPT_STATUS_HW;
+
+	/* Make sure MPT status is visible before consumer can use FMR */
+	wmb();
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_map_phys_fmr);
+
+int mlx4_fmr_alloc(struct mlx4_dev *dev, u32 pd, u32 access, int max_pages,
+		   int max_maps, u8 page_shift, struct mlx4_fmr *fmr)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int err = -ENOMEM;
+
+	if (max_maps > dev->caps.max_fmr_maps)
+		return -EINVAL;
+
+	if (page_shift < (ffs(dev->caps.page_size_cap) - 1) || page_shift >= 32)
+		return -EINVAL;
+
+	/* All MTTs must fit in the same page */
+	if (max_pages * sizeof *fmr->mtts > PAGE_SIZE)
+		return -EINVAL;
+
+	fmr->page_shift = page_shift;
+	fmr->max_pages  = max_pages;
+	fmr->max_maps   = max_maps;
+	fmr->maps = 0;
+
+	err = mlx4_mr_alloc(dev, pd, 0, 0, access, max_pages,
+			    page_shift, &fmr->mr);
+	if (err)
+		return err;
+
+	fmr->mtts = mlx4_table_find(&priv->mr_table.mtt_table,
+				    fmr->mr.mtt.offset,
+				    &fmr->dma_handle);
+
+	if (!fmr->mtts) {
+		err = -ENOMEM;
+		goto err_free;
+	}
+
+	return 0;
+
+err_free:
+	(void) mlx4_mr_free(dev, &fmr->mr);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_fmr_alloc);
+
+int mlx4_fmr_enable(struct mlx4_dev *dev, struct mlx4_fmr *fmr)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int err;
+
+	err = mlx4_mr_enable(dev, &fmr->mr);
+	if (err)
+		return err;
+
+	fmr->mpt = mlx4_table_find(&priv->mr_table.dmpt_table,
+				    key_to_hw_index(fmr->mr.key), NULL);
+	if (!fmr->mpt)
+		return -ENOMEM;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_fmr_enable);
+
+void mlx4_fmr_unmap(struct mlx4_dev *dev, struct mlx4_fmr *fmr,
+		    u32 *lkey, u32 *rkey)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	int err;
+
+	if (!fmr->maps)
+		return;
+
+	fmr->maps = 0;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox)) {
+		err = PTR_ERR(mailbox);
+		pr_warn("mlx4_ib: mlx4_alloc_cmd_mailbox failed (%d)\n", err);
+		return;
+	}
+
+	err = mlx4_HW2SW_MPT(dev, NULL,
+			     key_to_hw_index(fmr->mr.key) &
+			     (dev->caps.num_mpts - 1));
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	if (err) {
+		pr_warn("mlx4_ib: mlx4_HW2SW_MPT failed (%d)\n", err);
+		return;
+	}
+	fmr->mr.enabled = MLX4_MPT_EN_SW;
+}
+EXPORT_SYMBOL_GPL(mlx4_fmr_unmap);
+
+int mlx4_fmr_free(struct mlx4_dev *dev, struct mlx4_fmr *fmr)
+{
+	int ret;
+
+	if (fmr->maps)
+		return -EBUSY;
+
+	ret = mlx4_mr_free(dev, &fmr->mr);
+	if (ret)
+		return ret;
+	fmr->mr.enabled = MLX4_MPT_DISABLED;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_fmr_free);
+
+int mlx4_SYNC_TPT(struct mlx4_dev *dev)
+{
+	return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_SYNC_TPT, 1000,
+			MLX4_CMD_NATIVE);
+}
+EXPORT_SYMBOL_GPL(mlx4_SYNC_TPT);
diff --git a/drivers/net/ethernet/mellanox/mlx4/pd.c b/drivers/net/ethernet/mellanox/mlx4/pd.c
new file mode 100644
index 0000000..7421607
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/pd.c
@@ -0,0 +1,286 @@
+/*
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc.  All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/export.h>
+#include <linux/io-mapping.h>
+
+#include <asm/page.h>
+
+#include "mlx4.h"
+#include "icm.h"
+
+enum {
+	MLX4_NUM_RESERVED_UARS = 8
+};
+
+int mlx4_pd_alloc(struct mlx4_dev *dev, u32 *pdn)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	*pdn = mlx4_bitmap_alloc(&priv->pd_bitmap);
+	if (*pdn == -1)
+		return -ENOMEM;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_pd_alloc);
+
+void mlx4_pd_free(struct mlx4_dev *dev, u32 pdn)
+{
+	mlx4_bitmap_free(&mlx4_priv(dev)->pd_bitmap, pdn, MLX4_USE_RR);
+}
+EXPORT_SYMBOL_GPL(mlx4_pd_free);
+
+int __mlx4_xrcd_alloc(struct mlx4_dev *dev, u32 *xrcdn)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	*xrcdn = mlx4_bitmap_alloc(&priv->xrcd_bitmap);
+	if (*xrcdn == -1)
+		return -ENOMEM;
+
+	return 0;
+}
+
+int mlx4_xrcd_alloc(struct mlx4_dev *dev, u32 *xrcdn)
+{
+	u64 out_param;
+	int err;
+
+	if (mlx4_is_mfunc(dev)) {
+		err = mlx4_cmd_imm(dev, 0, &out_param,
+				   RES_XRCD, RES_OP_RESERVE,
+				   MLX4_CMD_ALLOC_RES,
+				   MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED);
+		if (err)
+			return err;
+
+		*xrcdn = get_param_l(&out_param);
+		return 0;
+	}
+	return __mlx4_xrcd_alloc(dev, xrcdn);
+}
+EXPORT_SYMBOL_GPL(mlx4_xrcd_alloc);
+
+void __mlx4_xrcd_free(struct mlx4_dev *dev, u32 xrcdn)
+{
+	mlx4_bitmap_free(&mlx4_priv(dev)->xrcd_bitmap, xrcdn, MLX4_USE_RR);
+}
+
+void mlx4_xrcd_free(struct mlx4_dev *dev, u32 xrcdn)
+{
+	u64 in_param = 0;
+	int err;
+
+	if (mlx4_is_mfunc(dev)) {
+		set_param_l(&in_param, xrcdn);
+		err = mlx4_cmd(dev, in_param, RES_XRCD,
+			       RES_OP_RESERVE, MLX4_CMD_FREE_RES,
+			       MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED);
+		if (err)
+			mlx4_warn(dev, "Failed to release xrcdn %d\n", xrcdn);
+	} else
+		__mlx4_xrcd_free(dev, xrcdn);
+}
+EXPORT_SYMBOL_GPL(mlx4_xrcd_free);
+
+int mlx4_init_pd_table(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	return mlx4_bitmap_init(&priv->pd_bitmap, dev->caps.num_pds,
+				(1 << NOT_MASKED_PD_BITS) - 1,
+				 dev->caps.reserved_pds, 0);
+}
+
+void mlx4_cleanup_pd_table(struct mlx4_dev *dev)
+{
+	mlx4_bitmap_cleanup(&mlx4_priv(dev)->pd_bitmap);
+}
+
+int mlx4_init_xrcd_table(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	return mlx4_bitmap_init(&priv->xrcd_bitmap, (1 << 16),
+				(1 << 16) - 1, dev->caps.reserved_xrcds + 1, 0);
+}
+
+void mlx4_cleanup_xrcd_table(struct mlx4_dev *dev)
+{
+	mlx4_bitmap_cleanup(&mlx4_priv(dev)->xrcd_bitmap);
+}
+
+int mlx4_uar_alloc(struct mlx4_dev *dev, struct mlx4_uar *uar)
+{
+	int offset;
+
+	uar->index = mlx4_bitmap_alloc(&mlx4_priv(dev)->uar_table.bitmap);
+	if (uar->index == -1)
+		return -ENOMEM;
+
+	if (mlx4_is_slave(dev))
+		offset = uar->index % ((int) pci_resource_len(dev->pdev, 2) /
+				       dev->caps.uar_page_size);
+	else
+		offset = uar->index;
+	uar->pfn = (pci_resource_start(dev->pdev, 2) >> PAGE_SHIFT) + offset;
+	uar->map = NULL;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_uar_alloc);
+
+void mlx4_uar_free(struct mlx4_dev *dev, struct mlx4_uar *uar)
+{
+	mlx4_bitmap_free(&mlx4_priv(dev)->uar_table.bitmap, uar->index, MLX4_USE_RR);
+}
+EXPORT_SYMBOL_GPL(mlx4_uar_free);
+
+int mlx4_bf_alloc(struct mlx4_dev *dev, struct mlx4_bf *bf, int node)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_uar *uar;
+	int err = 0;
+	int idx;
+
+	if (!priv->bf_mapping)
+		return -ENOMEM;
+
+	mutex_lock(&priv->bf_mutex);
+	if (!list_empty(&priv->bf_list))
+		uar = list_entry(priv->bf_list.next, struct mlx4_uar, bf_list);
+	else {
+		if (mlx4_bitmap_avail(&priv->uar_table.bitmap) < MLX4_NUM_RESERVED_UARS) {
+			err = -ENOMEM;
+			goto out;
+		}
+		uar = kmalloc_node(sizeof(*uar), GFP_KERNEL, node);
+		if (!uar) {
+			uar = kmalloc(sizeof(*uar), GFP_KERNEL);
+			if (!uar) {
+				err = -ENOMEM;
+				goto out;
+			}
+		}
+		err = mlx4_uar_alloc(dev, uar);
+		if (err)
+			goto free_kmalloc;
+
+		uar->map = ioremap(uar->pfn << PAGE_SHIFT, PAGE_SIZE);
+		if (!uar->map) {
+			err = -ENOMEM;
+			goto free_uar;
+		}
+
+		uar->bf_map = io_mapping_map_wc(priv->bf_mapping, uar->index << PAGE_SHIFT);
+		if (!uar->bf_map) {
+			err = -ENOMEM;
+			goto unamp_uar;
+		}
+		uar->free_bf_bmap = 0;
+		list_add(&uar->bf_list, &priv->bf_list);
+	}
+
+	bf->uar = uar;
+	idx = ffz(uar->free_bf_bmap);
+	uar->free_bf_bmap |= 1 << idx;
+	bf->uar = uar;
+	bf->offset = 0;
+	bf->buf_size = dev->caps.bf_reg_size / 2;
+	bf->reg = uar->bf_map + idx * dev->caps.bf_reg_size;
+	if (uar->free_bf_bmap == (1 << dev->caps.bf_regs_per_page) - 1)
+		list_del_init(&uar->bf_list);
+
+	goto out;
+
+unamp_uar:
+	bf->uar = NULL;
+	iounmap(uar->map);
+
+free_uar:
+	mlx4_uar_free(dev, uar);
+
+free_kmalloc:
+	kfree(uar);
+
+out:
+	mutex_unlock(&priv->bf_mutex);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_bf_alloc);
+
+void mlx4_bf_free(struct mlx4_dev *dev, struct mlx4_bf *bf)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int idx;
+
+	if (!bf->uar || !bf->uar->bf_map)
+		return;
+
+	mutex_lock(&priv->bf_mutex);
+	idx = (bf->reg - bf->uar->bf_map) / dev->caps.bf_reg_size;
+	bf->uar->free_bf_bmap &= ~(1 << idx);
+	if (!bf->uar->free_bf_bmap) {
+		if (!list_empty(&bf->uar->bf_list))
+			list_del(&bf->uar->bf_list);
+
+		io_mapping_unmap(bf->uar->bf_map);
+		iounmap(bf->uar->map);
+		mlx4_uar_free(dev, bf->uar);
+		kfree(bf->uar);
+	} else if (list_empty(&bf->uar->bf_list))
+		list_add(&bf->uar->bf_list, &priv->bf_list);
+
+	mutex_unlock(&priv->bf_mutex);
+}
+EXPORT_SYMBOL_GPL(mlx4_bf_free);
+
+int mlx4_init_uar_table(struct mlx4_dev *dev)
+{
+	if (dev->caps.num_uars <= 128) {
+		mlx4_err(dev, "Only %d UAR pages (need more than 128)\n",
+			 dev->caps.num_uars);
+		mlx4_err(dev, "Increase firmware log2_uar_bar_megabytes?\n");
+		return -ENODEV;
+	}
+
+	return mlx4_bitmap_init(&mlx4_priv(dev)->uar_table.bitmap,
+				dev->caps.num_uars, dev->caps.num_uars - 1,
+				dev->caps.reserved_uars, 0);
+}
+
+void mlx4_cleanup_uar_table(struct mlx4_dev *dev)
+{
+	mlx4_bitmap_cleanup(&mlx4_priv(dev)->uar_table.bitmap);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx4/port.c b/drivers/net/ethernet/mellanox/mlx4/port.c
new file mode 100644
index 0000000..94eeb2c
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/port.c
@@ -0,0 +1,1313 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/if_ether.h>
+#include <linux/if_vlan.h>
+#include <linux/export.h>
+
+#include <linux/mlx4/cmd.h>
+
+#include "mlx4.h"
+
+#define MLX4_MAC_VALID		(1ull << 63)
+
+#define MLX4_VLAN_VALID		(1u << 31)
+#define MLX4_VLAN_MASK		0xfff
+
+#define MLX4_STATS_TRAFFIC_COUNTERS_MASK	0xfULL
+#define MLX4_STATS_TRAFFIC_DROPS_MASK		0xc0ULL
+#define MLX4_STATS_ERROR_COUNTERS_MASK		0x1ffc30ULL
+#define MLX4_STATS_PORT_COUNTERS_MASK		0x1fe00000ULL
+
+void mlx4_init_mac_table(struct mlx4_dev *dev, struct mlx4_mac_table *table)
+{
+	int i;
+
+	mutex_init(&table->mutex);
+	for (i = 0; i < MLX4_MAX_MAC_NUM; i++) {
+		table->entries[i] = 0;
+		table->refs[i]	 = 0;
+	}
+	table->max   = 1 << dev->caps.log_num_macs;
+	table->total = 0;
+}
+
+void mlx4_init_vlan_table(struct mlx4_dev *dev, struct mlx4_vlan_table *table)
+{
+	int i;
+
+	mutex_init(&table->mutex);
+	for (i = 0; i < MLX4_MAX_VLAN_NUM; i++) {
+		table->entries[i] = 0;
+		table->refs[i]	 = 0;
+	}
+	table->max   = (1 << dev->caps.log_num_vlans) - MLX4_VLAN_REGULAR;
+	table->total = 0;
+}
+
+void mlx4_init_roce_gid_table(struct mlx4_dev *dev,
+			      struct mlx4_roce_gid_table *table)
+{
+	int i;
+
+	mutex_init(&table->mutex);
+	for (i = 0; i < MLX4_ROCE_MAX_GIDS; i++)
+		memset(table->roce_gids[i].raw, 0, MLX4_ROCE_GID_ENTRY_SIZE);
+}
+
+static int validate_index(struct mlx4_dev *dev,
+			  struct mlx4_mac_table *table, int index)
+{
+	int err = 0;
+
+	if (index < 0 || index >= table->max || !table->entries[index]) {
+		mlx4_warn(dev, "No valid Mac entry for the given index\n");
+		err = -EINVAL;
+	}
+	return err;
+}
+
+static int find_index(struct mlx4_dev *dev,
+		      struct mlx4_mac_table *table, u64 mac)
+{
+	int i;
+
+	for (i = 0; i < MLX4_MAX_MAC_NUM; i++) {
+		if (table->refs[i] &&
+		    (MLX4_MAC_MASK & mac) ==
+		    (MLX4_MAC_MASK & be64_to_cpu(table->entries[i])))
+			return i;
+	}
+	/* Mac not found */
+	return -EINVAL;
+}
+
+static int mlx4_set_port_mac_table(struct mlx4_dev *dev, u8 port,
+				   __be64 *entries)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	u32 in_mod;
+	int err;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	memcpy(mailbox->buf, entries, MLX4_MAC_TABLE_SIZE);
+
+	in_mod = MLX4_SET_PORT_MAC_TABLE << 8 | port;
+
+	err = mlx4_cmd(dev, mailbox->dma, in_mod, 1, MLX4_CMD_SET_PORT,
+		       MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+
+int mlx4_find_cached_mac(struct mlx4_dev *dev, u8 port, u64 mac, int *idx)
+{
+	struct mlx4_port_info *info = &mlx4_priv(dev)->port[port];
+	struct mlx4_mac_table *table = &info->mac_table;
+	int i;
+
+	for (i = 0; i < MLX4_MAX_MAC_NUM; i++) {
+		if (!table->refs[i])
+			continue;
+
+		if (mac == (MLX4_MAC_MASK & be64_to_cpu(table->entries[i]))) {
+			*idx = i;
+			return 0;
+		}
+	}
+
+	return -ENOENT;
+}
+EXPORT_SYMBOL_GPL(mlx4_find_cached_mac);
+
+int __mlx4_register_mac(struct mlx4_dev *dev, u8 port, u64 mac)
+{
+	struct mlx4_port_info *info = &mlx4_priv(dev)->port[port];
+	struct mlx4_mac_table *table = &info->mac_table;
+	int i, err = 0;
+	int free = -1;
+
+	mlx4_dbg(dev, "Registering MAC: 0x%llx for port %d\n",
+		 (unsigned long long) mac, port);
+
+	mutex_lock(&table->mutex);
+	for (i = 0; i < MLX4_MAX_MAC_NUM; i++) {
+		if (!table->refs[i]) {
+			if (free < 0)
+				free = i;
+			continue;
+		}
+
+		if ((MLX4_MAC_MASK & mac) ==
+		     (MLX4_MAC_MASK & be64_to_cpu(table->entries[i]))) {
+			/* MAC already registered, increment ref count */
+			err = i;
+			++table->refs[i];
+			goto out;
+		}
+	}
+
+	mlx4_dbg(dev, "Free MAC index is %d\n", free);
+
+	if (table->total == table->max) {
+		/* No free mac entries */
+		err = -ENOSPC;
+		goto out;
+	}
+
+	/* Register new MAC */
+	table->entries[free] = cpu_to_be64(mac | MLX4_MAC_VALID);
+
+	err = mlx4_set_port_mac_table(dev, port, table->entries);
+	if (unlikely(err)) {
+		mlx4_err(dev, "Failed adding MAC: 0x%llx\n",
+			 (unsigned long long) mac);
+		table->entries[free] = 0;
+		goto out;
+	}
+	table->refs[free] = 1;
+	err = free;
+	++table->total;
+out:
+	mutex_unlock(&table->mutex);
+	return err;
+}
+EXPORT_SYMBOL_GPL(__mlx4_register_mac);
+
+int mlx4_register_mac(struct mlx4_dev *dev, u8 port, u64 mac)
+{
+	u64 out_param = 0;
+	int err = -EINVAL;
+
+	if (mlx4_is_mfunc(dev)) {
+		if (!(dev->flags & MLX4_FLAG_OLD_REG_MAC)) {
+			err = mlx4_cmd_imm(dev, mac, &out_param,
+					   ((u32) port) << 8 | (u32) RES_MAC,
+					   RES_OP_RESERVE_AND_MAP, MLX4_CMD_ALLOC_RES,
+					   MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED);
+		}
+		if (err && err == -EINVAL && mlx4_is_slave(dev)) {
+			/* retry using old REG_MAC format */
+			set_param_l(&out_param, port);
+			err = mlx4_cmd_imm(dev, mac, &out_param, RES_MAC,
+					   RES_OP_RESERVE_AND_MAP, MLX4_CMD_ALLOC_RES,
+					   MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED);
+			if (!err)
+				dev->flags |= MLX4_FLAG_OLD_REG_MAC;
+		}
+		if (err)
+			return err;
+
+		return get_param_l(&out_param);
+	}
+	return __mlx4_register_mac(dev, port, mac);
+}
+EXPORT_SYMBOL_GPL(mlx4_register_mac);
+
+int mlx4_get_base_qpn(struct mlx4_dev *dev, u8 port)
+{
+	return dev->caps.reserved_qps_base[MLX4_QP_REGION_ETH_ADDR] +
+			(port - 1) * (1 << dev->caps.log_num_macs);
+}
+EXPORT_SYMBOL_GPL(mlx4_get_base_qpn);
+
+void __mlx4_unregister_mac(struct mlx4_dev *dev, u8 port, u64 mac)
+{
+	struct mlx4_port_info *info;
+	struct mlx4_mac_table *table;
+	int index;
+
+	if (port < 1 || port > dev->caps.num_ports) {
+		mlx4_warn(dev, "invalid port number (%d), aborting...\n", port);
+		return;
+	}
+	info = &mlx4_priv(dev)->port[port];
+	table = &info->mac_table;
+	mutex_lock(&table->mutex);
+	index = find_index(dev, table, mac);
+
+	if (validate_index(dev, table, index))
+		goto out;
+	if (--table->refs[index]) {
+		mlx4_dbg(dev, "Have more references for index %d, no need to modify mac table\n",
+			 index);
+		goto out;
+	}
+
+	table->entries[index] = 0;
+	mlx4_set_port_mac_table(dev, port, table->entries);
+	--table->total;
+out:
+	mutex_unlock(&table->mutex);
+}
+EXPORT_SYMBOL_GPL(__mlx4_unregister_mac);
+
+void mlx4_unregister_mac(struct mlx4_dev *dev, u8 port, u64 mac)
+{
+	u64 out_param = 0;
+
+	if (mlx4_is_mfunc(dev)) {
+		if (!(dev->flags & MLX4_FLAG_OLD_REG_MAC)) {
+			(void) mlx4_cmd_imm(dev, mac, &out_param,
+					    ((u32) port) << 8 | (u32) RES_MAC,
+					    RES_OP_RESERVE_AND_MAP, MLX4_CMD_FREE_RES,
+					    MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED);
+		} else {
+			/* use old unregister mac format */
+			set_param_l(&out_param, port);
+			(void) mlx4_cmd_imm(dev, mac, &out_param, RES_MAC,
+					    RES_OP_RESERVE_AND_MAP, MLX4_CMD_FREE_RES,
+					    MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED);
+		}
+		return;
+	}
+	__mlx4_unregister_mac(dev, port, mac);
+	return;
+}
+EXPORT_SYMBOL_GPL(mlx4_unregister_mac);
+
+int __mlx4_replace_mac(struct mlx4_dev *dev, u8 port, int qpn, u64 new_mac)
+{
+	struct mlx4_port_info *info = &mlx4_priv(dev)->port[port];
+	struct mlx4_mac_table *table = &info->mac_table;
+	int index = qpn - info->base_qpn;
+	int err = 0;
+
+	/* CX1 doesn't support multi-functions */
+	mutex_lock(&table->mutex);
+
+	err = validate_index(dev, table, index);
+	if (err)
+		goto out;
+
+	table->entries[index] = cpu_to_be64(new_mac | MLX4_MAC_VALID);
+
+	err = mlx4_set_port_mac_table(dev, port, table->entries);
+	if (unlikely(err)) {
+		mlx4_err(dev, "Failed adding MAC: 0x%llx\n",
+			 (unsigned long long) new_mac);
+		table->entries[index] = 0;
+	}
+out:
+	mutex_unlock(&table->mutex);
+	return err;
+}
+EXPORT_SYMBOL_GPL(__mlx4_replace_mac);
+
+static int mlx4_set_port_vlan_table(struct mlx4_dev *dev, u8 port,
+				    __be32 *entries)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	u32 in_mod;
+	int err;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	memcpy(mailbox->buf, entries, MLX4_VLAN_TABLE_SIZE);
+	in_mod = MLX4_SET_PORT_VLAN_TABLE << 8 | port;
+	err = mlx4_cmd(dev, mailbox->dma, in_mod, 1, MLX4_CMD_SET_PORT,
+		       MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+
+	return err;
+}
+
+int mlx4_find_cached_vlan(struct mlx4_dev *dev, u8 port, u16 vid, int *idx)
+{
+	struct mlx4_vlan_table *table = &mlx4_priv(dev)->port[port].vlan_table;
+	int i;
+
+	for (i = 0; i < MLX4_MAX_VLAN_NUM; ++i) {
+		if (table->refs[i] &&
+		    (vid == (MLX4_VLAN_MASK &
+			      be32_to_cpu(table->entries[i])))) {
+			/* VLAN already registered, increase reference count */
+			*idx = i;
+			return 0;
+		}
+	}
+
+	return -ENOENT;
+}
+EXPORT_SYMBOL_GPL(mlx4_find_cached_vlan);
+
+int __mlx4_register_vlan(struct mlx4_dev *dev, u8 port, u16 vlan,
+				int *index)
+{
+	struct mlx4_vlan_table *table = &mlx4_priv(dev)->port[port].vlan_table;
+	int i, err = 0;
+	int free = -1;
+
+	mutex_lock(&table->mutex);
+
+	if (table->total == table->max) {
+		/* No free vlan entries */
+		err = -ENOSPC;
+		goto out;
+	}
+
+	for (i = MLX4_VLAN_REGULAR; i < MLX4_MAX_VLAN_NUM; i++) {
+		if (free < 0 && (table->refs[i] == 0)) {
+			free = i;
+			continue;
+		}
+
+		if (table->refs[i] &&
+		    (vlan == (MLX4_VLAN_MASK &
+			      be32_to_cpu(table->entries[i])))) {
+			/* Vlan already registered, increase references count */
+			*index = i;
+			++table->refs[i];
+			goto out;
+		}
+	}
+
+	if (free < 0) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	/* Register new VLAN */
+	table->refs[free] = 1;
+	table->entries[free] = cpu_to_be32(vlan | MLX4_VLAN_VALID);
+
+	err = mlx4_set_port_vlan_table(dev, port, table->entries);
+	if (unlikely(err)) {
+		mlx4_warn(dev, "Failed adding vlan: %u\n", vlan);
+		table->refs[free] = 0;
+		table->entries[free] = 0;
+		goto out;
+	}
+
+	*index = free;
+	++table->total;
+out:
+	mutex_unlock(&table->mutex);
+	return err;
+}
+
+int mlx4_register_vlan(struct mlx4_dev *dev, u8 port, u16 vlan, int *index)
+{
+	u64 out_param = 0;
+	int err;
+
+	if (vlan > 4095)
+		return -EINVAL;
+
+	if (mlx4_is_mfunc(dev)) {
+		err = mlx4_cmd_imm(dev, vlan, &out_param,
+				   ((u32) port) << 8 | (u32) RES_VLAN,
+				   RES_OP_RESERVE_AND_MAP, MLX4_CMD_ALLOC_RES,
+				   MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED);
+		if (!err)
+			*index = get_param_l(&out_param);
+
+		return err;
+	}
+	return __mlx4_register_vlan(dev, port, vlan, index);
+}
+EXPORT_SYMBOL_GPL(mlx4_register_vlan);
+
+void __mlx4_unregister_vlan(struct mlx4_dev *dev, u8 port, u16 vlan)
+{
+	struct mlx4_vlan_table *table = &mlx4_priv(dev)->port[port].vlan_table;
+	int index;
+
+	mutex_lock(&table->mutex);
+	if (mlx4_find_cached_vlan(dev, port, vlan, &index)) {
+		mlx4_warn(dev, "vlan 0x%x is not in the vlan table\n", vlan);
+		goto out;
+	}
+
+	if (index < MLX4_VLAN_REGULAR) {
+		mlx4_warn(dev, "Trying to free special vlan index %d\n", index);
+		goto out;
+	}
+
+	if (--table->refs[index]) {
+		mlx4_dbg(dev, "Have %d more references for index %d, no need to modify vlan table\n",
+			 table->refs[index], index);
+		goto out;
+	}
+	table->entries[index] = 0;
+	mlx4_set_port_vlan_table(dev, port, table->entries);
+	--table->total;
+out:
+	mutex_unlock(&table->mutex);
+}
+
+void mlx4_unregister_vlan(struct mlx4_dev *dev, u8 port, u16 vlan)
+{
+	u64 out_param = 0;
+
+	if (mlx4_is_mfunc(dev)) {
+		(void) mlx4_cmd_imm(dev, vlan, &out_param,
+				    ((u32) port) << 8 | (u32) RES_VLAN,
+				    RES_OP_RESERVE_AND_MAP,
+				    MLX4_CMD_FREE_RES, MLX4_CMD_TIME_CLASS_A,
+				    MLX4_CMD_WRAPPED);
+		return;
+	}
+	__mlx4_unregister_vlan(dev, port, vlan);
+}
+EXPORT_SYMBOL_GPL(mlx4_unregister_vlan);
+
+int mlx4_get_port_ib_caps(struct mlx4_dev *dev, u8 port, __be32 *caps)
+{
+	struct mlx4_cmd_mailbox *inmailbox, *outmailbox;
+	u8 *inbuf, *outbuf;
+	int err;
+
+	inmailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(inmailbox))
+		return PTR_ERR(inmailbox);
+
+	outmailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(outmailbox)) {
+		mlx4_free_cmd_mailbox(dev, inmailbox);
+		return PTR_ERR(outmailbox);
+	}
+
+	inbuf = inmailbox->buf;
+	outbuf = outmailbox->buf;
+	inbuf[0] = 1;
+	inbuf[1] = 1;
+	inbuf[2] = 1;
+	inbuf[3] = 1;
+	*(__be16 *) (&inbuf[16]) = cpu_to_be16(0x0015);
+	*(__be32 *) (&inbuf[20]) = cpu_to_be32(port);
+
+	err = mlx4_cmd_box(dev, inmailbox->dma, outmailbox->dma, port, 3,
+			   MLX4_CMD_MAD_IFC, MLX4_CMD_TIME_CLASS_C,
+			   MLX4_CMD_NATIVE);
+	if (!err)
+		*caps = *(__be32 *) (outbuf + 84);
+	mlx4_free_cmd_mailbox(dev, inmailbox);
+	mlx4_free_cmd_mailbox(dev, outmailbox);
+	return err;
+}
+static struct mlx4_roce_gid_entry zgid_entry;
+
+int mlx4_get_slave_num_gids(struct mlx4_dev *dev, int slave, int port)
+{
+	int vfs;
+	int slave_gid = slave;
+	unsigned i;
+	struct mlx4_slaves_pport slaves_pport;
+	struct mlx4_active_ports actv_ports;
+	unsigned max_port_p_one;
+
+	if (slave == 0)
+		return MLX4_ROCE_PF_GIDS;
+
+	/* Slave is a VF */
+	slaves_pport = mlx4_phys_to_slaves_pport(dev, port);
+	actv_ports = mlx4_get_active_ports(dev, slave);
+	max_port_p_one = find_first_bit(actv_ports.ports, dev->caps.num_ports) +
+		bitmap_weight(actv_ports.ports, dev->caps.num_ports) + 1;
+
+	for (i = 1; i < max_port_p_one; i++) {
+		struct mlx4_active_ports exclusive_ports;
+		struct mlx4_slaves_pport slaves_pport_actv;
+		bitmap_zero(exclusive_ports.ports, dev->caps.num_ports);
+		set_bit(i - 1, exclusive_ports.ports);
+		if (i == port)
+			continue;
+		slaves_pport_actv = mlx4_phys_to_slaves_pport_actv(
+				    dev, &exclusive_ports);
+		slave_gid -= bitmap_weight(slaves_pport_actv.slaves,
+					   dev->num_vfs + 1);
+	}
+	vfs = bitmap_weight(slaves_pport.slaves, dev->num_vfs + 1) - 1;
+	if (slave_gid <= ((MLX4_ROCE_MAX_GIDS - MLX4_ROCE_PF_GIDS) % vfs))
+		return ((MLX4_ROCE_MAX_GIDS - MLX4_ROCE_PF_GIDS) / vfs) + 1;
+	return (MLX4_ROCE_MAX_GIDS - MLX4_ROCE_PF_GIDS) / vfs;
+}
+
+int mlx4_get_base_gid_ix(struct mlx4_dev *dev, int slave, int port)
+{
+	int gids;
+	unsigned i;
+	int slave_gid = slave;
+	int vfs;
+
+	struct mlx4_slaves_pport slaves_pport;
+	struct mlx4_active_ports actv_ports;
+	unsigned max_port_p_one;
+
+	if (slave == 0)
+		return 0;
+
+	slaves_pport = mlx4_phys_to_slaves_pport(dev, port);
+	actv_ports = mlx4_get_active_ports(dev, slave);
+	max_port_p_one = find_first_bit(actv_ports.ports, dev->caps.num_ports) +
+		bitmap_weight(actv_ports.ports, dev->caps.num_ports) + 1;
+
+	for (i = 1; i < max_port_p_one; i++) {
+		struct mlx4_active_ports exclusive_ports;
+		struct mlx4_slaves_pport slaves_pport_actv;
+		bitmap_zero(exclusive_ports.ports, dev->caps.num_ports);
+		set_bit(i - 1, exclusive_ports.ports);
+		if (i == port)
+			continue;
+		slaves_pport_actv = mlx4_phys_to_slaves_pport_actv(
+				    dev, &exclusive_ports);
+		slave_gid -= bitmap_weight(slaves_pport_actv.slaves,
+					   dev->num_vfs + 1);
+	}
+	gids = MLX4_ROCE_MAX_GIDS - MLX4_ROCE_PF_GIDS;
+	vfs = bitmap_weight(slaves_pport.slaves, dev->num_vfs + 1) - 1;
+	if (slave_gid <= gids % vfs)
+		return MLX4_ROCE_PF_GIDS + ((gids / vfs) + 1) * (slave_gid - 1);
+
+	return MLX4_ROCE_PF_GIDS + (gids % vfs) +
+		((gids / vfs) * (slave_gid - 1));
+}
+EXPORT_SYMBOL_GPL(mlx4_get_base_gid_ix);
+
+static int mlx4_reset_roce_port_gids(struct mlx4_dev *dev, int slave,
+				     int port, struct mlx4_cmd_mailbox *mailbox)
+{
+	struct mlx4_roce_gid_entry *gid_entry_mbox;
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int num_gids, base, offset;
+	int i, err;
+
+	num_gids = mlx4_get_slave_num_gids(dev, slave, port);
+	base = mlx4_get_base_gid_ix(dev, slave, port);
+
+	memset(mailbox->buf, 0, MLX4_MAILBOX_SIZE);
+
+	mutex_lock(&(priv->port[port].gid_table.mutex));
+	/* Zero-out gids belonging to that slave in the port GID table */
+	for (i = 0, offset = base; i < num_gids; offset++, i++)
+		memcpy(priv->port[port].gid_table.roce_gids[offset].raw,
+		       zgid_entry.raw, MLX4_ROCE_GID_ENTRY_SIZE);
+
+	/* Now, copy roce port gids table to mailbox for passing to FW */
+	gid_entry_mbox = (struct mlx4_roce_gid_entry *)mailbox->buf;
+	for (i = 0; i < MLX4_ROCE_MAX_GIDS; gid_entry_mbox++, i++)
+		memcpy(gid_entry_mbox->raw,
+		       priv->port[port].gid_table.roce_gids[i].raw,
+		       MLX4_ROCE_GID_ENTRY_SIZE);
+
+	err = mlx4_cmd(dev, mailbox->dma,
+		       ((u32)port) | (MLX4_SET_PORT_GID_TABLE << 8), 1,
+		       MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B,
+		       MLX4_CMD_NATIVE);
+	mutex_unlock(&(priv->port[port].gid_table.mutex));
+	return err;
+}
+
+
+void mlx4_reset_roce_gids(struct mlx4_dev *dev, int slave)
+{
+	struct mlx4_active_ports actv_ports;
+	struct mlx4_cmd_mailbox *mailbox;
+	int num_eth_ports, err;
+	int i;
+
+	if (slave < 0 || slave > dev->num_vfs)
+		return;
+
+	actv_ports = mlx4_get_active_ports(dev, slave);
+
+	for (i = 0, num_eth_ports = 0; i < dev->caps.num_ports; i++) {
+		if (test_bit(i, actv_ports.ports)) {
+			if (dev->caps.port_type[i + 1] != MLX4_PORT_TYPE_ETH)
+				continue;
+			num_eth_ports++;
+		}
+	}
+
+	if (!num_eth_ports)
+		return;
+
+	/* have ETH ports.  Alloc mailbox for SET_PORT command */
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return;
+
+	for (i = 0; i < dev->caps.num_ports; i++) {
+		if (test_bit(i, actv_ports.ports)) {
+			if (dev->caps.port_type[i + 1] != MLX4_PORT_TYPE_ETH)
+				continue;
+			err = mlx4_reset_roce_port_gids(dev, slave, i + 1, mailbox);
+			if (err)
+				mlx4_warn(dev, "Could not reset ETH port GID table for slave %d, port %d (%d)\n",
+					  slave, i + 1, err);
+		}
+	}
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return;
+}
+
+static int mlx4_common_set_port(struct mlx4_dev *dev, int slave, u32 in_mod,
+				u8 op_mod, struct mlx4_cmd_mailbox *inbox)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_port_info *port_info;
+	struct mlx4_mfunc_master_ctx *master = &priv->mfunc.master;
+	struct mlx4_slave_state *slave_st = &master->slave_state[slave];
+	struct mlx4_set_port_rqp_calc_context *qpn_context;
+	struct mlx4_set_port_general_context *gen_context;
+	struct mlx4_roce_gid_entry *gid_entry_tbl, *gid_entry_mbox, *gid_entry_mb1;
+	int reset_qkey_viols;
+	int port;
+	int is_eth;
+	int num_gids;
+	int base;
+	u32 in_modifier;
+	u32 promisc;
+	u16 mtu, prev_mtu;
+	int err;
+	int i, j;
+	int offset;
+	__be32 agg_cap_mask;
+	__be32 slave_cap_mask;
+	__be32 new_cap_mask;
+
+	port = in_mod & 0xff;
+	in_modifier = in_mod >> 8;
+	is_eth = op_mod;
+	port_info = &priv->port[port];
+
+	/* Slaves cannot perform SET_PORT operations except changing MTU */
+	if (is_eth) {
+		if (slave != dev->caps.function &&
+		    in_modifier != MLX4_SET_PORT_GENERAL &&
+		    in_modifier != MLX4_SET_PORT_GID_TABLE) {
+			mlx4_warn(dev, "denying SET_PORT for slave:%d\n",
+					slave);
+			return -EINVAL;
+		}
+		switch (in_modifier) {
+		case MLX4_SET_PORT_RQP_CALC:
+			qpn_context = inbox->buf;
+			qpn_context->base_qpn =
+				cpu_to_be32(port_info->base_qpn);
+			qpn_context->n_mac = 0x7;
+			promisc = be32_to_cpu(qpn_context->promisc) >>
+				SET_PORT_PROMISC_SHIFT;
+			qpn_context->promisc = cpu_to_be32(
+				promisc << SET_PORT_PROMISC_SHIFT |
+				port_info->base_qpn);
+			promisc = be32_to_cpu(qpn_context->mcast) >>
+				SET_PORT_MC_PROMISC_SHIFT;
+			qpn_context->mcast = cpu_to_be32(
+				promisc << SET_PORT_MC_PROMISC_SHIFT |
+				port_info->base_qpn);
+			break;
+		case MLX4_SET_PORT_GENERAL:
+			gen_context = inbox->buf;
+			/* Mtu is configured as the max MTU among all the
+			 * the functions on the port. */
+			mtu = be16_to_cpu(gen_context->mtu);
+			mtu = min_t(int, mtu, dev->caps.eth_mtu_cap[port] +
+				    ETH_HLEN + VLAN_HLEN + ETH_FCS_LEN);
+			prev_mtu = slave_st->mtu[port];
+			slave_st->mtu[port] = mtu;
+			if (mtu > master->max_mtu[port])
+				master->max_mtu[port] = mtu;
+			if (mtu < prev_mtu && prev_mtu ==
+						master->max_mtu[port]) {
+				slave_st->mtu[port] = mtu;
+				master->max_mtu[port] = mtu;
+				for (i = 0; i < dev->num_slaves; i++) {
+					master->max_mtu[port] =
+					max(master->max_mtu[port],
+					    master->slave_state[i].mtu[port]);
+				}
+			}
+
+			gen_context->mtu = cpu_to_be16(master->max_mtu[port]);
+			break;
+		case MLX4_SET_PORT_GID_TABLE:
+			/* change to MULTIPLE entries: number of guest's gids
+			 * need a FOR-loop here over number of gids the guest has.
+			 * 1. Check no duplicates in gids passed by slave
+			 */
+			num_gids = mlx4_get_slave_num_gids(dev, slave, port);
+			base = mlx4_get_base_gid_ix(dev, slave, port);
+			gid_entry_mbox = (struct mlx4_roce_gid_entry *)(inbox->buf);
+			for (i = 0; i < num_gids; gid_entry_mbox++, i++) {
+				if (!memcmp(gid_entry_mbox->raw, zgid_entry.raw,
+					    sizeof(zgid_entry)))
+					continue;
+				gid_entry_mb1 = gid_entry_mbox + 1;
+				for (j = i + 1; j < num_gids; gid_entry_mb1++, j++) {
+					if (!memcmp(gid_entry_mb1->raw,
+						    zgid_entry.raw, sizeof(zgid_entry)))
+						continue;
+					if (!memcmp(gid_entry_mb1->raw, gid_entry_mbox->raw,
+						    sizeof(gid_entry_mbox->raw))) {
+						/* found duplicate */
+						return -EINVAL;
+					}
+				}
+			}
+
+			/* 2. Check that do not have duplicates in OTHER
+			 *    entries in the port GID table
+			 */
+
+			mutex_lock(&(priv->port[port].gid_table.mutex));
+			for (i = 0; i < MLX4_ROCE_MAX_GIDS; i++) {
+				if (i >= base && i < base + num_gids)
+					continue; /* don't compare to slave's current gids */
+				gid_entry_tbl = &priv->port[port].gid_table.roce_gids[i];
+				if (!memcmp(gid_entry_tbl->raw, zgid_entry.raw, sizeof(zgid_entry)))
+					continue;
+				gid_entry_mbox = (struct mlx4_roce_gid_entry *)(inbox->buf);
+				for (j = 0; j < num_gids; gid_entry_mbox++, j++) {
+					if (!memcmp(gid_entry_mbox->raw, zgid_entry.raw,
+						    sizeof(zgid_entry)))
+						continue;
+					if (!memcmp(gid_entry_mbox->raw, gid_entry_tbl->raw,
+						    sizeof(gid_entry_tbl->raw))) {
+						/* found duplicate */
+						mlx4_warn(dev, "requested gid entry for slave:%d is a duplicate of gid at index %d\n",
+							  slave, i);
+						mutex_unlock(&(priv->port[port].gid_table.mutex));
+						return -EINVAL;
+					}
+				}
+			}
+
+			/* insert slave GIDs with memcpy, starting at slave's base index */
+			gid_entry_mbox = (struct mlx4_roce_gid_entry *)(inbox->buf);
+			for (i = 0, offset = base; i < num_gids; gid_entry_mbox++, offset++, i++)
+				memcpy(priv->port[port].gid_table.roce_gids[offset].raw,
+				       gid_entry_mbox->raw, MLX4_ROCE_GID_ENTRY_SIZE);
+
+			/* Now, copy roce port gids table to current mailbox for passing to FW */
+			gid_entry_mbox = (struct mlx4_roce_gid_entry *)(inbox->buf);
+			for (i = 0; i < MLX4_ROCE_MAX_GIDS; gid_entry_mbox++, i++)
+				memcpy(gid_entry_mbox->raw,
+				       priv->port[port].gid_table.roce_gids[i].raw,
+				       MLX4_ROCE_GID_ENTRY_SIZE);
+
+			err = mlx4_cmd(dev, inbox->dma, in_mod & 0xffff, op_mod,
+				       MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B,
+				       MLX4_CMD_NATIVE);
+			mutex_unlock(&(priv->port[port].gid_table.mutex));
+			return err;
+		}
+
+		return mlx4_cmd(dev, inbox->dma, in_mod & 0xffff, op_mod,
+				MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B,
+				MLX4_CMD_NATIVE);
+	}
+
+	/* For IB, we only consider:
+	 * - The capability mask, which is set to the aggregate of all
+	 *   slave function capabilities
+	 * - The QKey violatin counter - reset according to each request.
+	 */
+
+	if (dev->flags & MLX4_FLAG_OLD_PORT_CMDS) {
+		reset_qkey_viols = (*(u8 *) inbox->buf) & 0x40;
+		new_cap_mask = ((__be32 *) inbox->buf)[2];
+	} else {
+		reset_qkey_viols = ((u8 *) inbox->buf)[3] & 0x1;
+		new_cap_mask = ((__be32 *) inbox->buf)[1];
+	}
+
+	/* slave may not set the IS_SM capability for the port */
+	if (slave != mlx4_master_func_num(dev) &&
+	    (be32_to_cpu(new_cap_mask) & MLX4_PORT_CAP_IS_SM))
+		return -EINVAL;
+
+	/* No DEV_MGMT in multifunc mode */
+	if (mlx4_is_mfunc(dev) &&
+	    (be32_to_cpu(new_cap_mask) & MLX4_PORT_CAP_DEV_MGMT_SUP))
+		return -EINVAL;
+
+	agg_cap_mask = 0;
+	slave_cap_mask =
+		priv->mfunc.master.slave_state[slave].ib_cap_mask[port];
+	priv->mfunc.master.slave_state[slave].ib_cap_mask[port] = new_cap_mask;
+	for (i = 0; i < dev->num_slaves; i++)
+		agg_cap_mask |=
+			priv->mfunc.master.slave_state[i].ib_cap_mask[port];
+
+	/* only clear mailbox for guests.  Master may be setting
+	* MTU or PKEY table size
+	*/
+	if (slave != dev->caps.function)
+		memset(inbox->buf, 0, 256);
+	if (dev->flags & MLX4_FLAG_OLD_PORT_CMDS) {
+		*(u8 *) inbox->buf	   |= !!reset_qkey_viols << 6;
+		((__be32 *) inbox->buf)[2] = agg_cap_mask;
+	} else {
+		((u8 *) inbox->buf)[3]     |= !!reset_qkey_viols;
+		((__be32 *) inbox->buf)[1] = agg_cap_mask;
+	}
+
+	err = mlx4_cmd(dev, inbox->dma, port, is_eth, MLX4_CMD_SET_PORT,
+		       MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE);
+	if (err)
+		priv->mfunc.master.slave_state[slave].ib_cap_mask[port] =
+			slave_cap_mask;
+	return err;
+}
+
+int mlx4_SET_PORT_wrapper(struct mlx4_dev *dev, int slave,
+			  struct mlx4_vhcr *vhcr,
+			  struct mlx4_cmd_mailbox *inbox,
+			  struct mlx4_cmd_mailbox *outbox,
+			  struct mlx4_cmd_info *cmd)
+{
+	int port = mlx4_slave_convert_port(
+			dev, slave, vhcr->in_modifier & 0xFF);
+
+	if (port < 0)
+		return -EINVAL;
+
+	vhcr->in_modifier = (vhcr->in_modifier & ~0xFF) |
+			    (port & 0xFF);
+
+	return mlx4_common_set_port(dev, slave, vhcr->in_modifier,
+				    vhcr->op_modifier, inbox);
+}
+
+/* bit locations for set port command with zero op modifier */
+enum {
+	MLX4_SET_PORT_VL_CAP	 = 4, /* bits 7:4 */
+	MLX4_SET_PORT_MTU_CAP	 = 12, /* bits 15:12 */
+	MLX4_CHANGE_PORT_PKEY_TBL_SZ = 20,
+	MLX4_CHANGE_PORT_VL_CAP	 = 21,
+	MLX4_CHANGE_PORT_MTU_CAP = 22,
+};
+
+int mlx4_SET_PORT(struct mlx4_dev *dev, u8 port, int pkey_tbl_sz)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	int err, vl_cap, pkey_tbl_flag = 0;
+
+	if (dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH)
+		return 0;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	((__be32 *) mailbox->buf)[1] = dev->caps.ib_port_def_cap[port];
+
+	if (pkey_tbl_sz >= 0 && mlx4_is_master(dev)) {
+		pkey_tbl_flag = 1;
+		((__be16 *) mailbox->buf)[20] = cpu_to_be16(pkey_tbl_sz);
+	}
+
+	/* IB VL CAP enum isn't used by the firmware, just numerical values */
+	for (vl_cap = 8; vl_cap >= 1; vl_cap >>= 1) {
+		((__be32 *) mailbox->buf)[0] = cpu_to_be32(
+			(1 << MLX4_CHANGE_PORT_MTU_CAP) |
+			(1 << MLX4_CHANGE_PORT_VL_CAP)  |
+			(pkey_tbl_flag << MLX4_CHANGE_PORT_PKEY_TBL_SZ) |
+			(dev->caps.port_ib_mtu[port] << MLX4_SET_PORT_MTU_CAP) |
+			(vl_cap << MLX4_SET_PORT_VL_CAP));
+		err = mlx4_cmd(dev, mailbox->dma, port, 0, MLX4_CMD_SET_PORT,
+				MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED);
+		if (err != -ENOMEM)
+			break;
+	}
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+
+int mlx4_SET_PORT_general(struct mlx4_dev *dev, u8 port, int mtu,
+			  u8 pptx, u8 pfctx, u8 pprx, u8 pfcrx)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_set_port_general_context *context;
+	int err;
+	u32 in_mod;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	context = mailbox->buf;
+	context->flags = SET_PORT_GEN_ALL_VALID;
+	context->mtu = cpu_to_be16(mtu);
+	context->pptx = (pptx * (!pfctx)) << 7;
+	context->pfctx = pfctx;
+	context->pprx = (pprx * (!pfcrx)) << 7;
+	context->pfcrx = pfcrx;
+
+	in_mod = MLX4_SET_PORT_GENERAL << 8 | port;
+	err = mlx4_cmd(dev, mailbox->dma, in_mod, 1, MLX4_CMD_SET_PORT,
+		       MLX4_CMD_TIME_CLASS_B,  MLX4_CMD_WRAPPED);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+EXPORT_SYMBOL(mlx4_SET_PORT_general);
+
+int mlx4_SET_PORT_qpn_calc(struct mlx4_dev *dev, u8 port, u32 base_qpn,
+			   u8 promisc)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_set_port_rqp_calc_context *context;
+	int err;
+	u32 in_mod;
+	u32 m_promisc = (dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_MC_STEER) ?
+		MCAST_DIRECT : MCAST_DEFAULT;
+
+	if (dev->caps.steering_mode != MLX4_STEERING_MODE_A0)
+		return 0;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	context = mailbox->buf;
+	context->base_qpn = cpu_to_be32(base_qpn);
+	context->n_mac = dev->caps.log_num_macs;
+	context->promisc = cpu_to_be32(promisc << SET_PORT_PROMISC_SHIFT |
+				       base_qpn);
+	context->mcast = cpu_to_be32(m_promisc << SET_PORT_MC_PROMISC_SHIFT |
+				     base_qpn);
+	context->intra_no_vlan = 0;
+	context->no_vlan = MLX4_NO_VLAN_IDX;
+	context->intra_vlan_miss = 0;
+	context->vlan_miss = MLX4_VLAN_MISS_IDX;
+
+	in_mod = MLX4_SET_PORT_RQP_CALC << 8 | port;
+	err = mlx4_cmd(dev, mailbox->dma, in_mod, 1, MLX4_CMD_SET_PORT,
+		       MLX4_CMD_TIME_CLASS_B,  MLX4_CMD_WRAPPED);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+EXPORT_SYMBOL(mlx4_SET_PORT_qpn_calc);
+
+int mlx4_SET_PORT_PRIO2TC(struct mlx4_dev *dev, u8 port, u8 *prio2tc)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_set_port_prio2tc_context *context;
+	int err;
+	u32 in_mod;
+	int i;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	context = mailbox->buf;
+	for (i = 0; i < MLX4_NUM_UP; i += 2)
+		context->prio2tc[i >> 1] = prio2tc[i] << 4 | prio2tc[i + 1];
+
+	in_mod = MLX4_SET_PORT_PRIO2TC << 8 | port;
+	err = mlx4_cmd(dev, mailbox->dma, in_mod, 1, MLX4_CMD_SET_PORT,
+		       MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+EXPORT_SYMBOL(mlx4_SET_PORT_PRIO2TC);
+
+int mlx4_SET_PORT_SCHEDULER(struct mlx4_dev *dev, u8 port, u8 *tc_tx_bw,
+		u8 *pg, u16 *ratelimit)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_set_port_scheduler_context *context;
+	int err;
+	u32 in_mod;
+	int i;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	context = mailbox->buf;
+
+	for (i = 0; i < MLX4_NUM_TC; i++) {
+		struct mlx4_port_scheduler_tc_cfg_be *tc = &context->tc[i];
+		u16 r;
+
+		if (ratelimit && ratelimit[i]) {
+			if (ratelimit[i] <= MLX4_MAX_100M_UNITS_VAL) {
+				r = ratelimit[i];
+				tc->max_bw_units =
+					htons(MLX4_RATELIMIT_100M_UNITS);
+			} else {
+				r = ratelimit[i]/10;
+				tc->max_bw_units =
+					htons(MLX4_RATELIMIT_1G_UNITS);
+			}
+			tc->max_bw_value = htons(r);
+		} else {
+			tc->max_bw_value = htons(MLX4_RATELIMIT_DEFAULT);
+			tc->max_bw_units = htons(MLX4_RATELIMIT_1G_UNITS);
+		}
+
+		tc->pg = htons(pg[i]);
+		tc->bw_precentage = htons(tc_tx_bw[i]);
+	}
+
+	in_mod = MLX4_SET_PORT_SCHEDULER << 8 | port;
+	err = mlx4_cmd(dev, mailbox->dma, in_mod, 1, MLX4_CMD_SET_PORT,
+		       MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+EXPORT_SYMBOL(mlx4_SET_PORT_SCHEDULER);
+
+enum {
+	VXLAN_ENABLE_MODIFY	= 1 << 7,
+	VXLAN_STEERING_MODIFY	= 1 << 6,
+
+	VXLAN_ENABLE		= 1 << 7,
+};
+
+struct mlx4_set_port_vxlan_context {
+	u32	reserved1;
+	u8	modify_flags;
+	u8	reserved2;
+	u8	enable_flags;
+	u8	steering;
+};
+
+int mlx4_SET_PORT_VXLAN(struct mlx4_dev *dev, u8 port, u8 steering, int enable)
+{
+	int err;
+	u32 in_mod;
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_set_port_vxlan_context  *context;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	context = mailbox->buf;
+	memset(context, 0, sizeof(*context));
+
+	context->modify_flags = VXLAN_ENABLE_MODIFY | VXLAN_STEERING_MODIFY;
+	if (enable)
+		context->enable_flags = VXLAN_ENABLE;
+	context->steering  = steering;
+
+	in_mod = MLX4_SET_PORT_VXLAN << 8 | port;
+	err = mlx4_cmd(dev, mailbox->dma, in_mod, 1, MLX4_CMD_SET_PORT,
+		       MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+EXPORT_SYMBOL(mlx4_SET_PORT_VXLAN);
+
+int mlx4_SET_MCAST_FLTR_wrapper(struct mlx4_dev *dev, int slave,
+				struct mlx4_vhcr *vhcr,
+				struct mlx4_cmd_mailbox *inbox,
+				struct mlx4_cmd_mailbox *outbox,
+				struct mlx4_cmd_info *cmd)
+{
+	int err = 0;
+
+	return err;
+}
+
+int mlx4_SET_MCAST_FLTR(struct mlx4_dev *dev, u8 port,
+			u64 mac, u64 clear, u8 mode)
+{
+	return mlx4_cmd(dev, (mac | (clear << 63)), port, mode,
+			MLX4_CMD_SET_MCAST_FLTR, MLX4_CMD_TIME_CLASS_B,
+			MLX4_CMD_WRAPPED);
+}
+EXPORT_SYMBOL(mlx4_SET_MCAST_FLTR);
+
+int mlx4_SET_VLAN_FLTR_wrapper(struct mlx4_dev *dev, int slave,
+			       struct mlx4_vhcr *vhcr,
+			       struct mlx4_cmd_mailbox *inbox,
+			       struct mlx4_cmd_mailbox *outbox,
+			       struct mlx4_cmd_info *cmd)
+{
+	int err = 0;
+
+	return err;
+}
+
+int mlx4_common_dump_eth_stats(struct mlx4_dev *dev, int slave,
+			       u32 in_mod, struct mlx4_cmd_mailbox *outbox)
+{
+	return mlx4_cmd_box(dev, 0, outbox->dma, in_mod, 0,
+			    MLX4_CMD_DUMP_ETH_STATS, MLX4_CMD_TIME_CLASS_B,
+			    MLX4_CMD_NATIVE);
+}
+
+int mlx4_DUMP_ETH_STATS_wrapper(struct mlx4_dev *dev, int slave,
+				struct mlx4_vhcr *vhcr,
+				struct mlx4_cmd_mailbox *inbox,
+				struct mlx4_cmd_mailbox *outbox,
+				struct mlx4_cmd_info *cmd)
+{
+	if (slave != dev->caps.function)
+		return 0;
+	return mlx4_common_dump_eth_stats(dev, slave,
+					  vhcr->in_modifier, outbox);
+}
+
+void mlx4_set_stats_bitmap(struct mlx4_dev *dev, u64 *stats_bitmap)
+{
+	if (!mlx4_is_mfunc(dev)) {
+		*stats_bitmap = 0;
+		return;
+	}
+
+	*stats_bitmap = (MLX4_STATS_TRAFFIC_COUNTERS_MASK |
+			 MLX4_STATS_TRAFFIC_DROPS_MASK |
+			 MLX4_STATS_PORT_COUNTERS_MASK);
+
+	if (mlx4_is_master(dev))
+		*stats_bitmap |= MLX4_STATS_ERROR_COUNTERS_MASK;
+}
+EXPORT_SYMBOL(mlx4_set_stats_bitmap);
+
+int mlx4_get_slave_from_roce_gid(struct mlx4_dev *dev, int port, u8 *gid,
+				 int *slave_id)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int i, found_ix = -1;
+	int vf_gids = MLX4_ROCE_MAX_GIDS - MLX4_ROCE_PF_GIDS;
+	struct mlx4_slaves_pport slaves_pport;
+	unsigned num_vfs;
+	int slave_gid;
+
+	if (!mlx4_is_mfunc(dev))
+		return -EINVAL;
+
+	slaves_pport = mlx4_phys_to_slaves_pport(dev, port);
+	num_vfs = bitmap_weight(slaves_pport.slaves, dev->num_vfs + 1) - 1;
+
+	for (i = 0; i < MLX4_ROCE_MAX_GIDS; i++) {
+		if (!memcmp(priv->port[port].gid_table.roce_gids[i].raw, gid,
+			    MLX4_ROCE_GID_ENTRY_SIZE)) {
+			found_ix = i;
+			break;
+		}
+	}
+
+	if (found_ix >= 0) {
+		/* Calculate a slave_gid which is the slave number in the gid
+		 * table and not a globally unique slave number.
+		 */
+		if (found_ix < MLX4_ROCE_PF_GIDS)
+			slave_gid = 0;
+		else if (found_ix < MLX4_ROCE_PF_GIDS + (vf_gids % num_vfs) *
+			 (vf_gids / num_vfs + 1))
+			slave_gid = ((found_ix - MLX4_ROCE_PF_GIDS) /
+				     (vf_gids / num_vfs + 1)) + 1;
+		else
+			slave_gid =
+			((found_ix - MLX4_ROCE_PF_GIDS -
+			  ((vf_gids % num_vfs) * ((vf_gids / num_vfs + 1)))) /
+			 (vf_gids / num_vfs)) + vf_gids % num_vfs + 1;
+
+		/* Calculate the globally unique slave id */
+		if (slave_gid) {
+			struct mlx4_active_ports exclusive_ports;
+			struct mlx4_active_ports actv_ports;
+			struct mlx4_slaves_pport slaves_pport_actv;
+			unsigned max_port_p_one;
+			int num_vfs_before = 0;
+			int candidate_slave_gid;
+
+			/* Calculate how many VFs are on the previous port, if exists */
+			for (i = 1; i < port; i++) {
+				bitmap_zero(exclusive_ports.ports, dev->caps.num_ports);
+				set_bit(i - 1, exclusive_ports.ports);
+				slaves_pport_actv =
+					mlx4_phys_to_slaves_pport_actv(
+							dev, &exclusive_ports);
+				num_vfs_before += bitmap_weight(
+						slaves_pport_actv.slaves,
+						dev->num_vfs + 1);
+			}
+
+			/* candidate_slave_gid isn't necessarily the correct slave, but
+			 * it has the same number of ports and is assigned to the same
+			 * ports as the real slave we're looking for. On dual port VF,
+			 * slave_gid = [single port VFs on port <port>] +
+			 * [offset of the current slave from the first dual port VF] +
+			 * 1 (for the PF).
+			 */
+			candidate_slave_gid = slave_gid + num_vfs_before;
+
+			actv_ports = mlx4_get_active_ports(dev, candidate_slave_gid);
+			max_port_p_one = find_first_bit(
+				actv_ports.ports, dev->caps.num_ports) +
+				bitmap_weight(actv_ports.ports,
+					      dev->caps.num_ports) + 1;
+
+			/* Calculate the real slave number */
+			for (i = 1; i < max_port_p_one; i++) {
+				if (i == port)
+					continue;
+				bitmap_zero(exclusive_ports.ports,
+					    dev->caps.num_ports);
+				set_bit(i - 1, exclusive_ports.ports);
+				slaves_pport_actv =
+					mlx4_phys_to_slaves_pport_actv(
+						dev, &exclusive_ports);
+				slave_gid += bitmap_weight(
+						slaves_pport_actv.slaves,
+						dev->num_vfs + 1);
+			}
+		}
+		*slave_id = slave_gid;
+	}
+
+	return (found_ix >= 0) ? 0 : -EINVAL;
+}
+EXPORT_SYMBOL(mlx4_get_slave_from_roce_gid);
+
+int mlx4_get_roce_gid_from_slave(struct mlx4_dev *dev, int port, int slave_id,
+				 u8 *gid)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	if (!mlx4_is_master(dev))
+		return -EINVAL;
+
+	memcpy(gid, priv->port[port].gid_table.roce_gids[slave_id].raw,
+	       MLX4_ROCE_GID_ENTRY_SIZE);
+	return 0;
+}
+EXPORT_SYMBOL(mlx4_get_roce_gid_from_slave);
diff --git a/drivers/net/ethernet/mellanox/mlx4/profile.c b/drivers/net/ethernet/mellanox/mlx4/profile.c
new file mode 100644
index 0000000..14089d9
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/profile.c
@@ -0,0 +1,266 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/slab.h>
+
+#include "mlx4.h"
+#include "fw.h"
+
+enum {
+	MLX4_RES_QP,
+	MLX4_RES_RDMARC,
+	MLX4_RES_ALTC,
+	MLX4_RES_AUXC,
+	MLX4_RES_SRQ,
+	MLX4_RES_CQ,
+	MLX4_RES_EQ,
+	MLX4_RES_DMPT,
+	MLX4_RES_CMPT,
+	MLX4_RES_MTT,
+	MLX4_RES_MCG,
+	MLX4_RES_NUM
+};
+
+static const char *res_name[] = {
+	[MLX4_RES_QP]		= "QP",
+	[MLX4_RES_RDMARC]	= "RDMARC",
+	[MLX4_RES_ALTC]		= "ALTC",
+	[MLX4_RES_AUXC]		= "AUXC",
+	[MLX4_RES_SRQ]		= "SRQ",
+	[MLX4_RES_CQ]		= "CQ",
+	[MLX4_RES_EQ]		= "EQ",
+	[MLX4_RES_DMPT]		= "DMPT",
+	[MLX4_RES_CMPT]		= "CMPT",
+	[MLX4_RES_MTT]		= "MTT",
+	[MLX4_RES_MCG]		= "MCG",
+};
+
+u64 mlx4_make_profile(struct mlx4_dev *dev,
+		      struct mlx4_profile *request,
+		      struct mlx4_dev_cap *dev_cap,
+		      struct mlx4_init_hca_param *init_hca)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource {
+		u64 size;
+		u64 start;
+		int type;
+		u32 num;
+		int log_num;
+	};
+
+	u64 total_size = 0;
+	struct mlx4_resource *profile;
+	struct mlx4_resource tmp;
+	struct sysinfo si;
+	int i, j;
+
+	profile = kcalloc(MLX4_RES_NUM, sizeof(*profile), GFP_KERNEL);
+	if (!profile)
+		return -ENOMEM;
+
+	/*
+	 * We want to scale the number of MTTs with the size of the
+	 * system memory, since it makes sense to register a lot of
+	 * memory on a system with a lot of memory.  As a heuristic,
+	 * make sure we have enough MTTs to cover twice the system
+	 * memory (with PAGE_SIZE entries).
+	 *
+	 * This number has to be a power of two and fit into 32 bits
+	 * due to device limitations, so cap this at 2^31 as well.
+	 * That limits us to 8TB of memory registration per HCA with
+	 * 4KB pages, which is probably OK for the next few months.
+	 */
+	si_meminfo(&si);
+	request->num_mtt =
+		roundup_pow_of_two(max_t(unsigned, request->num_mtt,
+					 min(1UL << (31 - log_mtts_per_seg),
+					     si.totalram >> (log_mtts_per_seg - 1))));
+
+	profile[MLX4_RES_QP].size     = dev_cap->qpc_entry_sz;
+	profile[MLX4_RES_RDMARC].size = dev_cap->rdmarc_entry_sz;
+	profile[MLX4_RES_ALTC].size   = dev_cap->altc_entry_sz;
+	profile[MLX4_RES_AUXC].size   = dev_cap->aux_entry_sz;
+	profile[MLX4_RES_SRQ].size    = dev_cap->srq_entry_sz;
+	profile[MLX4_RES_CQ].size     = dev_cap->cqc_entry_sz;
+	profile[MLX4_RES_EQ].size     = dev_cap->eqc_entry_sz;
+	profile[MLX4_RES_DMPT].size   = dev_cap->dmpt_entry_sz;
+	profile[MLX4_RES_CMPT].size   = dev_cap->cmpt_entry_sz;
+	profile[MLX4_RES_MTT].size    = dev_cap->mtt_entry_sz;
+	profile[MLX4_RES_MCG].size    = mlx4_get_mgm_entry_size(dev);
+
+	profile[MLX4_RES_QP].num      = request->num_qp;
+	profile[MLX4_RES_RDMARC].num  = request->num_qp * request->rdmarc_per_qp;
+	profile[MLX4_RES_ALTC].num    = request->num_qp;
+	profile[MLX4_RES_AUXC].num    = request->num_qp;
+	profile[MLX4_RES_SRQ].num     = request->num_srq;
+	profile[MLX4_RES_CQ].num      = request->num_cq;
+	profile[MLX4_RES_EQ].num      = mlx4_is_mfunc(dev) ?
+					dev->phys_caps.num_phys_eqs :
+					min_t(unsigned, dev_cap->max_eqs, MAX_MSIX);
+	profile[MLX4_RES_DMPT].num    = request->num_mpt;
+	profile[MLX4_RES_CMPT].num    = MLX4_NUM_CMPTS;
+	profile[MLX4_RES_MTT].num     = request->num_mtt * (1 << log_mtts_per_seg);
+	profile[MLX4_RES_MCG].num     = request->num_mcg;
+
+	for (i = 0; i < MLX4_RES_NUM; ++i) {
+		profile[i].type     = i;
+		profile[i].num      = roundup_pow_of_two(profile[i].num);
+		profile[i].log_num  = ilog2(profile[i].num);
+		profile[i].size    *= profile[i].num;
+		profile[i].size     = max(profile[i].size, (u64) PAGE_SIZE);
+	}
+
+	/*
+	 * Sort the resources in decreasing order of size.  Since they
+	 * all have sizes that are powers of 2, we'll be able to keep
+	 * resources aligned to their size and pack them without gaps
+	 * using the sorted order.
+	 */
+	for (i = MLX4_RES_NUM; i > 0; --i)
+		for (j = 1; j < i; ++j) {
+			if (profile[j].size > profile[j - 1].size) {
+				tmp	       = profile[j];
+				profile[j]     = profile[j - 1];
+				profile[j - 1] = tmp;
+			}
+		}
+
+	for (i = 0; i < MLX4_RES_NUM; ++i) {
+		if (profile[i].size) {
+			profile[i].start = total_size;
+			total_size	+= profile[i].size;
+		}
+
+		if (total_size > dev_cap->max_icm_sz) {
+			mlx4_err(dev, "Profile requires 0x%llx bytes; won't fit in 0x%llx bytes of context memory\n",
+				 (unsigned long long) total_size,
+				 (unsigned long long) dev_cap->max_icm_sz);
+			kfree(profile);
+			return -ENOMEM;
+		}
+
+		if (profile[i].size)
+			mlx4_dbg(dev, "  profile[%2d] (%6s): 2^%02d entries @ 0x%10llx, size 0x%10llx\n",
+				 i, res_name[profile[i].type],
+				 profile[i].log_num,
+				 (unsigned long long) profile[i].start,
+				 (unsigned long long) profile[i].size);
+	}
+
+	mlx4_dbg(dev, "HCA context memory: reserving %d KB\n",
+		 (int) (total_size >> 10));
+
+	for (i = 0; i < MLX4_RES_NUM; ++i) {
+		switch (profile[i].type) {
+		case MLX4_RES_QP:
+			dev->caps.num_qps     = profile[i].num;
+			init_hca->qpc_base    = profile[i].start;
+			init_hca->log_num_qps = profile[i].log_num;
+			break;
+		case MLX4_RES_RDMARC:
+			for (priv->qp_table.rdmarc_shift = 0;
+			     request->num_qp << priv->qp_table.rdmarc_shift < profile[i].num;
+			     ++priv->qp_table.rdmarc_shift)
+				; /* nothing */
+			dev->caps.max_qp_dest_rdma = 1 << priv->qp_table.rdmarc_shift;
+			priv->qp_table.rdmarc_base   = (u32) profile[i].start;
+			init_hca->rdmarc_base	     = profile[i].start;
+			init_hca->log_rd_per_qp	     = priv->qp_table.rdmarc_shift;
+			break;
+		case MLX4_RES_ALTC:
+			init_hca->altc_base = profile[i].start;
+			break;
+		case MLX4_RES_AUXC:
+			init_hca->auxc_base = profile[i].start;
+			break;
+		case MLX4_RES_SRQ:
+			dev->caps.num_srqs     = profile[i].num;
+			init_hca->srqc_base    = profile[i].start;
+			init_hca->log_num_srqs = profile[i].log_num;
+			break;
+		case MLX4_RES_CQ:
+			dev->caps.num_cqs     = profile[i].num;
+			init_hca->cqc_base    = profile[i].start;
+			init_hca->log_num_cqs = profile[i].log_num;
+			break;
+		case MLX4_RES_EQ:
+			dev->caps.num_eqs     = roundup_pow_of_two(min_t(unsigned, dev_cap->max_eqs,
+									 MAX_MSIX));
+			init_hca->eqc_base    = profile[i].start;
+			init_hca->log_num_eqs = ilog2(dev->caps.num_eqs);
+			break;
+		case MLX4_RES_DMPT:
+			dev->caps.num_mpts	= profile[i].num;
+			priv->mr_table.mpt_base = profile[i].start;
+			init_hca->dmpt_base	= profile[i].start;
+			init_hca->log_mpt_sz	= profile[i].log_num;
+			break;
+		case MLX4_RES_CMPT:
+			init_hca->cmpt_base	 = profile[i].start;
+			break;
+		case MLX4_RES_MTT:
+			dev->caps.num_mtts	 = profile[i].num;
+			priv->mr_table.mtt_base	 = profile[i].start;
+			init_hca->mtt_base	 = profile[i].start;
+			break;
+		case MLX4_RES_MCG:
+			init_hca->mc_base	  = profile[i].start;
+			init_hca->log_mc_entry_sz =
+					ilog2(mlx4_get_mgm_entry_size(dev));
+			init_hca->log_mc_table_sz = profile[i].log_num;
+			if (dev->caps.steering_mode ==
+			    MLX4_STEERING_MODE_DEVICE_MANAGED) {
+				dev->caps.num_mgms = profile[i].num;
+			} else {
+				init_hca->log_mc_hash_sz =
+						profile[i].log_num - 1;
+				dev->caps.num_mgms = profile[i].num >> 1;
+				dev->caps.num_amgms = profile[i].num >> 1;
+			}
+			break;
+		default:
+			break;
+		}
+	}
+
+	/*
+	 * PDs don't take any HCA memory, but we assign them as part
+	 * of the HCA profile anyway.
+	 */
+	dev->caps.num_pds = MLX4_NUM_PDS;
+
+	kfree(profile);
+	return total_size;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx4/qp.c b/drivers/net/ethernet/mellanox/mlx4/qp.c
new file mode 100644
index 0000000..2301365
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/qp.c
@@ -0,0 +1,633 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/gfp.h>
+#include <linux/export.h>
+
+#include <linux/mlx4/cmd.h>
+#include <linux/mlx4/qp.h>
+
+#include "mlx4.h"
+#include "icm.h"
+
+void mlx4_qp_event(struct mlx4_dev *dev, u32 qpn, int event_type)
+{
+	struct mlx4_qp_table *qp_table = &mlx4_priv(dev)->qp_table;
+	struct mlx4_qp *qp;
+
+	spin_lock(&qp_table->lock);
+
+	qp = __mlx4_qp_lookup(dev, qpn);
+	if (qp)
+		atomic_inc(&qp->refcount);
+
+	spin_unlock(&qp_table->lock);
+
+	if (!qp) {
+		mlx4_dbg(dev, "Async event for none existent QP %08x\n", qpn);
+		return;
+	}
+
+	qp->event(qp, event_type);
+
+	if (atomic_dec_and_test(&qp->refcount))
+		complete(&qp->free);
+}
+
+/* used for INIT/CLOSE port logic */
+static int is_master_qp0(struct mlx4_dev *dev, struct mlx4_qp *qp, int *real_qp0, int *proxy_qp0)
+{
+	/* this procedure is called after we already know we are on the master */
+	/* qp0 is either the proxy qp0, or the real qp0 */
+	u32 pf_proxy_offset = dev->phys_caps.base_proxy_sqpn + 8 * mlx4_master_func_num(dev);
+	*proxy_qp0 = qp->qpn >= pf_proxy_offset && qp->qpn <= pf_proxy_offset + 1;
+
+	*real_qp0 = qp->qpn >= dev->phys_caps.base_sqpn &&
+		qp->qpn <= dev->phys_caps.base_sqpn + 1;
+
+	return *real_qp0 || *proxy_qp0;
+}
+
+static int __mlx4_qp_modify(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
+		     enum mlx4_qp_state cur_state, enum mlx4_qp_state new_state,
+		     struct mlx4_qp_context *context,
+		     enum mlx4_qp_optpar optpar,
+		     int sqd_event, struct mlx4_qp *qp, int native)
+{
+	static const u16 op[MLX4_QP_NUM_STATE][MLX4_QP_NUM_STATE] = {
+		[MLX4_QP_STATE_RST] = {
+			[MLX4_QP_STATE_RST]	= MLX4_CMD_2RST_QP,
+			[MLX4_QP_STATE_ERR]	= MLX4_CMD_2ERR_QP,
+			[MLX4_QP_STATE_INIT]	= MLX4_CMD_RST2INIT_QP,
+		},
+		[MLX4_QP_STATE_INIT]  = {
+			[MLX4_QP_STATE_RST]	= MLX4_CMD_2RST_QP,
+			[MLX4_QP_STATE_ERR]	= MLX4_CMD_2ERR_QP,
+			[MLX4_QP_STATE_INIT]	= MLX4_CMD_INIT2INIT_QP,
+			[MLX4_QP_STATE_RTR]	= MLX4_CMD_INIT2RTR_QP,
+		},
+		[MLX4_QP_STATE_RTR]   = {
+			[MLX4_QP_STATE_RST]	= MLX4_CMD_2RST_QP,
+			[MLX4_QP_STATE_ERR]	= MLX4_CMD_2ERR_QP,
+			[MLX4_QP_STATE_RTS]	= MLX4_CMD_RTR2RTS_QP,
+		},
+		[MLX4_QP_STATE_RTS]   = {
+			[MLX4_QP_STATE_RST]	= MLX4_CMD_2RST_QP,
+			[MLX4_QP_STATE_ERR]	= MLX4_CMD_2ERR_QP,
+			[MLX4_QP_STATE_RTS]	= MLX4_CMD_RTS2RTS_QP,
+			[MLX4_QP_STATE_SQD]	= MLX4_CMD_RTS2SQD_QP,
+		},
+		[MLX4_QP_STATE_SQD] = {
+			[MLX4_QP_STATE_RST]	= MLX4_CMD_2RST_QP,
+			[MLX4_QP_STATE_ERR]	= MLX4_CMD_2ERR_QP,
+			[MLX4_QP_STATE_RTS]	= MLX4_CMD_SQD2RTS_QP,
+			[MLX4_QP_STATE_SQD]	= MLX4_CMD_SQD2SQD_QP,
+		},
+		[MLX4_QP_STATE_SQER] = {
+			[MLX4_QP_STATE_RST]	= MLX4_CMD_2RST_QP,
+			[MLX4_QP_STATE_ERR]	= MLX4_CMD_2ERR_QP,
+			[MLX4_QP_STATE_RTS]	= MLX4_CMD_SQERR2RTS_QP,
+		},
+		[MLX4_QP_STATE_ERR] = {
+			[MLX4_QP_STATE_RST]	= MLX4_CMD_2RST_QP,
+			[MLX4_QP_STATE_ERR]	= MLX4_CMD_2ERR_QP,
+		}
+	};
+
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_cmd_mailbox *mailbox;
+	int ret = 0;
+	int real_qp0 = 0;
+	int proxy_qp0 = 0;
+	u8 port;
+
+	if (cur_state >= MLX4_QP_NUM_STATE || new_state >= MLX4_QP_NUM_STATE ||
+	    !op[cur_state][new_state])
+		return -EINVAL;
+
+	if (op[cur_state][new_state] == MLX4_CMD_2RST_QP) {
+		ret = mlx4_cmd(dev, 0, qp->qpn, 2,
+			MLX4_CMD_2RST_QP, MLX4_CMD_TIME_CLASS_A, native);
+		if (mlx4_is_master(dev) && cur_state != MLX4_QP_STATE_ERR &&
+		    cur_state != MLX4_QP_STATE_RST &&
+		    is_master_qp0(dev, qp, &real_qp0, &proxy_qp0)) {
+			port = (qp->qpn & 1) + 1;
+			if (proxy_qp0)
+				priv->mfunc.master.qp0_state[port].proxy_qp0_active = 0;
+			else
+				priv->mfunc.master.qp0_state[port].qp0_active = 0;
+		}
+		return ret;
+	}
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	if (cur_state == MLX4_QP_STATE_RST && new_state == MLX4_QP_STATE_INIT) {
+		u64 mtt_addr = mlx4_mtt_addr(dev, mtt);
+		context->mtt_base_addr_h = mtt_addr >> 32;
+		context->mtt_base_addr_l = cpu_to_be32(mtt_addr & 0xffffffff);
+		context->log_page_size   = mtt->page_shift - MLX4_ICM_PAGE_SHIFT;
+	}
+
+	*(__be32 *) mailbox->buf = cpu_to_be32(optpar);
+	memcpy(mailbox->buf + 8, context, sizeof *context);
+
+	((struct mlx4_qp_context *) (mailbox->buf + 8))->local_qpn =
+		cpu_to_be32(qp->qpn);
+
+	ret = mlx4_cmd(dev, mailbox->dma,
+		       qp->qpn | (!!sqd_event << 31),
+		       new_state == MLX4_QP_STATE_RST ? 2 : 0,
+		       op[cur_state][new_state], MLX4_CMD_TIME_CLASS_C, native);
+
+	if (mlx4_is_master(dev) && is_master_qp0(dev, qp, &real_qp0, &proxy_qp0)) {
+		port = (qp->qpn & 1) + 1;
+		if (cur_state != MLX4_QP_STATE_ERR &&
+		    cur_state != MLX4_QP_STATE_RST &&
+		    new_state == MLX4_QP_STATE_ERR) {
+			if (proxy_qp0)
+				priv->mfunc.master.qp0_state[port].proxy_qp0_active = 0;
+			else
+				priv->mfunc.master.qp0_state[port].qp0_active = 0;
+		} else if (new_state == MLX4_QP_STATE_RTR) {
+			if (proxy_qp0)
+				priv->mfunc.master.qp0_state[port].proxy_qp0_active = 1;
+			else
+				priv->mfunc.master.qp0_state[port].qp0_active = 1;
+		}
+	}
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return ret;
+}
+
+int mlx4_qp_modify(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
+		   enum mlx4_qp_state cur_state, enum mlx4_qp_state new_state,
+		   struct mlx4_qp_context *context,
+		   enum mlx4_qp_optpar optpar,
+		   int sqd_event, struct mlx4_qp *qp)
+{
+	return __mlx4_qp_modify(dev, mtt, cur_state, new_state, context,
+				optpar, sqd_event, qp, 0);
+}
+EXPORT_SYMBOL_GPL(mlx4_qp_modify);
+
+int __mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align,
+				   int *base)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_qp_table *qp_table = &priv->qp_table;
+
+	*base = mlx4_bitmap_alloc_range(&qp_table->bitmap, cnt, align);
+	if (*base == -1)
+		return -ENOMEM;
+
+	return 0;
+}
+
+int mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align, int *base)
+{
+	u64 in_param = 0;
+	u64 out_param;
+	int err;
+
+	if (mlx4_is_mfunc(dev)) {
+		set_param_l(&in_param, cnt);
+		set_param_h(&in_param, align);
+		err = mlx4_cmd_imm(dev, in_param, &out_param,
+				   RES_QP, RES_OP_RESERVE,
+				   MLX4_CMD_ALLOC_RES,
+				   MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED);
+		if (err)
+			return err;
+
+		*base = get_param_l(&out_param);
+		return 0;
+	}
+	return __mlx4_qp_reserve_range(dev, cnt, align, base);
+}
+EXPORT_SYMBOL_GPL(mlx4_qp_reserve_range);
+
+void __mlx4_qp_release_range(struct mlx4_dev *dev, int base_qpn, int cnt)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_qp_table *qp_table = &priv->qp_table;
+
+	if (mlx4_is_qp_reserved(dev, (u32) base_qpn))
+		return;
+	mlx4_bitmap_free_range(&qp_table->bitmap, base_qpn, cnt, MLX4_USE_RR);
+}
+
+void mlx4_qp_release_range(struct mlx4_dev *dev, int base_qpn, int cnt)
+{
+	u64 in_param = 0;
+	int err;
+
+	if (mlx4_is_mfunc(dev)) {
+		set_param_l(&in_param, base_qpn);
+		set_param_h(&in_param, cnt);
+		err = mlx4_cmd(dev, in_param, RES_QP, RES_OP_RESERVE,
+			       MLX4_CMD_FREE_RES,
+			       MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED);
+		if (err) {
+			mlx4_warn(dev, "Failed to release qp range base:%d cnt:%d\n",
+				  base_qpn, cnt);
+		}
+	} else
+		 __mlx4_qp_release_range(dev, base_qpn, cnt);
+}
+EXPORT_SYMBOL_GPL(mlx4_qp_release_range);
+
+int __mlx4_qp_alloc_icm(struct mlx4_dev *dev, int qpn, gfp_t gfp)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_qp_table *qp_table = &priv->qp_table;
+	int err;
+
+	err = mlx4_table_get(dev, &qp_table->qp_table, qpn, gfp);
+	if (err)
+		goto err_out;
+
+	err = mlx4_table_get(dev, &qp_table->auxc_table, qpn, gfp);
+	if (err)
+		goto err_put_qp;
+
+	err = mlx4_table_get(dev, &qp_table->altc_table, qpn, gfp);
+	if (err)
+		goto err_put_auxc;
+
+	err = mlx4_table_get(dev, &qp_table->rdmarc_table, qpn, gfp);
+	if (err)
+		goto err_put_altc;
+
+	err = mlx4_table_get(dev, &qp_table->cmpt_table, qpn, gfp);
+	if (err)
+		goto err_put_rdmarc;
+
+	return 0;
+
+err_put_rdmarc:
+	mlx4_table_put(dev, &qp_table->rdmarc_table, qpn);
+
+err_put_altc:
+	mlx4_table_put(dev, &qp_table->altc_table, qpn);
+
+err_put_auxc:
+	mlx4_table_put(dev, &qp_table->auxc_table, qpn);
+
+err_put_qp:
+	mlx4_table_put(dev, &qp_table->qp_table, qpn);
+
+err_out:
+	return err;
+}
+
+static int mlx4_qp_alloc_icm(struct mlx4_dev *dev, int qpn, gfp_t gfp)
+{
+	u64 param = 0;
+
+	if (mlx4_is_mfunc(dev)) {
+		set_param_l(&param, qpn);
+		return mlx4_cmd_imm(dev, param, &param, RES_QP, RES_OP_MAP_ICM,
+				    MLX4_CMD_ALLOC_RES, MLX4_CMD_TIME_CLASS_A,
+				    MLX4_CMD_WRAPPED);
+	}
+	return __mlx4_qp_alloc_icm(dev, qpn, gfp);
+}
+
+void __mlx4_qp_free_icm(struct mlx4_dev *dev, int qpn)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_qp_table *qp_table = &priv->qp_table;
+
+	mlx4_table_put(dev, &qp_table->cmpt_table, qpn);
+	mlx4_table_put(dev, &qp_table->rdmarc_table, qpn);
+	mlx4_table_put(dev, &qp_table->altc_table, qpn);
+	mlx4_table_put(dev, &qp_table->auxc_table, qpn);
+	mlx4_table_put(dev, &qp_table->qp_table, qpn);
+}
+
+static void mlx4_qp_free_icm(struct mlx4_dev *dev, int qpn)
+{
+	u64 in_param = 0;
+
+	if (mlx4_is_mfunc(dev)) {
+		set_param_l(&in_param, qpn);
+		if (mlx4_cmd(dev, in_param, RES_QP, RES_OP_MAP_ICM,
+			     MLX4_CMD_FREE_RES, MLX4_CMD_TIME_CLASS_A,
+			     MLX4_CMD_WRAPPED))
+			mlx4_warn(dev, "Failed to free icm of qp:%d\n", qpn);
+	} else
+		__mlx4_qp_free_icm(dev, qpn);
+}
+
+int mlx4_qp_alloc(struct mlx4_dev *dev, int qpn, struct mlx4_qp *qp, gfp_t gfp)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_qp_table *qp_table = &priv->qp_table;
+	int err;
+
+	if (!qpn)
+		return -EINVAL;
+
+	qp->qpn = qpn;
+
+	err = mlx4_qp_alloc_icm(dev, qpn, gfp);
+	if (err)
+		return err;
+
+	spin_lock_irq(&qp_table->lock);
+	err = radix_tree_insert(&dev->qp_table_tree, qp->qpn &
+				(dev->caps.num_qps - 1), qp);
+	spin_unlock_irq(&qp_table->lock);
+	if (err)
+		goto err_icm;
+
+	atomic_set(&qp->refcount, 1);
+	init_completion(&qp->free);
+
+	return 0;
+
+err_icm:
+	mlx4_qp_free_icm(dev, qpn);
+	return err;
+}
+
+EXPORT_SYMBOL_GPL(mlx4_qp_alloc);
+
+#define MLX4_UPDATE_QP_SUPPORTED_ATTRS MLX4_UPDATE_QP_SMAC
+int mlx4_update_qp(struct mlx4_dev *dev, u32 qpn,
+		   enum mlx4_update_qp_attr attr,
+		   struct mlx4_update_qp_params *params)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_update_qp_context *cmd;
+	u64 pri_addr_path_mask = 0;
+	u64 qp_mask = 0;
+	int err = 0;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	cmd = (struct mlx4_update_qp_context *)mailbox->buf;
+
+	if (!attr || (attr & ~MLX4_UPDATE_QP_SUPPORTED_ATTRS))
+		return -EINVAL;
+
+	if (attr & MLX4_UPDATE_QP_SMAC) {
+		pri_addr_path_mask |= 1ULL << MLX4_UPD_QP_PATH_MASK_MAC_INDEX;
+		cmd->qp_context.pri_path.grh_mylmc = params->smac_index;
+	}
+
+	if (attr & MLX4_UPDATE_QP_VSD) {
+		qp_mask |= 1ULL << MLX4_UPD_QP_MASK_VSD;
+		if (params->flags & MLX4_UPDATE_QP_PARAMS_FLAGS_VSD_ENABLE)
+			cmd->qp_context.param3 |= cpu_to_be32(MLX4_STRIP_VLAN);
+	}
+
+	cmd->primary_addr_path_mask = cpu_to_be64(pri_addr_path_mask);
+	cmd->qp_mask = cpu_to_be64(qp_mask);
+
+	err = mlx4_cmd(dev, mailbox->dma, qpn & 0xffffff, 0,
+		       MLX4_CMD_UPDATE_QP, MLX4_CMD_TIME_CLASS_A,
+		       MLX4_CMD_NATIVE);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_update_qp);
+
+void mlx4_qp_remove(struct mlx4_dev *dev, struct mlx4_qp *qp)
+{
+	struct mlx4_qp_table *qp_table = &mlx4_priv(dev)->qp_table;
+	unsigned long flags;
+
+	spin_lock_irqsave(&qp_table->lock, flags);
+	radix_tree_delete(&dev->qp_table_tree, qp->qpn & (dev->caps.num_qps - 1));
+	spin_unlock_irqrestore(&qp_table->lock, flags);
+}
+EXPORT_SYMBOL_GPL(mlx4_qp_remove);
+
+void mlx4_qp_free(struct mlx4_dev *dev, struct mlx4_qp *qp)
+{
+	if (atomic_dec_and_test(&qp->refcount))
+		complete(&qp->free);
+	wait_for_completion(&qp->free);
+
+	mlx4_qp_free_icm(dev, qp->qpn);
+}
+EXPORT_SYMBOL_GPL(mlx4_qp_free);
+
+static int mlx4_CONF_SPECIAL_QP(struct mlx4_dev *dev, u32 base_qpn)
+{
+	return mlx4_cmd(dev, 0, base_qpn, 0, MLX4_CMD_CONF_SPECIAL_QP,
+			MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE);
+}
+
+int mlx4_init_qp_table(struct mlx4_dev *dev)
+{
+	struct mlx4_qp_table *qp_table = &mlx4_priv(dev)->qp_table;
+	int err;
+	int reserved_from_top = 0;
+	int k;
+
+	spin_lock_init(&qp_table->lock);
+	INIT_RADIX_TREE(&dev->qp_table_tree, GFP_ATOMIC);
+	if (mlx4_is_slave(dev))
+		return 0;
+
+	/*
+	 * We reserve 2 extra QPs per port for the special QPs.  The
+	 * block of special QPs must be aligned to a multiple of 8, so
+	 * round up.
+	 *
+	 * We also reserve the MSB of the 24-bit QP number to indicate
+	 * that a QP is an XRC QP.
+	 */
+	dev->phys_caps.base_sqpn =
+		ALIGN(dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW], 8);
+
+	{
+		int sort[MLX4_NUM_QP_REGION];
+		int i, j, tmp;
+		int last_base = dev->caps.num_qps;
+
+		for (i = 1; i < MLX4_NUM_QP_REGION; ++i)
+			sort[i] = i;
+
+		for (i = MLX4_NUM_QP_REGION; i > 0; --i) {
+			for (j = 2; j < i; ++j) {
+				if (dev->caps.reserved_qps_cnt[sort[j]] >
+				    dev->caps.reserved_qps_cnt[sort[j - 1]]) {
+					tmp             = sort[j];
+					sort[j]         = sort[j - 1];
+					sort[j - 1]     = tmp;
+				}
+			}
+		}
+
+		for (i = 1; i < MLX4_NUM_QP_REGION; ++i) {
+			last_base -= dev->caps.reserved_qps_cnt[sort[i]];
+			dev->caps.reserved_qps_base[sort[i]] = last_base;
+			reserved_from_top +=
+				dev->caps.reserved_qps_cnt[sort[i]];
+		}
+
+	}
+
+       /* Reserve 8 real SQPs in both native and SRIOV modes.
+	* In addition, in SRIOV mode, reserve 8 proxy SQPs per function
+	* (for all PFs and VFs), and 8 corresponding tunnel QPs.
+	* Each proxy SQP works opposite its own tunnel QP.
+	*
+	* The QPs are arranged as follows:
+	* a. 8 real SQPs
+	* b. All the proxy SQPs (8 per function)
+	* c. All the tunnel QPs (8 per function)
+	*/
+
+	err = mlx4_bitmap_init(&qp_table->bitmap, dev->caps.num_qps,
+			       (1 << 23) - 1, mlx4_num_reserved_sqps(dev),
+			       reserved_from_top);
+	if (err)
+		return err;
+
+	if (mlx4_is_mfunc(dev)) {
+		/* for PPF use */
+		dev->phys_caps.base_proxy_sqpn = dev->phys_caps.base_sqpn + 8;
+		dev->phys_caps.base_tunnel_sqpn = dev->phys_caps.base_sqpn + 8 + 8 * MLX4_MFUNC_MAX;
+
+		/* In mfunc, calculate proxy and tunnel qp offsets for the PF here,
+		 * since the PF does not call mlx4_slave_caps */
+		dev->caps.qp0_tunnel = kcalloc(dev->caps.num_ports, sizeof (u32), GFP_KERNEL);
+		dev->caps.qp0_proxy = kcalloc(dev->caps.num_ports, sizeof (u32), GFP_KERNEL);
+		dev->caps.qp1_tunnel = kcalloc(dev->caps.num_ports, sizeof (u32), GFP_KERNEL);
+		dev->caps.qp1_proxy = kcalloc(dev->caps.num_ports, sizeof (u32), GFP_KERNEL);
+
+		if (!dev->caps.qp0_tunnel || !dev->caps.qp0_proxy ||
+		    !dev->caps.qp1_tunnel || !dev->caps.qp1_proxy) {
+			err = -ENOMEM;
+			goto err_mem;
+		}
+
+		for (k = 0; k < dev->caps.num_ports; k++) {
+			dev->caps.qp0_proxy[k] = dev->phys_caps.base_proxy_sqpn +
+				8 * mlx4_master_func_num(dev) + k;
+			dev->caps.qp0_tunnel[k] = dev->caps.qp0_proxy[k] + 8 * MLX4_MFUNC_MAX;
+			dev->caps.qp1_proxy[k] = dev->phys_caps.base_proxy_sqpn +
+				8 * mlx4_master_func_num(dev) + MLX4_MAX_PORTS + k;
+			dev->caps.qp1_tunnel[k] = dev->caps.qp1_proxy[k] + 8 * MLX4_MFUNC_MAX;
+		}
+	}
+
+
+	err = mlx4_CONF_SPECIAL_QP(dev, dev->phys_caps.base_sqpn);
+	if (err)
+		goto err_mem;
+	return 0;
+
+err_mem:
+	kfree(dev->caps.qp0_tunnel);
+	kfree(dev->caps.qp0_proxy);
+	kfree(dev->caps.qp1_tunnel);
+	kfree(dev->caps.qp1_proxy);
+	dev->caps.qp0_tunnel = dev->caps.qp0_proxy =
+		dev->caps.qp1_tunnel = dev->caps.qp1_proxy = NULL;
+	return err;
+}
+
+void mlx4_cleanup_qp_table(struct mlx4_dev *dev)
+{
+	if (mlx4_is_slave(dev))
+		return;
+
+	mlx4_CONF_SPECIAL_QP(dev, 0);
+	mlx4_bitmap_cleanup(&mlx4_priv(dev)->qp_table.bitmap);
+}
+
+int mlx4_qp_query(struct mlx4_dev *dev, struct mlx4_qp *qp,
+		  struct mlx4_qp_context *context)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	int err;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	err = mlx4_cmd_box(dev, 0, mailbox->dma, qp->qpn, 0,
+			   MLX4_CMD_QUERY_QP, MLX4_CMD_TIME_CLASS_A,
+			   MLX4_CMD_WRAPPED);
+	if (!err)
+		memcpy(context, mailbox->buf + 8, sizeof *context);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_qp_query);
+
+int mlx4_qp_to_ready(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
+		     struct mlx4_qp_context *context,
+		     struct mlx4_qp *qp, enum mlx4_qp_state *qp_state)
+{
+	int err;
+	int i;
+	enum mlx4_qp_state states[] = {
+		MLX4_QP_STATE_RST,
+		MLX4_QP_STATE_INIT,
+		MLX4_QP_STATE_RTR,
+		MLX4_QP_STATE_RTS
+	};
+
+	for (i = 0; i < ARRAY_SIZE(states) - 1; i++) {
+		context->flags &= cpu_to_be32(~(0xf << 28));
+		context->flags |= cpu_to_be32(states[i + 1] << 28);
+		err = mlx4_qp_modify(dev, mtt, states[i], states[i + 1],
+				     context, 0, 0, qp);
+		if (err) {
+			mlx4_err(dev, "Failed to bring QP to state: %d with error: %d\n",
+				 states[i + 1], err);
+			return err;
+		}
+
+		*qp_state = states[i + 1];
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_qp_to_ready);
diff --git a/drivers/net/ethernet/mellanox/mlx4/reset.c b/drivers/net/ethernet/mellanox/mlx4/reset.c
new file mode 100644
index 0000000..ea1c6d0
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/reset.c
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc.  All rights reserved.
+ * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/pci.h>
+#include <linux/delay.h>
+#include <linux/slab.h>
+#include <linux/jiffies.h>
+
+#include "mlx4.h"
+
+int mlx4_reset(struct mlx4_dev *dev)
+{
+	void __iomem *reset;
+	u32 *hca_header = NULL;
+	int pcie_cap;
+	u16 devctl;
+	u16 linkctl;
+	u16 vendor;
+	unsigned long end;
+	u32 sem;
+	int i;
+	int err = 0;
+
+#define MLX4_RESET_BASE		0xf0000
+#define MLX4_RESET_SIZE		  0x400
+#define MLX4_SEM_OFFSET		  0x3fc
+#define MLX4_RESET_OFFSET	   0x10
+#define MLX4_RESET_VALUE	swab32(1)
+
+#define MLX4_SEM_TIMEOUT_JIFFIES	(10 * HZ)
+#define MLX4_RESET_TIMEOUT_JIFFIES	(2 * HZ)
+
+	/*
+	 * Reset the chip.  This is somewhat ugly because we have to
+	 * save off the PCI header before reset and then restore it
+	 * after the chip reboots.  We skip config space offsets 22
+	 * and 23 since those have a special meaning.
+	 */
+
+	/* Do we need to save off the full 4K PCI Express header?? */
+	hca_header = kmalloc(256, GFP_KERNEL);
+	if (!hca_header) {
+		err = -ENOMEM;
+		mlx4_err(dev, "Couldn't allocate memory to save HCA PCI header, aborting\n");
+		goto out;
+	}
+
+	pcie_cap = pci_pcie_cap(dev->pdev);
+
+	for (i = 0; i < 64; ++i) {
+		if (i == 22 || i == 23)
+			continue;
+		if (pci_read_config_dword(dev->pdev, i * 4, hca_header + i)) {
+			err = -ENODEV;
+			mlx4_err(dev, "Couldn't save HCA PCI header, aborting\n");
+			goto out;
+		}
+	}
+
+	reset = ioremap(pci_resource_start(dev->pdev, 0) + MLX4_RESET_BASE,
+			MLX4_RESET_SIZE);
+	if (!reset) {
+		err = -ENOMEM;
+		mlx4_err(dev, "Couldn't map HCA reset register, aborting\n");
+		goto out;
+	}
+
+	/* grab HW semaphore to lock out flash updates */
+	end = jiffies + MLX4_SEM_TIMEOUT_JIFFIES;
+	do {
+		sem = readl(reset + MLX4_SEM_OFFSET);
+		if (!sem)
+			break;
+
+		msleep(1);
+	} while (time_before(jiffies, end));
+
+	if (sem) {
+		mlx4_err(dev, "Failed to obtain HW semaphore, aborting\n");
+		err = -EAGAIN;
+		iounmap(reset);
+		goto out;
+	}
+
+	/* actually hit reset */
+	writel(MLX4_RESET_VALUE, reset + MLX4_RESET_OFFSET);
+	iounmap(reset);
+
+	/* Docs say to wait one second before accessing device */
+	msleep(1000);
+
+	end = jiffies + MLX4_RESET_TIMEOUT_JIFFIES;
+	do {
+		if (!pci_read_config_word(dev->pdev, PCI_VENDOR_ID, &vendor) &&
+		    vendor != 0xffff)
+			break;
+
+		msleep(1);
+	} while (time_before(jiffies, end));
+
+	if (vendor == 0xffff) {
+		err = -ENODEV;
+		mlx4_err(dev, "PCI device did not come back after reset, aborting\n");
+		goto out;
+	}
+
+	/* Now restore the PCI headers */
+	if (pcie_cap) {
+		devctl = hca_header[(pcie_cap + PCI_EXP_DEVCTL) / 4];
+		if (pcie_capability_write_word(dev->pdev, PCI_EXP_DEVCTL,
+					       devctl)) {
+			err = -ENODEV;
+			mlx4_err(dev, "Couldn't restore HCA PCI Express Device Control register, aborting\n");
+			goto out;
+		}
+		linkctl = hca_header[(pcie_cap + PCI_EXP_LNKCTL) / 4];
+		if (pcie_capability_write_word(dev->pdev, PCI_EXP_LNKCTL,
+					       linkctl)) {
+			err = -ENODEV;
+			mlx4_err(dev, "Couldn't restore HCA PCI Express Link control register, aborting\n");
+			goto out;
+		}
+	}
+
+	for (i = 0; i < 16; ++i) {
+		if (i * 4 == PCI_COMMAND)
+			continue;
+
+		if (pci_write_config_dword(dev->pdev, i * 4, hca_header[i])) {
+			err = -ENODEV;
+			mlx4_err(dev, "Couldn't restore HCA reg %x, aborting\n",
+				 i);
+			goto out;
+		}
+	}
+
+	if (pci_write_config_dword(dev->pdev, PCI_COMMAND,
+				   hca_header[PCI_COMMAND / 4])) {
+		err = -ENODEV;
+		mlx4_err(dev, "Couldn't restore HCA COMMAND, aborting\n");
+		goto out;
+	}
+
+out:
+	kfree(hca_header);
+
+	return err;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
new file mode 100644
index 0000000..cd5cf6d
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
@@ -0,0 +1,4916 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies.
+ * All rights reserved.
+ * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/sched.h>
+#include <linux/pci.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/io.h>
+#include <linux/slab.h>
+#include <linux/mlx4/cmd.h>
+#include <linux/mlx4/qp.h>
+#include <linux/if_ether.h>
+#include <linux/etherdevice.h>
+
+#include "mlx4.h"
+#include "fw.h"
+
+#define MLX4_MAC_VALID		(1ull << 63)
+
+struct mac_res {
+	struct list_head list;
+	u64 mac;
+	int ref_count;
+	u8 smac_index;
+	u8 port;
+};
+
+struct vlan_res {
+	struct list_head list;
+	u16 vlan;
+	int ref_count;
+	int vlan_index;
+	u8 port;
+};
+
+struct res_common {
+	struct list_head	list;
+	struct rb_node		node;
+	u64		        res_id;
+	int			owner;
+	int			state;
+	int			from_state;
+	int			to_state;
+	int			removing;
+};
+
+enum {
+	RES_ANY_BUSY = 1
+};
+
+struct res_gid {
+	struct list_head	list;
+	u8			gid[16];
+	enum mlx4_protocol	prot;
+	enum mlx4_steer_type	steer;
+	u64			reg_id;
+};
+
+enum res_qp_states {
+	RES_QP_BUSY = RES_ANY_BUSY,
+
+	/* QP number was allocated */
+	RES_QP_RESERVED,
+
+	/* ICM memory for QP context was mapped */
+	RES_QP_MAPPED,
+
+	/* QP is in hw ownership */
+	RES_QP_HW
+};
+
+struct res_qp {
+	struct res_common	com;
+	struct res_mtt	       *mtt;
+	struct res_cq	       *rcq;
+	struct res_cq	       *scq;
+	struct res_srq	       *srq;
+	struct list_head	mcg_list;
+	spinlock_t		mcg_spl;
+	int			local_qpn;
+	atomic_t		ref_count;
+	u32			qpc_flags;
+	/* saved qp params before VST enforcement in order to restore on VGT */
+	u8			sched_queue;
+	__be32			param3;
+	u8			vlan_control;
+	u8			fvl_rx;
+	u8			pri_path_fl;
+	u8			vlan_index;
+	u8			feup;
+};
+
+enum res_mtt_states {
+	RES_MTT_BUSY = RES_ANY_BUSY,
+	RES_MTT_ALLOCATED,
+};
+
+static inline const char *mtt_states_str(enum res_mtt_states state)
+{
+	switch (state) {
+	case RES_MTT_BUSY: return "RES_MTT_BUSY";
+	case RES_MTT_ALLOCATED: return "RES_MTT_ALLOCATED";
+	default: return "Unknown";
+	}
+}
+
+struct res_mtt {
+	struct res_common	com;
+	int			order;
+	atomic_t		ref_count;
+};
+
+enum res_mpt_states {
+	RES_MPT_BUSY = RES_ANY_BUSY,
+	RES_MPT_RESERVED,
+	RES_MPT_MAPPED,
+	RES_MPT_HW,
+};
+
+struct res_mpt {
+	struct res_common	com;
+	struct res_mtt	       *mtt;
+	int			key;
+};
+
+enum res_eq_states {
+	RES_EQ_BUSY = RES_ANY_BUSY,
+	RES_EQ_RESERVED,
+	RES_EQ_HW,
+};
+
+struct res_eq {
+	struct res_common	com;
+	struct res_mtt	       *mtt;
+};
+
+enum res_cq_states {
+	RES_CQ_BUSY = RES_ANY_BUSY,
+	RES_CQ_ALLOCATED,
+	RES_CQ_HW,
+};
+
+struct res_cq {
+	struct res_common	com;
+	struct res_mtt	       *mtt;
+	atomic_t		ref_count;
+};
+
+enum res_srq_states {
+	RES_SRQ_BUSY = RES_ANY_BUSY,
+	RES_SRQ_ALLOCATED,
+	RES_SRQ_HW,
+};
+
+struct res_srq {
+	struct res_common	com;
+	struct res_mtt	       *mtt;
+	struct res_cq	       *cq;
+	atomic_t		ref_count;
+};
+
+enum res_counter_states {
+	RES_COUNTER_BUSY = RES_ANY_BUSY,
+	RES_COUNTER_ALLOCATED,
+};
+
+struct res_counter {
+	struct res_common	com;
+	int			port;
+};
+
+enum res_xrcdn_states {
+	RES_XRCD_BUSY = RES_ANY_BUSY,
+	RES_XRCD_ALLOCATED,
+};
+
+struct res_xrcdn {
+	struct res_common	com;
+	int			port;
+};
+
+enum res_fs_rule_states {
+	RES_FS_RULE_BUSY = RES_ANY_BUSY,
+	RES_FS_RULE_ALLOCATED,
+};
+
+struct res_fs_rule {
+	struct res_common	com;
+	int			qpn;
+};
+
+static int mlx4_is_eth(struct mlx4_dev *dev, int port)
+{
+	return dev->caps.port_mask[port] == MLX4_PORT_TYPE_IB ? 0 : 1;
+}
+
+static void *res_tracker_lookup(struct rb_root *root, u64 res_id)
+{
+	struct rb_node *node = root->rb_node;
+
+	while (node) {
+		struct res_common *res = container_of(node, struct res_common,
+						      node);
+
+		if (res_id < res->res_id)
+			node = node->rb_left;
+		else if (res_id > res->res_id)
+			node = node->rb_right;
+		else
+			return res;
+	}
+	return NULL;
+}
+
+static int res_tracker_insert(struct rb_root *root, struct res_common *res)
+{
+	struct rb_node **new = &(root->rb_node), *parent = NULL;
+
+	/* Figure out where to put new node */
+	while (*new) {
+		struct res_common *this = container_of(*new, struct res_common,
+						       node);
+
+		parent = *new;
+		if (res->res_id < this->res_id)
+			new = &((*new)->rb_left);
+		else if (res->res_id > this->res_id)
+			new = &((*new)->rb_right);
+		else
+			return -EEXIST;
+	}
+
+	/* Add new node and rebalance tree. */
+	rb_link_node(&res->node, parent, new);
+	rb_insert_color(&res->node, root);
+
+	return 0;
+}
+
+enum qp_transition {
+	QP_TRANS_INIT2RTR,
+	QP_TRANS_RTR2RTS,
+	QP_TRANS_RTS2RTS,
+	QP_TRANS_SQERR2RTS,
+	QP_TRANS_SQD2SQD,
+	QP_TRANS_SQD2RTS
+};
+
+/* For Debug uses */
+static const char *resource_str(enum mlx4_resource rt)
+{
+	switch (rt) {
+	case RES_QP: return "RES_QP";
+	case RES_CQ: return "RES_CQ";
+	case RES_SRQ: return "RES_SRQ";
+	case RES_MPT: return "RES_MPT";
+	case RES_MTT: return "RES_MTT";
+	case RES_MAC: return  "RES_MAC";
+	case RES_VLAN: return  "RES_VLAN";
+	case RES_EQ: return "RES_EQ";
+	case RES_COUNTER: return "RES_COUNTER";
+	case RES_FS_RULE: return "RES_FS_RULE";
+	case RES_XRCD: return "RES_XRCD";
+	default: return "Unknown resource type !!!";
+	};
+}
+
+static void rem_slave_vlans(struct mlx4_dev *dev, int slave);
+static inline int mlx4_grant_resource(struct mlx4_dev *dev, int slave,
+				      enum mlx4_resource res_type, int count,
+				      int port)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct resource_allocator *res_alloc =
+		&priv->mfunc.master.res_tracker.res_alloc[res_type];
+	int err = -EINVAL;
+	int allocated, free, reserved, guaranteed, from_free;
+	int from_rsvd;
+
+	if (slave > dev->num_vfs)
+		return -EINVAL;
+
+	spin_lock(&res_alloc->alloc_lock);
+	allocated = (port > 0) ?
+		res_alloc->allocated[(port - 1) * (dev->num_vfs + 1) + slave] :
+		res_alloc->allocated[slave];
+	free = (port > 0) ? res_alloc->res_port_free[port - 1] :
+		res_alloc->res_free;
+	reserved = (port > 0) ? res_alloc->res_port_rsvd[port - 1] :
+		res_alloc->res_reserved;
+	guaranteed = res_alloc->guaranteed[slave];
+
+	if (allocated + count > res_alloc->quota[slave]) {
+		mlx4_warn(dev, "VF %d port %d res %s: quota exceeded, count %d alloc %d quota %d\n",
+			  slave, port, resource_str(res_type), count,
+			  allocated, res_alloc->quota[slave]);
+		goto out;
+	}
+
+	if (allocated + count <= guaranteed) {
+		err = 0;
+		from_rsvd = count;
+	} else {
+		/* portion may need to be obtained from free area */
+		if (guaranteed - allocated > 0)
+			from_free = count - (guaranteed - allocated);
+		else
+			from_free = count;
+
+		from_rsvd = count - from_free;
+
+		if (free - from_free >= reserved)
+			err = 0;
+		else
+			mlx4_warn(dev, "VF %d port %d res %s: free pool empty, free %d from_free %d rsvd %d\n",
+				  slave, port, resource_str(res_type), free,
+				  from_free, reserved);
+	}
+
+	if (!err) {
+		/* grant the request */
+		if (port > 0) {
+			res_alloc->allocated[(port - 1) * (dev->num_vfs + 1) + slave] += count;
+			res_alloc->res_port_free[port - 1] -= count;
+			res_alloc->res_port_rsvd[port - 1] -= from_rsvd;
+		} else {
+			res_alloc->allocated[slave] += count;
+			res_alloc->res_free -= count;
+			res_alloc->res_reserved -= from_rsvd;
+		}
+	}
+
+out:
+	spin_unlock(&res_alloc->alloc_lock);
+	return err;
+}
+
+static inline void mlx4_release_resource(struct mlx4_dev *dev, int slave,
+				    enum mlx4_resource res_type, int count,
+				    int port)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct resource_allocator *res_alloc =
+		&priv->mfunc.master.res_tracker.res_alloc[res_type];
+	int allocated, guaranteed, from_rsvd;
+
+	if (slave > dev->num_vfs)
+		return;
+
+	spin_lock(&res_alloc->alloc_lock);
+
+	allocated = (port > 0) ?
+		res_alloc->allocated[(port - 1) * (dev->num_vfs + 1) + slave] :
+		res_alloc->allocated[slave];
+	guaranteed = res_alloc->guaranteed[slave];
+
+	if (allocated - count >= guaranteed) {
+		from_rsvd = 0;
+	} else {
+		/* portion may need to be returned to reserved area */
+		if (allocated - guaranteed > 0)
+			from_rsvd = count - (allocated - guaranteed);
+		else
+			from_rsvd = count;
+	}
+
+	if (port > 0) {
+		res_alloc->allocated[(port - 1) * (dev->num_vfs + 1) + slave] -= count;
+		res_alloc->res_port_free[port - 1] += count;
+		res_alloc->res_port_rsvd[port - 1] += from_rsvd;
+	} else {
+		res_alloc->allocated[slave] -= count;
+		res_alloc->res_free += count;
+		res_alloc->res_reserved += from_rsvd;
+	}
+
+	spin_unlock(&res_alloc->alloc_lock);
+	return;
+}
+
+static inline void initialize_res_quotas(struct mlx4_dev *dev,
+					 struct resource_allocator *res_alloc,
+					 enum mlx4_resource res_type,
+					 int vf, int num_instances)
+{
+	res_alloc->guaranteed[vf] = num_instances / (2 * (dev->num_vfs + 1));
+	res_alloc->quota[vf] = (num_instances / 2) + res_alloc->guaranteed[vf];
+	if (vf == mlx4_master_func_num(dev)) {
+		res_alloc->res_free = num_instances;
+		if (res_type == RES_MTT) {
+			/* reserved mtts will be taken out of the PF allocation */
+			res_alloc->res_free += dev->caps.reserved_mtts;
+			res_alloc->guaranteed[vf] += dev->caps.reserved_mtts;
+			res_alloc->quota[vf] += dev->caps.reserved_mtts;
+		}
+	}
+}
+
+void mlx4_init_quotas(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int pf;
+
+	/* quotas for VFs are initialized in mlx4_slave_cap */
+	if (mlx4_is_slave(dev))
+		return;
+
+	if (!mlx4_is_mfunc(dev)) {
+		dev->quotas.qp = dev->caps.num_qps - dev->caps.reserved_qps -
+			mlx4_num_reserved_sqps(dev);
+		dev->quotas.cq = dev->caps.num_cqs - dev->caps.reserved_cqs;
+		dev->quotas.srq = dev->caps.num_srqs - dev->caps.reserved_srqs;
+		dev->quotas.mtt = dev->caps.num_mtts - dev->caps.reserved_mtts;
+		dev->quotas.mpt = dev->caps.num_mpts - dev->caps.reserved_mrws;
+		return;
+	}
+
+	pf = mlx4_master_func_num(dev);
+	dev->quotas.qp =
+		priv->mfunc.master.res_tracker.res_alloc[RES_QP].quota[pf];
+	dev->quotas.cq =
+		priv->mfunc.master.res_tracker.res_alloc[RES_CQ].quota[pf];
+	dev->quotas.srq =
+		priv->mfunc.master.res_tracker.res_alloc[RES_SRQ].quota[pf];
+	dev->quotas.mtt =
+		priv->mfunc.master.res_tracker.res_alloc[RES_MTT].quota[pf];
+	dev->quotas.mpt =
+		priv->mfunc.master.res_tracker.res_alloc[RES_MPT].quota[pf];
+}
+int mlx4_init_resource_tracker(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int i, j;
+	int t;
+
+	priv->mfunc.master.res_tracker.slave_list =
+		kzalloc(dev->num_slaves * sizeof(struct slave_list),
+			GFP_KERNEL);
+	if (!priv->mfunc.master.res_tracker.slave_list)
+		return -ENOMEM;
+
+	for (i = 0 ; i < dev->num_slaves; i++) {
+		for (t = 0; t < MLX4_NUM_OF_RESOURCE_TYPE; ++t)
+			INIT_LIST_HEAD(&priv->mfunc.master.res_tracker.
+				       slave_list[i].res_list[t]);
+		mutex_init(&priv->mfunc.master.res_tracker.slave_list[i].mutex);
+	}
+
+	mlx4_dbg(dev, "Started init_resource_tracker: %ld slaves\n",
+		 dev->num_slaves);
+	for (i = 0 ; i < MLX4_NUM_OF_RESOURCE_TYPE; i++)
+		priv->mfunc.master.res_tracker.res_tree[i] = RB_ROOT;
+
+	for (i = 0; i < MLX4_NUM_OF_RESOURCE_TYPE; i++) {
+		struct resource_allocator *res_alloc =
+			&priv->mfunc.master.res_tracker.res_alloc[i];
+		res_alloc->quota = kmalloc((dev->num_vfs + 1) * sizeof(int), GFP_KERNEL);
+		res_alloc->guaranteed = kmalloc((dev->num_vfs + 1) * sizeof(int), GFP_KERNEL);
+		if (i == RES_MAC || i == RES_VLAN)
+			res_alloc->allocated = kzalloc(MLX4_MAX_PORTS *
+						       (dev->num_vfs + 1) * sizeof(int),
+							GFP_KERNEL);
+		else
+			res_alloc->allocated = kzalloc((dev->num_vfs + 1) * sizeof(int), GFP_KERNEL);
+
+		if (!res_alloc->quota || !res_alloc->guaranteed ||
+		    !res_alloc->allocated)
+			goto no_mem_err;
+
+		spin_lock_init(&res_alloc->alloc_lock);
+		for (t = 0; t < dev->num_vfs + 1; t++) {
+			struct mlx4_active_ports actv_ports =
+				mlx4_get_active_ports(dev, t);
+			switch (i) {
+			case RES_QP:
+				initialize_res_quotas(dev, res_alloc, RES_QP,
+						      t, dev->caps.num_qps -
+						      dev->caps.reserved_qps -
+						      mlx4_num_reserved_sqps(dev));
+				break;
+			case RES_CQ:
+				initialize_res_quotas(dev, res_alloc, RES_CQ,
+						      t, dev->caps.num_cqs -
+						      dev->caps.reserved_cqs);
+				break;
+			case RES_SRQ:
+				initialize_res_quotas(dev, res_alloc, RES_SRQ,
+						      t, dev->caps.num_srqs -
+						      dev->caps.reserved_srqs);
+				break;
+			case RES_MPT:
+				initialize_res_quotas(dev, res_alloc, RES_MPT,
+						      t, dev->caps.num_mpts -
+						      dev->caps.reserved_mrws);
+				break;
+			case RES_MTT:
+				initialize_res_quotas(dev, res_alloc, RES_MTT,
+						      t, dev->caps.num_mtts -
+						      dev->caps.reserved_mtts);
+				break;
+			case RES_MAC:
+				if (t == mlx4_master_func_num(dev)) {
+					int max_vfs_pport = 0;
+					/* Calculate the max vfs per port for */
+					/* both ports.			      */
+					for (j = 0; j < dev->caps.num_ports;
+					     j++) {
+						struct mlx4_slaves_pport slaves_pport =
+							mlx4_phys_to_slaves_pport(dev, j + 1);
+						unsigned current_slaves =
+							bitmap_weight(slaves_pport.slaves,
+								      dev->caps.num_ports) - 1;
+						if (max_vfs_pport < current_slaves)
+							max_vfs_pport =
+								current_slaves;
+					}
+					res_alloc->quota[t] =
+						MLX4_MAX_MAC_NUM -
+						2 * max_vfs_pport;
+					res_alloc->guaranteed[t] = 2;
+					for (j = 0; j < MLX4_MAX_PORTS; j++)
+						res_alloc->res_port_free[j] =
+							MLX4_MAX_MAC_NUM;
+				} else {
+					res_alloc->quota[t] = MLX4_MAX_MAC_NUM;
+					res_alloc->guaranteed[t] = 2;
+				}
+				break;
+			case RES_VLAN:
+				if (t == mlx4_master_func_num(dev)) {
+					res_alloc->quota[t] = MLX4_MAX_VLAN_NUM;
+					res_alloc->guaranteed[t] = MLX4_MAX_VLAN_NUM / 2;
+					for (j = 0; j < MLX4_MAX_PORTS; j++)
+						res_alloc->res_port_free[j] =
+							res_alloc->quota[t];
+				} else {
+					res_alloc->quota[t] = MLX4_MAX_VLAN_NUM / 2;
+					res_alloc->guaranteed[t] = 0;
+				}
+				break;
+			case RES_COUNTER:
+				res_alloc->quota[t] = dev->caps.max_counters;
+				res_alloc->guaranteed[t] = 0;
+				if (t == mlx4_master_func_num(dev))
+					res_alloc->res_free = res_alloc->quota[t];
+				break;
+			default:
+				break;
+			}
+			if (i == RES_MAC || i == RES_VLAN) {
+				for (j = 0; j < dev->caps.num_ports; j++)
+					if (test_bit(j, actv_ports.ports))
+						res_alloc->res_port_rsvd[j] +=
+							res_alloc->guaranteed[t];
+			} else {
+				res_alloc->res_reserved += res_alloc->guaranteed[t];
+			}
+		}
+	}
+	spin_lock_init(&priv->mfunc.master.res_tracker.lock);
+	return 0;
+
+no_mem_err:
+	for (i = 0; i < MLX4_NUM_OF_RESOURCE_TYPE; i++) {
+		kfree(priv->mfunc.master.res_tracker.res_alloc[i].allocated);
+		priv->mfunc.master.res_tracker.res_alloc[i].allocated = NULL;
+		kfree(priv->mfunc.master.res_tracker.res_alloc[i].guaranteed);
+		priv->mfunc.master.res_tracker.res_alloc[i].guaranteed = NULL;
+		kfree(priv->mfunc.master.res_tracker.res_alloc[i].quota);
+		priv->mfunc.master.res_tracker.res_alloc[i].quota = NULL;
+	}
+	return -ENOMEM;
+}
+
+void mlx4_free_resource_tracker(struct mlx4_dev *dev,
+				enum mlx4_res_tracker_free_type type)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int i;
+
+	if (priv->mfunc.master.res_tracker.slave_list) {
+		if (type != RES_TR_FREE_STRUCTS_ONLY) {
+			for (i = 0; i < dev->num_slaves; i++) {
+				if (type == RES_TR_FREE_ALL ||
+				    dev->caps.function != i)
+					mlx4_delete_all_resources_for_slave(dev, i);
+			}
+			/* free master's vlans */
+			i = dev->caps.function;
+			mlx4_reset_roce_gids(dev, i);
+			mutex_lock(&priv->mfunc.master.res_tracker.slave_list[i].mutex);
+			rem_slave_vlans(dev, i);
+			mutex_unlock(&priv->mfunc.master.res_tracker.slave_list[i].mutex);
+		}
+
+		if (type != RES_TR_FREE_SLAVES_ONLY) {
+			for (i = 0; i < MLX4_NUM_OF_RESOURCE_TYPE; i++) {
+				kfree(priv->mfunc.master.res_tracker.res_alloc[i].allocated);
+				priv->mfunc.master.res_tracker.res_alloc[i].allocated = NULL;
+				kfree(priv->mfunc.master.res_tracker.res_alloc[i].guaranteed);
+				priv->mfunc.master.res_tracker.res_alloc[i].guaranteed = NULL;
+				kfree(priv->mfunc.master.res_tracker.res_alloc[i].quota);
+				priv->mfunc.master.res_tracker.res_alloc[i].quota = NULL;
+			}
+			kfree(priv->mfunc.master.res_tracker.slave_list);
+			priv->mfunc.master.res_tracker.slave_list = NULL;
+		}
+	}
+}
+
+static void update_pkey_index(struct mlx4_dev *dev, int slave,
+			      struct mlx4_cmd_mailbox *inbox)
+{
+	u8 sched = *(u8 *)(inbox->buf + 64);
+	u8 orig_index = *(u8 *)(inbox->buf + 35);
+	u8 new_index;
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int port;
+
+	port = (sched >> 6 & 1) + 1;
+
+	new_index = priv->virt2phys_pkey[slave][port - 1][orig_index];
+	*(u8 *)(inbox->buf + 35) = new_index;
+}
+
+static void update_gid(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *inbox,
+		       u8 slave)
+{
+	struct mlx4_qp_context	*qp_ctx = inbox->buf + 8;
+	enum mlx4_qp_optpar	optpar = be32_to_cpu(*(__be32 *) inbox->buf);
+	u32			ts = (be32_to_cpu(qp_ctx->flags) >> 16) & 0xff;
+	int port;
+
+	if (MLX4_QP_ST_UD == ts) {
+		port = (qp_ctx->pri_path.sched_queue >> 6 & 1) + 1;
+		if (mlx4_is_eth(dev, port))
+			qp_ctx->pri_path.mgid_index =
+				mlx4_get_base_gid_ix(dev, slave, port) | 0x80;
+		else
+			qp_ctx->pri_path.mgid_index = slave | 0x80;
+
+	} else if (MLX4_QP_ST_RC == ts || MLX4_QP_ST_XRC == ts || MLX4_QP_ST_UC == ts) {
+		if (optpar & MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH) {
+			port = (qp_ctx->pri_path.sched_queue >> 6 & 1) + 1;
+			if (mlx4_is_eth(dev, port)) {
+				qp_ctx->pri_path.mgid_index +=
+					mlx4_get_base_gid_ix(dev, slave, port);
+				qp_ctx->pri_path.mgid_index &= 0x7f;
+			} else {
+				qp_ctx->pri_path.mgid_index = slave & 0x7F;
+			}
+		}
+		if (optpar & MLX4_QP_OPTPAR_ALT_ADDR_PATH) {
+			port = (qp_ctx->alt_path.sched_queue >> 6 & 1) + 1;
+			if (mlx4_is_eth(dev, port)) {
+				qp_ctx->alt_path.mgid_index +=
+					mlx4_get_base_gid_ix(dev, slave, port);
+				qp_ctx->alt_path.mgid_index &= 0x7f;
+			} else {
+				qp_ctx->alt_path.mgid_index = slave & 0x7F;
+			}
+		}
+	}
+}
+
+static int update_vport_qp_param(struct mlx4_dev *dev,
+				 struct mlx4_cmd_mailbox *inbox,
+				 u8 slave, u32 qpn)
+{
+	struct mlx4_qp_context	*qpc = inbox->buf + 8;
+	struct mlx4_vport_oper_state *vp_oper;
+	struct mlx4_priv *priv;
+	u32 qp_type;
+	int port;
+
+	port = (qpc->pri_path.sched_queue & 0x40) ? 2 : 1;
+	priv = mlx4_priv(dev);
+	vp_oper = &priv->mfunc.master.vf_oper[slave].vport[port];
+	qp_type	= (be32_to_cpu(qpc->flags) >> 16) & 0xff;
+
+	if (MLX4_VGT != vp_oper->state.default_vlan) {
+		/* the reserved QPs (special, proxy, tunnel)
+		 * do not operate over vlans
+		 */
+		if (mlx4_is_qp_reserved(dev, qpn))
+			return 0;
+
+		/* force strip vlan by clear vsd, MLX QP refers to Raw Ethernet */
+		if (qp_type == MLX4_QP_ST_UD ||
+		    (qp_type == MLX4_QP_ST_MLX && mlx4_is_eth(dev, port))) {
+			if (dev->caps.bmme_flags & MLX4_BMME_FLAG_VSD_INIT2RTR) {
+				*(__be32 *)inbox->buf =
+					cpu_to_be32(be32_to_cpu(*(__be32 *)inbox->buf) |
+					MLX4_QP_OPTPAR_VLAN_STRIPPING);
+				qpc->param3 &= ~cpu_to_be32(MLX4_STRIP_VLAN);
+			} else {
+				struct mlx4_update_qp_params params = {.flags = 0};
+
+				mlx4_update_qp(dev, qpn, MLX4_UPDATE_QP_VSD, &params);
+			}
+		}
+
+		if (vp_oper->state.link_state == IFLA_VF_LINK_STATE_DISABLE &&
+		    dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_UPDATE_QP) {
+			qpc->pri_path.vlan_control =
+				MLX4_VLAN_CTRL_ETH_TX_BLOCK_TAGGED |
+				MLX4_VLAN_CTRL_ETH_TX_BLOCK_PRIO_TAGGED |
+				MLX4_VLAN_CTRL_ETH_TX_BLOCK_UNTAGGED |
+				MLX4_VLAN_CTRL_ETH_RX_BLOCK_PRIO_TAGGED |
+				MLX4_VLAN_CTRL_ETH_RX_BLOCK_UNTAGGED |
+				MLX4_VLAN_CTRL_ETH_RX_BLOCK_TAGGED;
+		} else if (0 != vp_oper->state.default_vlan) {
+			qpc->pri_path.vlan_control =
+				MLX4_VLAN_CTRL_ETH_TX_BLOCK_TAGGED |
+				MLX4_VLAN_CTRL_ETH_RX_BLOCK_PRIO_TAGGED |
+				MLX4_VLAN_CTRL_ETH_RX_BLOCK_UNTAGGED;
+		} else { /* priority tagged */
+			qpc->pri_path.vlan_control =
+				MLX4_VLAN_CTRL_ETH_TX_BLOCK_TAGGED |
+				MLX4_VLAN_CTRL_ETH_RX_BLOCK_TAGGED;
+		}
+
+		qpc->pri_path.fvl_rx |= MLX4_FVL_RX_FORCE_ETH_VLAN;
+		qpc->pri_path.vlan_index = vp_oper->vlan_idx;
+		qpc->pri_path.fl |= MLX4_FL_CV | MLX4_FL_ETH_HIDE_CQE_VLAN;
+		qpc->pri_path.feup |= MLX4_FEUP_FORCE_ETH_UP | MLX4_FVL_FORCE_ETH_VLAN;
+		qpc->pri_path.sched_queue &= 0xC7;
+		qpc->pri_path.sched_queue |= (vp_oper->state.default_qos) << 3;
+	}
+	if (vp_oper->state.spoofchk) {
+		qpc->pri_path.feup |= MLX4_FSM_FORCE_ETH_SRC_MAC;
+		qpc->pri_path.grh_mylmc = (0x80 & qpc->pri_path.grh_mylmc) + vp_oper->mac_idx;
+	}
+	return 0;
+}
+
+static int mpt_mask(struct mlx4_dev *dev)
+{
+	return dev->caps.num_mpts - 1;
+}
+
+static void *find_res(struct mlx4_dev *dev, u64 res_id,
+		      enum mlx4_resource type)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	return res_tracker_lookup(&priv->mfunc.master.res_tracker.res_tree[type],
+				  res_id);
+}
+
+static int get_res(struct mlx4_dev *dev, int slave, u64 res_id,
+		   enum mlx4_resource type,
+		   void *res)
+{
+	struct res_common *r;
+	int err = 0;
+
+	spin_lock_irq(mlx4_tlock(dev));
+	r = find_res(dev, res_id, type);
+	if (!r) {
+		err = -ENONET;
+		goto exit;
+	}
+
+	if (r->state == RES_ANY_BUSY) {
+		err = -EBUSY;
+		goto exit;
+	}
+
+	if (r->owner != slave) {
+		err = -EPERM;
+		goto exit;
+	}
+
+	r->from_state = r->state;
+	r->state = RES_ANY_BUSY;
+
+	if (res)
+		*((struct res_common **)res) = r;
+
+exit:
+	spin_unlock_irq(mlx4_tlock(dev));
+	return err;
+}
+
+int mlx4_get_slave_from_resource_id(struct mlx4_dev *dev,
+				    enum mlx4_resource type,
+				    u64 res_id, int *slave)
+{
+
+	struct res_common *r;
+	int err = -ENOENT;
+	int id = res_id;
+
+	if (type == RES_QP)
+		id &= 0x7fffff;
+	spin_lock(mlx4_tlock(dev));
+
+	r = find_res(dev, id, type);
+	if (r) {
+		*slave = r->owner;
+		err = 0;
+	}
+	spin_unlock(mlx4_tlock(dev));
+
+	return err;
+}
+
+static void put_res(struct mlx4_dev *dev, int slave, u64 res_id,
+		    enum mlx4_resource type)
+{
+	struct res_common *r;
+
+	spin_lock_irq(mlx4_tlock(dev));
+	r = find_res(dev, res_id, type);
+	if (r)
+		r->state = r->from_state;
+	spin_unlock_irq(mlx4_tlock(dev));
+}
+
+static struct res_common *alloc_qp_tr(int id)
+{
+	struct res_qp *ret;
+
+	ret = kzalloc(sizeof *ret, GFP_KERNEL);
+	if (!ret)
+		return NULL;
+
+	ret->com.res_id = id;
+	ret->com.state = RES_QP_RESERVED;
+	ret->local_qpn = id;
+	INIT_LIST_HEAD(&ret->mcg_list);
+	spin_lock_init(&ret->mcg_spl);
+	atomic_set(&ret->ref_count, 0);
+
+	return &ret->com;
+}
+
+static struct res_common *alloc_mtt_tr(int id, int order)
+{
+	struct res_mtt *ret;
+
+	ret = kzalloc(sizeof *ret, GFP_KERNEL);
+	if (!ret)
+		return NULL;
+
+	ret->com.res_id = id;
+	ret->order = order;
+	ret->com.state = RES_MTT_ALLOCATED;
+	atomic_set(&ret->ref_count, 0);
+
+	return &ret->com;
+}
+
+static struct res_common *alloc_mpt_tr(int id, int key)
+{
+	struct res_mpt *ret;
+
+	ret = kzalloc(sizeof *ret, GFP_KERNEL);
+	if (!ret)
+		return NULL;
+
+	ret->com.res_id = id;
+	ret->com.state = RES_MPT_RESERVED;
+	ret->key = key;
+
+	return &ret->com;
+}
+
+static struct res_common *alloc_eq_tr(int id)
+{
+	struct res_eq *ret;
+
+	ret = kzalloc(sizeof *ret, GFP_KERNEL);
+	if (!ret)
+		return NULL;
+
+	ret->com.res_id = id;
+	ret->com.state = RES_EQ_RESERVED;
+
+	return &ret->com;
+}
+
+static struct res_common *alloc_cq_tr(int id)
+{
+	struct res_cq *ret;
+
+	ret = kzalloc(sizeof *ret, GFP_KERNEL);
+	if (!ret)
+		return NULL;
+
+	ret->com.res_id = id;
+	ret->com.state = RES_CQ_ALLOCATED;
+	atomic_set(&ret->ref_count, 0);
+
+	return &ret->com;
+}
+
+static struct res_common *alloc_srq_tr(int id)
+{
+	struct res_srq *ret;
+
+	ret = kzalloc(sizeof *ret, GFP_KERNEL);
+	if (!ret)
+		return NULL;
+
+	ret->com.res_id = id;
+	ret->com.state = RES_SRQ_ALLOCATED;
+	atomic_set(&ret->ref_count, 0);
+
+	return &ret->com;
+}
+
+static struct res_common *alloc_counter_tr(int id)
+{
+	struct res_counter *ret;
+
+	ret = kzalloc(sizeof *ret, GFP_KERNEL);
+	if (!ret)
+		return NULL;
+
+	ret->com.res_id = id;
+	ret->com.state = RES_COUNTER_ALLOCATED;
+
+	return &ret->com;
+}
+
+static struct res_common *alloc_xrcdn_tr(int id)
+{
+	struct res_xrcdn *ret;
+
+	ret = kzalloc(sizeof *ret, GFP_KERNEL);
+	if (!ret)
+		return NULL;
+
+	ret->com.res_id = id;
+	ret->com.state = RES_XRCD_ALLOCATED;
+
+	return &ret->com;
+}
+
+static struct res_common *alloc_fs_rule_tr(u64 id, int qpn)
+{
+	struct res_fs_rule *ret;
+
+	ret = kzalloc(sizeof *ret, GFP_KERNEL);
+	if (!ret)
+		return NULL;
+
+	ret->com.res_id = id;
+	ret->com.state = RES_FS_RULE_ALLOCATED;
+	ret->qpn = qpn;
+	return &ret->com;
+}
+
+static struct res_common *alloc_tr(u64 id, enum mlx4_resource type, int slave,
+				   int extra)
+{
+	struct res_common *ret;
+
+	switch (type) {
+	case RES_QP:
+		ret = alloc_qp_tr(id);
+		break;
+	case RES_MPT:
+		ret = alloc_mpt_tr(id, extra);
+		break;
+	case RES_MTT:
+		ret = alloc_mtt_tr(id, extra);
+		break;
+	case RES_EQ:
+		ret = alloc_eq_tr(id);
+		break;
+	case RES_CQ:
+		ret = alloc_cq_tr(id);
+		break;
+	case RES_SRQ:
+		ret = alloc_srq_tr(id);
+		break;
+	case RES_MAC:
+		pr_err("implementation missing\n");
+		return NULL;
+	case RES_COUNTER:
+		ret = alloc_counter_tr(id);
+		break;
+	case RES_XRCD:
+		ret = alloc_xrcdn_tr(id);
+		break;
+	case RES_FS_RULE:
+		ret = alloc_fs_rule_tr(id, extra);
+		break;
+	default:
+		return NULL;
+	}
+	if (ret)
+		ret->owner = slave;
+
+	return ret;
+}
+
+static int add_res_range(struct mlx4_dev *dev, int slave, u64 base, int count,
+			 enum mlx4_resource type, int extra)
+{
+	int i;
+	int err;
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct res_common **res_arr;
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct rb_root *root = &tracker->res_tree[type];
+
+	res_arr = kzalloc(count * sizeof *res_arr, GFP_KERNEL);
+	if (!res_arr)
+		return -ENOMEM;
+
+	for (i = 0; i < count; ++i) {
+		res_arr[i] = alloc_tr(base + i, type, slave, extra);
+		if (!res_arr[i]) {
+			for (--i; i >= 0; --i)
+				kfree(res_arr[i]);
+
+			kfree(res_arr);
+			return -ENOMEM;
+		}
+	}
+
+	spin_lock_irq(mlx4_tlock(dev));
+	for (i = 0; i < count; ++i) {
+		if (find_res(dev, base + i, type)) {
+			err = -EEXIST;
+			goto undo;
+		}
+		err = res_tracker_insert(root, res_arr[i]);
+		if (err)
+			goto undo;
+		list_add_tail(&res_arr[i]->list,
+			      &tracker->slave_list[slave].res_list[type]);
+	}
+	spin_unlock_irq(mlx4_tlock(dev));
+	kfree(res_arr);
+
+	return 0;
+
+undo:
+	for (--i; i >= base; --i)
+		rb_erase(&res_arr[i]->node, root);
+
+	spin_unlock_irq(mlx4_tlock(dev));
+
+	for (i = 0; i < count; ++i)
+		kfree(res_arr[i]);
+
+	kfree(res_arr);
+
+	return err;
+}
+
+static int remove_qp_ok(struct res_qp *res)
+{
+	if (res->com.state == RES_QP_BUSY || atomic_read(&res->ref_count) ||
+	    !list_empty(&res->mcg_list)) {
+		pr_err("resource tracker: fail to remove qp, state %d, ref_count %d\n",
+		       res->com.state, atomic_read(&res->ref_count));
+		return -EBUSY;
+	} else if (res->com.state != RES_QP_RESERVED) {
+		return -EPERM;
+	}
+
+	return 0;
+}
+
+static int remove_mtt_ok(struct res_mtt *res, int order)
+{
+	if (res->com.state == RES_MTT_BUSY ||
+	    atomic_read(&res->ref_count)) {
+		pr_devel("%s-%d: state %s, ref_count %d\n",
+			 __func__, __LINE__,
+			 mtt_states_str(res->com.state),
+			 atomic_read(&res->ref_count));
+		return -EBUSY;
+	} else if (res->com.state != RES_MTT_ALLOCATED)
+		return -EPERM;
+	else if (res->order != order)
+		return -EINVAL;
+
+	return 0;
+}
+
+static int remove_mpt_ok(struct res_mpt *res)
+{
+	if (res->com.state == RES_MPT_BUSY)
+		return -EBUSY;
+	else if (res->com.state != RES_MPT_RESERVED)
+		return -EPERM;
+
+	return 0;
+}
+
+static int remove_eq_ok(struct res_eq *res)
+{
+	if (res->com.state == RES_MPT_BUSY)
+		return -EBUSY;
+	else if (res->com.state != RES_MPT_RESERVED)
+		return -EPERM;
+
+	return 0;
+}
+
+static int remove_counter_ok(struct res_counter *res)
+{
+	if (res->com.state == RES_COUNTER_BUSY)
+		return -EBUSY;
+	else if (res->com.state != RES_COUNTER_ALLOCATED)
+		return -EPERM;
+
+	return 0;
+}
+
+static int remove_xrcdn_ok(struct res_xrcdn *res)
+{
+	if (res->com.state == RES_XRCD_BUSY)
+		return -EBUSY;
+	else if (res->com.state != RES_XRCD_ALLOCATED)
+		return -EPERM;
+
+	return 0;
+}
+
+static int remove_fs_rule_ok(struct res_fs_rule *res)
+{
+	if (res->com.state == RES_FS_RULE_BUSY)
+		return -EBUSY;
+	else if (res->com.state != RES_FS_RULE_ALLOCATED)
+		return -EPERM;
+
+	return 0;
+}
+
+static int remove_cq_ok(struct res_cq *res)
+{
+	if (res->com.state == RES_CQ_BUSY)
+		return -EBUSY;
+	else if (res->com.state != RES_CQ_ALLOCATED)
+		return -EPERM;
+
+	return 0;
+}
+
+static int remove_srq_ok(struct res_srq *res)
+{
+	if (res->com.state == RES_SRQ_BUSY)
+		return -EBUSY;
+	else if (res->com.state != RES_SRQ_ALLOCATED)
+		return -EPERM;
+
+	return 0;
+}
+
+static int remove_ok(struct res_common *res, enum mlx4_resource type, int extra)
+{
+	switch (type) {
+	case RES_QP:
+		return remove_qp_ok((struct res_qp *)res);
+	case RES_CQ:
+		return remove_cq_ok((struct res_cq *)res);
+	case RES_SRQ:
+		return remove_srq_ok((struct res_srq *)res);
+	case RES_MPT:
+		return remove_mpt_ok((struct res_mpt *)res);
+	case RES_MTT:
+		return remove_mtt_ok((struct res_mtt *)res, extra);
+	case RES_MAC:
+		return -ENOSYS;
+	case RES_EQ:
+		return remove_eq_ok((struct res_eq *)res);
+	case RES_COUNTER:
+		return remove_counter_ok((struct res_counter *)res);
+	case RES_XRCD:
+		return remove_xrcdn_ok((struct res_xrcdn *)res);
+	case RES_FS_RULE:
+		return remove_fs_rule_ok((struct res_fs_rule *)res);
+	default:
+		return -EINVAL;
+	}
+}
+
+static int rem_res_range(struct mlx4_dev *dev, int slave, u64 base, int count,
+			 enum mlx4_resource type, int extra)
+{
+	u64 i;
+	int err;
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct res_common *r;
+
+	spin_lock_irq(mlx4_tlock(dev));
+	for (i = base; i < base + count; ++i) {
+		r = res_tracker_lookup(&tracker->res_tree[type], i);
+		if (!r) {
+			err = -ENOENT;
+			goto out;
+		}
+		if (r->owner != slave) {
+			err = -EPERM;
+			goto out;
+		}
+		err = remove_ok(r, type, extra);
+		if (err)
+			goto out;
+	}
+
+	for (i = base; i < base + count; ++i) {
+		r = res_tracker_lookup(&tracker->res_tree[type], i);
+		rb_erase(&r->node, &tracker->res_tree[type]);
+		list_del(&r->list);
+		kfree(r);
+	}
+	err = 0;
+
+out:
+	spin_unlock_irq(mlx4_tlock(dev));
+
+	return err;
+}
+
+static int qp_res_start_move_to(struct mlx4_dev *dev, int slave, int qpn,
+				enum res_qp_states state, struct res_qp **qp,
+				int alloc)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct res_qp *r;
+	int err = 0;
+
+	spin_lock_irq(mlx4_tlock(dev));
+	r = res_tracker_lookup(&tracker->res_tree[RES_QP], qpn);
+	if (!r)
+		err = -ENOENT;
+	else if (r->com.owner != slave)
+		err = -EPERM;
+	else {
+		switch (state) {
+		case RES_QP_BUSY:
+			mlx4_dbg(dev, "%s: failed RES_QP, 0x%llx\n",
+				 __func__, r->com.res_id);
+			err = -EBUSY;
+			break;
+
+		case RES_QP_RESERVED:
+			if (r->com.state == RES_QP_MAPPED && !alloc)
+				break;
+
+			mlx4_dbg(dev, "failed RES_QP, 0x%llx\n", r->com.res_id);
+			err = -EINVAL;
+			break;
+
+		case RES_QP_MAPPED:
+			if ((r->com.state == RES_QP_RESERVED && alloc) ||
+			    r->com.state == RES_QP_HW)
+				break;
+			else {
+				mlx4_dbg(dev, "failed RES_QP, 0x%llx\n",
+					  r->com.res_id);
+				err = -EINVAL;
+			}
+
+			break;
+
+		case RES_QP_HW:
+			if (r->com.state != RES_QP_MAPPED)
+				err = -EINVAL;
+			break;
+		default:
+			err = -EINVAL;
+		}
+
+		if (!err) {
+			r->com.from_state = r->com.state;
+			r->com.to_state = state;
+			r->com.state = RES_QP_BUSY;
+			if (qp)
+				*qp = r;
+		}
+	}
+
+	spin_unlock_irq(mlx4_tlock(dev));
+
+	return err;
+}
+
+static int mr_res_start_move_to(struct mlx4_dev *dev, int slave, int index,
+				enum res_mpt_states state, struct res_mpt **mpt)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct res_mpt *r;
+	int err = 0;
+
+	spin_lock_irq(mlx4_tlock(dev));
+	r = res_tracker_lookup(&tracker->res_tree[RES_MPT], index);
+	if (!r)
+		err = -ENOENT;
+	else if (r->com.owner != slave)
+		err = -EPERM;
+	else {
+		switch (state) {
+		case RES_MPT_BUSY:
+			err = -EINVAL;
+			break;
+
+		case RES_MPT_RESERVED:
+			if (r->com.state != RES_MPT_MAPPED)
+				err = -EINVAL;
+			break;
+
+		case RES_MPT_MAPPED:
+			if (r->com.state != RES_MPT_RESERVED &&
+			    r->com.state != RES_MPT_HW)
+				err = -EINVAL;
+			break;
+
+		case RES_MPT_HW:
+			if (r->com.state != RES_MPT_MAPPED)
+				err = -EINVAL;
+			break;
+		default:
+			err = -EINVAL;
+		}
+
+		if (!err) {
+			r->com.from_state = r->com.state;
+			r->com.to_state = state;
+			r->com.state = RES_MPT_BUSY;
+			if (mpt)
+				*mpt = r;
+		}
+	}
+
+	spin_unlock_irq(mlx4_tlock(dev));
+
+	return err;
+}
+
+static int eq_res_start_move_to(struct mlx4_dev *dev, int slave, int index,
+				enum res_eq_states state, struct res_eq **eq)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct res_eq *r;
+	int err = 0;
+
+	spin_lock_irq(mlx4_tlock(dev));
+	r = res_tracker_lookup(&tracker->res_tree[RES_EQ], index);
+	if (!r)
+		err = -ENOENT;
+	else if (r->com.owner != slave)
+		err = -EPERM;
+	else {
+		switch (state) {
+		case RES_EQ_BUSY:
+			err = -EINVAL;
+			break;
+
+		case RES_EQ_RESERVED:
+			if (r->com.state != RES_EQ_HW)
+				err = -EINVAL;
+			break;
+
+		case RES_EQ_HW:
+			if (r->com.state != RES_EQ_RESERVED)
+				err = -EINVAL;
+			break;
+
+		default:
+			err = -EINVAL;
+		}
+
+		if (!err) {
+			r->com.from_state = r->com.state;
+			r->com.to_state = state;
+			r->com.state = RES_EQ_BUSY;
+			if (eq)
+				*eq = r;
+		}
+	}
+
+	spin_unlock_irq(mlx4_tlock(dev));
+
+	return err;
+}
+
+static int cq_res_start_move_to(struct mlx4_dev *dev, int slave, int cqn,
+				enum res_cq_states state, struct res_cq **cq)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct res_cq *r;
+	int err;
+
+	spin_lock_irq(mlx4_tlock(dev));
+	r = res_tracker_lookup(&tracker->res_tree[RES_CQ], cqn);
+	if (!r) {
+		err = -ENOENT;
+	} else if (r->com.owner != slave) {
+		err = -EPERM;
+	} else if (state == RES_CQ_ALLOCATED) {
+		if (r->com.state != RES_CQ_HW)
+			err = -EINVAL;
+		else if (atomic_read(&r->ref_count))
+			err = -EBUSY;
+		else
+			err = 0;
+	} else if (state != RES_CQ_HW || r->com.state != RES_CQ_ALLOCATED) {
+		err = -EINVAL;
+	} else {
+		err = 0;
+	}
+
+	if (!err) {
+		r->com.from_state = r->com.state;
+		r->com.to_state = state;
+		r->com.state = RES_CQ_BUSY;
+		if (cq)
+			*cq = r;
+	}
+
+	spin_unlock_irq(mlx4_tlock(dev));
+
+	return err;
+}
+
+static int srq_res_start_move_to(struct mlx4_dev *dev, int slave, int index,
+				 enum res_srq_states state, struct res_srq **srq)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct res_srq *r;
+	int err = 0;
+
+	spin_lock_irq(mlx4_tlock(dev));
+	r = res_tracker_lookup(&tracker->res_tree[RES_SRQ], index);
+	if (!r) {
+		err = -ENOENT;
+	} else if (r->com.owner != slave) {
+		err = -EPERM;
+	} else if (state == RES_SRQ_ALLOCATED) {
+		if (r->com.state != RES_SRQ_HW)
+			err = -EINVAL;
+		else if (atomic_read(&r->ref_count))
+			err = -EBUSY;
+	} else if (state != RES_SRQ_HW || r->com.state != RES_SRQ_ALLOCATED) {
+		err = -EINVAL;
+	}
+
+	if (!err) {
+		r->com.from_state = r->com.state;
+		r->com.to_state = state;
+		r->com.state = RES_SRQ_BUSY;
+		if (srq)
+			*srq = r;
+	}
+
+	spin_unlock_irq(mlx4_tlock(dev));
+
+	return err;
+}
+
+static void res_abort_move(struct mlx4_dev *dev, int slave,
+			   enum mlx4_resource type, int id)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct res_common *r;
+
+	spin_lock_irq(mlx4_tlock(dev));
+	r = res_tracker_lookup(&tracker->res_tree[type], id);
+	if (r && (r->owner == slave))
+		r->state = r->from_state;
+	spin_unlock_irq(mlx4_tlock(dev));
+}
+
+static void res_end_move(struct mlx4_dev *dev, int slave,
+			 enum mlx4_resource type, int id)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct res_common *r;
+
+	spin_lock_irq(mlx4_tlock(dev));
+	r = res_tracker_lookup(&tracker->res_tree[type], id);
+	if (r && (r->owner == slave))
+		r->state = r->to_state;
+	spin_unlock_irq(mlx4_tlock(dev));
+}
+
+static int valid_reserved(struct mlx4_dev *dev, int slave, int qpn)
+{
+	return mlx4_is_qp_reserved(dev, qpn) &&
+		(mlx4_is_master(dev) || mlx4_is_guest_proxy(dev, slave, qpn));
+}
+
+static int fw_reserved(struct mlx4_dev *dev, int qpn)
+{
+	return qpn < dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW];
+}
+
+static int qp_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd,
+			u64 in_param, u64 *out_param)
+{
+	int err;
+	int count;
+	int align;
+	int base;
+	int qpn;
+
+	switch (op) {
+	case RES_OP_RESERVE:
+		count = get_param_l(&in_param) & 0xffffff;
+		align = get_param_h(&in_param);
+		err = mlx4_grant_resource(dev, slave, RES_QP, count, 0);
+		if (err)
+			return err;
+
+		err = __mlx4_qp_reserve_range(dev, count, align, &base);
+		if (err) {
+			mlx4_release_resource(dev, slave, RES_QP, count, 0);
+			return err;
+		}
+
+		err = add_res_range(dev, slave, base, count, RES_QP, 0);
+		if (err) {
+			mlx4_release_resource(dev, slave, RES_QP, count, 0);
+			__mlx4_qp_release_range(dev, base, count);
+			return err;
+		}
+		set_param_l(out_param, base);
+		break;
+	case RES_OP_MAP_ICM:
+		qpn = get_param_l(&in_param) & 0x7fffff;
+		if (valid_reserved(dev, slave, qpn)) {
+			err = add_res_range(dev, slave, qpn, 1, RES_QP, 0);
+			if (err)
+				return err;
+		}
+
+		err = qp_res_start_move_to(dev, slave, qpn, RES_QP_MAPPED,
+					   NULL, 1);
+		if (err)
+			return err;
+
+		if (!fw_reserved(dev, qpn)) {
+			err = __mlx4_qp_alloc_icm(dev, qpn, GFP_KERNEL);
+			if (err) {
+				res_abort_move(dev, slave, RES_QP, qpn);
+				return err;
+			}
+		}
+
+		res_end_move(dev, slave, RES_QP, qpn);
+		break;
+
+	default:
+		err = -EINVAL;
+		break;
+	}
+	return err;
+}
+
+static int mtt_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd,
+			 u64 in_param, u64 *out_param)
+{
+	int err = -EINVAL;
+	int base;
+	int order;
+
+	if (op != RES_OP_RESERVE_AND_MAP)
+		return err;
+
+	order = get_param_l(&in_param);
+
+	err = mlx4_grant_resource(dev, slave, RES_MTT, 1 << order, 0);
+	if (err)
+		return err;
+
+	base = __mlx4_alloc_mtt_range(dev, order);
+	if (base == -1) {
+		mlx4_release_resource(dev, slave, RES_MTT, 1 << order, 0);
+		return -ENOMEM;
+	}
+
+	err = add_res_range(dev, slave, base, 1, RES_MTT, order);
+	if (err) {
+		mlx4_release_resource(dev, slave, RES_MTT, 1 << order, 0);
+		__mlx4_free_mtt_range(dev, base, order);
+	} else {
+		set_param_l(out_param, base);
+	}
+
+	return err;
+}
+
+static int mpt_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd,
+			 u64 in_param, u64 *out_param)
+{
+	int err = -EINVAL;
+	int index;
+	int id;
+	struct res_mpt *mpt;
+
+	switch (op) {
+	case RES_OP_RESERVE:
+		err = mlx4_grant_resource(dev, slave, RES_MPT, 1, 0);
+		if (err)
+			break;
+
+		index = __mlx4_mpt_reserve(dev);
+		if (index == -1) {
+			mlx4_release_resource(dev, slave, RES_MPT, 1, 0);
+			break;
+		}
+		id = index & mpt_mask(dev);
+
+		err = add_res_range(dev, slave, id, 1, RES_MPT, index);
+		if (err) {
+			mlx4_release_resource(dev, slave, RES_MPT, 1, 0);
+			__mlx4_mpt_release(dev, index);
+			break;
+		}
+		set_param_l(out_param, index);
+		break;
+	case RES_OP_MAP_ICM:
+		index = get_param_l(&in_param);
+		id = index & mpt_mask(dev);
+		err = mr_res_start_move_to(dev, slave, id,
+					   RES_MPT_MAPPED, &mpt);
+		if (err)
+			return err;
+
+		err = __mlx4_mpt_alloc_icm(dev, mpt->key, GFP_KERNEL);
+		if (err) {
+			res_abort_move(dev, slave, RES_MPT, id);
+			return err;
+		}
+
+		res_end_move(dev, slave, RES_MPT, id);
+		break;
+	}
+	return err;
+}
+
+static int cq_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd,
+			u64 in_param, u64 *out_param)
+{
+	int cqn;
+	int err;
+
+	switch (op) {
+	case RES_OP_RESERVE_AND_MAP:
+		err = mlx4_grant_resource(dev, slave, RES_CQ, 1, 0);
+		if (err)
+			break;
+
+		err = __mlx4_cq_alloc_icm(dev, &cqn);
+		if (err) {
+			mlx4_release_resource(dev, slave, RES_CQ, 1, 0);
+			break;
+		}
+
+		err = add_res_range(dev, slave, cqn, 1, RES_CQ, 0);
+		if (err) {
+			mlx4_release_resource(dev, slave, RES_CQ, 1, 0);
+			__mlx4_cq_free_icm(dev, cqn);
+			break;
+		}
+
+		set_param_l(out_param, cqn);
+		break;
+
+	default:
+		err = -EINVAL;
+	}
+
+	return err;
+}
+
+static int srq_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd,
+			 u64 in_param, u64 *out_param)
+{
+	int srqn;
+	int err;
+
+	switch (op) {
+	case RES_OP_RESERVE_AND_MAP:
+		err = mlx4_grant_resource(dev, slave, RES_SRQ, 1, 0);
+		if (err)
+			break;
+
+		err = __mlx4_srq_alloc_icm(dev, &srqn);
+		if (err) {
+			mlx4_release_resource(dev, slave, RES_SRQ, 1, 0);
+			break;
+		}
+
+		err = add_res_range(dev, slave, srqn, 1, RES_SRQ, 0);
+		if (err) {
+			mlx4_release_resource(dev, slave, RES_SRQ, 1, 0);
+			__mlx4_srq_free_icm(dev, srqn);
+			break;
+		}
+
+		set_param_l(out_param, srqn);
+		break;
+
+	default:
+		err = -EINVAL;
+	}
+
+	return err;
+}
+
+static int mac_find_smac_ix_in_slave(struct mlx4_dev *dev, int slave, int port,
+				     u8 smac_index, u64 *mac)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct list_head *mac_list =
+		&tracker->slave_list[slave].res_list[RES_MAC];
+	struct mac_res *res, *tmp;
+
+	list_for_each_entry_safe(res, tmp, mac_list, list) {
+		if (res->smac_index == smac_index && res->port == (u8) port) {
+			*mac = res->mac;
+			return 0;
+		}
+	}
+	return -ENOENT;
+}
+
+static int mac_add_to_slave(struct mlx4_dev *dev, int slave, u64 mac, int port, u8 smac_index)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct list_head *mac_list =
+		&tracker->slave_list[slave].res_list[RES_MAC];
+	struct mac_res *res, *tmp;
+
+	list_for_each_entry_safe(res, tmp, mac_list, list) {
+		if (res->mac == mac && res->port == (u8) port) {
+			/* mac found. update ref count */
+			++res->ref_count;
+			return 0;
+		}
+	}
+
+	if (mlx4_grant_resource(dev, slave, RES_MAC, 1, port))
+		return -EINVAL;
+	res = kzalloc(sizeof *res, GFP_KERNEL);
+	if (!res) {
+		mlx4_release_resource(dev, slave, RES_MAC, 1, port);
+		return -ENOMEM;
+	}
+	res->mac = mac;
+	res->port = (u8) port;
+	res->smac_index = smac_index;
+	res->ref_count = 1;
+	list_add_tail(&res->list,
+		      &tracker->slave_list[slave].res_list[RES_MAC]);
+	return 0;
+}
+
+static void mac_del_from_slave(struct mlx4_dev *dev, int slave, u64 mac,
+			       int port)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct list_head *mac_list =
+		&tracker->slave_list[slave].res_list[RES_MAC];
+	struct mac_res *res, *tmp;
+
+	list_for_each_entry_safe(res, tmp, mac_list, list) {
+		if (res->mac == mac && res->port == (u8) port) {
+			if (!--res->ref_count) {
+				list_del(&res->list);
+				mlx4_release_resource(dev, slave, RES_MAC, 1, port);
+				kfree(res);
+			}
+			break;
+		}
+	}
+}
+
+static void rem_slave_macs(struct mlx4_dev *dev, int slave)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct list_head *mac_list =
+		&tracker->slave_list[slave].res_list[RES_MAC];
+	struct mac_res *res, *tmp;
+	int i;
+
+	list_for_each_entry_safe(res, tmp, mac_list, list) {
+		list_del(&res->list);
+		/* dereference the mac the num times the slave referenced it */
+		for (i = 0; i < res->ref_count; i++)
+			__mlx4_unregister_mac(dev, res->port, res->mac);
+		mlx4_release_resource(dev, slave, RES_MAC, 1, res->port);
+		kfree(res);
+	}
+}
+
+static int mac_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd,
+			 u64 in_param, u64 *out_param, int in_port)
+{
+	int err = -EINVAL;
+	int port;
+	u64 mac;
+	u8 smac_index;
+
+	if (op != RES_OP_RESERVE_AND_MAP)
+		return err;
+
+	port = !in_port ? get_param_l(out_param) : in_port;
+	port = mlx4_slave_convert_port(
+			dev, slave, port);
+
+	if (port < 0)
+		return -EINVAL;
+	mac = in_param;
+
+	err = __mlx4_register_mac(dev, port, mac);
+	if (err >= 0) {
+		smac_index = err;
+		set_param_l(out_param, err);
+		err = 0;
+	}
+
+	if (!err) {
+		err = mac_add_to_slave(dev, slave, mac, port, smac_index);
+		if (err)
+			__mlx4_unregister_mac(dev, port, mac);
+	}
+	return err;
+}
+
+static int vlan_add_to_slave(struct mlx4_dev *dev, int slave, u16 vlan,
+			     int port, int vlan_index)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct list_head *vlan_list =
+		&tracker->slave_list[slave].res_list[RES_VLAN];
+	struct vlan_res *res, *tmp;
+
+	list_for_each_entry_safe(res, tmp, vlan_list, list) {
+		if (res->vlan == vlan && res->port == (u8) port) {
+			/* vlan found. update ref count */
+			++res->ref_count;
+			return 0;
+		}
+	}
+
+	if (mlx4_grant_resource(dev, slave, RES_VLAN, 1, port))
+		return -EINVAL;
+	res = kzalloc(sizeof(*res), GFP_KERNEL);
+	if (!res) {
+		mlx4_release_resource(dev, slave, RES_VLAN, 1, port);
+		return -ENOMEM;
+	}
+	res->vlan = vlan;
+	res->port = (u8) port;
+	res->vlan_index = vlan_index;
+	res->ref_count = 1;
+	list_add_tail(&res->list,
+		      &tracker->slave_list[slave].res_list[RES_VLAN]);
+	return 0;
+}
+
+
+static void vlan_del_from_slave(struct mlx4_dev *dev, int slave, u16 vlan,
+				int port)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct list_head *vlan_list =
+		&tracker->slave_list[slave].res_list[RES_VLAN];
+	struct vlan_res *res, *tmp;
+
+	list_for_each_entry_safe(res, tmp, vlan_list, list) {
+		if (res->vlan == vlan && res->port == (u8) port) {
+			if (!--res->ref_count) {
+				list_del(&res->list);
+				mlx4_release_resource(dev, slave, RES_VLAN,
+						      1, port);
+				kfree(res);
+			}
+			break;
+		}
+	}
+}
+
+static void rem_slave_vlans(struct mlx4_dev *dev, int slave)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct list_head *vlan_list =
+		&tracker->slave_list[slave].res_list[RES_VLAN];
+	struct vlan_res *res, *tmp;
+	int i;
+
+	list_for_each_entry_safe(res, tmp, vlan_list, list) {
+		list_del(&res->list);
+		/* dereference the vlan the num times the slave referenced it */
+		for (i = 0; i < res->ref_count; i++)
+			__mlx4_unregister_vlan(dev, res->port, res->vlan);
+		mlx4_release_resource(dev, slave, RES_VLAN, 1, res->port);
+		kfree(res);
+	}
+}
+
+static int vlan_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd,
+			  u64 in_param, u64 *out_param, int in_port)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_slave_state *slave_state = priv->mfunc.master.slave_state;
+	int err;
+	u16 vlan;
+	int vlan_index;
+	int port;
+
+	port = !in_port ? get_param_l(out_param) : in_port;
+
+	if (!port || op != RES_OP_RESERVE_AND_MAP)
+		return -EINVAL;
+
+	port = mlx4_slave_convert_port(
+			dev, slave, port);
+
+	if (port < 0)
+		return -EINVAL;
+	/* upstream kernels had NOP for reg/unreg vlan. Continue this. */
+	if (!in_port && port > 0 && port <= dev->caps.num_ports) {
+		slave_state[slave].old_vlan_api = true;
+		return 0;
+	}
+
+	vlan = (u16) in_param;
+
+	err = __mlx4_register_vlan(dev, port, vlan, &vlan_index);
+	if (!err) {
+		set_param_l(out_param, (u32) vlan_index);
+		err = vlan_add_to_slave(dev, slave, vlan, port, vlan_index);
+		if (err)
+			__mlx4_unregister_vlan(dev, port, vlan);
+	}
+	return err;
+}
+
+static int counter_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd,
+			     u64 in_param, u64 *out_param)
+{
+	u32 index;
+	int err;
+
+	if (op != RES_OP_RESERVE)
+		return -EINVAL;
+
+	err = mlx4_grant_resource(dev, slave, RES_COUNTER, 1, 0);
+	if (err)
+		return err;
+
+	err = __mlx4_counter_alloc(dev, &index);
+	if (err) {
+		mlx4_release_resource(dev, slave, RES_COUNTER, 1, 0);
+		return err;
+	}
+
+	err = add_res_range(dev, slave, index, 1, RES_COUNTER, 0);
+	if (err) {
+		__mlx4_counter_free(dev, index);
+		mlx4_release_resource(dev, slave, RES_COUNTER, 1, 0);
+	} else {
+		set_param_l(out_param, index);
+	}
+
+	return err;
+}
+
+static int xrcdn_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd,
+			   u64 in_param, u64 *out_param)
+{
+	u32 xrcdn;
+	int err;
+
+	if (op != RES_OP_RESERVE)
+		return -EINVAL;
+
+	err = __mlx4_xrcd_alloc(dev, &xrcdn);
+	if (err)
+		return err;
+
+	err = add_res_range(dev, slave, xrcdn, 1, RES_XRCD, 0);
+	if (err)
+		__mlx4_xrcd_free(dev, xrcdn);
+	else
+		set_param_l(out_param, xrcdn);
+
+	return err;
+}
+
+int mlx4_ALLOC_RES_wrapper(struct mlx4_dev *dev, int slave,
+			   struct mlx4_vhcr *vhcr,
+			   struct mlx4_cmd_mailbox *inbox,
+			   struct mlx4_cmd_mailbox *outbox,
+			   struct mlx4_cmd_info *cmd)
+{
+	int err;
+	int alop = vhcr->op_modifier;
+
+	switch (vhcr->in_modifier & 0xFF) {
+	case RES_QP:
+		err = qp_alloc_res(dev, slave, vhcr->op_modifier, alop,
+				   vhcr->in_param, &vhcr->out_param);
+		break;
+
+	case RES_MTT:
+		err = mtt_alloc_res(dev, slave, vhcr->op_modifier, alop,
+				    vhcr->in_param, &vhcr->out_param);
+		break;
+
+	case RES_MPT:
+		err = mpt_alloc_res(dev, slave, vhcr->op_modifier, alop,
+				    vhcr->in_param, &vhcr->out_param);
+		break;
+
+	case RES_CQ:
+		err = cq_alloc_res(dev, slave, vhcr->op_modifier, alop,
+				   vhcr->in_param, &vhcr->out_param);
+		break;
+
+	case RES_SRQ:
+		err = srq_alloc_res(dev, slave, vhcr->op_modifier, alop,
+				    vhcr->in_param, &vhcr->out_param);
+		break;
+
+	case RES_MAC:
+		err = mac_alloc_res(dev, slave, vhcr->op_modifier, alop,
+				    vhcr->in_param, &vhcr->out_param,
+				    (vhcr->in_modifier >> 8) & 0xFF);
+		break;
+
+	case RES_VLAN:
+		err = vlan_alloc_res(dev, slave, vhcr->op_modifier, alop,
+				     vhcr->in_param, &vhcr->out_param,
+				     (vhcr->in_modifier >> 8) & 0xFF);
+		break;
+
+	case RES_COUNTER:
+		err = counter_alloc_res(dev, slave, vhcr->op_modifier, alop,
+					vhcr->in_param, &vhcr->out_param);
+		break;
+
+	case RES_XRCD:
+		err = xrcdn_alloc_res(dev, slave, vhcr->op_modifier, alop,
+				      vhcr->in_param, &vhcr->out_param);
+		break;
+
+	default:
+		err = -EINVAL;
+		break;
+	}
+
+	return err;
+}
+
+static int qp_free_res(struct mlx4_dev *dev, int slave, int op, int cmd,
+		       u64 in_param)
+{
+	int err;
+	int count;
+	int base;
+	int qpn;
+
+	switch (op) {
+	case RES_OP_RESERVE:
+		base = get_param_l(&in_param) & 0x7fffff;
+		count = get_param_h(&in_param);
+		err = rem_res_range(dev, slave, base, count, RES_QP, 0);
+		if (err)
+			break;
+		mlx4_release_resource(dev, slave, RES_QP, count, 0);
+		__mlx4_qp_release_range(dev, base, count);
+		break;
+	case RES_OP_MAP_ICM:
+		qpn = get_param_l(&in_param) & 0x7fffff;
+		err = qp_res_start_move_to(dev, slave, qpn, RES_QP_RESERVED,
+					   NULL, 0);
+		if (err)
+			return err;
+
+		if (!fw_reserved(dev, qpn))
+			__mlx4_qp_free_icm(dev, qpn);
+
+		res_end_move(dev, slave, RES_QP, qpn);
+
+		if (valid_reserved(dev, slave, qpn))
+			err = rem_res_range(dev, slave, qpn, 1, RES_QP, 0);
+		break;
+	default:
+		err = -EINVAL;
+		break;
+	}
+	return err;
+}
+
+static int mtt_free_res(struct mlx4_dev *dev, int slave, int op, int cmd,
+			u64 in_param, u64 *out_param)
+{
+	int err = -EINVAL;
+	int base;
+	int order;
+
+	if (op != RES_OP_RESERVE_AND_MAP)
+		return err;
+
+	base = get_param_l(&in_param);
+	order = get_param_h(&in_param);
+	err = rem_res_range(dev, slave, base, 1, RES_MTT, order);
+	if (!err) {
+		mlx4_release_resource(dev, slave, RES_MTT, 1 << order, 0);
+		__mlx4_free_mtt_range(dev, base, order);
+	}
+	return err;
+}
+
+static int mpt_free_res(struct mlx4_dev *dev, int slave, int op, int cmd,
+			u64 in_param)
+{
+	int err = -EINVAL;
+	int index;
+	int id;
+	struct res_mpt *mpt;
+
+	switch (op) {
+	case RES_OP_RESERVE:
+		index = get_param_l(&in_param);
+		id = index & mpt_mask(dev);
+		err = get_res(dev, slave, id, RES_MPT, &mpt);
+		if (err)
+			break;
+		index = mpt->key;
+		put_res(dev, slave, id, RES_MPT);
+
+		err = rem_res_range(dev, slave, id, 1, RES_MPT, 0);
+		if (err)
+			break;
+		mlx4_release_resource(dev, slave, RES_MPT, 1, 0);
+		__mlx4_mpt_release(dev, index);
+		break;
+	case RES_OP_MAP_ICM:
+			index = get_param_l(&in_param);
+			id = index & mpt_mask(dev);
+			err = mr_res_start_move_to(dev, slave, id,
+						   RES_MPT_RESERVED, &mpt);
+			if (err)
+				return err;
+
+			__mlx4_mpt_free_icm(dev, mpt->key);
+			res_end_move(dev, slave, RES_MPT, id);
+			return err;
+		break;
+	default:
+		err = -EINVAL;
+		break;
+	}
+	return err;
+}
+
+static int cq_free_res(struct mlx4_dev *dev, int slave, int op, int cmd,
+		       u64 in_param, u64 *out_param)
+{
+	int cqn;
+	int err;
+
+	switch (op) {
+	case RES_OP_RESERVE_AND_MAP:
+		cqn = get_param_l(&in_param);
+		err = rem_res_range(dev, slave, cqn, 1, RES_CQ, 0);
+		if (err)
+			break;
+
+		mlx4_release_resource(dev, slave, RES_CQ, 1, 0);
+		__mlx4_cq_free_icm(dev, cqn);
+		break;
+
+	default:
+		err = -EINVAL;
+		break;
+	}
+
+	return err;
+}
+
+static int srq_free_res(struct mlx4_dev *dev, int slave, int op, int cmd,
+			u64 in_param, u64 *out_param)
+{
+	int srqn;
+	int err;
+
+	switch (op) {
+	case RES_OP_RESERVE_AND_MAP:
+		srqn = get_param_l(&in_param);
+		err = rem_res_range(dev, slave, srqn, 1, RES_SRQ, 0);
+		if (err)
+			break;
+
+		mlx4_release_resource(dev, slave, RES_SRQ, 1, 0);
+		__mlx4_srq_free_icm(dev, srqn);
+		break;
+
+	default:
+		err = -EINVAL;
+		break;
+	}
+
+	return err;
+}
+
+static int mac_free_res(struct mlx4_dev *dev, int slave, int op, int cmd,
+			    u64 in_param, u64 *out_param, int in_port)
+{
+	int port;
+	int err = 0;
+
+	switch (op) {
+	case RES_OP_RESERVE_AND_MAP:
+		port = !in_port ? get_param_l(out_param) : in_port;
+		port = mlx4_slave_convert_port(
+				dev, slave, port);
+
+		if (port < 0)
+			return -EINVAL;
+		mac_del_from_slave(dev, slave, in_param, port);
+		__mlx4_unregister_mac(dev, port, in_param);
+		break;
+	default:
+		err = -EINVAL;
+		break;
+	}
+
+	return err;
+
+}
+
+static int vlan_free_res(struct mlx4_dev *dev, int slave, int op, int cmd,
+			    u64 in_param, u64 *out_param, int port)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_slave_state *slave_state = priv->mfunc.master.slave_state;
+	int err = 0;
+
+	port = mlx4_slave_convert_port(
+			dev, slave, port);
+
+	if (port < 0)
+		return -EINVAL;
+	switch (op) {
+	case RES_OP_RESERVE_AND_MAP:
+		if (slave_state[slave].old_vlan_api)
+			return 0;
+		if (!port)
+			return -EINVAL;
+		vlan_del_from_slave(dev, slave, in_param, port);
+		__mlx4_unregister_vlan(dev, port, in_param);
+		break;
+	default:
+		err = -EINVAL;
+		break;
+	}
+
+	return err;
+}
+
+static int counter_free_res(struct mlx4_dev *dev, int slave, int op, int cmd,
+			    u64 in_param, u64 *out_param)
+{
+	int index;
+	int err;
+
+	if (op != RES_OP_RESERVE)
+		return -EINVAL;
+
+	index = get_param_l(&in_param);
+	err = rem_res_range(dev, slave, index, 1, RES_COUNTER, 0);
+	if (err)
+		return err;
+
+	__mlx4_counter_free(dev, index);
+	mlx4_release_resource(dev, slave, RES_COUNTER, 1, 0);
+
+	return err;
+}
+
+static int xrcdn_free_res(struct mlx4_dev *dev, int slave, int op, int cmd,
+			  u64 in_param, u64 *out_param)
+{
+	int xrcdn;
+	int err;
+
+	if (op != RES_OP_RESERVE)
+		return -EINVAL;
+
+	xrcdn = get_param_l(&in_param);
+	err = rem_res_range(dev, slave, xrcdn, 1, RES_XRCD, 0);
+	if (err)
+		return err;
+
+	__mlx4_xrcd_free(dev, xrcdn);
+
+	return err;
+}
+
+int mlx4_FREE_RES_wrapper(struct mlx4_dev *dev, int slave,
+			  struct mlx4_vhcr *vhcr,
+			  struct mlx4_cmd_mailbox *inbox,
+			  struct mlx4_cmd_mailbox *outbox,
+			  struct mlx4_cmd_info *cmd)
+{
+	int err = -EINVAL;
+	int alop = vhcr->op_modifier;
+
+	switch (vhcr->in_modifier & 0xFF) {
+	case RES_QP:
+		err = qp_free_res(dev, slave, vhcr->op_modifier, alop,
+				  vhcr->in_param);
+		break;
+
+	case RES_MTT:
+		err = mtt_free_res(dev, slave, vhcr->op_modifier, alop,
+				   vhcr->in_param, &vhcr->out_param);
+		break;
+
+	case RES_MPT:
+		err = mpt_free_res(dev, slave, vhcr->op_modifier, alop,
+				   vhcr->in_param);
+		break;
+
+	case RES_CQ:
+		err = cq_free_res(dev, slave, vhcr->op_modifier, alop,
+				  vhcr->in_param, &vhcr->out_param);
+		break;
+
+	case RES_SRQ:
+		err = srq_free_res(dev, slave, vhcr->op_modifier, alop,
+				   vhcr->in_param, &vhcr->out_param);
+		break;
+
+	case RES_MAC:
+		err = mac_free_res(dev, slave, vhcr->op_modifier, alop,
+				   vhcr->in_param, &vhcr->out_param,
+				   (vhcr->in_modifier >> 8) & 0xFF);
+		break;
+
+	case RES_VLAN:
+		err = vlan_free_res(dev, slave, vhcr->op_modifier, alop,
+				    vhcr->in_param, &vhcr->out_param,
+				    (vhcr->in_modifier >> 8) & 0xFF);
+		break;
+
+	case RES_COUNTER:
+		err = counter_free_res(dev, slave, vhcr->op_modifier, alop,
+				       vhcr->in_param, &vhcr->out_param);
+		break;
+
+	case RES_XRCD:
+		err = xrcdn_free_res(dev, slave, vhcr->op_modifier, alop,
+				     vhcr->in_param, &vhcr->out_param);
+
+	default:
+		break;
+	}
+	return err;
+}
+
+/* ugly but other choices are uglier */
+static int mr_phys_mpt(struct mlx4_mpt_entry *mpt)
+{
+	return (be32_to_cpu(mpt->flags) >> 9) & 1;
+}
+
+static int mr_get_mtt_addr(struct mlx4_mpt_entry *mpt)
+{
+	return (int)be64_to_cpu(mpt->mtt_addr) & 0xfffffff8;
+}
+
+static int mr_get_mtt_size(struct mlx4_mpt_entry *mpt)
+{
+	return be32_to_cpu(mpt->mtt_sz);
+}
+
+static u32 mr_get_pd(struct mlx4_mpt_entry *mpt)
+{
+	return be32_to_cpu(mpt->pd_flags) & 0x00ffffff;
+}
+
+static int mr_is_fmr(struct mlx4_mpt_entry *mpt)
+{
+	return be32_to_cpu(mpt->pd_flags) & MLX4_MPT_PD_FLAG_FAST_REG;
+}
+
+static int mr_is_bind_enabled(struct mlx4_mpt_entry *mpt)
+{
+	return be32_to_cpu(mpt->flags) & MLX4_MPT_FLAG_BIND_ENABLE;
+}
+
+static int mr_is_region(struct mlx4_mpt_entry *mpt)
+{
+	return be32_to_cpu(mpt->flags) & MLX4_MPT_FLAG_REGION;
+}
+
+static int qp_get_mtt_addr(struct mlx4_qp_context *qpc)
+{
+	return be32_to_cpu(qpc->mtt_base_addr_l) & 0xfffffff8;
+}
+
+static int srq_get_mtt_addr(struct mlx4_srq_context *srqc)
+{
+	return be32_to_cpu(srqc->mtt_base_addr_l) & 0xfffffff8;
+}
+
+static int qp_get_mtt_size(struct mlx4_qp_context *qpc)
+{
+	int page_shift = (qpc->log_page_size & 0x3f) + 12;
+	int log_sq_size = (qpc->sq_size_stride >> 3) & 0xf;
+	int log_sq_sride = qpc->sq_size_stride & 7;
+	int log_rq_size = (qpc->rq_size_stride >> 3) & 0xf;
+	int log_rq_stride = qpc->rq_size_stride & 7;
+	int srq = (be32_to_cpu(qpc->srqn) >> 24) & 1;
+	int rss = (be32_to_cpu(qpc->flags) >> 13) & 1;
+	u32 ts = (be32_to_cpu(qpc->flags) >> 16) & 0xff;
+	int xrc = (ts == MLX4_QP_ST_XRC) ? 1 : 0;
+	int sq_size;
+	int rq_size;
+	int total_pages;
+	int total_mem;
+	int page_offset = (be32_to_cpu(qpc->params2) >> 6) & 0x3f;
+
+	sq_size = 1 << (log_sq_size + log_sq_sride + 4);
+	rq_size = (srq|rss|xrc) ? 0 : (1 << (log_rq_size + log_rq_stride + 4));
+	total_mem = sq_size + rq_size;
+	total_pages =
+		roundup_pow_of_two((total_mem + (page_offset << 6)) >>
+				   page_shift);
+
+	return total_pages;
+}
+
+static int check_mtt_range(struct mlx4_dev *dev, int slave, int start,
+			   int size, struct res_mtt *mtt)
+{
+	int res_start = mtt->com.res_id;
+	int res_size = (1 << mtt->order);
+
+	if (start < res_start || start + size > res_start + res_size)
+		return -EPERM;
+	return 0;
+}
+
+int mlx4_SW2HW_MPT_wrapper(struct mlx4_dev *dev, int slave,
+			   struct mlx4_vhcr *vhcr,
+			   struct mlx4_cmd_mailbox *inbox,
+			   struct mlx4_cmd_mailbox *outbox,
+			   struct mlx4_cmd_info *cmd)
+{
+	int err;
+	int index = vhcr->in_modifier;
+	struct res_mtt *mtt;
+	struct res_mpt *mpt;
+	int mtt_base = mr_get_mtt_addr(inbox->buf) / dev->caps.mtt_entry_sz;
+	int phys;
+	int id;
+	u32 pd;
+	int pd_slave;
+
+	id = index & mpt_mask(dev);
+	err = mr_res_start_move_to(dev, slave, id, RES_MPT_HW, &mpt);
+	if (err)
+		return err;
+
+	/* Disable memory windows for VFs. */
+	if (!mr_is_region(inbox->buf)) {
+		err = -EPERM;
+		goto ex_abort;
+	}
+
+	/* Make sure that the PD bits related to the slave id are zeros. */
+	pd = mr_get_pd(inbox->buf);
+	pd_slave = (pd >> 17) & 0x7f;
+	if (pd_slave != 0 && pd_slave != slave) {
+		err = -EPERM;
+		goto ex_abort;
+	}
+
+	if (mr_is_fmr(inbox->buf)) {
+		/* FMR and Bind Enable are forbidden in slave devices. */
+		if (mr_is_bind_enabled(inbox->buf)) {
+			err = -EPERM;
+			goto ex_abort;
+		}
+		/* FMR and Memory Windows are also forbidden. */
+		if (!mr_is_region(inbox->buf)) {
+			err = -EPERM;
+			goto ex_abort;
+		}
+	}
+
+	phys = mr_phys_mpt(inbox->buf);
+	if (!phys) {
+		err = get_res(dev, slave, mtt_base, RES_MTT, &mtt);
+		if (err)
+			goto ex_abort;
+
+		err = check_mtt_range(dev, slave, mtt_base,
+				      mr_get_mtt_size(inbox->buf), mtt);
+		if (err)
+			goto ex_put;
+
+		mpt->mtt = mtt;
+	}
+
+	err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+	if (err)
+		goto ex_put;
+
+	if (!phys) {
+		atomic_inc(&mtt->ref_count);
+		put_res(dev, slave, mtt->com.res_id, RES_MTT);
+	}
+
+	res_end_move(dev, slave, RES_MPT, id);
+	return 0;
+
+ex_put:
+	if (!phys)
+		put_res(dev, slave, mtt->com.res_id, RES_MTT);
+ex_abort:
+	res_abort_move(dev, slave, RES_MPT, id);
+
+	return err;
+}
+
+int mlx4_HW2SW_MPT_wrapper(struct mlx4_dev *dev, int slave,
+			   struct mlx4_vhcr *vhcr,
+			   struct mlx4_cmd_mailbox *inbox,
+			   struct mlx4_cmd_mailbox *outbox,
+			   struct mlx4_cmd_info *cmd)
+{
+	int err;
+	int index = vhcr->in_modifier;
+	struct res_mpt *mpt;
+	int id;
+
+	id = index & mpt_mask(dev);
+	err = mr_res_start_move_to(dev, slave, id, RES_MPT_MAPPED, &mpt);
+	if (err)
+		return err;
+
+	err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+	if (err)
+		goto ex_abort;
+
+	if (mpt->mtt)
+		atomic_dec(&mpt->mtt->ref_count);
+
+	res_end_move(dev, slave, RES_MPT, id);
+	return 0;
+
+ex_abort:
+	res_abort_move(dev, slave, RES_MPT, id);
+
+	return err;
+}
+
+int mlx4_QUERY_MPT_wrapper(struct mlx4_dev *dev, int slave,
+			   struct mlx4_vhcr *vhcr,
+			   struct mlx4_cmd_mailbox *inbox,
+			   struct mlx4_cmd_mailbox *outbox,
+			   struct mlx4_cmd_info *cmd)
+{
+	int err;
+	int index = vhcr->in_modifier;
+	struct res_mpt *mpt;
+	int id;
+
+	id = index & mpt_mask(dev);
+	err = get_res(dev, slave, id, RES_MPT, &mpt);
+	if (err)
+		return err;
+
+	if (mpt->com.from_state == RES_MPT_MAPPED) {
+		/* In order to allow rereg in SRIOV, we need to alter the MPT entry. To do
+		 * that, the VF must read the MPT. But since the MPT entry memory is not
+		 * in the VF's virtual memory space, it must use QUERY_MPT to obtain the
+		 * entry contents. To guarantee that the MPT cannot be changed, the driver
+		 * must perform HW2SW_MPT before this query and return the MPT entry to HW
+		 * ownership fofollowing the change. The change here allows the VF to
+		 * perform QUERY_MPT also when the entry is in SW ownership.
+		 */
+		struct mlx4_mpt_entry *mpt_entry = mlx4_table_find(
+					&mlx4_priv(dev)->mr_table.dmpt_table,
+					mpt->key, NULL);
+
+		if (NULL == mpt_entry || NULL == outbox->buf) {
+			err = -EINVAL;
+			goto out;
+		}
+
+		memcpy(outbox->buf, mpt_entry, sizeof(*mpt_entry));
+
+		err = 0;
+	} else if (mpt->com.from_state == RES_MPT_HW) {
+		err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+	} else {
+		err = -EBUSY;
+		goto out;
+	}
+
+
+out:
+	put_res(dev, slave, id, RES_MPT);
+	return err;
+}
+
+static int qp_get_rcqn(struct mlx4_qp_context *qpc)
+{
+	return be32_to_cpu(qpc->cqn_recv) & 0xffffff;
+}
+
+static int qp_get_scqn(struct mlx4_qp_context *qpc)
+{
+	return be32_to_cpu(qpc->cqn_send) & 0xffffff;
+}
+
+static u32 qp_get_srqn(struct mlx4_qp_context *qpc)
+{
+	return be32_to_cpu(qpc->srqn) & 0x1ffffff;
+}
+
+static void adjust_proxy_tun_qkey(struct mlx4_dev *dev, struct mlx4_vhcr *vhcr,
+				  struct mlx4_qp_context *context)
+{
+	u32 qpn = vhcr->in_modifier & 0xffffff;
+	u32 qkey = 0;
+
+	if (mlx4_get_parav_qkey(dev, qpn, &qkey))
+		return;
+
+	/* adjust qkey in qp context */
+	context->qkey = cpu_to_be32(qkey);
+}
+
+int mlx4_RST2INIT_QP_wrapper(struct mlx4_dev *dev, int slave,
+			     struct mlx4_vhcr *vhcr,
+			     struct mlx4_cmd_mailbox *inbox,
+			     struct mlx4_cmd_mailbox *outbox,
+			     struct mlx4_cmd_info *cmd)
+{
+	int err;
+	int qpn = vhcr->in_modifier & 0x7fffff;
+	struct res_mtt *mtt;
+	struct res_qp *qp;
+	struct mlx4_qp_context *qpc = inbox->buf + 8;
+	int mtt_base = qp_get_mtt_addr(qpc) / dev->caps.mtt_entry_sz;
+	int mtt_size = qp_get_mtt_size(qpc);
+	struct res_cq *rcq;
+	struct res_cq *scq;
+	int rcqn = qp_get_rcqn(qpc);
+	int scqn = qp_get_scqn(qpc);
+	u32 srqn = qp_get_srqn(qpc) & 0xffffff;
+	int use_srq = (qp_get_srqn(qpc) >> 24) & 1;
+	struct res_srq *srq;
+	int local_qpn = be32_to_cpu(qpc->local_qpn) & 0xffffff;
+
+	err = qp_res_start_move_to(dev, slave, qpn, RES_QP_HW, &qp, 0);
+	if (err)
+		return err;
+	qp->local_qpn = local_qpn;
+	qp->sched_queue = 0;
+	qp->param3 = 0;
+	qp->vlan_control = 0;
+	qp->fvl_rx = 0;
+	qp->pri_path_fl = 0;
+	qp->vlan_index = 0;
+	qp->feup = 0;
+	qp->qpc_flags = be32_to_cpu(qpc->flags);
+
+	err = get_res(dev, slave, mtt_base, RES_MTT, &mtt);
+	if (err)
+		goto ex_abort;
+
+	err = check_mtt_range(dev, slave, mtt_base, mtt_size, mtt);
+	if (err)
+		goto ex_put_mtt;
+
+	err = get_res(dev, slave, rcqn, RES_CQ, &rcq);
+	if (err)
+		goto ex_put_mtt;
+
+	if (scqn != rcqn) {
+		err = get_res(dev, slave, scqn, RES_CQ, &scq);
+		if (err)
+			goto ex_put_rcq;
+	} else
+		scq = rcq;
+
+	if (use_srq) {
+		err = get_res(dev, slave, srqn, RES_SRQ, &srq);
+		if (err)
+			goto ex_put_scq;
+	}
+
+	adjust_proxy_tun_qkey(dev, vhcr, qpc);
+	update_pkey_index(dev, slave, inbox);
+	err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+	if (err)
+		goto ex_put_srq;
+	atomic_inc(&mtt->ref_count);
+	qp->mtt = mtt;
+	atomic_inc(&rcq->ref_count);
+	qp->rcq = rcq;
+	atomic_inc(&scq->ref_count);
+	qp->scq = scq;
+
+	if (scqn != rcqn)
+		put_res(dev, slave, scqn, RES_CQ);
+
+	if (use_srq) {
+		atomic_inc(&srq->ref_count);
+		put_res(dev, slave, srqn, RES_SRQ);
+		qp->srq = srq;
+	}
+	put_res(dev, slave, rcqn, RES_CQ);
+	put_res(dev, slave, mtt_base, RES_MTT);
+	res_end_move(dev, slave, RES_QP, qpn);
+
+	return 0;
+
+ex_put_srq:
+	if (use_srq)
+		put_res(dev, slave, srqn, RES_SRQ);
+ex_put_scq:
+	if (scqn != rcqn)
+		put_res(dev, slave, scqn, RES_CQ);
+ex_put_rcq:
+	put_res(dev, slave, rcqn, RES_CQ);
+ex_put_mtt:
+	put_res(dev, slave, mtt_base, RES_MTT);
+ex_abort:
+	res_abort_move(dev, slave, RES_QP, qpn);
+
+	return err;
+}
+
+static int eq_get_mtt_addr(struct mlx4_eq_context *eqc)
+{
+	return be32_to_cpu(eqc->mtt_base_addr_l) & 0xfffffff8;
+}
+
+static int eq_get_mtt_size(struct mlx4_eq_context *eqc)
+{
+	int log_eq_size = eqc->log_eq_size & 0x1f;
+	int page_shift = (eqc->log_page_size & 0x3f) + 12;
+
+	if (log_eq_size + 5 < page_shift)
+		return 1;
+
+	return 1 << (log_eq_size + 5 - page_shift);
+}
+
+static int cq_get_mtt_addr(struct mlx4_cq_context *cqc)
+{
+	return be32_to_cpu(cqc->mtt_base_addr_l) & 0xfffffff8;
+}
+
+static int cq_get_mtt_size(struct mlx4_cq_context *cqc)
+{
+	int log_cq_size = (be32_to_cpu(cqc->logsize_usrpage) >> 24) & 0x1f;
+	int page_shift = (cqc->log_page_size & 0x3f) + 12;
+
+	if (log_cq_size + 5 < page_shift)
+		return 1;
+
+	return 1 << (log_cq_size + 5 - page_shift);
+}
+
+int mlx4_SW2HW_EQ_wrapper(struct mlx4_dev *dev, int slave,
+			  struct mlx4_vhcr *vhcr,
+			  struct mlx4_cmd_mailbox *inbox,
+			  struct mlx4_cmd_mailbox *outbox,
+			  struct mlx4_cmd_info *cmd)
+{
+	int err;
+	int eqn = vhcr->in_modifier;
+	int res_id = (slave << 8) | eqn;
+	struct mlx4_eq_context *eqc = inbox->buf;
+	int mtt_base = eq_get_mtt_addr(eqc) / dev->caps.mtt_entry_sz;
+	int mtt_size = eq_get_mtt_size(eqc);
+	struct res_eq *eq;
+	struct res_mtt *mtt;
+
+	err = add_res_range(dev, slave, res_id, 1, RES_EQ, 0);
+	if (err)
+		return err;
+	err = eq_res_start_move_to(dev, slave, res_id, RES_EQ_HW, &eq);
+	if (err)
+		goto out_add;
+
+	err = get_res(dev, slave, mtt_base, RES_MTT, &mtt);
+	if (err)
+		goto out_move;
+
+	err = check_mtt_range(dev, slave, mtt_base, mtt_size, mtt);
+	if (err)
+		goto out_put;
+
+	err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+	if (err)
+		goto out_put;
+
+	atomic_inc(&mtt->ref_count);
+	eq->mtt = mtt;
+	put_res(dev, slave, mtt->com.res_id, RES_MTT);
+	res_end_move(dev, slave, RES_EQ, res_id);
+	return 0;
+
+out_put:
+	put_res(dev, slave, mtt->com.res_id, RES_MTT);
+out_move:
+	res_abort_move(dev, slave, RES_EQ, res_id);
+out_add:
+	rem_res_range(dev, slave, res_id, 1, RES_EQ, 0);
+	return err;
+}
+
+static int get_containing_mtt(struct mlx4_dev *dev, int slave, int start,
+			      int len, struct res_mtt **res)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct res_mtt *mtt;
+	int err = -EINVAL;
+
+	spin_lock_irq(mlx4_tlock(dev));
+	list_for_each_entry(mtt, &tracker->slave_list[slave].res_list[RES_MTT],
+			    com.list) {
+		if (!check_mtt_range(dev, slave, start, len, mtt)) {
+			*res = mtt;
+			mtt->com.from_state = mtt->com.state;
+			mtt->com.state = RES_MTT_BUSY;
+			err = 0;
+			break;
+		}
+	}
+	spin_unlock_irq(mlx4_tlock(dev));
+
+	return err;
+}
+
+static int verify_qp_parameters(struct mlx4_dev *dev,
+				struct mlx4_vhcr *vhcr,
+				struct mlx4_cmd_mailbox *inbox,
+				enum qp_transition transition, u8 slave)
+{
+	u32			qp_type;
+	u32			qpn;
+	struct mlx4_qp_context	*qp_ctx;
+	enum mlx4_qp_optpar	optpar;
+	int port;
+	int num_gids;
+
+	qp_ctx  = inbox->buf + 8;
+	qp_type	= (be32_to_cpu(qp_ctx->flags) >> 16) & 0xff;
+	optpar	= be32_to_cpu(*(__be32 *) inbox->buf);
+
+	switch (qp_type) {
+	case MLX4_QP_ST_RC:
+	case MLX4_QP_ST_XRC:
+	case MLX4_QP_ST_UC:
+		switch (transition) {
+		case QP_TRANS_INIT2RTR:
+		case QP_TRANS_RTR2RTS:
+		case QP_TRANS_RTS2RTS:
+		case QP_TRANS_SQD2SQD:
+		case QP_TRANS_SQD2RTS:
+			if (slave != mlx4_master_func_num(dev))
+				if (optpar & MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH) {
+					port = (qp_ctx->pri_path.sched_queue >> 6 & 1) + 1;
+					if (dev->caps.port_mask[port] != MLX4_PORT_TYPE_IB)
+						num_gids = mlx4_get_slave_num_gids(dev, slave, port);
+					else
+						num_gids = 1;
+					if (qp_ctx->pri_path.mgid_index >= num_gids)
+						return -EINVAL;
+				}
+				if (optpar & MLX4_QP_OPTPAR_ALT_ADDR_PATH) {
+					port = (qp_ctx->alt_path.sched_queue >> 6 & 1) + 1;
+					if (dev->caps.port_mask[port] != MLX4_PORT_TYPE_IB)
+						num_gids = mlx4_get_slave_num_gids(dev, slave, port);
+					else
+						num_gids = 1;
+					if (qp_ctx->alt_path.mgid_index >= num_gids)
+						return -EINVAL;
+				}
+			break;
+		default:
+			break;
+		}
+		break;
+
+	case MLX4_QP_ST_MLX:
+		qpn = vhcr->in_modifier & 0x7fffff;
+		port = (qp_ctx->pri_path.sched_queue >> 6 & 1) + 1;
+		if (transition == QP_TRANS_INIT2RTR &&
+		    slave != mlx4_master_func_num(dev) &&
+		    mlx4_is_qp_reserved(dev, qpn) &&
+		    !mlx4_vf_smi_enabled(dev, slave, port)) {
+			/* only enabled VFs may create MLX proxy QPs */
+			mlx4_err(dev, "%s: unprivileged slave %d attempting to create an MLX proxy special QP on port %d\n",
+				 __func__, slave, port);
+			return -EPERM;
+		}
+		break;
+
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+int mlx4_WRITE_MTT_wrapper(struct mlx4_dev *dev, int slave,
+			   struct mlx4_vhcr *vhcr,
+			   struct mlx4_cmd_mailbox *inbox,
+			   struct mlx4_cmd_mailbox *outbox,
+			   struct mlx4_cmd_info *cmd)
+{
+	struct mlx4_mtt mtt;
+	__be64 *page_list = inbox->buf;
+	u64 *pg_list = (u64 *)page_list;
+	int i;
+	struct res_mtt *rmtt = NULL;
+	int start = be64_to_cpu(page_list[0]);
+	int npages = vhcr->in_modifier;
+	int err;
+
+	err = get_containing_mtt(dev, slave, start, npages, &rmtt);
+	if (err)
+		return err;
+
+	/* Call the SW implementation of write_mtt:
+	 * - Prepare a dummy mtt struct
+	 * - Translate inbox contents to simple addresses in host endianess */
+	mtt.offset = 0;  /* TBD this is broken but I don't handle it since
+			    we don't really use it */
+	mtt.order = 0;
+	mtt.page_shift = 0;
+	for (i = 0; i < npages; ++i)
+		pg_list[i + 2] = (be64_to_cpu(page_list[i + 2]) & ~1ULL);
+
+	err = __mlx4_write_mtt(dev, &mtt, be64_to_cpu(page_list[0]), npages,
+			       ((u64 *)page_list + 2));
+
+	if (rmtt)
+		put_res(dev, slave, rmtt->com.res_id, RES_MTT);
+
+	return err;
+}
+
+int mlx4_HW2SW_EQ_wrapper(struct mlx4_dev *dev, int slave,
+			  struct mlx4_vhcr *vhcr,
+			  struct mlx4_cmd_mailbox *inbox,
+			  struct mlx4_cmd_mailbox *outbox,
+			  struct mlx4_cmd_info *cmd)
+{
+	int eqn = vhcr->in_modifier;
+	int res_id = eqn | (slave << 8);
+	struct res_eq *eq;
+	int err;
+
+	err = eq_res_start_move_to(dev, slave, res_id, RES_EQ_RESERVED, &eq);
+	if (err)
+		return err;
+
+	err = get_res(dev, slave, eq->mtt->com.res_id, RES_MTT, NULL);
+	if (err)
+		goto ex_abort;
+
+	err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+	if (err)
+		goto ex_put;
+
+	atomic_dec(&eq->mtt->ref_count);
+	put_res(dev, slave, eq->mtt->com.res_id, RES_MTT);
+	res_end_move(dev, slave, RES_EQ, res_id);
+	rem_res_range(dev, slave, res_id, 1, RES_EQ, 0);
+
+	return 0;
+
+ex_put:
+	put_res(dev, slave, eq->mtt->com.res_id, RES_MTT);
+ex_abort:
+	res_abort_move(dev, slave, RES_EQ, res_id);
+
+	return err;
+}
+
+int mlx4_GEN_EQE(struct mlx4_dev *dev, int slave, struct mlx4_eqe *eqe)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_slave_event_eq_info *event_eq;
+	struct mlx4_cmd_mailbox *mailbox;
+	u32 in_modifier = 0;
+	int err;
+	int res_id;
+	struct res_eq *req;
+
+	if (!priv->mfunc.master.slave_state)
+		return -EINVAL;
+
+	event_eq = &priv->mfunc.master.slave_state[slave].event_eq[eqe->type];
+
+	/* Create the event only if the slave is registered */
+	if (event_eq->eqn < 0)
+		return 0;
+
+	mutex_lock(&priv->mfunc.master.gen_eqe_mutex[slave]);
+	res_id = (slave << 8) | event_eq->eqn;
+	err = get_res(dev, slave, res_id, RES_EQ, &req);
+	if (err)
+		goto unlock;
+
+	if (req->com.from_state != RES_EQ_HW) {
+		err = -EINVAL;
+		goto put;
+	}
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox)) {
+		err = PTR_ERR(mailbox);
+		goto put;
+	}
+
+	if (eqe->type == MLX4_EVENT_TYPE_CMD) {
+		++event_eq->token;
+		eqe->event.cmd.token = cpu_to_be16(event_eq->token);
+	}
+
+	memcpy(mailbox->buf, (u8 *) eqe, 28);
+
+	in_modifier = (slave & 0xff) | ((event_eq->eqn & 0xff) << 16);
+
+	err = mlx4_cmd(dev, mailbox->dma, in_modifier, 0,
+		       MLX4_CMD_GEN_EQE, MLX4_CMD_TIME_CLASS_B,
+		       MLX4_CMD_NATIVE);
+
+	put_res(dev, slave, res_id, RES_EQ);
+	mutex_unlock(&priv->mfunc.master.gen_eqe_mutex[slave]);
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+
+put:
+	put_res(dev, slave, res_id, RES_EQ);
+
+unlock:
+	mutex_unlock(&priv->mfunc.master.gen_eqe_mutex[slave]);
+	return err;
+}
+
+int mlx4_QUERY_EQ_wrapper(struct mlx4_dev *dev, int slave,
+			  struct mlx4_vhcr *vhcr,
+			  struct mlx4_cmd_mailbox *inbox,
+			  struct mlx4_cmd_mailbox *outbox,
+			  struct mlx4_cmd_info *cmd)
+{
+	int eqn = vhcr->in_modifier;
+	int res_id = eqn | (slave << 8);
+	struct res_eq *eq;
+	int err;
+
+	err = get_res(dev, slave, res_id, RES_EQ, &eq);
+	if (err)
+		return err;
+
+	if (eq->com.from_state != RES_EQ_HW) {
+		err = -EINVAL;
+		goto ex_put;
+	}
+
+	err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+
+ex_put:
+	put_res(dev, slave, res_id, RES_EQ);
+	return err;
+}
+
+int mlx4_SW2HW_CQ_wrapper(struct mlx4_dev *dev, int slave,
+			  struct mlx4_vhcr *vhcr,
+			  struct mlx4_cmd_mailbox *inbox,
+			  struct mlx4_cmd_mailbox *outbox,
+			  struct mlx4_cmd_info *cmd)
+{
+	int err;
+	int cqn = vhcr->in_modifier;
+	struct mlx4_cq_context *cqc = inbox->buf;
+	int mtt_base = cq_get_mtt_addr(cqc) / dev->caps.mtt_entry_sz;
+	struct res_cq *cq;
+	struct res_mtt *mtt;
+
+	err = cq_res_start_move_to(dev, slave, cqn, RES_CQ_HW, &cq);
+	if (err)
+		return err;
+	err = get_res(dev, slave, mtt_base, RES_MTT, &mtt);
+	if (err)
+		goto out_move;
+	err = check_mtt_range(dev, slave, mtt_base, cq_get_mtt_size(cqc), mtt);
+	if (err)
+		goto out_put;
+	err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+	if (err)
+		goto out_put;
+	atomic_inc(&mtt->ref_count);
+	cq->mtt = mtt;
+	put_res(dev, slave, mtt->com.res_id, RES_MTT);
+	res_end_move(dev, slave, RES_CQ, cqn);
+	return 0;
+
+out_put:
+	put_res(dev, slave, mtt->com.res_id, RES_MTT);
+out_move:
+	res_abort_move(dev, slave, RES_CQ, cqn);
+	return err;
+}
+
+int mlx4_HW2SW_CQ_wrapper(struct mlx4_dev *dev, int slave,
+			  struct mlx4_vhcr *vhcr,
+			  struct mlx4_cmd_mailbox *inbox,
+			  struct mlx4_cmd_mailbox *outbox,
+			  struct mlx4_cmd_info *cmd)
+{
+	int err;
+	int cqn = vhcr->in_modifier;
+	struct res_cq *cq;
+
+	err = cq_res_start_move_to(dev, slave, cqn, RES_CQ_ALLOCATED, &cq);
+	if (err)
+		return err;
+	err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+	if (err)
+		goto out_move;
+	atomic_dec(&cq->mtt->ref_count);
+	res_end_move(dev, slave, RES_CQ, cqn);
+	return 0;
+
+out_move:
+	res_abort_move(dev, slave, RES_CQ, cqn);
+	return err;
+}
+
+int mlx4_QUERY_CQ_wrapper(struct mlx4_dev *dev, int slave,
+			  struct mlx4_vhcr *vhcr,
+			  struct mlx4_cmd_mailbox *inbox,
+			  struct mlx4_cmd_mailbox *outbox,
+			  struct mlx4_cmd_info *cmd)
+{
+	int cqn = vhcr->in_modifier;
+	struct res_cq *cq;
+	int err;
+
+	err = get_res(dev, slave, cqn, RES_CQ, &cq);
+	if (err)
+		return err;
+
+	if (cq->com.from_state != RES_CQ_HW)
+		goto ex_put;
+
+	err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+ex_put:
+	put_res(dev, slave, cqn, RES_CQ);
+
+	return err;
+}
+
+static int handle_resize(struct mlx4_dev *dev, int slave,
+			 struct mlx4_vhcr *vhcr,
+			 struct mlx4_cmd_mailbox *inbox,
+			 struct mlx4_cmd_mailbox *outbox,
+			 struct mlx4_cmd_info *cmd,
+			 struct res_cq *cq)
+{
+	int err;
+	struct res_mtt *orig_mtt;
+	struct res_mtt *mtt;
+	struct mlx4_cq_context *cqc = inbox->buf;
+	int mtt_base = cq_get_mtt_addr(cqc) / dev->caps.mtt_entry_sz;
+
+	err = get_res(dev, slave, cq->mtt->com.res_id, RES_MTT, &orig_mtt);
+	if (err)
+		return err;
+
+	if (orig_mtt != cq->mtt) {
+		err = -EINVAL;
+		goto ex_put;
+	}
+
+	err = get_res(dev, slave, mtt_base, RES_MTT, &mtt);
+	if (err)
+		goto ex_put;
+
+	err = check_mtt_range(dev, slave, mtt_base, cq_get_mtt_size(cqc), mtt);
+	if (err)
+		goto ex_put1;
+	err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+	if (err)
+		goto ex_put1;
+	atomic_dec(&orig_mtt->ref_count);
+	put_res(dev, slave, orig_mtt->com.res_id, RES_MTT);
+	atomic_inc(&mtt->ref_count);
+	cq->mtt = mtt;
+	put_res(dev, slave, mtt->com.res_id, RES_MTT);
+	return 0;
+
+ex_put1:
+	put_res(dev, slave, mtt->com.res_id, RES_MTT);
+ex_put:
+	put_res(dev, slave, orig_mtt->com.res_id, RES_MTT);
+
+	return err;
+
+}
+
+int mlx4_MODIFY_CQ_wrapper(struct mlx4_dev *dev, int slave,
+			   struct mlx4_vhcr *vhcr,
+			   struct mlx4_cmd_mailbox *inbox,
+			   struct mlx4_cmd_mailbox *outbox,
+			   struct mlx4_cmd_info *cmd)
+{
+	int cqn = vhcr->in_modifier;
+	struct res_cq *cq;
+	int err;
+
+	err = get_res(dev, slave, cqn, RES_CQ, &cq);
+	if (err)
+		return err;
+
+	if (cq->com.from_state != RES_CQ_HW)
+		goto ex_put;
+
+	if (vhcr->op_modifier == 0) {
+		err = handle_resize(dev, slave, vhcr, inbox, outbox, cmd, cq);
+		goto ex_put;
+	}
+
+	err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+ex_put:
+	put_res(dev, slave, cqn, RES_CQ);
+
+	return err;
+}
+
+static int srq_get_mtt_size(struct mlx4_srq_context *srqc)
+{
+	int log_srq_size = (be32_to_cpu(srqc->state_logsize_srqn) >> 24) & 0xf;
+	int log_rq_stride = srqc->logstride & 7;
+	int page_shift = (srqc->log_page_size & 0x3f) + 12;
+
+	if (log_srq_size + log_rq_stride + 4 < page_shift)
+		return 1;
+
+	return 1 << (log_srq_size + log_rq_stride + 4 - page_shift);
+}
+
+int mlx4_SW2HW_SRQ_wrapper(struct mlx4_dev *dev, int slave,
+			   struct mlx4_vhcr *vhcr,
+			   struct mlx4_cmd_mailbox *inbox,
+			   struct mlx4_cmd_mailbox *outbox,
+			   struct mlx4_cmd_info *cmd)
+{
+	int err;
+	int srqn = vhcr->in_modifier;
+	struct res_mtt *mtt;
+	struct res_srq *srq;
+	struct mlx4_srq_context *srqc = inbox->buf;
+	int mtt_base = srq_get_mtt_addr(srqc) / dev->caps.mtt_entry_sz;
+
+	if (srqn != (be32_to_cpu(srqc->state_logsize_srqn) & 0xffffff))
+		return -EINVAL;
+
+	err = srq_res_start_move_to(dev, slave, srqn, RES_SRQ_HW, &srq);
+	if (err)
+		return err;
+	err = get_res(dev, slave, mtt_base, RES_MTT, &mtt);
+	if (err)
+		goto ex_abort;
+	err = check_mtt_range(dev, slave, mtt_base, srq_get_mtt_size(srqc),
+			      mtt);
+	if (err)
+		goto ex_put_mtt;
+
+	err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+	if (err)
+		goto ex_put_mtt;
+
+	atomic_inc(&mtt->ref_count);
+	srq->mtt = mtt;
+	put_res(dev, slave, mtt->com.res_id, RES_MTT);
+	res_end_move(dev, slave, RES_SRQ, srqn);
+	return 0;
+
+ex_put_mtt:
+	put_res(dev, slave, mtt->com.res_id, RES_MTT);
+ex_abort:
+	res_abort_move(dev, slave, RES_SRQ, srqn);
+
+	return err;
+}
+
+int mlx4_HW2SW_SRQ_wrapper(struct mlx4_dev *dev, int slave,
+			   struct mlx4_vhcr *vhcr,
+			   struct mlx4_cmd_mailbox *inbox,
+			   struct mlx4_cmd_mailbox *outbox,
+			   struct mlx4_cmd_info *cmd)
+{
+	int err;
+	int srqn = vhcr->in_modifier;
+	struct res_srq *srq;
+
+	err = srq_res_start_move_to(dev, slave, srqn, RES_SRQ_ALLOCATED, &srq);
+	if (err)
+		return err;
+	err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+	if (err)
+		goto ex_abort;
+	atomic_dec(&srq->mtt->ref_count);
+	if (srq->cq)
+		atomic_dec(&srq->cq->ref_count);
+	res_end_move(dev, slave, RES_SRQ, srqn);
+
+	return 0;
+
+ex_abort:
+	res_abort_move(dev, slave, RES_SRQ, srqn);
+
+	return err;
+}
+
+int mlx4_QUERY_SRQ_wrapper(struct mlx4_dev *dev, int slave,
+			   struct mlx4_vhcr *vhcr,
+			   struct mlx4_cmd_mailbox *inbox,
+			   struct mlx4_cmd_mailbox *outbox,
+			   struct mlx4_cmd_info *cmd)
+{
+	int err;
+	int srqn = vhcr->in_modifier;
+	struct res_srq *srq;
+
+	err = get_res(dev, slave, srqn, RES_SRQ, &srq);
+	if (err)
+		return err;
+	if (srq->com.from_state != RES_SRQ_HW) {
+		err = -EBUSY;
+		goto out;
+	}
+	err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+out:
+	put_res(dev, slave, srqn, RES_SRQ);
+	return err;
+}
+
+int mlx4_ARM_SRQ_wrapper(struct mlx4_dev *dev, int slave,
+			 struct mlx4_vhcr *vhcr,
+			 struct mlx4_cmd_mailbox *inbox,
+			 struct mlx4_cmd_mailbox *outbox,
+			 struct mlx4_cmd_info *cmd)
+{
+	int err;
+	int srqn = vhcr->in_modifier;
+	struct res_srq *srq;
+
+	err = get_res(dev, slave, srqn, RES_SRQ, &srq);
+	if (err)
+		return err;
+
+	if (srq->com.from_state != RES_SRQ_HW) {
+		err = -EBUSY;
+		goto out;
+	}
+
+	err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+out:
+	put_res(dev, slave, srqn, RES_SRQ);
+	return err;
+}
+
+int mlx4_GEN_QP_wrapper(struct mlx4_dev *dev, int slave,
+			struct mlx4_vhcr *vhcr,
+			struct mlx4_cmd_mailbox *inbox,
+			struct mlx4_cmd_mailbox *outbox,
+			struct mlx4_cmd_info *cmd)
+{
+	int err;
+	int qpn = vhcr->in_modifier & 0x7fffff;
+	struct res_qp *qp;
+
+	err = get_res(dev, slave, qpn, RES_QP, &qp);
+	if (err)
+		return err;
+	if (qp->com.from_state != RES_QP_HW) {
+		err = -EBUSY;
+		goto out;
+	}
+
+	err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+out:
+	put_res(dev, slave, qpn, RES_QP);
+	return err;
+}
+
+int mlx4_INIT2INIT_QP_wrapper(struct mlx4_dev *dev, int slave,
+			      struct mlx4_vhcr *vhcr,
+			      struct mlx4_cmd_mailbox *inbox,
+			      struct mlx4_cmd_mailbox *outbox,
+			      struct mlx4_cmd_info *cmd)
+{
+	struct mlx4_qp_context *context = inbox->buf + 8;
+	adjust_proxy_tun_qkey(dev, vhcr, context);
+	update_pkey_index(dev, slave, inbox);
+	return mlx4_GEN_QP_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+}
+
+static int adjust_qp_sched_queue(struct mlx4_dev *dev, int slave,
+				  struct mlx4_qp_context *qpc,
+				  struct mlx4_cmd_mailbox *inbox)
+{
+	enum mlx4_qp_optpar optpar = be32_to_cpu(*(__be32 *)inbox->buf);
+	u8 pri_sched_queue;
+	int port = mlx4_slave_convert_port(
+		   dev, slave, (qpc->pri_path.sched_queue >> 6 & 1) + 1) - 1;
+
+	if (port < 0)
+		return -EINVAL;
+
+	pri_sched_queue = (qpc->pri_path.sched_queue & ~(1 << 6)) |
+			  ((port & 1) << 6);
+
+	if (optpar & MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH ||
+	    mlx4_is_eth(dev, port + 1)) {
+		qpc->pri_path.sched_queue = pri_sched_queue;
+	}
+
+	if (optpar & MLX4_QP_OPTPAR_ALT_ADDR_PATH) {
+		port = mlx4_slave_convert_port(
+				dev, slave, (qpc->alt_path.sched_queue >> 6 & 1)
+				+ 1) - 1;
+		if (port < 0)
+			return -EINVAL;
+		qpc->alt_path.sched_queue =
+			(qpc->alt_path.sched_queue & ~(1 << 6)) |
+			(port & 1) << 6;
+	}
+	return 0;
+}
+
+static int roce_verify_mac(struct mlx4_dev *dev, int slave,
+				struct mlx4_qp_context *qpc,
+				struct mlx4_cmd_mailbox *inbox)
+{
+	u64 mac;
+	int port;
+	u32 ts = (be32_to_cpu(qpc->flags) >> 16) & 0xff;
+	u8 sched = *(u8 *)(inbox->buf + 64);
+	u8 smac_ix;
+
+	port = (sched >> 6 & 1) + 1;
+	if (mlx4_is_eth(dev, port) && (ts != MLX4_QP_ST_MLX)) {
+		smac_ix = qpc->pri_path.grh_mylmc & 0x7f;
+		if (mac_find_smac_ix_in_slave(dev, slave, port, smac_ix, &mac))
+			return -ENOENT;
+	}
+	return 0;
+}
+
+int mlx4_INIT2RTR_QP_wrapper(struct mlx4_dev *dev, int slave,
+			     struct mlx4_vhcr *vhcr,
+			     struct mlx4_cmd_mailbox *inbox,
+			     struct mlx4_cmd_mailbox *outbox,
+			     struct mlx4_cmd_info *cmd)
+{
+	int err;
+	struct mlx4_qp_context *qpc = inbox->buf + 8;
+	int qpn = vhcr->in_modifier & 0x7fffff;
+	struct res_qp *qp;
+	u8 orig_sched_queue;
+	__be32	orig_param3 = qpc->param3;
+	u8 orig_vlan_control = qpc->pri_path.vlan_control;
+	u8 orig_fvl_rx = qpc->pri_path.fvl_rx;
+	u8 orig_pri_path_fl = qpc->pri_path.fl;
+	u8 orig_vlan_index = qpc->pri_path.vlan_index;
+	u8 orig_feup = qpc->pri_path.feup;
+
+	err = adjust_qp_sched_queue(dev, slave, qpc, inbox);
+	if (err)
+		return err;
+	err = verify_qp_parameters(dev, vhcr, inbox, QP_TRANS_INIT2RTR, slave);
+	if (err)
+		return err;
+
+	if (roce_verify_mac(dev, slave, qpc, inbox))
+		return -EINVAL;
+
+	update_pkey_index(dev, slave, inbox);
+	update_gid(dev, inbox, (u8)slave);
+	adjust_proxy_tun_qkey(dev, vhcr, qpc);
+	orig_sched_queue = qpc->pri_path.sched_queue;
+	err = update_vport_qp_param(dev, inbox, slave, qpn);
+	if (err)
+		return err;
+
+	err = get_res(dev, slave, qpn, RES_QP, &qp);
+	if (err)
+		return err;
+	if (qp->com.from_state != RES_QP_HW) {
+		err = -EBUSY;
+		goto out;
+	}
+
+	err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+out:
+	/* if no error, save sched queue value passed in by VF. This is
+	 * essentially the QOS value provided by the VF. This will be useful
+	 * if we allow dynamic changes from VST back to VGT
+	 */
+	if (!err) {
+		qp->sched_queue = orig_sched_queue;
+		qp->param3	= orig_param3;
+		qp->vlan_control = orig_vlan_control;
+		qp->fvl_rx	=  orig_fvl_rx;
+		qp->pri_path_fl = orig_pri_path_fl;
+		qp->vlan_index  = orig_vlan_index;
+		qp->feup	= orig_feup;
+	}
+	put_res(dev, slave, qpn, RES_QP);
+	return err;
+}
+
+int mlx4_RTR2RTS_QP_wrapper(struct mlx4_dev *dev, int slave,
+			    struct mlx4_vhcr *vhcr,
+			    struct mlx4_cmd_mailbox *inbox,
+			    struct mlx4_cmd_mailbox *outbox,
+			    struct mlx4_cmd_info *cmd)
+{
+	int err;
+	struct mlx4_qp_context *context = inbox->buf + 8;
+
+	err = adjust_qp_sched_queue(dev, slave, context, inbox);
+	if (err)
+		return err;
+	err = verify_qp_parameters(dev, vhcr, inbox, QP_TRANS_RTR2RTS, slave);
+	if (err)
+		return err;
+
+	update_pkey_index(dev, slave, inbox);
+	update_gid(dev, inbox, (u8)slave);
+	adjust_proxy_tun_qkey(dev, vhcr, context);
+	return mlx4_GEN_QP_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+}
+
+int mlx4_RTS2RTS_QP_wrapper(struct mlx4_dev *dev, int slave,
+			    struct mlx4_vhcr *vhcr,
+			    struct mlx4_cmd_mailbox *inbox,
+			    struct mlx4_cmd_mailbox *outbox,
+			    struct mlx4_cmd_info *cmd)
+{
+	int err;
+	struct mlx4_qp_context *context = inbox->buf + 8;
+
+	err = adjust_qp_sched_queue(dev, slave, context, inbox);
+	if (err)
+		return err;
+	err = verify_qp_parameters(dev, vhcr, inbox, QP_TRANS_RTS2RTS, slave);
+	if (err)
+		return err;
+
+	update_pkey_index(dev, slave, inbox);
+	update_gid(dev, inbox, (u8)slave);
+	adjust_proxy_tun_qkey(dev, vhcr, context);
+	return mlx4_GEN_QP_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+}
+
+
+int mlx4_SQERR2RTS_QP_wrapper(struct mlx4_dev *dev, int slave,
+			      struct mlx4_vhcr *vhcr,
+			      struct mlx4_cmd_mailbox *inbox,
+			      struct mlx4_cmd_mailbox *outbox,
+			      struct mlx4_cmd_info *cmd)
+{
+	struct mlx4_qp_context *context = inbox->buf + 8;
+	int err = adjust_qp_sched_queue(dev, slave, context, inbox);
+	if (err)
+		return err;
+	adjust_proxy_tun_qkey(dev, vhcr, context);
+	return mlx4_GEN_QP_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+}
+
+int mlx4_SQD2SQD_QP_wrapper(struct mlx4_dev *dev, int slave,
+			    struct mlx4_vhcr *vhcr,
+			    struct mlx4_cmd_mailbox *inbox,
+			    struct mlx4_cmd_mailbox *outbox,
+			    struct mlx4_cmd_info *cmd)
+{
+	int err;
+	struct mlx4_qp_context *context = inbox->buf + 8;
+
+	err = adjust_qp_sched_queue(dev, slave, context, inbox);
+	if (err)
+		return err;
+	err = verify_qp_parameters(dev, vhcr, inbox, QP_TRANS_SQD2SQD, slave);
+	if (err)
+		return err;
+
+	adjust_proxy_tun_qkey(dev, vhcr, context);
+	update_gid(dev, inbox, (u8)slave);
+	update_pkey_index(dev, slave, inbox);
+	return mlx4_GEN_QP_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+}
+
+int mlx4_SQD2RTS_QP_wrapper(struct mlx4_dev *dev, int slave,
+			    struct mlx4_vhcr *vhcr,
+			    struct mlx4_cmd_mailbox *inbox,
+			    struct mlx4_cmd_mailbox *outbox,
+			    struct mlx4_cmd_info *cmd)
+{
+	int err;
+	struct mlx4_qp_context *context = inbox->buf + 8;
+
+	err = adjust_qp_sched_queue(dev, slave, context, inbox);
+	if (err)
+		return err;
+	err = verify_qp_parameters(dev, vhcr, inbox, QP_TRANS_SQD2RTS, slave);
+	if (err)
+		return err;
+
+	adjust_proxy_tun_qkey(dev, vhcr, context);
+	update_gid(dev, inbox, (u8)slave);
+	update_pkey_index(dev, slave, inbox);
+	return mlx4_GEN_QP_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+}
+
+int mlx4_2RST_QP_wrapper(struct mlx4_dev *dev, int slave,
+			 struct mlx4_vhcr *vhcr,
+			 struct mlx4_cmd_mailbox *inbox,
+			 struct mlx4_cmd_mailbox *outbox,
+			 struct mlx4_cmd_info *cmd)
+{
+	int err;
+	int qpn = vhcr->in_modifier & 0x7fffff;
+	struct res_qp *qp;
+
+	err = qp_res_start_move_to(dev, slave, qpn, RES_QP_MAPPED, &qp, 0);
+	if (err)
+		return err;
+	err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+	if (err)
+		goto ex_abort;
+
+	atomic_dec(&qp->mtt->ref_count);
+	atomic_dec(&qp->rcq->ref_count);
+	atomic_dec(&qp->scq->ref_count);
+	if (qp->srq)
+		atomic_dec(&qp->srq->ref_count);
+	res_end_move(dev, slave, RES_QP, qpn);
+	return 0;
+
+ex_abort:
+	res_abort_move(dev, slave, RES_QP, qpn);
+
+	return err;
+}
+
+static struct res_gid *find_gid(struct mlx4_dev *dev, int slave,
+				struct res_qp *rqp, u8 *gid)
+{
+	struct res_gid *res;
+
+	list_for_each_entry(res, &rqp->mcg_list, list) {
+		if (!memcmp(res->gid, gid, 16))
+			return res;
+	}
+	return NULL;
+}
+
+static int add_mcg_res(struct mlx4_dev *dev, int slave, struct res_qp *rqp,
+		       u8 *gid, enum mlx4_protocol prot,
+		       enum mlx4_steer_type steer, u64 reg_id)
+{
+	struct res_gid *res;
+	int err;
+
+	res = kzalloc(sizeof *res, GFP_KERNEL);
+	if (!res)
+		return -ENOMEM;
+
+	spin_lock_irq(&rqp->mcg_spl);
+	if (find_gid(dev, slave, rqp, gid)) {
+		kfree(res);
+		err = -EEXIST;
+	} else {
+		memcpy(res->gid, gid, 16);
+		res->prot = prot;
+		res->steer = steer;
+		res->reg_id = reg_id;
+		list_add_tail(&res->list, &rqp->mcg_list);
+		err = 0;
+	}
+	spin_unlock_irq(&rqp->mcg_spl);
+
+	return err;
+}
+
+static int rem_mcg_res(struct mlx4_dev *dev, int slave, struct res_qp *rqp,
+		       u8 *gid, enum mlx4_protocol prot,
+		       enum mlx4_steer_type steer, u64 *reg_id)
+{
+	struct res_gid *res;
+	int err;
+
+	spin_lock_irq(&rqp->mcg_spl);
+	res = find_gid(dev, slave, rqp, gid);
+	if (!res || res->prot != prot || res->steer != steer)
+		err = -EINVAL;
+	else {
+		*reg_id = res->reg_id;
+		list_del(&res->list);
+		kfree(res);
+		err = 0;
+	}
+	spin_unlock_irq(&rqp->mcg_spl);
+
+	return err;
+}
+
+static int qp_attach(struct mlx4_dev *dev, int slave, struct mlx4_qp *qp,
+		     u8 gid[16], int block_loopback, enum mlx4_protocol prot,
+		     enum mlx4_steer_type type, u64 *reg_id)
+{
+	switch (dev->caps.steering_mode) {
+	case MLX4_STEERING_MODE_DEVICE_MANAGED: {
+		int port = mlx4_slave_convert_port(dev, slave, gid[5]);
+		if (port < 0)
+			return port;
+		return mlx4_trans_to_dmfs_attach(dev, qp, gid, port,
+						block_loopback, prot,
+						reg_id);
+	}
+	case MLX4_STEERING_MODE_B0:
+		if (prot == MLX4_PROT_ETH) {
+			int port = mlx4_slave_convert_port(dev, slave, gid[5]);
+			if (port < 0)
+				return port;
+			gid[5] = port;
+		}
+		return mlx4_qp_attach_common(dev, qp, gid,
+					    block_loopback, prot, type);
+	default:
+		return -EINVAL;
+	}
+}
+
+static int qp_detach(struct mlx4_dev *dev, struct mlx4_qp *qp,
+		     u8 gid[16], enum mlx4_protocol prot,
+		     enum mlx4_steer_type type, u64 reg_id)
+{
+	switch (dev->caps.steering_mode) {
+	case MLX4_STEERING_MODE_DEVICE_MANAGED:
+		return mlx4_flow_detach(dev, reg_id);
+	case MLX4_STEERING_MODE_B0:
+		return mlx4_qp_detach_common(dev, qp, gid, prot, type);
+	default:
+		return -EINVAL;
+	}
+}
+
+static int mlx4_adjust_port(struct mlx4_dev *dev, int slave,
+			    u8 *gid, enum mlx4_protocol prot)
+{
+	int real_port;
+
+	if (prot != MLX4_PROT_ETH)
+		return 0;
+
+	if (dev->caps.steering_mode == MLX4_STEERING_MODE_B0 ||
+	    dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) {
+		real_port = mlx4_slave_convert_port(dev, slave, gid[5]);
+		if (real_port < 0)
+			return -EINVAL;
+		gid[5] = real_port;
+	}
+
+	return 0;
+}
+
+int mlx4_QP_ATTACH_wrapper(struct mlx4_dev *dev, int slave,
+			       struct mlx4_vhcr *vhcr,
+			       struct mlx4_cmd_mailbox *inbox,
+			       struct mlx4_cmd_mailbox *outbox,
+			       struct mlx4_cmd_info *cmd)
+{
+	struct mlx4_qp qp; /* dummy for calling attach/detach */
+	u8 *gid = inbox->buf;
+	enum mlx4_protocol prot = (vhcr->in_modifier >> 28) & 0x7;
+	int err;
+	int qpn;
+	struct res_qp *rqp;
+	u64 reg_id = 0;
+	int attach = vhcr->op_modifier;
+	int block_loopback = vhcr->in_modifier >> 31;
+	u8 steer_type_mask = 2;
+	enum mlx4_steer_type type = (gid[7] & steer_type_mask) >> 1;
+
+	qpn = vhcr->in_modifier & 0xffffff;
+	err = get_res(dev, slave, qpn, RES_QP, &rqp);
+	if (err)
+		return err;
+
+	qp.qpn = qpn;
+	if (attach) {
+		err = qp_attach(dev, slave, &qp, gid, block_loopback, prot,
+				type, &reg_id);
+		if (err) {
+			pr_err("Fail to attach rule to qp 0x%x\n", qpn);
+			goto ex_put;
+		}
+		err = add_mcg_res(dev, slave, rqp, gid, prot, type, reg_id);
+		if (err)
+			goto ex_detach;
+	} else {
+		err = mlx4_adjust_port(dev, slave, gid, prot);
+		if (err)
+			goto ex_put;
+
+		err = rem_mcg_res(dev, slave, rqp, gid, prot, type, &reg_id);
+		if (err)
+			goto ex_put;
+
+		err = qp_detach(dev, &qp, gid, prot, type, reg_id);
+		if (err)
+			pr_err("Fail to detach rule from qp 0x%x reg_id = 0x%llx\n",
+			       qpn, reg_id);
+	}
+	put_res(dev, slave, qpn, RES_QP);
+	return err;
+
+ex_detach:
+	qp_detach(dev, &qp, gid, prot, type, reg_id);
+ex_put:
+	put_res(dev, slave, qpn, RES_QP);
+	return err;
+}
+
+/*
+ * MAC validation for Flow Steering rules.
+ * VF can attach rules only with a mac address which is assigned to it.
+ */
+static int validate_eth_header_mac(int slave, struct _rule_hw *eth_header,
+				   struct list_head *rlist)
+{
+	struct mac_res *res, *tmp;
+	__be64 be_mac;
+
+	/* make sure it isn't multicast or broadcast mac*/
+	if (!is_multicast_ether_addr(eth_header->eth.dst_mac) &&
+	    !is_broadcast_ether_addr(eth_header->eth.dst_mac)) {
+		list_for_each_entry_safe(res, tmp, rlist, list) {
+			be_mac = cpu_to_be64(res->mac << 16);
+			if (ether_addr_equal((u8 *)&be_mac, eth_header->eth.dst_mac))
+				return 0;
+		}
+		pr_err("MAC %pM doesn't belong to VF %d, Steering rule rejected\n",
+		       eth_header->eth.dst_mac, slave);
+		return -EINVAL;
+	}
+	return 0;
+}
+
+/*
+ * In case of missing eth header, append eth header with a MAC address
+ * assigned to the VF.
+ */
+static int add_eth_header(struct mlx4_dev *dev, int slave,
+			  struct mlx4_cmd_mailbox *inbox,
+			  struct list_head *rlist, int header_id)
+{
+	struct mac_res *res, *tmp;
+	u8 port;
+	struct mlx4_net_trans_rule_hw_ctrl *ctrl;
+	struct mlx4_net_trans_rule_hw_eth *eth_header;
+	struct mlx4_net_trans_rule_hw_ipv4 *ip_header;
+	struct mlx4_net_trans_rule_hw_tcp_udp *l4_header;
+	__be64 be_mac = 0;
+	__be64 mac_msk = cpu_to_be64(MLX4_MAC_MASK << 16);
+
+	ctrl = (struct mlx4_net_trans_rule_hw_ctrl *)inbox->buf;
+	port = ctrl->port;
+	eth_header = (struct mlx4_net_trans_rule_hw_eth *)(ctrl + 1);
+
+	/* Clear a space in the inbox for eth header */
+	switch (header_id) {
+	case MLX4_NET_TRANS_RULE_ID_IPV4:
+		ip_header =
+			(struct mlx4_net_trans_rule_hw_ipv4 *)(eth_header + 1);
+		memmove(ip_header, eth_header,
+			sizeof(*ip_header) + sizeof(*l4_header));
+		break;
+	case MLX4_NET_TRANS_RULE_ID_TCP:
+	case MLX4_NET_TRANS_RULE_ID_UDP:
+		l4_header = (struct mlx4_net_trans_rule_hw_tcp_udp *)
+			    (eth_header + 1);
+		memmove(l4_header, eth_header, sizeof(*l4_header));
+		break;
+	default:
+		return -EINVAL;
+	}
+	list_for_each_entry_safe(res, tmp, rlist, list) {
+		if (port == res->port) {
+			be_mac = cpu_to_be64(res->mac << 16);
+			break;
+		}
+	}
+	if (!be_mac) {
+		pr_err("Failed adding eth header to FS rule, Can't find matching MAC for port %d\n",
+		       port);
+		return -EINVAL;
+	}
+
+	memset(eth_header, 0, sizeof(*eth_header));
+	eth_header->size = sizeof(*eth_header) >> 2;
+	eth_header->id = cpu_to_be16(__sw_id_hw[MLX4_NET_TRANS_RULE_ID_ETH]);
+	memcpy(eth_header->dst_mac, &be_mac, ETH_ALEN);
+	memcpy(eth_header->dst_mac_msk, &mac_msk, ETH_ALEN);
+
+	return 0;
+
+}
+
+#define MLX4_UPD_QP_PATH_MASK_SUPPORTED (1ULL << MLX4_UPD_QP_PATH_MASK_MAC_INDEX)
+int mlx4_UPDATE_QP_wrapper(struct mlx4_dev *dev, int slave,
+			   struct mlx4_vhcr *vhcr,
+			   struct mlx4_cmd_mailbox *inbox,
+			   struct mlx4_cmd_mailbox *outbox,
+			   struct mlx4_cmd_info *cmd_info)
+{
+	int err;
+	u32 qpn = vhcr->in_modifier & 0xffffff;
+	struct res_qp *rqp;
+	u64 mac;
+	unsigned port;
+	u64 pri_addr_path_mask;
+	struct mlx4_update_qp_context *cmd;
+	int smac_index;
+
+	cmd = (struct mlx4_update_qp_context *)inbox->buf;
+
+	pri_addr_path_mask = be64_to_cpu(cmd->primary_addr_path_mask);
+	if (cmd->qp_mask || cmd->secondary_addr_path_mask ||
+	    (pri_addr_path_mask & ~MLX4_UPD_QP_PATH_MASK_SUPPORTED))
+		return -EPERM;
+
+	/* Just change the smac for the QP */
+	err = get_res(dev, slave, qpn, RES_QP, &rqp);
+	if (err) {
+		mlx4_err(dev, "Updating qpn 0x%x for slave %d rejected\n", qpn, slave);
+		return err;
+	}
+
+	port = (rqp->sched_queue >> 6 & 1) + 1;
+
+	if (pri_addr_path_mask & (1ULL << MLX4_UPD_QP_PATH_MASK_MAC_INDEX)) {
+		smac_index = cmd->qp_context.pri_path.grh_mylmc;
+		err = mac_find_smac_ix_in_slave(dev, slave, port,
+						smac_index, &mac);
+
+		if (err) {
+			mlx4_err(dev, "Failed to update qpn 0x%x, MAC is invalid. smac_ix: %d\n",
+				 qpn, smac_index);
+			goto err_mac;
+		}
+	}
+
+	err = mlx4_cmd(dev, inbox->dma,
+		       vhcr->in_modifier, 0,
+		       MLX4_CMD_UPDATE_QP, MLX4_CMD_TIME_CLASS_A,
+		       MLX4_CMD_NATIVE);
+	if (err) {
+		mlx4_err(dev, "Failed to update qpn on qpn 0x%x, command failed\n", qpn);
+		goto err_mac;
+	}
+
+err_mac:
+	put_res(dev, slave, qpn, RES_QP);
+	return err;
+}
+
+int mlx4_QP_FLOW_STEERING_ATTACH_wrapper(struct mlx4_dev *dev, int slave,
+					 struct mlx4_vhcr *vhcr,
+					 struct mlx4_cmd_mailbox *inbox,
+					 struct mlx4_cmd_mailbox *outbox,
+					 struct mlx4_cmd_info *cmd)
+{
+
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct list_head *rlist = &tracker->slave_list[slave].res_list[RES_MAC];
+	int err;
+	int qpn;
+	struct res_qp *rqp;
+	struct mlx4_net_trans_rule_hw_ctrl *ctrl;
+	struct _rule_hw  *rule_header;
+	int header_id;
+
+	if (dev->caps.steering_mode !=
+	    MLX4_STEERING_MODE_DEVICE_MANAGED)
+		return -EOPNOTSUPP;
+
+	ctrl = (struct mlx4_net_trans_rule_hw_ctrl *)inbox->buf;
+	ctrl->port = mlx4_slave_convert_port(dev, slave, ctrl->port);
+	if (ctrl->port <= 0)
+		return -EINVAL;
+	qpn = be32_to_cpu(ctrl->qpn) & 0xffffff;
+	err = get_res(dev, slave, qpn, RES_QP, &rqp);
+	if (err) {
+		pr_err("Steering rule with qpn 0x%x rejected\n", qpn);
+		return err;
+	}
+	rule_header = (struct _rule_hw *)(ctrl + 1);
+	header_id = map_hw_to_sw_id(be16_to_cpu(rule_header->id));
+
+	switch (header_id) {
+	case MLX4_NET_TRANS_RULE_ID_ETH:
+		if (validate_eth_header_mac(slave, rule_header, rlist)) {
+			err = -EINVAL;
+			goto err_put;
+		}
+		break;
+	case MLX4_NET_TRANS_RULE_ID_IB:
+		break;
+	case MLX4_NET_TRANS_RULE_ID_IPV4:
+	case MLX4_NET_TRANS_RULE_ID_TCP:
+	case MLX4_NET_TRANS_RULE_ID_UDP:
+		pr_warn("Can't attach FS rule without L2 headers, adding L2 header\n");
+		if (add_eth_header(dev, slave, inbox, rlist, header_id)) {
+			err = -EINVAL;
+			goto err_put;
+		}
+		vhcr->in_modifier +=
+			sizeof(struct mlx4_net_trans_rule_hw_eth) >> 2;
+		break;
+	default:
+		pr_err("Corrupted mailbox\n");
+		err = -EINVAL;
+		goto err_put;
+	}
+
+	err = mlx4_cmd_imm(dev, inbox->dma, &vhcr->out_param,
+			   vhcr->in_modifier, 0,
+			   MLX4_QP_FLOW_STEERING_ATTACH, MLX4_CMD_TIME_CLASS_A,
+			   MLX4_CMD_NATIVE);
+	if (err)
+		goto err_put;
+
+	err = add_res_range(dev, slave, vhcr->out_param, 1, RES_FS_RULE, qpn);
+	if (err) {
+		mlx4_err(dev, "Fail to add flow steering resources\n");
+		/* detach rule*/
+		mlx4_cmd(dev, vhcr->out_param, 0, 0,
+			 MLX4_QP_FLOW_STEERING_DETACH, MLX4_CMD_TIME_CLASS_A,
+			 MLX4_CMD_NATIVE);
+		goto err_put;
+	}
+	atomic_inc(&rqp->ref_count);
+err_put:
+	put_res(dev, slave, qpn, RES_QP);
+	return err;
+}
+
+int mlx4_QP_FLOW_STEERING_DETACH_wrapper(struct mlx4_dev *dev, int slave,
+					 struct mlx4_vhcr *vhcr,
+					 struct mlx4_cmd_mailbox *inbox,
+					 struct mlx4_cmd_mailbox *outbox,
+					 struct mlx4_cmd_info *cmd)
+{
+	int err;
+	struct res_qp *rqp;
+	struct res_fs_rule *rrule;
+
+	if (dev->caps.steering_mode !=
+	    MLX4_STEERING_MODE_DEVICE_MANAGED)
+		return -EOPNOTSUPP;
+
+	err = get_res(dev, slave, vhcr->in_param, RES_FS_RULE, &rrule);
+	if (err)
+		return err;
+	/* Release the rule form busy state before removal */
+	put_res(dev, slave, vhcr->in_param, RES_FS_RULE);
+	err = get_res(dev, slave, rrule->qpn, RES_QP, &rqp);
+	if (err)
+		return err;
+
+	err = rem_res_range(dev, slave, vhcr->in_param, 1, RES_FS_RULE, 0);
+	if (err) {
+		mlx4_err(dev, "Fail to remove flow steering resources\n");
+		goto out;
+	}
+
+	err = mlx4_cmd(dev, vhcr->in_param, 0, 0,
+		       MLX4_QP_FLOW_STEERING_DETACH, MLX4_CMD_TIME_CLASS_A,
+		       MLX4_CMD_NATIVE);
+	if (!err)
+		atomic_dec(&rqp->ref_count);
+out:
+	put_res(dev, slave, rrule->qpn, RES_QP);
+	return err;
+}
+
+enum {
+	BUSY_MAX_RETRIES = 10
+};
+
+int mlx4_QUERY_IF_STAT_wrapper(struct mlx4_dev *dev, int slave,
+			       struct mlx4_vhcr *vhcr,
+			       struct mlx4_cmd_mailbox *inbox,
+			       struct mlx4_cmd_mailbox *outbox,
+			       struct mlx4_cmd_info *cmd)
+{
+	int err;
+	int index = vhcr->in_modifier & 0xffff;
+
+	err = get_res(dev, slave, index, RES_COUNTER, NULL);
+	if (err)
+		return err;
+
+	err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+	put_res(dev, slave, index, RES_COUNTER);
+	return err;
+}
+
+static void detach_qp(struct mlx4_dev *dev, int slave, struct res_qp *rqp)
+{
+	struct res_gid *rgid;
+	struct res_gid *tmp;
+	struct mlx4_qp qp; /* dummy for calling attach/detach */
+
+	list_for_each_entry_safe(rgid, tmp, &rqp->mcg_list, list) {
+		switch (dev->caps.steering_mode) {
+		case MLX4_STEERING_MODE_DEVICE_MANAGED:
+			mlx4_flow_detach(dev, rgid->reg_id);
+			break;
+		case MLX4_STEERING_MODE_B0:
+			qp.qpn = rqp->local_qpn;
+			(void) mlx4_qp_detach_common(dev, &qp, rgid->gid,
+						     rgid->prot, rgid->steer);
+			break;
+		}
+		list_del(&rgid->list);
+		kfree(rgid);
+	}
+}
+
+static int _move_all_busy(struct mlx4_dev *dev, int slave,
+			  enum mlx4_resource type, int print)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker =
+		&priv->mfunc.master.res_tracker;
+	struct list_head *rlist = &tracker->slave_list[slave].res_list[type];
+	struct res_common *r;
+	struct res_common *tmp;
+	int busy;
+
+	busy = 0;
+	spin_lock_irq(mlx4_tlock(dev));
+	list_for_each_entry_safe(r, tmp, rlist, list) {
+		if (r->owner == slave) {
+			if (!r->removing) {
+				if (r->state == RES_ANY_BUSY) {
+					if (print)
+						mlx4_dbg(dev,
+							 "%s id 0x%llx is busy\n",
+							  resource_str(type),
+							  r->res_id);
+					++busy;
+				} else {
+					r->from_state = r->state;
+					r->state = RES_ANY_BUSY;
+					r->removing = 1;
+				}
+			}
+		}
+	}
+	spin_unlock_irq(mlx4_tlock(dev));
+
+	return busy;
+}
+
+static int move_all_busy(struct mlx4_dev *dev, int slave,
+			 enum mlx4_resource type)
+{
+	unsigned long begin;
+	int busy;
+
+	begin = jiffies;
+	do {
+		busy = _move_all_busy(dev, slave, type, 0);
+		if (time_after(jiffies, begin + 5 * HZ))
+			break;
+		if (busy)
+			cond_resched();
+	} while (busy);
+
+	if (busy)
+		busy = _move_all_busy(dev, slave, type, 1);
+
+	return busy;
+}
+static void rem_slave_qps(struct mlx4_dev *dev, int slave)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct list_head *qp_list =
+		&tracker->slave_list[slave].res_list[RES_QP];
+	struct res_qp *qp;
+	struct res_qp *tmp;
+	int state;
+	u64 in_param;
+	int qpn;
+	int err;
+
+	err = move_all_busy(dev, slave, RES_QP);
+	if (err)
+		mlx4_warn(dev, "rem_slave_qps: Could not move all qps to busy for slave %d\n",
+			  slave);
+
+	spin_lock_irq(mlx4_tlock(dev));
+	list_for_each_entry_safe(qp, tmp, qp_list, com.list) {
+		spin_unlock_irq(mlx4_tlock(dev));
+		if (qp->com.owner == slave) {
+			qpn = qp->com.res_id;
+			detach_qp(dev, slave, qp);
+			state = qp->com.from_state;
+			while (state != 0) {
+				switch (state) {
+				case RES_QP_RESERVED:
+					spin_lock_irq(mlx4_tlock(dev));
+					rb_erase(&qp->com.node,
+						 &tracker->res_tree[RES_QP]);
+					list_del(&qp->com.list);
+					spin_unlock_irq(mlx4_tlock(dev));
+					if (!valid_reserved(dev, slave, qpn)) {
+						__mlx4_qp_release_range(dev, qpn, 1);
+						mlx4_release_resource(dev, slave,
+								      RES_QP, 1, 0);
+					}
+					kfree(qp);
+					state = 0;
+					break;
+				case RES_QP_MAPPED:
+					if (!valid_reserved(dev, slave, qpn))
+						__mlx4_qp_free_icm(dev, qpn);
+					state = RES_QP_RESERVED;
+					break;
+				case RES_QP_HW:
+					in_param = slave;
+					err = mlx4_cmd(dev, in_param,
+						       qp->local_qpn, 2,
+						       MLX4_CMD_2RST_QP,
+						       MLX4_CMD_TIME_CLASS_A,
+						       MLX4_CMD_NATIVE);
+					if (err)
+						mlx4_dbg(dev, "rem_slave_qps: failed to move slave %d qpn %d to reset\n",
+							 slave, qp->local_qpn);
+					atomic_dec(&qp->rcq->ref_count);
+					atomic_dec(&qp->scq->ref_count);
+					atomic_dec(&qp->mtt->ref_count);
+					if (qp->srq)
+						atomic_dec(&qp->srq->ref_count);
+					state = RES_QP_MAPPED;
+					break;
+				default:
+					state = 0;
+				}
+			}
+		}
+		spin_lock_irq(mlx4_tlock(dev));
+	}
+	spin_unlock_irq(mlx4_tlock(dev));
+}
+
+static void rem_slave_srqs(struct mlx4_dev *dev, int slave)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct list_head *srq_list =
+		&tracker->slave_list[slave].res_list[RES_SRQ];
+	struct res_srq *srq;
+	struct res_srq *tmp;
+	int state;
+	u64 in_param;
+	LIST_HEAD(tlist);
+	int srqn;
+	int err;
+
+	err = move_all_busy(dev, slave, RES_SRQ);
+	if (err)
+		mlx4_warn(dev, "rem_slave_srqs: Could not move all srqs - too busy for slave %d\n",
+			  slave);
+
+	spin_lock_irq(mlx4_tlock(dev));
+	list_for_each_entry_safe(srq, tmp, srq_list, com.list) {
+		spin_unlock_irq(mlx4_tlock(dev));
+		if (srq->com.owner == slave) {
+			srqn = srq->com.res_id;
+			state = srq->com.from_state;
+			while (state != 0) {
+				switch (state) {
+				case RES_SRQ_ALLOCATED:
+					__mlx4_srq_free_icm(dev, srqn);
+					spin_lock_irq(mlx4_tlock(dev));
+					rb_erase(&srq->com.node,
+						 &tracker->res_tree[RES_SRQ]);
+					list_del(&srq->com.list);
+					spin_unlock_irq(mlx4_tlock(dev));
+					mlx4_release_resource(dev, slave,
+							      RES_SRQ, 1, 0);
+					kfree(srq);
+					state = 0;
+					break;
+
+				case RES_SRQ_HW:
+					in_param = slave;
+					err = mlx4_cmd(dev, in_param, srqn, 1,
+						       MLX4_CMD_HW2SW_SRQ,
+						       MLX4_CMD_TIME_CLASS_A,
+						       MLX4_CMD_NATIVE);
+					if (err)
+						mlx4_dbg(dev, "rem_slave_srqs: failed to move slave %d srq %d to SW ownership\n",
+							 slave, srqn);
+
+					atomic_dec(&srq->mtt->ref_count);
+					if (srq->cq)
+						atomic_dec(&srq->cq->ref_count);
+					state = RES_SRQ_ALLOCATED;
+					break;
+
+				default:
+					state = 0;
+				}
+			}
+		}
+		spin_lock_irq(mlx4_tlock(dev));
+	}
+	spin_unlock_irq(mlx4_tlock(dev));
+}
+
+static void rem_slave_cqs(struct mlx4_dev *dev, int slave)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct list_head *cq_list =
+		&tracker->slave_list[slave].res_list[RES_CQ];
+	struct res_cq *cq;
+	struct res_cq *tmp;
+	int state;
+	u64 in_param;
+	LIST_HEAD(tlist);
+	int cqn;
+	int err;
+
+	err = move_all_busy(dev, slave, RES_CQ);
+	if (err)
+		mlx4_warn(dev, "rem_slave_cqs: Could not move all cqs - too busy for slave %d\n",
+			  slave);
+
+	spin_lock_irq(mlx4_tlock(dev));
+	list_for_each_entry_safe(cq, tmp, cq_list, com.list) {
+		spin_unlock_irq(mlx4_tlock(dev));
+		if (cq->com.owner == slave && !atomic_read(&cq->ref_count)) {
+			cqn = cq->com.res_id;
+			state = cq->com.from_state;
+			while (state != 0) {
+				switch (state) {
+				case RES_CQ_ALLOCATED:
+					__mlx4_cq_free_icm(dev, cqn);
+					spin_lock_irq(mlx4_tlock(dev));
+					rb_erase(&cq->com.node,
+						 &tracker->res_tree[RES_CQ]);
+					list_del(&cq->com.list);
+					spin_unlock_irq(mlx4_tlock(dev));
+					mlx4_release_resource(dev, slave,
+							      RES_CQ, 1, 0);
+					kfree(cq);
+					state = 0;
+					break;
+
+				case RES_CQ_HW:
+					in_param = slave;
+					err = mlx4_cmd(dev, in_param, cqn, 1,
+						       MLX4_CMD_HW2SW_CQ,
+						       MLX4_CMD_TIME_CLASS_A,
+						       MLX4_CMD_NATIVE);
+					if (err)
+						mlx4_dbg(dev, "rem_slave_cqs: failed to move slave %d cq %d to SW ownership\n",
+							 slave, cqn);
+					atomic_dec(&cq->mtt->ref_count);
+					state = RES_CQ_ALLOCATED;
+					break;
+
+				default:
+					state = 0;
+				}
+			}
+		}
+		spin_lock_irq(mlx4_tlock(dev));
+	}
+	spin_unlock_irq(mlx4_tlock(dev));
+}
+
+static void rem_slave_mrs(struct mlx4_dev *dev, int slave)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct list_head *mpt_list =
+		&tracker->slave_list[slave].res_list[RES_MPT];
+	struct res_mpt *mpt;
+	struct res_mpt *tmp;
+	int state;
+	u64 in_param;
+	LIST_HEAD(tlist);
+	int mptn;
+	int err;
+
+	err = move_all_busy(dev, slave, RES_MPT);
+	if (err)
+		mlx4_warn(dev, "rem_slave_mrs: Could not move all mpts - too busy for slave %d\n",
+			  slave);
+
+	spin_lock_irq(mlx4_tlock(dev));
+	list_for_each_entry_safe(mpt, tmp, mpt_list, com.list) {
+		spin_unlock_irq(mlx4_tlock(dev));
+		if (mpt->com.owner == slave) {
+			mptn = mpt->com.res_id;
+			state = mpt->com.from_state;
+			while (state != 0) {
+				switch (state) {
+				case RES_MPT_RESERVED:
+					__mlx4_mpt_release(dev, mpt->key);
+					spin_lock_irq(mlx4_tlock(dev));
+					rb_erase(&mpt->com.node,
+						 &tracker->res_tree[RES_MPT]);
+					list_del(&mpt->com.list);
+					spin_unlock_irq(mlx4_tlock(dev));
+					mlx4_release_resource(dev, slave,
+							      RES_MPT, 1, 0);
+					kfree(mpt);
+					state = 0;
+					break;
+
+				case RES_MPT_MAPPED:
+					__mlx4_mpt_free_icm(dev, mpt->key);
+					state = RES_MPT_RESERVED;
+					break;
+
+				case RES_MPT_HW:
+					in_param = slave;
+					err = mlx4_cmd(dev, in_param, mptn, 0,
+						     MLX4_CMD_HW2SW_MPT,
+						     MLX4_CMD_TIME_CLASS_A,
+						     MLX4_CMD_NATIVE);
+					if (err)
+						mlx4_dbg(dev, "rem_slave_mrs: failed to move slave %d mpt %d to SW ownership\n",
+							 slave, mptn);
+					if (mpt->mtt)
+						atomic_dec(&mpt->mtt->ref_count);
+					state = RES_MPT_MAPPED;
+					break;
+				default:
+					state = 0;
+				}
+			}
+		}
+		spin_lock_irq(mlx4_tlock(dev));
+	}
+	spin_unlock_irq(mlx4_tlock(dev));
+}
+
+static void rem_slave_mtts(struct mlx4_dev *dev, int slave)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker =
+		&priv->mfunc.master.res_tracker;
+	struct list_head *mtt_list =
+		&tracker->slave_list[slave].res_list[RES_MTT];
+	struct res_mtt *mtt;
+	struct res_mtt *tmp;
+	int state;
+	LIST_HEAD(tlist);
+	int base;
+	int err;
+
+	err = move_all_busy(dev, slave, RES_MTT);
+	if (err)
+		mlx4_warn(dev, "rem_slave_mtts: Could not move all mtts  - too busy for slave %d\n",
+			  slave);
+
+	spin_lock_irq(mlx4_tlock(dev));
+	list_for_each_entry_safe(mtt, tmp, mtt_list, com.list) {
+		spin_unlock_irq(mlx4_tlock(dev));
+		if (mtt->com.owner == slave) {
+			base = mtt->com.res_id;
+			state = mtt->com.from_state;
+			while (state != 0) {
+				switch (state) {
+				case RES_MTT_ALLOCATED:
+					__mlx4_free_mtt_range(dev, base,
+							      mtt->order);
+					spin_lock_irq(mlx4_tlock(dev));
+					rb_erase(&mtt->com.node,
+						 &tracker->res_tree[RES_MTT]);
+					list_del(&mtt->com.list);
+					spin_unlock_irq(mlx4_tlock(dev));
+					mlx4_release_resource(dev, slave, RES_MTT,
+							      1 << mtt->order, 0);
+					kfree(mtt);
+					state = 0;
+					break;
+
+				default:
+					state = 0;
+				}
+			}
+		}
+		spin_lock_irq(mlx4_tlock(dev));
+	}
+	spin_unlock_irq(mlx4_tlock(dev));
+}
+
+static void rem_slave_fs_rule(struct mlx4_dev *dev, int slave)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker =
+		&priv->mfunc.master.res_tracker;
+	struct list_head *fs_rule_list =
+		&tracker->slave_list[slave].res_list[RES_FS_RULE];
+	struct res_fs_rule *fs_rule;
+	struct res_fs_rule *tmp;
+	int state;
+	u64 base;
+	int err;
+
+	err = move_all_busy(dev, slave, RES_FS_RULE);
+	if (err)
+		mlx4_warn(dev, "rem_slave_fs_rule: Could not move all mtts to busy for slave %d\n",
+			  slave);
+
+	spin_lock_irq(mlx4_tlock(dev));
+	list_for_each_entry_safe(fs_rule, tmp, fs_rule_list, com.list) {
+		spin_unlock_irq(mlx4_tlock(dev));
+		if (fs_rule->com.owner == slave) {
+			base = fs_rule->com.res_id;
+			state = fs_rule->com.from_state;
+			while (state != 0) {
+				switch (state) {
+				case RES_FS_RULE_ALLOCATED:
+					/* detach rule */
+					err = mlx4_cmd(dev, base, 0, 0,
+						       MLX4_QP_FLOW_STEERING_DETACH,
+						       MLX4_CMD_TIME_CLASS_A,
+						       MLX4_CMD_NATIVE);
+
+					spin_lock_irq(mlx4_tlock(dev));
+					rb_erase(&fs_rule->com.node,
+						 &tracker->res_tree[RES_FS_RULE]);
+					list_del(&fs_rule->com.list);
+					spin_unlock_irq(mlx4_tlock(dev));
+					kfree(fs_rule);
+					state = 0;
+					break;
+
+				default:
+					state = 0;
+				}
+			}
+		}
+		spin_lock_irq(mlx4_tlock(dev));
+	}
+	spin_unlock_irq(mlx4_tlock(dev));
+}
+
+static void rem_slave_eqs(struct mlx4_dev *dev, int slave)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct list_head *eq_list =
+		&tracker->slave_list[slave].res_list[RES_EQ];
+	struct res_eq *eq;
+	struct res_eq *tmp;
+	int err;
+	int state;
+	LIST_HEAD(tlist);
+	int eqn;
+	struct mlx4_cmd_mailbox *mailbox;
+
+	err = move_all_busy(dev, slave, RES_EQ);
+	if (err)
+		mlx4_warn(dev, "rem_slave_eqs: Could not move all eqs - too busy for slave %d\n",
+			  slave);
+
+	spin_lock_irq(mlx4_tlock(dev));
+	list_for_each_entry_safe(eq, tmp, eq_list, com.list) {
+		spin_unlock_irq(mlx4_tlock(dev));
+		if (eq->com.owner == slave) {
+			eqn = eq->com.res_id;
+			state = eq->com.from_state;
+			while (state != 0) {
+				switch (state) {
+				case RES_EQ_RESERVED:
+					spin_lock_irq(mlx4_tlock(dev));
+					rb_erase(&eq->com.node,
+						 &tracker->res_tree[RES_EQ]);
+					list_del(&eq->com.list);
+					spin_unlock_irq(mlx4_tlock(dev));
+					kfree(eq);
+					state = 0;
+					break;
+
+				case RES_EQ_HW:
+					mailbox = mlx4_alloc_cmd_mailbox(dev);
+					if (IS_ERR(mailbox)) {
+						cond_resched();
+						continue;
+					}
+					err = mlx4_cmd_box(dev, slave, 0,
+							   eqn & 0xff, 0,
+							   MLX4_CMD_HW2SW_EQ,
+							   MLX4_CMD_TIME_CLASS_A,
+							   MLX4_CMD_NATIVE);
+					if (err)
+						mlx4_dbg(dev, "rem_slave_eqs: failed to move slave %d eqs %d to SW ownership\n",
+							 slave, eqn);
+					mlx4_free_cmd_mailbox(dev, mailbox);
+					atomic_dec(&eq->mtt->ref_count);
+					state = RES_EQ_RESERVED;
+					break;
+
+				default:
+					state = 0;
+				}
+			}
+		}
+		spin_lock_irq(mlx4_tlock(dev));
+	}
+	spin_unlock_irq(mlx4_tlock(dev));
+}
+
+static void rem_slave_counters(struct mlx4_dev *dev, int slave)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct list_head *counter_list =
+		&tracker->slave_list[slave].res_list[RES_COUNTER];
+	struct res_counter *counter;
+	struct res_counter *tmp;
+	int err;
+	int index;
+
+	err = move_all_busy(dev, slave, RES_COUNTER);
+	if (err)
+		mlx4_warn(dev, "rem_slave_counters: Could not move all counters - too busy for slave %d\n",
+			  slave);
+
+	spin_lock_irq(mlx4_tlock(dev));
+	list_for_each_entry_safe(counter, tmp, counter_list, com.list) {
+		if (counter->com.owner == slave) {
+			index = counter->com.res_id;
+			rb_erase(&counter->com.node,
+				 &tracker->res_tree[RES_COUNTER]);
+			list_del(&counter->com.list);
+			kfree(counter);
+			__mlx4_counter_free(dev, index);
+			mlx4_release_resource(dev, slave, RES_COUNTER, 1, 0);
+		}
+	}
+	spin_unlock_irq(mlx4_tlock(dev));
+}
+
+static void rem_slave_xrcdns(struct mlx4_dev *dev, int slave)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct list_head *xrcdn_list =
+		&tracker->slave_list[slave].res_list[RES_XRCD];
+	struct res_xrcdn *xrcd;
+	struct res_xrcdn *tmp;
+	int err;
+	int xrcdn;
+
+	err = move_all_busy(dev, slave, RES_XRCD);
+	if (err)
+		mlx4_warn(dev, "rem_slave_xrcdns: Could not move all xrcdns - too busy for slave %d\n",
+			  slave);
+
+	spin_lock_irq(mlx4_tlock(dev));
+	list_for_each_entry_safe(xrcd, tmp, xrcdn_list, com.list) {
+		if (xrcd->com.owner == slave) {
+			xrcdn = xrcd->com.res_id;
+			rb_erase(&xrcd->com.node, &tracker->res_tree[RES_XRCD]);
+			list_del(&xrcd->com.list);
+			kfree(xrcd);
+			__mlx4_xrcd_free(dev, xrcdn);
+		}
+	}
+	spin_unlock_irq(mlx4_tlock(dev));
+}
+
+void mlx4_delete_all_resources_for_slave(struct mlx4_dev *dev, int slave)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	mlx4_reset_roce_gids(dev, slave);
+	mutex_lock(&priv->mfunc.master.res_tracker.slave_list[slave].mutex);
+	rem_slave_vlans(dev, slave);
+	rem_slave_macs(dev, slave);
+	rem_slave_fs_rule(dev, slave);
+	rem_slave_qps(dev, slave);
+	rem_slave_srqs(dev, slave);
+	rem_slave_cqs(dev, slave);
+	rem_slave_mrs(dev, slave);
+	rem_slave_eqs(dev, slave);
+	rem_slave_mtts(dev, slave);
+	rem_slave_counters(dev, slave);
+	rem_slave_xrcdns(dev, slave);
+	mutex_unlock(&priv->mfunc.master.res_tracker.slave_list[slave].mutex);
+}
+
+void mlx4_vf_immed_vlan_work_handler(struct work_struct *_work)
+{
+	struct mlx4_vf_immed_vlan_work *work =
+		container_of(_work, struct mlx4_vf_immed_vlan_work, work);
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_update_qp_context *upd_context;
+	struct mlx4_dev *dev = &work->priv->dev;
+	struct mlx4_resource_tracker *tracker =
+		&work->priv->mfunc.master.res_tracker;
+	struct list_head *qp_list =
+		&tracker->slave_list[work->slave].res_list[RES_QP];
+	struct res_qp *qp;
+	struct res_qp *tmp;
+	u64 qp_path_mask_vlan_ctrl =
+		       ((1ULL << MLX4_UPD_QP_PATH_MASK_ETH_TX_BLOCK_UNTAGGED) |
+		       (1ULL << MLX4_UPD_QP_PATH_MASK_ETH_TX_BLOCK_1P) |
+		       (1ULL << MLX4_UPD_QP_PATH_MASK_ETH_TX_BLOCK_TAGGED) |
+		       (1ULL << MLX4_UPD_QP_PATH_MASK_ETH_RX_BLOCK_UNTAGGED) |
+		       (1ULL << MLX4_UPD_QP_PATH_MASK_ETH_RX_BLOCK_1P) |
+		       (1ULL << MLX4_UPD_QP_PATH_MASK_ETH_RX_BLOCK_TAGGED));
+
+	u64 qp_path_mask = ((1ULL << MLX4_UPD_QP_PATH_MASK_VLAN_INDEX) |
+		       (1ULL << MLX4_UPD_QP_PATH_MASK_FVL) |
+		       (1ULL << MLX4_UPD_QP_PATH_MASK_CV) |
+		       (1ULL << MLX4_UPD_QP_PATH_MASK_ETH_HIDE_CQE_VLAN) |
+		       (1ULL << MLX4_UPD_QP_PATH_MASK_FEUP) |
+		       (1ULL << MLX4_UPD_QP_PATH_MASK_FVL_RX) |
+		       (1ULL << MLX4_UPD_QP_PATH_MASK_SCHED_QUEUE));
+
+	int err;
+	int port, errors = 0;
+	u8 vlan_control;
+
+	if (mlx4_is_slave(dev)) {
+		mlx4_warn(dev, "Trying to update-qp in slave %d\n",
+			  work->slave);
+		goto out;
+	}
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		goto out;
+	if (work->flags & MLX4_VF_IMMED_VLAN_FLAG_LINK_DISABLE) /* block all */
+		vlan_control = MLX4_VLAN_CTRL_ETH_TX_BLOCK_TAGGED |
+			MLX4_VLAN_CTRL_ETH_TX_BLOCK_PRIO_TAGGED |
+			MLX4_VLAN_CTRL_ETH_TX_BLOCK_UNTAGGED |
+			MLX4_VLAN_CTRL_ETH_RX_BLOCK_PRIO_TAGGED |
+			MLX4_VLAN_CTRL_ETH_RX_BLOCK_UNTAGGED |
+			MLX4_VLAN_CTRL_ETH_RX_BLOCK_TAGGED;
+	else if (!work->vlan_id)
+		vlan_control = MLX4_VLAN_CTRL_ETH_TX_BLOCK_TAGGED |
+			MLX4_VLAN_CTRL_ETH_RX_BLOCK_TAGGED;
+	else
+		vlan_control = MLX4_VLAN_CTRL_ETH_TX_BLOCK_TAGGED |
+			MLX4_VLAN_CTRL_ETH_RX_BLOCK_PRIO_TAGGED |
+			MLX4_VLAN_CTRL_ETH_RX_BLOCK_UNTAGGED;
+
+	upd_context = mailbox->buf;
+	upd_context->qp_mask = cpu_to_be64(1ULL << MLX4_UPD_QP_MASK_VSD);
+
+	spin_lock_irq(mlx4_tlock(dev));
+	list_for_each_entry_safe(qp, tmp, qp_list, com.list) {
+		spin_unlock_irq(mlx4_tlock(dev));
+		if (qp->com.owner == work->slave) {
+			if (qp->com.from_state != RES_QP_HW ||
+			    !qp->sched_queue ||  /* no INIT2RTR trans yet */
+			    mlx4_is_qp_reserved(dev, qp->local_qpn) ||
+			    qp->qpc_flags & (1 << MLX4_RSS_QPC_FLAG_OFFSET)) {
+				spin_lock_irq(mlx4_tlock(dev));
+				continue;
+			}
+			port = (qp->sched_queue >> 6 & 1) + 1;
+			if (port != work->port) {
+				spin_lock_irq(mlx4_tlock(dev));
+				continue;
+			}
+			if (MLX4_QP_ST_RC == ((qp->qpc_flags >> 16) & 0xff))
+				upd_context->primary_addr_path_mask = cpu_to_be64(qp_path_mask);
+			else
+				upd_context->primary_addr_path_mask =
+					cpu_to_be64(qp_path_mask | qp_path_mask_vlan_ctrl);
+			if (work->vlan_id == MLX4_VGT) {
+				upd_context->qp_context.param3 = qp->param3;
+				upd_context->qp_context.pri_path.vlan_control = qp->vlan_control;
+				upd_context->qp_context.pri_path.fvl_rx = qp->fvl_rx;
+				upd_context->qp_context.pri_path.vlan_index = qp->vlan_index;
+				upd_context->qp_context.pri_path.fl = qp->pri_path_fl;
+				upd_context->qp_context.pri_path.feup = qp->feup;
+				upd_context->qp_context.pri_path.sched_queue =
+					qp->sched_queue;
+			} else {
+				upd_context->qp_context.param3 = qp->param3 & ~cpu_to_be32(MLX4_STRIP_VLAN);
+				upd_context->qp_context.pri_path.vlan_control = vlan_control;
+				upd_context->qp_context.pri_path.vlan_index = work->vlan_ix;
+				upd_context->qp_context.pri_path.fvl_rx =
+					qp->fvl_rx | MLX4_FVL_RX_FORCE_ETH_VLAN;
+				upd_context->qp_context.pri_path.fl =
+					qp->pri_path_fl | MLX4_FL_CV | MLX4_FL_ETH_HIDE_CQE_VLAN;
+				upd_context->qp_context.pri_path.feup =
+					qp->feup | MLX4_FEUP_FORCE_ETH_UP | MLX4_FVL_FORCE_ETH_VLAN;
+				upd_context->qp_context.pri_path.sched_queue =
+					qp->sched_queue & 0xC7;
+				upd_context->qp_context.pri_path.sched_queue |=
+					((work->qos & 0x7) << 3);
+			}
+
+			err = mlx4_cmd(dev, mailbox->dma,
+				       qp->local_qpn & 0xffffff,
+				       0, MLX4_CMD_UPDATE_QP,
+				       MLX4_CMD_TIME_CLASS_C, MLX4_CMD_NATIVE);
+			if (err) {
+				mlx4_info(dev, "UPDATE_QP failed for slave %d, port %d, qpn %d (%d)\n",
+					  work->slave, port, qp->local_qpn, err);
+				errors++;
+			}
+		}
+		spin_lock_irq(mlx4_tlock(dev));
+	}
+	spin_unlock_irq(mlx4_tlock(dev));
+	mlx4_free_cmd_mailbox(dev, mailbox);
+
+	if (errors)
+		mlx4_err(dev, "%d UPDATE_QP failures for slave %d, port %d\n",
+			 errors, work->slave, work->port);
+
+	/* unregister previous vlan_id if needed and we had no errors
+	 * while updating the QPs
+	 */
+	if (work->flags & MLX4_VF_IMMED_VLAN_FLAG_VLAN && !errors &&
+	    NO_INDX != work->orig_vlan_ix)
+		__mlx4_unregister_vlan(&work->priv->dev, work->port,
+				       work->orig_vlan_id);
+out:
+	kfree(work);
+	return;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx4/sense.c b/drivers/net/ethernet/mellanox/mlx4/sense.c
new file mode 100644
index 0000000..094773d
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/sense.c
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/errno.h>
+#include <linux/if_ether.h>
+
+#include <linux/mlx4/cmd.h>
+
+#include "mlx4.h"
+
+int mlx4_SENSE_PORT(struct mlx4_dev *dev, int port,
+		    enum mlx4_port_type *type)
+{
+	u64 out_param;
+	int err = 0;
+
+	err = mlx4_cmd_imm(dev, 0, &out_param, port, 0,
+			   MLX4_CMD_SENSE_PORT, MLX4_CMD_TIME_CLASS_B,
+			   MLX4_CMD_WRAPPED);
+	if (err) {
+		mlx4_err(dev, "Sense command failed for port: %d\n", port);
+		return err;
+	}
+
+	if (out_param > 2) {
+		mlx4_err(dev, "Sense returned illegal value: 0x%llx\n", out_param);
+		return -EINVAL;
+	}
+
+	*type = out_param;
+	return 0;
+}
+
+void mlx4_do_sense_ports(struct mlx4_dev *dev,
+			 enum mlx4_port_type *stype,
+			 enum mlx4_port_type *defaults)
+{
+	struct mlx4_sense *sense = &mlx4_priv(dev)->sense;
+	int err;
+	int i;
+
+	for (i = 1; i <= dev->caps.num_ports; i++) {
+		stype[i - 1] = 0;
+		if (sense->do_sense_port[i] && sense->sense_allowed[i] &&
+		    dev->caps.possible_type[i] == MLX4_PORT_TYPE_AUTO) {
+			err = mlx4_SENSE_PORT(dev, i, &stype[i - 1]);
+			if (err)
+				stype[i - 1] = defaults[i - 1];
+		} else
+			stype[i - 1] = defaults[i - 1];
+	}
+
+	/*
+	 * If sensed nothing, remain in current configuration.
+	 */
+	for (i = 0; i < dev->caps.num_ports; i++)
+		stype[i] = stype[i] ? stype[i] : defaults[i];
+
+}
+
+static void mlx4_sense_port(struct work_struct *work)
+{
+	struct delayed_work *delay = to_delayed_work(work);
+	struct mlx4_sense *sense = container_of(delay, struct mlx4_sense,
+						sense_poll);
+	struct mlx4_dev *dev = sense->dev;
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	enum mlx4_port_type stype[MLX4_MAX_PORTS];
+
+	mutex_lock(&priv->port_mutex);
+	mlx4_do_sense_ports(dev, stype, &dev->caps.port_type[1]);
+
+	if (mlx4_check_port_params(dev, stype))
+		goto sense_again;
+
+	if (mlx4_change_port_types(dev, stype))
+		mlx4_err(dev, "Failed to change port_types\n");
+
+sense_again:
+	mutex_unlock(&priv->port_mutex);
+	queue_delayed_work(mlx4_wq , &sense->sense_poll,
+			   round_jiffies_relative(MLX4_SENSE_RANGE));
+}
+
+void mlx4_start_sense(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_sense *sense = &priv->sense;
+
+	if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_DPDP))
+		return;
+
+	queue_delayed_work(mlx4_wq , &sense->sense_poll,
+			   round_jiffies_relative(MLX4_SENSE_RANGE));
+}
+
+void mlx4_stop_sense(struct mlx4_dev *dev)
+{
+	cancel_delayed_work_sync(&mlx4_priv(dev)->sense.sense_poll);
+}
+
+void  mlx4_sense_init(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_sense *sense = &priv->sense;
+	int port;
+
+	sense->dev = dev;
+	for (port = 1; port <= dev->caps.num_ports; port++)
+		sense->do_sense_port[port] = 1;
+
+	INIT_DEFERRABLE_WORK(&sense->sense_poll, mlx4_sense_port);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx4/srq.c b/drivers/net/ethernet/mellanox/mlx4/srq.c
new file mode 100644
index 0000000..6714662
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/srq.c
@@ -0,0 +1,313 @@
+/*
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include <linux/mlx4/cmd.h>
+#include <linux/mlx4/srq.h>
+#include <linux/export.h>
+#include <linux/gfp.h>
+
+#include "mlx4.h"
+#include "icm.h"
+
+void mlx4_srq_event(struct mlx4_dev *dev, u32 srqn, int event_type)
+{
+	struct mlx4_srq_table *srq_table = &mlx4_priv(dev)->srq_table;
+	struct mlx4_srq *srq;
+
+	spin_lock(&srq_table->lock);
+
+	srq = radix_tree_lookup(&srq_table->tree, srqn & (dev->caps.num_srqs - 1));
+	if (srq)
+		atomic_inc(&srq->refcount);
+
+	spin_unlock(&srq_table->lock);
+
+	if (!srq) {
+		mlx4_warn(dev, "Async event for bogus SRQ %08x\n", srqn);
+		return;
+	}
+
+	srq->event(srq, event_type);
+
+	if (atomic_dec_and_test(&srq->refcount))
+		complete(&srq->free);
+}
+
+static int mlx4_SW2HW_SRQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox,
+			  int srq_num)
+{
+	return mlx4_cmd(dev, mailbox->dma, srq_num, 0,
+			MLX4_CMD_SW2HW_SRQ, MLX4_CMD_TIME_CLASS_A,
+			MLX4_CMD_WRAPPED);
+}
+
+static int mlx4_HW2SW_SRQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox,
+			  int srq_num)
+{
+	return mlx4_cmd_box(dev, 0, mailbox ? mailbox->dma : 0, srq_num,
+			    mailbox ? 0 : 1, MLX4_CMD_HW2SW_SRQ,
+			    MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED);
+}
+
+static int mlx4_ARM_SRQ(struct mlx4_dev *dev, int srq_num, int limit_watermark)
+{
+	return mlx4_cmd(dev, limit_watermark, srq_num, 0, MLX4_CMD_ARM_SRQ,
+			MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED);
+}
+
+static int mlx4_QUERY_SRQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox,
+			  int srq_num)
+{
+	return mlx4_cmd_box(dev, 0, mailbox->dma, srq_num, 0, MLX4_CMD_QUERY_SRQ,
+			    MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED);
+}
+
+int __mlx4_srq_alloc_icm(struct mlx4_dev *dev, int *srqn)
+{
+	struct mlx4_srq_table *srq_table = &mlx4_priv(dev)->srq_table;
+	int err;
+
+
+	*srqn = mlx4_bitmap_alloc(&srq_table->bitmap);
+	if (*srqn == -1)
+		return -ENOMEM;
+
+	err = mlx4_table_get(dev, &srq_table->table, *srqn, GFP_KERNEL);
+	if (err)
+		goto err_out;
+
+	err = mlx4_table_get(dev, &srq_table->cmpt_table, *srqn, GFP_KERNEL);
+	if (err)
+		goto err_put;
+	return 0;
+
+err_put:
+	mlx4_table_put(dev, &srq_table->table, *srqn);
+
+err_out:
+	mlx4_bitmap_free(&srq_table->bitmap, *srqn, MLX4_NO_RR);
+	return err;
+}
+
+static int mlx4_srq_alloc_icm(struct mlx4_dev *dev, int *srqn)
+{
+	u64 out_param;
+	int err;
+
+	if (mlx4_is_mfunc(dev)) {
+		err = mlx4_cmd_imm(dev, 0, &out_param, RES_SRQ,
+				   RES_OP_RESERVE_AND_MAP,
+				   MLX4_CMD_ALLOC_RES,
+				   MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED);
+		if (!err)
+			*srqn = get_param_l(&out_param);
+
+		return err;
+	}
+	return __mlx4_srq_alloc_icm(dev, srqn);
+}
+
+void __mlx4_srq_free_icm(struct mlx4_dev *dev, int srqn)
+{
+	struct mlx4_srq_table *srq_table = &mlx4_priv(dev)->srq_table;
+
+	mlx4_table_put(dev, &srq_table->cmpt_table, srqn);
+	mlx4_table_put(dev, &srq_table->table, srqn);
+	mlx4_bitmap_free(&srq_table->bitmap, srqn, MLX4_NO_RR);
+}
+
+static void mlx4_srq_free_icm(struct mlx4_dev *dev, int srqn)
+{
+	u64 in_param = 0;
+
+	if (mlx4_is_mfunc(dev)) {
+		set_param_l(&in_param, srqn);
+		if (mlx4_cmd(dev, in_param, RES_SRQ, RES_OP_RESERVE_AND_MAP,
+			     MLX4_CMD_FREE_RES,
+			     MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED))
+			mlx4_warn(dev, "Failed freeing cq:%d\n", srqn);
+		return;
+	}
+	__mlx4_srq_free_icm(dev, srqn);
+}
+
+int mlx4_srq_alloc(struct mlx4_dev *dev, u32 pdn, u32 cqn, u16 xrcd,
+		   struct mlx4_mtt *mtt, u64 db_rec, struct mlx4_srq *srq)
+{
+	struct mlx4_srq_table *srq_table = &mlx4_priv(dev)->srq_table;
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_srq_context *srq_context;
+	u64 mtt_addr;
+	int err;
+
+	err = mlx4_srq_alloc_icm(dev, &srq->srqn);
+	if (err)
+		return err;
+
+	spin_lock_irq(&srq_table->lock);
+	err = radix_tree_insert(&srq_table->tree, srq->srqn, srq);
+	spin_unlock_irq(&srq_table->lock);
+	if (err)
+		goto err_icm;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox)) {
+		err = PTR_ERR(mailbox);
+		goto err_radix;
+	}
+
+	srq_context = mailbox->buf;
+	srq_context->state_logsize_srqn = cpu_to_be32((ilog2(srq->max) << 24) |
+						      srq->srqn);
+	srq_context->logstride          = srq->wqe_shift - 4;
+	srq_context->xrcd		= cpu_to_be16(xrcd);
+	srq_context->pg_offset_cqn	= cpu_to_be32(cqn & 0xffffff);
+	srq_context->log_page_size      = mtt->page_shift - MLX4_ICM_PAGE_SHIFT;
+
+	mtt_addr = mlx4_mtt_addr(dev, mtt);
+	srq_context->mtt_base_addr_h    = mtt_addr >> 32;
+	srq_context->mtt_base_addr_l    = cpu_to_be32(mtt_addr & 0xffffffff);
+	srq_context->pd			= cpu_to_be32(pdn);
+	srq_context->db_rec_addr        = cpu_to_be64(db_rec);
+
+	err = mlx4_SW2HW_SRQ(dev, mailbox, srq->srqn);
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	if (err)
+		goto err_radix;
+
+	atomic_set(&srq->refcount, 1);
+	init_completion(&srq->free);
+
+	return 0;
+
+err_radix:
+	spin_lock_irq(&srq_table->lock);
+	radix_tree_delete(&srq_table->tree, srq->srqn);
+	spin_unlock_irq(&srq_table->lock);
+
+err_icm:
+	mlx4_srq_free_icm(dev, srq->srqn);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_srq_alloc);
+
+void mlx4_srq_free(struct mlx4_dev *dev, struct mlx4_srq *srq)
+{
+	struct mlx4_srq_table *srq_table = &mlx4_priv(dev)->srq_table;
+	int err;
+
+	err = mlx4_HW2SW_SRQ(dev, NULL, srq->srqn);
+	if (err)
+		mlx4_warn(dev, "HW2SW_SRQ failed (%d) for SRQN %06x\n", err, srq->srqn);
+
+	spin_lock_irq(&srq_table->lock);
+	radix_tree_delete(&srq_table->tree, srq->srqn);
+	spin_unlock_irq(&srq_table->lock);
+
+	if (atomic_dec_and_test(&srq->refcount))
+		complete(&srq->free);
+	wait_for_completion(&srq->free);
+
+	mlx4_srq_free_icm(dev, srq->srqn);
+}
+EXPORT_SYMBOL_GPL(mlx4_srq_free);
+
+int mlx4_srq_arm(struct mlx4_dev *dev, struct mlx4_srq *srq, int limit_watermark)
+{
+	return mlx4_ARM_SRQ(dev, srq->srqn, limit_watermark);
+}
+EXPORT_SYMBOL_GPL(mlx4_srq_arm);
+
+int mlx4_srq_query(struct mlx4_dev *dev, struct mlx4_srq *srq, int *limit_watermark)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_srq_context *srq_context;
+	int err;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	srq_context = mailbox->buf;
+
+	err = mlx4_QUERY_SRQ(dev, mailbox, srq->srqn);
+	if (err)
+		goto err_out;
+	*limit_watermark = be16_to_cpu(srq_context->limit_watermark);
+
+err_out:
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_srq_query);
+
+int mlx4_init_srq_table(struct mlx4_dev *dev)
+{
+	struct mlx4_srq_table *srq_table = &mlx4_priv(dev)->srq_table;
+	int err;
+
+	spin_lock_init(&srq_table->lock);
+	INIT_RADIX_TREE(&srq_table->tree, GFP_ATOMIC);
+	if (mlx4_is_slave(dev))
+		return 0;
+
+	err = mlx4_bitmap_init(&srq_table->bitmap, dev->caps.num_srqs,
+			       dev->caps.num_srqs - 1, dev->caps.reserved_srqs, 0);
+	if (err)
+		return err;
+
+	return 0;
+}
+
+void mlx4_cleanup_srq_table(struct mlx4_dev *dev)
+{
+	if (mlx4_is_slave(dev))
+		return;
+	mlx4_bitmap_cleanup(&mlx4_priv(dev)->srq_table.bitmap);
+}
+
+struct mlx4_srq *mlx4_srq_lookup(struct mlx4_dev *dev, u32 srqn)
+{
+	struct mlx4_srq_table *srq_table = &mlx4_priv(dev)->srq_table;
+	struct mlx4_srq *srq;
+	unsigned long flags;
+
+	spin_lock_irqsave(&srq_table->lock, flags);
+	srq = radix_tree_lookup(&srq_table->tree,
+				srqn & (dev->caps.num_srqs - 1));
+	spin_unlock_irqrestore(&srq_table->lock, flags);
+
+	return srq;
+}
+EXPORT_SYMBOL_GPL(mlx4_srq_lookup);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
new file mode 100644
index 0000000..8ff57e8
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
@@ -0,0 +1,8 @@
+#
+# Mellanox driver configuration
+#
+
+config MLX5_CORE
+	tristate
+	depends on PCI
+	default n
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
new file mode 100644
index 0000000..105780b
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -0,0 +1,5 @@
+obj-$(CONFIG_MLX5_CORE)		+= mlx5_core.o
+
+mlx5_core-y :=	main.o cmd.o debugfs.o fw.o eq.o uar.o pagealloc.o \
+		health.o mcg.o cq.o srq.o alloc.o qp.o port.o mr.o pd.o   \
+		mad.o
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/alloc.c b/drivers/net/ethernet/mellanox/mlx5/core/alloc.c
new file mode 100644
index 0000000..56779c1
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/alloc.c
@@ -0,0 +1,238 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/export.h>
+#include <linux/bitmap.h>
+#include <linux/dma-mapping.h>
+#include <linux/vmalloc.h>
+#include <linux/mlx5/driver.h>
+
+#include "mlx5_core.h"
+
+/* Handling for queue buffers -- we allocate a bunch of memory and
+ * register it in a memory region at HCA virtual address 0.  If the
+ * requested size is > max_direct, we split the allocation into
+ * multiple pages, so we don't require too much contiguous memory.
+ */
+
+int mlx5_buf_alloc(struct mlx5_core_dev *dev, int size, int max_direct,
+		   struct mlx5_buf *buf)
+{
+	dma_addr_t t;
+
+	buf->size = size;
+	if (size <= max_direct) {
+		buf->nbufs        = 1;
+		buf->npages       = 1;
+		buf->page_shift   = (u8)get_order(size) + PAGE_SHIFT;
+		buf->direct.buf   = dma_zalloc_coherent(&dev->pdev->dev,
+							size, &t, GFP_KERNEL);
+		if (!buf->direct.buf)
+			return -ENOMEM;
+
+		buf->direct.map = t;
+
+		while (t & ((1 << buf->page_shift) - 1)) {
+			--buf->page_shift;
+			buf->npages *= 2;
+		}
+	} else {
+		int i;
+
+		buf->direct.buf  = NULL;
+		buf->nbufs       = (size + PAGE_SIZE - 1) / PAGE_SIZE;
+		buf->npages      = buf->nbufs;
+		buf->page_shift  = PAGE_SHIFT;
+		buf->page_list   = kcalloc(buf->nbufs, sizeof(*buf->page_list),
+					   GFP_KERNEL);
+		if (!buf->page_list)
+			return -ENOMEM;
+
+		for (i = 0; i < buf->nbufs; i++) {
+			buf->page_list[i].buf =
+				dma_zalloc_coherent(&dev->pdev->dev, PAGE_SIZE,
+						    &t, GFP_KERNEL);
+			if (!buf->page_list[i].buf)
+				goto err_free;
+
+			buf->page_list[i].map = t;
+		}
+
+		if (BITS_PER_LONG == 64) {
+			struct page **pages;
+			pages = kmalloc(sizeof(*pages) * buf->nbufs, GFP_KERNEL);
+			if (!pages)
+				goto err_free;
+			for (i = 0; i < buf->nbufs; i++)
+				pages[i] = virt_to_page(buf->page_list[i].buf);
+			buf->direct.buf = vmap(pages, buf->nbufs, VM_MAP, PAGE_KERNEL);
+			kfree(pages);
+			if (!buf->direct.buf)
+				goto err_free;
+		}
+	}
+
+	return 0;
+
+err_free:
+	mlx5_buf_free(dev, buf);
+
+	return -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(mlx5_buf_alloc);
+
+void mlx5_buf_free(struct mlx5_core_dev *dev, struct mlx5_buf *buf)
+{
+	int i;
+
+	if (buf->nbufs == 1)
+		dma_free_coherent(&dev->pdev->dev, buf->size, buf->direct.buf,
+				  buf->direct.map);
+	else {
+		if (BITS_PER_LONG == 64 && buf->direct.buf)
+			vunmap(buf->direct.buf);
+
+		for (i = 0; i < buf->nbufs; i++)
+			if (buf->page_list[i].buf)
+				dma_free_coherent(&dev->pdev->dev, PAGE_SIZE,
+						  buf->page_list[i].buf,
+						  buf->page_list[i].map);
+		kfree(buf->page_list);
+	}
+}
+EXPORT_SYMBOL_GPL(mlx5_buf_free);
+
+static struct mlx5_db_pgdir *mlx5_alloc_db_pgdir(struct device *dma_device)
+{
+	struct mlx5_db_pgdir *pgdir;
+
+	pgdir = kzalloc(sizeof(*pgdir), GFP_KERNEL);
+	if (!pgdir)
+		return NULL;
+
+	bitmap_fill(pgdir->bitmap, MLX5_DB_PER_PAGE);
+	pgdir->db_page = dma_alloc_coherent(dma_device, PAGE_SIZE,
+					    &pgdir->db_dma, GFP_KERNEL);
+	if (!pgdir->db_page) {
+		kfree(pgdir);
+		return NULL;
+	}
+
+	return pgdir;
+}
+
+static int mlx5_alloc_db_from_pgdir(struct mlx5_db_pgdir *pgdir,
+				    struct mlx5_db *db)
+{
+	int offset;
+	int i;
+
+	i = find_first_bit(pgdir->bitmap, MLX5_DB_PER_PAGE);
+	if (i >= MLX5_DB_PER_PAGE)
+		return -ENOMEM;
+
+	__clear_bit(i, pgdir->bitmap);
+
+	db->u.pgdir = pgdir;
+	db->index   = i;
+	offset = db->index * L1_CACHE_BYTES;
+	db->db      = pgdir->db_page + offset / sizeof(*pgdir->db_page);
+	db->dma     = pgdir->db_dma  + offset;
+
+	return 0;
+}
+
+int mlx5_db_alloc(struct mlx5_core_dev *dev, struct mlx5_db *db)
+{
+	struct mlx5_db_pgdir *pgdir;
+	int ret = 0;
+
+	mutex_lock(&dev->priv.pgdir_mutex);
+
+	list_for_each_entry(pgdir, &dev->priv.pgdir_list, list)
+		if (!mlx5_alloc_db_from_pgdir(pgdir, db))
+			goto out;
+
+	pgdir = mlx5_alloc_db_pgdir(&(dev->pdev->dev));
+	if (!pgdir) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	list_add(&pgdir->list, &dev->priv.pgdir_list);
+
+	/* This should never fail -- we just allocated an empty page: */
+	WARN_ON(mlx5_alloc_db_from_pgdir(pgdir, db));
+
+out:
+	mutex_unlock(&dev->priv.pgdir_mutex);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(mlx5_db_alloc);
+
+void mlx5_db_free(struct mlx5_core_dev *dev, struct mlx5_db *db)
+{
+	mutex_lock(&dev->priv.pgdir_mutex);
+
+	__set_bit(db->index, db->u.pgdir->bitmap);
+
+	if (bitmap_full(db->u.pgdir->bitmap, MLX5_DB_PER_PAGE)) {
+		dma_free_coherent(&(dev->pdev->dev), PAGE_SIZE,
+				  db->u.pgdir->db_page, db->u.pgdir->db_dma);
+		list_del(&db->u.pgdir->list);
+		kfree(db->u.pgdir);
+	}
+
+	mutex_unlock(&dev->priv.pgdir_mutex);
+}
+EXPORT_SYMBOL_GPL(mlx5_db_free);
+
+
+void mlx5_fill_page_array(struct mlx5_buf *buf, __be64 *pas)
+{
+	u64 addr;
+	int i;
+
+	for (i = 0; i < buf->npages; i++) {
+		if (buf->nbufs == 1)
+			addr = buf->direct.map + (i << buf->page_shift);
+		else
+			addr = buf->page_list[i].map;
+
+		pas[i] = cpu_to_be64(addr);
+	}
+}
+EXPORT_SYMBOL_GPL(mlx5_fill_page_array);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
new file mode 100644
index 0000000..368c6c5
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
@@ -0,0 +1,1556 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <asm-generic/kmap_types.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <linux/slab.h>
+#include <linux/delay.h>
+#include <linux/random.h>
+#include <linux/io-mapping.h>
+#include <linux/mlx5/driver.h>
+#include <linux/debugfs.h>
+
+#include "mlx5_core.h"
+
+enum {
+	CMD_IF_REV = 5,
+};
+
+enum {
+	CMD_MODE_POLLING,
+	CMD_MODE_EVENTS
+};
+
+enum {
+	NUM_LONG_LISTS	  = 2,
+	NUM_MED_LISTS	  = 64,
+	LONG_LIST_SIZE	  = (2ULL * 1024 * 1024 * 1024 / PAGE_SIZE) * 8 + 16 +
+				MLX5_CMD_DATA_BLOCK_SIZE,
+	MED_LIST_SIZE	  = 16 + MLX5_CMD_DATA_BLOCK_SIZE,
+};
+
+enum {
+	MLX5_CMD_DELIVERY_STAT_OK			= 0x0,
+	MLX5_CMD_DELIVERY_STAT_SIGNAT_ERR		= 0x1,
+	MLX5_CMD_DELIVERY_STAT_TOK_ERR			= 0x2,
+	MLX5_CMD_DELIVERY_STAT_BAD_BLK_NUM_ERR		= 0x3,
+	MLX5_CMD_DELIVERY_STAT_OUT_PTR_ALIGN_ERR	= 0x4,
+	MLX5_CMD_DELIVERY_STAT_IN_PTR_ALIGN_ERR		= 0x5,
+	MLX5_CMD_DELIVERY_STAT_FW_ERR			= 0x6,
+	MLX5_CMD_DELIVERY_STAT_IN_LENGTH_ERR		= 0x7,
+	MLX5_CMD_DELIVERY_STAT_OUT_LENGTH_ERR		= 0x8,
+	MLX5_CMD_DELIVERY_STAT_RES_FLD_NOT_CLR_ERR	= 0x9,
+	MLX5_CMD_DELIVERY_STAT_CMD_DESCR_ERR		= 0x10,
+};
+
+enum {
+	MLX5_CMD_STAT_OK			= 0x0,
+	MLX5_CMD_STAT_INT_ERR			= 0x1,
+	MLX5_CMD_STAT_BAD_OP_ERR		= 0x2,
+	MLX5_CMD_STAT_BAD_PARAM_ERR		= 0x3,
+	MLX5_CMD_STAT_BAD_SYS_STATE_ERR		= 0x4,
+	MLX5_CMD_STAT_BAD_RES_ERR		= 0x5,
+	MLX5_CMD_STAT_RES_BUSY			= 0x6,
+	MLX5_CMD_STAT_LIM_ERR			= 0x8,
+	MLX5_CMD_STAT_BAD_RES_STATE_ERR		= 0x9,
+	MLX5_CMD_STAT_IX_ERR			= 0xa,
+	MLX5_CMD_STAT_NO_RES_ERR		= 0xf,
+	MLX5_CMD_STAT_BAD_INP_LEN_ERR		= 0x50,
+	MLX5_CMD_STAT_BAD_OUTP_LEN_ERR		= 0x51,
+	MLX5_CMD_STAT_BAD_QP_STATE_ERR		= 0x10,
+	MLX5_CMD_STAT_BAD_PKT_ERR		= 0x30,
+	MLX5_CMD_STAT_BAD_SIZE_OUTS_CQES_ERR	= 0x40,
+};
+
+static struct mlx5_cmd_work_ent *alloc_cmd(struct mlx5_cmd *cmd,
+					   struct mlx5_cmd_msg *in,
+					   struct mlx5_cmd_msg *out,
+					   void *uout, int uout_size,
+					   mlx5_cmd_cbk_t cbk,
+					   void *context, int page_queue)
+{
+	gfp_t alloc_flags = cbk ? GFP_ATOMIC : GFP_KERNEL;
+	struct mlx5_cmd_work_ent *ent;
+
+	ent = kzalloc(sizeof(*ent), alloc_flags);
+	if (!ent)
+		return ERR_PTR(-ENOMEM);
+
+	ent->in		= in;
+	ent->out	= out;
+	ent->uout	= uout;
+	ent->uout_size	= uout_size;
+	ent->callback	= cbk;
+	ent->context	= context;
+	ent->cmd	= cmd;
+	ent->page_queue = page_queue;
+
+	return ent;
+}
+
+static u8 alloc_token(struct mlx5_cmd *cmd)
+{
+	u8 token;
+
+	spin_lock(&cmd->token_lock);
+	token = cmd->token++ % 255 + 1;
+	spin_unlock(&cmd->token_lock);
+
+	return token;
+}
+
+static int alloc_ent(struct mlx5_cmd *cmd)
+{
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&cmd->alloc_lock, flags);
+	ret = find_first_bit(&cmd->bitmask, cmd->max_reg_cmds);
+	if (ret < cmd->max_reg_cmds)
+		clear_bit(ret, &cmd->bitmask);
+	spin_unlock_irqrestore(&cmd->alloc_lock, flags);
+
+	return ret < cmd->max_reg_cmds ? ret : -ENOMEM;
+}
+
+static void free_ent(struct mlx5_cmd *cmd, int idx)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&cmd->alloc_lock, flags);
+	set_bit(idx, &cmd->bitmask);
+	spin_unlock_irqrestore(&cmd->alloc_lock, flags);
+}
+
+static struct mlx5_cmd_layout *get_inst(struct mlx5_cmd *cmd, int idx)
+{
+	return cmd->cmd_buf + (idx << cmd->log_stride);
+}
+
+static u8 xor8_buf(void *buf, int len)
+{
+	u8 *ptr = buf;
+	u8 sum = 0;
+	int i;
+
+	for (i = 0; i < len; i++)
+		sum ^= ptr[i];
+
+	return sum;
+}
+
+static int verify_block_sig(struct mlx5_cmd_prot_block *block)
+{
+	if (xor8_buf(block->rsvd0, sizeof(*block) - sizeof(block->data) - 1) != 0xff)
+		return -EINVAL;
+
+	if (xor8_buf(block, sizeof(*block)) != 0xff)
+		return -EINVAL;
+
+	return 0;
+}
+
+static void calc_block_sig(struct mlx5_cmd_prot_block *block, u8 token,
+			   int csum)
+{
+	block->token = token;
+	if (csum) {
+		block->ctrl_sig = ~xor8_buf(block->rsvd0, sizeof(*block) -
+					    sizeof(block->data) - 2);
+		block->sig = ~xor8_buf(block, sizeof(*block) - 1);
+	}
+}
+
+static void calc_chain_sig(struct mlx5_cmd_msg *msg, u8 token, int csum)
+{
+	struct mlx5_cmd_mailbox *next = msg->next;
+
+	while (next) {
+		calc_block_sig(next->buf, token, csum);
+		next = next->next;
+	}
+}
+
+static void set_signature(struct mlx5_cmd_work_ent *ent, int csum)
+{
+	ent->lay->sig = ~xor8_buf(ent->lay, sizeof(*ent->lay));
+	calc_chain_sig(ent->in, ent->token, csum);
+	calc_chain_sig(ent->out, ent->token, csum);
+}
+
+static void poll_timeout(struct mlx5_cmd_work_ent *ent)
+{
+	unsigned long poll_end = jiffies + msecs_to_jiffies(MLX5_CMD_TIMEOUT_MSEC + 1000);
+	u8 own;
+
+	do {
+		own = ent->lay->status_own;
+		if (!(own & CMD_OWNER_HW)) {
+			ent->ret = 0;
+			return;
+		}
+		usleep_range(5000, 10000);
+	} while (time_before(jiffies, poll_end));
+
+	ent->ret = -ETIMEDOUT;
+}
+
+static void free_cmd(struct mlx5_cmd_work_ent *ent)
+{
+	kfree(ent);
+}
+
+
+static int verify_signature(struct mlx5_cmd_work_ent *ent)
+{
+	struct mlx5_cmd_mailbox *next = ent->out->next;
+	int err;
+	u8 sig;
+
+	sig = xor8_buf(ent->lay, sizeof(*ent->lay));
+	if (sig != 0xff)
+		return -EINVAL;
+
+	while (next) {
+		err = verify_block_sig(next->buf);
+		if (err)
+			return err;
+
+		next = next->next;
+	}
+
+	return 0;
+}
+
+static void dump_buf(void *buf, int size, int data_only, int offset)
+{
+	__be32 *p = buf;
+	int i;
+
+	for (i = 0; i < size; i += 16) {
+		pr_debug("%03x: %08x %08x %08x %08x\n", offset, be32_to_cpu(p[0]),
+			 be32_to_cpu(p[1]), be32_to_cpu(p[2]),
+			 be32_to_cpu(p[3]));
+		p += 4;
+		offset += 16;
+	}
+	if (!data_only)
+		pr_debug("\n");
+}
+
+const char *mlx5_command_str(int command)
+{
+	switch (command) {
+	case MLX5_CMD_OP_QUERY_HCA_CAP:
+		return "QUERY_HCA_CAP";
+
+	case MLX5_CMD_OP_SET_HCA_CAP:
+		return "SET_HCA_CAP";
+
+	case MLX5_CMD_OP_QUERY_ADAPTER:
+		return "QUERY_ADAPTER";
+
+	case MLX5_CMD_OP_INIT_HCA:
+		return "INIT_HCA";
+
+	case MLX5_CMD_OP_TEARDOWN_HCA:
+		return "TEARDOWN_HCA";
+
+	case MLX5_CMD_OP_ENABLE_HCA:
+		return "MLX5_CMD_OP_ENABLE_HCA";
+
+	case MLX5_CMD_OP_DISABLE_HCA:
+		return "MLX5_CMD_OP_DISABLE_HCA";
+
+	case MLX5_CMD_OP_QUERY_PAGES:
+		return "QUERY_PAGES";
+
+	case MLX5_CMD_OP_MANAGE_PAGES:
+		return "MANAGE_PAGES";
+
+	case MLX5_CMD_OP_CREATE_MKEY:
+		return "CREATE_MKEY";
+
+	case MLX5_CMD_OP_QUERY_MKEY:
+		return "QUERY_MKEY";
+
+	case MLX5_CMD_OP_DESTROY_MKEY:
+		return "DESTROY_MKEY";
+
+	case MLX5_CMD_OP_QUERY_SPECIAL_CONTEXTS:
+		return "QUERY_SPECIAL_CONTEXTS";
+
+	case MLX5_CMD_OP_CREATE_EQ:
+		return "CREATE_EQ";
+
+	case MLX5_CMD_OP_DESTROY_EQ:
+		return "DESTROY_EQ";
+
+	case MLX5_CMD_OP_QUERY_EQ:
+		return "QUERY_EQ";
+
+	case MLX5_CMD_OP_CREATE_CQ:
+		return "CREATE_CQ";
+
+	case MLX5_CMD_OP_DESTROY_CQ:
+		return "DESTROY_CQ";
+
+	case MLX5_CMD_OP_QUERY_CQ:
+		return "QUERY_CQ";
+
+	case MLX5_CMD_OP_MODIFY_CQ:
+		return "MODIFY_CQ";
+
+	case MLX5_CMD_OP_CREATE_QP:
+		return "CREATE_QP";
+
+	case MLX5_CMD_OP_DESTROY_QP:
+		return "DESTROY_QP";
+
+	case MLX5_CMD_OP_RST2INIT_QP:
+		return "RST2INIT_QP";
+
+	case MLX5_CMD_OP_INIT2RTR_QP:
+		return "INIT2RTR_QP";
+
+	case MLX5_CMD_OP_RTR2RTS_QP:
+		return "RTR2RTS_QP";
+
+	case MLX5_CMD_OP_RTS2RTS_QP:
+		return "RTS2RTS_QP";
+
+	case MLX5_CMD_OP_SQERR2RTS_QP:
+		return "SQERR2RTS_QP";
+
+	case MLX5_CMD_OP_2ERR_QP:
+		return "2ERR_QP";
+
+	case MLX5_CMD_OP_2RST_QP:
+		return "2RST_QP";
+
+	case MLX5_CMD_OP_QUERY_QP:
+		return "QUERY_QP";
+
+	case MLX5_CMD_OP_MAD_IFC:
+		return "MAD_IFC";
+
+	case MLX5_CMD_OP_INIT2INIT_QP:
+		return "INIT2INIT_QP";
+
+	case MLX5_CMD_OP_CREATE_PSV:
+		return "CREATE_PSV";
+
+	case MLX5_CMD_OP_DESTROY_PSV:
+		return "DESTROY_PSV";
+
+	case MLX5_CMD_OP_CREATE_SRQ:
+		return "CREATE_SRQ";
+
+	case MLX5_CMD_OP_DESTROY_SRQ:
+		return "DESTROY_SRQ";
+
+	case MLX5_CMD_OP_QUERY_SRQ:
+		return "QUERY_SRQ";
+
+	case MLX5_CMD_OP_ARM_RQ:
+		return "ARM_RQ";
+
+	case MLX5_CMD_OP_RESIZE_SRQ:
+		return "RESIZE_SRQ";
+
+	case MLX5_CMD_OP_ALLOC_PD:
+		return "ALLOC_PD";
+
+	case MLX5_CMD_OP_DEALLOC_PD:
+		return "DEALLOC_PD";
+
+	case MLX5_CMD_OP_ALLOC_UAR:
+		return "ALLOC_UAR";
+
+	case MLX5_CMD_OP_DEALLOC_UAR:
+		return "DEALLOC_UAR";
+
+	case MLX5_CMD_OP_ATTACH_TO_MCG:
+		return "ATTACH_TO_MCG";
+
+	case MLX5_CMD_OP_DETACH_FROM_MCG:
+		return "DETACH_FROM_MCG";
+
+	case MLX5_CMD_OP_ALLOC_XRCD:
+		return "ALLOC_XRCD";
+
+	case MLX5_CMD_OP_DEALLOC_XRCD:
+		return "DEALLOC_XRCD";
+
+	case MLX5_CMD_OP_ACCESS_REG:
+		return "MLX5_CMD_OP_ACCESS_REG";
+
+	default: return "unknown command opcode";
+	}
+}
+
+static void dump_command(struct mlx5_core_dev *dev,
+			 struct mlx5_cmd_work_ent *ent, int input)
+{
+	u16 op = be16_to_cpu(((struct mlx5_inbox_hdr *)(ent->lay->in))->opcode);
+	struct mlx5_cmd_msg *msg = input ? ent->in : ent->out;
+	struct mlx5_cmd_mailbox *next = msg->next;
+	int data_only;
+	u32 offset = 0;
+	int dump_len;
+
+	data_only = !!(mlx5_core_debug_mask & (1 << MLX5_CMD_DATA));
+
+	if (data_only)
+		mlx5_core_dbg_mask(dev, 1 << MLX5_CMD_DATA,
+				   "dump command data %s(0x%x) %s\n",
+				   mlx5_command_str(op), op,
+				   input ? "INPUT" : "OUTPUT");
+	else
+		mlx5_core_dbg(dev, "dump command %s(0x%x) %s\n",
+			      mlx5_command_str(op), op,
+			      input ? "INPUT" : "OUTPUT");
+
+	if (data_only) {
+		if (input) {
+			dump_buf(ent->lay->in, sizeof(ent->lay->in), 1, offset);
+			offset += sizeof(ent->lay->in);
+		} else {
+			dump_buf(ent->lay->out, sizeof(ent->lay->out), 1, offset);
+			offset += sizeof(ent->lay->out);
+		}
+	} else {
+		dump_buf(ent->lay, sizeof(*ent->lay), 0, offset);
+		offset += sizeof(*ent->lay);
+	}
+
+	while (next && offset < msg->len) {
+		if (data_only) {
+			dump_len = min_t(int, MLX5_CMD_DATA_BLOCK_SIZE, msg->len - offset);
+			dump_buf(next->buf, dump_len, 1, offset);
+			offset += MLX5_CMD_DATA_BLOCK_SIZE;
+		} else {
+			mlx5_core_dbg(dev, "command block:\n");
+			dump_buf(next->buf, sizeof(struct mlx5_cmd_prot_block), 0, offset);
+			offset += sizeof(struct mlx5_cmd_prot_block);
+		}
+		next = next->next;
+	}
+
+	if (data_only)
+		pr_debug("\n");
+}
+
+static void cmd_work_handler(struct work_struct *work)
+{
+	struct mlx5_cmd_work_ent *ent = container_of(work, struct mlx5_cmd_work_ent, work);
+	struct mlx5_cmd *cmd = ent->cmd;
+	struct mlx5_core_dev *dev = container_of(cmd, struct mlx5_core_dev, cmd);
+	struct mlx5_cmd_layout *lay;
+	struct semaphore *sem;
+
+	sem = ent->page_queue ? &cmd->pages_sem : &cmd->sem;
+	down(sem);
+	if (!ent->page_queue) {
+		ent->idx = alloc_ent(cmd);
+		if (ent->idx < 0) {
+			mlx5_core_err(dev, "failed to allocate command entry\n");
+			up(sem);
+			return;
+		}
+	} else {
+		ent->idx = cmd->max_reg_cmds;
+	}
+
+	ent->token = alloc_token(cmd);
+	cmd->ent_arr[ent->idx] = ent;
+	lay = get_inst(cmd, ent->idx);
+	ent->lay = lay;
+	memset(lay, 0, sizeof(*lay));
+	memcpy(lay->in, ent->in->first.data, sizeof(lay->in));
+	ent->op = be32_to_cpu(lay->in[0]) >> 16;
+	if (ent->in->next)
+		lay->in_ptr = cpu_to_be64(ent->in->next->dma);
+	lay->inlen = cpu_to_be32(ent->in->len);
+	if (ent->out->next)
+		lay->out_ptr = cpu_to_be64(ent->out->next->dma);
+	lay->outlen = cpu_to_be32(ent->out->len);
+	lay->type = MLX5_PCI_CMD_XPORT;
+	lay->token = ent->token;
+	lay->status_own = CMD_OWNER_HW;
+	set_signature(ent, !cmd->checksum_disabled);
+	dump_command(dev, ent, 1);
+	ent->ts1 = ktime_get_ns();
+
+	/* ring doorbell after the descriptor is valid */
+	wmb();
+	iowrite32be(1 << ent->idx, &dev->iseg->cmd_dbell);
+	mlx5_core_dbg(dev, "write 0x%x to command doorbell\n", 1 << ent->idx);
+	mmiowb();
+	if (cmd->mode == CMD_MODE_POLLING) {
+		poll_timeout(ent);
+		/* make sure we read the descriptor after ownership is SW */
+		rmb();
+		mlx5_cmd_comp_handler(dev, 1UL << ent->idx);
+	}
+}
+
+static const char *deliv_status_to_str(u8 status)
+{
+	switch (status) {
+	case MLX5_CMD_DELIVERY_STAT_OK:
+		return "no errors";
+	case MLX5_CMD_DELIVERY_STAT_SIGNAT_ERR:
+		return "signature error";
+	case MLX5_CMD_DELIVERY_STAT_TOK_ERR:
+		return "token error";
+	case MLX5_CMD_DELIVERY_STAT_BAD_BLK_NUM_ERR:
+		return "bad block number";
+	case MLX5_CMD_DELIVERY_STAT_OUT_PTR_ALIGN_ERR:
+		return "output pointer not aligned to block size";
+	case MLX5_CMD_DELIVERY_STAT_IN_PTR_ALIGN_ERR:
+		return "input pointer not aligned to block size";
+	case MLX5_CMD_DELIVERY_STAT_FW_ERR:
+		return "firmware internal error";
+	case MLX5_CMD_DELIVERY_STAT_IN_LENGTH_ERR:
+		return "command input length error";
+	case MLX5_CMD_DELIVERY_STAT_OUT_LENGTH_ERR:
+		return "command ouput length error";
+	case MLX5_CMD_DELIVERY_STAT_RES_FLD_NOT_CLR_ERR:
+		return "reserved fields not cleared";
+	case MLX5_CMD_DELIVERY_STAT_CMD_DESCR_ERR:
+		return "bad command descriptor type";
+	default:
+		return "unknown status code";
+	}
+}
+
+static u16 msg_to_opcode(struct mlx5_cmd_msg *in)
+{
+	struct mlx5_inbox_hdr *hdr = (struct mlx5_inbox_hdr *)(in->first.data);
+
+	return be16_to_cpu(hdr->opcode);
+}
+
+static int wait_func(struct mlx5_core_dev *dev, struct mlx5_cmd_work_ent *ent)
+{
+	unsigned long timeout = msecs_to_jiffies(MLX5_CMD_TIMEOUT_MSEC);
+	struct mlx5_cmd *cmd = &dev->cmd;
+	int err;
+
+	if (cmd->mode == CMD_MODE_POLLING) {
+		wait_for_completion(&ent->done);
+		err = ent->ret;
+	} else {
+		if (!wait_for_completion_timeout(&ent->done, timeout))
+			err = -ETIMEDOUT;
+		else
+			err = 0;
+	}
+	if (err == -ETIMEDOUT) {
+		mlx5_core_warn(dev, "%s(0x%x) timeout. Will cause a leak of a command resource\n",
+			       mlx5_command_str(msg_to_opcode(ent->in)),
+			       msg_to_opcode(ent->in));
+	}
+	mlx5_core_dbg(dev, "err %d, delivery status %s(%d)\n",
+		      err, deliv_status_to_str(ent->status), ent->status);
+
+	return err;
+}
+
+/*  Notes:
+ *    1. Callback functions may not sleep
+ *    2. page queue commands do not support asynchrous completion
+ */
+static int mlx5_cmd_invoke(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *in,
+			   struct mlx5_cmd_msg *out, void *uout, int uout_size,
+			   mlx5_cmd_cbk_t callback,
+			   void *context, int page_queue, u8 *status)
+{
+	struct mlx5_cmd *cmd = &dev->cmd;
+	struct mlx5_cmd_work_ent *ent;
+	struct mlx5_cmd_stats *stats;
+	int err = 0;
+	s64 ds;
+	u16 op;
+
+	if (callback && page_queue)
+		return -EINVAL;
+
+	ent = alloc_cmd(cmd, in, out, uout, uout_size, callback, context,
+			page_queue);
+	if (IS_ERR(ent))
+		return PTR_ERR(ent);
+
+	if (!callback)
+		init_completion(&ent->done);
+
+	INIT_WORK(&ent->work, cmd_work_handler);
+	if (page_queue) {
+		cmd_work_handler(&ent->work);
+	} else if (!queue_work(cmd->wq, &ent->work)) {
+		mlx5_core_warn(dev, "failed to queue work\n");
+		err = -ENOMEM;
+		goto out_free;
+	}
+
+	if (!callback) {
+		err = wait_func(dev, ent);
+		if (err == -ETIMEDOUT)
+			goto out;
+
+		ds = ent->ts2 - ent->ts1;
+		op = be16_to_cpu(((struct mlx5_inbox_hdr *)in->first.data)->opcode);
+		if (op < ARRAY_SIZE(cmd->stats)) {
+			stats = &cmd->stats[op];
+			spin_lock_irq(&stats->lock);
+			stats->sum += ds;
+			++stats->n;
+			spin_unlock_irq(&stats->lock);
+		}
+		mlx5_core_dbg_mask(dev, 1 << MLX5_CMD_TIME,
+				   "fw exec time for %s is %lld nsec\n",
+				   mlx5_command_str(op), ds);
+		*status = ent->status;
+		free_cmd(ent);
+	}
+
+	return err;
+
+out_free:
+	free_cmd(ent);
+out:
+	return err;
+}
+
+static ssize_t dbg_write(struct file *filp, const char __user *buf,
+			 size_t count, loff_t *pos)
+{
+	struct mlx5_core_dev *dev = filp->private_data;
+	struct mlx5_cmd_debug *dbg = &dev->cmd.dbg;
+	char lbuf[3];
+	int err;
+
+	if (!dbg->in_msg || !dbg->out_msg)
+		return -ENOMEM;
+
+	if (copy_from_user(lbuf, buf, sizeof(lbuf)))
+		return -EFAULT;
+
+	lbuf[sizeof(lbuf) - 1] = 0;
+
+	if (strcmp(lbuf, "go"))
+		return -EINVAL;
+
+	err = mlx5_cmd_exec(dev, dbg->in_msg, dbg->inlen, dbg->out_msg, dbg->outlen);
+
+	return err ? err : count;
+}
+
+
+static const struct file_operations fops = {
+	.owner	= THIS_MODULE,
+	.open	= simple_open,
+	.write	= dbg_write,
+};
+
+static int mlx5_copy_to_msg(struct mlx5_cmd_msg *to, void *from, int size)
+{
+	struct mlx5_cmd_prot_block *block;
+	struct mlx5_cmd_mailbox *next;
+	int copy;
+
+	if (!to || !from)
+		return -ENOMEM;
+
+	copy = min_t(int, size, sizeof(to->first.data));
+	memcpy(to->first.data, from, copy);
+	size -= copy;
+	from += copy;
+
+	next = to->next;
+	while (size) {
+		if (!next) {
+			/* this is a BUG */
+			return -ENOMEM;
+		}
+
+		copy = min_t(int, size, MLX5_CMD_DATA_BLOCK_SIZE);
+		block = next->buf;
+		memcpy(block->data, from, copy);
+		from += copy;
+		size -= copy;
+		next = next->next;
+	}
+
+	return 0;
+}
+
+static int mlx5_copy_from_msg(void *to, struct mlx5_cmd_msg *from, int size)
+{
+	struct mlx5_cmd_prot_block *block;
+	struct mlx5_cmd_mailbox *next;
+	int copy;
+
+	if (!to || !from)
+		return -ENOMEM;
+
+	copy = min_t(int, size, sizeof(from->first.data));
+	memcpy(to, from->first.data, copy);
+	size -= copy;
+	to += copy;
+
+	next = from->next;
+	while (size) {
+		if (!next) {
+			/* this is a BUG */
+			return -ENOMEM;
+		}
+
+		copy = min_t(int, size, MLX5_CMD_DATA_BLOCK_SIZE);
+		block = next->buf;
+
+		memcpy(to, block->data, copy);
+		to += copy;
+		size -= copy;
+		next = next->next;
+	}
+
+	return 0;
+}
+
+static struct mlx5_cmd_mailbox *alloc_cmd_box(struct mlx5_core_dev *dev,
+					      gfp_t flags)
+{
+	struct mlx5_cmd_mailbox *mailbox;
+
+	mailbox = kmalloc(sizeof(*mailbox), flags);
+	if (!mailbox)
+		return ERR_PTR(-ENOMEM);
+
+	mailbox->buf = pci_pool_alloc(dev->cmd.pool, flags,
+				      &mailbox->dma);
+	if (!mailbox->buf) {
+		mlx5_core_dbg(dev, "failed allocation\n");
+		kfree(mailbox);
+		return ERR_PTR(-ENOMEM);
+	}
+	memset(mailbox->buf, 0, sizeof(struct mlx5_cmd_prot_block));
+	mailbox->next = NULL;
+
+	return mailbox;
+}
+
+static void free_cmd_box(struct mlx5_core_dev *dev,
+			 struct mlx5_cmd_mailbox *mailbox)
+{
+	pci_pool_free(dev->cmd.pool, mailbox->buf, mailbox->dma);
+	kfree(mailbox);
+}
+
+static struct mlx5_cmd_msg *mlx5_alloc_cmd_msg(struct mlx5_core_dev *dev,
+					       gfp_t flags, int size)
+{
+	struct mlx5_cmd_mailbox *tmp, *head = NULL;
+	struct mlx5_cmd_prot_block *block;
+	struct mlx5_cmd_msg *msg;
+	int blen;
+	int err;
+	int n;
+	int i;
+
+	msg = kzalloc(sizeof(*msg), flags);
+	if (!msg)
+		return ERR_PTR(-ENOMEM);
+
+	blen = size - min_t(int, sizeof(msg->first.data), size);
+	n = (blen + MLX5_CMD_DATA_BLOCK_SIZE - 1) / MLX5_CMD_DATA_BLOCK_SIZE;
+
+	for (i = 0; i < n; i++) {
+		tmp = alloc_cmd_box(dev, flags);
+		if (IS_ERR(tmp)) {
+			mlx5_core_warn(dev, "failed allocating block\n");
+			err = PTR_ERR(tmp);
+			goto err_alloc;
+		}
+
+		block = tmp->buf;
+		tmp->next = head;
+		block->next = cpu_to_be64(tmp->next ? tmp->next->dma : 0);
+		block->block_num = cpu_to_be32(n - i - 1);
+		head = tmp;
+	}
+	msg->next = head;
+	msg->len = size;
+	return msg;
+
+err_alloc:
+	while (head) {
+		tmp = head->next;
+		free_cmd_box(dev, head);
+		head = tmp;
+	}
+	kfree(msg);
+
+	return ERR_PTR(err);
+}
+
+static void mlx5_free_cmd_msg(struct mlx5_core_dev *dev,
+				  struct mlx5_cmd_msg *msg)
+{
+	struct mlx5_cmd_mailbox *head = msg->next;
+	struct mlx5_cmd_mailbox *next;
+
+	while (head) {
+		next = head->next;
+		free_cmd_box(dev, head);
+		head = next;
+	}
+	kfree(msg);
+}
+
+static ssize_t data_write(struct file *filp, const char __user *buf,
+			  size_t count, loff_t *pos)
+{
+	struct mlx5_core_dev *dev = filp->private_data;
+	struct mlx5_cmd_debug *dbg = &dev->cmd.dbg;
+	void *ptr;
+	int err;
+
+	if (*pos != 0)
+		return -EINVAL;
+
+	kfree(dbg->in_msg);
+	dbg->in_msg = NULL;
+	dbg->inlen = 0;
+
+	ptr = kzalloc(count, GFP_KERNEL);
+	if (!ptr)
+		return -ENOMEM;
+
+	if (copy_from_user(ptr, buf, count)) {
+		err = -EFAULT;
+		goto out;
+	}
+	dbg->in_msg = ptr;
+	dbg->inlen = count;
+
+	*pos = count;
+
+	return count;
+
+out:
+	kfree(ptr);
+	return err;
+}
+
+static ssize_t data_read(struct file *filp, char __user *buf, size_t count,
+			 loff_t *pos)
+{
+	struct mlx5_core_dev *dev = filp->private_data;
+	struct mlx5_cmd_debug *dbg = &dev->cmd.dbg;
+	int copy;
+
+	if (*pos)
+		return 0;
+
+	if (!dbg->out_msg)
+		return -ENOMEM;
+
+	copy = min_t(int, count, dbg->outlen);
+	if (copy_to_user(buf, dbg->out_msg, copy))
+		return -EFAULT;
+
+	*pos += copy;
+
+	return copy;
+}
+
+static const struct file_operations dfops = {
+	.owner	= THIS_MODULE,
+	.open	= simple_open,
+	.write	= data_write,
+	.read	= data_read,
+};
+
+static ssize_t outlen_read(struct file *filp, char __user *buf, size_t count,
+			   loff_t *pos)
+{
+	struct mlx5_core_dev *dev = filp->private_data;
+	struct mlx5_cmd_debug *dbg = &dev->cmd.dbg;
+	char outlen[8];
+	int err;
+
+	if (*pos)
+		return 0;
+
+	err = snprintf(outlen, sizeof(outlen), "%d", dbg->outlen);
+	if (err < 0)
+		return err;
+
+	if (copy_to_user(buf, &outlen, err))
+		return -EFAULT;
+
+	*pos += err;
+
+	return err;
+}
+
+static ssize_t outlen_write(struct file *filp, const char __user *buf,
+			    size_t count, loff_t *pos)
+{
+	struct mlx5_core_dev *dev = filp->private_data;
+	struct mlx5_cmd_debug *dbg = &dev->cmd.dbg;
+	char outlen_str[8];
+	int outlen;
+	void *ptr;
+	int err;
+
+	if (*pos != 0 || count > 6)
+		return -EINVAL;
+
+	kfree(dbg->out_msg);
+	dbg->out_msg = NULL;
+	dbg->outlen = 0;
+
+	if (copy_from_user(outlen_str, buf, count))
+		return -EFAULT;
+
+	outlen_str[7] = 0;
+
+	err = sscanf(outlen_str, "%d", &outlen);
+	if (err < 0)
+		return err;
+
+	ptr = kzalloc(outlen, GFP_KERNEL);
+	if (!ptr)
+		return -ENOMEM;
+
+	dbg->out_msg = ptr;
+	dbg->outlen = outlen;
+
+	*pos = count;
+
+	return count;
+}
+
+static const struct file_operations olfops = {
+	.owner	= THIS_MODULE,
+	.open	= simple_open,
+	.write	= outlen_write,
+	.read	= outlen_read,
+};
+
+static void set_wqname(struct mlx5_core_dev *dev)
+{
+	struct mlx5_cmd *cmd = &dev->cmd;
+
+	snprintf(cmd->wq_name, sizeof(cmd->wq_name), "mlx5_cmd_%s",
+		 dev_name(&dev->pdev->dev));
+}
+
+static void clean_debug_files(struct mlx5_core_dev *dev)
+{
+	struct mlx5_cmd_debug *dbg = &dev->cmd.dbg;
+
+	if (!mlx5_debugfs_root)
+		return;
+
+	mlx5_cmdif_debugfs_cleanup(dev);
+	debugfs_remove_recursive(dbg->dbg_root);
+}
+
+static int create_debugfs_files(struct mlx5_core_dev *dev)
+{
+	struct mlx5_cmd_debug *dbg = &dev->cmd.dbg;
+	int err = -ENOMEM;
+
+	if (!mlx5_debugfs_root)
+		return 0;
+
+	dbg->dbg_root = debugfs_create_dir("cmd", dev->priv.dbg_root);
+	if (!dbg->dbg_root)
+		return err;
+
+	dbg->dbg_in = debugfs_create_file("in", 0400, dbg->dbg_root,
+					  dev, &dfops);
+	if (!dbg->dbg_in)
+		goto err_dbg;
+
+	dbg->dbg_out = debugfs_create_file("out", 0200, dbg->dbg_root,
+					   dev, &dfops);
+	if (!dbg->dbg_out)
+		goto err_dbg;
+
+	dbg->dbg_outlen = debugfs_create_file("out_len", 0600, dbg->dbg_root,
+					      dev, &olfops);
+	if (!dbg->dbg_outlen)
+		goto err_dbg;
+
+	dbg->dbg_status = debugfs_create_u8("status", 0600, dbg->dbg_root,
+					    &dbg->status);
+	if (!dbg->dbg_status)
+		goto err_dbg;
+
+	dbg->dbg_run = debugfs_create_file("run", 0200, dbg->dbg_root, dev, &fops);
+	if (!dbg->dbg_run)
+		goto err_dbg;
+
+	mlx5_cmdif_debugfs_init(dev);
+
+	return 0;
+
+err_dbg:
+	clean_debug_files(dev);
+	return err;
+}
+
+void mlx5_cmd_use_events(struct mlx5_core_dev *dev)
+{
+	struct mlx5_cmd *cmd = &dev->cmd;
+	int i;
+
+	for (i = 0; i < cmd->max_reg_cmds; i++)
+		down(&cmd->sem);
+
+	down(&cmd->pages_sem);
+
+	flush_workqueue(cmd->wq);
+
+	cmd->mode = CMD_MODE_EVENTS;
+
+	up(&cmd->pages_sem);
+	for (i = 0; i < cmd->max_reg_cmds; i++)
+		up(&cmd->sem);
+}
+
+void mlx5_cmd_use_polling(struct mlx5_core_dev *dev)
+{
+	struct mlx5_cmd *cmd = &dev->cmd;
+	int i;
+
+	for (i = 0; i < cmd->max_reg_cmds; i++)
+		down(&cmd->sem);
+
+	down(&cmd->pages_sem);
+
+	flush_workqueue(cmd->wq);
+	cmd->mode = CMD_MODE_POLLING;
+
+	up(&cmd->pages_sem);
+	for (i = 0; i < cmd->max_reg_cmds; i++)
+		up(&cmd->sem);
+}
+
+static void free_msg(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *msg)
+{
+	unsigned long flags;
+
+	if (msg->cache) {
+		spin_lock_irqsave(&msg->cache->lock, flags);
+		list_add_tail(&msg->list, &msg->cache->head);
+		spin_unlock_irqrestore(&msg->cache->lock, flags);
+	} else {
+		mlx5_free_cmd_msg(dev, msg);
+	}
+}
+
+void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, unsigned long vector)
+{
+	struct mlx5_cmd *cmd = &dev->cmd;
+	struct mlx5_cmd_work_ent *ent;
+	mlx5_cmd_cbk_t callback;
+	void *context;
+	int err;
+	int i;
+	s64 ds;
+	struct mlx5_cmd_stats *stats;
+	unsigned long flags;
+
+	for (i = 0; i < (1 << cmd->log_sz); i++) {
+		if (test_bit(i, &vector)) {
+			struct semaphore *sem;
+
+			ent = cmd->ent_arr[i];
+			if (ent->page_queue)
+				sem = &cmd->pages_sem;
+			else
+				sem = &cmd->sem;
+			ent->ts2 = ktime_get_ns();
+			memcpy(ent->out->first.data, ent->lay->out, sizeof(ent->lay->out));
+			dump_command(dev, ent, 0);
+			if (!ent->ret) {
+				if (!cmd->checksum_disabled)
+					ent->ret = verify_signature(ent);
+				else
+					ent->ret = 0;
+				ent->status = ent->lay->status_own >> 1;
+				mlx5_core_dbg(dev, "command completed. ret 0x%x, delivery status %s(0x%x)\n",
+					      ent->ret, deliv_status_to_str(ent->status), ent->status);
+			}
+			free_ent(cmd, ent->idx);
+			if (ent->callback) {
+				ds = ent->ts2 - ent->ts1;
+				if (ent->op < ARRAY_SIZE(cmd->stats)) {
+					stats = &cmd->stats[ent->op];
+					spin_lock_irqsave(&stats->lock, flags);
+					stats->sum += ds;
+					++stats->n;
+					spin_unlock_irqrestore(&stats->lock, flags);
+				}
+
+				callback = ent->callback;
+				context = ent->context;
+				err = ent->ret;
+				if (!err)
+					err = mlx5_copy_from_msg(ent->uout,
+								 ent->out,
+								 ent->uout_size);
+
+				mlx5_free_cmd_msg(dev, ent->out);
+				free_msg(dev, ent->in);
+
+				free_cmd(ent);
+				callback(err, context);
+			} else {
+				complete(&ent->done);
+			}
+			up(sem);
+		}
+	}
+}
+EXPORT_SYMBOL(mlx5_cmd_comp_handler);
+
+static int status_to_err(u8 status)
+{
+	return status ? -1 : 0; /* TBD more meaningful codes */
+}
+
+static struct mlx5_cmd_msg *alloc_msg(struct mlx5_core_dev *dev, int in_size,
+				      gfp_t gfp)
+{
+	struct mlx5_cmd_msg *msg = ERR_PTR(-ENOMEM);
+	struct mlx5_cmd *cmd = &dev->cmd;
+	struct cache_ent *ent = NULL;
+
+	if (in_size > MED_LIST_SIZE && in_size <= LONG_LIST_SIZE)
+		ent = &cmd->cache.large;
+	else if (in_size > 16 && in_size <= MED_LIST_SIZE)
+		ent = &cmd->cache.med;
+
+	if (ent) {
+		spin_lock_irq(&ent->lock);
+		if (!list_empty(&ent->head)) {
+			msg = list_entry(ent->head.next, typeof(*msg), list);
+			/* For cached lists, we must explicitly state what is
+			 * the real size
+			 */
+			msg->len = in_size;
+			list_del(&msg->list);
+		}
+		spin_unlock_irq(&ent->lock);
+	}
+
+	if (IS_ERR(msg))
+		msg = mlx5_alloc_cmd_msg(dev, gfp, in_size);
+
+	return msg;
+}
+
+static int is_manage_pages(struct mlx5_inbox_hdr *in)
+{
+	return be16_to_cpu(in->opcode) == MLX5_CMD_OP_MANAGE_PAGES;
+}
+
+static int cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out,
+		    int out_size, mlx5_cmd_cbk_t callback, void *context)
+{
+	struct mlx5_cmd_msg *inb;
+	struct mlx5_cmd_msg *outb;
+	int pages_queue;
+	gfp_t gfp;
+	int err;
+	u8 status = 0;
+
+	pages_queue = is_manage_pages(in);
+	gfp = callback ? GFP_ATOMIC : GFP_KERNEL;
+
+	inb = alloc_msg(dev, in_size, gfp);
+	if (IS_ERR(inb)) {
+		err = PTR_ERR(inb);
+		return err;
+	}
+
+	err = mlx5_copy_to_msg(inb, in, in_size);
+	if (err) {
+		mlx5_core_warn(dev, "err %d\n", err);
+		goto out_in;
+	}
+
+	outb = mlx5_alloc_cmd_msg(dev, gfp, out_size);
+	if (IS_ERR(outb)) {
+		err = PTR_ERR(outb);
+		goto out_in;
+	}
+
+	err = mlx5_cmd_invoke(dev, inb, outb, out, out_size, callback, context,
+			      pages_queue, &status);
+	if (err)
+		goto out_out;
+
+	mlx5_core_dbg(dev, "err %d, status %d\n", err, status);
+	if (status) {
+		err = status_to_err(status);
+		goto out_out;
+	}
+
+	err = mlx5_copy_from_msg(out, outb, out_size);
+
+out_out:
+	if (!callback)
+		mlx5_free_cmd_msg(dev, outb);
+
+out_in:
+	if (!callback)
+		free_msg(dev, inb);
+	return err;
+}
+
+int mlx5_cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out,
+		  int out_size)
+{
+	return cmd_exec(dev, in, in_size, out, out_size, NULL, NULL);
+}
+EXPORT_SYMBOL(mlx5_cmd_exec);
+
+int mlx5_cmd_exec_cb(struct mlx5_core_dev *dev, void *in, int in_size,
+		     void *out, int out_size, mlx5_cmd_cbk_t callback,
+		     void *context)
+{
+	return cmd_exec(dev, in, in_size, out, out_size, callback, context);
+}
+EXPORT_SYMBOL(mlx5_cmd_exec_cb);
+
+static void destroy_msg_cache(struct mlx5_core_dev *dev)
+{
+	struct mlx5_cmd *cmd = &dev->cmd;
+	struct mlx5_cmd_msg *msg;
+	struct mlx5_cmd_msg *n;
+
+	list_for_each_entry_safe(msg, n, &cmd->cache.large.head, list) {
+		list_del(&msg->list);
+		mlx5_free_cmd_msg(dev, msg);
+	}
+
+	list_for_each_entry_safe(msg, n, &cmd->cache.med.head, list) {
+		list_del(&msg->list);
+		mlx5_free_cmd_msg(dev, msg);
+	}
+}
+
+static int create_msg_cache(struct mlx5_core_dev *dev)
+{
+	struct mlx5_cmd *cmd = &dev->cmd;
+	struct mlx5_cmd_msg *msg;
+	int err;
+	int i;
+
+	spin_lock_init(&cmd->cache.large.lock);
+	INIT_LIST_HEAD(&cmd->cache.large.head);
+	spin_lock_init(&cmd->cache.med.lock);
+	INIT_LIST_HEAD(&cmd->cache.med.head);
+
+	for (i = 0; i < NUM_LONG_LISTS; i++) {
+		msg = mlx5_alloc_cmd_msg(dev, GFP_KERNEL, LONG_LIST_SIZE);
+		if (IS_ERR(msg)) {
+			err = PTR_ERR(msg);
+			goto ex_err;
+		}
+		msg->cache = &cmd->cache.large;
+		list_add_tail(&msg->list, &cmd->cache.large.head);
+	}
+
+	for (i = 0; i < NUM_MED_LISTS; i++) {
+		msg = mlx5_alloc_cmd_msg(dev, GFP_KERNEL, MED_LIST_SIZE);
+		if (IS_ERR(msg)) {
+			err = PTR_ERR(msg);
+			goto ex_err;
+		}
+		msg->cache = &cmd->cache.med;
+		list_add_tail(&msg->list, &cmd->cache.med.head);
+	}
+
+	return 0;
+
+ex_err:
+	destroy_msg_cache(dev);
+	return err;
+}
+
+int mlx5_cmd_init(struct mlx5_core_dev *dev)
+{
+	int size = sizeof(struct mlx5_cmd_prot_block);
+	int align = roundup_pow_of_two(size);
+	struct mlx5_cmd *cmd = &dev->cmd;
+	u32 cmd_h, cmd_l;
+	u16 cmd_if_rev;
+	int err;
+	int i;
+
+	cmd_if_rev = cmdif_rev(dev);
+	if (cmd_if_rev != CMD_IF_REV) {
+		dev_err(&dev->pdev->dev,
+			"Driver cmdif rev(%d) differs from firmware's(%d)\n",
+			CMD_IF_REV, cmd_if_rev);
+		return -EINVAL;
+	}
+
+	cmd->pool = pci_pool_create("mlx5_cmd", dev->pdev, size, align, 0);
+	if (!cmd->pool)
+		return -ENOMEM;
+
+	cmd->cmd_buf = (void *)__get_free_pages(GFP_ATOMIC, 0);
+	if (!cmd->cmd_buf) {
+		err = -ENOMEM;
+		goto err_free_pool;
+	}
+	cmd->dma = dma_map_single(&dev->pdev->dev, cmd->cmd_buf, PAGE_SIZE,
+				  DMA_BIDIRECTIONAL);
+	if (dma_mapping_error(&dev->pdev->dev, cmd->dma)) {
+		err = -ENOMEM;
+		goto err_free;
+	}
+
+	cmd_l = ioread32be(&dev->iseg->cmdq_addr_l_sz) & 0xff;
+	cmd->log_sz = cmd_l >> 4 & 0xf;
+	cmd->log_stride = cmd_l & 0xf;
+	if (1 << cmd->log_sz > MLX5_MAX_COMMANDS) {
+		dev_err(&dev->pdev->dev, "firmware reports too many outstanding commands %d\n",
+			1 << cmd->log_sz);
+		err = -EINVAL;
+		goto err_map;
+	}
+
+	if (cmd->log_sz + cmd->log_stride > PAGE_SHIFT) {
+		dev_err(&dev->pdev->dev, "command queue size overflow\n");
+		err = -EINVAL;
+		goto err_map;
+	}
+
+	cmd->checksum_disabled = 1;
+	cmd->max_reg_cmds = (1 << cmd->log_sz) - 1;
+	cmd->bitmask = (1 << cmd->max_reg_cmds) - 1;
+
+	cmd->cmdif_rev = ioread32be(&dev->iseg->cmdif_rev_fw_sub) >> 16;
+	if (cmd->cmdif_rev > CMD_IF_REV) {
+		dev_err(&dev->pdev->dev, "driver does not support command interface version. driver %d, firmware %d\n",
+			CMD_IF_REV, cmd->cmdif_rev);
+		err = -ENOTSUPP;
+		goto err_map;
+	}
+
+	spin_lock_init(&cmd->alloc_lock);
+	spin_lock_init(&cmd->token_lock);
+	for (i = 0; i < ARRAY_SIZE(cmd->stats); i++)
+		spin_lock_init(&cmd->stats[i].lock);
+
+	sema_init(&cmd->sem, cmd->max_reg_cmds);
+	sema_init(&cmd->pages_sem, 1);
+
+	cmd_h = (u32)((u64)(cmd->dma) >> 32);
+	cmd_l = (u32)(cmd->dma);
+	if (cmd_l & 0xfff) {
+		dev_err(&dev->pdev->dev, "invalid command queue address\n");
+		err = -ENOMEM;
+		goto err_map;
+	}
+
+	iowrite32be(cmd_h, &dev->iseg->cmdq_addr_h);
+	iowrite32be(cmd_l, &dev->iseg->cmdq_addr_l_sz);
+
+	/* Make sure firmware sees the complete address before we proceed */
+	wmb();
+
+	mlx5_core_dbg(dev, "descriptor at dma 0x%llx\n", (unsigned long long)(cmd->dma));
+
+	cmd->mode = CMD_MODE_POLLING;
+
+	err = create_msg_cache(dev);
+	if (err) {
+		dev_err(&dev->pdev->dev, "failed to create command cache\n");
+		goto err_map;
+	}
+
+	set_wqname(dev);
+	cmd->wq = create_singlethread_workqueue(cmd->wq_name);
+	if (!cmd->wq) {
+		dev_err(&dev->pdev->dev, "failed to create command workqueue\n");
+		err = -ENOMEM;
+		goto err_cache;
+	}
+
+	err = create_debugfs_files(dev);
+	if (err) {
+		err = -ENOMEM;
+		goto err_wq;
+	}
+
+	return 0;
+
+err_wq:
+	destroy_workqueue(cmd->wq);
+
+err_cache:
+	destroy_msg_cache(dev);
+
+err_map:
+	dma_unmap_single(&dev->pdev->dev, cmd->dma, PAGE_SIZE,
+			 DMA_BIDIRECTIONAL);
+err_free:
+	free_pages((unsigned long)cmd->cmd_buf, 0);
+
+err_free_pool:
+	pci_pool_destroy(cmd->pool);
+
+	return err;
+}
+EXPORT_SYMBOL(mlx5_cmd_init);
+
+void mlx5_cmd_cleanup(struct mlx5_core_dev *dev)
+{
+	struct mlx5_cmd *cmd = &dev->cmd;
+
+	clean_debug_files(dev);
+	destroy_workqueue(cmd->wq);
+	destroy_msg_cache(dev);
+	dma_unmap_single(&dev->pdev->dev, cmd->dma, PAGE_SIZE,
+			 DMA_BIDIRECTIONAL);
+	free_pages((unsigned long)cmd->cmd_buf, 0);
+	pci_pool_destroy(cmd->pool);
+}
+EXPORT_SYMBOL(mlx5_cmd_cleanup);
+
+static const char *cmd_status_str(u8 status)
+{
+	switch (status) {
+	case MLX5_CMD_STAT_OK:
+		return "OK";
+	case MLX5_CMD_STAT_INT_ERR:
+		return "internal error";
+	case MLX5_CMD_STAT_BAD_OP_ERR:
+		return "bad operation";
+	case MLX5_CMD_STAT_BAD_PARAM_ERR:
+		return "bad parameter";
+	case MLX5_CMD_STAT_BAD_SYS_STATE_ERR:
+		return "bad system state";
+	case MLX5_CMD_STAT_BAD_RES_ERR:
+		return "bad resource";
+	case MLX5_CMD_STAT_RES_BUSY:
+		return "resource busy";
+	case MLX5_CMD_STAT_LIM_ERR:
+		return "limits exceeded";
+	case MLX5_CMD_STAT_BAD_RES_STATE_ERR:
+		return "bad resource state";
+	case MLX5_CMD_STAT_IX_ERR:
+		return "bad index";
+	case MLX5_CMD_STAT_NO_RES_ERR:
+		return "no resources";
+	case MLX5_CMD_STAT_BAD_INP_LEN_ERR:
+		return "bad input length";
+	case MLX5_CMD_STAT_BAD_OUTP_LEN_ERR:
+		return "bad output length";
+	case MLX5_CMD_STAT_BAD_QP_STATE_ERR:
+		return "bad QP state";
+	case MLX5_CMD_STAT_BAD_PKT_ERR:
+		return "bad packet (discarded)";
+	case MLX5_CMD_STAT_BAD_SIZE_OUTS_CQES_ERR:
+		return "bad size too many outstanding CQEs";
+	default:
+		return "unknown status";
+	}
+}
+
+static int cmd_status_to_err(u8 status)
+{
+	switch (status) {
+	case MLX5_CMD_STAT_OK:				return 0;
+	case MLX5_CMD_STAT_INT_ERR:			return -EIO;
+	case MLX5_CMD_STAT_BAD_OP_ERR:			return -EINVAL;
+	case MLX5_CMD_STAT_BAD_PARAM_ERR:		return -EINVAL;
+	case MLX5_CMD_STAT_BAD_SYS_STATE_ERR:		return -EIO;
+	case MLX5_CMD_STAT_BAD_RES_ERR:			return -EINVAL;
+	case MLX5_CMD_STAT_RES_BUSY:			return -EBUSY;
+	case MLX5_CMD_STAT_LIM_ERR:			return -ENOMEM;
+	case MLX5_CMD_STAT_BAD_RES_STATE_ERR:		return -EINVAL;
+	case MLX5_CMD_STAT_IX_ERR:			return -EINVAL;
+	case MLX5_CMD_STAT_NO_RES_ERR:			return -EAGAIN;
+	case MLX5_CMD_STAT_BAD_INP_LEN_ERR:		return -EIO;
+	case MLX5_CMD_STAT_BAD_OUTP_LEN_ERR:		return -EIO;
+	case MLX5_CMD_STAT_BAD_QP_STATE_ERR:		return -EINVAL;
+	case MLX5_CMD_STAT_BAD_PKT_ERR:			return -EINVAL;
+	case MLX5_CMD_STAT_BAD_SIZE_OUTS_CQES_ERR:	return -EINVAL;
+	default:					return -EIO;
+	}
+}
+
+/* this will be available till all the commands use set/get macros */
+int mlx5_cmd_status_to_err(struct mlx5_outbox_hdr *hdr)
+{
+	if (!hdr->status)
+		return 0;
+
+	pr_warn("command failed, status %s(0x%x), syndrome 0x%x\n",
+		cmd_status_str(hdr->status), hdr->status,
+		be32_to_cpu(hdr->syndrome));
+
+	return cmd_status_to_err(hdr->status);
+}
+
+int mlx5_cmd_status_to_err_v2(void *ptr)
+{
+	u32	syndrome;
+	u8	status;
+
+	status = be32_to_cpu(*(__be32 *)ptr) >> 24;
+	if (!status)
+		return 0;
+
+	syndrome = be32_to_cpu(*(__be32 *)(ptr + 4));
+
+	pr_warn("command failed, status %s(0x%x), syndrome 0x%x\n",
+		cmd_status_str(status), status, syndrome);
+
+	return cmd_status_to_err(status);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cq.c b/drivers/net/ethernet/mellanox/mlx5/core/cq.c
new file mode 100644
index 0000000..43c5f48
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/cq.c
@@ -0,0 +1,237 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/hardirq.h>
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/cmd.h>
+#include <rdma/ib_verbs.h>
+#include <linux/mlx5/cq.h>
+#include "mlx5_core.h"
+
+void mlx5_cq_completion(struct mlx5_core_dev *dev, u32 cqn)
+{
+	struct mlx5_core_cq *cq;
+	struct mlx5_cq_table *table = &dev->priv.cq_table;
+
+	spin_lock(&table->lock);
+	cq = radix_tree_lookup(&table->tree, cqn);
+	if (likely(cq))
+		atomic_inc(&cq->refcount);
+	spin_unlock(&table->lock);
+
+	if (!cq) {
+		mlx5_core_warn(dev, "Completion event for bogus CQ 0x%x\n", cqn);
+		return;
+	}
+
+	++cq->arm_sn;
+
+	cq->comp(cq);
+
+	if (atomic_dec_and_test(&cq->refcount))
+		complete(&cq->free);
+}
+
+void mlx5_cq_event(struct mlx5_core_dev *dev, u32 cqn, int event_type)
+{
+	struct mlx5_cq_table *table = &dev->priv.cq_table;
+	struct mlx5_core_cq *cq;
+
+	spin_lock(&table->lock);
+
+	cq = radix_tree_lookup(&table->tree, cqn);
+	if (cq)
+		atomic_inc(&cq->refcount);
+
+	spin_unlock(&table->lock);
+
+	if (!cq) {
+		mlx5_core_warn(dev, "Async event for bogus CQ 0x%x\n", cqn);
+		return;
+	}
+
+	cq->event(cq, event_type);
+
+	if (atomic_dec_and_test(&cq->refcount))
+		complete(&cq->free);
+}
+
+
+int mlx5_core_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq,
+			struct mlx5_create_cq_mbox_in *in, int inlen)
+{
+	int err;
+	struct mlx5_cq_table *table = &dev->priv.cq_table;
+	struct mlx5_create_cq_mbox_out out;
+	struct mlx5_destroy_cq_mbox_in din;
+	struct mlx5_destroy_cq_mbox_out dout;
+
+	in->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_CREATE_CQ);
+	memset(&out, 0, sizeof(out));
+	err = mlx5_cmd_exec(dev, in, inlen, &out, sizeof(out));
+	if (err)
+		return err;
+
+	if (out.hdr.status)
+		return mlx5_cmd_status_to_err(&out.hdr);
+
+	cq->cqn = be32_to_cpu(out.cqn) & 0xffffff;
+	cq->cons_index = 0;
+	cq->arm_sn     = 0;
+	atomic_set(&cq->refcount, 1);
+	init_completion(&cq->free);
+
+	spin_lock_irq(&table->lock);
+	err = radix_tree_insert(&table->tree, cq->cqn, cq);
+	spin_unlock_irq(&table->lock);
+	if (err)
+		goto err_cmd;
+
+	cq->pid = current->pid;
+	err = mlx5_debug_cq_add(dev, cq);
+	if (err)
+		mlx5_core_dbg(dev, "failed adding CP 0x%x to debug file system\n",
+			      cq->cqn);
+
+	return 0;
+
+err_cmd:
+	memset(&din, 0, sizeof(din));
+	memset(&dout, 0, sizeof(dout));
+	din.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_DESTROY_CQ);
+	mlx5_cmd_exec(dev, &din, sizeof(din), &dout, sizeof(dout));
+	return err;
+}
+EXPORT_SYMBOL(mlx5_core_create_cq);
+
+int mlx5_core_destroy_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq)
+{
+	struct mlx5_cq_table *table = &dev->priv.cq_table;
+	struct mlx5_destroy_cq_mbox_in in;
+	struct mlx5_destroy_cq_mbox_out out;
+	struct mlx5_core_cq *tmp;
+	int err;
+
+	spin_lock_irq(&table->lock);
+	tmp = radix_tree_delete(&table->tree, cq->cqn);
+	spin_unlock_irq(&table->lock);
+	if (!tmp) {
+		mlx5_core_warn(dev, "cq 0x%x not found in tree\n", cq->cqn);
+		return -EINVAL;
+	}
+	if (tmp != cq) {
+		mlx5_core_warn(dev, "corruption on srqn 0x%x\n", cq->cqn);
+		return -EINVAL;
+	}
+
+	memset(&in, 0, sizeof(in));
+	memset(&out, 0, sizeof(out));
+	in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_DESTROY_CQ);
+	in.cqn = cpu_to_be32(cq->cqn);
+	err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out));
+	if (err)
+		return err;
+
+	if (out.hdr.status)
+		return mlx5_cmd_status_to_err(&out.hdr);
+
+	synchronize_irq(cq->irqn);
+
+	mlx5_debug_cq_remove(dev, cq);
+	if (atomic_dec_and_test(&cq->refcount))
+		complete(&cq->free);
+	wait_for_completion(&cq->free);
+
+	return 0;
+}
+EXPORT_SYMBOL(mlx5_core_destroy_cq);
+
+int mlx5_core_query_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq,
+		       struct mlx5_query_cq_mbox_out *out)
+{
+	struct mlx5_query_cq_mbox_in in;
+	int err;
+
+	memset(&in, 0, sizeof(in));
+	memset(out, 0, sizeof(*out));
+
+	in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_QUERY_CQ);
+	in.cqn = cpu_to_be32(cq->cqn);
+	err = mlx5_cmd_exec(dev, &in, sizeof(in), out, sizeof(*out));
+	if (err)
+		return err;
+
+	if (out->hdr.status)
+		return mlx5_cmd_status_to_err(&out->hdr);
+
+	return err;
+}
+EXPORT_SYMBOL(mlx5_core_query_cq);
+
+
+int mlx5_core_modify_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq,
+			struct mlx5_modify_cq_mbox_in *in, int in_sz)
+{
+	struct mlx5_modify_cq_mbox_out out;
+	int err;
+
+	memset(&out, 0, sizeof(out));
+	in->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_MODIFY_CQ);
+	err = mlx5_cmd_exec(dev, in, in_sz, &out, sizeof(out));
+	if (err)
+		return err;
+
+	if (out.hdr.status)
+		return mlx5_cmd_status_to_err(&out.hdr);
+
+	return 0;
+}
+EXPORT_SYMBOL(mlx5_core_modify_cq);
+
+int mlx5_init_cq_table(struct mlx5_core_dev *dev)
+{
+	struct mlx5_cq_table *table = &dev->priv.cq_table;
+	int err;
+
+	spin_lock_init(&table->lock);
+	INIT_RADIX_TREE(&table->tree, GFP_ATOMIC);
+	err = mlx5_cq_debugfs_init(dev);
+
+	return err;
+}
+
+void mlx5_cleanup_cq_table(struct mlx5_core_dev *dev)
+{
+	mlx5_cq_debugfs_cleanup(dev);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c b/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c
new file mode 100644
index 0000000..10e1f1a
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c
@@ -0,0 +1,610 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/debugfs.h>
+#include <linux/mlx5/qp.h>
+#include <linux/mlx5/cq.h>
+#include <linux/mlx5/driver.h>
+#include "mlx5_core.h"
+
+enum {
+	QP_PID,
+	QP_STATE,
+	QP_XPORT,
+	QP_MTU,
+	QP_N_RECV,
+	QP_RECV_SZ,
+	QP_N_SEND,
+	QP_LOG_PG_SZ,
+	QP_RQPN,
+};
+
+static char *qp_fields[] = {
+	[QP_PID]	= "pid",
+	[QP_STATE]	= "state",
+	[QP_XPORT]	= "transport",
+	[QP_MTU]	= "mtu",
+	[QP_N_RECV]	= "num_recv",
+	[QP_RECV_SZ]	= "rcv_wqe_sz",
+	[QP_N_SEND]	= "num_send",
+	[QP_LOG_PG_SZ]	= "log2_page_sz",
+	[QP_RQPN]	= "remote_qpn",
+};
+
+enum {
+	EQ_NUM_EQES,
+	EQ_INTR,
+	EQ_LOG_PG_SZ,
+};
+
+static char *eq_fields[] = {
+	[EQ_NUM_EQES]	= "num_eqes",
+	[EQ_INTR]	= "intr",
+	[EQ_LOG_PG_SZ]	= "log_page_size",
+};
+
+enum {
+	CQ_PID,
+	CQ_NUM_CQES,
+	CQ_LOG_PG_SZ,
+};
+
+static char *cq_fields[] = {
+	[CQ_PID]	= "pid",
+	[CQ_NUM_CQES]	= "num_cqes",
+	[CQ_LOG_PG_SZ]	= "log_page_size",
+};
+
+struct dentry *mlx5_debugfs_root;
+EXPORT_SYMBOL(mlx5_debugfs_root);
+
+void mlx5_register_debugfs(void)
+{
+	mlx5_debugfs_root = debugfs_create_dir("mlx5", NULL);
+	if (IS_ERR_OR_NULL(mlx5_debugfs_root))
+		mlx5_debugfs_root = NULL;
+}
+
+void mlx5_unregister_debugfs(void)
+{
+	debugfs_remove(mlx5_debugfs_root);
+}
+
+int mlx5_qp_debugfs_init(struct mlx5_core_dev *dev)
+{
+	if (!mlx5_debugfs_root)
+		return 0;
+
+	atomic_set(&dev->num_qps, 0);
+
+	dev->priv.qp_debugfs = debugfs_create_dir("QPs",  dev->priv.dbg_root);
+	if (!dev->priv.qp_debugfs)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void mlx5_qp_debugfs_cleanup(struct mlx5_core_dev *dev)
+{
+	if (!mlx5_debugfs_root)
+		return;
+
+	debugfs_remove_recursive(dev->priv.qp_debugfs);
+}
+
+int mlx5_eq_debugfs_init(struct mlx5_core_dev *dev)
+{
+	if (!mlx5_debugfs_root)
+		return 0;
+
+	dev->priv.eq_debugfs = debugfs_create_dir("EQs",  dev->priv.dbg_root);
+	if (!dev->priv.eq_debugfs)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void mlx5_eq_debugfs_cleanup(struct mlx5_core_dev *dev)
+{
+	if (!mlx5_debugfs_root)
+		return;
+
+	debugfs_remove_recursive(dev->priv.eq_debugfs);
+}
+
+static ssize_t average_read(struct file *filp, char __user *buf, size_t count,
+			    loff_t *pos)
+{
+	struct mlx5_cmd_stats *stats;
+	u64 field = 0;
+	int ret;
+	char tbuf[22];
+
+	if (*pos)
+		return 0;
+
+	stats = filp->private_data;
+	spin_lock_irq(&stats->lock);
+	if (stats->n)
+		field = div64_u64(stats->sum, stats->n);
+	spin_unlock_irq(&stats->lock);
+	ret = snprintf(tbuf, sizeof(tbuf), "%llu\n", field);
+	if (ret > 0) {
+		if (copy_to_user(buf, tbuf, ret))
+			return -EFAULT;
+	}
+
+	*pos += ret;
+	return ret;
+}
+
+
+static ssize_t average_write(struct file *filp, const char __user *buf,
+			     size_t count, loff_t *pos)
+{
+	struct mlx5_cmd_stats *stats;
+
+	stats = filp->private_data;
+	spin_lock_irq(&stats->lock);
+	stats->sum = 0;
+	stats->n = 0;
+	spin_unlock_irq(&stats->lock);
+
+	*pos += count;
+
+	return count;
+}
+
+static const struct file_operations stats_fops = {
+	.owner	= THIS_MODULE,
+	.open	= simple_open,
+	.read	= average_read,
+	.write	= average_write,
+};
+
+int mlx5_cmdif_debugfs_init(struct mlx5_core_dev *dev)
+{
+	struct mlx5_cmd_stats *stats;
+	struct dentry **cmd;
+	const char *namep;
+	int err;
+	int i;
+
+	if (!mlx5_debugfs_root)
+		return 0;
+
+	cmd = &dev->priv.cmdif_debugfs;
+	*cmd = debugfs_create_dir("commands", dev->priv.dbg_root);
+	if (!*cmd)
+		return -ENOMEM;
+
+	for (i = 0; i < ARRAY_SIZE(dev->cmd.stats); i++) {
+		stats = &dev->cmd.stats[i];
+		namep = mlx5_command_str(i);
+		if (strcmp(namep, "unknown command opcode")) {
+			stats->root = debugfs_create_dir(namep, *cmd);
+			if (!stats->root) {
+				mlx5_core_warn(dev, "failed adding command %d\n",
+					       i);
+				err = -ENOMEM;
+				goto out;
+			}
+
+			stats->avg = debugfs_create_file("average", 0400,
+							 stats->root, stats,
+							 &stats_fops);
+			if (!stats->avg) {
+				mlx5_core_warn(dev, "failed creating debugfs file\n");
+				err = -ENOMEM;
+				goto out;
+			}
+
+			stats->count = debugfs_create_u64("n", 0400,
+							  stats->root,
+							  &stats->n);
+			if (!stats->count) {
+				mlx5_core_warn(dev, "failed creating debugfs file\n");
+				err = -ENOMEM;
+				goto out;
+			}
+		}
+	}
+
+	return 0;
+out:
+	debugfs_remove_recursive(dev->priv.cmdif_debugfs);
+	return err;
+}
+
+void mlx5_cmdif_debugfs_cleanup(struct mlx5_core_dev *dev)
+{
+	if (!mlx5_debugfs_root)
+		return;
+
+	debugfs_remove_recursive(dev->priv.cmdif_debugfs);
+}
+
+int mlx5_cq_debugfs_init(struct mlx5_core_dev *dev)
+{
+	if (!mlx5_debugfs_root)
+		return 0;
+
+	dev->priv.cq_debugfs = debugfs_create_dir("CQs",  dev->priv.dbg_root);
+	if (!dev->priv.cq_debugfs)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void mlx5_cq_debugfs_cleanup(struct mlx5_core_dev *dev)
+{
+	if (!mlx5_debugfs_root)
+		return;
+
+	debugfs_remove_recursive(dev->priv.cq_debugfs);
+}
+
+static u64 qp_read_field(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp,
+			 int index, int *is_str)
+{
+	struct mlx5_query_qp_mbox_out *out;
+	struct mlx5_qp_context *ctx;
+	u64 param = 0;
+	int err;
+	int no_sq;
+
+	out = kzalloc(sizeof(*out), GFP_KERNEL);
+	if (!out)
+		return param;
+
+	err = mlx5_core_qp_query(dev, qp, out, sizeof(*out));
+	if (err) {
+		mlx5_core_warn(dev, "failed to query qp\n");
+		goto out;
+	}
+
+	*is_str = 0;
+	ctx = &out->ctx;
+	switch (index) {
+	case QP_PID:
+		param = qp->pid;
+		break;
+	case QP_STATE:
+		param = (u64)mlx5_qp_state_str(be32_to_cpu(ctx->flags) >> 28);
+		*is_str = 1;
+		break;
+	case QP_XPORT:
+		param = (u64)mlx5_qp_type_str((be32_to_cpu(ctx->flags) >> 16) & 0xff);
+		*is_str = 1;
+		break;
+	case QP_MTU:
+		switch (ctx->mtu_msgmax >> 5) {
+		case IB_MTU_256:
+			param = 256;
+			break;
+		case IB_MTU_512:
+			param = 512;
+			break;
+		case IB_MTU_1024:
+			param = 1024;
+			break;
+		case IB_MTU_2048:
+			param = 2048;
+			break;
+		case IB_MTU_4096:
+			param = 4096;
+			break;
+		default:
+			param = 0;
+		}
+		break;
+	case QP_N_RECV:
+		param = 1 << ((ctx->rq_size_stride >> 3) & 0xf);
+		break;
+	case QP_RECV_SZ:
+		param = 1 << ((ctx->rq_size_stride & 7) + 4);
+		break;
+	case QP_N_SEND:
+		no_sq = be16_to_cpu(ctx->sq_crq_size) >> 15;
+		if (!no_sq)
+			param = 1 << (be16_to_cpu(ctx->sq_crq_size) >> 11);
+		else
+			param = 0;
+		break;
+	case QP_LOG_PG_SZ:
+		param = (be32_to_cpu(ctx->log_pg_sz_remote_qpn) >> 24) & 0x1f;
+		param += 12;
+		break;
+	case QP_RQPN:
+		param = be32_to_cpu(ctx->log_pg_sz_remote_qpn) & 0xffffff;
+		break;
+	}
+
+out:
+	kfree(out);
+	return param;
+}
+
+static u64 eq_read_field(struct mlx5_core_dev *dev, struct mlx5_eq *eq,
+			 int index)
+{
+	struct mlx5_query_eq_mbox_out *out;
+	struct mlx5_eq_context *ctx;
+	u64 param = 0;
+	int err;
+
+	out = kzalloc(sizeof(*out), GFP_KERNEL);
+	if (!out)
+		return param;
+
+	ctx = &out->ctx;
+
+	err = mlx5_core_eq_query(dev, eq, out, sizeof(*out));
+	if (err) {
+		mlx5_core_warn(dev, "failed to query eq\n");
+		goto out;
+	}
+
+	switch (index) {
+	case EQ_NUM_EQES:
+		param = 1 << ((be32_to_cpu(ctx->log_sz_usr_page) >> 24) & 0x1f);
+		break;
+	case EQ_INTR:
+		param = ctx->intr;
+		break;
+	case EQ_LOG_PG_SZ:
+		param = (ctx->log_page_size & 0x1f) + 12;
+		break;
+	}
+
+out:
+	kfree(out);
+	return param;
+}
+
+static u64 cq_read_field(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq,
+			 int index)
+{
+	struct mlx5_query_cq_mbox_out *out;
+	struct mlx5_cq_context *ctx;
+	u64 param = 0;
+	int err;
+
+	out = kzalloc(sizeof(*out), GFP_KERNEL);
+	if (!out)
+		return param;
+
+	ctx = &out->ctx;
+
+	err = mlx5_core_query_cq(dev, cq, out);
+	if (err) {
+		mlx5_core_warn(dev, "failed to query cq\n");
+		goto out;
+	}
+
+	switch (index) {
+	case CQ_PID:
+		param = cq->pid;
+		break;
+	case CQ_NUM_CQES:
+		param = 1 << ((be32_to_cpu(ctx->log_sz_usr_page) >> 24) & 0x1f);
+		break;
+	case CQ_LOG_PG_SZ:
+		param = (ctx->log_pg_sz & 0x1f) + 12;
+		break;
+	}
+
+out:
+	kfree(out);
+	return param;
+}
+
+static ssize_t dbg_read(struct file *filp, char __user *buf, size_t count,
+			loff_t *pos)
+{
+	struct mlx5_field_desc *desc;
+	struct mlx5_rsc_debug *d;
+	char tbuf[18];
+	int is_str = 0;
+	u64 field;
+	int ret;
+
+	if (*pos)
+		return 0;
+
+	desc = filp->private_data;
+	d = (void *)(desc - desc->i) - sizeof(*d);
+	switch (d->type) {
+	case MLX5_DBG_RSC_QP:
+		field = qp_read_field(d->dev, d->object, desc->i, &is_str);
+		break;
+
+	case MLX5_DBG_RSC_EQ:
+		field = eq_read_field(d->dev, d->object, desc->i);
+		break;
+
+	case MLX5_DBG_RSC_CQ:
+		field = cq_read_field(d->dev, d->object, desc->i);
+		break;
+
+	default:
+		mlx5_core_warn(d->dev, "invalid resource type %d\n", d->type);
+		return -EINVAL;
+	}
+
+
+	if (is_str)
+		ret = snprintf(tbuf, sizeof(tbuf), "%s\n", (const char *)field);
+	else
+		ret = snprintf(tbuf, sizeof(tbuf), "0x%llx\n", field);
+
+	if (ret > 0) {
+		if (copy_to_user(buf, tbuf, ret))
+			return -EFAULT;
+	}
+
+	*pos += ret;
+	return ret;
+}
+
+static const struct file_operations fops = {
+	.owner	= THIS_MODULE,
+	.open	= simple_open,
+	.read	= dbg_read,
+};
+
+static int add_res_tree(struct mlx5_core_dev *dev, enum dbg_rsc_type type,
+			struct dentry *root, struct mlx5_rsc_debug **dbg,
+			int rsn, char **field, int nfile, void *data)
+{
+	struct mlx5_rsc_debug *d;
+	char resn[32];
+	int err;
+	int i;
+
+	d = kzalloc(sizeof(*d) + nfile * sizeof(d->fields[0]), GFP_KERNEL);
+	if (!d)
+		return -ENOMEM;
+
+	d->dev = dev;
+	d->object = data;
+	d->type = type;
+	sprintf(resn, "0x%x", rsn);
+	d->root = debugfs_create_dir(resn,  root);
+	if (!d->root) {
+		err = -ENOMEM;
+		goto out_free;
+	}
+
+	for (i = 0; i < nfile; i++) {
+		d->fields[i].i = i;
+		d->fields[i].dent = debugfs_create_file(field[i], 0400,
+							d->root, &d->fields[i],
+							&fops);
+		if (!d->fields[i].dent) {
+			err = -ENOMEM;
+			goto out_rem;
+		}
+	}
+	*dbg = d;
+
+	return 0;
+out_rem:
+	debugfs_remove_recursive(d->root);
+
+out_free:
+	kfree(d);
+	return err;
+}
+
+static void rem_res_tree(struct mlx5_rsc_debug *d)
+{
+	debugfs_remove_recursive(d->root);
+	kfree(d);
+}
+
+int mlx5_debug_qp_add(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp)
+{
+	int err;
+
+	if (!mlx5_debugfs_root)
+		return 0;
+
+	err = add_res_tree(dev, MLX5_DBG_RSC_QP, dev->priv.qp_debugfs,
+			   &qp->dbg, qp->qpn, qp_fields,
+			   ARRAY_SIZE(qp_fields), qp);
+	if (err)
+		qp->dbg = NULL;
+
+	return err;
+}
+
+void mlx5_debug_qp_remove(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp)
+{
+	if (!mlx5_debugfs_root)
+		return;
+
+	if (qp->dbg)
+		rem_res_tree(qp->dbg);
+}
+
+
+int mlx5_debug_eq_add(struct mlx5_core_dev *dev, struct mlx5_eq *eq)
+{
+	int err;
+
+	if (!mlx5_debugfs_root)
+		return 0;
+
+	err = add_res_tree(dev, MLX5_DBG_RSC_EQ, dev->priv.eq_debugfs,
+			   &eq->dbg, eq->eqn, eq_fields,
+			   ARRAY_SIZE(eq_fields), eq);
+	if (err)
+		eq->dbg = NULL;
+
+	return err;
+}
+
+void mlx5_debug_eq_remove(struct mlx5_core_dev *dev, struct mlx5_eq *eq)
+{
+	if (!mlx5_debugfs_root)
+		return;
+
+	if (eq->dbg)
+		rem_res_tree(eq->dbg);
+}
+
+int mlx5_debug_cq_add(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq)
+{
+	int err;
+
+	if (!mlx5_debugfs_root)
+		return 0;
+
+	err = add_res_tree(dev, MLX5_DBG_RSC_CQ, dev->priv.cq_debugfs,
+			   &cq->dbg, cq->cqn, cq_fields,
+			   ARRAY_SIZE(cq_fields), cq);
+	if (err)
+		cq->dbg = NULL;
+
+	return err;
+}
+
+void mlx5_debug_cq_remove(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq)
+{
+	if (!mlx5_debugfs_root)
+		return;
+
+	if (cq->dbg)
+		rem_res_tree(cq->dbg);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
new file mode 100644
index 0000000..ad2c96a
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
@@ -0,0 +1,528 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/cmd.h>
+#include "mlx5_core.h"
+
+enum {
+	MLX5_EQE_SIZE		= sizeof(struct mlx5_eqe),
+	MLX5_EQE_OWNER_INIT_VAL	= 0x1,
+};
+
+enum {
+	MLX5_EQ_STATE_ARMED		= 0x9,
+	MLX5_EQ_STATE_FIRED		= 0xa,
+	MLX5_EQ_STATE_ALWAYS_ARMED	= 0xb,
+};
+
+enum {
+	MLX5_NUM_SPARE_EQE	= 0x80,
+	MLX5_NUM_ASYNC_EQE	= 0x100,
+	MLX5_NUM_CMD_EQE	= 32,
+};
+
+enum {
+	MLX5_EQ_DOORBEL_OFFSET	= 0x40,
+};
+
+#define MLX5_ASYNC_EVENT_MASK ((1ull << MLX5_EVENT_TYPE_PATH_MIG)	    | \
+			       (1ull << MLX5_EVENT_TYPE_COMM_EST)	    | \
+			       (1ull << MLX5_EVENT_TYPE_SQ_DRAINED)	    | \
+			       (1ull << MLX5_EVENT_TYPE_CQ_ERROR)	    | \
+			       (1ull << MLX5_EVENT_TYPE_WQ_CATAS_ERROR)	    | \
+			       (1ull << MLX5_EVENT_TYPE_PATH_MIG_FAILED)    | \
+			       (1ull << MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR) | \
+			       (1ull << MLX5_EVENT_TYPE_WQ_ACCESS_ERROR)    | \
+			       (1ull << MLX5_EVENT_TYPE_PORT_CHANGE)	    | \
+			       (1ull << MLX5_EVENT_TYPE_SRQ_CATAS_ERROR)    | \
+			       (1ull << MLX5_EVENT_TYPE_SRQ_LAST_WQE)	    | \
+			       (1ull << MLX5_EVENT_TYPE_SRQ_RQ_LIMIT))
+
+struct map_eq_in {
+	u64	mask;
+	u32	reserved;
+	u32	unmap_eqn;
+};
+
+struct cre_des_eq {
+	u8	reserved[15];
+	u8	eqn;
+};
+
+static int mlx5_cmd_destroy_eq(struct mlx5_core_dev *dev, u8 eqn)
+{
+	struct mlx5_destroy_eq_mbox_in in;
+	struct mlx5_destroy_eq_mbox_out out;
+	int err;
+
+	memset(&in, 0, sizeof(in));
+	memset(&out, 0, sizeof(out));
+	in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_DESTROY_EQ);
+	in.eqn = eqn;
+	err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out));
+	if (!err)
+		goto ex;
+
+	if (out.hdr.status)
+		err = mlx5_cmd_status_to_err(&out.hdr);
+
+ex:
+	return err;
+}
+
+static struct mlx5_eqe *get_eqe(struct mlx5_eq *eq, u32 entry)
+{
+	return mlx5_buf_offset(&eq->buf, entry * MLX5_EQE_SIZE);
+}
+
+static struct mlx5_eqe *next_eqe_sw(struct mlx5_eq *eq)
+{
+	struct mlx5_eqe *eqe = get_eqe(eq, eq->cons_index & (eq->nent - 1));
+
+	return ((eqe->owner & 1) ^ !!(eq->cons_index & eq->nent)) ? NULL : eqe;
+}
+
+static const char *eqe_type_str(u8 type)
+{
+	switch (type) {
+	case MLX5_EVENT_TYPE_COMP:
+		return "MLX5_EVENT_TYPE_COMP";
+	case MLX5_EVENT_TYPE_PATH_MIG:
+		return "MLX5_EVENT_TYPE_PATH_MIG";
+	case MLX5_EVENT_TYPE_COMM_EST:
+		return "MLX5_EVENT_TYPE_COMM_EST";
+	case MLX5_EVENT_TYPE_SQ_DRAINED:
+		return "MLX5_EVENT_TYPE_SQ_DRAINED";
+	case MLX5_EVENT_TYPE_SRQ_LAST_WQE:
+		return "MLX5_EVENT_TYPE_SRQ_LAST_WQE";
+	case MLX5_EVENT_TYPE_SRQ_RQ_LIMIT:
+		return "MLX5_EVENT_TYPE_SRQ_RQ_LIMIT";
+	case MLX5_EVENT_TYPE_CQ_ERROR:
+		return "MLX5_EVENT_TYPE_CQ_ERROR";
+	case MLX5_EVENT_TYPE_WQ_CATAS_ERROR:
+		return "MLX5_EVENT_TYPE_WQ_CATAS_ERROR";
+	case MLX5_EVENT_TYPE_PATH_MIG_FAILED:
+		return "MLX5_EVENT_TYPE_PATH_MIG_FAILED";
+	case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
+		return "MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR";
+	case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR:
+		return "MLX5_EVENT_TYPE_WQ_ACCESS_ERROR";
+	case MLX5_EVENT_TYPE_SRQ_CATAS_ERROR:
+		return "MLX5_EVENT_TYPE_SRQ_CATAS_ERROR";
+	case MLX5_EVENT_TYPE_INTERNAL_ERROR:
+		return "MLX5_EVENT_TYPE_INTERNAL_ERROR";
+	case MLX5_EVENT_TYPE_PORT_CHANGE:
+		return "MLX5_EVENT_TYPE_PORT_CHANGE";
+	case MLX5_EVENT_TYPE_GPIO_EVENT:
+		return "MLX5_EVENT_TYPE_GPIO_EVENT";
+	case MLX5_EVENT_TYPE_REMOTE_CONFIG:
+		return "MLX5_EVENT_TYPE_REMOTE_CONFIG";
+	case MLX5_EVENT_TYPE_DB_BF_CONGESTION:
+		return "MLX5_EVENT_TYPE_DB_BF_CONGESTION";
+	case MLX5_EVENT_TYPE_STALL_EVENT:
+		return "MLX5_EVENT_TYPE_STALL_EVENT";
+	case MLX5_EVENT_TYPE_CMD:
+		return "MLX5_EVENT_TYPE_CMD";
+	case MLX5_EVENT_TYPE_PAGE_REQUEST:
+		return "MLX5_EVENT_TYPE_PAGE_REQUEST";
+	default:
+		return "Unrecognized event";
+	}
+}
+
+static enum mlx5_dev_event port_subtype_event(u8 subtype)
+{
+	switch (subtype) {
+	case MLX5_PORT_CHANGE_SUBTYPE_DOWN:
+		return MLX5_DEV_EVENT_PORT_DOWN;
+	case MLX5_PORT_CHANGE_SUBTYPE_ACTIVE:
+		return MLX5_DEV_EVENT_PORT_UP;
+	case MLX5_PORT_CHANGE_SUBTYPE_INITIALIZED:
+		return MLX5_DEV_EVENT_PORT_INITIALIZED;
+	case MLX5_PORT_CHANGE_SUBTYPE_LID:
+		return MLX5_DEV_EVENT_LID_CHANGE;
+	case MLX5_PORT_CHANGE_SUBTYPE_PKEY:
+		return MLX5_DEV_EVENT_PKEY_CHANGE;
+	case MLX5_PORT_CHANGE_SUBTYPE_GUID:
+		return MLX5_DEV_EVENT_GUID_CHANGE;
+	case MLX5_PORT_CHANGE_SUBTYPE_CLIENT_REREG:
+		return MLX5_DEV_EVENT_CLIENT_REREG;
+	}
+	return -1;
+}
+
+static void eq_update_ci(struct mlx5_eq *eq, int arm)
+{
+	__be32 __iomem *addr = eq->doorbell + (arm ? 0 : 2);
+	u32 val = (eq->cons_index & 0xffffff) | (eq->eqn << 24);
+	__raw_writel((__force u32) cpu_to_be32(val), addr);
+	/* We still want ordering, just not swabbing, so add a barrier */
+	mb();
+}
+
+static int mlx5_eq_int(struct mlx5_core_dev *dev, struct mlx5_eq *eq)
+{
+	struct mlx5_eqe *eqe;
+	int eqes_found = 0;
+	int set_ci = 0;
+	u32 cqn;
+	u32 rsn;
+	u8 port;
+
+	while ((eqe = next_eqe_sw(eq))) {
+		/*
+		 * Make sure we read EQ entry contents after we've
+		 * checked the ownership bit.
+		 */
+		rmb();
+
+		mlx5_core_dbg(eq->dev, "eqn %d, eqe type %s\n",
+			      eq->eqn, eqe_type_str(eqe->type));
+		switch (eqe->type) {
+		case MLX5_EVENT_TYPE_COMP:
+			cqn = be32_to_cpu(eqe->data.comp.cqn) & 0xffffff;
+			mlx5_cq_completion(dev, cqn);
+			break;
+
+		case MLX5_EVENT_TYPE_PATH_MIG:
+		case MLX5_EVENT_TYPE_COMM_EST:
+		case MLX5_EVENT_TYPE_SQ_DRAINED:
+		case MLX5_EVENT_TYPE_SRQ_LAST_WQE:
+		case MLX5_EVENT_TYPE_WQ_CATAS_ERROR:
+		case MLX5_EVENT_TYPE_PATH_MIG_FAILED:
+		case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
+		case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR:
+			rsn = be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff;
+			mlx5_core_dbg(dev, "event %s(%d) arrived\n",
+				      eqe_type_str(eqe->type), eqe->type);
+			mlx5_rsc_event(dev, rsn, eqe->type);
+			break;
+
+		case MLX5_EVENT_TYPE_SRQ_RQ_LIMIT:
+		case MLX5_EVENT_TYPE_SRQ_CATAS_ERROR:
+			rsn = be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff;
+			mlx5_core_dbg(dev, "SRQ event %s(%d): srqn 0x%x\n",
+				      eqe_type_str(eqe->type), eqe->type, rsn);
+			mlx5_srq_event(dev, rsn, eqe->type);
+			break;
+
+		case MLX5_EVENT_TYPE_CMD:
+			mlx5_cmd_comp_handler(dev, be32_to_cpu(eqe->data.cmd.vector));
+			break;
+
+		case MLX5_EVENT_TYPE_PORT_CHANGE:
+			port = (eqe->data.port.port >> 4) & 0xf;
+			switch (eqe->sub_type) {
+			case MLX5_PORT_CHANGE_SUBTYPE_DOWN:
+			case MLX5_PORT_CHANGE_SUBTYPE_ACTIVE:
+			case MLX5_PORT_CHANGE_SUBTYPE_LID:
+			case MLX5_PORT_CHANGE_SUBTYPE_PKEY:
+			case MLX5_PORT_CHANGE_SUBTYPE_GUID:
+			case MLX5_PORT_CHANGE_SUBTYPE_CLIENT_REREG:
+			case MLX5_PORT_CHANGE_SUBTYPE_INITIALIZED:
+				if (dev->event)
+					dev->event(dev, port_subtype_event(eqe->sub_type),
+						   (unsigned long)port);
+				break;
+			default:
+				mlx5_core_warn(dev, "Port event with unrecognized subtype: port %d, sub_type %d\n",
+					       port, eqe->sub_type);
+			}
+			break;
+		case MLX5_EVENT_TYPE_CQ_ERROR:
+			cqn = be32_to_cpu(eqe->data.cq_err.cqn) & 0xffffff;
+			mlx5_core_warn(dev, "CQ error on CQN 0x%x, syndrom 0x%x\n",
+				       cqn, eqe->data.cq_err.syndrome);
+			mlx5_cq_event(dev, cqn, eqe->type);
+			break;
+
+		case MLX5_EVENT_TYPE_PAGE_REQUEST:
+			{
+				u16 func_id = be16_to_cpu(eqe->data.req_pages.func_id);
+				s32 npages = be32_to_cpu(eqe->data.req_pages.num_pages);
+
+				mlx5_core_dbg(dev, "page request for func 0x%x, npages %d\n",
+					      func_id, npages);
+				mlx5_core_req_pages_handler(dev, func_id, npages);
+			}
+			break;
+
+
+		default:
+			mlx5_core_warn(dev, "Unhandled event 0x%x on EQ 0x%x\n",
+				       eqe->type, eq->eqn);
+			break;
+		}
+
+		++eq->cons_index;
+		eqes_found = 1;
+		++set_ci;
+
+		/* The HCA will think the queue has overflowed if we
+		 * don't tell it we've been processing events.  We
+		 * create our EQs with MLX5_NUM_SPARE_EQE extra
+		 * entries, so we must update our consumer index at
+		 * least that often.
+		 */
+		if (unlikely(set_ci >= MLX5_NUM_SPARE_EQE)) {
+			eq_update_ci(eq, 0);
+			set_ci = 0;
+		}
+	}
+
+	eq_update_ci(eq, 1);
+
+	return eqes_found;
+}
+
+static irqreturn_t mlx5_msix_handler(int irq, void *eq_ptr)
+{
+	struct mlx5_eq *eq = eq_ptr;
+	struct mlx5_core_dev *dev = eq->dev;
+
+	mlx5_eq_int(dev, eq);
+
+	/* MSI-X vectors always belong to us */
+	return IRQ_HANDLED;
+}
+
+static void init_eq_buf(struct mlx5_eq *eq)
+{
+	struct mlx5_eqe *eqe;
+	int i;
+
+	for (i = 0; i < eq->nent; i++) {
+		eqe = get_eqe(eq, i);
+		eqe->owner = MLX5_EQE_OWNER_INIT_VAL;
+	}
+}
+
+int mlx5_create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, u8 vecidx,
+		       int nent, u64 mask, const char *name, struct mlx5_uar *uar)
+{
+	struct mlx5_eq_table *table = &dev->priv.eq_table;
+	struct mlx5_create_eq_mbox_in *in;
+	struct mlx5_create_eq_mbox_out out;
+	int err;
+	int inlen;
+
+	eq->nent = roundup_pow_of_two(nent + MLX5_NUM_SPARE_EQE);
+	err = mlx5_buf_alloc(dev, eq->nent * MLX5_EQE_SIZE, 2 * PAGE_SIZE,
+			     &eq->buf);
+	if (err)
+		return err;
+
+	init_eq_buf(eq);
+
+	inlen = sizeof(*in) + sizeof(in->pas[0]) * eq->buf.npages;
+	in = mlx5_vzalloc(inlen);
+	if (!in) {
+		err = -ENOMEM;
+		goto err_buf;
+	}
+	memset(&out, 0, sizeof(out));
+
+	mlx5_fill_page_array(&eq->buf, in->pas);
+
+	in->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_CREATE_EQ);
+	in->ctx.log_sz_usr_page = cpu_to_be32(ilog2(eq->nent) << 24 | uar->index);
+	in->ctx.intr = vecidx;
+	in->ctx.log_page_size = eq->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT;
+	in->events_mask = cpu_to_be64(mask);
+
+	err = mlx5_cmd_exec(dev, in, inlen, &out, sizeof(out));
+	if (err)
+		goto err_in;
+
+	if (out.hdr.status) {
+		err = mlx5_cmd_status_to_err(&out.hdr);
+		goto err_in;
+	}
+
+	snprintf(eq->name, MLX5_MAX_EQ_NAME, "%s@pci:%s",
+		 name, pci_name(dev->pdev));
+	eq->eqn = out.eq_number;
+	eq->irqn = vecidx;
+	eq->dev = dev;
+	eq->doorbell = uar->map + MLX5_EQ_DOORBEL_OFFSET;
+	err = request_irq(table->msix_arr[vecidx].vector, mlx5_msix_handler, 0,
+			  eq->name, eq);
+	if (err)
+		goto err_eq;
+
+	err = mlx5_debug_eq_add(dev, eq);
+	if (err)
+		goto err_irq;
+
+	/* EQs are created in ARMED state
+	 */
+	eq_update_ci(eq, 1);
+
+	mlx5_vfree(in);
+	return 0;
+
+err_irq:
+	free_irq(table->msix_arr[vecidx].vector, eq);
+
+err_eq:
+	mlx5_cmd_destroy_eq(dev, eq->eqn);
+
+err_in:
+	mlx5_vfree(in);
+
+err_buf:
+	mlx5_buf_free(dev, &eq->buf);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_create_map_eq);
+
+int mlx5_destroy_unmap_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq)
+{
+	struct mlx5_eq_table *table = &dev->priv.eq_table;
+	int err;
+
+	mlx5_debug_eq_remove(dev, eq);
+	free_irq(table->msix_arr[eq->irqn].vector, eq);
+	err = mlx5_cmd_destroy_eq(dev, eq->eqn);
+	if (err)
+		mlx5_core_warn(dev, "failed to destroy a previously created eq: eqn %d\n",
+			       eq->eqn);
+	synchronize_irq(table->msix_arr[eq->irqn].vector);
+	mlx5_buf_free(dev, &eq->buf);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_destroy_unmap_eq);
+
+int mlx5_eq_init(struct mlx5_core_dev *dev)
+{
+	int err;
+
+	spin_lock_init(&dev->priv.eq_table.lock);
+
+	err = mlx5_eq_debugfs_init(dev);
+
+	return err;
+}
+
+
+void mlx5_eq_cleanup(struct mlx5_core_dev *dev)
+{
+	mlx5_eq_debugfs_cleanup(dev);
+}
+
+int mlx5_start_eqs(struct mlx5_core_dev *dev)
+{
+	struct mlx5_eq_table *table = &dev->priv.eq_table;
+	int err;
+
+	err = mlx5_create_map_eq(dev, &table->cmd_eq, MLX5_EQ_VEC_CMD,
+				 MLX5_NUM_CMD_EQE, 1ull << MLX5_EVENT_TYPE_CMD,
+				 "mlx5_cmd_eq", &dev->priv.uuari.uars[0]);
+	if (err) {
+		mlx5_core_warn(dev, "failed to create cmd EQ %d\n", err);
+		return err;
+	}
+
+	mlx5_cmd_use_events(dev);
+
+	err = mlx5_create_map_eq(dev, &table->async_eq, MLX5_EQ_VEC_ASYNC,
+				 MLX5_NUM_ASYNC_EQE, MLX5_ASYNC_EVENT_MASK,
+				 "mlx5_async_eq", &dev->priv.uuari.uars[0]);
+	if (err) {
+		mlx5_core_warn(dev, "failed to create async EQ %d\n", err);
+		goto err1;
+	}
+
+	err = mlx5_create_map_eq(dev, &table->pages_eq,
+				 MLX5_EQ_VEC_PAGES,
+				 dev->caps.gen.max_vf + 1,
+				 1 << MLX5_EVENT_TYPE_PAGE_REQUEST, "mlx5_pages_eq",
+				 &dev->priv.uuari.uars[0]);
+	if (err) {
+		mlx5_core_warn(dev, "failed to create pages EQ %d\n", err);
+		goto err2;
+	}
+
+	return err;
+
+err2:
+	mlx5_destroy_unmap_eq(dev, &table->async_eq);
+
+err1:
+	mlx5_cmd_use_polling(dev);
+	mlx5_destroy_unmap_eq(dev, &table->cmd_eq);
+	return err;
+}
+
+int mlx5_stop_eqs(struct mlx5_core_dev *dev)
+{
+	struct mlx5_eq_table *table = &dev->priv.eq_table;
+	int err;
+
+	err = mlx5_destroy_unmap_eq(dev, &table->pages_eq);
+	if (err)
+		return err;
+
+	mlx5_destroy_unmap_eq(dev, &table->async_eq);
+	mlx5_cmd_use_polling(dev);
+
+	err = mlx5_destroy_unmap_eq(dev, &table->cmd_eq);
+	if (err)
+		mlx5_cmd_use_events(dev);
+
+	return err;
+}
+
+int mlx5_core_eq_query(struct mlx5_core_dev *dev, struct mlx5_eq *eq,
+		       struct mlx5_query_eq_mbox_out *out, int outlen)
+{
+	struct mlx5_query_eq_mbox_in in;
+	int err;
+
+	memset(&in, 0, sizeof(in));
+	memset(out, 0, outlen);
+	in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_QUERY_EQ);
+	in.eqn = eq->eqn;
+	err = mlx5_cmd_exec(dev, &in, sizeof(in), out, outlen);
+	if (err)
+		return err;
+
+	if (out->hdr.status)
+		err = mlx5_cmd_status_to_err(&out->hdr);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_core_eq_query);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw.c b/drivers/net/ethernet/mellanox/mlx5/core/fw.c
new file mode 100644
index 0000000..087c4c7
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fw.c
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/cmd.h>
+#include <linux/module.h>
+#include "mlx5_core.h"
+
+int mlx5_cmd_query_adapter(struct mlx5_core_dev *dev)
+{
+	struct mlx5_cmd_query_adapter_mbox_out *out;
+	struct mlx5_cmd_query_adapter_mbox_in in;
+	int err;
+
+	out = kzalloc(sizeof(*out), GFP_KERNEL);
+	if (!out)
+		return -ENOMEM;
+
+	memset(&in, 0, sizeof(in));
+	in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_QUERY_ADAPTER);
+	err = mlx5_cmd_exec(dev, &in, sizeof(in), out, sizeof(*out));
+	if (err)
+		goto out_out;
+
+	if (out->hdr.status) {
+		err = mlx5_cmd_status_to_err(&out->hdr);
+		goto out_out;
+	}
+
+	memcpy(dev->board_id, out->vsd_psid, sizeof(out->vsd_psid));
+
+out_out:
+	kfree(out);
+
+	return err;
+}
+
+int mlx5_cmd_query_hca_cap(struct mlx5_core_dev *dev, struct mlx5_caps *caps)
+{
+	return mlx5_core_get_caps(dev, caps, HCA_CAP_OPMOD_GET_CUR);
+}
+
+int mlx5_cmd_init_hca(struct mlx5_core_dev *dev)
+{
+	struct mlx5_cmd_init_hca_mbox_in in;
+	struct mlx5_cmd_init_hca_mbox_out out;
+	int err;
+
+	memset(&in, 0, sizeof(in));
+	memset(&out, 0, sizeof(out));
+	in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_INIT_HCA);
+	err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out));
+	if (err)
+		return err;
+
+	if (out.hdr.status)
+		err = mlx5_cmd_status_to_err(&out.hdr);
+
+	return err;
+}
+
+int mlx5_cmd_teardown_hca(struct mlx5_core_dev *dev)
+{
+	struct mlx5_cmd_teardown_hca_mbox_in in;
+	struct mlx5_cmd_teardown_hca_mbox_out out;
+	int err;
+
+	memset(&in, 0, sizeof(in));
+	memset(&out, 0, sizeof(out));
+	in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_TEARDOWN_HCA);
+	err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out));
+	if (err)
+		return err;
+
+	if (out.hdr.status)
+		err = mlx5_cmd_status_to_err(&out.hdr);
+
+	return err;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c
new file mode 100644
index 0000000..3e6670c
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/vmalloc.h>
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/cmd.h>
+#include "mlx5_core.h"
+
+enum {
+	MLX5_HEALTH_POLL_INTERVAL	= 2 * HZ,
+	MAX_MISSES			= 3,
+};
+
+enum {
+	MLX5_HEALTH_SYNDR_FW_ERR		= 0x1,
+	MLX5_HEALTH_SYNDR_IRISC_ERR		= 0x7,
+	MLX5_HEALTH_SYNDR_CRC_ERR		= 0x9,
+	MLX5_HEALTH_SYNDR_FETCH_PCI_ERR		= 0xa,
+	MLX5_HEALTH_SYNDR_HW_FTL_ERR		= 0xb,
+	MLX5_HEALTH_SYNDR_ASYNC_EQ_OVERRUN_ERR	= 0xc,
+	MLX5_HEALTH_SYNDR_EQ_ERR		= 0xd,
+	MLX5_HEALTH_SYNDR_FFSER_ERR		= 0xf,
+};
+
+static DEFINE_SPINLOCK(health_lock);
+static LIST_HEAD(health_list);
+static struct work_struct health_work;
+
+static void health_care(struct work_struct *work)
+{
+	struct mlx5_core_health *health, *n;
+	struct mlx5_core_dev *dev;
+	struct mlx5_priv *priv;
+	LIST_HEAD(tlist);
+
+	spin_lock_irq(&health_lock);
+	list_splice_init(&health_list, &tlist);
+
+	spin_unlock_irq(&health_lock);
+
+	list_for_each_entry_safe(health, n, &tlist, list) {
+		priv = container_of(health, struct mlx5_priv, health);
+		dev = container_of(priv, struct mlx5_core_dev, priv);
+		mlx5_core_warn(dev, "handling bad device here\n");
+		/* nothing yet */
+		spin_lock_irq(&health_lock);
+		list_del_init(&health->list);
+		spin_unlock_irq(&health_lock);
+	}
+}
+
+static const char *hsynd_str(u8 synd)
+{
+	switch (synd) {
+	case MLX5_HEALTH_SYNDR_FW_ERR:
+		return "firmware internal error";
+	case MLX5_HEALTH_SYNDR_IRISC_ERR:
+		return "irisc not responding";
+	case MLX5_HEALTH_SYNDR_CRC_ERR:
+		return "firmware CRC error";
+	case MLX5_HEALTH_SYNDR_FETCH_PCI_ERR:
+		return "ICM fetch PCI error";
+	case MLX5_HEALTH_SYNDR_HW_FTL_ERR:
+		return "HW fatal error\n";
+	case MLX5_HEALTH_SYNDR_ASYNC_EQ_OVERRUN_ERR:
+		return "async EQ buffer overrun";
+	case MLX5_HEALTH_SYNDR_EQ_ERR:
+		return "EQ error";
+	case MLX5_HEALTH_SYNDR_FFSER_ERR:
+		return "FFSER error";
+	default:
+		return "unrecognized error";
+	}
+}
+
+static u16 read_be16(__be16 __iomem *p)
+{
+	return swab16(readl((__force u16 __iomem *) p));
+}
+
+static u32 read_be32(__be32 __iomem *p)
+{
+	return swab32(readl((__force u32 __iomem *) p));
+}
+
+static void print_health_info(struct mlx5_core_dev *dev)
+{
+	struct mlx5_core_health *health = &dev->priv.health;
+	struct health_buffer __iomem *h = health->health;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(h->assert_var); i++)
+		pr_info("assert_var[%d] 0x%08x\n", i, read_be32(h->assert_var + i));
+
+	pr_info("assert_exit_ptr 0x%08x\n", read_be32(&h->assert_exit_ptr));
+	pr_info("assert_callra 0x%08x\n", read_be32(&h->assert_callra));
+	pr_info("fw_ver 0x%08x\n", read_be32(&h->fw_ver));
+	pr_info("hw_id 0x%08x\n", read_be32(&h->hw_id));
+	pr_info("irisc_index %d\n", readb(&h->irisc_index));
+	pr_info("synd 0x%x: %s\n", readb(&h->synd), hsynd_str(readb(&h->synd)));
+	pr_info("ext_sync 0x%04x\n", read_be16(&h->ext_sync));
+}
+
+static void poll_health(unsigned long data)
+{
+	struct mlx5_core_dev *dev = (struct mlx5_core_dev *)data;
+	struct mlx5_core_health *health = &dev->priv.health;
+	unsigned long next;
+	u32 count;
+
+	count = ioread32be(health->health_counter);
+	if (count == health->prev)
+		++health->miss_counter;
+	else
+		health->miss_counter = 0;
+
+	health->prev = count;
+	if (health->miss_counter == MAX_MISSES) {
+		mlx5_core_err(dev, "device's health compromised\n");
+		print_health_info(dev);
+		spin_lock_irq(&health_lock);
+		list_add_tail(&health->list, &health_list);
+		spin_unlock_irq(&health_lock);
+
+		queue_work(mlx5_core_wq, &health_work);
+	} else {
+		get_random_bytes(&next, sizeof(next));
+		next %= HZ;
+		next += jiffies + MLX5_HEALTH_POLL_INTERVAL;
+		mod_timer(&health->timer, next);
+	}
+}
+
+void mlx5_start_health_poll(struct mlx5_core_dev *dev)
+{
+	struct mlx5_core_health *health = &dev->priv.health;
+
+	INIT_LIST_HEAD(&health->list);
+	init_timer(&health->timer);
+	health->health = &dev->iseg->health;
+	health->health_counter = &dev->iseg->health_counter;
+
+	health->timer.data = (unsigned long)dev;
+	health->timer.function = poll_health;
+	health->timer.expires = round_jiffies(jiffies + MLX5_HEALTH_POLL_INTERVAL);
+	add_timer(&health->timer);
+}
+
+void mlx5_stop_health_poll(struct mlx5_core_dev *dev)
+{
+	struct mlx5_core_health *health = &dev->priv.health;
+
+	del_timer_sync(&health->timer);
+
+	spin_lock_irq(&health_lock);
+	if (!list_empty(&health->list))
+		list_del_init(&health->list);
+	spin_unlock_irq(&health_lock);
+}
+
+void mlx5_health_cleanup(void)
+{
+}
+
+void  __init mlx5_health_init(void)
+{
+	INIT_WORK(&health_work, health_care);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mad.c b/drivers/net/ethernet/mellanox/mlx5/core/mad.c
new file mode 100644
index 0000000..fd80ecf
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mad.c
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/cmd.h>
+#include "mlx5_core.h"
+
+int mlx5_core_mad_ifc(struct mlx5_core_dev *dev, void *inb, void *outb,
+		      u16 opmod, u8 port)
+{
+	struct mlx5_mad_ifc_mbox_in *in = NULL;
+	struct mlx5_mad_ifc_mbox_out *out = NULL;
+	int err;
+
+	in = kzalloc(sizeof(*in), GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	out = kzalloc(sizeof(*out), GFP_KERNEL);
+	if (!out) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	in->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_MAD_IFC);
+	in->hdr.opmod = cpu_to_be16(opmod);
+	in->port = port;
+
+	memcpy(in->data, inb, sizeof(in->data));
+
+	err = mlx5_cmd_exec(dev, in, sizeof(*in), out, sizeof(*out));
+	if (err)
+		goto out;
+
+	if (out->hdr.status) {
+		err = mlx5_cmd_status_to_err(&out->hdr);
+		goto out;
+	}
+
+	memcpy(outb, out->data, sizeof(out->data));
+
+out:
+	kfree(out);
+	kfree(in);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_core_mad_ifc);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
new file mode 100644
index 0000000..71b10b2
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -0,0 +1,948 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <asm-generic/kmap_types.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <linux/slab.h>
+#include <linux/io-mapping.h>
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/cq.h>
+#include <linux/mlx5/qp.h>
+#include <linux/mlx5/srq.h>
+#include <linux/debugfs.h>
+#include <linux/mlx5/mlx5_ifc.h>
+#include "mlx5_core.h"
+
+#define DRIVER_NAME "mlx5_core"
+#define DRIVER_VERSION "2.2-1"
+#define DRIVER_RELDATE	"Feb 2014"
+
+MODULE_AUTHOR("Eli Cohen <eli@mellanox.com>");
+MODULE_DESCRIPTION("Mellanox ConnectX-IB HCA core library");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_VERSION(DRIVER_VERSION);
+
+int mlx5_core_debug_mask;
+module_param_named(debug_mask, mlx5_core_debug_mask, int, 0644);
+MODULE_PARM_DESC(debug_mask, "debug mask: 1 = dump cmd data, 2 = dump cmd exec time, 3 = both. Default=0");
+
+#define MLX5_DEFAULT_PROF	2
+static int prof_sel = MLX5_DEFAULT_PROF;
+module_param_named(prof_sel, prof_sel, int, 0444);
+MODULE_PARM_DESC(prof_sel, "profile selector. Valid range 0 - 2");
+
+struct workqueue_struct *mlx5_core_wq;
+static LIST_HEAD(intf_list);
+static LIST_HEAD(dev_list);
+static DEFINE_MUTEX(intf_mutex);
+
+struct mlx5_device_context {
+	struct list_head	list;
+	struct mlx5_interface  *intf;
+	void		       *context;
+};
+
+static struct mlx5_profile profile[] = {
+	[0] = {
+		.mask           = 0,
+	},
+	[1] = {
+		.mask		= MLX5_PROF_MASK_QP_SIZE,
+		.log_max_qp	= 12,
+	},
+	[2] = {
+		.mask		= MLX5_PROF_MASK_QP_SIZE |
+				  MLX5_PROF_MASK_MR_CACHE,
+		.log_max_qp	= 17,
+		.mr_cache[0]	= {
+			.size	= 500,
+			.limit	= 250
+		},
+		.mr_cache[1]	= {
+			.size	= 500,
+			.limit	= 250
+		},
+		.mr_cache[2]	= {
+			.size	= 500,
+			.limit	= 250
+		},
+		.mr_cache[3]	= {
+			.size	= 500,
+			.limit	= 250
+		},
+		.mr_cache[4]	= {
+			.size	= 500,
+			.limit	= 250
+		},
+		.mr_cache[5]	= {
+			.size	= 500,
+			.limit	= 250
+		},
+		.mr_cache[6]	= {
+			.size	= 500,
+			.limit	= 250
+		},
+		.mr_cache[7]	= {
+			.size	= 500,
+			.limit	= 250
+		},
+		.mr_cache[8]	= {
+			.size	= 500,
+			.limit	= 250
+		},
+		.mr_cache[9]	= {
+			.size	= 500,
+			.limit	= 250
+		},
+		.mr_cache[10]	= {
+			.size	= 500,
+			.limit	= 250
+		},
+		.mr_cache[11]	= {
+			.size	= 500,
+			.limit	= 250
+		},
+		.mr_cache[12]	= {
+			.size	= 64,
+			.limit	= 32
+		},
+		.mr_cache[13]	= {
+			.size	= 32,
+			.limit	= 16
+		},
+		.mr_cache[14]	= {
+			.size	= 16,
+			.limit	= 8
+		},
+		.mr_cache[15]	= {
+			.size	= 8,
+			.limit	= 4
+		},
+	},
+};
+
+static int set_dma_caps(struct pci_dev *pdev)
+{
+	int err;
+
+	err = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
+	if (err) {
+		dev_warn(&pdev->dev, "Warning: couldn't set 64-bit PCI DMA mask\n");
+		err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
+		if (err) {
+			dev_err(&pdev->dev, "Can't set PCI DMA mask, aborting\n");
+			return err;
+		}
+	}
+
+	err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
+	if (err) {
+		dev_warn(&pdev->dev,
+			 "Warning: couldn't set 64-bit consistent PCI DMA mask\n");
+		err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32));
+		if (err) {
+			dev_err(&pdev->dev,
+				"Can't set consistent PCI DMA mask, aborting\n");
+			return err;
+		}
+	}
+
+	dma_set_max_seg_size(&pdev->dev, 2u * 1024 * 1024 * 1024);
+	return err;
+}
+
+static int request_bar(struct pci_dev *pdev)
+{
+	int err = 0;
+
+	if (!(pci_resource_flags(pdev, 0) & IORESOURCE_MEM)) {
+		dev_err(&pdev->dev, "Missing registers BAR, aborting\n");
+		return -ENODEV;
+	}
+
+	err = pci_request_regions(pdev, DRIVER_NAME);
+	if (err)
+		dev_err(&pdev->dev, "Couldn't get PCI resources, aborting\n");
+
+	return err;
+}
+
+static void release_bar(struct pci_dev *pdev)
+{
+	pci_release_regions(pdev);
+}
+
+static int mlx5_enable_msix(struct mlx5_core_dev *dev)
+{
+	struct mlx5_eq_table *table = &dev->priv.eq_table;
+	int num_eqs = 1 << dev->caps.gen.log_max_eq;
+	int nvec;
+	int i;
+
+	nvec = dev->caps.gen.num_ports * num_online_cpus() + MLX5_EQ_VEC_COMP_BASE;
+	nvec = min_t(int, nvec, num_eqs);
+	if (nvec <= MLX5_EQ_VEC_COMP_BASE)
+		return -ENOMEM;
+
+	table->msix_arr = kzalloc(nvec * sizeof(*table->msix_arr), GFP_KERNEL);
+	if (!table->msix_arr)
+		return -ENOMEM;
+
+	for (i = 0; i < nvec; i++)
+		table->msix_arr[i].entry = i;
+
+	nvec = pci_enable_msix_range(dev->pdev, table->msix_arr,
+				     MLX5_EQ_VEC_COMP_BASE, nvec);
+	if (nvec < 0)
+		return nvec;
+
+	table->num_comp_vectors = nvec - MLX5_EQ_VEC_COMP_BASE;
+
+	return 0;
+}
+
+static void mlx5_disable_msix(struct mlx5_core_dev *dev)
+{
+	struct mlx5_eq_table *table = &dev->priv.eq_table;
+
+	pci_disable_msix(dev->pdev);
+	kfree(table->msix_arr);
+}
+
+struct mlx5_reg_host_endianess {
+	u8	he;
+	u8      rsvd[15];
+};
+
+
+#define CAP_MASK(pos, size) ((u64)((1 << (size)) - 1) << (pos))
+
+enum {
+	MLX5_CAP_BITS_RW_MASK = CAP_MASK(MLX5_CAP_OFF_CMDIF_CSUM, 2) |
+				MLX5_DEV_CAP_FLAG_DCT,
+};
+
+static u16 to_fw_pkey_sz(u32 size)
+{
+	switch (size) {
+	case 128:
+		return 0;
+	case 256:
+		return 1;
+	case 512:
+		return 2;
+	case 1024:
+		return 3;
+	case 2048:
+		return 4;
+	case 4096:
+		return 5;
+	default:
+		pr_warn("invalid pkey table size %d\n", size);
+		return 0;
+	}
+}
+
+/* selectively copy writable fields clearing any reserved area
+ */
+static void copy_rw_fields(void *to, struct mlx5_caps *from)
+{
+	__be64 *flags_off = (__be64 *)MLX5_ADDR_OF(cmd_hca_cap, to, reserved_22);
+	u64 v64;
+
+	MLX5_SET(cmd_hca_cap, to, log_max_qp, from->gen.log_max_qp);
+	MLX5_SET(cmd_hca_cap, to, log_max_ra_req_qp, from->gen.log_max_ra_req_qp);
+	MLX5_SET(cmd_hca_cap, to, log_max_ra_res_qp, from->gen.log_max_ra_res_qp);
+	MLX5_SET(cmd_hca_cap, to, pkey_table_size, from->gen.pkey_table_size);
+	MLX5_SET(cmd_hca_cap, to, log_max_ra_req_dc, from->gen.log_max_ra_req_dc);
+	MLX5_SET(cmd_hca_cap, to, log_max_ra_res_dc, from->gen.log_max_ra_res_dc);
+	MLX5_SET(cmd_hca_cap, to, pkey_table_size, to_fw_pkey_sz(from->gen.pkey_table_size));
+	v64 = from->gen.flags & MLX5_CAP_BITS_RW_MASK;
+	*flags_off = cpu_to_be64(v64);
+}
+
+static u16 get_pkey_table_size(int pkey)
+{
+	if (pkey > MLX5_MAX_LOG_PKEY_TABLE)
+		return 0;
+
+	return MLX5_MIN_PKEY_TABLE_SIZE << pkey;
+}
+
+static void fw2drv_caps(struct mlx5_caps *caps, void *out)
+{
+	struct mlx5_general_caps *gen = &caps->gen;
+
+	gen->max_srq_wqes = 1 << MLX5_GET_PR(cmd_hca_cap, out, log_max_srq_sz);
+	gen->max_wqes = 1 << MLX5_GET_PR(cmd_hca_cap, out, log_max_qp_sz);
+	gen->log_max_qp = MLX5_GET_PR(cmd_hca_cap, out, log_max_qp);
+	gen->log_max_strq = MLX5_GET_PR(cmd_hca_cap, out, log_max_strq_sz);
+	gen->log_max_srq = MLX5_GET_PR(cmd_hca_cap, out, log_max_srqs);
+	gen->max_cqes = 1 << MLX5_GET_PR(cmd_hca_cap, out, log_max_cq_sz);
+	gen->log_max_cq = MLX5_GET_PR(cmd_hca_cap, out, log_max_cq);
+	gen->max_eqes = 1 << MLX5_GET_PR(cmd_hca_cap, out, log_max_eq_sz);
+	gen->log_max_mkey = MLX5_GET_PR(cmd_hca_cap, out, log_max_mkey);
+	gen->log_max_eq = MLX5_GET_PR(cmd_hca_cap, out, log_max_eq);
+	gen->max_indirection = MLX5_GET_PR(cmd_hca_cap, out, max_indirection);
+	gen->log_max_mrw_sz = MLX5_GET_PR(cmd_hca_cap, out, log_max_mrw_sz);
+	gen->log_max_bsf_list_size = MLX5_GET_PR(cmd_hca_cap, out, log_max_bsf_list_size);
+	gen->log_max_klm_list_size = MLX5_GET_PR(cmd_hca_cap, out, log_max_klm_list_size);
+	gen->log_max_ra_req_dc = MLX5_GET_PR(cmd_hca_cap, out, log_max_ra_req_dc);
+	gen->log_max_ra_res_dc = MLX5_GET_PR(cmd_hca_cap, out, log_max_ra_res_dc);
+	gen->log_max_ra_req_qp = MLX5_GET_PR(cmd_hca_cap, out, log_max_ra_req_qp);
+	gen->log_max_ra_res_qp = MLX5_GET_PR(cmd_hca_cap, out, log_max_ra_res_qp);
+	gen->max_qp_counters = MLX5_GET_PR(cmd_hca_cap, out, max_qp_cnt);
+	gen->pkey_table_size = get_pkey_table_size(MLX5_GET_PR(cmd_hca_cap, out, pkey_table_size));
+	gen->local_ca_ack_delay = MLX5_GET_PR(cmd_hca_cap, out, local_ca_ack_delay);
+	gen->num_ports = MLX5_GET_PR(cmd_hca_cap, out, num_ports);
+	gen->log_max_msg = MLX5_GET_PR(cmd_hca_cap, out, log_max_msg);
+	gen->stat_rate_support = MLX5_GET_PR(cmd_hca_cap, out, stat_rate_support);
+	gen->flags = be64_to_cpu(*(__be64 *)MLX5_ADDR_OF(cmd_hca_cap, out, reserved_22));
+	pr_debug("flags = 0x%llx\n", gen->flags);
+	gen->uar_sz = MLX5_GET_PR(cmd_hca_cap, out, uar_sz);
+	gen->min_log_pg_sz = MLX5_GET_PR(cmd_hca_cap, out, log_pg_sz);
+	gen->bf_reg_size = MLX5_GET_PR(cmd_hca_cap, out, bf);
+	gen->bf_reg_size = 1 << MLX5_GET_PR(cmd_hca_cap, out, log_bf_reg_size);
+	gen->max_sq_desc_sz = MLX5_GET_PR(cmd_hca_cap, out, max_wqe_sz_sq);
+	gen->max_rq_desc_sz = MLX5_GET_PR(cmd_hca_cap, out, max_wqe_sz_rq);
+	gen->max_dc_sq_desc_sz = MLX5_GET_PR(cmd_hca_cap, out, max_wqe_sz_sq_dc);
+	gen->max_qp_mcg = MLX5_GET_PR(cmd_hca_cap, out, max_qp_mcg);
+	gen->log_max_pd = MLX5_GET_PR(cmd_hca_cap, out, log_max_pd);
+	gen->log_max_xrcd = MLX5_GET_PR(cmd_hca_cap, out, log_max_xrcd);
+	gen->log_uar_page_sz = MLX5_GET_PR(cmd_hca_cap, out, log_uar_page_sz);
+}
+
+static const char *caps_opmod_str(u16 opmod)
+{
+	switch (opmod) {
+	case HCA_CAP_OPMOD_GET_MAX:
+		return "GET_MAX";
+	case HCA_CAP_OPMOD_GET_CUR:
+		return "GET_CUR";
+	default:
+		return "Invalid";
+	}
+}
+
+int mlx5_core_get_caps(struct mlx5_core_dev *dev, struct mlx5_caps *caps,
+		       u16 opmod)
+{
+	u8 in[MLX5_ST_SZ_BYTES(query_hca_cap_in)];
+	int out_sz = MLX5_ST_SZ_BYTES(query_hca_cap_out);
+	void *out;
+	int err;
+
+	memset(in, 0, sizeof(in));
+	out = kzalloc(out_sz, GFP_KERNEL);
+	if (!out)
+		return -ENOMEM;
+	MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP);
+	MLX5_SET(query_hca_cap_in, in, op_mod, opmod);
+	err = mlx5_cmd_exec(dev, in, sizeof(in), out, out_sz);
+	if (err)
+		goto query_ex;
+
+	err = mlx5_cmd_status_to_err_v2(out);
+	if (err) {
+		mlx5_core_warn(dev, "query max hca cap failed, %d\n", err);
+		goto query_ex;
+	}
+	mlx5_core_dbg(dev, "%s\n", caps_opmod_str(opmod));
+	fw2drv_caps(caps, MLX5_ADDR_OF(query_hca_cap_out, out, capability_struct));
+
+query_ex:
+	kfree(out);
+	return err;
+}
+
+static int set_caps(struct mlx5_core_dev *dev, void *in, int in_sz)
+{
+	u32 out[MLX5_ST_SZ_DW(set_hca_cap_out)];
+	int err;
+
+	memset(out, 0, sizeof(out));
+
+	MLX5_SET(set_hca_cap_in, in, opcode, MLX5_CMD_OP_SET_HCA_CAP);
+	err = mlx5_cmd_exec(dev, in, in_sz, out, sizeof(out));
+	if (err)
+		return err;
+
+	err = mlx5_cmd_status_to_err_v2(out);
+
+	return err;
+}
+
+static int handle_hca_cap(struct mlx5_core_dev *dev)
+{
+	void *set_ctx = NULL;
+	struct mlx5_profile *prof = dev->profile;
+	struct mlx5_caps *cur_caps = NULL;
+	struct mlx5_caps *max_caps = NULL;
+	int err = -ENOMEM;
+	int set_sz = MLX5_ST_SZ_BYTES(set_hca_cap_in);
+
+	set_ctx = kzalloc(set_sz, GFP_KERNEL);
+	if (!set_ctx)
+		goto query_ex;
+
+	max_caps = kzalloc(sizeof(*max_caps), GFP_KERNEL);
+	if (!max_caps)
+		goto query_ex;
+
+	cur_caps = kzalloc(sizeof(*cur_caps), GFP_KERNEL);
+	if (!cur_caps)
+		goto query_ex;
+
+	err = mlx5_core_get_caps(dev, max_caps, HCA_CAP_OPMOD_GET_MAX);
+	if (err)
+		goto query_ex;
+
+	err = mlx5_core_get_caps(dev, cur_caps, HCA_CAP_OPMOD_GET_CUR);
+	if (err)
+		goto query_ex;
+
+	/* we limit the size of the pkey table to 128 entries for now */
+	cur_caps->gen.pkey_table_size = 128;
+
+	if (prof->mask & MLX5_PROF_MASK_QP_SIZE)
+		cur_caps->gen.log_max_qp = prof->log_max_qp;
+
+	/* disable checksum */
+	cur_caps->gen.flags &= ~MLX5_DEV_CAP_FLAG_CMDIF_CSUM;
+
+	copy_rw_fields(MLX5_ADDR_OF(set_hca_cap_in, set_ctx, hca_capability_struct),
+		       cur_caps);
+	err = set_caps(dev, set_ctx, set_sz);
+
+query_ex:
+	kfree(cur_caps);
+	kfree(max_caps);
+	kfree(set_ctx);
+
+	return err;
+}
+
+static int set_hca_ctrl(struct mlx5_core_dev *dev)
+{
+	struct mlx5_reg_host_endianess he_in;
+	struct mlx5_reg_host_endianess he_out;
+	int err;
+
+	memset(&he_in, 0, sizeof(he_in));
+	he_in.he = MLX5_SET_HOST_ENDIANNESS;
+	err = mlx5_core_access_reg(dev, &he_in,  sizeof(he_in),
+					&he_out, sizeof(he_out),
+					MLX5_REG_HOST_ENDIANNESS, 0, 1);
+	return err;
+}
+
+static int mlx5_core_enable_hca(struct mlx5_core_dev *dev)
+{
+	int err;
+	struct mlx5_enable_hca_mbox_in in;
+	struct mlx5_enable_hca_mbox_out out;
+
+	memset(&in, 0, sizeof(in));
+	memset(&out, 0, sizeof(out));
+	in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_ENABLE_HCA);
+	err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out));
+	if (err)
+		return err;
+
+	if (out.hdr.status)
+		return mlx5_cmd_status_to_err(&out.hdr);
+
+	return 0;
+}
+
+static int mlx5_core_disable_hca(struct mlx5_core_dev *dev)
+{
+	int err;
+	struct mlx5_disable_hca_mbox_in in;
+	struct mlx5_disable_hca_mbox_out out;
+
+	memset(&in, 0, sizeof(in));
+	memset(&out, 0, sizeof(out));
+	in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_DISABLE_HCA);
+	err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out));
+	if (err)
+		return err;
+
+	if (out.hdr.status)
+		return mlx5_cmd_status_to_err(&out.hdr);
+
+	return 0;
+}
+
+static int mlx5_dev_init(struct mlx5_core_dev *dev, struct pci_dev *pdev)
+{
+	struct mlx5_priv *priv = &dev->priv;
+	int err;
+
+	dev->pdev = pdev;
+	pci_set_drvdata(dev->pdev, dev);
+	strncpy(priv->name, dev_name(&pdev->dev), MLX5_MAX_NAME_LEN);
+	priv->name[MLX5_MAX_NAME_LEN - 1] = 0;
+
+	mutex_init(&priv->pgdir_mutex);
+	INIT_LIST_HEAD(&priv->pgdir_list);
+	spin_lock_init(&priv->mkey_lock);
+
+	priv->dbg_root = debugfs_create_dir(dev_name(&pdev->dev), mlx5_debugfs_root);
+	if (!priv->dbg_root)
+		return -ENOMEM;
+
+	err = pci_enable_device(pdev);
+	if (err) {
+		dev_err(&pdev->dev, "Cannot enable PCI device, aborting\n");
+		goto err_dbg;
+	}
+
+	err = request_bar(pdev);
+	if (err) {
+		dev_err(&pdev->dev, "error requesting BARs, aborting\n");
+		goto err_disable;
+	}
+
+	pci_set_master(pdev);
+
+	err = set_dma_caps(pdev);
+	if (err) {
+		dev_err(&pdev->dev, "Failed setting DMA capabilities mask, aborting\n");
+		goto err_clr_master;
+	}
+
+	dev->iseg_base = pci_resource_start(dev->pdev, 0);
+	dev->iseg = ioremap(dev->iseg_base, sizeof(*dev->iseg));
+	if (!dev->iseg) {
+		err = -ENOMEM;
+		dev_err(&pdev->dev, "Failed mapping initialization segment, aborting\n");
+		goto err_clr_master;
+	}
+	dev_info(&pdev->dev, "firmware version: %d.%d.%d\n", fw_rev_maj(dev),
+		 fw_rev_min(dev), fw_rev_sub(dev));
+
+	err = mlx5_cmd_init(dev);
+	if (err) {
+		dev_err(&pdev->dev, "Failed initializing command interface, aborting\n");
+		goto err_unmap;
+	}
+
+	mlx5_pagealloc_init(dev);
+
+	err = mlx5_core_enable_hca(dev);
+	if (err) {
+		dev_err(&pdev->dev, "enable hca failed\n");
+		goto err_pagealloc_cleanup;
+	}
+
+	err = mlx5_satisfy_startup_pages(dev, 1);
+	if (err) {
+		dev_err(&pdev->dev, "failed to allocate boot pages\n");
+		goto err_disable_hca;
+	}
+
+	err = set_hca_ctrl(dev);
+	if (err) {
+		dev_err(&pdev->dev, "set_hca_ctrl failed\n");
+		goto reclaim_boot_pages;
+	}
+
+	err = handle_hca_cap(dev);
+	if (err) {
+		dev_err(&pdev->dev, "handle_hca_cap failed\n");
+		goto reclaim_boot_pages;
+	}
+
+	err = mlx5_satisfy_startup_pages(dev, 0);
+	if (err) {
+		dev_err(&pdev->dev, "failed to allocate init pages\n");
+		goto reclaim_boot_pages;
+	}
+
+	err = mlx5_pagealloc_start(dev);
+	if (err) {
+		dev_err(&pdev->dev, "mlx5_pagealloc_start failed\n");
+		goto reclaim_boot_pages;
+	}
+
+	err = mlx5_cmd_init_hca(dev);
+	if (err) {
+		dev_err(&pdev->dev, "init hca failed\n");
+		goto err_pagealloc_stop;
+	}
+
+	mlx5_start_health_poll(dev);
+
+	err = mlx5_cmd_query_hca_cap(dev, &dev->caps);
+	if (err) {
+		dev_err(&pdev->dev, "query hca failed\n");
+		goto err_stop_poll;
+	}
+
+	err = mlx5_cmd_query_adapter(dev);
+	if (err) {
+		dev_err(&pdev->dev, "query adapter failed\n");
+		goto err_stop_poll;
+	}
+
+	err = mlx5_enable_msix(dev);
+	if (err) {
+		dev_err(&pdev->dev, "enable msix failed\n");
+		goto err_stop_poll;
+	}
+
+	err = mlx5_eq_init(dev);
+	if (err) {
+		dev_err(&pdev->dev, "failed to initialize eq\n");
+		goto disable_msix;
+	}
+
+	err = mlx5_alloc_uuars(dev, &priv->uuari);
+	if (err) {
+		dev_err(&pdev->dev, "Failed allocating uar, aborting\n");
+		goto err_eq_cleanup;
+	}
+
+	err = mlx5_start_eqs(dev);
+	if (err) {
+		dev_err(&pdev->dev, "Failed to start pages and async EQs\n");
+		goto err_free_uar;
+	}
+
+	MLX5_INIT_DOORBELL_LOCK(&priv->cq_uar_lock);
+
+	mlx5_init_cq_table(dev);
+	mlx5_init_qp_table(dev);
+	mlx5_init_srq_table(dev);
+	mlx5_init_mr_table(dev);
+
+	return 0;
+
+err_free_uar:
+	mlx5_free_uuars(dev, &priv->uuari);
+
+err_eq_cleanup:
+	mlx5_eq_cleanup(dev);
+
+disable_msix:
+	mlx5_disable_msix(dev);
+
+err_stop_poll:
+	mlx5_stop_health_poll(dev);
+	if (mlx5_cmd_teardown_hca(dev)) {
+		dev_err(&dev->pdev->dev, "tear_down_hca failed, skip cleanup\n");
+		return err;
+	}
+
+err_pagealloc_stop:
+	mlx5_pagealloc_stop(dev);
+
+reclaim_boot_pages:
+	mlx5_reclaim_startup_pages(dev);
+
+err_disable_hca:
+	mlx5_core_disable_hca(dev);
+
+err_pagealloc_cleanup:
+	mlx5_pagealloc_cleanup(dev);
+	mlx5_cmd_cleanup(dev);
+
+err_unmap:
+	iounmap(dev->iseg);
+
+err_clr_master:
+	pci_clear_master(dev->pdev);
+	release_bar(dev->pdev);
+
+err_disable:
+	pci_disable_device(dev->pdev);
+
+err_dbg:
+	debugfs_remove(priv->dbg_root);
+	return err;
+}
+EXPORT_SYMBOL(mlx5_dev_init);
+
+static void mlx5_dev_cleanup(struct mlx5_core_dev *dev)
+{
+	struct mlx5_priv *priv = &dev->priv;
+
+	mlx5_cleanup_srq_table(dev);
+	mlx5_cleanup_qp_table(dev);
+	mlx5_cleanup_cq_table(dev);
+	mlx5_stop_eqs(dev);
+	mlx5_free_uuars(dev, &priv->uuari);
+	mlx5_eq_cleanup(dev);
+	mlx5_disable_msix(dev);
+	mlx5_stop_health_poll(dev);
+	if (mlx5_cmd_teardown_hca(dev)) {
+		dev_err(&dev->pdev->dev, "tear_down_hca failed, skip cleanup\n");
+		return;
+	}
+	mlx5_pagealloc_stop(dev);
+	mlx5_reclaim_startup_pages(dev);
+	mlx5_core_disable_hca(dev);
+	mlx5_pagealloc_cleanup(dev);
+	mlx5_cmd_cleanup(dev);
+	iounmap(dev->iseg);
+	pci_clear_master(dev->pdev);
+	release_bar(dev->pdev);
+	pci_disable_device(dev->pdev);
+	debugfs_remove(priv->dbg_root);
+}
+
+static void mlx5_add_device(struct mlx5_interface *intf, struct mlx5_priv *priv)
+{
+	struct mlx5_device_context *dev_ctx;
+	struct mlx5_core_dev *dev = container_of(priv, struct mlx5_core_dev, priv);
+
+	dev_ctx = kmalloc(sizeof(*dev_ctx), GFP_KERNEL);
+	if (!dev_ctx) {
+		pr_warn("mlx5_add_device: alloc context failed\n");
+		return;
+	}
+
+	dev_ctx->intf    = intf;
+	dev_ctx->context = intf->add(dev);
+
+	if (dev_ctx->context) {
+		spin_lock_irq(&priv->ctx_lock);
+		list_add_tail(&dev_ctx->list, &priv->ctx_list);
+		spin_unlock_irq(&priv->ctx_lock);
+	} else {
+		kfree(dev_ctx);
+	}
+}
+
+static void mlx5_remove_device(struct mlx5_interface *intf, struct mlx5_priv *priv)
+{
+	struct mlx5_device_context *dev_ctx;
+	struct mlx5_core_dev *dev = container_of(priv, struct mlx5_core_dev, priv);
+
+	list_for_each_entry(dev_ctx, &priv->ctx_list, list)
+		if (dev_ctx->intf == intf) {
+			spin_lock_irq(&priv->ctx_lock);
+			list_del(&dev_ctx->list);
+			spin_unlock_irq(&priv->ctx_lock);
+
+			intf->remove(dev, dev_ctx->context);
+			kfree(dev_ctx);
+			return;
+		}
+}
+static int mlx5_register_device(struct mlx5_core_dev *dev)
+{
+	struct mlx5_priv *priv = &dev->priv;
+	struct mlx5_interface *intf;
+
+	mutex_lock(&intf_mutex);
+	list_add_tail(&priv->dev_list, &dev_list);
+	list_for_each_entry(intf, &intf_list, list)
+		mlx5_add_device(intf, priv);
+	mutex_unlock(&intf_mutex);
+
+	return 0;
+}
+static void mlx5_unregister_device(struct mlx5_core_dev *dev)
+{
+	struct mlx5_priv *priv = &dev->priv;
+	struct mlx5_interface *intf;
+
+	mutex_lock(&intf_mutex);
+	list_for_each_entry(intf, &intf_list, list)
+		mlx5_remove_device(intf, priv);
+	list_del(&priv->dev_list);
+	mutex_unlock(&intf_mutex);
+}
+
+int mlx5_register_interface(struct mlx5_interface *intf)
+{
+	struct mlx5_priv *priv;
+
+	if (!intf->add || !intf->remove)
+		return -EINVAL;
+
+	mutex_lock(&intf_mutex);
+	list_add_tail(&intf->list, &intf_list);
+	list_for_each_entry(priv, &dev_list, dev_list)
+		mlx5_add_device(intf, priv);
+	mutex_unlock(&intf_mutex);
+
+	return 0;
+}
+EXPORT_SYMBOL(mlx5_register_interface);
+
+void mlx5_unregister_interface(struct mlx5_interface *intf)
+{
+	struct mlx5_priv *priv;
+
+	mutex_lock(&intf_mutex);
+	list_for_each_entry(priv, &dev_list, dev_list)
+	       mlx5_remove_device(intf, priv);
+	list_del(&intf->list);
+	mutex_unlock(&intf_mutex);
+}
+EXPORT_SYMBOL(mlx5_unregister_interface);
+
+static void mlx5_core_event(struct mlx5_core_dev *dev, enum mlx5_dev_event event,
+			    unsigned long param)
+{
+	struct mlx5_priv *priv = &dev->priv;
+	struct mlx5_device_context *dev_ctx;
+	unsigned long flags;
+
+	spin_lock_irqsave(&priv->ctx_lock, flags);
+
+	list_for_each_entry(dev_ctx, &priv->ctx_list, list)
+		if (dev_ctx->intf->event)
+			dev_ctx->intf->event(dev, dev_ctx->context, event, param);
+
+	spin_unlock_irqrestore(&priv->ctx_lock, flags);
+}
+
+struct mlx5_core_event_handler {
+	void (*event)(struct mlx5_core_dev *dev,
+		      enum mlx5_dev_event event,
+		      void *data);
+};
+
+static int init_one(struct pci_dev *pdev,
+		    const struct pci_device_id *id)
+{
+	struct mlx5_core_dev *dev;
+	struct mlx5_priv *priv;
+	int err;
+
+	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+	if (!dev) {
+		dev_err(&pdev->dev, "kzalloc failed\n");
+		return -ENOMEM;
+	}
+	priv = &dev->priv;
+
+	pci_set_drvdata(pdev, dev);
+
+	if (prof_sel < 0 || prof_sel >= ARRAY_SIZE(profile)) {
+		pr_warn("selected profile out of range, selecting default (%d)\n",
+			MLX5_DEFAULT_PROF);
+		prof_sel = MLX5_DEFAULT_PROF;
+	}
+	dev->profile = &profile[prof_sel];
+	dev->event = mlx5_core_event;
+
+	INIT_LIST_HEAD(&priv->ctx_list);
+	spin_lock_init(&priv->ctx_lock);
+	err = mlx5_dev_init(dev, pdev);
+	if (err) {
+		dev_err(&pdev->dev, "mlx5_dev_init failed %d\n", err);
+		goto out;
+	}
+
+	err = mlx5_register_device(dev);
+	if (err) {
+		dev_err(&pdev->dev, "mlx5_register_device failed %d\n", err);
+		goto out_init;
+	}
+
+	return 0;
+
+out_init:
+	mlx5_dev_cleanup(dev);
+out:
+	kfree(dev);
+	return err;
+}
+static void remove_one(struct pci_dev *pdev)
+{
+	struct mlx5_core_dev *dev  = pci_get_drvdata(pdev);
+
+	mlx5_unregister_device(dev);
+	mlx5_dev_cleanup(dev);
+	kfree(dev);
+}
+
+static const struct pci_device_id mlx5_core_pci_table[] = {
+	{ PCI_VDEVICE(MELLANOX, 4113) }, /* MT4113 Connect-IB */
+	{ PCI_VDEVICE(MELLANOX, 4115) }, /* ConnectX-4 */
+	{ 0, }
+};
+
+MODULE_DEVICE_TABLE(pci, mlx5_core_pci_table);
+
+static struct pci_driver mlx5_core_driver = {
+	.name           = DRIVER_NAME,
+	.id_table       = mlx5_core_pci_table,
+	.probe          = init_one,
+	.remove         = remove_one
+};
+
+static int __init init(void)
+{
+	int err;
+
+	mlx5_register_debugfs();
+	mlx5_core_wq = create_singlethread_workqueue("mlx5_core_wq");
+	if (!mlx5_core_wq) {
+		err = -ENOMEM;
+		goto err_debug;
+	}
+	mlx5_health_init();
+
+	err = pci_register_driver(&mlx5_core_driver);
+	if (err)
+		goto err_health;
+
+	return 0;
+
+err_health:
+	mlx5_health_cleanup();
+	destroy_workqueue(mlx5_core_wq);
+err_debug:
+	mlx5_unregister_debugfs();
+	return err;
+}
+
+static void __exit cleanup(void)
+{
+	pci_unregister_driver(&mlx5_core_driver);
+	mlx5_health_cleanup();
+	destroy_workqueue(mlx5_core_wq);
+	mlx5_unregister_debugfs();
+}
+
+module_init(init);
+module_exit(cleanup);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mcg.c b/drivers/net/ethernet/mellanox/mlx5/core/mcg.c
new file mode 100644
index 0000000..4483764
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mcg.c
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/cmd.h>
+#include <rdma/ib_verbs.h>
+#include "mlx5_core.h"
+
+struct mlx5_attach_mcg_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	__be32			qpn;
+	__be32			rsvd;
+	u8			gid[16];
+};
+
+struct mlx5_attach_mcg_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	u8			rsvf[8];
+};
+
+struct mlx5_detach_mcg_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	__be32			qpn;
+	__be32			rsvd;
+	u8			gid[16];
+};
+
+struct mlx5_detach_mcg_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	u8			rsvf[8];
+};
+
+int mlx5_core_attach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, u32 qpn)
+{
+	struct mlx5_attach_mcg_mbox_in in;
+	struct mlx5_attach_mcg_mbox_out out;
+	int err;
+
+	memset(&in, 0, sizeof(in));
+	memset(&out, 0, sizeof(out));
+	in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_ATTACH_TO_MCG);
+	memcpy(in.gid, mgid, sizeof(*mgid));
+	in.qpn = cpu_to_be32(qpn);
+	err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out));
+	if (err)
+		return err;
+
+	if (out.hdr.status)
+		err = mlx5_cmd_status_to_err(&out.hdr);
+
+	return err;
+}
+EXPORT_SYMBOL(mlx5_core_attach_mcg);
+
+int mlx5_core_detach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, u32 qpn)
+{
+	struct mlx5_detach_mcg_mbox_in in;
+	struct mlx5_detach_mcg_mbox_out out;
+	int err;
+
+	memset(&in, 0, sizeof(in));
+	memset(&out, 0, sizeof(out));
+	in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_DETACH_FROM_MCG);
+	memcpy(in.gid, mgid, sizeof(*mgid));
+	in.qpn = cpu_to_be32(qpn);
+	err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out));
+	if (err)
+		return err;
+
+	if (out.hdr.status)
+		err = mlx5_cmd_status_to_err(&out.hdr);
+
+	return err;
+}
+EXPORT_SYMBOL(mlx5_core_detach_mcg);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
new file mode 100644
index 0000000..f0c9f9a
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __MLX5_CORE_H__
+#define __MLX5_CORE_H__
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+
+extern int mlx5_core_debug_mask;
+
+#define mlx5_core_dbg(dev, format, ...)					\
+	pr_debug("%s:%s:%d:(pid %d): " format,				\
+		 (dev)->priv.name, __func__, __LINE__, current->pid,	\
+		 ##__VA_ARGS__)
+
+#define mlx5_core_dbg_mask(dev, mask, format, ...)			\
+do {									\
+	if ((mask) & mlx5_core_debug_mask)				\
+		mlx5_core_dbg(dev, format, ##__VA_ARGS__);		\
+} while (0)
+
+#define mlx5_core_err(dev, format, ...)					\
+	pr_err("%s:%s:%d:(pid %d): " format,				\
+	       (dev)->priv.name, __func__, __LINE__, current->pid,	\
+	       ##__VA_ARGS__)
+
+#define mlx5_core_warn(dev, format, ...)				\
+	pr_warn("%s:%s:%d:(pid %d): " format,				\
+		(dev)->priv.name, __func__, __LINE__, current->pid,	\
+		##__VA_ARGS__)
+
+enum {
+	MLX5_CMD_DATA, /* print command payload only */
+	MLX5_CMD_TIME, /* print command execution time */
+};
+
+
+int mlx5_cmd_query_hca_cap(struct mlx5_core_dev *dev,
+			   struct mlx5_caps *caps);
+int mlx5_cmd_query_adapter(struct mlx5_core_dev *dev);
+int mlx5_cmd_init_hca(struct mlx5_core_dev *dev);
+int mlx5_cmd_teardown_hca(struct mlx5_core_dev *dev);
+
+#endif /* __MLX5_CORE_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mr.c b/drivers/net/ethernet/mellanox/mlx5/core/mr.c
new file mode 100644
index 0000000..184c361
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mr.c
@@ -0,0 +1,248 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/cmd.h>
+#include "mlx5_core.h"
+
+void mlx5_init_mr_table(struct mlx5_core_dev *dev)
+{
+	struct mlx5_mr_table *table = &dev->priv.mr_table;
+
+	rwlock_init(&table->lock);
+	INIT_RADIX_TREE(&table->tree, GFP_ATOMIC);
+}
+
+void mlx5_cleanup_mr_table(struct mlx5_core_dev *dev)
+{
+}
+
+int mlx5_core_create_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr,
+			  struct mlx5_create_mkey_mbox_in *in, int inlen,
+			  mlx5_cmd_cbk_t callback, void *context,
+			  struct mlx5_create_mkey_mbox_out *out)
+{
+	struct mlx5_mr_table *table = &dev->priv.mr_table;
+	struct mlx5_create_mkey_mbox_out lout;
+	int err;
+	u8 key;
+
+	memset(&lout, 0, sizeof(lout));
+	spin_lock_irq(&dev->priv.mkey_lock);
+	key = dev->priv.mkey_key++;
+	spin_unlock_irq(&dev->priv.mkey_lock);
+	in->seg.qpn_mkey7_0 |= cpu_to_be32(key);
+	in->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_CREATE_MKEY);
+	if (callback) {
+		err = mlx5_cmd_exec_cb(dev, in, inlen, out, sizeof(*out),
+				       callback, context);
+		return err;
+	} else {
+		err = mlx5_cmd_exec(dev, in, inlen, &lout, sizeof(lout));
+	}
+
+	if (err) {
+		mlx5_core_dbg(dev, "cmd exec failed %d\n", err);
+		return err;
+	}
+
+	if (lout.hdr.status) {
+		mlx5_core_dbg(dev, "status %d\n", lout.hdr.status);
+		return mlx5_cmd_status_to_err(&lout.hdr);
+	}
+
+	mr->iova = be64_to_cpu(in->seg.start_addr);
+	mr->size = be64_to_cpu(in->seg.len);
+	mr->key = mlx5_idx_to_mkey(be32_to_cpu(lout.mkey) & 0xffffff) | key;
+	mr->pd = be32_to_cpu(in->seg.flags_pd) & 0xffffff;
+
+	mlx5_core_dbg(dev, "out 0x%x, key 0x%x, mkey 0x%x\n",
+		      be32_to_cpu(lout.mkey), key, mr->key);
+
+	/* connect to MR tree */
+	write_lock_irq(&table->lock);
+	err = radix_tree_insert(&table->tree, mlx5_base_mkey(mr->key), mr);
+	write_unlock_irq(&table->lock);
+	if (err) {
+		mlx5_core_warn(dev, "failed radix tree insert of mr 0x%x, %d\n",
+			       mlx5_base_mkey(mr->key), err);
+		mlx5_core_destroy_mkey(dev, mr);
+	}
+
+	return err;
+}
+EXPORT_SYMBOL(mlx5_core_create_mkey);
+
+int mlx5_core_destroy_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr)
+{
+	struct mlx5_mr_table *table = &dev->priv.mr_table;
+	struct mlx5_destroy_mkey_mbox_in in;
+	struct mlx5_destroy_mkey_mbox_out out;
+	struct mlx5_core_mr *deleted_mr;
+	unsigned long flags;
+	int err;
+
+	memset(&in, 0, sizeof(in));
+	memset(&out, 0, sizeof(out));
+
+	write_lock_irqsave(&table->lock, flags);
+	deleted_mr = radix_tree_delete(&table->tree, mlx5_base_mkey(mr->key));
+	write_unlock_irqrestore(&table->lock, flags);
+	if (!deleted_mr) {
+		mlx5_core_warn(dev, "failed radix tree delete of mr 0x%x\n",
+			       mlx5_base_mkey(mr->key));
+		return -ENOENT;
+	}
+
+	in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_DESTROY_MKEY);
+	in.mkey = cpu_to_be32(mlx5_mkey_to_idx(mr->key));
+	err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out));
+	if (err)
+		return err;
+
+	if (out.hdr.status)
+		return mlx5_cmd_status_to_err(&out.hdr);
+
+	return err;
+}
+EXPORT_SYMBOL(mlx5_core_destroy_mkey);
+
+int mlx5_core_query_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr,
+			 struct mlx5_query_mkey_mbox_out *out, int outlen)
+{
+	struct mlx5_destroy_mkey_mbox_in in;
+	int err;
+
+	memset(&in, 0, sizeof(in));
+	memset(out, 0, outlen);
+
+	in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_QUERY_MKEY);
+	in.mkey = cpu_to_be32(mlx5_mkey_to_idx(mr->key));
+	err = mlx5_cmd_exec(dev, &in, sizeof(in), out, outlen);
+	if (err)
+		return err;
+
+	if (out->hdr.status)
+		return mlx5_cmd_status_to_err(&out->hdr);
+
+	return err;
+}
+EXPORT_SYMBOL(mlx5_core_query_mkey);
+
+int mlx5_core_dump_fill_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr,
+			     u32 *mkey)
+{
+	struct mlx5_query_special_ctxs_mbox_in in;
+	struct mlx5_query_special_ctxs_mbox_out out;
+	int err;
+
+	memset(&in, 0, sizeof(in));
+	memset(&out, 0, sizeof(out));
+
+	in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_QUERY_SPECIAL_CONTEXTS);
+	err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out));
+	if (err)
+		return err;
+
+	if (out.hdr.status)
+		return mlx5_cmd_status_to_err(&out.hdr);
+
+	*mkey = be32_to_cpu(out.dump_fill_mkey);
+
+	return err;
+}
+EXPORT_SYMBOL(mlx5_core_dump_fill_mkey);
+
+int mlx5_core_create_psv(struct mlx5_core_dev *dev, u32 pdn,
+			 int npsvs, u32 *sig_index)
+{
+	struct mlx5_allocate_psv_in in;
+	struct mlx5_allocate_psv_out out;
+	int i, err;
+
+	if (npsvs > MLX5_MAX_PSVS)
+		return -EINVAL;
+
+	memset(&in, 0, sizeof(in));
+	memset(&out, 0, sizeof(out));
+
+	in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_CREATE_PSV);
+	in.npsv_pd = cpu_to_be32((npsvs << 28) | pdn);
+	err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out));
+	if (err) {
+		mlx5_core_err(dev, "cmd exec failed %d\n", err);
+		return err;
+	}
+
+	if (out.hdr.status) {
+		mlx5_core_err(dev, "create_psv bad status %d\n",
+			      out.hdr.status);
+		return mlx5_cmd_status_to_err(&out.hdr);
+	}
+
+	for (i = 0; i < npsvs; i++)
+		sig_index[i] = be32_to_cpu(out.psv_idx[i]) & 0xffffff;
+
+	return err;
+}
+EXPORT_SYMBOL(mlx5_core_create_psv);
+
+int mlx5_core_destroy_psv(struct mlx5_core_dev *dev, int psv_num)
+{
+	struct mlx5_destroy_psv_in in;
+	struct mlx5_destroy_psv_out out;
+	int err;
+
+	memset(&in, 0, sizeof(in));
+	memset(&out, 0, sizeof(out));
+
+	in.psv_number = cpu_to_be32(psv_num);
+	in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_DESTROY_PSV);
+	err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out));
+	if (err) {
+		mlx5_core_err(dev, "destroy_psv cmd exec failed %d\n", err);
+		goto out;
+	}
+
+	if (out.hdr.status) {
+		mlx5_core_err(dev, "destroy_psv bad status %d\n",
+			      out.hdr.status);
+		err = mlx5_cmd_status_to_err(&out.hdr);
+		goto out;
+	}
+
+out:
+	return err;
+}
+EXPORT_SYMBOL(mlx5_core_destroy_psv);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c
new file mode 100644
index 0000000..d476918
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c
@@ -0,0 +1,531 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <asm-generic/kmap_types.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/cmd.h>
+#include "mlx5_core.h"
+
+enum {
+	MLX5_PAGES_CANT_GIVE	= 0,
+	MLX5_PAGES_GIVE		= 1,
+	MLX5_PAGES_TAKE		= 2
+};
+
+enum {
+	MLX5_BOOT_PAGES		= 1,
+	MLX5_INIT_PAGES		= 2,
+	MLX5_POST_INIT_PAGES	= 3
+};
+
+struct mlx5_pages_req {
+	struct mlx5_core_dev *dev;
+	u16	func_id;
+	s32	npages;
+	struct work_struct work;
+};
+
+struct fw_page {
+	struct rb_node		rb_node;
+	u64			addr;
+	struct page	       *page;
+	u16			func_id;
+	unsigned long		bitmask;
+	struct list_head	list;
+	unsigned		free_count;
+};
+
+struct mlx5_query_pages_inbox {
+	struct mlx5_inbox_hdr	hdr;
+	u8			rsvd[8];
+};
+
+struct mlx5_query_pages_outbox {
+	struct mlx5_outbox_hdr	hdr;
+	__be16			rsvd;
+	__be16			func_id;
+	__be32			num_pages;
+};
+
+struct mlx5_manage_pages_inbox {
+	struct mlx5_inbox_hdr	hdr;
+	__be16			rsvd;
+	__be16			func_id;
+	__be32			num_entries;
+	__be64			pas[0];
+};
+
+struct mlx5_manage_pages_outbox {
+	struct mlx5_outbox_hdr	hdr;
+	__be32			num_entries;
+	u8			rsvd[4];
+	__be64			pas[0];
+};
+
+enum {
+	MAX_RECLAIM_TIME_MSECS	= 5000,
+};
+
+enum {
+	MLX5_MAX_RECLAIM_TIME_MILI	= 5000,
+	MLX5_NUM_4K_IN_PAGE		= PAGE_SIZE / MLX5_ADAPTER_PAGE_SIZE,
+};
+
+static int insert_page(struct mlx5_core_dev *dev, u64 addr, struct page *page, u16 func_id)
+{
+	struct rb_root *root = &dev->priv.page_root;
+	struct rb_node **new = &root->rb_node;
+	struct rb_node *parent = NULL;
+	struct fw_page *nfp;
+	struct fw_page *tfp;
+	int i;
+
+	while (*new) {
+		parent = *new;
+		tfp = rb_entry(parent, struct fw_page, rb_node);
+		if (tfp->addr < addr)
+			new = &parent->rb_left;
+		else if (tfp->addr > addr)
+			new = &parent->rb_right;
+		else
+			return -EEXIST;
+	}
+
+	nfp = kzalloc(sizeof(*nfp), GFP_KERNEL);
+	if (!nfp)
+		return -ENOMEM;
+
+	nfp->addr = addr;
+	nfp->page = page;
+	nfp->func_id = func_id;
+	nfp->free_count = MLX5_NUM_4K_IN_PAGE;
+	for (i = 0; i < MLX5_NUM_4K_IN_PAGE; i++)
+		set_bit(i, &nfp->bitmask);
+
+	rb_link_node(&nfp->rb_node, parent, new);
+	rb_insert_color(&nfp->rb_node, root);
+	list_add(&nfp->list, &dev->priv.free_list);
+
+	return 0;
+}
+
+static struct fw_page *find_fw_page(struct mlx5_core_dev *dev, u64 addr)
+{
+	struct rb_root *root = &dev->priv.page_root;
+	struct rb_node *tmp = root->rb_node;
+	struct fw_page *result = NULL;
+	struct fw_page *tfp;
+
+	while (tmp) {
+		tfp = rb_entry(tmp, struct fw_page, rb_node);
+		if (tfp->addr < addr) {
+			tmp = tmp->rb_left;
+		} else if (tfp->addr > addr) {
+			tmp = tmp->rb_right;
+		} else {
+			result = tfp;
+			break;
+		}
+	}
+
+	return result;
+}
+
+static int mlx5_cmd_query_pages(struct mlx5_core_dev *dev, u16 *func_id,
+				s32 *npages, int boot)
+{
+	struct mlx5_query_pages_inbox	in;
+	struct mlx5_query_pages_outbox	out;
+	int err;
+
+	memset(&in, 0, sizeof(in));
+	memset(&out, 0, sizeof(out));
+	in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_QUERY_PAGES);
+	in.hdr.opmod = boot ? cpu_to_be16(MLX5_BOOT_PAGES) : cpu_to_be16(MLX5_INIT_PAGES);
+
+	err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out));
+	if (err)
+		return err;
+
+	if (out.hdr.status)
+		return mlx5_cmd_status_to_err(&out.hdr);
+
+	*npages = be32_to_cpu(out.num_pages);
+	*func_id = be16_to_cpu(out.func_id);
+
+	return err;
+}
+
+static int alloc_4k(struct mlx5_core_dev *dev, u64 *addr)
+{
+	struct fw_page *fp;
+	unsigned n;
+
+	if (list_empty(&dev->priv.free_list))
+		return -ENOMEM;
+
+	fp = list_entry(dev->priv.free_list.next, struct fw_page, list);
+	n = find_first_bit(&fp->bitmask, 8 * sizeof(fp->bitmask));
+	if (n >= MLX5_NUM_4K_IN_PAGE) {
+		mlx5_core_warn(dev, "alloc 4k bug\n");
+		return -ENOENT;
+	}
+	clear_bit(n, &fp->bitmask);
+	fp->free_count--;
+	if (!fp->free_count)
+		list_del(&fp->list);
+
+	*addr = fp->addr + n * MLX5_ADAPTER_PAGE_SIZE;
+
+	return 0;
+}
+
+static void free_4k(struct mlx5_core_dev *dev, u64 addr)
+{
+	struct fw_page *fwp;
+	int n;
+
+	fwp = find_fw_page(dev, addr & PAGE_MASK);
+	if (!fwp) {
+		mlx5_core_warn(dev, "page not found\n");
+		return;
+	}
+
+	n = (addr & ~PAGE_MASK) >> MLX5_ADAPTER_PAGE_SHIFT;
+	fwp->free_count++;
+	set_bit(n, &fwp->bitmask);
+	if (fwp->free_count == MLX5_NUM_4K_IN_PAGE) {
+		rb_erase(&fwp->rb_node, &dev->priv.page_root);
+		if (fwp->free_count != 1)
+			list_del(&fwp->list);
+		dma_unmap_page(&dev->pdev->dev, addr & PAGE_MASK, PAGE_SIZE,
+			       DMA_BIDIRECTIONAL);
+		__free_page(fwp->page);
+		kfree(fwp);
+	} else if (fwp->free_count == 1) {
+		list_add(&fwp->list, &dev->priv.free_list);
+	}
+}
+
+static int alloc_system_page(struct mlx5_core_dev *dev, u16 func_id)
+{
+	struct page *page;
+	u64 addr;
+	int err;
+
+	page = alloc_page(GFP_HIGHUSER);
+	if (!page) {
+		mlx5_core_warn(dev, "failed to allocate page\n");
+		return -ENOMEM;
+	}
+	addr = dma_map_page(&dev->pdev->dev, page, 0,
+			    PAGE_SIZE, DMA_BIDIRECTIONAL);
+	if (dma_mapping_error(&dev->pdev->dev, addr)) {
+		mlx5_core_warn(dev, "failed dma mapping page\n");
+		err = -ENOMEM;
+		goto out_alloc;
+	}
+	err = insert_page(dev, addr, page, func_id);
+	if (err) {
+		mlx5_core_err(dev, "failed to track allocated page\n");
+		goto out_mapping;
+	}
+
+	return 0;
+
+out_mapping:
+	dma_unmap_page(&dev->pdev->dev, addr, PAGE_SIZE, DMA_BIDIRECTIONAL);
+
+out_alloc:
+	__free_page(page);
+
+	return err;
+}
+static int give_pages(struct mlx5_core_dev *dev, u16 func_id, int npages,
+		      int notify_fail)
+{
+	struct mlx5_manage_pages_inbox *in;
+	struct mlx5_manage_pages_outbox out;
+	struct mlx5_manage_pages_inbox *nin;
+	int inlen;
+	u64 addr;
+	int err;
+	int i;
+
+	inlen = sizeof(*in) + npages * sizeof(in->pas[0]);
+	in = mlx5_vzalloc(inlen);
+	if (!in) {
+		mlx5_core_warn(dev, "vzalloc failed %d\n", inlen);
+		return -ENOMEM;
+	}
+	memset(&out, 0, sizeof(out));
+
+	for (i = 0; i < npages; i++) {
+retry:
+		err = alloc_4k(dev, &addr);
+		if (err) {
+			if (err == -ENOMEM)
+				err = alloc_system_page(dev, func_id);
+			if (err)
+				goto out_4k;
+
+			goto retry;
+		}
+		in->pas[i] = cpu_to_be64(addr);
+	}
+
+	in->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_MANAGE_PAGES);
+	in->hdr.opmod = cpu_to_be16(MLX5_PAGES_GIVE);
+	in->func_id = cpu_to_be16(func_id);
+	in->num_entries = cpu_to_be32(npages);
+	err = mlx5_cmd_exec(dev, in, inlen, &out, sizeof(out));
+	if (err) {
+		mlx5_core_warn(dev, "func_id 0x%x, npages %d, err %d\n",
+			       func_id, npages, err);
+		goto out_alloc;
+	}
+	dev->priv.fw_pages += npages;
+
+	if (out.hdr.status) {
+		err = mlx5_cmd_status_to_err(&out.hdr);
+		if (err) {
+			mlx5_core_warn(dev, "func_id 0x%x, npages %d, status %d\n",
+				       func_id, npages, out.hdr.status);
+			goto out_alloc;
+		}
+	}
+
+	mlx5_core_dbg(dev, "err %d\n", err);
+
+	goto out_free;
+
+out_alloc:
+	if (notify_fail) {
+		nin = kzalloc(sizeof(*nin), GFP_KERNEL);
+		if (!nin) {
+			mlx5_core_warn(dev, "allocation failed\n");
+			goto out_4k;
+		}
+		memset(&out, 0, sizeof(out));
+		nin->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_MANAGE_PAGES);
+		nin->hdr.opmod = cpu_to_be16(MLX5_PAGES_CANT_GIVE);
+		if (mlx5_cmd_exec(dev, nin, sizeof(*nin), &out, sizeof(out)))
+			mlx5_core_warn(dev, "page notify failed\n");
+		kfree(nin);
+	}
+
+out_4k:
+	for (i--; i >= 0; i--)
+		free_4k(dev, be64_to_cpu(in->pas[i]));
+out_free:
+	mlx5_vfree(in);
+	return err;
+}
+
+static int reclaim_pages(struct mlx5_core_dev *dev, u32 func_id, int npages,
+			 int *nclaimed)
+{
+	struct mlx5_manage_pages_inbox   in;
+	struct mlx5_manage_pages_outbox *out;
+	int num_claimed;
+	int outlen;
+	u64 addr;
+	int err;
+	int i;
+
+	if (nclaimed)
+		*nclaimed = 0;
+
+	memset(&in, 0, sizeof(in));
+	outlen = sizeof(*out) + npages * sizeof(out->pas[0]);
+	out = mlx5_vzalloc(outlen);
+	if (!out)
+		return -ENOMEM;
+
+	in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_MANAGE_PAGES);
+	in.hdr.opmod = cpu_to_be16(MLX5_PAGES_TAKE);
+	in.func_id = cpu_to_be16(func_id);
+	in.num_entries = cpu_to_be32(npages);
+	mlx5_core_dbg(dev, "npages %d, outlen %d\n", npages, outlen);
+	err = mlx5_cmd_exec(dev, &in, sizeof(in), out, outlen);
+	if (err) {
+		mlx5_core_err(dev, "failed reclaiming pages\n");
+		goto out_free;
+	}
+	dev->priv.fw_pages -= npages;
+
+	if (out->hdr.status) {
+		err = mlx5_cmd_status_to_err(&out->hdr);
+		goto out_free;
+	}
+
+	num_claimed = be32_to_cpu(out->num_entries);
+	if (nclaimed)
+		*nclaimed = num_claimed;
+
+	for (i = 0; i < num_claimed; i++) {
+		addr = be64_to_cpu(out->pas[i]);
+		free_4k(dev, addr);
+	}
+
+out_free:
+	mlx5_vfree(out);
+	return err;
+}
+
+static void pages_work_handler(struct work_struct *work)
+{
+	struct mlx5_pages_req *req = container_of(work, struct mlx5_pages_req, work);
+	struct mlx5_core_dev *dev = req->dev;
+	int err = 0;
+
+	if (req->npages < 0)
+		err = reclaim_pages(dev, req->func_id, -1 * req->npages, NULL);
+	else if (req->npages > 0)
+		err = give_pages(dev, req->func_id, req->npages, 1);
+
+	if (err)
+		mlx5_core_warn(dev, "%s fail %d\n",
+			       req->npages < 0 ? "reclaim" : "give", err);
+
+	kfree(req);
+}
+
+void mlx5_core_req_pages_handler(struct mlx5_core_dev *dev, u16 func_id,
+				 s32 npages)
+{
+	struct mlx5_pages_req *req;
+
+	req = kzalloc(sizeof(*req), GFP_ATOMIC);
+	if (!req) {
+		mlx5_core_warn(dev, "failed to allocate pages request\n");
+		return;
+	}
+
+	req->dev = dev;
+	req->func_id = func_id;
+	req->npages = npages;
+	INIT_WORK(&req->work, pages_work_handler);
+	queue_work(dev->priv.pg_wq, &req->work);
+}
+
+int mlx5_satisfy_startup_pages(struct mlx5_core_dev *dev, int boot)
+{
+	u16 uninitialized_var(func_id);
+	s32 uninitialized_var(npages);
+	int err;
+
+	err = mlx5_cmd_query_pages(dev, &func_id, &npages, boot);
+	if (err)
+		return err;
+
+	mlx5_core_dbg(dev, "requested %d %s pages for func_id 0x%x\n",
+		      npages, boot ? "boot" : "init", func_id);
+
+	return give_pages(dev, func_id, npages, 0);
+}
+
+enum {
+	MLX5_BLKS_FOR_RECLAIM_PAGES = 12
+};
+
+static int optimal_reclaimed_pages(void)
+{
+	struct mlx5_cmd_prot_block *block;
+	struct mlx5_cmd_layout *lay;
+	int ret;
+
+	ret = (sizeof(lay->out) + MLX5_BLKS_FOR_RECLAIM_PAGES * sizeof(block->data) -
+	       sizeof(struct mlx5_manage_pages_outbox)) /
+	       FIELD_SIZEOF(struct mlx5_manage_pages_outbox, pas[0]);
+
+	return ret;
+}
+
+int mlx5_reclaim_startup_pages(struct mlx5_core_dev *dev)
+{
+	unsigned long end = jiffies + msecs_to_jiffies(MAX_RECLAIM_TIME_MSECS);
+	struct fw_page *fwp;
+	struct rb_node *p;
+	int nclaimed = 0;
+	int err;
+
+	do {
+		p = rb_first(&dev->priv.page_root);
+		if (p) {
+			fwp = rb_entry(p, struct fw_page, rb_node);
+			err = reclaim_pages(dev, fwp->func_id,
+					    optimal_reclaimed_pages(),
+					    &nclaimed);
+			if (err) {
+				mlx5_core_warn(dev, "failed reclaiming pages (%d)\n",
+					       err);
+				return err;
+			}
+			if (nclaimed)
+				end = jiffies + msecs_to_jiffies(MAX_RECLAIM_TIME_MSECS);
+		}
+		if (time_after(jiffies, end)) {
+			mlx5_core_warn(dev, "FW did not return all pages. giving up...\n");
+			break;
+		}
+	} while (p);
+
+	return 0;
+}
+
+void mlx5_pagealloc_init(struct mlx5_core_dev *dev)
+{
+	dev->priv.page_root = RB_ROOT;
+	INIT_LIST_HEAD(&dev->priv.free_list);
+}
+
+void mlx5_pagealloc_cleanup(struct mlx5_core_dev *dev)
+{
+	/* nothing */
+}
+
+int mlx5_pagealloc_start(struct mlx5_core_dev *dev)
+{
+	dev->priv.pg_wq = create_singlethread_workqueue("mlx5_page_allocator");
+	if (!dev->priv.pg_wq)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void mlx5_pagealloc_stop(struct mlx5_core_dev *dev)
+{
+	destroy_workqueue(dev->priv.pg_wq);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pd.c b/drivers/net/ethernet/mellanox/mlx5/core/pd.c
new file mode 100644
index 0000000..790da5c
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/pd.c
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/cmd.h>
+#include "mlx5_core.h"
+
+struct mlx5_alloc_pd_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	u8			rsvd[8];
+};
+
+struct mlx5_alloc_pd_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	__be32			pdn;
+	u8			rsvd[4];
+};
+
+struct mlx5_dealloc_pd_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	__be32			pdn;
+	u8			rsvd[4];
+};
+
+struct mlx5_dealloc_pd_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	u8			rsvd[8];
+};
+
+int mlx5_core_alloc_pd(struct mlx5_core_dev *dev, u32 *pdn)
+{
+	struct mlx5_alloc_pd_mbox_in	in;
+	struct mlx5_alloc_pd_mbox_out	out;
+	int err;
+
+	memset(&in, 0, sizeof(in));
+	memset(&out, 0, sizeof(out));
+	in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_ALLOC_PD);
+	err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out));
+	if (err)
+		return err;
+
+	if (out.hdr.status)
+		return mlx5_cmd_status_to_err(&out.hdr);
+
+	*pdn = be32_to_cpu(out.pdn) & 0xffffff;
+	return err;
+}
+EXPORT_SYMBOL(mlx5_core_alloc_pd);
+
+int mlx5_core_dealloc_pd(struct mlx5_core_dev *dev, u32 pdn)
+{
+	struct mlx5_dealloc_pd_mbox_in	in;
+	struct mlx5_dealloc_pd_mbox_out	out;
+	int err;
+
+	memset(&in, 0, sizeof(in));
+	memset(&out, 0, sizeof(out));
+	in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_DEALLOC_PD);
+	in.pdn = cpu_to_be32(pdn);
+	err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out));
+	if (err)
+		return err;
+
+	if (out.hdr.status)
+		return mlx5_cmd_status_to_err(&out.hdr);
+
+	return err;
+}
+EXPORT_SYMBOL(mlx5_core_dealloc_pd);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c
new file mode 100644
index 0000000..3139658
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/cmd.h>
+#include "mlx5_core.h"
+
+int mlx5_core_access_reg(struct mlx5_core_dev *dev, void *data_in,
+			 int size_in, void *data_out, int size_out,
+			 u16 reg_num, int arg, int write)
+{
+	struct mlx5_access_reg_mbox_in *in = NULL;
+	struct mlx5_access_reg_mbox_out *out = NULL;
+	int err = -ENOMEM;
+
+	in = mlx5_vzalloc(sizeof(*in) + size_in);
+	if (!in)
+		return -ENOMEM;
+
+	out = mlx5_vzalloc(sizeof(*out) + size_out);
+	if (!out)
+		goto ex1;
+
+	memcpy(in->data, data_in, size_in);
+	in->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_ACCESS_REG);
+	in->hdr.opmod = cpu_to_be16(!write);
+	in->arg = cpu_to_be32(arg);
+	in->register_id = cpu_to_be16(reg_num);
+	err = mlx5_cmd_exec(dev, in, sizeof(*in) + size_in, out,
+			    sizeof(*out) + size_out);
+	if (err)
+		goto ex2;
+
+	if (out->hdr.status)
+		err = mlx5_cmd_status_to_err(&out->hdr);
+
+	if (!err)
+		memcpy(data_out, out->data, size_out);
+
+ex2:
+	mlx5_vfree(out);
+ex1:
+	mlx5_vfree(in);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_core_access_reg);
+
+
+struct mlx5_reg_pcap {
+	u8			rsvd0;
+	u8			port_num;
+	u8			rsvd1[2];
+	__be32			caps_127_96;
+	__be32			caps_95_64;
+	__be32			caps_63_32;
+	__be32			caps_31_0;
+};
+
+int mlx5_set_port_caps(struct mlx5_core_dev *dev, u8 port_num, u32 caps)
+{
+	struct mlx5_reg_pcap in;
+	struct mlx5_reg_pcap out;
+	int err;
+
+	memset(&in, 0, sizeof(in));
+	in.caps_127_96 = cpu_to_be32(caps);
+	in.port_num = port_num;
+
+	err = mlx5_core_access_reg(dev, &in, sizeof(in), &out,
+				   sizeof(out), MLX5_REG_PCAP, 0, 1);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_set_port_caps);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/qp.c b/drivers/net/ethernet/mellanox/mlx5/core/qp.c
new file mode 100644
index 0000000..5261a2b
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/qp.c
@@ -0,0 +1,324 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include <linux/gfp.h>
+#include <linux/export.h>
+#include <linux/mlx5/cmd.h>
+#include <linux/mlx5/qp.h>
+#include <linux/mlx5/driver.h>
+
+#include "mlx5_core.h"
+
+static struct mlx5_core_rsc_common *mlx5_get_rsc(struct mlx5_core_dev *dev,
+						 u32 rsn)
+{
+	struct mlx5_qp_table *table = &dev->priv.qp_table;
+	struct mlx5_core_rsc_common *common;
+
+	spin_lock(&table->lock);
+
+	common = radix_tree_lookup(&table->tree, rsn);
+	if (common)
+		atomic_inc(&common->refcount);
+
+	spin_unlock(&table->lock);
+
+	if (!common) {
+		mlx5_core_warn(dev, "Async event for bogus resource 0x%x\n",
+			       rsn);
+		return NULL;
+	}
+	return common;
+}
+
+void mlx5_core_put_rsc(struct mlx5_core_rsc_common *common)
+{
+	if (atomic_dec_and_test(&common->refcount))
+		complete(&common->free);
+}
+
+void mlx5_rsc_event(struct mlx5_core_dev *dev, u32 rsn, int event_type)
+{
+	struct mlx5_core_rsc_common *common = mlx5_get_rsc(dev, rsn);
+	struct mlx5_core_qp *qp;
+
+	if (!common)
+		return;
+
+	switch (common->res) {
+	case MLX5_RES_QP:
+		qp = (struct mlx5_core_qp *)common;
+		qp->event(qp, event_type);
+		break;
+
+	default:
+		mlx5_core_warn(dev, "invalid resource type for 0x%x\n", rsn);
+	}
+
+	mlx5_core_put_rsc(common);
+}
+
+int mlx5_core_create_qp(struct mlx5_core_dev *dev,
+			struct mlx5_core_qp *qp,
+			struct mlx5_create_qp_mbox_in *in,
+			int inlen)
+{
+	struct mlx5_qp_table *table = &dev->priv.qp_table;
+	struct mlx5_create_qp_mbox_out out;
+	struct mlx5_destroy_qp_mbox_in din;
+	struct mlx5_destroy_qp_mbox_out dout;
+	int err;
+
+	memset(&out, 0, sizeof(out));
+	in->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_CREATE_QP);
+
+	err = mlx5_cmd_exec(dev, in, inlen, &out, sizeof(out));
+	if (err) {
+		mlx5_core_warn(dev, "ret %d\n", err);
+		return err;
+	}
+
+	if (out.hdr.status) {
+		mlx5_core_warn(dev, "current num of QPs 0x%x\n",
+			       atomic_read(&dev->num_qps));
+		return mlx5_cmd_status_to_err(&out.hdr);
+	}
+
+	qp->qpn = be32_to_cpu(out.qpn) & 0xffffff;
+	mlx5_core_dbg(dev, "qpn = 0x%x\n", qp->qpn);
+
+	qp->common.res = MLX5_RES_QP;
+	spin_lock_irq(&table->lock);
+	err = radix_tree_insert(&table->tree, qp->qpn, qp);
+	spin_unlock_irq(&table->lock);
+	if (err) {
+		mlx5_core_warn(dev, "err %d\n", err);
+		goto err_cmd;
+	}
+
+	err = mlx5_debug_qp_add(dev, qp);
+	if (err)
+		mlx5_core_dbg(dev, "failed adding QP 0x%x to debug file system\n",
+			      qp->qpn);
+
+	qp->pid = current->pid;
+	atomic_set(&qp->common.refcount, 1);
+	atomic_inc(&dev->num_qps);
+	init_completion(&qp->common.free);
+
+	return 0;
+
+err_cmd:
+	memset(&din, 0, sizeof(din));
+	memset(&dout, 0, sizeof(dout));
+	din.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_DESTROY_QP);
+	din.qpn = cpu_to_be32(qp->qpn);
+	mlx5_cmd_exec(dev, &din, sizeof(din), &out, sizeof(dout));
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_core_create_qp);
+
+int mlx5_core_destroy_qp(struct mlx5_core_dev *dev,
+			 struct mlx5_core_qp *qp)
+{
+	struct mlx5_destroy_qp_mbox_in in;
+	struct mlx5_destroy_qp_mbox_out out;
+	struct mlx5_qp_table *table = &dev->priv.qp_table;
+	unsigned long flags;
+	int err;
+
+	mlx5_debug_qp_remove(dev, qp);
+
+	spin_lock_irqsave(&table->lock, flags);
+	radix_tree_delete(&table->tree, qp->qpn);
+	spin_unlock_irqrestore(&table->lock, flags);
+
+	mlx5_core_put_rsc((struct mlx5_core_rsc_common *)qp);
+	wait_for_completion(&qp->common.free);
+
+	memset(&in, 0, sizeof(in));
+	memset(&out, 0, sizeof(out));
+	in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_DESTROY_QP);
+	in.qpn = cpu_to_be32(qp->qpn);
+	err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out));
+	if (err)
+		return err;
+
+	if (out.hdr.status)
+		return mlx5_cmd_status_to_err(&out.hdr);
+
+	atomic_dec(&dev->num_qps);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx5_core_destroy_qp);
+
+int mlx5_core_qp_modify(struct mlx5_core_dev *dev, enum mlx5_qp_state cur_state,
+			enum mlx5_qp_state new_state,
+			struct mlx5_modify_qp_mbox_in *in, int sqd_event,
+			struct mlx5_core_qp *qp)
+{
+	static const u16 optab[MLX5_QP_NUM_STATE][MLX5_QP_NUM_STATE] = {
+		[MLX5_QP_STATE_RST] = {
+			[MLX5_QP_STATE_RST]	= MLX5_CMD_OP_2RST_QP,
+			[MLX5_QP_STATE_ERR]	= MLX5_CMD_OP_2ERR_QP,
+			[MLX5_QP_STATE_INIT]	= MLX5_CMD_OP_RST2INIT_QP,
+		},
+		[MLX5_QP_STATE_INIT]  = {
+			[MLX5_QP_STATE_RST]	= MLX5_CMD_OP_2RST_QP,
+			[MLX5_QP_STATE_ERR]	= MLX5_CMD_OP_2ERR_QP,
+			[MLX5_QP_STATE_INIT]	= MLX5_CMD_OP_INIT2INIT_QP,
+			[MLX5_QP_STATE_RTR]	= MLX5_CMD_OP_INIT2RTR_QP,
+		},
+		[MLX5_QP_STATE_RTR]   = {
+			[MLX5_QP_STATE_RST]	= MLX5_CMD_OP_2RST_QP,
+			[MLX5_QP_STATE_ERR]	= MLX5_CMD_OP_2ERR_QP,
+			[MLX5_QP_STATE_RTS]	= MLX5_CMD_OP_RTR2RTS_QP,
+		},
+		[MLX5_QP_STATE_RTS]   = {
+			[MLX5_QP_STATE_RST]	= MLX5_CMD_OP_2RST_QP,
+			[MLX5_QP_STATE_ERR]	= MLX5_CMD_OP_2ERR_QP,
+			[MLX5_QP_STATE_RTS]	= MLX5_CMD_OP_RTS2RTS_QP,
+		},
+		[MLX5_QP_STATE_SQD] = {
+			[MLX5_QP_STATE_RST]	= MLX5_CMD_OP_2RST_QP,
+			[MLX5_QP_STATE_ERR]	= MLX5_CMD_OP_2ERR_QP,
+		},
+		[MLX5_QP_STATE_SQER] = {
+			[MLX5_QP_STATE_RST]	= MLX5_CMD_OP_2RST_QP,
+			[MLX5_QP_STATE_ERR]	= MLX5_CMD_OP_2ERR_QP,
+			[MLX5_QP_STATE_RTS]	= MLX5_CMD_OP_SQERR2RTS_QP,
+		},
+		[MLX5_QP_STATE_ERR] = {
+			[MLX5_QP_STATE_RST]	= MLX5_CMD_OP_2RST_QP,
+			[MLX5_QP_STATE_ERR]	= MLX5_CMD_OP_2ERR_QP,
+		}
+	};
+
+	struct mlx5_modify_qp_mbox_out out;
+	int err = 0;
+	u16 op;
+
+	if (cur_state >= MLX5_QP_NUM_STATE || new_state >= MLX5_QP_NUM_STATE ||
+	    !optab[cur_state][new_state])
+		return -EINVAL;
+
+	memset(&out, 0, sizeof(out));
+	op = optab[cur_state][new_state];
+	in->hdr.opcode = cpu_to_be16(op);
+	in->qpn = cpu_to_be32(qp->qpn);
+	err = mlx5_cmd_exec(dev, in, sizeof(*in), &out, sizeof(out));
+	if (err)
+		return err;
+
+	return mlx5_cmd_status_to_err(&out.hdr);
+}
+EXPORT_SYMBOL_GPL(mlx5_core_qp_modify);
+
+void mlx5_init_qp_table(struct mlx5_core_dev *dev)
+{
+	struct mlx5_qp_table *table = &dev->priv.qp_table;
+
+	spin_lock_init(&table->lock);
+	INIT_RADIX_TREE(&table->tree, GFP_ATOMIC);
+	mlx5_qp_debugfs_init(dev);
+}
+
+void mlx5_cleanup_qp_table(struct mlx5_core_dev *dev)
+{
+	mlx5_qp_debugfs_cleanup(dev);
+}
+
+int mlx5_core_qp_query(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp,
+		       struct mlx5_query_qp_mbox_out *out, int outlen)
+{
+	struct mlx5_query_qp_mbox_in in;
+	int err;
+
+	memset(&in, 0, sizeof(in));
+	memset(out, 0, outlen);
+	in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_QUERY_QP);
+	in.qpn = cpu_to_be32(qp->qpn);
+	err = mlx5_cmd_exec(dev, &in, sizeof(in), out, outlen);
+	if (err)
+		return err;
+
+	if (out->hdr.status)
+		return mlx5_cmd_status_to_err(&out->hdr);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_core_qp_query);
+
+int mlx5_core_xrcd_alloc(struct mlx5_core_dev *dev, u32 *xrcdn)
+{
+	struct mlx5_alloc_xrcd_mbox_in in;
+	struct mlx5_alloc_xrcd_mbox_out out;
+	int err;
+
+	memset(&in, 0, sizeof(in));
+	memset(&out, 0, sizeof(out));
+	in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_ALLOC_XRCD);
+	err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out));
+	if (err)
+		return err;
+
+	if (out.hdr.status)
+		err = mlx5_cmd_status_to_err(&out.hdr);
+	else
+		*xrcdn = be32_to_cpu(out.xrcdn);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_core_xrcd_alloc);
+
+int mlx5_core_xrcd_dealloc(struct mlx5_core_dev *dev, u32 xrcdn)
+{
+	struct mlx5_dealloc_xrcd_mbox_in in;
+	struct mlx5_dealloc_xrcd_mbox_out out;
+	int err;
+
+	memset(&in, 0, sizeof(in));
+	memset(&out, 0, sizeof(out));
+	in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_DEALLOC_XRCD);
+	in.xrcdn = cpu_to_be32(xrcdn);
+	err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out));
+	if (err)
+		return err;
+
+	if (out.hdr.status)
+		err = mlx5_cmd_status_to_err(&out.hdr);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_core_xrcd_dealloc);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/srq.c b/drivers/net/ethernet/mellanox/mlx5/core/srq.c
new file mode 100644
index 0000000..38bce93
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/srq.c
@@ -0,0 +1,223 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/cmd.h>
+#include <linux/mlx5/srq.h>
+#include <rdma/ib_verbs.h>
+#include "mlx5_core.h"
+
+void mlx5_srq_event(struct mlx5_core_dev *dev, u32 srqn, int event_type)
+{
+	struct mlx5_srq_table *table = &dev->priv.srq_table;
+	struct mlx5_core_srq *srq;
+
+	spin_lock(&table->lock);
+
+	srq = radix_tree_lookup(&table->tree, srqn);
+	if (srq)
+		atomic_inc(&srq->refcount);
+
+	spin_unlock(&table->lock);
+
+	if (!srq) {
+		mlx5_core_warn(dev, "Async event for bogus SRQ 0x%08x\n", srqn);
+		return;
+	}
+
+	srq->event(srq, event_type);
+
+	if (atomic_dec_and_test(&srq->refcount))
+		complete(&srq->free);
+}
+
+struct mlx5_core_srq *mlx5_core_get_srq(struct mlx5_core_dev *dev, u32 srqn)
+{
+	struct mlx5_srq_table *table = &dev->priv.srq_table;
+	struct mlx5_core_srq *srq;
+
+	spin_lock(&table->lock);
+
+	srq = radix_tree_lookup(&table->tree, srqn);
+	if (srq)
+		atomic_inc(&srq->refcount);
+
+	spin_unlock(&table->lock);
+
+	return srq;
+}
+EXPORT_SYMBOL(mlx5_core_get_srq);
+
+int mlx5_core_create_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq,
+			 struct mlx5_create_srq_mbox_in *in, int inlen)
+{
+	struct mlx5_create_srq_mbox_out out;
+	struct mlx5_srq_table *table = &dev->priv.srq_table;
+	struct mlx5_destroy_srq_mbox_in din;
+	struct mlx5_destroy_srq_mbox_out dout;
+	int err;
+
+	memset(&out, 0, sizeof(out));
+	in->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_CREATE_SRQ);
+	err = mlx5_cmd_exec(dev, in, inlen, &out, sizeof(out));
+	if (err)
+		return err;
+
+	if (out.hdr.status)
+		return mlx5_cmd_status_to_err(&out.hdr);
+
+	srq->srqn = be32_to_cpu(out.srqn) & 0xffffff;
+
+	atomic_set(&srq->refcount, 1);
+	init_completion(&srq->free);
+
+	spin_lock_irq(&table->lock);
+	err = radix_tree_insert(&table->tree, srq->srqn, srq);
+	spin_unlock_irq(&table->lock);
+	if (err) {
+		mlx5_core_warn(dev, "err %d, srqn 0x%x\n", err, srq->srqn);
+		goto err_cmd;
+	}
+
+	return 0;
+
+err_cmd:
+	memset(&din, 0, sizeof(din));
+	memset(&dout, 0, sizeof(dout));
+	din.srqn = cpu_to_be32(srq->srqn);
+	din.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_DESTROY_SRQ);
+	mlx5_cmd_exec(dev, &din, sizeof(din), &dout, sizeof(dout));
+	return err;
+}
+EXPORT_SYMBOL(mlx5_core_create_srq);
+
+int mlx5_core_destroy_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq)
+{
+	struct mlx5_destroy_srq_mbox_in in;
+	struct mlx5_destroy_srq_mbox_out out;
+	struct mlx5_srq_table *table = &dev->priv.srq_table;
+	struct mlx5_core_srq *tmp;
+	int err;
+
+	spin_lock_irq(&table->lock);
+	tmp = radix_tree_delete(&table->tree, srq->srqn);
+	spin_unlock_irq(&table->lock);
+	if (!tmp) {
+		mlx5_core_warn(dev, "srq 0x%x not found in tree\n", srq->srqn);
+		return -EINVAL;
+	}
+	if (tmp != srq) {
+		mlx5_core_warn(dev, "corruption on srqn 0x%x\n", srq->srqn);
+		return -EINVAL;
+	}
+
+	memset(&in, 0, sizeof(in));
+	memset(&out, 0, sizeof(out));
+	in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_DESTROY_SRQ);
+	in.srqn = cpu_to_be32(srq->srqn);
+	err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out));
+	if (err)
+		return err;
+
+	if (out.hdr.status)
+		return mlx5_cmd_status_to_err(&out.hdr);
+
+	if (atomic_dec_and_test(&srq->refcount))
+		complete(&srq->free);
+	wait_for_completion(&srq->free);
+
+	return 0;
+}
+EXPORT_SYMBOL(mlx5_core_destroy_srq);
+
+int mlx5_core_query_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq,
+			struct mlx5_query_srq_mbox_out *out)
+{
+	struct mlx5_query_srq_mbox_in in;
+	int err;
+
+	memset(&in, 0, sizeof(in));
+	memset(out, 0, sizeof(*out));
+
+	in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_QUERY_SRQ);
+	in.srqn = cpu_to_be32(srq->srqn);
+	err = mlx5_cmd_exec(dev, &in, sizeof(in), out, sizeof(*out));
+	if (err)
+		return err;
+
+	if (out->hdr.status)
+		return mlx5_cmd_status_to_err(&out->hdr);
+
+	return err;
+}
+EXPORT_SYMBOL(mlx5_core_query_srq);
+
+int mlx5_core_arm_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq,
+		      u16 lwm, int is_srq)
+{
+	struct mlx5_arm_srq_mbox_in	in;
+	struct mlx5_arm_srq_mbox_out	out;
+	int err;
+
+	memset(&in, 0, sizeof(in));
+	memset(&out, 0, sizeof(out));
+
+	in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_ARM_RQ);
+	in.hdr.opmod = cpu_to_be16(!!is_srq);
+	in.srqn = cpu_to_be32(srq->srqn);
+	in.lwm = cpu_to_be16(lwm);
+
+	err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out));
+	if (err)
+		return err;
+
+	if (out.hdr.status)
+		return mlx5_cmd_status_to_err(&out.hdr);
+
+	return err;
+}
+EXPORT_SYMBOL(mlx5_core_arm_srq);
+
+void mlx5_init_srq_table(struct mlx5_core_dev *dev)
+{
+	struct mlx5_srq_table *table = &dev->priv.srq_table;
+
+	spin_lock_init(&table->lock);
+	INIT_RADIX_TREE(&table->tree, GFP_ATOMIC);
+}
+
+void mlx5_cleanup_srq_table(struct mlx5_core_dev *dev)
+{
+	/* nothing */
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/uar.c b/drivers/net/ethernet/mellanox/mlx5/core/uar.c
new file mode 100644
index 0000000..0a6348c
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/uar.c
@@ -0,0 +1,224 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/cmd.h>
+#include "mlx5_core.h"
+
+enum {
+	NUM_DRIVER_UARS		= 4,
+	NUM_LOW_LAT_UUARS	= 4,
+};
+
+
+struct mlx5_alloc_uar_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	u8			rsvd[8];
+};
+
+struct mlx5_alloc_uar_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	__be32			uarn;
+	u8			rsvd[4];
+};
+
+struct mlx5_free_uar_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	__be32			uarn;
+	u8			rsvd[4];
+};
+
+struct mlx5_free_uar_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	u8			rsvd[8];
+};
+
+int mlx5_cmd_alloc_uar(struct mlx5_core_dev *dev, u32 *uarn)
+{
+	struct mlx5_alloc_uar_mbox_in	in;
+	struct mlx5_alloc_uar_mbox_out	out;
+	int err;
+
+	memset(&in, 0, sizeof(in));
+	memset(&out, 0, sizeof(out));
+	in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_ALLOC_UAR);
+	err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out));
+	if (err)
+		goto ex;
+
+	if (out.hdr.status) {
+		err = mlx5_cmd_status_to_err(&out.hdr);
+		goto ex;
+	}
+
+	*uarn = be32_to_cpu(out.uarn) & 0xffffff;
+
+ex:
+	return err;
+}
+EXPORT_SYMBOL(mlx5_cmd_alloc_uar);
+
+int mlx5_cmd_free_uar(struct mlx5_core_dev *dev, u32 uarn)
+{
+	struct mlx5_free_uar_mbox_in	in;
+	struct mlx5_free_uar_mbox_out	out;
+	int err;
+
+	memset(&in, 0, sizeof(in));
+	in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_DEALLOC_UAR);
+	in.uarn = cpu_to_be32(uarn);
+	err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out));
+	if (err)
+		goto ex;
+
+	if (out.hdr.status)
+		err = mlx5_cmd_status_to_err(&out.hdr);
+
+ex:
+	return err;
+}
+EXPORT_SYMBOL(mlx5_cmd_free_uar);
+
+static int need_uuar_lock(int uuarn)
+{
+	int tot_uuars = NUM_DRIVER_UARS * MLX5_BF_REGS_PER_PAGE;
+
+	if (uuarn == 0 || tot_uuars - NUM_LOW_LAT_UUARS)
+		return 0;
+
+	return 1;
+}
+
+int mlx5_alloc_uuars(struct mlx5_core_dev *dev, struct mlx5_uuar_info *uuari)
+{
+	int tot_uuars = NUM_DRIVER_UARS * MLX5_BF_REGS_PER_PAGE;
+	struct mlx5_bf *bf;
+	phys_addr_t addr;
+	int err;
+	int i;
+
+	uuari->num_uars = NUM_DRIVER_UARS;
+	uuari->num_low_latency_uuars = NUM_LOW_LAT_UUARS;
+
+	mutex_init(&uuari->lock);
+	uuari->uars = kcalloc(uuari->num_uars, sizeof(*uuari->uars), GFP_KERNEL);
+	if (!uuari->uars)
+		return -ENOMEM;
+
+	uuari->bfs = kcalloc(tot_uuars, sizeof(*uuari->bfs), GFP_KERNEL);
+	if (!uuari->bfs) {
+		err = -ENOMEM;
+		goto out_uars;
+	}
+
+	uuari->bitmap = kcalloc(BITS_TO_LONGS(tot_uuars), sizeof(*uuari->bitmap),
+				GFP_KERNEL);
+	if (!uuari->bitmap) {
+		err = -ENOMEM;
+		goto out_bfs;
+	}
+
+	uuari->count = kcalloc(tot_uuars, sizeof(*uuari->count), GFP_KERNEL);
+	if (!uuari->count) {
+		err = -ENOMEM;
+		goto out_bitmap;
+	}
+
+	for (i = 0; i < uuari->num_uars; i++) {
+		err = mlx5_cmd_alloc_uar(dev, &uuari->uars[i].index);
+		if (err)
+			goto out_count;
+
+		addr = dev->iseg_base + ((phys_addr_t)(uuari->uars[i].index) << PAGE_SHIFT);
+		uuari->uars[i].map = ioremap(addr, PAGE_SIZE);
+		if (!uuari->uars[i].map) {
+			mlx5_cmd_free_uar(dev, uuari->uars[i].index);
+			err = -ENOMEM;
+			goto out_count;
+		}
+		mlx5_core_dbg(dev, "allocated uar index 0x%x, mmaped at %p\n",
+			      uuari->uars[i].index, uuari->uars[i].map);
+	}
+
+	for (i = 0; i < tot_uuars; i++) {
+		bf = &uuari->bfs[i];
+
+		bf->buf_size = dev->caps.gen.bf_reg_size / 2;
+		bf->uar = &uuari->uars[i / MLX5_BF_REGS_PER_PAGE];
+		bf->regreg = uuari->uars[i / MLX5_BF_REGS_PER_PAGE].map;
+		bf->reg = NULL; /* Add WC support */
+		bf->offset = (i % MLX5_BF_REGS_PER_PAGE) * dev->caps.gen.bf_reg_size +
+			MLX5_BF_OFFSET;
+		bf->need_lock = need_uuar_lock(i);
+		spin_lock_init(&bf->lock);
+		spin_lock_init(&bf->lock32);
+		bf->uuarn = i;
+	}
+
+	return 0;
+
+out_count:
+	for (i--; i >= 0; i--) {
+		iounmap(uuari->uars[i].map);
+		mlx5_cmd_free_uar(dev, uuari->uars[i].index);
+	}
+	kfree(uuari->count);
+
+out_bitmap:
+	kfree(uuari->bitmap);
+
+out_bfs:
+	kfree(uuari->bfs);
+
+out_uars:
+	kfree(uuari->uars);
+	return err;
+}
+
+int mlx5_free_uuars(struct mlx5_core_dev *dev, struct mlx5_uuar_info *uuari)
+{
+	int i = uuari->num_uars;
+
+	for (i--; i >= 0; i--) {
+		iounmap(uuari->uars[i].map);
+		mlx5_cmd_free_uar(dev, uuari->uars[i].index);
+	}
+
+	kfree(uuari->count);
+	kfree(uuari->bitmap);
+	kfree(uuari->bfs);
+	kfree(uuari->uars);
+
+	return 0;
+}
diff --git a/drivers/scsi/Makefile b/drivers/scsi/Makefile
new file mode 100644
index 0000000..59f1ce6
--- /dev/null
+++ b/drivers/scsi/Makefile
@@ -0,0 +1,199 @@
+#
+# Makefile for linux/drivers/scsi
+#
+# 30 May 2000, Christoph Hellwig <hch@infradead.org>
+# Rewritten to use lists instead of if-statements.
+#
+# 20 Sep 2000, Torben Mathiasen <tmm@image.dk>
+# Changed link order to reflect new scsi initialization.
+#
+# *!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!
+# The link order must be, SCSI Core, SCSI HBA drivers, and
+# lastly SCSI peripheral drivers (disk/tape/cdrom/etc.) to
+# satisfy certain initialization assumptions in the SCSI layer.
+# *!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!
+
+
+CFLAGS_aha152x.o =   -DAHA152X_STAT -DAUTOCONF
+CFLAGS_gdth.o    = # -DDEBUG_GDTH=2 -D__SERIAL__ -D__COM2__ -DGDTH_STATISTICS
+
+obj-$(CONFIG_PCMCIA)		+= pcmcia/
+
+obj-$(CONFIG_SCSI)		+= scsi_mod.o
+
+obj-$(CONFIG_RAID_ATTRS)	+= raid_class.o
+
+# --- NOTE ORDERING HERE ---
+# For kernel non-modular link, transport attributes need to
+# be initialised before drivers
+# --------------------------
+obj-$(CONFIG_SCSI_SPI_ATTRS)	+= scsi_transport_spi.o
+obj-$(CONFIG_SCSI_FC_ATTRS) 	+= scsi_transport_fc.o
+obj-$(CONFIG_SCSI_ISCSI_ATTRS)	+= scsi_transport_iscsi.o
+obj-$(CONFIG_SCSI_SAS_ATTRS)	+= scsi_transport_sas.o
+obj-$(CONFIG_SCSI_SAS_LIBSAS)	+= libsas/
+obj-$(CONFIG_SCSI_SRP_ATTRS)	+= scsi_transport_srp.o
+obj-$(CONFIG_SCSI_DH)		+= device_handler/
+
+obj-$(CONFIG_LIBFC)		+= libfc/
+obj-$(CONFIG_LIBFCOE)		+= fcoe/
+obj-$(CONFIG_FCOE)		+= fcoe/
+obj-$(CONFIG_FCOE_FNIC)		+= fnic/
+obj-$(CONFIG_SCSI_BNX2X_FCOE)	+= libfc/ fcoe/ bnx2fc/
+obj-$(CONFIG_ISCSI_TCP) 	+= libiscsi.o	libiscsi_tcp.o iscsi_tcp.o
+obj-$(CONFIG_INFINIBAND_ISER) 	+= libiscsi.o
+obj-$(CONFIG_ISCSI_BOOT_SYSFS)	+= iscsi_boot_sysfs.o
+obj-$(CONFIG_SCSI_A4000T)	+= 53c700.o	a4000t.o
+obj-$(CONFIG_SCSI_ZORRO7XX)	+= 53c700.o	zorro7xx.o
+obj-$(CONFIG_A3000_SCSI)	+= a3000.o	wd33c93.o
+obj-$(CONFIG_A2091_SCSI)	+= a2091.o	wd33c93.o
+obj-$(CONFIG_GVP11_SCSI)	+= gvp11.o	wd33c93.o
+obj-$(CONFIG_MVME147_SCSI)	+= mvme147.o	wd33c93.o
+obj-$(CONFIG_SGIWD93_SCSI)	+= sgiwd93.o	wd33c93.o
+obj-$(CONFIG_ATARI_SCSI)	+= atari_scsi.o
+obj-$(CONFIG_MAC_SCSI)		+= mac_scsi.o
+obj-$(CONFIG_SCSI_MAC_ESP)	+= esp_scsi.o	mac_esp.o
+obj-$(CONFIG_SUN3_SCSI)		+= sun3_scsi.o  sun3_scsi_vme.o
+obj-$(CONFIG_MVME16x_SCSI)	+= 53c700.o	mvme16x_scsi.o
+obj-$(CONFIG_BVME6000_SCSI)	+= 53c700.o	bvme6000_scsi.o
+obj-$(CONFIG_SCSI_SIM710)	+= 53c700.o	sim710.o
+obj-$(CONFIG_SCSI_ADVANSYS)	+= advansys.o
+obj-$(CONFIG_SCSI_BUSLOGIC)	+= BusLogic.o
+obj-$(CONFIG_SCSI_DPT_I2O)	+= dpt_i2o.o
+obj-$(CONFIG_SCSI_U14_34F)	+= u14-34f.o
+obj-$(CONFIG_SCSI_ARCMSR)	+= arcmsr/
+obj-$(CONFIG_SCSI_ULTRASTOR)	+= ultrastor.o
+obj-$(CONFIG_SCSI_AHA152X)	+= aha152x.o
+obj-$(CONFIG_SCSI_AHA1542)	+= aha1542.o
+obj-$(CONFIG_SCSI_AHA1740)	+= aha1740.o
+obj-$(CONFIG_SCSI_AIC7XXX)	+= aic7xxx/
+obj-$(CONFIG_SCSI_AIC79XX)	+= aic7xxx/
+obj-$(CONFIG_SCSI_AACRAID)	+= aacraid/
+obj-$(CONFIG_SCSI_AIC94XX)	+= aic94xx/
+obj-$(CONFIG_SCSI_PM8001)	+= pm8001/
+obj-$(CONFIG_SCSI_ISCI)		+= isci/
+obj-$(CONFIG_SCSI_IPS)		+= ips.o
+obj-$(CONFIG_SCSI_FUTURE_DOMAIN)+= fdomain.o
+obj-$(CONFIG_SCSI_IN2000)	+= in2000.o
+obj-$(CONFIG_SCSI_GENERIC_NCR5380) += g_NCR5380.o
+obj-$(CONFIG_SCSI_GENERIC_NCR5380_MMIO) += g_NCR5380_mmio.o
+obj-$(CONFIG_SCSI_NCR53C406A)	+= NCR53c406a.o
+obj-$(CONFIG_SCSI_NCR_D700)	+= 53c700.o NCR_D700.o
+obj-$(CONFIG_SCSI_NCR_Q720)	+= NCR_Q720_mod.o
+obj-$(CONFIG_SCSI_SYM53C416)	+= sym53c416.o
+obj-$(CONFIG_SCSI_QLOGIC_FAS)	+= qlogicfas408.o	qlogicfas.o
+obj-$(CONFIG_PCMCIA_QLOGIC)	+= qlogicfas408.o
+obj-$(CONFIG_SCSI_QLOGIC_1280)	+= qla1280.o 
+obj-$(CONFIG_SCSI_QLA_FC)	+= qla2xxx/
+obj-$(CONFIG_SCSI_QLA_ISCSI)	+= libiscsi.o qla4xxx/
+obj-$(CONFIG_SCSI_LPFC)		+= lpfc/
+obj-$(CONFIG_SCSI_BFA_FC)	+= bfa/
+obj-$(CONFIG_SCSI_CHELSIO_FCOE)	+= csiostor/
+obj-$(CONFIG_SCSI_PAS16)	+= pas16.o
+obj-$(CONFIG_SCSI_T128)		+= t128.o
+obj-$(CONFIG_SCSI_DMX3191D)	+= dmx3191d.o
+obj-$(CONFIG_SCSI_HPSA)		+= hpsa.o
+obj-$(CONFIG_SCSI_DTC3280)	+= dtc.o
+obj-$(CONFIG_SCSI_SYM53C8XX_2)	+= sym53c8xx_2/
+obj-$(CONFIG_SCSI_ZALON)	+= zalon7xx.o
+obj-$(CONFIG_SCSI_EATA_PIO)	+= eata_pio.o
+obj-$(CONFIG_SCSI_7000FASST)	+= wd7000.o
+obj-$(CONFIG_SCSI_EATA)		+= eata.o
+obj-$(CONFIG_SCSI_DC395x)	+= dc395x.o
+obj-$(CONFIG_SCSI_DC390T)	+= tmscsim.o
+obj-$(CONFIG_MEGARAID_LEGACY)	+= megaraid.o
+obj-$(CONFIG_MEGARAID_NEWGEN)	+= megaraid/
+obj-$(CONFIG_MEGARAID_SAS)	+= megaraid/
+obj-$(CONFIG_SCSI_MPT2SAS)	+= mpt2sas/
+obj-$(CONFIG_SCSI_MPT3SAS)	+= mpt3sas/
+obj-$(CONFIG_SCSI_UFSHCD)	+= ufs/
+obj-$(CONFIG_SCSI_ACARD)	+= atp870u.o
+obj-$(CONFIG_SCSI_SUNESP)	+= esp_scsi.o	sun_esp.o
+obj-$(CONFIG_SCSI_GDTH)		+= gdth.o
+obj-$(CONFIG_SCSI_INITIO)	+= initio.o
+obj-$(CONFIG_SCSI_INIA100)	+= a100u2w.o
+obj-$(CONFIG_SCSI_QLOGICPTI)	+= qlogicpti.o
+obj-$(CONFIG_SCSI_MESH)		+= mesh.o
+obj-$(CONFIG_SCSI_MAC53C94)	+= mac53c94.o
+obj-$(CONFIG_BLK_DEV_3W_XXXX_RAID) += 3w-xxxx.o
+obj-$(CONFIG_SCSI_3W_9XXX)	+= 3w-9xxx.o
+obj-$(CONFIG_SCSI_3W_SAS)	+= 3w-sas.o
+obj-$(CONFIG_SCSI_PPA)		+= ppa.o
+obj-$(CONFIG_SCSI_IMM)		+= imm.o
+obj-$(CONFIG_JAZZ_ESP)		+= esp_scsi.o	jazz_esp.o
+obj-$(CONFIG_SUN3X_ESP)		+= esp_scsi.o	sun3x_esp.o
+obj-$(CONFIG_SCSI_LASI700)	+= 53c700.o lasi700.o
+obj-$(CONFIG_SCSI_SNI_53C710)	+= 53c700.o sni_53c710.o
+obj-$(CONFIG_SCSI_NSP32)	+= nsp32.o
+obj-$(CONFIG_SCSI_IPR)		+= ipr.o
+obj-$(CONFIG_SCSI_IBMVSCSI)	+= ibmvscsi/
+obj-$(CONFIG_SCSI_IBMVFC)	+= ibmvscsi/
+obj-$(CONFIG_SCSI_HPTIOP)	+= hptiop.o
+obj-$(CONFIG_SCSI_STEX)		+= stex.o
+obj-$(CONFIG_SCSI_MVSAS)	+= mvsas/
+obj-$(CONFIG_SCSI_MVUMI)	+= mvumi.o
+obj-$(CONFIG_PS3_ROM)		+= ps3rom.o
+obj-$(CONFIG_SCSI_CXGB3_ISCSI)	+= libiscsi.o libiscsi_tcp.o cxgbi/
+obj-$(CONFIG_SCSI_CXGB4_ISCSI)	+= libiscsi.o libiscsi_tcp.o cxgbi/
+obj-$(CONFIG_SCSI_BNX2_ISCSI)	+= libiscsi.o bnx2i/
+obj-$(CONFIG_BE2ISCSI)		+= libiscsi.o be2iscsi/
+obj-$(CONFIG_SCSI_ESAS2R)	+= esas2r/
+obj-$(CONFIG_SCSI_PMCRAID)	+= pmcraid.o
+obj-$(CONFIG_SCSI_VIRTIO)	+= virtio_scsi.o
+obj-$(CONFIG_VMWARE_PVSCSI)	+= vmw_pvscsi.o
+obj-$(CONFIG_XEN_SCSI_FRONTEND)	+= xen-scsifront.o
+obj-$(CONFIG_HYPERV_STORAGE)	+= hv_storvsc.o
+
+obj-$(CONFIG_ARM)		+= arm/
+
+obj-$(CONFIG_CHR_DEV_ST)	+= st.o
+obj-$(CONFIG_CHR_DEV_OSST)	+= osst.o
+obj-$(CONFIG_BLK_DEV_SD)	+= sd_mod.o
+obj-$(CONFIG_BLK_DEV_SR)	+= sr_mod.o
+obj-$(CONFIG_CHR_DEV_SG)	+= sg.o
+obj-$(CONFIG_CHR_DEV_SCH)	+= ch.o
+obj-$(CONFIG_SCSI_ENCLOSURE)	+= ses.o
+
+obj-$(CONFIG_SCSI_OSD_INITIATOR) += osd/
+
+# This goes last, so that "real" scsi devices probe earlier
+obj-$(CONFIG_SCSI_DEBUG)	+= scsi_debug.o
+
+scsi_mod-y			+= scsi.o hosts.o scsi_ioctl.o constants.o \
+				   scsicam.o scsi_error.o scsi_lib.o
+scsi_mod-$(CONFIG_SCSI_DMA)	+= scsi_lib_dma.o
+scsi_mod-y			+= scsi_scan.o scsi_sysfs.o scsi_devinfo.o
+scsi_mod-$(CONFIG_SCSI_NETLINK)	+= scsi_netlink.o
+scsi_mod-$(CONFIG_SYSCTL)	+= scsi_sysctl.o
+scsi_mod-$(CONFIG_SCSI_PROC_FS)	+= scsi_proc.o
+scsi_mod-y			+= scsi_trace.o
+scsi_mod-$(CONFIG_PM)		+= scsi_pm.o
+
+hv_storvsc-y			:= storvsc_drv.o
+
+sd_mod-objs	:= sd.o
+sd_mod-$(CONFIG_BLK_DEV_INTEGRITY) += sd_dif.o
+
+sr_mod-objs	:= sr.o sr_ioctl.o sr_vendor.o
+ncr53c8xx-flags-$(CONFIG_SCSI_ZALON) \
+		:= -DCONFIG_NCR53C8XX_PREFETCH -DSCSI_NCR_BIG_ENDIAN \
+			-DCONFIG_SCSI_NCR53C8XX_NO_WORD_TRANSFERS
+CFLAGS_ncr53c8xx.o	:= $(ncr53c8xx-flags-y) $(ncr53c8xx-flags-m)
+zalon7xx-objs	:= zalon.o ncr53c8xx.o
+NCR_Q720_mod-objs	:= NCR_Q720.o ncr53c8xx.o
+oktagon_esp_mod-objs	:= oktagon_esp.o oktagon_io.o
+
+# Files generated that shall be removed upon make clean
+clean-files :=	53c700_d.h 53c700_u.h
+
+$(obj)/53c700.o $(MODVERDIR)/$(obj)/53c700.ver: $(obj)/53c700_d.h
+
+# If you want to play with the firmware, uncomment
+# GENERATE_FIRMWARE := 1
+
+ifdef GENERATE_FIRMWARE
+
+$(obj)/53c700_d.h: $(src)/53c700.scr $(src)/script_asm.pl
+	$(PERL) -s $(src)/script_asm.pl -ncr7x0_family $@ $(@:_d.h=_u.h) < $<
+
+endif
diff --git a/drivers/scsi/cxgbi/Kconfig b/drivers/scsi/cxgbi/Kconfig
new file mode 100644
index 0000000..17eb5d5
--- /dev/null
+++ b/drivers/scsi/cxgbi/Kconfig
@@ -0,0 +1,2 @@
+source "drivers/scsi/cxgbi/cxgb3i/Kconfig"
+source "drivers/scsi/cxgbi/cxgb4i/Kconfig"
diff --git a/drivers/scsi/cxgbi/Makefile b/drivers/scsi/cxgbi/Makefile
new file mode 100644
index 0000000..86007e3
--- /dev/null
+++ b/drivers/scsi/cxgbi/Makefile
@@ -0,0 +1,2 @@
+obj-$(CONFIG_SCSI_CXGB3_ISCSI)	+= libcxgbi.o cxgb3i/
+obj-$(CONFIG_SCSI_CXGB4_ISCSI)	+= libcxgbi.o cxgb4i/
diff --git a/drivers/scsi/cxgbi/cxgb3i/Kbuild b/drivers/scsi/cxgbi/cxgb3i/Kbuild
new file mode 100644
index 0000000..6f095e2
--- /dev/null
+++ b/drivers/scsi/cxgbi/cxgb3i/Kbuild
@@ -0,0 +1,3 @@
+EXTRA_CFLAGS += -I$(srctree)/drivers/net/ethernet/chelsio/cxgb3
+
+obj-$(CONFIG_SCSI_CXGB3_ISCSI) += cxgb3i.o
diff --git a/drivers/scsi/cxgbi/cxgb3i/Kconfig b/drivers/scsi/cxgbi/cxgb3i/Kconfig
new file mode 100644
index 0000000..e460398
--- /dev/null
+++ b/drivers/scsi/cxgbi/cxgb3i/Kconfig
@@ -0,0 +1,10 @@
+config SCSI_CXGB3_ISCSI
+	tristate "Chelsio T3 iSCSI support"
+	depends on PCI && INET && (IPV6 || IPV6=n)
+	select NETDEVICES
+	select ETHERNET
+	select NET_VENDOR_CHELSIO
+	select CHELSIO_T3
+	select SCSI_ISCSI_ATTRS
+	---help---
+	  This driver supports iSCSI offload for the Chelsio T3 devices.
diff --git a/drivers/scsi/cxgbi/cxgb3i/cxgb3i.c b/drivers/scsi/cxgbi/cxgb3i/cxgb3i.c
new file mode 100644
index 0000000..49692a1
--- /dev/null
+++ b/drivers/scsi/cxgbi/cxgb3i/cxgb3i.c
@@ -0,0 +1,1413 @@
+/*
+ * cxgb3i_offload.c: Chelsio S3xx iscsi offloaded tcp connection management
+ *
+ * Copyright (C) 2003-2008 Chelsio Communications.  All rights reserved.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the LICENSE file included in this
+ * release for licensing terms and conditions.
+ *
+ * Written by:	Dimitris Michailidis (dm@chelsio.com)
+ *		Karen Xie (kxie@chelsio.com)
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <scsi/scsi_host.h>
+
+#include "common.h"
+#include "t3_cpl.h"
+#include "t3cdev.h"
+#include "cxgb3_defs.h"
+#include "cxgb3_ctl_defs.h"
+#include "cxgb3_offload.h"
+#include "firmware_exports.h"
+#include "cxgb3i.h"
+
+static unsigned int dbg_level;
+#include "../libcxgbi.h"
+
+#define DRV_MODULE_NAME         "cxgb3i"
+#define DRV_MODULE_DESC         "Chelsio T3 iSCSI Driver"
+#define DRV_MODULE_VERSION	"2.0.0"
+#define DRV_MODULE_RELDATE	"Jun. 2010"
+
+static char version[] =
+	DRV_MODULE_DESC " " DRV_MODULE_NAME
+	" v" DRV_MODULE_VERSION " (" DRV_MODULE_RELDATE ")\n";
+
+MODULE_AUTHOR("Chelsio Communications, Inc.");
+MODULE_DESCRIPTION(DRV_MODULE_DESC);
+MODULE_VERSION(DRV_MODULE_VERSION);
+MODULE_LICENSE("GPL");
+
+module_param(dbg_level, uint, 0644);
+MODULE_PARM_DESC(dbg_level, "debug flag (default=0)");
+
+static int cxgb3i_rcv_win = 256 * 1024;
+module_param(cxgb3i_rcv_win, int, 0644);
+MODULE_PARM_DESC(cxgb3i_rcv_win, "TCP receive window in bytes (default=256KB)");
+
+static int cxgb3i_snd_win = 128 * 1024;
+module_param(cxgb3i_snd_win, int, 0644);
+MODULE_PARM_DESC(cxgb3i_snd_win, "TCP send window in bytes (default=128KB)");
+
+static int cxgb3i_rx_credit_thres = 10 * 1024;
+module_param(cxgb3i_rx_credit_thres, int, 0644);
+MODULE_PARM_DESC(rx_credit_thres,
+		 "RX credits return threshold in bytes (default=10KB)");
+
+static unsigned int cxgb3i_max_connect = 8 * 1024;
+module_param(cxgb3i_max_connect, uint, 0644);
+MODULE_PARM_DESC(cxgb3i_max_connect, "Max. # of connections (default=8092)");
+
+static unsigned int cxgb3i_sport_base = 20000;
+module_param(cxgb3i_sport_base, uint, 0644);
+MODULE_PARM_DESC(cxgb3i_sport_base, "starting port number (default=20000)");
+
+static void cxgb3i_dev_open(struct t3cdev *);
+static void cxgb3i_dev_close(struct t3cdev *);
+static void cxgb3i_dev_event_handler(struct t3cdev *, u32, u32);
+
+static struct cxgb3_client t3_client = {
+	.name = DRV_MODULE_NAME,
+	.handlers = cxgb3i_cpl_handlers,
+	.add = cxgb3i_dev_open,
+	.remove = cxgb3i_dev_close,
+	.event_handler = cxgb3i_dev_event_handler,
+};
+
+static struct scsi_host_template cxgb3i_host_template = {
+	.module		= THIS_MODULE,
+	.name		= DRV_MODULE_NAME,
+	.proc_name	= DRV_MODULE_NAME,
+	.can_queue	= CXGB3I_SCSI_HOST_QDEPTH,
+	.queuecommand	= iscsi_queuecommand,
+	.change_queue_depth = iscsi_change_queue_depth,
+	.sg_tablesize	= SG_ALL,
+	.max_sectors	= 0xFFFF,
+	.cmd_per_lun	= ISCSI_DEF_CMD_PER_LUN,
+	.eh_abort_handler = iscsi_eh_abort,
+	.eh_device_reset_handler = iscsi_eh_device_reset,
+	.eh_target_reset_handler = iscsi_eh_recover_target,
+	.target_alloc	= iscsi_target_alloc,
+	.use_clustering	= DISABLE_CLUSTERING,
+	.this_id	= -1,
+};
+
+static struct iscsi_transport cxgb3i_iscsi_transport = {
+	.owner		= THIS_MODULE,
+	.name		= DRV_MODULE_NAME,
+	/* owner and name should be set already */
+	.caps		= CAP_RECOVERY_L0 | CAP_MULTI_R2T | CAP_HDRDGST
+				| CAP_DATADGST | CAP_DIGEST_OFFLOAD |
+				CAP_PADDING_OFFLOAD | CAP_TEXT_NEGO,
+	.attr_is_visible	= cxgbi_attr_is_visible,
+	.get_host_param	= cxgbi_get_host_param,
+	.set_host_param	= cxgbi_set_host_param,
+	/* session management */
+	.create_session	= cxgbi_create_session,
+	.destroy_session	= cxgbi_destroy_session,
+	.get_session_param = iscsi_session_get_param,
+	/* connection management */
+	.create_conn	= cxgbi_create_conn,
+	.bind_conn	= cxgbi_bind_conn,
+	.destroy_conn	= iscsi_tcp_conn_teardown,
+	.start_conn	= iscsi_conn_start,
+	.stop_conn	= iscsi_conn_stop,
+	.get_conn_param	= iscsi_conn_get_param,
+	.set_param	= cxgbi_set_conn_param,
+	.get_stats	= cxgbi_get_conn_stats,
+	/* pdu xmit req from user space */
+	.send_pdu	= iscsi_conn_send_pdu,
+	/* task */
+	.init_task	= iscsi_tcp_task_init,
+	.xmit_task	= iscsi_tcp_task_xmit,
+	.cleanup_task	= cxgbi_cleanup_task,
+	/* pdu */
+	.alloc_pdu	= cxgbi_conn_alloc_pdu,
+	.init_pdu	= cxgbi_conn_init_pdu,
+	.xmit_pdu	= cxgbi_conn_xmit_pdu,
+	.parse_pdu_itt	= cxgbi_parse_pdu_itt,
+	/* TCP connect/disconnect */
+	.get_ep_param	= cxgbi_get_ep_param,
+	.ep_connect	= cxgbi_ep_connect,
+	.ep_poll	= cxgbi_ep_poll,
+	.ep_disconnect	= cxgbi_ep_disconnect,
+	/* Error recovery timeout call */
+	.session_recovery_timedout = iscsi_session_recovery_timedout,
+};
+
+static struct scsi_transport_template *cxgb3i_stt;
+
+/*
+ * CPL (Chelsio Protocol Language) defines a message passing interface between
+ * the host driver and Chelsio asic.
+ * The section below implments CPLs that related to iscsi tcp connection
+ * open/close/abort and data send/receive.
+ */
+
+static int push_tx_frames(struct cxgbi_sock *csk, int req_completion);
+
+static void send_act_open_req(struct cxgbi_sock *csk, struct sk_buff *skb,
+			      const struct l2t_entry *e)
+{
+	unsigned int wscale = cxgbi_sock_compute_wscale(cxgb3i_rcv_win);
+	struct cpl_act_open_req *req = (struct cpl_act_open_req *)skb->head;
+
+	skb->priority = CPL_PRIORITY_SETUP;
+
+	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, csk->atid));
+	req->local_port = csk->saddr.sin_port;
+	req->peer_port = csk->daddr.sin_port;
+	req->local_ip = csk->saddr.sin_addr.s_addr;
+	req->peer_ip = csk->daddr.sin_addr.s_addr;
+
+	req->opt0h = htonl(V_KEEP_ALIVE(1) | F_TCAM_BYPASS |
+			V_WND_SCALE(wscale) | V_MSS_IDX(csk->mss_idx) |
+			V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx));
+	req->opt0l = htonl(V_ULP_MODE(ULP2_MODE_ISCSI) |
+			V_RCV_BUFSIZ(cxgb3i_rcv_win>>10));
+
+	log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK,
+		"csk 0x%p,%u,0x%lx,%u, %pI4:%u-%pI4:%u, %u,%u,%u.\n",
+		csk, csk->state, csk->flags, csk->atid,
+		&req->local_ip, ntohs(req->local_port),
+		&req->peer_ip, ntohs(req->peer_port),
+		csk->mss_idx, e->idx, e->smt_idx);
+
+	l2t_send(csk->cdev->lldev, skb, csk->l2t);
+}
+
+static inline void act_open_arp_failure(struct t3cdev *dev, struct sk_buff *skb)
+{
+	cxgbi_sock_act_open_req_arp_failure(NULL, skb);
+}
+
+/*
+ * CPL connection close request: host ->
+ *
+ * Close a connection by sending a CPL_CLOSE_CON_REQ message and queue it to
+ * the write queue (i.e., after any unsent txt data).
+ */
+static void send_close_req(struct cxgbi_sock *csk)
+{
+	struct sk_buff *skb = csk->cpl_close;
+	struct cpl_close_con_req *req = (struct cpl_close_con_req *)skb->head;
+	unsigned int tid = csk->tid;
+
+	log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK,
+		"csk 0x%p,%u,0x%lx,%u.\n",
+		csk, csk->state, csk->flags, csk->tid);
+
+	csk->cpl_close = NULL;
+	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON));
+	req->wr.wr_lo = htonl(V_WR_TID(tid));
+	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
+	req->rsvd = htonl(csk->write_seq);
+
+	cxgbi_sock_skb_entail(csk, skb);
+	if (csk->state >= CTP_ESTABLISHED)
+		push_tx_frames(csk, 1);
+}
+
+/*
+ * CPL connection abort request: host ->
+ *
+ * Send an ABORT_REQ message. Makes sure we do not send multiple ABORT_REQs
+ * for the same connection and also that we do not try to send a message
+ * after the connection has closed.
+ */
+static void abort_arp_failure(struct t3cdev *tdev, struct sk_buff *skb)
+{
+	struct cpl_abort_req *req = cplhdr(skb);
+
+	log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK,
+		"t3dev 0x%p, tid %u, skb 0x%p.\n",
+		tdev, GET_TID(req), skb);
+	req->cmd = CPL_ABORT_NO_RST;
+	cxgb3_ofld_send(tdev, skb);
+}
+
+static void send_abort_req(struct cxgbi_sock *csk)
+{
+	struct sk_buff *skb = csk->cpl_abort_req;
+	struct cpl_abort_req *req;
+
+	if (unlikely(csk->state == CTP_ABORTING || !skb))
+		return;
+	cxgbi_sock_set_state(csk, CTP_ABORTING);
+	cxgbi_sock_set_flag(csk, CTPF_ABORT_RPL_PENDING);
+	/* Purge the send queue so we don't send anything after an abort. */
+	cxgbi_sock_purge_write_queue(csk);
+
+	csk->cpl_abort_req = NULL;
+	req = (struct cpl_abort_req *)skb->head;
+	skb->priority = CPL_PRIORITY_DATA;
+	set_arp_failure_handler(skb, abort_arp_failure);
+	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ));
+	req->wr.wr_lo = htonl(V_WR_TID(csk->tid));
+	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, csk->tid));
+	req->rsvd0 = htonl(csk->snd_nxt);
+	req->rsvd1 = !cxgbi_sock_flag(csk, CTPF_TX_DATA_SENT);
+	req->cmd = CPL_ABORT_SEND_RST;
+
+	log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK,
+		"csk 0x%p,%u,0x%lx,%u, snd_nxt %u, 0x%x.\n",
+		csk, csk->state, csk->flags, csk->tid, csk->snd_nxt,
+		req->rsvd1);
+
+	l2t_send(csk->cdev->lldev, skb, csk->l2t);
+}
+
+/*
+ * CPL connection abort reply: host ->
+ *
+ * Send an ABORT_RPL message in response of the ABORT_REQ received.
+ */
+static void send_abort_rpl(struct cxgbi_sock *csk, int rst_status)
+{
+	struct sk_buff *skb = csk->cpl_abort_rpl;
+	struct cpl_abort_rpl *rpl = (struct cpl_abort_rpl *)skb->head;
+
+	log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK,
+		"csk 0x%p,%u,0x%lx,%u, status %d.\n",
+		csk, csk->state, csk->flags, csk->tid, rst_status);
+
+	csk->cpl_abort_rpl = NULL;
+	skb->priority = CPL_PRIORITY_DATA;
+	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL));
+	rpl->wr.wr_lo = htonl(V_WR_TID(csk->tid));
+	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, csk->tid));
+	rpl->cmd = rst_status;
+	cxgb3_ofld_send(csk->cdev->lldev, skb);
+}
+
+/*
+ * CPL connection rx data ack: host ->
+ * Send RX credits through an RX_DATA_ACK CPL message. Returns the number of
+ * credits sent.
+ */
+static u32 send_rx_credits(struct cxgbi_sock *csk, u32 credits)
+{
+	struct sk_buff *skb;
+	struct cpl_rx_data_ack *req;
+	u32 dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);
+
+	log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_PDU_RX,
+		"csk 0x%p,%u,0x%lx,%u, credit %u, dack %u.\n",
+		csk, csk->state, csk->flags, csk->tid, credits, dack);
+
+	skb = alloc_wr(sizeof(*req), 0, GFP_ATOMIC);
+	if (!skb) {
+		pr_info("csk 0x%p, credit %u, OOM.\n", csk, credits);
+		return 0;
+	}
+	req = (struct cpl_rx_data_ack *)skb->head;
+	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, csk->tid));
+	req->credit_dack = htonl(F_RX_DACK_CHANGE | V_RX_DACK_MODE(1) |
+				V_RX_CREDITS(credits));
+	skb->priority = CPL_PRIORITY_ACK;
+	cxgb3_ofld_send(csk->cdev->lldev, skb);
+	return credits;
+}
+
+/*
+ * CPL connection tx data: host ->
+ *
+ * Send iscsi PDU via TX_DATA CPL message. Returns the number of
+ * credits sent.
+ * Each TX_DATA consumes work request credit (wrs), so we need to keep track of
+ * how many we've used so far and how many are pending (i.e., yet ack'ed by T3).
+ */
+
+static unsigned int wrlen __read_mostly;
+static unsigned int skb_wrs[SKB_WR_LIST_SIZE] __read_mostly;
+
+static void init_wr_tab(unsigned int wr_len)
+{
+	int i;
+
+	if (skb_wrs[1])		/* already initialized */
+		return;
+	for (i = 1; i < SKB_WR_LIST_SIZE; i++) {
+		int sgl_len = (3 * i) / 2 + (i & 1);
+
+		sgl_len += 3;
+		skb_wrs[i] = (sgl_len <= wr_len
+			      ? 1 : 1 + (sgl_len - 2) / (wr_len - 1));
+	}
+	wrlen = wr_len * 8;
+}
+
+static inline void make_tx_data_wr(struct cxgbi_sock *csk, struct sk_buff *skb,
+				   int len, int req_completion)
+{
+	struct tx_data_wr *req;
+	struct l2t_entry *l2t = csk->l2t;
+
+	skb_reset_transport_header(skb);
+	req = (struct tx_data_wr *)__skb_push(skb, sizeof(*req));
+	req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA) |
+			(req_completion ? F_WR_COMPL : 0));
+	req->wr_lo = htonl(V_WR_TID(csk->tid));
+	/* len includes the length of any HW ULP additions */
+	req->len = htonl(len);
+	/* V_TX_ULP_SUBMODE sets both the mode and submode */
+	req->flags = htonl(V_TX_ULP_SUBMODE(cxgbi_skcb_ulp_mode(skb)) |
+			   V_TX_SHOVE((skb_peek(&csk->write_queue) ? 0 : 1)));
+	req->sndseq = htonl(csk->snd_nxt);
+	req->param = htonl(V_TX_PORT(l2t->smt_idx));
+
+	if (!cxgbi_sock_flag(csk, CTPF_TX_DATA_SENT)) {
+		req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT |
+				    V_TX_CPU_IDX(csk->rss_qid));
+		/* sendbuffer is in units of 32KB. */
+		req->param |= htonl(V_TX_SNDBUF(cxgb3i_snd_win >> 15));
+		cxgbi_sock_set_flag(csk, CTPF_TX_DATA_SENT);
+	}
+}
+
+/**
+ * push_tx_frames -- start transmit
+ * @c3cn: the offloaded connection
+ * @req_completion: request wr_ack or not
+ *
+ * Prepends TX_DATA_WR or CPL_CLOSE_CON_REQ headers to buffers waiting in a
+ * connection's send queue and sends them on to T3.  Must be called with the
+ * connection's lock held.  Returns the amount of send buffer space that was
+ * freed as a result of sending queued data to T3.
+ */
+
+static void arp_failure_skb_discard(struct t3cdev *dev, struct sk_buff *skb)
+{
+	kfree_skb(skb);
+}
+
+static int push_tx_frames(struct cxgbi_sock *csk, int req_completion)
+{
+	int total_size = 0;
+	struct sk_buff *skb;
+
+	if (unlikely(csk->state < CTP_ESTABLISHED ||
+		csk->state == CTP_CLOSE_WAIT_1 || csk->state >= CTP_ABORTING)) {
+			log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_PDU_TX,
+				"csk 0x%p,%u,0x%lx,%u, in closing state.\n",
+				csk, csk->state, csk->flags, csk->tid);
+		return 0;
+	}
+
+	while (csk->wr_cred && (skb = skb_peek(&csk->write_queue)) != NULL) {
+		int len = skb->len;	/* length before skb_push */
+		int frags = skb_shinfo(skb)->nr_frags + (len != skb->data_len);
+		int wrs_needed = skb_wrs[frags];
+
+		if (wrs_needed > 1 && len + sizeof(struct tx_data_wr) <= wrlen)
+			wrs_needed = 1;
+
+		WARN_ON(frags >= SKB_WR_LIST_SIZE || wrs_needed < 1);
+
+		if (csk->wr_cred < wrs_needed) {
+			log_debug(1 << CXGBI_DBG_PDU_TX,
+				"csk 0x%p, skb len %u/%u, frag %u, wr %d<%u.\n",
+				csk, skb->len, skb->data_len, frags,
+				wrs_needed, csk->wr_cred);
+			break;
+		}
+
+		__skb_unlink(skb, &csk->write_queue);
+		skb->priority = CPL_PRIORITY_DATA;
+		skb->csum = wrs_needed;	/* remember this until the WR_ACK */
+		csk->wr_cred -= wrs_needed;
+		csk->wr_una_cred += wrs_needed;
+		cxgbi_sock_enqueue_wr(csk, skb);
+
+		log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_PDU_TX,
+			"csk 0x%p, enqueue, skb len %u/%u, frag %u, wr %d, "
+			"left %u, unack %u.\n",
+			csk, skb->len, skb->data_len, frags, skb->csum,
+			csk->wr_cred, csk->wr_una_cred);
+
+		if (likely(cxgbi_skcb_test_flag(skb, SKCBF_TX_NEED_HDR))) {
+			if ((req_completion &&
+				csk->wr_una_cred == wrs_needed) ||
+			     csk->wr_una_cred >= csk->wr_max_cred / 2) {
+				req_completion = 1;
+				csk->wr_una_cred = 0;
+			}
+			len += cxgbi_ulp_extra_len(cxgbi_skcb_ulp_mode(skb));
+			make_tx_data_wr(csk, skb, len, req_completion);
+			csk->snd_nxt += len;
+			cxgbi_skcb_clear_flag(skb, SKCBF_TX_NEED_HDR);
+		}
+		total_size += skb->truesize;
+		log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_PDU_TX,
+			"csk 0x%p, tid 0x%x, send skb 0x%p.\n",
+			csk, csk->tid, skb);
+		set_arp_failure_handler(skb, arp_failure_skb_discard);
+		l2t_send(csk->cdev->lldev, skb, csk->l2t);
+	}
+	return total_size;
+}
+
+/*
+ * Process a CPL_ACT_ESTABLISH message: -> host
+ * Updates connection state from an active establish CPL message.  Runs with
+ * the connection lock held.
+ */
+
+static inline void free_atid(struct cxgbi_sock *csk)
+{
+	if (cxgbi_sock_flag(csk, CTPF_HAS_ATID)) {
+		cxgb3_free_atid(csk->cdev->lldev, csk->atid);
+		cxgbi_sock_clear_flag(csk, CTPF_HAS_ATID);
+		cxgbi_sock_put(csk);
+	}
+}
+
+static int do_act_establish(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
+{
+	struct cxgbi_sock *csk = ctx;
+	struct cpl_act_establish *req = cplhdr(skb);
+	unsigned int tid = GET_TID(req);
+	unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid));
+	u32 rcv_isn = ntohl(req->rcv_isn);	/* real RCV_ISN + 1 */
+
+	log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK,
+		"atid 0x%x,tid 0x%x, csk 0x%p,%u,0x%lx, isn %u.\n",
+		atid, atid, csk, csk->state, csk->flags, rcv_isn);
+
+	cxgbi_sock_get(csk);
+	cxgbi_sock_set_flag(csk, CTPF_HAS_TID);
+	csk->tid = tid;
+	cxgb3_insert_tid(csk->cdev->lldev, &t3_client, csk, tid);
+
+	free_atid(csk);
+
+	csk->rss_qid = G_QNUM(ntohs(skb->csum));
+
+	spin_lock_bh(&csk->lock);
+	if (csk->retry_timer.function) {
+		del_timer(&csk->retry_timer);
+		csk->retry_timer.function = NULL;
+	}
+
+	if (unlikely(csk->state != CTP_ACTIVE_OPEN))
+		pr_info("csk 0x%p,%u,0x%lx,%u, got EST.\n",
+			csk, csk->state, csk->flags, csk->tid);
+
+	csk->copied_seq = csk->rcv_wup = csk->rcv_nxt = rcv_isn;
+	if (cxgb3i_rcv_win > (M_RCV_BUFSIZ << 10))
+		csk->rcv_wup -= cxgb3i_rcv_win - (M_RCV_BUFSIZ << 10);
+
+	cxgbi_sock_established(csk, ntohl(req->snd_isn), ntohs(req->tcp_opt));
+
+	if (unlikely(cxgbi_sock_flag(csk, CTPF_ACTIVE_CLOSE_NEEDED)))
+		/* upper layer has requested closing */
+		send_abort_req(csk);
+	else {
+		if (skb_queue_len(&csk->write_queue))
+			push_tx_frames(csk, 1);
+		cxgbi_conn_tx_open(csk);
+	}
+
+	spin_unlock_bh(&csk->lock);
+	__kfree_skb(skb);
+	return 0;
+}
+
+/*
+ * Process a CPL_ACT_OPEN_RPL message: -> host
+ * Handle active open failures.
+ */
+static int act_open_rpl_status_to_errno(int status)
+{
+	switch (status) {
+	case CPL_ERR_CONN_RESET:
+		return -ECONNREFUSED;
+	case CPL_ERR_ARP_MISS:
+		return -EHOSTUNREACH;
+	case CPL_ERR_CONN_TIMEDOUT:
+		return -ETIMEDOUT;
+	case CPL_ERR_TCAM_FULL:
+		return -ENOMEM;
+	case CPL_ERR_CONN_EXIST:
+		return -EADDRINUSE;
+	default:
+		return -EIO;
+	}
+}
+
+static void act_open_retry_timer(unsigned long data)
+{
+	struct sk_buff *skb;
+	struct cxgbi_sock *csk = (struct cxgbi_sock *)data;
+
+	log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK,
+		"csk 0x%p,%u,0x%lx,%u.\n",
+		csk, csk->state, csk->flags, csk->tid);
+
+	cxgbi_sock_get(csk);
+	spin_lock_bh(&csk->lock);
+	skb = alloc_wr(sizeof(struct cpl_act_open_req), 0, GFP_ATOMIC);
+	if (!skb)
+		cxgbi_sock_fail_act_open(csk, -ENOMEM);
+	else {
+		skb->sk = (struct sock *)csk;
+		set_arp_failure_handler(skb, act_open_arp_failure);
+		send_act_open_req(csk, skb, csk->l2t);
+	}
+	spin_unlock_bh(&csk->lock);
+	cxgbi_sock_put(csk);
+}
+
+static int do_act_open_rpl(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
+{
+	struct cxgbi_sock *csk = ctx;
+	struct cpl_act_open_rpl *rpl = cplhdr(skb);
+
+	pr_info("csk 0x%p,%u,0x%lx,%u, status %u, %pI4:%u-%pI4:%u.\n",
+		csk, csk->state, csk->flags, csk->atid, rpl->status,
+		&csk->saddr.sin_addr.s_addr, ntohs(csk->saddr.sin_port),
+		&csk->daddr.sin_addr.s_addr, ntohs(csk->daddr.sin_port));
+
+	if (rpl->status != CPL_ERR_TCAM_FULL &&
+	    rpl->status != CPL_ERR_CONN_EXIST &&
+	    rpl->status != CPL_ERR_ARP_MISS)
+		cxgb3_queue_tid_release(tdev, GET_TID(rpl));
+
+	cxgbi_sock_get(csk);
+	spin_lock_bh(&csk->lock);
+	if (rpl->status == CPL_ERR_CONN_EXIST &&
+	    csk->retry_timer.function != act_open_retry_timer) {
+		csk->retry_timer.function = act_open_retry_timer;
+		mod_timer(&csk->retry_timer, jiffies + HZ / 2);
+	} else
+		cxgbi_sock_fail_act_open(csk,
+				act_open_rpl_status_to_errno(rpl->status));
+
+	spin_unlock_bh(&csk->lock);
+	cxgbi_sock_put(csk);
+	__kfree_skb(skb);
+	return 0;
+}
+
+/*
+ * Process PEER_CLOSE CPL messages: -> host
+ * Handle peer FIN.
+ */
+static int do_peer_close(struct t3cdev *cdev, struct sk_buff *skb, void *ctx)
+{
+	struct cxgbi_sock *csk = ctx;
+
+	log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK,
+		"csk 0x%p,%u,0x%lx,%u.\n",
+		csk, csk->state, csk->flags, csk->tid);
+
+	cxgbi_sock_rcv_peer_close(csk);
+	__kfree_skb(skb);
+	return 0;
+}
+
+/*
+ * Process CLOSE_CONN_RPL CPL message: -> host
+ * Process a peer ACK to our FIN.
+ */
+static int do_close_con_rpl(struct t3cdev *cdev, struct sk_buff *skb,
+			    void *ctx)
+{
+	struct cxgbi_sock *csk = ctx;
+	struct cpl_close_con_rpl *rpl = cplhdr(skb);
+
+	log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK,
+		"csk 0x%p,%u,0x%lx,%u, snxt %u.\n",
+		csk, csk->state, csk->flags, csk->tid, ntohl(rpl->snd_nxt));
+
+	cxgbi_sock_rcv_close_conn_rpl(csk, ntohl(rpl->snd_nxt));
+	__kfree_skb(skb);
+	return 0;
+}
+
+/*
+ * Process ABORT_REQ_RSS CPL message: -> host
+ * Process abort requests.  If we are waiting for an ABORT_RPL we ignore this
+ * request except that we need to reply to it.
+ */
+
+static int abort_status_to_errno(struct cxgbi_sock *csk, int abort_reason,
+				 int *need_rst)
+{
+	switch (abort_reason) {
+	case CPL_ERR_BAD_SYN: /* fall through */
+	case CPL_ERR_CONN_RESET:
+		return csk->state > CTP_ESTABLISHED ? -EPIPE : -ECONNRESET;
+	case CPL_ERR_XMIT_TIMEDOUT:
+	case CPL_ERR_PERSIST_TIMEDOUT:
+	case CPL_ERR_FINWAIT2_TIMEDOUT:
+	case CPL_ERR_KEEPALIVE_TIMEDOUT:
+		return -ETIMEDOUT;
+	default:
+		return -EIO;
+	}
+}
+
+static int do_abort_req(struct t3cdev *cdev, struct sk_buff *skb, void *ctx)
+{
+	const struct cpl_abort_req_rss *req = cplhdr(skb);
+	struct cxgbi_sock *csk = ctx;
+	int rst_status = CPL_ABORT_NO_RST;
+
+	log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK,
+		"csk 0x%p,%u,0x%lx,%u.\n",
+		csk, csk->state, csk->flags, csk->tid);
+
+	if (req->status == CPL_ERR_RTX_NEG_ADVICE ||
+	    req->status == CPL_ERR_PERSIST_NEG_ADVICE) {
+		goto done;
+	}
+
+	cxgbi_sock_get(csk);
+	spin_lock_bh(&csk->lock);
+
+	if (!cxgbi_sock_flag(csk, CTPF_ABORT_REQ_RCVD)) {
+		cxgbi_sock_set_flag(csk, CTPF_ABORT_REQ_RCVD);
+		cxgbi_sock_set_state(csk, CTP_ABORTING);
+		goto out;
+	}
+
+	cxgbi_sock_clear_flag(csk, CTPF_ABORT_REQ_RCVD);
+	send_abort_rpl(csk, rst_status);
+
+	if (!cxgbi_sock_flag(csk, CTPF_ABORT_RPL_PENDING)) {
+		csk->err = abort_status_to_errno(csk, req->status, &rst_status);
+		cxgbi_sock_closed(csk);
+	}
+
+out:
+	spin_unlock_bh(&csk->lock);
+	cxgbi_sock_put(csk);
+done:
+	__kfree_skb(skb);
+	return 0;
+}
+
+/*
+ * Process ABORT_RPL_RSS CPL message: -> host
+ * Process abort replies.  We only process these messages if we anticipate
+ * them as the coordination between SW and HW in this area is somewhat lacking
+ * and sometimes we get ABORT_RPLs after we are done with the connection that
+ * originated the ABORT_REQ.
+ */
+static int do_abort_rpl(struct t3cdev *cdev, struct sk_buff *skb, void *ctx)
+{
+	struct cpl_abort_rpl_rss *rpl = cplhdr(skb);
+	struct cxgbi_sock *csk = ctx;
+
+	log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK,
+		"status 0x%x, csk 0x%p, s %u, 0x%lx.\n",
+		rpl->status, csk, csk ? csk->state : 0,
+		csk ? csk->flags : 0UL);
+	/*
+	 * Ignore replies to post-close aborts indicating that the abort was
+	 * requested too late.  These connections are terminated when we get
+	 * PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss
+	 * arrives the TID is either no longer used or it has been recycled.
+	 */
+	if (rpl->status == CPL_ERR_ABORT_FAILED)
+		goto rel_skb;
+	/*
+	 * Sometimes we've already closed the connection, e.g., a post-close
+	 * abort races with ABORT_REQ_RSS, the latter frees the connection
+	 * expecting the ABORT_REQ will fail with CPL_ERR_ABORT_FAILED,
+	 * but FW turns the ABORT_REQ into a regular one and so we get
+	 * ABORT_RPL_RSS with status 0 and no connection.
+	 */
+	if (csk)
+		cxgbi_sock_rcv_abort_rpl(csk);
+rel_skb:
+	__kfree_skb(skb);
+	return 0;
+}
+
+/*
+ * Process RX_ISCSI_HDR CPL message: -> host
+ * Handle received PDUs, the payload could be DDP'ed. If not, the payload
+ * follow after the bhs.
+ */
+static int do_iscsi_hdr(struct t3cdev *t3dev, struct sk_buff *skb, void *ctx)
+{
+	struct cxgbi_sock *csk = ctx;
+	struct cpl_iscsi_hdr *hdr_cpl = cplhdr(skb);
+	struct cpl_iscsi_hdr_norss data_cpl;
+	struct cpl_rx_data_ddp_norss ddp_cpl;
+	unsigned int hdr_len, data_len, status;
+	unsigned int len;
+	int err;
+
+	log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_PDU_RX,
+		"csk 0x%p,%u,0x%lx,%u, skb 0x%p,%u.\n",
+		csk, csk->state, csk->flags, csk->tid, skb, skb->len);
+
+	spin_lock_bh(&csk->lock);
+
+	if (unlikely(csk->state >= CTP_PASSIVE_CLOSE)) {
+		log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK,
+			"csk 0x%p,%u,0x%lx,%u, bad state.\n",
+			csk, csk->state, csk->flags, csk->tid);
+		if (csk->state != CTP_ABORTING)
+			goto abort_conn;
+		else
+			goto discard;
+	}
+
+	cxgbi_skcb_tcp_seq(skb) = ntohl(hdr_cpl->seq);
+	cxgbi_skcb_flags(skb) = 0;
+
+	skb_reset_transport_header(skb);
+	__skb_pull(skb, sizeof(struct cpl_iscsi_hdr));
+
+	len = hdr_len = ntohs(hdr_cpl->len);
+	/* msg coalesce is off or not enough data received */
+	if (skb->len <= hdr_len) {
+		pr_err("%s: tid %u, CPL_ISCSI_HDR, skb len %u < %u.\n",
+			csk->cdev->ports[csk->port_id]->name, csk->tid,
+			skb->len, hdr_len);
+		goto abort_conn;
+	}
+	cxgbi_skcb_set_flag(skb, SKCBF_RX_COALESCED);
+
+	err = skb_copy_bits(skb, skb->len - sizeof(ddp_cpl), &ddp_cpl,
+			    sizeof(ddp_cpl));
+	if (err < 0) {
+		pr_err("%s: tid %u, copy cpl_ddp %u-%zu failed %d.\n",
+			csk->cdev->ports[csk->port_id]->name, csk->tid,
+			skb->len, sizeof(ddp_cpl), err);
+		goto abort_conn;
+	}
+
+	cxgbi_skcb_set_flag(skb, SKCBF_RX_STATUS);
+	cxgbi_skcb_rx_pdulen(skb) = ntohs(ddp_cpl.len);
+	cxgbi_skcb_rx_ddigest(skb) = ntohl(ddp_cpl.ulp_crc);
+	status = ntohl(ddp_cpl.ddp_status);
+
+	log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_PDU_RX,
+		"csk 0x%p, skb 0x%p,%u, pdulen %u, status 0x%x.\n",
+		csk, skb, skb->len, cxgbi_skcb_rx_pdulen(skb), status);
+
+	if (status & (1 << CPL_RX_DDP_STATUS_HCRC_SHIFT))
+		cxgbi_skcb_set_flag(skb, SKCBF_RX_HCRC_ERR);
+	if (status & (1 << CPL_RX_DDP_STATUS_DCRC_SHIFT))
+		cxgbi_skcb_set_flag(skb, SKCBF_RX_DCRC_ERR);
+	if (status & (1 << CPL_RX_DDP_STATUS_PAD_SHIFT))
+		cxgbi_skcb_set_flag(skb, SKCBF_RX_PAD_ERR);
+
+	if (skb->len > (hdr_len + sizeof(ddp_cpl))) {
+		err = skb_copy_bits(skb, hdr_len, &data_cpl, sizeof(data_cpl));
+		if (err < 0) {
+			pr_err("%s: tid %u, cp %zu/%u failed %d.\n",
+				csk->cdev->ports[csk->port_id]->name,
+				csk->tid, sizeof(data_cpl), skb->len, err);
+			goto abort_conn;
+		}
+		data_len = ntohs(data_cpl.len);
+		log_debug(1 << CXGBI_DBG_DDP | 1 << CXGBI_DBG_PDU_RX,
+			"skb 0x%p, pdu not ddp'ed %u/%u, status 0x%x.\n",
+			skb, data_len, cxgbi_skcb_rx_pdulen(skb), status);
+		len += sizeof(data_cpl) + data_len;
+	} else if (status & (1 << CPL_RX_DDP_STATUS_DDP_SHIFT))
+		cxgbi_skcb_set_flag(skb, SKCBF_RX_DATA_DDPD);
+
+	csk->rcv_nxt = ntohl(ddp_cpl.seq) + cxgbi_skcb_rx_pdulen(skb);
+	__pskb_trim(skb, len);
+	__skb_queue_tail(&csk->receive_queue, skb);
+	cxgbi_conn_pdu_ready(csk);
+
+	spin_unlock_bh(&csk->lock);
+	return 0;
+
+abort_conn:
+	send_abort_req(csk);
+discard:
+	spin_unlock_bh(&csk->lock);
+	__kfree_skb(skb);
+	return 0;
+}
+
+/*
+ * Process TX_DATA_ACK CPL messages: -> host
+ * Process an acknowledgment of WR completion.  Advance snd_una and send the
+ * next batch of work requests from the write queue.
+ */
+static int do_wr_ack(struct t3cdev *cdev, struct sk_buff *skb, void *ctx)
+{
+	struct cxgbi_sock *csk = ctx;
+	struct cpl_wr_ack *hdr = cplhdr(skb);
+
+	log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_PDU_RX,
+		"csk 0x%p,%u,0x%lx,%u, cr %u.\n",
+		csk, csk->state, csk->flags, csk->tid, ntohs(hdr->credits));
+
+	cxgbi_sock_rcv_wr_ack(csk, ntohs(hdr->credits), ntohl(hdr->snd_una), 1);
+	__kfree_skb(skb);
+	return 0;
+}
+
+/*
+ * for each connection, pre-allocate skbs needed for close/abort requests. So
+ * that we can service the request right away.
+ */
+static int alloc_cpls(struct cxgbi_sock *csk)
+{
+	csk->cpl_close = alloc_wr(sizeof(struct cpl_close_con_req), 0,
+					GFP_KERNEL);
+	if (!csk->cpl_close)
+		return -ENOMEM;
+	csk->cpl_abort_req = alloc_wr(sizeof(struct cpl_abort_req), 0,
+					GFP_KERNEL);
+	if (!csk->cpl_abort_req)
+		goto free_cpl_skbs;
+
+	csk->cpl_abort_rpl = alloc_wr(sizeof(struct cpl_abort_rpl), 0,
+					GFP_KERNEL);
+	if (!csk->cpl_abort_rpl)
+		goto free_cpl_skbs;
+
+	return 0;
+
+free_cpl_skbs:
+	cxgbi_sock_free_cpl_skbs(csk);
+	return -ENOMEM;
+}
+
+/**
+ * release_offload_resources - release offload resource
+ * @c3cn: the offloaded iscsi tcp connection.
+ * Release resources held by an offload connection (TID, L2T entry, etc.)
+ */
+static void l2t_put(struct cxgbi_sock *csk)
+{
+	struct t3cdev *t3dev = (struct t3cdev *)csk->cdev->lldev;
+
+	if (csk->l2t) {
+		l2t_release(t3dev, csk->l2t);
+		csk->l2t = NULL;
+		cxgbi_sock_put(csk);
+	}
+}
+
+static void release_offload_resources(struct cxgbi_sock *csk)
+{
+	struct t3cdev *t3dev = (struct t3cdev *)csk->cdev->lldev;
+
+	log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK,
+		"csk 0x%p,%u,0x%lx,%u.\n",
+		csk, csk->state, csk->flags, csk->tid);
+
+	csk->rss_qid = 0;
+	cxgbi_sock_free_cpl_skbs(csk);
+
+	if (csk->wr_cred != csk->wr_max_cred) {
+		cxgbi_sock_purge_wr_queue(csk);
+		cxgbi_sock_reset_wr_list(csk);
+	}
+	l2t_put(csk);
+	if (cxgbi_sock_flag(csk, CTPF_HAS_ATID))
+		free_atid(csk);
+	else if (cxgbi_sock_flag(csk, CTPF_HAS_TID)) {
+		cxgb3_remove_tid(t3dev, (void *)csk, csk->tid);
+		cxgbi_sock_clear_flag(csk, CTPF_HAS_TID);
+		cxgbi_sock_put(csk);
+	}
+	csk->dst = NULL;
+	csk->cdev = NULL;
+}
+
+static void update_address(struct cxgbi_hba *chba)
+{
+	if (chba->ipv4addr) {
+		if (chba->vdev &&
+		    chba->ipv4addr != cxgb3i_get_private_ipv4addr(chba->vdev)) {
+			cxgb3i_set_private_ipv4addr(chba->vdev, chba->ipv4addr);
+			cxgb3i_set_private_ipv4addr(chba->ndev, 0);
+			pr_info("%s set %pI4.\n",
+				chba->vdev->name, &chba->ipv4addr);
+		} else if (chba->ipv4addr !=
+				cxgb3i_get_private_ipv4addr(chba->ndev)) {
+			cxgb3i_set_private_ipv4addr(chba->ndev, chba->ipv4addr);
+			pr_info("%s set %pI4.\n",
+				chba->ndev->name, &chba->ipv4addr);
+		}
+	} else if (cxgb3i_get_private_ipv4addr(chba->ndev)) {
+		if (chba->vdev)
+			cxgb3i_set_private_ipv4addr(chba->vdev, 0);
+		cxgb3i_set_private_ipv4addr(chba->ndev, 0);
+	}
+}
+
+static int init_act_open(struct cxgbi_sock *csk)
+{
+	struct dst_entry *dst = csk->dst;
+	struct cxgbi_device *cdev = csk->cdev;
+	struct t3cdev *t3dev = (struct t3cdev *)cdev->lldev;
+	struct net_device *ndev = cdev->ports[csk->port_id];
+	struct cxgbi_hba *chba = cdev->hbas[csk->port_id];
+	struct sk_buff *skb = NULL;
+
+	log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK,
+		"csk 0x%p,%u,0x%lx.\n", csk, csk->state, csk->flags);
+
+	update_address(chba);
+	if (chba->ipv4addr)
+		csk->saddr.sin_addr.s_addr = chba->ipv4addr;
+
+	csk->rss_qid = 0;
+	csk->l2t = t3_l2t_get(t3dev, dst, ndev,
+			      &csk->daddr.sin_addr.s_addr);
+	if (!csk->l2t) {
+		pr_err("NO l2t available.\n");
+		return -EINVAL;
+	}
+	cxgbi_sock_get(csk);
+
+	csk->atid = cxgb3_alloc_atid(t3dev, &t3_client, csk);
+	if (csk->atid < 0) {
+		pr_err("NO atid available.\n");
+		goto rel_resource;
+	}
+	cxgbi_sock_set_flag(csk, CTPF_HAS_ATID);
+	cxgbi_sock_get(csk);
+
+	skb = alloc_wr(sizeof(struct cpl_act_open_req), 0, GFP_KERNEL);
+	if (!skb)
+		goto rel_resource;
+	skb->sk = (struct sock *)csk;
+	set_arp_failure_handler(skb, act_open_arp_failure);
+
+	csk->wr_max_cred = csk->wr_cred = T3C_DATA(t3dev)->max_wrs - 1;
+	csk->wr_una_cred = 0;
+	csk->mss_idx = cxgbi_sock_select_mss(csk, dst_mtu(dst));
+	cxgbi_sock_reset_wr_list(csk);
+	csk->err = 0;
+
+	log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK,
+		"csk 0x%p,%u,0x%lx, %pI4:%u-%pI4:%u.\n",
+		csk, csk->state, csk->flags,
+		&csk->saddr.sin_addr.s_addr, ntohs(csk->saddr.sin_port),
+		&csk->daddr.sin_addr.s_addr, ntohs(csk->daddr.sin_port));
+
+	cxgbi_sock_set_state(csk, CTP_ACTIVE_OPEN);
+	send_act_open_req(csk, skb, csk->l2t);
+	return 0;
+
+rel_resource:
+	if (skb)
+		__kfree_skb(skb);
+	return -EINVAL;
+}
+
+cxgb3_cpl_handler_func cxgb3i_cpl_handlers[NUM_CPL_CMDS] = {
+	[CPL_ACT_ESTABLISH] = do_act_establish,
+	[CPL_ACT_OPEN_RPL] = do_act_open_rpl,
+	[CPL_PEER_CLOSE] = do_peer_close,
+	[CPL_ABORT_REQ_RSS] = do_abort_req,
+	[CPL_ABORT_RPL_RSS] = do_abort_rpl,
+	[CPL_CLOSE_CON_RPL] = do_close_con_rpl,
+	[CPL_TX_DMA_ACK] = do_wr_ack,
+	[CPL_ISCSI_HDR] = do_iscsi_hdr,
+};
+
+/**
+ * cxgb3i_ofld_init - allocate and initialize resources for each adapter found
+ * @cdev:	cxgbi adapter
+ */
+int cxgb3i_ofld_init(struct cxgbi_device *cdev)
+{
+	struct t3cdev *t3dev = (struct t3cdev *)cdev->lldev;
+	struct adap_ports port;
+	struct ofld_page_info rx_page_info;
+	unsigned int wr_len;
+	int rc;
+
+	if (t3dev->ctl(t3dev, GET_WR_LEN, &wr_len) < 0 ||
+	    t3dev->ctl(t3dev, GET_PORTS, &port) < 0 ||
+	    t3dev->ctl(t3dev, GET_RX_PAGE_INFO, &rx_page_info) < 0) {
+		pr_warn("t3 0x%p, offload up, ioctl failed.\n", t3dev);
+		return -EINVAL;
+	}
+
+	if (cxgb3i_max_connect > CXGBI_MAX_CONN)
+		cxgb3i_max_connect = CXGBI_MAX_CONN;
+
+	rc = cxgbi_device_portmap_create(cdev, cxgb3i_sport_base,
+					cxgb3i_max_connect);
+	if (rc < 0)
+		return rc;
+
+	init_wr_tab(wr_len);
+	cdev->csk_release_offload_resources = release_offload_resources;
+	cdev->csk_push_tx_frames = push_tx_frames;
+	cdev->csk_send_abort_req = send_abort_req;
+	cdev->csk_send_close_req = send_close_req;
+	cdev->csk_send_rx_credits = send_rx_credits;
+	cdev->csk_alloc_cpls = alloc_cpls;
+	cdev->csk_init_act_open = init_act_open;
+
+	pr_info("cdev 0x%p, offload up, added.\n", cdev);
+	return 0;
+}
+
+/*
+ * functions to program the pagepod in h/w
+ */
+static inline void ulp_mem_io_set_hdr(struct sk_buff *skb, unsigned int addr)
+{
+	struct ulp_mem_io *req = (struct ulp_mem_io *)skb->head;
+
+	memset(req, 0, sizeof(*req));
+
+	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
+	req->cmd_lock_addr = htonl(V_ULP_MEMIO_ADDR(addr >> 5) |
+				   V_ULPTX_CMD(ULP_MEM_WRITE));
+	req->len = htonl(V_ULP_MEMIO_DATA_LEN(PPOD_SIZE >> 5) |
+			 V_ULPTX_NFLITS((PPOD_SIZE >> 3) + 1));
+}
+
+static int ddp_set_map(struct cxgbi_sock *csk, struct cxgbi_pagepod_hdr *hdr,
+			unsigned int idx, unsigned int npods,
+				struct cxgbi_gather_list *gl)
+{
+	struct cxgbi_device *cdev = csk->cdev;
+	struct cxgbi_ddp_info *ddp = cdev->ddp;
+	unsigned int pm_addr = (idx << PPOD_SIZE_SHIFT) + ddp->llimit;
+	int i;
+
+	log_debug(1 << CXGBI_DBG_DDP,
+		"csk 0x%p, idx %u, npods %u, gl 0x%p.\n",
+		csk, idx, npods, gl);
+
+	for (i = 0; i < npods; i++, idx++, pm_addr += PPOD_SIZE) {
+		struct sk_buff *skb = alloc_wr(sizeof(struct ulp_mem_io) +
+						PPOD_SIZE, 0, GFP_ATOMIC);
+
+		if (!skb)
+			return -ENOMEM;
+
+		ulp_mem_io_set_hdr(skb, pm_addr);
+		cxgbi_ddp_ppod_set((struct cxgbi_pagepod *)(skb->head +
+					sizeof(struct ulp_mem_io)),
+				   hdr, gl, i * PPOD_PAGES_MAX);
+		skb->priority = CPL_PRIORITY_CONTROL;
+		cxgb3_ofld_send(cdev->lldev, skb);
+	}
+	return 0;
+}
+
+static void ddp_clear_map(struct cxgbi_hba *chba, unsigned int tag,
+			  unsigned int idx, unsigned int npods)
+{
+	struct cxgbi_device *cdev = chba->cdev;
+	struct cxgbi_ddp_info *ddp = cdev->ddp;
+	unsigned int pm_addr = (idx << PPOD_SIZE_SHIFT) + ddp->llimit;
+	int i;
+
+	log_debug(1 << CXGBI_DBG_DDP,
+		"cdev 0x%p, idx %u, npods %u, tag 0x%x.\n",
+		cdev, idx, npods, tag);
+
+	for (i = 0; i < npods; i++, idx++, pm_addr += PPOD_SIZE) {
+		struct sk_buff *skb = alloc_wr(sizeof(struct ulp_mem_io) +
+						PPOD_SIZE, 0, GFP_ATOMIC);
+
+		if (!skb) {
+			pr_err("tag 0x%x, 0x%x, %d/%u, skb OOM.\n",
+				tag, idx, i, npods);
+			continue;
+		}
+		ulp_mem_io_set_hdr(skb, pm_addr);
+		skb->priority = CPL_PRIORITY_CONTROL;
+		cxgb3_ofld_send(cdev->lldev, skb);
+	}
+}
+
+static int ddp_setup_conn_pgidx(struct cxgbi_sock *csk,
+				       unsigned int tid, int pg_idx, bool reply)
+{
+	struct sk_buff *skb = alloc_wr(sizeof(struct cpl_set_tcb_field), 0,
+					GFP_KERNEL);
+	struct cpl_set_tcb_field *req;
+	u64 val = pg_idx < DDP_PGIDX_MAX ? pg_idx : 0;
+
+	log_debug(1 << CXGBI_DBG_DDP,
+		"csk 0x%p, tid %u, pg_idx %d.\n", csk, tid, pg_idx);
+	if (!skb)
+		return -ENOMEM;
+
+	/* set up ulp submode and page size */
+	req = (struct cpl_set_tcb_field *)skb->head;
+	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tid));
+	req->reply = V_NO_REPLY(reply ? 0 : 1);
+	req->cpu_idx = 0;
+	req->word = htons(31);
+	req->mask = cpu_to_be64(0xF0000000);
+	req->val = cpu_to_be64(val << 28);
+	skb->priority = CPL_PRIORITY_CONTROL;
+
+	cxgb3_ofld_send(csk->cdev->lldev, skb);
+	return 0;
+}
+
+/**
+ * cxgb3i_setup_conn_digest - setup conn. digest setting
+ * @csk: cxgb tcp socket
+ * @tid: connection id
+ * @hcrc: header digest enabled
+ * @dcrc: data digest enabled
+ * @reply: request reply from h/w
+ * set up the iscsi digest settings for a connection identified by tid
+ */
+static int ddp_setup_conn_digest(struct cxgbi_sock *csk, unsigned int tid,
+			     int hcrc, int dcrc, int reply)
+{
+	struct sk_buff *skb = alloc_wr(sizeof(struct cpl_set_tcb_field), 0,
+					GFP_KERNEL);
+	struct cpl_set_tcb_field *req;
+	u64 val = (hcrc ? 1 : 0) | (dcrc ? 2 : 0);
+
+	log_debug(1 << CXGBI_DBG_DDP,
+		"csk 0x%p, tid %u, crc %d,%d.\n", csk, tid, hcrc, dcrc);
+	if (!skb)
+		return -ENOMEM;
+
+	/* set up ulp submode and page size */
+	req = (struct cpl_set_tcb_field *)skb->head;
+	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tid));
+	req->reply = V_NO_REPLY(reply ? 0 : 1);
+	req->cpu_idx = 0;
+	req->word = htons(31);
+	req->mask = cpu_to_be64(0x0F000000);
+	req->val = cpu_to_be64(val << 24);
+	skb->priority = CPL_PRIORITY_CONTROL;
+
+	cxgb3_ofld_send(csk->cdev->lldev, skb);
+	return 0;
+}
+
+/**
+ * t3_ddp_cleanup - release the cxgb3 adapter's ddp resource
+ * @cdev: cxgb3i adapter
+ * release all the resource held by the ddp pagepod manager for a given
+ * adapter if needed
+ */
+
+static void t3_ddp_cleanup(struct cxgbi_device *cdev)
+{
+	struct t3cdev *tdev = (struct t3cdev *)cdev->lldev;
+
+	if (cxgbi_ddp_cleanup(cdev)) {
+		pr_info("t3dev 0x%p, ulp_iscsi no more user.\n", tdev);
+		tdev->ulp_iscsi = NULL;
+	}
+}
+
+/**
+ * ddp_init - initialize the cxgb3 adapter's ddp resource
+ * @cdev: cxgb3i adapter
+ * initialize the ddp pagepod manager for a given adapter
+ */
+static int cxgb3i_ddp_init(struct cxgbi_device *cdev)
+{
+	struct t3cdev *tdev = (struct t3cdev *)cdev->lldev;
+	struct cxgbi_ddp_info *ddp = tdev->ulp_iscsi;
+	struct ulp_iscsi_info uinfo;
+	unsigned int pgsz_factor[4];
+	int i, err;
+
+	if (ddp) {
+		kref_get(&ddp->refcnt);
+		pr_warn("t3dev 0x%p, ddp 0x%p already set up.\n",
+			tdev, tdev->ulp_iscsi);
+		cdev->ddp = ddp;
+		return -EALREADY;
+	}
+
+	err = tdev->ctl(tdev, ULP_ISCSI_GET_PARAMS, &uinfo);
+	if (err < 0) {
+		pr_err("%s, failed to get iscsi param err=%d.\n",
+			 tdev->name, err);
+		return err;
+	}
+
+	err = cxgbi_ddp_init(cdev, uinfo.llimit, uinfo.ulimit,
+			uinfo.max_txsz, uinfo.max_rxsz);
+	if (err < 0)
+		return err;
+
+	ddp = cdev->ddp;
+
+	uinfo.tagmask = ddp->idx_mask << PPOD_IDX_SHIFT;
+	cxgbi_ddp_page_size_factor(pgsz_factor);
+	for (i = 0; i < 4; i++)
+		uinfo.pgsz_factor[i] = pgsz_factor[i];
+	uinfo.ulimit = uinfo.llimit + (ddp->nppods << PPOD_SIZE_SHIFT);
+
+	err = tdev->ctl(tdev, ULP_ISCSI_SET_PARAMS, &uinfo);
+	if (err < 0) {
+		pr_warn("%s unable to set iscsi param err=%d, ddp disabled.\n",
+			tdev->name, err);
+		cxgbi_ddp_cleanup(cdev);
+		return err;
+	}
+	tdev->ulp_iscsi = ddp;
+
+	cdev->csk_ddp_setup_digest = ddp_setup_conn_digest;
+	cdev->csk_ddp_setup_pgidx = ddp_setup_conn_pgidx;
+	cdev->csk_ddp_set = ddp_set_map;
+	cdev->csk_ddp_clear = ddp_clear_map;
+
+	pr_info("tdev 0x%p, nppods %u, bits %u, mask 0x%x,0x%x pkt %u/%u, "
+		"%u/%u.\n",
+		tdev, ddp->nppods, ddp->idx_bits, ddp->idx_mask,
+		ddp->rsvd_tag_mask, ddp->max_txsz, uinfo.max_txsz,
+		ddp->max_rxsz, uinfo.max_rxsz);
+	return 0;
+}
+
+static void cxgb3i_dev_close(struct t3cdev *t3dev)
+{
+	struct cxgbi_device *cdev = cxgbi_device_find_by_lldev(t3dev);
+
+	if (!cdev || cdev->flags & CXGBI_FLAG_ADAPTER_RESET) {
+		pr_info("0x%p close, f 0x%x.\n", cdev, cdev ? cdev->flags : 0);
+		return;
+	}
+
+	cxgbi_device_unregister(cdev);
+}
+
+/**
+ * cxgb3i_dev_open - init a t3 adapter structure and any h/w settings
+ * @t3dev: t3cdev adapter
+ */
+static void cxgb3i_dev_open(struct t3cdev *t3dev)
+{
+	struct cxgbi_device *cdev = cxgbi_device_find_by_lldev(t3dev);
+	struct adapter *adapter = tdev2adap(t3dev);
+	int i, err;
+
+	if (cdev) {
+		pr_info("0x%p, updating.\n", cdev);
+		return;
+	}
+
+	cdev = cxgbi_device_register(0, adapter->params.nports);
+	if (!cdev) {
+		pr_warn("device 0x%p register failed.\n", t3dev);
+		return;
+	}
+
+	cdev->flags = CXGBI_FLAG_DEV_T3 | CXGBI_FLAG_IPV4_SET;
+	cdev->lldev = t3dev;
+	cdev->pdev = adapter->pdev;
+	cdev->ports = adapter->port;
+	cdev->nports = adapter->params.nports;
+	cdev->mtus = adapter->params.mtus;
+	cdev->nmtus = NMTUS;
+	cdev->snd_win = cxgb3i_snd_win;
+	cdev->rcv_win = cxgb3i_rcv_win;
+	cdev->rx_credit_thres = cxgb3i_rx_credit_thres;
+	cdev->skb_tx_rsvd = CXGB3I_TX_HEADER_LEN;
+	cdev->skb_rx_extra = sizeof(struct cpl_iscsi_hdr_norss);
+	cdev->dev_ddp_cleanup = t3_ddp_cleanup;
+	cdev->itp = &cxgb3i_iscsi_transport;
+
+	err = cxgb3i_ddp_init(cdev);
+	if (err) {
+		pr_info("0x%p ddp init failed\n", cdev);
+		goto err_out;
+	}
+
+	err = cxgb3i_ofld_init(cdev);
+	if (err) {
+		pr_info("0x%p offload init failed\n", cdev);
+		goto err_out;
+	}
+
+	err = cxgbi_hbas_add(cdev, CXGB3I_MAX_LUN, CXGBI_MAX_CONN,
+				&cxgb3i_host_template, cxgb3i_stt);
+	if (err)
+		goto err_out;
+
+	for (i = 0; i < cdev->nports; i++)
+		cdev->hbas[i]->ipv4addr =
+			cxgb3i_get_private_ipv4addr(cdev->ports[i]);
+
+	pr_info("cdev 0x%p, f 0x%x, t3dev 0x%p open, err %d.\n",
+		cdev, cdev ? cdev->flags : 0, t3dev, err);
+	return;
+
+err_out:
+	cxgbi_device_unregister(cdev);
+}
+
+static void cxgb3i_dev_event_handler(struct t3cdev *t3dev, u32 event, u32 port)
+{
+	struct cxgbi_device *cdev = cxgbi_device_find_by_lldev(t3dev);
+
+	log_debug(1 << CXGBI_DBG_TOE,
+		"0x%p, cdev 0x%p, event 0x%x, port 0x%x.\n",
+		t3dev, cdev, event, port);
+	if (!cdev)
+		return;
+
+	switch (event) {
+	case OFFLOAD_STATUS_DOWN:
+		cdev->flags |= CXGBI_FLAG_ADAPTER_RESET;
+		break;
+	case OFFLOAD_STATUS_UP:
+		cdev->flags &= ~CXGBI_FLAG_ADAPTER_RESET;
+		break;
+	}
+}
+
+/**
+ * cxgb3i_init_module - module init entry point
+ *
+ * initialize any driver wide global data structures and register itself
+ *	with the cxgb3 module
+ */
+static int __init cxgb3i_init_module(void)
+{
+	int rc;
+
+	printk(KERN_INFO "%s", version);
+
+	rc = cxgbi_iscsi_init(&cxgb3i_iscsi_transport, &cxgb3i_stt);
+	if (rc < 0)
+		return rc;
+
+	cxgb3_register_client(&t3_client);
+	return 0;
+}
+
+/**
+ * cxgb3i_exit_module - module cleanup/exit entry point
+ *
+ * go through the driver hba list and for each hba, release any resource held.
+ *	and unregisters iscsi transport and the cxgb3 module
+ */
+static void __exit cxgb3i_exit_module(void)
+{
+	cxgb3_unregister_client(&t3_client);
+	cxgbi_device_unregister_all(CXGBI_FLAG_DEV_T3);
+	cxgbi_iscsi_cleanup(&cxgb3i_iscsi_transport, &cxgb3i_stt);
+}
+
+module_init(cxgb3i_init_module);
+module_exit(cxgb3i_exit_module);
diff --git a/drivers/scsi/cxgbi/cxgb3i/cxgb3i.h b/drivers/scsi/cxgbi/cxgb3i/cxgb3i.h
new file mode 100644
index 0000000..20593fd
--- /dev/null
+++ b/drivers/scsi/cxgbi/cxgb3i/cxgb3i.h
@@ -0,0 +1,62 @@
+/*
+ * cxgb3i.h: Chelsio S3xx iSCSI driver.
+ *
+ * Copyright (c) 2008 Chelsio Communications, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.
+ *
+ * Written by: Karen Xie (kxie@chelsio.com)
+ */
+
+#ifndef __CXGB3I_H__
+#define __CXGB3I_H__
+
+#define CXGB3I_SCSI_HOST_QDEPTH 1024
+#define CXGB3I_MAX_LUN		512
+#define ISCSI_PDU_NONPAYLOAD_MAX \
+	(sizeof(struct iscsi_hdr) + ISCSI_MAX_AHS_SIZE + 2*ISCSI_DIGEST_SIZE)
+
+/*for TX: a skb must have a headroom of at least TX_HEADER_LEN bytes */
+#define CXGB3I_TX_HEADER_LEN \
+	(sizeof(struct tx_data_wr) + sizeof(struct sge_opaque_hdr))
+
+extern cxgb3_cpl_handler_func cxgb3i_cpl_handlers[NUM_CPL_CMDS];
+
+static inline unsigned int cxgb3i_get_private_ipv4addr(struct net_device *ndev)
+{
+	return ((struct port_info *)(netdev_priv(ndev)))->iscsi_ipv4addr;
+}
+
+static inline void cxgb3i_set_private_ipv4addr(struct net_device *ndev,
+						unsigned int addr)
+{
+	struct port_info *pi =  (struct port_info *)netdev_priv(ndev);
+
+	pi->iscsic.flags = addr ? 1 : 0;
+	pi->iscsi_ipv4addr = addr;
+	if (addr)
+		memcpy(pi->iscsic.mac_addr, ndev->dev_addr, ETH_ALEN);
+}
+
+struct cpl_iscsi_hdr_norss {
+	union opcode_tid ot;
+	u16 pdu_len_ddp;
+	u16 len;
+	u32 seq;
+	u16 urg;
+	u8 rsvd;
+	u8 status;
+};
+
+struct cpl_rx_data_ddp_norss {
+	union opcode_tid ot;
+	u16 urg;
+	u16 len;
+	u32 seq;
+	u32 nxt_seq;
+	u32 ulp_crc;
+	u32 ddp_status;
+};
+#endif
diff --git a/drivers/scsi/cxgbi/cxgb4i/Kbuild b/drivers/scsi/cxgbi/cxgb4i/Kbuild
new file mode 100644
index 0000000..8290cda
--- /dev/null
+++ b/drivers/scsi/cxgbi/cxgb4i/Kbuild
@@ -0,0 +1,3 @@
+EXTRA_CFLAGS += -I$(srctree)/drivers/net/ethernet/chelsio/cxgb4
+
+obj-$(CONFIG_SCSI_CXGB4_ISCSI) += cxgb4i.o
diff --git a/drivers/scsi/cxgbi/cxgb4i/Kconfig b/drivers/scsi/cxgbi/cxgb4i/Kconfig
new file mode 100644
index 0000000..8c4e423
--- /dev/null
+++ b/drivers/scsi/cxgbi/cxgb4i/Kconfig
@@ -0,0 +1,10 @@
+config SCSI_CXGB4_ISCSI
+	tristate "Chelsio T4 iSCSI support"
+	depends on PCI && INET && (IPV6 || IPV6=n)
+	select NETDEVICES
+	select ETHERNET
+	select NET_VENDOR_CHELSIO
+	select CHELSIO_T4
+	select SCSI_ISCSI_ATTRS
+	---help---
+	  This driver supports iSCSI offload for the Chelsio T4 devices.
diff --git a/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c b/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c
new file mode 100644
index 0000000..1508125
--- /dev/null
+++ b/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c
@@ -0,0 +1,1805 @@
+/*
+ * cxgb4i.c: Chelsio T4 iSCSI driver.
+ *
+ * Copyright (c) 2010 Chelsio Communications, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.
+ *
+ * Written by:	Karen Xie (kxie@chelsio.com)
+ *		Rakesh Ranjan (rranjan@chelsio.com)
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <scsi/scsi_host.h>
+#include <net/tcp.h>
+#include <net/dst.h>
+#include <linux/netdevice.h>
+#include <net/addrconf.h>
+
+#include "t4_regs.h"
+#include "t4_msg.h"
+#include "cxgb4.h"
+#include "cxgb4_uld.h"
+#include "t4fw_api.h"
+#include "l2t.h"
+#include "cxgb4i.h"
+
+static unsigned int dbg_level;
+
+#include "../libcxgbi.h"
+
+#define	DRV_MODULE_NAME		"cxgb4i"
+#define DRV_MODULE_DESC		"Chelsio T4/T5 iSCSI Driver"
+#define	DRV_MODULE_VERSION	"0.9.4"
+
+static char version[] =
+	DRV_MODULE_DESC " " DRV_MODULE_NAME
+	" v" DRV_MODULE_VERSION "\n";
+
+MODULE_AUTHOR("Chelsio Communications, Inc.");
+MODULE_DESCRIPTION(DRV_MODULE_DESC);
+MODULE_VERSION(DRV_MODULE_VERSION);
+MODULE_LICENSE("GPL");
+
+module_param(dbg_level, uint, 0644);
+MODULE_PARM_DESC(dbg_level, "Debug flag (default=0)");
+
+static int cxgb4i_rcv_win = 256 * 1024;
+module_param(cxgb4i_rcv_win, int, 0644);
+MODULE_PARM_DESC(cxgb4i_rcv_win, "TCP reveive window in bytes");
+
+static int cxgb4i_snd_win = 128 * 1024;
+module_param(cxgb4i_snd_win, int, 0644);
+MODULE_PARM_DESC(cxgb4i_snd_win, "TCP send window in bytes");
+
+static int cxgb4i_rx_credit_thres = 10 * 1024;
+module_param(cxgb4i_rx_credit_thres, int, 0644);
+MODULE_PARM_DESC(cxgb4i_rx_credit_thres,
+		"RX credits return threshold in bytes (default=10KB)");
+
+static unsigned int cxgb4i_max_connect = (8 * 1024);
+module_param(cxgb4i_max_connect, uint, 0644);
+MODULE_PARM_DESC(cxgb4i_max_connect, "Maximum number of connections");
+
+static unsigned short cxgb4i_sport_base = 20000;
+module_param(cxgb4i_sport_base, ushort, 0644);
+MODULE_PARM_DESC(cxgb4i_sport_base, "Starting port number (default 20000)");
+
+typedef void (*cxgb4i_cplhandler_func)(struct cxgbi_device *, struct sk_buff *);
+
+static void *t4_uld_add(const struct cxgb4_lld_info *);
+static int t4_uld_rx_handler(void *, const __be64 *, const struct pkt_gl *);
+static int t4_uld_state_change(void *, enum cxgb4_state state);
+
+static const struct cxgb4_uld_info cxgb4i_uld_info = {
+	.name = DRV_MODULE_NAME,
+	.add = t4_uld_add,
+	.rx_handler = t4_uld_rx_handler,
+	.state_change = t4_uld_state_change,
+};
+
+static struct scsi_host_template cxgb4i_host_template = {
+	.module		= THIS_MODULE,
+	.name		= DRV_MODULE_NAME,
+	.proc_name	= DRV_MODULE_NAME,
+	.can_queue	= CXGB4I_SCSI_HOST_QDEPTH,
+	.queuecommand	= iscsi_queuecommand,
+	.change_queue_depth = iscsi_change_queue_depth,
+	.sg_tablesize	= SG_ALL,
+	.max_sectors	= 0xFFFF,
+	.cmd_per_lun	= ISCSI_DEF_CMD_PER_LUN,
+	.eh_abort_handler = iscsi_eh_abort,
+	.eh_device_reset_handler = iscsi_eh_device_reset,
+	.eh_target_reset_handler = iscsi_eh_recover_target,
+	.target_alloc	= iscsi_target_alloc,
+	.use_clustering	= DISABLE_CLUSTERING,
+	.this_id	= -1,
+};
+
+static struct iscsi_transport cxgb4i_iscsi_transport = {
+	.owner		= THIS_MODULE,
+	.name		= DRV_MODULE_NAME,
+	.caps		= CAP_RECOVERY_L0 | CAP_MULTI_R2T | CAP_HDRDGST |
+				CAP_DATADGST | CAP_DIGEST_OFFLOAD |
+				CAP_PADDING_OFFLOAD | CAP_TEXT_NEGO,
+	.attr_is_visible	= cxgbi_attr_is_visible,
+	.get_host_param	= cxgbi_get_host_param,
+	.set_host_param	= cxgbi_set_host_param,
+	/* session management */
+	.create_session	= cxgbi_create_session,
+	.destroy_session	= cxgbi_destroy_session,
+	.get_session_param = iscsi_session_get_param,
+	/* connection management */
+	.create_conn	= cxgbi_create_conn,
+	.bind_conn		= cxgbi_bind_conn,
+	.destroy_conn	= iscsi_tcp_conn_teardown,
+	.start_conn		= iscsi_conn_start,
+	.stop_conn		= iscsi_conn_stop,
+	.get_conn_param	= iscsi_conn_get_param,
+	.set_param	= cxgbi_set_conn_param,
+	.get_stats	= cxgbi_get_conn_stats,
+	/* pdu xmit req from user space */
+	.send_pdu	= iscsi_conn_send_pdu,
+	/* task */
+	.init_task	= iscsi_tcp_task_init,
+	.xmit_task	= iscsi_tcp_task_xmit,
+	.cleanup_task	= cxgbi_cleanup_task,
+	/* pdu */
+	.alloc_pdu	= cxgbi_conn_alloc_pdu,
+	.init_pdu	= cxgbi_conn_init_pdu,
+	.xmit_pdu	= cxgbi_conn_xmit_pdu,
+	.parse_pdu_itt	= cxgbi_parse_pdu_itt,
+	/* TCP connect/disconnect */
+	.get_ep_param	= cxgbi_get_ep_param,
+	.ep_connect	= cxgbi_ep_connect,
+	.ep_poll	= cxgbi_ep_poll,
+	.ep_disconnect	= cxgbi_ep_disconnect,
+	/* Error recovery timeout call */
+	.session_recovery_timedout = iscsi_session_recovery_timedout,
+};
+
+static struct scsi_transport_template *cxgb4i_stt;
+
+/*
+ * CPL (Chelsio Protocol Language) defines a message passing interface between
+ * the host driver and Chelsio asic.
+ * The section below implments CPLs that related to iscsi tcp connection
+ * open/close/abort and data send/receive.
+ */
+
+#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))
+#define RCV_BUFSIZ_MASK		0x3FFU
+#define MAX_IMM_TX_PKT_LEN	128
+
+static inline void set_queue(struct sk_buff *skb, unsigned int queue,
+				const struct cxgbi_sock *csk)
+{
+	skb->queue_mapping = queue;
+}
+
+static int push_tx_frames(struct cxgbi_sock *, int);
+
+/*
+ * is_ofld_imm - check whether a packet can be sent as immediate data
+ * @skb: the packet
+ *
+ * Returns true if a packet can be sent as an offload WR with immediate
+ * data.  We currently use the same limit as for Ethernet packets.
+ */
+static inline int is_ofld_imm(const struct sk_buff *skb)
+{
+	return skb->len <= (MAX_IMM_TX_PKT_LEN -
+			sizeof(struct fw_ofld_tx_data_wr));
+}
+
+static void send_act_open_req(struct cxgbi_sock *csk, struct sk_buff *skb,
+				struct l2t_entry *e)
+{
+	struct cxgb4_lld_info *lldi = cxgbi_cdev_priv(csk->cdev);
+	int t4 = is_t4(lldi->adapter_type);
+	int wscale = cxgbi_sock_compute_wscale(csk->mss_idx);
+	unsigned long long opt0;
+	unsigned int opt2;
+	unsigned int qid_atid = ((unsigned int)csk->atid) |
+				 (((unsigned int)csk->rss_qid) << 14);
+
+	opt0 = KEEP_ALIVE(1) |
+		WND_SCALE(wscale) |
+		MSS_IDX(csk->mss_idx) |
+		L2T_IDX(((struct l2t_entry *)csk->l2t)->idx) |
+		TX_CHAN(csk->tx_chan) |
+		SMAC_SEL(csk->smac_idx) |
+		ULP_MODE(ULP_MODE_ISCSI) |
+		RCV_BUFSIZ(cxgb4i_rcv_win >> 10);
+	opt2 = RX_CHANNEL(0) |
+		RSS_QUEUE_VALID |
+		(1 << 20) |
+		RSS_QUEUE(csk->rss_qid);
+
+	if (is_t4(lldi->adapter_type)) {
+		struct cpl_act_open_req *req =
+				(struct cpl_act_open_req *)skb->head;
+
+		INIT_TP_WR(req, 0);
+		OPCODE_TID(req) = cpu_to_be32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ,
+					qid_atid));
+		req->local_port = csk->saddr.sin_port;
+		req->peer_port = csk->daddr.sin_port;
+		req->local_ip = csk->saddr.sin_addr.s_addr;
+		req->peer_ip = csk->daddr.sin_addr.s_addr;
+		req->opt0 = cpu_to_be64(opt0);
+		req->params = cpu_to_be32(cxgb4_select_ntuple(
+					csk->cdev->ports[csk->port_id],
+					csk->l2t));
+		opt2 |= 1 << 22;
+		req->opt2 = cpu_to_be32(opt2);
+
+		log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK,
+			"csk t4 0x%p, %pI4:%u-%pI4:%u, atid %d, qid %u.\n",
+			csk, &req->local_ip, ntohs(req->local_port),
+			&req->peer_ip, ntohs(req->peer_port),
+			csk->atid, csk->rss_qid);
+	} else {
+		struct cpl_t5_act_open_req *req =
+				(struct cpl_t5_act_open_req *)skb->head;
+
+		INIT_TP_WR(req, 0);
+		OPCODE_TID(req) = cpu_to_be32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ,
+					qid_atid));
+		req->local_port = csk->saddr.sin_port;
+		req->peer_port = csk->daddr.sin_port;
+		req->local_ip = csk->saddr.sin_addr.s_addr;
+		req->peer_ip = csk->daddr.sin_addr.s_addr;
+		req->opt0 = cpu_to_be64(opt0);
+		req->params = cpu_to_be64(V_FILTER_TUPLE(
+				cxgb4_select_ntuple(
+					csk->cdev->ports[csk->port_id],
+					csk->l2t)));
+		opt2 |= 1 << 31;
+		req->opt2 = cpu_to_be32(opt2);
+
+		log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK,
+			"csk t5 0x%p, %pI4:%u-%pI4:%u, atid %d, qid %u.\n",
+			csk, &req->local_ip, ntohs(req->local_port),
+			&req->peer_ip, ntohs(req->peer_port),
+			csk->atid, csk->rss_qid);
+	}
+
+	set_wr_txq(skb, CPL_PRIORITY_SETUP, csk->port_id);
+
+	pr_info_ipaddr("t%d csk 0x%p,%u,0x%lx,%u, rss_qid %u.\n",
+		       (&csk->saddr), (&csk->daddr), t4 ? 4 : 5, csk,
+		       csk->state, csk->flags, csk->atid, csk->rss_qid);
+
+	cxgb4_l2t_send(csk->cdev->ports[csk->port_id], skb, csk->l2t);
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static void send_act_open_req6(struct cxgbi_sock *csk, struct sk_buff *skb,
+			       struct l2t_entry *e)
+{
+	struct cxgb4_lld_info *lldi = cxgbi_cdev_priv(csk->cdev);
+	int t4 = is_t4(lldi->adapter_type);
+	int wscale = cxgbi_sock_compute_wscale(csk->mss_idx);
+	unsigned long long opt0;
+	unsigned int opt2;
+	unsigned int qid_atid = ((unsigned int)csk->atid) |
+				 (((unsigned int)csk->rss_qid) << 14);
+
+	opt0 = KEEP_ALIVE(1) |
+		WND_SCALE(wscale) |
+		MSS_IDX(csk->mss_idx) |
+		L2T_IDX(((struct l2t_entry *)csk->l2t)->idx) |
+		TX_CHAN(csk->tx_chan) |
+		SMAC_SEL(csk->smac_idx) |
+		ULP_MODE(ULP_MODE_ISCSI) |
+		RCV_BUFSIZ(cxgb4i_rcv_win >> 10);
+
+	opt2 = RX_CHANNEL(0) |
+		RSS_QUEUE_VALID |
+		RX_FC_DISABLE |
+		RSS_QUEUE(csk->rss_qid);
+
+	if (t4) {
+		struct cpl_act_open_req6 *req =
+			    (struct cpl_act_open_req6 *)skb->head;
+
+		INIT_TP_WR(req, 0);
+		OPCODE_TID(req) = cpu_to_be32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ6,
+							    qid_atid));
+		req->local_port = csk->saddr6.sin6_port;
+		req->peer_port = csk->daddr6.sin6_port;
+
+		req->local_ip_hi = *(__be64 *)(csk->saddr6.sin6_addr.s6_addr);
+		req->local_ip_lo = *(__be64 *)(csk->saddr6.sin6_addr.s6_addr +
+								    8);
+		req->peer_ip_hi = *(__be64 *)(csk->daddr6.sin6_addr.s6_addr);
+		req->peer_ip_lo = *(__be64 *)(csk->daddr6.sin6_addr.s6_addr +
+								    8);
+
+		req->opt0 = cpu_to_be64(opt0);
+
+		opt2 |= RX_FC_VALID;
+		req->opt2 = cpu_to_be32(opt2);
+
+		req->params = cpu_to_be32(cxgb4_select_ntuple(
+					  csk->cdev->ports[csk->port_id],
+					  csk->l2t));
+	} else {
+		struct cpl_t5_act_open_req6 *req =
+				(struct cpl_t5_act_open_req6 *)skb->head;
+
+		INIT_TP_WR(req, 0);
+		OPCODE_TID(req) = cpu_to_be32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ6,
+							    qid_atid));
+		req->local_port = csk->saddr6.sin6_port;
+		req->peer_port = csk->daddr6.sin6_port;
+		req->local_ip_hi = *(__be64 *)(csk->saddr6.sin6_addr.s6_addr);
+		req->local_ip_lo = *(__be64 *)(csk->saddr6.sin6_addr.s6_addr +
+									8);
+		req->peer_ip_hi = *(__be64 *)(csk->daddr6.sin6_addr.s6_addr);
+		req->peer_ip_lo = *(__be64 *)(csk->daddr6.sin6_addr.s6_addr +
+									8);
+		req->opt0 = cpu_to_be64(opt0);
+
+		opt2 |= T5_OPT_2_VALID;
+		req->opt2 = cpu_to_be32(opt2);
+
+		req->params = cpu_to_be64(V_FILTER_TUPLE(cxgb4_select_ntuple(
+					  csk->cdev->ports[csk->port_id],
+					  csk->l2t)));
+	}
+
+	set_wr_txq(skb, CPL_PRIORITY_SETUP, csk->port_id);
+
+	pr_info("t%d csk 0x%p,%u,0x%lx,%u, [%pI6]:%u-[%pI6]:%u, rss_qid %u.\n",
+		t4 ? 4 : 5, csk, csk->state, csk->flags, csk->atid,
+		&csk->saddr6.sin6_addr, ntohs(csk->saddr.sin_port),
+		&csk->daddr6.sin6_addr, ntohs(csk->daddr.sin_port),
+		csk->rss_qid);
+
+	cxgb4_l2t_send(csk->cdev->ports[csk->port_id], skb, csk->l2t);
+}
+#endif
+
+static void send_close_req(struct cxgbi_sock *csk)
+{
+	struct sk_buff *skb = csk->cpl_close;
+	struct cpl_close_con_req *req = (struct cpl_close_con_req *)skb->head;
+	unsigned int tid = csk->tid;
+
+	log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK,
+		"csk 0x%p,%u,0x%lx, tid %u.\n",
+		csk, csk->state, csk->flags, csk->tid);
+	csk->cpl_close = NULL;
+	set_wr_txq(skb, CPL_PRIORITY_DATA, csk->port_id);
+	INIT_TP_WR(req, tid);
+	OPCODE_TID(req) = cpu_to_be32(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
+	req->rsvd = 0;
+
+	cxgbi_sock_skb_entail(csk, skb);
+	if (csk->state >= CTP_ESTABLISHED)
+		push_tx_frames(csk, 1);
+}
+
+static void abort_arp_failure(void *handle, struct sk_buff *skb)
+{
+	struct cxgbi_sock *csk = (struct cxgbi_sock *)handle;
+	struct cpl_abort_req *req;
+
+	log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK,
+		"csk 0x%p,%u,0x%lx, tid %u, abort.\n",
+		csk, csk->state, csk->flags, csk->tid);
+	req = (struct cpl_abort_req *)skb->data;
+	req->cmd = CPL_ABORT_NO_RST;
+	cxgb4_ofld_send(csk->cdev->ports[csk->port_id], skb);
+}
+
+static void send_abort_req(struct cxgbi_sock *csk)
+{
+	struct cpl_abort_req *req;
+	struct sk_buff *skb = csk->cpl_abort_req;
+
+	if (unlikely(csk->state == CTP_ABORTING) || !skb || !csk->cdev)
+		return;
+	cxgbi_sock_set_state(csk, CTP_ABORTING);
+	cxgbi_sock_set_flag(csk, CTPF_ABORT_RPL_PENDING);
+	cxgbi_sock_purge_write_queue(csk);
+
+	csk->cpl_abort_req = NULL;
+	req = (struct cpl_abort_req *)skb->head;
+	set_queue(skb, CPL_PRIORITY_DATA, csk);
+	req->cmd = CPL_ABORT_SEND_RST;
+	t4_set_arp_err_handler(skb, csk, abort_arp_failure);
+	INIT_TP_WR(req, csk->tid);
+	OPCODE_TID(req) = cpu_to_be32(MK_OPCODE_TID(CPL_ABORT_REQ, csk->tid));
+	req->rsvd0 = htonl(csk->snd_nxt);
+	req->rsvd1 = !cxgbi_sock_flag(csk, CTPF_TX_DATA_SENT);
+
+	log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK,
+		"csk 0x%p,%u,0x%lx,%u, snd_nxt %u, 0x%x.\n",
+		csk, csk->state, csk->flags, csk->tid, csk->snd_nxt,
+		req->rsvd1);
+
+	cxgb4_l2t_send(csk->cdev->ports[csk->port_id], skb, csk->l2t);
+}
+
+static void send_abort_rpl(struct cxgbi_sock *csk, int rst_status)
+{
+	struct sk_buff *skb = csk->cpl_abort_rpl;
+	struct cpl_abort_rpl *rpl = (struct cpl_abort_rpl *)skb->head;
+
+	log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK,
+		"csk 0x%p,%u,0x%lx,%u, status %d.\n",
+		csk, csk->state, csk->flags, csk->tid, rst_status);
+
+	csk->cpl_abort_rpl = NULL;
+	set_queue(skb, CPL_PRIORITY_DATA, csk);
+	INIT_TP_WR(rpl, csk->tid);
+	OPCODE_TID(rpl) = cpu_to_be32(MK_OPCODE_TID(CPL_ABORT_RPL, csk->tid));
+	rpl->cmd = rst_status;
+	cxgb4_ofld_send(csk->cdev->ports[csk->port_id], skb);
+}
+
+/*
+ * CPL connection rx data ack: host ->
+ * Send RX credits through an RX_DATA_ACK CPL message. Returns the number of
+ * credits sent.
+ */
+static u32 send_rx_credits(struct cxgbi_sock *csk, u32 credits)
+{
+	struct sk_buff *skb;
+	struct cpl_rx_data_ack *req;
+
+	log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_PDU_RX,
+		"csk 0x%p,%u,0x%lx,%u, credit %u.\n",
+		csk, csk->state, csk->flags, csk->tid, credits);
+
+	skb = alloc_wr(sizeof(*req), 0, GFP_ATOMIC);
+	if (!skb) {
+		pr_info("csk 0x%p, credit %u, OOM.\n", csk, credits);
+		return 0;
+	}
+	req = (struct cpl_rx_data_ack *)skb->head;
+
+	set_wr_txq(skb, CPL_PRIORITY_ACK, csk->port_id);
+	INIT_TP_WR(req, csk->tid);
+	OPCODE_TID(req) = cpu_to_be32(MK_OPCODE_TID(CPL_RX_DATA_ACK,
+				      csk->tid));
+	req->credit_dack = cpu_to_be32(RX_CREDITS(credits) | RX_FORCE_ACK(1));
+	cxgb4_ofld_send(csk->cdev->ports[csk->port_id], skb);
+	return credits;
+}
+
+/*
+ * sgl_len - calculates the size of an SGL of the given capacity
+ * @n: the number of SGL entries
+ * Calculates the number of flits needed for a scatter/gather list that
+ * can hold the given number of entries.
+ */
+static inline unsigned int sgl_len(unsigned int n)
+{
+	n--;
+	return (3 * n) / 2 + (n & 1) + 2;
+}
+
+/*
+ * calc_tx_flits_ofld - calculate # of flits for an offload packet
+ * @skb: the packet
+ *
+ * Returns the number of flits needed for the given offload packet.
+ * These packets are already fully constructed and no additional headers
+ * will be added.
+ */
+static inline unsigned int calc_tx_flits_ofld(const struct sk_buff *skb)
+{
+	unsigned int flits, cnt;
+
+	if (is_ofld_imm(skb))
+		return DIV_ROUND_UP(skb->len, 8);
+	flits = skb_transport_offset(skb) / 8;
+	cnt = skb_shinfo(skb)->nr_frags;
+	if (skb_tail_pointer(skb) != skb_transport_header(skb))
+		cnt++;
+	return flits + sgl_len(cnt);
+}
+
+static inline void send_tx_flowc_wr(struct cxgbi_sock *csk)
+{
+	struct sk_buff *skb;
+	struct fw_flowc_wr *flowc;
+	int flowclen, i;
+
+	flowclen = 80;
+	skb = alloc_wr(flowclen, 0, GFP_ATOMIC);
+	flowc = (struct fw_flowc_wr *)skb->head;
+	flowc->op_to_nparams =
+		htonl(FW_WR_OP(FW_FLOWC_WR) | FW_FLOWC_WR_NPARAMS(8));
+	flowc->flowid_len16 =
+		htonl(FW_WR_LEN16(DIV_ROUND_UP(72, 16)) |
+				FW_WR_FLOWID(csk->tid));
+	flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN;
+	flowc->mnemval[0].val = htonl(csk->cdev->pfvf);
+	flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH;
+	flowc->mnemval[1].val = htonl(csk->tx_chan);
+	flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT;
+	flowc->mnemval[2].val = htonl(csk->tx_chan);
+	flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID;
+	flowc->mnemval[3].val = htonl(csk->rss_qid);
+	flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_SNDNXT;
+	flowc->mnemval[4].val = htonl(csk->snd_nxt);
+	flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_RCVNXT;
+	flowc->mnemval[5].val = htonl(csk->rcv_nxt);
+	flowc->mnemval[6].mnemonic = FW_FLOWC_MNEM_SNDBUF;
+	flowc->mnemval[6].val = htonl(cxgb4i_snd_win);
+	flowc->mnemval[7].mnemonic = FW_FLOWC_MNEM_MSS;
+	flowc->mnemval[7].val = htonl(csk->advmss);
+	flowc->mnemval[8].mnemonic = 0;
+	flowc->mnemval[8].val = 0;
+	for (i = 0; i < 9; i++) {
+		flowc->mnemval[i].r4[0] = 0;
+		flowc->mnemval[i].r4[1] = 0;
+		flowc->mnemval[i].r4[2] = 0;
+	}
+	set_queue(skb, CPL_PRIORITY_DATA, csk);
+
+	log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK,
+		"csk 0x%p, tid 0x%x, %u,%u,%u,%u,%u,%u,%u.\n",
+		csk, csk->tid, 0, csk->tx_chan, csk->rss_qid,
+		csk->snd_nxt, csk->rcv_nxt, cxgb4i_snd_win,
+		csk->advmss);
+
+	cxgb4_ofld_send(csk->cdev->ports[csk->port_id], skb);
+}
+
+static inline void make_tx_data_wr(struct cxgbi_sock *csk, struct sk_buff *skb,
+				   int dlen, int len, u32 credits, int compl)
+{
+	struct fw_ofld_tx_data_wr *req;
+	unsigned int submode = cxgbi_skcb_ulp_mode(skb) & 3;
+	unsigned int wr_ulp_mode = 0;
+
+	req = (struct fw_ofld_tx_data_wr *)__skb_push(skb, sizeof(*req));
+
+	if (is_ofld_imm(skb)) {
+		req->op_to_immdlen = htonl(FW_WR_OP(FW_OFLD_TX_DATA_WR) |
+					FW_WR_COMPL(1) |
+					FW_WR_IMMDLEN(dlen));
+		req->flowid_len16 = htonl(FW_WR_FLOWID(csk->tid) |
+						FW_WR_LEN16(credits));
+	} else {
+		req->op_to_immdlen =
+			cpu_to_be32(FW_WR_OP(FW_OFLD_TX_DATA_WR) |
+					FW_WR_COMPL(1) |
+					FW_WR_IMMDLEN(0));
+		req->flowid_len16 =
+			cpu_to_be32(FW_WR_FLOWID(csk->tid) |
+					FW_WR_LEN16(credits));
+	}
+	if (submode)
+		wr_ulp_mode = FW_OFLD_TX_DATA_WR_ULPMODE(ULP2_MODE_ISCSI) |
+				FW_OFLD_TX_DATA_WR_ULPSUBMODE(submode);
+	req->tunnel_to_proxy = htonl(wr_ulp_mode |
+		 FW_OFLD_TX_DATA_WR_SHOVE(skb_peek(&csk->write_queue) ? 0 : 1));
+	req->plen = htonl(len);
+	if (!cxgbi_sock_flag(csk, CTPF_TX_DATA_SENT))
+		cxgbi_sock_set_flag(csk, CTPF_TX_DATA_SENT);
+}
+
+static void arp_failure_skb_discard(void *handle, struct sk_buff *skb)
+{
+	kfree_skb(skb);
+}
+
+static int push_tx_frames(struct cxgbi_sock *csk, int req_completion)
+{
+	int total_size = 0;
+	struct sk_buff *skb;
+
+	if (unlikely(csk->state < CTP_ESTABLISHED ||
+		csk->state == CTP_CLOSE_WAIT_1 || csk->state >= CTP_ABORTING)) {
+		log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK |
+			 1 << CXGBI_DBG_PDU_TX,
+			"csk 0x%p,%u,0x%lx,%u, in closing state.\n",
+			csk, csk->state, csk->flags, csk->tid);
+		return 0;
+	}
+
+	while (csk->wr_cred && (skb = skb_peek(&csk->write_queue)) != NULL) {
+		int dlen = skb->len;
+		int len = skb->len;
+		unsigned int credits_needed;
+
+		skb_reset_transport_header(skb);
+		if (is_ofld_imm(skb))
+			credits_needed = DIV_ROUND_UP(dlen +
+					sizeof(struct fw_ofld_tx_data_wr), 16);
+		else
+			credits_needed = DIV_ROUND_UP(8*calc_tx_flits_ofld(skb)
+					+ sizeof(struct fw_ofld_tx_data_wr),
+					16);
+
+		if (csk->wr_cred < credits_needed) {
+			log_debug(1 << CXGBI_DBG_PDU_TX,
+				"csk 0x%p, skb %u/%u, wr %d < %u.\n",
+				csk, skb->len, skb->data_len,
+				credits_needed, csk->wr_cred);
+			break;
+		}
+		__skb_unlink(skb, &csk->write_queue);
+		set_queue(skb, CPL_PRIORITY_DATA, csk);
+		skb->csum = credits_needed;
+		csk->wr_cred -= credits_needed;
+		csk->wr_una_cred += credits_needed;
+		cxgbi_sock_enqueue_wr(csk, skb);
+
+		log_debug(1 << CXGBI_DBG_PDU_TX,
+			"csk 0x%p, skb %u/%u, wr %d, left %u, unack %u.\n",
+			csk, skb->len, skb->data_len, credits_needed,
+			csk->wr_cred, csk->wr_una_cred);
+
+		if (likely(cxgbi_skcb_test_flag(skb, SKCBF_TX_NEED_HDR))) {
+			if (!cxgbi_sock_flag(csk, CTPF_TX_DATA_SENT)) {
+				send_tx_flowc_wr(csk);
+				skb->csum += 5;
+				csk->wr_cred -= 5;
+				csk->wr_una_cred += 5;
+			}
+			len += cxgbi_ulp_extra_len(cxgbi_skcb_ulp_mode(skb));
+			make_tx_data_wr(csk, skb, dlen, len, credits_needed,
+					req_completion);
+			csk->snd_nxt += len;
+			cxgbi_skcb_clear_flag(skb, SKCBF_TX_NEED_HDR);
+		}
+		total_size += skb->truesize;
+		t4_set_arp_err_handler(skb, csk, arp_failure_skb_discard);
+
+		log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_PDU_TX,
+			"csk 0x%p,%u,0x%lx,%u, skb 0x%p, %u.\n",
+			csk, csk->state, csk->flags, csk->tid, skb, len);
+
+		cxgb4_l2t_send(csk->cdev->ports[csk->port_id], skb, csk->l2t);
+	}
+	return total_size;
+}
+
+static inline void free_atid(struct cxgbi_sock *csk)
+{
+	struct cxgb4_lld_info *lldi = cxgbi_cdev_priv(csk->cdev);
+
+	if (cxgbi_sock_flag(csk, CTPF_HAS_ATID)) {
+		cxgb4_free_atid(lldi->tids, csk->atid);
+		cxgbi_sock_clear_flag(csk, CTPF_HAS_ATID);
+		cxgbi_sock_put(csk);
+	}
+}
+
+static void do_act_establish(struct cxgbi_device *cdev, struct sk_buff *skb)
+{
+	struct cxgbi_sock *csk;
+	struct cpl_act_establish *req = (struct cpl_act_establish *)skb->data;
+	unsigned short tcp_opt = ntohs(req->tcp_opt);
+	unsigned int tid = GET_TID(req);
+	unsigned int atid = GET_TID_TID(ntohl(req->tos_atid));
+	struct cxgb4_lld_info *lldi = cxgbi_cdev_priv(cdev);
+	struct tid_info *t = lldi->tids;
+	u32 rcv_isn = be32_to_cpu(req->rcv_isn);
+
+	csk = lookup_atid(t, atid);
+	if (unlikely(!csk)) {
+		pr_err("NO conn. for atid %u, cdev 0x%p.\n", atid, cdev);
+		goto rel_skb;
+	}
+
+	if (csk->atid != atid) {
+		pr_err("bad conn atid %u, csk 0x%p,%u,0x%lx,tid %u, atid %u.\n",
+			atid, csk, csk->state, csk->flags, csk->tid, csk->atid);
+		goto rel_skb;
+	}
+
+	pr_info_ipaddr("atid 0x%x, tid 0x%x, csk 0x%p,%u,0x%lx, isn %u.\n",
+		       (&csk->saddr), (&csk->daddr),
+		       atid, tid, csk, csk->state, csk->flags, rcv_isn);
+
+	module_put(THIS_MODULE);
+
+	cxgbi_sock_get(csk);
+	csk->tid = tid;
+	cxgb4_insert_tid(lldi->tids, csk, tid);
+	cxgbi_sock_set_flag(csk, CTPF_HAS_TID);
+
+	free_atid(csk);
+
+	spin_lock_bh(&csk->lock);
+	if (unlikely(csk->state != CTP_ACTIVE_OPEN))
+		pr_info("csk 0x%p,%u,0x%lx,%u, got EST.\n",
+			csk, csk->state, csk->flags, csk->tid);
+
+	if (csk->retry_timer.function) {
+		del_timer(&csk->retry_timer);
+		csk->retry_timer.function = NULL;
+	}
+
+	csk->copied_seq = csk->rcv_wup = csk->rcv_nxt = rcv_isn;
+	/*
+	 * Causes the first RX_DATA_ACK to supply any Rx credits we couldn't
+	 * pass through opt0.
+	 */
+	if (cxgb4i_rcv_win > (RCV_BUFSIZ_MASK << 10))
+		csk->rcv_wup -= cxgb4i_rcv_win - (RCV_BUFSIZ_MASK << 10);
+
+	csk->advmss = lldi->mtus[GET_TCPOPT_MSS(tcp_opt)] - 40;
+	if (GET_TCPOPT_TSTAMP(tcp_opt))
+		csk->advmss -= 12;
+	if (csk->advmss < 128)
+		csk->advmss = 128;
+
+	log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK,
+		"csk 0x%p, mss_idx %u, advmss %u.\n",
+			csk, GET_TCPOPT_MSS(tcp_opt), csk->advmss);
+
+	cxgbi_sock_established(csk, ntohl(req->snd_isn), ntohs(req->tcp_opt));
+
+	if (unlikely(cxgbi_sock_flag(csk, CTPF_ACTIVE_CLOSE_NEEDED)))
+		send_abort_req(csk);
+	else {
+		if (skb_queue_len(&csk->write_queue))
+			push_tx_frames(csk, 0);
+		cxgbi_conn_tx_open(csk);
+	}
+	spin_unlock_bh(&csk->lock);
+
+rel_skb:
+	__kfree_skb(skb);
+}
+
+static int act_open_rpl_status_to_errno(int status)
+{
+	switch (status) {
+	case CPL_ERR_CONN_RESET:
+		return -ECONNREFUSED;
+	case CPL_ERR_ARP_MISS:
+		return -EHOSTUNREACH;
+	case CPL_ERR_CONN_TIMEDOUT:
+		return -ETIMEDOUT;
+	case CPL_ERR_TCAM_FULL:
+		return -ENOMEM;
+	case CPL_ERR_CONN_EXIST:
+		return -EADDRINUSE;
+	default:
+		return -EIO;
+	}
+}
+
+static void csk_act_open_retry_timer(unsigned long data)
+{
+	struct sk_buff *skb = NULL;
+	struct cxgbi_sock *csk = (struct cxgbi_sock *)data;
+	struct cxgb4_lld_info *lldi = cxgbi_cdev_priv(csk->cdev);
+	void (*send_act_open_func)(struct cxgbi_sock *, struct sk_buff *,
+				   struct l2t_entry *);
+	int t4 = is_t4(lldi->adapter_type), size, size6;
+
+	log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK,
+		"csk 0x%p,%u,0x%lx,%u.\n",
+		csk, csk->state, csk->flags, csk->tid);
+
+	cxgbi_sock_get(csk);
+	spin_lock_bh(&csk->lock);
+
+	if (t4) {
+		size = sizeof(struct cpl_act_open_req);
+		size6 = sizeof(struct cpl_act_open_req6);
+	} else {
+		size = sizeof(struct cpl_t5_act_open_req);
+		size6 = sizeof(struct cpl_t5_act_open_req6);
+	}
+
+	if (csk->csk_family == AF_INET) {
+		send_act_open_func = send_act_open_req;
+		skb = alloc_wr(size, 0, GFP_ATOMIC);
+#if IS_ENABLED(CONFIG_IPV6)
+	} else {
+		send_act_open_func = send_act_open_req6;
+		skb = alloc_wr(size6, 0, GFP_ATOMIC);
+#endif
+	}
+
+	if (!skb)
+		cxgbi_sock_fail_act_open(csk, -ENOMEM);
+	else {
+		skb->sk = (struct sock *)csk;
+		t4_set_arp_err_handler(skb, csk,
+				       cxgbi_sock_act_open_req_arp_failure);
+		send_act_open_func(csk, skb, csk->l2t);
+	}
+
+	spin_unlock_bh(&csk->lock);
+	cxgbi_sock_put(csk);
+
+}
+
+static void do_act_open_rpl(struct cxgbi_device *cdev, struct sk_buff *skb)
+{
+	struct cxgbi_sock *csk;
+	struct cpl_act_open_rpl *rpl = (struct cpl_act_open_rpl *)skb->data;
+	unsigned int tid = GET_TID(rpl);
+	unsigned int atid =
+		GET_TID_TID(GET_AOPEN_ATID(be32_to_cpu(rpl->atid_status)));
+	unsigned int status = GET_AOPEN_STATUS(be32_to_cpu(rpl->atid_status));
+	struct cxgb4_lld_info *lldi = cxgbi_cdev_priv(cdev);
+	struct tid_info *t = lldi->tids;
+
+	csk = lookup_atid(t, atid);
+	if (unlikely(!csk)) {
+		pr_err("NO matching conn. atid %u, tid %u.\n", atid, tid);
+		goto rel_skb;
+	}
+
+	pr_info_ipaddr("tid %u/%u, status %u.\n"
+		       "csk 0x%p,%u,0x%lx. ", (&csk->saddr), (&csk->daddr),
+		       atid, tid, status, csk, csk->state, csk->flags);
+
+	if (status == CPL_ERR_RTX_NEG_ADVICE)
+		goto rel_skb;
+
+	module_put(THIS_MODULE);
+
+	if (status && status != CPL_ERR_TCAM_FULL &&
+	    status != CPL_ERR_CONN_EXIST &&
+	    status != CPL_ERR_ARP_MISS)
+		cxgb4_remove_tid(lldi->tids, csk->port_id, GET_TID(rpl));
+
+	cxgbi_sock_get(csk);
+	spin_lock_bh(&csk->lock);
+
+	if (status == CPL_ERR_CONN_EXIST &&
+	    csk->retry_timer.function != csk_act_open_retry_timer) {
+		csk->retry_timer.function = csk_act_open_retry_timer;
+		mod_timer(&csk->retry_timer, jiffies + HZ / 2);
+	} else
+		cxgbi_sock_fail_act_open(csk,
+					act_open_rpl_status_to_errno(status));
+
+	spin_unlock_bh(&csk->lock);
+	cxgbi_sock_put(csk);
+rel_skb:
+	__kfree_skb(skb);
+}
+
+static void do_peer_close(struct cxgbi_device *cdev, struct sk_buff *skb)
+{
+	struct cxgbi_sock *csk;
+	struct cpl_peer_close *req = (struct cpl_peer_close *)skb->data;
+	unsigned int tid = GET_TID(req);
+	struct cxgb4_lld_info *lldi = cxgbi_cdev_priv(cdev);
+	struct tid_info *t = lldi->tids;
+
+	csk = lookup_tid(t, tid);
+	if (unlikely(!csk)) {
+		pr_err("can't find connection for tid %u.\n", tid);
+		goto rel_skb;
+	}
+	pr_info_ipaddr("csk 0x%p,%u,0x%lx,%u.\n",
+		       (&csk->saddr), (&csk->daddr),
+		       csk, csk->state, csk->flags, csk->tid);
+	cxgbi_sock_rcv_peer_close(csk);
+rel_skb:
+	__kfree_skb(skb);
+}
+
+static void do_close_con_rpl(struct cxgbi_device *cdev, struct sk_buff *skb)
+{
+	struct cxgbi_sock *csk;
+	struct cpl_close_con_rpl *rpl = (struct cpl_close_con_rpl *)skb->data;
+	unsigned int tid = GET_TID(rpl);
+	struct cxgb4_lld_info *lldi = cxgbi_cdev_priv(cdev);
+	struct tid_info *t = lldi->tids;
+
+	csk = lookup_tid(t, tid);
+	if (unlikely(!csk)) {
+		pr_err("can't find connection for tid %u.\n", tid);
+		goto rel_skb;
+	}
+	pr_info_ipaddr("csk 0x%p,%u,0x%lx,%u.\n",
+		       (&csk->saddr), (&csk->daddr),
+		       csk, csk->state, csk->flags, csk->tid);
+	cxgbi_sock_rcv_close_conn_rpl(csk, ntohl(rpl->snd_nxt));
+rel_skb:
+	__kfree_skb(skb);
+}
+
+static int abort_status_to_errno(struct cxgbi_sock *csk, int abort_reason,
+								int *need_rst)
+{
+	switch (abort_reason) {
+	case CPL_ERR_BAD_SYN: /* fall through */
+	case CPL_ERR_CONN_RESET:
+		return csk->state > CTP_ESTABLISHED ?
+			-EPIPE : -ECONNRESET;
+	case CPL_ERR_XMIT_TIMEDOUT:
+	case CPL_ERR_PERSIST_TIMEDOUT:
+	case CPL_ERR_FINWAIT2_TIMEDOUT:
+	case CPL_ERR_KEEPALIVE_TIMEDOUT:
+		return -ETIMEDOUT;
+	default:
+		return -EIO;
+	}
+}
+
+static void do_abort_req_rss(struct cxgbi_device *cdev, struct sk_buff *skb)
+{
+	struct cxgbi_sock *csk;
+	struct cpl_abort_req_rss *req = (struct cpl_abort_req_rss *)skb->data;
+	unsigned int tid = GET_TID(req);
+	struct cxgb4_lld_info *lldi = cxgbi_cdev_priv(cdev);
+	struct tid_info *t = lldi->tids;
+	int rst_status = CPL_ABORT_NO_RST;
+
+	csk = lookup_tid(t, tid);
+	if (unlikely(!csk)) {
+		pr_err("can't find connection for tid %u.\n", tid);
+		goto rel_skb;
+	}
+
+	pr_info_ipaddr("csk 0x%p,%u,0x%lx,%u, status %u.\n",
+		       (&csk->saddr), (&csk->daddr),
+		       csk, csk->state, csk->flags, csk->tid, req->status);
+
+	if (req->status == CPL_ERR_RTX_NEG_ADVICE ||
+	    req->status == CPL_ERR_PERSIST_NEG_ADVICE)
+		goto rel_skb;
+
+	cxgbi_sock_get(csk);
+	spin_lock_bh(&csk->lock);
+
+	cxgbi_sock_clear_flag(csk, CTPF_ABORT_REQ_RCVD);
+
+	if (!cxgbi_sock_flag(csk, CTPF_TX_DATA_SENT)) {
+		send_tx_flowc_wr(csk);
+		cxgbi_sock_set_flag(csk, CTPF_TX_DATA_SENT);
+	}
+
+	cxgbi_sock_set_flag(csk, CTPF_ABORT_REQ_RCVD);
+	cxgbi_sock_set_state(csk, CTP_ABORTING);
+
+	send_abort_rpl(csk, rst_status);
+
+	if (!cxgbi_sock_flag(csk, CTPF_ABORT_RPL_PENDING)) {
+		csk->err = abort_status_to_errno(csk, req->status, &rst_status);
+		cxgbi_sock_closed(csk);
+	}
+
+	spin_unlock_bh(&csk->lock);
+	cxgbi_sock_put(csk);
+rel_skb:
+	__kfree_skb(skb);
+}
+
+static void do_abort_rpl_rss(struct cxgbi_device *cdev, struct sk_buff *skb)
+{
+	struct cxgbi_sock *csk;
+	struct cpl_abort_rpl_rss *rpl = (struct cpl_abort_rpl_rss *)skb->data;
+	unsigned int tid = GET_TID(rpl);
+	struct cxgb4_lld_info *lldi = cxgbi_cdev_priv(cdev);
+	struct tid_info *t = lldi->tids;
+
+	csk = lookup_tid(t, tid);
+	if (!csk)
+		goto rel_skb;
+
+	if (csk)
+		pr_info_ipaddr("csk 0x%p,%u,0x%lx,%u, status %u.\n",
+			       (&csk->saddr), (&csk->daddr), csk,
+			       csk->state, csk->flags, csk->tid, rpl->status);
+
+	if (rpl->status == CPL_ERR_ABORT_FAILED)
+		goto rel_skb;
+
+	cxgbi_sock_rcv_abort_rpl(csk);
+rel_skb:
+	__kfree_skb(skb);
+}
+
+static void do_rx_iscsi_hdr(struct cxgbi_device *cdev, struct sk_buff *skb)
+{
+	struct cxgbi_sock *csk;
+	struct cpl_iscsi_hdr *cpl = (struct cpl_iscsi_hdr *)skb->data;
+	unsigned short pdu_len_ddp = be16_to_cpu(cpl->pdu_len_ddp);
+	unsigned int tid = GET_TID(cpl);
+	struct cxgb4_lld_info *lldi = cxgbi_cdev_priv(cdev);
+	struct tid_info *t = lldi->tids;
+
+	csk = lookup_tid(t, tid);
+	if (unlikely(!csk)) {
+		pr_err("can't find conn. for tid %u.\n", tid);
+		goto rel_skb;
+	}
+
+	log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_PDU_RX,
+		"csk 0x%p,%u,0x%lx, tid %u, skb 0x%p,%u, 0x%x.\n",
+		csk, csk->state, csk->flags, csk->tid, skb, skb->len,
+		pdu_len_ddp);
+
+	spin_lock_bh(&csk->lock);
+
+	if (unlikely(csk->state >= CTP_PASSIVE_CLOSE)) {
+		log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK,
+			"csk 0x%p,%u,0x%lx,%u, bad state.\n",
+			csk, csk->state, csk->flags, csk->tid);
+		if (csk->state != CTP_ABORTING)
+			goto abort_conn;
+		else
+			goto discard;
+	}
+
+	cxgbi_skcb_tcp_seq(skb) = ntohl(cpl->seq);
+	cxgbi_skcb_flags(skb) = 0;
+
+	skb_reset_transport_header(skb);
+	__skb_pull(skb, sizeof(*cpl));
+	__pskb_trim(skb, ntohs(cpl->len));
+
+	if (!csk->skb_ulp_lhdr) {
+		unsigned char *bhs;
+		unsigned int hlen, dlen, plen;
+
+		log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_PDU_RX,
+			"csk 0x%p,%u,0x%lx, tid %u, skb 0x%p header.\n",
+			csk, csk->state, csk->flags, csk->tid, skb);
+		csk->skb_ulp_lhdr = skb;
+		cxgbi_skcb_set_flag(skb, SKCBF_RX_HDR);
+
+		if (cxgbi_skcb_tcp_seq(skb) != csk->rcv_nxt) {
+			pr_info("tid %u, CPL_ISCSI_HDR, bad seq, 0x%x/0x%x.\n",
+				csk->tid, cxgbi_skcb_tcp_seq(skb),
+				csk->rcv_nxt);
+			goto abort_conn;
+		}
+
+		bhs = skb->data;
+		hlen = ntohs(cpl->len);
+		dlen = ntohl(*(unsigned int *)(bhs + 4)) & 0xFFFFFF;
+
+		plen = ISCSI_PDU_LEN(pdu_len_ddp);
+		if (is_t4(lldi->adapter_type))
+			plen -= 40;
+
+		if ((hlen + dlen) != plen) {
+			pr_info("tid 0x%x, CPL_ISCSI_HDR, pdu len "
+				"mismatch %u != %u + %u, seq 0x%x.\n",
+				csk->tid, plen, hlen, dlen,
+				cxgbi_skcb_tcp_seq(skb));
+			goto abort_conn;
+		}
+
+		cxgbi_skcb_rx_pdulen(skb) = (hlen + dlen + 3) & (~0x3);
+		if (dlen)
+			cxgbi_skcb_rx_pdulen(skb) += csk->dcrc_len;
+		csk->rcv_nxt += cxgbi_skcb_rx_pdulen(skb);
+
+		log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_PDU_RX,
+			"csk 0x%p, skb 0x%p, 0x%x,%u+%u,0x%x,0x%x.\n",
+			csk, skb, *bhs, hlen, dlen,
+			ntohl(*((unsigned int *)(bhs + 16))),
+			ntohl(*((unsigned int *)(bhs + 24))));
+
+	} else {
+		struct sk_buff *lskb = csk->skb_ulp_lhdr;
+
+		cxgbi_skcb_set_flag(lskb, SKCBF_RX_DATA);
+		log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_PDU_RX,
+			"csk 0x%p,%u,0x%lx, skb 0x%p data, 0x%p.\n",
+			csk, csk->state, csk->flags, skb, lskb);
+	}
+
+	__skb_queue_tail(&csk->receive_queue, skb);
+	spin_unlock_bh(&csk->lock);
+	return;
+
+abort_conn:
+	send_abort_req(csk);
+discard:
+	spin_unlock_bh(&csk->lock);
+rel_skb:
+	__kfree_skb(skb);
+}
+
+static void do_rx_data_ddp(struct cxgbi_device *cdev,
+				  struct sk_buff *skb)
+{
+	struct cxgbi_sock *csk;
+	struct sk_buff *lskb;
+	struct cpl_rx_data_ddp *rpl = (struct cpl_rx_data_ddp *)skb->data;
+	unsigned int tid = GET_TID(rpl);
+	struct cxgb4_lld_info *lldi = cxgbi_cdev_priv(cdev);
+	struct tid_info *t = lldi->tids;
+	unsigned int status = ntohl(rpl->ddpvld);
+
+	csk = lookup_tid(t, tid);
+	if (unlikely(!csk)) {
+		pr_err("can't find connection for tid %u.\n", tid);
+		goto rel_skb;
+	}
+
+	log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_PDU_RX,
+		"csk 0x%p,%u,0x%lx, skb 0x%p,0x%x, lhdr 0x%p.\n",
+		csk, csk->state, csk->flags, skb, status, csk->skb_ulp_lhdr);
+
+	spin_lock_bh(&csk->lock);
+
+	if (unlikely(csk->state >= CTP_PASSIVE_CLOSE)) {
+		log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK,
+			"csk 0x%p,%u,0x%lx,%u, bad state.\n",
+			csk, csk->state, csk->flags, csk->tid);
+		if (csk->state != CTP_ABORTING)
+			goto abort_conn;
+		else
+			goto discard;
+	}
+
+	if (!csk->skb_ulp_lhdr) {
+		pr_err("tid 0x%x, rcv RX_DATA_DDP w/o pdu bhs.\n", csk->tid);
+		goto abort_conn;
+	}
+
+	lskb = csk->skb_ulp_lhdr;
+	csk->skb_ulp_lhdr = NULL;
+
+	cxgbi_skcb_rx_ddigest(lskb) = ntohl(rpl->ulp_crc);
+
+	if (ntohs(rpl->len) != cxgbi_skcb_rx_pdulen(lskb))
+		pr_info("tid 0x%x, RX_DATA_DDP pdulen %u != %u.\n",
+			csk->tid, ntohs(rpl->len), cxgbi_skcb_rx_pdulen(lskb));
+
+	if (status & (1 << CPL_RX_DDP_STATUS_HCRC_SHIFT)) {
+		pr_info("csk 0x%p, lhdr 0x%p, status 0x%x, hcrc bad 0x%lx.\n",
+			csk, lskb, status, cxgbi_skcb_flags(lskb));
+		cxgbi_skcb_set_flag(lskb, SKCBF_RX_HCRC_ERR);
+	}
+	if (status & (1 << CPL_RX_DDP_STATUS_DCRC_SHIFT)) {
+		pr_info("csk 0x%p, lhdr 0x%p, status 0x%x, dcrc bad 0x%lx.\n",
+			csk, lskb, status, cxgbi_skcb_flags(lskb));
+		cxgbi_skcb_set_flag(lskb, SKCBF_RX_DCRC_ERR);
+	}
+	if (status & (1 << CPL_RX_DDP_STATUS_PAD_SHIFT)) {
+		log_debug(1 << CXGBI_DBG_PDU_RX,
+			"csk 0x%p, lhdr 0x%p, status 0x%x, pad bad.\n",
+			csk, lskb, status);
+		cxgbi_skcb_set_flag(lskb, SKCBF_RX_PAD_ERR);
+	}
+	if ((status & (1 << CPL_RX_DDP_STATUS_DDP_SHIFT)) &&
+		!cxgbi_skcb_test_flag(lskb, SKCBF_RX_DATA)) {
+		log_debug(1 << CXGBI_DBG_PDU_RX,
+			"csk 0x%p, lhdr 0x%p, 0x%x, data ddp'ed.\n",
+			csk, lskb, status);
+		cxgbi_skcb_set_flag(lskb, SKCBF_RX_DATA_DDPD);
+	}
+	log_debug(1 << CXGBI_DBG_PDU_RX,
+		"csk 0x%p, lskb 0x%p, f 0x%lx.\n",
+		csk, lskb, cxgbi_skcb_flags(lskb));
+
+	cxgbi_skcb_set_flag(lskb, SKCBF_RX_STATUS);
+	cxgbi_conn_pdu_ready(csk);
+	spin_unlock_bh(&csk->lock);
+	goto rel_skb;
+
+abort_conn:
+	send_abort_req(csk);
+discard:
+	spin_unlock_bh(&csk->lock);
+rel_skb:
+	__kfree_skb(skb);
+}
+
+static void do_fw4_ack(struct cxgbi_device *cdev, struct sk_buff *skb)
+{
+	struct cxgbi_sock *csk;
+	struct cpl_fw4_ack *rpl = (struct cpl_fw4_ack *)skb->data;
+	unsigned int tid = GET_TID(rpl);
+	struct cxgb4_lld_info *lldi = cxgbi_cdev_priv(cdev);
+	struct tid_info *t = lldi->tids;
+
+	csk = lookup_tid(t, tid);
+	if (unlikely(!csk))
+		pr_err("can't find connection for tid %u.\n", tid);
+	else {
+		log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK,
+			"csk 0x%p,%u,0x%lx,%u.\n",
+			csk, csk->state, csk->flags, csk->tid);
+		cxgbi_sock_rcv_wr_ack(csk, rpl->credits, ntohl(rpl->snd_una),
+					rpl->seq_vld);
+	}
+	__kfree_skb(skb);
+}
+
+static void do_set_tcb_rpl(struct cxgbi_device *cdev, struct sk_buff *skb)
+{
+	struct cpl_set_tcb_rpl *rpl = (struct cpl_set_tcb_rpl *)skb->data;
+	unsigned int tid = GET_TID(rpl);
+	struct cxgb4_lld_info *lldi = cxgbi_cdev_priv(cdev);
+	struct tid_info *t = lldi->tids;
+	struct cxgbi_sock *csk;
+
+	csk = lookup_tid(t, tid);
+	if (!csk)
+		pr_err("can't find conn. for tid %u.\n", tid);
+
+	log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK,
+		"csk 0x%p,%u,%lx,%u, status 0x%x.\n",
+		csk, csk->state, csk->flags, csk->tid, rpl->status);
+
+	if (rpl->status != CPL_ERR_NONE)
+		pr_err("csk 0x%p,%u, SET_TCB_RPL status %u.\n",
+			csk, tid, rpl->status);
+
+	__kfree_skb(skb);
+}
+
+static int alloc_cpls(struct cxgbi_sock *csk)
+{
+	csk->cpl_close = alloc_wr(sizeof(struct cpl_close_con_req),
+					0, GFP_KERNEL);
+	if (!csk->cpl_close)
+		return -ENOMEM;
+
+	csk->cpl_abort_req = alloc_wr(sizeof(struct cpl_abort_req),
+					0, GFP_KERNEL);
+	if (!csk->cpl_abort_req)
+		goto free_cpls;
+
+	csk->cpl_abort_rpl = alloc_wr(sizeof(struct cpl_abort_rpl),
+					0, GFP_KERNEL);
+	if (!csk->cpl_abort_rpl)
+		goto free_cpls;
+	return 0;
+
+free_cpls:
+	cxgbi_sock_free_cpl_skbs(csk);
+	return -ENOMEM;
+}
+
+static inline void l2t_put(struct cxgbi_sock *csk)
+{
+	if (csk->l2t) {
+		cxgb4_l2t_release(csk->l2t);
+		csk->l2t = NULL;
+		cxgbi_sock_put(csk);
+	}
+}
+
+static void release_offload_resources(struct cxgbi_sock *csk)
+{
+	struct cxgb4_lld_info *lldi;
+
+	log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK,
+		"csk 0x%p,%u,0x%lx,%u.\n",
+		csk, csk->state, csk->flags, csk->tid);
+
+	cxgbi_sock_free_cpl_skbs(csk);
+	if (csk->wr_cred != csk->wr_max_cred) {
+		cxgbi_sock_purge_wr_queue(csk);
+		cxgbi_sock_reset_wr_list(csk);
+	}
+
+	l2t_put(csk);
+	if (cxgbi_sock_flag(csk, CTPF_HAS_ATID))
+		free_atid(csk);
+	else if (cxgbi_sock_flag(csk, CTPF_HAS_TID)) {
+		lldi = cxgbi_cdev_priv(csk->cdev);
+		cxgb4_remove_tid(lldi->tids, 0, csk->tid);
+		cxgbi_sock_clear_flag(csk, CTPF_HAS_TID);
+		cxgbi_sock_put(csk);
+	}
+	csk->dst = NULL;
+	csk->cdev = NULL;
+}
+
+static int init_act_open(struct cxgbi_sock *csk)
+{
+	struct cxgbi_device *cdev = csk->cdev;
+	struct cxgb4_lld_info *lldi = cxgbi_cdev_priv(cdev);
+	struct net_device *ndev = cdev->ports[csk->port_id];
+	struct sk_buff *skb = NULL;
+	struct neighbour *n = NULL;
+	void *daddr;
+	unsigned int step;
+	unsigned int size, size6;
+	int t4 = is_t4(lldi->adapter_type);
+
+	log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK,
+		"csk 0x%p,%u,0x%lx,%u.\n",
+		csk, csk->state, csk->flags, csk->tid);
+
+	if (csk->csk_family == AF_INET)
+		daddr = &csk->daddr.sin_addr.s_addr;
+#if IS_ENABLED(CONFIG_IPV6)
+	else if (csk->csk_family == AF_INET6)
+		daddr = &csk->daddr6.sin6_addr;
+#endif
+	else {
+		pr_err("address family 0x%x not supported\n", csk->csk_family);
+		goto rel_resource;
+	}
+
+	n = dst_neigh_lookup(csk->dst, daddr);
+
+	if (!n) {
+		pr_err("%s, can't get neighbour of csk->dst.\n", ndev->name);
+		goto rel_resource;
+	}
+
+	csk->atid = cxgb4_alloc_atid(lldi->tids, csk);
+	if (csk->atid < 0) {
+		pr_err("%s, NO atid available.\n", ndev->name);
+		return -EINVAL;
+	}
+	cxgbi_sock_set_flag(csk, CTPF_HAS_ATID);
+	cxgbi_sock_get(csk);
+
+	csk->l2t = cxgb4_l2t_get(lldi->l2t, n, ndev, 0);
+	if (!csk->l2t) {
+		pr_err("%s, cannot alloc l2t.\n", ndev->name);
+		goto rel_resource;
+	}
+	cxgbi_sock_get(csk);
+
+	if (t4) {
+		size = sizeof(struct cpl_act_open_req);
+		size6 = sizeof(struct cpl_act_open_req6);
+	} else {
+		size = sizeof(struct cpl_t5_act_open_req);
+		size6 = sizeof(struct cpl_t5_act_open_req6);
+	}
+
+	if (csk->csk_family == AF_INET)
+		skb = alloc_wr(size, 0, GFP_NOIO);
+#if IS_ENABLED(CONFIG_IPV6)
+	else
+		skb = alloc_wr(size6, 0, GFP_NOIO);
+#endif
+
+	if (!skb)
+		goto rel_resource;
+	skb->sk = (struct sock *)csk;
+	t4_set_arp_err_handler(skb, csk, cxgbi_sock_act_open_req_arp_failure);
+
+	if (!csk->mtu)
+		csk->mtu = dst_mtu(csk->dst);
+	cxgb4_best_mtu(lldi->mtus, csk->mtu, &csk->mss_idx);
+	csk->tx_chan = cxgb4_port_chan(ndev);
+	/* SMT two entries per row */
+	csk->smac_idx = ((cxgb4_port_viid(ndev) & 0x7F)) << 1;
+	step = lldi->ntxq / lldi->nchan;
+	csk->txq_idx = cxgb4_port_idx(ndev) * step;
+	step = lldi->nrxq / lldi->nchan;
+	csk->rss_qid = lldi->rxq_ids[cxgb4_port_idx(ndev) * step];
+	csk->wr_cred = lldi->wr_cred -
+		       DIV_ROUND_UP(sizeof(struct cpl_abort_req), 16);
+	csk->wr_max_cred = csk->wr_cred;
+	csk->wr_una_cred = 0;
+	cxgbi_sock_reset_wr_list(csk);
+	csk->err = 0;
+
+	pr_info_ipaddr("csk 0x%p,%u,0x%lx,%u,%u,%u, mtu %u,%u, smac %u.\n",
+		       (&csk->saddr), (&csk->daddr), csk, csk->state,
+		       csk->flags, csk->tx_chan, csk->txq_idx, csk->rss_qid,
+		       csk->mtu, csk->mss_idx, csk->smac_idx);
+
+	/* must wait for either a act_open_rpl or act_open_establish */
+	try_module_get(THIS_MODULE);
+	cxgbi_sock_set_state(csk, CTP_ACTIVE_OPEN);
+	if (csk->csk_family == AF_INET)
+		send_act_open_req(csk, skb, csk->l2t);
+#if IS_ENABLED(CONFIG_IPV6)
+	else
+		send_act_open_req6(csk, skb, csk->l2t);
+#endif
+	neigh_release(n);
+
+	return 0;
+
+rel_resource:
+	if (n)
+		neigh_release(n);
+	if (skb)
+		__kfree_skb(skb);
+	return -EINVAL;
+}
+
+cxgb4i_cplhandler_func cxgb4i_cplhandlers[NUM_CPL_CMDS] = {
+	[CPL_ACT_ESTABLISH] = do_act_establish,
+	[CPL_ACT_OPEN_RPL] = do_act_open_rpl,
+	[CPL_PEER_CLOSE] = do_peer_close,
+	[CPL_ABORT_REQ_RSS] = do_abort_req_rss,
+	[CPL_ABORT_RPL_RSS] = do_abort_rpl_rss,
+	[CPL_CLOSE_CON_RPL] = do_close_con_rpl,
+	[CPL_FW4_ACK] = do_fw4_ack,
+	[CPL_ISCSI_HDR] = do_rx_iscsi_hdr,
+	[CPL_ISCSI_DATA] = do_rx_iscsi_hdr,
+	[CPL_SET_TCB_RPL] = do_set_tcb_rpl,
+	[CPL_RX_DATA_DDP] = do_rx_data_ddp,
+	[CPL_RX_ISCSI_DDP] = do_rx_data_ddp,
+};
+
+int cxgb4i_ofld_init(struct cxgbi_device *cdev)
+{
+	int rc;
+
+	if (cxgb4i_max_connect > CXGB4I_MAX_CONN)
+		cxgb4i_max_connect = CXGB4I_MAX_CONN;
+
+	rc = cxgbi_device_portmap_create(cdev, cxgb4i_sport_base,
+					cxgb4i_max_connect);
+	if (rc < 0)
+		return rc;
+
+	cdev->csk_release_offload_resources = release_offload_resources;
+	cdev->csk_push_tx_frames = push_tx_frames;
+	cdev->csk_send_abort_req = send_abort_req;
+	cdev->csk_send_close_req = send_close_req;
+	cdev->csk_send_rx_credits = send_rx_credits;
+	cdev->csk_alloc_cpls = alloc_cpls;
+	cdev->csk_init_act_open = init_act_open;
+
+	pr_info("cdev 0x%p, offload up, added.\n", cdev);
+	return 0;
+}
+
+/*
+ * functions to program the pagepod in h/w
+ */
+#define ULPMEM_IDATA_MAX_NPPODS	4 /* 256/PPOD_SIZE */
+static inline void ulp_mem_io_set_hdr(struct cxgb4_lld_info *lldi,
+				struct ulp_mem_io *req,
+				unsigned int wr_len, unsigned int dlen,
+				unsigned int pm_addr)
+{
+	struct ulptx_idata *idata = (struct ulptx_idata *)(req + 1);
+
+	INIT_ULPTX_WR(req, wr_len, 0, 0);
+	if (is_t4(lldi->adapter_type))
+		req->cmd = htonl(ULPTX_CMD(ULP_TX_MEM_WRITE) |
+					(ULP_MEMIO_ORDER(1)));
+	else
+		req->cmd = htonl(ULPTX_CMD(ULP_TX_MEM_WRITE) |
+					(V_T5_ULP_MEMIO_IMM(1)));
+	req->dlen = htonl(ULP_MEMIO_DATA_LEN(dlen >> 5));
+	req->lock_addr = htonl(ULP_MEMIO_ADDR(pm_addr >> 5));
+	req->len16 = htonl(DIV_ROUND_UP(wr_len - sizeof(req->wr), 16));
+
+	idata->cmd_more = htonl(ULPTX_CMD(ULP_TX_SC_IMM));
+	idata->len = htonl(dlen);
+}
+
+static int ddp_ppod_write_idata(struct cxgbi_device *cdev, unsigned int port_id,
+				struct cxgbi_pagepod_hdr *hdr, unsigned int idx,
+				unsigned int npods,
+				struct cxgbi_gather_list *gl,
+				unsigned int gl_pidx)
+{
+	struct cxgbi_ddp_info *ddp = cdev->ddp;
+	struct cxgb4_lld_info *lldi = cxgbi_cdev_priv(cdev);
+	struct sk_buff *skb;
+	struct ulp_mem_io *req;
+	struct ulptx_idata *idata;
+	struct cxgbi_pagepod *ppod;
+	unsigned int pm_addr = idx * PPOD_SIZE + ddp->llimit;
+	unsigned int dlen = PPOD_SIZE * npods;
+	unsigned int wr_len = roundup(sizeof(struct ulp_mem_io) +
+				sizeof(struct ulptx_idata) + dlen, 16);
+	unsigned int i;
+
+	skb = alloc_wr(wr_len, 0, GFP_ATOMIC);
+	if (!skb) {
+		pr_err("cdev 0x%p, idx %u, npods %u, OOM.\n",
+			cdev, idx, npods);
+		return -ENOMEM;
+	}
+	req = (struct ulp_mem_io *)skb->head;
+	set_queue(skb, CPL_PRIORITY_CONTROL, NULL);
+
+	ulp_mem_io_set_hdr(lldi, req, wr_len, dlen, pm_addr);
+	idata = (struct ulptx_idata *)(req + 1);
+	ppod = (struct cxgbi_pagepod *)(idata + 1);
+
+	for (i = 0; i < npods; i++, ppod++, gl_pidx += PPOD_PAGES_MAX) {
+		if (!hdr && !gl)
+			cxgbi_ddp_ppod_clear(ppod);
+		else
+			cxgbi_ddp_ppod_set(ppod, hdr, gl, gl_pidx);
+	}
+
+	cxgb4_ofld_send(cdev->ports[port_id], skb);
+	return 0;
+}
+
+static int ddp_set_map(struct cxgbi_sock *csk, struct cxgbi_pagepod_hdr *hdr,
+			unsigned int idx, unsigned int npods,
+			struct cxgbi_gather_list *gl)
+{
+	unsigned int i, cnt;
+	int err = 0;
+
+	for (i = 0; i < npods; i += cnt, idx += cnt) {
+		cnt = npods - i;
+		if (cnt > ULPMEM_IDATA_MAX_NPPODS)
+			cnt = ULPMEM_IDATA_MAX_NPPODS;
+		err = ddp_ppod_write_idata(csk->cdev, csk->port_id, hdr,
+					idx, cnt, gl, 4 * i);
+		if (err < 0)
+			break;
+	}
+	return err;
+}
+
+static void ddp_clear_map(struct cxgbi_hba *chba, unsigned int tag,
+			  unsigned int idx, unsigned int npods)
+{
+	unsigned int i, cnt;
+	int err;
+
+	for (i = 0; i < npods; i += cnt, idx += cnt) {
+		cnt = npods - i;
+		if (cnt > ULPMEM_IDATA_MAX_NPPODS)
+			cnt = ULPMEM_IDATA_MAX_NPPODS;
+		err = ddp_ppod_write_idata(chba->cdev, chba->port_id, NULL,
+					idx, cnt, NULL, 0);
+		if (err < 0)
+			break;
+	}
+}
+
+static int ddp_setup_conn_pgidx(struct cxgbi_sock *csk, unsigned int tid,
+				int pg_idx, bool reply)
+{
+	struct sk_buff *skb;
+	struct cpl_set_tcb_field *req;
+
+	if (!pg_idx || pg_idx >= DDP_PGIDX_MAX)
+		return 0;
+
+	skb = alloc_wr(sizeof(*req), 0, GFP_KERNEL);
+	if (!skb)
+		return -ENOMEM;
+
+	/*  set up ulp page size */
+	req = (struct cpl_set_tcb_field *)skb->head;
+	INIT_TP_WR(req, csk->tid);
+	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, csk->tid));
+	req->reply_ctrl = htons(NO_REPLY(reply) | QUEUENO(csk->rss_qid));
+	req->word_cookie = htons(0);
+	req->mask = cpu_to_be64(0x3 << 8);
+	req->val = cpu_to_be64(pg_idx << 8);
+	set_wr_txq(skb, CPL_PRIORITY_CONTROL, csk->port_id);
+
+	log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK,
+		"csk 0x%p, tid 0x%x, pg_idx %u.\n", csk, csk->tid, pg_idx);
+
+	cxgb4_ofld_send(csk->cdev->ports[csk->port_id], skb);
+	return 0;
+}
+
+static int ddp_setup_conn_digest(struct cxgbi_sock *csk, unsigned int tid,
+				 int hcrc, int dcrc, int reply)
+{
+	struct sk_buff *skb;
+	struct cpl_set_tcb_field *req;
+
+	if (!hcrc && !dcrc)
+		return 0;
+
+	skb = alloc_wr(sizeof(*req), 0, GFP_KERNEL);
+	if (!skb)
+		return -ENOMEM;
+
+	csk->hcrc_len = (hcrc ? 4 : 0);
+	csk->dcrc_len = (dcrc ? 4 : 0);
+	/*  set up ulp submode */
+	req = (struct cpl_set_tcb_field *)skb->head;
+	INIT_TP_WR(req, tid);
+	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tid));
+	req->reply_ctrl = htons(NO_REPLY(reply) | QUEUENO(csk->rss_qid));
+	req->word_cookie = htons(0);
+	req->mask = cpu_to_be64(0x3 << 4);
+	req->val = cpu_to_be64(((hcrc ? ULP_CRC_HEADER : 0) |
+				(dcrc ? ULP_CRC_DATA : 0)) << 4);
+	set_wr_txq(skb, CPL_PRIORITY_CONTROL, csk->port_id);
+
+	log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK,
+		"csk 0x%p, tid 0x%x, crc %d,%d.\n", csk, csk->tid, hcrc, dcrc);
+
+	cxgb4_ofld_send(csk->cdev->ports[csk->port_id], skb);
+	return 0;
+}
+
+static int cxgb4i_ddp_init(struct cxgbi_device *cdev)
+{
+	struct cxgb4_lld_info *lldi = cxgbi_cdev_priv(cdev);
+	struct cxgbi_ddp_info *ddp = cdev->ddp;
+	unsigned int tagmask, pgsz_factor[4];
+	int err;
+
+	if (ddp) {
+		kref_get(&ddp->refcnt);
+		pr_warn("cdev 0x%p, ddp 0x%p already set up.\n",
+			cdev, cdev->ddp);
+		return -EALREADY;
+	}
+
+	err = cxgbi_ddp_init(cdev, lldi->vr->iscsi.start,
+			lldi->vr->iscsi.start + lldi->vr->iscsi.size - 1,
+			lldi->iscsi_iolen, lldi->iscsi_iolen);
+	if (err < 0)
+		return err;
+
+	ddp = cdev->ddp;
+
+	tagmask = ddp->idx_mask << PPOD_IDX_SHIFT;
+	cxgbi_ddp_page_size_factor(pgsz_factor);
+	cxgb4_iscsi_init(lldi->ports[0], tagmask, pgsz_factor);
+
+	cdev->csk_ddp_setup_digest = ddp_setup_conn_digest;
+	cdev->csk_ddp_setup_pgidx = ddp_setup_conn_pgidx;
+	cdev->csk_ddp_set = ddp_set_map;
+	cdev->csk_ddp_clear = ddp_clear_map;
+
+	pr_info("cxgb4i 0x%p tag: sw %u, rsvd %u,%u, mask 0x%x.\n",
+		cdev, cdev->tag_format.sw_bits, cdev->tag_format.rsvd_bits,
+		cdev->tag_format.rsvd_shift, cdev->tag_format.rsvd_mask);
+	pr_info("cxgb4i 0x%p, nppods %u, bits %u, mask 0x%x,0x%x pkt %u/%u, "
+		" %u/%u.\n",
+		cdev, ddp->nppods, ddp->idx_bits, ddp->idx_mask,
+		ddp->rsvd_tag_mask, ddp->max_txsz, lldi->iscsi_iolen,
+		ddp->max_rxsz, lldi->iscsi_iolen);
+	pr_info("cxgb4i 0x%p max payload size: %u/%u, %u/%u.\n",
+		cdev, cdev->tx_max_size, ddp->max_txsz, cdev->rx_max_size,
+		ddp->max_rxsz);
+	return 0;
+}
+
+static void *t4_uld_add(const struct cxgb4_lld_info *lldi)
+{
+	struct cxgbi_device *cdev;
+	struct port_info *pi;
+	int i, rc;
+
+	cdev = cxgbi_device_register(sizeof(*lldi), lldi->nports);
+	if (!cdev) {
+		pr_info("t4 device 0x%p, register failed.\n", lldi);
+		return NULL;
+	}
+	pr_info("0x%p,0x%x, ports %u,%s, chan %u, q %u,%u, wr %u.\n",
+		cdev, lldi->adapter_type, lldi->nports,
+		lldi->ports[0]->name, lldi->nchan, lldi->ntxq,
+		lldi->nrxq, lldi->wr_cred);
+	for (i = 0; i < lldi->nrxq; i++)
+		log_debug(1 << CXGBI_DBG_DEV,
+			"t4 0x%p, rxq id #%d: %u.\n",
+			cdev, i, lldi->rxq_ids[i]);
+
+	memcpy(cxgbi_cdev_priv(cdev), lldi, sizeof(*lldi));
+	cdev->flags = CXGBI_FLAG_DEV_T4;
+	cdev->pdev = lldi->pdev;
+	cdev->ports = lldi->ports;
+	cdev->nports = lldi->nports;
+	cdev->mtus = lldi->mtus;
+	cdev->nmtus = NMTUS;
+	cdev->snd_win = cxgb4i_snd_win;
+	cdev->rcv_win = cxgb4i_rcv_win;
+	cdev->rx_credit_thres = cxgb4i_rx_credit_thres;
+	cdev->skb_tx_rsvd = CXGB4I_TX_HEADER_LEN;
+	cdev->skb_rx_extra = sizeof(struct cpl_iscsi_hdr);
+	cdev->itp = &cxgb4i_iscsi_transport;
+
+	cdev->pfvf = FW_VIID_PFN_GET(cxgb4_port_viid(lldi->ports[0])) << 8;
+	pr_info("cdev 0x%p,%s, pfvf %u.\n",
+		cdev, lldi->ports[0]->name, cdev->pfvf);
+
+	rc = cxgb4i_ddp_init(cdev);
+	if (rc) {
+		pr_info("t4 0x%p ddp init failed.\n", cdev);
+		goto err_out;
+	}
+	rc = cxgb4i_ofld_init(cdev);
+	if (rc) {
+		pr_info("t4 0x%p ofld init failed.\n", cdev);
+		goto err_out;
+	}
+
+	rc = cxgbi_hbas_add(cdev, CXGB4I_MAX_LUN, CXGBI_MAX_CONN,
+				&cxgb4i_host_template, cxgb4i_stt);
+	if (rc)
+		goto err_out;
+
+	for (i = 0; i < cdev->nports; i++) {
+		pi = netdev_priv(lldi->ports[i]);
+		cdev->hbas[i]->port_id = pi->port_id;
+	}
+	return cdev;
+
+err_out:
+	cxgbi_device_unregister(cdev);
+	return ERR_PTR(-ENOMEM);
+}
+
+#define RX_PULL_LEN	128
+static int t4_uld_rx_handler(void *handle, const __be64 *rsp,
+				const struct pkt_gl *pgl)
+{
+	const struct cpl_act_establish *rpl;
+	struct sk_buff *skb;
+	unsigned int opc;
+	struct cxgbi_device *cdev = handle;
+
+	if (pgl == NULL) {
+		unsigned int len = 64 - sizeof(struct rsp_ctrl) - 8;
+
+		skb = alloc_wr(len, 0, GFP_ATOMIC);
+		if (!skb)
+			goto nomem;
+		skb_copy_to_linear_data(skb, &rsp[1], len);
+	} else {
+		if (unlikely(*(u8 *)rsp != *(u8 *)pgl->va)) {
+			pr_info("? FL 0x%p,RSS%#llx,FL %#llx,len %u.\n",
+				pgl->va, be64_to_cpu(*rsp),
+				be64_to_cpu(*(u64 *)pgl->va),
+				pgl->tot_len);
+			return 0;
+		}
+		skb = cxgb4_pktgl_to_skb(pgl, RX_PULL_LEN, RX_PULL_LEN);
+		if (unlikely(!skb))
+			goto nomem;
+	}
+
+	rpl = (struct cpl_act_establish *)skb->data;
+	opc = rpl->ot.opcode;
+	log_debug(1 << CXGBI_DBG_TOE,
+		"cdev %p, opcode 0x%x(0x%x,0x%x), skb %p.\n",
+		 cdev, opc, rpl->ot.opcode_tid, ntohl(rpl->ot.opcode_tid), skb);
+	if (cxgb4i_cplhandlers[opc])
+		cxgb4i_cplhandlers[opc](cdev, skb);
+	else {
+		pr_err("No handler for opcode 0x%x.\n", opc);
+		__kfree_skb(skb);
+	}
+	return 0;
+nomem:
+	log_debug(1 << CXGBI_DBG_TOE, "OOM bailing out.\n");
+	return 1;
+}
+
+static int t4_uld_state_change(void *handle, enum cxgb4_state state)
+{
+	struct cxgbi_device *cdev = handle;
+
+	switch (state) {
+	case CXGB4_STATE_UP:
+		pr_info("cdev 0x%p, UP.\n", cdev);
+		break;
+	case CXGB4_STATE_START_RECOVERY:
+		pr_info("cdev 0x%p, RECOVERY.\n", cdev);
+		/* close all connections */
+		break;
+	case CXGB4_STATE_DOWN:
+		pr_info("cdev 0x%p, DOWN.\n", cdev);
+		break;
+	case CXGB4_STATE_DETACH:
+		pr_info("cdev 0x%p, DETACH.\n", cdev);
+		cxgbi_device_unregister(cdev);
+		break;
+	default:
+		pr_info("cdev 0x%p, unknown state %d.\n", cdev, state);
+		break;
+	}
+	return 0;
+}
+
+static int __init cxgb4i_init_module(void)
+{
+	int rc;
+
+	printk(KERN_INFO "%s", version);
+
+	rc = cxgbi_iscsi_init(&cxgb4i_iscsi_transport, &cxgb4i_stt);
+	if (rc < 0)
+		return rc;
+	cxgb4_register_uld(CXGB4_ULD_ISCSI, &cxgb4i_uld_info);
+
+	return 0;
+}
+
+static void __exit cxgb4i_exit_module(void)
+{
+	cxgb4_unregister_uld(CXGB4_ULD_ISCSI);
+	cxgbi_device_unregister_all(CXGBI_FLAG_DEV_T4);
+	cxgbi_iscsi_cleanup(&cxgb4i_iscsi_transport, &cxgb4i_stt);
+}
+
+module_init(cxgb4i_init_module);
+module_exit(cxgb4i_exit_module);
diff --git a/drivers/scsi/cxgbi/cxgb4i/cxgb4i.h b/drivers/scsi/cxgbi/cxgb4i/cxgb4i.h
new file mode 100644
index 0000000..1096026
--- /dev/null
+++ b/drivers/scsi/cxgbi/cxgb4i/cxgb4i.h
@@ -0,0 +1,43 @@
+/*
+ * cxgb4i.h: Chelsio T4 iSCSI driver.
+ *
+ * Copyright (c) 2010 Chelsio Communications, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.
+ *
+ * Written by: Karen Xie (kxie@chelsio.com)
+ * Written by: Rakesh Ranjan (rranjan@chelsio.com)
+ */
+
+#ifndef	__CXGB4I_H__
+#define	__CXGB4I_H__
+
+#define	CXGB4I_SCSI_HOST_QDEPTH	1024
+#define	CXGB4I_MAX_CONN		16384
+#define	CXGB4I_MAX_TARGET	CXGB4I_MAX_CONN
+#define	CXGB4I_MAX_LUN		0x1000
+
+/* for TX: a skb must have a headroom of at least TX_HEADER_LEN bytes */
+#define CXGB4I_TX_HEADER_LEN \
+	(sizeof(struct fw_ofld_tx_data_wr) + sizeof(struct sge_opaque_hdr))
+
+struct ulptx_idata {
+	__be32 cmd_more;
+	__be32 len;
+};
+
+struct cpl_rx_data_ddp {
+	union opcode_tid ot;
+	__be16 urg;
+	__be16 len;
+	__be32 seq;
+	union {
+		__be32 nxt_seq;
+		__be32 ddp_report;
+	};
+	__be32 ulp_crc;
+	__be32 ddpvld;
+};
+#endif	/* __CXGB4I_H__ */
diff --git a/drivers/scsi/cxgbi/libcxgbi.c b/drivers/scsi/cxgbi/libcxgbi.c
new file mode 100644
index 0000000..7da59c3
--- /dev/null
+++ b/drivers/scsi/cxgbi/libcxgbi.c
@@ -0,0 +1,2929 @@
+/*
+ * libcxgbi.c: Chelsio common library for T3/T4 iSCSI driver.
+ *
+ * Copyright (c) 2010 Chelsio Communications, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.
+ *
+ * Written by: Karen Xie (kxie@chelsio.com)
+ * Written by: Rakesh Ranjan (rranjan@chelsio.com)
+ */
+
+#define pr_fmt(fmt)	KBUILD_MODNAME ":%s: " fmt, __func__
+
+#include <linux/skbuff.h>
+#include <linux/crypto.h>
+#include <linux/scatterlist.h>
+#include <linux/pci.h>
+#include <scsi/scsi.h>
+#include <scsi/scsi_cmnd.h>
+#include <scsi/scsi_host.h>
+#include <linux/if_vlan.h>
+#include <linux/inet.h>
+#include <net/dst.h>
+#include <net/route.h>
+#include <net/ipv6.h>
+#include <net/ip6_route.h>
+#include <net/addrconf.h>
+
+#include <linux/inetdevice.h>	/* ip_dev_find */
+#include <linux/module.h>
+#include <net/tcp.h>
+
+static unsigned int dbg_level;
+
+#include "libcxgbi.h"
+
+#define DRV_MODULE_NAME		"libcxgbi"
+#define DRV_MODULE_DESC		"Chelsio iSCSI driver library"
+#define DRV_MODULE_VERSION	"0.9.0"
+#define DRV_MODULE_RELDATE	"Jun. 2010"
+
+MODULE_AUTHOR("Chelsio Communications, Inc.");
+MODULE_DESCRIPTION(DRV_MODULE_DESC);
+MODULE_VERSION(DRV_MODULE_VERSION);
+MODULE_LICENSE("GPL");
+
+module_param(dbg_level, uint, 0644);
+MODULE_PARM_DESC(dbg_level, "libiscsi debug level (default=0)");
+
+
+/*
+ * cxgbi device management
+ * maintains a list of the cxgbi devices
+ */
+static LIST_HEAD(cdev_list);
+static DEFINE_MUTEX(cdev_mutex);
+
+static LIST_HEAD(cdev_rcu_list);
+static DEFINE_SPINLOCK(cdev_rcu_lock);
+
+int cxgbi_device_portmap_create(struct cxgbi_device *cdev, unsigned int base,
+				unsigned int max_conn)
+{
+	struct cxgbi_ports_map *pmap = &cdev->pmap;
+
+	pmap->port_csk = cxgbi_alloc_big_mem(max_conn *
+					     sizeof(struct cxgbi_sock *),
+					     GFP_KERNEL);
+	if (!pmap->port_csk) {
+		pr_warn("cdev 0x%p, portmap OOM %u.\n", cdev, max_conn);
+		return -ENOMEM;
+	}
+
+	pmap->max_connect = max_conn;
+	pmap->sport_base = base;
+	spin_lock_init(&pmap->lock);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(cxgbi_device_portmap_create);
+
+void cxgbi_device_portmap_cleanup(struct cxgbi_device *cdev)
+{
+	struct cxgbi_ports_map *pmap = &cdev->pmap;
+	struct cxgbi_sock *csk;
+	int i;
+
+	for (i = 0; i < pmap->max_connect; i++) {
+		if (pmap->port_csk[i]) {
+			csk = pmap->port_csk[i];
+			pmap->port_csk[i] = NULL;
+			log_debug(1 << CXGBI_DBG_SOCK,
+				"csk 0x%p, cdev 0x%p, offload down.\n",
+				csk, cdev);
+			spin_lock_bh(&csk->lock);
+			cxgbi_sock_set_flag(csk, CTPF_OFFLOAD_DOWN);
+			cxgbi_sock_closed(csk);
+			spin_unlock_bh(&csk->lock);
+			cxgbi_sock_put(csk);
+		}
+	}
+}
+EXPORT_SYMBOL_GPL(cxgbi_device_portmap_cleanup);
+
+static inline void cxgbi_device_destroy(struct cxgbi_device *cdev)
+{
+	log_debug(1 << CXGBI_DBG_DEV,
+		"cdev 0x%p, p# %u.\n", cdev, cdev->nports);
+	cxgbi_hbas_remove(cdev);
+	cxgbi_device_portmap_cleanup(cdev);
+	if (cdev->dev_ddp_cleanup)
+		cdev->dev_ddp_cleanup(cdev);
+	else
+		cxgbi_ddp_cleanup(cdev);
+	if (cdev->ddp)
+		cxgbi_ddp_cleanup(cdev);
+	if (cdev->pmap.max_connect)
+		cxgbi_free_big_mem(cdev->pmap.port_csk);
+	kfree(cdev);
+}
+
+struct cxgbi_device *cxgbi_device_register(unsigned int extra,
+					   unsigned int nports)
+{
+	struct cxgbi_device *cdev;
+
+	cdev = kzalloc(sizeof(*cdev) + extra + nports *
+			(sizeof(struct cxgbi_hba *) +
+			 sizeof(struct net_device *)),
+			GFP_KERNEL);
+	if (!cdev) {
+		pr_warn("nport %d, OOM.\n", nports);
+		return NULL;
+	}
+	cdev->ports = (struct net_device **)(cdev + 1);
+	cdev->hbas = (struct cxgbi_hba **)(((char*)cdev->ports) + nports *
+						sizeof(struct net_device *));
+	if (extra)
+		cdev->dd_data = ((char *)cdev->hbas) +
+				nports * sizeof(struct cxgbi_hba *);
+	spin_lock_init(&cdev->pmap.lock);
+
+	mutex_lock(&cdev_mutex);
+	list_add_tail(&cdev->list_head, &cdev_list);
+	mutex_unlock(&cdev_mutex);
+
+	spin_lock(&cdev_rcu_lock);
+	list_add_tail_rcu(&cdev->rcu_node, &cdev_rcu_list);
+	spin_unlock(&cdev_rcu_lock);
+
+	log_debug(1 << CXGBI_DBG_DEV,
+		"cdev 0x%p, p# %u.\n", cdev, nports);
+	return cdev;
+}
+EXPORT_SYMBOL_GPL(cxgbi_device_register);
+
+void cxgbi_device_unregister(struct cxgbi_device *cdev)
+{
+	log_debug(1 << CXGBI_DBG_DEV,
+		"cdev 0x%p, p# %u,%s.\n",
+		cdev, cdev->nports, cdev->nports ? cdev->ports[0]->name : "");
+
+	mutex_lock(&cdev_mutex);
+	list_del(&cdev->list_head);
+	mutex_unlock(&cdev_mutex);
+
+	spin_lock(&cdev_rcu_lock);
+	list_del_rcu(&cdev->rcu_node);
+	spin_unlock(&cdev_rcu_lock);
+	synchronize_rcu();
+
+	cxgbi_device_destroy(cdev);
+}
+EXPORT_SYMBOL_GPL(cxgbi_device_unregister);
+
+void cxgbi_device_unregister_all(unsigned int flag)
+{
+	struct cxgbi_device *cdev, *tmp;
+
+	mutex_lock(&cdev_mutex);
+	list_for_each_entry_safe(cdev, tmp, &cdev_list, list_head) {
+		if ((cdev->flags & flag) == flag) {
+			mutex_unlock(&cdev_mutex);
+			cxgbi_device_unregister(cdev);
+			mutex_lock(&cdev_mutex);
+		}
+	}
+	mutex_unlock(&cdev_mutex);
+}
+EXPORT_SYMBOL_GPL(cxgbi_device_unregister_all);
+
+struct cxgbi_device *cxgbi_device_find_by_lldev(void *lldev)
+{
+	struct cxgbi_device *cdev, *tmp;
+
+	mutex_lock(&cdev_mutex);
+	list_for_each_entry_safe(cdev, tmp, &cdev_list, list_head) {
+		if (cdev->lldev == lldev) {
+			mutex_unlock(&cdev_mutex);
+			return cdev;
+		}
+	}
+	mutex_unlock(&cdev_mutex);
+
+	log_debug(1 << CXGBI_DBG_DEV,
+		"lldev 0x%p, NO match found.\n", lldev);
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(cxgbi_device_find_by_lldev);
+
+struct cxgbi_device *cxgbi_device_find_by_netdev(struct net_device *ndev,
+						 int *port)
+{
+	struct net_device *vdev = NULL;
+	struct cxgbi_device *cdev, *tmp;
+	int i;
+
+	if (ndev->priv_flags & IFF_802_1Q_VLAN) {
+		vdev = ndev;
+		ndev = vlan_dev_real_dev(ndev);
+		log_debug(1 << CXGBI_DBG_DEV,
+			"vlan dev %s -> %s.\n", vdev->name, ndev->name);
+	}
+
+	mutex_lock(&cdev_mutex);
+	list_for_each_entry_safe(cdev, tmp, &cdev_list, list_head) {
+		for (i = 0; i < cdev->nports; i++) {
+			if (ndev == cdev->ports[i]) {
+				cdev->hbas[i]->vdev = vdev;
+				mutex_unlock(&cdev_mutex);
+				if (port)
+					*port = i;
+				return cdev;
+			}
+		}
+	}
+	mutex_unlock(&cdev_mutex);
+	log_debug(1 << CXGBI_DBG_DEV,
+		"ndev 0x%p, %s, NO match found.\n", ndev, ndev->name);
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(cxgbi_device_find_by_netdev);
+
+struct cxgbi_device *cxgbi_device_find_by_netdev_rcu(struct net_device *ndev,
+						     int *port)
+{
+	struct net_device *vdev = NULL;
+	struct cxgbi_device *cdev;
+	int i;
+
+	if (ndev->priv_flags & IFF_802_1Q_VLAN) {
+		vdev = ndev;
+		ndev = vlan_dev_real_dev(ndev);
+		pr_info("vlan dev %s -> %s.\n", vdev->name, ndev->name);
+	}
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(cdev, &cdev_rcu_list, rcu_node) {
+		for (i = 0; i < cdev->nports; i++) {
+			if (ndev == cdev->ports[i]) {
+				cdev->hbas[i]->vdev = vdev;
+				rcu_read_unlock();
+				if (port)
+					*port = i;
+				return cdev;
+			}
+		}
+	}
+	rcu_read_unlock();
+
+	log_debug(1 << CXGBI_DBG_DEV,
+		  "ndev 0x%p, %s, NO match found.\n", ndev, ndev->name);
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(cxgbi_device_find_by_netdev_rcu);
+
+#if IS_ENABLED(CONFIG_IPV6)
+static struct cxgbi_device *cxgbi_device_find_by_mac(struct net_device *ndev,
+						     int *port)
+{
+	struct net_device *vdev = NULL;
+	struct cxgbi_device *cdev, *tmp;
+	int i;
+
+	if (ndev->priv_flags & IFF_802_1Q_VLAN) {
+		vdev = ndev;
+		ndev = vlan_dev_real_dev(ndev);
+		pr_info("vlan dev %s -> %s.\n", vdev->name, ndev->name);
+	}
+
+	mutex_lock(&cdev_mutex);
+	list_for_each_entry_safe(cdev, tmp, &cdev_list, list_head) {
+		for (i = 0; i < cdev->nports; i++) {
+			if (!memcmp(ndev->dev_addr, cdev->ports[i]->dev_addr,
+				    MAX_ADDR_LEN)) {
+				cdev->hbas[i]->vdev = vdev;
+				mutex_unlock(&cdev_mutex);
+				if (port)
+					*port = i;
+				return cdev;
+			}
+		}
+	}
+	mutex_unlock(&cdev_mutex);
+	log_debug(1 << CXGBI_DBG_DEV,
+		  "ndev 0x%p, %s, NO match mac found.\n",
+		  ndev, ndev->name);
+	return NULL;
+}
+#endif
+
+void cxgbi_hbas_remove(struct cxgbi_device *cdev)
+{
+	int i;
+	struct cxgbi_hba *chba;
+
+	log_debug(1 << CXGBI_DBG_DEV,
+		"cdev 0x%p, p#%u.\n", cdev, cdev->nports);
+
+	for (i = 0; i < cdev->nports; i++) {
+		chba = cdev->hbas[i];
+		if (chba) {
+			cdev->hbas[i] = NULL;
+			iscsi_host_remove(chba->shost);
+			pci_dev_put(cdev->pdev);
+			iscsi_host_free(chba->shost);
+		}
+	}
+}
+EXPORT_SYMBOL_GPL(cxgbi_hbas_remove);
+
+int cxgbi_hbas_add(struct cxgbi_device *cdev, u64 max_lun,
+		unsigned int max_id, struct scsi_host_template *sht,
+		struct scsi_transport_template *stt)
+{
+	struct cxgbi_hba *chba;
+	struct Scsi_Host *shost;
+	int i, err;
+
+	log_debug(1 << CXGBI_DBG_DEV, "cdev 0x%p, p#%u.\n", cdev, cdev->nports);
+
+	for (i = 0; i < cdev->nports; i++) {
+		shost = iscsi_host_alloc(sht, sizeof(*chba), 1);
+		if (!shost) {
+			pr_info("0x%p, p%d, %s, host alloc failed.\n",
+				cdev, i, cdev->ports[i]->name);
+			err = -ENOMEM;
+			goto err_out;
+		}
+
+		shost->transportt = stt;
+		shost->max_lun = max_lun;
+		shost->max_id = max_id;
+		shost->max_channel = 0;
+		shost->max_cmd_len = 16;
+
+		chba = iscsi_host_priv(shost);
+		chba->cdev = cdev;
+		chba->ndev = cdev->ports[i];
+		chba->shost = shost;
+
+		log_debug(1 << CXGBI_DBG_DEV,
+			"cdev 0x%p, p#%d %s: chba 0x%p.\n",
+			cdev, i, cdev->ports[i]->name, chba);
+
+		pci_dev_get(cdev->pdev);
+		err = iscsi_host_add(shost, &cdev->pdev->dev);
+		if (err) {
+			pr_info("cdev 0x%p, p#%d %s, host add failed.\n",
+				cdev, i, cdev->ports[i]->name);
+			pci_dev_put(cdev->pdev);
+			scsi_host_put(shost);
+			goto  err_out;
+		}
+
+		cdev->hbas[i] = chba;
+	}
+
+	return 0;
+
+err_out:
+	cxgbi_hbas_remove(cdev);
+	return err;
+}
+EXPORT_SYMBOL_GPL(cxgbi_hbas_add);
+
+/*
+ * iSCSI offload
+ *
+ * - source port management
+ *   To find a free source port in the port allocation map we use a very simple
+ *   rotor scheme to look for the next free port.
+ *
+ *   If a source port has been specified make sure that it doesn't collide with
+ *   our normal source port allocation map.  If it's outside the range of our
+ *   allocation/deallocation scheme just let them use it.
+ *
+ *   If the source port is outside our allocation range, the caller is
+ *   responsible for keeping track of their port usage.
+ */
+
+static struct cxgbi_sock *find_sock_on_port(struct cxgbi_device *cdev,
+					    unsigned char port_id)
+{
+	struct cxgbi_ports_map *pmap = &cdev->pmap;
+	unsigned int i;
+	unsigned int used;
+
+	if (!pmap->max_connect || !pmap->used)
+		return NULL;
+
+	spin_lock_bh(&pmap->lock);
+	used = pmap->used;
+	for (i = 0; used && i < pmap->max_connect; i++) {
+		struct cxgbi_sock *csk = pmap->port_csk[i];
+
+		if (csk) {
+			if (csk->port_id == port_id) {
+				spin_unlock_bh(&pmap->lock);
+				return csk;
+			}
+			used--;
+		}
+	}
+	spin_unlock_bh(&pmap->lock);
+
+	return NULL;
+}
+
+static int sock_get_port(struct cxgbi_sock *csk)
+{
+	struct cxgbi_device *cdev = csk->cdev;
+	struct cxgbi_ports_map *pmap = &cdev->pmap;
+	unsigned int start;
+	int idx;
+	__be16 *port;
+
+	if (!pmap->max_connect) {
+		pr_err("cdev 0x%p, p#%u %s, NO port map.\n",
+			   cdev, csk->port_id, cdev->ports[csk->port_id]->name);
+		return -EADDRNOTAVAIL;
+	}
+
+	if (csk->csk_family == AF_INET)
+		port = &csk->saddr.sin_port;
+	else /* ipv6 */
+		port = &csk->saddr6.sin6_port;
+
+	if (*port) {
+		pr_err("source port NON-ZERO %u.\n",
+			ntohs(*port));
+		return -EADDRINUSE;
+	}
+
+	spin_lock_bh(&pmap->lock);
+	if (pmap->used >= pmap->max_connect) {
+		spin_unlock_bh(&pmap->lock);
+		pr_info("cdev 0x%p, p#%u %s, ALL ports used.\n",
+			cdev, csk->port_id, cdev->ports[csk->port_id]->name);
+		return -EADDRNOTAVAIL;
+	}
+
+	start = idx = pmap->next;
+	do {
+		if (++idx >= pmap->max_connect)
+			idx = 0;
+		if (!pmap->port_csk[idx]) {
+			pmap->used++;
+			*port = htons(pmap->sport_base + idx);
+			pmap->next = idx;
+			pmap->port_csk[idx] = csk;
+			spin_unlock_bh(&pmap->lock);
+			cxgbi_sock_get(csk);
+			log_debug(1 << CXGBI_DBG_SOCK,
+				"cdev 0x%p, p#%u %s, p %u, %u.\n",
+				cdev, csk->port_id,
+				cdev->ports[csk->port_id]->name,
+				pmap->sport_base + idx, pmap->next);
+			return 0;
+		}
+	} while (idx != start);
+	spin_unlock_bh(&pmap->lock);
+
+	/* should not happen */
+	pr_warn("cdev 0x%p, p#%u %s, next %u?\n",
+		cdev, csk->port_id, cdev->ports[csk->port_id]->name,
+		pmap->next);
+	return -EADDRNOTAVAIL;
+}
+
+static void sock_put_port(struct cxgbi_sock *csk)
+{
+	struct cxgbi_device *cdev = csk->cdev;
+	struct cxgbi_ports_map *pmap = &cdev->pmap;
+	__be16 *port;
+
+	if (csk->csk_family == AF_INET)
+		port = &csk->saddr.sin_port;
+	else /* ipv6 */
+		port = &csk->saddr6.sin6_port;
+
+	if (*port) {
+		int idx = ntohs(*port) - pmap->sport_base;
+
+		*port = 0;
+		if (idx < 0 || idx >= pmap->max_connect) {
+			pr_err("cdev 0x%p, p#%u %s, port %u OOR.\n",
+				cdev, csk->port_id,
+				cdev->ports[csk->port_id]->name,
+				ntohs(*port));
+			return;
+		}
+
+		spin_lock_bh(&pmap->lock);
+		pmap->port_csk[idx] = NULL;
+		pmap->used--;
+		spin_unlock_bh(&pmap->lock);
+
+		log_debug(1 << CXGBI_DBG_SOCK,
+			"cdev 0x%p, p#%u %s, release %u.\n",
+			cdev, csk->port_id, cdev->ports[csk->port_id]->name,
+			pmap->sport_base + idx);
+
+		cxgbi_sock_put(csk);
+	}
+}
+
+/*
+ * iscsi tcp connection
+ */
+void cxgbi_sock_free_cpl_skbs(struct cxgbi_sock *csk)
+{
+	if (csk->cpl_close) {
+		kfree_skb(csk->cpl_close);
+		csk->cpl_close = NULL;
+	}
+	if (csk->cpl_abort_req) {
+		kfree_skb(csk->cpl_abort_req);
+		csk->cpl_abort_req = NULL;
+	}
+	if (csk->cpl_abort_rpl) {
+		kfree_skb(csk->cpl_abort_rpl);
+		csk->cpl_abort_rpl = NULL;
+	}
+}
+EXPORT_SYMBOL_GPL(cxgbi_sock_free_cpl_skbs);
+
+static struct cxgbi_sock *cxgbi_sock_create(struct cxgbi_device *cdev)
+{
+	struct cxgbi_sock *csk = kzalloc(sizeof(*csk), GFP_NOIO);
+
+	if (!csk) {
+		pr_info("alloc csk %zu failed.\n", sizeof(*csk));
+		return NULL;
+	}
+
+	if (cdev->csk_alloc_cpls(csk) < 0) {
+		pr_info("csk 0x%p, alloc cpls failed.\n", csk);
+		kfree(csk);
+		return NULL;
+	}
+
+	spin_lock_init(&csk->lock);
+	kref_init(&csk->refcnt);
+	skb_queue_head_init(&csk->receive_queue);
+	skb_queue_head_init(&csk->write_queue);
+	setup_timer(&csk->retry_timer, NULL, (unsigned long)csk);
+	rwlock_init(&csk->callback_lock);
+	csk->cdev = cdev;
+	csk->flags = 0;
+	cxgbi_sock_set_state(csk, CTP_CLOSED);
+
+	log_debug(1 << CXGBI_DBG_SOCK, "cdev 0x%p, new csk 0x%p.\n", cdev, csk);
+
+	return csk;
+}
+
+static struct rtable *find_route_ipv4(struct flowi4 *fl4,
+				      __be32 saddr, __be32 daddr,
+				      __be16 sport, __be16 dport, u8 tos)
+{
+	struct rtable *rt;
+
+	rt = ip_route_output_ports(&init_net, fl4, NULL, daddr, saddr,
+				   dport, sport, IPPROTO_TCP, tos, 0);
+	if (IS_ERR(rt))
+		return NULL;
+
+	return rt;
+}
+
+static struct cxgbi_sock *cxgbi_check_route(struct sockaddr *dst_addr)
+{
+	struct sockaddr_in *daddr = (struct sockaddr_in *)dst_addr;
+	struct dst_entry *dst;
+	struct net_device *ndev;
+	struct cxgbi_device *cdev;
+	struct rtable *rt = NULL;
+	struct neighbour *n;
+	struct flowi4 fl4;
+	struct cxgbi_sock *csk = NULL;
+	unsigned int mtu = 0;
+	int port = 0xFFFF;
+	int err = 0;
+
+	rt = find_route_ipv4(&fl4, 0, daddr->sin_addr.s_addr, 0, daddr->sin_port, 0);
+	if (!rt) {
+		pr_info("no route to ipv4 0x%x, port %u.\n",
+			be32_to_cpu(daddr->sin_addr.s_addr),
+			be16_to_cpu(daddr->sin_port));
+		err = -ENETUNREACH;
+		goto err_out;
+	}
+	dst = &rt->dst;
+	n = dst_neigh_lookup(dst, &daddr->sin_addr.s_addr);
+	if (!n) {
+		err = -ENODEV;
+		goto rel_rt;
+	}
+	ndev = n->dev;
+
+	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
+		pr_info("multi-cast route %pI4, port %u, dev %s.\n",
+			&daddr->sin_addr.s_addr, ntohs(daddr->sin_port),
+			ndev->name);
+		err = -ENETUNREACH;
+		goto rel_neigh;
+	}
+
+	if (ndev->flags & IFF_LOOPBACK) {
+		ndev = ip_dev_find(&init_net, daddr->sin_addr.s_addr);
+		mtu = ndev->mtu;
+		pr_info("rt dev %s, loopback -> %s, mtu %u.\n",
+			n->dev->name, ndev->name, mtu);
+	}
+
+	cdev = cxgbi_device_find_by_netdev(ndev, &port);
+	if (!cdev) {
+		pr_info("dst %pI4, %s, NOT cxgbi device.\n",
+			&daddr->sin_addr.s_addr, ndev->name);
+		err = -ENETUNREACH;
+		goto rel_neigh;
+	}
+	log_debug(1 << CXGBI_DBG_SOCK,
+		"route to %pI4 :%u, ndev p#%d,%s, cdev 0x%p.\n",
+		&daddr->sin_addr.s_addr, ntohs(daddr->sin_port),
+			   port, ndev->name, cdev);
+
+	csk = cxgbi_sock_create(cdev);
+	if (!csk) {
+		err = -ENOMEM;
+		goto rel_neigh;
+	}
+	csk->cdev = cdev;
+	csk->port_id = port;
+	csk->mtu = mtu;
+	csk->dst = dst;
+
+	csk->csk_family = AF_INET;
+	csk->daddr.sin_addr.s_addr = daddr->sin_addr.s_addr;
+	csk->daddr.sin_port = daddr->sin_port;
+	csk->daddr.sin_family = daddr->sin_family;
+	csk->saddr.sin_family = daddr->sin_family;
+	csk->saddr.sin_addr.s_addr = fl4.saddr;
+	neigh_release(n);
+
+	return csk;
+
+rel_neigh:
+	neigh_release(n);
+
+rel_rt:
+	ip_rt_put(rt);
+	if (csk)
+		cxgbi_sock_closed(csk);
+err_out:
+	return ERR_PTR(err);
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static struct rt6_info *find_route_ipv6(const struct in6_addr *saddr,
+					const struct in6_addr *daddr)
+{
+	struct flowi6 fl;
+
+	if (saddr)
+		memcpy(&fl.saddr, saddr, sizeof(struct in6_addr));
+	if (daddr)
+		memcpy(&fl.daddr, daddr, sizeof(struct in6_addr));
+	return (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
+}
+
+static struct cxgbi_sock *cxgbi_check_route6(struct sockaddr *dst_addr)
+{
+	struct sockaddr_in6 *daddr6 = (struct sockaddr_in6 *)dst_addr;
+	struct dst_entry *dst;
+	struct net_device *ndev;
+	struct cxgbi_device *cdev;
+	struct rt6_info *rt = NULL;
+	struct neighbour *n;
+	struct in6_addr pref_saddr;
+	struct cxgbi_sock *csk = NULL;
+	unsigned int mtu = 0;
+	int port = 0xFFFF;
+	int err = 0;
+
+	rt = find_route_ipv6(NULL, &daddr6->sin6_addr);
+
+	if (!rt) {
+		pr_info("no route to ipv6 %pI6 port %u\n",
+			daddr6->sin6_addr.s6_addr,
+			be16_to_cpu(daddr6->sin6_port));
+		err = -ENETUNREACH;
+		goto err_out;
+	}
+
+	dst = &rt->dst;
+
+	n = dst_neigh_lookup(dst, &daddr6->sin6_addr);
+
+	if (!n) {
+		pr_info("%pI6, port %u, dst no neighbour.\n",
+			daddr6->sin6_addr.s6_addr,
+			be16_to_cpu(daddr6->sin6_port));
+		err = -ENETUNREACH;
+		goto rel_rt;
+	}
+	ndev = n->dev;
+
+	if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
+		pr_info("multi-cast route %pI6 port %u, dev %s.\n",
+			daddr6->sin6_addr.s6_addr,
+			ntohs(daddr6->sin6_port), ndev->name);
+		err = -ENETUNREACH;
+		goto rel_rt;
+	}
+
+	cdev = cxgbi_device_find_by_netdev(ndev, &port);
+	if (!cdev)
+		cdev = cxgbi_device_find_by_mac(ndev, &port);
+	if (!cdev) {
+		pr_info("dst %pI6 %s, NOT cxgbi device.\n",
+			daddr6->sin6_addr.s6_addr, ndev->name);
+		err = -ENETUNREACH;
+		goto rel_rt;
+	}
+	log_debug(1 << CXGBI_DBG_SOCK,
+		  "route to %pI6 :%u, ndev p#%d,%s, cdev 0x%p.\n",
+		  daddr6->sin6_addr.s6_addr, ntohs(daddr6->sin6_port), port,
+		  ndev->name, cdev);
+
+	csk = cxgbi_sock_create(cdev);
+	if (!csk) {
+		err = -ENOMEM;
+		goto rel_rt;
+	}
+	csk->cdev = cdev;
+	csk->port_id = port;
+	csk->mtu = mtu;
+	csk->dst = dst;
+
+	if (ipv6_addr_any(&rt->rt6i_prefsrc.addr)) {
+		struct inet6_dev *idev = ip6_dst_idev((struct dst_entry *)rt);
+
+		err = ipv6_dev_get_saddr(&init_net, idev ? idev->dev : NULL,
+					 &daddr6->sin6_addr, 0, &pref_saddr);
+		if (err) {
+			pr_info("failed to get source address to reach %pI6\n",
+				&daddr6->sin6_addr);
+			goto rel_rt;
+		}
+	} else {
+		pref_saddr = rt->rt6i_prefsrc.addr;
+	}
+
+	csk->csk_family = AF_INET6;
+	csk->daddr6.sin6_addr = daddr6->sin6_addr;
+	csk->daddr6.sin6_port = daddr6->sin6_port;
+	csk->daddr6.sin6_family = daddr6->sin6_family;
+	csk->saddr6.sin6_family = daddr6->sin6_family;
+	csk->saddr6.sin6_addr = pref_saddr;
+
+	neigh_release(n);
+	return csk;
+
+rel_rt:
+	if (n)
+		neigh_release(n);
+
+	ip6_rt_put(rt);
+	if (csk)
+		cxgbi_sock_closed(csk);
+err_out:
+	return ERR_PTR(err);
+}
+#endif /* IS_ENABLED(CONFIG_IPV6) */
+
+void cxgbi_sock_established(struct cxgbi_sock *csk, unsigned int snd_isn,
+			unsigned int opt)
+{
+	csk->write_seq = csk->snd_nxt = csk->snd_una = snd_isn;
+	dst_confirm(csk->dst);
+	smp_mb();
+	cxgbi_sock_set_state(csk, CTP_ESTABLISHED);
+}
+EXPORT_SYMBOL_GPL(cxgbi_sock_established);
+
+static void cxgbi_inform_iscsi_conn_closing(struct cxgbi_sock *csk)
+{
+	log_debug(1 << CXGBI_DBG_SOCK,
+		"csk 0x%p, state %u, flags 0x%lx, conn 0x%p.\n",
+		csk, csk->state, csk->flags, csk->user_data);
+
+	if (csk->state != CTP_ESTABLISHED) {
+		read_lock_bh(&csk->callback_lock);
+		if (csk->user_data)
+			iscsi_conn_failure(csk->user_data,
+					ISCSI_ERR_TCP_CONN_CLOSE);
+		read_unlock_bh(&csk->callback_lock);
+	}
+}
+
+void cxgbi_sock_closed(struct cxgbi_sock *csk)
+{
+	log_debug(1 << CXGBI_DBG_SOCK, "csk 0x%p,%u,0x%lx,%u.\n",
+		csk, (csk)->state, (csk)->flags, (csk)->tid);
+	cxgbi_sock_set_flag(csk, CTPF_ACTIVE_CLOSE_NEEDED);
+	if (csk->state == CTP_ACTIVE_OPEN || csk->state == CTP_CLOSED)
+		return;
+	if (csk->saddr.sin_port)
+		sock_put_port(csk);
+	if (csk->dst)
+		dst_release(csk->dst);
+	csk->cdev->csk_release_offload_resources(csk);
+	cxgbi_sock_set_state(csk, CTP_CLOSED);
+	cxgbi_inform_iscsi_conn_closing(csk);
+	cxgbi_sock_put(csk);
+}
+EXPORT_SYMBOL_GPL(cxgbi_sock_closed);
+
+static void need_active_close(struct cxgbi_sock *csk)
+{
+	int data_lost;
+	int close_req = 0;
+
+	log_debug(1 << CXGBI_DBG_SOCK, "csk 0x%p,%u,0x%lx,%u.\n",
+		csk, (csk)->state, (csk)->flags, (csk)->tid);
+	spin_lock_bh(&csk->lock);
+	dst_confirm(csk->dst);
+	data_lost = skb_queue_len(&csk->receive_queue);
+	__skb_queue_purge(&csk->receive_queue);
+
+	if (csk->state == CTP_ACTIVE_OPEN)
+		cxgbi_sock_set_flag(csk, CTPF_ACTIVE_CLOSE_NEEDED);
+	else if (csk->state == CTP_ESTABLISHED) {
+		close_req = 1;
+		cxgbi_sock_set_state(csk, CTP_ACTIVE_CLOSE);
+	} else if (csk->state == CTP_PASSIVE_CLOSE) {
+		close_req = 1;
+		cxgbi_sock_set_state(csk, CTP_CLOSE_WAIT_2);
+	}
+
+	if (close_req) {
+		if (data_lost)
+			csk->cdev->csk_send_abort_req(csk);
+		else
+			csk->cdev->csk_send_close_req(csk);
+	}
+
+	spin_unlock_bh(&csk->lock);
+}
+
+void cxgbi_sock_fail_act_open(struct cxgbi_sock *csk, int errno)
+{
+	pr_info("csk 0x%p,%u,%lx, %pI4:%u-%pI4:%u, err %d.\n",
+			csk, csk->state, csk->flags,
+			&csk->saddr.sin_addr.s_addr, csk->saddr.sin_port,
+			&csk->daddr.sin_addr.s_addr, csk->daddr.sin_port,
+			errno);
+
+	cxgbi_sock_set_state(csk, CTP_CONNECTING);
+	csk->err = errno;
+	cxgbi_sock_closed(csk);
+}
+EXPORT_SYMBOL_GPL(cxgbi_sock_fail_act_open);
+
+void cxgbi_sock_act_open_req_arp_failure(void *handle, struct sk_buff *skb)
+{
+	struct cxgbi_sock *csk = (struct cxgbi_sock *)skb->sk;
+
+	log_debug(1 << CXGBI_DBG_SOCK, "csk 0x%p,%u,0x%lx,%u.\n",
+		csk, (csk)->state, (csk)->flags, (csk)->tid);
+	cxgbi_sock_get(csk);
+	spin_lock_bh(&csk->lock);
+	if (csk->state == CTP_ACTIVE_OPEN)
+		cxgbi_sock_fail_act_open(csk, -EHOSTUNREACH);
+	spin_unlock_bh(&csk->lock);
+	cxgbi_sock_put(csk);
+	__kfree_skb(skb);
+}
+EXPORT_SYMBOL_GPL(cxgbi_sock_act_open_req_arp_failure);
+
+void cxgbi_sock_rcv_abort_rpl(struct cxgbi_sock *csk)
+{
+	cxgbi_sock_get(csk);
+	spin_lock_bh(&csk->lock);
+
+	cxgbi_sock_set_flag(csk, CTPF_ABORT_RPL_RCVD);
+	if (cxgbi_sock_flag(csk, CTPF_ABORT_RPL_PENDING)) {
+		cxgbi_sock_clear_flag(csk, CTPF_ABORT_RPL_PENDING);
+		if (cxgbi_sock_flag(csk, CTPF_ABORT_REQ_RCVD))
+			pr_err("csk 0x%p,%u,0x%lx,%u,ABT_RPL_RSS.\n",
+			       csk, csk->state, csk->flags, csk->tid);
+		cxgbi_sock_closed(csk);
+	}
+
+	spin_unlock_bh(&csk->lock);
+	cxgbi_sock_put(csk);
+}
+EXPORT_SYMBOL_GPL(cxgbi_sock_rcv_abort_rpl);
+
+void cxgbi_sock_rcv_peer_close(struct cxgbi_sock *csk)
+{
+	log_debug(1 << CXGBI_DBG_SOCK, "csk 0x%p,%u,0x%lx,%u.\n",
+		csk, (csk)->state, (csk)->flags, (csk)->tid);
+	cxgbi_sock_get(csk);
+	spin_lock_bh(&csk->lock);
+
+	if (cxgbi_sock_flag(csk, CTPF_ABORT_RPL_PENDING))
+		goto done;
+
+	switch (csk->state) {
+	case CTP_ESTABLISHED:
+		cxgbi_sock_set_state(csk, CTP_PASSIVE_CLOSE);
+		break;
+	case CTP_ACTIVE_CLOSE:
+		cxgbi_sock_set_state(csk, CTP_CLOSE_WAIT_2);
+		break;
+	case CTP_CLOSE_WAIT_1:
+		cxgbi_sock_closed(csk);
+		break;
+	case CTP_ABORTING:
+		break;
+	default:
+		pr_err("csk 0x%p,%u,0x%lx,%u, bad state.\n",
+			csk, csk->state, csk->flags, csk->tid);
+	}
+	cxgbi_inform_iscsi_conn_closing(csk);
+done:
+	spin_unlock_bh(&csk->lock);
+	cxgbi_sock_put(csk);
+}
+EXPORT_SYMBOL_GPL(cxgbi_sock_rcv_peer_close);
+
+void cxgbi_sock_rcv_close_conn_rpl(struct cxgbi_sock *csk, u32 snd_nxt)
+{
+	log_debug(1 << CXGBI_DBG_SOCK, "csk 0x%p,%u,0x%lx,%u.\n",
+		csk, (csk)->state, (csk)->flags, (csk)->tid);
+	cxgbi_sock_get(csk);
+	spin_lock_bh(&csk->lock);
+
+	csk->snd_una = snd_nxt - 1;
+	if (cxgbi_sock_flag(csk, CTPF_ABORT_RPL_PENDING))
+		goto done;
+
+	switch (csk->state) {
+	case CTP_ACTIVE_CLOSE:
+		cxgbi_sock_set_state(csk, CTP_CLOSE_WAIT_1);
+		break;
+	case CTP_CLOSE_WAIT_1:
+	case CTP_CLOSE_WAIT_2:
+		cxgbi_sock_closed(csk);
+		break;
+	case CTP_ABORTING:
+		break;
+	default:
+		pr_err("csk 0x%p,%u,0x%lx,%u, bad state.\n",
+			csk, csk->state, csk->flags, csk->tid);
+	}
+done:
+	spin_unlock_bh(&csk->lock);
+	cxgbi_sock_put(csk);
+}
+EXPORT_SYMBOL_GPL(cxgbi_sock_rcv_close_conn_rpl);
+
+void cxgbi_sock_rcv_wr_ack(struct cxgbi_sock *csk, unsigned int credits,
+			   unsigned int snd_una, int seq_chk)
+{
+	log_debug(1 << CXGBI_DBG_TOE | 1 << CXGBI_DBG_SOCK,
+			"csk 0x%p,%u,0x%lx,%u, cr %u,%u+%u, snd_una %u,%d.\n",
+			csk, csk->state, csk->flags, csk->tid, credits,
+			csk->wr_cred, csk->wr_una_cred, snd_una, seq_chk);
+
+	spin_lock_bh(&csk->lock);
+
+	csk->wr_cred += credits;
+	if (csk->wr_una_cred > csk->wr_max_cred - csk->wr_cred)
+		csk->wr_una_cred = csk->wr_max_cred - csk->wr_cred;
+
+	while (credits) {
+		struct sk_buff *p = cxgbi_sock_peek_wr(csk);
+
+		if (unlikely(!p)) {
+			pr_err("csk 0x%p,%u,0x%lx,%u, cr %u,%u+%u, empty.\n",
+				csk, csk->state, csk->flags, csk->tid, credits,
+				csk->wr_cred, csk->wr_una_cred);
+			break;
+		}
+
+		if (unlikely(credits < p->csum)) {
+			pr_warn("csk 0x%p,%u,0x%lx,%u, cr %u,%u+%u, < %u.\n",
+				csk, csk->state, csk->flags, csk->tid,
+				credits, csk->wr_cred, csk->wr_una_cred,
+				p->csum);
+			p->csum -= credits;
+			break;
+		} else {
+			cxgbi_sock_dequeue_wr(csk);
+			credits -= p->csum;
+			kfree_skb(p);
+		}
+	}
+
+	cxgbi_sock_check_wr_invariants(csk);
+
+	if (seq_chk) {
+		if (unlikely(before(snd_una, csk->snd_una))) {
+			pr_warn("csk 0x%p,%u,0x%lx,%u, snd_una %u/%u.",
+				csk, csk->state, csk->flags, csk->tid, snd_una,
+				csk->snd_una);
+			goto done;
+		}
+
+		if (csk->snd_una != snd_una) {
+			csk->snd_una = snd_una;
+			dst_confirm(csk->dst);
+		}
+	}
+
+	if (skb_queue_len(&csk->write_queue)) {
+		if (csk->cdev->csk_push_tx_frames(csk, 0))
+			cxgbi_conn_tx_open(csk);
+	} else
+		cxgbi_conn_tx_open(csk);
+done:
+	spin_unlock_bh(&csk->lock);
+}
+EXPORT_SYMBOL_GPL(cxgbi_sock_rcv_wr_ack);
+
+static unsigned int cxgbi_sock_find_best_mtu(struct cxgbi_sock *csk,
+					     unsigned short mtu)
+{
+	int i = 0;
+
+	while (i < csk->cdev->nmtus - 1 && csk->cdev->mtus[i + 1] <= mtu)
+		++i;
+
+	return i;
+}
+
+unsigned int cxgbi_sock_select_mss(struct cxgbi_sock *csk, unsigned int pmtu)
+{
+	unsigned int idx;
+	struct dst_entry *dst = csk->dst;
+
+	csk->advmss = dst_metric_advmss(dst);
+
+	if (csk->advmss > pmtu - 40)
+		csk->advmss = pmtu - 40;
+	if (csk->advmss < csk->cdev->mtus[0] - 40)
+		csk->advmss = csk->cdev->mtus[0] - 40;
+	idx = cxgbi_sock_find_best_mtu(csk, csk->advmss + 40);
+
+	return idx;
+}
+EXPORT_SYMBOL_GPL(cxgbi_sock_select_mss);
+
+void cxgbi_sock_skb_entail(struct cxgbi_sock *csk, struct sk_buff *skb)
+{
+	cxgbi_skcb_tcp_seq(skb) = csk->write_seq;
+	__skb_queue_tail(&csk->write_queue, skb);
+}
+EXPORT_SYMBOL_GPL(cxgbi_sock_skb_entail);
+
+void cxgbi_sock_purge_wr_queue(struct cxgbi_sock *csk)
+{
+	struct sk_buff *skb;
+
+	while ((skb = cxgbi_sock_dequeue_wr(csk)) != NULL)
+		kfree_skb(skb);
+}
+EXPORT_SYMBOL_GPL(cxgbi_sock_purge_wr_queue);
+
+void cxgbi_sock_check_wr_invariants(const struct cxgbi_sock *csk)
+{
+	int pending = cxgbi_sock_count_pending_wrs(csk);
+
+	if (unlikely(csk->wr_cred + pending != csk->wr_max_cred))
+		pr_err("csk 0x%p, tid %u, credit %u + %u != %u.\n",
+			csk, csk->tid, csk->wr_cred, pending, csk->wr_max_cred);
+}
+EXPORT_SYMBOL_GPL(cxgbi_sock_check_wr_invariants);
+
+static int cxgbi_sock_send_pdus(struct cxgbi_sock *csk, struct sk_buff *skb)
+{
+	struct cxgbi_device *cdev = csk->cdev;
+	struct sk_buff *next;
+	int err, copied = 0;
+
+	spin_lock_bh(&csk->lock);
+
+	if (csk->state != CTP_ESTABLISHED) {
+		log_debug(1 << CXGBI_DBG_PDU_TX,
+			"csk 0x%p,%u,0x%lx,%u, EAGAIN.\n",
+			csk, csk->state, csk->flags, csk->tid);
+		err = -EAGAIN;
+		goto out_err;
+	}
+
+	if (csk->err) {
+		log_debug(1 << CXGBI_DBG_PDU_TX,
+			"csk 0x%p,%u,0x%lx,%u, EPIPE %d.\n",
+			csk, csk->state, csk->flags, csk->tid, csk->err);
+		err = -EPIPE;
+		goto out_err;
+	}
+
+	if (csk->write_seq - csk->snd_una >= cdev->snd_win) {
+		log_debug(1 << CXGBI_DBG_PDU_TX,
+			"csk 0x%p,%u,0x%lx,%u, FULL %u-%u >= %u.\n",
+			csk, csk->state, csk->flags, csk->tid, csk->write_seq,
+			csk->snd_una, cdev->snd_win);
+		err = -ENOBUFS;
+		goto out_err;
+	}
+
+	while (skb) {
+		int frags = skb_shinfo(skb)->nr_frags +
+				(skb->len != skb->data_len);
+
+		if (unlikely(skb_headroom(skb) < cdev->skb_tx_rsvd)) {
+			pr_err("csk 0x%p, skb head %u < %u.\n",
+				csk, skb_headroom(skb), cdev->skb_tx_rsvd);
+			err = -EINVAL;
+			goto out_err;
+		}
+
+		if (frags >= SKB_WR_LIST_SIZE) {
+			pr_err("csk 0x%p, frags %d, %u,%u >%u.\n",
+				csk, skb_shinfo(skb)->nr_frags, skb->len,
+				skb->data_len, (uint)(SKB_WR_LIST_SIZE));
+			err = -EINVAL;
+			goto out_err;
+		}
+
+		next = skb->next;
+		skb->next = NULL;
+		cxgbi_skcb_set_flag(skb, SKCBF_TX_NEED_HDR);
+		cxgbi_sock_skb_entail(csk, skb);
+		copied += skb->len;
+		csk->write_seq += skb->len +
+				cxgbi_ulp_extra_len(cxgbi_skcb_ulp_mode(skb));
+		skb = next;
+	}
+done:
+	if (likely(skb_queue_len(&csk->write_queue)))
+		cdev->csk_push_tx_frames(csk, 1);
+	spin_unlock_bh(&csk->lock);
+	return copied;
+
+out_err:
+	if (copied == 0 && err == -EPIPE)
+		copied = csk->err ? csk->err : -EPIPE;
+	else
+		copied = err;
+	goto done;
+}
+
+/*
+ * Direct Data Placement -
+ * Directly place the iSCSI Data-In or Data-Out PDU's payload into pre-posted
+ * final destination host-memory buffers based on the Initiator Task Tag (ITT)
+ * in Data-In or Target Task Tag (TTT) in Data-Out PDUs.
+ * The host memory address is programmed into h/w in the format of pagepod
+ * entries.
+ * The location of the pagepod entry is encoded into ddp tag which is used as
+ * the base for ITT/TTT.
+ */
+
+static unsigned char ddp_page_order[DDP_PGIDX_MAX] = {0, 1, 2, 4};
+static unsigned char ddp_page_shift[DDP_PGIDX_MAX] = {12, 13, 14, 16};
+static unsigned char page_idx = DDP_PGIDX_MAX;
+
+static unsigned char sw_tag_idx_bits;
+static unsigned char sw_tag_age_bits;
+
+/*
+ * Direct-Data Placement page size adjustment
+ */
+static int ddp_adjust_page_table(void)
+{
+	int i;
+	unsigned int base_order, order;
+
+	if (PAGE_SIZE < (1UL << ddp_page_shift[0])) {
+		pr_info("PAGE_SIZE 0x%lx too small, min 0x%lx\n",
+			PAGE_SIZE, 1UL << ddp_page_shift[0]);
+		return -EINVAL;
+	}
+
+	base_order = get_order(1UL << ddp_page_shift[0]);
+	order = get_order(1UL << PAGE_SHIFT);
+
+	for (i = 0; i < DDP_PGIDX_MAX; i++) {
+		/* first is the kernel page size, then just doubling */
+		ddp_page_order[i] = order - base_order + i;
+		ddp_page_shift[i] = PAGE_SHIFT + i;
+	}
+	return 0;
+}
+
+static int ddp_find_page_index(unsigned long pgsz)
+{
+	int i;
+
+	for (i = 0; i < DDP_PGIDX_MAX; i++) {
+		if (pgsz == (1UL << ddp_page_shift[i]))
+			return i;
+	}
+	pr_info("ddp page size %lu not supported.\n", pgsz);
+	return DDP_PGIDX_MAX;
+}
+
+static void ddp_setup_host_page_size(void)
+{
+	if (page_idx == DDP_PGIDX_MAX) {
+		page_idx = ddp_find_page_index(PAGE_SIZE);
+
+		if (page_idx == DDP_PGIDX_MAX) {
+			pr_info("system PAGE %lu, update hw.\n", PAGE_SIZE);
+			if (ddp_adjust_page_table() < 0) {
+				pr_info("PAGE %lu, disable ddp.\n", PAGE_SIZE);
+				return;
+			}
+			page_idx = ddp_find_page_index(PAGE_SIZE);
+		}
+		pr_info("system PAGE %lu, ddp idx %u.\n", PAGE_SIZE, page_idx);
+	}
+}
+
+void cxgbi_ddp_page_size_factor(int *pgsz_factor)
+{
+	int i;
+
+	for (i = 0; i < DDP_PGIDX_MAX; i++)
+		pgsz_factor[i] = ddp_page_order[i];
+}
+EXPORT_SYMBOL_GPL(cxgbi_ddp_page_size_factor);
+
+/*
+ * DDP setup & teardown
+ */
+
+void cxgbi_ddp_ppod_set(struct cxgbi_pagepod *ppod,
+			struct cxgbi_pagepod_hdr *hdr,
+			struct cxgbi_gather_list *gl, unsigned int gidx)
+{
+	int i;
+
+	memcpy(ppod, hdr, sizeof(*hdr));
+	for (i = 0; i < (PPOD_PAGES_MAX + 1); i++, gidx++) {
+		ppod->addr[i] = gidx < gl->nelem ?
+				cpu_to_be64(gl->phys_addr[gidx]) : 0ULL;
+	}
+}
+EXPORT_SYMBOL_GPL(cxgbi_ddp_ppod_set);
+
+void cxgbi_ddp_ppod_clear(struct cxgbi_pagepod *ppod)
+{
+	memset(ppod, 0, sizeof(*ppod));
+}
+EXPORT_SYMBOL_GPL(cxgbi_ddp_ppod_clear);
+
+static inline int ddp_find_unused_entries(struct cxgbi_ddp_info *ddp,
+					unsigned int start, unsigned int max,
+					unsigned int count,
+					struct cxgbi_gather_list *gl)
+{
+	unsigned int i, j, k;
+
+	/*  not enough entries */
+	if ((max - start) < count) {
+		log_debug(1 << CXGBI_DBG_DDP,
+			"NOT enough entries %u+%u < %u.\n", start, count, max);
+		return -EBUSY;
+	}
+
+	max -= count;
+	spin_lock(&ddp->map_lock);
+	for (i = start; i < max;) {
+		for (j = 0, k = i; j < count; j++, k++) {
+			if (ddp->gl_map[k])
+				break;
+		}
+		if (j == count) {
+			for (j = 0, k = i; j < count; j++, k++)
+				ddp->gl_map[k] = gl;
+			spin_unlock(&ddp->map_lock);
+			return i;
+		}
+		i += j + 1;
+	}
+	spin_unlock(&ddp->map_lock);
+	log_debug(1 << CXGBI_DBG_DDP,
+		"NO suitable entries %u available.\n", count);
+	return -EBUSY;
+}
+
+static inline void ddp_unmark_entries(struct cxgbi_ddp_info *ddp,
+						int start, int count)
+{
+	spin_lock(&ddp->map_lock);
+	memset(&ddp->gl_map[start], 0,
+		count * sizeof(struct cxgbi_gather_list *));
+	spin_unlock(&ddp->map_lock);
+}
+
+static inline void ddp_gl_unmap(struct pci_dev *pdev,
+					struct cxgbi_gather_list *gl)
+{
+	int i;
+
+	for (i = 0; i < gl->nelem; i++)
+		dma_unmap_page(&pdev->dev, gl->phys_addr[i], PAGE_SIZE,
+				PCI_DMA_FROMDEVICE);
+}
+
+static inline int ddp_gl_map(struct pci_dev *pdev,
+				    struct cxgbi_gather_list *gl)
+{
+	int i;
+
+	for (i = 0; i < gl->nelem; i++) {
+		gl->phys_addr[i] = dma_map_page(&pdev->dev, gl->pages[i], 0,
+						PAGE_SIZE,
+						PCI_DMA_FROMDEVICE);
+		if (unlikely(dma_mapping_error(&pdev->dev, gl->phys_addr[i]))) {
+			log_debug(1 << CXGBI_DBG_DDP,
+				"page %d 0x%p, 0x%p dma mapping err.\n",
+				i, gl->pages[i], pdev);
+			goto unmap;
+		}
+	}
+	return i;
+unmap:
+	if (i) {
+		unsigned int nelem = gl->nelem;
+
+		gl->nelem = i;
+		ddp_gl_unmap(pdev, gl);
+		gl->nelem = nelem;
+	}
+	return -EINVAL;
+}
+
+static void ddp_release_gl(struct cxgbi_gather_list *gl,
+				  struct pci_dev *pdev)
+{
+	ddp_gl_unmap(pdev, gl);
+	kfree(gl);
+}
+
+static struct cxgbi_gather_list *ddp_make_gl(unsigned int xferlen,
+						    struct scatterlist *sgl,
+						    unsigned int sgcnt,
+						    struct pci_dev *pdev,
+						    gfp_t gfp)
+{
+	struct cxgbi_gather_list *gl;
+	struct scatterlist *sg = sgl;
+	struct page *sgpage = sg_page(sg);
+	unsigned int sglen = sg->length;
+	unsigned int sgoffset = sg->offset;
+	unsigned int npages = (xferlen + sgoffset + PAGE_SIZE - 1) >>
+				PAGE_SHIFT;
+	int i = 1, j = 0;
+
+	if (xferlen < DDP_THRESHOLD) {
+		log_debug(1 << CXGBI_DBG_DDP,
+			"xfer %u < threshold %u, no ddp.\n",
+			xferlen, DDP_THRESHOLD);
+		return NULL;
+	}
+
+	gl = kzalloc(sizeof(struct cxgbi_gather_list) +
+		     npages * (sizeof(dma_addr_t) +
+		     sizeof(struct page *)), gfp);
+	if (!gl) {
+		log_debug(1 << CXGBI_DBG_DDP,
+			"xfer %u, %u pages, OOM.\n", xferlen, npages);
+		return NULL;
+	}
+
+	 log_debug(1 << CXGBI_DBG_DDP,
+		"xfer %u, sgl %u, gl max %u.\n", xferlen, sgcnt, npages);
+
+	gl->pages = (struct page **)&gl->phys_addr[npages];
+	gl->nelem = npages;
+	gl->length = xferlen;
+	gl->offset = sgoffset;
+	gl->pages[0] = sgpage;
+
+	for (i = 1, sg = sg_next(sgl), j = 0; i < sgcnt;
+		i++, sg = sg_next(sg)) {
+		struct page *page = sg_page(sg);
+
+		if (sgpage == page && sg->offset == sgoffset + sglen)
+			sglen += sg->length;
+		else {
+			/*  make sure the sgl is fit for ddp:
+			 *  each has the same page size, and
+			 *  all of the middle pages are used completely
+			 */
+			if ((j && sgoffset) || ((i != sgcnt - 1) &&
+			    ((sglen + sgoffset) & ~PAGE_MASK))) {
+				log_debug(1 << CXGBI_DBG_DDP,
+					"page %d/%u, %u + %u.\n",
+					i, sgcnt, sgoffset, sglen);
+				goto error_out;
+			}
+
+			j++;
+			if (j == gl->nelem || sg->offset) {
+				log_debug(1 << CXGBI_DBG_DDP,
+					"page %d/%u, offset %u.\n",
+					j, gl->nelem, sg->offset);
+				goto error_out;
+			}
+			gl->pages[j] = page;
+			sglen = sg->length;
+			sgoffset = sg->offset;
+			sgpage = page;
+		}
+	}
+	gl->nelem = ++j;
+
+	if (ddp_gl_map(pdev, gl) < 0)
+		goto error_out;
+
+	return gl;
+
+error_out:
+	kfree(gl);
+	return NULL;
+}
+
+static void ddp_tag_release(struct cxgbi_hba *chba, u32 tag)
+{
+	struct cxgbi_device *cdev = chba->cdev;
+	struct cxgbi_ddp_info *ddp = cdev->ddp;
+	u32 idx;
+
+	idx = (tag >> PPOD_IDX_SHIFT) & ddp->idx_mask;
+	if (idx < ddp->nppods) {
+		struct cxgbi_gather_list *gl = ddp->gl_map[idx];
+		unsigned int npods;
+
+		if (!gl || !gl->nelem) {
+			pr_warn("tag 0x%x, idx %u, gl 0x%p, %u.\n",
+				tag, idx, gl, gl ? gl->nelem : 0);
+			return;
+		}
+		npods = (gl->nelem + PPOD_PAGES_MAX - 1) >> PPOD_PAGES_SHIFT;
+		log_debug(1 << CXGBI_DBG_DDP,
+			"tag 0x%x, release idx %u, npods %u.\n",
+			tag, idx, npods);
+		cdev->csk_ddp_clear(chba, tag, idx, npods);
+		ddp_unmark_entries(ddp, idx, npods);
+		ddp_release_gl(gl, ddp->pdev);
+	} else
+		pr_warn("tag 0x%x, idx %u > max %u.\n", tag, idx, ddp->nppods);
+}
+
+static int ddp_tag_reserve(struct cxgbi_sock *csk, unsigned int tid,
+			   u32 sw_tag, u32 *tagp, struct cxgbi_gather_list *gl,
+			   gfp_t gfp)
+{
+	struct cxgbi_device *cdev = csk->cdev;
+	struct cxgbi_ddp_info *ddp = cdev->ddp;
+	struct cxgbi_tag_format *tformat = &cdev->tag_format;
+	struct cxgbi_pagepod_hdr hdr;
+	unsigned int npods;
+	int idx = -1;
+	int err = -ENOMEM;
+	u32 tag;
+
+	npods = (gl->nelem + PPOD_PAGES_MAX - 1) >> PPOD_PAGES_SHIFT;
+	if (ddp->idx_last == ddp->nppods)
+		idx = ddp_find_unused_entries(ddp, 0, ddp->nppods,
+							npods, gl);
+	else {
+		idx = ddp_find_unused_entries(ddp, ddp->idx_last + 1,
+							ddp->nppods, npods,
+							gl);
+		if (idx < 0 && ddp->idx_last >= npods) {
+			idx = ddp_find_unused_entries(ddp, 0,
+				min(ddp->idx_last + npods, ddp->nppods),
+							npods, gl);
+		}
+	}
+	if (idx < 0) {
+		log_debug(1 << CXGBI_DBG_DDP,
+			"xferlen %u, gl %u, npods %u NO DDP.\n",
+			gl->length, gl->nelem, npods);
+		return idx;
+	}
+
+	tag = cxgbi_ddp_tag_base(tformat, sw_tag);
+	tag |= idx << PPOD_IDX_SHIFT;
+
+	hdr.rsvd = 0;
+	hdr.vld_tid = htonl(PPOD_VALID_FLAG | PPOD_TID(tid));
+	hdr.pgsz_tag_clr = htonl(tag & ddp->rsvd_tag_mask);
+	hdr.max_offset = htonl(gl->length);
+	hdr.page_offset = htonl(gl->offset);
+
+	err = cdev->csk_ddp_set(csk, &hdr, idx, npods, gl);
+	if (err < 0)
+		goto unmark_entries;
+
+	ddp->idx_last = idx;
+	log_debug(1 << CXGBI_DBG_DDP,
+		"xfer %u, gl %u,%u, tid 0x%x, tag 0x%x->0x%x(%u,%u).\n",
+		gl->length, gl->nelem, gl->offset, tid, sw_tag, tag, idx,
+		npods);
+	*tagp = tag;
+	return 0;
+
+unmark_entries:
+	ddp_unmark_entries(ddp, idx, npods);
+	return err;
+}
+
+int cxgbi_ddp_reserve(struct cxgbi_sock *csk, unsigned int *tagp,
+			unsigned int sw_tag, unsigned int xferlen,
+			struct scatterlist *sgl, unsigned int sgcnt, gfp_t gfp)
+{
+	struct cxgbi_device *cdev = csk->cdev;
+	struct cxgbi_tag_format *tformat = &cdev->tag_format;
+	struct cxgbi_gather_list *gl;
+	int err;
+
+	if (page_idx >= DDP_PGIDX_MAX || !cdev->ddp ||
+	    xferlen < DDP_THRESHOLD) {
+		log_debug(1 << CXGBI_DBG_DDP,
+			"pgidx %u, xfer %u, NO ddp.\n", page_idx, xferlen);
+		return -EINVAL;
+	}
+
+	if (!cxgbi_sw_tag_usable(tformat, sw_tag)) {
+		log_debug(1 << CXGBI_DBG_DDP,
+			"sw_tag 0x%x NOT usable.\n", sw_tag);
+		return -EINVAL;
+	}
+
+	gl = ddp_make_gl(xferlen, sgl, sgcnt, cdev->pdev, gfp);
+	if (!gl)
+		return -ENOMEM;
+
+	err = ddp_tag_reserve(csk, csk->tid, sw_tag, tagp, gl, gfp);
+	if (err < 0)
+		ddp_release_gl(gl, cdev->pdev);
+
+	return err;
+}
+
+static void ddp_destroy(struct kref *kref)
+{
+	struct cxgbi_ddp_info *ddp = container_of(kref,
+						struct cxgbi_ddp_info,
+						refcnt);
+	struct cxgbi_device *cdev = ddp->cdev;
+	int i = 0;
+
+	pr_info("kref 0, destroy ddp 0x%p, cdev 0x%p.\n", ddp, cdev);
+
+	while (i < ddp->nppods) {
+		struct cxgbi_gather_list *gl = ddp->gl_map[i];
+
+		if (gl) {
+			int npods = (gl->nelem + PPOD_PAGES_MAX - 1)
+					>> PPOD_PAGES_SHIFT;
+			pr_info("cdev 0x%p, ddp %d + %d.\n", cdev, i, npods);
+			kfree(gl);
+			i += npods;
+		} else
+			i++;
+	}
+	cxgbi_free_big_mem(ddp);
+}
+
+int cxgbi_ddp_cleanup(struct cxgbi_device *cdev)
+{
+	struct cxgbi_ddp_info *ddp = cdev->ddp;
+
+	log_debug(1 << CXGBI_DBG_DDP,
+		"cdev 0x%p, release ddp 0x%p.\n", cdev, ddp);
+	cdev->ddp = NULL;
+	if (ddp)
+		return kref_put(&ddp->refcnt, ddp_destroy);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(cxgbi_ddp_cleanup);
+
+int cxgbi_ddp_init(struct cxgbi_device *cdev,
+		   unsigned int llimit, unsigned int ulimit,
+		   unsigned int max_txsz, unsigned int max_rxsz)
+{
+	struct cxgbi_ddp_info *ddp;
+	unsigned int ppmax, bits;
+
+	ppmax = (ulimit - llimit + 1) >> PPOD_SIZE_SHIFT;
+	bits = __ilog2_u32(ppmax) + 1;
+	if (bits > PPOD_IDX_MAX_SIZE)
+		bits = PPOD_IDX_MAX_SIZE;
+	ppmax = (1 << (bits - 1)) - 1;
+
+	ddp = cxgbi_alloc_big_mem(sizeof(struct cxgbi_ddp_info) +
+				ppmax * (sizeof(struct cxgbi_gather_list *) +
+					 sizeof(struct sk_buff *)),
+				GFP_KERNEL);
+	if (!ddp) {
+		pr_warn("cdev 0x%p, ddp ppmax %u OOM.\n", cdev, ppmax);
+		return -ENOMEM;
+	}
+	ddp->gl_map = (struct cxgbi_gather_list **)(ddp + 1);
+	cdev->ddp = ddp;
+
+	spin_lock_init(&ddp->map_lock);
+	kref_init(&ddp->refcnt);
+
+	ddp->cdev = cdev;
+	ddp->pdev = cdev->pdev;
+	ddp->llimit = llimit;
+	ddp->ulimit = ulimit;
+	ddp->max_txsz = min_t(unsigned int, max_txsz, ULP2_MAX_PKT_SIZE);
+	ddp->max_rxsz = min_t(unsigned int, max_rxsz, ULP2_MAX_PKT_SIZE);
+	ddp->nppods = ppmax;
+	ddp->idx_last = ppmax;
+	ddp->idx_bits = bits;
+	ddp->idx_mask = (1 << bits) - 1;
+	ddp->rsvd_tag_mask = (1 << (bits + PPOD_IDX_SHIFT)) - 1;
+
+	cdev->tag_format.sw_bits = sw_tag_idx_bits + sw_tag_age_bits;
+	cdev->tag_format.rsvd_bits = ddp->idx_bits;
+	cdev->tag_format.rsvd_shift = PPOD_IDX_SHIFT;
+	cdev->tag_format.rsvd_mask = (1 << cdev->tag_format.rsvd_bits) - 1;
+
+	pr_info("%s tag format, sw %u, rsvd %u,%u, mask 0x%x.\n",
+		cdev->ports[0]->name, cdev->tag_format.sw_bits,
+		cdev->tag_format.rsvd_bits, cdev->tag_format.rsvd_shift,
+		cdev->tag_format.rsvd_mask);
+
+	cdev->tx_max_size = min_t(unsigned int, ULP2_MAX_PDU_PAYLOAD,
+				ddp->max_txsz - ISCSI_PDU_NONPAYLOAD_LEN);
+	cdev->rx_max_size = min_t(unsigned int, ULP2_MAX_PDU_PAYLOAD,
+				ddp->max_rxsz - ISCSI_PDU_NONPAYLOAD_LEN);
+
+	log_debug(1 << CXGBI_DBG_DDP,
+		"%s max payload size: %u/%u, %u/%u.\n",
+		cdev->ports[0]->name, cdev->tx_max_size, ddp->max_txsz,
+		cdev->rx_max_size, ddp->max_rxsz);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(cxgbi_ddp_init);
+
+/*
+ * APIs interacting with open-iscsi libraries
+ */
+
+static unsigned char padding[4];
+
+static void task_release_itt(struct iscsi_task *task, itt_t hdr_itt)
+{
+	struct scsi_cmnd *sc = task->sc;
+	struct iscsi_tcp_conn *tcp_conn = task->conn->dd_data;
+	struct cxgbi_conn *cconn = tcp_conn->dd_data;
+	struct cxgbi_hba *chba = cconn->chba;
+	struct cxgbi_tag_format *tformat = &chba->cdev->tag_format;
+	u32 tag = ntohl((__force u32)hdr_itt);
+
+	log_debug(1 << CXGBI_DBG_DDP,
+		   "cdev 0x%p, release tag 0x%x.\n", chba->cdev, tag);
+	if (sc &&
+	    (scsi_bidi_cmnd(sc) || sc->sc_data_direction == DMA_FROM_DEVICE) &&
+	    cxgbi_is_ddp_tag(tformat, tag))
+		ddp_tag_release(chba, tag);
+}
+
+static int task_reserve_itt(struct iscsi_task *task, itt_t *hdr_itt)
+{
+	struct scsi_cmnd *sc = task->sc;
+	struct iscsi_conn *conn = task->conn;
+	struct iscsi_session *sess = conn->session;
+	struct iscsi_tcp_conn *tcp_conn = conn->dd_data;
+	struct cxgbi_conn *cconn = tcp_conn->dd_data;
+	struct cxgbi_hba *chba = cconn->chba;
+	struct cxgbi_tag_format *tformat = &chba->cdev->tag_format;
+	u32 sw_tag = (sess->age << cconn->task_idx_bits) | task->itt;
+	u32 tag = 0;
+	int err = -EINVAL;
+
+	if (sc &&
+	    (scsi_bidi_cmnd(sc) || sc->sc_data_direction == DMA_FROM_DEVICE)) {
+		err = cxgbi_ddp_reserve(cconn->cep->csk, &tag, sw_tag,
+					scsi_in(sc)->length,
+					scsi_in(sc)->table.sgl,
+					scsi_in(sc)->table.nents,
+					GFP_ATOMIC);
+		if (err < 0)
+			log_debug(1 << CXGBI_DBG_DDP,
+				"csk 0x%p, R task 0x%p, %u,%u, no ddp.\n",
+				cconn->cep->csk, task, scsi_in(sc)->length,
+				scsi_in(sc)->table.nents);
+	}
+
+	if (err < 0)
+		tag = cxgbi_set_non_ddp_tag(tformat, sw_tag);
+	/*  the itt need to sent in big-endian order */
+	*hdr_itt = (__force itt_t)htonl(tag);
+
+	log_debug(1 << CXGBI_DBG_DDP,
+		"cdev 0x%p, task 0x%p, 0x%x(0x%x,0x%x)->0x%x/0x%x.\n",
+		chba->cdev, task, sw_tag, task->itt, sess->age, tag, *hdr_itt);
+	return 0;
+}
+
+void cxgbi_parse_pdu_itt(struct iscsi_conn *conn, itt_t itt, int *idx, int *age)
+{
+	struct iscsi_tcp_conn *tcp_conn = conn->dd_data;
+	struct cxgbi_conn *cconn = tcp_conn->dd_data;
+	struct cxgbi_device *cdev = cconn->chba->cdev;
+	u32 tag = ntohl((__force u32) itt);
+	u32 sw_bits;
+
+	sw_bits = cxgbi_tag_nonrsvd_bits(&cdev->tag_format, tag);
+	if (idx)
+		*idx = sw_bits & ((1 << cconn->task_idx_bits) - 1);
+	if (age)
+		*age = (sw_bits >> cconn->task_idx_bits) & ISCSI_AGE_MASK;
+
+	log_debug(1 << CXGBI_DBG_DDP,
+		"cdev 0x%p, tag 0x%x/0x%x, -> 0x%x(0x%x,0x%x).\n",
+		cdev, tag, itt, sw_bits, idx ? *idx : 0xFFFFF,
+		age ? *age : 0xFF);
+}
+EXPORT_SYMBOL_GPL(cxgbi_parse_pdu_itt);
+
+void cxgbi_conn_tx_open(struct cxgbi_sock *csk)
+{
+	struct iscsi_conn *conn = csk->user_data;
+
+	if (conn) {
+		log_debug(1 << CXGBI_DBG_SOCK,
+			"csk 0x%p, cid %d.\n", csk, conn->id);
+		iscsi_conn_queue_work(conn);
+	}
+}
+EXPORT_SYMBOL_GPL(cxgbi_conn_tx_open);
+
+/*
+ * pdu receive, interact with libiscsi_tcp
+ */
+static inline int read_pdu_skb(struct iscsi_conn *conn,
+			       struct sk_buff *skb,
+			       unsigned int offset,
+			       int offloaded)
+{
+	int status = 0;
+	int bytes_read;
+
+	bytes_read = iscsi_tcp_recv_skb(conn, skb, offset, offloaded, &status);
+	switch (status) {
+	case ISCSI_TCP_CONN_ERR:
+		pr_info("skb 0x%p, off %u, %d, TCP_ERR.\n",
+			  skb, offset, offloaded);
+		return -EIO;
+	case ISCSI_TCP_SUSPENDED:
+		log_debug(1 << CXGBI_DBG_PDU_RX,
+			"skb 0x%p, off %u, %d, TCP_SUSPEND, rc %d.\n",
+			skb, offset, offloaded, bytes_read);
+		/* no transfer - just have caller flush queue */
+		return bytes_read;
+	case ISCSI_TCP_SKB_DONE:
+		pr_info("skb 0x%p, off %u, %d, TCP_SKB_DONE.\n",
+			skb, offset, offloaded);
+		/*
+		 * pdus should always fit in the skb and we should get
+		 * segment done notifcation.
+		 */
+		iscsi_conn_printk(KERN_ERR, conn, "Invalid pdu or skb.");
+		return -EFAULT;
+	case ISCSI_TCP_SEGMENT_DONE:
+		log_debug(1 << CXGBI_DBG_PDU_RX,
+			"skb 0x%p, off %u, %d, TCP_SEG_DONE, rc %d.\n",
+			skb, offset, offloaded, bytes_read);
+		return bytes_read;
+	default:
+		pr_info("skb 0x%p, off %u, %d, invalid status %d.\n",
+			skb, offset, offloaded, status);
+		return -EINVAL;
+	}
+}
+
+static int skb_read_pdu_bhs(struct iscsi_conn *conn, struct sk_buff *skb)
+{
+	struct iscsi_tcp_conn *tcp_conn = conn->dd_data;
+
+	log_debug(1 << CXGBI_DBG_PDU_RX,
+		"conn 0x%p, skb 0x%p, len %u, flag 0x%lx.\n",
+		conn, skb, skb->len, cxgbi_skcb_flags(skb));
+
+	if (!iscsi_tcp_recv_segment_is_hdr(tcp_conn)) {
+		pr_info("conn 0x%p, skb 0x%p, not hdr.\n", conn, skb);
+		iscsi_conn_failure(conn, ISCSI_ERR_PROTO);
+		return -EIO;
+	}
+
+	if (conn->hdrdgst_en &&
+	    cxgbi_skcb_test_flag(skb, SKCBF_RX_HCRC_ERR)) {
+		pr_info("conn 0x%p, skb 0x%p, hcrc.\n", conn, skb);
+		iscsi_conn_failure(conn, ISCSI_ERR_HDR_DGST);
+		return -EIO;
+	}
+
+	return read_pdu_skb(conn, skb, 0, 0);
+}
+
+static int skb_read_pdu_data(struct iscsi_conn *conn, struct sk_buff *lskb,
+			     struct sk_buff *skb, unsigned int offset)
+{
+	struct iscsi_tcp_conn *tcp_conn = conn->dd_data;
+	bool offloaded = 0;
+	int opcode = tcp_conn->in.hdr->opcode & ISCSI_OPCODE_MASK;
+
+	log_debug(1 << CXGBI_DBG_PDU_RX,
+		"conn 0x%p, skb 0x%p, len %u, flag 0x%lx.\n",
+		conn, skb, skb->len, cxgbi_skcb_flags(skb));
+
+	if (conn->datadgst_en &&
+	    cxgbi_skcb_test_flag(lskb, SKCBF_RX_DCRC_ERR)) {
+		pr_info("conn 0x%p, skb 0x%p, dcrc 0x%lx.\n",
+			conn, lskb, cxgbi_skcb_flags(lskb));
+		iscsi_conn_failure(conn, ISCSI_ERR_DATA_DGST);
+		return -EIO;
+	}
+
+	if (iscsi_tcp_recv_segment_is_hdr(tcp_conn))
+		return 0;
+
+	/* coalesced, add header digest length */
+	if (lskb == skb && conn->hdrdgst_en)
+		offset += ISCSI_DIGEST_SIZE;
+
+	if (cxgbi_skcb_test_flag(lskb, SKCBF_RX_DATA_DDPD))
+		offloaded = 1;
+
+	if (opcode == ISCSI_OP_SCSI_DATA_IN)
+		log_debug(1 << CXGBI_DBG_PDU_RX,
+			"skb 0x%p, op 0x%x, itt 0x%x, %u %s ddp'ed.\n",
+			skb, opcode, ntohl(tcp_conn->in.hdr->itt),
+			tcp_conn->in.datalen, offloaded ? "is" : "not");
+
+	return read_pdu_skb(conn, skb, offset, offloaded);
+}
+
+static void csk_return_rx_credits(struct cxgbi_sock *csk, int copied)
+{
+	struct cxgbi_device *cdev = csk->cdev;
+	int must_send;
+	u32 credits;
+
+	log_debug(1 << CXGBI_DBG_PDU_RX,
+		"csk 0x%p,%u,0x%lx,%u, seq %u, wup %u, thre %u, %u.\n",
+		csk, csk->state, csk->flags, csk->tid, csk->copied_seq,
+		csk->rcv_wup, cdev->rx_credit_thres,
+		cdev->rcv_win);
+
+	if (csk->state != CTP_ESTABLISHED)
+		return;
+
+	credits = csk->copied_seq - csk->rcv_wup;
+	if (unlikely(!credits))
+		return;
+	if (unlikely(cdev->rx_credit_thres == 0))
+		return;
+
+	must_send = credits + 16384 >= cdev->rcv_win;
+	if (must_send || credits >= cdev->rx_credit_thres)
+		csk->rcv_wup += cdev->csk_send_rx_credits(csk, credits);
+}
+
+void cxgbi_conn_pdu_ready(struct cxgbi_sock *csk)
+{
+	struct cxgbi_device *cdev = csk->cdev;
+	struct iscsi_conn *conn = csk->user_data;
+	struct sk_buff *skb;
+	unsigned int read = 0;
+	int err = 0;
+
+	log_debug(1 << CXGBI_DBG_PDU_RX,
+		"csk 0x%p, conn 0x%p.\n", csk, conn);
+
+	if (unlikely(!conn || conn->suspend_rx)) {
+		log_debug(1 << CXGBI_DBG_PDU_RX,
+			"csk 0x%p, conn 0x%p, id %d, suspend_rx %lu!\n",
+			csk, conn, conn ? conn->id : 0xFF,
+			conn ? conn->suspend_rx : 0xFF);
+		return;
+	}
+
+	while (!err) {
+		skb = skb_peek(&csk->receive_queue);
+		if (!skb ||
+		    !(cxgbi_skcb_test_flag(skb, SKCBF_RX_STATUS))) {
+			if (skb)
+				log_debug(1 << CXGBI_DBG_PDU_RX,
+					"skb 0x%p, NOT ready 0x%lx.\n",
+					skb, cxgbi_skcb_flags(skb));
+			break;
+		}
+		__skb_unlink(skb, &csk->receive_queue);
+
+		read += cxgbi_skcb_rx_pdulen(skb);
+		log_debug(1 << CXGBI_DBG_PDU_RX,
+			"csk 0x%p, skb 0x%p,%u,f 0x%lx, pdu len %u.\n",
+			csk, skb, skb->len, cxgbi_skcb_flags(skb),
+			cxgbi_skcb_rx_pdulen(skb));
+
+		if (cxgbi_skcb_test_flag(skb, SKCBF_RX_COALESCED)) {
+			err = skb_read_pdu_bhs(conn, skb);
+			if (err < 0) {
+				pr_err("coalesced bhs, csk 0x%p, skb 0x%p,%u, "
+					"f 0x%lx, plen %u.\n",
+					csk, skb, skb->len,
+					cxgbi_skcb_flags(skb),
+					cxgbi_skcb_rx_pdulen(skb));
+				goto skb_done;
+			}
+			err = skb_read_pdu_data(conn, skb, skb,
+						err + cdev->skb_rx_extra);
+			if (err < 0)
+				pr_err("coalesced data, csk 0x%p, skb 0x%p,%u, "
+					"f 0x%lx, plen %u.\n",
+					csk, skb, skb->len,
+					cxgbi_skcb_flags(skb),
+					cxgbi_skcb_rx_pdulen(skb));
+		} else {
+			err = skb_read_pdu_bhs(conn, skb);
+			if (err < 0) {
+				pr_err("bhs, csk 0x%p, skb 0x%p,%u, "
+					"f 0x%lx, plen %u.\n",
+					csk, skb, skb->len,
+					cxgbi_skcb_flags(skb),
+					cxgbi_skcb_rx_pdulen(skb));
+				goto skb_done;
+			}
+
+			if (cxgbi_skcb_test_flag(skb, SKCBF_RX_DATA)) {
+				struct sk_buff *dskb;
+
+				dskb = skb_peek(&csk->receive_queue);
+				if (!dskb) {
+					pr_err("csk 0x%p, skb 0x%p,%u, f 0x%lx,"
+						" plen %u, NO data.\n",
+						csk, skb, skb->len,
+						cxgbi_skcb_flags(skb),
+						cxgbi_skcb_rx_pdulen(skb));
+					err = -EIO;
+					goto skb_done;
+				}
+				__skb_unlink(dskb, &csk->receive_queue);
+
+				err = skb_read_pdu_data(conn, skb, dskb, 0);
+				if (err < 0)
+					pr_err("data, csk 0x%p, skb 0x%p,%u, "
+						"f 0x%lx, plen %u, dskb 0x%p,"
+						"%u.\n",
+						csk, skb, skb->len,
+						cxgbi_skcb_flags(skb),
+						cxgbi_skcb_rx_pdulen(skb),
+						dskb, dskb->len);
+				__kfree_skb(dskb);
+			} else
+				err = skb_read_pdu_data(conn, skb, skb, 0);
+		}
+skb_done:
+		__kfree_skb(skb);
+
+		if (err < 0)
+			break;
+	}
+
+	log_debug(1 << CXGBI_DBG_PDU_RX, "csk 0x%p, read %u.\n", csk, read);
+	if (read) {
+		csk->copied_seq += read;
+		csk_return_rx_credits(csk, read);
+		conn->rxdata_octets += read;
+	}
+
+	if (err < 0) {
+		pr_info("csk 0x%p, 0x%p, rx failed %d, read %u.\n",
+			csk, conn, err, read);
+		iscsi_conn_failure(conn, ISCSI_ERR_CONN_FAILED);
+	}
+}
+EXPORT_SYMBOL_GPL(cxgbi_conn_pdu_ready);
+
+static int sgl_seek_offset(struct scatterlist *sgl, unsigned int sgcnt,
+				unsigned int offset, unsigned int *off,
+				struct scatterlist **sgp)
+{
+	int i;
+	struct scatterlist *sg;
+
+	for_each_sg(sgl, sg, sgcnt, i) {
+		if (offset < sg->length) {
+			*off = offset;
+			*sgp = sg;
+			return 0;
+		}
+		offset -= sg->length;
+	}
+	return -EFAULT;
+}
+
+static int sgl_read_to_frags(struct scatterlist *sg, unsigned int sgoffset,
+				unsigned int dlen, struct page_frag *frags,
+				int frag_max)
+{
+	unsigned int datalen = dlen;
+	unsigned int sglen = sg->length - sgoffset;
+	struct page *page = sg_page(sg);
+	int i;
+
+	i = 0;
+	do {
+		unsigned int copy;
+
+		if (!sglen) {
+			sg = sg_next(sg);
+			if (!sg) {
+				pr_warn("sg %d NULL, len %u/%u.\n",
+					i, datalen, dlen);
+				return -EINVAL;
+			}
+			sgoffset = 0;
+			sglen = sg->length;
+			page = sg_page(sg);
+
+		}
+		copy = min(datalen, sglen);
+		if (i && page == frags[i - 1].page &&
+		    sgoffset + sg->offset ==
+			frags[i - 1].offset + frags[i - 1].size) {
+			frags[i - 1].size += copy;
+		} else {
+			if (i >= frag_max) {
+				pr_warn("too many pages %u, dlen %u.\n",
+					frag_max, dlen);
+				return -EINVAL;
+			}
+
+			frags[i].page = page;
+			frags[i].offset = sg->offset + sgoffset;
+			frags[i].size = copy;
+			i++;
+		}
+		datalen -= copy;
+		sgoffset += copy;
+		sglen -= copy;
+	} while (datalen);
+
+	return i;
+}
+
+int cxgbi_conn_alloc_pdu(struct iscsi_task *task, u8 opcode)
+{
+	struct iscsi_tcp_conn *tcp_conn = task->conn->dd_data;
+	struct cxgbi_conn *cconn = tcp_conn->dd_data;
+	struct cxgbi_device *cdev = cconn->chba->cdev;
+	struct iscsi_conn *conn = task->conn;
+	struct iscsi_tcp_task *tcp_task = task->dd_data;
+	struct cxgbi_task_data *tdata = iscsi_task_cxgbi_data(task);
+	struct scsi_cmnd *sc = task->sc;
+	int headroom = SKB_TX_ISCSI_PDU_HEADER_MAX;
+
+	tcp_task->dd_data = tdata;
+	task->hdr = NULL;
+
+	if (SKB_MAX_HEAD(cdev->skb_tx_rsvd) > (512 * MAX_SKB_FRAGS) &&
+	    (opcode == ISCSI_OP_SCSI_DATA_OUT ||
+	     (opcode == ISCSI_OP_SCSI_CMD &&
+	      (scsi_bidi_cmnd(sc) || sc->sc_data_direction == DMA_TO_DEVICE))))
+		/* data could goes into skb head */
+		headroom += min_t(unsigned int,
+				SKB_MAX_HEAD(cdev->skb_tx_rsvd),
+				conn->max_xmit_dlength);
+
+	tdata->skb = alloc_skb(cdev->skb_tx_rsvd + headroom, GFP_ATOMIC);
+	if (!tdata->skb) {
+		struct cxgbi_sock *csk = cconn->cep->csk;
+		struct net_device *ndev = cdev->ports[csk->port_id];
+		ndev->stats.tx_dropped++;
+		return -ENOMEM;
+	}
+
+	skb_reserve(tdata->skb, cdev->skb_tx_rsvd);
+	task->hdr = (struct iscsi_hdr *)tdata->skb->data;
+	task->hdr_max = SKB_TX_ISCSI_PDU_HEADER_MAX; /* BHS + AHS */
+
+	/* data_out uses scsi_cmd's itt */
+	if (opcode != ISCSI_OP_SCSI_DATA_OUT)
+		task_reserve_itt(task, &task->hdr->itt);
+
+	log_debug(1 << CXGBI_DBG_ISCSI | 1 << CXGBI_DBG_PDU_TX,
+		"task 0x%p, op 0x%x, skb 0x%p,%u+%u/%u, itt 0x%x.\n",
+		task, opcode, tdata->skb, cdev->skb_tx_rsvd, headroom,
+		conn->max_xmit_dlength, ntohl(task->hdr->itt));
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(cxgbi_conn_alloc_pdu);
+
+static inline void tx_skb_setmode(struct sk_buff *skb, int hcrc, int dcrc)
+{
+	if (hcrc || dcrc) {
+		u8 submode = 0;
+
+		if (hcrc)
+			submode |= 1;
+		if (dcrc)
+			submode |= 2;
+		cxgbi_skcb_ulp_mode(skb) = (ULP2_MODE_ISCSI << 4) | submode;
+	} else
+		cxgbi_skcb_ulp_mode(skb) = 0;
+}
+
+int cxgbi_conn_init_pdu(struct iscsi_task *task, unsigned int offset,
+			      unsigned int count)
+{
+	struct iscsi_conn *conn = task->conn;
+	struct cxgbi_task_data *tdata = iscsi_task_cxgbi_data(task);
+	struct sk_buff *skb = tdata->skb;
+	unsigned int datalen = count;
+	int i, padlen = iscsi_padding(count);
+	struct page *pg;
+
+	log_debug(1 << CXGBI_DBG_ISCSI | 1 << CXGBI_DBG_PDU_TX,
+		"task 0x%p,0x%p, skb 0x%p, 0x%x,0x%x,0x%x, %u+%u.\n",
+		task, task->sc, skb, (*skb->data) & ISCSI_OPCODE_MASK,
+		ntohl(task->cmdsn), ntohl(task->hdr->itt), offset, count);
+
+	skb_put(skb, task->hdr_len);
+	tx_skb_setmode(skb, conn->hdrdgst_en, datalen ? conn->datadgst_en : 0);
+	if (!count)
+		return 0;
+
+	if (task->sc) {
+		struct scsi_data_buffer *sdb = scsi_out(task->sc);
+		struct scatterlist *sg = NULL;
+		int err;
+
+		tdata->offset = offset;
+		tdata->count = count;
+		err = sgl_seek_offset(
+					sdb->table.sgl, sdb->table.nents,
+					tdata->offset, &tdata->sgoffset, &sg);
+		if (err < 0) {
+			pr_warn("tpdu, sgl %u, bad offset %u/%u.\n",
+				sdb->table.nents, tdata->offset, sdb->length);
+			return err;
+		}
+		err = sgl_read_to_frags(sg, tdata->sgoffset, tdata->count,
+					tdata->frags, MAX_PDU_FRAGS);
+		if (err < 0) {
+			pr_warn("tpdu, sgl %u, bad offset %u + %u.\n",
+				sdb->table.nents, tdata->offset, tdata->count);
+			return err;
+		}
+		tdata->nr_frags = err;
+
+		if (tdata->nr_frags > MAX_SKB_FRAGS ||
+		    (padlen && tdata->nr_frags == MAX_SKB_FRAGS)) {
+			char *dst = skb->data + task->hdr_len;
+			struct page_frag *frag = tdata->frags;
+
+			/* data fits in the skb's headroom */
+			for (i = 0; i < tdata->nr_frags; i++, frag++) {
+				char *src = kmap_atomic(frag->page);
+
+				memcpy(dst, src+frag->offset, frag->size);
+				dst += frag->size;
+				kunmap_atomic(src);
+			}
+			if (padlen) {
+				memset(dst, 0, padlen);
+				padlen = 0;
+			}
+			skb_put(skb, count + padlen);
+		} else {
+			/* data fit into frag_list */
+			for (i = 0; i < tdata->nr_frags; i++) {
+				__skb_fill_page_desc(skb, i,
+						tdata->frags[i].page,
+						tdata->frags[i].offset,
+						tdata->frags[i].size);
+				skb_frag_ref(skb, i);
+			}
+			skb_shinfo(skb)->nr_frags = tdata->nr_frags;
+			skb->len += count;
+			skb->data_len += count;
+			skb->truesize += count;
+		}
+
+	} else {
+		pg = virt_to_page(task->data);
+
+		get_page(pg);
+		skb_fill_page_desc(skb, 0, pg, offset_in_page(task->data),
+					count);
+		skb->len += count;
+		skb->data_len += count;
+		skb->truesize += count;
+	}
+
+	if (padlen) {
+		i = skb_shinfo(skb)->nr_frags;
+		skb_fill_page_desc(skb, skb_shinfo(skb)->nr_frags,
+				virt_to_page(padding), offset_in_page(padding),
+				padlen);
+
+		skb->data_len += padlen;
+		skb->truesize += padlen;
+		skb->len += padlen;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(cxgbi_conn_init_pdu);
+
+int cxgbi_conn_xmit_pdu(struct iscsi_task *task)
+{
+	struct iscsi_tcp_conn *tcp_conn = task->conn->dd_data;
+	struct cxgbi_conn *cconn = tcp_conn->dd_data;
+	struct cxgbi_task_data *tdata = iscsi_task_cxgbi_data(task);
+	struct sk_buff *skb = tdata->skb;
+	unsigned int datalen;
+	int err;
+
+	if (!skb) {
+		log_debug(1 << CXGBI_DBG_ISCSI | 1 << CXGBI_DBG_PDU_TX,
+			"task 0x%p, skb NULL.\n", task);
+		return 0;
+	}
+
+	datalen = skb->data_len;
+	tdata->skb = NULL;
+	err = cxgbi_sock_send_pdus(cconn->cep->csk, skb);
+	if (err > 0) {
+		int pdulen = err;
+
+		log_debug(1 << CXGBI_DBG_PDU_TX,
+			"task 0x%p,0x%p, skb 0x%p, len %u/%u, rv %d.\n",
+			task, task->sc, skb, skb->len, skb->data_len, err);
+
+		if (task->conn->hdrdgst_en)
+			pdulen += ISCSI_DIGEST_SIZE;
+
+		if (datalen && task->conn->datadgst_en)
+			pdulen += ISCSI_DIGEST_SIZE;
+
+		task->conn->txdata_octets += pdulen;
+		return 0;
+	}
+
+	if (err == -EAGAIN || err == -ENOBUFS) {
+		log_debug(1 << CXGBI_DBG_PDU_TX,
+			"task 0x%p, skb 0x%p, len %u/%u, %d EAGAIN.\n",
+			task, skb, skb->len, skb->data_len, err);
+		/* reset skb to send when we are called again */
+		tdata->skb = skb;
+		return err;
+	}
+
+	kfree_skb(skb);
+	log_debug(1 << CXGBI_DBG_ISCSI | 1 << CXGBI_DBG_PDU_TX,
+		"itt 0x%x, skb 0x%p, len %u/%u, xmit err %d.\n",
+		task->itt, skb, skb->len, skb->data_len, err);
+	iscsi_conn_printk(KERN_ERR, task->conn, "xmit err %d.\n", err);
+	iscsi_conn_failure(task->conn, ISCSI_ERR_XMIT_FAILED);
+	return err;
+}
+EXPORT_SYMBOL_GPL(cxgbi_conn_xmit_pdu);
+
+void cxgbi_cleanup_task(struct iscsi_task *task)
+{
+	struct cxgbi_task_data *tdata = iscsi_task_cxgbi_data(task);
+
+	log_debug(1 << CXGBI_DBG_ISCSI,
+		"task 0x%p, skb 0x%p, itt 0x%x.\n",
+		task, tdata->skb, task->hdr_itt);
+
+	/*  never reached the xmit task callout */
+	if (tdata->skb)
+		__kfree_skb(tdata->skb);
+	memset(tdata, 0, sizeof(*tdata));
+
+	task_release_itt(task, task->hdr_itt);
+	iscsi_tcp_cleanup_task(task);
+}
+EXPORT_SYMBOL_GPL(cxgbi_cleanup_task);
+
+void cxgbi_get_conn_stats(struct iscsi_cls_conn *cls_conn,
+				struct iscsi_stats *stats)
+{
+	struct iscsi_conn *conn = cls_conn->dd_data;
+
+	stats->txdata_octets = conn->txdata_octets;
+	stats->rxdata_octets = conn->rxdata_octets;
+	stats->scsicmd_pdus = conn->scsicmd_pdus_cnt;
+	stats->dataout_pdus = conn->dataout_pdus_cnt;
+	stats->scsirsp_pdus = conn->scsirsp_pdus_cnt;
+	stats->datain_pdus = conn->datain_pdus_cnt;
+	stats->r2t_pdus = conn->r2t_pdus_cnt;
+	stats->tmfcmd_pdus = conn->tmfcmd_pdus_cnt;
+	stats->tmfrsp_pdus = conn->tmfrsp_pdus_cnt;
+	stats->digest_err = 0;
+	stats->timeout_err = 0;
+	stats->custom_length = 1;
+	strcpy(stats->custom[0].desc, "eh_abort_cnt");
+	stats->custom[0].value = conn->eh_abort_cnt;
+}
+EXPORT_SYMBOL_GPL(cxgbi_get_conn_stats);
+
+static int cxgbi_conn_max_xmit_dlength(struct iscsi_conn *conn)
+{
+	struct iscsi_tcp_conn *tcp_conn = conn->dd_data;
+	struct cxgbi_conn *cconn = tcp_conn->dd_data;
+	struct cxgbi_device *cdev = cconn->chba->cdev;
+	unsigned int headroom = SKB_MAX_HEAD(cdev->skb_tx_rsvd);
+	unsigned int max_def = 512 * MAX_SKB_FRAGS;
+	unsigned int max = max(max_def, headroom);
+
+	max = min(cconn->chba->cdev->tx_max_size, max);
+	if (conn->max_xmit_dlength)
+		conn->max_xmit_dlength = min(conn->max_xmit_dlength, max);
+	else
+		conn->max_xmit_dlength = max;
+	cxgbi_align_pdu_size(conn->max_xmit_dlength);
+
+	return 0;
+}
+
+static int cxgbi_conn_max_recv_dlength(struct iscsi_conn *conn)
+{
+	struct iscsi_tcp_conn *tcp_conn = conn->dd_data;
+	struct cxgbi_conn *cconn = tcp_conn->dd_data;
+	unsigned int max = cconn->chba->cdev->rx_max_size;
+
+	cxgbi_align_pdu_size(max);
+
+	if (conn->max_recv_dlength) {
+		if (conn->max_recv_dlength > max) {
+			pr_err("MaxRecvDataSegmentLength %u > %u.\n",
+				conn->max_recv_dlength, max);
+			return -EINVAL;
+		}
+		conn->max_recv_dlength = min(conn->max_recv_dlength, max);
+		cxgbi_align_pdu_size(conn->max_recv_dlength);
+	} else
+		conn->max_recv_dlength = max;
+
+	return 0;
+}
+
+int cxgbi_set_conn_param(struct iscsi_cls_conn *cls_conn,
+			enum iscsi_param param, char *buf, int buflen)
+{
+	struct iscsi_conn *conn = cls_conn->dd_data;
+	struct iscsi_tcp_conn *tcp_conn = conn->dd_data;
+	struct cxgbi_conn *cconn = tcp_conn->dd_data;
+	struct cxgbi_sock *csk = cconn->cep->csk;
+	int err;
+
+	log_debug(1 << CXGBI_DBG_ISCSI,
+		"cls_conn 0x%p, param %d, buf(%d) %s.\n",
+		cls_conn, param, buflen, buf);
+
+	switch (param) {
+	case ISCSI_PARAM_HDRDGST_EN:
+		err = iscsi_set_param(cls_conn, param, buf, buflen);
+		if (!err && conn->hdrdgst_en)
+			err = csk->cdev->csk_ddp_setup_digest(csk, csk->tid,
+							conn->hdrdgst_en,
+							conn->datadgst_en, 0);
+		break;
+	case ISCSI_PARAM_DATADGST_EN:
+		err = iscsi_set_param(cls_conn, param, buf, buflen);
+		if (!err && conn->datadgst_en)
+			err = csk->cdev->csk_ddp_setup_digest(csk, csk->tid,
+							conn->hdrdgst_en,
+							conn->datadgst_en, 0);
+		break;
+	case ISCSI_PARAM_MAX_R2T:
+		return iscsi_tcp_set_max_r2t(conn, buf);
+	case ISCSI_PARAM_MAX_RECV_DLENGTH:
+		err = iscsi_set_param(cls_conn, param, buf, buflen);
+		if (!err)
+			err = cxgbi_conn_max_recv_dlength(conn);
+		break;
+	case ISCSI_PARAM_MAX_XMIT_DLENGTH:
+		err = iscsi_set_param(cls_conn, param, buf, buflen);
+		if (!err)
+			err = cxgbi_conn_max_xmit_dlength(conn);
+		break;
+	default:
+		return iscsi_set_param(cls_conn, param, buf, buflen);
+	}
+	return err;
+}
+EXPORT_SYMBOL_GPL(cxgbi_set_conn_param);
+
+static inline int csk_print_port(struct cxgbi_sock *csk, char *buf)
+{
+	int len;
+
+	cxgbi_sock_get(csk);
+	len = sprintf(buf, "%hu\n", ntohs(csk->daddr.sin_port));
+	cxgbi_sock_put(csk);
+
+	return len;
+}
+
+static inline int csk_print_ip(struct cxgbi_sock *csk, char *buf)
+{
+	int len;
+
+	cxgbi_sock_get(csk);
+	if (csk->csk_family == AF_INET)
+		len = sprintf(buf, "%pI4",
+			      &csk->daddr.sin_addr.s_addr);
+	else
+		len = sprintf(buf, "%pI6",
+			      &csk->daddr6.sin6_addr);
+
+	cxgbi_sock_put(csk);
+
+	return len;
+}
+
+int cxgbi_get_ep_param(struct iscsi_endpoint *ep, enum iscsi_param param,
+		       char *buf)
+{
+	struct cxgbi_endpoint *cep = ep->dd_data;
+	struct cxgbi_sock *csk;
+	int len;
+
+	log_debug(1 << CXGBI_DBG_ISCSI,
+		"cls_conn 0x%p, param %d.\n", ep, param);
+
+	switch (param) {
+	case ISCSI_PARAM_CONN_PORT:
+	case ISCSI_PARAM_CONN_ADDRESS:
+		if (!cep)
+			return -ENOTCONN;
+
+		csk = cep->csk;
+		if (!csk)
+			return -ENOTCONN;
+
+		return iscsi_conn_get_addr_param((struct sockaddr_storage *)
+						 &csk->daddr, param, buf);
+	default:
+		return -ENOSYS;
+	}
+	return len;
+}
+EXPORT_SYMBOL_GPL(cxgbi_get_ep_param);
+
+struct iscsi_cls_conn *
+cxgbi_create_conn(struct iscsi_cls_session *cls_session, u32 cid)
+{
+	struct iscsi_cls_conn *cls_conn;
+	struct iscsi_conn *conn;
+	struct iscsi_tcp_conn *tcp_conn;
+	struct cxgbi_conn *cconn;
+
+	cls_conn = iscsi_tcp_conn_setup(cls_session, sizeof(*cconn), cid);
+	if (!cls_conn)
+		return NULL;
+
+	conn = cls_conn->dd_data;
+	tcp_conn = conn->dd_data;
+	cconn = tcp_conn->dd_data;
+	cconn->iconn = conn;
+
+	log_debug(1 << CXGBI_DBG_ISCSI,
+		"cid %u(0x%x), cls 0x%p,0x%p, conn 0x%p,0x%p,0x%p.\n",
+		cid, cid, cls_session, cls_conn, conn, tcp_conn, cconn);
+
+	return cls_conn;
+}
+EXPORT_SYMBOL_GPL(cxgbi_create_conn);
+
+int cxgbi_bind_conn(struct iscsi_cls_session *cls_session,
+				struct iscsi_cls_conn *cls_conn,
+				u64 transport_eph, int is_leading)
+{
+	struct iscsi_conn *conn = cls_conn->dd_data;
+	struct iscsi_tcp_conn *tcp_conn = conn->dd_data;
+	struct cxgbi_conn *cconn = tcp_conn->dd_data;
+	struct iscsi_endpoint *ep;
+	struct cxgbi_endpoint *cep;
+	struct cxgbi_sock *csk;
+	int err;
+
+	ep = iscsi_lookup_endpoint(transport_eph);
+	if (!ep)
+		return -EINVAL;
+
+	/*  setup ddp pagesize */
+	cep = ep->dd_data;
+	csk = cep->csk;
+	err = csk->cdev->csk_ddp_setup_pgidx(csk, csk->tid, page_idx, 0);
+	if (err < 0)
+		return err;
+
+	err = iscsi_conn_bind(cls_session, cls_conn, is_leading);
+	if (err)
+		return -EINVAL;
+
+	/*  calculate the tag idx bits needed for this conn based on cmds_max */
+	cconn->task_idx_bits = (__ilog2_u32(conn->session->cmds_max - 1)) + 1;
+
+	write_lock_bh(&csk->callback_lock);
+	csk->user_data = conn;
+	cconn->chba = cep->chba;
+	cconn->cep = cep;
+	cep->cconn = cconn;
+	write_unlock_bh(&csk->callback_lock);
+
+	cxgbi_conn_max_xmit_dlength(conn);
+	cxgbi_conn_max_recv_dlength(conn);
+
+	log_debug(1 << CXGBI_DBG_ISCSI,
+		"cls 0x%p,0x%p, ep 0x%p, cconn 0x%p, csk 0x%p.\n",
+		cls_session, cls_conn, ep, cconn, csk);
+	/*  init recv engine */
+	iscsi_tcp_hdr_recv_prep(tcp_conn);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(cxgbi_bind_conn);
+
+struct iscsi_cls_session *cxgbi_create_session(struct iscsi_endpoint *ep,
+						u16 cmds_max, u16 qdepth,
+						u32 initial_cmdsn)
+{
+	struct cxgbi_endpoint *cep;
+	struct cxgbi_hba *chba;
+	struct Scsi_Host *shost;
+	struct iscsi_cls_session *cls_session;
+	struct iscsi_session *session;
+
+	if (!ep) {
+		pr_err("missing endpoint.\n");
+		return NULL;
+	}
+
+	cep = ep->dd_data;
+	chba = cep->chba;
+	shost = chba->shost;
+
+	BUG_ON(chba != iscsi_host_priv(shost));
+
+	cls_session = iscsi_session_setup(chba->cdev->itp, shost,
+					cmds_max, 0,
+					sizeof(struct iscsi_tcp_task) +
+					sizeof(struct cxgbi_task_data),
+					initial_cmdsn, ISCSI_MAX_TARGET);
+	if (!cls_session)
+		return NULL;
+
+	session = cls_session->dd_data;
+	if (iscsi_tcp_r2tpool_alloc(session))
+		goto remove_session;
+
+	log_debug(1 << CXGBI_DBG_ISCSI,
+		"ep 0x%p, cls sess 0x%p.\n", ep, cls_session);
+	return cls_session;
+
+remove_session:
+	iscsi_session_teardown(cls_session);
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(cxgbi_create_session);
+
+void cxgbi_destroy_session(struct iscsi_cls_session *cls_session)
+{
+	log_debug(1 << CXGBI_DBG_ISCSI,
+		"cls sess 0x%p.\n", cls_session);
+
+	iscsi_tcp_r2tpool_free(cls_session->dd_data);
+	iscsi_session_teardown(cls_session);
+}
+EXPORT_SYMBOL_GPL(cxgbi_destroy_session);
+
+int cxgbi_set_host_param(struct Scsi_Host *shost, enum iscsi_host_param param,
+			char *buf, int buflen)
+{
+	struct cxgbi_hba *chba = iscsi_host_priv(shost);
+
+	if (!chba->ndev) {
+		shost_printk(KERN_ERR, shost, "Could not get host param. "
+				"netdev for host not set.\n");
+		return -ENODEV;
+	}
+
+	log_debug(1 << CXGBI_DBG_ISCSI,
+		"shost 0x%p, hba 0x%p,%s, param %d, buf(%d) %s.\n",
+		shost, chba, chba->ndev->name, param, buflen, buf);
+
+	switch (param) {
+	case ISCSI_HOST_PARAM_IPADDRESS:
+	{
+		__be32 addr = in_aton(buf);
+		log_debug(1 << CXGBI_DBG_ISCSI,
+			"hba %s, req. ipv4 %pI4.\n", chba->ndev->name, &addr);
+		cxgbi_set_iscsi_ipv4(chba, addr);
+		return 0;
+	}
+	case ISCSI_HOST_PARAM_HWADDRESS:
+	case ISCSI_HOST_PARAM_NETDEV_NAME:
+		return 0;
+	default:
+		return iscsi_host_set_param(shost, param, buf, buflen);
+	}
+}
+EXPORT_SYMBOL_GPL(cxgbi_set_host_param);
+
+int cxgbi_get_host_param(struct Scsi_Host *shost, enum iscsi_host_param param,
+			char *buf)
+{
+	struct cxgbi_hba *chba = iscsi_host_priv(shost);
+	int len = 0;
+
+	if (!chba->ndev) {
+		shost_printk(KERN_ERR, shost, "Could not get host param. "
+				"netdev for host not set.\n");
+		return -ENODEV;
+	}
+
+	log_debug(1 << CXGBI_DBG_ISCSI,
+		"shost 0x%p, hba 0x%p,%s, param %d.\n",
+		shost, chba, chba->ndev->name, param);
+
+	switch (param) {
+	case ISCSI_HOST_PARAM_HWADDRESS:
+		len = sysfs_format_mac(buf, chba->ndev->dev_addr, 6);
+		break;
+	case ISCSI_HOST_PARAM_NETDEV_NAME:
+		len = sprintf(buf, "%s\n", chba->ndev->name);
+		break;
+	case ISCSI_HOST_PARAM_IPADDRESS:
+	{
+		struct cxgbi_sock *csk = find_sock_on_port(chba->cdev,
+							   chba->port_id);
+		if (csk) {
+			len = sprintf(buf, "%pIS",
+				      (struct sockaddr *)&csk->saddr);
+		}
+		log_debug(1 << CXGBI_DBG_ISCSI,
+			  "hba %s, addr %s.\n", chba->ndev->name, buf);
+		break;
+	}
+	default:
+		return iscsi_host_get_param(shost, param, buf);
+	}
+
+	return len;
+}
+EXPORT_SYMBOL_GPL(cxgbi_get_host_param);
+
+struct iscsi_endpoint *cxgbi_ep_connect(struct Scsi_Host *shost,
+					struct sockaddr *dst_addr,
+					int non_blocking)
+{
+	struct iscsi_endpoint *ep;
+	struct cxgbi_endpoint *cep;
+	struct cxgbi_hba *hba = NULL;
+	struct cxgbi_sock *csk;
+	int err = -EINVAL;
+
+	log_debug(1 << CXGBI_DBG_ISCSI | 1 << CXGBI_DBG_SOCK,
+		"shost 0x%p, non_blocking %d, dst_addr 0x%p.\n",
+		shost, non_blocking, dst_addr);
+
+	if (shost) {
+		hba = iscsi_host_priv(shost);
+		if (!hba) {
+			pr_info("shost 0x%p, priv NULL.\n", shost);
+			goto err_out;
+		}
+	}
+
+	if (dst_addr->sa_family == AF_INET) {
+		csk = cxgbi_check_route(dst_addr);
+#if IS_ENABLED(CONFIG_IPV6)
+	} else if (dst_addr->sa_family == AF_INET6) {
+		csk = cxgbi_check_route6(dst_addr);
+#endif
+	} else {
+		pr_info("address family 0x%x NOT supported.\n",
+			dst_addr->sa_family);
+		err = -EAFNOSUPPORT;
+		return (struct iscsi_endpoint *)ERR_PTR(err);
+	}
+
+	if (IS_ERR(csk))
+		return (struct iscsi_endpoint *)csk;
+	cxgbi_sock_get(csk);
+
+	if (!hba)
+		hba = csk->cdev->hbas[csk->port_id];
+	else if (hba != csk->cdev->hbas[csk->port_id]) {
+		pr_info("Could not connect through requested host %u"
+			"hba 0x%p != 0x%p (%u).\n",
+			shost->host_no, hba,
+			csk->cdev->hbas[csk->port_id], csk->port_id);
+		err = -ENOSPC;
+		goto release_conn;
+	}
+
+	err = sock_get_port(csk);
+	if (err)
+		goto release_conn;
+
+	cxgbi_sock_set_state(csk, CTP_CONNECTING);
+	err = csk->cdev->csk_init_act_open(csk);
+	if (err)
+		goto release_conn;
+
+	if (cxgbi_sock_is_closing(csk)) {
+		err = -ENOSPC;
+		pr_info("csk 0x%p is closing.\n", csk);
+		goto release_conn;
+	}
+
+	ep = iscsi_create_endpoint(sizeof(*cep));
+	if (!ep) {
+		err = -ENOMEM;
+		pr_info("iscsi alloc ep, OOM.\n");
+		goto release_conn;
+	}
+
+	cep = ep->dd_data;
+	cep->csk = csk;
+	cep->chba = hba;
+
+	log_debug(1 << CXGBI_DBG_ISCSI | 1 << CXGBI_DBG_SOCK,
+		"ep 0x%p, cep 0x%p, csk 0x%p, hba 0x%p,%s.\n",
+		ep, cep, csk, hba, hba->ndev->name);
+	return ep;
+
+release_conn:
+	cxgbi_sock_put(csk);
+	cxgbi_sock_closed(csk);
+err_out:
+	return ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(cxgbi_ep_connect);
+
+int cxgbi_ep_poll(struct iscsi_endpoint *ep, int timeout_ms)
+{
+	struct cxgbi_endpoint *cep = ep->dd_data;
+	struct cxgbi_sock *csk = cep->csk;
+
+	if (!cxgbi_sock_is_established(csk))
+		return 0;
+	return 1;
+}
+EXPORT_SYMBOL_GPL(cxgbi_ep_poll);
+
+void cxgbi_ep_disconnect(struct iscsi_endpoint *ep)
+{
+	struct cxgbi_endpoint *cep = ep->dd_data;
+	struct cxgbi_conn *cconn = cep->cconn;
+	struct cxgbi_sock *csk = cep->csk;
+
+	log_debug(1 << CXGBI_DBG_ISCSI | 1 << CXGBI_DBG_SOCK,
+		"ep 0x%p, cep 0x%p, cconn 0x%p, csk 0x%p,%u,0x%lx.\n",
+		ep, cep, cconn, csk, csk->state, csk->flags);
+
+	if (cconn && cconn->iconn) {
+		iscsi_suspend_tx(cconn->iconn);
+		write_lock_bh(&csk->callback_lock);
+		cep->csk->user_data = NULL;
+		cconn->cep = NULL;
+		write_unlock_bh(&csk->callback_lock);
+	}
+	iscsi_destroy_endpoint(ep);
+
+	if (likely(csk->state >= CTP_ESTABLISHED))
+		need_active_close(csk);
+	else
+		cxgbi_sock_closed(csk);
+
+	cxgbi_sock_put(csk);
+}
+EXPORT_SYMBOL_GPL(cxgbi_ep_disconnect);
+
+int cxgbi_iscsi_init(struct iscsi_transport *itp,
+			struct scsi_transport_template **stt)
+{
+	*stt = iscsi_register_transport(itp);
+	if (*stt == NULL) {
+		pr_err("unable to register %s transport 0x%p.\n",
+			itp->name, itp);
+		return -ENODEV;
+	}
+	log_debug(1 << CXGBI_DBG_ISCSI,
+		"%s, registered iscsi transport 0x%p.\n",
+		itp->name, stt);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(cxgbi_iscsi_init);
+
+void cxgbi_iscsi_cleanup(struct iscsi_transport *itp,
+			struct scsi_transport_template **stt)
+{
+	if (*stt) {
+		log_debug(1 << CXGBI_DBG_ISCSI,
+			"de-register transport 0x%p, %s, stt 0x%p.\n",
+			itp, itp->name, *stt);
+		*stt = NULL;
+		iscsi_unregister_transport(itp);
+	}
+}
+EXPORT_SYMBOL_GPL(cxgbi_iscsi_cleanup);
+
+umode_t cxgbi_attr_is_visible(int param_type, int param)
+{
+	switch (param_type) {
+	case ISCSI_HOST_PARAM:
+		switch (param) {
+		case ISCSI_HOST_PARAM_NETDEV_NAME:
+		case ISCSI_HOST_PARAM_HWADDRESS:
+		case ISCSI_HOST_PARAM_IPADDRESS:
+		case ISCSI_HOST_PARAM_INITIATOR_NAME:
+			return S_IRUGO;
+		default:
+			return 0;
+		}
+	case ISCSI_PARAM:
+		switch (param) {
+		case ISCSI_PARAM_MAX_RECV_DLENGTH:
+		case ISCSI_PARAM_MAX_XMIT_DLENGTH:
+		case ISCSI_PARAM_HDRDGST_EN:
+		case ISCSI_PARAM_DATADGST_EN:
+		case ISCSI_PARAM_CONN_ADDRESS:
+		case ISCSI_PARAM_CONN_PORT:
+		case ISCSI_PARAM_EXP_STATSN:
+		case ISCSI_PARAM_PERSISTENT_ADDRESS:
+		case ISCSI_PARAM_PERSISTENT_PORT:
+		case ISCSI_PARAM_PING_TMO:
+		case ISCSI_PARAM_RECV_TMO:
+		case ISCSI_PARAM_INITIAL_R2T_EN:
+		case ISCSI_PARAM_MAX_R2T:
+		case ISCSI_PARAM_IMM_DATA_EN:
+		case ISCSI_PARAM_FIRST_BURST:
+		case ISCSI_PARAM_MAX_BURST:
+		case ISCSI_PARAM_PDU_INORDER_EN:
+		case ISCSI_PARAM_DATASEQ_INORDER_EN:
+		case ISCSI_PARAM_ERL:
+		case ISCSI_PARAM_TARGET_NAME:
+		case ISCSI_PARAM_TPGT:
+		case ISCSI_PARAM_USERNAME:
+		case ISCSI_PARAM_PASSWORD:
+		case ISCSI_PARAM_USERNAME_IN:
+		case ISCSI_PARAM_PASSWORD_IN:
+		case ISCSI_PARAM_FAST_ABORT:
+		case ISCSI_PARAM_ABORT_TMO:
+		case ISCSI_PARAM_LU_RESET_TMO:
+		case ISCSI_PARAM_TGT_RESET_TMO:
+		case ISCSI_PARAM_IFACE_NAME:
+		case ISCSI_PARAM_INITIATOR_NAME:
+			return S_IRUGO;
+		default:
+			return 0;
+		}
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(cxgbi_attr_is_visible);
+
+static int __init libcxgbi_init_module(void)
+{
+	sw_tag_idx_bits = (__ilog2_u32(ISCSI_ITT_MASK)) + 1;
+	sw_tag_age_bits = (__ilog2_u32(ISCSI_AGE_MASK)) + 1;
+
+	pr_info("tag itt 0x%x, %u bits, age 0x%x, %u bits.\n",
+		ISCSI_ITT_MASK, sw_tag_idx_bits,
+		ISCSI_AGE_MASK, sw_tag_age_bits);
+
+	ddp_setup_host_page_size();
+	return 0;
+}
+
+static void __exit libcxgbi_exit_module(void)
+{
+	cxgbi_device_unregister_all(0xFF);
+	return;
+}
+
+module_init(libcxgbi_init_module);
+module_exit(libcxgbi_exit_module);
diff --git a/drivers/scsi/cxgbi/libcxgbi.h b/drivers/scsi/cxgbi/libcxgbi.h
new file mode 100644
index 0000000..2c7cb1c
--- /dev/null
+++ b/drivers/scsi/cxgbi/libcxgbi.h
@@ -0,0 +1,758 @@
+/*
+ * libcxgbi.h: Chelsio common library for T3/T4 iSCSI driver.
+ *
+ * Copyright (c) 2010 Chelsio Communications, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.
+ *
+ * Written by: Karen Xie (kxie@chelsio.com)
+ * Written by: Rakesh Ranjan (rranjan@chelsio.com)
+ */
+
+#ifndef	__LIBCXGBI_H__
+#define	__LIBCXGBI_H__
+
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/debugfs.h>
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <linux/if_vlan.h>
+#include <linux/scatterlist.h>
+#include <linux/skbuff.h>
+#include <linux/vmalloc.h>
+#include <scsi/scsi_device.h>
+#include <scsi/libiscsi_tcp.h>
+
+enum cxgbi_dbg_flag {
+	CXGBI_DBG_ISCSI,
+	CXGBI_DBG_DDP,
+	CXGBI_DBG_TOE,
+	CXGBI_DBG_SOCK,
+
+	CXGBI_DBG_PDU_TX,
+	CXGBI_DBG_PDU_RX,
+	CXGBI_DBG_DEV,
+};
+
+#define log_debug(level, fmt, ...)	\
+	do {	\
+		if (dbg_level & (level)) \
+			pr_info(fmt, ##__VA_ARGS__); \
+	} while (0)
+
+#define pr_info_ipaddr(fmt_trail,					\
+			addr1, addr2, args_trail...)			\
+do {									\
+	if (!((1 << CXGBI_DBG_SOCK) & dbg_level))			\
+		break;							\
+	pr_info("%pISpc - %pISpc, " fmt_trail,				\
+		addr1, addr2, args_trail);				\
+} while (0)
+
+/* max. connections per adapter */
+#define CXGBI_MAX_CONN		16384
+
+/* always allocate rooms for AHS */
+#define SKB_TX_ISCSI_PDU_HEADER_MAX	\
+	(sizeof(struct iscsi_hdr) + ISCSI_MAX_AHS_SIZE)
+
+#define	ISCSI_PDU_NONPAYLOAD_LEN	312 /* bhs(48) + ahs(256) + digest(8)*/
+
+/*
+ * align pdu size to multiple of 512 for better performance
+ */
+#define cxgbi_align_pdu_size(n) do { n = (n) & (~511); } while (0)
+
+#define ULP2_MODE_ISCSI		2
+
+#define ULP2_MAX_PKT_SIZE	16224
+#define ULP2_MAX_PDU_PAYLOAD	\
+	(ULP2_MAX_PKT_SIZE - ISCSI_PDU_NONPAYLOAD_LEN)
+
+/*
+ * For iscsi connections HW may inserts digest bytes into the pdu. Those digest
+ * bytes are not sent by the host but are part of the TCP payload and therefore
+ * consume TCP sequence space.
+ */
+static const unsigned int ulp2_extra_len[] = { 0, 4, 4, 8 };
+static inline unsigned int cxgbi_ulp_extra_len(int submode)
+{
+	return ulp2_extra_len[submode & 3];
+}
+
+/*
+ * struct pagepod_hdr, pagepod - pagepod format
+ */
+
+#define CPL_RX_DDP_STATUS_DDP_SHIFT	16 /* ddp'able */
+#define CPL_RX_DDP_STATUS_PAD_SHIFT	19 /* pad error */
+#define CPL_RX_DDP_STATUS_HCRC_SHIFT	20 /* hcrc error */
+#define CPL_RX_DDP_STATUS_DCRC_SHIFT	21 /* dcrc error */
+
+struct cxgbi_pagepod_hdr {
+	u32 vld_tid;
+	u32 pgsz_tag_clr;
+	u32 max_offset;
+	u32 page_offset;
+	u64 rsvd;
+};
+
+#define PPOD_PAGES_MAX			4
+struct cxgbi_pagepod {
+	struct cxgbi_pagepod_hdr hdr;
+	u64 addr[PPOD_PAGES_MAX + 1];
+};
+
+struct cxgbi_tag_format {
+	unsigned char sw_bits;
+	unsigned char rsvd_bits;
+	unsigned char rsvd_shift;
+	unsigned char filler[1];
+	u32 rsvd_mask;
+};
+
+struct cxgbi_gather_list {
+	unsigned int tag;
+	unsigned int length;
+	unsigned int offset;
+	unsigned int nelem;
+	struct page **pages;
+	dma_addr_t phys_addr[0];
+};
+
+struct cxgbi_ddp_info {
+	struct kref refcnt;
+	struct cxgbi_device *cdev;
+	struct pci_dev *pdev;
+	unsigned int max_txsz;
+	unsigned int max_rxsz;
+	unsigned int llimit;
+	unsigned int ulimit;
+	unsigned int nppods;
+	unsigned int idx_last;
+	unsigned char idx_bits;
+	unsigned char filler[3];
+	unsigned int idx_mask;
+	unsigned int rsvd_tag_mask;
+	spinlock_t map_lock;
+	struct cxgbi_gather_list **gl_map;
+};
+
+#define DDP_PGIDX_MAX		4
+#define DDP_THRESHOLD		2048
+
+#define PPOD_PAGES_SHIFT	2       /*  4 pages per pod */
+
+#define PPOD_SIZE               sizeof(struct cxgbi_pagepod)  /*  64 */
+#define PPOD_SIZE_SHIFT         6
+
+#define ULPMEM_DSGL_MAX_NPPODS	16	/*  1024/PPOD_SIZE */
+#define ULPMEM_IDATA_MAX_NPPODS	4	/*  256/PPOD_SIZE */
+#define PCIE_MEMWIN_MAX_NPPODS	16	/*  1024/PPOD_SIZE */
+
+#define PPOD_COLOR_SHIFT	0
+#define PPOD_COLOR(x)		((x) << PPOD_COLOR_SHIFT)
+
+#define PPOD_IDX_SHIFT          6
+#define PPOD_IDX_MAX_SIZE       24
+
+#define PPOD_TID_SHIFT		0
+#define PPOD_TID(x)		((x) << PPOD_TID_SHIFT)
+
+#define PPOD_TAG_SHIFT		6
+#define PPOD_TAG(x)		((x) << PPOD_TAG_SHIFT)
+
+#define PPOD_VALID_SHIFT	24
+#define PPOD_VALID(x)		((x) << PPOD_VALID_SHIFT)
+#define PPOD_VALID_FLAG		PPOD_VALID(1U)
+
+/*
+ * sge_opaque_hdr -
+ * Opaque version of structure the SGE stores at skb->head of TX_DATA packets
+ * and for which we must reserve space.
+ */
+struct sge_opaque_hdr {
+	void *dev;
+	dma_addr_t addr[MAX_SKB_FRAGS + 1];
+};
+
+struct cxgbi_sock {
+	struct cxgbi_device *cdev;
+
+	int tid;
+	int atid;
+	unsigned long flags;
+	unsigned int mtu;
+	unsigned short rss_qid;
+	unsigned short txq_idx;
+	unsigned short advmss;
+	unsigned int tx_chan;
+	unsigned int rx_chan;
+	unsigned int mss_idx;
+	unsigned int smac_idx;
+	unsigned char port_id;
+	int wr_max_cred;
+	int wr_cred;
+	int wr_una_cred;
+	unsigned char hcrc_len;
+	unsigned char dcrc_len;
+
+	void *l2t;
+	struct sk_buff *wr_pending_head;
+	struct sk_buff *wr_pending_tail;
+	struct sk_buff *cpl_close;
+	struct sk_buff *cpl_abort_req;
+	struct sk_buff *cpl_abort_rpl;
+	struct sk_buff *skb_ulp_lhdr;
+	spinlock_t lock;
+	struct kref refcnt;
+	unsigned int state;
+	unsigned int csk_family;
+	union {
+		struct sockaddr_in saddr;
+		struct sockaddr_in6 saddr6;
+	};
+	union {
+		struct sockaddr_in daddr;
+		struct sockaddr_in6 daddr6;
+	};
+	struct dst_entry *dst;
+	struct sk_buff_head receive_queue;
+	struct sk_buff_head write_queue;
+	struct timer_list retry_timer;
+	int err;
+	rwlock_t callback_lock;
+	void *user_data;
+
+	u32 rcv_nxt;
+	u32 copied_seq;
+	u32 rcv_wup;
+	u32 snd_nxt;
+	u32 snd_una;
+	u32 write_seq;
+};
+
+/*
+ * connection states
+ */
+enum cxgbi_sock_states{
+	CTP_CLOSED,
+	CTP_CONNECTING,
+	CTP_ACTIVE_OPEN,
+	CTP_ESTABLISHED,
+	CTP_ACTIVE_CLOSE,
+	CTP_PASSIVE_CLOSE,
+	CTP_CLOSE_WAIT_1,
+	CTP_CLOSE_WAIT_2,
+	CTP_ABORTING,
+};
+
+/*
+ * Connection flags -- many to track some close related events.
+ */
+enum cxgbi_sock_flags {
+	CTPF_ABORT_RPL_RCVD,	/*received one ABORT_RPL_RSS message */
+	CTPF_ABORT_REQ_RCVD,	/*received one ABORT_REQ_RSS message */
+	CTPF_ABORT_RPL_PENDING,	/* expecting an abort reply */
+	CTPF_TX_DATA_SENT,	/* already sent a TX_DATA WR */
+	CTPF_ACTIVE_CLOSE_NEEDED,/* need to be closed */
+	CTPF_HAS_ATID,		/* reserved atid */
+	CTPF_HAS_TID,		/* reserved hw tid */
+	CTPF_OFFLOAD_DOWN,	/* offload function off */
+};
+
+struct cxgbi_skb_rx_cb {
+	__u32 ddigest;
+	__u32 pdulen;
+};
+
+struct cxgbi_skb_tx_cb {
+	void *l2t;
+	struct sk_buff *wr_next;
+};
+
+enum cxgbi_skcb_flags {
+	SKCBF_TX_NEED_HDR,	/* packet needs a header */
+	SKCBF_RX_COALESCED,	/* received whole pdu */
+	SKCBF_RX_HDR,		/* received pdu header */
+	SKCBF_RX_DATA,		/* received pdu payload */
+	SKCBF_RX_STATUS,	/* received ddp status */
+	SKCBF_RX_DATA_DDPD,	/* pdu payload ddp'd */
+	SKCBF_RX_HCRC_ERR,	/* header digest error */
+	SKCBF_RX_DCRC_ERR,	/* data digest error */
+	SKCBF_RX_PAD_ERR,	/* padding byte error */
+};
+
+struct cxgbi_skb_cb {
+	unsigned char ulp_mode;
+	unsigned long flags;
+	unsigned int seq;
+	union {
+		struct cxgbi_skb_rx_cb rx;
+		struct cxgbi_skb_tx_cb tx;
+	};
+};
+
+#define CXGBI_SKB_CB(skb)	((struct cxgbi_skb_cb *)&((skb)->cb[0]))
+#define cxgbi_skcb_flags(skb)		(CXGBI_SKB_CB(skb)->flags)
+#define cxgbi_skcb_ulp_mode(skb)	(CXGBI_SKB_CB(skb)->ulp_mode)
+#define cxgbi_skcb_tcp_seq(skb)		(CXGBI_SKB_CB(skb)->seq)
+#define cxgbi_skcb_rx_ddigest(skb)	(CXGBI_SKB_CB(skb)->rx.ddigest)
+#define cxgbi_skcb_rx_pdulen(skb)	(CXGBI_SKB_CB(skb)->rx.pdulen)
+#define cxgbi_skcb_tx_wr_next(skb)	(CXGBI_SKB_CB(skb)->tx.wr_next)
+
+static inline void cxgbi_skcb_set_flag(struct sk_buff *skb,
+					enum cxgbi_skcb_flags flag)
+{
+	__set_bit(flag, &(cxgbi_skcb_flags(skb)));
+}
+
+static inline void cxgbi_skcb_clear_flag(struct sk_buff *skb,
+					enum cxgbi_skcb_flags flag)
+{
+	__clear_bit(flag, &(cxgbi_skcb_flags(skb)));
+}
+
+static inline int cxgbi_skcb_test_flag(struct sk_buff *skb,
+					enum cxgbi_skcb_flags flag)
+{
+	return test_bit(flag, &(cxgbi_skcb_flags(skb)));
+}
+
+static inline void cxgbi_sock_set_flag(struct cxgbi_sock *csk,
+					enum cxgbi_sock_flags flag)
+{
+	__set_bit(flag, &csk->flags);
+	log_debug(1 << CXGBI_DBG_SOCK,
+		"csk 0x%p,%u,0x%lx, bit %d.\n",
+		csk, csk->state, csk->flags, flag);
+}
+
+static inline void cxgbi_sock_clear_flag(struct cxgbi_sock *csk,
+					enum cxgbi_sock_flags flag)
+{
+	__clear_bit(flag, &csk->flags);
+	log_debug(1 << CXGBI_DBG_SOCK,
+		"csk 0x%p,%u,0x%lx, bit %d.\n",
+		csk, csk->state, csk->flags, flag);
+}
+
+static inline int cxgbi_sock_flag(struct cxgbi_sock *csk,
+				enum cxgbi_sock_flags flag)
+{
+	if (csk == NULL)
+		return 0;
+	return test_bit(flag, &csk->flags);
+}
+
+static inline void cxgbi_sock_set_state(struct cxgbi_sock *csk, int state)
+{
+	log_debug(1 << CXGBI_DBG_SOCK,
+		"csk 0x%p,%u,0x%lx, state -> %u.\n",
+		csk, csk->state, csk->flags, state);
+	csk->state = state;
+}
+
+static inline void cxgbi_sock_free(struct kref *kref)
+{
+	struct cxgbi_sock *csk = container_of(kref,
+						struct cxgbi_sock,
+						refcnt);
+	if (csk) {
+		log_debug(1 << CXGBI_DBG_SOCK,
+			"free csk 0x%p, state %u, flags 0x%lx\n",
+			csk, csk->state, csk->flags);
+		kfree(csk);
+	}
+}
+
+static inline void __cxgbi_sock_put(const char *fn, struct cxgbi_sock *csk)
+{
+	log_debug(1 << CXGBI_DBG_SOCK,
+		"%s, put csk 0x%p, ref %u-1.\n",
+		fn, csk, atomic_read(&csk->refcnt.refcount));
+	kref_put(&csk->refcnt, cxgbi_sock_free);
+}
+#define cxgbi_sock_put(csk)	__cxgbi_sock_put(__func__, csk)
+
+static inline void __cxgbi_sock_get(const char *fn, struct cxgbi_sock *csk)
+{
+	log_debug(1 << CXGBI_DBG_SOCK,
+		"%s, get csk 0x%p, ref %u+1.\n",
+		fn, csk, atomic_read(&csk->refcnt.refcount));
+	kref_get(&csk->refcnt);
+}
+#define cxgbi_sock_get(csk)	__cxgbi_sock_get(__func__, csk)
+
+static inline int cxgbi_sock_is_closing(struct cxgbi_sock *csk)
+{
+	return csk->state >= CTP_ACTIVE_CLOSE;
+}
+
+static inline int cxgbi_sock_is_established(struct cxgbi_sock *csk)
+{
+	return csk->state == CTP_ESTABLISHED;
+}
+
+static inline void cxgbi_sock_purge_write_queue(struct cxgbi_sock *csk)
+{
+	struct sk_buff *skb;
+
+	while ((skb = __skb_dequeue(&csk->write_queue)))
+		__kfree_skb(skb);
+}
+
+static inline unsigned int cxgbi_sock_compute_wscale(unsigned int win)
+{
+	unsigned int wscale = 0;
+
+	while (wscale < 14 && (65535 << wscale) < win)
+		wscale++;
+	return wscale;
+}
+
+static inline struct sk_buff *alloc_wr(int wrlen, int dlen, gfp_t gfp)
+{
+	struct sk_buff *skb = alloc_skb(wrlen + dlen, gfp);
+
+	if (skb) {
+		__skb_put(skb, wrlen);
+		memset(skb->head, 0, wrlen + dlen);
+	} else
+		pr_info("alloc cpl wr skb %u+%u, OOM.\n", wrlen, dlen);
+	return skb;
+}
+
+
+/*
+ * The number of WRs needed for an skb depends on the number of fragments
+ * in the skb and whether it has any payload in its main body.  This maps the
+ * length of the gather list represented by an skb into the # of necessary WRs.
+ * The extra two fragments are for iscsi bhs and payload padding.
+ */
+#define SKB_WR_LIST_SIZE	 (MAX_SKB_FRAGS + 2)
+
+static inline void cxgbi_sock_reset_wr_list(struct cxgbi_sock *csk)
+{
+	csk->wr_pending_head = csk->wr_pending_tail = NULL;
+}
+
+static inline void cxgbi_sock_enqueue_wr(struct cxgbi_sock *csk,
+					  struct sk_buff *skb)
+{
+	cxgbi_skcb_tx_wr_next(skb) = NULL;
+	/*
+	 * We want to take an extra reference since both us and the driver
+	 * need to free the packet before it's really freed. We know there's
+	 * just one user currently so we use atomic_set rather than skb_get
+	 * to avoid the atomic op.
+	 */
+	atomic_set(&skb->users, 2);
+
+	if (!csk->wr_pending_head)
+		csk->wr_pending_head = skb;
+	else
+		cxgbi_skcb_tx_wr_next(csk->wr_pending_tail) = skb;
+	csk->wr_pending_tail = skb;
+}
+
+static inline int cxgbi_sock_count_pending_wrs(const struct cxgbi_sock *csk)
+{
+	int n = 0;
+	const struct sk_buff *skb = csk->wr_pending_head;
+
+	while (skb) {
+		n += skb->csum;
+		skb = cxgbi_skcb_tx_wr_next(skb);
+	}
+	return n;
+}
+
+static inline struct sk_buff *cxgbi_sock_peek_wr(const struct cxgbi_sock *csk)
+{
+	return csk->wr_pending_head;
+}
+
+static inline struct sk_buff *cxgbi_sock_dequeue_wr(struct cxgbi_sock *csk)
+{
+	struct sk_buff *skb = csk->wr_pending_head;
+
+	if (likely(skb)) {
+		csk->wr_pending_head = cxgbi_skcb_tx_wr_next(skb);
+		cxgbi_skcb_tx_wr_next(skb) = NULL;
+	}
+	return skb;
+}
+
+void cxgbi_sock_check_wr_invariants(const struct cxgbi_sock *);
+void cxgbi_sock_purge_wr_queue(struct cxgbi_sock *);
+void cxgbi_sock_skb_entail(struct cxgbi_sock *, struct sk_buff *);
+void cxgbi_sock_fail_act_open(struct cxgbi_sock *, int);
+void cxgbi_sock_act_open_req_arp_failure(void *, struct sk_buff *);
+void cxgbi_sock_closed(struct cxgbi_sock *);
+void cxgbi_sock_established(struct cxgbi_sock *, unsigned int, unsigned int);
+void cxgbi_sock_rcv_abort_rpl(struct cxgbi_sock *);
+void cxgbi_sock_rcv_peer_close(struct cxgbi_sock *);
+void cxgbi_sock_rcv_close_conn_rpl(struct cxgbi_sock *, u32);
+void cxgbi_sock_rcv_wr_ack(struct cxgbi_sock *, unsigned int, unsigned int,
+				int);
+unsigned int cxgbi_sock_select_mss(struct cxgbi_sock *, unsigned int);
+void cxgbi_sock_free_cpl_skbs(struct cxgbi_sock *);
+
+struct cxgbi_hba {
+	struct net_device *ndev;
+	struct net_device *vdev;	/* vlan dev */
+	struct Scsi_Host *shost;
+	struct cxgbi_device *cdev;
+	__be32 ipv4addr;
+	unsigned char port_id;
+};
+
+struct cxgbi_ports_map {
+	unsigned int max_connect;
+	unsigned int used;
+	unsigned short sport_base;
+	spinlock_t lock;
+	unsigned int next;
+	struct cxgbi_sock **port_csk;
+};
+
+#define CXGBI_FLAG_DEV_T3		0x1
+#define CXGBI_FLAG_DEV_T4		0x2
+#define CXGBI_FLAG_ADAPTER_RESET	0x4
+#define CXGBI_FLAG_IPV4_SET		0x10
+struct cxgbi_device {
+	struct list_head list_head;
+	struct list_head rcu_node;
+	unsigned int flags;
+	struct net_device **ports;
+	void *lldev;
+	struct cxgbi_hba **hbas;
+	const unsigned short *mtus;
+	unsigned char nmtus;
+	unsigned char nports;
+	struct pci_dev *pdev;
+	struct dentry *debugfs_root;
+	struct iscsi_transport *itp;
+
+	unsigned int pfvf;
+	unsigned int snd_win;
+	unsigned int rcv_win;
+	unsigned int rx_credit_thres;
+	unsigned int skb_tx_rsvd;
+	unsigned int skb_rx_extra;	/* for msg coalesced mode */
+	unsigned int tx_max_size;
+	unsigned int rx_max_size;
+	struct cxgbi_ports_map pmap;
+	struct cxgbi_tag_format tag_format;
+	struct cxgbi_ddp_info *ddp;
+
+	void (*dev_ddp_cleanup)(struct cxgbi_device *);
+	int (*csk_ddp_set)(struct cxgbi_sock *, struct cxgbi_pagepod_hdr *,
+				unsigned int, unsigned int,
+				struct cxgbi_gather_list *);
+	void (*csk_ddp_clear)(struct cxgbi_hba *,
+				unsigned int, unsigned int, unsigned int);
+	int (*csk_ddp_setup_digest)(struct cxgbi_sock *,
+				unsigned int, int, int, int);
+	int (*csk_ddp_setup_pgidx)(struct cxgbi_sock *,
+				unsigned int, int, bool);
+
+	void (*csk_release_offload_resources)(struct cxgbi_sock *);
+	int (*csk_rx_pdu_ready)(struct cxgbi_sock *, struct sk_buff *);
+	u32 (*csk_send_rx_credits)(struct cxgbi_sock *, u32);
+	int (*csk_push_tx_frames)(struct cxgbi_sock *, int);
+	void (*csk_send_abort_req)(struct cxgbi_sock *);
+	void (*csk_send_close_req)(struct cxgbi_sock *);
+	int (*csk_alloc_cpls)(struct cxgbi_sock *);
+	int (*csk_init_act_open)(struct cxgbi_sock *);
+
+	void *dd_data;
+};
+#define cxgbi_cdev_priv(cdev)	((cdev)->dd_data)
+
+struct cxgbi_conn {
+	struct cxgbi_endpoint *cep;
+	struct iscsi_conn *iconn;
+	struct cxgbi_hba *chba;
+	u32 task_idx_bits;
+};
+
+struct cxgbi_endpoint {
+	struct cxgbi_conn *cconn;
+	struct cxgbi_hba *chba;
+	struct cxgbi_sock *csk;
+};
+
+#define MAX_PDU_FRAGS	((ULP2_MAX_PDU_PAYLOAD + 512 - 1) / 512)
+struct cxgbi_task_data {
+	unsigned short nr_frags;
+	struct page_frag frags[MAX_PDU_FRAGS];
+	struct sk_buff *skb;
+	unsigned int offset;
+	unsigned int count;
+	unsigned int sgoffset;
+};
+#define iscsi_task_cxgbi_data(task) \
+	((task)->dd_data + sizeof(struct iscsi_tcp_task))
+
+static inline int cxgbi_is_ddp_tag(struct cxgbi_tag_format *tformat, u32 tag)
+{
+	return !(tag & (1 << (tformat->rsvd_bits + tformat->rsvd_shift - 1)));
+}
+
+static inline int cxgbi_sw_tag_usable(struct cxgbi_tag_format *tformat,
+					u32 sw_tag)
+{
+	sw_tag >>= (32 - tformat->rsvd_bits);
+	return !sw_tag;
+}
+
+static inline u32 cxgbi_set_non_ddp_tag(struct cxgbi_tag_format *tformat,
+					u32 sw_tag)
+{
+	unsigned char shift = tformat->rsvd_bits + tformat->rsvd_shift - 1;
+	u32 mask = (1 << shift) - 1;
+
+	if (sw_tag && (sw_tag & ~mask)) {
+		u32 v1 = sw_tag & ((1 << shift) - 1);
+		u32 v2 = (sw_tag >> (shift - 1)) << shift;
+
+		return v2 | v1 | 1 << shift;
+	}
+
+	return sw_tag | 1 << shift;
+}
+
+static inline u32 cxgbi_ddp_tag_base(struct cxgbi_tag_format *tformat,
+					u32 sw_tag)
+{
+	u32 mask = (1 << tformat->rsvd_shift) - 1;
+
+	if (sw_tag && (sw_tag & ~mask)) {
+		u32 v1 = sw_tag & mask;
+		u32 v2 = sw_tag >> tformat->rsvd_shift;
+
+		v2 <<= tformat->rsvd_bits + tformat->rsvd_shift;
+
+		return v2 | v1;
+	}
+
+	return sw_tag;
+}
+
+static inline u32 cxgbi_tag_rsvd_bits(struct cxgbi_tag_format *tformat,
+					u32 tag)
+{
+	if (cxgbi_is_ddp_tag(tformat, tag))
+		return (tag >> tformat->rsvd_shift) & tformat->rsvd_mask;
+
+	return 0;
+}
+
+static inline u32 cxgbi_tag_nonrsvd_bits(struct cxgbi_tag_format *tformat,
+					u32 tag)
+{
+	unsigned char shift = tformat->rsvd_bits + tformat->rsvd_shift - 1;
+	u32 v1, v2;
+
+	if (cxgbi_is_ddp_tag(tformat, tag)) {
+		v1 = tag & ((1 << tformat->rsvd_shift) - 1);
+		v2 = (tag >> (shift + 1)) << tformat->rsvd_shift;
+	} else {
+		u32 mask = (1 << shift) - 1;
+		tag &= ~(1 << shift);
+		v1 = tag & mask;
+		v2 = (tag >> 1) & ~mask;
+	}
+	return v1 | v2;
+}
+
+static inline void *cxgbi_alloc_big_mem(unsigned int size,
+					gfp_t gfp)
+{
+	void *p = kzalloc(size, gfp | __GFP_NOWARN);
+
+	if (!p)
+		p = vzalloc(size);
+
+	return p;
+}
+
+static inline void cxgbi_free_big_mem(void *addr)
+{
+	if (is_vmalloc_addr(addr))
+		vfree(addr);
+	else
+		kfree(addr);
+}
+
+static inline void cxgbi_set_iscsi_ipv4(struct cxgbi_hba *chba, __be32 ipaddr)
+{
+	if (chba->cdev->flags & CXGBI_FLAG_IPV4_SET)
+		chba->ipv4addr = ipaddr;
+	else
+		pr_info("set iscsi ipv4 NOT supported, using %s ipv4.\n",
+			chba->ndev->name);
+}
+
+struct cxgbi_device *cxgbi_device_register(unsigned int, unsigned int);
+void cxgbi_device_unregister(struct cxgbi_device *);
+void cxgbi_device_unregister_all(unsigned int flag);
+struct cxgbi_device *cxgbi_device_find_by_lldev(void *);
+struct cxgbi_device *cxgbi_device_find_by_netdev(struct net_device *, int *);
+struct cxgbi_device *cxgbi_device_find_by_netdev_rcu(struct net_device *,
+						     int *);
+int cxgbi_hbas_add(struct cxgbi_device *, u64, unsigned int,
+			struct scsi_host_template *,
+			struct scsi_transport_template *);
+void cxgbi_hbas_remove(struct cxgbi_device *);
+
+int cxgbi_device_portmap_create(struct cxgbi_device *cdev, unsigned int base,
+			unsigned int max_conn);
+void cxgbi_device_portmap_cleanup(struct cxgbi_device *cdev);
+
+void cxgbi_conn_tx_open(struct cxgbi_sock *);
+void cxgbi_conn_pdu_ready(struct cxgbi_sock *);
+int cxgbi_conn_alloc_pdu(struct iscsi_task *, u8);
+int cxgbi_conn_init_pdu(struct iscsi_task *, unsigned int , unsigned int);
+int cxgbi_conn_xmit_pdu(struct iscsi_task *);
+
+void cxgbi_cleanup_task(struct iscsi_task *task);
+
+umode_t cxgbi_attr_is_visible(int param_type, int param);
+void cxgbi_get_conn_stats(struct iscsi_cls_conn *, struct iscsi_stats *);
+int cxgbi_set_conn_param(struct iscsi_cls_conn *,
+			enum iscsi_param, char *, int);
+int cxgbi_get_ep_param(struct iscsi_endpoint *ep, enum iscsi_param, char *);
+struct iscsi_cls_conn *cxgbi_create_conn(struct iscsi_cls_session *, u32);
+int cxgbi_bind_conn(struct iscsi_cls_session *,
+			struct iscsi_cls_conn *, u64, int);
+void cxgbi_destroy_session(struct iscsi_cls_session *);
+struct iscsi_cls_session *cxgbi_create_session(struct iscsi_endpoint *,
+			u16, u16, u32);
+int cxgbi_set_host_param(struct Scsi_Host *,
+			enum iscsi_host_param, char *, int);
+int cxgbi_get_host_param(struct Scsi_Host *, enum iscsi_host_param, char *);
+struct iscsi_endpoint *cxgbi_ep_connect(struct Scsi_Host *,
+			struct sockaddr *, int);
+int cxgbi_ep_poll(struct iscsi_endpoint *, int);
+void cxgbi_ep_disconnect(struct iscsi_endpoint *);
+
+int cxgbi_iscsi_init(struct iscsi_transport *,
+			struct scsi_transport_template **);
+void cxgbi_iscsi_cleanup(struct iscsi_transport *,
+			struct scsi_transport_template **);
+void cxgbi_parse_pdu_itt(struct iscsi_conn *, itt_t, int *, int *);
+int cxgbi_ddp_init(struct cxgbi_device *, unsigned int, unsigned int,
+			unsigned int, unsigned int);
+int cxgbi_ddp_cleanup(struct cxgbi_device *);
+void cxgbi_ddp_page_size_factor(int *);
+void cxgbi_ddp_ppod_clear(struct cxgbi_pagepod *);
+void cxgbi_ddp_ppod_set(struct cxgbi_pagepod *, struct cxgbi_pagepod_hdr *,
+			struct cxgbi_gather_list *, unsigned int);
+#endif	/*__LIBCXGBI_H__*/
diff --git a/drivers/scsi/scsi_priv.h b/drivers/scsi/scsi_priv.h
new file mode 100644
index 0000000..12b8e1b
--- /dev/null
+++ b/drivers/scsi/scsi_priv.h
@@ -0,0 +1,186 @@
+#ifndef _SCSI_PRIV_H
+#define _SCSI_PRIV_H
+
+#include <linux/device.h>
+#include <linux/async.h>
+#include <scsi/scsi_device.h>
+
+struct request_queue;
+struct request;
+struct scsi_cmnd;
+struct scsi_device;
+struct scsi_target;
+struct scsi_host_template;
+struct Scsi_Host;
+struct scsi_nl_hdr;
+
+
+/*
+ * Scsi Error Handler Flags
+ */
+#define SCSI_EH_CANCEL_CMD	0x0001	/* Cancel this cmd */
+#define SCSI_EH_ABORT_SCHEDULED	0x0002	/* Abort has been scheduled */
+
+#define SCSI_SENSE_VALID(scmd) \
+	(((scmd)->sense_buffer[0] & 0x70) == 0x70)
+
+/* hosts.c */
+extern int scsi_init_hosts(void);
+extern void scsi_exit_hosts(void);
+
+/* scsi.c */
+extern int scsi_dispatch_cmd(struct scsi_cmnd *cmd);
+extern int scsi_setup_command_freelist(struct Scsi_Host *shost);
+extern void scsi_destroy_command_freelist(struct Scsi_Host *shost);
+#ifdef CONFIG_SCSI_LOGGING
+void scsi_log_send(struct scsi_cmnd *cmd);
+void scsi_log_completion(struct scsi_cmnd *cmd, int disposition);
+#else
+static inline void scsi_log_send(struct scsi_cmnd *cmd) 
+	{ };
+static inline void scsi_log_completion(struct scsi_cmnd *cmd, int disposition)
+	{ };
+#endif
+
+/* scsi_devinfo.c */
+
+/* list of keys for the lists */
+enum {
+	SCSI_DEVINFO_GLOBAL = 0,
+	SCSI_DEVINFO_SPI,
+};
+
+extern int scsi_get_device_flags(struct scsi_device *sdev,
+				 const unsigned char *vendor,
+				 const unsigned char *model);
+extern int scsi_get_device_flags_keyed(struct scsi_device *sdev,
+				       const unsigned char *vendor,
+				       const unsigned char *model, int key);
+extern int scsi_dev_info_list_add_keyed(int compatible, char *vendor,
+					char *model, char *strflags,
+					int flags, int key);
+extern int scsi_dev_info_list_del_keyed(char *vendor, char *model, int key);
+extern int scsi_dev_info_add_list(int key, const char *name);
+extern int scsi_dev_info_remove_list(int key);
+
+extern int __init scsi_init_devinfo(void);
+extern void scsi_exit_devinfo(void);
+
+/* scsi_error.c */
+extern void scmd_eh_abort_handler(struct work_struct *work);
+extern enum blk_eh_timer_return scsi_times_out(struct request *req);
+extern int scsi_error_handler(void *host);
+extern int scsi_decide_disposition(struct scsi_cmnd *cmd);
+extern void scsi_eh_wakeup(struct Scsi_Host *shost);
+extern int scsi_eh_scmd_add(struct scsi_cmnd *, int);
+void scsi_eh_ready_devs(struct Scsi_Host *shost,
+			struct list_head *work_q,
+			struct list_head *done_q);
+int scsi_eh_get_sense(struct list_head *work_q,
+		      struct list_head *done_q);
+int scsi_noretry_cmd(struct scsi_cmnd *scmd);
+
+/* scsi_lib.c */
+extern int scsi_maybe_unblock_host(struct scsi_device *sdev);
+extern void scsi_device_unbusy(struct scsi_device *sdev);
+extern void scsi_queue_insert(struct scsi_cmnd *cmd, int reason);
+extern void scsi_next_command(struct scsi_cmnd *cmd);
+extern void scsi_io_completion(struct scsi_cmnd *, unsigned int);
+extern void scsi_run_host_queues(struct Scsi_Host *shost);
+extern struct request_queue *scsi_alloc_queue(struct scsi_device *sdev);
+extern struct request_queue *scsi_mq_alloc_queue(struct scsi_device *sdev);
+extern int scsi_mq_setup_tags(struct Scsi_Host *shost);
+extern void scsi_mq_destroy_tags(struct Scsi_Host *shost);
+extern int scsi_init_queue(void);
+extern void scsi_exit_queue(void);
+struct request_queue;
+struct request;
+extern struct kmem_cache *scsi_sdb_cache;
+
+/* scsi_proc.c */
+#ifdef CONFIG_SCSI_PROC_FS
+extern void scsi_proc_hostdir_add(struct scsi_host_template *);
+extern void scsi_proc_hostdir_rm(struct scsi_host_template *);
+extern void scsi_proc_host_add(struct Scsi_Host *);
+extern void scsi_proc_host_rm(struct Scsi_Host *);
+extern int scsi_init_procfs(void);
+extern void scsi_exit_procfs(void);
+#else
+# define scsi_proc_hostdir_add(sht)	do { } while (0)
+# define scsi_proc_hostdir_rm(sht)	do { } while (0)
+# define scsi_proc_host_add(shost)	do { } while (0)
+# define scsi_proc_host_rm(shost)	do { } while (0)
+# define scsi_init_procfs()		(0)
+# define scsi_exit_procfs()		do { } while (0)
+#endif /* CONFIG_PROC_FS */
+
+/* scsi_scan.c */
+extern char scsi_scan_type[];
+extern int scsi_complete_async_scans(void);
+extern int scsi_scan_host_selected(struct Scsi_Host *, unsigned int,
+				   unsigned int, u64, int);
+extern void scsi_forget_host(struct Scsi_Host *);
+extern void scsi_rescan_device(struct device *);
+
+/* scsi_sysctl.c */
+#ifdef CONFIG_SYSCTL
+extern int scsi_init_sysctl(void);
+extern void scsi_exit_sysctl(void);
+#else
+# define scsi_init_sysctl()		(0)
+# define scsi_exit_sysctl()		do { } while (0)
+#endif /* CONFIG_SYSCTL */
+
+/* scsi_sysfs.c */
+extern int scsi_sysfs_add_sdev(struct scsi_device *);
+extern int scsi_sysfs_add_host(struct Scsi_Host *);
+extern int scsi_sysfs_register(void);
+extern void scsi_sysfs_unregister(void);
+extern void scsi_sysfs_device_initialize(struct scsi_device *);
+extern int scsi_sysfs_target_initialize(struct scsi_device *);
+extern struct scsi_transport_template blank_transport_template;
+extern void __scsi_remove_device(struct scsi_device *);
+
+extern struct bus_type scsi_bus_type;
+extern const struct attribute_group *scsi_sysfs_shost_attr_groups[];
+
+/* scsi_netlink.c */
+#ifdef CONFIG_SCSI_NETLINK
+extern void scsi_netlink_init(void);
+extern void scsi_netlink_exit(void);
+extern struct sock *scsi_nl_sock;
+#else
+static inline void scsi_netlink_init(void) {}
+static inline void scsi_netlink_exit(void) {}
+#endif
+
+/* scsi_pm.c */
+#ifdef CONFIG_PM
+extern const struct dev_pm_ops scsi_bus_pm_ops;
+#endif
+#ifdef CONFIG_PM_RUNTIME
+extern void scsi_autopm_get_target(struct scsi_target *);
+extern void scsi_autopm_put_target(struct scsi_target *);
+extern int scsi_autopm_get_host(struct Scsi_Host *);
+extern void scsi_autopm_put_host(struct Scsi_Host *);
+#else
+static inline void scsi_autopm_get_target(struct scsi_target *t) {}
+static inline void scsi_autopm_put_target(struct scsi_target *t) {}
+static inline int scsi_autopm_get_host(struct Scsi_Host *h) { return 0; }
+static inline void scsi_autopm_put_host(struct Scsi_Host *h) {}
+#endif /* CONFIG_PM_RUNTIME */
+
+extern struct async_domain scsi_sd_pm_domain;
+extern struct async_domain scsi_sd_probe_domain;
+
+/* 
+ * internal scsi timeout functions: for use by mid-layer and transport
+ * classes.
+ */
+
+#define SCSI_DEVICE_BLOCK_MAX_TIMEOUT	600	/* units in seconds */
+extern int scsi_internal_device_block(struct scsi_device *sdev);
+extern int scsi_internal_device_unblock(struct scsi_device *sdev,
+					enum scsi_device_state new_state);
+
+#endif /* _SCSI_PRIV_H */
diff --git a/drivers/scsi/scsi_transport_srp.c b/drivers/scsi/scsi_transport_srp.c
new file mode 100644
index 0000000..ae45bd9
--- /dev/null
+++ b/drivers/scsi/scsi_transport_srp.c
@@ -0,0 +1,927 @@
+/*
+ * SCSI RDMA (SRP) transport class
+ *
+ * Copyright (C) 2007 FUJITA Tomonori <tomof@acm.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ */
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/jiffies.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/delay.h>
+
+#include <scsi/scsi.h>
+#include <scsi/scsi_cmnd.h>
+#include <scsi/scsi_device.h>
+#include <scsi/scsi_host.h>
+#include <scsi/scsi_transport.h>
+#include <scsi/scsi_transport_srp.h>
+#include "scsi_priv.h"
+
+struct srp_host_attrs {
+	atomic_t next_port_id;
+};
+#define to_srp_host_attrs(host)	((struct srp_host_attrs *)(host)->shost_data)
+
+#define SRP_HOST_ATTRS 0
+#define SRP_RPORT_ATTRS 8
+
+struct srp_internal {
+	struct scsi_transport_template t;
+	struct srp_function_template *f;
+
+	struct device_attribute *host_attrs[SRP_HOST_ATTRS + 1];
+
+	struct device_attribute *rport_attrs[SRP_RPORT_ATTRS + 1];
+	struct transport_container rport_attr_cont;
+};
+
+#define to_srp_internal(tmpl) container_of(tmpl, struct srp_internal, t)
+
+#define	dev_to_rport(d)	container_of(d, struct srp_rport, dev)
+#define transport_class_to_srp_rport(dev) dev_to_rport((dev)->parent)
+static inline struct Scsi_Host *rport_to_shost(struct srp_rport *r)
+{
+	return dev_to_shost(r->dev.parent);
+}
+
+/**
+ * srp_tmo_valid() - check timeout combination validity
+ * @reconnect_delay: Reconnect delay in seconds.
+ * @fast_io_fail_tmo: Fast I/O fail timeout in seconds.
+ * @dev_loss_tmo: Device loss timeout in seconds.
+ *
+ * The combination of the timeout parameters must be such that SCSI commands
+ * are finished in a reasonable time. Hence do not allow the fast I/O fail
+ * timeout to exceed SCSI_DEVICE_BLOCK_MAX_TIMEOUT nor allow dev_loss_tmo to
+ * exceed that limit if failing I/O fast has been disabled. Furthermore, these
+ * parameters must be such that multipath can detect failed paths timely.
+ * Hence do not allow all three parameters to be disabled simultaneously.
+ */
+int srp_tmo_valid(int reconnect_delay, int fast_io_fail_tmo, int dev_loss_tmo)
+{
+	if (reconnect_delay < 0 && fast_io_fail_tmo < 0 && dev_loss_tmo < 0)
+		return -EINVAL;
+	if (reconnect_delay == 0)
+		return -EINVAL;
+	if (fast_io_fail_tmo > SCSI_DEVICE_BLOCK_MAX_TIMEOUT)
+		return -EINVAL;
+	if (fast_io_fail_tmo < 0 &&
+	    dev_loss_tmo > SCSI_DEVICE_BLOCK_MAX_TIMEOUT)
+		return -EINVAL;
+	if (dev_loss_tmo >= LONG_MAX / HZ)
+		return -EINVAL;
+	if (fast_io_fail_tmo >= 0 && dev_loss_tmo >= 0 &&
+	    fast_io_fail_tmo >= dev_loss_tmo)
+		return -EINVAL;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(srp_tmo_valid);
+
+static int srp_host_setup(struct transport_container *tc, struct device *dev,
+			  struct device *cdev)
+{
+	struct Scsi_Host *shost = dev_to_shost(dev);
+	struct srp_host_attrs *srp_host = to_srp_host_attrs(shost);
+
+	atomic_set(&srp_host->next_port_id, 0);
+	return 0;
+}
+
+static DECLARE_TRANSPORT_CLASS(srp_host_class, "srp_host", srp_host_setup,
+			       NULL, NULL);
+
+static DECLARE_TRANSPORT_CLASS(srp_rport_class, "srp_remote_ports",
+			       NULL, NULL, NULL);
+
+#define SRP_PID(p) \
+	(p)->port_id[0], (p)->port_id[1], (p)->port_id[2], (p)->port_id[3], \
+	(p)->port_id[4], (p)->port_id[5], (p)->port_id[6], (p)->port_id[7], \
+	(p)->port_id[8], (p)->port_id[9], (p)->port_id[10], (p)->port_id[11], \
+	(p)->port_id[12], (p)->port_id[13], (p)->port_id[14], (p)->port_id[15]
+
+#define SRP_PID_FMT "%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:" \
+	"%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x"
+
+static ssize_t
+show_srp_rport_id(struct device *dev, struct device_attribute *attr,
+		  char *buf)
+{
+	struct srp_rport *rport = transport_class_to_srp_rport(dev);
+	return sprintf(buf, SRP_PID_FMT "\n", SRP_PID(rport));
+}
+
+static DEVICE_ATTR(port_id, S_IRUGO, show_srp_rport_id, NULL);
+
+static const struct {
+	u32 value;
+	char *name;
+} srp_rport_role_names[] = {
+	{SRP_RPORT_ROLE_INITIATOR, "SRP Initiator"},
+	{SRP_RPORT_ROLE_TARGET, "SRP Target"},
+};
+
+static ssize_t
+show_srp_rport_roles(struct device *dev, struct device_attribute *attr,
+		     char *buf)
+{
+	struct srp_rport *rport = transport_class_to_srp_rport(dev);
+	int i;
+	char *name = NULL;
+
+	for (i = 0; i < ARRAY_SIZE(srp_rport_role_names); i++)
+		if (srp_rport_role_names[i].value == rport->roles) {
+			name = srp_rport_role_names[i].name;
+			break;
+		}
+	return sprintf(buf, "%s\n", name ? : "unknown");
+}
+
+static DEVICE_ATTR(roles, S_IRUGO, show_srp_rport_roles, NULL);
+
+static ssize_t store_srp_rport_delete(struct device *dev,
+				      struct device_attribute *attr,
+				      const char *buf, size_t count)
+{
+	struct srp_rport *rport = transport_class_to_srp_rport(dev);
+	struct Scsi_Host *shost = dev_to_shost(dev);
+	struct srp_internal *i = to_srp_internal(shost->transportt);
+
+	if (i->f->rport_delete) {
+		i->f->rport_delete(rport);
+		return count;
+	} else {
+		return -ENOSYS;
+	}
+}
+
+static DEVICE_ATTR(delete, S_IWUSR, NULL, store_srp_rport_delete);
+
+static ssize_t show_srp_rport_state(struct device *dev,
+				    struct device_attribute *attr,
+				    char *buf)
+{
+	static const char *const state_name[] = {
+		[SRP_RPORT_RUNNING]	= "running",
+		[SRP_RPORT_BLOCKED]	= "blocked",
+		[SRP_RPORT_FAIL_FAST]	= "fail-fast",
+		[SRP_RPORT_LOST]	= "lost",
+	};
+	struct srp_rport *rport = transport_class_to_srp_rport(dev);
+	enum srp_rport_state state = rport->state;
+
+	return sprintf(buf, "%s\n",
+		       (unsigned)state < ARRAY_SIZE(state_name) ?
+		       state_name[state] : "???");
+}
+
+static DEVICE_ATTR(state, S_IRUGO, show_srp_rport_state, NULL);
+
+static ssize_t srp_show_tmo(char *buf, int tmo)
+{
+	return tmo >= 0 ? sprintf(buf, "%d\n", tmo) : sprintf(buf, "off\n");
+}
+
+static int srp_parse_tmo(int *tmo, const char *buf)
+{
+	int res = 0;
+
+	if (strncmp(buf, "off", 3) != 0)
+		res = kstrtoint(buf, 0, tmo);
+	else
+		*tmo = -1;
+
+	return res;
+}
+
+static ssize_t show_reconnect_delay(struct device *dev,
+				    struct device_attribute *attr, char *buf)
+{
+	struct srp_rport *rport = transport_class_to_srp_rport(dev);
+
+	return srp_show_tmo(buf, rport->reconnect_delay);
+}
+
+static ssize_t store_reconnect_delay(struct device *dev,
+				     struct device_attribute *attr,
+				     const char *buf, const size_t count)
+{
+	struct srp_rport *rport = transport_class_to_srp_rport(dev);
+	int res, delay;
+
+	res = srp_parse_tmo(&delay, buf);
+	if (res)
+		goto out;
+	res = srp_tmo_valid(delay, rport->fast_io_fail_tmo,
+			    rport->dev_loss_tmo);
+	if (res)
+		goto out;
+
+	if (rport->reconnect_delay <= 0 && delay > 0 &&
+	    rport->state != SRP_RPORT_RUNNING) {
+		queue_delayed_work(system_long_wq, &rport->reconnect_work,
+				   delay * HZ);
+	} else if (delay <= 0) {
+		cancel_delayed_work(&rport->reconnect_work);
+	}
+	rport->reconnect_delay = delay;
+	res = count;
+
+out:
+	return res;
+}
+
+static DEVICE_ATTR(reconnect_delay, S_IRUGO | S_IWUSR, show_reconnect_delay,
+		   store_reconnect_delay);
+
+static ssize_t show_failed_reconnects(struct device *dev,
+				      struct device_attribute *attr, char *buf)
+{
+	struct srp_rport *rport = transport_class_to_srp_rport(dev);
+
+	return sprintf(buf, "%d\n", rport->failed_reconnects);
+}
+
+static DEVICE_ATTR(failed_reconnects, S_IRUGO, show_failed_reconnects, NULL);
+
+static ssize_t show_srp_rport_fast_io_fail_tmo(struct device *dev,
+					       struct device_attribute *attr,
+					       char *buf)
+{
+	struct srp_rport *rport = transport_class_to_srp_rport(dev);
+
+	return srp_show_tmo(buf, rport->fast_io_fail_tmo);
+}
+
+static ssize_t store_srp_rport_fast_io_fail_tmo(struct device *dev,
+						struct device_attribute *attr,
+						const char *buf, size_t count)
+{
+	struct srp_rport *rport = transport_class_to_srp_rport(dev);
+	int res;
+	int fast_io_fail_tmo;
+
+	res = srp_parse_tmo(&fast_io_fail_tmo, buf);
+	if (res)
+		goto out;
+	res = srp_tmo_valid(rport->reconnect_delay, fast_io_fail_tmo,
+			    rport->dev_loss_tmo);
+	if (res)
+		goto out;
+	rport->fast_io_fail_tmo = fast_io_fail_tmo;
+	res = count;
+
+out:
+	return res;
+}
+
+static DEVICE_ATTR(fast_io_fail_tmo, S_IRUGO | S_IWUSR,
+		   show_srp_rport_fast_io_fail_tmo,
+		   store_srp_rport_fast_io_fail_tmo);
+
+static ssize_t show_srp_rport_dev_loss_tmo(struct device *dev,
+					   struct device_attribute *attr,
+					   char *buf)
+{
+	struct srp_rport *rport = transport_class_to_srp_rport(dev);
+
+	return srp_show_tmo(buf, rport->dev_loss_tmo);
+}
+
+static ssize_t store_srp_rport_dev_loss_tmo(struct device *dev,
+					    struct device_attribute *attr,
+					    const char *buf, size_t count)
+{
+	struct srp_rport *rport = transport_class_to_srp_rport(dev);
+	int res;
+	int dev_loss_tmo;
+
+	res = srp_parse_tmo(&dev_loss_tmo, buf);
+	if (res)
+		goto out;
+	res = srp_tmo_valid(rport->reconnect_delay, rport->fast_io_fail_tmo,
+			    dev_loss_tmo);
+	if (res)
+		goto out;
+	rport->dev_loss_tmo = dev_loss_tmo;
+	res = count;
+
+out:
+	return res;
+}
+
+static DEVICE_ATTR(dev_loss_tmo, S_IRUGO | S_IWUSR,
+		   show_srp_rport_dev_loss_tmo,
+		   store_srp_rport_dev_loss_tmo);
+
+static int srp_rport_set_state(struct srp_rport *rport,
+			       enum srp_rport_state new_state)
+{
+	enum srp_rport_state old_state = rport->state;
+
+	lockdep_assert_held(&rport->mutex);
+
+	switch (new_state) {
+	case SRP_RPORT_RUNNING:
+		switch (old_state) {
+		case SRP_RPORT_LOST:
+			goto invalid;
+		default:
+			break;
+		}
+		break;
+	case SRP_RPORT_BLOCKED:
+		switch (old_state) {
+		case SRP_RPORT_RUNNING:
+			break;
+		default:
+			goto invalid;
+		}
+		break;
+	case SRP_RPORT_FAIL_FAST:
+		switch (old_state) {
+		case SRP_RPORT_LOST:
+			goto invalid;
+		default:
+			break;
+		}
+		break;
+	case SRP_RPORT_LOST:
+		break;
+	}
+	rport->state = new_state;
+	return 0;
+
+invalid:
+	return -EINVAL;
+}
+
+/**
+ * srp_reconnect_work() - reconnect and schedule a new attempt if necessary
+ * @work: Work structure used for scheduling this operation.
+ */
+static void srp_reconnect_work(struct work_struct *work)
+{
+	struct srp_rport *rport = container_of(to_delayed_work(work),
+					struct srp_rport, reconnect_work);
+	struct Scsi_Host *shost = rport_to_shost(rport);
+	int delay, res;
+
+	res = srp_reconnect_rport(rport);
+	if (res != 0) {
+		shost_printk(KERN_ERR, shost,
+			     "reconnect attempt %d failed (%d)\n",
+			     ++rport->failed_reconnects, res);
+		delay = rport->reconnect_delay *
+			min(100, max(1, rport->failed_reconnects - 10));
+		if (delay > 0)
+			queue_delayed_work(system_long_wq,
+					   &rport->reconnect_work, delay * HZ);
+	}
+}
+
+static void __rport_fail_io_fast(struct srp_rport *rport)
+{
+	struct Scsi_Host *shost = rport_to_shost(rport);
+	struct srp_internal *i;
+
+	lockdep_assert_held(&rport->mutex);
+
+	if (srp_rport_set_state(rport, SRP_RPORT_FAIL_FAST))
+		return;
+	scsi_target_unblock(rport->dev.parent, SDEV_TRANSPORT_OFFLINE);
+
+	/* Involve the LLD if possible to terminate all I/O on the rport. */
+	i = to_srp_internal(shost->transportt);
+	if (i->f->terminate_rport_io)
+		i->f->terminate_rport_io(rport);
+}
+
+/**
+ * rport_fast_io_fail_timedout() - fast I/O failure timeout handler
+ * @work: Work structure used for scheduling this operation.
+ */
+static void rport_fast_io_fail_timedout(struct work_struct *work)
+{
+	struct srp_rport *rport = container_of(to_delayed_work(work),
+					struct srp_rport, fast_io_fail_work);
+	struct Scsi_Host *shost = rport_to_shost(rport);
+
+	pr_info("fast_io_fail_tmo expired for SRP %s / %s.\n",
+		dev_name(&rport->dev), dev_name(&shost->shost_gendev));
+
+	mutex_lock(&rport->mutex);
+	if (rport->state == SRP_RPORT_BLOCKED)
+		__rport_fail_io_fast(rport);
+	mutex_unlock(&rport->mutex);
+}
+
+/**
+ * rport_dev_loss_timedout() - device loss timeout handler
+ * @work: Work structure used for scheduling this operation.
+ */
+static void rport_dev_loss_timedout(struct work_struct *work)
+{
+	struct srp_rport *rport = container_of(to_delayed_work(work),
+					struct srp_rport, dev_loss_work);
+	struct Scsi_Host *shost = rport_to_shost(rport);
+	struct srp_internal *i = to_srp_internal(shost->transportt);
+
+	pr_info("dev_loss_tmo expired for SRP %s / %s.\n",
+		dev_name(&rport->dev), dev_name(&shost->shost_gendev));
+
+	mutex_lock(&rport->mutex);
+	WARN_ON(srp_rport_set_state(rport, SRP_RPORT_LOST) != 0);
+	scsi_target_unblock(rport->dev.parent, SDEV_TRANSPORT_OFFLINE);
+	mutex_unlock(&rport->mutex);
+
+	i->f->rport_delete(rport);
+}
+
+static void __srp_start_tl_fail_timers(struct srp_rport *rport)
+{
+	struct Scsi_Host *shost = rport_to_shost(rport);
+	int delay, fast_io_fail_tmo, dev_loss_tmo;
+
+	lockdep_assert_held(&rport->mutex);
+
+	delay = rport->reconnect_delay;
+	fast_io_fail_tmo = rport->fast_io_fail_tmo;
+	dev_loss_tmo = rport->dev_loss_tmo;
+	pr_debug("%s current state: %d\n", dev_name(&shost->shost_gendev),
+		 rport->state);
+
+	if (rport->state == SRP_RPORT_LOST)
+		return;
+	if (delay > 0)
+		queue_delayed_work(system_long_wq, &rport->reconnect_work,
+				   1UL * delay * HZ);
+	if ((fast_io_fail_tmo >= 0 || dev_loss_tmo >= 0) &&
+	    srp_rport_set_state(rport, SRP_RPORT_BLOCKED) == 0) {
+		pr_debug("%s new state: %d\n", dev_name(&shost->shost_gendev),
+			 rport->state);
+		scsi_target_block(&shost->shost_gendev);
+		if (fast_io_fail_tmo >= 0)
+			queue_delayed_work(system_long_wq,
+					   &rport->fast_io_fail_work,
+					   1UL * fast_io_fail_tmo * HZ);
+		if (dev_loss_tmo >= 0)
+			queue_delayed_work(system_long_wq,
+					   &rport->dev_loss_work,
+					   1UL * dev_loss_tmo * HZ);
+	}
+}
+
+/**
+ * srp_start_tl_fail_timers() - start the transport layer failure timers
+ * @rport: SRP target port.
+ *
+ * Start the transport layer fast I/O failure and device loss timers. Do not
+ * modify a timer that was already started.
+ */
+void srp_start_tl_fail_timers(struct srp_rport *rport)
+{
+	mutex_lock(&rport->mutex);
+	__srp_start_tl_fail_timers(rport);
+	mutex_unlock(&rport->mutex);
+}
+EXPORT_SYMBOL(srp_start_tl_fail_timers);
+
+/**
+ * scsi_request_fn_active() - number of kernel threads inside scsi_request_fn()
+ * @shost: SCSI host for which to count the number of scsi_request_fn() callers.
+ */
+static int scsi_request_fn_active(struct Scsi_Host *shost)
+{
+	struct scsi_device *sdev;
+	struct request_queue *q;
+	int request_fn_active = 0;
+
+	shost_for_each_device(sdev, shost) {
+		q = sdev->request_queue;
+
+		spin_lock_irq(q->queue_lock);
+		request_fn_active += q->request_fn_active;
+		spin_unlock_irq(q->queue_lock);
+	}
+
+	return request_fn_active;
+}
+
+/**
+ * srp_reconnect_rport() - reconnect to an SRP target port
+ * @rport: SRP target port.
+ *
+ * Blocks SCSI command queueing before invoking reconnect() such that
+ * queuecommand() won't be invoked concurrently with reconnect() from outside
+ * the SCSI EH. This is important since a reconnect() implementation may
+ * reallocate resources needed by queuecommand().
+ *
+ * Notes:
+ * - This function neither waits until outstanding requests have finished nor
+ *   tries to abort these. It is the responsibility of the reconnect()
+ *   function to finish outstanding commands before reconnecting to the target
+ *   port.
+ * - It is the responsibility of the caller to ensure that the resources
+ *   reallocated by the reconnect() function won't be used while this function
+ *   is in progress. One possible strategy is to invoke this function from
+ *   the context of the SCSI EH thread only. Another possible strategy is to
+ *   lock the rport mutex inside each SCSI LLD callback that can be invoked by
+ *   the SCSI EH (the scsi_host_template.eh_*() functions and also the
+ *   scsi_host_template.queuecommand() function).
+ */
+int srp_reconnect_rport(struct srp_rport *rport)
+{
+	struct Scsi_Host *shost = rport_to_shost(rport);
+	struct srp_internal *i = to_srp_internal(shost->transportt);
+	struct scsi_device *sdev;
+	int res;
+
+	pr_debug("SCSI host %s\n", dev_name(&shost->shost_gendev));
+
+	res = mutex_lock_interruptible(&rport->mutex);
+	if (res)
+		goto out;
+	scsi_target_block(&shost->shost_gendev);
+	while (scsi_request_fn_active(shost))
+		msleep(20);
+	res = rport->state != SRP_RPORT_LOST ? i->f->reconnect(rport) : -ENODEV;
+	pr_debug("%s (state %d): transport.reconnect() returned %d\n",
+		 dev_name(&shost->shost_gendev), rport->state, res);
+	if (res == 0) {
+		cancel_delayed_work(&rport->fast_io_fail_work);
+		cancel_delayed_work(&rport->dev_loss_work);
+
+		rport->failed_reconnects = 0;
+		srp_rport_set_state(rport, SRP_RPORT_RUNNING);
+		scsi_target_unblock(&shost->shost_gendev, SDEV_RUNNING);
+		/*
+		 * If the SCSI error handler has offlined one or more devices,
+		 * invoking scsi_target_unblock() won't change the state of
+		 * these devices into running so do that explicitly.
+		 */
+		spin_lock_irq(shost->host_lock);
+		__shost_for_each_device(sdev, shost)
+			if (sdev->sdev_state == SDEV_OFFLINE)
+				sdev->sdev_state = SDEV_RUNNING;
+		spin_unlock_irq(shost->host_lock);
+	} else if (rport->state == SRP_RPORT_RUNNING) {
+		/*
+		 * srp_reconnect_rport() has been invoked with fast_io_fail
+		 * and dev_loss off. Mark the port as failed and start the TL
+		 * failure timers if these had not yet been started.
+		 */
+		__rport_fail_io_fast(rport);
+		scsi_target_unblock(&shost->shost_gendev,
+				    SDEV_TRANSPORT_OFFLINE);
+		__srp_start_tl_fail_timers(rport);
+	} else if (rport->state != SRP_RPORT_BLOCKED) {
+		scsi_target_unblock(&shost->shost_gendev,
+				    SDEV_TRANSPORT_OFFLINE);
+	}
+	mutex_unlock(&rport->mutex);
+
+out:
+	return res;
+}
+EXPORT_SYMBOL(srp_reconnect_rport);
+
+/**
+ * srp_timed_out() - SRP transport intercept of the SCSI timeout EH
+ * @scmd: SCSI command.
+ *
+ * If a timeout occurs while an rport is in the blocked state, ask the SCSI
+ * EH to continue waiting (BLK_EH_RESET_TIMER). Otherwise let the SCSI core
+ * handle the timeout (BLK_EH_NOT_HANDLED).
+ *
+ * Note: This function is called from soft-IRQ context and with the request
+ * queue lock held.
+ */
+static enum blk_eh_timer_return srp_timed_out(struct scsi_cmnd *scmd)
+{
+	struct scsi_device *sdev = scmd->device;
+	struct Scsi_Host *shost = sdev->host;
+	struct srp_internal *i = to_srp_internal(shost->transportt);
+
+	pr_debug("timeout for sdev %s\n", dev_name(&sdev->sdev_gendev));
+	return i->f->reset_timer_if_blocked && scsi_device_blocked(sdev) ?
+		BLK_EH_RESET_TIMER : BLK_EH_NOT_HANDLED;
+}
+
+static void srp_rport_release(struct device *dev)
+{
+	struct srp_rport *rport = dev_to_rport(dev);
+
+	put_device(dev->parent);
+	kfree(rport);
+}
+
+static int scsi_is_srp_rport(const struct device *dev)
+{
+	return dev->release == srp_rport_release;
+}
+
+static int srp_rport_match(struct attribute_container *cont,
+			   struct device *dev)
+{
+	struct Scsi_Host *shost;
+	struct srp_internal *i;
+
+	if (!scsi_is_srp_rport(dev))
+		return 0;
+
+	shost = dev_to_shost(dev->parent);
+	if (!shost->transportt)
+		return 0;
+	if (shost->transportt->host_attrs.ac.class != &srp_host_class.class)
+		return 0;
+
+	i = to_srp_internal(shost->transportt);
+	return &i->rport_attr_cont.ac == cont;
+}
+
+static int srp_host_match(struct attribute_container *cont, struct device *dev)
+{
+	struct Scsi_Host *shost;
+	struct srp_internal *i;
+
+	if (!scsi_is_host_device(dev))
+		return 0;
+
+	shost = dev_to_shost(dev);
+	if (!shost->transportt)
+		return 0;
+	if (shost->transportt->host_attrs.ac.class != &srp_host_class.class)
+		return 0;
+
+	i = to_srp_internal(shost->transportt);
+	return &i->t.host_attrs.ac == cont;
+}
+
+/**
+ * srp_rport_get() - increment rport reference count
+ * @rport: SRP target port.
+ */
+void srp_rport_get(struct srp_rport *rport)
+{
+	get_device(&rport->dev);
+}
+EXPORT_SYMBOL(srp_rport_get);
+
+/**
+ * srp_rport_put() - decrement rport reference count
+ * @rport: SRP target port.
+ */
+void srp_rport_put(struct srp_rport *rport)
+{
+	put_device(&rport->dev);
+}
+EXPORT_SYMBOL(srp_rport_put);
+
+/**
+ * srp_rport_add - add a SRP remote port to the device hierarchy
+ * @shost:	scsi host the remote port is connected to.
+ * @ids:	The port id for the remote port.
+ *
+ * Publishes a port to the rest of the system.
+ */
+struct srp_rport *srp_rport_add(struct Scsi_Host *shost,
+				struct srp_rport_identifiers *ids)
+{
+	struct srp_rport *rport;
+	struct device *parent = &shost->shost_gendev;
+	struct srp_internal *i = to_srp_internal(shost->transportt);
+	int id, ret;
+
+	rport = kzalloc(sizeof(*rport), GFP_KERNEL);
+	if (!rport)
+		return ERR_PTR(-ENOMEM);
+
+	mutex_init(&rport->mutex);
+
+	device_initialize(&rport->dev);
+
+	rport->dev.parent = get_device(parent);
+	rport->dev.release = srp_rport_release;
+
+	memcpy(rport->port_id, ids->port_id, sizeof(rport->port_id));
+	rport->roles = ids->roles;
+
+	if (i->f->reconnect)
+		rport->reconnect_delay = i->f->reconnect_delay ?
+			*i->f->reconnect_delay : 10;
+	INIT_DELAYED_WORK(&rport->reconnect_work, srp_reconnect_work);
+	rport->fast_io_fail_tmo = i->f->fast_io_fail_tmo ?
+		*i->f->fast_io_fail_tmo : 15;
+	rport->dev_loss_tmo = i->f->dev_loss_tmo ? *i->f->dev_loss_tmo : 60;
+	INIT_DELAYED_WORK(&rport->fast_io_fail_work,
+			  rport_fast_io_fail_timedout);
+	INIT_DELAYED_WORK(&rport->dev_loss_work, rport_dev_loss_timedout);
+
+	id = atomic_inc_return(&to_srp_host_attrs(shost)->next_port_id);
+	dev_set_name(&rport->dev, "port-%d:%d", shost->host_no, id);
+
+	transport_setup_device(&rport->dev);
+
+	ret = device_add(&rport->dev);
+	if (ret) {
+		transport_destroy_device(&rport->dev);
+		put_device(&rport->dev);
+		return ERR_PTR(ret);
+	}
+
+	transport_add_device(&rport->dev);
+	transport_configure_device(&rport->dev);
+
+	return rport;
+}
+EXPORT_SYMBOL_GPL(srp_rport_add);
+
+/**
+ * srp_rport_del  -  remove a SRP remote port
+ * @rport:	SRP remote port to remove
+ *
+ * Removes the specified SRP remote port.
+ */
+void srp_rport_del(struct srp_rport *rport)
+{
+	struct device *dev = &rport->dev;
+
+	transport_remove_device(dev);
+	device_del(dev);
+	transport_destroy_device(dev);
+
+	put_device(dev);
+}
+EXPORT_SYMBOL_GPL(srp_rport_del);
+
+static int do_srp_rport_del(struct device *dev, void *data)
+{
+	if (scsi_is_srp_rport(dev))
+		srp_rport_del(dev_to_rport(dev));
+	return 0;
+}
+
+/**
+ * srp_remove_host  -  tear down a Scsi_Host's SRP data structures
+ * @shost:	Scsi Host that is torn down
+ *
+ * Removes all SRP remote ports for a given Scsi_Host.
+ * Must be called just before scsi_remove_host for SRP HBAs.
+ */
+void srp_remove_host(struct Scsi_Host *shost)
+{
+	device_for_each_child(&shost->shost_gendev, NULL, do_srp_rport_del);
+}
+EXPORT_SYMBOL_GPL(srp_remove_host);
+
+/**
+ * srp_stop_rport_timers - stop the transport layer recovery timers
+ * @rport: SRP remote port for which to stop the timers.
+ *
+ * Must be called after srp_remove_host() and scsi_remove_host(). The caller
+ * must hold a reference on the rport (rport->dev) and on the SCSI host
+ * (rport->dev.parent).
+ */
+void srp_stop_rport_timers(struct srp_rport *rport)
+{
+	mutex_lock(&rport->mutex);
+	if (rport->state == SRP_RPORT_BLOCKED)
+		__rport_fail_io_fast(rport);
+	srp_rport_set_state(rport, SRP_RPORT_LOST);
+	mutex_unlock(&rport->mutex);
+
+	cancel_delayed_work_sync(&rport->reconnect_work);
+	cancel_delayed_work_sync(&rport->fast_io_fail_work);
+	cancel_delayed_work_sync(&rport->dev_loss_work);
+}
+EXPORT_SYMBOL_GPL(srp_stop_rport_timers);
+
+static int srp_tsk_mgmt_response(struct Scsi_Host *shost, u64 nexus, u64 tm_id,
+				 int result)
+{
+	struct srp_internal *i = to_srp_internal(shost->transportt);
+	return i->f->tsk_mgmt_response(shost, nexus, tm_id, result);
+}
+
+static int srp_it_nexus_response(struct Scsi_Host *shost, u64 nexus, int result)
+{
+	struct srp_internal *i = to_srp_internal(shost->transportt);
+	return i->f->it_nexus_response(shost, nexus, result);
+}
+
+/**
+ * srp_attach_transport  -  instantiate SRP transport template
+ * @ft:		SRP transport class function template
+ */
+struct scsi_transport_template *
+srp_attach_transport(struct srp_function_template *ft)
+{
+	int count;
+	struct srp_internal *i;
+
+	i = kzalloc(sizeof(*i), GFP_KERNEL);
+	if (!i)
+		return NULL;
+
+	i->t.eh_timed_out = srp_timed_out;
+
+	i->t.tsk_mgmt_response = srp_tsk_mgmt_response;
+	i->t.it_nexus_response = srp_it_nexus_response;
+
+	i->t.host_size = sizeof(struct srp_host_attrs);
+	i->t.host_attrs.ac.attrs = &i->host_attrs[0];
+	i->t.host_attrs.ac.class = &srp_host_class.class;
+	i->t.host_attrs.ac.match = srp_host_match;
+	i->host_attrs[0] = NULL;
+	transport_container_register(&i->t.host_attrs);
+
+	i->rport_attr_cont.ac.attrs = &i->rport_attrs[0];
+	i->rport_attr_cont.ac.class = &srp_rport_class.class;
+	i->rport_attr_cont.ac.match = srp_rport_match;
+
+	count = 0;
+	i->rport_attrs[count++] = &dev_attr_port_id;
+	i->rport_attrs[count++] = &dev_attr_roles;
+	if (ft->has_rport_state) {
+		i->rport_attrs[count++] = &dev_attr_state;
+		i->rport_attrs[count++] = &dev_attr_fast_io_fail_tmo;
+		i->rport_attrs[count++] = &dev_attr_dev_loss_tmo;
+	}
+	if (ft->reconnect) {
+		i->rport_attrs[count++] = &dev_attr_reconnect_delay;
+		i->rport_attrs[count++] = &dev_attr_failed_reconnects;
+	}
+	if (ft->rport_delete)
+		i->rport_attrs[count++] = &dev_attr_delete;
+	i->rport_attrs[count++] = NULL;
+	BUG_ON(count > ARRAY_SIZE(i->rport_attrs));
+
+	transport_container_register(&i->rport_attr_cont);
+
+	i->f = ft;
+
+	return &i->t;
+}
+EXPORT_SYMBOL_GPL(srp_attach_transport);
+
+/**
+ * srp_release_transport  -  release SRP transport template instance
+ * @t:		transport template instance
+ */
+void srp_release_transport(struct scsi_transport_template *t)
+{
+	struct srp_internal *i = to_srp_internal(t);
+
+	transport_container_unregister(&i->t.host_attrs);
+	transport_container_unregister(&i->rport_attr_cont);
+
+	kfree(i);
+}
+EXPORT_SYMBOL_GPL(srp_release_transport);
+
+static __init int srp_transport_init(void)
+{
+	int ret;
+
+	ret = transport_class_register(&srp_host_class);
+	if (ret)
+		return ret;
+	ret = transport_class_register(&srp_rport_class);
+	if (ret)
+		goto unregister_host_class;
+
+	return 0;
+unregister_host_class:
+	transport_class_unregister(&srp_host_class);
+	return ret;
+}
+
+static void __exit srp_transport_exit(void)
+{
+	transport_class_unregister(&srp_host_class);
+	transport_class_unregister(&srp_rport_class);
+}
+
+MODULE_AUTHOR("FUJITA Tomonori");
+MODULE_DESCRIPTION("SRP Transport Attributes");
+MODULE_LICENSE("GPL");
+
+module_init(srp_transport_init);
+module_exit(srp_transport_exit);
diff --git a/include/linux/mlx4/cmd.h b/include/linux/mlx4/cmd.h
new file mode 100644
index 0000000..379c026
--- /dev/null
+++ b/include/linux/mlx4/cmd.h
@@ -0,0 +1,261 @@
+/*
+ * Copyright (c) 2006 Cisco Systems, Inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX4_CMD_H
+#define MLX4_CMD_H
+
+#include <linux/dma-mapping.h>
+#include <linux/if_link.h>
+
+enum {
+	/* initialization and general commands */
+	MLX4_CMD_SYS_EN		 = 0x1,
+	MLX4_CMD_SYS_DIS	 = 0x2,
+	MLX4_CMD_MAP_FA		 = 0xfff,
+	MLX4_CMD_UNMAP_FA	 = 0xffe,
+	MLX4_CMD_RUN_FW		 = 0xff6,
+	MLX4_CMD_MOD_STAT_CFG	 = 0x34,
+	MLX4_CMD_QUERY_DEV_CAP	 = 0x3,
+	MLX4_CMD_QUERY_FW	 = 0x4,
+	MLX4_CMD_ENABLE_LAM	 = 0xff8,
+	MLX4_CMD_DISABLE_LAM	 = 0xff7,
+	MLX4_CMD_QUERY_DDR	 = 0x5,
+	MLX4_CMD_QUERY_ADAPTER	 = 0x6,
+	MLX4_CMD_INIT_HCA	 = 0x7,
+	MLX4_CMD_CLOSE_HCA	 = 0x8,
+	MLX4_CMD_INIT_PORT	 = 0x9,
+	MLX4_CMD_CLOSE_PORT	 = 0xa,
+	MLX4_CMD_QUERY_HCA	 = 0xb,
+	MLX4_CMD_QUERY_PORT	 = 0x43,
+	MLX4_CMD_SENSE_PORT	 = 0x4d,
+	MLX4_CMD_HW_HEALTH_CHECK = 0x50,
+	MLX4_CMD_SET_PORT	 = 0xc,
+	MLX4_CMD_SET_NODE	 = 0x5a,
+	MLX4_CMD_QUERY_FUNC	 = 0x56,
+	MLX4_CMD_ACCESS_DDR	 = 0x2e,
+	MLX4_CMD_MAP_ICM	 = 0xffa,
+	MLX4_CMD_UNMAP_ICM	 = 0xff9,
+	MLX4_CMD_MAP_ICM_AUX	 = 0xffc,
+	MLX4_CMD_UNMAP_ICM_AUX	 = 0xffb,
+	MLX4_CMD_SET_ICM_SIZE	 = 0xffd,
+	/*master notify fw on finish for slave's flr*/
+	MLX4_CMD_INFORM_FLR_DONE = 0x5b,
+	MLX4_CMD_GET_OP_REQ      = 0x59,
+
+	/* TPT commands */
+	MLX4_CMD_SW2HW_MPT	 = 0xd,
+	MLX4_CMD_QUERY_MPT	 = 0xe,
+	MLX4_CMD_HW2SW_MPT	 = 0xf,
+	MLX4_CMD_READ_MTT	 = 0x10,
+	MLX4_CMD_WRITE_MTT	 = 0x11,
+	MLX4_CMD_SYNC_TPT	 = 0x2f,
+
+	/* EQ commands */
+	MLX4_CMD_MAP_EQ		 = 0x12,
+	MLX4_CMD_SW2HW_EQ	 = 0x13,
+	MLX4_CMD_HW2SW_EQ	 = 0x14,
+	MLX4_CMD_QUERY_EQ	 = 0x15,
+
+	/* CQ commands */
+	MLX4_CMD_SW2HW_CQ	 = 0x16,
+	MLX4_CMD_HW2SW_CQ	 = 0x17,
+	MLX4_CMD_QUERY_CQ	 = 0x18,
+	MLX4_CMD_MODIFY_CQ	 = 0x2c,
+
+	/* SRQ commands */
+	MLX4_CMD_SW2HW_SRQ	 = 0x35,
+	MLX4_CMD_HW2SW_SRQ	 = 0x36,
+	MLX4_CMD_QUERY_SRQ	 = 0x37,
+	MLX4_CMD_ARM_SRQ	 = 0x40,
+
+	/* QP/EE commands */
+	MLX4_CMD_RST2INIT_QP	 = 0x19,
+	MLX4_CMD_INIT2RTR_QP	 = 0x1a,
+	MLX4_CMD_RTR2RTS_QP	 = 0x1b,
+	MLX4_CMD_RTS2RTS_QP	 = 0x1c,
+	MLX4_CMD_SQERR2RTS_QP	 = 0x1d,
+	MLX4_CMD_2ERR_QP	 = 0x1e,
+	MLX4_CMD_RTS2SQD_QP	 = 0x1f,
+	MLX4_CMD_SQD2SQD_QP	 = 0x38,
+	MLX4_CMD_SQD2RTS_QP	 = 0x20,
+	MLX4_CMD_2RST_QP	 = 0x21,
+	MLX4_CMD_QUERY_QP	 = 0x22,
+	MLX4_CMD_INIT2INIT_QP	 = 0x2d,
+	MLX4_CMD_SUSPEND_QP	 = 0x32,
+	MLX4_CMD_UNSUSPEND_QP	 = 0x33,
+	MLX4_CMD_UPDATE_QP	 = 0x61,
+	/* special QP and management commands */
+	MLX4_CMD_CONF_SPECIAL_QP = 0x23,
+	MLX4_CMD_MAD_IFC	 = 0x24,
+	MLX4_CMD_MAD_DEMUX	 = 0x203,
+
+	/* multicast commands */
+	MLX4_CMD_READ_MCG	 = 0x25,
+	MLX4_CMD_WRITE_MCG	 = 0x26,
+	MLX4_CMD_MGID_HASH	 = 0x27,
+
+	/* miscellaneous commands */
+	MLX4_CMD_DIAG_RPRT	 = 0x30,
+	MLX4_CMD_NOP		 = 0x31,
+	MLX4_CMD_CONFIG_DEV	 = 0x3a,
+	MLX4_CMD_ACCESS_MEM	 = 0x2e,
+	MLX4_CMD_SET_VEP	 = 0x52,
+
+	/* Ethernet specific commands */
+	MLX4_CMD_SET_VLAN_FLTR	 = 0x47,
+	MLX4_CMD_SET_MCAST_FLTR	 = 0x48,
+	MLX4_CMD_DUMP_ETH_STATS	 = 0x49,
+
+	/* Communication channel commands */
+	MLX4_CMD_ARM_COMM_CHANNEL = 0x57,
+	MLX4_CMD_GEN_EQE	 = 0x58,
+
+	/* virtual commands */
+	MLX4_CMD_ALLOC_RES	 = 0xf00,
+	MLX4_CMD_FREE_RES	 = 0xf01,
+	MLX4_CMD_MCAST_ATTACH	 = 0xf05,
+	MLX4_CMD_UCAST_ATTACH	 = 0xf06,
+	MLX4_CMD_PROMISC         = 0xf08,
+	MLX4_CMD_QUERY_FUNC_CAP  = 0xf0a,
+	MLX4_CMD_QP_ATTACH	 = 0xf0b,
+
+	/* debug commands */
+	MLX4_CMD_QUERY_DEBUG_MSG = 0x2a,
+	MLX4_CMD_SET_DEBUG_MSG	 = 0x2b,
+
+	/* statistics commands */
+	MLX4_CMD_QUERY_IF_STAT	 = 0X54,
+	MLX4_CMD_SET_IF_STAT	 = 0X55,
+
+	/* register/delete flow steering network rules */
+	MLX4_QP_FLOW_STEERING_ATTACH = 0x65,
+	MLX4_QP_FLOW_STEERING_DETACH = 0x66,
+	MLX4_FLOW_STEERING_IB_UC_QP_RANGE = 0x64,
+};
+
+enum {
+	MLX4_CMD_TIME_CLASS_A	= 10000,
+	MLX4_CMD_TIME_CLASS_B	= 10000,
+	MLX4_CMD_TIME_CLASS_C	= 10000,
+};
+
+enum {
+	MLX4_MAILBOX_SIZE	= 4096,
+	MLX4_ACCESS_MEM_ALIGN	= 256,
+};
+
+enum {
+	/* set port opcode modifiers */
+	MLX4_SET_PORT_GENERAL   = 0x0,
+	MLX4_SET_PORT_RQP_CALC  = 0x1,
+	MLX4_SET_PORT_MAC_TABLE = 0x2,
+	MLX4_SET_PORT_VLAN_TABLE = 0x3,
+	MLX4_SET_PORT_PRIO_MAP  = 0x4,
+	MLX4_SET_PORT_GID_TABLE = 0x5,
+	MLX4_SET_PORT_PRIO2TC	= 0x8,
+	MLX4_SET_PORT_SCHEDULER = 0x9,
+	MLX4_SET_PORT_VXLAN	= 0xB
+};
+
+enum {
+	MLX4_CMD_MAD_DEMUX_CONFIG	= 0,
+	MLX4_CMD_MAD_DEMUX_QUERY_STATE	= 1,
+	MLX4_CMD_MAD_DEMUX_QUERY_RESTR	= 2, /* Query mad demux restrictions */
+};
+
+enum {
+	MLX4_CMD_WRAPPED,
+	MLX4_CMD_NATIVE
+};
+
+struct mlx4_dev;
+
+struct mlx4_cmd_mailbox {
+	void		       *buf;
+	dma_addr_t		dma;
+};
+
+int __mlx4_cmd(struct mlx4_dev *dev, u64 in_param, u64 *out_param,
+	       int out_is_imm, u32 in_modifier, u8 op_modifier,
+	       u16 op, unsigned long timeout, int native);
+
+/* Invoke a command with no output parameter */
+static inline int mlx4_cmd(struct mlx4_dev *dev, u64 in_param, u32 in_modifier,
+			   u8 op_modifier, u16 op, unsigned long timeout,
+			   int native)
+{
+	return __mlx4_cmd(dev, in_param, NULL, 0, in_modifier,
+			  op_modifier, op, timeout, native);
+}
+
+/* Invoke a command with an output mailbox */
+static inline int mlx4_cmd_box(struct mlx4_dev *dev, u64 in_param, u64 out_param,
+			       u32 in_modifier, u8 op_modifier, u16 op,
+			       unsigned long timeout, int native)
+{
+	return __mlx4_cmd(dev, in_param, &out_param, 0, in_modifier,
+			  op_modifier, op, timeout, native);
+}
+
+/*
+ * Invoke a command with an immediate output parameter (and copy the
+ * output into the caller's out_param pointer after the command
+ * executes).
+ */
+static inline int mlx4_cmd_imm(struct mlx4_dev *dev, u64 in_param, u64 *out_param,
+			       u32 in_modifier, u8 op_modifier, u16 op,
+			       unsigned long timeout, int native)
+{
+	return __mlx4_cmd(dev, in_param, out_param, 1, in_modifier,
+			  op_modifier, op, timeout, native);
+}
+
+struct mlx4_cmd_mailbox *mlx4_alloc_cmd_mailbox(struct mlx4_dev *dev);
+void mlx4_free_cmd_mailbox(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox);
+
+u32 mlx4_comm_get_version(void);
+int mlx4_set_vf_mac(struct mlx4_dev *dev, int port, int vf, u64 mac);
+int mlx4_set_vf_vlan(struct mlx4_dev *dev, int port, int vf, u16 vlan, u8 qos);
+int mlx4_set_vf_spoofchk(struct mlx4_dev *dev, int port, int vf, bool setting);
+int mlx4_get_vf_config(struct mlx4_dev *dev, int port, int vf, struct ifla_vf_info *ivf);
+int mlx4_set_vf_link_state(struct mlx4_dev *dev, int port, int vf, int link_state);
+/*
+ * mlx4_get_slave_default_vlan -
+ * return true if VST ( default vlan)
+ * if VST, will return vlan & qos (if not NULL)
+ */
+bool mlx4_get_slave_default_vlan(struct mlx4_dev *dev, int port, int slave,
+				 u16 *vlan, u8 *qos);
+
+#define MLX4_COMM_GET_IF_REV(cmd_chan_ver) (u8)((cmd_chan_ver) >> 8)
+
+#endif /* MLX4_CMD_H */
diff --git a/include/linux/mlx4/cq.h b/include/linux/mlx4/cq.h
new file mode 100644
index 0000000..e7ecc12
--- /dev/null
+++ b/include/linux/mlx4/cq.h
@@ -0,0 +1,178 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *	- Redistributions of source code must retain the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer.
+ *
+ *	- Redistributions in binary form must reproduce the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer in the documentation and/or other materials
+ *	  provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX4_CQ_H
+#define MLX4_CQ_H
+
+#include <linux/types.h>
+#include <uapi/linux/if_ether.h>
+
+#include <linux/mlx4/device.h>
+#include <linux/mlx4/doorbell.h>
+
+struct mlx4_cqe {
+	__be32			vlan_my_qpn;
+	__be32			immed_rss_invalid;
+	__be32			g_mlpath_rqpn;
+	__be16			sl_vid;
+	union {
+		struct {
+			__be16	rlid;
+			__be16  status;
+			u8      ipv6_ext_mask;
+			u8      badfcs_enc;
+		};
+		u8  smac[ETH_ALEN];
+	};
+	__be32			byte_cnt;
+	__be16			wqe_index;
+	__be16			checksum;
+	u8			reserved[3];
+	u8			owner_sr_opcode;
+};
+
+struct mlx4_err_cqe {
+	__be32			my_qpn;
+	u32			reserved1[5];
+	__be16			wqe_index;
+	u8			vendor_err_syndrome;
+	u8			syndrome;
+	u8			reserved2[3];
+	u8			owner_sr_opcode;
+};
+
+struct mlx4_ts_cqe {
+	__be32			vlan_my_qpn;
+	__be32			immed_rss_invalid;
+	__be32			g_mlpath_rqpn;
+	__be32			timestamp_hi;
+	__be16			status;
+	u8			ipv6_ext_mask;
+	u8			badfcs_enc;
+	__be32			byte_cnt;
+	__be16			wqe_index;
+	__be16			checksum;
+	u8			reserved;
+	__be16			timestamp_lo;
+	u8			owner_sr_opcode;
+} __packed;
+
+enum {
+	MLX4_CQE_L2_TUNNEL_IPOK		= 1 << 31,
+	MLX4_CQE_VLAN_PRESENT_MASK	= 1 << 29,
+	MLX4_CQE_L2_TUNNEL		= 1 << 27,
+	MLX4_CQE_L2_TUNNEL_CSUM		= 1 << 26,
+	MLX4_CQE_L2_TUNNEL_IPV4		= 1 << 25,
+
+	MLX4_CQE_QPN_MASK		= 0xffffff,
+	MLX4_CQE_VID_MASK		= 0xfff,
+};
+
+enum {
+	MLX4_CQE_OWNER_MASK	= 0x80,
+	MLX4_CQE_IS_SEND_MASK	= 0x40,
+	MLX4_CQE_OPCODE_MASK	= 0x1f
+};
+
+enum {
+	MLX4_CQE_SYNDROME_LOCAL_LENGTH_ERR		= 0x01,
+	MLX4_CQE_SYNDROME_LOCAL_QP_OP_ERR		= 0x02,
+	MLX4_CQE_SYNDROME_LOCAL_PROT_ERR		= 0x04,
+	MLX4_CQE_SYNDROME_WR_FLUSH_ERR			= 0x05,
+	MLX4_CQE_SYNDROME_MW_BIND_ERR			= 0x06,
+	MLX4_CQE_SYNDROME_BAD_RESP_ERR			= 0x10,
+	MLX4_CQE_SYNDROME_LOCAL_ACCESS_ERR		= 0x11,
+	MLX4_CQE_SYNDROME_REMOTE_INVAL_REQ_ERR		= 0x12,
+	MLX4_CQE_SYNDROME_REMOTE_ACCESS_ERR		= 0x13,
+	MLX4_CQE_SYNDROME_REMOTE_OP_ERR			= 0x14,
+	MLX4_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR	= 0x15,
+	MLX4_CQE_SYNDROME_RNR_RETRY_EXC_ERR		= 0x16,
+	MLX4_CQE_SYNDROME_REMOTE_ABORTED_ERR		= 0x22,
+};
+
+enum {
+	MLX4_CQE_STATUS_IPV4		= 1 << 6,
+	MLX4_CQE_STATUS_IPV4F		= 1 << 7,
+	MLX4_CQE_STATUS_IPV6		= 1 << 8,
+	MLX4_CQE_STATUS_IPV4OPT		= 1 << 9,
+	MLX4_CQE_STATUS_TCP		= 1 << 10,
+	MLX4_CQE_STATUS_UDP		= 1 << 11,
+	MLX4_CQE_STATUS_IPOK		= 1 << 12,
+};
+
+enum {
+	MLX4_CQE_LLC                     = 1,
+	MLX4_CQE_SNAP                    = 1 << 1,
+	MLX4_CQE_BAD_FCS                 = 1 << 4,
+};
+
+static inline void mlx4_cq_arm(struct mlx4_cq *cq, u32 cmd,
+			       void __iomem *uar_page,
+			       spinlock_t *doorbell_lock)
+{
+	__be32 doorbell[2];
+	u32 sn;
+	u32 ci;
+
+	sn = cq->arm_sn & 3;
+	ci = cq->cons_index & 0xffffff;
+
+	*cq->arm_db = cpu_to_be32(sn << 28 | cmd | ci);
+
+	/*
+	 * Make sure that the doorbell record in host memory is
+	 * written before ringing the doorbell via PCI MMIO.
+	 */
+	wmb();
+
+	doorbell[0] = cpu_to_be32(sn << 28 | cmd | cq->cqn);
+	doorbell[1] = cpu_to_be32(ci);
+
+	mlx4_write64(doorbell, uar_page + MLX4_CQ_DOORBELL, doorbell_lock);
+}
+
+static inline void mlx4_cq_set_ci(struct mlx4_cq *cq)
+{
+	*cq->set_ci_db = cpu_to_be32(cq->cons_index & 0xffffff);
+}
+
+enum {
+	MLX4_CQ_DB_REQ_NOT_SOL		= 1 << 24,
+	MLX4_CQ_DB_REQ_NOT		= 2 << 24
+};
+
+int mlx4_cq_modify(struct mlx4_dev *dev, struct mlx4_cq *cq,
+		   u16 count, u16 period);
+int mlx4_cq_resize(struct mlx4_dev *dev, struct mlx4_cq *cq,
+		   int entries, struct mlx4_mtt *mtt);
+
+#endif /* MLX4_CQ_H */
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
new file mode 100644
index 0000000..37e4404
--- /dev/null
+++ b/include/linux/mlx4/device.h
@@ -0,0 +1,1292 @@
+/*
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *	- Redistributions of source code must retain the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer.
+ *
+ *	- Redistributions in binary form must reproduce the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer in the documentation and/or other materials
+ *	  provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX4_DEVICE_H
+#define MLX4_DEVICE_H
+
+#include <linux/if_ether.h>
+#include <linux/pci.h>
+#include <linux/completion.h>
+#include <linux/radix-tree.h>
+#include <linux/cpu_rmap.h>
+#include <linux/crash_dump.h>
+
+#include <linux/atomic.h>
+
+#include <linux/clocksource.h>
+
+#define MAX_MSIX_P_PORT		17
+#define MAX_MSIX		64
+#define MSIX_LEGACY_SZ		4
+#define MIN_MSIX_P_PORT		5
+
+#define MLX4_NUM_UP			8
+#define MLX4_NUM_TC			8
+#define MLX4_MAX_100M_UNITS_VAL		255	/*
+						 * work around: can't set values
+						 * greater then this value when
+						 * using 100 Mbps units.
+						 */
+#define MLX4_RATELIMIT_100M_UNITS	3	/* 100 Mbps */
+#define MLX4_RATELIMIT_1G_UNITS		4	/* 1 Gbps */
+#define MLX4_RATELIMIT_DEFAULT		0x00ff
+
+#define MLX4_ROCE_MAX_GIDS	128
+#define MLX4_ROCE_PF_GIDS	16
+
+enum {
+	MLX4_FLAG_MSI_X		= 1 << 0,
+	MLX4_FLAG_OLD_PORT_CMDS	= 1 << 1,
+	MLX4_FLAG_MASTER	= 1 << 2,
+	MLX4_FLAG_SLAVE		= 1 << 3,
+	MLX4_FLAG_SRIOV		= 1 << 4,
+	MLX4_FLAG_OLD_REG_MAC	= 1 << 6,
+};
+
+enum {
+	MLX4_PORT_CAP_IS_SM	= 1 << 1,
+	MLX4_PORT_CAP_DEV_MGMT_SUP = 1 << 19,
+};
+
+enum {
+	MLX4_MAX_PORTS		= 2,
+	MLX4_MAX_PORT_PKEYS	= 128
+};
+
+/* base qkey for use in sriov tunnel-qp/proxy-qp communication.
+ * These qkeys must not be allowed for general use. This is a 64k range,
+ * and to test for violation, we use the mask (protect against future chg).
+ */
+#define MLX4_RESERVED_QKEY_BASE  (0xFFFF0000)
+#define MLX4_RESERVED_QKEY_MASK  (0xFFFF0000)
+
+enum {
+	MLX4_BOARD_ID_LEN = 64
+};
+
+enum {
+	MLX4_MAX_NUM_PF		= 16,
+	MLX4_MAX_NUM_VF		= 64,
+	MLX4_MAX_NUM_VF_P_PORT  = 64,
+	MLX4_MFUNC_MAX		= 80,
+	MLX4_MAX_EQ_NUM		= 1024,
+	MLX4_MFUNC_EQ_NUM	= 4,
+	MLX4_MFUNC_MAX_EQES     = 8,
+	MLX4_MFUNC_EQE_MASK     = (MLX4_MFUNC_MAX_EQES - 1)
+};
+
+/* Driver supports 3 diffrent device methods to manage traffic steering:
+ *	-device managed - High level API for ib and eth flow steering. FW is
+ *			  managing flow steering tables.
+ *	- B0 steering mode - Common low level API for ib and (if supported) eth.
+ *	- A0 steering mode - Limited low level API for eth. In case of IB,
+ *			     B0 mode is in use.
+ */
+enum {
+	MLX4_STEERING_MODE_A0,
+	MLX4_STEERING_MODE_B0,
+	MLX4_STEERING_MODE_DEVICE_MANAGED
+};
+
+static inline const char *mlx4_steering_mode_str(int steering_mode)
+{
+	switch (steering_mode) {
+	case MLX4_STEERING_MODE_A0:
+		return "A0 steering";
+
+	case MLX4_STEERING_MODE_B0:
+		return "B0 steering";
+
+	case MLX4_STEERING_MODE_DEVICE_MANAGED:
+		return "Device managed flow steering";
+
+	default:
+		return "Unrecognize steering mode";
+	}
+}
+
+enum {
+	MLX4_TUNNEL_OFFLOAD_MODE_NONE,
+	MLX4_TUNNEL_OFFLOAD_MODE_VXLAN
+};
+
+enum {
+	MLX4_DEV_CAP_FLAG_RC		= 1LL <<  0,
+	MLX4_DEV_CAP_FLAG_UC		= 1LL <<  1,
+	MLX4_DEV_CAP_FLAG_UD		= 1LL <<  2,
+	MLX4_DEV_CAP_FLAG_XRC		= 1LL <<  3,
+	MLX4_DEV_CAP_FLAG_SRQ		= 1LL <<  6,
+	MLX4_DEV_CAP_FLAG_IPOIB_CSUM	= 1LL <<  7,
+	MLX4_DEV_CAP_FLAG_BAD_PKEY_CNTR	= 1LL <<  8,
+	MLX4_DEV_CAP_FLAG_BAD_QKEY_CNTR	= 1LL <<  9,
+	MLX4_DEV_CAP_FLAG_DPDP		= 1LL << 12,
+	MLX4_DEV_CAP_FLAG_BLH		= 1LL << 15,
+	MLX4_DEV_CAP_FLAG_MEM_WINDOW	= 1LL << 16,
+	MLX4_DEV_CAP_FLAG_APM		= 1LL << 17,
+	MLX4_DEV_CAP_FLAG_ATOMIC	= 1LL << 18,
+	MLX4_DEV_CAP_FLAG_RAW_MCAST	= 1LL << 19,
+	MLX4_DEV_CAP_FLAG_UD_AV_PORT	= 1LL << 20,
+	MLX4_DEV_CAP_FLAG_UD_MCAST	= 1LL << 21,
+	MLX4_DEV_CAP_FLAG_IBOE		= 1LL << 30,
+	MLX4_DEV_CAP_FLAG_UC_LOOPBACK	= 1LL << 32,
+	MLX4_DEV_CAP_FLAG_FCS_KEEP	= 1LL << 34,
+	MLX4_DEV_CAP_FLAG_WOL_PORT1	= 1LL << 37,
+	MLX4_DEV_CAP_FLAG_WOL_PORT2	= 1LL << 38,
+	MLX4_DEV_CAP_FLAG_UDP_RSS	= 1LL << 40,
+	MLX4_DEV_CAP_FLAG_VEP_UC_STEER	= 1LL << 41,
+	MLX4_DEV_CAP_FLAG_VEP_MC_STEER	= 1LL << 42,
+	MLX4_DEV_CAP_FLAG_COUNTERS	= 1LL << 48,
+	MLX4_DEV_CAP_FLAG_SET_ETH_SCHED = 1LL << 53,
+	MLX4_DEV_CAP_FLAG_SENSE_SUPPORT	= 1LL << 55,
+	MLX4_DEV_CAP_FLAG_PORT_MNG_CHG_EV = 1LL << 59,
+	MLX4_DEV_CAP_FLAG_64B_EQE	= 1LL << 61,
+	MLX4_DEV_CAP_FLAG_64B_CQE	= 1LL << 62
+};
+
+enum {
+	MLX4_DEV_CAP_FLAG2_RSS			= 1LL <<  0,
+	MLX4_DEV_CAP_FLAG2_RSS_TOP		= 1LL <<  1,
+	MLX4_DEV_CAP_FLAG2_RSS_XOR		= 1LL <<  2,
+	MLX4_DEV_CAP_FLAG2_FS_EN		= 1LL <<  3,
+	MLX4_DEV_CAP_FLAG2_REASSIGN_MAC_EN	= 1LL <<  4,
+	MLX4_DEV_CAP_FLAG2_TS			= 1LL <<  5,
+	MLX4_DEV_CAP_FLAG2_VLAN_CONTROL		= 1LL <<  6,
+	MLX4_DEV_CAP_FLAG2_FSM			= 1LL <<  7,
+	MLX4_DEV_CAP_FLAG2_UPDATE_QP		= 1LL <<  8,
+	MLX4_DEV_CAP_FLAG2_DMFS_IPOIB		= 1LL <<  9,
+	MLX4_DEV_CAP_FLAG2_VXLAN_OFFLOADS	= 1LL <<  10,
+	MLX4_DEV_CAP_FLAG2_MAD_DEMUX		= 1LL <<  11,
+	MLX4_DEV_CAP_FLAG2_CQE_STRIDE		= 1LL <<  12,
+	MLX4_DEV_CAP_FLAG2_EQE_STRIDE		= 1LL <<  13
+};
+
+enum {
+	MLX4_DEV_CAP_64B_EQE_ENABLED	= 1LL << 0,
+	MLX4_DEV_CAP_64B_CQE_ENABLED	= 1LL << 1,
+	MLX4_DEV_CAP_CQE_STRIDE_ENABLED	= 1LL << 2,
+	MLX4_DEV_CAP_EQE_STRIDE_ENABLED	= 1LL << 3
+};
+
+enum {
+	MLX4_USER_DEV_CAP_LARGE_CQE	= 1L << 0
+};
+
+enum {
+	MLX4_FUNC_CAP_64B_EQE_CQE	= 1L << 0,
+	MLX4_FUNC_CAP_EQE_CQE_STRIDE	= 1L << 1
+};
+
+
+#define MLX4_ATTR_EXTENDED_PORT_INFO	cpu_to_be16(0xff90)
+
+enum {
+	MLX4_BMME_FLAG_WIN_TYPE_2B	= 1 <<  1,
+	MLX4_BMME_FLAG_LOCAL_INV	= 1 <<  6,
+	MLX4_BMME_FLAG_REMOTE_INV	= 1 <<  7,
+	MLX4_BMME_FLAG_TYPE_2_WIN	= 1 <<  9,
+	MLX4_BMME_FLAG_RESERVED_LKEY	= 1 << 10,
+	MLX4_BMME_FLAG_FAST_REG_WR	= 1 << 11,
+	MLX4_BMME_FLAG_VSD_INIT2RTR	= 1 << 28,
+};
+
+enum mlx4_event {
+	MLX4_EVENT_TYPE_COMP		   = 0x00,
+	MLX4_EVENT_TYPE_PATH_MIG	   = 0x01,
+	MLX4_EVENT_TYPE_COMM_EST	   = 0x02,
+	MLX4_EVENT_TYPE_SQ_DRAINED	   = 0x03,
+	MLX4_EVENT_TYPE_SRQ_QP_LAST_WQE	   = 0x13,
+	MLX4_EVENT_TYPE_SRQ_LIMIT	   = 0x14,
+	MLX4_EVENT_TYPE_CQ_ERROR	   = 0x04,
+	MLX4_EVENT_TYPE_WQ_CATAS_ERROR	   = 0x05,
+	MLX4_EVENT_TYPE_EEC_CATAS_ERROR	   = 0x06,
+	MLX4_EVENT_TYPE_PATH_MIG_FAILED	   = 0x07,
+	MLX4_EVENT_TYPE_WQ_INVAL_REQ_ERROR = 0x10,
+	MLX4_EVENT_TYPE_WQ_ACCESS_ERROR	   = 0x11,
+	MLX4_EVENT_TYPE_SRQ_CATAS_ERROR	   = 0x12,
+	MLX4_EVENT_TYPE_LOCAL_CATAS_ERROR  = 0x08,
+	MLX4_EVENT_TYPE_PORT_CHANGE	   = 0x09,
+	MLX4_EVENT_TYPE_EQ_OVERFLOW	   = 0x0f,
+	MLX4_EVENT_TYPE_ECC_DETECT	   = 0x0e,
+	MLX4_EVENT_TYPE_CMD		   = 0x0a,
+	MLX4_EVENT_TYPE_VEP_UPDATE	   = 0x19,
+	MLX4_EVENT_TYPE_COMM_CHANNEL	   = 0x18,
+	MLX4_EVENT_TYPE_OP_REQUIRED	   = 0x1a,
+	MLX4_EVENT_TYPE_FATAL_WARNING	   = 0x1b,
+	MLX4_EVENT_TYPE_FLR_EVENT	   = 0x1c,
+	MLX4_EVENT_TYPE_PORT_MNG_CHG_EVENT = 0x1d,
+	MLX4_EVENT_TYPE_NONE		   = 0xff,
+};
+
+enum {
+	MLX4_PORT_CHANGE_SUBTYPE_DOWN	= 1,
+	MLX4_PORT_CHANGE_SUBTYPE_ACTIVE	= 4
+};
+
+enum {
+	MLX4_FATAL_WARNING_SUBTYPE_WARMING = 0,
+};
+
+enum slave_port_state {
+	SLAVE_PORT_DOWN = 0,
+	SLAVE_PENDING_UP,
+	SLAVE_PORT_UP,
+};
+
+enum slave_port_gen_event {
+	SLAVE_PORT_GEN_EVENT_DOWN = 0,
+	SLAVE_PORT_GEN_EVENT_UP,
+	SLAVE_PORT_GEN_EVENT_NONE,
+};
+
+enum slave_port_state_event {
+	MLX4_PORT_STATE_DEV_EVENT_PORT_DOWN,
+	MLX4_PORT_STATE_DEV_EVENT_PORT_UP,
+	MLX4_PORT_STATE_IB_PORT_STATE_EVENT_GID_VALID,
+	MLX4_PORT_STATE_IB_EVENT_GID_INVALID,
+};
+
+enum {
+	MLX4_PERM_LOCAL_READ	= 1 << 10,
+	MLX4_PERM_LOCAL_WRITE	= 1 << 11,
+	MLX4_PERM_REMOTE_READ	= 1 << 12,
+	MLX4_PERM_REMOTE_WRITE	= 1 << 13,
+	MLX4_PERM_ATOMIC	= 1 << 14,
+	MLX4_PERM_BIND_MW	= 1 << 15,
+	MLX4_PERM_MASK		= 0xFC00
+};
+
+enum {
+	MLX4_OPCODE_NOP			= 0x00,
+	MLX4_OPCODE_SEND_INVAL		= 0x01,
+	MLX4_OPCODE_RDMA_WRITE		= 0x08,
+	MLX4_OPCODE_RDMA_WRITE_IMM	= 0x09,
+	MLX4_OPCODE_SEND		= 0x0a,
+	MLX4_OPCODE_SEND_IMM		= 0x0b,
+	MLX4_OPCODE_LSO			= 0x0e,
+	MLX4_OPCODE_RDMA_READ		= 0x10,
+	MLX4_OPCODE_ATOMIC_CS		= 0x11,
+	MLX4_OPCODE_ATOMIC_FA		= 0x12,
+	MLX4_OPCODE_MASKED_ATOMIC_CS	= 0x14,
+	MLX4_OPCODE_MASKED_ATOMIC_FA	= 0x15,
+	MLX4_OPCODE_BIND_MW		= 0x18,
+	MLX4_OPCODE_FMR			= 0x19,
+	MLX4_OPCODE_LOCAL_INVAL		= 0x1b,
+	MLX4_OPCODE_CONFIG_CMD		= 0x1f,
+
+	MLX4_RECV_OPCODE_RDMA_WRITE_IMM	= 0x00,
+	MLX4_RECV_OPCODE_SEND		= 0x01,
+	MLX4_RECV_OPCODE_SEND_IMM	= 0x02,
+	MLX4_RECV_OPCODE_SEND_INVAL	= 0x03,
+
+	MLX4_CQE_OPCODE_ERROR		= 0x1e,
+	MLX4_CQE_OPCODE_RESIZE		= 0x16,
+};
+
+enum {
+	MLX4_STAT_RATE_OFFSET	= 5
+};
+
+enum mlx4_protocol {
+	MLX4_PROT_IB_IPV6 = 0,
+	MLX4_PROT_ETH,
+	MLX4_PROT_IB_IPV4,
+	MLX4_PROT_FCOE
+};
+
+enum {
+	MLX4_MTT_FLAG_PRESENT		= 1
+};
+
+enum mlx4_qp_region {
+	MLX4_QP_REGION_FW = 0,
+	MLX4_QP_REGION_ETH_ADDR,
+	MLX4_QP_REGION_FC_ADDR,
+	MLX4_QP_REGION_FC_EXCH,
+	MLX4_NUM_QP_REGION
+};
+
+enum mlx4_port_type {
+	MLX4_PORT_TYPE_NONE	= 0,
+	MLX4_PORT_TYPE_IB	= 1,
+	MLX4_PORT_TYPE_ETH	= 2,
+	MLX4_PORT_TYPE_AUTO	= 3
+};
+
+enum mlx4_special_vlan_idx {
+	MLX4_NO_VLAN_IDX        = 0,
+	MLX4_VLAN_MISS_IDX,
+	MLX4_VLAN_REGULAR
+};
+
+enum mlx4_steer_type {
+	MLX4_MC_STEER = 0,
+	MLX4_UC_STEER,
+	MLX4_NUM_STEERS
+};
+
+enum {
+	MLX4_NUM_FEXCH          = 64 * 1024,
+};
+
+enum {
+	MLX4_MAX_FAST_REG_PAGES = 511,
+};
+
+enum {
+	MLX4_DEV_PMC_SUBTYPE_GUID_INFO	 = 0x14,
+	MLX4_DEV_PMC_SUBTYPE_PORT_INFO	 = 0x15,
+	MLX4_DEV_PMC_SUBTYPE_PKEY_TABLE	 = 0x16,
+};
+
+/* Port mgmt change event handling */
+enum {
+	MLX4_EQ_PORT_INFO_MSTR_SM_LID_CHANGE_MASK	= 1 << 0,
+	MLX4_EQ_PORT_INFO_GID_PFX_CHANGE_MASK		= 1 << 1,
+	MLX4_EQ_PORT_INFO_LID_CHANGE_MASK		= 1 << 2,
+	MLX4_EQ_PORT_INFO_CLIENT_REREG_MASK		= 1 << 3,
+	MLX4_EQ_PORT_INFO_MSTR_SM_SL_CHANGE_MASK	= 1 << 4,
+};
+
+#define MSTR_SM_CHANGE_MASK (MLX4_EQ_PORT_INFO_MSTR_SM_SL_CHANGE_MASK | \
+			     MLX4_EQ_PORT_INFO_MSTR_SM_LID_CHANGE_MASK)
+
+static inline u64 mlx4_fw_ver(u64 major, u64 minor, u64 subminor)
+{
+	return (major << 32) | (minor << 16) | subminor;
+}
+
+struct mlx4_phys_caps {
+	u32			gid_phys_table_len[MLX4_MAX_PORTS + 1];
+	u32			pkey_phys_table_len[MLX4_MAX_PORTS + 1];
+	u32			num_phys_eqs;
+	u32			base_sqpn;
+	u32			base_proxy_sqpn;
+	u32			base_tunnel_sqpn;
+};
+
+struct mlx4_caps {
+	u64			fw_ver;
+	u32			function;
+	int			num_ports;
+	int			vl_cap[MLX4_MAX_PORTS + 1];
+	int			ib_mtu_cap[MLX4_MAX_PORTS + 1];
+	__be32			ib_port_def_cap[MLX4_MAX_PORTS + 1];
+	u64			def_mac[MLX4_MAX_PORTS + 1];
+	int			eth_mtu_cap[MLX4_MAX_PORTS + 1];
+	int			gid_table_len[MLX4_MAX_PORTS + 1];
+	int			pkey_table_len[MLX4_MAX_PORTS + 1];
+	int			trans_type[MLX4_MAX_PORTS + 1];
+	int			vendor_oui[MLX4_MAX_PORTS + 1];
+	int			wavelength[MLX4_MAX_PORTS + 1];
+	u64			trans_code[MLX4_MAX_PORTS + 1];
+	int			local_ca_ack_delay;
+	int			num_uars;
+	u32			uar_page_size;
+	int			bf_reg_size;
+	int			bf_regs_per_page;
+	int			max_sq_sg;
+	int			max_rq_sg;
+	int			num_qps;
+	int			max_wqes;
+	int			max_sq_desc_sz;
+	int			max_rq_desc_sz;
+	int			max_qp_init_rdma;
+	int			max_qp_dest_rdma;
+	u32			*qp0_qkey;
+	u32			*qp0_proxy;
+	u32			*qp1_proxy;
+	u32			*qp0_tunnel;
+	u32			*qp1_tunnel;
+	int			num_srqs;
+	int			max_srq_wqes;
+	int			max_srq_sge;
+	int			reserved_srqs;
+	int			num_cqs;
+	int			max_cqes;
+	int			reserved_cqs;
+	int			num_eqs;
+	int			reserved_eqs;
+	int			num_comp_vectors;
+	int			comp_pool;
+	int			num_mpts;
+	int			max_fmr_maps;
+	int			num_mtts;
+	int			fmr_reserved_mtts;
+	int			reserved_mtts;
+	int			reserved_mrws;
+	int			reserved_uars;
+	int			num_mgms;
+	int			num_amgms;
+	int			reserved_mcgs;
+	int			num_qp_per_mgm;
+	int			steering_mode;
+	int			fs_log_max_ucast_qp_range_size;
+	int			num_pds;
+	int			reserved_pds;
+	int			max_xrcds;
+	int			reserved_xrcds;
+	int			mtt_entry_sz;
+	u32			max_msg_sz;
+	u32			page_size_cap;
+	u64			flags;
+	u64			flags2;
+	u32			bmme_flags;
+	u32			reserved_lkey;
+	u16			stat_rate_support;
+	u8			port_width_cap[MLX4_MAX_PORTS + 1];
+	int			max_gso_sz;
+	int			max_rss_tbl_sz;
+	int                     reserved_qps_cnt[MLX4_NUM_QP_REGION];
+	int			reserved_qps;
+	int                     reserved_qps_base[MLX4_NUM_QP_REGION];
+	int                     log_num_macs;
+	int                     log_num_vlans;
+	enum mlx4_port_type	port_type[MLX4_MAX_PORTS + 1];
+	u8			supported_type[MLX4_MAX_PORTS + 1];
+	u8                      suggested_type[MLX4_MAX_PORTS + 1];
+	u8                      default_sense[MLX4_MAX_PORTS + 1];
+	u32			port_mask[MLX4_MAX_PORTS + 1];
+	enum mlx4_port_type	possible_type[MLX4_MAX_PORTS + 1];
+	u32			max_counters;
+	u8			port_ib_mtu[MLX4_MAX_PORTS + 1];
+	u16			sqp_demux;
+	u32			eqe_size;
+	u32			cqe_size;
+	u8			eqe_factor;
+	u32			userspace_caps; /* userspace must be aware of these */
+	u32			function_caps;  /* VFs must be aware of these */
+	u16			hca_core_clock;
+	u64			phys_port_id[MLX4_MAX_PORTS + 1];
+	int			tunnel_offload_mode;
+};
+
+struct mlx4_buf_list {
+	void		       *buf;
+	dma_addr_t		map;
+};
+
+struct mlx4_buf {
+	struct mlx4_buf_list	direct;
+	struct mlx4_buf_list   *page_list;
+	int			nbufs;
+	int			npages;
+	int			page_shift;
+};
+
+struct mlx4_mtt {
+	u32			offset;
+	int			order;
+	int			page_shift;
+};
+
+enum {
+	MLX4_DB_PER_PAGE = PAGE_SIZE / 4
+};
+
+struct mlx4_db_pgdir {
+	struct list_head	list;
+	DECLARE_BITMAP(order0, MLX4_DB_PER_PAGE);
+	DECLARE_BITMAP(order1, MLX4_DB_PER_PAGE / 2);
+	unsigned long	       *bits[2];
+	__be32		       *db_page;
+	dma_addr_t		db_dma;
+};
+
+struct mlx4_ib_user_db_page;
+
+struct mlx4_db {
+	__be32			*db;
+	union {
+		struct mlx4_db_pgdir		*pgdir;
+		struct mlx4_ib_user_db_page	*user_page;
+	}			u;
+	dma_addr_t		dma;
+	int			index;
+	int			order;
+};
+
+struct mlx4_hwq_resources {
+	struct mlx4_db		db;
+	struct mlx4_mtt		mtt;
+	struct mlx4_buf		buf;
+};
+
+struct mlx4_mr {
+	struct mlx4_mtt		mtt;
+	u64			iova;
+	u64			size;
+	u32			key;
+	u32			pd;
+	u32			access;
+	int			enabled;
+};
+
+enum mlx4_mw_type {
+	MLX4_MW_TYPE_1 = 1,
+	MLX4_MW_TYPE_2 = 2,
+};
+
+struct mlx4_mw {
+	u32			key;
+	u32			pd;
+	enum mlx4_mw_type	type;
+	int			enabled;
+};
+
+struct mlx4_fmr {
+	struct mlx4_mr		mr;
+	struct mlx4_mpt_entry  *mpt;
+	__be64		       *mtts;
+	dma_addr_t		dma_handle;
+	int			max_pages;
+	int			max_maps;
+	int			maps;
+	u8			page_shift;
+};
+
+struct mlx4_uar {
+	unsigned long		pfn;
+	int			index;
+	struct list_head	bf_list;
+	unsigned		free_bf_bmap;
+	void __iomem	       *map;
+	void __iomem	       *bf_map;
+};
+
+struct mlx4_bf {
+	unsigned int		offset;
+	int			buf_size;
+	struct mlx4_uar	       *uar;
+	void __iomem	       *reg;
+};
+
+struct mlx4_cq {
+	void (*comp)		(struct mlx4_cq *);
+	void (*event)		(struct mlx4_cq *, enum mlx4_event);
+
+	struct mlx4_uar	       *uar;
+
+	u32			cons_index;
+
+	u16                     irq;
+	__be32		       *set_ci_db;
+	__be32		       *arm_db;
+	int			arm_sn;
+
+	int			cqn;
+	unsigned		vector;
+
+	atomic_t		refcount;
+	struct completion	free;
+};
+
+struct mlx4_qp {
+	void (*event)		(struct mlx4_qp *, enum mlx4_event);
+
+	int			qpn;
+
+	atomic_t		refcount;
+	struct completion	free;
+};
+
+struct mlx4_srq {
+	void (*event)		(struct mlx4_srq *, enum mlx4_event);
+
+	int			srqn;
+	int			max;
+	int			max_gs;
+	int			wqe_shift;
+
+	atomic_t		refcount;
+	struct completion	free;
+};
+
+struct mlx4_av {
+	__be32			port_pd;
+	u8			reserved1;
+	u8			g_slid;
+	__be16			dlid;
+	u8			reserved2;
+	u8			gid_index;
+	u8			stat_rate;
+	u8			hop_limit;
+	__be32			sl_tclass_flowlabel;
+	u8			dgid[16];
+};
+
+struct mlx4_eth_av {
+	__be32		port_pd;
+	u8		reserved1;
+	u8		smac_idx;
+	u16		reserved2;
+	u8		reserved3;
+	u8		gid_index;
+	u8		stat_rate;
+	u8		hop_limit;
+	__be32		sl_tclass_flowlabel;
+	u8		dgid[16];
+	u8		s_mac[6];
+	u8		reserved4[2];
+	__be16		vlan;
+	u8		mac[ETH_ALEN];
+};
+
+union mlx4_ext_av {
+	struct mlx4_av		ib;
+	struct mlx4_eth_av	eth;
+};
+
+struct mlx4_counter {
+	u8	reserved1[3];
+	u8	counter_mode;
+	__be32	num_ifc;
+	u32	reserved2[2];
+	__be64	rx_frames;
+	__be64	rx_bytes;
+	__be64	tx_frames;
+	__be64	tx_bytes;
+};
+
+struct mlx4_quotas {
+	int qp;
+	int cq;
+	int srq;
+	int mpt;
+	int mtt;
+	int counter;
+	int xrcd;
+};
+
+struct mlx4_vf_dev {
+	u8			min_port;
+	u8			n_ports;
+};
+
+struct mlx4_dev {
+	struct pci_dev	       *pdev;
+	unsigned long		flags;
+	unsigned long		num_slaves;
+	struct mlx4_caps	caps;
+	struct mlx4_phys_caps	phys_caps;
+	struct mlx4_quotas	quotas;
+	struct radix_tree_root	qp_table_tree;
+	u8			rev_id;
+	char			board_id[MLX4_BOARD_ID_LEN];
+	int			num_vfs;
+	int			numa_node;
+	int			oper_log_mgm_entry_size;
+	u64			regid_promisc_array[MLX4_MAX_PORTS + 1];
+	u64			regid_allmulti_array[MLX4_MAX_PORTS + 1];
+	struct mlx4_vf_dev     *dev_vfs;
+	int                     nvfs[MLX4_MAX_PORTS + 1];
+};
+
+struct mlx4_eqe {
+	u8			reserved1;
+	u8			type;
+	u8			reserved2;
+	u8			subtype;
+	union {
+		u32		raw[6];
+		struct {
+			__be32	cqn;
+		} __packed comp;
+		struct {
+			u16	reserved1;
+			__be16	token;
+			u32	reserved2;
+			u8	reserved3[3];
+			u8	status;
+			__be64	out_param;
+		} __packed cmd;
+		struct {
+			__be32	qpn;
+		} __packed qp;
+		struct {
+			__be32	srqn;
+		} __packed srq;
+		struct {
+			__be32	cqn;
+			u32	reserved1;
+			u8	reserved2[3];
+			u8	syndrome;
+		} __packed cq_err;
+		struct {
+			u32	reserved1[2];
+			__be32	port;
+		} __packed port_change;
+		struct {
+			#define COMM_CHANNEL_BIT_ARRAY_SIZE	4
+			u32 reserved;
+			u32 bit_vec[COMM_CHANNEL_BIT_ARRAY_SIZE];
+		} __packed comm_channel_arm;
+		struct {
+			u8	port;
+			u8	reserved[3];
+			__be64	mac;
+		} __packed mac_update;
+		struct {
+			__be32	slave_id;
+		} __packed flr_event;
+		struct {
+			__be16  current_temperature;
+			__be16  warning_threshold;
+		} __packed warming;
+		struct {
+			u8 reserved[3];
+			u8 port;
+			union {
+				struct {
+					__be16 mstr_sm_lid;
+					__be16 port_lid;
+					__be32 changed_attr;
+					u8 reserved[3];
+					u8 mstr_sm_sl;
+					__be64 gid_prefix;
+				} __packed port_info;
+				struct {
+					__be32 block_ptr;
+					__be32 tbl_entries_mask;
+				} __packed tbl_change_info;
+			} params;
+		} __packed port_mgmt_change;
+	}			event;
+	u8			slave_id;
+	u8			reserved3[2];
+	u8			owner;
+} __packed;
+
+struct mlx4_init_port_param {
+	int			set_guid0;
+	int			set_node_guid;
+	int			set_si_guid;
+	u16			mtu;
+	int			port_width_cap;
+	u16			vl_cap;
+	u16			max_gid;
+	u16			max_pkey;
+	u64			guid0;
+	u64			node_guid;
+	u64			si_guid;
+};
+
+#define mlx4_foreach_port(port, dev, type)				\
+	for ((port) = 1; (port) <= (dev)->caps.num_ports; (port)++)	\
+		if ((type) == (dev)->caps.port_mask[(port)])
+
+#define mlx4_foreach_non_ib_transport_port(port, dev)                     \
+	for ((port) = 1; (port) <= (dev)->caps.num_ports; (port)++)	  \
+		if (((dev)->caps.port_mask[port] != MLX4_PORT_TYPE_IB))
+
+#define mlx4_foreach_ib_transport_port(port, dev)                         \
+	for ((port) = 1; (port) <= (dev)->caps.num_ports; (port)++)	  \
+		if (((dev)->caps.port_mask[port] == MLX4_PORT_TYPE_IB) || \
+			((dev)->caps.flags & MLX4_DEV_CAP_FLAG_IBOE))
+
+#define MLX4_INVALID_SLAVE_ID	0xFF
+
+void handle_port_mgmt_change_event(struct work_struct *work);
+
+static inline int mlx4_master_func_num(struct mlx4_dev *dev)
+{
+	return dev->caps.function;
+}
+
+static inline int mlx4_is_master(struct mlx4_dev *dev)
+{
+	return dev->flags & MLX4_FLAG_MASTER;
+}
+
+static inline int mlx4_num_reserved_sqps(struct mlx4_dev *dev)
+{
+	return dev->phys_caps.base_sqpn + 8 +
+		16 * MLX4_MFUNC_MAX * !!mlx4_is_master(dev);
+}
+
+static inline int mlx4_is_qp_reserved(struct mlx4_dev *dev, u32 qpn)
+{
+	return (qpn < dev->phys_caps.base_sqpn + 8 +
+		16 * MLX4_MFUNC_MAX * !!mlx4_is_master(dev));
+}
+
+static inline int mlx4_is_guest_proxy(struct mlx4_dev *dev, int slave, u32 qpn)
+{
+	int guest_proxy_base = dev->phys_caps.base_proxy_sqpn + slave * 8;
+
+	if (qpn >= guest_proxy_base && qpn < guest_proxy_base + 8)
+		return 1;
+
+	return 0;
+}
+
+static inline int mlx4_is_mfunc(struct mlx4_dev *dev)
+{
+	return dev->flags & (MLX4_FLAG_SLAVE | MLX4_FLAG_MASTER);
+}
+
+static inline int mlx4_is_slave(struct mlx4_dev *dev)
+{
+	return dev->flags & MLX4_FLAG_SLAVE;
+}
+
+int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct,
+		   struct mlx4_buf *buf, gfp_t gfp);
+void mlx4_buf_free(struct mlx4_dev *dev, int size, struct mlx4_buf *buf);
+static inline void *mlx4_buf_offset(struct mlx4_buf *buf, int offset)
+{
+	if (BITS_PER_LONG == 64 || buf->nbufs == 1)
+		return buf->direct.buf + offset;
+	else
+		return buf->page_list[offset >> PAGE_SHIFT].buf +
+			(offset & (PAGE_SIZE - 1));
+}
+
+int mlx4_pd_alloc(struct mlx4_dev *dev, u32 *pdn);
+void mlx4_pd_free(struct mlx4_dev *dev, u32 pdn);
+int mlx4_xrcd_alloc(struct mlx4_dev *dev, u32 *xrcdn);
+void mlx4_xrcd_free(struct mlx4_dev *dev, u32 xrcdn);
+
+int mlx4_uar_alloc(struct mlx4_dev *dev, struct mlx4_uar *uar);
+void mlx4_uar_free(struct mlx4_dev *dev, struct mlx4_uar *uar);
+int mlx4_bf_alloc(struct mlx4_dev *dev, struct mlx4_bf *bf, int node);
+void mlx4_bf_free(struct mlx4_dev *dev, struct mlx4_bf *bf);
+
+int mlx4_mtt_init(struct mlx4_dev *dev, int npages, int page_shift,
+		  struct mlx4_mtt *mtt);
+void mlx4_mtt_cleanup(struct mlx4_dev *dev, struct mlx4_mtt *mtt);
+u64 mlx4_mtt_addr(struct mlx4_dev *dev, struct mlx4_mtt *mtt);
+
+int mlx4_mr_alloc(struct mlx4_dev *dev, u32 pd, u64 iova, u64 size, u32 access,
+		  int npages, int page_shift, struct mlx4_mr *mr);
+int mlx4_mr_free(struct mlx4_dev *dev, struct mlx4_mr *mr);
+int mlx4_mr_enable(struct mlx4_dev *dev, struct mlx4_mr *mr);
+int mlx4_mw_alloc(struct mlx4_dev *dev, u32 pd, enum mlx4_mw_type type,
+		  struct mlx4_mw *mw);
+void mlx4_mw_free(struct mlx4_dev *dev, struct mlx4_mw *mw);
+int mlx4_mw_enable(struct mlx4_dev *dev, struct mlx4_mw *mw);
+int mlx4_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
+		   int start_index, int npages, u64 *page_list);
+int mlx4_buf_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
+		       struct mlx4_buf *buf, gfp_t gfp);
+
+int mlx4_db_alloc(struct mlx4_dev *dev, struct mlx4_db *db, int order,
+		  gfp_t gfp);
+void mlx4_db_free(struct mlx4_dev *dev, struct mlx4_db *db);
+
+int mlx4_alloc_hwq_res(struct mlx4_dev *dev, struct mlx4_hwq_resources *wqres,
+		       int size, int max_direct);
+void mlx4_free_hwq_res(struct mlx4_dev *mdev, struct mlx4_hwq_resources *wqres,
+		       int size);
+
+int mlx4_cq_alloc(struct mlx4_dev *dev, int nent, struct mlx4_mtt *mtt,
+		  struct mlx4_uar *uar, u64 db_rec, struct mlx4_cq *cq,
+		  unsigned vector, int collapsed, int timestamp_en);
+void mlx4_cq_free(struct mlx4_dev *dev, struct mlx4_cq *cq);
+
+int mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align, int *base);
+void mlx4_qp_release_range(struct mlx4_dev *dev, int base_qpn, int cnt);
+
+int mlx4_qp_alloc(struct mlx4_dev *dev, int qpn, struct mlx4_qp *qp,
+		  gfp_t gfp);
+void mlx4_qp_free(struct mlx4_dev *dev, struct mlx4_qp *qp);
+
+int mlx4_srq_alloc(struct mlx4_dev *dev, u32 pdn, u32 cqn, u16 xrcdn,
+		   struct mlx4_mtt *mtt, u64 db_rec, struct mlx4_srq *srq);
+void mlx4_srq_free(struct mlx4_dev *dev, struct mlx4_srq *srq);
+int mlx4_srq_arm(struct mlx4_dev *dev, struct mlx4_srq *srq, int limit_watermark);
+int mlx4_srq_query(struct mlx4_dev *dev, struct mlx4_srq *srq, int *limit_watermark);
+
+int mlx4_INIT_PORT(struct mlx4_dev *dev, int port);
+int mlx4_CLOSE_PORT(struct mlx4_dev *dev, int port);
+
+int mlx4_unicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16],
+			int block_mcast_loopback, enum mlx4_protocol prot);
+int mlx4_unicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16],
+			enum mlx4_protocol prot);
+int mlx4_multicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16],
+			  u8 port, int block_mcast_loopback,
+			  enum mlx4_protocol protocol, u64 *reg_id);
+int mlx4_multicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16],
+			  enum mlx4_protocol protocol, u64 reg_id);
+
+enum {
+	MLX4_DOMAIN_UVERBS	= 0x1000,
+	MLX4_DOMAIN_ETHTOOL     = 0x2000,
+	MLX4_DOMAIN_RFS         = 0x3000,
+	MLX4_DOMAIN_NIC    = 0x5000,
+};
+
+enum mlx4_net_trans_rule_id {
+	MLX4_NET_TRANS_RULE_ID_ETH = 0,
+	MLX4_NET_TRANS_RULE_ID_IB,
+	MLX4_NET_TRANS_RULE_ID_IPV6,
+	MLX4_NET_TRANS_RULE_ID_IPV4,
+	MLX4_NET_TRANS_RULE_ID_TCP,
+	MLX4_NET_TRANS_RULE_ID_UDP,
+	MLX4_NET_TRANS_RULE_ID_VXLAN,
+	MLX4_NET_TRANS_RULE_NUM, /* should be last */
+};
+
+extern const u16 __sw_id_hw[];
+
+static inline int map_hw_to_sw_id(u16 header_id)
+{
+
+	int i;
+	for (i = 0; i < MLX4_NET_TRANS_RULE_NUM; i++) {
+		if (header_id == __sw_id_hw[i])
+			return i;
+	}
+	return -EINVAL;
+}
+
+enum mlx4_net_trans_promisc_mode {
+	MLX4_FS_REGULAR = 1,
+	MLX4_FS_ALL_DEFAULT,
+	MLX4_FS_MC_DEFAULT,
+	MLX4_FS_UC_SNIFFER,
+	MLX4_FS_MC_SNIFFER,
+	MLX4_FS_MODE_NUM, /* should be last */
+};
+
+struct mlx4_spec_eth {
+	u8	dst_mac[ETH_ALEN];
+	u8	dst_mac_msk[ETH_ALEN];
+	u8	src_mac[ETH_ALEN];
+	u8	src_mac_msk[ETH_ALEN];
+	u8	ether_type_enable;
+	__be16	ether_type;
+	__be16	vlan_id_msk;
+	__be16	vlan_id;
+};
+
+struct mlx4_spec_tcp_udp {
+	__be16 dst_port;
+	__be16 dst_port_msk;
+	__be16 src_port;
+	__be16 src_port_msk;
+};
+
+struct mlx4_spec_ipv4 {
+	__be32 dst_ip;
+	__be32 dst_ip_msk;
+	__be32 src_ip;
+	__be32 src_ip_msk;
+};
+
+struct mlx4_spec_ib {
+	__be32  l3_qpn;
+	__be32	qpn_msk;
+	u8	dst_gid[16];
+	u8	dst_gid_msk[16];
+};
+
+struct mlx4_spec_vxlan {
+	__be32 vni;
+	__be32 vni_mask;
+
+};
+
+struct mlx4_spec_list {
+	struct	list_head list;
+	enum	mlx4_net_trans_rule_id id;
+	union {
+		struct mlx4_spec_eth eth;
+		struct mlx4_spec_ib ib;
+		struct mlx4_spec_ipv4 ipv4;
+		struct mlx4_spec_tcp_udp tcp_udp;
+		struct mlx4_spec_vxlan vxlan;
+	};
+};
+
+enum mlx4_net_trans_hw_rule_queue {
+	MLX4_NET_TRANS_Q_FIFO,
+	MLX4_NET_TRANS_Q_LIFO,
+};
+
+struct mlx4_net_trans_rule {
+	struct	list_head list;
+	enum	mlx4_net_trans_hw_rule_queue queue_mode;
+	bool	exclusive;
+	bool	allow_loopback;
+	enum	mlx4_net_trans_promisc_mode promisc_mode;
+	u8	port;
+	u16	priority;
+	u32	qpn;
+};
+
+struct mlx4_net_trans_rule_hw_ctrl {
+	__be16 prio;
+	u8 type;
+	u8 flags;
+	u8 rsvd1;
+	u8 funcid;
+	u8 vep;
+	u8 port;
+	__be32 qpn;
+	__be32 rsvd2;
+};
+
+struct mlx4_net_trans_rule_hw_ib {
+	u8 size;
+	u8 rsvd1;
+	__be16 id;
+	u32 rsvd2;
+	__be32 l3_qpn;
+	__be32 qpn_mask;
+	u8 dst_gid[16];
+	u8 dst_gid_msk[16];
+} __packed;
+
+struct mlx4_net_trans_rule_hw_eth {
+	u8	size;
+	u8	rsvd;
+	__be16	id;
+	u8	rsvd1[6];
+	u8	dst_mac[6];
+	u16	rsvd2;
+	u8	dst_mac_msk[6];
+	u16	rsvd3;
+	u8	src_mac[6];
+	u16	rsvd4;
+	u8	src_mac_msk[6];
+	u8      rsvd5;
+	u8      ether_type_enable;
+	__be16  ether_type;
+	__be16  vlan_tag_msk;
+	__be16  vlan_tag;
+} __packed;
+
+struct mlx4_net_trans_rule_hw_tcp_udp {
+	u8	size;
+	u8	rsvd;
+	__be16	id;
+	__be16	rsvd1[3];
+	__be16	dst_port;
+	__be16	rsvd2;
+	__be16	dst_port_msk;
+	__be16	rsvd3;
+	__be16	src_port;
+	__be16	rsvd4;
+	__be16	src_port_msk;
+} __packed;
+
+struct mlx4_net_trans_rule_hw_ipv4 {
+	u8	size;
+	u8	rsvd;
+	__be16	id;
+	__be32	rsvd1;
+	__be32	dst_ip;
+	__be32	dst_ip_msk;
+	__be32	src_ip;
+	__be32	src_ip_msk;
+} __packed;
+
+struct mlx4_net_trans_rule_hw_vxlan {
+	u8	size;
+	u8	rsvd;
+	__be16	id;
+	__be32	rsvd1;
+	__be32	vni;
+	__be32	vni_mask;
+} __packed;
+
+struct _rule_hw {
+	union {
+		struct {
+			u8 size;
+			u8 rsvd;
+			__be16 id;
+		};
+		struct mlx4_net_trans_rule_hw_eth eth;
+		struct mlx4_net_trans_rule_hw_ib ib;
+		struct mlx4_net_trans_rule_hw_ipv4 ipv4;
+		struct mlx4_net_trans_rule_hw_tcp_udp tcp_udp;
+		struct mlx4_net_trans_rule_hw_vxlan vxlan;
+	};
+};
+
+enum {
+	VXLAN_STEER_BY_OUTER_MAC	= 1 << 0,
+	VXLAN_STEER_BY_OUTER_VLAN	= 1 << 1,
+	VXLAN_STEER_BY_VSID_VNI		= 1 << 2,
+	VXLAN_STEER_BY_INNER_MAC	= 1 << 3,
+	VXLAN_STEER_BY_INNER_VLAN	= 1 << 4,
+};
+
+
+int mlx4_flow_steer_promisc_add(struct mlx4_dev *dev, u8 port, u32 qpn,
+				enum mlx4_net_trans_promisc_mode mode);
+int mlx4_flow_steer_promisc_remove(struct mlx4_dev *dev, u8 port,
+				   enum mlx4_net_trans_promisc_mode mode);
+int mlx4_multicast_promisc_add(struct mlx4_dev *dev, u32 qpn, u8 port);
+int mlx4_multicast_promisc_remove(struct mlx4_dev *dev, u32 qpn, u8 port);
+int mlx4_unicast_promisc_add(struct mlx4_dev *dev, u32 qpn, u8 port);
+int mlx4_unicast_promisc_remove(struct mlx4_dev *dev, u32 qpn, u8 port);
+int mlx4_SET_MCAST_FLTR(struct mlx4_dev *dev, u8 port, u64 mac, u64 clear, u8 mode);
+
+int mlx4_register_mac(struct mlx4_dev *dev, u8 port, u64 mac);
+void mlx4_unregister_mac(struct mlx4_dev *dev, u8 port, u64 mac);
+int mlx4_get_base_qpn(struct mlx4_dev *dev, u8 port);
+int __mlx4_replace_mac(struct mlx4_dev *dev, u8 port, int qpn, u64 new_mac);
+void mlx4_set_stats_bitmap(struct mlx4_dev *dev, u64 *stats_bitmap);
+int mlx4_SET_PORT_general(struct mlx4_dev *dev, u8 port, int mtu,
+			  u8 pptx, u8 pfctx, u8 pprx, u8 pfcrx);
+int mlx4_SET_PORT_qpn_calc(struct mlx4_dev *dev, u8 port, u32 base_qpn,
+			   u8 promisc);
+int mlx4_SET_PORT_PRIO2TC(struct mlx4_dev *dev, u8 port, u8 *prio2tc);
+int mlx4_SET_PORT_SCHEDULER(struct mlx4_dev *dev, u8 port, u8 *tc_tx_bw,
+		u8 *pg, u16 *ratelimit);
+int mlx4_SET_PORT_VXLAN(struct mlx4_dev *dev, u8 port, u8 steering, int enable);
+int mlx4_find_cached_mac(struct mlx4_dev *dev, u8 port, u64 mac, int *idx);
+int mlx4_find_cached_vlan(struct mlx4_dev *dev, u8 port, u16 vid, int *idx);
+int mlx4_register_vlan(struct mlx4_dev *dev, u8 port, u16 vlan, int *index);
+void mlx4_unregister_vlan(struct mlx4_dev *dev, u8 port, u16 vlan);
+
+int mlx4_map_phys_fmr(struct mlx4_dev *dev, struct mlx4_fmr *fmr, u64 *page_list,
+		      int npages, u64 iova, u32 *lkey, u32 *rkey);
+int mlx4_fmr_alloc(struct mlx4_dev *dev, u32 pd, u32 access, int max_pages,
+		   int max_maps, u8 page_shift, struct mlx4_fmr *fmr);
+int mlx4_fmr_enable(struct mlx4_dev *dev, struct mlx4_fmr *fmr);
+void mlx4_fmr_unmap(struct mlx4_dev *dev, struct mlx4_fmr *fmr,
+		    u32 *lkey, u32 *rkey);
+int mlx4_fmr_free(struct mlx4_dev *dev, struct mlx4_fmr *fmr);
+int mlx4_SYNC_TPT(struct mlx4_dev *dev);
+int mlx4_test_interrupts(struct mlx4_dev *dev);
+int mlx4_assign_eq(struct mlx4_dev *dev, char *name, struct cpu_rmap *rmap,
+		   int *vector);
+void mlx4_release_eq(struct mlx4_dev *dev, int vec);
+
+int mlx4_eq_get_irq(struct mlx4_dev *dev, int vec);
+
+int mlx4_get_phys_port_id(struct mlx4_dev *dev);
+int mlx4_wol_read(struct mlx4_dev *dev, u64 *config, int port);
+int mlx4_wol_write(struct mlx4_dev *dev, u64 config, int port);
+
+int mlx4_counter_alloc(struct mlx4_dev *dev, u32 *idx);
+void mlx4_counter_free(struct mlx4_dev *dev, u32 idx);
+
+int mlx4_flow_attach(struct mlx4_dev *dev,
+		     struct mlx4_net_trans_rule *rule, u64 *reg_id);
+int mlx4_flow_detach(struct mlx4_dev *dev, u64 reg_id);
+int mlx4_map_sw_to_hw_steering_mode(struct mlx4_dev *dev,
+				    enum mlx4_net_trans_promisc_mode flow_type);
+int mlx4_map_sw_to_hw_steering_id(struct mlx4_dev *dev,
+				  enum mlx4_net_trans_rule_id id);
+int mlx4_hw_rule_sz(struct mlx4_dev *dev, enum mlx4_net_trans_rule_id id);
+
+int mlx4_tunnel_steer_add(struct mlx4_dev *dev, unsigned char *addr,
+			  int port, int qpn, u16 prio, u64 *reg_id);
+
+void mlx4_sync_pkey_table(struct mlx4_dev *dev, int slave, int port,
+			  int i, int val);
+
+int mlx4_get_parav_qkey(struct mlx4_dev *dev, u32 qpn, u32 *qkey);
+
+int mlx4_is_slave_active(struct mlx4_dev *dev, int slave);
+int mlx4_gen_pkey_eqe(struct mlx4_dev *dev, int slave, u8 port);
+int mlx4_gen_guid_change_eqe(struct mlx4_dev *dev, int slave, u8 port);
+int mlx4_gen_slaves_port_mgt_ev(struct mlx4_dev *dev, u8 port, int attr);
+int mlx4_gen_port_state_change_eqe(struct mlx4_dev *dev, int slave, u8 port, u8 port_subtype_change);
+enum slave_port_state mlx4_get_slave_port_state(struct mlx4_dev *dev, int slave, u8 port);
+int set_and_calc_slave_port_state(struct mlx4_dev *dev, int slave, u8 port, int event, enum slave_port_gen_event *gen_event);
+
+void mlx4_put_slave_node_guid(struct mlx4_dev *dev, int slave, __be64 guid);
+__be64 mlx4_get_slave_node_guid(struct mlx4_dev *dev, int slave);
+
+int mlx4_get_slave_from_roce_gid(struct mlx4_dev *dev, int port, u8 *gid,
+				 int *slave_id);
+int mlx4_get_roce_gid_from_slave(struct mlx4_dev *dev, int port, int slave_id,
+				 u8 *gid);
+
+int mlx4_FLOW_STEERING_IB_UC_QP_RANGE(struct mlx4_dev *dev, u32 min_range_qpn,
+				      u32 max_range_qpn);
+
+cycle_t mlx4_read_clock(struct mlx4_dev *dev);
+
+struct mlx4_active_ports {
+	DECLARE_BITMAP(ports, MLX4_MAX_PORTS);
+};
+/* Returns a bitmap of the physical ports which are assigned to slave */
+struct mlx4_active_ports mlx4_get_active_ports(struct mlx4_dev *dev, int slave);
+
+/* Returns the physical port that represents the virtual port of the slave, */
+/* or a value < 0 in case of an error. If a slave has 2 ports, the identity */
+/* mapping is returned.							    */
+int mlx4_slave_convert_port(struct mlx4_dev *dev, int slave, int port);
+
+struct mlx4_slaves_pport {
+	DECLARE_BITMAP(slaves, MLX4_MFUNC_MAX);
+};
+/* Returns a bitmap of all slaves that are assigned to port. */
+struct mlx4_slaves_pport mlx4_phys_to_slaves_pport(struct mlx4_dev *dev,
+						   int port);
+
+/* Returns a bitmap of all slaves that are assigned exactly to all the */
+/* the ports that are set in crit_ports.			       */
+struct mlx4_slaves_pport mlx4_phys_to_slaves_pport_actv(
+		struct mlx4_dev *dev,
+		const struct mlx4_active_ports *crit_ports);
+
+/* Returns the slave's virtual port that represents the physical port. */
+int mlx4_phys_to_slave_port(struct mlx4_dev *dev, int slave, int port);
+
+int mlx4_get_base_gid_ix(struct mlx4_dev *dev, int slave, int port);
+
+int mlx4_config_vxlan_port(struct mlx4_dev *dev, __be16 udp_port);
+int mlx4_vf_smi_enabled(struct mlx4_dev *dev, int slave, int port);
+int mlx4_vf_get_enable_smi_admin(struct mlx4_dev *dev, int slave, int port);
+int mlx4_vf_set_enable_smi_admin(struct mlx4_dev *dev, int slave, int port,
+				 int enable);
+int mlx4_mr_hw_get_mpt(struct mlx4_dev *dev, struct mlx4_mr *mmr,
+		       struct mlx4_mpt_entry ***mpt_entry);
+int mlx4_mr_hw_write_mpt(struct mlx4_dev *dev, struct mlx4_mr *mmr,
+			 struct mlx4_mpt_entry **mpt_entry);
+int mlx4_mr_hw_change_pd(struct mlx4_dev *dev, struct mlx4_mpt_entry *mpt_entry,
+			 u32 pdn);
+int mlx4_mr_hw_change_access(struct mlx4_dev *dev,
+			     struct mlx4_mpt_entry *mpt_entry,
+			     u32 access);
+void mlx4_mr_hw_put_mpt(struct mlx4_dev *dev,
+			struct mlx4_mpt_entry **mpt_entry);
+void mlx4_mr_rereg_mem_cleanup(struct mlx4_dev *dev, struct mlx4_mr *mr);
+int mlx4_mr_rereg_mem_write(struct mlx4_dev *dev, struct mlx4_mr *mr,
+			    u64 iova, u64 size, int npages,
+			    int page_shift, struct mlx4_mpt_entry *mpt_entry);
+
+/* Returns true if running in low memory profile (kdump kernel) */
+static inline bool mlx4_low_memory_profile(void)
+{
+	return is_kdump_kernel();
+}
+
+#endif /* MLX4_DEVICE_H */
diff --git a/include/linux/mlx4/doorbell.h b/include/linux/mlx4/doorbell.h
new file mode 100644
index 0000000..f31bba2
--- /dev/null
+++ b/include/linux/mlx4/doorbell.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX4_DOORBELL_H
+#define MLX4_DOORBELL_H
+
+#include <linux/types.h>
+#include <linux/io.h>
+
+#define MLX4_SEND_DOORBELL    0x14
+#define MLX4_CQ_DOORBELL      0x20
+
+#if BITS_PER_LONG == 64
+/*
+ * Assume that we can just write a 64-bit doorbell atomically.  s390
+ * actually doesn't have writeq() but S/390 systems don't even have
+ * PCI so we won't worry about it.
+ */
+
+#define MLX4_DECLARE_DOORBELL_LOCK(name)
+#define MLX4_INIT_DOORBELL_LOCK(ptr)    do { } while (0)
+#define MLX4_GET_DOORBELL_LOCK(ptr)      (NULL)
+
+static inline void mlx4_write64(__be32 val[2], void __iomem *dest,
+				spinlock_t *doorbell_lock)
+{
+	__raw_writeq(*(u64 *) val, dest);
+}
+
+#else
+
+/*
+ * Just fall back to a spinlock to protect the doorbell if
+ * BITS_PER_LONG is 32 -- there's no portable way to do atomic 64-bit
+ * MMIO writes.
+ */
+
+#define MLX4_DECLARE_DOORBELL_LOCK(name) spinlock_t name;
+#define MLX4_INIT_DOORBELL_LOCK(ptr)     spin_lock_init(ptr)
+#define MLX4_GET_DOORBELL_LOCK(ptr)      (ptr)
+
+static inline void mlx4_write64(__be32 val[2], void __iomem *dest,
+				spinlock_t *doorbell_lock)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(doorbell_lock, flags);
+	__raw_writel((__force u32) val[0], dest);
+	__raw_writel((__force u32) val[1], dest + 4);
+	spin_unlock_irqrestore(doorbell_lock, flags);
+}
+
+#endif
+
+#endif /* MLX4_DOORBELL_H */
diff --git a/include/linux/mlx4/driver.h b/include/linux/mlx4/driver.h
new file mode 100644
index 0000000..022055c
--- /dev/null
+++ b/include/linux/mlx4/driver.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2006 Cisco Systems, Inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX4_DRIVER_H
+#define MLX4_DRIVER_H
+
+#include <linux/mlx4/device.h>
+
+struct mlx4_dev;
+
+#define MLX4_MAC_MASK	   0xffffffffffffULL
+
+enum mlx4_dev_event {
+	MLX4_DEV_EVENT_CATASTROPHIC_ERROR,
+	MLX4_DEV_EVENT_PORT_UP,
+	MLX4_DEV_EVENT_PORT_DOWN,
+	MLX4_DEV_EVENT_PORT_REINIT,
+	MLX4_DEV_EVENT_PORT_MGMT_CHANGE,
+	MLX4_DEV_EVENT_SLAVE_INIT,
+	MLX4_DEV_EVENT_SLAVE_SHUTDOWN,
+};
+
+struct mlx4_interface {
+	void *			(*add)	 (struct mlx4_dev *dev);
+	void			(*remove)(struct mlx4_dev *dev, void *context);
+	void			(*event) (struct mlx4_dev *dev, void *context,
+					  enum mlx4_dev_event event, unsigned long param);
+	void *			(*get_dev)(struct mlx4_dev *dev, void *context, u8 port);
+	struct list_head	list;
+	enum mlx4_protocol	protocol;
+};
+
+int mlx4_register_interface(struct mlx4_interface *intf);
+void mlx4_unregister_interface(struct mlx4_interface *intf);
+
+void *mlx4_get_protocol_dev(struct mlx4_dev *dev, enum mlx4_protocol proto, int port);
+
+static inline u64 mlx4_mac_to_u64(u8 *addr)
+{
+	u64 mac = 0;
+	int i;
+
+	for (i = 0; i < ETH_ALEN; i++) {
+		mac <<= 8;
+		mac |= addr[i];
+	}
+	return mac;
+}
+
+#endif /* MLX4_DRIVER_H */
diff --git a/include/linux/mlx4/qp.h b/include/linux/mlx4/qp.h
new file mode 100644
index 0000000..5f4e36c
--- /dev/null
+++ b/include/linux/mlx4/qp.h
@@ -0,0 +1,462 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *	- Redistributions of source code must retain the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer.
+ *
+ *	- Redistributions in binary form must reproduce the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer in the documentation and/or other materials
+ *	  provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX4_QP_H
+#define MLX4_QP_H
+
+#include <linux/types.h>
+#include <linux/if_ether.h>
+
+#include <linux/mlx4/device.h>
+
+#define MLX4_INVALID_LKEY	0x100
+
+enum mlx4_qp_optpar {
+	MLX4_QP_OPTPAR_ALT_ADDR_PATH		= 1 << 0,
+	MLX4_QP_OPTPAR_RRE			= 1 << 1,
+	MLX4_QP_OPTPAR_RAE			= 1 << 2,
+	MLX4_QP_OPTPAR_RWE			= 1 << 3,
+	MLX4_QP_OPTPAR_PKEY_INDEX		= 1 << 4,
+	MLX4_QP_OPTPAR_Q_KEY			= 1 << 5,
+	MLX4_QP_OPTPAR_RNR_TIMEOUT		= 1 << 6,
+	MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH	= 1 << 7,
+	MLX4_QP_OPTPAR_SRA_MAX			= 1 << 8,
+	MLX4_QP_OPTPAR_RRA_MAX			= 1 << 9,
+	MLX4_QP_OPTPAR_PM_STATE			= 1 << 10,
+	MLX4_QP_OPTPAR_RETRY_COUNT		= 1 << 12,
+	MLX4_QP_OPTPAR_RNR_RETRY		= 1 << 13,
+	MLX4_QP_OPTPAR_ACK_TIMEOUT		= 1 << 14,
+	MLX4_QP_OPTPAR_SCHED_QUEUE		= 1 << 16,
+	MLX4_QP_OPTPAR_COUNTER_INDEX		= 1 << 20,
+	MLX4_QP_OPTPAR_VLAN_STRIPPING		= 1 << 21,
+};
+
+enum mlx4_qp_state {
+	MLX4_QP_STATE_RST			= 0,
+	MLX4_QP_STATE_INIT			= 1,
+	MLX4_QP_STATE_RTR			= 2,
+	MLX4_QP_STATE_RTS			= 3,
+	MLX4_QP_STATE_SQER			= 4,
+	MLX4_QP_STATE_SQD			= 5,
+	MLX4_QP_STATE_ERR			= 6,
+	MLX4_QP_STATE_SQ_DRAINING		= 7,
+	MLX4_QP_NUM_STATE
+};
+
+enum {
+	MLX4_QP_ST_RC				= 0x0,
+	MLX4_QP_ST_UC				= 0x1,
+	MLX4_QP_ST_RD				= 0x2,
+	MLX4_QP_ST_UD				= 0x3,
+	MLX4_QP_ST_XRC				= 0x6,
+	MLX4_QP_ST_MLX				= 0x7
+};
+
+enum {
+	MLX4_QP_PM_MIGRATED			= 0x3,
+	MLX4_QP_PM_ARMED			= 0x0,
+	MLX4_QP_PM_REARM			= 0x1
+};
+
+enum {
+	/* params1 */
+	MLX4_QP_BIT_SRE				= 1 << 15,
+	MLX4_QP_BIT_SWE				= 1 << 14,
+	MLX4_QP_BIT_SAE				= 1 << 13,
+	/* params2 */
+	MLX4_QP_BIT_RRE				= 1 << 15,
+	MLX4_QP_BIT_RWE				= 1 << 14,
+	MLX4_QP_BIT_RAE				= 1 << 13,
+	MLX4_QP_BIT_RIC				= 1 <<	4,
+};
+
+enum {
+	MLX4_RSS_HASH_XOR			= 0,
+	MLX4_RSS_HASH_TOP			= 1,
+
+	MLX4_RSS_UDP_IPV6			= 1 << 0,
+	MLX4_RSS_UDP_IPV4			= 1 << 1,
+	MLX4_RSS_TCP_IPV6			= 1 << 2,
+	MLX4_RSS_IPV6				= 1 << 3,
+	MLX4_RSS_TCP_IPV4			= 1 << 4,
+	MLX4_RSS_IPV4				= 1 << 5,
+
+	MLX4_RSS_BY_OUTER_HEADERS		= 0 << 6,
+	MLX4_RSS_BY_INNER_HEADERS		= 2 << 6,
+	MLX4_RSS_BY_INNER_HEADERS_IPONLY	= 3 << 6,
+
+	/* offset of mlx4_rss_context within mlx4_qp_context.pri_path */
+	MLX4_RSS_OFFSET_IN_QPC_PRI_PATH		= 0x24,
+	/* offset of being RSS indirection QP within mlx4_qp_context.flags */
+	MLX4_RSS_QPC_FLAG_OFFSET		= 13,
+};
+
+struct mlx4_rss_context {
+	__be32			base_qpn;
+	__be32			default_qpn;
+	u16			reserved;
+	u8			hash_fn;
+	u8			flags;
+	__be32			rss_key[10];
+	__be32			base_qpn_udp;
+};
+
+struct mlx4_qp_path {
+	u8			fl;
+	u8			vlan_control;
+	u8			disable_pkey_check;
+	u8			pkey_index;
+	u8			counter_index;
+	u8			grh_mylmc;
+	__be16			rlid;
+	u8			ackto;
+	u8			mgid_index;
+	u8			static_rate;
+	u8			hop_limit;
+	__be32			tclass_flowlabel;
+	u8			rgid[16];
+	u8			sched_queue;
+	u8			vlan_index;
+	u8			feup;
+	u8			fvl_rx;
+	u8			reserved4[2];
+	u8			dmac[ETH_ALEN];
+};
+
+enum { /* fl */
+	MLX4_FL_CV      = 1 << 6,
+	MLX4_FL_ETH_HIDE_CQE_VLAN       = 1 << 2
+};
+enum { /* vlan_control */
+	MLX4_VLAN_CTRL_ETH_TX_BLOCK_TAGGED	= 1 << 6,
+	MLX4_VLAN_CTRL_ETH_TX_BLOCK_PRIO_TAGGED	= 1 << 5, /* 802.1p priority tag */
+	MLX4_VLAN_CTRL_ETH_TX_BLOCK_UNTAGGED	= 1 << 4,
+	MLX4_VLAN_CTRL_ETH_RX_BLOCK_TAGGED	= 1 << 2,
+	MLX4_VLAN_CTRL_ETH_RX_BLOCK_PRIO_TAGGED	= 1 << 1, /* 802.1p priority tag */
+	MLX4_VLAN_CTRL_ETH_RX_BLOCK_UNTAGGED	= 1 << 0
+};
+
+enum { /* feup */
+	MLX4_FEUP_FORCE_ETH_UP          = 1 << 6, /* force Eth UP */
+	MLX4_FSM_FORCE_ETH_SRC_MAC      = 1 << 5, /* force Source MAC */
+	MLX4_FVL_FORCE_ETH_VLAN         = 1 << 3  /* force Eth vlan */
+};
+
+enum { /* fvl_rx */
+	MLX4_FVL_RX_FORCE_ETH_VLAN      = 1 << 0 /* enforce Eth rx vlan */
+};
+
+struct mlx4_qp_context {
+	__be32			flags;
+	__be32			pd;
+	u8			mtu_msgmax;
+	u8			rq_size_stride;
+	u8			sq_size_stride;
+	u8			rlkey;
+	__be32			usr_page;
+	__be32			local_qpn;
+	__be32			remote_qpn;
+	struct			mlx4_qp_path pri_path;
+	struct			mlx4_qp_path alt_path;
+	__be32			params1;
+	u32			reserved1;
+	__be32			next_send_psn;
+	__be32			cqn_send;
+	u32			reserved2[2];
+	__be32			last_acked_psn;
+	__be32			ssn;
+	__be32			params2;
+	__be32			rnr_nextrecvpsn;
+	__be32			xrcd;
+	__be32			cqn_recv;
+	__be64			db_rec_addr;
+	__be32			qkey;
+	__be32			srqn;
+	__be32			msn;
+	__be16			rq_wqe_counter;
+	__be16			sq_wqe_counter;
+	u32			reserved3[2];
+	__be32			param3;
+	__be32			nummmcpeers_basemkey;
+	u8			log_page_size;
+	u8			reserved4[2];
+	u8			mtt_base_addr_h;
+	__be32			mtt_base_addr_l;
+	u32			reserved5[10];
+};
+
+struct mlx4_update_qp_context {
+	__be64			qp_mask;
+	__be64			primary_addr_path_mask;
+	__be64			secondary_addr_path_mask;
+	u64			reserved1;
+	struct mlx4_qp_context	qp_context;
+	u64			reserved2[58];
+};
+
+enum {
+	MLX4_UPD_QP_MASK_PM_STATE	= 32,
+	MLX4_UPD_QP_MASK_VSD		= 33,
+};
+
+enum {
+	MLX4_UPD_QP_PATH_MASK_PKEY_INDEX		= 0 + 32,
+	MLX4_UPD_QP_PATH_MASK_FSM			= 1 + 32,
+	MLX4_UPD_QP_PATH_MASK_MAC_INDEX			= 2 + 32,
+	MLX4_UPD_QP_PATH_MASK_FVL			= 3 + 32,
+	MLX4_UPD_QP_PATH_MASK_CV			= 4 + 32,
+	MLX4_UPD_QP_PATH_MASK_VLAN_INDEX		= 5 + 32,
+	MLX4_UPD_QP_PATH_MASK_ETH_HIDE_CQE_VLAN		= 6 + 32,
+	MLX4_UPD_QP_PATH_MASK_ETH_TX_BLOCK_UNTAGGED	= 7 + 32,
+	MLX4_UPD_QP_PATH_MASK_ETH_TX_BLOCK_1P		= 8 + 32,
+	MLX4_UPD_QP_PATH_MASK_ETH_TX_BLOCK_TAGGED	= 9 + 32,
+	MLX4_UPD_QP_PATH_MASK_ETH_RX_BLOCK_UNTAGGED	= 10 + 32,
+	MLX4_UPD_QP_PATH_MASK_ETH_RX_BLOCK_1P		= 11 + 32,
+	MLX4_UPD_QP_PATH_MASK_ETH_RX_BLOCK_TAGGED	= 12 + 32,
+	MLX4_UPD_QP_PATH_MASK_FEUP			= 13 + 32,
+	MLX4_UPD_QP_PATH_MASK_SCHED_QUEUE		= 14 + 32,
+	MLX4_UPD_QP_PATH_MASK_IF_COUNTER_INDEX		= 15 + 32,
+	MLX4_UPD_QP_PATH_MASK_FVL_RX			= 16 + 32,
+};
+
+enum { /* param3 */
+	MLX4_STRIP_VLAN = 1 << 30
+};
+
+/* Which firmware version adds support for NEC (NoErrorCompletion) bit */
+#define MLX4_FW_VER_WQE_CTRL_NEC mlx4_fw_ver(2, 2, 232)
+
+enum {
+	MLX4_WQE_CTRL_NEC		= 1 << 29,
+	MLX4_WQE_CTRL_IIP		= 1 << 28,
+	MLX4_WQE_CTRL_ILP		= 1 << 27,
+	MLX4_WQE_CTRL_FENCE		= 1 << 6,
+	MLX4_WQE_CTRL_CQ_UPDATE		= 3 << 2,
+	MLX4_WQE_CTRL_SOLICITED		= 1 << 1,
+	MLX4_WQE_CTRL_IP_CSUM		= 1 << 4,
+	MLX4_WQE_CTRL_TCP_UDP_CSUM	= 1 << 5,
+	MLX4_WQE_CTRL_INS_VLAN		= 1 << 6,
+	MLX4_WQE_CTRL_STRONG_ORDER	= 1 << 7,
+	MLX4_WQE_CTRL_FORCE_LOOPBACK	= 1 << 0,
+};
+
+struct mlx4_wqe_ctrl_seg {
+	__be32			owner_opcode;
+	union {
+		struct {
+			__be16			vlan_tag;
+			u8			ins_vlan;
+			u8			fence_size;
+		};
+		__be32			bf_qpn;
+	};
+	/*
+	 * High 24 bits are SRC remote buffer; low 8 bits are flags:
+	 * [7]   SO (strong ordering)
+	 * [5]   TCP/UDP checksum
+	 * [4]   IP checksum
+	 * [3:2] C (generate completion queue entry)
+	 * [1]   SE (solicited event)
+	 * [0]   FL (force loopback)
+	 */
+	union {
+		__be32			srcrb_flags;
+		__be16			srcrb_flags16[2];
+	};
+	/*
+	 * imm is immediate data for send/RDMA write w/ immediate;
+	 * also invalidation key for send with invalidate; input
+	 * modifier for WQEs on CCQs.
+	 */
+	__be32			imm;
+};
+
+enum {
+	MLX4_WQE_MLX_VL15	= 1 << 17,
+	MLX4_WQE_MLX_SLR	= 1 << 16
+};
+
+struct mlx4_wqe_mlx_seg {
+	u8			owner;
+	u8			reserved1[2];
+	u8			opcode;
+	__be16			sched_prio;
+	u8			reserved2;
+	u8			size;
+	/*
+	 * [17]    VL15
+	 * [16]    SLR
+	 * [15:12] static rate
+	 * [11:8]  SL
+	 * [4]     ICRC
+	 * [3:2]   C
+	 * [0]     FL (force loopback)
+	 */
+	__be32			flags;
+	__be16			rlid;
+	u16			reserved3;
+};
+
+struct mlx4_wqe_datagram_seg {
+	__be32			av[8];
+	__be32			dqpn;
+	__be32			qkey;
+	__be16			vlan;
+	u8			mac[ETH_ALEN];
+};
+
+struct mlx4_wqe_lso_seg {
+	__be32			mss_hdr_size;
+	__be32			header[0];
+};
+
+enum mlx4_wqe_bind_seg_flags2 {
+	MLX4_WQE_BIND_ZERO_BASED = (1 << 30),
+	MLX4_WQE_BIND_TYPE_2     = (1 << 31),
+};
+
+struct mlx4_wqe_bind_seg {
+	__be32			flags1;
+	__be32			flags2;
+	__be32			new_rkey;
+	__be32			lkey;
+	__be64			addr;
+	__be64			length;
+};
+
+enum {
+	MLX4_WQE_FMR_PERM_LOCAL_READ	= 1 << 27,
+	MLX4_WQE_FMR_PERM_LOCAL_WRITE	= 1 << 28,
+	MLX4_WQE_FMR_AND_BIND_PERM_REMOTE_READ	= 1 << 29,
+	MLX4_WQE_FMR_AND_BIND_PERM_REMOTE_WRITE	= 1 << 30,
+	MLX4_WQE_FMR_AND_BIND_PERM_ATOMIC	= 1 << 31
+};
+
+struct mlx4_wqe_fmr_seg {
+	__be32			flags;
+	__be32			mem_key;
+	__be64			buf_list;
+	__be64			start_addr;
+	__be64			reg_len;
+	__be32			offset;
+	__be32			page_size;
+	u32			reserved[2];
+};
+
+struct mlx4_wqe_fmr_ext_seg {
+	u8			flags;
+	u8			reserved;
+	__be16			app_mask;
+	__be16			wire_app_tag;
+	__be16			mem_app_tag;
+	__be32			wire_ref_tag_base;
+	__be32			mem_ref_tag_base;
+};
+
+struct mlx4_wqe_local_inval_seg {
+	u64			reserved1;
+	__be32			mem_key;
+	u32			reserved2;
+	u64			reserved3[2];
+};
+
+struct mlx4_wqe_raddr_seg {
+	__be64			raddr;
+	__be32			rkey;
+	u32			reserved;
+};
+
+struct mlx4_wqe_atomic_seg {
+	__be64			swap_add;
+	__be64			compare;
+};
+
+struct mlx4_wqe_masked_atomic_seg {
+	__be64			swap_add;
+	__be64			compare;
+	__be64			swap_add_mask;
+	__be64			compare_mask;
+};
+
+struct mlx4_wqe_data_seg {
+	__be32			byte_count;
+	__be32			lkey;
+	__be64			addr;
+};
+
+enum {
+	MLX4_INLINE_ALIGN	= 64,
+	MLX4_INLINE_SEG		= 1 << 31,
+};
+
+struct mlx4_wqe_inline_seg {
+	__be32			byte_count;
+};
+
+enum mlx4_update_qp_attr {
+	MLX4_UPDATE_QP_SMAC		= 1 << 0,
+	MLX4_UPDATE_QP_VSD		= 1 << 2,
+	MLX4_UPDATE_QP_SUPPORTED_ATTRS	= (1 << 2) - 1
+};
+
+enum mlx4_update_qp_params_flags {
+	MLX4_UPDATE_QP_PARAMS_FLAGS_VSD_ENABLE		= 1 << 0,
+};
+
+struct mlx4_update_qp_params {
+	u8	smac_index;
+	u32	flags;
+};
+
+int mlx4_update_qp(struct mlx4_dev *dev, u32 qpn,
+		   enum mlx4_update_qp_attr attr,
+		   struct mlx4_update_qp_params *params);
+int mlx4_qp_modify(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
+		   enum mlx4_qp_state cur_state, enum mlx4_qp_state new_state,
+		   struct mlx4_qp_context *context, enum mlx4_qp_optpar optpar,
+		   int sqd_event, struct mlx4_qp *qp);
+
+int mlx4_qp_query(struct mlx4_dev *dev, struct mlx4_qp *qp,
+		  struct mlx4_qp_context *context);
+
+int mlx4_qp_to_ready(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
+		     struct mlx4_qp_context *context,
+		     struct mlx4_qp *qp, enum mlx4_qp_state *qp_state);
+
+static inline struct mlx4_qp *__mlx4_qp_lookup(struct mlx4_dev *dev, u32 qpn)
+{
+	return radix_tree_lookup(&dev->qp_table_tree, qpn & (dev->caps.num_qps - 1));
+}
+
+void mlx4_qp_remove(struct mlx4_dev *dev, struct mlx4_qp *qp);
+
+#endif /* MLX4_QP_H */
diff --git a/include/linux/mlx4/srq.h b/include/linux/mlx4/srq.h
new file mode 100644
index 0000000..192e0f7
--- /dev/null
+++ b/include/linux/mlx4/srq.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *	- Redistributions of source code must retain the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer.
+ *
+ *	- Redistributions in binary form must reproduce the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer in the documentation and/or other materials
+ *	  provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX4_SRQ_H
+#define MLX4_SRQ_H
+
+struct mlx4_wqe_srq_next_seg {
+	u16			reserved1;
+	__be16			next_wqe_index;
+	u32			reserved2[3];
+};
+
+struct mlx4_srq *mlx4_srq_lookup(struct mlx4_dev *dev, u32 srqn);
+
+#endif /* MLX4_SRQ_H */
diff --git a/include/linux/mlx5/cmd.h b/include/linux/mlx5/cmd.h
new file mode 100644
index 0000000..2826a4b
--- /dev/null
+++ b/include/linux/mlx5/cmd.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX5_CMD_H
+#define MLX5_CMD_H
+
+#include <linux/types.h>
+
+struct manage_pages_layout {
+	u64	ptr;
+	u32	reserved;
+	u16	num_entries;
+	u16	func_id;
+};
+
+
+struct mlx5_cmd_alloc_uar_imm_out {
+	u32	rsvd[3];
+	u32	uarn;
+};
+
+#endif /* MLX5_CMD_H */
diff --git a/include/linux/mlx5/cq.h b/include/linux/mlx5/cq.h
new file mode 100644
index 0000000..f6b17ac
--- /dev/null
+++ b/include/linux/mlx5/cq.h
@@ -0,0 +1,174 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX5_CORE_CQ_H
+#define MLX5_CORE_CQ_H
+
+#include <rdma/ib_verbs.h>
+#include <linux/mlx5/driver.h>
+
+
+struct mlx5_core_cq {
+	u32			cqn;
+	int			cqe_sz;
+	__be32		       *set_ci_db;
+	__be32		       *arm_db;
+	atomic_t		refcount;
+	struct completion	free;
+	unsigned		vector;
+	int			irqn;
+	void (*comp)		(struct mlx5_core_cq *);
+	void (*event)		(struct mlx5_core_cq *, enum mlx5_event);
+	struct mlx5_uar	       *uar;
+	u32			cons_index;
+	unsigned		arm_sn;
+	struct mlx5_rsc_debug	*dbg;
+	int			pid;
+};
+
+
+enum {
+	MLX5_CQE_SYNDROME_LOCAL_LENGTH_ERR		= 0x01,
+	MLX5_CQE_SYNDROME_LOCAL_QP_OP_ERR		= 0x02,
+	MLX5_CQE_SYNDROME_LOCAL_PROT_ERR		= 0x04,
+	MLX5_CQE_SYNDROME_WR_FLUSH_ERR			= 0x05,
+	MLX5_CQE_SYNDROME_MW_BIND_ERR			= 0x06,
+	MLX5_CQE_SYNDROME_BAD_RESP_ERR			= 0x10,
+	MLX5_CQE_SYNDROME_LOCAL_ACCESS_ERR		= 0x11,
+	MLX5_CQE_SYNDROME_REMOTE_INVAL_REQ_ERR		= 0x12,
+	MLX5_CQE_SYNDROME_REMOTE_ACCESS_ERR		= 0x13,
+	MLX5_CQE_SYNDROME_REMOTE_OP_ERR			= 0x14,
+	MLX5_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR	= 0x15,
+	MLX5_CQE_SYNDROME_RNR_RETRY_EXC_ERR		= 0x16,
+	MLX5_CQE_SYNDROME_REMOTE_ABORTED_ERR		= 0x22,
+};
+
+enum {
+	MLX5_CQE_OWNER_MASK	= 1,
+	MLX5_CQE_REQ		= 0,
+	MLX5_CQE_RESP_WR_IMM	= 1,
+	MLX5_CQE_RESP_SEND	= 2,
+	MLX5_CQE_RESP_SEND_IMM	= 3,
+	MLX5_CQE_RESP_SEND_INV	= 4,
+	MLX5_CQE_RESIZE_CQ	= 5,
+	MLX5_CQE_SIG_ERR	= 12,
+	MLX5_CQE_REQ_ERR	= 13,
+	MLX5_CQE_RESP_ERR	= 14,
+	MLX5_CQE_INVALID	= 15,
+};
+
+enum {
+	MLX5_CQ_MODIFY_PERIOD	= 1 << 0,
+	MLX5_CQ_MODIFY_COUNT	= 1 << 1,
+	MLX5_CQ_MODIFY_OVERRUN	= 1 << 2,
+};
+
+enum {
+	MLX5_CQ_OPMOD_RESIZE		= 1,
+	MLX5_MODIFY_CQ_MASK_LOG_SIZE	= 1 << 0,
+	MLX5_MODIFY_CQ_MASK_PG_OFFSET	= 1 << 1,
+	MLX5_MODIFY_CQ_MASK_PG_SIZE	= 1 << 2,
+};
+
+struct mlx5_cq_modify_params {
+	int	type;
+	union {
+		struct {
+			u32	page_offset;
+			u8	log_cq_size;
+		} resize;
+
+		struct {
+		} moder;
+
+		struct {
+		} mapping;
+	} params;
+};
+
+enum {
+	CQE_SIZE_64 = 0,
+	CQE_SIZE_128 = 1,
+};
+
+static inline int cqe_sz_to_mlx_sz(u8 size)
+{
+	return size == 64 ? CQE_SIZE_64 : CQE_SIZE_128;
+}
+
+static inline void mlx5_cq_set_ci(struct mlx5_core_cq *cq)
+{
+	*cq->set_ci_db = cpu_to_be32(cq->cons_index & 0xffffff);
+}
+
+enum {
+	MLX5_CQ_DB_REQ_NOT_SOL		= 1 << 24,
+	MLX5_CQ_DB_REQ_NOT		= 0 << 24
+};
+
+static inline void mlx5_cq_arm(struct mlx5_core_cq *cq, u32 cmd,
+			       void __iomem *uar_page,
+			       spinlock_t *doorbell_lock)
+{
+	__be32 doorbell[2];
+	u32 sn;
+	u32 ci;
+
+	sn = cq->arm_sn & 3;
+	ci = cq->cons_index & 0xffffff;
+
+	*cq->arm_db = cpu_to_be32(sn << 28 | cmd | ci);
+
+	/* Make sure that the doorbell record in host memory is
+	 * written before ringing the doorbell via PCI MMIO.
+	 */
+	wmb();
+
+	doorbell[0] = cpu_to_be32(sn << 28 | cmd | ci);
+	doorbell[1] = cpu_to_be32(cq->cqn);
+
+	mlx5_write64(doorbell, uar_page + MLX5_CQ_DOORBELL, doorbell_lock);
+}
+
+int mlx5_init_cq_table(struct mlx5_core_dev *dev);
+void mlx5_cleanup_cq_table(struct mlx5_core_dev *dev);
+int mlx5_core_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq,
+			struct mlx5_create_cq_mbox_in *in, int inlen);
+int mlx5_core_destroy_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq);
+int mlx5_core_query_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq,
+		       struct mlx5_query_cq_mbox_out *out);
+int mlx5_core_modify_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq,
+			struct mlx5_modify_cq_mbox_in *in, int in_sz);
+int mlx5_debug_cq_add(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq);
+void mlx5_debug_cq_remove(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq);
+
+#endif /* MLX5_CORE_CQ_H */
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
new file mode 100644
index 0000000..1d67fd3
--- /dev/null
+++ b/include/linux/mlx5/device.h
@@ -0,0 +1,937 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX5_DEVICE_H
+#define MLX5_DEVICE_H
+
+#include <linux/types.h>
+#include <rdma/ib_verbs.h>
+
+#if defined(__LITTLE_ENDIAN)
+#define MLX5_SET_HOST_ENDIANNESS	0
+#elif defined(__BIG_ENDIAN)
+#define MLX5_SET_HOST_ENDIANNESS	0x80
+#else
+#error Host endianness not defined
+#endif
+
+/* helper macros */
+#define __mlx5_nullp(typ) ((struct mlx5_ifc_##typ##_bits *)0)
+#define __mlx5_bit_sz(typ, fld) sizeof(__mlx5_nullp(typ)->fld)
+#define __mlx5_bit_off(typ, fld) ((unsigned)(unsigned long)(&(__mlx5_nullp(typ)->fld)))
+#define __mlx5_dw_off(typ, fld) (__mlx5_bit_off(typ, fld) / 32)
+#define __mlx5_64_off(typ, fld) (__mlx5_bit_off(typ, fld) / 64)
+#define __mlx5_dw_bit_off(typ, fld) (32 - __mlx5_bit_sz(typ, fld) - (__mlx5_bit_off(typ, fld) & 0x1f))
+#define __mlx5_mask(typ, fld) ((u32)((1ull << __mlx5_bit_sz(typ, fld)) - 1))
+#define __mlx5_dw_mask(typ, fld) (__mlx5_mask(typ, fld) << __mlx5_dw_bit_off(typ, fld))
+#define __mlx5_st_sz_bits(typ) sizeof(struct mlx5_ifc_##typ##_bits)
+
+#define MLX5_FLD_SZ_BYTES(typ, fld) (__mlx5_bit_sz(typ, fld) / 8)
+#define MLX5_ST_SZ_BYTES(typ) (sizeof(struct mlx5_ifc_##typ##_bits) / 8)
+#define MLX5_ST_SZ_DW(typ) (sizeof(struct mlx5_ifc_##typ##_bits) / 32)
+#define MLX5_BYTE_OFF(typ, fld) (__mlx5_bit_off(typ, fld) / 8)
+#define MLX5_ADDR_OF(typ, p, fld) ((char *)(p) + MLX5_BYTE_OFF(typ, fld))
+
+/* insert a value to a struct */
+#define MLX5_SET(typ, p, fld, v) do { \
+	BUILD_BUG_ON(__mlx5_st_sz_bits(typ) % 32);             \
+	*((__be32 *)(p) + __mlx5_dw_off(typ, fld)) = \
+	cpu_to_be32((be32_to_cpu(*((__be32 *)(p) + __mlx5_dw_off(typ, fld))) & \
+		     (~__mlx5_dw_mask(typ, fld))) | (((v) & __mlx5_mask(typ, fld)) \
+		     << __mlx5_dw_bit_off(typ, fld))); \
+} while (0)
+
+#define MLX5_GET(typ, p, fld) ((be32_to_cpu(*((__be32 *)(p) +\
+__mlx5_dw_off(typ, fld))) >> __mlx5_dw_bit_off(typ, fld)) & \
+__mlx5_mask(typ, fld))
+
+#define MLX5_GET_PR(typ, p, fld) ({ \
+	u32 ___t = MLX5_GET(typ, p, fld); \
+	pr_debug(#fld " = 0x%x\n", ___t); \
+	___t; \
+})
+
+#define MLX5_SET64(typ, p, fld, v) do { \
+	BUILD_BUG_ON(__mlx5_bit_sz(typ, fld) != 64); \
+	BUILD_BUG_ON(__mlx5_bit_off(typ, fld) % 64); \
+	*((__be64 *)(p) + __mlx5_64_off(typ, fld)) = cpu_to_be64(v); \
+} while (0)
+
+#define MLX5_GET64(typ, p, fld) be64_to_cpu(*((__be64 *)(p) + __mlx5_64_off(typ, fld)))
+
+enum {
+	MLX5_MAX_COMMANDS		= 32,
+	MLX5_CMD_DATA_BLOCK_SIZE	= 512,
+	MLX5_PCI_CMD_XPORT		= 7,
+	MLX5_MKEY_BSF_OCTO_SIZE		= 4,
+	MLX5_MAX_PSVS			= 4,
+};
+
+enum {
+	MLX5_EXTENDED_UD_AV		= 0x80000000,
+};
+
+enum {
+	MLX5_CQ_STATE_ARMED		= 9,
+	MLX5_CQ_STATE_ALWAYS_ARMED	= 0xb,
+	MLX5_CQ_STATE_FIRED		= 0xa,
+};
+
+enum {
+	MLX5_STAT_RATE_OFFSET	= 5,
+};
+
+enum {
+	MLX5_INLINE_SEG = 0x80000000,
+};
+
+enum {
+	MLX5_MIN_PKEY_TABLE_SIZE = 128,
+	MLX5_MAX_LOG_PKEY_TABLE  = 5,
+};
+
+enum {
+	MLX5_PERM_LOCAL_READ	= 1 << 2,
+	MLX5_PERM_LOCAL_WRITE	= 1 << 3,
+	MLX5_PERM_REMOTE_READ	= 1 << 4,
+	MLX5_PERM_REMOTE_WRITE	= 1 << 5,
+	MLX5_PERM_ATOMIC	= 1 << 6,
+	MLX5_PERM_UMR_EN	= 1 << 7,
+};
+
+enum {
+	MLX5_PCIE_CTRL_SMALL_FENCE	= 1 << 0,
+	MLX5_PCIE_CTRL_RELAXED_ORDERING	= 1 << 2,
+	MLX5_PCIE_CTRL_NO_SNOOP		= 1 << 3,
+	MLX5_PCIE_CTRL_TLP_PROCE_EN	= 1 << 6,
+	MLX5_PCIE_CTRL_TPH_MASK		= 3 << 4,
+};
+
+enum {
+	MLX5_ACCESS_MODE_PA	= 0,
+	MLX5_ACCESS_MODE_MTT	= 1,
+	MLX5_ACCESS_MODE_KLM	= 2
+};
+
+enum {
+	MLX5_MKEY_REMOTE_INVAL	= 1 << 24,
+	MLX5_MKEY_FLAG_SYNC_UMR = 1 << 29,
+	MLX5_MKEY_BSF_EN	= 1 << 30,
+	MLX5_MKEY_LEN64		= 1 << 31,
+};
+
+enum {
+	MLX5_EN_RD	= (u64)1,
+	MLX5_EN_WR	= (u64)2
+};
+
+enum {
+	MLX5_BF_REGS_PER_PAGE		= 4,
+	MLX5_MAX_UAR_PAGES		= 1 << 8,
+	MLX5_NON_FP_BF_REGS_PER_PAGE	= 2,
+	MLX5_MAX_UUARS	= MLX5_MAX_UAR_PAGES * MLX5_NON_FP_BF_REGS_PER_PAGE,
+};
+
+enum {
+	MLX5_MKEY_MASK_LEN		= 1ull << 0,
+	MLX5_MKEY_MASK_PAGE_SIZE	= 1ull << 1,
+	MLX5_MKEY_MASK_START_ADDR	= 1ull << 6,
+	MLX5_MKEY_MASK_PD		= 1ull << 7,
+	MLX5_MKEY_MASK_EN_RINVAL	= 1ull << 8,
+	MLX5_MKEY_MASK_EN_SIGERR	= 1ull << 9,
+	MLX5_MKEY_MASK_BSF_EN		= 1ull << 12,
+	MLX5_MKEY_MASK_KEY		= 1ull << 13,
+	MLX5_MKEY_MASK_QPN		= 1ull << 14,
+	MLX5_MKEY_MASK_LR		= 1ull << 17,
+	MLX5_MKEY_MASK_LW		= 1ull << 18,
+	MLX5_MKEY_MASK_RR		= 1ull << 19,
+	MLX5_MKEY_MASK_RW		= 1ull << 20,
+	MLX5_MKEY_MASK_A		= 1ull << 21,
+	MLX5_MKEY_MASK_SMALL_FENCE	= 1ull << 23,
+	MLX5_MKEY_MASK_FREE		= 1ull << 29,
+};
+
+enum mlx5_event {
+	MLX5_EVENT_TYPE_COMP		   = 0x0,
+
+	MLX5_EVENT_TYPE_PATH_MIG	   = 0x01,
+	MLX5_EVENT_TYPE_COMM_EST	   = 0x02,
+	MLX5_EVENT_TYPE_SQ_DRAINED	   = 0x03,
+	MLX5_EVENT_TYPE_SRQ_LAST_WQE	   = 0x13,
+	MLX5_EVENT_TYPE_SRQ_RQ_LIMIT	   = 0x14,
+
+	MLX5_EVENT_TYPE_CQ_ERROR	   = 0x04,
+	MLX5_EVENT_TYPE_WQ_CATAS_ERROR	   = 0x05,
+	MLX5_EVENT_TYPE_PATH_MIG_FAILED	   = 0x07,
+	MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR = 0x10,
+	MLX5_EVENT_TYPE_WQ_ACCESS_ERROR	   = 0x11,
+	MLX5_EVENT_TYPE_SRQ_CATAS_ERROR	   = 0x12,
+
+	MLX5_EVENT_TYPE_INTERNAL_ERROR	   = 0x08,
+	MLX5_EVENT_TYPE_PORT_CHANGE	   = 0x09,
+	MLX5_EVENT_TYPE_GPIO_EVENT	   = 0x15,
+	MLX5_EVENT_TYPE_REMOTE_CONFIG	   = 0x19,
+
+	MLX5_EVENT_TYPE_DB_BF_CONGESTION   = 0x1a,
+	MLX5_EVENT_TYPE_STALL_EVENT	   = 0x1b,
+
+	MLX5_EVENT_TYPE_CMD		   = 0x0a,
+	MLX5_EVENT_TYPE_PAGE_REQUEST	   = 0xb,
+};
+
+enum {
+	MLX5_PORT_CHANGE_SUBTYPE_DOWN		= 1,
+	MLX5_PORT_CHANGE_SUBTYPE_ACTIVE		= 4,
+	MLX5_PORT_CHANGE_SUBTYPE_INITIALIZED	= 5,
+	MLX5_PORT_CHANGE_SUBTYPE_LID		= 6,
+	MLX5_PORT_CHANGE_SUBTYPE_PKEY		= 7,
+	MLX5_PORT_CHANGE_SUBTYPE_GUID		= 8,
+	MLX5_PORT_CHANGE_SUBTYPE_CLIENT_REREG	= 9,
+};
+
+enum {
+	MLX5_DEV_CAP_FLAG_RC		= 1LL <<  0,
+	MLX5_DEV_CAP_FLAG_UC		= 1LL <<  1,
+	MLX5_DEV_CAP_FLAG_UD		= 1LL <<  2,
+	MLX5_DEV_CAP_FLAG_XRC		= 1LL <<  3,
+	MLX5_DEV_CAP_FLAG_SRQ		= 1LL <<  6,
+	MLX5_DEV_CAP_FLAG_BAD_PKEY_CNTR	= 1LL <<  8,
+	MLX5_DEV_CAP_FLAG_BAD_QKEY_CNTR	= 1LL <<  9,
+	MLX5_DEV_CAP_FLAG_APM		= 1LL << 17,
+	MLX5_DEV_CAP_FLAG_ATOMIC	= 1LL << 18,
+	MLX5_DEV_CAP_FLAG_BLOCK_MCAST	= 1LL << 23,
+	MLX5_DEV_CAP_FLAG_ON_DMND_PG	= 1LL << 24,
+	MLX5_DEV_CAP_FLAG_CQ_MODER	= 1LL << 29,
+	MLX5_DEV_CAP_FLAG_RESIZE_CQ	= 1LL << 30,
+	MLX5_DEV_CAP_FLAG_RESIZE_SRQ	= 1LL << 32,
+	MLX5_DEV_CAP_FLAG_DCT		= 1LL << 37,
+	MLX5_DEV_CAP_FLAG_REMOTE_FENCE	= 1LL << 38,
+	MLX5_DEV_CAP_FLAG_TLP_HINTS	= 1LL << 39,
+	MLX5_DEV_CAP_FLAG_SIG_HAND_OVER	= 1LL << 40,
+	MLX5_DEV_CAP_FLAG_CMDIF_CSUM	= 3LL << 46,
+};
+
+enum {
+	MLX5_OPCODE_NOP			= 0x00,
+	MLX5_OPCODE_SEND_INVAL		= 0x01,
+	MLX5_OPCODE_RDMA_WRITE		= 0x08,
+	MLX5_OPCODE_RDMA_WRITE_IMM	= 0x09,
+	MLX5_OPCODE_SEND		= 0x0a,
+	MLX5_OPCODE_SEND_IMM		= 0x0b,
+	MLX5_OPCODE_RDMA_READ		= 0x10,
+	MLX5_OPCODE_ATOMIC_CS		= 0x11,
+	MLX5_OPCODE_ATOMIC_FA		= 0x12,
+	MLX5_OPCODE_ATOMIC_MASKED_CS	= 0x14,
+	MLX5_OPCODE_ATOMIC_MASKED_FA	= 0x15,
+	MLX5_OPCODE_BIND_MW		= 0x18,
+	MLX5_OPCODE_CONFIG_CMD		= 0x1f,
+
+	MLX5_RECV_OPCODE_RDMA_WRITE_IMM	= 0x00,
+	MLX5_RECV_OPCODE_SEND		= 0x01,
+	MLX5_RECV_OPCODE_SEND_IMM	= 0x02,
+	MLX5_RECV_OPCODE_SEND_INVAL	= 0x03,
+
+	MLX5_CQE_OPCODE_ERROR		= 0x1e,
+	MLX5_CQE_OPCODE_RESIZE		= 0x16,
+
+	MLX5_OPCODE_SET_PSV		= 0x20,
+	MLX5_OPCODE_GET_PSV		= 0x21,
+	MLX5_OPCODE_CHECK_PSV		= 0x22,
+	MLX5_OPCODE_RGET_PSV		= 0x26,
+	MLX5_OPCODE_RCHECK_PSV		= 0x27,
+
+	MLX5_OPCODE_UMR			= 0x25,
+
+};
+
+enum {
+	MLX5_SET_PORT_RESET_QKEY	= 0,
+	MLX5_SET_PORT_GUID0		= 16,
+	MLX5_SET_PORT_NODE_GUID		= 17,
+	MLX5_SET_PORT_SYS_GUID		= 18,
+	MLX5_SET_PORT_GID_TABLE		= 19,
+	MLX5_SET_PORT_PKEY_TABLE	= 20,
+};
+
+enum {
+	MLX5_MAX_PAGE_SHIFT		= 31
+};
+
+enum {
+	MLX5_ADAPTER_PAGE_SHIFT		= 12,
+	MLX5_ADAPTER_PAGE_SIZE		= 1 << MLX5_ADAPTER_PAGE_SHIFT,
+};
+
+enum {
+	MLX5_CAP_OFF_CMDIF_CSUM		= 46,
+};
+
+enum {
+	HCA_CAP_OPMOD_GET_MAX	= 0,
+	HCA_CAP_OPMOD_GET_CUR	= 1,
+};
+
+struct mlx5_inbox_hdr {
+	__be16		opcode;
+	u8		rsvd[4];
+	__be16		opmod;
+};
+
+struct mlx5_outbox_hdr {
+	u8		status;
+	u8		rsvd[3];
+	__be32		syndrome;
+};
+
+struct mlx5_cmd_query_adapter_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	u8			rsvd[8];
+};
+
+struct mlx5_cmd_query_adapter_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	u8			rsvd0[24];
+	u8			intapin;
+	u8			rsvd1[13];
+	__be16			vsd_vendor_id;
+	u8			vsd[208];
+	u8			vsd_psid[16];
+};
+
+struct mlx5_cmd_init_hca_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	u8			rsvd0[2];
+	__be16			profile;
+	u8			rsvd1[4];
+};
+
+struct mlx5_cmd_init_hca_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	u8			rsvd[8];
+};
+
+struct mlx5_cmd_teardown_hca_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	u8			rsvd0[2];
+	__be16			profile;
+	u8			rsvd1[4];
+};
+
+struct mlx5_cmd_teardown_hca_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	u8			rsvd[8];
+};
+
+struct mlx5_cmd_layout {
+	u8		type;
+	u8		rsvd0[3];
+	__be32		inlen;
+	__be64		in_ptr;
+	__be32		in[4];
+	__be32		out[4];
+	__be64		out_ptr;
+	__be32		outlen;
+	u8		token;
+	u8		sig;
+	u8		rsvd1;
+	u8		status_own;
+};
+
+
+struct health_buffer {
+	__be32		assert_var[5];
+	__be32		rsvd0[3];
+	__be32		assert_exit_ptr;
+	__be32		assert_callra;
+	__be32		rsvd1[2];
+	__be32		fw_ver;
+	__be32		hw_id;
+	__be32		rsvd2;
+	u8		irisc_index;
+	u8		synd;
+	__be16		ext_sync;
+};
+
+struct mlx5_init_seg {
+	__be32			fw_rev;
+	__be32			cmdif_rev_fw_sub;
+	__be32			rsvd0[2];
+	__be32			cmdq_addr_h;
+	__be32			cmdq_addr_l_sz;
+	__be32			cmd_dbell;
+	__be32			rsvd1[121];
+	struct health_buffer	health;
+	__be32			rsvd2[884];
+	__be32			health_counter;
+	__be32			rsvd3[1019];
+	__be64			ieee1588_clk;
+	__be32			ieee1588_clk_type;
+	__be32			clr_intx;
+};
+
+struct mlx5_eqe_comp {
+	__be32	reserved[6];
+	__be32	cqn;
+};
+
+struct mlx5_eqe_qp_srq {
+	__be32	reserved[6];
+	__be32	qp_srq_n;
+};
+
+struct mlx5_eqe_cq_err {
+	__be32	cqn;
+	u8	reserved1[7];
+	u8	syndrome;
+};
+
+struct mlx5_eqe_port_state {
+	u8	reserved0[8];
+	u8	port;
+};
+
+struct mlx5_eqe_gpio {
+	__be32	reserved0[2];
+	__be64	gpio_event;
+};
+
+struct mlx5_eqe_congestion {
+	u8	type;
+	u8	rsvd0;
+	u8	congestion_level;
+};
+
+struct mlx5_eqe_stall_vl {
+	u8	rsvd0[3];
+	u8	port_vl;
+};
+
+struct mlx5_eqe_cmd {
+	__be32	vector;
+	__be32	rsvd[6];
+};
+
+struct mlx5_eqe_page_req {
+	u8		rsvd0[2];
+	__be16		func_id;
+	__be32		num_pages;
+	__be32		rsvd1[5];
+};
+
+union ev_data {
+	__be32				raw[7];
+	struct mlx5_eqe_cmd		cmd;
+	struct mlx5_eqe_comp		comp;
+	struct mlx5_eqe_qp_srq		qp_srq;
+	struct mlx5_eqe_cq_err		cq_err;
+	struct mlx5_eqe_port_state	port;
+	struct mlx5_eqe_gpio		gpio;
+	struct mlx5_eqe_congestion	cong;
+	struct mlx5_eqe_stall_vl	stall_vl;
+	struct mlx5_eqe_page_req	req_pages;
+} __packed;
+
+struct mlx5_eqe {
+	u8		rsvd0;
+	u8		type;
+	u8		rsvd1;
+	u8		sub_type;
+	__be32		rsvd2[7];
+	union ev_data	data;
+	__be16		rsvd3;
+	u8		signature;
+	u8		owner;
+} __packed;
+
+struct mlx5_cmd_prot_block {
+	u8		data[MLX5_CMD_DATA_BLOCK_SIZE];
+	u8		rsvd0[48];
+	__be64		next;
+	__be32		block_num;
+	u8		rsvd1;
+	u8		token;
+	u8		ctrl_sig;
+	u8		sig;
+};
+
+struct mlx5_err_cqe {
+	u8	rsvd0[32];
+	__be32	srqn;
+	u8	rsvd1[18];
+	u8	vendor_err_synd;
+	u8	syndrome;
+	__be32	s_wqe_opcode_qpn;
+	__be16	wqe_counter;
+	u8	signature;
+	u8	op_own;
+};
+
+struct mlx5_cqe64 {
+	u8		rsvd0[17];
+	u8		ml_path;
+	u8		rsvd20[4];
+	__be16		slid;
+	__be32		flags_rqpn;
+	u8		rsvd28[4];
+	__be32		srqn;
+	__be32		imm_inval_pkey;
+	u8		rsvd40[4];
+	__be32		byte_cnt;
+	__be64		timestamp;
+	__be32		sop_drop_qpn;
+	__be16		wqe_counter;
+	u8		signature;
+	u8		op_own;
+};
+
+struct mlx5_sig_err_cqe {
+	u8		rsvd0[16];
+	__be32		expected_trans_sig;
+	__be32		actual_trans_sig;
+	__be32		expected_reftag;
+	__be32		actual_reftag;
+	__be16		syndrome;
+	u8		rsvd22[2];
+	__be32		mkey;
+	__be64		err_offset;
+	u8		rsvd30[8];
+	__be32		qpn;
+	u8		rsvd38[2];
+	u8		signature;
+	u8		op_own;
+};
+
+struct mlx5_wqe_srq_next_seg {
+	u8			rsvd0[2];
+	__be16			next_wqe_index;
+	u8			signature;
+	u8			rsvd1[11];
+};
+
+union mlx5_ext_cqe {
+	struct ib_grh	grh;
+	u8		inl[64];
+};
+
+struct mlx5_cqe128 {
+	union mlx5_ext_cqe	inl_grh;
+	struct mlx5_cqe64	cqe64;
+};
+
+struct mlx5_srq_ctx {
+	u8			state_log_sz;
+	u8			rsvd0[3];
+	__be32			flags_xrcd;
+	__be32			pgoff_cqn;
+	u8			rsvd1[4];
+	u8			log_pg_sz;
+	u8			rsvd2[7];
+	__be32			pd;
+	__be16			lwm;
+	__be16			wqe_cnt;
+	u8			rsvd3[8];
+	__be64			db_record;
+};
+
+struct mlx5_create_srq_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	__be32			input_srqn;
+	u8			rsvd0[4];
+	struct mlx5_srq_ctx	ctx;
+	u8			rsvd1[208];
+	__be64			pas[0];
+};
+
+struct mlx5_create_srq_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	__be32			srqn;
+	u8			rsvd[4];
+};
+
+struct mlx5_destroy_srq_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	__be32			srqn;
+	u8			rsvd[4];
+};
+
+struct mlx5_destroy_srq_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	u8			rsvd[8];
+};
+
+struct mlx5_query_srq_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	__be32			srqn;
+	u8			rsvd0[4];
+};
+
+struct mlx5_query_srq_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	u8			rsvd0[8];
+	struct mlx5_srq_ctx	ctx;
+	u8			rsvd1[32];
+	__be64			pas[0];
+};
+
+struct mlx5_arm_srq_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	__be32			srqn;
+	__be16			rsvd;
+	__be16			lwm;
+};
+
+struct mlx5_arm_srq_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	u8			rsvd[8];
+};
+
+struct mlx5_cq_context {
+	u8			status;
+	u8			cqe_sz_flags;
+	u8			st;
+	u8			rsvd3;
+	u8			rsvd4[6];
+	__be16			page_offset;
+	__be32			log_sz_usr_page;
+	__be16			cq_period;
+	__be16			cq_max_count;
+	__be16			rsvd20;
+	__be16			c_eqn;
+	u8			log_pg_sz;
+	u8			rsvd25[7];
+	__be32			last_notified_index;
+	__be32			solicit_producer_index;
+	__be32			consumer_counter;
+	__be32			producer_counter;
+	u8			rsvd48[8];
+	__be64			db_record_addr;
+};
+
+struct mlx5_create_cq_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	__be32			input_cqn;
+	u8			rsvdx[4];
+	struct mlx5_cq_context	ctx;
+	u8			rsvd6[192];
+	__be64			pas[0];
+};
+
+struct mlx5_create_cq_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	__be32			cqn;
+	u8			rsvd0[4];
+};
+
+struct mlx5_destroy_cq_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	__be32			cqn;
+	u8			rsvd0[4];
+};
+
+struct mlx5_destroy_cq_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	u8			rsvd0[8];
+};
+
+struct mlx5_query_cq_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	__be32			cqn;
+	u8			rsvd0[4];
+};
+
+struct mlx5_query_cq_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	u8			rsvd0[8];
+	struct mlx5_cq_context	ctx;
+	u8			rsvd6[16];
+	__be64			pas[0];
+};
+
+struct mlx5_modify_cq_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	__be32			cqn;
+	__be32			field_select;
+	struct mlx5_cq_context	ctx;
+	u8			rsvd[192];
+	__be64			pas[0];
+};
+
+struct mlx5_modify_cq_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	u8			rsvd[8];
+};
+
+struct mlx5_enable_hca_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	u8			rsvd[8];
+};
+
+struct mlx5_enable_hca_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	u8			rsvd[8];
+};
+
+struct mlx5_disable_hca_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	u8			rsvd[8];
+};
+
+struct mlx5_disable_hca_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	u8			rsvd[8];
+};
+
+struct mlx5_eq_context {
+	u8			status;
+	u8			ec_oi;
+	u8			st;
+	u8			rsvd2[7];
+	__be16			page_pffset;
+	__be32			log_sz_usr_page;
+	u8			rsvd3[7];
+	u8			intr;
+	u8			log_page_size;
+	u8			rsvd4[15];
+	__be32			consumer_counter;
+	__be32			produser_counter;
+	u8			rsvd5[16];
+};
+
+struct mlx5_create_eq_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	u8			rsvd0[3];
+	u8			input_eqn;
+	u8			rsvd1[4];
+	struct mlx5_eq_context	ctx;
+	u8			rsvd2[8];
+	__be64			events_mask;
+	u8			rsvd3[176];
+	__be64			pas[0];
+};
+
+struct mlx5_create_eq_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	u8			rsvd0[3];
+	u8			eq_number;
+	u8			rsvd1[4];
+};
+
+struct mlx5_destroy_eq_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	u8			rsvd0[3];
+	u8			eqn;
+	u8			rsvd1[4];
+};
+
+struct mlx5_destroy_eq_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	u8			rsvd[8];
+};
+
+struct mlx5_map_eq_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	__be64			mask;
+	u8			mu;
+	u8			rsvd0[2];
+	u8			eqn;
+	u8			rsvd1[24];
+};
+
+struct mlx5_map_eq_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	u8			rsvd[8];
+};
+
+struct mlx5_query_eq_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	u8			rsvd0[3];
+	u8			eqn;
+	u8			rsvd1[4];
+};
+
+struct mlx5_query_eq_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	u8			rsvd[8];
+	struct mlx5_eq_context	ctx;
+};
+
+struct mlx5_mkey_seg {
+	/* This is a two bit field occupying bits 31-30.
+	 * bit 31 is always 0,
+	 * bit 30 is zero for regular MRs and 1 (e.g free) for UMRs that do not have tanslation
+	 */
+	u8		status;
+	u8		pcie_control;
+	u8		flags;
+	u8		version;
+	__be32		qpn_mkey7_0;
+	u8		rsvd1[4];
+	__be32		flags_pd;
+	__be64		start_addr;
+	__be64		len;
+	__be32		bsfs_octo_size;
+	u8		rsvd2[16];
+	__be32		xlt_oct_size;
+	u8		rsvd3[3];
+	u8		log2_page_size;
+	u8		rsvd4[4];
+};
+
+struct mlx5_query_special_ctxs_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	u8			rsvd[8];
+};
+
+struct mlx5_query_special_ctxs_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	__be32			dump_fill_mkey;
+	__be32			reserved_lkey;
+};
+
+struct mlx5_create_mkey_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	__be32			input_mkey_index;
+	u8			rsvd0[4];
+	struct mlx5_mkey_seg	seg;
+	u8			rsvd1[16];
+	__be32			xlat_oct_act_size;
+	__be32			rsvd2;
+	u8			rsvd3[168];
+	__be64			pas[0];
+};
+
+struct mlx5_create_mkey_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	__be32			mkey;
+	u8			rsvd[4];
+};
+
+struct mlx5_destroy_mkey_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	__be32			mkey;
+	u8			rsvd[4];
+};
+
+struct mlx5_destroy_mkey_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	u8			rsvd[8];
+};
+
+struct mlx5_query_mkey_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	__be32			mkey;
+};
+
+struct mlx5_query_mkey_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	__be64			pas[0];
+};
+
+struct mlx5_modify_mkey_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	__be32			mkey;
+	__be64			pas[0];
+};
+
+struct mlx5_modify_mkey_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	u8			rsvd[8];
+};
+
+struct mlx5_dump_mkey_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+};
+
+struct mlx5_dump_mkey_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	__be32			mkey;
+};
+
+struct mlx5_mad_ifc_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	__be16			remote_lid;
+	u8			rsvd0;
+	u8			port;
+	u8			rsvd1[4];
+	u8			data[256];
+};
+
+struct mlx5_mad_ifc_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	u8			rsvd[8];
+	u8			data[256];
+};
+
+struct mlx5_access_reg_mbox_in {
+	struct mlx5_inbox_hdr		hdr;
+	u8				rsvd0[2];
+	__be16				register_id;
+	__be32				arg;
+	__be32				data[0];
+};
+
+struct mlx5_access_reg_mbox_out {
+	struct mlx5_outbox_hdr		hdr;
+	u8				rsvd[8];
+	__be32				data[0];
+};
+
+#define MLX5_ATTR_EXTENDED_PORT_INFO	cpu_to_be16(0xff90)
+
+enum {
+	MLX_EXT_PORT_CAP_FLAG_EXTENDED_PORT_INFO	= 1 <<  0
+};
+
+struct mlx5_allocate_psv_in {
+	struct mlx5_inbox_hdr   hdr;
+	__be32			npsv_pd;
+	__be32			rsvd_psv0;
+};
+
+struct mlx5_allocate_psv_out {
+	struct mlx5_outbox_hdr  hdr;
+	u8			rsvd[8];
+	__be32			psv_idx[4];
+};
+
+struct mlx5_destroy_psv_in {
+	struct mlx5_inbox_hdr	hdr;
+	__be32                  psv_number;
+	u8                      rsvd[4];
+};
+
+struct mlx5_destroy_psv_out {
+	struct mlx5_outbox_hdr  hdr;
+	u8                      rsvd[8];
+};
+
+#endif /* MLX5_DEVICE_H */
diff --git a/include/linux/mlx5/doorbell.h b/include/linux/mlx5/doorbell.h
new file mode 100644
index 0000000..163a818
--- /dev/null
+++ b/include/linux/mlx5/doorbell.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX5_DOORBELL_H
+#define MLX5_DOORBELL_H
+
+#define MLX5_BF_OFFSET	      0x800
+#define MLX5_CQ_DOORBELL      0x20
+
+#if BITS_PER_LONG == 64
+/* Assume that we can just write a 64-bit doorbell atomically.  s390
+ * actually doesn't have writeq() but S/390 systems don't even have
+ * PCI so we won't worry about it.
+ */
+
+#define MLX5_DECLARE_DOORBELL_LOCK(name)
+#define MLX5_INIT_DOORBELL_LOCK(ptr)    do { } while (0)
+#define MLX5_GET_DOORBELL_LOCK(ptr)      (NULL)
+
+static inline void mlx5_write64(__be32 val[2], void __iomem *dest,
+				spinlock_t *doorbell_lock)
+{
+	__raw_writeq(*(u64 *)val, dest);
+}
+
+#else
+
+/* Just fall back to a spinlock to protect the doorbell if
+ * BITS_PER_LONG is 32 -- there's no portable way to do atomic 64-bit
+ * MMIO writes.
+ */
+
+#define MLX5_DECLARE_DOORBELL_LOCK(name) spinlock_t name;
+#define MLX5_INIT_DOORBELL_LOCK(ptr)     spin_lock_init(ptr)
+#define MLX5_GET_DOORBELL_LOCK(ptr)      (ptr)
+
+static inline void mlx5_write64(__be32 val[2], void __iomem *dest,
+				spinlock_t *doorbell_lock)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(doorbell_lock, flags);
+	__raw_writel((__force u32) val[0], dest);
+	__raw_writel((__force u32) val[1], dest + 4);
+	spin_unlock_irqrestore(doorbell_lock, flags);
+}
+
+#endif
+
+#endif /* MLX5_DOORBELL_H */
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
new file mode 100644
index 0000000..246310d
--- /dev/null
+++ b/include/linux/mlx5/driver.h
@@ -0,0 +1,796 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX5_DRIVER_H
+#define MLX5_DRIVER_H
+
+#include <linux/kernel.h>
+#include <linux/completion.h>
+#include <linux/pci.h>
+#include <linux/spinlock_types.h>
+#include <linux/semaphore.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/radix-tree.h>
+
+#include <linux/mlx5/device.h>
+#include <linux/mlx5/doorbell.h>
+#include <linux/mlx5/mlx5_ifc.h>
+
+enum {
+	MLX5_BOARD_ID_LEN = 64,
+	MLX5_MAX_NAME_LEN = 16,
+};
+
+enum {
+	/* one minute for the sake of bringup. Generally, commands must always
+	 * complete and we may need to increase this timeout value
+	 */
+	MLX5_CMD_TIMEOUT_MSEC	= 7200 * 1000,
+	MLX5_CMD_WQ_MAX_NAME	= 32,
+};
+
+enum {
+	CMD_OWNER_SW		= 0x0,
+	CMD_OWNER_HW		= 0x1,
+	CMD_STATUS_SUCCESS	= 0,
+};
+
+enum mlx5_sqp_t {
+	MLX5_SQP_SMI		= 0,
+	MLX5_SQP_GSI		= 1,
+	MLX5_SQP_IEEE_1588	= 2,
+	MLX5_SQP_SNIFFER	= 3,
+	MLX5_SQP_SYNC_UMR	= 4,
+};
+
+enum {
+	MLX5_MAX_PORTS	= 2,
+};
+
+enum {
+	MLX5_EQ_VEC_PAGES	 = 0,
+	MLX5_EQ_VEC_CMD		 = 1,
+	MLX5_EQ_VEC_ASYNC	 = 2,
+	MLX5_EQ_VEC_COMP_BASE,
+};
+
+enum {
+	MLX5_MAX_EQ_NAME	= 32
+};
+
+enum {
+	MLX5_ATOMIC_MODE_IB_COMP	= 1 << 16,
+	MLX5_ATOMIC_MODE_CX		= 2 << 16,
+	MLX5_ATOMIC_MODE_8B		= 3 << 16,
+	MLX5_ATOMIC_MODE_16B		= 4 << 16,
+	MLX5_ATOMIC_MODE_32B		= 5 << 16,
+	MLX5_ATOMIC_MODE_64B		= 6 << 16,
+	MLX5_ATOMIC_MODE_128B		= 7 << 16,
+	MLX5_ATOMIC_MODE_256B		= 8 << 16,
+};
+
+enum {
+	MLX5_REG_PCAP		 = 0x5001,
+	MLX5_REG_PMTU		 = 0x5003,
+	MLX5_REG_PTYS		 = 0x5004,
+	MLX5_REG_PAOS		 = 0x5006,
+	MLX5_REG_PMAOS		 = 0x5012,
+	MLX5_REG_PUDE		 = 0x5009,
+	MLX5_REG_PMPE		 = 0x5010,
+	MLX5_REG_PELC		 = 0x500e,
+	MLX5_REG_PMLP		 = 0, /* TBD */
+	MLX5_REG_NODE_DESC	 = 0x6001,
+	MLX5_REG_HOST_ENDIANNESS = 0x7004,
+};
+
+enum dbg_rsc_type {
+	MLX5_DBG_RSC_QP,
+	MLX5_DBG_RSC_EQ,
+	MLX5_DBG_RSC_CQ,
+};
+
+struct mlx5_field_desc {
+	struct dentry	       *dent;
+	int			i;
+};
+
+struct mlx5_rsc_debug {
+	struct mlx5_core_dev   *dev;
+	void		       *object;
+	enum dbg_rsc_type	type;
+	struct dentry	       *root;
+	struct mlx5_field_desc	fields[0];
+};
+
+enum mlx5_dev_event {
+	MLX5_DEV_EVENT_SYS_ERROR,
+	MLX5_DEV_EVENT_PORT_UP,
+	MLX5_DEV_EVENT_PORT_DOWN,
+	MLX5_DEV_EVENT_PORT_INITIALIZED,
+	MLX5_DEV_EVENT_LID_CHANGE,
+	MLX5_DEV_EVENT_PKEY_CHANGE,
+	MLX5_DEV_EVENT_GUID_CHANGE,
+	MLX5_DEV_EVENT_CLIENT_REREG,
+};
+
+struct mlx5_uuar_info {
+	struct mlx5_uar	       *uars;
+	int			num_uars;
+	int			num_low_latency_uuars;
+	unsigned long	       *bitmap;
+	unsigned int	       *count;
+	struct mlx5_bf	       *bfs;
+
+	/*
+	 * protect uuar allocation data structs
+	 */
+	struct mutex		lock;
+	u32			ver;
+};
+
+struct mlx5_bf {
+	void __iomem	       *reg;
+	void __iomem	       *regreg;
+	int			buf_size;
+	struct mlx5_uar	       *uar;
+	unsigned long		offset;
+	int			need_lock;
+	/* protect blue flame buffer selection when needed
+	 */
+	spinlock_t		lock;
+
+	/* serialize 64 bit writes when done as two 32 bit accesses
+	 */
+	spinlock_t		lock32;
+	int			uuarn;
+};
+
+struct mlx5_cmd_first {
+	__be32		data[4];
+};
+
+struct mlx5_cmd_msg {
+	struct list_head		list;
+	struct cache_ent	       *cache;
+	u32				len;
+	struct mlx5_cmd_first		first;
+	struct mlx5_cmd_mailbox	       *next;
+};
+
+struct mlx5_cmd_debug {
+	struct dentry	       *dbg_root;
+	struct dentry	       *dbg_in;
+	struct dentry	       *dbg_out;
+	struct dentry	       *dbg_outlen;
+	struct dentry	       *dbg_status;
+	struct dentry	       *dbg_run;
+	void		       *in_msg;
+	void		       *out_msg;
+	u8			status;
+	u16			inlen;
+	u16			outlen;
+};
+
+struct cache_ent {
+	/* protect block chain allocations
+	 */
+	spinlock_t		lock;
+	struct list_head	head;
+};
+
+struct cmd_msg_cache {
+	struct cache_ent	large;
+	struct cache_ent	med;
+
+};
+
+struct mlx5_cmd_stats {
+	u64		sum;
+	u64		n;
+	struct dentry  *root;
+	struct dentry  *avg;
+	struct dentry  *count;
+	/* protect command average calculations */
+	spinlock_t	lock;
+};
+
+struct mlx5_cmd {
+	void	       *cmd_buf;
+	dma_addr_t	dma;
+	u16		cmdif_rev;
+	u8		log_sz;
+	u8		log_stride;
+	int		max_reg_cmds;
+	int		events;
+	u32 __iomem    *vector;
+
+	/* protect command queue allocations
+	 */
+	spinlock_t	alloc_lock;
+
+	/* protect token allocations
+	 */
+	spinlock_t	token_lock;
+	u8		token;
+	unsigned long	bitmask;
+	char		wq_name[MLX5_CMD_WQ_MAX_NAME];
+	struct workqueue_struct *wq;
+	struct semaphore sem;
+	struct semaphore pages_sem;
+	int	mode;
+	struct mlx5_cmd_work_ent *ent_arr[MLX5_MAX_COMMANDS];
+	struct pci_pool *pool;
+	struct mlx5_cmd_debug dbg;
+	struct cmd_msg_cache cache;
+	int checksum_disabled;
+	struct mlx5_cmd_stats stats[MLX5_CMD_OP_MAX];
+};
+
+struct mlx5_port_caps {
+	int	gid_table_len;
+	int	pkey_table_len;
+};
+
+struct mlx5_general_caps {
+	u8	log_max_eq;
+	u8	log_max_cq;
+	u8	log_max_qp;
+	u8	log_max_mkey;
+	u8	log_max_pd;
+	u8	log_max_srq;
+	u8	log_max_strq;
+	u8	log_max_mrw_sz;
+	u8	log_max_bsf_list_size;
+	u8	log_max_klm_list_size;
+	u32	max_cqes;
+	int	max_wqes;
+	u32	max_eqes;
+	u32	max_indirection;
+	int	max_sq_desc_sz;
+	int	max_rq_desc_sz;
+	int	max_dc_sq_desc_sz;
+	u64	flags;
+	u16	stat_rate_support;
+	int	log_max_msg;
+	int	num_ports;
+	u8	log_max_ra_res_qp;
+	u8	log_max_ra_req_qp;
+	int	max_srq_wqes;
+	int	bf_reg_size;
+	int	bf_regs_per_page;
+	struct mlx5_port_caps	port[MLX5_MAX_PORTS];
+	u8			ext_port_cap[MLX5_MAX_PORTS];
+	int	max_vf;
+	u32	reserved_lkey;
+	u8	local_ca_ack_delay;
+	u8	log_max_mcg;
+	u32	max_qp_mcg;
+	int	min_page_sz;
+	int	pd_cap;
+	u32	max_qp_counters;
+	u32	pkey_table_size;
+	u8	log_max_ra_req_dc;
+	u8	log_max_ra_res_dc;
+	u32	uar_sz;
+	u8	min_log_pg_sz;
+	u8	log_max_xrcd;
+	u16	log_uar_page_sz;
+};
+
+struct mlx5_caps {
+	struct mlx5_general_caps gen;
+};
+
+struct mlx5_cmd_mailbox {
+	void	       *buf;
+	dma_addr_t	dma;
+	struct mlx5_cmd_mailbox *next;
+};
+
+struct mlx5_buf_list {
+	void		       *buf;
+	dma_addr_t		map;
+};
+
+struct mlx5_buf {
+	struct mlx5_buf_list	direct;
+	struct mlx5_buf_list   *page_list;
+	int			nbufs;
+	int			npages;
+	int			size;
+	u8			page_shift;
+};
+
+struct mlx5_eq {
+	struct mlx5_core_dev   *dev;
+	__be32 __iomem	       *doorbell;
+	u32			cons_index;
+	struct mlx5_buf		buf;
+	int			size;
+	u8			irqn;
+	u8			eqn;
+	int			nent;
+	u64			mask;
+	char			name[MLX5_MAX_EQ_NAME];
+	struct list_head	list;
+	int			index;
+	struct mlx5_rsc_debug	*dbg;
+};
+
+struct mlx5_core_psv {
+	u32	psv_idx;
+	struct psv_layout {
+		u32	pd;
+		u16	syndrome;
+		u16	reserved;
+		u16	bg;
+		u16	app_tag;
+		u32	ref_tag;
+	} psv;
+};
+
+struct mlx5_core_sig_ctx {
+	struct mlx5_core_psv	psv_memory;
+	struct mlx5_core_psv	psv_wire;
+	struct ib_sig_err       err_item;
+	bool			sig_status_checked;
+	bool			sig_err_exists;
+	u32			sigerr_count;
+};
+
+struct mlx5_core_mr {
+	u64			iova;
+	u64			size;
+	u32			key;
+	u32			pd;
+};
+
+enum mlx5_res_type {
+	MLX5_RES_QP,
+};
+
+struct mlx5_core_rsc_common {
+	enum mlx5_res_type	res;
+	atomic_t		refcount;
+	struct completion	free;
+};
+
+struct mlx5_core_srq {
+	u32		srqn;
+	int		max;
+	int		max_gs;
+	int		max_avail_gather;
+	int		wqe_shift;
+	void (*event)	(struct mlx5_core_srq *, enum mlx5_event);
+
+	atomic_t		refcount;
+	struct completion	free;
+};
+
+struct mlx5_eq_table {
+	void __iomem	       *update_ci;
+	void __iomem	       *update_arm_ci;
+	struct list_head       *comp_eq_head;
+	struct mlx5_eq		pages_eq;
+	struct mlx5_eq		async_eq;
+	struct mlx5_eq		cmd_eq;
+	struct msix_entry	*msix_arr;
+	int			num_comp_vectors;
+	/* protect EQs list
+	 */
+	spinlock_t		lock;
+};
+
+struct mlx5_uar {
+	u32			index;
+	struct list_head	bf_list;
+	unsigned		free_bf_bmap;
+	void __iomem	       *wc_map;
+	void __iomem	       *map;
+};
+
+
+struct mlx5_core_health {
+	struct health_buffer __iomem   *health;
+	__be32 __iomem		       *health_counter;
+	struct timer_list		timer;
+	struct list_head		list;
+	u32				prev;
+	int				miss_counter;
+};
+
+struct mlx5_cq_table {
+	/* protect radix tree
+	 */
+	spinlock_t		lock;
+	struct radix_tree_root	tree;
+};
+
+struct mlx5_qp_table {
+	/* protect radix tree
+	 */
+	spinlock_t		lock;
+	struct radix_tree_root	tree;
+};
+
+struct mlx5_srq_table {
+	/* protect radix tree
+	 */
+	spinlock_t		lock;
+	struct radix_tree_root	tree;
+};
+
+struct mlx5_mr_table {
+	/* protect radix tree
+	 */
+	rwlock_t		lock;
+	struct radix_tree_root	tree;
+};
+
+struct mlx5_priv {
+	char			name[MLX5_MAX_NAME_LEN];
+	struct mlx5_eq_table	eq_table;
+	struct mlx5_uuar_info	uuari;
+	MLX5_DECLARE_DOORBELL_LOCK(cq_uar_lock);
+
+	/* pages stuff */
+	struct workqueue_struct *pg_wq;
+	struct rb_root		page_root;
+	int			fw_pages;
+	int			reg_pages;
+	struct list_head	free_list;
+
+	struct mlx5_core_health health;
+
+	struct mlx5_srq_table	srq_table;
+
+	/* start: qp staff */
+	struct mlx5_qp_table	qp_table;
+	struct dentry	       *qp_debugfs;
+	struct dentry	       *eq_debugfs;
+	struct dentry	       *cq_debugfs;
+	struct dentry	       *cmdif_debugfs;
+	/* end: qp staff */
+
+	/* start: cq staff */
+	struct mlx5_cq_table	cq_table;
+	/* end: cq staff */
+
+	/* start: mr staff */
+	struct mlx5_mr_table	mr_table;
+	/* end: mr staff */
+
+	/* start: alloc staff */
+	struct mutex            pgdir_mutex;
+	struct list_head        pgdir_list;
+	/* end: alloc staff */
+	struct dentry	       *dbg_root;
+
+	/* protect mkey key part */
+	spinlock_t		mkey_lock;
+	u8			mkey_key;
+
+	struct list_head        dev_list;
+	struct list_head        ctx_list;
+	spinlock_t              ctx_lock;
+};
+
+struct mlx5_core_dev {
+	struct pci_dev	       *pdev;
+	u8			rev_id;
+	char			board_id[MLX5_BOARD_ID_LEN];
+	struct mlx5_cmd		cmd;
+	struct mlx5_caps	caps;
+	phys_addr_t		iseg_base;
+	struct mlx5_init_seg __iomem *iseg;
+	void			(*event) (struct mlx5_core_dev *dev,
+					  enum mlx5_dev_event event,
+					  unsigned long param);
+	struct mlx5_priv	priv;
+	struct mlx5_profile	*profile;
+	atomic_t		num_qps;
+};
+
+struct mlx5_db {
+	__be32			*db;
+	union {
+		struct mlx5_db_pgdir		*pgdir;
+		struct mlx5_ib_user_db_page	*user_page;
+	}			u;
+	dma_addr_t		dma;
+	int			index;
+};
+
+enum {
+	MLX5_DB_PER_PAGE = PAGE_SIZE / L1_CACHE_BYTES,
+};
+
+enum {
+	MLX5_COMP_EQ_SIZE = 1024,
+};
+
+struct mlx5_db_pgdir {
+	struct list_head	list;
+	DECLARE_BITMAP(bitmap, MLX5_DB_PER_PAGE);
+	__be32		       *db_page;
+	dma_addr_t		db_dma;
+};
+
+typedef void (*mlx5_cmd_cbk_t)(int status, void *context);
+
+struct mlx5_cmd_work_ent {
+	struct mlx5_cmd_msg    *in;
+	struct mlx5_cmd_msg    *out;
+	void		       *uout;
+	int			uout_size;
+	mlx5_cmd_cbk_t		callback;
+	void		       *context;
+	int			idx;
+	struct completion	done;
+	struct mlx5_cmd        *cmd;
+	struct work_struct	work;
+	struct mlx5_cmd_layout *lay;
+	int			ret;
+	int			page_queue;
+	u8			status;
+	u8			token;
+	u64			ts1;
+	u64			ts2;
+	u16			op;
+};
+
+struct mlx5_pas {
+	u64	pa;
+	u8	log_sz;
+};
+
+static inline void *mlx5_buf_offset(struct mlx5_buf *buf, int offset)
+{
+	if (likely(BITS_PER_LONG == 64 || buf->nbufs == 1))
+		return buf->direct.buf + offset;
+	else
+		return buf->page_list[offset >> PAGE_SHIFT].buf +
+			(offset & (PAGE_SIZE - 1));
+}
+
+extern struct workqueue_struct *mlx5_core_wq;
+
+#define STRUCT_FIELD(header, field) \
+	.struct_offset_bytes = offsetof(struct ib_unpacked_ ## header, field),      \
+	.struct_size_bytes   = sizeof((struct ib_unpacked_ ## header *)0)->field
+
+struct ib_field {
+	size_t struct_offset_bytes;
+	size_t struct_size_bytes;
+	int    offset_bits;
+	int    size_bits;
+};
+
+static inline struct mlx5_core_dev *pci2mlx5_core_dev(struct pci_dev *pdev)
+{
+	return pci_get_drvdata(pdev);
+}
+
+extern struct dentry *mlx5_debugfs_root;
+
+static inline u16 fw_rev_maj(struct mlx5_core_dev *dev)
+{
+	return ioread32be(&dev->iseg->fw_rev) & 0xffff;
+}
+
+static inline u16 fw_rev_min(struct mlx5_core_dev *dev)
+{
+	return ioread32be(&dev->iseg->fw_rev) >> 16;
+}
+
+static inline u16 fw_rev_sub(struct mlx5_core_dev *dev)
+{
+	return ioread32be(&dev->iseg->cmdif_rev_fw_sub) & 0xffff;
+}
+
+static inline u16 cmdif_rev(struct mlx5_core_dev *dev)
+{
+	return ioread32be(&dev->iseg->cmdif_rev_fw_sub) >> 16;
+}
+
+static inline void *mlx5_vzalloc(unsigned long size)
+{
+	void *rtn;
+
+	rtn = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
+	if (!rtn)
+		rtn = vzalloc(size);
+	return rtn;
+}
+
+static inline void mlx5_vfree(const void *addr)
+{
+	if (addr && is_vmalloc_addr(addr))
+		vfree(addr);
+	else
+		kfree(addr);
+}
+
+static inline u32 mlx5_base_mkey(const u32 key)
+{
+	return key & 0xffffff00u;
+}
+
+int mlx5_cmd_init(struct mlx5_core_dev *dev);
+void mlx5_cmd_cleanup(struct mlx5_core_dev *dev);
+void mlx5_cmd_use_events(struct mlx5_core_dev *dev);
+void mlx5_cmd_use_polling(struct mlx5_core_dev *dev);
+int mlx5_cmd_status_to_err(struct mlx5_outbox_hdr *hdr);
+int mlx5_cmd_status_to_err_v2(void *ptr);
+int mlx5_core_get_caps(struct mlx5_core_dev *dev, struct mlx5_caps *caps,
+		       u16 opmod);
+int mlx5_cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out,
+		  int out_size);
+int mlx5_cmd_exec_cb(struct mlx5_core_dev *dev, void *in, int in_size,
+		     void *out, int out_size, mlx5_cmd_cbk_t callback,
+		     void *context);
+int mlx5_cmd_alloc_uar(struct mlx5_core_dev *dev, u32 *uarn);
+int mlx5_cmd_free_uar(struct mlx5_core_dev *dev, u32 uarn);
+int mlx5_alloc_uuars(struct mlx5_core_dev *dev, struct mlx5_uuar_info *uuari);
+int mlx5_free_uuars(struct mlx5_core_dev *dev, struct mlx5_uuar_info *uuari);
+void mlx5_health_cleanup(void);
+void  __init mlx5_health_init(void);
+void mlx5_start_health_poll(struct mlx5_core_dev *dev);
+void mlx5_stop_health_poll(struct mlx5_core_dev *dev);
+int mlx5_buf_alloc(struct mlx5_core_dev *dev, int size, int max_direct,
+		   struct mlx5_buf *buf);
+void mlx5_buf_free(struct mlx5_core_dev *dev, struct mlx5_buf *buf);
+struct mlx5_cmd_mailbox *mlx5_alloc_cmd_mailbox_chain(struct mlx5_core_dev *dev,
+						      gfp_t flags, int npages);
+void mlx5_free_cmd_mailbox_chain(struct mlx5_core_dev *dev,
+				 struct mlx5_cmd_mailbox *head);
+int mlx5_core_create_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq,
+			 struct mlx5_create_srq_mbox_in *in, int inlen);
+int mlx5_core_destroy_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq);
+int mlx5_core_query_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq,
+			struct mlx5_query_srq_mbox_out *out);
+int mlx5_core_arm_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq,
+		      u16 lwm, int is_srq);
+void mlx5_init_mr_table(struct mlx5_core_dev *dev);
+void mlx5_cleanup_mr_table(struct mlx5_core_dev *dev);
+int mlx5_core_create_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr,
+			  struct mlx5_create_mkey_mbox_in *in, int inlen,
+			  mlx5_cmd_cbk_t callback, void *context,
+			  struct mlx5_create_mkey_mbox_out *out);
+int mlx5_core_destroy_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr);
+int mlx5_core_query_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr,
+			 struct mlx5_query_mkey_mbox_out *out, int outlen);
+int mlx5_core_dump_fill_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr,
+			     u32 *mkey);
+int mlx5_core_alloc_pd(struct mlx5_core_dev *dev, u32 *pdn);
+int mlx5_core_dealloc_pd(struct mlx5_core_dev *dev, u32 pdn);
+int mlx5_core_mad_ifc(struct mlx5_core_dev *dev, void *inb, void *outb,
+		      u16 opmod, u8 port);
+void mlx5_pagealloc_init(struct mlx5_core_dev *dev);
+void mlx5_pagealloc_cleanup(struct mlx5_core_dev *dev);
+int mlx5_pagealloc_start(struct mlx5_core_dev *dev);
+void mlx5_pagealloc_stop(struct mlx5_core_dev *dev);
+void mlx5_core_req_pages_handler(struct mlx5_core_dev *dev, u16 func_id,
+				 s32 npages);
+int mlx5_satisfy_startup_pages(struct mlx5_core_dev *dev, int boot);
+int mlx5_reclaim_startup_pages(struct mlx5_core_dev *dev);
+void mlx5_register_debugfs(void);
+void mlx5_unregister_debugfs(void);
+int mlx5_eq_init(struct mlx5_core_dev *dev);
+void mlx5_eq_cleanup(struct mlx5_core_dev *dev);
+void mlx5_fill_page_array(struct mlx5_buf *buf, __be64 *pas);
+void mlx5_cq_completion(struct mlx5_core_dev *dev, u32 cqn);
+void mlx5_rsc_event(struct mlx5_core_dev *dev, u32 rsn, int event_type);
+void mlx5_srq_event(struct mlx5_core_dev *dev, u32 srqn, int event_type);
+struct mlx5_core_srq *mlx5_core_get_srq(struct mlx5_core_dev *dev, u32 srqn);
+void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, unsigned long vector);
+void mlx5_cq_event(struct mlx5_core_dev *dev, u32 cqn, int event_type);
+int mlx5_create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, u8 vecidx,
+		       int nent, u64 mask, const char *name, struct mlx5_uar *uar);
+int mlx5_destroy_unmap_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq);
+int mlx5_start_eqs(struct mlx5_core_dev *dev);
+int mlx5_stop_eqs(struct mlx5_core_dev *dev);
+int mlx5_core_attach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, u32 qpn);
+int mlx5_core_detach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, u32 qpn);
+
+int mlx5_qp_debugfs_init(struct mlx5_core_dev *dev);
+void mlx5_qp_debugfs_cleanup(struct mlx5_core_dev *dev);
+int mlx5_core_access_reg(struct mlx5_core_dev *dev, void *data_in,
+			 int size_in, void *data_out, int size_out,
+			 u16 reg_num, int arg, int write);
+int mlx5_set_port_caps(struct mlx5_core_dev *dev, u8 port_num, u32 caps);
+
+int mlx5_debug_eq_add(struct mlx5_core_dev *dev, struct mlx5_eq *eq);
+void mlx5_debug_eq_remove(struct mlx5_core_dev *dev, struct mlx5_eq *eq);
+int mlx5_core_eq_query(struct mlx5_core_dev *dev, struct mlx5_eq *eq,
+		       struct mlx5_query_eq_mbox_out *out, int outlen);
+int mlx5_eq_debugfs_init(struct mlx5_core_dev *dev);
+void mlx5_eq_debugfs_cleanup(struct mlx5_core_dev *dev);
+int mlx5_cq_debugfs_init(struct mlx5_core_dev *dev);
+void mlx5_cq_debugfs_cleanup(struct mlx5_core_dev *dev);
+int mlx5_db_alloc(struct mlx5_core_dev *dev, struct mlx5_db *db);
+void mlx5_db_free(struct mlx5_core_dev *dev, struct mlx5_db *db);
+
+const char *mlx5_command_str(int command);
+int mlx5_cmdif_debugfs_init(struct mlx5_core_dev *dev);
+void mlx5_cmdif_debugfs_cleanup(struct mlx5_core_dev *dev);
+int mlx5_core_create_psv(struct mlx5_core_dev *dev, u32 pdn,
+			 int npsvs, u32 *sig_index);
+int mlx5_core_destroy_psv(struct mlx5_core_dev *dev, int psv_num);
+void mlx5_core_put_rsc(struct mlx5_core_rsc_common *common);
+
+static inline u32 mlx5_mkey_to_idx(u32 mkey)
+{
+	return mkey >> 8;
+}
+
+static inline u32 mlx5_idx_to_mkey(u32 mkey_idx)
+{
+	return mkey_idx << 8;
+}
+
+static inline u8 mlx5_mkey_variant(u32 mkey)
+{
+	return mkey & 0xff;
+}
+
+enum {
+	MLX5_PROF_MASK_QP_SIZE		= (u64)1 << 0,
+	MLX5_PROF_MASK_MR_CACHE		= (u64)1 << 1,
+};
+
+enum {
+	MAX_MR_CACHE_ENTRIES    = 16,
+};
+
+struct mlx5_interface {
+	void *			(*add)(struct mlx5_core_dev *dev);
+	void			(*remove)(struct mlx5_core_dev *dev, void *context);
+	void			(*event)(struct mlx5_core_dev *dev, void *context,
+					 enum mlx5_dev_event event, unsigned long param);
+	struct list_head	list;
+};
+
+int mlx5_register_interface(struct mlx5_interface *intf);
+void mlx5_unregister_interface(struct mlx5_interface *intf);
+
+struct mlx5_profile {
+	u64	mask;
+	u8	log_max_qp;
+	struct {
+		int	size;
+		int	limit;
+	} mr_cache[MAX_MR_CACHE_ENTRIES];
+};
+
+#endif /* MLX5_DRIVER_H */
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
new file mode 100644
index 0000000..5f48b8f
--- /dev/null
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -0,0 +1,349 @@
+/*
+ * Copyright (c) 2014, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX5_IFC_H
+#define MLX5_IFC_H
+
+enum {
+	MLX5_CMD_OP_QUERY_HCA_CAP                 = 0x100,
+	MLX5_CMD_OP_QUERY_ADAPTER                 = 0x101,
+	MLX5_CMD_OP_INIT_HCA                      = 0x102,
+	MLX5_CMD_OP_TEARDOWN_HCA                  = 0x103,
+	MLX5_CMD_OP_ENABLE_HCA                    = 0x104,
+	MLX5_CMD_OP_DISABLE_HCA                   = 0x105,
+	MLX5_CMD_OP_QUERY_PAGES                   = 0x107,
+	MLX5_CMD_OP_MANAGE_PAGES                  = 0x108,
+	MLX5_CMD_OP_SET_HCA_CAP                   = 0x109,
+	MLX5_CMD_OP_CREATE_MKEY                   = 0x200,
+	MLX5_CMD_OP_QUERY_MKEY                    = 0x201,
+	MLX5_CMD_OP_DESTROY_MKEY                  = 0x202,
+	MLX5_CMD_OP_QUERY_SPECIAL_CONTEXTS        = 0x203,
+	MLX5_CMD_OP_PAGE_FAULT_RESUME             = 0x204,
+	MLX5_CMD_OP_CREATE_EQ                     = 0x301,
+	MLX5_CMD_OP_DESTROY_EQ                    = 0x302,
+	MLX5_CMD_OP_QUERY_EQ                      = 0x303,
+	MLX5_CMD_OP_GEN_EQE                       = 0x304,
+	MLX5_CMD_OP_CREATE_CQ                     = 0x400,
+	MLX5_CMD_OP_DESTROY_CQ                    = 0x401,
+	MLX5_CMD_OP_QUERY_CQ                      = 0x402,
+	MLX5_CMD_OP_MODIFY_CQ                     = 0x403,
+	MLX5_CMD_OP_CREATE_QP                     = 0x500,
+	MLX5_CMD_OP_DESTROY_QP                    = 0x501,
+	MLX5_CMD_OP_RST2INIT_QP                   = 0x502,
+	MLX5_CMD_OP_INIT2RTR_QP                   = 0x503,
+	MLX5_CMD_OP_RTR2RTS_QP                    = 0x504,
+	MLX5_CMD_OP_RTS2RTS_QP                    = 0x505,
+	MLX5_CMD_OP_SQERR2RTS_QP                  = 0x506,
+	MLX5_CMD_OP_2ERR_QP                       = 0x507,
+	MLX5_CMD_OP_2RST_QP                       = 0x50a,
+	MLX5_CMD_OP_QUERY_QP                      = 0x50b,
+	MLX5_CMD_OP_INIT2INIT_QP                  = 0x50e,
+	MLX5_CMD_OP_CREATE_PSV                    = 0x600,
+	MLX5_CMD_OP_DESTROY_PSV                   = 0x601,
+	MLX5_CMD_OP_CREATE_SRQ                    = 0x700,
+	MLX5_CMD_OP_DESTROY_SRQ                   = 0x701,
+	MLX5_CMD_OP_QUERY_SRQ                     = 0x702,
+	MLX5_CMD_OP_ARM_RQ                        = 0x703,
+	MLX5_CMD_OP_RESIZE_SRQ                    = 0x704,
+	MLX5_CMD_OP_CREATE_DCT                    = 0x710,
+	MLX5_CMD_OP_DESTROY_DCT                   = 0x711,
+	MLX5_CMD_OP_DRAIN_DCT                     = 0x712,
+	MLX5_CMD_OP_QUERY_DCT                     = 0x713,
+	MLX5_CMD_OP_ARM_DCT_FOR_KEY_VIOLATION     = 0x714,
+	MLX5_CMD_OP_QUERY_VPORT_STATE             = 0x750,
+	MLX5_CMD_OP_MODIFY_VPORT_STATE            = 0x751,
+	MLX5_CMD_OP_QUERY_ESW_VPORT_CONTEXT       = 0x752,
+	MLX5_CMD_OP_MODIFY_ESW_VPORT_CONTEXT      = 0x753,
+	MLX5_CMD_OP_QUERY_NIC_VPORT_CONTEXT       = 0x754,
+	MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT      = 0x755,
+	MLX5_CMD_OP_QUERY_RCOE_ADDRESS            = 0x760,
+	MLX5_CMD_OP_SET_ROCE_ADDRESS              = 0x761,
+	MLX5_CMD_OP_QUERY_VPORT_COUNTER           = 0x770,
+	MLX5_CMD_OP_ALLOC_Q_COUNTER               = 0x771,
+	MLX5_CMD_OP_DEALLOC_Q_COUNTER             = 0x772,
+	MLX5_CMD_OP_QUERY_Q_COUNTER               = 0x773,
+	MLX5_CMD_OP_ALLOC_PD                      = 0x800,
+	MLX5_CMD_OP_DEALLOC_PD                    = 0x801,
+	MLX5_CMD_OP_ALLOC_UAR                     = 0x802,
+	MLX5_CMD_OP_DEALLOC_UAR                   = 0x803,
+	MLX5_CMD_OP_CONFIG_INT_MODERATION         = 0x804,
+	MLX5_CMD_OP_ACCESS_REG                    = 0x805,
+	MLX5_CMD_OP_ATTACH_TO_MCG                 = 0x806,
+	MLX5_CMD_OP_DETACH_FROM_MCG               = 0x807,
+	MLX5_CMD_OP_GET_DROPPED_PACKET_LOG        = 0x80a,
+	MLX5_CMD_OP_MAD_IFC                       = 0x50d,
+	MLX5_CMD_OP_QUERY_MAD_DEMUX               = 0x80b,
+	MLX5_CMD_OP_SET_MAD_DEMUX                 = 0x80c,
+	MLX5_CMD_OP_NOP                           = 0x80d,
+	MLX5_CMD_OP_ALLOC_XRCD                    = 0x80e,
+	MLX5_CMD_OP_DEALLOC_XRCD                  = 0x80f,
+	MLX5_CMD_OP_SET_BURST_SIZE                = 0x812,
+	MLX5_CMD_OP_QUERY_BURST_SZIE              = 0x813,
+	MLX5_CMD_OP_ACTIVATE_TRACER               = 0x814,
+	MLX5_CMD_OP_DEACTIVATE_TRACER             = 0x815,
+	MLX5_CMD_OP_CREATE_SNIFFER_RULE           = 0x820,
+	MLX5_CMD_OP_DESTROY_SNIFFER_RULE          = 0x821,
+	MLX5_CMD_OP_QUERY_CONG_PARAMS             = 0x822,
+	MLX5_CMD_OP_MODIFY_CONG_PARAMS            = 0x823,
+	MLX5_CMD_OP_QUERY_CONG_STATISTICS         = 0x824,
+	MLX5_CMD_OP_CREATE_TIR                    = 0x900,
+	MLX5_CMD_OP_MODIFY_TIR                    = 0x901,
+	MLX5_CMD_OP_DESTROY_TIR                   = 0x902,
+	MLX5_CMD_OP_QUERY_TIR                     = 0x903,
+	MLX5_CMD_OP_CREATE_TIS                    = 0x912,
+	MLX5_CMD_OP_MODIFY_TIS                    = 0x913,
+	MLX5_CMD_OP_DESTROY_TIS                   = 0x914,
+	MLX5_CMD_OP_QUERY_TIS                     = 0x915,
+	MLX5_CMD_OP_CREATE_SQ                     = 0x904,
+	MLX5_CMD_OP_MODIFY_SQ                     = 0x905,
+	MLX5_CMD_OP_DESTROY_SQ                    = 0x906,
+	MLX5_CMD_OP_QUERY_SQ                      = 0x907,
+	MLX5_CMD_OP_CREATE_RQ                     = 0x908,
+	MLX5_CMD_OP_MODIFY_RQ                     = 0x909,
+	MLX5_CMD_OP_DESTROY_RQ                    = 0x90a,
+	MLX5_CMD_OP_QUERY_RQ                      = 0x90b,
+	MLX5_CMD_OP_CREATE_RMP                    = 0x90c,
+	MLX5_CMD_OP_MODIFY_RMP                    = 0x90d,
+	MLX5_CMD_OP_DESTROY_RMP                   = 0x90e,
+	MLX5_CMD_OP_QUERY_RMP                     = 0x90f,
+	MLX5_CMD_OP_SET_FLOW_TABLE_ENTRY          = 0x910,
+	MLX5_CMD_OP_QUERY_FLOW_TABLE_ENTRY        = 0x911,
+	MLX5_CMD_OP_MAX				  = 0x911
+};
+
+struct mlx5_ifc_cmd_hca_cap_bits {
+	u8         reserved_0[0x80];
+
+	u8         log_max_srq_sz[0x8];
+	u8         log_max_qp_sz[0x8];
+	u8         reserved_1[0xb];
+	u8         log_max_qp[0x5];
+
+	u8         log_max_strq_sz[0x8];
+	u8         reserved_2[0x3];
+	u8         log_max_srqs[0x5];
+	u8         reserved_3[0x10];
+
+	u8         reserved_4[0x8];
+	u8         log_max_cq_sz[0x8];
+	u8         reserved_5[0xb];
+	u8         log_max_cq[0x5];
+
+	u8         log_max_eq_sz[0x8];
+	u8         reserved_6[0x2];
+	u8         log_max_mkey[0x6];
+	u8         reserved_7[0xc];
+	u8         log_max_eq[0x4];
+
+	u8         max_indirection[0x8];
+	u8         reserved_8[0x1];
+	u8         log_max_mrw_sz[0x7];
+	u8         reserved_9[0x2];
+	u8         log_max_bsf_list_size[0x6];
+	u8         reserved_10[0x2];
+	u8         log_max_klm_list_size[0x6];
+
+	u8         reserved_11[0xa];
+	u8         log_max_ra_req_dc[0x6];
+	u8         reserved_12[0xa];
+	u8         log_max_ra_res_dc[0x6];
+
+	u8         reserved_13[0xa];
+	u8         log_max_ra_req_qp[0x6];
+	u8         reserved_14[0xa];
+	u8         log_max_ra_res_qp[0x6];
+
+	u8         pad_cap[0x1];
+	u8         cc_query_allowed[0x1];
+	u8         cc_modify_allowed[0x1];
+	u8         reserved_15[0x1d];
+
+	u8         reserved_16[0x6];
+	u8         max_qp_cnt[0xa];
+	u8         pkey_table_size[0x10];
+
+	u8         eswitch_owner[0x1];
+	u8         reserved_17[0xa];
+	u8         local_ca_ack_delay[0x5];
+	u8         reserved_18[0x8];
+	u8         num_ports[0x8];
+
+	u8         reserved_19[0x3];
+	u8         log_max_msg[0x5];
+	u8         reserved_20[0x18];
+
+	u8         stat_rate_support[0x10];
+	u8         reserved_21[0x10];
+
+	u8         reserved_22[0x10];
+	u8         cmdif_checksum[0x2];
+	u8         sigerr_cqe[0x1];
+	u8         reserved_23[0x1];
+	u8         wq_signature[0x1];
+	u8         sctr_data_cqe[0x1];
+	u8         reserved_24[0x1];
+	u8         sho[0x1];
+	u8         tph[0x1];
+	u8         rf[0x1];
+	u8         dc[0x1];
+	u8         reserved_25[0x2];
+	u8         roce[0x1];
+	u8         atomic[0x1];
+	u8         rsz_srq[0x1];
+
+	u8         cq_oi[0x1];
+	u8         cq_resize[0x1];
+	u8         cq_moderation[0x1];
+	u8         sniffer_rule_flow[0x1];
+	u8         sniffer_rule_vport[0x1];
+	u8         sniffer_rule_phy[0x1];
+	u8         reserved_26[0x1];
+	u8         pg[0x1];
+	u8         block_lb_mc[0x1];
+	u8         reserved_27[0x3];
+	u8         cd[0x1];
+	u8         reserved_28[0x1];
+	u8         apm[0x1];
+	u8         reserved_29[0x7];
+	u8         qkv[0x1];
+	u8         pkv[0x1];
+	u8         reserved_30[0x4];
+	u8         xrc[0x1];
+	u8         ud[0x1];
+	u8         uc[0x1];
+	u8         rc[0x1];
+
+	u8         reserved_31[0xa];
+	u8         uar_sz[0x6];
+	u8         reserved_32[0x8];
+	u8         log_pg_sz[0x8];
+
+	u8         bf[0x1];
+	u8         reserved_33[0xa];
+	u8         log_bf_reg_size[0x5];
+	u8         reserved_34[0x10];
+
+	u8         reserved_35[0x10];
+	u8         max_wqe_sz_sq[0x10];
+
+	u8         reserved_36[0x10];
+	u8         max_wqe_sz_rq[0x10];
+
+	u8         reserved_37[0x10];
+	u8         max_wqe_sz_sq_dc[0x10];
+
+	u8         reserved_38[0x7];
+	u8         max_qp_mcg[0x19];
+
+	u8         reserved_39[0x18];
+	u8         log_max_mcg[0x8];
+
+	u8         reserved_40[0xb];
+	u8         log_max_pd[0x5];
+	u8         reserved_41[0xb];
+	u8         log_max_xrcd[0x5];
+
+	u8         reserved_42[0x20];
+
+	u8         reserved_43[0x3];
+	u8         log_max_rq[0x5];
+	u8         reserved_44[0x3];
+	u8         log_max_sq[0x5];
+	u8         reserved_45[0x3];
+	u8         log_max_tir[0x5];
+	u8         reserved_46[0x3];
+	u8         log_max_tis[0x5];
+
+	u8         reserved_47[0x13];
+	u8         log_max_rq_per_tir[0x5];
+	u8         reserved_48[0x3];
+	u8         log_max_tis_per_sq[0x5];
+
+	u8         reserved_49[0xe0];
+
+	u8         reserved_50[0x10];
+	u8         log_uar_page_sz[0x10];
+
+	u8         reserved_51[0x100];
+
+	u8         reserved_52[0x1f];
+	u8         cqe_zip[0x1];
+
+	u8         cqe_zip_timeout[0x10];
+	u8         cqe_zip_max_num[0x10];
+
+	u8         reserved_53[0x220];
+};
+
+struct mlx5_ifc_set_hca_cap_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x40];
+
+	struct mlx5_ifc_cmd_hca_cap_bits hca_capability_struct;
+};
+
+struct mlx5_ifc_query_hca_cap_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x40];
+};
+
+struct mlx5_ifc_query_hca_cap_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+
+	u8         capability_struct[256][0x8];
+};
+
+struct mlx5_ifc_set_hca_cap_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+};
+
+#endif /* MLX5_IFC_H */
diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h
new file mode 100644
index 0000000..3fa075d
--- /dev/null
+++ b/include/linux/mlx5/qp.h
@@ -0,0 +1,598 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX5_QP_H
+#define MLX5_QP_H
+
+#include <linux/mlx5/device.h>
+#include <linux/mlx5/driver.h>
+
+#define MLX5_INVALID_LKEY	0x100
+#define MLX5_SIG_WQE_SIZE	(MLX5_SEND_WQE_BB * 5)
+#define MLX5_DIF_SIZE		8
+#define MLX5_STRIDE_BLOCK_OP	0x400
+#define MLX5_CPY_GRD_MASK	0xc0
+#define MLX5_CPY_APP_MASK	0x30
+#define MLX5_CPY_REF_MASK	0x0f
+#define MLX5_BSF_INC_REFTAG	(1 << 6)
+#define MLX5_BSF_INL_VALID	(1 << 15)
+#define MLX5_BSF_REFRESH_DIF	(1 << 14)
+#define MLX5_BSF_REPEAT_BLOCK	(1 << 7)
+#define MLX5_BSF_APPTAG_ESCAPE	0x1
+#define MLX5_BSF_APPREF_ESCAPE	0x2
+
+enum mlx5_qp_optpar {
+	MLX5_QP_OPTPAR_ALT_ADDR_PATH		= 1 << 0,
+	MLX5_QP_OPTPAR_RRE			= 1 << 1,
+	MLX5_QP_OPTPAR_RAE			= 1 << 2,
+	MLX5_QP_OPTPAR_RWE			= 1 << 3,
+	MLX5_QP_OPTPAR_PKEY_INDEX		= 1 << 4,
+	MLX5_QP_OPTPAR_Q_KEY			= 1 << 5,
+	MLX5_QP_OPTPAR_RNR_TIMEOUT		= 1 << 6,
+	MLX5_QP_OPTPAR_PRIMARY_ADDR_PATH	= 1 << 7,
+	MLX5_QP_OPTPAR_SRA_MAX			= 1 << 8,
+	MLX5_QP_OPTPAR_RRA_MAX			= 1 << 9,
+	MLX5_QP_OPTPAR_PM_STATE			= 1 << 10,
+	MLX5_QP_OPTPAR_RETRY_COUNT		= 1 << 12,
+	MLX5_QP_OPTPAR_RNR_RETRY		= 1 << 13,
+	MLX5_QP_OPTPAR_ACK_TIMEOUT		= 1 << 14,
+	MLX5_QP_OPTPAR_PRI_PORT			= 1 << 16,
+	MLX5_QP_OPTPAR_SRQN			= 1 << 18,
+	MLX5_QP_OPTPAR_CQN_RCV			= 1 << 19,
+	MLX5_QP_OPTPAR_DC_HS			= 1 << 20,
+	MLX5_QP_OPTPAR_DC_KEY			= 1 << 21,
+};
+
+enum mlx5_qp_state {
+	MLX5_QP_STATE_RST			= 0,
+	MLX5_QP_STATE_INIT			= 1,
+	MLX5_QP_STATE_RTR			= 2,
+	MLX5_QP_STATE_RTS			= 3,
+	MLX5_QP_STATE_SQER			= 4,
+	MLX5_QP_STATE_SQD			= 5,
+	MLX5_QP_STATE_ERR			= 6,
+	MLX5_QP_STATE_SQ_DRAINING		= 7,
+	MLX5_QP_STATE_SUSPENDED			= 9,
+	MLX5_QP_NUM_STATE
+};
+
+enum {
+	MLX5_QP_ST_RC				= 0x0,
+	MLX5_QP_ST_UC				= 0x1,
+	MLX5_QP_ST_UD				= 0x2,
+	MLX5_QP_ST_XRC				= 0x3,
+	MLX5_QP_ST_MLX				= 0x4,
+	MLX5_QP_ST_DCI				= 0x5,
+	MLX5_QP_ST_DCT				= 0x6,
+	MLX5_QP_ST_QP0				= 0x7,
+	MLX5_QP_ST_QP1				= 0x8,
+	MLX5_QP_ST_RAW_ETHERTYPE		= 0x9,
+	MLX5_QP_ST_RAW_IPV6			= 0xa,
+	MLX5_QP_ST_SNIFFER			= 0xb,
+	MLX5_QP_ST_SYNC_UMR			= 0xe,
+	MLX5_QP_ST_PTP_1588			= 0xd,
+	MLX5_QP_ST_REG_UMR			= 0xc,
+	MLX5_QP_ST_MAX
+};
+
+enum {
+	MLX5_QP_PM_MIGRATED			= 0x3,
+	MLX5_QP_PM_ARMED			= 0x0,
+	MLX5_QP_PM_REARM			= 0x1
+};
+
+enum {
+	MLX5_NON_ZERO_RQ	= 0 << 24,
+	MLX5_SRQ_RQ		= 1 << 24,
+	MLX5_CRQ_RQ		= 2 << 24,
+	MLX5_ZERO_LEN_RQ	= 3 << 24
+};
+
+enum {
+	/* params1 */
+	MLX5_QP_BIT_SRE				= 1 << 15,
+	MLX5_QP_BIT_SWE				= 1 << 14,
+	MLX5_QP_BIT_SAE				= 1 << 13,
+	/* params2 */
+	MLX5_QP_BIT_RRE				= 1 << 15,
+	MLX5_QP_BIT_RWE				= 1 << 14,
+	MLX5_QP_BIT_RAE				= 1 << 13,
+	MLX5_QP_BIT_RIC				= 1 <<	4,
+};
+
+enum {
+	MLX5_WQE_CTRL_CQ_UPDATE		= 2 << 2,
+	MLX5_WQE_CTRL_SOLICITED		= 1 << 1,
+};
+
+enum {
+	MLX5_SEND_WQE_BB	= 64,
+};
+
+enum {
+	MLX5_WQE_FMR_PERM_LOCAL_READ	= 1 << 27,
+	MLX5_WQE_FMR_PERM_LOCAL_WRITE	= 1 << 28,
+	MLX5_WQE_FMR_PERM_REMOTE_READ	= 1 << 29,
+	MLX5_WQE_FMR_PERM_REMOTE_WRITE	= 1 << 30,
+	MLX5_WQE_FMR_PERM_ATOMIC	= 1 << 31
+};
+
+enum {
+	MLX5_FENCE_MODE_NONE			= 0 << 5,
+	MLX5_FENCE_MODE_INITIATOR_SMALL		= 1 << 5,
+	MLX5_FENCE_MODE_STRONG_ORDERING		= 3 << 5,
+	MLX5_FENCE_MODE_SMALL_AND_FENCE		= 4 << 5,
+};
+
+enum {
+	MLX5_QP_LAT_SENSITIVE	= 1 << 28,
+	MLX5_QP_BLOCK_MCAST	= 1 << 30,
+	MLX5_QP_ENABLE_SIG	= 1 << 31,
+};
+
+enum {
+	MLX5_RCV_DBR	= 0,
+	MLX5_SND_DBR	= 1,
+};
+
+enum {
+	MLX5_FLAGS_INLINE	= 1<<7,
+	MLX5_FLAGS_CHECK_FREE   = 1<<5,
+};
+
+struct mlx5_wqe_fmr_seg {
+	__be32			flags;
+	__be32			mem_key;
+	__be64			buf_list;
+	__be64			start_addr;
+	__be64			reg_len;
+	__be32			offset;
+	__be32			page_size;
+	u32			reserved[2];
+};
+
+struct mlx5_wqe_ctrl_seg {
+	__be32			opmod_idx_opcode;
+	__be32			qpn_ds;
+	u8			signature;
+	u8			rsvd[2];
+	u8			fm_ce_se;
+	__be32			imm;
+};
+
+struct mlx5_wqe_xrc_seg {
+	__be32			xrc_srqn;
+	u8			rsvd[12];
+};
+
+struct mlx5_wqe_masked_atomic_seg {
+	__be64			swap_add;
+	__be64			compare;
+	__be64			swap_add_mask;
+	__be64			compare_mask;
+};
+
+struct mlx5_av {
+	union {
+		struct {
+			__be32	qkey;
+			__be32	reserved;
+		} qkey;
+		__be64	dc_key;
+	} key;
+	__be32	dqp_dct;
+	u8	stat_rate_sl;
+	u8	fl_mlid;
+	__be16	rlid;
+	u8	reserved0[10];
+	u8	tclass;
+	u8	hop_limit;
+	__be32	grh_gid_fl;
+	u8	rgid[16];
+};
+
+struct mlx5_wqe_datagram_seg {
+	struct mlx5_av	av;
+};
+
+struct mlx5_wqe_raddr_seg {
+	__be64			raddr;
+	__be32			rkey;
+	u32			reserved;
+};
+
+struct mlx5_wqe_atomic_seg {
+	__be64			swap_add;
+	__be64			compare;
+};
+
+struct mlx5_wqe_data_seg {
+	__be32			byte_count;
+	__be32			lkey;
+	__be64			addr;
+};
+
+struct mlx5_wqe_umr_ctrl_seg {
+	u8		flags;
+	u8		rsvd0[3];
+	__be16		klm_octowords;
+	__be16		bsf_octowords;
+	__be64		mkey_mask;
+	u8		rsvd1[32];
+};
+
+struct mlx5_seg_set_psv {
+	__be32		psv_num;
+	__be16		syndrome;
+	__be16		status;
+	__be32		transient_sig;
+	__be32		ref_tag;
+};
+
+struct mlx5_seg_get_psv {
+	u8		rsvd[19];
+	u8		num_psv;
+	__be32		l_key;
+	__be64		va;
+	__be32		psv_index[4];
+};
+
+struct mlx5_seg_check_psv {
+	u8		rsvd0[2];
+	__be16		err_coalescing_op;
+	u8		rsvd1[2];
+	__be16		xport_err_op;
+	u8		rsvd2[2];
+	__be16		xport_err_mask;
+	u8		rsvd3[7];
+	u8		num_psv;
+	__be32		l_key;
+	__be64		va;
+	__be32		psv_index[4];
+};
+
+struct mlx5_rwqe_sig {
+	u8	rsvd0[4];
+	u8	signature;
+	u8	rsvd1[11];
+};
+
+struct mlx5_wqe_signature_seg {
+	u8	rsvd0[4];
+	u8	signature;
+	u8	rsvd1[11];
+};
+
+struct mlx5_wqe_inline_seg {
+	__be32	byte_count;
+};
+
+enum mlx5_sig_type {
+	MLX5_DIF_CRC = 0x1,
+	MLX5_DIF_IPCS = 0x2,
+};
+
+struct mlx5_bsf_inl {
+	__be16		vld_refresh;
+	__be16		dif_apptag;
+	__be32		dif_reftag;
+	u8		sig_type;
+	u8		rp_inv_seed;
+	u8		rsvd[3];
+	u8		dif_inc_ref_guard_check;
+	__be16		dif_app_bitmask_check;
+};
+
+struct mlx5_bsf {
+	struct mlx5_bsf_basic {
+		u8		bsf_size_sbs;
+		u8		check_byte_mask;
+		union {
+			u8	copy_byte_mask;
+			u8	bs_selector;
+			u8	rsvd_wflags;
+		} wire;
+		union {
+			u8	bs_selector;
+			u8	rsvd_mflags;
+		} mem;
+		__be32		raw_data_size;
+		__be32		w_bfs_psv;
+		__be32		m_bfs_psv;
+	} basic;
+	struct mlx5_bsf_ext {
+		__be32		t_init_gen_pro_size;
+		__be32		rsvd_epi_size;
+		__be32		w_tfs_psv;
+		__be32		m_tfs_psv;
+	} ext;
+	struct mlx5_bsf_inl	w_inl;
+	struct mlx5_bsf_inl	m_inl;
+};
+
+struct mlx5_klm {
+	__be32		bcount;
+	__be32		key;
+	__be64		va;
+};
+
+struct mlx5_stride_block_entry {
+	__be16		stride;
+	__be16		bcount;
+	__be32		key;
+	__be64		va;
+};
+
+struct mlx5_stride_block_ctrl_seg {
+	__be32		bcount_per_cycle;
+	__be32		op;
+	__be32		repeat_count;
+	u16		rsvd;
+	__be16		num_entries;
+};
+
+struct mlx5_core_qp {
+	struct mlx5_core_rsc_common	common; /* must be first */
+	void (*event)		(struct mlx5_core_qp *, int);
+	int			qpn;
+	struct mlx5_rsc_debug	*dbg;
+	int			pid;
+};
+
+struct mlx5_qp_path {
+	u8			fl;
+	u8			rsvd3;
+	u8			free_ar;
+	u8			pkey_index;
+	u8			rsvd0;
+	u8			grh_mlid;
+	__be16			rlid;
+	u8			ackto_lt;
+	u8			mgid_index;
+	u8			static_rate;
+	u8			hop_limit;
+	__be32			tclass_flowlabel;
+	u8			rgid[16];
+	u8			rsvd1[4];
+	u8			sl;
+	u8			port;
+	u8			rsvd2[6];
+};
+
+struct mlx5_qp_context {
+	__be32			flags;
+	__be32			flags_pd;
+	u8			mtu_msgmax;
+	u8			rq_size_stride;
+	__be16			sq_crq_size;
+	__be32			qp_counter_set_usr_page;
+	__be32			wire_qpn;
+	__be32			log_pg_sz_remote_qpn;
+	struct			mlx5_qp_path pri_path;
+	struct			mlx5_qp_path alt_path;
+	__be32			params1;
+	u8			reserved2[4];
+	__be32			next_send_psn;
+	__be32			cqn_send;
+	u8			reserved3[8];
+	__be32			last_acked_psn;
+	__be32			ssn;
+	__be32			params2;
+	__be32			rnr_nextrecvpsn;
+	__be32			xrcd;
+	__be32			cqn_recv;
+	__be64			db_rec_addr;
+	__be32			qkey;
+	__be32			rq_type_srqn;
+	__be32			rmsn;
+	__be16			hw_sq_wqe_counter;
+	__be16			sw_sq_wqe_counter;
+	__be16			hw_rcyclic_byte_counter;
+	__be16			hw_rq_counter;
+	__be16			sw_rcyclic_byte_counter;
+	__be16			sw_rq_counter;
+	u8			rsvd0[5];
+	u8			cgs;
+	u8			cs_req;
+	u8			cs_res;
+	__be64			dc_access_key;
+	u8			rsvd1[24];
+};
+
+struct mlx5_create_qp_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	__be32			input_qpn;
+	u8			rsvd0[4];
+	__be32			opt_param_mask;
+	u8			rsvd1[4];
+	struct mlx5_qp_context	ctx;
+	u8			rsvd3[16];
+	__be64			pas[0];
+};
+
+struct mlx5_create_qp_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	__be32			qpn;
+	u8			rsvd0[4];
+};
+
+struct mlx5_destroy_qp_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	__be32			qpn;
+	u8			rsvd0[4];
+};
+
+struct mlx5_destroy_qp_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	u8			rsvd0[8];
+};
+
+struct mlx5_modify_qp_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	__be32			qpn;
+	u8			rsvd1[4];
+	__be32			optparam;
+	u8			rsvd0[4];
+	struct mlx5_qp_context	ctx;
+};
+
+struct mlx5_modify_qp_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	u8			rsvd0[8];
+};
+
+struct mlx5_query_qp_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	__be32			qpn;
+	u8			rsvd[4];
+};
+
+struct mlx5_query_qp_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	u8			rsvd1[8];
+	__be32			optparam;
+	u8			rsvd0[4];
+	struct mlx5_qp_context	ctx;
+	u8			rsvd2[16];
+	__be64			pas[0];
+};
+
+struct mlx5_conf_sqp_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	__be32			qpn;
+	u8			rsvd[3];
+	u8			type;
+};
+
+struct mlx5_conf_sqp_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	u8			rsvd[8];
+};
+
+struct mlx5_alloc_xrcd_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	u8			rsvd[8];
+};
+
+struct mlx5_alloc_xrcd_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	__be32			xrcdn;
+	u8			rsvd[4];
+};
+
+struct mlx5_dealloc_xrcd_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	__be32			xrcdn;
+	u8			rsvd[4];
+};
+
+struct mlx5_dealloc_xrcd_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	u8			rsvd[8];
+};
+
+static inline struct mlx5_core_qp *__mlx5_qp_lookup(struct mlx5_core_dev *dev, u32 qpn)
+{
+	return radix_tree_lookup(&dev->priv.qp_table.tree, qpn);
+}
+
+static inline struct mlx5_core_mr *__mlx5_mr_lookup(struct mlx5_core_dev *dev, u32 key)
+{
+	return radix_tree_lookup(&dev->priv.mr_table.tree, key);
+}
+
+int mlx5_core_create_qp(struct mlx5_core_dev *dev,
+			struct mlx5_core_qp *qp,
+			struct mlx5_create_qp_mbox_in *in,
+			int inlen);
+int mlx5_core_qp_modify(struct mlx5_core_dev *dev, enum mlx5_qp_state cur_state,
+			enum mlx5_qp_state new_state,
+			struct mlx5_modify_qp_mbox_in *in, int sqd_event,
+			struct mlx5_core_qp *qp);
+int mlx5_core_destroy_qp(struct mlx5_core_dev *dev,
+			 struct mlx5_core_qp *qp);
+int mlx5_core_qp_query(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp,
+		       struct mlx5_query_qp_mbox_out *out, int outlen);
+
+int mlx5_core_xrcd_alloc(struct mlx5_core_dev *dev, u32 *xrcdn);
+int mlx5_core_xrcd_dealloc(struct mlx5_core_dev *dev, u32 xrcdn);
+void mlx5_init_qp_table(struct mlx5_core_dev *dev);
+void mlx5_cleanup_qp_table(struct mlx5_core_dev *dev);
+int mlx5_debug_qp_add(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp);
+void mlx5_debug_qp_remove(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp);
+
+static inline const char *mlx5_qp_type_str(int type)
+{
+	switch (type) {
+	case MLX5_QP_ST_RC: return "RC";
+	case MLX5_QP_ST_UC: return "C";
+	case MLX5_QP_ST_UD: return "UD";
+	case MLX5_QP_ST_XRC: return "XRC";
+	case MLX5_QP_ST_MLX: return "MLX";
+	case MLX5_QP_ST_QP0: return "QP0";
+	case MLX5_QP_ST_QP1: return "QP1";
+	case MLX5_QP_ST_RAW_ETHERTYPE: return "RAW_ETHERTYPE";
+	case MLX5_QP_ST_RAW_IPV6: return "RAW_IPV6";
+	case MLX5_QP_ST_SNIFFER: return "SNIFFER";
+	case MLX5_QP_ST_SYNC_UMR: return "SYNC_UMR";
+	case MLX5_QP_ST_PTP_1588: return "PTP_1588";
+	case MLX5_QP_ST_REG_UMR: return "REG_UMR";
+	default: return "Invalid transport type";
+	}
+}
+
+static inline const char *mlx5_qp_state_str(int state)
+{
+	switch (state) {
+	case MLX5_QP_STATE_RST:
+	return "RST";
+	case MLX5_QP_STATE_INIT:
+	return "INIT";
+	case MLX5_QP_STATE_RTR:
+	return "RTR";
+	case MLX5_QP_STATE_RTS:
+	return "RTS";
+	case MLX5_QP_STATE_SQER:
+	return "SQER";
+	case MLX5_QP_STATE_SQD:
+	return "SQD";
+	case MLX5_QP_STATE_ERR:
+	return "ERR";
+	case MLX5_QP_STATE_SQ_DRAINING:
+	return "SQ_DRAINING";
+	case MLX5_QP_STATE_SUSPENDED:
+	return "SUSPENDED";
+	default: return "Invalid QP state";
+	}
+}
+
+#endif /* MLX5_QP_H */
diff --git a/include/linux/mlx5/srq.h b/include/linux/mlx5/srq.h
new file mode 100644
index 0000000..e1a363a
--- /dev/null
+++ b/include/linux/mlx5/srq.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX5_SRQ_H
+#define MLX5_SRQ_H
+
+#include <linux/mlx5/driver.h>
+
+void mlx5_init_srq_table(struct mlx5_core_dev *dev);
+void mlx5_cleanup_srq_table(struct mlx5_core_dev *dev);
+
+#endif /* MLX5_SRQ_H */
diff --git a/include/rdma/ib.h b/include/rdma/ib.h
new file mode 100644
index 0000000..cf8f9e7
--- /dev/null
+++ b/include/rdma/ib.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2010 Intel Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if !defined(_RDMA_IB_H)
+#define _RDMA_IB_H
+
+#include <linux/types.h>
+
+struct ib_addr {
+	union {
+		__u8		uib_addr8[16];
+		__be16		uib_addr16[8];
+		__be32		uib_addr32[4];
+		__be64		uib_addr64[2];
+	} ib_u;
+#define sib_addr8		ib_u.uib_addr8
+#define sib_addr16		ib_u.uib_addr16
+#define sib_addr32		ib_u.uib_addr32
+#define sib_addr64		ib_u.uib_addr64
+#define sib_raw			ib_u.uib_addr8
+#define sib_subnet_prefix	ib_u.uib_addr64[0]
+#define sib_interface_id	ib_u.uib_addr64[1]
+};
+
+static inline int ib_addr_any(const struct ib_addr *a)
+{
+	return ((a->sib_addr64[0] | a->sib_addr64[1]) == 0);
+}
+
+static inline int ib_addr_loopback(const struct ib_addr *a)
+{
+	return ((a->sib_addr32[0] | a->sib_addr32[1] |
+		 a->sib_addr32[2] | (a->sib_addr32[3] ^ htonl(1))) == 0);
+}
+
+static inline void ib_addr_set(struct ib_addr *addr,
+			       __be32 w1, __be32 w2, __be32 w3, __be32 w4)
+{
+	addr->sib_addr32[0] = w1;
+	addr->sib_addr32[1] = w2;
+	addr->sib_addr32[2] = w3;
+	addr->sib_addr32[3] = w4;
+}
+
+static inline int ib_addr_cmp(const struct ib_addr *a1, const struct ib_addr *a2)
+{
+	return memcmp(a1, a2, sizeof(struct ib_addr));
+}
+
+struct sockaddr_ib {
+	unsigned short int	sib_family;	/* AF_IB */
+	__be16			sib_pkey;
+	__be32			sib_flowinfo;
+	struct ib_addr		sib_addr;
+	__be64			sib_sid;
+	__be64			sib_sid_mask;
+	__u64			sib_scope_id;
+};
+
+#endif /* _RDMA_IB_H */
diff --git a/include/rdma/ib_addr.h b/include/rdma/ib_addr.h
new file mode 100644
index 0000000..ce55906
--- /dev/null
+++ b/include/rdma/ib_addr.h
@@ -0,0 +1,312 @@
+/*
+ * Copyright (c) 2005 Voltaire Inc.  All rights reserved.
+ * Copyright (c) 2005 Intel Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if !defined(IB_ADDR_H)
+#define IB_ADDR_H
+
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/if_arp.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/socket.h>
+#include <linux/if_vlan.h>
+#include <net/ipv6.h>
+#include <net/if_inet6.h>
+#include <net/ip.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_pack.h>
+#include <net/ipv6.h>
+
+struct rdma_addr_client {
+	atomic_t refcount;
+	struct completion comp;
+};
+
+/**
+ * rdma_addr_register_client - Register an address client.
+ */
+void rdma_addr_register_client(struct rdma_addr_client *client);
+
+/**
+ * rdma_addr_unregister_client - Deregister an address client.
+ * @client: Client object to deregister.
+ */
+void rdma_addr_unregister_client(struct rdma_addr_client *client);
+
+struct rdma_dev_addr {
+	unsigned char src_dev_addr[MAX_ADDR_LEN];
+	unsigned char dst_dev_addr[MAX_ADDR_LEN];
+	unsigned char broadcast[MAX_ADDR_LEN];
+	unsigned short dev_type;
+	int bound_dev_if;
+	enum rdma_transport_type transport;
+};
+
+/**
+ * rdma_translate_ip - Translate a local IP address to an RDMA hardware
+ *   address.
+ */
+int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr,
+		      u16 *vlan_id);
+
+/**
+ * rdma_resolve_ip - Resolve source and destination IP addresses to
+ *   RDMA hardware addresses.
+ * @client: Address client associated with request.
+ * @src_addr: An optional source address to use in the resolution.  If a
+ *   source address is not provided, a usable address will be returned via
+ *   the callback.
+ * @dst_addr: The destination address to resolve.
+ * @addr: A reference to a data location that will receive the resolved
+ *   addresses.  The data location must remain valid until the callback has
+ *   been invoked.
+ * @timeout_ms: Amount of time to wait for the address resolution to complete.
+ * @callback: Call invoked once address resolution has completed, timed out,
+ *   or been canceled.  A status of 0 indicates success.
+ * @context: User-specified context associated with the call.
+ */
+int rdma_resolve_ip(struct rdma_addr_client *client,
+		    struct sockaddr *src_addr, struct sockaddr *dst_addr,
+		    struct rdma_dev_addr *addr, int timeout_ms,
+		    void (*callback)(int status, struct sockaddr *src_addr,
+				     struct rdma_dev_addr *addr, void *context),
+		    void *context);
+
+void rdma_addr_cancel(struct rdma_dev_addr *addr);
+
+int rdma_copy_addr(struct rdma_dev_addr *dev_addr, struct net_device *dev,
+	      const unsigned char *dst_dev_addr);
+
+int rdma_addr_size(struct sockaddr *addr);
+
+int rdma_addr_find_smac_by_sgid(union ib_gid *sgid, u8 *smac, u16 *vlan_id);
+int rdma_addr_find_dmac_by_grh(union ib_gid *sgid, union ib_gid *dgid, u8 *smac,
+			       u16 *vlan_id);
+
+static inline u16 ib_addr_get_pkey(struct rdma_dev_addr *dev_addr)
+{
+	return ((u16)dev_addr->broadcast[8] << 8) | (u16)dev_addr->broadcast[9];
+}
+
+static inline void ib_addr_set_pkey(struct rdma_dev_addr *dev_addr, u16 pkey)
+{
+	dev_addr->broadcast[8] = pkey >> 8;
+	dev_addr->broadcast[9] = (unsigned char) pkey;
+}
+
+static inline void ib_addr_get_mgid(struct rdma_dev_addr *dev_addr,
+				    union ib_gid *gid)
+{
+	memcpy(gid, dev_addr->broadcast + 4, sizeof *gid);
+}
+
+static inline int rdma_addr_gid_offset(struct rdma_dev_addr *dev_addr)
+{
+	return dev_addr->dev_type == ARPHRD_INFINIBAND ? 4 : 0;
+}
+
+static inline u16 rdma_vlan_dev_vlan_id(const struct net_device *dev)
+{
+	return dev->priv_flags & IFF_802_1Q_VLAN ?
+		vlan_dev_vlan_id(dev) : 0xffff;
+}
+
+static inline int rdma_ip2gid(struct sockaddr *addr, union ib_gid *gid)
+{
+	switch (addr->sa_family) {
+	case AF_INET:
+		ipv6_addr_set_v4mapped(((struct sockaddr_in *)
+					addr)->sin_addr.s_addr,
+				       (struct in6_addr *)gid);
+		break;
+	case AF_INET6:
+		memcpy(gid->raw, &((struct sockaddr_in6 *)addr)->sin6_addr, 16);
+		break;
+	default:
+		return -EINVAL;
+	}
+	return 0;
+}
+
+/* Important - sockaddr should be a union of sockaddr_in and sockaddr_in6 */
+static inline int rdma_gid2ip(struct sockaddr *out, union ib_gid *gid)
+{
+	if (ipv6_addr_v4mapped((struct in6_addr *)gid)) {
+		struct sockaddr_in *out_in = (struct sockaddr_in *)out;
+		memset(out_in, 0, sizeof(*out_in));
+		out_in->sin_family = AF_INET;
+		memcpy(&out_in->sin_addr.s_addr, gid->raw + 12, 4);
+	} else {
+		struct sockaddr_in6 *out_in = (struct sockaddr_in6 *)out;
+		memset(out_in, 0, sizeof(*out_in));
+		out_in->sin6_family = AF_INET6;
+		memcpy(&out_in->sin6_addr.s6_addr, gid->raw, 16);
+	}
+	return 0;
+}
+
+static inline void iboe_addr_get_sgid(struct rdma_dev_addr *dev_addr,
+				      union ib_gid *gid)
+{
+	struct net_device *dev;
+	struct in_device *ip4;
+
+	dev = dev_get_by_index(&init_net, dev_addr->bound_dev_if);
+	if (dev) {
+		ip4 = (struct in_device *)dev->ip_ptr;
+		if (ip4 && ip4->ifa_list && ip4->ifa_list->ifa_address)
+			ipv6_addr_set_v4mapped(ip4->ifa_list->ifa_address,
+					       (struct in6_addr *)gid);
+		dev_put(dev);
+	}
+}
+
+static inline void rdma_addr_get_sgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid)
+{
+	if (dev_addr->transport == RDMA_TRANSPORT_IB &&
+	    dev_addr->dev_type != ARPHRD_INFINIBAND)
+		iboe_addr_get_sgid(dev_addr, gid);
+	else
+		memcpy(gid, dev_addr->src_dev_addr +
+		       rdma_addr_gid_offset(dev_addr), sizeof *gid);
+}
+
+static inline void rdma_addr_set_sgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid)
+{
+	memcpy(dev_addr->src_dev_addr + rdma_addr_gid_offset(dev_addr), gid, sizeof *gid);
+}
+
+static inline void rdma_addr_get_dgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid)
+{
+	memcpy(gid, dev_addr->dst_dev_addr + rdma_addr_gid_offset(dev_addr), sizeof *gid);
+}
+
+static inline void rdma_addr_set_dgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid)
+{
+	memcpy(dev_addr->dst_dev_addr + rdma_addr_gid_offset(dev_addr), gid, sizeof *gid);
+}
+
+static inline enum ib_mtu iboe_get_mtu(int mtu)
+{
+	/*
+	 * reduce IB headers from effective IBoE MTU. 28 stands for
+	 * atomic header which is the biggest possible header after BTH
+	 */
+	mtu = mtu - IB_GRH_BYTES - IB_BTH_BYTES - 28;
+
+	if (mtu >= ib_mtu_enum_to_int(IB_MTU_4096))
+		return IB_MTU_4096;
+	else if (mtu >= ib_mtu_enum_to_int(IB_MTU_2048))
+		return IB_MTU_2048;
+	else if (mtu >= ib_mtu_enum_to_int(IB_MTU_1024))
+		return IB_MTU_1024;
+	else if (mtu >= ib_mtu_enum_to_int(IB_MTU_512))
+		return IB_MTU_512;
+	else if (mtu >= ib_mtu_enum_to_int(IB_MTU_256))
+		return IB_MTU_256;
+	else
+		return 0;
+}
+
+static inline int iboe_get_rate(struct net_device *dev)
+{
+	struct ethtool_cmd cmd;
+	u32 speed;
+	int err;
+
+	rtnl_lock();
+	err = __ethtool_get_settings(dev, &cmd);
+	rtnl_unlock();
+	if (err)
+		return IB_RATE_PORT_CURRENT;
+
+	speed = ethtool_cmd_speed(&cmd);
+	if (speed >= 40000)
+		return IB_RATE_40_GBPS;
+	else if (speed >= 30000)
+		return IB_RATE_30_GBPS;
+	else if (speed >= 20000)
+		return IB_RATE_20_GBPS;
+	else if (speed >= 10000)
+		return IB_RATE_10_GBPS;
+	else
+		return IB_RATE_PORT_CURRENT;
+}
+
+static inline int rdma_link_local_addr(struct in6_addr *addr)
+{
+	if (addr->s6_addr32[0] == htonl(0xfe800000) &&
+	    addr->s6_addr32[1] == 0)
+		return 1;
+
+	return 0;
+}
+
+static inline void rdma_get_ll_mac(struct in6_addr *addr, u8 *mac)
+{
+	memcpy(mac, &addr->s6_addr[8], 3);
+	memcpy(mac + 3, &addr->s6_addr[13], 3);
+	mac[0] ^= 2;
+}
+
+static inline int rdma_is_multicast_addr(struct in6_addr *addr)
+{
+	return addr->s6_addr[0] == 0xff;
+}
+
+static inline void rdma_get_mcast_mac(struct in6_addr *addr, u8 *mac)
+{
+	int i;
+
+	mac[0] = 0x33;
+	mac[1] = 0x33;
+	for (i = 2; i < 6; ++i)
+		mac[i] = addr->s6_addr[i + 10];
+}
+
+static inline u16 rdma_get_vlan_id(union ib_gid *dgid)
+{
+	u16 vid;
+
+	vid = dgid->raw[11] << 8 | dgid->raw[12];
+	return vid < 0x1000 ? vid : 0xffff;
+}
+
+static inline struct net_device *rdma_vlan_dev_real_dev(const struct net_device *dev)
+{
+	return dev->priv_flags & IFF_802_1Q_VLAN ?
+		vlan_dev_real_dev(dev) : NULL;
+}
+
+#endif /* IB_ADDR_H */
diff --git a/include/rdma/ib_cache.h b/include/rdma/ib_cache.h
new file mode 100644
index 0000000..ad9a3c2
--- /dev/null
+++ b/include/rdma/ib_cache.h
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Intel Corporation. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _IB_CACHE_H
+#define _IB_CACHE_H
+
+#include <rdma/ib_verbs.h>
+
+/**
+ * ib_get_cached_gid - Returns a cached GID table entry
+ * @device: The device to query.
+ * @port_num: The port number of the device to query.
+ * @index: The index into the cached GID table to query.
+ * @gid: The GID value found at the specified index.
+ *
+ * ib_get_cached_gid() fetches the specified GID table entry stored in
+ * the local software cache.
+ */
+int ib_get_cached_gid(struct ib_device    *device,
+		      u8                   port_num,
+		      int                  index,
+		      union ib_gid        *gid);
+
+/**
+ * ib_find_cached_gid - Returns the port number and GID table index where
+ *   a specified GID value occurs.
+ * @device: The device to query.
+ * @gid: The GID value to search for.
+ * @port_num: The port number of the device where the GID value was found.
+ * @index: The index into the cached GID table where the GID was found.  This
+ *   parameter may be NULL.
+ *
+ * ib_find_cached_gid() searches for the specified GID value in
+ * the local software cache.
+ */
+int ib_find_cached_gid(struct ib_device *device,
+		       union ib_gid	*gid,
+		       u8               *port_num,
+		       u16              *index);
+
+/**
+ * ib_get_cached_pkey - Returns a cached PKey table entry
+ * @device: The device to query.
+ * @port_num: The port number of the device to query.
+ * @index: The index into the cached PKey table to query.
+ * @pkey: The PKey value found at the specified index.
+ *
+ * ib_get_cached_pkey() fetches the specified PKey table entry stored in
+ * the local software cache.
+ */
+int ib_get_cached_pkey(struct ib_device    *device_handle,
+		       u8                   port_num,
+		       int                  index,
+		       u16                 *pkey);
+
+/**
+ * ib_find_cached_pkey - Returns the PKey table index where a specified
+ *   PKey value occurs.
+ * @device: The device to query.
+ * @port_num: The port number of the device to search for the PKey.
+ * @pkey: The PKey value to search for.
+ * @index: The index into the cached PKey table where the PKey was found.
+ *
+ * ib_find_cached_pkey() searches the specified PKey table in
+ * the local software cache.
+ */
+int ib_find_cached_pkey(struct ib_device    *device,
+			u8                   port_num,
+			u16                  pkey,
+			u16                 *index);
+
+/**
+ * ib_find_exact_cached_pkey - Returns the PKey table index where a specified
+ *   PKey value occurs. Comparison uses the FULL 16 bits (incl membership bit)
+ * @device: The device to query.
+ * @port_num: The port number of the device to search for the PKey.
+ * @pkey: The PKey value to search for.
+ * @index: The index into the cached PKey table where the PKey was found.
+ *
+ * ib_find_exact_cached_pkey() searches the specified PKey table in
+ * the local software cache.
+ */
+int ib_find_exact_cached_pkey(struct ib_device    *device,
+			      u8                   port_num,
+			      u16                  pkey,
+			      u16                 *index);
+
+/**
+ * ib_get_cached_lmc - Returns a cached lmc table entry
+ * @device: The device to query.
+ * @port_num: The port number of the device to query.
+ * @lmc: The lmc value for the specified port for that device.
+ *
+ * ib_get_cached_lmc() fetches the specified lmc table entry stored in
+ * the local software cache.
+ */
+int ib_get_cached_lmc(struct ib_device *device,
+		      u8                port_num,
+		      u8                *lmc);
+
+#endif /* _IB_CACHE_H */
diff --git a/include/rdma/ib_cm.h b/include/rdma/ib_cm.h
new file mode 100644
index 0000000..0e3ff30
--- /dev/null
+++ b/include/rdma/ib_cm.h
@@ -0,0 +1,604 @@
+/*
+ * Copyright (c) 2004, 2005 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation.  All rights reserved.
+ * Copyright (c) 2004 Voltaire Corporation.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if !defined(IB_CM_H)
+#define IB_CM_H
+
+#include <rdma/ib_mad.h>
+#include <rdma/ib_sa.h>
+
+/* ib_cm and ib_user_cm modules share /sys/class/infiniband_cm */
+extern struct class cm_class;
+
+enum ib_cm_state {
+	IB_CM_IDLE,
+	IB_CM_LISTEN,
+	IB_CM_REQ_SENT,
+	IB_CM_REQ_RCVD,
+	IB_CM_MRA_REQ_SENT,
+	IB_CM_MRA_REQ_RCVD,
+	IB_CM_REP_SENT,
+	IB_CM_REP_RCVD,
+	IB_CM_MRA_REP_SENT,
+	IB_CM_MRA_REP_RCVD,
+	IB_CM_ESTABLISHED,
+	IB_CM_DREQ_SENT,
+	IB_CM_DREQ_RCVD,
+	IB_CM_TIMEWAIT,
+	IB_CM_SIDR_REQ_SENT,
+	IB_CM_SIDR_REQ_RCVD
+};
+
+enum ib_cm_lap_state {
+	IB_CM_LAP_UNINIT,
+	IB_CM_LAP_IDLE,
+	IB_CM_LAP_SENT,
+	IB_CM_LAP_RCVD,
+	IB_CM_MRA_LAP_SENT,
+	IB_CM_MRA_LAP_RCVD,
+};
+
+enum ib_cm_event_type {
+	IB_CM_REQ_ERROR,
+	IB_CM_REQ_RECEIVED,
+	IB_CM_REP_ERROR,
+	IB_CM_REP_RECEIVED,
+	IB_CM_RTU_RECEIVED,
+	IB_CM_USER_ESTABLISHED,
+	IB_CM_DREQ_ERROR,
+	IB_CM_DREQ_RECEIVED,
+	IB_CM_DREP_RECEIVED,
+	IB_CM_TIMEWAIT_EXIT,
+	IB_CM_MRA_RECEIVED,
+	IB_CM_REJ_RECEIVED,
+	IB_CM_LAP_ERROR,
+	IB_CM_LAP_RECEIVED,
+	IB_CM_APR_RECEIVED,
+	IB_CM_SIDR_REQ_ERROR,
+	IB_CM_SIDR_REQ_RECEIVED,
+	IB_CM_SIDR_REP_RECEIVED
+};
+
+enum ib_cm_data_size {
+	IB_CM_REQ_PRIVATE_DATA_SIZE	 = 92,
+	IB_CM_MRA_PRIVATE_DATA_SIZE	 = 222,
+	IB_CM_REJ_PRIVATE_DATA_SIZE	 = 148,
+	IB_CM_REP_PRIVATE_DATA_SIZE	 = 196,
+	IB_CM_RTU_PRIVATE_DATA_SIZE	 = 224,
+	IB_CM_DREQ_PRIVATE_DATA_SIZE	 = 220,
+	IB_CM_DREP_PRIVATE_DATA_SIZE	 = 224,
+	IB_CM_REJ_ARI_LENGTH		 = 72,
+	IB_CM_LAP_PRIVATE_DATA_SIZE	 = 168,
+	IB_CM_APR_PRIVATE_DATA_SIZE	 = 148,
+	IB_CM_APR_INFO_LENGTH		 = 72,
+	IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE = 216,
+	IB_CM_SIDR_REP_PRIVATE_DATA_SIZE = 136,
+	IB_CM_SIDR_REP_INFO_LENGTH	 = 72,
+	IB_CM_COMPARE_SIZE		 = 64
+};
+
+struct ib_cm_id;
+
+struct ib_cm_req_event_param {
+	struct ib_cm_id		*listen_id;
+	u8			port;
+
+	struct ib_sa_path_rec	*primary_path;
+	struct ib_sa_path_rec	*alternate_path;
+
+	__be64			remote_ca_guid;
+	u32			remote_qkey;
+	u32			remote_qpn;
+	enum ib_qp_type		qp_type;
+
+	u32			starting_psn;
+	u8			responder_resources;
+	u8			initiator_depth;
+	unsigned int		local_cm_response_timeout:5;
+	unsigned int		flow_control:1;
+	unsigned int		remote_cm_response_timeout:5;
+	unsigned int		retry_count:3;
+	unsigned int		rnr_retry_count:3;
+	unsigned int		srq:1;
+};
+
+struct ib_cm_rep_event_param {
+	__be64			remote_ca_guid;
+	u32			remote_qkey;
+	u32			remote_qpn;
+	u32			starting_psn;
+	u8			responder_resources;
+	u8			initiator_depth;
+	unsigned int		target_ack_delay:5;
+	unsigned int		failover_accepted:2;
+	unsigned int		flow_control:1;
+	unsigned int		rnr_retry_count:3;
+	unsigned int		srq:1;
+};
+
+enum ib_cm_rej_reason {
+	IB_CM_REJ_NO_QP				= 1,
+	IB_CM_REJ_NO_EEC			= 2,
+	IB_CM_REJ_NO_RESOURCES			= 3,
+	IB_CM_REJ_TIMEOUT			= 4,
+	IB_CM_REJ_UNSUPPORTED			= 5,
+	IB_CM_REJ_INVALID_COMM_ID		= 6,
+	IB_CM_REJ_INVALID_COMM_INSTANCE		= 7,
+	IB_CM_REJ_INVALID_SERVICE_ID		= 8,
+	IB_CM_REJ_INVALID_TRANSPORT_TYPE	= 9,
+	IB_CM_REJ_STALE_CONN			= 10,
+	IB_CM_REJ_RDC_NOT_EXIST			= 11,
+	IB_CM_REJ_INVALID_GID			= 12,
+	IB_CM_REJ_INVALID_LID			= 13,
+	IB_CM_REJ_INVALID_SL			= 14,
+	IB_CM_REJ_INVALID_TRAFFIC_CLASS		= 15,
+	IB_CM_REJ_INVALID_HOP_LIMIT		= 16,
+	IB_CM_REJ_INVALID_PACKET_RATE		= 17,
+	IB_CM_REJ_INVALID_ALT_GID		= 18,
+	IB_CM_REJ_INVALID_ALT_LID		= 19,
+	IB_CM_REJ_INVALID_ALT_SL		= 20,
+	IB_CM_REJ_INVALID_ALT_TRAFFIC_CLASS	= 21,
+	IB_CM_REJ_INVALID_ALT_HOP_LIMIT		= 22,
+	IB_CM_REJ_INVALID_ALT_PACKET_RATE	= 23,
+	IB_CM_REJ_PORT_CM_REDIRECT		= 24,
+	IB_CM_REJ_PORT_REDIRECT			= 25,
+	IB_CM_REJ_INVALID_MTU			= 26,
+	IB_CM_REJ_INSUFFICIENT_RESP_RESOURCES	= 27,
+	IB_CM_REJ_CONSUMER_DEFINED		= 28,
+	IB_CM_REJ_INVALID_RNR_RETRY		= 29,
+	IB_CM_REJ_DUPLICATE_LOCAL_COMM_ID	= 30,
+	IB_CM_REJ_INVALID_CLASS_VERSION		= 31,
+	IB_CM_REJ_INVALID_FLOW_LABEL		= 32,
+	IB_CM_REJ_INVALID_ALT_FLOW_LABEL	= 33
+};
+
+struct ib_cm_rej_event_param {
+	enum ib_cm_rej_reason	reason;
+	void			*ari;
+	u8			ari_length;
+};
+
+struct ib_cm_mra_event_param {
+	u8	service_timeout;
+};
+
+struct ib_cm_lap_event_param {
+	struct ib_sa_path_rec	*alternate_path;
+};
+
+enum ib_cm_apr_status {
+	IB_CM_APR_SUCCESS,
+	IB_CM_APR_INVALID_COMM_ID,
+	IB_CM_APR_UNSUPPORTED,
+	IB_CM_APR_REJECT,
+	IB_CM_APR_REDIRECT,
+	IB_CM_APR_IS_CURRENT,
+	IB_CM_APR_INVALID_QPN_EECN,
+	IB_CM_APR_INVALID_LID,
+	IB_CM_APR_INVALID_GID,
+	IB_CM_APR_INVALID_FLOW_LABEL,
+	IB_CM_APR_INVALID_TCLASS,
+	IB_CM_APR_INVALID_HOP_LIMIT,
+	IB_CM_APR_INVALID_PACKET_RATE,
+	IB_CM_APR_INVALID_SL
+};
+
+struct ib_cm_apr_event_param {
+	enum ib_cm_apr_status	ap_status;
+	void			*apr_info;
+	u8			info_len;
+};
+
+struct ib_cm_sidr_req_event_param {
+	struct ib_cm_id		*listen_id;
+	u8			port;
+	u16			pkey;
+};
+
+enum ib_cm_sidr_status {
+	IB_SIDR_SUCCESS,
+	IB_SIDR_UNSUPPORTED,
+	IB_SIDR_REJECT,
+	IB_SIDR_NO_QP,
+	IB_SIDR_REDIRECT,
+	IB_SIDR_UNSUPPORTED_VERSION
+};
+
+struct ib_cm_sidr_rep_event_param {
+	enum ib_cm_sidr_status	status;
+	u32			qkey;
+	u32			qpn;
+	void			*info;
+	u8			info_len;
+};
+
+struct ib_cm_event {
+	enum ib_cm_event_type	event;
+	union {
+		struct ib_cm_req_event_param	req_rcvd;
+		struct ib_cm_rep_event_param	rep_rcvd;
+		/* No data for RTU received events. */
+		struct ib_cm_rej_event_param	rej_rcvd;
+		struct ib_cm_mra_event_param	mra_rcvd;
+		struct ib_cm_lap_event_param	lap_rcvd;
+		struct ib_cm_apr_event_param	apr_rcvd;
+		/* No data for DREQ/DREP received events. */
+		struct ib_cm_sidr_req_event_param sidr_req_rcvd;
+		struct ib_cm_sidr_rep_event_param sidr_rep_rcvd;
+		enum ib_wc_status		send_status;
+	} param;
+
+	void			*private_data;
+};
+
+#define CM_REQ_ATTR_ID		cpu_to_be16(0x0010)
+#define CM_MRA_ATTR_ID		cpu_to_be16(0x0011)
+#define CM_REJ_ATTR_ID		cpu_to_be16(0x0012)
+#define CM_REP_ATTR_ID		cpu_to_be16(0x0013)
+#define CM_RTU_ATTR_ID		cpu_to_be16(0x0014)
+#define CM_DREQ_ATTR_ID		cpu_to_be16(0x0015)
+#define CM_DREP_ATTR_ID		cpu_to_be16(0x0016)
+#define CM_SIDR_REQ_ATTR_ID	cpu_to_be16(0x0017)
+#define CM_SIDR_REP_ATTR_ID	cpu_to_be16(0x0018)
+#define CM_LAP_ATTR_ID		cpu_to_be16(0x0019)
+#define CM_APR_ATTR_ID		cpu_to_be16(0x001A)
+
+/**
+ * ib_cm_handler - User-defined callback to process communication events.
+ * @cm_id: Communication identifier associated with the reported event.
+ * @event: Information about the communication event.
+ *
+ * IB_CM_REQ_RECEIVED and IB_CM_SIDR_REQ_RECEIVED communication events
+ * generated as a result of listen requests result in the allocation of a
+ * new @cm_id.  The new @cm_id is returned to the user through this callback.
+ * Clients are responsible for destroying the new @cm_id.  For peer-to-peer
+ * IB_CM_REQ_RECEIVED and all other events, the returned @cm_id corresponds
+ * to a user's existing communication identifier.
+ *
+ * Users may not call ib_destroy_cm_id while in the context of this callback;
+ * however, returning a non-zero value instructs the communication manager to
+ * destroy the @cm_id after the callback completes.
+ */
+typedef int (*ib_cm_handler)(struct ib_cm_id *cm_id,
+			     struct ib_cm_event *event);
+
+struct ib_cm_id {
+	ib_cm_handler		cm_handler;
+	void			*context;
+	struct ib_device	*device;
+	__be64			service_id;
+	__be64			service_mask;
+	enum ib_cm_state	state;		/* internal CM/debug use */
+	enum ib_cm_lap_state	lap_state;	/* internal CM/debug use */
+	__be32			local_id;
+	__be32			remote_id;
+	u32			remote_cm_qpn;  /* 1 unless redirected */
+};
+
+/**
+ * ib_create_cm_id - Allocate a communication identifier.
+ * @device: Device associated with the cm_id.  All related communication will
+ * be associated with the specified device.
+ * @cm_handler: Callback invoked to notify the user of CM events.
+ * @context: User specified context associated with the communication
+ *   identifier.
+ *
+ * Communication identifiers are used to track connection states, service
+ * ID resolution requests, and listen requests.
+ */
+struct ib_cm_id *ib_create_cm_id(struct ib_device *device,
+				 ib_cm_handler cm_handler,
+				 void *context);
+
+/**
+ * ib_destroy_cm_id - Destroy a connection identifier.
+ * @cm_id: Connection identifier to destroy.
+ *
+ * This call blocks until the connection identifier is destroyed.
+ */
+void ib_destroy_cm_id(struct ib_cm_id *cm_id);
+
+#define IB_SERVICE_ID_AGN_MASK	cpu_to_be64(0xFF00000000000000ULL)
+#define IB_CM_ASSIGN_SERVICE_ID	cpu_to_be64(0x0200000000000000ULL)
+#define IB_CMA_SERVICE_ID	cpu_to_be64(0x0000000001000000ULL)
+#define IB_CMA_SERVICE_ID_MASK	cpu_to_be64(0xFFFFFFFFFF000000ULL)
+#define IB_SDP_SERVICE_ID	cpu_to_be64(0x0000000000010000ULL)
+#define IB_SDP_SERVICE_ID_MASK	cpu_to_be64(0xFFFFFFFFFFFF0000ULL)
+
+struct ib_cm_compare_data {
+	u8  data[IB_CM_COMPARE_SIZE];
+	u8  mask[IB_CM_COMPARE_SIZE];
+};
+
+/**
+ * ib_cm_listen - Initiates listening on the specified service ID for
+ *   connection and service ID resolution requests.
+ * @cm_id: Connection identifier associated with the listen request.
+ * @service_id: Service identifier matched against incoming connection
+ *   and service ID resolution requests.  The service ID should be specified
+ *   network-byte order.  If set to IB_CM_ASSIGN_SERVICE_ID, the CM will
+ *   assign a service ID to the caller.
+ * @service_mask: Mask applied to service ID used to listen across a
+ *   range of service IDs.  If set to 0, the service ID is matched
+ *   exactly.  This parameter is ignored if %service_id is set to
+ *   IB_CM_ASSIGN_SERVICE_ID.
+ * @compare_data: This parameter is optional.  It specifies data that must
+ *   appear in the private data of a connection request for the specified
+ *   listen request.
+ */
+int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask,
+		 struct ib_cm_compare_data *compare_data);
+
+struct ib_cm_req_param {
+	struct ib_sa_path_rec	*primary_path;
+	struct ib_sa_path_rec	*alternate_path;
+	__be64			service_id;
+	u32			qp_num;
+	enum ib_qp_type		qp_type;
+	u32			starting_psn;
+	const void		*private_data;
+	u8			private_data_len;
+	u8			peer_to_peer;
+	u8			responder_resources;
+	u8			initiator_depth;
+	u8			remote_cm_response_timeout;
+	u8			flow_control;
+	u8			local_cm_response_timeout;
+	u8			retry_count;
+	u8			rnr_retry_count;
+	u8			max_cm_retries;
+	u8			srq;
+};
+
+/**
+ * ib_send_cm_req - Sends a connection request to the remote node.
+ * @cm_id: Connection identifier that will be associated with the
+ *   connection request.
+ * @param: Connection request information needed to establish the
+ *   connection.
+ */
+int ib_send_cm_req(struct ib_cm_id *cm_id,
+		   struct ib_cm_req_param *param);
+
+struct ib_cm_rep_param {
+	u32		qp_num;
+	u32		starting_psn;
+	const void	*private_data;
+	u8		private_data_len;
+	u8		responder_resources;
+	u8		initiator_depth;
+	u8		failover_accepted;
+	u8		flow_control;
+	u8		rnr_retry_count;
+	u8		srq;
+};
+
+/**
+ * ib_send_cm_rep - Sends a connection reply in response to a connection
+ *   request.
+ * @cm_id: Connection identifier that will be associated with the
+ *   connection request.
+ * @param: Connection reply information needed to establish the
+ *   connection.
+ */
+int ib_send_cm_rep(struct ib_cm_id *cm_id,
+		   struct ib_cm_rep_param *param);
+
+/**
+ * ib_send_cm_rtu - Sends a connection ready to use message in response
+ *   to a connection reply message.
+ * @cm_id: Connection identifier associated with the connection request.
+ * @private_data: Optional user-defined private data sent with the
+ *   ready to use message.
+ * @private_data_len: Size of the private data buffer, in bytes.
+ */
+int ib_send_cm_rtu(struct ib_cm_id *cm_id,
+		   const void *private_data,
+		   u8 private_data_len);
+
+/**
+ * ib_send_cm_dreq - Sends a disconnection request for an existing
+ *   connection.
+ * @cm_id: Connection identifier associated with the connection being
+ *   released.
+ * @private_data: Optional user-defined private data sent with the
+ *   disconnection request message.
+ * @private_data_len: Size of the private data buffer, in bytes.
+ */
+int ib_send_cm_dreq(struct ib_cm_id *cm_id,
+		    const void *private_data,
+		    u8 private_data_len);
+
+/**
+ * ib_send_cm_drep - Sends a disconnection reply to a disconnection request.
+ * @cm_id: Connection identifier associated with the connection being
+ *   released.
+ * @private_data: Optional user-defined private data sent with the
+ *   disconnection reply message.
+ * @private_data_len: Size of the private data buffer, in bytes.
+ *
+ * If the cm_id is in the correct state, the CM will transition the connection
+ * to the timewait state, even if an error occurs sending the DREP message.
+ */
+int ib_send_cm_drep(struct ib_cm_id *cm_id,
+		    const void *private_data,
+		    u8 private_data_len);
+
+/**
+ * ib_cm_notify - Notifies the CM of an event reported to the consumer.
+ * @cm_id: Connection identifier to transition to established.
+ * @event: Type of event.
+ *
+ * This routine should be invoked by users to notify the CM of relevant
+ * communication events.  Events that should be reported to the CM and
+ * when to report them are:
+ *
+ * IB_EVENT_COMM_EST - Used when a message is received on a connected
+ *    QP before an RTU has been received.
+ * IB_EVENT_PATH_MIG - Notifies the CM that the connection has failed over
+ *   to the alternate path.
+ */
+int ib_cm_notify(struct ib_cm_id *cm_id, enum ib_event_type event);
+
+/**
+ * ib_send_cm_rej - Sends a connection rejection message to the
+ *   remote node.
+ * @cm_id: Connection identifier associated with the connection being
+ *   rejected.
+ * @reason: Reason for the connection request rejection.
+ * @ari: Optional additional rejection information.
+ * @ari_length: Size of the additional rejection information, in bytes.
+ * @private_data: Optional user-defined private data sent with the
+ *   rejection message.
+ * @private_data_len: Size of the private data buffer, in bytes.
+ */
+int ib_send_cm_rej(struct ib_cm_id *cm_id,
+		   enum ib_cm_rej_reason reason,
+		   void *ari,
+		   u8 ari_length,
+		   const void *private_data,
+		   u8 private_data_len);
+
+#define IB_CM_MRA_FLAG_DELAY 0x80  /* Send MRA only after a duplicate msg */
+
+/**
+ * ib_send_cm_mra - Sends a message receipt acknowledgement to a connection
+ *   message.
+ * @cm_id: Connection identifier associated with the connection message.
+ * @service_timeout: The lower 5-bits specify the maximum time required for
+ *   the sender to reply to the connection message.  The upper 3-bits
+ *   specify additional control flags.
+ * @private_data: Optional user-defined private data sent with the
+ *   message receipt acknowledgement.
+ * @private_data_len: Size of the private data buffer, in bytes.
+ */
+int ib_send_cm_mra(struct ib_cm_id *cm_id,
+		   u8 service_timeout,
+		   const void *private_data,
+		   u8 private_data_len);
+
+/**
+ * ib_send_cm_lap - Sends a load alternate path request.
+ * @cm_id: Connection identifier associated with the load alternate path
+ *   message.
+ * @alternate_path: A path record that identifies the alternate path to
+ *   load.
+ * @private_data: Optional user-defined private data sent with the
+ *   load alternate path message.
+ * @private_data_len: Size of the private data buffer, in bytes.
+ */
+int ib_send_cm_lap(struct ib_cm_id *cm_id,
+		   struct ib_sa_path_rec *alternate_path,
+		   const void *private_data,
+		   u8 private_data_len);
+
+/**
+ * ib_cm_init_qp_attr - Initializes the QP attributes for use in transitioning
+ *   to a specified QP state.
+ * @cm_id: Communication identifier associated with the QP attributes to
+ *   initialize.
+ * @qp_attr: On input, specifies the desired QP state.  On output, the
+ *   mandatory and desired optional attributes will be set in order to
+ *   modify the QP to the specified state.
+ * @qp_attr_mask: The QP attribute mask that may be used to transition the
+ *   QP to the specified state.
+ *
+ * Users must set the @qp_attr->qp_state to the desired QP state.  This call
+ * will set all required attributes for the given transition, along with
+ * known optional attributes.  Users may override the attributes returned from
+ * this call before calling ib_modify_qp.
+ */
+int ib_cm_init_qp_attr(struct ib_cm_id *cm_id,
+		       struct ib_qp_attr *qp_attr,
+		       int *qp_attr_mask);
+
+/**
+ * ib_send_cm_apr - Sends an alternate path response message in response to
+ *   a load alternate path request.
+ * @cm_id: Connection identifier associated with the alternate path response.
+ * @status: Reply status sent with the alternate path response.
+ * @info: Optional additional information sent with the alternate path
+ *   response.
+ * @info_length: Size of the additional information, in bytes.
+ * @private_data: Optional user-defined private data sent with the
+ *   alternate path response message.
+ * @private_data_len: Size of the private data buffer, in bytes.
+ */
+int ib_send_cm_apr(struct ib_cm_id *cm_id,
+		   enum ib_cm_apr_status status,
+		   void *info,
+		   u8 info_length,
+		   const void *private_data,
+		   u8 private_data_len);
+
+struct ib_cm_sidr_req_param {
+	struct ib_sa_path_rec	*path;
+	__be64			service_id;
+	int			timeout_ms;
+	const void		*private_data;
+	u8			private_data_len;
+	u8			max_cm_retries;
+};
+
+/**
+ * ib_send_cm_sidr_req - Sends a service ID resolution request to the
+ *   remote node.
+ * @cm_id: Communication identifier that will be associated with the
+ *   service ID resolution request.
+ * @param: Service ID resolution request information.
+ */
+int ib_send_cm_sidr_req(struct ib_cm_id *cm_id,
+			struct ib_cm_sidr_req_param *param);
+
+struct ib_cm_sidr_rep_param {
+	u32			qp_num;
+	u32			qkey;
+	enum ib_cm_sidr_status	status;
+	const void		*info;
+	u8			info_length;
+	const void		*private_data;
+	u8			private_data_len;
+};
+
+/**
+ * ib_send_cm_sidr_rep - Sends a service ID resolution reply to the
+ *   remote node.
+ * @cm_id: Communication identifier associated with the received service ID
+ *   resolution request.
+ * @param: Service ID resolution reply information.
+ */
+int ib_send_cm_sidr_rep(struct ib_cm_id *cm_id,
+			struct ib_cm_sidr_rep_param *param);
+
+#endif /* IB_CM_H */
diff --git a/include/rdma/ib_fmr_pool.h b/include/rdma/ib_fmr_pool.h
new file mode 100644
index 0000000..f62b842
--- /dev/null
+++ b/include/rdma/ib_fmr_pool.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2004 Topspin Corporation.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if !defined(IB_FMR_POOL_H)
+#define IB_FMR_POOL_H
+
+#include <rdma/ib_verbs.h>
+
+struct ib_fmr_pool;
+
+/**
+ * struct ib_fmr_pool_param - Parameters for creating FMR pool
+ * @max_pages_per_fmr:Maximum number of pages per map request.
+ * @page_shift: Log2 of sizeof "pages" mapped by this fmr
+ * @access:Access flags for FMRs in pool.
+ * @pool_size:Number of FMRs to allocate for pool.
+ * @dirty_watermark:Flush is triggered when @dirty_watermark dirty
+ *     FMRs are present.
+ * @flush_function:Callback called when unmapped FMRs are flushed and
+ *     more FMRs are possibly available for mapping
+ * @flush_arg:Context passed to user's flush function.
+ * @cache:If set, FMRs may be reused after unmapping for identical map
+ *     requests.
+ */
+struct ib_fmr_pool_param {
+	int                     max_pages_per_fmr;
+	int                     page_shift;
+	enum ib_access_flags    access;
+	int                     pool_size;
+	int                     dirty_watermark;
+	void                  (*flush_function)(struct ib_fmr_pool *pool,
+						void               *arg);
+	void                   *flush_arg;
+	unsigned                cache:1;
+};
+
+struct ib_pool_fmr {
+	struct ib_fmr      *fmr;
+	struct ib_fmr_pool *pool;
+	struct list_head    list;
+	struct hlist_node   cache_node;
+	int                 ref_count;
+	int                 remap_count;
+	u64                 io_virtual_address;
+	int                 page_list_len;
+	u64                 page_list[0];
+};
+
+struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd             *pd,
+				       struct ib_fmr_pool_param *params);
+
+void ib_destroy_fmr_pool(struct ib_fmr_pool *pool);
+
+int ib_flush_fmr_pool(struct ib_fmr_pool *pool);
+
+struct ib_pool_fmr *ib_fmr_pool_map_phys(struct ib_fmr_pool *pool_handle,
+					 u64                *page_list,
+					 int                 list_len,
+					 u64                 io_virtual_address);
+
+int ib_fmr_pool_unmap(struct ib_pool_fmr *fmr);
+
+#endif /* IB_FMR_POOL_H */
diff --git a/include/rdma/ib_mad.h b/include/rdma/ib_mad.h
new file mode 100644
index 0000000..9bb99e9
--- /dev/null
+++ b/include/rdma/ib_mad.h
@@ -0,0 +1,680 @@
+/*
+ * Copyright (c) 2004 Mellanox Technologies Ltd.  All rights reserved.
+ * Copyright (c) 2004 Infinicon Corporation.  All rights reserved.
+ * Copyright (c) 2004 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation.  All rights reserved.
+ * Copyright (c) 2004-2006 Voltaire Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if !defined(IB_MAD_H)
+#define IB_MAD_H
+
+#include <linux/list.h>
+
+#include <rdma/ib_verbs.h>
+#include <uapi/rdma/ib_user_mad.h>
+
+/* Management base version */
+#define IB_MGMT_BASE_VERSION			1
+
+/* Management classes */
+#define IB_MGMT_CLASS_SUBN_LID_ROUTED		0x01
+#define IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE	0x81
+#define IB_MGMT_CLASS_SUBN_ADM			0x03
+#define IB_MGMT_CLASS_PERF_MGMT			0x04
+#define IB_MGMT_CLASS_BM			0x05
+#define IB_MGMT_CLASS_DEVICE_MGMT		0x06
+#define IB_MGMT_CLASS_CM			0x07
+#define IB_MGMT_CLASS_SNMP			0x08
+#define IB_MGMT_CLASS_DEVICE_ADM		0x10
+#define IB_MGMT_CLASS_BOOT_MGMT			0x11
+#define IB_MGMT_CLASS_BIS			0x12
+#define IB_MGMT_CLASS_CONG_MGMT			0x21
+#define IB_MGMT_CLASS_VENDOR_RANGE2_START	0x30
+#define IB_MGMT_CLASS_VENDOR_RANGE2_END		0x4F
+
+#define	IB_OPENIB_OUI				(0x001405)
+
+/* Management methods */
+#define IB_MGMT_METHOD_GET			0x01
+#define IB_MGMT_METHOD_SET			0x02
+#define IB_MGMT_METHOD_GET_RESP			0x81
+#define IB_MGMT_METHOD_SEND			0x03
+#define IB_MGMT_METHOD_TRAP			0x05
+#define IB_MGMT_METHOD_REPORT			0x06
+#define IB_MGMT_METHOD_REPORT_RESP		0x86
+#define IB_MGMT_METHOD_TRAP_REPRESS		0x07
+
+#define IB_MGMT_METHOD_RESP			0x80
+#define IB_BM_ATTR_MOD_RESP			cpu_to_be32(1)
+
+#define IB_MGMT_MAX_METHODS			128
+
+/* MAD Status field bit masks */
+#define IB_MGMT_MAD_STATUS_SUCCESS			0x0000
+#define IB_MGMT_MAD_STATUS_BUSY				0x0001
+#define IB_MGMT_MAD_STATUS_REDIRECT_REQD		0x0002
+#define IB_MGMT_MAD_STATUS_BAD_VERSION			0x0004
+#define IB_MGMT_MAD_STATUS_UNSUPPORTED_METHOD		0x0008
+#define IB_MGMT_MAD_STATUS_UNSUPPORTED_METHOD_ATTRIB	0x000c
+#define IB_MGMT_MAD_STATUS_INVALID_ATTRIB_VALUE		0x001c
+
+/* RMPP information */
+#define IB_MGMT_RMPP_VERSION			1
+
+#define IB_MGMT_RMPP_TYPE_DATA			1
+#define IB_MGMT_RMPP_TYPE_ACK			2
+#define IB_MGMT_RMPP_TYPE_STOP			3
+#define IB_MGMT_RMPP_TYPE_ABORT			4
+
+#define IB_MGMT_RMPP_FLAG_ACTIVE		1
+#define IB_MGMT_RMPP_FLAG_FIRST			(1<<1)
+#define IB_MGMT_RMPP_FLAG_LAST			(1<<2)
+
+#define IB_MGMT_RMPP_NO_RESPTIME		0x1F
+
+#define	IB_MGMT_RMPP_STATUS_SUCCESS		0
+#define	IB_MGMT_RMPP_STATUS_RESX		1
+#define	IB_MGMT_RMPP_STATUS_ABORT_MIN		118
+#define	IB_MGMT_RMPP_STATUS_T2L			118
+#define	IB_MGMT_RMPP_STATUS_BAD_LEN		119
+#define	IB_MGMT_RMPP_STATUS_BAD_SEG		120
+#define	IB_MGMT_RMPP_STATUS_BADT		121
+#define	IB_MGMT_RMPP_STATUS_W2S			122
+#define	IB_MGMT_RMPP_STATUS_S2B			123
+#define	IB_MGMT_RMPP_STATUS_BAD_STATUS		124
+#define	IB_MGMT_RMPP_STATUS_UNV			125
+#define	IB_MGMT_RMPP_STATUS_TMR			126
+#define	IB_MGMT_RMPP_STATUS_UNSPEC		127
+#define	IB_MGMT_RMPP_STATUS_ABORT_MAX		127
+
+#define IB_QP0		0
+#define IB_QP1		cpu_to_be32(1)
+#define IB_QP1_QKEY	0x80010000
+#define IB_QP_SET_QKEY	0x80000000
+
+#define IB_DEFAULT_PKEY_PARTIAL 0x7FFF
+#define IB_DEFAULT_PKEY_FULL	0xFFFF
+
+enum {
+	IB_MGMT_MAD_HDR = 24,
+	IB_MGMT_MAD_DATA = 232,
+	IB_MGMT_RMPP_HDR = 36,
+	IB_MGMT_RMPP_DATA = 220,
+	IB_MGMT_VENDOR_HDR = 40,
+	IB_MGMT_VENDOR_DATA = 216,
+	IB_MGMT_SA_HDR = 56,
+	IB_MGMT_SA_DATA = 200,
+	IB_MGMT_DEVICE_HDR = 64,
+	IB_MGMT_DEVICE_DATA = 192,
+};
+
+struct ib_mad_hdr {
+	u8	base_version;
+	u8	mgmt_class;
+	u8	class_version;
+	u8	method;
+	__be16	status;
+	__be16	class_specific;
+	__be64	tid;
+	__be16	attr_id;
+	__be16	resv;
+	__be32	attr_mod;
+};
+
+struct ib_rmpp_hdr {
+	u8	rmpp_version;
+	u8	rmpp_type;
+	u8	rmpp_rtime_flags;
+	u8	rmpp_status;
+	__be32	seg_num;
+	__be32	paylen_newwin;
+};
+
+typedef u64 __bitwise ib_sa_comp_mask;
+
+#define IB_SA_COMP_MASK(n) ((__force ib_sa_comp_mask) cpu_to_be64(1ull << (n)))
+
+/*
+ * ib_sa_hdr and ib_sa_mad structures must be packed because they have
+ * 64-bit fields that are only 32-bit aligned. 64-bit architectures will
+ * lay them out wrong otherwise.  (And unfortunately they are sent on
+ * the wire so we can't change the layout)
+ */
+struct ib_sa_hdr {
+	__be64			sm_key;
+	__be16			attr_offset;
+	__be16			reserved;
+	ib_sa_comp_mask		comp_mask;
+} __attribute__ ((packed));
+
+struct ib_mad {
+	struct ib_mad_hdr	mad_hdr;
+	u8			data[IB_MGMT_MAD_DATA];
+};
+
+struct ib_rmpp_mad {
+	struct ib_mad_hdr	mad_hdr;
+	struct ib_rmpp_hdr	rmpp_hdr;
+	u8			data[IB_MGMT_RMPP_DATA];
+};
+
+struct ib_sa_mad {
+	struct ib_mad_hdr	mad_hdr;
+	struct ib_rmpp_hdr	rmpp_hdr;
+	struct ib_sa_hdr	sa_hdr;
+	u8			data[IB_MGMT_SA_DATA];
+} __attribute__ ((packed));
+
+struct ib_vendor_mad {
+	struct ib_mad_hdr	mad_hdr;
+	struct ib_rmpp_hdr	rmpp_hdr;
+	u8			reserved;
+	u8			oui[3];
+	u8			data[IB_MGMT_VENDOR_DATA];
+};
+
+struct ib_class_port_info {
+	u8			base_version;
+	u8			class_version;
+	__be16			capability_mask;
+	u8			reserved[3];
+	u8			resp_time_value;
+	u8			redirect_gid[16];
+	__be32			redirect_tcslfl;
+	__be16			redirect_lid;
+	__be16			redirect_pkey;
+	__be32			redirect_qp;
+	__be32			redirect_qkey;
+	u8			trap_gid[16];
+	__be32			trap_tcslfl;
+	__be16			trap_lid;
+	__be16			trap_pkey;
+	__be32			trap_hlqp;
+	__be32			trap_qkey;
+};
+
+/**
+ * ib_mad_send_buf - MAD data buffer and work request for sends.
+ * @next: A pointer used to chain together MADs for posting.
+ * @mad: References an allocated MAD data buffer for MADs that do not have
+ *   RMPP active.  For MADs using RMPP, references the common and management
+ *   class specific headers.
+ * @mad_agent: MAD agent that allocated the buffer.
+ * @ah: The address handle to use when sending the MAD.
+ * @context: User-controlled context fields.
+ * @hdr_len: Indicates the size of the data header of the MAD.  This length
+ *   includes the common MAD, RMPP, and class specific headers.
+ * @data_len: Indicates the total size of user-transferred data.
+ * @seg_count: The number of RMPP segments allocated for this send.
+ * @seg_size: Size of each RMPP segment.
+ * @timeout_ms: Time to wait for a response.
+ * @retries: Number of times to retry a request for a response.  For MADs
+ *   using RMPP, this applies per window.  On completion, returns the number
+ *   of retries needed to complete the transfer.
+ *
+ * Users are responsible for initializing the MAD buffer itself, with the
+ * exception of any RMPP header.  Additional segment buffer space allocated
+ * beyond data_len is padding.
+ */
+struct ib_mad_send_buf {
+	struct ib_mad_send_buf	*next;
+	void			*mad;
+	struct ib_mad_agent	*mad_agent;
+	struct ib_ah		*ah;
+	void			*context[2];
+	int			hdr_len;
+	int			data_len;
+	int			seg_count;
+	int			seg_size;
+	int			timeout_ms;
+	int			retries;
+};
+
+/**
+ * ib_response_mad - Returns if the specified MAD has been generated in
+ *   response to a sent request or trap.
+ */
+int ib_response_mad(struct ib_mad *mad);
+
+/**
+ * ib_get_rmpp_resptime - Returns the RMPP response time.
+ * @rmpp_hdr: An RMPP header.
+ */
+static inline u8 ib_get_rmpp_resptime(struct ib_rmpp_hdr *rmpp_hdr)
+{
+	return rmpp_hdr->rmpp_rtime_flags >> 3;
+}
+
+/**
+ * ib_get_rmpp_flags - Returns the RMPP flags.
+ * @rmpp_hdr: An RMPP header.
+ */
+static inline u8 ib_get_rmpp_flags(struct ib_rmpp_hdr *rmpp_hdr)
+{
+	return rmpp_hdr->rmpp_rtime_flags & 0x7;
+}
+
+/**
+ * ib_set_rmpp_resptime - Sets the response time in an RMPP header.
+ * @rmpp_hdr: An RMPP header.
+ * @rtime: The response time to set.
+ */
+static inline void ib_set_rmpp_resptime(struct ib_rmpp_hdr *rmpp_hdr, u8 rtime)
+{
+	rmpp_hdr->rmpp_rtime_flags = ib_get_rmpp_flags(rmpp_hdr) | (rtime << 3);
+}
+
+/**
+ * ib_set_rmpp_flags - Sets the flags in an RMPP header.
+ * @rmpp_hdr: An RMPP header.
+ * @flags: The flags to set.
+ */
+static inline void ib_set_rmpp_flags(struct ib_rmpp_hdr *rmpp_hdr, u8 flags)
+{
+	rmpp_hdr->rmpp_rtime_flags = (rmpp_hdr->rmpp_rtime_flags & 0xF8) |
+				     (flags & 0x7);
+}
+
+struct ib_mad_agent;
+struct ib_mad_send_wc;
+struct ib_mad_recv_wc;
+
+/**
+ * ib_mad_send_handler - callback handler for a sent MAD.
+ * @mad_agent: MAD agent that sent the MAD.
+ * @mad_send_wc: Send work completion information on the sent MAD.
+ */
+typedef void (*ib_mad_send_handler)(struct ib_mad_agent *mad_agent,
+				    struct ib_mad_send_wc *mad_send_wc);
+
+/**
+ * ib_mad_snoop_handler - Callback handler for snooping sent MADs.
+ * @mad_agent: MAD agent that snooped the MAD.
+ * @send_wr: Work request information on the sent MAD.
+ * @mad_send_wc: Work completion information on the sent MAD.  Valid
+ *   only for snooping that occurs on a send completion.
+ *
+ * Clients snooping MADs should not modify data referenced by the @send_wr
+ * or @mad_send_wc.
+ */
+typedef void (*ib_mad_snoop_handler)(struct ib_mad_agent *mad_agent,
+				     struct ib_mad_send_buf *send_buf,
+				     struct ib_mad_send_wc *mad_send_wc);
+
+/**
+ * ib_mad_recv_handler - callback handler for a received MAD.
+ * @mad_agent: MAD agent requesting the received MAD.
+ * @mad_recv_wc: Received work completion information on the received MAD.
+ *
+ * MADs received in response to a send request operation will be handed to
+ * the user before the send operation completes.  All data buffers given
+ * to registered agents through this routine are owned by the receiving
+ * client, except for snooping agents.  Clients snooping MADs should not
+ * modify the data referenced by @mad_recv_wc.
+ */
+typedef void (*ib_mad_recv_handler)(struct ib_mad_agent *mad_agent,
+				    struct ib_mad_recv_wc *mad_recv_wc);
+
+/**
+ * ib_mad_agent - Used to track MAD registration with the access layer.
+ * @device: Reference to device registration is on.
+ * @qp: Reference to QP used for sending and receiving MADs.
+ * @mr: Memory region for system memory usable for DMA.
+ * @recv_handler: Callback handler for a received MAD.
+ * @send_handler: Callback handler for a sent MAD.
+ * @snoop_handler: Callback handler for snooped sent MADs.
+ * @context: User-specified context associated with this registration.
+ * @hi_tid: Access layer assigned transaction ID for this client.
+ *   Unsolicited MADs sent by this client will have the upper 32-bits
+ *   of their TID set to this value.
+ * @flags: registration flags
+ * @port_num: Port number on which QP is registered
+ * @rmpp_version: If set, indicates the RMPP version used by this agent.
+ */
+enum {
+	IB_MAD_USER_RMPP = IB_USER_MAD_USER_RMPP,
+};
+struct ib_mad_agent {
+	struct ib_device	*device;
+	struct ib_qp		*qp;
+	struct ib_mr		*mr;
+	ib_mad_recv_handler	recv_handler;
+	ib_mad_send_handler	send_handler;
+	ib_mad_snoop_handler	snoop_handler;
+	void			*context;
+	u32			hi_tid;
+	u32			flags;
+	u8			port_num;
+	u8			rmpp_version;
+};
+
+/**
+ * ib_mad_send_wc - MAD send completion information.
+ * @send_buf: Send MAD data buffer associated with the send MAD request.
+ * @status: Completion status.
+ * @vendor_err: Optional vendor error information returned with a failed
+ *   request.
+ */
+struct ib_mad_send_wc {
+	struct ib_mad_send_buf	*send_buf;
+	enum ib_wc_status	status;
+	u32			vendor_err;
+};
+
+/**
+ * ib_mad_recv_buf - received MAD buffer information.
+ * @list: Reference to next data buffer for a received RMPP MAD.
+ * @grh: References a data buffer containing the global route header.
+ *   The data refereced by this buffer is only valid if the GRH is
+ *   valid.
+ * @mad: References the start of the received MAD.
+ */
+struct ib_mad_recv_buf {
+	struct list_head	list;
+	struct ib_grh		*grh;
+	struct ib_mad		*mad;
+};
+
+/**
+ * ib_mad_recv_wc - received MAD information.
+ * @wc: Completion information for the received data.
+ * @recv_buf: Specifies the location of the received data buffer(s).
+ * @rmpp_list: Specifies a list of RMPP reassembled received MAD buffers.
+ * @mad_len: The length of the received MAD, without duplicated headers.
+ *
+ * For received response, the wr_id contains a pointer to the ib_mad_send_buf
+ *   for the corresponding send request.
+ */
+struct ib_mad_recv_wc {
+	struct ib_wc		*wc;
+	struct ib_mad_recv_buf	recv_buf;
+	struct list_head	rmpp_list;
+	int			mad_len;
+};
+
+/**
+ * ib_mad_reg_req - MAD registration request
+ * @mgmt_class: Indicates which management class of MADs should be receive
+ *   by the caller.  This field is only required if the user wishes to
+ *   receive unsolicited MADs, otherwise it should be 0.
+ * @mgmt_class_version: Indicates which version of MADs for the given
+ *   management class to receive.
+ * @oui: Indicates IEEE OUI when mgmt_class is a vendor class
+ *   in the range from 0x30 to 0x4f. Otherwise not used.
+ * @method_mask: The caller will receive unsolicited MADs for any method
+ *   where @method_mask = 1.
+ *
+ */
+struct ib_mad_reg_req {
+	u8	mgmt_class;
+	u8	mgmt_class_version;
+	u8	oui[3];
+	DECLARE_BITMAP(method_mask, IB_MGMT_MAX_METHODS);
+};
+
+/**
+ * ib_register_mad_agent - Register to send/receive MADs.
+ * @device: The device to register with.
+ * @port_num: The port on the specified device to use.
+ * @qp_type: Specifies which QP to access.  Must be either
+ *   IB_QPT_SMI or IB_QPT_GSI.
+ * @mad_reg_req: Specifies which unsolicited MADs should be received
+ *   by the caller.  This parameter may be NULL if the caller only
+ *   wishes to receive solicited responses.
+ * @rmpp_version: If set, indicates that the client will send
+ *   and receive MADs that contain the RMPP header for the given version.
+ *   If set to 0, indicates that RMPP is not used by this client.
+ * @send_handler: The completion callback routine invoked after a send
+ *   request has completed.
+ * @recv_handler: The completion callback routine invoked for a received
+ *   MAD.
+ * @context: User specified context associated with the registration.
+ * @registration_flags: Registration flags to set for this agent
+ */
+struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
+					   u8 port_num,
+					   enum ib_qp_type qp_type,
+					   struct ib_mad_reg_req *mad_reg_req,
+					   u8 rmpp_version,
+					   ib_mad_send_handler send_handler,
+					   ib_mad_recv_handler recv_handler,
+					   void *context,
+					   u32 registration_flags);
+
+enum ib_mad_snoop_flags {
+	/*IB_MAD_SNOOP_POSTED_SENDS	   = 1,*/
+	/*IB_MAD_SNOOP_RMPP_SENDS	   = (1<<1),*/
+	IB_MAD_SNOOP_SEND_COMPLETIONS	   = (1<<2),
+	/*IB_MAD_SNOOP_RMPP_SEND_COMPLETIONS = (1<<3),*/
+	IB_MAD_SNOOP_RECVS		   = (1<<4)
+	/*IB_MAD_SNOOP_RMPP_RECVS	   = (1<<5),*/
+	/*IB_MAD_SNOOP_REDIRECTED_QPS	   = (1<<6)*/
+};
+
+/**
+ * ib_register_mad_snoop - Register to snoop sent and received MADs.
+ * @device: The device to register with.
+ * @port_num: The port on the specified device to use.
+ * @qp_type: Specifies which QP traffic to snoop.  Must be either
+ *   IB_QPT_SMI or IB_QPT_GSI.
+ * @mad_snoop_flags: Specifies information where snooping occurs.
+ * @send_handler: The callback routine invoked for a snooped send.
+ * @recv_handler: The callback routine invoked for a snooped receive.
+ * @context: User specified context associated with the registration.
+ */
+struct ib_mad_agent *ib_register_mad_snoop(struct ib_device *device,
+					   u8 port_num,
+					   enum ib_qp_type qp_type,
+					   int mad_snoop_flags,
+					   ib_mad_snoop_handler snoop_handler,
+					   ib_mad_recv_handler recv_handler,
+					   void *context);
+
+/**
+ * ib_unregister_mad_agent - Unregisters a client from using MAD services.
+ * @mad_agent: Corresponding MAD registration request to deregister.
+ *
+ * After invoking this routine, MAD services are no longer usable by the
+ * client on the associated QP.
+ */
+int ib_unregister_mad_agent(struct ib_mad_agent *mad_agent);
+
+/**
+ * ib_post_send_mad - Posts MAD(s) to the send queue of the QP associated
+ *   with the registered client.
+ * @send_buf: Specifies the information needed to send the MAD(s).
+ * @bad_send_buf: Specifies the MAD on which an error was encountered.  This
+ *   parameter is optional if only a single MAD is posted.
+ *
+ * Sent MADs are not guaranteed to complete in the order that they were posted.
+ *
+ * If the MAD requires RMPP, the data buffer should contain a single copy
+ * of the common MAD, RMPP, and class specific headers, followed by the class
+ * defined data.  If the class defined data would not divide evenly into
+ * RMPP segments, then space must be allocated at the end of the referenced
+ * buffer for any required padding.  To indicate the amount of class defined
+ * data being transferred, the paylen_newwin field in the RMPP header should
+ * be set to the size of the class specific header plus the amount of class
+ * defined data being transferred.  The paylen_newwin field should be
+ * specified in network-byte order.
+ */
+int ib_post_send_mad(struct ib_mad_send_buf *send_buf,
+		     struct ib_mad_send_buf **bad_send_buf);
+
+
+/**
+ * ib_free_recv_mad - Returns data buffers used to receive a MAD.
+ * @mad_recv_wc: Work completion information for a received MAD.
+ *
+ * Clients receiving MADs through their ib_mad_recv_handler must call this
+ * routine to return the work completion buffers to the access layer.
+ */
+void ib_free_recv_mad(struct ib_mad_recv_wc *mad_recv_wc);
+
+/**
+ * ib_cancel_mad - Cancels an outstanding send MAD operation.
+ * @mad_agent: Specifies the registration associated with sent MAD.
+ * @send_buf: Indicates the MAD to cancel.
+ *
+ * MADs will be returned to the user through the corresponding
+ * ib_mad_send_handler.
+ */
+void ib_cancel_mad(struct ib_mad_agent *mad_agent,
+		   struct ib_mad_send_buf *send_buf);
+
+/**
+ * ib_modify_mad - Modifies an outstanding send MAD operation.
+ * @mad_agent: Specifies the registration associated with sent MAD.
+ * @send_buf: Indicates the MAD to modify.
+ * @timeout_ms: New timeout value for sent MAD.
+ *
+ * This call will reset the timeout value for a sent MAD to the specified
+ * value.
+ */
+int ib_modify_mad(struct ib_mad_agent *mad_agent,
+		  struct ib_mad_send_buf *send_buf, u32 timeout_ms);
+
+/**
+ * ib_redirect_mad_qp - Registers a QP for MAD services.
+ * @qp: Reference to a QP that requires MAD services.
+ * @rmpp_version: If set, indicates that the client will send
+ *   and receive MADs that contain the RMPP header for the given version.
+ *   If set to 0, indicates that RMPP is not used by this client.
+ * @send_handler: The completion callback routine invoked after a send
+ *   request has completed.
+ * @recv_handler: The completion callback routine invoked for a received
+ *   MAD.
+ * @context: User specified context associated with the registration.
+ *
+ * Use of this call allows clients to use MAD services, such as RMPP,
+ * on user-owned QPs.  After calling this routine, users may send
+ * MADs on the specified QP by calling ib_mad_post_send.
+ */
+struct ib_mad_agent *ib_redirect_mad_qp(struct ib_qp *qp,
+					u8 rmpp_version,
+					ib_mad_send_handler send_handler,
+					ib_mad_recv_handler recv_handler,
+					void *context);
+
+/**
+ * ib_process_mad_wc - Processes a work completion associated with a
+ *   MAD sent or received on a redirected QP.
+ * @mad_agent: Specifies the registered MAD service using the redirected QP.
+ * @wc: References a work completion associated with a sent or received
+ *   MAD segment.
+ *
+ * This routine is used to complete or continue processing on a MAD request.
+ * If the work completion is associated with a send operation, calling
+ * this routine is required to continue an RMPP transfer or to wait for a
+ * corresponding response, if it is a request.  If the work completion is
+ * associated with a receive operation, calling this routine is required to
+ * process an inbound or outbound RMPP transfer, or to match a response MAD
+ * with its corresponding request.
+ */
+int ib_process_mad_wc(struct ib_mad_agent *mad_agent,
+		      struct ib_wc *wc);
+
+/**
+ * ib_create_send_mad - Allocate and initialize a data buffer and work request
+ *   for sending a MAD.
+ * @mad_agent: Specifies the registered MAD service to associate with the MAD.
+ * @remote_qpn: Specifies the QPN of the receiving node.
+ * @pkey_index: Specifies which PKey the MAD will be sent using.  This field
+ *   is valid only if the remote_qpn is QP 1.
+ * @rmpp_active: Indicates if the send will enable RMPP.
+ * @hdr_len: Indicates the size of the data header of the MAD.  This length
+ *   should include the common MAD header, RMPP header, plus any class
+ *   specific header.
+ * @data_len: Indicates the size of any user-transferred data.  The call will
+ *   automatically adjust the allocated buffer size to account for any
+ *   additional padding that may be necessary.
+ * @gfp_mask: GFP mask used for the memory allocation.
+ *
+ * This routine allocates a MAD for sending.  The returned MAD send buffer
+ * will reference a data buffer usable for sending a MAD, along
+ * with an initialized work request structure.  Users may modify the returned
+ * MAD data buffer before posting the send.
+ *
+ * The returned MAD header, class specific headers, and any padding will be
+ * cleared.  Users are responsible for initializing the common MAD header,
+ * any class specific header, and MAD data area.
+ * If @rmpp_active is set, the RMPP header will be initialized for sending.
+ */
+struct ib_mad_send_buf *ib_create_send_mad(struct ib_mad_agent *mad_agent,
+					   u32 remote_qpn, u16 pkey_index,
+					   int rmpp_active,
+					   int hdr_len, int data_len,
+					   gfp_t gfp_mask);
+
+/**
+ * ib_is_mad_class_rmpp - returns whether given management class
+ * supports RMPP.
+ * @mgmt_class: management class
+ *
+ * This routine returns whether the management class supports RMPP.
+ */
+int ib_is_mad_class_rmpp(u8 mgmt_class);
+
+/**
+ * ib_get_mad_data_offset - returns the data offset for a given
+ * management class.
+ * @mgmt_class: management class
+ *
+ * This routine returns the data offset in the MAD for the management
+ * class requested.
+ */
+int ib_get_mad_data_offset(u8 mgmt_class);
+
+/**
+ * ib_get_rmpp_segment - returns the data buffer for a given RMPP segment.
+ * @send_buf: Previously allocated send data buffer.
+ * @seg_num: number of segment to return
+ *
+ * This routine returns a pointer to the data buffer of an RMPP MAD.
+ * Users must provide synchronization to @send_buf around this call.
+ */
+void *ib_get_rmpp_segment(struct ib_mad_send_buf *send_buf, int seg_num);
+
+/**
+ * ib_free_send_mad - Returns data buffers used to send a MAD.
+ * @send_buf: Previously allocated send data buffer.
+ */
+void ib_free_send_mad(struct ib_mad_send_buf *send_buf);
+
+/**
+ * ib_mad_kernel_rmpp_agent - Returns if the agent is performing RMPP.
+ * @agent: the agent in question
+ * @return: true if agent is performing rmpp, false otherwise.
+ */
+int ib_mad_kernel_rmpp_agent(struct ib_mad_agent *agent);
+
+#endif /* IB_MAD_H */
diff --git a/include/rdma/ib_marshall.h b/include/rdma/ib_marshall.h
new file mode 100644
index 0000000..db03720
--- /dev/null
+++ b/include/rdma/ib_marshall.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2005-2006 Intel Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if !defined(IB_USER_MARSHALL_H)
+#define IB_USER_MARSHALL_H
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_sa.h>
+#include <rdma/ib_user_verbs.h>
+#include <rdma/ib_user_sa.h>
+
+void ib_copy_qp_attr_to_user(struct ib_uverbs_qp_attr *dst,
+			     struct ib_qp_attr *src);
+
+void ib_copy_ah_attr_to_user(struct ib_uverbs_ah_attr *dst,
+			     struct ib_ah_attr *src);
+
+void ib_copy_path_rec_to_user(struct ib_user_path_rec *dst,
+			      struct ib_sa_path_rec *src);
+
+void ib_copy_path_rec_from_user(struct ib_sa_path_rec *dst,
+				struct ib_user_path_rec *src);
+
+#endif /* IB_USER_MARSHALL_H */
diff --git a/include/rdma/ib_pack.h b/include/rdma/ib_pack.h
new file mode 100644
index 0000000..b1f7592
--- /dev/null
+++ b/include/rdma/ib_pack.h
@@ -0,0 +1,268 @@
+/*
+ * Copyright (c) 2004 Topspin Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef IB_PACK_H
+#define IB_PACK_H
+
+#include <rdma/ib_verbs.h>
+#include <uapi/linux/if_ether.h>
+
+enum {
+	IB_LRH_BYTES  = 8,
+	IB_ETH_BYTES  = 14,
+	IB_VLAN_BYTES = 4,
+	IB_GRH_BYTES  = 40,
+	IB_BTH_BYTES  = 12,
+	IB_DETH_BYTES = 8
+};
+
+struct ib_field {
+	size_t struct_offset_bytes;
+	size_t struct_size_bytes;
+	int    offset_words;
+	int    offset_bits;
+	int    size_bits;
+	char  *field_name;
+};
+
+#define RESERVED \
+	.field_name          = "reserved"
+
+/*
+ * This macro cleans up the definitions of constants for BTH opcodes.
+ * It is used to define constants such as IB_OPCODE_UD_SEND_ONLY,
+ * which becomes IB_OPCODE_UD + IB_OPCODE_SEND_ONLY, and this gives
+ * the correct value.
+ *
+ * In short, user code should use the constants defined using the
+ * macro rather than worrying about adding together other constants.
+*/
+#define IB_OPCODE(transport, op) \
+	IB_OPCODE_ ## transport ## _ ## op = \
+		IB_OPCODE_ ## transport + IB_OPCODE_ ## op
+
+enum {
+	/* transport types -- just used to define real constants */
+	IB_OPCODE_RC                                = 0x00,
+	IB_OPCODE_UC                                = 0x20,
+	IB_OPCODE_RD                                = 0x40,
+	IB_OPCODE_UD                                = 0x60,
+
+	/* operations -- just used to define real constants */
+	IB_OPCODE_SEND_FIRST                        = 0x00,
+	IB_OPCODE_SEND_MIDDLE                       = 0x01,
+	IB_OPCODE_SEND_LAST                         = 0x02,
+	IB_OPCODE_SEND_LAST_WITH_IMMEDIATE          = 0x03,
+	IB_OPCODE_SEND_ONLY                         = 0x04,
+	IB_OPCODE_SEND_ONLY_WITH_IMMEDIATE          = 0x05,
+	IB_OPCODE_RDMA_WRITE_FIRST                  = 0x06,
+	IB_OPCODE_RDMA_WRITE_MIDDLE                 = 0x07,
+	IB_OPCODE_RDMA_WRITE_LAST                   = 0x08,
+	IB_OPCODE_RDMA_WRITE_LAST_WITH_IMMEDIATE    = 0x09,
+	IB_OPCODE_RDMA_WRITE_ONLY                   = 0x0a,
+	IB_OPCODE_RDMA_WRITE_ONLY_WITH_IMMEDIATE    = 0x0b,
+	IB_OPCODE_RDMA_READ_REQUEST                 = 0x0c,
+	IB_OPCODE_RDMA_READ_RESPONSE_FIRST          = 0x0d,
+	IB_OPCODE_RDMA_READ_RESPONSE_MIDDLE         = 0x0e,
+	IB_OPCODE_RDMA_READ_RESPONSE_LAST           = 0x0f,
+	IB_OPCODE_RDMA_READ_RESPONSE_ONLY           = 0x10,
+	IB_OPCODE_ACKNOWLEDGE                       = 0x11,
+	IB_OPCODE_ATOMIC_ACKNOWLEDGE                = 0x12,
+	IB_OPCODE_COMPARE_SWAP                      = 0x13,
+	IB_OPCODE_FETCH_ADD                         = 0x14,
+
+	/* real constants follow -- see comment about above IB_OPCODE()
+	   macro for more details */
+
+	/* RC */
+	IB_OPCODE(RC, SEND_FIRST),
+	IB_OPCODE(RC, SEND_MIDDLE),
+	IB_OPCODE(RC, SEND_LAST),
+	IB_OPCODE(RC, SEND_LAST_WITH_IMMEDIATE),
+	IB_OPCODE(RC, SEND_ONLY),
+	IB_OPCODE(RC, SEND_ONLY_WITH_IMMEDIATE),
+	IB_OPCODE(RC, RDMA_WRITE_FIRST),
+	IB_OPCODE(RC, RDMA_WRITE_MIDDLE),
+	IB_OPCODE(RC, RDMA_WRITE_LAST),
+	IB_OPCODE(RC, RDMA_WRITE_LAST_WITH_IMMEDIATE),
+	IB_OPCODE(RC, RDMA_WRITE_ONLY),
+	IB_OPCODE(RC, RDMA_WRITE_ONLY_WITH_IMMEDIATE),
+	IB_OPCODE(RC, RDMA_READ_REQUEST),
+	IB_OPCODE(RC, RDMA_READ_RESPONSE_FIRST),
+	IB_OPCODE(RC, RDMA_READ_RESPONSE_MIDDLE),
+	IB_OPCODE(RC, RDMA_READ_RESPONSE_LAST),
+	IB_OPCODE(RC, RDMA_READ_RESPONSE_ONLY),
+	IB_OPCODE(RC, ACKNOWLEDGE),
+	IB_OPCODE(RC, ATOMIC_ACKNOWLEDGE),
+	IB_OPCODE(RC, COMPARE_SWAP),
+	IB_OPCODE(RC, FETCH_ADD),
+
+	/* UC */
+	IB_OPCODE(UC, SEND_FIRST),
+	IB_OPCODE(UC, SEND_MIDDLE),
+	IB_OPCODE(UC, SEND_LAST),
+	IB_OPCODE(UC, SEND_LAST_WITH_IMMEDIATE),
+	IB_OPCODE(UC, SEND_ONLY),
+	IB_OPCODE(UC, SEND_ONLY_WITH_IMMEDIATE),
+	IB_OPCODE(UC, RDMA_WRITE_FIRST),
+	IB_OPCODE(UC, RDMA_WRITE_MIDDLE),
+	IB_OPCODE(UC, RDMA_WRITE_LAST),
+	IB_OPCODE(UC, RDMA_WRITE_LAST_WITH_IMMEDIATE),
+	IB_OPCODE(UC, RDMA_WRITE_ONLY),
+	IB_OPCODE(UC, RDMA_WRITE_ONLY_WITH_IMMEDIATE),
+
+	/* RD */
+	IB_OPCODE(RD, SEND_FIRST),
+	IB_OPCODE(RD, SEND_MIDDLE),
+	IB_OPCODE(RD, SEND_LAST),
+	IB_OPCODE(RD, SEND_LAST_WITH_IMMEDIATE),
+	IB_OPCODE(RD, SEND_ONLY),
+	IB_OPCODE(RD, SEND_ONLY_WITH_IMMEDIATE),
+	IB_OPCODE(RD, RDMA_WRITE_FIRST),
+	IB_OPCODE(RD, RDMA_WRITE_MIDDLE),
+	IB_OPCODE(RD, RDMA_WRITE_LAST),
+	IB_OPCODE(RD, RDMA_WRITE_LAST_WITH_IMMEDIATE),
+	IB_OPCODE(RD, RDMA_WRITE_ONLY),
+	IB_OPCODE(RD, RDMA_WRITE_ONLY_WITH_IMMEDIATE),
+	IB_OPCODE(RD, RDMA_READ_REQUEST),
+	IB_OPCODE(RD, RDMA_READ_RESPONSE_FIRST),
+	IB_OPCODE(RD, RDMA_READ_RESPONSE_MIDDLE),
+	IB_OPCODE(RD, RDMA_READ_RESPONSE_LAST),
+	IB_OPCODE(RD, RDMA_READ_RESPONSE_ONLY),
+	IB_OPCODE(RD, ACKNOWLEDGE),
+	IB_OPCODE(RD, ATOMIC_ACKNOWLEDGE),
+	IB_OPCODE(RD, COMPARE_SWAP),
+	IB_OPCODE(RD, FETCH_ADD),
+
+	/* UD */
+	IB_OPCODE(UD, SEND_ONLY),
+	IB_OPCODE(UD, SEND_ONLY_WITH_IMMEDIATE)
+};
+
+enum {
+	IB_LNH_RAW        = 0,
+	IB_LNH_IP         = 1,
+	IB_LNH_IBA_LOCAL  = 2,
+	IB_LNH_IBA_GLOBAL = 3
+};
+
+struct ib_unpacked_lrh {
+	u8        virtual_lane;
+	u8        link_version;
+	u8        service_level;
+	u8        link_next_header;
+	__be16    destination_lid;
+	__be16    packet_length;
+	__be16    source_lid;
+};
+
+struct ib_unpacked_grh {
+	u8    	     ip_version;
+	u8    	     traffic_class;
+	__be32 	     flow_label;
+	__be16       payload_length;
+	u8    	     next_header;
+	u8    	     hop_limit;
+	union ib_gid source_gid;
+	union ib_gid destination_gid;
+};
+
+struct ib_unpacked_bth {
+	u8           opcode;
+	u8           solicited_event;
+	u8           mig_req;
+	u8           pad_count;
+	u8           transport_header_version;
+	__be16       pkey;
+	__be32       destination_qpn;
+	u8           ack_req;
+	__be32       psn;
+};
+
+struct ib_unpacked_deth {
+	__be32       qkey;
+	__be32       source_qpn;
+};
+
+struct ib_unpacked_eth {
+	u8	dmac_h[4];
+	u8	dmac_l[2];
+	u8	smac_h[2];
+	u8	smac_l[4];
+	__be16	type;
+};
+
+struct ib_unpacked_vlan {
+	__be16  tag;
+	__be16  type;
+};
+
+struct ib_ud_header {
+	int                     lrh_present;
+	struct ib_unpacked_lrh  lrh;
+	int			eth_present;
+	struct ib_unpacked_eth	eth;
+	int                     vlan_present;
+	struct ib_unpacked_vlan vlan;
+	int			grh_present;
+	struct ib_unpacked_grh	grh;
+	struct ib_unpacked_bth	bth;
+	struct ib_unpacked_deth deth;
+	int			immediate_present;
+	__be32			immediate_data;
+};
+
+void ib_pack(const struct ib_field        *desc,
+	     int                           desc_len,
+	     void                         *structure,
+	     void                         *buf);
+
+void ib_unpack(const struct ib_field        *desc,
+	       int                           desc_len,
+	       void                         *buf,
+	       void                         *structure);
+
+void ib_ud_header_init(int		    payload_bytes,
+		       int		    lrh_present,
+		       int		    eth_present,
+		       int		    vlan_present,
+		       int		    grh_present,
+		       int		    immediate_present,
+		       struct ib_ud_header *header);
+
+int ib_ud_header_pack(struct ib_ud_header *header,
+		      void                *buf);
+
+int ib_ud_header_unpack(void                *buf,
+			struct ib_ud_header *header);
+
+#endif /* IB_PACK_H */
diff --git a/include/rdma/ib_pma.h b/include/rdma/ib_pma.h
new file mode 100644
index 0000000..a5889f1
--- /dev/null
+++ b/include/rdma/ib_pma.h
@@ -0,0 +1,156 @@
+/*
+ * Copyright (c) 2006, 2007, 2008, 2009, 2010 QLogic Corporation.
+ * All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if !defined(IB_PMA_H)
+#define IB_PMA_H
+
+#include <rdma/ib_mad.h>
+
+/*
+ * PMA class portinfo capability mask bits
+ */
+#define IB_PMA_CLASS_CAP_ALLPORTSELECT  cpu_to_be16(1 << 8)
+#define IB_PMA_CLASS_CAP_EXT_WIDTH      cpu_to_be16(1 << 9)
+#define IB_PMA_CLASS_CAP_XMIT_WAIT      cpu_to_be16(1 << 12)
+
+#define IB_PMA_CLASS_PORT_INFO          cpu_to_be16(0x0001)
+#define IB_PMA_PORT_SAMPLES_CONTROL     cpu_to_be16(0x0010)
+#define IB_PMA_PORT_SAMPLES_RESULT      cpu_to_be16(0x0011)
+#define IB_PMA_PORT_COUNTERS            cpu_to_be16(0x0012)
+#define IB_PMA_PORT_COUNTERS_EXT        cpu_to_be16(0x001D)
+#define IB_PMA_PORT_SAMPLES_RESULT_EXT  cpu_to_be16(0x001E)
+
+struct ib_pma_mad {
+	struct ib_mad_hdr mad_hdr;
+	u8 reserved[40];
+	u8 data[192];
+} __packed;
+
+struct ib_pma_portsamplescontrol {
+	u8 opcode;
+	u8 port_select;
+	u8 tick;
+	u8 counter_width;		/* resv: 7:3, counter width: 2:0 */
+	__be32 counter_mask0_9;		/* 2, 10 3-bit fields */
+	__be16 counter_mask10_14;	/* 1, 5 3-bit fields */
+	u8 sample_mechanisms;
+	u8 sample_status;		/* only lower 2 bits */
+	__be64 option_mask;
+	__be64 vendor_mask;
+	__be32 sample_start;
+	__be32 sample_interval;
+	__be16 tag;
+	__be16 counter_select[15];
+	__be32 reserved1;
+	__be64 samples_only_option_mask;
+	__be32 reserved2[28];
+};
+
+struct ib_pma_portsamplesresult {
+	__be16 tag;
+	__be16 sample_status;   /* only lower 2 bits */
+	__be32 counter[15];
+};
+
+struct ib_pma_portsamplesresult_ext {
+	__be16 tag;
+	__be16 sample_status;   /* only lower 2 bits */
+	__be32 extended_width;  /* only upper 2 bits */
+	__be64 counter[15];
+};
+
+struct ib_pma_portcounters {
+	u8 reserved;
+	u8 port_select;
+	__be16 counter_select;
+	__be16 symbol_error_counter;
+	u8 link_error_recovery_counter;
+	u8 link_downed_counter;
+	__be16 port_rcv_errors;
+	__be16 port_rcv_remphys_errors;
+	__be16 port_rcv_switch_relay_errors;
+	__be16 port_xmit_discards;
+	u8 port_xmit_constraint_errors;
+	u8 port_rcv_constraint_errors;
+	u8 reserved1;
+	u8 link_overrun_errors; /* LocalLink: 7:4, BufferOverrun: 3:0 */
+	__be16 reserved2;
+	__be16 vl15_dropped;
+	__be32 port_xmit_data;
+	__be32 port_rcv_data;
+	__be32 port_xmit_packets;
+	__be32 port_rcv_packets;
+	__be32 port_xmit_wait;
+} __packed;
+
+
+#define IB_PMA_SEL_SYMBOL_ERROR                 cpu_to_be16(0x0001)
+#define IB_PMA_SEL_LINK_ERROR_RECOVERY          cpu_to_be16(0x0002)
+#define IB_PMA_SEL_LINK_DOWNED                  cpu_to_be16(0x0004)
+#define IB_PMA_SEL_PORT_RCV_ERRORS              cpu_to_be16(0x0008)
+#define IB_PMA_SEL_PORT_RCV_REMPHYS_ERRORS      cpu_to_be16(0x0010)
+#define IB_PMA_SEL_PORT_XMIT_DISCARDS           cpu_to_be16(0x0040)
+#define IB_PMA_SEL_LOCAL_LINK_INTEGRITY_ERRORS  cpu_to_be16(0x0200)
+#define IB_PMA_SEL_EXCESSIVE_BUFFER_OVERRUNS    cpu_to_be16(0x0400)
+#define IB_PMA_SEL_PORT_VL15_DROPPED            cpu_to_be16(0x0800)
+#define IB_PMA_SEL_PORT_XMIT_DATA               cpu_to_be16(0x1000)
+#define IB_PMA_SEL_PORT_RCV_DATA                cpu_to_be16(0x2000)
+#define IB_PMA_SEL_PORT_XMIT_PACKETS            cpu_to_be16(0x4000)
+#define IB_PMA_SEL_PORT_RCV_PACKETS             cpu_to_be16(0x8000)
+
+struct ib_pma_portcounters_ext {
+	u8 reserved;
+	u8 port_select;
+	__be16 counter_select;
+	__be32 reserved1;
+	__be64 port_xmit_data;
+	__be64 port_rcv_data;
+	__be64 port_xmit_packets;
+	__be64 port_rcv_packets;
+	__be64 port_unicast_xmit_packets;
+	__be64 port_unicast_rcv_packets;
+	__be64 port_multicast_xmit_packets;
+	__be64 port_multicast_rcv_packets;
+} __packed;
+
+#define IB_PMA_SELX_PORT_XMIT_DATA              cpu_to_be16(0x0001)
+#define IB_PMA_SELX_PORT_RCV_DATA               cpu_to_be16(0x0002)
+#define IB_PMA_SELX_PORT_XMIT_PACKETS           cpu_to_be16(0x0004)
+#define IB_PMA_SELX_PORT_RCV_PACKETS            cpu_to_be16(0x0008)
+#define IB_PMA_SELX_PORT_UNI_XMIT_PACKETS       cpu_to_be16(0x0010)
+#define IB_PMA_SELX_PORT_UNI_RCV_PACKETS        cpu_to_be16(0x0020)
+#define IB_PMA_SELX_PORT_MULTI_XMIT_PACKETS     cpu_to_be16(0x0040)
+#define IB_PMA_SELX_PORT_MULTI_RCV_PACKETS      cpu_to_be16(0x0080)
+
+#endif /* IB_PMA_H */
diff --git a/include/rdma/ib_sa.h b/include/rdma/ib_sa.h
new file mode 100644
index 0000000..7e071a6
--- /dev/null
+++ b/include/rdma/ib_sa.h
@@ -0,0 +1,431 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Voltaire, Inc.  All rights reserved.
+ * Copyright (c) 2006 Intel Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef IB_SA_H
+#define IB_SA_H
+
+#include <linux/completion.h>
+#include <linux/compiler.h>
+
+#include <linux/atomic.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_mad.h>
+
+enum {
+	IB_SA_CLASS_VERSION		= 2,	/* IB spec version 1.1/1.2 */
+
+	IB_SA_METHOD_GET_TABLE		= 0x12,
+	IB_SA_METHOD_GET_TABLE_RESP	= 0x92,
+	IB_SA_METHOD_DELETE		= 0x15,
+	IB_SA_METHOD_DELETE_RESP	= 0x95,
+	IB_SA_METHOD_GET_MULTI		= 0x14,
+	IB_SA_METHOD_GET_MULTI_RESP	= 0x94,
+	IB_SA_METHOD_GET_TRACE_TBL	= 0x13
+};
+
+enum {
+	IB_SA_ATTR_CLASS_PORTINFO    = 0x01,
+	IB_SA_ATTR_NOTICE	     = 0x02,
+	IB_SA_ATTR_INFORM_INFO	     = 0x03,
+	IB_SA_ATTR_NODE_REC	     = 0x11,
+	IB_SA_ATTR_PORT_INFO_REC     = 0x12,
+	IB_SA_ATTR_SL2VL_REC	     = 0x13,
+	IB_SA_ATTR_SWITCH_REC	     = 0x14,
+	IB_SA_ATTR_LINEAR_FDB_REC    = 0x15,
+	IB_SA_ATTR_RANDOM_FDB_REC    = 0x16,
+	IB_SA_ATTR_MCAST_FDB_REC     = 0x17,
+	IB_SA_ATTR_SM_INFO_REC	     = 0x18,
+	IB_SA_ATTR_LINK_REC	     = 0x20,
+	IB_SA_ATTR_GUID_INFO_REC     = 0x30,
+	IB_SA_ATTR_SERVICE_REC	     = 0x31,
+	IB_SA_ATTR_PARTITION_REC     = 0x33,
+	IB_SA_ATTR_PATH_REC	     = 0x35,
+	IB_SA_ATTR_VL_ARB_REC	     = 0x36,
+	IB_SA_ATTR_MC_MEMBER_REC     = 0x38,
+	IB_SA_ATTR_TRACE_REC	     = 0x39,
+	IB_SA_ATTR_MULTI_PATH_REC    = 0x3a,
+	IB_SA_ATTR_SERVICE_ASSOC_REC = 0x3b,
+	IB_SA_ATTR_INFORM_INFO_REC   = 0xf3
+};
+
+enum ib_sa_selector {
+	IB_SA_GT   = 0,
+	IB_SA_LT   = 1,
+	IB_SA_EQ   = 2,
+	/*
+	 * The meaning of "best" depends on the attribute: for
+	 * example, for MTU best will return the largest available
+	 * MTU, while for packet life time, best will return the
+	 * smallest available life time.
+	 */
+	IB_SA_BEST = 3
+};
+
+/*
+ * Structures for SA records are named "struct ib_sa_xxx_rec."  No
+ * attempt is made to pack structures to match the physical layout of
+ * SA records in SA MADs; all packing and unpacking is handled by the
+ * SA query code.
+ *
+ * For a record with structure ib_sa_xxx_rec, the naming convention
+ * for the component mask value for field yyy is IB_SA_XXX_REC_YYY (we
+ * never use different abbreviations or otherwise change the spelling
+ * of xxx/yyy between ib_sa_xxx_rec.yyy and IB_SA_XXX_REC_YYY).
+ *
+ * Reserved rows are indicated with comments to help maintainability.
+ */
+
+#define IB_SA_PATH_REC_SERVICE_ID		       (IB_SA_COMP_MASK( 0) |\
+							IB_SA_COMP_MASK( 1))
+#define IB_SA_PATH_REC_DGID				IB_SA_COMP_MASK( 2)
+#define IB_SA_PATH_REC_SGID				IB_SA_COMP_MASK( 3)
+#define IB_SA_PATH_REC_DLID				IB_SA_COMP_MASK( 4)
+#define IB_SA_PATH_REC_SLID				IB_SA_COMP_MASK( 5)
+#define IB_SA_PATH_REC_RAW_TRAFFIC			IB_SA_COMP_MASK( 6)
+/* reserved:								 7 */
+#define IB_SA_PATH_REC_FLOW_LABEL       		IB_SA_COMP_MASK( 8)
+#define IB_SA_PATH_REC_HOP_LIMIT			IB_SA_COMP_MASK( 9)
+#define IB_SA_PATH_REC_TRAFFIC_CLASS			IB_SA_COMP_MASK(10)
+#define IB_SA_PATH_REC_REVERSIBLE			IB_SA_COMP_MASK(11)
+#define IB_SA_PATH_REC_NUMB_PATH			IB_SA_COMP_MASK(12)
+#define IB_SA_PATH_REC_PKEY				IB_SA_COMP_MASK(13)
+#define IB_SA_PATH_REC_QOS_CLASS			IB_SA_COMP_MASK(14)
+#define IB_SA_PATH_REC_SL				IB_SA_COMP_MASK(15)
+#define IB_SA_PATH_REC_MTU_SELECTOR			IB_SA_COMP_MASK(16)
+#define IB_SA_PATH_REC_MTU				IB_SA_COMP_MASK(17)
+#define IB_SA_PATH_REC_RATE_SELECTOR			IB_SA_COMP_MASK(18)
+#define IB_SA_PATH_REC_RATE				IB_SA_COMP_MASK(19)
+#define IB_SA_PATH_REC_PACKET_LIFE_TIME_SELECTOR	IB_SA_COMP_MASK(20)
+#define IB_SA_PATH_REC_PACKET_LIFE_TIME			IB_SA_COMP_MASK(21)
+#define IB_SA_PATH_REC_PREFERENCE			IB_SA_COMP_MASK(22)
+
+struct ib_sa_path_rec {
+	__be64       service_id;
+	union ib_gid dgid;
+	union ib_gid sgid;
+	__be16       dlid;
+	__be16       slid;
+	int          raw_traffic;
+	/* reserved */
+	__be32       flow_label;
+	u8           hop_limit;
+	u8           traffic_class;
+	int          reversible;
+	u8           numb_path;
+	__be16       pkey;
+	__be16       qos_class;
+	u8           sl;
+	u8           mtu_selector;
+	u8           mtu;
+	u8           rate_selector;
+	u8           rate;
+	u8           packet_life_time_selector;
+	u8           packet_life_time;
+	u8           preference;
+	u8           smac[ETH_ALEN];
+	u8           dmac[ETH_ALEN];
+	u16	     vlan_id;
+};
+
+#define IB_SA_MCMEMBER_REC_MGID				IB_SA_COMP_MASK( 0)
+#define IB_SA_MCMEMBER_REC_PORT_GID			IB_SA_COMP_MASK( 1)
+#define IB_SA_MCMEMBER_REC_QKEY				IB_SA_COMP_MASK( 2)
+#define IB_SA_MCMEMBER_REC_MLID				IB_SA_COMP_MASK( 3)
+#define IB_SA_MCMEMBER_REC_MTU_SELECTOR			IB_SA_COMP_MASK( 4)
+#define IB_SA_MCMEMBER_REC_MTU				IB_SA_COMP_MASK( 5)
+#define IB_SA_MCMEMBER_REC_TRAFFIC_CLASS		IB_SA_COMP_MASK( 6)
+#define IB_SA_MCMEMBER_REC_PKEY				IB_SA_COMP_MASK( 7)
+#define IB_SA_MCMEMBER_REC_RATE_SELECTOR		IB_SA_COMP_MASK( 8)
+#define IB_SA_MCMEMBER_REC_RATE				IB_SA_COMP_MASK( 9)
+#define IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME_SELECTOR	IB_SA_COMP_MASK(10)
+#define IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME		IB_SA_COMP_MASK(11)
+#define IB_SA_MCMEMBER_REC_SL				IB_SA_COMP_MASK(12)
+#define IB_SA_MCMEMBER_REC_FLOW_LABEL			IB_SA_COMP_MASK(13)
+#define IB_SA_MCMEMBER_REC_HOP_LIMIT			IB_SA_COMP_MASK(14)
+#define IB_SA_MCMEMBER_REC_SCOPE			IB_SA_COMP_MASK(15)
+#define IB_SA_MCMEMBER_REC_JOIN_STATE			IB_SA_COMP_MASK(16)
+#define IB_SA_MCMEMBER_REC_PROXY_JOIN			IB_SA_COMP_MASK(17)
+
+struct ib_sa_mcmember_rec {
+	union ib_gid mgid;
+	union ib_gid port_gid;
+	__be32       qkey;
+	__be16       mlid;
+	u8           mtu_selector;
+	u8           mtu;
+	u8           traffic_class;
+	__be16       pkey;
+	u8 	     rate_selector;
+	u8 	     rate;
+	u8 	     packet_life_time_selector;
+	u8 	     packet_life_time;
+	u8           sl;
+	__be32       flow_label;
+	u8           hop_limit;
+	u8           scope;
+	u8           join_state;
+	int          proxy_join;
+};
+
+/* Service Record Component Mask Sec 15.2.5.14 Ver 1.1	*/
+#define IB_SA_SERVICE_REC_SERVICE_ID			IB_SA_COMP_MASK( 0)
+#define IB_SA_SERVICE_REC_SERVICE_GID			IB_SA_COMP_MASK( 1)
+#define IB_SA_SERVICE_REC_SERVICE_PKEY			IB_SA_COMP_MASK( 2)
+/* reserved:								 3 */
+#define IB_SA_SERVICE_REC_SERVICE_LEASE			IB_SA_COMP_MASK( 4)
+#define IB_SA_SERVICE_REC_SERVICE_KEY			IB_SA_COMP_MASK( 5)
+#define IB_SA_SERVICE_REC_SERVICE_NAME			IB_SA_COMP_MASK( 6)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_0		IB_SA_COMP_MASK( 7)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_1		IB_SA_COMP_MASK( 8)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_2		IB_SA_COMP_MASK( 9)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_3		IB_SA_COMP_MASK(10)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_4		IB_SA_COMP_MASK(11)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_5		IB_SA_COMP_MASK(12)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_6		IB_SA_COMP_MASK(13)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_7		IB_SA_COMP_MASK(14)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_8		IB_SA_COMP_MASK(15)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_9		IB_SA_COMP_MASK(16)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_10		IB_SA_COMP_MASK(17)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_11		IB_SA_COMP_MASK(18)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_12		IB_SA_COMP_MASK(19)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_13		IB_SA_COMP_MASK(20)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_14		IB_SA_COMP_MASK(21)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_15		IB_SA_COMP_MASK(22)
+#define IB_SA_SERVICE_REC_SERVICE_DATA16_0		IB_SA_COMP_MASK(23)
+#define IB_SA_SERVICE_REC_SERVICE_DATA16_1		IB_SA_COMP_MASK(24)
+#define IB_SA_SERVICE_REC_SERVICE_DATA16_2		IB_SA_COMP_MASK(25)
+#define IB_SA_SERVICE_REC_SERVICE_DATA16_3		IB_SA_COMP_MASK(26)
+#define IB_SA_SERVICE_REC_SERVICE_DATA16_4		IB_SA_COMP_MASK(27)
+#define IB_SA_SERVICE_REC_SERVICE_DATA16_5		IB_SA_COMP_MASK(28)
+#define IB_SA_SERVICE_REC_SERVICE_DATA16_6		IB_SA_COMP_MASK(29)
+#define IB_SA_SERVICE_REC_SERVICE_DATA16_7		IB_SA_COMP_MASK(30)
+#define IB_SA_SERVICE_REC_SERVICE_DATA32_0		IB_SA_COMP_MASK(31)
+#define IB_SA_SERVICE_REC_SERVICE_DATA32_1		IB_SA_COMP_MASK(32)
+#define IB_SA_SERVICE_REC_SERVICE_DATA32_2		IB_SA_COMP_MASK(33)
+#define IB_SA_SERVICE_REC_SERVICE_DATA32_3		IB_SA_COMP_MASK(34)
+#define IB_SA_SERVICE_REC_SERVICE_DATA64_0		IB_SA_COMP_MASK(35)
+#define IB_SA_SERVICE_REC_SERVICE_DATA64_1		IB_SA_COMP_MASK(36)
+
+#define IB_DEFAULT_SERVICE_LEASE 	0xFFFFFFFF
+
+struct ib_sa_service_rec {
+	u64		id;
+	union ib_gid	gid;
+	__be16 		pkey;
+	/* reserved */
+	u32		lease;
+	u8		key[16];
+	u8		name[64];
+	u8		data8[16];
+	u16		data16[8];
+	u32		data32[4];
+	u64		data64[2];
+};
+
+#define IB_SA_GUIDINFO_REC_LID		IB_SA_COMP_MASK(0)
+#define IB_SA_GUIDINFO_REC_BLOCK_NUM	IB_SA_COMP_MASK(1)
+#define IB_SA_GUIDINFO_REC_RES1		IB_SA_COMP_MASK(2)
+#define IB_SA_GUIDINFO_REC_RES2		IB_SA_COMP_MASK(3)
+#define IB_SA_GUIDINFO_REC_GID0		IB_SA_COMP_MASK(4)
+#define IB_SA_GUIDINFO_REC_GID1		IB_SA_COMP_MASK(5)
+#define IB_SA_GUIDINFO_REC_GID2		IB_SA_COMP_MASK(6)
+#define IB_SA_GUIDINFO_REC_GID3		IB_SA_COMP_MASK(7)
+#define IB_SA_GUIDINFO_REC_GID4		IB_SA_COMP_MASK(8)
+#define IB_SA_GUIDINFO_REC_GID5		IB_SA_COMP_MASK(9)
+#define IB_SA_GUIDINFO_REC_GID6		IB_SA_COMP_MASK(10)
+#define IB_SA_GUIDINFO_REC_GID7		IB_SA_COMP_MASK(11)
+
+struct ib_sa_guidinfo_rec {
+	__be16	lid;
+	u8	block_num;
+	/* reserved */
+	u8	res1;
+	__be32	res2;
+	u8	guid_info_list[64];
+};
+
+struct ib_sa_client {
+	atomic_t users;
+	struct completion comp;
+};
+
+/**
+ * ib_sa_register_client - Register an SA client.
+ */
+void ib_sa_register_client(struct ib_sa_client *client);
+
+/**
+ * ib_sa_unregister_client - Deregister an SA client.
+ * @client: Client object to deregister.
+ */
+void ib_sa_unregister_client(struct ib_sa_client *client);
+
+struct ib_sa_query;
+
+void ib_sa_cancel_query(int id, struct ib_sa_query *query);
+
+int ib_sa_path_rec_get(struct ib_sa_client *client,
+		       struct ib_device *device, u8 port_num,
+		       struct ib_sa_path_rec *rec,
+		       ib_sa_comp_mask comp_mask,
+		       int timeout_ms, gfp_t gfp_mask,
+		       void (*callback)(int status,
+					struct ib_sa_path_rec *resp,
+					void *context),
+		       void *context,
+		       struct ib_sa_query **query);
+
+int ib_sa_service_rec_query(struct ib_sa_client *client,
+			 struct ib_device *device, u8 port_num,
+			 u8 method,
+			 struct ib_sa_service_rec *rec,
+			 ib_sa_comp_mask comp_mask,
+			 int timeout_ms, gfp_t gfp_mask,
+			 void (*callback)(int status,
+					  struct ib_sa_service_rec *resp,
+					  void *context),
+			 void *context,
+			 struct ib_sa_query **sa_query);
+
+struct ib_sa_multicast {
+	struct ib_sa_mcmember_rec rec;
+	ib_sa_comp_mask		comp_mask;
+	int			(*callback)(int status,
+					    struct ib_sa_multicast *multicast);
+	void			*context;
+};
+
+/**
+ * ib_sa_join_multicast - Initiates a join request to the specified multicast
+ *   group.
+ * @client: SA client
+ * @device: Device associated with the multicast group.
+ * @port_num: Port on the specified device to associate with the multicast
+ *   group.
+ * @rec: SA multicast member record specifying group attributes.
+ * @comp_mask: Component mask indicating which group attributes of %rec are
+ *   valid.
+ * @gfp_mask: GFP mask for memory allocations.
+ * @callback: User callback invoked once the join operation completes.
+ * @context: User specified context stored with the ib_sa_multicast structure.
+ *
+ * This call initiates a multicast join request with the SA for the specified
+ * multicast group.  If the join operation is started successfully, it returns
+ * an ib_sa_multicast structure that is used to track the multicast operation.
+ * Users must free this structure by calling ib_free_multicast, even if the
+ * join operation later fails.  (The callback status is non-zero.)
+ *
+ * If the join operation fails; status will be non-zero, with the following
+ * failures possible:
+ * -ETIMEDOUT: The request timed out.
+ * -EIO: An error occurred sending the query.
+ * -EINVAL: The MCMemberRecord values differed from the existing group's.
+ * -ENETRESET: Indicates that an fatal error has occurred on the multicast
+ *   group, and the user must rejoin the group to continue using it.
+ */
+struct ib_sa_multicast *ib_sa_join_multicast(struct ib_sa_client *client,
+					     struct ib_device *device, u8 port_num,
+					     struct ib_sa_mcmember_rec *rec,
+					     ib_sa_comp_mask comp_mask, gfp_t gfp_mask,
+					     int (*callback)(int status,
+							     struct ib_sa_multicast
+								    *multicast),
+					     void *context);
+
+/**
+ * ib_free_multicast - Frees the multicast tracking structure, and releases
+ *    any reference on the multicast group.
+ * @multicast: Multicast tracking structure allocated by ib_join_multicast.
+ *
+ * This call blocks until the multicast identifier is destroyed.  It may
+ * not be called from within the multicast callback; however, returning a non-
+ * zero value from the callback will result in destroying the multicast
+ * tracking structure.
+ */
+void ib_sa_free_multicast(struct ib_sa_multicast *multicast);
+
+/**
+ * ib_get_mcmember_rec - Looks up a multicast member record by its MGID and
+ *   returns it if found.
+ * @device: Device associated with the multicast group.
+ * @port_num: Port on the specified device to associate with the multicast
+ *   group.
+ * @mgid: MGID of multicast group.
+ * @rec: Location to copy SA multicast member record.
+ */
+int ib_sa_get_mcmember_rec(struct ib_device *device, u8 port_num,
+			   union ib_gid *mgid, struct ib_sa_mcmember_rec *rec);
+
+/**
+ * ib_init_ah_from_mcmember - Initialize address handle attributes based on
+ * an SA multicast member record.
+ */
+int ib_init_ah_from_mcmember(struct ib_device *device, u8 port_num,
+			     struct ib_sa_mcmember_rec *rec,
+			     struct ib_ah_attr *ah_attr);
+
+/**
+ * ib_init_ah_from_path - Initialize address handle attributes based on an SA
+ *   path record.
+ */
+int ib_init_ah_from_path(struct ib_device *device, u8 port_num,
+			 struct ib_sa_path_rec *rec,
+			 struct ib_ah_attr *ah_attr);
+
+/**
+ * ib_sa_pack_path - Conert a path record from struct ib_sa_path_rec
+ * to IB MAD wire format.
+ */
+void ib_sa_pack_path(struct ib_sa_path_rec *rec, void *attribute);
+
+/**
+ * ib_sa_unpack_path - Convert a path record from MAD format to struct
+ * ib_sa_path_rec.
+ */
+void ib_sa_unpack_path(void *attribute, struct ib_sa_path_rec *rec);
+
+/* Support GuidInfoRecord */
+int ib_sa_guid_info_rec_query(struct ib_sa_client *client,
+			      struct ib_device *device, u8 port_num,
+			      struct ib_sa_guidinfo_rec *rec,
+			      ib_sa_comp_mask comp_mask, u8 method,
+			      int timeout_ms, gfp_t gfp_mask,
+			      void (*callback)(int status,
+					       struct ib_sa_guidinfo_rec *resp,
+					       void *context),
+			      void *context,
+			      struct ib_sa_query **sa_query);
+
+#endif /* IB_SA_H */
diff --git a/include/rdma/ib_smi.h b/include/rdma/ib_smi.h
new file mode 100644
index 0000000..98b9086
--- /dev/null
+++ b/include/rdma/ib_smi.h
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2004 Mellanox Technologies Ltd.  All rights reserved.
+ * Copyright (c) 2004 Infinicon Corporation.  All rights reserved.
+ * Copyright (c) 2004 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation.  All rights reserved.
+ * Copyright (c) 2004 Voltaire Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if !defined(IB_SMI_H)
+#define IB_SMI_H
+
+#include <rdma/ib_mad.h>
+
+#define IB_SMP_DATA_SIZE			64
+#define IB_SMP_MAX_PATH_HOPS			64
+
+struct ib_smp {
+	u8	base_version;
+	u8	mgmt_class;
+	u8	class_version;
+	u8	method;
+	__be16	status;
+	u8	hop_ptr;
+	u8	hop_cnt;
+	__be64	tid;
+	__be16	attr_id;
+	__be16	resv;
+	__be32	attr_mod;
+	__be64	mkey;
+	__be16	dr_slid;
+	__be16	dr_dlid;
+	u8	reserved[28];
+	u8	data[IB_SMP_DATA_SIZE];
+	u8	initial_path[IB_SMP_MAX_PATH_HOPS];
+	u8	return_path[IB_SMP_MAX_PATH_HOPS];
+} __attribute__ ((packed));
+
+#define IB_SMP_DIRECTION			cpu_to_be16(0x8000)
+
+/* Subnet management attributes */
+#define IB_SMP_ATTR_NOTICE			cpu_to_be16(0x0002)
+#define IB_SMP_ATTR_NODE_DESC			cpu_to_be16(0x0010)
+#define IB_SMP_ATTR_NODE_INFO			cpu_to_be16(0x0011)
+#define IB_SMP_ATTR_SWITCH_INFO			cpu_to_be16(0x0012)
+#define IB_SMP_ATTR_GUID_INFO			cpu_to_be16(0x0014)
+#define IB_SMP_ATTR_PORT_INFO			cpu_to_be16(0x0015)
+#define IB_SMP_ATTR_PKEY_TABLE			cpu_to_be16(0x0016)
+#define IB_SMP_ATTR_SL_TO_VL_TABLE		cpu_to_be16(0x0017)
+#define IB_SMP_ATTR_VL_ARB_TABLE		cpu_to_be16(0x0018)
+#define IB_SMP_ATTR_LINEAR_FORWARD_TABLE	cpu_to_be16(0x0019)
+#define IB_SMP_ATTR_RANDOM_FORWARD_TABLE	cpu_to_be16(0x001A)
+#define IB_SMP_ATTR_MCAST_FORWARD_TABLE		cpu_to_be16(0x001B)
+#define IB_SMP_ATTR_SM_INFO			cpu_to_be16(0x0020)
+#define IB_SMP_ATTR_VENDOR_DIAG			cpu_to_be16(0x0030)
+#define IB_SMP_ATTR_LED_INFO			cpu_to_be16(0x0031)
+#define IB_SMP_ATTR_VENDOR_MASK			cpu_to_be16(0xFF00)
+
+struct ib_port_info {
+	__be64 mkey;
+	__be64 gid_prefix;
+	__be16 lid;
+	__be16 sm_lid;
+	__be32 cap_mask;
+	__be16 diag_code;
+	__be16 mkey_lease_period;
+	u8 local_port_num;
+	u8 link_width_enabled;
+	u8 link_width_supported;
+	u8 link_width_active;
+	u8 linkspeed_portstate;			/* 4 bits, 4 bits */
+	u8 portphysstate_linkdown;		/* 4 bits, 4 bits */
+	u8 mkeyprot_resv_lmc;			/* 2 bits, 3, 3 */
+	u8 linkspeedactive_enabled;		/* 4 bits, 4 bits */
+	u8 neighbormtu_mastersmsl;		/* 4 bits, 4 bits */
+	u8 vlcap_inittype;			/* 4 bits, 4 bits */
+	u8 vl_high_limit;
+	u8 vl_arb_high_cap;
+	u8 vl_arb_low_cap;
+	u8 inittypereply_mtucap;		/* 4 bits, 4 bits */
+	u8 vlstallcnt_hoqlife;			/* 3 bits, 5 bits */
+	u8 operationalvl_pei_peo_fpi_fpo;	/* 4 bits, 1, 1, 1, 1 */
+	__be16 mkey_violations;
+	__be16 pkey_violations;
+	__be16 qkey_violations;
+	u8 guid_cap;
+	u8 clientrereg_resv_subnetto;		/* 1 bit, 2 bits, 5 */
+	u8 resv_resptimevalue;			/* 3 bits, 5 bits */
+	u8 localphyerrors_overrunerrors;	/* 4 bits, 4 bits */
+	__be16 max_credit_hint;
+	u8 resv;
+	u8 link_roundtrip_latency[3];
+};
+
+static inline u8
+ib_get_smp_direction(struct ib_smp *smp)
+{
+	return ((smp->status & IB_SMP_DIRECTION) == IB_SMP_DIRECTION);
+}
+
+#endif /* IB_SMI_H */
diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h
new file mode 100644
index 0000000..a2bf41e
--- /dev/null
+++ b/include/rdma/ib_umem.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2007 Cisco Systems.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef IB_UMEM_H
+#define IB_UMEM_H
+
+#include <linux/list.h>
+#include <linux/scatterlist.h>
+#include <linux/workqueue.h>
+
+struct ib_ucontext;
+
+struct ib_umem {
+	struct ib_ucontext     *context;
+	size_t			length;
+	int			offset;
+	int			page_size;
+	int                     writable;
+	int                     hugetlb;
+	struct work_struct	work;
+	struct pid             *pid;
+	struct mm_struct       *mm;
+	unsigned long		diff;
+	struct sg_table sg_head;
+	int             nmap;
+	int             npages;
+};
+
+#ifdef CONFIG_INFINIBAND_USER_MEM
+
+struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
+			    size_t size, int access, int dmasync);
+void ib_umem_release(struct ib_umem *umem);
+int ib_umem_page_count(struct ib_umem *umem);
+
+#else /* CONFIG_INFINIBAND_USER_MEM */
+
+#include <linux/err.h>
+
+static inline struct ib_umem *ib_umem_get(struct ib_ucontext *context,
+					  unsigned long addr, size_t size,
+					  int access, int dmasync) {
+	return ERR_PTR(-EINVAL);
+}
+static inline void ib_umem_release(struct ib_umem *umem) { }
+static inline int ib_umem_page_count(struct ib_umem *umem) { return 0; }
+
+#endif /* CONFIG_INFINIBAND_USER_MEM */
+
+#endif /* IB_UMEM_H */
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
new file mode 100644
index 0000000..470a011
--- /dev/null
+++ b/include/rdma/ib_verbs.h
@@ -0,0 +1,2626 @@
+/*
+ * Copyright (c) 2004 Mellanox Technologies Ltd.  All rights reserved.
+ * Copyright (c) 2004 Infinicon Corporation.  All rights reserved.
+ * Copyright (c) 2004 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation.  All rights reserved.
+ * Copyright (c) 2004 Voltaire Corporation.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007 Cisco Systems.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if !defined(IB_VERBS_H)
+#define IB_VERBS_H
+
+#include <linux/types.h>
+#include <linux/device.h>
+#include <linux/mm.h>
+#include <linux/dma-mapping.h>
+#include <linux/kref.h>
+#include <linux/list.h>
+#include <linux/rwsem.h>
+#include <linux/scatterlist.h>
+#include <linux/workqueue.h>
+#include <uapi/linux/if_ether.h>
+
+#include <linux/atomic.h>
+#include <asm/uaccess.h>
+
+extern struct workqueue_struct *ib_wq;
+
+union ib_gid {
+	u8	raw[16];
+	struct {
+		__be64	subnet_prefix;
+		__be64	interface_id;
+	} global;
+};
+
+enum rdma_node_type {
+	/* IB values map to NodeInfo:NodeType. */
+	RDMA_NODE_IB_CA 	= 1,
+	RDMA_NODE_IB_SWITCH,
+	RDMA_NODE_IB_ROUTER,
+	RDMA_NODE_RNIC,
+	RDMA_NODE_USNIC,
+	RDMA_NODE_USNIC_UDP,
+};
+
+enum rdma_transport_type {
+	RDMA_TRANSPORT_IB,
+	RDMA_TRANSPORT_IWARP,
+	RDMA_TRANSPORT_USNIC,
+	RDMA_TRANSPORT_USNIC_UDP
+};
+
+__attribute_const__ enum rdma_transport_type
+rdma_node_get_transport(enum rdma_node_type node_type);
+
+enum rdma_link_layer {
+	IB_LINK_LAYER_UNSPECIFIED,
+	IB_LINK_LAYER_INFINIBAND,
+	IB_LINK_LAYER_ETHERNET,
+};
+
+enum ib_device_cap_flags {
+	IB_DEVICE_RESIZE_MAX_WR		= 1,
+	IB_DEVICE_BAD_PKEY_CNTR		= (1<<1),
+	IB_DEVICE_BAD_QKEY_CNTR		= (1<<2),
+	IB_DEVICE_RAW_MULTI		= (1<<3),
+	IB_DEVICE_AUTO_PATH_MIG		= (1<<4),
+	IB_DEVICE_CHANGE_PHY_PORT	= (1<<5),
+	IB_DEVICE_UD_AV_PORT_ENFORCE	= (1<<6),
+	IB_DEVICE_CURR_QP_STATE_MOD	= (1<<7),
+	IB_DEVICE_SHUTDOWN_PORT		= (1<<8),
+	IB_DEVICE_INIT_TYPE		= (1<<9),
+	IB_DEVICE_PORT_ACTIVE_EVENT	= (1<<10),
+	IB_DEVICE_SYS_IMAGE_GUID	= (1<<11),
+	IB_DEVICE_RC_RNR_NAK_GEN	= (1<<12),
+	IB_DEVICE_SRQ_RESIZE		= (1<<13),
+	IB_DEVICE_N_NOTIFY_CQ		= (1<<14),
+	IB_DEVICE_LOCAL_DMA_LKEY	= (1<<15),
+	IB_DEVICE_RESERVED		= (1<<16), /* old SEND_W_INV */
+	IB_DEVICE_MEM_WINDOW		= (1<<17),
+	/*
+	 * Devices should set IB_DEVICE_UD_IP_SUM if they support
+	 * insertion of UDP and TCP checksum on outgoing UD IPoIB
+	 * messages and can verify the validity of checksum for
+	 * incoming messages.  Setting this flag implies that the
+	 * IPoIB driver may set NETIF_F_IP_CSUM for datagram mode.
+	 */
+	IB_DEVICE_UD_IP_CSUM		= (1<<18),
+	IB_DEVICE_UD_TSO		= (1<<19),
+	IB_DEVICE_XRC			= (1<<20),
+	IB_DEVICE_MEM_MGT_EXTENSIONS	= (1<<21),
+	IB_DEVICE_BLOCK_MULTICAST_LOOPBACK = (1<<22),
+	IB_DEVICE_MEM_WINDOW_TYPE_2A	= (1<<23),
+	IB_DEVICE_MEM_WINDOW_TYPE_2B	= (1<<24),
+	IB_DEVICE_MANAGED_FLOW_STEERING = (1<<29),
+	IB_DEVICE_SIGNATURE_HANDOVER	= (1<<30)
+};
+
+enum ib_signature_prot_cap {
+	IB_PROT_T10DIF_TYPE_1 = 1,
+	IB_PROT_T10DIF_TYPE_2 = 1 << 1,
+	IB_PROT_T10DIF_TYPE_3 = 1 << 2,
+};
+
+enum ib_signature_guard_cap {
+	IB_GUARD_T10DIF_CRC	= 1,
+	IB_GUARD_T10DIF_CSUM	= 1 << 1,
+};
+
+enum ib_atomic_cap {
+	IB_ATOMIC_NONE,
+	IB_ATOMIC_HCA,
+	IB_ATOMIC_GLOB
+};
+
+struct ib_device_attr {
+	u64			fw_ver;
+	__be64			sys_image_guid;
+	u64			max_mr_size;
+	u64			page_size_cap;
+	u32			vendor_id;
+	u32			vendor_part_id;
+	u32			hw_ver;
+	int			max_qp;
+	int			max_qp_wr;
+	int			device_cap_flags;
+	int			max_sge;
+	int			max_sge_rd;
+	int			max_cq;
+	int			max_cqe;
+	int			max_mr;
+	int			max_pd;
+	int			max_qp_rd_atom;
+	int			max_ee_rd_atom;
+	int			max_res_rd_atom;
+	int			max_qp_init_rd_atom;
+	int			max_ee_init_rd_atom;
+	enum ib_atomic_cap	atomic_cap;
+	enum ib_atomic_cap	masked_atomic_cap;
+	int			max_ee;
+	int			max_rdd;
+	int			max_mw;
+	int			max_raw_ipv6_qp;
+	int			max_raw_ethy_qp;
+	int			max_mcast_grp;
+	int			max_mcast_qp_attach;
+	int			max_total_mcast_qp_attach;
+	int			max_ah;
+	int			max_fmr;
+	int			max_map_per_fmr;
+	int			max_srq;
+	int			max_srq_wr;
+	int			max_srq_sge;
+	unsigned int		max_fast_reg_page_list_len;
+	u16			max_pkeys;
+	u8			local_ca_ack_delay;
+	int			sig_prot_cap;
+	int			sig_guard_cap;
+};
+
+enum ib_mtu {
+	IB_MTU_256  = 1,
+	IB_MTU_512  = 2,
+	IB_MTU_1024 = 3,
+	IB_MTU_2048 = 4,
+	IB_MTU_4096 = 5
+};
+
+static inline int ib_mtu_enum_to_int(enum ib_mtu mtu)
+{
+	switch (mtu) {
+	case IB_MTU_256:  return  256;
+	case IB_MTU_512:  return  512;
+	case IB_MTU_1024: return 1024;
+	case IB_MTU_2048: return 2048;
+	case IB_MTU_4096: return 4096;
+	default: 	  return -1;
+	}
+}
+
+enum ib_port_state {
+	IB_PORT_NOP		= 0,
+	IB_PORT_DOWN		= 1,
+	IB_PORT_INIT		= 2,
+	IB_PORT_ARMED		= 3,
+	IB_PORT_ACTIVE		= 4,
+	IB_PORT_ACTIVE_DEFER	= 5
+};
+
+enum ib_port_cap_flags {
+	IB_PORT_SM				= 1 <<  1,
+	IB_PORT_NOTICE_SUP			= 1 <<  2,
+	IB_PORT_TRAP_SUP			= 1 <<  3,
+	IB_PORT_OPT_IPD_SUP                     = 1 <<  4,
+	IB_PORT_AUTO_MIGR_SUP			= 1 <<  5,
+	IB_PORT_SL_MAP_SUP			= 1 <<  6,
+	IB_PORT_MKEY_NVRAM			= 1 <<  7,
+	IB_PORT_PKEY_NVRAM			= 1 <<  8,
+	IB_PORT_LED_INFO_SUP			= 1 <<  9,
+	IB_PORT_SM_DISABLED			= 1 << 10,
+	IB_PORT_SYS_IMAGE_GUID_SUP		= 1 << 11,
+	IB_PORT_PKEY_SW_EXT_PORT_TRAP_SUP	= 1 << 12,
+	IB_PORT_EXTENDED_SPEEDS_SUP             = 1 << 14,
+	IB_PORT_CM_SUP				= 1 << 16,
+	IB_PORT_SNMP_TUNNEL_SUP			= 1 << 17,
+	IB_PORT_REINIT_SUP			= 1 << 18,
+	IB_PORT_DEVICE_MGMT_SUP			= 1 << 19,
+	IB_PORT_VENDOR_CLASS_SUP		= 1 << 20,
+	IB_PORT_DR_NOTICE_SUP			= 1 << 21,
+	IB_PORT_CAP_MASK_NOTICE_SUP		= 1 << 22,
+	IB_PORT_BOOT_MGMT_SUP			= 1 << 23,
+	IB_PORT_LINK_LATENCY_SUP		= 1 << 24,
+	IB_PORT_CLIENT_REG_SUP			= 1 << 25,
+	IB_PORT_IP_BASED_GIDS			= 1 << 26
+};
+
+enum ib_port_width {
+	IB_WIDTH_1X	= 1,
+	IB_WIDTH_4X	= 2,
+	IB_WIDTH_8X	= 4,
+	IB_WIDTH_12X	= 8
+};
+
+static inline int ib_width_enum_to_int(enum ib_port_width width)
+{
+	switch (width) {
+	case IB_WIDTH_1X:  return  1;
+	case IB_WIDTH_4X:  return  4;
+	case IB_WIDTH_8X:  return  8;
+	case IB_WIDTH_12X: return 12;
+	default: 	  return -1;
+	}
+}
+
+enum ib_port_speed {
+	IB_SPEED_SDR	= 1,
+	IB_SPEED_DDR	= 2,
+	IB_SPEED_QDR	= 4,
+	IB_SPEED_FDR10	= 8,
+	IB_SPEED_FDR	= 16,
+	IB_SPEED_EDR	= 32
+};
+
+struct ib_protocol_stats {
+	/* TBD... */
+};
+
+struct iw_protocol_stats {
+	u64	ipInReceives;
+	u64	ipInHdrErrors;
+	u64	ipInTooBigErrors;
+	u64	ipInNoRoutes;
+	u64	ipInAddrErrors;
+	u64	ipInUnknownProtos;
+	u64	ipInTruncatedPkts;
+	u64	ipInDiscards;
+	u64	ipInDelivers;
+	u64	ipOutForwDatagrams;
+	u64	ipOutRequests;
+	u64	ipOutDiscards;
+	u64	ipOutNoRoutes;
+	u64	ipReasmTimeout;
+	u64	ipReasmReqds;
+	u64	ipReasmOKs;
+	u64	ipReasmFails;
+	u64	ipFragOKs;
+	u64	ipFragFails;
+	u64	ipFragCreates;
+	u64	ipInMcastPkts;
+	u64	ipOutMcastPkts;
+	u64	ipInBcastPkts;
+	u64	ipOutBcastPkts;
+
+	u64	tcpRtoAlgorithm;
+	u64	tcpRtoMin;
+	u64	tcpRtoMax;
+	u64	tcpMaxConn;
+	u64	tcpActiveOpens;
+	u64	tcpPassiveOpens;
+	u64	tcpAttemptFails;
+	u64	tcpEstabResets;
+	u64	tcpCurrEstab;
+	u64	tcpInSegs;
+	u64	tcpOutSegs;
+	u64	tcpRetransSegs;
+	u64	tcpInErrs;
+	u64	tcpOutRsts;
+};
+
+union rdma_protocol_stats {
+	struct ib_protocol_stats	ib;
+	struct iw_protocol_stats	iw;
+};
+
+struct ib_port_attr {
+	enum ib_port_state	state;
+	enum ib_mtu		max_mtu;
+	enum ib_mtu		active_mtu;
+	int			gid_tbl_len;
+	u32			port_cap_flags;
+	u32			max_msg_sz;
+	u32			bad_pkey_cntr;
+	u32			qkey_viol_cntr;
+	u16			pkey_tbl_len;
+	u16			lid;
+	u16			sm_lid;
+	u8			lmc;
+	u8			max_vl_num;
+	u8			sm_sl;
+	u8			subnet_timeout;
+	u8			init_type_reply;
+	u8			active_width;
+	u8			active_speed;
+	u8                      phys_state;
+};
+
+enum ib_device_modify_flags {
+	IB_DEVICE_MODIFY_SYS_IMAGE_GUID	= 1 << 0,
+	IB_DEVICE_MODIFY_NODE_DESC	= 1 << 1
+};
+
+struct ib_device_modify {
+	u64	sys_image_guid;
+	char	node_desc[64];
+};
+
+enum ib_port_modify_flags {
+	IB_PORT_SHUTDOWN		= 1,
+	IB_PORT_INIT_TYPE		= (1<<2),
+	IB_PORT_RESET_QKEY_CNTR		= (1<<3)
+};
+
+struct ib_port_modify {
+	u32	set_port_cap_mask;
+	u32	clr_port_cap_mask;
+	u8	init_type;
+};
+
+enum ib_event_type {
+	IB_EVENT_CQ_ERR,
+	IB_EVENT_QP_FATAL,
+	IB_EVENT_QP_REQ_ERR,
+	IB_EVENT_QP_ACCESS_ERR,
+	IB_EVENT_COMM_EST,
+	IB_EVENT_SQ_DRAINED,
+	IB_EVENT_PATH_MIG,
+	IB_EVENT_PATH_MIG_ERR,
+	IB_EVENT_DEVICE_FATAL,
+	IB_EVENT_PORT_ACTIVE,
+	IB_EVENT_PORT_ERR,
+	IB_EVENT_LID_CHANGE,
+	IB_EVENT_PKEY_CHANGE,
+	IB_EVENT_SM_CHANGE,
+	IB_EVENT_SRQ_ERR,
+	IB_EVENT_SRQ_LIMIT_REACHED,
+	IB_EVENT_QP_LAST_WQE_REACHED,
+	IB_EVENT_CLIENT_REREGISTER,
+	IB_EVENT_GID_CHANGE,
+};
+
+struct ib_event {
+	struct ib_device	*device;
+	union {
+		struct ib_cq	*cq;
+		struct ib_qp	*qp;
+		struct ib_srq	*srq;
+		u8		port_num;
+	} element;
+	enum ib_event_type	event;
+};
+
+struct ib_event_handler {
+	struct ib_device *device;
+	void            (*handler)(struct ib_event_handler *, struct ib_event *);
+	struct list_head  list;
+};
+
+#define INIT_IB_EVENT_HANDLER(_ptr, _device, _handler)		\
+	do {							\
+		(_ptr)->device  = _device;			\
+		(_ptr)->handler = _handler;			\
+		INIT_LIST_HEAD(&(_ptr)->list);			\
+	} while (0)
+
+struct ib_global_route {
+	union ib_gid	dgid;
+	u32		flow_label;
+	u8		sgid_index;
+	u8		hop_limit;
+	u8		traffic_class;
+};
+
+struct ib_grh {
+	__be32		version_tclass_flow;
+	__be16		paylen;
+	u8		next_hdr;
+	u8		hop_limit;
+	union ib_gid	sgid;
+	union ib_gid	dgid;
+};
+
+enum {
+	IB_MULTICAST_QPN = 0xffffff
+};
+
+#define IB_LID_PERMISSIVE	cpu_to_be16(0xFFFF)
+
+enum ib_ah_flags {
+	IB_AH_GRH	= 1
+};
+
+enum ib_rate {
+	IB_RATE_PORT_CURRENT = 0,
+	IB_RATE_2_5_GBPS = 2,
+	IB_RATE_5_GBPS   = 5,
+	IB_RATE_10_GBPS  = 3,
+	IB_RATE_20_GBPS  = 6,
+	IB_RATE_30_GBPS  = 4,
+	IB_RATE_40_GBPS  = 7,
+	IB_RATE_60_GBPS  = 8,
+	IB_RATE_80_GBPS  = 9,
+	IB_RATE_120_GBPS = 10,
+	IB_RATE_14_GBPS  = 11,
+	IB_RATE_56_GBPS  = 12,
+	IB_RATE_112_GBPS = 13,
+	IB_RATE_168_GBPS = 14,
+	IB_RATE_25_GBPS  = 15,
+	IB_RATE_100_GBPS = 16,
+	IB_RATE_200_GBPS = 17,
+	IB_RATE_300_GBPS = 18
+};
+
+/**
+ * ib_rate_to_mult - Convert the IB rate enum to a multiple of the
+ * base rate of 2.5 Gbit/sec.  For example, IB_RATE_5_GBPS will be
+ * converted to 2, since 5 Gbit/sec is 2 * 2.5 Gbit/sec.
+ * @rate: rate to convert.
+ */
+__attribute_const__ int ib_rate_to_mult(enum ib_rate rate);
+
+/**
+ * ib_rate_to_mbps - Convert the IB rate enum to Mbps.
+ * For example, IB_RATE_2_5_GBPS will be converted to 2500.
+ * @rate: rate to convert.
+ */
+__attribute_const__ int ib_rate_to_mbps(enum ib_rate rate);
+
+enum ib_mr_create_flags {
+	IB_MR_SIGNATURE_EN = 1,
+};
+
+/**
+ * ib_mr_init_attr - Memory region init attributes passed to routine
+ *     ib_create_mr.
+ * @max_reg_descriptors: max number of registration descriptors that
+ *     may be used with registration work requests.
+ * @flags: MR creation flags bit mask.
+ */
+struct ib_mr_init_attr {
+	int	    max_reg_descriptors;
+	u32	    flags;
+};
+
+/**
+ * Signature types
+ * IB_SIG_TYPE_NONE: Unprotected.
+ * IB_SIG_TYPE_T10_DIF: Type T10-DIF
+ */
+enum ib_signature_type {
+	IB_SIG_TYPE_NONE,
+	IB_SIG_TYPE_T10_DIF,
+};
+
+/**
+ * Signature T10-DIF block-guard types
+ * IB_T10DIF_CRC: Corresponds to T10-PI mandated CRC checksum rules.
+ * IB_T10DIF_CSUM: Corresponds to IP checksum rules.
+ */
+enum ib_t10_dif_bg_type {
+	IB_T10DIF_CRC,
+	IB_T10DIF_CSUM
+};
+
+/**
+ * struct ib_t10_dif_domain - Parameters specific for T10-DIF
+ *     domain.
+ * @bg_type: T10-DIF block guard type (CRC|CSUM)
+ * @pi_interval: protection information interval.
+ * @bg: seed of guard computation.
+ * @app_tag: application tag of guard block
+ * @ref_tag: initial guard block reference tag.
+ * @ref_remap: Indicate wethear the reftag increments each block
+ * @app_escape: Indicate to skip block check if apptag=0xffff
+ * @ref_escape: Indicate to skip block check if reftag=0xffffffff
+ * @apptag_check_mask: check bitmask of application tag.
+ */
+struct ib_t10_dif_domain {
+	enum ib_t10_dif_bg_type bg_type;
+	u16			pi_interval;
+	u16			bg;
+	u16			app_tag;
+	u32			ref_tag;
+	bool			ref_remap;
+	bool			app_escape;
+	bool			ref_escape;
+	u16			apptag_check_mask;
+};
+
+/**
+ * struct ib_sig_domain - Parameters for signature domain
+ * @sig_type: specific signauture type
+ * @sig: union of all signature domain attributes that may
+ *     be used to set domain layout.
+ */
+struct ib_sig_domain {
+	enum ib_signature_type sig_type;
+	union {
+		struct ib_t10_dif_domain dif;
+	} sig;
+};
+
+/**
+ * struct ib_sig_attrs - Parameters for signature handover operation
+ * @check_mask: bitmask for signature byte check (8 bytes)
+ * @mem: memory domain layout desciptor.
+ * @wire: wire domain layout desciptor.
+ */
+struct ib_sig_attrs {
+	u8			check_mask;
+	struct ib_sig_domain	mem;
+	struct ib_sig_domain	wire;
+};
+
+enum ib_sig_err_type {
+	IB_SIG_BAD_GUARD,
+	IB_SIG_BAD_REFTAG,
+	IB_SIG_BAD_APPTAG,
+};
+
+/**
+ * struct ib_sig_err - signature error descriptor
+ */
+struct ib_sig_err {
+	enum ib_sig_err_type	err_type;
+	u32			expected;
+	u32			actual;
+	u64			sig_err_offset;
+	u32			key;
+};
+
+enum ib_mr_status_check {
+	IB_MR_CHECK_SIG_STATUS = 1,
+};
+
+/**
+ * struct ib_mr_status - Memory region status container
+ *
+ * @fail_status: Bitmask of MR checks status. For each
+ *     failed check a corresponding status bit is set.
+ * @sig_err: Additional info for IB_MR_CEHCK_SIG_STATUS
+ *     failure.
+ */
+struct ib_mr_status {
+	u32		    fail_status;
+	struct ib_sig_err   sig_err;
+};
+
+/**
+ * mult_to_ib_rate - Convert a multiple of 2.5 Gbit/sec to an IB rate
+ * enum.
+ * @mult: multiple to convert.
+ */
+__attribute_const__ enum ib_rate mult_to_ib_rate(int mult);
+
+struct ib_ah_attr {
+	struct ib_global_route	grh;
+	u16			dlid;
+	u8			sl;
+	u8			src_path_bits;
+	u8			static_rate;
+	u8			ah_flags;
+	u8			port_num;
+	u8			dmac[ETH_ALEN];
+	u16			vlan_id;
+};
+
+enum ib_wc_status {
+	IB_WC_SUCCESS,
+	IB_WC_LOC_LEN_ERR,
+	IB_WC_LOC_QP_OP_ERR,
+	IB_WC_LOC_EEC_OP_ERR,
+	IB_WC_LOC_PROT_ERR,
+	IB_WC_WR_FLUSH_ERR,
+	IB_WC_MW_BIND_ERR,
+	IB_WC_BAD_RESP_ERR,
+	IB_WC_LOC_ACCESS_ERR,
+	IB_WC_REM_INV_REQ_ERR,
+	IB_WC_REM_ACCESS_ERR,
+	IB_WC_REM_OP_ERR,
+	IB_WC_RETRY_EXC_ERR,
+	IB_WC_RNR_RETRY_EXC_ERR,
+	IB_WC_LOC_RDD_VIOL_ERR,
+	IB_WC_REM_INV_RD_REQ_ERR,
+	IB_WC_REM_ABORT_ERR,
+	IB_WC_INV_EECN_ERR,
+	IB_WC_INV_EEC_STATE_ERR,
+	IB_WC_FATAL_ERR,
+	IB_WC_RESP_TIMEOUT_ERR,
+	IB_WC_GENERAL_ERR
+};
+
+enum ib_wc_opcode {
+	IB_WC_SEND,
+	IB_WC_RDMA_WRITE,
+	IB_WC_RDMA_READ,
+	IB_WC_COMP_SWAP,
+	IB_WC_FETCH_ADD,
+	IB_WC_BIND_MW,
+	IB_WC_LSO,
+	IB_WC_LOCAL_INV,
+	IB_WC_FAST_REG_MR,
+	IB_WC_MASKED_COMP_SWAP,
+	IB_WC_MASKED_FETCH_ADD,
+/*
+ * Set value of IB_WC_RECV so consumers can test if a completion is a
+ * receive by testing (opcode & IB_WC_RECV).
+ */
+	IB_WC_RECV			= 1 << 7,
+	IB_WC_RECV_RDMA_WITH_IMM
+};
+
+enum ib_wc_flags {
+	IB_WC_GRH		= 1,
+	IB_WC_WITH_IMM		= (1<<1),
+	IB_WC_WITH_INVALIDATE	= (1<<2),
+	IB_WC_IP_CSUM_OK	= (1<<3),
+	IB_WC_WITH_SMAC		= (1<<4),
+	IB_WC_WITH_VLAN		= (1<<5),
+};
+
+struct ib_wc {
+	u64			wr_id;
+	enum ib_wc_status	status;
+	enum ib_wc_opcode	opcode;
+	u32			vendor_err;
+	u32			byte_len;
+	struct ib_qp	       *qp;
+	union {
+		__be32		imm_data;
+		u32		invalidate_rkey;
+	} ex;
+	u32			src_qp;
+	int			wc_flags;
+	u16			pkey_index;
+	u16			slid;
+	u8			sl;
+	u8			dlid_path_bits;
+	u8			port_num;	/* valid only for DR SMPs on switches */
+	u8			smac[ETH_ALEN];
+	u16			vlan_id;
+};
+
+enum ib_cq_notify_flags {
+	IB_CQ_SOLICITED			= 1 << 0,
+	IB_CQ_NEXT_COMP			= 1 << 1,
+	IB_CQ_SOLICITED_MASK		= IB_CQ_SOLICITED | IB_CQ_NEXT_COMP,
+	IB_CQ_REPORT_MISSED_EVENTS	= 1 << 2,
+};
+
+enum ib_srq_type {
+	IB_SRQT_BASIC,
+	IB_SRQT_XRC
+};
+
+enum ib_srq_attr_mask {
+	IB_SRQ_MAX_WR	= 1 << 0,
+	IB_SRQ_LIMIT	= 1 << 1,
+};
+
+struct ib_srq_attr {
+	u32	max_wr;
+	u32	max_sge;
+	u32	srq_limit;
+};
+
+struct ib_srq_init_attr {
+	void		      (*event_handler)(struct ib_event *, void *);
+	void		       *srq_context;
+	struct ib_srq_attr	attr;
+	enum ib_srq_type	srq_type;
+
+	union {
+		struct {
+			struct ib_xrcd *xrcd;
+			struct ib_cq   *cq;
+		} xrc;
+	} ext;
+};
+
+struct ib_qp_cap {
+	u32	max_send_wr;
+	u32	max_recv_wr;
+	u32	max_send_sge;
+	u32	max_recv_sge;
+	u32	max_inline_data;
+};
+
+enum ib_sig_type {
+	IB_SIGNAL_ALL_WR,
+	IB_SIGNAL_REQ_WR
+};
+
+enum ib_qp_type {
+	/*
+	 * IB_QPT_SMI and IB_QPT_GSI have to be the first two entries
+	 * here (and in that order) since the MAD layer uses them as
+	 * indices into a 2-entry table.
+	 */
+	IB_QPT_SMI,
+	IB_QPT_GSI,
+
+	IB_QPT_RC,
+	IB_QPT_UC,
+	IB_QPT_UD,
+	IB_QPT_RAW_IPV6,
+	IB_QPT_RAW_ETHERTYPE,
+	IB_QPT_RAW_PACKET = 8,
+	IB_QPT_XRC_INI = 9,
+	IB_QPT_XRC_TGT,
+	IB_QPT_MAX,
+	/* Reserve a range for qp types internal to the low level driver.
+	 * These qp types will not be visible at the IB core layer, so the
+	 * IB_QPT_MAX usages should not be affected in the core layer
+	 */
+	IB_QPT_RESERVED1 = 0x1000,
+	IB_QPT_RESERVED2,
+	IB_QPT_RESERVED3,
+	IB_QPT_RESERVED4,
+	IB_QPT_RESERVED5,
+	IB_QPT_RESERVED6,
+	IB_QPT_RESERVED7,
+	IB_QPT_RESERVED8,
+	IB_QPT_RESERVED9,
+	IB_QPT_RESERVED10,
+};
+
+enum ib_qp_create_flags {
+	IB_QP_CREATE_IPOIB_UD_LSO		= 1 << 0,
+	IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK	= 1 << 1,
+	IB_QP_CREATE_NETIF_QP			= 1 << 5,
+	IB_QP_CREATE_SIGNATURE_EN		= 1 << 6,
+	IB_QP_CREATE_USE_GFP_NOIO		= 1 << 7,
+	/* reserve bits 26-31 for low level drivers' internal use */
+	IB_QP_CREATE_RESERVED_START		= 1 << 26,
+	IB_QP_CREATE_RESERVED_END		= 1 << 31,
+};
+
+
+/*
+ * Note: users may not call ib_close_qp or ib_destroy_qp from the event_handler
+ * callback to destroy the passed in QP.
+ */
+
+struct ib_qp_init_attr {
+	void                  (*event_handler)(struct ib_event *, void *);
+	void		       *qp_context;
+	struct ib_cq	       *send_cq;
+	struct ib_cq	       *recv_cq;
+	struct ib_srq	       *srq;
+	struct ib_xrcd	       *xrcd;     /* XRC TGT QPs only */
+	struct ib_qp_cap	cap;
+	enum ib_sig_type	sq_sig_type;
+	enum ib_qp_type		qp_type;
+	enum ib_qp_create_flags	create_flags;
+	u8			port_num; /* special QP types only */
+};
+
+struct ib_qp_open_attr {
+	void                  (*event_handler)(struct ib_event *, void *);
+	void		       *qp_context;
+	u32			qp_num;
+	enum ib_qp_type		qp_type;
+};
+
+enum ib_rnr_timeout {
+	IB_RNR_TIMER_655_36 =  0,
+	IB_RNR_TIMER_000_01 =  1,
+	IB_RNR_TIMER_000_02 =  2,
+	IB_RNR_TIMER_000_03 =  3,
+	IB_RNR_TIMER_000_04 =  4,
+	IB_RNR_TIMER_000_06 =  5,
+	IB_RNR_TIMER_000_08 =  6,
+	IB_RNR_TIMER_000_12 =  7,
+	IB_RNR_TIMER_000_16 =  8,
+	IB_RNR_TIMER_000_24 =  9,
+	IB_RNR_TIMER_000_32 = 10,
+	IB_RNR_TIMER_000_48 = 11,
+	IB_RNR_TIMER_000_64 = 12,
+	IB_RNR_TIMER_000_96 = 13,
+	IB_RNR_TIMER_001_28 = 14,
+	IB_RNR_TIMER_001_92 = 15,
+	IB_RNR_TIMER_002_56 = 16,
+	IB_RNR_TIMER_003_84 = 17,
+	IB_RNR_TIMER_005_12 = 18,
+	IB_RNR_TIMER_007_68 = 19,
+	IB_RNR_TIMER_010_24 = 20,
+	IB_RNR_TIMER_015_36 = 21,
+	IB_RNR_TIMER_020_48 = 22,
+	IB_RNR_TIMER_030_72 = 23,
+	IB_RNR_TIMER_040_96 = 24,
+	IB_RNR_TIMER_061_44 = 25,
+	IB_RNR_TIMER_081_92 = 26,
+	IB_RNR_TIMER_122_88 = 27,
+	IB_RNR_TIMER_163_84 = 28,
+	IB_RNR_TIMER_245_76 = 29,
+	IB_RNR_TIMER_327_68 = 30,
+	IB_RNR_TIMER_491_52 = 31
+};
+
+enum ib_qp_attr_mask {
+	IB_QP_STATE			= 1,
+	IB_QP_CUR_STATE			= (1<<1),
+	IB_QP_EN_SQD_ASYNC_NOTIFY	= (1<<2),
+	IB_QP_ACCESS_FLAGS		= (1<<3),
+	IB_QP_PKEY_INDEX		= (1<<4),
+	IB_QP_PORT			= (1<<5),
+	IB_QP_QKEY			= (1<<6),
+	IB_QP_AV			= (1<<7),
+	IB_QP_PATH_MTU			= (1<<8),
+	IB_QP_TIMEOUT			= (1<<9),
+	IB_QP_RETRY_CNT			= (1<<10),
+	IB_QP_RNR_RETRY			= (1<<11),
+	IB_QP_RQ_PSN			= (1<<12),
+	IB_QP_MAX_QP_RD_ATOMIC		= (1<<13),
+	IB_QP_ALT_PATH			= (1<<14),
+	IB_QP_MIN_RNR_TIMER		= (1<<15),
+	IB_QP_SQ_PSN			= (1<<16),
+	IB_QP_MAX_DEST_RD_ATOMIC	= (1<<17),
+	IB_QP_PATH_MIG_STATE		= (1<<18),
+	IB_QP_CAP			= (1<<19),
+	IB_QP_DEST_QPN			= (1<<20),
+	IB_QP_SMAC			= (1<<21),
+	IB_QP_ALT_SMAC			= (1<<22),
+	IB_QP_VID			= (1<<23),
+	IB_QP_ALT_VID			= (1<<24),
+};
+
+enum ib_qp_state {
+	IB_QPS_RESET,
+	IB_QPS_INIT,
+	IB_QPS_RTR,
+	IB_QPS_RTS,
+	IB_QPS_SQD,
+	IB_QPS_SQE,
+	IB_QPS_ERR
+};
+
+enum ib_mig_state {
+	IB_MIG_MIGRATED,
+	IB_MIG_REARM,
+	IB_MIG_ARMED
+};
+
+enum ib_mw_type {
+	IB_MW_TYPE_1 = 1,
+	IB_MW_TYPE_2 = 2
+};
+
+struct ib_qp_attr {
+	enum ib_qp_state	qp_state;
+	enum ib_qp_state	cur_qp_state;
+	enum ib_mtu		path_mtu;
+	enum ib_mig_state	path_mig_state;
+	u32			qkey;
+	u32			rq_psn;
+	u32			sq_psn;
+	u32			dest_qp_num;
+	int			qp_access_flags;
+	struct ib_qp_cap	cap;
+	struct ib_ah_attr	ah_attr;
+	struct ib_ah_attr	alt_ah_attr;
+	u16			pkey_index;
+	u16			alt_pkey_index;
+	u8			en_sqd_async_notify;
+	u8			sq_draining;
+	u8			max_rd_atomic;
+	u8			max_dest_rd_atomic;
+	u8			min_rnr_timer;
+	u8			port_num;
+	u8			timeout;
+	u8			retry_cnt;
+	u8			rnr_retry;
+	u8			alt_port_num;
+	u8			alt_timeout;
+	u8			smac[ETH_ALEN];
+	u8			alt_smac[ETH_ALEN];
+	u16			vlan_id;
+	u16			alt_vlan_id;
+};
+
+enum ib_wr_opcode {
+	IB_WR_RDMA_WRITE,
+	IB_WR_RDMA_WRITE_WITH_IMM,
+	IB_WR_SEND,
+	IB_WR_SEND_WITH_IMM,
+	IB_WR_RDMA_READ,
+	IB_WR_ATOMIC_CMP_AND_SWP,
+	IB_WR_ATOMIC_FETCH_AND_ADD,
+	IB_WR_LSO,
+	IB_WR_SEND_WITH_INV,
+	IB_WR_RDMA_READ_WITH_INV,
+	IB_WR_LOCAL_INV,
+	IB_WR_FAST_REG_MR,
+	IB_WR_MASKED_ATOMIC_CMP_AND_SWP,
+	IB_WR_MASKED_ATOMIC_FETCH_AND_ADD,
+	IB_WR_BIND_MW,
+	IB_WR_REG_SIG_MR,
+	/* reserve values for low level drivers' internal use.
+	 * These values will not be used at all in the ib core layer.
+	 */
+	IB_WR_RESERVED1 = 0xf0,
+	IB_WR_RESERVED2,
+	IB_WR_RESERVED3,
+	IB_WR_RESERVED4,
+	IB_WR_RESERVED5,
+	IB_WR_RESERVED6,
+	IB_WR_RESERVED7,
+	IB_WR_RESERVED8,
+	IB_WR_RESERVED9,
+	IB_WR_RESERVED10,
+};
+
+enum ib_send_flags {
+	IB_SEND_FENCE		= 1,
+	IB_SEND_SIGNALED	= (1<<1),
+	IB_SEND_SOLICITED	= (1<<2),
+	IB_SEND_INLINE		= (1<<3),
+	IB_SEND_IP_CSUM		= (1<<4),
+
+	/* reserve bits 26-31 for low level drivers' internal use */
+	IB_SEND_RESERVED_START	= (1 << 26),
+	IB_SEND_RESERVED_END	= (1 << 31),
+};
+
+struct ib_sge {
+	u64	addr;
+	u32	length;
+	u32	lkey;
+};
+
+struct ib_fast_reg_page_list {
+	struct ib_device       *device;
+	u64		       *page_list;
+	unsigned int		max_page_list_len;
+};
+
+/**
+ * struct ib_mw_bind_info - Parameters for a memory window bind operation.
+ * @mr: A memory region to bind the memory window to.
+ * @addr: The address where the memory window should begin.
+ * @length: The length of the memory window, in bytes.
+ * @mw_access_flags: Access flags from enum ib_access_flags for the window.
+ *
+ * This struct contains the shared parameters for type 1 and type 2
+ * memory window bind operations.
+ */
+struct ib_mw_bind_info {
+	struct ib_mr   *mr;
+	u64		addr;
+	u64		length;
+	int		mw_access_flags;
+};
+
+struct ib_send_wr {
+	struct ib_send_wr      *next;
+	u64			wr_id;
+	struct ib_sge	       *sg_list;
+	int			num_sge;
+	enum ib_wr_opcode	opcode;
+	int			send_flags;
+	union {
+		__be32		imm_data;
+		u32		invalidate_rkey;
+	} ex;
+	union {
+		struct {
+			u64	remote_addr;
+			u32	rkey;
+		} rdma;
+		struct {
+			u64	remote_addr;
+			u64	compare_add;
+			u64	swap;
+			u64	compare_add_mask;
+			u64	swap_mask;
+			u32	rkey;
+		} atomic;
+		struct {
+			struct ib_ah *ah;
+			void   *header;
+			int     hlen;
+			int     mss;
+			u32	remote_qpn;
+			u32	remote_qkey;
+			u16	pkey_index; /* valid for GSI only */
+			u8	port_num;   /* valid for DR SMPs on switch only */
+		} ud;
+		struct {
+			u64				iova_start;
+			struct ib_fast_reg_page_list   *page_list;
+			unsigned int			page_shift;
+			unsigned int			page_list_len;
+			u32				length;
+			int				access_flags;
+			u32				rkey;
+		} fast_reg;
+		struct {
+			struct ib_mw            *mw;
+			/* The new rkey for the memory window. */
+			u32                      rkey;
+			struct ib_mw_bind_info   bind_info;
+		} bind_mw;
+		struct {
+			struct ib_sig_attrs    *sig_attrs;
+			struct ib_mr	       *sig_mr;
+			int			access_flags;
+			struct ib_sge	       *prot;
+		} sig_handover;
+	} wr;
+	u32			xrc_remote_srq_num;	/* XRC TGT QPs only */
+};
+
+struct ib_recv_wr {
+	struct ib_recv_wr      *next;
+	u64			wr_id;
+	struct ib_sge	       *sg_list;
+	int			num_sge;
+};
+
+enum ib_access_flags {
+	IB_ACCESS_LOCAL_WRITE	= 1,
+	IB_ACCESS_REMOTE_WRITE	= (1<<1),
+	IB_ACCESS_REMOTE_READ	= (1<<2),
+	IB_ACCESS_REMOTE_ATOMIC	= (1<<3),
+	IB_ACCESS_MW_BIND	= (1<<4),
+	IB_ZERO_BASED		= (1<<5)
+};
+
+struct ib_phys_buf {
+	u64      addr;
+	u64      size;
+};
+
+struct ib_mr_attr {
+	struct ib_pd	*pd;
+	u64		device_virt_addr;
+	u64		size;
+	int		mr_access_flags;
+	u32		lkey;
+	u32		rkey;
+};
+
+enum ib_mr_rereg_flags {
+	IB_MR_REREG_TRANS	= 1,
+	IB_MR_REREG_PD		= (1<<1),
+	IB_MR_REREG_ACCESS	= (1<<2),
+	IB_MR_REREG_SUPPORTED	= ((IB_MR_REREG_ACCESS << 1) - 1)
+};
+
+/**
+ * struct ib_mw_bind - Parameters for a type 1 memory window bind operation.
+ * @wr_id:      Work request id.
+ * @send_flags: Flags from ib_send_flags enum.
+ * @bind_info:  More parameters of the bind operation.
+ */
+struct ib_mw_bind {
+	u64                    wr_id;
+	int                    send_flags;
+	struct ib_mw_bind_info bind_info;
+};
+
+struct ib_fmr_attr {
+	int	max_pages;
+	int	max_maps;
+	u8	page_shift;
+};
+
+struct ib_ucontext {
+	struct ib_device       *device;
+	struct list_head	pd_list;
+	struct list_head	mr_list;
+	struct list_head	mw_list;
+	struct list_head	cq_list;
+	struct list_head	qp_list;
+	struct list_head	srq_list;
+	struct list_head	ah_list;
+	struct list_head	xrcd_list;
+	struct list_head	rule_list;
+	int			closing;
+};
+
+struct ib_uobject {
+	u64			user_handle;	/* handle given to us by userspace */
+	struct ib_ucontext     *context;	/* associated user context */
+	void		       *object;		/* containing object */
+	struct list_head	list;		/* link to context's list */
+	int			id;		/* index into kernel idr */
+	struct kref		ref;
+	struct rw_semaphore	mutex;		/* protects .live */
+	int			live;
+};
+
+struct ib_udata {
+	const void __user *inbuf;
+	void __user *outbuf;
+	size_t       inlen;
+	size_t       outlen;
+};
+
+struct ib_pd {
+	struct ib_device       *device;
+	struct ib_uobject      *uobject;
+	atomic_t          	usecnt; /* count all resources */
+};
+
+struct ib_xrcd {
+	struct ib_device       *device;
+	atomic_t		usecnt; /* count all exposed resources */
+	struct inode	       *inode;
+
+	struct mutex		tgt_qp_mutex;
+	struct list_head	tgt_qp_list;
+};
+
+struct ib_ah {
+	struct ib_device	*device;
+	struct ib_pd		*pd;
+	struct ib_uobject	*uobject;
+};
+
+typedef void (*ib_comp_handler)(struct ib_cq *cq, void *cq_context);
+
+struct ib_cq {
+	struct ib_device       *device;
+	struct ib_uobject      *uobject;
+	ib_comp_handler   	comp_handler;
+	void                  (*event_handler)(struct ib_event *, void *);
+	void                   *cq_context;
+	int               	cqe;
+	atomic_t          	usecnt; /* count number of work queues */
+};
+
+struct ib_srq {
+	struct ib_device       *device;
+	struct ib_pd	       *pd;
+	struct ib_uobject      *uobject;
+	void		      (*event_handler)(struct ib_event *, void *);
+	void		       *srq_context;
+	enum ib_srq_type	srq_type;
+	atomic_t		usecnt;
+
+	union {
+		struct {
+			struct ib_xrcd *xrcd;
+			struct ib_cq   *cq;
+			u32		srq_num;
+		} xrc;
+	} ext;
+};
+
+struct ib_qp {
+	struct ib_device       *device;
+	struct ib_pd	       *pd;
+	struct ib_cq	       *send_cq;
+	struct ib_cq	       *recv_cq;
+	struct ib_srq	       *srq;
+	struct ib_xrcd	       *xrcd; /* XRC TGT QPs only */
+	struct list_head	xrcd_list;
+	/* count times opened, mcast attaches, flow attaches */
+	atomic_t		usecnt;
+	struct list_head	open_list;
+	struct ib_qp           *real_qp;
+	struct ib_uobject      *uobject;
+	void                  (*event_handler)(struct ib_event *, void *);
+	void		       *qp_context;
+	u32			qp_num;
+	enum ib_qp_type		qp_type;
+};
+
+struct ib_mr {
+	struct ib_device  *device;
+	struct ib_pd	  *pd;
+	struct ib_uobject *uobject;
+	u32		   lkey;
+	u32		   rkey;
+	atomic_t	   usecnt; /* count number of MWs */
+};
+
+struct ib_mw {
+	struct ib_device	*device;
+	struct ib_pd		*pd;
+	struct ib_uobject	*uobject;
+	u32			rkey;
+	enum ib_mw_type         type;
+};
+
+struct ib_fmr {
+	struct ib_device	*device;
+	struct ib_pd		*pd;
+	struct list_head	list;
+	u32			lkey;
+	u32			rkey;
+};
+
+/* Supported steering options */
+enum ib_flow_attr_type {
+	/* steering according to rule specifications */
+	IB_FLOW_ATTR_NORMAL		= 0x0,
+	/* default unicast and multicast rule -
+	 * receive all Eth traffic which isn't steered to any QP
+	 */
+	IB_FLOW_ATTR_ALL_DEFAULT	= 0x1,
+	/* default multicast rule -
+	 * receive all Eth multicast traffic which isn't steered to any QP
+	 */
+	IB_FLOW_ATTR_MC_DEFAULT		= 0x2,
+	/* sniffer rule - receive all port traffic */
+	IB_FLOW_ATTR_SNIFFER		= 0x3
+};
+
+/* Supported steering header types */
+enum ib_flow_spec_type {
+	/* L2 headers*/
+	IB_FLOW_SPEC_ETH	= 0x20,
+	IB_FLOW_SPEC_IB		= 0x22,
+	/* L3 header*/
+	IB_FLOW_SPEC_IPV4	= 0x30,
+	/* L4 headers*/
+	IB_FLOW_SPEC_TCP	= 0x40,
+	IB_FLOW_SPEC_UDP	= 0x41
+};
+#define IB_FLOW_SPEC_LAYER_MASK	0xF0
+#define IB_FLOW_SPEC_SUPPORT_LAYERS 4
+
+/* Flow steering rule priority is set according to it's domain.
+ * Lower domain value means higher priority.
+ */
+enum ib_flow_domain {
+	IB_FLOW_DOMAIN_USER,
+	IB_FLOW_DOMAIN_ETHTOOL,
+	IB_FLOW_DOMAIN_RFS,
+	IB_FLOW_DOMAIN_NIC,
+	IB_FLOW_DOMAIN_NUM /* Must be last */
+};
+
+struct ib_flow_eth_filter {
+	u8	dst_mac[6];
+	u8	src_mac[6];
+	__be16	ether_type;
+	__be16	vlan_tag;
+};
+
+struct ib_flow_spec_eth {
+	enum ib_flow_spec_type	  type;
+	u16			  size;
+	struct ib_flow_eth_filter val;
+	struct ib_flow_eth_filter mask;
+};
+
+struct ib_flow_ib_filter {
+	__be16 dlid;
+	__u8   sl;
+};
+
+struct ib_flow_spec_ib {
+	enum ib_flow_spec_type	 type;
+	u16			 size;
+	struct ib_flow_ib_filter val;
+	struct ib_flow_ib_filter mask;
+};
+
+struct ib_flow_ipv4_filter {
+	__be32	src_ip;
+	__be32	dst_ip;
+};
+
+struct ib_flow_spec_ipv4 {
+	enum ib_flow_spec_type	   type;
+	u16			   size;
+	struct ib_flow_ipv4_filter val;
+	struct ib_flow_ipv4_filter mask;
+};
+
+struct ib_flow_tcp_udp_filter {
+	__be16	dst_port;
+	__be16	src_port;
+};
+
+struct ib_flow_spec_tcp_udp {
+	enum ib_flow_spec_type	      type;
+	u16			      size;
+	struct ib_flow_tcp_udp_filter val;
+	struct ib_flow_tcp_udp_filter mask;
+};
+
+union ib_flow_spec {
+	struct {
+		enum ib_flow_spec_type	type;
+		u16			size;
+	};
+	struct ib_flow_spec_eth		eth;
+	struct ib_flow_spec_ib		ib;
+	struct ib_flow_spec_ipv4        ipv4;
+	struct ib_flow_spec_tcp_udp	tcp_udp;
+};
+
+struct ib_flow_attr {
+	enum ib_flow_attr_type type;
+	u16	     size;
+	u16	     priority;
+	u32	     flags;
+	u8	     num_of_specs;
+	u8	     port;
+	/* Following are the optional layers according to user request
+	 * struct ib_flow_spec_xxx
+	 * struct ib_flow_spec_yyy
+	 */
+};
+
+struct ib_flow {
+	struct ib_qp		*qp;
+	struct ib_uobject	*uobject;
+};
+
+struct ib_mad;
+struct ib_grh;
+
+enum ib_process_mad_flags {
+	IB_MAD_IGNORE_MKEY	= 1,
+	IB_MAD_IGNORE_BKEY	= 2,
+	IB_MAD_IGNORE_ALL	= IB_MAD_IGNORE_MKEY | IB_MAD_IGNORE_BKEY
+};
+
+enum ib_mad_result {
+	IB_MAD_RESULT_FAILURE  = 0,      /* (!SUCCESS is the important flag) */
+	IB_MAD_RESULT_SUCCESS  = 1 << 0, /* MAD was successfully processed   */
+	IB_MAD_RESULT_REPLY    = 1 << 1, /* Reply packet needs to be sent    */
+	IB_MAD_RESULT_CONSUMED = 1 << 2  /* Packet consumed: stop processing */
+};
+
+#define IB_DEVICE_NAME_MAX 64
+
+struct ib_cache {
+	rwlock_t                lock;
+	struct ib_event_handler event_handler;
+	struct ib_pkey_cache  **pkey_cache;
+	struct ib_gid_cache   **gid_cache;
+	u8                     *lmc_cache;
+};
+
+struct ib_dma_mapping_ops {
+	int		(*mapping_error)(struct ib_device *dev,
+					 u64 dma_addr);
+	u64		(*map_single)(struct ib_device *dev,
+				      void *ptr, size_t size,
+				      enum dma_data_direction direction);
+	void		(*unmap_single)(struct ib_device *dev,
+					u64 addr, size_t size,
+					enum dma_data_direction direction);
+	u64		(*map_page)(struct ib_device *dev,
+				    struct page *page, unsigned long offset,
+				    size_t size,
+				    enum dma_data_direction direction);
+	void		(*unmap_page)(struct ib_device *dev,
+				      u64 addr, size_t size,
+				      enum dma_data_direction direction);
+	int		(*map_sg)(struct ib_device *dev,
+				  struct scatterlist *sg, int nents,
+				  enum dma_data_direction direction);
+	void		(*unmap_sg)(struct ib_device *dev,
+				    struct scatterlist *sg, int nents,
+				    enum dma_data_direction direction);
+	void		(*sync_single_for_cpu)(struct ib_device *dev,
+					       u64 dma_handle,
+					       size_t size,
+					       enum dma_data_direction dir);
+	void		(*sync_single_for_device)(struct ib_device *dev,
+						  u64 dma_handle,
+						  size_t size,
+						  enum dma_data_direction dir);
+	void		*(*alloc_coherent)(struct ib_device *dev,
+					   size_t size,
+					   u64 *dma_handle,
+					   gfp_t flag);
+	void		(*free_coherent)(struct ib_device *dev,
+					 size_t size, void *cpu_addr,
+					 u64 dma_handle);
+};
+
+struct iw_cm_verbs;
+
+struct ib_device {
+	struct device                *dma_device;
+
+	char                          name[IB_DEVICE_NAME_MAX];
+
+	struct list_head              event_handler_list;
+	spinlock_t                    event_handler_lock;
+
+	spinlock_t                    client_data_lock;
+	struct list_head              core_list;
+	struct list_head              client_data_list;
+
+	struct ib_cache               cache;
+	int                          *pkey_tbl_len;
+	int                          *gid_tbl_len;
+
+	int			      num_comp_vectors;
+
+	struct iw_cm_verbs	     *iwcm;
+
+	int		           (*get_protocol_stats)(struct ib_device *device,
+							 union rdma_protocol_stats *stats);
+	int		           (*query_device)(struct ib_device *device,
+						   struct ib_device_attr *device_attr);
+	int		           (*query_port)(struct ib_device *device,
+						 u8 port_num,
+						 struct ib_port_attr *port_attr);
+	enum rdma_link_layer	   (*get_link_layer)(struct ib_device *device,
+						     u8 port_num);
+	int		           (*query_gid)(struct ib_device *device,
+						u8 port_num, int index,
+						union ib_gid *gid);
+	int		           (*query_pkey)(struct ib_device *device,
+						 u8 port_num, u16 index, u16 *pkey);
+	int		           (*modify_device)(struct ib_device *device,
+						    int device_modify_mask,
+						    struct ib_device_modify *device_modify);
+	int		           (*modify_port)(struct ib_device *device,
+						  u8 port_num, int port_modify_mask,
+						  struct ib_port_modify *port_modify);
+	struct ib_ucontext *       (*alloc_ucontext)(struct ib_device *device,
+						     struct ib_udata *udata);
+	int                        (*dealloc_ucontext)(struct ib_ucontext *context);
+	int                        (*mmap)(struct ib_ucontext *context,
+					   struct vm_area_struct *vma);
+	struct ib_pd *             (*alloc_pd)(struct ib_device *device,
+					       struct ib_ucontext *context,
+					       struct ib_udata *udata);
+	int                        (*dealloc_pd)(struct ib_pd *pd);
+	struct ib_ah *             (*create_ah)(struct ib_pd *pd,
+						struct ib_ah_attr *ah_attr);
+	int                        (*modify_ah)(struct ib_ah *ah,
+						struct ib_ah_attr *ah_attr);
+	int                        (*query_ah)(struct ib_ah *ah,
+					       struct ib_ah_attr *ah_attr);
+	int                        (*destroy_ah)(struct ib_ah *ah);
+	struct ib_srq *            (*create_srq)(struct ib_pd *pd,
+						 struct ib_srq_init_attr *srq_init_attr,
+						 struct ib_udata *udata);
+	int                        (*modify_srq)(struct ib_srq *srq,
+						 struct ib_srq_attr *srq_attr,
+						 enum ib_srq_attr_mask srq_attr_mask,
+						 struct ib_udata *udata);
+	int                        (*query_srq)(struct ib_srq *srq,
+						struct ib_srq_attr *srq_attr);
+	int                        (*destroy_srq)(struct ib_srq *srq);
+	int                        (*post_srq_recv)(struct ib_srq *srq,
+						    struct ib_recv_wr *recv_wr,
+						    struct ib_recv_wr **bad_recv_wr);
+	struct ib_qp *             (*create_qp)(struct ib_pd *pd,
+						struct ib_qp_init_attr *qp_init_attr,
+						struct ib_udata *udata);
+	int                        (*modify_qp)(struct ib_qp *qp,
+						struct ib_qp_attr *qp_attr,
+						int qp_attr_mask,
+						struct ib_udata *udata);
+	int                        (*query_qp)(struct ib_qp *qp,
+					       struct ib_qp_attr *qp_attr,
+					       int qp_attr_mask,
+					       struct ib_qp_init_attr *qp_init_attr);
+	int                        (*destroy_qp)(struct ib_qp *qp);
+	int                        (*post_send)(struct ib_qp *qp,
+						struct ib_send_wr *send_wr,
+						struct ib_send_wr **bad_send_wr);
+	int                        (*post_recv)(struct ib_qp *qp,
+						struct ib_recv_wr *recv_wr,
+						struct ib_recv_wr **bad_recv_wr);
+	struct ib_cq *             (*create_cq)(struct ib_device *device, int cqe,
+						int comp_vector,
+						struct ib_ucontext *context,
+						struct ib_udata *udata);
+	int                        (*modify_cq)(struct ib_cq *cq, u16 cq_count,
+						u16 cq_period);
+	int                        (*destroy_cq)(struct ib_cq *cq);
+	int                        (*resize_cq)(struct ib_cq *cq, int cqe,
+						struct ib_udata *udata);
+	int                        (*poll_cq)(struct ib_cq *cq, int num_entries,
+					      struct ib_wc *wc);
+	int                        (*peek_cq)(struct ib_cq *cq, int wc_cnt);
+	int                        (*req_notify_cq)(struct ib_cq *cq,
+						    enum ib_cq_notify_flags flags);
+	int                        (*req_ncomp_notif)(struct ib_cq *cq,
+						      int wc_cnt);
+	struct ib_mr *             (*get_dma_mr)(struct ib_pd *pd,
+						 int mr_access_flags);
+	struct ib_mr *             (*reg_phys_mr)(struct ib_pd *pd,
+						  struct ib_phys_buf *phys_buf_array,
+						  int num_phys_buf,
+						  int mr_access_flags,
+						  u64 *iova_start);
+	struct ib_mr *             (*reg_user_mr)(struct ib_pd *pd,
+						  u64 start, u64 length,
+						  u64 virt_addr,
+						  int mr_access_flags,
+						  struct ib_udata *udata);
+	int			   (*rereg_user_mr)(struct ib_mr *mr,
+						    int flags,
+						    u64 start, u64 length,
+						    u64 virt_addr,
+						    int mr_access_flags,
+						    struct ib_pd *pd,
+						    struct ib_udata *udata);
+	int                        (*query_mr)(struct ib_mr *mr,
+					       struct ib_mr_attr *mr_attr);
+	int                        (*dereg_mr)(struct ib_mr *mr);
+	int                        (*destroy_mr)(struct ib_mr *mr);
+	struct ib_mr *		   (*create_mr)(struct ib_pd *pd,
+						struct ib_mr_init_attr *mr_init_attr);
+	struct ib_mr *		   (*alloc_fast_reg_mr)(struct ib_pd *pd,
+					       int max_page_list_len);
+	struct ib_fast_reg_page_list * (*alloc_fast_reg_page_list)(struct ib_device *device,
+								   int page_list_len);
+	void			   (*free_fast_reg_page_list)(struct ib_fast_reg_page_list *page_list);
+	int                        (*rereg_phys_mr)(struct ib_mr *mr,
+						    int mr_rereg_mask,
+						    struct ib_pd *pd,
+						    struct ib_phys_buf *phys_buf_array,
+						    int num_phys_buf,
+						    int mr_access_flags,
+						    u64 *iova_start);
+	struct ib_mw *             (*alloc_mw)(struct ib_pd *pd,
+					       enum ib_mw_type type);
+	int                        (*bind_mw)(struct ib_qp *qp,
+					      struct ib_mw *mw,
+					      struct ib_mw_bind *mw_bind);
+	int                        (*dealloc_mw)(struct ib_mw *mw);
+	struct ib_fmr *	           (*alloc_fmr)(struct ib_pd *pd,
+						int mr_access_flags,
+						struct ib_fmr_attr *fmr_attr);
+	int		           (*map_phys_fmr)(struct ib_fmr *fmr,
+						   u64 *page_list, int list_len,
+						   u64 iova);
+	int		           (*unmap_fmr)(struct list_head *fmr_list);
+	int		           (*dealloc_fmr)(struct ib_fmr *fmr);
+	int                        (*attach_mcast)(struct ib_qp *qp,
+						   union ib_gid *gid,
+						   u16 lid);
+	int                        (*detach_mcast)(struct ib_qp *qp,
+						   union ib_gid *gid,
+						   u16 lid);
+	int                        (*process_mad)(struct ib_device *device,
+						  int process_mad_flags,
+						  u8 port_num,
+						  struct ib_wc *in_wc,
+						  struct ib_grh *in_grh,
+						  struct ib_mad *in_mad,
+						  struct ib_mad *out_mad);
+	struct ib_xrcd *	   (*alloc_xrcd)(struct ib_device *device,
+						 struct ib_ucontext *ucontext,
+						 struct ib_udata *udata);
+	int			   (*dealloc_xrcd)(struct ib_xrcd *xrcd);
+	struct ib_flow *	   (*create_flow)(struct ib_qp *qp,
+						  struct ib_flow_attr
+						  *flow_attr,
+						  int domain);
+	int			   (*destroy_flow)(struct ib_flow *flow_id);
+	int			   (*check_mr_status)(struct ib_mr *mr, u32 check_mask,
+						      struct ib_mr_status *mr_status);
+
+	struct ib_dma_mapping_ops   *dma_ops;
+
+	struct module               *owner;
+	struct device                dev;
+	struct kobject               *ports_parent;
+	struct list_head             port_list;
+
+	enum {
+		IB_DEV_UNINITIALIZED,
+		IB_DEV_REGISTERED,
+		IB_DEV_UNREGISTERED
+	}                            reg_state;
+
+	int			     uverbs_abi_ver;
+	u64			     uverbs_cmd_mask;
+	u64			     uverbs_ex_cmd_mask;
+
+	char			     node_desc[64];
+	__be64			     node_guid;
+	u32			     local_dma_lkey;
+	u8                           node_type;
+	u8                           phys_port_cnt;
+};
+
+struct ib_client {
+	char  *name;
+	void (*add)   (struct ib_device *);
+	void (*remove)(struct ib_device *);
+
+	struct list_head list;
+};
+
+struct ib_device *ib_alloc_device(size_t size);
+void ib_dealloc_device(struct ib_device *device);
+
+int ib_register_device(struct ib_device *device,
+		       int (*port_callback)(struct ib_device *,
+					    u8, struct kobject *));
+void ib_unregister_device(struct ib_device *device);
+
+int ib_register_client   (struct ib_client *client);
+void ib_unregister_client(struct ib_client *client);
+
+void *ib_get_client_data(struct ib_device *device, struct ib_client *client);
+void  ib_set_client_data(struct ib_device *device, struct ib_client *client,
+			 void *data);
+
+static inline int ib_copy_from_udata(void *dest, struct ib_udata *udata, size_t len)
+{
+	return copy_from_user(dest, udata->inbuf, len) ? -EFAULT : 0;
+}
+
+static inline int ib_copy_to_udata(struct ib_udata *udata, void *src, size_t len)
+{
+	return copy_to_user(udata->outbuf, src, len) ? -EFAULT : 0;
+}
+
+/**
+ * ib_modify_qp_is_ok - Check that the supplied attribute mask
+ * contains all required attributes and no attributes not allowed for
+ * the given QP state transition.
+ * @cur_state: Current QP state
+ * @next_state: Next QP state
+ * @type: QP type
+ * @mask: Mask of supplied QP attributes
+ * @ll : link layer of port
+ *
+ * This function is a helper function that a low-level driver's
+ * modify_qp method can use to validate the consumer's input.  It
+ * checks that cur_state and next_state are valid QP states, that a
+ * transition from cur_state to next_state is allowed by the IB spec,
+ * and that the attribute mask supplied is allowed for the transition.
+ */
+int ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state,
+		       enum ib_qp_type type, enum ib_qp_attr_mask mask,
+		       enum rdma_link_layer ll);
+
+int ib_register_event_handler  (struct ib_event_handler *event_handler);
+int ib_unregister_event_handler(struct ib_event_handler *event_handler);
+void ib_dispatch_event(struct ib_event *event);
+
+int ib_query_device(struct ib_device *device,
+		    struct ib_device_attr *device_attr);
+
+int ib_query_port(struct ib_device *device,
+		  u8 port_num, struct ib_port_attr *port_attr);
+
+enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device,
+					       u8 port_num);
+
+int ib_query_gid(struct ib_device *device,
+		 u8 port_num, int index, union ib_gid *gid);
+
+int ib_query_pkey(struct ib_device *device,
+		  u8 port_num, u16 index, u16 *pkey);
+
+int ib_modify_device(struct ib_device *device,
+		     int device_modify_mask,
+		     struct ib_device_modify *device_modify);
+
+int ib_modify_port(struct ib_device *device,
+		   u8 port_num, int port_modify_mask,
+		   struct ib_port_modify *port_modify);
+
+int ib_find_gid(struct ib_device *device, union ib_gid *gid,
+		u8 *port_num, u16 *index);
+
+int ib_find_pkey(struct ib_device *device,
+		 u8 port_num, u16 pkey, u16 *index);
+
+/**
+ * ib_alloc_pd - Allocates an unused protection domain.
+ * @device: The device on which to allocate the protection domain.
+ *
+ * A protection domain object provides an association between QPs, shared
+ * receive queues, address handles, memory regions, and memory windows.
+ */
+struct ib_pd *ib_alloc_pd(struct ib_device *device);
+
+/**
+ * ib_dealloc_pd - Deallocates a protection domain.
+ * @pd: The protection domain to deallocate.
+ */
+int ib_dealloc_pd(struct ib_pd *pd);
+
+/**
+ * ib_create_ah - Creates an address handle for the given address vector.
+ * @pd: The protection domain associated with the address handle.
+ * @ah_attr: The attributes of the address vector.
+ *
+ * The address handle is used to reference a local or global destination
+ * in all UD QP post sends.
+ */
+struct ib_ah *ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr);
+
+/**
+ * ib_init_ah_from_wc - Initializes address handle attributes from a
+ *   work completion.
+ * @device: Device on which the received message arrived.
+ * @port_num: Port on which the received message arrived.
+ * @wc: Work completion associated with the received message.
+ * @grh: References the received global route header.  This parameter is
+ *   ignored unless the work completion indicates that the GRH is valid.
+ * @ah_attr: Returned attributes that can be used when creating an address
+ *   handle for replying to the message.
+ */
+int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, struct ib_wc *wc,
+		       struct ib_grh *grh, struct ib_ah_attr *ah_attr);
+
+/**
+ * ib_create_ah_from_wc - Creates an address handle associated with the
+ *   sender of the specified work completion.
+ * @pd: The protection domain associated with the address handle.
+ * @wc: Work completion information associated with a received message.
+ * @grh: References the received global route header.  This parameter is
+ *   ignored unless the work completion indicates that the GRH is valid.
+ * @port_num: The outbound port number to associate with the address.
+ *
+ * The address handle is used to reference a local or global destination
+ * in all UD QP post sends.
+ */
+struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, struct ib_wc *wc,
+				   struct ib_grh *grh, u8 port_num);
+
+/**
+ * ib_modify_ah - Modifies the address vector associated with an address
+ *   handle.
+ * @ah: The address handle to modify.
+ * @ah_attr: The new address vector attributes to associate with the
+ *   address handle.
+ */
+int ib_modify_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr);
+
+/**
+ * ib_query_ah - Queries the address vector associated with an address
+ *   handle.
+ * @ah: The address handle to query.
+ * @ah_attr: The address vector attributes associated with the address
+ *   handle.
+ */
+int ib_query_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr);
+
+/**
+ * ib_destroy_ah - Destroys an address handle.
+ * @ah: The address handle to destroy.
+ */
+int ib_destroy_ah(struct ib_ah *ah);
+
+/**
+ * ib_create_srq - Creates a SRQ associated with the specified protection
+ *   domain.
+ * @pd: The protection domain associated with the SRQ.
+ * @srq_init_attr: A list of initial attributes required to create the
+ *   SRQ.  If SRQ creation succeeds, then the attributes are updated to
+ *   the actual capabilities of the created SRQ.
+ *
+ * srq_attr->max_wr and srq_attr->max_sge are read the determine the
+ * requested size of the SRQ, and set to the actual values allocated
+ * on return.  If ib_create_srq() succeeds, then max_wr and max_sge
+ * will always be at least as large as the requested values.
+ */
+struct ib_srq *ib_create_srq(struct ib_pd *pd,
+			     struct ib_srq_init_attr *srq_init_attr);
+
+/**
+ * ib_modify_srq - Modifies the attributes for the specified SRQ.
+ * @srq: The SRQ to modify.
+ * @srq_attr: On input, specifies the SRQ attributes to modify.  On output,
+ *   the current values of selected SRQ attributes are returned.
+ * @srq_attr_mask: A bit-mask used to specify which attributes of the SRQ
+ *   are being modified.
+ *
+ * The mask may contain IB_SRQ_MAX_WR to resize the SRQ and/or
+ * IB_SRQ_LIMIT to set the SRQ's limit and request notification when
+ * the number of receives queued drops below the limit.
+ */
+int ib_modify_srq(struct ib_srq *srq,
+		  struct ib_srq_attr *srq_attr,
+		  enum ib_srq_attr_mask srq_attr_mask);
+
+/**
+ * ib_query_srq - Returns the attribute list and current values for the
+ *   specified SRQ.
+ * @srq: The SRQ to query.
+ * @srq_attr: The attributes of the specified SRQ.
+ */
+int ib_query_srq(struct ib_srq *srq,
+		 struct ib_srq_attr *srq_attr);
+
+/**
+ * ib_destroy_srq - Destroys the specified SRQ.
+ * @srq: The SRQ to destroy.
+ */
+int ib_destroy_srq(struct ib_srq *srq);
+
+/**
+ * ib_post_srq_recv - Posts a list of work requests to the specified SRQ.
+ * @srq: The SRQ to post the work request on.
+ * @recv_wr: A list of work requests to post on the receive queue.
+ * @bad_recv_wr: On an immediate failure, this parameter will reference
+ *   the work request that failed to be posted on the QP.
+ */
+static inline int ib_post_srq_recv(struct ib_srq *srq,
+				   struct ib_recv_wr *recv_wr,
+				   struct ib_recv_wr **bad_recv_wr)
+{
+	return srq->device->post_srq_recv(srq, recv_wr, bad_recv_wr);
+}
+
+/**
+ * ib_create_qp - Creates a QP associated with the specified protection
+ *   domain.
+ * @pd: The protection domain associated with the QP.
+ * @qp_init_attr: A list of initial attributes required to create the
+ *   QP.  If QP creation succeeds, then the attributes are updated to
+ *   the actual capabilities of the created QP.
+ */
+struct ib_qp *ib_create_qp(struct ib_pd *pd,
+			   struct ib_qp_init_attr *qp_init_attr);
+
+/**
+ * ib_modify_qp - Modifies the attributes for the specified QP and then
+ *   transitions the QP to the given state.
+ * @qp: The QP to modify.
+ * @qp_attr: On input, specifies the QP attributes to modify.  On output,
+ *   the current values of selected QP attributes are returned.
+ * @qp_attr_mask: A bit-mask used to specify which attributes of the QP
+ *   are being modified.
+ */
+int ib_modify_qp(struct ib_qp *qp,
+		 struct ib_qp_attr *qp_attr,
+		 int qp_attr_mask);
+
+/**
+ * ib_query_qp - Returns the attribute list and current values for the
+ *   specified QP.
+ * @qp: The QP to query.
+ * @qp_attr: The attributes of the specified QP.
+ * @qp_attr_mask: A bit-mask used to select specific attributes to query.
+ * @qp_init_attr: Additional attributes of the selected QP.
+ *
+ * The qp_attr_mask may be used to limit the query to gathering only the
+ * selected attributes.
+ */
+int ib_query_qp(struct ib_qp *qp,
+		struct ib_qp_attr *qp_attr,
+		int qp_attr_mask,
+		struct ib_qp_init_attr *qp_init_attr);
+
+/**
+ * ib_destroy_qp - Destroys the specified QP.
+ * @qp: The QP to destroy.
+ */
+int ib_destroy_qp(struct ib_qp *qp);
+
+/**
+ * ib_open_qp - Obtain a reference to an existing sharable QP.
+ * @xrcd - XRC domain
+ * @qp_open_attr: Attributes identifying the QP to open.
+ *
+ * Returns a reference to a sharable QP.
+ */
+struct ib_qp *ib_open_qp(struct ib_xrcd *xrcd,
+			 struct ib_qp_open_attr *qp_open_attr);
+
+/**
+ * ib_close_qp - Release an external reference to a QP.
+ * @qp: The QP handle to release
+ *
+ * The opened QP handle is released by the caller.  The underlying
+ * shared QP is not destroyed until all internal references are released.
+ */
+int ib_close_qp(struct ib_qp *qp);
+
+/**
+ * ib_post_send - Posts a list of work requests to the send queue of
+ *   the specified QP.
+ * @qp: The QP to post the work request on.
+ * @send_wr: A list of work requests to post on the send queue.
+ * @bad_send_wr: On an immediate failure, this parameter will reference
+ *   the work request that failed to be posted on the QP.
+ *
+ * While IBA Vol. 1 section 11.4.1.1 specifies that if an immediate
+ * error is returned, the QP state shall not be affected,
+ * ib_post_send() will return an immediate error after queueing any
+ * earlier work requests in the list.
+ */
+static inline int ib_post_send(struct ib_qp *qp,
+			       struct ib_send_wr *send_wr,
+			       struct ib_send_wr **bad_send_wr)
+{
+	return qp->device->post_send(qp, send_wr, bad_send_wr);
+}
+
+/**
+ * ib_post_recv - Posts a list of work requests to the receive queue of
+ *   the specified QP.
+ * @qp: The QP to post the work request on.
+ * @recv_wr: A list of work requests to post on the receive queue.
+ * @bad_recv_wr: On an immediate failure, this parameter will reference
+ *   the work request that failed to be posted on the QP.
+ */
+static inline int ib_post_recv(struct ib_qp *qp,
+			       struct ib_recv_wr *recv_wr,
+			       struct ib_recv_wr **bad_recv_wr)
+{
+	return qp->device->post_recv(qp, recv_wr, bad_recv_wr);
+}
+
+/**
+ * ib_create_cq - Creates a CQ on the specified device.
+ * @device: The device on which to create the CQ.
+ * @comp_handler: A user-specified callback that is invoked when a
+ *   completion event occurs on the CQ.
+ * @event_handler: A user-specified callback that is invoked when an
+ *   asynchronous event not associated with a completion occurs on the CQ.
+ * @cq_context: Context associated with the CQ returned to the user via
+ *   the associated completion and event handlers.
+ * @cqe: The minimum size of the CQ.
+ * @comp_vector - Completion vector used to signal completion events.
+ *     Must be >= 0 and < context->num_comp_vectors.
+ *
+ * Users can examine the cq structure to determine the actual CQ size.
+ */
+struct ib_cq *ib_create_cq(struct ib_device *device,
+			   ib_comp_handler comp_handler,
+			   void (*event_handler)(struct ib_event *, void *),
+			   void *cq_context, int cqe, int comp_vector);
+
+/**
+ * ib_resize_cq - Modifies the capacity of the CQ.
+ * @cq: The CQ to resize.
+ * @cqe: The minimum size of the CQ.
+ *
+ * Users can examine the cq structure to determine the actual CQ size.
+ */
+int ib_resize_cq(struct ib_cq *cq, int cqe);
+
+/**
+ * ib_modify_cq - Modifies moderation params of the CQ
+ * @cq: The CQ to modify.
+ * @cq_count: number of CQEs that will trigger an event
+ * @cq_period: max period of time in usec before triggering an event
+ *
+ */
+int ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period);
+
+/**
+ * ib_destroy_cq - Destroys the specified CQ.
+ * @cq: The CQ to destroy.
+ */
+int ib_destroy_cq(struct ib_cq *cq);
+
+/**
+ * ib_poll_cq - poll a CQ for completion(s)
+ * @cq:the CQ being polled
+ * @num_entries:maximum number of completions to return
+ * @wc:array of at least @num_entries &struct ib_wc where completions
+ *   will be returned
+ *
+ * Poll a CQ for (possibly multiple) completions.  If the return value
+ * is < 0, an error occurred.  If the return value is >= 0, it is the
+ * number of completions returned.  If the return value is
+ * non-negative and < num_entries, then the CQ was emptied.
+ */
+static inline int ib_poll_cq(struct ib_cq *cq, int num_entries,
+			     struct ib_wc *wc)
+{
+	return cq->device->poll_cq(cq, num_entries, wc);
+}
+
+/**
+ * ib_peek_cq - Returns the number of unreaped completions currently
+ *   on the specified CQ.
+ * @cq: The CQ to peek.
+ * @wc_cnt: A minimum number of unreaped completions to check for.
+ *
+ * If the number of unreaped completions is greater than or equal to wc_cnt,
+ * this function returns wc_cnt, otherwise, it returns the actual number of
+ * unreaped completions.
+ */
+int ib_peek_cq(struct ib_cq *cq, int wc_cnt);
+
+/**
+ * ib_req_notify_cq - Request completion notification on a CQ.
+ * @cq: The CQ to generate an event for.
+ * @flags:
+ *   Must contain exactly one of %IB_CQ_SOLICITED or %IB_CQ_NEXT_COMP
+ *   to request an event on the next solicited event or next work
+ *   completion at any type, respectively. %IB_CQ_REPORT_MISSED_EVENTS
+ *   may also be |ed in to request a hint about missed events, as
+ *   described below.
+ *
+ * Return Value:
+ *    < 0 means an error occurred while requesting notification
+ *   == 0 means notification was requested successfully, and if
+ *        IB_CQ_REPORT_MISSED_EVENTS was passed in, then no events
+ *        were missed and it is safe to wait for another event.  In
+ *        this case is it guaranteed that any work completions added
+ *        to the CQ since the last CQ poll will trigger a completion
+ *        notification event.
+ *    > 0 is only returned if IB_CQ_REPORT_MISSED_EVENTS was passed
+ *        in.  It means that the consumer must poll the CQ again to
+ *        make sure it is empty to avoid missing an event because of a
+ *        race between requesting notification and an entry being
+ *        added to the CQ.  This return value means it is possible
+ *        (but not guaranteed) that a work completion has been added
+ *        to the CQ since the last poll without triggering a
+ *        completion notification event.
+ */
+static inline int ib_req_notify_cq(struct ib_cq *cq,
+				   enum ib_cq_notify_flags flags)
+{
+	return cq->device->req_notify_cq(cq, flags);
+}
+
+/**
+ * ib_req_ncomp_notif - Request completion notification when there are
+ *   at least the specified number of unreaped completions on the CQ.
+ * @cq: The CQ to generate an event for.
+ * @wc_cnt: The number of unreaped completions that should be on the
+ *   CQ before an event is generated.
+ */
+static inline int ib_req_ncomp_notif(struct ib_cq *cq, int wc_cnt)
+{
+	return cq->device->req_ncomp_notif ?
+		cq->device->req_ncomp_notif(cq, wc_cnt) :
+		-ENOSYS;
+}
+
+/**
+ * ib_get_dma_mr - Returns a memory region for system memory that is
+ *   usable for DMA.
+ * @pd: The protection domain associated with the memory region.
+ * @mr_access_flags: Specifies the memory access rights.
+ *
+ * Note that the ib_dma_*() functions defined below must be used
+ * to create/destroy addresses used with the Lkey or Rkey returned
+ * by ib_get_dma_mr().
+ */
+struct ib_mr *ib_get_dma_mr(struct ib_pd *pd, int mr_access_flags);
+
+/**
+ * ib_dma_mapping_error - check a DMA addr for error
+ * @dev: The device for which the dma_addr was created
+ * @dma_addr: The DMA address to check
+ */
+static inline int ib_dma_mapping_error(struct ib_device *dev, u64 dma_addr)
+{
+	if (dev->dma_ops)
+		return dev->dma_ops->mapping_error(dev, dma_addr);
+	return dma_mapping_error(dev->dma_device, dma_addr);
+}
+
+/**
+ * ib_dma_map_single - Map a kernel virtual address to DMA address
+ * @dev: The device for which the dma_addr is to be created
+ * @cpu_addr: The kernel virtual address
+ * @size: The size of the region in bytes
+ * @direction: The direction of the DMA
+ */
+static inline u64 ib_dma_map_single(struct ib_device *dev,
+				    void *cpu_addr, size_t size,
+				    enum dma_data_direction direction)
+{
+	if (dev->dma_ops)
+		return dev->dma_ops->map_single(dev, cpu_addr, size, direction);
+	return dma_map_single(dev->dma_device, cpu_addr, size, direction);
+}
+
+/**
+ * ib_dma_unmap_single - Destroy a mapping created by ib_dma_map_single()
+ * @dev: The device for which the DMA address was created
+ * @addr: The DMA address
+ * @size: The size of the region in bytes
+ * @direction: The direction of the DMA
+ */
+static inline void ib_dma_unmap_single(struct ib_device *dev,
+				       u64 addr, size_t size,
+				       enum dma_data_direction direction)
+{
+	if (dev->dma_ops)
+		dev->dma_ops->unmap_single(dev, addr, size, direction);
+	else
+		dma_unmap_single(dev->dma_device, addr, size, direction);
+}
+
+static inline u64 ib_dma_map_single_attrs(struct ib_device *dev,
+					  void *cpu_addr, size_t size,
+					  enum dma_data_direction direction,
+					  struct dma_attrs *attrs)
+{
+	return dma_map_single_attrs(dev->dma_device, cpu_addr, size,
+				    direction, attrs);
+}
+
+static inline void ib_dma_unmap_single_attrs(struct ib_device *dev,
+					     u64 addr, size_t size,
+					     enum dma_data_direction direction,
+					     struct dma_attrs *attrs)
+{
+	return dma_unmap_single_attrs(dev->dma_device, addr, size,
+				      direction, attrs);
+}
+
+/**
+ * ib_dma_map_page - Map a physical page to DMA address
+ * @dev: The device for which the dma_addr is to be created
+ * @page: The page to be mapped
+ * @offset: The offset within the page
+ * @size: The size of the region in bytes
+ * @direction: The direction of the DMA
+ */
+static inline u64 ib_dma_map_page(struct ib_device *dev,
+				  struct page *page,
+				  unsigned long offset,
+				  size_t size,
+					 enum dma_data_direction direction)
+{
+	if (dev->dma_ops)
+		return dev->dma_ops->map_page(dev, page, offset, size, direction);
+	return dma_map_page(dev->dma_device, page, offset, size, direction);
+}
+
+/**
+ * ib_dma_unmap_page - Destroy a mapping created by ib_dma_map_page()
+ * @dev: The device for which the DMA address was created
+ * @addr: The DMA address
+ * @size: The size of the region in bytes
+ * @direction: The direction of the DMA
+ */
+static inline void ib_dma_unmap_page(struct ib_device *dev,
+				     u64 addr, size_t size,
+				     enum dma_data_direction direction)
+{
+	if (dev->dma_ops)
+		dev->dma_ops->unmap_page(dev, addr, size, direction);
+	else
+		dma_unmap_page(dev->dma_device, addr, size, direction);
+}
+
+/**
+ * ib_dma_map_sg - Map a scatter/gather list to DMA addresses
+ * @dev: The device for which the DMA addresses are to be created
+ * @sg: The array of scatter/gather entries
+ * @nents: The number of scatter/gather entries
+ * @direction: The direction of the DMA
+ */
+static inline int ib_dma_map_sg(struct ib_device *dev,
+				struct scatterlist *sg, int nents,
+				enum dma_data_direction direction)
+{
+	if (dev->dma_ops)
+		return dev->dma_ops->map_sg(dev, sg, nents, direction);
+	return dma_map_sg(dev->dma_device, sg, nents, direction);
+}
+
+/**
+ * ib_dma_unmap_sg - Unmap a scatter/gather list of DMA addresses
+ * @dev: The device for which the DMA addresses were created
+ * @sg: The array of scatter/gather entries
+ * @nents: The number of scatter/gather entries
+ * @direction: The direction of the DMA
+ */
+static inline void ib_dma_unmap_sg(struct ib_device *dev,
+				   struct scatterlist *sg, int nents,
+				   enum dma_data_direction direction)
+{
+	if (dev->dma_ops)
+		dev->dma_ops->unmap_sg(dev, sg, nents, direction);
+	else
+		dma_unmap_sg(dev->dma_device, sg, nents, direction);
+}
+
+static inline int ib_dma_map_sg_attrs(struct ib_device *dev,
+				      struct scatterlist *sg, int nents,
+				      enum dma_data_direction direction,
+				      struct dma_attrs *attrs)
+{
+	return dma_map_sg_attrs(dev->dma_device, sg, nents, direction, attrs);
+}
+
+static inline void ib_dma_unmap_sg_attrs(struct ib_device *dev,
+					 struct scatterlist *sg, int nents,
+					 enum dma_data_direction direction,
+					 struct dma_attrs *attrs)
+{
+	dma_unmap_sg_attrs(dev->dma_device, sg, nents, direction, attrs);
+}
+/**
+ * ib_sg_dma_address - Return the DMA address from a scatter/gather entry
+ * @dev: The device for which the DMA addresses were created
+ * @sg: The scatter/gather entry
+ *
+ * Note: this function is obsolete. To do: change all occurrences of
+ * ib_sg_dma_address() into sg_dma_address().
+ */
+static inline u64 ib_sg_dma_address(struct ib_device *dev,
+				    struct scatterlist *sg)
+{
+	return sg_dma_address(sg);
+}
+
+/**
+ * ib_sg_dma_len - Return the DMA length from a scatter/gather entry
+ * @dev: The device for which the DMA addresses were created
+ * @sg: The scatter/gather entry
+ *
+ * Note: this function is obsolete. To do: change all occurrences of
+ * ib_sg_dma_len() into sg_dma_len().
+ */
+static inline unsigned int ib_sg_dma_len(struct ib_device *dev,
+					 struct scatterlist *sg)
+{
+	return sg_dma_len(sg);
+}
+
+/**
+ * ib_dma_sync_single_for_cpu - Prepare DMA region to be accessed by CPU
+ * @dev: The device for which the DMA address was created
+ * @addr: The DMA address
+ * @size: The size of the region in bytes
+ * @dir: The direction of the DMA
+ */
+static inline void ib_dma_sync_single_for_cpu(struct ib_device *dev,
+					      u64 addr,
+					      size_t size,
+					      enum dma_data_direction dir)
+{
+	if (dev->dma_ops)
+		dev->dma_ops->sync_single_for_cpu(dev, addr, size, dir);
+	else
+		dma_sync_single_for_cpu(dev->dma_device, addr, size, dir);
+}
+
+/**
+ * ib_dma_sync_single_for_device - Prepare DMA region to be accessed by device
+ * @dev: The device for which the DMA address was created
+ * @addr: The DMA address
+ * @size: The size of the region in bytes
+ * @dir: The direction of the DMA
+ */
+static inline void ib_dma_sync_single_for_device(struct ib_device *dev,
+						 u64 addr,
+						 size_t size,
+						 enum dma_data_direction dir)
+{
+	if (dev->dma_ops)
+		dev->dma_ops->sync_single_for_device(dev, addr, size, dir);
+	else
+		dma_sync_single_for_device(dev->dma_device, addr, size, dir);
+}
+
+/**
+ * ib_dma_alloc_coherent - Allocate memory and map it for DMA
+ * @dev: The device for which the DMA address is requested
+ * @size: The size of the region to allocate in bytes
+ * @dma_handle: A pointer for returning the DMA address of the region
+ * @flag: memory allocator flags
+ */
+static inline void *ib_dma_alloc_coherent(struct ib_device *dev,
+					   size_t size,
+					   u64 *dma_handle,
+					   gfp_t flag)
+{
+	if (dev->dma_ops)
+		return dev->dma_ops->alloc_coherent(dev, size, dma_handle, flag);
+	else {
+		dma_addr_t handle;
+		void *ret;
+
+		ret = dma_alloc_coherent(dev->dma_device, size, &handle, flag);
+		*dma_handle = handle;
+		return ret;
+	}
+}
+
+/**
+ * ib_dma_free_coherent - Free memory allocated by ib_dma_alloc_coherent()
+ * @dev: The device for which the DMA addresses were allocated
+ * @size: The size of the region
+ * @cpu_addr: the address returned by ib_dma_alloc_coherent()
+ * @dma_handle: the DMA address returned by ib_dma_alloc_coherent()
+ */
+static inline void ib_dma_free_coherent(struct ib_device *dev,
+					size_t size, void *cpu_addr,
+					u64 dma_handle)
+{
+	if (dev->dma_ops)
+		dev->dma_ops->free_coherent(dev, size, cpu_addr, dma_handle);
+	else
+		dma_free_coherent(dev->dma_device, size, cpu_addr, dma_handle);
+}
+
+/**
+ * ib_reg_phys_mr - Prepares a virtually addressed memory region for use
+ *   by an HCA.
+ * @pd: The protection domain associated assigned to the registered region.
+ * @phys_buf_array: Specifies a list of physical buffers to use in the
+ *   memory region.
+ * @num_phys_buf: Specifies the size of the phys_buf_array.
+ * @mr_access_flags: Specifies the memory access rights.
+ * @iova_start: The offset of the region's starting I/O virtual address.
+ */
+struct ib_mr *ib_reg_phys_mr(struct ib_pd *pd,
+			     struct ib_phys_buf *phys_buf_array,
+			     int num_phys_buf,
+			     int mr_access_flags,
+			     u64 *iova_start);
+
+/**
+ * ib_rereg_phys_mr - Modifies the attributes of an existing memory region.
+ *   Conceptually, this call performs the functions deregister memory region
+ *   followed by register physical memory region.  Where possible,
+ *   resources are reused instead of deallocated and reallocated.
+ * @mr: The memory region to modify.
+ * @mr_rereg_mask: A bit-mask used to indicate which of the following
+ *   properties of the memory region are being modified.
+ * @pd: If %IB_MR_REREG_PD is set in mr_rereg_mask, this field specifies
+ *   the new protection domain to associated with the memory region,
+ *   otherwise, this parameter is ignored.
+ * @phys_buf_array: If %IB_MR_REREG_TRANS is set in mr_rereg_mask, this
+ *   field specifies a list of physical buffers to use in the new
+ *   translation, otherwise, this parameter is ignored.
+ * @num_phys_buf: If %IB_MR_REREG_TRANS is set in mr_rereg_mask, this
+ *   field specifies the size of the phys_buf_array, otherwise, this
+ *   parameter is ignored.
+ * @mr_access_flags: If %IB_MR_REREG_ACCESS is set in mr_rereg_mask, this
+ *   field specifies the new memory access rights, otherwise, this
+ *   parameter is ignored.
+ * @iova_start: The offset of the region's starting I/O virtual address.
+ */
+int ib_rereg_phys_mr(struct ib_mr *mr,
+		     int mr_rereg_mask,
+		     struct ib_pd *pd,
+		     struct ib_phys_buf *phys_buf_array,
+		     int num_phys_buf,
+		     int mr_access_flags,
+		     u64 *iova_start);
+
+/**
+ * ib_query_mr - Retrieves information about a specific memory region.
+ * @mr: The memory region to retrieve information about.
+ * @mr_attr: The attributes of the specified memory region.
+ */
+int ib_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr);
+
+/**
+ * ib_dereg_mr - Deregisters a memory region and removes it from the
+ *   HCA translation table.
+ * @mr: The memory region to deregister.
+ *
+ * This function can fail, if the memory region has memory windows bound to it.
+ */
+int ib_dereg_mr(struct ib_mr *mr);
+
+
+/**
+ * ib_create_mr - Allocates a memory region that may be used for
+ *     signature handover operations.
+ * @pd: The protection domain associated with the region.
+ * @mr_init_attr: memory region init attributes.
+ */
+struct ib_mr *ib_create_mr(struct ib_pd *pd,
+			   struct ib_mr_init_attr *mr_init_attr);
+
+/**
+ * ib_destroy_mr - Destroys a memory region that was created using
+ *     ib_create_mr and removes it from HW translation tables.
+ * @mr: The memory region to destroy.
+ *
+ * This function can fail, if the memory region has memory windows bound to it.
+ */
+int ib_destroy_mr(struct ib_mr *mr);
+
+/**
+ * ib_alloc_fast_reg_mr - Allocates memory region usable with the
+ *   IB_WR_FAST_REG_MR send work request.
+ * @pd: The protection domain associated with the region.
+ * @max_page_list_len: requested max physical buffer list length to be
+ *   used with fast register work requests for this MR.
+ */
+struct ib_mr *ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len);
+
+/**
+ * ib_alloc_fast_reg_page_list - Allocates a page list array
+ * @device - ib device pointer.
+ * @page_list_len - size of the page list array to be allocated.
+ *
+ * This allocates and returns a struct ib_fast_reg_page_list * and a
+ * page_list array that is at least page_list_len in size.  The actual
+ * size is returned in max_page_list_len.  The caller is responsible
+ * for initializing the contents of the page_list array before posting
+ * a send work request with the IB_WC_FAST_REG_MR opcode.
+ *
+ * The page_list array entries must be translated using one of the
+ * ib_dma_*() functions just like the addresses passed to
+ * ib_map_phys_fmr().  Once the ib_post_send() is issued, the struct
+ * ib_fast_reg_page_list must not be modified by the caller until the
+ * IB_WC_FAST_REG_MR work request completes.
+ */
+struct ib_fast_reg_page_list *ib_alloc_fast_reg_page_list(
+				struct ib_device *device, int page_list_len);
+
+/**
+ * ib_free_fast_reg_page_list - Deallocates a previously allocated
+ *   page list array.
+ * @page_list - struct ib_fast_reg_page_list pointer to be deallocated.
+ */
+void ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list);
+
+/**
+ * ib_update_fast_reg_key - updates the key portion of the fast_reg MR
+ *   R_Key and L_Key.
+ * @mr - struct ib_mr pointer to be updated.
+ * @newkey - new key to be used.
+ */
+static inline void ib_update_fast_reg_key(struct ib_mr *mr, u8 newkey)
+{
+	mr->lkey = (mr->lkey & 0xffffff00) | newkey;
+	mr->rkey = (mr->rkey & 0xffffff00) | newkey;
+}
+
+/**
+ * ib_inc_rkey - increments the key portion of the given rkey. Can be used
+ * for calculating a new rkey for type 2 memory windows.
+ * @rkey - the rkey to increment.
+ */
+static inline u32 ib_inc_rkey(u32 rkey)
+{
+	const u32 mask = 0x000000ff;
+	return ((rkey + 1) & mask) | (rkey & ~mask);
+}
+
+/**
+ * ib_alloc_mw - Allocates a memory window.
+ * @pd: The protection domain associated with the memory window.
+ * @type: The type of the memory window (1 or 2).
+ */
+struct ib_mw *ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type);
+
+/**
+ * ib_bind_mw - Posts a work request to the send queue of the specified
+ *   QP, which binds the memory window to the given address range and
+ *   remote access attributes.
+ * @qp: QP to post the bind work request on.
+ * @mw: The memory window to bind.
+ * @mw_bind: Specifies information about the memory window, including
+ *   its address range, remote access rights, and associated memory region.
+ *
+ * If there is no immediate error, the function will update the rkey member
+ * of the mw parameter to its new value. The bind operation can still fail
+ * asynchronously.
+ */
+static inline int ib_bind_mw(struct ib_qp *qp,
+			     struct ib_mw *mw,
+			     struct ib_mw_bind *mw_bind)
+{
+	/* XXX reference counting in corresponding MR? */
+	return mw->device->bind_mw ?
+		mw->device->bind_mw(qp, mw, mw_bind) :
+		-ENOSYS;
+}
+
+/**
+ * ib_dealloc_mw - Deallocates a memory window.
+ * @mw: The memory window to deallocate.
+ */
+int ib_dealloc_mw(struct ib_mw *mw);
+
+/**
+ * ib_alloc_fmr - Allocates a unmapped fast memory region.
+ * @pd: The protection domain associated with the unmapped region.
+ * @mr_access_flags: Specifies the memory access rights.
+ * @fmr_attr: Attributes of the unmapped region.
+ *
+ * A fast memory region must be mapped before it can be used as part of
+ * a work request.
+ */
+struct ib_fmr *ib_alloc_fmr(struct ib_pd *pd,
+			    int mr_access_flags,
+			    struct ib_fmr_attr *fmr_attr);
+
+/**
+ * ib_map_phys_fmr - Maps a list of physical pages to a fast memory region.
+ * @fmr: The fast memory region to associate with the pages.
+ * @page_list: An array of physical pages to map to the fast memory region.
+ * @list_len: The number of pages in page_list.
+ * @iova: The I/O virtual address to use with the mapped region.
+ */
+static inline int ib_map_phys_fmr(struct ib_fmr *fmr,
+				  u64 *page_list, int list_len,
+				  u64 iova)
+{
+	return fmr->device->map_phys_fmr(fmr, page_list, list_len, iova);
+}
+
+/**
+ * ib_unmap_fmr - Removes the mapping from a list of fast memory regions.
+ * @fmr_list: A linked list of fast memory regions to unmap.
+ */
+int ib_unmap_fmr(struct list_head *fmr_list);
+
+/**
+ * ib_dealloc_fmr - Deallocates a fast memory region.
+ * @fmr: The fast memory region to deallocate.
+ */
+int ib_dealloc_fmr(struct ib_fmr *fmr);
+
+/**
+ * ib_attach_mcast - Attaches the specified QP to a multicast group.
+ * @qp: QP to attach to the multicast group.  The QP must be type
+ *   IB_QPT_UD.
+ * @gid: Multicast group GID.
+ * @lid: Multicast group LID in host byte order.
+ *
+ * In order to send and receive multicast packets, subnet
+ * administration must have created the multicast group and configured
+ * the fabric appropriately.  The port associated with the specified
+ * QP must also be a member of the multicast group.
+ */
+int ib_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid);
+
+/**
+ * ib_detach_mcast - Detaches the specified QP from a multicast group.
+ * @qp: QP to detach from the multicast group.
+ * @gid: Multicast group GID.
+ * @lid: Multicast group LID in host byte order.
+ */
+int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid);
+
+/**
+ * ib_alloc_xrcd - Allocates an XRC domain.
+ * @device: The device on which to allocate the XRC domain.
+ */
+struct ib_xrcd *ib_alloc_xrcd(struct ib_device *device);
+
+/**
+ * ib_dealloc_xrcd - Deallocates an XRC domain.
+ * @xrcd: The XRC domain to deallocate.
+ */
+int ib_dealloc_xrcd(struct ib_xrcd *xrcd);
+
+struct ib_flow *ib_create_flow(struct ib_qp *qp,
+			       struct ib_flow_attr *flow_attr, int domain);
+int ib_destroy_flow(struct ib_flow *flow_id);
+
+static inline int ib_check_mr_access(int flags)
+{
+	/*
+	 * Local write permission is required if remote write or
+	 * remote atomic permission is also requested.
+	 */
+	if (flags & (IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_REMOTE_WRITE) &&
+	    !(flags & IB_ACCESS_LOCAL_WRITE))
+		return -EINVAL;
+
+	return 0;
+}
+
+/**
+ * ib_check_mr_status: lightweight check of MR status.
+ *     This routine may provide status checks on a selected
+ *     ib_mr. first use is for signature status check.
+ *
+ * @mr: A memory region.
+ * @check_mask: Bitmask of which checks to perform from
+ *     ib_mr_status_check enumeration.
+ * @mr_status: The container of relevant status checks.
+ *     failed checks will be indicated in the status bitmask
+ *     and the relevant info shall be in the error item.
+ */
+int ib_check_mr_status(struct ib_mr *mr, u32 check_mask,
+		       struct ib_mr_status *mr_status);
+
+#endif /* IB_VERBS_H */
diff --git a/include/rdma/iw_cm.h b/include/rdma/iw_cm.h
new file mode 100644
index 0000000..1017e0b
--- /dev/null
+++ b/include/rdma/iw_cm.h
@@ -0,0 +1,251 @@
+/*
+ * Copyright (c) 2005 Network Appliance, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef IW_CM_H
+#define IW_CM_H
+
+#include <linux/in.h>
+#include <rdma/ib_cm.h>
+
+struct iw_cm_id;
+
+enum iw_cm_event_type {
+	IW_CM_EVENT_CONNECT_REQUEST = 1, /* connect request received */
+	IW_CM_EVENT_CONNECT_REPLY,	 /* reply from active connect request */
+	IW_CM_EVENT_ESTABLISHED,	 /* passive side accept successful */
+	IW_CM_EVENT_DISCONNECT,		 /* orderly shutdown */
+	IW_CM_EVENT_CLOSE		 /* close complete */
+};
+
+struct iw_cm_event {
+	enum iw_cm_event_type event;
+	int			 status;
+	struct sockaddr_storage local_addr;
+	struct sockaddr_storage remote_addr;
+	void *private_data;
+	void *provider_data;
+	u8 private_data_len;
+	u8 ord;
+	u8 ird;
+};
+
+/**
+ * iw_cm_handler - Function to be called by the IW CM when delivering events
+ * to the client.
+ *
+ * @cm_id: The IW CM identifier associated with the event.
+ * @event: Pointer to the event structure.
+ */
+typedef int (*iw_cm_handler)(struct iw_cm_id *cm_id,
+			     struct iw_cm_event *event);
+
+/**
+ * iw_event_handler - Function called by the provider when delivering provider
+ * events to the IW CM.  Returns either 0 indicating the event was processed
+ * or -errno if the event could not be processed.
+ *
+ * @cm_id: The IW CM identifier associated with the event.
+ * @event: Pointer to the event structure.
+ */
+typedef int (*iw_event_handler)(struct iw_cm_id *cm_id,
+				 struct iw_cm_event *event);
+
+struct iw_cm_id {
+	iw_cm_handler		cm_handler;      /* client callback function */
+	void		        *context;	 /* client cb context */
+	struct ib_device	*device;
+	struct sockaddr_storage local_addr;
+	struct sockaddr_storage	remote_addr;
+	void			*provider_data;	 /* provider private data */
+	iw_event_handler        event_handler;   /* cb for provider
+						    events */
+	/* Used by provider to add and remove refs on IW cm_id */
+	void (*add_ref)(struct iw_cm_id *);
+	void (*rem_ref)(struct iw_cm_id *);
+};
+
+struct iw_cm_conn_param {
+	const void *private_data;
+	u16 private_data_len;
+	u32 ord;
+	u32 ird;
+	u32 qpn;
+};
+
+struct iw_cm_verbs {
+	void		(*add_ref)(struct ib_qp *qp);
+
+	void		(*rem_ref)(struct ib_qp *qp);
+
+	struct ib_qp *	(*get_qp)(struct ib_device *device,
+				  int qpn);
+
+	int		(*connect)(struct iw_cm_id *cm_id,
+				   struct iw_cm_conn_param *conn_param);
+
+	int		(*accept)(struct iw_cm_id *cm_id,
+				  struct iw_cm_conn_param *conn_param);
+
+	int		(*reject)(struct iw_cm_id *cm_id,
+				  const void *pdata, u8 pdata_len);
+
+	int		(*create_listen)(struct iw_cm_id *cm_id,
+					 int backlog);
+
+	int		(*destroy_listen)(struct iw_cm_id *cm_id);
+};
+
+/**
+ * iw_create_cm_id - Create an IW CM identifier.
+ *
+ * @device: The IB device on which to create the IW CM identier.
+ * @event_handler: User callback invoked to report events associated with the
+ *   returned IW CM identifier.
+ * @context: User specified context associated with the id.
+ */
+struct iw_cm_id *iw_create_cm_id(struct ib_device *device,
+				 iw_cm_handler cm_handler, void *context);
+
+/**
+ * iw_destroy_cm_id - Destroy an IW CM identifier.
+ *
+ * @cm_id: The previously created IW CM identifier to destroy.
+ *
+ * The client can assume that no events will be delivered for the CM ID after
+ * this function returns.
+ */
+void iw_destroy_cm_id(struct iw_cm_id *cm_id);
+
+/**
+ * iw_cm_bind_qp - Unbind the specified IW CM identifier and QP
+ *
+ * @cm_id: The IW CM idenfier to unbind from the QP.
+ * @qp: The QP
+ *
+ * This is called by the provider when destroying the QP to ensure
+ * that any references held by the IWCM are released. It may also
+ * be called by the IWCM when destroying a CM_ID to that any
+ * references held by the provider are released.
+ */
+void iw_cm_unbind_qp(struct iw_cm_id *cm_id, struct ib_qp *qp);
+
+/**
+ * iw_cm_get_qp - Return the ib_qp associated with a QPN
+ *
+ * @ib_device: The IB device
+ * @qpn: The queue pair number
+ */
+struct ib_qp *iw_cm_get_qp(struct ib_device *device, int qpn);
+
+/**
+ * iw_cm_listen - Listen for incoming connection requests on the
+ * specified IW CM id.
+ *
+ * @cm_id: The IW CM identifier.
+ * @backlog: The maximum number of outstanding un-accepted inbound listen
+ *   requests to queue.
+ *
+ * The source address and port number are specified in the IW CM identifier
+ * structure.
+ */
+int iw_cm_listen(struct iw_cm_id *cm_id, int backlog);
+
+/**
+ * iw_cm_accept - Called to accept an incoming connect request.
+ *
+ * @cm_id: The IW CM identifier associated with the connection request.
+ * @iw_param: Pointer to a structure containing connection establishment
+ *   parameters.
+ *
+ * The specified cm_id will have been provided in the event data for a
+ * CONNECT_REQUEST event. Subsequent events related to this connection will be
+ * delivered to the specified IW CM identifier prior and may occur prior to
+ * the return of this function. If this function returns a non-zero value, the
+ * client can assume that no events will be delivered to the specified IW CM
+ * identifier.
+ */
+int iw_cm_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param);
+
+/**
+ * iw_cm_reject - Reject an incoming connection request.
+ *
+ * @cm_id: Connection identifier associated with the request.
+ * @private_daa: Pointer to data to deliver to the remote peer as part of the
+ *   reject message.
+ * @private_data_len: The number of bytes in the private_data parameter.
+ *
+ * The client can assume that no events will be delivered to the specified IW
+ * CM identifier following the return of this function. The private_data
+ * buffer is available for reuse when this function returns.
+ */
+int iw_cm_reject(struct iw_cm_id *cm_id, const void *private_data,
+		 u8 private_data_len);
+
+/**
+ * iw_cm_connect - Called to request a connection to a remote peer.
+ *
+ * @cm_id: The IW CM identifier for the connection.
+ * @iw_param: Pointer to a structure containing connection  establishment
+ *   parameters.
+ *
+ * Events may be delivered to the specified IW CM identifier prior to the
+ * return of this function. If this function returns a non-zero value, the
+ * client can assume that no events will be delivered to the specified IW CM
+ * identifier.
+ */
+int iw_cm_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param);
+
+/**
+ * iw_cm_disconnect - Close the specified connection.
+ *
+ * @cm_id: The IW CM identifier to close.
+ * @abrupt: If 0, the connection will be closed gracefully, otherwise, the
+ *   connection will be reset.
+ *
+ * The IW CM identifier is still active until the IW_CM_EVENT_CLOSE event is
+ * delivered.
+ */
+int iw_cm_disconnect(struct iw_cm_id *cm_id, int abrupt);
+
+/**
+ * iw_cm_init_qp_attr - Called to initialize the attributes of the QP
+ * associated with a IW CM identifier.
+ *
+ * @cm_id: The IW CM identifier associated with the QP
+ * @qp_attr: Pointer to the QP attributes structure.
+ * @qp_attr_mask: Pointer to a bit vector specifying which QP attributes are
+ *   valid.
+ */
+int iw_cm_init_qp_attr(struct iw_cm_id *cm_id, struct ib_qp_attr *qp_attr,
+		       int *qp_attr_mask);
+
+#endif /* IW_CM_H */
diff --git a/include/rdma/iw_portmap.h b/include/rdma/iw_portmap.h
new file mode 100644
index 0000000..928b277
--- /dev/null
+++ b/include/rdma/iw_portmap.h
@@ -0,0 +1,199 @@
+/*
+ * Copyright (c) 2014 Intel Corporation. All rights reserved.
+ * Copyright (c) 2014 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer in the documentation and/or other materials
+ *	  provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef _IW_PORTMAP_H
+#define _IW_PORTMAP_H
+
+#define IWPM_ULIBNAME_SIZE	32
+#define IWPM_DEVNAME_SIZE	32
+#define IWPM_IFNAME_SIZE	16
+#define IWPM_IPADDR_SIZE	16
+
+enum {
+	IWPM_INVALID_NLMSG_ERR = 10,
+	IWPM_CREATE_MAPPING_ERR,
+	IWPM_DUPLICATE_MAPPING_ERR,
+	IWPM_UNKNOWN_MAPPING_ERR,
+	IWPM_CLIENT_DEV_INFO_ERR,
+	IWPM_USER_LIB_INFO_ERR,
+	IWPM_REMOTE_QUERY_REJECT
+};
+
+struct iwpm_dev_data {
+	char dev_name[IWPM_DEVNAME_SIZE];
+	char if_name[IWPM_IFNAME_SIZE];
+};
+
+struct iwpm_sa_data {
+	struct sockaddr_storage loc_addr;
+	struct sockaddr_storage mapped_loc_addr;
+	struct sockaddr_storage rem_addr;
+	struct sockaddr_storage mapped_rem_addr;
+};
+
+/**
+ * iwpm_init - Allocate resources for the iwarp port mapper
+ *
+ * Should be called when network interface goes up.
+ */
+int iwpm_init(u8);
+
+/**
+ * iwpm_exit - Deallocate resources for the iwarp port mapper
+ *
+ * Should be called when network interface goes down.
+ */
+int iwpm_exit(u8);
+
+/**
+ * iwpm_valid_pid - Check if the userspace iwarp port mapper pid is valid
+ *
+ * Returns true if the pid is greater than zero, otherwise returns false
+ */
+int iwpm_valid_pid(void);
+
+/**
+ * iwpm_register_pid - Send a netlink query to userspace
+ *                     to get the iwarp port mapper pid
+ * @pm_msg: Contains driver info to send to the userspace port mapper
+ * @nl_client: The index of the netlink client
+ */
+int iwpm_register_pid(struct iwpm_dev_data *pm_msg, u8 nl_client);
+
+/**
+ * iwpm_add_mapping - Send a netlink add mapping request to
+ *                    the userspace port mapper
+ * @pm_msg: Contains the local ip/tcp address info to send
+ * @nl_client: The index of the netlink client
+ *
+ * If the request is successful, the pm_msg stores
+ * the port mapper response (mapped address info)
+ */
+int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client);
+
+/**
+ * iwpm_add_and_query_mapping - Send a netlink add and query mapping request
+ *				 to the userspace port mapper
+ * @pm_msg: Contains the local and remote ip/tcp address info to send
+ * @nl_client: The index of the netlink client
+ *
+ * If the request is successful, the pm_msg stores the
+ * port mapper response (mapped local and remote address info)
+ */
+int iwpm_add_and_query_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client);
+
+/**
+ * iwpm_remove_mapping - Send a netlink remove mapping request
+ *                       to the userspace port mapper
+ *
+ * @local_addr: Local ip/tcp address to remove
+ * @nl_client: The index of the netlink client
+ */
+int iwpm_remove_mapping(struct sockaddr_storage *local_addr, u8 nl_client);
+
+/**
+ * iwpm_register_pid_cb - Process the port mapper response to
+ *                        iwpm_register_pid query
+ * @skb:
+ * @cb: Contains the received message (payload and netlink header)
+ *
+ * If successful, the function receives the userspace port mapper pid
+ * which is used in future communication with the port mapper
+ */
+int iwpm_register_pid_cb(struct sk_buff *, struct netlink_callback *);
+
+/**
+ * iwpm_add_mapping_cb - Process the port mapper response to
+ *                       iwpm_add_mapping request
+ * @skb:
+ * @cb: Contains the received message (payload and netlink header)
+ */
+int iwpm_add_mapping_cb(struct sk_buff *, struct netlink_callback *);
+
+/**
+ * iwpm_add_and_query_mapping_cb - Process the port mapper response to
+ *                                 iwpm_add_and_query_mapping request
+ * @skb:
+ * @cb: Contains the received message (payload and netlink header)
+ */
+int iwpm_add_and_query_mapping_cb(struct sk_buff *, struct netlink_callback *);
+
+/**
+ * iwpm_mapping_error_cb - Process port mapper notification for error
+ *
+ * @skb:
+ * @cb: Contains the received message (payload and netlink header)
+ */
+int iwpm_mapping_error_cb(struct sk_buff *, struct netlink_callback *);
+
+/**
+ * iwpm_mapping_info_cb - Process a notification that the userspace
+ *                        port mapper daemon is started
+ * @skb:
+ * @cb: Contains the received message (payload and netlink header)
+ *
+ * Using the received port mapper pid, send all the local mapping
+ * info records to the userspace port mapper
+ */
+int iwpm_mapping_info_cb(struct sk_buff *, struct netlink_callback *);
+
+/**
+ * iwpm_ack_mapping_info_cb - Process the port mapper ack for
+ *                            the provided local mapping info records
+ * @skb:
+ * @cb: Contains the received message (payload and netlink header)
+ */
+int iwpm_ack_mapping_info_cb(struct sk_buff *, struct netlink_callback *);
+
+/**
+ * iwpm_create_mapinfo - Store local and mapped IPv4/IPv6 address
+ *                       info in a hash table
+ * @local_addr: Local ip/tcp address
+ * @mapped_addr: Mapped local ip/tcp address
+ * @nl_client: The index of the netlink client
+ */
+int iwpm_create_mapinfo(struct sockaddr_storage *local_addr,
+			struct sockaddr_storage *mapped_addr, u8 nl_client);
+
+/**
+ * iwpm_remove_mapinfo - Remove local and mapped IPv4/IPv6 address
+ *                       info from the hash table
+ * @local_addr: Local ip/tcp address
+ * @mapped_addr: Mapped local ip/tcp address
+ *
+ * Returns err code if mapping info is not found in the hash table,
+ * otherwise returns 0
+ */
+int iwpm_remove_mapinfo(struct sockaddr_storage *local_addr,
+			struct sockaddr_storage *mapped_addr);
+
+#endif /* _IW_PORTMAP_H */
diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h
new file mode 100644
index 0000000..1ed2088
--- /dev/null
+++ b/include/rdma/rdma_cm.h
@@ -0,0 +1,383 @@
+/*
+ * Copyright (c) 2005 Voltaire Inc.  All rights reserved.
+ * Copyright (c) 2005 Intel Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if !defined(RDMA_CM_H)
+#define RDMA_CM_H
+
+#include <linux/socket.h>
+#include <linux/in6.h>
+#include <rdma/ib_addr.h>
+#include <rdma/ib_sa.h>
+
+/*
+ * Upon receiving a device removal event, users must destroy the associated
+ * RDMA identifier and release all resources allocated with the device.
+ */
+enum rdma_cm_event_type {
+	RDMA_CM_EVENT_ADDR_RESOLVED,
+	RDMA_CM_EVENT_ADDR_ERROR,
+	RDMA_CM_EVENT_ROUTE_RESOLVED,
+	RDMA_CM_EVENT_ROUTE_ERROR,
+	RDMA_CM_EVENT_CONNECT_REQUEST,
+	RDMA_CM_EVENT_CONNECT_RESPONSE,
+	RDMA_CM_EVENT_CONNECT_ERROR,
+	RDMA_CM_EVENT_UNREACHABLE,
+	RDMA_CM_EVENT_REJECTED,
+	RDMA_CM_EVENT_ESTABLISHED,
+	RDMA_CM_EVENT_DISCONNECTED,
+	RDMA_CM_EVENT_DEVICE_REMOVAL,
+	RDMA_CM_EVENT_MULTICAST_JOIN,
+	RDMA_CM_EVENT_MULTICAST_ERROR,
+	RDMA_CM_EVENT_ADDR_CHANGE,
+	RDMA_CM_EVENT_TIMEWAIT_EXIT
+};
+
+enum rdma_port_space {
+	RDMA_PS_SDP   = 0x0001,
+	RDMA_PS_IPOIB = 0x0002,
+	RDMA_PS_IB    = 0x013F,
+	RDMA_PS_TCP   = 0x0106,
+	RDMA_PS_UDP   = 0x0111,
+};
+
+#define RDMA_IB_IP_PS_MASK   0xFFFFFFFFFFFF0000ULL
+#define RDMA_IB_IP_PS_TCP    0x0000000001060000ULL
+#define RDMA_IB_IP_PS_UDP    0x0000000001110000ULL
+#define RDMA_IB_IP_PS_IB     0x00000000013F0000ULL
+
+struct rdma_addr {
+	struct sockaddr_storage src_addr;
+	struct sockaddr_storage dst_addr;
+	struct rdma_dev_addr dev_addr;
+};
+
+struct rdma_route {
+	struct rdma_addr addr;
+	struct ib_sa_path_rec *path_rec;
+	int num_paths;
+};
+
+struct rdma_conn_param {
+	const void *private_data;
+	u8 private_data_len;
+	u8 responder_resources;
+	u8 initiator_depth;
+	u8 flow_control;
+	u8 retry_count;		/* ignored when accepting */
+	u8 rnr_retry_count;
+	/* Fields below ignored if a QP is created on the rdma_cm_id. */
+	u8 srq;
+	u32 qp_num;
+	u32 qkey;
+};
+
+struct rdma_ud_param {
+	const void *private_data;
+	u8 private_data_len;
+	struct ib_ah_attr ah_attr;
+	u32 qp_num;
+	u32 qkey;
+};
+
+struct rdma_cm_event {
+	enum rdma_cm_event_type	 event;
+	int			 status;
+	union {
+		struct rdma_conn_param	conn;
+		struct rdma_ud_param	ud;
+	} param;
+};
+
+enum rdma_cm_state {
+	RDMA_CM_IDLE,
+	RDMA_CM_ADDR_QUERY,
+	RDMA_CM_ADDR_RESOLVED,
+	RDMA_CM_ROUTE_QUERY,
+	RDMA_CM_ROUTE_RESOLVED,
+	RDMA_CM_CONNECT,
+	RDMA_CM_DISCONNECT,
+	RDMA_CM_ADDR_BOUND,
+	RDMA_CM_LISTEN,
+	RDMA_CM_DEVICE_REMOVAL,
+	RDMA_CM_DESTROYING
+};
+
+struct rdma_cm_id;
+
+/**
+ * rdma_cm_event_handler - Callback used to report user events.
+ *
+ * Notes: Users may not call rdma_destroy_id from this callback to destroy
+ *   the passed in id, or a corresponding listen id.  Returning a
+ *   non-zero value from the callback will destroy the passed in id.
+ */
+typedef int (*rdma_cm_event_handler)(struct rdma_cm_id *id,
+				     struct rdma_cm_event *event);
+
+struct rdma_cm_id {
+	struct ib_device	*device;
+	void			*context;
+	struct ib_qp		*qp;
+	rdma_cm_event_handler	 event_handler;
+	struct rdma_route	 route;
+	enum rdma_port_space	 ps;
+	enum ib_qp_type		 qp_type;
+	u8			 port_num;
+};
+
+/**
+ * rdma_create_id - Create an RDMA identifier.
+ *
+ * @event_handler: User callback invoked to report events associated with the
+ *   returned rdma_id.
+ * @context: User specified context associated with the id.
+ * @ps: RDMA port space.
+ * @qp_type: type of queue pair associated with the id.
+ */
+struct rdma_cm_id *rdma_create_id(rdma_cm_event_handler event_handler,
+				  void *context, enum rdma_port_space ps,
+				  enum ib_qp_type qp_type);
+
+/**
+  * rdma_destroy_id - Destroys an RDMA identifier.
+  *
+  * @id: RDMA identifier.
+  *
+  * Note: calling this function has the effect of canceling in-flight
+  * asynchronous operations associated with the id.
+  */
+void rdma_destroy_id(struct rdma_cm_id *id);
+
+/**
+ * rdma_bind_addr - Bind an RDMA identifier to a source address and
+ *   associated RDMA device, if needed.
+ *
+ * @id: RDMA identifier.
+ * @addr: Local address information.  Wildcard values are permitted.
+ *
+ * This associates a source address with the RDMA identifier before calling
+ * rdma_listen.  If a specific local address is given, the RDMA identifier will
+ * be bound to a local RDMA device.
+ */
+int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr);
+
+/**
+ * rdma_resolve_addr - Resolve destination and optional source addresses
+ *   from IP addresses to an RDMA address.  If successful, the specified
+ *   rdma_cm_id will be bound to a local device.
+ *
+ * @id: RDMA identifier.
+ * @src_addr: Source address information.  This parameter may be NULL.
+ * @dst_addr: Destination address information.
+ * @timeout_ms: Time to wait for resolution to complete.
+ */
+int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,
+		      struct sockaddr *dst_addr, int timeout_ms);
+
+/**
+ * rdma_resolve_route - Resolve the RDMA address bound to the RDMA identifier
+ *   into route information needed to establish a connection.
+ *
+ * This is called on the client side of a connection.
+ * Users must have first called rdma_resolve_addr to resolve a dst_addr
+ * into an RDMA address before calling this routine.
+ */
+int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms);
+
+/**
+ * rdma_create_qp - Allocate a QP and associate it with the specified RDMA
+ * identifier.
+ *
+ * QPs allocated to an rdma_cm_id will automatically be transitioned by the CMA
+ * through their states.
+ */
+int rdma_create_qp(struct rdma_cm_id *id, struct ib_pd *pd,
+		   struct ib_qp_init_attr *qp_init_attr);
+
+/**
+ * rdma_destroy_qp - Deallocate the QP associated with the specified RDMA
+ * identifier.
+ *
+ * Users must destroy any QP associated with an RDMA identifier before
+ * destroying the RDMA ID.
+ */
+void rdma_destroy_qp(struct rdma_cm_id *id);
+
+/**
+ * rdma_init_qp_attr - Initializes the QP attributes for use in transitioning
+ *   to a specified QP state.
+ * @id: Communication identifier associated with the QP attributes to
+ *   initialize.
+ * @qp_attr: On input, specifies the desired QP state.  On output, the
+ *   mandatory and desired optional attributes will be set in order to
+ *   modify the QP to the specified state.
+ * @qp_attr_mask: The QP attribute mask that may be used to transition the
+ *   QP to the specified state.
+ *
+ * Users must set the @qp_attr->qp_state to the desired QP state.  This call
+ * will set all required attributes for the given transition, along with
+ * known optional attributes.  Users may override the attributes returned from
+ * this call before calling ib_modify_qp.
+ *
+ * Users that wish to have their QP automatically transitioned through its
+ * states can associate a QP with the rdma_cm_id by calling rdma_create_qp().
+ */
+int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr,
+		       int *qp_attr_mask);
+
+/**
+ * rdma_connect - Initiate an active connection request.
+ * @id: Connection identifier to connect.
+ * @conn_param: Connection information used for connected QPs.
+ *
+ * Users must have resolved a route for the rdma_cm_id to connect with
+ * by having called rdma_resolve_route before calling this routine.
+ *
+ * This call will either connect to a remote QP or obtain remote QP
+ * information for unconnected rdma_cm_id's.  The actual operation is
+ * based on the rdma_cm_id's port space.
+ */
+int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param);
+
+/**
+ * rdma_listen - This function is called by the passive side to
+ *   listen for incoming connection requests.
+ *
+ * Users must have bound the rdma_cm_id to a local address by calling
+ * rdma_bind_addr before calling this routine.
+ */
+int rdma_listen(struct rdma_cm_id *id, int backlog);
+
+/**
+ * rdma_accept - Called to accept a connection request or response.
+ * @id: Connection identifier associated with the request.
+ * @conn_param: Information needed to establish the connection.  This must be
+ *   provided if accepting a connection request.  If accepting a connection
+ *   response, this parameter must be NULL.
+ *
+ * Typically, this routine is only called by the listener to accept a connection
+ * request.  It must also be called on the active side of a connection if the
+ * user is performing their own QP transitions.
+ *
+ * In the case of error, a reject message is sent to the remote side and the
+ * state of the qp associated with the id is modified to error, such that any
+ * previously posted receive buffers would be flushed.
+ */
+int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param);
+
+/**
+ * rdma_notify - Notifies the RDMA CM of an asynchronous event that has
+ * occurred on the connection.
+ * @id: Connection identifier to transition to established.
+ * @event: Asynchronous event.
+ *
+ * This routine should be invoked by users to notify the CM of relevant
+ * communication events.  Events that should be reported to the CM and
+ * when to report them are:
+ *
+ * IB_EVENT_COMM_EST - Used when a message is received on a connected
+ *    QP before an RTU has been received.
+ */
+int rdma_notify(struct rdma_cm_id *id, enum ib_event_type event);
+
+/**
+ * rdma_reject - Called to reject a connection request or response.
+ */
+int rdma_reject(struct rdma_cm_id *id, const void *private_data,
+		u8 private_data_len);
+
+/**
+ * rdma_disconnect - This function disconnects the associated QP and
+ *   transitions it into the error state.
+ */
+int rdma_disconnect(struct rdma_cm_id *id);
+
+/**
+ * rdma_join_multicast - Join the multicast group specified by the given
+ *   address.
+ * @id: Communication identifier associated with the request.
+ * @addr: Multicast address identifying the group to join.
+ * @context: User-defined context associated with the join request, returned
+ * to the user through the private_data pointer in multicast events.
+ */
+int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr,
+			void *context);
+
+/**
+ * rdma_leave_multicast - Leave the multicast group specified by the given
+ *   address.
+ */
+void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr);
+
+/**
+ * rdma_set_service_type - Set the type of service associated with a
+ *   connection identifier.
+ * @id: Communication identifier to associated with service type.
+ * @tos: Type of service.
+ *
+ * The type of service is interpretted as a differentiated service
+ * field (RFC 2474).  The service type should be specified before
+ * performing route resolution, as existing communication on the
+ * connection identifier may be unaffected.  The type of service
+ * requested may not be supported by the network to all destinations.
+ */
+void rdma_set_service_type(struct rdma_cm_id *id, int tos);
+
+/**
+ * rdma_set_reuseaddr - Allow the reuse of local addresses when binding
+ *    the rdma_cm_id.
+ * @id: Communication identifier to configure.
+ * @reuse: Value indicating if the bound address is reusable.
+ *
+ * Reuse must be set before an address is bound to the id.
+ */
+int rdma_set_reuseaddr(struct rdma_cm_id *id, int reuse);
+
+/**
+ * rdma_set_afonly - Specify that listens are restricted to the
+ *    bound address family only.
+ * @id: Communication identifer to configure.
+ * @afonly: Value indicating if listens are restricted.
+ *
+ * Must be set before identifier is in the listening state.
+ */
+int rdma_set_afonly(struct rdma_cm_id *id, int afonly);
+
+ /**
+ * rdma_get_service_id - Return the IB service ID for a specified address.
+ * @id: Communication identifier associated with the address.
+ * @addr: Address for the service ID.
+ */
+__be64 rdma_get_service_id(struct rdma_cm_id *id, struct sockaddr *addr);
+
+#endif /* RDMA_CM_H */
diff --git a/include/rdma/rdma_cm_ib.h b/include/rdma/rdma_cm_ib.h
new file mode 100644
index 0000000..2389c3b
--- /dev/null
+++ b/include/rdma/rdma_cm_ib.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2006 Intel Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if !defined(RDMA_CM_IB_H)
+#define RDMA_CM_IB_H
+
+#include <rdma/rdma_cm.h>
+
+/**
+ * rdma_set_ib_paths - Manually sets the path records used to establish a
+ *   connection.
+ * @id: Connection identifier associated with the request.
+ * @path_rec: Reference to the path record
+ *
+ * This call permits a user to specify routing information for rdma_cm_id's
+ * bound to Infiniband devices.  It is called on the client side of a
+ * connection and replaces the call to rdma_resolve_route.
+ */
+int rdma_set_ib_paths(struct rdma_cm_id *id,
+		      struct ib_sa_path_rec *path_rec, int num_paths);
+
+/* Global qkey for UDP QPs and multicast groups. */
+#define RDMA_UDP_QKEY 0x01234567
+
+#endif /* RDMA_CM_IB_H */
diff --git a/include/rdma/rdma_netlink.h b/include/rdma/rdma_netlink.h
new file mode 100644
index 0000000..0790882
--- /dev/null
+++ b/include/rdma/rdma_netlink.h
@@ -0,0 +1,80 @@
+#ifndef _RDMA_NETLINK_H
+#define _RDMA_NETLINK_H
+
+
+#include <linux/netlink.h>
+#include <uapi/rdma/rdma_netlink.h>
+
+struct ibnl_client_cbs {
+	int (*dump)(struct sk_buff *skb, struct netlink_callback *nlcb);
+	struct module *module;
+};
+
+int ibnl_init(void);
+void ibnl_cleanup(void);
+
+/**
+ * Add a a client to the list of IB netlink exporters.
+ * @index: Index of the added client
+ * @nops: Number of supported ops by the added client.
+ * @cb_table: A table for op->callback
+ *
+ * Returns 0 on success or a negative error code.
+ */
+int ibnl_add_client(int index, int nops,
+		    const struct ibnl_client_cbs cb_table[]);
+
+/**
+ * Remove a client from IB netlink.
+ * @index: Index of the removed IB client.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+int ibnl_remove_client(int index);
+
+/**
+ * Put a new message in a supplied skb.
+ * @skb: The netlink skb.
+ * @nlh: Pointer to put the header of the new netlink message.
+ * @seq: The message sequence number.
+ * @len: The requested message length to allocate.
+ * @client: Calling IB netlink client.
+ * @op: message content op.
+ * Returns the allocated buffer on success and NULL on failure.
+ */
+void *ibnl_put_msg(struct sk_buff *skb, struct nlmsghdr **nlh, int seq,
+		   int len, int client, int op, int flags);
+/**
+ * Put a new attribute in a supplied skb.
+ * @skb: The netlink skb.
+ * @nlh: Header of the netlink message to append the attribute to.
+ * @len: The length of the attribute data.
+ * @data: The attribute data to put.
+ * @type: The attribute type.
+ * Returns the 0 and a negative error code on failure.
+ */
+int ibnl_put_attr(struct sk_buff *skb, struct nlmsghdr *nlh,
+		  int len, void *data, int type);
+
+/**
+ * Send the supplied skb to a specific userspace PID.
+ * @skb: The netlink skb
+ * @nlh: Header of the netlink message to send
+ * @pid: Userspace netlink process ID
+ * Returns 0 on success or a negative error code.
+ */
+int ibnl_unicast(struct sk_buff *skb, struct nlmsghdr *nlh,
+			__u32 pid);
+
+/**
+ * Send the supplied skb to a netlink group.
+ * @skb: The netlink skb
+ * @nlh: Header of the netlink message to send
+ * @group: Netlink group ID
+ * @flags: allocation flags
+ * Returns 0 on success or a negative error code.
+ */
+int ibnl_multicast(struct sk_buff *skb, struct nlmsghdr *nlh,
+			unsigned int group, gfp_t flags);
+
+#endif /* _RDMA_NETLINK_H */
diff --git a/include/scsi/scsi_transport_srp.h b/include/scsi/scsi_transport_srp.h
new file mode 100644
index 0000000..cdb05dd
--- /dev/null
+++ b/include/scsi/scsi_transport_srp.h
@@ -0,0 +1,149 @@
+#ifndef SCSI_TRANSPORT_SRP_H
+#define SCSI_TRANSPORT_SRP_H
+
+#include <linux/transport_class.h>
+#include <linux/types.h>
+#include <linux/mutex.h>
+
+#define SRP_RPORT_ROLE_INITIATOR 0
+#define SRP_RPORT_ROLE_TARGET 1
+
+struct srp_rport_identifiers {
+	u8 port_id[16];
+	u8 roles;
+};
+
+/**
+ * enum srp_rport_state - SRP transport layer state
+ * @SRP_RPORT_RUNNING:   Transport layer operational.
+ * @SRP_RPORT_BLOCKED:   Transport layer not operational; fast I/O fail timer
+ *                       is running and I/O has been blocked.
+ * @SRP_RPORT_FAIL_FAST: Fast I/O fail timer has expired; fail I/O fast.
+ * @SRP_RPORT_LOST:      Port is being removed.
+ */
+enum srp_rport_state {
+	SRP_RPORT_RUNNING,
+	SRP_RPORT_BLOCKED,
+	SRP_RPORT_FAIL_FAST,
+	SRP_RPORT_LOST,
+};
+
+/**
+ * struct srp_rport - SRP initiator or target port
+ *
+ * Fields that are relevant for SRP initiator and SRP target drivers:
+ * @dev:               Device associated with this rport.
+ * @port_id:           16-byte port identifier.
+ * @roles:             Role of this port - initiator or target.
+ *
+ * Fields that are only relevant for SRP initiator drivers:
+ * @lld_data:          LLD private data.
+ * @mutex:             Protects against concurrent rport reconnect /
+ *                     fast_io_fail / dev_loss_tmo activity.
+ * @state:             rport state.
+ * @reconnect_delay:   Reconnect delay in seconds.
+ * @failed_reconnects: Number of failed reconnect attempts.
+ * @reconnect_work:    Work structure used for scheduling reconnect attempts.
+ * @fast_io_fail_tmo:  Fast I/O fail timeout in seconds.
+ * @dev_loss_tmo:      Device loss timeout in seconds.
+ * @fast_io_fail_work: Work structure used for scheduling fast I/O fail work.
+ * @dev_loss_work:     Work structure used for scheduling device loss work.
+ */
+struct srp_rport {
+	/* for initiator and target drivers */
+
+	struct device dev;
+
+	u8 port_id[16];
+	u8 roles;
+
+	/* for initiator drivers */
+
+	void			*lld_data;
+
+	struct mutex		mutex;
+	enum srp_rport_state	state;
+	int			reconnect_delay;
+	int			failed_reconnects;
+	struct delayed_work	reconnect_work;
+	int			fast_io_fail_tmo;
+	int			dev_loss_tmo;
+	struct delayed_work	fast_io_fail_work;
+	struct delayed_work	dev_loss_work;
+};
+
+/**
+ * struct srp_function_template
+ *
+ * Fields that are only relevant for SRP initiator drivers:
+ * @has_rport_state: Whether or not to create the state, fast_io_fail_tmo and
+ *     dev_loss_tmo sysfs attribute for an rport.
+ * @reset_timer_if_blocked: Whether or srp_timed_out() should reset the command
+ *     timer if the device on which it has been queued is blocked.
+ * @reconnect_delay: If not NULL, points to the default reconnect_delay value.
+ * @fast_io_fail_tmo: If not NULL, points to the default fast_io_fail_tmo value.
+ * @dev_loss_tmo: If not NULL, points to the default dev_loss_tmo value.
+ * @reconnect: Callback function for reconnecting to the target. See also
+ *     srp_reconnect_rport().
+ * @terminate_rport_io: Callback function for terminating all outstanding I/O
+ *     requests for an rport.
+ * @rport_delete: Callback function that deletes an rport.
+ *
+ * Fields that are only relevant for SRP target drivers:
+ * @tsk_mgmt_response: Callback function for sending a task management response.
+ * @it_nexus_response: Callback function for processing an IT nexus response.
+ */
+struct srp_function_template {
+	/* for initiator drivers */
+	bool has_rport_state;
+	bool reset_timer_if_blocked;
+	int *reconnect_delay;
+	int *fast_io_fail_tmo;
+	int *dev_loss_tmo;
+	int (*reconnect)(struct srp_rport *rport);
+	void (*terminate_rport_io)(struct srp_rport *rport);
+	void (*rport_delete)(struct srp_rport *rport);
+	/* for target drivers */
+	int (* tsk_mgmt_response)(struct Scsi_Host *, u64, u64, int);
+	int (* it_nexus_response)(struct Scsi_Host *, u64, int);
+};
+
+extern struct scsi_transport_template *
+srp_attach_transport(struct srp_function_template *);
+extern void srp_release_transport(struct scsi_transport_template *);
+
+extern void srp_rport_get(struct srp_rport *rport);
+extern void srp_rport_put(struct srp_rport *rport);
+extern struct srp_rport *srp_rport_add(struct Scsi_Host *,
+				       struct srp_rport_identifiers *);
+extern void srp_rport_del(struct srp_rport *);
+extern int srp_tmo_valid(int reconnect_delay, int fast_io_fail_tmo,
+			 int dev_loss_tmo);
+extern int srp_reconnect_rport(struct srp_rport *rport);
+extern void srp_start_tl_fail_timers(struct srp_rport *rport);
+extern void srp_remove_host(struct Scsi_Host *);
+extern void srp_stop_rport_timers(struct srp_rport *rport);
+
+/**
+ * srp_chkready() - evaluate the transport layer state before I/O
+ * @rport: SRP target port pointer.
+ *
+ * Returns a SCSI result code that can be returned by the LLD queuecommand()
+ * implementation. The role of this function is similar to that of
+ * fc_remote_port_chkready().
+ */
+static inline int srp_chkready(struct srp_rport *rport)
+{
+	switch (rport->state) {
+	case SRP_RPORT_RUNNING:
+	case SRP_RPORT_BLOCKED:
+	default:
+		return 0;
+	case SRP_RPORT_FAIL_FAST:
+		return DID_TRANSPORT_FAILFAST << 16;
+	case SRP_RPORT_LOST:
+		return DID_NO_CONNECT << 16;
+	}
+}
+
+#endif
diff --git a/include/scsi/srp.h b/include/scsi/srp.h
new file mode 100644
index 0000000..1ae84db
--- /dev/null
+++ b/include/scsi/srp.h
@@ -0,0 +1,280 @@
+/*
+ * Copyright (c) 2005 Cisco Systems.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id$
+ */
+
+#ifndef SCSI_SRP_H
+#define SCSI_SRP_H
+
+/*
+ * Structures and constants for the SCSI RDMA Protocol (SRP) as
+ * defined by the INCITS T10 committee.  This file was written using
+ * draft Revision 16a of the SRP standard.
+ */
+
+#include <linux/types.h>
+
+enum {
+	SRP_LOGIN_REQ	= 0x00,
+	SRP_TSK_MGMT	= 0x01,
+	SRP_CMD		= 0x02,
+	SRP_I_LOGOUT	= 0x03,
+	SRP_LOGIN_RSP	= 0xc0,
+	SRP_RSP		= 0xc1,
+	SRP_LOGIN_REJ	= 0xc2,
+	SRP_T_LOGOUT	= 0x80,
+	SRP_CRED_REQ	= 0x81,
+	SRP_AER_REQ	= 0x82,
+	SRP_CRED_RSP	= 0x41,
+	SRP_AER_RSP	= 0x42
+};
+
+enum {
+	SRP_BUF_FORMAT_DIRECT	= 1 << 1,
+	SRP_BUF_FORMAT_INDIRECT	= 1 << 2
+};
+
+enum {
+	SRP_NO_DATA_DESC	= 0,
+	SRP_DATA_DESC_DIRECT	= 1,
+	SRP_DATA_DESC_INDIRECT	= 2
+};
+
+enum {
+	SRP_TSK_ABORT_TASK	= 0x01,
+	SRP_TSK_ABORT_TASK_SET	= 0x02,
+	SRP_TSK_CLEAR_TASK_SET	= 0x04,
+	SRP_TSK_LUN_RESET	= 0x08,
+	SRP_TSK_CLEAR_ACA	= 0x40
+};
+
+enum srp_login_rej_reason {
+	SRP_LOGIN_REJ_UNABLE_ESTABLISH_CHANNEL		= 0x00010000,
+	SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES		= 0x00010001,
+	SRP_LOGIN_REJ_REQ_IT_IU_LENGTH_TOO_LARGE	= 0x00010002,
+	SRP_LOGIN_REJ_UNABLE_ASSOCIATE_CHANNEL		= 0x00010003,
+	SRP_LOGIN_REJ_UNSUPPORTED_DESCRIPTOR_FMT	= 0x00010004,
+	SRP_LOGIN_REJ_MULTI_CHANNEL_UNSUPPORTED		= 0x00010005,
+	SRP_LOGIN_REJ_CHANNEL_LIMIT_REACHED		= 0x00010006
+};
+
+enum {
+	SRP_REV10_IB_IO_CLASS	= 0xff00,
+	SRP_REV16A_IB_IO_CLASS	= 0x0100
+};
+
+struct srp_direct_buf {
+	__be64	va;
+	__be32	key;
+	__be32  len;
+};
+
+/*
+ * We need the packed attribute because the SRP spec puts the list of
+ * descriptors at an offset of 20, which is not aligned to the size of
+ * struct srp_direct_buf.  The whole structure must be packed to avoid
+ * having the 20-byte structure padded to 24 bytes on 64-bit architectures.
+ */
+struct srp_indirect_buf {
+	struct srp_direct_buf	table_desc;
+	__be32			len;
+	struct srp_direct_buf	desc_list[0];
+} __attribute__((packed));
+
+enum {
+	SRP_MULTICHAN_SINGLE = 0,
+	SRP_MULTICHAN_MULTI  = 1
+};
+
+struct srp_login_req {
+	u8	opcode;
+	u8	reserved1[7];
+	u64	tag;
+	__be32	req_it_iu_len;
+	u8	reserved2[4];
+	__be16	req_buf_fmt;
+	u8	req_flags;
+	u8	reserved3[5];
+	u8	initiator_port_id[16];
+	u8	target_port_id[16];
+};
+
+/*
+ * The SRP spec defines the size of the LOGIN_RSP structure to be 52
+ * bytes, so it needs to be packed to avoid having it padded to 56
+ * bytes on 64-bit architectures.
+ */
+struct srp_login_rsp {
+	u8	opcode;
+	u8	reserved1[3];
+	__be32	req_lim_delta;
+	u64	tag;
+	__be32	max_it_iu_len;
+	__be32	max_ti_iu_len;
+	__be16	buf_fmt;
+	u8	rsp_flags;
+	u8	reserved2[25];
+} __attribute__((packed));
+
+struct srp_login_rej {
+	u8	opcode;
+	u8	reserved1[3];
+	__be32	reason;
+	u64	tag;
+	u8	reserved2[8];
+	__be16	buf_fmt;
+	u8	reserved3[6];
+};
+
+struct srp_i_logout {
+	u8	opcode;
+	u8	reserved[7];
+	u64	tag;
+};
+
+struct srp_t_logout {
+	u8	opcode;
+	u8	sol_not;
+	u8	reserved[2];
+	__be32	reason;
+	u64	tag;
+};
+
+/*
+ * We need the packed attribute because the SRP spec only aligns the
+ * 8-byte LUN field to 4 bytes.
+ */
+struct srp_tsk_mgmt {
+	u8	opcode;
+	u8	sol_not;
+	u8	reserved1[6];
+	u64	tag;
+	u8	reserved2[4];
+	__be64	lun __attribute__((packed));
+	u8	reserved3[2];
+	u8	tsk_mgmt_func;
+	u8	reserved4;
+	u64	task_tag;
+	u8	reserved5[8];
+};
+
+/*
+ * We need the packed attribute because the SRP spec only aligns the
+ * 8-byte LUN field to 4 bytes.
+ */
+struct srp_cmd {
+	u8	opcode;
+	u8	sol_not;
+	u8	reserved1[3];
+	u8	buf_fmt;
+	u8	data_out_desc_cnt;
+	u8	data_in_desc_cnt;
+	u64	tag;
+	u8	reserved2[4];
+	__be64	lun __attribute__((packed));
+	u8	reserved3;
+	u8	task_attr;
+	u8	reserved4;
+	u8	add_cdb_len;
+	u8	cdb[16];
+	u8	add_data[0];
+};
+
+enum {
+	SRP_RSP_FLAG_RSPVALID = 1 << 0,
+	SRP_RSP_FLAG_SNSVALID = 1 << 1,
+	SRP_RSP_FLAG_DOOVER   = 1 << 2,
+	SRP_RSP_FLAG_DOUNDER  = 1 << 3,
+	SRP_RSP_FLAG_DIOVER   = 1 << 4,
+	SRP_RSP_FLAG_DIUNDER  = 1 << 5
+};
+
+/*
+ * The SRP spec defines the size of the RSP structure to be 36 bytes,
+ * so it needs to be packed to avoid having it padded to 40 bytes on
+ * 64-bit architectures.
+ */
+struct srp_rsp {
+	u8	opcode;
+	u8	sol_not;
+	u8	reserved1[2];
+	__be32	req_lim_delta;
+	u64	tag;
+	u8	reserved2[2];
+	u8	flags;
+	u8	status;
+	__be32	data_out_res_cnt;
+	__be32	data_in_res_cnt;
+	__be32	sense_data_len;
+	__be32	resp_data_len;
+	u8	data[0];
+} __attribute__((packed));
+
+struct srp_cred_req {
+	u8	opcode;
+	u8	sol_not;
+	u8	reserved[2];
+	__be32	req_lim_delta;
+	u64	tag;
+};
+
+struct srp_cred_rsp {
+	u8	opcode;
+	u8	reserved[7];
+	u64	tag;
+};
+
+/*
+ * The SRP spec defines the fixed portion of the AER_REQ structure to be
+ * 36 bytes, so it needs to be packed to avoid having it padded to 40 bytes
+ * on 64-bit architectures.
+ */
+struct srp_aer_req {
+	u8	opcode;
+	u8	sol_not;
+	u8	reserved[2];
+	__be32	req_lim_delta;
+	u64	tag;
+	u32	reserved2;
+	__be64	lun;
+	__be32	sense_data_len;
+	u32	reserved3;
+	u8	sense_data[0];
+} __attribute__((packed));
+
+struct srp_aer_rsp {
+	u8	opcode;
+	u8	reserved[7];
+	u64	tag;
+};
+
+#endif /* SCSI_SRP_H */
diff --git a/include/uapi/linux/rds.h b/include/uapi/linux/rds.h
new file mode 100644
index 0000000..9195095
--- /dev/null
+++ b/include/uapi/linux/rds.h
@@ -0,0 +1,285 @@
+/*
+ * Copyright (c) 2008 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef _LINUX_RDS_H
+#define _LINUX_RDS_H
+
+#include <linux/types.h>
+
+#define RDS_IB_ABI_VERSION		0x301
+
+/*
+ * setsockopt/getsockopt for SOL_RDS
+ */
+#define RDS_CANCEL_SENT_TO      	1
+#define RDS_GET_MR			2
+#define RDS_FREE_MR			3
+/* deprecated: RDS_BARRIER 4 */
+#define RDS_RECVERR			5
+#define RDS_CONG_MONITOR		6
+#define RDS_GET_MR_FOR_DEST		7
+
+/*
+ * Control message types for SOL_RDS.
+ *
+ * CMSG_RDMA_ARGS (sendmsg)
+ *	Request a RDMA transfer to/from the specified
+ *	memory ranges.
+ *	The cmsg_data is a struct rds_rdma_args.
+ * RDS_CMSG_RDMA_DEST (recvmsg, sendmsg)
+ *	Kernel informs application about intended
+ *	source/destination of a RDMA transfer
+ * RDS_CMSG_RDMA_MAP (sendmsg)
+ *	Application asks kernel to map the given
+ *	memory range into a IB MR, and send the
+ *	R_Key along in an RDS extension header.
+ *	The cmsg_data is a struct rds_get_mr_args,
+ *	the same as for the GET_MR setsockopt.
+ * RDS_CMSG_RDMA_STATUS (recvmsg)
+ *	Returns the status of a completed RDMA operation.
+ */
+#define RDS_CMSG_RDMA_ARGS		1
+#define RDS_CMSG_RDMA_DEST		2
+#define RDS_CMSG_RDMA_MAP		3
+#define RDS_CMSG_RDMA_STATUS		4
+#define RDS_CMSG_CONG_UPDATE		5
+#define RDS_CMSG_ATOMIC_FADD		6
+#define RDS_CMSG_ATOMIC_CSWP		7
+#define RDS_CMSG_MASKED_ATOMIC_FADD	8
+#define RDS_CMSG_MASKED_ATOMIC_CSWP	9
+
+#define RDS_INFO_FIRST			10000
+#define RDS_INFO_COUNTERS		10000
+#define RDS_INFO_CONNECTIONS		10001
+/* 10002 aka RDS_INFO_FLOWS is deprecated */
+#define RDS_INFO_SEND_MESSAGES		10003
+#define RDS_INFO_RETRANS_MESSAGES       10004
+#define RDS_INFO_RECV_MESSAGES          10005
+#define RDS_INFO_SOCKETS                10006
+#define RDS_INFO_TCP_SOCKETS            10007
+#define RDS_INFO_IB_CONNECTIONS		10008
+#define RDS_INFO_CONNECTION_STATS	10009
+#define RDS_INFO_IWARP_CONNECTIONS	10010
+#define RDS_INFO_LAST			10010
+
+struct rds_info_counter {
+	uint8_t	name[32];
+	uint64_t	value;
+} __attribute__((packed));
+
+#define RDS_INFO_CONNECTION_FLAG_SENDING	0x01
+#define RDS_INFO_CONNECTION_FLAG_CONNECTING	0x02
+#define RDS_INFO_CONNECTION_FLAG_CONNECTED	0x04
+
+#define TRANSNAMSIZ	16
+
+struct rds_info_connection {
+	uint64_t	next_tx_seq;
+	uint64_t	next_rx_seq;
+	__be32		laddr;
+	__be32		faddr;
+	uint8_t	transport[TRANSNAMSIZ];		/* null term ascii */
+	uint8_t	flags;
+} __attribute__((packed));
+
+#define RDS_INFO_MESSAGE_FLAG_ACK               0x01
+#define RDS_INFO_MESSAGE_FLAG_FAST_ACK          0x02
+
+struct rds_info_message {
+	uint64_t	seq;
+	uint32_t	len;
+	__be32		laddr;
+	__be32		faddr;
+	__be16		lport;
+	__be16		fport;
+	uint8_t	flags;
+} __attribute__((packed));
+
+struct rds_info_socket {
+	uint32_t	sndbuf;
+	__be32		bound_addr;
+	__be32		connected_addr;
+	__be16		bound_port;
+	__be16		connected_port;
+	uint32_t	rcvbuf;
+	uint64_t	inum;
+} __attribute__((packed));
+
+struct rds_info_tcp_socket {
+	__be32          local_addr;
+	__be16          local_port;
+	__be32          peer_addr;
+	__be16          peer_port;
+	uint64_t       hdr_rem;
+	uint64_t       data_rem;
+	uint32_t       last_sent_nxt;
+	uint32_t       last_expected_una;
+	uint32_t       last_seen_una;
+} __attribute__((packed));
+
+#define RDS_IB_GID_LEN	16
+struct rds_info_rdma_connection {
+	__be32		src_addr;
+	__be32		dst_addr;
+	uint8_t		src_gid[RDS_IB_GID_LEN];
+	uint8_t		dst_gid[RDS_IB_GID_LEN];
+
+	uint32_t	max_send_wr;
+	uint32_t	max_recv_wr;
+	uint32_t	max_send_sge;
+	uint32_t	rdma_mr_max;
+	uint32_t	rdma_mr_size;
+};
+
+/*
+ * Congestion monitoring.
+ * Congestion control in RDS happens at the host connection
+ * level by exchanging a bitmap marking congested ports.
+ * By default, a process sleeping in poll() is always woken
+ * up when the congestion map is updated.
+ * With explicit monitoring, an application can have more
+ * fine-grained control.
+ * The application installs a 64bit mask value in the socket,
+ * where each bit corresponds to a group of ports.
+ * When a congestion update arrives, RDS checks the set of
+ * ports that are now uncongested against the list bit mask
+ * installed in the socket, and if they overlap, we queue a
+ * cong_notification on the socket.
+ *
+ * To install the congestion monitor bitmask, use RDS_CONG_MONITOR
+ * with the 64bit mask.
+ * Congestion updates are received via RDS_CMSG_CONG_UPDATE
+ * control messages.
+ *
+ * The correspondence between bits and ports is
+ *	1 << (portnum % 64)
+ */
+#define RDS_CONG_MONITOR_SIZE	64
+#define RDS_CONG_MONITOR_BIT(port)  (((unsigned int) port) % RDS_CONG_MONITOR_SIZE)
+#define RDS_CONG_MONITOR_MASK(port) (1ULL << RDS_CONG_MONITOR_BIT(port))
+
+/*
+ * RDMA related types
+ */
+
+/*
+ * This encapsulates a remote memory location.
+ * In the current implementation, it contains the R_Key
+ * of the remote memory region, and the offset into it
+ * (so that the application does not have to worry about
+ * alignment).
+ */
+typedef uint64_t	rds_rdma_cookie_t;
+
+struct rds_iovec {
+	uint64_t	addr;
+	uint64_t	bytes;
+};
+
+struct rds_get_mr_args {
+	struct rds_iovec vec;
+	uint64_t	cookie_addr;
+	uint64_t	flags;
+};
+
+struct rds_get_mr_for_dest_args {
+	struct sockaddr_storage	dest_addr;
+	struct rds_iovec 	vec;
+	uint64_t		cookie_addr;
+	uint64_t		flags;
+};
+
+struct rds_free_mr_args {
+	rds_rdma_cookie_t cookie;
+	uint64_t	flags;
+};
+
+struct rds_rdma_args {
+	rds_rdma_cookie_t cookie;
+	struct rds_iovec remote_vec;
+	uint64_t	local_vec_addr;
+	uint64_t	nr_local;
+	uint64_t	flags;
+	uint64_t	user_token;
+};
+
+struct rds_atomic_args {
+	rds_rdma_cookie_t cookie;
+	uint64_t 	local_addr;
+	uint64_t 	remote_addr;
+	union {
+		struct {
+			uint64_t	compare;
+			uint64_t	swap;
+		} cswp;
+		struct {
+			uint64_t	add;
+		} fadd;
+		struct {
+			uint64_t	compare;
+			uint64_t	swap;
+			uint64_t	compare_mask;
+			uint64_t	swap_mask;
+		} m_cswp;
+		struct {
+			uint64_t	add;
+			uint64_t	nocarry_mask;
+		} m_fadd;
+	};
+	uint64_t	flags;
+	uint64_t	user_token;
+};
+
+struct rds_rdma_notify {
+	uint64_t	user_token;
+	int32_t		status;
+};
+
+#define RDS_RDMA_SUCCESS	0
+#define RDS_RDMA_REMOTE_ERROR	1
+#define RDS_RDMA_CANCELED	2
+#define RDS_RDMA_DROPPED	3
+#define RDS_RDMA_OTHER_ERROR	4
+
+/*
+ * Common set of flags for all RDMA related structs
+ */
+#define RDS_RDMA_READWRITE	0x0001
+#define RDS_RDMA_FENCE		0x0002	/* use FENCE for immediate send */
+#define RDS_RDMA_INVALIDATE	0x0004	/* invalidate R_Key after freeing MR */
+#define RDS_RDMA_USE_ONCE	0x0008	/* free MR after use */
+#define RDS_RDMA_DONTWAIT	0x0010	/* Don't wait in SET_BARRIER */
+#define RDS_RDMA_NOTIFY_ME	0x0020	/* Notify when operation completes */
+#define RDS_RDMA_SILENT		0x0040	/* Do not interrupt remote */
+
+#endif /* IB_RDS_H */
diff --git a/include/uapi/rdma/Kbuild b/include/uapi/rdma/Kbuild
new file mode 100644
index 0000000..687ae33
--- /dev/null
+++ b/include/uapi/rdma/Kbuild
@@ -0,0 +1,7 @@
+# UAPI Header export list
+header-y += ib_user_cm.h
+header-y += ib_user_mad.h
+header-y += ib_user_sa.h
+header-y += ib_user_verbs.h
+header-y += rdma_netlink.h
+header-y += rdma_user_cm.h
diff --git a/include/uapi/rdma/ib_user_cm.h b/include/uapi/rdma/ib_user_cm.h
new file mode 100644
index 0000000..f79014a
--- /dev/null
+++ b/include/uapi/rdma/ib_user_cm.h
@@ -0,0 +1,325 @@
+/*
+ * Copyright (c) 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Intel Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef IB_USER_CM_H
+#define IB_USER_CM_H
+
+#include <linux/types.h>
+#include <rdma/ib_user_sa.h>
+
+#define IB_USER_CM_ABI_VERSION 5
+
+enum {
+	IB_USER_CM_CMD_CREATE_ID,
+	IB_USER_CM_CMD_DESTROY_ID,
+	IB_USER_CM_CMD_ATTR_ID,
+
+	IB_USER_CM_CMD_LISTEN,
+	IB_USER_CM_CMD_NOTIFY,
+
+	IB_USER_CM_CMD_SEND_REQ,
+	IB_USER_CM_CMD_SEND_REP,
+	IB_USER_CM_CMD_SEND_RTU,
+	IB_USER_CM_CMD_SEND_DREQ,
+	IB_USER_CM_CMD_SEND_DREP,
+	IB_USER_CM_CMD_SEND_REJ,
+	IB_USER_CM_CMD_SEND_MRA,
+	IB_USER_CM_CMD_SEND_LAP,
+	IB_USER_CM_CMD_SEND_APR,
+	IB_USER_CM_CMD_SEND_SIDR_REQ,
+	IB_USER_CM_CMD_SEND_SIDR_REP,
+
+	IB_USER_CM_CMD_EVENT,
+	IB_USER_CM_CMD_INIT_QP_ATTR,
+};
+/*
+ * command ABI structures.
+ */
+struct ib_ucm_cmd_hdr {
+	__u32 cmd;
+	__u16 in;
+	__u16 out;
+};
+
+struct ib_ucm_create_id {
+	__u64 uid;
+	__u64 response;
+};
+
+struct ib_ucm_create_id_resp {
+	__u32 id;
+};
+
+struct ib_ucm_destroy_id {
+	__u64 response;
+	__u32 id;
+	__u32 reserved;
+};
+
+struct ib_ucm_destroy_id_resp {
+	__u32 events_reported;
+};
+
+struct ib_ucm_attr_id {
+	__u64 response;
+	__u32 id;
+	__u32 reserved;
+};
+
+struct ib_ucm_attr_id_resp {
+	__be64 service_id;
+	__be64 service_mask;
+	__be32 local_id;
+	__be32 remote_id;
+};
+
+struct ib_ucm_init_qp_attr {
+	__u64 response;
+	__u32 id;
+	__u32 qp_state;
+};
+
+struct ib_ucm_listen {
+	__be64 service_id;
+	__be64 service_mask;
+	__u32 id;
+	__u32 reserved;
+};
+
+struct ib_ucm_notify {
+	__u32 id;
+	__u32 event;
+};
+
+struct ib_ucm_private_data {
+	__u64 data;
+	__u32 id;
+	__u8  len;
+	__u8  reserved[3];
+};
+
+struct ib_ucm_req {
+	__u32 id;
+	__u32 qpn;
+	__u32 qp_type;
+	__u32 psn;
+	__be64 sid;
+	__u64 data;
+	__u64 primary_path;
+	__u64 alternate_path;
+	__u8  len;
+	__u8  peer_to_peer;
+	__u8  responder_resources;
+	__u8  initiator_depth;
+	__u8  remote_cm_response_timeout;
+	__u8  flow_control;
+	__u8  local_cm_response_timeout;
+	__u8  retry_count;
+	__u8  rnr_retry_count;
+	__u8  max_cm_retries;
+	__u8  srq;
+	__u8  reserved[5];
+};
+
+struct ib_ucm_rep {
+	__u64 uid;
+	__u64 data;
+	__u32 id;
+	__u32 qpn;
+	__u32 psn;
+	__u8  len;
+	__u8  responder_resources;
+	__u8  initiator_depth;
+	__u8  target_ack_delay;
+	__u8  failover_accepted;
+	__u8  flow_control;
+	__u8  rnr_retry_count;
+	__u8  srq;
+	__u8  reserved[4];
+};
+
+struct ib_ucm_info {
+	__u32 id;
+	__u32 status;
+	__u64 info;
+	__u64 data;
+	__u8  info_len;
+	__u8  data_len;
+	__u8  reserved[6];
+};
+
+struct ib_ucm_mra {
+	__u64 data;
+	__u32 id;
+	__u8  len;
+	__u8  timeout;
+	__u8  reserved[2];
+};
+
+struct ib_ucm_lap {
+	__u64 path;
+	__u64 data;
+	__u32 id;
+	__u8  len;
+	__u8  reserved[3];
+};
+
+struct ib_ucm_sidr_req {
+	__u32 id;
+	__u32 timeout;
+	__be64 sid;
+	__u64 data;
+	__u64 path;
+	__u16 reserved_pkey;
+	__u8  len;
+	__u8  max_cm_retries;
+	__u8  reserved[4];
+};
+
+struct ib_ucm_sidr_rep {
+	__u32 id;
+	__u32 qpn;
+	__u32 qkey;
+	__u32 status;
+	__u64 info;
+	__u64 data;
+	__u8  info_len;
+	__u8  data_len;
+	__u8  reserved[6];
+};
+/*
+ * event notification ABI structures.
+ */
+struct ib_ucm_event_get {
+	__u64 response;
+	__u64 data;
+	__u64 info;
+	__u8  data_len;
+	__u8  info_len;
+	__u8  reserved[6];
+};
+
+struct ib_ucm_req_event_resp {
+	struct ib_user_path_rec primary_path;
+	struct ib_user_path_rec alternate_path;
+	__be64                 remote_ca_guid;
+	__u32                  remote_qkey;
+	__u32                  remote_qpn;
+	__u32                  qp_type;
+	__u32                  starting_psn;
+	__u8  responder_resources;
+	__u8  initiator_depth;
+	__u8  local_cm_response_timeout;
+	__u8  flow_control;
+	__u8  remote_cm_response_timeout;
+	__u8  retry_count;
+	__u8  rnr_retry_count;
+	__u8  srq;
+	__u8  port;
+	__u8  reserved[7];
+};
+
+struct ib_ucm_rep_event_resp {
+	__be64 remote_ca_guid;
+	__u32 remote_qkey;
+	__u32 remote_qpn;
+	__u32 starting_psn;
+	__u8  responder_resources;
+	__u8  initiator_depth;
+	__u8  target_ack_delay;
+	__u8  failover_accepted;
+	__u8  flow_control;
+	__u8  rnr_retry_count;
+	__u8  srq;
+	__u8  reserved[5];
+};
+
+struct ib_ucm_rej_event_resp {
+	__u32 reason;
+	/* ari in ib_ucm_event_get info field. */
+};
+
+struct ib_ucm_mra_event_resp {
+	__u8  timeout;
+	__u8  reserved[3];
+};
+
+struct ib_ucm_lap_event_resp {
+	struct ib_user_path_rec path;
+};
+
+struct ib_ucm_apr_event_resp {
+	__u32 status;
+	/* apr info in ib_ucm_event_get info field. */
+};
+
+struct ib_ucm_sidr_req_event_resp {
+	__u16 pkey;
+	__u8  port;
+	__u8  reserved;
+};
+
+struct ib_ucm_sidr_rep_event_resp {
+	__u32 status;
+	__u32 qkey;
+	__u32 qpn;
+	/* info in ib_ucm_event_get info field. */
+};
+
+#define IB_UCM_PRES_DATA      0x01
+#define IB_UCM_PRES_INFO      0x02
+#define IB_UCM_PRES_PRIMARY   0x04
+#define IB_UCM_PRES_ALTERNATE 0x08
+
+struct ib_ucm_event_resp {
+	__u64 uid;
+	__u32 id;
+	__u32 event;
+	__u32 present;
+	__u32 reserved;
+	union {
+		struct ib_ucm_req_event_resp req_resp;
+		struct ib_ucm_rep_event_resp rep_resp;
+		struct ib_ucm_rej_event_resp rej_resp;
+		struct ib_ucm_mra_event_resp mra_resp;
+		struct ib_ucm_lap_event_resp lap_resp;
+		struct ib_ucm_apr_event_resp apr_resp;
+
+		struct ib_ucm_sidr_req_event_resp sidr_req_resp;
+		struct ib_ucm_sidr_rep_event_resp sidr_rep_resp;
+
+		__u32                             send_status;
+	} u;
+};
+
+#endif /* IB_USER_CM_H */
diff --git a/include/uapi/rdma/ib_user_mad.h b/include/uapi/rdma/ib_user_mad.h
new file mode 100644
index 0000000..09f809f
--- /dev/null
+++ b/include/uapi/rdma/ib_user_mad.h
@@ -0,0 +1,245 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef IB_USER_MAD_H
+#define IB_USER_MAD_H
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+
+/*
+ * Increment this value if any changes that break userspace ABI
+ * compatibility are made.
+ */
+#define IB_USER_MAD_ABI_VERSION	5
+
+/*
+ * Make sure that all structs defined in this file remain laid out so
+ * that they pack the same way on 32-bit and 64-bit architectures (to
+ * avoid incompatibility between 32-bit userspace and 64-bit kernels).
+ */
+
+/**
+ * ib_user_mad_hdr_old - Old version of MAD packet header without pkey_index
+ * @id - ID of agent MAD received with/to be sent with
+ * @status - 0 on successful receive, ETIMEDOUT if no response
+ *   received (transaction ID in data[] will be set to TID of original
+ *   request) (ignored on send)
+ * @timeout_ms - Milliseconds to wait for response (unset on receive)
+ * @retries - Number of automatic retries to attempt
+ * @qpn - Remote QP number received from/to be sent to
+ * @qkey - Remote Q_Key to be sent with (unset on receive)
+ * @lid - Remote lid received from/to be sent to
+ * @sl - Service level received with/to be sent with
+ * @path_bits - Local path bits received with/to be sent with
+ * @grh_present - If set, GRH was received/should be sent
+ * @gid_index - Local GID index to send with (unset on receive)
+ * @hop_limit - Hop limit in GRH
+ * @traffic_class - Traffic class in GRH
+ * @gid - Remote GID in GRH
+ * @flow_label - Flow label in GRH
+ */
+struct ib_user_mad_hdr_old {
+	__u32	id;
+	__u32	status;
+	__u32	timeout_ms;
+	__u32	retries;
+	__u32	length;
+	__be32	qpn;
+	__be32  qkey;
+	__be16	lid;
+	__u8	sl;
+	__u8	path_bits;
+	__u8	grh_present;
+	__u8	gid_index;
+	__u8	hop_limit;
+	__u8	traffic_class;
+	__u8	gid[16];
+	__be32	flow_label;
+};
+
+/**
+ * ib_user_mad_hdr - MAD packet header
+ *   This layout allows specifying/receiving the P_Key index.  To use
+ *   this capability, an application must call the
+ *   IB_USER_MAD_ENABLE_PKEY ioctl on the user MAD file handle before
+ *   any other actions with the file handle.
+ * @id - ID of agent MAD received with/to be sent with
+ * @status - 0 on successful receive, ETIMEDOUT if no response
+ *   received (transaction ID in data[] will be set to TID of original
+ *   request) (ignored on send)
+ * @timeout_ms - Milliseconds to wait for response (unset on receive)
+ * @retries - Number of automatic retries to attempt
+ * @qpn - Remote QP number received from/to be sent to
+ * @qkey - Remote Q_Key to be sent with (unset on receive)
+ * @lid - Remote lid received from/to be sent to
+ * @sl - Service level received with/to be sent with
+ * @path_bits - Local path bits received with/to be sent with
+ * @grh_present - If set, GRH was received/should be sent
+ * @gid_index - Local GID index to send with (unset on receive)
+ * @hop_limit - Hop limit in GRH
+ * @traffic_class - Traffic class in GRH
+ * @gid - Remote GID in GRH
+ * @flow_label - Flow label in GRH
+ * @pkey_index - P_Key index
+ */
+struct ib_user_mad_hdr {
+	__u32	id;
+	__u32	status;
+	__u32	timeout_ms;
+	__u32	retries;
+	__u32	length;
+	__be32	qpn;
+	__be32  qkey;
+	__be16	lid;
+	__u8	sl;
+	__u8	path_bits;
+	__u8	grh_present;
+	__u8	gid_index;
+	__u8	hop_limit;
+	__u8	traffic_class;
+	__u8	gid[16];
+	__be32	flow_label;
+	__u16	pkey_index;
+	__u8	reserved[6];
+};
+
+/**
+ * ib_user_mad - MAD packet
+ * @hdr - MAD packet header
+ * @data - Contents of MAD
+ *
+ */
+struct ib_user_mad {
+	struct ib_user_mad_hdr hdr;
+	__u64	data[0];
+};
+
+/*
+ * Earlier versions of this interface definition declared the
+ * method_mask[] member as an array of __u32 but treated it as a
+ * bitmap made up of longs in the kernel.  This ambiguity meant that
+ * 32-bit big-endian applications that can run on both 32-bit and
+ * 64-bit kernels had no consistent ABI to rely on, and 64-bit
+ * big-endian applications that treated method_mask as being made up
+ * of 32-bit words would have their bitmap misinterpreted.
+ *
+ * To clear up this confusion, we change the declaration of
+ * method_mask[] to use unsigned long and handle the conversion from
+ * 32-bit userspace to 64-bit kernel for big-endian systems in the
+ * compat_ioctl method.  Unfortunately, to keep the structure layout
+ * the same, we need the method_mask[] array to be aligned only to 4
+ * bytes even when long is 64 bits, which forces us into this ugly
+ * typedef.
+ */
+typedef unsigned long __attribute__((aligned(4))) packed_ulong;
+#define IB_USER_MAD_LONGS_PER_METHOD_MASK (128 / (8 * sizeof (long)))
+
+/**
+ * ib_user_mad_reg_req - MAD registration request
+ * @id - Set by the kernel; used to identify agent in future requests.
+ * @qpn - Queue pair number; must be 0 or 1.
+ * @method_mask - The caller will receive unsolicited MADs for any method
+ *   where @method_mask = 1.
+ * @mgmt_class - Indicates which management class of MADs should be receive
+ *   by the caller.  This field is only required if the user wishes to
+ *   receive unsolicited MADs, otherwise it should be 0.
+ * @mgmt_class_version - Indicates which version of MADs for the given
+ *   management class to receive.
+ * @oui: Indicates IEEE OUI when mgmt_class is a vendor class
+ *   in the range from 0x30 to 0x4f. Otherwise not used.
+ * @rmpp_version: If set, indicates the RMPP version used.
+ *
+ */
+struct ib_user_mad_reg_req {
+	__u32	id;
+	packed_ulong method_mask[IB_USER_MAD_LONGS_PER_METHOD_MASK];
+	__u8	qpn;
+	__u8	mgmt_class;
+	__u8	mgmt_class_version;
+	__u8    oui[3];
+	__u8	rmpp_version;
+};
+
+/**
+ * ib_user_mad_reg_req2 - MAD registration request
+ *
+ * @id                 - Set by the _kernel_; used by userspace to identify the
+ *                       registered agent in future requests.
+ * @qpn                - Queue pair number; must be 0 or 1.
+ * @mgmt_class         - Indicates which management class of MADs should be
+ *                       receive by the caller.  This field is only required if
+ *                       the user wishes to receive unsolicited MADs, otherwise
+ *                       it should be 0.
+ * @mgmt_class_version - Indicates which version of MADs for the given
+ *                       management class to receive.
+ * @res                - Ignored.
+ * @flags              - additional registration flags; Must be in the set of
+ *                       flags defined in IB_USER_MAD_REG_FLAGS_CAP
+ * @method_mask        - The caller wishes to receive unsolicited MADs for the
+ *                       methods whose bit(s) is(are) set.
+ * @oui                - Indicates IEEE OUI to use when mgmt_class is a vendor
+ *                       class in the range from 0x30 to 0x4f. Otherwise not
+ *                       used.
+ * @rmpp_version       - If set, indicates the RMPP version to use.
+ */
+enum {
+	IB_USER_MAD_USER_RMPP = (1 << 0),
+};
+#define IB_USER_MAD_REG_FLAGS_CAP (IB_USER_MAD_USER_RMPP)
+struct ib_user_mad_reg_req2 {
+	__u32	id;
+	__u32	qpn;
+	__u8	mgmt_class;
+	__u8	mgmt_class_version;
+	__u16   res;
+	__u32   flags;
+	__u64   method_mask[2];
+	__u32   oui;
+	__u8	rmpp_version;
+	__u8	reserved[3];
+};
+
+#define IB_IOCTL_MAGIC		0x1b
+
+#define IB_USER_MAD_REGISTER_AGENT	_IOWR(IB_IOCTL_MAGIC, 1, \
+					      struct ib_user_mad_reg_req)
+
+#define IB_USER_MAD_UNREGISTER_AGENT	_IOW(IB_IOCTL_MAGIC, 2, __u32)
+
+#define IB_USER_MAD_ENABLE_PKEY		_IO(IB_IOCTL_MAGIC, 3)
+
+#define IB_USER_MAD_REGISTER_AGENT2     _IOWR(IB_IOCTL_MAGIC, 4, \
+					      struct ib_user_mad_reg_req2)
+
+#endif /* IB_USER_MAD_H */
diff --git a/include/uapi/rdma/ib_user_sa.h b/include/uapi/rdma/ib_user_sa.h
new file mode 100644
index 0000000..cfc7c9b
--- /dev/null
+++ b/include/uapi/rdma/ib_user_sa.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2005 Intel Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef IB_USER_SA_H
+#define IB_USER_SA_H
+
+#include <linux/types.h>
+
+enum {
+	IB_PATH_GMP		= 1,
+	IB_PATH_PRIMARY		= (1<<1),
+	IB_PATH_ALTERNATE	= (1<<2),
+	IB_PATH_OUTBOUND	= (1<<3),
+	IB_PATH_INBOUND		= (1<<4),
+	IB_PATH_INBOUND_REVERSE = (1<<5),
+	IB_PATH_BIDIRECTIONAL	= IB_PATH_OUTBOUND | IB_PATH_INBOUND_REVERSE
+};
+
+struct ib_path_rec_data {
+	__u32	flags;
+	__u32	reserved;
+	__u32	path_rec[16];
+};
+
+struct ib_user_path_rec {
+	__u8	dgid[16];
+	__u8	sgid[16];
+	__be16	dlid;
+	__be16	slid;
+	__u32	raw_traffic;
+	__be32	flow_label;
+	__u32	reversible;
+	__u32	mtu;
+	__be16	pkey;
+	__u8	hop_limit;
+	__u8	traffic_class;
+	__u8	numb_path;
+	__u8	sl;
+	__u8	mtu_selector;
+	__u8	rate_selector;
+	__u8	rate;
+	__u8	packet_life_time_selector;
+	__u8	packet_life_time;
+	__u8	preference;
+};
+
+#endif /* IB_USER_SA_H */
diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h
new file mode 100644
index 0000000..26daf55
--- /dev/null
+++ b/include/uapi/rdma/ib_user_verbs.h
@@ -0,0 +1,880 @@
+/*
+ * Copyright (c) 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005, 2006 Cisco Systems.  All rights reserved.
+ * Copyright (c) 2005 PathScale, Inc.  All rights reserved.
+ * Copyright (c) 2006 Mellanox Technologies.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef IB_USER_VERBS_H
+#define IB_USER_VERBS_H
+
+#include <linux/types.h>
+
+/*
+ * Increment this value if any changes that break userspace ABI
+ * compatibility are made.
+ */
+#define IB_USER_VERBS_ABI_VERSION	6
+#define IB_USER_VERBS_CMD_THRESHOLD    50
+
+enum {
+	IB_USER_VERBS_CMD_GET_CONTEXT,
+	IB_USER_VERBS_CMD_QUERY_DEVICE,
+	IB_USER_VERBS_CMD_QUERY_PORT,
+	IB_USER_VERBS_CMD_ALLOC_PD,
+	IB_USER_VERBS_CMD_DEALLOC_PD,
+	IB_USER_VERBS_CMD_CREATE_AH,
+	IB_USER_VERBS_CMD_MODIFY_AH,
+	IB_USER_VERBS_CMD_QUERY_AH,
+	IB_USER_VERBS_CMD_DESTROY_AH,
+	IB_USER_VERBS_CMD_REG_MR,
+	IB_USER_VERBS_CMD_REG_SMR,
+	IB_USER_VERBS_CMD_REREG_MR,
+	IB_USER_VERBS_CMD_QUERY_MR,
+	IB_USER_VERBS_CMD_DEREG_MR,
+	IB_USER_VERBS_CMD_ALLOC_MW,
+	IB_USER_VERBS_CMD_BIND_MW,
+	IB_USER_VERBS_CMD_DEALLOC_MW,
+	IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL,
+	IB_USER_VERBS_CMD_CREATE_CQ,
+	IB_USER_VERBS_CMD_RESIZE_CQ,
+	IB_USER_VERBS_CMD_DESTROY_CQ,
+	IB_USER_VERBS_CMD_POLL_CQ,
+	IB_USER_VERBS_CMD_PEEK_CQ,
+	IB_USER_VERBS_CMD_REQ_NOTIFY_CQ,
+	IB_USER_VERBS_CMD_CREATE_QP,
+	IB_USER_VERBS_CMD_QUERY_QP,
+	IB_USER_VERBS_CMD_MODIFY_QP,
+	IB_USER_VERBS_CMD_DESTROY_QP,
+	IB_USER_VERBS_CMD_POST_SEND,
+	IB_USER_VERBS_CMD_POST_RECV,
+	IB_USER_VERBS_CMD_ATTACH_MCAST,
+	IB_USER_VERBS_CMD_DETACH_MCAST,
+	IB_USER_VERBS_CMD_CREATE_SRQ,
+	IB_USER_VERBS_CMD_MODIFY_SRQ,
+	IB_USER_VERBS_CMD_QUERY_SRQ,
+	IB_USER_VERBS_CMD_DESTROY_SRQ,
+	IB_USER_VERBS_CMD_POST_SRQ_RECV,
+	IB_USER_VERBS_CMD_OPEN_XRCD,
+	IB_USER_VERBS_CMD_CLOSE_XRCD,
+	IB_USER_VERBS_CMD_CREATE_XSRQ,
+	IB_USER_VERBS_CMD_OPEN_QP,
+};
+
+enum {
+	IB_USER_VERBS_EX_CMD_CREATE_FLOW = IB_USER_VERBS_CMD_THRESHOLD,
+	IB_USER_VERBS_EX_CMD_DESTROY_FLOW
+};
+
+/*
+ * Make sure that all structs defined in this file remain laid out so
+ * that they pack the same way on 32-bit and 64-bit architectures (to
+ * avoid incompatibility between 32-bit userspace and 64-bit kernels).
+ * Specifically:
+ *  - Do not use pointer types -- pass pointers in __u64 instead.
+ *  - Make sure that any structure larger than 4 bytes is padded to a
+ *    multiple of 8 bytes.  Otherwise the structure size will be
+ *    different between 32-bit and 64-bit architectures.
+ */
+
+struct ib_uverbs_async_event_desc {
+	__u64 element;
+	__u32 event_type;	/* enum ib_event_type */
+	__u32 reserved;
+};
+
+struct ib_uverbs_comp_event_desc {
+	__u64 cq_handle;
+};
+
+/*
+ * All commands from userspace should start with a __u32 command field
+ * followed by __u16 in_words and out_words fields (which give the
+ * length of the command block and response buffer if any in 32-bit
+ * words).  The kernel driver will read these fields first and read
+ * the rest of the command struct based on these value.
+ */
+
+#define IB_USER_VERBS_CMD_COMMAND_MASK 0xff
+#define IB_USER_VERBS_CMD_FLAGS_MASK 0xff000000u
+#define IB_USER_VERBS_CMD_FLAGS_SHIFT 24
+
+#define IB_USER_VERBS_CMD_FLAG_EXTENDED 0x80
+
+struct ib_uverbs_cmd_hdr {
+	__u32 command;
+	__u16 in_words;
+	__u16 out_words;
+};
+
+struct ib_uverbs_ex_cmd_hdr {
+	__u64 response;
+	__u16 provider_in_words;
+	__u16 provider_out_words;
+	__u32 cmd_hdr_reserved;
+};
+
+struct ib_uverbs_get_context {
+	__u64 response;
+	__u64 driver_data[0];
+};
+
+struct ib_uverbs_get_context_resp {
+	__u32 async_fd;
+	__u32 num_comp_vectors;
+};
+
+struct ib_uverbs_query_device {
+	__u64 response;
+	__u64 driver_data[0];
+};
+
+struct ib_uverbs_query_device_resp {
+	__u64 fw_ver;
+	__be64 node_guid;
+	__be64 sys_image_guid;
+	__u64 max_mr_size;
+	__u64 page_size_cap;
+	__u32 vendor_id;
+	__u32 vendor_part_id;
+	__u32 hw_ver;
+	__u32 max_qp;
+	__u32 max_qp_wr;
+	__u32 device_cap_flags;
+	__u32 max_sge;
+	__u32 max_sge_rd;
+	__u32 max_cq;
+	__u32 max_cqe;
+	__u32 max_mr;
+	__u32 max_pd;
+	__u32 max_qp_rd_atom;
+	__u32 max_ee_rd_atom;
+	__u32 max_res_rd_atom;
+	__u32 max_qp_init_rd_atom;
+	__u32 max_ee_init_rd_atom;
+	__u32 atomic_cap;
+	__u32 max_ee;
+	__u32 max_rdd;
+	__u32 max_mw;
+	__u32 max_raw_ipv6_qp;
+	__u32 max_raw_ethy_qp;
+	__u32 max_mcast_grp;
+	__u32 max_mcast_qp_attach;
+	__u32 max_total_mcast_qp_attach;
+	__u32 max_ah;
+	__u32 max_fmr;
+	__u32 max_map_per_fmr;
+	__u32 max_srq;
+	__u32 max_srq_wr;
+	__u32 max_srq_sge;
+	__u16 max_pkeys;
+	__u8  local_ca_ack_delay;
+	__u8  phys_port_cnt;
+	__u8  reserved[4];
+};
+
+struct ib_uverbs_query_port {
+	__u64 response;
+	__u8  port_num;
+	__u8  reserved[7];
+	__u64 driver_data[0];
+};
+
+struct ib_uverbs_query_port_resp {
+	__u32 port_cap_flags;
+	__u32 max_msg_sz;
+	__u32 bad_pkey_cntr;
+	__u32 qkey_viol_cntr;
+	__u32 gid_tbl_len;
+	__u16 pkey_tbl_len;
+	__u16 lid;
+	__u16 sm_lid;
+	__u8  state;
+	__u8  max_mtu;
+	__u8  active_mtu;
+	__u8  lmc;
+	__u8  max_vl_num;
+	__u8  sm_sl;
+	__u8  subnet_timeout;
+	__u8  init_type_reply;
+	__u8  active_width;
+	__u8  active_speed;
+	__u8  phys_state;
+	__u8  link_layer;
+	__u8  reserved[2];
+};
+
+struct ib_uverbs_alloc_pd {
+	__u64 response;
+	__u64 driver_data[0];
+};
+
+struct ib_uverbs_alloc_pd_resp {
+	__u32 pd_handle;
+};
+
+struct ib_uverbs_dealloc_pd {
+	__u32 pd_handle;
+};
+
+struct ib_uverbs_open_xrcd {
+	__u64 response;
+	__u32 fd;
+	__u32 oflags;
+	__u64 driver_data[0];
+};
+
+struct ib_uverbs_open_xrcd_resp {
+	__u32 xrcd_handle;
+};
+
+struct ib_uverbs_close_xrcd {
+	__u32 xrcd_handle;
+};
+
+struct ib_uverbs_reg_mr {
+	__u64 response;
+	__u64 start;
+	__u64 length;
+	__u64 hca_va;
+	__u32 pd_handle;
+	__u32 access_flags;
+	__u64 driver_data[0];
+};
+
+struct ib_uverbs_reg_mr_resp {
+	__u32 mr_handle;
+	__u32 lkey;
+	__u32 rkey;
+};
+
+struct ib_uverbs_rereg_mr {
+	__u64 response;
+	__u32 mr_handle;
+	__u32 flags;
+	__u64 start;
+	__u64 length;
+	__u64 hca_va;
+	__u32 pd_handle;
+	__u32 access_flags;
+};
+
+struct ib_uverbs_rereg_mr_resp {
+	__u32 lkey;
+	__u32 rkey;
+};
+
+struct ib_uverbs_dereg_mr {
+	__u32 mr_handle;
+};
+
+struct ib_uverbs_alloc_mw {
+	__u64 response;
+	__u32 pd_handle;
+	__u8  mw_type;
+	__u8  reserved[3];
+};
+
+struct ib_uverbs_alloc_mw_resp {
+	__u32 mw_handle;
+	__u32 rkey;
+};
+
+struct ib_uverbs_dealloc_mw {
+	__u32 mw_handle;
+};
+
+struct ib_uverbs_create_comp_channel {
+	__u64 response;
+};
+
+struct ib_uverbs_create_comp_channel_resp {
+	__u32 fd;
+};
+
+struct ib_uverbs_create_cq {
+	__u64 response;
+	__u64 user_handle;
+	__u32 cqe;
+	__u32 comp_vector;
+	__s32 comp_channel;
+	__u32 reserved;
+	__u64 driver_data[0];
+};
+
+struct ib_uverbs_create_cq_resp {
+	__u32 cq_handle;
+	__u32 cqe;
+};
+
+struct ib_uverbs_resize_cq {
+	__u64 response;
+	__u32 cq_handle;
+	__u32 cqe;
+	__u64 driver_data[0];
+};
+
+struct ib_uverbs_resize_cq_resp {
+	__u32 cqe;
+	__u32 reserved;
+	__u64 driver_data[0];
+};
+
+struct ib_uverbs_poll_cq {
+	__u64 response;
+	__u32 cq_handle;
+	__u32 ne;
+};
+
+struct ib_uverbs_wc {
+	__u64 wr_id;
+	__u32 status;
+	__u32 opcode;
+	__u32 vendor_err;
+	__u32 byte_len;
+	union {
+		__u32 imm_data;
+		__u32 invalidate_rkey;
+	} ex;
+	__u32 qp_num;
+	__u32 src_qp;
+	__u32 wc_flags;
+	__u16 pkey_index;
+	__u16 slid;
+	__u8 sl;
+	__u8 dlid_path_bits;
+	__u8 port_num;
+	__u8 reserved;
+};
+
+struct ib_uverbs_poll_cq_resp {
+	__u32 count;
+	__u32 reserved;
+	struct ib_uverbs_wc wc[0];
+};
+
+struct ib_uverbs_req_notify_cq {
+	__u32 cq_handle;
+	__u32 solicited_only;
+};
+
+struct ib_uverbs_destroy_cq {
+	__u64 response;
+	__u32 cq_handle;
+	__u32 reserved;
+};
+
+struct ib_uverbs_destroy_cq_resp {
+	__u32 comp_events_reported;
+	__u32 async_events_reported;
+};
+
+struct ib_uverbs_global_route {
+	__u8  dgid[16];
+	__u32 flow_label;
+	__u8  sgid_index;
+	__u8  hop_limit;
+	__u8  traffic_class;
+	__u8  reserved;
+};
+
+struct ib_uverbs_ah_attr {
+	struct ib_uverbs_global_route grh;
+	__u16 dlid;
+	__u8  sl;
+	__u8  src_path_bits;
+	__u8  static_rate;
+	__u8  is_global;
+	__u8  port_num;
+	__u8  reserved;
+};
+
+struct ib_uverbs_qp_attr {
+	__u32	qp_attr_mask;
+	__u32	qp_state;
+	__u32	cur_qp_state;
+	__u32	path_mtu;
+	__u32	path_mig_state;
+	__u32	qkey;
+	__u32	rq_psn;
+	__u32	sq_psn;
+	__u32	dest_qp_num;
+	__u32	qp_access_flags;
+
+	struct ib_uverbs_ah_attr ah_attr;
+	struct ib_uverbs_ah_attr alt_ah_attr;
+
+	/* ib_qp_cap */
+	__u32	max_send_wr;
+	__u32	max_recv_wr;
+	__u32	max_send_sge;
+	__u32	max_recv_sge;
+	__u32	max_inline_data;
+
+	__u16	pkey_index;
+	__u16	alt_pkey_index;
+	__u8	en_sqd_async_notify;
+	__u8	sq_draining;
+	__u8	max_rd_atomic;
+	__u8	max_dest_rd_atomic;
+	__u8	min_rnr_timer;
+	__u8	port_num;
+	__u8	timeout;
+	__u8	retry_cnt;
+	__u8	rnr_retry;
+	__u8	alt_port_num;
+	__u8	alt_timeout;
+	__u8	reserved[5];
+};
+
+struct ib_uverbs_create_qp {
+	__u64 response;
+	__u64 user_handle;
+	__u32 pd_handle;
+	__u32 send_cq_handle;
+	__u32 recv_cq_handle;
+	__u32 srq_handle;
+	__u32 max_send_wr;
+	__u32 max_recv_wr;
+	__u32 max_send_sge;
+	__u32 max_recv_sge;
+	__u32 max_inline_data;
+	__u8  sq_sig_all;
+	__u8  qp_type;
+	__u8  is_srq;
+	__u8  reserved;
+	__u64 driver_data[0];
+};
+
+struct ib_uverbs_open_qp {
+	__u64 response;
+	__u64 user_handle;
+	__u32 pd_handle;
+	__u32 qpn;
+	__u8  qp_type;
+	__u8  reserved[7];
+	__u64 driver_data[0];
+};
+
+/* also used for open response */
+struct ib_uverbs_create_qp_resp {
+	__u32 qp_handle;
+	__u32 qpn;
+	__u32 max_send_wr;
+	__u32 max_recv_wr;
+	__u32 max_send_sge;
+	__u32 max_recv_sge;
+	__u32 max_inline_data;
+	__u32 reserved;
+};
+
+/*
+ * This struct needs to remain a multiple of 8 bytes to keep the
+ * alignment of the modify QP parameters.
+ */
+struct ib_uverbs_qp_dest {
+	__u8  dgid[16];
+	__u32 flow_label;
+	__u16 dlid;
+	__u16 reserved;
+	__u8  sgid_index;
+	__u8  hop_limit;
+	__u8  traffic_class;
+	__u8  sl;
+	__u8  src_path_bits;
+	__u8  static_rate;
+	__u8  is_global;
+	__u8  port_num;
+};
+
+struct ib_uverbs_query_qp {
+	__u64 response;
+	__u32 qp_handle;
+	__u32 attr_mask;
+	__u64 driver_data[0];
+};
+
+struct ib_uverbs_query_qp_resp {
+	struct ib_uverbs_qp_dest dest;
+	struct ib_uverbs_qp_dest alt_dest;
+	__u32 max_send_wr;
+	__u32 max_recv_wr;
+	__u32 max_send_sge;
+	__u32 max_recv_sge;
+	__u32 max_inline_data;
+	__u32 qkey;
+	__u32 rq_psn;
+	__u32 sq_psn;
+	__u32 dest_qp_num;
+	__u32 qp_access_flags;
+	__u16 pkey_index;
+	__u16 alt_pkey_index;
+	__u8  qp_state;
+	__u8  cur_qp_state;
+	__u8  path_mtu;
+	__u8  path_mig_state;
+	__u8  sq_draining;
+	__u8  max_rd_atomic;
+	__u8  max_dest_rd_atomic;
+	__u8  min_rnr_timer;
+	__u8  port_num;
+	__u8  timeout;
+	__u8  retry_cnt;
+	__u8  rnr_retry;
+	__u8  alt_port_num;
+	__u8  alt_timeout;
+	__u8  sq_sig_all;
+	__u8  reserved[5];
+	__u64 driver_data[0];
+};
+
+struct ib_uverbs_modify_qp {
+	struct ib_uverbs_qp_dest dest;
+	struct ib_uverbs_qp_dest alt_dest;
+	__u32 qp_handle;
+	__u32 attr_mask;
+	__u32 qkey;
+	__u32 rq_psn;
+	__u32 sq_psn;
+	__u32 dest_qp_num;
+	__u32 qp_access_flags;
+	__u16 pkey_index;
+	__u16 alt_pkey_index;
+	__u8  qp_state;
+	__u8  cur_qp_state;
+	__u8  path_mtu;
+	__u8  path_mig_state;
+	__u8  en_sqd_async_notify;
+	__u8  max_rd_atomic;
+	__u8  max_dest_rd_atomic;
+	__u8  min_rnr_timer;
+	__u8  port_num;
+	__u8  timeout;
+	__u8  retry_cnt;
+	__u8  rnr_retry;
+	__u8  alt_port_num;
+	__u8  alt_timeout;
+	__u8  reserved[2];
+	__u64 driver_data[0];
+};
+
+struct ib_uverbs_modify_qp_resp {
+};
+
+struct ib_uverbs_destroy_qp {
+	__u64 response;
+	__u32 qp_handle;
+	__u32 reserved;
+};
+
+struct ib_uverbs_destroy_qp_resp {
+	__u32 events_reported;
+};
+
+/*
+ * The ib_uverbs_sge structure isn't used anywhere, since we assume
+ * the ib_sge structure is packed the same way on 32-bit and 64-bit
+ * architectures in both kernel and user space.  It's just here to
+ * document the ABI.
+ */
+struct ib_uverbs_sge {
+	__u64 addr;
+	__u32 length;
+	__u32 lkey;
+};
+
+struct ib_uverbs_send_wr {
+	__u64 wr_id;
+	__u32 num_sge;
+	__u32 opcode;
+	__u32 send_flags;
+	union {
+		__u32 imm_data;
+		__u32 invalidate_rkey;
+	} ex;
+	union {
+		struct {
+			__u64 remote_addr;
+			__u32 rkey;
+			__u32 reserved;
+		} rdma;
+		struct {
+			__u64 remote_addr;
+			__u64 compare_add;
+			__u64 swap;
+			__u32 rkey;
+			__u32 reserved;
+		} atomic;
+		struct {
+			__u32 ah;
+			__u32 remote_qpn;
+			__u32 remote_qkey;
+			__u32 reserved;
+		} ud;
+	} wr;
+};
+
+struct ib_uverbs_post_send {
+	__u64 response;
+	__u32 qp_handle;
+	__u32 wr_count;
+	__u32 sge_count;
+	__u32 wqe_size;
+	struct ib_uverbs_send_wr send_wr[0];
+};
+
+struct ib_uverbs_post_send_resp {
+	__u32 bad_wr;
+};
+
+struct ib_uverbs_recv_wr {
+	__u64 wr_id;
+	__u32 num_sge;
+	__u32 reserved;
+};
+
+struct ib_uverbs_post_recv {
+	__u64 response;
+	__u32 qp_handle;
+	__u32 wr_count;
+	__u32 sge_count;
+	__u32 wqe_size;
+	struct ib_uverbs_recv_wr recv_wr[0];
+};
+
+struct ib_uverbs_post_recv_resp {
+	__u32 bad_wr;
+};
+
+struct ib_uverbs_post_srq_recv {
+	__u64 response;
+	__u32 srq_handle;
+	__u32 wr_count;
+	__u32 sge_count;
+	__u32 wqe_size;
+	struct ib_uverbs_recv_wr recv[0];
+};
+
+struct ib_uverbs_post_srq_recv_resp {
+	__u32 bad_wr;
+};
+
+struct ib_uverbs_create_ah {
+	__u64 response;
+	__u64 user_handle;
+	__u32 pd_handle;
+	__u32 reserved;
+	struct ib_uverbs_ah_attr attr;
+};
+
+struct ib_uverbs_create_ah_resp {
+	__u32 ah_handle;
+};
+
+struct ib_uverbs_destroy_ah {
+	__u32 ah_handle;
+};
+
+struct ib_uverbs_attach_mcast {
+	__u8  gid[16];
+	__u32 qp_handle;
+	__u16 mlid;
+	__u16 reserved;
+	__u64 driver_data[0];
+};
+
+struct ib_uverbs_detach_mcast {
+	__u8  gid[16];
+	__u32 qp_handle;
+	__u16 mlid;
+	__u16 reserved;
+	__u64 driver_data[0];
+};
+
+struct ib_uverbs_flow_spec_hdr {
+	__u32 type;
+	__u16 size;
+	__u16 reserved;
+	/* followed by flow_spec */
+	__u64 flow_spec_data[0];
+};
+
+struct ib_uverbs_flow_eth_filter {
+	__u8  dst_mac[6];
+	__u8  src_mac[6];
+	__be16 ether_type;
+	__be16 vlan_tag;
+};
+
+struct ib_uverbs_flow_spec_eth {
+	union {
+		struct ib_uverbs_flow_spec_hdr hdr;
+		struct {
+			__u32 type;
+			__u16 size;
+			__u16 reserved;
+		};
+	};
+	struct ib_uverbs_flow_eth_filter val;
+	struct ib_uverbs_flow_eth_filter mask;
+};
+
+struct ib_uverbs_flow_ipv4_filter {
+	__be32 src_ip;
+	__be32 dst_ip;
+};
+
+struct ib_uverbs_flow_spec_ipv4 {
+	union {
+		struct ib_uverbs_flow_spec_hdr hdr;
+		struct {
+			__u32 type;
+			__u16 size;
+			__u16 reserved;
+		};
+	};
+	struct ib_uverbs_flow_ipv4_filter val;
+	struct ib_uverbs_flow_ipv4_filter mask;
+};
+
+struct ib_uverbs_flow_tcp_udp_filter {
+	__be16 dst_port;
+	__be16 src_port;
+};
+
+struct ib_uverbs_flow_spec_tcp_udp {
+	union {
+		struct ib_uverbs_flow_spec_hdr hdr;
+		struct {
+			__u32 type;
+			__u16 size;
+			__u16 reserved;
+		};
+	};
+	struct ib_uverbs_flow_tcp_udp_filter val;
+	struct ib_uverbs_flow_tcp_udp_filter mask;
+};
+
+struct ib_uverbs_flow_attr {
+	__u32 type;
+	__u16 size;
+	__u16 priority;
+	__u8  num_of_specs;
+	__u8  reserved[2];
+	__u8  port;
+	__u32 flags;
+	/* Following are the optional layers according to user request
+	 * struct ib_flow_spec_xxx
+	 * struct ib_flow_spec_yyy
+	 */
+	struct ib_uverbs_flow_spec_hdr flow_specs[0];
+};
+
+struct ib_uverbs_create_flow  {
+	__u32 comp_mask;
+	__u32 qp_handle;
+	struct ib_uverbs_flow_attr flow_attr;
+};
+
+struct ib_uverbs_create_flow_resp {
+	__u32 comp_mask;
+	__u32 flow_handle;
+};
+
+struct ib_uverbs_destroy_flow  {
+	__u32 comp_mask;
+	__u32 flow_handle;
+};
+
+struct ib_uverbs_create_srq {
+	__u64 response;
+	__u64 user_handle;
+	__u32 pd_handle;
+	__u32 max_wr;
+	__u32 max_sge;
+	__u32 srq_limit;
+	__u64 driver_data[0];
+};
+
+struct ib_uverbs_create_xsrq {
+	__u64 response;
+	__u64 user_handle;
+	__u32 srq_type;
+	__u32 pd_handle;
+	__u32 max_wr;
+	__u32 max_sge;
+	__u32 srq_limit;
+	__u32 reserved;
+	__u32 xrcd_handle;
+	__u32 cq_handle;
+	__u64 driver_data[0];
+};
+
+struct ib_uverbs_create_srq_resp {
+	__u32 srq_handle;
+	__u32 max_wr;
+	__u32 max_sge;
+	__u32 srqn;
+};
+
+struct ib_uverbs_modify_srq {
+	__u32 srq_handle;
+	__u32 attr_mask;
+	__u32 max_wr;
+	__u32 srq_limit;
+	__u64 driver_data[0];
+};
+
+struct ib_uverbs_query_srq {
+	__u64 response;
+	__u32 srq_handle;
+	__u32 reserved;
+	__u64 driver_data[0];
+};
+
+struct ib_uverbs_query_srq_resp {
+	__u32 max_wr;
+	__u32 max_sge;
+	__u32 srq_limit;
+	__u32 reserved;
+};
+
+struct ib_uverbs_destroy_srq {
+	__u64 response;
+	__u32 srq_handle;
+	__u32 reserved;
+};
+
+struct ib_uverbs_destroy_srq_resp {
+	__u32 events_reported;
+};
+
+#endif /* IB_USER_VERBS_H */
diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h
new file mode 100644
index 0000000..de69170
--- /dev/null
+++ b/include/uapi/rdma/rdma_netlink.h
@@ -0,0 +1,131 @@
+#ifndef _UAPI_RDMA_NETLINK_H
+#define _UAPI_RDMA_NETLINK_H
+
+#include <linux/types.h>
+
+enum {
+	RDMA_NL_RDMA_CM = 1,
+	RDMA_NL_NES,
+	RDMA_NL_C4IW,
+	RDMA_NL_NUM_CLIENTS
+};
+
+enum {
+	RDMA_NL_GROUP_CM = 1,
+	RDMA_NL_GROUP_IWPM,
+	RDMA_NL_NUM_GROUPS
+};
+
+#define RDMA_NL_GET_CLIENT(type) ((type & (((1 << 6) - 1) << 10)) >> 10)
+#define RDMA_NL_GET_OP(type) (type & ((1 << 10) - 1))
+#define RDMA_NL_GET_TYPE(client, op) ((client << 10) + op)
+
+enum {
+	RDMA_NL_RDMA_CM_ID_STATS = 0,
+	RDMA_NL_RDMA_CM_NUM_OPS
+};
+
+enum {
+	RDMA_NL_RDMA_CM_ATTR_SRC_ADDR = 1,
+	RDMA_NL_RDMA_CM_ATTR_DST_ADDR,
+	RDMA_NL_RDMA_CM_NUM_ATTR,
+};
+
+/* iwarp port mapper op-codes */
+enum {
+	RDMA_NL_IWPM_REG_PID = 0,
+	RDMA_NL_IWPM_ADD_MAPPING,
+	RDMA_NL_IWPM_QUERY_MAPPING,
+	RDMA_NL_IWPM_REMOVE_MAPPING,
+	RDMA_NL_IWPM_HANDLE_ERR,
+	RDMA_NL_IWPM_MAPINFO,
+	RDMA_NL_IWPM_MAPINFO_NUM,
+	RDMA_NL_IWPM_NUM_OPS
+};
+
+struct rdma_cm_id_stats {
+	__u32	qp_num;
+	__u32	bound_dev_if;
+	__u32	port_space;
+	__s32	pid;
+	__u8	cm_state;
+	__u8	node_type;
+	__u8	port_num;
+	__u8	qp_type;
+};
+
+enum {
+	IWPM_NLA_REG_PID_UNSPEC = 0,
+	IWPM_NLA_REG_PID_SEQ,
+	IWPM_NLA_REG_IF_NAME,
+	IWPM_NLA_REG_IBDEV_NAME,
+	IWPM_NLA_REG_ULIB_NAME,
+	IWPM_NLA_REG_PID_MAX
+};
+
+enum {
+	IWPM_NLA_RREG_PID_UNSPEC = 0,
+	IWPM_NLA_RREG_PID_SEQ,
+	IWPM_NLA_RREG_IBDEV_NAME,
+	IWPM_NLA_RREG_ULIB_NAME,
+	IWPM_NLA_RREG_ULIB_VER,
+	IWPM_NLA_RREG_PID_ERR,
+	IWPM_NLA_RREG_PID_MAX
+
+};
+
+enum {
+	IWPM_NLA_MANAGE_MAPPING_UNSPEC = 0,
+	IWPM_NLA_MANAGE_MAPPING_SEQ,
+	IWPM_NLA_MANAGE_ADDR,
+	IWPM_NLA_MANAGE_MAPPED_LOC_ADDR,
+	IWPM_NLA_RMANAGE_MAPPING_ERR,
+	IWPM_NLA_RMANAGE_MAPPING_MAX
+};
+
+#define IWPM_NLA_MANAGE_MAPPING_MAX 3
+#define IWPM_NLA_QUERY_MAPPING_MAX  4
+#define IWPM_NLA_MAPINFO_SEND_MAX   3
+
+enum {
+	IWPM_NLA_QUERY_MAPPING_UNSPEC = 0,
+	IWPM_NLA_QUERY_MAPPING_SEQ,
+	IWPM_NLA_QUERY_LOCAL_ADDR,
+	IWPM_NLA_QUERY_REMOTE_ADDR,
+	IWPM_NLA_RQUERY_MAPPED_LOC_ADDR,
+	IWPM_NLA_RQUERY_MAPPED_REM_ADDR,
+	IWPM_NLA_RQUERY_MAPPING_ERR,
+	IWPM_NLA_RQUERY_MAPPING_MAX
+};
+
+enum {
+	IWPM_NLA_MAPINFO_REQ_UNSPEC = 0,
+	IWPM_NLA_MAPINFO_ULIB_NAME,
+	IWPM_NLA_MAPINFO_ULIB_VER,
+	IWPM_NLA_MAPINFO_REQ_MAX
+};
+
+enum {
+	IWPM_NLA_MAPINFO_UNSPEC = 0,
+	IWPM_NLA_MAPINFO_LOCAL_ADDR,
+	IWPM_NLA_MAPINFO_MAPPED_ADDR,
+	IWPM_NLA_MAPINFO_MAX
+};
+
+enum {
+	IWPM_NLA_MAPINFO_NUM_UNSPEC = 0,
+	IWPM_NLA_MAPINFO_SEQ,
+	IWPM_NLA_MAPINFO_SEND_NUM,
+	IWPM_NLA_MAPINFO_ACK_NUM,
+	IWPM_NLA_MAPINFO_NUM_MAX
+};
+
+enum {
+	IWPM_NLA_ERR_UNSPEC = 0,
+	IWPM_NLA_ERR_SEQ,
+	IWPM_NLA_ERR_CODE,
+	IWPM_NLA_ERR_MAX
+};
+
+
+#endif /* _UAPI_RDMA_NETLINK_H */
diff --git a/include/uapi/rdma/rdma_user_cm.h b/include/uapi/rdma/rdma_user_cm.h
new file mode 100644
index 0000000..3066718
--- /dev/null
+++ b/include/uapi/rdma/rdma_user_cm.h
@@ -0,0 +1,303 @@
+/*
+ * Copyright (c) 2005-2006 Intel Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef RDMA_USER_CM_H
+#define RDMA_USER_CM_H
+
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in6.h>
+#include <rdma/ib_user_verbs.h>
+#include <rdma/ib_user_sa.h>
+
+#define RDMA_USER_CM_ABI_VERSION	4
+
+#define RDMA_MAX_PRIVATE_DATA		256
+
+enum {
+	RDMA_USER_CM_CMD_CREATE_ID,
+	RDMA_USER_CM_CMD_DESTROY_ID,
+	RDMA_USER_CM_CMD_BIND_IP,
+	RDMA_USER_CM_CMD_RESOLVE_IP,
+	RDMA_USER_CM_CMD_RESOLVE_ROUTE,
+	RDMA_USER_CM_CMD_QUERY_ROUTE,
+	RDMA_USER_CM_CMD_CONNECT,
+	RDMA_USER_CM_CMD_LISTEN,
+	RDMA_USER_CM_CMD_ACCEPT,
+	RDMA_USER_CM_CMD_REJECT,
+	RDMA_USER_CM_CMD_DISCONNECT,
+	RDMA_USER_CM_CMD_INIT_QP_ATTR,
+	RDMA_USER_CM_CMD_GET_EVENT,
+	RDMA_USER_CM_CMD_GET_OPTION,
+	RDMA_USER_CM_CMD_SET_OPTION,
+	RDMA_USER_CM_CMD_NOTIFY,
+	RDMA_USER_CM_CMD_JOIN_IP_MCAST,
+	RDMA_USER_CM_CMD_LEAVE_MCAST,
+	RDMA_USER_CM_CMD_MIGRATE_ID,
+	RDMA_USER_CM_CMD_QUERY,
+	RDMA_USER_CM_CMD_BIND,
+	RDMA_USER_CM_CMD_RESOLVE_ADDR,
+	RDMA_USER_CM_CMD_JOIN_MCAST
+};
+
+/*
+ * command ABI structures.
+ */
+struct rdma_ucm_cmd_hdr {
+	__u32 cmd;
+	__u16 in;
+	__u16 out;
+};
+
+struct rdma_ucm_create_id {
+	__u64 uid;
+	__u64 response;
+	__u16 ps;
+	__u8  qp_type;
+	__u8  reserved[5];
+};
+
+struct rdma_ucm_create_id_resp {
+	__u32 id;
+};
+
+struct rdma_ucm_destroy_id {
+	__u64 response;
+	__u32 id;
+	__u32 reserved;
+};
+
+struct rdma_ucm_destroy_id_resp {
+	__u32 events_reported;
+};
+
+struct rdma_ucm_bind_ip {
+	__u64 response;
+	struct sockaddr_in6 addr;
+	__u32 id;
+};
+
+struct rdma_ucm_bind {
+	__u32 id;
+	__u16 addr_size;
+	__u16 reserved;
+	struct sockaddr_storage addr;
+};
+
+struct rdma_ucm_resolve_ip {
+	struct sockaddr_in6 src_addr;
+	struct sockaddr_in6 dst_addr;
+	__u32 id;
+	__u32 timeout_ms;
+};
+
+struct rdma_ucm_resolve_addr {
+	__u32 id;
+	__u32 timeout_ms;
+	__u16 src_size;
+	__u16 dst_size;
+	__u32 reserved;
+	struct sockaddr_storage src_addr;
+	struct sockaddr_storage dst_addr;
+};
+
+struct rdma_ucm_resolve_route {
+	__u32 id;
+	__u32 timeout_ms;
+};
+
+enum {
+	RDMA_USER_CM_QUERY_ADDR,
+	RDMA_USER_CM_QUERY_PATH,
+	RDMA_USER_CM_QUERY_GID
+};
+
+struct rdma_ucm_query {
+	__u64 response;
+	__u32 id;
+	__u32 option;
+};
+
+struct rdma_ucm_query_route_resp {
+	__u64 node_guid;
+	struct ib_user_path_rec ib_route[2];
+	struct sockaddr_in6 src_addr;
+	struct sockaddr_in6 dst_addr;
+	__u32 num_paths;
+	__u8 port_num;
+	__u8 reserved[3];
+};
+
+struct rdma_ucm_query_addr_resp {
+	__u64 node_guid;
+	__u8  port_num;
+	__u8  reserved;
+	__u16 pkey;
+	__u16 src_size;
+	__u16 dst_size;
+	struct sockaddr_storage src_addr;
+	struct sockaddr_storage dst_addr;
+};
+
+struct rdma_ucm_query_path_resp {
+	__u32 num_paths;
+	__u32 reserved;
+	struct ib_path_rec_data path_data[0];
+};
+
+struct rdma_ucm_conn_param {
+	__u32 qp_num;
+	__u32 qkey;
+	__u8  private_data[RDMA_MAX_PRIVATE_DATA];
+	__u8  private_data_len;
+	__u8  srq;
+	__u8  responder_resources;
+	__u8  initiator_depth;
+	__u8  flow_control;
+	__u8  retry_count;
+	__u8  rnr_retry_count;
+	__u8  valid;
+};
+
+struct rdma_ucm_ud_param {
+	__u32 qp_num;
+	__u32 qkey;
+	struct ib_uverbs_ah_attr ah_attr;
+	__u8  private_data[RDMA_MAX_PRIVATE_DATA];
+	__u8  private_data_len;
+	__u8  reserved[7];
+};
+
+struct rdma_ucm_connect {
+	struct rdma_ucm_conn_param conn_param;
+	__u32 id;
+	__u32 reserved;
+};
+
+struct rdma_ucm_listen {
+	__u32 id;
+	__u32 backlog;
+};
+
+struct rdma_ucm_accept {
+	__u64 uid;
+	struct rdma_ucm_conn_param conn_param;
+	__u32 id;
+	__u32 reserved;
+};
+
+struct rdma_ucm_reject {
+	__u32 id;
+	__u8  private_data_len;
+	__u8  reserved[3];
+	__u8  private_data[RDMA_MAX_PRIVATE_DATA];
+};
+
+struct rdma_ucm_disconnect {
+	__u32 id;
+};
+
+struct rdma_ucm_init_qp_attr {
+	__u64 response;
+	__u32 id;
+	__u32 qp_state;
+};
+
+struct rdma_ucm_notify {
+	__u32 id;
+	__u32 event;
+};
+
+struct rdma_ucm_join_ip_mcast {
+	__u64 response;		/* rdma_ucm_create_id_resp */
+	__u64 uid;
+	struct sockaddr_in6 addr;
+	__u32 id;
+};
+
+struct rdma_ucm_join_mcast {
+	__u64 response;		/* rdma_ucma_create_id_resp */
+	__u64 uid;
+	__u32 id;
+	__u16 addr_size;
+	__u16 reserved;
+	struct sockaddr_storage addr;
+};
+
+struct rdma_ucm_get_event {
+	__u64 response;
+};
+
+struct rdma_ucm_event_resp {
+	__u64 uid;
+	__u32 id;
+	__u32 event;
+	__u32 status;
+	union {
+		struct rdma_ucm_conn_param conn;
+		struct rdma_ucm_ud_param   ud;
+	} param;
+};
+
+/* Option levels */
+enum {
+	RDMA_OPTION_ID		= 0,
+	RDMA_OPTION_IB		= 1
+};
+
+/* Option details */
+enum {
+	RDMA_OPTION_ID_TOS	 = 0,
+	RDMA_OPTION_ID_REUSEADDR = 1,
+	RDMA_OPTION_ID_AFONLY	 = 2,
+	RDMA_OPTION_IB_PATH	 = 1
+};
+
+struct rdma_ucm_set_option {
+	__u64 optval;
+	__u32 id;
+	__u32 level;
+	__u32 optname;
+	__u32 optlen;
+};
+
+struct rdma_ucm_migrate_id {
+	__u64 response;
+	__u32 id;
+	__u32 fd;
+};
+
+struct rdma_ucm_migrate_resp {
+	__u32 events_reported;
+};
+
+#endif /* RDMA_USER_CM_H */
diff --git a/net/rds/Kconfig b/net/rds/Kconfig
new file mode 100644
index 0000000..f2c670b
--- /dev/null
+++ b/net/rds/Kconfig
@@ -0,0 +1,28 @@
+
+config RDS
+	tristate "The RDS Protocol"
+	depends on INET
+	---help---
+	  The RDS (Reliable Datagram Sockets) protocol provides reliable,
+	  sequenced delivery of datagrams over Infiniband, iWARP,
+	  or TCP.
+
+config RDS_RDMA
+	tristate "RDS over Infiniband and iWARP"
+	depends on RDS && INFINIBAND && INFINIBAND_ADDR_TRANS
+	---help---
+	  Allow RDS to use Infiniband and iWARP as a transport.
+	  This transport supports RDMA operations.
+
+config RDS_TCP
+	tristate "RDS over TCP"
+	depends on RDS
+	---help---
+	  Allow RDS to use TCP as a transport.
+	  This transport does not support RDMA operations.
+
+config RDS_DEBUG
+        bool "RDS debugging messages"
+	depends on RDS
+        default n
+
diff --git a/net/rds/Makefile b/net/rds/Makefile
new file mode 100644
index 0000000..56d3f60
--- /dev/null
+++ b/net/rds/Makefile
@@ -0,0 +1,19 @@
+obj-$(CONFIG_RDS) += rds.o
+rds-y :=	af_rds.o bind.o cong.o connection.o info.o message.o   \
+			recv.o send.o stats.o sysctl.o threads.o transport.o \
+			loop.o page.o rdma.o
+
+obj-$(CONFIG_RDS_RDMA) += rds_rdma.o
+rds_rdma-y :=	rdma_transport.o \
+			ib.o ib_cm.o ib_recv.o ib_ring.o ib_send.o ib_stats.o \
+			ib_sysctl.o ib_rdma.o \
+			iw.o iw_cm.o iw_recv.o iw_ring.o iw_send.o iw_stats.o \
+			iw_sysctl.o iw_rdma.o
+
+
+obj-$(CONFIG_RDS_TCP) += rds_tcp.o
+rds_tcp-y :=		tcp.o tcp_connect.o tcp_listen.o tcp_recv.o \
+			tcp_send.o tcp_stats.o
+
+ccflags-$(CONFIG_RDS_DEBUG)	:=	-DDEBUG
+
diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
new file mode 100644
index 0000000..1044337
--- /dev/null
+++ b/net/rds/af_rds.c
@@ -0,0 +1,599 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/gfp.h>
+#include <linux/in.h>
+#include <linux/poll.h>
+#include <net/sock.h>
+
+#include "rds.h"
+
+char *rds_str_array(char **array, size_t elements, size_t index)
+{
+	if ((index < elements) && array[index])
+		return array[index];
+	else
+		return "unknown";
+}
+EXPORT_SYMBOL(rds_str_array);
+
+/* this is just used for stats gathering :/ */
+static DEFINE_SPINLOCK(rds_sock_lock);
+static unsigned long rds_sock_count;
+static LIST_HEAD(rds_sock_list);
+DECLARE_WAIT_QUEUE_HEAD(rds_poll_waitq);
+
+/*
+ * This is called as the final descriptor referencing this socket is closed.
+ * We have to unbind the socket so that another socket can be bound to the
+ * address it was using.
+ *
+ * We have to be careful about racing with the incoming path.  sock_orphan()
+ * sets SOCK_DEAD and we use that as an indicator to the rx path that new
+ * messages shouldn't be queued.
+ */
+static int rds_release(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	struct rds_sock *rs;
+
+	if (!sk)
+		goto out;
+
+	rs = rds_sk_to_rs(sk);
+
+	sock_orphan(sk);
+	/* Note - rds_clear_recv_queue grabs rs_recv_lock, so
+	 * that ensures the recv path has completed messing
+	 * with the socket. */
+	rds_clear_recv_queue(rs);
+	rds_cong_remove_socket(rs);
+
+	/*
+	 * the binding lookup hash uses rcu, we need to
+	 * make sure we synchronize_rcu before we free our
+	 * entry
+	 */
+	rds_remove_bound(rs);
+	synchronize_rcu();
+
+	rds_send_drop_to(rs, NULL);
+	rds_rdma_drop_keys(rs);
+	rds_notify_queue_get(rs, NULL);
+
+	spin_lock_bh(&rds_sock_lock);
+	list_del_init(&rs->rs_item);
+	rds_sock_count--;
+	spin_unlock_bh(&rds_sock_lock);
+
+	rds_trans_put(rs->rs_transport);
+
+	sock->sk = NULL;
+	sock_put(sk);
+out:
+	return 0;
+}
+
+/*
+ * Careful not to race with rds_release -> sock_orphan which clears sk_sleep.
+ * _bh() isn't OK here, we're called from interrupt handlers.  It's probably OK
+ * to wake the waitqueue after sk_sleep is clear as we hold a sock ref, but
+ * this seems more conservative.
+ * NB - normally, one would use sk_callback_lock for this, but we can
+ * get here from interrupts, whereas the network code grabs sk_callback_lock
+ * with _lock_bh only - so relying on sk_callback_lock introduces livelocks.
+ */
+void rds_wake_sk_sleep(struct rds_sock *rs)
+{
+	unsigned long flags;
+
+	read_lock_irqsave(&rs->rs_recv_lock, flags);
+	__rds_wake_sk_sleep(rds_rs_to_sk(rs));
+	read_unlock_irqrestore(&rs->rs_recv_lock, flags);
+}
+
+static int rds_getname(struct socket *sock, struct sockaddr *uaddr,
+		       int *uaddr_len, int peer)
+{
+	struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
+	struct rds_sock *rs = rds_sk_to_rs(sock->sk);
+
+	memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+
+	/* racey, don't care */
+	if (peer) {
+		if (!rs->rs_conn_addr)
+			return -ENOTCONN;
+
+		sin->sin_port = rs->rs_conn_port;
+		sin->sin_addr.s_addr = rs->rs_conn_addr;
+	} else {
+		sin->sin_port = rs->rs_bound_port;
+		sin->sin_addr.s_addr = rs->rs_bound_addr;
+	}
+
+	sin->sin_family = AF_INET;
+
+	*uaddr_len = sizeof(*sin);
+	return 0;
+}
+
+/*
+ * RDS' poll is without a doubt the least intuitive part of the interface,
+ * as POLLIN and POLLOUT do not behave entirely as you would expect from
+ * a network protocol.
+ *
+ * POLLIN is asserted if
+ *  -	there is data on the receive queue.
+ *  -	to signal that a previously congested destination may have become
+ *	uncongested
+ *  -	A notification has been queued to the socket (this can be a congestion
+ *	update, or a RDMA completion).
+ *
+ * POLLOUT is asserted if there is room on the send queue. This does not mean
+ * however, that the next sendmsg() call will succeed. If the application tries
+ * to send to a congested destination, the system call may still fail (and
+ * return ENOBUFS).
+ */
+static unsigned int rds_poll(struct file *file, struct socket *sock,
+			     poll_table *wait)
+{
+	struct sock *sk = sock->sk;
+	struct rds_sock *rs = rds_sk_to_rs(sk);
+	unsigned int mask = 0;
+	unsigned long flags;
+
+	poll_wait(file, sk_sleep(sk), wait);
+
+	if (rs->rs_seen_congestion)
+		poll_wait(file, &rds_poll_waitq, wait);
+
+	read_lock_irqsave(&rs->rs_recv_lock, flags);
+	if (!rs->rs_cong_monitor) {
+		/* When a congestion map was updated, we signal POLLIN for
+		 * "historical" reasons. Applications can also poll for
+		 * WRBAND instead. */
+		if (rds_cong_updated_since(&rs->rs_cong_track))
+			mask |= (POLLIN | POLLRDNORM | POLLWRBAND);
+	} else {
+		spin_lock(&rs->rs_lock);
+		if (rs->rs_cong_notify)
+			mask |= (POLLIN | POLLRDNORM);
+		spin_unlock(&rs->rs_lock);
+	}
+	if (!list_empty(&rs->rs_recv_queue) ||
+	    !list_empty(&rs->rs_notify_queue))
+		mask |= (POLLIN | POLLRDNORM);
+	if (rs->rs_snd_bytes < rds_sk_sndbuf(rs))
+		mask |= (POLLOUT | POLLWRNORM);
+	read_unlock_irqrestore(&rs->rs_recv_lock, flags);
+
+	/* clear state any time we wake a seen-congested socket */
+	if (mask)
+		rs->rs_seen_congestion = 0;
+
+	return mask;
+}
+
+static int rds_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+	return -ENOIOCTLCMD;
+}
+
+static int rds_cancel_sent_to(struct rds_sock *rs, char __user *optval,
+			      int len)
+{
+	struct sockaddr_in sin;
+	int ret = 0;
+
+	/* racing with another thread binding seems ok here */
+	if (rs->rs_bound_addr == 0) {
+		ret = -ENOTCONN; /* XXX not a great errno */
+		goto out;
+	}
+
+	if (len < sizeof(struct sockaddr_in)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (copy_from_user(&sin, optval, sizeof(sin))) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	rds_send_drop_to(rs, &sin);
+out:
+	return ret;
+}
+
+static int rds_set_bool_option(unsigned char *optvar, char __user *optval,
+			       int optlen)
+{
+	int value;
+
+	if (optlen < sizeof(int))
+		return -EINVAL;
+	if (get_user(value, (int __user *) optval))
+		return -EFAULT;
+	*optvar = !!value;
+	return 0;
+}
+
+static int rds_cong_monitor(struct rds_sock *rs, char __user *optval,
+			    int optlen)
+{
+	int ret;
+
+	ret = rds_set_bool_option(&rs->rs_cong_monitor, optval, optlen);
+	if (ret == 0) {
+		if (rs->rs_cong_monitor) {
+			rds_cong_add_socket(rs);
+		} else {
+			rds_cong_remove_socket(rs);
+			rs->rs_cong_mask = 0;
+			rs->rs_cong_notify = 0;
+		}
+	}
+	return ret;
+}
+
+static int rds_setsockopt(struct socket *sock, int level, int optname,
+			  char __user *optval, unsigned int optlen)
+{
+	struct rds_sock *rs = rds_sk_to_rs(sock->sk);
+	int ret;
+
+	if (level != SOL_RDS) {
+		ret = -ENOPROTOOPT;
+		goto out;
+	}
+
+	switch (optname) {
+	case RDS_CANCEL_SENT_TO:
+		ret = rds_cancel_sent_to(rs, optval, optlen);
+		break;
+	case RDS_GET_MR:
+		ret = rds_get_mr(rs, optval, optlen);
+		break;
+	case RDS_GET_MR_FOR_DEST:
+		ret = rds_get_mr_for_dest(rs, optval, optlen);
+		break;
+	case RDS_FREE_MR:
+		ret = rds_free_mr(rs, optval, optlen);
+		break;
+	case RDS_RECVERR:
+		ret = rds_set_bool_option(&rs->rs_recverr, optval, optlen);
+		break;
+	case RDS_CONG_MONITOR:
+		ret = rds_cong_monitor(rs, optval, optlen);
+		break;
+	default:
+		ret = -ENOPROTOOPT;
+	}
+out:
+	return ret;
+}
+
+static int rds_getsockopt(struct socket *sock, int level, int optname,
+			  char __user *optval, int __user *optlen)
+{
+	struct rds_sock *rs = rds_sk_to_rs(sock->sk);
+	int ret = -ENOPROTOOPT, len;
+
+	if (level != SOL_RDS)
+		goto out;
+
+	if (get_user(len, optlen)) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	switch (optname) {
+	case RDS_INFO_FIRST ... RDS_INFO_LAST:
+		ret = rds_info_getsockopt(sock, optname, optval,
+					  optlen);
+		break;
+
+	case RDS_RECVERR:
+		if (len < sizeof(int))
+			ret = -EINVAL;
+		else
+		if (put_user(rs->rs_recverr, (int __user *) optval) ||
+		    put_user(sizeof(int), optlen))
+			ret = -EFAULT;
+		else
+			ret = 0;
+		break;
+	default:
+		break;
+	}
+
+out:
+	return ret;
+
+}
+
+static int rds_connect(struct socket *sock, struct sockaddr *uaddr,
+		       int addr_len, int flags)
+{
+	struct sock *sk = sock->sk;
+	struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
+	struct rds_sock *rs = rds_sk_to_rs(sk);
+	int ret = 0;
+
+	lock_sock(sk);
+
+	if (addr_len != sizeof(struct sockaddr_in)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (sin->sin_family != AF_INET) {
+		ret = -EAFNOSUPPORT;
+		goto out;
+	}
+
+	if (sin->sin_addr.s_addr == htonl(INADDR_ANY)) {
+		ret = -EDESTADDRREQ;
+		goto out;
+	}
+
+	rs->rs_conn_addr = sin->sin_addr.s_addr;
+	rs->rs_conn_port = sin->sin_port;
+
+out:
+	release_sock(sk);
+	return ret;
+}
+
+static struct proto rds_proto = {
+	.name	  = "RDS",
+	.owner	  = THIS_MODULE,
+	.obj_size = sizeof(struct rds_sock),
+};
+
+static const struct proto_ops rds_proto_ops = {
+	.family =	AF_RDS,
+	.owner =	THIS_MODULE,
+	.release =	rds_release,
+	.bind =		rds_bind,
+	.connect =	rds_connect,
+	.socketpair =	sock_no_socketpair,
+	.accept =	sock_no_accept,
+	.getname =	rds_getname,
+	.poll =		rds_poll,
+	.ioctl =	rds_ioctl,
+	.listen =	sock_no_listen,
+	.shutdown =	sock_no_shutdown,
+	.setsockopt =	rds_setsockopt,
+	.getsockopt =	rds_getsockopt,
+	.sendmsg =	rds_sendmsg,
+	.recvmsg =	rds_recvmsg,
+	.mmap =		sock_no_mmap,
+	.sendpage =	sock_no_sendpage,
+};
+
+static int __rds_create(struct socket *sock, struct sock *sk, int protocol)
+{
+	struct rds_sock *rs;
+
+	sock_init_data(sock, sk);
+	sock->ops		= &rds_proto_ops;
+	sk->sk_protocol		= protocol;
+
+	rs = rds_sk_to_rs(sk);
+	spin_lock_init(&rs->rs_lock);
+	rwlock_init(&rs->rs_recv_lock);
+	INIT_LIST_HEAD(&rs->rs_send_queue);
+	INIT_LIST_HEAD(&rs->rs_recv_queue);
+	INIT_LIST_HEAD(&rs->rs_notify_queue);
+	INIT_LIST_HEAD(&rs->rs_cong_list);
+	spin_lock_init(&rs->rs_rdma_lock);
+	rs->rs_rdma_keys = RB_ROOT;
+
+	spin_lock_bh(&rds_sock_lock);
+	list_add_tail(&rs->rs_item, &rds_sock_list);
+	rds_sock_count++;
+	spin_unlock_bh(&rds_sock_lock);
+
+	return 0;
+}
+
+static int rds_create(struct net *net, struct socket *sock, int protocol,
+		      int kern)
+{
+	struct sock *sk;
+
+	if (sock->type != SOCK_SEQPACKET || protocol)
+		return -ESOCKTNOSUPPORT;
+
+	sk = sk_alloc(net, AF_RDS, GFP_ATOMIC, &rds_proto);
+	if (!sk)
+		return -ENOMEM;
+
+	return __rds_create(sock, sk, protocol);
+}
+
+void rds_sock_addref(struct rds_sock *rs)
+{
+	sock_hold(rds_rs_to_sk(rs));
+}
+
+void rds_sock_put(struct rds_sock *rs)
+{
+	sock_put(rds_rs_to_sk(rs));
+}
+
+static const struct net_proto_family rds_family_ops = {
+	.family =	AF_RDS,
+	.create =	rds_create,
+	.owner	=	THIS_MODULE,
+};
+
+static void rds_sock_inc_info(struct socket *sock, unsigned int len,
+			      struct rds_info_iterator *iter,
+			      struct rds_info_lengths *lens)
+{
+	struct rds_sock *rs;
+	struct rds_incoming *inc;
+	unsigned int total = 0;
+
+	len /= sizeof(struct rds_info_message);
+
+	spin_lock_bh(&rds_sock_lock);
+
+	list_for_each_entry(rs, &rds_sock_list, rs_item) {
+		read_lock(&rs->rs_recv_lock);
+
+		/* XXX too lazy to maintain counts.. */
+		list_for_each_entry(inc, &rs->rs_recv_queue, i_item) {
+			total++;
+			if (total <= len)
+				rds_inc_info_copy(inc, iter, inc->i_saddr,
+						  rs->rs_bound_addr, 1);
+		}
+
+		read_unlock(&rs->rs_recv_lock);
+	}
+
+	spin_unlock_bh(&rds_sock_lock);
+
+	lens->nr = total;
+	lens->each = sizeof(struct rds_info_message);
+}
+
+static void rds_sock_info(struct socket *sock, unsigned int len,
+			  struct rds_info_iterator *iter,
+			  struct rds_info_lengths *lens)
+{
+	struct rds_info_socket sinfo;
+	struct rds_sock *rs;
+
+	len /= sizeof(struct rds_info_socket);
+
+	spin_lock_bh(&rds_sock_lock);
+
+	if (len < rds_sock_count)
+		goto out;
+
+	list_for_each_entry(rs, &rds_sock_list, rs_item) {
+		sinfo.sndbuf = rds_sk_sndbuf(rs);
+		sinfo.rcvbuf = rds_sk_rcvbuf(rs);
+		sinfo.bound_addr = rs->rs_bound_addr;
+		sinfo.connected_addr = rs->rs_conn_addr;
+		sinfo.bound_port = rs->rs_bound_port;
+		sinfo.connected_port = rs->rs_conn_port;
+		sinfo.inum = sock_i_ino(rds_rs_to_sk(rs));
+
+		rds_info_copy(iter, &sinfo, sizeof(sinfo));
+	}
+
+out:
+	lens->nr = rds_sock_count;
+	lens->each = sizeof(struct rds_info_socket);
+
+	spin_unlock_bh(&rds_sock_lock);
+}
+
+static void rds_exit(void)
+{
+	sock_unregister(rds_family_ops.family);
+	proto_unregister(&rds_proto);
+	rds_conn_exit();
+	rds_cong_exit();
+	rds_sysctl_exit();
+	rds_threads_exit();
+	rds_stats_exit();
+	rds_page_exit();
+	rds_info_deregister_func(RDS_INFO_SOCKETS, rds_sock_info);
+	rds_info_deregister_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info);
+}
+module_exit(rds_exit);
+
+static int rds_init(void)
+{
+	int ret;
+
+	ret = rds_conn_init();
+	if (ret)
+		goto out;
+	ret = rds_threads_init();
+	if (ret)
+		goto out_conn;
+	ret = rds_sysctl_init();
+	if (ret)
+		goto out_threads;
+	ret = rds_stats_init();
+	if (ret)
+		goto out_sysctl;
+	ret = proto_register(&rds_proto, 1);
+	if (ret)
+		goto out_stats;
+	ret = sock_register(&rds_family_ops);
+	if (ret)
+		goto out_proto;
+
+	rds_info_register_func(RDS_INFO_SOCKETS, rds_sock_info);
+	rds_info_register_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info);
+
+	goto out;
+
+out_proto:
+	proto_unregister(&rds_proto);
+out_stats:
+	rds_stats_exit();
+out_sysctl:
+	rds_sysctl_exit();
+out_threads:
+	rds_threads_exit();
+out_conn:
+	rds_conn_exit();
+	rds_cong_exit();
+	rds_page_exit();
+out:
+	return ret;
+}
+module_init(rds_init);
+
+#define DRV_VERSION     "4.0"
+#define DRV_RELDATE     "Feb 12, 2009"
+
+MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>");
+MODULE_DESCRIPTION("RDS: Reliable Datagram Sockets"
+		   " v" DRV_VERSION " (" DRV_RELDATE ")");
+MODULE_VERSION(DRV_VERSION);
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_ALIAS_NETPROTO(PF_RDS);
diff --git a/net/rds/bind.c b/net/rds/bind.c
new file mode 100644
index 0000000..a2e6562
--- /dev/null
+++ b/net/rds/bind.c
@@ -0,0 +1,203 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <net/sock.h>
+#include <linux/in.h>
+#include <linux/if_arp.h>
+#include <linux/jhash.h>
+#include <linux/ratelimit.h>
+#include "rds.h"
+
+#define BIND_HASH_SIZE 1024
+static struct hlist_head bind_hash_table[BIND_HASH_SIZE];
+static DEFINE_SPINLOCK(rds_bind_lock);
+
+static struct hlist_head *hash_to_bucket(__be32 addr, __be16 port)
+{
+	return bind_hash_table + (jhash_2words((u32)addr, (u32)port, 0) &
+				  (BIND_HASH_SIZE - 1));
+}
+
+static struct rds_sock *rds_bind_lookup(__be32 addr, __be16 port,
+					struct rds_sock *insert)
+{
+	struct rds_sock *rs;
+	struct hlist_head *head = hash_to_bucket(addr, port);
+	u64 cmp;
+	u64 needle = ((u64)be32_to_cpu(addr) << 32) | be16_to_cpu(port);
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(rs, head, rs_bound_node) {
+		cmp = ((u64)be32_to_cpu(rs->rs_bound_addr) << 32) |
+		      be16_to_cpu(rs->rs_bound_port);
+
+		if (cmp == needle) {
+			rcu_read_unlock();
+			return rs;
+		}
+	}
+	rcu_read_unlock();
+
+	if (insert) {
+		/*
+		 * make sure our addr and port are set before
+		 * we are added to the list, other people
+		 * in rcu will find us as soon as the
+		 * hlist_add_head_rcu is done
+		 */
+		insert->rs_bound_addr = addr;
+		insert->rs_bound_port = port;
+		rds_sock_addref(insert);
+
+		hlist_add_head_rcu(&insert->rs_bound_node, head);
+	}
+	return NULL;
+}
+
+/*
+ * Return the rds_sock bound at the given local address.
+ *
+ * The rx path can race with rds_release.  We notice if rds_release() has
+ * marked this socket and don't return a rs ref to the rx path.
+ */
+struct rds_sock *rds_find_bound(__be32 addr, __be16 port)
+{
+	struct rds_sock *rs;
+
+	rs = rds_bind_lookup(addr, port, NULL);
+
+	if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD))
+		rds_sock_addref(rs);
+	else
+		rs = NULL;
+
+	rdsdebug("returning rs %p for %pI4:%u\n", rs, &addr,
+		ntohs(port));
+	return rs;
+}
+
+/* returns -ve errno or +ve port */
+static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port)
+{
+	unsigned long flags;
+	int ret = -EADDRINUSE;
+	u16 rover, last;
+
+	if (*port != 0) {
+		rover = be16_to_cpu(*port);
+		last = rover;
+	} else {
+		rover = max_t(u16, prandom_u32(), 2);
+		last = rover - 1;
+	}
+
+	spin_lock_irqsave(&rds_bind_lock, flags);
+
+	do {
+		if (rover == 0)
+			rover++;
+		if (!rds_bind_lookup(addr, cpu_to_be16(rover), rs)) {
+			*port = rs->rs_bound_port;
+			ret = 0;
+			rdsdebug("rs %p binding to %pI4:%d\n",
+			  rs, &addr, (int)ntohs(*port));
+			break;
+		}
+	} while (rover++ != last);
+
+	spin_unlock_irqrestore(&rds_bind_lock, flags);
+
+	return ret;
+}
+
+void rds_remove_bound(struct rds_sock *rs)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&rds_bind_lock, flags);
+
+	if (rs->rs_bound_addr) {
+		rdsdebug("rs %p unbinding from %pI4:%d\n",
+		  rs, &rs->rs_bound_addr,
+		  ntohs(rs->rs_bound_port));
+
+		hlist_del_init_rcu(&rs->rs_bound_node);
+		rds_sock_put(rs);
+		rs->rs_bound_addr = 0;
+	}
+
+	spin_unlock_irqrestore(&rds_bind_lock, flags);
+}
+
+int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+{
+	struct sock *sk = sock->sk;
+	struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
+	struct rds_sock *rs = rds_sk_to_rs(sk);
+	struct rds_transport *trans;
+	int ret = 0;
+
+	lock_sock(sk);
+
+	if (addr_len != sizeof(struct sockaddr_in) ||
+	    sin->sin_family != AF_INET ||
+	    rs->rs_bound_addr ||
+	    sin->sin_addr.s_addr == htonl(INADDR_ANY)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = rds_add_bound(rs, sin->sin_addr.s_addr, &sin->sin_port);
+	if (ret)
+		goto out;
+
+	trans = rds_trans_get_preferred(sin->sin_addr.s_addr);
+	if (!trans) {
+		ret = -EADDRNOTAVAIL;
+		rds_remove_bound(rs);
+		printk_ratelimited(KERN_INFO "RDS: rds_bind() could not find a transport, "
+				"load rds_tcp or rds_rdma?\n");
+		goto out;
+	}
+
+	rs->rs_transport = trans;
+	ret = 0;
+
+out:
+	release_sock(sk);
+
+	/* we might have called rds_remove_bound on error */
+	if (ret)
+		synchronize_rcu();
+	return ret;
+}
diff --git a/net/rds/cong.c b/net/rds/cong.c
new file mode 100644
index 0000000..e5b65ac
--- /dev/null
+++ b/net/rds/cong.c
@@ -0,0 +1,406 @@
+/*
+ * Copyright (c) 2007 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/rbtree.h>
+#include <linux/bitops.h>
+#include <linux/export.h>
+
+#include "rds.h"
+
+/*
+ * This file implements the receive side of the unconventional congestion
+ * management in RDS.
+ *
+ * Messages waiting in the receive queue on the receiving socket are accounted
+ * against the sockets SO_RCVBUF option value.  Only the payload bytes in the
+ * message are accounted for.  If the number of bytes queued equals or exceeds
+ * rcvbuf then the socket is congested.  All sends attempted to this socket's
+ * address should return block or return -EWOULDBLOCK.
+ *
+ * Applications are expected to be reasonably tuned such that this situation
+ * very rarely occurs.  An application encountering this "back-pressure" is
+ * considered a bug.
+ *
+ * This is implemented by having each node maintain bitmaps which indicate
+ * which ports on bound addresses are congested.  As the bitmap changes it is
+ * sent through all the connections which terminate in the local address of the
+ * bitmap which changed.
+ *
+ * The bitmaps are allocated as connections are brought up.  This avoids
+ * allocation in the interrupt handling path which queues messages on sockets.
+ * The dense bitmaps let transports send the entire bitmap on any bitmap change
+ * reasonably efficiently.  This is much easier to implement than some
+ * finer-grained communication of per-port congestion.  The sender does a very
+ * inexpensive bit test to test if the port it's about to send to is congested
+ * or not.
+ */
+
+/*
+ * Interaction with poll is a tad tricky. We want all processes stuck in
+ * poll to wake up and check whether a congested destination became uncongested.
+ * The really sad thing is we have no idea which destinations the application
+ * wants to send to - we don't even know which rds_connections are involved.
+ * So until we implement a more flexible rds poll interface, we have to make
+ * do with this:
+ * We maintain a global counter that is incremented each time a congestion map
+ * update is received. Each rds socket tracks this value, and if rds_poll
+ * finds that the saved generation number is smaller than the global generation
+ * number, it wakes up the process.
+ */
+static atomic_t		rds_cong_generation = ATOMIC_INIT(0);
+
+/*
+ * Congestion monitoring
+ */
+static LIST_HEAD(rds_cong_monitor);
+static DEFINE_RWLOCK(rds_cong_monitor_lock);
+
+/*
+ * Yes, a global lock.  It's used so infrequently that it's worth keeping it
+ * global to simplify the locking.  It's only used in the following
+ * circumstances:
+ *
+ *  - on connection buildup to associate a conn with its maps
+ *  - on map changes to inform conns of a new map to send
+ *
+ *  It's sadly ordered under the socket callback lock and the connection lock.
+ *  Receive paths can mark ports congested from interrupt context so the
+ *  lock masks interrupts.
+ */
+static DEFINE_SPINLOCK(rds_cong_lock);
+static struct rb_root rds_cong_tree = RB_ROOT;
+
+static struct rds_cong_map *rds_cong_tree_walk(__be32 addr,
+					       struct rds_cong_map *insert)
+{
+	struct rb_node **p = &rds_cong_tree.rb_node;
+	struct rb_node *parent = NULL;
+	struct rds_cong_map *map;
+
+	while (*p) {
+		parent = *p;
+		map = rb_entry(parent, struct rds_cong_map, m_rb_node);
+
+		if (addr < map->m_addr)
+			p = &(*p)->rb_left;
+		else if (addr > map->m_addr)
+			p = &(*p)->rb_right;
+		else
+			return map;
+	}
+
+	if (insert) {
+		rb_link_node(&insert->m_rb_node, parent, p);
+		rb_insert_color(&insert->m_rb_node, &rds_cong_tree);
+	}
+	return NULL;
+}
+
+/*
+ * There is only ever one bitmap for any address.  Connections try and allocate
+ * these bitmaps in the process getting pointers to them.  The bitmaps are only
+ * ever freed as the module is removed after all connections have been freed.
+ */
+static struct rds_cong_map *rds_cong_from_addr(__be32 addr)
+{
+	struct rds_cong_map *map;
+	struct rds_cong_map *ret = NULL;
+	unsigned long zp;
+	unsigned long i;
+	unsigned long flags;
+
+	map = kzalloc(sizeof(struct rds_cong_map), GFP_KERNEL);
+	if (!map)
+		return NULL;
+
+	map->m_addr = addr;
+	init_waitqueue_head(&map->m_waitq);
+	INIT_LIST_HEAD(&map->m_conn_list);
+
+	for (i = 0; i < RDS_CONG_MAP_PAGES; i++) {
+		zp = get_zeroed_page(GFP_KERNEL);
+		if (zp == 0)
+			goto out;
+		map->m_page_addrs[i] = zp;
+	}
+
+	spin_lock_irqsave(&rds_cong_lock, flags);
+	ret = rds_cong_tree_walk(addr, map);
+	spin_unlock_irqrestore(&rds_cong_lock, flags);
+
+	if (!ret) {
+		ret = map;
+		map = NULL;
+	}
+
+out:
+	if (map) {
+		for (i = 0; i < RDS_CONG_MAP_PAGES && map->m_page_addrs[i]; i++)
+			free_page(map->m_page_addrs[i]);
+		kfree(map);
+	}
+
+	rdsdebug("map %p for addr %x\n", ret, be32_to_cpu(addr));
+
+	return ret;
+}
+
+/*
+ * Put the conn on its local map's list.  This is called when the conn is
+ * really added to the hash.  It's nested under the rds_conn_lock, sadly.
+ */
+void rds_cong_add_conn(struct rds_connection *conn)
+{
+	unsigned long flags;
+
+	rdsdebug("conn %p now on map %p\n", conn, conn->c_lcong);
+	spin_lock_irqsave(&rds_cong_lock, flags);
+	list_add_tail(&conn->c_map_item, &conn->c_lcong->m_conn_list);
+	spin_unlock_irqrestore(&rds_cong_lock, flags);
+}
+
+void rds_cong_remove_conn(struct rds_connection *conn)
+{
+	unsigned long flags;
+
+	rdsdebug("removing conn %p from map %p\n", conn, conn->c_lcong);
+	spin_lock_irqsave(&rds_cong_lock, flags);
+	list_del_init(&conn->c_map_item);
+	spin_unlock_irqrestore(&rds_cong_lock, flags);
+}
+
+int rds_cong_get_maps(struct rds_connection *conn)
+{
+	conn->c_lcong = rds_cong_from_addr(conn->c_laddr);
+	conn->c_fcong = rds_cong_from_addr(conn->c_faddr);
+
+	if (!(conn->c_lcong && conn->c_fcong))
+		return -ENOMEM;
+
+	return 0;
+}
+
+void rds_cong_queue_updates(struct rds_cong_map *map)
+{
+	struct rds_connection *conn;
+	unsigned long flags;
+
+	spin_lock_irqsave(&rds_cong_lock, flags);
+
+	list_for_each_entry(conn, &map->m_conn_list, c_map_item) {
+		if (!test_and_set_bit(0, &conn->c_map_queued)) {
+			rds_stats_inc(s_cong_update_queued);
+			rds_send_xmit(conn);
+		}
+	}
+
+	spin_unlock_irqrestore(&rds_cong_lock, flags);
+}
+
+void rds_cong_map_updated(struct rds_cong_map *map, uint64_t portmask)
+{
+	rdsdebug("waking map %p for %pI4\n",
+	  map, &map->m_addr);
+	rds_stats_inc(s_cong_update_received);
+	atomic_inc(&rds_cong_generation);
+	if (waitqueue_active(&map->m_waitq))
+		wake_up(&map->m_waitq);
+	if (waitqueue_active(&rds_poll_waitq))
+		wake_up_all(&rds_poll_waitq);
+
+	if (portmask && !list_empty(&rds_cong_monitor)) {
+		unsigned long flags;
+		struct rds_sock *rs;
+
+		read_lock_irqsave(&rds_cong_monitor_lock, flags);
+		list_for_each_entry(rs, &rds_cong_monitor, rs_cong_list) {
+			spin_lock(&rs->rs_lock);
+			rs->rs_cong_notify |= (rs->rs_cong_mask & portmask);
+			rs->rs_cong_mask &= ~portmask;
+			spin_unlock(&rs->rs_lock);
+			if (rs->rs_cong_notify)
+				rds_wake_sk_sleep(rs);
+		}
+		read_unlock_irqrestore(&rds_cong_monitor_lock, flags);
+	}
+}
+EXPORT_SYMBOL_GPL(rds_cong_map_updated);
+
+int rds_cong_updated_since(unsigned long *recent)
+{
+	unsigned long gen = atomic_read(&rds_cong_generation);
+
+	if (likely(*recent == gen))
+		return 0;
+	*recent = gen;
+	return 1;
+}
+
+/*
+ * We're called under the locking that protects the sockets receive buffer
+ * consumption.  This makes it a lot easier for the caller to only call us
+ * when it knows that an existing set bit needs to be cleared, and vice versa.
+ * We can't block and we need to deal with concurrent sockets working against
+ * the same per-address map.
+ */
+void rds_cong_set_bit(struct rds_cong_map *map, __be16 port)
+{
+	unsigned long i;
+	unsigned long off;
+
+	rdsdebug("setting congestion for %pI4:%u in map %p\n",
+	  &map->m_addr, ntohs(port), map);
+
+	i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS;
+	off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS;
+
+	__set_bit_le(off, (void *)map->m_page_addrs[i]);
+}
+
+void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port)
+{
+	unsigned long i;
+	unsigned long off;
+
+	rdsdebug("clearing congestion for %pI4:%u in map %p\n",
+	  &map->m_addr, ntohs(port), map);
+
+	i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS;
+	off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS;
+
+	__clear_bit_le(off, (void *)map->m_page_addrs[i]);
+}
+
+static int rds_cong_test_bit(struct rds_cong_map *map, __be16 port)
+{
+	unsigned long i;
+	unsigned long off;
+
+	i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS;
+	off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS;
+
+	return test_bit_le(off, (void *)map->m_page_addrs[i]);
+}
+
+void rds_cong_add_socket(struct rds_sock *rs)
+{
+	unsigned long flags;
+
+	write_lock_irqsave(&rds_cong_monitor_lock, flags);
+	if (list_empty(&rs->rs_cong_list))
+		list_add(&rs->rs_cong_list, &rds_cong_monitor);
+	write_unlock_irqrestore(&rds_cong_monitor_lock, flags);
+}
+
+void rds_cong_remove_socket(struct rds_sock *rs)
+{
+	unsigned long flags;
+	struct rds_cong_map *map;
+
+	write_lock_irqsave(&rds_cong_monitor_lock, flags);
+	list_del_init(&rs->rs_cong_list);
+	write_unlock_irqrestore(&rds_cong_monitor_lock, flags);
+
+	/* update congestion map for now-closed port */
+	spin_lock_irqsave(&rds_cong_lock, flags);
+	map = rds_cong_tree_walk(rs->rs_bound_addr, NULL);
+	spin_unlock_irqrestore(&rds_cong_lock, flags);
+
+	if (map && rds_cong_test_bit(map, rs->rs_bound_port)) {
+		rds_cong_clear_bit(map, rs->rs_bound_port);
+		rds_cong_queue_updates(map);
+	}
+}
+
+int rds_cong_wait(struct rds_cong_map *map, __be16 port, int nonblock,
+		  struct rds_sock *rs)
+{
+	if (!rds_cong_test_bit(map, port))
+		return 0;
+	if (nonblock) {
+		if (rs && rs->rs_cong_monitor) {
+			unsigned long flags;
+
+			/* It would have been nice to have an atomic set_bit on
+			 * a uint64_t. */
+			spin_lock_irqsave(&rs->rs_lock, flags);
+			rs->rs_cong_mask |= RDS_CONG_MONITOR_MASK(ntohs(port));
+			spin_unlock_irqrestore(&rs->rs_lock, flags);
+
+			/* Test again - a congestion update may have arrived in
+			 * the meantime. */
+			if (!rds_cong_test_bit(map, port))
+				return 0;
+		}
+		rds_stats_inc(s_cong_send_error);
+		return -ENOBUFS;
+	}
+
+	rds_stats_inc(s_cong_send_blocked);
+	rdsdebug("waiting on map %p for port %u\n", map, be16_to_cpu(port));
+
+	return wait_event_interruptible(map->m_waitq,
+					!rds_cong_test_bit(map, port));
+}
+
+void rds_cong_exit(void)
+{
+	struct rb_node *node;
+	struct rds_cong_map *map;
+	unsigned long i;
+
+	while ((node = rb_first(&rds_cong_tree))) {
+		map = rb_entry(node, struct rds_cong_map, m_rb_node);
+		rdsdebug("freeing map %p\n", map);
+		rb_erase(&map->m_rb_node, &rds_cong_tree);
+		for (i = 0; i < RDS_CONG_MAP_PAGES && map->m_page_addrs[i]; i++)
+			free_page(map->m_page_addrs[i]);
+		kfree(map);
+	}
+}
+
+/*
+ * Allocate a RDS message containing a congestion update.
+ */
+struct rds_message *rds_cong_update_alloc(struct rds_connection *conn)
+{
+	struct rds_cong_map *map = conn->c_lcong;
+	struct rds_message *rm;
+
+	rm = rds_message_map_pages(map->m_page_addrs, RDS_CONG_MAP_BYTES);
+	if (!IS_ERR(rm))
+		rm->m_inc.i_hdr.h_flags = RDS_FLAG_CONG_BITMAP;
+
+	return rm;
+}
diff --git a/net/rds/connection.c b/net/rds/connection.c
new file mode 100644
index 0000000..378c3a6
--- /dev/null
+++ b/net/rds/connection.c
@@ -0,0 +1,577 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <net/inet_hashtables.h>
+
+#include "rds.h"
+#include "loop.h"
+
+#define RDS_CONNECTION_HASH_BITS 12
+#define RDS_CONNECTION_HASH_ENTRIES (1 << RDS_CONNECTION_HASH_BITS)
+#define RDS_CONNECTION_HASH_MASK (RDS_CONNECTION_HASH_ENTRIES - 1)
+
+/* converting this to RCU is a chore for another day.. */
+static DEFINE_SPINLOCK(rds_conn_lock);
+static unsigned long rds_conn_count;
+static struct hlist_head rds_conn_hash[RDS_CONNECTION_HASH_ENTRIES];
+static struct kmem_cache *rds_conn_slab;
+
+static struct hlist_head *rds_conn_bucket(__be32 laddr, __be32 faddr)
+{
+	static u32 rds_hash_secret __read_mostly;
+
+	unsigned long hash;
+
+	net_get_random_once(&rds_hash_secret, sizeof(rds_hash_secret));
+
+	/* Pass NULL, don't need struct net for hash */
+	hash = __inet_ehashfn(be32_to_cpu(laddr), 0,
+			      be32_to_cpu(faddr), 0,
+			      rds_hash_secret);
+	return &rds_conn_hash[hash & RDS_CONNECTION_HASH_MASK];
+}
+
+#define rds_conn_info_set(var, test, suffix) do {		\
+	if (test)						\
+		var |= RDS_INFO_CONNECTION_FLAG_##suffix;	\
+} while (0)
+
+/* rcu read lock must be held or the connection spinlock */
+static struct rds_connection *rds_conn_lookup(struct hlist_head *head,
+					      __be32 laddr, __be32 faddr,
+					      struct rds_transport *trans)
+{
+	struct rds_connection *conn, *ret = NULL;
+
+	hlist_for_each_entry_rcu(conn, head, c_hash_node) {
+		if (conn->c_faddr == faddr && conn->c_laddr == laddr &&
+				conn->c_trans == trans) {
+			ret = conn;
+			break;
+		}
+	}
+	rdsdebug("returning conn %p for %pI4 -> %pI4\n", ret,
+		 &laddr, &faddr);
+	return ret;
+}
+
+/*
+ * This is called by transports as they're bringing down a connection.
+ * It clears partial message state so that the transport can start sending
+ * and receiving over this connection again in the future.  It is up to
+ * the transport to have serialized this call with its send and recv.
+ */
+static void rds_conn_reset(struct rds_connection *conn)
+{
+	rdsdebug("connection %pI4 to %pI4 reset\n",
+	  &conn->c_laddr, &conn->c_faddr);
+
+	rds_stats_inc(s_conn_reset);
+	rds_send_reset(conn);
+	conn->c_flags = 0;
+
+	/* Do not clear next_rx_seq here, else we cannot distinguish
+	 * retransmitted packets from new packets, and will hand all
+	 * of them to the application. That is not consistent with the
+	 * reliability guarantees of RDS. */
+}
+
+/*
+ * There is only every one 'conn' for a given pair of addresses in the
+ * system at a time.  They contain messages to be retransmitted and so
+ * span the lifetime of the actual underlying transport connections.
+ *
+ * For now they are not garbage collected once they're created.  They
+ * are torn down as the module is removed, if ever.
+ */
+static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
+				       struct rds_transport *trans, gfp_t gfp,
+				       int is_outgoing)
+{
+	struct rds_connection *conn, *parent = NULL;
+	struct hlist_head *head = rds_conn_bucket(laddr, faddr);
+	struct rds_transport *loop_trans;
+	unsigned long flags;
+	int ret;
+
+	rcu_read_lock();
+	conn = rds_conn_lookup(head, laddr, faddr, trans);
+	if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport &&
+	    !is_outgoing) {
+		/* This is a looped back IB connection, and we're
+		 * called by the code handling the incoming connect.
+		 * We need a second connection object into which we
+		 * can stick the other QP. */
+		parent = conn;
+		conn = parent->c_passive;
+	}
+	rcu_read_unlock();
+	if (conn)
+		goto out;
+
+	conn = kmem_cache_zalloc(rds_conn_slab, gfp);
+	if (!conn) {
+		conn = ERR_PTR(-ENOMEM);
+		goto out;
+	}
+
+	INIT_HLIST_NODE(&conn->c_hash_node);
+	conn->c_laddr = laddr;
+	conn->c_faddr = faddr;
+	spin_lock_init(&conn->c_lock);
+	conn->c_next_tx_seq = 1;
+
+	init_waitqueue_head(&conn->c_waitq);
+	INIT_LIST_HEAD(&conn->c_send_queue);
+	INIT_LIST_HEAD(&conn->c_retrans);
+
+	ret = rds_cong_get_maps(conn);
+	if (ret) {
+		kmem_cache_free(rds_conn_slab, conn);
+		conn = ERR_PTR(ret);
+		goto out;
+	}
+
+	/*
+	 * This is where a connection becomes loopback.  If *any* RDS sockets
+	 * can bind to the destination address then we'd rather the messages
+	 * flow through loopback rather than either transport.
+	 */
+	loop_trans = rds_trans_get_preferred(faddr);
+	if (loop_trans) {
+		rds_trans_put(loop_trans);
+		conn->c_loopback = 1;
+		if (is_outgoing && trans->t_prefer_loopback) {
+			/* "outgoing" connection - and the transport
+			 * says it wants the connection handled by the
+			 * loopback transport. This is what TCP does.
+			 */
+			trans = &rds_loop_transport;
+		}
+	}
+
+	conn->c_trans = trans;
+
+	ret = trans->conn_alloc(conn, gfp);
+	if (ret) {
+		kmem_cache_free(rds_conn_slab, conn);
+		conn = ERR_PTR(ret);
+		goto out;
+	}
+
+	atomic_set(&conn->c_state, RDS_CONN_DOWN);
+	conn->c_reconnect_jiffies = 0;
+	INIT_DELAYED_WORK(&conn->c_send_w, rds_send_worker);
+	INIT_DELAYED_WORK(&conn->c_recv_w, rds_recv_worker);
+	INIT_DELAYED_WORK(&conn->c_conn_w, rds_connect_worker);
+	INIT_WORK(&conn->c_down_w, rds_shutdown_worker);
+	mutex_init(&conn->c_cm_lock);
+	conn->c_flags = 0;
+
+	rdsdebug("allocated conn %p for %pI4 -> %pI4 over %s %s\n",
+	  conn, &laddr, &faddr,
+	  trans->t_name ? trans->t_name : "[unknown]",
+	  is_outgoing ? "(outgoing)" : "");
+
+	/*
+	 * Since we ran without holding the conn lock, someone could
+	 * have created the same conn (either normal or passive) in the
+	 * interim. We check while holding the lock. If we won, we complete
+	 * init and return our conn. If we lost, we rollback and return the
+	 * other one.
+	 */
+	spin_lock_irqsave(&rds_conn_lock, flags);
+	if (parent) {
+		/* Creating passive conn */
+		if (parent->c_passive) {
+			trans->conn_free(conn->c_transport_data);
+			kmem_cache_free(rds_conn_slab, conn);
+			conn = parent->c_passive;
+		} else {
+			parent->c_passive = conn;
+			rds_cong_add_conn(conn);
+			rds_conn_count++;
+		}
+	} else {
+		/* Creating normal conn */
+		struct rds_connection *found;
+
+		found = rds_conn_lookup(head, laddr, faddr, trans);
+		if (found) {
+			trans->conn_free(conn->c_transport_data);
+			kmem_cache_free(rds_conn_slab, conn);
+			conn = found;
+		} else {
+			hlist_add_head_rcu(&conn->c_hash_node, head);
+			rds_cong_add_conn(conn);
+			rds_conn_count++;
+		}
+	}
+	spin_unlock_irqrestore(&rds_conn_lock, flags);
+
+out:
+	return conn;
+}
+
+struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr,
+				       struct rds_transport *trans, gfp_t gfp)
+{
+	return __rds_conn_create(laddr, faddr, trans, gfp, 0);
+}
+EXPORT_SYMBOL_GPL(rds_conn_create);
+
+struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr,
+				       struct rds_transport *trans, gfp_t gfp)
+{
+	return __rds_conn_create(laddr, faddr, trans, gfp, 1);
+}
+EXPORT_SYMBOL_GPL(rds_conn_create_outgoing);
+
+void rds_conn_shutdown(struct rds_connection *conn)
+{
+	/* shut it down unless it's down already */
+	if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_DOWN)) {
+		/*
+		 * Quiesce the connection mgmt handlers before we start tearing
+		 * things down. We don't hold the mutex for the entire
+		 * duration of the shutdown operation, else we may be
+		 * deadlocking with the CM handler. Instead, the CM event
+		 * handler is supposed to check for state DISCONNECTING
+		 */
+		mutex_lock(&conn->c_cm_lock);
+		if (!rds_conn_transition(conn, RDS_CONN_UP, RDS_CONN_DISCONNECTING)
+		 && !rds_conn_transition(conn, RDS_CONN_ERROR, RDS_CONN_DISCONNECTING)) {
+			rds_conn_error(conn, "shutdown called in state %d\n",
+					atomic_read(&conn->c_state));
+			mutex_unlock(&conn->c_cm_lock);
+			return;
+		}
+		mutex_unlock(&conn->c_cm_lock);
+
+		wait_event(conn->c_waitq,
+			   !test_bit(RDS_IN_XMIT, &conn->c_flags));
+
+		conn->c_trans->conn_shutdown(conn);
+		rds_conn_reset(conn);
+
+		if (!rds_conn_transition(conn, RDS_CONN_DISCONNECTING, RDS_CONN_DOWN)) {
+			/* This can happen - eg when we're in the middle of tearing
+			 * down the connection, and someone unloads the rds module.
+			 * Quite reproduceable with loopback connections.
+			 * Mostly harmless.
+			 */
+			rds_conn_error(conn,
+				"%s: failed to transition to state DOWN, "
+				"current state is %d\n",
+				__func__,
+				atomic_read(&conn->c_state));
+			return;
+		}
+	}
+
+	/* Then reconnect if it's still live.
+	 * The passive side of an IB loopback connection is never added
+	 * to the conn hash, so we never trigger a reconnect on this
+	 * conn - the reconnect is always triggered by the active peer. */
+	cancel_delayed_work_sync(&conn->c_conn_w);
+	rcu_read_lock();
+	if (!hlist_unhashed(&conn->c_hash_node)) {
+		rcu_read_unlock();
+		rds_queue_reconnect(conn);
+	} else {
+		rcu_read_unlock();
+	}
+}
+
+/*
+ * Stop and free a connection.
+ *
+ * This can only be used in very limited circumstances.  It assumes that once
+ * the conn has been shutdown that no one else is referencing the connection.
+ * We can only ensure this in the rmmod path in the current code.
+ */
+void rds_conn_destroy(struct rds_connection *conn)
+{
+	struct rds_message *rm, *rtmp;
+	unsigned long flags;
+
+	rdsdebug("freeing conn %p for %pI4 -> "
+		 "%pI4\n", conn, &conn->c_laddr,
+		 &conn->c_faddr);
+
+	/* Ensure conn will not be scheduled for reconnect */
+	spin_lock_irq(&rds_conn_lock);
+	hlist_del_init_rcu(&conn->c_hash_node);
+	spin_unlock_irq(&rds_conn_lock);
+	synchronize_rcu();
+
+	/* shut the connection down */
+	rds_conn_drop(conn);
+	flush_work(&conn->c_down_w);
+
+	/* make sure lingering queued work won't try to ref the conn */
+	cancel_delayed_work_sync(&conn->c_send_w);
+	cancel_delayed_work_sync(&conn->c_recv_w);
+
+	/* tear down queued messages */
+	list_for_each_entry_safe(rm, rtmp,
+				 &conn->c_send_queue,
+				 m_conn_item) {
+		list_del_init(&rm->m_conn_item);
+		BUG_ON(!list_empty(&rm->m_sock_item));
+		rds_message_put(rm);
+	}
+	if (conn->c_xmit_rm)
+		rds_message_put(conn->c_xmit_rm);
+
+	conn->c_trans->conn_free(conn->c_transport_data);
+
+	/*
+	 * The congestion maps aren't freed up here.  They're
+	 * freed by rds_cong_exit() after all the connections
+	 * have been freed.
+	 */
+	rds_cong_remove_conn(conn);
+
+	BUG_ON(!list_empty(&conn->c_retrans));
+	kmem_cache_free(rds_conn_slab, conn);
+
+	spin_lock_irqsave(&rds_conn_lock, flags);
+	rds_conn_count--;
+	spin_unlock_irqrestore(&rds_conn_lock, flags);
+}
+EXPORT_SYMBOL_GPL(rds_conn_destroy);
+
+static void rds_conn_message_info(struct socket *sock, unsigned int len,
+				  struct rds_info_iterator *iter,
+				  struct rds_info_lengths *lens,
+				  int want_send)
+{
+	struct hlist_head *head;
+	struct list_head *list;
+	struct rds_connection *conn;
+	struct rds_message *rm;
+	unsigned int total = 0;
+	unsigned long flags;
+	size_t i;
+
+	len /= sizeof(struct rds_info_message);
+
+	rcu_read_lock();
+
+	for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash);
+	     i++, head++) {
+		hlist_for_each_entry_rcu(conn, head, c_hash_node) {
+			if (want_send)
+				list = &conn->c_send_queue;
+			else
+				list = &conn->c_retrans;
+
+			spin_lock_irqsave(&conn->c_lock, flags);
+
+			/* XXX too lazy to maintain counts.. */
+			list_for_each_entry(rm, list, m_conn_item) {
+				total++;
+				if (total <= len)
+					rds_inc_info_copy(&rm->m_inc, iter,
+							  conn->c_laddr,
+							  conn->c_faddr, 0);
+			}
+
+			spin_unlock_irqrestore(&conn->c_lock, flags);
+		}
+	}
+	rcu_read_unlock();
+
+	lens->nr = total;
+	lens->each = sizeof(struct rds_info_message);
+}
+
+static void rds_conn_message_info_send(struct socket *sock, unsigned int len,
+				       struct rds_info_iterator *iter,
+				       struct rds_info_lengths *lens)
+{
+	rds_conn_message_info(sock, len, iter, lens, 1);
+}
+
+static void rds_conn_message_info_retrans(struct socket *sock,
+					  unsigned int len,
+					  struct rds_info_iterator *iter,
+					  struct rds_info_lengths *lens)
+{
+	rds_conn_message_info(sock, len, iter, lens, 0);
+}
+
+void rds_for_each_conn_info(struct socket *sock, unsigned int len,
+			  struct rds_info_iterator *iter,
+			  struct rds_info_lengths *lens,
+			  int (*visitor)(struct rds_connection *, void *),
+			  size_t item_len)
+{
+	uint64_t buffer[(item_len + 7) / 8];
+	struct hlist_head *head;
+	struct rds_connection *conn;
+	size_t i;
+
+	rcu_read_lock();
+
+	lens->nr = 0;
+	lens->each = item_len;
+
+	for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash);
+	     i++, head++) {
+		hlist_for_each_entry_rcu(conn, head, c_hash_node) {
+
+			/* XXX no c_lock usage.. */
+			if (!visitor(conn, buffer))
+				continue;
+
+			/* We copy as much as we can fit in the buffer,
+			 * but we count all items so that the caller
+			 * can resize the buffer. */
+			if (len >= item_len) {
+				rds_info_copy(iter, buffer, item_len);
+				len -= item_len;
+			}
+			lens->nr++;
+		}
+	}
+	rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(rds_for_each_conn_info);
+
+static int rds_conn_info_visitor(struct rds_connection *conn,
+				  void *buffer)
+{
+	struct rds_info_connection *cinfo = buffer;
+
+	cinfo->next_tx_seq = conn->c_next_tx_seq;
+	cinfo->next_rx_seq = conn->c_next_rx_seq;
+	cinfo->laddr = conn->c_laddr;
+	cinfo->faddr = conn->c_faddr;
+	strncpy(cinfo->transport, conn->c_trans->t_name,
+		sizeof(cinfo->transport));
+	cinfo->flags = 0;
+
+	rds_conn_info_set(cinfo->flags, test_bit(RDS_IN_XMIT, &conn->c_flags),
+			  SENDING);
+	/* XXX Future: return the state rather than these funky bits */
+	rds_conn_info_set(cinfo->flags,
+			  atomic_read(&conn->c_state) == RDS_CONN_CONNECTING,
+			  CONNECTING);
+	rds_conn_info_set(cinfo->flags,
+			  atomic_read(&conn->c_state) == RDS_CONN_UP,
+			  CONNECTED);
+	return 1;
+}
+
+static void rds_conn_info(struct socket *sock, unsigned int len,
+			  struct rds_info_iterator *iter,
+			  struct rds_info_lengths *lens)
+{
+	rds_for_each_conn_info(sock, len, iter, lens,
+				rds_conn_info_visitor,
+				sizeof(struct rds_info_connection));
+}
+
+int rds_conn_init(void)
+{
+	rds_conn_slab = kmem_cache_create("rds_connection",
+					  sizeof(struct rds_connection),
+					  0, 0, NULL);
+	if (!rds_conn_slab)
+		return -ENOMEM;
+
+	rds_info_register_func(RDS_INFO_CONNECTIONS, rds_conn_info);
+	rds_info_register_func(RDS_INFO_SEND_MESSAGES,
+			       rds_conn_message_info_send);
+	rds_info_register_func(RDS_INFO_RETRANS_MESSAGES,
+			       rds_conn_message_info_retrans);
+
+	return 0;
+}
+
+void rds_conn_exit(void)
+{
+	rds_loop_exit();
+
+	WARN_ON(!hlist_empty(rds_conn_hash));
+
+	kmem_cache_destroy(rds_conn_slab);
+
+	rds_info_deregister_func(RDS_INFO_CONNECTIONS, rds_conn_info);
+	rds_info_deregister_func(RDS_INFO_SEND_MESSAGES,
+				 rds_conn_message_info_send);
+	rds_info_deregister_func(RDS_INFO_RETRANS_MESSAGES,
+				 rds_conn_message_info_retrans);
+}
+
+/*
+ * Force a disconnect
+ */
+void rds_conn_drop(struct rds_connection *conn)
+{
+	atomic_set(&conn->c_state, RDS_CONN_ERROR);
+	queue_work(rds_wq, &conn->c_down_w);
+}
+EXPORT_SYMBOL_GPL(rds_conn_drop);
+
+/*
+ * If the connection is down, trigger a connect. We may have scheduled a
+ * delayed reconnect however - in this case we should not interfere.
+ */
+void rds_conn_connect_if_down(struct rds_connection *conn)
+{
+	if (rds_conn_state(conn) == RDS_CONN_DOWN &&
+	    !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags))
+		queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
+}
+EXPORT_SYMBOL_GPL(rds_conn_connect_if_down);
+
+/*
+ * An error occurred on the connection
+ */
+void
+__rds_conn_error(struct rds_connection *conn, const char *fmt, ...)
+{
+	va_list ap;
+
+	va_start(ap, fmt);
+	vprintk(fmt, ap);
+	va_end(ap);
+
+	rds_conn_drop(conn);
+}
diff --git a/net/rds/ib.c b/net/rds/ib.c
new file mode 100644
index 0000000..ba2dffe
--- /dev/null
+++ b/net/rds/ib.c
@@ -0,0 +1,435 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/in.h>
+#include <linux/if.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/if_arp.h>
+#include <linux/delay.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+
+#include "rds.h"
+#include "ib.h"
+
+static unsigned int fmr_pool_size = RDS_FMR_POOL_SIZE;
+unsigned int fmr_message_size = RDS_FMR_SIZE + 1; /* +1 allows for unaligned MRs */
+unsigned int rds_ib_retry_count = RDS_IB_DEFAULT_RETRY_COUNT;
+
+module_param(fmr_pool_size, int, 0444);
+MODULE_PARM_DESC(fmr_pool_size, " Max number of fmr per HCA");
+module_param(fmr_message_size, int, 0444);
+MODULE_PARM_DESC(fmr_message_size, " Max size of a RDMA transfer");
+module_param(rds_ib_retry_count, int, 0444);
+MODULE_PARM_DESC(rds_ib_retry_count, " Number of hw retries before reporting an error");
+
+/*
+ * we have a clumsy combination of RCU and a rwsem protecting this list
+ * because it is used both in the get_mr fast path and while blocking in
+ * the FMR flushing path.
+ */
+DECLARE_RWSEM(rds_ib_devices_lock);
+struct list_head rds_ib_devices;
+
+/* NOTE: if also grabbing ibdev lock, grab this first */
+DEFINE_SPINLOCK(ib_nodev_conns_lock);
+LIST_HEAD(ib_nodev_conns);
+
+static void rds_ib_nodev_connect(void)
+{
+	struct rds_ib_connection *ic;
+
+	spin_lock(&ib_nodev_conns_lock);
+	list_for_each_entry(ic, &ib_nodev_conns, ib_node)
+		rds_conn_connect_if_down(ic->conn);
+	spin_unlock(&ib_nodev_conns_lock);
+}
+
+static void rds_ib_dev_shutdown(struct rds_ib_device *rds_ibdev)
+{
+	struct rds_ib_connection *ic;
+	unsigned long flags;
+
+	spin_lock_irqsave(&rds_ibdev->spinlock, flags);
+	list_for_each_entry(ic, &rds_ibdev->conn_list, ib_node)
+		rds_conn_drop(ic->conn);
+	spin_unlock_irqrestore(&rds_ibdev->spinlock, flags);
+}
+
+/*
+ * rds_ib_destroy_mr_pool() blocks on a few things and mrs drop references
+ * from interrupt context so we push freing off into a work struct in krdsd.
+ */
+static void rds_ib_dev_free(struct work_struct *work)
+{
+	struct rds_ib_ipaddr *i_ipaddr, *i_next;
+	struct rds_ib_device *rds_ibdev = container_of(work,
+					struct rds_ib_device, free_work);
+
+	if (rds_ibdev->mr_pool)
+		rds_ib_destroy_mr_pool(rds_ibdev->mr_pool);
+	if (rds_ibdev->mr)
+		ib_dereg_mr(rds_ibdev->mr);
+	if (rds_ibdev->pd)
+		ib_dealloc_pd(rds_ibdev->pd);
+
+	list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) {
+		list_del(&i_ipaddr->list);
+		kfree(i_ipaddr);
+	}
+
+	kfree(rds_ibdev);
+}
+
+void rds_ib_dev_put(struct rds_ib_device *rds_ibdev)
+{
+	BUG_ON(atomic_read(&rds_ibdev->refcount) <= 0);
+	if (atomic_dec_and_test(&rds_ibdev->refcount))
+		queue_work(rds_wq, &rds_ibdev->free_work);
+}
+
+static void rds_ib_add_one(struct ib_device *device)
+{
+	struct rds_ib_device *rds_ibdev;
+	struct ib_device_attr *dev_attr;
+
+	/* Only handle IB (no iWARP) devices */
+	if (device->node_type != RDMA_NODE_IB_CA)
+		return;
+
+	dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL);
+	if (!dev_attr)
+		return;
+
+	if (ib_query_device(device, dev_attr)) {
+		rdsdebug("Query device failed for %s\n", device->name);
+		goto free_attr;
+	}
+
+	rds_ibdev = kzalloc_node(sizeof(struct rds_ib_device), GFP_KERNEL,
+				 ibdev_to_node(device));
+	if (!rds_ibdev)
+		goto free_attr;
+
+	spin_lock_init(&rds_ibdev->spinlock);
+	atomic_set(&rds_ibdev->refcount, 1);
+	INIT_WORK(&rds_ibdev->free_work, rds_ib_dev_free);
+
+	rds_ibdev->max_wrs = dev_attr->max_qp_wr;
+	rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE);
+
+	rds_ibdev->fmr_max_remaps = dev_attr->max_map_per_fmr?: 32;
+	rds_ibdev->max_fmrs = dev_attr->max_fmr ?
+			min_t(unsigned int, dev_attr->max_fmr, fmr_pool_size) :
+			fmr_pool_size;
+
+	rds_ibdev->max_initiator_depth = dev_attr->max_qp_init_rd_atom;
+	rds_ibdev->max_responder_resources = dev_attr->max_qp_rd_atom;
+
+	rds_ibdev->dev = device;
+	rds_ibdev->pd = ib_alloc_pd(device);
+	if (IS_ERR(rds_ibdev->pd)) {
+		rds_ibdev->pd = NULL;
+		goto put_dev;
+	}
+
+	rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd, IB_ACCESS_LOCAL_WRITE);
+	if (IS_ERR(rds_ibdev->mr)) {
+		rds_ibdev->mr = NULL;
+		goto put_dev;
+	}
+
+	rds_ibdev->mr_pool = rds_ib_create_mr_pool(rds_ibdev);
+	if (IS_ERR(rds_ibdev->mr_pool)) {
+		rds_ibdev->mr_pool = NULL;
+		goto put_dev;
+	}
+
+	INIT_LIST_HEAD(&rds_ibdev->ipaddr_list);
+	INIT_LIST_HEAD(&rds_ibdev->conn_list);
+
+	down_write(&rds_ib_devices_lock);
+	list_add_tail_rcu(&rds_ibdev->list, &rds_ib_devices);
+	up_write(&rds_ib_devices_lock);
+	atomic_inc(&rds_ibdev->refcount);
+
+	ib_set_client_data(device, &rds_ib_client, rds_ibdev);
+	atomic_inc(&rds_ibdev->refcount);
+
+	rds_ib_nodev_connect();
+
+put_dev:
+	rds_ib_dev_put(rds_ibdev);
+free_attr:
+	kfree(dev_attr);
+}
+
+/*
+ * New connections use this to find the device to associate with the
+ * connection.  It's not in the fast path so we're not concerned about the
+ * performance of the IB call.  (As of this writing, it uses an interrupt
+ * blocking spinlock to serialize walking a per-device list of all registered
+ * clients.)
+ *
+ * RCU is used to handle incoming connections racing with device teardown.
+ * Rather than use a lock to serialize removal from the client_data and
+ * getting a new reference, we use an RCU grace period.  The destruction
+ * path removes the device from client_data and then waits for all RCU
+ * readers to finish.
+ *
+ * A new connection can get NULL from this if its arriving on a
+ * device that is in the process of being removed.
+ */
+struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device)
+{
+	struct rds_ib_device *rds_ibdev;
+
+	rcu_read_lock();
+	rds_ibdev = ib_get_client_data(device, &rds_ib_client);
+	if (rds_ibdev)
+		atomic_inc(&rds_ibdev->refcount);
+	rcu_read_unlock();
+	return rds_ibdev;
+}
+
+/*
+ * The IB stack is letting us know that a device is going away.  This can
+ * happen if the underlying HCA driver is removed or if PCI hotplug is removing
+ * the pci function, for example.
+ *
+ * This can be called at any time and can be racing with any other RDS path.
+ */
+static void rds_ib_remove_one(struct ib_device *device)
+{
+	struct rds_ib_device *rds_ibdev;
+
+	rds_ibdev = ib_get_client_data(device, &rds_ib_client);
+	if (!rds_ibdev)
+		return;
+
+	rds_ib_dev_shutdown(rds_ibdev);
+
+	/* stop connection attempts from getting a reference to this device. */
+	ib_set_client_data(device, &rds_ib_client, NULL);
+
+	down_write(&rds_ib_devices_lock);
+	list_del_rcu(&rds_ibdev->list);
+	up_write(&rds_ib_devices_lock);
+
+	/*
+	 * This synchronize rcu is waiting for readers of both the ib
+	 * client data and the devices list to finish before we drop
+	 * both of those references.
+	 */
+	synchronize_rcu();
+	rds_ib_dev_put(rds_ibdev);
+	rds_ib_dev_put(rds_ibdev);
+}
+
+struct ib_client rds_ib_client = {
+	.name   = "rds_ib",
+	.add    = rds_ib_add_one,
+	.remove = rds_ib_remove_one
+};
+
+static int rds_ib_conn_info_visitor(struct rds_connection *conn,
+				    void *buffer)
+{
+	struct rds_info_rdma_connection *iinfo = buffer;
+	struct rds_ib_connection *ic;
+
+	/* We will only ever look at IB transports */
+	if (conn->c_trans != &rds_ib_transport)
+		return 0;
+
+	iinfo->src_addr = conn->c_laddr;
+	iinfo->dst_addr = conn->c_faddr;
+
+	memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid));
+	memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid));
+	if (rds_conn_state(conn) == RDS_CONN_UP) {
+		struct rds_ib_device *rds_ibdev;
+		struct rdma_dev_addr *dev_addr;
+
+		ic = conn->c_transport_data;
+		dev_addr = &ic->i_cm_id->route.addr.dev_addr;
+
+		rdma_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid);
+		rdma_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid);
+
+		rds_ibdev = ic->rds_ibdev;
+		iinfo->max_send_wr = ic->i_send_ring.w_nr;
+		iinfo->max_recv_wr = ic->i_recv_ring.w_nr;
+		iinfo->max_send_sge = rds_ibdev->max_sge;
+		rds_ib_get_mr_info(rds_ibdev, iinfo);
+	}
+	return 1;
+}
+
+static void rds_ib_ic_info(struct socket *sock, unsigned int len,
+			   struct rds_info_iterator *iter,
+			   struct rds_info_lengths *lens)
+{
+	rds_for_each_conn_info(sock, len, iter, lens,
+				rds_ib_conn_info_visitor,
+				sizeof(struct rds_info_rdma_connection));
+}
+
+
+/*
+ * Early RDS/IB was built to only bind to an address if there is an IPoIB
+ * device with that address set.
+ *
+ * If it were me, I'd advocate for something more flexible.  Sending and
+ * receiving should be device-agnostic.  Transports would try and maintain
+ * connections between peers who have messages queued.  Userspace would be
+ * allowed to influence which paths have priority.  We could call userspace
+ * asserting this policy "routing".
+ */
+static int rds_ib_laddr_check(__be32 addr)
+{
+	int ret;
+	struct rdma_cm_id *cm_id;
+	struct sockaddr_in sin;
+
+	/* Create a CMA ID and try to bind it. This catches both
+	 * IB and iWARP capable NICs.
+	 */
+	cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP, IB_QPT_RC);
+	if (IS_ERR(cm_id))
+		return PTR_ERR(cm_id);
+
+	memset(&sin, 0, sizeof(sin));
+	sin.sin_family = AF_INET;
+	sin.sin_addr.s_addr = addr;
+
+	/* rdma_bind_addr will only succeed for IB & iWARP devices */
+	ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
+	/* due to this, we will claim to support iWARP devices unless we
+	   check node_type. */
+	if (ret || !cm_id->device ||
+	    cm_id->device->node_type != RDMA_NODE_IB_CA)
+		ret = -EADDRNOTAVAIL;
+
+	rdsdebug("addr %pI4 ret %d node type %d\n",
+		&addr, ret,
+		cm_id->device ? cm_id->device->node_type : -1);
+
+	rdma_destroy_id(cm_id);
+
+	return ret;
+}
+
+static void rds_ib_unregister_client(void)
+{
+	ib_unregister_client(&rds_ib_client);
+	/* wait for rds_ib_dev_free() to complete */
+	flush_workqueue(rds_wq);
+}
+
+void rds_ib_exit(void)
+{
+	rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
+	rds_ib_unregister_client();
+	rds_ib_destroy_nodev_conns();
+	rds_ib_sysctl_exit();
+	rds_ib_recv_exit();
+	rds_trans_unregister(&rds_ib_transport);
+}
+
+struct rds_transport rds_ib_transport = {
+	.laddr_check		= rds_ib_laddr_check,
+	.xmit_complete		= rds_ib_xmit_complete,
+	.xmit			= rds_ib_xmit,
+	.xmit_rdma		= rds_ib_xmit_rdma,
+	.xmit_atomic		= rds_ib_xmit_atomic,
+	.recv			= rds_ib_recv,
+	.conn_alloc		= rds_ib_conn_alloc,
+	.conn_free		= rds_ib_conn_free,
+	.conn_connect		= rds_ib_conn_connect,
+	.conn_shutdown		= rds_ib_conn_shutdown,
+	.inc_copy_to_user	= rds_ib_inc_copy_to_user,
+	.inc_free		= rds_ib_inc_free,
+	.cm_initiate_connect	= rds_ib_cm_initiate_connect,
+	.cm_handle_connect	= rds_ib_cm_handle_connect,
+	.cm_connect_complete	= rds_ib_cm_connect_complete,
+	.stats_info_copy	= rds_ib_stats_info_copy,
+	.exit			= rds_ib_exit,
+	.get_mr			= rds_ib_get_mr,
+	.sync_mr		= rds_ib_sync_mr,
+	.free_mr		= rds_ib_free_mr,
+	.flush_mrs		= rds_ib_flush_mrs,
+	.t_owner		= THIS_MODULE,
+	.t_name			= "infiniband",
+	.t_type			= RDS_TRANS_IB
+};
+
+int rds_ib_init(void)
+{
+	int ret;
+
+	INIT_LIST_HEAD(&rds_ib_devices);
+
+	ret = ib_register_client(&rds_ib_client);
+	if (ret)
+		goto out;
+
+	ret = rds_ib_sysctl_init();
+	if (ret)
+		goto out_ibreg;
+
+	ret = rds_ib_recv_init();
+	if (ret)
+		goto out_sysctl;
+
+	ret = rds_trans_register(&rds_ib_transport);
+	if (ret)
+		goto out_recv;
+
+	rds_info_register_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
+
+	goto out;
+
+out_recv:
+	rds_ib_recv_exit();
+out_sysctl:
+	rds_ib_sysctl_exit();
+out_ibreg:
+	rds_ib_unregister_client();
+out:
+	return ret;
+}
+
+MODULE_LICENSE("GPL");
+
diff --git a/net/rds/ib.h b/net/rds/ib.h
new file mode 100644
index 0000000..7280ab8
--- /dev/null
+++ b/net/rds/ib.h
@@ -0,0 +1,373 @@
+#ifndef _RDS_IB_H
+#define _RDS_IB_H
+
+#include <rdma/ib_verbs.h>
+#include <rdma/rdma_cm.h>
+#include <linux/interrupt.h>
+#include <linux/pci.h>
+#include <linux/slab.h>
+#include "rds.h"
+#include "rdma_transport.h"
+
+#define RDS_FMR_SIZE			256
+#define RDS_FMR_POOL_SIZE		8192
+
+#define RDS_IB_MAX_SGE			8
+#define RDS_IB_RECV_SGE 		2
+
+#define RDS_IB_DEFAULT_RECV_WR		1024
+#define RDS_IB_DEFAULT_SEND_WR		256
+
+#define RDS_IB_DEFAULT_RETRY_COUNT	2
+
+#define RDS_IB_SUPPORTED_PROTOCOLS	0x00000003	/* minor versions supported */
+
+#define RDS_IB_RECYCLE_BATCH_COUNT	32
+
+extern struct rw_semaphore rds_ib_devices_lock;
+extern struct list_head rds_ib_devices;
+
+/*
+ * IB posts RDS_FRAG_SIZE fragments of pages to the receive queues to
+ * try and minimize the amount of memory tied up both the device and
+ * socket receive queues.
+ */
+struct rds_page_frag {
+	struct list_head	f_item;
+	struct list_head	f_cache_entry;
+	struct scatterlist	f_sg;
+};
+
+struct rds_ib_incoming {
+	struct list_head	ii_frags;
+	struct list_head	ii_cache_entry;
+	struct rds_incoming	ii_inc;
+};
+
+struct rds_ib_cache_head {
+	struct list_head *first;
+	unsigned long count;
+};
+
+struct rds_ib_refill_cache {
+	struct rds_ib_cache_head __percpu *percpu;
+	struct list_head	 *xfer;
+	struct list_head	 *ready;
+};
+
+struct rds_ib_connect_private {
+	/* Add new fields at the end, and don't permute existing fields. */
+	__be32			dp_saddr;
+	__be32			dp_daddr;
+	u8			dp_protocol_major;
+	u8			dp_protocol_minor;
+	__be16			dp_protocol_minor_mask; /* bitmask */
+	__be32			dp_reserved1;
+	__be64			dp_ack_seq;
+	__be32			dp_credit;		/* non-zero enables flow ctl */
+};
+
+struct rds_ib_send_work {
+	void			*s_op;
+	struct ib_send_wr	s_wr;
+	struct ib_sge		s_sge[RDS_IB_MAX_SGE];
+	unsigned long		s_queued;
+};
+
+struct rds_ib_recv_work {
+	struct rds_ib_incoming 	*r_ibinc;
+	struct rds_page_frag	*r_frag;
+	struct ib_recv_wr	r_wr;
+	struct ib_sge		r_sge[2];
+};
+
+struct rds_ib_work_ring {
+	u32		w_nr;
+	u32		w_alloc_ptr;
+	u32		w_alloc_ctr;
+	u32		w_free_ptr;
+	atomic_t	w_free_ctr;
+};
+
+struct rds_ib_device;
+
+struct rds_ib_connection {
+
+	struct list_head	ib_node;
+	struct rds_ib_device	*rds_ibdev;
+	struct rds_connection	*conn;
+
+	/* alphabet soup, IBTA style */
+	struct rdma_cm_id	*i_cm_id;
+	struct ib_pd		*i_pd;
+	struct ib_mr		*i_mr;
+	struct ib_cq		*i_send_cq;
+	struct ib_cq		*i_recv_cq;
+
+	/* tx */
+	struct rds_ib_work_ring	i_send_ring;
+	struct rm_data_op	*i_data_op;
+	struct rds_header	*i_send_hdrs;
+	u64			i_send_hdrs_dma;
+	struct rds_ib_send_work *i_sends;
+	atomic_t		i_signaled_sends;
+
+	/* rx */
+	struct tasklet_struct	i_recv_tasklet;
+	struct mutex		i_recv_mutex;
+	struct rds_ib_work_ring	i_recv_ring;
+	struct rds_ib_incoming	*i_ibinc;
+	u32			i_recv_data_rem;
+	struct rds_header	*i_recv_hdrs;
+	u64			i_recv_hdrs_dma;
+	struct rds_ib_recv_work *i_recvs;
+	u64			i_ack_recv;	/* last ACK received */
+	struct rds_ib_refill_cache i_cache_incs;
+	struct rds_ib_refill_cache i_cache_frags;
+
+	/* sending acks */
+	unsigned long		i_ack_flags;
+#ifdef KERNEL_HAS_ATOMIC64
+	atomic64_t		i_ack_next;	/* next ACK to send */
+#else
+	spinlock_t		i_ack_lock;	/* protect i_ack_next */
+	u64			i_ack_next;	/* next ACK to send */
+#endif
+	struct rds_header	*i_ack;
+	struct ib_send_wr	i_ack_wr;
+	struct ib_sge		i_ack_sge;
+	u64			i_ack_dma;
+	unsigned long		i_ack_queued;
+
+	/* Flow control related information
+	 *
+	 * Our algorithm uses a pair variables that we need to access
+	 * atomically - one for the send credits, and one posted
+	 * recv credits we need to transfer to remote.
+	 * Rather than protect them using a slow spinlock, we put both into
+	 * a single atomic_t and update it using cmpxchg
+	 */
+	atomic_t		i_credits;
+
+	/* Protocol version specific information */
+	unsigned int		i_flowctl:1;	/* enable/disable flow ctl */
+
+	/* Batched completions */
+	unsigned int		i_unsignaled_wrs;
+};
+
+/* This assumes that atomic_t is at least 32 bits */
+#define IB_GET_SEND_CREDITS(v)	((v) & 0xffff)
+#define IB_GET_POST_CREDITS(v)	((v) >> 16)
+#define IB_SET_SEND_CREDITS(v)	((v) & 0xffff)
+#define IB_SET_POST_CREDITS(v)	((v) << 16)
+
+struct rds_ib_ipaddr {
+	struct list_head	list;
+	__be32			ipaddr;
+};
+
+struct rds_ib_device {
+	struct list_head	list;
+	struct list_head	ipaddr_list;
+	struct list_head	conn_list;
+	struct ib_device	*dev;
+	struct ib_pd		*pd;
+	struct ib_mr		*mr;
+	struct rds_ib_mr_pool	*mr_pool;
+	unsigned int		fmr_max_remaps;
+	unsigned int		max_fmrs;
+	int			max_sge;
+	unsigned int		max_wrs;
+	unsigned int		max_initiator_depth;
+	unsigned int		max_responder_resources;
+	spinlock_t		spinlock;	/* protect the above */
+	atomic_t		refcount;
+	struct work_struct	free_work;
+};
+
+#define ibdev_to_node(ibdev) dev_to_node(ibdev->dma_device)
+#define rdsibdev_to_node(rdsibdev) ibdev_to_node(rdsibdev->dev)
+
+/* bits for i_ack_flags */
+#define IB_ACK_IN_FLIGHT	0
+#define IB_ACK_REQUESTED	1
+
+/* Magic WR_ID for ACKs */
+#define RDS_IB_ACK_WR_ID	(~(u64) 0)
+
+struct rds_ib_statistics {
+	uint64_t	s_ib_connect_raced;
+	uint64_t	s_ib_listen_closed_stale;
+	uint64_t	s_ib_tx_cq_call;
+	uint64_t	s_ib_tx_cq_event;
+	uint64_t	s_ib_tx_ring_full;
+	uint64_t	s_ib_tx_throttle;
+	uint64_t	s_ib_tx_sg_mapping_failure;
+	uint64_t	s_ib_tx_stalled;
+	uint64_t	s_ib_tx_credit_updates;
+	uint64_t	s_ib_rx_cq_call;
+	uint64_t	s_ib_rx_cq_event;
+	uint64_t	s_ib_rx_ring_empty;
+	uint64_t	s_ib_rx_refill_from_cq;
+	uint64_t	s_ib_rx_refill_from_thread;
+	uint64_t	s_ib_rx_alloc_limit;
+	uint64_t	s_ib_rx_credit_updates;
+	uint64_t	s_ib_ack_sent;
+	uint64_t	s_ib_ack_send_failure;
+	uint64_t	s_ib_ack_send_delayed;
+	uint64_t	s_ib_ack_send_piggybacked;
+	uint64_t	s_ib_ack_received;
+	uint64_t	s_ib_rdma_mr_alloc;
+	uint64_t	s_ib_rdma_mr_free;
+	uint64_t	s_ib_rdma_mr_used;
+	uint64_t	s_ib_rdma_mr_pool_flush;
+	uint64_t	s_ib_rdma_mr_pool_wait;
+	uint64_t	s_ib_rdma_mr_pool_depleted;
+	uint64_t	s_ib_atomic_cswp;
+	uint64_t	s_ib_atomic_fadd;
+};
+
+extern struct workqueue_struct *rds_ib_wq;
+
+/*
+ * Fake ib_dma_sync_sg_for_{cpu,device} as long as ib_verbs.h
+ * doesn't define it.
+ */
+static inline void rds_ib_dma_sync_sg_for_cpu(struct ib_device *dev,
+		struct scatterlist *sg, unsigned int sg_dma_len, int direction)
+{
+	unsigned int i;
+
+	for (i = 0; i < sg_dma_len; ++i) {
+		ib_dma_sync_single_for_cpu(dev,
+				ib_sg_dma_address(dev, &sg[i]),
+				ib_sg_dma_len(dev, &sg[i]),
+				direction);
+	}
+}
+#define ib_dma_sync_sg_for_cpu	rds_ib_dma_sync_sg_for_cpu
+
+static inline void rds_ib_dma_sync_sg_for_device(struct ib_device *dev,
+		struct scatterlist *sg, unsigned int sg_dma_len, int direction)
+{
+	unsigned int i;
+
+	for (i = 0; i < sg_dma_len; ++i) {
+		ib_dma_sync_single_for_device(dev,
+				ib_sg_dma_address(dev, &sg[i]),
+				ib_sg_dma_len(dev, &sg[i]),
+				direction);
+	}
+}
+#define ib_dma_sync_sg_for_device	rds_ib_dma_sync_sg_for_device
+
+
+/* ib.c */
+extern struct rds_transport rds_ib_transport;
+struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device);
+void rds_ib_dev_put(struct rds_ib_device *rds_ibdev);
+extern struct ib_client rds_ib_client;
+
+extern unsigned int fmr_message_size;
+extern unsigned int rds_ib_retry_count;
+
+extern spinlock_t ib_nodev_conns_lock;
+extern struct list_head ib_nodev_conns;
+
+/* ib_cm.c */
+int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp);
+void rds_ib_conn_free(void *arg);
+int rds_ib_conn_connect(struct rds_connection *conn);
+void rds_ib_conn_shutdown(struct rds_connection *conn);
+void rds_ib_state_change(struct sock *sk);
+int rds_ib_listen_init(void);
+void rds_ib_listen_stop(void);
+void __rds_ib_conn_error(struct rds_connection *conn, const char *, ...);
+int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
+			     struct rdma_cm_event *event);
+int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id);
+void rds_ib_cm_connect_complete(struct rds_connection *conn,
+				struct rdma_cm_event *event);
+
+
+#define rds_ib_conn_error(conn, fmt...) \
+	__rds_ib_conn_error(conn, KERN_WARNING "RDS/IB: " fmt)
+
+/* ib_rdma.c */
+int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr);
+void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
+void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
+void rds_ib_destroy_nodev_conns(void);
+struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *);
+void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo);
+void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *);
+void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
+		    struct rds_sock *rs, u32 *key_ret);
+void rds_ib_sync_mr(void *trans_private, int dir);
+void rds_ib_free_mr(void *trans_private, int invalidate);
+void rds_ib_flush_mrs(void);
+
+/* ib_recv.c */
+int rds_ib_recv_init(void);
+void rds_ib_recv_exit(void);
+int rds_ib_recv(struct rds_connection *conn);
+int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic);
+void rds_ib_recv_free_caches(struct rds_ib_connection *ic);
+void rds_ib_recv_refill(struct rds_connection *conn, int prefill);
+void rds_ib_inc_free(struct rds_incoming *inc);
+int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov,
+			     size_t size);
+void rds_ib_recv_cq_comp_handler(struct ib_cq *cq, void *context);
+void rds_ib_recv_tasklet_fn(unsigned long data);
+void rds_ib_recv_init_ring(struct rds_ib_connection *ic);
+void rds_ib_recv_clear_ring(struct rds_ib_connection *ic);
+void rds_ib_recv_init_ack(struct rds_ib_connection *ic);
+void rds_ib_attempt_ack(struct rds_ib_connection *ic);
+void rds_ib_ack_send_complete(struct rds_ib_connection *ic);
+u64 rds_ib_piggyb_ack(struct rds_ib_connection *ic);
+
+/* ib_ring.c */
+void rds_ib_ring_init(struct rds_ib_work_ring *ring, u32 nr);
+void rds_ib_ring_resize(struct rds_ib_work_ring *ring, u32 nr);
+u32 rds_ib_ring_alloc(struct rds_ib_work_ring *ring, u32 val, u32 *pos);
+void rds_ib_ring_free(struct rds_ib_work_ring *ring, u32 val);
+void rds_ib_ring_unalloc(struct rds_ib_work_ring *ring, u32 val);
+int rds_ib_ring_empty(struct rds_ib_work_ring *ring);
+int rds_ib_ring_low(struct rds_ib_work_ring *ring);
+u32 rds_ib_ring_oldest(struct rds_ib_work_ring *ring);
+u32 rds_ib_ring_completed(struct rds_ib_work_ring *ring, u32 wr_id, u32 oldest);
+extern wait_queue_head_t rds_ib_ring_empty_wait;
+
+/* ib_send.c */
+char *rds_ib_wc_status_str(enum ib_wc_status status);
+void rds_ib_xmit_complete(struct rds_connection *conn);
+int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
+		unsigned int hdr_off, unsigned int sg, unsigned int off);
+void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context);
+void rds_ib_send_init_ring(struct rds_ib_connection *ic);
+void rds_ib_send_clear_ring(struct rds_ib_connection *ic);
+int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op);
+void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits);
+void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted);
+int rds_ib_send_grab_credits(struct rds_ib_connection *ic, u32 wanted,
+			     u32 *adv_credits, int need_posted, int max_posted);
+int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op);
+
+/* ib_stats.c */
+DECLARE_PER_CPU(struct rds_ib_statistics, rds_ib_stats);
+#define rds_ib_stats_inc(member) rds_stats_inc_which(rds_ib_stats, member)
+unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter,
+				    unsigned int avail);
+
+/* ib_sysctl.c */
+int rds_ib_sysctl_init(void);
+void rds_ib_sysctl_exit(void);
+extern unsigned long rds_ib_sysctl_max_send_wr;
+extern unsigned long rds_ib_sysctl_max_recv_wr;
+extern unsigned long rds_ib_sysctl_max_unsig_wrs;
+extern unsigned long rds_ib_sysctl_max_unsig_bytes;
+extern unsigned long rds_ib_sysctl_max_recv_allocation;
+extern unsigned int rds_ib_sysctl_flow_control;
+
+#endif
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
new file mode 100644
index 0000000..31b74f5
--- /dev/null
+++ b/net/rds/ib_cm.c
@@ -0,0 +1,829 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/in.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/ratelimit.h>
+
+#include "rds.h"
+#include "ib.h"
+
+static char *rds_ib_event_type_strings[] = {
+#define RDS_IB_EVENT_STRING(foo) \
+		[IB_EVENT_##foo] = __stringify(IB_EVENT_##foo)
+	RDS_IB_EVENT_STRING(CQ_ERR),
+	RDS_IB_EVENT_STRING(QP_FATAL),
+	RDS_IB_EVENT_STRING(QP_REQ_ERR),
+	RDS_IB_EVENT_STRING(QP_ACCESS_ERR),
+	RDS_IB_EVENT_STRING(COMM_EST),
+	RDS_IB_EVENT_STRING(SQ_DRAINED),
+	RDS_IB_EVENT_STRING(PATH_MIG),
+	RDS_IB_EVENT_STRING(PATH_MIG_ERR),
+	RDS_IB_EVENT_STRING(DEVICE_FATAL),
+	RDS_IB_EVENT_STRING(PORT_ACTIVE),
+	RDS_IB_EVENT_STRING(PORT_ERR),
+	RDS_IB_EVENT_STRING(LID_CHANGE),
+	RDS_IB_EVENT_STRING(PKEY_CHANGE),
+	RDS_IB_EVENT_STRING(SM_CHANGE),
+	RDS_IB_EVENT_STRING(SRQ_ERR),
+	RDS_IB_EVENT_STRING(SRQ_LIMIT_REACHED),
+	RDS_IB_EVENT_STRING(QP_LAST_WQE_REACHED),
+	RDS_IB_EVENT_STRING(CLIENT_REREGISTER),
+#undef RDS_IB_EVENT_STRING
+};
+
+static char *rds_ib_event_str(enum ib_event_type type)
+{
+	return rds_str_array(rds_ib_event_type_strings,
+			     ARRAY_SIZE(rds_ib_event_type_strings), type);
+};
+
+/*
+ * Set the selected protocol version
+ */
+static void rds_ib_set_protocol(struct rds_connection *conn, unsigned int version)
+{
+	conn->c_version = version;
+}
+
+/*
+ * Set up flow control
+ */
+static void rds_ib_set_flow_control(struct rds_connection *conn, u32 credits)
+{
+	struct rds_ib_connection *ic = conn->c_transport_data;
+
+	if (rds_ib_sysctl_flow_control && credits != 0) {
+		/* We're doing flow control */
+		ic->i_flowctl = 1;
+		rds_ib_send_add_credits(conn, credits);
+	} else {
+		ic->i_flowctl = 0;
+	}
+}
+
+/*
+ * Tune RNR behavior. Without flow control, we use a rather
+ * low timeout, but not the absolute minimum - this should
+ * be tunable.
+ *
+ * We already set the RNR retry count to 7 (which is the
+ * smallest infinite number :-) above.
+ * If flow control is off, we want to change this back to 0
+ * so that we learn quickly when our credit accounting is
+ * buggy.
+ *
+ * Caller passes in a qp_attr pointer - don't waste stack spacv
+ * by allocation this twice.
+ */
+static void
+rds_ib_tune_rnr(struct rds_ib_connection *ic, struct ib_qp_attr *attr)
+{
+	int ret;
+
+	attr->min_rnr_timer = IB_RNR_TIMER_000_32;
+	ret = ib_modify_qp(ic->i_cm_id->qp, attr, IB_QP_MIN_RNR_TIMER);
+	if (ret)
+		printk(KERN_NOTICE "ib_modify_qp(IB_QP_MIN_RNR_TIMER): err=%d\n", -ret);
+}
+
+/*
+ * Connection established.
+ * We get here for both outgoing and incoming connection.
+ */
+void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_event *event)
+{
+	const struct rds_ib_connect_private *dp = NULL;
+	struct rds_ib_connection *ic = conn->c_transport_data;
+	struct ib_qp_attr qp_attr;
+	int err;
+
+	if (event->param.conn.private_data_len >= sizeof(*dp)) {
+		dp = event->param.conn.private_data;
+
+		/* make sure it isn't empty data */
+		if (dp->dp_protocol_major) {
+			rds_ib_set_protocol(conn,
+				RDS_PROTOCOL(dp->dp_protocol_major,
+				dp->dp_protocol_minor));
+			rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit));
+		}
+	}
+
+	if (conn->c_version < RDS_PROTOCOL(3,1)) {
+		printk(KERN_NOTICE "RDS/IB: Connection to %pI4 version %u.%u failed,"
+		       " no longer supported\n",
+		       &conn->c_faddr,
+		       RDS_PROTOCOL_MAJOR(conn->c_version),
+		       RDS_PROTOCOL_MINOR(conn->c_version));
+		rds_conn_destroy(conn);
+		return;
+	} else {
+		printk(KERN_NOTICE "RDS/IB: connected to %pI4 version %u.%u%s\n",
+		       &conn->c_faddr,
+		       RDS_PROTOCOL_MAJOR(conn->c_version),
+		       RDS_PROTOCOL_MINOR(conn->c_version),
+		       ic->i_flowctl ? ", flow control" : "");
+	}
+
+	/*
+	 * Init rings and fill recv. this needs to wait until protocol negotiation
+	 * is complete, since ring layout is different from 3.0 to 3.1.
+	 */
+	rds_ib_send_init_ring(ic);
+	rds_ib_recv_init_ring(ic);
+	/* Post receive buffers - as a side effect, this will update
+	 * the posted credit count. */
+	rds_ib_recv_refill(conn, 1);
+
+	/* Tune RNR behavior */
+	rds_ib_tune_rnr(ic, &qp_attr);
+
+	qp_attr.qp_state = IB_QPS_RTS;
+	err = ib_modify_qp(ic->i_cm_id->qp, &qp_attr, IB_QP_STATE);
+	if (err)
+		printk(KERN_NOTICE "ib_modify_qp(IB_QP_STATE, RTS): err=%d\n", err);
+
+	/* update ib_device with this local ipaddr */
+	err = rds_ib_update_ipaddr(ic->rds_ibdev, conn->c_laddr);
+	if (err)
+		printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n",
+			err);
+
+	/* If the peer gave us the last packet it saw, process this as if
+	 * we had received a regular ACK. */
+	if (dp && dp->dp_ack_seq)
+		rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL);
+
+	rds_connect_complete(conn);
+}
+
+static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
+			struct rdma_conn_param *conn_param,
+			struct rds_ib_connect_private *dp,
+			u32 protocol_version,
+			u32 max_responder_resources,
+			u32 max_initiator_depth)
+{
+	struct rds_ib_connection *ic = conn->c_transport_data;
+	struct rds_ib_device *rds_ibdev = ic->rds_ibdev;
+
+	memset(conn_param, 0, sizeof(struct rdma_conn_param));
+
+	conn_param->responder_resources =
+		min_t(u32, rds_ibdev->max_responder_resources, max_responder_resources);
+	conn_param->initiator_depth =
+		min_t(u32, rds_ibdev->max_initiator_depth, max_initiator_depth);
+	conn_param->retry_count = min_t(unsigned int, rds_ib_retry_count, 7);
+	conn_param->rnr_retry_count = 7;
+
+	if (dp) {
+		memset(dp, 0, sizeof(*dp));
+		dp->dp_saddr = conn->c_laddr;
+		dp->dp_daddr = conn->c_faddr;
+		dp->dp_protocol_major = RDS_PROTOCOL_MAJOR(protocol_version);
+		dp->dp_protocol_minor = RDS_PROTOCOL_MINOR(protocol_version);
+		dp->dp_protocol_minor_mask = cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS);
+		dp->dp_ack_seq = rds_ib_piggyb_ack(ic);
+
+		/* Advertise flow control */
+		if (ic->i_flowctl) {
+			unsigned int credits;
+
+			credits = IB_GET_POST_CREDITS(atomic_read(&ic->i_credits));
+			dp->dp_credit = cpu_to_be32(credits);
+			atomic_sub(IB_SET_POST_CREDITS(credits), &ic->i_credits);
+		}
+
+		conn_param->private_data = dp;
+		conn_param->private_data_len = sizeof(*dp);
+	}
+}
+
+static void rds_ib_cq_event_handler(struct ib_event *event, void *data)
+{
+	rdsdebug("event %u (%s) data %p\n",
+		 event->event, rds_ib_event_str(event->event), data);
+}
+
+static void rds_ib_qp_event_handler(struct ib_event *event, void *data)
+{
+	struct rds_connection *conn = data;
+	struct rds_ib_connection *ic = conn->c_transport_data;
+
+	rdsdebug("conn %p ic %p event %u (%s)\n", conn, ic, event->event,
+		 rds_ib_event_str(event->event));
+
+	switch (event->event) {
+	case IB_EVENT_COMM_EST:
+		rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST);
+		break;
+	default:
+		rdsdebug("Fatal QP Event %u (%s) "
+			"- connection %pI4->%pI4, reconnecting\n",
+			event->event, rds_ib_event_str(event->event),
+			&conn->c_laddr, &conn->c_faddr);
+		rds_conn_drop(conn);
+		break;
+	}
+}
+
+/*
+ * This needs to be very careful to not leave IS_ERR pointers around for
+ * cleanup to trip over.
+ */
+static int rds_ib_setup_qp(struct rds_connection *conn)
+{
+	struct rds_ib_connection *ic = conn->c_transport_data;
+	struct ib_device *dev = ic->i_cm_id->device;
+	struct ib_qp_init_attr attr;
+	struct rds_ib_device *rds_ibdev;
+	int ret;
+
+	/*
+	 * It's normal to see a null device if an incoming connection races
+	 * with device removal, so we don't print a warning.
+	 */
+	rds_ibdev = rds_ib_get_client_data(dev);
+	if (!rds_ibdev)
+		return -EOPNOTSUPP;
+
+	/* add the conn now so that connection establishment has the dev */
+	rds_ib_add_conn(rds_ibdev, conn);
+
+	if (rds_ibdev->max_wrs < ic->i_send_ring.w_nr + 1)
+		rds_ib_ring_resize(&ic->i_send_ring, rds_ibdev->max_wrs - 1);
+	if (rds_ibdev->max_wrs < ic->i_recv_ring.w_nr + 1)
+		rds_ib_ring_resize(&ic->i_recv_ring, rds_ibdev->max_wrs - 1);
+
+	/* Protection domain and memory range */
+	ic->i_pd = rds_ibdev->pd;
+	ic->i_mr = rds_ibdev->mr;
+
+	ic->i_send_cq = ib_create_cq(dev, rds_ib_send_cq_comp_handler,
+				     rds_ib_cq_event_handler, conn,
+				     ic->i_send_ring.w_nr + 1, 0);
+	if (IS_ERR(ic->i_send_cq)) {
+		ret = PTR_ERR(ic->i_send_cq);
+		ic->i_send_cq = NULL;
+		rdsdebug("ib_create_cq send failed: %d\n", ret);
+		goto out;
+	}
+
+	ic->i_recv_cq = ib_create_cq(dev, rds_ib_recv_cq_comp_handler,
+				     rds_ib_cq_event_handler, conn,
+				     ic->i_recv_ring.w_nr, 0);
+	if (IS_ERR(ic->i_recv_cq)) {
+		ret = PTR_ERR(ic->i_recv_cq);
+		ic->i_recv_cq = NULL;
+		rdsdebug("ib_create_cq recv failed: %d\n", ret);
+		goto out;
+	}
+
+	ret = ib_req_notify_cq(ic->i_send_cq, IB_CQ_NEXT_COMP);
+	if (ret) {
+		rdsdebug("ib_req_notify_cq send failed: %d\n", ret);
+		goto out;
+	}
+
+	ret = ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED);
+	if (ret) {
+		rdsdebug("ib_req_notify_cq recv failed: %d\n", ret);
+		goto out;
+	}
+
+	/* XXX negotiate max send/recv with remote? */
+	memset(&attr, 0, sizeof(attr));
+	attr.event_handler = rds_ib_qp_event_handler;
+	attr.qp_context = conn;
+	/* + 1 to allow for the single ack message */
+	attr.cap.max_send_wr = ic->i_send_ring.w_nr + 1;
+	attr.cap.max_recv_wr = ic->i_recv_ring.w_nr + 1;
+	attr.cap.max_send_sge = rds_ibdev->max_sge;
+	attr.cap.max_recv_sge = RDS_IB_RECV_SGE;
+	attr.sq_sig_type = IB_SIGNAL_REQ_WR;
+	attr.qp_type = IB_QPT_RC;
+	attr.send_cq = ic->i_send_cq;
+	attr.recv_cq = ic->i_recv_cq;
+
+	/*
+	 * XXX this can fail if max_*_wr is too large?  Are we supposed
+	 * to back off until we get a value that the hardware can support?
+	 */
+	ret = rdma_create_qp(ic->i_cm_id, ic->i_pd, &attr);
+	if (ret) {
+		rdsdebug("rdma_create_qp failed: %d\n", ret);
+		goto out;
+	}
+
+	ic->i_send_hdrs = ib_dma_alloc_coherent(dev,
+					   ic->i_send_ring.w_nr *
+						sizeof(struct rds_header),
+					   &ic->i_send_hdrs_dma, GFP_KERNEL);
+	if (!ic->i_send_hdrs) {
+		ret = -ENOMEM;
+		rdsdebug("ib_dma_alloc_coherent send failed\n");
+		goto out;
+	}
+
+	ic->i_recv_hdrs = ib_dma_alloc_coherent(dev,
+					   ic->i_recv_ring.w_nr *
+						sizeof(struct rds_header),
+					   &ic->i_recv_hdrs_dma, GFP_KERNEL);
+	if (!ic->i_recv_hdrs) {
+		ret = -ENOMEM;
+		rdsdebug("ib_dma_alloc_coherent recv failed\n");
+		goto out;
+	}
+
+	ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header),
+				       &ic->i_ack_dma, GFP_KERNEL);
+	if (!ic->i_ack) {
+		ret = -ENOMEM;
+		rdsdebug("ib_dma_alloc_coherent ack failed\n");
+		goto out;
+	}
+
+	ic->i_sends = vzalloc_node(ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work),
+				   ibdev_to_node(dev));
+	if (!ic->i_sends) {
+		ret = -ENOMEM;
+		rdsdebug("send allocation failed\n");
+		goto out;
+	}
+
+	ic->i_recvs = vzalloc_node(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work),
+				   ibdev_to_node(dev));
+	if (!ic->i_recvs) {
+		ret = -ENOMEM;
+		rdsdebug("recv allocation failed\n");
+		goto out;
+	}
+
+	rds_ib_recv_init_ack(ic);
+
+	rdsdebug("conn %p pd %p mr %p cq %p %p\n", conn, ic->i_pd, ic->i_mr,
+		 ic->i_send_cq, ic->i_recv_cq);
+
+out:
+	rds_ib_dev_put(rds_ibdev);
+	return ret;
+}
+
+static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event)
+{
+	const struct rds_ib_connect_private *dp = event->param.conn.private_data;
+	u16 common;
+	u32 version = 0;
+
+	/*
+	 * rdma_cm private data is odd - when there is any private data in the
+	 * request, we will be given a pretty large buffer without telling us the
+	 * original size. The only way to tell the difference is by looking at
+	 * the contents, which are initialized to zero.
+	 * If the protocol version fields aren't set, this is a connection attempt
+	 * from an older version. This could could be 3.0 or 2.0 - we can't tell.
+	 * We really should have changed this for OFED 1.3 :-(
+	 */
+
+	/* Be paranoid. RDS always has privdata */
+	if (!event->param.conn.private_data_len) {
+		printk(KERN_NOTICE "RDS incoming connection has no private data, "
+			"rejecting\n");
+		return 0;
+	}
+
+	/* Even if len is crap *now* I still want to check it. -ASG */
+	if (event->param.conn.private_data_len < sizeof (*dp) ||
+	    dp->dp_protocol_major == 0)
+		return RDS_PROTOCOL_3_0;
+
+	common = be16_to_cpu(dp->dp_protocol_minor_mask) & RDS_IB_SUPPORTED_PROTOCOLS;
+	if (dp->dp_protocol_major == 3 && common) {
+		version = RDS_PROTOCOL_3_0;
+		while ((common >>= 1) != 0)
+			version++;
+	} else
+		printk_ratelimited(KERN_NOTICE "RDS: Connection from %pI4 using incompatible protocol version %u.%u\n",
+				&dp->dp_saddr,
+				dp->dp_protocol_major,
+				dp->dp_protocol_minor);
+	return version;
+}
+
+int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
+				    struct rdma_cm_event *event)
+{
+	__be64 lguid = cm_id->route.path_rec->sgid.global.interface_id;
+	__be64 fguid = cm_id->route.path_rec->dgid.global.interface_id;
+	const struct rds_ib_connect_private *dp = event->param.conn.private_data;
+	struct rds_ib_connect_private dp_rep;
+	struct rds_connection *conn = NULL;
+	struct rds_ib_connection *ic = NULL;
+	struct rdma_conn_param conn_param;
+	u32 version;
+	int err = 1, destroy = 1;
+
+	/* Check whether the remote protocol version matches ours. */
+	version = rds_ib_protocol_compatible(event);
+	if (!version)
+		goto out;
+
+	rdsdebug("saddr %pI4 daddr %pI4 RDSv%u.%u lguid 0x%llx fguid "
+		 "0x%llx\n", &dp->dp_saddr, &dp->dp_daddr,
+		 RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version),
+		 (unsigned long long)be64_to_cpu(lguid),
+		 (unsigned long long)be64_to_cpu(fguid));
+
+	conn = rds_conn_create(dp->dp_daddr, dp->dp_saddr, &rds_ib_transport,
+			       GFP_KERNEL);
+	if (IS_ERR(conn)) {
+		rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn));
+		conn = NULL;
+		goto out;
+	}
+
+	/*
+	 * The connection request may occur while the
+	 * previous connection exist, e.g. in case of failover.
+	 * But as connections may be initiated simultaneously
+	 * by both hosts, we have a random backoff mechanism -
+	 * see the comment above rds_queue_reconnect()
+	 */
+	mutex_lock(&conn->c_cm_lock);
+	if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) {
+		if (rds_conn_state(conn) == RDS_CONN_UP) {
+			rdsdebug("incoming connect while connecting\n");
+			rds_conn_drop(conn);
+			rds_ib_stats_inc(s_ib_listen_closed_stale);
+		} else
+		if (rds_conn_state(conn) == RDS_CONN_CONNECTING) {
+			/* Wait and see - our connect may still be succeeding */
+			rds_ib_stats_inc(s_ib_connect_raced);
+		}
+		goto out;
+	}
+
+	ic = conn->c_transport_data;
+
+	rds_ib_set_protocol(conn, version);
+	rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit));
+
+	/* If the peer gave us the last packet it saw, process this as if
+	 * we had received a regular ACK. */
+	if (dp->dp_ack_seq)
+		rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL);
+
+	BUG_ON(cm_id->context);
+	BUG_ON(ic->i_cm_id);
+
+	ic->i_cm_id = cm_id;
+	cm_id->context = conn;
+
+	/* We got halfway through setting up the ib_connection, if we
+	 * fail now, we have to take the long route out of this mess. */
+	destroy = 0;
+
+	err = rds_ib_setup_qp(conn);
+	if (err) {
+		rds_ib_conn_error(conn, "rds_ib_setup_qp failed (%d)\n", err);
+		goto out;
+	}
+
+	rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version,
+		event->param.conn.responder_resources,
+		event->param.conn.initiator_depth);
+
+	/* rdma_accept() calls rdma_reject() internally if it fails */
+	err = rdma_accept(cm_id, &conn_param);
+	if (err)
+		rds_ib_conn_error(conn, "rdma_accept failed (%d)\n", err);
+
+out:
+	if (conn)
+		mutex_unlock(&conn->c_cm_lock);
+	if (err)
+		rdma_reject(cm_id, NULL, 0);
+	return destroy;
+}
+
+
+int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id)
+{
+	struct rds_connection *conn = cm_id->context;
+	struct rds_ib_connection *ic = conn->c_transport_data;
+	struct rdma_conn_param conn_param;
+	struct rds_ib_connect_private dp;
+	int ret;
+
+	/* If the peer doesn't do protocol negotiation, we must
+	 * default to RDSv3.0 */
+	rds_ib_set_protocol(conn, RDS_PROTOCOL_3_0);
+	ic->i_flowctl = rds_ib_sysctl_flow_control;	/* advertise flow control */
+
+	ret = rds_ib_setup_qp(conn);
+	if (ret) {
+		rds_ib_conn_error(conn, "rds_ib_setup_qp failed (%d)\n", ret);
+		goto out;
+	}
+
+	rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION,
+		UINT_MAX, UINT_MAX);
+	ret = rdma_connect(cm_id, &conn_param);
+	if (ret)
+		rds_ib_conn_error(conn, "rdma_connect failed (%d)\n", ret);
+
+out:
+	/* Beware - returning non-zero tells the rdma_cm to destroy
+	 * the cm_id. We should certainly not do it as long as we still
+	 * "own" the cm_id. */
+	if (ret) {
+		if (ic->i_cm_id == cm_id)
+			ret = 0;
+	}
+	return ret;
+}
+
+int rds_ib_conn_connect(struct rds_connection *conn)
+{
+	struct rds_ib_connection *ic = conn->c_transport_data;
+	struct sockaddr_in src, dest;
+	int ret;
+
+	/* XXX I wonder what affect the port space has */
+	/* delegate cm event handler to rdma_transport */
+	ic->i_cm_id = rdma_create_id(rds_rdma_cm_event_handler, conn,
+				     RDMA_PS_TCP, IB_QPT_RC);
+	if (IS_ERR(ic->i_cm_id)) {
+		ret = PTR_ERR(ic->i_cm_id);
+		ic->i_cm_id = NULL;
+		rdsdebug("rdma_create_id() failed: %d\n", ret);
+		goto out;
+	}
+
+	rdsdebug("created cm id %p for conn %p\n", ic->i_cm_id, conn);
+
+	src.sin_family = AF_INET;
+	src.sin_addr.s_addr = (__force u32)conn->c_laddr;
+	src.sin_port = (__force u16)htons(0);
+
+	dest.sin_family = AF_INET;
+	dest.sin_addr.s_addr = (__force u32)conn->c_faddr;
+	dest.sin_port = (__force u16)htons(RDS_PORT);
+
+	ret = rdma_resolve_addr(ic->i_cm_id, (struct sockaddr *)&src,
+				(struct sockaddr *)&dest,
+				RDS_RDMA_RESOLVE_TIMEOUT_MS);
+	if (ret) {
+		rdsdebug("addr resolve failed for cm id %p: %d\n", ic->i_cm_id,
+			 ret);
+		rdma_destroy_id(ic->i_cm_id);
+		ic->i_cm_id = NULL;
+	}
+
+out:
+	return ret;
+}
+
+/*
+ * This is so careful about only cleaning up resources that were built up
+ * so that it can be called at any point during startup.  In fact it
+ * can be called multiple times for a given connection.
+ */
+void rds_ib_conn_shutdown(struct rds_connection *conn)
+{
+	struct rds_ib_connection *ic = conn->c_transport_data;
+	int err = 0;
+
+	rdsdebug("cm %p pd %p cq %p %p qp %p\n", ic->i_cm_id,
+		 ic->i_pd, ic->i_send_cq, ic->i_recv_cq,
+		 ic->i_cm_id ? ic->i_cm_id->qp : NULL);
+
+	if (ic->i_cm_id) {
+		struct ib_device *dev = ic->i_cm_id->device;
+
+		rdsdebug("disconnecting cm %p\n", ic->i_cm_id);
+		err = rdma_disconnect(ic->i_cm_id);
+		if (err) {
+			/* Actually this may happen quite frequently, when
+			 * an outgoing connect raced with an incoming connect.
+			 */
+			rdsdebug("failed to disconnect, cm: %p err %d\n",
+				ic->i_cm_id, err);
+		}
+
+		/*
+		 * We want to wait for tx and rx completion to finish
+		 * before we tear down the connection, but we have to be
+		 * careful not to get stuck waiting on a send ring that
+		 * only has unsignaled sends in it.  We've shutdown new
+		 * sends before getting here so by waiting for signaled
+		 * sends to complete we're ensured that there will be no
+		 * more tx processing.
+		 */
+		wait_event(rds_ib_ring_empty_wait,
+			   rds_ib_ring_empty(&ic->i_recv_ring) &&
+			   (atomic_read(&ic->i_signaled_sends) == 0));
+		tasklet_kill(&ic->i_recv_tasklet);
+
+		if (ic->i_send_hdrs)
+			ib_dma_free_coherent(dev,
+					   ic->i_send_ring.w_nr *
+						sizeof(struct rds_header),
+					   ic->i_send_hdrs,
+					   ic->i_send_hdrs_dma);
+
+		if (ic->i_recv_hdrs)
+			ib_dma_free_coherent(dev,
+					   ic->i_recv_ring.w_nr *
+						sizeof(struct rds_header),
+					   ic->i_recv_hdrs,
+					   ic->i_recv_hdrs_dma);
+
+		if (ic->i_ack)
+			ib_dma_free_coherent(dev, sizeof(struct rds_header),
+					     ic->i_ack, ic->i_ack_dma);
+
+		if (ic->i_sends)
+			rds_ib_send_clear_ring(ic);
+		if (ic->i_recvs)
+			rds_ib_recv_clear_ring(ic);
+
+		if (ic->i_cm_id->qp)
+			rdma_destroy_qp(ic->i_cm_id);
+		if (ic->i_send_cq)
+			ib_destroy_cq(ic->i_send_cq);
+		if (ic->i_recv_cq)
+			ib_destroy_cq(ic->i_recv_cq);
+		rdma_destroy_id(ic->i_cm_id);
+
+		/*
+		 * Move connection back to the nodev list.
+		 */
+		if (ic->rds_ibdev)
+			rds_ib_remove_conn(ic->rds_ibdev, conn);
+
+		ic->i_cm_id = NULL;
+		ic->i_pd = NULL;
+		ic->i_mr = NULL;
+		ic->i_send_cq = NULL;
+		ic->i_recv_cq = NULL;
+		ic->i_send_hdrs = NULL;
+		ic->i_recv_hdrs = NULL;
+		ic->i_ack = NULL;
+	}
+	BUG_ON(ic->rds_ibdev);
+
+	/* Clear pending transmit */
+	if (ic->i_data_op) {
+		struct rds_message *rm;
+
+		rm = container_of(ic->i_data_op, struct rds_message, data);
+		rds_message_put(rm);
+		ic->i_data_op = NULL;
+	}
+
+	/* Clear the ACK state */
+	clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
+#ifdef KERNEL_HAS_ATOMIC64
+	atomic64_set(&ic->i_ack_next, 0);
+#else
+	ic->i_ack_next = 0;
+#endif
+	ic->i_ack_recv = 0;
+
+	/* Clear flow control state */
+	ic->i_flowctl = 0;
+	atomic_set(&ic->i_credits, 0);
+
+	rds_ib_ring_init(&ic->i_send_ring, rds_ib_sysctl_max_send_wr);
+	rds_ib_ring_init(&ic->i_recv_ring, rds_ib_sysctl_max_recv_wr);
+
+	if (ic->i_ibinc) {
+		rds_inc_put(&ic->i_ibinc->ii_inc);
+		ic->i_ibinc = NULL;
+	}
+
+	vfree(ic->i_sends);
+	ic->i_sends = NULL;
+	vfree(ic->i_recvs);
+	ic->i_recvs = NULL;
+}
+
+int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp)
+{
+	struct rds_ib_connection *ic;
+	unsigned long flags;
+	int ret;
+
+	/* XXX too lazy? */
+	ic = kzalloc(sizeof(struct rds_ib_connection), gfp);
+	if (!ic)
+		return -ENOMEM;
+
+	ret = rds_ib_recv_alloc_caches(ic);
+	if (ret) {
+		kfree(ic);
+		return ret;
+	}
+
+	INIT_LIST_HEAD(&ic->ib_node);
+	tasklet_init(&ic->i_recv_tasklet, rds_ib_recv_tasklet_fn,
+		     (unsigned long) ic);
+	mutex_init(&ic->i_recv_mutex);
+#ifndef KERNEL_HAS_ATOMIC64
+	spin_lock_init(&ic->i_ack_lock);
+#endif
+	atomic_set(&ic->i_signaled_sends, 0);
+
+	/*
+	 * rds_ib_conn_shutdown() waits for these to be emptied so they
+	 * must be initialized before it can be called.
+	 */
+	rds_ib_ring_init(&ic->i_send_ring, rds_ib_sysctl_max_send_wr);
+	rds_ib_ring_init(&ic->i_recv_ring, rds_ib_sysctl_max_recv_wr);
+
+	ic->conn = conn;
+	conn->c_transport_data = ic;
+
+	spin_lock_irqsave(&ib_nodev_conns_lock, flags);
+	list_add_tail(&ic->ib_node, &ib_nodev_conns);
+	spin_unlock_irqrestore(&ib_nodev_conns_lock, flags);
+
+
+	rdsdebug("conn %p conn ic %p\n", conn, conn->c_transport_data);
+	return 0;
+}
+
+/*
+ * Free a connection. Connection must be shut down and not set for reconnect.
+ */
+void rds_ib_conn_free(void *arg)
+{
+	struct rds_ib_connection *ic = arg;
+	spinlock_t	*lock_ptr;
+
+	rdsdebug("ic %p\n", ic);
+
+	/*
+	 * Conn is either on a dev's list or on the nodev list.
+	 * A race with shutdown() or connect() would cause problems
+	 * (since rds_ibdev would change) but that should never happen.
+	 */
+	lock_ptr = ic->rds_ibdev ? &ic->rds_ibdev->spinlock : &ib_nodev_conns_lock;
+
+	spin_lock_irq(lock_ptr);
+	list_del(&ic->ib_node);
+	spin_unlock_irq(lock_ptr);
+
+	rds_ib_recv_free_caches(ic);
+
+	kfree(ic);
+}
+
+
+/*
+ * An error occurred on the connection
+ */
+void
+__rds_ib_conn_error(struct rds_connection *conn, const char *fmt, ...)
+{
+	va_list ap;
+
+	rds_conn_drop(conn);
+
+	va_start(ap, fmt);
+	vprintk(fmt, ap);
+	va_end(ap);
+}
diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
new file mode 100644
index 0000000..273b8bf
--- /dev/null
+++ b/net/rds/ib_rdma.c
@@ -0,0 +1,784 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/rculist.h>
+#include <linux/llist.h>
+
+#include "rds.h"
+#include "ib.h"
+
+static DEFINE_PER_CPU(unsigned long, clean_list_grace);
+#define CLEAN_LIST_BUSY_BIT 0
+
+/*
+ * This is stored as mr->r_trans_private.
+ */
+struct rds_ib_mr {
+	struct rds_ib_device	*device;
+	struct rds_ib_mr_pool	*pool;
+	struct ib_fmr		*fmr;
+
+	struct llist_node	llnode;
+
+	/* unmap_list is for freeing */
+	struct list_head	unmap_list;
+	unsigned int		remap_count;
+
+	struct scatterlist	*sg;
+	unsigned int		sg_len;
+	u64			*dma;
+	int			sg_dma_len;
+};
+
+/*
+ * Our own little FMR pool
+ */
+struct rds_ib_mr_pool {
+	struct mutex		flush_lock;		/* serialize fmr invalidate */
+	struct delayed_work	flush_worker;		/* flush worker */
+
+	atomic_t		item_count;		/* total # of MRs */
+	atomic_t		dirty_count;		/* # dirty of MRs */
+
+	struct llist_head	drop_list;		/* MRs that have reached their max_maps limit */
+	struct llist_head	free_list;		/* unused MRs */
+	struct llist_head	clean_list;		/* global unused & unamapped MRs */
+	wait_queue_head_t	flush_wait;
+
+	atomic_t		free_pinned;		/* memory pinned by free MRs */
+	unsigned long		max_items;
+	unsigned long		max_items_soft;
+	unsigned long		max_free_pinned;
+	struct ib_fmr_attr	fmr_attr;
+};
+
+static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all, struct rds_ib_mr **);
+static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr);
+static void rds_ib_mr_pool_flush_worker(struct work_struct *work);
+
+static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr)
+{
+	struct rds_ib_device *rds_ibdev;
+	struct rds_ib_ipaddr *i_ipaddr;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(rds_ibdev, &rds_ib_devices, list) {
+		list_for_each_entry_rcu(i_ipaddr, &rds_ibdev->ipaddr_list, list) {
+			if (i_ipaddr->ipaddr == ipaddr) {
+				atomic_inc(&rds_ibdev->refcount);
+				rcu_read_unlock();
+				return rds_ibdev;
+			}
+		}
+	}
+	rcu_read_unlock();
+
+	return NULL;
+}
+
+static int rds_ib_add_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
+{
+	struct rds_ib_ipaddr *i_ipaddr;
+
+	i_ipaddr = kmalloc(sizeof *i_ipaddr, GFP_KERNEL);
+	if (!i_ipaddr)
+		return -ENOMEM;
+
+	i_ipaddr->ipaddr = ipaddr;
+
+	spin_lock_irq(&rds_ibdev->spinlock);
+	list_add_tail_rcu(&i_ipaddr->list, &rds_ibdev->ipaddr_list);
+	spin_unlock_irq(&rds_ibdev->spinlock);
+
+	return 0;
+}
+
+static void rds_ib_remove_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
+{
+	struct rds_ib_ipaddr *i_ipaddr;
+	struct rds_ib_ipaddr *to_free = NULL;
+
+
+	spin_lock_irq(&rds_ibdev->spinlock);
+	list_for_each_entry_rcu(i_ipaddr, &rds_ibdev->ipaddr_list, list) {
+		if (i_ipaddr->ipaddr == ipaddr) {
+			list_del_rcu(&i_ipaddr->list);
+			to_free = i_ipaddr;
+			break;
+		}
+	}
+	spin_unlock_irq(&rds_ibdev->spinlock);
+
+	if (to_free) {
+		synchronize_rcu();
+		kfree(to_free);
+	}
+}
+
+int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
+{
+	struct rds_ib_device *rds_ibdev_old;
+
+	rds_ibdev_old = rds_ib_get_device(ipaddr);
+	if (rds_ibdev_old) {
+		rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr);
+		rds_ib_dev_put(rds_ibdev_old);
+	}
+
+	return rds_ib_add_ipaddr(rds_ibdev, ipaddr);
+}
+
+void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn)
+{
+	struct rds_ib_connection *ic = conn->c_transport_data;
+
+	/* conn was previously on the nodev_conns_list */
+	spin_lock_irq(&ib_nodev_conns_lock);
+	BUG_ON(list_empty(&ib_nodev_conns));
+	BUG_ON(list_empty(&ic->ib_node));
+	list_del(&ic->ib_node);
+
+	spin_lock(&rds_ibdev->spinlock);
+	list_add_tail(&ic->ib_node, &rds_ibdev->conn_list);
+	spin_unlock(&rds_ibdev->spinlock);
+	spin_unlock_irq(&ib_nodev_conns_lock);
+
+	ic->rds_ibdev = rds_ibdev;
+	atomic_inc(&rds_ibdev->refcount);
+}
+
+void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn)
+{
+	struct rds_ib_connection *ic = conn->c_transport_data;
+
+	/* place conn on nodev_conns_list */
+	spin_lock(&ib_nodev_conns_lock);
+
+	spin_lock_irq(&rds_ibdev->spinlock);
+	BUG_ON(list_empty(&ic->ib_node));
+	list_del(&ic->ib_node);
+	spin_unlock_irq(&rds_ibdev->spinlock);
+
+	list_add_tail(&ic->ib_node, &ib_nodev_conns);
+
+	spin_unlock(&ib_nodev_conns_lock);
+
+	ic->rds_ibdev = NULL;
+	rds_ib_dev_put(rds_ibdev);
+}
+
+void rds_ib_destroy_nodev_conns(void)
+{
+	struct rds_ib_connection *ic, *_ic;
+	LIST_HEAD(tmp_list);
+
+	/* avoid calling conn_destroy with irqs off */
+	spin_lock_irq(&ib_nodev_conns_lock);
+	list_splice(&ib_nodev_conns, &tmp_list);
+	spin_unlock_irq(&ib_nodev_conns_lock);
+
+	list_for_each_entry_safe(ic, _ic, &tmp_list, ib_node)
+		rds_conn_destroy(ic->conn);
+}
+
+struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev)
+{
+	struct rds_ib_mr_pool *pool;
+
+	pool = kzalloc(sizeof(*pool), GFP_KERNEL);
+	if (!pool)
+		return ERR_PTR(-ENOMEM);
+
+	init_llist_head(&pool->free_list);
+	init_llist_head(&pool->drop_list);
+	init_llist_head(&pool->clean_list);
+	mutex_init(&pool->flush_lock);
+	init_waitqueue_head(&pool->flush_wait);
+	INIT_DELAYED_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker);
+
+	pool->fmr_attr.max_pages = fmr_message_size;
+	pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps;
+	pool->fmr_attr.page_shift = PAGE_SHIFT;
+	pool->max_free_pinned = rds_ibdev->max_fmrs * fmr_message_size / 4;
+
+	/* We never allow more than max_items MRs to be allocated.
+	 * When we exceed more than max_items_soft, we start freeing
+	 * items more aggressively.
+	 * Make sure that max_items > max_items_soft > max_items / 2
+	 */
+	pool->max_items_soft = rds_ibdev->max_fmrs * 3 / 4;
+	pool->max_items = rds_ibdev->max_fmrs;
+
+	return pool;
+}
+
+void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo)
+{
+	struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
+
+	iinfo->rdma_mr_max = pool->max_items;
+	iinfo->rdma_mr_size = pool->fmr_attr.max_pages;
+}
+
+void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool)
+{
+	cancel_delayed_work_sync(&pool->flush_worker);
+	rds_ib_flush_mr_pool(pool, 1, NULL);
+	WARN_ON(atomic_read(&pool->item_count));
+	WARN_ON(atomic_read(&pool->free_pinned));
+	kfree(pool);
+}
+
+static inline struct rds_ib_mr *rds_ib_reuse_fmr(struct rds_ib_mr_pool *pool)
+{
+	struct rds_ib_mr *ibmr = NULL;
+	struct llist_node *ret;
+	unsigned long *flag;
+
+	preempt_disable();
+	flag = this_cpu_ptr(&clean_list_grace);
+	set_bit(CLEAN_LIST_BUSY_BIT, flag);
+	ret = llist_del_first(&pool->clean_list);
+	if (ret)
+		ibmr = llist_entry(ret, struct rds_ib_mr, llnode);
+
+	clear_bit(CLEAN_LIST_BUSY_BIT, flag);
+	preempt_enable();
+	return ibmr;
+}
+
+static inline void wait_clean_list_grace(void)
+{
+	int cpu;
+	unsigned long *flag;
+
+	for_each_online_cpu(cpu) {
+		flag = &per_cpu(clean_list_grace, cpu);
+		while (test_bit(CLEAN_LIST_BUSY_BIT, flag))
+			cpu_relax();
+	}
+}
+
+static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev)
+{
+	struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
+	struct rds_ib_mr *ibmr = NULL;
+	int err = 0, iter = 0;
+
+	if (atomic_read(&pool->dirty_count) >= pool->max_items / 10)
+		schedule_delayed_work(&pool->flush_worker, 10);
+
+	while (1) {
+		ibmr = rds_ib_reuse_fmr(pool);
+		if (ibmr)
+			return ibmr;
+
+		/* No clean MRs - now we have the choice of either
+		 * allocating a fresh MR up to the limit imposed by the
+		 * driver, or flush any dirty unused MRs.
+		 * We try to avoid stalling in the send path if possible,
+		 * so we allocate as long as we're allowed to.
+		 *
+		 * We're fussy with enforcing the FMR limit, though. If the driver
+		 * tells us we can't use more than N fmrs, we shouldn't start
+		 * arguing with it */
+		if (atomic_inc_return(&pool->item_count) <= pool->max_items)
+			break;
+
+		atomic_dec(&pool->item_count);
+
+		if (++iter > 2) {
+			rds_ib_stats_inc(s_ib_rdma_mr_pool_depleted);
+			return ERR_PTR(-EAGAIN);
+		}
+
+		/* We do have some empty MRs. Flush them out. */
+		rds_ib_stats_inc(s_ib_rdma_mr_pool_wait);
+		rds_ib_flush_mr_pool(pool, 0, &ibmr);
+		if (ibmr)
+			return ibmr;
+	}
+
+	ibmr = kzalloc_node(sizeof(*ibmr), GFP_KERNEL, rdsibdev_to_node(rds_ibdev));
+	if (!ibmr) {
+		err = -ENOMEM;
+		goto out_no_cigar;
+	}
+
+	memset(ibmr, 0, sizeof(*ibmr));
+
+	ibmr->fmr = ib_alloc_fmr(rds_ibdev->pd,
+			(IB_ACCESS_LOCAL_WRITE |
+			 IB_ACCESS_REMOTE_READ |
+			 IB_ACCESS_REMOTE_WRITE|
+			 IB_ACCESS_REMOTE_ATOMIC),
+			&pool->fmr_attr);
+	if (IS_ERR(ibmr->fmr)) {
+		err = PTR_ERR(ibmr->fmr);
+		ibmr->fmr = NULL;
+		printk(KERN_WARNING "RDS/IB: ib_alloc_fmr failed (err=%d)\n", err);
+		goto out_no_cigar;
+	}
+
+	rds_ib_stats_inc(s_ib_rdma_mr_alloc);
+	return ibmr;
+
+out_no_cigar:
+	if (ibmr) {
+		if (ibmr->fmr)
+			ib_dealloc_fmr(ibmr->fmr);
+		kfree(ibmr);
+	}
+	atomic_dec(&pool->item_count);
+	return ERR_PTR(err);
+}
+
+static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibmr,
+	       struct scatterlist *sg, unsigned int nents)
+{
+	struct ib_device *dev = rds_ibdev->dev;
+	struct scatterlist *scat = sg;
+	u64 io_addr = 0;
+	u64 *dma_pages;
+	u32 len;
+	int page_cnt, sg_dma_len;
+	int i, j;
+	int ret;
+
+	sg_dma_len = ib_dma_map_sg(dev, sg, nents,
+				 DMA_BIDIRECTIONAL);
+	if (unlikely(!sg_dma_len)) {
+		printk(KERN_WARNING "RDS/IB: dma_map_sg failed!\n");
+		return -EBUSY;
+	}
+
+	len = 0;
+	page_cnt = 0;
+
+	for (i = 0; i < sg_dma_len; ++i) {
+		unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]);
+		u64 dma_addr = ib_sg_dma_address(dev, &scat[i]);
+
+		if (dma_addr & ~PAGE_MASK) {
+			if (i > 0)
+				return -EINVAL;
+			else
+				++page_cnt;
+		}
+		if ((dma_addr + dma_len) & ~PAGE_MASK) {
+			if (i < sg_dma_len - 1)
+				return -EINVAL;
+			else
+				++page_cnt;
+		}
+
+		len += dma_len;
+	}
+
+	page_cnt += len >> PAGE_SHIFT;
+	if (page_cnt > fmr_message_size)
+		return -EINVAL;
+
+	dma_pages = kmalloc_node(sizeof(u64) * page_cnt, GFP_ATOMIC,
+				 rdsibdev_to_node(rds_ibdev));
+	if (!dma_pages)
+		return -ENOMEM;
+
+	page_cnt = 0;
+	for (i = 0; i < sg_dma_len; ++i) {
+		unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]);
+		u64 dma_addr = ib_sg_dma_address(dev, &scat[i]);
+
+		for (j = 0; j < dma_len; j += PAGE_SIZE)
+			dma_pages[page_cnt++] =
+				(dma_addr & PAGE_MASK) + j;
+	}
+
+	ret = ib_map_phys_fmr(ibmr->fmr,
+				   dma_pages, page_cnt, io_addr);
+	if (ret)
+		goto out;
+
+	/* Success - we successfully remapped the MR, so we can
+	 * safely tear down the old mapping. */
+	rds_ib_teardown_mr(ibmr);
+
+	ibmr->sg = scat;
+	ibmr->sg_len = nents;
+	ibmr->sg_dma_len = sg_dma_len;
+	ibmr->remap_count++;
+
+	rds_ib_stats_inc(s_ib_rdma_mr_used);
+	ret = 0;
+
+out:
+	kfree(dma_pages);
+
+	return ret;
+}
+
+void rds_ib_sync_mr(void *trans_private, int direction)
+{
+	struct rds_ib_mr *ibmr = trans_private;
+	struct rds_ib_device *rds_ibdev = ibmr->device;
+
+	switch (direction) {
+	case DMA_FROM_DEVICE:
+		ib_dma_sync_sg_for_cpu(rds_ibdev->dev, ibmr->sg,
+			ibmr->sg_dma_len, DMA_BIDIRECTIONAL);
+		break;
+	case DMA_TO_DEVICE:
+		ib_dma_sync_sg_for_device(rds_ibdev->dev, ibmr->sg,
+			ibmr->sg_dma_len, DMA_BIDIRECTIONAL);
+		break;
+	}
+}
+
+static void __rds_ib_teardown_mr(struct rds_ib_mr *ibmr)
+{
+	struct rds_ib_device *rds_ibdev = ibmr->device;
+
+	if (ibmr->sg_dma_len) {
+		ib_dma_unmap_sg(rds_ibdev->dev,
+				ibmr->sg, ibmr->sg_len,
+				DMA_BIDIRECTIONAL);
+		ibmr->sg_dma_len = 0;
+	}
+
+	/* Release the s/g list */
+	if (ibmr->sg_len) {
+		unsigned int i;
+
+		for (i = 0; i < ibmr->sg_len; ++i) {
+			struct page *page = sg_page(&ibmr->sg[i]);
+
+			/* FIXME we need a way to tell a r/w MR
+			 * from a r/o MR */
+			BUG_ON(irqs_disabled());
+			set_page_dirty(page);
+			put_page(page);
+		}
+		kfree(ibmr->sg);
+
+		ibmr->sg = NULL;
+		ibmr->sg_len = 0;
+	}
+}
+
+static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr)
+{
+	unsigned int pinned = ibmr->sg_len;
+
+	__rds_ib_teardown_mr(ibmr);
+	if (pinned) {
+		struct rds_ib_device *rds_ibdev = ibmr->device;
+		struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
+
+		atomic_sub(pinned, &pool->free_pinned);
+	}
+}
+
+static inline unsigned int rds_ib_flush_goal(struct rds_ib_mr_pool *pool, int free_all)
+{
+	unsigned int item_count;
+
+	item_count = atomic_read(&pool->item_count);
+	if (free_all)
+		return item_count;
+
+	return 0;
+}
+
+/*
+ * given an llist of mrs, put them all into the list_head for more processing
+ */
+static void llist_append_to_list(struct llist_head *llist, struct list_head *list)
+{
+	struct rds_ib_mr *ibmr;
+	struct llist_node *node;
+	struct llist_node *next;
+
+	node = llist_del_all(llist);
+	while (node) {
+		next = node->next;
+		ibmr = llist_entry(node, struct rds_ib_mr, llnode);
+		list_add_tail(&ibmr->unmap_list, list);
+		node = next;
+	}
+}
+
+/*
+ * this takes a list head of mrs and turns it into linked llist nodes
+ * of clusters.  Each cluster has linked llist nodes of
+ * MR_CLUSTER_SIZE mrs that are ready for reuse.
+ */
+static void list_to_llist_nodes(struct rds_ib_mr_pool *pool,
+				struct list_head *list,
+				struct llist_node **nodes_head,
+				struct llist_node **nodes_tail)
+{
+	struct rds_ib_mr *ibmr;
+	struct llist_node *cur = NULL;
+	struct llist_node **next = nodes_head;
+
+	list_for_each_entry(ibmr, list, unmap_list) {
+		cur = &ibmr->llnode;
+		*next = cur;
+		next = &cur->next;
+	}
+	*next = NULL;
+	*nodes_tail = cur;
+}
+
+/*
+ * Flush our pool of MRs.
+ * At a minimum, all currently unused MRs are unmapped.
+ * If the number of MRs allocated exceeds the limit, we also try
+ * to free as many MRs as needed to get back to this limit.
+ */
+static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
+			        int free_all, struct rds_ib_mr **ibmr_ret)
+{
+	struct rds_ib_mr *ibmr, *next;
+	struct llist_node *clean_nodes;
+	struct llist_node *clean_tail;
+	LIST_HEAD(unmap_list);
+	LIST_HEAD(fmr_list);
+	unsigned long unpinned = 0;
+	unsigned int nfreed = 0, ncleaned = 0, free_goal;
+	int ret = 0;
+
+	rds_ib_stats_inc(s_ib_rdma_mr_pool_flush);
+
+	if (ibmr_ret) {
+		DEFINE_WAIT(wait);
+		while(!mutex_trylock(&pool->flush_lock)) {
+			ibmr = rds_ib_reuse_fmr(pool);
+			if (ibmr) {
+				*ibmr_ret = ibmr;
+				finish_wait(&pool->flush_wait, &wait);
+				goto out_nolock;
+			}
+
+			prepare_to_wait(&pool->flush_wait, &wait,
+					TASK_UNINTERRUPTIBLE);
+			if (llist_empty(&pool->clean_list))
+				schedule();
+
+			ibmr = rds_ib_reuse_fmr(pool);
+			if (ibmr) {
+				*ibmr_ret = ibmr;
+				finish_wait(&pool->flush_wait, &wait);
+				goto out_nolock;
+			}
+		}
+		finish_wait(&pool->flush_wait, &wait);
+	} else
+		mutex_lock(&pool->flush_lock);
+
+	if (ibmr_ret) {
+		ibmr = rds_ib_reuse_fmr(pool);
+		if (ibmr) {
+			*ibmr_ret = ibmr;
+			goto out;
+		}
+	}
+
+	/* Get the list of all MRs to be dropped. Ordering matters -
+	 * we want to put drop_list ahead of free_list.
+	 */
+	llist_append_to_list(&pool->drop_list, &unmap_list);
+	llist_append_to_list(&pool->free_list, &unmap_list);
+	if (free_all)
+		llist_append_to_list(&pool->clean_list, &unmap_list);
+
+	free_goal = rds_ib_flush_goal(pool, free_all);
+
+	if (list_empty(&unmap_list))
+		goto out;
+
+	/* String all ib_mr's onto one list and hand them to ib_unmap_fmr */
+	list_for_each_entry(ibmr, &unmap_list, unmap_list)
+		list_add(&ibmr->fmr->list, &fmr_list);
+
+	ret = ib_unmap_fmr(&fmr_list);
+	if (ret)
+		printk(KERN_WARNING "RDS/IB: ib_unmap_fmr failed (err=%d)\n", ret);
+
+	/* Now we can destroy the DMA mapping and unpin any pages */
+	list_for_each_entry_safe(ibmr, next, &unmap_list, unmap_list) {
+		unpinned += ibmr->sg_len;
+		__rds_ib_teardown_mr(ibmr);
+		if (nfreed < free_goal || ibmr->remap_count >= pool->fmr_attr.max_maps) {
+			rds_ib_stats_inc(s_ib_rdma_mr_free);
+			list_del(&ibmr->unmap_list);
+			ib_dealloc_fmr(ibmr->fmr);
+			kfree(ibmr);
+			nfreed++;
+		}
+		ncleaned++;
+	}
+
+	if (!list_empty(&unmap_list)) {
+		/* we have to make sure that none of the things we're about
+		 * to put on the clean list would race with other cpus trying
+		 * to pull items off.  The llist would explode if we managed to
+		 * remove something from the clean list and then add it back again
+		 * while another CPU was spinning on that same item in llist_del_first.
+		 *
+		 * This is pretty unlikely, but just in case  wait for an llist grace period
+		 * here before adding anything back into the clean list.
+		 */
+		wait_clean_list_grace();
+
+		list_to_llist_nodes(pool, &unmap_list, &clean_nodes, &clean_tail);
+		if (ibmr_ret)
+			*ibmr_ret = llist_entry(clean_nodes, struct rds_ib_mr, llnode);
+
+		/* more than one entry in llist nodes */
+		if (clean_nodes->next)
+			llist_add_batch(clean_nodes->next, clean_tail, &pool->clean_list);
+
+	}
+
+	atomic_sub(unpinned, &pool->free_pinned);
+	atomic_sub(ncleaned, &pool->dirty_count);
+	atomic_sub(nfreed, &pool->item_count);
+
+out:
+	mutex_unlock(&pool->flush_lock);
+	if (waitqueue_active(&pool->flush_wait))
+		wake_up(&pool->flush_wait);
+out_nolock:
+	return ret;
+}
+
+static void rds_ib_mr_pool_flush_worker(struct work_struct *work)
+{
+	struct rds_ib_mr_pool *pool = container_of(work, struct rds_ib_mr_pool, flush_worker.work);
+
+	rds_ib_flush_mr_pool(pool, 0, NULL);
+}
+
+void rds_ib_free_mr(void *trans_private, int invalidate)
+{
+	struct rds_ib_mr *ibmr = trans_private;
+	struct rds_ib_device *rds_ibdev = ibmr->device;
+	struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
+
+	rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len);
+
+	/* Return it to the pool's free list */
+	if (ibmr->remap_count >= pool->fmr_attr.max_maps)
+		llist_add(&ibmr->llnode, &pool->drop_list);
+	else
+		llist_add(&ibmr->llnode, &pool->free_list);
+
+	atomic_add(ibmr->sg_len, &pool->free_pinned);
+	atomic_inc(&pool->dirty_count);
+
+	/* If we've pinned too many pages, request a flush */
+	if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned ||
+	    atomic_read(&pool->dirty_count) >= pool->max_items / 10)
+		schedule_delayed_work(&pool->flush_worker, 10);
+
+	if (invalidate) {
+		if (likely(!in_interrupt())) {
+			rds_ib_flush_mr_pool(pool, 0, NULL);
+		} else {
+			/* We get here if the user created a MR marked
+			 * as use_once and invalidate at the same time. */
+			schedule_delayed_work(&pool->flush_worker, 10);
+		}
+	}
+
+	rds_ib_dev_put(rds_ibdev);
+}
+
+void rds_ib_flush_mrs(void)
+{
+	struct rds_ib_device *rds_ibdev;
+
+	down_read(&rds_ib_devices_lock);
+	list_for_each_entry(rds_ibdev, &rds_ib_devices, list) {
+		struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
+
+		if (pool)
+			rds_ib_flush_mr_pool(pool, 0, NULL);
+	}
+	up_read(&rds_ib_devices_lock);
+}
+
+void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
+		    struct rds_sock *rs, u32 *key_ret)
+{
+	struct rds_ib_device *rds_ibdev;
+	struct rds_ib_mr *ibmr = NULL;
+	int ret;
+
+	rds_ibdev = rds_ib_get_device(rs->rs_bound_addr);
+	if (!rds_ibdev) {
+		ret = -ENODEV;
+		goto out;
+	}
+
+	if (!rds_ibdev->mr_pool) {
+		ret = -ENODEV;
+		goto out;
+	}
+
+	ibmr = rds_ib_alloc_fmr(rds_ibdev);
+	if (IS_ERR(ibmr))
+		return ibmr;
+
+	ret = rds_ib_map_fmr(rds_ibdev, ibmr, sg, nents);
+	if (ret == 0)
+		*key_ret = ibmr->fmr->rkey;
+	else
+		printk(KERN_WARNING "RDS/IB: map_fmr failed (errno=%d)\n", ret);
+
+	ibmr->device = rds_ibdev;
+	rds_ibdev = NULL;
+
+ out:
+	if (ret) {
+		if (ibmr)
+			rds_ib_free_mr(ibmr, 0);
+		ibmr = ERR_PTR(ret);
+	}
+	if (rds_ibdev)
+		rds_ib_dev_put(rds_ibdev);
+	return ibmr;
+}
+
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
new file mode 100644
index 0000000..d67de45
--- /dev/null
+++ b/net/rds/ib_recv.c
@@ -0,0 +1,1079 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <rdma/rdma_cm.h>
+
+#include "rds.h"
+#include "ib.h"
+
+static struct kmem_cache *rds_ib_incoming_slab;
+static struct kmem_cache *rds_ib_frag_slab;
+static atomic_t	rds_ib_allocation = ATOMIC_INIT(0);
+
+void rds_ib_recv_init_ring(struct rds_ib_connection *ic)
+{
+	struct rds_ib_recv_work *recv;
+	u32 i;
+
+	for (i = 0, recv = ic->i_recvs; i < ic->i_recv_ring.w_nr; i++, recv++) {
+		struct ib_sge *sge;
+
+		recv->r_ibinc = NULL;
+		recv->r_frag = NULL;
+
+		recv->r_wr.next = NULL;
+		recv->r_wr.wr_id = i;
+		recv->r_wr.sg_list = recv->r_sge;
+		recv->r_wr.num_sge = RDS_IB_RECV_SGE;
+
+		sge = &recv->r_sge[0];
+		sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header));
+		sge->length = sizeof(struct rds_header);
+		sge->lkey = ic->i_mr->lkey;
+
+		sge = &recv->r_sge[1];
+		sge->addr = 0;
+		sge->length = RDS_FRAG_SIZE;
+		sge->lkey = ic->i_mr->lkey;
+	}
+}
+
+/*
+ * The entire 'from' list, including the from element itself, is put on
+ * to the tail of the 'to' list.
+ */
+static void list_splice_entire_tail(struct list_head *from,
+				    struct list_head *to)
+{
+	struct list_head *from_last = from->prev;
+
+	list_splice_tail(from_last, to);
+	list_add_tail(from_last, to);
+}
+
+static void rds_ib_cache_xfer_to_ready(struct rds_ib_refill_cache *cache)
+{
+	struct list_head *tmp;
+
+	tmp = xchg(&cache->xfer, NULL);
+	if (tmp) {
+		if (cache->ready)
+			list_splice_entire_tail(tmp, cache->ready);
+		else
+			cache->ready = tmp;
+	}
+}
+
+static int rds_ib_recv_alloc_cache(struct rds_ib_refill_cache *cache)
+{
+	struct rds_ib_cache_head *head;
+	int cpu;
+
+	cache->percpu = alloc_percpu(struct rds_ib_cache_head);
+	if (!cache->percpu)
+	       return -ENOMEM;
+
+	for_each_possible_cpu(cpu) {
+		head = per_cpu_ptr(cache->percpu, cpu);
+		head->first = NULL;
+		head->count = 0;
+	}
+	cache->xfer = NULL;
+	cache->ready = NULL;
+
+	return 0;
+}
+
+int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic)
+{
+	int ret;
+
+	ret = rds_ib_recv_alloc_cache(&ic->i_cache_incs);
+	if (!ret) {
+		ret = rds_ib_recv_alloc_cache(&ic->i_cache_frags);
+		if (ret)
+			free_percpu(ic->i_cache_incs.percpu);
+	}
+
+	return ret;
+}
+
+static void rds_ib_cache_splice_all_lists(struct rds_ib_refill_cache *cache,
+					  struct list_head *caller_list)
+{
+	struct rds_ib_cache_head *head;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		head = per_cpu_ptr(cache->percpu, cpu);
+		if (head->first) {
+			list_splice_entire_tail(head->first, caller_list);
+			head->first = NULL;
+		}
+	}
+
+	if (cache->ready) {
+		list_splice_entire_tail(cache->ready, caller_list);
+		cache->ready = NULL;
+	}
+}
+
+void rds_ib_recv_free_caches(struct rds_ib_connection *ic)
+{
+	struct rds_ib_incoming *inc;
+	struct rds_ib_incoming *inc_tmp;
+	struct rds_page_frag *frag;
+	struct rds_page_frag *frag_tmp;
+	LIST_HEAD(list);
+
+	rds_ib_cache_xfer_to_ready(&ic->i_cache_incs);
+	rds_ib_cache_splice_all_lists(&ic->i_cache_incs, &list);
+	free_percpu(ic->i_cache_incs.percpu);
+
+	list_for_each_entry_safe(inc, inc_tmp, &list, ii_cache_entry) {
+		list_del(&inc->ii_cache_entry);
+		WARN_ON(!list_empty(&inc->ii_frags));
+		kmem_cache_free(rds_ib_incoming_slab, inc);
+	}
+
+	rds_ib_cache_xfer_to_ready(&ic->i_cache_frags);
+	rds_ib_cache_splice_all_lists(&ic->i_cache_frags, &list);
+	free_percpu(ic->i_cache_frags.percpu);
+
+	list_for_each_entry_safe(frag, frag_tmp, &list, f_cache_entry) {
+		list_del(&frag->f_cache_entry);
+		WARN_ON(!list_empty(&frag->f_item));
+		kmem_cache_free(rds_ib_frag_slab, frag);
+	}
+}
+
+/* fwd decl */
+static void rds_ib_recv_cache_put(struct list_head *new_item,
+				  struct rds_ib_refill_cache *cache);
+static struct list_head *rds_ib_recv_cache_get(struct rds_ib_refill_cache *cache);
+
+
+/* Recycle frag and attached recv buffer f_sg */
+static void rds_ib_frag_free(struct rds_ib_connection *ic,
+			     struct rds_page_frag *frag)
+{
+	rdsdebug("frag %p page %p\n", frag, sg_page(&frag->f_sg));
+
+	rds_ib_recv_cache_put(&frag->f_cache_entry, &ic->i_cache_frags);
+}
+
+/* Recycle inc after freeing attached frags */
+void rds_ib_inc_free(struct rds_incoming *inc)
+{
+	struct rds_ib_incoming *ibinc;
+	struct rds_page_frag *frag;
+	struct rds_page_frag *pos;
+	struct rds_ib_connection *ic = inc->i_conn->c_transport_data;
+
+	ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);
+
+	/* Free attached frags */
+	list_for_each_entry_safe(frag, pos, &ibinc->ii_frags, f_item) {
+		list_del_init(&frag->f_item);
+		rds_ib_frag_free(ic, frag);
+	}
+	BUG_ON(!list_empty(&ibinc->ii_frags));
+
+	rdsdebug("freeing ibinc %p inc %p\n", ibinc, inc);
+	rds_ib_recv_cache_put(&ibinc->ii_cache_entry, &ic->i_cache_incs);
+}
+
+static void rds_ib_recv_clear_one(struct rds_ib_connection *ic,
+				  struct rds_ib_recv_work *recv)
+{
+	if (recv->r_ibinc) {
+		rds_inc_put(&recv->r_ibinc->ii_inc);
+		recv->r_ibinc = NULL;
+	}
+	if (recv->r_frag) {
+		ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1, DMA_FROM_DEVICE);
+		rds_ib_frag_free(ic, recv->r_frag);
+		recv->r_frag = NULL;
+	}
+}
+
+void rds_ib_recv_clear_ring(struct rds_ib_connection *ic)
+{
+	u32 i;
+
+	for (i = 0; i < ic->i_recv_ring.w_nr; i++)
+		rds_ib_recv_clear_one(ic, &ic->i_recvs[i]);
+}
+
+static struct rds_ib_incoming *rds_ib_refill_one_inc(struct rds_ib_connection *ic,
+						     gfp_t slab_mask)
+{
+	struct rds_ib_incoming *ibinc;
+	struct list_head *cache_item;
+	int avail_allocs;
+
+	cache_item = rds_ib_recv_cache_get(&ic->i_cache_incs);
+	if (cache_item) {
+		ibinc = container_of(cache_item, struct rds_ib_incoming, ii_cache_entry);
+	} else {
+		avail_allocs = atomic_add_unless(&rds_ib_allocation,
+						 1, rds_ib_sysctl_max_recv_allocation);
+		if (!avail_allocs) {
+			rds_ib_stats_inc(s_ib_rx_alloc_limit);
+			return NULL;
+		}
+		ibinc = kmem_cache_alloc(rds_ib_incoming_slab, slab_mask);
+		if (!ibinc) {
+			atomic_dec(&rds_ib_allocation);
+			return NULL;
+		}
+	}
+	INIT_LIST_HEAD(&ibinc->ii_frags);
+	rds_inc_init(&ibinc->ii_inc, ic->conn, ic->conn->c_faddr);
+
+	return ibinc;
+}
+
+static struct rds_page_frag *rds_ib_refill_one_frag(struct rds_ib_connection *ic,
+						    gfp_t slab_mask, gfp_t page_mask)
+{
+	struct rds_page_frag *frag;
+	struct list_head *cache_item;
+	int ret;
+
+	cache_item = rds_ib_recv_cache_get(&ic->i_cache_frags);
+	if (cache_item) {
+		frag = container_of(cache_item, struct rds_page_frag, f_cache_entry);
+	} else {
+		frag = kmem_cache_alloc(rds_ib_frag_slab, slab_mask);
+		if (!frag)
+			return NULL;
+
+		sg_init_table(&frag->f_sg, 1);
+		ret = rds_page_remainder_alloc(&frag->f_sg,
+					       RDS_FRAG_SIZE, page_mask);
+		if (ret) {
+			kmem_cache_free(rds_ib_frag_slab, frag);
+			return NULL;
+		}
+	}
+
+	INIT_LIST_HEAD(&frag->f_item);
+
+	return frag;
+}
+
+static int rds_ib_recv_refill_one(struct rds_connection *conn,
+				  struct rds_ib_recv_work *recv, int prefill)
+{
+	struct rds_ib_connection *ic = conn->c_transport_data;
+	struct ib_sge *sge;
+	int ret = -ENOMEM;
+	gfp_t slab_mask = GFP_NOWAIT;
+	gfp_t page_mask = GFP_NOWAIT;
+
+	if (prefill) {
+		slab_mask = GFP_KERNEL;
+		page_mask = GFP_HIGHUSER;
+	}
+
+	if (!ic->i_cache_incs.ready)
+		rds_ib_cache_xfer_to_ready(&ic->i_cache_incs);
+	if (!ic->i_cache_frags.ready)
+		rds_ib_cache_xfer_to_ready(&ic->i_cache_frags);
+
+	/*
+	 * ibinc was taken from recv if recv contained the start of a message.
+	 * recvs that were continuations will still have this allocated.
+	 */
+	if (!recv->r_ibinc) {
+		recv->r_ibinc = rds_ib_refill_one_inc(ic, slab_mask);
+		if (!recv->r_ibinc)
+			goto out;
+	}
+
+	WARN_ON(recv->r_frag); /* leak! */
+	recv->r_frag = rds_ib_refill_one_frag(ic, slab_mask, page_mask);
+	if (!recv->r_frag)
+		goto out;
+
+	ret = ib_dma_map_sg(ic->i_cm_id->device, &recv->r_frag->f_sg,
+			    1, DMA_FROM_DEVICE);
+	WARN_ON(ret != 1);
+
+	sge = &recv->r_sge[0];
+	sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header);
+	sge->length = sizeof(struct rds_header);
+
+	sge = &recv->r_sge[1];
+	sge->addr = ib_sg_dma_address(ic->i_cm_id->device, &recv->r_frag->f_sg);
+	sge->length = ib_sg_dma_len(ic->i_cm_id->device, &recv->r_frag->f_sg);
+
+	ret = 0;
+out:
+	return ret;
+}
+
+/*
+ * This tries to allocate and post unused work requests after making sure that
+ * they have all the allocations they need to queue received fragments into
+ * sockets.
+ *
+ * -1 is returned if posting fails due to temporary resource exhaustion.
+ */
+void rds_ib_recv_refill(struct rds_connection *conn, int prefill)
+{
+	struct rds_ib_connection *ic = conn->c_transport_data;
+	struct rds_ib_recv_work *recv;
+	struct ib_recv_wr *failed_wr;
+	unsigned int posted = 0;
+	int ret = 0;
+	u32 pos;
+
+	while ((prefill || rds_conn_up(conn)) &&
+	       rds_ib_ring_alloc(&ic->i_recv_ring, 1, &pos)) {
+		if (pos >= ic->i_recv_ring.w_nr) {
+			printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n",
+					pos);
+			break;
+		}
+
+		recv = &ic->i_recvs[pos];
+		ret = rds_ib_recv_refill_one(conn, recv, prefill);
+		if (ret) {
+			break;
+		}
+
+		/* XXX when can this fail? */
+		ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr);
+		rdsdebug("recv %p ibinc %p page %p addr %lu ret %d\n", recv,
+			 recv->r_ibinc, sg_page(&recv->r_frag->f_sg),
+			 (long) ib_sg_dma_address(
+				ic->i_cm_id->device,
+				&recv->r_frag->f_sg),
+			ret);
+		if (ret) {
+			rds_ib_conn_error(conn, "recv post on "
+			       "%pI4 returned %d, disconnecting and "
+			       "reconnecting\n", &conn->c_faddr,
+			       ret);
+			break;
+		}
+
+		posted++;
+	}
+
+	/* We're doing flow control - update the window. */
+	if (ic->i_flowctl && posted)
+		rds_ib_advertise_credits(conn, posted);
+
+	if (ret)
+		rds_ib_ring_unalloc(&ic->i_recv_ring, 1);
+}
+
+/*
+ * We want to recycle several types of recv allocations, like incs and frags.
+ * To use this, the *_free() function passes in the ptr to a list_head within
+ * the recyclee, as well as the cache to put it on.
+ *
+ * First, we put the memory on a percpu list. When this reaches a certain size,
+ * We move it to an intermediate non-percpu list in a lockless manner, with some
+ * xchg/compxchg wizardry.
+ *
+ * N.B. Instead of a list_head as the anchor, we use a single pointer, which can
+ * be NULL and xchg'd. The list is actually empty when the pointer is NULL, and
+ * list_empty() will return true with one element is actually present.
+ */
+static void rds_ib_recv_cache_put(struct list_head *new_item,
+				 struct rds_ib_refill_cache *cache)
+{
+	unsigned long flags;
+	struct list_head *old, *chpfirst;
+
+	local_irq_save(flags);
+
+	chpfirst = __this_cpu_read(cache->percpu->first);
+	if (!chpfirst)
+		INIT_LIST_HEAD(new_item);
+	else /* put on front */
+		list_add_tail(new_item, chpfirst);
+
+	__this_cpu_write(cache->percpu->first, new_item);
+	__this_cpu_inc(cache->percpu->count);
+
+	if (__this_cpu_read(cache->percpu->count) < RDS_IB_RECYCLE_BATCH_COUNT)
+		goto end;
+
+	/*
+	 * Return our per-cpu first list to the cache's xfer by atomically
+	 * grabbing the current xfer list, appending it to our per-cpu list,
+	 * and then atomically returning that entire list back to the
+	 * cache's xfer list as long as it's still empty.
+	 */
+	do {
+		old = xchg(&cache->xfer, NULL);
+		if (old)
+			list_splice_entire_tail(old, chpfirst);
+		old = cmpxchg(&cache->xfer, NULL, chpfirst);
+	} while (old);
+
+
+	__this_cpu_write(cache->percpu->first, NULL);
+	__this_cpu_write(cache->percpu->count, 0);
+end:
+	local_irq_restore(flags);
+}
+
+static struct list_head *rds_ib_recv_cache_get(struct rds_ib_refill_cache *cache)
+{
+	struct list_head *head = cache->ready;
+
+	if (head) {
+		if (!list_empty(head)) {
+			cache->ready = head->next;
+			list_del_init(head);
+		} else
+			cache->ready = NULL;
+	}
+
+	return head;
+}
+
+int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov,
+			    size_t size)
+{
+	struct rds_ib_incoming *ibinc;
+	struct rds_page_frag *frag;
+	struct iovec *iov = first_iov;
+	unsigned long to_copy;
+	unsigned long frag_off = 0;
+	unsigned long iov_off = 0;
+	int copied = 0;
+	int ret;
+	u32 len;
+
+	ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);
+	frag = list_entry(ibinc->ii_frags.next, struct rds_page_frag, f_item);
+	len = be32_to_cpu(inc->i_hdr.h_len);
+
+	while (copied < size && copied < len) {
+		if (frag_off == RDS_FRAG_SIZE) {
+			frag = list_entry(frag->f_item.next,
+					  struct rds_page_frag, f_item);
+			frag_off = 0;
+		}
+		while (iov_off == iov->iov_len) {
+			iov_off = 0;
+			iov++;
+		}
+
+		to_copy = min(iov->iov_len - iov_off, RDS_FRAG_SIZE - frag_off);
+		to_copy = min_t(size_t, to_copy, size - copied);
+		to_copy = min_t(unsigned long, to_copy, len - copied);
+
+		rdsdebug("%lu bytes to user [%p, %zu] + %lu from frag "
+			 "[%p, %u] + %lu\n",
+			 to_copy, iov->iov_base, iov->iov_len, iov_off,
+			 sg_page(&frag->f_sg), frag->f_sg.offset, frag_off);
+
+		/* XXX needs + offset for multiple recvs per page */
+		ret = rds_page_copy_to_user(sg_page(&frag->f_sg),
+					    frag->f_sg.offset + frag_off,
+					    iov->iov_base + iov_off,
+					    to_copy);
+		if (ret) {
+			copied = ret;
+			break;
+		}
+
+		iov_off += to_copy;
+		frag_off += to_copy;
+		copied += to_copy;
+	}
+
+	return copied;
+}
+
+/* ic starts out kzalloc()ed */
+void rds_ib_recv_init_ack(struct rds_ib_connection *ic)
+{
+	struct ib_send_wr *wr = &ic->i_ack_wr;
+	struct ib_sge *sge = &ic->i_ack_sge;
+
+	sge->addr = ic->i_ack_dma;
+	sge->length = sizeof(struct rds_header);
+	sge->lkey = ic->i_mr->lkey;
+
+	wr->sg_list = sge;
+	wr->num_sge = 1;
+	wr->opcode = IB_WR_SEND;
+	wr->wr_id = RDS_IB_ACK_WR_ID;
+	wr->send_flags = IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+}
+
+/*
+ * You'd think that with reliable IB connections you wouldn't need to ack
+ * messages that have been received.  The problem is that IB hardware generates
+ * an ack message before it has DMAed the message into memory.  This creates a
+ * potential message loss if the HCA is disabled for any reason between when it
+ * sends the ack and before the message is DMAed and processed.  This is only a
+ * potential issue if another HCA is available for fail-over.
+ *
+ * When the remote host receives our ack they'll free the sent message from
+ * their send queue.  To decrease the latency of this we always send an ack
+ * immediately after we've received messages.
+ *
+ * For simplicity, we only have one ack in flight at a time.  This puts
+ * pressure on senders to have deep enough send queues to absorb the latency of
+ * a single ack frame being in flight.  This might not be good enough.
+ *
+ * This is implemented by have a long-lived send_wr and sge which point to a
+ * statically allocated ack frame.  This ack wr does not fall under the ring
+ * accounting that the tx and rx wrs do.  The QP attribute specifically makes
+ * room for it beyond the ring size.  Send completion notices its special
+ * wr_id and avoids working with the ring in that case.
+ */
+#ifndef KERNEL_HAS_ATOMIC64
+static void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq,
+				int ack_required)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&ic->i_ack_lock, flags);
+	ic->i_ack_next = seq;
+	if (ack_required)
+		set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+	spin_unlock_irqrestore(&ic->i_ack_lock, flags);
+}
+
+static u64 rds_ib_get_ack(struct rds_ib_connection *ic)
+{
+	unsigned long flags;
+	u64 seq;
+
+	clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+
+	spin_lock_irqsave(&ic->i_ack_lock, flags);
+	seq = ic->i_ack_next;
+	spin_unlock_irqrestore(&ic->i_ack_lock, flags);
+
+	return seq;
+}
+#else
+static void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq,
+				int ack_required)
+{
+	atomic64_set(&ic->i_ack_next, seq);
+	if (ack_required) {
+		smp_mb__before_atomic();
+		set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+	}
+}
+
+static u64 rds_ib_get_ack(struct rds_ib_connection *ic)
+{
+	clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+	smp_mb__after_atomic();
+
+	return atomic64_read(&ic->i_ack_next);
+}
+#endif
+
+
+static void rds_ib_send_ack(struct rds_ib_connection *ic, unsigned int adv_credits)
+{
+	struct rds_header *hdr = ic->i_ack;
+	struct ib_send_wr *failed_wr;
+	u64 seq;
+	int ret;
+
+	seq = rds_ib_get_ack(ic);
+
+	rdsdebug("send_ack: ic %p ack %llu\n", ic, (unsigned long long) seq);
+	rds_message_populate_header(hdr, 0, 0, 0);
+	hdr->h_ack = cpu_to_be64(seq);
+	hdr->h_credit = adv_credits;
+	rds_message_make_checksum(hdr);
+	ic->i_ack_queued = jiffies;
+
+	ret = ib_post_send(ic->i_cm_id->qp, &ic->i_ack_wr, &failed_wr);
+	if (unlikely(ret)) {
+		/* Failed to send. Release the WR, and
+		 * force another ACK.
+		 */
+		clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
+		set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+
+		rds_ib_stats_inc(s_ib_ack_send_failure);
+
+		rds_ib_conn_error(ic->conn, "sending ack failed\n");
+	} else
+		rds_ib_stats_inc(s_ib_ack_sent);
+}
+
+/*
+ * There are 3 ways of getting acknowledgements to the peer:
+ *  1.	We call rds_ib_attempt_ack from the recv completion handler
+ *	to send an ACK-only frame.
+ *	However, there can be only one such frame in the send queue
+ *	at any time, so we may have to postpone it.
+ *  2.	When another (data) packet is transmitted while there's
+ *	an ACK in the queue, we piggyback the ACK sequence number
+ *	on the data packet.
+ *  3.	If the ACK WR is done sending, we get called from the
+ *	send queue completion handler, and check whether there's
+ *	another ACK pending (postponed because the WR was on the
+ *	queue). If so, we transmit it.
+ *
+ * We maintain 2 variables:
+ *  -	i_ack_flags, which keeps track of whether the ACK WR
+ *	is currently in the send queue or not (IB_ACK_IN_FLIGHT)
+ *  -	i_ack_next, which is the last sequence number we received
+ *
+ * Potentially, send queue and receive queue handlers can run concurrently.
+ * It would be nice to not have to use a spinlock to synchronize things,
+ * but the one problem that rules this out is that 64bit updates are
+ * not atomic on all platforms. Things would be a lot simpler if
+ * we had atomic64 or maybe cmpxchg64 everywhere.
+ *
+ * Reconnecting complicates this picture just slightly. When we
+ * reconnect, we may be seeing duplicate packets. The peer
+ * is retransmitting them, because it hasn't seen an ACK for
+ * them. It is important that we ACK these.
+ *
+ * ACK mitigation adds a header flag "ACK_REQUIRED"; any packet with
+ * this flag set *MUST* be acknowledged immediately.
+ */
+
+/*
+ * When we get here, we're called from the recv queue handler.
+ * Check whether we ought to transmit an ACK.
+ */
+void rds_ib_attempt_ack(struct rds_ib_connection *ic)
+{
+	unsigned int adv_credits;
+
+	if (!test_bit(IB_ACK_REQUESTED, &ic->i_ack_flags))
+		return;
+
+	if (test_and_set_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags)) {
+		rds_ib_stats_inc(s_ib_ack_send_delayed);
+		return;
+	}
+
+	/* Can we get a send credit? */
+	if (!rds_ib_send_grab_credits(ic, 1, &adv_credits, 0, RDS_MAX_ADV_CREDIT)) {
+		rds_ib_stats_inc(s_ib_tx_throttle);
+		clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
+		return;
+	}
+
+	clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+	rds_ib_send_ack(ic, adv_credits);
+}
+
+/*
+ * We get here from the send completion handler, when the
+ * adapter tells us the ACK frame was sent.
+ */
+void rds_ib_ack_send_complete(struct rds_ib_connection *ic)
+{
+	clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
+	rds_ib_attempt_ack(ic);
+}
+
+/*
+ * This is called by the regular xmit code when it wants to piggyback
+ * an ACK on an outgoing frame.
+ */
+u64 rds_ib_piggyb_ack(struct rds_ib_connection *ic)
+{
+	if (test_and_clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags))
+		rds_ib_stats_inc(s_ib_ack_send_piggybacked);
+	return rds_ib_get_ack(ic);
+}
+
+/*
+ * It's kind of lame that we're copying from the posted receive pages into
+ * long-lived bitmaps.  We could have posted the bitmaps and rdma written into
+ * them.  But receiving new congestion bitmaps should be a *rare* event, so
+ * hopefully we won't need to invest that complexity in making it more
+ * efficient.  By copying we can share a simpler core with TCP which has to
+ * copy.
+ */
+static void rds_ib_cong_recv(struct rds_connection *conn,
+			      struct rds_ib_incoming *ibinc)
+{
+	struct rds_cong_map *map;
+	unsigned int map_off;
+	unsigned int map_page;
+	struct rds_page_frag *frag;
+	unsigned long frag_off;
+	unsigned long to_copy;
+	unsigned long copied;
+	uint64_t uncongested = 0;
+	void *addr;
+
+	/* catch completely corrupt packets */
+	if (be32_to_cpu(ibinc->ii_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES)
+		return;
+
+	map = conn->c_fcong;
+	map_page = 0;
+	map_off = 0;
+
+	frag = list_entry(ibinc->ii_frags.next, struct rds_page_frag, f_item);
+	frag_off = 0;
+
+	copied = 0;
+
+	while (copied < RDS_CONG_MAP_BYTES) {
+		uint64_t *src, *dst;
+		unsigned int k;
+
+		to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off);
+		BUG_ON(to_copy & 7); /* Must be 64bit aligned. */
+
+		addr = kmap_atomic(sg_page(&frag->f_sg));
+
+		src = addr + frag_off;
+		dst = (void *)map->m_page_addrs[map_page] + map_off;
+		for (k = 0; k < to_copy; k += 8) {
+			/* Record ports that became uncongested, ie
+			 * bits that changed from 0 to 1. */
+			uncongested |= ~(*src) & *dst;
+			*dst++ = *src++;
+		}
+		kunmap_atomic(addr);
+
+		copied += to_copy;
+
+		map_off += to_copy;
+		if (map_off == PAGE_SIZE) {
+			map_off = 0;
+			map_page++;
+		}
+
+		frag_off += to_copy;
+		if (frag_off == RDS_FRAG_SIZE) {
+			frag = list_entry(frag->f_item.next,
+					  struct rds_page_frag, f_item);
+			frag_off = 0;
+		}
+	}
+
+	/* the congestion map is in little endian order */
+	uncongested = le64_to_cpu(uncongested);
+
+	rds_cong_map_updated(map, uncongested);
+}
+
+/*
+ * Rings are posted with all the allocations they'll need to queue the
+ * incoming message to the receiving socket so this can't fail.
+ * All fragments start with a header, so we can make sure we're not receiving
+ * garbage, and we can tell a small 8 byte fragment from an ACK frame.
+ */
+struct rds_ib_ack_state {
+	u64		ack_next;
+	u64		ack_recv;
+	unsigned int	ack_required:1;
+	unsigned int	ack_next_valid:1;
+	unsigned int	ack_recv_valid:1;
+};
+
+static void rds_ib_process_recv(struct rds_connection *conn,
+				struct rds_ib_recv_work *recv, u32 data_len,
+				struct rds_ib_ack_state *state)
+{
+	struct rds_ib_connection *ic = conn->c_transport_data;
+	struct rds_ib_incoming *ibinc = ic->i_ibinc;
+	struct rds_header *ihdr, *hdr;
+
+	/* XXX shut down the connection if port 0,0 are seen? */
+
+	rdsdebug("ic %p ibinc %p recv %p byte len %u\n", ic, ibinc, recv,
+		 data_len);
+
+	if (data_len < sizeof(struct rds_header)) {
+		rds_ib_conn_error(conn, "incoming message "
+		       "from %pI4 didn't include a "
+		       "header, disconnecting and "
+		       "reconnecting\n",
+		       &conn->c_faddr);
+		return;
+	}
+	data_len -= sizeof(struct rds_header);
+
+	ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs];
+
+	/* Validate the checksum. */
+	if (!rds_message_verify_checksum(ihdr)) {
+		rds_ib_conn_error(conn, "incoming message "
+		       "from %pI4 has corrupted header - "
+		       "forcing a reconnect\n",
+		       &conn->c_faddr);
+		rds_stats_inc(s_recv_drop_bad_checksum);
+		return;
+	}
+
+	/* Process the ACK sequence which comes with every packet */
+	state->ack_recv = be64_to_cpu(ihdr->h_ack);
+	state->ack_recv_valid = 1;
+
+	/* Process the credits update if there was one */
+	if (ihdr->h_credit)
+		rds_ib_send_add_credits(conn, ihdr->h_credit);
+
+	if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && data_len == 0) {
+		/* This is an ACK-only packet. The fact that it gets
+		 * special treatment here is that historically, ACKs
+		 * were rather special beasts.
+		 */
+		rds_ib_stats_inc(s_ib_ack_received);
+
+		/*
+		 * Usually the frags make their way on to incs and are then freed as
+		 * the inc is freed.  We don't go that route, so we have to drop the
+		 * page ref ourselves.  We can't just leave the page on the recv
+		 * because that confuses the dma mapping of pages and each recv's use
+		 * of a partial page.
+		 *
+		 * FIXME: Fold this into the code path below.
+		 */
+		rds_ib_frag_free(ic, recv->r_frag);
+		recv->r_frag = NULL;
+		return;
+	}
+
+	/*
+	 * If we don't already have an inc on the connection then this
+	 * fragment has a header and starts a message.. copy its header
+	 * into the inc and save the inc so we can hang upcoming fragments
+	 * off its list.
+	 */
+	if (!ibinc) {
+		ibinc = recv->r_ibinc;
+		recv->r_ibinc = NULL;
+		ic->i_ibinc = ibinc;
+
+		hdr = &ibinc->ii_inc.i_hdr;
+		memcpy(hdr, ihdr, sizeof(*hdr));
+		ic->i_recv_data_rem = be32_to_cpu(hdr->h_len);
+
+		rdsdebug("ic %p ibinc %p rem %u flag 0x%x\n", ic, ibinc,
+			 ic->i_recv_data_rem, hdr->h_flags);
+	} else {
+		hdr = &ibinc->ii_inc.i_hdr;
+		/* We can't just use memcmp here; fragments of a
+		 * single message may carry different ACKs */
+		if (hdr->h_sequence != ihdr->h_sequence ||
+		    hdr->h_len != ihdr->h_len ||
+		    hdr->h_sport != ihdr->h_sport ||
+		    hdr->h_dport != ihdr->h_dport) {
+			rds_ib_conn_error(conn,
+				"fragment header mismatch; forcing reconnect\n");
+			return;
+		}
+	}
+
+	list_add_tail(&recv->r_frag->f_item, &ibinc->ii_frags);
+	recv->r_frag = NULL;
+
+	if (ic->i_recv_data_rem > RDS_FRAG_SIZE)
+		ic->i_recv_data_rem -= RDS_FRAG_SIZE;
+	else {
+		ic->i_recv_data_rem = 0;
+		ic->i_ibinc = NULL;
+
+		if (ibinc->ii_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP)
+			rds_ib_cong_recv(conn, ibinc);
+		else {
+			rds_recv_incoming(conn, conn->c_faddr, conn->c_laddr,
+					  &ibinc->ii_inc, GFP_ATOMIC);
+			state->ack_next = be64_to_cpu(hdr->h_sequence);
+			state->ack_next_valid = 1;
+		}
+
+		/* Evaluate the ACK_REQUIRED flag *after* we received
+		 * the complete frame, and after bumping the next_rx
+		 * sequence. */
+		if (hdr->h_flags & RDS_FLAG_ACK_REQUIRED) {
+			rds_stats_inc(s_recv_ack_required);
+			state->ack_required = 1;
+		}
+
+		rds_inc_put(&ibinc->ii_inc);
+	}
+}
+
+/*
+ * Plucking the oldest entry from the ring can be done concurrently with
+ * the thread refilling the ring.  Each ring operation is protected by
+ * spinlocks and the transient state of refilling doesn't change the
+ * recording of which entry is oldest.
+ *
+ * This relies on IB only calling one cq comp_handler for each cq so that
+ * there will only be one caller of rds_recv_incoming() per RDS connection.
+ */
+void rds_ib_recv_cq_comp_handler(struct ib_cq *cq, void *context)
+{
+	struct rds_connection *conn = context;
+	struct rds_ib_connection *ic = conn->c_transport_data;
+
+	rdsdebug("conn %p cq %p\n", conn, cq);
+
+	rds_ib_stats_inc(s_ib_rx_cq_call);
+
+	tasklet_schedule(&ic->i_recv_tasklet);
+}
+
+static inline void rds_poll_cq(struct rds_ib_connection *ic,
+			       struct rds_ib_ack_state *state)
+{
+	struct rds_connection *conn = ic->conn;
+	struct ib_wc wc;
+	struct rds_ib_recv_work *recv;
+
+	while (ib_poll_cq(ic->i_recv_cq, 1, &wc) > 0) {
+		rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n",
+			 (unsigned long long)wc.wr_id, wc.status,
+			 rds_ib_wc_status_str(wc.status), wc.byte_len,
+			 be32_to_cpu(wc.ex.imm_data));
+		rds_ib_stats_inc(s_ib_rx_cq_event);
+
+		recv = &ic->i_recvs[rds_ib_ring_oldest(&ic->i_recv_ring)];
+
+		ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1, DMA_FROM_DEVICE);
+
+		/*
+		 * Also process recvs in connecting state because it is possible
+		 * to get a recv completion _before_ the rdmacm ESTABLISHED
+		 * event is processed.
+		 */
+		if (wc.status == IB_WC_SUCCESS) {
+			rds_ib_process_recv(conn, recv, wc.byte_len, state);
+		} else {
+			/* We expect errors as the qp is drained during shutdown */
+			if (rds_conn_up(conn) || rds_conn_connecting(conn))
+				rds_ib_conn_error(conn, "recv completion on %pI4 had "
+						  "status %u (%s), disconnecting and "
+						  "reconnecting\n", &conn->c_faddr,
+						  wc.status,
+						  rds_ib_wc_status_str(wc.status));
+		}
+
+		/*
+		 * It's very important that we only free this ring entry if we've truly
+		 * freed the resources allocated to the entry.  The refilling path can
+		 * leak if we don't.
+		 */
+		rds_ib_ring_free(&ic->i_recv_ring, 1);
+	}
+}
+
+void rds_ib_recv_tasklet_fn(unsigned long data)
+{
+	struct rds_ib_connection *ic = (struct rds_ib_connection *) data;
+	struct rds_connection *conn = ic->conn;
+	struct rds_ib_ack_state state = { 0, };
+
+	rds_poll_cq(ic, &state);
+	ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED);
+	rds_poll_cq(ic, &state);
+
+	if (state.ack_next_valid)
+		rds_ib_set_ack(ic, state.ack_next, state.ack_required);
+	if (state.ack_recv_valid && state.ack_recv > ic->i_ack_recv) {
+		rds_send_drop_acked(conn, state.ack_recv, NULL);
+		ic->i_ack_recv = state.ack_recv;
+	}
+	if (rds_conn_up(conn))
+		rds_ib_attempt_ack(ic);
+
+	/* If we ever end up with a really empty receive ring, we're
+	 * in deep trouble, as the sender will definitely see RNR
+	 * timeouts. */
+	if (rds_ib_ring_empty(&ic->i_recv_ring))
+		rds_ib_stats_inc(s_ib_rx_ring_empty);
+
+	if (rds_ib_ring_low(&ic->i_recv_ring))
+		rds_ib_recv_refill(conn, 0);
+}
+
+int rds_ib_recv(struct rds_connection *conn)
+{
+	struct rds_ib_connection *ic = conn->c_transport_data;
+	int ret = 0;
+
+	rdsdebug("conn %p\n", conn);
+	if (rds_conn_up(conn))
+		rds_ib_attempt_ack(ic);
+
+	return ret;
+}
+
+int rds_ib_recv_init(void)
+{
+	struct sysinfo si;
+	int ret = -ENOMEM;
+
+	/* Default to 30% of all available RAM for recv memory */
+	si_meminfo(&si);
+	rds_ib_sysctl_max_recv_allocation = si.totalram / 3 * PAGE_SIZE / RDS_FRAG_SIZE;
+
+	rds_ib_incoming_slab = kmem_cache_create("rds_ib_incoming",
+					sizeof(struct rds_ib_incoming),
+					0, SLAB_HWCACHE_ALIGN, NULL);
+	if (!rds_ib_incoming_slab)
+		goto out;
+
+	rds_ib_frag_slab = kmem_cache_create("rds_ib_frag",
+					sizeof(struct rds_page_frag),
+					0, SLAB_HWCACHE_ALIGN, NULL);
+	if (!rds_ib_frag_slab)
+		kmem_cache_destroy(rds_ib_incoming_slab);
+	else
+		ret = 0;
+out:
+	return ret;
+}
+
+void rds_ib_recv_exit(void)
+{
+	kmem_cache_destroy(rds_ib_incoming_slab);
+	kmem_cache_destroy(rds_ib_frag_slab);
+}
diff --git a/net/rds/ib_ring.c b/net/rds/ib_ring.c
new file mode 100644
index 0000000..ff97e8e
--- /dev/null
+++ b/net/rds/ib_ring.c
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+
+#include "rds.h"
+#include "ib.h"
+
+/*
+ * Locking for IB rings.
+ * We assume that allocation is always protected by a mutex
+ * in the caller (this is a valid assumption for the current
+ * implementation).
+ *
+ * Freeing always happens in an interrupt, and hence only
+ * races with allocations, but not with other free()s.
+ *
+ * The interaction between allocation and freeing is that
+ * the alloc code has to determine the number of free entries.
+ * To this end, we maintain two counters; an allocation counter
+ * and a free counter. Both are allowed to run freely, and wrap
+ * around.
+ * The number of used entries is always (alloc_ctr - free_ctr) % NR.
+ *
+ * The current implementation makes free_ctr atomic. When the
+ * caller finds an allocation fails, it should set an "alloc fail"
+ * bit and retry the allocation. The "alloc fail" bit essentially tells
+ * the CQ completion handlers to wake it up after freeing some
+ * more entries.
+ */
+
+/*
+ * This only happens on shutdown.
+ */
+DECLARE_WAIT_QUEUE_HEAD(rds_ib_ring_empty_wait);
+
+void rds_ib_ring_init(struct rds_ib_work_ring *ring, u32 nr)
+{
+	memset(ring, 0, sizeof(*ring));
+	ring->w_nr = nr;
+	rdsdebug("ring %p nr %u\n", ring, ring->w_nr);
+}
+
+static inline u32 __rds_ib_ring_used(struct rds_ib_work_ring *ring)
+{
+	u32 diff;
+
+	/* This assumes that atomic_t has at least as many bits as u32 */
+	diff = ring->w_alloc_ctr - (u32) atomic_read(&ring->w_free_ctr);
+	BUG_ON(diff > ring->w_nr);
+
+	return diff;
+}
+
+void rds_ib_ring_resize(struct rds_ib_work_ring *ring, u32 nr)
+{
+	/* We only ever get called from the connection setup code,
+	 * prior to creating the QP. */
+	BUG_ON(__rds_ib_ring_used(ring));
+	ring->w_nr = nr;
+}
+
+static int __rds_ib_ring_empty(struct rds_ib_work_ring *ring)
+{
+	return __rds_ib_ring_used(ring) == 0;
+}
+
+u32 rds_ib_ring_alloc(struct rds_ib_work_ring *ring, u32 val, u32 *pos)
+{
+	u32 ret = 0, avail;
+
+	avail = ring->w_nr - __rds_ib_ring_used(ring);
+
+	rdsdebug("ring %p val %u next %u free %u\n", ring, val,
+		 ring->w_alloc_ptr, avail);
+
+	if (val && avail) {
+		ret = min(val, avail);
+		*pos = ring->w_alloc_ptr;
+
+		ring->w_alloc_ptr = (ring->w_alloc_ptr + ret) % ring->w_nr;
+		ring->w_alloc_ctr += ret;
+	}
+
+	return ret;
+}
+
+void rds_ib_ring_free(struct rds_ib_work_ring *ring, u32 val)
+{
+	ring->w_free_ptr = (ring->w_free_ptr + val) % ring->w_nr;
+	atomic_add(val, &ring->w_free_ctr);
+
+	if (__rds_ib_ring_empty(ring) &&
+	    waitqueue_active(&rds_ib_ring_empty_wait))
+		wake_up(&rds_ib_ring_empty_wait);
+}
+
+void rds_ib_ring_unalloc(struct rds_ib_work_ring *ring, u32 val)
+{
+	ring->w_alloc_ptr = (ring->w_alloc_ptr - val) % ring->w_nr;
+	ring->w_alloc_ctr -= val;
+}
+
+int rds_ib_ring_empty(struct rds_ib_work_ring *ring)
+{
+	return __rds_ib_ring_empty(ring);
+}
+
+int rds_ib_ring_low(struct rds_ib_work_ring *ring)
+{
+	return __rds_ib_ring_used(ring) <= (ring->w_nr >> 1);
+}
+
+/*
+ * returns the oldest alloced ring entry.  This will be the next one
+ * freed.  This can't be called if there are none allocated.
+ */
+u32 rds_ib_ring_oldest(struct rds_ib_work_ring *ring)
+{
+	return ring->w_free_ptr;
+}
+
+/*
+ * returns the number of completed work requests.
+ */
+
+u32 rds_ib_ring_completed(struct rds_ib_work_ring *ring, u32 wr_id, u32 oldest)
+{
+	u32 ret;
+
+	if (oldest <= (unsigned long long)wr_id)
+		ret = (unsigned long long)wr_id - oldest + 1;
+	else
+		ret = ring->w_nr - oldest + (unsigned long long)wr_id + 1;
+
+	rdsdebug("ring %p ret %u wr_id %u oldest %u\n", ring, ret,
+		 wr_id, oldest);
+	return ret;
+}
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
new file mode 100644
index 0000000..1dde91e
--- /dev/null
+++ b/net/rds/ib_send.c
@@ -0,0 +1,1020 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/in.h>
+#include <linux/device.h>
+#include <linux/dmapool.h>
+#include <linux/ratelimit.h>
+
+#include "rds.h"
+#include "ib.h"
+
+static char *rds_ib_wc_status_strings[] = {
+#define RDS_IB_WC_STATUS_STR(foo) \
+		[IB_WC_##foo] = __stringify(IB_WC_##foo)
+	RDS_IB_WC_STATUS_STR(SUCCESS),
+	RDS_IB_WC_STATUS_STR(LOC_LEN_ERR),
+	RDS_IB_WC_STATUS_STR(LOC_QP_OP_ERR),
+	RDS_IB_WC_STATUS_STR(LOC_EEC_OP_ERR),
+	RDS_IB_WC_STATUS_STR(LOC_PROT_ERR),
+	RDS_IB_WC_STATUS_STR(WR_FLUSH_ERR),
+	RDS_IB_WC_STATUS_STR(MW_BIND_ERR),
+	RDS_IB_WC_STATUS_STR(BAD_RESP_ERR),
+	RDS_IB_WC_STATUS_STR(LOC_ACCESS_ERR),
+	RDS_IB_WC_STATUS_STR(REM_INV_REQ_ERR),
+	RDS_IB_WC_STATUS_STR(REM_ACCESS_ERR),
+	RDS_IB_WC_STATUS_STR(REM_OP_ERR),
+	RDS_IB_WC_STATUS_STR(RETRY_EXC_ERR),
+	RDS_IB_WC_STATUS_STR(RNR_RETRY_EXC_ERR),
+	RDS_IB_WC_STATUS_STR(LOC_RDD_VIOL_ERR),
+	RDS_IB_WC_STATUS_STR(REM_INV_RD_REQ_ERR),
+	RDS_IB_WC_STATUS_STR(REM_ABORT_ERR),
+	RDS_IB_WC_STATUS_STR(INV_EECN_ERR),
+	RDS_IB_WC_STATUS_STR(INV_EEC_STATE_ERR),
+	RDS_IB_WC_STATUS_STR(FATAL_ERR),
+	RDS_IB_WC_STATUS_STR(RESP_TIMEOUT_ERR),
+	RDS_IB_WC_STATUS_STR(GENERAL_ERR),
+#undef RDS_IB_WC_STATUS_STR
+};
+
+char *rds_ib_wc_status_str(enum ib_wc_status status)
+{
+	return rds_str_array(rds_ib_wc_status_strings,
+			     ARRAY_SIZE(rds_ib_wc_status_strings), status);
+}
+
+/*
+ * Convert IB-specific error message to RDS error message and call core
+ * completion handler.
+ */
+static void rds_ib_send_complete(struct rds_message *rm,
+				 int wc_status,
+				 void (*complete)(struct rds_message *rm, int status))
+{
+	int notify_status;
+
+	switch (wc_status) {
+	case IB_WC_WR_FLUSH_ERR:
+		return;
+
+	case IB_WC_SUCCESS:
+		notify_status = RDS_RDMA_SUCCESS;
+		break;
+
+	case IB_WC_REM_ACCESS_ERR:
+		notify_status = RDS_RDMA_REMOTE_ERROR;
+		break;
+
+	default:
+		notify_status = RDS_RDMA_OTHER_ERROR;
+		break;
+	}
+	complete(rm, notify_status);
+}
+
+static void rds_ib_send_unmap_data(struct rds_ib_connection *ic,
+				   struct rm_data_op *op,
+				   int wc_status)
+{
+	if (op->op_nents)
+		ib_dma_unmap_sg(ic->i_cm_id->device,
+				op->op_sg, op->op_nents,
+				DMA_TO_DEVICE);
+}
+
+static void rds_ib_send_unmap_rdma(struct rds_ib_connection *ic,
+				   struct rm_rdma_op *op,
+				   int wc_status)
+{
+	if (op->op_mapped) {
+		ib_dma_unmap_sg(ic->i_cm_id->device,
+				op->op_sg, op->op_nents,
+				op->op_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+		op->op_mapped = 0;
+	}
+
+	/* If the user asked for a completion notification on this
+	 * message, we can implement three different semantics:
+	 *  1.	Notify when we received the ACK on the RDS message
+	 *	that was queued with the RDMA. This provides reliable
+	 *	notification of RDMA status at the expense of a one-way
+	 *	packet delay.
+	 *  2.	Notify when the IB stack gives us the completion event for
+	 *	the RDMA operation.
+	 *  3.	Notify when the IB stack gives us the completion event for
+	 *	the accompanying RDS messages.
+	 * Here, we implement approach #3. To implement approach #2,
+	 * we would need to take an event for the rdma WR. To implement #1,
+	 * don't call rds_rdma_send_complete at all, and fall back to the notify
+	 * handling in the ACK processing code.
+	 *
+	 * Note: There's no need to explicitly sync any RDMA buffers using
+	 * ib_dma_sync_sg_for_cpu - the completion for the RDMA
+	 * operation itself unmapped the RDMA buffers, which takes care
+	 * of synching.
+	 */
+	rds_ib_send_complete(container_of(op, struct rds_message, rdma),
+			     wc_status, rds_rdma_send_complete);
+
+	if (op->op_write)
+		rds_stats_add(s_send_rdma_bytes, op->op_bytes);
+	else
+		rds_stats_add(s_recv_rdma_bytes, op->op_bytes);
+}
+
+static void rds_ib_send_unmap_atomic(struct rds_ib_connection *ic,
+				     struct rm_atomic_op *op,
+				     int wc_status)
+{
+	/* unmap atomic recvbuf */
+	if (op->op_mapped) {
+		ib_dma_unmap_sg(ic->i_cm_id->device, op->op_sg, 1,
+				DMA_FROM_DEVICE);
+		op->op_mapped = 0;
+	}
+
+	rds_ib_send_complete(container_of(op, struct rds_message, atomic),
+			     wc_status, rds_atomic_send_complete);
+
+	if (op->op_type == RDS_ATOMIC_TYPE_CSWP)
+		rds_ib_stats_inc(s_ib_atomic_cswp);
+	else
+		rds_ib_stats_inc(s_ib_atomic_fadd);
+}
+
+/*
+ * Unmap the resources associated with a struct send_work.
+ *
+ * Returns the rm for no good reason other than it is unobtainable
+ * other than by switching on wr.opcode, currently, and the caller,
+ * the event handler, needs it.
+ */
+static struct rds_message *rds_ib_send_unmap_op(struct rds_ib_connection *ic,
+						struct rds_ib_send_work *send,
+						int wc_status)
+{
+	struct rds_message *rm = NULL;
+
+	/* In the error case, wc.opcode sometimes contains garbage */
+	switch (send->s_wr.opcode) {
+	case IB_WR_SEND:
+		if (send->s_op) {
+			rm = container_of(send->s_op, struct rds_message, data);
+			rds_ib_send_unmap_data(ic, send->s_op, wc_status);
+		}
+		break;
+	case IB_WR_RDMA_WRITE:
+	case IB_WR_RDMA_READ:
+		if (send->s_op) {
+			rm = container_of(send->s_op, struct rds_message, rdma);
+			rds_ib_send_unmap_rdma(ic, send->s_op, wc_status);
+		}
+		break;
+	case IB_WR_ATOMIC_FETCH_AND_ADD:
+	case IB_WR_ATOMIC_CMP_AND_SWP:
+		if (send->s_op) {
+			rm = container_of(send->s_op, struct rds_message, atomic);
+			rds_ib_send_unmap_atomic(ic, send->s_op, wc_status);
+		}
+		break;
+	default:
+		printk_ratelimited(KERN_NOTICE
+			       "RDS/IB: %s: unexpected opcode 0x%x in WR!\n",
+			       __func__, send->s_wr.opcode);
+		break;
+	}
+
+	send->s_wr.opcode = 0xdead;
+
+	return rm;
+}
+
+void rds_ib_send_init_ring(struct rds_ib_connection *ic)
+{
+	struct rds_ib_send_work *send;
+	u32 i;
+
+	for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
+		struct ib_sge *sge;
+
+		send->s_op = NULL;
+
+		send->s_wr.wr_id = i;
+		send->s_wr.sg_list = send->s_sge;
+		send->s_wr.ex.imm_data = 0;
+
+		sge = &send->s_sge[0];
+		sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header));
+		sge->length = sizeof(struct rds_header);
+		sge->lkey = ic->i_mr->lkey;
+
+		send->s_sge[1].lkey = ic->i_mr->lkey;
+	}
+}
+
+void rds_ib_send_clear_ring(struct rds_ib_connection *ic)
+{
+	struct rds_ib_send_work *send;
+	u32 i;
+
+	for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
+		if (send->s_op && send->s_wr.opcode != 0xdead)
+			rds_ib_send_unmap_op(ic, send, IB_WC_WR_FLUSH_ERR);
+	}
+}
+
+/*
+ * The only fast path caller always has a non-zero nr, so we don't
+ * bother testing nr before performing the atomic sub.
+ */
+static void rds_ib_sub_signaled(struct rds_ib_connection *ic, int nr)
+{
+	if ((atomic_sub_return(nr, &ic->i_signaled_sends) == 0) &&
+	    waitqueue_active(&rds_ib_ring_empty_wait))
+		wake_up(&rds_ib_ring_empty_wait);
+	BUG_ON(atomic_read(&ic->i_signaled_sends) < 0);
+}
+
+/*
+ * The _oldest/_free ring operations here race cleanly with the alloc/unalloc
+ * operations performed in the send path.  As the sender allocs and potentially
+ * unallocs the next free entry in the ring it doesn't alter which is
+ * the next to be freed, which is what this is concerned with.
+ */
+void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
+{
+	struct rds_connection *conn = context;
+	struct rds_ib_connection *ic = conn->c_transport_data;
+	struct rds_message *rm = NULL;
+	struct ib_wc wc;
+	struct rds_ib_send_work *send;
+	u32 completed;
+	u32 oldest;
+	u32 i = 0;
+	int ret;
+	int nr_sig = 0;
+
+	rdsdebug("cq %p conn %p\n", cq, conn);
+	rds_ib_stats_inc(s_ib_tx_cq_call);
+	ret = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+	if (ret)
+		rdsdebug("ib_req_notify_cq send failed: %d\n", ret);
+
+	while (ib_poll_cq(cq, 1, &wc) > 0) {
+		rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n",
+			 (unsigned long long)wc.wr_id, wc.status,
+			 rds_ib_wc_status_str(wc.status), wc.byte_len,
+			 be32_to_cpu(wc.ex.imm_data));
+		rds_ib_stats_inc(s_ib_tx_cq_event);
+
+		if (wc.wr_id == RDS_IB_ACK_WR_ID) {
+			if (time_after(jiffies, ic->i_ack_queued + HZ/2))
+				rds_ib_stats_inc(s_ib_tx_stalled);
+			rds_ib_ack_send_complete(ic);
+			continue;
+		}
+
+		oldest = rds_ib_ring_oldest(&ic->i_send_ring);
+
+		completed = rds_ib_ring_completed(&ic->i_send_ring, wc.wr_id, oldest);
+
+		for (i = 0; i < completed; i++) {
+			send = &ic->i_sends[oldest];
+			if (send->s_wr.send_flags & IB_SEND_SIGNALED)
+				nr_sig++;
+
+			rm = rds_ib_send_unmap_op(ic, send, wc.status);
+
+			if (time_after(jiffies, send->s_queued + HZ/2))
+				rds_ib_stats_inc(s_ib_tx_stalled);
+
+			if (send->s_op) {
+				if (send->s_op == rm->m_final_op) {
+					/* If anyone waited for this message to get flushed out, wake
+					 * them up now */
+					rds_message_unmapped(rm);
+				}
+				rds_message_put(rm);
+				send->s_op = NULL;
+			}
+
+			oldest = (oldest + 1) % ic->i_send_ring.w_nr;
+		}
+
+		rds_ib_ring_free(&ic->i_send_ring, completed);
+		rds_ib_sub_signaled(ic, nr_sig);
+		nr_sig = 0;
+
+		if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) ||
+		    test_bit(0, &conn->c_map_queued))
+			queue_delayed_work(rds_wq, &conn->c_send_w, 0);
+
+		/* We expect errors as the qp is drained during shutdown */
+		if (wc.status != IB_WC_SUCCESS && rds_conn_up(conn)) {
+			rds_ib_conn_error(conn, "send completion on %pI4 had status "
+					  "%u (%s), disconnecting and reconnecting\n",
+					  &conn->c_faddr, wc.status,
+					  rds_ib_wc_status_str(wc.status));
+		}
+	}
+}
+
+/*
+ * This is the main function for allocating credits when sending
+ * messages.
+ *
+ * Conceptually, we have two counters:
+ *  -	send credits: this tells us how many WRs we're allowed
+ *	to submit without overruning the receiver's queue. For
+ *	each SEND WR we post, we decrement this by one.
+ *
+ *  -	posted credits: this tells us how many WRs we recently
+ *	posted to the receive queue. This value is transferred
+ *	to the peer as a "credit update" in a RDS header field.
+ *	Every time we transmit credits to the peer, we subtract
+ *	the amount of transferred credits from this counter.
+ *
+ * It is essential that we avoid situations where both sides have
+ * exhausted their send credits, and are unable to send new credits
+ * to the peer. We achieve this by requiring that we send at least
+ * one credit update to the peer before exhausting our credits.
+ * When new credits arrive, we subtract one credit that is withheld
+ * until we've posted new buffers and are ready to transmit these
+ * credits (see rds_ib_send_add_credits below).
+ *
+ * The RDS send code is essentially single-threaded; rds_send_xmit
+ * sets RDS_IN_XMIT to ensure exclusive access to the send ring.
+ * However, the ACK sending code is independent and can race with
+ * message SENDs.
+ *
+ * In the send path, we need to update the counters for send credits
+ * and the counter of posted buffers atomically - when we use the
+ * last available credit, we cannot allow another thread to race us
+ * and grab the posted credits counter.  Hence, we have to use a
+ * spinlock to protect the credit counter, or use atomics.
+ *
+ * Spinlocks shared between the send and the receive path are bad,
+ * because they create unnecessary delays. An early implementation
+ * using a spinlock showed a 5% degradation in throughput at some
+ * loads.
+ *
+ * This implementation avoids spinlocks completely, putting both
+ * counters into a single atomic, and updating that atomic using
+ * atomic_add (in the receive path, when receiving fresh credits),
+ * and using atomic_cmpxchg when updating the two counters.
+ */
+int rds_ib_send_grab_credits(struct rds_ib_connection *ic,
+			     u32 wanted, u32 *adv_credits, int need_posted, int max_posted)
+{
+	unsigned int avail, posted, got = 0, advertise;
+	long oldval, newval;
+
+	*adv_credits = 0;
+	if (!ic->i_flowctl)
+		return wanted;
+
+try_again:
+	advertise = 0;
+	oldval = newval = atomic_read(&ic->i_credits);
+	posted = IB_GET_POST_CREDITS(oldval);
+	avail = IB_GET_SEND_CREDITS(oldval);
+
+	rdsdebug("rds_ib_send_grab_credits(%u): credits=%u posted=%u\n",
+			wanted, avail, posted);
+
+	/* The last credit must be used to send a credit update. */
+	if (avail && !posted)
+		avail--;
+
+	if (avail < wanted) {
+		struct rds_connection *conn = ic->i_cm_id->context;
+
+		/* Oops, there aren't that many credits left! */
+		set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
+		got = avail;
+	} else {
+		/* Sometimes you get what you want, lalala. */
+		got = wanted;
+	}
+	newval -= IB_SET_SEND_CREDITS(got);
+
+	/*
+	 * If need_posted is non-zero, then the caller wants
+	 * the posted regardless of whether any send credits are
+	 * available.
+	 */
+	if (posted && (got || need_posted)) {
+		advertise = min_t(unsigned int, posted, max_posted);
+		newval -= IB_SET_POST_CREDITS(advertise);
+	}
+
+	/* Finally bill everything */
+	if (atomic_cmpxchg(&ic->i_credits, oldval, newval) != oldval)
+		goto try_again;
+
+	*adv_credits = advertise;
+	return got;
+}
+
+void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits)
+{
+	struct rds_ib_connection *ic = conn->c_transport_data;
+
+	if (credits == 0)
+		return;
+
+	rdsdebug("rds_ib_send_add_credits(%u): current=%u%s\n",
+			credits,
+			IB_GET_SEND_CREDITS(atomic_read(&ic->i_credits)),
+			test_bit(RDS_LL_SEND_FULL, &conn->c_flags) ? ", ll_send_full" : "");
+
+	atomic_add(IB_SET_SEND_CREDITS(credits), &ic->i_credits);
+	if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags))
+		queue_delayed_work(rds_wq, &conn->c_send_w, 0);
+
+	WARN_ON(IB_GET_SEND_CREDITS(credits) >= 16384);
+
+	rds_ib_stats_inc(s_ib_rx_credit_updates);
+}
+
+void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted)
+{
+	struct rds_ib_connection *ic = conn->c_transport_data;
+
+	if (posted == 0)
+		return;
+
+	atomic_add(IB_SET_POST_CREDITS(posted), &ic->i_credits);
+
+	/* Decide whether to send an update to the peer now.
+	 * If we would send a credit update for every single buffer we
+	 * post, we would end up with an ACK storm (ACK arrives,
+	 * consumes buffer, we refill the ring, send ACK to remote
+	 * advertising the newly posted buffer... ad inf)
+	 *
+	 * Performance pretty much depends on how often we send
+	 * credit updates - too frequent updates mean lots of ACKs.
+	 * Too infrequent updates, and the peer will run out of
+	 * credits and has to throttle.
+	 * For the time being, 16 seems to be a good compromise.
+	 */
+	if (IB_GET_POST_CREDITS(atomic_read(&ic->i_credits)) >= 16)
+		set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+}
+
+static inline int rds_ib_set_wr_signal_state(struct rds_ib_connection *ic,
+					     struct rds_ib_send_work *send,
+					     bool notify)
+{
+	/*
+	 * We want to delay signaling completions just enough to get
+	 * the batching benefits but not so much that we create dead time
+	 * on the wire.
+	 */
+	if (ic->i_unsignaled_wrs-- == 0 || notify) {
+		ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
+		send->s_wr.send_flags |= IB_SEND_SIGNALED;
+		return 1;
+	}
+	return 0;
+}
+
+/*
+ * This can be called multiple times for a given message.  The first time
+ * we see a message we map its scatterlist into the IB device so that
+ * we can provide that mapped address to the IB scatter gather entries
+ * in the IB work requests.  We translate the scatterlist into a series
+ * of work requests that fragment the message.  These work requests complete
+ * in order so we pass ownership of the message to the completion handler
+ * once we send the final fragment.
+ *
+ * The RDS core uses the c_send_lock to only enter this function once
+ * per connection.  This makes sure that the tx ring alloc/unalloc pairs
+ * don't get out of sync and confuse the ring.
+ */
+int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
+		unsigned int hdr_off, unsigned int sg, unsigned int off)
+{
+	struct rds_ib_connection *ic = conn->c_transport_data;
+	struct ib_device *dev = ic->i_cm_id->device;
+	struct rds_ib_send_work *send = NULL;
+	struct rds_ib_send_work *first;
+	struct rds_ib_send_work *prev;
+	struct ib_send_wr *failed_wr;
+	struct scatterlist *scat;
+	u32 pos;
+	u32 i;
+	u32 work_alloc;
+	u32 credit_alloc = 0;
+	u32 posted;
+	u32 adv_credits = 0;
+	int send_flags = 0;
+	int bytes_sent = 0;
+	int ret;
+	int flow_controlled = 0;
+	int nr_sig = 0;
+
+	BUG_ON(off % RDS_FRAG_SIZE);
+	BUG_ON(hdr_off != 0 && hdr_off != sizeof(struct rds_header));
+
+	/* Do not send cong updates to IB loopback */
+	if (conn->c_loopback
+	    && rm->m_inc.i_hdr.h_flags & RDS_FLAG_CONG_BITMAP) {
+		rds_cong_map_updated(conn->c_fcong, ~(u64) 0);
+		scat = &rm->data.op_sg[sg];
+		ret = max_t(int, RDS_CONG_MAP_BYTES, scat->length);
+		return sizeof(struct rds_header) + ret;
+	}
+
+	/* FIXME we may overallocate here */
+	if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0)
+		i = 1;
+	else
+		i = ceil(be32_to_cpu(rm->m_inc.i_hdr.h_len), RDS_FRAG_SIZE);
+
+	work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, i, &pos);
+	if (work_alloc == 0) {
+		set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
+		rds_ib_stats_inc(s_ib_tx_ring_full);
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	if (ic->i_flowctl) {
+		credit_alloc = rds_ib_send_grab_credits(ic, work_alloc, &posted, 0, RDS_MAX_ADV_CREDIT);
+		adv_credits += posted;
+		if (credit_alloc < work_alloc) {
+			rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - credit_alloc);
+			work_alloc = credit_alloc;
+			flow_controlled = 1;
+		}
+		if (work_alloc == 0) {
+			set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
+			rds_ib_stats_inc(s_ib_tx_throttle);
+			ret = -ENOMEM;
+			goto out;
+		}
+	}
+
+	/* map the message the first time we see it */
+	if (!ic->i_data_op) {
+		if (rm->data.op_nents) {
+			rm->data.op_count = ib_dma_map_sg(dev,
+							  rm->data.op_sg,
+							  rm->data.op_nents,
+							  DMA_TO_DEVICE);
+			rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->data.op_count);
+			if (rm->data.op_count == 0) {
+				rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
+				rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
+				ret = -ENOMEM; /* XXX ? */
+				goto out;
+			}
+		} else {
+			rm->data.op_count = 0;
+		}
+
+		rds_message_addref(rm);
+		ic->i_data_op = &rm->data;
+
+		/* Finalize the header */
+		if (test_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags))
+			rm->m_inc.i_hdr.h_flags |= RDS_FLAG_ACK_REQUIRED;
+		if (test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags))
+			rm->m_inc.i_hdr.h_flags |= RDS_FLAG_RETRANSMITTED;
+
+		/* If it has a RDMA op, tell the peer we did it. This is
+		 * used by the peer to release use-once RDMA MRs. */
+		if (rm->rdma.op_active) {
+			struct rds_ext_header_rdma ext_hdr;
+
+			ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.op_rkey);
+			rds_message_add_extension(&rm->m_inc.i_hdr,
+					RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr));
+		}
+		if (rm->m_rdma_cookie) {
+			rds_message_add_rdma_dest_extension(&rm->m_inc.i_hdr,
+					rds_rdma_cookie_key(rm->m_rdma_cookie),
+					rds_rdma_cookie_offset(rm->m_rdma_cookie));
+		}
+
+		/* Note - rds_ib_piggyb_ack clears the ACK_REQUIRED bit, so
+		 * we should not do this unless we have a chance of at least
+		 * sticking the header into the send ring. Which is why we
+		 * should call rds_ib_ring_alloc first. */
+		rm->m_inc.i_hdr.h_ack = cpu_to_be64(rds_ib_piggyb_ack(ic));
+		rds_message_make_checksum(&rm->m_inc.i_hdr);
+
+		/*
+		 * Update adv_credits since we reset the ACK_REQUIRED bit.
+		 */
+		if (ic->i_flowctl) {
+			rds_ib_send_grab_credits(ic, 0, &posted, 1, RDS_MAX_ADV_CREDIT - adv_credits);
+			adv_credits += posted;
+			BUG_ON(adv_credits > 255);
+		}
+	}
+
+	/* Sometimes you want to put a fence between an RDMA
+	 * READ and the following SEND.
+	 * We could either do this all the time
+	 * or when requested by the user. Right now, we let
+	 * the application choose.
+	 */
+	if (rm->rdma.op_active && rm->rdma.op_fence)
+		send_flags = IB_SEND_FENCE;
+
+	/* Each frag gets a header. Msgs may be 0 bytes */
+	send = &ic->i_sends[pos];
+	first = send;
+	prev = NULL;
+	scat = &ic->i_data_op->op_sg[sg];
+	i = 0;
+	do {
+		unsigned int len = 0;
+
+		/* Set up the header */
+		send->s_wr.send_flags = send_flags;
+		send->s_wr.opcode = IB_WR_SEND;
+		send->s_wr.num_sge = 1;
+		send->s_wr.next = NULL;
+		send->s_queued = jiffies;
+		send->s_op = NULL;
+
+		send->s_sge[0].addr = ic->i_send_hdrs_dma
+			+ (pos * sizeof(struct rds_header));
+		send->s_sge[0].length = sizeof(struct rds_header);
+
+		memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header));
+
+		/* Set up the data, if present */
+		if (i < work_alloc
+		    && scat != &rm->data.op_sg[rm->data.op_count]) {
+			len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off);
+			send->s_wr.num_sge = 2;
+
+			send->s_sge[1].addr = ib_sg_dma_address(dev, scat) + off;
+			send->s_sge[1].length = len;
+
+			bytes_sent += len;
+			off += len;
+			if (off == ib_sg_dma_len(dev, scat)) {
+				scat++;
+				off = 0;
+			}
+		}
+
+		rds_ib_set_wr_signal_state(ic, send, 0);
+
+		/*
+		 * Always signal the last one if we're stopping due to flow control.
+		 */
+		if (ic->i_flowctl && flow_controlled && i == (work_alloc-1))
+			send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+
+		if (send->s_wr.send_flags & IB_SEND_SIGNALED)
+			nr_sig++;
+
+		rdsdebug("send %p wr %p num_sge %u next %p\n", send,
+			 &send->s_wr, send->s_wr.num_sge, send->s_wr.next);
+
+		if (ic->i_flowctl && adv_credits) {
+			struct rds_header *hdr = &ic->i_send_hdrs[pos];
+
+			/* add credit and redo the header checksum */
+			hdr->h_credit = adv_credits;
+			rds_message_make_checksum(hdr);
+			adv_credits = 0;
+			rds_ib_stats_inc(s_ib_tx_credit_updates);
+		}
+
+		if (prev)
+			prev->s_wr.next = &send->s_wr;
+		prev = send;
+
+		pos = (pos + 1) % ic->i_send_ring.w_nr;
+		send = &ic->i_sends[pos];
+		i++;
+
+	} while (i < work_alloc
+		 && scat != &rm->data.op_sg[rm->data.op_count]);
+
+	/* Account the RDS header in the number of bytes we sent, but just once.
+	 * The caller has no concept of fragmentation. */
+	if (hdr_off == 0)
+		bytes_sent += sizeof(struct rds_header);
+
+	/* if we finished the message then send completion owns it */
+	if (scat == &rm->data.op_sg[rm->data.op_count]) {
+		prev->s_op = ic->i_data_op;
+		prev->s_wr.send_flags |= IB_SEND_SOLICITED;
+		ic->i_data_op = NULL;
+	}
+
+	/* Put back wrs & credits we didn't use */
+	if (i < work_alloc) {
+		rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i);
+		work_alloc = i;
+	}
+	if (ic->i_flowctl && i < credit_alloc)
+		rds_ib_send_add_credits(conn, credit_alloc - i);
+
+	if (nr_sig)
+		atomic_add(nr_sig, &ic->i_signaled_sends);
+
+	/* XXX need to worry about failed_wr and partial sends. */
+	failed_wr = &first->s_wr;
+	ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr);
+	rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic,
+		 first, &first->s_wr, ret, failed_wr);
+	BUG_ON(failed_wr != &first->s_wr);
+	if (ret) {
+		printk(KERN_WARNING "RDS/IB: ib_post_send to %pI4 "
+		       "returned %d\n", &conn->c_faddr, ret);
+		rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
+		rds_ib_sub_signaled(ic, nr_sig);
+		if (prev->s_op) {
+			ic->i_data_op = prev->s_op;
+			prev->s_op = NULL;
+		}
+
+		rds_ib_conn_error(ic->conn, "ib_post_send failed\n");
+		goto out;
+	}
+
+	ret = bytes_sent;
+out:
+	BUG_ON(adv_credits);
+	return ret;
+}
+
+/*
+ * Issue atomic operation.
+ * A simplified version of the rdma case, we always map 1 SG, and
+ * only 8 bytes, for the return value from the atomic operation.
+ */
+int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op)
+{
+	struct rds_ib_connection *ic = conn->c_transport_data;
+	struct rds_ib_send_work *send = NULL;
+	struct ib_send_wr *failed_wr;
+	struct rds_ib_device *rds_ibdev;
+	u32 pos;
+	u32 work_alloc;
+	int ret;
+	int nr_sig = 0;
+
+	rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client);
+
+	work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, 1, &pos);
+	if (work_alloc != 1) {
+		rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
+		rds_ib_stats_inc(s_ib_tx_ring_full);
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	/* address of send request in ring */
+	send = &ic->i_sends[pos];
+	send->s_queued = jiffies;
+
+	if (op->op_type == RDS_ATOMIC_TYPE_CSWP) {
+		send->s_wr.opcode = IB_WR_MASKED_ATOMIC_CMP_AND_SWP;
+		send->s_wr.wr.atomic.compare_add = op->op_m_cswp.compare;
+		send->s_wr.wr.atomic.swap = op->op_m_cswp.swap;
+		send->s_wr.wr.atomic.compare_add_mask = op->op_m_cswp.compare_mask;
+		send->s_wr.wr.atomic.swap_mask = op->op_m_cswp.swap_mask;
+	} else { /* FADD */
+		send->s_wr.opcode = IB_WR_MASKED_ATOMIC_FETCH_AND_ADD;
+		send->s_wr.wr.atomic.compare_add = op->op_m_fadd.add;
+		send->s_wr.wr.atomic.swap = 0;
+		send->s_wr.wr.atomic.compare_add_mask = op->op_m_fadd.nocarry_mask;
+		send->s_wr.wr.atomic.swap_mask = 0;
+	}
+	nr_sig = rds_ib_set_wr_signal_state(ic, send, op->op_notify);
+	send->s_wr.num_sge = 1;
+	send->s_wr.next = NULL;
+	send->s_wr.wr.atomic.remote_addr = op->op_remote_addr;
+	send->s_wr.wr.atomic.rkey = op->op_rkey;
+	send->s_op = op;
+	rds_message_addref(container_of(send->s_op, struct rds_message, atomic));
+
+	/* map 8 byte retval buffer to the device */
+	ret = ib_dma_map_sg(ic->i_cm_id->device, op->op_sg, 1, DMA_FROM_DEVICE);
+	rdsdebug("ic %p mapping atomic op %p. mapped %d pg\n", ic, op, ret);
+	if (ret != 1) {
+		rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
+		rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
+		ret = -ENOMEM; /* XXX ? */
+		goto out;
+	}
+
+	/* Convert our struct scatterlist to struct ib_sge */
+	send->s_sge[0].addr = ib_sg_dma_address(ic->i_cm_id->device, op->op_sg);
+	send->s_sge[0].length = ib_sg_dma_len(ic->i_cm_id->device, op->op_sg);
+	send->s_sge[0].lkey = ic->i_mr->lkey;
+
+	rdsdebug("rva %Lx rpa %Lx len %u\n", op->op_remote_addr,
+		 send->s_sge[0].addr, send->s_sge[0].length);
+
+	if (nr_sig)
+		atomic_add(nr_sig, &ic->i_signaled_sends);
+
+	failed_wr = &send->s_wr;
+	ret = ib_post_send(ic->i_cm_id->qp, &send->s_wr, &failed_wr);
+	rdsdebug("ic %p send %p (wr %p) ret %d wr %p\n", ic,
+		 send, &send->s_wr, ret, failed_wr);
+	BUG_ON(failed_wr != &send->s_wr);
+	if (ret) {
+		printk(KERN_WARNING "RDS/IB: atomic ib_post_send to %pI4 "
+		       "returned %d\n", &conn->c_faddr, ret);
+		rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
+		rds_ib_sub_signaled(ic, nr_sig);
+		goto out;
+	}
+
+	if (unlikely(failed_wr != &send->s_wr)) {
+		printk(KERN_WARNING "RDS/IB: atomic ib_post_send() rc=%d, but failed_wqe updated!\n", ret);
+		BUG_ON(failed_wr != &send->s_wr);
+	}
+
+out:
+	return ret;
+}
+
+int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
+{
+	struct rds_ib_connection *ic = conn->c_transport_data;
+	struct rds_ib_send_work *send = NULL;
+	struct rds_ib_send_work *first;
+	struct rds_ib_send_work *prev;
+	struct ib_send_wr *failed_wr;
+	struct scatterlist *scat;
+	unsigned long len;
+	u64 remote_addr = op->op_remote_addr;
+	u32 max_sge = ic->rds_ibdev->max_sge;
+	u32 pos;
+	u32 work_alloc;
+	u32 i;
+	u32 j;
+	int sent;
+	int ret;
+	int num_sge;
+	int nr_sig = 0;
+
+	/* map the op the first time we see it */
+	if (!op->op_mapped) {
+		op->op_count = ib_dma_map_sg(ic->i_cm_id->device,
+					     op->op_sg, op->op_nents, (op->op_write) ?
+					     DMA_TO_DEVICE : DMA_FROM_DEVICE);
+		rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->op_count);
+		if (op->op_count == 0) {
+			rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
+			ret = -ENOMEM; /* XXX ? */
+			goto out;
+		}
+
+		op->op_mapped = 1;
+	}
+
+	/*
+	 * Instead of knowing how to return a partial rdma read/write we insist that there
+	 * be enough work requests to send the entire message.
+	 */
+	i = ceil(op->op_count, max_sge);
+
+	work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, i, &pos);
+	if (work_alloc != i) {
+		rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
+		rds_ib_stats_inc(s_ib_tx_ring_full);
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	send = &ic->i_sends[pos];
+	first = send;
+	prev = NULL;
+	scat = &op->op_sg[0];
+	sent = 0;
+	num_sge = op->op_count;
+
+	for (i = 0; i < work_alloc && scat != &op->op_sg[op->op_count]; i++) {
+		send->s_wr.send_flags = 0;
+		send->s_queued = jiffies;
+		send->s_op = NULL;
+
+		nr_sig += rds_ib_set_wr_signal_state(ic, send, op->op_notify);
+
+		send->s_wr.opcode = op->op_write ? IB_WR_RDMA_WRITE : IB_WR_RDMA_READ;
+		send->s_wr.wr.rdma.remote_addr = remote_addr;
+		send->s_wr.wr.rdma.rkey = op->op_rkey;
+
+		if (num_sge > max_sge) {
+			send->s_wr.num_sge = max_sge;
+			num_sge -= max_sge;
+		} else {
+			send->s_wr.num_sge = num_sge;
+		}
+
+		send->s_wr.next = NULL;
+
+		if (prev)
+			prev->s_wr.next = &send->s_wr;
+
+		for (j = 0; j < send->s_wr.num_sge && scat != &op->op_sg[op->op_count]; j++) {
+			len = ib_sg_dma_len(ic->i_cm_id->device, scat);
+			send->s_sge[j].addr =
+				 ib_sg_dma_address(ic->i_cm_id->device, scat);
+			send->s_sge[j].length = len;
+			send->s_sge[j].lkey = ic->i_mr->lkey;
+
+			sent += len;
+			rdsdebug("ic %p sent %d remote_addr %llu\n", ic, sent, remote_addr);
+
+			remote_addr += len;
+			scat++;
+		}
+
+		rdsdebug("send %p wr %p num_sge %u next %p\n", send,
+			&send->s_wr, send->s_wr.num_sge, send->s_wr.next);
+
+		prev = send;
+		if (++send == &ic->i_sends[ic->i_send_ring.w_nr])
+			send = ic->i_sends;
+	}
+
+	/* give a reference to the last op */
+	if (scat == &op->op_sg[op->op_count]) {
+		prev->s_op = op;
+		rds_message_addref(container_of(op, struct rds_message, rdma));
+	}
+
+	if (i < work_alloc) {
+		rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i);
+		work_alloc = i;
+	}
+
+	if (nr_sig)
+		atomic_add(nr_sig, &ic->i_signaled_sends);
+
+	failed_wr = &first->s_wr;
+	ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr);
+	rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic,
+		 first, &first->s_wr, ret, failed_wr);
+	BUG_ON(failed_wr != &first->s_wr);
+	if (ret) {
+		printk(KERN_WARNING "RDS/IB: rdma ib_post_send to %pI4 "
+		       "returned %d\n", &conn->c_faddr, ret);
+		rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
+		rds_ib_sub_signaled(ic, nr_sig);
+		goto out;
+	}
+
+	if (unlikely(failed_wr != &first->s_wr)) {
+		printk(KERN_WARNING "RDS/IB: ib_post_send() rc=%d, but failed_wqe updated!\n", ret);
+		BUG_ON(failed_wr != &first->s_wr);
+	}
+
+
+out:
+	return ret;
+}
+
+void rds_ib_xmit_complete(struct rds_connection *conn)
+{
+	struct rds_ib_connection *ic = conn->c_transport_data;
+
+	/* We may have a pending ACK or window update we were unable
+	 * to send previously (due to flow control). Try again. */
+	rds_ib_attempt_ack(ic);
+}
diff --git a/net/rds/ib_stats.c b/net/rds/ib_stats.c
new file mode 100644
index 0000000..2d5965d
--- /dev/null
+++ b/net/rds/ib_stats.c
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/percpu.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+
+#include "rds.h"
+#include "ib.h"
+
+DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_ib_statistics, rds_ib_stats);
+
+static const char *const rds_ib_stat_names[] = {
+	"ib_connect_raced",
+	"ib_listen_closed_stale",
+	"ib_tx_cq_call",
+	"ib_tx_cq_event",
+	"ib_tx_ring_full",
+	"ib_tx_throttle",
+	"ib_tx_sg_mapping_failure",
+	"ib_tx_stalled",
+	"ib_tx_credit_updates",
+	"ib_rx_cq_call",
+	"ib_rx_cq_event",
+	"ib_rx_ring_empty",
+	"ib_rx_refill_from_cq",
+	"ib_rx_refill_from_thread",
+	"ib_rx_alloc_limit",
+	"ib_rx_credit_updates",
+	"ib_ack_sent",
+	"ib_ack_send_failure",
+	"ib_ack_send_delayed",
+	"ib_ack_send_piggybacked",
+	"ib_ack_received",
+	"ib_rdma_mr_alloc",
+	"ib_rdma_mr_free",
+	"ib_rdma_mr_used",
+	"ib_rdma_mr_pool_flush",
+	"ib_rdma_mr_pool_wait",
+	"ib_rdma_mr_pool_depleted",
+	"ib_atomic_cswp",
+	"ib_atomic_fadd",
+};
+
+unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter,
+				    unsigned int avail)
+{
+	struct rds_ib_statistics stats = {0, };
+	uint64_t *src;
+	uint64_t *sum;
+	size_t i;
+	int cpu;
+
+	if (avail < ARRAY_SIZE(rds_ib_stat_names))
+		goto out;
+
+	for_each_online_cpu(cpu) {
+		src = (uint64_t *)&(per_cpu(rds_ib_stats, cpu));
+		sum = (uint64_t *)&stats;
+		for (i = 0; i < sizeof(stats) / sizeof(uint64_t); i++)
+			*(sum++) += *(src++);
+	}
+
+	rds_stats_info_copy(iter, (uint64_t *)&stats, rds_ib_stat_names,
+			    ARRAY_SIZE(rds_ib_stat_names));
+out:
+	return ARRAY_SIZE(rds_ib_stat_names);
+}
diff --git a/net/rds/ib_sysctl.c b/net/rds/ib_sysctl.c
new file mode 100644
index 0000000..e4e41b3
--- /dev/null
+++ b/net/rds/ib_sysctl.c
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/sysctl.h>
+#include <linux/proc_fs.h>
+
+#include "ib.h"
+
+static struct ctl_table_header *rds_ib_sysctl_hdr;
+
+unsigned long rds_ib_sysctl_max_send_wr = RDS_IB_DEFAULT_SEND_WR;
+unsigned long rds_ib_sysctl_max_recv_wr = RDS_IB_DEFAULT_RECV_WR;
+unsigned long rds_ib_sysctl_max_recv_allocation = (128 * 1024 * 1024) / RDS_FRAG_SIZE;
+static unsigned long rds_ib_sysctl_max_wr_min = 1;
+/* hardware will fail CQ creation long before this */
+static unsigned long rds_ib_sysctl_max_wr_max = (u32)~0;
+
+unsigned long rds_ib_sysctl_max_unsig_wrs = 16;
+static unsigned long rds_ib_sysctl_max_unsig_wr_min = 1;
+static unsigned long rds_ib_sysctl_max_unsig_wr_max = 64;
+
+/*
+ * This sysctl does nothing.
+ *
+ * Backwards compatibility with RDS 3.0 wire protocol
+ * disables initial FC credit exchange.
+ * If it's ever possible to drop 3.0 support,
+ * setting this to 1 and moving init/refill of send/recv
+ * rings from ib_cm_connect_complete() back into ib_setup_qp()
+ * will cause credits to be added before protocol negotiation.
+ */
+unsigned int rds_ib_sysctl_flow_control = 0;
+
+static struct ctl_table rds_ib_sysctl_table[] = {
+	{
+		.procname       = "max_send_wr",
+		.data		= &rds_ib_sysctl_max_send_wr,
+		.maxlen         = sizeof(unsigned long),
+		.mode           = 0644,
+		.proc_handler   = proc_doulongvec_minmax,
+		.extra1		= &rds_ib_sysctl_max_wr_min,
+		.extra2		= &rds_ib_sysctl_max_wr_max,
+	},
+	{
+		.procname       = "max_recv_wr",
+		.data		= &rds_ib_sysctl_max_recv_wr,
+		.maxlen         = sizeof(unsigned long),
+		.mode           = 0644,
+		.proc_handler   = proc_doulongvec_minmax,
+		.extra1		= &rds_ib_sysctl_max_wr_min,
+		.extra2		= &rds_ib_sysctl_max_wr_max,
+	},
+	{
+		.procname       = "max_unsignaled_wr",
+		.data		= &rds_ib_sysctl_max_unsig_wrs,
+		.maxlen         = sizeof(unsigned long),
+		.mode           = 0644,
+		.proc_handler   = proc_doulongvec_minmax,
+		.extra1		= &rds_ib_sysctl_max_unsig_wr_min,
+		.extra2		= &rds_ib_sysctl_max_unsig_wr_max,
+	},
+	{
+		.procname       = "max_recv_allocation",
+		.data		= &rds_ib_sysctl_max_recv_allocation,
+		.maxlen         = sizeof(unsigned long),
+		.mode           = 0644,
+		.proc_handler   = proc_doulongvec_minmax,
+	},
+	{
+		.procname	= "flow_control",
+		.data		= &rds_ib_sysctl_flow_control,
+		.maxlen		= sizeof(rds_ib_sysctl_flow_control),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{ }
+};
+
+void rds_ib_sysctl_exit(void)
+{
+	if (rds_ib_sysctl_hdr)
+		unregister_net_sysctl_table(rds_ib_sysctl_hdr);
+}
+
+int rds_ib_sysctl_init(void)
+{
+	rds_ib_sysctl_hdr = register_net_sysctl(&init_net, "net/rds/ib", rds_ib_sysctl_table);
+	if (!rds_ib_sysctl_hdr)
+		return -ENOMEM;
+	return 0;
+}
diff --git a/net/rds/info.c b/net/rds/info.c
new file mode 100644
index 0000000..9a6b4f6
--- /dev/null
+++ b/net/rds/info.c
@@ -0,0 +1,243 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/percpu.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/proc_fs.h>
+#include <linux/export.h>
+
+#include "rds.h"
+
+/*
+ * This file implements a getsockopt() call which copies a set of fixed
+ * sized structs into a user-specified buffer as a means of providing
+ * read-only information about RDS.
+ *
+ * For a given information source there are a given number of fixed sized
+ * structs at a given time.  The structs are only copied if the user-specified
+ * buffer is big enough.  The destination pages that make up the buffer
+ * are pinned for the duration of the copy.
+ *
+ * This gives us the following benefits:
+ *
+ * - simple implementation, no copy "position" across multiple calls
+ * - consistent snapshot of an info source
+ * - atomic copy works well with whatever locking info source has
+ * - one portable tool to get rds info across implementations
+ * - long-lived tool can get info without allocating
+ *
+ * at the following costs:
+ *
+ * - info source copy must be pinned, may be "large"
+ */
+
+struct rds_info_iterator {
+	struct page **pages;
+	void *addr;
+	unsigned long offset;
+};
+
+static DEFINE_SPINLOCK(rds_info_lock);
+static rds_info_func rds_info_funcs[RDS_INFO_LAST - RDS_INFO_FIRST + 1];
+
+void rds_info_register_func(int optname, rds_info_func func)
+{
+	int offset = optname - RDS_INFO_FIRST;
+
+	BUG_ON(optname < RDS_INFO_FIRST || optname > RDS_INFO_LAST);
+
+	spin_lock(&rds_info_lock);
+	BUG_ON(rds_info_funcs[offset]);
+	rds_info_funcs[offset] = func;
+	spin_unlock(&rds_info_lock);
+}
+EXPORT_SYMBOL_GPL(rds_info_register_func);
+
+void rds_info_deregister_func(int optname, rds_info_func func)
+{
+	int offset = optname - RDS_INFO_FIRST;
+
+	BUG_ON(optname < RDS_INFO_FIRST || optname > RDS_INFO_LAST);
+
+	spin_lock(&rds_info_lock);
+	BUG_ON(rds_info_funcs[offset] != func);
+	rds_info_funcs[offset] = NULL;
+	spin_unlock(&rds_info_lock);
+}
+EXPORT_SYMBOL_GPL(rds_info_deregister_func);
+
+/*
+ * Typically we hold an atomic kmap across multiple rds_info_copy() calls
+ * because the kmap is so expensive.  This must be called before using blocking
+ * operations while holding the mapping and as the iterator is torn down.
+ */
+void rds_info_iter_unmap(struct rds_info_iterator *iter)
+{
+	if (iter->addr) {
+		kunmap_atomic(iter->addr);
+		iter->addr = NULL;
+	}
+}
+
+/*
+ * get_user_pages() called flush_dcache_page() on the pages for us.
+ */
+void rds_info_copy(struct rds_info_iterator *iter, void *data,
+		   unsigned long bytes)
+{
+	unsigned long this;
+
+	while (bytes) {
+		if (!iter->addr)
+			iter->addr = kmap_atomic(*iter->pages);
+
+		this = min(bytes, PAGE_SIZE - iter->offset);
+
+		rdsdebug("page %p addr %p offset %lu this %lu data %p "
+			  "bytes %lu\n", *iter->pages, iter->addr,
+			  iter->offset, this, data, bytes);
+
+		memcpy(iter->addr + iter->offset, data, this);
+
+		data += this;
+		bytes -= this;
+		iter->offset += this;
+
+		if (iter->offset == PAGE_SIZE) {
+			kunmap_atomic(iter->addr);
+			iter->addr = NULL;
+			iter->offset = 0;
+			iter->pages++;
+		}
+	}
+}
+EXPORT_SYMBOL_GPL(rds_info_copy);
+
+/*
+ * @optval points to the userspace buffer that the information snapshot
+ * will be copied into.
+ *
+ * @optlen on input is the size of the buffer in userspace.  @optlen
+ * on output is the size of the requested snapshot in bytes.
+ *
+ * This function returns -errno if there is a failure, particularly -ENOSPC
+ * if the given userspace buffer was not large enough to fit the snapshot.
+ * On success it returns the positive number of bytes of each array element
+ * in the snapshot.
+ */
+int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval,
+			int __user *optlen)
+{
+	struct rds_info_iterator iter;
+	struct rds_info_lengths lens;
+	unsigned long nr_pages = 0;
+	unsigned long start;
+	unsigned long i;
+	rds_info_func func;
+	struct page **pages = NULL;
+	int ret;
+	int len;
+	int total;
+
+	if (get_user(len, optlen)) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	/* check for all kinds of wrapping and the like */
+	start = (unsigned long)optval;
+	if (len < 0 || len + PAGE_SIZE - 1 < len || start + len < start) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* a 0 len call is just trying to probe its length */
+	if (len == 0)
+		goto call_func;
+
+	nr_pages = (PAGE_ALIGN(start + len) - (start & PAGE_MASK))
+			>> PAGE_SHIFT;
+
+	pages = kmalloc(nr_pages * sizeof(struct page *), GFP_KERNEL);
+	if (!pages) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	ret = get_user_pages_fast(start, nr_pages, 1, pages);
+	if (ret != nr_pages) {
+		if (ret > 0)
+			nr_pages = ret;
+		else
+			nr_pages = 0;
+		ret = -EAGAIN; /* XXX ? */
+		goto out;
+	}
+
+	rdsdebug("len %d nr_pages %lu\n", len, nr_pages);
+
+call_func:
+	func = rds_info_funcs[optname - RDS_INFO_FIRST];
+	if (!func) {
+		ret = -ENOPROTOOPT;
+		goto out;
+	}
+
+	iter.pages = pages;
+	iter.addr = NULL;
+	iter.offset = start & (PAGE_SIZE - 1);
+
+	func(sock, len, &iter, &lens);
+	BUG_ON(lens.each == 0);
+
+	total = lens.nr * lens.each;
+
+	rds_info_iter_unmap(&iter);
+
+	if (total > len) {
+		len = total;
+		ret = -ENOSPC;
+	} else {
+		len = total;
+		ret = lens.each;
+	}
+
+	if (put_user(len, optlen))
+		ret = -EFAULT;
+
+out:
+	for (i = 0; pages && i < nr_pages; i++)
+		put_page(pages[i]);
+	kfree(pages);
+
+	return ret;
+}
diff --git a/net/rds/info.h b/net/rds/info.h
new file mode 100644
index 0000000..b6c052c
--- /dev/null
+++ b/net/rds/info.h
@@ -0,0 +1,30 @@
+#ifndef _RDS_INFO_H
+#define _RDS_INFO_H
+
+struct rds_info_lengths {
+	unsigned int	nr;
+	unsigned int	each;
+};
+
+struct rds_info_iterator;
+
+/*
+ * These functions must fill in the fields of @lens to reflect the size
+ * of the available info source.  If the snapshot fits in @len then it
+ * should be copied using @iter.  The caller will deduce if it was copied
+ * or not by comparing the lengths.
+ */
+typedef void (*rds_info_func)(struct socket *sock, unsigned int len,
+			      struct rds_info_iterator *iter,
+			      struct rds_info_lengths *lens);
+
+void rds_info_register_func(int optname, rds_info_func func);
+void rds_info_deregister_func(int optname, rds_info_func func);
+int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval,
+			int __user *optlen);
+void rds_info_copy(struct rds_info_iterator *iter, void *data,
+		   unsigned long bytes);
+void rds_info_iter_unmap(struct rds_info_iterator *iter);
+
+
+#endif
diff --git a/net/rds/iw.c b/net/rds/iw.c
new file mode 100644
index 0000000..5899356
--- /dev/null
+++ b/net/rds/iw.c
@@ -0,0 +1,329 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/in.h>
+#include <linux/if.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/if_arp.h>
+#include <linux/delay.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+
+#include "rds.h"
+#include "iw.h"
+
+unsigned int fastreg_pool_size = RDS_FASTREG_POOL_SIZE;
+unsigned int fastreg_message_size = RDS_FASTREG_SIZE + 1; /* +1 allows for unaligned MRs */
+
+module_param(fastreg_pool_size, int, 0444);
+MODULE_PARM_DESC(fastreg_pool_size, " Max number of fastreg MRs per device");
+module_param(fastreg_message_size, int, 0444);
+MODULE_PARM_DESC(fastreg_message_size, " Max size of a RDMA transfer (fastreg MRs)");
+
+struct list_head rds_iw_devices;
+
+/* NOTE: if also grabbing iwdev lock, grab this first */
+DEFINE_SPINLOCK(iw_nodev_conns_lock);
+LIST_HEAD(iw_nodev_conns);
+
+static void rds_iw_add_one(struct ib_device *device)
+{
+	struct rds_iw_device *rds_iwdev;
+	struct ib_device_attr *dev_attr;
+
+	/* Only handle iwarp devices */
+	if (device->node_type != RDMA_NODE_RNIC)
+		return;
+
+	dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL);
+	if (!dev_attr)
+		return;
+
+	if (ib_query_device(device, dev_attr)) {
+		rdsdebug("Query device failed for %s\n", device->name);
+		goto free_attr;
+	}
+
+	rds_iwdev = kmalloc(sizeof *rds_iwdev, GFP_KERNEL);
+	if (!rds_iwdev)
+		goto free_attr;
+
+	spin_lock_init(&rds_iwdev->spinlock);
+
+	rds_iwdev->dma_local_lkey = !!(dev_attr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY);
+	rds_iwdev->max_wrs = dev_attr->max_qp_wr;
+	rds_iwdev->max_sge = min(dev_attr->max_sge, RDS_IW_MAX_SGE);
+
+	rds_iwdev->dev = device;
+	rds_iwdev->pd = ib_alloc_pd(device);
+	if (IS_ERR(rds_iwdev->pd))
+		goto free_dev;
+
+	if (!rds_iwdev->dma_local_lkey) {
+		rds_iwdev->mr = ib_get_dma_mr(rds_iwdev->pd,
+					IB_ACCESS_REMOTE_READ |
+					IB_ACCESS_REMOTE_WRITE |
+					IB_ACCESS_LOCAL_WRITE);
+		if (IS_ERR(rds_iwdev->mr))
+			goto err_pd;
+	} else
+		rds_iwdev->mr = NULL;
+
+	rds_iwdev->mr_pool = rds_iw_create_mr_pool(rds_iwdev);
+	if (IS_ERR(rds_iwdev->mr_pool)) {
+		rds_iwdev->mr_pool = NULL;
+		goto err_mr;
+	}
+
+	INIT_LIST_HEAD(&rds_iwdev->cm_id_list);
+	INIT_LIST_HEAD(&rds_iwdev->conn_list);
+	list_add_tail(&rds_iwdev->list, &rds_iw_devices);
+
+	ib_set_client_data(device, &rds_iw_client, rds_iwdev);
+
+	goto free_attr;
+
+err_mr:
+	if (rds_iwdev->mr)
+		ib_dereg_mr(rds_iwdev->mr);
+err_pd:
+	ib_dealloc_pd(rds_iwdev->pd);
+free_dev:
+	kfree(rds_iwdev);
+free_attr:
+	kfree(dev_attr);
+}
+
+static void rds_iw_remove_one(struct ib_device *device)
+{
+	struct rds_iw_device *rds_iwdev;
+	struct rds_iw_cm_id *i_cm_id, *next;
+
+	rds_iwdev = ib_get_client_data(device, &rds_iw_client);
+	if (!rds_iwdev)
+		return;
+
+	spin_lock_irq(&rds_iwdev->spinlock);
+	list_for_each_entry_safe(i_cm_id, next, &rds_iwdev->cm_id_list, list) {
+		list_del(&i_cm_id->list);
+		kfree(i_cm_id);
+	}
+	spin_unlock_irq(&rds_iwdev->spinlock);
+
+	rds_iw_destroy_conns(rds_iwdev);
+
+	if (rds_iwdev->mr_pool)
+		rds_iw_destroy_mr_pool(rds_iwdev->mr_pool);
+
+	if (rds_iwdev->mr)
+		ib_dereg_mr(rds_iwdev->mr);
+
+	while (ib_dealloc_pd(rds_iwdev->pd)) {
+		rdsdebug("Failed to dealloc pd %p\n", rds_iwdev->pd);
+		msleep(1);
+	}
+
+	list_del(&rds_iwdev->list);
+	kfree(rds_iwdev);
+}
+
+struct ib_client rds_iw_client = {
+	.name   = "rds_iw",
+	.add    = rds_iw_add_one,
+	.remove = rds_iw_remove_one
+};
+
+static int rds_iw_conn_info_visitor(struct rds_connection *conn,
+				    void *buffer)
+{
+	struct rds_info_rdma_connection *iinfo = buffer;
+	struct rds_iw_connection *ic;
+
+	/* We will only ever look at IB transports */
+	if (conn->c_trans != &rds_iw_transport)
+		return 0;
+
+	iinfo->src_addr = conn->c_laddr;
+	iinfo->dst_addr = conn->c_faddr;
+
+	memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid));
+	memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid));
+	if (rds_conn_state(conn) == RDS_CONN_UP) {
+		struct rds_iw_device *rds_iwdev;
+		struct rdma_dev_addr *dev_addr;
+
+		ic = conn->c_transport_data;
+		dev_addr = &ic->i_cm_id->route.addr.dev_addr;
+
+		rdma_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid);
+		rdma_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid);
+
+		rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client);
+		iinfo->max_send_wr = ic->i_send_ring.w_nr;
+		iinfo->max_recv_wr = ic->i_recv_ring.w_nr;
+		iinfo->max_send_sge = rds_iwdev->max_sge;
+		rds_iw_get_mr_info(rds_iwdev, iinfo);
+	}
+	return 1;
+}
+
+static void rds_iw_ic_info(struct socket *sock, unsigned int len,
+			   struct rds_info_iterator *iter,
+			   struct rds_info_lengths *lens)
+{
+	rds_for_each_conn_info(sock, len, iter, lens,
+				rds_iw_conn_info_visitor,
+				sizeof(struct rds_info_rdma_connection));
+}
+
+
+/*
+ * Early RDS/IB was built to only bind to an address if there is an IPoIB
+ * device with that address set.
+ *
+ * If it were me, I'd advocate for something more flexible.  Sending and
+ * receiving should be device-agnostic.  Transports would try and maintain
+ * connections between peers who have messages queued.  Userspace would be
+ * allowed to influence which paths have priority.  We could call userspace
+ * asserting this policy "routing".
+ */
+static int rds_iw_laddr_check(__be32 addr)
+{
+	int ret;
+	struct rdma_cm_id *cm_id;
+	struct sockaddr_in sin;
+
+	/* Create a CMA ID and try to bind it. This catches both
+	 * IB and iWARP capable NICs.
+	 */
+	cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP, IB_QPT_RC);
+	if (IS_ERR(cm_id))
+		return PTR_ERR(cm_id);
+
+	memset(&sin, 0, sizeof(sin));
+	sin.sin_family = AF_INET;
+	sin.sin_addr.s_addr = addr;
+
+	/* rdma_bind_addr will only succeed for IB & iWARP devices */
+	ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
+	/* due to this, we will claim to support IB devices unless we
+	   check node_type. */
+	if (ret || !cm_id->device ||
+	    cm_id->device->node_type != RDMA_NODE_RNIC)
+		ret = -EADDRNOTAVAIL;
+
+	rdsdebug("addr %pI4 ret %d node type %d\n",
+		&addr, ret,
+		cm_id->device ? cm_id->device->node_type : -1);
+
+	rdma_destroy_id(cm_id);
+
+	return ret;
+}
+
+void rds_iw_exit(void)
+{
+	rds_info_deregister_func(RDS_INFO_IWARP_CONNECTIONS, rds_iw_ic_info);
+	rds_iw_destroy_nodev_conns();
+	ib_unregister_client(&rds_iw_client);
+	rds_iw_sysctl_exit();
+	rds_iw_recv_exit();
+	rds_trans_unregister(&rds_iw_transport);
+}
+
+struct rds_transport rds_iw_transport = {
+	.laddr_check		= rds_iw_laddr_check,
+	.xmit_complete		= rds_iw_xmit_complete,
+	.xmit			= rds_iw_xmit,
+	.xmit_rdma		= rds_iw_xmit_rdma,
+	.recv			= rds_iw_recv,
+	.conn_alloc		= rds_iw_conn_alloc,
+	.conn_free		= rds_iw_conn_free,
+	.conn_connect		= rds_iw_conn_connect,
+	.conn_shutdown		= rds_iw_conn_shutdown,
+	.inc_copy_to_user	= rds_iw_inc_copy_to_user,
+	.inc_free		= rds_iw_inc_free,
+	.cm_initiate_connect	= rds_iw_cm_initiate_connect,
+	.cm_handle_connect	= rds_iw_cm_handle_connect,
+	.cm_connect_complete	= rds_iw_cm_connect_complete,
+	.stats_info_copy	= rds_iw_stats_info_copy,
+	.exit			= rds_iw_exit,
+	.get_mr			= rds_iw_get_mr,
+	.sync_mr		= rds_iw_sync_mr,
+	.free_mr		= rds_iw_free_mr,
+	.flush_mrs		= rds_iw_flush_mrs,
+	.t_owner		= THIS_MODULE,
+	.t_name			= "iwarp",
+	.t_type			= RDS_TRANS_IWARP,
+	.t_prefer_loopback	= 1,
+};
+
+int rds_iw_init(void)
+{
+	int ret;
+
+	INIT_LIST_HEAD(&rds_iw_devices);
+
+	ret = ib_register_client(&rds_iw_client);
+	if (ret)
+		goto out;
+
+	ret = rds_iw_sysctl_init();
+	if (ret)
+		goto out_ibreg;
+
+	ret = rds_iw_recv_init();
+	if (ret)
+		goto out_sysctl;
+
+	ret = rds_trans_register(&rds_iw_transport);
+	if (ret)
+		goto out_recv;
+
+	rds_info_register_func(RDS_INFO_IWARP_CONNECTIONS, rds_iw_ic_info);
+
+	goto out;
+
+out_recv:
+	rds_iw_recv_exit();
+out_sysctl:
+	rds_iw_sysctl_exit();
+out_ibreg:
+	ib_unregister_client(&rds_iw_client);
+out:
+	return ret;
+}
+
+MODULE_LICENSE("GPL");
+
diff --git a/net/rds/iw.h b/net/rds/iw.h
new file mode 100644
index 0000000..04ce3b1
--- /dev/null
+++ b/net/rds/iw.h
@@ -0,0 +1,396 @@
+#ifndef _RDS_IW_H
+#define _RDS_IW_H
+
+#include <linux/interrupt.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/rdma_cm.h>
+#include "rds.h"
+#include "rdma_transport.h"
+
+#define RDS_FASTREG_SIZE		20
+#define RDS_FASTREG_POOL_SIZE		2048
+
+#define RDS_IW_MAX_SGE			8
+#define RDS_IW_RECV_SGE 		2
+
+#define RDS_IW_DEFAULT_RECV_WR		1024
+#define RDS_IW_DEFAULT_SEND_WR		256
+
+#define RDS_IW_SUPPORTED_PROTOCOLS	0x00000003	/* minor versions supported */
+
+extern struct list_head rds_iw_devices;
+
+/*
+ * IB posts RDS_FRAG_SIZE fragments of pages to the receive queues to
+ * try and minimize the amount of memory tied up both the device and
+ * socket receive queues.
+ */
+/* page offset of the final full frag that fits in the page */
+#define RDS_PAGE_LAST_OFF (((PAGE_SIZE  / RDS_FRAG_SIZE) - 1) * RDS_FRAG_SIZE)
+struct rds_page_frag {
+	struct list_head	f_item;
+	struct page		*f_page;
+	unsigned long		f_offset;
+	dma_addr_t 		f_mapped;
+};
+
+struct rds_iw_incoming {
+	struct list_head	ii_frags;
+	struct rds_incoming	ii_inc;
+};
+
+struct rds_iw_connect_private {
+	/* Add new fields at the end, and don't permute existing fields. */
+	__be32			dp_saddr;
+	__be32			dp_daddr;
+	u8			dp_protocol_major;
+	u8			dp_protocol_minor;
+	__be16			dp_protocol_minor_mask; /* bitmask */
+	__be32			dp_reserved1;
+	__be64			dp_ack_seq;
+	__be32			dp_credit;		/* non-zero enables flow ctl */
+};
+
+struct rds_iw_scatterlist {
+	struct scatterlist	*list;
+	unsigned int		len;
+	int			dma_len;
+	unsigned int		dma_npages;
+	unsigned int		bytes;
+};
+
+struct rds_iw_mapping {
+	spinlock_t		m_lock;	/* protect the mapping struct */
+	struct list_head	m_list;
+	struct rds_iw_mr	*m_mr;
+	uint32_t		m_rkey;
+	struct rds_iw_scatterlist m_sg;
+};
+
+struct rds_iw_send_work {
+	struct rds_message	*s_rm;
+
+	/* We should really put these into a union: */
+	struct rm_rdma_op	*s_op;
+	struct rds_iw_mapping	*s_mapping;
+	struct ib_mr		*s_mr;
+	struct ib_fast_reg_page_list *s_page_list;
+	unsigned char		s_remap_count;
+
+	struct ib_send_wr	s_wr;
+	struct ib_sge		s_sge[RDS_IW_MAX_SGE];
+	unsigned long		s_queued;
+};
+
+struct rds_iw_recv_work {
+	struct rds_iw_incoming 	*r_iwinc;
+	struct rds_page_frag	*r_frag;
+	struct ib_recv_wr	r_wr;
+	struct ib_sge		r_sge[2];
+};
+
+struct rds_iw_work_ring {
+	u32		w_nr;
+	u32		w_alloc_ptr;
+	u32		w_alloc_ctr;
+	u32		w_free_ptr;
+	atomic_t	w_free_ctr;
+};
+
+struct rds_iw_device;
+
+struct rds_iw_connection {
+
+	struct list_head	iw_node;
+	struct rds_iw_device 	*rds_iwdev;
+	struct rds_connection	*conn;
+
+	/* alphabet soup, IBTA style */
+	struct rdma_cm_id	*i_cm_id;
+	struct ib_pd		*i_pd;
+	struct ib_mr		*i_mr;
+	struct ib_cq		*i_send_cq;
+	struct ib_cq		*i_recv_cq;
+
+	/* tx */
+	struct rds_iw_work_ring	i_send_ring;
+	struct rds_message	*i_rm;
+	struct rds_header	*i_send_hdrs;
+	u64			i_send_hdrs_dma;
+	struct rds_iw_send_work *i_sends;
+
+	/* rx */
+	struct tasklet_struct	i_recv_tasklet;
+	struct mutex		i_recv_mutex;
+	struct rds_iw_work_ring	i_recv_ring;
+	struct rds_iw_incoming	*i_iwinc;
+	u32			i_recv_data_rem;
+	struct rds_header	*i_recv_hdrs;
+	u64			i_recv_hdrs_dma;
+	struct rds_iw_recv_work *i_recvs;
+	struct rds_page_frag	i_frag;
+	u64			i_ack_recv;	/* last ACK received */
+
+	/* sending acks */
+	unsigned long		i_ack_flags;
+#ifdef KERNEL_HAS_ATOMIC64
+	atomic64_t		i_ack_next;	/* next ACK to send */
+#else
+	spinlock_t		i_ack_lock;	/* protect i_ack_next */
+	u64			i_ack_next;	/* next ACK to send */
+#endif
+	struct rds_header	*i_ack;
+	struct ib_send_wr	i_ack_wr;
+	struct ib_sge		i_ack_sge;
+	u64			i_ack_dma;
+	unsigned long		i_ack_queued;
+
+	/* Flow control related information
+	 *
+	 * Our algorithm uses a pair variables that we need to access
+	 * atomically - one for the send credits, and one posted
+	 * recv credits we need to transfer to remote.
+	 * Rather than protect them using a slow spinlock, we put both into
+	 * a single atomic_t and update it using cmpxchg
+	 */
+	atomic_t		i_credits;
+
+	/* Protocol version specific information */
+	unsigned int		i_flowctl:1;	/* enable/disable flow ctl */
+	unsigned int		i_dma_local_lkey:1;
+	unsigned int		i_fastreg_posted:1; /* fastreg posted on this connection */
+	/* Batched completions */
+	unsigned int		i_unsignaled_wrs;
+	long			i_unsignaled_bytes;
+};
+
+/* This assumes that atomic_t is at least 32 bits */
+#define IB_GET_SEND_CREDITS(v)	((v) & 0xffff)
+#define IB_GET_POST_CREDITS(v)	((v) >> 16)
+#define IB_SET_SEND_CREDITS(v)	((v) & 0xffff)
+#define IB_SET_POST_CREDITS(v)	((v) << 16)
+
+struct rds_iw_cm_id {
+	struct list_head	list;
+	struct rdma_cm_id	*cm_id;
+};
+
+struct rds_iw_device {
+	struct list_head	list;
+	struct list_head	cm_id_list;
+	struct list_head	conn_list;
+	struct ib_device	*dev;
+	struct ib_pd		*pd;
+	struct ib_mr		*mr;
+	struct rds_iw_mr_pool	*mr_pool;
+	int			max_sge;
+	unsigned int		max_wrs;
+	unsigned int		dma_local_lkey:1;
+	spinlock_t		spinlock;	/* protect the above */
+};
+
+/* bits for i_ack_flags */
+#define IB_ACK_IN_FLIGHT	0
+#define IB_ACK_REQUESTED	1
+
+/* Magic WR_ID for ACKs */
+#define RDS_IW_ACK_WR_ID	((u64)0xffffffffffffffffULL)
+#define RDS_IW_FAST_REG_WR_ID	((u64)0xefefefefefefefefULL)
+#define RDS_IW_LOCAL_INV_WR_ID	((u64)0xdfdfdfdfdfdfdfdfULL)
+
+struct rds_iw_statistics {
+	uint64_t	s_iw_connect_raced;
+	uint64_t	s_iw_listen_closed_stale;
+	uint64_t	s_iw_tx_cq_call;
+	uint64_t	s_iw_tx_cq_event;
+	uint64_t	s_iw_tx_ring_full;
+	uint64_t	s_iw_tx_throttle;
+	uint64_t	s_iw_tx_sg_mapping_failure;
+	uint64_t	s_iw_tx_stalled;
+	uint64_t	s_iw_tx_credit_updates;
+	uint64_t	s_iw_rx_cq_call;
+	uint64_t	s_iw_rx_cq_event;
+	uint64_t	s_iw_rx_ring_empty;
+	uint64_t	s_iw_rx_refill_from_cq;
+	uint64_t	s_iw_rx_refill_from_thread;
+	uint64_t	s_iw_rx_alloc_limit;
+	uint64_t	s_iw_rx_credit_updates;
+	uint64_t	s_iw_ack_sent;
+	uint64_t	s_iw_ack_send_failure;
+	uint64_t	s_iw_ack_send_delayed;
+	uint64_t	s_iw_ack_send_piggybacked;
+	uint64_t	s_iw_ack_received;
+	uint64_t	s_iw_rdma_mr_alloc;
+	uint64_t	s_iw_rdma_mr_free;
+	uint64_t	s_iw_rdma_mr_used;
+	uint64_t	s_iw_rdma_mr_pool_flush;
+	uint64_t	s_iw_rdma_mr_pool_wait;
+	uint64_t	s_iw_rdma_mr_pool_depleted;
+};
+
+extern struct workqueue_struct *rds_iw_wq;
+
+/*
+ * Fake ib_dma_sync_sg_for_{cpu,device} as long as ib_verbs.h
+ * doesn't define it.
+ */
+static inline void rds_iw_dma_sync_sg_for_cpu(struct ib_device *dev,
+		struct scatterlist *sg, unsigned int sg_dma_len, int direction)
+{
+	unsigned int i;
+
+	for (i = 0; i < sg_dma_len; ++i) {
+		ib_dma_sync_single_for_cpu(dev,
+				ib_sg_dma_address(dev, &sg[i]),
+				ib_sg_dma_len(dev, &sg[i]),
+				direction);
+	}
+}
+#define ib_dma_sync_sg_for_cpu	rds_iw_dma_sync_sg_for_cpu
+
+static inline void rds_iw_dma_sync_sg_for_device(struct ib_device *dev,
+		struct scatterlist *sg, unsigned int sg_dma_len, int direction)
+{
+	unsigned int i;
+
+	for (i = 0; i < sg_dma_len; ++i) {
+		ib_dma_sync_single_for_device(dev,
+				ib_sg_dma_address(dev, &sg[i]),
+				ib_sg_dma_len(dev, &sg[i]),
+				direction);
+	}
+}
+#define ib_dma_sync_sg_for_device	rds_iw_dma_sync_sg_for_device
+
+static inline u32 rds_iw_local_dma_lkey(struct rds_iw_connection *ic)
+{
+	return ic->i_dma_local_lkey ? ic->i_cm_id->device->local_dma_lkey : ic->i_mr->lkey;
+}
+
+/* ib.c */
+extern struct rds_transport rds_iw_transport;
+extern struct ib_client rds_iw_client;
+
+extern unsigned int fastreg_pool_size;
+extern unsigned int fastreg_message_size;
+
+extern spinlock_t iw_nodev_conns_lock;
+extern struct list_head iw_nodev_conns;
+
+/* ib_cm.c */
+int rds_iw_conn_alloc(struct rds_connection *conn, gfp_t gfp);
+void rds_iw_conn_free(void *arg);
+int rds_iw_conn_connect(struct rds_connection *conn);
+void rds_iw_conn_shutdown(struct rds_connection *conn);
+void rds_iw_state_change(struct sock *sk);
+int rds_iw_listen_init(void);
+void rds_iw_listen_stop(void);
+void __rds_iw_conn_error(struct rds_connection *conn, const char *, ...);
+int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id,
+			     struct rdma_cm_event *event);
+int rds_iw_cm_initiate_connect(struct rdma_cm_id *cm_id);
+void rds_iw_cm_connect_complete(struct rds_connection *conn,
+				struct rdma_cm_event *event);
+
+
+#define rds_iw_conn_error(conn, fmt...) \
+	__rds_iw_conn_error(conn, KERN_WARNING "RDS/IW: " fmt)
+
+/* ib_rdma.c */
+int rds_iw_update_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id);
+void rds_iw_add_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn);
+void rds_iw_remove_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn);
+void __rds_iw_destroy_conns(struct list_head *list, spinlock_t *list_lock);
+static inline void rds_iw_destroy_nodev_conns(void)
+{
+	__rds_iw_destroy_conns(&iw_nodev_conns, &iw_nodev_conns_lock);
+}
+static inline void rds_iw_destroy_conns(struct rds_iw_device *rds_iwdev)
+{
+	__rds_iw_destroy_conns(&rds_iwdev->conn_list, &rds_iwdev->spinlock);
+}
+struct rds_iw_mr_pool *rds_iw_create_mr_pool(struct rds_iw_device *);
+void rds_iw_get_mr_info(struct rds_iw_device *rds_iwdev, struct rds_info_rdma_connection *iinfo);
+void rds_iw_destroy_mr_pool(struct rds_iw_mr_pool *);
+void *rds_iw_get_mr(struct scatterlist *sg, unsigned long nents,
+		    struct rds_sock *rs, u32 *key_ret);
+void rds_iw_sync_mr(void *trans_private, int dir);
+void rds_iw_free_mr(void *trans_private, int invalidate);
+void rds_iw_flush_mrs(void);
+
+/* ib_recv.c */
+int rds_iw_recv_init(void);
+void rds_iw_recv_exit(void);
+int rds_iw_recv(struct rds_connection *conn);
+int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
+		       gfp_t page_gfp, int prefill);
+void rds_iw_inc_free(struct rds_incoming *inc);
+int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov,
+			     size_t size);
+void rds_iw_recv_cq_comp_handler(struct ib_cq *cq, void *context);
+void rds_iw_recv_tasklet_fn(unsigned long data);
+void rds_iw_recv_init_ring(struct rds_iw_connection *ic);
+void rds_iw_recv_clear_ring(struct rds_iw_connection *ic);
+void rds_iw_recv_init_ack(struct rds_iw_connection *ic);
+void rds_iw_attempt_ack(struct rds_iw_connection *ic);
+void rds_iw_ack_send_complete(struct rds_iw_connection *ic);
+u64 rds_iw_piggyb_ack(struct rds_iw_connection *ic);
+
+/* ib_ring.c */
+void rds_iw_ring_init(struct rds_iw_work_ring *ring, u32 nr);
+void rds_iw_ring_resize(struct rds_iw_work_ring *ring, u32 nr);
+u32 rds_iw_ring_alloc(struct rds_iw_work_ring *ring, u32 val, u32 *pos);
+void rds_iw_ring_free(struct rds_iw_work_ring *ring, u32 val);
+void rds_iw_ring_unalloc(struct rds_iw_work_ring *ring, u32 val);
+int rds_iw_ring_empty(struct rds_iw_work_ring *ring);
+int rds_iw_ring_low(struct rds_iw_work_ring *ring);
+u32 rds_iw_ring_oldest(struct rds_iw_work_ring *ring);
+u32 rds_iw_ring_completed(struct rds_iw_work_ring *ring, u32 wr_id, u32 oldest);
+extern wait_queue_head_t rds_iw_ring_empty_wait;
+
+/* ib_send.c */
+void rds_iw_xmit_complete(struct rds_connection *conn);
+int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
+		unsigned int hdr_off, unsigned int sg, unsigned int off);
+void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context);
+void rds_iw_send_init_ring(struct rds_iw_connection *ic);
+void rds_iw_send_clear_ring(struct rds_iw_connection *ic);
+int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op);
+void rds_iw_send_add_credits(struct rds_connection *conn, unsigned int credits);
+void rds_iw_advertise_credits(struct rds_connection *conn, unsigned int posted);
+int rds_iw_send_grab_credits(struct rds_iw_connection *ic, u32 wanted,
+			     u32 *adv_credits, int need_posted, int max_posted);
+
+/* ib_stats.c */
+DECLARE_PER_CPU(struct rds_iw_statistics, rds_iw_stats);
+#define rds_iw_stats_inc(member) rds_stats_inc_which(rds_iw_stats, member)
+unsigned int rds_iw_stats_info_copy(struct rds_info_iterator *iter,
+				    unsigned int avail);
+
+/* ib_sysctl.c */
+int rds_iw_sysctl_init(void);
+void rds_iw_sysctl_exit(void);
+extern unsigned long rds_iw_sysctl_max_send_wr;
+extern unsigned long rds_iw_sysctl_max_recv_wr;
+extern unsigned long rds_iw_sysctl_max_unsig_wrs;
+extern unsigned long rds_iw_sysctl_max_unsig_bytes;
+extern unsigned long rds_iw_sysctl_max_recv_allocation;
+extern unsigned int rds_iw_sysctl_flow_control;
+
+/*
+ * Helper functions for getting/setting the header and data SGEs in
+ * RDS packets (not RDMA)
+ */
+static inline struct ib_sge *
+rds_iw_header_sge(struct rds_iw_connection *ic, struct ib_sge *sge)
+{
+	return &sge[0];
+}
+
+static inline struct ib_sge *
+rds_iw_data_sge(struct rds_iw_connection *ic, struct ib_sge *sge)
+{
+	return &sge[1];
+}
+
+#endif
diff --git a/net/rds/iw_cm.c b/net/rds/iw_cm.c
new file mode 100644
index 0000000..a91e1db
--- /dev/null
+++ b/net/rds/iw_cm.c
@@ -0,0 +1,765 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/in.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/ratelimit.h>
+
+#include "rds.h"
+#include "iw.h"
+
+/*
+ * Set the selected protocol version
+ */
+static void rds_iw_set_protocol(struct rds_connection *conn, unsigned int version)
+{
+	conn->c_version = version;
+}
+
+/*
+ * Set up flow control
+ */
+static void rds_iw_set_flow_control(struct rds_connection *conn, u32 credits)
+{
+	struct rds_iw_connection *ic = conn->c_transport_data;
+
+	if (rds_iw_sysctl_flow_control && credits != 0) {
+		/* We're doing flow control */
+		ic->i_flowctl = 1;
+		rds_iw_send_add_credits(conn, credits);
+	} else {
+		ic->i_flowctl = 0;
+	}
+}
+
+/*
+ * Connection established.
+ * We get here for both outgoing and incoming connection.
+ */
+void rds_iw_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_event *event)
+{
+	const struct rds_iw_connect_private *dp = NULL;
+	struct rds_iw_connection *ic = conn->c_transport_data;
+	struct rds_iw_device *rds_iwdev;
+	int err;
+
+	if (event->param.conn.private_data_len) {
+		dp = event->param.conn.private_data;
+
+		rds_iw_set_protocol(conn,
+				RDS_PROTOCOL(dp->dp_protocol_major,
+					dp->dp_protocol_minor));
+		rds_iw_set_flow_control(conn, be32_to_cpu(dp->dp_credit));
+	}
+
+	/* update ib_device with this local ipaddr & conn */
+	rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client);
+	err = rds_iw_update_cm_id(rds_iwdev, ic->i_cm_id);
+	if (err)
+		printk(KERN_ERR "rds_iw_update_ipaddr failed (%d)\n", err);
+	rds_iw_add_conn(rds_iwdev, conn);
+
+	/* If the peer gave us the last packet it saw, process this as if
+	 * we had received a regular ACK. */
+	if (dp && dp->dp_ack_seq)
+		rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL);
+
+	printk(KERN_NOTICE "RDS/IW: connected to %pI4<->%pI4 version %u.%u%s\n",
+			&conn->c_laddr, &conn->c_faddr,
+			RDS_PROTOCOL_MAJOR(conn->c_version),
+			RDS_PROTOCOL_MINOR(conn->c_version),
+			ic->i_flowctl ? ", flow control" : "");
+
+	rds_connect_complete(conn);
+}
+
+static void rds_iw_cm_fill_conn_param(struct rds_connection *conn,
+			struct rdma_conn_param *conn_param,
+			struct rds_iw_connect_private *dp,
+			u32 protocol_version)
+{
+	struct rds_iw_connection *ic = conn->c_transport_data;
+
+	memset(conn_param, 0, sizeof(struct rdma_conn_param));
+	/* XXX tune these? */
+	conn_param->responder_resources = 1;
+	conn_param->initiator_depth = 1;
+
+	if (dp) {
+		memset(dp, 0, sizeof(*dp));
+		dp->dp_saddr = conn->c_laddr;
+		dp->dp_daddr = conn->c_faddr;
+		dp->dp_protocol_major = RDS_PROTOCOL_MAJOR(protocol_version);
+		dp->dp_protocol_minor = RDS_PROTOCOL_MINOR(protocol_version);
+		dp->dp_protocol_minor_mask = cpu_to_be16(RDS_IW_SUPPORTED_PROTOCOLS);
+		dp->dp_ack_seq = rds_iw_piggyb_ack(ic);
+
+		/* Advertise flow control */
+		if (ic->i_flowctl) {
+			unsigned int credits;
+
+			credits = IB_GET_POST_CREDITS(atomic_read(&ic->i_credits));
+			dp->dp_credit = cpu_to_be32(credits);
+			atomic_sub(IB_SET_POST_CREDITS(credits), &ic->i_credits);
+		}
+
+		conn_param->private_data = dp;
+		conn_param->private_data_len = sizeof(*dp);
+	}
+}
+
+static void rds_iw_cq_event_handler(struct ib_event *event, void *data)
+{
+	rdsdebug("event %u data %p\n", event->event, data);
+}
+
+static void rds_iw_qp_event_handler(struct ib_event *event, void *data)
+{
+	struct rds_connection *conn = data;
+	struct rds_iw_connection *ic = conn->c_transport_data;
+
+	rdsdebug("conn %p ic %p event %u\n", conn, ic, event->event);
+
+	switch (event->event) {
+	case IB_EVENT_COMM_EST:
+		rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST);
+		break;
+	case IB_EVENT_QP_REQ_ERR:
+	case IB_EVENT_QP_FATAL:
+	default:
+		rdsdebug("Fatal QP Event %u "
+			"- connection %pI4->%pI4, reconnecting\n",
+			event->event, &conn->c_laddr,
+			&conn->c_faddr);
+		rds_conn_drop(conn);
+		break;
+	}
+}
+
+/*
+ * Create a QP
+ */
+static int rds_iw_init_qp_attrs(struct ib_qp_init_attr *attr,
+		struct rds_iw_device *rds_iwdev,
+		struct rds_iw_work_ring *send_ring,
+		void (*send_cq_handler)(struct ib_cq *, void *),
+		struct rds_iw_work_ring *recv_ring,
+		void (*recv_cq_handler)(struct ib_cq *, void *),
+		void *context)
+{
+	struct ib_device *dev = rds_iwdev->dev;
+	unsigned int send_size, recv_size;
+	int ret;
+
+	/* The offset of 1 is to accommodate the additional ACK WR. */
+	send_size = min_t(unsigned int, rds_iwdev->max_wrs, rds_iw_sysctl_max_send_wr + 1);
+	recv_size = min_t(unsigned int, rds_iwdev->max_wrs, rds_iw_sysctl_max_recv_wr + 1);
+	rds_iw_ring_resize(send_ring, send_size - 1);
+	rds_iw_ring_resize(recv_ring, recv_size - 1);
+
+	memset(attr, 0, sizeof(*attr));
+	attr->event_handler = rds_iw_qp_event_handler;
+	attr->qp_context = context;
+	attr->cap.max_send_wr = send_size;
+	attr->cap.max_recv_wr = recv_size;
+	attr->cap.max_send_sge = rds_iwdev->max_sge;
+	attr->cap.max_recv_sge = RDS_IW_RECV_SGE;
+	attr->sq_sig_type = IB_SIGNAL_REQ_WR;
+	attr->qp_type = IB_QPT_RC;
+
+	attr->send_cq = ib_create_cq(dev, send_cq_handler,
+				     rds_iw_cq_event_handler,
+				     context, send_size, 0);
+	if (IS_ERR(attr->send_cq)) {
+		ret = PTR_ERR(attr->send_cq);
+		attr->send_cq = NULL;
+		rdsdebug("ib_create_cq send failed: %d\n", ret);
+		goto out;
+	}
+
+	attr->recv_cq = ib_create_cq(dev, recv_cq_handler,
+				     rds_iw_cq_event_handler,
+				     context, recv_size, 0);
+	if (IS_ERR(attr->recv_cq)) {
+		ret = PTR_ERR(attr->recv_cq);
+		attr->recv_cq = NULL;
+		rdsdebug("ib_create_cq send failed: %d\n", ret);
+		goto out;
+	}
+
+	ret = ib_req_notify_cq(attr->send_cq, IB_CQ_NEXT_COMP);
+	if (ret) {
+		rdsdebug("ib_req_notify_cq send failed: %d\n", ret);
+		goto out;
+	}
+
+	ret = ib_req_notify_cq(attr->recv_cq, IB_CQ_SOLICITED);
+	if (ret) {
+		rdsdebug("ib_req_notify_cq recv failed: %d\n", ret);
+		goto out;
+	}
+
+out:
+	if (ret) {
+		if (attr->send_cq)
+			ib_destroy_cq(attr->send_cq);
+		if (attr->recv_cq)
+			ib_destroy_cq(attr->recv_cq);
+	}
+	return ret;
+}
+
+/*
+ * This needs to be very careful to not leave IS_ERR pointers around for
+ * cleanup to trip over.
+ */
+static int rds_iw_setup_qp(struct rds_connection *conn)
+{
+	struct rds_iw_connection *ic = conn->c_transport_data;
+	struct ib_device *dev = ic->i_cm_id->device;
+	struct ib_qp_init_attr attr;
+	struct rds_iw_device *rds_iwdev;
+	int ret;
+
+	/* rds_iw_add_one creates a rds_iw_device object per IB device,
+	 * and allocates a protection domain, memory range and MR pool
+	 * for each.  If that fails for any reason, it will not register
+	 * the rds_iwdev at all.
+	 */
+	rds_iwdev = ib_get_client_data(dev, &rds_iw_client);
+	if (!rds_iwdev) {
+		printk_ratelimited(KERN_NOTICE "RDS/IW: No client_data for device %s\n",
+					dev->name);
+		return -EOPNOTSUPP;
+	}
+
+	/* Protection domain and memory range */
+	ic->i_pd = rds_iwdev->pd;
+	ic->i_mr = rds_iwdev->mr;
+
+	ret = rds_iw_init_qp_attrs(&attr, rds_iwdev,
+			&ic->i_send_ring, rds_iw_send_cq_comp_handler,
+			&ic->i_recv_ring, rds_iw_recv_cq_comp_handler,
+			conn);
+	if (ret < 0)
+		goto out;
+
+	ic->i_send_cq = attr.send_cq;
+	ic->i_recv_cq = attr.recv_cq;
+
+	/*
+	 * XXX this can fail if max_*_wr is too large?  Are we supposed
+	 * to back off until we get a value that the hardware can support?
+	 */
+	ret = rdma_create_qp(ic->i_cm_id, ic->i_pd, &attr);
+	if (ret) {
+		rdsdebug("rdma_create_qp failed: %d\n", ret);
+		goto out;
+	}
+
+	ic->i_send_hdrs = ib_dma_alloc_coherent(dev,
+					   ic->i_send_ring.w_nr *
+						sizeof(struct rds_header),
+					   &ic->i_send_hdrs_dma, GFP_KERNEL);
+	if (!ic->i_send_hdrs) {
+		ret = -ENOMEM;
+		rdsdebug("ib_dma_alloc_coherent send failed\n");
+		goto out;
+	}
+
+	ic->i_recv_hdrs = ib_dma_alloc_coherent(dev,
+					   ic->i_recv_ring.w_nr *
+						sizeof(struct rds_header),
+					   &ic->i_recv_hdrs_dma, GFP_KERNEL);
+	if (!ic->i_recv_hdrs) {
+		ret = -ENOMEM;
+		rdsdebug("ib_dma_alloc_coherent recv failed\n");
+		goto out;
+	}
+
+	ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header),
+				       &ic->i_ack_dma, GFP_KERNEL);
+	if (!ic->i_ack) {
+		ret = -ENOMEM;
+		rdsdebug("ib_dma_alloc_coherent ack failed\n");
+		goto out;
+	}
+
+	ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_iw_send_work));
+	if (!ic->i_sends) {
+		ret = -ENOMEM;
+		rdsdebug("send allocation failed\n");
+		goto out;
+	}
+	rds_iw_send_init_ring(ic);
+
+	ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_iw_recv_work));
+	if (!ic->i_recvs) {
+		ret = -ENOMEM;
+		rdsdebug("recv allocation failed\n");
+		goto out;
+	}
+
+	rds_iw_recv_init_ring(ic);
+	rds_iw_recv_init_ack(ic);
+
+	/* Post receive buffers - as a side effect, this will update
+	 * the posted credit count. */
+	rds_iw_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 1);
+
+	rdsdebug("conn %p pd %p mr %p cq %p %p\n", conn, ic->i_pd, ic->i_mr,
+		 ic->i_send_cq, ic->i_recv_cq);
+
+out:
+	return ret;
+}
+
+static u32 rds_iw_protocol_compatible(const struct rds_iw_connect_private *dp)
+{
+	u16 common;
+	u32 version = 0;
+
+	/* rdma_cm private data is odd - when there is any private data in the
+	 * request, we will be given a pretty large buffer without telling us the
+	 * original size. The only way to tell the difference is by looking at
+	 * the contents, which are initialized to zero.
+	 * If the protocol version fields aren't set, this is a connection attempt
+	 * from an older version. This could could be 3.0 or 2.0 - we can't tell.
+	 * We really should have changed this for OFED 1.3 :-( */
+	if (dp->dp_protocol_major == 0)
+		return RDS_PROTOCOL_3_0;
+
+	common = be16_to_cpu(dp->dp_protocol_minor_mask) & RDS_IW_SUPPORTED_PROTOCOLS;
+	if (dp->dp_protocol_major == 3 && common) {
+		version = RDS_PROTOCOL_3_0;
+		while ((common >>= 1) != 0)
+			version++;
+	}
+	printk_ratelimited(KERN_NOTICE "RDS: Connection from %pI4 using "
+			"incompatible protocol version %u.%u\n",
+			&dp->dp_saddr,
+			dp->dp_protocol_major,
+			dp->dp_protocol_minor);
+	return version;
+}
+
+int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id,
+				    struct rdma_cm_event *event)
+{
+	const struct rds_iw_connect_private *dp = event->param.conn.private_data;
+	struct rds_iw_connect_private dp_rep;
+	struct rds_connection *conn = NULL;
+	struct rds_iw_connection *ic = NULL;
+	struct rdma_conn_param conn_param;
+	struct rds_iw_device *rds_iwdev;
+	u32 version;
+	int err, destroy = 1;
+
+	/* Check whether the remote protocol version matches ours. */
+	version = rds_iw_protocol_compatible(dp);
+	if (!version)
+		goto out;
+
+	rdsdebug("saddr %pI4 daddr %pI4 RDSv%u.%u\n",
+		 &dp->dp_saddr, &dp->dp_daddr,
+		 RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version));
+
+	conn = rds_conn_create(dp->dp_daddr, dp->dp_saddr, &rds_iw_transport,
+			       GFP_KERNEL);
+	if (IS_ERR(conn)) {
+		rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn));
+		conn = NULL;
+		goto out;
+	}
+
+	/*
+	 * The connection request may occur while the
+	 * previous connection exist, e.g. in case of failover.
+	 * But as connections may be initiated simultaneously
+	 * by both hosts, we have a random backoff mechanism -
+	 * see the comment above rds_queue_reconnect()
+	 */
+	mutex_lock(&conn->c_cm_lock);
+	if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) {
+		if (rds_conn_state(conn) == RDS_CONN_UP) {
+			rdsdebug("incoming connect while connecting\n");
+			rds_conn_drop(conn);
+			rds_iw_stats_inc(s_iw_listen_closed_stale);
+		} else
+		if (rds_conn_state(conn) == RDS_CONN_CONNECTING) {
+			/* Wait and see - our connect may still be succeeding */
+			rds_iw_stats_inc(s_iw_connect_raced);
+		}
+		mutex_unlock(&conn->c_cm_lock);
+		goto out;
+	}
+
+	ic = conn->c_transport_data;
+
+	rds_iw_set_protocol(conn, version);
+	rds_iw_set_flow_control(conn, be32_to_cpu(dp->dp_credit));
+
+	/* If the peer gave us the last packet it saw, process this as if
+	 * we had received a regular ACK. */
+	if (dp->dp_ack_seq)
+		rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL);
+
+	BUG_ON(cm_id->context);
+	BUG_ON(ic->i_cm_id);
+
+	ic->i_cm_id = cm_id;
+	cm_id->context = conn;
+
+	rds_iwdev = ib_get_client_data(cm_id->device, &rds_iw_client);
+	ic->i_dma_local_lkey = rds_iwdev->dma_local_lkey;
+
+	/* We got halfway through setting up the ib_connection, if we
+	 * fail now, we have to take the long route out of this mess. */
+	destroy = 0;
+
+	err = rds_iw_setup_qp(conn);
+	if (err) {
+		rds_iw_conn_error(conn, "rds_iw_setup_qp failed (%d)\n", err);
+		mutex_unlock(&conn->c_cm_lock);
+		goto out;
+	}
+
+	rds_iw_cm_fill_conn_param(conn, &conn_param, &dp_rep, version);
+
+	/* rdma_accept() calls rdma_reject() internally if it fails */
+	err = rdma_accept(cm_id, &conn_param);
+	mutex_unlock(&conn->c_cm_lock);
+	if (err) {
+		rds_iw_conn_error(conn, "rdma_accept failed (%d)\n", err);
+		goto out;
+	}
+
+	return 0;
+
+out:
+	rdma_reject(cm_id, NULL, 0);
+	return destroy;
+}
+
+
+int rds_iw_cm_initiate_connect(struct rdma_cm_id *cm_id)
+{
+	struct rds_connection *conn = cm_id->context;
+	struct rds_iw_connection *ic = conn->c_transport_data;
+	struct rdma_conn_param conn_param;
+	struct rds_iw_connect_private dp;
+	int ret;
+
+	/* If the peer doesn't do protocol negotiation, we must
+	 * default to RDSv3.0 */
+	rds_iw_set_protocol(conn, RDS_PROTOCOL_3_0);
+	ic->i_flowctl = rds_iw_sysctl_flow_control;	/* advertise flow control */
+
+	ret = rds_iw_setup_qp(conn);
+	if (ret) {
+		rds_iw_conn_error(conn, "rds_iw_setup_qp failed (%d)\n", ret);
+		goto out;
+	}
+
+	rds_iw_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION);
+
+	ret = rdma_connect(cm_id, &conn_param);
+	if (ret)
+		rds_iw_conn_error(conn, "rdma_connect failed (%d)\n", ret);
+
+out:
+	/* Beware - returning non-zero tells the rdma_cm to destroy
+	 * the cm_id. We should certainly not do it as long as we still
+	 * "own" the cm_id. */
+	if (ret) {
+		struct rds_iw_connection *ic = conn->c_transport_data;
+
+		if (ic->i_cm_id == cm_id)
+			ret = 0;
+	}
+	return ret;
+}
+
+int rds_iw_conn_connect(struct rds_connection *conn)
+{
+	struct rds_iw_connection *ic = conn->c_transport_data;
+	struct rds_iw_device *rds_iwdev;
+	struct sockaddr_in src, dest;
+	int ret;
+
+	/* XXX I wonder what affect the port space has */
+	/* delegate cm event handler to rdma_transport */
+	ic->i_cm_id = rdma_create_id(rds_rdma_cm_event_handler, conn,
+				     RDMA_PS_TCP, IB_QPT_RC);
+	if (IS_ERR(ic->i_cm_id)) {
+		ret = PTR_ERR(ic->i_cm_id);
+		ic->i_cm_id = NULL;
+		rdsdebug("rdma_create_id() failed: %d\n", ret);
+		goto out;
+	}
+
+	rdsdebug("created cm id %p for conn %p\n", ic->i_cm_id, conn);
+
+	src.sin_family = AF_INET;
+	src.sin_addr.s_addr = (__force u32)conn->c_laddr;
+	src.sin_port = (__force u16)htons(0);
+
+	/* First, bind to the local address and device. */
+	ret = rdma_bind_addr(ic->i_cm_id, (struct sockaddr *) &src);
+	if (ret) {
+		rdsdebug("rdma_bind_addr(%pI4) failed: %d\n",
+				&conn->c_laddr, ret);
+		rdma_destroy_id(ic->i_cm_id);
+		ic->i_cm_id = NULL;
+		goto out;
+	}
+
+	rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client);
+	ic->i_dma_local_lkey = rds_iwdev->dma_local_lkey;
+
+	dest.sin_family = AF_INET;
+	dest.sin_addr.s_addr = (__force u32)conn->c_faddr;
+	dest.sin_port = (__force u16)htons(RDS_PORT);
+
+	ret = rdma_resolve_addr(ic->i_cm_id, (struct sockaddr *)&src,
+				(struct sockaddr *)&dest,
+				RDS_RDMA_RESOLVE_TIMEOUT_MS);
+	if (ret) {
+		rdsdebug("addr resolve failed for cm id %p: %d\n", ic->i_cm_id,
+			 ret);
+		rdma_destroy_id(ic->i_cm_id);
+		ic->i_cm_id = NULL;
+	}
+
+out:
+	return ret;
+}
+
+/*
+ * This is so careful about only cleaning up resources that were built up
+ * so that it can be called at any point during startup.  In fact it
+ * can be called multiple times for a given connection.
+ */
+void rds_iw_conn_shutdown(struct rds_connection *conn)
+{
+	struct rds_iw_connection *ic = conn->c_transport_data;
+	int err = 0;
+	struct ib_qp_attr qp_attr;
+
+	rdsdebug("cm %p pd %p cq %p %p qp %p\n", ic->i_cm_id,
+		 ic->i_pd, ic->i_send_cq, ic->i_recv_cq,
+		 ic->i_cm_id ? ic->i_cm_id->qp : NULL);
+
+	if (ic->i_cm_id) {
+		struct ib_device *dev = ic->i_cm_id->device;
+
+		rdsdebug("disconnecting cm %p\n", ic->i_cm_id);
+		err = rdma_disconnect(ic->i_cm_id);
+		if (err) {
+			/* Actually this may happen quite frequently, when
+			 * an outgoing connect raced with an incoming connect.
+			 */
+			rdsdebug("rds_iw_conn_shutdown: failed to disconnect,"
+				   " cm: %p err %d\n", ic->i_cm_id, err);
+		}
+
+		if (ic->i_cm_id->qp) {
+			qp_attr.qp_state = IB_QPS_ERR;
+			ib_modify_qp(ic->i_cm_id->qp, &qp_attr, IB_QP_STATE);
+		}
+
+		wait_event(rds_iw_ring_empty_wait,
+			rds_iw_ring_empty(&ic->i_send_ring) &&
+			rds_iw_ring_empty(&ic->i_recv_ring));
+
+		if (ic->i_send_hdrs)
+			ib_dma_free_coherent(dev,
+					   ic->i_send_ring.w_nr *
+						sizeof(struct rds_header),
+					   ic->i_send_hdrs,
+					   ic->i_send_hdrs_dma);
+
+		if (ic->i_recv_hdrs)
+			ib_dma_free_coherent(dev,
+					   ic->i_recv_ring.w_nr *
+						sizeof(struct rds_header),
+					   ic->i_recv_hdrs,
+					   ic->i_recv_hdrs_dma);
+
+		if (ic->i_ack)
+			ib_dma_free_coherent(dev, sizeof(struct rds_header),
+					     ic->i_ack, ic->i_ack_dma);
+
+		if (ic->i_sends)
+			rds_iw_send_clear_ring(ic);
+		if (ic->i_recvs)
+			rds_iw_recv_clear_ring(ic);
+
+		if (ic->i_cm_id->qp)
+			rdma_destroy_qp(ic->i_cm_id);
+		if (ic->i_send_cq)
+			ib_destroy_cq(ic->i_send_cq);
+		if (ic->i_recv_cq)
+			ib_destroy_cq(ic->i_recv_cq);
+
+		/*
+		 * If associated with an rds_iw_device:
+		 * 	Move connection back to the nodev list.
+		 * 	Remove cm_id from the device cm_id list.
+		 */
+		if (ic->rds_iwdev)
+			rds_iw_remove_conn(ic->rds_iwdev, conn);
+
+		rdma_destroy_id(ic->i_cm_id);
+
+		ic->i_cm_id = NULL;
+		ic->i_pd = NULL;
+		ic->i_mr = NULL;
+		ic->i_send_cq = NULL;
+		ic->i_recv_cq = NULL;
+		ic->i_send_hdrs = NULL;
+		ic->i_recv_hdrs = NULL;
+		ic->i_ack = NULL;
+	}
+	BUG_ON(ic->rds_iwdev);
+
+	/* Clear pending transmit */
+	if (ic->i_rm) {
+		rds_message_put(ic->i_rm);
+		ic->i_rm = NULL;
+	}
+
+	/* Clear the ACK state */
+	clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
+#ifdef KERNEL_HAS_ATOMIC64
+	atomic64_set(&ic->i_ack_next, 0);
+#else
+	ic->i_ack_next = 0;
+#endif
+	ic->i_ack_recv = 0;
+
+	/* Clear flow control state */
+	ic->i_flowctl = 0;
+	atomic_set(&ic->i_credits, 0);
+
+	rds_iw_ring_init(&ic->i_send_ring, rds_iw_sysctl_max_send_wr);
+	rds_iw_ring_init(&ic->i_recv_ring, rds_iw_sysctl_max_recv_wr);
+
+	if (ic->i_iwinc) {
+		rds_inc_put(&ic->i_iwinc->ii_inc);
+		ic->i_iwinc = NULL;
+	}
+
+	vfree(ic->i_sends);
+	ic->i_sends = NULL;
+	vfree(ic->i_recvs);
+	ic->i_recvs = NULL;
+	rdsdebug("shutdown complete\n");
+}
+
+int rds_iw_conn_alloc(struct rds_connection *conn, gfp_t gfp)
+{
+	struct rds_iw_connection *ic;
+	unsigned long flags;
+
+	/* XXX too lazy? */
+	ic = kzalloc(sizeof(struct rds_iw_connection), gfp);
+	if (!ic)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&ic->iw_node);
+	tasklet_init(&ic->i_recv_tasklet, rds_iw_recv_tasklet_fn,
+		     (unsigned long) ic);
+	mutex_init(&ic->i_recv_mutex);
+#ifndef KERNEL_HAS_ATOMIC64
+	spin_lock_init(&ic->i_ack_lock);
+#endif
+
+	/*
+	 * rds_iw_conn_shutdown() waits for these to be emptied so they
+	 * must be initialized before it can be called.
+	 */
+	rds_iw_ring_init(&ic->i_send_ring, rds_iw_sysctl_max_send_wr);
+	rds_iw_ring_init(&ic->i_recv_ring, rds_iw_sysctl_max_recv_wr);
+
+	ic->conn = conn;
+	conn->c_transport_data = ic;
+
+	spin_lock_irqsave(&iw_nodev_conns_lock, flags);
+	list_add_tail(&ic->iw_node, &iw_nodev_conns);
+	spin_unlock_irqrestore(&iw_nodev_conns_lock, flags);
+
+
+	rdsdebug("conn %p conn ic %p\n", conn, conn->c_transport_data);
+	return 0;
+}
+
+/*
+ * Free a connection. Connection must be shut down and not set for reconnect.
+ */
+void rds_iw_conn_free(void *arg)
+{
+	struct rds_iw_connection *ic = arg;
+	spinlock_t	*lock_ptr;
+
+	rdsdebug("ic %p\n", ic);
+
+	/*
+	 * Conn is either on a dev's list or on the nodev list.
+	 * A race with shutdown() or connect() would cause problems
+	 * (since rds_iwdev would change) but that should never happen.
+	 */
+	lock_ptr = ic->rds_iwdev ? &ic->rds_iwdev->spinlock : &iw_nodev_conns_lock;
+
+	spin_lock_irq(lock_ptr);
+	list_del(&ic->iw_node);
+	spin_unlock_irq(lock_ptr);
+
+	kfree(ic);
+}
+
+/*
+ * An error occurred on the connection
+ */
+void
+__rds_iw_conn_error(struct rds_connection *conn, const char *fmt, ...)
+{
+	va_list ap;
+
+	rds_conn_drop(conn);
+
+	va_start(ap, fmt);
+	vprintk(fmt, ap);
+	va_end(ap);
+}
diff --git a/net/rds/iw_rdma.c b/net/rds/iw_rdma.c
new file mode 100644
index 0000000..a817705
--- /dev/null
+++ b/net/rds/iw_rdma.c
@@ -0,0 +1,871 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/ratelimit.h>
+
+#include "rds.h"
+#include "iw.h"
+
+
+/*
+ * This is stored as mr->r_trans_private.
+ */
+struct rds_iw_mr {
+	struct rds_iw_device	*device;
+	struct rds_iw_mr_pool	*pool;
+	struct rdma_cm_id	*cm_id;
+
+	struct ib_mr	*mr;
+	struct ib_fast_reg_page_list *page_list;
+
+	struct rds_iw_mapping	mapping;
+	unsigned char		remap_count;
+};
+
+/*
+ * Our own little MR pool
+ */
+struct rds_iw_mr_pool {
+	struct rds_iw_device	*device;		/* back ptr to the device that owns us */
+
+	struct mutex		flush_lock;		/* serialize fmr invalidate */
+	struct work_struct	flush_worker;		/* flush worker */
+
+	spinlock_t		list_lock;		/* protect variables below */
+	atomic_t		item_count;		/* total # of MRs */
+	atomic_t		dirty_count;		/* # dirty of MRs */
+	struct list_head	dirty_list;		/* dirty mappings */
+	struct list_head	clean_list;		/* unused & unamapped MRs */
+	atomic_t		free_pinned;		/* memory pinned by free MRs */
+	unsigned long		max_message_size;	/* in pages */
+	unsigned long		max_items;
+	unsigned long		max_items_soft;
+	unsigned long		max_free_pinned;
+	int			max_pages;
+};
+
+static int rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all);
+static void rds_iw_mr_pool_flush_worker(struct work_struct *work);
+static int rds_iw_init_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr);
+static int rds_iw_map_fastreg(struct rds_iw_mr_pool *pool,
+			  struct rds_iw_mr *ibmr,
+			  struct scatterlist *sg, unsigned int nents);
+static void rds_iw_free_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr);
+static unsigned int rds_iw_unmap_fastreg_list(struct rds_iw_mr_pool *pool,
+			struct list_head *unmap_list,
+			struct list_head *kill_list,
+			int *unpinned);
+static void rds_iw_destroy_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr);
+
+static int rds_iw_get_device(struct rds_sock *rs, struct rds_iw_device **rds_iwdev, struct rdma_cm_id **cm_id)
+{
+	struct rds_iw_device *iwdev;
+	struct rds_iw_cm_id *i_cm_id;
+
+	*rds_iwdev = NULL;
+	*cm_id = NULL;
+
+	list_for_each_entry(iwdev, &rds_iw_devices, list) {
+		spin_lock_irq(&iwdev->spinlock);
+		list_for_each_entry(i_cm_id, &iwdev->cm_id_list, list) {
+			struct sockaddr_in *src_addr, *dst_addr;
+
+			src_addr = (struct sockaddr_in *)&i_cm_id->cm_id->route.addr.src_addr;
+			dst_addr = (struct sockaddr_in *)&i_cm_id->cm_id->route.addr.dst_addr;
+
+			rdsdebug("local ipaddr = %x port %d, "
+				 "remote ipaddr = %x port %d"
+				 "..looking for %x port %d, "
+				 "remote ipaddr = %x port %d\n",
+				src_addr->sin_addr.s_addr,
+				src_addr->sin_port,
+				dst_addr->sin_addr.s_addr,
+				dst_addr->sin_port,
+				rs->rs_bound_addr,
+				rs->rs_bound_port,
+				rs->rs_conn_addr,
+				rs->rs_conn_port);
+#ifdef WORKING_TUPLE_DETECTION
+			if (src_addr->sin_addr.s_addr == rs->rs_bound_addr &&
+			    src_addr->sin_port == rs->rs_bound_port &&
+			    dst_addr->sin_addr.s_addr == rs->rs_conn_addr &&
+			    dst_addr->sin_port == rs->rs_conn_port) {
+#else
+			/* FIXME - needs to compare the local and remote
+			 * ipaddr/port tuple, but the ipaddr is the only
+			 * available information in the rds_sock (as the rest are
+			 * zero'ed.  It doesn't appear to be properly populated
+			 * during connection setup...
+			 */
+			if (src_addr->sin_addr.s_addr == rs->rs_bound_addr) {
+#endif
+				spin_unlock_irq(&iwdev->spinlock);
+				*rds_iwdev = iwdev;
+				*cm_id = i_cm_id->cm_id;
+				return 0;
+			}
+		}
+		spin_unlock_irq(&iwdev->spinlock);
+	}
+
+	return 1;
+}
+
+static int rds_iw_add_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id)
+{
+	struct rds_iw_cm_id *i_cm_id;
+
+	i_cm_id = kmalloc(sizeof *i_cm_id, GFP_KERNEL);
+	if (!i_cm_id)
+		return -ENOMEM;
+
+	i_cm_id->cm_id = cm_id;
+
+	spin_lock_irq(&rds_iwdev->spinlock);
+	list_add_tail(&i_cm_id->list, &rds_iwdev->cm_id_list);
+	spin_unlock_irq(&rds_iwdev->spinlock);
+
+	return 0;
+}
+
+static void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev,
+				struct rdma_cm_id *cm_id)
+{
+	struct rds_iw_cm_id *i_cm_id;
+
+	spin_lock_irq(&rds_iwdev->spinlock);
+	list_for_each_entry(i_cm_id, &rds_iwdev->cm_id_list, list) {
+		if (i_cm_id->cm_id == cm_id) {
+			list_del(&i_cm_id->list);
+			kfree(i_cm_id);
+			break;
+		}
+	}
+	spin_unlock_irq(&rds_iwdev->spinlock);
+}
+
+
+int rds_iw_update_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id)
+{
+	struct sockaddr_in *src_addr, *dst_addr;
+	struct rds_iw_device *rds_iwdev_old;
+	struct rds_sock rs;
+	struct rdma_cm_id *pcm_id;
+	int rc;
+
+	src_addr = (struct sockaddr_in *)&cm_id->route.addr.src_addr;
+	dst_addr = (struct sockaddr_in *)&cm_id->route.addr.dst_addr;
+
+	rs.rs_bound_addr = src_addr->sin_addr.s_addr;
+	rs.rs_bound_port = src_addr->sin_port;
+	rs.rs_conn_addr = dst_addr->sin_addr.s_addr;
+	rs.rs_conn_port = dst_addr->sin_port;
+
+	rc = rds_iw_get_device(&rs, &rds_iwdev_old, &pcm_id);
+	if (rc)
+		rds_iw_remove_cm_id(rds_iwdev, cm_id);
+
+	return rds_iw_add_cm_id(rds_iwdev, cm_id);
+}
+
+void rds_iw_add_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn)
+{
+	struct rds_iw_connection *ic = conn->c_transport_data;
+
+	/* conn was previously on the nodev_conns_list */
+	spin_lock_irq(&iw_nodev_conns_lock);
+	BUG_ON(list_empty(&iw_nodev_conns));
+	BUG_ON(list_empty(&ic->iw_node));
+	list_del(&ic->iw_node);
+
+	spin_lock(&rds_iwdev->spinlock);
+	list_add_tail(&ic->iw_node, &rds_iwdev->conn_list);
+	spin_unlock(&rds_iwdev->spinlock);
+	spin_unlock_irq(&iw_nodev_conns_lock);
+
+	ic->rds_iwdev = rds_iwdev;
+}
+
+void rds_iw_remove_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn)
+{
+	struct rds_iw_connection *ic = conn->c_transport_data;
+
+	/* place conn on nodev_conns_list */
+	spin_lock(&iw_nodev_conns_lock);
+
+	spin_lock_irq(&rds_iwdev->spinlock);
+	BUG_ON(list_empty(&ic->iw_node));
+	list_del(&ic->iw_node);
+	spin_unlock_irq(&rds_iwdev->spinlock);
+
+	list_add_tail(&ic->iw_node, &iw_nodev_conns);
+
+	spin_unlock(&iw_nodev_conns_lock);
+
+	rds_iw_remove_cm_id(ic->rds_iwdev, ic->i_cm_id);
+	ic->rds_iwdev = NULL;
+}
+
+void __rds_iw_destroy_conns(struct list_head *list, spinlock_t *list_lock)
+{
+	struct rds_iw_connection *ic, *_ic;
+	LIST_HEAD(tmp_list);
+
+	/* avoid calling conn_destroy with irqs off */
+	spin_lock_irq(list_lock);
+	list_splice(list, &tmp_list);
+	INIT_LIST_HEAD(list);
+	spin_unlock_irq(list_lock);
+
+	list_for_each_entry_safe(ic, _ic, &tmp_list, iw_node)
+		rds_conn_destroy(ic->conn);
+}
+
+static void rds_iw_set_scatterlist(struct rds_iw_scatterlist *sg,
+		struct scatterlist *list, unsigned int sg_len)
+{
+	sg->list = list;
+	sg->len = sg_len;
+	sg->dma_len = 0;
+	sg->dma_npages = 0;
+	sg->bytes = 0;
+}
+
+static u64 *rds_iw_map_scatterlist(struct rds_iw_device *rds_iwdev,
+			struct rds_iw_scatterlist *sg)
+{
+	struct ib_device *dev = rds_iwdev->dev;
+	u64 *dma_pages = NULL;
+	int i, j, ret;
+
+	WARN_ON(sg->dma_len);
+
+	sg->dma_len = ib_dma_map_sg(dev, sg->list, sg->len, DMA_BIDIRECTIONAL);
+	if (unlikely(!sg->dma_len)) {
+		printk(KERN_WARNING "RDS/IW: dma_map_sg failed!\n");
+		return ERR_PTR(-EBUSY);
+	}
+
+	sg->bytes = 0;
+	sg->dma_npages = 0;
+
+	ret = -EINVAL;
+	for (i = 0; i < sg->dma_len; ++i) {
+		unsigned int dma_len = ib_sg_dma_len(dev, &sg->list[i]);
+		u64 dma_addr = ib_sg_dma_address(dev, &sg->list[i]);
+		u64 end_addr;
+
+		sg->bytes += dma_len;
+
+		end_addr = dma_addr + dma_len;
+		if (dma_addr & PAGE_MASK) {
+			if (i > 0)
+				goto out_unmap;
+			dma_addr &= ~PAGE_MASK;
+		}
+		if (end_addr & PAGE_MASK) {
+			if (i < sg->dma_len - 1)
+				goto out_unmap;
+			end_addr = (end_addr + PAGE_MASK) & ~PAGE_MASK;
+		}
+
+		sg->dma_npages += (end_addr - dma_addr) >> PAGE_SHIFT;
+	}
+
+	/* Now gather the dma addrs into one list */
+	if (sg->dma_npages > fastreg_message_size)
+		goto out_unmap;
+
+	dma_pages = kmalloc(sizeof(u64) * sg->dma_npages, GFP_ATOMIC);
+	if (!dma_pages) {
+		ret = -ENOMEM;
+		goto out_unmap;
+	}
+
+	for (i = j = 0; i < sg->dma_len; ++i) {
+		unsigned int dma_len = ib_sg_dma_len(dev, &sg->list[i]);
+		u64 dma_addr = ib_sg_dma_address(dev, &sg->list[i]);
+		u64 end_addr;
+
+		end_addr = dma_addr + dma_len;
+		dma_addr &= ~PAGE_MASK;
+		for (; dma_addr < end_addr; dma_addr += PAGE_SIZE)
+			dma_pages[j++] = dma_addr;
+		BUG_ON(j > sg->dma_npages);
+	}
+
+	return dma_pages;
+
+out_unmap:
+	ib_dma_unmap_sg(rds_iwdev->dev, sg->list, sg->len, DMA_BIDIRECTIONAL);
+	sg->dma_len = 0;
+	kfree(dma_pages);
+	return ERR_PTR(ret);
+}
+
+
+struct rds_iw_mr_pool *rds_iw_create_mr_pool(struct rds_iw_device *rds_iwdev)
+{
+	struct rds_iw_mr_pool *pool;
+
+	pool = kzalloc(sizeof(*pool), GFP_KERNEL);
+	if (!pool) {
+		printk(KERN_WARNING "RDS/IW: rds_iw_create_mr_pool alloc error\n");
+		return ERR_PTR(-ENOMEM);
+	}
+
+	pool->device = rds_iwdev;
+	INIT_LIST_HEAD(&pool->dirty_list);
+	INIT_LIST_HEAD(&pool->clean_list);
+	mutex_init(&pool->flush_lock);
+	spin_lock_init(&pool->list_lock);
+	INIT_WORK(&pool->flush_worker, rds_iw_mr_pool_flush_worker);
+
+	pool->max_message_size = fastreg_message_size;
+	pool->max_items = fastreg_pool_size;
+	pool->max_free_pinned = pool->max_items * pool->max_message_size / 4;
+	pool->max_pages = fastreg_message_size;
+
+	/* We never allow more than max_items MRs to be allocated.
+	 * When we exceed more than max_items_soft, we start freeing
+	 * items more aggressively.
+	 * Make sure that max_items > max_items_soft > max_items / 2
+	 */
+	pool->max_items_soft = pool->max_items * 3 / 4;
+
+	return pool;
+}
+
+void rds_iw_get_mr_info(struct rds_iw_device *rds_iwdev, struct rds_info_rdma_connection *iinfo)
+{
+	struct rds_iw_mr_pool *pool = rds_iwdev->mr_pool;
+
+	iinfo->rdma_mr_max = pool->max_items;
+	iinfo->rdma_mr_size = pool->max_pages;
+}
+
+void rds_iw_destroy_mr_pool(struct rds_iw_mr_pool *pool)
+{
+	flush_workqueue(rds_wq);
+	rds_iw_flush_mr_pool(pool, 1);
+	BUG_ON(atomic_read(&pool->item_count));
+	BUG_ON(atomic_read(&pool->free_pinned));
+	kfree(pool);
+}
+
+static inline struct rds_iw_mr *rds_iw_reuse_fmr(struct rds_iw_mr_pool *pool)
+{
+	struct rds_iw_mr *ibmr = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&pool->list_lock, flags);
+	if (!list_empty(&pool->clean_list)) {
+		ibmr = list_entry(pool->clean_list.next, struct rds_iw_mr, mapping.m_list);
+		list_del_init(&ibmr->mapping.m_list);
+	}
+	spin_unlock_irqrestore(&pool->list_lock, flags);
+
+	return ibmr;
+}
+
+static struct rds_iw_mr *rds_iw_alloc_mr(struct rds_iw_device *rds_iwdev)
+{
+	struct rds_iw_mr_pool *pool = rds_iwdev->mr_pool;
+	struct rds_iw_mr *ibmr = NULL;
+	int err = 0, iter = 0;
+
+	while (1) {
+		ibmr = rds_iw_reuse_fmr(pool);
+		if (ibmr)
+			return ibmr;
+
+		/* No clean MRs - now we have the choice of either
+		 * allocating a fresh MR up to the limit imposed by the
+		 * driver, or flush any dirty unused MRs.
+		 * We try to avoid stalling in the send path if possible,
+		 * so we allocate as long as we're allowed to.
+		 *
+		 * We're fussy with enforcing the FMR limit, though. If the driver
+		 * tells us we can't use more than N fmrs, we shouldn't start
+		 * arguing with it */
+		if (atomic_inc_return(&pool->item_count) <= pool->max_items)
+			break;
+
+		atomic_dec(&pool->item_count);
+
+		if (++iter > 2) {
+			rds_iw_stats_inc(s_iw_rdma_mr_pool_depleted);
+			return ERR_PTR(-EAGAIN);
+		}
+
+		/* We do have some empty MRs. Flush them out. */
+		rds_iw_stats_inc(s_iw_rdma_mr_pool_wait);
+		rds_iw_flush_mr_pool(pool, 0);
+	}
+
+	ibmr = kzalloc(sizeof(*ibmr), GFP_KERNEL);
+	if (!ibmr) {
+		err = -ENOMEM;
+		goto out_no_cigar;
+	}
+
+	spin_lock_init(&ibmr->mapping.m_lock);
+	INIT_LIST_HEAD(&ibmr->mapping.m_list);
+	ibmr->mapping.m_mr = ibmr;
+
+	err = rds_iw_init_fastreg(pool, ibmr);
+	if (err)
+		goto out_no_cigar;
+
+	rds_iw_stats_inc(s_iw_rdma_mr_alloc);
+	return ibmr;
+
+out_no_cigar:
+	if (ibmr) {
+		rds_iw_destroy_fastreg(pool, ibmr);
+		kfree(ibmr);
+	}
+	atomic_dec(&pool->item_count);
+	return ERR_PTR(err);
+}
+
+void rds_iw_sync_mr(void *trans_private, int direction)
+{
+	struct rds_iw_mr *ibmr = trans_private;
+	struct rds_iw_device *rds_iwdev = ibmr->device;
+
+	switch (direction) {
+	case DMA_FROM_DEVICE:
+		ib_dma_sync_sg_for_cpu(rds_iwdev->dev, ibmr->mapping.m_sg.list,
+			ibmr->mapping.m_sg.dma_len, DMA_BIDIRECTIONAL);
+		break;
+	case DMA_TO_DEVICE:
+		ib_dma_sync_sg_for_device(rds_iwdev->dev, ibmr->mapping.m_sg.list,
+			ibmr->mapping.m_sg.dma_len, DMA_BIDIRECTIONAL);
+		break;
+	}
+}
+
+/*
+ * Flush our pool of MRs.
+ * At a minimum, all currently unused MRs are unmapped.
+ * If the number of MRs allocated exceeds the limit, we also try
+ * to free as many MRs as needed to get back to this limit.
+ */
+static int rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all)
+{
+	struct rds_iw_mr *ibmr, *next;
+	LIST_HEAD(unmap_list);
+	LIST_HEAD(kill_list);
+	unsigned long flags;
+	unsigned int nfreed = 0, ncleaned = 0, unpinned = 0;
+	int ret = 0;
+
+	rds_iw_stats_inc(s_iw_rdma_mr_pool_flush);
+
+	mutex_lock(&pool->flush_lock);
+
+	spin_lock_irqsave(&pool->list_lock, flags);
+	/* Get the list of all mappings to be destroyed */
+	list_splice_init(&pool->dirty_list, &unmap_list);
+	if (free_all)
+		list_splice_init(&pool->clean_list, &kill_list);
+	spin_unlock_irqrestore(&pool->list_lock, flags);
+
+	/* Batched invalidate of dirty MRs.
+	 * For FMR based MRs, the mappings on the unmap list are
+	 * actually members of an ibmr (ibmr->mapping). They either
+	 * migrate to the kill_list, or have been cleaned and should be
+	 * moved to the clean_list.
+	 * For fastregs, they will be dynamically allocated, and
+	 * will be destroyed by the unmap function.
+	 */
+	if (!list_empty(&unmap_list)) {
+		ncleaned = rds_iw_unmap_fastreg_list(pool, &unmap_list,
+						     &kill_list, &unpinned);
+		/* If we've been asked to destroy all MRs, move those
+		 * that were simply cleaned to the kill list */
+		if (free_all)
+			list_splice_init(&unmap_list, &kill_list);
+	}
+
+	/* Destroy any MRs that are past their best before date */
+	list_for_each_entry_safe(ibmr, next, &kill_list, mapping.m_list) {
+		rds_iw_stats_inc(s_iw_rdma_mr_free);
+		list_del(&ibmr->mapping.m_list);
+		rds_iw_destroy_fastreg(pool, ibmr);
+		kfree(ibmr);
+		nfreed++;
+	}
+
+	/* Anything that remains are laundered ibmrs, which we can add
+	 * back to the clean list. */
+	if (!list_empty(&unmap_list)) {
+		spin_lock_irqsave(&pool->list_lock, flags);
+		list_splice(&unmap_list, &pool->clean_list);
+		spin_unlock_irqrestore(&pool->list_lock, flags);
+	}
+
+	atomic_sub(unpinned, &pool->free_pinned);
+	atomic_sub(ncleaned, &pool->dirty_count);
+	atomic_sub(nfreed, &pool->item_count);
+
+	mutex_unlock(&pool->flush_lock);
+	return ret;
+}
+
+static void rds_iw_mr_pool_flush_worker(struct work_struct *work)
+{
+	struct rds_iw_mr_pool *pool = container_of(work, struct rds_iw_mr_pool, flush_worker);
+
+	rds_iw_flush_mr_pool(pool, 0);
+}
+
+void rds_iw_free_mr(void *trans_private, int invalidate)
+{
+	struct rds_iw_mr *ibmr = trans_private;
+	struct rds_iw_mr_pool *pool = ibmr->device->mr_pool;
+
+	rdsdebug("RDS/IW: free_mr nents %u\n", ibmr->mapping.m_sg.len);
+	if (!pool)
+		return;
+
+	/* Return it to the pool's free list */
+	rds_iw_free_fastreg(pool, ibmr);
+
+	/* If we've pinned too many pages, request a flush */
+	if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned ||
+	    atomic_read(&pool->dirty_count) >= pool->max_items / 10)
+		queue_work(rds_wq, &pool->flush_worker);
+
+	if (invalidate) {
+		if (likely(!in_interrupt())) {
+			rds_iw_flush_mr_pool(pool, 0);
+		} else {
+			/* We get here if the user created a MR marked
+			 * as use_once and invalidate at the same time. */
+			queue_work(rds_wq, &pool->flush_worker);
+		}
+	}
+}
+
+void rds_iw_flush_mrs(void)
+{
+	struct rds_iw_device *rds_iwdev;
+
+	list_for_each_entry(rds_iwdev, &rds_iw_devices, list) {
+		struct rds_iw_mr_pool *pool = rds_iwdev->mr_pool;
+
+		if (pool)
+			rds_iw_flush_mr_pool(pool, 0);
+	}
+}
+
+void *rds_iw_get_mr(struct scatterlist *sg, unsigned long nents,
+		    struct rds_sock *rs, u32 *key_ret)
+{
+	struct rds_iw_device *rds_iwdev;
+	struct rds_iw_mr *ibmr = NULL;
+	struct rdma_cm_id *cm_id;
+	int ret;
+
+	ret = rds_iw_get_device(rs, &rds_iwdev, &cm_id);
+	if (ret || !cm_id) {
+		ret = -ENODEV;
+		goto out;
+	}
+
+	if (!rds_iwdev->mr_pool) {
+		ret = -ENODEV;
+		goto out;
+	}
+
+	ibmr = rds_iw_alloc_mr(rds_iwdev);
+	if (IS_ERR(ibmr))
+		return ibmr;
+
+	ibmr->cm_id = cm_id;
+	ibmr->device = rds_iwdev;
+
+	ret = rds_iw_map_fastreg(rds_iwdev->mr_pool, ibmr, sg, nents);
+	if (ret == 0)
+		*key_ret = ibmr->mr->rkey;
+	else
+		printk(KERN_WARNING "RDS/IW: failed to map mr (errno=%d)\n", ret);
+
+out:
+	if (ret) {
+		if (ibmr)
+			rds_iw_free_mr(ibmr, 0);
+		ibmr = ERR_PTR(ret);
+	}
+	return ibmr;
+}
+
+/*
+ * iWARP fastreg handling
+ *
+ * The life cycle of a fastreg registration is a bit different from
+ * FMRs.
+ * The idea behind fastreg is to have one MR, to which we bind different
+ * mappings over time. To avoid stalling on the expensive map and invalidate
+ * operations, these operations are pipelined on the same send queue on
+ * which we want to send the message containing the r_key.
+ *
+ * This creates a bit of a problem for us, as we do not have the destination
+ * IP in GET_MR, so the connection must be setup prior to the GET_MR call for
+ * RDMA to be correctly setup.  If a fastreg request is present, rds_iw_xmit
+ * will try to queue a LOCAL_INV (if needed) and a FAST_REG_MR work request
+ * before queuing the SEND. When completions for these arrive, they are
+ * dispatched to the MR has a bit set showing that RDMa can be performed.
+ *
+ * There is another interesting aspect that's related to invalidation.
+ * The application can request that a mapping is invalidated in FREE_MR.
+ * The expectation there is that this invalidation step includes ALL
+ * PREVIOUSLY FREED MRs.
+ */
+static int rds_iw_init_fastreg(struct rds_iw_mr_pool *pool,
+				struct rds_iw_mr *ibmr)
+{
+	struct rds_iw_device *rds_iwdev = pool->device;
+	struct ib_fast_reg_page_list *page_list = NULL;
+	struct ib_mr *mr;
+	int err;
+
+	mr = ib_alloc_fast_reg_mr(rds_iwdev->pd, pool->max_message_size);
+	if (IS_ERR(mr)) {
+		err = PTR_ERR(mr);
+
+		printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_mr failed (err=%d)\n", err);
+		return err;
+	}
+
+	/* FIXME - this is overkill, but mapping->m_sg.dma_len/mapping->m_sg.dma_npages
+	 * is not filled in.
+	 */
+	page_list = ib_alloc_fast_reg_page_list(rds_iwdev->dev, pool->max_message_size);
+	if (IS_ERR(page_list)) {
+		err = PTR_ERR(page_list);
+
+		printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_page_list failed (err=%d)\n", err);
+		ib_dereg_mr(mr);
+		return err;
+	}
+
+	ibmr->page_list = page_list;
+	ibmr->mr = mr;
+	return 0;
+}
+
+static int rds_iw_rdma_build_fastreg(struct rds_iw_mapping *mapping)
+{
+	struct rds_iw_mr *ibmr = mapping->m_mr;
+	struct ib_send_wr f_wr, *failed_wr;
+	int ret;
+
+	/*
+	 * Perform a WR for the fast_reg_mr. Each individual page
+	 * in the sg list is added to the fast reg page list and placed
+	 * inside the fast_reg_mr WR.  The key used is a rolling 8bit
+	 * counter, which should guarantee uniqueness.
+	 */
+	ib_update_fast_reg_key(ibmr->mr, ibmr->remap_count++);
+	mapping->m_rkey = ibmr->mr->rkey;
+
+	memset(&f_wr, 0, sizeof(f_wr));
+	f_wr.wr_id = RDS_IW_FAST_REG_WR_ID;
+	f_wr.opcode = IB_WR_FAST_REG_MR;
+	f_wr.wr.fast_reg.length = mapping->m_sg.bytes;
+	f_wr.wr.fast_reg.rkey = mapping->m_rkey;
+	f_wr.wr.fast_reg.page_list = ibmr->page_list;
+	f_wr.wr.fast_reg.page_list_len = mapping->m_sg.dma_len;
+	f_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
+	f_wr.wr.fast_reg.access_flags = IB_ACCESS_LOCAL_WRITE |
+				IB_ACCESS_REMOTE_READ |
+				IB_ACCESS_REMOTE_WRITE;
+	f_wr.wr.fast_reg.iova_start = 0;
+	f_wr.send_flags = IB_SEND_SIGNALED;
+
+	failed_wr = &f_wr;
+	ret = ib_post_send(ibmr->cm_id->qp, &f_wr, &failed_wr);
+	BUG_ON(failed_wr != &f_wr);
+	if (ret)
+		printk_ratelimited(KERN_WARNING "RDS/IW: %s:%d ib_post_send returned %d\n",
+			__func__, __LINE__, ret);
+	return ret;
+}
+
+static int rds_iw_rdma_fastreg_inv(struct rds_iw_mr *ibmr)
+{
+	struct ib_send_wr s_wr, *failed_wr;
+	int ret = 0;
+
+	if (!ibmr->cm_id->qp || !ibmr->mr)
+		goto out;
+
+	memset(&s_wr, 0, sizeof(s_wr));
+	s_wr.wr_id = RDS_IW_LOCAL_INV_WR_ID;
+	s_wr.opcode = IB_WR_LOCAL_INV;
+	s_wr.ex.invalidate_rkey = ibmr->mr->rkey;
+	s_wr.send_flags = IB_SEND_SIGNALED;
+
+	failed_wr = &s_wr;
+	ret = ib_post_send(ibmr->cm_id->qp, &s_wr, &failed_wr);
+	if (ret) {
+		printk_ratelimited(KERN_WARNING "RDS/IW: %s:%d ib_post_send returned %d\n",
+			__func__, __LINE__, ret);
+		goto out;
+	}
+out:
+	return ret;
+}
+
+static int rds_iw_map_fastreg(struct rds_iw_mr_pool *pool,
+			struct rds_iw_mr *ibmr,
+			struct scatterlist *sg,
+			unsigned int sg_len)
+{
+	struct rds_iw_device *rds_iwdev = pool->device;
+	struct rds_iw_mapping *mapping = &ibmr->mapping;
+	u64 *dma_pages;
+	int i, ret = 0;
+
+	rds_iw_set_scatterlist(&mapping->m_sg, sg, sg_len);
+
+	dma_pages = rds_iw_map_scatterlist(rds_iwdev, &mapping->m_sg);
+	if (IS_ERR(dma_pages)) {
+		ret = PTR_ERR(dma_pages);
+		dma_pages = NULL;
+		goto out;
+	}
+
+	if (mapping->m_sg.dma_len > pool->max_message_size) {
+		ret = -EMSGSIZE;
+		goto out;
+	}
+
+	for (i = 0; i < mapping->m_sg.dma_npages; ++i)
+		ibmr->page_list->page_list[i] = dma_pages[i];
+
+	ret = rds_iw_rdma_build_fastreg(mapping);
+	if (ret)
+		goto out;
+
+	rds_iw_stats_inc(s_iw_rdma_mr_used);
+
+out:
+	kfree(dma_pages);
+
+	return ret;
+}
+
+/*
+ * "Free" a fastreg MR.
+ */
+static void rds_iw_free_fastreg(struct rds_iw_mr_pool *pool,
+		struct rds_iw_mr *ibmr)
+{
+	unsigned long flags;
+	int ret;
+
+	if (!ibmr->mapping.m_sg.dma_len)
+		return;
+
+	ret = rds_iw_rdma_fastreg_inv(ibmr);
+	if (ret)
+		return;
+
+	/* Try to post the LOCAL_INV WR to the queue. */
+	spin_lock_irqsave(&pool->list_lock, flags);
+
+	list_add_tail(&ibmr->mapping.m_list, &pool->dirty_list);
+	atomic_add(ibmr->mapping.m_sg.len, &pool->free_pinned);
+	atomic_inc(&pool->dirty_count);
+
+	spin_unlock_irqrestore(&pool->list_lock, flags);
+}
+
+static unsigned int rds_iw_unmap_fastreg_list(struct rds_iw_mr_pool *pool,
+				struct list_head *unmap_list,
+				struct list_head *kill_list,
+				int *unpinned)
+{
+	struct rds_iw_mapping *mapping, *next;
+	unsigned int ncleaned = 0;
+	LIST_HEAD(laundered);
+
+	/* Batched invalidation of fastreg MRs.
+	 * Why do we do it this way, even though we could pipeline unmap
+	 * and remap? The reason is the application semantics - when the
+	 * application requests an invalidation of MRs, it expects all
+	 * previously released R_Keys to become invalid.
+	 *
+	 * If we implement MR reuse naively, we risk memory corruption
+	 * (this has actually been observed). So the default behavior
+	 * requires that a MR goes through an explicit unmap operation before
+	 * we can reuse it again.
+	 *
+	 * We could probably improve on this a little, by allowing immediate
+	 * reuse of a MR on the same socket (eg you could add small
+	 * cache of unused MRs to strct rds_socket - GET_MR could grab one
+	 * of these without requiring an explicit invalidate).
+	 */
+	while (!list_empty(unmap_list)) {
+		unsigned long flags;
+
+		spin_lock_irqsave(&pool->list_lock, flags);
+		list_for_each_entry_safe(mapping, next, unmap_list, m_list) {
+			*unpinned += mapping->m_sg.len;
+			list_move(&mapping->m_list, &laundered);
+			ncleaned++;
+		}
+		spin_unlock_irqrestore(&pool->list_lock, flags);
+	}
+
+	/* Move all laundered mappings back to the unmap list.
+	 * We do not kill any WRs right now - it doesn't seem the
+	 * fastreg API has a max_remap limit. */
+	list_splice_init(&laundered, unmap_list);
+
+	return ncleaned;
+}
+
+static void rds_iw_destroy_fastreg(struct rds_iw_mr_pool *pool,
+		struct rds_iw_mr *ibmr)
+{
+	if (ibmr->page_list)
+		ib_free_fast_reg_page_list(ibmr->page_list);
+	if (ibmr->mr)
+		ib_dereg_mr(ibmr->mr);
+}
diff --git a/net/rds/iw_recv.c b/net/rds/iw_recv.c
new file mode 100644
index 0000000..aa8bf67
--- /dev/null
+++ b/net/rds/iw_recv.c
@@ -0,0 +1,919 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <rdma/rdma_cm.h>
+
+#include "rds.h"
+#include "iw.h"
+
+static struct kmem_cache *rds_iw_incoming_slab;
+static struct kmem_cache *rds_iw_frag_slab;
+static atomic_t	rds_iw_allocation = ATOMIC_INIT(0);
+
+static void rds_iw_frag_drop_page(struct rds_page_frag *frag)
+{
+	rdsdebug("frag %p page %p\n", frag, frag->f_page);
+	__free_page(frag->f_page);
+	frag->f_page = NULL;
+}
+
+static void rds_iw_frag_free(struct rds_page_frag *frag)
+{
+	rdsdebug("frag %p page %p\n", frag, frag->f_page);
+	BUG_ON(frag->f_page);
+	kmem_cache_free(rds_iw_frag_slab, frag);
+}
+
+/*
+ * We map a page at a time.  Its fragments are posted in order.  This
+ * is called in fragment order as the fragments get send completion events.
+ * Only the last frag in the page performs the unmapping.
+ *
+ * It's OK for ring cleanup to call this in whatever order it likes because
+ * DMA is not in flight and so we can unmap while other ring entries still
+ * hold page references in their frags.
+ */
+static void rds_iw_recv_unmap_page(struct rds_iw_connection *ic,
+				   struct rds_iw_recv_work *recv)
+{
+	struct rds_page_frag *frag = recv->r_frag;
+
+	rdsdebug("recv %p frag %p page %p\n", recv, frag, frag->f_page);
+	if (frag->f_mapped)
+		ib_dma_unmap_page(ic->i_cm_id->device,
+			       frag->f_mapped,
+			       RDS_FRAG_SIZE, DMA_FROM_DEVICE);
+	frag->f_mapped = 0;
+}
+
+void rds_iw_recv_init_ring(struct rds_iw_connection *ic)
+{
+	struct rds_iw_recv_work *recv;
+	u32 i;
+
+	for (i = 0, recv = ic->i_recvs; i < ic->i_recv_ring.w_nr; i++, recv++) {
+		struct ib_sge *sge;
+
+		recv->r_iwinc = NULL;
+		recv->r_frag = NULL;
+
+		recv->r_wr.next = NULL;
+		recv->r_wr.wr_id = i;
+		recv->r_wr.sg_list = recv->r_sge;
+		recv->r_wr.num_sge = RDS_IW_RECV_SGE;
+
+		sge = rds_iw_data_sge(ic, recv->r_sge);
+		sge->addr = 0;
+		sge->length = RDS_FRAG_SIZE;
+		sge->lkey = 0;
+
+		sge = rds_iw_header_sge(ic, recv->r_sge);
+		sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header));
+		sge->length = sizeof(struct rds_header);
+		sge->lkey = 0;
+	}
+}
+
+static void rds_iw_recv_clear_one(struct rds_iw_connection *ic,
+				  struct rds_iw_recv_work *recv)
+{
+	if (recv->r_iwinc) {
+		rds_inc_put(&recv->r_iwinc->ii_inc);
+		recv->r_iwinc = NULL;
+	}
+	if (recv->r_frag) {
+		rds_iw_recv_unmap_page(ic, recv);
+		if (recv->r_frag->f_page)
+			rds_iw_frag_drop_page(recv->r_frag);
+		rds_iw_frag_free(recv->r_frag);
+		recv->r_frag = NULL;
+	}
+}
+
+void rds_iw_recv_clear_ring(struct rds_iw_connection *ic)
+{
+	u32 i;
+
+	for (i = 0; i < ic->i_recv_ring.w_nr; i++)
+		rds_iw_recv_clear_one(ic, &ic->i_recvs[i]);
+
+	if (ic->i_frag.f_page)
+		rds_iw_frag_drop_page(&ic->i_frag);
+}
+
+static int rds_iw_recv_refill_one(struct rds_connection *conn,
+				  struct rds_iw_recv_work *recv,
+				  gfp_t kptr_gfp, gfp_t page_gfp)
+{
+	struct rds_iw_connection *ic = conn->c_transport_data;
+	dma_addr_t dma_addr;
+	struct ib_sge *sge;
+	int ret = -ENOMEM;
+
+	if (!recv->r_iwinc) {
+		if (!atomic_add_unless(&rds_iw_allocation, 1, rds_iw_sysctl_max_recv_allocation)) {
+			rds_iw_stats_inc(s_iw_rx_alloc_limit);
+			goto out;
+		}
+		recv->r_iwinc = kmem_cache_alloc(rds_iw_incoming_slab,
+						 kptr_gfp);
+		if (!recv->r_iwinc) {
+			atomic_dec(&rds_iw_allocation);
+			goto out;
+		}
+		INIT_LIST_HEAD(&recv->r_iwinc->ii_frags);
+		rds_inc_init(&recv->r_iwinc->ii_inc, conn, conn->c_faddr);
+	}
+
+	if (!recv->r_frag) {
+		recv->r_frag = kmem_cache_alloc(rds_iw_frag_slab, kptr_gfp);
+		if (!recv->r_frag)
+			goto out;
+		INIT_LIST_HEAD(&recv->r_frag->f_item);
+		recv->r_frag->f_page = NULL;
+	}
+
+	if (!ic->i_frag.f_page) {
+		ic->i_frag.f_page = alloc_page(page_gfp);
+		if (!ic->i_frag.f_page)
+			goto out;
+		ic->i_frag.f_offset = 0;
+	}
+
+	dma_addr = ib_dma_map_page(ic->i_cm_id->device,
+				  ic->i_frag.f_page,
+				  ic->i_frag.f_offset,
+				  RDS_FRAG_SIZE,
+				  DMA_FROM_DEVICE);
+	if (ib_dma_mapping_error(ic->i_cm_id->device, dma_addr))
+		goto out;
+
+	/*
+	 * Once we get the RDS_PAGE_LAST_OFF frag then rds_iw_frag_unmap()
+	 * must be called on this recv.  This happens as completions hit
+	 * in order or on connection shutdown.
+	 */
+	recv->r_frag->f_page = ic->i_frag.f_page;
+	recv->r_frag->f_offset = ic->i_frag.f_offset;
+	recv->r_frag->f_mapped = dma_addr;
+
+	sge = rds_iw_data_sge(ic, recv->r_sge);
+	sge->addr = dma_addr;
+	sge->length = RDS_FRAG_SIZE;
+
+	sge = rds_iw_header_sge(ic, recv->r_sge);
+	sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header);
+	sge->length = sizeof(struct rds_header);
+
+	get_page(recv->r_frag->f_page);
+
+	if (ic->i_frag.f_offset < RDS_PAGE_LAST_OFF) {
+		ic->i_frag.f_offset += RDS_FRAG_SIZE;
+	} else {
+		put_page(ic->i_frag.f_page);
+		ic->i_frag.f_page = NULL;
+		ic->i_frag.f_offset = 0;
+	}
+
+	ret = 0;
+out:
+	return ret;
+}
+
+/*
+ * This tries to allocate and post unused work requests after making sure that
+ * they have all the allocations they need to queue received fragments into
+ * sockets.  The i_recv_mutex is held here so that ring_alloc and _unalloc
+ * pairs don't go unmatched.
+ *
+ * -1 is returned if posting fails due to temporary resource exhaustion.
+ */
+int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
+		       gfp_t page_gfp, int prefill)
+{
+	struct rds_iw_connection *ic = conn->c_transport_data;
+	struct rds_iw_recv_work *recv;
+	struct ib_recv_wr *failed_wr;
+	unsigned int posted = 0;
+	int ret = 0;
+	u32 pos;
+
+	while ((prefill || rds_conn_up(conn)) &&
+	       rds_iw_ring_alloc(&ic->i_recv_ring, 1, &pos)) {
+		if (pos >= ic->i_recv_ring.w_nr) {
+			printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n",
+					pos);
+			ret = -EINVAL;
+			break;
+		}
+
+		recv = &ic->i_recvs[pos];
+		ret = rds_iw_recv_refill_one(conn, recv, kptr_gfp, page_gfp);
+		if (ret) {
+			ret = -1;
+			break;
+		}
+
+		/* XXX when can this fail? */
+		ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr);
+		rdsdebug("recv %p iwinc %p page %p addr %lu ret %d\n", recv,
+			 recv->r_iwinc, recv->r_frag->f_page,
+			 (long) recv->r_frag->f_mapped, ret);
+		if (ret) {
+			rds_iw_conn_error(conn, "recv post on "
+			       "%pI4 returned %d, disconnecting and "
+			       "reconnecting\n", &conn->c_faddr,
+			       ret);
+			ret = -1;
+			break;
+		}
+
+		posted++;
+	}
+
+	/* We're doing flow control - update the window. */
+	if (ic->i_flowctl && posted)
+		rds_iw_advertise_credits(conn, posted);
+
+	if (ret)
+		rds_iw_ring_unalloc(&ic->i_recv_ring, 1);
+	return ret;
+}
+
+static void rds_iw_inc_purge(struct rds_incoming *inc)
+{
+	struct rds_iw_incoming *iwinc;
+	struct rds_page_frag *frag;
+	struct rds_page_frag *pos;
+
+	iwinc = container_of(inc, struct rds_iw_incoming, ii_inc);
+	rdsdebug("purging iwinc %p inc %p\n", iwinc, inc);
+
+	list_for_each_entry_safe(frag, pos, &iwinc->ii_frags, f_item) {
+		list_del_init(&frag->f_item);
+		rds_iw_frag_drop_page(frag);
+		rds_iw_frag_free(frag);
+	}
+}
+
+void rds_iw_inc_free(struct rds_incoming *inc)
+{
+	struct rds_iw_incoming *iwinc;
+
+	iwinc = container_of(inc, struct rds_iw_incoming, ii_inc);
+
+	rds_iw_inc_purge(inc);
+	rdsdebug("freeing iwinc %p inc %p\n", iwinc, inc);
+	BUG_ON(!list_empty(&iwinc->ii_frags));
+	kmem_cache_free(rds_iw_incoming_slab, iwinc);
+	atomic_dec(&rds_iw_allocation);
+	BUG_ON(atomic_read(&rds_iw_allocation) < 0);
+}
+
+int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov,
+			    size_t size)
+{
+	struct rds_iw_incoming *iwinc;
+	struct rds_page_frag *frag;
+	struct iovec *iov = first_iov;
+	unsigned long to_copy;
+	unsigned long frag_off = 0;
+	unsigned long iov_off = 0;
+	int copied = 0;
+	int ret;
+	u32 len;
+
+	iwinc = container_of(inc, struct rds_iw_incoming, ii_inc);
+	frag = list_entry(iwinc->ii_frags.next, struct rds_page_frag, f_item);
+	len = be32_to_cpu(inc->i_hdr.h_len);
+
+	while (copied < size && copied < len) {
+		if (frag_off == RDS_FRAG_SIZE) {
+			frag = list_entry(frag->f_item.next,
+					  struct rds_page_frag, f_item);
+			frag_off = 0;
+		}
+		while (iov_off == iov->iov_len) {
+			iov_off = 0;
+			iov++;
+		}
+
+		to_copy = min(iov->iov_len - iov_off, RDS_FRAG_SIZE - frag_off);
+		to_copy = min_t(size_t, to_copy, size - copied);
+		to_copy = min_t(unsigned long, to_copy, len - copied);
+
+		rdsdebug("%lu bytes to user [%p, %zu] + %lu from frag "
+			 "[%p, %lu] + %lu\n",
+			 to_copy, iov->iov_base, iov->iov_len, iov_off,
+			 frag->f_page, frag->f_offset, frag_off);
+
+		/* XXX needs + offset for multiple recvs per page */
+		ret = rds_page_copy_to_user(frag->f_page,
+					    frag->f_offset + frag_off,
+					    iov->iov_base + iov_off,
+					    to_copy);
+		if (ret) {
+			copied = ret;
+			break;
+		}
+
+		iov_off += to_copy;
+		frag_off += to_copy;
+		copied += to_copy;
+	}
+
+	return copied;
+}
+
+/* ic starts out kzalloc()ed */
+void rds_iw_recv_init_ack(struct rds_iw_connection *ic)
+{
+	struct ib_send_wr *wr = &ic->i_ack_wr;
+	struct ib_sge *sge = &ic->i_ack_sge;
+
+	sge->addr = ic->i_ack_dma;
+	sge->length = sizeof(struct rds_header);
+	sge->lkey = rds_iw_local_dma_lkey(ic);
+
+	wr->sg_list = sge;
+	wr->num_sge = 1;
+	wr->opcode = IB_WR_SEND;
+	wr->wr_id = RDS_IW_ACK_WR_ID;
+	wr->send_flags = IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+}
+
+/*
+ * You'd think that with reliable IB connections you wouldn't need to ack
+ * messages that have been received.  The problem is that IB hardware generates
+ * an ack message before it has DMAed the message into memory.  This creates a
+ * potential message loss if the HCA is disabled for any reason between when it
+ * sends the ack and before the message is DMAed and processed.  This is only a
+ * potential issue if another HCA is available for fail-over.
+ *
+ * When the remote host receives our ack they'll free the sent message from
+ * their send queue.  To decrease the latency of this we always send an ack
+ * immediately after we've received messages.
+ *
+ * For simplicity, we only have one ack in flight at a time.  This puts
+ * pressure on senders to have deep enough send queues to absorb the latency of
+ * a single ack frame being in flight.  This might not be good enough.
+ *
+ * This is implemented by have a long-lived send_wr and sge which point to a
+ * statically allocated ack frame.  This ack wr does not fall under the ring
+ * accounting that the tx and rx wrs do.  The QP attribute specifically makes
+ * room for it beyond the ring size.  Send completion notices its special
+ * wr_id and avoids working with the ring in that case.
+ */
+#ifndef KERNEL_HAS_ATOMIC64
+static void rds_iw_set_ack(struct rds_iw_connection *ic, u64 seq,
+				int ack_required)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&ic->i_ack_lock, flags);
+	ic->i_ack_next = seq;
+	if (ack_required)
+		set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+	spin_unlock_irqrestore(&ic->i_ack_lock, flags);
+}
+
+static u64 rds_iw_get_ack(struct rds_iw_connection *ic)
+{
+	unsigned long flags;
+	u64 seq;
+
+	clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+
+	spin_lock_irqsave(&ic->i_ack_lock, flags);
+	seq = ic->i_ack_next;
+	spin_unlock_irqrestore(&ic->i_ack_lock, flags);
+
+	return seq;
+}
+#else
+static void rds_iw_set_ack(struct rds_iw_connection *ic, u64 seq,
+				int ack_required)
+{
+	atomic64_set(&ic->i_ack_next, seq);
+	if (ack_required) {
+		smp_mb__before_atomic();
+		set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+	}
+}
+
+static u64 rds_iw_get_ack(struct rds_iw_connection *ic)
+{
+	clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+	smp_mb__after_atomic();
+
+	return atomic64_read(&ic->i_ack_next);
+}
+#endif
+
+
+static void rds_iw_send_ack(struct rds_iw_connection *ic, unsigned int adv_credits)
+{
+	struct rds_header *hdr = ic->i_ack;
+	struct ib_send_wr *failed_wr;
+	u64 seq;
+	int ret;
+
+	seq = rds_iw_get_ack(ic);
+
+	rdsdebug("send_ack: ic %p ack %llu\n", ic, (unsigned long long) seq);
+	rds_message_populate_header(hdr, 0, 0, 0);
+	hdr->h_ack = cpu_to_be64(seq);
+	hdr->h_credit = adv_credits;
+	rds_message_make_checksum(hdr);
+	ic->i_ack_queued = jiffies;
+
+	ret = ib_post_send(ic->i_cm_id->qp, &ic->i_ack_wr, &failed_wr);
+	if (unlikely(ret)) {
+		/* Failed to send. Release the WR, and
+		 * force another ACK.
+		 */
+		clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
+		set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+
+		rds_iw_stats_inc(s_iw_ack_send_failure);
+
+		rds_iw_conn_error(ic->conn, "sending ack failed\n");
+	} else
+		rds_iw_stats_inc(s_iw_ack_sent);
+}
+
+/*
+ * There are 3 ways of getting acknowledgements to the peer:
+ *  1.	We call rds_iw_attempt_ack from the recv completion handler
+ *	to send an ACK-only frame.
+ *	However, there can be only one such frame in the send queue
+ *	at any time, so we may have to postpone it.
+ *  2.	When another (data) packet is transmitted while there's
+ *	an ACK in the queue, we piggyback the ACK sequence number
+ *	on the data packet.
+ *  3.	If the ACK WR is done sending, we get called from the
+ *	send queue completion handler, and check whether there's
+ *	another ACK pending (postponed because the WR was on the
+ *	queue). If so, we transmit it.
+ *
+ * We maintain 2 variables:
+ *  -	i_ack_flags, which keeps track of whether the ACK WR
+ *	is currently in the send queue or not (IB_ACK_IN_FLIGHT)
+ *  -	i_ack_next, which is the last sequence number we received
+ *
+ * Potentially, send queue and receive queue handlers can run concurrently.
+ * It would be nice to not have to use a spinlock to synchronize things,
+ * but the one problem that rules this out is that 64bit updates are
+ * not atomic on all platforms. Things would be a lot simpler if
+ * we had atomic64 or maybe cmpxchg64 everywhere.
+ *
+ * Reconnecting complicates this picture just slightly. When we
+ * reconnect, we may be seeing duplicate packets. The peer
+ * is retransmitting them, because it hasn't seen an ACK for
+ * them. It is important that we ACK these.
+ *
+ * ACK mitigation adds a header flag "ACK_REQUIRED"; any packet with
+ * this flag set *MUST* be acknowledged immediately.
+ */
+
+/*
+ * When we get here, we're called from the recv queue handler.
+ * Check whether we ought to transmit an ACK.
+ */
+void rds_iw_attempt_ack(struct rds_iw_connection *ic)
+{
+	unsigned int adv_credits;
+
+	if (!test_bit(IB_ACK_REQUESTED, &ic->i_ack_flags))
+		return;
+
+	if (test_and_set_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags)) {
+		rds_iw_stats_inc(s_iw_ack_send_delayed);
+		return;
+	}
+
+	/* Can we get a send credit? */
+	if (!rds_iw_send_grab_credits(ic, 1, &adv_credits, 0, RDS_MAX_ADV_CREDIT)) {
+		rds_iw_stats_inc(s_iw_tx_throttle);
+		clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
+		return;
+	}
+
+	clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+	rds_iw_send_ack(ic, adv_credits);
+}
+
+/*
+ * We get here from the send completion handler, when the
+ * adapter tells us the ACK frame was sent.
+ */
+void rds_iw_ack_send_complete(struct rds_iw_connection *ic)
+{
+	clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
+	rds_iw_attempt_ack(ic);
+}
+
+/*
+ * This is called by the regular xmit code when it wants to piggyback
+ * an ACK on an outgoing frame.
+ */
+u64 rds_iw_piggyb_ack(struct rds_iw_connection *ic)
+{
+	if (test_and_clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags))
+		rds_iw_stats_inc(s_iw_ack_send_piggybacked);
+	return rds_iw_get_ack(ic);
+}
+
+/*
+ * It's kind of lame that we're copying from the posted receive pages into
+ * long-lived bitmaps.  We could have posted the bitmaps and rdma written into
+ * them.  But receiving new congestion bitmaps should be a *rare* event, so
+ * hopefully we won't need to invest that complexity in making it more
+ * efficient.  By copying we can share a simpler core with TCP which has to
+ * copy.
+ */
+static void rds_iw_cong_recv(struct rds_connection *conn,
+			      struct rds_iw_incoming *iwinc)
+{
+	struct rds_cong_map *map;
+	unsigned int map_off;
+	unsigned int map_page;
+	struct rds_page_frag *frag;
+	unsigned long frag_off;
+	unsigned long to_copy;
+	unsigned long copied;
+	uint64_t uncongested = 0;
+	void *addr;
+
+	/* catch completely corrupt packets */
+	if (be32_to_cpu(iwinc->ii_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES)
+		return;
+
+	map = conn->c_fcong;
+	map_page = 0;
+	map_off = 0;
+
+	frag = list_entry(iwinc->ii_frags.next, struct rds_page_frag, f_item);
+	frag_off = 0;
+
+	copied = 0;
+
+	while (copied < RDS_CONG_MAP_BYTES) {
+		uint64_t *src, *dst;
+		unsigned int k;
+
+		to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off);
+		BUG_ON(to_copy & 7); /* Must be 64bit aligned. */
+
+		addr = kmap_atomic(frag->f_page);
+
+		src = addr + frag_off;
+		dst = (void *)map->m_page_addrs[map_page] + map_off;
+		for (k = 0; k < to_copy; k += 8) {
+			/* Record ports that became uncongested, ie
+			 * bits that changed from 0 to 1. */
+			uncongested |= ~(*src) & *dst;
+			*dst++ = *src++;
+		}
+		kunmap_atomic(addr);
+
+		copied += to_copy;
+
+		map_off += to_copy;
+		if (map_off == PAGE_SIZE) {
+			map_off = 0;
+			map_page++;
+		}
+
+		frag_off += to_copy;
+		if (frag_off == RDS_FRAG_SIZE) {
+			frag = list_entry(frag->f_item.next,
+					  struct rds_page_frag, f_item);
+			frag_off = 0;
+		}
+	}
+
+	/* the congestion map is in little endian order */
+	uncongested = le64_to_cpu(uncongested);
+
+	rds_cong_map_updated(map, uncongested);
+}
+
+/*
+ * Rings are posted with all the allocations they'll need to queue the
+ * incoming message to the receiving socket so this can't fail.
+ * All fragments start with a header, so we can make sure we're not receiving
+ * garbage, and we can tell a small 8 byte fragment from an ACK frame.
+ */
+struct rds_iw_ack_state {
+	u64		ack_next;
+	u64		ack_recv;
+	unsigned int	ack_required:1;
+	unsigned int	ack_next_valid:1;
+	unsigned int	ack_recv_valid:1;
+};
+
+static void rds_iw_process_recv(struct rds_connection *conn,
+				struct rds_iw_recv_work *recv, u32 byte_len,
+				struct rds_iw_ack_state *state)
+{
+	struct rds_iw_connection *ic = conn->c_transport_data;
+	struct rds_iw_incoming *iwinc = ic->i_iwinc;
+	struct rds_header *ihdr, *hdr;
+
+	/* XXX shut down the connection if port 0,0 are seen? */
+
+	rdsdebug("ic %p iwinc %p recv %p byte len %u\n", ic, iwinc, recv,
+		 byte_len);
+
+	if (byte_len < sizeof(struct rds_header)) {
+		rds_iw_conn_error(conn, "incoming message "
+		       "from %pI4 didn't include a "
+		       "header, disconnecting and "
+		       "reconnecting\n",
+		       &conn->c_faddr);
+		return;
+	}
+	byte_len -= sizeof(struct rds_header);
+
+	ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs];
+
+	/* Validate the checksum. */
+	if (!rds_message_verify_checksum(ihdr)) {
+		rds_iw_conn_error(conn, "incoming message "
+		       "from %pI4 has corrupted header - "
+		       "forcing a reconnect\n",
+		       &conn->c_faddr);
+		rds_stats_inc(s_recv_drop_bad_checksum);
+		return;
+	}
+
+	/* Process the ACK sequence which comes with every packet */
+	state->ack_recv = be64_to_cpu(ihdr->h_ack);
+	state->ack_recv_valid = 1;
+
+	/* Process the credits update if there was one */
+	if (ihdr->h_credit)
+		rds_iw_send_add_credits(conn, ihdr->h_credit);
+
+	if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && byte_len == 0) {
+		/* This is an ACK-only packet. The fact that it gets
+		 * special treatment here is that historically, ACKs
+		 * were rather special beasts.
+		 */
+		rds_iw_stats_inc(s_iw_ack_received);
+
+		/*
+		 * Usually the frags make their way on to incs and are then freed as
+		 * the inc is freed.  We don't go that route, so we have to drop the
+		 * page ref ourselves.  We can't just leave the page on the recv
+		 * because that confuses the dma mapping of pages and each recv's use
+		 * of a partial page.  We can leave the frag, though, it will be
+		 * reused.
+		 *
+		 * FIXME: Fold this into the code path below.
+		 */
+		rds_iw_frag_drop_page(recv->r_frag);
+		return;
+	}
+
+	/*
+	 * If we don't already have an inc on the connection then this
+	 * fragment has a header and starts a message.. copy its header
+	 * into the inc and save the inc so we can hang upcoming fragments
+	 * off its list.
+	 */
+	if (!iwinc) {
+		iwinc = recv->r_iwinc;
+		recv->r_iwinc = NULL;
+		ic->i_iwinc = iwinc;
+
+		hdr = &iwinc->ii_inc.i_hdr;
+		memcpy(hdr, ihdr, sizeof(*hdr));
+		ic->i_recv_data_rem = be32_to_cpu(hdr->h_len);
+
+		rdsdebug("ic %p iwinc %p rem %u flag 0x%x\n", ic, iwinc,
+			 ic->i_recv_data_rem, hdr->h_flags);
+	} else {
+		hdr = &iwinc->ii_inc.i_hdr;
+		/* We can't just use memcmp here; fragments of a
+		 * single message may carry different ACKs */
+		if (hdr->h_sequence != ihdr->h_sequence ||
+		    hdr->h_len != ihdr->h_len ||
+		    hdr->h_sport != ihdr->h_sport ||
+		    hdr->h_dport != ihdr->h_dport) {
+			rds_iw_conn_error(conn,
+				"fragment header mismatch; forcing reconnect\n");
+			return;
+		}
+	}
+
+	list_add_tail(&recv->r_frag->f_item, &iwinc->ii_frags);
+	recv->r_frag = NULL;
+
+	if (ic->i_recv_data_rem > RDS_FRAG_SIZE)
+		ic->i_recv_data_rem -= RDS_FRAG_SIZE;
+	else {
+		ic->i_recv_data_rem = 0;
+		ic->i_iwinc = NULL;
+
+		if (iwinc->ii_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP)
+			rds_iw_cong_recv(conn, iwinc);
+		else {
+			rds_recv_incoming(conn, conn->c_faddr, conn->c_laddr,
+					  &iwinc->ii_inc, GFP_ATOMIC);
+			state->ack_next = be64_to_cpu(hdr->h_sequence);
+			state->ack_next_valid = 1;
+		}
+
+		/* Evaluate the ACK_REQUIRED flag *after* we received
+		 * the complete frame, and after bumping the next_rx
+		 * sequence. */
+		if (hdr->h_flags & RDS_FLAG_ACK_REQUIRED) {
+			rds_stats_inc(s_recv_ack_required);
+			state->ack_required = 1;
+		}
+
+		rds_inc_put(&iwinc->ii_inc);
+	}
+}
+
+/*
+ * Plucking the oldest entry from the ring can be done concurrently with
+ * the thread refilling the ring.  Each ring operation is protected by
+ * spinlocks and the transient state of refilling doesn't change the
+ * recording of which entry is oldest.
+ *
+ * This relies on IB only calling one cq comp_handler for each cq so that
+ * there will only be one caller of rds_recv_incoming() per RDS connection.
+ */
+void rds_iw_recv_cq_comp_handler(struct ib_cq *cq, void *context)
+{
+	struct rds_connection *conn = context;
+	struct rds_iw_connection *ic = conn->c_transport_data;
+
+	rdsdebug("conn %p cq %p\n", conn, cq);
+
+	rds_iw_stats_inc(s_iw_rx_cq_call);
+
+	tasklet_schedule(&ic->i_recv_tasklet);
+}
+
+static inline void rds_poll_cq(struct rds_iw_connection *ic,
+			       struct rds_iw_ack_state *state)
+{
+	struct rds_connection *conn = ic->conn;
+	struct ib_wc wc;
+	struct rds_iw_recv_work *recv;
+
+	while (ib_poll_cq(ic->i_recv_cq, 1, &wc) > 0) {
+		rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
+			 (unsigned long long)wc.wr_id, wc.status, wc.byte_len,
+			 be32_to_cpu(wc.ex.imm_data));
+		rds_iw_stats_inc(s_iw_rx_cq_event);
+
+		recv = &ic->i_recvs[rds_iw_ring_oldest(&ic->i_recv_ring)];
+
+		rds_iw_recv_unmap_page(ic, recv);
+
+		/*
+		 * Also process recvs in connecting state because it is possible
+		 * to get a recv completion _before_ the rdmacm ESTABLISHED
+		 * event is processed.
+		 */
+		if (rds_conn_up(conn) || rds_conn_connecting(conn)) {
+			/* We expect errors as the qp is drained during shutdown */
+			if (wc.status == IB_WC_SUCCESS) {
+				rds_iw_process_recv(conn, recv, wc.byte_len, state);
+			} else {
+				rds_iw_conn_error(conn, "recv completion on "
+				       "%pI4 had status %u, disconnecting and "
+				       "reconnecting\n", &conn->c_faddr,
+				       wc.status);
+			}
+		}
+
+		rds_iw_ring_free(&ic->i_recv_ring, 1);
+	}
+}
+
+void rds_iw_recv_tasklet_fn(unsigned long data)
+{
+	struct rds_iw_connection *ic = (struct rds_iw_connection *) data;
+	struct rds_connection *conn = ic->conn;
+	struct rds_iw_ack_state state = { 0, };
+
+	rds_poll_cq(ic, &state);
+	ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED);
+	rds_poll_cq(ic, &state);
+
+	if (state.ack_next_valid)
+		rds_iw_set_ack(ic, state.ack_next, state.ack_required);
+	if (state.ack_recv_valid && state.ack_recv > ic->i_ack_recv) {
+		rds_send_drop_acked(conn, state.ack_recv, NULL);
+		ic->i_ack_recv = state.ack_recv;
+	}
+	if (rds_conn_up(conn))
+		rds_iw_attempt_ack(ic);
+
+	/* If we ever end up with a really empty receive ring, we're
+	 * in deep trouble, as the sender will definitely see RNR
+	 * timeouts. */
+	if (rds_iw_ring_empty(&ic->i_recv_ring))
+		rds_iw_stats_inc(s_iw_rx_ring_empty);
+
+	/*
+	 * If the ring is running low, then schedule the thread to refill.
+	 */
+	if (rds_iw_ring_low(&ic->i_recv_ring))
+		queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
+}
+
+int rds_iw_recv(struct rds_connection *conn)
+{
+	struct rds_iw_connection *ic = conn->c_transport_data;
+	int ret = 0;
+
+	rdsdebug("conn %p\n", conn);
+
+	/*
+	 * If we get a temporary posting failure in this context then
+	 * we're really low and we want the caller to back off for a bit.
+	 */
+	mutex_lock(&ic->i_recv_mutex);
+	if (rds_iw_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 0))
+		ret = -ENOMEM;
+	else
+		rds_iw_stats_inc(s_iw_rx_refill_from_thread);
+	mutex_unlock(&ic->i_recv_mutex);
+
+	if (rds_conn_up(conn))
+		rds_iw_attempt_ack(ic);
+
+	return ret;
+}
+
+int rds_iw_recv_init(void)
+{
+	struct sysinfo si;
+	int ret = -ENOMEM;
+
+	/* Default to 30% of all available RAM for recv memory */
+	si_meminfo(&si);
+	rds_iw_sysctl_max_recv_allocation = si.totalram / 3 * PAGE_SIZE / RDS_FRAG_SIZE;
+
+	rds_iw_incoming_slab = kmem_cache_create("rds_iw_incoming",
+					sizeof(struct rds_iw_incoming),
+					0, 0, NULL);
+	if (!rds_iw_incoming_slab)
+		goto out;
+
+	rds_iw_frag_slab = kmem_cache_create("rds_iw_frag",
+					sizeof(struct rds_page_frag),
+					0, 0, NULL);
+	if (!rds_iw_frag_slab)
+		kmem_cache_destroy(rds_iw_incoming_slab);
+	else
+		ret = 0;
+out:
+	return ret;
+}
+
+void rds_iw_recv_exit(void)
+{
+	kmem_cache_destroy(rds_iw_incoming_slab);
+	kmem_cache_destroy(rds_iw_frag_slab);
+}
diff --git a/net/rds/iw_ring.c b/net/rds/iw_ring.c
new file mode 100644
index 0000000..da8e3b6
--- /dev/null
+++ b/net/rds/iw_ring.c
@@ -0,0 +1,169 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+
+#include "rds.h"
+#include "iw.h"
+
+/*
+ * Locking for IB rings.
+ * We assume that allocation is always protected by a mutex
+ * in the caller (this is a valid assumption for the current
+ * implementation).
+ *
+ * Freeing always happens in an interrupt, and hence only
+ * races with allocations, but not with other free()s.
+ *
+ * The interaction between allocation and freeing is that
+ * the alloc code has to determine the number of free entries.
+ * To this end, we maintain two counters; an allocation counter
+ * and a free counter. Both are allowed to run freely, and wrap
+ * around.
+ * The number of used entries is always (alloc_ctr - free_ctr) % NR.
+ *
+ * The current implementation makes free_ctr atomic. When the
+ * caller finds an allocation fails, it should set an "alloc fail"
+ * bit and retry the allocation. The "alloc fail" bit essentially tells
+ * the CQ completion handlers to wake it up after freeing some
+ * more entries.
+ */
+
+/*
+ * This only happens on shutdown.
+ */
+DECLARE_WAIT_QUEUE_HEAD(rds_iw_ring_empty_wait);
+
+void rds_iw_ring_init(struct rds_iw_work_ring *ring, u32 nr)
+{
+	memset(ring, 0, sizeof(*ring));
+	ring->w_nr = nr;
+	rdsdebug("ring %p nr %u\n", ring, ring->w_nr);
+}
+
+static inline u32 __rds_iw_ring_used(struct rds_iw_work_ring *ring)
+{
+	u32 diff;
+
+	/* This assumes that atomic_t has at least as many bits as u32 */
+	diff = ring->w_alloc_ctr - (u32) atomic_read(&ring->w_free_ctr);
+	BUG_ON(diff > ring->w_nr);
+
+	return diff;
+}
+
+void rds_iw_ring_resize(struct rds_iw_work_ring *ring, u32 nr)
+{
+	/* We only ever get called from the connection setup code,
+	 * prior to creating the QP. */
+	BUG_ON(__rds_iw_ring_used(ring));
+	ring->w_nr = nr;
+}
+
+static int __rds_iw_ring_empty(struct rds_iw_work_ring *ring)
+{
+	return __rds_iw_ring_used(ring) == 0;
+}
+
+u32 rds_iw_ring_alloc(struct rds_iw_work_ring *ring, u32 val, u32 *pos)
+{
+	u32 ret = 0, avail;
+
+	avail = ring->w_nr - __rds_iw_ring_used(ring);
+
+	rdsdebug("ring %p val %u next %u free %u\n", ring, val,
+		 ring->w_alloc_ptr, avail);
+
+	if (val && avail) {
+		ret = min(val, avail);
+		*pos = ring->w_alloc_ptr;
+
+		ring->w_alloc_ptr = (ring->w_alloc_ptr + ret) % ring->w_nr;
+		ring->w_alloc_ctr += ret;
+	}
+
+	return ret;
+}
+
+void rds_iw_ring_free(struct rds_iw_work_ring *ring, u32 val)
+{
+	ring->w_free_ptr = (ring->w_free_ptr + val) % ring->w_nr;
+	atomic_add(val, &ring->w_free_ctr);
+
+	if (__rds_iw_ring_empty(ring) &&
+	    waitqueue_active(&rds_iw_ring_empty_wait))
+		wake_up(&rds_iw_ring_empty_wait);
+}
+
+void rds_iw_ring_unalloc(struct rds_iw_work_ring *ring, u32 val)
+{
+	ring->w_alloc_ptr = (ring->w_alloc_ptr - val) % ring->w_nr;
+	ring->w_alloc_ctr -= val;
+}
+
+int rds_iw_ring_empty(struct rds_iw_work_ring *ring)
+{
+	return __rds_iw_ring_empty(ring);
+}
+
+int rds_iw_ring_low(struct rds_iw_work_ring *ring)
+{
+	return __rds_iw_ring_used(ring) <= (ring->w_nr >> 1);
+}
+
+
+/*
+ * returns the oldest alloced ring entry.  This will be the next one
+ * freed.  This can't be called if there are none allocated.
+ */
+u32 rds_iw_ring_oldest(struct rds_iw_work_ring *ring)
+{
+	return ring->w_free_ptr;
+}
+
+/*
+ * returns the number of completed work requests.
+ */
+
+u32 rds_iw_ring_completed(struct rds_iw_work_ring *ring, u32 wr_id, u32 oldest)
+{
+	u32 ret;
+
+	if (oldest <= (unsigned long long)wr_id)
+		ret = (unsigned long long)wr_id - oldest + 1;
+	else
+		ret = ring->w_nr - oldest + (unsigned long long)wr_id + 1;
+
+	rdsdebug("ring %p ret %u wr_id %u oldest %u\n", ring, ret,
+		 wr_id, oldest);
+	return ret;
+}
diff --git a/net/rds/iw_send.c b/net/rds/iw_send.c
new file mode 100644
index 0000000..9105ea0
--- /dev/null
+++ b/net/rds/iw_send.c
@@ -0,0 +1,974 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/in.h>
+#include <linux/device.h>
+#include <linux/dmapool.h>
+#include <linux/ratelimit.h>
+
+#include "rds.h"
+#include "iw.h"
+
+static void rds_iw_send_rdma_complete(struct rds_message *rm,
+				      int wc_status)
+{
+	int notify_status;
+
+	switch (wc_status) {
+	case IB_WC_WR_FLUSH_ERR:
+		return;
+
+	case IB_WC_SUCCESS:
+		notify_status = RDS_RDMA_SUCCESS;
+		break;
+
+	case IB_WC_REM_ACCESS_ERR:
+		notify_status = RDS_RDMA_REMOTE_ERROR;
+		break;
+
+	default:
+		notify_status = RDS_RDMA_OTHER_ERROR;
+		break;
+	}
+	rds_rdma_send_complete(rm, notify_status);
+}
+
+static void rds_iw_send_unmap_rdma(struct rds_iw_connection *ic,
+				   struct rm_rdma_op *op)
+{
+	if (op->op_mapped) {
+		ib_dma_unmap_sg(ic->i_cm_id->device,
+			op->op_sg, op->op_nents,
+			op->op_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+		op->op_mapped = 0;
+	}
+}
+
+static void rds_iw_send_unmap_rm(struct rds_iw_connection *ic,
+			  struct rds_iw_send_work *send,
+			  int wc_status)
+{
+	struct rds_message *rm = send->s_rm;
+
+	rdsdebug("ic %p send %p rm %p\n", ic, send, rm);
+
+	ib_dma_unmap_sg(ic->i_cm_id->device,
+		     rm->data.op_sg, rm->data.op_nents,
+		     DMA_TO_DEVICE);
+
+	if (rm->rdma.op_active) {
+		rds_iw_send_unmap_rdma(ic, &rm->rdma);
+
+		/* If the user asked for a completion notification on this
+		 * message, we can implement three different semantics:
+		 *  1.	Notify when we received the ACK on the RDS message
+		 *	that was queued with the RDMA. This provides reliable
+		 *	notification of RDMA status at the expense of a one-way
+		 *	packet delay.
+		 *  2.	Notify when the IB stack gives us the completion event for
+		 *	the RDMA operation.
+		 *  3.	Notify when the IB stack gives us the completion event for
+		 *	the accompanying RDS messages.
+		 * Here, we implement approach #3. To implement approach #2,
+		 * call rds_rdma_send_complete from the cq_handler. To implement #1,
+		 * don't call rds_rdma_send_complete at all, and fall back to the notify
+		 * handling in the ACK processing code.
+		 *
+		 * Note: There's no need to explicitly sync any RDMA buffers using
+		 * ib_dma_sync_sg_for_cpu - the completion for the RDMA
+		 * operation itself unmapped the RDMA buffers, which takes care
+		 * of synching.
+		 */
+		rds_iw_send_rdma_complete(rm, wc_status);
+
+		if (rm->rdma.op_write)
+			rds_stats_add(s_send_rdma_bytes, rm->rdma.op_bytes);
+		else
+			rds_stats_add(s_recv_rdma_bytes, rm->rdma.op_bytes);
+	}
+
+	/* If anyone waited for this message to get flushed out, wake
+	 * them up now */
+	rds_message_unmapped(rm);
+
+	rds_message_put(rm);
+	send->s_rm = NULL;
+}
+
+void rds_iw_send_init_ring(struct rds_iw_connection *ic)
+{
+	struct rds_iw_send_work *send;
+	u32 i;
+
+	for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
+		struct ib_sge *sge;
+
+		send->s_rm = NULL;
+		send->s_op = NULL;
+		send->s_mapping = NULL;
+
+		send->s_wr.next = NULL;
+		send->s_wr.wr_id = i;
+		send->s_wr.sg_list = send->s_sge;
+		send->s_wr.num_sge = 1;
+		send->s_wr.opcode = IB_WR_SEND;
+		send->s_wr.send_flags = 0;
+		send->s_wr.ex.imm_data = 0;
+
+		sge = rds_iw_data_sge(ic, send->s_sge);
+		sge->lkey = 0;
+
+		sge = rds_iw_header_sge(ic, send->s_sge);
+		sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header));
+		sge->length = sizeof(struct rds_header);
+		sge->lkey = 0;
+
+		send->s_mr = ib_alloc_fast_reg_mr(ic->i_pd, fastreg_message_size);
+		if (IS_ERR(send->s_mr)) {
+			printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_mr failed\n");
+			break;
+		}
+
+		send->s_page_list = ib_alloc_fast_reg_page_list(
+			ic->i_cm_id->device, fastreg_message_size);
+		if (IS_ERR(send->s_page_list)) {
+			printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_page_list failed\n");
+			break;
+		}
+	}
+}
+
+void rds_iw_send_clear_ring(struct rds_iw_connection *ic)
+{
+	struct rds_iw_send_work *send;
+	u32 i;
+
+	for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
+		BUG_ON(!send->s_mr);
+		ib_dereg_mr(send->s_mr);
+		BUG_ON(!send->s_page_list);
+		ib_free_fast_reg_page_list(send->s_page_list);
+		if (send->s_wr.opcode == 0xdead)
+			continue;
+		if (send->s_rm)
+			rds_iw_send_unmap_rm(ic, send, IB_WC_WR_FLUSH_ERR);
+		if (send->s_op)
+			rds_iw_send_unmap_rdma(ic, send->s_op);
+	}
+}
+
+/*
+ * The _oldest/_free ring operations here race cleanly with the alloc/unalloc
+ * operations performed in the send path.  As the sender allocs and potentially
+ * unallocs the next free entry in the ring it doesn't alter which is
+ * the next to be freed, which is what this is concerned with.
+ */
+void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context)
+{
+	struct rds_connection *conn = context;
+	struct rds_iw_connection *ic = conn->c_transport_data;
+	struct ib_wc wc;
+	struct rds_iw_send_work *send;
+	u32 completed;
+	u32 oldest;
+	u32 i;
+	int ret;
+
+	rdsdebug("cq %p conn %p\n", cq, conn);
+	rds_iw_stats_inc(s_iw_tx_cq_call);
+	ret = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+	if (ret)
+		rdsdebug("ib_req_notify_cq send failed: %d\n", ret);
+
+	while (ib_poll_cq(cq, 1, &wc) > 0) {
+		rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
+			 (unsigned long long)wc.wr_id, wc.status, wc.byte_len,
+			 be32_to_cpu(wc.ex.imm_data));
+		rds_iw_stats_inc(s_iw_tx_cq_event);
+
+		if (wc.status != IB_WC_SUCCESS) {
+			printk(KERN_ERR "WC Error:  status = %d opcode = %d\n", wc.status, wc.opcode);
+			break;
+		}
+
+		if (wc.opcode == IB_WC_LOCAL_INV && wc.wr_id == RDS_IW_LOCAL_INV_WR_ID) {
+			ic->i_fastreg_posted = 0;
+			continue;
+		}
+
+		if (wc.opcode == IB_WC_FAST_REG_MR && wc.wr_id == RDS_IW_FAST_REG_WR_ID) {
+			ic->i_fastreg_posted = 1;
+			continue;
+		}
+
+		if (wc.wr_id == RDS_IW_ACK_WR_ID) {
+			if (time_after(jiffies, ic->i_ack_queued + HZ/2))
+				rds_iw_stats_inc(s_iw_tx_stalled);
+			rds_iw_ack_send_complete(ic);
+			continue;
+		}
+
+		oldest = rds_iw_ring_oldest(&ic->i_send_ring);
+
+		completed = rds_iw_ring_completed(&ic->i_send_ring, wc.wr_id, oldest);
+
+		for (i = 0; i < completed; i++) {
+			send = &ic->i_sends[oldest];
+
+			/* In the error case, wc.opcode sometimes contains garbage */
+			switch (send->s_wr.opcode) {
+			case IB_WR_SEND:
+				if (send->s_rm)
+					rds_iw_send_unmap_rm(ic, send, wc.status);
+				break;
+			case IB_WR_FAST_REG_MR:
+			case IB_WR_RDMA_WRITE:
+			case IB_WR_RDMA_READ:
+			case IB_WR_RDMA_READ_WITH_INV:
+				/* Nothing to be done - the SG list will be unmapped
+				 * when the SEND completes. */
+				break;
+			default:
+				printk_ratelimited(KERN_NOTICE
+						"RDS/IW: %s: unexpected opcode 0x%x in WR!\n",
+						__func__, send->s_wr.opcode);
+				break;
+			}
+
+			send->s_wr.opcode = 0xdead;
+			send->s_wr.num_sge = 1;
+			if (time_after(jiffies, send->s_queued + HZ/2))
+				rds_iw_stats_inc(s_iw_tx_stalled);
+
+			/* If a RDMA operation produced an error, signal this right
+			 * away. If we don't, the subsequent SEND that goes with this
+			 * RDMA will be canceled with ERR_WFLUSH, and the application
+			 * never learn that the RDMA failed. */
+			if (unlikely(wc.status == IB_WC_REM_ACCESS_ERR && send->s_op)) {
+				struct rds_message *rm;
+
+				rm = rds_send_get_message(conn, send->s_op);
+				if (rm)
+					rds_iw_send_rdma_complete(rm, wc.status);
+			}
+
+			oldest = (oldest + 1) % ic->i_send_ring.w_nr;
+		}
+
+		rds_iw_ring_free(&ic->i_send_ring, completed);
+
+		if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) ||
+		    test_bit(0, &conn->c_map_queued))
+			queue_delayed_work(rds_wq, &conn->c_send_w, 0);
+
+		/* We expect errors as the qp is drained during shutdown */
+		if (wc.status != IB_WC_SUCCESS && rds_conn_up(conn)) {
+			rds_iw_conn_error(conn,
+				"send completion on %pI4 "
+				"had status %u, disconnecting and reconnecting\n",
+				&conn->c_faddr, wc.status);
+		}
+	}
+}
+
+/*
+ * This is the main function for allocating credits when sending
+ * messages.
+ *
+ * Conceptually, we have two counters:
+ *  -	send credits: this tells us how many WRs we're allowed
+ *	to submit without overruning the receiver's queue. For
+ *	each SEND WR we post, we decrement this by one.
+ *
+ *  -	posted credits: this tells us how many WRs we recently
+ *	posted to the receive queue. This value is transferred
+ *	to the peer as a "credit update" in a RDS header field.
+ *	Every time we transmit credits to the peer, we subtract
+ *	the amount of transferred credits from this counter.
+ *
+ * It is essential that we avoid situations where both sides have
+ * exhausted their send credits, and are unable to send new credits
+ * to the peer. We achieve this by requiring that we send at least
+ * one credit update to the peer before exhausting our credits.
+ * When new credits arrive, we subtract one credit that is withheld
+ * until we've posted new buffers and are ready to transmit these
+ * credits (see rds_iw_send_add_credits below).
+ *
+ * The RDS send code is essentially single-threaded; rds_send_xmit
+ * grabs c_send_lock to ensure exclusive access to the send ring.
+ * However, the ACK sending code is independent and can race with
+ * message SENDs.
+ *
+ * In the send path, we need to update the counters for send credits
+ * and the counter of posted buffers atomically - when we use the
+ * last available credit, we cannot allow another thread to race us
+ * and grab the posted credits counter.  Hence, we have to use a
+ * spinlock to protect the credit counter, or use atomics.
+ *
+ * Spinlocks shared between the send and the receive path are bad,
+ * because they create unnecessary delays. An early implementation
+ * using a spinlock showed a 5% degradation in throughput at some
+ * loads.
+ *
+ * This implementation avoids spinlocks completely, putting both
+ * counters into a single atomic, and updating that atomic using
+ * atomic_add (in the receive path, when receiving fresh credits),
+ * and using atomic_cmpxchg when updating the two counters.
+ */
+int rds_iw_send_grab_credits(struct rds_iw_connection *ic,
+			     u32 wanted, u32 *adv_credits, int need_posted, int max_posted)
+{
+	unsigned int avail, posted, got = 0, advertise;
+	long oldval, newval;
+
+	*adv_credits = 0;
+	if (!ic->i_flowctl)
+		return wanted;
+
+try_again:
+	advertise = 0;
+	oldval = newval = atomic_read(&ic->i_credits);
+	posted = IB_GET_POST_CREDITS(oldval);
+	avail = IB_GET_SEND_CREDITS(oldval);
+
+	rdsdebug("rds_iw_send_grab_credits(%u): credits=%u posted=%u\n",
+			wanted, avail, posted);
+
+	/* The last credit must be used to send a credit update. */
+	if (avail && !posted)
+		avail--;
+
+	if (avail < wanted) {
+		struct rds_connection *conn = ic->i_cm_id->context;
+
+		/* Oops, there aren't that many credits left! */
+		set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
+		got = avail;
+	} else {
+		/* Sometimes you get what you want, lalala. */
+		got = wanted;
+	}
+	newval -= IB_SET_SEND_CREDITS(got);
+
+	/*
+	 * If need_posted is non-zero, then the caller wants
+	 * the posted regardless of whether any send credits are
+	 * available.
+	 */
+	if (posted && (got || need_posted)) {
+		advertise = min_t(unsigned int, posted, max_posted);
+		newval -= IB_SET_POST_CREDITS(advertise);
+	}
+
+	/* Finally bill everything */
+	if (atomic_cmpxchg(&ic->i_credits, oldval, newval) != oldval)
+		goto try_again;
+
+	*adv_credits = advertise;
+	return got;
+}
+
+void rds_iw_send_add_credits(struct rds_connection *conn, unsigned int credits)
+{
+	struct rds_iw_connection *ic = conn->c_transport_data;
+
+	if (credits == 0)
+		return;
+
+	rdsdebug("rds_iw_send_add_credits(%u): current=%u%s\n",
+			credits,
+			IB_GET_SEND_CREDITS(atomic_read(&ic->i_credits)),
+			test_bit(RDS_LL_SEND_FULL, &conn->c_flags) ? ", ll_send_full" : "");
+
+	atomic_add(IB_SET_SEND_CREDITS(credits), &ic->i_credits);
+	if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags))
+		queue_delayed_work(rds_wq, &conn->c_send_w, 0);
+
+	WARN_ON(IB_GET_SEND_CREDITS(credits) >= 16384);
+
+	rds_iw_stats_inc(s_iw_rx_credit_updates);
+}
+
+void rds_iw_advertise_credits(struct rds_connection *conn, unsigned int posted)
+{
+	struct rds_iw_connection *ic = conn->c_transport_data;
+
+	if (posted == 0)
+		return;
+
+	atomic_add(IB_SET_POST_CREDITS(posted), &ic->i_credits);
+
+	/* Decide whether to send an update to the peer now.
+	 * If we would send a credit update for every single buffer we
+	 * post, we would end up with an ACK storm (ACK arrives,
+	 * consumes buffer, we refill the ring, send ACK to remote
+	 * advertising the newly posted buffer... ad inf)
+	 *
+	 * Performance pretty much depends on how often we send
+	 * credit updates - too frequent updates mean lots of ACKs.
+	 * Too infrequent updates, and the peer will run out of
+	 * credits and has to throttle.
+	 * For the time being, 16 seems to be a good compromise.
+	 */
+	if (IB_GET_POST_CREDITS(atomic_read(&ic->i_credits)) >= 16)
+		set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
+}
+
+static inline void
+rds_iw_xmit_populate_wr(struct rds_iw_connection *ic,
+		struct rds_iw_send_work *send, unsigned int pos,
+		unsigned long buffer, unsigned int length,
+		int send_flags)
+{
+	struct ib_sge *sge;
+
+	WARN_ON(pos != send - ic->i_sends);
+
+	send->s_wr.send_flags = send_flags;
+	send->s_wr.opcode = IB_WR_SEND;
+	send->s_wr.num_sge = 2;
+	send->s_wr.next = NULL;
+	send->s_queued = jiffies;
+	send->s_op = NULL;
+
+	if (length != 0) {
+		sge = rds_iw_data_sge(ic, send->s_sge);
+		sge->addr = buffer;
+		sge->length = length;
+		sge->lkey = rds_iw_local_dma_lkey(ic);
+
+		sge = rds_iw_header_sge(ic, send->s_sge);
+	} else {
+		/* We're sending a packet with no payload. There is only
+		 * one SGE */
+		send->s_wr.num_sge = 1;
+		sge = &send->s_sge[0];
+	}
+
+	sge->addr = ic->i_send_hdrs_dma + (pos * sizeof(struct rds_header));
+	sge->length = sizeof(struct rds_header);
+	sge->lkey = rds_iw_local_dma_lkey(ic);
+}
+
+/*
+ * This can be called multiple times for a given message.  The first time
+ * we see a message we map its scatterlist into the IB device so that
+ * we can provide that mapped address to the IB scatter gather entries
+ * in the IB work requests.  We translate the scatterlist into a series
+ * of work requests that fragment the message.  These work requests complete
+ * in order so we pass ownership of the message to the completion handler
+ * once we send the final fragment.
+ *
+ * The RDS core uses the c_send_lock to only enter this function once
+ * per connection.  This makes sure that the tx ring alloc/unalloc pairs
+ * don't get out of sync and confuse the ring.
+ */
+int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
+		unsigned int hdr_off, unsigned int sg, unsigned int off)
+{
+	struct rds_iw_connection *ic = conn->c_transport_data;
+	struct ib_device *dev = ic->i_cm_id->device;
+	struct rds_iw_send_work *send = NULL;
+	struct rds_iw_send_work *first;
+	struct rds_iw_send_work *prev;
+	struct ib_send_wr *failed_wr;
+	struct scatterlist *scat;
+	u32 pos;
+	u32 i;
+	u32 work_alloc;
+	u32 credit_alloc;
+	u32 posted;
+	u32 adv_credits = 0;
+	int send_flags = 0;
+	int sent;
+	int ret;
+	int flow_controlled = 0;
+
+	BUG_ON(off % RDS_FRAG_SIZE);
+	BUG_ON(hdr_off != 0 && hdr_off != sizeof(struct rds_header));
+
+	/* Fastreg support */
+	if (rds_rdma_cookie_key(rm->m_rdma_cookie) && !ic->i_fastreg_posted) {
+		ret = -EAGAIN;
+		goto out;
+	}
+
+	/* FIXME we may overallocate here */
+	if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0)
+		i = 1;
+	else
+		i = ceil(be32_to_cpu(rm->m_inc.i_hdr.h_len), RDS_FRAG_SIZE);
+
+	work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, i, &pos);
+	if (work_alloc == 0) {
+		set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
+		rds_iw_stats_inc(s_iw_tx_ring_full);
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	credit_alloc = work_alloc;
+	if (ic->i_flowctl) {
+		credit_alloc = rds_iw_send_grab_credits(ic, work_alloc, &posted, 0, RDS_MAX_ADV_CREDIT);
+		adv_credits += posted;
+		if (credit_alloc < work_alloc) {
+			rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc - credit_alloc);
+			work_alloc = credit_alloc;
+			flow_controlled++;
+		}
+		if (work_alloc == 0) {
+			set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
+			rds_iw_stats_inc(s_iw_tx_throttle);
+			ret = -ENOMEM;
+			goto out;
+		}
+	}
+
+	/* map the message the first time we see it */
+	if (!ic->i_rm) {
+		/*
+		printk(KERN_NOTICE "rds_iw_xmit prep msg dport=%u flags=0x%x len=%d\n",
+				be16_to_cpu(rm->m_inc.i_hdr.h_dport),
+				rm->m_inc.i_hdr.h_flags,
+				be32_to_cpu(rm->m_inc.i_hdr.h_len));
+		   */
+		if (rm->data.op_nents) {
+			rm->data.op_count = ib_dma_map_sg(dev,
+							  rm->data.op_sg,
+							  rm->data.op_nents,
+							  DMA_TO_DEVICE);
+			rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->data.op_count);
+			if (rm->data.op_count == 0) {
+				rds_iw_stats_inc(s_iw_tx_sg_mapping_failure);
+				rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
+				ret = -ENOMEM; /* XXX ? */
+				goto out;
+			}
+		} else {
+			rm->data.op_count = 0;
+		}
+
+		ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs;
+		ic->i_unsignaled_bytes = rds_iw_sysctl_max_unsig_bytes;
+		rds_message_addref(rm);
+		ic->i_rm = rm;
+
+		/* Finalize the header */
+		if (test_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags))
+			rm->m_inc.i_hdr.h_flags |= RDS_FLAG_ACK_REQUIRED;
+		if (test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags))
+			rm->m_inc.i_hdr.h_flags |= RDS_FLAG_RETRANSMITTED;
+
+		/* If it has a RDMA op, tell the peer we did it. This is
+		 * used by the peer to release use-once RDMA MRs. */
+		if (rm->rdma.op_active) {
+			struct rds_ext_header_rdma ext_hdr;
+
+			ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.op_rkey);
+			rds_message_add_extension(&rm->m_inc.i_hdr,
+					RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr));
+		}
+		if (rm->m_rdma_cookie) {
+			rds_message_add_rdma_dest_extension(&rm->m_inc.i_hdr,
+					rds_rdma_cookie_key(rm->m_rdma_cookie),
+					rds_rdma_cookie_offset(rm->m_rdma_cookie));
+		}
+
+		/* Note - rds_iw_piggyb_ack clears the ACK_REQUIRED bit, so
+		 * we should not do this unless we have a chance of at least
+		 * sticking the header into the send ring. Which is why we
+		 * should call rds_iw_ring_alloc first. */
+		rm->m_inc.i_hdr.h_ack = cpu_to_be64(rds_iw_piggyb_ack(ic));
+		rds_message_make_checksum(&rm->m_inc.i_hdr);
+
+		/*
+		 * Update adv_credits since we reset the ACK_REQUIRED bit.
+		 */
+		rds_iw_send_grab_credits(ic, 0, &posted, 1, RDS_MAX_ADV_CREDIT - adv_credits);
+		adv_credits += posted;
+		BUG_ON(adv_credits > 255);
+	}
+
+	send = &ic->i_sends[pos];
+	first = send;
+	prev = NULL;
+	scat = &rm->data.op_sg[sg];
+	sent = 0;
+	i = 0;
+
+	/* Sometimes you want to put a fence between an RDMA
+	 * READ and the following SEND.
+	 * We could either do this all the time
+	 * or when requested by the user. Right now, we let
+	 * the application choose.
+	 */
+	if (rm->rdma.op_active && rm->rdma.op_fence)
+		send_flags = IB_SEND_FENCE;
+
+	/*
+	 * We could be copying the header into the unused tail of the page.
+	 * That would need to be changed in the future when those pages might
+	 * be mapped userspace pages or page cache pages.  So instead we always
+	 * use a second sge and our long-lived ring of mapped headers.  We send
+	 * the header after the data so that the data payload can be aligned on
+	 * the receiver.
+	 */
+
+	/* handle a 0-len message */
+	if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0) {
+		rds_iw_xmit_populate_wr(ic, send, pos, 0, 0, send_flags);
+		goto add_header;
+	}
+
+	/* if there's data reference it with a chain of work reqs */
+	for (; i < work_alloc && scat != &rm->data.op_sg[rm->data.op_count]; i++) {
+		unsigned int len;
+
+		send = &ic->i_sends[pos];
+
+		len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off);
+		rds_iw_xmit_populate_wr(ic, send, pos,
+				ib_sg_dma_address(dev, scat) + off, len,
+				send_flags);
+
+		/*
+		 * We want to delay signaling completions just enough to get
+		 * the batching benefits but not so much that we create dead time
+		 * on the wire.
+		 */
+		if (ic->i_unsignaled_wrs-- == 0) {
+			ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs;
+			send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+		}
+
+		ic->i_unsignaled_bytes -= len;
+		if (ic->i_unsignaled_bytes <= 0) {
+			ic->i_unsignaled_bytes = rds_iw_sysctl_max_unsig_bytes;
+			send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+		}
+
+		/*
+		 * Always signal the last one if we're stopping due to flow control.
+		 */
+		if (flow_controlled && i == (work_alloc-1))
+			send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+
+		rdsdebug("send %p wr %p num_sge %u next %p\n", send,
+			 &send->s_wr, send->s_wr.num_sge, send->s_wr.next);
+
+		sent += len;
+		off += len;
+		if (off == ib_sg_dma_len(dev, scat)) {
+			scat++;
+			off = 0;
+		}
+
+add_header:
+		/* Tack on the header after the data. The header SGE should already
+		 * have been set up to point to the right header buffer. */
+		memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header));
+
+		if (0) {
+			struct rds_header *hdr = &ic->i_send_hdrs[pos];
+
+			printk(KERN_NOTICE "send WR dport=%u flags=0x%x len=%d\n",
+				be16_to_cpu(hdr->h_dport),
+				hdr->h_flags,
+				be32_to_cpu(hdr->h_len));
+		}
+		if (adv_credits) {
+			struct rds_header *hdr = &ic->i_send_hdrs[pos];
+
+			/* add credit and redo the header checksum */
+			hdr->h_credit = adv_credits;
+			rds_message_make_checksum(hdr);
+			adv_credits = 0;
+			rds_iw_stats_inc(s_iw_tx_credit_updates);
+		}
+
+		if (prev)
+			prev->s_wr.next = &send->s_wr;
+		prev = send;
+
+		pos = (pos + 1) % ic->i_send_ring.w_nr;
+	}
+
+	/* Account the RDS header in the number of bytes we sent, but just once.
+	 * The caller has no concept of fragmentation. */
+	if (hdr_off == 0)
+		sent += sizeof(struct rds_header);
+
+	/* if we finished the message then send completion owns it */
+	if (scat == &rm->data.op_sg[rm->data.op_count]) {
+		prev->s_rm = ic->i_rm;
+		prev->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+		ic->i_rm = NULL;
+	}
+
+	if (i < work_alloc) {
+		rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc - i);
+		work_alloc = i;
+	}
+	if (ic->i_flowctl && i < credit_alloc)
+		rds_iw_send_add_credits(conn, credit_alloc - i);
+
+	/* XXX need to worry about failed_wr and partial sends. */
+	failed_wr = &first->s_wr;
+	ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr);
+	rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic,
+		 first, &first->s_wr, ret, failed_wr);
+	BUG_ON(failed_wr != &first->s_wr);
+	if (ret) {
+		printk(KERN_WARNING "RDS/IW: ib_post_send to %pI4 "
+		       "returned %d\n", &conn->c_faddr, ret);
+		rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
+		if (prev->s_rm) {
+			ic->i_rm = prev->s_rm;
+			prev->s_rm = NULL;
+		}
+		goto out;
+	}
+
+	ret = sent;
+out:
+	BUG_ON(adv_credits);
+	return ret;
+}
+
+static void rds_iw_build_send_fastreg(struct rds_iw_device *rds_iwdev, struct rds_iw_connection *ic, struct rds_iw_send_work *send, int nent, int len, u64 sg_addr)
+{
+	BUG_ON(nent > send->s_page_list->max_page_list_len);
+	/*
+	 * Perform a WR for the fast_reg_mr. Each individual page
+	 * in the sg list is added to the fast reg page list and placed
+	 * inside the fast_reg_mr WR.
+	 */
+	send->s_wr.opcode = IB_WR_FAST_REG_MR;
+	send->s_wr.wr.fast_reg.length = len;
+	send->s_wr.wr.fast_reg.rkey = send->s_mr->rkey;
+	send->s_wr.wr.fast_reg.page_list = send->s_page_list;
+	send->s_wr.wr.fast_reg.page_list_len = nent;
+	send->s_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
+	send->s_wr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE;
+	send->s_wr.wr.fast_reg.iova_start = sg_addr;
+
+	ib_update_fast_reg_key(send->s_mr, send->s_remap_count++);
+}
+
+int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
+{
+	struct rds_iw_connection *ic = conn->c_transport_data;
+	struct rds_iw_send_work *send = NULL;
+	struct rds_iw_send_work *first;
+	struct rds_iw_send_work *prev;
+	struct ib_send_wr *failed_wr;
+	struct rds_iw_device *rds_iwdev;
+	struct scatterlist *scat;
+	unsigned long len;
+	u64 remote_addr = op->op_remote_addr;
+	u32 pos, fr_pos;
+	u32 work_alloc;
+	u32 i;
+	u32 j;
+	int sent;
+	int ret;
+	int num_sge;
+
+	rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client);
+
+	/* map the message the first time we see it */
+	if (!op->op_mapped) {
+		op->op_count = ib_dma_map_sg(ic->i_cm_id->device,
+					     op->op_sg, op->op_nents, (op->op_write) ?
+					     DMA_TO_DEVICE : DMA_FROM_DEVICE);
+		rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->op_count);
+		if (op->op_count == 0) {
+			rds_iw_stats_inc(s_iw_tx_sg_mapping_failure);
+			ret = -ENOMEM; /* XXX ? */
+			goto out;
+		}
+
+		op->op_mapped = 1;
+	}
+
+	if (!op->op_write) {
+		/* Alloc space on the send queue for the fastreg */
+		work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, 1, &fr_pos);
+		if (work_alloc != 1) {
+			rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
+			rds_iw_stats_inc(s_iw_tx_ring_full);
+			ret = -ENOMEM;
+			goto out;
+		}
+	}
+
+	/*
+	 * Instead of knowing how to return a partial rdma read/write we insist that there
+	 * be enough work requests to send the entire message.
+	 */
+	i = ceil(op->op_count, rds_iwdev->max_sge);
+
+	work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, i, &pos);
+	if (work_alloc != i) {
+		rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
+		rds_iw_stats_inc(s_iw_tx_ring_full);
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	send = &ic->i_sends[pos];
+	if (!op->op_write) {
+		first = prev = &ic->i_sends[fr_pos];
+	} else {
+		first = send;
+		prev = NULL;
+	}
+	scat = &op->op_sg[0];
+	sent = 0;
+	num_sge = op->op_count;
+
+	for (i = 0; i < work_alloc && scat != &op->op_sg[op->op_count]; i++) {
+		send->s_wr.send_flags = 0;
+		send->s_queued = jiffies;
+
+		/*
+		 * We want to delay signaling completions just enough to get
+		 * the batching benefits but not so much that we create dead time on the wire.
+		 */
+		if (ic->i_unsignaled_wrs-- == 0) {
+			ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs;
+			send->s_wr.send_flags = IB_SEND_SIGNALED;
+		}
+
+		/* To avoid the need to have the plumbing to invalidate the fastreg_mr used
+		 * for local access after RDS is finished with it, using
+		 * IB_WR_RDMA_READ_WITH_INV will invalidate it after the read has completed.
+		 */
+		if (op->op_write)
+			send->s_wr.opcode = IB_WR_RDMA_WRITE;
+		else
+			send->s_wr.opcode = IB_WR_RDMA_READ_WITH_INV;
+
+		send->s_wr.wr.rdma.remote_addr = remote_addr;
+		send->s_wr.wr.rdma.rkey = op->op_rkey;
+		send->s_op = op;
+
+		if (num_sge > rds_iwdev->max_sge) {
+			send->s_wr.num_sge = rds_iwdev->max_sge;
+			num_sge -= rds_iwdev->max_sge;
+		} else
+			send->s_wr.num_sge = num_sge;
+
+		send->s_wr.next = NULL;
+
+		if (prev)
+			prev->s_wr.next = &send->s_wr;
+
+		for (j = 0; j < send->s_wr.num_sge && scat != &op->op_sg[op->op_count]; j++) {
+			len = ib_sg_dma_len(ic->i_cm_id->device, scat);
+
+			if (send->s_wr.opcode == IB_WR_RDMA_READ_WITH_INV)
+				send->s_page_list->page_list[j] = ib_sg_dma_address(ic->i_cm_id->device, scat);
+			else {
+				send->s_sge[j].addr = ib_sg_dma_address(ic->i_cm_id->device, scat);
+				send->s_sge[j].length = len;
+				send->s_sge[j].lkey = rds_iw_local_dma_lkey(ic);
+			}
+
+			sent += len;
+			rdsdebug("ic %p sent %d remote_addr %llu\n", ic, sent, remote_addr);
+			remote_addr += len;
+
+			scat++;
+		}
+
+		if (send->s_wr.opcode == IB_WR_RDMA_READ_WITH_INV) {
+			send->s_wr.num_sge = 1;
+			send->s_sge[0].addr = conn->c_xmit_rm->m_rs->rs_user_addr;
+			send->s_sge[0].length = conn->c_xmit_rm->m_rs->rs_user_bytes;
+			send->s_sge[0].lkey = ic->i_sends[fr_pos].s_mr->lkey;
+		}
+
+		rdsdebug("send %p wr %p num_sge %u next %p\n", send,
+			&send->s_wr, send->s_wr.num_sge, send->s_wr.next);
+
+		prev = send;
+		if (++send == &ic->i_sends[ic->i_send_ring.w_nr])
+			send = ic->i_sends;
+	}
+
+	/* if we finished the message then send completion owns it */
+	if (scat == &op->op_sg[op->op_count])
+		first->s_wr.send_flags = IB_SEND_SIGNALED;
+
+	if (i < work_alloc) {
+		rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc - i);
+		work_alloc = i;
+	}
+
+	/* On iWARP, local memory access by a remote system (ie, RDMA Read) is not
+	 * recommended.  Putting the lkey on the wire is a security hole, as it can
+	 * allow for memory access to all of memory on the remote system.  Some
+	 * adapters do not allow using the lkey for this at all.  To bypass this use a
+	 * fastreg_mr (or possibly a dma_mr)
+	 */
+	if (!op->op_write) {
+		rds_iw_build_send_fastreg(rds_iwdev, ic, &ic->i_sends[fr_pos],
+			op->op_count, sent, conn->c_xmit_rm->m_rs->rs_user_addr);
+		work_alloc++;
+	}
+
+	failed_wr = &first->s_wr;
+	ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr);
+	rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic,
+		 first, &first->s_wr, ret, failed_wr);
+	BUG_ON(failed_wr != &first->s_wr);
+	if (ret) {
+		printk(KERN_WARNING "RDS/IW: rdma ib_post_send to %pI4 "
+		       "returned %d\n", &conn->c_faddr, ret);
+		rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
+		goto out;
+	}
+
+out:
+	return ret;
+}
+
+void rds_iw_xmit_complete(struct rds_connection *conn)
+{
+	struct rds_iw_connection *ic = conn->c_transport_data;
+
+	/* We may have a pending ACK or window update we were unable
+	 * to send previously (due to flow control). Try again. */
+	rds_iw_attempt_ack(ic);
+}
diff --git a/net/rds/iw_stats.c b/net/rds/iw_stats.c
new file mode 100644
index 0000000..5fe67f6
--- /dev/null
+++ b/net/rds/iw_stats.c
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/percpu.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+
+#include "rds.h"
+#include "iw.h"
+
+DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_iw_statistics, rds_iw_stats);
+
+static const char *const rds_iw_stat_names[] = {
+	"iw_connect_raced",
+	"iw_listen_closed_stale",
+	"iw_tx_cq_call",
+	"iw_tx_cq_event",
+	"iw_tx_ring_full",
+	"iw_tx_throttle",
+	"iw_tx_sg_mapping_failure",
+	"iw_tx_stalled",
+	"iw_tx_credit_updates",
+	"iw_rx_cq_call",
+	"iw_rx_cq_event",
+	"iw_rx_ring_empty",
+	"iw_rx_refill_from_cq",
+	"iw_rx_refill_from_thread",
+	"iw_rx_alloc_limit",
+	"iw_rx_credit_updates",
+	"iw_ack_sent",
+	"iw_ack_send_failure",
+	"iw_ack_send_delayed",
+	"iw_ack_send_piggybacked",
+	"iw_ack_received",
+	"iw_rdma_mr_alloc",
+	"iw_rdma_mr_free",
+	"iw_rdma_mr_used",
+	"iw_rdma_mr_pool_flush",
+	"iw_rdma_mr_pool_wait",
+	"iw_rdma_mr_pool_depleted",
+};
+
+unsigned int rds_iw_stats_info_copy(struct rds_info_iterator *iter,
+				    unsigned int avail)
+{
+	struct rds_iw_statistics stats = {0, };
+	uint64_t *src;
+	uint64_t *sum;
+	size_t i;
+	int cpu;
+
+	if (avail < ARRAY_SIZE(rds_iw_stat_names))
+		goto out;
+
+	for_each_online_cpu(cpu) {
+		src = (uint64_t *)&(per_cpu(rds_iw_stats, cpu));
+		sum = (uint64_t *)&stats;
+		for (i = 0; i < sizeof(stats) / sizeof(uint64_t); i++)
+			*(sum++) += *(src++);
+	}
+
+	rds_stats_info_copy(iter, (uint64_t *)&stats, rds_iw_stat_names,
+			    ARRAY_SIZE(rds_iw_stat_names));
+out:
+	return ARRAY_SIZE(rds_iw_stat_names);
+}
diff --git a/net/rds/iw_sysctl.c b/net/rds/iw_sysctl.c
new file mode 100644
index 0000000..139239d
--- /dev/null
+++ b/net/rds/iw_sysctl.c
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/sysctl.h>
+#include <linux/proc_fs.h>
+
+#include "iw.h"
+
+static struct ctl_table_header *rds_iw_sysctl_hdr;
+
+unsigned long rds_iw_sysctl_max_send_wr = RDS_IW_DEFAULT_SEND_WR;
+unsigned long rds_iw_sysctl_max_recv_wr = RDS_IW_DEFAULT_RECV_WR;
+unsigned long rds_iw_sysctl_max_recv_allocation = (128 * 1024 * 1024) / RDS_FRAG_SIZE;
+static unsigned long rds_iw_sysctl_max_wr_min = 1;
+/* hardware will fail CQ creation long before this */
+static unsigned long rds_iw_sysctl_max_wr_max = (u32)~0;
+
+unsigned long rds_iw_sysctl_max_unsig_wrs = 16;
+static unsigned long rds_iw_sysctl_max_unsig_wr_min = 1;
+static unsigned long rds_iw_sysctl_max_unsig_wr_max = 64;
+
+unsigned long rds_iw_sysctl_max_unsig_bytes = (16 << 20);
+static unsigned long rds_iw_sysctl_max_unsig_bytes_min = 1;
+static unsigned long rds_iw_sysctl_max_unsig_bytes_max = ~0UL;
+
+unsigned int rds_iw_sysctl_flow_control = 1;
+
+static struct ctl_table rds_iw_sysctl_table[] = {
+	{
+		.procname       = "max_send_wr",
+		.data		= &rds_iw_sysctl_max_send_wr,
+		.maxlen         = sizeof(unsigned long),
+		.mode           = 0644,
+		.proc_handler   = proc_doulongvec_minmax,
+		.extra1		= &rds_iw_sysctl_max_wr_min,
+		.extra2		= &rds_iw_sysctl_max_wr_max,
+	},
+	{
+		.procname       = "max_recv_wr",
+		.data		= &rds_iw_sysctl_max_recv_wr,
+		.maxlen         = sizeof(unsigned long),
+		.mode           = 0644,
+		.proc_handler   = proc_doulongvec_minmax,
+		.extra1		= &rds_iw_sysctl_max_wr_min,
+		.extra2		= &rds_iw_sysctl_max_wr_max,
+	},
+	{
+		.procname       = "max_unsignaled_wr",
+		.data		= &rds_iw_sysctl_max_unsig_wrs,
+		.maxlen         = sizeof(unsigned long),
+		.mode           = 0644,
+		.proc_handler   = proc_doulongvec_minmax,
+		.extra1		= &rds_iw_sysctl_max_unsig_wr_min,
+		.extra2		= &rds_iw_sysctl_max_unsig_wr_max,
+	},
+	{
+		.procname       = "max_unsignaled_bytes",
+		.data		= &rds_iw_sysctl_max_unsig_bytes,
+		.maxlen         = sizeof(unsigned long),
+		.mode           = 0644,
+		.proc_handler   = proc_doulongvec_minmax,
+		.extra1		= &rds_iw_sysctl_max_unsig_bytes_min,
+		.extra2		= &rds_iw_sysctl_max_unsig_bytes_max,
+	},
+	{
+		.procname       = "max_recv_allocation",
+		.data		= &rds_iw_sysctl_max_recv_allocation,
+		.maxlen         = sizeof(unsigned long),
+		.mode           = 0644,
+		.proc_handler   = proc_doulongvec_minmax,
+	},
+	{
+		.procname	= "flow_control",
+		.data		= &rds_iw_sysctl_flow_control,
+		.maxlen		= sizeof(rds_iw_sysctl_flow_control),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{ }
+};
+
+void rds_iw_sysctl_exit(void)
+{
+	unregister_net_sysctl_table(rds_iw_sysctl_hdr);
+}
+
+int rds_iw_sysctl_init(void)
+{
+	rds_iw_sysctl_hdr = register_net_sysctl(&init_net, "net/rds/iw", rds_iw_sysctl_table);
+	if (!rds_iw_sysctl_hdr)
+		return -ENOMEM;
+	return 0;
+}
diff --git a/net/rds/loop.c b/net/rds/loop.c
new file mode 100644
index 0000000..6b12b68
--- /dev/null
+++ b/net/rds/loop.c
@@ -0,0 +1,194 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/in.h>
+
+#include "rds.h"
+#include "loop.h"
+
+static DEFINE_SPINLOCK(loop_conns_lock);
+static LIST_HEAD(loop_conns);
+
+/*
+ * This 'loopback' transport is a special case for flows that originate
+ * and terminate on the same machine.
+ *
+ * Connection build-up notices if the destination address is thought of
+ * as a local address by a transport.  At that time it decides to use the
+ * loopback transport instead of the bound transport of the sending socket.
+ *
+ * The loopback transport's sending path just hands the sent rds_message
+ * straight to the receiving path via an embedded rds_incoming.
+ */
+
+/*
+ * Usually a message transits both the sender and receiver's conns as it
+ * flows to the receiver.  In the loopback case, though, the receive path
+ * is handed the sending conn so the sense of the addresses is reversed.
+ */
+static int rds_loop_xmit(struct rds_connection *conn, struct rds_message *rm,
+			 unsigned int hdr_off, unsigned int sg,
+			 unsigned int off)
+{
+	struct scatterlist *sgp = &rm->data.op_sg[sg];
+	int ret = sizeof(struct rds_header) +
+			be32_to_cpu(rm->m_inc.i_hdr.h_len);
+
+	/* Do not send cong updates to loopback */
+	if (rm->m_inc.i_hdr.h_flags & RDS_FLAG_CONG_BITMAP) {
+		rds_cong_map_updated(conn->c_fcong, ~(u64) 0);
+		ret = min_t(int, ret, sgp->length - conn->c_xmit_data_off);
+		goto out;
+	}
+
+	BUG_ON(hdr_off || sg || off);
+
+	rds_inc_init(&rm->m_inc, conn, conn->c_laddr);
+	/* For the embedded inc. Matching put is in loop_inc_free() */
+	rds_message_addref(rm);
+
+	rds_recv_incoming(conn, conn->c_laddr, conn->c_faddr, &rm->m_inc,
+			  GFP_KERNEL);
+
+	rds_send_drop_acked(conn, be64_to_cpu(rm->m_inc.i_hdr.h_sequence),
+			    NULL);
+
+	rds_inc_put(&rm->m_inc);
+out:
+	return ret;
+}
+
+/*
+ * See rds_loop_xmit(). Since our inc is embedded in the rm, we
+ * make sure the rm lives at least until the inc is done.
+ */
+static void rds_loop_inc_free(struct rds_incoming *inc)
+{
+        struct rds_message *rm = container_of(inc, struct rds_message, m_inc);
+        rds_message_put(rm);
+}
+
+/* we need to at least give the thread something to succeed */
+static int rds_loop_recv(struct rds_connection *conn)
+{
+	return 0;
+}
+
+struct rds_loop_connection {
+	struct list_head loop_node;
+	struct rds_connection *conn;
+};
+
+/*
+ * Even the loopback transport needs to keep track of its connections,
+ * so it can call rds_conn_destroy() on them on exit. N.B. there are
+ * 1+ loopback addresses (127.*.*.*) so it's not a bug to have
+ * multiple loopback conns allocated, although rather useless.
+ */
+static int rds_loop_conn_alloc(struct rds_connection *conn, gfp_t gfp)
+{
+	struct rds_loop_connection *lc;
+	unsigned long flags;
+
+	lc = kzalloc(sizeof(struct rds_loop_connection), gfp);
+	if (!lc)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&lc->loop_node);
+	lc->conn = conn;
+	conn->c_transport_data = lc;
+
+	spin_lock_irqsave(&loop_conns_lock, flags);
+	list_add_tail(&lc->loop_node, &loop_conns);
+	spin_unlock_irqrestore(&loop_conns_lock, flags);
+
+	return 0;
+}
+
+static void rds_loop_conn_free(void *arg)
+{
+	struct rds_loop_connection *lc = arg;
+	unsigned long flags;
+
+	rdsdebug("lc %p\n", lc);
+	spin_lock_irqsave(&loop_conns_lock, flags);
+	list_del(&lc->loop_node);
+	spin_unlock_irqrestore(&loop_conns_lock, flags);
+	kfree(lc);
+}
+
+static int rds_loop_conn_connect(struct rds_connection *conn)
+{
+	rds_connect_complete(conn);
+	return 0;
+}
+
+static void rds_loop_conn_shutdown(struct rds_connection *conn)
+{
+}
+
+void rds_loop_exit(void)
+{
+	struct rds_loop_connection *lc, *_lc;
+	LIST_HEAD(tmp_list);
+
+	/* avoid calling conn_destroy with irqs off */
+	spin_lock_irq(&loop_conns_lock);
+	list_splice(&loop_conns, &tmp_list);
+	INIT_LIST_HEAD(&loop_conns);
+	spin_unlock_irq(&loop_conns_lock);
+
+	list_for_each_entry_safe(lc, _lc, &tmp_list, loop_node) {
+		WARN_ON(lc->conn->c_passive);
+		rds_conn_destroy(lc->conn);
+	}
+}
+
+/*
+ * This is missing .xmit_* because loop doesn't go through generic
+ * rds_send_xmit() and doesn't call rds_recv_incoming().  .listen_stop and
+ * .laddr_check are missing because transport.c doesn't iterate over
+ * rds_loop_transport.
+ */
+struct rds_transport rds_loop_transport = {
+	.xmit			= rds_loop_xmit,
+	.recv			= rds_loop_recv,
+	.conn_alloc		= rds_loop_conn_alloc,
+	.conn_free		= rds_loop_conn_free,
+	.conn_connect		= rds_loop_conn_connect,
+	.conn_shutdown		= rds_loop_conn_shutdown,
+	.inc_copy_to_user	= rds_message_inc_copy_to_user,
+	.inc_free		= rds_loop_inc_free,
+	.t_name			= "loopback",
+};
diff --git a/net/rds/loop.h b/net/rds/loop.h
new file mode 100644
index 0000000..f32b093
--- /dev/null
+++ b/net/rds/loop.h
@@ -0,0 +1,9 @@
+#ifndef _RDS_LOOP_H
+#define _RDS_LOOP_H
+
+/* loop.c */
+extern struct rds_transport rds_loop_transport;
+
+void rds_loop_exit(void);
+
+#endif
diff --git a/net/rds/message.c b/net/rds/message.c
new file mode 100644
index 0000000..aba232f
--- /dev/null
+++ b/net/rds/message.c
@@ -0,0 +1,402 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+
+#include "rds.h"
+
+static unsigned int	rds_exthdr_size[__RDS_EXTHDR_MAX] = {
+[RDS_EXTHDR_NONE]	= 0,
+[RDS_EXTHDR_VERSION]	= sizeof(struct rds_ext_header_version),
+[RDS_EXTHDR_RDMA]	= sizeof(struct rds_ext_header_rdma),
+[RDS_EXTHDR_RDMA_DEST]	= sizeof(struct rds_ext_header_rdma_dest),
+};
+
+
+void rds_message_addref(struct rds_message *rm)
+{
+	rdsdebug("addref rm %p ref %d\n", rm, atomic_read(&rm->m_refcount));
+	atomic_inc(&rm->m_refcount);
+}
+EXPORT_SYMBOL_GPL(rds_message_addref);
+
+/*
+ * This relies on dma_map_sg() not touching sg[].page during merging.
+ */
+static void rds_message_purge(struct rds_message *rm)
+{
+	unsigned long i;
+
+	if (unlikely(test_bit(RDS_MSG_PAGEVEC, &rm->m_flags)))
+		return;
+
+	for (i = 0; i < rm->data.op_nents; i++) {
+		rdsdebug("putting data page %p\n", (void *)sg_page(&rm->data.op_sg[i]));
+		/* XXX will have to put_page for page refs */
+		__free_page(sg_page(&rm->data.op_sg[i]));
+	}
+	rm->data.op_nents = 0;
+
+	if (rm->rdma.op_active)
+		rds_rdma_free_op(&rm->rdma);
+	if (rm->rdma.op_rdma_mr)
+		rds_mr_put(rm->rdma.op_rdma_mr);
+
+	if (rm->atomic.op_active)
+		rds_atomic_free_op(&rm->atomic);
+	if (rm->atomic.op_rdma_mr)
+		rds_mr_put(rm->atomic.op_rdma_mr);
+}
+
+void rds_message_put(struct rds_message *rm)
+{
+	rdsdebug("put rm %p ref %d\n", rm, atomic_read(&rm->m_refcount));
+	WARN(!atomic_read(&rm->m_refcount), "danger refcount zero on %p\n", rm);
+	if (atomic_dec_and_test(&rm->m_refcount)) {
+		BUG_ON(!list_empty(&rm->m_sock_item));
+		BUG_ON(!list_empty(&rm->m_conn_item));
+		rds_message_purge(rm);
+
+		kfree(rm);
+	}
+}
+EXPORT_SYMBOL_GPL(rds_message_put);
+
+void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
+				 __be16 dport, u64 seq)
+{
+	hdr->h_flags = 0;
+	hdr->h_sport = sport;
+	hdr->h_dport = dport;
+	hdr->h_sequence = cpu_to_be64(seq);
+	hdr->h_exthdr[0] = RDS_EXTHDR_NONE;
+}
+EXPORT_SYMBOL_GPL(rds_message_populate_header);
+
+int rds_message_add_extension(struct rds_header *hdr, unsigned int type,
+			      const void *data, unsigned int len)
+{
+	unsigned int ext_len = sizeof(u8) + len;
+	unsigned char *dst;
+
+	/* For now, refuse to add more than one extension header */
+	if (hdr->h_exthdr[0] != RDS_EXTHDR_NONE)
+		return 0;
+
+	if (type >= __RDS_EXTHDR_MAX || len != rds_exthdr_size[type])
+		return 0;
+
+	if (ext_len >= RDS_HEADER_EXT_SPACE)
+		return 0;
+	dst = hdr->h_exthdr;
+
+	*dst++ = type;
+	memcpy(dst, data, len);
+
+	dst[len] = RDS_EXTHDR_NONE;
+	return 1;
+}
+EXPORT_SYMBOL_GPL(rds_message_add_extension);
+
+/*
+ * If a message has extension headers, retrieve them here.
+ * Call like this:
+ *
+ * unsigned int pos = 0;
+ *
+ * while (1) {
+ *	buflen = sizeof(buffer);
+ *	type = rds_message_next_extension(hdr, &pos, buffer, &buflen);
+ *	if (type == RDS_EXTHDR_NONE)
+ *		break;
+ *	...
+ * }
+ */
+int rds_message_next_extension(struct rds_header *hdr,
+		unsigned int *pos, void *buf, unsigned int *buflen)
+{
+	unsigned int offset, ext_type, ext_len;
+	u8 *src = hdr->h_exthdr;
+
+	offset = *pos;
+	if (offset >= RDS_HEADER_EXT_SPACE)
+		goto none;
+
+	/* Get the extension type and length. For now, the
+	 * length is implied by the extension type. */
+	ext_type = src[offset++];
+
+	if (ext_type == RDS_EXTHDR_NONE || ext_type >= __RDS_EXTHDR_MAX)
+		goto none;
+	ext_len = rds_exthdr_size[ext_type];
+	if (offset + ext_len > RDS_HEADER_EXT_SPACE)
+		goto none;
+
+	*pos = offset + ext_len;
+	if (ext_len < *buflen)
+		*buflen = ext_len;
+	memcpy(buf, src + offset, *buflen);
+	return ext_type;
+
+none:
+	*pos = RDS_HEADER_EXT_SPACE;
+	*buflen = 0;
+	return RDS_EXTHDR_NONE;
+}
+
+int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset)
+{
+	struct rds_ext_header_rdma_dest ext_hdr;
+
+	ext_hdr.h_rdma_rkey = cpu_to_be32(r_key);
+	ext_hdr.h_rdma_offset = cpu_to_be32(offset);
+	return rds_message_add_extension(hdr, RDS_EXTHDR_RDMA_DEST, &ext_hdr, sizeof(ext_hdr));
+}
+EXPORT_SYMBOL_GPL(rds_message_add_rdma_dest_extension);
+
+/*
+ * Each rds_message is allocated with extra space for the scatterlist entries
+ * rds ops will need. This is to minimize memory allocation count. Then, each rds op
+ * can grab SGs when initializing its part of the rds_message.
+ */
+struct rds_message *rds_message_alloc(unsigned int extra_len, gfp_t gfp)
+{
+	struct rds_message *rm;
+
+	if (extra_len > KMALLOC_MAX_SIZE - sizeof(struct rds_message))
+		return NULL;
+
+	rm = kzalloc(sizeof(struct rds_message) + extra_len, gfp);
+	if (!rm)
+		goto out;
+
+	rm->m_used_sgs = 0;
+	rm->m_total_sgs = extra_len / sizeof(struct scatterlist);
+
+	atomic_set(&rm->m_refcount, 1);
+	INIT_LIST_HEAD(&rm->m_sock_item);
+	INIT_LIST_HEAD(&rm->m_conn_item);
+	spin_lock_init(&rm->m_rs_lock);
+	init_waitqueue_head(&rm->m_flush_wait);
+
+out:
+	return rm;
+}
+
+/*
+ * RDS ops use this to grab SG entries from the rm's sg pool.
+ */
+struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents)
+{
+	struct scatterlist *sg_first = (struct scatterlist *) &rm[1];
+	struct scatterlist *sg_ret;
+
+	WARN_ON(rm->m_used_sgs + nents > rm->m_total_sgs);
+	WARN_ON(!nents);
+
+	if (rm->m_used_sgs + nents > rm->m_total_sgs)
+		return NULL;
+
+	sg_ret = &sg_first[rm->m_used_sgs];
+	sg_init_table(sg_ret, nents);
+	rm->m_used_sgs += nents;
+
+	return sg_ret;
+}
+
+struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len)
+{
+	struct rds_message *rm;
+	unsigned int i;
+	int num_sgs = ceil(total_len, PAGE_SIZE);
+	int extra_bytes = num_sgs * sizeof(struct scatterlist);
+
+	rm = rds_message_alloc(extra_bytes, GFP_NOWAIT);
+	if (!rm)
+		return ERR_PTR(-ENOMEM);
+
+	set_bit(RDS_MSG_PAGEVEC, &rm->m_flags);
+	rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len);
+	rm->data.op_nents = ceil(total_len, PAGE_SIZE);
+	rm->data.op_sg = rds_message_alloc_sgs(rm, num_sgs);
+	if (!rm->data.op_sg) {
+		rds_message_put(rm);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	for (i = 0; i < rm->data.op_nents; ++i) {
+		sg_set_page(&rm->data.op_sg[i],
+				virt_to_page(page_addrs[i]),
+				PAGE_SIZE, 0);
+	}
+
+	return rm;
+}
+
+int rds_message_copy_from_user(struct rds_message *rm, struct iovec *first_iov,
+					       size_t total_len)
+{
+	unsigned long to_copy;
+	unsigned long iov_off;
+	unsigned long sg_off;
+	struct iovec *iov;
+	struct scatterlist *sg;
+	int ret = 0;
+
+	rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len);
+
+	/*
+	 * now allocate and copy in the data payload.
+	 */
+	sg = rm->data.op_sg;
+	iov = first_iov;
+	iov_off = 0;
+	sg_off = 0; /* Dear gcc, sg->page will be null from kzalloc. */
+
+	while (total_len) {
+		if (!sg_page(sg)) {
+			ret = rds_page_remainder_alloc(sg, total_len,
+						       GFP_HIGHUSER);
+			if (ret)
+				goto out;
+			rm->data.op_nents++;
+			sg_off = 0;
+		}
+
+		while (iov_off == iov->iov_len) {
+			iov_off = 0;
+			iov++;
+		}
+
+		to_copy = min(iov->iov_len - iov_off, sg->length - sg_off);
+		to_copy = min_t(size_t, to_copy, total_len);
+
+		rdsdebug("copying %lu bytes from user iov [%p, %zu] + %lu to "
+			 "sg [%p, %u, %u] + %lu\n",
+			 to_copy, iov->iov_base, iov->iov_len, iov_off,
+			 (void *)sg_page(sg), sg->offset, sg->length, sg_off);
+
+		ret = rds_page_copy_from_user(sg_page(sg), sg->offset + sg_off,
+					      iov->iov_base + iov_off,
+					      to_copy);
+		if (ret)
+			goto out;
+
+		iov_off += to_copy;
+		total_len -= to_copy;
+		sg_off += to_copy;
+
+		if (sg_off == sg->length)
+			sg++;
+	}
+
+out:
+	return ret;
+}
+
+int rds_message_inc_copy_to_user(struct rds_incoming *inc,
+				 struct iovec *first_iov, size_t size)
+{
+	struct rds_message *rm;
+	struct iovec *iov;
+	struct scatterlist *sg;
+	unsigned long to_copy;
+	unsigned long iov_off;
+	unsigned long vec_off;
+	int copied;
+	int ret;
+	u32 len;
+
+	rm = container_of(inc, struct rds_message, m_inc);
+	len = be32_to_cpu(rm->m_inc.i_hdr.h_len);
+
+	iov = first_iov;
+	iov_off = 0;
+	sg = rm->data.op_sg;
+	vec_off = 0;
+	copied = 0;
+
+	while (copied < size && copied < len) {
+		while (iov_off == iov->iov_len) {
+			iov_off = 0;
+			iov++;
+		}
+
+		to_copy = min(iov->iov_len - iov_off, sg->length - vec_off);
+		to_copy = min_t(size_t, to_copy, size - copied);
+		to_copy = min_t(unsigned long, to_copy, len - copied);
+
+		rdsdebug("copying %lu bytes to user iov [%p, %zu] + %lu to "
+			 "sg [%p, %u, %u] + %lu\n",
+			 to_copy, iov->iov_base, iov->iov_len, iov_off,
+			 sg_page(sg), sg->offset, sg->length, vec_off);
+
+		ret = rds_page_copy_to_user(sg_page(sg), sg->offset + vec_off,
+					    iov->iov_base + iov_off,
+					    to_copy);
+		if (ret) {
+			copied = ret;
+			break;
+		}
+
+		iov_off += to_copy;
+		vec_off += to_copy;
+		copied += to_copy;
+
+		if (vec_off == sg->length) {
+			vec_off = 0;
+			sg++;
+		}
+	}
+
+	return copied;
+}
+
+/*
+ * If the message is still on the send queue, wait until the transport
+ * is done with it. This is particularly important for RDMA operations.
+ */
+void rds_message_wait(struct rds_message *rm)
+{
+	wait_event_interruptible(rm->m_flush_wait,
+			!test_bit(RDS_MSG_MAPPED, &rm->m_flags));
+}
+
+void rds_message_unmapped(struct rds_message *rm)
+{
+	clear_bit(RDS_MSG_MAPPED, &rm->m_flags);
+	wake_up_interruptible(&rm->m_flush_wait);
+}
+EXPORT_SYMBOL_GPL(rds_message_unmapped);
+
diff --git a/net/rds/page.c b/net/rds/page.c
new file mode 100644
index 0000000..9005a2c
--- /dev/null
+++ b/net/rds/page.c
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/highmem.h>
+#include <linux/gfp.h>
+#include <linux/cpu.h>
+#include <linux/export.h>
+
+#include "rds.h"
+
+struct rds_page_remainder {
+	struct page	*r_page;
+	unsigned long	r_offset;
+};
+
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder,
+				     rds_page_remainders);
+
+/*
+ * returns 0 on success or -errno on failure.
+ *
+ * We don't have to worry about flush_dcache_page() as this only works
+ * with private pages.  If, say, we were to do directed receive to pinned
+ * user pages we'd have to worry more about cache coherence.  (Though
+ * the flush_dcache_page() in get_user_pages() would probably be enough).
+ */
+int rds_page_copy_user(struct page *page, unsigned long offset,
+		       void __user *ptr, unsigned long bytes,
+		       int to_user)
+{
+	unsigned long ret;
+	void *addr;
+
+	addr = kmap(page);
+	if (to_user) {
+		rds_stats_add(s_copy_to_user, bytes);
+		ret = copy_to_user(ptr, addr + offset, bytes);
+	} else {
+		rds_stats_add(s_copy_from_user, bytes);
+		ret = copy_from_user(addr + offset, ptr, bytes);
+	}
+	kunmap(page);
+
+	return ret ? -EFAULT : 0;
+}
+EXPORT_SYMBOL_GPL(rds_page_copy_user);
+
+/**
+ * rds_page_remainder_alloc - build up regions of a message.
+ *
+ * @scat: Scatter list for message
+ * @bytes: the number of bytes needed.
+ * @gfp: the waiting behaviour of the allocation
+ *
+ * @gfp is always ored with __GFP_HIGHMEM.  Callers must be prepared to
+ * kmap the pages, etc.
+ *
+ * If @bytes is at least a full page then this just returns a page from
+ * alloc_page().
+ *
+ * If @bytes is a partial page then this stores the unused region of the
+ * page in a per-cpu structure.  Future partial-page allocations may be
+ * satisfied from that cached region.  This lets us waste less memory on
+ * small allocations with minimal complexity.  It works because the transmit
+ * path passes read-only page regions down to devices.  They hold a page
+ * reference until they are done with the region.
+ */
+int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
+			     gfp_t gfp)
+{
+	struct rds_page_remainder *rem;
+	unsigned long flags;
+	struct page *page;
+	int ret;
+
+	gfp |= __GFP_HIGHMEM;
+
+	/* jump straight to allocation if we're trying for a huge page */
+	if (bytes >= PAGE_SIZE) {
+		page = alloc_page(gfp);
+		if (!page) {
+			ret = -ENOMEM;
+		} else {
+			sg_set_page(scat, page, PAGE_SIZE, 0);
+			ret = 0;
+		}
+		goto out;
+	}
+
+	rem = &per_cpu(rds_page_remainders, get_cpu());
+	local_irq_save(flags);
+
+	while (1) {
+		/* avoid a tiny region getting stuck by tossing it */
+		if (rem->r_page && bytes > (PAGE_SIZE - rem->r_offset)) {
+			rds_stats_inc(s_page_remainder_miss);
+			__free_page(rem->r_page);
+			rem->r_page = NULL;
+		}
+
+		/* hand out a fragment from the cached page */
+		if (rem->r_page && bytes <= (PAGE_SIZE - rem->r_offset)) {
+			sg_set_page(scat, rem->r_page, bytes, rem->r_offset);
+			get_page(sg_page(scat));
+
+			if (rem->r_offset != 0)
+				rds_stats_inc(s_page_remainder_hit);
+
+			rem->r_offset += bytes;
+			if (rem->r_offset == PAGE_SIZE) {
+				__free_page(rem->r_page);
+				rem->r_page = NULL;
+			}
+			ret = 0;
+			break;
+		}
+
+		/* alloc if there is nothing for us to use */
+		local_irq_restore(flags);
+		put_cpu();
+
+		page = alloc_page(gfp);
+
+		rem = &per_cpu(rds_page_remainders, get_cpu());
+		local_irq_save(flags);
+
+		if (!page) {
+			ret = -ENOMEM;
+			break;
+		}
+
+		/* did someone race to fill the remainder before us? */
+		if (rem->r_page) {
+			__free_page(page);
+			continue;
+		}
+
+		/* otherwise install our page and loop around to alloc */
+		rem->r_page = page;
+		rem->r_offset = 0;
+	}
+
+	local_irq_restore(flags);
+	put_cpu();
+out:
+	rdsdebug("bytes %lu ret %d %p %u %u\n", bytes, ret,
+		 ret ? NULL : sg_page(scat), ret ? 0 : scat->offset,
+		 ret ? 0 : scat->length);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(rds_page_remainder_alloc);
+
+static int rds_page_remainder_cpu_notify(struct notifier_block *self,
+					 unsigned long action, void *hcpu)
+{
+	struct rds_page_remainder *rem;
+	long cpu = (long)hcpu;
+
+	rem = &per_cpu(rds_page_remainders, cpu);
+
+	rdsdebug("cpu %ld action 0x%lx\n", cpu, action);
+
+	switch (action) {
+	case CPU_DEAD:
+		if (rem->r_page)
+			__free_page(rem->r_page);
+		rem->r_page = NULL;
+		break;
+	}
+
+	return 0;
+}
+
+static struct notifier_block rds_page_remainder_nb = {
+	.notifier_call = rds_page_remainder_cpu_notify,
+};
+
+void rds_page_exit(void)
+{
+	int i;
+
+	for_each_possible_cpu(i)
+		rds_page_remainder_cpu_notify(&rds_page_remainder_nb,
+					      (unsigned long)CPU_DEAD,
+					      (void *)(long)i);
+}
diff --git a/net/rds/rdma.c b/net/rds/rdma.c
new file mode 100644
index 0000000..40084d8
--- /dev/null
+++ b/net/rds/rdma.c
@@ -0,0 +1,859 @@
+/*
+ * Copyright (c) 2007 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/rbtree.h>
+#include <linux/dma-mapping.h> /* for DMA_*_DEVICE */
+
+#include "rds.h"
+
+/*
+ * XXX
+ *  - build with sparse
+ *  - should we limit the size of a mr region?  let transport return failure?
+ *  - should we detect duplicate keys on a socket?  hmm.
+ *  - an rdma is an mlock, apply rlimit?
+ */
+
+/*
+ * get the number of pages by looking at the page indices that the start and
+ * end addresses fall in.
+ *
+ * Returns 0 if the vec is invalid.  It is invalid if the number of bytes
+ * causes the address to wrap or overflows an unsigned int.  This comes
+ * from being stored in the 'length' member of 'struct scatterlist'.
+ */
+static unsigned int rds_pages_in_vec(struct rds_iovec *vec)
+{
+	if ((vec->addr + vec->bytes <= vec->addr) ||
+	    (vec->bytes > (u64)UINT_MAX))
+		return 0;
+
+	return ((vec->addr + vec->bytes + PAGE_SIZE - 1) >> PAGE_SHIFT) -
+		(vec->addr >> PAGE_SHIFT);
+}
+
+static struct rds_mr *rds_mr_tree_walk(struct rb_root *root, u64 key,
+				       struct rds_mr *insert)
+{
+	struct rb_node **p = &root->rb_node;
+	struct rb_node *parent = NULL;
+	struct rds_mr *mr;
+
+	while (*p) {
+		parent = *p;
+		mr = rb_entry(parent, struct rds_mr, r_rb_node);
+
+		if (key < mr->r_key)
+			p = &(*p)->rb_left;
+		else if (key > mr->r_key)
+			p = &(*p)->rb_right;
+		else
+			return mr;
+	}
+
+	if (insert) {
+		rb_link_node(&insert->r_rb_node, parent, p);
+		rb_insert_color(&insert->r_rb_node, root);
+		atomic_inc(&insert->r_refcount);
+	}
+	return NULL;
+}
+
+/*
+ * Destroy the transport-specific part of a MR.
+ */
+static void rds_destroy_mr(struct rds_mr *mr)
+{
+	struct rds_sock *rs = mr->r_sock;
+	void *trans_private = NULL;
+	unsigned long flags;
+
+	rdsdebug("RDS: destroy mr key is %x refcnt %u\n",
+			mr->r_key, atomic_read(&mr->r_refcount));
+
+	if (test_and_set_bit(RDS_MR_DEAD, &mr->r_state))
+		return;
+
+	spin_lock_irqsave(&rs->rs_rdma_lock, flags);
+	if (!RB_EMPTY_NODE(&mr->r_rb_node))
+		rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
+	trans_private = mr->r_trans_private;
+	mr->r_trans_private = NULL;
+	spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
+
+	if (trans_private)
+		mr->r_trans->free_mr(trans_private, mr->r_invalidate);
+}
+
+void __rds_put_mr_final(struct rds_mr *mr)
+{
+	rds_destroy_mr(mr);
+	kfree(mr);
+}
+
+/*
+ * By the time this is called we can't have any more ioctls called on
+ * the socket so we don't need to worry about racing with others.
+ */
+void rds_rdma_drop_keys(struct rds_sock *rs)
+{
+	struct rds_mr *mr;
+	struct rb_node *node;
+	unsigned long flags;
+
+	/* Release any MRs associated with this socket */
+	spin_lock_irqsave(&rs->rs_rdma_lock, flags);
+	while ((node = rb_first(&rs->rs_rdma_keys))) {
+		mr = container_of(node, struct rds_mr, r_rb_node);
+		if (mr->r_trans == rs->rs_transport)
+			mr->r_invalidate = 0;
+		rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
+		RB_CLEAR_NODE(&mr->r_rb_node);
+		spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
+		rds_destroy_mr(mr);
+		rds_mr_put(mr);
+		spin_lock_irqsave(&rs->rs_rdma_lock, flags);
+	}
+	spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
+
+	if (rs->rs_transport && rs->rs_transport->flush_mrs)
+		rs->rs_transport->flush_mrs();
+}
+
+/*
+ * Helper function to pin user pages.
+ */
+static int rds_pin_pages(unsigned long user_addr, unsigned int nr_pages,
+			struct page **pages, int write)
+{
+	int ret;
+
+	ret = get_user_pages_fast(user_addr, nr_pages, write, pages);
+
+	if (ret >= 0 && ret < nr_pages) {
+		while (ret--)
+			put_page(pages[ret]);
+		ret = -EFAULT;
+	}
+
+	return ret;
+}
+
+static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
+				u64 *cookie_ret, struct rds_mr **mr_ret)
+{
+	struct rds_mr *mr = NULL, *found;
+	unsigned int nr_pages;
+	struct page **pages = NULL;
+	struct scatterlist *sg;
+	void *trans_private;
+	unsigned long flags;
+	rds_rdma_cookie_t cookie;
+	unsigned int nents;
+	long i;
+	int ret;
+
+	if (rs->rs_bound_addr == 0) {
+		ret = -ENOTCONN; /* XXX not a great errno */
+		goto out;
+	}
+
+	if (!rs->rs_transport->get_mr) {
+		ret = -EOPNOTSUPP;
+		goto out;
+	}
+
+	nr_pages = rds_pages_in_vec(&args->vec);
+	if (nr_pages == 0) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	rdsdebug("RDS: get_mr addr %llx len %llu nr_pages %u\n",
+		args->vec.addr, args->vec.bytes, nr_pages);
+
+	/* XXX clamp nr_pages to limit the size of this alloc? */
+	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
+	if (!pages) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	mr = kzalloc(sizeof(struct rds_mr), GFP_KERNEL);
+	if (!mr) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	atomic_set(&mr->r_refcount, 1);
+	RB_CLEAR_NODE(&mr->r_rb_node);
+	mr->r_trans = rs->rs_transport;
+	mr->r_sock = rs;
+
+	if (args->flags & RDS_RDMA_USE_ONCE)
+		mr->r_use_once = 1;
+	if (args->flags & RDS_RDMA_INVALIDATE)
+		mr->r_invalidate = 1;
+	if (args->flags & RDS_RDMA_READWRITE)
+		mr->r_write = 1;
+
+	/*
+	 * Pin the pages that make up the user buffer and transfer the page
+	 * pointers to the mr's sg array.  We check to see if we've mapped
+	 * the whole region after transferring the partial page references
+	 * to the sg array so that we can have one page ref cleanup path.
+	 *
+	 * For now we have no flag that tells us whether the mapping is
+	 * r/o or r/w. We need to assume r/w, or we'll do a lot of RDMA to
+	 * the zero page.
+	 */
+	ret = rds_pin_pages(args->vec.addr, nr_pages, pages, 1);
+	if (ret < 0)
+		goto out;
+
+	nents = ret;
+	sg = kcalloc(nents, sizeof(*sg), GFP_KERNEL);
+	if (!sg) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	WARN_ON(!nents);
+	sg_init_table(sg, nents);
+
+	/* Stick all pages into the scatterlist */
+	for (i = 0 ; i < nents; i++)
+		sg_set_page(&sg[i], pages[i], PAGE_SIZE, 0);
+
+	rdsdebug("RDS: trans_private nents is %u\n", nents);
+
+	/* Obtain a transport specific MR. If this succeeds, the
+	 * s/g list is now owned by the MR.
+	 * Note that dma_map() implies that pending writes are
+	 * flushed to RAM, so no dma_sync is needed here. */
+	trans_private = rs->rs_transport->get_mr(sg, nents, rs,
+						 &mr->r_key);
+
+	if (IS_ERR(trans_private)) {
+		for (i = 0 ; i < nents; i++)
+			put_page(sg_page(&sg[i]));
+		kfree(sg);
+		ret = PTR_ERR(trans_private);
+		goto out;
+	}
+
+	mr->r_trans_private = trans_private;
+
+	rdsdebug("RDS: get_mr put_user key is %x cookie_addr %p\n",
+	       mr->r_key, (void *)(unsigned long) args->cookie_addr);
+
+	/* The user may pass us an unaligned address, but we can only
+	 * map page aligned regions. So we keep the offset, and build
+	 * a 64bit cookie containing <R_Key, offset> and pass that
+	 * around. */
+	cookie = rds_rdma_make_cookie(mr->r_key, args->vec.addr & ~PAGE_MASK);
+	if (cookie_ret)
+		*cookie_ret = cookie;
+
+	if (args->cookie_addr && put_user(cookie, (u64 __user *)(unsigned long) args->cookie_addr)) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	/* Inserting the new MR into the rbtree bumps its
+	 * reference count. */
+	spin_lock_irqsave(&rs->rs_rdma_lock, flags);
+	found = rds_mr_tree_walk(&rs->rs_rdma_keys, mr->r_key, mr);
+	spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
+
+	BUG_ON(found && found != mr);
+
+	rdsdebug("RDS: get_mr key is %x\n", mr->r_key);
+	if (mr_ret) {
+		atomic_inc(&mr->r_refcount);
+		*mr_ret = mr;
+	}
+
+	ret = 0;
+out:
+	kfree(pages);
+	if (mr)
+		rds_mr_put(mr);
+	return ret;
+}
+
+int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen)
+{
+	struct rds_get_mr_args args;
+
+	if (optlen != sizeof(struct rds_get_mr_args))
+		return -EINVAL;
+
+	if (copy_from_user(&args, (struct rds_get_mr_args __user *)optval,
+			   sizeof(struct rds_get_mr_args)))
+		return -EFAULT;
+
+	return __rds_rdma_map(rs, &args, NULL, NULL);
+}
+
+int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen)
+{
+	struct rds_get_mr_for_dest_args args;
+	struct rds_get_mr_args new_args;
+
+	if (optlen != sizeof(struct rds_get_mr_for_dest_args))
+		return -EINVAL;
+
+	if (copy_from_user(&args, (struct rds_get_mr_for_dest_args __user *)optval,
+			   sizeof(struct rds_get_mr_for_dest_args)))
+		return -EFAULT;
+
+	/*
+	 * Initially, just behave like get_mr().
+	 * TODO: Implement get_mr as wrapper around this
+	 *	 and deprecate it.
+	 */
+	new_args.vec = args.vec;
+	new_args.cookie_addr = args.cookie_addr;
+	new_args.flags = args.flags;
+
+	return __rds_rdma_map(rs, &new_args, NULL, NULL);
+}
+
+/*
+ * Free the MR indicated by the given R_Key
+ */
+int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen)
+{
+	struct rds_free_mr_args args;
+	struct rds_mr *mr;
+	unsigned long flags;
+
+	if (optlen != sizeof(struct rds_free_mr_args))
+		return -EINVAL;
+
+	if (copy_from_user(&args, (struct rds_free_mr_args __user *)optval,
+			   sizeof(struct rds_free_mr_args)))
+		return -EFAULT;
+
+	/* Special case - a null cookie means flush all unused MRs */
+	if (args.cookie == 0) {
+		if (!rs->rs_transport || !rs->rs_transport->flush_mrs)
+			return -EINVAL;
+		rs->rs_transport->flush_mrs();
+		return 0;
+	}
+
+	/* Look up the MR given its R_key and remove it from the rbtree
+	 * so nobody else finds it.
+	 * This should also prevent races with rds_rdma_unuse.
+	 */
+	spin_lock_irqsave(&rs->rs_rdma_lock, flags);
+	mr = rds_mr_tree_walk(&rs->rs_rdma_keys, rds_rdma_cookie_key(args.cookie), NULL);
+	if (mr) {
+		rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
+		RB_CLEAR_NODE(&mr->r_rb_node);
+		if (args.flags & RDS_RDMA_INVALIDATE)
+			mr->r_invalidate = 1;
+	}
+	spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
+
+	if (!mr)
+		return -EINVAL;
+
+	/*
+	 * call rds_destroy_mr() ourselves so that we're sure it's done by the time
+	 * we return.  If we let rds_mr_put() do it it might not happen until
+	 * someone else drops their ref.
+	 */
+	rds_destroy_mr(mr);
+	rds_mr_put(mr);
+	return 0;
+}
+
+/*
+ * This is called when we receive an extension header that
+ * tells us this MR was used. It allows us to implement
+ * use_once semantics
+ */
+void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force)
+{
+	struct rds_mr *mr;
+	unsigned long flags;
+	int zot_me = 0;
+
+	spin_lock_irqsave(&rs->rs_rdma_lock, flags);
+	mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL);
+	if (!mr) {
+		printk(KERN_ERR "rds: trying to unuse MR with unknown r_key %u!\n", r_key);
+		spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
+		return;
+	}
+
+	if (mr->r_use_once || force) {
+		rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
+		RB_CLEAR_NODE(&mr->r_rb_node);
+		zot_me = 1;
+	}
+	spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
+
+	/* May have to issue a dma_sync on this memory region.
+	 * Note we could avoid this if the operation was a RDMA READ,
+	 * but at this point we can't tell. */
+	if (mr->r_trans->sync_mr)
+		mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE);
+
+	/* If the MR was marked as invalidate, this will
+	 * trigger an async flush. */
+	if (zot_me)
+		rds_destroy_mr(mr);
+	rds_mr_put(mr);
+}
+
+void rds_rdma_free_op(struct rm_rdma_op *ro)
+{
+	unsigned int i;
+
+	for (i = 0; i < ro->op_nents; i++) {
+		struct page *page = sg_page(&ro->op_sg[i]);
+
+		/* Mark page dirty if it was possibly modified, which
+		 * is the case for a RDMA_READ which copies from remote
+		 * to local memory */
+		if (!ro->op_write) {
+			BUG_ON(irqs_disabled());
+			set_page_dirty(page);
+		}
+		put_page(page);
+	}
+
+	kfree(ro->op_notifier);
+	ro->op_notifier = NULL;
+	ro->op_active = 0;
+}
+
+void rds_atomic_free_op(struct rm_atomic_op *ao)
+{
+	struct page *page = sg_page(ao->op_sg);
+
+	/* Mark page dirty if it was possibly modified, which
+	 * is the case for a RDMA_READ which copies from remote
+	 * to local memory */
+	set_page_dirty(page);
+	put_page(page);
+
+	kfree(ao->op_notifier);
+	ao->op_notifier = NULL;
+	ao->op_active = 0;
+}
+
+
+/*
+ * Count the number of pages needed to describe an incoming iovec array.
+ */
+static int rds_rdma_pages(struct rds_iovec iov[], int nr_iovecs)
+{
+	int tot_pages = 0;
+	unsigned int nr_pages;
+	unsigned int i;
+
+	/* figure out the number of pages in the vector */
+	for (i = 0; i < nr_iovecs; i++) {
+		nr_pages = rds_pages_in_vec(&iov[i]);
+		if (nr_pages == 0)
+			return -EINVAL;
+
+		tot_pages += nr_pages;
+
+		/*
+		 * nr_pages for one entry is limited to (UINT_MAX>>PAGE_SHIFT)+1,
+		 * so tot_pages cannot overflow without first going negative.
+		 */
+		if (tot_pages < 0)
+			return -EINVAL;
+	}
+
+	return tot_pages;
+}
+
+int rds_rdma_extra_size(struct rds_rdma_args *args)
+{
+	struct rds_iovec vec;
+	struct rds_iovec __user *local_vec;
+	int tot_pages = 0;
+	unsigned int nr_pages;
+	unsigned int i;
+
+	local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr;
+
+	/* figure out the number of pages in the vector */
+	for (i = 0; i < args->nr_local; i++) {
+		if (copy_from_user(&vec, &local_vec[i],
+				   sizeof(struct rds_iovec)))
+			return -EFAULT;
+
+		nr_pages = rds_pages_in_vec(&vec);
+		if (nr_pages == 0)
+			return -EINVAL;
+
+		tot_pages += nr_pages;
+
+		/*
+		 * nr_pages for one entry is limited to (UINT_MAX>>PAGE_SHIFT)+1,
+		 * so tot_pages cannot overflow without first going negative.
+		 */
+		if (tot_pages < 0)
+			return -EINVAL;
+	}
+
+	return tot_pages * sizeof(struct scatterlist);
+}
+
+/*
+ * The application asks for a RDMA transfer.
+ * Extract all arguments and set up the rdma_op
+ */
+int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
+			  struct cmsghdr *cmsg)
+{
+	struct rds_rdma_args *args;
+	struct rm_rdma_op *op = &rm->rdma;
+	int nr_pages;
+	unsigned int nr_bytes;
+	struct page **pages = NULL;
+	struct rds_iovec iovstack[UIO_FASTIOV], *iovs = iovstack;
+	int iov_size;
+	unsigned int i, j;
+	int ret = 0;
+
+	if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args))
+	    || rm->rdma.op_active)
+		return -EINVAL;
+
+	args = CMSG_DATA(cmsg);
+
+	if (rs->rs_bound_addr == 0) {
+		ret = -ENOTCONN; /* XXX not a great errno */
+		goto out_ret;
+	}
+
+	if (args->nr_local > UIO_MAXIOV) {
+		ret = -EMSGSIZE;
+		goto out_ret;
+	}
+
+	/* Check whether to allocate the iovec area */
+	iov_size = args->nr_local * sizeof(struct rds_iovec);
+	if (args->nr_local > UIO_FASTIOV) {
+		iovs = sock_kmalloc(rds_rs_to_sk(rs), iov_size, GFP_KERNEL);
+		if (!iovs) {
+			ret = -ENOMEM;
+			goto out_ret;
+		}
+	}
+
+	if (copy_from_user(iovs, (struct rds_iovec __user *)(unsigned long) args->local_vec_addr, iov_size)) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	nr_pages = rds_rdma_pages(iovs, args->nr_local);
+	if (nr_pages < 0) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
+	if (!pages) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	op->op_write = !!(args->flags & RDS_RDMA_READWRITE);
+	op->op_fence = !!(args->flags & RDS_RDMA_FENCE);
+	op->op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
+	op->op_silent = !!(args->flags & RDS_RDMA_SILENT);
+	op->op_active = 1;
+	op->op_recverr = rs->rs_recverr;
+	WARN_ON(!nr_pages);
+	op->op_sg = rds_message_alloc_sgs(rm, nr_pages);
+	if (!op->op_sg) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	if (op->op_notify || op->op_recverr) {
+		/* We allocate an uninitialized notifier here, because
+		 * we don't want to do that in the completion handler. We
+		 * would have to use GFP_ATOMIC there, and don't want to deal
+		 * with failed allocations.
+		 */
+		op->op_notifier = kmalloc(sizeof(struct rds_notifier), GFP_KERNEL);
+		if (!op->op_notifier) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		op->op_notifier->n_user_token = args->user_token;
+		op->op_notifier->n_status = RDS_RDMA_SUCCESS;
+	}
+
+	/* The cookie contains the R_Key of the remote memory region, and
+	 * optionally an offset into it. This is how we implement RDMA into
+	 * unaligned memory.
+	 * When setting up the RDMA, we need to add that offset to the
+	 * destination address (which is really an offset into the MR)
+	 * FIXME: We may want to move this into ib_rdma.c
+	 */
+	op->op_rkey = rds_rdma_cookie_key(args->cookie);
+	op->op_remote_addr = args->remote_vec.addr + rds_rdma_cookie_offset(args->cookie);
+
+	nr_bytes = 0;
+
+	rdsdebug("RDS: rdma prepare nr_local %llu rva %llx rkey %x\n",
+	       (unsigned long long)args->nr_local,
+	       (unsigned long long)args->remote_vec.addr,
+	       op->op_rkey);
+
+	for (i = 0; i < args->nr_local; i++) {
+		struct rds_iovec *iov = &iovs[i];
+		/* don't need to check, rds_rdma_pages() verified nr will be +nonzero */
+		unsigned int nr = rds_pages_in_vec(iov);
+
+		rs->rs_user_addr = iov->addr;
+		rs->rs_user_bytes = iov->bytes;
+
+		/* If it's a WRITE operation, we want to pin the pages for reading.
+		 * If it's a READ operation, we need to pin the pages for writing.
+		 */
+		ret = rds_pin_pages(iov->addr, nr, pages, !op->op_write);
+		if (ret < 0)
+			goto out;
+
+		rdsdebug("RDS: nr_bytes %u nr %u iov->bytes %llu iov->addr %llx\n",
+			 nr_bytes, nr, iov->bytes, iov->addr);
+
+		nr_bytes += iov->bytes;
+
+		for (j = 0; j < nr; j++) {
+			unsigned int offset = iov->addr & ~PAGE_MASK;
+			struct scatterlist *sg;
+
+			sg = &op->op_sg[op->op_nents + j];
+			sg_set_page(sg, pages[j],
+					min_t(unsigned int, iov->bytes, PAGE_SIZE - offset),
+					offset);
+
+			rdsdebug("RDS: sg->offset %x sg->len %x iov->addr %llx iov->bytes %llu\n",
+			       sg->offset, sg->length, iov->addr, iov->bytes);
+
+			iov->addr += sg->length;
+			iov->bytes -= sg->length;
+		}
+
+		op->op_nents += nr;
+	}
+
+	if (nr_bytes > args->remote_vec.bytes) {
+		rdsdebug("RDS nr_bytes %u remote_bytes %u do not match\n",
+				nr_bytes,
+				(unsigned int) args->remote_vec.bytes);
+		ret = -EINVAL;
+		goto out;
+	}
+	op->op_bytes = nr_bytes;
+
+out:
+	if (iovs != iovstack)
+		sock_kfree_s(rds_rs_to_sk(rs), iovs, iov_size);
+	kfree(pages);
+out_ret:
+	if (ret)
+		rds_rdma_free_op(op);
+	else
+		rds_stats_inc(s_send_rdma);
+
+	return ret;
+}
+
+/*
+ * The application wants us to pass an RDMA destination (aka MR)
+ * to the remote
+ */
+int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
+			  struct cmsghdr *cmsg)
+{
+	unsigned long flags;
+	struct rds_mr *mr;
+	u32 r_key;
+	int err = 0;
+
+	if (cmsg->cmsg_len < CMSG_LEN(sizeof(rds_rdma_cookie_t)) ||
+	    rm->m_rdma_cookie != 0)
+		return -EINVAL;
+
+	memcpy(&rm->m_rdma_cookie, CMSG_DATA(cmsg), sizeof(rm->m_rdma_cookie));
+
+	/* We are reusing a previously mapped MR here. Most likely, the
+	 * application has written to the buffer, so we need to explicitly
+	 * flush those writes to RAM. Otherwise the HCA may not see them
+	 * when doing a DMA from that buffer.
+	 */
+	r_key = rds_rdma_cookie_key(rm->m_rdma_cookie);
+
+	spin_lock_irqsave(&rs->rs_rdma_lock, flags);
+	mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL);
+	if (!mr)
+		err = -EINVAL;	/* invalid r_key */
+	else
+		atomic_inc(&mr->r_refcount);
+	spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
+
+	if (mr) {
+		mr->r_trans->sync_mr(mr->r_trans_private, DMA_TO_DEVICE);
+		rm->rdma.op_rdma_mr = mr;
+	}
+	return err;
+}
+
+/*
+ * The application passes us an address range it wants to enable RDMA
+ * to/from. We map the area, and save the <R_Key,offset> pair
+ * in rm->m_rdma_cookie. This causes it to be sent along to the peer
+ * in an extension header.
+ */
+int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm,
+			  struct cmsghdr *cmsg)
+{
+	if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_get_mr_args)) ||
+	    rm->m_rdma_cookie != 0)
+		return -EINVAL;
+
+	return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie, &rm->rdma.op_rdma_mr);
+}
+
+/*
+ * Fill in rds_message for an atomic request.
+ */
+int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm,
+		    struct cmsghdr *cmsg)
+{
+	struct page *page = NULL;
+	struct rds_atomic_args *args;
+	int ret = 0;
+
+	if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_atomic_args))
+	 || rm->atomic.op_active)
+		return -EINVAL;
+
+	args = CMSG_DATA(cmsg);
+
+	/* Nonmasked & masked cmsg ops converted to masked hw ops */
+	switch (cmsg->cmsg_type) {
+	case RDS_CMSG_ATOMIC_FADD:
+		rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD;
+		rm->atomic.op_m_fadd.add = args->fadd.add;
+		rm->atomic.op_m_fadd.nocarry_mask = 0;
+		break;
+	case RDS_CMSG_MASKED_ATOMIC_FADD:
+		rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD;
+		rm->atomic.op_m_fadd.add = args->m_fadd.add;
+		rm->atomic.op_m_fadd.nocarry_mask = args->m_fadd.nocarry_mask;
+		break;
+	case RDS_CMSG_ATOMIC_CSWP:
+		rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP;
+		rm->atomic.op_m_cswp.compare = args->cswp.compare;
+		rm->atomic.op_m_cswp.swap = args->cswp.swap;
+		rm->atomic.op_m_cswp.compare_mask = ~0;
+		rm->atomic.op_m_cswp.swap_mask = ~0;
+		break;
+	case RDS_CMSG_MASKED_ATOMIC_CSWP:
+		rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP;
+		rm->atomic.op_m_cswp.compare = args->m_cswp.compare;
+		rm->atomic.op_m_cswp.swap = args->m_cswp.swap;
+		rm->atomic.op_m_cswp.compare_mask = args->m_cswp.compare_mask;
+		rm->atomic.op_m_cswp.swap_mask = args->m_cswp.swap_mask;
+		break;
+	default:
+		BUG(); /* should never happen */
+	}
+
+	rm->atomic.op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
+	rm->atomic.op_silent = !!(args->flags & RDS_RDMA_SILENT);
+	rm->atomic.op_active = 1;
+	rm->atomic.op_recverr = rs->rs_recverr;
+	rm->atomic.op_sg = rds_message_alloc_sgs(rm, 1);
+	if (!rm->atomic.op_sg) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	/* verify 8 byte-aligned */
+	if (args->local_addr & 0x7) {
+		ret = -EFAULT;
+		goto err;
+	}
+
+	ret = rds_pin_pages(args->local_addr, 1, &page, 1);
+	if (ret != 1)
+		goto err;
+	ret = 0;
+
+	sg_set_page(rm->atomic.op_sg, page, 8, offset_in_page(args->local_addr));
+
+	if (rm->atomic.op_notify || rm->atomic.op_recverr) {
+		/* We allocate an uninitialized notifier here, because
+		 * we don't want to do that in the completion handler. We
+		 * would have to use GFP_ATOMIC there, and don't want to deal
+		 * with failed allocations.
+		 */
+		rm->atomic.op_notifier = kmalloc(sizeof(*rm->atomic.op_notifier), GFP_KERNEL);
+		if (!rm->atomic.op_notifier) {
+			ret = -ENOMEM;
+			goto err;
+		}
+
+		rm->atomic.op_notifier->n_user_token = args->user_token;
+		rm->atomic.op_notifier->n_status = RDS_RDMA_SUCCESS;
+	}
+
+	rm->atomic.op_rkey = rds_rdma_cookie_key(args->cookie);
+	rm->atomic.op_remote_addr = args->remote_addr + rds_rdma_cookie_offset(args->cookie);
+
+	return ret;
+err:
+	if (page)
+		put_page(page);
+	kfree(rm->atomic.op_notifier);
+
+	return ret;
+}
diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c
new file mode 100644
index 0000000..6cd9d1d
--- /dev/null
+++ b/net/rds/rdma_transport.c
@@ -0,0 +1,251 @@
+/*
+ * Copyright (c) 2009 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/module.h>
+#include <rdma/rdma_cm.h>
+
+#include "rdma_transport.h"
+
+static struct rdma_cm_id *rds_rdma_listen_id;
+
+static char *rds_cm_event_strings[] = {
+#define RDS_CM_EVENT_STRING(foo) \
+		[RDMA_CM_EVENT_##foo] = __stringify(RDMA_CM_EVENT_##foo)
+	RDS_CM_EVENT_STRING(ADDR_RESOLVED),
+	RDS_CM_EVENT_STRING(ADDR_ERROR),
+	RDS_CM_EVENT_STRING(ROUTE_RESOLVED),
+	RDS_CM_EVENT_STRING(ROUTE_ERROR),
+	RDS_CM_EVENT_STRING(CONNECT_REQUEST),
+	RDS_CM_EVENT_STRING(CONNECT_RESPONSE),
+	RDS_CM_EVENT_STRING(CONNECT_ERROR),
+	RDS_CM_EVENT_STRING(UNREACHABLE),
+	RDS_CM_EVENT_STRING(REJECTED),
+	RDS_CM_EVENT_STRING(ESTABLISHED),
+	RDS_CM_EVENT_STRING(DISCONNECTED),
+	RDS_CM_EVENT_STRING(DEVICE_REMOVAL),
+	RDS_CM_EVENT_STRING(MULTICAST_JOIN),
+	RDS_CM_EVENT_STRING(MULTICAST_ERROR),
+	RDS_CM_EVENT_STRING(ADDR_CHANGE),
+	RDS_CM_EVENT_STRING(TIMEWAIT_EXIT),
+#undef RDS_CM_EVENT_STRING
+};
+
+static char *rds_cm_event_str(enum rdma_cm_event_type type)
+{
+	return rds_str_array(rds_cm_event_strings,
+			     ARRAY_SIZE(rds_cm_event_strings), type);
+};
+
+int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
+			      struct rdma_cm_event *event)
+{
+	/* this can be null in the listening path */
+	struct rds_connection *conn = cm_id->context;
+	struct rds_transport *trans;
+	int ret = 0;
+
+	rdsdebug("conn %p id %p handling event %u (%s)\n", conn, cm_id,
+		 event->event, rds_cm_event_str(event->event));
+
+	if (cm_id->device->node_type == RDMA_NODE_RNIC)
+		trans = &rds_iw_transport;
+	else
+		trans = &rds_ib_transport;
+
+	/* Prevent shutdown from tearing down the connection
+	 * while we're executing. */
+	if (conn) {
+		mutex_lock(&conn->c_cm_lock);
+
+		/* If the connection is being shut down, bail out
+		 * right away. We return 0 so cm_id doesn't get
+		 * destroyed prematurely */
+		if (rds_conn_state(conn) == RDS_CONN_DISCONNECTING) {
+			/* Reject incoming connections while we're tearing
+			 * down an existing one. */
+			if (event->event == RDMA_CM_EVENT_CONNECT_REQUEST)
+				ret = 1;
+			goto out;
+		}
+	}
+
+	switch (event->event) {
+	case RDMA_CM_EVENT_CONNECT_REQUEST:
+		ret = trans->cm_handle_connect(cm_id, event);
+		break;
+
+	case RDMA_CM_EVENT_ADDR_RESOLVED:
+		/* XXX do we need to clean up if this fails? */
+		ret = rdma_resolve_route(cm_id,
+					 RDS_RDMA_RESOLVE_TIMEOUT_MS);
+		break;
+
+	case RDMA_CM_EVENT_ROUTE_RESOLVED:
+		/* XXX worry about racing with listen acceptance */
+		ret = trans->cm_initiate_connect(cm_id);
+		break;
+
+	case RDMA_CM_EVENT_ESTABLISHED:
+		trans->cm_connect_complete(conn, event);
+		break;
+
+	case RDMA_CM_EVENT_ADDR_ERROR:
+	case RDMA_CM_EVENT_ROUTE_ERROR:
+	case RDMA_CM_EVENT_CONNECT_ERROR:
+	case RDMA_CM_EVENT_UNREACHABLE:
+	case RDMA_CM_EVENT_REJECTED:
+	case RDMA_CM_EVENT_DEVICE_REMOVAL:
+	case RDMA_CM_EVENT_ADDR_CHANGE:
+		if (conn)
+			rds_conn_drop(conn);
+		break;
+
+	case RDMA_CM_EVENT_DISCONNECTED:
+		rdsdebug("DISCONNECT event - dropping connection "
+			"%pI4->%pI4\n", &conn->c_laddr,
+			 &conn->c_faddr);
+		rds_conn_drop(conn);
+		break;
+
+	default:
+		/* things like device disconnect? */
+		printk(KERN_ERR "RDS: unknown event %u (%s)!\n",
+		       event->event, rds_cm_event_str(event->event));
+		break;
+	}
+
+out:
+	if (conn)
+		mutex_unlock(&conn->c_cm_lock);
+
+	rdsdebug("id %p event %u (%s) handling ret %d\n", cm_id, event->event,
+		 rds_cm_event_str(event->event), ret);
+
+	return ret;
+}
+
+static int rds_rdma_listen_init(void)
+{
+	struct sockaddr_in sin;
+	struct rdma_cm_id *cm_id;
+	int ret;
+
+	cm_id = rdma_create_id(rds_rdma_cm_event_handler, NULL, RDMA_PS_TCP,
+			       IB_QPT_RC);
+	if (IS_ERR(cm_id)) {
+		ret = PTR_ERR(cm_id);
+		printk(KERN_ERR "RDS/RDMA: failed to setup listener, "
+		       "rdma_create_id() returned %d\n", ret);
+		return ret;
+	}
+
+	sin.sin_family = AF_INET;
+	sin.sin_addr.s_addr = (__force u32)htonl(INADDR_ANY);
+	sin.sin_port = (__force u16)htons(RDS_PORT);
+
+	/*
+	 * XXX I bet this binds the cm_id to a device.  If we want to support
+	 * fail-over we'll have to take this into consideration.
+	 */
+	ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
+	if (ret) {
+		printk(KERN_ERR "RDS/RDMA: failed to setup listener, "
+		       "rdma_bind_addr() returned %d\n", ret);
+		goto out;
+	}
+
+	ret = rdma_listen(cm_id, 128);
+	if (ret) {
+		printk(KERN_ERR "RDS/RDMA: failed to setup listener, "
+		       "rdma_listen() returned %d\n", ret);
+		goto out;
+	}
+
+	rdsdebug("cm %p listening on port %u\n", cm_id, RDS_PORT);
+
+	rds_rdma_listen_id = cm_id;
+	cm_id = NULL;
+out:
+	if (cm_id)
+		rdma_destroy_id(cm_id);
+	return ret;
+}
+
+static void rds_rdma_listen_stop(void)
+{
+	if (rds_rdma_listen_id) {
+		rdsdebug("cm %p\n", rds_rdma_listen_id);
+		rdma_destroy_id(rds_rdma_listen_id);
+		rds_rdma_listen_id = NULL;
+	}
+}
+
+static int rds_rdma_init(void)
+{
+	int ret;
+
+	ret = rds_rdma_listen_init();
+	if (ret)
+		goto out;
+
+	ret = rds_iw_init();
+	if (ret)
+		goto err_iw_init;
+
+	ret = rds_ib_init();
+	if (ret)
+		goto err_ib_init;
+
+	goto out;
+
+err_ib_init:
+	rds_iw_exit();
+err_iw_init:
+	rds_rdma_listen_stop();
+out:
+	return ret;
+}
+module_init(rds_rdma_init);
+
+static void rds_rdma_exit(void)
+{
+	/* stop listening first to ensure no new connections are attempted */
+	rds_rdma_listen_stop();
+	rds_ib_exit();
+	rds_iw_exit();
+}
+module_exit(rds_rdma_exit);
+
+MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>");
+MODULE_DESCRIPTION("RDS: IB/iWARP transport");
+MODULE_LICENSE("Dual BSD/GPL");
+
diff --git a/net/rds/rdma_transport.h b/net/rds/rdma_transport.h
new file mode 100644
index 0000000..faba4e3
--- /dev/null
+++ b/net/rds/rdma_transport.h
@@ -0,0 +1,24 @@
+#ifndef _RDMA_TRANSPORT_H
+#define _RDMA_TRANSPORT_H
+
+#include <rdma/ib_verbs.h>
+#include <rdma/rdma_cm.h>
+#include "rds.h"
+
+#define RDS_RDMA_RESOLVE_TIMEOUT_MS     5000
+
+int rds_rdma_conn_connect(struct rds_connection *conn);
+int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
+			      struct rdma_cm_event *event);
+
+/* from ib.c */
+extern struct rds_transport rds_ib_transport;
+int rds_ib_init(void);
+void rds_ib_exit(void);
+
+/* from iw.c */
+extern struct rds_transport rds_iw_transport;
+int rds_iw_init(void);
+void rds_iw_exit(void);
+
+#endif
diff --git a/net/rds/rds.h b/net/rds/rds.h
new file mode 100644
index 0000000..48f8ffc
--- /dev/null
+++ b/net/rds/rds.h
@@ -0,0 +1,812 @@
+#ifndef _RDS_RDS_H
+#define _RDS_RDS_H
+
+#include <net/sock.h>
+#include <linux/scatterlist.h>
+#include <linux/highmem.h>
+#include <rdma/rdma_cm.h>
+#include <linux/mutex.h>
+#include <linux/rds.h>
+
+#include "info.h"
+
+/*
+ * RDS Network protocol version
+ */
+#define RDS_PROTOCOL_3_0	0x0300
+#define RDS_PROTOCOL_3_1	0x0301
+#define RDS_PROTOCOL_VERSION	RDS_PROTOCOL_3_1
+#define RDS_PROTOCOL_MAJOR(v)	((v) >> 8)
+#define RDS_PROTOCOL_MINOR(v)	((v) & 255)
+#define RDS_PROTOCOL(maj, min)	(((maj) << 8) | min)
+
+/*
+ * XXX randomly chosen, but at least seems to be unused:
+ * #               18464-18768 Unassigned
+ * We should do better.  We want a reserved port to discourage unpriv'ed
+ * userspace from listening.
+ */
+#define RDS_PORT	18634
+
+#ifdef ATOMIC64_INIT
+#define KERNEL_HAS_ATOMIC64
+#endif
+
+#ifdef DEBUG
+#define rdsdebug(fmt, args...) pr_debug("%s(): " fmt, __func__ , ##args)
+#else
+/* sigh, pr_debug() causes unused variable warnings */
+static inline __printf(1, 2)
+void rdsdebug(char *fmt, ...)
+{
+}
+#endif
+
+/* XXX is there one of these somewhere? */
+#define ceil(x, y) \
+	({ unsigned long __x = (x), __y = (y); (__x + __y - 1) / __y; })
+
+#define RDS_FRAG_SHIFT	12
+#define RDS_FRAG_SIZE	((unsigned int)(1 << RDS_FRAG_SHIFT))
+
+#define RDS_CONG_MAP_BYTES	(65536 / 8)
+#define RDS_CONG_MAP_PAGES	(PAGE_ALIGN(RDS_CONG_MAP_BYTES) / PAGE_SIZE)
+#define RDS_CONG_MAP_PAGE_BITS	(PAGE_SIZE * 8)
+
+struct rds_cong_map {
+	struct rb_node		m_rb_node;
+	__be32			m_addr;
+	wait_queue_head_t	m_waitq;
+	struct list_head	m_conn_list;
+	unsigned long		m_page_addrs[RDS_CONG_MAP_PAGES];
+};
+
+
+/*
+ * This is how we will track the connection state:
+ * A connection is always in one of the following
+ * states. Updates to the state are atomic and imply
+ * a memory barrier.
+ */
+enum {
+	RDS_CONN_DOWN = 0,
+	RDS_CONN_CONNECTING,
+	RDS_CONN_DISCONNECTING,
+	RDS_CONN_UP,
+	RDS_CONN_ERROR,
+};
+
+/* Bits for c_flags */
+#define RDS_LL_SEND_FULL	0
+#define RDS_RECONNECT_PENDING	1
+#define RDS_IN_XMIT		2
+
+struct rds_connection {
+	struct hlist_node	c_hash_node;
+	__be32			c_laddr;
+	__be32			c_faddr;
+	unsigned int		c_loopback:1;
+	struct rds_connection	*c_passive;
+
+	struct rds_cong_map	*c_lcong;
+	struct rds_cong_map	*c_fcong;
+
+	struct rds_message	*c_xmit_rm;
+	unsigned long		c_xmit_sg;
+	unsigned int		c_xmit_hdr_off;
+	unsigned int		c_xmit_data_off;
+	unsigned int		c_xmit_atomic_sent;
+	unsigned int		c_xmit_rdma_sent;
+	unsigned int		c_xmit_data_sent;
+
+	spinlock_t		c_lock;		/* protect msg queues */
+	u64			c_next_tx_seq;
+	struct list_head	c_send_queue;
+	struct list_head	c_retrans;
+
+	u64			c_next_rx_seq;
+
+	struct rds_transport	*c_trans;
+	void			*c_transport_data;
+
+	atomic_t		c_state;
+	unsigned long		c_flags;
+	unsigned long		c_reconnect_jiffies;
+	struct delayed_work	c_send_w;
+	struct delayed_work	c_recv_w;
+	struct delayed_work	c_conn_w;
+	struct work_struct	c_down_w;
+	struct mutex		c_cm_lock;	/* protect conn state & cm */
+	wait_queue_head_t	c_waitq;
+
+	struct list_head	c_map_item;
+	unsigned long		c_map_queued;
+
+	unsigned int		c_unacked_packets;
+	unsigned int		c_unacked_bytes;
+
+	/* Protocol version */
+	unsigned int		c_version;
+};
+
+#define RDS_FLAG_CONG_BITMAP	0x01
+#define RDS_FLAG_ACK_REQUIRED	0x02
+#define RDS_FLAG_RETRANSMITTED	0x04
+#define RDS_MAX_ADV_CREDIT	255
+
+/*
+ * Maximum space available for extension headers.
+ */
+#define RDS_HEADER_EXT_SPACE	16
+
+struct rds_header {
+	__be64	h_sequence;
+	__be64	h_ack;
+	__be32	h_len;
+	__be16	h_sport;
+	__be16	h_dport;
+	u8	h_flags;
+	u8	h_credit;
+	u8	h_padding[4];
+	__sum16	h_csum;
+
+	u8	h_exthdr[RDS_HEADER_EXT_SPACE];
+};
+
+/*
+ * Reserved - indicates end of extensions
+ */
+#define RDS_EXTHDR_NONE		0
+
+/*
+ * This extension header is included in the very
+ * first message that is sent on a new connection,
+ * and identifies the protocol level. This will help
+ * rolling updates if a future change requires breaking
+ * the protocol.
+ * NB: This is no longer true for IB, where we do a version
+ * negotiation during the connection setup phase (protocol
+ * version information is included in the RDMA CM private data).
+ */
+#define RDS_EXTHDR_VERSION	1
+struct rds_ext_header_version {
+	__be32			h_version;
+};
+
+/*
+ * This extension header is included in the RDS message
+ * chasing an RDMA operation.
+ */
+#define RDS_EXTHDR_RDMA		2
+struct rds_ext_header_rdma {
+	__be32			h_rdma_rkey;
+};
+
+/*
+ * This extension header tells the peer about the
+ * destination <R_Key,offset> of the requested RDMA
+ * operation.
+ */
+#define RDS_EXTHDR_RDMA_DEST	3
+struct rds_ext_header_rdma_dest {
+	__be32			h_rdma_rkey;
+	__be32			h_rdma_offset;
+};
+
+#define __RDS_EXTHDR_MAX	16 /* for now */
+
+struct rds_incoming {
+	atomic_t		i_refcount;
+	struct list_head	i_item;
+	struct rds_connection	*i_conn;
+	struct rds_header	i_hdr;
+	unsigned long		i_rx_jiffies;
+	__be32			i_saddr;
+
+	rds_rdma_cookie_t	i_rdma_cookie;
+};
+
+struct rds_mr {
+	struct rb_node		r_rb_node;
+	atomic_t		r_refcount;
+	u32			r_key;
+
+	/* A copy of the creation flags */
+	unsigned int		r_use_once:1;
+	unsigned int		r_invalidate:1;
+	unsigned int		r_write:1;
+
+	/* This is for RDS_MR_DEAD.
+	 * It would be nice & consistent to make this part of the above
+	 * bit field here, but we need to use test_and_set_bit.
+	 */
+	unsigned long		r_state;
+	struct rds_sock		*r_sock; /* back pointer to the socket that owns us */
+	struct rds_transport	*r_trans;
+	void			*r_trans_private;
+};
+
+/* Flags for mr->r_state */
+#define RDS_MR_DEAD		0
+
+static inline rds_rdma_cookie_t rds_rdma_make_cookie(u32 r_key, u32 offset)
+{
+	return r_key | (((u64) offset) << 32);
+}
+
+static inline u32 rds_rdma_cookie_key(rds_rdma_cookie_t cookie)
+{
+	return cookie;
+}
+
+static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie)
+{
+	return cookie >> 32;
+}
+
+/* atomic operation types */
+#define RDS_ATOMIC_TYPE_CSWP		0
+#define RDS_ATOMIC_TYPE_FADD		1
+
+/*
+ * m_sock_item and m_conn_item are on lists that are serialized under
+ * conn->c_lock.  m_sock_item has additional meaning in that once it is empty
+ * the message will not be put back on the retransmit list after being sent.
+ * messages that are canceled while being sent rely on this.
+ *
+ * m_inc is used by loopback so that it can pass an incoming message straight
+ * back up into the rx path.  It embeds a wire header which is also used by
+ * the send path, which is kind of awkward.
+ *
+ * m_sock_item indicates the message's presence on a socket's send or receive
+ * queue.  m_rs will point to that socket.
+ *
+ * m_daddr is used by cancellation to prune messages to a given destination.
+ *
+ * The RDS_MSG_ON_SOCK and RDS_MSG_ON_CONN flags are used to avoid lock
+ * nesting.  As paths iterate over messages on a sock, or conn, they must
+ * also lock the conn, or sock, to remove the message from those lists too.
+ * Testing the flag to determine if the message is still on the lists lets
+ * us avoid testing the list_head directly.  That means each path can use
+ * the message's list_head to keep it on a local list while juggling locks
+ * without confusing the other path.
+ *
+ * m_ack_seq is an optional field set by transports who need a different
+ * sequence number range to invalidate.  They can use this in a callback
+ * that they pass to rds_send_drop_acked() to see if each message has been
+ * acked.  The HAS_ACK_SEQ flag can be used to detect messages which haven't
+ * had ack_seq set yet.
+ */
+#define RDS_MSG_ON_SOCK		1
+#define RDS_MSG_ON_CONN		2
+#define RDS_MSG_HAS_ACK_SEQ	3
+#define RDS_MSG_ACK_REQUIRED	4
+#define RDS_MSG_RETRANSMITTED	5
+#define RDS_MSG_MAPPED		6
+#define RDS_MSG_PAGEVEC		7
+
+struct rds_message {
+	atomic_t		m_refcount;
+	struct list_head	m_sock_item;
+	struct list_head	m_conn_item;
+	struct rds_incoming	m_inc;
+	u64			m_ack_seq;
+	__be32			m_daddr;
+	unsigned long		m_flags;
+
+	/* Never access m_rs without holding m_rs_lock.
+	 * Lock nesting is
+	 *  rm->m_rs_lock
+	 *   -> rs->rs_lock
+	 */
+	spinlock_t		m_rs_lock;
+	wait_queue_head_t	m_flush_wait;
+
+	struct rds_sock		*m_rs;
+
+	/* cookie to send to remote, in rds header */
+	rds_rdma_cookie_t	m_rdma_cookie;
+
+	unsigned int		m_used_sgs;
+	unsigned int		m_total_sgs;
+
+	void			*m_final_op;
+
+	struct {
+		struct rm_atomic_op {
+			int			op_type;
+			union {
+				struct {
+					uint64_t	compare;
+					uint64_t	swap;
+					uint64_t	compare_mask;
+					uint64_t	swap_mask;
+				} op_m_cswp;
+				struct {
+					uint64_t	add;
+					uint64_t	nocarry_mask;
+				} op_m_fadd;
+			};
+
+			u32			op_rkey;
+			u64			op_remote_addr;
+			unsigned int		op_notify:1;
+			unsigned int		op_recverr:1;
+			unsigned int		op_mapped:1;
+			unsigned int		op_silent:1;
+			unsigned int		op_active:1;
+			struct scatterlist	*op_sg;
+			struct rds_notifier	*op_notifier;
+
+			struct rds_mr		*op_rdma_mr;
+		} atomic;
+		struct rm_rdma_op {
+			u32			op_rkey;
+			u64			op_remote_addr;
+			unsigned int		op_write:1;
+			unsigned int		op_fence:1;
+			unsigned int		op_notify:1;
+			unsigned int		op_recverr:1;
+			unsigned int		op_mapped:1;
+			unsigned int		op_silent:1;
+			unsigned int		op_active:1;
+			unsigned int		op_bytes;
+			unsigned int		op_nents;
+			unsigned int		op_count;
+			struct scatterlist	*op_sg;
+			struct rds_notifier	*op_notifier;
+
+			struct rds_mr		*op_rdma_mr;
+		} rdma;
+		struct rm_data_op {
+			unsigned int		op_active:1;
+			unsigned int		op_nents;
+			unsigned int		op_count;
+			struct scatterlist	*op_sg;
+		} data;
+	};
+};
+
+/*
+ * The RDS notifier is used (optionally) to tell the application about
+ * completed RDMA operations. Rather than keeping the whole rds message
+ * around on the queue, we allocate a small notifier that is put on the
+ * socket's notifier_list. Notifications are delivered to the application
+ * through control messages.
+ */
+struct rds_notifier {
+	struct list_head	n_list;
+	uint64_t		n_user_token;
+	int			n_status;
+};
+
+/**
+ * struct rds_transport -  transport specific behavioural hooks
+ *
+ * @xmit: .xmit is called by rds_send_xmit() to tell the transport to send
+ *        part of a message.  The caller serializes on the send_sem so this
+ *        doesn't need to be reentrant for a given conn.  The header must be
+ *        sent before the data payload.  .xmit must be prepared to send a
+ *        message with no data payload.  .xmit should return the number of
+ *        bytes that were sent down the connection, including header bytes.
+ *        Returning 0 tells the caller that it doesn't need to perform any
+ *        additional work now.  This is usually the case when the transport has
+ *        filled the sending queue for its connection and will handle
+ *        triggering the rds thread to continue the send when space becomes
+ *        available.  Returning -EAGAIN tells the caller to retry the send
+ *        immediately.  Returning -ENOMEM tells the caller to retry the send at
+ *        some point in the future.
+ *
+ * @conn_shutdown: conn_shutdown stops traffic on the given connection.  Once
+ *                 it returns the connection can not call rds_recv_incoming().
+ *                 This will only be called once after conn_connect returns
+ *                 non-zero success and will The caller serializes this with
+ *                 the send and connecting paths (xmit_* and conn_*).  The
+ *                 transport is responsible for other serialization, including
+ *                 rds_recv_incoming().  This is called in process context but
+ *                 should try hard not to block.
+ */
+
+#define RDS_TRANS_IB	0
+#define RDS_TRANS_IWARP	1
+#define RDS_TRANS_TCP	2
+#define RDS_TRANS_COUNT	3
+
+struct rds_transport {
+	char			t_name[TRANSNAMSIZ];
+	struct list_head	t_item;
+	struct module		*t_owner;
+	unsigned int		t_prefer_loopback:1;
+	unsigned int		t_type;
+
+	int (*laddr_check)(__be32 addr);
+	int (*conn_alloc)(struct rds_connection *conn, gfp_t gfp);
+	void (*conn_free)(void *data);
+	int (*conn_connect)(struct rds_connection *conn);
+	void (*conn_shutdown)(struct rds_connection *conn);
+	void (*xmit_prepare)(struct rds_connection *conn);
+	void (*xmit_complete)(struct rds_connection *conn);
+	int (*xmit)(struct rds_connection *conn, struct rds_message *rm,
+		    unsigned int hdr_off, unsigned int sg, unsigned int off);
+	int (*xmit_rdma)(struct rds_connection *conn, struct rm_rdma_op *op);
+	int (*xmit_atomic)(struct rds_connection *conn, struct rm_atomic_op *op);
+	int (*recv)(struct rds_connection *conn);
+	int (*inc_copy_to_user)(struct rds_incoming *inc, struct iovec *iov,
+				size_t size);
+	void (*inc_free)(struct rds_incoming *inc);
+
+	int (*cm_handle_connect)(struct rdma_cm_id *cm_id,
+				 struct rdma_cm_event *event);
+	int (*cm_initiate_connect)(struct rdma_cm_id *cm_id);
+	void (*cm_connect_complete)(struct rds_connection *conn,
+				    struct rdma_cm_event *event);
+
+	unsigned int (*stats_info_copy)(struct rds_info_iterator *iter,
+					unsigned int avail);
+	void (*exit)(void);
+	void *(*get_mr)(struct scatterlist *sg, unsigned long nr_sg,
+			struct rds_sock *rs, u32 *key_ret);
+	void (*sync_mr)(void *trans_private, int direction);
+	void (*free_mr)(void *trans_private, int invalidate);
+	void (*flush_mrs)(void);
+};
+
+struct rds_sock {
+	struct sock		rs_sk;
+
+	u64			rs_user_addr;
+	u64			rs_user_bytes;
+
+	/*
+	 * bound_addr used for both incoming and outgoing, no INADDR_ANY
+	 * support.
+	 */
+	struct hlist_node	rs_bound_node;
+	__be32			rs_bound_addr;
+	__be32			rs_conn_addr;
+	__be16			rs_bound_port;
+	__be16			rs_conn_port;
+	struct rds_transport    *rs_transport;
+
+	/*
+	 * rds_sendmsg caches the conn it used the last time around.
+	 * This helps avoid costly lookups.
+	 */
+	struct rds_connection	*rs_conn;
+
+	/* flag indicating we were congested or not */
+	int			rs_congested;
+	/* seen congestion (ENOBUFS) when sending? */
+	int			rs_seen_congestion;
+
+	/* rs_lock protects all these adjacent members before the newline */
+	spinlock_t		rs_lock;
+	struct list_head	rs_send_queue;
+	u32			rs_snd_bytes;
+	int			rs_rcv_bytes;
+	struct list_head	rs_notify_queue;	/* currently used for failed RDMAs */
+
+	/* Congestion wake_up. If rs_cong_monitor is set, we use cong_mask
+	 * to decide whether the application should be woken up.
+	 * If not set, we use rs_cong_track to find out whether a cong map
+	 * update arrived.
+	 */
+	uint64_t		rs_cong_mask;
+	uint64_t		rs_cong_notify;
+	struct list_head	rs_cong_list;
+	unsigned long		rs_cong_track;
+
+	/*
+	 * rs_recv_lock protects the receive queue, and is
+	 * used to serialize with rds_release.
+	 */
+	rwlock_t		rs_recv_lock;
+	struct list_head	rs_recv_queue;
+
+	/* just for stats reporting */
+	struct list_head	rs_item;
+
+	/* these have their own lock */
+	spinlock_t		rs_rdma_lock;
+	struct rb_root		rs_rdma_keys;
+
+	/* Socket options - in case there will be more */
+	unsigned char		rs_recverr,
+				rs_cong_monitor;
+};
+
+static inline struct rds_sock *rds_sk_to_rs(const struct sock *sk)
+{
+	return container_of(sk, struct rds_sock, rs_sk);
+}
+static inline struct sock *rds_rs_to_sk(struct rds_sock *rs)
+{
+	return &rs->rs_sk;
+}
+
+/*
+ * The stack assigns sk_sndbuf and sk_rcvbuf to twice the specified value
+ * to account for overhead.  We don't account for overhead, we just apply
+ * the number of payload bytes to the specified value.
+ */
+static inline int rds_sk_sndbuf(struct rds_sock *rs)
+{
+	return rds_rs_to_sk(rs)->sk_sndbuf / 2;
+}
+static inline int rds_sk_rcvbuf(struct rds_sock *rs)
+{
+	return rds_rs_to_sk(rs)->sk_rcvbuf / 2;
+}
+
+struct rds_statistics {
+	uint64_t	s_conn_reset;
+	uint64_t	s_recv_drop_bad_checksum;
+	uint64_t	s_recv_drop_old_seq;
+	uint64_t	s_recv_drop_no_sock;
+	uint64_t	s_recv_drop_dead_sock;
+	uint64_t	s_recv_deliver_raced;
+	uint64_t	s_recv_delivered;
+	uint64_t	s_recv_queued;
+	uint64_t	s_recv_immediate_retry;
+	uint64_t	s_recv_delayed_retry;
+	uint64_t	s_recv_ack_required;
+	uint64_t	s_recv_rdma_bytes;
+	uint64_t	s_recv_ping;
+	uint64_t	s_send_queue_empty;
+	uint64_t	s_send_queue_full;
+	uint64_t	s_send_lock_contention;
+	uint64_t	s_send_lock_queue_raced;
+	uint64_t	s_send_immediate_retry;
+	uint64_t	s_send_delayed_retry;
+	uint64_t	s_send_drop_acked;
+	uint64_t	s_send_ack_required;
+	uint64_t	s_send_queued;
+	uint64_t	s_send_rdma;
+	uint64_t	s_send_rdma_bytes;
+	uint64_t	s_send_pong;
+	uint64_t	s_page_remainder_hit;
+	uint64_t	s_page_remainder_miss;
+	uint64_t	s_copy_to_user;
+	uint64_t	s_copy_from_user;
+	uint64_t	s_cong_update_queued;
+	uint64_t	s_cong_update_received;
+	uint64_t	s_cong_send_error;
+	uint64_t	s_cong_send_blocked;
+};
+
+/* af_rds.c */
+char *rds_str_array(char **array, size_t elements, size_t index);
+void rds_sock_addref(struct rds_sock *rs);
+void rds_sock_put(struct rds_sock *rs);
+void rds_wake_sk_sleep(struct rds_sock *rs);
+static inline void __rds_wake_sk_sleep(struct sock *sk)
+{
+	wait_queue_head_t *waitq = sk_sleep(sk);
+
+	if (!sock_flag(sk, SOCK_DEAD) && waitq)
+		wake_up(waitq);
+}
+extern wait_queue_head_t rds_poll_waitq;
+
+
+/* bind.c */
+int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len);
+void rds_remove_bound(struct rds_sock *rs);
+struct rds_sock *rds_find_bound(__be32 addr, __be16 port);
+
+/* cong.c */
+int rds_cong_get_maps(struct rds_connection *conn);
+void rds_cong_add_conn(struct rds_connection *conn);
+void rds_cong_remove_conn(struct rds_connection *conn);
+void rds_cong_set_bit(struct rds_cong_map *map, __be16 port);
+void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port);
+int rds_cong_wait(struct rds_cong_map *map, __be16 port, int nonblock, struct rds_sock *rs);
+void rds_cong_queue_updates(struct rds_cong_map *map);
+void rds_cong_map_updated(struct rds_cong_map *map, uint64_t);
+int rds_cong_updated_since(unsigned long *recent);
+void rds_cong_add_socket(struct rds_sock *);
+void rds_cong_remove_socket(struct rds_sock *);
+void rds_cong_exit(void);
+struct rds_message *rds_cong_update_alloc(struct rds_connection *conn);
+
+/* conn.c */
+int rds_conn_init(void);
+void rds_conn_exit(void);
+struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr,
+				       struct rds_transport *trans, gfp_t gfp);
+struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr,
+			       struct rds_transport *trans, gfp_t gfp);
+void rds_conn_shutdown(struct rds_connection *conn);
+void rds_conn_destroy(struct rds_connection *conn);
+void rds_conn_drop(struct rds_connection *conn);
+void rds_conn_connect_if_down(struct rds_connection *conn);
+void rds_for_each_conn_info(struct socket *sock, unsigned int len,
+			  struct rds_info_iterator *iter,
+			  struct rds_info_lengths *lens,
+			  int (*visitor)(struct rds_connection *, void *),
+			  size_t item_len);
+__printf(2, 3)
+void __rds_conn_error(struct rds_connection *conn, const char *, ...);
+#define rds_conn_error(conn, fmt...) \
+	__rds_conn_error(conn, KERN_WARNING "RDS: " fmt)
+
+static inline int
+rds_conn_transition(struct rds_connection *conn, int old, int new)
+{
+	return atomic_cmpxchg(&conn->c_state, old, new) == old;
+}
+
+static inline int
+rds_conn_state(struct rds_connection *conn)
+{
+	return atomic_read(&conn->c_state);
+}
+
+static inline int
+rds_conn_up(struct rds_connection *conn)
+{
+	return atomic_read(&conn->c_state) == RDS_CONN_UP;
+}
+
+static inline int
+rds_conn_connecting(struct rds_connection *conn)
+{
+	return atomic_read(&conn->c_state) == RDS_CONN_CONNECTING;
+}
+
+/* message.c */
+struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp);
+struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents);
+int rds_message_copy_from_user(struct rds_message *rm, struct iovec *first_iov,
+					       size_t total_len);
+struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len);
+void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
+				 __be16 dport, u64 seq);
+int rds_message_add_extension(struct rds_header *hdr,
+			      unsigned int type, const void *data, unsigned int len);
+int rds_message_next_extension(struct rds_header *hdr,
+			       unsigned int *pos, void *buf, unsigned int *buflen);
+int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset);
+int rds_message_inc_copy_to_user(struct rds_incoming *inc,
+				 struct iovec *first_iov, size_t size);
+void rds_message_inc_free(struct rds_incoming *inc);
+void rds_message_addref(struct rds_message *rm);
+void rds_message_put(struct rds_message *rm);
+void rds_message_wait(struct rds_message *rm);
+void rds_message_unmapped(struct rds_message *rm);
+
+static inline void rds_message_make_checksum(struct rds_header *hdr)
+{
+	hdr->h_csum = 0;
+	hdr->h_csum = ip_fast_csum((void *) hdr, sizeof(*hdr) >> 2);
+}
+
+static inline int rds_message_verify_checksum(const struct rds_header *hdr)
+{
+	return !hdr->h_csum || ip_fast_csum((void *) hdr, sizeof(*hdr) >> 2) == 0;
+}
+
+
+/* page.c */
+int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
+			     gfp_t gfp);
+int rds_page_copy_user(struct page *page, unsigned long offset,
+		       void __user *ptr, unsigned long bytes,
+		       int to_user);
+#define rds_page_copy_to_user(page, offset, ptr, bytes) \
+	rds_page_copy_user(page, offset, ptr, bytes, 1)
+#define rds_page_copy_from_user(page, offset, ptr, bytes) \
+	rds_page_copy_user(page, offset, ptr, bytes, 0)
+void rds_page_exit(void);
+
+/* recv.c */
+void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
+		  __be32 saddr);
+void rds_inc_put(struct rds_incoming *inc);
+void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
+		       struct rds_incoming *inc, gfp_t gfp);
+int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
+		size_t size, int msg_flags);
+void rds_clear_recv_queue(struct rds_sock *rs);
+int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msg);
+void rds_inc_info_copy(struct rds_incoming *inc,
+		       struct rds_info_iterator *iter,
+		       __be32 saddr, __be32 daddr, int flip);
+
+/* send.c */
+int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
+		size_t payload_len);
+void rds_send_reset(struct rds_connection *conn);
+int rds_send_xmit(struct rds_connection *conn);
+struct sockaddr_in;
+void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest);
+typedef int (*is_acked_func)(struct rds_message *rm, uint64_t ack);
+void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
+			 is_acked_func is_acked);
+int rds_send_pong(struct rds_connection *conn, __be16 dport);
+struct rds_message *rds_send_get_message(struct rds_connection *,
+					 struct rm_rdma_op *);
+
+/* rdma.c */
+void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force);
+int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen);
+int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen);
+int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen);
+void rds_rdma_drop_keys(struct rds_sock *rs);
+int rds_rdma_extra_size(struct rds_rdma_args *args);
+int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
+			  struct cmsghdr *cmsg);
+int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
+			  struct cmsghdr *cmsg);
+int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
+			  struct cmsghdr *cmsg);
+int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm,
+			  struct cmsghdr *cmsg);
+void rds_rdma_free_op(struct rm_rdma_op *ro);
+void rds_atomic_free_op(struct rm_atomic_op *ao);
+void rds_rdma_send_complete(struct rds_message *rm, int wc_status);
+void rds_atomic_send_complete(struct rds_message *rm, int wc_status);
+int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm,
+		    struct cmsghdr *cmsg);
+
+void __rds_put_mr_final(struct rds_mr *mr);
+static inline void rds_mr_put(struct rds_mr *mr)
+{
+	if (atomic_dec_and_test(&mr->r_refcount))
+		__rds_put_mr_final(mr);
+}
+
+/* stats.c */
+DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats);
+#define rds_stats_inc_which(which, member) do {		\
+	per_cpu(which, get_cpu()).member++;		\
+	put_cpu();					\
+} while (0)
+#define rds_stats_inc(member) rds_stats_inc_which(rds_stats, member)
+#define rds_stats_add_which(which, member, count) do {		\
+	per_cpu(which, get_cpu()).member += count;	\
+	put_cpu();					\
+} while (0)
+#define rds_stats_add(member, count) rds_stats_add_which(rds_stats, member, count)
+int rds_stats_init(void);
+void rds_stats_exit(void);
+void rds_stats_info_copy(struct rds_info_iterator *iter,
+			 uint64_t *values, const char *const *names,
+			 size_t nr);
+
+/* sysctl.c */
+int rds_sysctl_init(void);
+void rds_sysctl_exit(void);
+extern unsigned long rds_sysctl_sndbuf_min;
+extern unsigned long rds_sysctl_sndbuf_default;
+extern unsigned long rds_sysctl_sndbuf_max;
+extern unsigned long rds_sysctl_reconnect_min_jiffies;
+extern unsigned long rds_sysctl_reconnect_max_jiffies;
+extern unsigned int  rds_sysctl_max_unacked_packets;
+extern unsigned int  rds_sysctl_max_unacked_bytes;
+extern unsigned int  rds_sysctl_ping_enable;
+extern unsigned long rds_sysctl_trace_flags;
+extern unsigned int  rds_sysctl_trace_level;
+
+/* threads.c */
+int rds_threads_init(void);
+void rds_threads_exit(void);
+extern struct workqueue_struct *rds_wq;
+void rds_queue_reconnect(struct rds_connection *conn);
+void rds_connect_worker(struct work_struct *);
+void rds_shutdown_worker(struct work_struct *);
+void rds_send_worker(struct work_struct *);
+void rds_recv_worker(struct work_struct *);
+void rds_connect_complete(struct rds_connection *conn);
+
+/* transport.c */
+int rds_trans_register(struct rds_transport *trans);
+void rds_trans_unregister(struct rds_transport *trans);
+struct rds_transport *rds_trans_get_preferred(__be32 addr);
+void rds_trans_put(struct rds_transport *trans);
+unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter,
+				       unsigned int avail);
+int rds_trans_init(void);
+void rds_trans_exit(void);
+
+#endif
diff --git a/net/rds/recv.c b/net/rds/recv.c
new file mode 100644
index 0000000..bd82522
--- /dev/null
+++ b/net/rds/recv.c
@@ -0,0 +1,547 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <linux/in.h>
+#include <linux/export.h>
+
+#include "rds.h"
+
+void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
+		  __be32 saddr)
+{
+	atomic_set(&inc->i_refcount, 1);
+	INIT_LIST_HEAD(&inc->i_item);
+	inc->i_conn = conn;
+	inc->i_saddr = saddr;
+	inc->i_rdma_cookie = 0;
+}
+EXPORT_SYMBOL_GPL(rds_inc_init);
+
+static void rds_inc_addref(struct rds_incoming *inc)
+{
+	rdsdebug("addref inc %p ref %d\n", inc, atomic_read(&inc->i_refcount));
+	atomic_inc(&inc->i_refcount);
+}
+
+void rds_inc_put(struct rds_incoming *inc)
+{
+	rdsdebug("put inc %p ref %d\n", inc, atomic_read(&inc->i_refcount));
+	if (atomic_dec_and_test(&inc->i_refcount)) {
+		BUG_ON(!list_empty(&inc->i_item));
+
+		inc->i_conn->c_trans->inc_free(inc);
+	}
+}
+EXPORT_SYMBOL_GPL(rds_inc_put);
+
+static void rds_recv_rcvbuf_delta(struct rds_sock *rs, struct sock *sk,
+				  struct rds_cong_map *map,
+				  int delta, __be16 port)
+{
+	int now_congested;
+
+	if (delta == 0)
+		return;
+
+	rs->rs_rcv_bytes += delta;
+	now_congested = rs->rs_rcv_bytes > rds_sk_rcvbuf(rs);
+
+	rdsdebug("rs %p (%pI4:%u) recv bytes %d buf %d "
+	  "now_cong %d delta %d\n",
+	  rs, &rs->rs_bound_addr,
+	  ntohs(rs->rs_bound_port), rs->rs_rcv_bytes,
+	  rds_sk_rcvbuf(rs), now_congested, delta);
+
+	/* wasn't -> am congested */
+	if (!rs->rs_congested && now_congested) {
+		rs->rs_congested = 1;
+		rds_cong_set_bit(map, port);
+		rds_cong_queue_updates(map);
+	}
+	/* was -> aren't congested */
+	/* Require more free space before reporting uncongested to prevent
+	   bouncing cong/uncong state too often */
+	else if (rs->rs_congested && (rs->rs_rcv_bytes < (rds_sk_rcvbuf(rs)/2))) {
+		rs->rs_congested = 0;
+		rds_cong_clear_bit(map, port);
+		rds_cong_queue_updates(map);
+	}
+
+	/* do nothing if no change in cong state */
+}
+
+/*
+ * Process all extension headers that come with this message.
+ */
+static void rds_recv_incoming_exthdrs(struct rds_incoming *inc, struct rds_sock *rs)
+{
+	struct rds_header *hdr = &inc->i_hdr;
+	unsigned int pos = 0, type, len;
+	union {
+		struct rds_ext_header_version version;
+		struct rds_ext_header_rdma rdma;
+		struct rds_ext_header_rdma_dest rdma_dest;
+	} buffer;
+
+	while (1) {
+		len = sizeof(buffer);
+		type = rds_message_next_extension(hdr, &pos, &buffer, &len);
+		if (type == RDS_EXTHDR_NONE)
+			break;
+		/* Process extension header here */
+		switch (type) {
+		case RDS_EXTHDR_RDMA:
+			rds_rdma_unuse(rs, be32_to_cpu(buffer.rdma.h_rdma_rkey), 0);
+			break;
+
+		case RDS_EXTHDR_RDMA_DEST:
+			/* We ignore the size for now. We could stash it
+			 * somewhere and use it for error checking. */
+			inc->i_rdma_cookie = rds_rdma_make_cookie(
+					be32_to_cpu(buffer.rdma_dest.h_rdma_rkey),
+					be32_to_cpu(buffer.rdma_dest.h_rdma_offset));
+
+			break;
+		}
+	}
+}
+
+/*
+ * The transport must make sure that this is serialized against other
+ * rx and conn reset on this specific conn.
+ *
+ * We currently assert that only one fragmented message will be sent
+ * down a connection at a time.  This lets us reassemble in the conn
+ * instead of per-flow which means that we don't have to go digging through
+ * flows to tear down partial reassembly progress on conn failure and
+ * we save flow lookup and locking for each frag arrival.  It does mean
+ * that small messages will wait behind large ones.  Fragmenting at all
+ * is only to reduce the memory consumption of pre-posted buffers.
+ *
+ * The caller passes in saddr and daddr instead of us getting it from the
+ * conn.  This lets loopback, who only has one conn for both directions,
+ * tell us which roles the addrs in the conn are playing for this message.
+ */
+void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
+		       struct rds_incoming *inc, gfp_t gfp)
+{
+	struct rds_sock *rs = NULL;
+	struct sock *sk;
+	unsigned long flags;
+
+	inc->i_conn = conn;
+	inc->i_rx_jiffies = jiffies;
+
+	rdsdebug("conn %p next %llu inc %p seq %llu len %u sport %u dport %u "
+		 "flags 0x%x rx_jiffies %lu\n", conn,
+		 (unsigned long long)conn->c_next_rx_seq,
+		 inc,
+		 (unsigned long long)be64_to_cpu(inc->i_hdr.h_sequence),
+		 be32_to_cpu(inc->i_hdr.h_len),
+		 be16_to_cpu(inc->i_hdr.h_sport),
+		 be16_to_cpu(inc->i_hdr.h_dport),
+		 inc->i_hdr.h_flags,
+		 inc->i_rx_jiffies);
+
+	/*
+	 * Sequence numbers should only increase.  Messages get their
+	 * sequence number as they're queued in a sending conn.  They
+	 * can be dropped, though, if the sending socket is closed before
+	 * they hit the wire.  So sequence numbers can skip forward
+	 * under normal operation.  They can also drop back in the conn
+	 * failover case as previously sent messages are resent down the
+	 * new instance of a conn.  We drop those, otherwise we have
+	 * to assume that the next valid seq does not come after a
+	 * hole in the fragment stream.
+	 *
+	 * The headers don't give us a way to realize if fragments of
+	 * a message have been dropped.  We assume that frags that arrive
+	 * to a flow are part of the current message on the flow that is
+	 * being reassembled.  This means that senders can't drop messages
+	 * from the sending conn until all their frags are sent.
+	 *
+	 * XXX we could spend more on the wire to get more robust failure
+	 * detection, arguably worth it to avoid data corruption.
+	 */
+	if (be64_to_cpu(inc->i_hdr.h_sequence) < conn->c_next_rx_seq &&
+	    (inc->i_hdr.h_flags & RDS_FLAG_RETRANSMITTED)) {
+		rds_stats_inc(s_recv_drop_old_seq);
+		goto out;
+	}
+	conn->c_next_rx_seq = be64_to_cpu(inc->i_hdr.h_sequence) + 1;
+
+	if (rds_sysctl_ping_enable && inc->i_hdr.h_dport == 0) {
+		rds_stats_inc(s_recv_ping);
+		rds_send_pong(conn, inc->i_hdr.h_sport);
+		goto out;
+	}
+
+	rs = rds_find_bound(daddr, inc->i_hdr.h_dport);
+	if (!rs) {
+		rds_stats_inc(s_recv_drop_no_sock);
+		goto out;
+	}
+
+	/* Process extension headers */
+	rds_recv_incoming_exthdrs(inc, rs);
+
+	/* We can be racing with rds_release() which marks the socket dead. */
+	sk = rds_rs_to_sk(rs);
+
+	/* serialize with rds_release -> sock_orphan */
+	write_lock_irqsave(&rs->rs_recv_lock, flags);
+	if (!sock_flag(sk, SOCK_DEAD)) {
+		rdsdebug("adding inc %p to rs %p's recv queue\n", inc, rs);
+		rds_stats_inc(s_recv_queued);
+		rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong,
+				      be32_to_cpu(inc->i_hdr.h_len),
+				      inc->i_hdr.h_dport);
+		rds_inc_addref(inc);
+		list_add_tail(&inc->i_item, &rs->rs_recv_queue);
+		__rds_wake_sk_sleep(sk);
+	} else {
+		rds_stats_inc(s_recv_drop_dead_sock);
+	}
+	write_unlock_irqrestore(&rs->rs_recv_lock, flags);
+
+out:
+	if (rs)
+		rds_sock_put(rs);
+}
+EXPORT_SYMBOL_GPL(rds_recv_incoming);
+
+/*
+ * be very careful here.  This is being called as the condition in
+ * wait_event_*() needs to cope with being called many times.
+ */
+static int rds_next_incoming(struct rds_sock *rs, struct rds_incoming **inc)
+{
+	unsigned long flags;
+
+	if (!*inc) {
+		read_lock_irqsave(&rs->rs_recv_lock, flags);
+		if (!list_empty(&rs->rs_recv_queue)) {
+			*inc = list_entry(rs->rs_recv_queue.next,
+					  struct rds_incoming,
+					  i_item);
+			rds_inc_addref(*inc);
+		}
+		read_unlock_irqrestore(&rs->rs_recv_lock, flags);
+	}
+
+	return *inc != NULL;
+}
+
+static int rds_still_queued(struct rds_sock *rs, struct rds_incoming *inc,
+			    int drop)
+{
+	struct sock *sk = rds_rs_to_sk(rs);
+	int ret = 0;
+	unsigned long flags;
+
+	write_lock_irqsave(&rs->rs_recv_lock, flags);
+	if (!list_empty(&inc->i_item)) {
+		ret = 1;
+		if (drop) {
+			/* XXX make sure this i_conn is reliable */
+			rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong,
+					      -be32_to_cpu(inc->i_hdr.h_len),
+					      inc->i_hdr.h_dport);
+			list_del_init(&inc->i_item);
+			rds_inc_put(inc);
+		}
+	}
+	write_unlock_irqrestore(&rs->rs_recv_lock, flags);
+
+	rdsdebug("inc %p rs %p still %d dropped %d\n", inc, rs, ret, drop);
+	return ret;
+}
+
+/*
+ * Pull errors off the error queue.
+ * If msghdr is NULL, we will just purge the error queue.
+ */
+int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msghdr)
+{
+	struct rds_notifier *notifier;
+	struct rds_rdma_notify cmsg = { 0 }; /* fill holes with zero */
+	unsigned int count = 0, max_messages = ~0U;
+	unsigned long flags;
+	LIST_HEAD(copy);
+	int err = 0;
+
+
+	/* put_cmsg copies to user space and thus may sleep. We can't do this
+	 * with rs_lock held, so first grab as many notifications as we can stuff
+	 * in the user provided cmsg buffer. We don't try to copy more, to avoid
+	 * losing notifications - except when the buffer is so small that it wouldn't
+	 * even hold a single notification. Then we give him as much of this single
+	 * msg as we can squeeze in, and set MSG_CTRUNC.
+	 */
+	if (msghdr) {
+		max_messages = msghdr->msg_controllen / CMSG_SPACE(sizeof(cmsg));
+		if (!max_messages)
+			max_messages = 1;
+	}
+
+	spin_lock_irqsave(&rs->rs_lock, flags);
+	while (!list_empty(&rs->rs_notify_queue) && count < max_messages) {
+		notifier = list_entry(rs->rs_notify_queue.next,
+				struct rds_notifier, n_list);
+		list_move(&notifier->n_list, &copy);
+		count++;
+	}
+	spin_unlock_irqrestore(&rs->rs_lock, flags);
+
+	if (!count)
+		return 0;
+
+	while (!list_empty(&copy)) {
+		notifier = list_entry(copy.next, struct rds_notifier, n_list);
+
+		if (msghdr) {
+			cmsg.user_token = notifier->n_user_token;
+			cmsg.status = notifier->n_status;
+
+			err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_RDMA_STATUS,
+				       sizeof(cmsg), &cmsg);
+			if (err)
+				break;
+		}
+
+		list_del_init(&notifier->n_list);
+		kfree(notifier);
+	}
+
+	/* If we bailed out because of an error in put_cmsg,
+	 * we may be left with one or more notifications that we
+	 * didn't process. Return them to the head of the list. */
+	if (!list_empty(&copy)) {
+		spin_lock_irqsave(&rs->rs_lock, flags);
+		list_splice(&copy, &rs->rs_notify_queue);
+		spin_unlock_irqrestore(&rs->rs_lock, flags);
+	}
+
+	return err;
+}
+
+/*
+ * Queue a congestion notification
+ */
+static int rds_notify_cong(struct rds_sock *rs, struct msghdr *msghdr)
+{
+	uint64_t notify = rs->rs_cong_notify;
+	unsigned long flags;
+	int err;
+
+	err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_CONG_UPDATE,
+			sizeof(notify), &notify);
+	if (err)
+		return err;
+
+	spin_lock_irqsave(&rs->rs_lock, flags);
+	rs->rs_cong_notify &= ~notify;
+	spin_unlock_irqrestore(&rs->rs_lock, flags);
+
+	return 0;
+}
+
+/*
+ * Receive any control messages.
+ */
+static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg)
+{
+	int ret = 0;
+
+	if (inc->i_rdma_cookie) {
+		ret = put_cmsg(msg, SOL_RDS, RDS_CMSG_RDMA_DEST,
+				sizeof(inc->i_rdma_cookie), &inc->i_rdma_cookie);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
+		size_t size, int msg_flags)
+{
+	struct sock *sk = sock->sk;
+	struct rds_sock *rs = rds_sk_to_rs(sk);
+	long timeo;
+	int ret = 0, nonblock = msg_flags & MSG_DONTWAIT;
+	DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
+	struct rds_incoming *inc = NULL;
+
+	/* udp_recvmsg()->sock_recvtimeo() gets away without locking too.. */
+	timeo = sock_rcvtimeo(sk, nonblock);
+
+	rdsdebug("size %zu flags 0x%x timeo %ld\n", size, msg_flags, timeo);
+
+	if (msg_flags & MSG_OOB)
+		goto out;
+
+	while (1) {
+		/* If there are pending notifications, do those - and nothing else */
+		if (!list_empty(&rs->rs_notify_queue)) {
+			ret = rds_notify_queue_get(rs, msg);
+			break;
+		}
+
+		if (rs->rs_cong_notify) {
+			ret = rds_notify_cong(rs, msg);
+			break;
+		}
+
+		if (!rds_next_incoming(rs, &inc)) {
+			if (nonblock) {
+				ret = -EAGAIN;
+				break;
+			}
+
+			timeo = wait_event_interruptible_timeout(*sk_sleep(sk),
+					(!list_empty(&rs->rs_notify_queue) ||
+					 rs->rs_cong_notify ||
+					 rds_next_incoming(rs, &inc)), timeo);
+			rdsdebug("recvmsg woke inc %p timeo %ld\n", inc,
+				 timeo);
+			if (timeo > 0 || timeo == MAX_SCHEDULE_TIMEOUT)
+				continue;
+
+			ret = timeo;
+			if (ret == 0)
+				ret = -ETIMEDOUT;
+			break;
+		}
+
+		rdsdebug("copying inc %p from %pI4:%u to user\n", inc,
+			 &inc->i_conn->c_faddr,
+			 ntohs(inc->i_hdr.h_sport));
+		ret = inc->i_conn->c_trans->inc_copy_to_user(inc, msg->msg_iov,
+							     size);
+		if (ret < 0)
+			break;
+
+		/*
+		 * if the message we just copied isn't at the head of the
+		 * recv queue then someone else raced us to return it, try
+		 * to get the next message.
+		 */
+		if (!rds_still_queued(rs, inc, !(msg_flags & MSG_PEEK))) {
+			rds_inc_put(inc);
+			inc = NULL;
+			rds_stats_inc(s_recv_deliver_raced);
+			continue;
+		}
+
+		if (ret < be32_to_cpu(inc->i_hdr.h_len)) {
+			if (msg_flags & MSG_TRUNC)
+				ret = be32_to_cpu(inc->i_hdr.h_len);
+			msg->msg_flags |= MSG_TRUNC;
+		}
+
+		if (rds_cmsg_recv(inc, msg)) {
+			ret = -EFAULT;
+			goto out;
+		}
+
+		rds_stats_inc(s_recv_delivered);
+
+		if (sin) {
+			sin->sin_family = AF_INET;
+			sin->sin_port = inc->i_hdr.h_sport;
+			sin->sin_addr.s_addr = inc->i_saddr;
+			memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+			msg->msg_namelen = sizeof(*sin);
+		}
+		break;
+	}
+
+	if (inc)
+		rds_inc_put(inc);
+
+out:
+	return ret;
+}
+
+/*
+ * The socket is being shut down and we're asked to drop messages that were
+ * queued for recvmsg.  The caller has unbound the socket so the receive path
+ * won't queue any more incoming fragments or messages on the socket.
+ */
+void rds_clear_recv_queue(struct rds_sock *rs)
+{
+	struct sock *sk = rds_rs_to_sk(rs);
+	struct rds_incoming *inc, *tmp;
+	unsigned long flags;
+
+	write_lock_irqsave(&rs->rs_recv_lock, flags);
+	list_for_each_entry_safe(inc, tmp, &rs->rs_recv_queue, i_item) {
+		rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong,
+				      -be32_to_cpu(inc->i_hdr.h_len),
+				      inc->i_hdr.h_dport);
+		list_del_init(&inc->i_item);
+		rds_inc_put(inc);
+	}
+	write_unlock_irqrestore(&rs->rs_recv_lock, flags);
+}
+
+/*
+ * inc->i_saddr isn't used here because it is only set in the receive
+ * path.
+ */
+void rds_inc_info_copy(struct rds_incoming *inc,
+		       struct rds_info_iterator *iter,
+		       __be32 saddr, __be32 daddr, int flip)
+{
+	struct rds_info_message minfo;
+
+	minfo.seq = be64_to_cpu(inc->i_hdr.h_sequence);
+	minfo.len = be32_to_cpu(inc->i_hdr.h_len);
+
+	if (flip) {
+		minfo.laddr = daddr;
+		minfo.faddr = saddr;
+		minfo.lport = inc->i_hdr.h_dport;
+		minfo.fport = inc->i_hdr.h_sport;
+	} else {
+		minfo.laddr = saddr;
+		minfo.faddr = daddr;
+		minfo.lport = inc->i_hdr.h_sport;
+		minfo.fport = inc->i_hdr.h_dport;
+	}
+
+	rds_info_copy(iter, &minfo, sizeof(minfo));
+}
diff --git a/net/rds/send.c b/net/rds/send.c
new file mode 100644
index 0000000..0a64541
--- /dev/null
+++ b/net/rds/send.c
@@ -0,0 +1,1137 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/moduleparam.h>
+#include <linux/gfp.h>
+#include <net/sock.h>
+#include <linux/in.h>
+#include <linux/list.h>
+#include <linux/ratelimit.h>
+#include <linux/export.h>
+
+#include "rds.h"
+
+/* When transmitting messages in rds_send_xmit, we need to emerge from
+ * time to time and briefly release the CPU. Otherwise the softlock watchdog
+ * will kick our shin.
+ * Also, it seems fairer to not let one busy connection stall all the
+ * others.
+ *
+ * send_batch_count is the number of times we'll loop in send_xmit. Setting
+ * it to 0 will restore the old behavior (where we looped until we had
+ * drained the queue).
+ */
+static int send_batch_count = 64;
+module_param(send_batch_count, int, 0444);
+MODULE_PARM_DESC(send_batch_count, " batch factor when working the send queue");
+
+static void rds_send_remove_from_sock(struct list_head *messages, int status);
+
+/*
+ * Reset the send state.  Callers must ensure that this doesn't race with
+ * rds_send_xmit().
+ */
+void rds_send_reset(struct rds_connection *conn)
+{
+	struct rds_message *rm, *tmp;
+	unsigned long flags;
+
+	if (conn->c_xmit_rm) {
+		rm = conn->c_xmit_rm;
+		conn->c_xmit_rm = NULL;
+		/* Tell the user the RDMA op is no longer mapped by the
+		 * transport. This isn't entirely true (it's flushed out
+		 * independently) but as the connection is down, there's
+		 * no ongoing RDMA to/from that memory */
+		rds_message_unmapped(rm);
+		rds_message_put(rm);
+	}
+
+	conn->c_xmit_sg = 0;
+	conn->c_xmit_hdr_off = 0;
+	conn->c_xmit_data_off = 0;
+	conn->c_xmit_atomic_sent = 0;
+	conn->c_xmit_rdma_sent = 0;
+	conn->c_xmit_data_sent = 0;
+
+	conn->c_map_queued = 0;
+
+	conn->c_unacked_packets = rds_sysctl_max_unacked_packets;
+	conn->c_unacked_bytes = rds_sysctl_max_unacked_bytes;
+
+	/* Mark messages as retransmissions, and move them to the send q */
+	spin_lock_irqsave(&conn->c_lock, flags);
+	list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
+		set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags);
+		set_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags);
+	}
+	list_splice_init(&conn->c_retrans, &conn->c_send_queue);
+	spin_unlock_irqrestore(&conn->c_lock, flags);
+}
+
+static int acquire_in_xmit(struct rds_connection *conn)
+{
+	return test_and_set_bit(RDS_IN_XMIT, &conn->c_flags) == 0;
+}
+
+static void release_in_xmit(struct rds_connection *conn)
+{
+	clear_bit(RDS_IN_XMIT, &conn->c_flags);
+	smp_mb__after_atomic();
+	/*
+	 * We don't use wait_on_bit()/wake_up_bit() because our waking is in a
+	 * hot path and finding waiters is very rare.  We don't want to walk
+	 * the system-wide hashed waitqueue buckets in the fast path only to
+	 * almost never find waiters.
+	 */
+	if (waitqueue_active(&conn->c_waitq))
+		wake_up_all(&conn->c_waitq);
+}
+
+/*
+ * We're making the conscious trade-off here to only send one message
+ * down the connection at a time.
+ *   Pro:
+ *      - tx queueing is a simple fifo list
+ *   	- reassembly is optional and easily done by transports per conn
+ *      - no per flow rx lookup at all, straight to the socket
+ *   	- less per-frag memory and wire overhead
+ *   Con:
+ *      - queued acks can be delayed behind large messages
+ *   Depends:
+ *      - small message latency is higher behind queued large messages
+ *      - large message latency isn't starved by intervening small sends
+ */
+int rds_send_xmit(struct rds_connection *conn)
+{
+	struct rds_message *rm;
+	unsigned long flags;
+	unsigned int tmp;
+	struct scatterlist *sg;
+	int ret = 0;
+	LIST_HEAD(to_be_dropped);
+
+restart:
+
+	/*
+	 * sendmsg calls here after having queued its message on the send
+	 * queue.  We only have one task feeding the connection at a time.  If
+	 * another thread is already feeding the queue then we back off.  This
+	 * avoids blocking the caller and trading per-connection data between
+	 * caches per message.
+	 */
+	if (!acquire_in_xmit(conn)) {
+		rds_stats_inc(s_send_lock_contention);
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	/*
+	 * rds_conn_shutdown() sets the conn state and then tests RDS_IN_XMIT,
+	 * we do the opposite to avoid races.
+	 */
+	if (!rds_conn_up(conn)) {
+		release_in_xmit(conn);
+		ret = 0;
+		goto out;
+	}
+
+	if (conn->c_trans->xmit_prepare)
+		conn->c_trans->xmit_prepare(conn);
+
+	/*
+	 * spin trying to push headers and data down the connection until
+	 * the connection doesn't make forward progress.
+	 */
+	while (1) {
+
+		rm = conn->c_xmit_rm;
+
+		/*
+		 * If between sending messages, we can send a pending congestion
+		 * map update.
+		 */
+		if (!rm && test_and_clear_bit(0, &conn->c_map_queued)) {
+			rm = rds_cong_update_alloc(conn);
+			if (IS_ERR(rm)) {
+				ret = PTR_ERR(rm);
+				break;
+			}
+			rm->data.op_active = 1;
+
+			conn->c_xmit_rm = rm;
+		}
+
+		/*
+		 * If not already working on one, grab the next message.
+		 *
+		 * c_xmit_rm holds a ref while we're sending this message down
+		 * the connction.  We can use this ref while holding the
+		 * send_sem.. rds_send_reset() is serialized with it.
+		 */
+		if (!rm) {
+			unsigned int len;
+
+			spin_lock_irqsave(&conn->c_lock, flags);
+
+			if (!list_empty(&conn->c_send_queue)) {
+				rm = list_entry(conn->c_send_queue.next,
+						struct rds_message,
+						m_conn_item);
+				rds_message_addref(rm);
+
+				/*
+				 * Move the message from the send queue to the retransmit
+				 * list right away.
+				 */
+				list_move_tail(&rm->m_conn_item, &conn->c_retrans);
+			}
+
+			spin_unlock_irqrestore(&conn->c_lock, flags);
+
+			if (!rm)
+				break;
+
+			/* Unfortunately, the way Infiniband deals with
+			 * RDMA to a bad MR key is by moving the entire
+			 * queue pair to error state. We cold possibly
+			 * recover from that, but right now we drop the
+			 * connection.
+			 * Therefore, we never retransmit messages with RDMA ops.
+			 */
+			if (rm->rdma.op_active &&
+			    test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) {
+				spin_lock_irqsave(&conn->c_lock, flags);
+				if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags))
+					list_move(&rm->m_conn_item, &to_be_dropped);
+				spin_unlock_irqrestore(&conn->c_lock, flags);
+				continue;
+			}
+
+			/* Require an ACK every once in a while */
+			len = ntohl(rm->m_inc.i_hdr.h_len);
+			if (conn->c_unacked_packets == 0 ||
+			    conn->c_unacked_bytes < len) {
+				__set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags);
+
+				conn->c_unacked_packets = rds_sysctl_max_unacked_packets;
+				conn->c_unacked_bytes = rds_sysctl_max_unacked_bytes;
+				rds_stats_inc(s_send_ack_required);
+			} else {
+				conn->c_unacked_bytes -= len;
+				conn->c_unacked_packets--;
+			}
+
+			conn->c_xmit_rm = rm;
+		}
+
+		/* The transport either sends the whole rdma or none of it */
+		if (rm->rdma.op_active && !conn->c_xmit_rdma_sent) {
+			rm->m_final_op = &rm->rdma;
+			ret = conn->c_trans->xmit_rdma(conn, &rm->rdma);
+			if (ret)
+				break;
+			conn->c_xmit_rdma_sent = 1;
+
+			/* The transport owns the mapped memory for now.
+			 * You can't unmap it while it's on the send queue */
+			set_bit(RDS_MSG_MAPPED, &rm->m_flags);
+		}
+
+		if (rm->atomic.op_active && !conn->c_xmit_atomic_sent) {
+			rm->m_final_op = &rm->atomic;
+			ret = conn->c_trans->xmit_atomic(conn, &rm->atomic);
+			if (ret)
+				break;
+			conn->c_xmit_atomic_sent = 1;
+
+			/* The transport owns the mapped memory for now.
+			 * You can't unmap it while it's on the send queue */
+			set_bit(RDS_MSG_MAPPED, &rm->m_flags);
+		}
+
+		/*
+		 * A number of cases require an RDS header to be sent
+		 * even if there is no data.
+		 * We permit 0-byte sends; rds-ping depends on this.
+		 * However, if there are exclusively attached silent ops,
+		 * we skip the hdr/data send, to enable silent operation.
+		 */
+		if (rm->data.op_nents == 0) {
+			int ops_present;
+			int all_ops_are_silent = 1;
+
+			ops_present = (rm->atomic.op_active || rm->rdma.op_active);
+			if (rm->atomic.op_active && !rm->atomic.op_silent)
+				all_ops_are_silent = 0;
+			if (rm->rdma.op_active && !rm->rdma.op_silent)
+				all_ops_are_silent = 0;
+
+			if (ops_present && all_ops_are_silent
+			    && !rm->m_rdma_cookie)
+				rm->data.op_active = 0;
+		}
+
+		if (rm->data.op_active && !conn->c_xmit_data_sent) {
+			rm->m_final_op = &rm->data;
+			ret = conn->c_trans->xmit(conn, rm,
+						  conn->c_xmit_hdr_off,
+						  conn->c_xmit_sg,
+						  conn->c_xmit_data_off);
+			if (ret <= 0)
+				break;
+
+			if (conn->c_xmit_hdr_off < sizeof(struct rds_header)) {
+				tmp = min_t(int, ret,
+					    sizeof(struct rds_header) -
+					    conn->c_xmit_hdr_off);
+				conn->c_xmit_hdr_off += tmp;
+				ret -= tmp;
+			}
+
+			sg = &rm->data.op_sg[conn->c_xmit_sg];
+			while (ret) {
+				tmp = min_t(int, ret, sg->length -
+						      conn->c_xmit_data_off);
+				conn->c_xmit_data_off += tmp;
+				ret -= tmp;
+				if (conn->c_xmit_data_off == sg->length) {
+					conn->c_xmit_data_off = 0;
+					sg++;
+					conn->c_xmit_sg++;
+					BUG_ON(ret != 0 &&
+					       conn->c_xmit_sg == rm->data.op_nents);
+				}
+			}
+
+			if (conn->c_xmit_hdr_off == sizeof(struct rds_header) &&
+			    (conn->c_xmit_sg == rm->data.op_nents))
+				conn->c_xmit_data_sent = 1;
+		}
+
+		/*
+		 * A rm will only take multiple times through this loop
+		 * if there is a data op. Thus, if the data is sent (or there was
+		 * none), then we're done with the rm.
+		 */
+		if (!rm->data.op_active || conn->c_xmit_data_sent) {
+			conn->c_xmit_rm = NULL;
+			conn->c_xmit_sg = 0;
+			conn->c_xmit_hdr_off = 0;
+			conn->c_xmit_data_off = 0;
+			conn->c_xmit_rdma_sent = 0;
+			conn->c_xmit_atomic_sent = 0;
+			conn->c_xmit_data_sent = 0;
+
+			rds_message_put(rm);
+		}
+	}
+
+	if (conn->c_trans->xmit_complete)
+		conn->c_trans->xmit_complete(conn);
+
+	release_in_xmit(conn);
+
+	/* Nuke any messages we decided not to retransmit. */
+	if (!list_empty(&to_be_dropped)) {
+		/* irqs on here, so we can put(), unlike above */
+		list_for_each_entry(rm, &to_be_dropped, m_conn_item)
+			rds_message_put(rm);
+		rds_send_remove_from_sock(&to_be_dropped, RDS_RDMA_DROPPED);
+	}
+
+	/*
+	 * Other senders can queue a message after we last test the send queue
+	 * but before we clear RDS_IN_XMIT.  In that case they'd back off and
+	 * not try and send their newly queued message.  We need to check the
+	 * send queue after having cleared RDS_IN_XMIT so that their message
+	 * doesn't get stuck on the send queue.
+	 *
+	 * If the transport cannot continue (i.e ret != 0), then it must
+	 * call us when more room is available, such as from the tx
+	 * completion handler.
+	 */
+	if (ret == 0) {
+		smp_mb();
+		if (!list_empty(&conn->c_send_queue)) {
+			rds_stats_inc(s_send_lock_queue_raced);
+			goto restart;
+		}
+	}
+out:
+	return ret;
+}
+
+static void rds_send_sndbuf_remove(struct rds_sock *rs, struct rds_message *rm)
+{
+	u32 len = be32_to_cpu(rm->m_inc.i_hdr.h_len);
+
+	assert_spin_locked(&rs->rs_lock);
+
+	BUG_ON(rs->rs_snd_bytes < len);
+	rs->rs_snd_bytes -= len;
+
+	if (rs->rs_snd_bytes == 0)
+		rds_stats_inc(s_send_queue_empty);
+}
+
+static inline int rds_send_is_acked(struct rds_message *rm, u64 ack,
+				    is_acked_func is_acked)
+{
+	if (is_acked)
+		return is_acked(rm, ack);
+	return be64_to_cpu(rm->m_inc.i_hdr.h_sequence) <= ack;
+}
+
+/*
+ * This is pretty similar to what happens below in the ACK
+ * handling code - except that we call here as soon as we get
+ * the IB send completion on the RDMA op and the accompanying
+ * message.
+ */
+void rds_rdma_send_complete(struct rds_message *rm, int status)
+{
+	struct rds_sock *rs = NULL;
+	struct rm_rdma_op *ro;
+	struct rds_notifier *notifier;
+	unsigned long flags;
+
+	spin_lock_irqsave(&rm->m_rs_lock, flags);
+
+	ro = &rm->rdma;
+	if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) &&
+	    ro->op_active && ro->op_notify && ro->op_notifier) {
+		notifier = ro->op_notifier;
+		rs = rm->m_rs;
+		sock_hold(rds_rs_to_sk(rs));
+
+		notifier->n_status = status;
+		spin_lock(&rs->rs_lock);
+		list_add_tail(&notifier->n_list, &rs->rs_notify_queue);
+		spin_unlock(&rs->rs_lock);
+
+		ro->op_notifier = NULL;
+	}
+
+	spin_unlock_irqrestore(&rm->m_rs_lock, flags);
+
+	if (rs) {
+		rds_wake_sk_sleep(rs);
+		sock_put(rds_rs_to_sk(rs));
+	}
+}
+EXPORT_SYMBOL_GPL(rds_rdma_send_complete);
+
+/*
+ * Just like above, except looks at atomic op
+ */
+void rds_atomic_send_complete(struct rds_message *rm, int status)
+{
+	struct rds_sock *rs = NULL;
+	struct rm_atomic_op *ao;
+	struct rds_notifier *notifier;
+	unsigned long flags;
+
+	spin_lock_irqsave(&rm->m_rs_lock, flags);
+
+	ao = &rm->atomic;
+	if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags)
+	    && ao->op_active && ao->op_notify && ao->op_notifier) {
+		notifier = ao->op_notifier;
+		rs = rm->m_rs;
+		sock_hold(rds_rs_to_sk(rs));
+
+		notifier->n_status = status;
+		spin_lock(&rs->rs_lock);
+		list_add_tail(&notifier->n_list, &rs->rs_notify_queue);
+		spin_unlock(&rs->rs_lock);
+
+		ao->op_notifier = NULL;
+	}
+
+	spin_unlock_irqrestore(&rm->m_rs_lock, flags);
+
+	if (rs) {
+		rds_wake_sk_sleep(rs);
+		sock_put(rds_rs_to_sk(rs));
+	}
+}
+EXPORT_SYMBOL_GPL(rds_atomic_send_complete);
+
+/*
+ * This is the same as rds_rdma_send_complete except we
+ * don't do any locking - we have all the ingredients (message,
+ * socket, socket lock) and can just move the notifier.
+ */
+static inline void
+__rds_send_complete(struct rds_sock *rs, struct rds_message *rm, int status)
+{
+	struct rm_rdma_op *ro;
+	struct rm_atomic_op *ao;
+
+	ro = &rm->rdma;
+	if (ro->op_active && ro->op_notify && ro->op_notifier) {
+		ro->op_notifier->n_status = status;
+		list_add_tail(&ro->op_notifier->n_list, &rs->rs_notify_queue);
+		ro->op_notifier = NULL;
+	}
+
+	ao = &rm->atomic;
+	if (ao->op_active && ao->op_notify && ao->op_notifier) {
+		ao->op_notifier->n_status = status;
+		list_add_tail(&ao->op_notifier->n_list, &rs->rs_notify_queue);
+		ao->op_notifier = NULL;
+	}
+
+	/* No need to wake the app - caller does this */
+}
+
+/*
+ * This is called from the IB send completion when we detect
+ * a RDMA operation that failed with remote access error.
+ * So speed is not an issue here.
+ */
+struct rds_message *rds_send_get_message(struct rds_connection *conn,
+					 struct rm_rdma_op *op)
+{
+	struct rds_message *rm, *tmp, *found = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&conn->c_lock, flags);
+
+	list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
+		if (&rm->rdma == op) {
+			atomic_inc(&rm->m_refcount);
+			found = rm;
+			goto out;
+		}
+	}
+
+	list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) {
+		if (&rm->rdma == op) {
+			atomic_inc(&rm->m_refcount);
+			found = rm;
+			break;
+		}
+	}
+
+out:
+	spin_unlock_irqrestore(&conn->c_lock, flags);
+
+	return found;
+}
+EXPORT_SYMBOL_GPL(rds_send_get_message);
+
+/*
+ * This removes messages from the socket's list if they're on it.  The list
+ * argument must be private to the caller, we must be able to modify it
+ * without locks.  The messages must have a reference held for their
+ * position on the list.  This function will drop that reference after
+ * removing the messages from the 'messages' list regardless of if it found
+ * the messages on the socket list or not.
+ */
+static void rds_send_remove_from_sock(struct list_head *messages, int status)
+{
+	unsigned long flags;
+	struct rds_sock *rs = NULL;
+	struct rds_message *rm;
+
+	while (!list_empty(messages)) {
+		int was_on_sock = 0;
+
+		rm = list_entry(messages->next, struct rds_message,
+				m_conn_item);
+		list_del_init(&rm->m_conn_item);
+
+		/*
+		 * If we see this flag cleared then we're *sure* that someone
+		 * else beat us to removing it from the sock.  If we race
+		 * with their flag update we'll get the lock and then really
+		 * see that the flag has been cleared.
+		 *
+		 * The message spinlock makes sure nobody clears rm->m_rs
+		 * while we're messing with it. It does not prevent the
+		 * message from being removed from the socket, though.
+		 */
+		spin_lock_irqsave(&rm->m_rs_lock, flags);
+		if (!test_bit(RDS_MSG_ON_SOCK, &rm->m_flags))
+			goto unlock_and_drop;
+
+		if (rs != rm->m_rs) {
+			if (rs) {
+				rds_wake_sk_sleep(rs);
+				sock_put(rds_rs_to_sk(rs));
+			}
+			rs = rm->m_rs;
+			if (rs)
+				sock_hold(rds_rs_to_sk(rs));
+		}
+		if (!rs)
+			goto unlock_and_drop;
+		spin_lock(&rs->rs_lock);
+
+		if (test_and_clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) {
+			struct rm_rdma_op *ro = &rm->rdma;
+			struct rds_notifier *notifier;
+
+			list_del_init(&rm->m_sock_item);
+			rds_send_sndbuf_remove(rs, rm);
+
+			if (ro->op_active && ro->op_notifier &&
+			       (ro->op_notify || (ro->op_recverr && status))) {
+				notifier = ro->op_notifier;
+				list_add_tail(&notifier->n_list,
+						&rs->rs_notify_queue);
+				if (!notifier->n_status)
+					notifier->n_status = status;
+				rm->rdma.op_notifier = NULL;
+			}
+			was_on_sock = 1;
+			rm->m_rs = NULL;
+		}
+		spin_unlock(&rs->rs_lock);
+
+unlock_and_drop:
+		spin_unlock_irqrestore(&rm->m_rs_lock, flags);
+		rds_message_put(rm);
+		if (was_on_sock)
+			rds_message_put(rm);
+	}
+
+	if (rs) {
+		rds_wake_sk_sleep(rs);
+		sock_put(rds_rs_to_sk(rs));
+	}
+}
+
+/*
+ * Transports call here when they've determined that the receiver queued
+ * messages up to, and including, the given sequence number.  Messages are
+ * moved to the retrans queue when rds_send_xmit picks them off the send
+ * queue. This means that in the TCP case, the message may not have been
+ * assigned the m_ack_seq yet - but that's fine as long as tcp_is_acked
+ * checks the RDS_MSG_HAS_ACK_SEQ bit.
+ */
+void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
+			 is_acked_func is_acked)
+{
+	struct rds_message *rm, *tmp;
+	unsigned long flags;
+	LIST_HEAD(list);
+
+	spin_lock_irqsave(&conn->c_lock, flags);
+
+	list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
+		if (!rds_send_is_acked(rm, ack, is_acked))
+			break;
+
+		list_move(&rm->m_conn_item, &list);
+		clear_bit(RDS_MSG_ON_CONN, &rm->m_flags);
+	}
+
+	/* order flag updates with spin locks */
+	if (!list_empty(&list))
+		smp_mb__after_atomic();
+
+	spin_unlock_irqrestore(&conn->c_lock, flags);
+
+	/* now remove the messages from the sock list as needed */
+	rds_send_remove_from_sock(&list, RDS_RDMA_SUCCESS);
+}
+EXPORT_SYMBOL_GPL(rds_send_drop_acked);
+
+void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest)
+{
+	struct rds_message *rm, *tmp;
+	struct rds_connection *conn;
+	unsigned long flags;
+	LIST_HEAD(list);
+
+	/* get all the messages we're dropping under the rs lock */
+	spin_lock_irqsave(&rs->rs_lock, flags);
+
+	list_for_each_entry_safe(rm, tmp, &rs->rs_send_queue, m_sock_item) {
+		if (dest && (dest->sin_addr.s_addr != rm->m_daddr ||
+			     dest->sin_port != rm->m_inc.i_hdr.h_dport))
+			continue;
+
+		list_move(&rm->m_sock_item, &list);
+		rds_send_sndbuf_remove(rs, rm);
+		clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags);
+	}
+
+	/* order flag updates with the rs lock */
+	smp_mb__after_atomic();
+
+	spin_unlock_irqrestore(&rs->rs_lock, flags);
+
+	if (list_empty(&list))
+		return;
+
+	/* Remove the messages from the conn */
+	list_for_each_entry(rm, &list, m_sock_item) {
+
+		conn = rm->m_inc.i_conn;
+
+		spin_lock_irqsave(&conn->c_lock, flags);
+		/*
+		 * Maybe someone else beat us to removing rm from the conn.
+		 * If we race with their flag update we'll get the lock and
+		 * then really see that the flag has been cleared.
+		 */
+		if (!test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) {
+			spin_unlock_irqrestore(&conn->c_lock, flags);
+			spin_lock_irqsave(&rm->m_rs_lock, flags);
+			rm->m_rs = NULL;
+			spin_unlock_irqrestore(&rm->m_rs_lock, flags);
+			continue;
+		}
+		list_del_init(&rm->m_conn_item);
+		spin_unlock_irqrestore(&conn->c_lock, flags);
+
+		/*
+		 * Couldn't grab m_rs_lock in top loop (lock ordering),
+		 * but we can now.
+		 */
+		spin_lock_irqsave(&rm->m_rs_lock, flags);
+
+		spin_lock(&rs->rs_lock);
+		__rds_send_complete(rs, rm, RDS_RDMA_CANCELED);
+		spin_unlock(&rs->rs_lock);
+
+		rm->m_rs = NULL;
+		spin_unlock_irqrestore(&rm->m_rs_lock, flags);
+
+		rds_message_put(rm);
+	}
+
+	rds_wake_sk_sleep(rs);
+
+	while (!list_empty(&list)) {
+		rm = list_entry(list.next, struct rds_message, m_sock_item);
+		list_del_init(&rm->m_sock_item);
+
+		rds_message_wait(rm);
+		rds_message_put(rm);
+	}
+}
+
+/*
+ * we only want this to fire once so we use the callers 'queued'.  It's
+ * possible that another thread can race with us and remove the
+ * message from the flow with RDS_CANCEL_SENT_TO.
+ */
+static int rds_send_queue_rm(struct rds_sock *rs, struct rds_connection *conn,
+			     struct rds_message *rm, __be16 sport,
+			     __be16 dport, int *queued)
+{
+	unsigned long flags;
+	u32 len;
+
+	if (*queued)
+		goto out;
+
+	len = be32_to_cpu(rm->m_inc.i_hdr.h_len);
+
+	/* this is the only place which holds both the socket's rs_lock
+	 * and the connection's c_lock */
+	spin_lock_irqsave(&rs->rs_lock, flags);
+
+	/*
+	 * If there is a little space in sndbuf, we don't queue anything,
+	 * and userspace gets -EAGAIN. But poll() indicates there's send
+	 * room. This can lead to bad behavior (spinning) if snd_bytes isn't
+	 * freed up by incoming acks. So we check the *old* value of
+	 * rs_snd_bytes here to allow the last msg to exceed the buffer,
+	 * and poll() now knows no more data can be sent.
+	 */
+	if (rs->rs_snd_bytes < rds_sk_sndbuf(rs)) {
+		rs->rs_snd_bytes += len;
+
+		/* let recv side know we are close to send space exhaustion.
+		 * This is probably not the optimal way to do it, as this
+		 * means we set the flag on *all* messages as soon as our
+		 * throughput hits a certain threshold.
+		 */
+		if (rs->rs_snd_bytes >= rds_sk_sndbuf(rs) / 2)
+			__set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags);
+
+		list_add_tail(&rm->m_sock_item, &rs->rs_send_queue);
+		set_bit(RDS_MSG_ON_SOCK, &rm->m_flags);
+		rds_message_addref(rm);
+		rm->m_rs = rs;
+
+		/* The code ordering is a little weird, but we're
+		   trying to minimize the time we hold c_lock */
+		rds_message_populate_header(&rm->m_inc.i_hdr, sport, dport, 0);
+		rm->m_inc.i_conn = conn;
+		rds_message_addref(rm);
+
+		spin_lock(&conn->c_lock);
+		rm->m_inc.i_hdr.h_sequence = cpu_to_be64(conn->c_next_tx_seq++);
+		list_add_tail(&rm->m_conn_item, &conn->c_send_queue);
+		set_bit(RDS_MSG_ON_CONN, &rm->m_flags);
+		spin_unlock(&conn->c_lock);
+
+		rdsdebug("queued msg %p len %d, rs %p bytes %d seq %llu\n",
+			 rm, len, rs, rs->rs_snd_bytes,
+			 (unsigned long long)be64_to_cpu(rm->m_inc.i_hdr.h_sequence));
+
+		*queued = 1;
+	}
+
+	spin_unlock_irqrestore(&rs->rs_lock, flags);
+out:
+	return *queued;
+}
+
+/*
+ * rds_message is getting to be quite complicated, and we'd like to allocate
+ * it all in one go. This figures out how big it needs to be up front.
+ */
+static int rds_rm_size(struct msghdr *msg, int data_len)
+{
+	struct cmsghdr *cmsg;
+	int size = 0;
+	int cmsg_groups = 0;
+	int retval;
+
+	for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
+		if (!CMSG_OK(msg, cmsg))
+			return -EINVAL;
+
+		if (cmsg->cmsg_level != SOL_RDS)
+			continue;
+
+		switch (cmsg->cmsg_type) {
+		case RDS_CMSG_RDMA_ARGS:
+			cmsg_groups |= 1;
+			retval = rds_rdma_extra_size(CMSG_DATA(cmsg));
+			if (retval < 0)
+				return retval;
+			size += retval;
+
+			break;
+
+		case RDS_CMSG_RDMA_DEST:
+		case RDS_CMSG_RDMA_MAP:
+			cmsg_groups |= 2;
+			/* these are valid but do no add any size */
+			break;
+
+		case RDS_CMSG_ATOMIC_CSWP:
+		case RDS_CMSG_ATOMIC_FADD:
+		case RDS_CMSG_MASKED_ATOMIC_CSWP:
+		case RDS_CMSG_MASKED_ATOMIC_FADD:
+			cmsg_groups |= 1;
+			size += sizeof(struct scatterlist);
+			break;
+
+		default:
+			return -EINVAL;
+		}
+
+	}
+
+	size += ceil(data_len, PAGE_SIZE) * sizeof(struct scatterlist);
+
+	/* Ensure (DEST, MAP) are never used with (ARGS, ATOMIC) */
+	if (cmsg_groups == 3)
+		return -EINVAL;
+
+	return size;
+}
+
+static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm,
+			 struct msghdr *msg, int *allocated_mr)
+{
+	struct cmsghdr *cmsg;
+	int ret = 0;
+
+	for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
+		if (!CMSG_OK(msg, cmsg))
+			return -EINVAL;
+
+		if (cmsg->cmsg_level != SOL_RDS)
+			continue;
+
+		/* As a side effect, RDMA_DEST and RDMA_MAP will set
+		 * rm->rdma.m_rdma_cookie and rm->rdma.m_rdma_mr.
+		 */
+		switch (cmsg->cmsg_type) {
+		case RDS_CMSG_RDMA_ARGS:
+			ret = rds_cmsg_rdma_args(rs, rm, cmsg);
+			break;
+
+		case RDS_CMSG_RDMA_DEST:
+			ret = rds_cmsg_rdma_dest(rs, rm, cmsg);
+			break;
+
+		case RDS_CMSG_RDMA_MAP:
+			ret = rds_cmsg_rdma_map(rs, rm, cmsg);
+			if (!ret)
+				*allocated_mr = 1;
+			break;
+		case RDS_CMSG_ATOMIC_CSWP:
+		case RDS_CMSG_ATOMIC_FADD:
+		case RDS_CMSG_MASKED_ATOMIC_CSWP:
+		case RDS_CMSG_MASKED_ATOMIC_FADD:
+			ret = rds_cmsg_atomic(rs, rm, cmsg);
+			break;
+
+		default:
+			return -EINVAL;
+		}
+
+		if (ret)
+			break;
+	}
+
+	return ret;
+}
+
+int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
+		size_t payload_len)
+{
+	struct sock *sk = sock->sk;
+	struct rds_sock *rs = rds_sk_to_rs(sk);
+	DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);
+	__be32 daddr;
+	__be16 dport;
+	struct rds_message *rm = NULL;
+	struct rds_connection *conn;
+	int ret = 0;
+	int queued = 0, allocated_mr = 0;
+	int nonblock = msg->msg_flags & MSG_DONTWAIT;
+	long timeo = sock_sndtimeo(sk, nonblock);
+
+	/* Mirror Linux UDP mirror of BSD error message compatibility */
+	/* XXX: Perhaps MSG_MORE someday */
+	if (msg->msg_flags & ~(MSG_DONTWAIT | MSG_CMSG_COMPAT)) {
+		ret = -EOPNOTSUPP;
+		goto out;
+	}
+
+	if (msg->msg_namelen) {
+		/* XXX fail non-unicast destination IPs? */
+		if (msg->msg_namelen < sizeof(*usin) || usin->sin_family != AF_INET) {
+			ret = -EINVAL;
+			goto out;
+		}
+		daddr = usin->sin_addr.s_addr;
+		dport = usin->sin_port;
+	} else {
+		/* We only care about consistency with ->connect() */
+		lock_sock(sk);
+		daddr = rs->rs_conn_addr;
+		dport = rs->rs_conn_port;
+		release_sock(sk);
+	}
+
+	/* racing with another thread binding seems ok here */
+	if (daddr == 0 || rs->rs_bound_addr == 0) {
+		ret = -ENOTCONN; /* XXX not a great errno */
+		goto out;
+	}
+
+	/* size of rm including all sgs */
+	ret = rds_rm_size(msg, payload_len);
+	if (ret < 0)
+		goto out;
+
+	rm = rds_message_alloc(ret, GFP_KERNEL);
+	if (!rm) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	/* Attach data to the rm */
+	if (payload_len) {
+		rm->data.op_sg = rds_message_alloc_sgs(rm, ceil(payload_len, PAGE_SIZE));
+		if (!rm->data.op_sg) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		ret = rds_message_copy_from_user(rm, msg->msg_iov, payload_len);
+		if (ret)
+			goto out;
+	}
+	rm->data.op_active = 1;
+
+	rm->m_daddr = daddr;
+
+	/* rds_conn_create has a spinlock that runs with IRQ off.
+	 * Caching the conn in the socket helps a lot. */
+	if (rs->rs_conn && rs->rs_conn->c_faddr == daddr)
+		conn = rs->rs_conn;
+	else {
+		conn = rds_conn_create_outgoing(rs->rs_bound_addr, daddr,
+					rs->rs_transport,
+					sock->sk->sk_allocation);
+		if (IS_ERR(conn)) {
+			ret = PTR_ERR(conn);
+			goto out;
+		}
+		rs->rs_conn = conn;
+	}
+
+	/* Parse any control messages the user may have included. */
+	ret = rds_cmsg_send(rs, rm, msg, &allocated_mr);
+	if (ret)
+		goto out;
+
+	if (rm->rdma.op_active && !conn->c_trans->xmit_rdma) {
+		printk_ratelimited(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n",
+			       &rm->rdma, conn->c_trans->xmit_rdma);
+		ret = -EOPNOTSUPP;
+		goto out;
+	}
+
+	if (rm->atomic.op_active && !conn->c_trans->xmit_atomic) {
+		printk_ratelimited(KERN_NOTICE "atomic_op %p conn xmit_atomic %p\n",
+			       &rm->atomic, conn->c_trans->xmit_atomic);
+		ret = -EOPNOTSUPP;
+		goto out;
+	}
+
+	rds_conn_connect_if_down(conn);
+
+	ret = rds_cong_wait(conn->c_fcong, dport, nonblock, rs);
+	if (ret) {
+		rs->rs_seen_congestion = 1;
+		goto out;
+	}
+
+	while (!rds_send_queue_rm(rs, conn, rm, rs->rs_bound_port,
+				  dport, &queued)) {
+		rds_stats_inc(s_send_queue_full);
+		/* XXX make sure this is reasonable */
+		if (payload_len > rds_sk_sndbuf(rs)) {
+			ret = -EMSGSIZE;
+			goto out;
+		}
+		if (nonblock) {
+			ret = -EAGAIN;
+			goto out;
+		}
+
+		timeo = wait_event_interruptible_timeout(*sk_sleep(sk),
+					rds_send_queue_rm(rs, conn, rm,
+							  rs->rs_bound_port,
+							  dport,
+							  &queued),
+					timeo);
+		rdsdebug("sendmsg woke queued %d timeo %ld\n", queued, timeo);
+		if (timeo > 0 || timeo == MAX_SCHEDULE_TIMEOUT)
+			continue;
+
+		ret = timeo;
+		if (ret == 0)
+			ret = -ETIMEDOUT;
+		goto out;
+	}
+
+	/*
+	 * By now we've committed to the send.  We reuse rds_send_worker()
+	 * to retry sends in the rds thread if the transport asks us to.
+	 */
+	rds_stats_inc(s_send_queued);
+
+	if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags))
+		rds_send_xmit(conn);
+
+	rds_message_put(rm);
+	return payload_len;
+
+out:
+	/* If the user included a RDMA_MAP cmsg, we allocated a MR on the fly.
+	 * If the sendmsg goes through, we keep the MR. If it fails with EAGAIN
+	 * or in any other way, we need to destroy the MR again */
+	if (allocated_mr)
+		rds_rdma_unuse(rs, rds_rdma_cookie_key(rm->m_rdma_cookie), 1);
+
+	if (rm)
+		rds_message_put(rm);
+	return ret;
+}
+
+/*
+ * Reply to a ping packet.
+ */
+int
+rds_send_pong(struct rds_connection *conn, __be16 dport)
+{
+	struct rds_message *rm;
+	unsigned long flags;
+	int ret = 0;
+
+	rm = rds_message_alloc(0, GFP_ATOMIC);
+	if (!rm) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	rm->m_daddr = conn->c_faddr;
+	rm->data.op_active = 1;
+
+	rds_conn_connect_if_down(conn);
+
+	ret = rds_cong_wait(conn->c_fcong, dport, 1, NULL);
+	if (ret)
+		goto out;
+
+	spin_lock_irqsave(&conn->c_lock, flags);
+	list_add_tail(&rm->m_conn_item, &conn->c_send_queue);
+	set_bit(RDS_MSG_ON_CONN, &rm->m_flags);
+	rds_message_addref(rm);
+	rm->m_inc.i_conn = conn;
+
+	rds_message_populate_header(&rm->m_inc.i_hdr, 0, dport,
+				    conn->c_next_tx_seq);
+	conn->c_next_tx_seq++;
+	spin_unlock_irqrestore(&conn->c_lock, flags);
+
+	rds_stats_inc(s_send_queued);
+	rds_stats_inc(s_send_pong);
+
+	if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags))
+		queue_delayed_work(rds_wq, &conn->c_send_w, 0);
+
+	rds_message_put(rm);
+	return 0;
+
+out:
+	if (rm)
+		rds_message_put(rm);
+	return ret;
+}
diff --git a/net/rds/stats.c b/net/rds/stats.c
new file mode 100644
index 0000000..73be187
--- /dev/null
+++ b/net/rds/stats.c
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/percpu.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+#include <linux/export.h>
+
+#include "rds.h"
+
+DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats);
+EXPORT_PER_CPU_SYMBOL_GPL(rds_stats);
+
+/* :.,$s/unsigned long\>.*\<s_\(.*\);/"\1",/g */
+
+static const char *const rds_stat_names[] = {
+	"conn_reset",
+	"recv_drop_bad_checksum",
+	"recv_drop_old_seq",
+	"recv_drop_no_sock",
+	"recv_drop_dead_sock",
+	"recv_deliver_raced",
+	"recv_delivered",
+	"recv_queued",
+	"recv_immediate_retry",
+	"recv_delayed_retry",
+	"recv_ack_required",
+	"recv_rdma_bytes",
+	"recv_ping",
+	"send_queue_empty",
+	"send_queue_full",
+	"send_lock_contention",
+	"send_lock_queue_raced",
+	"send_immediate_retry",
+	"send_delayed_retry",
+	"send_drop_acked",
+	"send_ack_required",
+	"send_queued",
+	"send_rdma",
+	"send_rdma_bytes",
+	"send_pong",
+	"page_remainder_hit",
+	"page_remainder_miss",
+	"copy_to_user",
+	"copy_from_user",
+	"cong_update_queued",
+	"cong_update_received",
+	"cong_send_error",
+	"cong_send_blocked",
+};
+
+void rds_stats_info_copy(struct rds_info_iterator *iter,
+			 uint64_t *values, const char *const *names, size_t nr)
+{
+	struct rds_info_counter ctr;
+	size_t i;
+
+	for (i = 0; i < nr; i++) {
+		BUG_ON(strlen(names[i]) >= sizeof(ctr.name));
+		strncpy(ctr.name, names[i], sizeof(ctr.name) - 1);
+		ctr.name[sizeof(ctr.name) - 1] = '\0';
+		ctr.value = values[i];
+
+		rds_info_copy(iter, &ctr, sizeof(ctr));
+	}
+}
+EXPORT_SYMBOL_GPL(rds_stats_info_copy);
+
+/*
+ * This gives global counters across all the transports.  The strings
+ * are copied in so that the tool doesn't need knowledge of the specific
+ * stats that we're exporting.  Some are pretty implementation dependent
+ * and may change over time.  That doesn't stop them from being useful.
+ *
+ * This is the only function in the chain that knows about the byte granular
+ * length in userspace.  It converts it to number of stat entries that the
+ * rest of the functions operate in.
+ */
+static void rds_stats_info(struct socket *sock, unsigned int len,
+			   struct rds_info_iterator *iter,
+			   struct rds_info_lengths *lens)
+{
+	struct rds_statistics stats = {0, };
+	uint64_t *src;
+	uint64_t *sum;
+	size_t i;
+	int cpu;
+	unsigned int avail;
+
+	avail = len / sizeof(struct rds_info_counter);
+
+	if (avail < ARRAY_SIZE(rds_stat_names)) {
+		avail = 0;
+		goto trans;
+	}
+
+	for_each_online_cpu(cpu) {
+		src = (uint64_t *)&(per_cpu(rds_stats, cpu));
+		sum = (uint64_t *)&stats;
+		for (i = 0; i < sizeof(stats) / sizeof(uint64_t); i++)
+			*(sum++) += *(src++);
+	}
+
+	rds_stats_info_copy(iter, (uint64_t *)&stats, rds_stat_names,
+			    ARRAY_SIZE(rds_stat_names));
+	avail -= ARRAY_SIZE(rds_stat_names);
+
+trans:
+	lens->each = sizeof(struct rds_info_counter);
+	lens->nr = rds_trans_stats_info_copy(iter, avail) +
+		   ARRAY_SIZE(rds_stat_names);
+}
+
+void rds_stats_exit(void)
+{
+	rds_info_deregister_func(RDS_INFO_COUNTERS, rds_stats_info);
+}
+
+int rds_stats_init(void)
+{
+	rds_info_register_func(RDS_INFO_COUNTERS, rds_stats_info);
+	return 0;
+}
diff --git a/net/rds/sysctl.c b/net/rds/sysctl.c
new file mode 100644
index 0000000..c3b0cd4
--- /dev/null
+++ b/net/rds/sysctl.c
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/sysctl.h>
+#include <linux/proc_fs.h>
+
+#include "rds.h"
+
+static struct ctl_table_header *rds_sysctl_reg_table;
+
+static unsigned long rds_sysctl_reconnect_min = 1;
+static unsigned long rds_sysctl_reconnect_max = ~0UL;
+
+unsigned long rds_sysctl_reconnect_min_jiffies;
+unsigned long rds_sysctl_reconnect_max_jiffies = HZ;
+
+unsigned int  rds_sysctl_max_unacked_packets = 8;
+unsigned int  rds_sysctl_max_unacked_bytes = (16 << 20);
+
+unsigned int rds_sysctl_ping_enable = 1;
+
+static struct ctl_table rds_sysctl_rds_table[] = {
+	{
+		.procname       = "reconnect_min_delay_ms",
+		.data		= &rds_sysctl_reconnect_min_jiffies,
+		.maxlen         = sizeof(unsigned long),
+		.mode           = 0644,
+		.proc_handler   = proc_doulongvec_ms_jiffies_minmax,
+		.extra1		= &rds_sysctl_reconnect_min,
+		.extra2		= &rds_sysctl_reconnect_max_jiffies,
+	},
+	{
+		.procname       = "reconnect_max_delay_ms",
+		.data		= &rds_sysctl_reconnect_max_jiffies,
+		.maxlen         = sizeof(unsigned long),
+		.mode           = 0644,
+		.proc_handler   = proc_doulongvec_ms_jiffies_minmax,
+		.extra1		= &rds_sysctl_reconnect_min_jiffies,
+		.extra2		= &rds_sysctl_reconnect_max,
+	},
+	{
+		.procname	= "max_unacked_packets",
+		.data		= &rds_sysctl_max_unacked_packets,
+		.maxlen         = sizeof(unsigned long),
+		.mode           = 0644,
+		.proc_handler   = proc_dointvec,
+	},
+	{
+		.procname	= "max_unacked_bytes",
+		.data		= &rds_sysctl_max_unacked_bytes,
+		.maxlen         = sizeof(unsigned long),
+		.mode           = 0644,
+		.proc_handler   = proc_dointvec,
+	},
+	{
+		.procname	= "ping_enable",
+		.data		= &rds_sysctl_ping_enable,
+		.maxlen         = sizeof(int),
+		.mode           = 0644,
+		.proc_handler   = proc_dointvec,
+	},
+	{ }
+};
+
+void rds_sysctl_exit(void)
+{
+	unregister_net_sysctl_table(rds_sysctl_reg_table);
+}
+
+int rds_sysctl_init(void)
+{
+	rds_sysctl_reconnect_min = msecs_to_jiffies(1);
+	rds_sysctl_reconnect_min_jiffies = rds_sysctl_reconnect_min;
+
+	rds_sysctl_reg_table = register_net_sysctl(&init_net,"net/rds", rds_sysctl_rds_table);
+	if (!rds_sysctl_reg_table)
+		return -ENOMEM;
+	return 0;
+}
diff --git a/net/rds/tcp.c b/net/rds/tcp.c
new file mode 100644
index 0000000..edac9ef
--- /dev/null
+++ b/net/rds/tcp.c
@@ -0,0 +1,326 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/in.h>
+#include <linux/module.h>
+#include <net/tcp.h>
+
+#include "rds.h"
+#include "tcp.h"
+
+/* only for info exporting */
+static DEFINE_SPINLOCK(rds_tcp_tc_list_lock);
+static LIST_HEAD(rds_tcp_tc_list);
+static unsigned int rds_tcp_tc_count;
+
+/* Track rds_tcp_connection structs so they can be cleaned up */
+static DEFINE_SPINLOCK(rds_tcp_conn_lock);
+static LIST_HEAD(rds_tcp_conn_list);
+
+static struct kmem_cache *rds_tcp_conn_slab;
+
+#define RDS_TCP_DEFAULT_BUFSIZE (128 * 1024)
+
+/* doing it this way avoids calling tcp_sk() */
+void rds_tcp_nonagle(struct socket *sock)
+{
+	mm_segment_t oldfs = get_fs();
+	int val = 1;
+
+	set_fs(KERNEL_DS);
+	sock->ops->setsockopt(sock, SOL_TCP, TCP_NODELAY, (char __user *)&val,
+			      sizeof(val));
+	set_fs(oldfs);
+}
+
+void rds_tcp_tune(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+
+	rds_tcp_nonagle(sock);
+
+	/*
+	 * We're trying to saturate gigabit with the default,
+	 * see svc_sock_setbufsize().
+	 */
+	lock_sock(sk);
+	sk->sk_sndbuf = RDS_TCP_DEFAULT_BUFSIZE;
+	sk->sk_rcvbuf = RDS_TCP_DEFAULT_BUFSIZE;
+	sk->sk_userlocks |= SOCK_SNDBUF_LOCK|SOCK_RCVBUF_LOCK;
+	release_sock(sk);
+}
+
+u32 rds_tcp_snd_nxt(struct rds_tcp_connection *tc)
+{
+	return tcp_sk(tc->t_sock->sk)->snd_nxt;
+}
+
+u32 rds_tcp_snd_una(struct rds_tcp_connection *tc)
+{
+	return tcp_sk(tc->t_sock->sk)->snd_una;
+}
+
+void rds_tcp_restore_callbacks(struct socket *sock,
+			       struct rds_tcp_connection *tc)
+{
+	rdsdebug("restoring sock %p callbacks from tc %p\n", sock, tc);
+	write_lock_bh(&sock->sk->sk_callback_lock);
+
+	/* done under the callback_lock to serialize with write_space */
+	spin_lock(&rds_tcp_tc_list_lock);
+	list_del_init(&tc->t_list_item);
+	rds_tcp_tc_count--;
+	spin_unlock(&rds_tcp_tc_list_lock);
+
+	tc->t_sock = NULL;
+
+	sock->sk->sk_write_space = tc->t_orig_write_space;
+	sock->sk->sk_data_ready = tc->t_orig_data_ready;
+	sock->sk->sk_state_change = tc->t_orig_state_change;
+	sock->sk->sk_user_data = NULL;
+
+	write_unlock_bh(&sock->sk->sk_callback_lock);
+}
+
+/*
+ * This is the only path that sets tc->t_sock.  Send and receive trust that
+ * it is set.  The RDS_CONN_CONNECTED bit protects those paths from being
+ * called while it isn't set.
+ */
+void rds_tcp_set_callbacks(struct socket *sock, struct rds_connection *conn)
+{
+	struct rds_tcp_connection *tc = conn->c_transport_data;
+
+	rdsdebug("setting sock %p callbacks to tc %p\n", sock, tc);
+	write_lock_bh(&sock->sk->sk_callback_lock);
+
+	/* done under the callback_lock to serialize with write_space */
+	spin_lock(&rds_tcp_tc_list_lock);
+	list_add_tail(&tc->t_list_item, &rds_tcp_tc_list);
+	rds_tcp_tc_count++;
+	spin_unlock(&rds_tcp_tc_list_lock);
+
+	/* accepted sockets need our listen data ready undone */
+	if (sock->sk->sk_data_ready == rds_tcp_listen_data_ready)
+		sock->sk->sk_data_ready = sock->sk->sk_user_data;
+
+	tc->t_sock = sock;
+	tc->conn = conn;
+	tc->t_orig_data_ready = sock->sk->sk_data_ready;
+	tc->t_orig_write_space = sock->sk->sk_write_space;
+	tc->t_orig_state_change = sock->sk->sk_state_change;
+
+	sock->sk->sk_user_data = conn;
+	sock->sk->sk_data_ready = rds_tcp_data_ready;
+	sock->sk->sk_write_space = rds_tcp_write_space;
+	sock->sk->sk_state_change = rds_tcp_state_change;
+
+	write_unlock_bh(&sock->sk->sk_callback_lock);
+}
+
+static void rds_tcp_tc_info(struct socket *sock, unsigned int len,
+			    struct rds_info_iterator *iter,
+			    struct rds_info_lengths *lens)
+{
+	struct rds_info_tcp_socket tsinfo;
+	struct rds_tcp_connection *tc;
+	unsigned long flags;
+	struct sockaddr_in sin;
+	int sinlen;
+
+	spin_lock_irqsave(&rds_tcp_tc_list_lock, flags);
+
+	if (len / sizeof(tsinfo) < rds_tcp_tc_count)
+		goto out;
+
+	list_for_each_entry(tc, &rds_tcp_tc_list, t_list_item) {
+
+		sock->ops->getname(sock, (struct sockaddr *)&sin, &sinlen, 0);
+		tsinfo.local_addr = sin.sin_addr.s_addr;
+		tsinfo.local_port = sin.sin_port;
+		sock->ops->getname(sock, (struct sockaddr *)&sin, &sinlen, 1);
+		tsinfo.peer_addr = sin.sin_addr.s_addr;
+		tsinfo.peer_port = sin.sin_port;
+
+		tsinfo.hdr_rem = tc->t_tinc_hdr_rem;
+		tsinfo.data_rem = tc->t_tinc_data_rem;
+		tsinfo.last_sent_nxt = tc->t_last_sent_nxt;
+		tsinfo.last_expected_una = tc->t_last_expected_una;
+		tsinfo.last_seen_una = tc->t_last_seen_una;
+
+		rds_info_copy(iter, &tsinfo, sizeof(tsinfo));
+	}
+
+out:
+	lens->nr = rds_tcp_tc_count;
+	lens->each = sizeof(tsinfo);
+
+	spin_unlock_irqrestore(&rds_tcp_tc_list_lock, flags);
+}
+
+static int rds_tcp_laddr_check(__be32 addr)
+{
+	if (inet_addr_type(&init_net, addr) == RTN_LOCAL)
+		return 0;
+	return -EADDRNOTAVAIL;
+}
+
+static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp)
+{
+	struct rds_tcp_connection *tc;
+
+	tc = kmem_cache_alloc(rds_tcp_conn_slab, gfp);
+	if (!tc)
+		return -ENOMEM;
+
+	tc->t_sock = NULL;
+	tc->t_tinc = NULL;
+	tc->t_tinc_hdr_rem = sizeof(struct rds_header);
+	tc->t_tinc_data_rem = 0;
+
+	conn->c_transport_data = tc;
+
+	spin_lock_irq(&rds_tcp_conn_lock);
+	list_add_tail(&tc->t_tcp_node, &rds_tcp_conn_list);
+	spin_unlock_irq(&rds_tcp_conn_lock);
+
+	rdsdebug("alloced tc %p\n", conn->c_transport_data);
+	return 0;
+}
+
+static void rds_tcp_conn_free(void *arg)
+{
+	struct rds_tcp_connection *tc = arg;
+	unsigned long flags;
+	rdsdebug("freeing tc %p\n", tc);
+
+	spin_lock_irqsave(&rds_tcp_conn_lock, flags);
+	list_del(&tc->t_tcp_node);
+	spin_unlock_irqrestore(&rds_tcp_conn_lock, flags);
+
+	kmem_cache_free(rds_tcp_conn_slab, tc);
+}
+
+static void rds_tcp_destroy_conns(void)
+{
+	struct rds_tcp_connection *tc, *_tc;
+	LIST_HEAD(tmp_list);
+
+	/* avoid calling conn_destroy with irqs off */
+	spin_lock_irq(&rds_tcp_conn_lock);
+	list_splice(&rds_tcp_conn_list, &tmp_list);
+	INIT_LIST_HEAD(&rds_tcp_conn_list);
+	spin_unlock_irq(&rds_tcp_conn_lock);
+
+	list_for_each_entry_safe(tc, _tc, &tmp_list, t_tcp_node) {
+		if (tc->conn->c_passive)
+			rds_conn_destroy(tc->conn->c_passive);
+		rds_conn_destroy(tc->conn);
+	}
+}
+
+static void rds_tcp_exit(void)
+{
+	rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
+	rds_tcp_listen_stop();
+	rds_tcp_destroy_conns();
+	rds_trans_unregister(&rds_tcp_transport);
+	rds_tcp_recv_exit();
+	kmem_cache_destroy(rds_tcp_conn_slab);
+}
+module_exit(rds_tcp_exit);
+
+struct rds_transport rds_tcp_transport = {
+	.laddr_check		= rds_tcp_laddr_check,
+	.xmit_prepare		= rds_tcp_xmit_prepare,
+	.xmit_complete		= rds_tcp_xmit_complete,
+	.xmit			= rds_tcp_xmit,
+	.recv			= rds_tcp_recv,
+	.conn_alloc		= rds_tcp_conn_alloc,
+	.conn_free		= rds_tcp_conn_free,
+	.conn_connect		= rds_tcp_conn_connect,
+	.conn_shutdown		= rds_tcp_conn_shutdown,
+	.inc_copy_to_user	= rds_tcp_inc_copy_to_user,
+	.inc_free		= rds_tcp_inc_free,
+	.stats_info_copy	= rds_tcp_stats_info_copy,
+	.exit			= rds_tcp_exit,
+	.t_owner		= THIS_MODULE,
+	.t_name			= "tcp",
+	.t_type			= RDS_TRANS_TCP,
+	.t_prefer_loopback	= 1,
+};
+
+static int rds_tcp_init(void)
+{
+	int ret;
+
+	rds_tcp_conn_slab = kmem_cache_create("rds_tcp_connection",
+					      sizeof(struct rds_tcp_connection),
+					      0, 0, NULL);
+	if (!rds_tcp_conn_slab) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = rds_tcp_recv_init();
+	if (ret)
+		goto out_slab;
+
+	ret = rds_trans_register(&rds_tcp_transport);
+	if (ret)
+		goto out_recv;
+
+	ret = rds_tcp_listen_init();
+	if (ret)
+		goto out_register;
+
+	rds_info_register_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
+
+	goto out;
+
+out_register:
+	rds_trans_unregister(&rds_tcp_transport);
+out_recv:
+	rds_tcp_recv_exit();
+out_slab:
+	kmem_cache_destroy(rds_tcp_conn_slab);
+out:
+	return ret;
+}
+module_init(rds_tcp_init);
+
+MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>");
+MODULE_DESCRIPTION("RDS: TCP transport");
+MODULE_LICENSE("Dual BSD/GPL");
+
diff --git a/net/rds/tcp.h b/net/rds/tcp.h
new file mode 100644
index 0000000..6563749
--- /dev/null
+++ b/net/rds/tcp.h
@@ -0,0 +1,88 @@
+#ifndef _RDS_TCP_H
+#define _RDS_TCP_H
+
+#define RDS_TCP_PORT	16385
+
+struct rds_tcp_incoming {
+	struct rds_incoming	ti_inc;
+	struct sk_buff_head	ti_skb_list;
+};
+
+struct rds_tcp_connection {
+
+	struct list_head	t_tcp_node;
+	struct rds_connection   *conn;
+	struct socket		*t_sock;
+	void			*t_orig_write_space;
+	void			*t_orig_data_ready;
+	void			*t_orig_state_change;
+
+	struct rds_tcp_incoming	*t_tinc;
+	size_t			t_tinc_hdr_rem;
+	size_t			t_tinc_data_rem;
+
+	/* XXX error report? */
+	struct work_struct	t_conn_w;
+	struct work_struct	t_send_w;
+	struct work_struct	t_down_w;
+	struct work_struct	t_recv_w;
+
+	/* for info exporting only */
+	struct list_head	t_list_item;
+	u32			t_last_sent_nxt;
+	u32			t_last_expected_una;
+	u32			t_last_seen_una;
+};
+
+struct rds_tcp_statistics {
+	uint64_t	s_tcp_data_ready_calls;
+	uint64_t	s_tcp_write_space_calls;
+	uint64_t	s_tcp_sndbuf_full;
+	uint64_t	s_tcp_connect_raced;
+	uint64_t	s_tcp_listen_closed_stale;
+};
+
+/* tcp.c */
+void rds_tcp_tune(struct socket *sock);
+void rds_tcp_nonagle(struct socket *sock);
+void rds_tcp_set_callbacks(struct socket *sock, struct rds_connection *conn);
+void rds_tcp_restore_callbacks(struct socket *sock,
+			       struct rds_tcp_connection *tc);
+u32 rds_tcp_snd_nxt(struct rds_tcp_connection *tc);
+u32 rds_tcp_snd_una(struct rds_tcp_connection *tc);
+u64 rds_tcp_map_seq(struct rds_tcp_connection *tc, u32 seq);
+extern struct rds_transport rds_tcp_transport;
+
+/* tcp_connect.c */
+int rds_tcp_conn_connect(struct rds_connection *conn);
+void rds_tcp_conn_shutdown(struct rds_connection *conn);
+void rds_tcp_state_change(struct sock *sk);
+
+/* tcp_listen.c */
+int rds_tcp_listen_init(void);
+void rds_tcp_listen_stop(void);
+void rds_tcp_listen_data_ready(struct sock *sk);
+
+/* tcp_recv.c */
+int rds_tcp_recv_init(void);
+void rds_tcp_recv_exit(void);
+void rds_tcp_data_ready(struct sock *sk);
+int rds_tcp_recv(struct rds_connection *conn);
+void rds_tcp_inc_free(struct rds_incoming *inc);
+int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov,
+			     size_t size);
+
+/* tcp_send.c */
+void rds_tcp_xmit_prepare(struct rds_connection *conn);
+void rds_tcp_xmit_complete(struct rds_connection *conn);
+int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
+	         unsigned int hdr_off, unsigned int sg, unsigned int off);
+void rds_tcp_write_space(struct sock *sk);
+
+/* tcp_stats.c */
+DECLARE_PER_CPU(struct rds_tcp_statistics, rds_tcp_stats);
+#define rds_tcp_stats_inc(member) rds_stats_inc_which(rds_tcp_stats, member)
+unsigned int rds_tcp_stats_info_copy(struct rds_info_iterator *iter,
+				     unsigned int avail);
+
+#endif
diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c
new file mode 100644
index 0000000..f9f564a
--- /dev/null
+++ b/net/rds/tcp_connect.c
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/in.h>
+#include <net/tcp.h>
+
+#include "rds.h"
+#include "tcp.h"
+
+void rds_tcp_state_change(struct sock *sk)
+{
+	void (*state_change)(struct sock *sk);
+	struct rds_connection *conn;
+	struct rds_tcp_connection *tc;
+
+	read_lock(&sk->sk_callback_lock);
+	conn = sk->sk_user_data;
+	if (!conn) {
+		state_change = sk->sk_state_change;
+		goto out;
+	}
+	tc = conn->c_transport_data;
+	state_change = tc->t_orig_state_change;
+
+	rdsdebug("sock %p state_change to %d\n", tc->t_sock, sk->sk_state);
+
+	switch(sk->sk_state) {
+		/* ignore connecting sockets as they make progress */
+		case TCP_SYN_SENT:
+		case TCP_SYN_RECV:
+			break;
+		case TCP_ESTABLISHED:
+			rds_connect_complete(conn);
+			break;
+		case TCP_CLOSE:
+			rds_conn_drop(conn);
+		default:
+			break;
+	}
+out:
+	read_unlock(&sk->sk_callback_lock);
+	state_change(sk);
+}
+
+int rds_tcp_conn_connect(struct rds_connection *conn)
+{
+	struct socket *sock = NULL;
+	struct sockaddr_in src, dest;
+	int ret;
+
+	ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
+	if (ret < 0)
+		goto out;
+
+	rds_tcp_tune(sock);
+
+	src.sin_family = AF_INET;
+	src.sin_addr.s_addr = (__force u32)conn->c_laddr;
+	src.sin_port = (__force u16)htons(0);
+
+	ret = sock->ops->bind(sock, (struct sockaddr *)&src, sizeof(src));
+	if (ret) {
+		rdsdebug("bind failed with %d at address %pI4\n",
+			 ret, &conn->c_laddr);
+		goto out;
+	}
+
+	dest.sin_family = AF_INET;
+	dest.sin_addr.s_addr = (__force u32)conn->c_faddr;
+	dest.sin_port = (__force u16)htons(RDS_TCP_PORT);
+
+	/*
+	 * once we call connect() we can start getting callbacks and they
+	 * own the socket
+	 */
+	rds_tcp_set_callbacks(sock, conn);
+	ret = sock->ops->connect(sock, (struct sockaddr *)&dest, sizeof(dest),
+				 O_NONBLOCK);
+
+	rdsdebug("connect to address %pI4 returned %d\n", &conn->c_faddr, ret);
+	if (ret == -EINPROGRESS)
+		ret = 0;
+	if (ret == 0)
+		sock = NULL;
+	else
+		rds_tcp_restore_callbacks(sock, conn->c_transport_data);
+
+out:
+	if (sock)
+		sock_release(sock);
+	return ret;
+}
+
+/*
+ * Before killing the tcp socket this needs to serialize with callbacks.  The
+ * caller has already grabbed the sending sem so we're serialized with other
+ * senders.
+ *
+ * TCP calls the callbacks with the sock lock so we hold it while we reset the
+ * callbacks to those set by TCP.  Our callbacks won't execute again once we
+ * hold the sock lock.
+ */
+void rds_tcp_conn_shutdown(struct rds_connection *conn)
+{
+	struct rds_tcp_connection *tc = conn->c_transport_data;
+	struct socket *sock = tc->t_sock;
+
+	rdsdebug("shutting down conn %p tc %p sock %p\n", conn, tc, sock);
+
+	if (sock) {
+		sock->ops->shutdown(sock, RCV_SHUTDOWN | SEND_SHUTDOWN);
+		lock_sock(sock->sk);
+		rds_tcp_restore_callbacks(sock, tc); /* tc->tc_sock = NULL */
+
+		release_sock(sock->sk);
+		sock_release(sock);
+	}
+
+	if (tc->t_tinc) {
+		rds_inc_put(&tc->t_tinc->ti_inc);
+		tc->t_tinc = NULL;
+	}
+	tc->t_tinc_hdr_rem = sizeof(struct rds_header);
+	tc->t_tinc_data_rem = 0;
+}
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
new file mode 100644
index 0000000..23ab4dc
--- /dev/null
+++ b/net/rds/tcp_listen.c
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/gfp.h>
+#include <linux/in.h>
+#include <net/tcp.h>
+
+#include "rds.h"
+#include "tcp.h"
+
+/*
+ * cheesy, but simple..
+ */
+static void rds_tcp_accept_worker(struct work_struct *work);
+static DECLARE_WORK(rds_tcp_listen_work, rds_tcp_accept_worker);
+static struct socket *rds_tcp_listen_sock;
+
+static int rds_tcp_accept_one(struct socket *sock)
+{
+	struct socket *new_sock = NULL;
+	struct rds_connection *conn;
+	int ret;
+	struct inet_sock *inet;
+
+	ret = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type,
+			       sock->sk->sk_protocol, &new_sock);
+	if (ret)
+		goto out;
+
+	new_sock->type = sock->type;
+	new_sock->ops = sock->ops;
+	ret = sock->ops->accept(sock, new_sock, O_NONBLOCK);
+	if (ret < 0)
+		goto out;
+
+	rds_tcp_tune(new_sock);
+
+	inet = inet_sk(new_sock->sk);
+
+	rdsdebug("accepted tcp %pI4:%u -> %pI4:%u\n",
+		 &inet->inet_saddr, ntohs(inet->inet_sport),
+		 &inet->inet_daddr, ntohs(inet->inet_dport));
+
+	conn = rds_conn_create(inet->inet_saddr, inet->inet_daddr,
+			       &rds_tcp_transport, GFP_KERNEL);
+	if (IS_ERR(conn)) {
+		ret = PTR_ERR(conn);
+		goto out;
+	}
+
+	/*
+	 * see the comment above rds_queue_delayed_reconnect()
+	 */
+	if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) {
+		if (rds_conn_state(conn) == RDS_CONN_UP)
+			rds_tcp_stats_inc(s_tcp_listen_closed_stale);
+		else
+			rds_tcp_stats_inc(s_tcp_connect_raced);
+		rds_conn_drop(conn);
+		ret = 0;
+		goto out;
+	}
+
+	rds_tcp_set_callbacks(new_sock, conn);
+	rds_connect_complete(conn);
+	new_sock = NULL;
+	ret = 0;
+
+out:
+	if (new_sock)
+		sock_release(new_sock);
+	return ret;
+}
+
+static void rds_tcp_accept_worker(struct work_struct *work)
+{
+	while (rds_tcp_accept_one(rds_tcp_listen_sock) == 0)
+		cond_resched();
+}
+
+void rds_tcp_listen_data_ready(struct sock *sk)
+{
+	void (*ready)(struct sock *sk);
+
+	rdsdebug("listen data ready sk %p\n", sk);
+
+	read_lock(&sk->sk_callback_lock);
+	ready = sk->sk_user_data;
+	if (!ready) { /* check for teardown race */
+		ready = sk->sk_data_ready;
+		goto out;
+	}
+
+	/*
+	 * ->sk_data_ready is also called for a newly established child socket
+	 * before it has been accepted and the accepter has set up their
+	 * data_ready.. we only want to queue listen work for our listening
+	 * socket
+	 */
+	if (sk->sk_state == TCP_LISTEN)
+		queue_work(rds_wq, &rds_tcp_listen_work);
+
+out:
+	read_unlock(&sk->sk_callback_lock);
+	ready(sk);
+}
+
+int rds_tcp_listen_init(void)
+{
+	struct sockaddr_in sin;
+	struct socket *sock = NULL;
+	int ret;
+
+	ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
+	if (ret < 0)
+		goto out;
+
+	sock->sk->sk_reuse = SK_CAN_REUSE;
+	rds_tcp_nonagle(sock);
+
+	write_lock_bh(&sock->sk->sk_callback_lock);
+	sock->sk->sk_user_data = sock->sk->sk_data_ready;
+	sock->sk->sk_data_ready = rds_tcp_listen_data_ready;
+	write_unlock_bh(&sock->sk->sk_callback_lock);
+
+	sin.sin_family = PF_INET;
+	sin.sin_addr.s_addr = (__force u32)htonl(INADDR_ANY);
+	sin.sin_port = (__force u16)htons(RDS_TCP_PORT);
+
+	ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin));
+	if (ret < 0)
+		goto out;
+
+	ret = sock->ops->listen(sock, 64);
+	if (ret < 0)
+		goto out;
+
+	rds_tcp_listen_sock = sock;
+	sock = NULL;
+out:
+	if (sock)
+		sock_release(sock);
+	return ret;
+}
+
+void rds_tcp_listen_stop(void)
+{
+	struct socket *sock = rds_tcp_listen_sock;
+	struct sock *sk;
+
+	if (!sock)
+		return;
+
+	sk = sock->sk;
+
+	/* serialize with and prevent further callbacks */
+	lock_sock(sk);
+	write_lock_bh(&sk->sk_callback_lock);
+	if (sk->sk_user_data) {
+		sk->sk_data_ready = sk->sk_user_data;
+		sk->sk_user_data = NULL;
+	}
+	write_unlock_bh(&sk->sk_callback_lock);
+	release_sock(sk);
+
+	/* wait for accepts to stop and close the socket */
+	flush_workqueue(rds_wq);
+	sock_release(sock);
+	rds_tcp_listen_sock = NULL;
+}
diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c
new file mode 100644
index 0000000..9ae6e0a
--- /dev/null
+++ b/net/rds/tcp_recv.c
@@ -0,0 +1,356 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <net/tcp.h>
+
+#include "rds.h"
+#include "tcp.h"
+
+static struct kmem_cache *rds_tcp_incoming_slab;
+
+static void rds_tcp_inc_purge(struct rds_incoming *inc)
+{
+	struct rds_tcp_incoming *tinc;
+	tinc = container_of(inc, struct rds_tcp_incoming, ti_inc);
+	rdsdebug("purging tinc %p inc %p\n", tinc, inc);
+	skb_queue_purge(&tinc->ti_skb_list);
+}
+
+void rds_tcp_inc_free(struct rds_incoming *inc)
+{
+	struct rds_tcp_incoming *tinc;
+	tinc = container_of(inc, struct rds_tcp_incoming, ti_inc);
+	rds_tcp_inc_purge(inc);
+	rdsdebug("freeing tinc %p inc %p\n", tinc, inc);
+	kmem_cache_free(rds_tcp_incoming_slab, tinc);
+}
+
+/*
+ * this is pretty lame, but, whatever.
+ */
+int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov,
+			     size_t size)
+{
+	struct rds_tcp_incoming *tinc;
+	struct iovec *iov, tmp;
+	struct sk_buff *skb;
+	unsigned long to_copy, skb_off;
+	int ret = 0;
+
+	if (size == 0)
+		goto out;
+
+	tinc = container_of(inc, struct rds_tcp_incoming, ti_inc);
+	iov = first_iov;
+	tmp = *iov;
+
+	skb_queue_walk(&tinc->ti_skb_list, skb) {
+		skb_off = 0;
+		while (skb_off < skb->len) {
+			while (tmp.iov_len == 0) {
+				iov++;
+				tmp = *iov;
+			}
+
+			to_copy = min(tmp.iov_len, size);
+			to_copy = min(to_copy, skb->len - skb_off);
+
+			rdsdebug("ret %d size %zu skb %p skb_off %lu "
+				 "skblen %d iov_base %p iov_len %zu cpy %lu\n",
+				 ret, size, skb, skb_off, skb->len,
+				 tmp.iov_base, tmp.iov_len, to_copy);
+
+			/* modifies tmp as it copies */
+			if (skb_copy_datagram_iovec(skb, skb_off, &tmp,
+						    to_copy)) {
+				ret = -EFAULT;
+				goto out;
+			}
+
+			rds_stats_add(s_copy_to_user, to_copy);
+			size -= to_copy;
+			ret += to_copy;
+			skb_off += to_copy;
+			if (size == 0)
+				goto out;
+		}
+	}
+out:
+	return ret;
+}
+
+/*
+ * We have a series of skbs that have fragmented pieces of the congestion
+ * bitmap.  They must add up to the exact size of the congestion bitmap.  We
+ * use the skb helpers to copy those into the pages that make up the in-memory
+ * congestion bitmap for the remote address of this connection.  We then tell
+ * the congestion core that the bitmap has been changed so that it can wake up
+ * sleepers.
+ *
+ * This is racing with sending paths which are using test_bit to see if the
+ * bitmap indicates that their recipient is congested.
+ */
+
+static void rds_tcp_cong_recv(struct rds_connection *conn,
+			      struct rds_tcp_incoming *tinc)
+{
+	struct sk_buff *skb;
+	unsigned int to_copy, skb_off;
+	unsigned int map_off;
+	unsigned int map_page;
+	struct rds_cong_map *map;
+	int ret;
+
+	/* catch completely corrupt packets */
+	if (be32_to_cpu(tinc->ti_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES)
+		return;
+
+	map_page = 0;
+	map_off = 0;
+	map = conn->c_fcong;
+
+	skb_queue_walk(&tinc->ti_skb_list, skb) {
+		skb_off = 0;
+		while (skb_off < skb->len) {
+			to_copy = min_t(unsigned int, PAGE_SIZE - map_off,
+					skb->len - skb_off);
+
+			BUG_ON(map_page >= RDS_CONG_MAP_PAGES);
+
+			/* only returns 0 or -error */
+			ret = skb_copy_bits(skb, skb_off,
+				(void *)map->m_page_addrs[map_page] + map_off,
+				to_copy);
+			BUG_ON(ret != 0);
+
+			skb_off += to_copy;
+			map_off += to_copy;
+			if (map_off == PAGE_SIZE) {
+				map_off = 0;
+				map_page++;
+			}
+		}
+	}
+
+	rds_cong_map_updated(map, ~(u64) 0);
+}
+
+struct rds_tcp_desc_arg {
+	struct rds_connection *conn;
+	gfp_t gfp;
+};
+
+static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
+			     unsigned int offset, size_t len)
+{
+	struct rds_tcp_desc_arg *arg = desc->arg.data;
+	struct rds_connection *conn = arg->conn;
+	struct rds_tcp_connection *tc = conn->c_transport_data;
+	struct rds_tcp_incoming *tinc = tc->t_tinc;
+	struct sk_buff *clone;
+	size_t left = len, to_copy;
+
+	rdsdebug("tcp data tc %p skb %p offset %u len %zu\n", tc, skb, offset,
+		 len);
+
+	/*
+	 * tcp_read_sock() interprets partial progress as an indication to stop
+	 * processing.
+	 */
+	while (left) {
+		if (!tinc) {
+			tinc = kmem_cache_alloc(rds_tcp_incoming_slab,
+					        arg->gfp);
+			if (!tinc) {
+				desc->error = -ENOMEM;
+				goto out;
+			}
+			tc->t_tinc = tinc;
+			rdsdebug("alloced tinc %p\n", tinc);
+			rds_inc_init(&tinc->ti_inc, conn, conn->c_faddr);
+			/*
+			 * XXX * we might be able to use the __ variants when
+			 * we've already serialized at a higher level.
+			 */
+			skb_queue_head_init(&tinc->ti_skb_list);
+		}
+
+		if (left && tc->t_tinc_hdr_rem) {
+			to_copy = min(tc->t_tinc_hdr_rem, left);
+			rdsdebug("copying %zu header from skb %p\n", to_copy,
+				 skb);
+			skb_copy_bits(skb, offset,
+				      (char *)&tinc->ti_inc.i_hdr +
+						sizeof(struct rds_header) -
+						tc->t_tinc_hdr_rem,
+				      to_copy);
+			tc->t_tinc_hdr_rem -= to_copy;
+			left -= to_copy;
+			offset += to_copy;
+
+			if (tc->t_tinc_hdr_rem == 0) {
+				/* could be 0 for a 0 len message */
+				tc->t_tinc_data_rem =
+					be32_to_cpu(tinc->ti_inc.i_hdr.h_len);
+			}
+		}
+
+		if (left && tc->t_tinc_data_rem) {
+			clone = skb_clone(skb, arg->gfp);
+			if (!clone) {
+				desc->error = -ENOMEM;
+				goto out;
+			}
+
+			to_copy = min(tc->t_tinc_data_rem, left);
+			pskb_pull(clone, offset);
+			pskb_trim(clone, to_copy);
+			skb_queue_tail(&tinc->ti_skb_list, clone);
+
+			rdsdebug("skb %p data %p len %d off %u to_copy %zu -> "
+				 "clone %p data %p len %d\n",
+				 skb, skb->data, skb->len, offset, to_copy,
+				 clone, clone->data, clone->len);
+
+			tc->t_tinc_data_rem -= to_copy;
+			left -= to_copy;
+			offset += to_copy;
+		}
+
+		if (tc->t_tinc_hdr_rem == 0 && tc->t_tinc_data_rem == 0) {
+			if (tinc->ti_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP)
+				rds_tcp_cong_recv(conn, tinc);
+			else
+				rds_recv_incoming(conn, conn->c_faddr,
+						  conn->c_laddr, &tinc->ti_inc,
+						  arg->gfp);
+
+			tc->t_tinc_hdr_rem = sizeof(struct rds_header);
+			tc->t_tinc_data_rem = 0;
+			tc->t_tinc = NULL;
+			rds_inc_put(&tinc->ti_inc);
+			tinc = NULL;
+		}
+	}
+out:
+	rdsdebug("returning len %zu left %zu skb len %d rx queue depth %d\n",
+		 len, left, skb->len,
+		 skb_queue_len(&tc->t_sock->sk->sk_receive_queue));
+	return len - left;
+}
+
+/* the caller has to hold the sock lock */
+static int rds_tcp_read_sock(struct rds_connection *conn, gfp_t gfp)
+{
+	struct rds_tcp_connection *tc = conn->c_transport_data;
+	struct socket *sock = tc->t_sock;
+	read_descriptor_t desc;
+	struct rds_tcp_desc_arg arg;
+
+	/* It's like glib in the kernel! */
+	arg.conn = conn;
+	arg.gfp = gfp;
+	desc.arg.data = &arg;
+	desc.error = 0;
+	desc.count = 1; /* give more than one skb per call */
+
+	tcp_read_sock(sock->sk, &desc, rds_tcp_data_recv);
+	rdsdebug("tcp_read_sock for tc %p gfp 0x%x returned %d\n", tc, gfp,
+		 desc.error);
+
+	return desc.error;
+}
+
+/*
+ * We hold the sock lock to serialize our rds_tcp_recv->tcp_read_sock from
+ * data_ready.
+ *
+ * if we fail to allocate we're in trouble.. blindly wait some time before
+ * trying again to see if the VM can free up something for us.
+ */
+int rds_tcp_recv(struct rds_connection *conn)
+{
+	struct rds_tcp_connection *tc = conn->c_transport_data;
+	struct socket *sock = tc->t_sock;
+	int ret = 0;
+
+	rdsdebug("recv worker conn %p tc %p sock %p\n", conn, tc, sock);
+
+	lock_sock(sock->sk);
+	ret = rds_tcp_read_sock(conn, GFP_KERNEL);
+	release_sock(sock->sk);
+
+	return ret;
+}
+
+void rds_tcp_data_ready(struct sock *sk)
+{
+	void (*ready)(struct sock *sk);
+	struct rds_connection *conn;
+	struct rds_tcp_connection *tc;
+
+	rdsdebug("data ready sk %p\n", sk);
+
+	read_lock(&sk->sk_callback_lock);
+	conn = sk->sk_user_data;
+	if (!conn) { /* check for teardown race */
+		ready = sk->sk_data_ready;
+		goto out;
+	}
+
+	tc = conn->c_transport_data;
+	ready = tc->t_orig_data_ready;
+	rds_tcp_stats_inc(s_tcp_data_ready_calls);
+
+	if (rds_tcp_read_sock(conn, GFP_ATOMIC) == -ENOMEM)
+		queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
+out:
+	read_unlock(&sk->sk_callback_lock);
+	ready(sk);
+}
+
+int rds_tcp_recv_init(void)
+{
+	rds_tcp_incoming_slab = kmem_cache_create("rds_tcp_incoming",
+					sizeof(struct rds_tcp_incoming),
+					0, 0, NULL);
+	if (!rds_tcp_incoming_slab)
+		return -ENOMEM;
+	return 0;
+}
+
+void rds_tcp_recv_exit(void)
+{
+	kmem_cache_destroy(rds_tcp_incoming_slab);
+}
diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c
new file mode 100644
index 0000000..53b17ca
--- /dev/null
+++ b/net/rds/tcp_send.c
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/in.h>
+#include <net/tcp.h>
+
+#include "rds.h"
+#include "tcp.h"
+
+static void rds_tcp_cork(struct socket *sock, int val)
+{
+	mm_segment_t oldfs;
+
+	oldfs = get_fs();
+	set_fs(KERNEL_DS);
+	sock->ops->setsockopt(sock, SOL_TCP, TCP_CORK, (char __user *)&val,
+			      sizeof(val));
+	set_fs(oldfs);
+}
+
+void rds_tcp_xmit_prepare(struct rds_connection *conn)
+{
+	struct rds_tcp_connection *tc = conn->c_transport_data;
+
+	rds_tcp_cork(tc->t_sock, 1);
+}
+
+void rds_tcp_xmit_complete(struct rds_connection *conn)
+{
+	struct rds_tcp_connection *tc = conn->c_transport_data;
+
+	rds_tcp_cork(tc->t_sock, 0);
+}
+
+/* the core send_sem serializes this with other xmit and shutdown */
+static int rds_tcp_sendmsg(struct socket *sock, void *data, unsigned int len)
+{
+	struct kvec vec = {
+                .iov_base = data,
+                .iov_len = len,
+	};
+        struct msghdr msg = {
+                .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL,
+        };
+
+	return kernel_sendmsg(sock, &msg, &vec, 1, vec.iov_len);
+}
+
+/* the core send_sem serializes this with other xmit and shutdown */
+int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
+	         unsigned int hdr_off, unsigned int sg, unsigned int off)
+{
+	struct rds_tcp_connection *tc = conn->c_transport_data;
+	int done = 0;
+	int ret = 0;
+
+	if (hdr_off == 0) {
+		/*
+		 * m_ack_seq is set to the sequence number of the last byte of
+		 * header and data.  see rds_tcp_is_acked().
+		 */
+		tc->t_last_sent_nxt = rds_tcp_snd_nxt(tc);
+		rm->m_ack_seq = tc->t_last_sent_nxt +
+				sizeof(struct rds_header) +
+				be32_to_cpu(rm->m_inc.i_hdr.h_len) - 1;
+		smp_mb__before_atomic();
+		set_bit(RDS_MSG_HAS_ACK_SEQ, &rm->m_flags);
+		tc->t_last_expected_una = rm->m_ack_seq + 1;
+
+		rdsdebug("rm %p tcp nxt %u ack_seq %llu\n",
+			 rm, rds_tcp_snd_nxt(tc),
+			 (unsigned long long)rm->m_ack_seq);
+	}
+
+	if (hdr_off < sizeof(struct rds_header)) {
+		/* see rds_tcp_write_space() */
+		set_bit(SOCK_NOSPACE, &tc->t_sock->sk->sk_socket->flags);
+
+		ret = rds_tcp_sendmsg(tc->t_sock,
+				      (void *)&rm->m_inc.i_hdr + hdr_off,
+				      sizeof(rm->m_inc.i_hdr) - hdr_off);
+		if (ret < 0)
+			goto out;
+		done += ret;
+		if (hdr_off + done != sizeof(struct rds_header))
+			goto out;
+	}
+
+	while (sg < rm->data.op_nents) {
+		ret = tc->t_sock->ops->sendpage(tc->t_sock,
+						sg_page(&rm->data.op_sg[sg]),
+						rm->data.op_sg[sg].offset + off,
+						rm->data.op_sg[sg].length - off,
+						MSG_DONTWAIT|MSG_NOSIGNAL);
+		rdsdebug("tcp sendpage %p:%u:%u ret %d\n", (void *)sg_page(&rm->data.op_sg[sg]),
+			 rm->data.op_sg[sg].offset + off, rm->data.op_sg[sg].length - off,
+			 ret);
+		if (ret <= 0)
+			break;
+
+		off += ret;
+		done += ret;
+		if (off == rm->data.op_sg[sg].length) {
+			off = 0;
+			sg++;
+		}
+	}
+
+out:
+	if (ret <= 0) {
+		/* write_space will hit after EAGAIN, all else fatal */
+		if (ret == -EAGAIN) {
+			rds_tcp_stats_inc(s_tcp_sndbuf_full);
+			ret = 0;
+		} else {
+			printk(KERN_WARNING "RDS/tcp: send to %pI4 "
+			       "returned %d, disconnecting and reconnecting\n",
+			       &conn->c_faddr, ret);
+			rds_conn_drop(conn);
+		}
+	}
+	if (done == 0)
+		done = ret;
+	return done;
+}
+
+/*
+ * rm->m_ack_seq is set to the tcp sequence number that corresponds to the
+ * last byte of the message, including the header.  This means that the
+ * entire message has been received if rm->m_ack_seq is "before" the next
+ * unacked byte of the TCP sequence space.  We have to do very careful
+ * wrapping 32bit comparisons here.
+ */
+static int rds_tcp_is_acked(struct rds_message *rm, uint64_t ack)
+{
+	if (!test_bit(RDS_MSG_HAS_ACK_SEQ, &rm->m_flags))
+		return 0;
+	return (__s32)((u32)rm->m_ack_seq - (u32)ack) < 0;
+}
+
+void rds_tcp_write_space(struct sock *sk)
+{
+	void (*write_space)(struct sock *sk);
+	struct rds_connection *conn;
+	struct rds_tcp_connection *tc;
+
+	read_lock(&sk->sk_callback_lock);
+	conn = sk->sk_user_data;
+	if (!conn) {
+		write_space = sk->sk_write_space;
+		goto out;
+	}
+
+	tc = conn->c_transport_data;
+	rdsdebug("write_space for tc %p\n", tc);
+	write_space = tc->t_orig_write_space;
+	rds_tcp_stats_inc(s_tcp_write_space_calls);
+
+	rdsdebug("tcp una %u\n", rds_tcp_snd_una(tc));
+	tc->t_last_seen_una = rds_tcp_snd_una(tc);
+	rds_send_drop_acked(conn, rds_tcp_snd_una(tc), rds_tcp_is_acked);
+
+        if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf)
+		queue_delayed_work(rds_wq, &conn->c_send_w, 0);
+
+out:
+	read_unlock(&sk->sk_callback_lock);
+
+	/*
+	 * write_space is only called when data leaves tcp's send queue if
+	 * SOCK_NOSPACE is set.  We set SOCK_NOSPACE every time we put
+	 * data in tcp's send queue because we use write_space to parse the
+	 * sequence numbers and notice that rds messages have been fully
+	 * received.
+	 *
+	 * tcp's write_space clears SOCK_NOSPACE if the send queue has more
+	 * than a certain amount of space. So we need to set it again *after*
+	 * we call tcp's write_space or else we might only get called on the
+	 * first of a series of incoming tcp acks.
+	 */
+	write_space(sk);
+
+	if (sk->sk_socket)
+		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+}
diff --git a/net/rds/tcp_stats.c b/net/rds/tcp_stats.c
new file mode 100644
index 0000000..f8a7954
--- /dev/null
+++ b/net/rds/tcp_stats.c
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/percpu.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+
+#include "rds.h"
+#include "tcp.h"
+
+DEFINE_PER_CPU(struct rds_tcp_statistics, rds_tcp_stats)
+	____cacheline_aligned;
+
+static const char * const rds_tcp_stat_names[] = {
+	"tcp_data_ready_calls",
+	"tcp_write_space_calls",
+	"tcp_sndbuf_full",
+	"tcp_connect_raced",
+	"tcp_listen_closed_stale",
+};
+
+unsigned int rds_tcp_stats_info_copy(struct rds_info_iterator *iter,
+				     unsigned int avail)
+{
+	struct rds_tcp_statistics stats = {0, };
+	uint64_t *src;
+	uint64_t *sum;
+	size_t i;
+	int cpu;
+
+	if (avail < ARRAY_SIZE(rds_tcp_stat_names))
+		goto out;
+
+	for_each_online_cpu(cpu) {
+		src = (uint64_t *)&(per_cpu(rds_tcp_stats, cpu));
+		sum = (uint64_t *)&stats;
+		for (i = 0; i < sizeof(stats) / sizeof(uint64_t); i++)
+			*(sum++) += *(src++);
+	}
+
+	rds_stats_info_copy(iter, (uint64_t *)&stats, rds_tcp_stat_names,
+			    ARRAY_SIZE(rds_tcp_stat_names));
+out:
+	return ARRAY_SIZE(rds_tcp_stat_names);
+}
diff --git a/net/rds/threads.c b/net/rds/threads.c
new file mode 100644
index 0000000..dc2402e
--- /dev/null
+++ b/net/rds/threads.c
@@ -0,0 +1,222 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/random.h>
+#include <linux/export.h>
+
+#include "rds.h"
+
+/*
+ * All of connection management is simplified by serializing it through
+ * work queues that execute in a connection managing thread.
+ *
+ * TCP wants to send acks through sendpage() in response to data_ready(),
+ * but it needs a process context to do so.
+ *
+ * The receive paths need to allocate but can't drop packets (!) so we have
+ * a thread around to block allocating if the receive fast path sees an
+ * allocation failure.
+ */
+
+/* Grand Unified Theory of connection life cycle:
+ * At any point in time, the connection can be in one of these states:
+ * DOWN, CONNECTING, UP, DISCONNECTING, ERROR
+ *
+ * The following transitions are possible:
+ *  ANY		  -> ERROR
+ *  UP		  -> DISCONNECTING
+ *  ERROR	  -> DISCONNECTING
+ *  DISCONNECTING -> DOWN
+ *  DOWN	  -> CONNECTING
+ *  CONNECTING	  -> UP
+ *
+ * Transition to state DISCONNECTING/DOWN:
+ *  -	Inside the shutdown worker; synchronizes with xmit path
+ *	through RDS_IN_XMIT, and with connection management callbacks
+ *	via c_cm_lock.
+ *
+ *	For receive callbacks, we rely on the underlying transport
+ *	(TCP, IB/RDMA) to provide the necessary synchronisation.
+ */
+struct workqueue_struct *rds_wq;
+EXPORT_SYMBOL_GPL(rds_wq);
+
+void rds_connect_complete(struct rds_connection *conn)
+{
+	if (!rds_conn_transition(conn, RDS_CONN_CONNECTING, RDS_CONN_UP)) {
+		printk(KERN_WARNING "%s: Cannot transition to state UP, "
+				"current state is %d\n",
+				__func__,
+				atomic_read(&conn->c_state));
+		rds_conn_drop(conn);
+		return;
+	}
+
+	rdsdebug("conn %p for %pI4 to %pI4 complete\n",
+	  conn, &conn->c_laddr, &conn->c_faddr);
+
+	conn->c_reconnect_jiffies = 0;
+	set_bit(0, &conn->c_map_queued);
+	queue_delayed_work(rds_wq, &conn->c_send_w, 0);
+	queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
+}
+EXPORT_SYMBOL_GPL(rds_connect_complete);
+
+/*
+ * This random exponential backoff is relied on to eventually resolve racing
+ * connects.
+ *
+ * If connect attempts race then both parties drop both connections and come
+ * here to wait for a random amount of time before trying again.  Eventually
+ * the backoff range will be so much greater than the time it takes to
+ * establish a connection that one of the pair will establish the connection
+ * before the other's random delay fires.
+ *
+ * Connection attempts that arrive while a connection is already established
+ * are also considered to be racing connects.  This lets a connection from
+ * a rebooted machine replace an existing stale connection before the transport
+ * notices that the connection has failed.
+ *
+ * We should *always* start with a random backoff; otherwise a broken connection
+ * will always take several iterations to be re-established.
+ */
+void rds_queue_reconnect(struct rds_connection *conn)
+{
+	unsigned long rand;
+
+	rdsdebug("conn %p for %pI4 to %pI4 reconnect jiffies %lu\n",
+	  conn, &conn->c_laddr, &conn->c_faddr,
+	  conn->c_reconnect_jiffies);
+
+	set_bit(RDS_RECONNECT_PENDING, &conn->c_flags);
+	if (conn->c_reconnect_jiffies == 0) {
+		conn->c_reconnect_jiffies = rds_sysctl_reconnect_min_jiffies;
+		queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
+		return;
+	}
+
+	get_random_bytes(&rand, sizeof(rand));
+	rdsdebug("%lu delay %lu ceil conn %p for %pI4 -> %pI4\n",
+		 rand % conn->c_reconnect_jiffies, conn->c_reconnect_jiffies,
+		 conn, &conn->c_laddr, &conn->c_faddr);
+	queue_delayed_work(rds_wq, &conn->c_conn_w,
+			   rand % conn->c_reconnect_jiffies);
+
+	conn->c_reconnect_jiffies = min(conn->c_reconnect_jiffies * 2,
+					rds_sysctl_reconnect_max_jiffies);
+}
+
+void rds_connect_worker(struct work_struct *work)
+{
+	struct rds_connection *conn = container_of(work, struct rds_connection, c_conn_w.work);
+	int ret;
+
+	clear_bit(RDS_RECONNECT_PENDING, &conn->c_flags);
+	if (rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) {
+		ret = conn->c_trans->conn_connect(conn);
+		rdsdebug("conn %p for %pI4 to %pI4 dispatched, ret %d\n",
+			conn, &conn->c_laddr, &conn->c_faddr, ret);
+
+		if (ret) {
+			if (rds_conn_transition(conn, RDS_CONN_CONNECTING, RDS_CONN_DOWN))
+				rds_queue_reconnect(conn);
+			else
+				rds_conn_error(conn, "RDS: connect failed\n");
+		}
+	}
+}
+
+void rds_send_worker(struct work_struct *work)
+{
+	struct rds_connection *conn = container_of(work, struct rds_connection, c_send_w.work);
+	int ret;
+
+	if (rds_conn_state(conn) == RDS_CONN_UP) {
+		ret = rds_send_xmit(conn);
+		rdsdebug("conn %p ret %d\n", conn, ret);
+		switch (ret) {
+		case -EAGAIN:
+			rds_stats_inc(s_send_immediate_retry);
+			queue_delayed_work(rds_wq, &conn->c_send_w, 0);
+			break;
+		case -ENOMEM:
+			rds_stats_inc(s_send_delayed_retry);
+			queue_delayed_work(rds_wq, &conn->c_send_w, 2);
+		default:
+			break;
+		}
+	}
+}
+
+void rds_recv_worker(struct work_struct *work)
+{
+	struct rds_connection *conn = container_of(work, struct rds_connection, c_recv_w.work);
+	int ret;
+
+	if (rds_conn_state(conn) == RDS_CONN_UP) {
+		ret = conn->c_trans->recv(conn);
+		rdsdebug("conn %p ret %d\n", conn, ret);
+		switch (ret) {
+		case -EAGAIN:
+			rds_stats_inc(s_recv_immediate_retry);
+			queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
+			break;
+		case -ENOMEM:
+			rds_stats_inc(s_recv_delayed_retry);
+			queue_delayed_work(rds_wq, &conn->c_recv_w, 2);
+		default:
+			break;
+		}
+	}
+}
+
+void rds_shutdown_worker(struct work_struct *work)
+{
+	struct rds_connection *conn = container_of(work, struct rds_connection, c_down_w);
+
+	rds_conn_shutdown(conn);
+}
+
+void rds_threads_exit(void)
+{
+	destroy_workqueue(rds_wq);
+}
+
+int rds_threads_init(void)
+{
+	rds_wq = create_singlethread_workqueue("krdsd");
+	if (!rds_wq)
+		return -ENOMEM;
+
+	return 0;
+}
diff --git a/net/rds/transport.c b/net/rds/transport.c
new file mode 100644
index 0000000..7f2ac4f
--- /dev/null
+++ b/net/rds/transport.c
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2006 Oracle.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/in.h>
+
+#include "rds.h"
+#include "loop.h"
+
+static struct rds_transport *transports[RDS_TRANS_COUNT];
+static DECLARE_RWSEM(rds_trans_sem);
+
+int rds_trans_register(struct rds_transport *trans)
+{
+	BUG_ON(strlen(trans->t_name) + 1 > TRANSNAMSIZ);
+
+	down_write(&rds_trans_sem);
+
+	if (transports[trans->t_type])
+		printk(KERN_ERR "RDS Transport type %d already registered\n",
+			trans->t_type);
+	else {
+		transports[trans->t_type] = trans;
+		printk(KERN_INFO "Registered RDS/%s transport\n", trans->t_name);
+	}
+
+	up_write(&rds_trans_sem);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(rds_trans_register);
+
+void rds_trans_unregister(struct rds_transport *trans)
+{
+	down_write(&rds_trans_sem);
+
+	transports[trans->t_type] = NULL;
+	printk(KERN_INFO "Unregistered RDS/%s transport\n", trans->t_name);
+
+	up_write(&rds_trans_sem);
+}
+EXPORT_SYMBOL_GPL(rds_trans_unregister);
+
+void rds_trans_put(struct rds_transport *trans)
+{
+	if (trans && trans->t_owner)
+		module_put(trans->t_owner);
+}
+
+struct rds_transport *rds_trans_get_preferred(__be32 addr)
+{
+	struct rds_transport *ret = NULL;
+	struct rds_transport *trans;
+	unsigned int i;
+
+	if (IN_LOOPBACK(ntohl(addr)))
+		return &rds_loop_transport;
+
+	down_read(&rds_trans_sem);
+	for (i = 0; i < RDS_TRANS_COUNT; i++) {
+		trans = transports[i];
+
+		if (trans && (trans->laddr_check(addr) == 0) &&
+		    (!trans->t_owner || try_module_get(trans->t_owner))) {
+			ret = trans;
+			break;
+		}
+	}
+	up_read(&rds_trans_sem);
+
+	return ret;
+}
+
+/*
+ * This returns the number of stats entries in the snapshot and only
+ * copies them using the iter if there is enough space for them.  The
+ * caller passes in the global stats so that we can size and copy while
+ * holding the lock.
+ */
+unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter,
+				       unsigned int avail)
+
+{
+	struct rds_transport *trans;
+	unsigned int total = 0;
+	unsigned int part;
+	int i;
+
+	rds_info_iter_unmap(iter);
+	down_read(&rds_trans_sem);
+
+	for (i = 0; i < RDS_TRANS_COUNT; i++)
+	{
+		trans = transports[i];
+		if (!trans || !trans->stats_info_copy)
+			continue;
+
+		part = trans->stats_info_copy(iter, avail);
+		avail -= min(avail, part);
+		total += part;
+	}
+
+	up_read(&rds_trans_sem);
+
+	return total;
+}
+
diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile
new file mode 100644
index 0000000..da5136f
--- /dev/null
+++ b/net/sunrpc/xprtrdma/Makefile
@@ -0,0 +1,8 @@
+obj-$(CONFIG_SUNRPC_XPRT_RDMA_CLIENT) += xprtrdma.o
+
+xprtrdma-y := transport.o rpc_rdma.o verbs.o
+
+obj-$(CONFIG_SUNRPC_XPRT_RDMA_SERVER) += svcrdma.o
+
+svcrdma-y := svc_rdma.o svc_rdma_transport.o \
+	svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
new file mode 100644
index 0000000..6166c98
--- /dev/null
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -0,0 +1,875 @@
+/*
+ * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the BSD-type
+ * license below:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *      Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *
+ *      Redistributions in binary form must reproduce the above
+ *      copyright notice, this list of conditions and the following
+ *      disclaimer in the documentation and/or other materials provided
+ *      with the distribution.
+ *
+ *      Neither the name of the Network Appliance, Inc. nor the names of
+ *      its contributors may be used to endorse or promote products
+ *      derived from this software without specific prior written
+ *      permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * rpc_rdma.c
+ *
+ * This file contains the guts of the RPC RDMA protocol, and
+ * does marshaling/unmarshaling, etc. It is also where interfacing
+ * to the Linux RPC framework lives.
+ */
+
+#include "xprt_rdma.h"
+
+#include <linux/highmem.h>
+
+#ifdef RPC_DEBUG
+# define RPCDBG_FACILITY	RPCDBG_TRANS
+#endif
+
+#ifdef RPC_DEBUG
+static const char transfertypes[][12] = {
+	"pure inline",	/* no chunks */
+	" read chunk",	/* some argument via rdma read */
+	"*read chunk",	/* entire request via rdma read */
+	"write chunk",	/* some result via rdma write */
+	"reply chunk"	/* entire reply via rdma write */
+};
+#endif
+
+/*
+ * Chunk assembly from upper layer xdr_buf.
+ *
+ * Prepare the passed-in xdr_buf into representation as RPC/RDMA chunk
+ * elements. Segments are then coalesced when registered, if possible
+ * within the selected memreg mode.
+ *
+ * Returns positive number of segments converted, or a negative errno.
+ */
+
+static int
+rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
+	enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg, int nsegs)
+{
+	int len, n = 0, p;
+	int page_base;
+	struct page **ppages;
+
+	if (pos == 0 && xdrbuf->head[0].iov_len) {
+		seg[n].mr_page = NULL;
+		seg[n].mr_offset = xdrbuf->head[0].iov_base;
+		seg[n].mr_len = xdrbuf->head[0].iov_len;
+		++n;
+	}
+
+	len = xdrbuf->page_len;
+	ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT);
+	page_base = xdrbuf->page_base & ~PAGE_MASK;
+	p = 0;
+	while (len && n < nsegs) {
+		if (!ppages[p]) {
+			/* alloc the pagelist for receiving buffer */
+			ppages[p] = alloc_page(GFP_ATOMIC);
+			if (!ppages[p])
+				return -ENOMEM;
+		}
+		seg[n].mr_page = ppages[p];
+		seg[n].mr_offset = (void *)(unsigned long) page_base;
+		seg[n].mr_len = min_t(u32, PAGE_SIZE - page_base, len);
+		if (seg[n].mr_len > PAGE_SIZE)
+			return -EIO;
+		len -= seg[n].mr_len;
+		++n;
+		++p;
+		page_base = 0;	/* page offset only applies to first page */
+	}
+
+	/* Message overflows the seg array */
+	if (len && n == nsegs)
+		return -EIO;
+
+	if (xdrbuf->tail[0].iov_len) {
+		/* the rpcrdma protocol allows us to omit any trailing
+		 * xdr pad bytes, saving the server an RDMA operation. */
+		if (xdrbuf->tail[0].iov_len < 4 && xprt_rdma_pad_optimize)
+			return n;
+		if (n == nsegs)
+			/* Tail remains, but we're out of segments */
+			return -EIO;
+		seg[n].mr_page = NULL;
+		seg[n].mr_offset = xdrbuf->tail[0].iov_base;
+		seg[n].mr_len = xdrbuf->tail[0].iov_len;
+		++n;
+	}
+
+	return n;
+}
+
+/*
+ * Create read/write chunk lists, and reply chunks, for RDMA
+ *
+ *   Assume check against THRESHOLD has been done, and chunks are required.
+ *   Assume only encoding one list entry for read|write chunks. The NFSv3
+ *     protocol is simple enough to allow this as it only has a single "bulk
+ *     result" in each procedure - complicated NFSv4 COMPOUNDs are not. (The
+ *     RDMA/Sessions NFSv4 proposal addresses this for future v4 revs.)
+ *
+ * When used for a single reply chunk (which is a special write
+ * chunk used for the entire reply, rather than just the data), it
+ * is used primarily for READDIR and READLINK which would otherwise
+ * be severely size-limited by a small rdma inline read max. The server
+ * response will come back as an RDMA Write, followed by a message
+ * of type RDMA_NOMSG carrying the xid and length. As a result, reply
+ * chunks do not provide data alignment, however they do not require
+ * "fixup" (moving the response to the upper layer buffer) either.
+ *
+ * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
+ *
+ *  Read chunklist (a linked list):
+ *   N elements, position P (same P for all chunks of same arg!):
+ *    1 - PHLOO - 1 - PHLOO - ... - 1 - PHLOO - 0
+ *
+ *  Write chunklist (a list of (one) counted array):
+ *   N elements:
+ *    1 - N - HLOO - HLOO - ... - HLOO - 0
+ *
+ *  Reply chunk (a counted array):
+ *   N elements:
+ *    1 - N - HLOO - HLOO - ... - HLOO
+ *
+ * Returns positive RPC/RDMA header size, or negative errno.
+ */
+
+static ssize_t
+rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
+		struct rpcrdma_msg *headerp, enum rpcrdma_chunktype type)
+{
+	struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
+	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
+	int n, nsegs, nchunks = 0;
+	unsigned int pos;
+	struct rpcrdma_mr_seg *seg = req->rl_segments;
+	struct rpcrdma_read_chunk *cur_rchunk = NULL;
+	struct rpcrdma_write_array *warray = NULL;
+	struct rpcrdma_write_chunk *cur_wchunk = NULL;
+	__be32 *iptr = headerp->rm_body.rm_chunks;
+
+	if (type == rpcrdma_readch || type == rpcrdma_areadch) {
+		/* a read chunk - server will RDMA Read our memory */
+		cur_rchunk = (struct rpcrdma_read_chunk *) iptr;
+	} else {
+		/* a write or reply chunk - server will RDMA Write our memory */
+		*iptr++ = xdr_zero;	/* encode a NULL read chunk list */
+		if (type == rpcrdma_replych)
+			*iptr++ = xdr_zero;	/* a NULL write chunk list */
+		warray = (struct rpcrdma_write_array *) iptr;
+		cur_wchunk = (struct rpcrdma_write_chunk *) (warray + 1);
+	}
+
+	if (type == rpcrdma_replych || type == rpcrdma_areadch)
+		pos = 0;
+	else
+		pos = target->head[0].iov_len;
+
+	nsegs = rpcrdma_convert_iovs(target, pos, type, seg, RPCRDMA_MAX_SEGS);
+	if (nsegs < 0)
+		return nsegs;
+
+	do {
+		n = rpcrdma_register_external(seg, nsegs,
+						cur_wchunk != NULL, r_xprt);
+		if (n <= 0)
+			goto out;
+		if (cur_rchunk) {	/* read */
+			cur_rchunk->rc_discrim = xdr_one;
+			/* all read chunks have the same "position" */
+			cur_rchunk->rc_position = htonl(pos);
+			cur_rchunk->rc_target.rs_handle = htonl(seg->mr_rkey);
+			cur_rchunk->rc_target.rs_length = htonl(seg->mr_len);
+			xdr_encode_hyper(
+					(__be32 *)&cur_rchunk->rc_target.rs_offset,
+					seg->mr_base);
+			dprintk("RPC:       %s: read chunk "
+				"elem %d@0x%llx:0x%x pos %u (%s)\n", __func__,
+				seg->mr_len, (unsigned long long)seg->mr_base,
+				seg->mr_rkey, pos, n < nsegs ? "more" : "last");
+			cur_rchunk++;
+			r_xprt->rx_stats.read_chunk_count++;
+		} else {		/* write/reply */
+			cur_wchunk->wc_target.rs_handle = htonl(seg->mr_rkey);
+			cur_wchunk->wc_target.rs_length = htonl(seg->mr_len);
+			xdr_encode_hyper(
+					(__be32 *)&cur_wchunk->wc_target.rs_offset,
+					seg->mr_base);
+			dprintk("RPC:       %s: %s chunk "
+				"elem %d@0x%llx:0x%x (%s)\n", __func__,
+				(type == rpcrdma_replych) ? "reply" : "write",
+				seg->mr_len, (unsigned long long)seg->mr_base,
+				seg->mr_rkey, n < nsegs ? "more" : "last");
+			cur_wchunk++;
+			if (type == rpcrdma_replych)
+				r_xprt->rx_stats.reply_chunk_count++;
+			else
+				r_xprt->rx_stats.write_chunk_count++;
+			r_xprt->rx_stats.total_rdma_request += seg->mr_len;
+		}
+		nchunks++;
+		seg   += n;
+		nsegs -= n;
+	} while (nsegs);
+
+	/* success. all failures return above */
+	req->rl_nchunks = nchunks;
+
+	/*
+	 * finish off header. If write, marshal discrim and nchunks.
+	 */
+	if (cur_rchunk) {
+		iptr = (__be32 *) cur_rchunk;
+		*iptr++ = xdr_zero;	/* finish the read chunk list */
+		*iptr++ = xdr_zero;	/* encode a NULL write chunk list */
+		*iptr++ = xdr_zero;	/* encode a NULL reply chunk */
+	} else {
+		warray->wc_discrim = xdr_one;
+		warray->wc_nchunks = htonl(nchunks);
+		iptr = (__be32 *) cur_wchunk;
+		if (type == rpcrdma_writech) {
+			*iptr++ = xdr_zero; /* finish the write chunk list */
+			*iptr++ = xdr_zero; /* encode a NULL reply chunk */
+		}
+	}
+
+	/*
+	 * Return header size.
+	 */
+	return (unsigned char *)iptr - (unsigned char *)headerp;
+
+out:
+	if (r_xprt->rx_ia.ri_memreg_strategy != RPCRDMA_FRMR) {
+		for (pos = 0; nchunks--;)
+			pos += rpcrdma_deregister_external(
+					&req->rl_segments[pos], r_xprt);
+	}
+	return n;
+}
+
+/*
+ * Marshal chunks. This routine returns the header length
+ * consumed by marshaling.
+ *
+ * Returns positive RPC/RDMA header size, or negative errno.
+ */
+
+ssize_t
+rpcrdma_marshal_chunks(struct rpc_rqst *rqst, ssize_t result)
+{
+	struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
+	struct rpcrdma_msg *headerp = (struct rpcrdma_msg *)req->rl_base;
+
+	if (req->rl_rtype != rpcrdma_noch)
+		result = rpcrdma_create_chunks(rqst, &rqst->rq_snd_buf,
+					       headerp, req->rl_rtype);
+	else if (req->rl_wtype != rpcrdma_noch)
+		result = rpcrdma_create_chunks(rqst, &rqst->rq_rcv_buf,
+					       headerp, req->rl_wtype);
+	return result;
+}
+
+/*
+ * Copy write data inline.
+ * This function is used for "small" requests. Data which is passed
+ * to RPC via iovecs (or page list) is copied directly into the
+ * pre-registered memory buffer for this request. For small amounts
+ * of data, this is efficient. The cutoff value is tunable.
+ */
+static int
+rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad)
+{
+	int i, npages, curlen;
+	int copy_len;
+	unsigned char *srcp, *destp;
+	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
+	int page_base;
+	struct page **ppages;
+
+	destp = rqst->rq_svec[0].iov_base;
+	curlen = rqst->rq_svec[0].iov_len;
+	destp += curlen;
+	/*
+	 * Do optional padding where it makes sense. Alignment of write
+	 * payload can help the server, if our setting is accurate.
+	 */
+	pad -= (curlen + 36/*sizeof(struct rpcrdma_msg_padded)*/);
+	if (pad < 0 || rqst->rq_slen - curlen < RPCRDMA_INLINE_PAD_THRESH)
+		pad = 0;	/* don't pad this request */
+
+	dprintk("RPC:       %s: pad %d destp 0x%p len %d hdrlen %d\n",
+		__func__, pad, destp, rqst->rq_slen, curlen);
+
+	copy_len = rqst->rq_snd_buf.page_len;
+
+	if (rqst->rq_snd_buf.tail[0].iov_len) {
+		curlen = rqst->rq_snd_buf.tail[0].iov_len;
+		if (destp + copy_len != rqst->rq_snd_buf.tail[0].iov_base) {
+			memmove(destp + copy_len,
+				rqst->rq_snd_buf.tail[0].iov_base, curlen);
+			r_xprt->rx_stats.pullup_copy_count += curlen;
+		}
+		dprintk("RPC:       %s: tail destp 0x%p len %d\n",
+			__func__, destp + copy_len, curlen);
+		rqst->rq_svec[0].iov_len += curlen;
+	}
+	r_xprt->rx_stats.pullup_copy_count += copy_len;
+
+	page_base = rqst->rq_snd_buf.page_base;
+	ppages = rqst->rq_snd_buf.pages + (page_base >> PAGE_SHIFT);
+	page_base &= ~PAGE_MASK;
+	npages = PAGE_ALIGN(page_base+copy_len) >> PAGE_SHIFT;
+	for (i = 0; copy_len && i < npages; i++) {
+		curlen = PAGE_SIZE - page_base;
+		if (curlen > copy_len)
+			curlen = copy_len;
+		dprintk("RPC:       %s: page %d destp 0x%p len %d curlen %d\n",
+			__func__, i, destp, copy_len, curlen);
+		srcp = kmap_atomic(ppages[i]);
+		memcpy(destp, srcp+page_base, curlen);
+		kunmap_atomic(srcp);
+		rqst->rq_svec[0].iov_len += curlen;
+		destp += curlen;
+		copy_len -= curlen;
+		page_base = 0;
+	}
+	/* header now contains entire send message */
+	return pad;
+}
+
+/*
+ * Marshal a request: the primary job of this routine is to choose
+ * the transfer modes. See comments below.
+ *
+ * Uses multiple RDMA IOVs for a request:
+ *  [0] -- RPC RDMA header, which uses memory from the *start* of the
+ *         preregistered buffer that already holds the RPC data in
+ *         its middle.
+ *  [1] -- the RPC header/data, marshaled by RPC and the NFS protocol.
+ *  [2] -- optional padding.
+ *  [3] -- if padded, header only in [1] and data here.
+ *
+ * Returns zero on success, otherwise a negative errno.
+ */
+
+int
+rpcrdma_marshal_req(struct rpc_rqst *rqst)
+{
+	struct rpc_xprt *xprt = rqst->rq_xprt;
+	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+	struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
+	char *base;
+	size_t rpclen, padlen;
+	ssize_t hdrlen;
+	struct rpcrdma_msg *headerp;
+
+	/*
+	 * rpclen gets amount of data in first buffer, which is the
+	 * pre-registered buffer.
+	 */
+	base = rqst->rq_svec[0].iov_base;
+	rpclen = rqst->rq_svec[0].iov_len;
+
+	/* build RDMA header in private area at front */
+	headerp = (struct rpcrdma_msg *) req->rl_base;
+	/* don't htonl XID, it's already done in request */
+	headerp->rm_xid = rqst->rq_xid;
+	headerp->rm_vers = xdr_one;
+	headerp->rm_credit = htonl(r_xprt->rx_buf.rb_max_requests);
+	headerp->rm_type = htonl(RDMA_MSG);
+
+	/*
+	 * Chunks needed for results?
+	 *
+	 * o If the expected result is under the inline threshold, all ops
+	 *   return as inline (but see later).
+	 * o Large non-read ops return as a single reply chunk.
+	 * o Large read ops return data as write chunk(s), header as inline.
+	 *
+	 * Note: the NFS code sending down multiple result segments implies
+	 * the op is one of read, readdir[plus], readlink or NFSv4 getacl.
+	 */
+
+	/*
+	 * This code can handle read chunks, write chunks OR reply
+	 * chunks -- only one type. If the request is too big to fit
+	 * inline, then we will choose read chunks. If the request is
+	 * a READ, then use write chunks to separate the file data
+	 * into pages; otherwise use reply chunks.
+	 */
+	if (rqst->rq_rcv_buf.buflen <= RPCRDMA_INLINE_READ_THRESHOLD(rqst))
+		req->rl_wtype = rpcrdma_noch;
+	else if (rqst->rq_rcv_buf.page_len == 0)
+		req->rl_wtype = rpcrdma_replych;
+	else if (rqst->rq_rcv_buf.flags & XDRBUF_READ)
+		req->rl_wtype = rpcrdma_writech;
+	else
+		req->rl_wtype = rpcrdma_replych;
+
+	/*
+	 * Chunks needed for arguments?
+	 *
+	 * o If the total request is under the inline threshold, all ops
+	 *   are sent as inline.
+	 * o Large non-write ops are sent with the entire message as a
+	 *   single read chunk (protocol 0-position special case).
+	 * o Large write ops transmit data as read chunk(s), header as
+	 *   inline.
+	 *
+	 * Note: the NFS code sending down multiple argument segments
+	 * implies the op is a write.
+	 * TBD check NFSv4 setacl
+	 */
+	if (rqst->rq_snd_buf.len <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst))
+		req->rl_rtype = rpcrdma_noch;
+	else if (rqst->rq_snd_buf.page_len == 0)
+		req->rl_rtype = rpcrdma_areadch;
+	else
+		req->rl_rtype = rpcrdma_readch;
+
+	/* The following simplification is not true forever */
+	if (req->rl_rtype != rpcrdma_noch && req->rl_wtype == rpcrdma_replych)
+		req->rl_wtype = rpcrdma_noch;
+	if (req->rl_rtype != rpcrdma_noch && req->rl_wtype != rpcrdma_noch) {
+		dprintk("RPC:       %s: cannot marshal multiple chunk lists\n",
+			__func__);
+		return -EIO;
+	}
+
+	hdrlen = 28; /*sizeof *headerp;*/
+	padlen = 0;
+
+	/*
+	 * Pull up any extra send data into the preregistered buffer.
+	 * When padding is in use and applies to the transfer, insert
+	 * it and change the message type.
+	 */
+	if (req->rl_rtype == rpcrdma_noch) {
+
+		padlen = rpcrdma_inline_pullup(rqst,
+						RPCRDMA_INLINE_PAD_VALUE(rqst));
+
+		if (padlen) {
+			headerp->rm_type = htonl(RDMA_MSGP);
+			headerp->rm_body.rm_padded.rm_align =
+				htonl(RPCRDMA_INLINE_PAD_VALUE(rqst));
+			headerp->rm_body.rm_padded.rm_thresh =
+				htonl(RPCRDMA_INLINE_PAD_THRESH);
+			headerp->rm_body.rm_padded.rm_pempty[0] = xdr_zero;
+			headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero;
+			headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero;
+			hdrlen += 2 * sizeof(u32); /* extra words in padhdr */
+			if (req->rl_wtype != rpcrdma_noch) {
+				dprintk("RPC:       %s: invalid chunk list\n",
+					__func__);
+				return -EIO;
+			}
+		} else {
+			headerp->rm_body.rm_nochunks.rm_empty[0] = xdr_zero;
+			headerp->rm_body.rm_nochunks.rm_empty[1] = xdr_zero;
+			headerp->rm_body.rm_nochunks.rm_empty[2] = xdr_zero;
+			/* new length after pullup */
+			rpclen = rqst->rq_svec[0].iov_len;
+			/*
+			 * Currently we try to not actually use read inline.
+			 * Reply chunks have the desirable property that
+			 * they land, packed, directly in the target buffers
+			 * without headers, so they require no fixup. The
+			 * additional RDMA Write op sends the same amount
+			 * of data, streams on-the-wire and adds no overhead
+			 * on receive. Therefore, we request a reply chunk
+			 * for non-writes wherever feasible and efficient.
+			 */
+			if (req->rl_wtype == rpcrdma_noch)
+				req->rl_wtype = rpcrdma_replych;
+		}
+	}
+
+	hdrlen = rpcrdma_marshal_chunks(rqst, hdrlen);
+	if (hdrlen < 0)
+		return hdrlen;
+
+	dprintk("RPC:       %s: %s: hdrlen %zd rpclen %zd padlen %zd"
+		" headerp 0x%p base 0x%p lkey 0x%x\n",
+		__func__, transfertypes[req->rl_wtype], hdrlen, rpclen, padlen,
+		headerp, base, req->rl_iov.lkey);
+
+	/*
+	 * initialize send_iov's - normally only two: rdma chunk header and
+	 * single preregistered RPC header buffer, but if padding is present,
+	 * then use a preregistered (and zeroed) pad buffer between the RPC
+	 * header and any write data. In all non-rdma cases, any following
+	 * data has been copied into the RPC header buffer.
+	 */
+	req->rl_send_iov[0].addr = req->rl_iov.addr;
+	req->rl_send_iov[0].length = hdrlen;
+	req->rl_send_iov[0].lkey = req->rl_iov.lkey;
+
+	req->rl_send_iov[1].addr = req->rl_iov.addr + (base - req->rl_base);
+	req->rl_send_iov[1].length = rpclen;
+	req->rl_send_iov[1].lkey = req->rl_iov.lkey;
+
+	req->rl_niovs = 2;
+
+	if (padlen) {
+		struct rpcrdma_ep *ep = &r_xprt->rx_ep;
+
+		req->rl_send_iov[2].addr = ep->rep_pad.addr;
+		req->rl_send_iov[2].length = padlen;
+		req->rl_send_iov[2].lkey = ep->rep_pad.lkey;
+
+		req->rl_send_iov[3].addr = req->rl_send_iov[1].addr + rpclen;
+		req->rl_send_iov[3].length = rqst->rq_slen - rpclen;
+		req->rl_send_iov[3].lkey = req->rl_iov.lkey;
+
+		req->rl_niovs = 4;
+	}
+
+	return 0;
+}
+
+/*
+ * Chase down a received write or reply chunklist to get length
+ * RDMA'd by server. See map at rpcrdma_create_chunks()! :-)
+ */
+static int
+rpcrdma_count_chunks(struct rpcrdma_rep *rep, unsigned int max, int wrchunk, __be32 **iptrp)
+{
+	unsigned int i, total_len;
+	struct rpcrdma_write_chunk *cur_wchunk;
+
+	i = ntohl(**iptrp);	/* get array count */
+	if (i > max)
+		return -1;
+	cur_wchunk = (struct rpcrdma_write_chunk *) (*iptrp + 1);
+	total_len = 0;
+	while (i--) {
+		struct rpcrdma_segment *seg = &cur_wchunk->wc_target;
+		ifdebug(FACILITY) {
+			u64 off;
+			xdr_decode_hyper((__be32 *)&seg->rs_offset, &off);
+			dprintk("RPC:       %s: chunk %d@0x%llx:0x%x\n",
+				__func__,
+				ntohl(seg->rs_length),
+				(unsigned long long)off,
+				ntohl(seg->rs_handle));
+		}
+		total_len += ntohl(seg->rs_length);
+		++cur_wchunk;
+	}
+	/* check and adjust for properly terminated write chunk */
+	if (wrchunk) {
+		__be32 *w = (__be32 *) cur_wchunk;
+		if (*w++ != xdr_zero)
+			return -1;
+		cur_wchunk = (struct rpcrdma_write_chunk *) w;
+	}
+	if ((char *) cur_wchunk > rep->rr_base + rep->rr_len)
+		return -1;
+
+	*iptrp = (__be32 *) cur_wchunk;
+	return total_len;
+}
+
+/*
+ * Scatter inline received data back into provided iov's.
+ */
+static void
+rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
+{
+	int i, npages, curlen, olen;
+	char *destp;
+	struct page **ppages;
+	int page_base;
+
+	curlen = rqst->rq_rcv_buf.head[0].iov_len;
+	if (curlen > copy_len) {	/* write chunk header fixup */
+		curlen = copy_len;
+		rqst->rq_rcv_buf.head[0].iov_len = curlen;
+	}
+
+	dprintk("RPC:       %s: srcp 0x%p len %d hdrlen %d\n",
+		__func__, srcp, copy_len, curlen);
+
+	/* Shift pointer for first receive segment only */
+	rqst->rq_rcv_buf.head[0].iov_base = srcp;
+	srcp += curlen;
+	copy_len -= curlen;
+
+	olen = copy_len;
+	i = 0;
+	rpcx_to_rdmax(rqst->rq_xprt)->rx_stats.fixup_copy_count += olen;
+	page_base = rqst->rq_rcv_buf.page_base;
+	ppages = rqst->rq_rcv_buf.pages + (page_base >> PAGE_SHIFT);
+	page_base &= ~PAGE_MASK;
+
+	if (copy_len && rqst->rq_rcv_buf.page_len) {
+		npages = PAGE_ALIGN(page_base +
+			rqst->rq_rcv_buf.page_len) >> PAGE_SHIFT;
+		for (; i < npages; i++) {
+			curlen = PAGE_SIZE - page_base;
+			if (curlen > copy_len)
+				curlen = copy_len;
+			dprintk("RPC:       %s: page %d"
+				" srcp 0x%p len %d curlen %d\n",
+				__func__, i, srcp, copy_len, curlen);
+			destp = kmap_atomic(ppages[i]);
+			memcpy(destp + page_base, srcp, curlen);
+			flush_dcache_page(ppages[i]);
+			kunmap_atomic(destp);
+			srcp += curlen;
+			copy_len -= curlen;
+			if (copy_len == 0)
+				break;
+			page_base = 0;
+		}
+	}
+
+	if (copy_len && rqst->rq_rcv_buf.tail[0].iov_len) {
+		curlen = copy_len;
+		if (curlen > rqst->rq_rcv_buf.tail[0].iov_len)
+			curlen = rqst->rq_rcv_buf.tail[0].iov_len;
+		if (rqst->rq_rcv_buf.tail[0].iov_base != srcp)
+			memmove(rqst->rq_rcv_buf.tail[0].iov_base, srcp, curlen);
+		dprintk("RPC:       %s: tail srcp 0x%p len %d curlen %d\n",
+			__func__, srcp, copy_len, curlen);
+		rqst->rq_rcv_buf.tail[0].iov_len = curlen;
+		copy_len -= curlen; ++i;
+	} else
+		rqst->rq_rcv_buf.tail[0].iov_len = 0;
+
+	if (pad) {
+		/* implicit padding on terminal chunk */
+		unsigned char *p = rqst->rq_rcv_buf.tail[0].iov_base;
+		while (pad--)
+			p[rqst->rq_rcv_buf.tail[0].iov_len++] = 0;
+	}
+
+	if (copy_len)
+		dprintk("RPC:       %s: %d bytes in"
+			" %d extra segments (%d lost)\n",
+			__func__, olen, i, copy_len);
+
+	/* TBD avoid a warning from call_decode() */
+	rqst->rq_private_buf = rqst->rq_rcv_buf;
+}
+
+void
+rpcrdma_connect_worker(struct work_struct *work)
+{
+	struct rpcrdma_ep *ep =
+		container_of(work, struct rpcrdma_ep, rep_connect_worker.work);
+	struct rpc_xprt *xprt = ep->rep_xprt;
+
+	spin_lock_bh(&xprt->transport_lock);
+	if (++xprt->connect_cookie == 0)	/* maintain a reserved value */
+		++xprt->connect_cookie;
+	if (ep->rep_connected > 0) {
+		if (!xprt_test_and_set_connected(xprt))
+			xprt_wake_pending_tasks(xprt, 0);
+	} else {
+		if (xprt_test_and_clear_connected(xprt))
+			xprt_wake_pending_tasks(xprt, -ENOTCONN);
+	}
+	spin_unlock_bh(&xprt->transport_lock);
+}
+
+/*
+ * This function is called when an async event is posted to
+ * the connection which changes the connection state. All it
+ * does at this point is mark the connection up/down, the rpc
+ * timers do the rest.
+ */
+void
+rpcrdma_conn_func(struct rpcrdma_ep *ep)
+{
+	schedule_delayed_work(&ep->rep_connect_worker, 0);
+}
+
+/*
+ * Called as a tasklet to do req/reply match and complete a request
+ * Errors must result in the RPC task either being awakened, or
+ * allowed to timeout, to discover the errors at that time.
+ */
+void
+rpcrdma_reply_handler(struct rpcrdma_rep *rep)
+{
+	struct rpcrdma_msg *headerp;
+	struct rpcrdma_req *req;
+	struct rpc_rqst *rqst;
+	struct rpc_xprt *xprt = rep->rr_xprt;
+	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+	__be32 *iptr;
+	int rdmalen, status;
+	unsigned long cwnd;
+
+	/* Check status. If bad, signal disconnect and return rep to pool */
+	if (rep->rr_len == ~0U) {
+		rpcrdma_recv_buffer_put(rep);
+		if (r_xprt->rx_ep.rep_connected == 1) {
+			r_xprt->rx_ep.rep_connected = -EIO;
+			rpcrdma_conn_func(&r_xprt->rx_ep);
+		}
+		return;
+	}
+	if (rep->rr_len < 28) {
+		dprintk("RPC:       %s: short/invalid reply\n", __func__);
+		goto repost;
+	}
+	headerp = (struct rpcrdma_msg *) rep->rr_base;
+	if (headerp->rm_vers != xdr_one) {
+		dprintk("RPC:       %s: invalid version %d\n",
+			__func__, ntohl(headerp->rm_vers));
+		goto repost;
+	}
+
+	/* Get XID and try for a match. */
+	spin_lock(&xprt->transport_lock);
+	rqst = xprt_lookup_rqst(xprt, headerp->rm_xid);
+	if (rqst == NULL) {
+		spin_unlock(&xprt->transport_lock);
+		dprintk("RPC:       %s: reply 0x%p failed "
+			"to match any request xid 0x%08x len %d\n",
+			__func__, rep, headerp->rm_xid, rep->rr_len);
+repost:
+		r_xprt->rx_stats.bad_reply_count++;
+		rep->rr_func = rpcrdma_reply_handler;
+		if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, &r_xprt->rx_ep, rep))
+			rpcrdma_recv_buffer_put(rep);
+
+		return;
+	}
+
+	/* get request object */
+	req = rpcr_to_rdmar(rqst);
+	if (req->rl_reply) {
+		spin_unlock(&xprt->transport_lock);
+		dprintk("RPC:       %s: duplicate reply 0x%p to RPC "
+			"request 0x%p: xid 0x%08x\n", __func__, rep, req,
+			headerp->rm_xid);
+		goto repost;
+	}
+
+	dprintk("RPC:       %s: reply 0x%p completes request 0x%p\n"
+		"                   RPC request 0x%p xid 0x%08x\n",
+			__func__, rep, req, rqst, headerp->rm_xid);
+
+	/* from here on, the reply is no longer an orphan */
+	req->rl_reply = rep;
+	xprt->reestablish_timeout = 0;
+
+	/* check for expected message types */
+	/* The order of some of these tests is important. */
+	switch (headerp->rm_type) {
+	case htonl(RDMA_MSG):
+		/* never expect read chunks */
+		/* never expect reply chunks (two ways to check) */
+		/* never expect write chunks without having offered RDMA */
+		if (headerp->rm_body.rm_chunks[0] != xdr_zero ||
+		    (headerp->rm_body.rm_chunks[1] == xdr_zero &&
+		     headerp->rm_body.rm_chunks[2] != xdr_zero) ||
+		    (headerp->rm_body.rm_chunks[1] != xdr_zero &&
+		     req->rl_nchunks == 0))
+			goto badheader;
+		if (headerp->rm_body.rm_chunks[1] != xdr_zero) {
+			/* count any expected write chunks in read reply */
+			/* start at write chunk array count */
+			iptr = &headerp->rm_body.rm_chunks[2];
+			rdmalen = rpcrdma_count_chunks(rep,
+						req->rl_nchunks, 1, &iptr);
+			/* check for validity, and no reply chunk after */
+			if (rdmalen < 0 || *iptr++ != xdr_zero)
+				goto badheader;
+			rep->rr_len -=
+			    ((unsigned char *)iptr - (unsigned char *)headerp);
+			status = rep->rr_len + rdmalen;
+			r_xprt->rx_stats.total_rdma_reply += rdmalen;
+			/* special case - last chunk may omit padding */
+			if (rdmalen &= 3) {
+				rdmalen = 4 - rdmalen;
+				status += rdmalen;
+			}
+		} else {
+			/* else ordinary inline */
+			rdmalen = 0;
+			iptr = (__be32 *)((unsigned char *)headerp + 28);
+			rep->rr_len -= 28; /*sizeof *headerp;*/
+			status = rep->rr_len;
+		}
+		/* Fix up the rpc results for upper layer */
+		rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len, rdmalen);
+		break;
+
+	case htonl(RDMA_NOMSG):
+		/* never expect read or write chunks, always reply chunks */
+		if (headerp->rm_body.rm_chunks[0] != xdr_zero ||
+		    headerp->rm_body.rm_chunks[1] != xdr_zero ||
+		    headerp->rm_body.rm_chunks[2] != xdr_one ||
+		    req->rl_nchunks == 0)
+			goto badheader;
+		iptr = (__be32 *)((unsigned char *)headerp + 28);
+		rdmalen = rpcrdma_count_chunks(rep, req->rl_nchunks, 0, &iptr);
+		if (rdmalen < 0)
+			goto badheader;
+		r_xprt->rx_stats.total_rdma_reply += rdmalen;
+		/* Reply chunk buffer already is the reply vector - no fixup. */
+		status = rdmalen;
+		break;
+
+badheader:
+	default:
+		dprintk("%s: invalid rpcrdma reply header (type %d):"
+				" chunks[012] == %d %d %d"
+				" expected chunks <= %d\n",
+				__func__, ntohl(headerp->rm_type),
+				headerp->rm_body.rm_chunks[0],
+				headerp->rm_body.rm_chunks[1],
+				headerp->rm_body.rm_chunks[2],
+				req->rl_nchunks);
+		status = -EIO;
+		r_xprt->rx_stats.bad_reply_count++;
+		break;
+	}
+
+	cwnd = xprt->cwnd;
+	xprt->cwnd = atomic_read(&r_xprt->rx_buf.rb_credits) << RPC_CWNDSHIFT;
+	if (xprt->cwnd > cwnd)
+		xprt_release_rqst_cong(rqst->rq_task);
+
+	dprintk("RPC:       %s: xprt_complete_rqst(0x%p, 0x%p, %d)\n",
+			__func__, xprt, rqst, status);
+	xprt_complete_rqst(rqst->rq_task, status);
+	spin_unlock(&xprt->transport_lock);
+}
diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c
new file mode 100644
index 0000000..c1b6270
--- /dev/null
+++ b/net/sunrpc/xprtrdma/svc_rdma.c
@@ -0,0 +1,302 @@
+/*
+ * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the BSD-type
+ * license below:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *      Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *
+ *      Redistributions in binary form must reproduce the above
+ *      copyright notice, this list of conditions and the following
+ *      disclaimer in the documentation and/or other materials provided
+ *      with the distribution.
+ *
+ *      Neither the name of the Network Appliance, Inc. nor the names of
+ *      its contributors may be used to endorse or promote products
+ *      derived from this software without specific prior written
+ *      permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Tom Tucker <tom@opengridcomputing.com>
+ */
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/sysctl.h>
+#include <linux/workqueue.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/sched.h>
+#include <linux/sunrpc/svc_rdma.h>
+#include "xprt_rdma.h"
+
+#define RPCDBG_FACILITY	RPCDBG_SVCXPRT
+
+/* RPC/RDMA parameters */
+unsigned int svcrdma_ord = RPCRDMA_ORD;
+static unsigned int min_ord = 1;
+static unsigned int max_ord = 4096;
+unsigned int svcrdma_max_requests = RPCRDMA_MAX_REQUESTS;
+static unsigned int min_max_requests = 4;
+static unsigned int max_max_requests = 16384;
+unsigned int svcrdma_max_req_size = RPCRDMA_MAX_REQ_SIZE;
+static unsigned int min_max_inline = 4096;
+static unsigned int max_max_inline = 65536;
+
+atomic_t rdma_stat_recv;
+atomic_t rdma_stat_read;
+atomic_t rdma_stat_write;
+atomic_t rdma_stat_sq_starve;
+atomic_t rdma_stat_rq_starve;
+atomic_t rdma_stat_rq_poll;
+atomic_t rdma_stat_rq_prod;
+atomic_t rdma_stat_sq_poll;
+atomic_t rdma_stat_sq_prod;
+
+/* Temporary NFS request map and context caches */
+struct kmem_cache *svc_rdma_map_cachep;
+struct kmem_cache *svc_rdma_ctxt_cachep;
+
+struct workqueue_struct *svc_rdma_wq;
+
+/*
+ * This function implements reading and resetting an atomic_t stat
+ * variable through read/write to a proc file. Any write to the file
+ * resets the associated statistic to zero. Any read returns it's
+ * current value.
+ */
+static int read_reset_stat(struct ctl_table *table, int write,
+			   void __user *buffer, size_t *lenp,
+			   loff_t *ppos)
+{
+	atomic_t *stat = (atomic_t *)table->data;
+
+	if (!stat)
+		return -EINVAL;
+
+	if (write)
+		atomic_set(stat, 0);
+	else {
+		char str_buf[32];
+		char *data;
+		int len = snprintf(str_buf, 32, "%d\n", atomic_read(stat));
+		if (len >= 32)
+			return -EFAULT;
+		len = strlen(str_buf);
+		if (*ppos > len) {
+			*lenp = 0;
+			return 0;
+		}
+		data = &str_buf[*ppos];
+		len -= *ppos;
+		if (len > *lenp)
+			len = *lenp;
+		if (len && copy_to_user(buffer, str_buf, len))
+			return -EFAULT;
+		*lenp = len;
+		*ppos += len;
+	}
+	return 0;
+}
+
+static struct ctl_table_header *svcrdma_table_header;
+static struct ctl_table svcrdma_parm_table[] = {
+	{
+		.procname	= "max_requests",
+		.data		= &svcrdma_max_requests,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &min_max_requests,
+		.extra2		= &max_max_requests
+	},
+	{
+		.procname	= "max_req_size",
+		.data		= &svcrdma_max_req_size,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &min_max_inline,
+		.extra2		= &max_max_inline
+	},
+	{
+		.procname	= "max_outbound_read_requests",
+		.data		= &svcrdma_ord,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &min_ord,
+		.extra2		= &max_ord,
+	},
+
+	{
+		.procname	= "rdma_stat_read",
+		.data		= &rdma_stat_read,
+		.maxlen		= sizeof(atomic_t),
+		.mode		= 0644,
+		.proc_handler	= read_reset_stat,
+	},
+	{
+		.procname	= "rdma_stat_recv",
+		.data		= &rdma_stat_recv,
+		.maxlen		= sizeof(atomic_t),
+		.mode		= 0644,
+		.proc_handler	= read_reset_stat,
+	},
+	{
+		.procname	= "rdma_stat_write",
+		.data		= &rdma_stat_write,
+		.maxlen		= sizeof(atomic_t),
+		.mode		= 0644,
+		.proc_handler	= read_reset_stat,
+	},
+	{
+		.procname	= "rdma_stat_sq_starve",
+		.data		= &rdma_stat_sq_starve,
+		.maxlen		= sizeof(atomic_t),
+		.mode		= 0644,
+		.proc_handler	= read_reset_stat,
+	},
+	{
+		.procname	= "rdma_stat_rq_starve",
+		.data		= &rdma_stat_rq_starve,
+		.maxlen		= sizeof(atomic_t),
+		.mode		= 0644,
+		.proc_handler	= read_reset_stat,
+	},
+	{
+		.procname	= "rdma_stat_rq_poll",
+		.data		= &rdma_stat_rq_poll,
+		.maxlen		= sizeof(atomic_t),
+		.mode		= 0644,
+		.proc_handler	= read_reset_stat,
+	},
+	{
+		.procname	= "rdma_stat_rq_prod",
+		.data		= &rdma_stat_rq_prod,
+		.maxlen		= sizeof(atomic_t),
+		.mode		= 0644,
+		.proc_handler	= read_reset_stat,
+	},
+	{
+		.procname	= "rdma_stat_sq_poll",
+		.data		= &rdma_stat_sq_poll,
+		.maxlen		= sizeof(atomic_t),
+		.mode		= 0644,
+		.proc_handler	= read_reset_stat,
+	},
+	{
+		.procname	= "rdma_stat_sq_prod",
+		.data		= &rdma_stat_sq_prod,
+		.maxlen		= sizeof(atomic_t),
+		.mode		= 0644,
+		.proc_handler	= read_reset_stat,
+	},
+	{ },
+};
+
+static struct ctl_table svcrdma_table[] = {
+	{
+		.procname	= "svc_rdma",
+		.mode		= 0555,
+		.child		= svcrdma_parm_table
+	},
+	{ },
+};
+
+static struct ctl_table svcrdma_root_table[] = {
+	{
+		.procname	= "sunrpc",
+		.mode		= 0555,
+		.child		= svcrdma_table
+	},
+	{ },
+};
+
+void svc_rdma_cleanup(void)
+{
+	dprintk("SVCRDMA Module Removed, deregister RPC RDMA transport\n");
+	destroy_workqueue(svc_rdma_wq);
+	if (svcrdma_table_header) {
+		unregister_sysctl_table(svcrdma_table_header);
+		svcrdma_table_header = NULL;
+	}
+	svc_unreg_xprt_class(&svc_rdma_class);
+	kmem_cache_destroy(svc_rdma_map_cachep);
+	kmem_cache_destroy(svc_rdma_ctxt_cachep);
+}
+
+int svc_rdma_init(void)
+{
+	dprintk("SVCRDMA Module Init, register RPC RDMA transport\n");
+	dprintk("\tsvcrdma_ord      : %d\n", svcrdma_ord);
+	dprintk("\tmax_requests     : %d\n", svcrdma_max_requests);
+	dprintk("\tsq_depth         : %d\n",
+		svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT);
+	dprintk("\tmax_inline       : %d\n", svcrdma_max_req_size);
+
+	svc_rdma_wq = alloc_workqueue("svc_rdma", 0, 0);
+	if (!svc_rdma_wq)
+		return -ENOMEM;
+
+	if (!svcrdma_table_header)
+		svcrdma_table_header =
+			register_sysctl_table(svcrdma_root_table);
+
+	/* Create the temporary map cache */
+	svc_rdma_map_cachep = kmem_cache_create("svc_rdma_map_cache",
+						sizeof(struct svc_rdma_req_map),
+						0,
+						SLAB_HWCACHE_ALIGN,
+						NULL);
+	if (!svc_rdma_map_cachep) {
+		printk(KERN_INFO "Could not allocate map cache.\n");
+		goto err0;
+	}
+
+	/* Create the temporary context cache */
+	svc_rdma_ctxt_cachep =
+		kmem_cache_create("svc_rdma_ctxt_cache",
+				  sizeof(struct svc_rdma_op_ctxt),
+				  0,
+				  SLAB_HWCACHE_ALIGN,
+				  NULL);
+	if (!svc_rdma_ctxt_cachep) {
+		printk(KERN_INFO "Could not allocate WR ctxt cache.\n");
+		goto err1;
+	}
+
+	/* Register RDMA with the SVC transport switch */
+	svc_reg_xprt_class(&svc_rdma_class);
+	return 0;
+ err1:
+	kmem_cache_destroy(svc_rdma_map_cachep);
+ err0:
+	unregister_sysctl_table(svcrdma_table_header);
+	destroy_workqueue(svc_rdma_wq);
+	return -ENOMEM;
+}
+MODULE_AUTHOR("Tom Tucker <tom@opengridcomputing.com>");
+MODULE_DESCRIPTION("SVC RDMA Transport");
+MODULE_LICENSE("Dual BSD/GPL");
+module_init(svc_rdma_init);
+module_exit(svc_rdma_cleanup);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_marshal.c b/net/sunrpc/xprtrdma/svc_rdma_marshal.c
new file mode 100644
index 0000000..65b1462
--- /dev/null
+++ b/net/sunrpc/xprtrdma/svc_rdma_marshal.c
@@ -0,0 +1,386 @@
+/*
+ * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the BSD-type
+ * license below:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *      Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *
+ *      Redistributions in binary form must reproduce the above
+ *      copyright notice, this list of conditions and the following
+ *      disclaimer in the documentation and/or other materials provided
+ *      with the distribution.
+ *
+ *      Neither the name of the Network Appliance, Inc. nor the names of
+ *      its contributors may be used to endorse or promote products
+ *      derived from this software without specific prior written
+ *      permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Tom Tucker <tom@opengridcomputing.com>
+ */
+
+#include <linux/sunrpc/xdr.h>
+#include <linux/sunrpc/debug.h>
+#include <asm/unaligned.h>
+#include <linux/sunrpc/rpc_rdma.h>
+#include <linux/sunrpc/svc_rdma.h>
+
+#define RPCDBG_FACILITY	RPCDBG_SVCXPRT
+
+/*
+ * Decodes a read chunk list. The expected format is as follows:
+ *    descrim  : xdr_one
+ *    position : u32 offset into XDR stream
+ *    handle   : u32 RKEY
+ *    . . .
+ *  end-of-list: xdr_zero
+ */
+static u32 *decode_read_list(u32 *va, u32 *vaend)
+{
+	struct rpcrdma_read_chunk *ch = (struct rpcrdma_read_chunk *)va;
+
+	while (ch->rc_discrim != xdr_zero) {
+		if (((unsigned long)ch + sizeof(struct rpcrdma_read_chunk)) >
+		    (unsigned long)vaend) {
+			dprintk("svcrdma: vaend=%p, ch=%p\n", vaend, ch);
+			return NULL;
+		}
+		ch++;
+	}
+	return (u32 *)&ch->rc_position;
+}
+
+/*
+ * Determine number of chunks and total bytes in chunk list. The chunk
+ * list has already been verified to fit within the RPCRDMA header.
+ */
+void svc_rdma_rcl_chunk_counts(struct rpcrdma_read_chunk *ch,
+			       int *ch_count, int *byte_count)
+{
+	/* compute the number of bytes represented by read chunks */
+	*byte_count = 0;
+	*ch_count = 0;
+	for (; ch->rc_discrim != 0; ch++) {
+		*byte_count = *byte_count + ntohl(ch->rc_target.rs_length);
+		*ch_count = *ch_count + 1;
+	}
+}
+
+/*
+ * Decodes a write chunk list. The expected format is as follows:
+ *    descrim  : xdr_one
+ *    nchunks  : <count>
+ *       handle   : u32 RKEY              ---+
+ *       length   : u32 <len of segment>     |
+ *       offset   : remove va                + <count>
+ *       . . .                               |
+ *                                        ---+
+ */
+static u32 *decode_write_list(u32 *va, u32 *vaend)
+{
+	unsigned long start, end;
+	int nchunks;
+
+	struct rpcrdma_write_array *ary =
+		(struct rpcrdma_write_array *)va;
+
+	/* Check for not write-array */
+	if (ary->wc_discrim == xdr_zero)
+		return (u32 *)&ary->wc_nchunks;
+
+	if ((unsigned long)ary + sizeof(struct rpcrdma_write_array) >
+	    (unsigned long)vaend) {
+		dprintk("svcrdma: ary=%p, vaend=%p\n", ary, vaend);
+		return NULL;
+	}
+	nchunks = ntohl(ary->wc_nchunks);
+
+	start = (unsigned long)&ary->wc_array[0];
+	end = (unsigned long)vaend;
+	if (nchunks < 0 ||
+	    nchunks > (SIZE_MAX - start) / sizeof(struct rpcrdma_write_chunk) ||
+	    (start + (sizeof(struct rpcrdma_write_chunk) * nchunks)) > end) {
+		dprintk("svcrdma: ary=%p, wc_nchunks=%d, vaend=%p\n",
+			ary, nchunks, vaend);
+		return NULL;
+	}
+	/*
+	 * rs_length is the 2nd 4B field in wc_target and taking its
+	 * address skips the list terminator
+	 */
+	return (u32 *)&ary->wc_array[nchunks].wc_target.rs_length;
+}
+
+static u32 *decode_reply_array(u32 *va, u32 *vaend)
+{
+	unsigned long start, end;
+	int nchunks;
+	struct rpcrdma_write_array *ary =
+		(struct rpcrdma_write_array *)va;
+
+	/* Check for no reply-array */
+	if (ary->wc_discrim == xdr_zero)
+		return (u32 *)&ary->wc_nchunks;
+
+	if ((unsigned long)ary + sizeof(struct rpcrdma_write_array) >
+	    (unsigned long)vaend) {
+		dprintk("svcrdma: ary=%p, vaend=%p\n", ary, vaend);
+		return NULL;
+	}
+	nchunks = ntohl(ary->wc_nchunks);
+
+	start = (unsigned long)&ary->wc_array[0];
+	end = (unsigned long)vaend;
+	if (nchunks < 0 ||
+	    nchunks > (SIZE_MAX - start) / sizeof(struct rpcrdma_write_chunk) ||
+	    (start + (sizeof(struct rpcrdma_write_chunk) * nchunks)) > end) {
+		dprintk("svcrdma: ary=%p, wc_nchunks=%d, vaend=%p\n",
+			ary, nchunks, vaend);
+		return NULL;
+	}
+	return (u32 *)&ary->wc_array[nchunks];
+}
+
+int svc_rdma_xdr_decode_req(struct rpcrdma_msg **rdma_req,
+			    struct svc_rqst *rqstp)
+{
+	struct rpcrdma_msg *rmsgp = NULL;
+	u32 *va;
+	u32 *vaend;
+	u32 hdr_len;
+
+	rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base;
+
+	/* Verify that there's enough bytes for header + something */
+	if (rqstp->rq_arg.len <= RPCRDMA_HDRLEN_MIN) {
+		dprintk("svcrdma: header too short = %d\n",
+			rqstp->rq_arg.len);
+		return -EINVAL;
+	}
+
+	/* Decode the header */
+	rmsgp->rm_xid = ntohl(rmsgp->rm_xid);
+	rmsgp->rm_vers = ntohl(rmsgp->rm_vers);
+	rmsgp->rm_credit = ntohl(rmsgp->rm_credit);
+	rmsgp->rm_type = ntohl(rmsgp->rm_type);
+
+	if (rmsgp->rm_vers != RPCRDMA_VERSION)
+		return -ENOSYS;
+
+	/* Pull in the extra for the padded case and bump our pointer */
+	if (rmsgp->rm_type == RDMA_MSGP) {
+		int hdrlen;
+		rmsgp->rm_body.rm_padded.rm_align =
+			ntohl(rmsgp->rm_body.rm_padded.rm_align);
+		rmsgp->rm_body.rm_padded.rm_thresh =
+			ntohl(rmsgp->rm_body.rm_padded.rm_thresh);
+
+		va = &rmsgp->rm_body.rm_padded.rm_pempty[4];
+		rqstp->rq_arg.head[0].iov_base = va;
+		hdrlen = (u32)((unsigned long)va - (unsigned long)rmsgp);
+		rqstp->rq_arg.head[0].iov_len -= hdrlen;
+		if (hdrlen > rqstp->rq_arg.len)
+			return -EINVAL;
+		return hdrlen;
+	}
+
+	/* The chunk list may contain either a read chunk list or a write
+	 * chunk list and a reply chunk list.
+	 */
+	va = &rmsgp->rm_body.rm_chunks[0];
+	vaend = (u32 *)((unsigned long)rmsgp + rqstp->rq_arg.len);
+	va = decode_read_list(va, vaend);
+	if (!va)
+		return -EINVAL;
+	va = decode_write_list(va, vaend);
+	if (!va)
+		return -EINVAL;
+	va = decode_reply_array(va, vaend);
+	if (!va)
+		return -EINVAL;
+
+	rqstp->rq_arg.head[0].iov_base = va;
+	hdr_len = (unsigned long)va - (unsigned long)rmsgp;
+	rqstp->rq_arg.head[0].iov_len -= hdr_len;
+
+	*rdma_req = rmsgp;
+	return hdr_len;
+}
+
+int svc_rdma_xdr_decode_deferred_req(struct svc_rqst *rqstp)
+{
+	struct rpcrdma_msg *rmsgp = NULL;
+	struct rpcrdma_read_chunk *ch;
+	struct rpcrdma_write_array *ary;
+	u32 *va;
+	u32 hdrlen;
+
+	dprintk("svcrdma: processing deferred RDMA header on rqstp=%p\n",
+		rqstp);
+	rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base;
+
+	/* Pull in the extra for the padded case and bump our pointer */
+	if (rmsgp->rm_type == RDMA_MSGP) {
+		va = &rmsgp->rm_body.rm_padded.rm_pempty[4];
+		rqstp->rq_arg.head[0].iov_base = va;
+		hdrlen = (u32)((unsigned long)va - (unsigned long)rmsgp);
+		rqstp->rq_arg.head[0].iov_len -= hdrlen;
+		return hdrlen;
+	}
+
+	/*
+	 * Skip all chunks to find RPC msg. These were previously processed
+	 */
+	va = &rmsgp->rm_body.rm_chunks[0];
+
+	/* Skip read-list */
+	for (ch = (struct rpcrdma_read_chunk *)va;
+	     ch->rc_discrim != xdr_zero; ch++);
+	va = (u32 *)&ch->rc_position;
+
+	/* Skip write-list */
+	ary = (struct rpcrdma_write_array *)va;
+	if (ary->wc_discrim == xdr_zero)
+		va = (u32 *)&ary->wc_nchunks;
+	else
+		/*
+		 * rs_length is the 2nd 4B field in wc_target and taking its
+		 * address skips the list terminator
+		 */
+		va = (u32 *)&ary->wc_array[ary->wc_nchunks].wc_target.rs_length;
+
+	/* Skip reply-array */
+	ary = (struct rpcrdma_write_array *)va;
+	if (ary->wc_discrim == xdr_zero)
+		va = (u32 *)&ary->wc_nchunks;
+	else
+		va = (u32 *)&ary->wc_array[ary->wc_nchunks];
+
+	rqstp->rq_arg.head[0].iov_base = va;
+	hdrlen = (unsigned long)va - (unsigned long)rmsgp;
+	rqstp->rq_arg.head[0].iov_len -= hdrlen;
+
+	return hdrlen;
+}
+
+int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt,
+			      struct rpcrdma_msg *rmsgp,
+			      enum rpcrdma_errcode err, u32 *va)
+{
+	u32 *startp = va;
+
+	*va++ = htonl(rmsgp->rm_xid);
+	*va++ = htonl(rmsgp->rm_vers);
+	*va++ = htonl(xprt->sc_max_requests);
+	*va++ = htonl(RDMA_ERROR);
+	*va++ = htonl(err);
+	if (err == ERR_VERS) {
+		*va++ = htonl(RPCRDMA_VERSION);
+		*va++ = htonl(RPCRDMA_VERSION);
+	}
+
+	return (int)((unsigned long)va - (unsigned long)startp);
+}
+
+int svc_rdma_xdr_get_reply_hdr_len(struct rpcrdma_msg *rmsgp)
+{
+	struct rpcrdma_write_array *wr_ary;
+
+	/* There is no read-list in a reply */
+
+	/* skip write list */
+	wr_ary = (struct rpcrdma_write_array *)
+		&rmsgp->rm_body.rm_chunks[1];
+	if (wr_ary->wc_discrim)
+		wr_ary = (struct rpcrdma_write_array *)
+			&wr_ary->wc_array[ntohl(wr_ary->wc_nchunks)].
+			wc_target.rs_length;
+	else
+		wr_ary = (struct rpcrdma_write_array *)
+			&wr_ary->wc_nchunks;
+
+	/* skip reply array */
+	if (wr_ary->wc_discrim)
+		wr_ary = (struct rpcrdma_write_array *)
+			&wr_ary->wc_array[ntohl(wr_ary->wc_nchunks)];
+	else
+		wr_ary = (struct rpcrdma_write_array *)
+			&wr_ary->wc_nchunks;
+
+	return (unsigned long) wr_ary - (unsigned long) rmsgp;
+}
+
+void svc_rdma_xdr_encode_write_list(struct rpcrdma_msg *rmsgp, int chunks)
+{
+	struct rpcrdma_write_array *ary;
+
+	/* no read-list */
+	rmsgp->rm_body.rm_chunks[0] = xdr_zero;
+
+	/* write-array discrim */
+	ary = (struct rpcrdma_write_array *)
+		&rmsgp->rm_body.rm_chunks[1];
+	ary->wc_discrim = xdr_one;
+	ary->wc_nchunks = htonl(chunks);
+
+	/* write-list terminator */
+	ary->wc_array[chunks].wc_target.rs_handle = xdr_zero;
+
+	/* reply-array discriminator */
+	ary->wc_array[chunks].wc_target.rs_length = xdr_zero;
+}
+
+void svc_rdma_xdr_encode_reply_array(struct rpcrdma_write_array *ary,
+				 int chunks)
+{
+	ary->wc_discrim = xdr_one;
+	ary->wc_nchunks = htonl(chunks);
+}
+
+void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *ary,
+				     int chunk_no,
+				     __be32 rs_handle,
+				     __be64 rs_offset,
+				     u32 write_len)
+{
+	struct rpcrdma_segment *seg = &ary->wc_array[chunk_no].wc_target;
+	seg->rs_handle = rs_handle;
+	seg->rs_offset = rs_offset;
+	seg->rs_length = htonl(write_len);
+}
+
+void svc_rdma_xdr_encode_reply_header(struct svcxprt_rdma *xprt,
+				  struct rpcrdma_msg *rdma_argp,
+				  struct rpcrdma_msg *rdma_resp,
+				  enum rpcrdma_proc rdma_type)
+{
+	rdma_resp->rm_xid = htonl(rdma_argp->rm_xid);
+	rdma_resp->rm_vers = htonl(rdma_argp->rm_vers);
+	rdma_resp->rm_credit = htonl(xprt->sc_max_requests);
+	rdma_resp->rm_type = htonl(rdma_type);
+
+	/* Encode <nul> chunks lists */
+	rdma_resp->rm_body.rm_chunks[0] = xdr_zero;
+	rdma_resp->rm_body.rm_chunks[1] = xdr_zero;
+	rdma_resp->rm_body.rm_chunks[2] = xdr_zero;
+}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
new file mode 100644
index 0000000..e011027
--- /dev/null
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -0,0 +1,614 @@
+/*
+ * Copyright (c) 2014 Open Grid Computing, Inc. All rights reserved.
+ * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the BSD-type
+ * license below:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *      Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *
+ *      Redistributions in binary form must reproduce the above
+ *      copyright notice, this list of conditions and the following
+ *      disclaimer in the documentation and/or other materials provided
+ *      with the distribution.
+ *
+ *      Neither the name of the Network Appliance, Inc. nor the names of
+ *      its contributors may be used to endorse or promote products
+ *      derived from this software without specific prior written
+ *      permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Tom Tucker <tom@opengridcomputing.com>
+ */
+
+#include <linux/sunrpc/debug.h>
+#include <linux/sunrpc/rpc_rdma.h>
+#include <linux/spinlock.h>
+#include <linux/highmem.h>
+#include <asm/unaligned.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/rdma_cm.h>
+#include <linux/sunrpc/svc_rdma.h>
+
+#define RPCDBG_FACILITY	RPCDBG_SVCXPRT
+
+/*
+ * Replace the pages in the rq_argpages array with the pages from the SGE in
+ * the RDMA_RECV completion. The SGL should contain full pages up until the
+ * last one.
+ */
+static void rdma_build_arg_xdr(struct svc_rqst *rqstp,
+			       struct svc_rdma_op_ctxt *ctxt,
+			       u32 byte_count)
+{
+	struct page *page;
+	u32 bc;
+	int sge_no;
+
+	/* Swap the page in the SGE with the page in argpages */
+	page = ctxt->pages[0];
+	put_page(rqstp->rq_pages[0]);
+	rqstp->rq_pages[0] = page;
+
+	/* Set up the XDR head */
+	rqstp->rq_arg.head[0].iov_base = page_address(page);
+	rqstp->rq_arg.head[0].iov_len =
+		min_t(size_t, byte_count, ctxt->sge[0].length);
+	rqstp->rq_arg.len = byte_count;
+	rqstp->rq_arg.buflen = byte_count;
+
+	/* Compute bytes past head in the SGL */
+	bc = byte_count - rqstp->rq_arg.head[0].iov_len;
+
+	/* If data remains, store it in the pagelist */
+	rqstp->rq_arg.page_len = bc;
+	rqstp->rq_arg.page_base = 0;
+	rqstp->rq_arg.pages = &rqstp->rq_pages[1];
+	sge_no = 1;
+	while (bc && sge_no < ctxt->count) {
+		page = ctxt->pages[sge_no];
+		put_page(rqstp->rq_pages[sge_no]);
+		rqstp->rq_pages[sge_no] = page;
+		bc -= min_t(u32, bc, ctxt->sge[sge_no].length);
+		rqstp->rq_arg.buflen += ctxt->sge[sge_no].length;
+		sge_no++;
+	}
+	rqstp->rq_respages = &rqstp->rq_pages[sge_no];
+	rqstp->rq_next_page = rqstp->rq_respages + 1;
+
+	/* We should never run out of SGE because the limit is defined to
+	 * support the max allowed RPC data length
+	 */
+	BUG_ON(bc && (sge_no == ctxt->count));
+	BUG_ON((rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len)
+	       != byte_count);
+	BUG_ON(rqstp->rq_arg.len != byte_count);
+
+	/* If not all pages were used from the SGL, free the remaining ones */
+	bc = sge_no;
+	while (sge_no < ctxt->count) {
+		page = ctxt->pages[sge_no++];
+		put_page(page);
+	}
+	ctxt->count = bc;
+
+	/* Set up tail */
+	rqstp->rq_arg.tail[0].iov_base = NULL;
+	rqstp->rq_arg.tail[0].iov_len = 0;
+}
+
+static int rdma_read_max_sge(struct svcxprt_rdma *xprt, int sge_count)
+{
+	if (rdma_node_get_transport(xprt->sc_cm_id->device->node_type) ==
+	     RDMA_TRANSPORT_IWARP)
+		return 1;
+	else
+		return min_t(int, sge_count, xprt->sc_max_sge);
+}
+
+typedef int (*rdma_reader_fn)(struct svcxprt_rdma *xprt,
+			      struct svc_rqst *rqstp,
+			      struct svc_rdma_op_ctxt *head,
+			      int *page_no,
+			      u32 *page_offset,
+			      u32 rs_handle,
+			      u32 rs_length,
+			      u64 rs_offset,
+			      int last);
+
+/* Issue an RDMA_READ using the local lkey to map the data sink */
+static int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt,
+			       struct svc_rqst *rqstp,
+			       struct svc_rdma_op_ctxt *head,
+			       int *page_no,
+			       u32 *page_offset,
+			       u32 rs_handle,
+			       u32 rs_length,
+			       u64 rs_offset,
+			       int last)
+{
+	struct ib_send_wr read_wr;
+	int pages_needed = PAGE_ALIGN(*page_offset + rs_length) >> PAGE_SHIFT;
+	struct svc_rdma_op_ctxt *ctxt = svc_rdma_get_context(xprt);
+	int ret, read, pno;
+	u32 pg_off = *page_offset;
+	u32 pg_no = *page_no;
+
+	ctxt->direction = DMA_FROM_DEVICE;
+	ctxt->read_hdr = head;
+	pages_needed =
+		min_t(int, pages_needed, rdma_read_max_sge(xprt, pages_needed));
+	read = min_t(int, pages_needed << PAGE_SHIFT, rs_length);
+
+	for (pno = 0; pno < pages_needed; pno++) {
+		int len = min_t(int, rs_length, PAGE_SIZE - pg_off);
+
+		head->arg.pages[pg_no] = rqstp->rq_arg.pages[pg_no];
+		head->arg.page_len += len;
+		head->arg.len += len;
+		if (!pg_off)
+			head->count++;
+		rqstp->rq_respages = &rqstp->rq_arg.pages[pg_no+1];
+		rqstp->rq_next_page = rqstp->rq_respages + 1;
+		ctxt->sge[pno].addr =
+			ib_dma_map_page(xprt->sc_cm_id->device,
+					head->arg.pages[pg_no], pg_off,
+					PAGE_SIZE - pg_off,
+					DMA_FROM_DEVICE);
+		ret = ib_dma_mapping_error(xprt->sc_cm_id->device,
+					   ctxt->sge[pno].addr);
+		if (ret)
+			goto err;
+		atomic_inc(&xprt->sc_dma_used);
+
+		/* The lkey here is either a local dma lkey or a dma_mr lkey */
+		ctxt->sge[pno].lkey = xprt->sc_dma_lkey;
+		ctxt->sge[pno].length = len;
+		ctxt->count++;
+
+		/* adjust offset and wrap to next page if needed */
+		pg_off += len;
+		if (pg_off == PAGE_SIZE) {
+			pg_off = 0;
+			pg_no++;
+		}
+		rs_length -= len;
+	}
+
+	if (last && rs_length == 0)
+		set_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
+	else
+		clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
+
+	memset(&read_wr, 0, sizeof(read_wr));
+	read_wr.wr_id = (unsigned long)ctxt;
+	read_wr.opcode = IB_WR_RDMA_READ;
+	ctxt->wr_op = read_wr.opcode;
+	read_wr.send_flags = IB_SEND_SIGNALED;
+	read_wr.wr.rdma.rkey = rs_handle;
+	read_wr.wr.rdma.remote_addr = rs_offset;
+	read_wr.sg_list = ctxt->sge;
+	read_wr.num_sge = pages_needed;
+
+	ret = svc_rdma_send(xprt, &read_wr);
+	if (ret) {
+		pr_err("svcrdma: Error %d posting RDMA_READ\n", ret);
+		set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
+		goto err;
+	}
+
+	/* return current location in page array */
+	*page_no = pg_no;
+	*page_offset = pg_off;
+	ret = read;
+	atomic_inc(&rdma_stat_read);
+	return ret;
+ err:
+	svc_rdma_unmap_dma(ctxt);
+	svc_rdma_put_context(ctxt, 0);
+	return ret;
+}
+
+/* Issue an RDMA_READ using an FRMR to map the data sink */
+static int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt,
+				struct svc_rqst *rqstp,
+				struct svc_rdma_op_ctxt *head,
+				int *page_no,
+				u32 *page_offset,
+				u32 rs_handle,
+				u32 rs_length,
+				u64 rs_offset,
+				int last)
+{
+	struct ib_send_wr read_wr;
+	struct ib_send_wr inv_wr;
+	struct ib_send_wr fastreg_wr;
+	u8 key;
+	int pages_needed = PAGE_ALIGN(*page_offset + rs_length) >> PAGE_SHIFT;
+	struct svc_rdma_op_ctxt *ctxt = svc_rdma_get_context(xprt);
+	struct svc_rdma_fastreg_mr *frmr = svc_rdma_get_frmr(xprt);
+	int ret, read, pno;
+	u32 pg_off = *page_offset;
+	u32 pg_no = *page_no;
+
+	if (IS_ERR(frmr))
+		return -ENOMEM;
+
+	ctxt->direction = DMA_FROM_DEVICE;
+	ctxt->frmr = frmr;
+	pages_needed = min_t(int, pages_needed, xprt->sc_frmr_pg_list_len);
+	read = min_t(int, pages_needed << PAGE_SHIFT, rs_length);
+
+	frmr->kva = page_address(rqstp->rq_arg.pages[pg_no]);
+	frmr->direction = DMA_FROM_DEVICE;
+	frmr->access_flags = (IB_ACCESS_LOCAL_WRITE|IB_ACCESS_REMOTE_WRITE);
+	frmr->map_len = pages_needed << PAGE_SHIFT;
+	frmr->page_list_len = pages_needed;
+
+	for (pno = 0; pno < pages_needed; pno++) {
+		int len = min_t(int, rs_length, PAGE_SIZE - pg_off);
+
+		head->arg.pages[pg_no] = rqstp->rq_arg.pages[pg_no];
+		head->arg.page_len += len;
+		head->arg.len += len;
+		if (!pg_off)
+			head->count++;
+		rqstp->rq_respages = &rqstp->rq_arg.pages[pg_no+1];
+		rqstp->rq_next_page = rqstp->rq_respages + 1;
+		frmr->page_list->page_list[pno] =
+			ib_dma_map_page(xprt->sc_cm_id->device,
+					head->arg.pages[pg_no], 0,
+					PAGE_SIZE, DMA_FROM_DEVICE);
+		ret = ib_dma_mapping_error(xprt->sc_cm_id->device,
+					   frmr->page_list->page_list[pno]);
+		if (ret)
+			goto err;
+		atomic_inc(&xprt->sc_dma_used);
+
+		/* adjust offset and wrap to next page if needed */
+		pg_off += len;
+		if (pg_off == PAGE_SIZE) {
+			pg_off = 0;
+			pg_no++;
+		}
+		rs_length -= len;
+	}
+
+	if (last && rs_length == 0)
+		set_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
+	else
+		clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
+
+	/* Bump the key */
+	key = (u8)(frmr->mr->lkey & 0x000000FF);
+	ib_update_fast_reg_key(frmr->mr, ++key);
+
+	ctxt->sge[0].addr = (unsigned long)frmr->kva + *page_offset;
+	ctxt->sge[0].lkey = frmr->mr->lkey;
+	ctxt->sge[0].length = read;
+	ctxt->count = 1;
+	ctxt->read_hdr = head;
+
+	/* Prepare FASTREG WR */
+	memset(&fastreg_wr, 0, sizeof(fastreg_wr));
+	fastreg_wr.opcode = IB_WR_FAST_REG_MR;
+	fastreg_wr.send_flags = IB_SEND_SIGNALED;
+	fastreg_wr.wr.fast_reg.iova_start = (unsigned long)frmr->kva;
+	fastreg_wr.wr.fast_reg.page_list = frmr->page_list;
+	fastreg_wr.wr.fast_reg.page_list_len = frmr->page_list_len;
+	fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
+	fastreg_wr.wr.fast_reg.length = frmr->map_len;
+	fastreg_wr.wr.fast_reg.access_flags = frmr->access_flags;
+	fastreg_wr.wr.fast_reg.rkey = frmr->mr->lkey;
+	fastreg_wr.next = &read_wr;
+
+	/* Prepare RDMA_READ */
+	memset(&read_wr, 0, sizeof(read_wr));
+	read_wr.send_flags = IB_SEND_SIGNALED;
+	read_wr.wr.rdma.rkey = rs_handle;
+	read_wr.wr.rdma.remote_addr = rs_offset;
+	read_wr.sg_list = ctxt->sge;
+	read_wr.num_sge = 1;
+	if (xprt->sc_dev_caps & SVCRDMA_DEVCAP_READ_W_INV) {
+		read_wr.opcode = IB_WR_RDMA_READ_WITH_INV;
+		read_wr.wr_id = (unsigned long)ctxt;
+		read_wr.ex.invalidate_rkey = ctxt->frmr->mr->lkey;
+	} else {
+		read_wr.opcode = IB_WR_RDMA_READ;
+		read_wr.next = &inv_wr;
+		/* Prepare invalidate */
+		memset(&inv_wr, 0, sizeof(inv_wr));
+		inv_wr.wr_id = (unsigned long)ctxt;
+		inv_wr.opcode = IB_WR_LOCAL_INV;
+		inv_wr.send_flags = IB_SEND_SIGNALED | IB_SEND_FENCE;
+		inv_wr.ex.invalidate_rkey = frmr->mr->lkey;
+	}
+	ctxt->wr_op = read_wr.opcode;
+
+	/* Post the chain */
+	ret = svc_rdma_send(xprt, &fastreg_wr);
+	if (ret) {
+		pr_err("svcrdma: Error %d posting RDMA_READ\n", ret);
+		set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
+		goto err;
+	}
+
+	/* return current location in page array */
+	*page_no = pg_no;
+	*page_offset = pg_off;
+	ret = read;
+	atomic_inc(&rdma_stat_read);
+	return ret;
+ err:
+	svc_rdma_unmap_dma(ctxt);
+	svc_rdma_put_context(ctxt, 0);
+	svc_rdma_put_frmr(xprt, frmr);
+	return ret;
+}
+
+static int rdma_read_chunks(struct svcxprt_rdma *xprt,
+			    struct rpcrdma_msg *rmsgp,
+			    struct svc_rqst *rqstp,
+			    struct svc_rdma_op_ctxt *head)
+{
+	int page_no, ch_count, ret;
+	struct rpcrdma_read_chunk *ch;
+	u32 page_offset, byte_count;
+	u64 rs_offset;
+	rdma_reader_fn reader;
+
+	/* If no read list is present, return 0 */
+	ch = svc_rdma_get_read_chunk(rmsgp);
+	if (!ch)
+		return 0;
+
+	svc_rdma_rcl_chunk_counts(ch, &ch_count, &byte_count);
+	if (ch_count > RPCSVC_MAXPAGES)
+		return -EINVAL;
+
+	/* The request is completed when the RDMA_READs complete. The
+	 * head context keeps all the pages that comprise the
+	 * request.
+	 */
+	head->arg.head[0] = rqstp->rq_arg.head[0];
+	head->arg.tail[0] = rqstp->rq_arg.tail[0];
+	head->arg.pages = &head->pages[head->count];
+	head->hdr_count = head->count;
+	head->arg.page_base = 0;
+	head->arg.page_len = 0;
+	head->arg.len = rqstp->rq_arg.len;
+	head->arg.buflen = rqstp->rq_arg.buflen;
+
+	/* Use FRMR if supported */
+	if (xprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG)
+		reader = rdma_read_chunk_frmr;
+	else
+		reader = rdma_read_chunk_lcl;
+
+	page_no = 0; page_offset = 0;
+	for (ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0];
+	     ch->rc_discrim != 0; ch++) {
+
+		xdr_decode_hyper((__be32 *)&ch->rc_target.rs_offset,
+				 &rs_offset);
+		byte_count = ntohl(ch->rc_target.rs_length);
+
+		while (byte_count > 0) {
+			ret = reader(xprt, rqstp, head,
+				     &page_no, &page_offset,
+				     ntohl(ch->rc_target.rs_handle),
+				     byte_count, rs_offset,
+				     ((ch+1)->rc_discrim == 0) /* last */
+				     );
+			if (ret < 0)
+				goto err;
+			byte_count -= ret;
+			rs_offset += ret;
+			head->arg.buflen += ret;
+		}
+	}
+	ret = 1;
+ err:
+	/* Detach arg pages. svc_recv will replenish them */
+	for (page_no = 0;
+	     &rqstp->rq_pages[page_no] < rqstp->rq_respages; page_no++)
+		rqstp->rq_pages[page_no] = NULL;
+
+	return ret;
+}
+
+/*
+ * To avoid a separate RDMA READ just for a handful of zero bytes,
+ * RFC 5666 section 3.7 allows the client to omit the XDR zero pad
+ * in chunk lists.
+ */
+static void
+rdma_fix_xdr_pad(struct xdr_buf *buf)
+{
+	unsigned int page_len = buf->page_len;
+	unsigned int size = (XDR_QUADLEN(page_len) << 2) - page_len;
+	unsigned int offset, pg_no;
+	char *p;
+
+	if (size == 0)
+		return;
+
+	pg_no = page_len >> PAGE_SHIFT;
+	offset = page_len & ~PAGE_MASK;
+	p = page_address(buf->pages[pg_no]);
+	memset(p + offset, 0, size);
+
+	buf->page_len += size;
+	buf->buflen += size;
+	buf->len += size;
+}
+
+static int rdma_read_complete(struct svc_rqst *rqstp,
+			      struct svc_rdma_op_ctxt *head)
+{
+	int page_no;
+	int ret;
+
+	BUG_ON(!head);
+
+	/* Copy RPC pages */
+	for (page_no = 0; page_no < head->count; page_no++) {
+		put_page(rqstp->rq_pages[page_no]);
+		rqstp->rq_pages[page_no] = head->pages[page_no];
+	}
+	/* Point rq_arg.pages past header */
+	rdma_fix_xdr_pad(&head->arg);
+	rqstp->rq_arg.pages = &rqstp->rq_pages[head->hdr_count];
+	rqstp->rq_arg.page_len = head->arg.page_len;
+	rqstp->rq_arg.page_base = head->arg.page_base;
+
+	/* rq_respages starts after the last arg page */
+	rqstp->rq_respages = &rqstp->rq_arg.pages[page_no];
+	rqstp->rq_next_page = rqstp->rq_respages + 1;
+
+	/* Rebuild rq_arg head and tail. */
+	rqstp->rq_arg.head[0] = head->arg.head[0];
+	rqstp->rq_arg.tail[0] = head->arg.tail[0];
+	rqstp->rq_arg.len = head->arg.len;
+	rqstp->rq_arg.buflen = head->arg.buflen;
+
+	/* Free the context */
+	svc_rdma_put_context(head, 0);
+
+	/* XXX: What should this be? */
+	rqstp->rq_prot = IPPROTO_MAX;
+	svc_xprt_copy_addrs(rqstp, rqstp->rq_xprt);
+
+	ret = rqstp->rq_arg.head[0].iov_len
+		+ rqstp->rq_arg.page_len
+		+ rqstp->rq_arg.tail[0].iov_len;
+	dprintk("svcrdma: deferred read ret=%d, rq_arg.len =%d, "
+		"rq_arg.head[0].iov_base=%p, rq_arg.head[0].iov_len = %zd\n",
+		ret, rqstp->rq_arg.len,	rqstp->rq_arg.head[0].iov_base,
+		rqstp->rq_arg.head[0].iov_len);
+
+	return ret;
+}
+
+/*
+ * Set up the rqstp thread context to point to the RQ buffer. If
+ * necessary, pull additional data from the client with an RDMA_READ
+ * request.
+ */
+int svc_rdma_recvfrom(struct svc_rqst *rqstp)
+{
+	struct svc_xprt *xprt = rqstp->rq_xprt;
+	struct svcxprt_rdma *rdma_xprt =
+		container_of(xprt, struct svcxprt_rdma, sc_xprt);
+	struct svc_rdma_op_ctxt *ctxt = NULL;
+	struct rpcrdma_msg *rmsgp;
+	int ret = 0;
+	int len;
+
+	dprintk("svcrdma: rqstp=%p\n", rqstp);
+
+	spin_lock_bh(&rdma_xprt->sc_rq_dto_lock);
+	if (!list_empty(&rdma_xprt->sc_read_complete_q)) {
+		ctxt = list_entry(rdma_xprt->sc_read_complete_q.next,
+				  struct svc_rdma_op_ctxt,
+				  dto_q);
+		list_del_init(&ctxt->dto_q);
+		spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock);
+		return rdma_read_complete(rqstp, ctxt);
+	} else if (!list_empty(&rdma_xprt->sc_rq_dto_q)) {
+		ctxt = list_entry(rdma_xprt->sc_rq_dto_q.next,
+				  struct svc_rdma_op_ctxt,
+				  dto_q);
+		list_del_init(&ctxt->dto_q);
+	} else {
+		atomic_inc(&rdma_stat_rq_starve);
+		clear_bit(XPT_DATA, &xprt->xpt_flags);
+		ctxt = NULL;
+	}
+	spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock);
+	if (!ctxt) {
+		/* This is the EAGAIN path. The svc_recv routine will
+		 * return -EAGAIN, the nfsd thread will go to call into
+		 * svc_recv again and we shouldn't be on the active
+		 * transport list
+		 */
+		if (test_bit(XPT_CLOSE, &xprt->xpt_flags))
+			goto close_out;
+
+		goto out;
+	}
+	dprintk("svcrdma: processing ctxt=%p on xprt=%p, rqstp=%p, status=%d\n",
+		ctxt, rdma_xprt, rqstp, ctxt->wc_status);
+	BUG_ON(ctxt->wc_status != IB_WC_SUCCESS);
+	atomic_inc(&rdma_stat_recv);
+
+	/* Build up the XDR from the receive buffers. */
+	rdma_build_arg_xdr(rqstp, ctxt, ctxt->byte_len);
+
+	/* Decode the RDMA header. */
+	len = svc_rdma_xdr_decode_req(&rmsgp, rqstp);
+	rqstp->rq_xprt_hlen = len;
+
+	/* If the request is invalid, reply with an error */
+	if (len < 0) {
+		if (len == -ENOSYS)
+			svc_rdma_send_error(rdma_xprt, rmsgp, ERR_VERS);
+		goto close_out;
+	}
+
+	/* Read read-list data. */
+	ret = rdma_read_chunks(rdma_xprt, rmsgp, rqstp, ctxt);
+	if (ret > 0) {
+		/* read-list posted, defer until data received from client. */
+		goto defer;
+	} else if (ret < 0) {
+		/* Post of read-list failed, free context. */
+		svc_rdma_put_context(ctxt, 1);
+		return 0;
+	}
+
+	ret = rqstp->rq_arg.head[0].iov_len
+		+ rqstp->rq_arg.page_len
+		+ rqstp->rq_arg.tail[0].iov_len;
+	svc_rdma_put_context(ctxt, 0);
+ out:
+	dprintk("svcrdma: ret = %d, rq_arg.len =%d, "
+		"rq_arg.head[0].iov_base=%p, rq_arg.head[0].iov_len = %zd\n",
+		ret, rqstp->rq_arg.len,
+		rqstp->rq_arg.head[0].iov_base,
+		rqstp->rq_arg.head[0].iov_len);
+	rqstp->rq_prot = IPPROTO_MAX;
+	svc_xprt_copy_addrs(rqstp, xprt);
+	return ret;
+
+ close_out:
+	if (ctxt)
+		svc_rdma_put_context(ctxt, 1);
+	dprintk("svcrdma: transport %p is closing\n", xprt);
+	/*
+	 * Set the close bit and enqueue it. svc_recv will see the
+	 * close bit and call svc_xprt_delete
+	 */
+	set_bit(XPT_CLOSE, &xprt->xpt_flags);
+defer:
+	return 0;
+}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
new file mode 100644
index 0000000..9f1b506
--- /dev/null
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -0,0 +1,554 @@
+/*
+ * Copyright (c) 2014 Open Grid Computing, Inc. All rights reserved.
+ * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the BSD-type
+ * license below:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *      Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *
+ *      Redistributions in binary form must reproduce the above
+ *      copyright notice, this list of conditions and the following
+ *      disclaimer in the documentation and/or other materials provided
+ *      with the distribution.
+ *
+ *      Neither the name of the Network Appliance, Inc. nor the names of
+ *      its contributors may be used to endorse or promote products
+ *      derived from this software without specific prior written
+ *      permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Tom Tucker <tom@opengridcomputing.com>
+ */
+
+#include <linux/sunrpc/debug.h>
+#include <linux/sunrpc/rpc_rdma.h>
+#include <linux/spinlock.h>
+#include <asm/unaligned.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/rdma_cm.h>
+#include <linux/sunrpc/svc_rdma.h>
+
+#define RPCDBG_FACILITY	RPCDBG_SVCXPRT
+
+static int map_xdr(struct svcxprt_rdma *xprt,
+		   struct xdr_buf *xdr,
+		   struct svc_rdma_req_map *vec)
+{
+	int sge_no;
+	u32 sge_bytes;
+	u32 page_bytes;
+	u32 page_off;
+	int page_no;
+
+	BUG_ON(xdr->len !=
+	       (xdr->head[0].iov_len + xdr->page_len + xdr->tail[0].iov_len));
+
+	/* Skip the first sge, this is for the RPCRDMA header */
+	sge_no = 1;
+
+	/* Head SGE */
+	vec->sge[sge_no].iov_base = xdr->head[0].iov_base;
+	vec->sge[sge_no].iov_len = xdr->head[0].iov_len;
+	sge_no++;
+
+	/* pages SGE */
+	page_no = 0;
+	page_bytes = xdr->page_len;
+	page_off = xdr->page_base;
+	while (page_bytes) {
+		vec->sge[sge_no].iov_base =
+			page_address(xdr->pages[page_no]) + page_off;
+		sge_bytes = min_t(u32, page_bytes, (PAGE_SIZE - page_off));
+		page_bytes -= sge_bytes;
+		vec->sge[sge_no].iov_len = sge_bytes;
+
+		sge_no++;
+		page_no++;
+		page_off = 0; /* reset for next time through loop */
+	}
+
+	/* Tail SGE */
+	if (xdr->tail[0].iov_len) {
+		vec->sge[sge_no].iov_base = xdr->tail[0].iov_base;
+		vec->sge[sge_no].iov_len = xdr->tail[0].iov_len;
+		sge_no++;
+	}
+
+	dprintk("svcrdma: map_xdr: sge_no %d page_no %d "
+		"page_base %u page_len %u head_len %zu tail_len %zu\n",
+		sge_no, page_no, xdr->page_base, xdr->page_len,
+		xdr->head[0].iov_len, xdr->tail[0].iov_len);
+
+	vec->count = sge_no;
+	return 0;
+}
+
+static dma_addr_t dma_map_xdr(struct svcxprt_rdma *xprt,
+			      struct xdr_buf *xdr,
+			      u32 xdr_off, size_t len, int dir)
+{
+	struct page *page;
+	dma_addr_t dma_addr;
+	if (xdr_off < xdr->head[0].iov_len) {
+		/* This offset is in the head */
+		xdr_off += (unsigned long)xdr->head[0].iov_base & ~PAGE_MASK;
+		page = virt_to_page(xdr->head[0].iov_base);
+	} else {
+		xdr_off -= xdr->head[0].iov_len;
+		if (xdr_off < xdr->page_len) {
+			/* This offset is in the page list */
+			xdr_off += xdr->page_base;
+			page = xdr->pages[xdr_off >> PAGE_SHIFT];
+			xdr_off &= ~PAGE_MASK;
+		} else {
+			/* This offset is in the tail */
+			xdr_off -= xdr->page_len;
+			xdr_off += (unsigned long)
+				xdr->tail[0].iov_base & ~PAGE_MASK;
+			page = virt_to_page(xdr->tail[0].iov_base);
+		}
+	}
+	dma_addr = ib_dma_map_page(xprt->sc_cm_id->device, page, xdr_off,
+				   min_t(size_t, PAGE_SIZE, len), dir);
+	return dma_addr;
+}
+
+/* Assumptions:
+ * - The specified write_len can be represented in sc_max_sge * PAGE_SIZE
+ */
+static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
+		      u32 rmr, u64 to,
+		      u32 xdr_off, int write_len,
+		      struct svc_rdma_req_map *vec)
+{
+	struct ib_send_wr write_wr;
+	struct ib_sge *sge;
+	int xdr_sge_no;
+	int sge_no;
+	int sge_bytes;
+	int sge_off;
+	int bc;
+	struct svc_rdma_op_ctxt *ctxt;
+
+	BUG_ON(vec->count > RPCSVC_MAXPAGES);
+	dprintk("svcrdma: RDMA_WRITE rmr=%x, to=%llx, xdr_off=%d, "
+		"write_len=%d, vec->sge=%p, vec->count=%lu\n",
+		rmr, (unsigned long long)to, xdr_off,
+		write_len, vec->sge, vec->count);
+
+	ctxt = svc_rdma_get_context(xprt);
+	ctxt->direction = DMA_TO_DEVICE;
+	sge = ctxt->sge;
+
+	/* Find the SGE associated with xdr_off */
+	for (bc = xdr_off, xdr_sge_no = 1; bc && xdr_sge_no < vec->count;
+	     xdr_sge_no++) {
+		if (vec->sge[xdr_sge_no].iov_len > bc)
+			break;
+		bc -= vec->sge[xdr_sge_no].iov_len;
+	}
+
+	sge_off = bc;
+	bc = write_len;
+	sge_no = 0;
+
+	/* Copy the remaining SGE */
+	while (bc != 0) {
+		sge_bytes = min_t(size_t,
+			  bc, vec->sge[xdr_sge_no].iov_len-sge_off);
+		sge[sge_no].length = sge_bytes;
+		sge[sge_no].addr =
+			dma_map_xdr(xprt, &rqstp->rq_res, xdr_off,
+				    sge_bytes, DMA_TO_DEVICE);
+		xdr_off += sge_bytes;
+		if (ib_dma_mapping_error(xprt->sc_cm_id->device,
+					 sge[sge_no].addr))
+			goto err;
+		atomic_inc(&xprt->sc_dma_used);
+		sge[sge_no].lkey = xprt->sc_dma_lkey;
+		ctxt->count++;
+		sge_off = 0;
+		sge_no++;
+		xdr_sge_no++;
+		BUG_ON(xdr_sge_no > vec->count);
+		bc -= sge_bytes;
+		if (sge_no == xprt->sc_max_sge)
+			break;
+	}
+
+	/* Prepare WRITE WR */
+	memset(&write_wr, 0, sizeof write_wr);
+	ctxt->wr_op = IB_WR_RDMA_WRITE;
+	write_wr.wr_id = (unsigned long)ctxt;
+	write_wr.sg_list = &sge[0];
+	write_wr.num_sge = sge_no;
+	write_wr.opcode = IB_WR_RDMA_WRITE;
+	write_wr.send_flags = IB_SEND_SIGNALED;
+	write_wr.wr.rdma.rkey = rmr;
+	write_wr.wr.rdma.remote_addr = to;
+
+	/* Post It */
+	atomic_inc(&rdma_stat_write);
+	if (svc_rdma_send(xprt, &write_wr))
+		goto err;
+	return write_len - bc;
+ err:
+	svc_rdma_unmap_dma(ctxt);
+	svc_rdma_put_context(ctxt, 0);
+	/* Fatal error, close transport */
+	return -EIO;
+}
+
+static int send_write_chunks(struct svcxprt_rdma *xprt,
+			     struct rpcrdma_msg *rdma_argp,
+			     struct rpcrdma_msg *rdma_resp,
+			     struct svc_rqst *rqstp,
+			     struct svc_rdma_req_map *vec)
+{
+	u32 xfer_len = rqstp->rq_res.page_len + rqstp->rq_res.tail[0].iov_len;
+	int write_len;
+	u32 xdr_off;
+	int chunk_off;
+	int chunk_no;
+	struct rpcrdma_write_array *arg_ary;
+	struct rpcrdma_write_array *res_ary;
+	int ret;
+
+	arg_ary = svc_rdma_get_write_array(rdma_argp);
+	if (!arg_ary)
+		return 0;
+	res_ary = (struct rpcrdma_write_array *)
+		&rdma_resp->rm_body.rm_chunks[1];
+
+	/* Write chunks start at the pagelist */
+	for (xdr_off = rqstp->rq_res.head[0].iov_len, chunk_no = 0;
+	     xfer_len && chunk_no < arg_ary->wc_nchunks;
+	     chunk_no++) {
+		struct rpcrdma_segment *arg_ch;
+		u64 rs_offset;
+
+		arg_ch = &arg_ary->wc_array[chunk_no].wc_target;
+		write_len = min(xfer_len, ntohl(arg_ch->rs_length));
+
+		/* Prepare the response chunk given the length actually
+		 * written */
+		xdr_decode_hyper((__be32 *)&arg_ch->rs_offset, &rs_offset);
+		svc_rdma_xdr_encode_array_chunk(res_ary, chunk_no,
+						arg_ch->rs_handle,
+						arg_ch->rs_offset,
+						write_len);
+		chunk_off = 0;
+		while (write_len) {
+			ret = send_write(xprt, rqstp,
+					 ntohl(arg_ch->rs_handle),
+					 rs_offset + chunk_off,
+					 xdr_off,
+					 write_len,
+					 vec);
+			if (ret <= 0) {
+				dprintk("svcrdma: RDMA_WRITE failed, ret=%d\n",
+					ret);
+				return -EIO;
+			}
+			chunk_off += ret;
+			xdr_off += ret;
+			xfer_len -= ret;
+			write_len -= ret;
+		}
+	}
+	/* Update the req with the number of chunks actually used */
+	svc_rdma_xdr_encode_write_list(rdma_resp, chunk_no);
+
+	return rqstp->rq_res.page_len + rqstp->rq_res.tail[0].iov_len;
+}
+
+static int send_reply_chunks(struct svcxprt_rdma *xprt,
+			     struct rpcrdma_msg *rdma_argp,
+			     struct rpcrdma_msg *rdma_resp,
+			     struct svc_rqst *rqstp,
+			     struct svc_rdma_req_map *vec)
+{
+	u32 xfer_len = rqstp->rq_res.len;
+	int write_len;
+	u32 xdr_off;
+	int chunk_no;
+	int chunk_off;
+	int nchunks;
+	struct rpcrdma_segment *ch;
+	struct rpcrdma_write_array *arg_ary;
+	struct rpcrdma_write_array *res_ary;
+	int ret;
+
+	arg_ary = svc_rdma_get_reply_array(rdma_argp);
+	if (!arg_ary)
+		return 0;
+	/* XXX: need to fix when reply lists occur with read-list and or
+	 * write-list */
+	res_ary = (struct rpcrdma_write_array *)
+		&rdma_resp->rm_body.rm_chunks[2];
+
+	/* xdr offset starts at RPC message */
+	nchunks = ntohl(arg_ary->wc_nchunks);
+	for (xdr_off = 0, chunk_no = 0;
+	     xfer_len && chunk_no < nchunks;
+	     chunk_no++) {
+		u64 rs_offset;
+		ch = &arg_ary->wc_array[chunk_no].wc_target;
+		write_len = min(xfer_len, htonl(ch->rs_length));
+
+		/* Prepare the reply chunk given the length actually
+		 * written */
+		xdr_decode_hyper((__be32 *)&ch->rs_offset, &rs_offset);
+		svc_rdma_xdr_encode_array_chunk(res_ary, chunk_no,
+						ch->rs_handle, ch->rs_offset,
+						write_len);
+		chunk_off = 0;
+		while (write_len) {
+			ret = send_write(xprt, rqstp,
+					 ntohl(ch->rs_handle),
+					 rs_offset + chunk_off,
+					 xdr_off,
+					 write_len,
+					 vec);
+			if (ret <= 0) {
+				dprintk("svcrdma: RDMA_WRITE failed, ret=%d\n",
+					ret);
+				return -EIO;
+			}
+			chunk_off += ret;
+			xdr_off += ret;
+			xfer_len -= ret;
+			write_len -= ret;
+		}
+	}
+	/* Update the req with the number of chunks actually used */
+	svc_rdma_xdr_encode_reply_array(res_ary, chunk_no);
+
+	return rqstp->rq_res.len;
+}
+
+/* This function prepares the portion of the RPCRDMA message to be
+ * sent in the RDMA_SEND. This function is called after data sent via
+ * RDMA has already been transmitted. There are three cases:
+ * - The RPCRDMA header, RPC header, and payload are all sent in a
+ *   single RDMA_SEND. This is the "inline" case.
+ * - The RPCRDMA header and some portion of the RPC header and data
+ *   are sent via this RDMA_SEND and another portion of the data is
+ *   sent via RDMA.
+ * - The RPCRDMA header [NOMSG] is sent in this RDMA_SEND and the RPC
+ *   header and data are all transmitted via RDMA.
+ * In all three cases, this function prepares the RPCRDMA header in
+ * sge[0], the 'type' parameter indicates the type to place in the
+ * RPCRDMA header, and the 'byte_count' field indicates how much of
+ * the XDR to include in this RDMA_SEND. NB: The offset of the payload
+ * to send is zero in the XDR.
+ */
+static int send_reply(struct svcxprt_rdma *rdma,
+		      struct svc_rqst *rqstp,
+		      struct page *page,
+		      struct rpcrdma_msg *rdma_resp,
+		      struct svc_rdma_op_ctxt *ctxt,
+		      struct svc_rdma_req_map *vec,
+		      int byte_count)
+{
+	struct ib_send_wr send_wr;
+	int sge_no;
+	int sge_bytes;
+	int page_no;
+	int pages;
+	int ret;
+
+	/* Post a recv buffer to handle another request. */
+	ret = svc_rdma_post_recv(rdma);
+	if (ret) {
+		printk(KERN_INFO
+		       "svcrdma: could not post a receive buffer, err=%d."
+		       "Closing transport %p.\n", ret, rdma);
+		set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
+		svc_rdma_put_context(ctxt, 0);
+		return -ENOTCONN;
+	}
+
+	/* Prepare the context */
+	ctxt->pages[0] = page;
+	ctxt->count = 1;
+
+	/* Prepare the SGE for the RPCRDMA Header */
+	ctxt->sge[0].lkey = rdma->sc_dma_lkey;
+	ctxt->sge[0].length = svc_rdma_xdr_get_reply_hdr_len(rdma_resp);
+	ctxt->sge[0].addr =
+	    ib_dma_map_page(rdma->sc_cm_id->device, page, 0,
+			    ctxt->sge[0].length, DMA_TO_DEVICE);
+	if (ib_dma_mapping_error(rdma->sc_cm_id->device, ctxt->sge[0].addr))
+		goto err;
+	atomic_inc(&rdma->sc_dma_used);
+
+	ctxt->direction = DMA_TO_DEVICE;
+
+	/* Map the payload indicated by 'byte_count' */
+	for (sge_no = 1; byte_count && sge_no < vec->count; sge_no++) {
+		int xdr_off = 0;
+		sge_bytes = min_t(size_t, vec->sge[sge_no].iov_len, byte_count);
+		byte_count -= sge_bytes;
+		ctxt->sge[sge_no].addr =
+			dma_map_xdr(rdma, &rqstp->rq_res, xdr_off,
+				    sge_bytes, DMA_TO_DEVICE);
+		xdr_off += sge_bytes;
+		if (ib_dma_mapping_error(rdma->sc_cm_id->device,
+					 ctxt->sge[sge_no].addr))
+			goto err;
+		atomic_inc(&rdma->sc_dma_used);
+		ctxt->sge[sge_no].lkey = rdma->sc_dma_lkey;
+		ctxt->sge[sge_no].length = sge_bytes;
+	}
+	BUG_ON(byte_count != 0);
+
+	/* Save all respages in the ctxt and remove them from the
+	 * respages array. They are our pages until the I/O
+	 * completes.
+	 */
+	pages = rqstp->rq_next_page - rqstp->rq_respages;
+	for (page_no = 0; page_no < pages; page_no++) {
+		ctxt->pages[page_no+1] = rqstp->rq_respages[page_no];
+		ctxt->count++;
+		rqstp->rq_respages[page_no] = NULL;
+		/*
+		 * If there are more pages than SGE, terminate SGE
+		 * list so that svc_rdma_unmap_dma doesn't attempt to
+		 * unmap garbage.
+		 */
+		if (page_no+1 >= sge_no)
+			ctxt->sge[page_no+1].length = 0;
+	}
+	rqstp->rq_next_page = rqstp->rq_respages + 1;
+
+	BUG_ON(sge_no > rdma->sc_max_sge);
+	memset(&send_wr, 0, sizeof send_wr);
+	ctxt->wr_op = IB_WR_SEND;
+	send_wr.wr_id = (unsigned long)ctxt;
+	send_wr.sg_list = ctxt->sge;
+	send_wr.num_sge = sge_no;
+	send_wr.opcode = IB_WR_SEND;
+	send_wr.send_flags =  IB_SEND_SIGNALED;
+
+	ret = svc_rdma_send(rdma, &send_wr);
+	if (ret)
+		goto err;
+
+	return 0;
+
+ err:
+	svc_rdma_unmap_dma(ctxt);
+	svc_rdma_put_context(ctxt, 1);
+	return -EIO;
+}
+
+void svc_rdma_prep_reply_hdr(struct svc_rqst *rqstp)
+{
+}
+
+/*
+ * Return the start of an xdr buffer.
+ */
+static void *xdr_start(struct xdr_buf *xdr)
+{
+	return xdr->head[0].iov_base -
+		(xdr->len -
+		 xdr->page_len -
+		 xdr->tail[0].iov_len -
+		 xdr->head[0].iov_len);
+}
+
+int svc_rdma_sendto(struct svc_rqst *rqstp)
+{
+	struct svc_xprt *xprt = rqstp->rq_xprt;
+	struct svcxprt_rdma *rdma =
+		container_of(xprt, struct svcxprt_rdma, sc_xprt);
+	struct rpcrdma_msg *rdma_argp;
+	struct rpcrdma_msg *rdma_resp;
+	struct rpcrdma_write_array *reply_ary;
+	enum rpcrdma_proc reply_type;
+	int ret;
+	int inline_bytes;
+	struct page *res_page;
+	struct svc_rdma_op_ctxt *ctxt;
+	struct svc_rdma_req_map *vec;
+
+	dprintk("svcrdma: sending response for rqstp=%p\n", rqstp);
+
+	/* Get the RDMA request header. */
+	rdma_argp = xdr_start(&rqstp->rq_arg);
+
+	/* Build an req vec for the XDR */
+	ctxt = svc_rdma_get_context(rdma);
+	ctxt->direction = DMA_TO_DEVICE;
+	vec = svc_rdma_get_req_map();
+	ret = map_xdr(rdma, &rqstp->rq_res, vec);
+	if (ret)
+		goto err0;
+	inline_bytes = rqstp->rq_res.len;
+
+	/* Create the RDMA response header */
+	res_page = svc_rdma_get_page();
+	rdma_resp = page_address(res_page);
+	reply_ary = svc_rdma_get_reply_array(rdma_argp);
+	if (reply_ary)
+		reply_type = RDMA_NOMSG;
+	else
+		reply_type = RDMA_MSG;
+	svc_rdma_xdr_encode_reply_header(rdma, rdma_argp,
+					 rdma_resp, reply_type);
+
+	/* Send any write-chunk data and build resp write-list */
+	ret = send_write_chunks(rdma, rdma_argp, rdma_resp,
+				rqstp, vec);
+	if (ret < 0) {
+		printk(KERN_ERR "svcrdma: failed to send write chunks, rc=%d\n",
+		       ret);
+		goto err1;
+	}
+	inline_bytes -= ret;
+
+	/* Send any reply-list data and update resp reply-list */
+	ret = send_reply_chunks(rdma, rdma_argp, rdma_resp,
+				rqstp, vec);
+	if (ret < 0) {
+		printk(KERN_ERR "svcrdma: failed to send reply chunks, rc=%d\n",
+		       ret);
+		goto err1;
+	}
+	inline_bytes -= ret;
+
+	ret = send_reply(rdma, rqstp, res_page, rdma_resp, ctxt, vec,
+			 inline_bytes);
+	svc_rdma_put_req_map(vec);
+	dprintk("svcrdma: send_reply returns %d\n", ret);
+	return ret;
+
+ err1:
+	put_page(res_page);
+ err0:
+	svc_rdma_put_req_map(vec);
+	svc_rdma_put_context(ctxt, 0);
+	return ret;
+}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
new file mode 100644
index 0000000..4e61880
--- /dev/null
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -0,0 +1,1355 @@
+/*
+ * Copyright (c) 2014 Open Grid Computing, Inc. All rights reserved.
+ * Copyright (c) 2005-2007 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the BSD-type
+ * license below:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *      Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *
+ *      Redistributions in binary form must reproduce the above
+ *      copyright notice, this list of conditions and the following
+ *      disclaimer in the documentation and/or other materials provided
+ *      with the distribution.
+ *
+ *      Neither the name of the Network Appliance, Inc. nor the names of
+ *      its contributors may be used to endorse or promote products
+ *      derived from this software without specific prior written
+ *      permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Tom Tucker <tom@opengridcomputing.com>
+ */
+
+#include <linux/sunrpc/svc_xprt.h>
+#include <linux/sunrpc/debug.h>
+#include <linux/sunrpc/rpc_rdma.h>
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/workqueue.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/rdma_cm.h>
+#include <linux/sunrpc/svc_rdma.h>
+#include <linux/export.h>
+#include "xprt_rdma.h"
+
+#define RPCDBG_FACILITY	RPCDBG_SVCXPRT
+
+static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
+					struct net *net,
+					struct sockaddr *sa, int salen,
+					int flags);
+static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt);
+static void svc_rdma_release_rqst(struct svc_rqst *);
+static void dto_tasklet_func(unsigned long data);
+static void svc_rdma_detach(struct svc_xprt *xprt);
+static void svc_rdma_free(struct svc_xprt *xprt);
+static int svc_rdma_has_wspace(struct svc_xprt *xprt);
+static int svc_rdma_secure_port(struct svc_rqst *);
+static void rq_cq_reap(struct svcxprt_rdma *xprt);
+static void sq_cq_reap(struct svcxprt_rdma *xprt);
+
+static DECLARE_TASKLET(dto_tasklet, dto_tasklet_func, 0UL);
+static DEFINE_SPINLOCK(dto_lock);
+static LIST_HEAD(dto_xprt_q);
+
+static struct svc_xprt_ops svc_rdma_ops = {
+	.xpo_create = svc_rdma_create,
+	.xpo_recvfrom = svc_rdma_recvfrom,
+	.xpo_sendto = svc_rdma_sendto,
+	.xpo_release_rqst = svc_rdma_release_rqst,
+	.xpo_detach = svc_rdma_detach,
+	.xpo_free = svc_rdma_free,
+	.xpo_prep_reply_hdr = svc_rdma_prep_reply_hdr,
+	.xpo_has_wspace = svc_rdma_has_wspace,
+	.xpo_accept = svc_rdma_accept,
+	.xpo_secure_port = svc_rdma_secure_port,
+};
+
+struct svc_xprt_class svc_rdma_class = {
+	.xcl_name = "rdma",
+	.xcl_owner = THIS_MODULE,
+	.xcl_ops = &svc_rdma_ops,
+	.xcl_max_payload = RPCSVC_MAXPAYLOAD_RDMA,
+	.xcl_ident = XPRT_TRANSPORT_RDMA,
+};
+
+struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt)
+{
+	struct svc_rdma_op_ctxt *ctxt;
+
+	while (1) {
+		ctxt = kmem_cache_alloc(svc_rdma_ctxt_cachep, GFP_KERNEL);
+		if (ctxt)
+			break;
+		schedule_timeout_uninterruptible(msecs_to_jiffies(500));
+	}
+	ctxt->xprt = xprt;
+	INIT_LIST_HEAD(&ctxt->dto_q);
+	ctxt->count = 0;
+	ctxt->frmr = NULL;
+	atomic_inc(&xprt->sc_ctxt_used);
+	return ctxt;
+}
+
+void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt)
+{
+	struct svcxprt_rdma *xprt = ctxt->xprt;
+	int i;
+	for (i = 0; i < ctxt->count && ctxt->sge[i].length; i++) {
+		/*
+		 * Unmap the DMA addr in the SGE if the lkey matches
+		 * the sc_dma_lkey, otherwise, ignore it since it is
+		 * an FRMR lkey and will be unmapped later when the
+		 * last WR that uses it completes.
+		 */
+		if (ctxt->sge[i].lkey == xprt->sc_dma_lkey) {
+			atomic_dec(&xprt->sc_dma_used);
+			ib_dma_unmap_page(xprt->sc_cm_id->device,
+					    ctxt->sge[i].addr,
+					    ctxt->sge[i].length,
+					    ctxt->direction);
+		}
+	}
+}
+
+void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages)
+{
+	struct svcxprt_rdma *xprt;
+	int i;
+
+	BUG_ON(!ctxt);
+	xprt = ctxt->xprt;
+	if (free_pages)
+		for (i = 0; i < ctxt->count; i++)
+			put_page(ctxt->pages[i]);
+
+	kmem_cache_free(svc_rdma_ctxt_cachep, ctxt);
+	atomic_dec(&xprt->sc_ctxt_used);
+}
+
+/*
+ * Temporary NFS req mappings are shared across all transport
+ * instances. These are short lived and should be bounded by the number
+ * of concurrent server threads * depth of the SQ.
+ */
+struct svc_rdma_req_map *svc_rdma_get_req_map(void)
+{
+	struct svc_rdma_req_map *map;
+	while (1) {
+		map = kmem_cache_alloc(svc_rdma_map_cachep, GFP_KERNEL);
+		if (map)
+			break;
+		schedule_timeout_uninterruptible(msecs_to_jiffies(500));
+	}
+	map->count = 0;
+	return map;
+}
+
+void svc_rdma_put_req_map(struct svc_rdma_req_map *map)
+{
+	kmem_cache_free(svc_rdma_map_cachep, map);
+}
+
+/* ib_cq event handler */
+static void cq_event_handler(struct ib_event *event, void *context)
+{
+	struct svc_xprt *xprt = context;
+	dprintk("svcrdma: received CQ event id=%d, context=%p\n",
+		event->event, context);
+	set_bit(XPT_CLOSE, &xprt->xpt_flags);
+}
+
+/* QP event handler */
+static void qp_event_handler(struct ib_event *event, void *context)
+{
+	struct svc_xprt *xprt = context;
+
+	switch (event->event) {
+	/* These are considered benign events */
+	case IB_EVENT_PATH_MIG:
+	case IB_EVENT_COMM_EST:
+	case IB_EVENT_SQ_DRAINED:
+	case IB_EVENT_QP_LAST_WQE_REACHED:
+		dprintk("svcrdma: QP event %d received for QP=%p\n",
+			event->event, event->element.qp);
+		break;
+	/* These are considered fatal events */
+	case IB_EVENT_PATH_MIG_ERR:
+	case IB_EVENT_QP_FATAL:
+	case IB_EVENT_QP_REQ_ERR:
+	case IB_EVENT_QP_ACCESS_ERR:
+	case IB_EVENT_DEVICE_FATAL:
+	default:
+		dprintk("svcrdma: QP ERROR event %d received for QP=%p, "
+			"closing transport\n",
+			event->event, event->element.qp);
+		set_bit(XPT_CLOSE, &xprt->xpt_flags);
+		break;
+	}
+}
+
+/*
+ * Data Transfer Operation Tasklet
+ *
+ * Walks a list of transports with I/O pending, removing entries as
+ * they are added to the server's I/O pending list. Two bits indicate
+ * if SQ, RQ, or both have I/O pending. The dto_lock is an irqsave
+ * spinlock that serializes access to the transport list with the RQ
+ * and SQ interrupt handlers.
+ */
+static void dto_tasklet_func(unsigned long data)
+{
+	struct svcxprt_rdma *xprt;
+	unsigned long flags;
+
+	spin_lock_irqsave(&dto_lock, flags);
+	while (!list_empty(&dto_xprt_q)) {
+		xprt = list_entry(dto_xprt_q.next,
+				  struct svcxprt_rdma, sc_dto_q);
+		list_del_init(&xprt->sc_dto_q);
+		spin_unlock_irqrestore(&dto_lock, flags);
+
+		rq_cq_reap(xprt);
+		sq_cq_reap(xprt);
+
+		svc_xprt_put(&xprt->sc_xprt);
+		spin_lock_irqsave(&dto_lock, flags);
+	}
+	spin_unlock_irqrestore(&dto_lock, flags);
+}
+
+/*
+ * Receive Queue Completion Handler
+ *
+ * Since an RQ completion handler is called on interrupt context, we
+ * need to defer the handling of the I/O to a tasklet
+ */
+static void rq_comp_handler(struct ib_cq *cq, void *cq_context)
+{
+	struct svcxprt_rdma *xprt = cq_context;
+	unsigned long flags;
+
+	/* Guard against unconditional flush call for destroyed QP */
+	if (atomic_read(&xprt->sc_xprt.xpt_ref.refcount)==0)
+		return;
+
+	/*
+	 * Set the bit regardless of whether or not it's on the list
+	 * because it may be on the list already due to an SQ
+	 * completion.
+	 */
+	set_bit(RDMAXPRT_RQ_PENDING, &xprt->sc_flags);
+
+	/*
+	 * If this transport is not already on the DTO transport queue,
+	 * add it
+	 */
+	spin_lock_irqsave(&dto_lock, flags);
+	if (list_empty(&xprt->sc_dto_q)) {
+		svc_xprt_get(&xprt->sc_xprt);
+		list_add_tail(&xprt->sc_dto_q, &dto_xprt_q);
+	}
+	spin_unlock_irqrestore(&dto_lock, flags);
+
+	/* Tasklet does all the work to avoid irqsave locks. */
+	tasklet_schedule(&dto_tasklet);
+}
+
+/*
+ * rq_cq_reap - Process the RQ CQ.
+ *
+ * Take all completing WC off the CQE and enqueue the associated DTO
+ * context on the dto_q for the transport.
+ *
+ * Note that caller must hold a transport reference.
+ */
+static void rq_cq_reap(struct svcxprt_rdma *xprt)
+{
+	int ret;
+	struct ib_wc wc;
+	struct svc_rdma_op_ctxt *ctxt = NULL;
+
+	if (!test_and_clear_bit(RDMAXPRT_RQ_PENDING, &xprt->sc_flags))
+		return;
+
+	ib_req_notify_cq(xprt->sc_rq_cq, IB_CQ_NEXT_COMP);
+	atomic_inc(&rdma_stat_rq_poll);
+
+	while ((ret = ib_poll_cq(xprt->sc_rq_cq, 1, &wc)) > 0) {
+		ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id;
+		ctxt->wc_status = wc.status;
+		ctxt->byte_len = wc.byte_len;
+		svc_rdma_unmap_dma(ctxt);
+		if (wc.status != IB_WC_SUCCESS) {
+			/* Close the transport */
+			dprintk("svcrdma: transport closing putting ctxt %p\n", ctxt);
+			set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
+			svc_rdma_put_context(ctxt, 1);
+			svc_xprt_put(&xprt->sc_xprt);
+			continue;
+		}
+		spin_lock_bh(&xprt->sc_rq_dto_lock);
+		list_add_tail(&ctxt->dto_q, &xprt->sc_rq_dto_q);
+		spin_unlock_bh(&xprt->sc_rq_dto_lock);
+		svc_xprt_put(&xprt->sc_xprt);
+	}
+
+	if (ctxt)
+		atomic_inc(&rdma_stat_rq_prod);
+
+	set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags);
+	/*
+	 * If data arrived before established event,
+	 * don't enqueue. This defers RPC I/O until the
+	 * RDMA connection is complete.
+	 */
+	if (!test_bit(RDMAXPRT_CONN_PENDING, &xprt->sc_flags))
+		svc_xprt_enqueue(&xprt->sc_xprt);
+}
+
+/*
+ * Process a completion context
+ */
+static void process_context(struct svcxprt_rdma *xprt,
+			    struct svc_rdma_op_ctxt *ctxt)
+{
+	svc_rdma_unmap_dma(ctxt);
+
+	switch (ctxt->wr_op) {
+	case IB_WR_SEND:
+		BUG_ON(ctxt->frmr);
+		svc_rdma_put_context(ctxt, 1);
+		break;
+
+	case IB_WR_RDMA_WRITE:
+		BUG_ON(ctxt->frmr);
+		svc_rdma_put_context(ctxt, 0);
+		break;
+
+	case IB_WR_RDMA_READ:
+	case IB_WR_RDMA_READ_WITH_INV:
+		svc_rdma_put_frmr(xprt, ctxt->frmr);
+		if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) {
+			struct svc_rdma_op_ctxt *read_hdr = ctxt->read_hdr;
+			BUG_ON(!read_hdr);
+			spin_lock_bh(&xprt->sc_rq_dto_lock);
+			set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags);
+			list_add_tail(&read_hdr->dto_q,
+				      &xprt->sc_read_complete_q);
+			spin_unlock_bh(&xprt->sc_rq_dto_lock);
+			svc_xprt_enqueue(&xprt->sc_xprt);
+		}
+		svc_rdma_put_context(ctxt, 0);
+		break;
+
+	default:
+		BUG_ON(1);
+		printk(KERN_ERR "svcrdma: unexpected completion type, "
+		       "opcode=%d\n",
+		       ctxt->wr_op);
+		break;
+	}
+}
+
+/*
+ * Send Queue Completion Handler - potentially called on interrupt context.
+ *
+ * Note that caller must hold a transport reference.
+ */
+static void sq_cq_reap(struct svcxprt_rdma *xprt)
+{
+	struct svc_rdma_op_ctxt *ctxt = NULL;
+	struct ib_wc wc_a[6];
+	struct ib_wc *wc;
+	struct ib_cq *cq = xprt->sc_sq_cq;
+	int ret;
+
+	memset(wc_a, 0, sizeof(wc_a));
+
+	if (!test_and_clear_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags))
+		return;
+
+	ib_req_notify_cq(xprt->sc_sq_cq, IB_CQ_NEXT_COMP);
+	atomic_inc(&rdma_stat_sq_poll);
+	while ((ret = ib_poll_cq(cq, ARRAY_SIZE(wc_a), wc_a)) > 0) {
+		int i;
+
+		for (i = 0; i < ret; i++) {
+			wc = &wc_a[i];
+			if (wc->status != IB_WC_SUCCESS) {
+				dprintk("svcrdma: sq wc err status %d\n",
+					wc->status);
+
+				/* Close the transport */
+				set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
+			}
+
+			/* Decrement used SQ WR count */
+			atomic_dec(&xprt->sc_sq_count);
+			wake_up(&xprt->sc_send_wait);
+
+			ctxt = (struct svc_rdma_op_ctxt *)
+				(unsigned long)wc->wr_id;
+			if (ctxt)
+				process_context(xprt, ctxt);
+
+			svc_xprt_put(&xprt->sc_xprt);
+		}
+	}
+
+	if (ctxt)
+		atomic_inc(&rdma_stat_sq_prod);
+}
+
+static void sq_comp_handler(struct ib_cq *cq, void *cq_context)
+{
+	struct svcxprt_rdma *xprt = cq_context;
+	unsigned long flags;
+
+	/* Guard against unconditional flush call for destroyed QP */
+	if (atomic_read(&xprt->sc_xprt.xpt_ref.refcount)==0)
+		return;
+
+	/*
+	 * Set the bit regardless of whether or not it's on the list
+	 * because it may be on the list already due to an RQ
+	 * completion.
+	 */
+	set_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags);
+
+	/*
+	 * If this transport is not already on the DTO transport queue,
+	 * add it
+	 */
+	spin_lock_irqsave(&dto_lock, flags);
+	if (list_empty(&xprt->sc_dto_q)) {
+		svc_xprt_get(&xprt->sc_xprt);
+		list_add_tail(&xprt->sc_dto_q, &dto_xprt_q);
+	}
+	spin_unlock_irqrestore(&dto_lock, flags);
+
+	/* Tasklet does all the work to avoid irqsave locks. */
+	tasklet_schedule(&dto_tasklet);
+}
+
+static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
+					     int listener)
+{
+	struct svcxprt_rdma *cma_xprt = kzalloc(sizeof *cma_xprt, GFP_KERNEL);
+
+	if (!cma_xprt)
+		return NULL;
+	svc_xprt_init(&init_net, &svc_rdma_class, &cma_xprt->sc_xprt, serv);
+	INIT_LIST_HEAD(&cma_xprt->sc_accept_q);
+	INIT_LIST_HEAD(&cma_xprt->sc_dto_q);
+	INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q);
+	INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q);
+	INIT_LIST_HEAD(&cma_xprt->sc_frmr_q);
+	init_waitqueue_head(&cma_xprt->sc_send_wait);
+
+	spin_lock_init(&cma_xprt->sc_lock);
+	spin_lock_init(&cma_xprt->sc_rq_dto_lock);
+	spin_lock_init(&cma_xprt->sc_frmr_q_lock);
+
+	cma_xprt->sc_ord = svcrdma_ord;
+
+	cma_xprt->sc_max_req_size = svcrdma_max_req_size;
+	cma_xprt->sc_max_requests = svcrdma_max_requests;
+	cma_xprt->sc_sq_depth = svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT;
+	atomic_set(&cma_xprt->sc_sq_count, 0);
+	atomic_set(&cma_xprt->sc_ctxt_used, 0);
+
+	if (listener)
+		set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags);
+
+	return cma_xprt;
+}
+
+struct page *svc_rdma_get_page(void)
+{
+	struct page *page;
+
+	while ((page = alloc_page(GFP_KERNEL)) == NULL) {
+		/* If we can't get memory, wait a bit and try again */
+		printk(KERN_INFO "svcrdma: out of memory...retrying in 1s\n");
+		schedule_timeout_uninterruptible(msecs_to_jiffies(1000));
+	}
+	return page;
+}
+
+int svc_rdma_post_recv(struct svcxprt_rdma *xprt)
+{
+	struct ib_recv_wr recv_wr, *bad_recv_wr;
+	struct svc_rdma_op_ctxt *ctxt;
+	struct page *page;
+	dma_addr_t pa;
+	int sge_no;
+	int buflen;
+	int ret;
+
+	ctxt = svc_rdma_get_context(xprt);
+	buflen = 0;
+	ctxt->direction = DMA_FROM_DEVICE;
+	for (sge_no = 0; buflen < xprt->sc_max_req_size; sge_no++) {
+		BUG_ON(sge_no >= xprt->sc_max_sge);
+		page = svc_rdma_get_page();
+		ctxt->pages[sge_no] = page;
+		pa = ib_dma_map_page(xprt->sc_cm_id->device,
+				     page, 0, PAGE_SIZE,
+				     DMA_FROM_DEVICE);
+		if (ib_dma_mapping_error(xprt->sc_cm_id->device, pa))
+			goto err_put_ctxt;
+		atomic_inc(&xprt->sc_dma_used);
+		ctxt->sge[sge_no].addr = pa;
+		ctxt->sge[sge_no].length = PAGE_SIZE;
+		ctxt->sge[sge_no].lkey = xprt->sc_dma_lkey;
+		ctxt->count = sge_no + 1;
+		buflen += PAGE_SIZE;
+	}
+	recv_wr.next = NULL;
+	recv_wr.sg_list = &ctxt->sge[0];
+	recv_wr.num_sge = ctxt->count;
+	recv_wr.wr_id = (u64)(unsigned long)ctxt;
+
+	svc_xprt_get(&xprt->sc_xprt);
+	ret = ib_post_recv(xprt->sc_qp, &recv_wr, &bad_recv_wr);
+	if (ret) {
+		svc_rdma_unmap_dma(ctxt);
+		svc_rdma_put_context(ctxt, 1);
+		svc_xprt_put(&xprt->sc_xprt);
+	}
+	return ret;
+
+ err_put_ctxt:
+	svc_rdma_unmap_dma(ctxt);
+	svc_rdma_put_context(ctxt, 1);
+	return -ENOMEM;
+}
+
+/*
+ * This function handles the CONNECT_REQUEST event on a listening
+ * endpoint. It is passed the cma_id for the _new_ connection. The context in
+ * this cma_id is inherited from the listening cma_id and is the svc_xprt
+ * structure for the listening endpoint.
+ *
+ * This function creates a new xprt for the new connection and enqueues it on
+ * the accept queue for the listent xprt. When the listen thread is kicked, it
+ * will call the recvfrom method on the listen xprt which will accept the new
+ * connection.
+ */
+static void handle_connect_req(struct rdma_cm_id *new_cma_id, size_t client_ird)
+{
+	struct svcxprt_rdma *listen_xprt = new_cma_id->context;
+	struct svcxprt_rdma *newxprt;
+	struct sockaddr *sa;
+
+	/* Create a new transport */
+	newxprt = rdma_create_xprt(listen_xprt->sc_xprt.xpt_server, 0);
+	if (!newxprt) {
+		dprintk("svcrdma: failed to create new transport\n");
+		return;
+	}
+	newxprt->sc_cm_id = new_cma_id;
+	new_cma_id->context = newxprt;
+	dprintk("svcrdma: Creating newxprt=%p, cm_id=%p, listenxprt=%p\n",
+		newxprt, newxprt->sc_cm_id, listen_xprt);
+
+	/* Save client advertised inbound read limit for use later in accept. */
+	newxprt->sc_ord = client_ird;
+
+	/* Set the local and remote addresses in the transport */
+	sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr;
+	svc_xprt_set_remote(&newxprt->sc_xprt, sa, svc_addr_len(sa));
+	sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.src_addr;
+	svc_xprt_set_local(&newxprt->sc_xprt, sa, svc_addr_len(sa));
+
+	/*
+	 * Enqueue the new transport on the accept queue of the listening
+	 * transport
+	 */
+	spin_lock_bh(&listen_xprt->sc_lock);
+	list_add_tail(&newxprt->sc_accept_q, &listen_xprt->sc_accept_q);
+	spin_unlock_bh(&listen_xprt->sc_lock);
+
+	set_bit(XPT_CONN, &listen_xprt->sc_xprt.xpt_flags);
+	svc_xprt_enqueue(&listen_xprt->sc_xprt);
+}
+
+/*
+ * Handles events generated on the listening endpoint. These events will be
+ * either be incoming connect requests or adapter removal  events.
+ */
+static int rdma_listen_handler(struct rdma_cm_id *cma_id,
+			       struct rdma_cm_event *event)
+{
+	struct svcxprt_rdma *xprt = cma_id->context;
+	int ret = 0;
+
+	switch (event->event) {
+	case RDMA_CM_EVENT_CONNECT_REQUEST:
+		dprintk("svcrdma: Connect request on cma_id=%p, xprt = %p, "
+			"event=%d\n", cma_id, cma_id->context, event->event);
+		handle_connect_req(cma_id,
+				   event->param.conn.initiator_depth);
+		break;
+
+	case RDMA_CM_EVENT_ESTABLISHED:
+		/* Accept complete */
+		dprintk("svcrdma: Connection completed on LISTEN xprt=%p, "
+			"cm_id=%p\n", xprt, cma_id);
+		break;
+
+	case RDMA_CM_EVENT_DEVICE_REMOVAL:
+		dprintk("svcrdma: Device removal xprt=%p, cm_id=%p\n",
+			xprt, cma_id);
+		if (xprt)
+			set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
+		break;
+
+	default:
+		dprintk("svcrdma: Unexpected event on listening endpoint %p, "
+			"event=%d\n", cma_id, event->event);
+		break;
+	}
+
+	return ret;
+}
+
+static int rdma_cma_handler(struct rdma_cm_id *cma_id,
+			    struct rdma_cm_event *event)
+{
+	struct svc_xprt *xprt = cma_id->context;
+	struct svcxprt_rdma *rdma =
+		container_of(xprt, struct svcxprt_rdma, sc_xprt);
+	switch (event->event) {
+	case RDMA_CM_EVENT_ESTABLISHED:
+		/* Accept complete */
+		svc_xprt_get(xprt);
+		dprintk("svcrdma: Connection completed on DTO xprt=%p, "
+			"cm_id=%p\n", xprt, cma_id);
+		clear_bit(RDMAXPRT_CONN_PENDING, &rdma->sc_flags);
+		svc_xprt_enqueue(xprt);
+		break;
+	case RDMA_CM_EVENT_DISCONNECTED:
+		dprintk("svcrdma: Disconnect on DTO xprt=%p, cm_id=%p\n",
+			xprt, cma_id);
+		if (xprt) {
+			set_bit(XPT_CLOSE, &xprt->xpt_flags);
+			svc_xprt_enqueue(xprt);
+			svc_xprt_put(xprt);
+		}
+		break;
+	case RDMA_CM_EVENT_DEVICE_REMOVAL:
+		dprintk("svcrdma: Device removal cma_id=%p, xprt = %p, "
+			"event=%d\n", cma_id, xprt, event->event);
+		if (xprt) {
+			set_bit(XPT_CLOSE, &xprt->xpt_flags);
+			svc_xprt_enqueue(xprt);
+		}
+		break;
+	default:
+		dprintk("svcrdma: Unexpected event on DTO endpoint %p, "
+			"event=%d\n", cma_id, event->event);
+		break;
+	}
+	return 0;
+}
+
+/*
+ * Create a listening RDMA service endpoint.
+ */
+static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
+					struct net *net,
+					struct sockaddr *sa, int salen,
+					int flags)
+{
+	struct rdma_cm_id *listen_id;
+	struct svcxprt_rdma *cma_xprt;
+	struct svc_xprt *xprt;
+	int ret;
+
+	dprintk("svcrdma: Creating RDMA socket\n");
+	if (sa->sa_family != AF_INET) {
+		dprintk("svcrdma: Address family %d is not supported.\n", sa->sa_family);
+		return ERR_PTR(-EAFNOSUPPORT);
+	}
+	cma_xprt = rdma_create_xprt(serv, 1);
+	if (!cma_xprt)
+		return ERR_PTR(-ENOMEM);
+	xprt = &cma_xprt->sc_xprt;
+
+	listen_id = rdma_create_id(rdma_listen_handler, cma_xprt, RDMA_PS_TCP,
+				   IB_QPT_RC);
+	if (IS_ERR(listen_id)) {
+		ret = PTR_ERR(listen_id);
+		dprintk("svcrdma: rdma_create_id failed = %d\n", ret);
+		goto err0;
+	}
+
+	ret = rdma_bind_addr(listen_id, sa);
+	if (ret) {
+		dprintk("svcrdma: rdma_bind_addr failed = %d\n", ret);
+		goto err1;
+	}
+	cma_xprt->sc_cm_id = listen_id;
+
+	ret = rdma_listen(listen_id, RPCRDMA_LISTEN_BACKLOG);
+	if (ret) {
+		dprintk("svcrdma: rdma_listen failed = %d\n", ret);
+		goto err1;
+	}
+
+	/*
+	 * We need to use the address from the cm_id in case the
+	 * caller specified 0 for the port number.
+	 */
+	sa = (struct sockaddr *)&cma_xprt->sc_cm_id->route.addr.src_addr;
+	svc_xprt_set_local(&cma_xprt->sc_xprt, sa, salen);
+
+	return &cma_xprt->sc_xprt;
+
+ err1:
+	rdma_destroy_id(listen_id);
+ err0:
+	kfree(cma_xprt);
+	return ERR_PTR(ret);
+}
+
+static struct svc_rdma_fastreg_mr *rdma_alloc_frmr(struct svcxprt_rdma *xprt)
+{
+	struct ib_mr *mr;
+	struct ib_fast_reg_page_list *pl;
+	struct svc_rdma_fastreg_mr *frmr;
+
+	frmr = kmalloc(sizeof(*frmr), GFP_KERNEL);
+	if (!frmr)
+		goto err;
+
+	mr = ib_alloc_fast_reg_mr(xprt->sc_pd, RPCSVC_MAXPAGES);
+	if (IS_ERR(mr))
+		goto err_free_frmr;
+
+	pl = ib_alloc_fast_reg_page_list(xprt->sc_cm_id->device,
+					 RPCSVC_MAXPAGES);
+	if (IS_ERR(pl))
+		goto err_free_mr;
+
+	frmr->mr = mr;
+	frmr->page_list = pl;
+	INIT_LIST_HEAD(&frmr->frmr_list);
+	return frmr;
+
+ err_free_mr:
+	ib_dereg_mr(mr);
+ err_free_frmr:
+	kfree(frmr);
+ err:
+	return ERR_PTR(-ENOMEM);
+}
+
+static void rdma_dealloc_frmr_q(struct svcxprt_rdma *xprt)
+{
+	struct svc_rdma_fastreg_mr *frmr;
+
+	while (!list_empty(&xprt->sc_frmr_q)) {
+		frmr = list_entry(xprt->sc_frmr_q.next,
+				  struct svc_rdma_fastreg_mr, frmr_list);
+		list_del_init(&frmr->frmr_list);
+		ib_dereg_mr(frmr->mr);
+		ib_free_fast_reg_page_list(frmr->page_list);
+		kfree(frmr);
+	}
+}
+
+struct svc_rdma_fastreg_mr *svc_rdma_get_frmr(struct svcxprt_rdma *rdma)
+{
+	struct svc_rdma_fastreg_mr *frmr = NULL;
+
+	spin_lock_bh(&rdma->sc_frmr_q_lock);
+	if (!list_empty(&rdma->sc_frmr_q)) {
+		frmr = list_entry(rdma->sc_frmr_q.next,
+				  struct svc_rdma_fastreg_mr, frmr_list);
+		list_del_init(&frmr->frmr_list);
+		frmr->map_len = 0;
+		frmr->page_list_len = 0;
+	}
+	spin_unlock_bh(&rdma->sc_frmr_q_lock);
+	if (frmr)
+		return frmr;
+
+	return rdma_alloc_frmr(rdma);
+}
+
+static void frmr_unmap_dma(struct svcxprt_rdma *xprt,
+			   struct svc_rdma_fastreg_mr *frmr)
+{
+	int page_no;
+	for (page_no = 0; page_no < frmr->page_list_len; page_no++) {
+		dma_addr_t addr = frmr->page_list->page_list[page_no];
+		if (ib_dma_mapping_error(frmr->mr->device, addr))
+			continue;
+		atomic_dec(&xprt->sc_dma_used);
+		ib_dma_unmap_page(frmr->mr->device, addr, PAGE_SIZE,
+				  frmr->direction);
+	}
+}
+
+void svc_rdma_put_frmr(struct svcxprt_rdma *rdma,
+		       struct svc_rdma_fastreg_mr *frmr)
+{
+	if (frmr) {
+		frmr_unmap_dma(rdma, frmr);
+		spin_lock_bh(&rdma->sc_frmr_q_lock);
+		BUG_ON(!list_empty(&frmr->frmr_list));
+		list_add(&frmr->frmr_list, &rdma->sc_frmr_q);
+		spin_unlock_bh(&rdma->sc_frmr_q_lock);
+	}
+}
+
+/*
+ * This is the xpo_recvfrom function for listening endpoints. Its
+ * purpose is to accept incoming connections. The CMA callback handler
+ * has already created a new transport and attached it to the new CMA
+ * ID.
+ *
+ * There is a queue of pending connections hung on the listening
+ * transport. This queue contains the new svc_xprt structure. This
+ * function takes svc_xprt structures off the accept_q and completes
+ * the connection.
+ */
+static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
+{
+	struct svcxprt_rdma *listen_rdma;
+	struct svcxprt_rdma *newxprt = NULL;
+	struct rdma_conn_param conn_param;
+	struct ib_qp_init_attr qp_attr;
+	struct ib_device_attr devattr;
+	int uninitialized_var(dma_mr_acc);
+	int need_dma_mr;
+	int ret;
+	int i;
+
+	listen_rdma = container_of(xprt, struct svcxprt_rdma, sc_xprt);
+	clear_bit(XPT_CONN, &xprt->xpt_flags);
+	/* Get the next entry off the accept list */
+	spin_lock_bh(&listen_rdma->sc_lock);
+	if (!list_empty(&listen_rdma->sc_accept_q)) {
+		newxprt = list_entry(listen_rdma->sc_accept_q.next,
+				     struct svcxprt_rdma, sc_accept_q);
+		list_del_init(&newxprt->sc_accept_q);
+	}
+	if (!list_empty(&listen_rdma->sc_accept_q))
+		set_bit(XPT_CONN, &listen_rdma->sc_xprt.xpt_flags);
+	spin_unlock_bh(&listen_rdma->sc_lock);
+	if (!newxprt)
+		return NULL;
+
+	dprintk("svcrdma: newxprt from accept queue = %p, cm_id=%p\n",
+		newxprt, newxprt->sc_cm_id);
+
+	ret = ib_query_device(newxprt->sc_cm_id->device, &devattr);
+	if (ret) {
+		dprintk("svcrdma: could not query device attributes on "
+			"device %p, rc=%d\n", newxprt->sc_cm_id->device, ret);
+		goto errout;
+	}
+
+	/* Qualify the transport resource defaults with the
+	 * capabilities of this particular device */
+	newxprt->sc_max_sge = min((size_t)devattr.max_sge,
+				  (size_t)RPCSVC_MAXPAGES);
+	newxprt->sc_max_requests = min((size_t)devattr.max_qp_wr,
+				   (size_t)svcrdma_max_requests);
+	newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_max_requests;
+
+	/*
+	 * Limit ORD based on client limit, local device limit, and
+	 * configured svcrdma limit.
+	 */
+	newxprt->sc_ord = min_t(size_t, devattr.max_qp_rd_atom, newxprt->sc_ord);
+	newxprt->sc_ord = min_t(size_t,	svcrdma_ord, newxprt->sc_ord);
+
+	newxprt->sc_pd = ib_alloc_pd(newxprt->sc_cm_id->device);
+	if (IS_ERR(newxprt->sc_pd)) {
+		dprintk("svcrdma: error creating PD for connect request\n");
+		goto errout;
+	}
+	newxprt->sc_sq_cq = ib_create_cq(newxprt->sc_cm_id->device,
+					 sq_comp_handler,
+					 cq_event_handler,
+					 newxprt,
+					 newxprt->sc_sq_depth,
+					 0);
+	if (IS_ERR(newxprt->sc_sq_cq)) {
+		dprintk("svcrdma: error creating SQ CQ for connect request\n");
+		goto errout;
+	}
+	newxprt->sc_rq_cq = ib_create_cq(newxprt->sc_cm_id->device,
+					 rq_comp_handler,
+					 cq_event_handler,
+					 newxprt,
+					 newxprt->sc_max_requests,
+					 0);
+	if (IS_ERR(newxprt->sc_rq_cq)) {
+		dprintk("svcrdma: error creating RQ CQ for connect request\n");
+		goto errout;
+	}
+
+	memset(&qp_attr, 0, sizeof qp_attr);
+	qp_attr.event_handler = qp_event_handler;
+	qp_attr.qp_context = &newxprt->sc_xprt;
+	qp_attr.cap.max_send_wr = newxprt->sc_sq_depth;
+	qp_attr.cap.max_recv_wr = newxprt->sc_max_requests;
+	qp_attr.cap.max_send_sge = newxprt->sc_max_sge;
+	qp_attr.cap.max_recv_sge = newxprt->sc_max_sge;
+	qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
+	qp_attr.qp_type = IB_QPT_RC;
+	qp_attr.send_cq = newxprt->sc_sq_cq;
+	qp_attr.recv_cq = newxprt->sc_rq_cq;
+	dprintk("svcrdma: newxprt->sc_cm_id=%p, newxprt->sc_pd=%p\n"
+		"    cm_id->device=%p, sc_pd->device=%p\n"
+		"    cap.max_send_wr = %d\n"
+		"    cap.max_recv_wr = %d\n"
+		"    cap.max_send_sge = %d\n"
+		"    cap.max_recv_sge = %d\n",
+		newxprt->sc_cm_id, newxprt->sc_pd,
+		newxprt->sc_cm_id->device, newxprt->sc_pd->device,
+		qp_attr.cap.max_send_wr,
+		qp_attr.cap.max_recv_wr,
+		qp_attr.cap.max_send_sge,
+		qp_attr.cap.max_recv_sge);
+
+	ret = rdma_create_qp(newxprt->sc_cm_id, newxprt->sc_pd, &qp_attr);
+	if (ret) {
+		dprintk("svcrdma: failed to create QP, ret=%d\n", ret);
+		goto errout;
+	}
+	newxprt->sc_qp = newxprt->sc_cm_id->qp;
+
+	/*
+	 * Use the most secure set of MR resources based on the
+	 * transport type and available memory management features in
+	 * the device. Here's the table implemented below:
+	 *
+	 *		Fast	Global	DMA	Remote WR
+	 *		Reg	LKEY	MR	Access
+	 *		Sup'd	Sup'd	Needed	Needed
+	 *
+	 * IWARP	N	N	Y	Y
+	 *		N	Y	Y	Y
+	 *		Y	N	Y	N
+	 *		Y	Y	N	-
+	 *
+	 * IB		N	N	Y	N
+	 *		N	Y	N	-
+	 *		Y	N	Y	N
+	 *		Y	Y	N	-
+	 *
+	 * NB:	iWARP requires remote write access for the data sink
+	 *	of an RDMA_READ. IB does not.
+	 */
+	if (devattr.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) {
+		newxprt->sc_frmr_pg_list_len =
+			devattr.max_fast_reg_page_list_len;
+		newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_FAST_REG;
+	}
+
+	/*
+	 * Determine if a DMA MR is required and if so, what privs are required
+	 */
+	switch (rdma_node_get_transport(newxprt->sc_cm_id->device->node_type)) {
+	case RDMA_TRANSPORT_IWARP:
+		newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_READ_W_INV;
+		if (!(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG)) {
+			need_dma_mr = 1;
+			dma_mr_acc =
+				(IB_ACCESS_LOCAL_WRITE |
+				 IB_ACCESS_REMOTE_WRITE);
+		} else if (!(devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)) {
+			need_dma_mr = 1;
+			dma_mr_acc = IB_ACCESS_LOCAL_WRITE;
+		} else
+			need_dma_mr = 0;
+		break;
+	case RDMA_TRANSPORT_IB:
+		if (!(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG)) {
+			need_dma_mr = 1;
+			dma_mr_acc = IB_ACCESS_LOCAL_WRITE;
+		} else if (!(devattr.device_cap_flags &
+			     IB_DEVICE_LOCAL_DMA_LKEY)) {
+			need_dma_mr = 1;
+			dma_mr_acc = IB_ACCESS_LOCAL_WRITE;
+		} else
+			need_dma_mr = 0;
+		break;
+	default:
+		goto errout;
+	}
+
+	/* Create the DMA MR if needed, otherwise, use the DMA LKEY */
+	if (need_dma_mr) {
+		/* Register all of physical memory */
+		newxprt->sc_phys_mr =
+			ib_get_dma_mr(newxprt->sc_pd, dma_mr_acc);
+		if (IS_ERR(newxprt->sc_phys_mr)) {
+			dprintk("svcrdma: Failed to create DMA MR ret=%d\n",
+				ret);
+			goto errout;
+		}
+		newxprt->sc_dma_lkey = newxprt->sc_phys_mr->lkey;
+	} else
+		newxprt->sc_dma_lkey =
+			newxprt->sc_cm_id->device->local_dma_lkey;
+
+	/* Post receive buffers */
+	for (i = 0; i < newxprt->sc_max_requests; i++) {
+		ret = svc_rdma_post_recv(newxprt);
+		if (ret) {
+			dprintk("svcrdma: failure posting receive buffers\n");
+			goto errout;
+		}
+	}
+
+	/* Swap out the handler */
+	newxprt->sc_cm_id->event_handler = rdma_cma_handler;
+
+	/*
+	 * Arm the CQs for the SQ and RQ before accepting so we can't
+	 * miss the first message
+	 */
+	ib_req_notify_cq(newxprt->sc_sq_cq, IB_CQ_NEXT_COMP);
+	ib_req_notify_cq(newxprt->sc_rq_cq, IB_CQ_NEXT_COMP);
+
+	/* Accept Connection */
+	set_bit(RDMAXPRT_CONN_PENDING, &newxprt->sc_flags);
+	memset(&conn_param, 0, sizeof conn_param);
+	conn_param.responder_resources = 0;
+	conn_param.initiator_depth = newxprt->sc_ord;
+	ret = rdma_accept(newxprt->sc_cm_id, &conn_param);
+	if (ret) {
+		dprintk("svcrdma: failed to accept new connection, ret=%d\n",
+		       ret);
+		goto errout;
+	}
+
+	dprintk("svcrdma: new connection %p accepted with the following "
+		"attributes:\n"
+		"    local_ip        : %pI4\n"
+		"    local_port	     : %d\n"
+		"    remote_ip       : %pI4\n"
+		"    remote_port     : %d\n"
+		"    max_sge         : %d\n"
+		"    sq_depth        : %d\n"
+		"    max_requests    : %d\n"
+		"    ord             : %d\n",
+		newxprt,
+		&((struct sockaddr_in *)&newxprt->sc_cm_id->
+			 route.addr.src_addr)->sin_addr.s_addr,
+		ntohs(((struct sockaddr_in *)&newxprt->sc_cm_id->
+		       route.addr.src_addr)->sin_port),
+		&((struct sockaddr_in *)&newxprt->sc_cm_id->
+			 route.addr.dst_addr)->sin_addr.s_addr,
+		ntohs(((struct sockaddr_in *)&newxprt->sc_cm_id->
+		       route.addr.dst_addr)->sin_port),
+		newxprt->sc_max_sge,
+		newxprt->sc_sq_depth,
+		newxprt->sc_max_requests,
+		newxprt->sc_ord);
+
+	return &newxprt->sc_xprt;
+
+ errout:
+	dprintk("svcrdma: failure accepting new connection rc=%d.\n", ret);
+	/* Take a reference in case the DTO handler runs */
+	svc_xprt_get(&newxprt->sc_xprt);
+	if (newxprt->sc_qp && !IS_ERR(newxprt->sc_qp))
+		ib_destroy_qp(newxprt->sc_qp);
+	rdma_destroy_id(newxprt->sc_cm_id);
+	/* This call to put will destroy the transport */
+	svc_xprt_put(&newxprt->sc_xprt);
+	return NULL;
+}
+
+static void svc_rdma_release_rqst(struct svc_rqst *rqstp)
+{
+}
+
+/*
+ * When connected, an svc_xprt has at least two references:
+ *
+ * - A reference held by the cm_id between the ESTABLISHED and
+ *   DISCONNECTED events. If the remote peer disconnected first, this
+ *   reference could be gone.
+ *
+ * - A reference held by the svc_recv code that called this function
+ *   as part of close processing.
+ *
+ * At a minimum one references should still be held.
+ */
+static void svc_rdma_detach(struct svc_xprt *xprt)
+{
+	struct svcxprt_rdma *rdma =
+		container_of(xprt, struct svcxprt_rdma, sc_xprt);
+	dprintk("svc: svc_rdma_detach(%p)\n", xprt);
+
+	/* Disconnect and flush posted WQE */
+	rdma_disconnect(rdma->sc_cm_id);
+}
+
+static void __svc_rdma_free(struct work_struct *work)
+{
+	struct svcxprt_rdma *rdma =
+		container_of(work, struct svcxprt_rdma, sc_work);
+	dprintk("svcrdma: svc_rdma_free(%p)\n", rdma);
+
+	/* We should only be called from kref_put */
+	BUG_ON(atomic_read(&rdma->sc_xprt.xpt_ref.refcount) != 0);
+
+	/*
+	 * Destroy queued, but not processed read completions. Note
+	 * that this cleanup has to be done before destroying the
+	 * cm_id because the device ptr is needed to unmap the dma in
+	 * svc_rdma_put_context.
+	 */
+	while (!list_empty(&rdma->sc_read_complete_q)) {
+		struct svc_rdma_op_ctxt *ctxt;
+		ctxt = list_entry(rdma->sc_read_complete_q.next,
+				  struct svc_rdma_op_ctxt,
+				  dto_q);
+		list_del_init(&ctxt->dto_q);
+		svc_rdma_put_context(ctxt, 1);
+	}
+
+	/* Destroy queued, but not processed recv completions */
+	while (!list_empty(&rdma->sc_rq_dto_q)) {
+		struct svc_rdma_op_ctxt *ctxt;
+		ctxt = list_entry(rdma->sc_rq_dto_q.next,
+				  struct svc_rdma_op_ctxt,
+				  dto_q);
+		list_del_init(&ctxt->dto_q);
+		svc_rdma_put_context(ctxt, 1);
+	}
+
+	/* Warn if we leaked a resource or under-referenced */
+	WARN_ON(atomic_read(&rdma->sc_ctxt_used) != 0);
+	WARN_ON(atomic_read(&rdma->sc_dma_used) != 0);
+
+	/* De-allocate fastreg mr */
+	rdma_dealloc_frmr_q(rdma);
+
+	/* Destroy the QP if present (not a listener) */
+	if (rdma->sc_qp && !IS_ERR(rdma->sc_qp))
+		ib_destroy_qp(rdma->sc_qp);
+
+	if (rdma->sc_sq_cq && !IS_ERR(rdma->sc_sq_cq))
+		ib_destroy_cq(rdma->sc_sq_cq);
+
+	if (rdma->sc_rq_cq && !IS_ERR(rdma->sc_rq_cq))
+		ib_destroy_cq(rdma->sc_rq_cq);
+
+	if (rdma->sc_phys_mr && !IS_ERR(rdma->sc_phys_mr))
+		ib_dereg_mr(rdma->sc_phys_mr);
+
+	if (rdma->sc_pd && !IS_ERR(rdma->sc_pd))
+		ib_dealloc_pd(rdma->sc_pd);
+
+	/* Destroy the CM ID */
+	rdma_destroy_id(rdma->sc_cm_id);
+
+	kfree(rdma);
+}
+
+static void svc_rdma_free(struct svc_xprt *xprt)
+{
+	struct svcxprt_rdma *rdma =
+		container_of(xprt, struct svcxprt_rdma, sc_xprt);
+	INIT_WORK(&rdma->sc_work, __svc_rdma_free);
+	queue_work(svc_rdma_wq, &rdma->sc_work);
+}
+
+static int svc_rdma_has_wspace(struct svc_xprt *xprt)
+{
+	struct svcxprt_rdma *rdma =
+		container_of(xprt, struct svcxprt_rdma, sc_xprt);
+
+	/*
+	 * If there are already waiters on the SQ,
+	 * return false.
+	 */
+	if (waitqueue_active(&rdma->sc_send_wait))
+		return 0;
+
+	/* Otherwise return true. */
+	return 1;
+}
+
+static int svc_rdma_secure_port(struct svc_rqst *rqstp)
+{
+	return 1;
+}
+
+/*
+ * Attempt to register the kvec representing the RPC memory with the
+ * device.
+ *
+ * Returns:
+ *  NULL : The device does not support fastreg or there were no more
+ *         fastreg mr.
+ *  frmr : The kvec register request was successfully posted.
+ *    <0 : An error was encountered attempting to register the kvec.
+ */
+int svc_rdma_fastreg(struct svcxprt_rdma *xprt,
+		     struct svc_rdma_fastreg_mr *frmr)
+{
+	struct ib_send_wr fastreg_wr;
+	u8 key;
+
+	/* Bump the key */
+	key = (u8)(frmr->mr->lkey & 0x000000FF);
+	ib_update_fast_reg_key(frmr->mr, ++key);
+
+	/* Prepare FASTREG WR */
+	memset(&fastreg_wr, 0, sizeof fastreg_wr);
+	fastreg_wr.opcode = IB_WR_FAST_REG_MR;
+	fastreg_wr.send_flags = IB_SEND_SIGNALED;
+	fastreg_wr.wr.fast_reg.iova_start = (unsigned long)frmr->kva;
+	fastreg_wr.wr.fast_reg.page_list = frmr->page_list;
+	fastreg_wr.wr.fast_reg.page_list_len = frmr->page_list_len;
+	fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
+	fastreg_wr.wr.fast_reg.length = frmr->map_len;
+	fastreg_wr.wr.fast_reg.access_flags = frmr->access_flags;
+	fastreg_wr.wr.fast_reg.rkey = frmr->mr->lkey;
+	return svc_rdma_send(xprt, &fastreg_wr);
+}
+
+int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr)
+{
+	struct ib_send_wr *bad_wr, *n_wr;
+	int wr_count;
+	int i;
+	int ret;
+
+	if (test_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags))
+		return -ENOTCONN;
+
+	BUG_ON(wr->send_flags != IB_SEND_SIGNALED);
+	wr_count = 1;
+	for (n_wr = wr->next; n_wr; n_wr = n_wr->next)
+		wr_count++;
+
+	/* If the SQ is full, wait until an SQ entry is available */
+	while (1) {
+		spin_lock_bh(&xprt->sc_lock);
+		if (xprt->sc_sq_depth < atomic_read(&xprt->sc_sq_count) + wr_count) {
+			spin_unlock_bh(&xprt->sc_lock);
+			atomic_inc(&rdma_stat_sq_starve);
+
+			/* See if we can opportunistically reap SQ WR to make room */
+			sq_cq_reap(xprt);
+
+			/* Wait until SQ WR available if SQ still full */
+			wait_event(xprt->sc_send_wait,
+				   atomic_read(&xprt->sc_sq_count) <
+				   xprt->sc_sq_depth);
+			if (test_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags))
+				return -ENOTCONN;
+			continue;
+		}
+		/* Take a transport ref for each WR posted */
+		for (i = 0; i < wr_count; i++)
+			svc_xprt_get(&xprt->sc_xprt);
+
+		/* Bump used SQ WR count and post */
+		atomic_add(wr_count, &xprt->sc_sq_count);
+		ret = ib_post_send(xprt->sc_qp, wr, &bad_wr);
+		if (ret) {
+			set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
+			atomic_sub(wr_count, &xprt->sc_sq_count);
+			for (i = 0; i < wr_count; i ++)
+				svc_xprt_put(&xprt->sc_xprt);
+			dprintk("svcrdma: failed to post SQ WR rc=%d, "
+			       "sc_sq_count=%d, sc_sq_depth=%d\n",
+			       ret, atomic_read(&xprt->sc_sq_count),
+			       xprt->sc_sq_depth);
+		}
+		spin_unlock_bh(&xprt->sc_lock);
+		if (ret)
+			wake_up(&xprt->sc_send_wait);
+		break;
+	}
+	return ret;
+}
+
+void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp,
+			 enum rpcrdma_errcode err)
+{
+	struct ib_send_wr err_wr;
+	struct page *p;
+	struct svc_rdma_op_ctxt *ctxt;
+	u32 *va;
+	int length;
+	int ret;
+
+	p = svc_rdma_get_page();
+	va = page_address(p);
+
+	/* XDR encode error */
+	length = svc_rdma_xdr_encode_error(xprt, rmsgp, err, va);
+
+	ctxt = svc_rdma_get_context(xprt);
+	ctxt->direction = DMA_FROM_DEVICE;
+	ctxt->count = 1;
+	ctxt->pages[0] = p;
+
+	/* Prepare SGE for local address */
+	ctxt->sge[0].addr = ib_dma_map_page(xprt->sc_cm_id->device,
+					    p, 0, length, DMA_FROM_DEVICE);
+	if (ib_dma_mapping_error(xprt->sc_cm_id->device, ctxt->sge[0].addr)) {
+		put_page(p);
+		svc_rdma_put_context(ctxt, 1);
+		return;
+	}
+	atomic_inc(&xprt->sc_dma_used);
+	ctxt->sge[0].lkey = xprt->sc_dma_lkey;
+	ctxt->sge[0].length = length;
+
+	/* Prepare SEND WR */
+	memset(&err_wr, 0, sizeof err_wr);
+	ctxt->wr_op = IB_WR_SEND;
+	err_wr.wr_id = (unsigned long)ctxt;
+	err_wr.sg_list = ctxt->sge;
+	err_wr.num_sge = 1;
+	err_wr.opcode = IB_WR_SEND;
+	err_wr.send_flags = IB_SEND_SIGNALED;
+
+	/* Post It */
+	ret = svc_rdma_send(xprt, &err_wr);
+	if (ret) {
+		dprintk("svcrdma: Error %d posting send for protocol error\n",
+			ret);
+		svc_rdma_unmap_dma(ctxt);
+		svc_rdma_put_context(ctxt, 1);
+	}
+}
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
new file mode 100644
index 0000000..6a4615d
--- /dev/null
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -0,0 +1,747 @@
+/*
+ * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the BSD-type
+ * license below:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *      Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *
+ *      Redistributions in binary form must reproduce the above
+ *      copyright notice, this list of conditions and the following
+ *      disclaimer in the documentation and/or other materials provided
+ *      with the distribution.
+ *
+ *      Neither the name of the Network Appliance, Inc. nor the names of
+ *      its contributors may be used to endorse or promote products
+ *      derived from this software without specific prior written
+ *      permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * transport.c
+ *
+ * This file contains the top-level implementation of an RPC RDMA
+ * transport.
+ *
+ * Naming convention: functions beginning with xprt_ are part of the
+ * transport switch. All others are RPC RDMA internal.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/seq_file.h>
+#include <linux/sunrpc/addr.h>
+
+#include "xprt_rdma.h"
+
+#ifdef RPC_DEBUG
+# define RPCDBG_FACILITY	RPCDBG_TRANS
+#endif
+
+MODULE_LICENSE("Dual BSD/GPL");
+
+MODULE_DESCRIPTION("RPC/RDMA Transport for Linux kernel NFS");
+MODULE_AUTHOR("Network Appliance, Inc.");
+
+/*
+ * tunables
+ */
+
+static unsigned int xprt_rdma_slot_table_entries = RPCRDMA_DEF_SLOT_TABLE;
+static unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE;
+static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE;
+static unsigned int xprt_rdma_inline_write_padding;
+static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR;
+                int xprt_rdma_pad_optimize = 0;
+
+#ifdef RPC_DEBUG
+
+static unsigned int min_slot_table_size = RPCRDMA_MIN_SLOT_TABLE;
+static unsigned int max_slot_table_size = RPCRDMA_MAX_SLOT_TABLE;
+static unsigned int zero;
+static unsigned int max_padding = PAGE_SIZE;
+static unsigned int min_memreg = RPCRDMA_BOUNCEBUFFERS;
+static unsigned int max_memreg = RPCRDMA_LAST - 1;
+
+static struct ctl_table_header *sunrpc_table_header;
+
+static struct ctl_table xr_tunables_table[] = {
+	{
+		.procname	= "rdma_slot_table_entries",
+		.data		= &xprt_rdma_slot_table_entries,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &min_slot_table_size,
+		.extra2		= &max_slot_table_size
+	},
+	{
+		.procname	= "rdma_max_inline_read",
+		.data		= &xprt_rdma_max_inline_read,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "rdma_max_inline_write",
+		.data		= &xprt_rdma_max_inline_write,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "rdma_inline_write_padding",
+		.data		= &xprt_rdma_inline_write_padding,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &max_padding,
+	},
+	{
+		.procname	= "rdma_memreg_strategy",
+		.data		= &xprt_rdma_memreg_strategy,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &min_memreg,
+		.extra2		= &max_memreg,
+	},
+	{
+		.procname	= "rdma_pad_optimize",
+		.data		= &xprt_rdma_pad_optimize,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{ },
+};
+
+static struct ctl_table sunrpc_table[] = {
+	{
+		.procname	= "sunrpc",
+		.mode		= 0555,
+		.child		= xr_tunables_table
+	},
+	{ },
+};
+
+#endif
+
+#define RPCRDMA_BIND_TO		(60U * HZ)
+#define RPCRDMA_INIT_REEST_TO	(5U * HZ)
+#define RPCRDMA_MAX_REEST_TO	(30U * HZ)
+#define RPCRDMA_IDLE_DISC_TO	(5U * 60 * HZ)
+
+static struct rpc_xprt_ops xprt_rdma_procs;	/* forward reference */
+
+static void
+xprt_rdma_format_addresses(struct rpc_xprt *xprt)
+{
+	struct sockaddr *sap = (struct sockaddr *)
+					&rpcx_to_rdmad(xprt).addr;
+	struct sockaddr_in *sin = (struct sockaddr_in *)sap;
+	char buf[64];
+
+	(void)rpc_ntop(sap, buf, sizeof(buf));
+	xprt->address_strings[RPC_DISPLAY_ADDR] = kstrdup(buf, GFP_KERNEL);
+
+	snprintf(buf, sizeof(buf), "%u", rpc_get_port(sap));
+	xprt->address_strings[RPC_DISPLAY_PORT] = kstrdup(buf, GFP_KERNEL);
+
+	xprt->address_strings[RPC_DISPLAY_PROTO] = "rdma";
+
+	snprintf(buf, sizeof(buf), "%08x", ntohl(sin->sin_addr.s_addr));
+	xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = kstrdup(buf, GFP_KERNEL);
+
+	snprintf(buf, sizeof(buf), "%4hx", rpc_get_port(sap));
+	xprt->address_strings[RPC_DISPLAY_HEX_PORT] = kstrdup(buf, GFP_KERNEL);
+
+	/* netid */
+	xprt->address_strings[RPC_DISPLAY_NETID] = "rdma";
+}
+
+static void
+xprt_rdma_free_addresses(struct rpc_xprt *xprt)
+{
+	unsigned int i;
+
+	for (i = 0; i < RPC_DISPLAY_MAX; i++)
+		switch (i) {
+		case RPC_DISPLAY_PROTO:
+		case RPC_DISPLAY_NETID:
+			continue;
+		default:
+			kfree(xprt->address_strings[i]);
+		}
+}
+
+static void
+xprt_rdma_connect_worker(struct work_struct *work)
+{
+	struct rpcrdma_xprt *r_xprt =
+		container_of(work, struct rpcrdma_xprt, rdma_connect.work);
+	struct rpc_xprt *xprt = &r_xprt->xprt;
+	int rc = 0;
+
+	xprt_clear_connected(xprt);
+
+	dprintk("RPC:       %s: %sconnect\n", __func__,
+			r_xprt->rx_ep.rep_connected != 0 ? "re" : "");
+	rc = rpcrdma_ep_connect(&r_xprt->rx_ep, &r_xprt->rx_ia);
+	if (rc)
+		xprt_wake_pending_tasks(xprt, rc);
+
+	dprintk("RPC:       %s: exit\n", __func__);
+	xprt_clear_connecting(xprt);
+}
+
+/*
+ * xprt_rdma_destroy
+ *
+ * Destroy the xprt.
+ * Free all memory associated with the object, including its own.
+ * NOTE: none of the *destroy methods free memory for their top-level
+ * objects, even though they may have allocated it (they do free
+ * private memory). It's up to the caller to handle it. In this
+ * case (RDMA transport), all structure memory is inlined with the
+ * struct rpcrdma_xprt.
+ */
+static void
+xprt_rdma_destroy(struct rpc_xprt *xprt)
+{
+	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+
+	dprintk("RPC:       %s: called\n", __func__);
+
+	cancel_delayed_work_sync(&r_xprt->rdma_connect);
+
+	xprt_clear_connected(xprt);
+
+	rpcrdma_buffer_destroy(&r_xprt->rx_buf);
+	rpcrdma_ep_destroy(&r_xprt->rx_ep, &r_xprt->rx_ia);
+	rpcrdma_ia_close(&r_xprt->rx_ia);
+
+	xprt_rdma_free_addresses(xprt);
+
+	xprt_free(xprt);
+
+	dprintk("RPC:       %s: returning\n", __func__);
+
+	module_put(THIS_MODULE);
+}
+
+static const struct rpc_timeout xprt_rdma_default_timeout = {
+	.to_initval = 60 * HZ,
+	.to_maxval = 60 * HZ,
+};
+
+/**
+ * xprt_setup_rdma - Set up transport to use RDMA
+ *
+ * @args: rpc transport arguments
+ */
+static struct rpc_xprt *
+xprt_setup_rdma(struct xprt_create *args)
+{
+	struct rpcrdma_create_data_internal cdata;
+	struct rpc_xprt *xprt;
+	struct rpcrdma_xprt *new_xprt;
+	struct rpcrdma_ep *new_ep;
+	struct sockaddr_in *sin;
+	int rc;
+
+	if (args->addrlen > sizeof(xprt->addr)) {
+		dprintk("RPC:       %s: address too large\n", __func__);
+		return ERR_PTR(-EBADF);
+	}
+
+	xprt = xprt_alloc(args->net, sizeof(struct rpcrdma_xprt),
+			xprt_rdma_slot_table_entries,
+			xprt_rdma_slot_table_entries);
+	if (xprt == NULL) {
+		dprintk("RPC:       %s: couldn't allocate rpcrdma_xprt\n",
+			__func__);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	/* 60 second timeout, no retries */
+	xprt->timeout = &xprt_rdma_default_timeout;
+	xprt->bind_timeout = RPCRDMA_BIND_TO;
+	xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO;
+	xprt->idle_timeout = RPCRDMA_IDLE_DISC_TO;
+
+	xprt->resvport = 0;		/* privileged port not needed */
+	xprt->tsh_size = 0;		/* RPC-RDMA handles framing */
+	xprt->ops = &xprt_rdma_procs;
+
+	/*
+	 * Set up RDMA-specific connect data.
+	 */
+
+	/* Put server RDMA address in local cdata */
+	memcpy(&cdata.addr, args->dstaddr, args->addrlen);
+
+	/* Ensure xprt->addr holds valid server TCP (not RDMA)
+	 * address, for any side protocols which peek at it */
+	xprt->prot = IPPROTO_TCP;
+	xprt->addrlen = args->addrlen;
+	memcpy(&xprt->addr, &cdata.addr, xprt->addrlen);
+
+	sin = (struct sockaddr_in *)&cdata.addr;
+	if (ntohs(sin->sin_port) != 0)
+		xprt_set_bound(xprt);
+
+	dprintk("RPC:       %s: %pI4:%u\n",
+		__func__, &sin->sin_addr.s_addr, ntohs(sin->sin_port));
+
+	/* Set max requests */
+	cdata.max_requests = xprt->max_reqs;
+
+	/* Set some length limits */
+	cdata.rsize = RPCRDMA_MAX_SEGS * PAGE_SIZE; /* RDMA write max */
+	cdata.wsize = RPCRDMA_MAX_SEGS * PAGE_SIZE; /* RDMA read max */
+
+	cdata.inline_wsize = xprt_rdma_max_inline_write;
+	if (cdata.inline_wsize > cdata.wsize)
+		cdata.inline_wsize = cdata.wsize;
+
+	cdata.inline_rsize = xprt_rdma_max_inline_read;
+	if (cdata.inline_rsize > cdata.rsize)
+		cdata.inline_rsize = cdata.rsize;
+
+	cdata.padding = xprt_rdma_inline_write_padding;
+
+	/*
+	 * Create new transport instance, which includes initialized
+	 *  o ia
+	 *  o endpoint
+	 *  o buffers
+	 */
+
+	new_xprt = rpcx_to_rdmax(xprt);
+
+	rc = rpcrdma_ia_open(new_xprt, (struct sockaddr *) &cdata.addr,
+				xprt_rdma_memreg_strategy);
+	if (rc)
+		goto out1;
+
+	/*
+	 * initialize and create ep
+	 */
+	new_xprt->rx_data = cdata;
+	new_ep = &new_xprt->rx_ep;
+	new_ep->rep_remote_addr = cdata.addr;
+
+	rc = rpcrdma_ep_create(&new_xprt->rx_ep,
+				&new_xprt->rx_ia, &new_xprt->rx_data);
+	if (rc)
+		goto out2;
+
+	/*
+	 * Allocate pre-registered send and receive buffers for headers and
+	 * any inline data. Also specify any padding which will be provided
+	 * from a preregistered zero buffer.
+	 */
+	rc = rpcrdma_buffer_create(&new_xprt->rx_buf, new_ep, &new_xprt->rx_ia,
+				&new_xprt->rx_data);
+	if (rc)
+		goto out3;
+
+	/*
+	 * Register a callback for connection events. This is necessary because
+	 * connection loss notification is async. We also catch connection loss
+	 * when reaping receives.
+	 */
+	INIT_DELAYED_WORK(&new_xprt->rdma_connect, xprt_rdma_connect_worker);
+	new_ep->rep_func = rpcrdma_conn_func;
+	new_ep->rep_xprt = xprt;
+
+	xprt_rdma_format_addresses(xprt);
+	xprt->max_payload = rpcrdma_max_payload(new_xprt);
+	dprintk("RPC:       %s: transport data payload maximum: %zu bytes\n",
+		__func__, xprt->max_payload);
+
+	if (!try_module_get(THIS_MODULE))
+		goto out4;
+
+	return xprt;
+
+out4:
+	xprt_rdma_free_addresses(xprt);
+	rc = -EINVAL;
+out3:
+	rpcrdma_ep_destroy(new_ep, &new_xprt->rx_ia);
+out2:
+	rpcrdma_ia_close(&new_xprt->rx_ia);
+out1:
+	xprt_free(xprt);
+	return ERR_PTR(rc);
+}
+
+/*
+ * Close a connection, during shutdown or timeout/reconnect
+ */
+static void
+xprt_rdma_close(struct rpc_xprt *xprt)
+{
+	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+
+	dprintk("RPC:       %s: closing\n", __func__);
+	if (r_xprt->rx_ep.rep_connected > 0)
+		xprt->reestablish_timeout = 0;
+	xprt_disconnect_done(xprt);
+	rpcrdma_ep_disconnect(&r_xprt->rx_ep, &r_xprt->rx_ia);
+}
+
+static void
+xprt_rdma_set_port(struct rpc_xprt *xprt, u16 port)
+{
+	struct sockaddr_in *sap;
+
+	sap = (struct sockaddr_in *)&xprt->addr;
+	sap->sin_port = htons(port);
+	sap = (struct sockaddr_in *)&rpcx_to_rdmad(xprt).addr;
+	sap->sin_port = htons(port);
+	dprintk("RPC:       %s: %u\n", __func__, port);
+}
+
+static void
+xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task)
+{
+	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+
+	if (r_xprt->rx_ep.rep_connected != 0) {
+		/* Reconnect */
+		schedule_delayed_work(&r_xprt->rdma_connect,
+			xprt->reestablish_timeout);
+		xprt->reestablish_timeout <<= 1;
+		if (xprt->reestablish_timeout > RPCRDMA_MAX_REEST_TO)
+			xprt->reestablish_timeout = RPCRDMA_MAX_REEST_TO;
+		else if (xprt->reestablish_timeout < RPCRDMA_INIT_REEST_TO)
+			xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO;
+	} else {
+		schedule_delayed_work(&r_xprt->rdma_connect, 0);
+		if (!RPC_IS_ASYNC(task))
+			flush_delayed_work(&r_xprt->rdma_connect);
+	}
+}
+
+/*
+ * The RDMA allocate/free functions need the task structure as a place
+ * to hide the struct rpcrdma_req, which is necessary for the actual send/recv
+ * sequence. For this reason, the recv buffers are attached to send
+ * buffers for portions of the RPC. Note that the RPC layer allocates
+ * both send and receive buffers in the same call. We may register
+ * the receive buffer portion when using reply chunks.
+ */
+static void *
+xprt_rdma_allocate(struct rpc_task *task, size_t size)
+{
+	struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt;
+	struct rpcrdma_req *req, *nreq;
+
+	req = rpcrdma_buffer_get(&rpcx_to_rdmax(xprt)->rx_buf);
+	if (req == NULL)
+		return NULL;
+
+	if (size > req->rl_size) {
+		dprintk("RPC:       %s: size %zd too large for buffer[%zd]: "
+			"prog %d vers %d proc %d\n",
+			__func__, size, req->rl_size,
+			task->tk_client->cl_prog, task->tk_client->cl_vers,
+			task->tk_msg.rpc_proc->p_proc);
+		/*
+		 * Outgoing length shortage. Our inline write max must have
+		 * been configured to perform direct i/o.
+		 *
+		 * This is therefore a large metadata operation, and the
+		 * allocate call was made on the maximum possible message,
+		 * e.g. containing long filename(s) or symlink data. In
+		 * fact, while these metadata operations *might* carry
+		 * large outgoing payloads, they rarely *do*. However, we
+		 * have to commit to the request here, so reallocate and
+		 * register it now. The data path will never require this
+		 * reallocation.
+		 *
+		 * If the allocation or registration fails, the RPC framework
+		 * will (doggedly) retry.
+		 */
+		if (task->tk_flags & RPC_TASK_SWAPPER)
+			nreq = kmalloc(sizeof *req + size, GFP_ATOMIC);
+		else
+			nreq = kmalloc(sizeof *req + size, GFP_NOFS);
+		if (nreq == NULL)
+			goto outfail;
+
+		if (rpcrdma_register_internal(&rpcx_to_rdmax(xprt)->rx_ia,
+				nreq->rl_base, size + sizeof(struct rpcrdma_req)
+				- offsetof(struct rpcrdma_req, rl_base),
+				&nreq->rl_handle, &nreq->rl_iov)) {
+			kfree(nreq);
+			goto outfail;
+		}
+		rpcx_to_rdmax(xprt)->rx_stats.hardway_register_count += size;
+		nreq->rl_size = size;
+		nreq->rl_niovs = 0;
+		nreq->rl_nchunks = 0;
+		nreq->rl_buffer = (struct rpcrdma_buffer *)req;
+		nreq->rl_reply = req->rl_reply;
+		memcpy(nreq->rl_segments,
+			req->rl_segments, sizeof nreq->rl_segments);
+		/* flag the swap with an unused field */
+		nreq->rl_iov.length = 0;
+		req->rl_reply = NULL;
+		req = nreq;
+	}
+	dprintk("RPC:       %s: size %zd, request 0x%p\n", __func__, size, req);
+	req->rl_connect_cookie = 0;	/* our reserved value */
+	return req->rl_xdr_buf;
+
+outfail:
+	rpcrdma_buffer_put(req);
+	rpcx_to_rdmax(xprt)->rx_stats.failed_marshal_count++;
+	return NULL;
+}
+
+/*
+ * This function returns all RDMA resources to the pool.
+ */
+static void
+xprt_rdma_free(void *buffer)
+{
+	struct rpcrdma_req *req;
+	struct rpcrdma_xprt *r_xprt;
+	struct rpcrdma_rep *rep;
+	int i;
+
+	if (buffer == NULL)
+		return;
+
+	req = container_of(buffer, struct rpcrdma_req, rl_xdr_buf[0]);
+	if (req->rl_iov.length == 0) {	/* see allocate above */
+		r_xprt = container_of(((struct rpcrdma_req *) req->rl_buffer)->rl_buffer,
+				      struct rpcrdma_xprt, rx_buf);
+	} else
+		r_xprt = container_of(req->rl_buffer, struct rpcrdma_xprt, rx_buf);
+	rep = req->rl_reply;
+
+	dprintk("RPC:       %s: called on 0x%p%s\n",
+		__func__, rep, (rep && rep->rr_func) ? " (with waiter)" : "");
+
+	/*
+	 * Finish the deregistration.  The process is considered
+	 * complete when the rr_func vector becomes NULL - this
+	 * was put in place during rpcrdma_reply_handler() - the wait
+	 * call below will not block if the dereg is "done". If
+	 * interrupted, our framework will clean up.
+	 */
+	for (i = 0; req->rl_nchunks;) {
+		--req->rl_nchunks;
+		i += rpcrdma_deregister_external(
+			&req->rl_segments[i], r_xprt);
+	}
+
+	if (req->rl_iov.length == 0) {	/* see allocate above */
+		struct rpcrdma_req *oreq = (struct rpcrdma_req *)req->rl_buffer;
+		oreq->rl_reply = req->rl_reply;
+		(void) rpcrdma_deregister_internal(&r_xprt->rx_ia,
+						   req->rl_handle,
+						   &req->rl_iov);
+		kfree(req);
+		req = oreq;
+	}
+
+	/* Put back request+reply buffers */
+	rpcrdma_buffer_put(req);
+}
+
+/*
+ * send_request invokes the meat of RPC RDMA. It must do the following:
+ *  1.  Marshal the RPC request into an RPC RDMA request, which means
+ *	putting a header in front of data, and creating IOVs for RDMA
+ *	from those in the request.
+ *  2.  In marshaling, detect opportunities for RDMA, and use them.
+ *  3.  Post a recv message to set up asynch completion, then send
+ *	the request (rpcrdma_ep_post).
+ *  4.  No partial sends are possible in the RPC-RDMA protocol (as in UDP).
+ */
+
+static int
+xprt_rdma_send_request(struct rpc_task *task)
+{
+	struct rpc_rqst *rqst = task->tk_rqstp;
+	struct rpc_xprt *xprt = rqst->rq_xprt;
+	struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
+	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+	int rc = 0;
+
+	if (req->rl_niovs == 0)
+		rc = rpcrdma_marshal_req(rqst);
+	else if (r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_FRMR)
+		rc = rpcrdma_marshal_chunks(rqst, 0);
+	if (rc < 0)
+		goto failed_marshal;
+
+	if (req->rl_reply == NULL) 		/* e.g. reconnection */
+		rpcrdma_recv_buffer_get(req);
+
+	if (req->rl_reply) {
+		req->rl_reply->rr_func = rpcrdma_reply_handler;
+		/* this need only be done once, but... */
+		req->rl_reply->rr_xprt = xprt;
+	}
+
+	/* Must suppress retransmit to maintain credits */
+	if (req->rl_connect_cookie == xprt->connect_cookie)
+		goto drop_connection;
+	req->rl_connect_cookie = xprt->connect_cookie;
+
+	if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req))
+		goto drop_connection;
+
+	rqst->rq_xmit_bytes_sent += rqst->rq_snd_buf.len;
+	rqst->rq_bytes_sent = 0;
+	return 0;
+
+failed_marshal:
+	r_xprt->rx_stats.failed_marshal_count++;
+	dprintk("RPC:       %s: rpcrdma_marshal_req failed, status %i\n",
+		__func__, rc);
+	if (rc == -EIO)
+		return -EIO;
+drop_connection:
+	xprt_disconnect_done(xprt);
+	return -ENOTCONN;	/* implies disconnect */
+}
+
+static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
+{
+	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+	long idle_time = 0;
+
+	if (xprt_connected(xprt))
+		idle_time = (long)(jiffies - xprt->last_used) / HZ;
+
+	seq_printf(seq,
+	  "\txprt:\trdma %u %lu %lu %lu %ld %lu %lu %lu %Lu %Lu "
+	  "%lu %lu %lu %Lu %Lu %Lu %Lu %lu %lu %lu\n",
+
+	   0,	/* need a local port? */
+	   xprt->stat.bind_count,
+	   xprt->stat.connect_count,
+	   xprt->stat.connect_time,
+	   idle_time,
+	   xprt->stat.sends,
+	   xprt->stat.recvs,
+	   xprt->stat.bad_xids,
+	   xprt->stat.req_u,
+	   xprt->stat.bklog_u,
+
+	   r_xprt->rx_stats.read_chunk_count,
+	   r_xprt->rx_stats.write_chunk_count,
+	   r_xprt->rx_stats.reply_chunk_count,
+	   r_xprt->rx_stats.total_rdma_request,
+	   r_xprt->rx_stats.total_rdma_reply,
+	   r_xprt->rx_stats.pullup_copy_count,
+	   r_xprt->rx_stats.fixup_copy_count,
+	   r_xprt->rx_stats.hardway_register_count,
+	   r_xprt->rx_stats.failed_marshal_count,
+	   r_xprt->rx_stats.bad_reply_count);
+}
+
+/*
+ * Plumbing for rpc transport switch and kernel module
+ */
+
+static struct rpc_xprt_ops xprt_rdma_procs = {
+	.reserve_xprt		= xprt_reserve_xprt_cong,
+	.release_xprt		= xprt_release_xprt_cong, /* sunrpc/xprt.c */
+	.alloc_slot		= xprt_alloc_slot,
+	.release_request	= xprt_release_rqst_cong,       /* ditto */
+	.set_retrans_timeout	= xprt_set_retrans_timeout_def, /* ditto */
+	.rpcbind		= rpcb_getport_async,	/* sunrpc/rpcb_clnt.c */
+	.set_port		= xprt_rdma_set_port,
+	.connect		= xprt_rdma_connect,
+	.buf_alloc		= xprt_rdma_allocate,
+	.buf_free		= xprt_rdma_free,
+	.send_request		= xprt_rdma_send_request,
+	.close			= xprt_rdma_close,
+	.destroy		= xprt_rdma_destroy,
+	.print_stats		= xprt_rdma_print_stats
+};
+
+static struct xprt_class xprt_rdma = {
+	.list			= LIST_HEAD_INIT(xprt_rdma.list),
+	.name			= "rdma",
+	.owner			= THIS_MODULE,
+	.ident			= XPRT_TRANSPORT_RDMA,
+	.setup			= xprt_setup_rdma,
+};
+
+static void __exit xprt_rdma_cleanup(void)
+{
+	int rc;
+
+	dprintk("RPCRDMA Module Removed, deregister RPC RDMA transport\n");
+#ifdef RPC_DEBUG
+	if (sunrpc_table_header) {
+		unregister_sysctl_table(sunrpc_table_header);
+		sunrpc_table_header = NULL;
+	}
+#endif
+	rc = xprt_unregister_transport(&xprt_rdma);
+	if (rc)
+		dprintk("RPC:       %s: xprt_unregister returned %i\n",
+			__func__, rc);
+}
+
+static int __init xprt_rdma_init(void)
+{
+	int rc;
+
+	rc = xprt_register_transport(&xprt_rdma);
+
+	if (rc)
+		return rc;
+
+	dprintk("RPCRDMA Module Init, register RPC RDMA transport\n");
+
+	dprintk("Defaults:\n");
+	dprintk("\tSlots %d\n"
+		"\tMaxInlineRead %d\n\tMaxInlineWrite %d\n",
+		xprt_rdma_slot_table_entries,
+		xprt_rdma_max_inline_read, xprt_rdma_max_inline_write);
+	dprintk("\tPadding %d\n\tMemreg %d\n",
+		xprt_rdma_inline_write_padding, xprt_rdma_memreg_strategy);
+
+#ifdef RPC_DEBUG
+	if (!sunrpc_table_header)
+		sunrpc_table_header = register_sysctl_table(sunrpc_table);
+#endif
+	return 0;
+}
+
+module_init(xprt_rdma_init);
+module_exit(xprt_rdma_cleanup);
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
new file mode 100644
index 0000000..61c4129
--- /dev/null
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -0,0 +1,2076 @@
+/*
+ * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the BSD-type
+ * license below:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *      Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *
+ *      Redistributions in binary form must reproduce the above
+ *      copyright notice, this list of conditions and the following
+ *      disclaimer in the documentation and/or other materials provided
+ *      with the distribution.
+ *
+ *      Neither the name of the Network Appliance, Inc. nor the names of
+ *      its contributors may be used to endorse or promote products
+ *      derived from this software without specific prior written
+ *      permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * verbs.c
+ *
+ * Encapsulates the major functions managing:
+ *  o adapters
+ *  o endpoints
+ *  o connections
+ *  o buffer memory
+ */
+
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+#include <asm/bitops.h>
+
+#include "xprt_rdma.h"
+
+/*
+ * Globals/Macros
+ */
+
+#ifdef RPC_DEBUG
+# define RPCDBG_FACILITY	RPCDBG_TRANS
+#endif
+
+static void rpcrdma_reset_frmrs(struct rpcrdma_ia *);
+
+/*
+ * internal functions
+ */
+
+/*
+ * handle replies in tasklet context, using a single, global list
+ * rdma tasklet function -- just turn around and call the func
+ * for all replies on the list
+ */
+
+static DEFINE_SPINLOCK(rpcrdma_tk_lock_g);
+static LIST_HEAD(rpcrdma_tasklets_g);
+
+static void
+rpcrdma_run_tasklet(unsigned long data)
+{
+	struct rpcrdma_rep *rep;
+	void (*func)(struct rpcrdma_rep *);
+	unsigned long flags;
+
+	data = data;
+	spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
+	while (!list_empty(&rpcrdma_tasklets_g)) {
+		rep = list_entry(rpcrdma_tasklets_g.next,
+				 struct rpcrdma_rep, rr_list);
+		list_del(&rep->rr_list);
+		func = rep->rr_func;
+		rep->rr_func = NULL;
+		spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
+
+		if (func)
+			func(rep);
+		else
+			rpcrdma_recv_buffer_put(rep);
+
+		spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
+	}
+	spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
+}
+
+static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);
+
+static void
+rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
+{
+	struct rpcrdma_ep *ep = context;
+
+	dprintk("RPC:       %s: QP error %X on device %s ep %p\n",
+		__func__, event->event, event->device->name, context);
+	if (ep->rep_connected == 1) {
+		ep->rep_connected = -EIO;
+		ep->rep_func(ep);
+		wake_up_all(&ep->rep_connect_wait);
+	}
+}
+
+static void
+rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
+{
+	struct rpcrdma_ep *ep = context;
+
+	dprintk("RPC:       %s: CQ error %X on device %s ep %p\n",
+		__func__, event->event, event->device->name, context);
+	if (ep->rep_connected == 1) {
+		ep->rep_connected = -EIO;
+		ep->rep_func(ep);
+		wake_up_all(&ep->rep_connect_wait);
+	}
+}
+
+static void
+rpcrdma_sendcq_process_wc(struct ib_wc *wc)
+{
+	struct rpcrdma_mw *frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
+
+	dprintk("RPC:       %s: frmr %p status %X opcode %d\n",
+		__func__, frmr, wc->status, wc->opcode);
+
+	if (wc->wr_id == 0ULL)
+		return;
+	if (wc->status != IB_WC_SUCCESS)
+		frmr->r.frmr.fr_state = FRMR_IS_STALE;
+}
+
+static int
+rpcrdma_sendcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
+{
+	struct ib_wc *wcs;
+	int budget, count, rc;
+
+	budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
+	do {
+		wcs = ep->rep_send_wcs;
+
+		rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
+		if (rc <= 0)
+			return rc;
+
+		count = rc;
+		while (count-- > 0)
+			rpcrdma_sendcq_process_wc(wcs++);
+	} while (rc == RPCRDMA_POLLSIZE && --budget);
+	return 0;
+}
+
+/*
+ * Handle send, fast_reg_mr, and local_inv completions.
+ *
+ * Send events are typically suppressed and thus do not result
+ * in an upcall. Occasionally one is signaled, however. This
+ * prevents the provider's completion queue from wrapping and
+ * losing a completion.
+ */
+static void
+rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context)
+{
+	struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
+	int rc;
+
+	rc = rpcrdma_sendcq_poll(cq, ep);
+	if (rc) {
+		dprintk("RPC:       %s: ib_poll_cq failed: %i\n",
+			__func__, rc);
+		return;
+	}
+
+	rc = ib_req_notify_cq(cq,
+			IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
+	if (rc == 0)
+		return;
+	if (rc < 0) {
+		dprintk("RPC:       %s: ib_req_notify_cq failed: %i\n",
+			__func__, rc);
+		return;
+	}
+
+	rpcrdma_sendcq_poll(cq, ep);
+}
+
+static void
+rpcrdma_recvcq_process_wc(struct ib_wc *wc, struct list_head *sched_list)
+{
+	struct rpcrdma_rep *rep =
+			(struct rpcrdma_rep *)(unsigned long)wc->wr_id;
+
+	dprintk("RPC:       %s: rep %p status %X opcode %X length %u\n",
+		__func__, rep, wc->status, wc->opcode, wc->byte_len);
+
+	if (wc->status != IB_WC_SUCCESS) {
+		rep->rr_len = ~0U;
+		goto out_schedule;
+	}
+	if (wc->opcode != IB_WC_RECV)
+		return;
+
+	rep->rr_len = wc->byte_len;
+	ib_dma_sync_single_for_cpu(rdmab_to_ia(rep->rr_buffer)->ri_id->device,
+			rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE);
+
+	if (rep->rr_len >= 16) {
+		struct rpcrdma_msg *p = (struct rpcrdma_msg *)rep->rr_base;
+		unsigned int credits = ntohl(p->rm_credit);
+
+		if (credits == 0)
+			credits = 1;	/* don't deadlock */
+		else if (credits > rep->rr_buffer->rb_max_requests)
+			credits = rep->rr_buffer->rb_max_requests;
+		atomic_set(&rep->rr_buffer->rb_credits, credits);
+	}
+
+out_schedule:
+	list_add_tail(&rep->rr_list, sched_list);
+}
+
+static int
+rpcrdma_recvcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
+{
+	struct list_head sched_list;
+	struct ib_wc *wcs;
+	int budget, count, rc;
+	unsigned long flags;
+
+	INIT_LIST_HEAD(&sched_list);
+	budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
+	do {
+		wcs = ep->rep_recv_wcs;
+
+		rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
+		if (rc <= 0)
+			goto out_schedule;
+
+		count = rc;
+		while (count-- > 0)
+			rpcrdma_recvcq_process_wc(wcs++, &sched_list);
+	} while (rc == RPCRDMA_POLLSIZE && --budget);
+	rc = 0;
+
+out_schedule:
+	spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
+	list_splice_tail(&sched_list, &rpcrdma_tasklets_g);
+	spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
+	tasklet_schedule(&rpcrdma_tasklet_g);
+	return rc;
+}
+
+/*
+ * Handle receive completions.
+ *
+ * It is reentrant but processes single events in order to maintain
+ * ordering of receives to keep server credits.
+ *
+ * It is the responsibility of the scheduled tasklet to return
+ * recv buffers to the pool. NOTE: this affects synchronization of
+ * connection shutdown. That is, the structures required for
+ * the completion of the reply handler must remain intact until
+ * all memory has been reclaimed.
+ */
+static void
+rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context)
+{
+	struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
+	int rc;
+
+	rc = rpcrdma_recvcq_poll(cq, ep);
+	if (rc) {
+		dprintk("RPC:       %s: ib_poll_cq failed: %i\n",
+			__func__, rc);
+		return;
+	}
+
+	rc = ib_req_notify_cq(cq,
+			IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
+	if (rc == 0)
+		return;
+	if (rc < 0) {
+		dprintk("RPC:       %s: ib_req_notify_cq failed: %i\n",
+			__func__, rc);
+		return;
+	}
+
+	rpcrdma_recvcq_poll(cq, ep);
+}
+
+static void
+rpcrdma_flush_cqs(struct rpcrdma_ep *ep)
+{
+	rpcrdma_recvcq_upcall(ep->rep_attr.recv_cq, ep);
+	rpcrdma_sendcq_upcall(ep->rep_attr.send_cq, ep);
+}
+
+#ifdef RPC_DEBUG
+static const char * const conn[] = {
+	"address resolved",
+	"address error",
+	"route resolved",
+	"route error",
+	"connect request",
+	"connect response",
+	"connect error",
+	"unreachable",
+	"rejected",
+	"established",
+	"disconnected",
+	"device removal",
+	"multicast join",
+	"multicast error",
+	"address change",
+	"timewait exit",
+};
+
+#define CONNECTION_MSG(status)						\
+	((status) < ARRAY_SIZE(conn) ?					\
+		conn[(status)] : "unrecognized connection error")
+#endif
+
+static int
+rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
+{
+	struct rpcrdma_xprt *xprt = id->context;
+	struct rpcrdma_ia *ia = &xprt->rx_ia;
+	struct rpcrdma_ep *ep = &xprt->rx_ep;
+#ifdef RPC_DEBUG
+	struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr;
+#endif
+	struct ib_qp_attr attr;
+	struct ib_qp_init_attr iattr;
+	int connstate = 0;
+
+	switch (event->event) {
+	case RDMA_CM_EVENT_ADDR_RESOLVED:
+	case RDMA_CM_EVENT_ROUTE_RESOLVED:
+		ia->ri_async_rc = 0;
+		complete(&ia->ri_done);
+		break;
+	case RDMA_CM_EVENT_ADDR_ERROR:
+		ia->ri_async_rc = -EHOSTUNREACH;
+		dprintk("RPC:       %s: CM address resolution error, ep 0x%p\n",
+			__func__, ep);
+		complete(&ia->ri_done);
+		break;
+	case RDMA_CM_EVENT_ROUTE_ERROR:
+		ia->ri_async_rc = -ENETUNREACH;
+		dprintk("RPC:       %s: CM route resolution error, ep 0x%p\n",
+			__func__, ep);
+		complete(&ia->ri_done);
+		break;
+	case RDMA_CM_EVENT_ESTABLISHED:
+		connstate = 1;
+		ib_query_qp(ia->ri_id->qp, &attr,
+			IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC,
+			&iattr);
+		dprintk("RPC:       %s: %d responder resources"
+			" (%d initiator)\n",
+			__func__, attr.max_dest_rd_atomic, attr.max_rd_atomic);
+		goto connected;
+	case RDMA_CM_EVENT_CONNECT_ERROR:
+		connstate = -ENOTCONN;
+		goto connected;
+	case RDMA_CM_EVENT_UNREACHABLE:
+		connstate = -ENETDOWN;
+		goto connected;
+	case RDMA_CM_EVENT_REJECTED:
+		connstate = -ECONNREFUSED;
+		goto connected;
+	case RDMA_CM_EVENT_DISCONNECTED:
+		connstate = -ECONNABORTED;
+		goto connected;
+	case RDMA_CM_EVENT_DEVICE_REMOVAL:
+		connstate = -ENODEV;
+connected:
+		atomic_set(&rpcx_to_rdmax(ep->rep_xprt)->rx_buf.rb_credits, 1);
+		dprintk("RPC:       %s: %sconnected\n",
+					__func__, connstate > 0 ? "" : "dis");
+		ep->rep_connected = connstate;
+		ep->rep_func(ep);
+		wake_up_all(&ep->rep_connect_wait);
+		/*FALLTHROUGH*/
+	default:
+		dprintk("RPC:       %s: %pI4:%u (ep 0x%p): %s\n",
+			__func__, &addr->sin_addr.s_addr,
+			ntohs(addr->sin_port), ep,
+			CONNECTION_MSG(event->event));
+		break;
+	}
+
+#ifdef RPC_DEBUG
+	if (connstate == 1) {
+		int ird = attr.max_dest_rd_atomic;
+		int tird = ep->rep_remote_cma.responder_resources;
+		printk(KERN_INFO "rpcrdma: connection to %pI4:%u "
+			"on %s, memreg %d slots %d ird %d%s\n",
+			&addr->sin_addr.s_addr,
+			ntohs(addr->sin_port),
+			ia->ri_id->device->name,
+			ia->ri_memreg_strategy,
+			xprt->rx_buf.rb_max_requests,
+			ird, ird < 4 && ird < tird / 2 ? " (low!)" : "");
+	} else if (connstate < 0) {
+		printk(KERN_INFO "rpcrdma: connection to %pI4:%u closed (%d)\n",
+			&addr->sin_addr.s_addr,
+			ntohs(addr->sin_port),
+			connstate);
+	}
+#endif
+
+	return 0;
+}
+
+static struct rdma_cm_id *
+rpcrdma_create_id(struct rpcrdma_xprt *xprt,
+			struct rpcrdma_ia *ia, struct sockaddr *addr)
+{
+	struct rdma_cm_id *id;
+	int rc;
+
+	init_completion(&ia->ri_done);
+
+	id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP, IB_QPT_RC);
+	if (IS_ERR(id)) {
+		rc = PTR_ERR(id);
+		dprintk("RPC:       %s: rdma_create_id() failed %i\n",
+			__func__, rc);
+		return id;
+	}
+
+	ia->ri_async_rc = -ETIMEDOUT;
+	rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT);
+	if (rc) {
+		dprintk("RPC:       %s: rdma_resolve_addr() failed %i\n",
+			__func__, rc);
+		goto out;
+	}
+	wait_for_completion_interruptible_timeout(&ia->ri_done,
+				msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
+	rc = ia->ri_async_rc;
+	if (rc)
+		goto out;
+
+	ia->ri_async_rc = -ETIMEDOUT;
+	rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
+	if (rc) {
+		dprintk("RPC:       %s: rdma_resolve_route() failed %i\n",
+			__func__, rc);
+		goto out;
+	}
+	wait_for_completion_interruptible_timeout(&ia->ri_done,
+				msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
+	rc = ia->ri_async_rc;
+	if (rc)
+		goto out;
+
+	return id;
+
+out:
+	rdma_destroy_id(id);
+	return ERR_PTR(rc);
+}
+
+/*
+ * Drain any cq, prior to teardown.
+ */
+static void
+rpcrdma_clean_cq(struct ib_cq *cq)
+{
+	struct ib_wc wc;
+	int count = 0;
+
+	while (1 == ib_poll_cq(cq, 1, &wc))
+		++count;
+
+	if (count)
+		dprintk("RPC:       %s: flushed %d events (last 0x%x)\n",
+			__func__, count, wc.opcode);
+}
+
+/*
+ * Exported functions.
+ */
+
+/*
+ * Open and initialize an Interface Adapter.
+ *  o initializes fields of struct rpcrdma_ia, including
+ *    interface and provider attributes and protection zone.
+ */
+int
+rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
+{
+	int rc, mem_priv;
+	struct ib_device_attr devattr;
+	struct rpcrdma_ia *ia = &xprt->rx_ia;
+
+	ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
+	if (IS_ERR(ia->ri_id)) {
+		rc = PTR_ERR(ia->ri_id);
+		goto out1;
+	}
+
+	ia->ri_pd = ib_alloc_pd(ia->ri_id->device);
+	if (IS_ERR(ia->ri_pd)) {
+		rc = PTR_ERR(ia->ri_pd);
+		dprintk("RPC:       %s: ib_alloc_pd() failed %i\n",
+			__func__, rc);
+		goto out2;
+	}
+
+	/*
+	 * Query the device to determine if the requested memory
+	 * registration strategy is supported. If it isn't, set the
+	 * strategy to a globally supported model.
+	 */
+	rc = ib_query_device(ia->ri_id->device, &devattr);
+	if (rc) {
+		dprintk("RPC:       %s: ib_query_device failed %d\n",
+			__func__, rc);
+		goto out2;
+	}
+
+	if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
+		ia->ri_have_dma_lkey = 1;
+		ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey;
+	}
+
+	if (memreg == RPCRDMA_FRMR) {
+		/* Requires both frmr reg and local dma lkey */
+		if ((devattr.device_cap_flags &
+		     (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
+		    (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) {
+			dprintk("RPC:       %s: FRMR registration "
+				"not supported by HCA\n", __func__);
+			memreg = RPCRDMA_MTHCAFMR;
+		} else {
+			/* Mind the ia limit on FRMR page list depth */
+			ia->ri_max_frmr_depth = min_t(unsigned int,
+				RPCRDMA_MAX_DATA_SEGS,
+				devattr.max_fast_reg_page_list_len);
+		}
+	}
+	if (memreg == RPCRDMA_MTHCAFMR) {
+		if (!ia->ri_id->device->alloc_fmr) {
+			dprintk("RPC:       %s: MTHCAFMR registration "
+				"not supported by HCA\n", __func__);
+			memreg = RPCRDMA_ALLPHYSICAL;
+		}
+	}
+
+	/*
+	 * Optionally obtain an underlying physical identity mapping in
+	 * order to do a memory window-based bind. This base registration
+	 * is protected from remote access - that is enabled only by binding
+	 * for the specific bytes targeted during each RPC operation, and
+	 * revoked after the corresponding completion similar to a storage
+	 * adapter.
+	 */
+	switch (memreg) {
+	case RPCRDMA_FRMR:
+		break;
+	case RPCRDMA_ALLPHYSICAL:
+		mem_priv = IB_ACCESS_LOCAL_WRITE |
+				IB_ACCESS_REMOTE_WRITE |
+				IB_ACCESS_REMOTE_READ;
+		goto register_setup;
+	case RPCRDMA_MTHCAFMR:
+		if (ia->ri_have_dma_lkey)
+			break;
+		mem_priv = IB_ACCESS_LOCAL_WRITE;
+	register_setup:
+		ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
+		if (IS_ERR(ia->ri_bind_mem)) {
+			printk(KERN_ALERT "%s: ib_get_dma_mr for "
+				"phys register failed with %lX\n",
+				__func__, PTR_ERR(ia->ri_bind_mem));
+			rc = -ENOMEM;
+			goto out2;
+		}
+		break;
+	default:
+		printk(KERN_ERR "RPC: Unsupported memory "
+				"registration mode: %d\n", memreg);
+		rc = -ENOMEM;
+		goto out2;
+	}
+	dprintk("RPC:       %s: memory registration strategy is %d\n",
+		__func__, memreg);
+
+	/* Else will do memory reg/dereg for each chunk */
+	ia->ri_memreg_strategy = memreg;
+
+	rwlock_init(&ia->ri_qplock);
+	return 0;
+out2:
+	rdma_destroy_id(ia->ri_id);
+	ia->ri_id = NULL;
+out1:
+	return rc;
+}
+
+/*
+ * Clean up/close an IA.
+ *   o if event handles and PD have been initialized, free them.
+ *   o close the IA
+ */
+void
+rpcrdma_ia_close(struct rpcrdma_ia *ia)
+{
+	int rc;
+
+	dprintk("RPC:       %s: entering\n", __func__);
+	if (ia->ri_bind_mem != NULL) {
+		rc = ib_dereg_mr(ia->ri_bind_mem);
+		dprintk("RPC:       %s: ib_dereg_mr returned %i\n",
+			__func__, rc);
+	}
+	if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
+		if (ia->ri_id->qp)
+			rdma_destroy_qp(ia->ri_id);
+		rdma_destroy_id(ia->ri_id);
+		ia->ri_id = NULL;
+	}
+	if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
+		rc = ib_dealloc_pd(ia->ri_pd);
+		dprintk("RPC:       %s: ib_dealloc_pd returned %i\n",
+			__func__, rc);
+	}
+}
+
+/*
+ * Create unconnected endpoint.
+ */
+int
+rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
+				struct rpcrdma_create_data_internal *cdata)
+{
+	struct ib_device_attr devattr;
+	struct ib_cq *sendcq, *recvcq;
+	int rc, err;
+
+	rc = ib_query_device(ia->ri_id->device, &devattr);
+	if (rc) {
+		dprintk("RPC:       %s: ib_query_device failed %d\n",
+			__func__, rc);
+		return rc;
+	}
+
+	/* check provider's send/recv wr limits */
+	if (cdata->max_requests > devattr.max_qp_wr)
+		cdata->max_requests = devattr.max_qp_wr;
+
+	ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
+	ep->rep_attr.qp_context = ep;
+	/* send_cq and recv_cq initialized below */
+	ep->rep_attr.srq = NULL;
+	ep->rep_attr.cap.max_send_wr = cdata->max_requests;
+	switch (ia->ri_memreg_strategy) {
+	case RPCRDMA_FRMR: {
+		int depth = 7;
+
+		/* Add room for frmr register and invalidate WRs.
+		 * 1. FRMR reg WR for head
+		 * 2. FRMR invalidate WR for head
+		 * 3. N FRMR reg WRs for pagelist
+		 * 4. N FRMR invalidate WRs for pagelist
+		 * 5. FRMR reg WR for tail
+		 * 6. FRMR invalidate WR for tail
+		 * 7. The RDMA_SEND WR
+		 */
+
+		/* Calculate N if the device max FRMR depth is smaller than
+		 * RPCRDMA_MAX_DATA_SEGS.
+		 */
+		if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) {
+			int delta = RPCRDMA_MAX_DATA_SEGS -
+				    ia->ri_max_frmr_depth;
+
+			do {
+				depth += 2; /* FRMR reg + invalidate */
+				delta -= ia->ri_max_frmr_depth;
+			} while (delta > 0);
+
+		}
+		ep->rep_attr.cap.max_send_wr *= depth;
+		if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) {
+			cdata->max_requests = devattr.max_qp_wr / depth;
+			if (!cdata->max_requests)
+				return -EINVAL;
+			ep->rep_attr.cap.max_send_wr = cdata->max_requests *
+						       depth;
+		}
+		break;
+	}
+	default:
+		break;
+	}
+	ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
+	ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
+	ep->rep_attr.cap.max_recv_sge = 1;
+	ep->rep_attr.cap.max_inline_data = 0;
+	ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
+	ep->rep_attr.qp_type = IB_QPT_RC;
+	ep->rep_attr.port_num = ~0;
+
+	dprintk("RPC:       %s: requested max: dtos: send %d recv %d; "
+		"iovs: send %d recv %d\n",
+		__func__,
+		ep->rep_attr.cap.max_send_wr,
+		ep->rep_attr.cap.max_recv_wr,
+		ep->rep_attr.cap.max_send_sge,
+		ep->rep_attr.cap.max_recv_sge);
+
+	/* set trigger for requesting send completion */
+	ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 - 1;
+	if (ep->rep_cqinit <= 2)
+		ep->rep_cqinit = 0;
+	INIT_CQCOUNT(ep);
+	ep->rep_ia = ia;
+	init_waitqueue_head(&ep->rep_connect_wait);
+	INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker);
+
+	sendcq = ib_create_cq(ia->ri_id->device, rpcrdma_sendcq_upcall,
+				  rpcrdma_cq_async_error_upcall, ep,
+				  ep->rep_attr.cap.max_send_wr + 1, 0);
+	if (IS_ERR(sendcq)) {
+		rc = PTR_ERR(sendcq);
+		dprintk("RPC:       %s: failed to create send CQ: %i\n",
+			__func__, rc);
+		goto out1;
+	}
+
+	rc = ib_req_notify_cq(sendcq, IB_CQ_NEXT_COMP);
+	if (rc) {
+		dprintk("RPC:       %s: ib_req_notify_cq failed: %i\n",
+			__func__, rc);
+		goto out2;
+	}
+
+	recvcq = ib_create_cq(ia->ri_id->device, rpcrdma_recvcq_upcall,
+				  rpcrdma_cq_async_error_upcall, ep,
+				  ep->rep_attr.cap.max_recv_wr + 1, 0);
+	if (IS_ERR(recvcq)) {
+		rc = PTR_ERR(recvcq);
+		dprintk("RPC:       %s: failed to create recv CQ: %i\n",
+			__func__, rc);
+		goto out2;
+	}
+
+	rc = ib_req_notify_cq(recvcq, IB_CQ_NEXT_COMP);
+	if (rc) {
+		dprintk("RPC:       %s: ib_req_notify_cq failed: %i\n",
+			__func__, rc);
+		ib_destroy_cq(recvcq);
+		goto out2;
+	}
+
+	ep->rep_attr.send_cq = sendcq;
+	ep->rep_attr.recv_cq = recvcq;
+
+	/* Initialize cma parameters */
+
+	/* RPC/RDMA does not use private data */
+	ep->rep_remote_cma.private_data = NULL;
+	ep->rep_remote_cma.private_data_len = 0;
+
+	/* Client offers RDMA Read but does not initiate */
+	ep->rep_remote_cma.initiator_depth = 0;
+	if (devattr.max_qp_rd_atom > 32)	/* arbitrary but <= 255 */
+		ep->rep_remote_cma.responder_resources = 32;
+	else
+		ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom;
+
+	ep->rep_remote_cma.retry_count = 7;
+	ep->rep_remote_cma.flow_control = 0;
+	ep->rep_remote_cma.rnr_retry_count = 0;
+
+	return 0;
+
+out2:
+	err = ib_destroy_cq(sendcq);
+	if (err)
+		dprintk("RPC:       %s: ib_destroy_cq returned %i\n",
+			__func__, err);
+out1:
+	return rc;
+}
+
+/*
+ * rpcrdma_ep_destroy
+ *
+ * Disconnect and destroy endpoint. After this, the only
+ * valid operations on the ep are to free it (if dynamically
+ * allocated) or re-create it.
+ */
+void
+rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
+{
+	int rc;
+
+	dprintk("RPC:       %s: entering, connected is %d\n",
+		__func__, ep->rep_connected);
+
+	cancel_delayed_work_sync(&ep->rep_connect_worker);
+
+	if (ia->ri_id->qp) {
+		rpcrdma_ep_disconnect(ep, ia);
+		rdma_destroy_qp(ia->ri_id);
+		ia->ri_id->qp = NULL;
+	}
+
+	/* padding - could be done in rpcrdma_buffer_destroy... */
+	if (ep->rep_pad_mr) {
+		rpcrdma_deregister_internal(ia, ep->rep_pad_mr, &ep->rep_pad);
+		ep->rep_pad_mr = NULL;
+	}
+
+	rpcrdma_clean_cq(ep->rep_attr.recv_cq);
+	rc = ib_destroy_cq(ep->rep_attr.recv_cq);
+	if (rc)
+		dprintk("RPC:       %s: ib_destroy_cq returned %i\n",
+			__func__, rc);
+
+	rpcrdma_clean_cq(ep->rep_attr.send_cq);
+	rc = ib_destroy_cq(ep->rep_attr.send_cq);
+	if (rc)
+		dprintk("RPC:       %s: ib_destroy_cq returned %i\n",
+			__func__, rc);
+}
+
+/*
+ * Connect unconnected endpoint.
+ */
+int
+rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
+{
+	struct rdma_cm_id *id, *old;
+	int rc = 0;
+	int retry_count = 0;
+
+	if (ep->rep_connected != 0) {
+		struct rpcrdma_xprt *xprt;
+retry:
+		dprintk("RPC:       %s: reconnecting...\n", __func__);
+
+		rpcrdma_ep_disconnect(ep, ia);
+		rpcrdma_flush_cqs(ep);
+
+		if (ia->ri_memreg_strategy == RPCRDMA_FRMR)
+			rpcrdma_reset_frmrs(ia);
+
+		xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
+		id = rpcrdma_create_id(xprt, ia,
+				(struct sockaddr *)&xprt->rx_data.addr);
+		if (IS_ERR(id)) {
+			rc = -EHOSTUNREACH;
+			goto out;
+		}
+		/* TEMP TEMP TEMP - fail if new device:
+		 * Deregister/remarshal *all* requests!
+		 * Close and recreate adapter, pd, etc!
+		 * Re-determine all attributes still sane!
+		 * More stuff I haven't thought of!
+		 * Rrrgh!
+		 */
+		if (ia->ri_id->device != id->device) {
+			printk("RPC:       %s: can't reconnect on "
+				"different device!\n", __func__);
+			rdma_destroy_id(id);
+			rc = -ENETUNREACH;
+			goto out;
+		}
+		/* END TEMP */
+		rc = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr);
+		if (rc) {
+			dprintk("RPC:       %s: rdma_create_qp failed %i\n",
+				__func__, rc);
+			rdma_destroy_id(id);
+			rc = -ENETUNREACH;
+			goto out;
+		}
+
+		write_lock(&ia->ri_qplock);
+		old = ia->ri_id;
+		ia->ri_id = id;
+		write_unlock(&ia->ri_qplock);
+
+		rdma_destroy_qp(old);
+		rdma_destroy_id(old);
+	} else {
+		dprintk("RPC:       %s: connecting...\n", __func__);
+		rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
+		if (rc) {
+			dprintk("RPC:       %s: rdma_create_qp failed %i\n",
+				__func__, rc);
+			/* do not update ep->rep_connected */
+			return -ENETUNREACH;
+		}
+	}
+
+	ep->rep_connected = 0;
+
+	rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
+	if (rc) {
+		dprintk("RPC:       %s: rdma_connect() failed with %i\n",
+				__func__, rc);
+		goto out;
+	}
+
+	wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);
+
+	/*
+	 * Check state. A non-peer reject indicates no listener
+	 * (ECONNREFUSED), which may be a transient state. All
+	 * others indicate a transport condition which has already
+	 * undergone a best-effort.
+	 */
+	if (ep->rep_connected == -ECONNREFUSED &&
+	    ++retry_count <= RDMA_CONNECT_RETRY_MAX) {
+		dprintk("RPC:       %s: non-peer_reject, retry\n", __func__);
+		goto retry;
+	}
+	if (ep->rep_connected <= 0) {
+		/* Sometimes, the only way to reliably connect to remote
+		 * CMs is to use same nonzero values for ORD and IRD. */
+		if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 &&
+		    (ep->rep_remote_cma.responder_resources == 0 ||
+		     ep->rep_remote_cma.initiator_depth !=
+				ep->rep_remote_cma.responder_resources)) {
+			if (ep->rep_remote_cma.responder_resources == 0)
+				ep->rep_remote_cma.responder_resources = 1;
+			ep->rep_remote_cma.initiator_depth =
+				ep->rep_remote_cma.responder_resources;
+			goto retry;
+		}
+		rc = ep->rep_connected;
+	} else {
+		dprintk("RPC:       %s: connected\n", __func__);
+	}
+
+out:
+	if (rc)
+		ep->rep_connected = rc;
+	return rc;
+}
+
+/*
+ * rpcrdma_ep_disconnect
+ *
+ * This is separate from destroy to facilitate the ability
+ * to reconnect without recreating the endpoint.
+ *
+ * This call is not reentrant, and must not be made in parallel
+ * on the same endpoint.
+ */
+void
+rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
+{
+	int rc;
+
+	rpcrdma_flush_cqs(ep);
+	rc = rdma_disconnect(ia->ri_id);
+	if (!rc) {
+		/* returns without wait if not connected */
+		wait_event_interruptible(ep->rep_connect_wait,
+							ep->rep_connected != 1);
+		dprintk("RPC:       %s: after wait, %sconnected\n", __func__,
+			(ep->rep_connected == 1) ? "still " : "dis");
+	} else {
+		dprintk("RPC:       %s: rdma_disconnect %i\n", __func__, rc);
+		ep->rep_connected = rc;
+	}
+}
+
+static int
+rpcrdma_init_fmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf)
+{
+	int mr_access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ;
+	struct ib_fmr_attr fmr_attr = {
+		.max_pages	= RPCRDMA_MAX_DATA_SEGS,
+		.max_maps	= 1,
+		.page_shift	= PAGE_SHIFT
+	};
+	struct rpcrdma_mw *r;
+	int i, rc;
+
+	i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
+	dprintk("RPC:       %s: initalizing %d FMRs\n", __func__, i);
+
+	while (i--) {
+		r = kzalloc(sizeof(*r), GFP_KERNEL);
+		if (r == NULL)
+			return -ENOMEM;
+
+		r->r.fmr = ib_alloc_fmr(ia->ri_pd, mr_access_flags, &fmr_attr);
+		if (IS_ERR(r->r.fmr)) {
+			rc = PTR_ERR(r->r.fmr);
+			dprintk("RPC:       %s: ib_alloc_fmr failed %i\n",
+				__func__, rc);
+			goto out_free;
+		}
+
+		list_add(&r->mw_list, &buf->rb_mws);
+		list_add(&r->mw_all, &buf->rb_all);
+	}
+	return 0;
+
+out_free:
+	kfree(r);
+	return rc;
+}
+
+static int
+rpcrdma_init_frmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf)
+{
+	struct rpcrdma_frmr *f;
+	struct rpcrdma_mw *r;
+	int i, rc;
+
+	i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
+	dprintk("RPC:       %s: initalizing %d FRMRs\n", __func__, i);
+
+	while (i--) {
+		r = kzalloc(sizeof(*r), GFP_KERNEL);
+		if (r == NULL)
+			return -ENOMEM;
+		f = &r->r.frmr;
+
+		f->fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
+						ia->ri_max_frmr_depth);
+		if (IS_ERR(f->fr_mr)) {
+			rc = PTR_ERR(f->fr_mr);
+			dprintk("RPC:       %s: ib_alloc_fast_reg_mr "
+				"failed %i\n", __func__, rc);
+			goto out_free;
+		}
+
+		f->fr_pgl = ib_alloc_fast_reg_page_list(ia->ri_id->device,
+							ia->ri_max_frmr_depth);
+		if (IS_ERR(f->fr_pgl)) {
+			rc = PTR_ERR(f->fr_pgl);
+			dprintk("RPC:       %s: ib_alloc_fast_reg_page_list "
+				"failed %i\n", __func__, rc);
+
+			ib_dereg_mr(f->fr_mr);
+			goto out_free;
+		}
+
+		list_add(&r->mw_list, &buf->rb_mws);
+		list_add(&r->mw_all, &buf->rb_all);
+	}
+
+	return 0;
+
+out_free:
+	kfree(r);
+	return rc;
+}
+
+int
+rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
+	struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata)
+{
+	char *p;
+	size_t len, rlen, wlen;
+	int i, rc;
+
+	buf->rb_max_requests = cdata->max_requests;
+	spin_lock_init(&buf->rb_lock);
+	atomic_set(&buf->rb_credits, 1);
+
+	/* Need to allocate:
+	 *   1.  arrays for send and recv pointers
+	 *   2.  arrays of struct rpcrdma_req to fill in pointers
+	 *   3.  array of struct rpcrdma_rep for replies
+	 *   4.  padding, if any
+	 * Send/recv buffers in req/rep need to be registered
+	 */
+	len = buf->rb_max_requests *
+		(sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
+	len += cdata->padding;
+
+	p = kzalloc(len, GFP_KERNEL);
+	if (p == NULL) {
+		dprintk("RPC:       %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
+			__func__, len);
+		rc = -ENOMEM;
+		goto out;
+	}
+	buf->rb_pool = p;	/* for freeing it later */
+
+	buf->rb_send_bufs = (struct rpcrdma_req **) p;
+	p = (char *) &buf->rb_send_bufs[buf->rb_max_requests];
+	buf->rb_recv_bufs = (struct rpcrdma_rep **) p;
+	p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests];
+
+	/*
+	 * Register the zeroed pad buffer, if any.
+	 */
+	if (cdata->padding) {
+		rc = rpcrdma_register_internal(ia, p, cdata->padding,
+					    &ep->rep_pad_mr, &ep->rep_pad);
+		if (rc)
+			goto out;
+	}
+	p += cdata->padding;
+
+	INIT_LIST_HEAD(&buf->rb_mws);
+	INIT_LIST_HEAD(&buf->rb_all);
+	switch (ia->ri_memreg_strategy) {
+	case RPCRDMA_FRMR:
+		rc = rpcrdma_init_frmrs(ia, buf);
+		if (rc)
+			goto out;
+		break;
+	case RPCRDMA_MTHCAFMR:
+		rc = rpcrdma_init_fmrs(ia, buf);
+		if (rc)
+			goto out;
+		break;
+	default:
+		break;
+	}
+
+	/*
+	 * Allocate/init the request/reply buffers. Doing this
+	 * using kmalloc for now -- one for each buf.
+	 */
+	wlen = 1 << fls(cdata->inline_wsize + sizeof(struct rpcrdma_req));
+	rlen = 1 << fls(cdata->inline_rsize + sizeof(struct rpcrdma_rep));
+	dprintk("RPC:       %s: wlen = %zu, rlen = %zu\n",
+		__func__, wlen, rlen);
+
+	for (i = 0; i < buf->rb_max_requests; i++) {
+		struct rpcrdma_req *req;
+		struct rpcrdma_rep *rep;
+
+		req = kmalloc(wlen, GFP_KERNEL);
+		if (req == NULL) {
+			dprintk("RPC:       %s: request buffer %d alloc"
+				" failed\n", __func__, i);
+			rc = -ENOMEM;
+			goto out;
+		}
+		memset(req, 0, sizeof(struct rpcrdma_req));
+		buf->rb_send_bufs[i] = req;
+		buf->rb_send_bufs[i]->rl_buffer = buf;
+
+		rc = rpcrdma_register_internal(ia, req->rl_base,
+				wlen - offsetof(struct rpcrdma_req, rl_base),
+				&buf->rb_send_bufs[i]->rl_handle,
+				&buf->rb_send_bufs[i]->rl_iov);
+		if (rc)
+			goto out;
+
+		buf->rb_send_bufs[i]->rl_size = wlen -
+						sizeof(struct rpcrdma_req);
+
+		rep = kmalloc(rlen, GFP_KERNEL);
+		if (rep == NULL) {
+			dprintk("RPC:       %s: reply buffer %d alloc failed\n",
+				__func__, i);
+			rc = -ENOMEM;
+			goto out;
+		}
+		memset(rep, 0, sizeof(struct rpcrdma_rep));
+		buf->rb_recv_bufs[i] = rep;
+		buf->rb_recv_bufs[i]->rr_buffer = buf;
+
+		rc = rpcrdma_register_internal(ia, rep->rr_base,
+				rlen - offsetof(struct rpcrdma_rep, rr_base),
+				&buf->rb_recv_bufs[i]->rr_handle,
+				&buf->rb_recv_bufs[i]->rr_iov);
+		if (rc)
+			goto out;
+
+	}
+	dprintk("RPC:       %s: max_requests %d\n",
+		__func__, buf->rb_max_requests);
+	/* done */
+	return 0;
+out:
+	rpcrdma_buffer_destroy(buf);
+	return rc;
+}
+
+static void
+rpcrdma_destroy_fmrs(struct rpcrdma_buffer *buf)
+{
+	struct rpcrdma_mw *r;
+	int rc;
+
+	while (!list_empty(&buf->rb_all)) {
+		r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
+		list_del(&r->mw_all);
+		list_del(&r->mw_list);
+
+		rc = ib_dealloc_fmr(r->r.fmr);
+		if (rc)
+			dprintk("RPC:       %s: ib_dealloc_fmr failed %i\n",
+				__func__, rc);
+
+		kfree(r);
+	}
+}
+
+static void
+rpcrdma_destroy_frmrs(struct rpcrdma_buffer *buf)
+{
+	struct rpcrdma_mw *r;
+	int rc;
+
+	while (!list_empty(&buf->rb_all)) {
+		r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
+		list_del(&r->mw_all);
+		list_del(&r->mw_list);
+
+		rc = ib_dereg_mr(r->r.frmr.fr_mr);
+		if (rc)
+			dprintk("RPC:       %s: ib_dereg_mr failed %i\n",
+				__func__, rc);
+		ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
+
+		kfree(r);
+	}
+}
+
+void
+rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
+{
+	struct rpcrdma_ia *ia = rdmab_to_ia(buf);
+	int i;
+
+	/* clean up in reverse order from create
+	 *   1.  recv mr memory (mr free, then kfree)
+	 *   2.  send mr memory (mr free, then kfree)
+	 *   3.  MWs
+	 */
+	dprintk("RPC:       %s: entering\n", __func__);
+
+	for (i = 0; i < buf->rb_max_requests; i++) {
+		if (buf->rb_recv_bufs && buf->rb_recv_bufs[i]) {
+			rpcrdma_deregister_internal(ia,
+					buf->rb_recv_bufs[i]->rr_handle,
+					&buf->rb_recv_bufs[i]->rr_iov);
+			kfree(buf->rb_recv_bufs[i]);
+		}
+		if (buf->rb_send_bufs && buf->rb_send_bufs[i]) {
+			rpcrdma_deregister_internal(ia,
+					buf->rb_send_bufs[i]->rl_handle,
+					&buf->rb_send_bufs[i]->rl_iov);
+			kfree(buf->rb_send_bufs[i]);
+		}
+	}
+
+	switch (ia->ri_memreg_strategy) {
+	case RPCRDMA_FRMR:
+		rpcrdma_destroy_frmrs(buf);
+		break;
+	case RPCRDMA_MTHCAFMR:
+		rpcrdma_destroy_fmrs(buf);
+		break;
+	default:
+		break;
+	}
+
+	kfree(buf->rb_pool);
+}
+
+/* After a disconnect, a flushed FAST_REG_MR can leave an FRMR in
+ * an unusable state. Find FRMRs in this state and dereg / reg
+ * each.  FRMRs that are VALID and attached to an rpcrdma_req are
+ * also torn down.
+ *
+ * This gives all in-use FRMRs a fresh rkey and leaves them INVALID.
+ *
+ * This is invoked only in the transport connect worker in order
+ * to serialize with rpcrdma_register_frmr_external().
+ */
+static void
+rpcrdma_reset_frmrs(struct rpcrdma_ia *ia)
+{
+	struct rpcrdma_xprt *r_xprt =
+				container_of(ia, struct rpcrdma_xprt, rx_ia);
+	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+	struct list_head *pos;
+	struct rpcrdma_mw *r;
+	int rc;
+
+	list_for_each(pos, &buf->rb_all) {
+		r = list_entry(pos, struct rpcrdma_mw, mw_all);
+
+		if (r->r.frmr.fr_state == FRMR_IS_INVALID)
+			continue;
+
+		rc = ib_dereg_mr(r->r.frmr.fr_mr);
+		if (rc)
+			dprintk("RPC:       %s: ib_dereg_mr failed %i\n",
+				__func__, rc);
+		ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
+
+		r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
+					ia->ri_max_frmr_depth);
+		if (IS_ERR(r->r.frmr.fr_mr)) {
+			rc = PTR_ERR(r->r.frmr.fr_mr);
+			dprintk("RPC:       %s: ib_alloc_fast_reg_mr"
+				" failed %i\n", __func__, rc);
+			continue;
+		}
+		r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list(
+					ia->ri_id->device,
+					ia->ri_max_frmr_depth);
+		if (IS_ERR(r->r.frmr.fr_pgl)) {
+			rc = PTR_ERR(r->r.frmr.fr_pgl);
+			dprintk("RPC:       %s: "
+				"ib_alloc_fast_reg_page_list "
+				"failed %i\n", __func__, rc);
+
+			ib_dereg_mr(r->r.frmr.fr_mr);
+			continue;
+		}
+		r->r.frmr.fr_state = FRMR_IS_INVALID;
+	}
+}
+
+/* "*mw" can be NULL when rpcrdma_buffer_get_mrs() fails, leaving
+ * some req segments uninitialized.
+ */
+static void
+rpcrdma_buffer_put_mr(struct rpcrdma_mw **mw, struct rpcrdma_buffer *buf)
+{
+	if (*mw) {
+		list_add_tail(&(*mw)->mw_list, &buf->rb_mws);
+		*mw = NULL;
+	}
+}
+
+/* Cycle mw's back in reverse order, and "spin" them.
+ * This delays and scrambles reuse as much as possible.
+ */
+static void
+rpcrdma_buffer_put_mrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
+{
+	struct rpcrdma_mr_seg *seg = req->rl_segments;
+	struct rpcrdma_mr_seg *seg1 = seg;
+	int i;
+
+	for (i = 1, seg++; i < RPCRDMA_MAX_SEGS; seg++, i++)
+		rpcrdma_buffer_put_mr(&seg->mr_chunk.rl_mw, buf);
+	rpcrdma_buffer_put_mr(&seg1->mr_chunk.rl_mw, buf);
+}
+
+static void
+rpcrdma_buffer_put_sendbuf(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
+{
+	buf->rb_send_bufs[--buf->rb_send_index] = req;
+	req->rl_niovs = 0;
+	if (req->rl_reply) {
+		buf->rb_recv_bufs[--buf->rb_recv_index] = req->rl_reply;
+		req->rl_reply->rr_func = NULL;
+		req->rl_reply = NULL;
+	}
+}
+
+/* rpcrdma_unmap_one() was already done by rpcrdma_deregister_frmr_external().
+ * Redo only the ib_post_send().
+ */
+static void
+rpcrdma_retry_local_inv(struct rpcrdma_mw *r, struct rpcrdma_ia *ia)
+{
+	struct rpcrdma_xprt *r_xprt =
+				container_of(ia, struct rpcrdma_xprt, rx_ia);
+	struct ib_send_wr invalidate_wr, *bad_wr;
+	int rc;
+
+	dprintk("RPC:       %s: FRMR %p is stale\n", __func__, r);
+
+	/* When this FRMR is re-inserted into rb_mws, it is no longer stale */
+	r->r.frmr.fr_state = FRMR_IS_INVALID;
+
+	memset(&invalidate_wr, 0, sizeof(invalidate_wr));
+	invalidate_wr.wr_id = (unsigned long)(void *)r;
+	invalidate_wr.opcode = IB_WR_LOCAL_INV;
+	invalidate_wr.ex.invalidate_rkey = r->r.frmr.fr_mr->rkey;
+	DECR_CQCOUNT(&r_xprt->rx_ep);
+
+	dprintk("RPC:       %s: frmr %p invalidating rkey %08x\n",
+		__func__, r, r->r.frmr.fr_mr->rkey);
+
+	read_lock(&ia->ri_qplock);
+	rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
+	read_unlock(&ia->ri_qplock);
+	if (rc) {
+		/* Force rpcrdma_buffer_get() to retry */
+		r->r.frmr.fr_state = FRMR_IS_STALE;
+		dprintk("RPC:       %s: ib_post_send failed, %i\n",
+			__func__, rc);
+	}
+}
+
+static void
+rpcrdma_retry_flushed_linv(struct list_head *stale,
+			   struct rpcrdma_buffer *buf)
+{
+	struct rpcrdma_ia *ia = rdmab_to_ia(buf);
+	struct list_head *pos;
+	struct rpcrdma_mw *r;
+	unsigned long flags;
+
+	list_for_each(pos, stale) {
+		r = list_entry(pos, struct rpcrdma_mw, mw_list);
+		rpcrdma_retry_local_inv(r, ia);
+	}
+
+	spin_lock_irqsave(&buf->rb_lock, flags);
+	list_splice_tail(stale, &buf->rb_mws);
+	spin_unlock_irqrestore(&buf->rb_lock, flags);
+}
+
+static struct rpcrdma_req *
+rpcrdma_buffer_get_frmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf,
+			 struct list_head *stale)
+{
+	struct rpcrdma_mw *r;
+	int i;
+
+	i = RPCRDMA_MAX_SEGS - 1;
+	while (!list_empty(&buf->rb_mws)) {
+		r = list_entry(buf->rb_mws.next,
+			       struct rpcrdma_mw, mw_list);
+		list_del(&r->mw_list);
+		if (r->r.frmr.fr_state == FRMR_IS_STALE) {
+			list_add(&r->mw_list, stale);
+			continue;
+		}
+		req->rl_segments[i].mr_chunk.rl_mw = r;
+		if (unlikely(i-- == 0))
+			return req;	/* Success */
+	}
+
+	/* Not enough entries on rb_mws for this req */
+	rpcrdma_buffer_put_sendbuf(req, buf);
+	rpcrdma_buffer_put_mrs(req, buf);
+	return NULL;
+}
+
+static struct rpcrdma_req *
+rpcrdma_buffer_get_fmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
+{
+	struct rpcrdma_mw *r;
+	int i;
+
+	i = RPCRDMA_MAX_SEGS - 1;
+	while (!list_empty(&buf->rb_mws)) {
+		r = list_entry(buf->rb_mws.next,
+			       struct rpcrdma_mw, mw_list);
+		list_del(&r->mw_list);
+		req->rl_segments[i].mr_chunk.rl_mw = r;
+		if (unlikely(i-- == 0))
+			return req;	/* Success */
+	}
+
+	/* Not enough entries on rb_mws for this req */
+	rpcrdma_buffer_put_sendbuf(req, buf);
+	rpcrdma_buffer_put_mrs(req, buf);
+	return NULL;
+}
+
+/*
+ * Get a set of request/reply buffers.
+ *
+ * Reply buffer (if needed) is attached to send buffer upon return.
+ * Rule:
+ *    rb_send_index and rb_recv_index MUST always be pointing to the
+ *    *next* available buffer (non-NULL). They are incremented after
+ *    removing buffers, and decremented *before* returning them.
+ */
+struct rpcrdma_req *
+rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
+{
+	struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
+	struct list_head stale;
+	struct rpcrdma_req *req;
+	unsigned long flags;
+
+	spin_lock_irqsave(&buffers->rb_lock, flags);
+	if (buffers->rb_send_index == buffers->rb_max_requests) {
+		spin_unlock_irqrestore(&buffers->rb_lock, flags);
+		dprintk("RPC:       %s: out of request buffers\n", __func__);
+		return ((struct rpcrdma_req *)NULL);
+	}
+
+	req = buffers->rb_send_bufs[buffers->rb_send_index];
+	if (buffers->rb_send_index < buffers->rb_recv_index) {
+		dprintk("RPC:       %s: %d extra receives outstanding (ok)\n",
+			__func__,
+			buffers->rb_recv_index - buffers->rb_send_index);
+		req->rl_reply = NULL;
+	} else {
+		req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
+		buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
+	}
+	buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
+
+	INIT_LIST_HEAD(&stale);
+	switch (ia->ri_memreg_strategy) {
+	case RPCRDMA_FRMR:
+		req = rpcrdma_buffer_get_frmrs(req, buffers, &stale);
+		break;
+	case RPCRDMA_MTHCAFMR:
+		req = rpcrdma_buffer_get_fmrs(req, buffers);
+		break;
+	default:
+		break;
+	}
+	spin_unlock_irqrestore(&buffers->rb_lock, flags);
+	if (!list_empty(&stale))
+		rpcrdma_retry_flushed_linv(&stale, buffers);
+	return req;
+}
+
+/*
+ * Put request/reply buffers back into pool.
+ * Pre-decrement counter/array index.
+ */
+void
+rpcrdma_buffer_put(struct rpcrdma_req *req)
+{
+	struct rpcrdma_buffer *buffers = req->rl_buffer;
+	struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
+	unsigned long flags;
+
+	spin_lock_irqsave(&buffers->rb_lock, flags);
+	rpcrdma_buffer_put_sendbuf(req, buffers);
+	switch (ia->ri_memreg_strategy) {
+	case RPCRDMA_FRMR:
+	case RPCRDMA_MTHCAFMR:
+		rpcrdma_buffer_put_mrs(req, buffers);
+		break;
+	default:
+		break;
+	}
+	spin_unlock_irqrestore(&buffers->rb_lock, flags);
+}
+
+/*
+ * Recover reply buffers from pool.
+ * This happens when recovering from error conditions.
+ * Post-increment counter/array index.
+ */
+void
+rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
+{
+	struct rpcrdma_buffer *buffers = req->rl_buffer;
+	unsigned long flags;
+
+	if (req->rl_iov.length == 0)	/* special case xprt_rdma_allocate() */
+		buffers = ((struct rpcrdma_req *) buffers)->rl_buffer;
+	spin_lock_irqsave(&buffers->rb_lock, flags);
+	if (buffers->rb_recv_index < buffers->rb_max_requests) {
+		req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
+		buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
+	}
+	spin_unlock_irqrestore(&buffers->rb_lock, flags);
+}
+
+/*
+ * Put reply buffers back into pool when not attached to
+ * request. This happens in error conditions.
+ */
+void
+rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
+{
+	struct rpcrdma_buffer *buffers = rep->rr_buffer;
+	unsigned long flags;
+
+	rep->rr_func = NULL;
+	spin_lock_irqsave(&buffers->rb_lock, flags);
+	buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep;
+	spin_unlock_irqrestore(&buffers->rb_lock, flags);
+}
+
+/*
+ * Wrappers for internal-use kmalloc memory registration, used by buffer code.
+ */
+
+int
+rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
+				struct ib_mr **mrp, struct ib_sge *iov)
+{
+	struct ib_phys_buf ipb;
+	struct ib_mr *mr;
+	int rc;
+
+	/*
+	 * All memory passed here was kmalloc'ed, therefore phys-contiguous.
+	 */
+	iov->addr = ib_dma_map_single(ia->ri_id->device,
+			va, len, DMA_BIDIRECTIONAL);
+	if (ib_dma_mapping_error(ia->ri_id->device, iov->addr))
+		return -ENOMEM;
+
+	iov->length = len;
+
+	if (ia->ri_have_dma_lkey) {
+		*mrp = NULL;
+		iov->lkey = ia->ri_dma_lkey;
+		return 0;
+	} else if (ia->ri_bind_mem != NULL) {
+		*mrp = NULL;
+		iov->lkey = ia->ri_bind_mem->lkey;
+		return 0;
+	}
+
+	ipb.addr = iov->addr;
+	ipb.size = iov->length;
+	mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1,
+			IB_ACCESS_LOCAL_WRITE, &iov->addr);
+
+	dprintk("RPC:       %s: phys convert: 0x%llx "
+			"registered 0x%llx length %d\n",
+			__func__, (unsigned long long)ipb.addr,
+			(unsigned long long)iov->addr, len);
+
+	if (IS_ERR(mr)) {
+		*mrp = NULL;
+		rc = PTR_ERR(mr);
+		dprintk("RPC:       %s: failed with %i\n", __func__, rc);
+	} else {
+		*mrp = mr;
+		iov->lkey = mr->lkey;
+		rc = 0;
+	}
+
+	return rc;
+}
+
+int
+rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
+				struct ib_mr *mr, struct ib_sge *iov)
+{
+	int rc;
+
+	ib_dma_unmap_single(ia->ri_id->device,
+			iov->addr, iov->length, DMA_BIDIRECTIONAL);
+
+	if (NULL == mr)
+		return 0;
+
+	rc = ib_dereg_mr(mr);
+	if (rc)
+		dprintk("RPC:       %s: ib_dereg_mr failed %i\n", __func__, rc);
+	return rc;
+}
+
+/*
+ * Wrappers for chunk registration, shared by read/write chunk code.
+ */
+
+static void
+rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing)
+{
+	seg->mr_dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
+	seg->mr_dmalen = seg->mr_len;
+	if (seg->mr_page)
+		seg->mr_dma = ib_dma_map_page(ia->ri_id->device,
+				seg->mr_page, offset_in_page(seg->mr_offset),
+				seg->mr_dmalen, seg->mr_dir);
+	else
+		seg->mr_dma = ib_dma_map_single(ia->ri_id->device,
+				seg->mr_offset,
+				seg->mr_dmalen, seg->mr_dir);
+	if (ib_dma_mapping_error(ia->ri_id->device, seg->mr_dma)) {
+		dprintk("RPC:       %s: mr_dma %llx mr_offset %p mr_dma_len %zu\n",
+			__func__,
+			(unsigned long long)seg->mr_dma,
+			seg->mr_offset, seg->mr_dmalen);
+	}
+}
+
+static void
+rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg)
+{
+	if (seg->mr_page)
+		ib_dma_unmap_page(ia->ri_id->device,
+				seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
+	else
+		ib_dma_unmap_single(ia->ri_id->device,
+				seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
+}
+
+static int
+rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
+			int *nsegs, int writing, struct rpcrdma_ia *ia,
+			struct rpcrdma_xprt *r_xprt)
+{
+	struct rpcrdma_mr_seg *seg1 = seg;
+	struct rpcrdma_mw *mw = seg1->mr_chunk.rl_mw;
+	struct rpcrdma_frmr *frmr = &mw->r.frmr;
+	struct ib_mr *mr = frmr->fr_mr;
+	struct ib_send_wr fastreg_wr, *bad_wr;
+	u8 key;
+	int len, pageoff;
+	int i, rc;
+	int seg_len;
+	u64 pa;
+	int page_no;
+
+	pageoff = offset_in_page(seg1->mr_offset);
+	seg1->mr_offset -= pageoff;	/* start of page */
+	seg1->mr_len += pageoff;
+	len = -pageoff;
+	if (*nsegs > ia->ri_max_frmr_depth)
+		*nsegs = ia->ri_max_frmr_depth;
+	for (page_no = i = 0; i < *nsegs;) {
+		rpcrdma_map_one(ia, seg, writing);
+		pa = seg->mr_dma;
+		for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) {
+			frmr->fr_pgl->page_list[page_no++] = pa;
+			pa += PAGE_SIZE;
+		}
+		len += seg->mr_len;
+		++seg;
+		++i;
+		/* Check for holes */
+		if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
+		    offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
+			break;
+	}
+	dprintk("RPC:       %s: Using frmr %p to map %d segments\n",
+		__func__, mw, i);
+
+	frmr->fr_state = FRMR_IS_VALID;
+
+	memset(&fastreg_wr, 0, sizeof(fastreg_wr));
+	fastreg_wr.wr_id = (unsigned long)(void *)mw;
+	fastreg_wr.opcode = IB_WR_FAST_REG_MR;
+	fastreg_wr.wr.fast_reg.iova_start = seg1->mr_dma;
+	fastreg_wr.wr.fast_reg.page_list = frmr->fr_pgl;
+	fastreg_wr.wr.fast_reg.page_list_len = page_no;
+	fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
+	fastreg_wr.wr.fast_reg.length = page_no << PAGE_SHIFT;
+	if (fastreg_wr.wr.fast_reg.length < len) {
+		rc = -EIO;
+		goto out_err;
+	}
+
+	/* Bump the key */
+	key = (u8)(mr->rkey & 0x000000FF);
+	ib_update_fast_reg_key(mr, ++key);
+
+	fastreg_wr.wr.fast_reg.access_flags = (writing ?
+				IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
+				IB_ACCESS_REMOTE_READ);
+	fastreg_wr.wr.fast_reg.rkey = mr->rkey;
+	DECR_CQCOUNT(&r_xprt->rx_ep);
+
+	rc = ib_post_send(ia->ri_id->qp, &fastreg_wr, &bad_wr);
+	if (rc) {
+		dprintk("RPC:       %s: failed ib_post_send for register,"
+			" status %i\n", __func__, rc);
+		ib_update_fast_reg_key(mr, --key);
+		goto out_err;
+	} else {
+		seg1->mr_rkey = mr->rkey;
+		seg1->mr_base = seg1->mr_dma + pageoff;
+		seg1->mr_nsegs = i;
+		seg1->mr_len = len;
+	}
+	*nsegs = i;
+	return 0;
+out_err:
+	frmr->fr_state = FRMR_IS_INVALID;
+	while (i--)
+		rpcrdma_unmap_one(ia, --seg);
+	return rc;
+}
+
+static int
+rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg,
+			struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt)
+{
+	struct rpcrdma_mr_seg *seg1 = seg;
+	struct ib_send_wr invalidate_wr, *bad_wr;
+	int rc;
+
+	seg1->mr_chunk.rl_mw->r.frmr.fr_state = FRMR_IS_INVALID;
+
+	memset(&invalidate_wr, 0, sizeof invalidate_wr);
+	invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
+	invalidate_wr.opcode = IB_WR_LOCAL_INV;
+	invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
+	DECR_CQCOUNT(&r_xprt->rx_ep);
+
+	read_lock(&ia->ri_qplock);
+	while (seg1->mr_nsegs--)
+		rpcrdma_unmap_one(ia, seg++);
+	rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
+	read_unlock(&ia->ri_qplock);
+	if (rc) {
+		/* Force rpcrdma_buffer_get() to retry */
+		seg1->mr_chunk.rl_mw->r.frmr.fr_state = FRMR_IS_STALE;
+		dprintk("RPC:       %s: failed ib_post_send for invalidate,"
+			" status %i\n", __func__, rc);
+	}
+	return rc;
+}
+
+static int
+rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg,
+			int *nsegs, int writing, struct rpcrdma_ia *ia)
+{
+	struct rpcrdma_mr_seg *seg1 = seg;
+	u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
+	int len, pageoff, i, rc;
+
+	pageoff = offset_in_page(seg1->mr_offset);
+	seg1->mr_offset -= pageoff;	/* start of page */
+	seg1->mr_len += pageoff;
+	len = -pageoff;
+	if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
+		*nsegs = RPCRDMA_MAX_DATA_SEGS;
+	for (i = 0; i < *nsegs;) {
+		rpcrdma_map_one(ia, seg, writing);
+		physaddrs[i] = seg->mr_dma;
+		len += seg->mr_len;
+		++seg;
+		++i;
+		/* Check for holes */
+		if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
+		    offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
+			break;
+	}
+	rc = ib_map_phys_fmr(seg1->mr_chunk.rl_mw->r.fmr,
+				physaddrs, i, seg1->mr_dma);
+	if (rc) {
+		dprintk("RPC:       %s: failed ib_map_phys_fmr "
+			"%u@0x%llx+%i (%d)... status %i\n", __func__,
+			len, (unsigned long long)seg1->mr_dma,
+			pageoff, i, rc);
+		while (i--)
+			rpcrdma_unmap_one(ia, --seg);
+	} else {
+		seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.fmr->rkey;
+		seg1->mr_base = seg1->mr_dma + pageoff;
+		seg1->mr_nsegs = i;
+		seg1->mr_len = len;
+	}
+	*nsegs = i;
+	return rc;
+}
+
+static int
+rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
+			struct rpcrdma_ia *ia)
+{
+	struct rpcrdma_mr_seg *seg1 = seg;
+	LIST_HEAD(l);
+	int rc;
+
+	list_add(&seg1->mr_chunk.rl_mw->r.fmr->list, &l);
+	rc = ib_unmap_fmr(&l);
+	read_lock(&ia->ri_qplock);
+	while (seg1->mr_nsegs--)
+		rpcrdma_unmap_one(ia, seg++);
+	read_unlock(&ia->ri_qplock);
+	if (rc)
+		dprintk("RPC:       %s: failed ib_unmap_fmr,"
+			" status %i\n", __func__, rc);
+	return rc;
+}
+
+int
+rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
+			int nsegs, int writing, struct rpcrdma_xprt *r_xprt)
+{
+	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+	int rc = 0;
+
+	switch (ia->ri_memreg_strategy) {
+
+	case RPCRDMA_ALLPHYSICAL:
+		rpcrdma_map_one(ia, seg, writing);
+		seg->mr_rkey = ia->ri_bind_mem->rkey;
+		seg->mr_base = seg->mr_dma;
+		seg->mr_nsegs = 1;
+		nsegs = 1;
+		break;
+
+	/* Registration using frmr registration */
+	case RPCRDMA_FRMR:
+		rc = rpcrdma_register_frmr_external(seg, &nsegs, writing, ia, r_xprt);
+		break;
+
+	/* Registration using fmr memory registration */
+	case RPCRDMA_MTHCAFMR:
+		rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia);
+		break;
+
+	default:
+		return -1;
+	}
+	if (rc)
+		return -1;
+
+	return nsegs;
+}
+
+int
+rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
+		struct rpcrdma_xprt *r_xprt)
+{
+	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+	int nsegs = seg->mr_nsegs, rc;
+
+	switch (ia->ri_memreg_strategy) {
+
+	case RPCRDMA_ALLPHYSICAL:
+		read_lock(&ia->ri_qplock);
+		rpcrdma_unmap_one(ia, seg);
+		read_unlock(&ia->ri_qplock);
+		break;
+
+	case RPCRDMA_FRMR:
+		rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt);
+		break;
+
+	case RPCRDMA_MTHCAFMR:
+		rc = rpcrdma_deregister_fmr_external(seg, ia);
+		break;
+
+	default:
+		break;
+	}
+	return nsegs;
+}
+
+/*
+ * Prepost any receive buffer, then post send.
+ *
+ * Receive buffer is donated to hardware, reclaimed upon recv completion.
+ */
+int
+rpcrdma_ep_post(struct rpcrdma_ia *ia,
+		struct rpcrdma_ep *ep,
+		struct rpcrdma_req *req)
+{
+	struct ib_send_wr send_wr, *send_wr_fail;
+	struct rpcrdma_rep *rep = req->rl_reply;
+	int rc;
+
+	if (rep) {
+		rc = rpcrdma_ep_post_recv(ia, ep, rep);
+		if (rc)
+			goto out;
+		req->rl_reply = NULL;
+	}
+
+	send_wr.next = NULL;
+	send_wr.wr_id = 0ULL;	/* no send cookie */
+	send_wr.sg_list = req->rl_send_iov;
+	send_wr.num_sge = req->rl_niovs;
+	send_wr.opcode = IB_WR_SEND;
+	if (send_wr.num_sge == 4)	/* no need to sync any pad (constant) */
+		ib_dma_sync_single_for_device(ia->ri_id->device,
+			req->rl_send_iov[3].addr, req->rl_send_iov[3].length,
+			DMA_TO_DEVICE);
+	ib_dma_sync_single_for_device(ia->ri_id->device,
+		req->rl_send_iov[1].addr, req->rl_send_iov[1].length,
+		DMA_TO_DEVICE);
+	ib_dma_sync_single_for_device(ia->ri_id->device,
+		req->rl_send_iov[0].addr, req->rl_send_iov[0].length,
+		DMA_TO_DEVICE);
+
+	if (DECR_CQCOUNT(ep) > 0)
+		send_wr.send_flags = 0;
+	else { /* Provider must take a send completion every now and then */
+		INIT_CQCOUNT(ep);
+		send_wr.send_flags = IB_SEND_SIGNALED;
+	}
+
+	rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
+	if (rc)
+		dprintk("RPC:       %s: ib_post_send returned %i\n", __func__,
+			rc);
+out:
+	return rc;
+}
+
+/*
+ * (Re)post a receive buffer.
+ */
+int
+rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
+		     struct rpcrdma_ep *ep,
+		     struct rpcrdma_rep *rep)
+{
+	struct ib_recv_wr recv_wr, *recv_wr_fail;
+	int rc;
+
+	recv_wr.next = NULL;
+	recv_wr.wr_id = (u64) (unsigned long) rep;
+	recv_wr.sg_list = &rep->rr_iov;
+	recv_wr.num_sge = 1;
+
+	ib_dma_sync_single_for_cpu(ia->ri_id->device,
+		rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL);
+
+	rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
+
+	if (rc)
+		dprintk("RPC:       %s: ib_post_recv returned %i\n", __func__,
+			rc);
+	return rc;
+}
+
+/* Physical mapping means one Read/Write list entry per-page.
+ * All list entries must fit within an inline buffer
+ *
+ * NB: The server must return a Write list for NFS READ,
+ *     which has the same constraint. Factor in the inline
+ *     rsize as well.
+ */
+static size_t
+rpcrdma_physical_max_payload(struct rpcrdma_xprt *r_xprt)
+{
+	struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
+	unsigned int inline_size, pages;
+
+	inline_size = min_t(unsigned int,
+			    cdata->inline_wsize, cdata->inline_rsize);
+	inline_size -= RPCRDMA_HDRLEN_MIN;
+	pages = inline_size / sizeof(struct rpcrdma_segment);
+	return pages << PAGE_SHIFT;
+}
+
+static size_t
+rpcrdma_mr_max_payload(struct rpcrdma_xprt *r_xprt)
+{
+	return RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT;
+}
+
+size_t
+rpcrdma_max_payload(struct rpcrdma_xprt *r_xprt)
+{
+	size_t result;
+
+	switch (r_xprt->rx_ia.ri_memreg_strategy) {
+	case RPCRDMA_ALLPHYSICAL:
+		result = rpcrdma_physical_max_payload(r_xprt);
+		break;
+	default:
+		result = rpcrdma_mr_max_payload(r_xprt);
+	}
+	return result;
+}
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
new file mode 100644
index 0000000..ac7fc9a
--- /dev/null
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -0,0 +1,402 @@
+/*
+ * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the BSD-type
+ * license below:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *      Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *
+ *      Redistributions in binary form must reproduce the above
+ *      copyright notice, this list of conditions and the following
+ *      disclaimer in the documentation and/or other materials provided
+ *      with the distribution.
+ *
+ *      Neither the name of the Network Appliance, Inc. nor the names of
+ *      its contributors may be used to endorse or promote products
+ *      derived from this software without specific prior written
+ *      permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _LINUX_SUNRPC_XPRT_RDMA_H
+#define _LINUX_SUNRPC_XPRT_RDMA_H
+
+#include <linux/wait.h> 		/* wait_queue_head_t, etc */
+#include <linux/spinlock.h> 		/* spinlock_t, etc */
+#include <linux/atomic.h>			/* atomic_t, etc */
+#include <linux/workqueue.h>		/* struct work_struct */
+
+#include <rdma/rdma_cm.h>		/* RDMA connection api */
+#include <rdma/ib_verbs.h>		/* RDMA verbs api */
+
+#include <linux/sunrpc/clnt.h> 		/* rpc_xprt */
+#include <linux/sunrpc/rpc_rdma.h> 	/* RPC/RDMA protocol */
+#include <linux/sunrpc/xprtrdma.h> 	/* xprt parameters */
+#include <linux/sunrpc/svc.h>		/* RPCSVC_MAXPAYLOAD */
+
+#define RDMA_RESOLVE_TIMEOUT	(5000)	/* 5 seconds */
+#define RDMA_CONNECT_RETRY_MAX	(2)	/* retries if no listener backlog */
+
+/*
+ * Interface Adapter -- one per transport instance
+ */
+struct rpcrdma_ia {
+	rwlock_t		ri_qplock;
+	struct rdma_cm_id 	*ri_id;
+	struct ib_pd		*ri_pd;
+	struct ib_mr		*ri_bind_mem;
+	u32			ri_dma_lkey;
+	int			ri_have_dma_lkey;
+	struct completion	ri_done;
+	int			ri_async_rc;
+	enum rpcrdma_memreg	ri_memreg_strategy;
+	unsigned int		ri_max_frmr_depth;
+};
+
+/*
+ * RDMA Endpoint -- one per transport instance
+ */
+
+#define RPCRDMA_WC_BUDGET	(128)
+#define RPCRDMA_POLLSIZE	(16)
+
+struct rpcrdma_ep {
+	atomic_t		rep_cqcount;
+	int			rep_cqinit;
+	int			rep_connected;
+	struct rpcrdma_ia	*rep_ia;
+	struct ib_qp_init_attr	rep_attr;
+	wait_queue_head_t 	rep_connect_wait;
+	struct ib_sge		rep_pad;	/* holds zeroed pad */
+	struct ib_mr		*rep_pad_mr;	/* holds zeroed pad */
+	void			(*rep_func)(struct rpcrdma_ep *);
+	struct rpc_xprt		*rep_xprt;	/* for rep_func */
+	struct rdma_conn_param	rep_remote_cma;
+	struct sockaddr_storage	rep_remote_addr;
+	struct delayed_work	rep_connect_worker;
+	struct ib_wc		rep_send_wcs[RPCRDMA_POLLSIZE];
+	struct ib_wc		rep_recv_wcs[RPCRDMA_POLLSIZE];
+};
+
+#define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit)
+#define DECR_CQCOUNT(ep) atomic_sub_return(1, &(ep)->rep_cqcount)
+
+enum rpcrdma_chunktype {
+	rpcrdma_noch = 0,
+	rpcrdma_readch,
+	rpcrdma_areadch,
+	rpcrdma_writech,
+	rpcrdma_replych
+};
+
+/*
+ * struct rpcrdma_rep -- this structure encapsulates state required to recv
+ * and complete a reply, asychronously. It needs several pieces of
+ * state:
+ *   o recv buffer (posted to provider)
+ *   o ib_sge (also donated to provider)
+ *   o status of reply (length, success or not)
+ *   o bookkeeping state to get run by tasklet (list, etc)
+ *
+ * These are allocated during initialization, per-transport instance;
+ * however, the tasklet execution list itself is global, as it should
+ * always be pretty short.
+ *
+ * N of these are associated with a transport instance, and stored in
+ * struct rpcrdma_buffer. N is the max number of outstanding requests.
+ */
+
+/* temporary static scatter/gather max */
+#define RPCRDMA_MAX_DATA_SEGS	(64)	/* max scatter/gather */
+#define RPCRDMA_MAX_SEGS 	(RPCRDMA_MAX_DATA_SEGS + 2) /* head+tail = 2 */
+#define MAX_RPCRDMAHDR	(\
+	/* max supported RPC/RDMA header */ \
+	sizeof(struct rpcrdma_msg) + (2 * sizeof(u32)) + \
+	(sizeof(struct rpcrdma_read_chunk) * RPCRDMA_MAX_SEGS) + sizeof(u32))
+
+struct rpcrdma_buffer;
+
+struct rpcrdma_rep {
+	unsigned int	rr_len;		/* actual received reply length */
+	struct rpcrdma_buffer *rr_buffer; /* home base for this structure */
+	struct rpc_xprt	*rr_xprt;	/* needed for request/reply matching */
+	void (*rr_func)(struct rpcrdma_rep *);/* called by tasklet in softint */
+	struct list_head rr_list;	/* tasklet list */
+	struct ib_sge	rr_iov;		/* for posting */
+	struct ib_mr	*rr_handle;	/* handle for mem in rr_iov */
+	char	rr_base[MAX_RPCRDMAHDR]; /* minimal inline receive buffer */
+};
+
+/*
+ * struct rpcrdma_mw - external memory region metadata
+ *
+ * An external memory region is any buffer or page that is registered
+ * on the fly (ie, not pre-registered).
+ *
+ * Each rpcrdma_buffer has a list of free MWs anchored in rb_mws. During
+ * call_allocate, rpcrdma_buffer_get() assigns one to each segment in
+ * an rpcrdma_req. Then rpcrdma_register_external() grabs these to keep
+ * track of registration metadata while each RPC is pending.
+ * rpcrdma_deregister_external() uses this metadata to unmap and
+ * release these resources when an RPC is complete.
+ */
+enum rpcrdma_frmr_state {
+	FRMR_IS_INVALID,	/* ready to be used */
+	FRMR_IS_VALID,		/* in use */
+	FRMR_IS_STALE,		/* failed completion */
+};
+
+struct rpcrdma_frmr {
+	struct ib_fast_reg_page_list	*fr_pgl;
+	struct ib_mr			*fr_mr;
+	enum rpcrdma_frmr_state		fr_state;
+};
+
+struct rpcrdma_mw {
+	union {
+		struct ib_fmr		*fmr;
+		struct rpcrdma_frmr	frmr;
+	} r;
+	struct list_head	mw_list;
+	struct list_head	mw_all;
+};
+
+/*
+ * struct rpcrdma_req -- structure central to the request/reply sequence.
+ *
+ * N of these are associated with a transport instance, and stored in
+ * struct rpcrdma_buffer. N is the max number of outstanding requests.
+ *
+ * It includes pre-registered buffer memory for send AND recv.
+ * The recv buffer, however, is not owned by this structure, and
+ * is "donated" to the hardware when a recv is posted. When a
+ * reply is handled, the recv buffer used is given back to the
+ * struct rpcrdma_req associated with the request.
+ *
+ * In addition to the basic memory, this structure includes an array
+ * of iovs for send operations. The reason is that the iovs passed to
+ * ib_post_{send,recv} must not be modified until the work request
+ * completes.
+ *
+ * NOTES:
+ *   o RPCRDMA_MAX_SEGS is the max number of addressible chunk elements we
+ *     marshal. The number needed varies depending on the iov lists that
+ *     are passed to us, the memory registration mode we are in, and if
+ *     physical addressing is used, the layout.
+ */
+
+struct rpcrdma_mr_seg {		/* chunk descriptors */
+	union {				/* chunk memory handles */
+		struct ib_mr	*rl_mr;		/* if registered directly */
+		struct rpcrdma_mw *rl_mw;	/* if registered from region */
+	} mr_chunk;
+	u64		mr_base;	/* registration result */
+	u32		mr_rkey;	/* registration result */
+	u32		mr_len;		/* length of chunk or segment */
+	int		mr_nsegs;	/* number of segments in chunk or 0 */
+	enum dma_data_direction	mr_dir;	/* segment mapping direction */
+	dma_addr_t	mr_dma;		/* segment mapping address */
+	size_t		mr_dmalen;	/* segment mapping length */
+	struct page	*mr_page;	/* owning page, if any */
+	char		*mr_offset;	/* kva if no page, else offset */
+};
+
+struct rpcrdma_req {
+	size_t 		rl_size;	/* actual length of buffer */
+	unsigned int	rl_niovs;	/* 0, 2 or 4 */
+	unsigned int	rl_nchunks;	/* non-zero if chunks */
+	unsigned int	rl_connect_cookie;	/* retry detection */
+	enum rpcrdma_chunktype	rl_rtype, rl_wtype;
+	struct rpcrdma_buffer *rl_buffer; /* home base for this structure */
+	struct rpcrdma_rep	*rl_reply;/* holder for reply buffer */
+	struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];/* chunk segments */
+	struct ib_sge	rl_send_iov[4];	/* for active requests */
+	struct ib_sge	rl_iov;		/* for posting */
+	struct ib_mr	*rl_handle;	/* handle for mem in rl_iov */
+	char		rl_base[MAX_RPCRDMAHDR]; /* start of actual buffer */
+	__u32 		rl_xdr_buf[0];	/* start of returned rpc rq_buffer */
+};
+#define rpcr_to_rdmar(r) \
+	container_of((r)->rq_buffer, struct rpcrdma_req, rl_xdr_buf[0])
+
+/*
+ * struct rpcrdma_buffer -- holds list/queue of pre-registered memory for
+ * inline requests/replies, and client/server credits.
+ *
+ * One of these is associated with a transport instance
+ */
+struct rpcrdma_buffer {
+	spinlock_t	rb_lock;	/* protects indexes */
+	atomic_t	rb_credits;	/* most recent server credits */
+	int		rb_max_requests;/* client max requests */
+	struct list_head rb_mws;	/* optional memory windows/fmrs/frmrs */
+	struct list_head rb_all;
+	int		rb_send_index;
+	struct rpcrdma_req	**rb_send_bufs;
+	int		rb_recv_index;
+	struct rpcrdma_rep	**rb_recv_bufs;
+	char		*rb_pool;
+};
+#define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia)
+
+/*
+ * Internal structure for transport instance creation. This
+ * exists primarily for modularity.
+ *
+ * This data should be set with mount options
+ */
+struct rpcrdma_create_data_internal {
+	struct sockaddr_storage	addr;	/* RDMA server address */
+	unsigned int	max_requests;	/* max requests (slots) in flight */
+	unsigned int	rsize;		/* mount rsize - max read hdr+data */
+	unsigned int	wsize;		/* mount wsize - max write hdr+data */
+	unsigned int	inline_rsize;	/* max non-rdma read data payload */
+	unsigned int	inline_wsize;	/* max non-rdma write data payload */
+	unsigned int	padding;	/* non-rdma write header padding */
+};
+
+#define RPCRDMA_INLINE_READ_THRESHOLD(rq) \
+	(rpcx_to_rdmad(rq->rq_xprt).inline_rsize)
+
+#define RPCRDMA_INLINE_WRITE_THRESHOLD(rq)\
+	(rpcx_to_rdmad(rq->rq_xprt).inline_wsize)
+
+#define RPCRDMA_INLINE_PAD_VALUE(rq)\
+	rpcx_to_rdmad(rq->rq_xprt).padding
+
+/*
+ * Statistics for RPCRDMA
+ */
+struct rpcrdma_stats {
+	unsigned long		read_chunk_count;
+	unsigned long		write_chunk_count;
+	unsigned long		reply_chunk_count;
+
+	unsigned long long	total_rdma_request;
+	unsigned long long	total_rdma_reply;
+
+	unsigned long long	pullup_copy_count;
+	unsigned long long	fixup_copy_count;
+	unsigned long		hardway_register_count;
+	unsigned long		failed_marshal_count;
+	unsigned long		bad_reply_count;
+};
+
+/*
+ * RPCRDMA transport -- encapsulates the structures above for
+ * integration with RPC.
+ *
+ * The contained structures are embedded, not pointers,
+ * for convenience. This structure need not be visible externally.
+ *
+ * It is allocated and initialized during mount, and released
+ * during unmount.
+ */
+struct rpcrdma_xprt {
+	struct rpc_xprt		xprt;
+	struct rpcrdma_ia	rx_ia;
+	struct rpcrdma_ep	rx_ep;
+	struct rpcrdma_buffer	rx_buf;
+	struct rpcrdma_create_data_internal rx_data;
+	struct delayed_work	rdma_connect;
+	struct rpcrdma_stats	rx_stats;
+};
+
+#define rpcx_to_rdmax(x) container_of(x, struct rpcrdma_xprt, xprt)
+#define rpcx_to_rdmad(x) (rpcx_to_rdmax(x)->rx_data)
+
+/* Setting this to 0 ensures interoperability with early servers.
+ * Setting this to 1 enhances certain unaligned read/write performance.
+ * Default is 0, see sysctl entry and rpc_rdma.c rpcrdma_convert_iovs() */
+extern int xprt_rdma_pad_optimize;
+
+/*
+ * Interface Adapter calls - xprtrdma/verbs.c
+ */
+int rpcrdma_ia_open(struct rpcrdma_xprt *, struct sockaddr *, int);
+void rpcrdma_ia_close(struct rpcrdma_ia *);
+
+/*
+ * Endpoint calls - xprtrdma/verbs.c
+ */
+int rpcrdma_ep_create(struct rpcrdma_ep *, struct rpcrdma_ia *,
+				struct rpcrdma_create_data_internal *);
+void rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *);
+int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *);
+void rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *);
+
+int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *,
+				struct rpcrdma_req *);
+int rpcrdma_ep_post_recv(struct rpcrdma_ia *, struct rpcrdma_ep *,
+				struct rpcrdma_rep *);
+
+/*
+ * Buffer calls - xprtrdma/verbs.c
+ */
+int rpcrdma_buffer_create(struct rpcrdma_buffer *, struct rpcrdma_ep *,
+				struct rpcrdma_ia *,
+				struct rpcrdma_create_data_internal *);
+void rpcrdma_buffer_destroy(struct rpcrdma_buffer *);
+
+struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *);
+void rpcrdma_buffer_put(struct rpcrdma_req *);
+void rpcrdma_recv_buffer_get(struct rpcrdma_req *);
+void rpcrdma_recv_buffer_put(struct rpcrdma_rep *);
+
+int rpcrdma_register_internal(struct rpcrdma_ia *, void *, int,
+				struct ib_mr **, struct ib_sge *);
+int rpcrdma_deregister_internal(struct rpcrdma_ia *,
+				struct ib_mr *, struct ib_sge *);
+
+int rpcrdma_register_external(struct rpcrdma_mr_seg *,
+				int, int, struct rpcrdma_xprt *);
+int rpcrdma_deregister_external(struct rpcrdma_mr_seg *,
+				struct rpcrdma_xprt *);
+
+/*
+ * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c
+ */
+void rpcrdma_connect_worker(struct work_struct *);
+void rpcrdma_conn_func(struct rpcrdma_ep *);
+void rpcrdma_reply_handler(struct rpcrdma_rep *);
+
+/*
+ * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c
+ */
+ssize_t rpcrdma_marshal_chunks(struct rpc_rqst *, ssize_t);
+int rpcrdma_marshal_req(struct rpc_rqst *);
+size_t rpcrdma_max_payload(struct rpcrdma_xprt *);
+
+/* Temporary NFS request map cache. Created in svc_rdma.c  */
+extern struct kmem_cache *svc_rdma_map_cachep;
+/* WR context cache. Created in svc_rdma.c  */
+extern struct kmem_cache *svc_rdma_ctxt_cachep;
+/* Workqueue created in svc_rdma.c */
+extern struct workqueue_struct *svc_rdma_wq;
+
+#if RPCSVC_MAXPAYLOAD < (RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT)
+#define RPCSVC_MAXPAYLOAD_RDMA RPCSVC_MAXPAYLOAD
+#else
+#define RPCSVC_MAXPAYLOAD_RDMA (RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT)
+#endif
+
+#endif				/* _LINUX_SUNRPC_XPRT_RDMA_H */